From aa7d0fe2d368ad8af839cfd912c9584d81fa8099 Mon Sep 17 00:00:00 2001 From: YugaBot Date: Mon, 2 Oct 2017 18:53:22 -0700 Subject: [PATCH] Import Kudu from commit 1ad16aa0b742a75b86152cd7034f150674070606 --- .gitignore | 35 + .ycm_extra_conf.py | 168 + CMakeLists.txt | 989 +++ CONTRIBUTING.adoc | 18 + DISCLAIMER | 8 + LICENSE.txt | 664 ++ NOTICE.txt | 4 + README.adoc | 397 + build-support/build_source_release.py | 150 + build-support/ccache-clang/clang | 21 + build-support/ccache-clang/clang++ | 21 + build-support/check_compatibility.py | 217 + build-support/dist_test.py | 428 + build-support/enable_devtoolset.sh | 32 + build-support/gen_version_info.py | 131 + build-support/generate_precompiled_xxd.sh | 40 + build-support/get-upstream-commit.sh | 29 + build-support/jenkins/build-and-test.sh | 435 + .../jenkins/cleanup-zombie-jenkins.sh | 66 + build-support/jenkins/dummy-junit.xml | 12 + build-support/jenkins/post-build-clean.sh | 41 + build-support/kudu_util.py | 82 + build-support/lint.sh | 61 + build-support/lsan-suppressions.txt | 19 + build-support/parse_test_failure.py | 241 + build-support/push_to_asf.py | 236 + build-support/report-test.sh | 98 + build-support/run-test.sh | 248 + build-support/run_dist_test.py | 138 + build-support/sanitize-blacklist.txt | 33 + build-support/stacktrace_addr2line.pl | 96 + build-support/test_result_server.py | 457 ++ build-support/tools/kudu-lint/.gitignore | 2 + build-support/tools/kudu-lint/CMakeLists.txt | 68 + build-support/tools/kudu-lint/README | 46 + .../kudu-lint/cmake_modules/FindClang.cmake | 51 + .../kudu-lint/cmake_modules/FindLLVM.cmake | 80 + build-support/tools/kudu-lint/kudu-lint.cc | 262 + build-support/trigger_gerrit.py | 114 + build-support/tsan-suppressions.txt | 82 + cmake_modules/CompilerInfo.cmake | 49 + cmake_modules/FindBitshuffle.cmake | 18 + cmake_modules/FindCrcutil.cmake | 20 + cmake_modules/FindCyrusSASL.cmake | 42 + cmake_modules/FindGFlags.cmake | 21 + cmake_modules/FindGLog.cmake | 21 + 
cmake_modules/FindGMock.cmake | 57 + cmake_modules/FindGPerf.cmake | 34 + cmake_modules/FindGTest.cmake | 91 + cmake_modules/FindKRPC.cmake | 113 + cmake_modules/FindLibEv.cmake | 21 + cmake_modules/FindLibUnwind.cmake | 37 + cmake_modules/FindLz4.cmake | 17 + cmake_modules/FindProtobuf.cmake | 181 + cmake_modules/FindSnappy.cmake | 21 + cmake_modules/FindSqueasel.cmake | 34 + cmake_modules/FindVmem.cmake | 21 + cmake_modules/FindZlib.cmake | 21 + docs/.gitignore | 2 + docs/administration.adoc | 195 + docs/configuration.adoc | 115 + docs/configuration_reference.adoc | 38 + docs/configuration_reference_unsupported.adoc | 40 + docs/contributing.adoc | 385 + docs/developing.adoc | 106 + docs/images/.gitignore | 0 docs/images/kudu-architecture-2.png | Bin 0 -> 414311 bytes docs/index.adoc | 78 + docs/installation.adoc | 691 ++ docs/introduction.adoc | 220 + docs/kudu_impala_integration.adoc | 995 +++ docs/media-src/README | 5 + docs/media-src/kudu-architecture-2.pdf | Bin 0 -> 178107 bytes docs/media-src/kudu-architecture.graffle | 2135 +++++ docs/media-src/kudu-tablet-flush-6.graffle | 4781 +++++++++++ docs/media-src/kudu-tablet-flush-6b.pdf | Bin 0 -> 274809 bytes docs/quickstart.adoc | 268 + docs/release_notes.adoc | 245 + docs/schema_design.adoc | 295 + docs/style_guide.adoc | 267 + .../jekyll-templates/document.html.erb | 133 + docs/support/scripts/Gemfile | 13 + docs/support/scripts/Gemfile.lock | 73 + docs/support/scripts/make_docs.sh | 208 + docs/support/scripts/make_site.sh | 82 + docs/support/xsl/gflags_to_asciidoc.xsl | 141 + docs/transaction_semantics.adoc | 240 + docs/troubleshooting.adoc | 237 + docs/whitepaper/.gitignore | 3 + docs/whitepaper/kudu.bib | 384 + docs/whitepaper/kudu.tex | 1069 +++ docs/whitepaper/vldb.cls | 1401 ++++ docs/whitepaper/ycsb-data/combine-data.sh | 16 + docs/whitepaper/ycsb-data/log-to-tsv.pl | 10 + docs/whitepaper/ycsb-data/plots.R | 25 + docs/whitepaper/ycsb-data/uniform-hbase.sh | 22 + 
.../whitepaper/ycsb-data/uniform-hbase/a.json | 89 + .../whitepaper/ycsb-data/uniform-hbase/b.json | 89 + .../whitepaper/ycsb-data/uniform-hbase/c.json | 61 + .../whitepaper/ycsb-data/uniform-hbase/d.json | 89 + .../ycsb-data/uniform-hbase/load.json | 85 + docs/whitepaper/ycsb-data/uniform-kudu.sh | 23 + docs/whitepaper/ycsb-data/uniform-kudu/a.json | 89 + docs/whitepaper/ycsb-data/uniform-kudu/b.json | 89 + docs/whitepaper/ycsb-data/uniform-kudu/c.json | 61 + docs/whitepaper/ycsb-data/uniform-kudu/d.json | 89 + .../ycsb-data/uniform-kudu/load.json | 61 + docs/whitepaper/ycsb-data/zipf-hbase.sh | 22 + .../whitepaper/ycsb-data/zipfian-hbase/a.json | 89 + .../whitepaper/ycsb-data/zipfian-hbase/b.json | 89 + .../whitepaper/ycsb-data/zipfian-hbase/c.json | 61 + .../whitepaper/ycsb-data/zipfian-hbase/d.json | 89 + .../ycsb-data/zipfian-hbase/load.json | 85 + docs/whitepaper/ycsb-data/zipfian-kudu.sh | 23 + docs/whitepaper/ycsb-data/zipfian-kudu/a.json | 89 + docs/whitepaper/ycsb-data/zipfian-kudu/b.json | 89 + docs/whitepaper/ycsb-data/zipfian-kudu/c.json | 61 + docs/whitepaper/ycsb-data/zipfian-kudu/d.json | 89 + .../ycsb-data/zipfian-kudu/load.json | 61 + docs/whitepaper/ycsb-results.pdf | Bin 0 -> 66741 bytes java/.gitignore | 7 + java/README.md | 160 + java/assembly.xml | 34 + java/interface-annotations/pom.xml | 71 + .../kududb/annotations/InterfaceAudience.java | 74 + .../annotations/InterfaceStability.java | 66 + .../ExcludePrivateAnnotationsJDiffDoclet.java | 61 + ...cludePrivateAnnotationsStandardDoclet.java | 60 + ...ncludePublicAnnotationsStandardDoclet.java | 65 + .../annotations/tools/RootDocProcessor.java | 248 + .../annotations/tools/StabilityOptions.java | 71 + .../annotations/tools/package-info.java | 22 + java/kudu-client-tools/pom.xml | 107 + .../org/kududb/mapreduce/tools/CsvParser.java | 162 + .../org/kududb/mapreduce/tools/ImportCsv.java | 116 + .../mapreduce/tools/ImportCsvMapper.java | 143 + .../tools/IntegrationTestBigLinkedList.java | 1660 ++++ 
.../kududb/mapreduce/tools/RowCounter.java | 127 + .../kududb/mapreduce/tools/ITImportCsv.java | 120 + .../kududb/mapreduce/tools/ITRowCounter.java | 68 + java/kudu-client/.gitignore | 3 + java/kudu-client/dev-support/build-proto.sh | 36 + java/kudu-client/pom.xml | 302 + .../protobuf/ZeroCopyLiteralByteString.java | 67 + .../main/java/org/kududb/ColumnSchema.java | 301 + .../src/main/java/org/kududb/Schema.java | 294 + .../src/main/java/org/kududb/Type.java | 136 + .../client/AbstractKuduScannerBuilder.java | 295 + .../org/kududb/client/AlterTableOptions.java | 107 + .../org/kududb/client/AlterTableRequest.java | 70 + .../org/kududb/client/AlterTableResponse.java | 32 + .../org/kududb/client/AsyncKuduClient.java | 2133 +++++ .../org/kududb/client/AsyncKuduScanner.java | 799 ++ .../org/kududb/client/AsyncKuduSession.java | 816 ++ .../main/java/org/kududb/client/Batch.java | 157 + .../java/org/kududb/client/BatchResponse.java | 96 + .../main/java/org/kududb/client/Bytes.java | 1148 +++ .../java/org/kududb/client/CallResponse.java | 162 + .../kududb/client/ColumnRangePredicate.java | 325 + .../client/ConnectionResetException.java | 49 + .../org/kududb/client/CreateTableOptions.java | 132 + .../org/kududb/client/CreateTableRequest.java | 72 + .../kududb/client/CreateTableResponse.java | 31 + .../org/kududb/client/DeadlineTracker.java | 155 + .../main/java/org/kududb/client/Delete.java | 41 + .../org/kududb/client/DeleteTableRequest.java | 68 + .../kududb/client/DeleteTableResponse.java | 32 + .../org/kududb/client/ErrorCollector.java | 83 + .../client/ExternalConsistencyMode.java | 42 + .../client/GetMasterRegistrationReceived.java | 219 + .../client/GetMasterRegistrationRequest.java | 74 + .../client/GetMasterRegistrationResponse.java | 88 + .../client/GetTableLocationsRequest.java | 84 + .../kududb/client/GetTableSchemaRequest.java | 75 + .../kududb/client/GetTableSchemaResponse.java | 79 + .../kududb/client/HasFailedRpcException.java | 44 + 
.../main/java/org/kududb/client/IPCUtil.java | 83 + .../main/java/org/kududb/client/Insert.java | 37 + .../client/InvalidResponseException.java | 80 + .../client/IsAlterTableDoneRequest.java | 69 + .../client/IsAlterTableDoneResponse.java | 44 + .../client/IsCreateTableDoneRequest.java | 66 + .../java/org/kududb/client/KeyEncoder.java | 193 + .../java/org/kududb/client/KuduClient.java | 363 + .../java/org/kududb/client/KuduException.java | 69 + .../main/java/org/kududb/client/KuduRpc.java | 306 + .../org/kududb/client/KuduRpcResponse.java | 55 + .../java/org/kududb/client/KuduScanner.java | 139 + .../kududb/client/KuduServerException.java | 47 + .../java/org/kududb/client/KuduSession.java | 182 + .../java/org/kududb/client/KuduTable.java | 188 + .../org/kududb/client/ListTablesRequest.java | 73 + .../org/kududb/client/ListTablesResponse.java | 42 + .../client/ListTabletServersRequest.java | 67 + .../client/ListTabletServersResponse.java | 58 + .../java/org/kududb/client/LocatedTablet.java | 141 + .../kududb/client/MasterErrorException.java | 39 + .../client/NoLeaderMasterFoundException.java | 61 + .../client/NonRecoverableException.java | 54 + .../java/org/kududb/client/Operation.java | 289 + .../org/kududb/client/OperationResponse.java | 111 + .../java/org/kududb/client/PartialRow.java | 626 ++ .../java/org/kududb/client/Partition.java | 182 + .../org/kududb/client/PartitionSchema.java | 142 + .../client/PleaseThrottleException.java | 105 + .../org/kududb/client/ProtobufHelper.java | 250 + .../kududb/client/RecoverableException.java | 61 + .../main/java/org/kududb/client/RowError.java | 99 + .../client/RowErrorsAndOverflowStatus.java | 51 + .../java/org/kududb/client/RowResult.java | 549 ++ .../org/kududb/client/RowResultIterator.java | 111 + .../org/kududb/client/SecureRpcHelper.java | 256 + .../kududb/client/SessionConfiguration.java | 158 + .../java/org/kududb/client/TabletClient.java | 771 ++ .../client/TabletServerErrorException.java | 40 + 
.../main/java/org/kududb/client/Update.java | 37 + .../main/java/org/kududb/util/AsyncUtil.java | 50 + .../java/org/kududb/util/HybridTimeUtil.java | 70 + .../main/java/org/kududb/util/NetUtil.java | 78 + .../src/main/java/org/kududb/util/Pair.java | 57 + .../src/main/java/org/kududb/util/Slice.java | 699 ++ .../src/main/java/org/kududb/util/Slices.java | 261 + .../java/org/kududb/client/BaseKuduTest.java | 356 + .../org/kududb/client/MiniKuduCluster.java | 427 + .../kududb/client/TestAsyncKuduClient.java | 135 + .../kududb/client/TestAsyncKuduSession.java | 448 ++ .../java/org/kududb/client/TestBitSet.java | 99 + .../java/org/kududb/client/TestBytes.java | 97 + .../client/TestColumnRangePredicate.java | 72 + .../kududb/client/TestDeadlineTracker.java | 74 + .../org/kududb/client/TestErrorCollector.java | 90 + .../client/TestFlexiblePartitioning.java | 350 + .../org/kududb/client/TestHybridTime.java | 161 + .../org/kududb/client/TestKeyEncoding.java | 272 + .../org/kududb/client/TestKuduClient.java | 319 + .../org/kududb/client/TestKuduSession.java | 160 + .../java/org/kududb/client/TestKuduTable.java | 248 + .../org/kududb/client/TestLeaderFailover.java | 70 + .../org/kududb/client/TestMasterFailover.java | 72 + .../java/org/kududb/client/TestOperation.java | 163 + .../java/org/kududb/client/TestRowErrors.java | 97 + .../java/org/kududb/client/TestRowResult.java | 130 + .../kududb/client/TestScannerMultiTablet.java | 222 + .../java/org/kududb/client/TestTestUtils.java | 114 + .../java/org/kududb/client/TestTimeouts.java | 71 + .../java/org/kududb/client/TestUtils.java | 243 + .../java/org/kududb/util/TestAsyncUtil.java | 75 + .../java/org/kududb/util/TestMurmurHash.java | 46 + .../java/org/kududb/util/TestNetUtil.java | 73 + java/kudu-client/src/test/resources/flags | 9 + .../src/test/resources/log4j.properties | 23 + java/kudu-csd/generate_mdl.py | 212 + java/kudu-csd/pom.xml | 129 + java/kudu-csd/src/descriptor/service.sdl | 311 + 
java/kudu-csd/src/images/icon.png | Bin 0 -> 693 bytes java/kudu-csd/src/scripts/kudu.sh | 125 + java/kudu-mapreduce/pom.xml | 113 + .../kududb/mapreduce/CommandLineParser.java | 144 + .../mapreduce/KuduTableInputFormat.java | 452 ++ .../mapreduce/KuduTableMapReduceUtil.java | 519 ++ .../mapreduce/KuduTableOutputCommitter.java | 57 + .../mapreduce/KuduTableOutputFormat.java | 215 + .../org/kududb/mapreduce/TableReducer.java | 28 + .../mapreduce/HadoopTestingUtility.java | 101 + .../kududb/mapreduce/TestInputFormatJob.java | 127 + .../mapreduce/TestKuduTableInputFormat.java | 132 + .../mapreduce/TestKuduTableOutputFormat.java | 66 + .../kududb/mapreduce/TestOutputFormatJob.java | 131 + java/kudu-spark/pom.xml | 169 + .../org/kududb/spark/DefaultSource.scala | 148 + .../scala/org/kududb/spark/KuduContext.scala | 67 + .../src/test/resources/log4j.properties | 23 + .../org/kududb/spark/DefaultSourceTest.scala | 40 + .../org/kududb/spark/KuduContextTest.scala | 35 + .../scala/org/kududb/spark/TestContext.scala | 82 + java/pom.xml | 291 + python/.gitignore | 43 + python/MANIFEST.in | 13 + python/Makefile | 21 + python/README.md | 12 + python/kudu/__init__.pxd | 17 + python/kudu/__init__.py | 113 + python/kudu/client.pyx | 1237 +++ python/kudu/compat.py | 86 + python/kudu/errors.pxd | 20 + python/kudu/errors.pyx | 62 + python/kudu/libkudu_client.pxd | 607 ++ python/kudu/schema.pxd | 59 + python/kudu/schema.pyx | 545 ++ python/kudu/tests/__init__.py | 0 python/kudu/tests/common.py | 153 + python/kudu/tests/test_client.py | 189 + python/kudu/tests/test_scanner.py | 105 + python/kudu/tests/test_schema.py | 182 + python/kudu/util.py | 21 + python/requirements.txt | 6 + python/setup.cfg | 2 + python/setup.py | 167 + src/kudu/benchmarks/CMakeLists.txt | 56 + src/kudu/benchmarks/bin/parse_rpc_bench.sh | 47 + src/kudu/benchmarks/rle.cc | 121 + .../benchmarks/tpch/line_item_tsv_importer.h | 135 + .../benchmarks/tpch/rpc_line_item_dao-test.cc | 165 + 
src/kudu/benchmarks/tpch/rpc_line_item_dao.cc | 236 + src/kudu/benchmarks/tpch/rpc_line_item_dao.h | 106 + src/kudu/benchmarks/tpch/tpch-schemas.h | 118 + src/kudu/benchmarks/tpch/tpch1.cc | 283 + src/kudu/benchmarks/tpch/tpch_real_world.cc | 396 + src/kudu/benchmarks/wal_hiccup-parser.py | 71 + src/kudu/benchmarks/wal_hiccup.cc | 288 + src/kudu/benchmarks/ycsb-schema.h | 50 + src/kudu/cfile/CMakeLists.txt | 71 + src/kudu/cfile/README | 186 + src/kudu/cfile/binary_dict_block.cc | 273 + src/kudu/cfile/binary_dict_block.h | 172 + src/kudu/cfile/binary_plain_block.cc | 290 + src/kudu/cfile/binary_plain_block.h | 140 + src/kudu/cfile/binary_prefix_block.cc | 569 ++ src/kudu/cfile/binary_prefix_block.h | 163 + src/kudu/cfile/block_cache-test.cc | 62 + src/kudu/cfile/block_cache.cc | 144 + src/kudu/cfile/block_cache.h | 167 + src/kudu/cfile/block_compression.cc | 134 + src/kudu/cfile/block_compression.h | 93 + src/kudu/cfile/block_encodings.h | 160 + src/kudu/cfile/block_handle.h | 97 + src/kudu/cfile/block_pointer.h | 94 + src/kudu/cfile/bloomfile-test-base.h | 133 + src/kudu/cfile/bloomfile-test.cc | 131 + src/kudu/cfile/bloomfile.cc | 312 + src/kudu/cfile/bloomfile.h | 142 + src/kudu/cfile/bshuf_block.cc | 185 + src/kudu/cfile/bshuf_block.h | 376 + src/kudu/cfile/cfile-dump.cc | 100 + src/kudu/cfile/cfile-test-base.h | 500 ++ src/kudu/cfile/cfile-test.cc | 837 ++ src/kudu/cfile/cfile.proto | 80 + src/kudu/cfile/cfile_reader.cc | 990 +++ src/kudu/cfile/cfile_reader.h | 476 ++ src/kudu/cfile/cfile_util.cc | 92 + src/kudu/cfile/cfile_util.h | 105 + src/kudu/cfile/cfile_writer.cc | 484 ++ src/kudu/cfile/cfile_writer.h | 226 + src/kudu/cfile/compression-test.cc | 131 + src/kudu/cfile/compression_codec.cc | 265 + src/kudu/cfile/compression_codec.h | 74 + src/kudu/cfile/encoding-test.cc | 879 ++ src/kudu/cfile/gvint_block.cc | 369 + src/kudu/cfile/gvint_block.h | 161 + src/kudu/cfile/index-test.cc | 327 + src/kudu/cfile/index_block.cc | 314 + src/kudu/cfile/index_block.h | 
186 + src/kudu/cfile/index_btree.cc | 327 + src/kudu/cfile/index_btree.h | 126 + src/kudu/cfile/mt-bloomfile-test.cc | 53 + src/kudu/cfile/plain_bitmap_block.h | 211 + src/kudu/cfile/plain_block.h | 237 + src/kudu/cfile/rle_block.h | 423 + src/kudu/cfile/type_encodings.cc | 331 + src/kudu/cfile/type_encodings.h | 75 + src/kudu/client/CMakeLists.txt | 238 + .../client/MungeExportedInstallTargets.cmake | 31 + src/kudu/client/README | 132 + src/kudu/client/batcher.cc | 868 ++ src/kudu/client/batcher.h | 219 + src/kudu/client/callbacks.h | 185 + src/kudu/client/client-internal.cc | 857 ++ src/kudu/client/client-internal.h | 241 + src/kudu/client/client-test-util.cc | 81 + src/kudu/client/client-test-util.h | 61 + src/kudu/client/client-test.cc | 2718 +++++++ src/kudu/client/client-unittest.cc | 175 + src/kudu/client/client.cc | 1280 +++ src/kudu/client/client.h | 1076 +++ src/kudu/client/clientConfig.cmake.in | 11 + src/kudu/client/client_builder-internal.cc | 33 + src/kudu/client/client_builder-internal.h | 44 + src/kudu/client/client_samples-test.sh | 128 + src/kudu/client/client_symbol-test.sh | 87 + src/kudu/client/error-internal.cc | 34 + src/kudu/client/error-internal.h | 41 + src/kudu/client/error_collector.cc | 55 + src/kudu/client/error_collector.h | 61 + src/kudu/client/meta_cache.cc | 658 ++ src/kudu/client/meta_cache.h | 294 + src/kudu/client/row_result.h | 33 + src/kudu/client/samples/CMakeLists.txt | 29 + src/kudu/client/samples/sample.cc | 290 + src/kudu/client/scan_batch.cc | 308 + src/kudu/client/scan_batch.h | 223 + src/kudu/client/scan_predicate-internal.h | 94 + src/kudu/client/scan_predicate.cc | 91 + src/kudu/client/scan_predicate.h | 63 + src/kudu/client/scanner-internal.cc | 545 ++ src/kudu/client/scanner-internal.h | 223 + src/kudu/client/schema-internal.h | 98 + src/kudu/client/schema.cc | 534 ++ src/kudu/client/schema.h | 351 + src/kudu/client/session-internal.cc | 81 + src/kudu/client/session-internal.h | 89 + src/kudu/client/shared_ptr.h | 
63 + src/kudu/client/stubs.h | 199 + src/kudu/client/symbols.map | 37 + src/kudu/client/table-internal.cc | 152 + src/kudu/client/table-internal.h | 57 + src/kudu/client/table_alterer-internal.cc | 114 + src/kudu/client/table_alterer-internal.h | 69 + src/kudu/client/table_creator-internal.cc | 39 + src/kudu/client/table_creator-internal.h | 57 + src/kudu/client/tablet_server-internal.cc | 34 + src/kudu/client/tablet_server-internal.h | 42 + src/kudu/client/value-internal.h | 82 + src/kudu/client/value.cc | 192 + src/kudu/client/value.h | 69 + src/kudu/client/write_op-internal.h | 32 + src/kudu/client/write_op.cc | 108 + src/kudu/client/write_op.h | 153 + src/kudu/codegen/CMakeLists.txt | 174 + src/kudu/codegen/README | 247 + src/kudu/codegen/code_cache.cc | 88 + src/kudu/codegen/code_cache.h | 93 + src/kudu/codegen/code_generator.cc | 233 + src/kudu/codegen/code_generator.h | 85 + src/kudu/codegen/codegen-test.cc | 362 + src/kudu/codegen/compilation_manager.cc | 196 + src/kudu/codegen/compilation_manager.h | 110 + src/kudu/codegen/jit_wrapper.cc | 36 + src/kudu/codegen/jit_wrapper.h | 79 + src/kudu/codegen/module_builder.cc | 314 + src/kudu/codegen/module_builder.h | 152 + src/kudu/codegen/precompiled.cc | 164 + src/kudu/codegen/precompiled.ll.h | 32 + src/kudu/codegen/row_projector.cc | 488 ++ src/kudu/codegen/row_projector.h | 155 + src/kudu/common/CMakeLists.txt | 92 + src/kudu/common/README | 16 + src/kudu/common/columnblock.h | 242 + src/kudu/common/common.proto | 270 + src/kudu/common/encoded_key-test.cc | 301 + src/kudu/common/encoded_key.cc | 218 + src/kudu/common/encoded_key.h | 117 + src/kudu/common/generic_iterators-test.cc | 305 + src/kudu/common/generic_iterators.cc | 598 ++ src/kudu/common/generic_iterators.h | 226 + src/kudu/common/id_mapping-test.cc | 91 + src/kudu/common/id_mapping.cc | 38 + src/kudu/common/id_mapping.h | 166 + src/kudu/common/iterator.h | 117 + src/kudu/common/iterator_stats.cc | 64 + src/kudu/common/iterator_stats.h | 56 + 
src/kudu/common/key_encoder.cc | 92 + src/kudu/common/key_encoder.h | 362 + src/kudu/common/partial_row-test.cc | 187 + src/kudu/common/partial_row.cc | 672 ++ src/kudu/common/partial_row.h | 250 + src/kudu/common/partition-test.cc | 451 ++ src/kudu/common/partition.cc | 784 ++ src/kudu/common/partition.h | 273 + src/kudu/common/predicate-test.cc | 131 + src/kudu/common/predicate_encoder-test.cc | 305 + src/kudu/common/predicate_encoder.cc | 244 + src/kudu/common/predicate_encoder.h | 83 + src/kudu/common/row.h | 760 ++ src/kudu/common/row_changelist-test.cc | 209 + src/kudu/common/row_changelist.cc | 359 + src/kudu/common/row_changelist.h | 388 + src/kudu/common/row_key-util-test.cc | 135 + src/kudu/common/row_key-util.cc | 118 + src/kudu/common/row_key-util.h | 73 + src/kudu/common/row_operations-test.cc | 671 ++ src/kudu/common/row_operations.cc | 576 ++ src/kudu/common/row_operations.h | 113 + src/kudu/common/rowblock.cc | 115 + src/kudu/common/rowblock.h | 333 + src/kudu/common/rowid.h | 60 + src/kudu/common/scan_predicate.cc | 111 + src/kudu/common/scan_predicate.h | 132 + src/kudu/common/scan_spec.cc | 91 + src/kudu/common/scan_spec.h | 120 + src/kudu/common/schema-test.cc | 421 + src/kudu/common/schema.cc | 487 ++ src/kudu/common/schema.h | 864 ++ src/kudu/common/timestamp.cc | 55 + src/kudu/common/timestamp.h | 97 + src/kudu/common/types-test.cc | 64 + src/kudu/common/types.cc | 93 + src/kudu/common/types.h | 562 ++ src/kudu/common/wire_protocol-test-util.h | 76 + src/kudu/common/wire_protocol-test.cc | 322 + src/kudu/common/wire_protocol.cc | 560 ++ src/kudu/common/wire_protocol.h | 150 + src/kudu/common/wire_protocol.proto | 171 + src/kudu/consensus/CMakeLists.txt | 146 + src/kudu/consensus/README | 280 + src/kudu/consensus/consensus-test-util.h | 870 ++ src/kudu/consensus/consensus.cc | 108 + src/kudu/consensus/consensus.h | 428 + src/kudu/consensus/consensus.proto | 493 ++ src/kudu/consensus/consensus.txt | 148 + 
src/kudu/consensus/consensus_meta-test.cc | 286 + src/kudu/consensus/consensus_meta.cc | 236 + src/kudu/consensus/consensus_meta.h | 174 + src/kudu/consensus/consensus_peers-test.cc | 329 + src/kudu/consensus/consensus_peers.cc | 470 ++ src/kudu/consensus/consensus_peers.h | 324 + src/kudu/consensus/consensus_queue-test.cc | 819 ++ src/kudu/consensus/consensus_queue.cc | 876 ++ src/kudu/consensus/consensus_queue.h | 431 + src/kudu/consensus/leader_election-test.cc | 636 ++ src/kudu/consensus/leader_election.cc | 373 + src/kudu/consensus/leader_election.h | 244 + src/kudu/consensus/local_consensus.cc | 192 + src/kudu/consensus/local_consensus.h | 123 + src/kudu/consensus/log-dump.cc | 242 + src/kudu/consensus/log-test-base.h | 378 + src/kudu/consensus/log-test.cc | 1051 +++ src/kudu/consensus/log.cc | 1021 +++ src/kudu/consensus/log.h | 552 ++ src/kudu/consensus/log.proto | 100 + .../consensus/log_anchor_registry-test.cc | 118 + src/kudu/consensus/log_anchor_registry.cc | 180 + src/kudu/consensus/log_anchor_registry.h | 161 + src/kudu/consensus/log_cache-test.cc | 308 + src/kudu/consensus/log_cache.cc | 481 ++ src/kudu/consensus/log_cache.h | 217 + src/kudu/consensus/log_index-test.cc | 113 + src/kudu/consensus/log_index.cc | 275 + src/kudu/consensus/log_index.h | 110 + src/kudu/consensus/log_metrics.cc | 66 + src/kudu/consensus/log_metrics.h | 53 + src/kudu/consensus/log_reader.cc | 497 ++ src/kudu/consensus/log_reader.h | 207 + src/kudu/consensus/log_util.cc | 810 ++ src/kudu/consensus/log_util.h | 397 + src/kudu/consensus/metadata.proto | 165 + src/kudu/consensus/mt-log-test.cc | 178 + src/kudu/consensus/opid.proto | 28 + src/kudu/consensus/opid_util.cc | 163 + src/kudu/consensus/opid_util.h | 114 + src/kudu/consensus/peer_manager.cc | 116 + src/kudu/consensus/peer_manager.h | 92 + src/kudu/consensus/quorum_util-test.cc | 60 + src/kudu/consensus/quorum_util.cc | 223 + src/kudu/consensus/quorum_util.h | 79 + src/kudu/consensus/raft_consensus-test.cc | 752 ++ 
src/kudu/consensus/raft_consensus.cc | 1995 +++++ src/kudu/consensus/raft_consensus.h | 467 ++ .../consensus/raft_consensus_quorum-test.cc | 1124 +++ .../consensus/raft_consensus_state-test.cc | 115 + src/kudu/consensus/raft_consensus_state.cc | 740 ++ src/kudu/consensus/raft_consensus_state.h | 386 + src/kudu/consensus/ref_counted_replicate.h | 50 + src/kudu/experiments/CMakeLists.txt | 30 + src/kudu/experiments/merge-test.cc | 140 + src/kudu/experiments/rwlock-perf.cc | 247 + src/kudu/fs/CMakeLists.txt | 47 + src/kudu/fs/block_id.cc | 52 + src/kudu/fs/block_id.h | 105 + src/kudu/fs/block_manager-stress-test.cc | 412 + src/kudu/fs/block_manager-test.cc | 758 ++ src/kudu/fs/block_manager.cc | 51 + src/kudu/fs/block_manager.h | 280 + src/kudu/fs/block_manager_metrics.cc | 71 + src/kudu/fs/block_manager_metrics.h | 50 + src/kudu/fs/block_manager_util-test.cc | 180 + src/kudu/fs/block_manager_util.cc | 175 + src/kudu/fs/block_manager_util.h | 90 + src/kudu/fs/file_block_manager.cc | 764 ++ src/kudu/fs/file_block_manager.h | 153 + src/kudu/fs/fs-test-util.h | 81 + src/kudu/fs/fs.proto | 111 + src/kudu/fs/fs_manager-test.cc | 178 + src/kudu/fs/fs_manager.cc | 511 ++ src/kudu/fs/fs_manager.h | 280 + src/kudu/fs/log_block_manager.cc | 1601 ++++ src/kudu/fs/log_block_manager.h | 343 + src/kudu/gutil/CMakeLists.txt | 71 + src/kudu/gutil/algorithm.h | 441 ++ src/kudu/gutil/arm_instruction_set_select.h | 52 + src/kudu/gutil/atomic_refcount.h | 195 + src/kudu/gutil/atomicops-internals-macosx.h | 406 + src/kudu/gutil/atomicops-internals-powerpc.h | 304 + src/kudu/gutil/atomicops-internals-tsan.h | 217 + src/kudu/gutil/atomicops-internals-x86.cc | 128 + src/kudu/gutil/atomicops-internals-x86.h | 513 ++ src/kudu/gutil/atomicops.h | 373 + .../atomicops-internals-arm-generic.h | 230 + .../atomicops-internals-arm-v6plus.h | 378 + .../auxiliary/atomicops-internals-windows.h | 508 ++ src/kudu/gutil/basictypes.h | 32 + src/kudu/gutil/bind.h | 539 ++ src/kudu/gutil/bind.h.pump | 153 + 
src/kudu/gutil/bind_helpers.h | 551 ++ src/kudu/gutil/bind_internal.h | 2695 +++++++ src/kudu/gutil/bind_internal.h.pump | 464 ++ src/kudu/gutil/bits.cc | 101 + src/kudu/gutil/bits.h | 267 + src/kudu/gutil/callback.h | 765 ++ src/kudu/gutil/callback.h.pump | 436 + src/kudu/gutil/callback_forward.h | 17 + src/kudu/gutil/callback_internal.cc | 36 + src/kudu/gutil/callback_internal.h | 177 + src/kudu/gutil/casts.h | 392 + src/kudu/gutil/charmap.h | 87 + src/kudu/gutil/cpu.cc | 289 + src/kudu/gutil/cpu.h | 90 + src/kudu/gutil/cycleclock-inl.h | 203 + src/kudu/gutil/dynamic_annotations.c | 173 + src/kudu/gutil/dynamic_annotations.h | 770 ++ src/kudu/gutil/endian.h | 358 + src/kudu/gutil/fixedarray.h | 181 + src/kudu/gutil/gscoped_ptr.h | 830 ++ src/kudu/gutil/hash/builtin_type_hash.h | 95 + src/kudu/gutil/hash/city.cc | 317 + src/kudu/gutil/hash/city.h | 53 + src/kudu/gutil/hash/hash.cc | 197 + src/kudu/gutil/hash/hash.h | 419 + src/kudu/gutil/hash/hash128to64.h | 24 + src/kudu/gutil/hash/jenkins.cc | 188 + src/kudu/gutil/hash/jenkins.h | 40 + src/kudu/gutil/hash/jenkins_lookup2.h | 156 + src/kudu/gutil/hash/legacy_hash.h | 84 + src/kudu/gutil/hash/string_hash.h | 85 + src/kudu/gutil/int128.cc | 19 + src/kudu/gutil/int128.h | 332 + src/kudu/gutil/integral_types.h | 104 + src/kudu/gutil/linux_syscall_support.h | 3680 +++++++++ src/kudu/gutil/logging-inl.h | 50 + src/kudu/gutil/macros.h | 271 + src/kudu/gutil/manual_constructor.h | 250 + src/kudu/gutil/map-util.h | 770 ++ src/kudu/gutil/mathlimits.cc | 123 + src/kudu/gutil/mathlimits.h | 256 + src/kudu/gutil/move.h | 218 + src/kudu/gutil/once.cc | 49 + src/kudu/gutil/once.h | 119 + src/kudu/gutil/paranoid.h | 92 + src/kudu/gutil/port.h | 1218 +++ .../raw_scoped_refptr_mismatch_checker.h | 128 + src/kudu/gutil/ref_counted.cc | 95 + src/kudu/gutil/ref_counted.h | 354 + src/kudu/gutil/ref_counted_memory.cc | 99 + src/kudu/gutil/ref_counted_memory.h | 150 + src/kudu/gutil/singleton.h | 153 + src/kudu/gutil/spinlock.cc | 187 + 
src/kudu/gutil/spinlock.h | 151 + src/kudu/gutil/spinlock_internal.cc | 122 + src/kudu/gutil/spinlock_internal.h | 64 + src/kudu/gutil/spinlock_linux-inl.h | 104 + src/kudu/gutil/spinlock_posix-inl.h | 62 + src/kudu/gutil/spinlock_win32-inl.h | 54 + src/kudu/gutil/stl_util.h | 977 +++ src/kudu/gutil/stringprintf.cc | 137 + src/kudu/gutil/stringprintf.h | 48 + src/kudu/gutil/strings/ascii_ctype.cc | 110 + src/kudu/gutil/strings/ascii_ctype.h | 75 + src/kudu/gutil/strings/charset.cc | 24 + src/kudu/gutil/strings/charset.h | 71 + src/kudu/gutil/strings/escaping.cc | 2023 +++++ src/kudu/gutil/strings/escaping.h | 677 ++ src/kudu/gutil/strings/fastmem.h | 130 + src/kudu/gutil/strings/human_readable.cc | 428 + src/kudu/gutil/strings/human_readable.h | 162 + src/kudu/gutil/strings/join.cc | 211 + src/kudu/gutil/strings/join.h | 371 + src/kudu/gutil/strings/memutil.cc | 138 + src/kudu/gutil/strings/memutil.h | 153 + src/kudu/gutil/strings/numbers.cc | 1461 ++++ src/kudu/gutil/strings/numbers.h | 575 ++ src/kudu/gutil/strings/serialize.cc | 328 + src/kudu/gutil/strings/serialize.h | 343 + src/kudu/gutil/strings/split.cc | 1088 +++ src/kudu/gutil/strings/split.h | 1209 +++ src/kudu/gutil/strings/split_internal.h | 413 + src/kudu/gutil/strings/strcat.cc | 252 + src/kudu/gutil/strings/strcat.h | 380 + src/kudu/gutil/strings/string_util-test.cc | 58 + src/kudu/gutil/strings/stringpiece.cc | 224 + src/kudu/gutil/strings/stringpiece.h | 377 + src/kudu/gutil/strings/strip.cc | 384 + src/kudu/gutil/strings/strip.h | 272 + src/kudu/gutil/strings/substitute.cc | 133 + src/kudu/gutil/strings/substitute.h | 192 + src/kudu/gutil/strings/util.cc | 1218 +++ src/kudu/gutil/strings/util.h | 514 ++ src/kudu/gutil/strtoint.cc | 47 + src/kudu/gutil/strtoint.h | 93 + src/kudu/gutil/synchronization_profiling.h | 51 + src/kudu/gutil/sysinfo.cc | 412 + src/kudu/gutil/sysinfo.h | 55 + src/kudu/gutil/template_util.h | 164 + src/kudu/gutil/thread_annotations.h | 236 + 
.../threading/thread_collision_warner.cc | 82 + .../gutil/threading/thread_collision_warner.h | 248 + src/kudu/gutil/tuple.h | 1291 +++ src/kudu/gutil/type_traits.h | 363 + src/kudu/gutil/utf/LICENSE | 13 + src/kudu/gutil/utf/rune.c | 350 + src/kudu/gutil/utf/utf.h | 233 + src/kudu/gutil/utf/utfdef.h | 14 + src/kudu/gutil/valgrind.h | 3924 +++++++++ src/kudu/gutil/walltime.cc | 208 + src/kudu/gutil/walltime.h | 179 + src/kudu/integration-tests/CMakeLists.txt | 68 + src/kudu/integration-tests/all_types-itest.cc | 451 ++ .../alter_table-randomized-test.cc | 446 ++ .../integration-tests/alter_table-test.cc | 1006 +++ .../integration-tests/client-stress-test.cc | 282 + .../client_failover-itest.cc | 155 + .../integration-tests/cluster_itest_util.cc | 803 ++ .../integration-tests/cluster_itest_util.h | 303 + .../integration-tests/cluster_verifier.cc | 155 + src/kudu/integration-tests/cluster_verifier.h | 97 + .../integration-tests/create-table-itest.cc | 190 + .../create-table-stress-test.cc | 319 + .../integration-tests/delete_table-test.cc | 1076 +++ .../external_mini_cluster-itest-base.h | 93 + .../external_mini_cluster-test.cc | 125 + .../external_mini_cluster.cc | 870 ++ .../integration-tests/external_mini_cluster.h | 428 + .../external_mini_cluster_fs_inspector.cc | 350 + .../external_mini_cluster_fs_inspector.h | 118 + .../flex_partitioning-itest.cc | 571 ++ .../full_stack-insert-scan-test.cc | 463 ++ .../integration-tests/linked_list-test-util.h | 868 ++ .../integration-tests/linked_list-test.cc | 311 + .../master_failover-itest.cc | 261 + .../master_replication-itest.cc | 213 + src/kudu/integration-tests/mini_cluster.cc | 318 + src/kudu/integration-tests/mini_cluster.h | 187 + .../integration-tests/raft_consensus-itest.cc | 2467 ++++++ .../integration-tests/registration-test.cc | 164 + .../remote_bootstrap-itest.cc | 686 ++ .../tablet_replacement-itest.cc | 309 + src/kudu/integration-tests/test_workload.cc | 245 + src/kudu/integration-tests/test_workload.h | 
156 + src/kudu/integration-tests/ts_itest-base.h | 464 ++ .../integration-tests/ts_recovery-itest.cc | 138 + .../ts_tablet_manager-itest.cc | 196 + .../update_scan_delta_compact-test.cc | 313 + src/kudu/master/CMakeLists.txt | 84 + src/kudu/master/README | 238 + src/kudu/master/catalog_manager-test.cc | 110 + src/kudu/master/catalog_manager.cc | 3338 ++++++++ src/kudu/master/catalog_manager.h | 676 ++ src/kudu/master/master-path-handlers.cc | 454 ++ src/kudu/master/master-path-handlers.h | 88 + src/kudu/master/master-test-util.h | 124 + src/kudu/master/master-test.cc | 456 ++ src/kudu/master/master.cc | 264 + src/kudu/master/master.h | 146 + src/kudu/master/master.proto | 577 ++ src/kudu/master/master_main.cc | 77 + src/kudu/master/master_options.cc | 64 + src/kudu/master/master_options.h | 41 + src/kudu/master/master_rpc.cc | 221 + src/kudu/master/master_rpc.h | 146 + src/kudu/master/master_service.cc | 383 + src/kudu/master/master_service.h | 96 + src/kudu/master/mini_master.cc | 148 + src/kudu/master/mini_master.h | 94 + src/kudu/master/sys_catalog-test.cc | 370 + src/kudu/master/sys_catalog.cc | 617 ++ src/kudu/master/sys_catalog.h | 204 + src/kudu/master/ts_descriptor.cc | 231 + src/kudu/master/ts_descriptor.h | 155 + src/kudu/master/ts_manager.cc | 120 + src/kudu/master/ts_manager.h | 100 + src/kudu/rpc/CMakeLists.txt | 117 + src/kudu/rpc/README | 361 + src/kudu/rpc/acceptor_pool.cc | 152 + src/kudu/rpc/acceptor_pool.h | 79 + src/kudu/rpc/auth_store.cc | 61 + src/kudu/rpc/auth_store.h | 71 + src/kudu/rpc/blocking_ops.cc | 116 + src/kudu/rpc/blocking_ops.h | 64 + src/kudu/rpc/connection.cc | 619 ++ src/kudu/rpc/connection.h | 288 + src/kudu/rpc/constants.cc | 28 + src/kudu/rpc/constants.h | 52 + src/kudu/rpc/inbound_call.cc | 278 + src/kudu/rpc/inbound_call.h | 217 + src/kudu/rpc/messenger.cc | 307 + src/kudu/rpc/messenger.h | 275 + src/kudu/rpc/mt-rpc-test.cc | 291 + src/kudu/rpc/negotiation.cc | 240 + src/kudu/rpc/negotiation.h | 38 + 
src/kudu/rpc/outbound_call.cc | 488 ++ src/kudu/rpc/outbound_call.h | 365 + src/kudu/rpc/protoc-gen-krpc.cc | 685 ++ src/kudu/rpc/proxy.cc | 115 + src/kudu/rpc/proxy.h | 119 + src/kudu/rpc/reactor-test.cc | 98 + src/kudu/rpc/reactor.cc | 667 ++ src/kudu/rpc/reactor.h | 353 + src/kudu/rpc/remote_method.cc | 49 + src/kudu/rpc/remote_method.h | 51 + src/kudu/rpc/response_callback.h | 31 + src/kudu/rpc/rpc-bench.cc | 131 + src/kudu/rpc/rpc-test-base.h | 427 + src/kudu/rpc/rpc-test.cc | 515 ++ src/kudu/rpc/rpc.cc | 95 + src/kudu/rpc/rpc.h | 149 + src/kudu/rpc/rpc_context.cc | 184 + src/kudu/rpc/rpc_context.h | 187 + src/kudu/rpc/rpc_controller.cc | 96 + src/kudu/rpc/rpc_controller.h | 133 + src/kudu/rpc/rpc_header.proto | 172 + src/kudu/rpc/rpc_introspection.proto | 54 + src/kudu/rpc/rpc_service.h | 41 + src/kudu/rpc/rpc_sidecar.h | 70 + src/kudu/rpc/rpc_stub-test.cc | 433 + src/kudu/rpc/rtest.proto | 101 + src/kudu/rpc/rtest_diff_package.proto | 25 + src/kudu/rpc/sasl_client.cc | 492 ++ src/kudu/rpc/sasl_client.h | 167 + src/kudu/rpc/sasl_common.cc | 221 + src/kudu/rpc/sasl_common.h | 100 + src/kudu/rpc/sasl_helper.cc | 198 + src/kudu/rpc/sasl_helper.h | 132 + src/kudu/rpc/sasl_rpc-test.cc | 218 + src/kudu/rpc/sasl_server.cc | 447 ++ src/kudu/rpc/sasl_server.h | 167 + src/kudu/rpc/serialization.cc | 200 + src/kudu/rpc/serialization.h | 88 + src/kudu/rpc/service_if.cc | 78 + src/kudu/rpc/service_if.h | 64 + src/kudu/rpc/service_pool.cc | 183 + src/kudu/rpc/service_pool.h | 90 + src/kudu/rpc/transfer.cc | 232 + src/kudu/rpc/transfer.h | 168 + src/kudu/scripts/benchmarks.sh | 606 ++ src/kudu/scripts/compare-hbase-kudu.R | 80 + src/kudu/scripts/ensure_cpu_scaling.sh | 24 + src/kudu/scripts/get-job-stats-from-mysql.py | 42 + src/kudu/scripts/graph-metrics.py | 78 + src/kudu/scripts/jobs_runtime.R | 48 + src/kudu/scripts/mt-tablet-test-graph.R | 101 + src/kudu/scripts/multiplot.R | 65 + src/kudu/scripts/parse_metrics_log.py | 183 + src/kudu/scripts/parse_real_out.sh | 26 + 
src/kudu/scripts/si_vec.R | 43 + src/kudu/scripts/tpch.sh | 146 + src/kudu/scripts/write-jobs-stats-to-mysql.py | 44 + src/kudu/server/CMakeLists.txt | 105 + src/kudu/server/clock.h | 102 + src/kudu/server/default-path-handlers.cc | 229 + src/kudu/server/default-path-handlers.h | 49 + src/kudu/server/generic_service.cc | 151 + src/kudu/server/generic_service.h | 60 + src/kudu/server/glog_metrics.cc | 86 + src/kudu/server/glog_metrics.h | 52 + src/kudu/server/hybrid_clock-test.cc | 235 + src/kudu/server/hybrid_clock.cc | 493 ++ src/kudu/server/hybrid_clock.h | 225 + src/kudu/server/logical_clock-test.cc | 87 + src/kudu/server/logical_clock.cc | 107 + src/kudu/server/logical_clock.h | 91 + src/kudu/server/metadata.h | 26 + src/kudu/server/monitored_task.h | 61 + src/kudu/server/pprof-path-handlers.cc | 252 + src/kudu/server/pprof-path-handlers.h | 40 + src/kudu/server/rpc_server.cc | 196 + src/kudu/server/rpc_server.h | 95 + src/kudu/server/rpcz-path-handler.cc | 63 + src/kudu/server/rpcz-path-handler.h | 34 + src/kudu/server/server_base.cc | 333 + src/kudu/server/server_base.h | 136 + src/kudu/server/server_base.proto | 131 + src/kudu/server/server_base_options.cc | 52 + src/kudu/server/server_base_options.h | 54 + src/kudu/server/tcmalloc_metrics.cc | 106 + src/kudu/server/tcmalloc_metrics.h | 34 + src/kudu/server/tracing-path-handlers.cc | 266 + src/kudu/server/tracing-path-handlers.h | 41 + src/kudu/server/webserver-test.cc | 163 + src/kudu/server/webserver.cc | 454 ++ src/kudu/server/webserver.h | 169 + src/kudu/server/webserver_options.cc | 92 + src/kudu/server/webserver_options.h | 42 + src/kudu/server/webui_util.cc | 177 + src/kudu/server/webui_util.h | 41 + src/kudu/tablet/CMakeLists.txt | 118 + src/kudu/tablet/README | 759 ++ src/kudu/tablet/cbtree-test.cc | 782 ++ src/kudu/tablet/cfile_set-test.cc | 316 + src/kudu/tablet/cfile_set.cc | 486 ++ src/kudu/tablet/cfile_set.h | 233 + src/kudu/tablet/compaction-policy.txt | 397 + 
src/kudu/tablet/compaction-test.cc | 818 ++ src/kudu/tablet/compaction.cc | 1021 +++ src/kudu/tablet/compaction.h | 176 + src/kudu/tablet/compaction.txt | 95 + src/kudu/tablet/compaction_policy-test.cc | 56 + src/kudu/tablet/compaction_policy.cc | 347 + src/kudu/tablet/compaction_policy.h | 98 + src/kudu/tablet/composite-pushdown-test.cc | 388 + src/kudu/tablet/concurrent_btree.h | 1800 +++++ src/kudu/tablet/delta_applier.cc | 107 + src/kudu/tablet/delta_applier.h | 85 + src/kudu/tablet/delta_compaction-test.cc | 212 + src/kudu/tablet/delta_compaction.cc | 351 + src/kudu/tablet/delta_compaction.h | 140 + src/kudu/tablet/delta_iterator_merger.cc | 161 + src/kudu/tablet/delta_iterator_merger.h | 71 + src/kudu/tablet/delta_key.cc | 38 + src/kudu/tablet/delta_key.h | 133 + src/kudu/tablet/delta_stats.cc | 124 + src/kudu/tablet/delta_stats.h | 106 + src/kudu/tablet/delta_store.cc | 136 + src/kudu/tablet/delta_store.h | 208 + src/kudu/tablet/delta_tracker.cc | 532 ++ src/kudu/tablet/delta_tracker.h | 269 + src/kudu/tablet/deltafile-test.cc | 372 + src/kudu/tablet/deltafile.cc | 842 ++ src/kudu/tablet/deltafile.h | 314 + src/kudu/tablet/deltamemstore-test.cc | 504 ++ src/kudu/tablet/deltamemstore.cc | 371 + src/kudu/tablet/deltamemstore.h | 280 + src/kudu/tablet/diskrowset-test-base.h | 338 + src/kudu/tablet/diskrowset-test.cc | 540 ++ src/kudu/tablet/diskrowset.cc | 742 ++ src/kudu/tablet/diskrowset.h | 413 + src/kudu/tablet/local_tablet_writer.h | 139 + src/kudu/tablet/lock_manager-test.cc | 287 + src/kudu/tablet/lock_manager.cc | 399 + src/kudu/tablet/lock_manager.h | 131 + src/kudu/tablet/maintenance_manager-test.cc | 286 + src/kudu/tablet/maintenance_manager.cc | 412 + src/kudu/tablet/maintenance_manager.h | 280 + .../tablet/major_delta_compaction-test.cc | 364 + src/kudu/tablet/memrowset-test.cc | 523 ++ src/kudu/tablet/memrowset.cc | 630 ++ src/kudu/tablet/memrowset.h | 515 ++ src/kudu/tablet/metadata-test.cc | 131 + src/kudu/tablet/metadata.proto | 152 + 
src/kudu/tablet/mock-rowsets.h | 173 + src/kudu/tablet/mt-diskrowset-test.cc | 114 + .../tablet/mt-rowset_delta_compaction-test.cc | 205 + src/kudu/tablet/mt-tablet-test.cc | 470 ++ src/kudu/tablet/multi_column_writer.cc | 144 + src/kudu/tablet/multi_column_writer.h | 96 + src/kudu/tablet/mutation.cc | 90 + src/kudu/tablet/mutation.h | 113 + src/kudu/tablet/mvcc-test.cc | 619 ++ src/kudu/tablet/mvcc.cc | 584 ++ src/kudu/tablet/mvcc.h | 466 ++ src/kudu/tablet/row_op.cc | 59 + src/kudu/tablet/row_op.h | 73 + src/kudu/tablet/rowset.cc | 235 + src/kudu/tablet/rowset.h | 331 + src/kudu/tablet/rowset_info.cc | 308 + src/kudu/tablet/rowset_info.h | 88 + src/kudu/tablet/rowset_metadata.cc | 267 + src/kudu/tablet/rowset_metadata.h | 265 + src/kudu/tablet/rowset_tree-test.cc | 180 + src/kudu/tablet/rowset_tree.cc | 187 + src/kudu/tablet/rowset_tree.h | 115 + src/kudu/tablet/schema-change.txt | 107 + src/kudu/tablet/svg_dump.cc | 200 + src/kudu/tablet/svg_dump.h | 46 + src/kudu/tablet/tablet-harness.h | 145 + src/kudu/tablet/tablet-pushdown-test.cc | 199 + src/kudu/tablet/tablet-schema-test.cc | 299 + src/kudu/tablet/tablet-test-base.h | 475 ++ src/kudu/tablet/tablet-test-util.h | 270 + src/kudu/tablet/tablet-test.cc | 962 +++ src/kudu/tablet/tablet.cc | 1777 +++++ src/kudu/tablet/tablet.h | 630 ++ src/kudu/tablet/tablet.proto | 127 + src/kudu/tablet/tablet_bootstrap-test.cc | 555 ++ src/kudu/tablet/tablet_bootstrap.cc | 1478 ++++ src/kudu/tablet/tablet_bootstrap.h | 107 + src/kudu/tablet/tablet_metadata-test.cc | 98 + src/kudu/tablet/tablet_metadata.cc | 636 ++ src/kudu/tablet/tablet_metadata.h | 349 + src/kudu/tablet/tablet_metrics.cc | 270 + src/kudu/tablet/tablet_metrics.h | 107 + src/kudu/tablet/tablet_mm_ops-test.cc | 116 + src/kudu/tablet/tablet_mm_ops.h | 120 + src/kudu/tablet/tablet_peer-test.cc | 568 ++ src/kudu/tablet/tablet_peer.cc | 652 ++ src/kudu/tablet/tablet_peer.h | 353 + src/kudu/tablet/tablet_peer_mm_ops.cc | 238 + src/kudu/tablet/tablet_peer_mm_ops.h | 
133 + src/kudu/tablet/tablet_random_access-test.cc | 567 ++ src/kudu/tablet/transaction_order_verifier.cc | 49 + src/kudu/tablet/transaction_order_verifier.h | 91 + .../transactions/alter_schema_transaction.cc | 145 + .../transactions/alter_schema_transaction.h | 137 + src/kudu/tablet/transactions/transaction.cc | 88 + src/kudu/tablet/transactions/transaction.h | 344 + .../tablet/transactions/transaction_driver.cc | 485 ++ .../tablet/transactions/transaction_driver.h | 260 + .../transactions/transaction_tracker-test.cc | 257 + .../transactions/transaction_tracker.cc | 255 + .../tablet/transactions/transaction_tracker.h | 116 + .../tablet/transactions/write_transaction.cc | 375 + .../tablet/transactions/write_transaction.h | 277 + .../tablet/triggering-maintenance-ops.txt | 211 + src/kudu/tools/CMakeLists.txt | 107 + src/kudu/tools/README.systemtap | 53 + src/kudu/tools/create-demo-table.cc | 120 + src/kudu/tools/data_gen_util.cc | 80 + src/kudu/tools/data_gen_util.h | 53 + src/kudu/tools/fs_dump-tool.cc | 213 + src/kudu/tools/fs_list-tool.cc | 153 + src/kudu/tools/fs_tool.cc | 577 ++ src/kudu/tools/fs_tool.h | 158 + src/kudu/tools/insert-generated-rows.cc | 129 + src/kudu/tools/ksck-test.cc | 258 + src/kudu/tools/ksck.cc | 497 ++ src/kudu/tools/ksck.h | 311 + src/kudu/tools/ksck_remote-test.cc | 303 + src/kudu/tools/ksck_remote.cc | 342 + src/kudu/tools/ksck_remote.h | 116 + src/kudu/tools/kudu-admin-test.cc | 190 + src/kudu/tools/kudu-admin.cc | 426 + src/kudu/tools/kudu-ksck.cc | 148 + src/kudu/tools/kudu-ts-cli-test.cc | 98 + src/kudu/tools/parse_debug_refcounted.pl | 92 + src/kudu/tools/pbc-dump.cc | 71 + src/kudu/tools/trace_io.stp | 242 + src/kudu/tools/ts-cli.cc | 493 ++ src/kudu/tserver/CMakeLists.txt | 172 + src/kudu/tserver/heartbeater.cc | 464 ++ src/kudu/tserver/heartbeater.h | 54 + src/kudu/tserver/mini_tablet_server.cc | 154 + src/kudu/tserver/mini_tablet_server.h | 105 + src/kudu/tserver/remote_bootstrap-test-base.h | 126 + 
src/kudu/tserver/remote_bootstrap.proto | 201 + .../tserver/remote_bootstrap_client-test.cc | 242 + src/kudu/tserver/remote_bootstrap_client.cc | 552 ++ src/kudu/tserver/remote_bootstrap_client.h | 215 + .../tserver/remote_bootstrap_service-test.cc | 459 ++ src/kudu/tserver/remote_bootstrap_service.cc | 359 + src/kudu/tserver/remote_bootstrap_service.h | 111 + .../tserver/remote_bootstrap_session-test.cc | 327 + src/kudu/tserver/remote_bootstrap_session.cc | 371 + src/kudu/tserver/remote_bootstrap_session.h | 190 + src/kudu/tserver/scanner_metrics.cc | 49 + src/kudu/tserver/scanner_metrics.h | 51 + src/kudu/tserver/scanners-test.cc | 85 + src/kudu/tserver/scanners.cc | 223 + src/kudu/tserver/scanners.h | 329 + src/kudu/tserver/tablet_peer_lookup.h | 58 + src/kudu/tserver/tablet_server-stress-test.cc | 121 + src/kudu/tserver/tablet_server-test-base.h | 475 ++ src/kudu/tserver/tablet_server-test.cc | 2277 ++++++ src/kudu/tserver/tablet_server.cc | 147 + src/kudu/tserver/tablet_server.h | 125 + src/kudu/tserver/tablet_server_main.cc | 73 + src/kudu/tserver/tablet_server_options.cc | 50 + src/kudu/tserver/tablet_server_options.h | 42 + src/kudu/tserver/tablet_server_test_util.cc | 46 + src/kudu/tserver/tablet_server_test_util.h | 54 + src/kudu/tserver/tablet_service.cc | 1697 ++++ src/kudu/tserver/tablet_service.h | 170 + src/kudu/tserver/ts_tablet_manager-test.cc | 241 + src/kudu/tserver/ts_tablet_manager.cc | 998 +++ src/kudu/tserver/ts_tablet_manager.h | 366 + src/kudu/tserver/tserver-path-handlers.cc | 570 ++ src/kudu/tserver/tserver-path-handlers.h | 83 + src/kudu/tserver/tserver.proto | 329 + src/kudu/tserver/tserver_admin.proto | 135 + src/kudu/tserver/tserver_service.proto | 70 + src/kudu/twitter-demo/CMakeLists.txt | 64 + src/kudu/twitter-demo/README | 24 + src/kudu/twitter-demo/example-deletes.txt | 163 + src/kudu/twitter-demo/example-tweets.txt | 505 ++ src/kudu/twitter-demo/ingest_firehose.cc | 125 + src/kudu/twitter-demo/insert_consumer.cc | 161 + 
src/kudu/twitter-demo/insert_consumer.h | 92 + src/kudu/twitter-demo/oauth-test.cc | 65 + src/kudu/twitter-demo/oauth.cc | 127 + src/kudu/twitter-demo/oauth.h | 68 + src/kudu/twitter-demo/parser-test.cc | 84 + src/kudu/twitter-demo/parser.cc | 103 + src/kudu/twitter-demo/parser.h | 86 + src/kudu/twitter-demo/twitter-schema.h | 86 + src/kudu/twitter-demo/twitter_streamer.cc | 202 + src/kudu/twitter-demo/twitter_streamer.h | 66 + src/kudu/util/CMakeLists.txt | 362 + src/kudu/util/alignment.h | 28 + src/kudu/util/async_util.h | 72 + src/kudu/util/atomic-test.cc | 131 + src/kudu/util/atomic.cc | 55 + src/kudu/util/atomic.h | 320 + src/kudu/util/auto_release_pool.h | 99 + src/kudu/util/bit-stream-utils.h | 163 + src/kudu/util/bit-stream-utils.inline.h | 222 + src/kudu/util/bit-util-test.cc | 50 + src/kudu/util/bit-util.h | 59 + src/kudu/util/bitmap-test.cc | 223 + src/kudu/util/bitmap.cc | 132 + src/kudu/util/bitmap.h | 212 + src/kudu/util/blocking_queue-test.cc | 214 + src/kudu/util/blocking_queue.h | 240 + src/kudu/util/bloom_filter-test.cc | 87 + src/kudu/util/bloom_filter.cc | 86 + src/kudu/util/bloom_filter.h | 248 + src/kudu/util/boost_mutex_utils.h | 45 + src/kudu/util/cache-test.cc | 246 + src/kudu/util/cache.cc | 488 ++ src/kudu/util/cache.h | 158 + src/kudu/util/cache_metrics.cc | 69 + src/kudu/util/cache_metrics.h | 47 + src/kudu/util/callback_bind-test.cc | 110 + src/kudu/util/coding-inl.h | 117 + src/kudu/util/coding.cc | 141 + src/kudu/util/coding.h | 110 + src/kudu/util/condition_variable.cc | 140 + src/kudu/util/condition_variable.h | 113 + src/kudu/util/countdown_latch-test.cc | 71 + src/kudu/util/countdown_latch.h | 139 + src/kudu/util/cow_object.h | 219 + src/kudu/util/crc-test.cc | 91 + src/kudu/util/crc.cc | 50 + src/kudu/util/crc.h | 39 + src/kudu/util/curl_util.cc | 90 + src/kudu/util/curl_util.h | 63 + src/kudu/util/debug-util-test.cc | 149 + src/kudu/util/debug-util.cc | 388 + src/kudu/util/debug-util.h | 163 + 
src/kudu/util/debug/leak_annotations.h | 53 + src/kudu/util/debug/leakcheck_disabler.h | 47 + src/kudu/util/debug/sanitizer_scopes.h | 47 + src/kudu/util/debug/trace_event.h | 1500 ++++ src/kudu/util/debug/trace_event_impl.cc | 2416 ++++++ src/kudu/util/debug/trace_event_impl.h | 717 ++ .../util/debug/trace_event_impl_constants.cc | 14 + src/kudu/util/debug/trace_event_memory.h | 28 + .../util/debug/trace_event_synthetic_delay.cc | 232 + .../util/debug/trace_event_synthetic_delay.h | 162 + src/kudu/util/debug/trace_logging.h | 118 + src/kudu/util/debug_ref_counted.h | 56 + src/kudu/util/env-test.cc | 705 ++ src/kudu/util/env.cc | 88 + src/kudu/util/env.h | 602 ++ src/kudu/util/env_posix.cc | 1139 +++ src/kudu/util/env_util.cc | 157 + src/kudu/util/env_util.h | 97 + src/kudu/util/errno-test.cc | 49 + src/kudu/util/errno.cc | 46 + src/kudu/util/errno.h | 35 + src/kudu/util/failure_detector-test.cc | 112 + src/kudu/util/failure_detector.cc | 213 + src/kudu/util/failure_detector.h | 179 + src/kudu/util/faststring.cc | 57 + src/kudu/util/faststring.h | 242 + src/kudu/util/fault_injection.cc | 83 + src/kudu/util/fault_injection.h | 66 + src/kudu/util/flag_tags-test.cc | 61 + src/kudu/util/flag_tags.cc | 88 + src/kudu/util/flag_tags.h | 166 + src/kudu/util/flags.cc | 287 + src/kudu/util/flags.h | 39 + src/kudu/util/group_varint-inl.h | 268 + src/kudu/util/group_varint-test.cc | 135 + src/kudu/util/group_varint.cc | 78 + src/kudu/util/hash_util-test.cc | 40 + src/kudu/util/hash_util.h | 68 + src/kudu/util/hdr_histogram-test.cc | 113 + src/kudu/util/hdr_histogram.cc | 490 ++ src/kudu/util/hdr_histogram.h | 344 + src/kudu/util/hexdump.cc | 74 + src/kudu/util/hexdump.h | 33 + src/kudu/util/high_water_mark.h | 85 + src/kudu/util/histogram.proto | 47 + src/kudu/util/init.cc | 54 + src/kudu/util/init.h | 34 + src/kudu/util/inline_slice-test.cc | 84 + src/kudu/util/inline_slice.h | 182 + src/kudu/util/interval_tree-inl.h | 313 + src/kudu/util/interval_tree-test.cc | 201 + 
src/kudu/util/interval_tree.h | 114 + src/kudu/util/jsonreader-test.cc | 170 + src/kudu/util/jsonreader.cc | 124 + src/kudu/util/jsonreader.h | 89 + src/kudu/util/jsonwriter-test.cc | 126 + src/kudu/util/jsonwriter.cc | 319 + src/kudu/util/jsonwriter.h | 96 + src/kudu/util/jsonwriter_test.proto | 72 + src/kudu/util/kernel_stack_watchdog.cc | 175 + src/kudu/util/kernel_stack_watchdog.h | 248 + src/kudu/util/knapsack_solver-test.cc | 168 + src/kudu/util/knapsack_solver.h | 269 + src/kudu/util/locks.cc | 42 + src/kudu/util/locks.h | 331 + src/kudu/util/logging-test.cc | 51 + src/kudu/util/logging.cc | 279 + src/kudu/util/logging.h | 235 + src/kudu/util/logging_callback.h | 46 + src/kudu/util/logging_test_util.h | 60 + src/kudu/util/malloc.cc | 35 + src/kudu/util/malloc.h | 32 + src/kudu/util/map-util-test.cc | 48 + src/kudu/util/mem_tracker-test.cc | 340 + src/kudu/util/mem_tracker.cc | 579 ++ src/kudu/util/mem_tracker.h | 420 + src/kudu/util/memcmpable_varint-test.cc | 207 + src/kudu/util/memcmpable_varint.cc | 257 + src/kudu/util/memcmpable_varint.h | 43 + src/kudu/util/memenv/memenv-test.cc | 312 + src/kudu/util/memenv/memenv.cc | 616 ++ src/kudu/util/memenv/memenv.h | 20 + src/kudu/util/memory/arena-test.cc | 190 + src/kudu/util/memory/arena.cc | 173 + src/kudu/util/memory/arena.h | 495 ++ src/kudu/util/memory/memory.cc | 365 + src/kudu/util/memory/memory.h | 978 +++ src/kudu/util/metrics-test.cc | 310 + src/kudu/util/metrics.cc | 683 ++ src/kudu/util/metrics.h | 1075 +++ src/kudu/util/monotime-test.cc | 201 + src/kudu/util/monotime.cc | 259 + src/kudu/util/monotime.h | 141 + src/kudu/util/mt-hdr_histogram-test.cc | 111 + src/kudu/util/mt-metrics-test.cc | 121 + src/kudu/util/mt-threadlocal-test.cc | 322 + src/kudu/util/mutex.cc | 107 + src/kudu/util/mutex.h | 138 + src/kudu/util/net/dns_resolver-test.cc | 55 + src/kudu/util/net/dns_resolver.cc | 63 + src/kudu/util/net/dns_resolver.h | 63 + src/kudu/util/net/net_util-test.cc | 137 + src/kudu/util/net/net_util.cc | 
285 + src/kudu/util/net/net_util.h | 112 + src/kudu/util/net/sockaddr.cc | 136 + src/kudu/util/net/sockaddr.h | 89 + src/kudu/util/net/socket.cc | 558 ++ src/kudu/util/net/socket.h | 152 + src/kudu/util/nvm_cache.cc | 592 ++ src/kudu/util/nvm_cache.h | 30 + src/kudu/util/object_pool-test.cc | 84 + src/kudu/util/object_pool.h | 168 + src/kudu/util/oid_generator.cc | 35 + src/kudu/util/oid_generator.h | 49 + src/kudu/util/once-test.cc | 110 + src/kudu/util/once.cc | 32 + src/kudu/util/once.h | 110 + src/kudu/util/os-util-test.cc | 59 + src/kudu/util/os-util.cc | 148 + src/kudu/util/os-util.h | 65 + src/kudu/util/path_util-test.cc | 61 + src/kudu/util/path_util.cc | 63 + src/kudu/util/path_util.h | 40 + src/kudu/util/pb_util-internal.cc | 106 + src/kudu/util/pb_util-internal.h | 123 + src/kudu/util/pb_util-test.cc | 424 + src/kudu/util/pb_util.cc | 664 ++ src/kudu/util/pb_util.h | 309 + src/kudu/util/pb_util.proto | 40 + src/kudu/util/promise.h | 79 + src/kudu/util/proto_container_test.proto | 24 + src/kudu/util/proto_container_test2.proto | 28 + src/kudu/util/proto_container_test3.proto | 32 + src/kudu/util/protobuf-annotations.h | 33 + src/kudu/util/protobuf_util.h | 39 + src/kudu/util/protoc-gen-insertions.cc | 72 + src/kudu/util/pstack_watcher-test.cc | 85 + src/kudu/util/pstack_watcher.cc | 189 + src/kudu/util/pstack_watcher.h | 93 + src/kudu/util/random-test.cc | 163 + src/kudu/util/random.h | 232 + src/kudu/util/random_util-test.cc | 73 + src/kudu/util/random_util.cc | 53 + src/kudu/util/random_util.h | 39 + src/kudu/util/resettable_heartbeater-test.cc | 105 + src/kudu/util/resettable_heartbeater.cc | 178 + src/kudu/util/resettable_heartbeater.h | 79 + src/kudu/util/rle-encoding.h | 536 ++ src/kudu/util/rle-test.cc | 527 ++ src/kudu/util/rolling_log-test.cc | 121 + src/kudu/util/rolling_log.cc | 255 + src/kudu/util/rolling_log.h | 107 + src/kudu/util/rw_semaphore-test.cc | 90 + src/kudu/util/rw_semaphore.h | 195 + src/kudu/util/rwc_lock-test.cc | 143 + 
src/kudu/util/rwc_lock.cc | 123 + src/kudu/util/rwc_lock.h | 136 + src/kudu/util/safe_math-test.cc | 56 + src/kudu/util/safe_math.h | 69 + src/kudu/util/semaphore.cc | 95 + src/kudu/util/semaphore.h | 76 + src/kudu/util/semaphore_macosx.cc | 72 + src/kudu/util/slice-test.cc | 56 + src/kudu/util/slice.cc | 73 + src/kudu/util/slice.h | 214 + src/kudu/util/spinlock_profiling-test.cc | 74 + src/kudu/util/spinlock_profiling.cc | 299 + src/kudu/util/spinlock_profiling.h | 72 + src/kudu/util/stack_watchdog-test.cc | 107 + src/kudu/util/status-test.cc | 115 + src/kudu/util/status.cc | 162 + src/kudu/util/status.h | 359 + src/kudu/util/status_callback.cc | 27 + src/kudu/util/status_callback.h | 43 + src/kudu/util/stopwatch.h | 327 + src/kudu/util/string_case-test.cc | 63 + src/kudu/util/string_case.cc | 73 + src/kudu/util/string_case.h | 48 + src/kudu/util/striped64-test.cc | 152 + src/kudu/util/striped64.cc | 175 + src/kudu/util/striped64.h | 178 + src/kudu/util/subprocess-test.cc | 108 + src/kudu/util/subprocess.cc | 448 ++ src/kudu/util/subprocess.h | 152 + src/kudu/util/sync_point-test.cc | 59 + src/kudu/util/sync_point.cc | 92 + src/kudu/util/sync_point.h | 100 + src/kudu/util/test_graph.cc | 115 + src/kudu/util/test_graph.h | 87 + src/kudu/util/test_macros.h | 78 + src/kudu/util/test_main.cc | 76 + src/kudu/util/test_util.cc | 176 + src/kudu/util/test_util.h | 83 + src/kudu/util/thread-test.cc | 152 + src/kudu/util/thread.cc | 606 ++ src/kudu/util/thread.h | 343 + src/kudu/util/thread_restrictions.cc | 81 + src/kudu/util/thread_restrictions.h | 121 + src/kudu/util/threadlocal.cc | 71 + src/kudu/util/threadlocal.h | 143 + src/kudu/util/threadpool-test.cc | 293 + src/kudu/util/threadpool.cc | 351 + src/kudu/util/threadpool.h | 224 + src/kudu/util/trace-test.cc | 825 ++ src/kudu/util/trace.cc | 204 + src/kudu/util/trace.h | 186 + src/kudu/util/url-coding-test.cc | 120 + src/kudu/util/url-coding.cc | 214 + src/kudu/util/url-coding.h | 82 + src/kudu/util/user-test.cc | 42 
+ src/kudu/util/user.cc | 68 + src/kudu/util/user.h | 32 + src/kudu/util/version_info.cc | 79 + src/kudu/util/version_info.h | 48 + src/kudu/util/version_info.proto | 31 + src/kudu/util/web_callback_registry.h | 66 + thirdparty/.gitignore | 31 + thirdparty/LICENSE.txt | 630 ++ thirdparty/README.txt | 6 + .../boost_uuid/boost/uuid/name_generator.hpp | 125 + .../boost_uuid/boost/uuid/nil_generator.hpp | 34 + .../boost/uuid/random_generator.hpp | 118 + thirdparty/boost_uuid/boost/uuid/seed_rng.hpp | 262 + thirdparty/boost_uuid/boost/uuid/sha1.hpp | 208 + .../boost/uuid/string_generator.hpp | 184 + thirdparty/boost_uuid/boost/uuid/uuid.hpp | 221 + .../boost_uuid/boost/uuid/uuid_generators.hpp | 19 + thirdparty/boost_uuid/boost/uuid/uuid_io.hpp | 198 + .../boost_uuid/boost/uuid/uuid_serialize.hpp | 20 + thirdparty/build-definitions.sh | 320 + thirdparty/build-if-necessary.sh | 86 + thirdparty/build-thirdparty.sh | 342 + thirdparty/download-thirdparty.sh | 224 + .../crcutil-fix-libtoolize-on-osx.patch | 16 + .../glog-issue-198-fix-unused-warnings.patch | 47 + ...ault-TCMALLOC_TRANSFER_NUM_OBJ-to-40.patch | 34 + ...ock-on-OSX-instead-of-pthread_atfork.patch | 72 + .../patches/libstdcxx-fix-string-dtor.patch | 54 + .../libstdcxx-fix-tr1-shared-ptr.patch | 21 + .../patches/llvm-devtoolset-toolchain.patch | 16 + .../patches/llvm-fix-amazon-linux.patch | 22 + ...make-build-to-use-gnu-flags-on-clang.patch | 51 + thirdparty/vars.sh | 134 + version.txt | 1 + www/bootstrap/css/bootstrap-responsive.css | 1088 +++ .../css/bootstrap-responsive.min.css | 9 + www/bootstrap/css/bootstrap.css | 5893 ++++++++++++++ www/bootstrap/css/bootstrap.min.css | 9 + .../img/glyphicons-halflings-white.png | Bin 0 -> 8777 bytes www/bootstrap/img/glyphicons-halflings.png | Bin 0 -> 12799 bytes www/bootstrap/js/bootstrap.js | 2025 +++++ www/bootstrap/js/bootstrap.min.js | 6 + www/d3.v2.js | 7034 +++++++++++++++++ www/epoch.0.5.2.min.css | 1 + www/epoch.0.5.2.min.js | 92 + www/favicon.ico | Bin 0 -> 
1150 bytes www/index.html | 5 + www/jquery-1.11.1.min.js | 4 + www/kudu.css | 49 + www/logo.png | Bin 0 -> 18396 bytes www/metrics-epoch.js | 218 + www/metrics.html | 32 + 1386 files changed, 350837 insertions(+) create mode 100644 .gitignore create mode 100644 .ycm_extra_conf.py create mode 100644 CMakeLists.txt create mode 100644 CONTRIBUTING.adoc create mode 100644 DISCLAIMER create mode 100644 LICENSE.txt create mode 100644 NOTICE.txt create mode 100644 README.adoc create mode 100755 build-support/build_source_release.py create mode 100755 build-support/ccache-clang/clang create mode 100755 build-support/ccache-clang/clang++ create mode 100755 build-support/check_compatibility.py create mode 100755 build-support/dist_test.py create mode 100755 build-support/enable_devtoolset.sh create mode 100755 build-support/gen_version_info.py create mode 100755 build-support/generate_precompiled_xxd.sh create mode 100755 build-support/get-upstream-commit.sh create mode 100755 build-support/jenkins/build-and-test.sh create mode 100644 build-support/jenkins/cleanup-zombie-jenkins.sh create mode 100644 build-support/jenkins/dummy-junit.xml create mode 100755 build-support/jenkins/post-build-clean.sh create mode 100644 build-support/kudu_util.py create mode 100755 build-support/lint.sh create mode 100644 build-support/lsan-suppressions.txt create mode 100755 build-support/parse_test_failure.py create mode 100755 build-support/push_to_asf.py create mode 100755 build-support/report-test.sh create mode 100755 build-support/run-test.sh create mode 100755 build-support/run_dist_test.py create mode 100644 build-support/sanitize-blacklist.txt create mode 100755 build-support/stacktrace_addr2line.pl create mode 100755 build-support/test_result_server.py create mode 100644 build-support/tools/kudu-lint/.gitignore create mode 100644 build-support/tools/kudu-lint/CMakeLists.txt create mode 100644 build-support/tools/kudu-lint/README create mode 100644 
build-support/tools/kudu-lint/cmake_modules/FindClang.cmake create mode 100644 build-support/tools/kudu-lint/cmake_modules/FindLLVM.cmake create mode 100644 build-support/tools/kudu-lint/kudu-lint.cc create mode 100755 build-support/trigger_gerrit.py create mode 100644 build-support/tsan-suppressions.txt create mode 100644 cmake_modules/CompilerInfo.cmake create mode 100644 cmake_modules/FindBitshuffle.cmake create mode 100644 cmake_modules/FindCrcutil.cmake create mode 100644 cmake_modules/FindCyrusSASL.cmake create mode 100644 cmake_modules/FindGFlags.cmake create mode 100644 cmake_modules/FindGLog.cmake create mode 100644 cmake_modules/FindGMock.cmake create mode 100644 cmake_modules/FindGPerf.cmake create mode 100644 cmake_modules/FindGTest.cmake create mode 100644 cmake_modules/FindKRPC.cmake create mode 100644 cmake_modules/FindLibEv.cmake create mode 100644 cmake_modules/FindLibUnwind.cmake create mode 100644 cmake_modules/FindLz4.cmake create mode 100644 cmake_modules/FindProtobuf.cmake create mode 100644 cmake_modules/FindSnappy.cmake create mode 100644 cmake_modules/FindSqueasel.cmake create mode 100644 cmake_modules/FindVmem.cmake create mode 100644 cmake_modules/FindZlib.cmake create mode 100644 docs/.gitignore create mode 100644 docs/administration.adoc create mode 100644 docs/configuration.adoc create mode 100644 docs/configuration_reference.adoc create mode 100644 docs/configuration_reference_unsupported.adoc create mode 100644 docs/contributing.adoc create mode 100644 docs/developing.adoc create mode 100644 docs/images/.gitignore create mode 100644 docs/images/kudu-architecture-2.png create mode 100644 docs/index.adoc create mode 100644 docs/installation.adoc create mode 100644 docs/introduction.adoc create mode 100755 docs/kudu_impala_integration.adoc create mode 100644 docs/media-src/README create mode 100644 docs/media-src/kudu-architecture-2.pdf create mode 100644 docs/media-src/kudu-architecture.graffle create mode 100644 
docs/media-src/kudu-tablet-flush-6.graffle create mode 100644 docs/media-src/kudu-tablet-flush-6b.pdf create mode 100644 docs/quickstart.adoc create mode 100644 docs/release_notes.adoc create mode 100644 docs/schema_design.adoc create mode 100644 docs/style_guide.adoc create mode 100644 docs/support/jekyll-templates/document.html.erb create mode 100644 docs/support/scripts/Gemfile create mode 100644 docs/support/scripts/Gemfile.lock create mode 100755 docs/support/scripts/make_docs.sh create mode 100755 docs/support/scripts/make_site.sh create mode 100644 docs/support/xsl/gflags_to_asciidoc.xsl create mode 100644 docs/transaction_semantics.adoc create mode 100644 docs/troubleshooting.adoc create mode 100644 docs/whitepaper/.gitignore create mode 100644 docs/whitepaper/kudu.bib create mode 100644 docs/whitepaper/kudu.tex create mode 100644 docs/whitepaper/vldb.cls create mode 100755 docs/whitepaper/ycsb-data/combine-data.sh create mode 100755 docs/whitepaper/ycsb-data/log-to-tsv.pl create mode 100644 docs/whitepaper/ycsb-data/plots.R create mode 100755 docs/whitepaper/ycsb-data/uniform-hbase.sh create mode 100644 docs/whitepaper/ycsb-data/uniform-hbase/a.json create mode 100644 docs/whitepaper/ycsb-data/uniform-hbase/b.json create mode 100644 docs/whitepaper/ycsb-data/uniform-hbase/c.json create mode 100644 docs/whitepaper/ycsb-data/uniform-hbase/d.json create mode 100644 docs/whitepaper/ycsb-data/uniform-hbase/load.json create mode 100755 docs/whitepaper/ycsb-data/uniform-kudu.sh create mode 100644 docs/whitepaper/ycsb-data/uniform-kudu/a.json create mode 100644 docs/whitepaper/ycsb-data/uniform-kudu/b.json create mode 100644 docs/whitepaper/ycsb-data/uniform-kudu/c.json create mode 100644 docs/whitepaper/ycsb-data/uniform-kudu/d.json create mode 100644 docs/whitepaper/ycsb-data/uniform-kudu/load.json create mode 100755 docs/whitepaper/ycsb-data/zipf-hbase.sh create mode 100644 docs/whitepaper/ycsb-data/zipfian-hbase/a.json create mode 100644 
docs/whitepaper/ycsb-data/zipfian-hbase/b.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-hbase/c.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-hbase/d.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-hbase/load.json create mode 100755 docs/whitepaper/ycsb-data/zipfian-kudu.sh create mode 100644 docs/whitepaper/ycsb-data/zipfian-kudu/a.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-kudu/b.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-kudu/c.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-kudu/d.json create mode 100644 docs/whitepaper/ycsb-data/zipfian-kudu/load.json create mode 100644 docs/whitepaper/ycsb-results.pdf create mode 100644 java/.gitignore create mode 100644 java/README.md create mode 100644 java/assembly.xml create mode 100644 java/interface-annotations/pom.xml create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceAudience.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceStability.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsJDiffDoclet.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsStandardDoclet.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/tools/IncludePublicAnnotationsStandardDoclet.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/tools/RootDocProcessor.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/tools/StabilityOptions.java create mode 100644 java/interface-annotations/src/main/java/org/kududb/annotations/tools/package-info.java create mode 100644 java/kudu-client-tools/pom.xml create mode 100644 java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/CsvParser.java create mode 100644 
java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsv.java create mode 100644 java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsvMapper.java create mode 100644 java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/IntegrationTestBigLinkedList.java create mode 100644 java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/RowCounter.java create mode 100644 java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITImportCsv.java create mode 100644 java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITRowCounter.java create mode 100644 java/kudu-client/.gitignore create mode 100755 java/kudu-client/dev-support/build-proto.sh create mode 100644 java/kudu-client/pom.xml create mode 100644 java/kudu-client/src/main/java/com/google/protobuf/ZeroCopyLiteralByteString.java create mode 100644 java/kudu-client/src/main/java/org/kududb/ColumnSchema.java create mode 100644 java/kudu-client/src/main/java/org/kududb/Schema.java create mode 100644 java/kudu-client/src/main/java/org/kududb/Type.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AbstractKuduScannerBuilder.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AlterTableOptions.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AlterTableRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AlterTableResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AsyncKuduClient.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AsyncKuduScanner.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/AsyncKuduSession.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Batch.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/BatchResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Bytes.java create mode 100644 
java/kudu-client/src/main/java/org/kududb/client/CallResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ColumnRangePredicate.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ConnectionResetException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/CreateTableOptions.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/CreateTableRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/CreateTableResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/DeadlineTracker.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Delete.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/DeleteTableRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/DeleteTableResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ErrorCollector.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ExternalConsistencyMode.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationReceived.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/GetTableLocationsRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/HasFailedRpcException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/IPCUtil.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Insert.java create mode 100644 
java/kudu-client/src/main/java/org/kududb/client/InvalidResponseException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/IsAlterTableDoneRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/IsAlterTableDoneResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/IsCreateTableDoneRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KeyEncoder.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduClient.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduRpc.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduRpcResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduScanner.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduServerException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduSession.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/KuduTable.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ListTablesRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ListTablesResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ListTabletServersRequest.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ListTabletServersResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/LocatedTablet.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/MasterErrorException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/NoLeaderMasterFoundException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/NonRecoverableException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Operation.java create mode 
100644 java/kudu-client/src/main/java/org/kududb/client/OperationResponse.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/PartialRow.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Partition.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/PartitionSchema.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/PleaseThrottleException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/ProtobufHelper.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/RecoverableException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/RowError.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/RowErrorsAndOverflowStatus.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/RowResult.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/RowResultIterator.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/SecureRpcHelper.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/SessionConfiguration.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/TabletClient.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/TabletServerErrorException.java create mode 100644 java/kudu-client/src/main/java/org/kududb/client/Update.java create mode 100644 java/kudu-client/src/main/java/org/kududb/util/AsyncUtil.java create mode 100644 java/kudu-client/src/main/java/org/kududb/util/HybridTimeUtil.java create mode 100644 java/kudu-client/src/main/java/org/kududb/util/NetUtil.java create mode 100644 java/kudu-client/src/main/java/org/kududb/util/Pair.java create mode 100644 java/kudu-client/src/main/java/org/kududb/util/Slice.java create mode 100644 java/kudu-client/src/main/java/org/kududb/util/Slices.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/BaseKuduTest.java create mode 
100644 java/kudu-client/src/test/java/org/kududb/client/MiniKuduCluster.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduClient.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduSession.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestBitSet.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestBytes.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestColumnRangePredicate.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestDeadlineTracker.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestErrorCollector.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestFlexiblePartitioning.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestHybridTime.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestKeyEncoding.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestKuduClient.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestKuduSession.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestKuduTable.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestLeaderFailover.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestMasterFailover.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestOperation.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestRowErrors.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestRowResult.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestScannerMultiTablet.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestTestUtils.java create mode 100644 java/kudu-client/src/test/java/org/kududb/client/TestTimeouts.java create mode 100644 
java/kudu-client/src/test/java/org/kududb/client/TestUtils.java create mode 100644 java/kudu-client/src/test/java/org/kududb/util/TestAsyncUtil.java create mode 100644 java/kudu-client/src/test/java/org/kududb/util/TestMurmurHash.java create mode 100644 java/kudu-client/src/test/java/org/kududb/util/TestNetUtil.java create mode 100644 java/kudu-client/src/test/resources/flags create mode 100644 java/kudu-client/src/test/resources/log4j.properties create mode 100755 java/kudu-csd/generate_mdl.py create mode 100644 java/kudu-csd/pom.xml create mode 100644 java/kudu-csd/src/descriptor/service.sdl create mode 100644 java/kudu-csd/src/images/icon.png create mode 100644 java/kudu-csd/src/scripts/kudu.sh create mode 100644 java/kudu-mapreduce/pom.xml create mode 100644 java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/CommandLineParser.java create mode 100644 java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableInputFormat.java create mode 100644 java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableMapReduceUtil.java create mode 100644 java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputCommitter.java create mode 100644 java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputFormat.java create mode 100644 java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/TableReducer.java create mode 100644 java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/HadoopTestingUtility.java create mode 100644 java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestInputFormatJob.java create mode 100644 java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableInputFormat.java create mode 100644 java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableOutputFormat.java create mode 100644 java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestOutputFormatJob.java create mode 100644 java/kudu-spark/pom.xml create mode 100644 java/kudu-spark/src/main/scala/org/kududb/spark/DefaultSource.scala create mode 
100644 java/kudu-spark/src/main/scala/org/kududb/spark/KuduContext.scala create mode 100644 java/kudu-spark/src/test/resources/log4j.properties create mode 100644 java/kudu-spark/src/test/scala/org/kududb/spark/DefaultSourceTest.scala create mode 100644 java/kudu-spark/src/test/scala/org/kududb/spark/KuduContextTest.scala create mode 100644 java/kudu-spark/src/test/scala/org/kududb/spark/TestContext.scala create mode 100644 java/pom.xml create mode 100644 python/.gitignore create mode 100644 python/MANIFEST.in create mode 100644 python/Makefile create mode 100644 python/README.md create mode 100644 python/kudu/__init__.pxd create mode 100644 python/kudu/__init__.py create mode 100644 python/kudu/client.pyx create mode 100644 python/kudu/compat.py create mode 100644 python/kudu/errors.pxd create mode 100644 python/kudu/errors.pyx create mode 100644 python/kudu/libkudu_client.pxd create mode 100644 python/kudu/schema.pxd create mode 100644 python/kudu/schema.pyx create mode 100644 python/kudu/tests/__init__.py create mode 100644 python/kudu/tests/common.py create mode 100644 python/kudu/tests/test_client.py create mode 100644 python/kudu/tests/test_scanner.py create mode 100644 python/kudu/tests/test_schema.py create mode 100644 python/kudu/util.py create mode 100644 python/requirements.txt create mode 100644 python/setup.cfg create mode 100644 python/setup.py create mode 100644 src/kudu/benchmarks/CMakeLists.txt create mode 100755 src/kudu/benchmarks/bin/parse_rpc_bench.sh create mode 100644 src/kudu/benchmarks/rle.cc create mode 100644 src/kudu/benchmarks/tpch/line_item_tsv_importer.h create mode 100644 src/kudu/benchmarks/tpch/rpc_line_item_dao-test.cc create mode 100644 src/kudu/benchmarks/tpch/rpc_line_item_dao.cc create mode 100644 src/kudu/benchmarks/tpch/rpc_line_item_dao.h create mode 100644 src/kudu/benchmarks/tpch/tpch-schemas.h create mode 100644 src/kudu/benchmarks/tpch/tpch1.cc create mode 100644 src/kudu/benchmarks/tpch/tpch_real_world.cc create mode 
100755 src/kudu/benchmarks/wal_hiccup-parser.py create mode 100644 src/kudu/benchmarks/wal_hiccup.cc create mode 100644 src/kudu/benchmarks/ycsb-schema.h create mode 100644 src/kudu/cfile/CMakeLists.txt create mode 100644 src/kudu/cfile/README create mode 100644 src/kudu/cfile/binary_dict_block.cc create mode 100644 src/kudu/cfile/binary_dict_block.h create mode 100644 src/kudu/cfile/binary_plain_block.cc create mode 100644 src/kudu/cfile/binary_plain_block.h create mode 100644 src/kudu/cfile/binary_prefix_block.cc create mode 100644 src/kudu/cfile/binary_prefix_block.h create mode 100644 src/kudu/cfile/block_cache-test.cc create mode 100644 src/kudu/cfile/block_cache.cc create mode 100644 src/kudu/cfile/block_cache.h create mode 100644 src/kudu/cfile/block_compression.cc create mode 100644 src/kudu/cfile/block_compression.h create mode 100644 src/kudu/cfile/block_encodings.h create mode 100644 src/kudu/cfile/block_handle.h create mode 100644 src/kudu/cfile/block_pointer.h create mode 100644 src/kudu/cfile/bloomfile-test-base.h create mode 100644 src/kudu/cfile/bloomfile-test.cc create mode 100644 src/kudu/cfile/bloomfile.cc create mode 100644 src/kudu/cfile/bloomfile.h create mode 100644 src/kudu/cfile/bshuf_block.cc create mode 100644 src/kudu/cfile/bshuf_block.h create mode 100644 src/kudu/cfile/cfile-dump.cc create mode 100644 src/kudu/cfile/cfile-test-base.h create mode 100644 src/kudu/cfile/cfile-test.cc create mode 100644 src/kudu/cfile/cfile.proto create mode 100644 src/kudu/cfile/cfile_reader.cc create mode 100644 src/kudu/cfile/cfile_reader.h create mode 100644 src/kudu/cfile/cfile_util.cc create mode 100644 src/kudu/cfile/cfile_util.h create mode 100644 src/kudu/cfile/cfile_writer.cc create mode 100644 src/kudu/cfile/cfile_writer.h create mode 100644 src/kudu/cfile/compression-test.cc create mode 100644 src/kudu/cfile/compression_codec.cc create mode 100644 src/kudu/cfile/compression_codec.h create mode 100644 src/kudu/cfile/encoding-test.cc create mode 
100644 src/kudu/cfile/gvint_block.cc create mode 100644 src/kudu/cfile/gvint_block.h create mode 100644 src/kudu/cfile/index-test.cc create mode 100644 src/kudu/cfile/index_block.cc create mode 100644 src/kudu/cfile/index_block.h create mode 100644 src/kudu/cfile/index_btree.cc create mode 100644 src/kudu/cfile/index_btree.h create mode 100644 src/kudu/cfile/mt-bloomfile-test.cc create mode 100644 src/kudu/cfile/plain_bitmap_block.h create mode 100644 src/kudu/cfile/plain_block.h create mode 100644 src/kudu/cfile/rle_block.h create mode 100644 src/kudu/cfile/type_encodings.cc create mode 100644 src/kudu/cfile/type_encodings.h create mode 100644 src/kudu/client/CMakeLists.txt create mode 100644 src/kudu/client/MungeExportedInstallTargets.cmake create mode 100644 src/kudu/client/README create mode 100644 src/kudu/client/batcher.cc create mode 100644 src/kudu/client/batcher.h create mode 100644 src/kudu/client/callbacks.h create mode 100644 src/kudu/client/client-internal.cc create mode 100644 src/kudu/client/client-internal.h create mode 100644 src/kudu/client/client-test-util.cc create mode 100644 src/kudu/client/client-test-util.h create mode 100644 src/kudu/client/client-test.cc create mode 100644 src/kudu/client/client-unittest.cc create mode 100644 src/kudu/client/client.cc create mode 100644 src/kudu/client/client.h create mode 100644 src/kudu/client/clientConfig.cmake.in create mode 100644 src/kudu/client/client_builder-internal.cc create mode 100644 src/kudu/client/client_builder-internal.h create mode 100755 src/kudu/client/client_samples-test.sh create mode 100755 src/kudu/client/client_symbol-test.sh create mode 100644 src/kudu/client/error-internal.cc create mode 100644 src/kudu/client/error-internal.h create mode 100644 src/kudu/client/error_collector.cc create mode 100644 src/kudu/client/error_collector.h create mode 100644 src/kudu/client/meta_cache.cc create mode 100644 src/kudu/client/meta_cache.h create mode 100644 src/kudu/client/row_result.h 
create mode 100644 src/kudu/client/samples/CMakeLists.txt create mode 100644 src/kudu/client/samples/sample.cc create mode 100644 src/kudu/client/scan_batch.cc create mode 100644 src/kudu/client/scan_batch.h create mode 100644 src/kudu/client/scan_predicate-internal.h create mode 100644 src/kudu/client/scan_predicate.cc create mode 100644 src/kudu/client/scan_predicate.h create mode 100644 src/kudu/client/scanner-internal.cc create mode 100644 src/kudu/client/scanner-internal.h create mode 100644 src/kudu/client/schema-internal.h create mode 100644 src/kudu/client/schema.cc create mode 100644 src/kudu/client/schema.h create mode 100644 src/kudu/client/session-internal.cc create mode 100644 src/kudu/client/session-internal.h create mode 100644 src/kudu/client/shared_ptr.h create mode 100644 src/kudu/client/stubs.h create mode 100644 src/kudu/client/symbols.map create mode 100644 src/kudu/client/table-internal.cc create mode 100644 src/kudu/client/table-internal.h create mode 100644 src/kudu/client/table_alterer-internal.cc create mode 100644 src/kudu/client/table_alterer-internal.h create mode 100644 src/kudu/client/table_creator-internal.cc create mode 100644 src/kudu/client/table_creator-internal.h create mode 100644 src/kudu/client/tablet_server-internal.cc create mode 100644 src/kudu/client/tablet_server-internal.h create mode 100644 src/kudu/client/value-internal.h create mode 100644 src/kudu/client/value.cc create mode 100644 src/kudu/client/value.h create mode 100644 src/kudu/client/write_op-internal.h create mode 100644 src/kudu/client/write_op.cc create mode 100644 src/kudu/client/write_op.h create mode 100644 src/kudu/codegen/CMakeLists.txt create mode 100644 src/kudu/codegen/README create mode 100644 src/kudu/codegen/code_cache.cc create mode 100644 src/kudu/codegen/code_cache.h create mode 100644 src/kudu/codegen/code_generator.cc create mode 100644 src/kudu/codegen/code_generator.h create mode 100644 src/kudu/codegen/codegen-test.cc create mode 100644 
src/kudu/codegen/compilation_manager.cc create mode 100644 src/kudu/codegen/compilation_manager.h create mode 100644 src/kudu/codegen/jit_wrapper.cc create mode 100644 src/kudu/codegen/jit_wrapper.h create mode 100644 src/kudu/codegen/module_builder.cc create mode 100644 src/kudu/codegen/module_builder.h create mode 100644 src/kudu/codegen/precompiled.cc create mode 100644 src/kudu/codegen/precompiled.ll.h create mode 100644 src/kudu/codegen/row_projector.cc create mode 100644 src/kudu/codegen/row_projector.h create mode 100644 src/kudu/common/CMakeLists.txt create mode 100644 src/kudu/common/README create mode 100644 src/kudu/common/columnblock.h create mode 100644 src/kudu/common/common.proto create mode 100644 src/kudu/common/encoded_key-test.cc create mode 100644 src/kudu/common/encoded_key.cc create mode 100644 src/kudu/common/encoded_key.h create mode 100644 src/kudu/common/generic_iterators-test.cc create mode 100644 src/kudu/common/generic_iterators.cc create mode 100644 src/kudu/common/generic_iterators.h create mode 100644 src/kudu/common/id_mapping-test.cc create mode 100644 src/kudu/common/id_mapping.cc create mode 100644 src/kudu/common/id_mapping.h create mode 100644 src/kudu/common/iterator.h create mode 100644 src/kudu/common/iterator_stats.cc create mode 100644 src/kudu/common/iterator_stats.h create mode 100644 src/kudu/common/key_encoder.cc create mode 100644 src/kudu/common/key_encoder.h create mode 100644 src/kudu/common/partial_row-test.cc create mode 100644 src/kudu/common/partial_row.cc create mode 100644 src/kudu/common/partial_row.h create mode 100644 src/kudu/common/partition-test.cc create mode 100644 src/kudu/common/partition.cc create mode 100644 src/kudu/common/partition.h create mode 100644 src/kudu/common/predicate-test.cc create mode 100644 src/kudu/common/predicate_encoder-test.cc create mode 100644 src/kudu/common/predicate_encoder.cc create mode 100644 src/kudu/common/predicate_encoder.h create mode 100644 src/kudu/common/row.h 
create mode 100644 src/kudu/common/row_changelist-test.cc create mode 100644 src/kudu/common/row_changelist.cc create mode 100644 src/kudu/common/row_changelist.h create mode 100644 src/kudu/common/row_key-util-test.cc create mode 100644 src/kudu/common/row_key-util.cc create mode 100644 src/kudu/common/row_key-util.h create mode 100644 src/kudu/common/row_operations-test.cc create mode 100644 src/kudu/common/row_operations.cc create mode 100644 src/kudu/common/row_operations.h create mode 100644 src/kudu/common/rowblock.cc create mode 100644 src/kudu/common/rowblock.h create mode 100644 src/kudu/common/rowid.h create mode 100644 src/kudu/common/scan_predicate.cc create mode 100644 src/kudu/common/scan_predicate.h create mode 100644 src/kudu/common/scan_spec.cc create mode 100644 src/kudu/common/scan_spec.h create mode 100644 src/kudu/common/schema-test.cc create mode 100644 src/kudu/common/schema.cc create mode 100644 src/kudu/common/schema.h create mode 100644 src/kudu/common/timestamp.cc create mode 100644 src/kudu/common/timestamp.h create mode 100644 src/kudu/common/types-test.cc create mode 100644 src/kudu/common/types.cc create mode 100644 src/kudu/common/types.h create mode 100644 src/kudu/common/wire_protocol-test-util.h create mode 100644 src/kudu/common/wire_protocol-test.cc create mode 100644 src/kudu/common/wire_protocol.cc create mode 100644 src/kudu/common/wire_protocol.h create mode 100644 src/kudu/common/wire_protocol.proto create mode 100644 src/kudu/consensus/CMakeLists.txt create mode 100644 src/kudu/consensus/README create mode 100644 src/kudu/consensus/consensus-test-util.h create mode 100644 src/kudu/consensus/consensus.cc create mode 100644 src/kudu/consensus/consensus.h create mode 100644 src/kudu/consensus/consensus.proto create mode 100644 src/kudu/consensus/consensus.txt create mode 100644 src/kudu/consensus/consensus_meta-test.cc create mode 100644 src/kudu/consensus/consensus_meta.cc create mode 100644 
src/kudu/consensus/consensus_meta.h create mode 100644 src/kudu/consensus/consensus_peers-test.cc create mode 100644 src/kudu/consensus/consensus_peers.cc create mode 100644 src/kudu/consensus/consensus_peers.h create mode 100644 src/kudu/consensus/consensus_queue-test.cc create mode 100644 src/kudu/consensus/consensus_queue.cc create mode 100644 src/kudu/consensus/consensus_queue.h create mode 100644 src/kudu/consensus/leader_election-test.cc create mode 100644 src/kudu/consensus/leader_election.cc create mode 100644 src/kudu/consensus/leader_election.h create mode 100644 src/kudu/consensus/local_consensus.cc create mode 100644 src/kudu/consensus/local_consensus.h create mode 100644 src/kudu/consensus/log-dump.cc create mode 100644 src/kudu/consensus/log-test-base.h create mode 100644 src/kudu/consensus/log-test.cc create mode 100644 src/kudu/consensus/log.cc create mode 100644 src/kudu/consensus/log.h create mode 100644 src/kudu/consensus/log.proto create mode 100644 src/kudu/consensus/log_anchor_registry-test.cc create mode 100644 src/kudu/consensus/log_anchor_registry.cc create mode 100644 src/kudu/consensus/log_anchor_registry.h create mode 100644 src/kudu/consensus/log_cache-test.cc create mode 100644 src/kudu/consensus/log_cache.cc create mode 100644 src/kudu/consensus/log_cache.h create mode 100644 src/kudu/consensus/log_index-test.cc create mode 100644 src/kudu/consensus/log_index.cc create mode 100644 src/kudu/consensus/log_index.h create mode 100644 src/kudu/consensus/log_metrics.cc create mode 100644 src/kudu/consensus/log_metrics.h create mode 100644 src/kudu/consensus/log_reader.cc create mode 100644 src/kudu/consensus/log_reader.h create mode 100644 src/kudu/consensus/log_util.cc create mode 100644 src/kudu/consensus/log_util.h create mode 100644 src/kudu/consensus/metadata.proto create mode 100644 src/kudu/consensus/mt-log-test.cc create mode 100644 src/kudu/consensus/opid.proto create mode 100644 src/kudu/consensus/opid_util.cc create mode 100644 
src/kudu/consensus/opid_util.h create mode 100644 src/kudu/consensus/peer_manager.cc create mode 100644 src/kudu/consensus/peer_manager.h create mode 100644 src/kudu/consensus/quorum_util-test.cc create mode 100644 src/kudu/consensus/quorum_util.cc create mode 100644 src/kudu/consensus/quorum_util.h create mode 100644 src/kudu/consensus/raft_consensus-test.cc create mode 100644 src/kudu/consensus/raft_consensus.cc create mode 100644 src/kudu/consensus/raft_consensus.h create mode 100644 src/kudu/consensus/raft_consensus_quorum-test.cc create mode 100644 src/kudu/consensus/raft_consensus_state-test.cc create mode 100644 src/kudu/consensus/raft_consensus_state.cc create mode 100644 src/kudu/consensus/raft_consensus_state.h create mode 100644 src/kudu/consensus/ref_counted_replicate.h create mode 100644 src/kudu/experiments/CMakeLists.txt create mode 100644 src/kudu/experiments/merge-test.cc create mode 100644 src/kudu/experiments/rwlock-perf.cc create mode 100644 src/kudu/fs/CMakeLists.txt create mode 100644 src/kudu/fs/block_id.cc create mode 100644 src/kudu/fs/block_id.h create mode 100644 src/kudu/fs/block_manager-stress-test.cc create mode 100644 src/kudu/fs/block_manager-test.cc create mode 100644 src/kudu/fs/block_manager.cc create mode 100644 src/kudu/fs/block_manager.h create mode 100644 src/kudu/fs/block_manager_metrics.cc create mode 100644 src/kudu/fs/block_manager_metrics.h create mode 100644 src/kudu/fs/block_manager_util-test.cc create mode 100644 src/kudu/fs/block_manager_util.cc create mode 100644 src/kudu/fs/block_manager_util.h create mode 100644 src/kudu/fs/file_block_manager.cc create mode 100644 src/kudu/fs/file_block_manager.h create mode 100644 src/kudu/fs/fs-test-util.h create mode 100644 src/kudu/fs/fs.proto create mode 100644 src/kudu/fs/fs_manager-test.cc create mode 100644 src/kudu/fs/fs_manager.cc create mode 100644 src/kudu/fs/fs_manager.h create mode 100644 src/kudu/fs/log_block_manager.cc create mode 100644 
src/kudu/fs/log_block_manager.h create mode 100644 src/kudu/gutil/CMakeLists.txt create mode 100644 src/kudu/gutil/algorithm.h create mode 100644 src/kudu/gutil/arm_instruction_set_select.h create mode 100644 src/kudu/gutil/atomic_refcount.h create mode 100644 src/kudu/gutil/atomicops-internals-macosx.h create mode 100644 src/kudu/gutil/atomicops-internals-powerpc.h create mode 100644 src/kudu/gutil/atomicops-internals-tsan.h create mode 100644 src/kudu/gutil/atomicops-internals-x86.cc create mode 100644 src/kudu/gutil/atomicops-internals-x86.h create mode 100644 src/kudu/gutil/atomicops.h create mode 100644 src/kudu/gutil/auxiliary/atomicops-internals-arm-generic.h create mode 100644 src/kudu/gutil/auxiliary/atomicops-internals-arm-v6plus.h create mode 100644 src/kudu/gutil/auxiliary/atomicops-internals-windows.h create mode 100644 src/kudu/gutil/basictypes.h create mode 100644 src/kudu/gutil/bind.h create mode 100644 src/kudu/gutil/bind.h.pump create mode 100644 src/kudu/gutil/bind_helpers.h create mode 100644 src/kudu/gutil/bind_internal.h create mode 100644 src/kudu/gutil/bind_internal.h.pump create mode 100644 src/kudu/gutil/bits.cc create mode 100644 src/kudu/gutil/bits.h create mode 100644 src/kudu/gutil/callback.h create mode 100644 src/kudu/gutil/callback.h.pump create mode 100644 src/kudu/gutil/callback_forward.h create mode 100644 src/kudu/gutil/callback_internal.cc create mode 100644 src/kudu/gutil/callback_internal.h create mode 100644 src/kudu/gutil/casts.h create mode 100644 src/kudu/gutil/charmap.h create mode 100644 src/kudu/gutil/cpu.cc create mode 100644 src/kudu/gutil/cpu.h create mode 100644 src/kudu/gutil/cycleclock-inl.h create mode 100644 src/kudu/gutil/dynamic_annotations.c create mode 100644 src/kudu/gutil/dynamic_annotations.h create mode 100644 src/kudu/gutil/endian.h create mode 100644 src/kudu/gutil/fixedarray.h create mode 100644 src/kudu/gutil/gscoped_ptr.h create mode 100644 src/kudu/gutil/hash/builtin_type_hash.h create mode 100644 
src/kudu/gutil/hash/city.cc create mode 100644 src/kudu/gutil/hash/city.h create mode 100644 src/kudu/gutil/hash/hash.cc create mode 100644 src/kudu/gutil/hash/hash.h create mode 100644 src/kudu/gutil/hash/hash128to64.h create mode 100644 src/kudu/gutil/hash/jenkins.cc create mode 100644 src/kudu/gutil/hash/jenkins.h create mode 100644 src/kudu/gutil/hash/jenkins_lookup2.h create mode 100644 src/kudu/gutil/hash/legacy_hash.h create mode 100644 src/kudu/gutil/hash/string_hash.h create mode 100644 src/kudu/gutil/int128.cc create mode 100644 src/kudu/gutil/int128.h create mode 100644 src/kudu/gutil/integral_types.h create mode 100644 src/kudu/gutil/linux_syscall_support.h create mode 100644 src/kudu/gutil/logging-inl.h create mode 100644 src/kudu/gutil/macros.h create mode 100644 src/kudu/gutil/manual_constructor.h create mode 100644 src/kudu/gutil/map-util.h create mode 100644 src/kudu/gutil/mathlimits.cc create mode 100644 src/kudu/gutil/mathlimits.h create mode 100644 src/kudu/gutil/move.h create mode 100644 src/kudu/gutil/once.cc create mode 100644 src/kudu/gutil/once.h create mode 100644 src/kudu/gutil/paranoid.h create mode 100644 src/kudu/gutil/port.h create mode 100644 src/kudu/gutil/raw_scoped_refptr_mismatch_checker.h create mode 100644 src/kudu/gutil/ref_counted.cc create mode 100644 src/kudu/gutil/ref_counted.h create mode 100644 src/kudu/gutil/ref_counted_memory.cc create mode 100644 src/kudu/gutil/ref_counted_memory.h create mode 100644 src/kudu/gutil/singleton.h create mode 100644 src/kudu/gutil/spinlock.cc create mode 100644 src/kudu/gutil/spinlock.h create mode 100644 src/kudu/gutil/spinlock_internal.cc create mode 100644 src/kudu/gutil/spinlock_internal.h create mode 100644 src/kudu/gutil/spinlock_linux-inl.h create mode 100644 src/kudu/gutil/spinlock_posix-inl.h create mode 100644 src/kudu/gutil/spinlock_win32-inl.h create mode 100644 src/kudu/gutil/stl_util.h create mode 100644 src/kudu/gutil/stringprintf.cc create mode 100644 
src/kudu/gutil/stringprintf.h create mode 100644 src/kudu/gutil/strings/ascii_ctype.cc create mode 100644 src/kudu/gutil/strings/ascii_ctype.h create mode 100644 src/kudu/gutil/strings/charset.cc create mode 100644 src/kudu/gutil/strings/charset.h create mode 100644 src/kudu/gutil/strings/escaping.cc create mode 100644 src/kudu/gutil/strings/escaping.h create mode 100644 src/kudu/gutil/strings/fastmem.h create mode 100644 src/kudu/gutil/strings/human_readable.cc create mode 100644 src/kudu/gutil/strings/human_readable.h create mode 100644 src/kudu/gutil/strings/join.cc create mode 100644 src/kudu/gutil/strings/join.h create mode 100644 src/kudu/gutil/strings/memutil.cc create mode 100644 src/kudu/gutil/strings/memutil.h create mode 100644 src/kudu/gutil/strings/numbers.cc create mode 100644 src/kudu/gutil/strings/numbers.h create mode 100644 src/kudu/gutil/strings/serialize.cc create mode 100644 src/kudu/gutil/strings/serialize.h create mode 100644 src/kudu/gutil/strings/split.cc create mode 100644 src/kudu/gutil/strings/split.h create mode 100644 src/kudu/gutil/strings/split_internal.h create mode 100644 src/kudu/gutil/strings/strcat.cc create mode 100644 src/kudu/gutil/strings/strcat.h create mode 100644 src/kudu/gutil/strings/string_util-test.cc create mode 100644 src/kudu/gutil/strings/stringpiece.cc create mode 100644 src/kudu/gutil/strings/stringpiece.h create mode 100644 src/kudu/gutil/strings/strip.cc create mode 100644 src/kudu/gutil/strings/strip.h create mode 100644 src/kudu/gutil/strings/substitute.cc create mode 100644 src/kudu/gutil/strings/substitute.h create mode 100644 src/kudu/gutil/strings/util.cc create mode 100644 src/kudu/gutil/strings/util.h create mode 100644 src/kudu/gutil/strtoint.cc create mode 100644 src/kudu/gutil/strtoint.h create mode 100644 src/kudu/gutil/synchronization_profiling.h create mode 100644 src/kudu/gutil/sysinfo.cc create mode 100644 src/kudu/gutil/sysinfo.h create mode 100644 src/kudu/gutil/template_util.h create mode 
100644 src/kudu/gutil/thread_annotations.h create mode 100644 src/kudu/gutil/threading/thread_collision_warner.cc create mode 100644 src/kudu/gutil/threading/thread_collision_warner.h create mode 100644 src/kudu/gutil/tuple.h create mode 100644 src/kudu/gutil/type_traits.h create mode 100644 src/kudu/gutil/utf/LICENSE create mode 100644 src/kudu/gutil/utf/rune.c create mode 100644 src/kudu/gutil/utf/utf.h create mode 100644 src/kudu/gutil/utf/utfdef.h create mode 100644 src/kudu/gutil/valgrind.h create mode 100644 src/kudu/gutil/walltime.cc create mode 100644 src/kudu/gutil/walltime.h create mode 100644 src/kudu/integration-tests/CMakeLists.txt create mode 100644 src/kudu/integration-tests/all_types-itest.cc create mode 100644 src/kudu/integration-tests/alter_table-randomized-test.cc create mode 100644 src/kudu/integration-tests/alter_table-test.cc create mode 100644 src/kudu/integration-tests/client-stress-test.cc create mode 100644 src/kudu/integration-tests/client_failover-itest.cc create mode 100644 src/kudu/integration-tests/cluster_itest_util.cc create mode 100644 src/kudu/integration-tests/cluster_itest_util.h create mode 100644 src/kudu/integration-tests/cluster_verifier.cc create mode 100644 src/kudu/integration-tests/cluster_verifier.h create mode 100644 src/kudu/integration-tests/create-table-itest.cc create mode 100644 src/kudu/integration-tests/create-table-stress-test.cc create mode 100644 src/kudu/integration-tests/delete_table-test.cc create mode 100644 src/kudu/integration-tests/external_mini_cluster-itest-base.h create mode 100644 src/kudu/integration-tests/external_mini_cluster-test.cc create mode 100644 src/kudu/integration-tests/external_mini_cluster.cc create mode 100644 src/kudu/integration-tests/external_mini_cluster.h create mode 100644 src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc create mode 100644 src/kudu/integration-tests/external_mini_cluster_fs_inspector.h create mode 100644 
src/kudu/integration-tests/flex_partitioning-itest.cc create mode 100644 src/kudu/integration-tests/full_stack-insert-scan-test.cc create mode 100644 src/kudu/integration-tests/linked_list-test-util.h create mode 100644 src/kudu/integration-tests/linked_list-test.cc create mode 100644 src/kudu/integration-tests/master_failover-itest.cc create mode 100644 src/kudu/integration-tests/master_replication-itest.cc create mode 100644 src/kudu/integration-tests/mini_cluster.cc create mode 100644 src/kudu/integration-tests/mini_cluster.h create mode 100644 src/kudu/integration-tests/raft_consensus-itest.cc create mode 100644 src/kudu/integration-tests/registration-test.cc create mode 100644 src/kudu/integration-tests/remote_bootstrap-itest.cc create mode 100644 src/kudu/integration-tests/tablet_replacement-itest.cc create mode 100644 src/kudu/integration-tests/test_workload.cc create mode 100644 src/kudu/integration-tests/test_workload.h create mode 100644 src/kudu/integration-tests/ts_itest-base.h create mode 100644 src/kudu/integration-tests/ts_recovery-itest.cc create mode 100644 src/kudu/integration-tests/ts_tablet_manager-itest.cc create mode 100644 src/kudu/integration-tests/update_scan_delta_compact-test.cc create mode 100644 src/kudu/master/CMakeLists.txt create mode 100644 src/kudu/master/README create mode 100644 src/kudu/master/catalog_manager-test.cc create mode 100644 src/kudu/master/catalog_manager.cc create mode 100644 src/kudu/master/catalog_manager.h create mode 100644 src/kudu/master/master-path-handlers.cc create mode 100644 src/kudu/master/master-path-handlers.h create mode 100644 src/kudu/master/master-test-util.h create mode 100644 src/kudu/master/master-test.cc create mode 100644 src/kudu/master/master.cc create mode 100644 src/kudu/master/master.h create mode 100644 src/kudu/master/master.proto create mode 100644 src/kudu/master/master_main.cc create mode 100644 src/kudu/master/master_options.cc create mode 100644 src/kudu/master/master_options.h 
create mode 100644 src/kudu/master/master_rpc.cc create mode 100644 src/kudu/master/master_rpc.h create mode 100644 src/kudu/master/master_service.cc create mode 100644 src/kudu/master/master_service.h create mode 100644 src/kudu/master/mini_master.cc create mode 100644 src/kudu/master/mini_master.h create mode 100644 src/kudu/master/sys_catalog-test.cc create mode 100644 src/kudu/master/sys_catalog.cc create mode 100644 src/kudu/master/sys_catalog.h create mode 100644 src/kudu/master/ts_descriptor.cc create mode 100644 src/kudu/master/ts_descriptor.h create mode 100644 src/kudu/master/ts_manager.cc create mode 100644 src/kudu/master/ts_manager.h create mode 100644 src/kudu/rpc/CMakeLists.txt create mode 100644 src/kudu/rpc/README create mode 100644 src/kudu/rpc/acceptor_pool.cc create mode 100644 src/kudu/rpc/acceptor_pool.h create mode 100644 src/kudu/rpc/auth_store.cc create mode 100644 src/kudu/rpc/auth_store.h create mode 100644 src/kudu/rpc/blocking_ops.cc create mode 100644 src/kudu/rpc/blocking_ops.h create mode 100644 src/kudu/rpc/connection.cc create mode 100644 src/kudu/rpc/connection.h create mode 100644 src/kudu/rpc/constants.cc create mode 100644 src/kudu/rpc/constants.h create mode 100644 src/kudu/rpc/inbound_call.cc create mode 100644 src/kudu/rpc/inbound_call.h create mode 100644 src/kudu/rpc/messenger.cc create mode 100644 src/kudu/rpc/messenger.h create mode 100644 src/kudu/rpc/mt-rpc-test.cc create mode 100644 src/kudu/rpc/negotiation.cc create mode 100644 src/kudu/rpc/negotiation.h create mode 100644 src/kudu/rpc/outbound_call.cc create mode 100644 src/kudu/rpc/outbound_call.h create mode 100644 src/kudu/rpc/protoc-gen-krpc.cc create mode 100644 src/kudu/rpc/proxy.cc create mode 100644 src/kudu/rpc/proxy.h create mode 100644 src/kudu/rpc/reactor-test.cc create mode 100644 src/kudu/rpc/reactor.cc create mode 100644 src/kudu/rpc/reactor.h create mode 100644 src/kudu/rpc/remote_method.cc create mode 100644 src/kudu/rpc/remote_method.h create mode 
100644 src/kudu/rpc/response_callback.h create mode 100644 src/kudu/rpc/rpc-bench.cc create mode 100644 src/kudu/rpc/rpc-test-base.h create mode 100644 src/kudu/rpc/rpc-test.cc create mode 100644 src/kudu/rpc/rpc.cc create mode 100644 src/kudu/rpc/rpc.h create mode 100644 src/kudu/rpc/rpc_context.cc create mode 100644 src/kudu/rpc/rpc_context.h create mode 100644 src/kudu/rpc/rpc_controller.cc create mode 100644 src/kudu/rpc/rpc_controller.h create mode 100644 src/kudu/rpc/rpc_header.proto create mode 100644 src/kudu/rpc/rpc_introspection.proto create mode 100644 src/kudu/rpc/rpc_service.h create mode 100644 src/kudu/rpc/rpc_sidecar.h create mode 100644 src/kudu/rpc/rpc_stub-test.cc create mode 100644 src/kudu/rpc/rtest.proto create mode 100644 src/kudu/rpc/rtest_diff_package.proto create mode 100644 src/kudu/rpc/sasl_client.cc create mode 100644 src/kudu/rpc/sasl_client.h create mode 100644 src/kudu/rpc/sasl_common.cc create mode 100644 src/kudu/rpc/sasl_common.h create mode 100644 src/kudu/rpc/sasl_helper.cc create mode 100644 src/kudu/rpc/sasl_helper.h create mode 100644 src/kudu/rpc/sasl_rpc-test.cc create mode 100644 src/kudu/rpc/sasl_server.cc create mode 100644 src/kudu/rpc/sasl_server.h create mode 100644 src/kudu/rpc/serialization.cc create mode 100644 src/kudu/rpc/serialization.h create mode 100644 src/kudu/rpc/service_if.cc create mode 100644 src/kudu/rpc/service_if.h create mode 100644 src/kudu/rpc/service_pool.cc create mode 100644 src/kudu/rpc/service_pool.h create mode 100644 src/kudu/rpc/transfer.cc create mode 100644 src/kudu/rpc/transfer.h create mode 100755 src/kudu/scripts/benchmarks.sh create mode 100644 src/kudu/scripts/compare-hbase-kudu.R create mode 100755 src/kudu/scripts/ensure_cpu_scaling.sh create mode 100644 src/kudu/scripts/get-job-stats-from-mysql.py create mode 100755 src/kudu/scripts/graph-metrics.py create mode 100644 src/kudu/scripts/jobs_runtime.R create mode 100644 src/kudu/scripts/mt-tablet-test-graph.R create mode 100644 
src/kudu/scripts/multiplot.R create mode 100644 src/kudu/scripts/parse_metrics_log.py create mode 100755 src/kudu/scripts/parse_real_out.sh create mode 100644 src/kudu/scripts/si_vec.R create mode 100755 src/kudu/scripts/tpch.sh create mode 100644 src/kudu/scripts/write-jobs-stats-to-mysql.py create mode 100644 src/kudu/server/CMakeLists.txt create mode 100644 src/kudu/server/clock.h create mode 100644 src/kudu/server/default-path-handlers.cc create mode 100644 src/kudu/server/default-path-handlers.h create mode 100644 src/kudu/server/generic_service.cc create mode 100644 src/kudu/server/generic_service.h create mode 100644 src/kudu/server/glog_metrics.cc create mode 100644 src/kudu/server/glog_metrics.h create mode 100644 src/kudu/server/hybrid_clock-test.cc create mode 100644 src/kudu/server/hybrid_clock.cc create mode 100644 src/kudu/server/hybrid_clock.h create mode 100644 src/kudu/server/logical_clock-test.cc create mode 100644 src/kudu/server/logical_clock.cc create mode 100644 src/kudu/server/logical_clock.h create mode 100644 src/kudu/server/metadata.h create mode 100644 src/kudu/server/monitored_task.h create mode 100644 src/kudu/server/pprof-path-handlers.cc create mode 100644 src/kudu/server/pprof-path-handlers.h create mode 100644 src/kudu/server/rpc_server.cc create mode 100644 src/kudu/server/rpc_server.h create mode 100644 src/kudu/server/rpcz-path-handler.cc create mode 100644 src/kudu/server/rpcz-path-handler.h create mode 100644 src/kudu/server/server_base.cc create mode 100644 src/kudu/server/server_base.h create mode 100644 src/kudu/server/server_base.proto create mode 100644 src/kudu/server/server_base_options.cc create mode 100644 src/kudu/server/server_base_options.h create mode 100644 src/kudu/server/tcmalloc_metrics.cc create mode 100644 src/kudu/server/tcmalloc_metrics.h create mode 100644 src/kudu/server/tracing-path-handlers.cc create mode 100644 src/kudu/server/tracing-path-handlers.h create mode 100644 src/kudu/server/webserver-test.cc 
create mode 100644 src/kudu/server/webserver.cc create mode 100644 src/kudu/server/webserver.h create mode 100644 src/kudu/server/webserver_options.cc create mode 100644 src/kudu/server/webserver_options.h create mode 100644 src/kudu/server/webui_util.cc create mode 100644 src/kudu/server/webui_util.h create mode 100644 src/kudu/tablet/CMakeLists.txt create mode 100644 src/kudu/tablet/README create mode 100644 src/kudu/tablet/cbtree-test.cc create mode 100644 src/kudu/tablet/cfile_set-test.cc create mode 100644 src/kudu/tablet/cfile_set.cc create mode 100644 src/kudu/tablet/cfile_set.h create mode 100644 src/kudu/tablet/compaction-policy.txt create mode 100644 src/kudu/tablet/compaction-test.cc create mode 100644 src/kudu/tablet/compaction.cc create mode 100644 src/kudu/tablet/compaction.h create mode 100644 src/kudu/tablet/compaction.txt create mode 100644 src/kudu/tablet/compaction_policy-test.cc create mode 100644 src/kudu/tablet/compaction_policy.cc create mode 100644 src/kudu/tablet/compaction_policy.h create mode 100644 src/kudu/tablet/composite-pushdown-test.cc create mode 100644 src/kudu/tablet/concurrent_btree.h create mode 100644 src/kudu/tablet/delta_applier.cc create mode 100644 src/kudu/tablet/delta_applier.h create mode 100644 src/kudu/tablet/delta_compaction-test.cc create mode 100644 src/kudu/tablet/delta_compaction.cc create mode 100644 src/kudu/tablet/delta_compaction.h create mode 100644 src/kudu/tablet/delta_iterator_merger.cc create mode 100644 src/kudu/tablet/delta_iterator_merger.h create mode 100644 src/kudu/tablet/delta_key.cc create mode 100644 src/kudu/tablet/delta_key.h create mode 100644 src/kudu/tablet/delta_stats.cc create mode 100644 src/kudu/tablet/delta_stats.h create mode 100644 src/kudu/tablet/delta_store.cc create mode 100644 src/kudu/tablet/delta_store.h create mode 100644 src/kudu/tablet/delta_tracker.cc create mode 100644 src/kudu/tablet/delta_tracker.h create mode 100644 src/kudu/tablet/deltafile-test.cc create mode 100644 
src/kudu/tablet/deltafile.cc create mode 100644 src/kudu/tablet/deltafile.h create mode 100644 src/kudu/tablet/deltamemstore-test.cc create mode 100644 src/kudu/tablet/deltamemstore.cc create mode 100644 src/kudu/tablet/deltamemstore.h create mode 100644 src/kudu/tablet/diskrowset-test-base.h create mode 100644 src/kudu/tablet/diskrowset-test.cc create mode 100644 src/kudu/tablet/diskrowset.cc create mode 100644 src/kudu/tablet/diskrowset.h create mode 100644 src/kudu/tablet/local_tablet_writer.h create mode 100644 src/kudu/tablet/lock_manager-test.cc create mode 100644 src/kudu/tablet/lock_manager.cc create mode 100644 src/kudu/tablet/lock_manager.h create mode 100644 src/kudu/tablet/maintenance_manager-test.cc create mode 100644 src/kudu/tablet/maintenance_manager.cc create mode 100644 src/kudu/tablet/maintenance_manager.h create mode 100644 src/kudu/tablet/major_delta_compaction-test.cc create mode 100644 src/kudu/tablet/memrowset-test.cc create mode 100644 src/kudu/tablet/memrowset.cc create mode 100644 src/kudu/tablet/memrowset.h create mode 100644 src/kudu/tablet/metadata-test.cc create mode 100644 src/kudu/tablet/metadata.proto create mode 100644 src/kudu/tablet/mock-rowsets.h create mode 100644 src/kudu/tablet/mt-diskrowset-test.cc create mode 100644 src/kudu/tablet/mt-rowset_delta_compaction-test.cc create mode 100644 src/kudu/tablet/mt-tablet-test.cc create mode 100644 src/kudu/tablet/multi_column_writer.cc create mode 100644 src/kudu/tablet/multi_column_writer.h create mode 100644 src/kudu/tablet/mutation.cc create mode 100644 src/kudu/tablet/mutation.h create mode 100644 src/kudu/tablet/mvcc-test.cc create mode 100644 src/kudu/tablet/mvcc.cc create mode 100644 src/kudu/tablet/mvcc.h create mode 100644 src/kudu/tablet/row_op.cc create mode 100644 src/kudu/tablet/row_op.h create mode 100644 src/kudu/tablet/rowset.cc create mode 100644 src/kudu/tablet/rowset.h create mode 100644 src/kudu/tablet/rowset_info.cc create mode 100644 
src/kudu/tablet/rowset_info.h create mode 100644 src/kudu/tablet/rowset_metadata.cc create mode 100644 src/kudu/tablet/rowset_metadata.h create mode 100644 src/kudu/tablet/rowset_tree-test.cc create mode 100644 src/kudu/tablet/rowset_tree.cc create mode 100644 src/kudu/tablet/rowset_tree.h create mode 100644 src/kudu/tablet/schema-change.txt create mode 100644 src/kudu/tablet/svg_dump.cc create mode 100644 src/kudu/tablet/svg_dump.h create mode 100644 src/kudu/tablet/tablet-harness.h create mode 100644 src/kudu/tablet/tablet-pushdown-test.cc create mode 100644 src/kudu/tablet/tablet-schema-test.cc create mode 100644 src/kudu/tablet/tablet-test-base.h create mode 100644 src/kudu/tablet/tablet-test-util.h create mode 100644 src/kudu/tablet/tablet-test.cc create mode 100644 src/kudu/tablet/tablet.cc create mode 100644 src/kudu/tablet/tablet.h create mode 100644 src/kudu/tablet/tablet.proto create mode 100644 src/kudu/tablet/tablet_bootstrap-test.cc create mode 100644 src/kudu/tablet/tablet_bootstrap.cc create mode 100644 src/kudu/tablet/tablet_bootstrap.h create mode 100644 src/kudu/tablet/tablet_metadata-test.cc create mode 100644 src/kudu/tablet/tablet_metadata.cc create mode 100644 src/kudu/tablet/tablet_metadata.h create mode 100644 src/kudu/tablet/tablet_metrics.cc create mode 100644 src/kudu/tablet/tablet_metrics.h create mode 100644 src/kudu/tablet/tablet_mm_ops-test.cc create mode 100644 src/kudu/tablet/tablet_mm_ops.h create mode 100644 src/kudu/tablet/tablet_peer-test.cc create mode 100644 src/kudu/tablet/tablet_peer.cc create mode 100644 src/kudu/tablet/tablet_peer.h create mode 100644 src/kudu/tablet/tablet_peer_mm_ops.cc create mode 100644 src/kudu/tablet/tablet_peer_mm_ops.h create mode 100644 src/kudu/tablet/tablet_random_access-test.cc create mode 100644 src/kudu/tablet/transaction_order_verifier.cc create mode 100644 src/kudu/tablet/transaction_order_verifier.h create mode 100644 src/kudu/tablet/transactions/alter_schema_transaction.cc create mode 
100644 src/kudu/tablet/transactions/alter_schema_transaction.h create mode 100644 src/kudu/tablet/transactions/transaction.cc create mode 100644 src/kudu/tablet/transactions/transaction.h create mode 100644 src/kudu/tablet/transactions/transaction_driver.cc create mode 100644 src/kudu/tablet/transactions/transaction_driver.h create mode 100644 src/kudu/tablet/transactions/transaction_tracker-test.cc create mode 100644 src/kudu/tablet/transactions/transaction_tracker.cc create mode 100644 src/kudu/tablet/transactions/transaction_tracker.h create mode 100644 src/kudu/tablet/transactions/write_transaction.cc create mode 100644 src/kudu/tablet/transactions/write_transaction.h create mode 100644 src/kudu/tablet/triggering-maintenance-ops.txt create mode 100644 src/kudu/tools/CMakeLists.txt create mode 100644 src/kudu/tools/README.systemtap create mode 100644 src/kudu/tools/create-demo-table.cc create mode 100644 src/kudu/tools/data_gen_util.cc create mode 100644 src/kudu/tools/data_gen_util.h create mode 100644 src/kudu/tools/fs_dump-tool.cc create mode 100644 src/kudu/tools/fs_list-tool.cc create mode 100644 src/kudu/tools/fs_tool.cc create mode 100644 src/kudu/tools/fs_tool.h create mode 100644 src/kudu/tools/insert-generated-rows.cc create mode 100644 src/kudu/tools/ksck-test.cc create mode 100644 src/kudu/tools/ksck.cc create mode 100644 src/kudu/tools/ksck.h create mode 100644 src/kudu/tools/ksck_remote-test.cc create mode 100644 src/kudu/tools/ksck_remote.cc create mode 100644 src/kudu/tools/ksck_remote.h create mode 100644 src/kudu/tools/kudu-admin-test.cc create mode 100644 src/kudu/tools/kudu-admin.cc create mode 100644 src/kudu/tools/kudu-ksck.cc create mode 100644 src/kudu/tools/kudu-ts-cli-test.cc create mode 100755 src/kudu/tools/parse_debug_refcounted.pl create mode 100644 src/kudu/tools/pbc-dump.cc create mode 100755 src/kudu/tools/trace_io.stp create mode 100644 src/kudu/tools/ts-cli.cc create mode 100644 src/kudu/tserver/CMakeLists.txt create mode 
100644 src/kudu/tserver/heartbeater.cc create mode 100644 src/kudu/tserver/heartbeater.h create mode 100644 src/kudu/tserver/mini_tablet_server.cc create mode 100644 src/kudu/tserver/mini_tablet_server.h create mode 100644 src/kudu/tserver/remote_bootstrap-test-base.h create mode 100644 src/kudu/tserver/remote_bootstrap.proto create mode 100644 src/kudu/tserver/remote_bootstrap_client-test.cc create mode 100644 src/kudu/tserver/remote_bootstrap_client.cc create mode 100644 src/kudu/tserver/remote_bootstrap_client.h create mode 100644 src/kudu/tserver/remote_bootstrap_service-test.cc create mode 100644 src/kudu/tserver/remote_bootstrap_service.cc create mode 100644 src/kudu/tserver/remote_bootstrap_service.h create mode 100644 src/kudu/tserver/remote_bootstrap_session-test.cc create mode 100644 src/kudu/tserver/remote_bootstrap_session.cc create mode 100644 src/kudu/tserver/remote_bootstrap_session.h create mode 100644 src/kudu/tserver/scanner_metrics.cc create mode 100644 src/kudu/tserver/scanner_metrics.h create mode 100644 src/kudu/tserver/scanners-test.cc create mode 100644 src/kudu/tserver/scanners.cc create mode 100644 src/kudu/tserver/scanners.h create mode 100644 src/kudu/tserver/tablet_peer_lookup.h create mode 100644 src/kudu/tserver/tablet_server-stress-test.cc create mode 100644 src/kudu/tserver/tablet_server-test-base.h create mode 100644 src/kudu/tserver/tablet_server-test.cc create mode 100644 src/kudu/tserver/tablet_server.cc create mode 100644 src/kudu/tserver/tablet_server.h create mode 100644 src/kudu/tserver/tablet_server_main.cc create mode 100644 src/kudu/tserver/tablet_server_options.cc create mode 100644 src/kudu/tserver/tablet_server_options.h create mode 100644 src/kudu/tserver/tablet_server_test_util.cc create mode 100644 src/kudu/tserver/tablet_server_test_util.h create mode 100644 src/kudu/tserver/tablet_service.cc create mode 100644 src/kudu/tserver/tablet_service.h create mode 100644 src/kudu/tserver/ts_tablet_manager-test.cc create 
mode 100644 src/kudu/tserver/ts_tablet_manager.cc create mode 100644 src/kudu/tserver/ts_tablet_manager.h create mode 100644 src/kudu/tserver/tserver-path-handlers.cc create mode 100644 src/kudu/tserver/tserver-path-handlers.h create mode 100644 src/kudu/tserver/tserver.proto create mode 100644 src/kudu/tserver/tserver_admin.proto create mode 100644 src/kudu/tserver/tserver_service.proto create mode 100644 src/kudu/twitter-demo/CMakeLists.txt create mode 100644 src/kudu/twitter-demo/README create mode 100644 src/kudu/twitter-demo/example-deletes.txt create mode 100644 src/kudu/twitter-demo/example-tweets.txt create mode 100644 src/kudu/twitter-demo/ingest_firehose.cc create mode 100644 src/kudu/twitter-demo/insert_consumer.cc create mode 100644 src/kudu/twitter-demo/insert_consumer.h create mode 100644 src/kudu/twitter-demo/oauth-test.cc create mode 100644 src/kudu/twitter-demo/oauth.cc create mode 100644 src/kudu/twitter-demo/oauth.h create mode 100644 src/kudu/twitter-demo/parser-test.cc create mode 100644 src/kudu/twitter-demo/parser.cc create mode 100644 src/kudu/twitter-demo/parser.h create mode 100644 src/kudu/twitter-demo/twitter-schema.h create mode 100644 src/kudu/twitter-demo/twitter_streamer.cc create mode 100644 src/kudu/twitter-demo/twitter_streamer.h create mode 100644 src/kudu/util/CMakeLists.txt create mode 100644 src/kudu/util/alignment.h create mode 100644 src/kudu/util/async_util.h create mode 100644 src/kudu/util/atomic-test.cc create mode 100644 src/kudu/util/atomic.cc create mode 100644 src/kudu/util/atomic.h create mode 100644 src/kudu/util/auto_release_pool.h create mode 100644 src/kudu/util/bit-stream-utils.h create mode 100644 src/kudu/util/bit-stream-utils.inline.h create mode 100644 src/kudu/util/bit-util-test.cc create mode 100644 src/kudu/util/bit-util.h create mode 100644 src/kudu/util/bitmap-test.cc create mode 100644 src/kudu/util/bitmap.cc create mode 100644 src/kudu/util/bitmap.h create mode 100644 
src/kudu/util/blocking_queue-test.cc create mode 100644 src/kudu/util/blocking_queue.h create mode 100644 src/kudu/util/bloom_filter-test.cc create mode 100644 src/kudu/util/bloom_filter.cc create mode 100644 src/kudu/util/bloom_filter.h create mode 100644 src/kudu/util/boost_mutex_utils.h create mode 100644 src/kudu/util/cache-test.cc create mode 100644 src/kudu/util/cache.cc create mode 100644 src/kudu/util/cache.h create mode 100644 src/kudu/util/cache_metrics.cc create mode 100644 src/kudu/util/cache_metrics.h create mode 100644 src/kudu/util/callback_bind-test.cc create mode 100644 src/kudu/util/coding-inl.h create mode 100644 src/kudu/util/coding.cc create mode 100644 src/kudu/util/coding.h create mode 100644 src/kudu/util/condition_variable.cc create mode 100644 src/kudu/util/condition_variable.h create mode 100644 src/kudu/util/countdown_latch-test.cc create mode 100644 src/kudu/util/countdown_latch.h create mode 100644 src/kudu/util/cow_object.h create mode 100644 src/kudu/util/crc-test.cc create mode 100644 src/kudu/util/crc.cc create mode 100644 src/kudu/util/crc.h create mode 100644 src/kudu/util/curl_util.cc create mode 100644 src/kudu/util/curl_util.h create mode 100644 src/kudu/util/debug-util-test.cc create mode 100644 src/kudu/util/debug-util.cc create mode 100644 src/kudu/util/debug-util.h create mode 100644 src/kudu/util/debug/leak_annotations.h create mode 100644 src/kudu/util/debug/leakcheck_disabler.h create mode 100644 src/kudu/util/debug/sanitizer_scopes.h create mode 100644 src/kudu/util/debug/trace_event.h create mode 100644 src/kudu/util/debug/trace_event_impl.cc create mode 100644 src/kudu/util/debug/trace_event_impl.h create mode 100644 src/kudu/util/debug/trace_event_impl_constants.cc create mode 100644 src/kudu/util/debug/trace_event_memory.h create mode 100644 src/kudu/util/debug/trace_event_synthetic_delay.cc create mode 100644 src/kudu/util/debug/trace_event_synthetic_delay.h create mode 100644 src/kudu/util/debug/trace_logging.h 
create mode 100644 src/kudu/util/debug_ref_counted.h create mode 100644 src/kudu/util/env-test.cc create mode 100644 src/kudu/util/env.cc create mode 100644 src/kudu/util/env.h create mode 100644 src/kudu/util/env_posix.cc create mode 100644 src/kudu/util/env_util.cc create mode 100644 src/kudu/util/env_util.h create mode 100644 src/kudu/util/errno-test.cc create mode 100644 src/kudu/util/errno.cc create mode 100644 src/kudu/util/errno.h create mode 100644 src/kudu/util/failure_detector-test.cc create mode 100644 src/kudu/util/failure_detector.cc create mode 100644 src/kudu/util/failure_detector.h create mode 100644 src/kudu/util/faststring.cc create mode 100644 src/kudu/util/faststring.h create mode 100644 src/kudu/util/fault_injection.cc create mode 100644 src/kudu/util/fault_injection.h create mode 100644 src/kudu/util/flag_tags-test.cc create mode 100644 src/kudu/util/flag_tags.cc create mode 100644 src/kudu/util/flag_tags.h create mode 100644 src/kudu/util/flags.cc create mode 100644 src/kudu/util/flags.h create mode 100644 src/kudu/util/group_varint-inl.h create mode 100644 src/kudu/util/group_varint-test.cc create mode 100644 src/kudu/util/group_varint.cc create mode 100644 src/kudu/util/hash_util-test.cc create mode 100644 src/kudu/util/hash_util.h create mode 100644 src/kudu/util/hdr_histogram-test.cc create mode 100644 src/kudu/util/hdr_histogram.cc create mode 100644 src/kudu/util/hdr_histogram.h create mode 100644 src/kudu/util/hexdump.cc create mode 100644 src/kudu/util/hexdump.h create mode 100644 src/kudu/util/high_water_mark.h create mode 100644 src/kudu/util/histogram.proto create mode 100644 src/kudu/util/init.cc create mode 100644 src/kudu/util/init.h create mode 100644 src/kudu/util/inline_slice-test.cc create mode 100644 src/kudu/util/inline_slice.h create mode 100644 src/kudu/util/interval_tree-inl.h create mode 100644 src/kudu/util/interval_tree-test.cc create mode 100644 src/kudu/util/interval_tree.h create mode 100644 
src/kudu/util/jsonreader-test.cc create mode 100644 src/kudu/util/jsonreader.cc create mode 100644 src/kudu/util/jsonreader.h create mode 100644 src/kudu/util/jsonwriter-test.cc create mode 100644 src/kudu/util/jsonwriter.cc create mode 100644 src/kudu/util/jsonwriter.h create mode 100644 src/kudu/util/jsonwriter_test.proto create mode 100644 src/kudu/util/kernel_stack_watchdog.cc create mode 100644 src/kudu/util/kernel_stack_watchdog.h create mode 100644 src/kudu/util/knapsack_solver-test.cc create mode 100644 src/kudu/util/knapsack_solver.h create mode 100644 src/kudu/util/locks.cc create mode 100644 src/kudu/util/locks.h create mode 100644 src/kudu/util/logging-test.cc create mode 100644 src/kudu/util/logging.cc create mode 100644 src/kudu/util/logging.h create mode 100644 src/kudu/util/logging_callback.h create mode 100644 src/kudu/util/logging_test_util.h create mode 100644 src/kudu/util/malloc.cc create mode 100644 src/kudu/util/malloc.h create mode 100644 src/kudu/util/map-util-test.cc create mode 100644 src/kudu/util/mem_tracker-test.cc create mode 100644 src/kudu/util/mem_tracker.cc create mode 100644 src/kudu/util/mem_tracker.h create mode 100644 src/kudu/util/memcmpable_varint-test.cc create mode 100644 src/kudu/util/memcmpable_varint.cc create mode 100644 src/kudu/util/memcmpable_varint.h create mode 100644 src/kudu/util/memenv/memenv-test.cc create mode 100644 src/kudu/util/memenv/memenv.cc create mode 100644 src/kudu/util/memenv/memenv.h create mode 100644 src/kudu/util/memory/arena-test.cc create mode 100644 src/kudu/util/memory/arena.cc create mode 100644 src/kudu/util/memory/arena.h create mode 100644 src/kudu/util/memory/memory.cc create mode 100644 src/kudu/util/memory/memory.h create mode 100644 src/kudu/util/metrics-test.cc create mode 100644 src/kudu/util/metrics.cc create mode 100644 src/kudu/util/metrics.h create mode 100644 src/kudu/util/monotime-test.cc create mode 100644 src/kudu/util/monotime.cc create mode 100644 
src/kudu/util/monotime.h create mode 100644 src/kudu/util/mt-hdr_histogram-test.cc create mode 100644 src/kudu/util/mt-metrics-test.cc create mode 100644 src/kudu/util/mt-threadlocal-test.cc create mode 100644 src/kudu/util/mutex.cc create mode 100644 src/kudu/util/mutex.h create mode 100644 src/kudu/util/net/dns_resolver-test.cc create mode 100644 src/kudu/util/net/dns_resolver.cc create mode 100644 src/kudu/util/net/dns_resolver.h create mode 100644 src/kudu/util/net/net_util-test.cc create mode 100644 src/kudu/util/net/net_util.cc create mode 100644 src/kudu/util/net/net_util.h create mode 100644 src/kudu/util/net/sockaddr.cc create mode 100644 src/kudu/util/net/sockaddr.h create mode 100644 src/kudu/util/net/socket.cc create mode 100644 src/kudu/util/net/socket.h create mode 100644 src/kudu/util/nvm_cache.cc create mode 100644 src/kudu/util/nvm_cache.h create mode 100644 src/kudu/util/object_pool-test.cc create mode 100644 src/kudu/util/object_pool.h create mode 100644 src/kudu/util/oid_generator.cc create mode 100644 src/kudu/util/oid_generator.h create mode 100644 src/kudu/util/once-test.cc create mode 100644 src/kudu/util/once.cc create mode 100644 src/kudu/util/once.h create mode 100644 src/kudu/util/os-util-test.cc create mode 100644 src/kudu/util/os-util.cc create mode 100644 src/kudu/util/os-util.h create mode 100644 src/kudu/util/path_util-test.cc create mode 100644 src/kudu/util/path_util.cc create mode 100644 src/kudu/util/path_util.h create mode 100644 src/kudu/util/pb_util-internal.cc create mode 100644 src/kudu/util/pb_util-internal.h create mode 100644 src/kudu/util/pb_util-test.cc create mode 100644 src/kudu/util/pb_util.cc create mode 100644 src/kudu/util/pb_util.h create mode 100644 src/kudu/util/pb_util.proto create mode 100644 src/kudu/util/promise.h create mode 100644 src/kudu/util/proto_container_test.proto create mode 100644 src/kudu/util/proto_container_test2.proto create mode 100644 src/kudu/util/proto_container_test3.proto create mode 
100644 src/kudu/util/protobuf-annotations.h create mode 100644 src/kudu/util/protobuf_util.h create mode 100644 src/kudu/util/protoc-gen-insertions.cc create mode 100644 src/kudu/util/pstack_watcher-test.cc create mode 100644 src/kudu/util/pstack_watcher.cc create mode 100644 src/kudu/util/pstack_watcher.h create mode 100644 src/kudu/util/random-test.cc create mode 100644 src/kudu/util/random.h create mode 100644 src/kudu/util/random_util-test.cc create mode 100644 src/kudu/util/random_util.cc create mode 100644 src/kudu/util/random_util.h create mode 100644 src/kudu/util/resettable_heartbeater-test.cc create mode 100644 src/kudu/util/resettable_heartbeater.cc create mode 100644 src/kudu/util/resettable_heartbeater.h create mode 100644 src/kudu/util/rle-encoding.h create mode 100644 src/kudu/util/rle-test.cc create mode 100644 src/kudu/util/rolling_log-test.cc create mode 100644 src/kudu/util/rolling_log.cc create mode 100644 src/kudu/util/rolling_log.h create mode 100644 src/kudu/util/rw_semaphore-test.cc create mode 100644 src/kudu/util/rw_semaphore.h create mode 100644 src/kudu/util/rwc_lock-test.cc create mode 100644 src/kudu/util/rwc_lock.cc create mode 100644 src/kudu/util/rwc_lock.h create mode 100644 src/kudu/util/safe_math-test.cc create mode 100644 src/kudu/util/safe_math.h create mode 100644 src/kudu/util/semaphore.cc create mode 100644 src/kudu/util/semaphore.h create mode 100644 src/kudu/util/semaphore_macosx.cc create mode 100644 src/kudu/util/slice-test.cc create mode 100644 src/kudu/util/slice.cc create mode 100644 src/kudu/util/slice.h create mode 100644 src/kudu/util/spinlock_profiling-test.cc create mode 100644 src/kudu/util/spinlock_profiling.cc create mode 100644 src/kudu/util/spinlock_profiling.h create mode 100644 src/kudu/util/stack_watchdog-test.cc create mode 100644 src/kudu/util/status-test.cc create mode 100644 src/kudu/util/status.cc create mode 100644 src/kudu/util/status.h create mode 100644 src/kudu/util/status_callback.cc create 
mode 100644 src/kudu/util/status_callback.h create mode 100644 src/kudu/util/stopwatch.h create mode 100644 src/kudu/util/string_case-test.cc create mode 100644 src/kudu/util/string_case.cc create mode 100644 src/kudu/util/string_case.h create mode 100644 src/kudu/util/striped64-test.cc create mode 100644 src/kudu/util/striped64.cc create mode 100644 src/kudu/util/striped64.h create mode 100644 src/kudu/util/subprocess-test.cc create mode 100644 src/kudu/util/subprocess.cc create mode 100644 src/kudu/util/subprocess.h create mode 100644 src/kudu/util/sync_point-test.cc create mode 100644 src/kudu/util/sync_point.cc create mode 100644 src/kudu/util/sync_point.h create mode 100644 src/kudu/util/test_graph.cc create mode 100644 src/kudu/util/test_graph.h create mode 100644 src/kudu/util/test_macros.h create mode 100644 src/kudu/util/test_main.cc create mode 100644 src/kudu/util/test_util.cc create mode 100644 src/kudu/util/test_util.h create mode 100644 src/kudu/util/thread-test.cc create mode 100644 src/kudu/util/thread.cc create mode 100644 src/kudu/util/thread.h create mode 100644 src/kudu/util/thread_restrictions.cc create mode 100644 src/kudu/util/thread_restrictions.h create mode 100644 src/kudu/util/threadlocal.cc create mode 100644 src/kudu/util/threadlocal.h create mode 100644 src/kudu/util/threadpool-test.cc create mode 100644 src/kudu/util/threadpool.cc create mode 100644 src/kudu/util/threadpool.h create mode 100644 src/kudu/util/trace-test.cc create mode 100644 src/kudu/util/trace.cc create mode 100644 src/kudu/util/trace.h create mode 100644 src/kudu/util/url-coding-test.cc create mode 100644 src/kudu/util/url-coding.cc create mode 100644 src/kudu/util/url-coding.h create mode 100644 src/kudu/util/user-test.cc create mode 100644 src/kudu/util/user.cc create mode 100644 src/kudu/util/user.h create mode 100644 src/kudu/util/version_info.cc create mode 100644 src/kudu/util/version_info.h create mode 100644 src/kudu/util/version_info.proto create mode 100644 
src/kudu/util/web_callback_registry.h create mode 100644 thirdparty/.gitignore create mode 100644 thirdparty/LICENSE.txt create mode 100644 thirdparty/README.txt create mode 100644 thirdparty/boost_uuid/boost/uuid/name_generator.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/nil_generator.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/random_generator.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/seed_rng.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/sha1.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/string_generator.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/uuid.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/uuid_generators.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/uuid_io.hpp create mode 100644 thirdparty/boost_uuid/boost/uuid/uuid_serialize.hpp create mode 100644 thirdparty/build-definitions.sh create mode 100755 thirdparty/build-if-necessary.sh create mode 100755 thirdparty/build-thirdparty.sh create mode 100755 thirdparty/download-thirdparty.sh create mode 100644 thirdparty/patches/crcutil-fix-libtoolize-on-osx.patch create mode 100644 thirdparty/patches/glog-issue-198-fix-unused-warnings.patch create mode 100644 thirdparty/patches/gperftools-Change-default-TCMALLOC_TRANSFER_NUM_OBJ-to-40.patch create mode 100644 thirdparty/patches/gperftools-hook-mi_force_unlock-on-OSX-instead-of-pthread_atfork.patch create mode 100644 thirdparty/patches/libstdcxx-fix-string-dtor.patch create mode 100644 thirdparty/patches/libstdcxx-fix-tr1-shared-ptr.patch create mode 100644 thirdparty/patches/llvm-devtoolset-toolchain.patch create mode 100644 thirdparty/patches/llvm-fix-amazon-linux.patch create mode 100644 thirdparty/patches/lz4-0001-Fix-cmake-build-to-use-gnu-flags-on-clang.patch create mode 100644 thirdparty/vars.sh create mode 100644 version.txt create mode 100644 www/bootstrap/css/bootstrap-responsive.css create mode 100644 www/bootstrap/css/bootstrap-responsive.min.css create mode 
100644 www/bootstrap/css/bootstrap.css create mode 100644 www/bootstrap/css/bootstrap.min.css create mode 100644 www/bootstrap/img/glyphicons-halflings-white.png create mode 100644 www/bootstrap/img/glyphicons-halflings.png create mode 100644 www/bootstrap/js/bootstrap.js create mode 100644 www/bootstrap/js/bootstrap.min.js create mode 100644 www/d3.v2.js create mode 100644 www/epoch.0.5.2.min.css create mode 100644 www/epoch.0.5.2.min.js create mode 100644 www/favicon.ico create mode 100644 www/index.html create mode 100644 www/jquery-1.11.1.min.js create mode 100644 www/kudu.css create mode 100644 www/logo.png create mode 100644 www/metrics-epoch.js create mode 100644 www/metrics.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000000..a55a3514a287 --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +# Most common in-tree build directory. +# +# Note: build output files are not explicitly listed here because they are +# expected to fall within in the build directory (and indeed, if they're +# outside for some reason, we want to know as it's likely an error). +build/ + +perf.data +perf.data.old +oprofile_data +.ycm_extra_conf.pyc +*.kdev4 +.kdev4/ +.metadata/ +*.iml + +# VIM/emacs stuff +*.swp +*~ + +# IDE cruft +.settings/ +.idea/ +nbproject/ +.project +.cproject +.csettings/ + +# WWW dependencies which are not checked in directly +www/tracing.* + +# Python +python/kudu/*.so +python/kudu/*.cpp +*.py[ocd] diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py new file mode 100644 index 000000000000..7fdc91536168 --- /dev/null +++ b/.ycm_extra_conf.py @@ -0,0 +1,168 @@ +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. 
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# For more information, please refer to + +# This is a configuration file for YouCompleteMe (YCM), a Vim extension for +# navigation and code completion with C++ and other languages. +# +# To make YCM work with Kudu, add your Kudu source directory to the +# g:ycm_extra_conf_globlist variable in your .vimrc file. For details on how to +# install and configure YouCompleteMe, see +# https://github.com/Valloric/YouCompleteMe +# +# This file is based on the example configuration file from YouCompleteMe. + +import os +import ycm_core + +# These are the compilation flags that will be used in case there's no +# compilation database set (by default, one is not set). +# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR. 
+flags = [ +'-x', +'c++', +'-DKUDU_HEADERS_NO_STUBS=1', +'-DKUDU_HEADERS_USE_RICH_SLICE=1', +'-DKUDU_HEADERS_USE_SHORT_STATUS_MACROS=1', +'-DKUDU_STATIC_DEFINE', +'-Dintegration_tests_EXPORTS', +'-D_GLIBCXX_USE_CXX11_ABI=0', +'-D__STDC_FORMAT_MACROS', +'-fno-strict-aliasing', +'-msse4.2', +'-Wall', +'-Wno-sign-compare', +'-Wno-deprecated', +'-pthread', +'-ggdb', +'-Qunused-arguments', +'-Wno-ambiguous-member-template', +'-std=c++11', +'-g', +'-fPIC', +'-I', +'src', +'-I', +'build/latest/src', +'-isystem', +'thirdparty/installed/include', +'-isystem', +'thirdparty/installed-deps/include', +] + +# Set this to the absolute path to the folder (NOT the file!) containing the +# compile_commands.json file to use that instead of 'flags'. See here for +# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html +# +# You can get CMake to generate this file for you by adding: +# set( CMAKE_EXPORT_COMPILE_COMMANDS 1 ) +# to your CMakeLists.txt file. +# +# Most projects will NOT need to set this to anything; you can just change the +# 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
+compilation_database_folder = '' + +if os.path.exists( compilation_database_folder ): + database = ycm_core.CompilationDatabase( compilation_database_folder ) +else: + database = None + +SOURCE_EXTENSIONS = [ '.cpp', '.cxx', '.cc', '.c', '.m', '.mm' ] + +def DirectoryOfThisScript(): + return os.path.dirname( os.path.abspath( __file__ ) ) + + +def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): + if not working_directory: + return list( flags ) + new_flags = [] + make_next_absolute = False + path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith( '/' ): + new_flag = os.path.join( working_directory, flag ) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith( path_flag ): + path = flag[ len( path_flag ): ] + new_flag = path_flag + os.path.join( working_directory, path ) + break + + if new_flag: + new_flags.append( new_flag ) + return new_flags + + +def IsHeaderFile( filename ): + extension = os.path.splitext( filename )[ 1 ] + return extension in [ '.h', '.hxx', '.hpp', '.hh' ] + + +def GetCompilationInfoForFile( filename ): + # The compilation_commands.json file generated by CMake does not have entries + # for header files. So we do our best by asking the db for flags for a + # corresponding source file, if any. If one exists, the flags for that file + # should be good enough. 
+ if IsHeaderFile( filename ): + basename = os.path.splitext( filename )[ 0 ] + for extension in SOURCE_EXTENSIONS: + replacement_file = basename + extension + if os.path.exists( replacement_file ): + compilation_info = database.GetCompilationInfoForFile( + replacement_file ) + if compilation_info.compiler_flags_: + return compilation_info + return None + return database.GetCompilationInfoForFile( filename ) + + +def FlagsForFile( filename, **kwargs ): + if database: + # Bear in mind that compilation_info.compiler_flags_ does NOT return a + # python list, but a "list-like" StringVec object + compilation_info = GetCompilationInfoForFile( filename ) + if not compilation_info: + return None + + final_flags = MakeRelativePathsInFlagsAbsolute( + compilation_info.compiler_flags_, + compilation_info.compiler_working_dir_ ) + + else: + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) + + return { + 'flags': final_flags, + 'do_cache': True + } diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000000..21100ccf4b4d --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,989 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Require cmake that supports BYPRODUCTS in add_custom_command() [1]. +# +# Note: cmake in thirdparty/ will always meet this minimum. +# +# 1. https://cmake.org/Bug/view.php?id=14963 +cmake_minimum_required(VERSION 3.2.0) + +# Prevent builds from the top-level source directory. This ensures that build +# output is well isolated from the source tree. +# +# May be overridden by setting KUDU_ALLOW_IN_SOURCE_BUILDS; this is only +# recommended for experts! +if("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}" AND + NOT "${KUDU_ALLOW_IN_SOURCE_BUILD}") + message(FATAL_ERROR + "Kudu may not be built from the top-level source directory. Create a new " + "directory and run cmake from there, passing the path to the top-level " + "source directory as the last argument. " + "To override this, rerun CMake with -DKUDU_ALLOW_IN_SOURCE_BUILD=1. " + "Also, delete 'CMakeCache.txt' and 'CMakeFiles' from the top-level source " + "directory, otherwise future builds will not work.") +endif() + +# Provide a 'latest' symlink to this build directory if the "blessed" +# multi-build layout is detected: +# +# build/ +# build/ +# build/ +# ... +if ("${CMAKE_CURRENT_BINARY_DIR}" MATCHES "^${CMAKE_CURRENT_SOURCE_DIR}/build/[^/]+$") + if ("${CMAKE_CURRENT_BINARY_DIR}" MATCHES "^${CMAKE_CURRENT_SOURCE_DIR}/build/latest") + message(FATAL_ERROR "Should not run cmake inside the build/latest symlink. " + "First change directories into the destination of the symlink.") + endif() + if (NOT APPLE) + set(MORE_ARGS "-T") + endif() + add_custom_target(latest_symlink ALL + ln ${MORE_ARGS} -sf ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/build/latest + COMMENT "Recreating ../build/latest symlink") + + # 'ALL' above doesn't actually add 'latest_symlink' as a dependency to all + # targets. So, we override add_executable to ensure that whenever any executable + # is built, the symlink is re-created. 
+ function(add_executable name) + # Call through to the original add_executable function. + _add_executable(${name} ${ARGN}) + add_dependencies(${name} latest_symlink) + endfunction() +endif() + +# TODO: can we somehow pass this into the java build? +file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/version.txt" KUDU_VERSION_NUMBER) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules") +include(CMakeParseArguments) + +set(BUILD_SUPPORT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/build-support) + +# Allow "make install" to not depend on all targets. +# +# Must be declared in the top-level CMakeLists.txt. +set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) + +# Make sure thirdparty stuff is up-to-date. +if ("$ENV{NO_REBUILD_THIRDPARTY}" STREQUAL "") + execute_process( + COMMAND ${CMAKE_SOURCE_DIR}/thirdparty/build-if-necessary.sh + RESULT_VARIABLE THIRDPARTY_SCRIPT_RESULT) + if (NOT (${THIRDPARTY_SCRIPT_RESULT} EQUAL 0)) + message(FATAL_ERROR "Thirdparty was built unsuccessfully, terminating.") + endif() +endif() + +# Generate a Clang compile_commands.json "compilation database" file for use +# with various development tools, such as Vim's YouCompleteMe plugin. +# See http://clang.llvm.org/docs/JSONCompilationDatabase.html +if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) +endif() + +############################################################ +# Compiler flags +############################################################ + +# compiler flags that are common across debug/release builds +# - msse4.2: Enable sse4.2 compiler intrinsics. +# - Wall: Enable all warnings. +# - Wno-sign-compare: suppress warnings for comparison between signed and unsigned +# integers +# -Wno-deprecated: some of the gutil code includes old things like ext/hash_set, ignore that +# - pthread: enable multithreaded malloc +# -fno-strict-aliasing +# Assume programs do not follow strict aliasing rules. 
+# GCC cannot always verify whether strict aliasing rules are indeed followed due to +# fundamental limitations in escape analysis, which can result in subtle bad code generation. +# This has a small perf hit but worth it to avoid hard to debug crashes. +set(CXX_COMMON_FLAGS "-fno-strict-aliasing -msse4.2 -Wall -Wno-sign-compare -Wno-deprecated -pthread") + +# We want access to the PRI* print format macros. +add_definitions(-D__STDC_FORMAT_MACROS) + +# Explicitly disable the new gcc5 ABI. Until clang supports abi tags [1], Kudu's +# generated code (which always uses clang) must be built against the old ABI. +# There's no recourse for using both ABIs in the same process; gcc's advice [2] +# is to build everything against the old ABI. +# +# 1. https://llvm.org/bugs/show_bug.cgi?id=23529 +# 2. https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + +# We want short macros from util/status.h. +add_definitions(-DKUDU_HEADERS_USE_SHORT_STATUS_MACROS=1) + +# Slice includes many gutil dependencies that third-party users of the Kudu +# client library don't have. Our build has them, though. +add_definitions(-DKUDU_HEADERS_USE_RICH_SLICE=1) + +# We don't want to use any stubs; that's exclusively for builds using our +# exported client headers). +add_definitions(-DKUDU_HEADERS_NO_STUBS=1) + +# compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE= .') +# For all builds: +# For CMAKE_BUILD_TYPE=Debug +# -ggdb: Enable gdb debugging +# For CMAKE_BUILD_TYPE=FastDebug +# Same as DEBUG, except with some optimizations on. +# For CMAKE_BUILD_TYPE=Release +# -O3: Enable all compiler optimizations +# -g: Enable symbols for profiler tools (TODO: remove for shipping) +# -DNDEBUG: Turn off dchecks/asserts/debug only code. +# -fno-omit-frame-pointer +# use frame pointers to allow simple stack frame walking for backtraces. 
+# This has a small perf hit but worth it for the ability to profile in production +# For profile guided optimization (PGO) builds, in addition to the flags for release builds: +# 1. Build first with CMAKE_BUILD_TYPE_PROFILE_GEN: +# -fprofile-generate: Indicates compiler should insert profile guided optimization events +# 2. Run the benchmarks (generates *.gcda profiling data). +# 3. Build again with CMAKE_BUILD_TYPE_PROFILE_BUILD +# -fprofile-use: Compiler will use the profile outputs for optimizations +set(CXX_FLAGS_DEBUG "-ggdb") +set(CXX_FLAGS_FASTDEBUG "-ggdb -O1 -fno-omit-frame-pointer") +set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG -fno-omit-frame-pointer") + +if (NOT "${KUDU_USE_LTO}" STREQUAL "") + set(CXX_FLAGS_RELEASE "${CXX_FLAGS_RELEASE} flto -fno-use-linker-plugin") +endif() + +set(CXX_FLAGS_PROFILE_GEN "${CXX_FLAGS_RELEASE} -fprofile-generate") +set(CXX_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") + +# if no build build type is specified, default to debug builds +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif(NOT CMAKE_BUILD_TYPE) + +string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) + +# Alias RELEASE as RELWITHDEBINFO and MINSIZEREL. These are common CMake +# release type names and this provides compatibility with the CLion IDE. +if ("${CMAKE_BUILD_TYPE}" STREQUAL "RELWITHDEBINFO" OR "${CMAKE_BUILD_TYPE}" STREQUAL "MINSIZEREL") + set(CMAKE_BUILD_TYPE RELEASE) +endif () + +# Set compile flags based on the build type. 
+message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_DEBUG}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_FASTDEBUG}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_RELEASE}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_GEN") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_PROFILE_GEN}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_BUILD") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_PROFILE_BUILD}) +else() + message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") +endif () + +# Add common flags +set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") + +# Determine compiler version +include(CompilerInfo) + +if ("${COMPILER_FAMILY}" STREQUAL "clang") + # Using Clang with ccache causes a bunch of spurious warnings that are + # purportedly fixed in the next version of ccache. See the following for details: + # + # http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html + # http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") + + # Clang generates ambiguous member template warnings when calling the ev++ api. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ambiguous-member-template") + + # Only hardcode -fcolor-diagnostics if stderr is opened on a terminal. Otherwise + # the color codes show up as noisy artifacts. + # + # This test is imperfect because 'cmake' and 'make' can be run independently + # (with different terminal options), and we're testing during the former. 
+ execute_process(COMMAND test -t 2 RESULT_VARIABLE KUDU_IS_TTY) + if ((${KUDU_IS_TTY} EQUAL 0) AND (NOT ("$ENV{TERM}" STREQUAL "dumb"))) + message("Running in a controlling terminal") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") + else() + message("Running without a controlling terminal or in a dumb terminal") + endif() +elseif("${COMPILER_FAMILY}" STREQUAL "gcc") + # Blacklist gcc versions known to generate broken optimized code. + # + # See KUDU-1030 for more details. + if ("${COMPILER_VERSION}" MATCHES "^4.[67]") + message(FATAL_ERROR "Building with gcc version ${COMPILER_VERSION} is " + "forbidden as it is known to produce broken code in release mode. " + "Upgrade gcc, or build with clang from the thirdparty directory.") + endif() +endif() + +# Sanity check linking option. +if (NOT KUDU_LINK) + set(KUDU_LINK "a") +elseif(NOT ("auto" MATCHES "^${KUDU_LINK}" OR + "dynamic" MATCHES "^${KUDU_LINK}" OR + "static" MATCHES "^${KUDU_LINK}")) + message(FATAL_ERROR "Unknown value for KUDU_LINK, must be auto|dynamic|static") +else() + # Remove all but the first letter. + string(SUBSTRING "${KUDU_LINK}" 0 1 KUDU_LINK) +endif() + +# If not set, any file that includes kudu_export.h (an autogenerated file) will +# use visibility("hidden") with symbols annotated with KUDU_NO_EXPORT, even when +# compiled with default visibility flags. It is overridden as needed by +# ADD_EXPORTABLE_LIBRARY() when actually compiling exported library variants. +add_definitions("-DKUDU_STATIC_DEFINE") + +# Clang does not support using ASAN and TSAN simultaneously. 
+if ("${KUDU_USE_ASAN}" AND "${KUDU_USE_TSAN}") + message(SEND_ERROR "Can only enable one of ASAN or TSAN at a time") +endif() + +# Flag to enable clang address sanitizer +# This will only build if clang or a recent enough gcc is the chosen compiler +if (${KUDU_USE_ASAN}) + if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") OR + ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.8"))) + message(SEND_ERROR "Cannot use ASAN without clang or gcc >= 4.8") + endif() + + # If UBSAN is also enabled, and we're on clang < 3.5, ensure static linking is + # enabled. Otherwise, we run into https://llvm.org/bugs/show_bug.cgi?id=18211 + if("${KUDU_USE_UBSAN}" AND + "${COMPILER_FAMILY}" STREQUAL "clang" AND + "${COMPILER_VERSION}" VERSION_LESS "3.5") + if("${KUDU_LINK}" STREQUAL "a") + message("Using static linking for ASAN+UBSAN build") + set(KUDU_LINK "s") + elseif("${KUDU_LINK}" STREQUAL "d") + message(SEND_ERROR "Cannot use dynamic linking when ASAN and UBSAN are both enabled") + endif() + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -DADDRESS_SANITIZER") +endif() + +# For any C code, use the same flags. +set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +# Flag to enable clang undefined behavior sanitizer +# We explicitly don't enable all of the sanitizer flags: +# - disable 'vptr' because it currently crashes somewhere in boost::intrusive::list code +# - disable 'alignment' because unaligned access is really OK on Nehalem and we do it +# all over the place. 
+if (${KUDU_USE_UBSAN}) + if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") OR + ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.9"))) + message(SEND_ERROR "Cannot use UBSAN without clang or gcc >= 4.9") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover") +endif () + +# Flag to enable thread sanitizer (clang or gcc 4.8) +if (${KUDU_USE_TSAN}) + if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") OR + ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.8"))) + message(SEND_ERROR "Cannot use TSAN without clang or gcc >= 4.8") + endif() + + add_definitions("-fsanitize=thread") + + # Enables dynamic_annotations.h to actually generate code + add_definitions("-DDYNAMIC_ANNOTATIONS_ENABLED") + + # changes atomicops to use the tsan implementations + add_definitions("-DTHREAD_SANITIZER") + + # Disables using the precompiled template specializations for std::string, shared_ptr, etc + # so that the annotations in the header actually take effect. + add_definitions("-D_GLIBCXX_EXTERN_TEMPLATE=0") + + # Compile and link against the thirdparty TSAN instrumented libstdcxx. + set(TSAN_GCC_DIR "${CMAKE_SOURCE_DIR}/thirdparty/installed-deps-tsan/gcc") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${TSAN_GCC_DIR}/lib -fsanitize=thread") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -nostdinc++ -L${TSAN_GCC_DIR}/lib") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${TSAN_GCC_DIR}/include/c++/4.9.3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${TSAN_GCC_DIR}/include/c++/4.9.3/backward") + + # Strictly speaking, TSAN doesn't require dynamic linking. But it does + # require all code to be position independent, and the easiest way to + # guarantee that is via dynamic linking (not all 3rd party archives are + # compiled with -fPIC e.g. boost). 
+ if("${KUDU_LINK}" STREQUAL "a") + message("Using dynamic linking for TSAN") + set(KUDU_LINK "d") + elseif("${KUDU_LINK}" STREQUAL "s") + message(SEND_ERROR "Cannot use TSAN with static linking") + endif() +endif() + + +if ("${KUDU_USE_UBSAN}" OR "${KUDU_USE_ASAN}" OR "${KUDU_USE_TSAN}") + # GCC 4.8 and 4.9 (latest as of this writing) don't allow you to specify a + # sanitizer blacklist. + if("${COMPILER_FAMILY}" STREQUAL "clang") + # Require clang 3.4 or newer; clang 3.3 has issues with TSAN and pthread + # symbol interception. + if("${COMPILER_VERSION}" VERSION_LESS "3.4") + message(SEND_ERROR "Must use clang 3.4 or newer to run a sanitizer build." + " Try using clang from thirdparty/") + endif() + add_definitions("-fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitize-blacklist.txt") + else() + message(WARNING "GCC does not support specifying a sanitizer blacklist. Known sanitizer check failures will not be suppressed.") + endif() +endif() + +# Code coverage +if ("${KUDU_GENERATE_COVERAGE}") + if("${CMAKE_CXX_COMPILER}" MATCHES ".*clang.*") + # There appears to be some bugs in clang 3.3 which cause code coverage + # to have link errors, not locating the llvm_gcda_* symbols. + # This should be fixed in llvm 3.4 with http://llvm.org/viewvc/llvm-project?view=revision&revision=184666 + message(SEND_ERROR "Cannot currently generate coverage with clang") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -DCOVERAGE_BUILD") + + # For coverage to work properly, we need to use static linkage. Otherwise, + # __gcov_flush() doesn't properly flush coverage from every module. 
+ # See http://stackoverflow.com/questions/28164543/using-gcov-flush-within-a-library-doesnt-force-the-other-modules-to-yield-gc + if("${KUDU_LINK}" STREQUAL "a") + message("Using static linking for coverage build") + set(KUDU_LINK "s") + elseif("${KUDU_LINK}" STREQUAL "d") + message(SEND_ERROR "Cannot use coverage with static linking") + endif() +endif() + +# If we still don't know what kind of linking to perform, choose based on +# build type (developers like fast builds). +if ("${KUDU_LINK}" STREQUAL "a") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG" OR + "${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") + message("Using dynamic linking for ${CMAKE_BUILD_TYPE} builds") + set(KUDU_LINK "d") + else() + message("Using static linking for ${CMAKE_BUILD_TYPE} builds") + set(KUDU_LINK "s") + endif() +endif() + +# Are we using the gold linker? It doesn't work with dynamic linking as +# weak symbols aren't properly overridden, causing tcmalloc to be omitted. +# Let's flag this as an error in RELEASE builds (we shouldn't release a +# product like this). +# +# See https://sourceware.org/bugzilla/show_bug.cgi?id=16979 for details. +# +# The gold linker is only for ELF binaries, which OSX doesn't use. We can +# just skip. +if (NOT APPLE) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -Wl,--version OUTPUT_VARIABLE LINKER_OUTPUT) +endif () +if (LINKER_OUTPUT MATCHES "gold") + if ("${KUDU_LINK}" STREQUAL "d" AND + "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + message(SEND_ERROR "Cannot use gold with dynamic linking in a RELEASE build " + "as it would cause tcmalloc symbols to get dropped") + else() + message("Using gold linker") + endif() + set(KUDU_USING_GOLD 1) +else() + message("Using ld linker") +endif() + +# Having set KUDU_LINK due to build type and/or sanitizer, it's now safe to +# act on its value. +if ("${KUDU_LINK}" STREQUAL "d") + set(BUILD_SHARED_LIBS ON) + + # Position independent code is only necessary when producing shared objects. 
+ add_definitions(-fPIC) +endif() + +# where to put generated archives (.a files) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/lib") +file(MAKE_DIRECTORY "${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}") + +# where to put generated libraries (.so files) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/lib") +file(MAKE_DIRECTORY "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}") + +# where to put generated binaries +set(EXECUTABLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/bin") +file(MAKE_DIRECTORY "${EXECUTABLE_OUTPUT_PATH}") + +include_directories(${CMAKE_CURRENT_BINARY_DIR}/src) +include_directories(src) + +############################################################ +# Visibility +############################################################ +# For generate_export_header() and add_compiler_export_flags(). +include(GenerateExportHeader) + +# add_library() wrapper that adds a second variant of the library for use in the +# exported Kudu C++ client. This variant is suffixed with "_exported" and is +# compiled with special visibility flags to hide all symbols except those that +# are part of the public ABI. +# +# There are two different kinds of exported libraries: internal and leaf. +# Internal libraries are static archives while leaf libraries are shared +# objects built from internal libraries. In practice there is only one leaf +# library: the Kudu C++ client itself. +# +# Arguments: +# +# LIB_NAME is the name of the library. It must come first. Required. +# +# SRCS is the list of source files to compile into the library. Required. +# +# DEPS is the list of targets that both library variants depend on. Required. +# +# NONLINK_DEPS is the list of (non-linked) targets that both library variants +# depend on. Optional. +# +# COMPILE_FLAGS is a string containing any additional compilation flags that +# should be added to both library variants. Optional. +# +# EXPORTED_SHARED is a toggle that, if set, indicates that the exported variant +# is a "leaf" library. 
Otherwise it is an "internal" library. Optional. +# +# EXPORTED_OUTPUT_NAME is a string describing a different file name for the +# exported library variant. If not set, defaults to LIB_NAME. Optional. +# +# EXPORTED_OUTPUT_DIRECTORY is a string describing a different directory where +# the exported library variant should be written. If not set, defaults to the +# directory where this function was called. Optional. +# +# EXPORTED_DEPS is a list of targets that the exported library variant depends +# on. If not set, defaults to DEPS. Optional. +function(ADD_EXPORTABLE_LIBRARY LIB_NAME) + # Parse the arguments. + set(options EXPORTED_SHARED) + set(one_value_args COMPILE_FLAGS EXPORTED_OUTPUT_NAME EXPORTED_OUTPUT_DIRECTORY) + set(multi_value_args SRCS DEPS EXPORTED_DEPS NONLINK_DEPS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + # First add the regular version of the library. It uses + # whatever linkage was defined globally. + add_library(${LIB_NAME} ${ARG_SRCS}) + if(ARG_COMPILE_FLAGS) + set_target_properties(${LIB_NAME} + PROPERTIES COMPILE_FLAGS ${ARG_COMPILE_FLAGS}) + endif() + target_link_libraries(${LIB_NAME} ${ARG_DEPS}) + if(ARG_NONLINK_DEPS) + add_dependencies(${LIB_NAME} ${ARG_NONLINK_DEPS}) + endif() + + # Now start setting up the exported variant. + set(EXPORTED_LIB_NAME ${LIB_NAME}_exported) + if(ARG_EXPORTED_SHARED) + # Leaf library. + set(EXPORTED_LINKAGE "SHARED") + set(EXPORTED_LINK_PRIVATE "LINK_PRIVATE") + else() + # Internal library. + set(EXPORTED_LINKAGE "STATIC") + set(EXPORTED_LINK_PRIVATE) + endif() + add_library(${EXPORTED_LIB_NAME} ${EXPORTED_LINKAGE} ${ARG_SRCS}) + + # Compile with visibility flags: + # - default for classes annotated with KUDU_EXPORT. + # - hidden for classes annotated with KUDU_NO_EXPORT. + # - hidden for everything else. 
+ add_compiler_export_flags(EXPORTED_FLAGS) + + # Exported variants are either static archives that will be linked to a shared + # object, or shared objects. Either way, -fPIC is needed. + if("${KUDU_LINK}" STREQUAL "s") + set(EXPORTED_FLAGS "${EXPORTED_FLAGS} -fPIC") + endif() + + # We need to remove some definitions previously added at directory scope. + # There doesn't appear to be a good way to do this in cmake, so we do it via + # the compiler with -U (e.g. "-UFOO" means "undefine the FOO definition"). + # Adding insult to injury, the COMPILE_DEFINITIONS property adds a -D prefix + # to anything passed into it, so we're forced to handle the removal via + # COMPILE_FLAGS, which, lucky for us, is emitted on the command line after + # COMPILE_DEFINITIONS. + + # Exported variants need KUDU_EXPORT definitions to take effect. + set(EXPORTED_FLAGS "${EXPORTED_FLAGS} -UKUDU_STATIC_DEFINE") + + # Exported variants may not use tcmalloc. + set(EXPORTED_FLAGS "${EXPORTED_FLAGS} -UTCMALLOC_ENABLED") + + set_target_properties(${EXPORTED_LIB_NAME} + PROPERTIES COMPILE_FLAGS "${ARG_COMPILE_FLAGS} ${EXPORTED_FLAGS}") + + # Handle EXPORTED_OUTPUT_NAME and EXPORTED_OUTPUT_DIRECTORY. + if(ARG_EXPORTED_OUTPUT_NAME) + set_target_properties(${EXPORTED_LIB_NAME} + PROPERTIES LIBRARY_OUTPUT_NAME ${ARG_EXPORTED_OUTPUT_NAME}) + endif() + if(ARG_EXPORTED_OUTPUT_DIRECTORY) + set_target_properties(${EXPORTED_LIB_NAME} + PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${ARG_EXPORTED_OUTPUT_DIRECTORY}) + endif() + + # Set up exported variant dependent targets. + # + # Every linked dependency is suffixed with "_exported". This is fine; the + # exported target graph is expected to be complete, and ADD_THIRDPARTY_LIB + # will provide an "exported variant" for each third party target. 
+ if(ARG_EXPORTED_DEPS) + set(EXPORTED_DEPS ${ARG_EXPORTED_DEPS}) + else() + set(EXPORTED_DEPS ${ARG_DEPS}) + endif() + foreach(DEP ${EXPORTED_DEPS}) + list(APPEND EXPORTED_SUFFIXED_DEPS "${DEP}_exported") + endforeach() + target_link_libraries(${EXPORTED_LIB_NAME} ${EXPORTED_LINK_PRIVATE} ${EXPORTED_SUFFIXED_DEPS}) + if(ARG_NONLINK_DEPS) + add_dependencies(${EXPORTED_LIB_NAME} ${ARG_NONLINK_DEPS}) + endif() +endfunction() + +############################################################ +# Testing +############################################################ + +# Add a new test case, with or without an executable that should be built. +# +# REL_TEST_NAME is the name of the test. It may be a single component +# (e.g. monotime-test) or contain additional components (e.g. +# net/net_util-test). Either way, the last component must be a globally +# unique name. +# +# Arguments after the test name will be passed to set_tests_properties(). +function(ADD_KUDU_TEST REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}.cc) + # This test has a corresponding .cc file, set it up as an executable. + set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") + add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc") + target_link_libraries(${TEST_NAME} ${KUDU_TEST_LINK_LIBS}) + else() + # No executable, just invoke the test (probably a script) directly. + get_filename_component(TEST_NAME_WITH_EXT ${REL_TEST_NAME} NAME) + set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME_WITH_EXT}") + + # Ideally this would run only when the test is built, not when cmake runs, + # but add_test() doesn't yield a target (if it did, that target could depend + # on an add_custom_command() that copies the test file into place). 
+ execute_process(COMMAND ln -sf ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME} + ${EXECUTABLE_OUTPUT_PATH}) + endif() + + add_test(${TEST_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH}) + if(ARGN) + set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) + endif() +endfunction() + +# A wrapper for add_dependencies() that is compatible with NO_TESTS. +function(ADD_KUDU_TEST_DEPENDENCIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + add_dependencies(${TEST_NAME} ${ARGN}) +endfunction() + +enable_testing() + +############################################################ +# Dependencies +############################################################ +function(ADD_THIRDPARTY_LIB LIB_NAME) + set(options) + set(one_value_args SHARED_LIB STATIC_LIB) + set(multi_value_args DEPS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(("${KUDU_LINK}" STREQUAL "s" AND ARG_STATIC_LIB) OR (NOT ARG_SHARED_LIB)) + if(NOT ARG_STATIC_LIB) + message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") + endif() + add_library(${LIB_NAME} STATIC IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + else() + add_library(${LIB_NAME} SHARED IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + endif() + + if(ARG_DEPS) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") + endif() + + # Set up an "exported variant" for this thirdparty library (see "Visibility" + # above). It's the same as the real target, just with an "_exported" suffix. 
+ # We prefer the static archive if it exists (as it's akin to an "internal" + # library), but we'll settle for the shared object if we must. + # + # A shared object exported variant will force any "leaf" library that + # transitively depends on it to also depend on it at runtime; this is + # desirable for some libraries (e.g. cyrus_sasl). + set(LIB_NAME_EXPORTED ${LIB_NAME}_exported) + if(ARG_STATIC_LIB) + add_library(${LIB_NAME_EXPORTED} STATIC IMPORTED) + set_target_properties(${LIB_NAME_EXPORTED} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + else() + add_library(${LIB_NAME_EXPORTED} SHARED IMPORTED) + set_target_properties(${LIB_NAME_EXPORTED} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + endif() + if(ARG_DEPS) + set_target_properties(${LIB_NAME_EXPORTED} + PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") + endif() +endfunction() + +# Look in thirdparty prefix paths before anywhere else for system dependencies. +set(THIRDPARTY_PREFIX ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/installed) +set(CMAKE_PREFIX_PATH ${THIRDPARTY_PREFIX} ${CMAKE_PREFIX_PATH}) +if (${KUDU_USE_TSAN}) + set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/installed-deps-tsan + ${CMAKE_PREFIX_PATH}) +else() + set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/installed-deps + ${CMAKE_PREFIX_PATH}) +endif() + +## Cyrus SASL +find_package(CyrusSASL REQUIRED) +include_directories(SYSTEM ${CYRUS_SASL_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(cyrus_sasl + SHARED_LIB "${CYRUS_SASL_SHARED_LIB}" + DEPS ${CYRUS_SASL_LIB_DEPS}) + +## GLog +find_package(GLog REQUIRED) +include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(glog + STATIC_LIB "${GLOG_STATIC_LIB}" + SHARED_LIB "${GLOG_SHARED_LIB}") +list(APPEND KUDU_BASE_LIBS glog) + +## libunwind (dependent of glog) +## Doesn't build on OSX. 
+if (NOT APPLE) + find_package(LibUnwind REQUIRED) + include_directories(SYSTEM ${UNWIND_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(unwind + STATIC_LIB "${UNWIND_STATIC_LIB}" + SHARED_LIB "${UNWIND_SHARED_LIB}") + list(APPEND KUDU_BASE_LIBS unwind) +endif() + +## GFlags +find_package(GFlags REQUIRED) +include_directories(SYSTEM ${GFLAGS_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(gflags + STATIC_LIB "${GFLAGS_STATIC_LIB}" + SHARED_LIB "${GFLAGS_SHARED_LIB}") +list(APPEND KUDU_BASE_LIBS gflags) + +## GMock +find_package(GMock REQUIRED) +include_directories(SYSTEM ${GMOCK_INCLUDE_DIR} ${GTEST_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(gmock + STATIC_LIB ${GMOCK_STATIC_LIBRARY} + SHARED_LIB ${GMOCK_SHARED_LIBRARY}) + +## Protobuf +find_package(Protobuf REQUIRED) +include_directories(SYSTEM ${PROTOBUF_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(protobuf + STATIC_LIB "${PROTOBUF_STATIC_LIBRARY}" + SHARED_LIB "${PROTOBUF_SHARED_LIBRARY}") +ADD_THIRDPARTY_LIB(protoc + STATIC_LIB "${PROTOBUF_PROTOC_STATIC_LIBRARY}" + SHARED_LIB "${PROTOBUF_PROTOC_LIBRARY}" + DEPS protobuf) +find_package(KRPC REQUIRED) + +## Snappy +find_package(Snappy REQUIRED) +include_directories(SYSTEM ${SNAPPY_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(snappy + STATIC_LIB "${SNAPPY_STATIC_LIB}" + SHARED_LIB "${SNAPPY_SHARED_LIB}") + +## Libev +find_package(LibEv REQUIRED) +include_directories(SYSTEM ${LIBEV_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(libev + STATIC_LIB "${LIBEV_STATIC_LIB}" + SHARED_LIB "${LIBEV_SHARED_LIB}") + +## LZ4 +find_package(Lz4 REQUIRED) +include_directories(SYSTEM ${LZ4_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(lz4 STATIC_LIB "${LZ4_STATIC_LIB}") + +## Bitshuffle +find_package(Bitshuffle REQUIRED) +include_directories(SYSTEM ${BITSHUFFLE_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(bitshuffle STATIC_LIB "${BITSHUFFLE_STATIC_LIB}") + +## ZLib +find_package(Zlib REQUIRED) +include_directories(SYSTEM ${ZLIB_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(zlib + STATIC_LIB "${ZLIB_STATIC_LIB}" + SHARED_LIB "${ZLIB_SHARED_LIB}") + +## Squeasel +find_package(Squeasel 
REQUIRED) +include_directories(SYSTEM ${SQUEASEL_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(squeasel + STATIC_LIB "${SQUEASEL_STATIC_LIB}") + +## Google PerfTools +## +## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment +## near definition of KUDU_USING_GOLD). +find_package(GPerf REQUIRED) +if (NOT "${KUDU_USE_ASAN}" AND + NOT "${KUDU_USE_TSAN}" AND + NOT ("${KUDU_USING_GOLD}" AND "${KUDU_LINK}" STREQUAL "d")) + ADD_THIRDPARTY_LIB(tcmalloc + STATIC_LIB "${TCMALLOC_STATIC_LIB}" + SHARED_LIB "${TCMALLOC_SHARED_LIB}") + ADD_THIRDPARTY_LIB(profiler + STATIC_LIB "${PROFILER_STATIC_LIB}" + SHARED_LIB "${PROFILER_SHARED_LIB}") + list(APPEND KUDU_BASE_LIBS tcmalloc profiler) + add_definitions("-DTCMALLOC_ENABLED") + set(KUDU_TCMALLOC_AVAILABLE 1) +endif() + +# libvmem +if (NOT APPLE) + find_package(Vmem REQUIRED) + include_directories(SYSTEM ${VMEM_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(vmem + STATIC_LIB "${VMEM_STATIC_LIB}" + SHARED_LIB "${VMEM_SHARED_LIB}") +endif() + +## curl +find_package(CURL REQUIRED) + +## crcutil +find_package(Crcutil REQUIRED) +include_directories(SYSTEM ${CRCUTIL_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(crcutil + STATIC_LIB "${CRCUTIL_STATIC_LIB}" + SHARED_LIB "${CRCUTIL_SHARED_LIB}") + +## llvm +# Note that llvm has a unique cmake setup. See kudu/codegen/CMakeLists.txt +# for details. 
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${THIRDPARTY_PREFIX}/share/llvm) +find_package(LLVM REQUIRED CONFIG) +if(${LLVM_PACKAGE_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "LLVM version (${LLVM_PACKAGE_VERSION}) must be at least 3.4") +endif() +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") + +## librt +if (NOT APPLE) + find_library(RT_LIB_PATH rt) + if(NOT RT_LIB_PATH) + message(FATAL_ERROR "Could not find librt on the system path") + endif() + ADD_THIRDPARTY_LIB(rt + SHARED_LIB "${RT_LIB_PATH}") +endif() + +## Boost +# It's important that Boost come last in the list of packages, because it's the only +# dependency that we don't currently bundle in thirdparty/. If we put this earlier +# in list, then we might end up with /usr/local/include taking precedence over +# thirdparty/installed/include and pulling in the wrong version of other dependencies +# that might be installed in the system. + +### Workaround for http://stackoverflow.com/questions/9948375/cmake-find-package-succeeds-but-returns-wrong-path +set(Boost_NO_BOOST_CMAKE ON) + +# Find Boost static libraries. +set(Boost_USE_STATIC_LIBS ON) +find_package(Boost COMPONENTS system thread REQUIRED) +set(BOOST_STATIC_LIBS ${Boost_LIBRARIES}) +list(LENGTH BOOST_STATIC_LIBS BOOST_STATIC_LIBS_LEN) +list(SORT BOOST_STATIC_LIBS) + +# Find Boost shared libraries. +set(Boost_USE_STATIC_LIBS OFF) +find_package(Boost COMPONENTS system thread REQUIRED) +set(BOOST_SHARED_LIBS ${Boost_LIBRARIES}) +list(LENGTH BOOST_SHARED_LIBS BOOST_SHARED_LIBS_LEN) +list(SORT BOOST_SHARED_LIBS) + +# We should have found the same number of libraries both times. 
+if(NOT ${BOOST_SHARED_LIBS_LEN} EQUAL ${BOOST_STATIC_LIBS_LEN}) + set(ERROR_MSG "Boost static and shared libraries are inconsistent.") + set(ERROR_MSG "${ERROR_MSG} Static libraries: ${BOOST_STATIC_LIBS}.") + set(ERROR_MSG "${ERROR_MSG} Shared libraries: ${BOOST_SHARED_LIBS}.") + message(FATAL_ERROR "${ERROR_MSG}") +endif() + +# Add each pair of static/shared libraries. +math(EXPR LAST_IDX "${BOOST_STATIC_LIBS_LEN} - 1") +foreach(IDX RANGE ${LAST_IDX}) + list(GET BOOST_STATIC_LIBS ${IDX} BOOST_STATIC_LIB) + list(GET BOOST_SHARED_LIBS ${IDX} BOOST_SHARED_LIB) + + # Remove the prefix/suffix from the library name. + # + # e.g. libboost_system-mt --> boost_system + get_filename_component(LIB_NAME ${BOOST_STATIC_LIB} NAME_WE) + string(REGEX REPLACE "lib([^-]*)(-mt)?" "\\1" LIB_NAME_NO_PREFIX_SUFFIX ${LIB_NAME}) + ADD_THIRDPARTY_LIB(${LIB_NAME_NO_PREFIX_SUFFIX} + STATIC_LIB "${BOOST_STATIC_LIB}" + SHARED_LIB "${BOOST_SHARED_LIB}") + list(APPEND KUDU_BASE_LIBS ${LIB_NAME_NO_PREFIX_SUFFIX}) +endforeach() +include_directories(SYSTEM ${Boost_INCLUDE_DIR}) + + +############################################################ +# Linker setup +############################################################ +set(KUDU_MIN_TEST_LIBS kudu_test_main kudu_test_util ${KUDU_BASE_LIBS}) +set(KUDU_TEST_LINK_LIBS ${KUDU_MIN_TEST_LIBS}) + +############################################################ +# "make ctags" target +############################################################ +if (UNIX) + add_custom_target(ctags ctags --languages=c++,c -L + `find ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_BINARY_DIR}/src`) +endif (UNIX) + +############################################################ +# "make etags" target +# +# Requires the exuberant-ctags system package. 
+############################################################ +if (UNIX) + add_custom_target(etags etags --members --declarations + `find ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_BINARY_DIR}/src + -name \\*.cc -or -name \\*.hh -or -name \\*.cpp -or + -name \\*.h -or -name \\*.c`) +endif (UNIX) + +############################################################ +# "make cscope" target +############################################################ +if (UNIX) + add_custom_target(cscope + find ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_BINARY_DIR}/src + -name \\*.cc -or -name \\*.hh -or -name \\*.cpp -or + -name \\*.h -or -name \\*.c + > cscope.files && cscope -q -b) +endif (UNIX) + +############################################################ +# "make lint" target +############################################################ +if (UNIX) + # Full lint + add_custom_target(lint ${BUILD_SUPPORT_DIR}/lint.sh) + # Incremental lint - only checks files changed since the last + # merged upstream commit + add_custom_target(ilint ${BUILD_SUPPORT_DIR}/lint.sh -c) +endif (UNIX) + +############################################################ +# "make docs" target +############################################################ +if (UNIX) + add_custom_target(docs + # The docs output HTML will end up in a docs/ subdir. + ${CMAKE_CURRENT_SOURCE_DIR}/docs/support/scripts/make_docs.sh + --build_root ${CMAKE_CURRENT_BINARY_DIR}) +endif (UNIX) + +############################################################ +# "make site" target +############################################################ +if (UNIX) + add_custom_target(site + ${CMAKE_CURRENT_SOURCE_DIR}/docs/support/scripts/make_site.sh) +endif (UNIX) + +############################################################ +# Subdirectories +############################################################ + +# Google util libraries borrowed from supersonic, tcmalloc, Chromium, etc. 
+add_subdirectory(src/kudu/gutil) +add_subdirectory(src/kudu/util) +add_subdirectory(src/kudu/common) +add_subdirectory(src/kudu/cfile) +add_subdirectory(src/kudu/fs) +add_subdirectory(src/kudu/server) +add_subdirectory(src/kudu/tablet) +add_subdirectory(src/kudu/rpc) +add_subdirectory(src/kudu/tserver) +add_subdirectory(src/kudu/consensus) +add_subdirectory(src/kudu/master) +add_subdirectory(src/kudu/client) +add_subdirectory(src/kudu/integration-tests) +add_subdirectory(src/kudu/experiments) +add_subdirectory(src/kudu/benchmarks) +add_subdirectory(src/kudu/twitter-demo) +add_subdirectory(src/kudu/tools) +add_subdirectory(src/kudu/codegen) diff --git a/CONTRIBUTING.adoc b/CONTRIBUTING.adoc new file mode 100644 index 000000000000..5bd92860e1e6 --- /dev/null +++ b/CONTRIBUTING.adoc @@ -0,0 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +Please read the instructions for http://getkudu.io/docs/contributing.html[how to contribute to Kudu]. diff --git a/DISCLAIMER b/DISCLAIMER new file mode 100644 index 000000000000..2fc823b090ec --- /dev/null +++ b/DISCLAIMER @@ -0,0 +1,8 @@ +Apache Kudu is an effort undergoing incubation at The Apache Software +Foundation (ASF), sponsored by the Apache Incubator PMC. 
Incubation is +required of all newly accepted projects until a further review +indicates that the infrastructure, communications, and decision making +process have stabilized in a manner consistent with other successful +ASF projects. While incubation status is not necessarily a reflection +of the completeness or stability of the code, it does indicate that +the project has yet to be fully endorsed by the ASF. diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 000000000000..9a41fdfeec12 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,664 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- +src/kudu/gutil (some portions): Apache 2.0, and 3-clause BSD + +This module is derived from code in the Chromium project, copyright +(c) Google inc and (c) The Chromium Authors and licensed under the +Apache 2.0 License or the under the 3-clause BSD license: + + Copyright (c) 2013 The Chromium Authors. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +src/kudu/gutil/utf: licensed under the following terms: + + UTF-8 Library + + The authors of this software are Rob Pike and Ken Thompson. + Copyright (c) 1998-2002 by Lucent Technologies. + Permission to use, copy, modify, and distribute this software for any + purpose without fee is hereby granted, provided that this entire notice + is included in all copies of any software which is or includes a copy + or modification of this software and in all copies of the supporting + documentation for such software. + THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + + +-------------------------------------------------------------------------------- + +src/kudu/gutil/valgrind.h: Hybrid BSD (half BSD, half zlib) + + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2000-2008 Julian Seward. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +src/kudu/util (some portions): 3-clause BSD license + +Some portions of this module are derived from code from LevelDB +( https://github.com/google/leveldb ): + + Copyright (c) 2011 The LevelDB Authors. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +src/kudu/util (HdrHistogram-related classes): public domain + +Portions of these classes were ported from Java to C++ from the sources +available at https://github.com/HdrHistogram/HdrHistogram . + + The code in this repository code was Written by Gil Tene, Michael Barker, + and Matt Warren, and released to the public domain, as explained at + http://creativecommons.org/publicdomain/zero/1.0/ + +-------------------------------------------------------------------------------- + +src/kudu/util/random-util.cc: some portions adapted from WebRTC project +(modules/video_coding/main/test/test_util.cc) under a 3-clause BSD license. + + Copyright (c) 2011, The WebRTC project authors. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +src/kudu/util/sync_point: adapted from the RocksDB project under a 3-clause +BSD license with an additional grant of patent rights: + + Copyright (c) 2014, Facebook, Inc. + All rights reserved. + + Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Additional Grant of Patent Rights Version 2 + + "Software" means the RocksDB software distributed by Facebook, Inc. + + Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software + ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable + (subject to the termination provision below) license under any Necessary + Claims, to make, have made, use, sell, offer to sell, import, and otherwise + transfer the Software. 
For avoidance of doubt, no license is granted under + Facebook’s rights in any patent claims that are infringed by (i) modifications + to the Software made by you or any third party or (ii) the Software in + combination with any software or other technology. + + The license granted hereunder will terminate, automatically and without notice, + if you (or any of your subsidiaries, corporate affiliates or agents) initiate + directly or indirectly, or take a direct financial interest in, any Patent + Assertion: (i) against Facebook or any of its subsidiaries or corporate + affiliates, (ii) against any party if such Patent Assertion arises in whole or + in part from any software, technology, product or service of Facebook or any of + its subsidiaries or corporate affiliates, or (iii) against any party relating + to the Software. Notwithstanding the foregoing, if Facebook or any of its + subsidiaries or corporate affiliates files a lawsuit alleging patent + infringement against you in the first instance, and you respond by filing a + patent infringement counterclaim in that lawsuit against that party that is + unrelated to the Software, the license granted hereunder will not terminate + under section (i) of this paragraph due to such counterclaim. + + A "Necessary Claim" is a claim of a patent owned by Facebook that is + necessarily infringed by the Software standing alone. + + A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, + or contributory infringement or inducement to infringe any patent, including a + cross-claim or counterclaim. 
+ +-------------------------------------------------------------------------------- + +src/kudu/server/url-coding.cc: some portions adapted from the Boost project +thirdparty/boost_uuid/: + + Boost Software License - Version 1.0 - August 17th, 2003 + + Permission is hereby granted, free of charge, to any person or organization + obtaining a copy of the software and accompanying documentation covered by + this license (the "Software") to use, reproduce, display, distribute, + execute, and transmit the Software, and to prepare derivative works of the + Software, and to permit third-parties to whom the Software is furnished to + do so, all subject to the following: + + The copyright notices in the Software and this entire statement, including + the above license grant, this restriction and the following disclaimer, + must be included in all copies of the Software, in whole or in part, and + all derivative works of the Software, unless such copies or derivative + works are solely in the form of machine-executable object code generated by + a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +www/bootstrap: Apache 2.0 license + + Copyright 2012 Twitter, Inc + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- + +www/d3.v2.js: BSD 3-clause license + + Copyright (c) 2012, Michael Bostock + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * The name Michael Bostock may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+-------------------------------------------------------------------------------- + +www/epoch.*: MIT license + + Copyright (c) 2014 Fastly, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +-------------------------------------------------------------------------------- + +www/jquery*.js: MIT license + + Copyright 2005, 2014 jQuery Foundation and other contributors, + https://jquery.org/ + + This software consists of voluntary contributions made by many + individuals. 
For exact contribution history, see the revision history + available at https://github.com/jquery/jquery + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +java/kudu-client/src/main/java/org/kududb/util/: Slice.java and Slices.java + + Derived from the LevelDB Java project at https://github.com/dain/leveldb/ + Licensed under the Apache 2.0 license with the following copyrights: + + Copyright 2011 Dain Sundstrom + Copyright 2011 FuseSource Corp. http://fusesource.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- + +java/kudu-client/: Some classes are derived from the AsyncHBase project +under the following 3-clause BSD license: + + Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + This file is part of Async HBase. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + - Neither the name of the StumbleUpon nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +.ycm_extra_conf.py: public domain + + This is free and unencumbered software released into the public domain. + + Anyone is free to copy, modify, publish, use, compile, sell, or + distribute this software, either in source code form or as a compiled + binary, for any purpose, commercial or non-commercial, and by any + means. + + In jurisdictions that recognize copyright laws, the author or authors + of this software dedicate any and all copyright interest in the + software to the public domain. We make this dedication for the benefit + of the public at large and to the detriment of our heirs and + successors. We intend this dedication to be an overt act of + relinquishment in perpetuity of all present and future rights to this + software under copyright law. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. 
+
+ For more information, please refer to <http://unlicense.org>
+
+================================================================================
+
+Note on thirdparty dependencies downloaded during the build process
+-------------------------------------------------------------------
+The Kudu build process downloads many thirdparty dependencies automatically.
+Many of these dependencies are statically linked into the resulting Kudu
+binaries. Please refer to thirdparty/LICENSE.txt for relevant information.
diff --git a/NOTICE.txt b/NOTICE.txt
new file mode 100644
index 000000000000..e346c66b85ef
--- /dev/null
+++ b/NOTICE.txt
@@ -0,0 +1,4 @@
+Kudu
+
+Portions of this software were developed at
+Cloudera, Inc (http://www.cloudera.com/).
diff --git a/README.adoc b/README.adoc
new file mode 100644
index 000000000000..bb83b38703b5
--- /dev/null
+++ b/README.adoc
@@ -0,0 +1,397 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+= Kudu Developer Documentation + +== Building and installing Kudu + +Follow the steps in the http://getkudu.io/docs/installation.html#_build_from_source[documentation] +to build and install Kudu from source + +=== Building Kudu out of tree + +A single Kudu source tree may be used for multiple builds, each with its +own build directory. Build directories may be placed anywhere in the +filesystem with the exception of the root directory of the source tree. The +Kudu build is invoked with a working directory of the build directory +itself, so you must ensure it exists (i.e. create it with _mkdir -p_). It's +recommended to place all build directories within the _build_ subdirectory; +_build/latest_ will be symlinked to most recently created one. + +The rest of this document assumes the build directory +_/build/debug_. + +=== Automatic rebuilding of dependencies + +The script `thirdparty/build-if-necessary.sh` is invoked by cmake, so +new thirdparty dependencies added by other developers will be downloaded +and built automatically in subsequent builds if necessary. + +To disable the automatic invocation of `build-if-necessary.sh`, set the +`NO_REBUILD_THIRDPARTY` environment variable: + +[source,bash] +---- +$ cd build/debug +$ NO_REBUILD_THIRDPARTY=1 cmake ../.. +---- + +This can be particularly useful when trying to run tools like `git bisect` +between two commits which may have different dependencies. + + +=== Building Kudu itself + +[source,bash] +---- +# Add /thirdparty/installed/bin to your $PATH +# before other parts of $PATH that may contain cmake, such as /usr/bin +# For example: "export PATH=$HOME/git/kudu/thirdparty/installed/bin:$PATH" +# if using bash. +$ mkdir -p build/debug +$ cd build/debug +$ cmake ../.. +$ make -j8 # or whatever level of parallelism your machine can handle +---- + +The build artifacts, including the test binaries, will be stored in +_build/debug/bin/_. 
+ +To omit the Kudu unit tests during the build, add -DNO_TESTS=1 to the +invocation of cmake. For example: + +[source,bash] +---- +$ cd build/debug +$ cmake -DNO_TESTS=1 ../.. +---- + +== Running unit/functional tests + +To run the Kudu unit tests, you can use the `ctest` command from within the +_build/debug_ directory: + +[source,bash] +---- +$ cd build/debug +$ ctest -j8 +---- + +This command will report any tests that failed, and the test logs will be +written to _build/debug/test-logs_. + +Individual tests can be run by directly invoking the test binaries in +_build/debug/bin_. Since Kudu uses the Google C++ Test Framework (gtest), +specific test cases can be run with gtest flags: + +[source,bash] +---- +# List all the tests within a test binary, then run a single test +$ build/debug/bin/tablet-test --gtest_list_tests +$ build/debug/bin/tablet-test --gtest_filter=TestTablet/9.TestFlush +---- + +gtest also allows more complex filtering patterns. See the upstream +documentation for more details. + +=== Running tests with the clang AddressSanitizer enabled + + +AddressSanitizer is a nice clang feature which can detect many types of memory +errors. The Jenkins setup for kudu runs these tests automatically on a regular +basis, but if you make large changes it can be a good idea to run it locally +before pushing. To do so, you'll need to build using `clang`: + +[source,bash] +---- +$ mkdir -p build/asan +$ cd build/asan +$ CC=../../thirdparty/clang-toolchain/bin/clang \ + CXX=../../thirdparty/clang-toolchain/bin/clang++ \ + cmake -DKUDU_USE_ASAN=1 ../.. +$ make -j8 +$ ctest -j8 +---- + +The tests will run significantly slower than without ASAN enabled, and if any +memory error occurs, the test that triggered it will fail. You can then use a +command like: + + +[source,bash] +---- +$ cd build/asan +$ ctest -R failing-test +---- + +to run just the failed test. 
+
+NOTE: For more information on AddressSanitizer, please see the
+http://clang.llvm.org/docs/AddressSanitizer.html[ASAN web page].
+
+=== Running tests with the clang Undefined Behavior Sanitizer (UBSAN) enabled
+
+
+Similar to the above, you can use a special set of clang flags to enable the Undefined
+Behavior Sanitizer. This will generate errors on certain pieces of code which may
+not themselves crash but rely on behavior which isn't defined by the C++ standard
+(and thus are likely bugs). To enable UBSAN, follow the same directions as for
+ASAN above, but pass the `-DKUDU_USE_UBSAN=1` flag to the `cmake` invocation.
+
+In order to get a stack trace from UBSan, you can use gdb on the failing test, and
+set a breakpoint as follows:
+
+----
+(gdb) b __ubsan::Diag::~Diag
+----
+
+Then, when the breakpoint fires, gather a backtrace as usual using the `bt` command.
+
+=== Running tests with ThreadSanitizer enabled
+
+ThreadSanitizer (TSAN) is a feature of recent Clang and GCC compilers which can
+detect improperly synchronized access to data along with many other threading
+bugs. To enable TSAN, pass `-DKUDU_USE_TSAN=1` to the `cmake` invocation,
+recompile, and run tests. For example:
+
+[source,bash]
+----
+$ mkdir -p build/tsan
+$ cd build/tsan
+$ CC=../../thirdparty/clang-toolchain/bin/clang \
+  CXX=../../thirdparty/clang-toolchain/bin/clang++ \
+  cmake -DKUDU_USE_TSAN=1 ../..
+$ make -j8
+$ ctest -j8
+----
+
+. Enabling TSAN suppressions while running tests
+[NOTE]
+====
+Note that we rely on a list of runtime suppressions in _build-support/tsan-suppressions.txt_.
+If you simply run a unit test like _build/tsan/bin/foo-test_, you won't get these suppressions.
+Instead, use a command like: + +[source,bash] +---- +$ ctest -R foo-test +---- + +or + +[source,bash] +---- +$ build-support/run-test.sh build/tsan/bin/foo-test [--test-arguments-here] +---- + + +...and then view the logs in _build/tsan/test-logs/_ + +==== + +TSAN may truncate a few lines of the stack trace when reporting where the error +is. This can be bewildering. It's documented for TSANv1 here: +http://code.google.com/p/data-race-test/wiki/ThreadSanitizerAlgorithm +It is not mentioned in the documentation for TSANv2, but has been observed. +In order to find out what is _really_ happening, set a breakpoint on the TSAN +report in GDB using the following incantation: + +[source,bash] +---- +$ gdb -ex 'set disable-randomization off' -ex 'b __tsan::PrintReport' ./some-test +---- + + +=== Generating code coverage reports + + +In order to generate a code coverage report, you must build with gcc (not clang) +and use the following flags: + +[source,bash] +---- +$ mkdir -p build/coverage +$ cd build/coverage +$ cmake -DKUDU_GENERATE_COVERAGE=1 ../.. +$ make -j4 +$ ctest -j4 +---- + +This will generate the code coverage files with extensions .gcno and .gcda. You can then +use a tool like `lcov` or `gcovr` to visualize the results. For example, using gcovr: + +[source,bash] +---- +$ mkdir cov_html +$ ./thirdparty/gcovr-3.0/scripts/gcovr -r src/ +---- + +Or using `lcov` (which seems to produce better HTML output): + +[source,bash] +---- +$ lcov --capture --directory src --output-file coverage.info +$ genhtml coverage.info --output-directory out +---- + +=== Running lint checks + + +Kudu uses cpplint.py from Google to enforce coding style guidelines. You can run the +lint checks via cmake using the `ilint` target: + +[source,bash] +---- +$ make ilint +---- + +This will scan any file which is dirty in your working tree, or changed since the last +gerrit-integrated upstream change in your git log. 
If you really want to do a full
+scan of the source tree, you may use the `lint` target instead.
+
+=== Building Kudu documentation
+
+Kudu's documentation is written in asciidoc and lives in the _docs_ subdirectory.
+
+To build the documentation (this is primarily useful if you would like to
+inspect your changes before submitting them to Gerrit), use the `docs` target:
+
+[source,bash]
+----
+$ make docs
+----
+
+This will invoke `docs/support/scripts/make_docs.sh`, which requires
+`asciidoctor` to process the doc sources and produce the HTML documentation,
+emitted to _build/docs_. This script requires `ruby` and `gem` to be installed
+on the system path, and will attempt to install `asciidoctor` and other related
+dependencies into `$HOME/.gems` using http://bundler.io/[bundler].
+
+=== Updating the documentation on the Kudu web site
+
+To update the documentation that is integrated into the Kudu web site,
+including Javadoc documentation, you may run the following command:
+
+[source,bash]
+----
+$ ./docs/support/scripts/make_site.sh
+----
+
+This script will use your local Git repository to check out a shallow clone of
+the 'gh-pages' branch and use `make_docs.sh` to generate the HTML documentation
+for the web site. It will also build the Javadoc documentation. These will be
+placed inside the checked-out web site, along with a tarball containing only
+the generated documentation (the _docs/_ and _apidocs/_ paths on the web site).
+Everything can be found in the _build/site_ subdirectory.
+
+You can proceed to commit the changes in the pages repository and send a code
+review for your changes. In the future, this step may be automated whenever
+changes are checked into the main Kudu repository.
+
+== Improving build times
+
+=== Caching build output
+
+The kudu build is compatible with ccache. Simply install your distro's _ccache_ package,
+prepend _/usr/lib/ccache_ to your `PATH`, and watch your object files get cached. 
Link +times won't be affected, but you will see a noticeable improvement in compilation +times. You may also want to increase the size of your cache using "ccache -M new_size". + +=== Improving linker speed + +One of the major time sinks in the Kudu build is linking. GNU ld is historically +quite slow at linking large C++ applications. The alternative linker `gold` is much +better at it. It's part of the `binutils` package in modern distros (try `binutils-gold` +in older ones). To enable it, simply repoint the _/usr/bin/ld_ symlink from `ld.bfd` to +`ld.gold`. + +Note that gold doesn't handle weak symbol overrides properly (see +https://sourceware.org/bugzilla/show_bug.cgi?id=16979[this bug report] for details). +As such, it cannot be used with shared objects (see below) because it'll cause +tcmalloc's alternative malloc implementation to be ignored. + +=== Building Kudu with dynamic linking + +Kudu can be built into shared objects, which, when used with ccache, can result in a +dramatic build time improvement in the steady state. Even after a `make clean` in the build +tree, all object files can be served from ccache. By default, `debug` and `fastdebug` will +use dynamic linking, while other build types will use static linking. To enable +dynamic linking explicitly, run: + +[source,bash] +---- +$ cmake -DKUDU_LINK=dynamic ../.. +---- + +Subsequent builds will create shared objects instead of archives and use them when +linking the kudu binaries and unit tests. The full range of options for `KUDU_LINK` are +`static`, `dynamic`, and `auto`. The default is `auto` and only the first letter +matters for the purpose of matching. + +NOTE: Dynamic linking is incompatible with ASAN and static linking is incompatible +with TSAN. + + +== Developing Kudu in Eclipse + +Eclipse can be used as an IDE for Kudu. 
To generate Eclipse project files, run:
+
+[source,bash]
+----
+$ mkdir -p <build-dir>
+$ cd <build-dir>
+$ rm -rf CMakeCache.txt CMakeFiles/
+$ cmake -G "Eclipse CDT4 - Unix Makefiles" -DCMAKE_CXX_COMPILER_ARG1=-std=c++11 <path-to-source-tree>
+----
+
+When the Eclipse generator is run in a subdirectory of the source tree, the
+resulting project is incomplete. That's why it's recommended to use a directory
+that's a sibling to the source tree. See [1] for more details.
+
+It's critical that _CMakeCache.txt_ be removed prior to running the generator,
+otherwise the extra Eclipse generator logic (the CMakeFindEclipseCDT4.make module)
+won't run and standard system includes will be missing from the generated project.
+
+Thanks to [2], the Eclipse generator ignores the `-std=c++11` definition and we must
+add it manually on the command line via `CMAKE_CXX_COMPILER_ARG1`.
+
+By default, the Eclipse CDT indexer will index everything under the _kudu/_
+source tree. It tends to choke on certain complicated source files within
+_thirdparty_. In CDT 8.7.0, the indexer will generate so many errors that it'll
+exit early, causing many spurious syntax errors to be highlighted. In older
+versions of CDT, it'll spin forever.
+
+Either way, these complicated source files must be excluded from indexing. To do
+this, right click on the project in the Project Explorer and select Properties. In
+the dialog box, select "C/C++ Project Paths", select the Source tab, highlight
+"Exclusion filter: (None)", and click "Edit...". In the new dialog box, click
+"Add Multiple...". Select every subdirectory inside _thirdparty_ except _installed_
+and _installed-deps_. Click OK all the way out and rebuild the project index by
+right clicking the project in the Project Explorer and selecting Index -> Rebuild.
+
+With this exclusion, the only false positives (shown as "red squigglies") that
+CDT presents appear to be in atomicops functions (`NoBarrier_CompareAndSwap` for
+example).
+ +Another Eclipse annoyance stems from the "[Targets]" linked resource that Eclipse +generates for each unit test. These are probably used for building within Eclipse, +but one side effect is that nearly every source file appears in the indexer twice: +once via a target and once via the raw source file. To fix this, simply delete the +[Targets] linked resource via the Project Explorer. Doing this should have no effect +on writing code, though it may affect your ability to build from within Eclipse. + +1. https://cmake.org/pipermail/cmake-developers/2011-November/014153.html +2. http://public.kitware.com/Bug/view.php?id=15102 diff --git a/build-support/build_source_release.py b/build-support/build_source_release.py new file mode 100755 index 000000000000..d6973d990f8c --- /dev/null +++ b/build-support/build_source_release.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+import hashlib
+import logging
+import os
+import subprocess
+import sys
+
+from kudu_util import check_output, confirm_prompt, Colors, get_my_email
+
+ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+GET_UPSTREAM_COMMIT_SCRIPT = os.path.join(ROOT, "build-support", "get-upstream-commit.sh")
+
+
+def check_repo_not_dirty():
+  """Check that the git repository isn't dirty."""
+  dirty_repo = subprocess.call("git diff --quiet && git diff --cached --quiet",
+                               shell=True) != 0
+  if not dirty_repo:
+    return
+  print "The repository does not appear to be clean."
+  print Colors.RED + "The source release will not include your local changes." + \
+      Colors.RESET
+  if not confirm_prompt("Continue?"):
+    sys.exit(1)
+
+
+def check_no_local_commits():
+  """
+  Check that there are no local commits which haven't been pushed to the upstream
+  repo via Jenkins.
+  """
+  upstream_commit = check_output(GET_UPSTREAM_COMMIT_SCRIPT).strip()
+  cur_commit = check_output(["git", "rev-parse", "HEAD"]).strip()
+
+  if upstream_commit == cur_commit:
+    return
+  print "The repository appears to have local commits:"
+  subprocess.check_call(["git", "log", "--oneline", "%s..HEAD" % upstream_commit])
+
+  print Colors.RED + "This should not be an official release!" + \
+      Colors.RESET
+  if not confirm_prompt("Continue?"):
+    sys.exit(1)
+
+
+def get_version_number():
+  """ Return the current version number of Kudu. """
+  return file(os.path.join(ROOT, "version.txt")).read().strip()
+
+
+def create_tarball():
+  artifact_name = "apache-kudu-incubating-%s" % get_version_number()
+  build_dir = os.path.join(ROOT, "build")
+  if not os.path.exists(build_dir):
+    os.makedirs(build_dir)
+  tarball_path = os.path.join(build_dir, artifact_name + ".tar.gz")
+  print "Exporting source tarball..."
+ subprocess.check_output(["git", "archive", + "--prefix=%s/" % artifact_name, + "--output=%s" % tarball_path, + "HEAD"]) + print Colors.GREEN + "Generated tarball:\t" + Colors.RESET, tarball_path + return tarball_path + + +def sign_tarball(tarball_path): + """ Prompt the user to GPG-sign the tarball using their Apache GPG key. """ + if not confirm_prompt("Would you like to GPG-sign the tarball now?"): + return + + email = get_my_email() + if not email.endswith("@apache.org"): + print Colors.YELLOW, "Your email address for the repository is not an @apache.org address." + print "Release signatures should typically be signed by committers with @apache.org GPG keys." + print Colors.RESET, + if not confirm_prompt("Continue?"): + return + + try: + subprocess.check_call(["gpg", "--detach-sign", "--armor", "-u", email, tarball_path]) + except subprocess.CalledProcessError: + print Colors.RED + "GPG signing failed. Artifact will not be signed." + Colors.RESET + return + print Colors.GREEN + "Generated signature:\t" + Colors.RESET, tarball_path + ".asc" + + +def checksum_file(summer, path): + """ + Calculates the checksum of the file 'path' using the provided hashlib + digest implementation. Returns the hex form of the digest. + """ + with file(path, "rb") as f: + # Read the file in 4KB chunks until EOF. + for chunk in iter(lambda: f.read(4096), ""): + summer.update(chunk) + return summer.hexdigest() + + +def gen_checksum_files(tarball_path): + """ + Create md5 and sha files of the tarball. + + The output format is compatible with command line tools like 'sha1sum' + and 'md5sum' so they may be used to verify the checksums. + """ + hashes = [(hashlib.sha1, "sha"), + (hashlib.md5, "md5")] + for hash_func, extension in hashes: + digest = checksum_file(hash_func(), tarball_path) + path = tarball_path + "." 
+ extension + with file(path, "w") as f: + print >>f, "%s\t%s" % (digest, os.path.basename(tarball_path)) + print Colors.GREEN + ("Generated %s:\t" % extension) + Colors.RESET, path + + +def main(): + # Change into the source repo so that we can run git commands without having to + # specify cwd=BUILD_SUPPORT every time. + os.chdir(ROOT) + check_repo_not_dirty() + check_no_local_commits() + tarball_path = create_tarball() + gen_checksum_files(tarball_path) + sign_tarball(tarball_path) + + print Colors.GREEN + "Release successfully generated!" + Colors.RESET + print + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + main() diff --git a/build-support/ccache-clang/clang b/build-support/ccache-clang/clang new file mode 100755 index 000000000000..a31b1ea0774a --- /dev/null +++ b/build-support/ccache-clang/clang @@ -0,0 +1,21 @@ +#!/bin/bash -e +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ROOT=$(dirname $BASH_SOURCE)/../.. 
+CLANG=$ROOT/thirdparty/clang-toolchain/bin/clang +CCACHE_CPP2=yes exec ccache $CLANG -Qunused-arguments `test -t 2 && echo -fcolor-diagnostics` "$@" diff --git a/build-support/ccache-clang/clang++ b/build-support/ccache-clang/clang++ new file mode 100755 index 000000000000..9cf7f908cf25 --- /dev/null +++ b/build-support/ccache-clang/clang++ @@ -0,0 +1,21 @@ +#!/bin/bash -e +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ROOT=$(dirname $BASH_SOURCE)/../.. +CLANG=$ROOT/thirdparty/clang-toolchain/bin/clang++ +CCACHE_CPP2=yes exec ccache $CLANG -Qunused-arguments `test -t 2 && echo -fcolor-diagnostics` "$@" diff --git a/build-support/check_compatibility.py b/build-support/check_compatibility.py new file mode 100755 index 000000000000..70c399f231d7 --- /dev/null +++ b/build-support/check_compatibility.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Script which checks Java API compatibility between two revisions of the +# Java client. +# +# Based on the compatibility checker from the HBase project, but ported to +# Python for better readability. + +import logging +import optparse +import os +import re +import shutil +import subprocess +import sys + +from kudu_util import check_output + +JAVA_ACC_GIT_URL = "https://github.com/lvc/japi-compliance-checker.git" + +# The annotations for what we consider our public API. +PUBLIC_ANNOTATIONS = ["InterfaceAudience.LimitedPrivate", + "InterfaceAudience.Public"] + +# Various relative paths +PATH_TO_REPO_DIR = "../" +PATH_TO_BUILD_DIR = "../build/compat-check" + + +def get_repo_dir(): + """ Return the path to the top of the repo. """ + dirname, _ = os.path.split(os.path.abspath(__file__)) + return os.path.abspath(os.path.join(dirname, PATH_TO_REPO_DIR)) + + +def get_scratch_dir(): + """ Return the path to the scratch dir that we build within. """ + dirname, _ = os.path.split(os.path.abspath(__file__)) + return os.path.abspath(os.path.join(dirname, PATH_TO_BUILD_DIR)) + + +def get_java_acc_dir(): + """ Return the path where we check out the Java API Compliance Checker. """ + return os.path.join(get_repo_dir(), "thirdparty/java-acc") + + +def clean_scratch_dir(scratch_dir): + """ Clean up and re-create the scratch directory. 
""" + if os.path.exists(scratch_dir): + logging.info("Removing scratch dir %s...", scratch_dir) + shutil.rmtree(scratch_dir) + logging.info("Creating empty scratch dir %s...", scratch_dir) + os.makedirs(scratch_dir) + + +def checkout_java_tree(rev, path): + """ Check out the Java source tree for the given revision into the given path. """ + logging.info("Checking out %s in %s", rev, path) + os.makedirs(path) + # Extract java source + subprocess.check_call(["bash", '-o', 'pipefail', "-c", + ("git archive --format=tar %s java/ | " + + "tar -C \"%s\" -xf -") % (rev, path)], + cwd=get_repo_dir()) + # Extract proto files which the Java build also relies on. + subprocess.check_call(["bash", '-o', 'pipefail', "-c", + ("git archive --format=tar %s src/ | " + + "tar -C \"%s\" --wildcards -xf - '*.proto'") % (rev, path)], + cwd=get_repo_dir()) + # Symlink thirdparty from the outer build so that protoc is available. + # This may break at some point in the future if we switch protobuf versions, + # but for now it's faster than rebuilding protobuf in both trees. + os.symlink(os.path.join(get_repo_dir(), "thirdparty"), + os.path.join(path, "thirdparty")) + + +def get_git_hash(revname): + """ Convert 'revname' to its SHA-1 hash. """ + return check_output(["git", "rev-parse", revname], + cwd=get_repo_dir()).strip() + + +def build_tree(path): + """ Run the Java build within 'path'. """ + java_path = os.path.join(path, "java") + logging.info("Building in %s...", java_path) + subprocess.check_call(["mvn", "-DskipTests", "-Dmaven.javadoc.skip=true", + "package"], + cwd=java_path) + + +def checkout_java_acc(force): + """ + Check out the Java API Compliance Checker. If 'force' is true, will re-download even if the + directory exists. 
+ """ + acc_dir = get_java_acc_dir() + if os.path.exists(acc_dir): + logging.info("Java JAVA_ACC is already downloaded.") + if not force: + return + logging.info("Forcing re-download.") + shutil.rmtree(acc_dir) + logging.info("Checking out Java JAVA_ACC...") + subprocess.check_call(["git", "clone", "--depth=1", JAVA_ACC_GIT_URL, acc_dir]) + + +def find_client_jars(path): + """ Return a list of jars within 'path' to be checked for compatibility. """ + all_jars = set(check_output(["find", path, "-name", "*.jar"]).splitlines()) + + # If we see "original-foo.jar", then remove "foo.jar" since that's a post-shading + # duplicate. + dups = [] + for j in all_jars: + dirname, name = os.path.split(j) + m = re.match("original-(.+)", name) + if m: + dups.append(os.path.join(dirname, m.group(1))) + for d in dups: + all_jars.remove(d) + + return [j for j in all_jars if ( + "-tests" not in j and + "-sources" not in j and + "-with-dependencies" not in j)] + + +def run_java_acc(src_name, src, dst_name, dst): + """ Run the compliance checker to compare 'src' and 'dst'. 
""" + src_jars = find_client_jars(src) + dst_jars = find_client_jars(dst) + logging.info("Will check compatibility between original jars:\n%s\n" + + "and new jars:\n%s", + "\n".join(src_jars), + "\n".join(dst_jars)) + + annotations_path = os.path.join(get_scratch_dir(), "annotations.txt") + with file(annotations_path, "w") as f: + for ann in PUBLIC_ANNOTATIONS: + print >>f, ann + + java_acc_path = os.path.join(get_java_acc_dir(), "japi-compliance-checker.pl") + + out_path = os.path.join(get_scratch_dir(), "report.html") + subprocess.check_call(["perl", java_acc_path, + "-l", "Kudu", + "-v1", src_name, + "-v2", dst_name, + "-d1", ",".join(src_jars), + "-d2", ",".join(dst_jars), + "-report-path", out_path, + "-annotations-list", annotations_path]) + + +def main(argv): + logging.basicConfig(level=logging.INFO) + parser = optparse.OptionParser( + usage="usage: %prog SRC..[DST]") + parser.add_option("-f", "--force-download", dest="force_download_deps", + help=("Download dependencies (i.e. Java JAVA_ACC) even if they are " + + "already present")) + opts, args = parser.parse_args() + + if len(args) != 1: + parser.error("no src/dst revision specified") + sys.exit(1) + + src_rev, dst_rev = args[0].split("..", 1) + if dst_rev == "": + dst_rev = "HEAD" + src_rev = get_git_hash(src_rev) + dst_rev = get_git_hash(dst_rev) + + logging.info("Source revision: %s", src_rev) + logging.info("Destination revision: %s", dst_rev) + + # Download deps. + checkout_java_acc(opts.force_download_deps) + + # Set up the build. + scratch_dir = get_scratch_dir() + clean_scratch_dir(scratch_dir) + + # Check out the src and dst source trees. + src_dir = os.path.join(scratch_dir, "src") + dst_dir = os.path.join(scratch_dir, "dst") + checkout_java_tree(src_rev, src_dir) + checkout_java_tree(dst_rev, dst_dir) + + # Run the build in each. 
+ build_tree(src_dir) + build_tree(dst_dir) + + run_java_acc(src_rev, src_dir, + dst_rev, dst_dir) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/build-support/dist_test.py b/build-support/dist_test.py new file mode 100755 index 000000000000..2f53293da927 --- /dev/null +++ b/build-support/dist_test.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python2 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This tool allows tests to be submitted to a distributed testing +# service running on shared infrastructure. +# +# See dist_test.py --help for usage information. + +import argparse +import glob +try: + import simplejson as json +except: + import json +import logging +import os +import pprint +import re +import sys +import shlex +import shutil +import subprocess +import time + +TEST_TIMEOUT_SECS = int(os.environ.get('TEST_TIMEOUT_SECS', '900')) +ARTIFACT_ARCHIVE_GLOBS = ["build/*/test-logs/*"] +ISOLATE_SERVER = os.environ.get('ISOLATE_SERVER', + "http://isolate.cloudera.org:4242/") +DIST_TEST_HOME = os.environ.get('DIST_TEST_HOME', + os.path.expanduser("~/dist_test")) + +# The number of times that flaky tests will be retried. 
+# Our non-distributed implementation sets a number of _attempts_, not a number +# of retries, so we have to subtract 1. +FLAKY_TEST_RETRIES = int(os.environ.get('KUDU_FLAKY_TEST_ATTEMPTS', 1)) - 1 + +PATH_TO_REPO = "../" + +TEST_COMMAND_RE = re.compile('Test command: (.+)$') +LDD_RE = re.compile(r'^\s+.+? => (\S+) \(0x.+\)') + +DEPS_FOR_ALL = \ + ["build-support/stacktrace_addr2line.pl", + "build-support/run-test.sh", + "build-support/run_dist_test.py", + "build-support/tsan-suppressions.txt", + "build-support/lsan-suppressions.txt", + + # The LLVM symbolizer is necessary for suppressions to work + "thirdparty/installed/bin/llvm-symbolizer", + + # Tests that use the external minicluster require these. + # TODO: declare these dependencies per-test. + "build/latest/bin/kudu-tserver", + "build/latest/bin/kudu-master", + "build/latest/bin/kudu-ts-cli", + + # parser-test requires these data files. + # TODO: again, we should do this with some per-test metadata file. + # TODO: these are broken now that we separate source and build trees. + #".../example-deletes.txt", + #".../example-tweets.txt", + + # Tests that require tooling require these. 
+ "build/latest/bin/kudu-admin", + ] + + +class StagingDir(object): + @staticmethod + def new(): + dir = rel_to_abs("build/isolate") + if os.path.isdir(dir): + shutil.rmtree(dir) + os.makedirs(dir) + return StagingDir(dir) + + def __init__(self, dir): + self.dir = dir + + def archive_dump_path(self): + return os.path.join(self.dir, "dump.json") + + def gen_json_paths(self): + return glob.glob(os.path.join(self.dir, "*.gen.json")) + + def tasks_json_path(self): + return os.path.join(self.dir, "tasks.json") + + +def rel_to_abs(rel_path): + dirname, _ = os.path.split(os.path.abspath(__file__)) + abs = os.path.abspath(os.path.join(dirname, PATH_TO_REPO, rel_path)) + if rel_path.endswith('/') and not abs.endswith('/'): + abs += '/' + return abs + + +def abs_to_rel(abs_path, staging): + rel = os.path.relpath(abs_path, staging.dir) + if abs_path.endswith('/') and not rel.endswith('/'): + rel += '/' + return rel + + +def get_test_commandlines(): + ctest_bin = os.path.join(rel_to_abs("thirdparty/installed/bin/ctest")) + p = subprocess.Popen([ctest_bin, "-V", "-N", "-LE", "no_dist_test"], stdout=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + print >>sys.stderr, "Unable to list tests with ctest" + sys.exit(1) + lines = out.splitlines() + commands = [] + for l in lines: + m = TEST_COMMAND_RE.search(l) + if not m: + continue + commands.append(shlex.split(m.group(1))) + return commands + + +def is_lib_blacklisted(lib): + # These particular system libraries, we should ship to the remote nodes. + # No need to ship things like libc, libstdcxx, etc. + if "boost" in lib or "oauth" in lib: + return False + if lib.startswith("/lib") or lib.startswith("/usr"): + return True + return False + + +def is_outside_of_tree(path): + repo_dir = rel_to_abs("./") + rel = os.path.relpath(path, repo_dir) + return rel.startswith("../") + +def copy_system_library(lib): + """ + For most system libraries, we expect them to be installed on the test + machines. 
However, a couple are shipped from the submitter machine + to the cluster by putting them in a special directory inside the + isolated build tree. + + This function copies such libraries into that directory. + """ + sys_lib_dir = rel_to_abs("build/dist-test-system-libs") + if not os.path.exists(sys_lib_dir): + os.makedirs(sys_lib_dir) + dst = os.path.join(sys_lib_dir, os.path.basename(lib)) + if not os.path.exists(dst): + logging.info("Copying system library %s to %s...", lib, dst) + shutil.copy2(rel_to_abs(lib), dst) + return dst + + +def ldd_deps(exe): + """ + Runs 'ldd' on the provided 'exe' path, returning a list of + any libraries it depends on. Blacklisted libraries are + removed from this list. + + If the provided 'exe' is not a binary executable, returns + an empty list. + """ + if exe.endswith(".sh"): + return [] + p = subprocess.Popen(["ldd", exe], stdout=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + print >>sys.stderr, "failed to run ldd on ", exe + return [] + ret = [] + for l in out.splitlines(): + m = LDD_RE.match(l) + if not m: + continue + lib = m.group(1) + if is_lib_blacklisted(lib): + continue + path = m.group(1) + ret.append(m.group(1)) + + # ldd will often point to symlinks. We need to upload the symlink + # as well as whatever it's pointing to, recursively. + while os.path.islink(path): + path = os.path.join(os.path.dirname(path), os.readlink(path)) + ret.append(path) + return ret + + +def num_shards_for_test(test_name): + if 'raft_consensus-itest' in test_name: + return 8 + if 'cfile-test' in test_name: + return 4 + if 'mt-tablet-test' in test_name: + return 4 + return 1 + + +def create_archive_input(staging, argv, + disable_sharding=False): + """ + Generates .gen.json and .isolate files corresponding to the + test command 'argv'. The outputs are placed in the specified + staging directory. + + Some larger tests are automatically sharded into several tasks. 
+ If 'disable_sharding' is True, this behavior will be suppressed. + """ + if not argv[0].endswith('run-test.sh') or len(argv) < 2: + print >>sys.stderr, "Unable to handle test: ", argv + return + test_name = os.path.basename(argv[1]) + abs_test_exe = os.path.realpath(argv[1]) + rel_test_exe = abs_to_rel(abs_test_exe, staging) + argv[1] = rel_test_exe + files = [] + files.append(rel_test_exe) + deps = ldd_deps(abs_test_exe) + for d in DEPS_FOR_ALL: + d = os.path.realpath(rel_to_abs(d)) + if os.path.isdir(d): + d += "/" + deps.append(d) + for d in deps: + # System libraries will end up being relative paths out + # of the build tree. We need to copy those into the build + # tree somewhere. + if is_outside_of_tree(d): + d = copy_system_library(d) + files.append(abs_to_rel(d, staging)) + + if disable_sharding: + num_shards = 1 + else: + num_shards = num_shards_for_test(test_name) + for shard in xrange(0, num_shards): + out_archive = os.path.join(staging.dir, '%s.%d.gen.json' % (test_name, shard)) + out_isolate = os.path.join(staging.dir, '%s.%d.isolate' % (test_name, shard)) + + command = ['../../build-support/run_dist_test.py', + '-e', 'GTEST_SHARD_INDEX=%d' % shard, + '-e', 'GTEST_TOTAL_SHARDS=%d' % num_shards, + '-e', 'KUDU_TEST_TIMEOUT=%d' % (TEST_TIMEOUT_SECS - 30), + '-e', 'KUDU_ALLOW_SLOW_TESTS=%s' % os.environ.get('KUDU_ALLOW_SLOW_TESTS', 1), + '-e', 'KUDU_COMPRESS_TEST_OUTPUT=%s' % \ + os.environ.get('KUDU_COMPRESS_TEST_OUTPUT', 0)] + command.append('--') + command += argv[1:] + + archive_json = dict(args=["-i", out_isolate, + "-s", out_isolate + "d"], + dir=rel_to_abs("."), + name='%s.%d/%d' % (test_name, shard + 1, num_shards), + version=1) + isolate_dict = dict(variables=dict(command=command, + files=files)) + with open(out_archive, "w") as f: + json.dump(archive_json, f) + with open(out_isolate, "w") as f: + pprint.pprint(isolate_dict, f) + + +def create_task_json(staging, + replicate_tasks=1, + flaky_test_set=set()): + """ + Create a task JSON file 
suitable for submitting to the distributed + test execution service. + + If 'replicate_tasks' is higher than one, each .isolate file will be + submitted multiple times. This can be useful for looping tests. + """ + tasks = [] + with file(staging.archive_dump_path(), "r") as isolate_dump: + inmap = json.load(isolate_dump) + + # Some versions of 'isolate batcharchive' directly list the items in + # the dumped JSON. Others list it in an 'items' dictionary. + items = inmap.get('items', inmap) + for k, v in items.iteritems(): + # The key is 'foo-test.'. So, chop off the last component + # to get the test name + test_name = ".".join(k.split(".")[:-1]) + max_retries = 0 + if test_name in flaky_test_set: + max_retries = FLAKY_TEST_RETRIES + + tasks += [{"isolate_hash": str(v), + "description": str(k), + "artifact_archive_globs": ARTIFACT_ARCHIVE_GLOBS, + "timeout": TEST_TIMEOUT_SECS + 30, + "max_retries": max_retries + }] * replicate_tasks + + outmap = {"tasks": tasks} + + with file(staging.tasks_json_path(), "wt") as f: + json.dump(outmap, f) + + +def run_isolate(staging): + """ + Runs 'isolate batcharchive' to archive all of the .gen.json files in + the provided staging directory. + + Throws an exception if the call fails. + """ + isolate_path = "isolate" + try: + subprocess.check_call([isolate_path, + 'batcharchive', + '-isolate-server=' + ISOLATE_SERVER, + '-dump-json=' + staging.archive_dump_path(), + '--'] + staging.gen_json_paths()) + except: + print >>sys.stderr, "Failed to run", isolate_path + raise + +def submit_tasks(staging, options): + """ + Runs the distributed testing tool to submit the tasks in the + provided staging directory. + + This requires that the tasks JSON file has already been generated + by 'create_task_json()'. + """ + if not os.path.exists(DIST_TEST_HOME): + print >>sys.stderr, "Cannot find dist_test tools at path %s " \ + "Set the DIST_TEST_HOME environment variable to the path to the dist_test directory. 
" \ + % DIST_TEST_HOME, + raise OSError("Cannot find path to dist_test tools") + client_py_path = os.path.join(DIST_TEST_HOME, "client.py") + try: + cmd = [client_py_path, "submit"] + if options.no_wait: + cmd.append('--no-wait') + cmd.append(staging.tasks_json_path()) + subprocess.check_call(cmd) + except: + print >>sys.stderr, "Failed to run", client_py_path + raise + +def get_flakies(): + path = os.getenv('KUDU_FLAKY_TEST_LIST') + if not path: + return set() + return set(l.strip() for l in file(path)) + +def run_all_tests(parser, options): + """ + Gets all of the test command lines from 'ctest', isolates them, + creates a task list, and submits the tasks to the testing service. + """ + commands = get_test_commandlines() + staging = StagingDir.new() + for command in commands: + create_archive_input(staging, command, + disable_sharding=options.disable_sharding) + + run_isolate(staging) + create_task_json(staging, flaky_test_set=get_flakies()) + submit_tasks(staging, options) + +def add_run_all_subparser(subparsers): + p = subparsers.add_parser('run-all', help='Run all of the dist-test-enabled tests') + p.set_defaults(func=run_all_tests) + +def loop_test(parser, options): + """ + Runs many instances of a user-provided test case on the testing service. + """ + if options.num_instances < 1: + parser.error("--num-instances must be >= 1") + command = ["run-test.sh", options.cmd] + options.args + staging = StagingDir.new() + create_archive_input(staging, command, + disable_sharding=options.disable_sharding) + run_isolate(staging) + create_task_json(staging, options.num_instances) + submit_tasks(staging, options) + +def add_loop_test_subparser(subparsers): + p = subparsers.add_parser('loop', help='Run many instances of the same test', + epilog="if passing arguments to the test, you may want to use a '--' " + + "argument before . 
e.g: loop -- foo-test --gtest_opt=123") + p.add_argument("--num-instances", "-n", dest="num_instances", type=int, + help="number of test instances to start", metavar="NUM", + default=100) + p.add_argument("cmd", help="test binary") + p.add_argument("args", nargs=argparse.REMAINDER, help="test arguments") + p.set_defaults(func=loop_test) + + +def main(argv): + logging.basicConfig(level=logging.INFO) + p = argparse.ArgumentParser() + p.add_argument("--disable-sharding", dest="disable_sharding", action="store_true", + help="Disable automatic sharding of tests", default=False) + p.add_argument("--no-wait", dest="no_wait", action="store_true", + help="Return without waiting for the job to complete", default=False) + sp = p.add_subparsers() + add_loop_test_subparser(sp) + add_run_all_subparser(sp) + args = p.parse_args(argv) + args.func(p, args) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/build-support/enable_devtoolset.sh b/build-support/enable_devtoolset.sh new file mode 100755 index 000000000000..f6001888c652 --- /dev/null +++ b/build-support/enable_devtoolset.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+set -e
+
+# Enables the Red Hat devtoolset on RHEL 6 based systems and executes the
+# arguments. On non-RHEL 6 systems, the arguments are executed without changes
+# to the environment.
+# USAGE: ./enable_devtoolset.sh <command> <args>...
+
+if [[ "$OSTYPE" =~ ^linux ]] && \
+   [[ "$(lsb_release -irs)" =~ (CentOS|RedHatEnterpriseServer)[[:space:]]+6\.[[:digit:]]+ ]]; then
+  scl enable devtoolset-3 "$*"
+else
+  "$@"
+fi
diff --git a/build-support/gen_version_info.py b/build-support/gen_version_info.py
new file mode 100755
index 000000000000..3688e085ba52
--- /dev/null
+++ b/build-support/gen_version_info.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# This script generates a header file which contains definitions
+# for the current Kudu build (eg timestamp, git hash, etc)
+
+import logging
+import optparse
+import os
+import re
+import sha
+import subprocess
+import sys
+import time
+from time import strftime, localtime
+
+from kudu_util import check_output
+
+def output_up_to_date(path, id_hash):
+  """
+  Return True if the old file seems to be up to date, based on the
+  identification hash 'id_hash'.
+ """ + if not os.path.exists(path): + return False + f = file(path).read() + m = re.search("id_hash=(\w+)", f) + if not m: + return False + return m.group(1) == id_hash + +def main(): + logging.basicConfig(level=logging.INFO) + parser = optparse.OptionParser( + usage="usage: %prog --version= ") + parser.add_option("-v", "--version", help="Set version number", type="string", + dest="version", metavar="VERSION") + parser.add_option("-b", "--build-type", help="Set build type", type="string", + dest="build_type", metavar="BUILD_TYPE") + parser.add_option("-g", "--git-hash", help="Set git hash", type="string", + dest="git_hash", metavar="GIT_HASH") + opts, args = parser.parse_args() + + if not opts.version: + parser.error("no version number specified") + sys.exit(1) + + if len(args) != 1: + parser.error("no output path specified") + sys.exit(1) + + output_path = args[0] + + hostname = check_output(["hostname", "-f"]).strip() + build_time = "%s %s" % (strftime("%d %b %Y %H:%M:%S", localtime()), time.tzname[0]) + username = os.getenv("USER") + + if opts.git_hash: + # Git hash provided on the command line. + git_hash = opts.git_hash + clean_repo = "true" + else: + try: + # No command line git hash, find it in the local git repository. + git_hash = check_output(["git", "rev-parse", "HEAD"]).strip() + clean_repo = subprocess.call("git diff --quiet && git diff --cached --quiet", shell=True) == 0 + clean_repo = str(clean_repo).lower() + except Exception, e: + # If the git commands failed, we're probably building outside of a git + # repository. + logging.info("Build appears to be outside of a git repository... " + + "continuing without repository information.") + git_hash = "non-git-build" + clean_repo = "true" + + version_string = opts.version + build_type = opts.build_type + + # Add the Jenkins build ID + build_id = os.getenv("BUILD_ID", "") + + # Calculate an identifying hash based on all of the variables except for the + # timestamp. 
We put this hash in a comment, and use it to check whether to + # re-generate the file. If it hasn't changed since a previous run, we don't + # re-write the file. This avoids having to rebuild all binaries on every build. + identifying_hash = sha.sha(repr((git_hash, hostname, username, + clean_repo, build_id))).hexdigest() + + if output_up_to_date(output_path, identifying_hash): + return 0 + d = os.path.dirname(output_path) + if not os.path.exists(d): + os.makedirs(d) + with file(output_path, "w") as f: + print >>f, """ +// THIS FILE IS AUTO-GENERATED! DO NOT EDIT! +// +// id_hash=%(identifying_hash)s +#ifndef VERSION_INFO_H_ +#define VERSION_INFO_H_ + +#define KUDU_GIT_HASH "%(git_hash)s" +#define KUDU_BUILD_HOSTNAME "%(hostname)s" +#define KUDU_BUILD_TIMESTAMP "%(build_time)s" +#define KUDU_BUILD_USERNAME "%(username)s" +#define KUDU_BUILD_CLEAN_REPO %(clean_repo)s +#define KUDU_BUILD_ID "%(build_id)s" +#define KUDU_BUILD_TYPE "%(build_type)s" +#define KUDU_VERSION_STRING "%(version_string)s" +#endif +""" % locals() + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/build-support/generate_precompiled_xxd.sh b/build-support/generate_precompiled_xxd.sh new file mode 100755 index 000000000000..b2c42953019c --- /dev/null +++ b/build-support/generate_precompiled_xxd.sh @@ -0,0 +1,40 @@ +#!/bin/bash -e +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script which embeds a piece of raw data into a C++ source file. Functionally +# the same as xxd -i, but inserts custom namespace and variable names. + +IN_FILE=$1 +OUT_FILE=$2 + +echo "// header generated by build-support/generate_precompiled_xxd.sh" > $OUT_FILE +echo "namespace kudu {" >> $OUT_FILE +echo "namespace codegen {" >> $OUT_FILE + +echo "extern const char precompiled_ll_data[] = {" >> $OUT_FILE +xxd -i - < $IN_FILE >> $OUT_FILE +# LLVM requires the binary to be null terminated. +echo ", 0x00" >> $OUT_FILE +echo "};" >> $OUT_FILE + +LEN=$(wc -c $IN_FILE | awk '{print $1}') +echo "extern const unsigned int precompiled_ll_len = ${LEN};" >> $OUT_FILE + +echo "} // namespace codegen" >> $OUT_FILE +echo "} // namespace kudu" >> $OUT_FILE diff --git a/build-support/get-upstream-commit.sh b/build-support/get-upstream-commit.sh new file mode 100755 index 000000000000..3918c8481044 --- /dev/null +++ b/build-support/get-upstream-commit.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script which tries to determine the most recent git hash in the current +# branch which was checked in by gerrit. This commit hash is printed to +# stdout. +# +# It does so by looking for the 'Reviewed-on' tag added by gerrit. This is +# more foolproof than trying to guess at the "origin/" branch name, since the +# developer might be working on some local topic branch. +set -e + +git log --grep='Reviewed-on: ' -n1 --pretty=format:%H diff --git a/build-support/jenkins/build-and-test.sh b/build-support/jenkins/build-and-test.sh new file mode 100755 index 000000000000..837a91bd2df5 --- /dev/null +++ b/build-support/jenkins/build-and-test.sh @@ -0,0 +1,435 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This script is invoked from the Jenkins builds to build Kudu +# and run all the unit tests. 
#
# Environment variables may be used to customize operation:
#   BUILD_TYPE: Default: DEBUG
#     May be one of ASAN|TSAN|DEBUG|RELEASE|COVERAGE|LINT
#
#   KUDU_ALLOW_SLOW_TESTS   Default: 1
#     Runs the "slow" version of the unit tests. Set to 0 to
#     run the tests more quickly.
#
#   TEST_TMPDIR   Default: /tmp/kudutest-$UID
#     Specifies the temporary directory where tests should write their
#     data. It is expected that following the completion of all tests, this
#     directory is empty (i.e. every test cleaned up after itself).
#
#   RUN_FLAKY_ONLY    Default: 0
#     Only runs tests which have failed recently, if this is 1.
#     Used by the kudu-flaky-tests jenkins build.
#
#   KUDU_FLAKY_TEST_ATTEMPTS  Default: 1
#     If more than 1, will fetch the list of known flaky tests
#     from the kudu-test jenkins job, and allow those tests to
#     be flaky in this build.
#
#   TEST_RESULT_SERVER  Default: none
#     The host:port pair of a server running test_result_server.py.
#     This must be configured for flaky test resistance or test result
#     reporting to work.
#
#   ENABLE_DIST_TEST  Default: 0
#     If set to 1, will submit C++ tests to be run by the distributed
#     test runner instead of running them locally. This requires that
#     $DIST_TEST_HOME be set to a working dist_test checkout (and that
#     dist_test itself be appropriately configured to point to a cluster)
#
#   BUILD_JAVA        Default: 1
#     Build and test java code if this is set to 1.
#
#   VALIDATE_CSD      Default: 0
#     If 1, runs the CM CSD validator against the Kudu CSD.
#     This requires access to an internal Cloudera maven repository.
#
#   BUILD_PYTHON       Default: 1
#     Build and test the Python wrapper of the client API.
#
#   MVN_FLAGS          Default: ""
#     Extra flags which are passed to 'mvn' when building and running Java
#     tests. This can be useful, for example, to choose a different maven
#     repository location.

# If a commit message contains a line that says 'DONT_BUILD', exit
# immediately.
DONT_BUILD=$(git show|egrep '^\s{4}DONT_BUILD$')
if [ "x$DONT_BUILD" != "x" ]; then
  echo "*** Build not requested. Exiting."
  exit 1
fi

set -e
# We pipe our build output to a log file with tee.
# This bash setting ensures that the script exits if the build fails.
set -o pipefail
# gather core dumps
ulimit -c unlimited

BUILD_TYPE=${BUILD_TYPE:-DEBUG}
BUILD_TYPE=$(echo "$BUILD_TYPE" | tr a-z A-Z) # capitalize
BUILD_TYPE_LOWER=$(echo "$BUILD_TYPE" | tr A-Z a-z)

# Set up defaults for environment variables.
DEFAULT_ALLOW_SLOW_TESTS=1

# TSAN builds are pretty slow, so don't do SLOW tests unless explicitly
# requested. Setting KUDU_USE_TSAN influences the thirdparty build.
if [ "$BUILD_TYPE" = "TSAN" ]; then
  DEFAULT_ALLOW_SLOW_TESTS=0
  export KUDU_USE_TSAN=1
fi

export KUDU_FLAKY_TEST_ATTEMPTS=${KUDU_FLAKY_TEST_ATTEMPTS:-1}
export KUDU_ALLOW_SLOW_TESTS=${KUDU_ALLOW_SLOW_TESTS:-$DEFAULT_ALLOW_SLOW_TESTS}
export KUDU_COMPRESS_TEST_OUTPUT=${KUDU_COMPRESS_TEST_OUTPUT:-1}
export TEST_TMPDIR=${TEST_TMPDIR:-/tmp/kudutest-$UID}
BUILD_JAVA=${BUILD_JAVA:-1}
VALIDATE_CSD=${VALIDATE_CSD:-0}
BUILD_PYTHON=${BUILD_PYTHON:-1}

# Ensure that the test data directory is usable.
mkdir -p "$TEST_TMPDIR"
if [ ! -w "$TEST_TMPDIR" ]; then
  echo "Error: Test output directory ($TEST_TMPDIR) is not writable on $(hostname) by user $(whoami)"
  exit 1
fi

SOURCE_ROOT=$(cd $(dirname "$BASH_SOURCE")/../..; pwd)
BUILD_ROOT=$SOURCE_ROOT/build/$BUILD_TYPE_LOWER

# Remove testing artifacts from the previous run before we do anything
# else. Otherwise, if we fail during the "build" step, Jenkins will
# archive the test logs from the previous run, thinking they came from
# this run, and confuse us when we look at the failed build.
rm -rf $BUILD_ROOT
mkdir -p $BUILD_ROOT

# Query the test result server for tests which have failed recently.
# Relies on $TEST_RESULT_SERVER being set (checked by callers).
list_flaky_tests() {
  curl -s "http://$TEST_RESULT_SERVER/list_failed_tests?num_days=3&build_pattern=%25kudu-test%25"
  return $?
}

TEST_LOGDIR="$BUILD_ROOT/test-logs"
TEST_DEBUGDIR="$BUILD_ROOT/test-debug"

# Exit handler: wipe build artifacts so Jenkins workspaces don't fill up.
cleanup() {
  echo Cleaning up all build artifacts...
  $SOURCE_ROOT/build-support/jenkins/post-build-clean.sh
}
# If we're running inside Jenkins (the BUILD_ID is set), then install
# an exit handler which will clean up all of our build results.
if [ -n "$BUILD_ID" ]; then
  trap cleanup EXIT
fi

export TOOLCHAIN_DIR=/opt/toolchain
if [ -d "$TOOLCHAIN_DIR" ]; then
  PATH=$TOOLCHAIN_DIR/apache-maven-3.0/bin:$PATH
fi

$SOURCE_ROOT/build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh

THIRDPARTY_BIN=$(pwd)/thirdparty/installed/bin
export PPROF_PATH=$THIRDPARTY_BIN/pprof

# Prefer the ccache-wrapped clang when ccache is available.
if which ccache >/dev/null ; then
  CLANG=$(pwd)/build-support/ccache-clang/clang
else
  CLANG=$(pwd)/thirdparty/clang-toolchain/bin/clang
fi

# Before running cmake below, clean out any errant cmake state from the source
# tree. We need this to help transition into a world where out-of-tree builds
# are required. Once that's done, the cleanup can be removed.
rm -rf $SOURCE_ROOT/CMakeCache.txt $SOURCE_ROOT/CMakeFiles

# Configure the build
#
# ASAN/TSAN can't build the Python bindings because the exported Kudu client
# library (which the bindings depend on) is missing ASAN/TSAN symbols.
cd $BUILD_ROOT
if [ "$BUILD_TYPE" = "ASAN" ]; then
  $SOURCE_ROOT/build-support/enable_devtoolset.sh \
    "env CC=$CLANG CXX=$CLANG++ $THIRDPARTY_BIN/cmake -DKUDU_USE_ASAN=1 -DKUDU_USE_UBSAN=1 $SOURCE_ROOT"
  BUILD_TYPE=fastdebug
  BUILD_PYTHON=0
elif [ "$BUILD_TYPE" = "TSAN" ]; then
  $SOURCE_ROOT/build-support/enable_devtoolset.sh \
    "env CC=$CLANG CXX=$CLANG++ $THIRDPARTY_BIN/cmake -DKUDU_USE_TSAN=1 $SOURCE_ROOT"
  BUILD_TYPE=fastdebug
  EXTRA_TEST_FLAGS="$EXTRA_TEST_FLAGS -LE no_tsan"
  BUILD_PYTHON=0
elif [ "$BUILD_TYPE" = "COVERAGE" ]; then
  DO_COVERAGE=1
  BUILD_TYPE=debug
  $SOURCE_ROOT/build-support/enable_devtoolset.sh "$THIRDPARTY_BIN/cmake -DKUDU_GENERATE_COVERAGE=1 $SOURCE_ROOT"
elif [ "$BUILD_TYPE" = "LINT" ]; then
  # Create empty test logs or else Jenkins fails to archive artifacts, which
  # results in the build failing.
  mkdir -p Testing/Temporary
  mkdir -p $TEST_LOGDIR

  $SOURCE_ROOT/build-support/enable_devtoolset.sh "$THIRDPARTY_BIN/cmake $SOURCE_ROOT"
  make lint | tee $TEST_LOGDIR/lint.log
  exit $?
fi

# Only enable test core dumps for certain build types.
if [ "$BUILD_TYPE" != "ASAN" ]; then
  export KUDU_TEST_ULIMIT_CORE=unlimited
fi

# If we are supposed to be resistant to flaky tests, we need to fetch the
# list of tests to ignore
if [ "$KUDU_FLAKY_TEST_ATTEMPTS" -gt 1 ]; then
  echo Fetching flaky test list...
  export KUDU_FLAKY_TEST_LIST=$BUILD_ROOT/flaky-tests.txt
  mkdir -p $(dirname $KUDU_FLAKY_TEST_LIST)
  echo -n > $KUDU_FLAKY_TEST_LIST
  if [ -n "$TEST_RESULT_SERVER" ] && \
      list_flaky_tests > $KUDU_FLAKY_TEST_LIST ; then
    echo Will retry flaky tests up to $KUDU_FLAKY_TEST_ATTEMPTS times:
    cat $KUDU_FLAKY_TEST_LIST
    echo ----------
  else
    echo Unable to fetch flaky test list. Disabling flaky test resistance.
    export KUDU_FLAKY_TEST_ATTEMPTS=1
  fi
fi

# On distributed tests, force dynamic linking even for release builds. Otherwise,
# the test binaries are too large and we spend way too much time uploading them
# to the test slaves.
LINK_FLAGS=
if [ "$ENABLE_DIST_TEST" == "1" ]; then
  LINK_FLAGS="-DKUDU_LINK=dynamic"
fi

$SOURCE_ROOT/build-support/enable_devtoolset.sh "$THIRDPARTY_BIN/cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} $LINK_FLAGS $SOURCE_ROOT"

# our tests leave lots of data lying around, clean up before we run
if [ -d "$TEST_TMPDIR" ]; then
  rm -Rf $TEST_TMPDIR/*
fi

# actually do the build
echo
echo Building C++ code.
echo ------------------------------------------------------------
NUM_PROCS=$(getconf _NPROCESSORS_ONLN)
make -j$NUM_PROCS 2>&1 | tee build.log

# If compilation succeeds, try to run all remaining steps despite any failures.
set +e

# Run tests
export GTEST_OUTPUT="xml:$TEST_LOGDIR/" # Enable JUnit-compatible XML output.
if [ "$RUN_FLAKY_ONLY" == "1" ] ; then
  if [ -z "$TEST_RESULT_SERVER" ]; then
    echo Must set TEST_RESULT_SERVER to use RUN_FLAKY_ONLY
    exit 1
  fi
  echo
  echo Running flaky tests only:
  echo ------------------------------------------------------------
  list_flaky_tests | tee build/flaky-tests.txt
  # Build an anchored, escaped alternation so each flaky test name matches
  # exactly when passed to ctest's -R filter.
  test_regex=$(perl -e '
    chomp(my @lines = <>);
    print join("|", map { "^" . quotemeta($_) . "\$" } @lines);
    ' build/flaky-tests.txt)
  EXTRA_TEST_FLAGS="$EXTRA_TEST_FLAGS -R $test_regex"

  # We don't support detecting java flaky tests at the moment.
  echo Disabling Java build since RUN_FLAKY_ONLY=1
  BUILD_JAVA=0
fi

EXIT_STATUS=0
FAILURES=""

# If we're running distributed tests, submit them asynchronously while
# we run the Java and Python tests.
if [ "$ENABLE_DIST_TEST" == "1" ]; then
  echo
  echo Submitting distributed-test job.
  echo ------------------------------------------------------------
  export DIST_TEST_JOB_PATH=$BUILD_ROOT/dist-test-job-id
  rm -f $DIST_TEST_JOB_PATH
  if ! $SOURCE_ROOT/build-support/dist_test.py --no-wait run-all ; then
    EXIT_STATUS=1
    FAILURES="$FAILURES"$'Could not submit distributed test job\n'
  fi
  # Still need to run a few non-dist-test-capable tests locally.
  EXTRA_TEST_FLAGS="$EXTRA_TEST_FLAGS -L no_dist_test"
fi

if ! $THIRDPARTY_BIN/ctest -j$NUM_PROCS $EXTRA_TEST_FLAGS ; then
  EXIT_STATUS=1
  FAILURES="$FAILURES"$'C++ tests failed\n'
fi

if [ "$DO_COVERAGE" == "1" ]; then
  echo
  echo Generating coverage report...
  echo ------------------------------------------------------------
  if ! $SOURCE_ROOT/thirdparty/gcovr-3.0/scripts/gcovr -r $SOURCE_ROOT --xml \
      > $BUILD_ROOT/coverage.xml ; then
    EXIT_STATUS=1
    FAILURES="$FAILURES"$'Coverage report failed\n'
  fi
fi

if [ "$BUILD_JAVA" == "1" ]; then
  echo
  echo Building and testing java...
  echo ------------------------------------------------------------
  # Make sure we use JDK7
  export JAVA_HOME=$JAVA7_HOME
  export PATH=$JAVA_HOME/bin:$PATH
  pushd $SOURCE_ROOT/java
  export TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$SOURCE_ROOT/build-support/tsan-suppressions.txt history_size=7"
  set -x
  VALIDATE_CSD_FLAG=""
  if [ "$VALIDATE_CSD" == "1" ]; then
    VALIDATE_CSD_FLAG="-PvalidateCSD"
  fi
  if ! mvn $MVN_FLAGS -PbuildCSD \
      $VALIDATE_CSD_FLAG \
      -Dsurefire.rerunFailingTestsCount=3 \
      -Dfailsafe.rerunFailingTestsCount=3 \
      clean verify ; then
    EXIT_STATUS=1
    FAILURES="$FAILURES"$'Java build/test failed\n'
  fi
  set +x
  popd
fi


if [ "$BUILD_PYTHON" == "1" ]; then
  echo
  echo Building and testing python.
  echo ------------------------------------------------------------

  # Failing to compile the Python client should result in a build failure
  set -e
  export KUDU_HOME=$SOURCE_ROOT
  export KUDU_BUILD=$BUILD_ROOT
  pushd $SOURCE_ROOT/python

  # Create a sane test environment
  rm -Rf $KUDU_BUILD/py_env
  virtualenv $KUDU_BUILD/py_env
  source $KUDU_BUILD/py_env/bin/activate
  pip install --upgrade pip
  CC=$CLANG CXX=$CLANG++ pip install --disable-pip-version-check -r requirements.txt

  # Delete old Cython extensions to force them to be rebuilt.
  rm -Rf build kudu_python.egg-info kudu/*.so

  # Assuming we run this script from base dir
  CC=$CLANG CXX=$CLANG++ python setup.py build_ext
  set +e
  if ! python setup.py test \
      --addopts="kudu --junit-xml=$KUDU_BUILD/test-logs/python_client.xml" \
      2> $KUDU_BUILD/test-logs/python_client.log ; then
    EXIT_STATUS=1
    FAILURES="$FAILURES"$'Python tests failed\n'
  fi
  popd
fi

# If we submitted the tasks earlier, go fetch the results now
if [ "$ENABLE_DIST_TEST" == "1" ]; then
  echo
  echo Fetching previously submitted dist-test results...
  echo ------------------------------------------------------------
  if ! $DIST_TEST_HOME/client.py watch ; then
    EXIT_STATUS=1
    FAILURES="$FAILURES"$'Distributed tests failed\n'
  fi
  DT_DIR=$TEST_LOGDIR/dist-test-out
  rm -Rf $DT_DIR
  $DIST_TEST_HOME/client.py fetch --artifacts -d $DT_DIR
  # Fetching the artifacts expands each log into its own directory.
  # Move them back into the main log directory
  rm -f $DT_DIR/*zip
  for arch_dir in $DT_DIR/* ; do
    # In the case of sharded tests, we'll have multiple subdirs
    # which contain files of the same name. We need to disambiguate
    # when we move back. We can grab the shard index from the task name
    # which is in the archive directory name.
    shard_idx=$(echo $arch_dir | perl -ne '
      if (/(\d+)$/) {
        print $1;
      } else {
        print "unknown_shard";
      }')
    for log_file in $arch_dir/build/$BUILD_TYPE_LOWER/test-logs/* ; do
      mv $log_file $TEST_LOGDIR/${shard_idx}_$(basename $log_file)
    done
    rm -Rf $arch_dir
  done
fi

if [ $EXIT_STATUS != 0 ]; then
  echo
  echo Tests failed, making sure we have XML files for all tests.
  echo ------------------------------------------------------------

  # Tests that crash do not generate JUnit report XML files.
  # We go through and generate a kind of poor-man's version of them in those cases.
  for GTEST_OUTFILE in $TEST_LOGDIR/*.txt.gz; do
    TEST_EXE=$(basename $GTEST_OUTFILE .txt.gz)
    GTEST_XMLFILE="$TEST_LOGDIR/$TEST_EXE.xml"
    if [ ! -f "$GTEST_XMLFILE" ]; then
      echo "JUnit report missing:" \
          "generating fake JUnit report file from $GTEST_OUTFILE and saving it to $GTEST_XMLFILE"
      zcat $GTEST_OUTFILE | $SOURCE_ROOT/build-support/parse_test_failure.py -x > $GTEST_XMLFILE
    fi
  done
fi

# If all tests passed, ensure that they cleaned up their test output.
#
# TODO: Python is currently leaking a tmp directory sometimes (KUDU-1301).
# Temporarily disabled until that's fixed.
#
# if [ $EXIT_STATUS == 0 ]; then
#   TEST_TMPDIR_CONTENTS=$(ls $TEST_TMPDIR)
#   if [ -n "$TEST_TMPDIR_CONTENTS" ]; then
#     echo "All tests passed, yet some left behind their test output:"
#     for SUBDIR in $TEST_TMPDIR_CONTENTS; do
#       echo $SUBDIR
#     done
#     EXIT_STATUS=1
#   fi
# fi

set -e

if [ -n "$FAILURES" ]; then
  echo Failure summary
  echo ------------------------------------------------------------
  echo $FAILURES
fi

exit $EXIT_STATUS
+# +# Set $DRY_RUN before running this script to just see what would be killed. + +set -e + +JENKINS_USER=${JENKINS_USER:-jenkins} +CURRENT_BUILD=$BUILD_ID + +if [ "$USER" != "$JENKINS_USER" ]; then + echo Not running as user \'$JENKINS_USER\' + exit 1 +fi + +if [ -z "$CURRENT_BUILD" ]; then + echo Not running in the context of a Jenkins build + exit 1 +fi + +JENKINS_PIDS=$(pgrep -u $JENKINS_USER) +for pid in $JENKINS_PIDS; do + cmdline=$(ps h -p $pid -o cmd || echo '[pid exited]') + build_env=$(cat /proc/$pid/environ 2>/dev/null | tr '\0' '\n' | egrep '^BUILD_ID=' || :) + if [ -z "$build_env" ]; then + # Some Jenkins processes, like the slave itself, don't have a BUILD_ID + # set. We shouldn't kill those. + echo "Process $pid ($cmdline) not associated with any build. Skipping..." + continue + fi + build_id=$(echo $build_env | cut -d= -f2) + if [ "$build_id" != "$CURRENT_BUILD" ]; then + echo "Killing zombie process $pid (from build $build_id)" + ps -fww -p $pid || : + if [ -z "$DRY_RUN" ]; then + kill -9 $pid || : + fi + echo ---------- + else + echo "pid $pid ($cmdline) is from the current build. Not killing" + fi +done diff --git a/build-support/jenkins/dummy-junit.xml b/build-support/jenkins/dummy-junit.xml new file mode 100644 index 000000000000..1f026128c5c9 --- /dev/null +++ b/build-support/jenkins/dummy-junit.xml @@ -0,0 +1,12 @@ + + + + + + diff --git a/build-support/jenkins/post-build-clean.sh b/build-support/jenkins/post-build-clean.sh new file mode 100755 index 000000000000..def3e21549c1 --- /dev/null +++ b/build-support/jenkins/post-build-clean.sh @@ -0,0 +1,41 @@ +#!/bin/bash -x +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script which runs on Jenkins slaves after the build/test to clean up +# disk space used by build artifacts. Our build tends to "make clean" +# before running anyway, so doing the post-build cleanup shouldn't +# hurt our build times. It does, however, save a fair amount of disk +# space in the Jenkins workspace disk. This can help prevent our EC2 +# slaves from filling up and causing spurious failures. + +ROOT=$(cd $(dirname "$BASH_SOURCE")/../..; pwd) +cd $ROOT + +# Note that we use simple shell commands instead of "make clean" +# or "mvn clean". This is more foolproof even if something ends +# up partially compiling, etc. + +# Clean up intermediate object files in the src tree +find build/latest/src -name \*.o -exec rm -f {} \; + +# Clean up the actual build artifacts +rm -Rf build/latest/bin build/latest/lib + +# Clean up any java build artifacts +find java -name \*.jar -delete -o -name \*.class -delete diff --git a/build-support/kudu_util.py b/build-support/kudu_util.py new file mode 100644 index 000000000000..7945da5be91e --- /dev/null +++ b/build-support/kudu_util.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Small utility helpers shared by the build-support scripts (terminal
# colors, subprocess output capture, interactive confirmation prompts).

import os
import subprocess
import sys

try:
  # Python 2: raw_input reads a line without eval'ing it.
  _input = raw_input
except NameError:
  # Python 3 renamed raw_input to input.
  _input = input


def _stdout_is_tty():
  """ Return True if stdout is backed by a terminal, False otherwise.
  Also returns False when stdout has been replaced by an object with no
  real file descriptor (e.g. under test runners or some CI wrappers),
  where fileno() raises. """
  try:
    return os.isatty(sys.stdout.fileno())
  except Exception:
    return False


class Colors(object):
  """ ANSI color codes; each is the empty string when stdout is not a tty,
  so redirected logs stay free of escape sequences. """

  def __on_tty(x):
    if not _stdout_is_tty():
      return ""
    return x

  RED = __on_tty("\x1b[31m")
  GREEN = __on_tty("\x1b[32m")
  YELLOW = __on_tty("\x1b[33m")
  RESET = __on_tty("\x1b[m")


def check_output(*popenargs, **kwargs):
  r"""Run command with arguments and return its output as a byte string.
  Backported from Python 2.7 as it's implemented as pure python on stdlib.

  Raises subprocess.CalledProcessError (with the captured output attached
  as error.output) if the command exits with a non-zero status.
  """
  process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
  output, unused_err = process.communicate()
  retcode = process.poll()
  if retcode:
    cmd = kwargs.get("args")
    if cmd is None:
      cmd = popenargs[0]
    error = subprocess.CalledProcessError(retcode, cmd)
    error.output = output
    raise error
  return output


def confirm_prompt(prompt):
  """
  Issue the given prompt, and ask the user to confirm yes/no. Returns True
  if the user confirms (y/yes or just hitting enter), False if they decline
  or if the script is not running interactively.
  """
  while True:
    sys.stdout.write("%s [Y/n]: " % prompt)
    sys.stdout.flush()

    if not _stdout_is_tty():
      print("Not running interactively. Assuming 'N'.")
      return False

    r = _input().strip().lower()
    if r in ['y', 'yes', '']:
      return True
    elif r in ['n', 'no']:
      return False
    # Anything else: loop and re-prompt.


def get_my_email():
  """ Return the email address in the user's git config. """
  return check_output(['git', 'config', '--get', 'user.email']).strip()
+ +ROOT=$(cd $(dirname $BASH_SOURCE)/..; pwd) + +TMP=$(mktemp) +trap "rm $TMP" EXIT + +ONLY_CHANGED=false + +for flag in "$@" ; do + case $flag in + --changed-only | -c) + ONLY_CHANGED=true + ;; + *) + echo unknown flag: $flag + exit 1 + ;; + esac +done + +if $ONLY_CHANGED; then + FILES=$(git diff --name-only $($ROOT/build-support/get-upstream-commit.sh) \ + | egrep '\.(cc|h)$' | grep -v "gutil\|trace_event") + if [ -z "$FILES" ]; then + echo No source files changed + exit 0 + fi +else + FILES=$(find $ROOT/src -name '*.cc' -or -name '*.h' | grep -v "\.pb\.\|\.service\.\|\.proxy\.\|\.krpc\.\|gutil\|trace_event\|kudu_export\.h") +fi + +cd $ROOT + +$ROOT/thirdparty/installed/bin/cpplint.py \ + --verbose=4 \ + --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/include_order,-legal/copyright,-build/c++11 \ + $FILES 2>&1 | grep -v 'Done processing' | tee $TMP + +NUM_ERRORS=$(grep "Total errors found" $TMP | awk '{print $4}') + +if [ "$NUM_ERRORS" -ne 0 ]; then + exit 1 +fi diff --git a/build-support/lsan-suppressions.txt b/build-support/lsan-suppressions.txt new file mode 100644 index 000000000000..927afb39710b --- /dev/null +++ b/build-support/lsan-suppressions.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# False positive from atexit() registration in libc +leak:*__new_exitfn* diff --git a/build-support/parse_test_failure.py b/build-support/parse_test_failure.py new file mode 100755 index 000000000000..fd9f8c703757 --- /dev/null +++ b/build-support/parse_test_failure.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This script parses a test log (provided on stdin) and returns +# a summary of the error which caused the test to fail. + +from xml.sax.saxutils import quoteattr +import argparse +import re +import sys + +# Read at most 100MB of a test log. +# Rarely would this be exceeded, but we don't want to end up +# swapping, etc. 
+MAX_MEMORY = 100 * 1024 * 1024 + +START_TESTCASE_RE = re.compile(r'\[ RUN\s+\] (.+)$') +END_TESTCASE_RE = re.compile(r'\[\s+(?:OK|FAILED)\s+\] (.+)$') +ASAN_ERROR_RE = re.compile('ERROR: AddressSanitizer') +TSAN_ERROR_RE = re.compile('WARNING: ThreadSanitizer.*') +END_TSAN_ERROR_RE = re.compile('SUMMARY: ThreadSanitizer.*') +FATAL_LOG_RE = re.compile(r'^F\d\d\d\d \d\d:\d\d:\d\d\.\d\d\d\d\d\d\s+\d+ (.*)') +LEAK_CHECK_SUMMARY_RE = re.compile('Leak check.*detected leaks') +LINE_RE = re.compile(r"^.*$", re.MULTILINE) +STACKTRACE_ELEM_RE = re.compile(r'^ @') +IGNORED_STACKTRACE_ELEM_RE = re.compile( + r'(google::logging|google::LogMessage|\(unknown\)| testing::)') +TEST_FAILURE_RE = re.compile(r'.*\d+: Failure$') +GLOG_LINE_RE = re.compile(r'^[WIEF]\d\d\d\d \d\d:\d\d:\d\d') + +def consume_rest(line_iter): + """ Consume and return the rest of the lines in the iterator. """ + return [l.group(0) for l in line_iter] + +def consume_until(line_iter, end_re): + """ + Consume and return lines from the iterator until one matches 'end_re'. + The line matching 'end_re' will not be returned, but will be consumed. + """ + ret = [] + for l in line_iter: + line = l.group(0) + if end_re.search(line): + break + ret.append(line) + return ret + +def remove_glog_lines(lines): + """ Remove any lines from the list of strings which appear to be GLog messages. """ + return [l for l in lines if not GLOG_LINE_RE.search(l)] + +def record_error(errors, name, error): + errors.setdefault(name, []).append(error) + +def extract_failures(log_text): + cur_test_case = None + tests_seen = set() + tests_seen_in_order = list() + errors_by_test = dict() + + # Iterate over the lines, using finditer instead of .split() + # so that we don't end up doubling memory usage. 
+ line_iter = LINE_RE.finditer(log_text) + for match in line_iter: + line = match.group(0) + + # Track the currently-running test case + m = START_TESTCASE_RE.search(line) + if m: + cur_test_case = m.group(1) + if cur_test_case not in tests_seen: + tests_seen.add(cur_test_case) + tests_seen_in_order.append(cur_test_case) + + m = END_TESTCASE_RE.search(line) + if m: + cur_test_case = None + + # Look for ASAN errors. + m = ASAN_ERROR_RE.search(line) + if m: + error_signature = line + "\n" + asan_lines = remove_glog_lines(consume_rest(line_iter)) + error_signature += "\n".join(asan_lines) + record_error(errors_by_test, cur_test_case, error_signature) + + # Look for TSAN errors + m = TSAN_ERROR_RE.search(line) + if m: + error_signature = m.group(0) + error_signature += "\n".join(remove_glog_lines( + consume_until(line_iter, END_TSAN_ERROR_RE))) + record_error(errors_by_test, cur_test_case, error_signature) + + # Look for test failures + # - slight micro-optimization to check for substring before running the regex + m = 'Failure' in line and TEST_FAILURE_RE.search(line) + if m: + error_signature = m.group(0) + "\n" + error_signature += "\n".join(remove_glog_lines( + consume_until(line_iter, END_TESTCASE_RE))) + record_error(errors_by_test, cur_test_case, error_signature) + + # Look for fatal log messages (including CHECK failures) + # - slight micro-optimization to check for 'F' before running the regex + m = line and line[0] == 'F' and FATAL_LOG_RE.search(line) + if m: + error_signature = m.group(1) + "\n" + remaining_lines = consume_rest(line_iter) + remaining_lines = [l for l in remaining_lines if STACKTRACE_ELEM_RE.search(l) + and not IGNORED_STACKTRACE_ELEM_RE.search(l)] + error_signature += "\n".join(remaining_lines) + record_error(errors_by_test, cur_test_case, error_signature) + + # Look for leak check summary (comes at the end of a log, not part of a single test) + m = LEAK_CHECK_SUMMARY_RE.search(line) + if m: + heapcheck_test_case = "tcmalloc.heapcheck" + if 
heapcheck_test_case not in tests_seen: + tests_seen.add(heapcheck_test_case) + tests_seen_in_order.append(heapcheck_test_case) + error_signature = "Memory leak\n" + error_signature += line + "\n" + error_signature += "\n".join(consume_rest(line_iter)) + record_error(errors_by_test, heapcheck_test_case, error_signature) + + # Sometimes we see crashes that the script doesn't know how to parse. + # When that happens, we leave a generic message to be picked up by Jenkins. + if cur_test_case and cur_test_case not in errors_by_test: + record_error(errors_by_test, cur_test_case, "Unrecognized error type. Please see the error log for more information.") + + return (tests_seen_in_order, errors_by_test) + +# Return failure summary formatted as text. +def text_failure_summary(tests, errors_by_test): + msg = '' + for test_name in tests: + if test_name not in errors_by_test: + continue + for error in errors_by_test[test_name]: + if msg: msg += "\n" + msg += "%s: %s\n" % (test_name, error) + return msg + +# Parse log lines and return failure summary formatted as text. +# +# This helper function is part of a public API called from test_result_server.py +def extract_failure_summary(log_text): + (tests, errors_by_test) = extract_failures(log_text) + return text_failure_summary(tests, errors_by_test) + +# Print failure summary based on desired output format. +# 'tests' is a list of all tests run (in order), not just the failed ones. +# This allows us to print the test results in the order they were run. +# 'errors_by_test' is a dict of lists, keyed by test name. +def print_failure_summary(tests, errors_by_test, is_xml): + # Plain text dump. + if not is_xml: + sys.stdout.write(text_failure_summary(tests, errors_by_test)) + + # Fake a JUnit report file. 
+ else: + # Example format: + """ + + + + + + + + + + """ + cur_test_suite = None + print '' + + found_test_suites = False + for test_name in tests: + if test_name not in errors_by_test: + continue + + (test_suite, test_case) = test_name.split(".") + + # Test suite initialization or name change. + if test_suite and test_suite != cur_test_suite: + if cur_test_suite: + print ' ' + cur_test_suite = test_suite + print ' ' % cur_test_suite + found_test_suites = True + + # Print each test case. + print ' ' % (test_case, cur_test_suite) + errors = "\n\n".join(errors_by_test[test_name]) + first_line = re.sub("\n.*", '', errors) + print ' ' % quoteattr(first_line) + print '' + print ' ' + print ' ' + + if found_test_suites: + print ' ' + print '' + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument("-x", "--xml", help="Print output in JUnit report XML format (default: plain text)", + action="store_true") + parser.add_argument("path", nargs="?", help="File to parse. If not provided, parses stdin") + args = parser.parse_args() + + if args.path: + in_file = file(args.path) + else: + in_file = sys.stdin + log_text = in_file.read(MAX_MEMORY) + (tests, errors_by_test) = extract_failures(log_text) + print_failure_summary(tests, errors_by_test, args.xml) + +if __name__ == "__main__": + main() diff --git a/build-support/push_to_asf.py b/build-support/push_to_asf.py new file mode 100755 index 000000000000..0bae118eb2a9 --- /dev/null +++ b/build-support/push_to_asf.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script fetches branches from the Gerrit repository and +# allows ASF committers to propagate commits from gerrit into the +# official ASF repository. +# +# Current ASF policy is that this mirroring cannot be automatic +# and should be driven by a committer who inspects and signs off +# on the commits being made into the ASF. Additionally, the ASF +# prefers that in most cases, the committer according to source +# control should be the same person to push the commit to a git +# repository. +# +# This script provides the committer the opportunity to review the +# changes to be pushed, warns them if they are pushing code for +# which they weren't the committer, and performs the actual push. + +import logging +import optparse +import re +import subprocess +import sys + +from kudu_util import check_output, confirm_prompt, Colors, get_my_email + +APACHE_REPO = "https://git-wip-us.apache.org/repos/asf/incubator-kudu.git" +GERRIT_URL_RE = re.compile(r"ssh://.+@gerrit.cloudera.org:29418/kudu") + +# ANSI color codes. +Colors.RED = "\x1b[31m" +Colors.GREEN = "\x1b[32m" +Colors.YELLOW = "\x1b[33m" +Colors.RESET = "\x1b[m" + +# Parsed options, filled in by main(). +OPTIONS = None + + +def check_apache_remote(): + """ + Checks that there is a remote named 'apache' set up correctly. + Otherwise, exits with an error message. + """ + try: + url = check_output(['git', 'config', '--local', '--get', 'remote.apache.url']).strip() + except subprocess.CalledProcessError: + print >>sys.stderr, "No remote named 'apache'. 
Please set one up, for example with: " + print >>sys.stderr, " git remote add apache", APACHE_REPO + sys.exit(1) + if url != APACHE_REPO: + print >>sys.stderr, "Unexpected URL for remote 'apache'." + print >>sys.stderr, " Got: ", url + print >>sys.stderr, " Expected:", APACHE_REPO + sys.exit(1) + + +def check_gerrit_remote(): + """ + Checks that there is a remote named 'gerrit' set up correctly. + Otherwise, exits with an error message. + """ + try: + url = check_output(['git', 'config', '--local', '--get', 'remote.gerrit.url']).strip() + except subprocess.CalledProcessError: + print >>sys.stderr, "No remote named 'gerrit'. Please set one up following " + print >>sys.stderr, "the contributor guide." + sys.exit(1) + if not GERRIT_URL_RE.match(url): + print >>sys.stderr, "Unexpected URL for remote 'gerrit'." + print >>sys.stderr, " Got: ", url + print >>sys.stderr, " Expected to find host '%s' in the URL" % GERRIT_HOST + sys.exit(1) + + +def fetch(remote): + """Run git fetch for the given remote, including some logging.""" + logging.info("Fetching from remote '%s'..." % remote) + subprocess.check_call(['git', 'fetch', remote]) + logging.info("done") + + +def get_branches(remote): + """ Fetch a dictionary mapping branch name to SHA1 hash from the given remote. """ + out = check_output(["git", "ls-remote", remote, "refs/heads/*"]) + ret = {} + for l in out.splitlines(): + sha, ref = l.split("\t") + branch = ref.replace("refs/heads/", "", 1) + ret[branch] = sha + return ret + + +def rev_parse(rev): + """Run git rev-parse, returning the sha1, or None if not found""" + try: + return check_output(['git', 'rev-parse', rev], stderr=subprocess.STDOUT).strip() + except subprocess.CalledProcessError: + return None + + +def rev_list(arg): + """Run git rev-list, returning an array of SHA1 commit hashes.""" + return check_output(['git', 'rev-list', arg]).splitlines() + + +def describe_commit(rev): + """ Return a one-line description of a commit. 
""" + return subprocess.check_output( + ['git', 'log', '--color', '-n1', '--oneline', rev]).strip() + + +def is_fast_forward(ancestor, child): + """ + Return True if 'child' is a descendent of 'ancestor' and thus + could be fast-forward merged. + """ + try: + merge_base = check_output(['git', 'merge-base', ancestor, child]).strip() + except: + # If either of the commits is unknown, count this as a non-fast-forward. + return False + return merge_base == rev_parse(ancestor) + + +def get_committer_email(rev): + """ Return the email address of the committer of the given revision. """ + return check_output(['git', 'log', '-n1', '--pretty=format:%ce', rev]).strip() + + +def do_update(branch, gerrit_sha, apache_sha): + """ + Displays and performs a proposed update of the Apache repository + for branch 'branch' from 'apache_sha' to 'gerrit_sha'. + """ + # First, verify that the update is fast-forward. If it's not, then something + # must have gotten committed to Apache outside of gerrit, and we'd need some + # manual intervention. + if not is_fast_forward(apache_sha, gerrit_sha): + print >>sys.stderr, "Cannot update branch '%s' from gerrit:" % branch + print >>sys.stderr, "Apache revision %s is not an ancestor of gerrit revision %s" % ( + apache_sha[:8], gerrit_sha[:8]) + print >>sys.stderr, "Something must have been committed to Apache and bypassed gerrit." + print >>sys.stderr, "Manual intervention is required." + sys.exit(1) + + # List the commits that are going to be pushed to the ASF, so that the committer + # can verify and "sign off". + commits = rev_list("%s..%s" % (apache_sha, gerrit_sha)) + commits.reverse() # Display from oldest to newest. + print "-" * 60 + print Colors.GREEN + ("%d commit(s) need to be pushed from Gerrit to ASF:" % len(commits)) + Colors.RESET + push_sha = None + for sha in commits: + oneline = describe_commit(sha) + print " ", oneline + committer = get_committer_email(sha) + if committer != get_my_email(): + print Colors.RED + " !!! 
Committed by someone else (%s) !!!" % committer, Colors.RESET + if not confirm_prompt( + Colors.RED + " !!! Are you sure you want to push on behalf of another committer?" + Colors.RESET): + # Even if they don't want to push this commit, we could still push any + # earlier commits that the user _did_ author. + if push_sha is not None: + print "... will still update to prior commit %s..." % push_sha + break + push_sha = sha + if push_sha is None: + print "Nothing to push" + return + + # Everything has been confirmed. Do the actual push + cmd = ['git', 'push', 'apache'] + if OPTIONS.dry_run: + cmd.append('--dry-run') + cmd.append('%s:refs/heads/%s' % (push_sha, branch)) + print Colors.GREEN + "Running: " + Colors.RESET + " ".join(cmd) + subprocess.check_call(cmd) + print Colors.GREEN + "Successfully updated %s to %s" % (branch, gerrit_sha) + Colors.RESET + print + + +def main(): + global OPTIONS + p = optparse.OptionParser( + epilog=("See the top of the source code for more information on the purpose of " + + "this script.")) + p.add_option("-n", "--dry-run", action="store_true", + help="Perform git pushes with --dry-run") + OPTIONS, args = p.parse_args() + if args: + p.error("no arguments expected") + sys.exit(1) + + # Pre-flight checks. + check_apache_remote() + check_gerrit_remote() + + # Ensure we have the latest state of gerrit. + fetch('gerrit') + + # Check the current state of branches on Apache. + # For each branch, we try to update it if the revisions don't match. 
+ apache_branches = get_branches('apache') + for branch, apache_sha in sorted(apache_branches.iteritems()): + gerrit_sha = rev_parse("remotes/gerrit/" + branch) + print "Branch '%s':\t" % branch, + if gerrit_sha is None: + print Colors.YELLOW, "found on Apache but not in gerrit", Colors.RESET + continue + if gerrit_sha == apache_sha: + print Colors.GREEN, "up to date", Colors.RESET + continue + print Colors.YELLOW, "needs update", Colors.RESET + do_update(branch, gerrit_sha, apache_sha) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + main() diff --git a/build-support/report-test.sh b/build-support/report-test.sh new file mode 100755 index 000000000000..f5298dc59ff1 --- /dev/null +++ b/build-support/report-test.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Reports a test run to the central test server, which records +# the results in a database. This is what drives our "flaky test dashboard". +# This script does blocking network IO, so if you are running it from the +# context of a build, you may want to run it in the background. +# +# Note that this may exit with a non-zero code if the network is flaky or the +# test result server is down. 
+ +set -e + +ROOT=$(dirname $BASH_SOURCE)/.. + +# Verify and parse command line and options +if [ $# -ne 3 ]; then + echo "usage: $0 " + echo + echo The \$TEST_RESULT_SERVER environment variable may be used + echo to specify where to report the tests. + exit 1 +fi +TEST_EXECUTABLE=$1 +LOGFILE=$2 +STATUS=$3 +TEST_RESULT_SERVER=${TEST_RESULT_SERVER:-localhost:8080} +REPORT_TIMEOUT=${REPORT_TIMEOUT:-10} + +# On Jenkins, we'll have this variable set. Otherwise, +# report the build ID as non-jenkins. +BUILD_ID=${BUILD_TAG:-non-jenkins} + +# Figure out the current git revision, and append a "-dirty" tag if it's +# not a pristine checkout +REVISION=$(cd $ROOT && git rev-parse HEAD) +if ! ( cd $ROOT && git diff --quiet . && git diff --cached --quiet . ) ; then + REVISION="${REVISION}-dirty" +fi + +# Parse out our "build config" - a space-separated list of tags +# which include the cmake build type as well as the list of configured +# sanitizers + +CMAKECACHE=$ROOT/CMakeCache.txt +BUILD_CONFIG=$(grep '^CMAKE_BUILD_TYPE:' $CMAKECACHE | cut -f 2 -d=) +if grep -q "KUDU_USE_ASAN:UNINITIALIZED=1" $CMAKECACHE ; then + BUILD_CONFIG="$BUILD_CONFIG asan" +fi +if grep -q "KUDU_USE_TSAN:UNINITIALIZED=1" $CMAKECACHE ; then + BUILD_CONFIG="$BUILD_CONFIG tsan" +fi +if grep -q "KUDU_USE_UBSAN:UNINITIALIZED=1" $CMAKECACHE ; then + BUILD_CONFIG="$BUILD_CONFIG ubsan" +fi + +# We sometimes have flaky infrastructure where NTP is broken. In that case +# do not report it as a failed test. +if grep -q 'Clock considered unsynchronized' $LOGFILE ; then + echo Not reporting test that failed due to NTP issues. + exit 1 +fi + +# Only upload a log if the test failed. +# This saves some space on S3, network bandwidth, etc, and we don't +# have a lot of use for the logs of successful tests anyway. 
+if [ "$STATUS" -ne 0 ]; then + LOG_PARAM="-F log=@$LOGFILE" +else + LOG_PARAM="" +fi + +curl -s \ + --max-time $REPORT_TIMEOUT \ + $LOG_PARAM \ + -F "build_id=$BUILD_ID" \ + -F "hostname=$(hostname)" \ + -F "test_name=$(basename $TEST_EXECUTABLE)" \ + -F "status=$STATUS" \ + -F "revision=$REVISION" \ + -F "build_config=$BUILD_CONFIG" \ + http://$TEST_RESULT_SERVER/add_result diff --git a/build-support/run-test.sh b/build-support/run-test.sh new file mode 100755 index 000000000000..f9f7f60569c8 --- /dev/null +++ b/build-support/run-test.sh @@ -0,0 +1,248 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script which wraps running a test and redirects its output to a +# test log directory. +# +# If KUDU_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be +# gzip-compressed while they are written. +# +# If KUDU_FLAKY_TEST_ATTEMPTS is non-zero, and the test being run matches +# one of the lines in the file KUDU_FLAKY_TEST_LIST, then the test will +# be retried on failure up to the specified number of times. This can be +# used in the gerrit workflow to prevent annoying false -1s caused by +# tests that are known to be flaky in master. 
+# +# If KUDU_REPORT_TEST_RESULTS is non-zero, then tests are reported to the +# central test server. + +# Path to the test executable or script to be run. +# May be relative or absolute. +TEST_PATH=$1 + +# Absolute path to the root source directory. This script is expected to live within it. +SOURCE_ROOT=$(cd $(dirname "$BASH_SOURCE")/.. ; pwd) + +# Absolute path to the root build directory. The test path is expected to be within it. +BUILD_ROOT=$(cd $(dirname "$TEST_PATH")/.. ; pwd) + +TEST_LOGDIR=$BUILD_ROOT/test-logs +mkdir -p $TEST_LOGDIR + +TEST_DEBUGDIR=$BUILD_ROOT/test-debug +mkdir -p $TEST_DEBUGDIR + +TEST_DIRNAME=$(cd $(dirname $TEST_PATH); pwd) +TEST_FILENAME=$(basename $TEST_PATH) +ABS_TEST_PATH=$TEST_DIRNAME/$TEST_FILENAME +shift +TEST_NAME=$(echo $TEST_FILENAME | perl -pe 's/\..+?$//') # Remove path and extension (if any). + +# Determine whether the test is a known flaky by comparing against the user-specified +# list. +TEST_EXECUTION_ATTEMPTS=1 +if [ -n "$KUDU_FLAKY_TEST_LIST" ]; then + if [ -f "$KUDU_FLAKY_TEST_LIST" ]; then + IS_KNOWN_FLAKY=$(grep --count --line-regexp "$TEST_NAME" "$KUDU_FLAKY_TEST_LIST") + else + echo "Flaky test list file $KUDU_FLAKY_TEST_LIST missing" + IS_KNOWN_FLAKY=0 + fi + if [ "$IS_KNOWN_FLAKY" -gt 0 ]; then + TEST_EXECUTION_ATTEMPTS=${KUDU_FLAKY_TEST_ATTEMPTS:-1} + echo $TEST_NAME is a known-flaky test. Will attempt running it + echo up to $TEST_EXECUTION_ATTEMPTS times. + fi +fi + + +# We run each test in its own subdir to avoid core file related races. +TEST_WORKDIR=$BUILD_ROOT/test-work/$TEST_NAME +mkdir -p $TEST_WORKDIR +pushd $TEST_WORKDIR >/dev/null || exit 1 +rm -f * + +set -o pipefail + +LOGFILE=$TEST_LOGDIR/$TEST_NAME.txt +XMLFILE=$TEST_LOGDIR/$TEST_NAME.xml + +# Remove both the compressed and uncompressed output, so the developer +# doesn't accidentally get confused and read output from a prior test +# run. 
+rm -f $LOGFILE $LOGFILE.gz + +if [ -n "$KUDU_COMPRESS_TEST_OUTPUT" ] && [ "$KUDU_COMPRESS_TEST_OUTPUT" -ne 0 ] ; then + pipe_cmd=gzip + LOGFILE=${LOGFILE}.gz +else + pipe_cmd=cat +fi + +# Suppressions require symbolization. We'll default to using the symbolizer in +# thirdparty. +if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then + export ASAN_SYMBOLIZER_PATH=$SOURCE_ROOT/thirdparty/clang-toolchain/bin/llvm-symbolizer +fi + +# Configure TSAN (ignored if this isn't a TSAN build). +# +# Deadlock detection (new in clang 3.5) is disabled because: +# 1. The clang 3.5 deadlock detector crashes in some Kudu unit tests. It +# needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others. +# 2. Many unit tests report lock-order-inversion warnings; they should be +# fixed before reenabling the detector. +TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0" +TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$SOURCE_ROOT/build-support/tsan-suppressions.txt" +TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" +TSAN_OPTIONS="$TSAN_OPTIONS external_symbolizer_path=$ASAN_SYMBOLIZER_PATH" +export TSAN_OPTIONS + +# Enable leak detection even under LLVM 3.4, where it was disabled by default. +# This flag only takes effect when running an ASAN build. +ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" +export ASAN_OPTIONS + +# Set up suppressions for LeakSanitizer +LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$SOURCE_ROOT/build-support/lsan-suppressions.txt" +export LSAN_OPTIONS + +# Set a 15-minute timeout for tests run via 'make test'. +# This keeps our jenkins builds from hanging in the case that there's +# a deadlock or anything. +KUDU_TEST_TIMEOUT=${KUDU_TEST_TIMEOUT:-900} + +# Allow for collecting core dumps. +KUDU_TEST_ULIMIT_CORE=${KUDU_TEST_ULIMIT_CORE:-0} +ulimit -c $KUDU_TEST_ULIMIT_CORE + +# Run the actual test. 
+for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do + if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then + # If the test fails, the test output may or may not be left behind, + # depending on whether the test cleaned up or exited immediately. Either + # way we need to clean it up. We do this by comparing the data directory + # contents before and after the test runs, and deleting anything new. + # + # The comm program requires that its two inputs be sorted. + TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort) + fi + + # gtest won't overwrite old junit test files, resulting in a build failure + # even when retries are successful. + rm -f $XMLFILE + + echo "Running $TEST_NAME, redirecting output into $LOGFILE" \ + "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)" + $ABS_TEST_PATH "$@" --test_timeout_after $KUDU_TEST_TIMEOUT 2>&1 \ + | $SOURCE_ROOT/build-support/stacktrace_addr2line.pl $ABS_TEST_PATH \ + | $pipe_cmd > $LOGFILE + STATUS=$? + + # TSAN doesn't always exit with a non-zero exit code due to a bug: + # mutex errors don't get reported through the normal error reporting infrastructure. + # So we make sure to detect this and exit 1. + # + # Additionally, certain types of failures won't show up in the standard JUnit + # XML output from gtest. We assume that gtest knows better than us and our + # regexes in most cases, but for certain errors we delete the resulting xml + # file and let our own post-processing step regenerate it. + export GREP=$(which egrep) + if zgrep --silent "ThreadSanitizer|Leak check.*detected leaks" $LOGFILE ; then + echo ThreadSanitizer or leak check failures in $LOGFILE + STATUS=1 + rm -f $XMLFILE + fi + + if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then + # Now delete any new test output. 
+ TEST_TMPDIR_AFTER=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort) + DIFF=$(comm -13 <(echo "$TEST_TMPDIR_BEFORE") \ + <(echo "$TEST_TMPDIR_AFTER")) + for DIR in $DIFF; do + # Multiple tests may be running concurrently. To avoid deleting the + # wrong directories, constrain to only directories beginning with the + # test name. + # + # This may delete old test directories belonging to this test, but + # that's not typically a concern when rerunning flaky tests. + if [[ $DIR =~ ^$TEST_TMPDIR/$TEST_NAME ]]; then + echo Deleting leftover flaky test directory "$DIR" + rm -Rf "$DIR" + fi + done + fi + + if [ -n "$KUDU_REPORT_TEST_RESULTS" ]; then + echo Reporting results + $SOURCE_ROOT/build-support/report-test.sh "$ABS_TEST_PATH" "$LOGFILE" "$STATUS" & + + # On success, we'll do "best effort" reporting, and disown the subprocess. + # On failure, we want to upload the failed test log. So, in that case, + # wait for the report-test.sh job to finish, lest we accidentally run + # a test retry and upload the wrong log. + if [ "$STATUS" -eq "0" ]; then + disown + else + wait + fi + fi + + if [ "$STATUS" -eq "0" ]; then + break + elif [ "$ATTEMPT_NUMBER" -lt "$TEST_EXECUTION_ATTEMPTS" ]; then + echo Test failed attempt number $ATTEMPT_NUMBER + echo Will retry... + fi +done + +# If we have a LeakSanitizer report, and XML reporting is configured, add a new test +# case result to the XML file for the leak report. Otherwise Jenkins won't show +# us which tests had LSAN errors. +if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then + echo Test had memory leaks. Editing XML + perl -p -i -e ' + if (m##) { + print "\n"; + print " \n"; + print " See txt log file for details\n"; + print " \n"; + print "\n"; + }' $XMLFILE +fi + +# Capture and compress core file and binary. +COREFILES=$(ls | grep ^core) +if [ -n "$COREFILES" ]; then + echo Found core dump. Saving executable and core files. + gzip < $ABS_TEST_PATH > "$TEST_DEBUGDIR/$TEST_NAME.gz" || exit $? 
+ for COREFILE in $COREFILES; do + gzip < $COREFILE > "$TEST_DEBUGDIR/$TEST_NAME.$COREFILE.gz" || exit $? + done + # Pull in any .so files as well. + for LIB in $(ldd $ABS_TEST_PATH | grep $BUILD_ROOT | awk '{print $3}'); do + LIB_NAME=$(basename $LIB) + gzip < $LIB > "$TEST_DEBUGDIR/$LIB_NAME.gz" || exit $? + done +fi + +popd +rm -Rf $TEST_WORKDIR + +exit $STATUS diff --git a/build-support/run_dist_test.py b/build-support/run_dist_test.py new file mode 100755 index 000000000000..71c09507b696 --- /dev/null +++ b/build-support/run_dist_test.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python2 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This script runs on the distributed-test slave and acts +# as a wrapper around run-test.sh. +# +# The distributed testing system can't pass in environment variables +# to commands, so this takes some parameters, turns them into environment +# variables, and then executes the test wrapper. +# +# We also 'cat' the test log upon completion so that the test logs are +# uploaded by the test slave back. 
+ +import optparse +import os +import re +import shutil +import subprocess +import sys + +ME = os.path.abspath(__file__) +ROOT = os.path.abspath(os.path.join(os.path.dirname(ME), "..")) + +def is_elf_binary(path): + """ Determine if the given path is an ELF binary (executable or shared library) """ + if not os.path.isfile(path) or os.path.islink(path): + return False + try: + with file(path, "rb") as f: + magic = f.read(4) + return magic == "\x7fELF" + except: + # Ignore unreadable files + return False + +def fix_rpath_component(bin_path, path): + """ + Given an RPATH component 'path' of the binary located at 'bin_path', + fix the thirdparty dir to be relative to the binary rather than absolute. + """ + rel_tp = os.path.relpath(os.path.join(ROOT, "thirdparty/"), + os.path.dirname(bin_path)) + path = re.sub(r".*thirdparty/", "$ORIGIN/"+rel_tp + "/", path) + return path + +def fix_rpath(path): + """ + Fix the RPATH/RUNPATH of the binary located at 'path' so that + the thirdparty/ directory is properly found, even though we will + run the binary at a different path than it was originally built. + """ + # Fetch the original rpath. + p = subprocess.Popen(["chrpath", path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + if p.returncode != 0: + return + rpath = re.search("R(?:UN)?PATH=(.+)", stdout.strip()).group(1) + # Fix it to be relative. + new_path = ":".join(fix_rpath_component(path, c) for c in rpath.split(":")) + # Write the new rpath back into the binary. + subprocess.check_call(["chrpath", "-r", new_path, path]) + +def fixup_rpaths(root): + """ + Recursively walk the directory tree 'root' and fix the RPATH for any + ELF files (binaries/libraries) that are found. 
+ """ + for dirpath, dirnames, filenames in os.walk(root): + for f in filenames: + p = os.path.join(dirpath, f) + if is_elf_binary(p): + fix_rpath(p) + +def main(): + p = optparse.OptionParser(usage="usage: %prog [options] ") + p.add_option("-e", "--env", dest="env", type="string", action="append", + help="key=value pairs for environment variables", + default=[]) + options, args = p.parse_args() + if len(args) < 1: + p.print_help(sys.stderr) + sys.exit(1) + test_exe = args[0] + test_name, _ = os.path.splitext(os.path.basename(test_exe)) + test_dir = os.path.dirname(test_exe) + + env = os.environ.copy() + for env_pair in options.env: + (k, v) = env_pair.split("=", 1) + env[k] = v + + # Fix the RPATHs of any binaries. During the build, we end up with + # absolute paths from the build machine. This fixes the paths to be + # binary-relative so that we can run it on the new location. + # + # It's important to do this rather than just putting all of the thirdparty + # lib directories into $LD_LIBRARY_PATH below because we need to make sure + # that non-TSAN-instrumented runtime tools (like 'llvm-symbolizer') do _NOT_ + # pick up the TSAN-instrumented libraries, whereas TSAN-instrumented test + # binaries (like 'foo_test' or 'kudu-tserver') _DO_ pick them up. + fixup_rpaths(os.path.join(ROOT, "build")) + fixup_rpaths(os.path.join(ROOT, "thirdparty")) + + env['LD_LIBRARY_PATH'] = ":".join( + [os.path.join(ROOT, "build/dist-test-system-libs/"), + os.path.abspath(os.path.join(test_dir, "..", "lib"))]) + + # GTEST_OUTPUT must be canonicalized and have a trailing slash for gtest to + # properly interpret it as a directory. 
+ env['GTEST_OUTPUT'] = 'xml:' + os.path.abspath( + os.path.join(test_dir, "..", "test-logs")) + '/' + + env['ASAN_SYMBOLIZER_PATH'] = os.path.join(ROOT, "thirdparty/installed/bin/llvm-symbolizer") + rc = subprocess.call([os.path.join(ROOT, "build-support/run-test.sh")] + args, + env=env) + sys.exit(rc) + + +if __name__ == "__main__": + main() diff --git a/build-support/sanitize-blacklist.txt b/build-support/sanitize-blacklist.txt new file mode 100644 index 000000000000..6264b919aa12 --- /dev/null +++ b/build-support/sanitize-blacklist.txt @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Blacklist of things not to run the sanitizer on. + +# The safe math routines are expected to overflow, but anyone using them +# will check the resulting overflow flags. +fun:*kudu18safe_math_internal* + +# Workaround for a bug in clang's libstdc++ which causes a false positive +# trying to use ostream << std::hex +# See http://llvm.org/bugs/show_bug.cgi?id=18156 +fun:*_Ios_Fmtflags* + +# Workaround for a problem with gmock where a runtime error is caused by a call on a null pointer, +# on a mocked object. 
+# Seen error: +# thirdparty/gmock-1.7.0/include/gmock/gmock-spec-builders.h:1529:12: runtime error: member call on null pointer of type 'testing::internal::ActionResultHolder' +fun:*testing*internal*InvokeWith* diff --git a/build-support/stacktrace_addr2line.pl b/build-support/stacktrace_addr2line.pl new file mode 100755 index 000000000000..b81b91d4ca0e --- /dev/null +++ b/build-support/stacktrace_addr2line.pl @@ -0,0 +1,96 @@ +#!/usr/bin/perl +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+####################################################################### +# This script will convert a stack trace with addresses: +# @ 0x5fb015 kudu::master::Master::Init() +# @ 0x5c2d38 kudu::master::MiniMaster::StartOnPorts() +# @ 0x5c31fa kudu::master::MiniMaster::Start() +# @ 0x58270a kudu::MiniCluster::Start() +# @ 0x57dc71 kudu::CreateTableStressTest::SetUp() +# To one with line numbers: +# @ 0x5fb015 kudu::master::Master::Init() at /home/mpercy/src/kudu/src/master/master.cc:54 +# @ 0x5c2d38 kudu::master::MiniMaster::StartOnPorts() at /home/mpercy/src/kudu/src/master/mini_master.cc:52 +# @ 0x5c31fa kudu::master::MiniMaster::Start() at /home/mpercy/src/kudu/src/master/mini_master.cc:33 +# @ 0x58270a kudu::MiniCluster::Start() at /home/mpercy/src/kudu/src/integration-tests/mini_cluster.cc:48 +# @ 0x57dc71 kudu::CreateTableStressTest::SetUp() at /home/mpercy/src/kudu/src/integration-tests/create-table-stress-test.cc:61 +# +# If the script detects that the output is not symbolized, it will also attempt +# to determine the function names, i.e. it will convert: +# @ 0x5fb015 +# @ 0x5c2d38 +# @ 0x5c31fa +# To: +# @ 0x5fb015 kudu::master::Master::Init() at /home/mpercy/src/kudu/src/master/master.cc:54 +# @ 0x5c2d38 kudu::master::MiniMaster::StartOnPorts() at /home/mpercy/src/kudu/src/master/mini_master.cc:52 +# @ 0x5c31fa kudu::master::MiniMaster::Start() at /home/mpercy/src/kudu/src/master/mini_master.cc:33 +####################################################################### +use strict; +use warnings; + +if (!@ARGV) { + die < is magical in Perl. +while (defined(my $input = )) { + if ($input =~ /^\s+\@\s+(0x[[:xdigit:]]{6,})(?:\s+(\S+))?/) { + my $addr = $1; + my $lookup_func_name = (!defined $2); + if (!exists($addr2line_map{$addr})) { + $addr2line_map{$addr} = `addr2line -ifC -e $binary $addr`; + } + chomp $input; + $input .= parse_addr2line_output($addr2line_map{$addr}, $lookup_func_name) . 
"\n"; + } + print $input; +} + +exit 0; diff --git a/build-support/test_result_server.py b/build-support/test_result_server.py new file mode 100755 index 000000000000..8570eb2c9536 --- /dev/null +++ b/build-support/test_result_server.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Simple HTTP server which receives test results from the build slaves and +# stores them in a MySQL database. The test logs are also stored in an S3 bucket. +# +# Configuration here is done via environment variables: +# +# MySQL config: +# MYSQLHOST - host running mysql +# MYSQLUSER - username +# MYSQLPWD - password +# MYSQLDB - mysql database +# +# S3 config: +# AWS_ACCESS_KEY - AWS access key +# AWS_SECRET_KEY - AWS secret key +# TEST_RESULT_BUCKET - bucket to store results in (eg 'kudu-test-results') +# +# If the AWS credentials are not configured, falls back to using Boto's +# default configuration (http://boto.cloudhackers.com/en/latest/boto_config_tut.html) +# +# Installation instructions: +# You probably want to run this inside a virtualenv to avoid having +# to install python modules systemwide. For example: +# $ virtualenv ~/flaky-test-server-env/ +# $ . 
~/flaky-test-server-env/bin/activate +# $ pip install boto +# $ pip install jinja2 +# $ pip install cherrypy +# $ pip install MySQL-python + +import boto +import cherrypy +import gzip +import itertools +from jinja2 import Template +import logging +import MySQLdb +import os +import parse_test_failure +from StringIO import StringIO +import threading +import uuid + +class TRServer(object): + def __init__(self): + self.thread_local = threading.local() + self.ensure_table() + self.s3 = self.connect_s3() + self.s3_bucket = self.s3.get_bucket(os.environ["TEST_RESULT_BUCKET"]) + + def connect_s3(self): + access_key = os.environ.get("AWS_ACCESS_KEY") + secret_key = os.environ.get("AWS_SECRET_KEY") + s3 = boto.connect_s3(access_key, secret_key) + logging.info("Connected to S3 with access key %s" % access_key) + return s3 + + + def upload_to_s3(self, key, fp, filename): + k = boto.s3.key.Key(self.s3_bucket) + k.key = key + # The Content-Disposition header sets the filename that the browser + # will use to download this. + # We have to cast to str() here, because boto will try to escape the header + # incorrectly if you pass a unicode string. + k.set_metadata('Content-Disposition', str('inline; filename=%s' % filename)) + k.set_contents_from_string(fp.read(), + reduced_redundancy=True) + + def connect_mysql(self): + if hasattr(self.thread_local, "db") and \ + self.thread_local.db is not None: + return self.thread_local.db + + host = os.environ["MYSQLHOST"] + user = os.environ["MYSQLUSER"] + pwd = os.environ["MYSQLPWD"] + db = os.environ["MYSQLDB"] + self.thread_local.db = MySQLdb.connect(host, user, pwd, db) + self.thread_local.db.autocommit(True) + logging.info("Connected to MySQL at %s" % host) + return self.thread_local.db + + def execute_query(self, query, *args): + """ Execute a query, automatically reconnecting on disconnection. """ + # We'll try up to 3 times to reconnect + MAX_ATTEMPTS = 3 + + # Error code for the "MySQL server has gone away" error. 
+ MYSQL_SERVER_GONE_AWAY = 2006 + + attempt_num = 0 + while True: + c = self.connect_mysql().cursor(MySQLdb.cursors.DictCursor) + attempt_num = attempt_num + 1 + try: + c.execute(query, *args) + return c + except MySQLdb.OperationalError as err: + if err.args[0] == MYSQL_SERVER_GONE_AWAY and attempt_num < MAX_ATTEMPTS: + logging.warn("Forcing reconnect to MySQL: %s" % err) + self.thread_local.db = None + continue + else: + raise + + + def ensure_table(self): + c = self.execute_query(""" + CREATE TABLE IF NOT EXISTS test_results ( + id int not null auto_increment primary key, + timestamp timestamp not null default current_timestamp, + build_id varchar(100), + revision varchar(50), + build_config varchar(100), + hostname varchar(255), + test_name varchar(100), + status int, + log_key char(40), + INDEX (revision), + INDEX (test_name), + INDEX (timestamp) + );""") + + @cherrypy.expose + def index(self): + return "Welcome to the test result server!" + + @cherrypy.expose + def add_result(self, **kwargs): + args = {} + args.update(kwargs) + + # Only upload the log if it's provided. 
+ if 'log' in kwargs: + log = kwargs['log'] + s3_id = uuid.uuid1() + self.upload_to_s3(s3_id, log.file, log.filename) + else: + s3_id = None + args['log_key'] = s3_id + + logging.info("Handling report: %s" % repr(args)) + + self.execute_query( + "INSERT INTO test_results(build_id, revision, build_config, hostname, test_name, status, log_key) " + "VALUES (%(build_id)s, %(revision)s, %(build_config)s, %(hostname)s, %(test_name)s," + "%(status)s, %(log_key)s)", + args) + return "Success!\n" + + @cherrypy.expose + def download_log(self, key): + expiry = 60 * 60 * 24 # link should last 1 day + k = boto.s3.key.Key(self.s3_bucket) + k.key = key + raise cherrypy.HTTPRedirect(k.generate_url(expiry)) + + @cherrypy.expose + def diagnose(self, key): + k = boto.s3.key.Key(self.s3_bucket) + k.key = key + log_text_gz = k.get_contents_as_string() + log_text = gzip.GzipFile(fileobj=StringIO(log_text_gz)).read().decode('utf-8') + summary = parse_test_failure.extract_failure_summary(log_text) + if not summary: + summary = "Unable to diagnose" + template = Template(""" +

Diagnosed failure

+
{{ summary|e }}
+

Full log

+
{{ log_text|e }}
+ """) + return self.render_container(template.render(summary=summary, log_text=log_text)) + + def recently_failed_html(self): + """ Return an HTML report of recently failed tests """ + c = self.execute_query( + "SELECT * from test_results WHERE status != 0 " + "AND timestamp > NOW() - INTERVAL 1 WEEK " + "ORDER BY timestamp DESC LIMIT 50") + failed_tests = c.fetchall() + + prev_date = None + for t in failed_tests: + t['is_new_date'] = t['timestamp'].date() != prev_date + prev_date = t['timestamp'].date() + + template = Template(""" +

50 most recent failures

+ + + + + + + + + + + {% for run in failed_tests %} + {% if run.is_new_date %} + + + + {% endif %} + + + + + + + + + + {% endfor %} +
testconfigexit coderevmachinetimebuild
{{ run.timestamp.date()|e }}
+ {{ run.test_name |e }} + {{ run.build_config |e }}{{ run.status |e }} + {% if run.log_key %} + failure log | + diagnose + {% endif %} + {{ run.revision |e }}{{ run.hostname |e }}{{ run.timestamp |e }}{{ run.build_id |e }}
+ """) + return template.render(failed_tests=failed_tests) + + def flaky_report_html(self): + """ Return an HTML report of recently flaky tests """ + c = self.execute_query( + """SELECT + test_name, + DATEDIFF(NOW(), timestamp) AS days_ago, + SUM(IF(status != 0, 1, 0)) AS num_failures, + COUNT(*) AS num_runs + FROM test_results + WHERE timestamp > NOW() - INTERVAL 1 WEEK + GROUP BY test_name, days_ago + HAVING num_failures > 0 + ORDER BY test_name""") + rows = c.fetchall() + + results = [] + for test_name, test_rows in itertools.groupby(rows, lambda r: r['test_name']): + # Convert to list so we can consume it multiple times + test_rows = list(test_rows) + + # Compute summary for last 7 days and last 2 days + runs_7day = sum(r['num_runs'] for r in test_rows) + failures_7day = sum(r['num_failures'] for r in test_rows) + runs_2day = sum(r['num_runs'] for r in test_rows if r['days_ago'] < 2) + failures_2day = sum(r['num_failures'] for r in test_rows if r['days_ago'] < 2) + + # Compute a sparkline (percentage failure for each day) + sparkline = [0 for x in xrange(8)] + for r in test_rows: + if r['num_runs'] > 0: + percent = float(r['num_failures']) / r['num_runs'] * 100 + else: + percent = 0 + sparkline[7 - r['days_ago']] = percent + + # Add to results list for tablet. + results.append(dict(test_name=test_name, + runs_7day=runs_7day, + failures_7day=failures_7day, + runs_2day=runs_2day, + failures_2day=failures_2day, + sparkline=",".join("%.2f" % p for p in sparkline))) + + return Template(""" +

Flaky rate over last week

+ + + + + + + + {% for r in results %} + + + + + + + {% endfor %} +
testfailure rate (7-day)failure rate (2-day)trend
+ {{ r.test_name |e }} + {{ r.failures_7day |e }} / {{ r.runs_7day }} + ({{ "%.2f"|format(r.failures_7day / r.runs_7day * 100) }}%) + {{ r.failures_2day |e }} / {{ r.runs_2day }} + {% if r.runs_2day > 0 %} + ({{ "%.2f"|format(r.failures_2day / r.runs_2day * 100) }}%) + {% endif %} + {{ r.sparkline |e }}
+ + """).render(results=results) + + @cherrypy.expose + def list_failed_tests(self, build_pattern, num_days): + num_days = int(num_days) + c = self.execute_query( + """SELECT DISTINCT + test_name + FROM test_results + WHERE timestamp > NOW() - INTERVAL %(num_days)s DAY + AND status != 0 + AND build_id LIKE %(build_pattern)s""", + dict(build_pattern=build_pattern, + num_days=num_days)) + cherrypy.response.headers['Content-Type'] = 'text/plain' + return "\n".join(row['test_name'] for row in c.fetchall()) + + @cherrypy.expose + def test_drilldown(self, test_name): + + # Get summary statistics for the test, grouped by revision + c = self.execute_query( + """SELECT + revision, + MIN(timestamp) AS first_run, + SUM(IF(status != 0, 1, 0)) AS num_failures, + COUNT(*) AS num_runs + FROM test_results + WHERE timestamp > NOW() - INTERVAL 1 WEEK + AND test_name = %(test_name)s + GROUP BY revision + ORDER BY first_run DESC""", + dict(test_name=test_name)) + revision_rows = c.fetchall() + + # Convert to a dictionary, by revision + rev_dict = dict( [(r['revision'], r) for r in revision_rows] ) + + # Add an empty 'runs' array to each revision to be filled in below + for r in revision_rows: + r['runs'] = [] + + # Append the specific info on failures + c.execute("SELECT * from test_results " + "WHERE timestamp > NOW() - INTERVAL 1 WEEK " + "AND test_name = %(test_name)s " + "AND status != 0", + dict(test_name=test_name)) + for failure in c.fetchall(): + rev_dict[failure['revision']]['runs'].append(failure) + + return self.render_container(Template(""" +

{{ test_name |e }} flakiness over recent revisions

+ {% for r in revision_rows %} +

{{ r.revision }} (Failed {{ r.num_failures }} / {{ r.num_runs }})

+ {% if r.num_failures > 0 %} + + + + + + + + + {% for run in r.runs %} + + + + + + + + {% endfor %} +
timeconfigexit codemachinebuild
{{ run.timestamp |e }}{{ run.build_config |e }}{{ run.status |e }} + {% if run.log_key %} + failure log | + diagnose + {% endif %} + {{ run.hostname |e }}{{ run.build_id |e }}
+ {% endif %} + {% endfor %} + """).render(revision_rows=revision_rows, test_name=test_name)) + + @cherrypy.expose + def index(self): + body = self.flaky_report_html() + body += "
" + body += self.recently_failed_html() + return self.render_container(body) + + def render_container(self, body): + """ Render the "body" HTML inside of a bootstrap container page. """ + template = Template(""" + + + Kudu test results + + + + + + + +
+ {{ body }} +
+ + + """) + return template.render(body=body) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + cherrypy.config.update( + {'server.socket_host': '0.0.0.0'} ) + cherrypy.quickstart(TRServer()) diff --git a/build-support/tools/kudu-lint/.gitignore b/build-support/tools/kudu-lint/.gitignore new file mode 100644 index 000000000000..d78d765bce6a --- /dev/null +++ b/build-support/tools/kudu-lint/.gitignore @@ -0,0 +1,2 @@ +# The binary +kudu-lint diff --git a/build-support/tools/kudu-lint/CMakeLists.txt b/build-support/tools/kudu-lint/CMakeLists.txt new file mode 100644 index 000000000000..77101e9e2ca7 --- /dev/null +++ b/build-support/tools/kudu-lint/CMakeLists.txt @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +cmake_minimum_required(VERSION 2.8) + +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake_modules" ${CMAKE_MODULE_PATH}) + +# Find LLVM +find_package(LLVM REQUIRED COMPONENTS + option + analysis + bitreader + mcparser + instrumentation +) +find_package(Clang REQUIRED COMPONENTS + clang + clangAnalysis + clangAST + clangASTMatchers + clangBasic + clangCodeGen + clangDriver + clangEdit + clangFrontend + clangFrontendTool + clangLex + clangParse + clangSema + clangSerialization + clangTooling +) + +add_definitions(${LLVM_DEFINITIONS}) +include_directories(${CLANG_INCLUDE}) +include_directories(${LLVM_INCLUDE_DIRS} ${CLANG_INCLUDE_DIRS}) +link_directories(${LLVM_LIBRARY_DIRS}) + +set(START_GROUP "-Wl,--start-group") +set(END_GROUP "-Wl,--end-group") + +add_executable(kudu-lint kudu-lint.cc) +target_link_libraries(kudu-lint + ${START_GROUP} + ${CLANG_LIBS} + ${LLVM_LIBS} + ${END_GROUP} + ${LLVM_LDFLAGS} + ) + +# Disable RTTI since we have to inherit from Clang-provided classes, +# and Clang does not enable RTTI. +set_target_properties(kudu-lint PROPERTIES + COMPILE_FLAGS "${LLVM_CFLAGS} -fno-rtti -g") diff --git a/build-support/tools/kudu-lint/README b/build-support/tools/kudu-lint/README new file mode 100644 index 000000000000..d2eff1b723d1 --- /dev/null +++ b/build-support/tools/kudu-lint/README @@ -0,0 +1,46 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Overview +=============== +kudu-lint is a Clang-based tool for looking for kudu-specific coding errors. 
+ +Currently, it only looks for cases where a function returning Status is called, +and then that Status is ignored. + +Over time we can add more AST matchers for other Kudu-specific lint checks we'd +like to implement. + + +Requirements +=============== + +This has currently only been tested against clang 3.4. Clang APIs tend to change +between versions, so this is unlikely to work on earlier versions of clang. + +Building +========= + +Set the CLANG_ROOT environment variable to point to the path at which clang/llvm +have been installed. This is the directory which contains bin/, lib/, etc. + +Run 'cmake .', followed by 'make', in this directory. + +Running +========= + +In the top-level kudu source directory, run: + +$ cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON . +$ find src -name \*.cc | xargs -n1 -P8 ./build-support/tools/kudu-lint/kudu-lint \ + -p . -extra-arg=-I$CLANG_ROOT/lib/clang/3.4/include diff --git a/build-support/tools/kudu-lint/cmake_modules/FindClang.cmake b/build-support/tools/kudu-lint/cmake_modules/FindClang.cmake new file mode 100644 index 000000000000..34015d5cd185 --- /dev/null +++ b/build-support/tools/kudu-lint/cmake_modules/FindClang.cmake @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# Find Clang +# +# It defines the following variables +# CLANG_FOUND - True if Clang found. +# CLANG_INCLUDE_DIRS - where to find Clang include files +# CLANG_LIBS - list of clang libs +# CLANG_LDFLAGS - list w/format: -lclangAST -lclangLex... +if(NOT LLVM_INCLUDE_DIRS OR NOT LLVM_LIBRARY_DIRS) + message(FATAL_ERROR "Clang support requires LLVM to be set up first.") +endif() + +if(NOT Clang_FIND_COMPONENTS) + message(FATAL_ERROR "Must specify which clang COMPONENTS are required") +endif() + +foreach(component ${Clang_FIND_COMPONENTS}) + find_library(CLANG_${component}_LIB ${component} + PATHS ${LLVM_LIBRARY_DIRS} ${CLANG_LIBRARY_DIRS}) + if(CLANG_${component}_LIB) + message(STATUS "Adding Clang component: ${component}") + set(CLANG_LIBS ${CLANG_LIBS} ${CLANG_${component}_LIB}) + set(CLANG_LDFLAGS ${CLANG_LDFLAGS} "-l${component}") + elseif(Clang_FIND_REQUIRED_${component}) + message(FATAL_ERROR "Could not find required Clang component ${component}\n" + "Please set CLANG_ROOT.") + endif() +endforeach(component) + +find_path(CLANG_INCLUDE_DIRS clang/Basic/Version.h HINTS ${LLVM_INCLUDE_DIRS}) +if(CLANG_LIBS AND CLANG_INCLUDE_DIRS) + message(STATUS "Found Clang libs: ${CLANG_LIBS}") + message(STATUS "Found Clang includes: ${CLANG_INCLUDE_DIRS}") +elseif(Clang_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find Clang") +endif() diff --git a/build-support/tools/kudu-lint/cmake_modules/FindLLVM.cmake b/build-support/tools/kudu-lint/cmake_modules/FindLLVM.cmake new file mode 100644 index 000000000000..346e89a25cee --- /dev/null +++ b/build-support/tools/kudu-lint/cmake_modules/FindLLVM.cmake @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# Find LLVM +# +# It defines the following variables +# LLVM_FOUND - True if llvm found. +# LLVM_INCLUDE_DIRS - where to find llvm include files +# LLVM_LIBRARY_DIRS - where to find llvm libs +# LLVM_CFLAGS - llvm compiler flags +# LLVM_LDFLAGS - llvm linker flags +# LLVM_LIBS - list of llvm libs for working with modules. + +if(NOT DEFINED CLANG_ROOT) + set(CLANG_ROOT $ENV{CLANG_ROOT}) +endif() + +find_program(LLVM_CONFIG_EXECUTABLE llvm-config + DOC "llvm-config executable" + HINTS ${CLANG_ROOT}/bin) + +if(LLVM_CONFIG_EXECUTABLE) + message(STATUS "LLVM llvm-config found at: ${LLVM_CONFIG_EXECUTABLE}") +elseif(LLVM_FIND_REQUIRED) + message(FATAL_ERROR "Could NOT find llvm-config executable") +endif() + +execute_process( + COMMAND ${LLVM_CONFIG_EXECUTABLE} --version + OUTPUT_VARIABLE LLVM_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*" "\\1" LLVM_VERSION_MAJOR ${LLVM_VERSION}) +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*" "\\2" LLVM_VERSION_MINOR ${LLVM_VERSION}) + +execute_process( + COMMAND ${LLVM_CONFIG_EXECUTABLE} --includedir + OUTPUT_VARIABLE LLVM_INCLUDE_DIRS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND ${LLVM_CONFIG_EXECUTABLE} --libdir + OUTPUT_VARIABLE LLVM_LIBRARY_DIRS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND ${LLVM_CONFIG_EXECUTABLE} --cppflags + OUTPUT_VARIABLE LLVM_CFLAGS + 
OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND ${LLVM_CONFIG_EXECUTABLE} --ldflags + OUTPUT_VARIABLE LLVM_LDFLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND ${LLVM_CONFIG_EXECUTABLE} --libs ${LLVM_FIND_COMPONENTS} + OUTPUT_VARIABLE LLVM_LIBS + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +set(LLVM_FOUND TRUE) diff --git a/build-support/tools/kudu-lint/kudu-lint.cc b/build-support/tools/kudu-lint/kudu-lint.cc new file mode 100644 index 000000000000..a96fada4240b --- /dev/null +++ b/build-support/tools/kudu-lint/kudu-lint.cc @@ -0,0 +1,262 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "clang/AST/ASTContext.h" +#include "clang/AST/ASTTypeTraits.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/AST/Stmt.h" +#include "clang/Driver/Options.h" +#include "clang/Frontend/FrontendActions.h" +#include "clang/Frontend/TextDiagnostic.h" +#include "clang/Tooling/CommonOptionsParser.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/Option/OptTable.h" +#include "llvm/Support/CommandLine.h" + +// Using clang without importing namespaces is damn near impossible. 
+using namespace llvm; // NOLINT +using namespace clang::ast_matchers; // NOLINT + +using llvm::opt::OptTable; +using clang::ast_type_traits::DynTypedNode; +using clang::driver::createDriverOptTable; +using clang::driver::options::OPT_ast_dump; +using clang::tooling::CommonOptionsParser; +using clang::tooling::CommandLineArguments; +using clang::tooling::ClangTool; +using clang::tooling::newFrontendActionFactory; +using clang::ASTContext; +using clang::CharSourceRange; +using clang::ClassTemplateSpecializationDecl; +using clang::Decl; +using clang::DiagnosticsEngine; +using clang::Stmt; +using clang::FixItHint; +using clang::SourceLocation; +using clang::SourceRange; +using clang::Stmt; +using clang::TextDiagnostic; +using std::string; + +static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage); +static cl::extrahelp MoreHelp( + "\tFor example, to run kudu-lint on all files in a subtree of the\n" + "\tsource tree, use:\n" + "\n" + "\t find path/in/subtree -name '*.cc'|xargs kudu-lint\n" + "\n" + "\tor using a specific build path:\n" + "\n" + "\t find path/in/subtree -name '*.cc'|xargs kudu-lint -p build/path\n" + "\n" + "\tNote, that path/in/subtree and current directory should follow the\n" + "\trules described above.\n" + "\n" + "\t Sometimes kudu-lint can't figure out the proper line number for an\n" + "\t error, and reports it inside some standard library. the -ast-dump\n" + "\t option can be useful for these circumstances.\n" + "\n" +); + +// Command line flags. +static OwningPtr gOptions(createDriverOptTable()); +static cl::list gArgsAfter( + "extra-arg", + cl::desc("Additional argument to append to the compiler command line")); +static cl::list gArgsBefore( + "extra-arg-before", + cl::desc("Additional argument to prepend to the compiler command line")); +static cl::opt gASTDump("ast-dump", + cl::desc(gOptions->getOptionHelpText(OPT_ast_dump))); + +namespace { + +// Callback for unused statuses. 
Simply reports the error at the point where the +// expression was found. +template +class ErrorPrinter : public MatchFinder::MatchCallback { + public: + ErrorPrinter(const std::string& error_msg, + const std::string& bound_name, + bool skip_system_headers) + : error_msg_(error_msg), + bound_name_(bound_name), + skip_system_headers_(skip_system_headers) { + } + + virtual void run(const MatchFinder::MatchResult& result) { + const NODETYPE* node; + if ((node = result.Nodes.getNodeAs(bound_name_))) { + SourceRange r = node->getSourceRange(); + + if (skip_system_headers_ && result.SourceManager->isInSystemHeader(r.getBegin())) { + return; + } + + if (gASTDump) { + node->dump(); + } + + if (r.isValid()) { + TextDiagnostic td(llvm::outs(), result.Context->getLangOpts(), + &result.Context->getDiagnostics().getDiagnosticOptions()); + td.emitDiagnostic(r.getBegin(), DiagnosticsEngine::Error, + error_msg_, + ArrayRef(CharSourceRange::getTokenRange(r)), + ArrayRef(), result.SourceManager); + } + + SourceLocation instantiation_point; + if (FindInstantiationPoint(result, node, &instantiation_point)) { + TextDiagnostic td(llvm::outs(), result.Context->getLangOpts(), + &result.Context->getDiagnostics().getDiagnosticOptions()); + td.emitDiagnostic(instantiation_point, DiagnosticsEngine::Note, + "previous error instantiated at", + ArrayRef(), + ArrayRef(), result.SourceManager); + } + } else { + llvm_unreachable("bound node missing"); + } + } + + private: + bool GetParent(ASTContext* ctx, const DynTypedNode& node, DynTypedNode* parent) { + ASTContext::ParentVector parents = ctx->getParents(node); + if (parents.empty()) return false; + assert(parents.size() == 1); + *parent = parents[0]; + return true; + } + + // If the AST node 'node' has an ancestor which is a template instantiation, + // fill the source location of that instantiation into 'loc'. 
Unfortunately, + // Clang doesn't retain enough information in the AST nodes to recurse here -- + // so in many cases this is useless, since the instantiation point will simply + // be inside another instantiated template. + bool FindInstantiationPoint(const MatchFinder::MatchResult& result, + const NODETYPE* node, + SourceLocation* loc) { + DynTypedNode dyn_node = DynTypedNode::create(*node); + + // Recurse up the tree. + while (true) { + const ClassTemplateSpecializationDecl* D = + dyn_node.get(); + if (D) { + *loc = D->getPointOfInstantiation(); + return true; + } + // TODO: there are probably other types of specializations to handle, but this is the only + // one seen so far. + + DynTypedNode parent; + if (!GetParent(result.Context, dyn_node, &parent)) { + return false; + } + dyn_node = parent; + } + } + + string error_msg_; + string bound_name_; + bool skip_system_headers_; +}; + +// Inserts arguments before or after the usual command line arguments. +class InsertAdjuster: public clang::tooling::ArgumentsAdjuster { + public: + enum Position { BEGIN, END }; + + InsertAdjuster(const CommandLineArguments &extra, Position pos) + : extra_(extra), pos_(pos) { + } + + InsertAdjuster(const char *extra_, Position pos) + : extra_(1, std::string(extra_)), pos_(pos) { + } + + virtual CommandLineArguments Adjust(const CommandLineArguments &Args) LLVM_OVERRIDE { + CommandLineArguments ret(Args); + + CommandLineArguments::iterator I; + if (pos_ == END) { + I = ret.end(); + } else { + I = ret.begin(); + ++I; // To leave the program name in place + } + + ret.insert(I, extra_.begin(), extra_.end()); + return ret; + } + + private: + const CommandLineArguments extra_; + const Position pos_; +}; + +} // anonymous namespace + +int main(int argc, const char **argv) { + CommonOptionsParser options_parser(argc, argv); + ClangTool Tool(options_parser.getCompilations(), + options_parser.getSourcePathList()); + if (gArgsAfter.size() > 0) { + Tool.appendArgumentsAdjuster(new 
InsertAdjuster(gArgsAfter, + InsertAdjuster::END)); + } + if (gArgsBefore.size() > 0) { + Tool.appendArgumentsAdjuster(new InsertAdjuster(gArgsBefore, + InsertAdjuster::BEGIN)); + } + + // Match expressions of type 'Status' which are parented by a compound statement. + // This implies that the expression is being thrown away, rather than assigned + // to some variable or function call. + // + // For more information on AST matchers, refer to: + // http://clang.llvm.org/docs/LibASTMatchersReference.html + StatementMatcher ignored_status_matcher = + expr(hasType(recordDecl(hasName("Status"))), + hasParent(compoundStmt())).bind("expr"); + ErrorPrinter ignored_status_printer("Unused status result", "expr", false); + + // Match class members which are reference-typed. This is confusing since they + // tend to "look like" copied values, but in fact often reference external + // entities passed in in the constructor. + DeclarationMatcher ref_member_matcher = + fieldDecl(hasType(referenceType())) + .bind("decl"); + ErrorPrinter ref_member_printer("Reference-typed member", "decl", true); + + // Disallow calls to sleep, usleep, and nanosleep. + // SleepFor(MonoDelta) should be used instead, as it is not prone to + // unit conversion errors, and also ignores EINTR so will safely sleep + // at least the requested duration. 
+ StatementMatcher sleep_matcher = + callExpr(callee(namedDecl(matchesName("(nano|u)?sleep")))).bind("sleep_expr"); + ErrorPrinter sleep_printer("sleep, usleep or nanosleep call", "sleep_expr", true); + + MatchFinder finder; + finder.addMatcher(ignored_status_matcher, &ignored_status_printer); + finder.addMatcher(ref_member_matcher, &ref_member_printer); + finder.addMatcher(sleep_matcher, &sleep_printer); + + return Tool.run(newFrontendActionFactory(&finder)); +} diff --git a/build-support/trigger_gerrit.py b/build-support/trigger_gerrit.py new file mode 100755 index 000000000000..740af439e671 --- /dev/null +++ b/build-support/trigger_gerrit.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# This tool triggers a Jenkins build based on a particular gerrit URL. +# The Jenkins build will post a +1 or -1 on the gerrit. +# +# NOTE: currently this is hard-coded to an internal Cloudera server. +# We plan to move to upstream infrastructure at some later date. 
+ +import logging +import json +import re +import subprocess +import sys +import urllib +import urllib2 +import urlparse + +from kudu_util import check_output + +GERRIT_HOST = "gerrit.cloudera.org" +JENKINS_URL = "http://sandbox.jenkins.cloudera.com/" +JENKINS_JOB = "kudu-public-gerrit" + + +def get_gerrit_ssh_command(): + url = check_output("git config --get remote.gerrit.url".split(" ")) + m = re.match(r'ssh://(.+)@(.+):(\d+)/.+', url) + if not m: + raise Exception("expected gerrit remote to be an ssh://user@host:port/ URL: %s" % url) + user, host, port = m.groups() + if host != GERRIT_HOST: + raise Exception("unexpected gerrit host %s in remote 'gerrit'. Expected %s" % ( + host, GERRIT_HOST)) + return ["ssh", "-p", port, "-l", user, host] + + +def current_ref_for_gerrit_number(change_num): + j = check_output(get_gerrit_ssh_command() + [ + "gerrit", "query", "--current-patch-set", "--format", "JSON", + "change:%d" % change_num]) + j = json.loads(j.split("\n")[0]) + return j['currentPatchSet']['ref'] + + +def url_to_ref(url): + u = urlparse.urlparse(url) + if not u.netloc.startswith(GERRIT_HOST): + print >>sys.stderr, "unexpected gerrit host %s, expected %s\n" % ( + u.netloc, GERRIT_HOST) + usage() + sys.exit(1) + if u.path == '/': + m = re.match(r'/c/(\d+)/', u.fragment) + if m: + return current_ref_for_gerrit_number(int(m.group(1))) + print >>sys.stderr, "invalid gerrit URL: ", url + usage() + sys.exit(1) + +def usage(): + print >>sys.stderr, "usage: %s \n" % sys.argv[0] + print >>sys.stderr, "The provided URL should look something like:" + print >>sys.stderr, "http://gerrit.cloudera.org:8080/#/c/963/\n" + + +def determine_ref(): + if len(sys.argv) != 2: + usage() + sys.exit(1) + + arg = sys.argv[1] + if arg.startswith("http"): + return url_to_ref(arg) + else: + print >>sys.stderr, "Unable to parse argument: %s\n" % arg + sys.exit(1) + + +def trigger_jenkins(ref): + logging.info("Will trigger Jenkins for ref %s" % ref) + url = "%s/job/%s/buildWithParameters" % 
(JENKINS_URL, JENKINS_JOB) + params = dict(GERRIT_BRANCH=ref) + req = urllib2.Request(url, + data=urllib.urlencode(params), + headers={"Accept": "application/json"}) + urllib2.urlopen(req).read() + logging.info("Successfuly triggered jenkins job!") + + +def main(): + gerrit_ref = determine_ref() + trigger_jenkins(gerrit_ref) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + main() diff --git a/build-support/tsan-suppressions.txt b/build-support/tsan-suppressions.txt new file mode 100644 index 000000000000..d7d6a5f6ca26 --- /dev/null +++ b/build-support/tsan-suppressions.txt @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +race:_dl_deallocate_tls +race:kudu::tablet::ScopedRowLock::Release + +# libunwind uses some double-checked locking which isn't perfectly safe. +# Reported at http://savannah.nongnu.org/bugs/index.php?42677 +# +# With TSAN in clang 3.5, it's the init() function that's flagged as a data +# race (not local_addr_space_init()), due to the former calling sigfillset() +# on an unprotected global variable. Given that init() invokes +# local_addr_space_init(), suppressing init() suppresses both races. 
+race:_ULx86_64_init + +# libev uses some lock-free synchronization, but doesn't have TSAN annotations. +# See http://lists.schmorp.de/pipermail/libev/2013q2/002178.html or KUDU-366 +# for examples. +race:evpipe_write +race:evpipe_init +race:epoll_ctl + +# concurrent btree uses optimistic concurrency, needs to be annotated a bunch +# more before it would pass. Relatively confident that it is correct based on +# a lot of stress testing. +race:concurrent_btree.h + +# We're currently a little lax on using atomic instructions for mutations where +# we might need to. There are possible bugs lurking here, though likely we are +# hiding behind x86 TSO for now. +race:kudu::tablet::ApplyMutationsAndGenerateUndos +race:kudu::tablet::MemRowSet::Iterator::ApplyMutationsToProjectedRow +race:kudu::tablet::MemRowSet::Iterator::FetchRows +race:kudu::tablet::Mutation::* +race:kudu::RowChangeListDecoder::Init + +# GLog's fatal signal handler isn't signal-safe -- it allocates memory. +# This isn't great, but nothing we can do about it. See +# https://code.google.com/p/google-glog/issues/detail?id=191 +signal:logging_fail + +# This method in Boost's UUID library operates on static state with impunity, +# triggering (harmless) data races in TSAN when boost::uuids::random_generator +# instances are created across threads (see kudu::ObjectIdGenerator). +race:boost::uuids::detail::seed_rng::sha1_random_digest_ + +# ------------------------------------------------------------ +# Known bugs below. As these JIRAs are resolved, please remove the relevant +# suppression. 
+# ------------------------------------------------------------ + +# KUDU-1283: TSAN warning from consensus OpId +race:kudu::consensus::OpId::CopyFrom + +# KUDU-186: sketchy synchronization in catalog manager +race:kudu::master::CatalogManager::Shutdown +race:kudu::master::CatalogManagerBgTasks::Shutdown +race:kudu::master::CatalogManager::~CatalogManager + +# KUDU-189: unsynchronized access to 'state_' in DeferredCallback +race:kudu::master::AsyncTabletRequestTask::DeferredCallback + +# KUDU-574: raft_consensus_quorum-test race on LocalTestPeerProxy destruction +race:kudu::consensus::LocalTestPeerProxy::~LocalTestPeerProxy + +# KUDU-569: unsynchronized access to 'state_', 'acceptor_pools_', in +# GetBoundAddresses() +race:kudu::Webserver::GetBoundAddresses +race:kudu::RpcServer::GetBoundAddresses diff --git a/cmake_modules/CompilerInfo.cmake b/cmake_modules/CompilerInfo.cmake new file mode 100644 index 000000000000..66c0d7b64e25 --- /dev/null +++ b/cmake_modules/CompilerInfo.cmake @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Sets COMPILER_FAMILY to 'clang' or 'gcc' +# Sets COMPILER_VERSION to the version +execute_process(COMMAND "${CMAKE_CXX_COMPILER}" -v + ERROR_VARIABLE COMPILER_VERSION_FULL) +message(INFO " ${COMPILER_VERSION_FULL}") + +# clang on Linux and Mac OS X before 10.9 +if("${COMPILER_VERSION_FULL}" MATCHES ".*clang version.*") + set(COMPILER_FAMILY "clang") + string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" + COMPILER_VERSION "${COMPILER_VERSION_FULL}") +# clang on Mac OS X 10.9 and later +elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*") + set(COMPILER_FAMILY "clang") + string(REGEX REPLACE ".*based on LLVM ([0-9]+\\.[0.9]+).*" "\\1" + COMPILER_VERSION "${COMPILER_VERSION_FULL}") + +# clang on Mac OS X, XCode 7. +elseif("${COMPILER_VERSION_FULL}" MATCHES ".*clang-700\\..*") + set(COMPILER_FAMILY "clang") + set(COMPILER_VERSION "3.7.0svn") + +# gcc +elseif("${COMPILER_VERSION_FULL}" MATCHES ".*gcc version.*") + set(COMPILER_FAMILY "gcc") + string(REGEX REPLACE ".*gcc version ([0-9\\.]+).*" "\\1" + COMPILER_VERSION "${COMPILER_VERSION_FULL}") +else() + message(FATAL_ERROR "Unknown compiler. 
Version info:\n${COMPILER_VERSION_FULL}") +endif() +message("Selected compiler ${COMPILER_FAMILY} ${COMPILER_VERSION}") + diff --git a/cmake_modules/FindBitshuffle.cmake b/cmake_modules/FindBitshuffle.cmake new file mode 100644 index 000000000000..ed9b04d9da2f --- /dev/null +++ b/cmake_modules/FindBitshuffle.cmake @@ -0,0 +1,18 @@ +# - Find Bitshuffle (bitshuffle.h, bitshuffle.a) +# This module defines +# BITSHUFFLE_INCLUDE_DIR, directory containing headers +# BITSHUFFLE_STATIC_LIB, path to bitshuffle's static library +# BITSHUFFLE_FOUND, whether bitshuffle has been found + +find_path(BITSHUFFLE_INCLUDE_DIR bitshuffle.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(BITSHUFFLE_STATIC_LIB bitshuffle.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(BITSHUFFLE REQUIRED_VARS + BITSHUFFLE_STATIC_LIB BITSHUFFLE_INCLUDE_DIR) diff --git a/cmake_modules/FindCrcutil.cmake b/cmake_modules/FindCrcutil.cmake new file mode 100644 index 000000000000..6d93a9a63cf7 --- /dev/null +++ b/cmake_modules/FindCrcutil.cmake @@ -0,0 +1,20 @@ +# - Find CRCUTIL (crcutil/include.h, libcrcutil.a) +# This module defines +# CRCUTIL_INCLUDE_DIR, directory containing headers +# CRCUTIL_SHARED_LIB, path to libcrcutil's shared library +# CRCUTIL_STATIC_LIB, path to libcrcutil's static library +# CRCUTIL_FOUND, whether crcutil has been found + +find_path(CRCUTIL_INCLUDE_DIR crcutil/interface.h + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(CRCUTIL_SHARED_LIB crcutil + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(CRCUTIL_STATIC_LIB libcrcutil.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CRCUTIL REQUIRED_VARS + CRCUTIL_SHARED_LIB CRCUTIL_STATIC_LIB CRCUTIL_INCLUDE_DIR) diff --git 
a/cmake_modules/FindCyrusSASL.cmake b/cmake_modules/FindCyrusSASL.cmake new file mode 100644 index 000000000000..8f92cf53a959 --- /dev/null +++ b/cmake_modules/FindCyrusSASL.cmake @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# - Find Cyrus SASL (sasl.h, libsasl2.so) +# +# This module defines +# CYRUS_SASL_INCLUDE_DIR, directory containing headers +# CYRUS_SASL_SHARED_LIB, path to Cyrus SASL's shared library +# CYRUS_SASL_FOUND, whether Cyrus SASL and its plugins have been found +# +# N.B: we do _not_ include sasl in thirdparty, for a fairly subtle reason. The +# TLDR version is that newer versions of cyrus-sasl (>=2.1.26) have a bug fix +# for https://bugzilla.cyrusimap.org/show_bug.cgi?id=3590, but that bug fix +# relied on a change both on the plugin side and on the library side. If you +# then try to run the new version of sasl (e.g from our thirdparty tree) with +# an older version of a plugin (eg from RHEL6 install), you'll get a SASL_NOMECH +# error due to this bug. +# +# In practice, Cyrus-SASL is so commonly used and generally non-ABI-breaking that +# we should be OK to depend on the host installation. 
+ + +find_path(CYRUS_SASL_INCLUDE_DIR sasl/sasl.h) +find_library(CYRUS_SASL_SHARED_LIB sasl2) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CYRUS_SASL REQUIRED_VARS + CYRUS_SASL_SHARED_LIB CYRUS_SASL_INCLUDE_DIR) diff --git a/cmake_modules/FindGFlags.cmake b/cmake_modules/FindGFlags.cmake new file mode 100644 index 000000000000..273c72ed3ddb --- /dev/null +++ b/cmake_modules/FindGFlags.cmake @@ -0,0 +1,21 @@ +# - Find GFLAGS (gflags.h, libgflags.a, libgflags.so, and libgflags.so.0) +# This module defines +# GFLAGS_INCLUDE_DIR, directory containing headers +# GFLAGS_SHARED_LIB, path to libgflags shared library +# GFLAGS_STATIC_LIB, path to libgflags static library +# GFLAGS_FOUND, whether gflags has been found + +find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(GFLAGS_SHARED_LIB gflags + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(GFLAGS_STATIC_LIB libgflags.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GFLAGS REQUIRED_VARS + GFLAGS_SHARED_LIB GFLAGS_STATIC_LIB GFLAGS_INCLUDE_DIR) diff --git a/cmake_modules/FindGLog.cmake b/cmake_modules/FindGLog.cmake new file mode 100644 index 000000000000..c214bd01e6f5 --- /dev/null +++ b/cmake_modules/FindGLog.cmake @@ -0,0 +1,21 @@ +# - Find GLOG (logging.h, libglog.a, libglog.so, and libglog.so.0) +# This module defines +# GLOG_INCLUDE_DIR, directory containing headers +# GLOG_SHARED_LIB, path to libglog's shared library +# GLOG_STATIC_LIB, path to libglog's static library +# GLOG_FOUND, whether glog has been found + +find_path(GLOG_INCLUDE_DIR glog/logging.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(GLOG_SHARED_LIB glog + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) 
+find_library(GLOG_STATIC_LIB libglog.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GLOG REQUIRED_VARS + GLOG_SHARED_LIB GLOG_STATIC_LIB GLOG_INCLUDE_DIR) diff --git a/cmake_modules/FindGMock.cmake b/cmake_modules/FindGMock.cmake new file mode 100644 index 000000000000..7b6eb703d78b --- /dev/null +++ b/cmake_modules/FindGMock.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2009-2010 Volvox Development Team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# Author: Konstantin Lepa +# +# Find the Google Mock Framework, heavily cribbed from FindGTest.cmake. +# gmock ships a copy of gtest and bundles it in its libraries, so this also +# finds the gtest headers. +# +# This module defines +# GMOCK_INCLUDE_DIR, where to find gmock include files, etc. 
+# GTEST_INCLUDE_DIR, where to find gtest include files +# GMOCK_SHARED_LIBRARY, Location of libgmock's shared library +# GMOCK_STATIC_LIBRARY, Location of libgmock's static library +# GMOCK_FOUND, If false, do not try to use gmock. + +find_path(GMOCK_INCLUDE_DIR gmock/gmock.h + DOC "Path to the gmock header file" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_path(GTEST_INCLUDE_DIR gtest/gtest.h + DOC "Path to the gtest header file" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(GMOCK_SHARED_LIBRARY gmock + DOC "Google's framework for writing C++ tests (gmock)" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(GMOCK_STATIC_LIBRARY libgmock.a + DOC "Google's framework for writing C++ tests (gmock) static" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GMOCK REQUIRED_VARS + GMOCK_SHARED_LIBRARY GMOCK_STATIC_LIBRARY GMOCK_INCLUDE_DIR GTEST_INCLUDE_DIR) diff --git a/cmake_modules/FindGPerf.cmake b/cmake_modules/FindGPerf.cmake new file mode 100644 index 000000000000..48d4033e63d7 --- /dev/null +++ b/cmake_modules/FindGPerf.cmake @@ -0,0 +1,34 @@ +# -*- cmake -*- + +# - Find Google perftools +# Find the Google perftools includes and libraries +# This module defines +# GOOGLE_PERFTOOLS_INCLUDE_DIR, where to find heap-profiler.h, etc. +# GOOGLE_PERFTOOLS_FOUND, If false, do not try to use Google perftools. 
+# also defined for general use are +# TCMALLOC_SHARED_LIB, path to tcmalloc's shared library +# TCMALLOC_STATIC_LIB, path to tcmalloc's static library +# PROFILER_SHARED_LIB, path to libprofiler's shared library +# PROFILER_STATIC_LIB, path to libprofiler's static library + +FIND_PATH(GOOGLE_PERFTOOLS_INCLUDE_DIR google/heap-profiler.h + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +FIND_LIBRARY(TCMALLOC_SHARED_LIB tcmalloc + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +FIND_LIBRARY(TCMALLOC_STATIC_LIB libtcmalloc.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +FIND_LIBRARY(PROFILER_SHARED_LIB profiler + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +FIND_LIBRARY(PROFILER_STATIC_LIB libprofiler.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GOOGLE_PERFTOOLS REQUIRED_VARS + TCMALLOC_SHARED_LIB TCMALLOC_STATIC_LIB + PROFILER_SHARED_LIB PROFILER_STATIC_LIB + GOOGLE_PERFTOOLS_INCLUDE_DIR) diff --git a/cmake_modules/FindGTest.cmake b/cmake_modules/FindGTest.cmake new file mode 100644 index 000000000000..06fc02e8901d --- /dev/null +++ b/cmake_modules/FindGTest.cmake @@ -0,0 +1,91 @@ +# Copyright (c) 2009-2010 Volvox Development Team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# Author: Konstantin Lepa +# +# Find the Google Test Framework +# +# This module defines +# GTEST_INCLUDE_DIR, where to find gtest include files, etc. +# GTest_FOUND, If false, do not try to use gtest. +# GTEST_STATIC_LIBRARY, Location of libgtest.a +# GTEST_SHARED_LIBRARY, Location of libttest's shared library + +# also defined, but not for general use are +# GTEST_LIBRARY, where to find the GTest library. + +set(GTEST_SEARCH_PATH ${CMAKE_SOURCE_DIR}/thirdparty/gtest-1.7.0) + +set(GTEST_H gtest/gtest.h) + +find_path(GTEST_INCLUDE_DIR ${GTEST_H} + PATHS ${GTEST_SEARCH_PATH}/include + NO_DEFAULT_PATH + DOC "Path to the ${GTEST_H} file" +) + +find_library(GTEST_LIBRARY + NAMES gtest + PATHS ${GTEST_SEARCH_PATH} + NO_DEFAULT_PATH + DOC "Google's framework for writing C++ tests (gtest)" +) + +# Kudu does not use the gtest_main library (we have kudu_test_main). 
+#find_library(GTEST_MAIN_LIBRARY_PATH +# NAMES gtest_main +# PATHS GTEST_SEARCH_PATH +# NO_DEFAULT_PATH +# DOC "Google's framework for writing C++ tests (gtest_main)" +#) +set(GTEST_LIB_NAME libgtest) +if(GTEST_INCLUDE_DIR AND GTEST_LIBRARY) + set(GTEST_STATIC_LIBRARY ${GTEST_SEARCH_PATH}/${GTEST_LIB_NAME}.a) + if(EXISTS "${GTEST_STATIC_LIBRARY}") + set(GTEST_FOUND TRUE) + endif() + + set(GTEST_SHARED_LIBRARY ${GTEST_SEARCH_PATH}/${GTEST_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(EXISTS "${GTEST_SHARED_LIBRARY}") + set(GTEST_FOUND TRUE) + endif() +else() + set(GTEST_FOUND FALSE) +endif() + +if(GTEST_FOUND) + if(NOT GTest_FIND_QUIETLY) + message(STATUS "Found the GTest library: ${GTEST_STATIC_LIBRARY} ${GTEST_SHARED_LIBRARY}") + endif(NOT GTest_FIND_QUIETLY) +else(GTEST_FOUND) + if(NOT GTest_FIND_QUIETLY) + if(GTest_FIND_REQUIRED) + message(FATAL_ERROR "Could not find the GTest library") + else(GTest_FIND_REQUIRED) + message(STATUS "Could not find the GTest library") + endif(GTest_FIND_REQUIRED) + endif(NOT GTest_FIND_QUIETLY) +endif(GTEST_FOUND) + +mark_as_advanced( + GTEST_INCLUDE_DIR + GTEST_STATIC_LIBRARY + GTEST_SHARED_LIBRARY) + diff --git a/cmake_modules/FindKRPC.cmake b/cmake_modules/FindKRPC.cmake new file mode 100644 index 000000000000..996d9f5ff576 --- /dev/null +++ b/cmake_modules/FindKRPC.cmake @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Kudu RPC generator support +######### +find_package(Protobuf REQUIRED) + +# +# Generate the KRPC files for a given .proto file. +# +function(KRPC_GENERATE SRCS HDRS TGTS) + if(NOT ARGN) + message(SEND_ERROR "Error: KRPC_GENERATE() called without protobuf files") + return() + endif(NOT ARGN) + + set(options) + set(one_value_args SOURCE_ROOT BINARY_ROOT) + set(multi_value_args EXTRA_PROTO_PATHS PROTO_FILES) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + set(${SRCS}) + set(${HDRS}) + set(${TGTS}) + + set(EXTRA_PROTO_PATH_ARGS) + foreach(PP ${ARG_EXTRA_PROTO_PATHS}) + set(EXTRA_PROTO_PATH_ARGS ${EXTRA_PROTO_PATH_ARGS} --proto_path ${PP}) + endforeach() + + if("${ARG_SOURCE_ROOT}" STREQUAL "") + SET(ARG_SOURCE_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") + endif() + GET_FILENAME_COMPONENT(ARG_SOURCE_ROOT ${ARG_SOURCE_ROOT} ABSOLUTE) + + if("${ARG_BINARY_ROOT}" STREQUAL "") + SET(ARG_BINARY_ROOT "${CMAKE_CURRENT_BINARY_DIR}") + endif() + GET_FILENAME_COMPONENT(ARG_BINARY_ROOT ${ARG_BINARY_ROOT} ABSOLUTE) + + foreach(FIL ${ARG_PROTO_FILES}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + # Ensure that the protobuf file is within the source root. + # This is a requirement of protoc. 
+ FILE(RELATIVE_PATH PROTO_REL_TO_ROOT "${ARG_SOURCE_ROOT}" "${ABS_FIL}") + + GET_FILENAME_COMPONENT(REL_DIR "${PROTO_REL_TO_ROOT}" PATH) + + if(NOT REL_DIR STREQUAL "") + SET(REL_DIR "${REL_DIR}/") + endif() + + set(PROTO_CC_OUT "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.pb.cc") + set(PROTO_H_OUT "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.pb.h") + set(SERVICE_CC "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.service.cc") + set(SERVICE_H "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.service.h") + set(PROXY_CC "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.proxy.cc") + set(PROXY_H "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.proxy.h") + list(APPEND ${SRCS} "${PROTO_CC_OUT}" "${SERVICE_CC}" "${PROXY_CC}") + list(APPEND ${HDRS} "${PROTO_H_OUT}" "${SERVICE_H}" "${PROXY_H}") + + add_custom_command( + OUTPUT "${SERVICE_CC}" "${SERVICE_H}" "${PROXY_CC}" "${PROXY_H}" + "${PROTO_CC_OUT}" "${PROTO_H_OUT}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --plugin $ + --plugin $ + --cpp_out ${ARG_BINARY_ROOT} + --krpc_out ${ARG_BINARY_ROOT} + --insertions_out ${ARG_BINARY_ROOT} + --proto_path ${ARG_SOURCE_ROOT} + # Used to find built-in .proto files (e.g. FileDescriptorProto) + --proto_path ${PROTOBUF_INCLUDE_DIR} + ${EXTRA_PROTO_PATH_ARGS} ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc-gen-krpc protoc-gen-insertions + COMMENT "Running C++ protocol buffer compiler with KRPC plugin on ${FIL}" + VERBATIM) + + # This custom target enforces that there's just one invocation of protoc + # when there are multiple consumers of the generated files. The target name + # must be unique; adding parts of the filename helps ensure this. 
+ set(TGT_NAME ${REL_DIR}${FIL}) + string(REPLACE "/" "-" TGT_NAME ${TGT_NAME}) + add_custom_target(${TGT_NAME} + DEPENDS "${SERVICE_CC}" "${SERVICE_H}" + "${PROXY_CC}" "${PROXY_H}" + "${PROTO_CC_OUT}" "${PROTO_H_OUT}") + list(APPEND ${TGTS} "${TGT_NAME}") + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) + set(${TGTS} ${${TGTS}} PARENT_SCOPE) +endfunction() diff --git a/cmake_modules/FindLibEv.cmake b/cmake_modules/FindLibEv.cmake new file mode 100644 index 000000000000..f1f9d7a73349 --- /dev/null +++ b/cmake_modules/FindLibEv.cmake @@ -0,0 +1,21 @@ +# - Find LIBEV (ev++.h, libev.a, and libev.so) +# This module defines +# LIBEV_INCLUDE_DIR, directory containing headers +# LIBEV_SHARED_LIB, path to libev's shared library +# LIBEV_STATIC_LIB, path to libev's static library +# LIBEV_FOUND, whether libev has been found + +find_path(LIBEV_INCLUDE_DIR ev++.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(LIBEV_SHARED_LIB ev + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(LIBEV_STATIC_LIB libev.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LIBEV REQUIRED_VARS + LIBEV_SHARED_LIB LIBEV_STATIC_LIB LIBEV_INCLUDE_DIR) diff --git a/cmake_modules/FindLibUnwind.cmake b/cmake_modules/FindLibUnwind.cmake new file mode 100644 index 000000000000..b6a8622aa043 --- /dev/null +++ b/cmake_modules/FindLibUnwind.cmake @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# - Find libunwind (libunwind.h, libunwind.so) +# +# This module defines +# UNWIND_INCLUDE_DIR, directory containing headers +# UNWIND_SHARED_LIB, path to libunwind's shared library +# UNWIND_STATIC_LIB, path to libunwind's static library + +find_path(UNWIND_INCLUDE_DIR libunwind.h + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(UNWIND_SHARED_LIB unwind + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(UNWIND_STATIC_LIB libunwind.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(UNWIND REQUIRED_VARS + UNWIND_SHARED_LIB UNWIND_STATIC_LIB UNWIND_INCLUDE_DIR) diff --git a/cmake_modules/FindLz4.cmake b/cmake_modules/FindLz4.cmake new file mode 100644 index 000000000000..87800a804bdb --- /dev/null +++ b/cmake_modules/FindLz4.cmake @@ -0,0 +1,17 @@ +# - Find LZ4 (lz4.h, liblz4.a) +# This module defines +# LZ4_INCLUDE_DIR, directory containing headers +# LZ4_STATIC_LIB, path to liblz4's static library +# LZ4_FOUND, whether lz4 has been found + +find_path(LZ4_INCLUDE_DIR lz4.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(LZ4_STATIC_LIB liblz4.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LZ4 REQUIRED_VARS + 
LZ4_STATIC_LIB LZ4_INCLUDE_DIR) diff --git a/cmake_modules/FindProtobuf.cmake b/cmake_modules/FindProtobuf.cmake new file mode 100644 index 000000000000..108d68c0a5a6 --- /dev/null +++ b/cmake_modules/FindProtobuf.cmake @@ -0,0 +1,181 @@ +######### +# Local rewrite of the protobuf support in cmake. +# +# Supports cross-module protobuf dependencies and protobufs inside +# packages much better than the one built into cmake. +######### +# +# Locate and configure the Google Protocol Buffers library. +# Defines the following variables: +# +# PROTOBUF_INCLUDE_DIR - the include directory for protocol buffers +# PROTOBUF_SHARED_LIBRARY - path to protobuf's shared library +# PROTOBUF_STATIC_LIBRARY - path to protobuf's static library +# PROTOBUF_PROTOC_SHARED_LIBRARY - path to protoc's shared library +# PROTOBUF_PROTOC_STATIC_LIBRARY - path to protoc's static library +# PROTOBUF_PROTOC_EXECUTABLE - the protoc compiler +# PROTOBUF_FOUND - whether the Protocol Buffers library has been found +# +# ==================================================================== +# Example: +# +# find_package(Protobuf REQUIRED) +# include_directories(${PROTOBUF_INCLUDE_DIR}) +# +# include_directories(${CMAKE_CURRENT_BINARY_DIR}) +# PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS PROTO_TGTS +# [SOURCE_ROOT ] +# [BINARY_ROOT ] +# PROTO_FILES foo.proto) +# add_executable(bar bar.cc ${PROTO_SRCS} ${PROTO_HDRS}) +# target_link_libraries(bar ${PROTOBUF_SHARED_LIBRARY}) +# +# NOTE: You may need to link against pthreads, depending +# on the platform. 
+# ==================================================================== +# +# PROTOBUF_GENERATE_CPP (public function) +# SRCS = Variable to define with autogenerated +# source files +# HDRS = Variable to define with autogenerated +# header files +# TGTS = Variable to define with autogenerated +# custom targets; if SRCS/HDRS need to be used in multiple +# libraries, those libraries should depend on these targets +# in order to "serialize" the protoc invocations +# ==================================================================== + +#============================================================================= +# Copyright 2011 Kirill A. Korinskiy +# Copyright 2009 Kitware, Inc. +# Copyright 2009 Philip Lowman +# Copyright 2008 Esben Mose Hansen, Ange Optimization ApS +# +# Distributed under the OSI-approved BSD License (the "License"); +# see accompanying file Copyright.txt for details. +# +# This software is distributed WITHOUT ANY WARRANTY; without even the +# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the License for more information. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) 
+ +function(PROTOBUF_GENERATE_CPP SRCS HDRS TGTS) + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") + return() + endif(NOT ARGN) + + set(options) + set(one_value_args SOURCE_ROOT BINARY_ROOT) + set(multi_value_args EXTRA_PROTO_PATHS PROTO_FILES) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + set(${SRCS}) + set(${HDRS}) + set(${TGTS}) + + set(EXTRA_PROTO_PATH_ARGS) + foreach(PP ${ARG_EXTRA_PROTO_PATHS}) + set(EXTRA_PROTO_PATH_ARGS ${EXTRA_PROTO_PATH_ARGS} --proto_path ${PP}) + endforeach() + + if("${ARG_SOURCE_ROOT}" STREQUAL "") + SET(ARG_SOURCE_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") + endif() + GET_FILENAME_COMPONENT(ARG_SOURCE_ROOT ${ARG_SOURCE_ROOT} ABSOLUTE) + + if("${ARG_BINARY_ROOT}" STREQUAL "") + SET(ARG_BINARY_ROOT "${CMAKE_CURRENT_BINARY_DIR}") + endif() + GET_FILENAME_COMPONENT(ARG_BINARY_ROOT ${ARG_BINARY_ROOT} ABSOLUTE) + + foreach(FIL ${ARG_PROTO_FILES}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + # Ensure that the protobuf file is within the source root. + # This is a requirement of protoc. + FILE(RELATIVE_PATH PROTO_REL_TO_ROOT "${ARG_SOURCE_ROOT}" "${ABS_FIL}") + + GET_FILENAME_COMPONENT(REL_DIR "${PROTO_REL_TO_ROOT}" PATH) + + if(NOT REL_DIR STREQUAL "") + SET(REL_DIR "${REL_DIR}/") + endif() + + set(PROTO_CC_OUT "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.pb.cc") + set(PROTO_H_OUT "${ARG_BINARY_ROOT}/${REL_DIR}${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${PROTO_CC_OUT}") + list(APPEND ${HDRS} "${PROTO_H_OUT}") + + add_custom_command( + OUTPUT "${PROTO_CC_OUT}" "${PROTO_H_OUT}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS + --plugin $ + --cpp_out ${ARG_BINARY_ROOT} + --insertions_out ${ARG_BINARY_ROOT} + --proto_path ${ARG_SOURCE_ROOT} + # Used to find built-in .proto files (e.g. 
FileDescriptorProto) + --proto_path ${PROTOBUF_INCLUDE_DIR} + ${EXTRA_PROTO_PATH_ARGS} ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc-gen-insertions + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + + # This custom target enforces that there's just one invocation of protoc + # when there are multiple consumers of the generated files. The target name + # must be unique; adding parts of the filename helps ensure this. + set(TGT_NAME ${REL_DIR}${FIL}) + string(REPLACE "/" "-" TGT_NAME ${TGT_NAME}) + add_custom_target(${TGT_NAME} + DEPENDS "${PROTO_CC_OUT}" "${PROTO_H_OUT}") + list(APPEND ${TGTS} "${TGT_NAME}") + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) + set(${TGTS} ${${TGTS}} PARENT_SCOPE) +endfunction() + + +find_path(PROTOBUF_INCLUDE_DIR google/protobuf/service.h + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(PROTOBUF_SHARED_LIBRARY protobuf + DOC "The Google Protocol Buffers Library" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(PROTOBUF_STATIC_LIBRARY libprotobuf.a + DOC "Static version of the Google Protocol Buffers Library" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(PROTOBUF_PROTOC_SHARED_LIBRARY protoc + DOC "The Google Protocol Buffers Compiler Library" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(PROTOBUF_PROTOC_STATIC_LIBRARY libprotoc.a + DOC "Static version of the Google Protocol Buffers Compiler Library" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_program(PROTOBUF_PROTOC_EXECUTABLE protoc + DOC "The Google Protocol Buffers Compiler" + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PROTOBUF REQUIRED_VARS + PROTOBUF_SHARED_LIBRARY PROTOBUF_STATIC_LIBRARY + PROTOBUF_PROTOC_SHARED_LIBRARY PROTOBUF_PROTOC_STATIC_LIBRARY + 
PROTOBUF_INCLUDE_DIR PROTOBUF_PROTOC_EXECUTABLE) diff --git a/cmake_modules/FindSnappy.cmake b/cmake_modules/FindSnappy.cmake new file mode 100644 index 000000000000..75a909f2df4b --- /dev/null +++ b/cmake_modules/FindSnappy.cmake @@ -0,0 +1,21 @@ +# - Find SNAPPY (snappy.h, libsnappy.a, libsnappy.so, and libsnappy.so.1) +# This module defines +# SNAPPY_INCLUDE_DIR, directory containing headers +# SNAPPY_SHARED_LIB, path to snappy's shared library +# SNAPPY_STATIC_LIB, path to snappy's static library +# SNAPPY_FOUND, whether snappy has been found + +find_path(SNAPPY_INCLUDE_DIR snappy.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(SNAPPY_SHARED_LIB snappy + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(SNAPPY_STATIC_LIB libsnappy.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SNAPPY REQUIRED_VARS + SNAPPY_SHARED_LIB SNAPPY_STATIC_LIB SNAPPY_INCLUDE_DIR) diff --git a/cmake_modules/FindSqueasel.cmake b/cmake_modules/FindSqueasel.cmake new file mode 100644 index 000000000000..631a4bf13aec --- /dev/null +++ b/cmake_modules/FindSqueasel.cmake @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# - Find Squeasel (squeasel.h, libsqueasel.a) +# This module defines +# SQUEASEL_INCLUDE_DIR, directory containing headers +# SQUEASEL_STATIC_LIB, path to libsqueasel.a +# SQUEASEL_FOUND, whether squeasel has been found + +find_path(SQUEASEL_INCLUDE_DIR squeasel.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +find_library(SQUEASEL_STATIC_LIB libsqueasel.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SQUEASEL REQUIRED_VARS + SQUEASEL_STATIC_LIB SQUEASEL_INCLUDE_DIR) diff --git a/cmake_modules/FindVmem.cmake b/cmake_modules/FindVmem.cmake new file mode 100644 index 000000000000..b85d2c452599 --- /dev/null +++ b/cmake_modules/FindVmem.cmake @@ -0,0 +1,21 @@ +# - Find VMEM (libvmem.h, libvmem.so) +# This module defines +# VMEM_INCLUDE_DIR, directory containing headers +# VMEM_SHARED_LIB, path to vmem's shared library +# VMEM_STATIC_LIB, path to vmem's static library +# VMEM_FOUND, whether libvmem has been found + +find_path(VMEM_INCLUDE_DIR libvmem.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(VMEM_SHARED_LIB vmem + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(VMEM_STATIC_LIB libvmem.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(VMEM REQUIRED_VARS + VMEM_SHARED_LIB VMEM_STATIC_LIB VMEM_INCLUDE_DIR) diff --git a/cmake_modules/FindZlib.cmake b/cmake_modules/FindZlib.cmake new file mode 100644 index 000000000000..ee8078b60052 --- /dev/null +++ b/cmake_modules/FindZlib.cmake @@ -0,0 +1,21 @@ +# - Find ZLIB (zlib.h, libz.a, libz.so, and libz.so.1) +# This module defines +# ZLIB_INCLUDE_DIR, directory containing headers +# 
ZLIB_SHARED_LIB, path to libz's shared library +# ZLIB_STATIC_LIB, path to libz's static library +# ZLIB_FOUND, whether zlib has been found + +find_path(ZLIB_INCLUDE_DIR zlib.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(ZLIB_SHARED_LIB z + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +find_library(ZLIB_STATIC_LIB libz.a + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ZLIB REQUIRED_VARS + ZLIB_SHARED_LIB ZLIB_STATIC_LIB ZLIB_INCLUDE_DIR) diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 000000000000..3432c3fd8eb0 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +*.html + diff --git a/docs/administration.adoc b/docs/administration.adoc new file mode 100644 index 000000000000..f45d7f885c04 --- /dev/null +++ b/docs/administration.adoc @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +[[administration]] += Apache Kudu (incubating) Administration + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +NOTE: Kudu is easier to manage with link:http://www.cloudera.com/content/cloudera/en/products-and-services/cloudera-enterprise/cloudera-manager.htm[Cloudera Manager] +than in a standalone installation. See Cloudera's +link:http://www.cloudera.com/content/cloudera/en/downloads/betas/kudu/0-5-0.html[Kudu documentation] +for more details about using Kudu with Cloudera Manager. + +== Starting and Stopping Kudu Processes + +include::installation.adoc[tags=start_stop] + +== Kudu Web Interfaces + +Kudu tablet servers and masters expose useful operational information on a built-in web interface. + +=== Kudu Master Web Interface + +Kudu master processes serve their web interface on port 8051. The interface exposes several pages +with information about the cluster state: + +- A list of tablet servers, their host names, and the time of their last heartbeat. +- A list of tables, including schema and tablet location information for each. +- SQL code which you can paste into Impala Shell to add an existing table to Impala's list of known data sources. + +=== Kudu Tablet Server Web Interface + +Each tablet server serves a web interface on port 8050. The interface exposes information +about each tablet hosted on the server, its current state, and debugging information +about maintenance background operations. + +=== Common Web Interface Pages + +Both Kudu masters and tablet servers expose a common set of information via their web interfaces: + +- HTTP access to server logs. +- an `/rpcz` endpoint which lists currently running RPCs via JSON. +- pages giving an overview and detailed information on the memory usage of different + components of the process. +- information on the current set of configuration flags. 
+- information on the currently running threads and their resource consumption. +- a JSON endpoint exposing metrics about the server. +- information on the deployed version number of the daemon. + +These interfaces are linked from the landing page of each daemon's web UI. + +== Kudu Metrics + +Kudu daemons expose a large number of metrics. Some metrics are associated with an entire +server process, whereas others are associated with a particular tablet replica. + +=== Listing available metrics + +The full set of available metrics for a Kudu server can be dumped via a special command +line flag: + +[source,bash] +---- +$ kudu-tserver --dump_metrics_json +$ kudu-master --dump_metrics_json +---- + +This will output a large JSON document. Each metric indicates its name, label, description, +units, and type. Because the output is JSON-formatted, this information can easily be +parsed and fed into other tooling which collects metrics from Kudu servers. + +=== Collecting metrics via HTTP + +Metrics can be collected from a server process via its HTTP interface by visiting +`/metrics`. The output of this page is JSON for easy parsing by monitoring services. +This endpoint accepts several `GET` parameters in its query string: + +- `/metrics?metrics=<substring1>,<substring2>,...` - limits the returned metrics to those which contain +at least one of the provided substrings. The substrings also match entity names, so this +may be used to collect metrics for a specific tablet. + +- `/metrics?include_schema=1` - includes metrics schema information such as unit, description, +and label in the JSON output. This information is typically elided to save space. + +- `/metrics?compact=1` - eliminates unnecessary whitespace from the resulting JSON, which can decrease +bandwidth when fetching this page from a remote host. + +- `/metrics?include_raw_histograms=1` - include the raw buckets and values for histogram metrics, +enabling accurate aggregation of percentile metrics over time and across hosts. 
+ +For example: + +[source,bash] +---- +$ curl -s 'http://example-ts:8050/metrics?include_schema=1&metrics=connections_accepted' +---- + +[source,json] +---- +[ + { + "type": "server", + "id": "kudu.tabletserver", + "attributes": {}, + "metrics": [ + { + "name": "rpc_connections_accepted", + "label": "RPC Connections Accepted", + "type": "counter", + "unit": "connections", + "description": "Number of incoming TCP connections made to the RPC server", + "value": 92 + } + ] + } +] +---- +[source,bash] +---- +$ curl -s 'http://example-ts:8050/metrics?metrics=log_append_latency' +---- + +[source,json] +---- +[ + { + "type": "tablet", + "id": "c0ebf9fef1b847e2a83c7bd35c2056b1", + "attributes": { + "table_name": "lineitem", + "partition": "hash buckets: (55), range: [(), ())", + "table_id": "" + }, + "metrics": [ + { + "name": "log_append_latency", + "total_count": 7498, + "min": 4, + "mean": 69.3649, + "percentile_75": 29, + "percentile_95": 38, + "percentile_99": 45, + "percentile_99_9": 95, + "percentile_99_99": 167, + "max": 367244, + "total_sum": 520098 + } + ] + } +] +---- + +NOTE: All histograms and counters are measured since the server start time, and are not reset upon collection. + +=== Collecting metrics to a log + +Kudu may be configured to periodically dump all of its metrics to a local log file using the +`--metrics_log_interval_ms` flag. Set this flag to the interval at which metrics should be written +to a log file. + +The metrics log will be written to the same directory as the other Kudu log files, with the same +naming format. After any metrics log file reaches 64MB uncompressed, the log will be rolled and +the previous file will be gzip-compressed. + +The log file generated has three space-separated fields. The first field is the word +`metrics`. The second field is the current timestamp in microseconds since the Unix epoch. +The third is the current value of all metrics on the server, using a compact JSON encoding. 
+The encoding is the same as the metrics fetched via HTTP described above. + +WARNING: Although metrics logging automatically rolls and compresses previous log files, it does +not remove old ones. Since metrics logging can use significant amounts of disk space, +consider setting up a system utility to monitor space in the log directory and archive or +delete old segments. diff --git a/docs/configuration.adoc b/docs/configuration.adoc new file mode 100644 index 000000000000..2c842b3bfb51 --- /dev/null +++ b/docs/configuration.adoc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[configuration]] += Configuring Apache Kudu (incubating) + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +NOTE: Kudu is easier to configure with link:http://www.cloudera.com/content/www/en-us/products/cloudera-manager.html[Cloudera Manager] +than in a standalone installation. See Cloudera's +link:http://www.cloudera.com/content/www/en-us/documentation/betas/kudu/0-5-0/topics/kudu_installation.html[Kudu documentation] +for more details about using Kudu with Cloudera Manager. 
+ +== Configure Kudu + +=== Configuration Basics +To configure the behavior of each Kudu process, you can pass command-line flags when +you start it, or read those options from configuration files by passing them using +one or more `--flagfile=` options. You can even include the +`--flagfile` option within your configuration file to include other files. Learn more about gflags +by reading link:http://google-gflags.googlecode.com/svn/trunk/doc/gflags.html[its documentation]. + +You can place options for masters and tablet servers into the same configuration +file, and each will ignore options that do not apply. + +Flags can be prefixed with either one or two `-` characters. This +documentation standardizes on two: `--example_flag`. + +=== Discovering Configuration Options +Only the most common configuration options are documented here. For a more exhaustive +list of configuration options, see the link:configuration_reference.html[Configuration Reference]. + +To see all configuration flags for a given executable, run it with the `--help` option. +Take care when configuring undocumented flags, as not every possible +configuration has been tested, and undocumented options are not guaranteed to be +maintained in future releases. + +=== Configuring the Kudu Master +To see all available configuration options for the `kudu-master` executable, run it +with the `--help` option: +---- +$ kudu-master --help +---- + +[cols="m,d,m,d"] +.Supported Configuration Flags for Kudu Masters +|=== +| Flag | Valid Options | Default | Description + +// TODO commented out for the beta|--master_addresses | string | localhost | Comma-separated list of all the RPC +// addresses for Master quorum. If not specified, assumes a standalone Master. +|--fs_data_dirs | string | | Comma-separated list of +directories where the Master will place its data blocks. +|--fs_wal_dir | string | | The directory where the Master will +place its write-ahead logs. 
May be the same as _one of_ the directories listed in +`--fs_data_dirs`, but not a sub-directory of a data directory. +|--log_dir | string | /var/log/kudu | The directory to store Master log files. +|=== + +For the full list of flags for masters, see the +link:configuration_reference.html#master_configuration_reference[Kudu Master Configuration Reference]. + +=== Configuring Tablet Servers +To see all available configuration options for the `kudu-tserver` executable, +run it with the `--help` option: +---- +$ kudu-tserver --help +---- + +.Supported Configuration Flags for Kudu Tablet Servers +|=== +| Flag | Valid Options | Default | Description + +|--fs_data_dirs | string | | Comma-separated list +of directories where the Tablet Server will place its data blocks. +|--fs_wal_dir | string | | The directory where the Tablet Server will +place its write-ahead logs. May be the same as _one of_ the directories listed in +`--fs_data_dirs`, but not a sub-directory of a data directory. +|--log_dir | string | /var/log/kudu | The directory to store Tablet Server log files +|--tserver_master_addrs | string | `127.0.0.1:7051` | Comma separated +addresses of the masters which the tablet server should connect to. The masters +do not read this flag. +|--block_cache_capacity_mb | integer | 512 | Maximum amount of memory allocated to the Kudu Tablet Server's block cache. +|--memory_limit_hard_bytes | integer | 4294967296 | Maximum amount of memory a Tablet Server can consume before it starts rejecting all incoming writes. +|=== + +For the full list of flags for tablet servers, see the +link:configuration_reference.html#tablet_server_configuration_reference[Kudu Tablet Server Configuration Reference]. 
+ + +== Next Steps +- link:quickstart.html[Get Started With Kudu] +- link:developing.html[Developing Applications With Kudu] diff --git a/docs/configuration_reference.adoc b/docs/configuration_reference.adoc new file mode 100644 index 000000000000..bd08bfd1ccc5 --- /dev/null +++ b/docs/configuration_reference.adoc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[configuration_reference]] += Apache Kudu (incubating) Configuration Reference + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 2 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +The contents of this file are generated from the output of the `--helpxml` +flag for each binary, during the build of the documentation. Do not edit this file +or the included files manually. 
+ +// This gets replaced by the script that builds the docs +@@CONFIGURATION_REFERENCE@@ + +include::configuration_reference_unsupported.adoc[leveloffset=+1] diff --git a/docs/configuration_reference_unsupported.adoc b/docs/configuration_reference_unsupported.adoc new file mode 100644 index 000000000000..7ab13b79a3f1 --- /dev/null +++ b/docs/configuration_reference_unsupported.adoc @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[configuration_reference_unsupported]] += Apache Kudu (incubating) Unsupported Configuration Flags +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 2 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +The contents of this file are generated from the output of the `--helpxml` +flag for each binary, during the build of the documentation. Do not edit this file +or the included files manually. + +WARNING: These flags are unsupported and are included for informational purposes only. +They are subject to change or be removed at any time. 
+ +// This gets replaced by the script that builds the docs +@@CONFIGURATION_REFERENCE@@ + + diff --git a/docs/contributing.adoc b/docs/contributing.adoc new file mode 100644 index 000000000000..4d8c96fab50c --- /dev/null +++ b/docs/contributing.adoc @@ -0,0 +1,385 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[contributing]] += Contributing to Apache Kudu (incubating) +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +== Contributing Patches Using Gerrit + +The Kudu team uses Gerrit for code review, rather than Github pull requests. Typically, +you pull from Github but push to Gerrit, and Gerrit is used to review code and merge +it into Github. + +See the link:https://www.mediawiki.org/wiki/Gerrit/Tutorial[Gerrit Tutorial] +for an overview of using Gerrit for code review. + +=== Initial Setup for Gerrit + +. Sign in to link:http://gerrit.cloudera.org:8080[Gerrit] using your Github username. + +. Go to link:http://gerrit.cloudera.org:8080/#/settings/[Settings]. Update your name +and email address on the *Contact Information* page, and upload a SSH public key. 
+If you do not update your name, it will show up as "Anonymous Coward" in Gerrit reviews. + +. If you have not done so, clone the main Kudu repository. By default, the main remote +is called `origin`. When you fetch or pull, you will do so from `origin`. ++ +[source,bash] +---- +git clone https://github.com/cloudera/kudu +---- + +. Change to the new `kudu` directory. + +. Add a `gerrit` remote. In the following command, substitute `<username>` with your +Github username. ++ +[source,bash] +---- +git remote add gerrit ssh://<username>@gerrit.cloudera.org:29418/kudu +---- + +. Run the following command to install the +Gerrit `commit-msg` hook. Use the following command, replacing `<username>` with your +Github username. ++ +---- +gitdir=$(git rev-parse --git-dir); scp -p -P 29418 <username>@gerrit.cloudera.org:hooks/commit-msg ${gitdir}/hooks/ +---- + +. Be sure you have set the Kudu repository to use `pull --rebase` by default. You +can use the following two commands, assuming you have only ever checked out `master` +so far: ++ +---- +git config branch.autosetuprebase always +git config branch.master.rebase true +---- ++ +If for some reason you had already checked out branches other than `master`, substitute +`master` for the other branch names in the second command above. 
Gerrit also includes +a command-line tool called +link:https://www.mediawiki.org/wiki/Gerrit/Tutorial#Installing_git-review[git-review], +which you may find helpful. + +Gerrit will add a change ID to your commit message and will create a Gerrit review, +whose URL will be emitted as part of the push reply. If desired, you can send a message +to the `kudu-dev` mailing list, explaining your patch and requesting review. + +After getting feedback, you can update or amend your commit, (for instance, using +a command like `git commit --amend`) while leaving the Change +ID intact. Push your change to Gerrit again, and this will create a new patch set +in Gerrit and notify all reviewers about the change. + +When your code has been reviewed and is ready to be merged into the Kudu code base, +a Kudu committer will merge it using Gerrit. You can discard your local branch. + +=== Abandoning a Review + +If your patch is not accepted or you decide to pull it from consideration, you can +use the Gerrit UI to *Abandon* the patch. It will still show in Gerrit's history, +but will not be listed as a pending review. + +=== Reviewing Patches In Gerrit + +You can view a unified or side-by-side diff of changes in Gerrit using the web UI. +To leave a comment, click the relevant line number or highlight the relevant part +of the line, and type 'c' to bring up a comment box. To submit your comments and/or +your review status, go up to the top level of the review and click *Reply*. You can +add additional top-level comments here, and submit them. + +To check out code from a Gerrit review, click *Download* and paste the relevant Git +commands into your Git client. You can then update the commit and push to Gerrit to +submit a patch to the review, even if you were not the original reviewer. + +Gerrit allows you to vote on a review. A vote of `+2` from at least one committer +(besides the submitter) is required before the patch can be merged. 
+ +== Code Style + +Get familiar with these guidelines so that your contributions can be reviewed and +integrated quickly and easily. + +In general, Kudu follows the +link:http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml[Google {cpp} Style Guide], +with the following exceptions: + +=== Limitations on `boost` Library Use + +`boost` libraries can be used in cases where a suitable +replacement does not exist in the Kudu code base. However, try to avoid introducing +dependencies on new `boost` libraries, and use Kudu code in preference +to `boost` where available. For example, do not use `boost`'s scoped pointer +implementations. + +.Approved `boost` Libraries + +- `boost::assign` (container literals) +- `boost::shared_mutex` (but prefer Kudu's spin lock implementation for short + critical sections) + +Check that any features from `boost` you use are present in *`boost` 1.46* +or earlier, for compatibility with RHEL 6. + +.`boost` Libraries and the Kudu {cpp} Client +Do not use `boost` in any public headers for the Kudu {cpp} client, because +`boost` commonly breaks backward compatibility, and passing data between two `boost` +versions (one by the user, one by Kudu) causes serious issues. + +In addition, do not create dependencies from the Kudu {cpp} client to any `boost` +libraries. `libboost_system` is particularly troublesome, as any `boost` code +that throws exceptions will grow a dependency on it. Among other things, you +cannot use `boost::{lock_guard,unique_lock,shared_lock}` in any code consumed +by the {cpp} client (such as _common/_ and _util/_). + +=== Line length + +The Kudu team allows line lengths of 100 characters per line, rather than Google's standard of 80. Try to +keep under 80 where possible, but you can spill over to 100 or so if necessary. + +=== Pointers + +.Smart Pointers and Singly-Owned Pointers + +Generally, most objects should have clear "single-owner" semantics. 
+Most of the time, singly-owned objects can be wrapped in a `gscoped_ptr<>` +which ensures deletion on scope exit and prevents accidental copying. +`gscoped_ptr` is similar to {cpp}11's `unique_ptr` in that it has a `release` +method and also provides emulated `move` semantics (see _gscoped_ptr.h_ for +example usage). + +If an object is singly owned, but referenced from multiple places, such as when +the pointed-to object is known to be valid at least as long as the pointer itself, +associate a comment with the constructor which takes and stores the raw pointer, +as in the following example. + +[source,c++] +---- + // 'blah' must remain valid for the lifetime of this class + MyClass(const Blah* blah) : + blah_(blah) { + } +---- + +If you use raw pointers within STL collections or inside of vectors and other containers, +associate a comment with the container, which explains the ownership +semantics (owned or un-owned). Use utility code from _gutil/stl_util.h_, such as +`STLDeleteElements` or `ElementDeleter`, to ease handling of deletion of the +contained elements. + +WARNING: Using `std::auto_ptr` is strictly disallowed because of its difficult and +bug-prone semantics. + +.Smart Pointers for Multiply-Owned Pointers: + +Although single ownership is ideal, sometimes it is not possible, particularly +when multiple threads are in play and the lifetimes of the pointers are not +clearly defined. In these cases, you can use either `std::tr1::shared_ptr` or +Kudu's own `scoped_refptr` from _gutil/ref_counted.hpp_. Each of these mechanisms +relies on reference counting to automatically delete the referent once no more +pointers remain. The key difference between these two types of pointers is that +`scoped_refptr` requires that the object extend a `RefCounted` base class, and +stores its reference count inside the object storage itself, while `shared_ptr` +maintains a separate reference count on the heap. 
+ +The pros and cons are: + +.`shared_ptr` +[none] +* icon:plus-circle[role="green",alt="pro"] can be used with any type of object, without the +object deriving from a special base class +* icon:plus-circle[role="green",alt="pro"] part of the standard library and familiar to most +{cpp} developers +* icon:minus-circle[role="red",alt="con"] creating a new object requires two allocations instead +of one (one to create the ref count, and one to create the object) +* icon:minus-circle[role="red",alt="con"] the ref count may not be near the object on the heap, +so extra cache misses may be incurred on access +* icon:minus-circle[role="red",alt="con"] the `shared_ptr` instance itself requires 16 bytes +(pointer to the ref count and pointer to the object) +* icon:minus-circle[role="red",alt="con"] if you convert from the `shared_ptr` to a raw pointer, +you can't get back the `shared_ptr` + + +.`scoped_refptr` +[none] +* icon:plus-circle[pro, role="green"] only requires a single allocation, and ref count +is on the same cache line as the object +* icon:plus-circle[pro, role="green"] the pointer only requires 8 bytes (since +the ref count is within the object) +* icon:plus-circle[pro, role="green"] you can manually increase or decrease +reference counts when more control is required +* icon:plus-circle[pro, role="green"] you can convert from a raw pointer back +to a `scoped_refptr` safely without worrying about double freeing +* icon:plus-circle[pro, role="green"] since we control the implementation, we +can implement features, such as debug builds that capture the stack trace of every +referent to help debug leaks. +* icon:minus-circle[con, role="red"] the referred-to object must inherit +from `RefCounted` +* icon:minus-circle[con, role="red"] does not support `weak_ptr<>` use cases + +Since `scoped_refptr` is generally faster and smaller, try to use it +rather than `shared_ptr` in new code. Existing code uses `shared_ptr` +in many places. 
When interfacing with that code, you can continue to use `shared_ptr`. + +=== Function Binding and Callbacks + +Existing code uses `boost::bind` and `boost::function` for function binding and +callbacks. For new code, use the `Callback` and `Bind` classes in `gutil` instead. +While less full-featured (`Bind` doesn't support argument +placeholders, wrapped function pointers, or function objects), they provide +more options by way of argument lifecycle management. For example, a +bound argument whose class extends `RefCounted` will have its reference count +incremented during `Bind` +and decremented when the `Callback` goes out of scope. + +See the large file comment in _gutil/callback.h_ for more details, and +_util/callback_bind-test.cc_ for examples. + +=== `CMake` Style Guide + +`CMake` allows commands in lower, upper, or mixed case. To keep +the CMake files consistent, please use the following guidelines: + +- *built-in commands* in lowercase +---- +add_subdirectory(some/path) +---- +- *built-in arguments* in uppercase +---- +message(STATUS "message goes here") +---- +- *custom commands or macros* in uppercase +---- +ADD_KUDU_TEST(some-test) +---- + +=== GFlags + +Kudu uses gflags for both command-line and file-based configuration. Use these guidelines +to add a new gflag. All new gflags must conform to these +guidelines. Existing non-conformant ones will be made conformant in time. + +.Name + +The gflag's name conveys a lot of information, so choose a good name. The name +will propagate into other systems, such as the link:configuration_reference.html[Configuration +Reference]. +- The different parts of a multi-word name should be separated by underscores. + For example, `fs_data_dirs`. +- The name should be prefixed with the context that it affects. For example, + `webserver_num_worker_threads` and `cfile_default_block_size`. Context can be + difficult to define, so bear in mind that this prefix will be + used to group similar gflags together. 
If the gflag affects the entire + process, it should not be prefixed. +- If the gflag is for a quantity, the name should be suffixed with the units. + For example, `remote_bootstrap_idle_timeout_ms`. +- Where possible, use short names. This will save time for those entering + command line options by hand. +- The name is part of Kudu's compatibility contract, and should not change + without very good reason. + +.Default value + +Choosing a default value is generally simple, but like the name, it propagates +into other systems. +- The default value is part of Kudu's compatibility contract, and should not + change without very good reason. + +.Description + +The gflag's description should supplement the name and provide additional +context and information. Like the name, the description propagates into other +systems. +- The description may include multiple sentences. Each should begin with a + capital letter, end with a period, and begin one space after the previous. +- The description should NOT include the gflag's type or default value; they are + provided out-of-band. +- The description should be in the third person. Do not use words like `you`. +- A gflag description can be changed freely; it is not expected to remain the + same across Kudu releases. + +.Tags + +Kudu's gflag tagging mechanism adds machine-readable context to each gflag, for +use in consuming systems such as documentation or management tools. See the large block +comment in _flag_tags.h_ for guidelines. + +.Miscellaneous + +- Avoid creating multiple gflags for the same logical parameter. For + example, many Kudu binaries need to configure a WAL directory. Rather than + creating `foo_wal_dir` and `bar_wal_dir` gflags, better to have a single + `kudu_wal_dir` gflag for use universally. + +== Testing + +All new code should have tests.:: + Add new tests either in existing files, or create new test files as necessary. 
+ +All bug fixes should have tests.:: + It's OK to fix a bug without adding a + new test if it's triggered by an existing test case. For example, if a + race shows up when running a multi-threaded system test after 20 + minutes or so, it's worth trying to make a more targeted test case to + trigger the bug. But if that's hard to do, the existing system test + should be enough. + +Tests should run quickly (< 1s).:: + If you want to write a time-intensive + test, make the runtime dependent on `KuduTest#AllowSlowTests`, which is + enabled via the `KUDU_ALLOW_SLOW_TESTS` environment variable and is + used by Jenkins test execution. + +Tests which run a number of iterations of some task should use a `gflags` command-line argument for the number of iterations.:: + This is handy for writing quick stress tests or performance tests. + +Commits which may affect performance should include before/after `perf-stat(1)` output.:: + This will show performance improvement or non-regression. + Performance-sensitive code should include some test case which can be used as a + targeted benchmark. + + +== Documentation +See link:style_guide.html[Documentation Style Guide] for guidelines about contributing +to the official Kudu documentation. diff --git a/docs/developing.adoc b/docs/developing.adoc new file mode 100644 index 000000000000..c53d1f11cd93 --- /dev/null +++ b/docs/developing.adoc @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[developing]] += Developing Applications With Apache Kudu (incubating) + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +Kudu provides C++ and Java client APIs, as well as reference examples to illustrate +their use. A Python API is included, but it is currently considered experimental, +unstable, and subject to change at any time. + +WARNING: Use of server-side or private interfaces is not supported, and interfaces +which are not part of public APIs have no stability guarantees. + +== Viewing the API Documentation +include::installation.adoc[tags=view_api] + +== Working Examples + +Several example applications are provided in the +link:https://github.com/cloudera/kudu-examples[kudu-examples] Github +repository. Each example includes a `README` that shows how to compile and run +it. These examples illustrate correct usage of the Kudu APIs, as well as how to +set up a virtual machine to run Kudu. The following list includes some of the +examples that are available today. Check the repository itself in case this list goes +out of date. + +`java-example`:: + A simple Java application which connects to a Kudu instance, creates a table, writes data to it, then drops the table. +`collectl`:: + A small Java application which listens on a TCP socket for time series data corresponding to the Collectl wire protocol. + The commonly-available collectl tool can be used to send example data to the server. 
+`clients/python`:: + An experimental Python client for Kudu. +`demo-vm-setup`:: + Scripts to download and run a VirtualBox virtual machine with Kudu already installed. + See link:quickstart.html[Quickstart] for more information. + +These examples should serve as helpful starting points for your own Kudu applications and integrations. + +=== Maven Artifacts +The following Maven `<dependency>` element is valid for the Kudu public beta: + +[source,xml] +---- +<dependency> + <groupId>org.kududb</groupId> + <artifactId>kudu-client</artifactId> + <version>0.5.0</version> +</dependency> +---- + +Because the Maven artifacts are not in Maven Central, use the following `<repository>` +element: + +[source,xml] +---- +<repository> + <id>cdh.repo</id> + <name>Cloudera Repositories</name> + <url>https://repository.cloudera.com/artifactory/cloudera-repos</url> + <snapshots> + <enabled>false</enabled> + </snapshots> +</repository> +---- + +See subdirectories of https://github.com/cloudera/kudu-examples/tree/master/java for +example Maven pom.xml files. + +== Example Impala Commands With Kudu + +See link:kudu_impala_integration.html[Using Impala With Kudu] for guidance on installing +and using Impala with Kudu, including several `impala-shell` examples. + +== Integration with MapReduce, YARN, and Other Frameworks + +Kudu was designed to integrate with MapReduce, YARN, Spark, and other frameworks in +the Hadoop ecosystem. See link:https://github.com/cloudera/kudu/blob/master/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/RowCounter.java[RowCounter.java] +and +link:https://github.com/cloudera/kudu/blob/master/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsv.java[ImportCsv.java] +for examples which you can model your own integrations on. Stay tuned for more examples +using YARN and Spark in the future. 
diff --git a/docs/images/.gitignore b/docs/images/.gitignore new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/docs/images/kudu-architecture-2.png b/docs/images/kudu-architecture-2.png new file mode 100644 index 0000000000000000000000000000000000000000..fcaeba59fde325af528f7389af195627cc7d2748 GIT binary patch literal 414311 zcmeFZXH=8h);5fYiYTZcARVbvL+?%L(t8O6m0lx(fT7A(K|nf$PNa8`-irkSgd)9! zDxo*&5(vD}?S7tf&bRj%@1O6-_l@&|404y;Yt4JkYqo2xP+e_RQW9Dc0s;b3bu}e@ z0)i`<1O%7oh_B+0%ywRxARxFz?Wm}ztFEZXtP26#Il9;q5Il&ne)2?LjrT#{_{%;zoyQ1;w-?%AHAPS$#~`; ztMh4J9SVFYA>Hu4;B`=t;N>o3egGFih3`w3m>d@Y-%D>Ti9DI3TdsW34*8(-a#Uf$ zyWQ#0A8A3Gw-m&z$dmerD#HN`pQs2bGLQav`QG|*fPN8S*D-A{98Q(Oh7DqVpkDgm z(qR9|l?3JFH5zu3$oA;RM&Eli#)kC-gakxvpAfu}Rytf!c>Y2tEdAlz`vi|%B8e`) zy3oIR6)sOK7zermrnlDhs}&CG*tUABsXbpH&n91LvgwC12N99OKjhlXc^FGI30Da>@i?kZ#k5}DNR|}# zSWX)_yVzRYd{?mkn(PwMZ3TkA{a_;M|9)SmQt9u{;%{T-P`I7KaOrPv#-DB%zx2n~ z*1=doLSp8Z1b_GO(BbHxZYH=iM@0PSR7IonpKc~3%ACCRPh%(YqhKc9qT{1s{re{q z65oDp$Mg>qC%XMz>m_qFxZ393-wqo;75v2g|4}ago%r9q>|e+J`^Nvy-2PuybgaSf`^A&0k3{eCM63?^G@bXv2y8? 
zu@9bEmy<5UzR8Y@qW*O)646T#C2D$FKry2iCZC2vr;20HgbXuh#vb z3%235vl6$?Gin$2mYc>roR(LHH%Ft>{?}SL6kj5{x^?Z-e_40X?H9O7QApnu7E>8q zlykbO7%SvA4OMVM>^l}}fTS|?XD0Z)O6Qxg9o;KmH2d>u%3ssm`5()N z7`dKma*^|ed0U4@HQ=It8_fbJRb_Xc;<)y>G0)=~ysW`(?= z_uYGlZ-#o-8|5uk7pSYx7|Lts`QT#39$f#8!~$DaUQqZ6mOrn@5wfRz0$Bb+7HOPQaQcl556803D+q95B9#j zar%-fl{SwHd$s)T|2aJ84Sb?;}m8EL)?SA_Db*|Pmvp-a;; z%dj)G_2>i7>h4rd^W5T@t{iZ}jxL3!N{DHrPnnp0!REB7D~>5mG7s1AzU_@i6+|djI5)D_Neu-{w5`S3^rG zqr@eQPQBC=129Y89C$}N!_6$OD05g8?EDODpv4;%{o2sV|4}1gu&1STY|`a(H)<6U zoW(uwnmQmi#N;|{KuHShSt+qHQ`V}63edV1x)=MhO--bB2{zmeNRY13-q0PwS_hDq zx~)g4oC32%DczuIE(3dfHLU2tlw#y_u(PMmXrse}H|c5;=6Y4)Q-x7zQB7&C!&H3!K#ZGkDa7%VB`2kn)%2zd;#i1%`W0^$`Q9U&r-|QUb z#QR9Dp^r(?$fJlayOik2g(8Qd><-Mr&7XwdFvAa3)AwfqgqbNxIzC>O0KBO1B*@{*v;?M z(N0qNAOLfPkWsU!1&Pp+`W^d4Nf2L}`m4oV276rWQ4FsoXY8Kn%Ag4H*--0BB5X*7 z`L8hR4{qj%d=g74zbv2~d})W-Q;YXS6_{;x<>pw@F89Ebj1~;&A>z4Fw!aE0X+;P~ z%j;ow4V^t|>-JGPvHEMH!&2i14dx!k)qP$?W0Y&@EB^^+oKa29Ahe$#Z~R3L_c|p+ z)Y%zapibi{jFuh*gl{|7y*@Vl&c*e9-H0(R02en1PHb|a6K(Oc59dnBouio{zx91G zkp~Uy6++M)kSwn=2S9$*w~Z2cY{59!H)G3=vac4uGOJnlb!|$qP9Ib$2L!T=a?h_C zxt+Gs0F748?`|(RxN1OT`II#!CuXXJml7ps`7>Ah^K!n=NXU?P&<~4+8LFgr>`sZ< z8r2M@KtEfFz{bHg^3SEqd}Jdp@8`~%MpD$2c<@A?zR-%E3hvyJ?#IGh6L&$J&3jsb zyc{pigmg-R?*9cmB!u<_scKTczut>ba74(oZCx^}XzlmlqI?MtfhYQC57>O#)U<>2 z3O5PN#bl}a+Jrb&Ng2i6uYHqJ*ArZNSerBW&Zc}i+sygdYz>b|&)64-^~}baN;~wO z0hsoPEVoBuDg17m_*&(ro0;IT7p%hhf!tQw58dF)$|sO5%Y-svaNzm-n-cj4^oi6m zBn_u0irxFi3<)b1-g6`kjxjBg5v{CEm`x31wJ<9Cv|JiSlheV<21HCFg?cU&VU-)O z5d-KF1xY)VuRQCf@FDA&@sQ&yGm3w^>FN(++21}cHD8S)^dswDeQv` ze3OrRCF^KMmz-9JY~QtCN~v(kZX~VzaTc=l;%chH@l-;kPcc$^xg-+po}1FPD@CS> zk^l{-Pa=^FoT)Ah<-2F9S1XsKXR(r~TqFaLTNPVwgB9mym2KnBiLD!e)cP-9L5Yvz zb>5V!G5qrV9NG#vhPF)*56`&Ev4wqok>m6whk;(Cz`~C*Rhz>C*03FxOLmD|$!w$DVUnrA`$iS3k(yK${lgIRWs6fYN`B~?mplM0@kg*_n_q|Y zgT~YRq2;8umDK@@%hj1ZsWutV1pg4juLq3IJr6=QY?uB}U3$U7h=8168+~@xl2y>y z$gayPfPLZ=RU50*-F4n4wSGu5dHaep z#sQnT{l4j)o*4Px;jv~I!NXPcr}nwOgcU^DHLu)`1qX;PzE9mDQ$#K?ZSRsKh2%h1 z@%2!l@qh9xt 
z%TMP}OU+rum>WfcYZ$k><{=lrPN)e;ErTzk(vp^5Z`JDjAY`|2EC?{S7c!hM^@-Z^fw#FI3+ShB8%RmZo@)y*=qX|ao*Ti^n?(cAwNC$-h? z4tsI*%zbVVWz$BYFsJJ@+XS_X4)7~jd@Hjk1yu7A^XeI#;>5(LGR7dqWbXr%n`{Rf zKWt1hwTho*!^<(Y$nv#GVlOZng()(sC0gDeeQ&D zk*lSkjwjBgzDbud2@knEh21A6{mM5T`#7s5HETvn!X6xW+Sbopc2z}diXFGz7j7;U!w<$Y4 zz46cr(6;AbU>|PZG{H?zHzQwWb$J zBvI#iF-#!M+4Cjo!&}#-nm#&|sy6j6`3BcH7P$(SE%uxVAZ**-jSl2m+V5<+$X`pI zI`L}8&P!X^mE$H_0(@x7x4um$zEjUby;w?h`)L8@mxBL_LmprM&Qaa?IqT05m>Esj z2L#7z0U{gk1oKi`D+DyVzH-j66`hNbUTzXq*JZgS*8=DOelD9_GJU2bW}?Ja=*5m& z%Ix{zY<6R>6pTFT_DCIX>NVk*oAeB+uVG6Q2e6>0_uN1Lc~@c5tMxX?9FDV&8SisYZ}$?3}~`} zg(iD%;y#7Tvx|<7>Khou{V*Kwsy!|9>hqlUKc(9!6VBB-IC40x+i7gO!cYirYx9lK z=!fkk8(F!lQ^QV_5}kcm+?$a`Ecu&cC*IB4C15Y3%DB|=R!&Ke$X;#P?km4bC>)R4 zv0p^*Y*;h=iM^V1L8LrmU57I@m421y8Ro&5gAljrr)K`ez?V)ov$4)g8bvg3vF~eH z8CuoL_THL+P6{`S*f-*I$>^~OGWA_a2kQ13L%iHi;hrt-7Li?Rn9wU#m>i8(pH8Op z9OZ4(8Ua|crvBq|({25SGd?ST&Cv_pfXRJbm$`M=*2=c?SfCF0kIEv;Rq>2gAXOo= z2Qmh?YvX*U)-NE`6nrz#|A2A>^W!_*x4f#48gZNML2OqFiJ3v-H2i4Rq@?pGx5V0r zGzvAu%P1@jMD&;kz}PixMqg=^bU<(3>Q+EZ1xf|-X`Qz5P!`8A(BH^m&|2Dkh}0Wj z;u%x2XyY9ersi)1(jHs!?`Q;J!S&Mq*y^;p`Jx!XsM-1`o+v+L`sUoORA%d$9(F&| zvDTfW|}_v{IV0X&4tgIM|*^F6RkZemCt^V^2_4ZxiXPF26~qnRDDy zI%T2#jw84h71ps`O3Jyc&WQa0s5%OaOB~_qAU}W9pqB2@a=15CY*~30Ob*+!2TW0u zCUnD_l*Ygx+}gGv1*MOv_<9pN150ZLnCN)oC{2LgVR9!Om+I0vS;i-qHTs?U6Ev({ zZJNkA2Da4-qZ20IDHKm5&X;dM#`EY5i250nyF5MLt7}Th6Wu$GQhjBaJ$pX_wx$GL%znNcWiQv~O}9 z$;L}XC@D!&m~zAtHf=p}oUbf%%0;BB-%)@8LV@lAGSfa@~n z5<}gYJ#`Lgm;d+9r*-U?W8phy{qJZyb*%;wi^-qA=$Jmom6pl-{pbz%X40-IFm{{9L^cdw97rnK$dw*@XiWAx*lyp@P+|T}POhS1k$f`?0 zKOpk;o;+MKGG{k(4BV=Ie)6PwH+I)9AHEYuT_?*e+%IQE#S7x#*)d~Ht#doRPf%){ zFl4P@9yQi9VqcrKg>vRtlqs`#oRYvc7dak!)EB05B&1%mb7Gg#@-2?yw@b%l;9dGO znj@b0rzDsO2yKnH32^g6rt0B|=WjMkzq&HjdpDT?;Fg?2!E@8p895_oRUD2T2G46+ z7N-wjxhRYVf0K!aYqsY>GyjDiSd|C38=~L%JhZ@j(sgD#ID2UL2vIo-my*$U-{Fj# z`s(HAsnj>UHg+#mBa&a8VW(S_zk4jFi^(d+kZj|if~?%~=q)|Xiq}1mchqLRZ*7UG z@VT9sP+_98DfCx`tiBPg_A`D9eN zmAOE0@LX#T)7f?s84m{W_N!26YVDYPCq3C2)DjwCvl#+>&u}c5lT~s6X01Ll%lab` 
z+D~l-lzV&SvMZoIGSfJ^sxyT+bEV>I@YF)W1exC9g&3= z>F_tNe{zR6E4g7GCzn`Np)+*$Z|EO4JBfB=aHA4?jksWts<20Bb3@n(JFIN1)bV!R zGTmOzlUmlriB&>LK;O)wu1;}%iz-v=?$klwP#U8r(2;7W7De5*3zIBLS-Pj=ZG*Jx zapJ6J|6B=8s60!FwvfBfZ@_%xu>lR1V_GM}<7#7UV@6&%%GmkvriW$)WX`zs+!xIX zxDZTAk3u>#`{AVK@}`dcL~bo>s}=tCXv`Er?`PfY^u?DS{+XaLT^dbO);FnFV>Ic5 zxLDPSISxJlWY+*&B!wJQ^rD|_ZEYTRX<)`f7`6zS`eChS=Y*mkX*Ar<;Ud-kMlAcy zZkMr`ud{b!#KxiQHrIEfH{?hsgdMTDjv8#j8H( z=CKm}>GOWCLz664;RqS3_D|Jr(@Dy)SPkw0koQ#R@Uq?He-uWv+{)3>;U6W9%_*WBx zxL%>$_heWUHQ4<0jZDCM9>;|&jf+&JSbgPn1WY}4WlXr}$;2#%83>_*K|b2@Z%Gu~ zUZ9wB)y$a5@x*DK&)uv%E}zb0jvEY>&%FEsXg??ykS7wAR`%e2WnlL zFfUa4?&Q0@kpiC|>aW#ur7OiyLL3*Ke{0Sim&|XBOq@G)9A|x)^RcFNM-;EO*@Q{N{uS@|;`Ghc_@! z5TDymj&hQxn;*6=l2^O=E?hT@dl+qMxY0O1w{IJlT58o6knAG8N~LGyI$H)@z;YDx zU6%a@Y=|2Z`FLF!GK?^(FR21CVgoabtS;4iht%S7r>lJ;NfArgKTHZ2R*Dxsowm-yl@S^~pW#n4MfV4FP?BL47Oq>UF}SN`yj z4d{ZXfzHpxErs;euS~*t{64O1LvVwSCj;`4V(W2CGOgV;n&W|xOlKcA_C^NIbG*OW z*)VR4;fpp{0bcZaI0t5n{23}SM{D6*Rg)uZfIs7Z66(&BrSHKLd??20C1RUpU!n@7 zu>hY)nZ}Km)ljH9GL*H_d2t5Ptwq;d<*mtOUa3F*snDlZ6lFLo4KkE>8|PHK>_6mw z_R^*K)GvxT3lB>LflFRMv{GJQ)5(D4<@!GEdht`N>e3rU&K&R)hc8?2Wlq`;ArV!5 zc1H8M9dVFX*+#Do$|xP-35nDEbah+NmeLvV5jw5&6RJv7a|fJd8s-!xyiFp4DdIY1 zR*B1pYu^)>C~_GGOTR5uRtTka4G2{SbXdS%pl>*}8J_lkeR<$XpJ>k%s6bWX8F)iV z|Inn;0YmDlA=XL?=e@WwS!iDLd?Q7?Jeu>C?SW6D|L!NVZ&-fs8QRH!cA9vmSjDDR z4sl6z^S$nhr5dsU@q{fNEm^n;Z49Suz+EG7_QIksb8W=IE^f7F5;EYWDotNu6yJ@o zSe(!^D4#km%$C$XlLFxm7_@M~6&@RY8;K1&(o{UZEAc_y6mN&_{~3C{ELJdpni*PV z#>p?NJCtvbhko&=%)~75!V-2Q;YpHG_hy>-56|%F5yn1GwTPDQgzQ%BSeb?yRh0`i zfNjd5xV~*kyXY-zW?kV_4an`P-QXbaaBJNs(Tf9ZJH&P6W3Lp^)PTThu{c+TvM{ey zWzE13>_xLK+Io5T^r$TvRzGK++DK+As&#ZF&ANqD@9-QN2DK9^mGSA-D;spV<{RUa zEqbR8Ds`+?V|Y4SK=RQy-rw?I;=Sz3fpyE^`UjH$a?-@oaAUVg@)>=zI}xv&SHidC zBe$fzZW*m_?}D@UWcha2qjGs^_=e~18O_~b1K%3oUt5i$2#g%xJS9_%Y7Njslu^*Q zHvO1=%JjW&x|O!`cl+c?7%@I;@*#i!KUtHwi{=b3@<~d=u7&-G)w~d2j<#Kg)&Rz-LiqvX0%K;vQ&&xA-J-I_Uu7m}yb5>Qt`GyLP3! 
zG#%5I2$Y@GjN~)*O1Id4fqpS7sxg!5e9vsy7126fsh#*SD|ZhNA<9o7q(#!5VI2s# z!LZchx#r$y;*;%{KeY&TnB=Yl)4EpC7hV4&q~*L!p=P1gx7fwB4KxOx;Dx#Y=zu)g z0n;OkIFAjBWK!x4SHtwlW;c(7t@yS!>sEBUx+WdY{oBB95wYJcgFnvmR5bv^*;!fP zFU+Lz)b#4_FcSj8H*fmEEF^D#Mk&$0g!#i4TrEzCN4uRxca=^00MpItQ!2|3%styS znUveB8ohLSYV)^v_+y(8>+vb?E)PB}oQ`c3ecD16h;l7g9-4Tf{}@iOhY40~1(1p+ zo0*z`jB9i*tOu$ftUF_@&VH26tVc* zO0)(wyH0Z5*EJj@T-9QA7hAHDcuO3RYLAcebVl4%FY4uV7cS)vw8a=##C}y{CW7d^}NSnQ76=mS&+aYQ7IEE2kQW3%d)xUOxH5 zLLZ;TCj(={Fp>G#+^6=oJ>tEiG29zPPRcc%ETm;t{4Km~n`%m#$nBU1=7XQ!j;y;H z8mt9au-PR%hvjZckMY$+&D`%tg!ihLeJkFRDf{N}HlC`XjZ!Xe@k_`eG^b#4FJj#9 z-684-`s@simfx?Zy!_W&U?;L$wNuVh(kdYOGtx0od7{ubHHG;C+}aC>^3l?|0@)kj ziMg_09t~sX2PZ@}&MI~@kh;NpF21?1?07e*S`Kak1abn}E4EDVT`mo6 zr>5b2WjjiZ?&j^iTRX>m3>DP|L$BxEFgPRGXb!j8FguJR-?t=^cKl;!ofcz@r;qVuM z*B5|6B{a7RnZjM-#OKzd%O=F6O1(x2xiWy<`{l%i^_+5JYpU&)Wv}3+UD1yB6ZcB;lf19?yPyN{h z3{oJu{hAD8m;Lp(bpLcBzuTYqV+sOzoxf%+t@rq!9ubXiLSAl|d?4~W?EFu<%I%x@ z@ZjTwc0BoS;Q6-!;ExgK;L|<@H!+fk|9$Mg{YRvTM=F6@irjx<(-3kg+?I+?jd=7= zgq4rCnTZ+xD~A7y;g^>GR}B9j5`zNh_T!oHaZan#sqvhqCTZf1+;@MERRW|${Su2e z8!paIZ9Bp!hu_y)mp^0vKe6>s0KLTee(rVa$#Uw_w{U=%`hg`8;hooBT;l3~KSn&f zUb>CvlwAH0^MK`_0Mif;m=9+l@pS(Lm@n~wN#Md3_A7GxPsGMUC(%z{j&2?O??(6U zk^T#S6XH2|QUr9NEWaJezrpQK0Dnk{2h0&Z)rNoa8w7X`AX&Rg*4uxWr-JTJ*91lB zx&6yy{}uU}34Z#G|BB(iV)&`$|J8c=;t-X;V<>b;AHf79=`-+yTm}Dm0qZ|dJZ`ECA1O6a5 z+y~UHvh=YRPoOJURBixolSppX?{sS_GxXw_TpulqZSE${n2t@>2cX*Tz~(nQ_V&~! 
zM=32*Xc5uMk}n)Q((wjX$9EE)DL&O2yK==2&uW*zJ@Kwi!7XpgzMoAcuuD zczCqDQke4|(U{pTtu)puoF2~EW1OLOio8jq%*-KRhOAZ-D;s<$kvBPrN93Tw`}o?Y zlZ&cT3@f3qEJ}|P3h?e4qSLTkUX8=eY0F@mdZQ`S6*;ACye&6l+;e2^ob=SDJuuh5 z?+0)W7~<|==Ynwndn9WsTq1vfE?lVK&A0EpY44rPPLd3-va@fwB)xn});(8NZIw!* z8kmLpX3Sj7#U5R5RnDX;KIRdGlG}`puYJ;=4daEqD9u^?>IH2}NTNOeV%2ROI(Req zYR>k;1;4kO=tIbkve!k`2T91za4uY{z4DD2Ccs zT5rsK1Gy0md)hPRD$O@cwi9;zZ9{KLW%a-5h@^P#7y9Iu&0-SRcUK*@8i><1U*B+r ze}rcqG;r(QZ8Vue?|h)tnKKYMo3Xc9vfCJ>kD0O_t<+S0wiObPX|v%_yX4F>X!EXq z+LZF$GEc4?%&p4J;q$D~mUB#7{lk{6Y41lF=T1svE_Jlys3_VgOmLUX>7#e|1xSJ{ zr1DhtHm@f^n~?`ZcVmoxMiCSb@R~(fVK#a|@wLq?&SAL$6!84L798!icMUU#m~?qH zD6i$y6?_1kgcT|9%0$H1%snp@)k)GvVGv?exD-J~cwy_ya!^7o<>W8u; z&_@{;=T&>zm&?28LV)Bb{(AE=(BAciL!IqtIx6-Q(lv=6B{IJCIyoK)vs1N;4}Ql8 z@2*V3QGV++&ZXHdmWm8rpOL5qnf!p8D-G06M!JeM4+{c@c<#2EZ#QmLE*dsMgtnxFzAbFt=5ypnI0iF(e)5o6WE#VVp5M3 z8H%V&5>o3kfctSnJ{pXh_tigzgmV`cxz}rMH*SC$E?VhqOWKx1X_11X@_N^UZ+F3h zzQqn<+6{IoAvN3$hs~0{+M)4x6V>*5Sr}A7@r^41Y}d#bB5}HjAbt^5oay+ceuF0B zGT4kuoL8OUon-z4tE)bdUET<++c*0B{*ShZMvZOk9**2BRw5*l3JQrlVJyU1Ip>xI zROi|T5OqG#H7zL%?|zDJmU=Pvv3RT`gzdxc*N)M*1)y}J=>5(m4;M2Ax}*C1A3oqU z30(JS*qQ}&@u8?W0zUW@Xl3~Ols8_J;^PLnY?zbznRxZeu;q`GPH?upDCgy_n01}B zGwNGx?EZtp<;Jnn2!7`-@XV4kBds7X^w^)}4{?i5o4@9J71oKQjy z(&cd7)pLYLpK_)ypT%)T`<63DHs6LRpKoeWTu*`zP$|n7cv`9)A!d{~x%bvkbSh6Z zVby^_6&Q#l!DDU`AueK22;HEQ)s2iRntI|~UBv6|rUmT{?(sJ7cVt}gLQ0jMDA#U= zPeuK3h-BKwSHYkAq#CZwJ=vCtml`KZ8&`=@s~+@P6tVc$-5j7A*Du*^yD4kz+|af! 
zQTr)j(0W;E5Pgp{Um`zgKOF1+btJXK1Hp=T?rD|p5p$k%YBcVV=ONg6=iV!YJmf(R zDBI&CDKgnxwYX#;uEIlE!pR>bvO+v;VkH;x)VYEf?>Fb$@CaHgEXqLGN@8j?P<#qM z>mAw#AqN6@L#d6)g*k#Z**Bys6_?A@&V@BjE$m9nw$0Ao(la@VxUYVx!nb!e`o6^D zuzlV=vS_l1-r-aT==6)+q4*{<%V10{2%Yu`ll3FNwe=viV(>K>Kv;H)>cx@zlY+6<#;vT{@Xu4HJXIYQ| zP38~o30J=rQkdF5)q-`GjNkP@esOah3nm~O2EJrIIG8;jnY9nFb_ z6V|MVh(H?W|icmoYG~WF7aHIizfM*yVO~mE=H1k_M(k;=*~oMEblO&{AeQ#158$ zqaBhTT)94ZQg&@QA7B`VeGvB?a%1vD>wWeykZ5gG-$G2$9v`^nM)T| zwRWV$n$$rU`APorN$Qy@{=wd0siEv=DHt{EFd%enbB>PB`%KdA+W;c&;x3QjD=B%IC= z`j|bT+)qb?mtuf}|ITTlIm#792cNtQX^18vvyLt~A=XN>IwMHZDUVj>8 z)q=1g(1#7p(?|JbuLf~IdQ>s2$BDd;?8clzSFctg)2i?}jq1m_7yb zK}vF!&CP+iH8qxGa*ZN#{srY4AU~gw74fB4`SR}u=;(C8bG_;AEiQhLL~Kym;t+HU zIwUXgU|osr6%Ob0gUig}WUD=Bc>Nl&YdSV~X3rjUSv7#=s8jgSUKeg$eWCs3&3Asm zBe(9IH-uFJo3`d=0T}?r5boT=Tu!!1&&+sud{^_EBCGgs*I+6_+9t+0XWPY!YWUnu zlkGr5xapi2w;6w`(|B?>EACeA1*>YTwFq70+QyD4S2nSxdr;Qb%R>;&bm`HPDx-9H zwDhf>y&t<$Y_jXubiL24Eo1*!ndGQeK0hu}hM(-)oUmRn_p`QU@B5=^%O-voEu=R*!#mjTB5r|1Fp;CIcwvchx zx^Bd@dpIvQbUE5>nkL5$p1Pco?sh+BR(sIhtWI8_bh@^)ZqA7*$`+E#0zr+kt9S3i zXSkmpp6(y+azXd1#C>hn$Xh5|8h0i0Cf_t0Kb9gi0KWfVx(qNx&gdf29OSsQE?P~< z#9FykZ?)Zr9v)_w`C?E!Y$tCK07SW=>RGT-z#)Jho}miw>_^Y?EF>M*J|!XSCT#G< zY4bbpFH^>Xqck`Na`feXD9l|y@wg(dU#CY^F5>MoHX4$XR{=8S+p}*O6J6?gJ{*Z( zfUw?b;=QiQxD=CHxQ%>PwN|Hsx#3Yx)0%k0Nd0&)WGHPI*)ZMpZMZ0Lc--L>p7g`I z(ICZX>lQ2%8r2n1n;L!JQ5GBA@;=5EQn+Nxa;o&R*^TD}FcZtdj!Ixs#A%F%H5M#^ zt^90IAcgqtR1MZf_5=<^t#2=^Cd)>=mBePNOoe{T$e%)-N^I5?hpP9*D}ONV z zI)E&YZ6<;W*_{4PkZ6bmvw8@;8o*B4zk?k%wtEpw9dHhpshp-V>UrLMw~Dm7yS7Fg zm9iCjI`CsU=CPxnIqr&ydZVzRUnrMgv{m$r&shR}Q8Jx8C2zlJ@I6rT)n+<|?B2{9 zSJ1}CJ_||Pz8WBR`cGEaGje4AV0HjHYZgw``ZPwz%6ZcG^UwH4z$s`Ax+3VrNTt9m zO^AZV(F4cxyGr!s>U0?**L5xr?4}iKWV|aN@N7Zd%={4V@09HKEFiVSvJGt;ypW^p z4UYYo+Qv(IEr7*0@E~o(t(*esujMyc>dg;#Hpeb)pCtQ71Pq?ZzXQ-Q5x^Ye#_SOX3!evSC8-R0|cHD z`Wz^?AFr!REjJuky3vkD4cmcB?nfPWYND6k?#S}31yH}Wo<%2f5wqREY##zI>gSn) zjY2oJa>)vPGvh5is^Y0^P9pO+-pdHsCu7wym@pS(#jFfw;cRUwZ*$=x>eI)O2MA 
zUQ`&)RgJmFvF;5cybG8Q&VFF^Nij>*5*&%@){FdR&%&0I*6A6vczpY9Ri)+RH0o*? z;8sww?pWB7fz#XuKw47~(*35;%80Q%8@Q&X_{8VN0iF?`ZbcZJo44S2GMu~Lk(Q-x z@m*umUuSP}xUX!x#+cFqgdzTR&>gs4L-c<1kIdq7wNnmqWOJ&3BXRu=%-$hToY(mM zIrig}a#3o|(MRPmwu_K!k@eT!D{k0aw7JGE!1XKgYd_Amer+MXd-4Kw0cxWJelt%ND%EF=al8Gbj zwwp0=oOTi zfX=HoICUC9Mvd09dv?E{gDn)y?`l(MCA2bH~X-~Ou`6#vY`5zkzDb8Ih9C%mE z2xsML4Lx=_=j5_V7hnU(U^{YrKkC9FoKQbP9T{GFEwKo45WhapbFV89$Y){3o zBctp1^%s`D`f+bjm_5ltg!rbU06v~4)=a~9T0*Prw1zF_r_gQR_-{|KjN1-jl=biG z;qE5tYi|+f!I-Y9x#-bI0*F(bof)j#e{VBWY>iG=-s{Da#OY>Vm74^E_?kNt8IEn$Nn&iK>#%yA$(JypDmokzSUh{94xK!DkjldNaaJH&R& z4SFqS=hcYn$iOuFx!U75lfGMKBjx4MNbk|^8&JUGRBU!_yd_fZq-aT8t+%-k-NqhR zyXVtz?o$^<5-k5gL;uT1EwO_;Nwo(u5 zIKxMAx;eb&AS(CtynGY!!%nz-ZPhe1<2f67#A^y$Z45)$wkwlne5pU!u%&z}CFFHc zR%86D!ew`$ST*~UWM%Mdo|EcqjKkvdK?>V0&j&`5hGx>8j>#6rx2RRqy=xDU&iHk} zcE#mTE1D>4mn)9z^3AFp;hzWyKT{g^^^4 zKVk>ALicl!mDmXYI(sZ_mC(DlsXk>xFU$Dm{b;tQlcn%+B6!3+cB%MJV$;2ISkhGRk7QMf(o%e$LD`Gb=8+f%=;Z-kZ2Oi{=7_$NJ+I`BN`vw{RqL&ZRRIwp6pYW2X}g zbaZ@F-O^vbR=?sH)-Z?_^;t`(6oS<*#K&0O7~pBqYe~1<5=~^4WvhYS3e7eXM$4!J zWG3_S?x`D$Wx1PAjiqTirMjr}IMWKB#~|IM^9tnws!64f0v{6>h^NSFn#DY7Sc;M5 z_YgsCZ=rMh&tvtTwKmo%i5|*zmRZo&&D;Tlr7SrlVZHfKEfbzRRspIY5mk>!i?$Yx zWKMCnqC}@dm4_Ub1zE1A}ED8K}MdY<3N%cswPp(0oN` zPAzgNL3$~vhbOj@CKiQh7QFD%UawR!a&Yj2Q_(^fsm<#QXpcwLCSud>8KP~i!ewfl z)LFV+A5>&o8z5Jb8qeK|IZ1=sN$&Q?qSD@Y)9bw1--+(;l>H_l+nGpGUJUp5E&jIn z`7-Iy)S_BgM6j*BV)n?3v_(%4UyDvRInv{qzILS0^0c6pXQo6-T|$;wVY!JqlEQnB+U z;iIN+_cw|m(q3)x%fol>r7B>p&D$J=C&yQ(W%8P&1FcnW%`peg*Vo-!#YK#um&3Yh zOulEo79&@|m;dtd)Ue|Uk0PbJc$VW3$<4z`9!kH*fS;sZw=abGu2}S!E8`pXL6GE`N0~t)cEW8=>4so79K2ID;Pqx}$LmD>5m-4D-4h-pkwx|DPc7J{SZiV0F=%TaaudN0C^P;5MpPO@R<>i0- z+bJ@?e0hPt04I(wJpKI`+N<##3|UukSxVKvdHl~I{a?JvhpUcFYlRYjFXo>$-iHns z5oVYF;kYohtX;7Lk#nj0d+`|HVBYinR?oAz~(a(H@kahlOt5jXs z-yqp@(XO>{)*CdQw%B-Xagm1wDd7IyTD&i9R8@_C01Zj?xlYS^Ul@4s6Q`H;j9q)a za=Fn%#uB9Y%LJ0{UGl#(vf{AP@%-6XMgU*ktWR+>o_e*nkGQa~>rYD#2vSdawK2!` zzUtrEs?67}lnjhFVAHm2QV*D`MW1idi!J6|G|l1oK>=}jr*;#Xo{<}#bt1y@{y(if 
zsN%W=z^mUqy;xhAsfuc&o@UH{CV9VxM%=%ugbBC3?FPHR9WT|j;Yyh%o0c8d9kF&C zvTj4YA85w+lSG7F;2w0xkE{4U3;erbDQpu6&zmMtEp*V6G<0YgONuRl)pe3j@n-K+ZN)<%AW{o#oo7AtQ25WAG| z`i3VdW&PG=b6c$4RgW#ew|=0@s}8M5^AGL3nDvggK!47Z_DnB;J;NZYC8K1cQ8$4Qyz8zCLC5t|#W1cG?8lI<|G(UFSJ4lhsI1nC?F*vYik@koX z@^5jIbt!YtD96#?dEZkovwVx$Jb= zY%pid9XGgWcQVmVbSAv^(7~R8G#K`EY>|P10W#-*-0i#5vXUk%R<@G)Yi&oD)+G=3 zDmXs?pu^~Ypl8+qv6WG9?O$@+2PUQ$eO|`qZ`cQ01>&Gw06=1EYio`ZU3t!Rw*LTQ zO|who_RI3vZ28!f6xzW7bmMS^k!<<+83n0`<3ooPwE1!n@c2+#8STBRkx$u}(+Sw4aCC8@-zYNu zkKF{Hq!2eBO*!xVSd2pbAFjRvsO_ffw@8b7ix+5dcZWa=lv1ofaraV4DehjNI7NyR zoZ=qbT|;r#KyVKd?55B2zTds~o1MvICX;0Mzh}?+o!^m77j-fd6oSkd*}qr4;J?g! zn!OBe$C6HK^Zhv6jmT~8jUYpYhV++78S(zwCptP*h~!oJCr!;>$Z->=m5-Y9vkWx# z3f;3du5c?G|njfTO_N?vpBMh0!i>;G) zI%+h-KNcFuvHXVw?MqP?eHPpw9Nd&;y8BHezWbV5*Q7DsB; z{{b=E&UM^&TaK*i)8lGB;4{urmMrx_3cc4%TjQm}{kf$^EbITL5qKD%Uhso`qo*LL z2)xhtc0iv#AxKt%sFD<@CZmId<~x|Q4su*?tQVQlx-T!D82xsTxR2|sN_z#aN}Rm?~>Jdq5Nb)_DGp-!5;JMUwyBG;&U`hCV$RZ z+V(ybCm-wGe%tN{yc-gcryHtkEp$89GU}pw1@AX_bOzBm3vCVJ{1=`5gV>U3P!XY6 z@bkQ*i<*4j83B~f=Q&9~c_2By0P{4L-G}cO*`q_sa_|P48A61)&$WUR@72<0f#~7#Qf_-| zH6HVdQs;Xy9yHVUVcz&NUGc~b)mowaAw;mrL@4Rvit#1zue@hHy%049ZMq=;>*nU> zv_1gd07qo?-ye+hYFA7ikLhvqmsWVpaGR9{d~P3>dHCO_Ri>ksB^$&IuL1kc2%c(5 zmZD8=i#or~^eZd=yRE_FiPMAy_8*mur4novQ z)iR1=j`S=1#@55w2=@Q2`Tp-;5MqM;!*chmZ6xiCE+!x;h)wTp!OiWIImiNY=Fz=D zg#1cDj%PKEt1oW)yMJ4t{5=`z_L30&3Y3}iHWlHFl>SKT68nX0@UzyM#CmrL^h(JVzA19D%D6O#c7c4oMw^{j*xzdkLGBA2Y+LtL77sJOpwRSA4<0Ji3C$ zJ-3i(Xupy*S)t}~RN!w@zbB{~@*SR;!rcW(q!NE5|F|AXdr7aapIHECsu1V@nlYHC zdpooHuVVgxtlAo5R?skF^HRxXqU9G{+($idyh*)a<5~ZOOTtRaQ>&GbnaQ1h>{r1r ze?Wbjuy5Jw#GbR{*RlI;PhQP0_ItVe*7wQ#AUNgkP-M={|LoX5?*?Q$Bvey{vy@jQ z%+sttW|V`JnXA*LkR5>J1pPf&gncBH3DRt=jQ(c@>6@dC#xZGQO8lD|RrY1M)BgLb zpOzM;{)XoN?{A-IcSxA)fw)I0k*G-ZoiNh$YB*TuD>uEHq$TNe8|(ka#gVQ=o+P7o zETH6ZVH0`kO`zd^k1?v~zUf}!Ez;iKB9%BH2a3zblSSVT7}f*sL}k-3Ic5CvOq;nrxacjLE|$Rz#t0*g7fozRhM#*PcYkG z;63?P8uVBO^Kd#6_SuJb;;J%|YcOjVPYN?NRs01s0VY@if`Wt1$8GFLywk}hlK+3( 
zIAXT=pEpiKrnPzW3EYR3p5$Rd~1tyI@Qw`P@DVT&))q>0VB}+TKee z|6BYAI0FoPL(-A(xQ~~%b^f9Qu}*;%@2~praDO_|XbjpPHE!Jzm`vHVT7o<% zabzN z%r5o+ys^lM%$?&1??`wt-VY6>^<`;oZ7m1IGpl~^^J_ok=iwown)SJ-SKj6q5h16Y zv0-|$i`$WM%0DI|b79>VNm12CF?;_PW1gL#=VxWnJyb zJ(hRTKZb$)+9b|L_%(9XQHq1x9`ry`fa|rO>xRa6`W2&Ab2oRUt?%|t&$|toM9eeA zm5fPJ_G|Q|xVgC>4oq7CX{Jt+LO%O7Kh7zEnR3p?#;bp2HQ*F8*?Q2xV+K59iBtzF z=&%OXbYH?r+92zRU3!0MS9FyOF~;MT^JrwDOM4Z3)$3UD59sLVNM{xI>f%DF`EnkP zm*czEuoj53LCX)Y9;0sD$%%lP)86VFmds3=$2gxnbsTc0*`ZDKMFyaryZJ;l6$$g?GeSso|@d@4@Yi@P(?Wv0{o(Sd^pT) zudJtYuJUOAVzTgAzTX->T$B-QFE=#ww!53w_xDQqj(8E^eh>J5`mxd&fY@=y)oXuQ zB(=a7uhPDrg(lQ~<<-vY1RINj!$#AX$&9_LcG|u=B}>;CPI#0HZ#GF$EX$V7PEAhz z0WK-!f(XZrGwX`2G)8U?%j#F+j2{uRRZjX|Ud=mBX}&UFI5Gfay<^rkV?gT+O+u;q z7saMdUxz3Ce*PS6Jj8X$jL29W`v?>Ap;2`DDqc2b9@PF)w7+Qmr_*AQMxi;Z?%`qG zX`w@Yo7(MzN^vd0bYrQi&qZTdy*6l3Wr)=UFqPqvd!XHp(0um^!C4;Tcei#nf+QgWm(~_NsaHMdu;~(k919uGqR{XZfZ!T z3;21aq+o$g2!Y4>UPuCWu#4k@~^(UXW7gp6QyJUMRZyA zYtB#B=c>%l7Hlg9_bb}D>|0Jl(nTCzlg180pH*b7DWy1d+NEgZ0LVy-9vU4VJYWMi zy)|a{8)kx~ecU3C)T~$U4@@u77UA=W#K9qTt2#HSMPhKSO{GgnWjb8p-}@6=ayT-+Y}u^huM zitD?iVvnwF^YjU>ey<+^c0G@MJPK>XFLw_-rAQC{_+1v(hdsEvAb`O)=dfWUrlGkt zaOi#ExRt0qM0?js+mniWsu%6-_mudyuzmm4)debolhucYA1O^pjUaR{{cA94QVE_v zdm;)18|01IGm`B&rHNHUr7wDk4+U(zdQoH>>t*e4qW9#dzsG^Jdn0 z8TZ99XR1$my4Q+_<5vfV1AhG|vZD5hKcE^2#nFDP(^SJ(Eme#($VW2B+%0Kyt1*q)Pcq*?H-f-6!Xl* zd%NrVxs%&HZjEO72eZ1{RyXr=WF@@PI6~_2T*^>83lA>n*%|os6c!VVL)KS5iKxi= zi{t{rh>=XP+C(WvfCJ_ZRjjtLkKpA^FSqY2`Dp5}r=Kr`w~UtOgKr#9m(I7YV5Gyx zNutlfDxW?f4(9;mx$|F#E&K2i)y{UW4#c%1hlk!}uWGjzA4vETNf31Eoh!7>&2 z`HNIPXkmga2Mo@-&kSEUT%GuGt9R(SGt0SvRHhD!`+d)k3UAVKJ?+uOR$q>|23nCV$Og2WptQuofHo00s^ zFJ!`ag`>kT?ph~)p~HB2*BFZvq#Ug5ysp}jGr{-&D#-$MWXdkyoym)y8oR|YUOlnolT}IkTd)Lf`P*wq{;ug|`?PoUI zZjab0y6y{adi4dtr?amcEeQS{=?yd{Z*QvlY2qJ2KCNd(R#wCy7dA7DA^+W;ofF>wV)VdKZUxEG1hnbhJAzm4bwq(+VP!tLiw7feVbGCR_gs%LhG=`VILh|bUk((ouHp8^vN#gWU_X8Uq7FfrW^I2{X zrij`6P)gzX#H338nxvoLCzd(JE27CKt{`HEyYZdDHbh!?X_4*Babz8Cagrn_2Ktst3Y 
z{c>c64AvkYFaW)GxABOa;3VWbBXV7Rpt{=e`WUH)hCVQo7u9wzHE66RWd}F!6gU9) z7<%d)WGwhL()+-?OSu`hB^7fwNHU=Kcjya3H7OJzi0l0=(e3{c2m9fO`w6BEvndERUu>SQ9$<0 zg{-Q>!L5C0*p{tj|Cl5~@nw0!?ZZ|3amMb9TqF>d=~d2yJQGP$F}}Rd%h@)`qR}dc zxRVY7{tmKmjE1h1E86zQ`(yZn`FdCIoK?<^???-XoX@TSmsav!h$+qwQ+oHvgje12 zbf*%SgNk|NJiq3LEDa>rd5HP)hswBgFuf39a-Z;9lu|%?_uhT4+c!ZN%fJzvJ0zU;*P}%u6Sq7mc0` zh8lu`WKkv+1#V4G^$y*9cMikL@Iu#&OnHbOH3~B<-Zk`?7VO6iJOtfZl7QL0JeNIn zLpeugD)iZH8#5@h%K2?ZVOA$6J`mP1V8dUu+loxd3&eqT;c$MF)fm+n(KBeG zSU$BhBWLqyFV|4Yogd~QzD(_RTijdV#jlfL(wc|KKkqHJFMf)=8@MRyw(Mi2ucZ9; zM2mQ>;AMQQO6o^C6AjihVuu{dFiP8It+5blf{)CaJvXIHO8q}3-qMM7F_d<^XmWo0 zK7~bp{JCzN=2pNwqBg};0p(L8U;V3r(}i?s@#`FlSB|xKm}h!_xY1ki=@#U;RWP($ z-j!?mN2lgTop?4;O`$i|$U{t#l=pcgZM}vnp~* zX%1)KQhd)a?>x<)yhX7$J3VIqnc6GlyB)5Ywd-QgQ7;Z|bSdxOB+HH6ccv+WnE|NO6&?Y!stbZ9BZ=hCnO{1 zFvzS|tnvMxHbOD4xU7{hO+E(1V$Q4*r)J(i_`YX*oxXo35L5MX4>QCeV$nbc>)bp# z;;#M`dbcULLbtokfs$5GXpWfiGu63gL0F&UZQoLH6bB4s)#^mqN#33)0IGMd<@|ke z!QCu*#OmonsA!koTKN2g?0HefE~rz(orCY(_5_>ubGv?E=gr}3^*T)ZYzo$%>x!u5 zZps`gI0N~x@K?HPF;O#8oS6b1a=B&VTI6Ss_?}fP@9>&$jW}6NDsiTtJKn}jFMS|f zEGjh!0KPm8J{{T`e(nd$aKnz__C1a8jJm5%zM`%0IkeqEI&`dt_4^|(e^uzEv@Okm z!p@9D!tOj;{{Y%u*dl@N3EBUFS=u>hqOzGt3CzovgXDegS*#H07A$QVfBnPfvRM*JT?_iKel7_yG~7)`O#ZU5_f z6lzS2HaowU7qV@#kK$3o zX1)7I)$8h~xOebA-I{lUqJ=R(lS1eET2plOqJcwixu|JEF>l{Ab!(kdOV#m46H zu4_eIxytqlgcpoQ3&50uzhrRYPJe+A3-l>SgRaZ5Ljy2Cv0jhF+!cvlynlPAtj*b%7*rH z{r&C?yJ$}7-s=@6oANXXErUK=6O!@9FEizQ*s`*6SY@oAi6-{VA~4ER5>*n^N>u0) zFq8vjbHB5i20SU(uE=ye8ZKNHkpp~#?>jqQ?`#x8>)klx7JYAdh=o;qe4M3D!(xWh zR-7BX?>1?ZD3X=7CjbO@SALK69L8-11*)JGKLed z{9`ojBAVtwFY4(KvS=7Kbq7psvC41M_elF*b8=A;b%;e1t_PB4>s2XhOT^O;Ohp0T zlYSdV5aPC;%Z^+KSDf+kDNo$wurq=Bkqn|L<~|DPc`ibcZuE30 z>nsl^8(4(wRy)a7C6KFBf1(aaP9VrO99s``c(HI|v@;J3jm93VMxC)-c?0w9@ z0*vbx`Fx0ZyKvUaVp7?|>+HdM|B8r4Y?IAqd)nmyJ=ZAFaz``LR}7coaJ}xQ?|c4< zrE|)r7B9jJK{Zyo&QhV*D5KmMOiJmc{frep8Eo^l4nY!uydEgvFoWwgTBwcP@#$Y` zm*DGcCoe*<;%}mJ;lBXtmqcv*IHw2mBvOdy(Ef5kTe-CNzfVjJILze*2hxm%4OdQ5 zZ5$`^IrKRM 
z1DgWXXcVXTv17ywVvVm2sK<0q8dOw@blAiUl8Cw+$IX6xY3_*(*>{z=v0eLwcj`+h z08lB<9rZtCJ|J8pk*yv7x~U1sAkRJg-I1@!lf>a|Xf>An!#vP?$LG8<=7Q^Hm-{Fw zt=+GlDX;zh(0|l1cBN1%V}CKp3${H$N}!Y`qzoclAzc~KxO`%7_J9m!D__eE zlp98UNEi6(-{P?%;BlWnX))0d@W$WX>Jb<1@(PT0T62uZcr^vzAv$g{1e7Vu?^li9 zIvU!Ab}O7x{-z!H;pSh>k!=_T3~@1nojyolesXT$(x4tIr{67+tq?MBdjSGx^Dq6@ zm&WFkf4QixFXQLCyOX16eTT4~c}^X_RUviaIp1O#dUp-G8muQ;B;_!8CA{LE>wQkO z8fy!_RISu4~DrT=i|q`&9e54YAC`7VJF zk33DgTgnNV4ff{CzwlHFHLOW{S2jf8ALg>Li$%IomfYGM5|f~4bS~C-J)N{M+k9)e z9*AN2a9BTxc6DcT)OP#zCSK}>Bt_h_t`>qb$fd2ebS05IB3k8E;GC>=_%=r2J7s`Mft;F%sBUk1#klE1RmC>j$j47x^_P{|XwYIegTPJAxwy}X z23LN*ucfYc@0%tvZ<8Fwb2@|WBlLw?X6MoVj{ZKh`%1CvU;1B(jtn2)vfmNMVC~3+ z;w!WVplt_5ZUKZhNr?6Hl;MVK>5|k2gbk`iKb^{F$^6!%IZj91 zEBaN;ide{-JmL?JL8s=D`kY9PxXSmu(uuBbn~q{J1?tnXuL*L$STA&i3Eq6>hwDBQ zDtFbQs0*dEqqcTja{jJjh>@86F@rP~2VWjsrSdfBt!v4)O&YTiLEP9L21D*o?}B82 z3|izDCQAX*Cd#}BL~SoUJP$g!SAB2d zX$Mu?{cyl0%`*S=2ZPV(TOyIY#n``K{(lJ?h8PdZa*4I}%~BoN;0yaTSFGez2kBV_ z|Et}@FUn#L*{A~^-`CTVfTWu>lD+BgtW-|i$;W!oA}!QnQ8S*F@pR&FW<_cbhFdg` z7g{iiaeaFi2+yJ_c^Y@wE&C!|IV2oV`_eK>h9c4yqOes-WgpT_I@P3Ooeuk=`7GJvh0IMX2}4+Y(L-ai<(`Z z%CD^8B36kP2o5Xs%^aH4vJv!em5vm^??Oc{bO$I@@>$6J6Cf@H0x;`C_ii~2b<$87 zZv`&#j}j;7w0(00ssgA4r>{)2xUm<3VZcwS#{Qt!9D7t?iuxZHwoea|^q9dJu6yV= z?T-&AH^Xj*ul*cG;Z4dI?bmDFkN-ujMl@^bd%;=_s0i+d4Q?J<$=5Ig@359Vyd*_8 z29V&-5{H+$L}r!DF@k(luZ_N7{=r=SAU6R1&L`7o){xBf9Gp_g&`Xp26=-LX-I9vV zSXX`h4t_m{*?jk^!9lKe&)V@t`c91b?=L+rE=OIYC#JajBdu)XQ@uk^uniBAFg&tD zHsg$gc&U`@-q6n}`R~nDf;T+E?9@?QSahGFrf(z=d1qk=vW<|yPh3pOA@bJQGIw1|s}D08KG=h-iL|S= zWgF|V-XG9L<}va6iO+sb2jnE47UMFavL0qLDFH5XQCDRC#0-io3ucvpJr-?Y=c(>v z#P>~M6{IkDs&gnvQQ6}-=3CuI=H0PPAk0Z(fuWHTn;96Gom?Mikc>o*Z+%6C_xYmr zUG6rTX_HXyy@q|}>JJ}ok5yCn(rEN@7Ha7#Sej;d ze|?^mX`nbnAkrh0OUfG>yL&O@Zl3f^H>Kg%c>)p@NxEsFXQ5NT~3m9qcF+JLHQe63QjUf1W&5+p3x#9mxzq zug8|N8c-v}sLAhSQyz+AykkvGLM0uoKzl;Lk3FQPKI0)sq>MIR#n>m)-eY5|_jJ_2 zrnI#70}g!ejqmeSuArBbZ?R2aR{Kc`PM0xIkt;o}mpwnXCEkyfV123C)zZ(7ql7Q$ z)MTkQKRI!M`a+oOfaYB(fLy~RSSsDMyVo0|uAOD#OaFUCR6n?>e*0GsDs! 
zI}G<2xUrAN8lYdh{MM$nV8f=UUu=W_L5SfPXCHRLWgPjvvT`LW?NCq$Jdjn7dEDym6UHIib)Hp|ZzH z6hfVEyK+PQnXXc87005)g+h@jpt4uo6c(C*z=u}8MX@D%?$=Y`Wn!{uDI+DO%RqH6 zvs}Oq7I%n3RoIqdtrHcMc-8%xyvY$!Uz%M!jf&F6J#7e5F)2*AJSKqf_6LNXXi07sn^Dc#8Um;rs#OIu3hfS={YNhp^cbD8Ne?X`JzN*kr-mw zT)zPTR;w0{z4FdnL0H15?DxKrPu$)2O3%UK@MyG9)3N zm=LXfPeUz!gTQf#W;)CRJT_Y-{3EAkJQN!h{3ga~dHfX-x9hZWR0;FBY%DfILov!DjhcpSZm z5K&jSN-7$oL_-_!oxm4=>=u`wl`}eb*)qY_WfJ;c;t}uIxxG#6J56nVBJ8`tpxd%T zbb3YeY3pC#`{S8^|1~k6@Lw8KYzz$`n$u?sfK$y^*&64h^M`z~!AqLZ4^mL|7%Jmq zoLlGT5YiBdRhHpYI3C_-A@rxbCLW?P(%=X#L8NDf@Sf2QZ(Oi2UO9fgwW z5GAA8?HKTeW60`nXr|@42)73M83yDMB$mZXTy)$!-a`C}?;chr-vRy#wFLnK z%>a8giM0if*P4#k+CwPl7tX4CjrW-hY&h)!`ux6c?%kDhEv(~fx2SE}2>K_1@m(UK zCujQ{jJ1Yj4y-Hl-L;)o;#jJj-m!0bZGJ5LcJ$Ac@OZC?Q1|+AWhOA4t;jL1^&-LC z)KE0XcQXv2_VB6CnPLGhSCIQ;N1j7mx>plT>o!Xb}de=G$bezk=1QederB8(S zz5F_ec;XXT3))LDHRuDdC|DP^*Ad!q-nT2 z8+Lg-=%?{gvKm>-JZ!A(M{B>or}XFrt#J5Vt>ZZBQ z=YIcj?Py28;Gw$+d(9IHpdc_L+kNUsL7U$}bgha1EujK=>E#DdEefx)i$DaB3nE6n z66$Jy(Zy9i=rP$AIV+m!Rfh6(V|uQyjnLB^=)}wJTtx0T!Hmi! z^@|^#Z7x2$&JyH~3RZg(&SZxS>Qa$#=+KwOgQ|7lUxN9dp$rxS$GHjcj_=F_(Rfo|OHx=B z(j*VwFUf;f6UMw~hb^3m;>H5Me)4>-F4b7{ zVS!fmE$uD=yTbYO2A!?w$ofmNKTj*d7J7X^4#3dHRetw|3ERa$ng8|zNN=6Y^-f`R zeQ%>P9l13F6@6~>K(BLJ>XY2~2AdyqydF>^*LEIdKTsiqswWQlj}a96%VCQ}5AGnH zw)KEr4UFLpOZ}~T+IHo(Yuc+p(BnKuUb9G(a6c>SxQ33XhG?iR)d4wp zQ&P3K~n{CStZ zzR>!f!Y`&aR2|^Z!A*g=K?=^aA{MH#oaH3S&a6*a{vaRM_PYb{)3ZAJWhS@PdL0jp z%VT;;FW00(;rZ~+EIMsYcN6iGR=~$?< z&DXF0%4gQ|WAl$;=q0cBh0c7fZ`s|EwbZX$U#GNm--qC{GaT5!jsr<8DAws|WEJLP z=KHPd{7Sy(<|#2c(b=)f`a0_IA5toz^QS$mw83@$Ip1DUjhpZ@G07|A)$-2cO2Dgy zS}mS(Kc*N5qk0j(DvD*K8MN6*;jt={P}=aMEsn?i#iwl~q}D>a%uaz0Fj1=*wzR>y zNmx`V7cSUHB)6kGP}w{=stivZm2GUJ%a06qLlyn9i#>0xy$}A)&Ej7+aP9e}eGl!B z9QbYLJ`Dc3&79WN=U`;#JFXS#ZIwLf5KyXF(^tx`?L_pI_viKByyh0msnc&9%Kuuc zYK&AbWK22jmcbku!ArsL_1;W_hPHdFKWvo{45Y%Fp_9f?*h+HLarJw5AmBW77Vy6Sd%RX^{<6nx`;r0O)bQW z9*&`vAv7urANI!k*aFNF`8^`vFn>8v76#_FEQV3=g-gCzg1T&wU0T#h=7ec8m1zFL zq|ScKnA^XpcLQ)p-AtpM4KI?t-#lU}PCQy}9s;$)6CDpjS6GhkK*oa#pWz>FRm4Q? 
zK?91JX|&UekGGsivEh!Z@b(EP)oGPMUaxm^*8Xtg=D0q;xuyE>x!Az*&nF7WYQGar zcY1k9b`%wTl){HD1*u01y0<@n--tSjOhGrXX%kg!9#YVvX+a+o=%YBfHkzVPmR87! z*8S7xcbW`~P8SZ@zcsB2#|l{4T~P2ACT!A9_i=XH^xYSm+*f z)zUZgwzM=qh~nt&!37b+EzT-HuMf$n?3?O#%2c2?z-6TpIS5ZQ}2JKC-k= zppo<%Je0bQE;WWHFy*8~(g=NtO!u6b=L61A@VQP{7uL@c)cSf`H%BgyQ})dZC!Rmug^0L9+T|3GHKG-}2%q0`xCpSF>Z&4< zwj<~t2BPzJIXqw)Vt3OCO71wDhV%TPi66A1Dl8{wHqtIZgY0kYop-+{KaYu+ z-CdwJoO_Lz%cBk0gkWoHV9?~yl??g}tt}AHh_S>-Rlh%aykB;NWSm2YdE?r8|kkmnY0E|tcr-C`PYcdb1PrCRk; zI`VUGU8*rDqnY(R66Uj6l5YHbDJx1lm^N=6nW#qp?D;lBeBuN{!P;+}M7jA7ymmQ< zPJ|$(#MM_K zi!A5IhWY*STeEyF6DB=iEXN035{paSO^fSTC>h-@j4oLJNJt%r=&hV01=0h7mAojoA~DbKE}q*%l!}b5+PX?@a#L$*`+Wf(XGv;9G7O`N>NcL@6tl#UzT-K zqQhsUvHupw19UYwX*H0b>aC~6Y54jI(r);R!~4CEy$*AGV`4u+9s%)zMnS7mffDEy zNns{OZePr-eh~Q(TE~z*$@Ad&C?za3LkK#P2bQ_Y%irN)-4<9>7lFUSfa9Og`17A* zV&hVc5fy$>79Q)P`tiP)cw;@xh$(K;!+0$(@AQkG!bS$gm;6@>RCHP>K=Pq19Y=E3 zm2oW#ag|ir5Dt0qy)`0-lc%S5Z>8J2@IsnZy@_yJl~vBF8#v#TFo9ye9|{fR6|nJ( zta&3q+MYY($h*j$i#R-?M7q}}s2&&V!4p%pvk4nZREtdpJXdBO@Rr%(9));kF9N1) zr+>qtp9hgi7t3*Oe4JicL6U3n-)WYc2SKf0pG7AP?zxC>F}E9WwKtfjaJX9pl^WC- z#ys9Ght^*Gp}!r}`^wYWN*upL~0Z3*LWm*s3bqsnbYHvo3rZ%$a>l*IT0sa?h`Zn!hS1 zZF}#XcR^pPZT)#gK6%1Hy;MW`Q!xQ>Y@yZ?0W8K&FaMzxr*2r@@;bHEInEj+Nf(g> zV8o&IQKvDgHF!7~SFhDhBIchGaJa+t7%eg4I_L6fOus$T2z0!(7ebdUU)VtbW!>U^ zpz7t*Kk@fz6IZCnT!sz_XNH>>a2VGun=RTW4R-Z|ep#~D@154A(S9m&$i)~)`2j2h ziG8pdC8y^vjEZ}>_k00RHq#7XfPzdb&;-}7W@`ZUqP{PPI!~kD7z%?aRK3jgiDp{} z7LHGYd2TI6)q9cDHrHC6!eV1lUiwT~IGBNi~lB6UP2rfg!@@PU%h@_ebfyRD{LGMksAUF9Bz) zWsDS?SL|m$Xn+?y>%pkiYq()h!{QDaZ!yxNz=zVGW!k-oIT8^5Go|U~C4K+fLwVTf zR7X`exm{CvdbqVdLA4rJt)cW)7!$s((FcbL^O=0&ko|@x%6m$(GP9f+2Jc7f=q)8m zGEbg;t=5j@AIP9Mxw6-VEK1<|_{|f_56oOl17$F?;~kTVFlHZheYKoN>Zh_ij11vn znC5&4v$OOqLj23|7gw*+U$9^tb_r@cU}m8D-m8;g8a+D=C2&$fwmZfl1XxU@IIW^# z5SUX%37$dY6}Paw)lPdhBeoQ#ZZGG#<2uUnvuTeMAt{_NBX^=br+odZqk^;u^M=T# zqv>Z)8;6k3$$juII59r64><6L^`i-enXGR1Q+6K&r*CRFhwo*zKPoKMJse$ob{OUIafX|Wf;iO+D&t& zkAHjJwNX>7WmzOn&1UYCZ#@lTChoG}89DfY`6<5onW+@2F8kVReROFBW^qgJ1vY`{ 
zKFU_nC`Ht?3C6FUIolV+OCvCXXA9eT(@s!D?t&p~Y^sU}_dJ zC(ftlJHKEQHShZ|aqAER9<4DG&&m}j zku%V??HSw*Ov8p|UMf>A`aV7}TSQ%gmU=(aCq5kC3~hJka24~E^@{>Bt5PsAD^o;I zPj+WnH{jAFl%__53R$$ML`==~*Xg71swhwlkitx57&}zRWfN^L3RX)Tu9X4{9<8ts zYn4vVfiDI%tD%F_l#5#B3t7TVMV8gGF#@m+GMw^>-J&zE3F-4fuoE2BGNP+-uq25U6i;AEU-=QQhnmc3=eA`FHc{5PnN^Vc~eO^;VV!i5<$>iql4%m(Zemn5b5+51!6k79%6 z%eT;MdEtR>5mHWHJ(|uc&t=#7hDNoR)fFItt)n;<;xaaDGi2zyLy%?H+$+AX7bZ(< z1aVd7AxL@-t!ijo`y=!H0>=ll~no`|*HLVH{z5PP1viL$R`&dnp%$3?{M3Do{cTfhzbN+b>^5Pj$^BLrIycC5y z`=8%+K54T;G^r)~oGriYb_c_WLYO6!!0uV@t98NR=d}kYUaAGhothypeXcnIh z>-04I`QdPnxWv6%5l($cu_#e~dQKXY+|PKKlBFL798$)jE)Nn(fKe}g($Sm(c&wpL zcajU2RB`qkh*d>-<(98j4w^0M^?>WBh-PPoHQw{pthDsfoaqinX9r;njA$?q+t4hk zNKqJ9o{C}$V9TeO>2r=!B>j_MgN#kYVI+@3?bNgpfj`4E?l)czj&d0`o3Tgl^1aY^ zI9YcLjs5LSNx)~&dZ)mUr#&P_O;zQBp{-gW9hWMv*YwtR;Dm_pIRTJb$UAQ)DAdGZ zq%xb&Vvc85*d&Y5S1>ociW}DlknWk8u=xZC)!A`f7=I!W|K}2d&1SbdN;yf$K zA2E#M=HgErSX6dusZ=v~g?IO5HEs%-HhFQ9_em=5(FTj8uz@1Z*RgiX)pD6Dm2a-N zty{k?i_QC+OX7nkffY&`jRt@$R0B=gSuJs7yis*k*UbrfP3ubfstRG_SH z!M_)k2KsZ^rwqZ2hd}I>{}|Xm{~is!sOn{IWKkzse!jJg^%+5+rz9P3ESF&DgYvv0LvOCYXbZb^LAA19 z)&|Am6Z7pv!Tc6p7V5$i;RPKW6ip)mEKzn_y5_5AF9qV zIMVg&`f(=7#5N|*#I|kQo^;2yGqLStV%xTD+Y?)F&i|a}!&~{*Rq3wky6)ZA-ut)K za?lk*q71~TCb>FiCO^&{?Hz3ahmIOnwrbp2rY>8w9tb-DK9#((krp)>1`(6faz6}b zb-Y$noQd&jwapUnwh|i#{DD_HI0bFi@8X-CZfd%-I*z0W1h+m8LjUbmztQGGf3i_V z@ep%{{j1SK;xdc3rnfgvhE4j`incOzsVCbRo$SCarIJXV8q;JqkbXy1PA)ClAp5|p zwh3lvlYD+^Hy1~ivPLMM&R`_I-~+e=p89&-w1x8U1J+=6(q1rCv&;R zMch>uFO2<1cJ6J9*FPoX;hDz%^_89kkLYWqAafSde|KI-ewxcfOn$yLwcLn4#Qdb% z!xm{{#m>+OOm6rMy`po34Hx?%GYe=8Xg$wYUOW0dh?{`JkMVAAW>J=ojW&mW5IRoX3g?qpzGkWS;GL^-;VQ| z3S>I~g)uHdZ-f=W+LI}cF`n_7H6V8;OUNezw{MwE=q&WNCF)@GV#n$808STLwT4*5 zH>osN@_zcI<7`Y9!0TU^t}(c&|G4bHYhN-=;M=q!^>}7D@)%LJl0!xf|Ep5H!;&?V z`#A(Qw;{=o=D#iDLCrcOHC|_GvSR#wq{oGkH{*x08j<1jw6RDVHD2j>LUHbJ7Cc%s znKex5Bgets@4}J&j$56xMe_PyNP2|pF1F#ngFE57A7it8Js!@3_MPZY{5+xvHaw){ zqn0dOE*53Q@%|hr+D|?*A8YDHsm?_jc5TIy- 
z{k9{-n)dk;E!|xwco&I#(j#H+1GMg61jG)&cf?LoZ%ow5HO8)$8e!*c1F%-!+c`Sj zRs+!&TWm)AKN0)=eSM!3UxJR1{2%cY*~zg*5p>eQdsTQT+Wle%Az4R%p01`oHOsQ z$2DhwI%#g(o?*mBO|8EZ`(g^#uSjQgnIw*reZzR3a-na)MK;M@5g6z=Y%OD<<9qKn zz*V<5UdsMo@d{{Lgr<$zMNZ2vV-9AJ+NJZ8bOv9Ge1HMRS%I! zFJB>p8cSCWtx`IJ0wX3L7`*5U9fPShQ8Uhi(P#MTlr^)#3LjoxgP9tRsceGF2vS(B zlD_mXO61055_I@Gpq+&8yeBWewpWqX+#imq=L)0d`;5~!L7aCHO(9{H-hV5v4;n*z zM;g=s#?-=r=}ioi*W0jpyf0S4h31dabA!NI(4iTOL56VigRk0-P*Wj|GW29nKr*P= zZbj9I-#Z1wp2J#uFfOPasso3P)&=1HPTY+E{y_SA6>BM&xf1!G2MI`NWU#f}+Bnci zLh|fE($8jdlL%mK6g|xMTB-H?mlc$r1b3!TlavouRnaD=9k#r0k84!u=-%~~F0 zwQgZVL!_No9aAuA(Y^xZ3xr<0SnWg)JvFZm!wgj=SD!KSPh^OnFL$|EErHP{7fpnE znZbx^@w6pqf;%aE#BYD>_JT$^RWl~&vBSYMe#AG-P|MK!`Ino)|2hl2fdbfZ_%RAi z4Sgy3d#lUGrwwHY*s{py^>R#oL#(05XL5{$&C?%6TcMuNZAOGgl;r})>TK|+Q zExnQEbHBY^P=qct0BSoyWyBmjQhyo~$>2JD{t< zTwORGwzXZmCAQC3CldQ)H6(`;Lws>pHp^TqJL}Nh8c$PbB0Fqx;Rr6Yt&V2#)tMRh zX&OC@(SN7|*$tc^)dkpC?gmEpp~@frmX)=?=JJBTr1k{77%%LDVDmH+#*KoTfVif} z+?=E?urU^hH{i!prSzWsjB>?DpG9ZpX;ju7c@+4zh$#1(iOzK+`gQZ{kz?SR z+j$Z`w=iH%!;BG#X_Pj^-fIb%gK$P!YmS7BKH_e?Kc2xat4E%DIzyfBc)b$>?2TFv zG7oa}vy(=_+#1xb7-{>gmp>$b^$b56@8NQ(`l%w-3D$b8?(mFpt+G4sVQr4n+o}A`{ayPzC!m`VtjJX>u}V>+HXdA$TDIf#p*#!s6}5m z;BLU{TQnc&p%;w0KVo->c-IG3x*}rtyZGk^xFaYc@aXoFrRV$SY;=firYHWLl_7Z+ zqTM-^kUsYlhNGwfNMV=T6S>d4IixHlhZj#dcS1=#{{B;Rnj*IXm9?AE(D*}%y@k@L zxGO%5+14n3)iJXE&wJMRi4p~hkO)XcD^P}}|elwloVpU7LZzoQHhZ#nuxu$lwvkt9tZh)O2ON-3Ep zD79L2XVm;*__{$lxw4p1?>0I8IKK1z`?9*m?Na83$KCJ8A9(qp#r?<2X27o@b)h-WEyda$qz*zjI2-9oGK8v?gDuDY3A+WPHGwV05P zxb7P{wx!@;m^|^gdJS#3sH#4wB3atuT^}E5xNQG5fv5kBwa#A2m208p1Q2btw(Fp( zEX^S0(%sP~kir^JEoo;^7cqJc-J7YSVNgz&MyeiA3v;+$bHA_oRhn@`W!n7;XuKW7 zR!$+Niu4qO7Th(mOKIqf)qwt1M_mAgeUa=H7aV{t+4?6sB4bjM(b5LAh)A9zDsV_W z-8SZ1eQ|=t914~~0S{y&pUB788ca`u;OdbBD7V#x&H+3CXWM+UgQ!&~h&sA*z3oK^W36A%Wck2oF@6O_s0R956CLZLT-gc~?CtA5 zpB3PPS#~_FB-Y5{bEq&3)T!U%>LZf(Bqk?fkmwL12XFm_sUg5F z6TnwrasvWwiCbI&9s*y5iY6`!7hz^kHDi$ZCZY$e6}N+yFN5e1a_FkAzOy75v3IBK zx+D&|WYcSwJ=Z}#p-zI6J%a2^F#ik7D*GqoMvuXLvb<;LoMhTr4)y6x-uk9M=IrlA 
zzNq4NfP1#s=^vV0D&h7@Ny;@*T-|=FCjQgXyOW0#R*uMC1n@f)MbN9WsP=wkHd)6u z+sd!&Ujy{hN4U)_1lwCpD9~GWvuTLHCV$C7$uB<>xk5o`&Q+|VJJAT&O{eSrHN!Z4 z2(p4#15XzKhmmeLvViUH#b8#0=)Xx}sAE>Q4CU#FWWx0&T+dcyq^&{O^T=tyS<_+o zKkR%8d>vy)RGHy}VfWMzb%^=hD1ifhVBM%^@Vf^H{;|Y(Tp^RA$hR{cS%3$18Iggs z0yUwbaLpn#!^JbYJO1Vt<_CQCUaD#wawB1>6Q@ni7W8C;_G z0jld&al`fpE53sF^9;&4L-oSd?}#i^GaUfb*#8EZGz11bBmZJxs*pi{rsFFL}PcqiMXH7TBnkRMb)Q!tTLPg|<*+M!osL!=H?@RF7Dy(-hkfC+uR z=W{#XLU1~U+M(dTZIc8~jJgoTvZm@VESlDvhK=>%!NUP%bOu zcp7u1auCmEj#WxCU&*$MvPmtW*{}q|rFso@nj`b7uqaS=MTP4`mdADxD%4J5x(!L4 zT~WG-9hV2jOARwBl=v&EniCEML8Bs`feGJ`uz#n!6he-LlaZP0x*0(qAtxG&n1-8l zLY?AP%ME*x4@HeVFkB&x5h?!+%W+pSLzbe1hKK5pBd|XtZ)tfph|Bn&Cp-jt_L0Qk z{q0sP(zVk@*?oavHp9B1b58g>Op~Z;?kX_R&Z}6r0<1<+AL9m6oy?4rY#E*Uxb&(7 zEX!7e{DO@CC!KzB7r2D*#549B>LDAz3#WC%hJsBwSW~yq5bHO)Tl_`< zQqe0JUy*@2#24Q7A39EmJme5SN&Eh#N-Y^qHcj_TB<>eXw_miQZ7 zaUGB$EDgbY7o?p$Vy)uOrQw>^u!Ol_>zULpEA*evK9!Q6M8p%sXuyRB);WVIvO%gg z3i%3&EKv3r)uTB-Bv2GA1?bW(Rk&@N^S`EgUIh>)kn@=GCLitkT@lGI@e`CWSFwaaW!2jwGR3?M6mTBL%I6hls2vg3&5s@ z__9)Uptz$XK!s2zbL)N=@vi2OgJ^(Gkg|3W6jpo;eHiQSBffafvs#i-PMp-p*U15| z=s|yG%SgV~3IN6VUlv%@MO+5MEQ;p?8vO>1LIEH;WU@NdK^=9)wB1xqKd7ogP8_x< zSGZwjwO9k-0KxylTw1pRdN3PlAxyj;Ieom_;9hvoXLsA4e&dn?Op+FK39~eM zBpGBVcI_;KRscQE!;lFWMN|KpNOXf77dYMcrvM^<^ey_-DgCg)77vY}i^j7H>8dnA z2SYVyRTI^SxA0cjA=PxgPEI^;GMmRXbM|w@y3gnP&FWI88ePkN9NU=b;5_)qE2C(# z!4b@*_UzreJ~-ill9mJKT0qU{-bFDfNR<4YNK(Qf1Am$6W|A?k+uRh<>h&!?H?<>F z+92{n-xQ;2KksdWU*kZ;YU}W&XCO_%M$N=I{Xk=JE#BHV-Bq#??q1%>?;6=XRJ;1R zhULR0lvBL5yhrALM#G$PcgMq7D?a9IGRbz`x~ zdtee8Qh>e8FCs11_*brg>Q^p#KBW3_t%XrSP819*rJgv(g>w4W%^MQ80gT@Un`q1~ zbi*!)!D*FXnl0@_7`BA`Ni;gcaNmENhvLICH2ox#MVmJU^e^!;xf4668V-1 zur?8Gs)@5O;R?;%cMJV?hC~y5J@Jc4y5Y;7^QBA{KAo7G{kfW(?j&fN#LG{^vhCE8 z3!2x5k!JM{S-PzCX}*>K?fvAf7TLyDmFN+sOWC^X3$FOkSg7yXzrRVQWL*Si8*V{b-7szAnGvFWOk-zs4198bNGz{5^8!BjRX$8T^JNScKg|`Izl@7ErdwAEHnB3N{;Mb`H=ZeDjSU)7eo86o!jy!jv zlwfFG>*O%hsXUW_l58Jmcxp38CQxl=-m=yOqwgcd7yQL+9iGKG4->fI3~F(`!nX_a 
zpdKYbjZB;sLEj-d5W{eThP$Oj{0m%TjM1gLQ}HHC$6{h3%=CQdhWfx|?bo_=$bH;y z9nQ{5FFQ)#C66z{_p2(~l`w|r_({v6>RDkr^g5lM+YVYOxpl2S=kn!+Z1mle*YEcU zPBwK-K0mCPjQf>$Z#lc%4yQn6BLTDn* z2D(92o)DFnP^S8Z1B>%$IJZmEO2Hxv>pD$k3W;O}bOzoLxbB+7;!R>Fw4g5UpXHjc zx#A1*=PVam4;aUt)bfp}WYaiswDeHX^#SjFH@-I{a^EgIf^3C+=*jH7poq%YwP3g= zs?3~a!+46R1bC~BwrWKJ6@%^H4BQFzir66CpJalmW9{_3D<=@%-~1K#(&MEY?)TW+ zfYCq|Bd>({0V~StiUGEKD+Pa~Uy=L?HdcUu@3K?_jio^1MM$*=du&KRf?k6XsL z9SITe(=f{69M%U9wpVQ+WtW<1y18_bwHkZ@i6NNXA%HHdR8~;R@bBgLb-!a`{WR21 zr$X|m5evyvXW3`ELNKz5)HGO zt&AfwGJYbsF5g2Yy#F@H=sTVCb|lG4Sw;12Z(|n9GuuBV4eqYpinRkrWVq6Lv6l)NTN1(!nWjp4cQ&2`l$I_@(b3Ww%IF%Di1Y zdo+z~jT1cSB$%e|o%>}3@=YR4ZI~kB=_A@wE2JXWvLTbRbNPiq6?jtNVV{C*lX*yz z0_Xk9z$T4dZ7f3o#T_8KGR6JKT+tC0YbJ^Z*|(uDt_)P??=vy=ge;kdKcisz8YQb( zza>*xO^$PYLn_Bu+5nSQC2d+e^5ZzM&ek~e-sI^<4Dw;{N5&xpqZ@>sRq8}Tuy>-l zit8YaV_QWmLMc_}pX*LKmxa%RK>WPokFJR%CoOaZFgxO`4fpRhtXVJ5w!r@+`&j>X z`{_!+Q|4AY6(=DzhWe5zSY#L_)Y?KROi~3w4w9len$X7OlyZv! zMF?N6KAzsrp9K|PIb=Z`%&E-=JkK#2O+eamt0K;j(?a*nG=VZp6+se(cEnRVs;k>o z1I+lL&V^7rVd(4n9;UIj{ra?)~8 zwmsS8R0uzI-&MZe7bHRU<{P-Y-bHdNn0|sf$uONfK_4=8Uoh1qSHAEZ;EL{i$gLl8 zt#=fpbYWp6C1>UF!>ZsLg>|-4+nxOD{85jeQCtFr;nqVv?vw211BD*1n_QM34EaluW zBhC(t(DsE!JPia&HaL#HdRaq4i$lTENF|+3t(yG+(LSMu|HoGg7W%zB(H4|%Djxwi zD4;?^5zo@Fpbl0@?pdJ>Hu1Iwn>7}zOQVLeY4e8we@tsK>#vQbRL_}~NqO{~Io{%X zb&2gg=P8}Ua_<1NPng>mF2Sznfxnu~kIV;7f-IV`l#LKv0|wxCX)KP^6ZNO1M+VV$ z)o_%v-l`C@&On{^D6#4p3->60ka?Xfdrmq63l3@PjYiF^dwhsC=fo^o#F6$H9~G$w zz1KZ5`-}CR8-PimlkV%N!jvF8>uAzNq&-O2)v=h;lpj-du9|zbBF6jf;%q^t#~~V! 
zoz*&dqqM8+4xdXw$7!_v#?NE6P)Cv7&r1OZ7uW7Q4!pWpzpt9>L15!S)n2>QX|jOn z$I*#W^cMtJ;j#h)w0(`VO3<$A(D|_Ub{LuJdb^*9pO!@&75_MvqVgG+dC;zM{}DIc zz^66gtPxVbp62uNNRU~ctW@!q2bme`1Yek4xTG-iUZuY7R`BMGrMW7 zl}u}N2}*;4v*D#`luQ1YJ58ad$cXg0eQ-s|{iL;TpJA35iQJ&!9Tgi?C4mB!F@NUU zkzMmDc{r}sd7_!=sQ7arVTC7=DL5ozu87jrBrY5i|EWP&*V01{9RtyKZ;Z`T=ULr1_N z4KbOCmCKu&j>75&TTr+-xqwW2;Ih15p7>CL<9b6(9Hh%JUkdJsa1a>Y&xTDoyiXrr z5tz%=twUR$;;ViJ4~i^3S;`_O!{%^bOP=rpu^~5Cj_NL6hjpzLL=9%UI}1Av?kIUq zVD;2OKxJs18T6YXrhetL$ruuO!4jB)n;YwS$dZ10|6K@K-NlPPQy7-{Rmi>Jt?@%C zujS`2rnO)-?F8cNs2b>2slM*qgAgz_{FsRXy`r&mP37unURRotl|p#LQTWe0%a+5| zL+q-P+se4Ek8Sb&3(Rb#a!0;qy}z_Idc*-u>vY*4>0NR18(QZmv`pfBpL_IU`V;Az z!>}mJT~sh*K}Bk8Hx^p7P3vDPoYS~cJa?N48aO@2i9){47Rjmzx>@uSWROlzN-)4`KA zbhVWmWv%|D$bQE3l1@eLvlig-77*uStI)h3glKnHNpN3DEd%1&9}UQW4b&9qFYrC1 zpbJvYqFz_)6q+QrryE)g-&?NqTRH>XJVS{-W6(@@SByNXU!{m0l+Cz+(JdyK4DY+s z-sPS=jOz_eky~?gYPf}h((RDQklh8N8E3-hS|O9bpf%n23w;&<$f1FQ$cq?Lvsy}A ztbkQb+;_h6?J`>-#=ijVD@O!x5O>ec=__5SbfuYMm?Of|Y5QbF5)4^h}vW?T(Isx__T zcZUikooS73$hU_b=xOp3PZkcOeDZvB&qC>=SuAG9Z^9GrnInCiE#!u|A^C(&DrP&d zW=9HWDAU88)DzjN-MpzAhzY-2+m^%3w@P}|zD$>NLGDYfS-ZJR{8NK;iwS&)h=#j`kb+JrsWw0 z$5WCLZ@-ZT?I~nK!>eU7(Z9FrxBvb(yAMJ&WqOm{L;8(&CX4{yUT?7>Y%WsG8nnoW zBhd)LP12RFX2LjSX*0h`m%Z)EJ9EhDrn*`;|Gy8(MXSvHkvDwQt|x8;55iL-0Uw15 zbN@W$qsRA96Fu@3?ITZGS4;cPs*m0fv7R=t@Os%`GCXY}(!Vyv#4E}hI75EfH$F{J z`j+Gi2?HTJKj_yRL={{>FdXp<45#`%h*)$brh1Vv(vvs{XHZg_2l}8;r|dAR+5a8@r@1Gx>X}+wQSD3=%QvwLG?M)q6|*i-9YGwf-3NTp>V8 zPo^l#L}fUT-#MfEWYx)10qh>?{EQbyXtkS(WGW7VaLje+e481vi6tjm{P`2u zt5^+jTze~g(}qpC>BrXrdvH9E4s1psr zFKcriG486} z0;6k5)ohW|j_cyRQTk9#jyNQKvT?fG$5ZCk^Q6n@5#J*xDHVtJe+GWGByPUOp~*&` zH75+p4G$FJ2!;``U3bdPM@}mWAvFxOAhs&yjEyIFt)(ROiS*PAdRaiI$ynqFJo@|Y zc8&M}!>EkYZwxE=-_-^ie~E{5f~9%JVx>5VGP|beJV4Y*AL9#R>9xRz00(9?TZuN$ z2*%s7(CYt;#nLqo`2ML(2`1g1Y$luuU@+~Sfu#2xVGfO)A#h!gbX(2!0yA6t^j-Sv z+S|hkaMzekXTeG!$d%;ofI!~_sI}=W#g~@#P6RUVVtM2!Vr(jJ-sHLc{T1$cyJ{un zEb}>ie>42@MVa1Gx4gfUNwLKKt?4l8!AGX^rP)90YF4)XyuN3u5`a49YO#S-9@J>b 
z*xy~w;&TZ$5Ud;SDY$W|+mtP)bf`>RBh(~ISEr4k*Wb}>7U|l|tT+%hr+{dZ+Iv>pxZN{Nwl4Gu0w!*2{Y7N_gc%^9YoD+1BII zf3E&gp*&cklP`BT~_T1#z?t8Uk>uxi}56f@&0{ps*b56L^ze^^;7pthFJR;e&?APR$ z+nhesZ&5ML(@3qDFFmD17XEP{xxK6lfwF^^Je&ZKyAHL0T08K2{BkI^e?CYagT&OC zrfV7ZtS6Az+7mA;!@t=C-dx+MaM$xdFHVO)o;gV>hkdUeWbKX>A!74)WUXZ_N;c6E zO{}P-B@>#JKqsv}Q|Cr)ml7klL6usIytztcJoGZfB=1wp2)9qj108pLGZKGwc< z%idI~>r-M*d1QO~EIj^+V@j&v&Qd)@Pt;jAEs|Ayo3-&4tAsnPZ{2lYaUS9y9bh&b zdtyEaVE1Z^Ti+!0hZl$*}1rVygL3UQhx{Rr*k@BBY60LbjI0{$>WdcY*R_+%QMv%%&^&f5i~`N$E8a-K-f zfPG)lX5^lpo5`iRX`#)Jjsw0H_vIF+CaBzF`0Q3C>!(RB&CjQ;;o<3(J+5=B0bc%uui(A2o`=Lbu{!EeGSoljGouP29yvT2$=c#8*yc!6 zV~)9rWfo$U=0$sRt*59($nwiy7D5TPq7cjYM@+dq1m96wbkWVzwD5Yw zL~y9GlnIDBG^2T$L?A(^$0r;)ND@bEwRx|Sn%YHO_wDNmD7z5HD)Q%bFY7!KeraPK zsn*+)O-S57;b$){Y-Wn_7}A4AQ64#X7F-yWvo+H_jB|tvxaQ&u7JU*WEXi zAEQcMM_RkFJSY}SIA~zEQ7`|D+8q!@WnnsuT0F7qV^HU4edWU35WL-@#<-Ca*bx?< z%R*@{ST%AQ_QX{rp2JaUP`7V~*9iuGf9%)z!brKO?|cetwxlp`xX}_=kz3&tckNzG zSPUmy*&IbPw?Zh5#kaCQ+ABP~d@KK)_#NtPXi0_n#q7s`tn@UxR%Lw*v3caKQBN=q z9=1~k&-7PjAaP14A$D(?Of`v;+B(*sP)5%-eh$+Le< z%V~K$U~jcBQS?U3X(D#5>(zycbA-ehpG7Q1%Q}ld7>s*lK=CaCyU%u$XzP8;G~9H@ ztmP~z{f`}CivkW|)olh_?0qzk%Bj_)ThYRLO*h)XZRoMz{)Gccno9y3Yh~=#xV)oG zmsICl6tDfn5VegvAn*F@)6=Er_d7=+eU^TEOVLy!)k~Wn?M)(kV}dW83Q@t&TTCB8 zv#gPQr~LJ&EF2gMZ&Q``db*F4^_%U_7(_WOGTKX5X7xhBhKqvu<*YOXe;|EkhrU<* zT`$g2&_^Uj6S*K$8X0Sd-_Y$58EcCjfM*-rP4`g?B47BzEIT;3(HMuE`XC516 zK4ZH!f!ttI9LCZU$knC`Omz8RqneWWDmzQyPFq+`o?FL210Rozc(!i-S} zfLwcw3f7i07=43s`bsqfWBNde!VWD|d8`Itke^`!M57&+1yGh@PL;HHtoR$zCI1~^ zQBL?Her9YxpJ&RNk1yfZGKk9xozZ#vcb_o)_0T^1m!#eG{%?>Nk{~gehNz>$1cDYN zT9}xyMj~XUpds9B>ijjq^4pVt?QBY$)C{!g4E^VqiGlhOi!?`(4nnp^DXjuPtD`t$ zyYtj%)GZCuL2h2xLwf6vWOXj$Y(%7dl7pRnVg^fTGT!=?)w5h3-g41??HG-e)Wps# zw+US`{m)BBhR601vdMLhYu@f}H+i(O^0d@G0rnp)OvW17hf8e-&Mj;o1O`!_A00cPQ93@aKWc$%Y?@vB&Ml=c+=1Pz&Bqu#rYy1Pk8YZ^z$!eq zx8LWwK1DjQG2i=b;Gwd<1k7DJovJHY=9^oe8fq0{GmuG#EU#A5qoU z9LY1kjbWB0RCd5@Mic5bCCT#AiYaFl04u?D@;O`0=RV(|Ey>6U>9=PR+7$BiI=>P8SYD*22B5Sl8HuY$O&kU5E>9N#WqU 
zFdwS|@N)qB5h`ujlo|s@e*rP3MXLcW><6#oq*ofNQ(36qoS+QFMtADlBJP4yu=hic zNi6{^ICR#e3`Tt7#;Kq#&pKMS%6Sm{jpl(y{RO=sma+(`ub#%NHDvjNsPk_7pXxn( zFCHFSh0-a+_}e0@zop0TIqUa{{E3>$jd8wFE#+=Z4tW<-H9Iw;K{x{H9lNFz7}RVP z-qjh!VztlfRPeL)opIsmF2OSW)I%3ENeK?X0)kVP!Y&z8(Z*r_+0)Y#`^HNR1#y;q z`L7X2vAU3rYyBJTNISFH==5JxJpH)WPH(R_qOTjG$A0-=e)gL~S49KC9X3#zhd_J*_-G~D+>(k=@e=3e`M;#7H@yd55DN|J0o z%vx%7)uT{O<3BuT+RlX>j4Lj{`$-iZ4cgrcRP1ynfdUWm0orafo`A)llj_O2!73iE zWxMDr%Mk)d2HZV%7|D(KTA7+!(kpf(QqnPqi4zE!hce~(P{r>^2SR>Z>B!l`5J^#G z?R8OeDXgZ5ju~Ij+Fd?t#l?Zum`7J$UFGlF1=C_f&;Rdj50=%O_F-oDJfEuZcw7}) zUFZqU9fYuwBLCeo6rX_-YDaD0H8A=2pcqVNRvwBzzWed?I zmzQ!QvSC(cDdpj?j0WjMEgiNNIT@y%_#&l~x`1v9D9pRYDEgVGJ+uzKX4=znE8U@E z_je3H6X^@ox%(UwUHU^qB3k+MdUo(xJB<8JIt1N1$HfZxIL>BqEsSDNg1(mxs;t7$Txm zg!9Sq5@re)bF-<*tcx{H&<3l)yy49f3&kU-TYH}8Ag;XN=0K(zr-Ijww?X%i`U=}; zioFZ!-C&)N)&0rU&fW_b>Eq&FGk|78zi2d{am_`I^RUE}!c{dxV92WVsEkQ)q7pIt zY@@Mc+{7jfW^$b+Fhjjlf%c01Biz)e@5V;g*=C^{hoq74z6Ul)(T!HmXWm7TZ@vuj zfp=?8T)3d|#VGue8+pa?Pf>xvZ#sWv^6i<@esC$<>8w14J zVXN`0L|;d|xt|Vg_ZVUw$w=bW&mX|dRzQILyqE|AKBk2vrIVHg7Y&8xF7l+F8YJp) zL~zn|Q9dI1-gkrAC(?~}j~c{zw0dU7hUkkYbbajI-tHR}0CmN{)%Il>w-zB3duq?zO7vpVal|#W})nc_Z<8n4K$17{u+W{^=oR< zjhK4UyhzpTdEbF0?Be;XR@?cKr`U=G-RI0c{Uy%^{QHdAXU7wGM(uvvuRI~k;wZD- zwP<~pdrrGMC7JWBUcU)K{`H1rrd;749VK@Bo!MrudA_1F4|z78zz}waN5?z-to`{G z+t2%o6TIW_w%SQ6#)XR*oDvNdD@eIGW}M+1pTz0T{NADq9U!op)G>Rp zc>hx9M|CnC0PZdF)8w_rJ<4mQKi9INc6j;tH9!Od%6aN#xrv6unx$1Q!>^|@W@i6s zq3Xx>wpRPY9V!;X=+@48{7Hg)y@=CLt2kbDTTF!HwcDW z;!1SRQgk9_(`W3#Ss2f}g`+-h-JSzrqTX|FU3V^_i{anWK1v-9+@MjFAyrvROm76L z+ipG277oZ9XgUgc48gWC+#ImzBTn)0av?Cbv_g zll@@_;bfc%%=2^hi8VbKz0PIc@KnQhKN>{#OMd#pNgZfKtyT$Sh(8!Tyb)e$&)^)p zyXm6AwBFP@SuqOMwa>PApsFN5-~GIbuJLV6x}Dv?-=ZJ7nOCMX`iqsRvqA7O2vwTR z6Z$(GckkwDwg_1^U*fRpY3t9x6K5Wjb!QC>$$JPztU;wst9SHA1m~^Vu7Jt+Rz0;X z_AR(f{<>)TYGSs@tCH&pAHeNEwK-K$jCV%o3yK=VPT|>yv35=^5mG+?aku%*QR3fA z*2W6WI;gIX7aAocB_4|Ud#kIke~z?{28KeL^8&|3SHO}bRsP8|-yBF+u^9LMGwr!{ z6NEORsh^EJ2LO`7Iy`|Y*~fj7E+*FR>UgmEkDQxmN;31vdY3aOM!BRYpqJGG`C=1q 
z5dX$2K@n!X?H93{HTUo32X*4kyj7J!tfH!Qf(^xYrnyWoRy~)Xj0x(u;PX=Q z;*&k9T!RBrKZM3A(o7of0>(AZaPs1N)i!_`kZz?eZ(AxVdY1CX0%zcX?P?W->k-^M z7^s}4MkkG>1D7+f+RI2_Xwk{ZX12!p#i|+b2Y2fRzjW))=8>D(#hrlb zQPA-PV){8~iU%LzY_ximjwqzQCB;(`N2;aC_x%BqTYB=wNEd6my-Rn@?7uB=>=EM| zZtrqk&cAm=Kvz ztC%VF-}>pGrm}jmWIEes-NUZ&DNX*Xp#v}SUi+K16I(WJUwG}gahHMPCv;38o5i#n z4&AkO{)+?NCL8q9`U`2tGy~r23e1IkOo;o-R64_Fi0&{E^rC9=;I57a4dTR=#4wmV z-!ti^`x)&|y>={CtHnFO4STJC#4z%>_qpiz&5TF60bVvPL^+FAH*Rl`D*ih#j6BAH z)mIyW!AQ5}!>A*Rb=U7*I$sxyb33Y{+J(O{dn}8z{V(s>#TyS&IA^Z=hJ5Kuc`iM| z)h?6YR4$odVd%&E)?ZIo0`*qve9b}sS8ok5s$KZk4sOkYV_4o8eo+qo9c>#t_Sl-p zMokkT)VSr-bG7C)Vm{rz<%4%n#qiHhc|5*w4PN{nP~>dOnd@L$A4Bm_(X%wxNR)4Z z$ECk>3sZfT96eXagPNtLR0$y_D@XHNCzW>}($KGiMy+)XxyVc1y10xj#}dvnR${5A zQBvZ@f$28=CGxCTH`%6welwjikHp9Q4huduhw~S0go*^Xq9S7RJLLh~Qf{~OS1kbclHQw;QScSol zh%@}#7`SaNZkQLr##9_bc4^fcKkvf4++L)+&Y27snyIMn{Na0Kp>KXibS+t~5$$=dW5PZ%hFYvlw?uVPinh{kf%SyJLq{ z_m$~4kZxOFuky)DMroOwXI_i{vzQXmmEdPgQ|uED%RZp&>hrgO(b`KC_)>wizHMm! zr65pf{L{JtZll2PCzEpLfOMtoIB)e`Td0L$WyriH*5B6ODANi0>=Ijnx$O6S!NXF& zdF&y?f31f_>t96o>%B7?-3bD;_y|Aoox#P(T2xM@a26+Bs6^+)Mnwl9%yFbCfI<3r znf*!MS}j>L;(3-KE&X8XyoI*h8^BhRT3l#?=fF+~^Qgx4*a}Cuh0~`X zbb^IWJ|zy#^l^pNgV}&Be^Fb7yds*HQB&y}h;4jtfCTt1DL>FEt6gon;gZ7fLiM!e zKskchZD0?|(qnE~wg2y_X@Yz*&c{3JP$^ML;c=_Qea5UzZ9sJCp*t#)HhW^LQ~z)c z&%5uK%F_j9eZ)Y5zJM9`d1Gpq;#*HzLhOxS8qD(zn;^b*{OzoABrZ=x+qBv>0AXBx zqClkl7wPLatuNeCoHOXoWGMxujE}sy*S6XjH?8ehr$&q(b;XpQykG+HSoL!kV_0NW z(FL=${mO#&`Ac4aPIu6?m z`H|gZs@827-lF9L=XDGvM3|BJmLYELpx1!?l*ygBmS{5%n7jd|s=6*Ug9#v9nl&w; z4?S&=f=VFWbZf1)oW#+;=lVcY@@FT8z}`wQbJ_u?k~WG&v7VR~%dsxciG~ zVksEcALxw&cYf*y=e=4$gq&=C!ra#KNwME|#HQ1_F^k-A#5v{zQEB20YA&d}0!D_M#u4kJ2 z7j=x0Iw{q^f#19*B4LO^i5HWKt6c!1Yu25j457qUJveq5=HLK$qL9Cx2KZ(N=S8*j zH3V^ia#_V)4=gJW?-Oq>rEE_OT=ypmdgEFR89io*#62899mE=KHh zJ!o)^C^=wA^^yC$L z8j38J)G-z?o~bg}<*qOdAPVlUzrEfw+I*=M7vqb&pK9oM&9j>8&g;y#v8MxKrup@w zo8ILJIwcTQ>^7j5ovi~Rd2A7^B;M%L?P6T0yEB3``P0hkYFK0H7E%%35C};-R4uaj zM2ge~z^=F)l&ys-TPx}Qbo8q%IcSjyTa7~~H3OOn%;Ep{QU>tP1NVGBVITbzbIZPP 
za~7yVz?rOGT+m#GKV?-taQLn5ZoPtFO7U~T@ks<|q(|kIgerU(5$~#ql``*h%)uo7 zB3EkJeYfiwI?4ezZgAfHlY+Qi$eGDfLHW$sPaK#sbrY_T&C?3TTsARugzs07!H2`D ztrKxxV;lRYz&WpfJ!urtR>i)`8haKOEOq(|KQYyAa1`OKS!fi4LM(q-N z?^#vyWjhgd#qWfp9y&!lN-k%%pnI!x?;&EnYK@|S!!2DvtJpFqI|U@$>*zd7$SEr& zX9J_AjWCs?);&Oj0IG}3bUwk$Nr<{>s0urFciU?mPAiMg2vre{(PdwOlBzX}iRwto()M?*{HA;!AB?a$#cT+g;Ak%ysQJS$ z^jNUKZx{OPryNabCO4zDaqt{}G1doBQ%My#b6^;p;HL`~xR-Ww{DlK@t}q%hIkx`s@E>=)^W@i)7Vqe+u-N!*ZbBv5Y$)v=O8xRMxcPS`W5>##7l~Z%| z*zB1Iqkn!{`q`Q^InFYgXG}0!x=NEJLscIxG&6tw%>0G)WLCupIhS09hGJ;o%|-~J zhHitLOL2$1wKz7r2J#TI%zVS8-zMQMZm40-qD!!^MRV27jb>kO zk8k@vW&hdqOHjQ(M0;6d3=?8GxyguVzGpz<(==jGfbN~B6Bn;C7ce6{|$Q4$qCe*XR3=T#m8 z)$E}+c#K_i4M8Qkb zG>!T8Gi_Ze&>rSBu94R0M>#g9LGG60 zt0g4O-g^@UPE;(x(KhSXA8$rx4yEn)k7>ZVHP4`GOr;xfRp_8Bon7@c;bhhm?dIpl zZKqmu=lacmP49i>@B`}#9(MODDLg|#Za#cdzbAM-G@X_L?bfet134)b=C}CWzxfG3 zyO7=YO#uJvFJiotSb4Qr0ph!=)zOEj`w0;0TX%1CbrS6YpUkP2nBgOJJF&KXtC^0W zrtU|dJNFGw-c5d9Hgv#w3b8mVXVB>mNsT!r72xKUSHEap)(x8OXA)pPIN0K^*_G=-P0=ZS z>5kifH&Mcoy=!vXhseEmmn5WEU;UJE7&ckdWk8d^<Ew!)*yaX(v{GnpR{x4o z2FrF>_pC!#z%|$W5>{y*9W?@ut5+IL=yG>ZT)YEkUKkEd59*P_UkUUE>#UJ2boU@B zh%*D3-fRW>XTk``Qz+%Yeg$leS4tIG8ZZn>??jW+MU^UN3f1ir0=xH*@U`nC$m%x# zxF5Jo6G7mZ>^mu5MyG5FvtPf&zQ1is_-J0-@E+#$&TY+X;P>5X-;49PL$ZZ4ub%e{(z&87yg*U4ai{4qqTHNh`!y)uI&Q|$ZQ5uNxq8@hDR^FARuP^ME8K#&F)SWC+)1l8|o9$eSt!;PPc1TcQe(@ESU}Z4FZFAm2sfWBeqWdLtYttc8Pv# z#;-3q9|m$#5HO*rqU&?v+ndMs@1~mGD0<$&2peg}41Gr~Y5Nj!2$+>Ptyv9cAa{6# z3E-on^J~u{%<5@BxQP$@`#Wsb#xkpl0&d|9U+Z&zk7);iZ9(|??oUsc?hgZm_GP6@ z_lE5b4VS^j(Qn%}Ts<6aEPWpilx6e8x=*jY99D7IpeY~k^)g;~0FbOc=J!rRL^`A4 zKW5tvHw(9kQimev2Ctj3{bepkTh9}ImL=HIx>!4tPp*QB{;DPbvOI4rAe6^HEVMKH zb`!QkR^9|p`!S}XaOdf^;1u{qEY?-Ps?(!O!!Ser{(a;x(!R_(Z9C4YqlgAmSK!DDZFhhE=3&x( zog?3-QT1@f)+bf(ny3y_4jAz?jqYJW~`Pc(S@DVVShdndyg zs`pAa@T6;5yy!DlC!$uYbq#Xz+ccndv`QzZX3^%?*ZZ!xyUh+Noc9~kTA|n_vvPf1 z6rufU3at!inVf~R71?33sj@P?<}cE#+LvJ$UH>Ub3q~&3l{^+R-+|r3|IlvkXbNZ$ zpWIkP#Gn4uJYqU&(hE}u205}gQMOH^^D>k%&5!GpbwOP;7TbE3^PQcg 
zf1Iazm$fIZK%}X_Z6xMB&7*l!qQu-jE_BSW6cQ{7DYQ874x+!q;^s-0!n;=-iXO># zSco&1QKjErZJsFTq*9r$6r)AjbRx#h*!F$Xu12lfT19wesSV$uA4^KaE-jPo)o`>o zhkM_fA>V|Kgb(}mU`DO)xmyWAa5Y*6tc-n&Eg$;{+>KOdxcE7-&`|}yp+u#Xbw&8K z^hlg&HuzO8JTPr+SO50KU{n=yZ0y=a+tAh5pFi(e-nFmkZ-Lemb=snph#GxMYZMOJ zd!Ko`9a@mz2dyIk@CM38;R(qg_9tL+T~tS$)1Oh*UwQe^dB$G;K|D8Exy5xqeR@Xy zh(&iWSXg}UBG7o-?McxTY+;uwx{=d>BO~OsJh=WJ3cL&hj!TZg!0K? z6w{`X)w*BN?r;`Ncg0ML1a*%Er;PUv2X>A(mApDBj^v+?gRVTCvfmDGvPz9 z-TBpph&0p%Z4ncDzJF8e*)k7e(Y~7729~$E3i-@lR(G{Zzo4;v$%b_vdGhX3g-gSy z0VOHER%ZZ#ZF*pDJI2QMZ$f1l$73HNc~|g159EJ`q0m47?{HhOXQvf5uf2bZ$Axz* zTdEo~FRm}wE93~`iuEfMo{X0d)tVpb515Z2A8dl$bDlk{;6ZeM#o*lb%om}Wsy3ps z@^Zi@ynydp^rPrJ{lX1*$ZRrh(Y8F2IJwJXPLwbqcGa>Tha5DMx?aogdfsQaPg z|I5{k1N%vnJG)=%u`E3~x-6*dy~~NP;r1z&P=_II|0AG?)x4*Q)%~=qUQLcfG8wRc zFos-c<@imSQcErFBOplv=ZlT`%oo90#s247FyGM`R7p9uP^83sq29O^RlLB+Z}0q` z=Lx?!oPD>TUb{m=MKX#eug_F;{FFMO^9HX;w-~ zfHM?GgE#U}UwrXn$eo(09KYcW^Zd%k#o=UMZ zUqXi9vHjURBTZ&Ga5jMV;}>VXMTxnRD@D!8HvVO9PYMpV{-4()z)WjGZJq->3@NxX zYa!b>PRKoNL2~}Z{;3#|oTQ&7a=iM?68Y18Rd3+ajxbzCbL~=xBdgfre|% z!$3x(iB*hm73Oc=2hev`51P2ALv_2gF@{q~PBHpj!>Nmh98H$9^i^%xt?y`~;>oAI z9_ut(yr^ex1#f>g?atTa*mP3(q0)`+8mJ+c@VO8u>Lkh$kl>VGRp%H?>2zZWMSIUJ z+MnhekxA(XxUzT}9QoQdGXU>jHf%*^jm_i59xg| zw9(tgjMMt~W{(WJ4R@jhoEf5Y`-frV%RjzXr>W+Q)|92wnz4=C`v@-4MQc^gOi6C) z^W}t-`N-Smzoo@UVw`xYB@_9hG0m(o5V1W;)8pX;kk>Nsr)tlNj6w0m%wmJlqIO3? z{>GgLQrrT@;cz(okw`!EvbqVhj#>MENX7uaN-$%VLu6U>J~J4Pi%N)WPJ-5^iIIz> z0R-03^sYfZ9eYepL0mqh(RtMC`a=aYDw87f3s(2*Z_9lVW0Qp#0}_iVuGpm0_xMiD z5aMEkam&qKU>K{0g4QBo)qt=W0GY>@wflZoYiRb&NAJt=0C_muL_4j)FtNtdt$1Pk zQIPzu*qX$EL-y)vPDtR1{Wn6RM;$Zg5dT1=>~xV}(H{;ErB$E@*$Cu?ScW%K;XdWY z-LOFu3v37c63w|)5RJ#b?er1Y514rEXMbkXCVdEG6J}LN$yuPn&f@XLgjOBli{3`9 zNE%>9*uXX&QU@ZYkK$L;sj3A@gJxM10Zg!J$cm+ezx&v^Map`ApAzV zgfMCfhRkY~M!QtK%l$iXX-(#1_k2jD>{VSs>;$S5$J}-U*3ccW}U@`sXj{3!RaeAf6z3Kdtir-P! 
zCXDSdQj?;DpcHfZ?I2jM6H0sDR;0SXKGfu%Byt$`ZDowrMta^%QI;yaSX5e;G>i>< z{hw&Ek2j3YSV<8b&b&iW;HUv-HU_#(E^Nbie==cBqx zW%2Pp=%4J2s{cQy>ishn7ht^DOtsVOmp^t>ZKr{4@otgdOYzL^j$fKn>i!(5yI&2s zyg01uRj-n<^V_=e_g&K4!FYvC53cT_7aW@OMtk(0vGND`*?FIa%ACYq^glzVtaY>N6~0$B z=5cj`-LO5nCYj4GZF(On>>7z2172uG6w}uL#5aZ z`rp76(EQ=aym~;Vf&EIsCZzhSC4kkS%ItLJ{hC0x@}vNc7w(;Wa^4+6jHJy%!j4~W zPm{04CSUbv%gydX4qsR@IJ8rV$2BUd4W%P=vA)RL!cCC+yQhYEoKxxaJ?T(+VP=B3T zJVW&*b;dTOQ7iGkk4+ju@R=(MgW(}(42YK75a!$pyDxTX5Ku+pttdoB zPn?2HbxPct<3P98sD@^4*9#*SBRmkjfH)7HH3~;DQaM`6Nsys+vVbmkc5cbr348u) zTx+k9jKBC|G~R_sLPe!!=syjro+5Bj@uBXIpT`(T=2oAb=Ov$Qt)m!`W!ZPjEzVis z5qpipqzQpfsh$!cLgANoG}XMXfn|DQjGShVZ&+9jp8uln@Za2c|4m6r zK8jOCB-U^;%LH+9i%95GBWBX((TeS70?BPA2Jdm4^WA@h3$G4HRIVtbFg&DBM%X!p zoY~JA$`Lyo=xo4?av?h-HBDSHdynC@VvJqkd{M5J_p#>>dF%q4V!8qCvk4@5)l{EE zdW<;d#=H|^9F*ba0g94+&U7qeP@o0PC(T%TkK|0DyS$f?0tfXBa@Eo%rIlxqDh5Ad z(|^@e&i_29$HtiOSDHJ4_5K-Wla-S!bl9Zl%fmJ*xN#}zJ*^;M=6d8KzBxt|7-DQ) zB1dwB57}Q)$I&3|P`SS6bUMq0Ok?6GqPsVNE4Q!f<{URx?8@dA&R0IzT-z?xO3#-A z%?CCwrbjd%W<=bUuDer)+dF5JA5{$V zC>lp5NAkI7hoMAotHwVf+oZ^u-OX4?ue$w~bp`ZhGKy_b9g*_*3~^?f814qr9E`Fw zl1a^s+!F#*jC)BR>SO>n#je8hg`7r?AD@De(f{5hI!Mfod1nBK8l48zI$TnO-^%r^ZTexkHc ztpGJt2t`bDFTSDu3WrO)t^Z+dy~U0aQ|Myg8^iZ?Zda*NMMzfxaJackfoQ*KMage6sVnsgW25_8(+I$R<`@i)9fdPvoQ@);8cEh7G zW5Nop7}pJ^bleFrHFKk7s!ien^1EdzYdKq`V3la5TtfpYlvZjCm|Z(-D2pDryAVbJ z&hfreC>nh-94Mh+jNH=75s)wmlveSj@!bbh0|=gN|Dzu6%z=CdPf7pd5TL_=PDlN9 zTqeKzG05A&6pAG{ zO{%5sw!V0|;}uc8j68z(lb=cmq|ok$(sDiVC#3K8XSH){GXISF$jy{ntL;2FW4S)I;Hq_CP>2S}T~_u>*E1n~!xF`s-}jS2hascT`^Zs=%p9AcabG zrpwC>B_Dp#{ZvPT;JQcIg>$uq$Mx0OX1ogTJg`N?H1}aB4WF-`n=XPjE-@hlLFx~@ zokngDh!$k}<0T%|Ki?Fg`Z%MnVn(Hij)%f|9`8MeB7%@*`bY{+B8s@)=i2QcM^)_n z3y_9WTo^gmj?)&)VLWnjBueBQ%=O}{=_*(0iX7QtQ{OgX<$==}6yd(P>Y6hsqs>wA zYw9-2z{Q`_4Vral_|I+QfzIZKMxoQu$c`3`Y{h5E?Lo`tmNhM(Rt#cBx!0L^|FhVG zb2?2Gc+9i;tE#|ow@ylW8GN?Lh=(efM(mlf;DMbh8o$&MywTdZ;0cKvKe3c%_-g;4 zkkzM<&cyl~j{ZC?p6CQyhC~GU9st*btiGK~v-!*og-#P)*jN*n&)IH1TS(z)je1p$ 
z9JbbUZm={(jD#p0X7KxZN|Cv29(wInM$WQpoKqUh6(=jTs)cghxjO96Dhg|+8~1#l zVp&V|m)@H4ef=B7VWSLccDF;;U)QWFPA%4g0PeOANta3Itg6o7{OqVwnK+^Y{0=~g ze1Fwxu)?|&>m;|=f3Z4&IJCRlI%npvusa$@3);@B*7T|%3a0fGN~9zwNv6)~7%g!f z9U(EgG$6kI_@swb2ez2$#1|*AuR4bMUW3{2?L61LhE5OI5d z1ozq;PXRu!2HsJi)&BM})j8fdwSFRt`aPyR4ic7Fc01L1(mM3iY|Y-IR&zNky|ksQ zAIAtECs2_+_~>FMimGr)%27#*#}UQ65kz1XTRF}Na63V49P}4^Y>@ou=1(;CarcFIZk_{(3A-D8Zx1qf zFQCJ^WxIvl(y&q06y3G|Pne)F23LbsL-q^qj;1Nqly=9(%V}|{6-f(hP zq-}9HAlc490B25kS1F;ycpHPcNTW=;3Wuu(j6d@iE7ZXg5|RmJ_hf3w`z4q!ceDV@ zo`t2Ek|asI0qzaf=sI5&+{(4f_{=qOu8k;A)SGiF=6wOXNyXyXQ&gRyr^l3;^REMe zWpX8d9^GAf=BT<0Ir1Bv8yf2H&(W!L-c8XZu}M(CZyn+e(Buvjv?apM$PHH_Xg`;9 z38GI-@?AJYhplo{R3gqUTwRTo5{e_`kfnk%(&I*3X?1>Lmu&)MevHC|va!ydSy$7x zab?>L0{LK__Ba7-w*TC{wYOV8{Z!YQ_UKEc%+B;*FX+*)byPmf# z8dI7Ek~dMZ50qR2;?RVU*15VkU2GW9+H_-v;ijz!XdB4OznX8xe8UhWvt!43CYz~o z>W~dA64mIl#NfFX?DSn6M|trC*3x4EV&8Vfjiq_VVIILtwA_pX|5i=-`WWl;Q~Z1b zCBvriMP^Ld8eIJLfX~|)>S$5*o77jRloO4(%W)EUk?2e`3vjXpy60wjZBsBez9;B4 z;j0GfQ#vhrNTF#wauIGt(=3AHQvM>v1*tg^W4yLlF~^KKDCsCSp8G?x;QrfbnJ0GX z!LoX-V4_vG^|;>3K8}#tBP(pSw(GmKU@Z?9QQc>PIhI&pv8RMr0%`93su+Rtma`6H*o(3nNePgVFb z0sA#?wb@ynIcc}14N#~VlN0ScA6fN;h&-y%MVS_Z4wODbtrM7pQ@ZiH$Sipp{?wHI zi34vES`wgCLroy2^KTwc%B`7jgTq9bd@`-$d%B!wYB1C+aPp7lmVyB`$asB+&M#%+ z;?8L~!_~LNVvZ9gu2kyX)$n*0tZ4M{Ew7W^>N}ISx(-flMK9)5n?@%OD+}G_a?gy` z&0`Le5L{VH&(npruVxO5;hY%TVOL!zdJ#t2PJ5eE)@2v^EkESdf%f48`BwFJE)YIZl6koQ_&-$m|rs!s{ z#Kh#(F_Q1kif!*>rnZS;y#iGXIDbYYqZ_+Y?~oG>xU*5`%PS1eOkwqaxZx%V>K+&Z zz3T(NS->DPktPb|y77E{-DkcY96ve~lKj0d{Z5dV_S?=HP|nMadA0Tz9umvZejl|x z2@B3)@fH-Z7(c|FMTgP`?zGYJ!n&d)n#n2z?BeuI;lwsMwu#nO1g_Kf{>Z0!=a2Q$ zZQV$N|Jzk1IR{G0d2~hk#_oLjhTCn4|Fj#6HWv2dG`l@+4#)#+8W_D8?Cwo_GJ=kr z!-ee^>aQD3l4!d|mhKTzcb8!TGZIQm^l6$&iwr=cQovzx4Bm03uk0a zq)WV$zf^D4d=18L0M2K?SrVRly|9oXVJ5yKkcsrFv?ae*&uxm0(HEyHQpQACQuJbb z0&xAh(Eu#Sq2GR;&E;BCI&T&Zg5KSK`Ssoq10d#_b>aKdH4}5EeCAlwAhr%u)f!Xi z3@Qn@stsd}A2wJ}#%I4)F`}?} zaOOSMGfub{1xx|=(B7&%TA^9cpE)QYmQEFs?c4Dip=&j$I^)X!T%0m; 
z`hl*=g#lAng0yIxAikxy+;#g%w)}s@>HHic;PXGVc>#F>Ud|Lv!jJVRO7U*4QF-Zn zQtlM0*Y(|KO=IX}SdKW*_}m?%mYg_uI|A7=k;Cfhe$e+988(z~c8Nyjt)zuvYXowR zbED2M%3&($inv&||031q-B&qV1WeD|FxE(6W@T&pi++^y4S&g^-}xraj4)U&d)Y0= zwbcgWX#+DI`gRYc?D(F~S7BT1G62CIxW73cpq@{}*)tDfJ-)n*HbMBRN3I`bo3JNk z78*+(q*D+|Nh;cBHgxYGu zMt_KeaN~*ox>%paJ460Z2>L!ua&UlaDftShS$JHS@oJ-p(@B}niVFis2o$Q|X{3o_ z0Vusa1@P)D@roM>bVyNTfpu_T2nt68@A}hTna_hULo`Vks_ZkZqkqLvq|T2zRRM^xKK3KD zOI)td+*mR>;=uPURbA8Kz0T*^H#?uPN@29JT(v2E2lp5*#>n#KfSFd4GAV}y6n<1{ zU`f`?p5PO-zn@sKK*vLcMz#ki@}6uI)3pLGF=YD*pU4X>RpVa)Sh^TzhzQ7hBpq0h zUc0d|HxsPhtlJeI40KvGKpfvEILo$O@>Yl7xMj-|`gzk0Ka12zq) z?m`StWWsp=|G^5Uhq4m5`yd4dZlG7#k zei9yh4wwG1_e+coP(4ddG@_-<-gX3uuoV%{T78DtS+!UT_dNug-7DB>#sfEv9@3&! za)CXDOj-!YJKVguxUgcqj3f%ANjlcLyZ0E+9k=U;^DlCFQ^IN$)P+j1Ez`a#z6!<& zfQv2qz2g+=nYJ2Nv&~uv2!I(;s}X}rrQCn4iYWGqe}2P(@(jHY))i#J&%&Yw%#d5M#hyoBXHO2tr*M2N?n7$ z_nG@Yp4(pjA>hB-EyIbV()Qy$;03nCTZl3ZF}ZF~3KItiu~`$sY@B?;7B)jBxp^K} zGy+C769qEjBzKcAR18toea3QXsm2*ZX-=hO+26PSCJzIh|G||e6dtiU6M|vZs=;mG zEQzXe%4m+Fk~k%0TjTsBFZuGh0~a)g(rLCqNzH5Jy5w-VLiCiH5oPna>(z%x9_z-t zNO&>bu6N@*+?b<@wn7R2)P4VW^s~QFX-1q}*q{=9S>YXsQFJIYj_tf|_H%_fB(3K5 zNXH8Qq(p`vnEYbT?&uqbw=@LOqW|#SA&^6bD0{R)88rnaOxVcAsJsxtYJwSdkTEwq zW|mCO86^r_`o%+!0QQuq*m8<+;CKwtyYyOeKIR9Ty3= zjmIa$-x{TUhjzX>?RZR(s6a2}*b?!hDeu(rJ(E&iFUK~@JRIX2YFge zm|A%|b7PV*sM?VE%UBv^OxRNeF4$a=+rUT@sdslUz#YbPSoMkVtuQLlY^${`eqAU7 zJ*{4p*j$ZBC}JFocvFQ)3YNKws01iEG$$2mI7L9E=W2 z>ScI)C@v?TrV-y&F zr*{OPEy_4F4(AgDwHloWI9#riZBsl<-g+Dw1K+eSbCBcgS{v$&?Js=H^=KJH8|PQ& zo0N$N+cFk~mOd5OP6HClp+r2`d0@hjVgsE1G>tlE z+2Ts6py!0reil|wx{B?EIW3YhJsi(qq#jq6gLp=;$z7k3Zt_;ZWars$MTE9o8GIPT z<)9dw%qi9>Y9l)4CCJ)3+9XNNmbw@dthQ2XM?`zUJPtJCGU5k; zCF%b`;8p~hUgC1Qt9`^&a{Rzie_1fz!O9sN*Qku}bD3VrTTCt&{(OdR=1E$SVQZd{ z+(P6uQQ$n|im4i{P$edaQsVf`cMQOkybzLt1Pg5%CX!}179HJm|40O$m=oRn+;^a; zbUyGVfPxP=T*k1_5kpEfvfv%X?WczNsf>zK#1`lLKo?cBtXx@CT1_}-Zc&niduvtm z@5`V^E$Zz=rf63bjcTU327T?d2QT3yvA^7gjt{)i(WLd-;6Q{zqDEXKzHDY|fHkxO 
z*|@#}Ma+GM4ISA`fBovR;KKyjIZPszDPh=^|ZZedad|QnvI7;fCt%vXM4*O{*Eb%*1m8XDI#5gCyGwWKwIAhD-I zh(+Lu8ndP|=YnJ$C*;#rQ5Z`^ZXVHerE-xA@16?Jzw8U|wx{7*L9?c~YXs z@~RkkF9~9cO$sI2>PN_{EJOryGvx3!c@jeh0OhI=sRnKJq^G)rAH~i%GQ5EkGei{H zOm6i0rH_Zn0hWDotsF3j|~>!^NP09wnsGad$gN%5tN(Jv*-= zrcl|=uVkhQrW}`ldTWfm&=i^?qgOzI7FeVK(;ulI*IWkKedncAM1kurl^ia*sp@OCX{jT?YS23}!45S^jd310q*wK@XjoZGhu{IlG^JtsknDUOe>ksK}Y5+Do=?Xv7Xy;+Js}4M~$rZ9oV{N7>DPVRs zCbkAgW#=zCHrjDUP&HfYy_3d=*Yg&||J#*3o&o0ndLib3S=477g%L8U+yGC9ORrN~ zo}{o2HDMKIU~y-O&K0GY!74%f^0hN*@7SNqlHFkEP$JU=;;*N45dQ*77s1UBH$kW_)= zp}B~9O|;Tb|IBV`L3+>Tc0p3$vTa3F`@@lW7@W(j;fzReam~rEUxT zcJpt)J%o~fm;#ivZeU?Mh_65Wv-`$L8~7ZbATh%Y9(sud`-upC z{WlwHf95eU*}8PuUJTk2j7~nSck)^FW;$^q;ZqmJ<|2uES88C7zWH!gsUrDW&vps7yI4`YCf=NQ)R&d*; ziMAn8u-I#`&7)v^YfDttgxDk}9)^Wg8V{Qw*uWe4pL}gL0Q_+o%5ldbA}UHqN_uD= z{-JD=o}k($ZV{Tm+Gv@mYZ@NMMlWk>$%0~L#9OK&P5hEoIPcnM3G&AVX$`*Au{5O<|fBRE1-w?Guu;db)tQM2&43nuPl!I-55zJ9*YpFNNEz z;}A)Q^}xsOmy7WEq1h}jc%6%2qV(3>b=v-pNd(2|@ut8lAEFwoHD*BioMe*N9N%O~dW#6)SRFrD_sn+lH&UvqG?#v2myi z>#dga)ohKGvlhUa-Q9q%O-*%Or%c}S;Y%|;+5C7?;P((cJjGehlY^K(%M@>KWI9lC z^}Lb4Ss)jDcHRHB`y?vIR9dD%pr=m{Ge_S-k#WO?_lpmZZu1L5YNe|nv(^UPLwPj& z9|uo0RNP6F`vM|0zQv~dSBB9O?5vGW#o}_Ztf(>cnj~}?LZ@!d+I7TJ&C}op>>RJK zZ*N-(_hHBXaEdkezt*_nt*3`3^;zUHW35*+PDGm|YTx`|;&YcZN!f(D7&~hhZRfn! 
z7!Uk)3Aptu_-GqzRPqR#)G~PRUDW^irtE!bdL6bnhu^39lbDZdvm3?L`1nXoa-%3} z>SR=TuI^vtQ-F1rMx(o+Y+#2nb3MY7m*u^0=yYDoai_zA6~K4R*Q*N@VG3ni8^U8( zigAVPQeJ&W%o7u>`$$7VLtWG{pe$B(_=0>GBHjHPiYQXJZZXB18o#O0gVD;k1 z9T96c^CVfbe8XglbzI?95DQ9(pqF{Jy>BX`xD6i=?9dy>iMq$O zHz0E&<8pDY&S_JS^0LAPR-tds{(YUSxA5!bv;uh@$GFbU5aU*_A#OlVI#|G!mX4rL zD=R$?T>^5vQugPHi5_=Woo5_j@`4_Wec4^-=U;*P7#Et76?PmMc>50`H$>(P*Fq0=`)&C&$Ue4nKY!H>nY#UFq zhBH@A0sMQX`V(D~poSZrOF4p^WN5=TD zZWDVb9~b+oy#Hr#*U{`{zpG5_`mJOVnw79Y_pWu2v>WvL#P@EXR3Dzg64FAt?kO5e z=4>ia`CjS_P9*?hhB+||-m#n^7`2-zMHD74r+Sa@7hB$~g7E|L0H%9xKCN%L-ZrdjnY5)lYm zdrR%XL!1sB8EPUH9;N(|;VINEH4O-iBA-s6o*_=w0ro1Dh=b5lTI4a_DC|0maCIrF zCy^cCym6Y;t{v;ba*Kwr8*&CDFnyh@9sm569iyMNi(jX3tQWtwa`uC*g=ni1`hcU6THG!@8Qbb-vp|_^ZrhnPlDU~22tmJOv)Ewszf6+50|+kywsbRR z+04NOz@Y3Czi{=ibB%$JZy)~BR=2A+(%w9~3MqSMev_?kmh4;vasL(xFJCxI^v8Em z38vP9oAT7jOr$2zbKQ)p%89F-+d1a*`HFbC;xYoir@A;e$=oby z`c#^YPSA;xpflhx>_)0*UOp5G46ZNp@_@k^3RuvL4zRFn0(P)aUOY^SZ z3)O}s)Q@3nZ|#&?Yr@4MRz4arEdAQ-?^M+H^f-s9(fj>ecK-HkIn4I@$VF$I!|sJ)^4l1?DF1-8>}2BOz2%C zao`BSe>=)i{NCR#fp#6m_0!4Q$wP*TNk*>u+cb_`N$m(3caEO9*+Mh(%R1Vo^nHWa zGTw-oN+HFX#K5+y4}3f%6H}&d{Quq!0(mqBqe`;Uy3}iS^F}u@lS2s7RvCOAw^QO^ ze!LOx;70^mCmQMfid0Ida(bXh9=!b&vQk4}Cj^yFxCucz(@dv>Ab+AH-6_yCrWWbR z9wfJP?GXx#biCY8(}{j<5_lTryv#=e&p^NEs>0w&33B`Gt3SJ>;LO(sN4m3)@gnL|_JNWgW>c$#v~s5i#*TCy zMVh}aM@ofHT0a-qN|ZclYz%ntKVr$Z3p4l}c%vdD@enDr?Tt8Bn!nZ=)*#)4oHdKp zSB0>ZV5A(I(tA0xCgtx|OlhJHy-sno2~Z-0^@aN_nn-qY6yx?POJ~cyo;jeL3+758 zr+ZNaivSoO%?6-c!|LLzXRFyf3 zySuxQkdTI9=O7VbgDDIN%j-G;h@Ft#;ibBfb@xWe@NTVlvBn_aA=j&apIXw!{mra>0~diIiDmlz zvOPK}PZ5vSuQ-43U)*<*Lw#_tzFWw-w7V_FGMew>RKlea>+O%2JFkq-05F;e-68IO zrdf^}Mq30(*N|n%77V~Lm}wH}(b3|invn0Kd{PdUEUZx!cG;E{*-pg0bj87tyA$r@ z>QvAcO4DEjrMuO}TWqS`r~(e3C0fs=0vZ3Ab-I4W-F0NahRg1x_F+v~MbiO?x^v7= zjBgWqEqX4Ez7x!3nq2Op@&IY+_$7`wXI@G(rsuo>=WPpqj_=8rb+PAG&7XhbXgR8~ zMdC1a=RC1RAFgI zq;uA*aAvU_vs=+=g*Uvbuvs?py$KE7aC|&!X)X>~nRE;p4W*-N`ld-&z)=*+B^nzv z={?qk-n+`n1tM7_i3W@v9Z}6u=<0AsKorp90*uH|z%K;Ty@}Ozi=V2Pif=PRYFmd! 
z#{nrlKJi7Oj(ME(|MO*oNy9%(-TIc>OWeK6fUf+{*hBZxl|7S9<^qvn<%G+&H13<< zU!60)vOklUva`Nbok=>uIhNU5eGPcPB26CJWg_k-WQWLr$UXoFPNe2KDvn^`8yP z`NrFM#|y$C8ETEP5(AU+*ctxmc?a6 zWimt9y~>LYXf=m%%~BPcxS3C{E%4$AW4WcE>R|c6!Yz{jGo^{Mk-V3g4cZ2ek}K55 z+DIc9n|Ksv)-F7!G^o2SOx55!Z)mvfD*66ZO_BE(cHUYhOQU@M86Vu0=Z)woRj<@? zkAqY80ugd)XJ-M3QH#L+h~$(-+_hJM^BRiB}bhsnGYw$}@~ zl<7VFgnKgIzp8fE8J<Vit_(2k$W-6;yC$xyuN#W)_izxo-J6 zB_*gFIOF}cFp4S-C}$~`TEJlb9(U4xm0=wo7xM4*U*P(o_y^fv|AMs|tpEI^X$*Pm zJme>Q*b*RjV%-T;3eSj5%2((5tuDY^L>0)*=_|NrXQJ^X&9pG}Slm5DP)k&o%!39# z#DtZ;UV@a-GCY2Zk8NkcIq?8e5&)5CCUfAe$G@q5UL4r=pFbVqZozm$=y^97(COYh ze8~JcSsOD*N4glP)}N9#H$}zc&?sOEj)Z?>XP84vRbujk-yC2 z;u28Qpxk2Mqc^1Whvxo$I8f4PwPLJ0kT4(NDxW`fEWS&+yB#WGZcaJo8T|3rQh~WA zxzD&vpvG$0s6wxusMS}+Zk&iwde(XG@LexG2c=o{hF@@A2 zDuuA!QvFER7}dlEjOs7;uhu8DSY=5!vEKsr0;sUUXyU&!BY0O_LZ$;>UWo!Omv=WI~+H z*z2{oOrOk3A@(a{v1PjYH`Q8(AG))@E80UK_!&_QPJL52)xb>>D4uj{BsFMwxU+B` zYdRCWp9^;tLvJX-lU92`6rZuD1iG;tQV7b1$-Ig1u{T{fwpVRrR9S7&bBvU80?(b_ z`W#9%HIKepk4p@8(8<&`+Byu&@Y2cLDWU;ZsH41JNJ4RMIVH|Y)8HFo2n8Q_;p;!B zdvjiKWB*}I<$t+;MW2MaUoh`e_J+Bds&9P|#x;)(S)I}ED@?`2ypE;woRd6Qrr}YV zmWN6oN71G#tfD^eWZDJpy?#f>(VvG^UITh{Cv~%VC~Rj)y_tLepwLLjZ}BNJMJUKdX6X0gqFvCVeIY$!);Gl^Mm8T409;G}2CgJu^rc!DZTioyb^Lq=uc zq4{OgZ1w7nS#km#H5$+?63t&{&ed{5MZB0|5U=%kW<01x|A{9}vfS#&+8{{9{2M5N z<>%#o3?P-QsMVVteH-;4jVwLJoPcbCcMoLB)>T|PD(>5zCw#Y)gw+wFhw>kkzBSkM zAJpW^{3=y@m5Vh0t5{f8PLwog=?w$DNG4%tMPK^V>E5tAPsi1p0y*NJj^gY!zDJ`U z&f$R{+gjt>Jl}MXKl4x4pi7D@5l_uO8jnn~eK;Sm`sn#l*iti056%;jQ?h){e2La@ zJH*m*-8_D7 zkK2DUzrXvu8Nkx;I_nxDdF6G6#Ng&_i#xD_FJT4Du{3v-9Fbj_xBHVii?t3V4;`6S z#OZPHzMO`++fy#I8qe=>OGE1MZ?HX=@pp0ug+spph8UD6q#+r>NO3zb%Hq-O?KA1d z0V4ui+~)|Na-M(RLMj9Zbe&lI+9r{imL@($OYP7KZU0WCRF-@;Y0lmJFNOGu zZ0U>%c#(0jtDc67<)=mnH+`eU9G)&$tbiAllnFvI?fI(`PIP%#)KHqU{oRf2T7dfJ zc{jF}?=PNhtkgo;kEun7jpE@0*sJh1a7mLc`HwP<`q^C_$KjE(pVh9W;tp*VSvk=vb9c ztW4f4NH9u;k%n`bnRO(pICyOD;;ziWTrm8533{B|OlQc`p^Q%EUFi(i?Mn_f%Xpud zMM0vhjAuJ6PA|I?qq+brB3_@VM4?aRzXYOS%yeiz$r7bXP{Hd2Vsep 
z-OnayOQsfSD#iJUH+Rn8+`mpR=?%L*zS;X0OHmTo5RQK)a@SYSMaAwV5Hh3jo$Wj$ zsjc|S>oP4m3B!OUX)NRhjX!mjjRs=BCW}IPfVj-!Fv)poqWq|QRc`mYFF^g<(kvWK*0A!RCS30fwhLZ`Z6BI{eN627Ib%c|g8_H8%wpE~V#T1hyo$@+6 zc?m7?;XHDpB9CH{${9K281K6J#Y)Ux+sH zsOh|-RpNCfjv9|^GS@*I1|(_{hDWk7E4=JO`R4rpcARVk-tV=ZY-B1YD&wq#?f`4zbAG;~iZ=inx$Rr3Myid(tRc zrvBgzQ`@4p0*jBMbM`(yH@~h;2*wi%aLH?raqCn&es-g5BSo(_-zxj=$g3XFyV`fj zO>wCX5c6y#y@=N;O61@+g%L=7`i5)mDjXd-=Od6JQRmxJ=aCs~w62DQ(>)v*LY}+0 zH?AVPaBF8)g1>C)Q)Bg}C5#&IZjdBjxC5Tkn*Z(&r;lB*F|l^~rm7oC^^2-l2omLL z2JZTtrbeA`WH({`^=?q!`+#DiWXMb{#a>61M(2}$E9PalDKf7rAEC!#g|@-CK!gSV z$KeEsF;o(>HlH$*S9DqH{Y(K_Zc`^Z?KQ#=-htXxRUnISCi-abQA{*KG%%Q1T=Zjj zI8)%KPu;Jj#B`J$3?6>3^y74^nCQHc5>p`|Ms>&*b1*{{6B9E{Q9zdN`WX1>Lzk#V z-PNJX3WIvbCA-0MLO*(&R7@7C*MXy&wDX~NaTB4z#uh28>{cqh0?Li)IpkWYbk)W1V8(+i$8?b4AG|ih zSZerMA14)g;q(Ie8ZIl4CQW!MMQjah-w!R6pXU9z zo#WdrESx83KuA3(Xw3m3`U|3Jnyk8?5%k(@aUt^jqr{bshl=%ctDxd7MWJ`wLt#<^ zr+kK&=-j;K#bo_oIjt~Gb48j1GHCLY0Y4_fch3j@$Rm%0?OY%gAEBdD$F@-?;7lEU z2iYa2MrE8K7uas&gD0XeL@`_k(kZO$iz1HBInFO{`Vyeq@Dj!ekow7^jILct$FV5H za^NpKbX=XeO!W2>Vh^vw1WX}i_*%q@g29EW%Li6kO=+slSi{`g;fvT(5NQ<3d-h#G z1eHUZGRT%^_?Km<-=#~6MPDYRAKcYu$po>^+5+t+w8SF3CV*=qQZ!5DV;B-ou6cFm`Ya4$@dZ8Ur!*ob8}VJpayTUWdW9H7v7 zkomxX<*2E<+`&AqmG7f{Z|ZRaSW0JC(hW}8O{8^Ly{*gL9c#U_Zuk82-!N#R97?G7 z{v@pjuM-^65Aj$+=x+jRXi5r(?*zRdQX)oVl_TfVVZGimoAg422FMv4L(+KTcNr9` z@}hRuQr^3t;cYQ=MqzkVs!}n_OL|Q=Yw$upulgJ~nd8G`tN_3qaRD*ABA0ZOw(Yg6 z2rpJFW88M_dZiA0MLv~HZ`zX-g?H(538Tw%cj7=2+OE{EFzSSh4HKaQceSr7Qwc0-yR`=O^) zulr~Y@hUVopCfv@!%I~Sv|BfIQmAuyOf6}LHa*{F3ZXlVa%v{^Wll=*nR&fd4=PYG zS^?ueU(o@+dgm#3l*S+WI8D?yc5Xd#qwm%~pSr;n<;AT9#DR%^d~ca!x}jt)@etUM zPRlvcB+bP~RaV3yCsm?t2-^W+wFbcghD|eZJ?9r{M<6`Ojcsx3hOCFIxtT zT3aPpGcH8m+R3;}x14izV11vD^I3z}N3n`1t_%L0gmgMZFej7G7k%LK#oVu+B#MKe zX4@Pm@Oa}bOW=jAzFX)Tr?qV<$#H)dXjJSGw~RbuMKD^agZBZ2mPB{A0&Sw4*QtjI z08L$;EyQ zz!{vtW06{CXWaP*D30J(j~ExsL-o-H{oR{G`A;wopjcNm7gyF#@PmeFcTL74r`iR9 z_Kxv>q@bT4%^f{T=UKRG=_$@EuPv6_xp3@YH&=7{o_qD$4?&R!w6p7?cq&?;GM zecsl9IEvaplf~I5X-p>Xp0a>Ze3AD8xntB(B>TIq88<#xv{Yp^I 
z<;JKUcD^!+o-{ImA#?`BOBDi?4L|HL)%9;guA~sA3&xj9Pp4m@R5<0kg~EvA@jLN` zalimk5~tWn=eTo1oNyb)#lyz;I>;8F>t_7|#YaqjbkISM);dt5<0qsx4z@!KL0=~U zf5T(>#_}}{u|shGo%CZ2;k08?6`8ky7LA0EJ@9Jg24; z_M)&@ZfINuT<)z#wddObqb8pI!8-4`=dU3y0z3UlSV0{v9U%QN> zD*4{zn&yc-+4NeM##7c=gZU=rA!h0jbE;(^@r4f*XlPsZK;|9Urs3(7L|<8ATzle^Esf{ZdMMWOZX%wbft`TG^J=@}%mS3yuMb!LQ@;#%&2(`sLw$Gi ziUW9UHP%;RGRHer_XZD58NWOHYoa^X)b~8*LyNl)Dd&KNN~=^76K2qyx~5Rk{)Eae z(lFRGeTdXgE zFe1BD@tq6_I+je}L9ek&CC%N4F%ci3jGm&tXUrqW6LHtLTn+vbvxJ6DZY5B~L;7U_ z5{<;}&!20^>azg(VeMM0dsBi=Q_n^5md*LwPYY-=K@UYS4^AkmFdi=#Nw}154LyqM z@ejjng$B0{OT3z(Q0G~~uwSBmo~>;YvEh6x`VS9U2z1);g;g6bo3(CEh_BzaF3x(} zQfGZ0hHrSzeO})0;fd#l)qBwqeT#ZOuiD;{j#+M9yeEkmevT>CX$3FdH{%kfivwsJ ze`A=nzp-KID$?FI{Wc2f{gGa#R&oLGXg!uu=_d|cs<}5 zFh7$fRbJf~LtK^JJOpn4D?K<5Ob^kCDcr$6yTqk({=saSl8U6X#Tns&>_J z&J_j90=3oW_XgVEIp}Bd)!WsOofYd)*6)wi^7XTM!EL;Hxtj=2`*XQrBi{D7wX6!@c1K}V zdKUNy(v_0>V{uphkdXJ@n5|6<5_dB9IFc>g!)~%stdVJ)QaDHJSC$*B5yqu|8lZuE z0F|@b!TN5kRe;WGupKV%Ar%rXFOz&dmWGO+8-Vi}eb_9Q!Q(U7s^G_I_gk@Ncashw z|LrHl{?g-((8gdp-UjnD-g3MDDne9oB2a@L0V}?IX6@)(T!61yWh;1}U_)4U^6o`= zl4_1IX!1qR?K6RST~naT<8|WTTiCe8#u>AB#z`x&zO;~i-l`W>90HvjkGW_=yf}Su zscB9eb9c(zuusD8p@HIVN}vCNk8H^uW(s?F5izA(FRPB=>W;Yw>s4>9a00VR`uG~i zlyeDlJtE3?j{*iqQ5=N0(@cp*1Oajy^Bnrff~Q$8_4%K8Yd`(gy}i!FuB?Nu0C-O8 z{qp&4cQ*j#npq!6*Arm(2vUFtJFwn%8V6S&a?EsR?C}-yRftv)x*c0Fb)8>U29mAS5Eqa68FUC;Ormf<2cSd6$nog&%r9#{Kvnt{ zlG&aow4rX@_2_y9gBoe}w2JuUw5C-KmE*4AGP?C-wrw-#ij;zi`+r;unk>mA4nA2< zYi`Q}6*&IR%JN+T=1qzDucH^1xWQfZxLxBrmqr!PA2woIxx~y`qprS7QQ1$^>&umw zVx4d^(C9C8UHdtksl=x}#w|_5&i!RLWTFP;yCX=~VWvG@bdTmV$924aaR9~gNEqDU z^2fUNYKlea@+3^3s%`zr;n{~bem$y|a`5rry7?s=$Jnr8=}ZTBf@?xeyI(B({a&P{ zuS1v2vDCA7?IVJi#kp>r4r2AdWWUqb_+pQbKZNPAiHg(J z9|WDPJ_6*bH&EhTmosJozucy3r6BjK0-eeV#pCO?jrwPpZQqjEhY@Z9$K}Kr0x+Le z$X;9jfSpaX&?1zsBt!SAEDypQrc|$RF>kGaM9ug{bmHTXz7+D5d%)&VI562a@;#x{ zvJ=nt*WIGdN%)dkTG!2lu+IrAtX-hu4GnTnaRd)!cre!nTyW5%;$+vn(s9v?{nxGG z8LrtQZ@ib5ER}D_GM0oX(tWNx1`=UMn8iG6ZPS<-)E)C)f_LE?TkBM9- zaX^J2)vtAqGFq>EwCh|K(f-_XFBZUsNf4me 
zzsGC%T($4qfOFeM$+Z~s@Be~i1wCG}(AThZs6$p-4oVmEr*w=<=@ilAS45%p35uWi z>*Zj5eth3XQ0mpGyh35EUj{~_VSva=0bk%~tk|Eo#Ga^>^~Hd1{m=`(U&N`X?bb-d zPkbK#`V6x_E)Bk!bNyu=VUig5;04l$1~AJ)E|m##Rp?ed3Hm#wx@%-B8X zQpAw;kSaaLZ;1+O@DRDUw`R6~PY#5&&+J_^(02Cs&)mW1g+SI;?U<*6+l}zO!!tIR zg;7QpbgVktw_4?a*oHv>Maf__D}p7sS6!<5sg4x z)ceA`Vf>8!idoJc$`5wcBBmXH$BSx6>y2NdE^jHw(zP#QaQSW61aO#wkZs#$?_7P+ zuEcL7_c?V9?~)^qcPSRp2eFBYfc@GGx}=0o0-s}51&-Rk8q;FyeDygZd7gH<@LZwg zfJF-UTIcaU+@XNlT$VUsJ9(7mc3y`>Ff2@B!2|7(q6s3(dT#LJ7ZEEwv0z$G9w_R@OIq>%%eJ%EN9wog7;n~=+G+r8^(c-Dbuj8|g z_7*8MMVEQS#}YY4zT!EKfmD^khwQ&0*}12{Y7IlHcFZGnKy*x-Z&xB>^W12r%-S1z zgN`fc0PJi^ceeJLgTKkwE-P^N&u93DX?~r?roB+9*hwU|5~WC* zC(#|OL2^%6#61sEV?$Y>_saOiCXU`)`oU?|yPNmDLW2&;O4twPrK({v1uH$ZQW;Hy zD=l$+rr0-7mGnOpT>QkHqU1dUvL5hQopTQoN-CCm?UUn17os%-k>=#1;^oUmU8-@4 zm$@Uq)+Ic}T1H>zh}BCF3^2yLuxS}|=)rBA6j3LY3zkCOOtt^Stp5A+sOuTYo>U3T zCduCWY7i{1mJ}NKd(vdye6!=)VL7CjuOsFS#-;$pImu4CVvW8b{eQF4e8#+El;uEV z*0$Aw$d|+qeeJdu^U3}+TDNyTr!E4}GF(JWgcMt!gL@Z!-FYJ93ghBgUd0ps>ZPt} ztP9!Uo3WX`4Nwi9!Q<<{cVJQ=Z5%#!cy9MyWZHh2JjuIr=xpByS;GdF+>9k-E#Eho zlP+^~{d1Usc{PQ0^4Q?W(UOZiep~cjoD81y!n|V1;Rz+W`_+qk%ar#H_d{naAWoko zd$pE%fXVAd7_!(JX7v%Cc6V`nS92B*|I7Dym8yZ5CZC%Zkc~6c&oq3=u#k>0CBt{~ z_D*mG8Wzx-#W9)Y*9t}u@0J& zd5zp?dQeKVMztelKow#+24XWEICd594YaJUwhGT?T z1~tAo8LESxn;haEUy$XDdc;N@V))s2PW!^B;o|qRm4LjC2^M-Y?%;Vz%D9Qrg<=Dq z_uj`W%;cRn52dS@gIUl*x`%Ml`cq0necu@{lw#%w>Z^#0k4UN`SWT(wKzqD-*<2=K z58nxjy3IKMM)snbIhnUyI1rRHUDphs%dB;vaV+0J?s@dYX=5035drstDI}vv^&%7Q z#gm`wrd#1L?v{kR+~`6GnhslN{PO4uPg*B8tyh3-KS z2avPZkVzlyi{nF;3w{LmRX<~aot(($u~R5ytsWTxYGc97W2&{2TYgwk>&O;qFdOmS zt*_Dg^}=q?eqD&df%l0#EtA>b#@=$$+4;z(v+)ShXqijj&)ac8Dd;+Sx%E&8uBgDs znPYvkQ%xmelAS{AH{f)CvzAHQJcKaBf}nV$^qmy0gm(rtq1Qx5*lE4UgdvV46HwCc z=+!m?6SJ!75=t*OxtQcGRtKSAG57)7+I|2jh=2D#I zIJuOiI#+Mf$bRjede;V^El-+nmk6qBhwH@7ZPwfQb#doCx27Ac^%g7i9NB%~vs}ha zPZaJq13#6mdapwO4;AG)6VXC|3#}_R;72E~1RvOgUw!3*mVX*;BP@wxYOmJ1;tiGo z%@Y~7g-0*?ngtn~j#$CC&tsg?<~5VRowhmOnbaxp*}I<=nxbu#^1ynYZ2S+Mt`Kh20pWVnK4-=tp6SQ`364qu 
zkCvCmA~Vv*$ZKPm?@dg@*eTyOr+`&fP}t+4eDBl9evTss2+ttW$5L_KZzJlqg~m}o z^HNa@?qlg)o$Y8ug6VmZVw~!A3-uEpgnFj^sBpx~yQG|<6lS}2=yx@FHGxu_UCL*Y7MK^WK3PRp z{zA(M48I*GFT{EsJe=3eQPq5O1PA_R+iWF0o&}=>NTDX^-G~lQ<{Ga*_MFO;I#2O* zSQDH69aG|4uR=^-LYtMgmPUyFzZXCu%Lt>38Q&$5UvG64Hp8~fd*&QzQgo&CnwA*- zLP8?!1f;ZsF#w?#OdDRmKSXi(MT+w2et(IDBlg$&*{IxU7Y*lv8-AJ;LpvD}5`~Cy z0cpP`(Kpn0yWDlqErz>;B>5fYTSA>nF`=`+tUkjXH|?iOpciBNFe=~MKhE#jlF{AA zcU?4+FMs}8SL^!nuRb?EuhS0f4fB z3FZZY#bXgS7la@?Ri`?kCC~}~rnKB#V+wnAdu}=UzyC()Hd3rMRKjN~UR%Xti|2mB zj1hOrfb?fQ=+B^^epNhR7U*-Hc`Og)Qae997Pd(k{O(C5lwqvpI0xUa@#}D^x~_hf z*Fuuj2)?1ZV&GQ(eZRNvTBxfiCan$VcoydJ>zorn7p=69&fQ;!0&p}b!dJq3uS2ddJ?K_%5>lze-$Di(id>-=} zPB_|tYhU^38M7y9F$%ieCEEsywuc^7SNu4aeXmZlVgR;Z4FzZ1OP5ksw?w-oV8_I^g}`;X zI_79Qc%tRXu1@4tf{Nbamy8Ohj8FPR7v_jfwob{*L#u3>U@H zS*{ZaUkXRUOjU)ozQ3{*w!8qk$iav zx+4351gRw6j3LU*Z5BF#C4HwV_P;=2)ZK*`Qb!+9K?R7w$$%8#JkZO_YP0s^a$}y{`@WwwCQg_`3rookpS_t&bxd4B@4h~ z>_pay@d4E6*9NFxh(dinyom)7Dz=F4`s6i+9Pc#y2|;W{ps!<`r}u@;*6$)BK?p_F z@1I4c70DIEwSm_d{$s?{9h%{;>?XT??2zg-)rDc-D!vbYs3d{so8u$ z_BrPU)T`GT^|B@W?YC>gx$_F8Y|}jS{ryg#W}l58to7%S9~g)u$_WG#k9!nnT;U}m z#`?G`gX?XdiOj+%zWCJBe1fQaY8@52MIYLc{FG30U+DHJTk6QfdBAE?YOk#V_(~5+ zLASr)WU8nh$EE6KLwW5mVl*$1{h-QHQ!swP68@!P|N4{{4v>EXsJh&{N0)qx1OItP zt(dEmD))w87F8b(&`2v12$}6YhDehP-=K(psbsRDmN)1^gFzBQA8!-yK|qlduXeoZ zOCO_`#x%>Grde5OxyG#ZdMdHl_i|&kJh0MOR|)m109L?B%>Td8_pvk)^Uccd9?~s<7+o z6*W+E^}Wr7lRm~RYt5HS=wjsd5BSPBok1cuP;5g|0QQbj~#aTeu`lFq&!OVqy8MWlI(|wZW(sEkayJL zp06~M>}L(Lj99;2IUw_A^hRd>Ut#9zqASUhfS*jzcFP3v6v9hTlYA%pa)QgYgn@sy zz!{_`y0g*ItiFWs()Zg9$&z<^nu|}Cl#0I3~f-~o-luhCZ_aa1#2@+a3GMyqJ zb}g$<-(0hqH8u!`3ON(xhofw|7aIV_3JbCAnK@5XZV$s%c{p0IdYzF1mc^(PY-2k4 zy79qPi)Ttm9L2>Rg$*+r{kfA6*ZbK=pX23GVYoNqq4Q}Tx^f8{@q&X^=L1eSy(aVl z^8mL+{N{dAacH? 
zKKExOw^q&NSfcHLrpE1Jz%S~p;vdfoQ0ui%@6j>JINr1GZ@$<*g#JQ8Qzjb=pwkSu zGYMnbwEt>pB88fhxb%ap?<5FUQe|Ogl5!|4o&SvxFb{2Tk8AKaalwyL*GJpQ7^S+y z-#OzW%3K>o=u0`9%A;O>)zr^q&j&(!0Pzom>iLj;ara!bZGm%>HSU!zey&U2eLU;w zH@4Yk&Na1O6)A@U9JRd7cFq6beG^`I_!fd*Dg1AY!T%LyFAOrIs(LN@5?ztkU4?P? z2q_|qZ<@6&X<;C!B}rIa#Z=7eBFA@TU*S$1D zX#$P}FL^yqyY1gKviBpVQ&~OJ-0$Hr285YwNK_zp6{OW|h7tFBGHPJ?KFiidiENiK zBk1kc*~pLRPVd;6NJ}PlGl81fwj~tAMDw#hAfEK-R*s#BkCcNh)~|VdQ$GSBhY(lo>5k6;*y zw)s9PI9dee#Frcj3V7P`5;oVDpFZ)v41&U)^v$f_7wcT5Bth@=4{4zlN&*-4@E8}C zXrH-85Q>{-XObvaquX(s;Cgh@B!*I_&l&&(IW7hQd2(!9Pd}8GXWHIB(hd1R3z|(lA?k4OiE63o6s)qhF3CZnW)V|#Hm!Wg#De^LGR zU+G&6o&zagyAJ9QtQO+YN`J0Vq;YoKLv;uQomy%yL!KW$yKV`H6uxLooZ-YU30|9^ zO2BfT2lsIcA59#v(&uazyy!IV$miYXP$sJH)8exO2ofPAcbry&ab;a^Hzwvz@ZRe~ zqv`R6irc^wPfubX+GQ?SLmsGbzXK9`KYuFf+U#?3roObV?0FJR;(FSi2FXtME{=xb zicBfGU%E2&P%P=m&sWvv3)#5|QtBM^u`U5x3~@;>*Z3pW1^_0ki^NHj&&QkysNm{= z7~1nUYdo!FQYw90D!qmXCLZ2h=O0wa>`s+0y=+0T z?Yz$!zZ>}LicW+fqzaDj#$^LUnY_n8|5A&eyid3s#dnFV{g--H3N7=0q#f->6anSp zRp|dui<(It@12CR%9SCNKke;V7xgB#Z17S4%;G*0lKTF=tezrjd;kMU#wFxx3dOAj zUw6TBbgstJA5r)>o8{r-9I=AlToqqhIc7YrXBnW8D5z$h1rP9%@VVTsCC10biB9NJ zod+UuE9rMd>N)xc#VcFC>BblS*Dl+}?EyvsO3}Ohai<|0@6~g?afq`#mD{X!v3uhP z2EKW@UpCYJE-hT2hq&k9yJY4#38_hZo)UN>Id?MZLd&B*y9yE*7Q-!*6>)seh4q!>~@^wsW5x5FQ*5;UotXjcdW?L7Oh26Oo*Gm3v7rWO^{BH$iDW&^}$47IWTt}R0 zoGMCpZ$V|;vDMuPdxg(_QW#QE!%5Ecmu?QSDrWxq$!De9no+0vfJ7f3xC|U;qbV;< zeBjo@`{_znWO0UP6+5;`7ZN)^ z?Ys0dCyjE$C4*Ss1D1NG2W&Ukuslhj$6IPlxj&KX&z-5RHdu}TEj#yZS=2k~nQAhX@XnOCHUDF<*joWFBrM$cYm=wJ zcfl_Dz%T+dPhXCh->BrXkR35HDd+B_j*MW-JCcMmz=*tJRK29-?AAn^l`M^AbH}W-xgp zPH*~?Q&E=f?d(|{r8B)u8tD!}PT2mA9kt%k>(f1jTmnTu+UigI_-{|8f{b)+iVT~R z21N)9vRT7}h%~EOY6UB3nI)InraXQR;Hl$7oNy%s$N|K0VDoZ-1CW_b`paO({h zZH%I1@!&Am+Vwte%ZDU+usXKm-{V~qib;sC+O~;k80ZJQyUEl>kG63CiDY5)U|6+z zd;nQo@D#byKyY7MQ{1aqKHPjih8*b7-e--Utw6Sg--}DY@OYVRFlM=>dp>w_+UL&F?SCYRkatvMap3=>ub?}*<4jcGD zwIo%aWmYnVa_%icYx4zbo2Ja0e76Lj{5`O+X($BhNz|3ped_|a0A724 
zB0=VC`Ly3h|0UKaPwBT{DIVbjfmFhud1b=w@q#Ju4kR6p-uM_-8zu<0R$Sy|+n)^jy}`y7Dw^S8s6MvEGD8>e4s<{*fC#*!?Xm56{y3 zM=MPz)ASe0v;Gs&2s5nVIFYzsIsw+)A2d<^83z z)WVh#G9F)|?b%I+Mm#eTOUTsbH5FTaa+{ic614v!Q;^m6$J)S@`W(nN`bBv3OYE-n zy^b`F7%MpffY^?w1it3x5#;KJULrkVWMN zR`1y<5J3XqtKklHuQVJ`#(Gf^+MKf#FC9++{!GUA!Dre^e-`kSu;NhRTDdcvHT({zgCuebQ~!x~%lDQsf6ZA!_#EZkU0@H*9A+60y)Kq`F> z(xq$|GeZW=g{^n$MSwp;^qA1k$(ycU%KxR}{{JDW$008Ypie^h-4db5xyJZlhisMK zs?_n=9D5^m ze2)sgMZAnj!7n+#x6*uykP*kF*v~PD;4j5}ATIGh$)UhckW3wgod3txUq;2XHC>=E z78(h`-5r9vy9aj&?%KEp8h3Zs;O_1cf_rdxcME#^ob$Zj_{O;RUyuE-_pViIR?S(n z?h0j3kND>C7dXdVNE{1k(M|Vm;QcW9IbSLD7?Ah<#lG;sFy*!_nonl&U}*nBI`Hod z%8X3KCaqaCw0!1-k{N9YbmN8zHOMXNNKl1d}y-x=7f;}(1Y|Z1!caJ z{;zSK(gIj|H2GZWXn?HY@6Tz2BuVn*e|5 zIuk_mR2mUl<6tLA&RDFm^Hp~T7hMUt#pBAsp9S&`4K|WQ<-w-un&w361fL;erZ!ZP zsec1^=a16FIgfNOD26o5&B>L+?7@m^@f3t2XB8Cx-DxEF@JspLi*wSP{PfO>F%H`1L0eU@&}I|auskkps7{EqscF&{U92AR>d;bf8E(Y7iUqWY z(Bls(eQ{+f0^f5APV0nEygSC98be$gH4QNQ1a*I+$g|2s2AHb)$p3c{2YTcwh$8Q# zAygCLpd#N9q{FMJ1!jryQjv=T(gMQK6e|*%k!%Pgb&J)K*x@aRZZM3V6i*0F98xbJ zGh6ZQl~Ewi=3$g*b3Le1r;i*-$_ICqf}}}SPhLO!r18VZ@r;Kf6GGh%LJKC~WkDg4 zB!qQx@1?t-DJjk5up)_RG|Sk^LJb>PNkq|88fUhrN>jMDmd_89+EH_Sns%A&m-!ASD>{oGc_uuTt8ST;FcEfoUN z>~TK0E+@bB*1j}XyN&ASc+}cMcgsr5ab~cY;`PFvth68<*GK#V)yl%mdQvr3$HgSQsa;921tg>W|ph{(9>g^E|5BaYNoc3A~Dt#Y0z=&6vd8 zD8ydAN-U<j7l9fCe`lFU3v80}YV?_0oV3Hs;m5KfY|4qWM2IY_9ebK@ zHDbQw#4mz$3p}?ih!Y;uZXN`Qms*qwhDr`+aL8zUvRlu)a65?dN~^MRWb&7{n{YN=Og5^D@_e~| zgwdnodWNfc@-FxOU~3*r!b(0Qu0a}i#4cM{;~|^b=Vr3+oKF!J2@D|+-&B6aml`VI zYKw^7nzGUIkWLJ$_t+sX_|BnK?zzdd@ut^F-HwZ3a)d5G}#A9h(e?P)-N@EyR zXugG4h@j*6j6VLUjqVR6lm7^u#EwjaG~ny7*4O=0Uw=OM2ZE+qeC_;UILN;x=E8PkGaN%hU*JEzE?=fB81nSx zHbF@lk~wNCDJ;(xxj>iNh%zS#m)n#Fx~ZSBF8HTP(3yX%!3 z>S`mmB&rLV5gIgVfx&*XcnB-ud3MtyxL*n|{fyEbdhZDGwaYGj%+d zIBBHt1093nY6C{6=@`62^2qOb<#68ZD*lCxLa%gwu}5Ix6j3u!>zn^sW9VN$@pXTk zy6!5q3`x@sOXQze`cC22ZI8e*x?6?lCXIhZxXda1%zOPe*@Ty%wDDPyh5%VaK2-Nf(F3L#nPeflv zlS~z2epWW%6vUAJafLs|yb01*nemxb^3!VT=d!)}iEg7DofO1G7UCv5aw%$v`xS%Q 
zc*LyEAn{l`Wv?Jzourv^ShA{>`!F@T!l zE47n!AUPCoZW!sf6<&O_UJx55I_DmCj`vpFIelt)!f;?`q54b0WD~7lN^FN2#k!K` z(&0}8QK)Z*DP#Pj@YYB0Tc{5Xzlh5fQd;#r%Qnig>BdTn3@EFDv9p@A1egPej=B%p zPdq`KZ^~BRW`E=yQa%DletU5|$HQ>6ctep*SqEAeaf9%O2gM-g_q}Nb-w>tehA8iV z(F9m(`8b716J%p$1(~o%L$Q=GtRaa+vE2-yez#;PBm3u?KECP(d%7ddogL#!8l6^$ z{HprVuXJ_Ee;WT)i$wvZ&kt%73M3x}@B9r(NaSP8D6?NNAmk|uau)_UP|gZKth_uh z7_wg+XOX1E;>9>!6Y9esP1@Nv1a^gT3YA$#2pYZTw|s`bH_5?63`*kB7|0SrJTHkw#VD(u7kpk%n- zo4RPu2NH}@kvhrkd_M=m{kU(^VB4%KVoh3T`uaU&A?d;^5)y^(BB>_;il+FO%(L1b zs^(-bo5ZZsHW_Mp8fktWJfS3oyypM)*mwoNfRpb{*#>-boI!qBC8%Q&c8{6~viJsb z3^g=k@Ht#8xj;}A@&pNvC!;W?R555zO3xUlSQzE5RSo;ZpHle?CQD?c1;(e@L8!S5 zw%%@b_A*L1f+PtAiufyWK7Mz8==?ouJ=VCgxB=_L0E5C$_JIbXr016;1+^;fBqFEN zgVgfr1zCcTxoa;LY5YmaX0urPq9yF8sq(Xmi;1aZtgMuk;Hbsihtvl%>@Ur|9FOeB z;)IW^Ob4BL;fSV?PA81Qv+?f2xk`7yAKYVJ^JbqnT9kWi3YLE22=8>9%XTDW@E8rF zQzZC79wB=uJU%@DgR7$JfU3(H_E1xvD3{Z)oR!@SPkTK{$xbxzW;=N9<_1{o$LSwD5>8PBU#O`XO?&a zwTV|J%P0BGUbLfFoe3Hp>%1}fXVKqP3;KOo%5urtkRrHUb}V*ohA(tJQJMX(qy57R z3@9HTWP>%3qrw2(`nOkUL6jlzrB+)~@%vO6M%g2Qj84toL5aB|`P$e?eV@!Mf+Xvs ztP`?ose&sT#W+!hN|D>BIAvF9C_I(aFkNANcZ{UJ@q3EpKPa1}v!<&^R;zCnqI$yp zCdJo!*vnVl$_Ur^-J&M4Chp77GB)v?aMKw0(4b4zA`YbMTh{Lz4h`@~H1OLx3cV!gPG>7KY;Nr{#GQ8jTp zsqi%cEXKJW6cN;rFFO^kVuQ?m!$u_p;h%(`)-q5ixuk7jp=_+T_#1*}kUdpNJnOR_ zc`&y(!!d_Mv-^CQz>uaNndn)zjZI=%o}IQwr)iPjI+R z2|O-kyKiVy=WFgvm1SrMTqWH?@XnZ@B1DO0EbYY=`lKb65JWD`K_xg+I=>&XeG1i8 z@?gL8f^x1grJSY6JrYysb?UQ=;f(r{#=U&j-nPhQ8km~W%ArD}0ST40a1x+|KD9JE za7+`PLaNNAaaUd?%IfBzQ$L{qkfjk^BKcNXd9zj6P%xZZQQ^!gWQrzczNkRb;U@uBM9D*W*jL4QNNHSyK#$*(LHsUScSejWYhh>j>y8WCrV zy)#@&Es-%ikDo(j-p>SzVDIl=5hxn#@_#`$&Kfj}@n;1kmVfGyu{mghw8*|Ta+0mu ziQ2voSZkz>kdmZeC)IlrW2N^mNTxRszm5}IP_c~bFYbswqpJ58@b?NH!i#K$aYD(d zV8^o~^^T}&NNoU3ZXW#HZ&~v279i7q(w!eO&J7mIwpAXAc(E1%$7N30j}b?X_hm-_ zh{M?ydE=kuvT3f3#L8>_q6u78*@I*gC9riJ`p~TCM3|6|$bhJx#Bg9Z$snSQpnrXZ z$7q(nriQM0W4;`|vXOY-Ni3b%8C`2Q-aZ$ryQy(B)m=1_-%)VR&Tvj<(3g32XMclK zVGx)VBDnuJD&`wH-WbnwK9bZ7Ry*$>uFLxLCMgre+NI-ALmWaDj-`A{0R!ECDiUx_ 
zf`x1uWxDx2CiWzdr|w9KoQsOO0ednF#QAbyj#_mB%w=#y3mH*Wa2bj`zeDT6VQtLd z+>^#Sn|Z*?8r&aba(oR`A*znN_u0e>a027>7!qqL^#aHQNM5_x!N4kKZBosf@RGtb z@gl*vQM;P}e?!`zqQ z-{HKTVa;U>L$55BTNca0ph7dTE8K*fcXK|=8e2r3EC;OEBU*!5R}F?E8T3+1#61X- z)@t;{ZGQIJ|DlS$e{K`N<3Ef|lplaQ%-QczM6h8aLK@vjVPF>HJ8oAxp#dz?PrAPV zGh3*rLXyW%#KZr5~^Zu6<^FhENu*QhcL({6w7B<8hXyJqtJUoXo@#x!lEj zUu{w0s9`}07%r3`E(2|*u@UqeN~Wu?)ZJweTW$a;8&N20!9sw~+o{3o+vBEvD%f@= z&IOYv4dB4wb_XpseSD&u1JYT;A80!GbOPf|3N@=oYS7_1d2xS|Y8y@w@_}1z5VF0}*qbv#o5 zR1&{Fbj+yCL{{@(3sMqA!vY-4v_Jodag_v^!vH641cBVe0TlqrLFkDs0#x~TM)?6j zj_2MBND-~+Q;QvOs~geHk;vK_F|6$Z846xx9Fh)rD7_v4BFy2VGJ727pyFy_L2szS z471j@;#Rs6RjNwEX67zlGU?d9nBptJddsCD$I8ZQWA?ot$E?Zemzj`m)`yVifUp*8uok5g~HqgvnY|8p<7+-$_A*raUoM$Rpzksn+gnrko-szr(b%2rb+cpZ>`&*KzG%x+iwS^#C$4+9 zetqUlAA6X2YTmS&=X|tpw#lFSHIIavmvO<{7j%sS zrh5hVqV1uEx+~%#wa!p&d1rNbVsTPEs1gwmfW37FMfmN zPCe{vq_G>?6{b89zobJ(#$z*r&a0KL5H`DeMA?rVpyg)mIP~lJvEe9I7Uea@aO%Ivn=ay9d zruq%eU@6!p$_KPPrwcdg$*TuL9R8tb)jVzc&rO3Fd0ytIk6(2g3?p|SKQ9o}a5#`_ zrkB95T8KjA!GA`hiYhWK3=48Ulo9iw_AS{Nbe{`EsxOKZfvb@I%$Zf-4}oVUm6!3! zNnDN1ux4Ri3U9WdD#&#J^4s4*nPfme1G3d9OuLF)xQezYtZ*nmT7OU57N-L;3+^H$ zp4%wB#IJtRX?}lda7Ixa(*xkQ0&P?>xcm`ZRkH0#qWSVUwp|734uYwD=Y98AX($P% zlF{TKej~nQ*k-Gb!pitaD!&=e^`u1993`;7;o>R(rMVDkIo|bzxZEC`tWLoyu7_U9 zV=d9m=E6|T2q%%R4C}P`;rx-&es;YM&zv_&-HJv*p`9^4wrQYD50d=VLJWi3|4J>? zo8GlH=kwRwsnk2_9SRrsmNNB8TO>ZUD5jX-xULGcHsDFC)*}~8>UflOuave??fv2@ z3noRLtaeJ^H&_gb+<{*~4#V3Nw6ro~=e$pmlIH{C9#9s>#wINDxmI@FdV;jf#l?v=fBo_QTQE8N;c* zmdAf`R6hM`ZO1JUlK*Z70l)hzhLl~1?O+2dO;YHW!>N~E2UPotPG~z-R>69{!Mm)9 z73b&t=O;T+y1IQjI41?(pa0*<|EF`JJ)Fk;lok$;xH0+nQYf- zDdB{^5GFAa16b%`nE`U>X1J6_dz=A`KH5adD+fDld2fXzrI=89+QVrnqgk-zLVxy(&lErfGv~=QE!gkv#ow z*j$7~uyfU4v{iWjCImKev!>yb%O9^LsF z*n56LUT7&3nii{y!fXg*A)p7eEox4hj)cQ(LrtIiT zB9ALeY`v3OrXcJSweIFn=!cv{Pf>Gzw)`g6zR^9<({)CX<9~yM+{pa6#?aBs5?NF2 zhE66!YZFK3CSM)SYY(T0-*UBP;8Ivu>0Dqhmt4j3PeGH^5MeGJ*Zz{Xc)$m(~j(tN5#? 
zq|3to9uw!og8Q)bPG;4@Mc&@j6bvMiE&Wwfm(p)Uj>jZ(g{(Ko1fy1hvhb_#XVe0M z<<4ITsQk5)Op5t^z*x*~+yBMJe*zr2U~n@h3}8H1Y>PCXA7qAfqHJFe9@)YVAPQDt zoHfi*RiiYZMj0xBVk+74lm;%!8A3xZ)7TS~R&( zV7Sq=CrWQd++jx6ofJaAoQN z-Xms`N@wJ_kNK;Q$qQ}2OFJc$wH$-M1^%0|z;YsuGurLDW3d8F)mPFo0COGcBbbLA zJ>%qd4G}a8YOA80H&SfuMgm@!sIpV8j%Hm=E}(AZD!j?_aHp; z$NDb(i#o1?Ans~CE7qv;H#3D=&ApcgKV>sH*rMcOdpJcd(rmQBH)zf=oje(U+Ys3?m@81F~wYUed9abM4O&4Ri+ zlRH+_MXE!WQ`Wew!|@;!-F^xoQn>}x6chD}x+nR97Pt0nn;7aGs;t!M^c$X*zC%E<5Y=I?-Kixe2bqZZ4t3melNQ6Qb=hNgC1i)NT_0!1Gx zsdhY3etOMm}oC&lEH8xWQ_bNb{Yr6k?qjaOtEMHWzeIkP( zQ^leriN8SebpDJy&>EBqbRuK(x>9D*e>TU?6U$~fqEQeRiwdDT8>@WFEfWKn)t$y( zQ|0hs1*Y*60rLFta<#FxV|T}BcKU`vxn>n%18B5leylu`h*oc?*3O^UJ-MwAD$k}L zNY~Ia$~}fujEFzVzIaJOQ|O*?%)E}ysZUFM!(}@tcd(jfHC;KFoinhOzCxAIN#boN zq;X0m_lw~7%zRyjI1rOWSFNQ-6OKrpnr}r3x(m)dV%a3-8(#nE^oc!eJ(n^2`6jip zPQCS2lKT5e8%gS{JPPz8>AX7D zCA4kUo+Ct-L^Li{jOpDfpAR5)eLd|DCCX0m@1UQX&mg4J_LFq_Vd-|eRW`9Yqr)T} z5hD5blb*mG#>q9f$8zzlyQIt2*;Ku1^W@T)ZU+A@54c^6nTG&Ik}0ukv=%4I?#{DI z-;WOYG-jiH*2?dy|ID`~-T6@=&h)u`9=HlBHeu@5nXlS6_slK_he;;srU)@j52(W-6g<<>TSAYedE@ zRAM~oRq#ONws_otQqKfg|G1=qAC1KHq+_V_OkOOL&gHs-6cuRTsRf_t6o{p8o3c*a z3;59XU@=Dsz}v-#h8=o26b0WOyOpE*S7$eSc$$~e&0MJj1vIfr3;q^}9x+dNKUx`3*M-o%Eb>feD?I#KPyd2R@)@rnsHykE{?11;el1CRnG7fN$ zO@Tum6AyJVI4ddd4PgB%AP~qryN#iYrHSPLHZC$1mG)iKuY^yrRcc2WK^!pU4HW#X zjP1Hr$r-OaZUi{a5_gmK*_pw@)U$$(&STsfjHd~b29UOGmfo7z^Q@*Lj7O44XJxH&Jg_1Mn|MkXVx*5_<&E$BrU4E4R z!b$DM*6{I@A&4)g*cnsNee(c#!(lMF{iYAD1&|caLFl=JAiTu@nrCnIRljbAXO`4{ z#W-RpQ;5lWUshVSq)+TNbS6HNJcmmHk8380RPx_(Cn`Aq0RsGg_6d*yek7rA zr=`mg92H@5fl{|Pq+gl5vUnkUjJ^5c&fj4T^%0uV!#`6?vP46|edquFSt-;g4_)7=Awvt~lms{%Oi{7*^{wT}}ZqR~c z_(q*l^{inPV%zMU+p4HiC^}`gM|HAU%_6Q#%l@l_imruS%J&^cn6pAv+}gDP(^8Uv zB(vE_0_y9EO{sJpJ@7UqM^X0h;R0Zwqz(mH~@v34jJ@8c@GFl$#7oQjfi z@ALC|T@%H=G;tb^N(8%#{sqt4KfwQ33T&U7OVkU8BY6MYnw8K`oj&9C;Ul~$%PokL zA0D1tuRlSd*Xd;AaZpHKYN83+_4XLz5 zQZ`!-t!=<)l=fH9=eKdhz62Q4%w^G)D#@TpPUkln*>+oijivIvbU!o7D|-pH3_E0L-{XoUS} z>bIo)4tw!ZD+@aIz#5jsKg=v|h 
zWVV-(^Ut&b{!geImklkP2m_q1g`X!AoQTb;hG4x>HFkbRz$<25)xev=U1QD=(5^m8 zJrz=8q9|kv5GQA1m@Fh+``5(5avbAWC{ZsDqCP+GC!M32~D61zAX`p{Dc~2-RG&CTABm7nRr$t`xNX#B3<5+rP zD?Omc@N-z$0Hh8&Iw@M-7jlV}gL)V^lvWH(9!yal5;3Zgy}+MDl{rO{3tCiq@Z|ja z^_`t)t}dtfXZp5dk96#6gb4xF~lL9cb( zZv`S>z$~o95#e|MF&9e>o+fh**P$UekiUuVb?Kyd`Zj_x-9u?jYhUo5&JZPLW$pSc zHMAMcZjU3l2_!aC9`~G%S1I^_hWf>g!OSVLe2Fo&`^n!3i!PEj5hZoOEkUt=TY?@p zUWR|vHP!ily@qGtaMepKnzG>gBS`7we(a}7p-_DX!3`-Lz4cJx!v9V?+7&D{ww$&M zVy(~(u1LjD$%C=M!AzKI$;y&j7gUz<6dYwq9bVe#cZI^@JPQ8=Ll8K!XTX;@QUcyb zLybII3=&L_U=T6H*;fs5!0>n6$Q*Hta^mRq3l21wt;HHEvBqDpPw=~$piayour zwaLaBM^@aY7?$(TWq8QScQdwG+JE|d6HP9 zCRP!q@+a;{(HpeT2@%9;b^j+7xV!l$KkO$XR1j&!5E=IJI_`*fJZ^ZM*V|k)3%KP3 zkqVJ@M$xnD?+y;f#wUnU60Zp)i17^Fy=ze(+0#mhzB>B|x3# zuucfYqv1o1x8QJTtj7~p&Wp-dX{oKW*!YE5Yx2!IC4OwdmfRasX)1XtLDcl7zNqlA zcS>M(=e6Auc&w5G>LR!tU6TFgLu|HXT0ks|UOtZuy^TJn<(8yfg5OnJXP*i3)Zi@8km#zIzDVZ)WR!HzxPMivZ#7`{c7@Ino= zU=K>Lm=Vn(RUKs!wUo@{k8HADG10YlCsa7ENWq0h$=UUdDR_&7q{bOdXEJ_aemd}( zjHt1~NRTx{?}gsZNRp2q&;HzblaR;IQiRd(MG{OAb?+CM zUwB;1;HhQ)?fqw`?hEI$zqP{&_Ybi3jZrNHsj%DO@`l}$!%3G5rgwen!(kmgNXP2B z{vPPty3gqK3%|!2ng1#5yz_RF6ZCwiAENos>jS^QFpn+unjgvMU5^{&WL||qL~ct) zXrN`L$tKy!7lY42f>@qW;XXMZqhu#8agu3H(m*N}|Ha57=ETtvHF@*m>)voww$_t< zp4+i}_;3j-g{cJUM8yzTNml>VaCY(0HNhk+Xlh~FOx3>E0t#%Tia}90DSP6K5E|) zAKyR8oOWy6Uc304IQ`cDDSD=l{oB#n=ZL`qA7(P5r#)&Rt3?CfuD7S(;B774hp7)c z*n(plw+mIkm*c6US?q!yAH?66VHiIQYXQ2by7MEyNc?Cu-aUg4E{i}`YxdENzE61eC$j86MZh4BhB-sCLYOV>g!AHDP2-f1cE&THC|c3i z$spoFAwLLwpKdZ9iAs9aKLw?1DKrssQeYnkq~y*2;Z7B9NrbFOqym9c39N@S#Njt+ zjjV{nLTY7;o2kJwCiit4gSQ8Xhx^O-AtE1Be_-o&h}(j;1qYovZ<}@155vD6Th6&J zS?exy@(;JUA8Y!~PgCdnUUos&Xk(eR zZEtGBFNiMQ2_4J5B7KPqgg1x+$<$k8f|EPvI$Pb@kdE))}R zQ*uLvs2o~l3%|a2S|a@%uGZG9twcH+E$c$cZ)9r=Ow}mP9S@vb&k*V$zBGz(o zrk%CWQy+4FYcgBM!-~y0lkeX#bi01*{X6rfaxk1W)E1~c%<;neeT?kA1?@KT@jh&> z=b7HMRm9Z7(7Z0<#aE#j%krmXC9nAGwCi??D3ctT6c4`bnTw{W$64+SVJQk%P%uHr zEH2W-{L4t&dWeRlSdNBtWQYj~oEW3)kIvrlq7i)@5!~0)xf83H89Q?7?7wn-plttz z2y3|D!dm}l;m`EO04qPeVB0>&pI_3%I z<;ls%OSVS!DvhMm}2*clt 
z8Tj$G`2qZRu>UasO#o`lcAp{SEw*oK?y~DCYu=3HYQE3uZfCcNT7+TyMsTlfC;7o1 zqqfgUK+!A03>}!KDT$KkFwE!oMKcVll}b1;m7YP76){4U7KWR4{EGda+AD;PXv=d? z#;nl7xK`{hKTRsYjOWcRXlpa9x)BRR!CLv-lVP^`F_Y{z$yD7EjJQq=AaL-@ z_P!Zup5#4^s{><8)Z5;ZiWA-teXymc=A>GlGM#jM7wzQjt=qzl?$C5TfQf5Y$u*5S_FaGMI?g#)EeoPG z6v!*o|LLYG!XtNnyj`|1w5-iORgACN_pDF8FVMR61${JDxvecXXVfKDHM~SC{%hVF2eK7R-i_r16U`?lxRxog^(|BNUS%^swgP)I zfDIFzZYO0WD{ia6k{|zX?1%%2>~PtL?E5tE@VwZw5hL)R^|H|NK2Y6d#27PnR*>5< z?zZM8;C0Q2zY?im>o320b_|EXJusz=h z*{6*+Vpc%B>!HDA1EMb|@$D>?YiWvwXCSxOVg>A(eYQ*d(R$u~S`AHn-}*E(*h6c&Z&Ix{(jp_6 zrNi%2J^)GjpjWP*n^ib}HB9m=&f_x&sQrTfC<26AtBPzw0aB!gn4kc0)Fiid3qfpC z{aiF;@B7d*WO8o7 zKgYRltpoOv(^(Cf%CR=7OHgO=x%;!$9k8oPY``5hsK1f>#hBj^7LOWQZ}5ufG;!ds zbe@dL=r66YI;Tdgd56P89Io&=AuAHLA+YtH;I2W0C7J{x>XkCi9NZ$S;c%)5%#3l& z=&MOT?g!8E32Er=la2zlkwO|q{{l^)Ab2E-H8-}zMX{RP25p;4l4baZxpgepH;=HQXMIzU<=pAHF_!Do?8o>P8&wk`0ehEH+7H^l`JqIOnLfVR zdM_PxXDf7!zsnDMB9ludxHfgCTJ1ds@EiNPMdn5S#azP3{}ALv3T1gb`T=H6K2d8egid`G75gF%~OP$5V|}ExNxoaPeCzq-5a$ zICvyiR#Zi}(ir}+S7cV1=$4-_3(fQT=4&abZeVLlVS=j3`*!}olTJh6iYqC|Wmx1T z-ovGN`~6Xo1akG^9qU1c6laf8oTmdC?_bXN%+J@(boQQ}V0J>f->eRAfd* z?gP`&_NNUW9;tdh_XHh#5kg?A=v*N9fx8s>(`rv~P0v_H4C;s67t&FIq3nJ6KbE79Jq{(o7$km*!TkObrI$rpD;r#(B~qY%@p) zZ|&rJ+jH}s@orGj*=q0N-D43wcxnx>DYSN@*#QMwwXbf)1g$b~9x9$gGjJ&8@N0us zbvACWGI?FUiu{j-1&@>$0G`Gu?ZzAjQKgn*eeTF#Cf&OW3|J?+PHWtb^$wScgoJ(% zZOJ}a0QvuLX0+qRls#0AOf@@Rvq};VOh-CH;qGSWUjcL)w}SQfU|FNha8~Z3pwg%U z!utoA9lcHSvu0Qb=tJmZ7Qa_zijkW-|4g0h-djYxl0HP4r-g3Ha-l*3^{3%|B#rba zFXaSUbGs`L{ z7lv%mw#F9`%*+^#hz4CTNIa*pPgp?nNr-$W02OE%rUy(D*+EsIwTeJsB)rHxgfCcK zbhuZqKZj=8J|H!m9pG2DfW5!%o<@G9y4+|A+5ec!j9Ts9OGO_1KXFhJ77}pkn0HL-X}Q^NGMn_W<&fb9Dd|_vylu2?FCEi+tr>f z8(+|~a|`|0x`E=yNwunahueAiv+to=c9&u{Z_lC6{mGwhgFE-`+ia)ZXs#6C>SM~O zoz{5{RTMGTLNv!=I?J=}stxAA#B#|L(_G22b+A5R3+9^4;5s5#*gY1*k$$nK#a!4T zKBk3;oOu(RcmkXlujPabjw|cpWr+0CM84GbHAlKMpd8P+D@7M2s?mh_U<3RbsBNs4 z%D{FyVNM^Vt!u!H_n=-;$|_*lch=XPT z!x(2b7;9xK2G%p8N=dT4sr+tsOFzG1%dOq6hp%Fxg-6Q4MhA{qG0ms0Q3>oa@i(8h 
z00W6w0&k&XmzEx;d5ISEfyFu@i2#oq#^(GH3K!y-*C5i)4#;yPT4#mNmg7g6wR61mW0X&sC)i7h;|C zz_VtyObWw;JYz>Vr0NQSZ`zeBo-#{8fTHB5c9L1(N8I{Fa$W>atJ+`_nT;apK(WMh6Y@d9&4E8cw4M$QE zn4n%3QWEkzmOct9Qs!DXdY$Gd*fGn_fyM^^w#6NF6 zzVk_SEWKoswuB*9RY8xpBCFk2LECNi%xQK9_t!=om*!hFHuk4L_2sZw7R;vz2rF3J*ias^hO?FFsv$uWt{ogVM zzEN~VJKES9pHU4CA$5)Fl`}dh&8p?umAN);MB=B`tpLPP8~xi1r)9I$zZ&=}r)}X% zA46Ur*)J_IGZfL7l~!>Z9q#sR_X|2YCaYjzpzLN&|SEY#V0En|_7HuM=J_ zNg5@d2L=$^dWSQI4Tdf*x$Rx$*B_T%)SWxxuM#zdTE_}Dj|_EGy|Yks3=cYOT!JN!b*FvOEeE2(RX##=%#I}idQ6KL|&J@+g z*rq~@?aY1OazS@IIFn@-S~=M1Sq*e(Gg*S$*TM+~B3?X+2k5 z@3|WSUj6i3Z#B?fKSNY$tJ||6$hdTy@38Pof9D@=WUMGv|2t#P>x~l_sGFyV{u|hi z_q3rD%u2lCe4L zP$H#&z*iURvgQ%&UpfNF?r3U-ZxruZE_b<6;ZMI08MEJ_-yp9@{`Oa&WVDdwcf-nU z;-y58fcTv>dk_q0s$nXf)ZA;icINfq-0W=#Xto|=F}iZEn7OI3P`SRca@eTcikZz~ zBFXc(=Dcp=F(fR8C)T*uOP03?2f#8Y1Ve+i&g*D<7sZUa^BF1PbC%|O2>Z3xy2{no zu{R_kPqc-U(-S<^-;Fqn3cCDrL%c(cR#=ntkyvsqbqs<*IM4`-RDx0{`1Ve=VqYV6 zSfYW)h-yKRpLk9(6w>0!-@GFYR_ub&sUr67*=yO zzBjK#Jd~{4Gb*lrJ!E)5F`82DR`$#B?@RZ51`8)2kr8hk`|{sWx4PriV*WYOxRAJD z`l+U%86Hy1FOyIMeP{DF=dqD_8?!x>=*RFIc`Uz*Mz!uSY_@uZ#`16Shm z%gCq(2El`=?C)5UnFrfYh}X{MiVO`C!U7fx7`LkDA&ySzvX&v)j0ORg?nd_^v6);w z&UY_uKU7nRu`6IRFa892YX0f|F+G@0pn)_vVKVHG$7uJNjmRa6aF1H?AYt-H0M_pu z?}@DLU8&JYJuje5l7LyqU*M{HDsbwIaSFwENCVw=UAsALGpgm!giiCtSFfD&cWFZd zr@lB)pQ6y)ofFvLvh{EX+4pDH0Sy;7T%k(eM5nF=3WT$ud7$s+%f@1P5|>bh34Vz6 zPz&`Lhq6XNm3GSjh*p5Oy9lD0DMN^9h!Y%jZJ%z7SC`C9GNMfyfVKl(KVVJPC{)?C z^?|!VHEst4%&{70VgLYsk*+LN%um~-h0!B^ix^ki+gYlfvT-u#W!eF{kQu@P!^>)> zAD3%NxHjM&HuMcFK&dFJDElaPotzCTds~-Sn<)Fg@x9$eSY9vYx;)@Tyk3N0+iraY zI5TEu1JT5yO7#)}$SP8nNJp+cUnmJb+oJ(GsDu~9&1Vj(sY3GVF zS$-7b7h0q#;|<)gv*{~l3s%Xc^LEK)+BwP&aJjCqdl~ZdsUy$F4m6LadNtLV4?i^G zhSaH<7#~~?r)fcg3NIGV*lqgyQdm}K-lw?_^lp9|j~6AsOVTvdm5kR;h*_{n>PYx_ z9eIxrk8cV_d%-Cxg{O=8goTx@OKaSfGB(6_ZEi%YH`!P$1GO#=AjAIZ@wumwdTK6P zJrlzop1ajFn!K=?{bpOu_EFpBNZdpRp}j#1P~0PHaI=MufsZH&46nbRWTrx1C2vqK z1W7NQOXpCOy?f@FD_zvg8OPgRqT=(Z%jrTD-SP6{!&cuwb3HKi<>3ZYRI(yiHCLk0 
z*i}lMj5|lIEzLixN^d_}c-_3>Xq?VJvXe-?q;IsVvkkKvXxIP5*b&ce&&7*w%d~X8 z%g~5F#;DUaaRTgKk>g&7;#<>=p`3`$UB9Rk!ly%7)5)qaxb;V}lYAuRSNlQ$*ONOv z2N?6wN&kJq|Cz??@q%e8WAjB9DLiwt&C7n-%q#I?bUKQyFkf{s(sl04+;b#>(oSip zG)g%{VE$?>mFbcF<$4539>z3?=ylT!?DMeSR0FDdF}HaFdLZGMxEPq=!=6ENHUpO$ z$3P9QGdpJ@*(U!FU0)p#)waG(Nl1%GN|(~jz|fKs0@6qiA)V5QAR$92-69Ag-Q6&B zcgHYvck>PEx%d9gIp3cf_Uyg(yVm=x=gp-JwgEIKJ_l65&m?5+sOX9^u}%{B?2zwi z4Mi?FKOpFhAX~Vagj7M!&7kC~)|W1MCNn;|JI+l&C^#0SDX=YJzh)tz$zJ+>ezaq|v!|#Q)kuMFQJm{0! z&eP|fy-lAM$_mbcx^^5nC6p7nsudD}-!HpBCEgYekLKq~jOq6R4hVYA

nd;Kydm#cua=1H*}9eDHKd{a3PTn{b?r zk`40pr`yP?Cy}%jA19jq(RwMde5sa$SazIFZoMnU+C{jZi<&!~-lF$4^^S)QOnso% zh~D!))jk{GTaNZE#=&x1OmgDE)K|UJs<75xf)05ltNv0ej!E=Rbm7Uy_nSmW$nOy- z6`KtlyXfoXMGg7o>Wn-347usT{c1KiH;e3;r9*`WcHcEx_7v#D!e7wvTRZLWN?@4; zkQfvh`kD0p{Lq#pAZ<~-6eLyJorvu#r(^N! z`$h(5)+~mJ^Kr>14RE$64O|mMxOB)dHKx}robg3&dy{8Hl%p6Kf0-CK_n$`YLtn0f ze$0(DJGl~dXoBB|O;tF#Q#R4UAxltkCG2?`ZN6ja<57XqB~Z9~@en}+@p$=jS}vxs zy5ZSdUa&CelfUnslpYUsYR5mMz!WHl%A=@luotglxp4= z7h3(1O9N5T>ricK!D~-ZM~|lRflbQmVgU_y*yQ-cIb?F2e85qU);ing2pLt_2Xq zY)HO#{<9O`5ua04GJN-wFr=e_Q#9FY^)W{3!2$IBPba{uCUiLBf=%z$v8!?S=n9fs zo%g(@2gC1HViE3-++kPwLi-djAQ#@h@L3uasd(nLb3N4KmxTv^`eGHH8{Aj9 z%e)Kk7WfCne^0E~Z%BtGu)6`FhnQ`%e)8|9R))IsTx6$Ms}R=KWLuJMv!_V0)BfhdTUQwdlz6y(S~#`mAW{_pTwAKh~s*g%%_?>C6~((Jdh`xJZK9;LD0ok!eFrWw<}{Lf7M z_l@w8bHJy6-13juGbsEdgKz+-Xt)&QCtBzT|N6F@F!(TU!hWp$fy^SQ#m>*Ptra_O ztNul9=zi&+;P*?O#HrN&7SvZ0bTA2bmWfLa&A&cB1N&ZHGwcPozo)M02v3%_OhZBx zmlXeBMO}Blgnhd3$ELnv8qhCj!m{L4s*H_*SDKm)&p0}4hK`D3={ z#WtKhU#I<+0hS_)<8UvB?o_JM4Fkl<{}uQvz|&mWvR8^B%={T|)YN3rm|01`?tg|mx`d%6D(KEO#_t2=7gR}^7@BWXAYfp4Y6X7&=pUMa?q?K%a7h=MH~37g!`k&H zzYe!A71ZB0KtyK^l^{N&{H6OJjS)p(5m26t6MB-`&`rU7vt2sCcJJ$ri18M?NW2jy zWka4u$N2rNi2o~aKcgTqwKSj(6isr1+?m#yAX)EX3~R&#*M30c%G!Ou87(3({qXna z8wS%BIY6cDsNo4I2W?3vz6Q!RL2eWJua6vWt@hq!5s(G!&Q_}(Erd#F_WoVT(fx*k z#5SV>a@6=N*VFxBsO{(}pWe9&hT&qoST!dtvVS-4ZmwNyEcHZ<>9@FBiTzf!(xAXO z>iJ$W$eZn<ovk{1fy5r zL76S*51-`$8dux{z)}|dx~lEz@*Zk{x#f+`~|!iSqFfg%An|%Bs=b{&*n5mzbK(UF>LdnxHwvitM@$fAmY8l&Nk4oA^gWA z);Xi7;`ZZ(M6jyQ`e6FutkZydmG9lHCkvTt998Ic66z{n^Av99$G@bED>4q6d~`;sr7M| zb%{!bWD)P0Aa3)1=lM}qjL)A7a(o%`qn5ssq!UD<_;WNr5CcD9nq->GRQZXR}9-r3d(DjUuY*yh6E^v5i7+<(Ly;Lz=IodlFDh85Dx~;Zhr56BH9NN z+qm9sAcGAd;uZ4F>$r2yY~D4|)p~z4To3odNO4(h2_Vq%AIlDrn&TSJVl$*|h)8Wf zsn#NJP%WYt@68thpC9W}{$>IFX`(T3Bp4=e9>x%)wiIhaw0J%jumTEx#3ltCQvbCU z;rlDGodX~T6i@Ib5D*Jr26kstEt%#XRIQY^`$4d7=0Xhk+Y*TgCshyi1 z=fE8;`ihk^-aG1K06S&N+gq#V{NTR}Q^HV->V1FQrQxPu!1}~s(!q)qu390TLR^`` 
zJHhQLd_kdwd50!>6GwtqEN7Ol%*Kc)z9-^|Jho|K42T12MBuB2>VTU1qUO=ha0Qc+R5yDg_UKINC-7aHHT=AK6p5~oHf%KCzHKB&_4%>Uaw574|$ zJA3&LeBGh1X!Wf?+>)d8lzjS}m3*`VIXZ5z(wDeW92jw${5M%pagHR9%=I&I6Ys<{{>hCvfcUa=94F=y$HFb6rD}Ex-a;dNcz|b^bX(3L-p*DkTRyJZ|3u{6G9A zrG|KhSZ3Sp_*_HyTuM=KQ4|};gNrY5eY-6oYFU_thurfE)F-YjAmM;Mfg3xIL)o=3 zNzy2%TUte*eusU6fyf3^ea6f0g8!U&iN)`<@0BKwa~LJ8CQ3&plWSZyr}tIaU_8(% zi9^91IGm+)J0VGSs2TO+>8v7K42P6H!on2&5G~y3bht~B5g`*K1 zI^8034_6;*9~qq4<;4F}uLNi#J*cQJO*RA8d)tQ4hOpIw%CO=cq1$2bZ$CmnQxi?3iDRIG zN8bT3tQWihQ28>Tm1g2`XdZvm+Ef9|R3j>ga*{rcxLqz;Fhr8>H6meb1l)aQNaKPp zP|1H;ppJNA1`1C(T{=cfhX|5toYgwA6gC(3Gr?GDXHT&*1#JWR`_0hNNn}2wesxYj z!-ggC$nzV1fkgEa8g6rb!m`55z3w2eo&I?-Ks_S<4;IVu4No?OqjeNp zF1sNN2SYe)WIZI4TS3GknpPprvPG#noGd+JoKrAW-iqtjnVh;6@&L(WrbbH)NKO*m5uQVQ$%1juZ?z% zYn9hRK%zS`jv6F)WdB5_j>JO(?SDB7;_nN%^^$2jjirY6Vvjbp#yN{yK@jM+o)A}==77i6%Lp%%y}BR4xANAkIxzF&KWk8us$(^}(+w+( zGr7onxkoONvyMbr|9pfW^e9N=mF?y62AjupnI*EiR0Kxa?XtsEVe*me8hLz}-+1y} zQ$#h}h2#87TIbjv0ME$7kAIl*Z$snPg`ov)WDQNe!kYxVn+QB_j*<0GnMu@MIhv37 zKzPb&he98vJHiI1I0eDTU|0I;1f{R9ibD8fc6lVSAEiD70yh#%zGS`nJv*0_e&mH1 zBixW|JEZ`CoJ0=>Pmh>Y+}K_I&a(qa%F9;?f-Y~}dvxcAxlIsMkFn#9D32bO@z9yy zB@v6k5#72~98lC#9r5aD0pZ&4(Lb5(ItNk?|06zO57SLBNl7_9u)3Yz8CW+M#qM&H z88sot4~ZTZpUR~nguPBxyI^RmQivA`>)Q^&@Z}aAnZlOn{{GS<>kkcYYNBdFftUu; zrfy(4JqloClsX(nPdQG;TWbE^i1L%bMG*;#j06ZstZ#ft zwfy*RdpQtKOcR37CtQ|gJ2#r6R>|7h`Ul>M1H#wE6P;DHOUR zCVy2$!5EvC;~cLjr3o;jV)}2KI2J_$*wQ9ABsxR56@v%l2vQkXL8j|S()-j`1A0<* z-nTEwo6u9SM$oab3oq8&LlFF)|?aP z=CMN7;%Ai@=aRC;0x4fm|uehp`Gldf%%-ZD^KDM*eRKb^d^y(;(%$AABz{?>49;;;hY2D{CwJ={KZ^Gk# z?nV4EVZ{zoy7Y|u-*w)NCi3zPX~M>XGS@WeHLN7B9f#o*SGei6o7;+E|9onnw^<5b z8CLg3I@TtenHA;NKAR%`6@WuvUe_N|1c)`S+M(f{q)jsVL z=mdmVEknqk{b7FpM^k8t{IV!y+0`g}Da?JcMq;Ox2CkbQqRpYtVLYfIaKReK{o(No zrTjyh#FX9wTyagGhRjvi%m8a#w#MnO-QsV^IzI`jbcj_Ll>8IUGBEsHZz_f7+CBh* z9tKhD(4FH^T<*qaxdaE|{1Rtla_{Br6R8hoF$G)AL=L%9N@hmnH2;`%?AU&%UCblk zT9WL*RlY~nG^B!FnMrknLOZCnK9@;Ogwzdy4iL~mVLsEm=4oZa`i__Uzj$_<1|_Ln 
z1f9ph*-9ywhTL;2J@Z%(yyGgcYVZXSdn2XsJCM!PmXuO6dEL2Fjk>8=|A+}W6#l&d z8zey?=95YT&K2N@=XAcqiPY}MFjd!nZe(?3HQVtRYWKDSzyN-Mc4^l3eEm$QnNJiy zwEm+$yz~`L1pfx_#2VGp1K^htU4^+kEk46>(c9| z4fMo0@1y3coy;%bEl%I0`dn^MY_BYF_4dMTkN?0W&7g=MNU$}}d$!5}22Z?YBhq&> zvXba4u1(?Zai;0zbN*If_H;B!WKFaMUL@0?=Q5_QsL}q4^-(G&uk8 z?kfPlgr;UA(OuEQ7b2u0QxWG5-&73xJu?sBuKUAk+{>TpxykRA+d@DPFbT!PKUoA^ zXkr9@thyDC8(hjvI>Nb(T2S~UO3yLh5D7=&rOg;I#%_u5Sfd<^@)1S4}eiWPNaXG!;`7*)50|^oHt(HH6#(eXiZuKZS?&NEx=N zp;5>RCwbVyI;K(dgq9n3AqMY8MwAi;5y z%=@6EP_=W@2`H?@1Ma7^eXaNYSJl~yT0 zC7^Q10C=2yg+BF39@og=(aLj{f9i@ldeI|~yW5)rwY7)qqqW-SG0d4mGDEd!sspxY z4N&Rz*DaQ^&zA$7Dbl(H5J1U)`Maf`Ac1d)brO!_(Mh%v(BR2=bv!R5{n4hy6eHvG zJuIi-YLx(Vca#zDzhG}o#!NDjAFg8afM$$upPXdce zeync`@N>&?D*Q=5V%Ev#a4;qLIrSWB=G5 zxS@v_EANIBB*FFvYH2YMA>LT|z%USnq2o3yc^-!2*CF`qtTkaBQL{Kvd;hTw{?8VU zNNvx8o+%RI(-njTshDkNX{CjC3`y)Vs1g7CP zeGZ}q1Rr9<-G<@psWo{4{YVpAsskd zX7)S#2F_I2FvGi&x>RpuC0(j*ADO~XlpCj^Q;k>zL2DTJ{|<@l=$Qox9-zP+UGv1E z5!*pc3G<+&6lF10M7ve`#$U-l@JP-rI#0(1`PGKH(=4ONpfafOKCiFw?y8ZUjNg_8 zUe7l59wF(bhl5!TE73?K`Xj@?F#?H2Ej&XbqD#8m6z&&=FS3<{gOVAmzM#l;tDWc}kw+i=z>N)(-y=ZLubL;yxZRP)&XE@NWXfQHO64 ze$_^DG=dG8T-Xpyaw0#N_gYT>W4rxdKx!JolZA5EusVPTHoksW)?sYjjlEZ>+^LQE z<(yz-;#}y;?_bJki}r=y>#b+2?qrOc-nQofZ~|UxZT6+?%3{VP}lKtZ*%kT zTZ+EloVDF4x1~%go%`co=r2u+MEtSRW=7+l*lEam!QHE%wf>a+NH-4(ob9>VN{}<; z9GslK5+BMT%YOs6Tzh&GxO?Eiy41DwffS)i_*F+MSUHi$lF8-epOD!-HVOudsTEp7 zgeC(-a!K#)xi8(WSxL-TT!jPrOp?Fl7x)w&V_I`dgZjE=3a1?Vp%+#ye>#Mx&zE|^ z$sIZRL4EgjzVF~=ONJ&j4bF3<>bE7;#$1*G)x(Y6yb`w%;Oh-57~QcReS*=|!)`wh zChi|!e;%rYB22We_9gXPo@~w8O_z0z1oWN9vg*Rmo!MhAs)~h#kr_b3C^aDy_R0BL zfqe58d8QQH3>tzFBtlvH9lY|hlFYv6Mc(V##fR&i__`lW%UYjY^4sE&v!LNnFroGB z_r*IDea_kwIA5Q6xkD+v?S8ZxBNh52<|8JN$0Rt-z6Ar;Mxe|RO2mHJa6%MLtKC0c zZndLFyr&hu;`7t)kIr4=TWm%jXT908R(@B~ac^Mdmm&;1?`)W*F2irZ!a;)j@vQRM zjAMpUg*`p;*o;4_6bR5*lbTIq^C?OylA0$Pky?6iPZquXF~>2YpqwmFT=Xe;LNAFq zluHC&t2^HP&2l6Q8z(-aYMgul&4~u(7G5$Lea}LlUeQlq&FI#cgpq&4Sl}i%2 zH0AjuHC`K|3r6!1d9cJ=v*0g&#lsZ#SZEda(FC3I&bzc4cX*bWNM 
z-m_k_g>A-Ew{Jjr%2tE#Dcls(AMNO@M$MY86Slbo@4HfPFDM)JS6Ufvj#`2aUt?x# zP46LRMTb;KxE-#9fR@|jKrNYFsj0>X2NmjpQ}KO+Kvc!PuwLSLhe!^wj8x|bU8nO+ zST=e6mdXL|0t)oa}%M}78%bIKP2 zrX~{pc9op3z!qhdt-7(aLv%U z7|z8`l`6Am#AWxVH|`nipeVW_xfx8QfU8AdglEu{TvD-N^;3GFBW+(gm=~LWCos{g zF9t6IRKO53OG{_5NxQ(*G%$~!Rk?MwOY?CW3zV0zT{jO5{P?+hx3F*=kGJAt# zaF;j~E`D~A@`cSu_tje)y!F+BB@`ZUSMX{otO}ro#Y7#g4OEI4y@?_~s;6yozdVOV z#NC%@0nZoxVeld)$3{n(Ja;06ciJ|mYhQ0NZ*6add#rtyFh>HwGrYrG+iN^|3KVFk z$T^LGalfcsNGNP=P#9rfAs=@Cm-@5 z$&ApxxhuK~GK;V8azu77TyTKfqTw7uDP?9gBm-XeF26bflnUFq&D>CUN;nji6wKqM zLGyiy1lsq~At4?98e#OYKoXtiGL0lC#GwaRm0{DGH&$-^Pu5QSrweV%_XyD+2!$tx%gU1sT8oi zE#PpZ4bNxj9-Ex0%&4;QRf#OxESh^&YTEUPf3m0~m{2;?X<>XZOqmxi)Z zq6H;R=~xulDvGuLoY!a=#W$tC2g0eKB?BSH@t-6fu8l5W8^izTVh3oXk} z<-t#V7vNAd#L2C8Wpy1yJhNSX>)jU6SD3s>@9Bv z_eDRnhYcH5z9yi3X8nVQXZQX*z}@xHO+!If_W$M%B%%||!%kWl zJP(~UkIVQ3Kaf=4p&i|UrJzTnxi6WwN0a#XxTbM>Q<_xYiC40( zET;hL8(yGe>qF6rA1vUdb{Be~D!(aGsMnKk(GL6SAD+md(Xm^g;IXXWCj78M5jZ?& zT#zV}{NSuqio&D0o0!lw`oy@dS?nl}#Nt)zyS?e1!ex_8BPo?57fpT7cDI4IH2P5= zDeDfr0UD%-I?RRD#yPv06i9(g;{zUK^`INoc#yB&wmJ|Bd4 zzQv82zi(bPp4Ys9#K0Ps8DXB6nyhYC{An0V7YrK7?qeA-@8@VecuZ;1avC7cERD&c z+eXuOZX8GtArS)S_&?5NBuR5T$t-l`!#VYCuO8|R2q34$8z}C{iR=ukLnlRLL6xvV z!!`ae$0t zdG5-$Q@^})L~}~c_j0|3K#3(ZHu?1v7Ljz8BEpk8dc5q4e4W)qsK>J4(!lU8a51xw zAZaVqEsJ$^D6<`7H;2vz^XJLf*3Z=|Ug;ANK6(aG^Kcx*;9AEb)?>f19_Jtzn+R#eFvayltk=fYqXlp zyJ(rrpVWv$oTCbUTDwR}i*ysDTM02_=t^Z+cb31mKA#;&61A)>f8~_RrWxMpeEWO^ zun2N-?&CrQOEAf+WQ2zL6Z*5ytiE{O3?j}Mcjz|NJ+Wx$#eC#AQf~aDm0h?NrC!5 zT$~QE>H;j8pRY}x;+3xTJDJFkdYYWjp%xIMajZEWysK^wB_;N+>7X8{Kpv=doV9?J zdJK1zGquvPQ@*pl7JnN~Sj@-*gQja0%$maJ+KcbBVpL+n*!62{2^K3Tbzln3GR(S= zeJ1q?upe>APZna!PRweZJN(iwL*e8THCYNJgo{z#-`X*g&G?=V98(W2Pl>is0;?7} zs6>Shfk+Y0sQ>p<()_jZDiYIthWNJqte(or3}jR8vlaXt*b zX#z9rcWbt~@2&p0J+C#PhH}Ny_U&x11N5)ntE!ut zfog%cvE7RkbfD*=P_bA)*R~>4&qxi-{ATx{=uydM*Y@p@fJ@$fGV=1gcRi{m+h@9E z77p5V_73i2QiW_mKTSm{s=5lUI&2PFO(Ldgf`ce{?jl*eCg0fBY%rYpo|^lvh7#T% zCK>at4*GQ0d`eRMzb+j^=lklY_1l}VE9y@OsHFN)(hDc}G<&RL0B^i$N 
ziA5=(qJHQmSjp|yugHgUZvclKFR|9kIDrq0Z`r3OkM&Z7J>l6rfQs`TM^?CpEi^~>&!U`6W{_vOr-?MkVnhkK> zICdj5`jGGT#s{JeD=Qs%mYwYG3a@ERu;YhKWTnVj`!>5nvvJTK`8dL=(|$Cq=s|i6 zssu8EC6~5o%1r1=W&6Tao5e|-lpcDF&+zcb)YU?mbT2US^5=boUe!+Qd|;uFt94~< zkuo8Ae0cqlmHiOk}Xap~P^U7EhJuQxisf7FaVD~*M4A!Pl{u>ByT9*)$ULQK~%&n9pMm&9e4KF3o zZ8)pni%GI;sGi8@F`Ie5N)X3-HlZ6AKU_Hp<~>?te{P-484Ge_!YGfZek*{U#rxH8 z)GM0i97ty?m0`-nBlix}9!9M`l=B>0P+^NuCu#+C9ZwaeAya8PwSlm6%L>DnLZnm<%Py~fH;&_K@f*X|8ys{r=KCNk8$rJ5D{(4E%H z%=*om%Y%+?f{K`KXoT?F!g`UV!ZzRgDrPrwmm-PFK06<$`viyOKr#)Y?s{jG$^O;h zm+USNE=J(3J6;mOkBoxbpSE7-@IEH`KIg5JB6bU_A*To(no4Yitikk~vJQ3PliNUmbYJuz%kwUfMUS0=Q`Kuh{Iy@8%Qx7d7$oO;Lvr^#_ zY355UxA0vQI}%(HFU%wy60HgxBvMEbA$eJ+^04@U-$ufnxb$kzyu^h734^Q|`Bmz)H%HS5t1 z%F%)n5d3{>2);evExI7b7S~E??Jz)WMp<_yuYEH%B?`25QB7C)_A~B zx_aEZ7((0Y%U!VoyUm%3{W z94o9b?+I@_4{c=9)N^=X{KxAG4;T2U1ia59%2@Bv&(5^Yr72m zb+YHlxenMW^S-4$0;mv4jRv#rIt_UI7^9h(Tj}U?)&m$J(HjNvxaxt^H*ajNJVmb; zkzx2pxbyD{eSyJaF6RSYtK6UpoAN{E#zW@&^n*rbr_23xKksCm3`}V?jm_?-qDXaS zckuI^a=mt)(vE`OdhK)1y`Cw(7%kO^R_piB)^=Uh27M-ZenXon@+96Ucr5Cch@PhZ zBk=?c^#oz~R=04Fv)sxhkIstjEc=S#L+DS~xg@*A6FW}xo0!;%%8#~>j}Q^eKJn^& z^15YB)5C(a83QwXUb>^=4;;=ind#{rx#XP$Ihn@ixsCWEJt>5K?{WCV&#9L?^w>W#e7Z9LJ1!CR`NRqP<#A-d`{ z`d()H>P*V}S;YZB?ab_KL~m|UYbiNfOTQ-_dGIjypH%SxFt@FXUd$7bN%^qJ=tdH(aT@Id`ZlQX@b_P= z%P>hpx5FJ%N#uNqVMO3!N8$5wE3ZW{Y>{0m2(-Me$8UTD~R4*a9+n9R1{Tw zMH`EGmXV32f#ja6ARF2^zJ^M6sfcqo*o>fvqlwMCA`UQ)Es6xur>B!o8$7QaJbyp2 zkyV%8c5^7Su;Exa8MozoJ5oD)*Lyk)O511mISAd2V|%li>*hY@0#6Op&&Oe>Kos>g z@G4d@K$^XU8Qgxq?n=83@zvu%)b<)yH4t`>tw*%~(9X$o@}7zl|D{N)FF6Fn+YuQNh@(qgUoR1itGA8j zYID89F;(q%>g2Gqx*rlZ?-1(bkY09o@l++SP{urGXU&d|P#HJS5aTv0_W07Yv-`w& zvE?Zjl2pfGXP*}F!}_$r#dp;v-594-#-HoQecN??hK~3P3^aG^#wyZlF0u{>Z_Z-w zJFneP-Y!wbik|z6GNqljF?P3woT^AyBZ@D=Gg?m3WYK-wzZbIGl@aE1+vqa+Ncxy7 z5jKvqU|>k3il!>m7!as*UbyKtyYHM8GGb)w*uN_2J{o*E`P~X@6>`}<*KLVtZr1&i zsFs(t`@9_X9o$k*pqx^bc!RRy$CvEGMl<7n{WhC<9-VK(k33GF=gThmB&J;>wP$J? 
zGHwE9)@Q-grD4)`(O-BRavQm~%@8P?*(Irp{iBG%-$KT`STfkb@kMn6$Ev)j-j*Mt zUJ2VB1Wm&Ip)K_ULta;_eGqxKQ&=E5v@{@;XPUY(+wTcz$Nc1qsCE%w-sZOUVAcPu zNA#5qOc8RZru?k&WTsJf{%UD~e0XA!%e|5 zaCV1(N093eucQ*%t;+biQv8Lm!>8Xo`RLGbzP<37dCEI+qZ6l>&D?!{Gad$SOgtos z-)=K3u;#n^8^j=*aA*?ZJ;rDJihzEQ#& zNMaC!KdFB@>LYoIzz3R#kxvCfV4ldit)^G7&S> zqVdGD!q~iXJ3V@tiIUom7@xOA70hXCD7LLaa*v{d_ z9MNcdjg_;98I*BJK5mlQCRYWOzl8|^0j4cB2OLI4De-A~xTAuUlhj*|O>gYl zHW-B=7g9@q5wi{hw4G~6Q9(c-t2Dfoz;%(W?Ol_a*lekkCCSV)!*6{nub(%Jn+_9S z=K;MBjLStoS6~(XG&cTj!T#*$@vTfKzEwZjBgYSTqg4FQgyng;QYk@$&Ag@@Z0(X9 z!RJkWr$OS+1pPLqu8g=A1+ny>6tlTgwzOC`MitlMe{MLqs=e;^S0FJqsJpHWyg4 z(>xo}l1_+z@=irW7m&c-fz!BsMiM*h$b6IeADc2jt@%5efzG>>=d|p4ZRyP`AmLEA z@1MeOLUFb;p{^qeu z;t}V7e8-K9P2BgC4$07J>gmHsBX7pIEx|!vogKHc^S7WM3rKN2c1w3A-DNo=J%!%u zzRZOU$1bqOyP>-!J4)}<(lpkz>lH@J{z#W?b_6EpM)x(Itt&yQ z+(Dv9zE_pLx;}Sjz6%BpDaOp|1~-t(bLIlLZ<9;7OM!Vdvl0-EP?t-B@9+TOqdy%R zG&*pA5SdF+D3}UB+gMBY-67psfXH!f0@ub}zth1eR)$+e=j{-k%$1^SWtj#3lTVQ; zjFU*% zOI;pfGWRxn{cK! zeDLk?n&{e$M)5l#R5cMZrU_qk4)aBQhYc7Ei(71F03>n?%y8HTUQq)k98f3}O%#aQ89y zbyRX`-oN7A9jhpSR%x(~4A}G=F~0%T7EjN}CJ?{8X2pF;Ju}%mq7tE1cl%kq;O5H3 zIOS|N5S=U638LT}q*UM-w#hqYdOY?vf5MJ5BVsf^=#l^S5xysSe}tIl2ClPRsL5No z{fezos{^dud_HD-K7~BnpQ5G{m766;h5;Wr)@<;xk0tVpBYH+y~G^^==RD z8X$X`fi?Jv{2ce~S18rWSO1i<+-I?NguwNs10(qph98kX0&}I$-kHxpsNOY%KaRP; zOEUpz`H7jPAmTi%o^qR^QK+}-$n;}Du|o#sr*LNE*a`Ks57I+o5=)~z^Ah6<64wMg5Wt|F=mT%&=v**Yos zS?Q=6?60tSSprt*t99{GEJ0bg%?O9|T0j=CIzpR$QM^VCLY3(|@}pTfUJM*HH3c3Q;~0Z)^nYrP zJ`c`3XtHCQkrFJxhp;av>{)9NI;OYp?Lm#7wvlu9oIW~PEbD&xbZ){RjEwTA*~)rs z8dG<58^AhbGdS7V-DW z=(zB&v#k(lrJVyd&Kg>S-9;siI?W-{S(=tF__tLV)vuxm_kz=|e8SZV57f+S@MCyL z(a6ChEMIB4D_8B2EAD=j)VC+{x-@OJ%QXuXQ+?)HvhA6l9wft6YN9ud5b~oj?(IHU`x`L01Y&&kU=2|`8 z6O&Mw@$BuSgbJNU4;gnl7Mzy7Xrg@k@VS(-84)rvfEQCeulz%&#k!B_moSNSP96fD zhm4Q*gW%+WHL>_>9d+xito_QOSDRG>u5>5Z*IdGKYeVu$yM3plomURaAtzno07{9u zNyOpKM)aP%X=y>^Qa=5_a|cP0>o8Hf7dFTao87MOaco-Co4z})X)y#k*K{A>)^6+c zKlEJKmK8d8`t;yd*JHg!m{F6f{#LQ^)$MWN?Q|Ho!9}-bH@qLLp@JiO077HRY^Wqc 
zDcDjt(OzeF>T}BrK?{2WwzHRNtrLg5Eo->d=>xA0bFT3M zPIb`Yi)Gknm|V1l!;QW!-+X0fOE`TsK8+Z!LRP{WSPAsSv&23^ONEfT@4rZSQrb9e z5X=DdK_reeky#O{g$XMNfi2u{$cgPdMI9H=Ik3-5&yeE8Y%gnK)w% zr&9%E5h8jGhiATOr`yx*1IGJD$ZdAbg{_u&PKoB8Kwqo#1b^>_9(OiO>Y(fBr7t@= z$CUHIsmT5}r%y7m96OJ;{J@V@;qB$nQZfjS`Kr=MY@r!wJ}FMAHEVh=R7YW40?qY1 zDv%ivFmEvoZQ2jn(WKBd5*^TQcJe9^*l4hL-RBfvk=PMAS92Od+gyl@iJbaX#fCD~ zRR-|3vcg^PfRoh4(Po5Vny9bLO}|na%ARQYz11CHMc@V)M?iJ#M-$~;>Wrvt(W|1@ zTG`$ixHTKn37U|Sd#UDn*WeBFy`3}fIcvC~WK;QX^@8Cs)z_H8jI~M)Wi$*wWHg*I zkx`f3T(+gtId#NQ#NCk_oV1L&l$m<>fP%Vq_}QoiQoVCR@`&<2N8zceY)o`ns;zTA zomd}Rthh)Y;QD;j@s$HRsAN|BtJ; z42v^bwsmoLcXtTxuE863mxd4^IKkcB8g1O&T>=CrIKd%U;}F~lmv#1Ccc16}?w?)r zo3m<+dWSXw>LY=M!p;}h-l?(%gNW~u_nMpRaPa>OC|(+t+RB;yF|&w;7DF zR(HtAJK#6#5Z2ePDS23D7e|%W#3{Du{O&oRq(p2|i7^ykS&01ESLLH3#LK=Wc2E$` z5VQfARAXlRWMrya>(}xhlMe8M@AeraLIM+@N*qcmo-k6;Ljcpwsj1VxAf6&*b#E;3 zE{M2S6ZeZKl5JY9zjeEF=qWNMDz{tK<&LZ#4K%75kgHy#6O-aO9CpJh7XEl=d&{)g z#XrcjpJHDbWy@s^T+dBMw%5A$$|dXRCCaU;5H|Ipj9=ArB}vRTY8A6f8yrhjl}(6s zO`dB{+o_fuuX&5I;6tIqSWQ@qX$u;RL*LwA@>DY5@{Y55Mc~`oq+*;!ws$fQnu`+( zwGS5%kaj1+eF9MXm$*$QLeCIk{7m%YP!8N<+a9I;nBwqE-R^Sz^WL4M#_WGz73-dG z^L-f9@x0hL-cc9*nEK^2=Lmf8h5s|}GO*3%!^`nn7G217uPYpIcQIdHG3}` zY_mWJd_1orlH2P;$H%&5P&2^~d*{BUxd6HeT(>-h@ap`5uOSD^ zg51s0tgA7Gs7d2wf1BQrn!Vjr!QiAMZG_9~YowZG40M_7KR}}RMwpp4vT6Hu81FqX zNviYxPr`eApPj^)nFveY>P(jw(ru-!{Kh=X+2{~W6-4(5o8{P`%a>21D^5FmXVOXQ z-oHJq{rIQs?Q^Od^MA2!iqZZHB-n_i4Pj3%ADh8f)bhsSy_ihRe z^Wmq#XdzXl?79@ELhA4vwM*p;TyO6G2&uwLHg0pcrqH@3+X=NcASPpb0LAJGwPT@O z^tKFfs@#?JpqaU~=}$ashwzFf%!`aqxeS;%G|;_UBHhD2I8cV%*GW->Rzr$;nI+KvUI&&`G&pu!swQzljj^f<-*7{b+CApVG1;sqAL1|XN&=%0R-l?X);~82DdGYwoNv0LooZl>7&!r11GJt_M>UMnS zbhu$EPZY8-acpN57oyQVw=rgamktQ2Kd?t@0 z{$LbSO+^7Il~gG(S@UCSJ~MRL;S7GURk8BkjYM<$F-+E-gi84X(_q9-Ufk(R&*ciG zsOWP)cUA<(1K0(SG3Ipdw`x$?J8ML5ZgqL2 zCA0$`rvf15NLSYyn`n0BYgkbR#7NK!Ip#0fbpea~yME7RBDKPVAn zU-AV8T;D!HEAQVQPVd#mnhwdNN0F||%8MEj%GU)UL{7TvazG^t^uxs3VqYH_|9l!7 
zC9>i*D^b5eCbA2$VP+p{Nq_kiV)DoMD9N6@k5SYkEA}*9-=7FTE{fIu&?VY;6Bl!aDLl;^XTh;=3KC#=l@Og_v3afMHin&-L$m=J$V~z|#LN z0{mxfCXMPtW@maT{5@3|-2Y(&>~vVa*7jbn4{QKC&vWZ#8t?2({~=@bJimVP6` zp-F@ybZlx+UA^Ylu7(AM<`zG#^|Sk$@SYSojy17O7L#lAOLJHZX|r%WFHRtD5&=c! z^>_Aq?!JVCOg-|@4$0*z($<`h>qc||&$)>6l`$fa(@G#h}6IQKvgm2 z{#6)-ZI!6{U0!8}kHYMigEv#?(-jrkQeSIJz3KHPNpH>is_NKI6?7=!^w;)cW?u%M z<+|LD4*S&%(pCQ+m{Xm$0W!saPlFMNi6;66MEQ#4oAPmmMp( z#f!NB1Ef|UzSw`Hc4(~mwMbT^Kd}G4U6!F(ChhOhaZNsOM|IO*F1j|~;9h1wMI1Gs z6?c^9yP0O$Qzp9W1@ZUdIghq zxpxD?1XVn@IDhzLRjkfb3qyp5A*8SvT>|BKPAvm|{(1ZO-G2L;38E9L(Z6al+oz1A zKw||x%?501JC59zuLcJ7!Y683jlIO)puF6g^D?FH$+^t3zQlo1HX2Sjd@*fad2>X} zyU<#A2$;sbtWCbzUo?%sM-@XnIiM-!xH%F2<{V1ZW%gq^4EznJ>9TrBxW38X(h2Z+ zIfwj5IuvPh`fiFrl>g@{&+O)}{)WeT?WZo9|FNS}eJuAKN&ZKqzY?j=kbw(U)GEuA ze)#g#E%D|AQPJ9E@7(MSS+}=%=^VlvnP&-na-|{*!wqW}|W$ zZfLM>2WPf{L>~TY-QK0k=1r;e7y2h&lp2K&f28NGAs#($GNAQzqeIF zD{zq&?w;8`8(}qnn>xA9eSh?MnmMhz(Fqv?i+U@0->>dmHB$QUuUw!zdC7EsSiG6GsRUN@aUhygz3FR5x`y~dAg%OJO3Ok%Ahk8k^}TTem`%g{1N z2{)9);9pWPry>cUan~qdjH}pg0J#*Ok4=a159gwqIa+)byEh}0w7MAI9u?q=ntJJL<#3htIdgMy?16( z(Gl|*Try@e|E!JKp3n8f$ek%oDU)t(wr9ERTZRE&>M!1J6V?q`V^(keB}Y=QkuKLa zYr4Wl_h%jEIz2FRaO5nB8w7Yxs!jB^|Ize3UopLSKPJDZ^sOVV$;fxn6@t_>Pinph z9wRfd-}MgoI%d4hy0po%Ul9y2swuzNCELtzC!7v+#{W_4(v_N~m8cfY8FZ=bq$*A( zw9N%(#k%oVN!2ZH1K-qRaRm&t;S+sRKDP^6eTLj88Ah)pobcgzw4wb1pbP1INqZR4 z$LugoauH2v)9B;NG(`XW^NILQk14e#&$o!{?GNn-(Q)|3|6xN*x87&tbo%F_-NNf$k(1vfM|#Gpw_LvWW!@*n zzCwzC|H#FNzyJFFd5Q4R`oFqyg8!iQ&eVz{;$`9L{yDw(ZrpwdkKKXSd)H%P5^fS! 
zOJ}YZa)ddJ2bNj~(=GTmWkH=3I!-e@&T>c)gkCCb4o+A9pHYDN$n;$}M#y$G3X+Q% zr*-n8`ujv6 zkTn!aE?0j*I(*fD_Cl6~&kLc?Y^wT-Cv9!L7(TbSdq>quFpfeqbJJZ+Q5;yeXn?7n z)WrMa8aHl>NhholMIEwsro|68u5;e++9wGm-Z3Hn z^55CUQOuyA8v-A?|GdE%Ph6k|vu<70UB@VMa7g@HTAmcMsqwkK0$u#LD&Os)Y81@q9P#=QT4p)S1<~mm2*CE(SnKBmHmXT_~n3W*LnTx53{#9I#x0$IKjQ$SE~qi zU}JA<>%M<^nps;SOFhrt?$3t}6N*-y`O{@ljl6)b%`e^G{x!B+4}R>?jL!ohC6Mwo z+zfUOWq|nV*0Z4@gKAqM*Sgcx%v2(IIU(P0K2VsPI_nU|o`eM45#^hv{VI=jSgav5 zx<^(A^fPa`SvdCGR;4AB#f)DjwJE%6rTA*pS6_#L5u0O-HmgzM>(|iM%~t2@_U{vV zb^2SL%PK}pr^9}V#j8uc42{=147X2*Ve8M|zs&EN{uEP<#C2bK%t;gjc)gU)sc@{1g)5!+W81e+EkM!uj?D zffCZ1-tQiDb?~y>Y{VtNAvz*L^xqQ?;j@9R#e`mz`%UO%5HkwUU_MHvG*Qlz>2n2u zeIY?3q)B59pDWf^E9uld0|$+>FR9t}o63m>>agnE%}C*wY^=xEncY^DHYToC4D#hQ z$JS7@iL1{G3~6HP3B~Eg2!ZH2eC)5d%z#Cc!;h#7AKG$p8_O1PDCd$C0;1CY8f0|kR zZZ<-{Nw3>{8_QO)9szHju@hhC=YO8 z^{m{o715Mx*_>S|>6DRZOIMp9%D`0!hHji=-*Ck6roPP}7t!84(YY$8IOnAyn|mt3 z`ToqM>Z8%;9*4*bE^%ewN!5;r1wECY?j@pjPR^q4fBf$o&JH1_H>3hJ56?H8fjfh) zPxwykdOdd;6Ggay7KCh}PY}k`!GH2lai@O|4*4XCgwV<+0V2I@B5!fmOVwMazAts& zugTY8VsIu_n7K>Y!9MSOhZDFqnL`4bVDFpB53_2^9~VV~k4-<|&MrN0rwDff{B*Nf zG5WM9cIf)ez)mF68M}U2j+cHKc7_w-usM|2<1HnUu@@DIg2Rx*6jOAjB2P*TTu65| z0aDP{G`{m4+}gSebm^%+EEHS#q{Jvj|2tu4I*Z{l+wu0>%nW()wNymC)M5U zqkPc|+Ljj&9KpI<6b38hl@>P~;!=h-!B2^voAU`xS69e~DER6oh6;!<^Cj~x@6`Z! 
zh+pP_IfDnnX&DxOZ0zw7i05_3^|j7K6X3^*o;VY2=MWTXBtvGR3irFiM~>2+;59rc zjO?o8C;}=(-QV9J1Be$Kr)R~+s`4Vb;D_t$wMh<+jNY9X?#(z~?SLcmv9H%G>~IJq z&~@Jb1pk%;@(oNMmfAc2*s#)5cDw-0grY6Jw%0OrkGARc?|rPKI#ttKLFlR(GgeN< zhZ%FV8Pu9UsqFPit*7CvHC|~_ zQ6>dMhW;vC&~Y%|93V8|GwhZtMC(Zq$Dp8MRepA1`D+7;<1ZX++U9aoJOoty6s`kW zKv^9I*3UYWwzR25rhV90{(03jFyL&|c$bA6KiUm0Aozjn!L6-L1b6tT`z<&;-f?mA zL~*2!(>;sr*`BzOW^F`v&*Nu0aqb5ZUR_tLZ9+W#IkP^`Z+{2z{5)QMBJVl4$$^s_ z(C$di-gBe_*Utbqm6v9em(T2+P!(+!l=kpBytjnh63fJx~AE8wNHLk9iy{$mR;=b>O+Yh*A z@}k^1(;xo|%iYR8q|gA>loCIItk3Ir<9pt2zxyY1F!J{IR+iFfZQRBEjvD;kU5!?| zJnRQ%U(Uppu=<1S?M@ogx=LqBhw8 zwzm2lyB9J0#{1s~Bj4P=;J*P6t{f;Zd2gF;_gnTK)bQqqKHK^aq0V{D!Rf7?G-yw( zjnCBS&(|EsT6r~LM_vJB&6zJ4Sh&F-icvWf7};DisZea7Srd7hNtK*?HQLB~nl)zkgGjTz|V;u{?^G^n<@kcvX_#c+9ICDa;K zRN3z86NMb7PEh>Epv!^nKF%~Dl67V}VDP})&zHb=Pa=}*o8r$LL%DHK+EY?3c>gg1 zl={|o;bdfSN2j;!G1xI)cHu9EV)NrV)YXy=C%qg=HY_2meaSY65g9X2%m(K;Y)|1c zBF-pc-Z5Rm+cuAnIXCmZDxfMxax})YX<-F=(^4fB30&bb7XKBlu2isi5{F&hgHfxG z45UVNTFAp=fW#;yY7)|^BlwSZl@*m>1$9QEUMF?tC-OM{$3IOS%~iMLwNz@aw~mZa z)3xc@ycvQ1^JKixD`?Ir;hbnR)u6OkPxNbkr8dJ-5d6vo(d)AhH2x&+$@^&!pHH^a zzC*lN;fvJ^#K&;g0wL@NDOeTX(MSbyqZ}eoxL9_)l41sF&AG-Iuv_rnjN~ zwaou{jlutUjBC?3RW-iBv{yi5gA~)k+hN{UYrSK6qz$|8$6PS0KXFCu8h3W`3FKKs z^mh#Q8s`gpH}V}Ilg(3O8(x~%<~ouwRnQ3;R&yI{~}#mksXSyJdp1+tZIG-&wHOtPiF928IkZxtzyKrx29_<6FGT zcOy}({Ybb!pUm12 zX#`B4)C~(-B<+H~gW#1IZ@W)iGeG|3pClvU0}Zy}!l6^~g>|=Lk)I@cil*^-;wMSZ zy#*Qbb3&JmZD-mD^Rle!&3{jv*7sKJB_m$AZMkp3`6}P=Ee9AmIvVv~$&{n-O|`8U zr?z8cQhmkgk5;YdN5{d3f&xD}Y|)ZHOZ`P_%of@krK8)E%>K21SjcN}J4n+(n|@4( z>Op6S%e#}jLkT=laE*cuDZwimZ%jwS#bg??^WdH}C1d!kG%K_Zl-rkPs_-7uCDJL} z=LqRx2=GAjRIo>s!oN$hF~tO!yF^8JKu`VPVg?H&R>vI-O5KFqyXHkA3z<1VCW;NG ziL3gN`N(-i&ST!(uVPNwJ-S!R`~^J7)p;3%vv3nwoY@tO`s>2^KU)BqJ~c#qqOw)) zLu*mH*ZD}I#;c1voo{VEkMiuf`6kozw`h|+NaiHmma}ipAX+E&reQbV2(PupHO1eb z$Kj<>NIdGGnz1vaiIR*qXVu2*XBaD!@BLZ$!%q#C7dhcU@XyJEe zS?u~d2erL&k!%ev9?Jv~q8G<+rl#?-u!YlIbuR;9?gl3mcRg&LuAtEJWDAB~4~mA{ z%m@5Fwo?;9sT^M+)&X~PW($eSzk9=Wy{&qF^El5^twTHAUADgEHniUak$cX*_ku_S 
zGz9AM;EZbW(wLLJqjs7wx3*T{{9uRsrq+*j)m5-i$_l9_Z3t(>p{7gdkk`Vg3aKM?BQ_sKbUKGY-d-t>=|HeNYes}mof~NjD zR?xX+8*Z7SH!FjswZW8!n-z?Sc|^F*+l8MjHoLx$Qv4Pq`hHsIj(KLVS?eos8dDp5 z6m%YgNGIxLHqxP-F}+mkMtJ;7eEhGl`^hiyI9~z5?z;$>-02>Hoq>_q=c^qw%t1X% zk1K8m$m|%b6QZ+GB*Y8r6VWv~I=UBA`%pUDm5g@@@ef7Yn<6>gL>Vr!nb_nyn9-%Sk0`iR-8dJ@@kWq)+Dz3KBK43vX#=}ghy4lL;Mcy; z2W0>%$^B_dD2CCZ5?YkIbYIB`rt zu3#9x)?0B$>Tn9?fx~!*pS+vHa)jqt7%ykatxDjzfRT zh$_O+on7mH_GF!mHz`?G&}B zqo~nfQMEYG=NchYczWUKpxY~|W1~XcyjNzpKn5l(9hXH8*iklS*2pc>W&uv>0~Ff6 zrK-?lczwsP=vzO*ucjlgp&1{$-XAMgS zi*Q;A!~kodRT&d+N%fUc2~VVQJO`9C3`6Q_r&?)ceN|MI9R4ZK5t)L3%%BX5J2tF0 z!I|27mY{M!hbw|E`uivdmu3E2t53>X;yO+*T={`toD0X^&Xls<-9GEeS||n>{>Vtw z5XGJBs*r57P25l{+680Jq|FBh(74rqcM?xz6zMESFX3Z2w=jy0&O@B$kFplbw0@e4 zcH$;(+W7wT2wILq3lm8oY8s|iqT(CrhWdMmr9u)99c|j8qNQJ}j;a#sI%^Gv0WP`S z(dh#E65h${c}$~--X)Oss>VMyH&jPgs$kw{X0z@n{Nlce^t4`mTqM13G(xD-Zmm zmSIy3t>XtP*I_NWb9z^qoOg;p2rk$XS$~>{9jVmoCoG2*YxYsn7TYg{mFx8MxpId_ zb_%j8a``YN@0ao?XXJANwg2Z%`ky1=uuZ^}J4^Zq&tBdB%}`R{^wWm_&F3GwRGIae z7(8XW7K`Lvj$t}-D7x$Y zQms;Gj9s~7k0rUgzQL&KINnORSA^Ubzg|955C2@d?;9MQcmP;H*v&y`Da{kCATtAvC zJ`aM$X!sAX?3_^Dl@7H5y{YkeplvGD_D(}g&b7t72q3CbOzRBUr>aTYR7_xY4vpky zK1A;|oo+mXINVvcx#C!D?#kV*&G71KRv+<$A338~8T^0Kjdtpbt>UuMH!3xe>H$1| z27^)syDaD-pGog#FM>f2ZdxK1@)v~g8ef@sPlCJu5!RQ zD`WqfQKN<&UDZJhvi6(~ADs0}wwaFy<;ucJ2ihgXP-q;K@}!fh+%OzG(#D$((NlMI zurf_3;a{Owlg@GYu*4eeqLbF<4Q00Rfv1hA4>H>zx<;1zd}MI zU+|J)z;y*lLF59|4O>IDaKHXU{A95o;ALuRLI)Vjd`SE^`pmND+_K2`DAq z6bg9}#CZ-vS=s{YwhlvI1g_L469pWPQxM3Mo!MIHAES1;OIFR0_Liu&+DWv2=}f8= zze|-p)2?%qG_LXPJ?2kE)LwTA3m?mEy4nc`bi&6cVw$wf22ldzLCeNjo1v^NTwBg+ zesxO+9VLy(l(iS(j-P#YDdN`SFSn`*{|+?xqw9Iz1OxO{A0$pxh?Ex_3y5wnUucH> z7NDK`JO;jFrraqOy`8txBcI4j@>_^-3#WV7LQBsdIdY7t8@|+CZiNG{gsC`3hRZep zM;l>oY_pG;C~Im>(M&RS!T);?UDh>!+(H4%Cf7}arxEa8%83Qt0r}$09AA{EEo1cG zb;$U-y1#R8UOoiDdO_(LpEoQdh#x~onnEM)On_KhG_&h_Msz8zvRU=;UGT~F0W#WDov=y}5mqG-o!$k?jd*T-vl8o}lQjV)S) z?khLZ5UnWuhC%gR^UMpF@_LIP9xg}K{!9-kYEeO$3qQJ=Y|uRXrSNNVv=f~Mch zMf*R0Fpi{HC!!V3sk6hAK7c0#+gn*^^}G0&+%9Vo 
zzOUTPJUH&(j^l|0ABpVw=q>9qH_$w{{M`D)w20Sz%#9xeIE+zp4KFSSS7hb!N89kp z#{EimAq!)}_A(Wi?q5-QOo{Qyr^f_8O%H|ZY02VI!O%1{vYTaa_Y$xLP4@Snjzvy( z(Wx}QV-MhRn4(fAR<)HYp&y`U6svq99> zWwp%2Ti`K!_Xq*qaeY*$6lH3!CYb1W0(T5>|v+lfqdZF>Y{#xB*Mp`u`SB+oU91 z+Q7X(ZC+eteEQHm9RXaX>j6J^LCeFjgm%8s2!h{Qn!a%9lDdGfuiX)NoLjLZ(tld{ zBN>}a>&R_1r&a#|*-#)p0GC+n;#y6{%U12{{lYV+Jq)sWq~HK2MEm+SFwVkuekt;T zy8JAotb5Z0fLDc?e*3!Wh;cKOTg5sZIWQIt7UeWpyJlOsQ-6w_wV(Sp1?tdHmzXI1 zr%`4~EfiRSb@ny7oKlQDtd*3kxJ&>!4ns?W!{HUQnx;sCt*~DYhdC{9e>G4NT^`HL zucB25y!duWrH}xo)3Z}JP7BluO}R`;s4gcZaN*+`;prR99})P5I(Y8>xHQCF12)dq zRbd}SQTj-NP?{+9dxT*2=pIq)*W|&WIVV(;z;f~*4G@pQk|me@yJ}GN2BAt(M7*zUHKS17@H)F#m&@y5 z=2UPPzw9bkn>xp?ySf?K&@i8r<}w_c($IvwDbB7R-69Z#mT3@nyL0>;9Q3nvA{6a< z%q^kSr`Hy@V{5Dq~U-x5a!N+Vr2d%`?h(JwKz0TZJ z;?~0?u;1f7m`un+$9vPHK>2|tNcO{}fvYI~Xp9OD#wFEYSA;Mj?kH-_74vzEgGx0* zcN;DB-Yt2gpo-XkT^kXx=&(NcgOgR5N}8@P!CaJi&0-74Q?Bk%?1n6%SY5(s9VPi~ z;#LNxVpmCSz-P{;`*K86oiMUt!SGM5t>|}9zm{dKYLr(%ij?O9nEtxEYYmpJyfS|P zg6=aZ-(-u_PCoI0#;+z&OWj0HhVdq8qwXw~e0Ee&1(iegRbh;)3Pyj`BMzJrd6;10 zvFot?w;`X0Ij#6l%?tZjD7EsdpgzLE37mFWsCn$Gib`SMm{J6{U#NUGc3zB%5O@W{ zC5KQmh{)m`f-^~;;3#pir@4mUamueVH{5ldnwa~n*Ys|Lo! 
zf@3skW1NaIb)Noy-&#v7E3O7*-)I?H?ZN6}EO zYNcfR7&ehJUIqDN**Sz^nGGP5Y}09h*)WnssRky$pc@3;C&X6JxR*Nnd%g8gdJEUF z*Ox9NiT*3Mp2)Jx3IUl?Q2mk5z%FYGtV?GiB-HH2K{GOS5{= zqGPA9`7lDWr7C_)I|MOy!xGgaTJZSkLB?nk7sc04gS$#&{;!rcxAo&OVQR~vTDgMY z1icwmoZiD~Yfcf!RY{$R3;R7fN>XsMBi48!Ax`4RpIhqL5{rT{;lF4$_;p(P>#mze2 zR8;7pWV21Aj6m|1CiNc0Klh`LHlwy|Hw%_J_Yu3~K*ViSC`1Pnp&9|qqw&%C|?{l5* zUA>?VrYYhOk!Sj(S5E{ARXBZZVK9M>?+TWl3~M~d7}DcKmv|Mu;vK{P)HazSSQ(Ck zem+9v!W@PkQ}z3v6X8x-37W8B3mmA0l}4+Sn0FJ{oP13g6MW~{R>SVh&F9#{Ss8H^ zfO6UT)JO{AbSIK>M`y0&@N=+I$r^}MH>`|&WzvY6nuiPt181aBcM>c+Unt?D%;UA8 zrpcc!<0L|eN;(!&5}XMpEhg<=yHW;~JrwajNbt?&yL8Q>wOptbxtc!QUSH#n*RDAd_}T zPTaW8JsFxOh1rzp%&SFFg4w?-Lrzc1IA1!^d~jCGs`m&*#i>R&vSWp;XMn5lmHN)v zEEamUT(Js&F?we7ONO%CV?!KfY{zEKbrZ6eeu-Bl7X8a+qEhg5AGSdeE$#SHYQMdp z_x1jr?7HUE|GtMCE({CYrR%$w%NEZ}%%N2com1;8q11YShx@X2KcXKhg(SiFP}K6O zmr3-SL(86v&NxA|s>a$F-ZWBz82M=(uaea@1#aQ${HUH3vMHft476;t(dZpeL&C;e z2qM+B#a*_eH;8$^JV-4a)Sv%A}MLo1u<3jEo~fl*L?ak_D70(*%W6d*Rc< zkOlF!d6N{BL{(K-x`a5y8}$#LK3BO#X|IxDdXqO>aX2S7t22CEeXLV%WjpD^MQ88OqcC`?H-R;V0NFj_^0OK>%hZPqPhyY9O?YSOz{{ z4c_23QWS8^q}`|`pCYPsm_-+L4lhX4T$b|h5GB1s6YuO7f*ot9oCQtHo9mVK07y~zC%`gSwgd$xnSMaJL6SQ9i{&y}Z= z?BA7Aw_aKn+!c%htPhGp*aV?7_Xkq@b;1^91gT2VC+_-kWRs!=tMs=$yzxvT{h7C9435a5DIWs3#?Bc>pKq}rl}Nw2FgkwleSVKspuI9$i-s9wqA4L{>0 zD%ByKF3Fw`IO6`g3DcEFHFsy9+cMN<1MSpOU`|_eyBA0QjBeHX#p!%dgvkZv6IY3I zl|nGqtBh|_@?ZjauE=Yz#Jz?_w4eipjU1-avdl5Fb9jAR&!_6J;9 zGCEBGEal?W6i%%JTsAO;O(aR3q)VCLNU<7!gKGK&gXN9eKUpgn>$5lV#S|>gu(Dv1 z$_-X&MR=6Y*mnT&@z~hX&QLHS#g}!5=J?)!&!?vi>F4UT6OTHp8?BWii%i8aB`ikC zAgth-i1dP}e_?{fQZ8SVzUs}jE%r$cn8D4lqJHIuW5?fpyq%HQVVc2oT6b7DL8Q5V zfo*l0ZLW7FD?C!7w8_8e`z+*hB>mBoz4qI;y6dKADB_P5p9d?@c7}1qQ$Eu#FzItm z>KS;ga99?Gq~EG&i6n^S9X^%>M2W`CN{Lo7Wy@QRdvOc8lqmjiaGu_lwsob8MH9vX z{hy-wZ!vi->lYX!ztwdv0_B$JQMLP&E*79v;R*^Euw7^qyWJs`9T&IQ_Wb=lf4WXT z=ec+0TY)2Av+079`{w)Y=BcR_u@=$>9^|hWJ8^cr@-yk+a>=L)tb&Zmqw!R5;P#l^ zT{r!u;asPawV|j~m@D$OgXzG2@Zh$+g5n5omLX&u1AWFSY0aLL>DeqEhA@3Dbd~He zIpW1E8YskEtyC6nQ$`y-%T5drTkoOoBWiwZcoy(xF5#TVV8<1-&Y);mrRz{b1L8%> 
zXAJ~BWX%|Hv@kz{q_Th#tIOZZEC}^u!u2uqsAuQZTLuD`iinYWXP}8_f*R4<`H#}9 zE@44l`x@+LHknPEBaoo`~^2oOl=iJ)j1Xu>GFyr>0LG+Ct02*0wya{1H1T9<)cvH;}F&GEmD^^ zChlwJ^Gafo?lDRgLP-G1L?x>}Jt-jXkObl^jbbK{1Iio8&>CS3uX)9xqadUsO;bed=3$=cU3eiTfV#84#iyi zz)M-)984{YIxc(o$37pUJm0|%x>4M$HTG&p(nJZh!L}hF%m^sMXFAeo2HGbj^c{y#DYN9d z3J;4*CgW{m^OiX)&kkLV4T5iC2X$ZxkQLBl{jKOYXfl%&X) z^bkoV1A55HGb-PCSbbWVDWL%PSjAdOPU%cGp-(3Dq}f_*U;R@ggRV)XvCdMEP1zO# zfkw>r!btvqB`ffsoAw83y^q1+_gNU1a#IcJZ>y|HYn3#%;(982uD_WNyi0Q4$3DAo zRbwIBGDM4BB4fG5p^z}Y?Z6R>+fG;syOA2a zd6OeD%cP#T6>r7KH%d8Hmhx18Cfac=@g1EnCDG`v!O);{Oaa%QaGGUe7TPOL4bJ(OtAY48?fYQ_X(@W$3G0^< zsylC#?feC$1n2$n@VHS6mQ*}DsPnM%kfY3RMe$TI^G9FY%DK34%qjzi^9lwcZBn&o z8!_0jrbTAmHh!**u-xoW{5CJhBT+LT7{|}FC@Si%SD3+#?jN7LA!(NMR5WCwPMn!n zk^Ss40mo+D$d$zTow+p}RkD$z$&lpgD_IFJ0xk=ShBh@;F)1o7Zb=~}sGR6i(%a%o zUb{xJBwHua!#}uZb45-Y10TVPEb7DuEDxtk1X5<^*iU(HG-1%$&j?5x!R!NPX|u}y z8X+Gbxt_olT-SzU;i*Ax1DMcj!6zb^az7u*DT*r1QasI-Q3GR*&nzR3Qc~8=m73AZ zQZQr`;_my|@F|scK&t$N{1 zU2)42sA+(syT92WlKN$Wij9@iJk#%Irqo*`fDmFv8bgL!b(kWYBKFJ?dTD1#`E*YD>v{h zqM$E+Wpi}PtlAC?EjwqnkU1tkAx`nkS&KG`N|yKF62yccR4Hh~iSF$xEk#}1RZ^fL zVRyVMtMc)&kONWEz%L)>+eu zgCy~Ag{c`Nu0)Z?LXuN@hy+r*qDSqZtLMesOzDLwUs&WCSGa44QHOrVp5{SZR9%GJ z2!q8k^`%U*j5VPOLMQBU*`@jLz z9#!;o<*ER5NWxqsJpp~nXe<@1HT8^>Hq_o7jP?ToyTYLg@6k2CuEgSZHY(on*wSIOYF(I7|q>H01pI9`b=``UA*)jKpmHgWT&} z<_|e4cGN99WWHC?GRmEGJCFI9*o?}vCEN{}ZNCUJeO%$qxgWChOZ@4u+QiQ9b=EQ8 zn7it=sq|6Z9;ZHOYai-cl-+0h-wtZj?)MEr@rvPCG)F2CrgI!`0&;xF5VABcb5%$x1A|l%BW);F zT#7@aW?C^=<5>G;Xg@5>MYF;izjyW@V~M+53@bdnNr!Q6aIqI)<4ClqrUEuqRMwEh7SGh(qbQC`i{r#&fky2GI*QV-k_@viPM;?J6!0u6#!9*k>5tmjJj5 zZgpv)Y;g=$WH{p;L7I8TF(rqHlc4lcs}5Gksp$Yxm?V_zWaq6ehy=IdNq7dH-r!t`nI>GUPy?)xD1O z=81MvU7IVE-R?;T%lP3Bm4Z*#LBACRR?z&okoH%EL@7cN|G2v)u-g|GV5zwVK5@&QG;cJZCgxsilrK$#xN9ZQx{R_e#a@0<@t zqMo9Kf|y3zg_A3KJzxfgctJC4d1Eu1jLXHOl1YF~G4zf=2W;2dnyTIybaTheKALLe zb&f|hf~sv}(~_#k&V^M~!o*yi7JbaWLn-HJ3UC=U`UNU|6c*@YEvy%Y97@&p`?IS) zAI9IHq~~(l%`4F%1#wQ73VU2Hn^^7sR!ks?8AzC6OT8(0u0rtsh_B3VG(&X%Zxj0x|_60Rhz^QeQ@?T41ys%n% 
zaF(q`_M%}Z1eG-D!(Ca@&~6X$5Nme|?&vsO2hET2pG-*nZI7y$D^LKf>#n+FyeP2upN?lan$W zCl*Q?5?^3l$jpd$B%ut>$YW*@^U`bMZF_=Z6wy(#u0pe4gy~`G0UR4P`pb1|zEJfE zNAcf9l8KvmzUIiT$Gm@^Ls-e9lJ{r4VnPWxmV}Yuc4m?xM=A;ER3C%+d!{Hbk9_cS zrWzV2@tiw)MmnAUm^6M-uIWF7V&YAT<576xR~$z^VX={ul?D;nBX1+7?7kaI`pS`PgU1evjqm}By;`)`jZbO^G4_ZxpT_%ejFNfHI zO;=PvWHmx9=x0AAh%PV2Cc8brP=ZCtjr^-6gU^y75w#>M$bux}lI_?*4Clv@_CG9E z>c@1*mJN(y7DE9csjG#fKk_*0_s=vkXc{<1B|{_R%@JpO2_E9`81*nnI0zkVIq`N0 z79GfpacKweQ)7uQMR41`4ha!SrYbZg1jY%6e7UX2xgUPewHZ&}UGX=Dn~R|$M{=p5 zbI6o0QW(~t79%X?AbetG$ra-#{!$ROca?7(_;s{+UsG%YN~Gf^o)9+@y~qyZy`y%e)!21bdFB%D5nDK9!xXo8d z+FG1H^gv{92?3GGUUmhPN6m~*E%c%~m!BdvVEIo)!w6W1ru7LNXd}TXMrb+|9jZ)p zxg(`I(_&|^8gUcxd*xbR))5czOj|ywQ|iZ#{_4MKSdfW8iuO9z8MNHHdgHfB8wkRT z$pSkm9)3d{%^qe#N}qqFn)ou`hFMPD$V5`r=}Wz}O;IYI0rfWQJWva-*|&e^L1Pf`-%Ulc#wCNAA!9pQ-v*viDNuU zoFkqxw~<#d8i8#K-6bkv2*d7cgK%iw%u|_nx$}SCJaH?6S4`d4+6f>{OnFYWfX;m= z)aIF+oQc<4spnN$002&Dp{9Zd9LCs2U5fF{g91bDS~}(MPsLU5g%?XTnB9%KCZ11| z0+nUX*uOu9K83V=zZL)u>fF|2X4B<3-)BEokuDU#b_CVMo;4g@DNH{r)vAjNmh{y3EYIfiHr58P}IiZV; zX9tHVR)x3IPRFw~+y{xdO?sn)B(z_57wSdF*@nl6AJ$?7l)p*rluMWL!O?yc#tA3( zRq&@L@=<pgj8<1wNa~x+ zZfXyoV;J6{Mjx%ejn#CmUJz47#u;XcxmKeaV9b0@R#UthzBo@E4UQ8`jzHg<&wSb; zYTRHLR;$ShpRoO40QwE@7M`?!nbZ#-f`bXu{oBVy8NkH4bhKif6#^KQ z`uYiG#UECbtajGPKhGVj4O{Yi@<48kEn#IG9e8S^RE6F2BY{bsMFm^+LJWaX8JAU} z6L_=8M4k=9;A@Pa<%^6-LXZeKdfL*cYy@mpbVbMFL{EQI-K?-*Q7jwa9`O;gF z0S(3LKu6O)k7Tk**R8andgOsC!-u8%LgCP?B$m} z!PJt0VD+O)gR8MnruR6Y3^uhbA2W_BBP#P$Xe&RM(Xws0F&ur-wGEqR3ZpNX+x}|q z^c}kua?IqfxKRI| zeuZ;ZNe5VoB?QWu6+7?hX6jQ2jdJTaL}de3HNaJ0ll2NF2A4gT3``9OMT+{Y9Sk?M z4d3Bd8qM)C{f%xDH2AApbOxGh*KkkKd?zl@gOySXZ%v-@9AmoNM~g-&_s>?PIhJ%j z=fWW0`NOl0p>yRr$#l;{=1Ud_@g-2w90m}ll$!_Q>FwLuhSP2o3U~~X(W$rQy}%{s zhgUq(o^s9@^VOkzcsgn#E&&;DnypuD{)Ao*pKUIF`C)Y7k8sVi`N(B~65?aK{vF%2 z{jZF4EUcm%hkQsX+Z&%jPLH^nY)JH0WW)FO5k4JudLRCKL)u>~e%_F=YrdcVBJxKv zrh;`lXJrf1l9aRd*bx;&9&h6l=@8wos1GA(r6wtUZOw5ZI~ENfY9%$oCbB6erN!dT zG_f~)?hV3W-TT%okw7sqoJTb$DA)2?iU{OThnLY5fgw4sr%0xFckLCP0DnFW3H+n9 
zPoJ#FB%m3u{%dqEV${Vd;G0E&xbv7rT5O9}p+}6CzxSJ2-<+ulhNj71l%JmI_5@xB zy`uxmf7brh7>d=-4~1-(O6wusLE?iHDGt7C{4L+aVM6-0MWV@zcZuZXt{*RInB(fw z_sKHmUV&cDMe)yaJSB}Emw?sRDQB-;xXV~lc1#$2#sWYw*R9G6r;}75 zjekG_VVTHW(s*yvvlS6l{MbXVTlWB1S;|70nq+Q1N``Rp8_FKbuSM*|tk}odRh=K4 z8|ELre@3Od!>1cgWUuZ{{iMOqPiefv`6N!uj)~#XWL6(2QD{p-4Ba|=Vam@^y= zFxH`wkUag0CmymSLvb*o7LWD0lK8NAf3A|Tc8~FY2LM330ED;=J(tl{;g&h>2|Yi~ zTEYHMGJsta!eoY11ZmiA${1M_V;t$1_Q5s`5Y1oZuz{)(ZX*{v&VybfQV0(%YIExA zWs;u5vS=8$vtb<2NI}t zG?KWwC&yCXZW~vBftP1yu2=Wr@H?@arN`=drgleU^V&$99`q&y-Lf&FWeC#D`+Lk+ zARle&bwye!%@B#ygj45i!^f$DAH`&c!qrD;KFn-O9x#X|BT&$J;F2-myX0+0qaZC| zMj(2P%p|;5;Z19hCPeti;>|Kqp`YJWacSU`!_P*-JombSgM-)`=_6%HA^a+)*~X1Pqd%POiJZF6Va)HXXt#O zyxwAcjB}Q)m=IZ5B%VG};s2|`Oy2eHj7S3kcAw*}QV2!VFjK2dH3T8+MZhaHs{?Fh zWwTrMk7}^xDCkuhaI$++%f#8`{PzmYNouTVkodc7-(C`}?SkeavWp(SiSCYIU{g|* zSOL`#n+KpRVRlHWPe@5nsIP(ZLBL4&bgdi((sjryi>%u=@`-jLF<#aE`<%8y9w54q zL|pKwFoaO(Y%gqyNHsv^CBj+g2ve!BVW%${1MY|YwZ{rW5&}+7ERt_$^qU0ty33ra z?%d*hVg2q${;A#TZESP=PfPG|j6y5zt%tZ=w1I>=yh($t2jzJDDM7O(_l)I8w*F;u!AVfy;fv5Eh26tA&D?Q|)baSx$ z-#4-Z+jxfNJcMX*(7adus3Eh1Y_L<;<|Ml?3*XKF46Nt?Pb+vh$-NUY*XZ@n@f z$Ml?P_9>A_5mESvZNO&T7li6}h9QOa1>Z60FE1^rva~duDED89Rl0Xn9?7vhBDQnQ z0E0uH#0OfFvXnh+tS6{yOw(jG;A8Z_N6JkN#*Uv$6Et=7xi0Z|IQ+?nIP8i)i+K!% zS0X!aroUFl(TO37_>!TvaFzD$DKU7E*uKf{<L;;P=X%DU9qUUplC1L*3RgWOC^W;w<}O8!D@cd>fA04?C90d~52P`S=!(Pt@om)jU4HdGC@%%ick%2&>neWN%B&n%UoTPZg9=X8bS0KRu%;CfzZaP@ z-KOT;`p90jVg90&ybL~Yh_4gmm3|4p6r2#K**(Q)sq+H2%^?uCnz-|v2B9r$*v?Dq zv>JB5KqvND2Dlr#L5AeHcm%oarvZ|OGk>9|EPGMuZZ=!`rt)owM=ird0`5TOCsqBJ zGFffd%|L`};MHdw&jESkFMmamLY1&SQWV%yPH1o#AK-qOWc&VMkwpHZocSYi?FgEo zECJouJ}QH)DT2;F6*k3n(O>0wCHSH6<70`aSq1vqAvF9u=?;gVG(NUTgfz)ZSh08f ztfBw(k{u;ZdYH&MkIa~0=~+GdML+|ygjCTL*XOZ_HfTHw58ai&vk~pvulm@3d@5O( z@D|YpRHv;jv~udH@HjfNJZTjT2!v)OXLxDsCZT!`e-Yf~6nIuB!$@Y`1``@aVQ0R; zfjuv#U?h~3;Vj}{^bJs`Vw(hiAyI}0G)2y^#-oZFK*Kcp%|^4eHv1R|8*)!Ge{sYAG|YP~+FQW)!z zP5-QKigQ6)4iik0?n;mCd?@(dOE^6Gr$Aojs!VK&a=u_#N{oHV^9P|VxRs)?T~ 
zf#1!`%!VV~z?Z0{Oo}@e^4Y)0<+iB5SfBTq-wsnMa0lv=-}2q!L5`>O?9JlQ`EN>n zqXo&on2ndemv&C?v-kW8LLf`n-7O%1ZHn}w;n65TtHaKA;@>?8A?M!vTi| zIU9rU=;KW~8!_#=Lnv3Hv13vwCdYl8XnKV9G2+J=MWVtF#4rK;**F{+UEIaL04XN_>rkcev9U| z`&hbqD16T+aJ*Iyqv~^As`%jE)B7P=avtv|T@_xw&=>L6`G>(f%F@~iS-Tx(u$)1e@ zgRb|*AWGOFxt*7KS1P6TW3Y7}9f5(yztifpKgYdbrmX1xuuuoT0}~}` z5KEoQ=aFN?G_nL_LTjp+56q2+Suh0d~NRWa^70CL4uC`69-Z6{A{S5^|>|+eE=%SGEjl} zLm0++*t1~A6OvTSS->OrSq9_TDC68!5=u3ftepqA9lRSdaqL9uzKtqivWewpC#_n&%G9w}wd)`cMB01~ zL5r|0!e2AXJ>{wRS1m@rpU@cFB3I+}_dwVwk#Z zT~d~u*!$0oCpkr5B{FzV2dFX=ze=L56L|l8l9QyjiAfW3gWU=1OS`5*vi%SxsFEYa zaA%EFWV0npyYqeT=y5da# z%RA~i_7*PJ%Y_Z?807SBr&-eVq=XnPzJ-iduLNw~7G-+$RhrR*Pb2;+(!71;Ok?}o zbVJCj6t3#9jz46+8Oyq~cqNFBUMgx*u*f=vh=;VA+xKuYRsJH|rwZFy?)Grl7Xbm{ z2k$7VKF}F4=4W*CD=j{d|Apjyj+d+vHMi=PO8ZEPZr02y2>m!ZB?6BR!#X1@_!SSW-2q=TfBGF$wExC*y_y} z47;Xup4a(TC?g#x3GYQ4_euG}*+99!kuwVYJmSsDTVw3JGu~F$NUcIc>7n{0w@#<@ z01Lw5pubS(P41e4K~-x3FMx&vGA7)3Vr}a>!-Vk9IeO zOYF5S*>Tia_jho7`bIJx zB947gWxxE5vyPi$BZEwJVF-jENHY zFR!=gKCRD?KADULuVm83V;4P={Wvtm6p?=wXJ5i{=;dWc6F5nB`{TJXzv3#p5N~t+ zBi2|=sr1kBs^mn(k7N0dcON}+6pcq!h?(?FTrcQPkIFyj1NA zt18Laul#KJ4T?@c`2?`CE@R;L93_xYO700J=jTSLr@_`eoF^zaGMQtCC8CS|oa-mO zsg*UVi`Zr$5+J4h$O`8|qjNO4opWVBICwgMp_#SlM^j%?YPu4zZ#8z3AK&?XaMf>%#G8dT>D#nOG_x3^49xVX z&#zh6(bUtk@K&pRveWw&0YGA^6RQoj^JuTgzu+82yk`{t8b5)*W?X8KIUQYpBP=r7 z?RY&uy3cEv#(&<|;cMYpw?dHVGA89VaCxuUKX$(YM1P@Br`K5iq9$@fB*4>@TkO#qo&Tb$Xr+~rMCVVAaQ9K@jg;I&B~o{FMEtNSt|o(qBH(g7r4Cq)!HYi# z#=cwK0SiCBz&8b`aRzY-eOD%K{Y3qD5IHrA48f)vmCr*}4HMRQb`Uvx5Bm>Fd@)Xl zL9~TDdq%XCQ)+nON7H^W1=M4LVHx%IB)+yeV}Yy(E2)xtbG@q zT#OYMkPl9ZN$OAFVVF__=Hv15Cg=`reC-EaFIbcvm_Q;bXw^YuW4$w$-O#cL=!$XIN07p1<|-ZCy4xZJQK<-Ui;KT)b1Y2 zIY_BTx4|A-!-?-5U-E(&uYnRr>PnPy<9XB;*o?yJZKxMn2%SPusqmI{`AnCXWxI-6 zaw}vx>IqE{tjBS_z}d*syF+bgo2ck{1n;yg-tdHikk9U_+r8$6N?_-GwpzjJv=w5!ZUsuD&`&p*%!=>;KUQ5-2tfPpl9UnPPTd!zaR;1UR z0JNLlMfIa)Ej=eNhPT^A>6StY?`4iXakB6T=?gRmR@}iZk6qzt+YBkL?=*~j9mD)} z0l@G41euuC>dT3Sr}Wg-qr{uZuZ@M>LL-U~M+6|VQn5wAxhuDm2Kbo7u7G_P+;uI+ 
zvzmC1n<|U0#8`BLu;G2T|5&wbL_Sq#JS(p$^%BEuBg0yTTR*R zu>SM_(T^I&0Xn_s6*Bk7#}^Dsidd-%?+=+J*u+E ztsr6H#g@&DTuC8FrHZ;W+xNaG@33y>y#Qq&O zAqef9($cCa@<~T3DttK~?iT_jAA@5PSw^Ici%GuK!RRYv@XaRMuSTZQGVuQ{Lzgtg z5y#@l#oqR(B$bfAW{4$npM%dBi zOZo>J6pIt9O6ut;f~&-X`4tLmspRCazcM@-+Gf!Ev&S7}o6cLp&<~LpOuxY}EQjb0 zP)?gW{R)zH*|~L85_J0cub+v3nNP7@u)0Kbg(-;UOP$ZNqhQi*lsLlBR~pjk+4Nr? zO^R8i!=*Y2t3-lRxk|vnMwTt}$t)|@?uPZRZ{+W zkPv>3QL50K+a!!ZYty>okp}s5GvKY$+je?Z?`(An-MYCBy=B?%aj^7EATBf8ZGAZq zr(J500glQ0N(>4WQd+%^IRO_Upj1#j+hh}=^I0%uhiPZblwv*3 zWJbn*`onBKbZgBrZp~W;=5>1=0E%YOi^A|ZZCPJHw9DQ6WfGv!;TXys#JOt2=lf#& z<``S+_Y|;mauNa6d_!C9*J9iD%3SehRlW)BzOOs0It2eqj#@}Y^DKEeB2K0gGZ1ns z>T-FV!6hn!$SHY*W;qmp+qNt-FjNx0PMgT|3wi(3rje*CDt0fp^9i+~5lSl;9vDq`aHN`K$)zh7{0==B>mVp|jE3#F<-i?C+ zlx(FC<1qiT+($dv!R=U%ke_-yTf}LeIxMCmK#W=jtwckT;R`K0&}@nCvYhb8%zyH~Q8)EbP} z%IPs@E7^lNRfo`-b0nmv+nE-HP7I1R>zt9ued;FCIfrrp5G{~h#~C+xw5E8~acHmG z8*F`XYTtr2w&&y5_XE^kt9hd#=Q+9{#*C*Q!wavsO zyg6s@-tFM?Sd8c;^qJby!xZRa?@~1FyWpxJqBdq z&>$Z)k>ykms5{xUD1`h#-wtU!Yxz>CAbkE4WWPiVo}+qN?L1yuzAB7keD$jkm_lC< zdad+?^eT8B@Dme(%v~7u)86|ltW|`4%VWqJLcTGjS>cn5%sZX)W9HgTaiOulLbC-- zv;#cHdzoX%x;d&qc$Og2#df_gyr#^}s{FWRPZXV3%Mc$Et_KVIXMmZS!@3mYiZm5u z6%yxsz<)0864>|#QtYl<-PVmyFODTL16eHUB?Ux9XmJxl-Q~OYbOY!V+t*q6je17A z`BBK4Pj66Nz5A8V{0BYnYYq_4CkzVHJpNhO8kSgh_i}V`GhxOO4Sp-3$HfgFn7Q0x z^0tJWE*6hDopv6xaCf3}HsJL|E*&m+VF;R#cDVJQW~|!+XS`R;t7FEuyS`ulil42_ z)1jOb-(J|FW-?VTY}C+g+q1yE<|IsaM!=L`O}qOSs(XAmNuap%u-I0`xFUb$>X9=d z4Vy)5&?NHq#0u#f>{!OiB{1unrDm&%v=C!tk&HT6-L!|MGK1Ntg|7qh7ei`~66lYE zRh`gfeRjTxj1|vER;7@9D+>QJA7Mk-Gl6HhAp69fdS{1~>MqRD)y_S69|T#olM?FQ z0Zh#;w#o6>RQLOmn*>119|l9l@cY!cAazFMt03+;nsD2%4*z_hL%FX;wRdYX!cT9I zrkrx1Mb+cvSl>pL1yHGDNq=bBak{nXnw_dfu~k1Xfu|u}ovbioPyiO-GV^m_^vmC+ z`+sSI+CJ5O|7K}g;063#k2grWJ3FJIn*RMEp5=fyVnDm@Qk7xXBmGzzs&QE9xp?mb z5b8b)sIyd-yZ1ZwMqBEBRZt*a5jNl*lHQ%5NAzp7Dqnw^z*t4N-0psxRyor_!npSP zd$i;Q$G}&y)ZJps>R6~hqKr-__Rk(1gy`KEXrT<0;(oNoMY`qj^9#$q-qx$0E5JV+ zl$&ax>TyLI_OrH-e+quN8uM<^i{SqGE2K!8xJ)kLe^&w?bWCjSgMfd=r|+JSX20fu 
zXK`~}6kY}`PXn8C@D+0H>0(qz zzELOTxv#9I}F z-V*CMn>ohv%f)Wia@3PJJr_g54r77+Q?~1#fRTv8>O+4Ea3F>~D<2BEcikm*xn1A% zEyKWx{nqOS+09vP$P)bt*dNfs@fOa6nO%<@pzIl(q2 zLcLn{1!ayENd98#RX}Qf7_A$E^TfQ^8N^&)`sx}d)aB5Iw(RT&YBm4@>FcY(a^QHY zw>p_aR$%tMKyRpRqp~^1DveE|QrKSaH)ge^@{R8&c-mgGyQ|o)`DJxr7 z47d(y1t0^B>~2|RKOXb@2g zH^t1OV)nPkocX3->bmPxA>^bdwLo~i`fXCeHm+;aUf`~BjdMWcBxjk(e#JU&1jC&36Z!Zl?zD z4O6;aseoN70qeNOk{DnEMy^Z1%k#E1_y+TSO2B-T*kEPO3tz1w8tbzd$-qrZ;nQNj z_11pVKFPnP5N@Tp4X5i)v-bbui2uEw3g&4;3?){NhAO*0Lw8tzL3Cc$!Pn=3d0pV7tZXIHLSGl!K0?y~G=koqUrL zzyl?dXFs+Np`3W^oloAw-#rSW=bog%e-Gb@4Mc%hp4<8HxfYS;obq~Oc-u>%L!yMi z9ZR6hQektRw@2Cz>!)fryT&tttmgnaeRes;BR;jvCUQ2PAsdy)!wLu6^$;?0x2r9prDU1QIdGxcL$4%MGfD!+Hf4T&uXkIG zzs#0}p+=_gBz`_2ZxduxHe-+tpfUIo`k1qr?xc>z@1iIKXtY~4^oNrnPIKbO#1pb9 zjS(5OLjnu<`3+`I_dzL=(4^!|(u{xFl+uuSbo`X1cZ2Z<$q26A&thRTtD0 zj#>{EGLYPHHNI(`KB@ro!<-%UMHt+R7030M4tnm;|;MD>Y+(e1<>9k!k#zM($Tne%mBC~2PN7l3}O;+X0 zv{p5CmL|c%1Fg-p9f7upF8w1$XVm!k4+DpTi`l@2mBGwy+80ha$KHK6Gt3p34<$Et zn`k~-)kF2Yt`HS{Efcp`5mYE5O&1Cek2Dv&h#k0@5{Uz1b}XywH2Y$@Gyx+?c3un{{5dLk=XWv%yj ze3tcI4t(?sv(MJM4dVnKH^$o0mfAf7&r%b+&8bX;(Zy=JE?p81XOs)wMw$DLjCNUD zmr5`mV`KQw*I@Nb2Xi?_CPi*MK*hhItAE3?&2dwNM~`HS7Fu@pTo~ z@RM|S_PrP*X+_vAEfw5led@e|+$?sG(dW^j5}U@{oY#?m#$zWpo4aVAl^*|z977t*d|I17y3=pWF7bF(8u&JP>2eO zREd1oL}&BQvY6$rhBNUy;koy>(&a0?aB#LOqqW-Jo`Owd!>mbRcS2Y@IbgGz?F`vNZ$l66$`?{H=@`$U9x*HiOU z19T)&!VPYefo17{oLK$o#n`io`GYT9UI}vx{O*7$q_b!P-(mjuXeIw|exoeC*ENH) zEhPZ{9(46^@-^pdcyh>=j~5YmsV+NQqxHLnKPJD*MUHR>kV|prE!=pGIh%$uh^m!O z8M!6d-Jo+}(|Nj3NZk9n+bz++c|;upvKf|L_^Y3O6F%W}?$`mX?E${Mj>O1vA3w<5 z!l3yFe-9Q|a;lg?!u2haq_rz?Hl?iBl5ZC@;0(mB^VOnYG1npPJy~SgtAr8%u0upT zZ2tCGFb*U1D!17fchOkQ+_z}%_c+1_0hW&Yx<4s_3r&1&I=K9D>`h|%`eO-ziCtWG zpP-Oh`+%lOFphy=uF%5`Rs9{hcng7mZB*m67Bn{t&TGfr*M{6{vjj5m%>u+z>jB$g zmSyu+>*tsGnk)ucA%E$We%{4~Zd#KKEfOqTMzlwiGR>62a3X7T)}I$ z0!a3tvL(_A8o88rQ8fGRWMS-JVobnzk(5+U;Xh}<1E`aiO6qNf6T5ge-S+0|_Ls_j z72iVG5~c35N#Xuj_2J@0bHK}-IM6`QIQLn~nQ(Wqb0TB1Dv;bQ9B@`ocebf_Y3g=G 
zyl8eqNM9KV$Sf*sCRh2Slae!;>WuUn1%4$V8UH77g)ZB=xw>W7g*)! z-Cg>}0`VT3TMol>a9LL79A+4{!~IFMpn=WDK?M94y42S~z-=xB zgoG^un2W2(K&!sw(D0bm5z=zf6qY(dKWO2)BuU@y{hHkzzKyx-1-Tty#f381AR(9U z&OGzAclW)rz0d8mIS5V0kEqRAgx8PZ?!CAO)q5|U$0lsX#I`Z@qNTF7YXR?eb#s!} zD$6x24~(l8epYu&lj-%c?}mmV)%uppv%<`J*SKwR&pke4`j5t$yrpNuqfzK#z^8L~ zW5(-teR4y@t0`v1yY4XyhvN#DHCGPOM5Yi0&d+=^Re62=)7AQsb=$7H5&*vJx2Ywq zABi}{}3Sz3SS#MwEz92R;xhFN!vDx!D|Maq&)nPMQ`>J#c zS$ly@S!HVF`ntA2W0RB5S=u+!O00undX!2d!v^j?{WRWIpacMft4@}p#f>s)ggwDKIz;Ygt3d~YvF2O2e!G*x?XxDl7&lg@EUB`( z%lGV};K6uML0Jf{RJL?w>MQdgBV-$lpWMAU?v(-a@eu5w$AD~l`w_R^#k=j@zgflQzIl6J?&YooEBOw( zHFvBbclEzQR(4|th0vc@6}n-ylO=N|%`1Sj+|4sGK7)f>#Giw!<+HN@&!HDSgWcB} zxHd?nT;tp{XqPixR&kxx44z6?-9K*g3U$S;tc82uZ9pT!kuo;#fE7Md7z%Njy@Z6q z&P4O9Q?Ed1r*quBQ|`U%%a7!zMpY-+3`nTdTqDPDy9eUYLWeiO?qqBJna$i2y&Y*)laujj| zn^qKK00rPntklhDy2JzfWaD za0DI&{A~2+?LWS6)DN|EE9lw>K>!qs=S5-9Yo*MY*Zm4|P`ANzIpB2e0*GHD)UWuu z4IrFw-GoN!Sp+?^*bCl$6hJ}= zLR5HY6>@!XRcyU+dp}6rM!6mNo~1bhyc*trvkJ@M6GHC)QphPBHFXWP@dMbk+s3lo zJh)mchbt+KUj7@v2j`?tHm?6v%5}o>ZjL!2XUX|=SA;UztJCDya`84q_SmKtBHCDi z>-M(rW`g9mpNiZ13x7o`yyq^g102526Gml~0=T2ft@jQ*DYpin()Go>=4z~V@zJ^D zY2a2KNhT@PM%E6b;%O%0h1bWJ?aeVb+j6U-;W1$%32C(XXLD(zPHjBFLFbQqtYeoP zoSI(FC+BR2`bJ0I8vF0%F;bj#dtM&-AQ3ljmWL$VsQW5m?;Rboi{*QxBRRjkpcXFB8@}lkV|{TI^@zfPKmdtCX=%zbqk&z-u0*LoPi9w}YW60;UTh=`j3bkLUy?Z}${wTQp=Xd&^c7&v6$WHUVgdk#Z;}gweL^rp)cnfAc zowzV*9fvR7J4g-hB|?L4);?A7kuk>YGBA;bhcgkOIam!LCFL-6xy9?}_O z+aVXcdPSHpq9~FJXG)_uDW27yfzN}qfcwsj8{c{QXUI!Qwx1y0Vjg)y%BH-z;Q$qJ z)QbJr4}RiXpI>-R=VTG-KPQw8^XsHof5pEZC zaD4TkYD^tRdWT8VXBOylSd*H}3(kU$8Y+-%w$RAMvs^v4&hRJVE&-W-(`*=V!5}ARu*O$ZSC+0@jT`R9NVXV zx5gd3anI2?hj*#tYzY$vuK|CkfdDKcRzHkgI|PU;b=^41|6nls7Zq1h%`0q(^P7g$w%eHPP_MIyU*6*$YJ(-!gY%upA!z~3jCmdU*m>e!2v z12-wPqINvF2psGPuu=A{l932)?izC--EJT)4-MVjFuNKuDgH5~8=vL|?cTUeE}>73 z3W1Z}>fc_rd2B$FY5vUZo0FfAT-Hm$sc$?B=dVy#x2sf<_#?mH2Cox`AX*nKPR^da$b6z8PJdJ$t^^l7jWHYQq zeVf)_IZEQPq3_=gUr_9uW*_QP8sKi6^Q=>7hP-j`Pg=t;7JbG)3(C~b(Kt&Lk)};% 
z1@9597STl#5PyyA-{{Kq6GS@ivio16N1fYgVi>sD-O-iemk6!p+se@Yeq2%;3xy1u zp2xeXGwUH=u6SLe&7Sjky!KTja1)B3QL5ro3xGn|0I-@5=?FV1R_XTm-Pdqv2p%Sx z_Eu`jRqS5%bwB30;+^Q%K<8by12~4Umb)LBc}BsrHHvKyaj%m0Z8hH>OjHPpPD4 zbjToRne>ei_(;Hm5<8wMm+*Vm$;R5e^&3hrLRmNMKiJAzr#sxrpHSSI?vm%Jku&Ow zZQO21iMYYV3uW5~_p0%qE}&A-`fXI>S@aaCeJ+V`-(WP6Sy=96SJlT0)-6inzE^Uh z*dZP;hVL-Jx_g~6xy$nlh9K+e?GB9p-aoICTf-A%AFvO&K5QjFjO2G1iBB1+55dma zkw2u)WuId{=dP3#bJ=1>MP-$mOE( z5m|eVO(rlf_KFJf?}aA+zZwfP5NtNjzzMUxM{}K!^m=fYkC{{TXl&0oItU@hJlG3N zlZdf9bat$xX7i?;>AUiEnf|3uQ2SVgcHP)u?S_w-Yo`^J=X6c>Ew}69!*l`frXPB& zlMCig7`{I$k>Ln`3g|zzNY>{^lqvrVe2R;l8DNPZ(&6%#u1APLAdZOaku(uy{Vn<# zlgSW9NLjfavlyAg{q4|mGyMf35L*xtaWv`P!Fki-dAOe5mhEm~!G72Y*J|NHiB9lB zNVal9LgHkXbI+5nW;|Rwv|BJ2JGN=rwQ!y$=*Ekf6F;>ivLd`|ZL>9rk+HY%XYw&& z94XtKxmzgRf!MwhlC(JC?VAU}r&;%Q_U=o)nkk&CJ8IRpQ~_CSWWTs8IpE&9F)!|M zasYSwHToFVx?6Xi8mDDJl9#m-OY3#ELH{29F*ml6hNZSa`hi>a9xeT9J$xt~PGfYB zj{N$wxM&+B{+p?eVvz9=-lO3zHjh%{@s3A!?~hG$Ffu!rv99tzRt)UJYU!B%){=E& zgxML8du0RN1+Z$-FQRI^@V_CGI$j0qu(`y!A(od%pwh{iW0lJb zwVc$Ghu$yavA#kmYnWl5NPI~`&wolb{9wxF5%SSI5*2*BuV1dFq%+=I$sf{RuD=Rf z_MV^P9;u>;_G_EoCmGN1L0?_ELf*-#bsb>OM&}lIzAmA2m~W-BLHHFL@N6(Sq)-~` zK0RZ?c6cpTHY95zWoCc-Dnr-3@9;%aZcSug0sLBcR(G*2A`OisY(1NK6a{VYKFfsV zyK&6O%WeGBP6?mubn%i9BxGsqeOrWeiK*);AJS5K#bDS4uZBH~htq*vBjQ*SC$IY) z^}snONf2EG(Kp{u=){2V1%9V=ei8vYGAwmbj7Oo3)&mEwJ?wzMK%u(B=*b7=IFqum zQ-Ze#;xslFzpS-FICDMrnU=jH?}sab#pGSm`-YZn_VREI)mSQT@sX{!O$R3S$!yvAfQLU*K&;+EIL&cTE#l$({Osci~?o{zBku) zmG2h`)mu#AU%FiX_=Sl%DpvjcHKF$kv-qyMcL%F}Y(wf8hi{8mX>Vh8R7~iuST;Na zIpxIrV5P)JT;#{-rxtDAM}Dx!e691BdtNxd?kCe*Pz~W?jVoVCb&8k{7Nt7%VN>z} z-9A_*FAj;Xo>&2)KAz>|>towkMONTW8n>w1DAOM;ibhfMfN|J#gzhlB;hz4dSc1H= zxt|+AwCWb_b;WL^WWfrQYV_7LAAF6VS?S}#`klO7qe*M>LS&u{z3z78mh~FAjP^IL zu6-B<_MWv>33M{SHedGo&o>45uRrTCjCC!#i&oV zxR&do$298(?}t?dcCBGNO$$rHJ-1Bv6WSCTp@(*}l_@gvKDIu9#z5SCdWIE*u3>)C zwFHw_WXri?Wcz24zfp-LnY}Of=aJGOQyRFjUZm&)-+Dmk)+a#mq3O`28oiKjjGIHm z_7jRn6L&v3JzA7Q9eEUc(}&Oo+^Tg7>|zjkpW{@f{Z@W|zLe1Vgutlr&U0R@*>fr6 
zzS9cO@;gw+L8W=S5F}{;2$fjEaEdE{J!iF!JYSSuB9)9uS)`U~q<@ztsIc~30iv?? zEv_vSsOVY))$c4TR7ScqOf;LN_uJ_@#8N3A4h`Ck&2}M2%DGL?I{$~MZ(y!0>bC8; zgO1Ja*tTukwr$%yv7L_Xbet30wr%I-yQp{nz^+vg+#X1Lx7Ra{k9YFA#(7ENMnLefr$L`1{KL1K4#lz=Qvz*w0|H$8 z6=rVU0}pDPMXX={wDH?+OkN~om0aZebBg{7$DqBo`AA0#aJWM9P#xFC3K(U*D>r9o zu;%^LVvq&ZSZP@q*5jer9Nqs)9AgAF;7<45l*wn*@9oGt99C<>r}jq@jG8=*}!K z`0ymx)(WL^!PNb!3}=o{ceiGjFgxSPjp~R~53&7gat_HT}5|*Bx0Fj1}f)Xe(c+^LWZb z0daR!PyI!ZMz@srq>zI zI1N=4xQ9bkm1_?L7ueo$Casq<+~jutv|Bep{?b3NTD2>k>7rOt^p}Q+w2J++zi1CS z`|mii0||GY1t%B-w?=H}Iexq=I37(lML9SrkH@P+MV9r3NqRD}4`vPWg5%V@Jk8Ka z;EBs*0@MZdyspP9_;Ooczhwy~5a=hpaO9?9aL7Dwh6JakS8vsWz-Ce>y}SDC*kzX4 z@vOFzDkhr>9~$QB^j93r?bP{Jf-8G?m&ALNaXZdFF_$Z#TOGuZOYXqp@wKZ@uo5BF zIFsc_!1NZ#8H1Y>q+eMlHB=N+juyYZUNKx(Ofc@xmJr-FJ30l zb5ii7)}2Qlo_kXk(;k+MT3_Snc0OLN_6A?Pt@160Q92RY-F7PoRRq6LTIhTf&sR7A z4ME^8rMvR}P%MGl9=fDY#>;X(nbEzPOlFFf%n4P@1_dbtuk>IY72Kc04QS@xpS+}_ z5@~v^Z_V%6=Ft=VNsB|xJ&Cn2wn51xj;Zt$S+YkM791myw}x4Aq~_K*3rhQQejSbo z+}Gc%=lm;wRva4$!m`k^Ks%5j`arKTw;fk37LPZDp6|e4JLw5G{t6RdGuMp*Qm8?m0M7+fHkJGpin?ujD(gl6ltO7OyyXI2o>G>b;_=A~+0^!M zwb+Ul`d&_UTp?#j7uUe7yibZ^fw+rp$4AtUHA2qO1XYRbA_G)1P@>3>aRnqBlOj0_VM@HQy1ED$^2Jue-MM!S&j0(*4^#8Q z?sxK}bNp8Uyf;g4F^y&Q{f)$Q2Etcte+NFEZbh78o84Oglp$@l6oYv6W6-@k0*j?8@#7MV ziOjYt8HO3eVmvb&RFZG9`m^DxZH-2Eu8fo=YzSAu+26 zas!3D%Q==c54!u?MT>%}YUu6l?Ss7WwoHS6OOK1HtmfV$&~meFK4C$w>!=)IMLyNG zhgbn|vWHtDItIiz%14+-tq7a+R#h%?5fhA{0h0cYq*pG6gm*_DAdfIPmpd#0DjS~H zYYcYq<83c{b{3cV_}KroOe3jNAYYw3*Dj=|uAdU??z4H&i|!z6{2f5K_y+l<^-X!7 zXb{gSK{Uo0LpJ^*L50fpejvV4l`(v|5pqCh?;$?k(Gm%%83$}K#&lIr1IlMHB^WvovE#(gfE_7xq{=5)rHrj}?VFgYU0!BO1^6&4V(!xR}N+ zdHIB3<%~NNd(WbGx!iVE&%4Jg4Gt*2_ldP*!dh8a6^+aLuhM?O?I;Kw9~6`zJbz5J zrKVk~SZ+tPd_~Lvb#3gkM`iq1P3DaWX)(KShCX)ntw&ZA5038aFD%Tsd`4E58-#pP z2)HMMEs3eNL#Mrd;S8D0?7bY^D%nru;Mq?KWJi|{VZb`{Yh3z6h8i8YJ}l=_)s{1| zkEHs>7LT)S|B5*$qqz?Z3SB*?s+~&=&yf#S(ESnwQ9GqP?*_uJ!v!UwkA)Jv1Nt+lCpv)Rp^k*#-@X zh>(}7K@mu|!H|6&ehueQ#DxW7(@Y-?E7iTtly2V*UhQnW+|eJMh}i@C8^q}eQgxXi 
zI<~kmSIv#G<-74-?@N7)d#Xz}BhP$S!65F7)ML~s7Hnvqipvx#&|%Klc7qvOEG=00 zrY$ORlwv7IQX^0;%~#DNhcw6oLIH()QtvNnGawsCg_oF7y*^UYodRI(j2vG=Z$O$en&H~ zZoFc`Y)Xk6Hrqk!{*9LomG0Zyn^gv- z98n+=O}O+k@B4B&aWu0txY5Q7>o0g;{+-m>^bR_2)qvuavZ(;->$bl6jJLO|BXvDl zd7t^KnEmfnTkFA9s$gIb7lLFEP0~*yUq5q0om{Z?p2&3 zzwCZ=aMp03S~iM4^{=M2dWn7DqUfsDV|co(P_m8CTVE%< zSta^X9KMUdA0w(S7tGeN*Xj{)8s?NJHIZVo(GA6RJpmn#)`L6f_&CK{Wi331&`L;p zYq+g$6z6FxW7^lF>dFLo=I!JO9#kiWhBv#WOW&&9%g3RO@X)MG3Q3v%)k`wrtv?5sdVvu)b!ka@@{hA*&zDUCC z-?zB4@e_HY>^kKKe(;fM8y#P~CFentS!8ez<*5KTE>E6~O=RZ(0^4uMcNKDhS$tW6 zyj|*3V>!W9&n24@=Eo?40pK$9e+tj2&AQCM7WFC9Jn$6Ba03XDa!FWiK~~fax=@Z{ zqMmh?$|h8nE)DP2#FA;l9Tm9l|)vZv&4XX8yrF~N&|(OE>c5=8UWN>SxL0T}r3@a7bwu1bY{_+eNd4NO}G=kQog=kw|zW?#&U_9R}; z{6fj8)hJOD90w(aH<5z#6uYeo-9I1LWfzS+AzK(u*V~vlb*|Z{kN8_iS2)8BZMOHG zi_wNwvoa_xge$j)O`!;UBgD@HSaPZk6eVs-g&Mn04KZUwG-WKQ+6SvtrlXS>(q;#U z@|?ocm{0nr15a+S{7O<2uyWl=1pBL#`_sPTIaT`t$pQ(<#{Lrp}UkmGvvNBt(=n3HG&a2@zu$V9%FLjnA~bXwc> zP&q?&Tr7-tWu)G$Ci&<%_4wjy`c~1cXCB%<1=?G1N(8bTd2hRkZ>4*HjplFO)w3pO zM#_Evb*yapUA^Sv49kHx70{@7>GoH){6YxGxBNmKOUZvdl*Mm{!KD%J2W9@R@XNrRMWz*iTudFNk4`S-PW47Ey3nNFzF20D+cXS87ev<&s> ztvW(}Rlh%B^?h%;xRk2hoCVd$BbOZaRf^@pJs)<@(kk*&P&!_BtWScLAk}5NE!%ag3{GN#C&LYj zW&z*^1K@ehY_sP zM^Vfg2YS5nPa-=LahG};T9tKj$TxRp9QxzICe^dEu^xsYmOSh*x1#otsA8X0 zL>Dc^vSAS@ofbojtEXx-k!p~o~2u{dlkQA|!2!L(mtT27Sc!J{R6JFXCSZ^u-6SmOJvo`vCeHWG{CjNb4F^R8i})MP$$4g3eJuovi-*O)eviQ0g0)pwr@K+cv;AV%z^SVe#YY zPKb1uEiYpLwJA=^k$tDv%WcnjUyo6~<~};!Y

^wKU%b!pGN2@OVTU>?ed(&3Oa$yw(fM*Gf`xKdFXF{7{VJ`A3Icn-^G+a;)3e zB6-r?cC+u}K=iuT{CCL!*d^OD<+&NMHRs{xKSk?aL}jo@0a5 zXfLS6EakO_x8*yftlx_5ck9A~B6uL8&UN9-f%~KeWQCwSTF97ovsQ6k2+FQ*hs?0s zr_LNypJyi{CG05;d5(Y%7T0ak?J`WYzP z$;Ry1<0)bxl)84lx)V;;druoKSM~lf13urn12;U!t9fOX3tdilkO~l%-tY$yp2yv) zF*(L94#{lC)-A5f1m-6l9D8-43%$<6|daTlH{I}DU_?qTF?7%kN5f0 zHT?X*v8Z#{Mmr_Dhzmp-@($WOWs-8$9ZQ5UE zlwRNU6_R@ULg4WcAYENC-ihr`D;sTi#A?lsMcs&X6Tv+lL9@h5gt7)Rq8u0+cx}EC z8IIalzD0IWf00ha>A+pl^;cQ~DILi1hq_G^%v!17Bc19*d_Fr^t)RYuG=uesIN^3s zL?WfR1@^h8!{Q9yTr{{GF^tk_HiLK7b&M55Nanz$Y=%kIuVTp=$9c1$N}|aNqpK<} zQDhWAf?0L&$n`73rk4|Df^61vda{vO8kFg=84Sb~3-v4?un!@*!2hA{<6QNcmnLDP zJR3u?q9(xkeBj)Vc8p0ZKCJ%sF@Zv8`0Uf2hsD*5 zGWCa(^X&oIq*9ng36wuh0CT~@!aS_kEZ4+{M~xuq9$fw$Z>WhR<|eW!0`P^HN&4(( zfkL3oxIUxij{(4)$&pMGzq8;PxR+8oWBUN;CaaKY3_WCtkt(w&21tUagTDqqa_nzG z9(&0zL=*BuB;w4pACAH9RdO6b#`~MzWoAMJM#Kw^4Xt-^bOR84t1A2m-{*OQPBj2+ zSqe|<*7-nAL0bgdLNGXvn4|sl^ETk)1K_%F3=$=~5+k{{lOk53P5HgGO7D4Zp5;6%k$Dg%+l5wXwK{L2iIL9ld~7!x0BaXq5AOHK$EPoD-eN?V37wp~^q3wO zt2wfyZp4i1CarfkSaXUQ>j4`Hx_)T^n(LkR*bJ99!Cp)63+vD*|Ai&qX@ zUCtah_fem(HmZ(F2N!D96{QaS!SoSBj(VVQY3(eFI0XrVI?FJ_1WYde4;=ZQHg|ig zt_P@8Yd7|bD{pQW2zM5cX?m)&L#=ET5W3;8nfy_aCwX&ZwkgreJLD4dTDj+s&vn5B zcL}Ismcyc?!`jgB=%G&fPrb4k1$1WGZX^IbIOhJH4vfWYXQBJz)Ijr}bxrlP`b-{1 zVuhVhzg{48gYF?5RTMV^NP!wH8v?CwGeI^FwN=D~<@NOpui4%-?F65+=zcdOmS~G2 z{V7$oB1xP>?$A#mpVotx^MALNd1sXhUF@ET8_bl2p z3-eU_v-%>RKCA}SWXB1-d&aK?s5;$DpS5MET4WTTo)z+B{1^pUKdDt9Uvneo)gvNv z6`H3b1kn)+`*@XG_Imd2?AVv8_AQU1(QM%#Z~gbbUI2qBF)uR$h6L)@jOa8pVqKv! 
z)3vTT+8@18&omGfok(Ku;C9^&m=$e0Vms(-c|thV{&nf8mi-U=Bx1W4A!4t=bEJjv zc5Qb^t!FV{b=MOuHw-rF6=i=iY-W&YG8f2Fh4)W`LbDwng&G`Y$-V>yCfe7bQkX&L zEY858YkPvf7>@X}P4Gf6wMe;qNMMIQHJ&-q_4$~a-5)p4sK_GXuY@+VKd9u}l{l8y zhLxJl7^}Zy!vtihB!73+U84%GwQgG>FF>L8ol21W>OmQ++1iok*7PK=dv##Wr*~Tt z=1f#$9`aAvn|?bEdEVBmIA*U7D$!~Zj1qO>IGpcUHp|)FygI-o!pR2##@-!u2E!Ji z01NjZfAzjy`9wR~E-6&3u##KMrQII<-fat{duZVUy*or`PR`baQYiqXapOQnIsEDw ziX~Zx|Etn4_%BnFkZZ=?GLd6*k^0 z{42;i0tk|gV7B?qB0EEFH}ELOz&;DU7Tu?!rV|p!`})%?*N2tUW%hJ!f@=8zdPy&X z?1NpINEhCf3^sqTHr*2k&bTeynb76d{xN)MO)PL&Q$_H1!{bitvX;+yLI$TWhR1?v zay3%R1YrPf9RpQo;CF9(jYqK|^)=bK_UT@&Pd$ z>CUs9TQnI{fkDUNkpN~t^_O5(hPNI#(2%)eBScKXh_LOdP0L=xaflSGO1kuV^ zw}SvFhS(_ee_s;c)!l6tAktd?#Sc(Ln)MqN_x*~HszC3K%@m8#Ftdt%BkPV$>UgT| zOExKl@O*!k-d>+gYjPj|nA%Mg7>tj4Nw1LY!+J4~_O{2Dy~@_5%m(I|mz~Y8q_Tf)I-tjwwcFiNel@P$S9=O68y6qipRJyW5pD zSEvM4$0pNe+3iTyG!ROwyVFjJT`K7npf>HbQ55`$diJy|HT~up-k+408>@sn(a)+~ z_j6OO@kR@o`hDa8SgBAqt+e#Ls1y+25CRUOOuJFMl4fF1!PeS zAm0(&b})0iDq9udxnMctA;WMC@-McRf}w!HMhW?5^{jyp8-rp(9 zZpELkf*&6t15`t2#ASWsgN6gq()vmEV0Y>x zY|Qj6W`*%cnD7mYd1rf7y&^-!aBG4eC5sKgIG5)u3WWH*h_jGFIH3$6WoKva*9Z!X z=!Uj@WA0eq<)XuUH|8z((3Hk*^ebkg0Md>s9XlYhKpm#6v5XCZDR{at)AV;9YV>1AOUF=gi;TIom z!awCmBpK0-!gD<$nqNw~d?$DrZkD=`T9cS zuz_@8KU9RPbF$Y^x1Y5+tZ-m|K=mTfZz6Rw7|EiIIrEc7i}^!xvi}=s(#OK^TCiCG z{q<#3-F$nUFi7`oLqcP26;Yz6@T@jdQR{9OO#lujwE~A?+h5Uw7Zj^R*PC-xg2ex& zCpUE5o4K_cw65}XRrF~zvD006*{v};>vsJ-@Mgc1?Pe%wp-X2Hg%wz5mJ^yEE5le|q(c>2AS(C_Lx>#a zJfNZA;3^oVNT?JDwtg-qEGrt1A{k$()-c80c@qV6QPkE^uPGd=&VX4U@FQeC02pbY zreun1#Qs=J_cYfe-AHE5D8dgsoAM(qiqtK@&_gsJ&R0(wS;ZU&Z(~m~y$3NGU$ID{ zp03UBW?aY%$dNSOq0-`yVcG~4(u#BZwhbdDMy>h+(cF%dmXJdD5a>)q8!nF`tnf?| zY;P&@AQ2K%PMw;BLws2X10&CO;4eb-!K?u-6Qsfuicn4Y#W_C!L+F87Iy?dw@P4QE zrjeq`ZFMgH?GCW~sI%#ggyg2~#Q;kGCdgaeCMkpFi67wXc?z?lM zRt7j}ijfZPTEX-0zMT|@u6qSd2(X(XG25I%VhqzqsUg7>q_eDHW}0+Zm{oZy<8h@D zW94y>hAD-|XbJ$M_s{9nrd4Alv>#huGV z|J2ctG74#sYVjW?eInX76SU6;jNnr;WwZMgmp!*KuBfLq>Gnt;qm5fZG>)+m+L{z@ zQ-uoondsV?|HbV!!NE}iM{M{}&V~>pf#&HhnkI^9I369D6mKj|btV)FD=NS5qwYSM 
z>|PA2TfcOB0n4BXO)5%475qt~IlB@>QbLmgrw}2ZU>-dQl7kjurbBL;HC&4y%DjTypV z!720z^w^^t@DPG3dN+x(-(Yy#Jrwai!-xvVo&xe5U4?gu<^(vA0kD@;rY3q)k3PZW zR)(Cl0XprNW>X&ULLJ2|73z@@X7GJwEB57s#mg^CcaHEnKE&6H445uM?GCIVL2 z(-j9+)j#Ydx6}-__jVUKQpx<19ar9OKYb+CJ4>7RYuTi6X?D8nMmfm+X#0OZF%~&E zR_ZKds{!OvoeGgoUL1!}=!CVzurI`s{4&C=MH*OlW5W^oZDkzT>S7`68xgVCRqqUC zBP2fYdbKIgpKY{g|6W&Mep7P}lEl2T9B0d89XKC`^VkX~3gNiMmj>b^WwvHy7aP|` zMbfA5D*i;fixx6WckH=7qC#pzpenXhw2Vb61k~3Dlsy{_qP^vgJ*e1IG6T%?75~D0 zrWZ;mYVaMYc$h*q6q43?isTThrr4YE4(^btq&*}d=aUE2*Pfu0)6xzZ#qoqLR=wP` zPfW(gzh6JuZ%lUw{#z`Z6+$0&=I=n+5oo;lLLy zJcq3LMX}1iRHxkIWSuGEjuE_)0}j#_F;cvn;w&+#x*3zKzEPs~6jMaPh9>eMSyQt) z#j?I6>viLe93+3EXt9GD8?A^v9Q&9``z0};5xkCvmPgU_6wyjXFm8$_`~3Pr+ppm) z?drRl+;pIHFn5Ag6T992QXi#`)#OHfdr6}^0!Y`)U2Kw_w8>dXmb>)f?VPW3juTuNsOCqArpf zbG+x$aGD_uN)gl)j$R{BM+ybB*YO=|VW{Rj*d4JoVk-Vw9INC<8jWJ^ON7J$n z%2UgmK@dCfUyF`!{w5E)dLUaa<%E*i>K#+9@%C@#s@;n--`6b$vu&t#%_Kb=%?8M( zmQ8#S=p$h3(RiW#d78HdbVjC+HbW(9TqhS+ji}kTMBUxIPuun`pFrrO&YXu72;$t^ z{EPBh{zTjHC~K=?mRP#R+k!9_?toJTBWtB$xh!S86DsV9-8H*Pv><#IS}o7Hk6Bvx z9TjrGB`z~hEsRuU<4VEMcfDXrKYE}|T+K>R1&VpH|D6xM%2mYUUn4acZ0pnUw+7eo za=~#R6mR#`1af=CiBlNARwRE?r zv>eOPxsN?;t}@eg6?a_;zKW9V&}sf`LW8@IL11h1e(kiGAvzVYBhQ&(7_tjO7l zjR_IAMqhFPSpSDa%CN{fGD`!mY#_HY%l*WI$hcz)mDb0n)i%?;=FtG=9Wp_O z+Ffi5IksY8{UIFg^IHII|G~YBCSF|w;;u-B@S$U~TB8PEa&~5uoh6(B zmg)?nUb}v(Zzt3$`{O9i6Nfy%%?P;C}4@d7^+e|5_JHn6R1r973f?q zSp@&U2hhA5>&wp( z4*&X{&l+_7a+GT{5RSzD`Mk#lX2XQW^dAbOuZDllq38<>EuPEIj2j$TV3r>mDeos7 zI<@AmZ(4!s@nRQOyUSkYX*5?;WJOr&ZjO@kprX4YVF?5eb0%3({TU(qA)E7`9nZts2mx9AsuW5 zVV$T8=$6Q@1q+R>Pz>N65^(ptIXIRd+{qP7G`Xd|&UO>@<36}`d9*ZW7#h}#8*Ftj zVsI31PXb?)ZYQWQ);;bFO`N25cd+E_7&&Q^>xwMSl>WdYn->ljF#C4)R)TDZh_>B& z=*7l^i9DyxftK=O{Tkv;d;u3`@*J537jfp*Keg4Elw|*=D z++i%kiMi{E2;KfBlCES%VlV}oFT|V>at1QXB#2j0D}Z<3`KOp_C=ME@jJ@>TMj&TbeJc4tHw1Ai4+(vfpu42`QjF?{E76V} z`bc$DOg;F~?WZg`oWRvC$V0suWXSmzO;BWZtqo}o+%csG806m4`n4S4YTC;x-RTFt z-v@6i$Jo3We@owbMN| zR|t>;RTr41#k+(NPQ)m%`!UA_jw4&SrC|SXE{!p&Rv*oIrt70rC#4d$&GtVJroLiD 
z7~?3ozM9g+`^nrowTt>bA3V9*(VwkbkBvOUX^KK${+#q%#s`M;<1{3Y?CX?x2K)XC z2x(G6lg1v2z94zTVfAX38I55C70FTU%lt7D`qfsB88q@Ov3D4#5N}d)B_&nfa_)v< zWJZsa4l|BlJVW$s)+dzzDoor2u3T06y>T-HG=X`B}B7$og` z{h=Jb|91cK9!b&Ve#0~va%AmwJWtRomdv=Vm!J-A9FOWP2xd6e_n7gNZ-yZ+_?LWh zx=HZdS0!^rK#I5$1_IN9ICZ5-P;;N=; z?T=ocST3#Hb0dtWy6#Fp!=tk=b=^CQcIRAe@FeobRR2DQiMlo`m{G;+jja!;9VFd- z$w@Mur%;=%Se7h;p-VpOWsV!$>mb_lLV|Iv3-vYL(~@7b7adNr+KGUg?9j$h(PJIQ zi}SnTvWq#-bu_9Dg|4?1^s7n(8L+!UfJ=|HCowKHzq6KBlQ~8%M^VJDYTwT(=&?AB zhr|!;QBRJx|HVh^koSK0$Xbfe3pQyeD@JJuP-3b4MrC7~YxpipwkN}GcT|=0YUxh< z^AIx^=DCP8Y-(emCQ3pn#JRfm;deh}L77yVF+HI$3S3uz_YALzvn7XJU?M!d=-(3*<|vvA6c!<<4{jRPVs~;F7%bW_14=YP#%uQ!voEK< zVA8=<5++!0DH=qtLN>n#8(Z%Df413u$?zMYK8(%o4}YIE7L*4K*AR?Cj9~$1XE|DG z|3)ViL?WQcNd`zf`BLu&VZ$N3D%C!d>yfzQ#Y;|vf}gFIVbPe{)g{o&Z>TMeCUK46 z>yuUE_aYi+*J6AAQ0~QmMyHXPO<@Unrtd{Fj4?THOAN7|BpOaIbkNVqaK8xWAk`UHGrq~WO06l)K z{}4unS=`-IGs`=VazsU@mi~42$Ff_tq%}|fm*u=wRCc@D{*vfDK`P_R%%)0iXGr=K zPt=N&uHZ_6AC^!dY1MSTmfQJyAzQnvMGz;Unpq+@+SCDj&6gA8=8B zz*bovA<2hbL5~RRLFeUj%>7g4U&eY_)kU&KSZ%pc$V_Aw-`1+=0d+sl6i*1**A1Mm zyAOVf_b6j6#H#B_^rqKkY}f0P=-QA=(gho(8ub8x!l@RwsqO;*t>6ru zzjwMqRaJj&)2mPkR*}{Vs#vZoc)=Sllh+D%)lL`TJv)NH%W?(Arey?wGT$){a8A4s)64{P1256Qm4l|I40ksxs5`C+p`WXp}b_@~e;fVCyNzoc_t z)s3)U^3k?CcU=KYOeRfnltj?tXa$FdZr4;yFH3yj?H==yBEh(8Hw`BwVuS< zX6bsCEx#7LZadyz#Xff;5qr5lfo0`(lkwBM=~slalHD8e+eB9@i$r}yqJl)MP_hiE zvMQXMDm;?5#YGZW=|dL`i$=U+hC1>kc?1KmNHv(tTu}%U=Nn?&%p16j=K!?WJxj&7 zbFGR1fbIrenc!qlXs|q^d5OtPd?!;)`vq6Ax^XG$L^x$77=?4PQ2D6!R(v0Z;Zgg+ zjMnk7XS8j!9M(pm_scS}UkEe;brWI7gB&Q>s43~LWdN8a7l6UbX@t%^k@(ST3tMH# z$Z2-p$gWNyistAXp&F_p$S?9|*cP;uA1Qh8dt4EK9Y88D`to-g~V9N@>7qY2q_EMC6HTz$zzHYD;_f~#)TX`n{a~j zR%xb)$JijN_M=E297_xP2MdZzQ5YJ_cc)4d)It9ieAn(C);SX?4O~dYD=&EA5i~4{ zi(}JeD`}EUpsyphZ}E02P`Ry&`-w`q8)MExwzL+hdP5slrz*3W*L(K6HMf(0PTmtz zE;+`AdR$HJ&^Tgih;mWz;JKAxc6?3uUD!RFMU=Bkf6%Iv7a+}viX$OyXy zfIx0&_@Ub?4_6bQ2RXL(5VI8IF#GA8AWY`JJNnHvS=`d!^%58lEnqF(F1cKLA*(q7 z_&)4ZTxV+Kl!q3=Ihn}~7P{$9A^g56-0j8=aLxMqEAw|a63yhf{{>^$q$F(fp3Qox 
z-qoCK;I&)9O|!uk&0Fzfl4)tLU~2+Yat*CdF6PZ{5np>fy&%kzD(lJ0ReX$ z>?0Ra90$2ePTb2~>E4Kh+qR=%nBE=vBzIBL2)W)nr7XvtbRnMo0t%*^vBYdQg!acL zHLlmGct2SG_4pdbfsa@*vfl2>7G4}vF5*bs^>e0IzQ~{?x66#Z_N%Axvx73Wzu0@k98hV9xhhjkjo-q})QY6(4{C%CR&3j@ z)M~VYap`||2;a?k3-z?&kDS*6I25~@&pjU%lc_EjOwq@aAg@r@KA;l8P%tI(grTmN+9-Vab62DOvu1nVZmL+^lHbd~I6I7|u{;pNOvr(=B3dPN(6n{ z|045*c9yd>wX(7vZ30?+yI*Xk|2fW#y#y(zcFj~pa&*N?T?4m@)l%4)!|>UwO9ikXGluHLDEU{_o|5XI5m7%ckvKGAVp!quYUHGL-F%aCvn zSOh`Wr-LxO!p34|vMHxCkXHzyz-B7D1I(>u6*=4kje=~U6zt)NHnQl{XhE{zR)JN- za}>;5fA~&R#L1`K69Xlv!s2cdCCHk%toQMrEaV3`K&E6ZS59Re7*&2BAwVOSCYuZh zu+JUlA%N}@tNc9KQwE{XBuPqQz+pTk1?c_C&GJaEPa8$%ftJ1^Hn<<9V7IFUIoaro z9h*#t_{>BPw^1}%wMTu^sy@;5dT2pgR;*~WvO3#mx$Q|ZjKPdB8#Y+~o9|6oc12zW z=W@<&`?Jy{eOHb1G>`d&8pi%gt32wND4t<1p*pm8r2L;*eS!JG0uG7NN1_?SU;Ymy zGj900&f36}0DWhE9O*q6gPWPOm=a34`){^Kf+~TyvGaf^+C^ch0vVxAG&53Wr$7`m;ZE1|-{Suw~4*zs_%~IZOSP@7m1irWEJOC86O(;6_MoZ>T#bfOV(2 ztG+h=EP-eKkQykNzxl`tSjANEG#{5WJy;;Oz8r{YHPn2fvi9EW79I|2L8;TheyUGr z&xAiN&5k9xxY&Sj$YzS)UmRu|*5|r$tt4o-!(g(A z^L!g_qrmpa;tF^=lxD&IHhev=Z>UpouoM_yePWovoi4)U{9*F~qI9BRhm)g=p$6^w zu-aFH;M8}5E?d{vUE&vM>f%1AZYo0Ki7L2!w%u#4nu+$S{b%QH_R8}btZtYXEf+{~ z4E|snm`Z${!=wEaoRs*i2J|&wVG?cvIk_um$vG%^-t70-_CLCluV@HtbffE&LWbx0 zvAW}IHioOg^yec!vb|3Rl=A0+&Q@r*l-_mIma@3pK|8`v zEys2H=w|_k3~Gs`s?=^x_gNu2{eu%OgI$q+WNeb54Uh7wNm@N-u%}YBa2O6ZRQQ!V zO|1g_uPZ;aE@D@sohC2efPg~D3bIURn9ufYcig!@ukaOQV|g#=+a|BHVXG>XY|vrO zw$S$ku~dGI1h0=95IQYXqH_g;jmPs7dSb(SKXDq>JbosLAu~1$yfIJ-f@ue?#HDhI z;xs^sHM;gi_*l5-Zq%I#Zhy{${aMTF(klJ!u%U)>%Vs`xRU7bEav$eA6vw32=b`Zw z@}fjal5dUOmk6hk&i7&Ao$+upuC(MlmPXO5h~{nk7q{~S)^?bpk^!PD&N@x0M%f%QC`7lYt*$=4#zqfR&cW4g@rs?14ULwT7`zOdweDa;7Fx~{ z?@*Q7dB~&IJn1|v)ple?J;&F;_{kiDTd&r0AM2TIU{%Y7e|4AZp0KHzwMKXInWbVe z*4DQ7m&&-WO+r`mYN;K@vQZB{HT^p&@qOp=LI}8$5|i(U?rio7>Hc_T~QJRJf7<<$FBcIy$ZPNs;q8#^f!N z;xugIY0yKf1(HkG11QxUDqn4L7^u|8h!6f#y)Ri4>A7ZD!_!f~I!2@Z4^d7wJ%+`_ zJJBqbm?@GAK=mByP)MUys1pWnuB(TlyCO#>^TzVx0u_A#9vhncSn}hSdLGZ8m_h!M zNEXc8-|g~5B?TJG%CO@ZEU+?}tRioZ7i`HSg|O&9Hv+1do4KyGX7~@yjYTJzSx*Lz 
zZd`z@wA(ftHs4M$ow9^VHIOM2>CK;YHZ2b^Hckv)n}qY`>*zdQZUQ>0!}s(33>C>- zp$D^n%hY4lIGTN0%aI4M*EG>Ns@#NYey6L@KvZQZPkXF{+3l`S;I8FJ#n?=+H$R*P zeH*WbY<4sX;bE6>1HNoA*DS;8s^q@|wX+8NKCyw#Y`m%N&Kt+eiUrZ0MOi^kiVH7d=DGNfk9IsCO|o8GhYJ$2lu zS))uyMzkHnPL^ZqYCgio0+r_3U8^K*7i*`ozwb6YIvxkf?iWoxc4i3b#9M0)#T>pWvcI~zdLFwB zcZ{PYai_VA_FG}6D_WsVBE#yf?nSyk9)bO8x&9rclDERzzG?*P^qpjL#MhUk!KqW7 z|4{yHf+qLs)5$Q{aSY*8DpX%_#7}oCBc)}L{8SH>z{rYc&vWCs(bV=?3@{&h9SfjQ z_<+KV?$2pMipX|gKt(Q`zzvn~U$uI{Gv7eo2EpTh?5r&gJS)CzkRa zlHY4+C`>b3QyR>QG6%RM6qTm4sBL}l$p0(_zVQ_vSo4bI=RE)TPzhHCTU}<@8hP}Vw_$1Yn;v|ox$DCGMD`T=cTrFcrVDZJhwM2Pue*=`$ z4aFEl)?B{nim9#t)E^UxoCzLPthG>;Fg9dj>W2h2O#|0wM}30#ZUm1XP-c6s3s>NbexMNhhI)5{iIG z2SIvAdap_7p$JHmPJqx$C?Rx0PhNiiJNLc!&NFkqoKNS>E^Du6?X`YxA4n>7@h4GF zJyrfOfN1khm6sU7q;lEKmv~PoI(9XPnQutih){_fIbC|bcD$oWZ99pOx(E$wTspKp zaj}mQIAE+7vXT4&VBuQm5XhJEcKX0oxHFgU??1r9Ba>BuHpl92Bxc5AkBF0GI19OK zgB4Y$-80E{2HV|AAse9R6n-i85dd(LyEMbT$o(ttTHt6agI6c?@^f8CA9Y@$dqAwC=4xhFPU zDBvNKSA#F4h~xl|>gB)QOPDh7bEdKqc(;t_9BIg}+^C*_nAc>V$7a&jQP{S>67wJG z5?KpJ-u8KwDhU)QkPK_8K;GLByhTkUaJxBl%bTnz4ETE9{?}?!7qqv*g_Q1jrW|3z zF_>7-xpsI049Y6ixF^|otaoW;N=TzHNmOnh7Vxt@4U#1SF1ED-EGe_PLk0Nwr({*x z=}os-jBKdir_A$qF(8|La=D+0tahOHBiftJPdIIVY!p$_)nU2c_sc!Cm{2Kgyw~%J zgLU-NAx9{8H9w`#(TLpCH8C(;VIA)?q#sau3Dui3M8 zYl?%j&XbD{c}m+2$5~X25ycJC zT5C18H!V(_J@C>}26S6B&MOLp+^@!Un`B$i=*L>1#iBvTqBn5)`WO(cC!yWK)dI~k zq$LTD66-m7XLCfujMn#AtHxz9qwOZXg!x}?X&JW28?D@#J^FLzqo^SKxW?qvzuS4f zTV$G_zx3X!miExUVvhI+_0GYki6JX7k_3Z_r5_VxL`#EWRrIxq^k4-x$Ib1TVqCr} zeZqrXu2{LZM|=8@%_QD#Fq2bs+3y_?ul{X(gNRsM(r&;6(dwoedFFS+XO@4epO_to z0#^dw@Yc1Sur?D`=7!}NahSfDjY2qp!@di3(ZL0pN%V(a8Bhyod-;JtpJCe7- zEQ(WF&3JXvaP;FDakaG)#D2`tr}W1$1WP*Jcb*12gzcqVVe9W&6yA{8zSBJWN0Y-B z4{qE_JFj4LtqC&3mNY~@=yWN&jMFJv7k{Hya*ZR}{v70gJx#bE5d}pza=O~lz9;)RL zdi0Gwd^qRC)X;6_fs%;j$3VvMQW{2HqtN1fiocIm3I{tScs54$SN8N}WV{w$yx`-L z6L>#pjC8hQS=nPjuaB&;+6%VBUw)wHZUZ4}nPzX`0(<=2i^R7dn=VCYYK|+pP-D;N zx5EU(G+-MJC||34YcDD%2Id?66S&&O%!0i0DN6JD7gD~9pRXdPRjpahhj?Uc!y!Ag 
zsHKKOvahWb@ybGpN+u0aBVfpB$1-qnlk37_$W>9e2)dbPkh3ytX7t)m_T>+VZskQI zQ{AuQ4AcRlOl|YZKZ@ZbJ#5Eki>6aiQ!#nqT@D-bWl^86?)`qtlyiu~FoEAnOgps@ zRbcu4w`+)Di*VF>lk%GId1LH}sK)|A>yUh?6rcL)%VI{e)cwM+zyj$%sBsKG3w*nV zE%lGmqo%So*Q6YEo`V-n+q97dB;Gn7YXyF!zq?b8rBVA5eWt3^Cau|j?_yY+)68qh zNW}ev0mdjP71m*2s%)2A0Lof2503uDYFql-j&AZ}wk)Xirk$E@Z?XtV;QfT$C`@5} zt?6IKOWlMw>pANWC3|AXs@qLRhcy3b4YQkz5*%M|uHcR<(H`&PK` z#d*4wtH;LL?S=NyylsVFG-l(YzbbdvClq>e#8veNR9o0*C?4^HpgXRTg8aUuOI4RH|y0VKi%co{jiNR8k z&Qw&eC~FY=w>jG`4Gvbp8N9vy0#EA=M^~?LwJuK;B=`v?I@&lG^jG6Do2x;3PtfCz zv@R9~Jmaf1^5xrOtruG%GjxsLIBo4$3~}RAvUr%8V0O&H?eEp<(GB&aZCoD2K&lPS za_$Uo7H5uV#wx(w5+-zG#Vi$@rqR%Y}jh<)p;3KFK>4S&gJ zHvUb=i|e_hCwnv=@D2{kE!Qi2Wnoa8t^KBA|97(XT_1RsJevtvGJxw)aPS!*^pESubR+Bjp5n|86o4IV=g1jyf?>-o^n1FQ8OT3 zu&*X$j#;4NmHo8r<=_P%Dz?QhhJUObk!#$6^sAvsb)O1L^SHt1M2yrm_5zZe>BQS4 zwer)%q&}18C$vXDfvh&PtrmZ&@_7vzzG9Hn*~ipbN8VzWT$Mn9>A@covGi_t%ht@e zvb1~Dtg(>s+8;W60`UU)a!^#ass%=Wa^b2f_}xo2c3 zxFhWIgjkulRMQ&@!s1&dcHPw-1l?7j^glEHYaRcOTAxcVTL*V3lT>Ts9khEJqLAJY z_WJyKq`k`~U4R3uw}Q3T#f{WzqzkWm2SDV}JuBYjG$%LlvL*B`Pa`gvDp-*psz4YU zGx8WPSxty9|751<#{5U5Y9QnF-CT{^S3QKAukNdzx};`O4_b@B6># zvQPMJuQY0ER^QY8HQIDnTOj?wdY*CkxDTHi%C8&Qkvv|qvSlw!-{~Fs>4f&g6kB=S z-y-GnX0Ol*%})sF&nJUei)`?R?A0oh)VR= z_U81B@(svG4hei$|8}Ii=Ubezr$?EH#J~Q*LA8-tf3Aw`Y*TTIQOleiC!Hdf6j@fA z-H!m89{q}Q;#47CNqC;F`&L=5v!p+2^#6K3OYQ9BTz%(nZh~U1bI2S1S>oII1{;3| zCb<6i< zrGRrim-S>xt$&$*EM(#;&V7|?&jh_Pd1C}0i%68wAO53^+_SbF5)Dw^X6AV$nl2#f z7*0CwShKfl@B!Cc!ZX1_yGp0tKW-=8pEd}c(T2@7uUk3cOgnD;14iXKr=$H~ZIQQNAtH+3jtV}^_&-}8o z5KqVv_Hr#>sU@zjgK~MiR{1Dsfu8;9fF=hRG|aiWuT4QN2B%oXgiG%LUmWc#Xh@d} zIu&*p_oV-zBy+TW&{_145q;hH{Z%PSNgui1CqEnYj#5Py8|PhC_lGLBjqnoK~^c^-|yhBX1yrf&AdR|Ux0glTS;L~OPXd~ zC2OZH$9J{LmWXPd6R$Zd*v}S={jQ#sU7AX{#MB7uE@HjWEg$PuLzlJ66u`~CCyV2x zm=^rayBpstD#D7COKIOnztJm-k< zsmjZWcdPLm#QR5`{B8^81(I^DO&+`z4B`|S|A61=g1PobS5Rh!Bxj~?X=NpJe(Weq z_6T@MY)DNtT>YQ}2>qe*y%?h~N{xH`tCJt%xtqNI<#0zUS#;~J()%N!d&@n4b^I1%pT1qYF2%UC6xtcDYE7K#kAm_6<8PCIGujI2!N&Xb3D%GONm*N4y 
zm&_43B!&z(6S+R9r@E>KKAq#NSoucRj=5&!MiJKQ)W(0~ zEXeofPr=GO2bmY+5nn!X)=jGT?vgmRt<_CLmCX$t@Xqb_cu9j#RvV?0YTF7w-?Huc ze4m$fKdgDKeLclFYS8mjZ;&mqEvu6EQI5!@&mJ>)Eu1-6D9A7PP-UZkqHVeQWpF#Y ze<_RHN@Bn6O5yr~)E5Spw98lC%p(cEC`P*4v<4_Y^GnFZi%O|jOngvS`?x%1YU%s& zMTuH}-$b(aLRUcjZCwdJF8tE`>)*c&%Y8H$1hKH5VI~fXiFOiawsng`$5tBGAIIJQ zc;G#R^eJ2V;4p#tKSft)7ZwITz$^@i?DX&nEK zZdK^%uwd+qg{iM?5k6#kM+NczljZQ*76$-Ia=!A~yoxY+ArBOsI=q?S|SO$n>%3Rp4 zSvBQ zT@P?Gyls_JZ{fOO;HMw9?Wh@Am~M%cYbinus8t5*$cWpD2~w1P5O8A}r#zi=8}N6b zl6YSVOx5)X+*DN(N^QS7g?#c-JQ5E-c|+~Q+BV+sq(++hu@C%4jH{vA6mlK@!lW~y zw{P4Os5$U@C6#6v%c!ag<$VWK6?frgtnNtgscrAX))Rj?+dO3HLtC=>ZFVqtF7k^j zX>7!dNJG{{>JzD#+`IH@A_o_(yO!j>nL>)6X}ATVrI@}v!?0T(-Z5~fCXJRjx48iS z!Q?m>SpFw;Um&M+7!0ZIJy=#-YCvQLW_UWhh)i=%0HN+WgmnJ6L8C#;;*5HKn@6rZ z1p3$0(B%Wa4sSgtagazLsj8-5U=jvvmk*zI`!h~k4gp`MX@naWNki;XIC>RhvRqlh zDfAjP*2H-Iip+Ao*C>`ReMeI+mLo@fHh%8AI(hFd_S#i5`E_W|<=b+KNT!N7V{AoQ zpcq^q!tl7^ny&Lc!rbNQ8O>KbjCRq5lx!+7vge$*l;HFUd}RH$?z`2?_uKFckB#)!RjM`LXTMaJ#hmX-J$rBH z2J!Cw#m2hopzu5;!q`*)(>PPepts%e9*O-}7?j zkcfZcWRZdql(msxYhsn6ZneZZ{r-5o<`CMP`csSB|9;F>@20|!D`pZnX83ELJhv^w zS5aC`^=ZZ+xs0_x>9d2L)-PX83ibRWcQG$?>f!czp7IP+NM_T=aat>}pWOH2zRS4!h& zvDINA&b!!(^ZuLKoL+7w{ zwlYoO=-{XYdfe*^uaSl)U&6O+x8I^+=}PObuDJG^ihlABzOd?I!bXB7y90TY9_ty} z*)ykH=ZLyEl$O~<8!b;k)*XDxK6*ZOior~qIf!m~{O8>un!5GNgu8a%M98=wnl1vz z)j|tzSMa|eyC^IF`sy`&UuK2*Sn#>@8yOGNgRjD0XR-8qzUF z%Z1?31r`Rax=n$o$p`&eC<$F1iOm6Wg_K%>t>QCPd-_vmyIG8bk#Yqq0$clM=*m-_ zuRez$MD!BR-42{_WuMb(N2zE?-x~MF-)Eklmr?b{vXWpX4>p3+VSPMF>hlmlz>Yz% zx7zn^Zkb6Wd)($_?adB(ts0Y$%kM?s?iFkzTWqJjZ?(J?o^|R(==Lhr{tpQ9ptEsr zOR0yR>J|$jvNG@zDk|yuGN()IF~b(H{b9>fk!E5~4t)3EH@)>LzuPpF+({aJ{|8?D zqC8^#clRMVZ6yicwvz}8Lxa4&+3eV^g^jRD@Rj?K0=>jx!7r!3$#r}&Q?b;`@SHh~ zulK8m!q_yuR5P6qA2(LjR=uP<1y_7Geor36c>KZEJb;kA-=%p6h*rv$Q5H45RwkPc z7f}?`wi;FW)($yYL{D!By9YtIy=V>Vf8M>86!#w|d)3Zl`?4FS;D@wYzFk3%QO<|o z67|AT#yD1k5cdBPGWCl7-P)fQ+o!zqv+oa+FA4jek33i6z95RxjoIdm3U$MNEgGYm z`gx)Rbq8i&-e$lz%^i|eBzXsbP-B${yF;n(@Mf{Xevm)aG^yo$d1qQPV7hjX5Dv_- 
z5MO^SyK-zBqNSmC_khXX%8D!TUYw}y0sH&3$fCgIYNeZr$V2Y~*qPJ?*)HPY&+o`t z`-*|s%4sU!5g?J zv66@M*i-4T55m$kRS!Q|-R`meB;)*vMm>u$%0DVR`BX_MGyFLtjiiK^ol^XtPf@pX z`R(rUpG#%&QB8&43xDx>G2p?&w{Kp2T!0dhyN>0sp$0@+ffPStvjjRcuWetU-8l_| z;Cv-$gqXcfb8y4KjRu#fB!sKv84SPji`ov&)#tL3Pk%3>$pXJ1NGD^qYI->+*_wa# zb&Coo@f++)>!2eOHvGbDp(DSaRtutdzje=|!p%GO{_?8Bfn?M9!i;E0kD)G?O^|?W z-DmjGz&(QC>pKwzGu+d^Vl`AfdPulmLmSswhPPLY#pZ#rn8heV?1jqvF~@aG{X*MH zEBcL_i@FMYWR?uZL7#EwjR1nju+i5*gvg0&T5&~3zcAzek)9lpEFFj#oNfiY7eW}} z8bF4H!53<9tYDGySm`ZfoPy-O7}q0q@aZ_%)R_9#P#(mlG5VafD(mvX1`O?}ln70c zZvFY;>QE>4-DP|O#Q0xEsZ-V0U8mur2GB=WbSi@@x}s_0-7kaZ!s~EppqmqPv6)=q z0D-n9ZgSz|R8iD}MMqbTzV6BB=<92ZfYi3(cBm28++~(9BJHu3G15;%SfC+RR-xHu z-La_|luO|mJ=1;odMX%gbDrCkU_+OGx^pIfad0s-j`otbpJ5RVSE*6{Gn?ceKRvj^ zIkN^hGlqIF<(vj&fO2#9XlhD7P|CA zhkh5rgA8H9kH?I`$A5KUh|3f5%>b#)jEMaw?Z&5caQ0DWbWFv+&IXSE?FFFD$KhLH z6TbZYd~r7G%DQP;f%fy^M8JkpC$C9$(ZK0|@C^!J%@1A7 z22}K9Bjn!k*+$`e>ciNwPt|bK1r{yb;X-rCoZl#c>RS_#TqyK&Mk*$n*rF?t)m?y~&22p=CsmHXXFFy$w8KdXkhi)_R!5XNUGZw9h#! 
zl7mg4?b5BF1HC}#ZLdFE#G0#JYssoPKLhc!C+_XOJ_ScmT18kI1ASLIDJm78PTWt2 zD^ROGeKh*6WR1G=To#Ig5+4RmN}Si=b2u~6nTY?AsQ}6{M4;39RFqqes9E3 zon~RBPq<88EVJE&Zn13qPo2IKuQC!no0A+lAc*7QIWw1G``x4!ph}SyWgIP^SoQwl zvlZL@R!(@O->xF=h#ojL|MvR)`1E~s3pQOCK`(~+wj`VYZaJI0sfY=a>z1vwZDR;N z>t|CFqau{n8oO||W4myFh{qH7{2*1w9xVIBbrlN#UvF zC(qN!6RX1}ur6V9OvtkbzZD{un^C0eqU4H#5@1GcNpq|tMJ9GfYZpXY4(mAKB=~g5GnA653Z9wlvMN zMT$cq4to|VS+kg-w__OoDPhqv{!nk|B($JY71|vn{G@s8B|Q3(;?FNXoNOf&U~MZ0 znWRxkR2(TP)>ORs!u-P-2EMan+n@U#WAm3ekdB*&Klq~NPUfq5K0mBhx|6Y|Rh1Mv z5(hYWv>Ng4%8qh7x{Lh58mN$a!aSE{p_|}&KP2#;z%9Y*?6Kc60gR|$6A_kntZ(i$ z2!ar6l+9JFHf!1szq&pydPpCU_>TM2@tqxs^NNevwWoR)%CNPI$&Q@^d=wYhre~+w z^cPKU(ekNi7GYO#KLg~k`iMxJge*;1gWY~~X@Uhft+WTr zpl14NqU?+1J!)tQ^ly|hs5^)9LR^A9Lf7+Ym!#I0phBo-o(E6aOl8ce zobs%8@Sxpvv_qne2Xh*l*9wfbOoAM;udmJGXCYPd7RVX^*Gu5&EItLFk(N0|HRk5l z6-=)WF*y7Qw~~wB0J4Ql#bDraTivr9qx}ROSEeS#uOa4JDp`_OM-a?l&n9?g=)vYt zIvQ-Zjyxghyzq(f-E>5dd|SyI(LJ?zEcU)UO13gW#3pMF4UeG3 zX@xiu>49bYc9}oLJVtB+ya!>D4w_BnPaLwb&m=Kh8|0D=2O3v@P4a^93xijsn1lEY z2^x!}*2G4s-=d+iJ?;3Py9CaRouWX^ga@t%Vuj$@+T|B!o3 zpsubg!NG?;64^Y$%2P=sef8{UhsqOCK2XF_<^u%a`MDF|CV=mY{ANK+CjqCMMfd}c zsc08dF;h!T^-+M9O!D!09+aXJUYR&8!EnVFyOqw&HLL>pBz*&=u|*4a$kiX5Ykg*3 z^%*9!mV{8d{ZGxhQgct1W*W~+h#q%9Ci0W&`YSIpGf9m%nDQym@lH6;|J;w_bE?i+MN5^v6?16acyQ8AtyxW^fP;F)6jzEtL zK`qkGVVYCC;QpE#`SLN@ZA7=bRW)2e_}?wy7XQu^8C~Ex+CJ&+;!k>F1kl3 z3I8wO0hWB_S&-H$fK_>CyVnQX4^wml2!y@x@1 zbq+FY1JKraMVNUHDdAR?eCUsII7G1`=rpzA6+FmR2Yw}id+5#9(|D+V(T5_KKu1QJ z>J+z7t68pEXuvxzYCFE+%>jt9!AUs~-g(4yCn=qcEWZ`TEFWQ49X|b>?aHoZnX$B!OcBS z=EUtU57I)ueNy9n@`!>E=xUm{W6-JOrT)>J8Tu9QmjCM*c}&c`zz+@B%qE!+T%6Q- zlhoOh>2}JLZq|S0NurE?oy+O4--6A9^jrg+=4oH06_YrwpF9X|7XcKh?tD)#wW0D) zr{_UmLm!=;WY-cnY2&{);nk|4T!AAqQDr`Z&3y+8GcJQ5hB|%3P6?g}XmGv2yhZ?~ zbxe1`xeYk3h34x2a2vq+k`+X5^O4b`Oh|M;+7RMo-urg~I|H59$w5X%yx*q?L3m}> zX^Hx1Hm+&z#^sr>9f?_(T(aMlaN6)NaQaPZ<=?0#d;?s+^aFCOvjRIy+4z9@Rv`W- zve026KkC|9)jc{!&BiQQ;KL7lr&r-m_B#PqEsB>^uokO82kFz0$xlDJ`vVim>q71l zynH40m-yg40Q?`g(frkeu(-0P`Xz5Y6w+8E_J!6BMDuVB0yiDbbpw5VKbqn*eaY4j 
z@YvwcqKQE-=0snoYHAVJE*LUg>ER?S_|?gOXGJ!I65n&|knNC!Cv+)vF?I0c6XVFu zV_pB^mVLvZnH;(8g+zYT>3)bMZc$ELgPv<0u1dgR(TFpC%KRQ+`O`|GR`N_fpal5R z!STxOqE_-O2+qOB&O%sh34gY2EtbN>LU8{{^4jC0d*uEv+7n)qkv4(ITm!UsCcyX^ zndiGc-N+HrCg51+*H@vr|R`587b>m^fv3_t=f247AFn; zzykFg$^RB6MyKGB^mW(- z_l4_^n!S!}+Ed243z#v*qu|s&*!xUZX5IHcBW~TGKk;#qUu>qQ&^x(1?v$}>dOvBW zQN6mMXCse`t*<;Gv%nE&{YtF6h+pm#V%4x_MkxoeC9XVsb*d}VvC zY0Y={&+>=+jdi>i`^O9NXN@C^>%uvT59;)Nv`j>Nq@qKw)Zh zRLfL30B~Ec+w~(JgOJ){J3NwM!>_pNe>IVz%O%Gs+TbSd$Wh^Q`1#E1NHcg?M<*(_ANG#h+-Pm z!yCEJ9OWkNr2MG7H3yjzFY-BMDQY?`8p7tdW8i|R=%WiWh1BGDrOJcl{2A?OztJNO zio!!aVkVe%7Yn=>B!;1OdAIJ}tOPS?HE~5hD$sx7%S=_!nV1};WENni|K;I0-23f= z{4qAo&(<@lj?H$-XkOGTyb>d1^FY^5Gb31a&UxjiLHRQVP68}xP?jonH#yU3TFbw< z=ZZdzzM@5*Te#REAgfce_%sVwbhVrSBByII8aBv=&WafXP2Enh7XD1DY*3GTlak_E z_91T~##ahDsa~o?FQysJM=0`LU-)MOy!ORtK0hH;_%3G*IiSyJ*$-+~gDX=IebH4f zl%$>8a@e!-AAF|hIX+U{{}5?eyev8c&>grY1O%;Rpc}QoVF@C^u@16tQdcY40>A3c#nY2&{S8zE(pC~@^l;@_8nH*K)^#LF z(w$MbxU#f~!$b0DQ0?|$1fAr=r(e1?cmn4*ot|~cr-W=_e+6C50?H-v~2YsCcKMr2)XLJ zTAhzY>Hv;yMm93iEZ&k=XbdxvfS!A2P>55yQF6uBn_>(Np>B2{KVD4iT)$qJ>&47C&(7djmD|)2s0)ApJpJ)U75SeJid+Ej6{qehY$9M-^ zSzPwKxPL~OPqYG;_qmq-0XI+GvlnM(b38avY|S9`yKJ0Nne`XTJi_bo;Hpn@gq8TE z-+60)I!O8-RsK45&1@2pW%;_$vehjFTi)(!K1UlucEgqkxs+FRfMPR#xvyj||55c_9_h5GFd4I`P}O-b z)4{`a+Mg(8sCJKz(YQoy{rQditv8$Zu6(RnrerEb{!<^$7(;RH*3SX0FRAN^_C7-q zXPlUb?HGOvOT;~~RY6`cf%*lVo~RTG_W@7@eR$8@d#l2R!^dPg)9s7BXP}YS2;ai1 z)9VPE46&i7B}JFAUHFX+At}}??Jf}+0|v*cMb?~C8aqg?jp}DN6^+9W z$lctm1u8Eh7hvORmZ7i?OSq!Noeg<@M*Bi#!sM}zwhO08CooBG>-hogFQ@(zE1&zN z99#GpMt4n|^_r{*aY8WTrUY@bPm;?<+ccCOcJ}zbg@A9TiP#wrUStUmEw6o{tD&Ou z#$12D4h{t|I?u-c)|Du0IV~s3-2;O7e6Hub_U?HL{Cp392f?SXO^Z}p*;+wH(NZ5+!GWg+X0Kkn=xDsi~pahT7xc1 zFn$Mvnu1ip9)|uOE@~Hu8IV4xghwF{4F6}r-q9NuF%uS2o+<09YCJr@Qx7we;#WTPGYrJ&mDp~;f*V||{tX!UE-KM3p8*?Mw~%_;4s@nD!$kU7 zvP}g&pyQ5Rd{an8CD#5dErXueJcojFhGJ;Wn~5#!aBK0?RXD)C>xtbTNwN4NzYRXZ z)#lUqqZM(kI)T5tni*s8xszTor2-64VKqm zP84^{eIwz>zNIv=O9fN5Q5U?Ox3w&X>ypfosTgGRc@uZvN(*a%1|W)8Z%7;uod{#a 
z91*$KY-^{0lMHs6uTNA>Izumqa`5-a<5w4|rq`X|c;l0sPXkcJg-uHV?23tXT$Rl= zgQ$NEnLC-}vlHfOpFHS94{x{zI)eG}w=KXCXZ`Fk@^ASmA8Iiy1z2ey|FPhh?Dqts z)#)|p;l9x&4LE6Bf0D$YE7Pu|Anl*w($L9XY|r|C@Zo9ge%~jTZ{$B02bfsKUI{#w zx7cUuEej~*e=+m(kNTwTj&WTNNjC;Q1pg(7uq}zT8To?vJowxg1E;>;+?%#2D6f`* z?(^WRky(6Vbpy!c>MytUr%9@ds6kL+6b3HtWh8!k_+=FNW^y4wEbhUu5POh%{5O5S zRQsEf`Rdux?*?6weXbaXv1qZSyvcvwzj{yYs>A@v$UW6M!Hqb;K{){MTYjYYT1Yb1 z&Z8de*P0~$fJaP2!xHhbC78%4s;*>?DCmxu@!U`pTja&4ep0Chxu*x_=Y^ zqP9Vv;jhu0F?-78P6Id78Ca)aP|%WybmTiMDOfp<|{O7>bcXBVb5;2(Av8Icj`@ zT92?Yx)`IE%)csjp?-f9iqB1(gmDR+Mt+kv@sWk1U8&t$HitJl{52I%9ONVnr8b60 z_bq@BCxd|}BQd}?yRT6LdNp#Q_}ip8k|}R{!etM?4$3D6tuoWCXPrTDsG;~h<-CbX zPk&MDJMv(0d5@;cu4JZKUUzN>u9MM-Q3-#)rh@@@+{`-5yw2a(gyj%j!Dsv*U(PV( zl9;A(YkqF`^iA(^AD4)+nlq;o!*w^!d(1lbK=9Mnj{Fc0>8Af5L0F}c?~Ik=&P0Ke z#)O?AIDP9`38Yt;*Ky$@)_7#c7adM3z_SukM83)%w5^Q|fY7NqYBh zz?bO(1AP{skKu3r>N)?-trI@cG#>Ze@F3L85qDKaEOzp#?C!Xy*2Awl4dx$&!gWQL zPA}wG4B2trOFamsN&%hN=FmS+u3|zc>1-k%JPvhgBk6l8FqiICIzz!H`rG%~a;h$9 zhXIeaVb9!QK3Wudo%LTK3PEVlL)K7TGF}-mQzY1 zhYujiH5-Y3fjeMHJ%imkeC4O`cc;bjciCt@WGMK5D`~+!dG_|N`_*&b5C7`&lXm+^ z_OG0DwE)PM)L)Y_WHqXuyxi(^(^DnugHk@i9L*_}2(LtxQ6u)wE(B z!S`J|JBGQg{ddL^w63S+Xc!;n@xrug7GEtrQ>tpnl@g~IKY7&0CF~1^8bgPv2FUNC zpE+!9Mg0SbzId&noj$WM3(1}@7WFk|*m_kB@*C&czTc$G^9@ZYTKuRkMQkNe;C95j z3{(EgHHHUqdCz{P@=;>HnVpCAt=v3RjCEg91b>~8VeaNpt1g5NlHiXv7V*ch(6^%< zb+k4TehME3bHN*a9~aC%ALj54xC5y*PlnmFN5;B#2|>3{$9qb{mc~wSzr|T*{Z7g} z5JP@@{)iLbkCm9GMy}3HtEqoonu)por7epxsJOKVG$`7R7QXr=TA|TBh06v)(Cv>iNW?m>e;#Dyv2R#AG?14 z(o&-Js|}m&*w0>!_Q5kM9q-I><%mk!&B1*pROC-kxc&t{E9fWzZ7E7Vg|j)|poxTR zQ(WA(aAD;B1lhX@k?6mfG1mhm?1kV2vd$u-wxu?v=XB;8cLM%bP2&O@@21S@5Yi`k zCReOO#6M+0Ulby(bsWKd?{+6!qLd4gt111O?E$i2Q@whsZCb;-+6rUsYI~#X{7b(ND zE;tlg@^YyZ{Z2RhlP&JhWF&ej?Bt$88eO=t>bKM0W848(g@dAN5`^b%byANGV!>6} zASXD3UNF$e|12NZM~bE_Y(W`&6=goaR-<}J8(Hg4cyZr2Tk-%W_;r{+&CyDsc&U-^ zG!|Xzx1tWJ1SHx8aGf6u!z%r@=WuKOM>n4&MBD6MMp?K=2j7oi!3BfK#ZR5D89YgZ zp#7aU2>c@Xj)fiZ>7#~X+>BJrJ1O-IRjRi=5>Bc-iC=jtCcURBU>UDxX9-tn3t-A5 
zbKHxt@rV}?QI!glVTeJk2U@3c{wY`^Ogyf+l8(5v5H?#q81w9YOao!Bz_YOYc4JF^ z5~(0=my}sZ9pRL`D(AZ?ZG1x~Tek73RLg6LtzpfRHoh*BBN2@w@i!Dp+s)+7j*GXA zhLa_lxNh3kM5d`fGkqkWA4eJ<7zad}coQL`l+kwno>l`O$NwHZfci$BQO(oHh_p&^ zr0>zurYm|$^zKWvT`R0D%LnP}!RE?T-+PP|L~QnG_5|t>oxAdJB{Kak;GkDq@2o3W zPO^Zm=&P*D8py1uIr!oaQ!?5ZWqjN$2W#<|QQ6XtJ(dKogRVI~Yx(Wg+VNfxnnXwl zk*_H-m@<2f@%7#BmHud8wS0pxavsS~MITf{0|QcC+xvi5$p3aey7P9uxcsNxMZ(Ix zA)C17{HqP?gPQ@OtdG}S!aukV$86utbAN^1mCI0suS+CcWsn!s1~>aP(~EtJ5x74+ zFm=_jh8p=|uzHHWQ(*m7?9ji7_>I33@cLn)=ezVVYwFth^PG^Z-{6?1Py4#@hm= z>$-gzmq1c`g}?EpW+UEv8$0tZ2HLVT`{b6&)lUZ10%uS=urWE~57#>=^VK+{yRiqc zYvL}`gt-tFo`2+a0l=aUGIU|Sc;gB9665Cu)c#xs)nj3bDG60?BT>~X1+{_8_RWNt z^9;y5DN^%!TvF}>5_wHU^vY0uQO0WyiEibod-8L$IfTXfXoKd3FdVT0daJ^DTDU-~ zkkM7Ps2iH~-_(wQ28CDn-o|x(SK>^OR*75TyBql3xAp96M;-mEv5gNuF6rXP793dd z5P|<|n=U1?*KDS92(f)oa!(|3D0J@0Az-t8&8AicsO-GCclZ)2xWZHT*%CUnW3!@f$gA5f;hXAO5^>MWN&RUzqAq|8kAt6XE*Iau(LrG~trJ(1LxW^cx)B#g zP^x@mGG3rQZxh8l?Ff3oVIpigTTBl((gkKyC)rv(=J7XQf7*WAq2J@k@?bJt+~Vb5 z%FnhIFZWo@Ry?;Jz1#}s)tJiB_*X14s{L^0unzakgju%ULI0)9sM1**8P!%HGEg0L zO7#~4%vF9SeV%hYvj2CPV)s87@`J>Wae_FOWLlOdbQ8AslW?sz>(5Mp5IU4gw}C5# zVR#1+_ucGaTWjX$2-wUBuO(0DR!h6#PVMJ?PLe6^VU9vNSZCHT2H%Zp)vCli$#*mn zWBDeLC%%3pfF_EaZNR_DB*s_st_T#k{b`P!J?J`wS$OPV#2+;gAB9a&-;|$-n;@3A z;9UI|jL)#%)MokpC}qkIk{i91cEC%dHQd5uq1&&G{``cTDXAr-e0@M_kKss&6q>v6%!XY z%R}9`D=cx}SzPC8jh_%(@d=I#dFK)TyjQk8xUJ@fV?Z>1<#*g=$TFUt-{Jv1ksH|$ zbyl4tAzuF>j8J3t#qLqVeX+whGBT2mn-9CAB&Gd%`>*#5-!l&(1TAe12PZwOQ^n8q zBDX0WP0(;f^MhJFpKxWOtK0<~d-+WA8k7&qxtp-@u1tPk24?ZogBfVL>s@2N<+Upq zZ6P&aPm@YH+3(?l*(H1%nS}&mghp|)Z!yA;HY-Z&K z$^$XTrysmkPZG(uA!lM1aT>Wl%!apUR+z#+^EUZ@rwN5b|HFR#XyZ5|ah_$XB+k+4 z`)uRm!^0oYP|seA56`#5Q3sK$tTccfppWXSAz;mH%!cQARa$0Yq@7B!w~UDrQ?*Q; zkE3bJ%>H)F z&H59?I#^NTE>J(_w-sS#CVl>>l=S43wmhrsR>9Dp_)MiS(*l)H>k6lq1ccIA$MGZh zR(3iuZ&aenG17$3U!E$^hJ?91@G9KU;afB%^8Y1rUr{??YV1o7XN=I^>cD>z(FU6UxJ*M_}ZhCIu zys>r;o5FBTzPY%fb#QM*PG^@v#~J(RX>P+;3nP5hI2_QPkh~ zb#tQz+?%PGqP;wxiZ3A^2IK82pQy7wA)eK91%K$(LGip2u}*72)0pgM;_KIqQ5Bc| 
zx`?#n`MZ8qSgID7*M#x4N~)=m~I15)-K$^JyV$hX_vv>U%6w`-KTko2Uj*P zcHsM#qY~m2$J6VMbXDxujf#NN8n*@mBOf>U#OpGIwyz%64yJO}4PhS|Z8Vc9u|@Sb zAllO5sFj6biL2AGz${ur^YrTKjXPVP80|!rDva4OstxAz;GCWx6|P(Dmo;K)om_Ik zvKxjApIuL-NW?MUH`ZBBzji%#IyO;$c07LpvI7f9JeOUMjvXomFn?bzL;mdhl02rv z;{EYrF2)Yr1i{S0ujj{3Dtc6-n`_ow_M}7(S>Su>ISAVL^AG)RTO@y&4Ix`gwS&*-! zAI6e>9ss;C`}~hj|GMYe0nwMzCdZIKpK@LlH$V}ZNIx@Cn9ZHvXM9_?J$wCB?M+%u z>qr=l-ps(0_}GOoZ`C@Kx9D|%CqE94DK3^_AD$_&QFKV5=qhZH7hQ*McNMwb*u@p_ z#=oOzPtmf>)!x&410YjmkV^Q2`TwZ;2JbooZr_~b#71M=wrw^^W1EeQ6Wg}g*lujw zwi>fZU%t2Az3<*Nf5NQUv-gjgAJBeXm&15JZ70$$uLhttQf(nzEVW%!b6sheT%FO!w^~ixT>pukd=2ExTNh9i+hqAR}O+h zT`N6|@=G`QaXJrFh^770m1_fL0IqBax}z@guZC-eWLhR6FX9~n8sFRNSa_1 z=a-`!OFzXK-oAYUq!Xjar;2q1+kOqf|NrQuL3^QiU0DCo-}q;YJct$?mT>^LX@jxG z-UABHk4~fYEYR1u=x{kX7pOVt<|dwqP1T&OOQa@eRafHy?~0d0qJzg2jm4h)$#-3E z(7BUK9}1Gn_lWh&@nzlV93)TH2)kF(>dFU!p`?P2rcws@TC_fnYobF^RUQ zLF-G`75E-F3iImoL5xX5LF#WS+uWz(SAxW z5r|&K0{B$R-6u|C1=H+PYFMH<|0A0Pu>Q#4a$|`oFY7!DSWpt`zG_-fyc%NiyWKjx zJbVyBb5=1eT!@Y1aW{&8F!_f6&dfrvlA~b(NH20Rwl9jiH-#}9=7G&f)?yKZ)zzG{himuEdR0 zV7wuFr_wQ#5@H>#Lwo)h6o+^aFv|vQ65J3jy=qSx z4l-asDxK>{_FEEnT^~8gR~|+3Zc??a>UgBUwnZ; zf=jxEUktq>!JIRmG6(Ed1Y3 z?~kJZ!r<-7%uWrr7Ee%=lb}BSYFpbV5rY&sGOZiLGOMs|N87V;G2@*xof2exw~khz zbwM10%M$oZCQz8`_qkxMrjQ3BASjUwBEb=-J0Yed8%|sjMs^g_?Zb1AMgpi8?CNA0 zCX5J|KvuAOOs#B#TOF-gzUJpF{!#T^C$+0BK%MB6M@pH>{X2;q0HwH~pS_o!4tY!n zsgfUnvvK4V{Zc3a99Dpzz57<;F3&Qf*Ke`kekgtS* z&9vk(=6?x91Mgy!=|4XfO%Zu+dL>LZl=?mYr78Mg64>%ZZYx^g3Tl3H!rZFu8-&5m z33!Etdt)_Gu5eS5%nKzeX3ib;JSi@iB#A)xrx-@*r0RQ-jAtFtc`7M7Mms$qL7pOE z|9xtFLjFA?4vbi%9|MDQ)K=%7afLr+4ghiFj|@{X1$8GR|Juyxo$PG>H?0?rzzU z)G$yg@xVoh+_c2`Si^Q!{$Xhxb-a15TzT{*8Ntk5#lOG6)}P%mb~z4KVD1HpmOE*`?|k zRDPU{{jadkN3*M1t8$3QBqbFKE@eFa`EN<%2aT}qOhl{{N3pY0H)s@{!=K|fc)`Np zGq@ciz6U(lSIfaUz-`_|7G#|^<@9qIlaF&D!XVNw#8-!zUZFE>1~>3BQK}3>=kQVw*o*lL z#T}$0mER3Fqyvz1b`#7GFU$I4@)A(S{xGYzqEZAC&&MJ3)PGb4G9K@hN4Q$*o{Lpw{2r6jM()< zEN!KuY-$v>#tuggo5esx&^XO-&|)BYK$gHtdvO!T)kk>|?PJ`lM3YETP@g8r9i^)n zHFD$ 
z6rq9jLrWxk$^C{cB?vUk-ZU_zlHeHh*hok5TrXZLXq+N7L<8!L5TgyuWn${w(9UVr)!(+YqC~d zufV?GN140FrovbCxSLlxbqD2yVW~h-Fi<@Q5tC=}1Z_o4O8UcY^OEsPtP9E&`}2`9 z#?VZ3sFuCsJISM0$`NIHe#>x;+#VAJj;-7wG_xmJUpnTw(&XX{fv0(g-XVaI>b;UX zWOSV$63l=D<)zMLXj}m&2RSnll?%JjX2z3{3(TMNQ2$5)8vB5c65H0_Ec2-3%ipx9 z(R16#bC${^qA=TYv;W(<@E`v|+X>;R$f@^r=k%I@!?Qp>yggHgOTqMz_tpF!?tWVh1#20HH z*3i2dFjD&I9u)I{zGFYLOu_b)rnxulqB8+Tz^6*-nujXej{aRV;2Y0>*!#mFF!KdLQXMWbrP z`E`^zX_`miIEGUzXe&^T!Tg=Y4^C6Nv)fK8-%jW$5ypb&Yw@j}xxXR&><#^#9Dx<| z-QO9n-Wjd(oFfnwIgjmRP!Mcr-g90L7xL4w8i_rWo)e)D_hBD9SM}%#tWjFy)=mFHF|Yr!SW-B_ zW)cnWGibHN`w{BLoK}BA)na^UQS=L0ycWkkU}4awI1ZsJxkkZ>}6xO%B}`Xc(y=i#ZwgB zF!!u|pKmamSP%OmiCSD}gkrZ;|9dYGe(n``6h;=^QJrz^xeoB&4QPvA32D`ELF;|= z(hncGGn1#*IC&$FxQkfe_y?v6m65Q2Y2y#T$6v;2FPZKD{@UfNatH{U;d{-uprH)L z@(}gFVgvdBPf%?zbF@`6iRuzkEg~2jSJR=BKD8!2Nz6%9YKACmQnZ6Z-~i=(4kbRL z%TZW#`VRHV za^!FI=(ry%MNi+@%Lpd_7|M?ZEq2nRp#HGcMxzs|2ys`#(h?48roE6Ts@OwSP{w!(yGY04zAyOqL&Fq5{_e2LyMoWhkY5pnnOr4^s{lke!_t$H=LAak^07D4 z$C)alDMCo9TMa z$j4iF^1hH*dX%3%nrd#A(lIESlf5@mYJuW`gvhLoTXya^1VcvYxdYmg$SYKkeOwc( zP+_YzgkUC3CO&C0w`nd6GApz=#7f6ER8n2iWZwD$etUl3knL|t<{c!VMa<`n`;Fyf z;ImHwo55Tvi?h5AMOZcuRKaMR5c6LZi^~|iAN{{@jnG_*d^UB_#s4kCPJb)GaNvV2 z%_ECh9Xkv9XFKLn5RoJ3Cb9nmW&Tk%wcHWohQJ)T^uUxSF{#!_x38W>n{~IMYvzT) z2g^@7L#B};JNGWR%of(%4ex)@B+dUh{(o>Oab5Qu0~kl!+iEGmkF@at_`eOQ2*LD9 zcFk^UH`75L;%ap-?#$$AD#jslQq}SJmS4gwYH+G>2+Gc`oLT&RdWwAN3WV^uM2eHX zCdsfK%^Yf_dT_$H7GXZhoGNeHyD2NjrxX8tKWVVmVy&p=B0 zr%0K|2@;}=h4nG8AA_b|9ejl4exB|gyjP+eP2?((Om5W#YhcOt20^w;BU^ zVh^d?SkBUFBDe1`52X8tDWTne#xU%p$_R*yWXw2uc;y=L*Lx?nM`dyU9h#OI&Ttdu zMGZ{wxGFZy6{jsi&N@SPWMjHRs*-)J^F|YtXM_G;Sd%=afKm>iDypM6hlmUI_)PTA z7bbCe8w^#rJwck;lhm&7g|UrIJM32LsH z1-nbLw^zRsUU?4A;B{m);jrPu4E60{Ko1T@+HG-b?FQ21efpH6x^EKV4EYTF1BqUo zN7o#3O2#uIm8oLf7r~!JOyA?Zby|#XBTAysULa z-1so2yQ@S6<6Yb9WRv%keA*cm;I%C!JIrjcw8s9TcM)>`f99b7ScEoP1!jY0pRb;- znzPIZ4ZDX&qUhIBZ8GM_YIHYOjO{1g7#B?nhL?PFXO*V!XAZ>;#o)7G4(n)V!yX}m z$uZG!9>POFF6q%AM7~_Drn+%7C3_1zNb4>abSQq&qRGO6hNc_gT6g8fHuP_C?G1!R 
z1=8bX941mtBvxs6`h|7jx1y2`@<%2z4N8#mc~9xT3{mq?ji9WF8`RDdU7M?z`?UYO zF*m@ktH|)cjk-1jX!!%8QwbfXt-7k8vxwl!Z%+VEHOzl~y+CK@f z7iBce(nQq4slEubcZLX!FnwxZMOGti?ZAfDqWw3bHtg(K7(z5P5YX?e59vcL7kq`p zF?~In-MJ@{ILO>GJ69Q$JVyhE2c9Skb4-*HI;!FT%6pPAz@`H}h7P&*y~r;z<`htp zGXY&@-=pIA!QKMeNLX-!r!~^07px;dYI9sUC=OHv%{9) zvGi#n$UOVOYZL@~s>U_p#5By*?WQpBqzFiO9er6to)h3u($FrDzfFayp-hoGk{Fjr z=ela}xF$md zD24u6N@nG)X5v#7s$1BstK%v6LhQ1F!KF8Pr!W-|fQk(@Ac@}&4L<0I`vTssg4ACC9!yhrlLYX6u# zrqCy?6wiyUhaTJoWgl!PBQpUo80}00q-s~@<`$&EP2UExbCPz~wnnp$bE06>DmCDT zqjx7Z%@8TDWGTv)&$00jz?bdA1|I!`|+g&f{Uz+Jkgn9 z!qvErr5kv$YfLU-zol*^1u7jQ3TZ69n8J`ir6DlW@He=-HGQI50WX|-Sdh^gAq!l| zfC4P0u|L5@5GZ2-=omEsS^CCXuf%w zXUPDTQGZmLj6Am=aBD_(lVs!IyRGS=Jl;e%dGvjPhJA6`Eg61|hOMn)j-gnMC+~K? z3HIli7zb(b8X-qa#(`Ax6(4mlLY|MH9$uvUn-A~-J!%?;Zg~;`{daVLh zK{o!%8j9-tQ*o#3_?9^ubmod_#$-V(PVQBxe3>5&(2R#-reI;?=pY$2xITlY3JvXgY^>Ho?Ch{5!)^Ux^9vY00k=))5=>cdNo_;F#l2b0YSlTr2R)$vYPzB{JJ z@cF0busSGc`^0n5HvvD0@y&G73^mrceBb0a_g3mmVSF}lqTkpSn>9M5mk)+QgyrNk z+oT~-`_vqtH~=TmNo#+_9_-!&)+3q%ATfg7!k<-p8`8}=DvzS2UFpJOJad|i`N21O zY-^d)i(gBFS9b(IttC&m0or2|{Ni8D_z}^Vxnb_w2m4q<(N7WQh3TU2*`|A~Ymgu~ zlmDz?2R%ffyGd34K?8egrQL4{6@FSmvOEX39IQa=M_A~KEg_EltE{EENHS`4cBX&R zyK5eope&63sxOQ{4Po_deO-p?j>A6{A(M?a>ISEYBMLSeL2I;DMQ)a>xMQnY1BAUW zE79Obv)iohoPpe%Je7#GvqfFVRAQIuZp+~-cZejc!v%dBf0b)5F>C_)uLi-CC3kKn z4^}M7?#3sO#>!G9*f$ZvPZgW=cMn8-o47WL#sO%>C<7*c zUfZs;ZP&Fu+#CABG;<-`lbAYIj|hy4q~V*)h_J!*g8ui+g!&KIv^+cB9oskzq3>fE zva4X<5q`qkQ!{XosZW6kFWbYZ(Szw43IGmM@E7Y&o(u`m5i+0#_Wq(n56&jWBx2t` z$XAO>QM1B^FB9)XumKuVLqTD?#pe@Y2ZxXB!NQD)PU9Ei1d4z?7zspPS=}ltEg-{L zR~#PcFyapQRbn^yg^zl>BXz+5%#9LO9GR2;=9C`N{K#|No&3^~r6U_azyv!s*}%nc z!fphl6S`H-`DWGDe zxW!1D*P+CUk4jy#Mr(5z>}5p+Nd&sWO;`$+&{?fM?+*j==**Hu5t+q?blwPw*Dt_H zMW2;3ZVaOcYiz@tRKHKqmgo9P(X>lXPeAVBEs%TM@k*?gy6@Zu?bMNKw&~Lh;JHxY z5N=AGDR4=rPI%m}(U!F`By8h^q{wU0T^C`4$Gpvfh-jV}jL@NO@Sz2Tq5%;n(a@b7 zpVE~|@{rGJm4qWN;?mMv?_`R26mOYFj$*SLdy6qzM-6h7$agQV~lJd zNf4EW=PzA@=Xx1WrB&uIFy$EdPu)s-G4{zSKFdOmTi5q-$*wOD(rzg7n@(W+*6@Ex 
zPp<>C&b~%zZpzerOh>I3Xu_TGAFN9q|4rN>n28=~blQD~@^c^q)AR?U!k%@RIneq$ zTGbaoS+q4l|3x0hhXfeTs- zu(f}2yA;A}y%`c!%s#>hJc4S#IWE4Hh$8Z(cPXbRSAEr3k0M-<%lnkpr-6*oy9%nt z^>4uXQF7S$_q%X*;g_lHFVk6}F*?+#TUKdc>0O+whLWv-!axTmYu6gRz}bL&#$(5P zC~)88Q}osA-UdLyP#;D0dAt?QIO-zF66p*ifvpyfa6Y}opdIQtP0}+;T`fUGOt7mMi3y}ozlMT2MO$8cj9MiPFUnTnxw8{nFD(EbG1%tHl&j`il`Sltp3VLcdGuX!sxmh;UIz4G!nRg$=6O$d)8he(Iy-=oa$&Zq#PP462lj@7}0;>S-iKm+!{Gh|3qQ<&BX>hZ3Oy(NEuyd zG8KU{T0A1;3*d7^cQNSg5L*>&)<01jKOHvgTAQ0~R$8uFHfr>|ubLaTx9tu88*=Kk z{SV+Yvz&V7wnZg?l6IlaQM3Pe8%WLg1dZT8!4&QSFcA4+IGoOOLqUeAM}y2C+`c{Q7REyA3@?T z?&F>3ZR*$Ia(!h-z_R5}cc9m4poYCr9=O+@%|f*Lipi{WW;ZYF9F7J z%<3jEb6z8ujX+fB+n9CKqrqm*mnH1krFP z*uLbfzj#*zmWC;1HhY6NIn*UF`aL!HT4;F*9`I>Sc2ZDjPu~=lhZta1h_v7 zwt8XgTU33-LowAajC+wo#6)b<_?t>ISOjr6aTr8Jn=LvGt~|`BHcob(C3pZs@H5LF%SjE&04O< z4BsD*9kiW$!42PytBd~^O|;s<|8>dhXm~%be`0xqqMQZk?W7qm0<%Y^yEjQ@3T0rJ zOuC7z+t>zR9r2cL;?fhfF36arZ)!~&P{bDDxvI5 zm7o!{;;EHu>!z4Mn)(}fK4TSd3!oOc2t5D4mzcv_Fo8+*U5X%r89<`^`G_&(o_0|i zEsjErJD(_SAFc{Ds5mj41e)z^i%Xe2*fq9c8Bs(Zbd*FSgwD5{i{dcO!fGMBhSOSYDhUc zCrLUSghr=05pOpEC8i&RP}U3@ z{2curlLOH^X(SR9B2+a4C6U2v>^~`&X!I^_AW1ek28Epk+B5yWARUG zjv}-d51ZS&V}(EdGxhJO#P}@ci;!{T^#<{;KQCtt&xWm=%(;aap`uY;U+v2x0jyUF zzpHl&VJx4>_Ec#aPNlB3Fcw+1xa_1J5GJIM-N55=u|grFkR5uc4t9KJJYnxui&&70 z)2=&e?lr{95HzTUDc3+*jFOu-{n8P{h{yBTo9az+$ALO1meTs3+wTXrnBYvimlPEF)O88Xg^>5FS0rMaE%6%L*;&X13xbCFYwFHaPnobiPU z10<6nGXWsi1NHGNCSx76gze*ZI(w1-Zhld4Pz`K!_B>H#s5nC>INWqoGwKF z_E?JB?UGw}uMHMu1R&o!vqd-fM=xo~Q05;>v;UyZgWx&lyp^AVf1eFdTO*@A5%8-Q z;a^1{{Nh|;a;&08z8%FAT^9N^JEOtEe0Tbgz@*3+y_Pcpu}Br$iY2J5a+hH{CI`=I z>ylJ@e0Q+M&QFms5=8t&MV1T?<3{UYZbt$P;f+^Z8saBsj$D#0@+eV$2eyhwqkO$( zx4X)$iAlZWrP$;H4oTV#q^?2@K(XSgcSjmO@yYvo5R!0cD}2na;t@hrbLxJ)4#8Wy*G?93X()5YbScG3_LetPh?zJSY0_xZAoYuTV72!2eK zo&fLagVukH9Fpp862=Lv?ia;s(mKT{Ug8SLtymT23y`>dgLz2U7YqVnphP|bI?Mk5l6f6aJq95F)6LQBR(s*!4Si0! 
z*{RvAeCwUPc9vCov__>@>s&w69|fdKyC3K*fP~1KaQezxVY*2&igZ^^kAqAP_7K68 zqsDx`nU0~F4>5iw_HRZry+!FGoW7?NFP9imlT6ku9>gV&9$?hMmP0%*Mmvso9cT9t_KY5pcrp;`D>@v6gPJPI8P_+4 z>MyZcUi=N&;vCZ=&lVcm1H7PXQEXTj!FS3IougHSt3<|^EyKnT1}CU0@gR~o{wa*j z2f*zTCdiDs$_6`>d(D7cPV%>`aYMwbP`wCI?CRm^@B$`t&qh7^nrMw^^P0U82K#*! za%Kl#U^+-J*HjcL@%6=uF^4W+t}avUBiUzFLXzhrOrBZx+&du7?`K8FnK*0vgJ)OP z-0Eo%wYvmLxqtLGh!v^Pd;XLLk%X@Iyb1Cnsta6t=D?C~sqid`v>~8;YR&AZ?D=p% zF@wh zZG`xThOuy|Y$%WN|GidzF`3Kz|ACUUwL4ave|Uf(!Te2DUV6;q2-xdE0Ok&EtZF!< z@7!(NqF!7K8jtOx_uqQ%^bz2u`=vB58WiD9e*lMUU@!rroTwOuQz^v;S0tnkr9ZX- z#&}T77sS*)*!WSrwLhyy{=^Q3DihY`t*kCYKD>*2P~FdN;$9MM6E1ZiP}8F zF*t>Y2@8)(MB*z8a?-IBC9(8NeTa%0O2O}-E~<|ga_nPWVj{q*r397$q6>HHi-E@F zY!O(4&B}9^g^s;zP!m`SKZ*)5a@MR~`&##d+rPd*gG6X!pV^ZlqD~D~gTr(bN`?re zi7sUkioiB%4JYv zrZL%QL&FO$wL4AUq1!lqD&wj^TOV(xF0rsREh|VoC@K4bjH>BIdG#N3Cq0a{9G@rK=ef6j#d%V!+rbrBN1Y-?z@1*j{v)5v z7-p>uTWe~jR_Mo#3^#d?44)uH7{!;`Q7%%W$OuEocV(LAD8j0))5;*zFxw}Diky=` zMB}7SgmtPoO_oaJ3CsJU(kC2<~cZ? zz%;{*i7jRd@a!8?p9$SizWn0eM8EP`fV!G0^q8T&^RB!zULWSY4s!NsOt2uDGyWRv zt;u^fPiA+0n34)SmUEUqGzhHJDu>d#P&{-#)LF+0Uhf!jo=*VkDAe*VYi>M#V+5PJ z?|?kC*xQ17ijy$jq+NW){;&wsG3+rv>)Y;+i+TW;h=~KFh(5q36=GJgpwtd-t+Klx1U>+w;vXzn8Ty~FP-2UD5-gQ^ z$cN?vljUO859-;Tx{p<5Ir-MMIVtM%(;$^AOU9pAoOKmMhkwDv3*z^Eg z6ICE9R7Afjm+1lpDI`fxY7T$S`5y^jRaT6bz6^R7TnLUN5GL74ot0qqV7O`@G)~%(Jw>!1A)-i9tc9t`@ zsvtnI?kZ0}A`;@1Of5fF36 zXoOlR$nsNSmZnEaJYG^ge*CCUdEMxt{trcDJK~ZD%)F(6nlK$$odf6$d_L~CB{IEc zT`%d%j-ymGpfw7~I@vOGayIljt%D@lKz?`2jn|p1{{z==j@7MC>UW%<06RJNLT}vP z_$kCP4}vtDGlV|TvLv60Nyi^6obniwWqyb4JIO}Uz1Pf8B5&duJ>|NwS8ZcQJkSpD zYl@l)QKrM-)V~uYPfoilg_J*_?QXIZwWv_=$XB*m^S3sEcHwjd1xp-xBrx)ZN8gf0 zgySeD>%;cRT^JT_>J+K>CS~~&9L1c2h|{CdA-Vr@MH{7TVP-3cTUBm3j$B#VE zhW*cl^u6gzQSv6Z)z$6PI0exRwt`E8Nu%3KCsg)>O*hqh4+V(deJ_pkyIHjDE#a4! 
zK_Sy6>c|uE-dcBXF09*ET!ipykh`e2h>;o09GXPlDk=jjCJOkBb8=>gCOyxwM;O|} zI&Aquu-lX8NK#C&luQ4sjC@O1yr~quuqL%^s-@QM>zqfjVbFJmM0^k{M0)Sab*47Z z-Z~5wAD!1Bm+dlHGU$7`6Q}i|Z7P|NB&#T`h95&^MZ_wD_(mVwG}dECVuvI_Z{xsd zup6j>6gWL?cfgZ4mLx!p>l1_=A7V@8Mwb9xn&gg6!&U%QW*=Yz$R;`6hrjmg!XwPj z@#^p<#P$ijUiym5=4Drb-s4Q@hBc2dz;7(993kXYm%LJ+2U|Ilg_b|w|`3ludy?{qxPWi6l*`VG>=rDIhQbqsC+}L`{%PAj3Fp~v7 zmLa|)z0^e-^!wt^V{2p4zrYq^Rg)!-kyG-XiT0@$=^8p^I2X+9;M`kr8kGT0d?ml* z&0$Q{ud9Ye^{*+yh_0>Mc>9Di9`FTr8wA0KnENLf!9b*hKd-)h>LV0XwEF_(KZl5- z-KiDCR?KSvmhkuxb>C|gMqqA)mGd0pb`wOjoO;zZFnj&Lwo|M!@Ionqu+(OrU*F)} zmZy4i6uz!X?Il@Qln17kIo`=TiTVKXCww`>2K&^vF_n$^V`^dqo6;kU-PaNuJ$eYc z^Fi{yR9~|09b2?dVXzqmiys z(ISCF4U`0B4`!7l9V0m{z#W9bkIND^R|-zt4=IE&Z8vAPNE#PDINiax@nH zv+X5rNP3oJ6du7cIWn&qMh?1Mok?u| zf^|^@VDUs})MnW%yyFXUAV@%Y;|Zr( zHA=~=k##|&BMeWx#9eEEcr=UV{KFkF{n?$42x&fb8f>;Vj@ThOOZcabFsqx`byoeHMW@kh4Np`?p{Bp4dO<3L-y!Wd8THnJ2mk z``26lmfxNecV#(juL?D~k5rc8kOhK3(vcCJHB0ueOZ*~71p@>v zLY$&$<93?meK{!vwx(Dp(jOa071(Q-JUtcy_QjOy8Ox=HEy>Bgz^K>jSl-~L#L0|n zxLc1MNhDTXE_nFP_&(^o_;M=NOWw-A&OW-xw>r7vTEAmtO{#P#L#&%ZrzFetjtR+E zm#z5@ztw7TrQg#dzgKHMlE$5kfz()IkVd;63gI`1y5APvmfV?scK+${_I~_jKp@`u z(q_BX(6XS&*X;Ye<&LSdrtn{Y=_z6Q2jMASfrUSwD}1X1@{Q)d{h_E0CI7dDry@MR zS+^|{b@~Q#Mr<`Dg8@~8n&s{!>Hv@rbq>NPRmVfCSZIJU7^%wn9-)1u7glp#KMr5RR8t%>%Dal;icaYucAg1mHjB^+2 zqYD|-@5UGNT#rHa=pvA3z_7y$-$bP#r?@|;tJs`_hJYtHm1!=bm6b*i96#_pK;=%Q zHxOThF|D#=DF8 z^^NAf45Hbt2;;qIjC^as>8ik3=!6$ml{-c!z1F_b3fXZ!?Quj0*H8F<#xa_%rCyLg zyLY0y5X1@s9rb;&123C53u83|g7|vC8A0lz_&>*RFBMUgT_YDVVED1fVj@+r21e-%>Ke6BjRiO?ZuXmmd~HU zIl=Ys4&KAstJhI4iRF7#q z{Js8VTutP?p18dX5G^eex~cUPhCh|WNp}7adVGOP-iX>!OE^Ei91@qelRs@S?@@)>TwlZ+6wDik&0XQoXdDgp|A zmu5W88L<71)bV&)3E&>)jb3ZoFrk;38yIn&_7N`3afFQ6i#jz@bh|R^&XRX6?Vr@b z#IytE1-*6%b~vgvHUYZcu@crp(oM?RQZT@SPrnmeyRlBw+ zyVdev3kpLYMDzMOn(Y;*8?tQz;cnsQnUa5p-M_Nmp6>5+k1O70`$O39lt!eZcv$bf z2c&uM?j-GPMAp+wzj0jPsNpN$dM))w|BfNhgD=*QJJewr+$LwD;^%4-8c{_Fo1&1z z@S&Qjs4T;U5JR*{C&WENN}3s;)D?djw;CQ+DV>rubcbRSj4aM>*D%A`K4-B4HVjQf 
zqG45#qekzjnSOjL)IHh7nZxiOy9b|aRyv;Cz@i@>cJb<~WVK-pR6MGMs|qVQja(w= zDE!+H((%`r$GgI+Z~5{A?%mq2>K&V`tf38O7_1lD_)SNmOxuGW7uZ=*(!+ zVf*Rt15Q)oC z7iIASFg$$jIh?jI$=En$qp<)nIB`xAy8?Q{XkIKZ?x9Qkg9Q=Vi#p5$1`;g-SIJ)f5&YoYaw@04B>`G^ud7UW0Y9TL(~nBl9j?>u)q!lQ z0IjE(N%)R(tfz#0WWwnib=)-6y4}e$rF(KoG7{u)jns_Xe~+y@+taV7VH+De4^S7} zkBh1s_MKNyu1dd!kuTd0Vy_nRK3%*|_;${EcM(_gHa?H4o|{{LAE#U${NZc#Jt?eh z<+J~nu`%q6xHYau%n4D`AnK5URaa0Y; zf8Koj9oc}cI!|+_$MrVks*3GjVfT5+zwVD$o$W)gheQgwb)vt0L@(~x%T-z?>kNB! zKCgHGVRr3dOaGUuyBhjU@l zjg7>Tjp6O~F2gw9i^AN7&u)I@1dbFLkjf8msXI{yR{h?=912KgR1&mu&^H1AVKC}y zb{PX7J+?WNWHtQu>TnTcxZA(@RtMN04SvCNIL29Gcu{F^>x26kk#K1BiDb-C<`JgQ zNlwcZ*%ELg`_|=E(p^y_#^M*_C5Zbn@Yx53+jY#%vC=3B=3>0X08Udh3MgT=5B+G$ zmAhP*Q*^cI0)5mQD5_$a+lzlQ+uooJ zPc`@7CIsrg)*4L5Xj{m2uD1+TzuM&q_kj-JeSNT=ZNj25yW3$m38MCGV@Y1HM=}MJ zT(BsrqQplK^1ayj(R7^GTdBH-lpEG9rYW6%HdE`~(@zQX5zbFEpEEp-7Uol#9BltS z6tymLp0_0Zt-ag-Lhmpc@cnCagL@MZ^7a3GJ@Lkle%mAf=MmgJGwkdHA9n_N1p9xy zJsxfS@pEr~^Y78I@4B0Jx{I2oE$exqI09p{99l&%af~MK{JffftN_B2`&=6;vI^q8 zcPn`>(rqa+LH#87baa{*e|a{$*UD}%=U?S+^bElnKH*CmJCl(lI#`q{nBkig`}C1w z?|U)!W0$m1mG_rs`uH@~pV5>`(4W%=RFBP2Gtsr@lGiIdACVTXl z(Wj`CSRA!SfAoq<3V?<^xg(t?;q5$e6XMCGRKYqUf6O7Ul#`#`1 zhrHjW=vul|A}pzcw8|s5d6H2i|b5E%4YQ+;oDZ(ekkQinejPhvybmNO4S*E zO#3(c0qPF@KZLzySXAp5H!Puov?$VuC>;X>3^BA43NwU=G)PN#my}2g(#(jIC?UF^{fuFxh3|R1Qt~=L$Z( zQmtGKa7!xIq`Iz+8|>RYZ1rIln81?7KOwIba8*84k_aL;HmnU_%xn&a2;qrWfQyWx z4{?T{E~PdJe$`a-7!cFh6DzD;=4sD(D~*2|Jc$3m`N6{xqq}jG!T~QY*sE>(P`p`O z%AWkW7NKDRo0;TVL4w*{+(^hIG$j$-M@9bKw7 z7${C&a$RnZ7A*wa(|lR=V*_X1o>Ly*bgvg@3m}sZG`uX^5Yl$h-akKZY!N zcde}Jx-N(wO!~f3DTjn%tXc$fjtrueYT6cQ0#lnvS7N*hfpl~u#{_5|y7g`yRtV3G%2B_7W_hL+QO0&#dSDCtJ zHM3|=TBO=WjF3rLdc5a*B0u(aYcL3+j;f9-7nW1Ps0%j6HR}z#5>-9cN<7+y=1#RQ z7qr_{s!zY?0|Lq>qY{P|mLOp6qFT1Qov+{6mcRe4zMVFJjl=ewNB+dUS-DHOo^Rnh zd6q4P2DE`lagI70lJFo!kem~{zB%@Bbi!LAU5rrjPbBU2(i=B)=eU5@sEv4Z<-G>? 
z680q1>=7-8BW}9H0ooHhB16k&4?6dEHi%^>>R*m8G^gsFF($UsihHkqeq^~W@gTso zdZ_lEt)wOXidxMZu_tlOuPLs_bN+tKoNJnn!RCmC zf>&7U#gw%x>LNO!J!#Rh{G=(r*Wzh$=nyZb;ncQ2Br2x*e8E7cW=f?mpOO8FGYHYe zMX{hkgyBS|s{00CzorD$jH7IE7?h>}^%qUxHjQDCb+SXi!G1h)lk#OKbY&oa^*FLk zSBg?OT{dAg!R=1r@XlDY$Oj&}ZAfO7NvY}QyvtfR8uxWUQ~>ODn>e#)G%vLXF&0HN zq2+3*2;!-X)_TCK7_F^xSWsC=upO5KSz)2FD3x-Om{+450^HR4xl8xakWE=zmlt+M zypz%EuXeQcHsLe*nlDo$ENsS&#gzJ#8}n02zn6Wwp(l=ck=@J6`+WHc*oQ0GTEK6Y z{o$UB!`sQ1J~h}c(vIpzl<40V!IsgFG3OL9I!9Bf@maG|2}>6;>qQ^!q^TN$Dp zrPFV0V#pqDYm59qw$0YRi2Be>sr`b|-b^*$%^;5J!x|V@JQzFYv^39f?hU_hwZNTo z{j3Gs+wW0+k?%^)yA14zrz;M8(_*sqSlWhPa@XInRVIDQmnKUU+-UqHF({b#J|fRw z7u!b`nyt$n%y~}RGVWn$uv->P@HE(IF+X;E9Q5P{RUsjJ+uqis07h6rk5&*N!Oh6D ziT8Kc#k>~tj<@R+*dOvZQy-A~3`fzex|i&du3B}hXWKfQy?x~tpVc2}#*Ii1#Ep4XSQc7i>bHqn>$0g#O01Y!rTL(4S(-B+Mxf|#iles-m_$w1 zwqN)xSq+!nX_{%q_3JzVKZ0dQ!&dHO5mLRf6>^FEae~aZo@=rRAWPj@G3<&{Ct=fk zLBg*30$(e?kg~!_Mpb7k49ThI$sT%qX>N5=VS4XDjNCZ_Q~h4k^?UR~p_&X8XA|Ng zMCNwYQCVIhJh~=1G}R~=QH*(hus;@!yhh54mdKU|h6{S65@}tZF&^08{7`B==CYA5 zTf8@@+7L2RSU(>LpYY`@csai6OMG~=cec+ zMHLA(5ih8KOD;`(FCD(AR(F#>R?OG2I^Vv1&}ORAwC%zHV|MbpV+@bul&;=k&Tfmu zp$m!{V!!(Qhj%;jas|nr>(t27wud-^k|A6{ii*D#P)z|D8d);3bE@i8m>H{LKaGR! 
z_!&oKq1!Aml2Q9|H>TacnVL;SYv23`x>{zo!0bAHV7w<4oz|{0ER$kl%r~eNl_(#E zF#H@i$71-hrsI{Hg*@nb-7AZ!?OC^2!wBdh)|ap9EssY)jd+b&dY{E)3U4#&9*!vI zu*#sE8t&GE8k1%vXXWR=)Ec@bx>!HyO1s~F2iXu>8Xe@On$NKmWQD48hPr@5zVzE1Kko5Y96rh7}T$r6#gF8vky%!m=+oMif2Z&XV8 zbfYBMS_C_n=i9Q1-c)ZGW>z^KXf6y=STdH3Te+i7MrmU3)@a~pp@KvG2~+2byI1xM zm#|~_NI90|*-C%LS*Bf-G0I(H+Q?(k=+-4WSl(^Z7P)t_vHCTa{b<5I;4G;6a?6k3 zVn>(791+wX`3zrEb+Jq2bLbi=QsYb*q6!8H5IF(`>ZcMGAJfU(kP!nlj!S(kz);%wVvrUTLyvW`le*kfuZlFUSeP24{toD&L)>2IH0Fz#Dv8eGevrk2|J|`{i&S|$aF5nz`Cjf z`0}IAy$>fi_da0UqtsZ86EUt858_}f=(O615z!l_JR>Uaal_@@bn7_2tA7%4mnrDX zUX`PNy1+f~1M)e+#HNw;As&C?st`@&Di2v@7o^Cdq*@Vs^44NJWj?qBf6ygKi95$J z(g6F6N;uw<{WYyWA5!AMD!+$q`^$%@fSYP>ab7<1i!`!)M68qBYVtZZSOu-y!Bxq9}ahK)AamXid7hgSI zSGW77tvlO>4I}+O8VSpr9i2NOHUF=Ofg zu6mgV!Mp*if*sWMLLIQXY;669$jzsos z=a3mG0-+haQ6)83&81uQxN3JDDD`U;^%SJi#5NhFNohEV~FOF!%^Ip)cW>I{^lYzD-DO&`AtzT z)A9UmOxyy6?F4rUBfGT_4th#+Q^- z*6^-y&6$2k1DoI_JzhhsBv$^peqqczl&`DTBtb5N^XmC>Gn9=^O>gf}Hu-uB)qHT{ zGZD6wnNO8+eJKZ++}62iFYbQpy9r8mFHGRQ$-CXWBGV48m_S+6Af#<~PncXonnqnW znT~Q$<>PvgK0bcbX_uhDmP6>9oz|~3TE}(wPUlW@-4$no(Uuwe&_83%6@lx}tie(n&m(uy;kp~Nv7 zBdG06a`mG4Y((WG8mA|@t_t4xF6#>>ghK{3VA8idsL)Y zWyqEi#%k*sTeK9C7Z#T;LC3*)Z|!KhBWCR2L!4yr(Cd&VC}qDO;ywlKr#^@|h6V*D zgYCm5A8-?Slwr_}TP01e?JXt$Z{?mifrae_?Zj|1_OM*nLg?YK-olR5(1+&1?1Jsy z-Q}`?&QeMQai@BT*-EydeEiVs?JpeAxxEHrxSi+6Y8L~-HD5Wm-AE*Bfl)X662f)U zcOymM4+H6>5``1Fq!LBD(Qe~n>B!6KJ;)LP9L9FX>az)(-Y54?Rvz3bxw)x2cPSN; zg|WoJh=Z@Xe#4~B2Vd*uB8Na~2d^LA62m9X2UhVTg?H>I#h(*HC|@Ed6y%-wKA7=m z4u^Anp?Iw*lW(RHMd-Ta!Skg(M_~%QCqyosK6qRg`h`zE zV-N9@$uUJniU>c3;_~j(I$ap2#^lSUIF~>-D!HF7NVpWm)~{HUj8eJZiu(}fA9wyj zEZgRD~(PtY(Lucz}m7bleHr0L`ddmu5 z?%;AKvb*dAX=`)|tYz)KxmO{|5OO_5^ELJWlP1rGk&`yq~(|ID=k&^XI4tdVpV>Skl(0 zjw`%OPwLXZwqg66O_Vl9GFdI7KkFsWxi}TK`S)|)Z7uv)`HD%r&UBy1IUZ}4>a+;= zobm~osD1!^9!N|8jav=;nehHK2F9V$afh&s~+~7lZq#*%o8p?Z(?kY*;MX$ zd$$@ySL%6qQv5iMD=>^vEs*IxXfc(StU;e9PY;DT*sdgY3m`GNr)-(+~0(ESU7N+PZaOht*_Cw zy1WOpJ8lO$g0{DPV3`2JRI=5SL!rAZT%MQb`tC3M!^ZIL41y5x?*T8y83=O&J`Ldd 
zgJiLcn?+vSaSrVQCI0^20s4_X01{?>8->dmsXwh+kV z@*ZU_bKR6duGzAEMf&nJuq3#|+U31cl!r^-Vky7FoA=FUOR+?*?<%z#J4<+h8`MV@ z)du1=EJd9Rhs{y*TiZOsNaAD#CO5Z+#|GXLv0P6RYaVtda&B7r@Me7JY*`rm8ZIb0 z?(1ko)CaWEOcXqCLPltSlC8nb*VQ5Q1uaFvARAp--pP)S%j_EGSZ2??-PXhw^f&R| z4H@lZAVyzML&kT)Mpk5CFq=2Ky;9W7Yt(uDkpx0!wxd3VcgB_t91m*4c5Q6lZ$aVk zUFMq}bYhz$el?U}O4~WKskPD*m)wacgKArXj3u_vrB)DBWLNt5vu40p$ym|K(0rK| zr)$)r;Z56svrHl`sc(+)Tn72sMUfZ&E&5;>p<=Vr?rb?@24jg`t%JP#g29yMGWw0R#ioOq5YnGAU4U>X%#_1?wx($ zI*m=h{|lo6JaL_=E3rM5rzAt$a}QGZ6S!8ro)P4#d0 zH2pS-|6OqUa|*5|KuijrpxuXj3cnT5>+6_%8-M%wtI>`E9#fa$UESxI){utMyoO-G z=#Okry)85az5PQX7pA$L{Am2Kci)No?u$g`(c?WKb*S4=DURz)$ar@i6_NB-bt|m+ z)kjVVyee{2(7<|YYsz(p6tDg#$8HS=!+K$CEHARG%uE+<`PTSKc{y6!_R&lUzz_@L zP1tQvewnul@2n3$3MqI^B_&wE6F^Iy`W6LB?|s4M7ITU@!`0BUYj?4=QjlTQN1u& zP6;Y_A0dF-2~2)FGV&_D0h!p*ADNgHMU39rHiG%F4BNEDMih+W>)#g zY%Xj>P79`av`cmw~&#eS-@@{PIhZy~lWah#>qohCNFB-Ssm z5(ldVONgZEdXX5``Xh8kk_uwdDS}Bm_O3vX1*CH9LSLJ=x!COi5j?c6^f|jnAaPL>QK> zlQsKX0Rd|3m-L19jUGT(<4wQ=!+O~iLzdf)%!)&ke0RiPWY!q8AJ*=&)XCSUkWYORYgu3h&O5SX~ss?18B>R2J{)s zGj8Z<%{AzEm{tr$^X*aEGGx%t&8JQv_DVaA44q(M&LdM2mP(X2j2A1RGBPc0m7Opd z!WofHH7_oKGsLs5NhW=UtgRrdra90M#v?M(kD%O=@bf?)8|^Or zL{o`SDbQuLaPK44YgqV$qFrg6$twn~@br0IO=wIgn%UZ4goh$-4EOdlbmO9`{B97C+~P33%A*c0{up=jZhr`MDj?@cK7L z-aRLFcM2IP3W9IMbIxs8bFy1=*@=8lCGau?uSO>&!*nuB>@Pl^;D;%JtA&hnLt9YU zLOe2wQs**zGSs4E27iFyATj-1piTd?hfpY`$ z!kbUuMx+nDjPy^CJ2W;$&_6JP-?Jv~#Tz}`tRJ-Qv*ep}9B%92lhliX1uG}Awh=$s zV4|Cq5Som?L4nJYS@_CJGJU+^iI<9q)o6&LO}iW}Ca82aOknkMs!>lNdC|AYj70-spk)^EImd*76T3_P-MaXp0hSd1QA+RDlaP@~iJ@(DhWEk@yrQ{sM7-J}-Lz2#f62-G9zXXNs^# zX4$=7F75YAZyo@Xz`S_%o$b#vXI|H;32cC*htsgKO$qC}ME?xV44m?s;;)VR_wip5 zIwpFe*Ch}?5P2Lb{m*1;Chvb7vR=nio4xzBYTJsSza{%n!$e-rV|AK z&n{73Dn-A(0ph*lA2Sb**B}11s6Q_TYhOiis;83FenZ#wZ$%Nl)RGamA7TD^=0Cee zdbI$qKMT;oN+nWbNVezy&oh7ikZNJ>*ZbB7Wu*Qh^%}k>)2Wd#W&)vo`8bH>B;SU2 zJlo%f(J{N)(&2Nh0p?#@x(r5-(R5H9$o``PwQfomU$^AdH@vq0wfJu-8`(#lg|NQ^&IsbbA zEE#YQF7s;wSFiqcAF;e|cq_Ow?0cMTsBEu{+EH}7`TahyJi_IqHLx6CuZATuKev3< 
zh~r1j@#A~we_8Q0^m3pp@J;jj$L5X;@0S?x&OB}l2*%@W$uS7kdw7N0GyS4_@5=|@ z%K3$d0&oiqDpyi}q|PvioV4x+cpe9cWX1|HPmKj6_qjflmi@~Z_c58#m!nIQ?K(QX zX1Nnc9z3+0a{YSC@Atm1qkCUPgAD5Z@fNZz-Az@2fH!-oWOPTmPG71_5?;Nr}U|w$ismJZIl4NX29Q<2Oxm#=*g}>-E*b>v`(x{4t zlysR?yFeulxe|e2VmY;`Y&KSGo(lA(tGqDnH?QCbQF!#1*@HHTL}SJ$YSViMc%&w< zgf^=+?SJ#jAsHG;P2Oxio1o+7`m;!zo$m0bfh7>#0y%e+aOVH<>_6w{Z5<*?{!uAc z*1Z5v>Z?!6r9dc&FD9b|tZ8ATv=?3a;MULMu4%bnb3WCz$A4rroI^lH6BmseJ<366 zHyH1pGDXH9OF3PBdAfFXShZ>vBmNiqAl1f7Wgl6MHix=mWKUTPg!GKeVEWoMfn^a} z?pv+~s{@&F&l!J_)f}NWm7oLeYaUx6pn;G{aHn0h_$cRl_5~wp&5@&ZTXF;~k?QnoJ26#q>Q*kS@^%?~Z?APQ;4KbrRw6$K|A9F@Jm8EVmC+*MND%jLW@PB5Jn^>%^ z$^ez=J_y@>!r`KT0-(0GrVt98Cf7kx@kw>LmNIEquq=j1;eO=-%in~79*p;8^xTygVkAzKdx!%ei5~FxG*$$yx0cL!}3l&VDvh2TBY_rg| zdzDoMJWZG5WjR{ddUu-v>DG65i%q@QY4~emN^pv=(Xe`M-!H=1j-(BDREhfO%}pCN`r zXKp!?CoA+QRDoq5xfRwZS6(g2+Xw@_V+r*+kMXu;NKenphy7S`u|Wppnb@xPS~c$3 zRFayoo>h_>RdaP*ZZc!iSk*0H+YIstcdtY|p(M)+pfi`JS=oBL`-j*6$GP3~n2H>7 zm7oLtx)+XGALVdbZAw714Lu0*|h03L9=N*0iC-t> zf89H=JuV<{34Mg!Khf7hh#`5WXF3J9D(Oc=OsG==?9?h?%k#%s`$ox>7QhF>X!z%9&V5`m6w&xz4_8V}9(#E<;7{{KvP4&QqK z5IB{wgck%8gqrc(F859L*JSG>Pa1aS(*kHlR`Lc-`@zveAs`@PZ7_G z5BKhT+HUA<$W~fy!c!sZ-v|R>;@DJ4GTVaocxd;yol(naMDgDoiF`#C3U{L7*yt@E z@SjEspTV*Gb)uS2<{m%5w8(dpqE8Hb^0or@rcF;e6oGJ8w8g*LSUL21m9@+qV$uw z6o`!PanABX3Z^4~hy9HgZJ;ZOYXutqC=$YAds$4%72-9;9PRx$Vew~h5!;QY8meQA z&r0prUT`28KH6TifBOI?0x8k_qyK(tw=1;|aDNYm>%lCyc#2+%lwrK1L_N?#W`Cew zrld;6P=u3;SaO*gbPcnnu+6qfNvu0k+xObL+elsJsa6Rs=A8>nW&58NA;KIBhkkMh{@0v4HsXCL)q_fFIpR?29^lp_3|T$4Lzb|;+XD@$j}Mj z-i+~BiD~lKv^)PXS?S&OnSCweUO^jPRoOME8Fq$A8=h z!sA$Z-l25EW(pVs94R6J5tp%PI?}XTYfN6ZkRP>Ro z9=Ej{^GpjJ8^bTliZ1M$t$lg80XezwJ()ULC2c=bLCYbN&~Mz$O6+zWf@=8LpprT_M6 z|FCqh2&T)ylfLDE(gCXEnSMsO;nmSfxA`KQvyisW6v-=AI1g7aABD7f#olsep75mv z!ygH*L^VfrReW6jDP-`6=fTc0p)Hzo|1-Tx123lLrPD@=6qH-U754w|%*1lo;a7nGuL*5{gI zLE=W9XlU2^r*VZonD6$HCY@*cE@6Y5rn^BWcmCMkbm6ufw6^^0_nsKb1VjftEaXMF z`Cd3&d;#aR@&wzJJM`ik&(FzZM8w&HhMsKGRo=y3+F^)u!N{b>g7DR~z96NR@lUY^+``2tsepdo!If5QY< 
ziMF=?AVj~FLxZs)OghL5}}I3>CCq>~W{u=G{&dhkaw*JH1vZ7QV1Y70O?fPUvN?Q(DOv zt5-d*N-oNQOe`|}_WbAquI)MGn%=9M}S|r>@ zKv9uBr7~OTH&O)RMz6$Cvy4~dwBEYYO!4Ou zjzqe#ahzA082=kL{=>%rh$5}|&77)Gvv;Om_}$ts|WqyeVcml@||~rFsYF{+nOu4lwIn@e}jOoZ0OXXNffRxQpO0emXnE?o&dS z{+{HGcZh{Qe6UO#9)?!LCavZlcO<(qpRhb5e`8KdEgdW?>(VdtSHqbY!&{Rq#_>tF z*3lwE<&xT#Qg3=2!&x#(;TJ<2YTtfKGa~y9m;7fekOz_I z6va~xm>P9kr`@VwTo>XfJ#sSF3UH#!*Nnf)A2l&SqPXydYUxAQe=)^&4Y8pxV(`FoIS|30?~DKgM< zthlpYOV4h%`1H}aYSq-|kTRmO>uoo)8;xb5MnV2W9DRrVm5(vm!H~4zGAEIi-1;|}FbuH8 zk$FFWEocguU|?WSVeHAD?DHKov-ZR9NKRl?e(g%1m2>JAI*Ni+?SvRil-Wz;7@xeOA$37vpGd zYu!;^Hj)*gMX9|PKNR&fr|iqP+g_&uP_0|lT)R%=hYeoG_A~8MMd!VQsqq-z&$oYl z>JLBvBpT9cEZHQ1qAVbM-2n)NkoM&de0D3m!fHa4Jaqf=RIoI`dMI|ZXp&7E(?af# z)7$`N=J;P~BA6HxB<#5?L9_cL{3NNriZszRr0Zf6A3oND&=j%dvia_E#GTF7JE0Pb5(y-iT7q+=*H|xv3i7~Efl%!mB+{i(n@HeBC5S|zru#?`p#TvtX>({&C>wKPo^h{{Q2wQwM3IC#7DnFlo2=|hry6&_ z>q@o)O&cEH6V^2R18>qIwE6z8n8&ZzyaRA5W?$(R_vK;iRvEzNhUedr6<73+HE0als_&co2_@BN6#gS4ySQy`OJ zS_`M#iuP%4b%O*&)bv}@MNe%0Uj##3XBMb}^nIPE%zpJlre|IM>UgW*Pk6vQAb-y{ z(~Tj-&gG9X0Ob+3ppYnPJ=?AW8B4gdk?@Q(Dk4fQp}>(4g^EUO-zB(4LwwvQ_5RnOi^iO7v2TBz_50PXjWaC7TpiDm|8ptC=6cLbGl*SLP)UC=YZF2>ozl;T{A`(S*)XQcX zeOg-#G&VjS@H`tC1NLymc$`8o{>fQRMOLjPhY$(msz+;&@mZZgqbt{;0dd+m2j z5^tdOEvLjy5j+QwiY{1MpIkg2RdWWKzz54ayCkWse>6oC1ijWYI;M@wYMl-(~8+0kO-(6nSb$T`@|`Pge_Gm0ZSr1M;q09w$%XeEd7I zR?{Hv#gC*2CH-ivlrFo260w99uStlwcmVe{Vt_)V(Llmvu(AEV;F)A14}0D<0^Gfo zqRWtF7228qj(0_9*nZbCKJF}kcOH!c*z%(<;KjhHk1%|%&JV>6HF|q{(&d^2E!^j^ z$TbD3U+?$OV8)*9nZ#&Nj)8{!&G1ZirP`8!ds!CeNxbHx(#kVy)O9O3!gE>fGVS)^ zMI6qF3V-u!?qfRm)X1Mc@qE_rdyrT(6XM9JS2zvg%{*_eJaC4h~Ufvz5YlJ=Y%oJ|$ut zw8e7=gyoQz!(;%0-I+^hvThycVlBjIipw{nQMckidE-|U>&hzll&k-!dHgqA#kU4=KOFwOSik{bc%ziaPXUMO&D0+vd$Q3dn}|D7 zi*y(Ngi8W_LBo{&=O}l=goU|7R6|X?w$Hzhz%yB|+77 z(ZD6O{%Q^Q22#XPxM(=tGNt?TzxDPNO`?*V5rRd|G0}o~Ju*w<$hF4DEAEE@mj^D{ zp2Q34FGT*?nSTbjkh*XG)XsF4EIkxxIcI0r?@^|}k3BIac1=1BD&h3jNXwEe?U#(x zIR`u-x2}uSegmy^DS`7^7!vGIS0z9+Kh zCVbGx?i)9ii%2IbGq0B(oEWR-ooJHE$9w;`&71Rk=Jcd9XOU+SHHk6@EFwRyE7uGP 
zMqd|lIbtmz%@8$0zMpgW98T^CK`_7(=Yl9r3{jG}chd(n?v2{S z9Yy-qvspTtw_ZN#2B$^BXR2tgRgH5~ytZ)N@0AkDJ)&1nhnSDfKV`eIl((r?Z?}b4 zT!CtmE5RUu_AKQ)!07Nb{nzs@*}l4y#*?;hP(PXr6FPjkwCEH%gFDikIul3nSZ_pk z`6jXklL9BTH(j}I z3U3dD18P&TbkB1s$uYWov+V1Yp*;>%4_N>*t5=TCwyLYYnT*|oe8(*SpX={*#6xk;|XRA*XRjk*_GBG_slbqoUSaG7`!)c zyRmAr*BQ{wjzp3^D#{Qhb*28`+fpyX!BH=?BH{gfN)VwNql!h_L%j5leZ3 zWI0sxQ&~031}*@sQVm~l9dvY+ye7ARXmKgt4o*V}hx8PhJk+q%cI~HASb7}OdTi*% zRXyRlRq2PyuFQ+xxk=;JqkY1q;RG6VM--e`wCN`dnF-H59?%biXWEY6Nf=wl4!xis zNt7go+c~)@jvE_kjp+`J!7N%5x{pJXQXNlM+MU!MB1m`myB2vDQQ7LREXH2hstYW# zLj4cit%K2`G%Rw8Aj7dk`26$XVzv|v=nK9zRu8;r`Hz@`O`9x3XLyAZA&s1e`M~vL za83Yax1^C9Sa_NSitu^yjw&(prs=MiT&PK(pc!)ZjJPRxLwOjf@3Quy3G5oh#tS#M zm1e5L3D_G=SdQ%#Ipoi(MoAXH zj;Os_Iq{q77wF-Q4L0$nV>4z^pKLks0(4bGvd*AW3(f#VEpr&b6 z3q}-88oM!MifX&8q|ts}ij2euI;OLJeUZ(4C3Md8Liy7v-S>O_Lej>UX}xB>+pYH1 zKE)HG@rskC#7@PI0f0LG(fG^yl!{OHH#7CR^w<*?_Quy^w+uUhh8%Dp>)4H5K ztp;6W4vOs^VE|e=El2NHOro+c-%Gt+=`yc`49PEEG#FLZE<(mSVvC&n4I!^NGdPX1 zOHM+F4LVFn%WKYN0%#mOC+vqdNCr>#mK-gO!=)@W$5KXpsUKDydoMQKxq_$@T z(zS+XI~%-2S?|$5e#nrpHauzE#4564?n*2|by^x?9z+*l<2a&=dA>;*%c@@7(Yyg9 z&L-eyH~?Dc#X$&@HnJb^TS3OJdR6#kqT5m=~Tl? 
zutdezJ_~Li=Lfg1tELxpOp4U;pT+PO!@7(2v*H}44Fqm1ky+SHP7Rt`kdIh&HG)0N zF&+D;w|ZV9z+zb`+_%2le7H%LIba7AzgXS&8%|YVr-UGQMRxQ_(>g(NZgWixZKrq% z9rr6ySyQDWb>t=cdSRA~Yt)Uojz1W8ng_?^zEDxo@!GCzQo`lu#bNfXb) zi{dNCWu`0f9r0LHayjC$O1@tWh}KJC`YOZ!#~jU%QVpPggN0MmGKwW2-#$FaA`apy z7H2u*eUt6mu^SEn^UdLj9h9NdIc|T?!bIr>mK+qHrYNvsP zSHQXGv*m<%P*66~#4Hi>F>_l&dlw#MMqu!pS;-SB`+{Mf4m&*7vkhNtT1=8mnYI~Y z3BGRo7kz@6I6g(F`DMw(@r&)l>vXe*3A(RCnTi7_^HX^IG?(B^LwIjA5LlzY~k}77L z8XH|ZzKz~{`1vUFs!sEpCm}r5K%wVij3qy$qZdV1IxJ3WiDEH3b}d%LjrY}itNPnF zUM=2k*AWIwjpHe9lZWuv!Ee39%CObr_m7bYCbT^F=dPM zs&V-c$-$Ns3N>1h-cDRGBt=@>Iu1{N&^G-#7OYChGO~-9jY5)TTr|Z`=*-bmK!&M{ z`ZL$4orf251KK4VK5fhdQ150u>3iwE@IfX0GpwcDf>X!Z2hS6@(DH#8#+OWKKcL{k zV0bevkfN5cP562@-=g7(9Asi1VuH*Z8o&RFwHqw5bj>0(p=8?&cZtf_u3StwGNG6k z-%R%VS)Z^48BMSejjnwG*YsQy*E-U=M$Y;{+{7La{T*C3%N{3Q7en2w#q^W1 zfH_I$;H3a_@x*6y@y(@l*@osaVT zY>nsc7|K~@RlA<=)Qh$d?q|y;oWUAvnc>U^J^m<7Q&FQOVm&Qe9@s$48_Yl`ZT*e?olf*T9_*8CTMv59Nq?eQg@*Jp>dT+hGJ784HN!~~grYdxDhs#5U)vbmzJN|#)| z4oQA&{l#)-_L0HNdxHfX*XO6x5+pI#Z_+4|45vgFEPnjhh)oep&lgN*1|0_}D>_ zyo8a$hec0>b1hOIAjZDP+Yi55Nbi}862k_M6bkKz1WqX3^2#WF#2c9GgrYWv3d;HH zXH%Th4zwj?$tJKaeLjH)|QqF4)8N%?kDxbdZcw62+?F=Kg?r2%7m1u?_NkVr8LiyD>`RyPGm zq~DYdojJ7taHIIO?xqjngK@ql4W9_yJcQADOGBekK0}eAe!A^o81s-4g1}qLK~9Cb zpuzvn$gWIV|ACaxynP4#sHWI<}ER=NL<% z#O`5QJ60b*B2ArxmHTN4+!6~I!#p9@h)##~B~i$luimYZDGT$r|BoJ+n9F{SC|4c#iLye0WMIQaM~jf$N`#OVnEq@AqWDAC&ARcnb3 za!E9;*%5344bUMSDtB$WO}E!pkI&p5)5+3KAoQ8C*iK8e__!wDZU+j10!p?Bt&x2o zw3bw_043nyNM0HG>Mde{c8DGSeHf2+c2d-hwRli;d`u&l3inV(^5BIf1FR}T=(b%< zt<>NWCI|4vCjvR4`=ME|CTht~1$oHU2f|M6Xod|Hfkk&PmgQHr?(7 zA9#+0Uk;A%>7e#D&pLD29J965Evol~MGTVQ7MDk!KOA4o&b+(J;-&#ECL6lCmVJR8 zDJJ{$C-&`u4rCQyV%I1s(1J9uzKzUbE)G>+9@H6n+2z(_73iP5%Qrh|vuVkfXs10a z1!p8W2M^q3XPF**$elJ&K{!jUW)`mjwpN>%C5sbMjFN(-?qP;)5zXXYpYaKJcR069 z)CK>6S~c7$b1sw%r>JPn@njU%>SD2e(T(?sOj82{9Z;ZnL-~Jbdh3R$-tYTcq$L%l zLqI`t=o%QLK^l~h?(SxQkp^jylm-du&Y^o~q`PzI9^yCe&-cFnIZxo6b6tDywbyG! 
z^3;j?3B|D@BH+Y!9SJ{?QCD}u6miJ(Lk9>ZNGoflnZ1W zS`4p=x>o&wVJ(goW6GvF&S^}up=_eVG%;ZIZ)5~|+m96#GY+3TR;?fz#6~TS|!cNXl-@tq4S6}xHn+gP{$5KZl}8CxN%uoWmjD!>*=ykkFrA9PziW3 z%ngGXJUR1QM%`#`{FhU9q5hbTk!2bETWa<3*aOzk{%A)tw)C0&AW^^t&J-kJ%L=G} ze6{I4p@u`kc1^M6r-!&A=GXOMU6dj=`AfHBwIQ}#K?VyiCLm}}Y7z&MIO6^smC~Wl zw3{u~%W~72zRmuw?y7rJhezu_#^dlURmCU9%&3eSLF(rET_UJzGz(@k*RUDMYd%df78?8|xv!VUf z+(VCfyHVQ?JH=S5P%m<}oiw|k$QT)aq<6M2pgpMNHz=>av8jFXN71y;eE>1nepTj> zhy!RH@8?x!$K6f>04pgywz`}znSqW@0)GBvS-Z*e*I#hkJG7M_gtk#>SPTc~;F`I+ zW+Pcg=Y%KzZQEW~0j~?q7`SPDM=qyq$kW7A zwGvwHm>8={1fjVUeEL-`w?9Ab5Mb|&rlUZ@Hl|k0X(0+xF zK^>LvJ6i7BkSvl!$kQ%_ef4hC1B1!wA#61#w1Sx>TDY0gy(f-XGzfFje_H!_@MWiY6mTNITtk6iGTju z)|RW7TS~Tmo0t=~WHjwI`5-E7YE(a~1CJl%!B5yur^d^3UADX*w1Ayhdi9Yp5=r=* zb?^9|oQ7trwB^E?n$=|*>IV+YwM4EB%mn}O6x**cy#D^Yj(s;_KW8&cVD%$$cgh!b zxMnv2XJ1tE>@=|=^(47DW6V^G7F{ zW6_yggRREN;o^4`K_5*_6VdVIZxud$hDD(EezLDXMq^Cb+TpM~$K0+i#ya zUb;Ia>v%;ey{=BJ%3~U)$?xp4=7JID+@P$l_&7sjNv;LcX0vRt z#ST!RfA=3g?cCw{+M&&0-}fD9#fafpmwxc??a=uY^RX^;Sk~(2dh(4B=Z8v?5WGI8 z=*3p;-?N-1{w3q60G&JNAID%@cE^=&YLi4}iI=lQ?e^U;=U{RXjQsH0#&CvkleTl? z(T%1~<$@7WTCVTETrjV|AyMFnyY~#2>&hDcHmV1YYfI_@mQVY3`vp-A^*BuH#t&ND zfp6t#@~E7oFV2qseF~KhB;3m&vS`3FYO8TJXcvJ-TNw}Pnx@_6`|F~$Cw#^fc+ z%KSqwz^}>&NgS>#WCw7HrPcPLb@uAaS&Sm7}j?846FlMbeZ zpiN0mSR~A9-}s`c?NAA{2;bD^jnJ6QCmAr3tcdxJxHTk}R6z4|&+>GJwndxEmYzGB zM_Fh^k&z|{BT{=VDDu;1Yd^(lH&m_~t|(Lcp8__=jjRAxxYK(+qJ`k0C<*?zQ@~8~ zeB1x{%jVTiBDqYK>leZEpoF0ubI;89_8xtH72BKJi!$sluV1HASRRD*qc+;MuY+kK z=KDsbo(4gvW|7b+yIHzLewr$RVLYNQ;o+l?X)YGu$^t?@Z1GA{%U1!0g<@gSfJOP1 zfj}m{A62T()OkW|DB$hKM>a@XxG}L{E*wME8REuCDFtn5r=y{c>ssiSwstCFWD@A&*|pv^x}qjWx%E|;1wRzgYXu_D z411QMe>g$1?OvL_vrX?8r!AjEN(UEJIxI&s4p*3NuLJc9#PR z`%=L#%dO+-;^4t$FhjgN!Z}Hj)TU3px<}9k0)BP@1EvW0m1?WSvs6Jx0HbK8dnhz* z``)IU2}wo^4o#1LMmn!BZ;&7-!~Zd2Px>1Uw(rAUD4RcytUg3#xo)?7>1W!dj|S#Y zIt@Gd)jWI#XJCwGsa2lvDp$K`-}Ua3eE-l0Y9=Xl`WGS8_fA+B!bukpClG&>@MD90 zPR_}H+A?0kPtl2%yz>SWJ1oc+CXaAjU#yF2^|^8>+-}>? 
z8F`D9d}fc8;p1Q`$r7Ai*i>GW_AX@{_irb-m9GwiLF6^>E8P9ogM|&Dq(OonB9AR{ zyt_I9IMNrRA$T1`1mnE6x1{_dpa{-eW_>=J)=aV~Z~ePuQ$e|D{^1GfTguAH_CYMJ z9)lBa*Bfoay8LK61fGU8+tiQ;<5Rh=sWK0bW*~IKNMG>6D>RWL%TG&i^DFq({`(b! zVG^35&$HRX!GR|jaisC%-CtpX(-wi`hm&z!1H2}dgd&)EL||Bh6>-_JpeKc24Bx(( z6w3*_SG8_P;Hwjp0Egm4nT7i;m*H|ra)`ya8u6;%qFs|csBb^H_15%)ksN%wT4!!| z*J;L|c=Ei~?7ZGPy8tenb+e~@-f?Ivc{oA}Su0oN9`BUo{%^bB8V%|5yFvSShUIT0 z{-T?^Q1kAi6=lVoz#Te(oiXFgCj#&j{>!f38Av!?jm=?}ZQq5pRMYe$;l(aE_)DN@ z(Xx*P-u&5j94sI*KyqcqO#4>tb3L>8KfKXd75qHMU)k7xYx9#${O9gEdWez5p5I3i$$*L+bfk18Ea%&0+`W#A4iJ(;&D ztiXGLI4s$D?>zDPOH4OxY<@pb@i$+RJm5*}?zo{%wH$gc<9F~Yq0qq#rt=^j>Zgmw zvwiRXqEP#JOZh4PKNYx+$oWhnVCtyBFV*lk_& z5xee|c!TI~)K!E1IE8L=tC` zL!`;wt!2z^#t)o^K~_miA=h?58ueDR%Y92#rUi0!sDC1Q>ix(o$}HM?yWu=|)2`WA ze5MJx(%hz}xs$z1G`M@$-bcJxg1RM5jH3VhVzK>k@u_^bbU#*tfTcgfy*X+Zxzm5- zCMZAyd4l%U5uF%YAmt>Tw)WovVO}%<;wqd8rgXcM86N=tkD3e<6-&P9L?qZd^Z}wV%Gy;NUZUV6pj(ur2v+TNv!-H?anLDqNGQrG1tQrk-go>jAM-6J=@o_#ytZQB^M z|5H+2-DcbuR2*x`dp-v|PZ}&5MLLT+xSFf}|15z2ng_lAjLz&isv(4=AhddiRsEHR z9sXpGT<1@7Zy!HA$&XkCDdr8vepMeJ}Ev{Gc@eOp*+FwB1E@wP@U-&?;52A!SsrE873O12FcM($-_Q3YHSM44+fFGidW8z$NgV zXy(ur%+B`*R=@nVse!HZw(*EFH)TtKF#n%DCD{I(26K zllAK&Y7I`fEMuA}eb7;?dAuPmT|OTfYesz+ae*6GQ9GBXH9w8=p~A0K#c+EKJLP1; zs`7!@cP{C?qKqF+V1hh4`xWgCXjacjdh;VqBh1b@oDH35F*TlvTQbXjR9fkp4(Xl; zRy$ehD-YMJtg{b$KhL8e&uOA}XED#WF@6Cj+*x;#1`eufS>53;D&{@Mz$@xc*ACA! 
zy-L41^V09TxM42CMV^%zh87j^n9fnjr?7Dr_4jsna8rv!8D^{?_6S*lfJM zI(v=4mFyJ=8qsD_gMGbL8eh+mn!)X>k@;kk>Xl*UzjorMhEiPget)GZTWlyFu=1fO zVqE5jMWC!lmNC;-v!MyakO-TL$8gk-g(RiCw9@i{A{q$+xGP0X@R+!1Ll=PAE zB?}d2+vZ-_V8`gT9IUEx3O7E}2tCrX5*oj2VE|UTo7c?R6t%t0z-iNGzm0)-(C;yd`?n}NRmYwUyvnIj{T{j(i*!ppFdC@7>*?^hO$D$j9dTd++mLcO0&g21 z`>jeqNbLM5Xwra1Yz7VZMNT7*+v~Tso=u)crxA@g-9iI+iL;XjlsrYlMmNI08zmJFD*b5HFsBF&cV+6Qt{Z-sJYVs6 z)NQ((0mDQeV8Pop2CV}Q-!t82w702FJVRFnsADQ7&+!kC=1%|D|26KfL$%%^*A$*E zm7o9NjjJD>UDS+$)dlbF0IxP3^-hq9I=#@mH8Rf4Qbe4MWwZMlv&8s;e!vaOJ#jZ^ zO^V=sSIkON4yn+)`Sy13ggE5cIz?>o(s|UFLJh{}8G(bX|4a2*BL8t-(Sx8k zIlf%1FO?Qn6rynuR3Sun<@@vQfZ)JGPLktae0g=4G$MWQ9LlxmG0pJ`NMtwYkwG_+ zK~iFtAT4`VZq|c+N%e80tfR+Wl6NJjz#Xj2&sq znz7}_Es7zUa5^8jHRSbAP~cNlBA|;_pZ>MMlK>PQX`5OtL|i{#K)1hD2o^p}XFaH{ zhpu@RGPm2VG)3nIfjs;>P98>2eziX%+8aFRpDIhWs*Fi==^C(foqFFrp?*#G)3+?7 zH@FHhU}A|8>I2x81Iy+`7u{UgkQ`i5X;dqat?+zHHZ6x_j;8Zz$LCB4 z$4&s#PW_PH1yqDHTTyx_KZ-+zF=xbvubyfCEYgi4!bzNhcxbM6i!$lk?#q`$TKshF znXxMl?@j5a4|7MT8MA{5H@MCIDIv2a8_`9bt-(-N*`kqs72yC%j^gt$l}S;~588Z; zv_^8Fw#wfZu^t(iLM#y$_IcFFMpRMie}|3qyQEYWk}v^_8H>5P>Dhd*Kcp`DOx7Cz zGvoT)y~{^r;gEms@g>XlBvIc))bTbZ>vqxk>RmFT#Z2_3X)N(S&h_>EC8rP84<_ZQ z)6r{^){Sr5y`nDeshlnlgjy*+La9;Lgc2QF{3Z>bycC3GO{;vdEL5SVT(8;bH1e$~ z?poPWpAJcrCWWcWynQcuHB~Zt33J~X8`q5F{V9j~@tn}evp%-&Cz%Rk zvJG(^`RbQ~DH66;bt28=75^6UDE2*|U%(DmZ&|jdDH zxBB4g(|K=G(*^0T`kqV+7vdmI4c!!7fG#gB=MV+KEcYlt4CbcIw78eyT3ey>ZQ!Qb zCw4veZ>ti-CT*CRWg*sez%2&e6|Xec$2~s@{X8POlI^ybD$MKpxq10`Cq6FI(#$pF z*T@fyyx|g)QOH!`;&;D8+H~a?v=JC)oOFXznPKkU9Crz945k*si>6g5V9kf>a|?*0 z{$0jRYrW6b=Cd~Fdhi*}&q?ERD)zjb_r3V#e+zmY@6nv4$tAyvBM*~xsWlTCL<)}Z z?(^-H^W=rT>DgRm52JR~;DPNzGui0EJS7>lCcK!U&@lT)zcuJ%?k|gf1dY0K646tU zjC8za6nKkUpI-?@PHTGAoAJa?sIfSiZ_(oDOHSkAgn#+zeTX9*DLt$+02-?miO~@R z>?hJ!Xq+^a8Ou_sCRR1l4}TAD;v9ku!wxzP+khvcv0uGxiTZXDn%Q>jB=l=5G{mJ; z$tb@W{qYq(w+>o3{TfE=pWG1s^j-Vw=cn~Q)B|g~Qf$j``JXPGZ!cRLHL}lq->-mJ zo#%eO+kU=-i0Yb;WWo#Iuvf_#ZC~S}P>We?`XI@m$J!`Pi49P=-P3*4W)@i5>+s~s 
zNZTJ}fnBX!!N<>=N3`+lAnm2BDPgbb!t+1_XF$%RFE?|w?s?zv1eSz2msjf$pMNCG zRdW3zXCQsY3-me9CzRjj??;m#XRW5gd6R2*j-CrDSHWI&tKLl`3V9S2zsbL8i43>w*xCaxE8lJ3JZ+7^0FMV1 zVoa@?Ntzu4cr-7Cm7-9*4{Gcby&iWmS~C?nT)u1@k@I+v`;pI)^p481TRiQ3j&PAV{bZG19h_p(HurNW-GqqK;oXRy!2S@86KAd!MF#t9`;u25N%mZ6mqJ{vz!gu4?$KbDZ#IH7w zm4`DVd#I<)Ro0P63UM&`B*_#%v<$L6*M($@FrdbyqljDcf*K%=0sdldGmEBjGu)lI z*zrGm;pow>qm7ty-{OBok!sHEV;Ji2JN%@j_zY0UPsns@LVllkPucDSbC zQ{H&eHJ{lYyrh;-$?s4R?HxtT{TbKQNTdI`Bmz{`TiCUzPQ|-)YWx+~ke~mfVxsl3&a>hZGNX8a|C0PLAV+CblTXVWuq$ z-<7JdsoNm5qcPZ7PL% zY1dWqSfbzdmeo-Xu^!q35lvBd`5B9zM0^|j62o@Hm>w_klvrXM&SY^*gPs-tx=^^b zzRT`hiAh-;;Q0IPGxqs@)}~rXm9&?U^X6~nZMWUWzZB&&6$S?7t><6H$RcQjqaXfD z-*9~8)Y^FBvkP4LH>K}mF6g?=+>X&Wb9(mJ%a4*^U?6?6qcseOA3E@oE}1Pd;C4jl z(|@@u^xdi(x-~p9Tjb%N8P48 zZJo4dr{hC-J2kb59)dHD*UWHF^Hp&hDBG}$XGt>iW8R&g^XkqA2erpzXnNgE6@;>#D_QX+fH1Lw-S(n7X>?p$m`|TIswVOyXS>vo7<#|LXZ_ zW_J#lwfoN@%wZ4^kogLZs7h~?wN7{KSMLM-7_x9`QwJnC$v4};Uf<*(hYonRDz3(M zp$5rfVPjfbtV+HbD=^h0*V|Is$ta^`FYX4UMOvh~gZ7Da{pvR5Mu>Hv{V?L;lk$Ay zo=cFFU_!70i|ALO#mCz5Mfp_LEF<}v#xaV2Hia2b8lKAf5+dmmb&VObH;-cs6eeNy zz$A7eg+0b@;kdik+*to$?m)xvkm2IH zX9@co$=Ahm9IhY94D8Lhj!rv9Xv9m|dSu6mH?N$xf(M2qgod(*2`#uKcz(*j=1`qp zhPk&0+n5bS3Y7(G5C;>!uJdfbC)yF|4_jwURxIaisWy4nvOeqC%BmQ4kR=#I;=pyv z^Zu=ZdUn{Z0pHIPEKBm;-lB0E39(WxxhaFgsnNizdh<|4Jlof z@?Z{mh2L6kVsUq50opzZEBLW_`0=W$5DTkDMB*=R<+9gQltvhipq|Uf{ne793tXs= z_&yP4mN_fj`?zrXsD&WrdC05qoZK%$;FaAeqB_zWg4hCYy66MUwrVX*7G3(QymACbMO$9lS zM{uh4nEeQOcjla#NF$9~8bHX(^ywCfwE%0b>QU=8jHWIg0(zqoV~KiOXLd({CzLFT z-hJ-EcUuXgHTU&8uzI_POG*RR=6jL5WG$;Jn}cYx7Wc+a57srqx07l=f}QZ*cNu5c zpw!QJankqH?WSi^%_?u_XL{?oa!px^QxEg3E}GCzpQ%z+%Q@R&Qh0#^J)xr~UK-4) zrI8W!B^kbQVbN!6b}o0S5rw{Mqgd4&he5MOUu3HF$23%`Y|-!NN+-$)Lu1d%vLvIdM~qyZX$0 zg6Sk%lXHIIj3@KE@}n@fDb=z+F8XdxOgL?>ie&V5%gxY-({mkR?h5GE|AE3}(L9}E z{{ukyiW`o=8HR>N{}BcEdpMjGa~<=}H@>wv2`n3JZoc zq?IUSL9tqBvX&?);@M_T?avp&NWc)nH!GIorP1nr8@S|PzQkVSN@KdLVWnYDCdE+< zFOKK;S%0HBdiqQ+=_(_Ubw>w{zZ6CYBJ+raCIOw{O+ZA?Ljc$~9~Y-}M~Q63X-CR{ 
zr>BROSWR*_cxikKj4k>1N#uxdm?-II+{=uYz&W~)X$nn*rw7dG>VlfaGK3}+C&ZPd zl~4u_CLspj!A*2?8{Fj;F*Jv-DVISEva(WmvFfs4Wz_STTCo0xXzQYUa{)$+^8mOT zeDtf{e_&$@Y%XMt07PF{4~3eiR7G_1=FD=gyv8GYR#BHl)-m?VH##pQ$=An|vVDfQqR z_RS%Eclp4)u?r>ZWVOPh0zFlIv#&Y7P>Rz%ls}vF`0C~6Fb6fO%HTYPLT)M+l^#7; z2=R6Cge%<@5qYamBi9n5PyY3mgK8`rgSU5IDXK+riljyJ0NtjADgkFWQ7^QDhx#{< z*I)err+)@?7yM%*Od=FOCr?i|Axxqep0csnuTblL^w$x41<_P zB)6wthQ{W3TlB%cWk=os_rXzg4f+1ETBhvo`9bG1^^X3dUN=PR#s8-HQM3#nOb`8U zvX$H$Vkb^;gueC@arLiHP7JaGxiZD|J&)4D3-U=^&LVe#Pw^7EA3o-yZdo|(^uyUU zJ?91dvAJEYV(DYg`t2nVfv}Ms!%n9UE?Ga{ZNUFw zWb<8`k50?X>@PQBrJu96Kd#|G+hLSc`9ovH89tl^6QS)zo$OIQg+ z?_S{N9)`)vVA#j!orQ#u5B&oSqw?#220Z+n0ba$1W1Xksvc!EV&qwVK5SIz09C26p zgN|pK%ti-)$?;^(^#|(9geDhPN;+VV#-yNh#EL4JotK@6aU9?MZc0JrSBx_^6$88e zY%o_JxGzb7ka{4Pr)G|Nf{TnNTFT-z0~KezSHRBKqT=QR?6-QCsxMiY?h7-QR>~jG zlNWbI5y^0!rdyv&&i-1#Q0fIf(Q~YW(8?HI6+1xY1c(dIwA?=fm}QlPm=d+;e|Y=` zXA?aQgtyMN=^Nyk-Yh7%9{TUUzVfihDHV)?tE7v^9 zY0UfBu+yj)VK31}rv`a)syj_-0%dS^#W1bbO7Znf|0KbtR4roJeq<}WMD5XwC^7C} zpS;%DhGK=h4-MNyr@RhP5>ShWVOiD5t$|xlaF2dUh;NHm+H$A^_Yl3ufp_sk>!43|tFuYekO1m$A}rJq znHia6Plj0+61!d)JP)z`HlORmJ|+{Z9LBn2 zcffpcx+e~x+0)#w)1Jy6FDjw%xJb@*{GHgoXp`qNX_h0=^YU|5bOtO@T;Z{H?9Q zM%z&?!Ro8RG1r_hV3I?MC>6)nG{{>f7}2>)EFk5EBC zpC6j8WwP470CPOtH$r;;`!jKQIn?oRBgVO>hhNZGuf6!!5d*nRUVx{m7+mo~Ca9NN zi}lV(AUlV@xJRu4?Wc9{bR7NdL4Dg4v(s`FlNWg8)-Stb!}rrC)_9w5Y6pmF@&l>O z`+qufqEqj@jTF1C^d!EBF@o zn8Iq9buJYV*H48VRN|XG(O(QK1-FL-;D>b+XCj2`07(MM(rMYH7%J_SRP5Sx*1*jt zna~hPuN#d$571V(jX?W*fbNw9_|`gn+K35A(7o|+9`Jsern4_Aj21Vr%)xalW(|L$ zLXxSH-}>XRlnE^@4xzR&Q_cT zO)_X0|KwE1HArlGZ2V;vUasf|NqK3ku4F;$&AtjfMuWE8<^Q6!%k?O)CKm(&q7~4C zx(d7G=T3J2;?Xp$Qv+D^lIDfLX?L*8mc3E3mdCw%*&H^70KNbqMNN^j(aq6VmuVjp-7GstUz5z1Q83S6G>Qy>hElXG%n=Sfc zK?0NWv+oJIRq~`6YZ&|r;9ARl1p9ELdVBHZN0tn7s=~m0MYgg&jTjE z`gz^|`uY3$^P-HrxYN^j=)+hfbPw8&Q-r7mBa%Wsf@HyQZ((7*mg*svIQUr~teSOq zo#kuRD%nDK)?svsv2lRzh}TIK^<;LG1ghC=@(j6Z@f-OlbV2P@h6~iflFqXR=7a$c zLlnO;EYOBJB@;P+M^yqywBDJEMqBqWBS?j+mY)~}TgB-^0rW|Jgu?YD#7D3c^Y?LB 
z1N0f_o?HKFnCSrKv=SuiZEiR96|0WEE$bz-bvh2ssx9Q4U6 zBK0CURc&M8%I=Z;>zaJ@!BKA@Gy_lT8cj`1Ke_WX8CM_w#HzvP;`sUxV0Edy@u0vR zepS+1g`wB_fbefsdf#>q!r6$ z$yH&UNwnTxl^8BT3eq`6--nYgb~qM+GsIarF563lNx&XEESVcZJ&0NyToANWwzDj? zVfYC(lci7cO<#|UE2oi!{qdV$K3}UA`zEzc*8p(C0Bn>^+QSccPd}##g~;;?l=tJHr6sgh884T>Xn3L-jn4rhDl{zd7PF44cZ#T>>0`N5|`88CR>WmJ1fe_UCG<6Jwi3dL01nx7F>f8P7IB?6(ZZO^Yc zeG}KGUVL>A7&ws)?0d{Nbpv#0p(Ln&^#3)5UfBm-eQ%~Z4}4E66@EF%_`0%_79Bi+ z%~lZpgkA>HAK*!Oi+#{9R{LSF#e(NI7t_N!9))YjBmW|N_13%vKJ$A~&$~(k_+dS( zEKirc@st;y+W>_^uSXoLLa7S$yoDe_uWazT;V))8!V4=bPaczLCL8Mc8gyy{uaYYK z{)qTI`@%DUns@hI7Ee0BhXMM<;_tuobH=h0wqb?k0TQo*-D$WjnOiBKRy=W*^)q|Uy-L>~ zju^|WQ)n&`>Xzxs^0Ssf%rqR=imnzUZ?Co@v5Nf?B|P#mz;fsvavt960I>j`{ZcPW zUhQ-gc{?FSeWA!F-iz7PG?hY9_juiqzC$%$qTyiw{8({f3AqdY25-BHv2yZ`0+1%N zgZ}!Cl_*WRIfJjf80$6D!9Ku;)9!*euk6or_pD`see=Pxg%)n(s^YhxChiya9fQJ4 zB0hx`n1`EXtfpw_9^gdI&4zu`x46`;(E=dfz}XcNxrFPzDFc}SYC8*)PH!KMuT9ls zEMzgFPo^zrhb0Sm2G1IZp;>zy1E|qA@MYB*z3Zs&{2+Y!#9JLPLF19 zh;7yF`(Hs2LY`9UZ}i%`S~}!G1r^|kN7-UoMPjejx~>(>>)}(g4A+zKsKq?d*+oxx zu)bl@4EDN+v1i=#+22$aDyD*L48lFx4p?>kT1S)gmqw%%U|J@&m0g~B81EnDRORr~ zF`XkdNik83Rm5~lm99g;;u6Q$zl{T~-aDg;)5_|qlL8(qAPZ1S$j~kmBh0z`v-~~BM5>KzxsSGK} zl`jK5>ZcE;jvxBi;G+&J3w)r)Hjz@ja@|=!rnYjyn=twXxx&l>y~M)0?ZQ2w{a@}& z;f(e4(E@7cr@V4*3Ha@snc7y}&xs90ArwF84`H=y^fUp#vewZl{zwa5R`F2+<~#?@ zbxtpIr^Azdh7QaU6mf&Y2svEeM+7_fpp(z*Zf%$PN0$@xW2oNe|&|e|jg%8y%Q0~*W%kvZ0mr@||=O2?_v>HkP0dsrco2@Q|M5_iN zVM~h$;)kg*;pt=OjlM5h-Y6-p2DOs3d)0vD{t(9GU>Ah{EKgIqJe-Ovu zLInQlj-IQT#P$klg-fD!dtv>9ap`8)gtxr$mgbqrK!*>sbUZ$~2W)q;(YYuCE~qPm z<~8;|zSKl-gZARM5K|=Eq<3k3^jlN0V@NAs6fLWNS`z4FnnPb2lJ+v3IDeUyuhI{Z z+`wZ7VJtJ|Q|dRzZpmXKO9WFNA<{O5AEO|1p}~zmF{oA#fJbqTJ@HUOEXD8bo|^7R z2Us%It;StNUHA%@+7nNl*yHcVcb}87SU2%~dV3Q^1FsuEXc652S6r=XG8R;Ggjr)- z!#3W_s>W9abMmDZi*dyezxSzbyT}#g+B%!#3HdzW;&j*1Iz$o|Y;j5?rYQ{dcrcn` z5|oOm``K`*CyRXUS*}|K2^YyUj>nI$xtPD91_%U{$0XPy+-O3QnNOSSl$9T!Ld-N@ z8NIO=x*hC3B>M>)WCUha+$`TXw%ARJ!0UlCwGK=51brC@)SL~_GdGsS 
z#g+S^Yu{wuIVDkE_u-ZkCY?!qCi1TTZw_Jdzd3})$PZHMwYv%whTR9w#sRm>CNEDfqp9FeZZ->0j|%%6o1F8fI-7;QP-@#48lvmI z$Z6a2kJ2^H-0vOHCl|En{PM`RHez(#<8;%q%enZ3l-cupFBO+%1#F#6UcU8GJ_r&G z3MQE9>l!}N&F+*$<8m@p`PIu^+oJ85eE3F?1nwHpK@1o$q=;ZqBb1PO6UVGqu0Cws zDc4o+0I3$aTnmJYWmw#=-D57^UlG(IPFgC;+6}7Q6GWQbYzo0#RgM$6?ji2snQqtf za0RMC=V%5O9Ab8{#wT8@U-x=`@3~lp_ay(#u2_6Bm*M|_Y}Za6S5E2<%UjB!n^Y$$ zKPlgXD^SSO*+&UJu0d`IL}R^1?<#C}GoP8e=ifnhMu7`ZP{3aK^$+*ElW|qtl9R^e zPjrg-!c?}-@5_w|Cz zab3qhP{lp}nEah8#qT4`ak*QCVb^<1bW^GIx5K#<&k^4T?V_u~58p2U{(*h|uV^v= z!9}!BY!*$K{;@8LePK|b1vgN_MpcD6lS&jMP{N`-9#LH(d2M)}DNOMD`bcDYT}JsX1TedLW(qob-tX&|jr^4t9p$y9~(8pK~ReEdPEy-VTManj`Js!Mr_ z1$ttP#&t}fems0m`tNI7M``OYX>nULpN_%P4JP7Zddw(CpQp6J59EtM{xud(`7ws! zrTM;19aMP(j;w>+viq@b0>GvyDeKg2^Ahgjbn9nH{h#LWsK(;a%wT`Fk32Mz?vhAe zUMaf?-|yvJZzE2+vu5{yYu#Q5vGG+{A+L2$<^4~CF|=8Ec}b)CP1PCT6)(6g6_dx@ zQulJ<=Wl;{gbsSMh}hBi@YaJ%P!52HttI!`@OR&}cJA-8yww+<{*AhKvmGvKw)?ej zAQlxH${l_;faZ~gaw7v@otR8WBXMDJ{ky__A+xGYb>;6%N+MoPI*zu_W`4>J`!a-h z>}j_7h^~yxE%lY`*TSu-KEbJzYd0Y*Pr35nZC#55YK&x!?;F2p>R$Ftb^hp2@&Aej z{uw{bNL;%nr*XbSLaU?6&-B^+o4pNz-Q3blIXHW{=GwphJ`+(lfGn16uAuPImJ=H! 
z2JS01WrVVTpDK8SWz4<;dTIv8tI(Tg_w2Ru?(8WM-g35j$@PajtvJ#oH+1;F1I8zL z7O6oz>aDy8*=Wb&%^tHL=(_5JuS?=PI^Vtzv^Fx1VYK%dxf*uSSWvlj7pch;DUuov zuv#sGS zV>)S2pJ9(Q8eZLNZIb?O9&E=)vNs9hNAzg$Fwtx${#wS8n2W%Hv+sj14Yu9Crxw<# z`yS4)*+saYi69}rZG3i1bARyEpN!ZBhIzCP5A~`|t&p^Q^@ZZXj2R`ttFM5e^l~bR z{yj6Xh8j84FGr^fnQwUGN^^hE_s~J2FmWgv*V`O;9jl`S*2%k`W`B@y_7DSNw`OmP z&#wZa!e9G=Ywt5O_epbc3~m^vssoR65GT{dmm+X6cnZ`YBAKAr0IUG8s^2T3D+hYF zu)?^uSie_GMi|R4=Jv`jcJ)R;7vtL^+w9giq~KxnF<~IFkj9fA^X}NOfVp3UrT5Wc z%T!_;ZzRdIz-8}$IUP61cE1?scd|;Z^YWtoSkyj3OMW3XWSn%-`Uyt>F!wTqazd7t z*<%0m{zeIUr&cL7Nh&&#MkDOr=shUVz5XJ(!o$G!fpbU@-fsfy_<`2F8U=_{+> ze4ve6B>lfAIg$< zTRqMEz1MSt;g%WnkTCbzpzu+P>!NPmaniAE zvpTkI+qR90ZFZ85opkJUY#SZhsMzY0??3;!JGc9yF7{Jv&$;Ft;~k;ML(n@5&n3!z zVFXStJZ6JjV$Kb*SAiT!;(z=b98caFUM6j=|4s~p3zWsFVtABx&EloB)=$DFoo`VM zQ@8K*hmwAh3cqKxntu0(qcHK0@R|5o4MHUhyNi7H4tan4qVxQHFna3G#y(c$WByEg zQ;0z8zuMYlrJHx}h*tCj!&wELZYfE220Ln+uJ<<6^_vmKd1qqD*8QJE>sU;)d?D#v z-yLWj0Oq>VzdKE?%`CqU89FcPh9ZPa$qEo+2N5BM8a+Yie+cQhS;*@ZH%Y(UPNEI_kZn$MqE`(Fd{#|G4g4}u@{rPwuF6m}JAD$lZ(8G@9- zwDm@0^TdChcmybNU8PQK5k<}^a_v>Lp`YQNTK1+_v^~!Al+bOD%(?u2Ybp6@5EgrK z(aXj6=bDPON=20Ke|Mpp5CdfQZp|#c;dop-?7fhReGtEKqDN*a1ZH_LC|*X&1Xc!+ zcsGtkSzjQ2WQ8G(tyXL@WU*JQ++Ppa7u85MY4!HA#S~{j2B^i9&F~fVMPnhibG>HLa$FOX zbl`nRBhJL#jhVeUGH;W7V=llJx(ERYE_+JHEc$>oOq~IZ*^S5G=;d%}E zH=-0nmrd8@oCSLH)XnsfSEww525aMA0(RXV7gJ@5yc+el3=TJunXVYaJk zE$zf(NaNj9gA5)#6QD<@;t23={ExWCdVh0BP8_lUGPMWUNVXM<3jqxz;j;xj!Cp)$ z^nAwGd3A4e4CVAO$P1SA&rlxsfU?gxM-#g_@lI{vx(1^H|8Jha}?CV8K^764#j zmQ6FJX5w_)Ep}YmC&@KA-sl>>MOE=ZRxto7uUo<6nk?=MLsG|C z2($zn6lFySx1RsYp?!`=LaxV}HB4hB>zanYo^xIzb+5WTx?a<&wto~bzWd?YZV@3>;ALaXlOJ`0js>1^AqH#u=l0M(f#1S#*b_LGcVj=y zpw}tZU&eVJ@$hqg zor9&mx2M$^>N1<RB5Dj3$B4|@E~q;6hNUE&_^$VY!n_gj-z>H~=+wqYD>S!81A66aQPT#E(>v&9_M41~ zZ2_v6`C6Io6S3lECR_BV$$TU)A8O`M#C8Ocl{%P=iu)^j(k{V%m7oKj)6qJCZi;NW z2yeq~vt66{lu@8E14AW5^$$d3Vhx$N2{U@BI7XvX`O1-$8Y}qF-Wc zYf*8*Vd9vIPNVhrQRptO^%DX6Mr>^&YuyKO8{MVWxrq8Vmk^H^XOQ9!!x{NUFi5J) 
zDQg*N+2QYiyBb=D?(wTRJ~Ln)=}F|t__aX$$$V|jI|Iw=h$aBX`oZ}E<-u9_a-nI3 z&o_Qz*I$9_Nij9EJ66lQIDn)}_5%mrOR*AAl;Eqwo0~gFk!FgL>wRRMc6E9=q|>EW znGyCn;yUO3^@-vy2s=Jr!&dnzeztwGezM&5^Asm+UtzZX`=HOaqT@xqUqvWq&&yQYc?s_I=dh{N9fof&G^l=P}k-)DHPa)35l!Bi$w~yzSr> zGedtlK*&wh$bE9?`Ihf&n`6+-`Sqcfw|}TX;r__8>r2Dkt@+*n6942r`Whc^(?IRv z_M@|(rYSmO_LN4Z$p+ftwU@ZSWed~4DvktVUxYwKRwm=@{uM|=51sm5#W$!I)%E?u zHpAp&cvrty=@%Bj_>H;2uLWwp<=)n9to1y01#X4UPPa_p3zwg^X+GMA3d8ZS2*;}_gJU%}Qy$1;|3=fNL<$aZthe>VT^Xn-a+=l=?JHdkxD z79i@b58x9o1fT8u&^D}l8-ZHffkVFXGdaFiS%O!i9+PvDbVpDwenhSoH>kHD@+REy z`=m!(phqSD%$$RvK;Da94GwWx!p1EL`0+?;siGGXSVua8 zL4Me~_?mOKTDk$*DbMNP{@NHd(_Qm>Xvx`3(?v?huLV`wG$xs{jyF3MKE=M_Y0tlR zgaad%{EPY!^*Ed;Arb5UU;Rg*=gJT73Al~o=&tp0r1;9fl_z!C}uNo!Vpm` zAhvZwy%>7j$|-rae3bN_6b5g4uERx*w;lxfbe_LRy6J}wOG*KCtKeLV;7u{Ks*Ee` z$D-Ku7y=y{}CD z!o_WF8KK6A)$&sPntVl$<_L{OF5wO{t|~1XkNM;f5;1@w<#-&7ekOeF$s{(<8{wYs z&csX?S&=rl2Wy68W`>gLE*85>pjx;=+b?DeAB{n`z6@^9RB0oRg~St96tPo@{m0+O zMWT*bS>6g=yjEkrSQr&C(cm zivGPJg3TtW2SULWX&AA9eS^tUEh7k}VPM7bR zdi|@qck7)Ru5?V|LkRm-fU=tRL3C#;S9C$r2Km9)zfSpuK*i{C~FEHV+SF;24 z8|=M-Nv-`s+|~V;AGo;jv>l$re`As;fxyBeM=tx3myTM;v!y1hzec$ea0kiq9V^9KiC5_EZv|;yIM= z;cC+8Wr^1q7H@vyey7Qky^BM9?zIEm2!VJKFzZX-c9+({P~U9yFqO2XJ%!BDkOI%u zfw>wtC4$k>D!m-TaVuaV23Y1|vK9cWyS>9c>qrTRwVa@O-_Y`4E+OW~zS|;=yMCk4 z-IT1%Z9s=`;4R{L*kj3fmJ@zjOnx78N9pC-T@0`moz`rVg=t4~Ro=*Q4u8=3@kf z%LkI(F@htQ+ihGT+oQdC+^-Au#l{m@pg>2sHf19sCqWcw)C--&Ayt;<2-xTup+fkU^ z%~webM1lw#A&2t-n@ZKid{nY3J2jYK8J#{QN4p8TzHQ97DNd*C06;#H^$0+BHh%eX z^YJ!Vw#HL^K$8Dy_f?(%81FPW(lXL&tL)>u{s`F-^-!Hh?Rmngn?B9&@4~x{GEYey zpyD(w@`z;qzS@!2Y`S`v5cy((x^KBgDX^zMU5p-1c{p$*FMmW#-ecj(?1~Z~ajanb z-Y(F{U3e|^a<}{HXJPr6#8a4vLu^_9g7X25jBgzOIAr;toU6O)tW<*nG}|^z{aFom z15?q!haOh*Np^lrv8Mf}Qad*f)=aF&yew=msqS9Vn|1zex{^#hD?B?tt|h z#=>WC^oEiAi}4YI0G#zPJG%9InBmbz0U}{&en5%lkF5{Sj)yW^6g4(kRRyhdihl9b zB%{KItf5`oX^pT64w7s+HamM6y?SvFKy&$d-}|y7yPQHZGFDH{>r*j=HXAh(;n~zY zv>>2b!H!f|tiO+j;w3JwV@=ypcIm<&K-#t14LSQ)P-&bjlpqpmtVJybatIkCqYK`Z 
ze&jBaDS>?^R<~h%8$;0Qb?zoZm$sYR;lsYD3j%xiPqM&qP7|T)@TWxI?`G(2hR;c^ z8Vv7uFi^_>5(@ROC;`^+N06>t@mX%w^y9+r>Rc-(1rexGUd#Y7{NY#suBvD~I3G^G;DuagS@BMVHI~ebYI3Cbu&K z?vYryT{yD$EX;;BCAtOMX*?MmE#*+nC&}K~dZKi`S_a>*!RQ)OQbKHx*_6H^bdowF zyZNTt6)$NG>k~cbA)$I0hs6Xdk*yO0ypx;Y@3=(`B<`>PE5z&G`-AuQ=Zz!WJBGVe zPh@xKqPuez!&gc3iqR5-NzKG1$(r&G%pAecvuk|QQ8 zY}hMo?Q@4i8CQCTOf~_+@H@r>K+FD9JxRAJbx$v?^+Y(%C37mL(q6mY6$L`pqVegq z=T-af8@Q&=R!omhlMMmb@{Db-lCPr?o!Y)!dq@Rd5caFEWkGljT021*r6IWBme1h* zJiO8{0!E{CSlcgtaxUzc z_NxA6mfA}ggCnI9U_MExKs}uu+^jGlYugoDjzk(C4U+Y`<({9iW$epDZ$(WI7sjzj zgFS>$;@xYH%4?1^j3pJsvhLI>Vi`F=@+J>D*@>S2eL<82hEaRMmn4hl94|wmg-wAJ zRiGZiMOAh^jU;yAH2(zXlBbo2gUV}sl zC;@o#D^u+jA}6rSeSIdXQgmj%za=W_O9*L}Ad(KGWo5tj%e-!A=BIZfvUT7_p^{Me z()C>oGnI6XmUo_~Yz8z~CzxEAITNfokYff9Vm4P4yDvO}Q7ge0EUN{tI|Kz&o&g8bN!jqq&<*RmG}YeHQ1p&!w;M3k8sPeCeA71x|-? zdMI?_71gp&M$%Ghh&1~torB-}q(lEBXZTC6aCaqbE}Rtv-8g|CAI}e9`MnDdu|jEl z;(6_q6q%7#Ap_oNh)xOm7<{7%zvYw4yu?FbJaro+#j!UfJA$F>NIwy{0&g zTP+PA#0_m(&CEu2MXxReg`IPJx2R@`?Um`91+?s%Ab zTpwxqNtu>J*m4pvIf(&_2{r+2ZZYf_Rs{o-4196@9DDo#Tc2Pv1M!>jNSvWt6Q>QFi>5nab@10OFppwYL@cU~oflJY%^&jU30HV=#We^mn+1)uS`(9JwfDNK3FL9xnX2cx?(gBf($P5GRJS8oa?#V z$Bv;P-l+JQM1Jl+IQ$Vwvjg!Ip1L2)wOYUCx%7}FD(~*q6THO2R+#WC1NL=<=c1XT~$WqXZuuGor7N!Y`p`m z*ML@FAELO%Su;{uT5?`^==Eu$5!F~2rzp-(mVRn|Dq7Ogc)g5U%UcEk^w8Go$3un? 
z$_Tg16L;=>YC@Va{o%=UK^S>dDByvFr;5J^6R!&-SH5ve zzNp|omRc@;Z&W^pzs3DA_$NW(g7l+oy=}i4jG^5WhWI2YygMtmi+pKD4x)rp&c@*> zg9GY zezBge#2x5o*&!$EzZ;V?$82`2XoxBOlBR=~$xVbu8s<%cjfn$A#Q&{{B+=Q71%j=xL z#40FLjPuhj@H7_AP~73s{t{V^Iy14{K~HhD6^!gEjNxl$-vrD$SrJ!=6t}di<-tH` z``mCSYL~b5+3xRNVaIh_;mEo(s~@A(=n>tv(C#DhPvp4MYwWLp&)cTY=y?#8f99r|r;gu5UkjiA=>OdMcj$-iPKDxLG2j>+P`k$7>{wwLn<|~EQQMW~| zr}`QVzDYd7&%X&xgpZg=BGiGiM_71(&|}T7#5w93m#A7bGh$RNUW5_Up%@}`Pe~XT zRBYal#Q)ro?ACH7lp)Y2U1|qLM`(}FjaA$NN44Hv(Z8+dZ`E7rkS|ElLuq7Sln9=l zNm0??FgNl}0{Gy3$MqXU28}N(h%VC~js+S}N_HdEM&`>}Tnt~eyRI0sqq`k?K1?yD zqeTfkPD+L_E1=SDN78iroU$LyG}Oma-{Hvf*o?4s$I88moH3jwcV+da#EY3pBkqcS z^|Mm?nQns+-~y^AH}@y+T1y(cho@KaBGjw*2PzgLsV`m9()xrk{En&T<4SIXzs)=~ zBH4@y<)1BjlTu&Kc4Ray1=)u5oX1X1PpKD>O*k99E(C>l1gPoMzM-`r#XAx1{{Mi+3_68(8;}tC2k>RBDqNCci>+5Hc%}%U9junWZ_!{;t2i3);kaH-zVrVs>pONkR zzBot-M*n2iX-xZvGqVx7IEh3YGfh%_J~*!R34xvB+5s50lGR{A?MSkPB@5|ga3)fA zaG{iZSHgbIxx?np1Q}bPKqxV;So(ZJW*9n^$)*S!JS7Gc+-&NX2W=rZC4oq3z5%LM zh1L6h*@_f_HOFbGNcIm3wHXac=q#!wk46smYxnu~jN z<2Mp|SP5TZR7?|=%LinW@^scZkAjqKO7Jjeu9OFw%>qPN2pNkYxf(gitasA6Cy~`p z0mx8renZA-914-wZ=dDpBlj>O&}wk&xE)DJVA=m_Qcf4}W)Q)_fib}j>c^CvYi{$X z&h;d6o3q#h`AvqU#zhOlWO9$9H-4fTEBb;Se_4eKDfvD(J$3xYVSSab131Jv7gI_Z ztfNU+V}iY4KQVe?trrchf#YQGc_*eUh+uur zDhmcTs5(&#;*k2YDBRMbE{ViwAr^rmqcki@jEUa7WWvHyLJV>+5u0kE8c7Jx;uLdb zKXYY_J#oVr2v-~pr3_4cyaJZC{0b6y+Zkj%+eC?pL~^nBq#+@-^%`j7uX{_iz{3Ry zc&aMGoZei{0*l+j>QwvHiQ=K){$Uqbvm*1GsB(%UcgUGh0P?-eK2WgkNkw0KmwL_Y zQ}RL!_>IL_pW)SnIr)3ELug9CGUGX-EMlW;XLRe}sCrc|a)La9-8;Xwq0+)S1rG7N zYGkjyWYB+m#(h&f*I@AsIMA2-9r6;`gk7ptyhqwI<%l#XL}uy5unNU8a_!QEcEqH7 z2=K~AtSebhXc`TwicW;-_)iZAXA=K+z9#YCy&&PtL<;e1m%G?&CDl990)wI;wwCt- z2Q6>(gP8c~Mc9$XC7OBun&sqL?pqZ!qMQiH9gYYW7lH|kFwYnSGNY0ccYS%E*my-4 zae^gVWWC=+oy9SbioI@;8uSx4=|D1wkNJCj&{Of$IJ>sVdJEztEE+1l8UDgFn^ccF zV>a7amFmLcw-_F&%mP{NblT^Ex=!r!@oZBe4;t^~By2k~Xvx6Q+_}&=a5$7SE8{Iv zaoiPe#P8+T9p>29mxE)tRAA{$S7K*&smd)e?N~t~sWcN%-%KNe>r%{joYrgcoP4(c z@j+f{5OWP3e4sS?Uu7w)p=F6?DnYJ*sq7!kI-tI)B?admz;ocWK 
zsochU?cROhN5BNRnof;_aRJ3|KyZ_PU8;X%4}Uf%J*42g1TpsUePf662Ree1IgzwO(BY*s7b6<6u~R@9cq52 zG?`gn%<9oWU!i3>R-+0p)}`sM!z4nTS~U=v$bds?!PK_%&Vt1> z!1p|N9NZ$fH|JN*4}|jDHwRDl*zb)etf`l0i-UzQKmRUh{&Zkf7JNq`6L+sTf?NeC zI*4uVFFsvJz6@jb^}V(>FB7=Cw`R?`t`qab0~&tf7>+mOTAP7bX`00Kcbxbo-c-O- z<9*Ubj&O{`2n4WOq6L9<*W^ilx?qO#UqLoGN?7i56gA^p?vUuuXv~8EDMY)@n?}$~ z?e^l)^roUauSZkL_UD$zSoF)=NSmwaTYuTV8h03+Y5h5-kEG>hU`j*d=5wZ?!?L3- zqO0dc#`f0d7P0l4ZT}}OpAxa$eD&7_?T0(^%E#tm{4 z$4t&MB3|smVJr#(oaY@Uoh6Lq=$36%EVnLlNtUR^yl{{#fi@aFhaw{>w17zIu=l%G zUmR-`DPbr6%6C0@wn#EHcNAXoKg?o#L@xOdBO5cOO?)2YC;8ub6faXRJ;Y?u?kUI6 zTayz+N_Ov8Ci`jd19xyqp$K$S`$)3b@K~W?O(iDBe(J;q2crF)8sZpy(|}m|*^m7g zXPG~a=d>SB#iK#pqzZ<%S%3gRq_tT|45mstzuBdPrg6J-WmcC+_p3h*Lr@VA4H|>b@&5e(;#2HMYmjiO~SSwC*viM zTP(OJN^kniX~PmSzMcFF;d{aI_ns*OR_?Dm=Ywcshqji7WXV|h8{=6L&%6#c9|en1 zh_oW~Nh{0)JqB6fUoQsu97xPjp5 zR-}X96ytDdsI?IeRQx%4a~CYVGc_bV-cx94=NLU*s$~ZBE0~=%2w;fn!tPaQ}a4+8nKP-)5SU2Jp@Q`>jNG1C0YIh z8V`5{0Yn_N-Q_!urm5GABiMoXpxKt&J=yRuP)$5?&0rlY(1|~x@7Gl@UgV7LCc6;{ zA@O;p%HRmUX%vo4Qn_$23K2X7V`d>e6*NleoEQbipUBWLTzGJ12XnT_^{|mF(m$H%ys_Q|Lzu z$D|E&z^e0L?hUi4?c++H;sq)!RF+n0pyc42lQIXzW(B@k)d%YbMO)0on{wFP^5|MN zsC0F>N&F&?qMPA`CY6vT45z_u(Z)+{`2CB(R~p)d5~F#OfWF^#9E|vfT!_FR9SjOL zcQQg$uNP9#M7D8p3T^TLu*Wu^>iTW!S$|SUHTLMFW1&cmbTip{N|;v_SW=gHa!ua~ z?9L%sn;uDu*r!>%+`c9b0pFq&m3}a-U^&o$)`6quJyMPG{0H@WTQUVxzz%tyL$c{Y)$l$=op+?MWR=V-*tq+sTQ;j>1tpy3)}tyXmu38pRpVIEc=& zi$qAC4HvNvRsnX_&cxZ^-<>NhfVajP;6SsWoe~^PXEg%%;btxM%v$iP0iWJp0)*N* z8(EoH$cq2R?q?~4%BO1JTyT7Sj!>&DYgw{F8-oTc{oU{31L{qsfu`x z;}(V#L3`9Va`KHpS$)5b*UYWkK3s8mvhRYjst-8XYwOA24N2P0qhXmMsoV;Ir^|M9&QLS-GZh`3mN}B)kG^3MWju*lb(XkwKWTChH_;XC zRaDlV!#j0O`9m?|In_awhS+9s>BV+Ym`$+{33c|u#{Ra*9$fS&*mt**Y@;8Od?%DJ zJqvW+xaevFKtY@QT#$bqLa{c)oJv`3d_St@T9iObQ?Oq!302GMTScdD$KS~F(Cg@4R+JWV>xpM-=jkd0rf-AfAf}>s0zRu zTBY5Tw^Idcf?OB6*r3^U@wPNyD=z`WYR8(!h@K#4z`5_h#=nQ z38hmhvrOb30k>e(T=LS*uik2I-ukkk=(alK)P}51f3(^Q=7z4r-p?@i=Gm97t+y2en@Sps z>?&}q7mDBV6?Q2SIcPKbHeR3od4_Spc8RxL})X?bT0imYWcH-tj(KS8?1)Y5X_x>2Op$8ga8YpAxly30EZWwsZld3GZ| 
z_5!`8rVI)8z5i9t*u1sZwH;pKuPp&DJ@x9Y;8%d%{yyU(?k8c(j!?id zls>m8+=51WlR;ow;AhWKzYb!{Nu+#-M$HC%5JKrb6K9s7(1r1_MH!GA6AQ+TS~-QWC)}^` z?p;mspmzYFa>rfap7d8QH=+z{3!N-b{%0dfJ()jMq>oU#i`;OWuyN#L<770QZ~uHg z+-63i(&~ZpA&&Aq(wu>9ue~^i`iw6nQ6lTIP!A~SQZGK1X&>m?tpZD-|1dK_@}}MA z)OvuN7e)^5)KLhjhER$nn>_jbj2oXAP0|Wsbb^I-9o38|Jv4Uq^!%9z^(WD*ncrjA zVube)Y|Sg9PuM7SCtDX0=xF00IDS-hb zCD9VY$V%r^t7Jj!01!jG&(-B4`PM$X111|=O$KV&$tEgXGk9rPcanlr&SCg*qkSGu z!zDkYxlF_G!(nHmg%@lgGmVY3Y>LpJMN?Qt+-GDv&jVf&)l*4d`Ro;3I?RH8lNzNfSMb8?@9$iQX0F{VfyPV zrX3=PKCzx}dH+ULupl5?p&~W&tDd>JX!Zb3M&z*PfmV%{{*eZwcstgM=$lzv`Rn*Y zlWlvHTe>uULn%#fG!fA1)m3>!=*vaS)whA|vorSsn_wOFB8sJo=`YM)yOC|urm|YlOwG|Se;9uw$}|{2DcdV9&7>F)t!8pkoN2j z-X%Z?aWcUD?cxX#RFP9*JDu(E{Ft>?GO^9kvhtSH(_RET*fR%J#4Fvq9yL=oLw5mn z*W1i>eUD~E*9-EI3}>+>^Bn#->2^Ha8Z|=u=!4_ z<7CU@YC>nn$NiEj<;Rij=h&k735c^LFJS#`txXr|DzEb+q8XMyhh9GIqf^A~`i(uW zqlF(8W}~U){{0bH%X{dp&J62|YLovDcT>@H+{_>Ej)b-W=bhCeDoO}ZVR$vTXofSM zcqT0tr$&RxN~YE|t-?!BE*TbXYfep@$QI7}jr1IGi-7~iC5edkhqgrsMQU9eT7d_8 z5`+buH!)(7UO?ZDv!NW3r*)D9dhKTYtuj1^v2C}NAX}jQ4LH|4oh?dc3oCm>FDj0Xe`n15s&NN``tRGrrzB2R}`BiH4)9QEeUW@ z=ocL*anxR5UiEU8Vv~yrWsdkBDinD~bm+)Srt};63wgY={DH_B?3Qx~NlM^sWykoL zz;96~)QJHUPllm%cTq*OQKY#!5j0IJXNeW93^YRx(Fb*bq{Zm*pTF@_#WF=?8G><# zIhi3h%)ie?4gpnA=+L8HhewI9Iv>6TX>7~k?y(a>hdC@ql-uc!|HihXT#Ht5#5Q(` zJQXsHGEwM|LlEv8iHw-M(|}lXWS0uP&2E^!IhF}I6Zw~Uzd$wJ0#+;XqQF06cN4@K zl%k|Tw%1^cCZNE+l&9(_NyS~#cr{T1si*AbMgp<1s)#8rWaP|^y96V8n@D5m@nVin z;|I;1JtXOGTZ4oMYe0O-Lx!3ujQ;)V%Bg(=5LZE`J-84?9X{$B`kh5coWQC($VnzhX#W>fq=< zG$>58zT7;c_jvPIkL>UFK8l=nT=?Wy{9)MhnskY94)P3W0bUmcEUfMSbIpD{%X1KX z2+CTct^GIlcrNI_t0j++iQNO;efM=g+g@;J_j!%Zb>Q}1`uF@{jh(OS8_;o`ddgLr z{^2LmTKksu_|x-aQ0m}Mo>i9P+bV}&pOG3RU*oQ8;#hI7>d5oE^1s29jhUT zBU{}#n3YnOH_~mEwbWZQ&!l_YCX>vP4D(15FQetPZj z%O{^0#8obq`S{Xl32(egS=XT-pPwW()lt0OP)MK^ShEqBamO+TYo#lWKY!oEA0(!~ zid+VsR2%3!QXtgaYewKEx^j&7{9yR+;c+g(zlqTQdG~FS6-5G3?cA*cO`S5TD z8^kDo#BWsi!+@Kxfby zA7&55u7SRt3+?wW&CBg0XjSBwTthx^@E`n$;DNYu1=p26sjo3WcZq?y;-JXt496G1 
zf23%{q|e#^=D{>#s6(;%Oy=@jFhX+(b2w2jDImQ2p=o0#xhVOJ4`>}&W|m9M$dB31 z%5OIj!|*poZTDtCfkNHdu5|$$kX%%K!ABa6AopYYJ;)yn5^fv8dX%*a0Y}Q7YL}v4 z0Y=e_A{c(rJj1E^oz8-)BqKkXe}-q1c81#aR#SpB+w(v&P)<5xlFUF8p6*kl1=j>o z%RT(nu!MLKKpd;Zo&1PAi`5z3^f)~dHDHjfi9Z!{(8{#`iC)tkokr~#_X_)R+Ci-~Sw6+gbkiO$7;jkVTXy9|mGiRIVwQ&j58ek?aRK{Rr zxW>d~-$d7aI6SeA-17=F#XCP*NmJWoiZ%qYQIl7*gkD0q{W$@Iz3e~mGJ)K=~ePVzpiwV?iwsj&Msap z=2SuxCEqUNDGB{af=sVbWp#HnWjpbck$04Flk={fdXcj9xYFZC3SfY}F*5VV{n|z#)s;9ecHwez*`(-gP%sey!`r{Vq*p8vuQLlj4>1mC38k^`6H!{1(Q&)=##M zQ1dBPRCSS9Yz^cdKm8PbeG4%8#vPm_nIT(Nrm$r#9$JM#%r3DS8nNM_kO)2C6yR)= zPb4wKs*UQ^N}H^|BP^&(vx<2||Is)XcWtaWc7?wZArHZ+!@1_DPhM|YVe>)Glg$LI zu>A*%(AF|kEEgoxLTKQ19EE=D$b*r`zuApN7`l`1wE)rOcwIVma8v&t(odnFHXKV> z3>%2IFm{$Cm&Fk8k3jkYw1!{!P8c}jinswBh}%ai)1Fv!*}S@N2WtoHs0O&XuO|m0 zR{D12_>Ktx1wU>)|Gjy5f_v1QGa5m3oFuo9E$6pxzDC&Z8erv1veY^_z8%gqfxtI> z?^Z|&$dl!|&g1?(^Fy(IGnmB7=Y5MuLf&v!)f6yT$=5Xbcg1AJ$l$g)~nKOt+!u?-#0AFX{XB z%kgrzrLmto!W2aNLchO_J0mK*lF?PSC8z!FL!eq^Kym{{*|z~dl7@G=ODM>Uuw;pLY1og55zk(MVkAulMtZ>ez)65-)J^3M zh{g4hp%>`YDR03u4i>_-r4`J^@Qf}Bk}-~?^fVA=>=vZRjV83+_9ia(&}E4uaesS4 z)3fB)rZi>QiirecU+j(1fp~TfjegoB;QTmUu?S47ypR{ICU@_qH5pu5x={e5P*%1e z-nA=y1t530VT#m?|J;aH&3!`{>l3I}hx;w2z+#_rvPoRrUr4=vMNJpS@x@^T0GzUj zGfEzOb!Cd!T`WiS$oi0v;AD5__nq9d0NC@W&5DTZkX1f zOf3nKkuU8Q1XBO$dmE&jaTYiV{7h@Bwoazu!~l zYx;83U7R=KQpBYGm^@PyEUjEY>5jLq=y?D%0>p_rxyPn44Nu#uSIrq3mmP|VQJ3ab zxJ6vdJ$NUs6`htLGI7s73?T3k{-ZFQ)7(gBFE)I}TlwqoIsTl4t-9R6(IWUc3Ns?d z`y`rY*UDE{V82z@qW?1Xl(4JATQ#u_98lNyN%c~E3z-S{qyM6rXO|uynEC3}ap~|g zw%PShD~IIgMvNwhy_JLg@$bjEyMW+D8om1@k3&Y=_3&oSCdJM&jYBlx>UMJn#qH9+ zktO>F#DE`r&S&$Fnypn1wwkpLf=}V-R8FFgU#7sT!TNt%U0ZL>wj;MmQJNeHf**3o zttB5Xl7d(-mFQJ11^~~k>$L^nruFr>ypNp6nzY8XIr#LJ!-@3P64~@KofV{<<_6%7 z=D)h?HI~iaw=EuzIX-6<-i-E=#5}To>8%|d&D$Ule9wa`2!i;Fa<{ro z6Ve}w{JEkC_bCf%kK-rI&bi|iT-KPLF91mbQst8wJFtj=Rvph>J&WC3Xm+IdwJ$j> zE{EeDH$&L;-EI2*=dT{;mI1QwlH(j{hw=d{U7#~eKdEjZb|ujirPe&+6EaH#wlMf! 
z45MH)PQ-Tl8#(k&=G@c;X8ClQU(Ca&u+&Dt?q8>ELcbX&qH`pE;Wem{%k_s&&h@hs$T7pR zLQhmFEL3;b^b}lg$@3&uq`{NgJw~dz?Bt%7f7;|~_j4tCefprG0R1a{e6kTE4ePUj z6+a8tS!L{cbQY}NdqckrI^6-wvqMO-XM#1{{ZIxhkWV%802A7&yr9CCAQZ@|^Cur~ zY#9nnO_msfTd+&Y)D3A|%F4a2NG>ZSNdo6V&t7o1C9i*dP_Zpz38zbj>&xkfe59@P zW70SJSc!eRykS>FHgn{KFbGc9vub~+KK?Yistk+yQ*ch9=dYPteIt3{@3%Jmd8d*s z8`_o%#BIEGHNQR-y|0Gy(<~xbu$Fl2#8;$ zUpDj|xtudyWPE-H!?Jz#qU!rKs~~wuy`Q{!iVpVMnl2>`~T$0PxCCGH60zr#l6jy4Tr=S%hvmXjhtkCVQj%?f~ zpLOvR4VC1I8mTVS5Wn~}pyOz-8DE2L<7-(mar?dPt5>I~tnXg>$@UMw{Zs9{6PC0E zOHOG^Mkizh(FG^uhRiMP>t86wG>-gUtlhOheLf7L#o;vBu_lWp@=zuqo6Pqv(LVf)$3|8!z;`{DMamcowES=gs6 zojKf2hLJ)~ds8kl-8%!_W)^J&1w_Vuf-?gZ9?a<{VFVdRIa z{<8b=!l`r@wO6mm6PUN$*na6vuWrB55%q*Pd3(F;V?Uwm+IGv6|EZl<$4GCyu6t{R~1lG$w_P%!6r|)kM-1ZCYgq`c#-~Qsqr>EDsJD1&lQTx%~ipRIVXXl{n?yvks zyZ!dteUX3P`OAq$;T@;8;Z)R7bv)mx{Mer&)?ZnyF+nZ{Od*;8OqDe`P?%uoWxGL^ zmnC|LOnL!^G=s0P5c$1n%Hi8IgOrptA+pQXK(8OC?gEu5G0{=3#!6{tXO|Mt^aB&d zsUb>=7>16-#aFw%$MUSmml(ctPXHts{5y4?BY%(}%hHVxCV$ml@Cr4sgmyBN-fkdl zi9M>aqX!z2Lk%3mjO}(Ny9=={9CU55%rcJU8L>vA*_RwMb9+8 z95T{^Y5W*WSP*eYp6TAFh6c_dymV?H+kp-4r>8GLQB*uV)w4m1o4C^Z$^+g0@Dy#u z$zIL02Oe3_2lmrNQLu^&{%mp4mv=#5hCCZDc=(HcU(&_Xrv*mn!zR;(@59YB+zyf& z*uMT7?Nej6^wzQpe#C|Ca`o?*>rji0ci+`ktzF+f_lN(n-GAS=+E+gP@izARg5r7q ztIvvqY`3o&EnZvJMeWM}{?WFgx2ia2;fal<<*CbMTEOnD&B6Yvl}mMB-~Ri*9k;mM zyrEt5nakU@9Z)H3yKRQwZENj;TW%WDb>;K!+$%-lMq0|AibGaf6FC!rWG$r|2Oq5luQ7@+dgP_>ea0UqC-kBt;&FS1xHs9HfpGv2HE_Ucz|An5 zVqZzUU@JfD8hoFqOGn%0J|6J`!ra@2&t15$K!f z_UPL{&DIYQ5?9~Cj|{q>)Gkadar?*6+@~c$Ekya*i4S}C>{6cwURT9vWXC1JJ3A9k zzWO|jwQdt}3!L#FY_$`kbQL5DhHwan@Iuvqs+$^jJF7>Ny1O>fwYHg_jU|MfDR!3= zPC+CfWM-=uY{Cd2YHMzniHr8umsk2rR+kJ{TH9KpGa3d>xnS^~=G5)rj7eDx>zP*8*IqEF%I^oMwz`54D8SEr2^tQ)k3BADF2Y^d>7849;1-O(pUNQ0v;zp=A zatHJMsH2W*bLY-qhYo!1q~nV&+B?pX^hTD3h|Du1+q>E-T=Ot=XZ$LCAHrn^sS zn$SLpwq*r`w&eG1t8~Pa$-y7SeH>8o*`JwAGEF!7Ods(Uy&(avBNyc-QDP<THT2>#tv)%fI z-)sN(hd(%F!MRUW>DRv^HY{#mckB29_xu0%_6Dz$6{X@iN49yrEXnytY1whxo#ShB z?fvbn6OL#**I&^-J!MVoW9?n<+?v@AH2G6*G27< zSKiV#Z{FMkqhBaLZ 
zeHJX(9{i#_z=rLi4;4kRft8{W5L7w{2+|c$q9PIqkc9l_J2SI)_ud-_NK@}5duQgf znLRsacINEtEJIZ)~pwe+9;gTeZe@s)3 z`~?EDas;7wI2R* zC+{~3fBFDU)X(NmM~ZM*#xBjIO*7WM#Bds_VBg|M+0D5$=c7h~;WsjvHZ-Q&vi%n2 z2M*J%83CpjGjdH|YSPvb<*TZxVeprAPzffXIny6AY2}f{2(@)4fPCiA%zl4J?dbjK z{W8V~{>mCKP2oTrQ$Qn51cCl^fXK^`A_|qief!gv&sa3=&;S_opiMIntT^+d5YMi} znALS9I5TE3v9alpEvpO)6KNE~!y8i&X@eHwGQQaGlTm<5@5WX}juWaVC(DvLn2UX4 zqa+;_V?E0#G{-(DF)vF>T&DpgqH!b2}L1!x{r0eGUAjS3+-E5*#RWZB!rS ziw0r51YeFB?KS+g0oS2P(GLSxrl)=V;QNJN1>mD*zm2{v%r+fm;^h%g;0aC|DwZ=F zTC7ZeH^7%BkA{BxX)Cgl6Od4`Ek;fsg9>jOgWj=a2U7T?NX)rn#j5hu$iF@r=yHBX zRIO?{fp1?N5V?2Gt9@Z=R8{xRnKrJ0&k=9-&dJ*NZCJ-3#mQbib6*{Z#udXi018w+ zmN_CCa08FSS6Jjg#FVKM81Z%(higH~kCYV7wO@xrJcMWaU@hA@^Hy^CGHthkV$e;`2wAeVKIG+O*hKzlW^jDJ6zS`=L56uWu8Vb?! zB$mF7xjcE3gCO>I?9TSuRZQCR3DaJdqk+^s1EDoZ&688lfpw%H;w_IBCwX%ZF}!*% zmQIQH^W$V`!IVvTJ|1t;l@At69$5`|O++n;W!o#{5H2oowen{mCWw`s6sK)WVtvy| zKtUQ)Fku0>*SI1pMigzpA42@D1zI~5B4-0?sFLR!Po^|eA~|TcZ2w~FVne1cpv@QN zrtBht_M<5pm?mi7QV(h5%<>_)+A{ajVUkc07Drna5X;mdi=@uzdlFhVPP>l+Tz%wSb*T1(K(rp*a#Fl+PVZ! 
zhGmTeID{|ug!o|5l8(;B(P(0@!>@s@m^(W?P)yI3b(3(Xt_x9H+*_;X z1^sXxI+<-VRxX$ophurESao+hWM^d?c#$GVi1)iKbC6S{tl4s7`)bTe_p_gg8WsGs zeq1bltZ3jbe1&qbbnX`cJ-2FD1xc%SgkVLBNP01zxHDvgw_Vc$KR$AKh(=Ovt#Q@w z`zrF}$l9KP@0NaxO)R-Z>4WLp z{%|EIz;AH=^dS)*F2ohUn1|~;B&~f4-ATsLdh@Y*tD^O18XZgbUf8Aa@>Ni-}C#XfEn{PqHq1$9&gV2iRe2x-ODp^(?zZ2 zH_|uP`BbxQM!2smt@0_TUe&*)Mh0elJP&)s84YC1Uq!Uta0q;JxTSGTh-DlIf*-wnSA=yO|z1m{^8!&+{0n;69_LV^rU^p;sC*mg7A?Ls~0w zR2x9frnv{XcTVIJtUj+?8`wSW}(wG(1Oqq$G8HkZUsB<>s{g*z#4<%~j z((WyhY)5HMwnyuxW0@QA>1SVIYeIFLeOdz)@jThAg$b-4a~?`_gt)^<<;2uX@GYZ_ zNacd9j?2d4!=M4@B#m}04nuK&d6BMfx%As!dCuK=+RK76olm{&RB@(C~~_D8~W|8gjSb^LIYSuSqp<&bdvI z6gUI1e_amAvP2+{otrRy#$5a+eMgs0^+Wf3i`Js|)wc=G$zuP;^U*(9X{mjiE~7Ln zo!OS%+rQMW3%OaluuJkqiWisFmu@Aek3b2vCMw(TnLN`hpSkFL8!3#eg7j^q{;LIU z>$7RoCY*cjxu{X2#$V#}3S`f=iA1wyDuHNAa!@+?amg%v_gk*6=+Z^Slqc=oxoTNQ zl`nCj>}({YG(xjeDjZN*Tb6u=PnTq%M!WOSqDpWP2?cp$6?-BZ7fi?eRa=qTz6syKBI;j{Nno=IeQw zo3R(g*y?F78YE^?^GxU+8`!f4NvRESM*VVzY@fd>Kq)rOn}N6I|B5DXH%0j78aT(6 z_~iUy+2Em=c>W9MTE9H9GQ|IOT?6>mj0)*pSeI8HRVB4iY}+3?R6id3a&`D=W9g%( z0nbf(^r~=}N{%p+Gm$C|+${HnO%}U-3H~?MIDnbGYj_Wo+48x3QqULV^@HlPn~WLg zR-s++NsEsxJlVBvXkuCax)<=~hU;+2(0x0vpYPWWNh#|vqBBiO_Q&~lQNW1Px;3y* zp$>lUoRbr_;G@73tTde2#*`O+4zc`^(SUP!xi~Zci(dZE^T%P#OG_;BEId5nW}IkW z*$n6SVfI)&@ziV!Zit6l?8gZmF5m$3aoB}bU(LbVt=ULv+8(D=2)nBspuB>U@50jQ zpJVx!Y}9IdCK^^s^ae{DG`WG3lgD8ek3iV{6Q1lp5b0=#`e#o>GxH)t6hgz5x#nYB z-sfJR)`RHSvHn5b@OJ&Q6kje}gFVG-p>>;76b*TF0#%D;j;scpEgsoY|9YkQ^~m)8 z`nLQu5N^Nu(M5yr*}MRc{yVthBN|-@mV!}j#R;7lg6uc7`)9oJzyp|%=BVBAf2i+X zSjvYnbIoL2)9*p)I|k#lP7MN1=%_4C0k$(MB>n6+pQmHlmOZH5PA13|rN0xL9wLBb z?BCp6-gGOLS>YS-(1{H&9BnXV{kv!!>1PuvUm)u~f-8?C(MUrj??cF643#YoPzD$O z6;AI^+H@gYeh~L2p2t8y>@>gRVr)Xg)R;PPuOIW6RJwh5h)+9LkY9b_f3>v6J~nWW;#? 
z+!~0B%fu&lKZZNRh6)t8Q~h?1gRPwL+Xw~BcU zncCjxg!Z>sU)q;+;<7d^!ql%;VNX(BoZY#Oz0Z?S4tH}lcWR$q5`QQy%+=-)#CneU zW9lE9uN(rU9v%@J=gkiaPP4wOAT*Hc-=qcJ=_>cTRpXEzysZdb8XBEqRzYrGvq0v_ z;b9s2wM;?3aInpf9(z6XyM+ z#4iF@Bduc{F`Vsq@wzYvbXrf09@-Zj8q}6Onv3Iufh}-}Z|jU!4Nnfqk^Q1C9`YV` z4*V8V-k*#wXHUb>i0wNJ`NEYy1J((>CTxG4GaJ?L_Vd{ghAZgaIVHBPz%0KKAlrt- z_RR@ht3N@ia1py4UhDb^mae@TCsvR2BE;c0EPl^9ph=vls{r?XiX8m#umg>K6Gve4 zcC5sxe*MK61N`IVnMc40Z3kp2c&_*38m09r2z_6WFIMZT1ff^`8xMki`J{0hOOI5&nR&D=R&KU2~BXkTl7@W zlBDlAUTKpMPdWPuaq-sDk1K}4`S!Q@bSTnU98icaW~{k#v4TyJlB$Y$aiA41lArNATaJPkn_c$qy}0yF8NH+p z#U|&3)i0sb~?yb8-RgI+=DY z^W|a%xvBf=?UKviz8@FdZtObaEOA0dG5?5(!@gDJM91Qo2KJ$WEPOLNu$&TU-RR5m z2l?)pxy|{2%aA@X;2@&!*?#Y=f;T^1JLfAO&;FC6r^{9yjg8+T!Z+8ak+*F|eDw?Q z-hec8^ZC-EOHae-=~HlxZ?(tHZJVN!7EL_R!F*IPG2mg+K*2^>hpF(xDK`8gf?eCz z^BlN=a;<@N9d!e03u4-r3j^qG4O5Y`=?gp-Sj5~Dv%h>5E!=9SjE^xmU^E@jgs&H5 zeTzYt^~2H<6;UEH6PYJpf!9aejIP(*j3J}8;@i2?@a%n;U}SorNSoP?#nS?XqRFXd z+XWIF?A|%s=Y{Q?)3qh+*45D}5rZ884LHXaSSbZ`3B>E z*bnoeMi)%=ShF{LgbS`5B7)oC`Ui)h28~bltk6DK99#LAaDgh4<2UpIaU29uLdj5n zC$Ha!hiFyESn~~*tzU_+KY9cAKJ`I3%VEv4KCaI@tOdn{|D`pMmzRjvH$IGZV#^G) zWned^-E%rdh_9#lMFVmF1*v9{q1dy28Z4i;#8kjB@{^csM=!GVU%1~m+?Kg&blkr} z4(NMc{{jO#pMt`foAL1rgVE#u33661W6sM4GM)0(KiQOJk$L?7n zv1A2PJ6R-D{ErMOj^T5RRqa^RKn}L=2-d4xi_{=MJYr|xKOFZ7V#pSYPtO@D(ue?Ry_2$ON*y4BireV~ZgRpk+kyx;}vaj{X-~xJ6+i=+__C zT6rew*|MhZo|k~GBj#du{fE%9_s{^H%XA$u8jsv}9cqVtL#@umcc9PMba>TOhOTJp zw$pV2V(~Ghf%s#pHixV-Sy(b-TtHf$9NagjYp43=%c4?tEF2vT7>8GAPhjH+ zFP)?~=EPeMXvv_0=^w5(|ysd2kAFz=JMAY(=i-_Dti z*%`+nt$8DqWz!WQe8p{X`V`DwxB`3PilbO^H8c`?YEi3#?ty4ov1P?PEM2z^)!KE$ zaX)>HH{PCxjO6whbaN*Z7eRS;sh?_T_m4<_Z!9?VVpuS9GN%4m0jG9sh4NA^)snUG z8+`on6f9i1Me1Hi%&!WXw`+$EvIkZPsTbD|@(b{0SGEKs|A4Co@^f-Y4&7%huN~ya^?D8wC^ZZsv_z)Ylg;+YkK>} zJ6VukvzRQ`S~=rmd^Texe%!SaiDhb{^FV)cSA@#UQ5_~rNA;_NDpinSV`Nvr0lSs^i^ z(1fITWc|7lpS&{#-+lj+EU->M&1Pre!mce)lKwK=rd<&f5NzfFR|3{rBJhUMGJ<&epXlj*#WX?7|*j+QEGBt{>On<4@*b;p$)Y zC5QyMM>mu4c83<#P{K#&l1a&hu`6R0-gxaZELrmtcE*)Joz|Vu^MWQQ6VEz$y@z87 
zi!<@{haY3goNtjO4)laFC!_Tl?a{GrMUO6H@sHjG%p>XmB}lcFP`#wY7ePQs6FT}sT{SLnuDJ2u&Z75Q%n#gf0HAA%3FCHPcju9b1L@EI- zt{H;7%LZccqHmG0!;IC7CY49^I%%kw9Nlf1T)i`%TbhkWGPcO44~rl^Ug+YBqfBHK zB1((_j-CdhU(KUepF>k}&U(xZ_@F!B)RRMw9ao5PKud1GK(wdRIO1!feNdB6aDHt+ZEtU4nHl&l zfSrhT6~v>~bu_lT*B1x3cTN^2%nm39t(sK{loHDvc@4Nhy-wB-p!lYjUI~0L7IS|1 z1WUfV8kJ5d?M*G>@Cz2b_o+&&)u9zuq(?fWH_mtp{U3TOvc&%p)E!TLbTg{ieFcA> z_cD6*e>#HpgYi$I%|D;U{Z}=|4{r}b|KamJn)im?Dj(KN#q+b?Rws1MuZ!?V`(Aj* zqp*qL5_cbp*+aXdtR(5E{&?V}cvxOv=?!#B=>xDaaD zEC0Yn_sljl^850jYh@i!Eqt`$3voi{;oEobMw6>Xd-!1DM$Gx~|4>)Y(+#gCpk+np z(hgCN9x(`4jCvoBUe?$@ybBR1doC8d@o!vk>wiOVO^w1}iSK{&OI&(tImGQ=hTG1* z8gm@gstLHgpOpEJS20JN(5@y9y}#H`?@#ZKFNP0lz}aHV2_4kS!RDMk$*+qRwPn+K z3m#m1X3QhR9SO-#s-t8a9_UNTFRY*T7_PkQwMevfCzs>3S$BYM|A#6y370gr50-pf zFF*S*xQK0Y##(C-`n!;sr9#T)a7;M7u5y6M;~-qk7( zIls)u-SxW#cDPic2j%yQQTT3T4;;695w2-|fo`j#xMgF}>!Pv1-BYkjKE}jAcNK%# zjzg?9dj~cv1nAPXnqc{7?OYy#)o*_xFhQJO$wg4uo`)qcTmm!n%A_{!*|(L;$g%oB z`%HQ=_em9rJkW9?B^)_OR#p~ra%7d^_U%Wm&_8QQ{tH`RBQ5C3y4Dtl6_cJu^^gQ)Kq&p$YUdE-}UIy-+f}Fa?S*Tw@lD1Exv(|rx4&D1Y&WL!} z!}sB-8B5To;XW$qaXCGCfIpU?IQK}ZIUcUvZ|KWPb_8ztCBlH;N z4+@kAF7*$1FXwoaDSyan@CQ#I#$FH&gyup8q5hN5c=)ypymBud@G$-}{0>x>ecbXg7h^52XNha7L$4&ZoBV3{9jX9 z{lxS03->BSpp+{S4;Oq%1u1Vh&=JSteQKa^VdIi4WMB8*Cvgl}APul7x+B0473e7T ziF8B-kJVs|Ogd@JYy3g>h!V3_;5p}jPQ3twM~}n*UV0Y)y5=0q_zE69+jD8TOS=bwQT!hz_WO@`PAsv{mUItc6-{Co+TlK+Hua3v7&)+SN(@ePatP8}^ zoGp9e_QlY36K5v z36G!W?K&0hTRe6XfIJ5B|JA?Wc*Q zkDLagT_Y{sAtEkPzKDaXw3F{^Z_xy~QXOvfq*0y&y6a8DFy{4fcznb?ILqFGU%Trm ztk@CVTh`%zChvnv(%3R%U*I{}!%?$b3S<;5 zIJ*3B(cm#yOalj`fz0o4pL0OhItOFkd`4C%569y-J3F0#TVIoTj2D8f_1x=aq0HOQ zVAS;;EX7-BcjKGL)W#pARf630xWngwzG>Jfd_4Jo7;%lY>lX*2)6nT+*JaW7>0!A2 zQVSQm9x-Y-#_MV~N^;iWV#3tm@zb)T4I0w46jA! 
z9|?>07g5X4pUq(Wb=@Cr3?I`#Oan0u>|X=%`zQJHQDkTR64)Lity?p*GCA6*bRC0> zYm28@!gg84?+;x&H8opRR6Pf?#|)4?YDW6enjSEED*oAY15%sx_HfIFj>e+9I?JBX z@#s9_SG;$^9q4vv!29dbGSTyaF}U}pD}sC6RBVl@qx#_V{z1MD-u4*>v3HKFpa`6y zIOkJiKIp);ICe5_fWgra(}5|0R#V)*NNwB#7{3HRO`45OS2RcEqUH>~UpU1iWEFIc z^4TWs``!8DH+B_nD-3<&R@CM;5%OQXZZn)*A_M<=ZlSzbZpzG+SKZ8NpcFjv?_1GG z-nOCNq}GFVQR2H!cygUA$eyHAraD|NlR2c{#xAl#8+4~Y66z_X@$Cc%HoAuxNqEZxTZ~tLJD&PWgQ!I zwsP^n2~EyKpC%_`!DH`$>$+RY2MVvLlOSYy*uMNLe4w0N@adFMXqL>$6i}y0JJc`r zBRUSAE?T4uIxM8!f%#MKK(%7x#1U+}Z)>4f?ekO*w?8u(-A7*}ue=9&;&!dVGdGSg zz#b#9Xh>Hi%A|2`UJJDNytcgMa4qC4F@D-Aw7#?k&baV0RNFfjFHBkmRB3?=FTDUo z<<(n#l|CqXA30Higvj50u*6XLYT(*yuf?uiyKv4q&e51JX$*0I8qkMprb{_M&I2Pi zwm_RODYuf4P80@0!%SD<;XgY!&eso}0`2h3n0rvK1fSW*p@ys)sa0wVt{XW=U-hND zz-gua6CY0aCr*-njAgc%T^%>K?1fK-Zt$3?xL`y#?SuT~MD~DZ( zWGOB#PPPm>H4T?G=_PepfiWL{hu5w-OWvrejs-&}=$0Ize*foz8NG>PV6T zr~{rqS5a!$X$#$zYhH0wO$fr_l}Koq7qVvDeR;^IG@{Q>6@Kj*xy_mB40(46i3t-pa3#%rL zwksuTVZ_u)xU6onVDr$vWqWx=xd(bYD%SkS>sZwP8FcM=6>8_r#_%`f)vs#p(YyEM zC@v?JE4`G~g!+|)>(BP?EsgfwuSN&4TRnG)8S>j_(VhZsW$NSL$bUd}twLx)L?~>y>x#U}vs&5#w#& z0u56Jp#7k9;G+le{)+R_!#(}=aew>vvoMcuYsnj)Z{aFkVjYK0V%IfJ?16KK%*L{Z zU&s7`qjAwyS0XL$OAH&oQtY}NuDiacZ1YE9P6s~03N{_-nkDuXC4mhdKLhuiQ$u;; z&aH=xk3PX&f=-U3>n4gE+ZRw=^x#)ybc#r5K0Hy1H@uEMUxLUYVvv{yVjB2cYTz&F zgife^jSRf5iF!fn_k9(4azqOiJKraK_eDW*kD3wKdG0^4=-UjGmW^h2>?w_tYg9)W zJEavcpY6 zysiQ*p6uUN;&jxgQ~|{cmB5Ann}!aVdk|O5OtRwSbwwAl zbF*ZaL|)k!@om{^LPF|4?nJkmiIU>vh>*f)ckL}``oyK;6rU*P@DS?OCESG68}G$c zt*Q#S5^Kbvnw$R`xM)!vhh_*Tdspw-DQP;tmU6lGqn2mN zv={-Ii6o&$3m(QK9FXnB<99T%_&_wk{T_M5!r|2bt;aY@gKhnfm+Fimzn(RE32LTf z@v$eQcEx|6Jq5cQCEC=??m9I z9s0&fc6O1HdzVy9 z1|l?nPp?FqlhJ;WKa%AmHJc!T2Gd2^-jf+UQNwMa^h8m-?Td(`G6WVb_jb8BGV_>W)=hPUHIBZ^U`x zgq|e1AGhz88W!>JEH*E02flo5oWg3}dOy0KlB9NHOsgcZOOi}A61fEQ(R2~J$zY5hKPie~UF&oJdzm&QE;HX>;_RkTc_%mw2T;D@;cXw?@ z^bo${!>Xoa+L`$B-5S_g@m^Xc8ofOcxgi$}?7f})SQSg6lY;=qq%Akd!< z5QSxo4#I%@$U30hG1qsA zeaG?g0Yl%1MWYiR)ZRJKMa7`Uh6c>kfdd~7RCuEgAF!~Vxq_;O7UxyQSCf9go3p>g 
znQ1MMw|NoX`GNR)XeRqaV~g{262>6%dx?zfJV#x1xX;}5tb+}}CQJTYzRR@SpO8sM)S{MSLLXh+oIp9UL7(vgNM`Vx`Wv@R)c1uhUOCuG>30E&^N~1sUQ^=Sh3( z#Yr#2EL>V&woqzW*7O~A9z1*IY=f4%@y!f6pj>yQY3UCOiStT3M5Qrz5SH(*hjLnH z)80a%3NWRM93|mCuIijr)G|MeMve1C2rn<*7^?6K-&XGB~ZUZVu<9 z7z!`vSpI|$*sj0+dMsYN7}cvE+?dg6;;6==e|HH&SIu7|j!=lk;Tb^ml-Mx2D(3(XZ zdu3dgD`E=oHGLdcL^C)IYlg)Lc<1JYc^64y%(#36=|VNlhoKU>6BNYiZLnWv*l>oQ3k0!wpijY zYQX87v0M|09K>r1Ug;jjgX}XQ*OvDS26rEcq<1nSF(HU+lZ4O~$U~T)Hvx5+W>tMc zWUeJaKCPf(*^tZ8a8Ywq_?=lXYSwASY;i!Fu!nOVEz6yQSdiZ52{h$d0?Q#0QJL#8 zd%6J)?9ohM1ezmn`v!bsUh72XhO&ZNPsq?Z8kdWVog9l{Hs9cA_em7uc1pGN3?X8v zm3}up%e7 zI$LH$&&lyQe(K#Ec?)M4f(~7fk`QP1C$}t|Qn2K`zs$iyJ?{!^TSJn{ty<%USziRG z_o6hqj+%($-+mJ>je6OexXqe89y~_Kj>q?odmg9PC~5gTb~85Gxe>F|L#S1s=D&*2 zxndmk$th*!yySYDklGy%W!zMjo`oJruxw5y<46&HDARwg z@X$jK{kbZ9P_+%eM&>Hgo{e+bUN?97d>5G@HZcYH;93nw@2e9*dQT>w4_=UDo{Sq7 zywDG~J|Em~6=;ffP3K^G_>jaEP$W1-BG?@)*RwCL5`xYA9_f=k09NK}%<=-4N^pgd zhc;@;phMgkuEBd{SzT?I`Z(%!e;`y)oBnuvSYNcQQ5L!4#rEu6pD-EOuJXn?ZS|<- zu?I0cf$k|JJ^z%guOo2vWUWJ&)tp06dyXR(ANv~cUrm2HtmgJI>+oDqhLXSg;0Y5E z%DR?Bh)G-VUc1WqaUQxhx;yk?Ps^NkW@{{)>>rPZq-v_D->zN#{4}nl5G*hA2hRyi z=&}#KikLq+YI!%w2LCxY7~@F!g#Nzv8$PO*a1rWP;pfbyySYoYM2$wWYbj(PmX4*1yCP0j>N1?wrjBfektY)!Ul97ZxZRlbf^$Hp;PU%! zN85TQp;STw5)zAH^R!3M_uudPm?T-KJX&6IBU<<9hizMa!1t?`W6|{Y@Y1`S)UCop zk4;7EXD=4_ZIqCdf|ebme(6A?8=uGH*Ea$7s}m|NSGGHm^O2R_To~?7MUtW52byXr9l=%uA=+3frh#Kp1Evv2 zK44>mHm)hp^F#xq{j0}>4(aG8qz_(*BHkH;_+sfeVQ{v=TpVY>435_1tW$;=*2C}_ zP9xIW;)j(el#S1xbPniL-0*+*;Ecv8D4UpsV#SMKc0 zoj``Fub1wQ9c9Rbh3wb%4AiQb=C? zFC`4>pRzJkH|!(jpmsI3iZsrBIZN{Jl%)@1r2D&C{=Mq3q6Upz(zz9s0`VTs>0b3L zhkr_>EJiMqS^%ZEG*g-(AcPNlz(x)}U^Ap4I)YS|Vr<$00y?(@vFI>pz+4v!j<9EY z`|~b^*fCqYYa6BslHa4C7!8g_pBxiyi(%VhVEbbT*s?<8M^LW)t7O~md}vM5ZnO^a zvwPiS&jDTatebG>b)8Y|gtACVOhA0@It*=gzIE^zDXt6oeCax5$)`cNmn%=#oZ)u- zy3WwGY%D7heS!7bo0x+3XNon8Ik&uJ99|sI425|Q3P|hP zn6-P36ZRM43&t+=@5NjN?^3B`-YWA)qrLA`#Hj2KqKC-QxH6Ji%? 
z$o3d6&to4!A$Q|w)i??o*dCV@3?5R8U47|^(#ejBg zdJF~IAA@tY5U7w+$iwLQ3k+>L`8H32dvwjp=V9Z*xdxZkSiak1lBNv%)`Efzt+g=a zGnCP;Ll|u|5>=WX;pcI6u(6U+$xQf^9Yz+*#xxMqKuiNp1E%M6xmX<2KuiNiMFTos z_uuQI)xzEXJsbxg@>i%bnp|K8O|K2U1>=^GpgCHko#0W1CyEqHF`zCt;exhlC||k+ ziWcRn;_X=ULC68kA~$_L3SHW^#0`&pjvQHqSL(PbsMq2wTzTg+cx7l0BS!}wHt9w7 z85hSRI}C9CPuo#EzBo!Gl@Nz_5)$Pov1c**H*boTjT>Rwnr&v{V`bicaiyh(#UA(60+$mDLrjc~z5cKcb2$jo~(gA87e#d98hyz*z-7;LsOpz{3 zm9I~$ObQXfGL6Ob*M}L3bHk%Uyka{jI8B_9MnzD!9Jm4{bx*oHapS_d>KgqQu0>wjj<@<%G{GBVV|SV{&5?M#TP?jQli>3Z8<4xA$r%Wjhb>y zUb{oaD&aCtx6a_%Xp zQcf0_6qk8S+;)8Ss&zm|tIEq?VYRbsi_cy;7k95PnPkd{b~<)jo%GGWBSGvsS?t*y z2}s_x0GCv*i}G^3zdE##MkTuOy$W@LIbvX%hoohkux!p8CGT*09eMxMA9AsL{yYVw zb*Y1LUgexDM=N<}j07=jQNbR-K^O6yvEMKMM^8#%*-F!wZlpBv^+!`LUJONy?dPj?EVp%TqPs9 zyxiX~@AVP7_cM_@cgxZ{K_rwuPHERodIssse$}-8$jX6r%k8L52={XJu{c%SC$ska z&WWG6MDvo+^73AW@2y+#)F6@DzzH|^#j6W9%T^pxqO3w(IN{&8(fXavJfi^$OZjvIn?M>_kzT7cuCb6?`aX(p zwrPPP8`$vqbGUYtS)qPK=SC>b6%)J;&O5(-J2GVp5z_M9d3ZrG>UOrPEqJwvcN0Cs zq(nwnRJq3-OT;vAWHi7w7p>iCn~sL?Rm;)Ps(A39hiU7P%=wk&6KQK>Y{k*LI6x!7le#wysW3n5p^jB-9K7~sjHT!d1 z->nJarSHhzmFMj@ziaDn!YS@A_T%h>Rc~~Pb2nKzqz=Thzv9PFbLFj*>>QpeJ+spcx@BT&JH!qJ7t(zH!r8)L)O#^u^0TYo70 zym6bsOC$Sz?JNwte3*wcZ!WR??QJ#M>F zS~i4k-`iUAxPEwS;&Lm- zR4tazYDm?@vvt>u0PTUT9gE=yZJPRZf0`D7Rv07Q(o9yd`t8{iE~V(JCc-TSZ)k00 z{zOsPvSrbrK?8K`*zr$P>*!VGHHaSU1$7rTI+1M`ex$P*KP)^QF5Bh9>GI2m==1V& zuzTk^LzuM%Tb<43{D#SoKaQL2_``=qGY04BCvX}E?H}#TtQ5kxpS#{wTBgk=2qr|Q= zvxQC#O-M|!VfihX|Du_%v_DlA5Gk$LA8imBSe$Nfo#c%ygYzuW%d>Qh-u(xYrrm5 zJlfTWl^@f<-%ta*QM+-QFU=cz;5Ae`hzJn}bwzYSirbRI;Lhn-A4NqrCw}1}Pd-X` ze2rGXr?cdE6j#ZCloa`L{5t&LBVO`cUp!XqDcKjbEzXi@&nH-iN3J~^pPY9E>X+V) z#V^0%?U&l~s#Zvn@mO9$1vE~TZ}*A+=IOp?;AwQg3$tYJ!uGxK&!(^9Npar1c_^P<^jXUsS_eX^NMO@hEMV!~YJ9f-}1D{%d z7E*7;)u&Z5TykwH)jn0yqJudj(B;$-Xm|VTc(ms!POKdd^4tmHk0wkilvS|Hdd#Nopvgt3YkLuf5O zK<&v>sP>$i%15-vOG+2m<9IeBa8B3akbsJ5?W9AQF2@5nTMneAB1Jw-xXv+3Jb&cn zxVPwwc<}rrU*E9`gS&UYU>}_(o8OIIt=-!xq4KpV(s%TezGIB`9WUVQdM}_`&mPE{ 
z_ZlX<&Z^ek_{UlDp;oDIq2x+vZ~MCkE;aEUsn|%`+$@{Y$76lQOph zN#}$#lq~}F{H2nlQcC%uCY+u^a0elwKv{;wI?=UhC(Ot(^e#>MKp-yxmRIQl1d{Os z{poPJSpM*7fO>bAPSW1n@;0p~KaH)412JYGov~vlOu^odp$q+*7VLgKvWSF}uWdy# zUS3;p91<%xkd1Em*-zjNum2d)1$|B6hNM^IA{F+c%zxx4m3gqSa7*JK-uM zvcsUQ@?Sb$iA)ffKdlc59E^6V&M#$CdZl%ELMYQ8B=4@Ojyolz<2FByONX&K@Ub2j zE?gK37cM;3t9bv!zPaRIwvWqJ-6g_{OkgBDfI;;*An!9PKf+c9Ip1T5VS6tlJ6EqDHluIP&L zQ9P*IS5L#MQ#PQON4y(V&gg|s)nhu2cCiaZ1IA!64a78Xm^8qFf*vBjpf_{U`DtCp z$2UPGaKIl|_$0KvqMiYWT}#5dSkv?{J2HymfB81jXa5D4NaI%Ba7MfrWzh1b z>kO0r=EWAmLa%xK%dqBX?!vBe{gQbx9|QM+Y2SgYi(>U}mVvTr3>824>8dfobl|`x z{EazWI}C4pY`<15bSJ4xi2%~mW6Hx994$feu7}02U2`QMTfg*oVifA=Agew2(LG%V zEOKd&6z_11DM9}f?&QixNF}dMaUYyzoO0PcxckyZwtR`K1M|R9Y5)L007*naRJ!%m z4`bY0qu?-~3|ov$*w=?A9 zr)8S;!|Zp4sk4mS?~MyNplkLRf>|H+LpiRJ)P0>xqj{jelk>~oi#@V&bWXOc;+BKy zYi!1phTs1b=*gMy8SqCKe8p26%1s z;_(88l_3{QZI}o#99};Sq|e8eNyCRCdbQONYAl)DJc9NtFqY{ZLLq5a9l}E`!qRC4 zH3y1>s_A%x)mNAKO6m;RX{G^NRUpeOln*7 z@eMdK#SYbeX}5dijV?LVuEm~z|JC<4nvov;R~|^wX6a1Ml~`?BR({f+6)rH^9c`D- z)4ASK(?YLfE3sF#S-Cy}(sR`-D7LH}vD!MXtP4!@+R4=kUM%I+edr12)w35=$sBtu zK0+EW*MQfdV8PMvkJ?Yiv}q59%a04m@tTR&#nCVh3vMD1pp#7puN{aNu5+^$;aOoE z(C6HAHx|tNw|*;!lE&Zjjl4OOha$;l7wERv{{#14m_n21rJjy{{X(?8`#pi| zco}5JLR1aN0u~(=L`5DBV2->4z#_BdgJXOxbl0w3=Jim?@7}#z^Mvi(xl@lFJ9Z#F zJsnRz`6Sx6Z?EaYQLmWHm`62JlE#G6y z@4HY$oG8f^D&n}(9N#z_ME~{wiK$t8G09%>v*;m03;#EB=KsMq4*7hab3Z)}%!4D@Z1 zr|l}-((uaBwv9bYLiCbf-^Tt&D79~(47LGwbF=|dCIzwIlpK@tW1L>55N#`k1tB3` z9#7G3OkW$KFKqi>L8uHZ|@GQm)GgHW#uA1 zu@ovCe*#XBy)Mbcz%qEU{L9_(8@6W4y6ptnKU(@(7YwD#v7fNIvVEHsCH)IA4lilj zx6x@uCez1BZRv#)fc7tznfvIo9Q_!Rq?LlP%3yh>Z&Ezz`st_M{r?!NVnlVWzy5kW z|NQef% z>IzmHBu;HY+L?-wyzr4nZBqDID$7x)5W6jvG{t$va-PXQ9&Z(%wvbzh*Ob z$=D>ZR2iIbQh6kZK^2k*J6nI=Ds5EmL1jv#IL9|4M*A7LA8*~X9zXApK3g8CS4ov` zLdbYU>t}Vqve`04EvAkfr|?g}JAeHi8#eqR6VW0_E?FKGt5!q_k!ebhU(+p09rj|| zFH#427+$huvhIsLx;19anuY(3?25BH)ezxw-Q{}I(xjD}L)#I-%p1JY zf+Tiu1sFPyUa)>yl+Fbs{4uKQKzF{MjYK*f_zkO#V~r%IO}n-IJHEn}O^m*hJS 
z@{NahT_x>kB4cAk0;jXuSgVa`L!{@AIgHIQE!HTJdALIwwK(aIbXH)e2VgTE?1*-Q(*v*vrf$T2 zon(>TQ3yI_l5(GV>M5Lb(n;PNn{(|X`Cb+0(>#jFvTc{bGenSyAq7b$>G&!*`2GUun0H#;`uikV!Q6si-6ouCNSG-c5lB7;YMIbM0~_8q!?i2w@GAtV@gez_u; zbfr$Jj?yQwF!B*%s$Hl=IaKDCrlkzC2cyL1B;*{*AyrnCv-Ub+vlB7rpfQ6sBaJLt zJ*gMddGSf8RI3h7WPJsZ%ZG0|sS~^uVE#x**@cl*#@zf_2bOPSo0zl=1}?GwnY=;B z`3RC{MVeSRcnxTK6?UipgO~Jo0SUH88SFQmYuBTo%=mk5i`<%W~yuFDVWnwlf}d%xNG2`%rdwafzr>v#vVY z$XJ*sA}6?4=G-~-Nv<#_ zGyEEc?3s4PwnPfk3MIviv_=0og{uVBj_gEVUd!;xmGZQ&5~D0rsXEHC8bYonp+vk_OF|lMuFw!vm`EP>zE6CjjCW3v zk2&*fNX7FLB5m5_(S$@Y;z5bB$H_SoEsh>_(2>g*BojVP(G|veCWexlQ$1+Cj0VN% zRF1j78U0eKt-d4-ud`+5)ZV2os#kVEhD7aG?~ht}vQqDcN@ZF)2eG0BB{h?b8!~!Z z&XFk=j-m#5!FbpU>ea+vNH1>_?t)r-4x-6JySMkX2$_PSeK^rtBr^^mNw!}`@*ApA zC>0_lmB2+_naY(><|HYTS=x;a@kymoCCT#Idb@sz?XUbvRpiY?((@2`>{GN<%frU1 zfr15Xs~IQo66GqPQ8`X3B=DT%yjf$T7Um5P?YML~=iK8WJmU5hY#}z%dg)1f7G*#x zf-`L3!i|gZstUdKn%f9MZ+)kA)+vflYX%L!<+` zL(c)Ypsl=avtifs)xqUr8e(@3X>R1OwWThQf-RU7;xQ^@C3y)9y z()-xYA1ml@#+{XcFDJf;LD%<^4|dc+ojN=^qFb-)@!&HPFmL_VznMbzeY(y8v2UvA zSRFG6N(U6)ecnqrj2#O2;C3()8YECY@$QhM4x}VckftqKB7Y#tBempin4;$H-W-ZsyD?)@*g!Y%794Aj={#hE*@o<4M+5d%P z8bXz2`z(FaLi-ysQh=U`)|Yhbd^EN~RlW)c(G$g-p1u)bsly^=;g6yB|MENx+@&-d z`#B66$WOz1dG=eLPz4%UehlV%8Z3Dnoo)MSv%VV}Ws*kWl3*;yf%nxEVvZ=y)K$Wq zgOsU%d_Y7N28p#DvusJQe0Q%mHtHJ_2)Vo0>ph-4^5@J+aTLg4s-)!@lQgoOOpVDc zl~z>1RoAQG^r(JjohzEWbYp0z1ZWVo>jc}eTTdUY~YbQQF$l2tuOmVwwn zP;vuxzY4NM)C2!w;aJds|H>TZ`moWu;7>c4wh13i0PbaM!e4JcPk!lee&+o$nDe(v zpBzof59{o}E>^M3_yI2Ok4fmaFh7oI;usvj@HoOEV7148h6cXu{LX$HI}+z}7Ec#+`Qma8qtqytn*UeD>-H+}yV>ZoY2>mS(TR zsII|UrGLCoKJFPkd2O+$`^(x;CSBB{e&rimBcRt*Pqe;JDb<_S2uin=-2#e(+G!S}Z7=ijL z0dg?!)j^0WnSw68@5H$DfQ6b9O&^E;-CCk#oq?E_pC$g4RB}kpB@6Q=Jcv4p70|L< zZw!5YoNSk}49k}BNXNu+&tq`EOVA{xEIP`Uh_en!nSX8}I>3=Z3~$+b+tvY1=mzfK z24{#kQQ?taa3aiP#qbcu)xwO)%E?8vV9`PnO1S|AAu?~#!|`&`p%W+y)JOgTAdw4_ z)%ti6iK8S5oCNWZaSRstsv>>!eO&AqbtA=Q!dtN!Kcf5KiZ~zoBs&=gRg5 zIVzK{lLLZ`6$t8yg;0i&s$Nu&lI98xT`?s{s#s|#U(#Afis5+)k_Y*D 
zP@WzZFS%GaKn*x&!U3|y$VCI%=B1IkW~dET8?8}K2u@=8m*YVDHu9K*24Ljo*W8IV zM=0f(2+6N+Q=_A8kz0spy$meJE1zkVPK*R!-^TtVvTqZ5F;d-vLOVZpIwC=col{)< zHp#?MfRwL&8~YmeJF<6Vp7uN3+tu`KWc1FAX^PC}P+B2&^!~n$a$VnM%8+_dITF+X zEnM2)iEJt((6^~WOK3??;S9DAzT&U_5^-{{U-S2EO3FG>rmGl>rKtV%Z6u3@!>a+~ z^z<&Lxk!4UOu6oox$C_@TP7XZ&B3jpGB`fJq~)@+*IKoFcTWrx^#h zm=C;VtRj4BZz6~`%{q|J8>_go!(vm2ISZ|~Bz3GKa-2+#Q*tyfbx|AT`LSN+TwVQ* zUAuZoK|FWXL6Qu_D-+L0l02M#)ShJ=#q;Aq9>tS|hY{toWhsYnp{7iIXlQA)u-MNL z(ts$R7r;CAU96)t7uDq$A3)FsA#KFkQK)Pu&&slN9zK9(V{w4|V-F(@(!3erAS{Dv zHg1WvEDMT|D{ux^9~YBX-cm3U<(F?P5a!`&WBg+oTmhr2U?e6D<>hw>!o%4#`Sq|p zt1E-Cldtu0Wm*A>XWAE(mvv%r{Fa`1#amRUA`U<%2|jv{$H!}k>@R6!;b>@J{^XAX z#oqPNXmqXZR^nS!d{wk=74~+PYyo9ir9P^m&$*^RR! z2E8~fY|-W>Jl;zdAa?AiixE2xd{rFQi}6UWjz}z778NR0!037N@SrSI?5H#yQ75hr zUR*3+{0oHbm@rUqmZoE!I1HTdZHOXkJtjOpP6um?pB zW;}8c+TA&^U{_<=NHn>+7JhxNuWTm}3;AjwYc(FfEEW9=sMhqsU2xq=i}30-_45_^ zw?pV)L2m6b@aHYt{BnpM<=GJL^+L1%Tf2`!c+V>byl3JI|QCzr8qLnsNK6y?>t z$m8pjoL2E;UQVFan==kh^a%7tA!Y%q5TQ^y!AM@6emsAN6n?E1%M!S)A_-*zBd@a( zF034u8Jzj$nVR7ELU4*^ndD&6hSJenHeX~W-+Xmeu7i;AwRg^_c<%h;P>wXbN@h^a z+Ss58o{vzODv-###_F7MwB+(JvYtH1MzA}SoEZrUvuOwL=-e375h_9K=Rh>zoY0~6 zcp%(=Z(6livD0Y5=`~=}VT7(7n|{#AA=ICCc46&VuuL%+K26GxfjQntDUL2ImZEqH zv_l3uL+NZ$e)dORKW8MmvRye^RmxCip;2MBjuuW%Cyy!BjWd4Yv~cND62WWyYbt8!VB!3B$A5Af4DZZfW2B!U%#p^jxuKaPA4t68#XIA3hhIL`RySbWGe;A}RUU_Z{E4yU!3 zQZA#rXy%%e&b1fY3X~$0Hd}R>M&x*m4Ab6i^Ab4@>o1#T=h?Bc$$eYx*OY78F=^Q9 zDS@?N%XfK$b8xkLdQiDnhWtBf9jUP(7hIvX!Ky7QJ#EX_C`%xk`(D#t3(1d~w%Q3R zYvp-jy@cTOWVAHa!DXF26Q<{6Is#+QQdlADC^ulC5>}%FBadm%mIZ14%t?iEQV%j} zp@xrX1?f47{#byL3Em)_5M^_!3ZVQMLw!vD4j3>1rAn2;0}niKOxH7d<=k}>ER*4F z&SO%fp|N&h=sg0PAf2TRLlNpibP9*vUu?@@JFyMO$(E)T42rN=ZHr7hDr4WFJXr;A zZN(!~iOvFC@1RRxj4W4%#v+^I{l#d8LR`5_Gmyv;DzidEKdfJV!SNDrc~zAB*sLZz zTOv>itUyO3u}5Q38aaAD>_E~^K6(7<1CbG<{o@+gitj#44~Uul^;~Q{y(`LudHWjQ zJOztR-z?vEl8VR`c)}C-VTaSQ zj>Slmj~a3-MhzK*;fqG%P@b;kL|d1udF|Zg_cT`z7kB+3a*yN~Gi?!mj1lNr?i}bR zGWnNhm4~D>U}LCM{CgMt{I!70gHD>pOkYzqLp4-?TzZu 
z&7D{D6jV3>0+=vO3>4F(fD?R9o)qi(!u@JT%X;-6~4bx%isc-+CnMAMVfO_rb=%RzLhKdH#k15HTN zBv<;3ybUk^vt1gniMiCP`pUbnrUA8gU5V2IhazhK>?lg6sna>b(RoWY6l>c_JRrn) zZvPSA$lC}*Ur>M1zg>|>#EtOa$Ec9)yj$7<^F_`Hs?y~QkC~Yaj9D9Dag^XTRv0W( zpfNfN3l{K9FRl->YlOSaVSs5$R(WLBl(RJCzqB}JpJdmR?nqG|<$O{R!+sTtHbr>0 zTat-xvt%(8Ay49)4D=G)rlfF5BIWD%C;2W}M2Iqom$78JU=|@DrZZ%Jj>Keh0Z#>% z3zr|d3pB@X+VwA+HK4))1R#-c;AUfn+qdUQ4&bvkvxSH+eA{|s#~jEr^>DO z$Rae4B#$Ih(TjihvEFOQo~iTpMi%qV%ES6IAw<-l+WcnwGjZbv0_&Tk@xDcUn9kUz z$U+wl3V-Nq!Am^74{KZ_TJ>q=DZyQONwFOn4YF*I7A~f0c5jE#c~J~&AV?{ z16*mYnVsdR>wBXvKeT)5fZOoQ-$!H4!f>2MQA4z8-yUt+v@yR`!y18bIi5JR{uUOX zGyG><-xE*F@!i17=f0aP4kxQ>S>M9wf_+-HLf4#QEhbL=B*Tzej3SifjI zZtR|eL%mL4@SiMqF8}XlJohFa7s9+WekXSv59gLLL!U{}^Lzc3b1CFhFrYbq&p(9Y zNQW3UY~$#~#yS@FVM+W|f(>q%CLX%P#iURg!x_hz{>}3g{n{lM6S7L6Nu6Ba>4eIU zlPhClL^iAw52afgIXaH>OeS~%t~6bmk$G5nK8=f$ypSxe=8OmNi5RYs5X&26)k3tS zj`9f0!}9&l0Zwme!U;0Pb|AR2a>s!G3mX58TjV9qdH3&Wz}m#W$Kt=2I`j+cu}Pnh zIx>CQ+o-owM^`x2Zj)#d$k8V`SU<^tt&ilOdz8Nc*YXNEDQvbwaiwc9$5E}nX+G(6 zo|lY^)6frpj+mk|IPHfJ>0}Cos|Q53{v@FC3X@gO-;m69*yKFOHjVY;;x#|Tz7@F!oT*^hJdd;_^< zskqR_Gs37{=?V~U4qkPkJ!EhRX1e#6NDyJf%AysOw<&8TMi)?G*z)yO=X@iNAdlgB z|4h=!)7oMHcsrI1o|g|`WGjTFoSCLFYhjuH8mFkwsGnJwF!dm{cR6`_n3XJ4D<4;+ zr3)j{tAECa-buf$s4p`w;p#z_J`l`ce#X;IAa0>cz|HOgD?d+~c3yju@-hk2q;xm~#5fk-_o|@T4rlmpo45_R2YA^GNYb_z0G9CZlf1 zF+H-$pfDT%OF$vtM1UkAANXc8P=tofk|p4*EQj2B6;3*;8%lQf<225>vI5hFDycn zy|X!d_QlGCPP?Gb%F4SR23^FDC2p9B>)K{7_JV6p!za(fqIu&nD0ed{&uji5od3FQ zawQ^cAy9-vPQN-Qox8e|!ZOLufet&vb1?JKiZ$^4UWDd(H20567ra02y7ExO=RGl{ z;eq>?>sU3aXK&BnzO!-oGQ;S*>;ru|`$iw>dt%B$Oc{0?&#|e-vvumD{sF(nfbZbh zIC>h@E$7vP(teNQ%aH@owRz?MlfruKaowL^NB5}Y;E$d1VGiWKEt=e=t>8Xc(~Ugb zwi&PqneP3(HtGFno7&JX;47+?D%+SqxOl-oO}1>prHq)S%O+F|>P!mdXS*6rkdPy18d^k=oHpr@9E|uS z{)BU~86)7`pTB_Xu>F-OjUJQd2I1VGEhFr z%yiYM=zZ5W!$elHsZc6rc%$@3o+^(4f-jIR%fNhck{_H+co|QDVp87UiU#zdTDclE za__N!Ob5^=L%PaOLg2Y!ahm(qN=PNcYt7jpe{%81>pVM1;0)L5M{gXe*qO9?BURDm_ zT2|_}gv*|ZP+-NwWdH}7T**SS>`CM=8vPkRL)(1VcsbW-<0PZ(l0A5%FWJQ_qADo< 
z(rYH1!A};oERjEC;CP~ldUskoA5&#jCQ382CVanB37{0kybzwZrwb_d%G<;F-)~q0 zQoQufZ2eMLFah}E9mS&7UIbS?hyG~w%6RM-8n47QVk`{Z@xA_?r|$7$jgrSo^ky?Kr~I zMR?*Ew{ZdOkj4q#mz|IDR`x{M>Sc(=RQ!06pZ90cdcNvvx{Rmz>Mx}W$8-B*;gENd zc_K?-5;D~ogPBrm%G+g?DB|I{RnV7}Wb`IQCJTcrE$_a54aEA%ReA~@gB1!B>C#9F zt9?gUE>29#VQ6i|VX^IHV?=%_XzUmUQXA7>t|O!z@A6nq74rx*Mwga11n?LTpPO(ZgLVg`cV^#OIFKYyrK)ZDSRS^MEUXZe(Nde zbjcE2YWx&I?nfSxE;}_J`zAC{h+YHkD!1-ZKIc(*@Xo7nREH+m`1N$C|;(GParq@Myc5wDq<`v7=6K^%9j?P*=HrlJ79?wmm(=l=+TjUdc zx%O6whK)^ckBgpo64_FHhE?{_ zz-^T)BgK|N4ZHNp$tR7SKL>evaZ+@vxXs_M*-h z_1!I6N>u z=T!;dY7Bee)QZo~uG0#~cjvzHQJJFGuTeb{$%kA6x@T(+6z|=dBm~{~ImDbIDcztJ zNPO+1VU2wX?3Fc6TP&Q(+XqadRo=!tlwZt6r@R6Y!$Pr%DPekwW)nP<*~3Z;%N{la z6u!$9{E&y|RQ{JaS^n{rd`;{G2_C~51kz(Z5m#YHpl6daM?jgY(mHYg5JmgDQ{e8VC(I zw!@&7q`SrTEkJ0u>X+v1tX0_t0le)yNS!(|8Li<3PCclqqwYWPL?! zl}@L{8(X+Cxz*>qhgG6XmW%asm%|_odS7)p%?@Z{Mr^ZH40#{U9 zDOd!?17pk2m{>;fK^r4(_e9I&ja-b5`NbHwfQ6HZWvl}+%P@>M zW^=?#GSbeO$dQ-%lu~Xa!cdFZpoTu9Z2_{?fFf_q8N4pt#Y)?5^S-TT&;~RfMEMXM`(f9NuR*rW}iTu=N zohDO;kY@LFhPeUAooI=h!!vSpor#z8b3dd;q`;}X7ZIW_423>F!bK5KG3}%es%KUIPk!Iu}_O))F~=Ny;g0IvouZG za5hd(`_Ex&Q9B#w)KxgOxfZV%W76M8IhxMIu~!oGQUZexMZW{OV7~>x&{SWvvvJzA z3K->|3T2;-b5_9HlILgRta{)?J8(YtY#fs%B_F^ud$x2o&gbtS*B11D`lo2VSrO{t zqLF2|XtU}6eF?h1`#nAeI9N%ISK@8mhB1mLIi}leia-GbTlo>DO!^QTtLP&>0{Fzq z=75abhxG{6ZrbcG5Q^R#v8>R?YpA1 z!H!-Ql}j#^!*s$ZPk}c1G^JVb>1@0)VmRIy^A2Wx^rfNK-nlXM+W$Zt*tI(j*teK@ zvut9+Yr#9iF?r!iRH@q%haJ2Zs%SE3{C#4!bkA>yLb1)(Y#iU>D%xHUVm{k zR+3lY9-VOLzPkcT=is%GQ?a&c#*UciR)Iteep%UCY^v4_2Y1^CRS7Ftxd0X@cKm0zqaCO;}x(whL9%xd4ZM-5kS zn>;Xo;U_?b^KP3pP`7SfoOR!R>tNQ6ja%$|*ED++*HRj`p>5lRwn{whT20$F2IJqMZJV98Z6#<;+qQqS zZCROeFUKd-F!8;GD5%v09Xs!Z9fQ2$E9Vc%99V#5m`2<7>8b)Wrfu7+e(3A$ch>(z zKva{-WR_&2EyK!%@8cgMUJJJE3t>XGt+exj=-Rah_HWnN>`0|bvJGXJKkhY5`gA3# z*4`664{pU#jmu-dOkXD3Hgl=*X?@6HXjXx3`^SrKU=<0fH132Rv~2~lZKI}Ptv*Ua z7kNo`nU16ov~Ac?wyhL}L6)gstha~tn&)b<%U;%T1}H45iH41uqG{uLR!`!kwKq!+ z`NXjDwFClW<1Al>O@D%AAsRMes^i$89QJMf-El{Mo35xRZfN+nO1gQuVihDB7a&Gd zM@FguTQ}k17LYwQ;EC%N$j 
z-g@Z`SZ}Vi4-V_noO{Qz-4`=o#~ZUWQmKZ{haZgkJFcgV;1PnKud{wBY%d;vrC&vY zqSvn8gl3%&L92S~_bcXehuRdZqYa2_R-0&K?fV->R0TEbH$>xR&9HMdA6pu47gxb- zr+kza>eLbiV^v!Agj21FGj!=Wf#d_aWWRr$ zqYc`Txnn0{+~=FHV}KLmnN4d-|4O!fQj*U<+S=+DYYLX+VRWt7F5W5AaGFh<-Tt=;AC$ zjO7LLczxNsIRW?YZnTNJASs`19TJ#t0X zTlR}^;1BzQ+sL!fy64Y<&eF&BL~_SB+e7=U$1U2WVZ}0zdXQ%tXC7sA@6``S9`{51 z_{1Yooa!H&I~(WRW7}Zqm?v>c?~AiKw9dvkwRs!PX6It`sE~D7fL2WXsI+@`%osHa z2kg@fyG`S=^sr3t`nS%;8FVh%)cO~m9B>9M<`+s6r?YVeooKH#Z24>)?#gbcGkX%s za)J3{{+d-@@Wq559=iD@=yDA5fJ6ftE<~4%Ya+%Hi)xn(L@YB^Gdu`0shDHCSlx&XYmNrj=%CDoO#5aSy&6-z7GR` z|1QwvVmx)gUYtr%So`r2QgO6!vj}biAjW=`^X3`)WjjC21Icw!54`wE@Ccv zyz|&~IRDo#GLqwF&&K$%Bk0#B;EEIX^Y2fxax{)r9qBR`0@fOoE}%`^q;#La z#R+5#olj<)TRM?9aCAAAD?b8Oq{c_V1seqjhdxZ_nE(vAz^vTgmh-L}1TP%G3Z zScTVbxB$1!_PnxWcL~`xQ=#1? zwJ<24AO_vC2>;KiPdxDi=FgvxrcImvKTERx*<0NN6>q6OTQ!90$nML`Qvj>4wV3kY zFY(OB+aO1y^KoDM7GdFLJ*NNVHvIB|krfcm!;DF9;e|(k547upK{uR>CLFt~-6+dV ztLEVk1O5k78SZ=E%V<%*YHUaVsX$i0>b~B3G~QrHN1n|e{d-*dyNN)_W%zp+K7eA1 z-kT7jil+r>`8u;*PW^eqr|;mJ!+(mG9F+QTOz$k=ORa&9eg8R=i7wj>Vq4)WRx`Q!c!GTgNGX#4k?V7nNktc*Q?uWe?EhySf-s z4M=a+gZWDu=}n%<6<~k12TG1=`1zl2N&~8lsCb3t6pvv8q<$hi`XR~eFvDh>1&cft z7H$PR1C82`+y^6{#u+!V;kN991>K9;?*flp-iE#wFZ1JSzu3lhe)EgZa1omQKF5{) zYa05Ua&0n~y=&AfIJ;DD=(3Ni5!2UOn`O&Cz*Q$-iPv-JbEBna`~pw^>U17FIb6}F1uhG=}y4#=>yP(bV=3OWI)Pt{hF~$ z!a+V|Y{?NX;zo(oZ>^U)!0IC6L>loBUapIIr0{Z4^6{@l1NvUsYCD30?CBTQsevM% z!Q7B*Qo0U3u`jN>;~_recoEnA^dzL0iNKE%R_5l($z!&$!KO;EPpjRrc+OKvf3l?h z$D)MIvLaX1q77G$QhTA9_u`pDaaL+WCi*>%pS1VIxW#kc%jR7X(_C2TK4{8k|BJ`J zlTEt?xpG--E?%Q{)10I}xcv(4?#{cfqJhmcldr<++ecGad^BGFRd;l|YSb2m8^yi# zx~@8_6c3Ldg}&|UaUr?lvvDr!tf!uIb~esy*z5dE&+)TyNIHLJxmOy{{YH#`;u5q^ zzCa30d8>qsD(dv*cR?p-<2+Kq**H&}7mmXywzF|w8`3uQYQomf#<}{$_T|pWwlZe@ ztj6TOWHf{JkWZ%=<{-ds=}gm0-EoI= zS!LhFrpy|Ywnl9QvYe>6sc^;$)U3i8mj)9nrsEDS(r~&JTs!o+q!)JA7_4vuX1_87 z&%HIz(w@8RY^*4J3>O^Qn2V$uoN@A1Wf${NyDBPgq%@LO6SPW{O<6Ei^C?5GS_J_c zRV!=mLL5Wq4-s^ZwD46bm3tFm(;mN!3(%uH5B_#M;ZmG*XbCEsU3hpPxFub>DAok>rew!Rom`WQz#@I&^mKgfl6kLC3 
z!>k=f|C|;z(h|qCZ{lUDoEgBDbn1kkz9_5t014$eSF_$!xFy?01lkk1l~ZNg3Z3WjX`8Wa z%&7`(+sjWRw(VKkw&k>KKRdM88S%Gr72fydYF8oqAhZc7spuG%O;PD8RZ7`5jiT65 z4J)D=F5$$tK1|y<^@%IE06jvS%*nQ$OxwmyRs4|eB>dr*Y1_6Af17pV$yM5=XZ$rj6kuddfyc+h#=# zn9;_$ZL3Y2!*Ua$dSfEUbd~hL@ojnYSK+MW^jEFI)W7}B?K7Umd50I_>2a51_e#!9 z^=gZGG+?Gj0nE|&f8!;nQ7F#nwCdMPsiqu8w2BDB6vnj|FaomZj-IW1zBa$}6y9x1Id{#^yBpFSl?3daUu%PwC

)Iz4u9HIcW$kK5iZE{oBQ8#Cxq=Sz#e|_8_XVe_Af&sdl7=CTq1? zP~{jz_tQ#+VWd)%JEl31QTtRIw0o_H*?+yM2VJa~fXlf6{d&lQBNbe7%c)!lC+EU- znELGfcxZ&*;~%{I2b|u&gqx3Rfx;$ja1Lo6;Pu+X_p#=LX57^&X(%K6<;RoXvatU= zj7jq@$F9Zk;XR=nKf#zd4CgtWjoG&(XSF@}V~cYoe2Z;oE3bGcjiP&hqQ7M97pS`f zYm9krs*>3ydhic8PN~X*23d%TRr4pM#wlmoIOVFI$|%@083gG@-e&r;_y|Q#zT+vM z2%zu9XCUvs84XxXFsq6bznL_FV@)btj*-0ZVZ_8D~tfh8gyrqYH98D9y9!W8QqATrzv)ad5N5Eda3U5F2w;=zs=(%?hrY*V*%_~Q_ z)#K^X{n3;V(>=NTuN0+AF8VPXNo!pTV8CUXMFdyN)Xsdj`(x&>!trJ%Qc6>e)DCyl~q19QIdZ_N&*Sd5*Tf zTW8~}z}#&U9y@4q zzy%Us`m@Ot8ED*OJNOD#;~#fko-9C@o_H-T`e`@RQhuT!-u?&l#Gw=Zgr5)4Gzob1 zm)D|mhyO)~T1;eynLI7oene;EE~RtUCh52)P24ab5|+k)M2vU>X;%?RH4ikmYSwe; zn=C+g#IK+IEqb=%RFZUlnrjaneZp~gN#=^qU0+8HgAHIrnj=vo97#{YwiMX(5BNT}t zo$5~Q!+0jMV6n~YP7tPF{MBSoaneUxPc?Af8!{7C&UzkwlLhFGxQ({$ zFxob=*H(fI*sC^jY_2lZ#`+3L@{N3L2h9M>lc(CyHF@HOaET-(M0VqU2dkFGiV2>j*OXX0_2 zqdtRyj~#$}`gLS3L`XLjiN!ddnOmDH(BOp5#vKuresuPsK+2@!!K|#-P(v!iWd$qn z!Zl|m3())h^tbr+Wk;bd%kSj?y7xF1$B%vp$DH$9BYfua3vp$B>bGuT1(Fhc|AXz&3s~3r~gEIbB<3Ex`%gI?V*F1Yz7l<+}-FKG*3$(wN5T#i$Im>H*B#6@T> z!7)w!Rejp?QNYgKmbWZK~%xN@aH>UG`#pTK2aXPlXBpySZ@&%C{FHrCZVTmC?~C~h z_1Lp6z`bYb9xIpOS9bqZ`18$|;nX|*9{c9Z^U%ChlbH;T^U1{3m(4}raFyvVw6r;Nsgg*^6&dqmc+Uypii4(7Vna!h(Hvl&yg`?bz~(>RN9 zVSThrZH1mO+=6X0fbF)tyqQd^Xc)FElUC+!MjiDvtj91UptrJ37|$ z-Jyf+4gKwuD=U}HMfjn^;X|?#yC2zZ8#Zxfk>n#O4Wy@1De(6zSj;CscRW8GtG@h% zuNw@GRa;@-e9MW(9**HNa$#(v&34ag#ht9^S;3-+xHXH=x-F9gpE~02sU;{Ia%6J7Y z89-@(?h^>%`2fNuX&S|7utkE6VL?*@tS3ssgE;!r6hVJ-8pN&pbSxeW5AhoHx&c?7 zc@S!bafYyTDO$h7kMZEZGZ~VbhnHTRMxUw7IvIa3uHl{WC(xe$o6y7|lay%2ML)jS z;L{viPK}u=S01VH)}O8mxFxvlrTfvN1$Ru5X8{*COg80T)D(U19**-%9Ch~nk7Ckd z?(Ey81&#=?r^d~~8a|=bhr2fL`f&Wnz&CnBF>&5XE33-8as47p2y4(K=XO8?W~Otr zZ9yiGtXs1Y>y0PBXU`)(RCg+mT`X*jZ=j^Q+^xpy)d7cc*vEfV23|WAABHO~)P$LC z**eyr@LWf>8=7I$t|oL^(BdSXzpP{Ov3!{>9A%~1o+kc65EVZ<6x)m-d6(C~H?0A) zb83*Gk*b;M@?TjaYpiUu(N4uAwPU7vvQfnK5^L*ZV*P@@i9ICNx`ks>w(Uk-eWo6+ zrI?5lX?Jb^V?27nDw?42l|~_w|3mkD3U;jn+>-hgr*IPw4DN0XiFygtldI|HGgr9UzlX! 
zW(OVCCEccKx{m(uaA81R^vE{CmN4cily7e|3Fi239 z1oNP9_0Whe@-)PiShdD2mg!&F>5qPmrakeC{+V$KM;82Z-3ZH&jECbZ2lZvirGBrt z((XZu{JN8vQvGWIq)tF;X@^J&s>d{K3Z>O%;K_+uWndBB{cTC&G~7$N#}fA zZ}FGC5D(A_|lNCCAH>F5m`~D3?XE@iO@<(J~3ygm7TtUPZ7h zgnZ;btbz6GIPOci8F%(ZhjZ@3l=+`y`SQg)8+s;254#V&tF^_QDLig1M&HAV38{kY z<=z>%{f=SyoXwYep)uv}x1oO9ZaC_wp6J=LC;g-F_>@IV`jC9n411wn(;8_&PPhj4 zKCpWZEJpibP`}PG;{FcV(HYm@ieXbeH{j~er{K2U_0f&8NT!}WkHS$s{{%|)739j! zr@~kTA38jwLwZ5HLaID)3mW*IE<$f$r(2$NsUH>9X^1x54b}Ie>+pmvN^@X%W1n*B zDOk7UE1!+SmuoQj`SgzAQXGDeI(T^(&&EkozRt68=KPyy;}r6qn+WQGb?ePKI9 z^U=78XWtDNFu<-$2Hc1FTR^Xv$@dVddvmxyYXF{^#QE^dld>0BgyRE%Kx+d94&ZHP zp6kfoW>t_3JlQGI6TF4v{^7`If>Ta7z>km6gd#vQ0ibS&-Z-tDCyamVJ$`=2Ur~*y z<|`vjzCxa!=H>9PCN>!t@igynEamKzyG<+K$4k#TZpm4fqK7>Rbu62#dZA22er6FT zU4L0%JObnAFXue79@-t%$zgw-#Ala*&-4M~YZE^V^jvWM z;)CRT>_ZN0WpL@-7DED-rW~3~iKjkzDBmWxiYpk>ED~tkv%-e*irz0#>{wL~t>o77 z5(lp>3ul-bt!dI=p}rIrP-JrAs~=^jv?Re&^FQ0Dfz_*5FfXx;w+(smJDm-Az>U$Yhp@*ms_|lMffdYIo{vScuCg^iw zXXX|9C23=C(6by}J9kXkwl}9^xjGI1(_O~-I;9@470AvJs4J}FW|SisakTiN&A@sS z#5=k?%DHl8%C=pGLwBV$k$%i}sw9jWjRL#lgd0+}ZSE32Oxp!*kJ#Vydt?GW<3q4b zT$$gvX&K%f?xX*-ZF_aXhh`d7hIOBRfH#~HJvz6tY_-!<4?&|VAis^gmv0nb)d>?e zQXbKJJ;_EhKc)y~j!3bdB}@7NTUkLBezGa!j34vcuz~bRnZ)HFV-n3G;_8uGSNv+r(BoM!RUuK%GwOSFJw|F?NfWFM@?W=6n zT)g?Rp`y`={jh(7%H%CMyiKTG#J*fmsR53>=p098OP{)6i6jx@#2LQD)Xo@F+h;?r zurCu&ZC&lz7L-LNnH(ugK&I5;liIB;@@+SQBd0!y-}JO}ft}(lh++c6I91!7aDkf;;ZG zNEyS6p9ElS=hMS+Agf(isMk}E5O6A8MV>>}& zo`y@FWmWKl^UfB5xgjskinpv(NdvDy<9$$hRdiXw0yo+#;@J3uV0Ik~TVkck_6Dnk zt&##slv`|gCAK@}73ELx*ct+^5op|`%ZQza!4tZOEI+uQZ>NfIRb&iD#JvA?YGAiL z+m~ZMQ|Ayu7Z z_1ogAoB};O`Dzps>#N9>lTo>7`41hIqTOEU3`8NFjWciddwB0Xzh=*yhk2=MHb0tndRz9q z#XMIhy#|dg*rz^Ma4T{Zo{cjwg;|QNo{ckab|TYzlp!tCJf_Wl^>lkxV9vjLHV*H> z+0WT{rrkRqrDRP9SL2EP-EjS#cjDGtZ?)^r=a<^oO4C^Xjp59VS~uv-O*%W~nhE#z zJ?bEVg*+6TET18a$U=@mhYjK6|BPCDPbFQ>mQ(HT@DhMfekiH zk`RuW@`UOA4nD}UoHP-itjgd9>5NtjvT~ZR*rd(IZN$}x$-<6pIGUs>PbS$V9C0Du z$`5BHuebI(Y+pjPI6)R+GFZlm)h1rVw!G@>-3i?T>vt#3C#(Q1I&}?@*YLqVeMiE^ 
zWwS6kj5|BFZ;N*O?PoY6UL21l%s~AAHFF}TgL0kFdCz(_iHms|m_N&xQm3nyVcF6p zSoY;&EMB|>OO`CfQpT4qTY}YEOc64F_z~~miooNlO%2kIU7EOYrE1;BZ{nWcj&i(~ z#XO2bIuQrsLK-Wn`8Jw0{Yu9|5i`fMDCtaB{LzHeSOm{A$p3Du1~kDu;)o+~|NZxG zD@h(NYQP#}jAZ+)niPYqhJvq=~AqT zwk;CSFXMT#Dw{J(bK{wnk3AF?zDv+(=ORPOTC|LGfq_%=SN|3HEn zpRZ2?(sR60xg;P~l)~1Jz0$00IW*D=BipT7%f7sB6~`#ctq*IQvYP!{*Q(We7|Tz3 zeL5#OGVBn3>{o)sb`ctYF%?W#El$3}UR}I5^HX2^DW5fQ10SU+>|Bh0h!`Xk%*tj> z&K7Wth~llnjCV#kUdL0=u0G+qr%Of==Je&VvO4H;WDmz5I|g&Ppj=S5B@Xgu;TSn- zj-`qF>H{pPJ$TZxA=GH=NcY;cj1K>eupI97Ex6NAkv%LqmY%SaP+SP2=%XzKvi}$Xj9H%eVwPGDH)3m%NubGR-f8%c>oP2Qy zROdM0h+l?rz+=y1!jcSs&n6>#%s1)a82}3R=!Dx33V^EmK85~#(Ob}^>)015@0Xk} zRCujWpNxO0TBw>i#wx>8V~Z7d`QAsu1FI73Szq_q=p=J`Y*JAfA}r$4S0`39UYS5- zobn@5^ixSmP-%JhO=_Uwk(c1{e%q5HzkxR96P(vRE}|CUn46vtZwG9dPieow_~d{4 z+S<~>cjMbRPE&fw-l(@_ay0E2$FnKK(qCafui_NIgN4n1i0Auj9I!Q4iovgcg1&8h z1^lbXwY3};=Wgc2;p~5#S>M|lvwXT~;QOgPmMZ%>%OZTZXf@i`NE>T!0W*g_lJjJ! zH(Kq%ol045ThGSfV%6M9+5CDPfZfYy>)gLfXX6xAN3$%Q@Ojk5*W&NJpT&`Os9j84 zjo0ov8>bD4arXy5dcjr=Q#Qr-GZt^#>EVvut~m&ZFva7UXc~Oj;ESHa8MRS=T0dhk z=z<@o0a@V_Z+i;Iw5ptZC}8)db@<|q21(2@3_jvV0^{<8Qj6;Dk?_&i;LBypNrYs! 
zMHhy&HYzD?ZJE9;mooTrM&r2bGHvd^a8UjFSUQ7WsW--FqIJW{vltu*4a)`mdE=yAxG5em+)tDHv@RcTVocj~<$Ik(beIV7aw` z?dRg6QY|z~**0ZWyJTNmXp?Q5O51iIZQGJ*ZwK3U2KH&N3A=98gnB4#+sl~suRdtc z!~Q>*^e$m^B?qB-EuNRh54>s-NyiPjqiyT4ZMMyJWCg0%Z@LK&EaS|3?OH3Mf!8d< z#77=a77)Aq{8F^rrIH;RWp@g@F}Qt{ChfI-5mHZ7pWiQ}oG2QDf=iux{3sWyHjQHN zZ{v?_$Wkr+33jQ$&O<&f0o#lj5$4QJp+2p*!O9A2p-~fH+&sdV zpY<Z9>zOIbZmwV>$x*o6qWdZ-Y9|bx1Whl zw}(hYUdaHAeEu{Z!$ytjoFj9yF!Y|ms29GGVb#X;3whA-}fj&|C$ zD%FHP0tB97yS_RRu>}~e6>Qo zHQ-$6M>_e=+wYCS5X^F;FXuS!>)xQ7CTY}1v+#rqjQ-V)*e$#);Ejz$6SAy#QT)&`1bOV>zN89AZ)RXUSz&#`yA{ zO&e6h*S+x9w?Dv|Gn?}q-q^P&>ka3j-VLJLjBEJ|eEQZYlvQ@a-lHo?Iu!b?Co$H|0U27{Nwb*%jF*QtRTIQ4;5XnpL1xVZPl7`54~O3{DNt?1vc7m9P% zriz;P!uU_d;O6rWD0kefv+ISyFWrW-k8G8!C#+n~C#xyfdw&jIx8e$GhYpj%d-%!C ztq~QW*B#?Arp?Vb;H+Cy6q#VJ{)2Jbb!Veh9WMR__f_PIw43w4SnzRnQlAd~O^0k~ zKDHei$kf<&5Pj=WH{|Zxoq>}|I$+RCPvVp=O;NaJIX>fVi&1~NonNaTmQ|cSkDtM< z{OfX#p767AS`FxhdWAY0=dNX=+&kkbT-aiXXn0}a5d(YwL$TApV?D^IE5uRf={FFyfeMcdl-&tau^0Ze-jSp z4$>v_Cg9=QPQ!z_#p(Y%&Te1c=v0u)_a9_~Z{F_H<1d!lZuY@fQqK6t^pd{i3!ty5 zZ<`aImXY1nYqTpTRm??~W?wPPWE+UkmkgDYucvcB=I@z1gZN7!Ial+sYRVV2k3syE zZbwfY=Mf5LWfmQCty;5zog+FCEwEeNsz#DbBZ;FStW9d{Ok~cx7;&tqDmb9gk>-+FDF9 z!T1vuW~p2G5iSWu8g;1GZU6Fcm^E&W{J0a*zQK+*v9pO8i(&O4UR0vNR2ve!Y z7!fX=7Pd8fb-^`o*C1ZVdG}Q{kk{W=)toie^2BP6YOk?bdXsiM!YnzoX8|LcBW(L` zgGLQ(&CBSMJ|GK|bQkL;l?$Ww(4?8EITy%=I#Jac4bYMZv(NQr^!-e^*pdY;xkm`Ci**1of(Aqm+gEs9|+Uh9tO zXd#>y$nZ7BV9V1LTAuDD6VWPRQGGlfY$SZaNld62$xA)i90oN$}(e-I%Y zcg%hVbC#50myPe^1sjHx;J|%%V_G5h?tTuRmp_2<&yU5I{W_p)0FG>)x-AdKSDMa@{3Wzq*p)sM`?NY zozXz7tmw0eZ`ldakt2!XVZ_R`On>s5Kwzl0!#0xbQn=X(TZk!F@mYxxPd?^&r}e_Y zJ2S|nqNe-fH|&a6ya7D&#|gNlS3}gvot!z96IfXMBukW!ykD{eX0s9-A~=Ay?s*^Z z`A)h&@cinV;%1eKx%4}w12}uAATguuOkx_A(q$TKb(ICkkXtC7mJUZW!775s^t}J| zYM_V@I4{D;vVNF9a|UM5`-CmOs@)%}RH=@7O`4-c^M>U=R+tj1s9|Ree7+2q%%6gp zvp?r*m|Rt(EhTO4qG6l951@?K_PCN_!?)%Nj1fhWyb$Qw%u zY-%J#UU9yl$$`cYo0!Ma&)ye0oiCgS9`OW?VTmSD<;z8R#X34WFEZw33*W@Ed^t%Y z(z-9^haGe;VfJS?pro<#R1utuuebYof`XT=U5r_AeYc7hj;R~I`r5in56YTyM!i}-Z|7@cMVtd<0^gLQh1F`LT 
zkM*`3VM5a$AElXHOWG{~YP$LfY5wRcPLev}=d^9jY1_1ekxN6?#Py?Sk=Zs*{FFpD zan7e$SI~lspNctG?fOwP&MoCHU6itI+7ZbA*v*WRHcEJI551q+(L;g}K^8VjX!6Sd zGNDWl>j)^|c1}G&s>EGVYrmKmY}<>NyXZRX*|>8dtJZJ$0<+#6g!oMTimRT)u)(LGh8Rg{F;nIt-Iyp* zo{%P;#(XVFI#nPp>M{6T>7$>1%C^W4T4_i0cX1I~dbc(oDkNz;s!DTKsSF80_2fZP zsim==E+Dch!CLkjpojm-W-A`q@J zSSf6*!tq7Gl^j0(1zw*%AL~wPX%l?e3ey7nzfrO;OI&VJ#*&3`B2wMn3J~9##qGkI z4P)`aM=P+`>hWQLxd%#j=i3rw)^MMW96>yQ*|T55`%BM7jTsXqKG&doqL{-4J9>-J zf>c>MUit^zb$VN@TO<9-d09+2@+{-Sx9U39j4INiy1Npb@e7Sp2osJ`t>ZXl8U{b| z7e{&SLJaLa0wvs|9T(44MT1`J`A@unWygsqhN>ixP=SQ=HB=- z-du1rj@?7cBH|l)GpIxDdBt}_44*#*d?ll-tPnkp;yXxOh(4Cb2km5yJww~|`oZ|~ zIeTOM>WUVN$%GH7>WII&3|;97XX4_6iv!&Cv3A{JOnU8ZJRBCid-ofPh0pznWesR? zK{|=YfcUE4McsQomMUFaojFeA zYR>hF+n`?YSEc@%=!)v`yKY-M?Bi?7JJ0rO(ZKg}5t@6MFM0BDT;KMzZB|-ozlZVa z1AYDV9X==F;_+ozBL2^yjng}8 zH*z|^7}nW1ul~o+#wlvm7vl#HM~CxsE*f>% zpU@H9s5cc;rtZ%DdU74nskEWVCY_gbJra1}ZoUfpHYRu7CE)Zx`;wiBBR8*G7?So0>I$^cQWL-iYAJ3U-W}4b8Uk^>(vu zqLGD@LGoV{sbG1QZL3#-ZF4+QQJB~!l)qR>w6twAsZD4JD17B6s0C=-YKKyC&mnEw zCfYXP@IU+rKLp8qMcTFnyCu}d9k{hQ*tV(DGq$Z`>8^<_Xx=Sj+a`6(*tY$e*9~^b zqz&?kD^R1h+cpUi?@2bXWZa9z(kM(^U2?_kkHpcvS{R=U4e5_R86AFm2CjROPY5TD z!^g`{flQk%I*~X|ePjQMJS`u~B;;TTxh;r5s|1%J|NHf8Ks8|Plv#N=#xi+Zjiu_v z`ioU)#K{~N(Yo30*ojBGsiE0Dlyo9Z+k#3v=w9gVi5xI{0V0RYXnt81fbnnPv3I{0Dr_Mx^fB6HYlAm|Ru0&AI zo9eZ|u_fI1!Ee1xoIDM?P95oNy6nS6CzHnF#?V8kwqq^(;PDPTCq#ehn_}kuS&;DX zEA&JsYl*{t)Emt-PBB~&k!I5maRQGMJo|Vbr%al>5GS;3X~!FhQ}`F6;rlP6BKKsE zSb<|OBuYVs=(xrp|MLxNAk>8_E9qfTgqW6&v(f}s+p)gHh&peUGXz4cg58faS_~%K z+D^zXVjBy!hsZA-w#4k8nmbg(PCIa;ik>!$#>8;`ju~e*9((__ms4~3N&L9vq-=&3 zn}U>!49Xx|xt7;9(T8*-1VA~RE*7*pW z(T4^=awMX#Pygg(^t|RY^m$-D@Wupuu>42pxKn}4=)xA={tf0=uc@L8xI1dfuQvUjtSd_*oJp?6NUW3M0IveNVtJAq;g>^R0 zdtXk*d0jK5`A?pW)Be0C@ZR%-&^;^HoJjZnczTR<$_3~+V9!DMZ;fi`qDf8kbw9TW zxecV^08O`X-P{>hItLDRg~7hlI|tVzm$f%Ig;RwwSyZydj9jQ`w*L_UG8AKGE+b4& z_X$!S%*ZZA^tbV!6YyYIgD!2;f=~Bd7AFuJ5%QN?jD)Wb{24^$kes?wJ9He|$?WUP zobBjbRsXvBCRA;-H-2CofyW*hfo0;y8K=sxMQR)KlC_3Vo4A|!&8Ke;*d5WlK`m}l 
zU<&bT@?7Zdmhu9A^P4+-a$3?2d-C-=las%-TXeErNSJ*$ZoKhv=fe*rG_L7o@j9XS zAqVqV9y|VoJDoN%ypg)o*@(Kd7;G02Z2>r;h{EP!+&!L2NQ#75z62jGX@5=2il z30-aW>aWTl_^WO_4>FrW} zHbXnOd2Ntg<0adiwrxx`8!X?Pc3RTT7$Y%c-YW0LvK`R=n3Dn$cfoRDOM2I@6@$UV zwo#X6(}In&ot@NOd|)Rgsxi|Gej$f8cfZA}ChON-e0Kf~+yv4`iu`jFsLmRNXm~+{B zOnKqXfZYyz)YAgCcG?G!c~%H-JC=jgro1*u_bDTzKl&Z(WYk)OL*~#8}Pe(d=dKC z&Rpc>pJ>a<>SI5Cq)_hOU;hsteZhk0eqa;Bk(8ci&RZEwl>&5bw#F*fr>#GSGLcBT zz;(IfnAJ+I?;de&Ww7_GhCbt=f$B=ipmE)>y^K>3g}9 z%_cJQ@m)|{!1*qC>(}#*IPE6*E;`nY8jX4Cw^j!Irmt8qhq=vVIHIt1vIbLE3ZL!A zHm49dQ}arX#AYo_1_=>cU?x5Cjd=^zTaM=$^Ad%6NoYSRP$~}#TU%E8B0P8BC?hOA z?le69#~bkY1OJ1^AG{WiKXe_2{_g<%>>%g+*I$gmLgVA{%&+8P9+9W@aP5Ky=8bqY zIivZ={r4cAaMSIB1DIeDvh^msiY|iX)^V$Liq7=P3^#EucW9rJ4Mp_&f{x3@JSC9n zBJyRN=}8@i`~s9|5s$vn3SD8zFDyC7_OPN#Hwz`sne?a|%U{@$Ug5mUYap+Iyavi^ zATFPlPx&4|3hTAUMGuTbzuV?x#P7WvUC z;y-oyAF$Vui&3IE8(pMVCIuTm#k05H~B~TVETIo*%F%^>VsN-^UOp)xGu!-EtL70@07~d+4n|uPh zl!>l_$d;ezqKiRe&WP$;VdZB+MZ~V+LqHL44AZ?94^bm&C zO4a^N*KGG%w9!sNVd%B^N;#@De39^y&G7zao&m*i*LFR#ZQGw{+w2$|@>ORFblA8M ze@@%BqxSOLVkU~wwpkw%EcA7Ot$&FRow|K=vTceR3*{9Bv=T8|*ciuLNZa=NXxsM1 zmA&_79)Xz{ky!MfzV(MJ+olhf>&+R{T2aNeO$%Eln;6ur$hNh%BQdN7v;suu$A(yt z8S6D;!Fh3UNc9qGi)qo;1>EQvy3<(Mg-5$%^&N4Hqbk_`8KaycNt^^$VuGNU)t8}> zr!8>FwwX>XHKEQ2zFrLkt+PVBO<3o%x?(+8yRgbq(CS@vr5f^G!3i~C>E1%*P>t&M z>HchVOe-tc72QtX2d_Ui4VZdAhQ3;gpZ3)5MDmO`Tx$=&q6tspcP|G$a2NjG_FlRe z2N=D=LjPE(`jAWIiCP4e@LF7zNanbsP^n%L3mBCUtPk?H(f&vC(`L`{Q)a;6GY4V6 zk=LWNmea*-&ENPj9=`4}W6SriaO&}GJfCoXE^e!puMr%m%ubHtXy2lak10e&CXG8C z&H#@TOX2L@m=E10u=Qox(Ug9xu#7g?5S%QeWV%?MSWcsl5kv5P>YP|(h9KGkJ8ncM z;5fzVl5|e*V--si%(F_?-^AYVE}D)v;f_rMGMrfT^o}oRkWM- zJ5h`vr1TytEmnbb$P_%QI0;e~)FWF;6Nb_qCt^mZe0?}c9|$&QUX+V1(i%yKI|g?Z z#|K+N=9M%)hP_b{uGUSuo$}LOI4*vl$nZGYpx>UeqI=%Zq~ecwb@~bT z$=*8|e*D>iHYCa)VdA~|LmpA{)1eM(c>xY>q4fpH1lE8uqKH@ZeQ?s*op8%TW109eZtSxVeFt8IuC0qv zmAD#{t@z|!{P~tE@zOjo;K?v&^+(5@HHadAzD_5Z%&l64*>mTidYAxJvK4ZdcjJZ) zsI^B6?8>ie#i7rl*)uV(N-dtLtm8N&lmycM25hXt2SQEjqt)>j;Gnzv;ti$_yYV!9 
z^#1j@qHj+$*?A}1Ia9Dx*yPy1KPP%G`3yzv#(kDH_i^6)12~ko97oLLRV;m}m7;oY=P$e)&kSZ9mSoZ9-o0=_LI5 z=Cp0Q#B7@_Y^l#gW3}qD#I{wY%*n^b$YT?Cz}4ElId3lw3O203#9{a1=eNX1*lF9QvP!4bE3j=% zlC~;y&sAu&$+2w*wQWpcnaF>g^>4mxhB?}{X|in@dsIL>Z>%hx@14-LHEhuoH8>BC zR6Nwz?mvb{#YNj@HxX_8GPeRt*H8zXggYL~pzL|M?IKdFP*WMt{id9mbRjawP{+DI zB_=uKkoUem4XDm^>DQ`26=Oh$S`rh&BNF1obW2J4a@_gsQ19QNO1=7xox_!pLd_39 z3+*1e)E0V24Z0Yget0R)I=LHmueGDEF>P3hX@7qdzq}_N^x6*B{kSvRm&Pe)0LmRjR!9|2SfcCH}>oQ zKRBpCqJ>MNHt} zV3UP{61#QW!cv0fsk8RlE*ChR>^u)tzC>p4XWjgQ@l^wIFIprLb14kr{$2Lv2OATu2j|T+)tWx+f&bv9HDCgpY_LRtzgH#5A1Ifa8xI*hlOj@_T3_Ca9!_F*c)ZDU9_@=G;mfdS&I}ZXBu%B<-fGV{cqGh9M zc>AfiW2F~*wdE7$n74`GW!<$?7ruS*tnFz2{oiKdjM9DCzBK}Ow_whEEa3Mi)~HXc zT7&fk>+r!lZ{XBl#JLD?*WEwBE^OZ#+o%w6JhAkn4>0w8f^sJh&YES|h^lq>z@C&< zF4Cz}`8ZU9hAXO#I&rrGpRcA8bPR~Oi0g5CBP@Z(k`qT|i!2ndI*ha-A=Yc)?CnAp zAUf$TmsTL*C@t^u8pvxPuYoOVpm58{`P94y@*4P0Y9Mn1*X=t_?sVTyr`Ua52porn zn|xU0HCb!~pN((u98iqL;feS)KlT^6VDa_1_gU^T!c;smpdTJ#sAcQcSh;*EKAvZs z?5Fh9n=#O1%)oZLd6>U8(yjThj_T%;Ipav6pV z?SmRSwZ?#_Zo_~dU1^igcb~iweR!dyBlfMi81GKhs582b_{|GyFKdZQHgnv2EM7ZQHh;ooByie>(qQU2CoGuB)rMN^lcN_SEK?m6PkyW{q3O zg?KGNaX%(BfEb4EwJ;j6275f{>i{Q_uF}j|!#PXx*GSqms^7^Zi*F@K6B?rX7H-Yl zKgC|%owXP`8AKupH_>I_O8RMkIg(6&OYmBEHx;iY{jWir?PTjzeT(`)>>}Dnqq>bG z8Ka)sORUn?wlTcnnRmMQU8u$kpJlXp;}O^m*@SpH@sAzh_(Zxvkj|5K6|S)QDv<-v zI_v{8>hNq6%{u;ilkCV$-4wU(`!7q4$;_u-mIf;o@}^6OwI-a6|GE*Zjg@P!y=6M+ zCEIdS<6$}VT=H6ZL*5NO9-=OV!TOWAT%?&gE(&b(o?NmgO33+JR!JZY`)ZTPEbSxG zRB=ij$KLRth4h1xw(%-c0qZy1eJ%yw!7fM^#@C0Vhz!XyB2dF#Vxc(m*s=X5Td~v_ z_3tD-2o{33U}hw9Ul!C1AO@7OAq4>sI4|Zr3+^1WA8Zo}ZVO=ziDlAYS**-{{0vHe z(&zVdv{a$j-JjSoQ^F!~PCXn^dM|{n$)$OF!*^jII>pY_IL7|u>bx`E+R=c${Nqsa z*nPX`yeH*}mi%=vJ4pz7%*_nqrM@h18toH?)7^o!{4GZSiyN|?;RE8`KhyQ@G_Ys1 zPfy39xO)A;-*PXtb$%!iYvtDN&hT_SC43?m<+(s)B`4jF=21gO`WJmMd^Tu$T*sYZ z$I+~xZYc$#b2`>Yv3o`KK+fw!<_$y3?tA#=0J|~_HBc{f{8nTiNyqb9(|II90UtL+ z)_V@PeNOv)I_XO+MzOX&bk1m0*GA<>IXV|kuc?;uI`VTSxL8xAipang@=n3Xrev0G zH%pe>ZdQB`rDxDDqW^DgjCauGLX!RLB$+iH+IHU|z;4w03rSoo|3{4kxlym?C`r-z 
zrh?_+OoZQlZ1B&e^Ma4-nVTkcl#h@95^#%}Rd>&sZ8q8Bg;wo0!B7ZKRMKud(H}v%PY4Q9 zT?iA2YMB?b5Q10^Zze$%hRWym5oz|@aF)2d`ZwwQpVpmEFn{l@4eADez*6J&D>NFZ zIhraubD2*YR7VTVPo$(QLOd(-G=6d;Q0Min zitfls$c)i5Udx}5y3;(K3*xt^L63EW-{a4LKe2W?zc&k1mcnu*-7D%uEi&|>J~)mD znb-JXwW@g6TL*28350|;;<|ays0hxyP@~`NnsLI}b2m!f=Kj=`5zw@lp)}W`xcnOi z-6t62s!vjTZT#bqFMX#PLPq!bciH)f>eZAj-w@ozIMz*UnV9YMEZXs@@Qo%cUAFu$ z=ub5Ap}rRIT7#K_o;!Yv8+@Sg7)D8{qHRPC$FGT_(mHAL_KpP^qud8fb?X9qkRJ-j z)Fksp7*i7^S|sYGPMRyP|FnCk=e*f?9Y4?N9L~Fa{3733OT%Z)c>T%eq7!z(T1D-e z?Rksj0E7eyWh8O{42hkXa8C{?_Fs6Vk*i=#RFioKTw*@DX!(*HK+p_N0=y8i1Qc1$ z0=Xk62eKv{_<*>XuO%J`x|I(K1QNur=2_;u`}^B%N&ULLn}mxz+5BkQx{d~VK3uwk;YD`sbKeXSm z!*N&_9LDC3rQyAU|Hl)>`FYNI)Wj4GGbNjxj9C0-7P6)LHJP?*$Y1+8M8rNZ{e~9ku^nS?ao1N8!O6#dA z68_}f`&c&8k`b9_F`ehhyS5|RBWONTf8bt`%zjd2HKg;9!_aHgU0?6dFzJR{hBYq=Y0 zlENLS0Y?;UQik;Z=(-7_dwKad=H>k{gYUS#FyZy!-b0SIeNAdGNo;ig zIeqAH<#}4!K>4hme%rKrH;^~3FaLhJy4MP0>w#m7T5semIoAzv-`=7gd0G2S|CACr z@09JCZFE}z1NHf@$7h7%mygnYo5gT(b&IR!Ybo8!x~u2r0|I#_&lLOpgI&gzY+t{= z0I7{(Jp;biliKO#Zx6Co^{|UUo?X=h-RQ5(>p|BqBLY*Zp)e$<2C|a7#_ZRXmV^A{ zsPadlK5Z*`A2qrtuJa3zsCgM49z=!EHCaV9RgxzQerf<=cq6;St;lqfL43GcT)H5N zerL{aOnn0w^f^WPlk85Dn^5MGs~dY96-gniGmoQIzihMLLnxizXoYOnF_#2F=8}qR zbOjp&XV=T#w?6pSZpU0h|Fj?SZU@MDke7aw0iIi@@nqe`6ov%x?_zEjig^~VRS&#A zUE1a|hvY=@SfaSD*gMDXouksd-kjj!_?%tm3k~=LwW(huLT|f2-c(w21x(e6e&#l`gqJ2L0_ZJjomHu8}f6M*s zrlwGb2gJV+Qr8QHyowpt?6O~j5Vv1WB3u7)2suO+qr9cO0&^SfD(xZ?-ema%E^PEr zKOWwmjyE4QwVPUVok!SxTtda|1>_!yjtiw6KA{17@GK-ggW8y-PkcNYG~5v4>ub743ClK(jB#m1#dlxnEXZ_3PSg+GFVRYS_U- z)Z6_oKAYS7`q|rt`_7J+9{*N{cM9UUWqf5ub@|c5rV>CXwo#J9W9Z5uq9{WhP+hxS z+xz0&T(9g4?B-uO5*FdAzi_dhu2uG#-EOaGw>ms@jE&*Z$X^b>p)k#Dk>MV<1Y%4w zrH=^$M5IuN6%2UPUe*q|I9$Z;TeD7-qT|!&W{gQ)RyUk5NIWzkx{LgtYq^UNflHn)X3wY5 z2a%%QcQa(RxqArU1p}$>zI1&GIn>>$)WrVg!5znv`&iO(r|5ottR|R#+5X;lJrN|} z)ULjZ%?LHwOGEp{9bKAg0$k`J;@G(M;|#(2ytfbeJeD{F-tf6p#a44OoN2be(eNZr zV!2d_3X6q>U|V}*O%&i}1$`)<#tZ&GFMykb3Pnc|@-R$oGtFM>tfg7|Gz1We8dO|p z&7G>GiIwIa1!mb%^_5dBR)^Dc!{)A~MD7R4l{!m)Cv(^SrgQl2u8fn@25L}^B}xav 
zY^(*A6><8nR8xKjD>CE)2&|Mk-4ApyM>+9oDQT z4Id8gk6|BA?w@ZhxRR!fSb^Vy4w$)LcgVWWuscdMd1Je689vf}f7y?I?qq#lXzu>J zEM|P(#I4wNhhMgL?tke2z_n*ntY%a}^n7&dVFPXR_1GN@(zJj~cH++l@oHNc=hbe1 z04bvVs95P>ADIJA{Q#1FQ3E^3%xbNTfx3nlQo%5wR(6#exE#`&YFTDKo!CoMr8|ER zRN@v7OcINOs}%e&N7`h5!;9GngqapN;Ov~2a-sxTwg z#3@Bn4GfQaEc6mQf8f3ibHRD$yuC6j1J@1Mnk)%QvLO+UWU0`-1v?)OfOOo`fwT_P zmjF*c!vX(DO29cN0cYH{++q92MudA}lnaU5)*%wb8CY*^i9x9xylfP^^M|y18wbs{ z*O%Czc{J!8LH}&ZhbY~)@s`+?VE7giq?4+;UTs^)Gm*xiS)Vz z^@{cC)wg|}+EgglpqjT~FJvgji2WjKT>*1NoOohC+=;L`hRb**(91MVD*IT$H8<8e zhLgiVuCOLZPKo-URD9kqU`1#JxEeV4{`D7`d>HYH8yR|&iXn{8jl5=yPVj^Nltj_npbv zaw(~)a2xBlo{1(wxCse5f6Vw5nfByGj&kfQ>6`de9G1k*jIm>yitKqvv8zFZF6=7C z?+-&lpdnIwiPZuHinw7j&_cmM-#!hYu{8`r>FxqxYfuVm`FU1T@-$M~m(()J15Qn? zgBn`>N>UEYjRPKqvVOJY+=1c+@$n9`BzvKMJZUVaEc@>2KX63%u_p>&l`YrBaU4by zP{W0vvc#vc_b$&v6Z&hqv1^$s;hb#v95@;LK!=$cWA!7Y>Rag~T~LVFe5saHQ^3=J zG;K?h0W?D5Z+wS!0hwMBUsW&>mk>}*AqO9YQewn!oDJ$f4qWK10)d-%Irq}(u$0X? z8@O}ks&mfhlgs0BXX9X(yJV)4l@8fpe5@&x7<+CWJ*o;dWCmVv`Kbx>tO`OBQxIUt z2Stwcv(@~2h{?shic+M;lgfx67Vf(E#cjoOiVqIZ~{dnvToB{S?@m?xEh1<(*c8_M_C%j zQ+)BlzFyo}qx290y*4brx%lS-Bc^f}n;%;F{3uGItcOA$le1JVF2Wje%3T(aJ`5;| z{w&2c#2=H$`RuS%pvaNgU1?`v>^lqvtE>uyAK6dV(mK^n!%e7 zWmeEW-P^pe!i^A!RVgI!8eIdhmjkr|Mke$cp*Y%Xfy&{HP_j5Mp(Oe@#ji-zh;twc zs@SmV9n^vWoX#xgWl%?yVeXR?U zSakOE{s?rQm`v~_UBm`(SIMEktcym0afebRn-w)?<1FO+6}Wk7#wY0}?AaO`uXF^n z0|YmmrpUG6avr}nLf+9QD~~Ww)bt9oSRJG0Wc%e_Cx6V=Qk4c)Mt1Wcm)uYKc3b{# z43(cD^@F`n{hUzU=LsyV9(#TYv-;U7_DNq$<@PmgQ?gHRa%TwEeG7U5JUiVJ!>JWK zK2Tlt&e|00sJtr1JY6LkG1Y}=DXn3M?|I8hP5eP+wFOu(W0|7wK1+Z8b<}9~?DqGM zOU>_y>?dTY9#HoiBn<;dozls>q+&m>`k{QukMquvk&XK3K?LJ55g-&0k?HLL2)~Co z~rSV~feQK_8!R0)xqM=Sv4Ba}bI^00vpWU?q zwia%_w<^C|`}}6b?V(vAu>ATUl%aHCew*5P{Z7j|2UgBc{+vNYpWOxAmyBp}#;$|;}Jg|Vz_ z^v#5Fr)fa8QO4Du4uTe2e(a!3Jc(d|WeQaMa&AbByzoSonKZXDSS4HmZk+!F+Nw~^ zWyBEyX_+)uR&#E>I&CjC5|$PM*raWDKv5mVMR!JuYI#zYzQFi?m}7*_DDV5P#b&vJ zt8@pSi#0ouzu)lgPmX@c#`3UPhpB@oUVXuSPr#YjtT0Y-&oR|dSMKNkzy&YGkAy~A 
z*AM3!qh{^Wn;Bmy)zy)cyO7O3oCq11!Qq#HA{tBS86dV)$4eI9Mqgzzs{2T&7E3yY z00~ESb_IxoNoE}7Ujr6+_|JQ&EowPBWYO=Wx>N>>8Es7wUd>~5^T9=Sm0%t+$o^$M z|5BISJBh8)$$jRWcWdS=ss#3vutHeEj57EtKS1IACjxE=y)ay*MbjWRDiyI<5u6=xdXdgVA1v83^?(y*Czf6X{n8KL5rJ9%cYbZz z{7Mmm*>TDtIgW5P;!5E9mv3d>nvPXS7pq6A6>;|-qMn55lt}EDH#)>ysIY+aeL;%* zRJ{y#!!pj`S}3~L!Ch_&6y8d>#;IZaYLg7X0}=9Ot-@x~o_ zWvl=LWcDv$Lc~QXuHsoC5i+Elw{&R%1H5w&REVZew|>=kpdF}bl>FQDgk;Cf|8lwy zV%}%1parV8hx=_KzECO|&N0o!Q<^*Q#!nvBqWGsX@;RGk@uxB3A^|;#*kuz=Q!`HR zuH?;#)DsdMF8H)lUpp0M;ZRv&+Cdetg@qqngLMf=@iQ_6d_Lw znRQ^$UVp^uDO20IV*?m(<5Am68>>KYwwZe*cp4;YPRH1cGf$6PDKx0_jaaB3TbD=; z%Piad&^=MqYwn%{Z8!(co?VFh(e)&Z%LM#%$aagnD8o+vEJ}_wcKolf3L%%IPYT(V z?#}$Qp7@gX?-x_ay*npP!NaRJsa3&dfD3QOJVy7(Z{LU!{jLH>TGVbP6I8hTPmV?n z%j!jT$i`)lp-1%TeiVg%_+Y9K>NA6g-C@O9=KDp6Y2L(ZcQsN$8#S|0Gq&+2>Xoma zf>f~;VXkSRO*O@WGUqw#Ilh)aEC%YKZUZ#5kVo#7+Yd zU`_dGsA9#{=f?bim#{a-7t+a|(k(M*x&jo|3vlswI(ru_dI6;CI{ZPw?JA$|5Hb%B z>BA7ZHtyIo950N3>vVRwWEAVYBl4WV0kXe>wXIm}`z{%>f2R<|j8uYM~K_h!RaysB68WX*+{ox#m&O5IH6&Fj3fh zbFwV*DjDcA3;83-`p7HWGO!dQuw3z2) zN)2n~rcL;6l-97Bd-L$>hLxWO{9lv~XG(2DHCiXU1t`=9rBI8B#-pziz<$4hU|vRPQM2by_769aYpkS&mtr>_7)8&i8C%TH1!VN2BB3T~T9jw9YRmFgzyD%d= z-y(cq^Fx869GRY?T4E?8T}ZPMx7fskvVTuoAnJp^x%Y=NUXaP})~)j}#Q;CSJ)#l7 zD33#h5RRxIllBVDHQ3a|*5)VbTPUo_*nk;6;WKpwDGW!TR54XYX(26BEhMYtdc6zh zrc5aMb9-)(P$et)xj6H_6RK7t7Tr>Xno-2a>yFbAu1PzT&@;&fzYw5s>=%^eJGfNI zL3)$;&zT2?e3O;A;h<*K%p^v#?O&~E4N3nWRNMUzs`cYp82vys58w30!^;pGJ;zSg z2$opPYbRipRZ^{$`YdEWpkDiMs}!ePXq%S@rbDee#^D+(XxCm^Sz zIwOy2=Du01Op49rr3J0XCnr$%w?biHe5&?qnspo`&r2_2R59=8={~0SF6L0@l~Z;4 z!26;Mck~Y2#G3U|Nrf9f!X3MDRcXH0QnPc|r<*0yNUX9Pv)72@-*yT(2OCEUQAm$% zvLws6W0pg2bvXe9a|P@PU>;wETwdLC@{%I*Jg@pc0<$k+QkI7hTByp?O5Wk}FJ9gf z@&##PmZffwMCt&o(@Ff(BP-F9AyjMF$iAo&xh!}OB_)V5%{t0MZ0bdeEM}AsPH0=@3?b*Wq-!bv6FgASTe6z2D%u4Vg z3AP<<7@XZ%P@J$!qL?Fof5BDPc={FmT&3F&fIGH)u!B5t1u0qDs6Hpy9fzfps0~&H z8-jrVXS|Quyd@}*{V-u2;jrAd5e)#CHL*#Kw!r%qdQ$RX#&9*n13~N2vfEVxu~*Ru zQnKofzAblR_1F;~USw(NbR|p^)+y|2g>|g*r2y7F9=E=24A>ntfmf$DLNbTQb%^4x 
z;^mkgs-%Vpi8-R%2;0IU!R|;3H^ofiE{icEi`Dg>qer$eAoncd4hc?d)}`XT$?Pwb z@E+Jv!@t80o@8V0AZhZ4u$E+&BUnh>;3K9#fB3Z+tP-$^L>&{@-a!02M_Ff_!Y+M%a3KhB8Jb1`t3q4-j zST#OdUa)>uWav@ie{Ov=*{&GU0HRA5-7r0O{WXvnHGg?u{%<3jc`QgGTa-gqfA^}f zPkwi3@LVvh)f4%bB33?otkM>YgvxG*Txv%-8ys?@$luF$P#uE-#g!9y@RFOJzD-%a zGCI-&=5;4phYh~XEi5|!TL2{eH=$@kDi(iG(kU?Q4Q?mYxg<-n2c1UXg%RFsN5ZHP z-kaCp1U$%$S!M{*T(E5dwy7c=Jy|P-++MJ6LON~D>eH<3D9%-NqWoOm;*-a42o4MA zr=hepMQp#mVx8(rDLuev#+fc62Yw^t7E{hlb{!J--`++di2!vuw)>hHJe zZN;Z*clV);Y$WBYyT0E5d{nP=s9}j)cM!{lV>mmR3D{CCYR#h(>C;qRFog)Dv{*q; zWGOiTT#d+cWU@0gazhwy(+H*B0?!$b6Tn(8;`O2wF0u zen#`q88xiRq=z%^7bUt~VT&m}W3iE#qiN3Lpm&YK*%_BFD|7V*1W=~mEv@J$PaCyML>;wA|$-(=RYAwM= zINU1A)E6AMHNvYFzxd30O#?{We$t7^P(Jou>-W?|;nWtI!(mTxu=+7aN@etZXMyKr z9;Q>Zf_whc+k+Kuggg6~btgv+hVn7|^#6l8kMl_oHBx0?=jwRY2fq=k)%yK;?PG}J zYOv(=Z57>JnvB4oFkq}n1+?@qeivul-cgWVZBEnaP(VvbGbkn##1K1{g_PtD!^_YZ z3DiTQU;)xppfN{1PLtj-F6o&}YN4T&Z|%GVqVX(^zTmleRM=sqCfyFNvT4c5;oGaQ zRm25WBVDuf-voJ~dhTh1{#4=tS~}Wp!{6~kv7vo>RI>oyj#SMtyLjdW4gW#qYF^z2yzqo%Sc!BM%=wd_@_z(~MYLS|}Q2OmJ9 z5gCAPu_gn{g{O#yA_$2-^Kn@zn*R=6dTPMP%b&=B z)^8N_sNelsBiPPqseZj*aB!U4hFqo)h$^9f>$|qc$Qks2M$WO-{^aTBljDmBvTF*b zN$JYN0+lWWl_@s1{g)h@l7p*I`b1M9%m+>(9Da;+9V!1Xz79RDW1o%L*NpDzhPggI z5=6a2xKzpoe)fXArr?#nGdiM}at%wS0 zuR?cmC}A}E5Jl|;@Sdtj{$9skplcM3z9KG>)h|8_gbe2}iC1j^a%7vg=!5r+zMk-b zQ?wy{lShTV9q-|I{Bp_7{NzZSl+myKf2tUP08^;k#}LCK{xTG+kkWfB#3B^2bk)bH ziOs|j`^p`|f36~)K`9%^sY{eo-gm8ildPOaB>fi9{~|tL;&|s_cn0siKT^A7r22(A z;!7=qN8&{q)LHGW)l(11DvGt9fV>X*DFK(RZ?LMu^h#%v%;?)XzD)c>A?Db(XzBIia9pUjJpGDobs zWRxr84FuSiC9`$OEcDRX>Zjb%#m5;N!Bs`yp;Gbe3v+Hl;>FqTDhf=&MV0lY>y9vF`s`l^E#LCYbP5jCU#8#L#3 z6ENCqSjsda>Yq+sdLSycRo9L5`kO^%Ny-_m{04G5|0u^9 z+0Vr<-EHZ5=FI1U#O_686bb)v{_A`#J^y<$B+#QBsupG1QF^77xQEK?P7R&g#g7V> z9-htmtYSZpJDTM8LK4yv8H#KDWZ8|u9jd2x(4-1Yp2HbYTtRXJvDVYHm53+XuwHs! 
z8mHVUi$PKpg7GMPZj>Rjg^Nke-B68DT{|Jm(oXT+s&gI;&tTnp5Z6q3@YKvah}wrf zr(>nU!F%N&$g-)4Xvpv7(ApjT8mb0C;+A}fAVOa|>;1T)Vy*vTf1re0BMeb9!LWxx z%@z$ql}G~*GDl6gh}0(4kdGNF#kYidtB0CXRAuIER!dfs5Cn6C`Qm2~KemJfUWbgg zf8|Q;psn^qRA&s*B&c~R>gWo!#Uez!Monb7k-DSUVM8zkuP3A%V_8>1bPQo9bXRN=iGg3Iy%qwk0B6_LgS6LJQ; z2y!$L#{2dM+QhEP7+Rhedlgu{F5oP+LOU>u6+dy!2!db{A4TE4{5Aqq|6g54&wnh+ zvfq!uW9bQK`KImnT|D#2^CKslaO-54oNue!IZg{7Z1F!{*J|L2={|p)Lhi{7#IME| zUp|U$V%K^FBWHfaDG@&FD#G*X&||zveHt#gg6|ReB}uI6*9ai2_x3X!g%``587Fck z0KA!d1x&~(Ii4Cbdiezq(fi5)vRgM(-8NC`q_$N}02>X)-l#;$U!$d^P{I2p?1+j# zhgQWw#5VjX7$uSN7Pk8d#h<1aRorYgGXr@Y)pZpXdPUXY2Qx;I#L*k7hxRN>*R|;0 z(rlZc^H?3H6@Nmwd)P@dN5YhX4YhId+k6sgAk1d%=TBOtACQ<(W1omcbG*0@O9Xf) zC*l)k9*(Ag0bNktK4^;f|FRnzz>4xG$9Gx{}xdu|{oT!?4&q&wbN|NL!N7I8MR1l{ptGv^gz^Y}a0)$889o;?S% z6JCpr)c)=2OP_h(Y4{~AQ# zmtV_}UX>9b&jR+g{d6;Py#p%^Qd2T{Z0)+4ib$os;G)>ir#CvUkQ|5;5p(A6V_h?~ z#g`0VQa@Zv!U`4EyYR)Zb>KDz94>lfcp!r_5Q9H=G3?0ppU=v1@s-Qm5jBOrfK5W> zsv}&d7=uCj=_`3gxQ#S-IzaJLqm1lFbvHtATg{-KQvRAOuq#DD+Rzg(g=bDEHTK}C z167}HzXlfJj|6*!0FCV0`@6a?<8{CdXlkHvXl$YCa`&X55+W-)QwIq|j3(~3MW#&~ zxOH-}(h%WyOawsPuN>w^GD2}%jo?*d&bkbFDh+R*Gk^>C;2g)T8qZoaEh2THW&YV)9G{yD6!N>gUW zaeATJwIM>Cze&76sc|%_uXsK4`)_t6{a0McTdLzf%js3nt{jKXqrR%av}Fz|WtO$` zv&+Nav5gu>|EPhuE$$tpBnkdfP_|sA`4a?lN?0%wECvfco!(x`LFHcQi+JdwL8=Mx z;FLyOHmz1VSP+X{lim|})mmG@VUXZLSmHHShlV*6I09xOMOxwq9A(?FEfRlBqC=e- zGU}RhHb;MzQpQ8Qf~j5d=|!Gr?Q@XfG%1czSf<@H$TMhdS9QY@!`*c1qmA&HH%tBf znPu)+Tz_wg7;gsw<~TR`|}d7-|F ziXsq%g|j% zqk2da&IZHwwFRPKQ_hOqQ@D6A{Jdx(g(1!)kWlL}YqmZ|S8Ii-41NK)l7hX=9NlB) z!F#y2U3qBh*UWyQR0FQ8^{TKYXmTLWw|J(uNho`_Vpohh0!_YZg;C2{R8_`U$@|N5 zksMv=<}Sz$%ya{Kk&TQmUlWEkHjLw)U)>5!$t|r?VFRut98;R=8B$g8kO>d#V2gO| z(wJZH?@}u-Q|VC|ECS2OYvsUWA@OO?wx)<7N4AIj!+N6BU@ixAe9Iae{K>*+3*Nyl zOrbPZOBr)22GvyDAPgray4J>^9 z8~uw3^MS1Xn04_JjD&%ZbO3r(maR|YUba7V{pB81AM$J^KPP0$3i4@G`!*DgxH*w3eNnkg;UW&-uuQ*oXV!)>g-F&bEumanA=qpN zG?C36l;a@+AE$vYS7mW7$3>OSwg{ukkS*8^lJ*T=h8Gwbg?|UkdE4EZi9xj$nRohO zghkTF{kToWUhIe!y}h2A$i836mt`Ft(U5!Q86fi~>oortbw~$Jnz!Pa(^y0B4AyQm 
zL;w6R_vZzonNU0O^g#Y_!cG*w08##SEu7aR&1!-OPC%qoN0iG0i5m@JOZVbnivqTL z^&iCsT14yF98nO<;)H`RedF8-S#!nNe)7Z{@p&G|25rP5&W`z?i@tq^=%uFcy2-z} zdcnAL%4RxzT?&2Ku(d?7cw&)4tB0<%fH8N#@(ah*9Pd1&IPpz}B2@?e?VEI;WUmOf zhI4KTov;YOZT2}Z(whqVy9~<#_SkDvy!L(Ca)d!Nz}a)2&1>Fj_@rq8VL8k#(eI*M zJv#U*WQ=R3*ltZAylW4zfd5H~%K4ULVW7ul3oTzffgZ-R@(TfQ*Sf`|eCL5st8^32 z^{)myj|zhtiaUOQx*ck4tAWG0Q8O=@nPu@h6=KecV)c=@#|(v8XSB|@NTKi(^5c)p zTbm0v4vfAXe~dD(1AFm9k{A8z^st|{Z0!&VqBbryrp*{2F{y%j=Z-5iJr^_TAd zS8Gk&HP!+)9EO!u_$$(gQp=CZecUyZ)$T6 zYTypFNo^s9eMb4|{{0pU9M2n~dqm#yF3*e>EN5Q5mvTF(4eI{?Y?nb_cWl; z?N;t#^GM>93pJ2t8w6GG+(au%T*4=W;2x((WI*6T3T>rLuI7FMm{@U68zL*(`Tvdd z{wL*za=|whj9h!F;>73g?D#9C#RvoR>a~toTw?{}aFVKKPr9Q?inY}*QR-GuNTUjL z*GCD%3cmZoY5y?tPPnTNt}xCY$M&EsXJLP;CX|&s5}Ahn;a8di)KR^Y+BF^+ z9H3dQadfe0W5h%OOVUioLRJHK52lJ}wMJ0*jcv`NDk2;79~|l$V(=Q}h{44{ijN}# zjBZherI;09dk()Q{WME>XYW0VGFp<-iPvRtKS|Z7z;}F9DP^}>TBAOtCm%-Q*{WKo z%3ywZW~5dd7^}F)1h+P!b~t49LpmOw3L00VZ-sKb0sl5JnSrc!#(;vB%BMO367=S+W&$daQ@>|l`P)264`<>2uVE}6WYNu7@2M}~d%{v;x%mzcRY zm*P9t2xnTohgD~g?gBH<{${(f-|IkmwCc$g9+mYJ2A-L zPNx=Hc3H8Rsm>EVFbknx54OZ~aTNJ?X#MxT0p)4}2DmO8b>XuCx~AbB5Q=hnD9g&A zZ$`VxJEgYnC+cq2!ejcwSk>RV34Q zzI;QLiir^F2-^p&h8&RKA&Rq?dtw~-N?xXy%;TWc| zo_mV)`6dGra%Xp2Cvo>otmgxFZNhykTR|8)lF{F4P{`Lt%>vfY_+2|t43|FT7wo2-(y=MouUx{HB-E2w!%3cu^tJU63O<;j=(e&Z9einqh~Tr zhFzteI1$V?KS(3$>P(E$x=>-74RQu(QZm3wRyjn#?_s-ED^c}uv+k^!EQK2_KA$KU z-!Bfy^Xx=<-kWEJaBATfIH%A=nF z!c9&Z0vQNZCt;7oC*|?PR5eF?5%Zb4Vxei{T5?-SE17J%kB%DbL zPGb=uCXTHmvH1d{DW;X6eG)-BJdUVv0@M^7c4qAb^L4<>vwFPtq3Gni7OV-A?m$y3ZjMzO~DK@Dz9(aw0;!h7=8LjMU!{@Y+t1Ab-?U1#kN zO?lA-dDbS?)X`OsaU$tlT}vMX>rJQD((rs4VRmk4O7jME7+h8Amx5&HU|tiL9@l1A z!+d$D?b!IG9iC>30o~2FlF~utmvGdObE?Y}F_he2PX zom6e+GNn+76|+Hp+A8YoG*~6HZ=RxO#JFzEGDlz%52EY{%r$oG0=%GP-W571 z_c#c?PSM}Fb+UG=YNh0+y_w8)Ws{60Nt*ocDFi+f`2EO?dPbbIh; z#7;j(;0L7#beMrO=XKDQt`H~E;9|A@&Ee`R`Js_xA9BT*VfPKuo*WL-77^wTtmlXX!5cO-NlSanUs4y*o7 zCi{n2l~45x^8pQy94)0$>{E!D0F2Hw@|-hKjA?~cUA4^X3F}JYSX!knNuRoYy`|v# 
zF7i;Lj);zBSBTXNw57zRKiwHe+sWj`GmH)owFEZx($reQ@4cfpdg}jOCjI?S0v7US z7*v730zh;hRx6$jw-!Z8!9X~ojdDD5bfF2yG)wkZ&0 zkepjj)3z6UUtHFGZkMSt;T8xq(e5V{GM0*Ifk{8Wd9@&1?PPM_R1x4do^r%G#CxX^ zLjZ6t2~>=iGX^?=f;0S`QbI`z{ybnL7e&!&qs}S3$Dw-Dp$xJ#(f~$=J9H`N zpI&ixnGf~Q>M5Q2+u=)~@FuHJiX5i#*teQ4G@@`*`>2>Nb-X}QiSR*k4Xzv58L}a3 z9!^G59X=@yJCEhE4)Uas19_USJCjDTgSUxY7Xm3uWQebF3iThPYvAP$}b}HcX9J zz|ck~BNNFus4X%GvgQd4C>yEv0TDhlG78CJsWt7m*^1-5UK+JD2~8boG`#VHp@H&8 zt4E{vx!<%XOvdi=Xfxm`S63XwI-w>6tx5ndF4yO1gPK4<6X<>IFcr1A$>KKt9l1OC z<0zRPg zV|5%M*|Si?HD^Cy


=xWpAa6Rw9XD&(()0jVQOqL-a;%n-)l-+Z@dW}m-sg1e64 zK|nYs@uZ@Q^y`_jBi!;0st?JNpqfuNPvKRrRs|+jK)CEBNJ)hgcoqcte=8ZI4&vAm zKX8Xt{vWp9GAyp9*&4=z1b24=3GO;bAV`9RK!Ur4;O-8=-9314cNlyIf(-6HxDW1p z|vB+xovAo7I&9VaPN6PCbT z!$xG4`o);cphQ|YW5eb7KP*(1~9%#=y&Qb%f*+M zo^EuQWDgnGOBk!nP$g;3?V>!D*x zFa|K-t?-*BYOOo0nm(cOL?NMm)J=_4wr?wt$utqW0un2i0p<37>L z^(gn`z>lobC&RgF}oFzn1EE(W3{xB)$%O*vK*=5XJma zGmwXIi&WmsGeTsD!s0cXkqp3T$dY$kbm)+@fvZVGoR6AL;T_}eIJ+803QP_XAgUiT ztlfh-r|8=83A#<74-??ec%i$mYe4?usFww19PxnhBHBjN|HY*{h{$Sl0f|C=M{LKw zE$jr3lD?S9irvh$Hj}l5XUSR#W8L`pxT!=%Cn-}w*>y12q#r-_Muy+AdHyu#cy!H_ zK6nJtX2y*!-6_`f_v_>9-*ni4iCCxY4ZAP*qG%`QzL*@GZY!G{j^m@Wz~?LScXQ7X zeEHg1`yFGKh5vKS+cm&6a4}vJ1r9Zr94<+N0FZ29GpRp0p8-4!$F9v$pB#?cho<#?3PIk2XClOHDi+ae7JL*dd&AMe8u} zj&4&5E&pjA*ueCvHLd9Fpz>7{^WUVd#ekL!{tffnRr>?>UxC}g-V*xhMR&bWjW5ZM zP#%msMW@X$XxVwwwh2kQBweuD$Cpw@5$Ri%{-n43eVEeah3$R5+AZu}_4abJ!A)ABQ7Q#sXY zWaIy%RwDV~lZ`z*31{SZcG)dnlh@PE6=-usS1};FK6hSSE^i%pFWs0z}Y+4dx}Fr0o+; zuU1XCDZGpdvd8S<=*D>jpi3JW6$Yl-P_Y*qQxneb}zH+x&B$=G5*4mnEapGt=X zW@2hJG&Ov(1s2ll?)cTYcvH}2{lDu<)?Y*9P>)x#hK@Jqh25~6py+gG??>OlBq zMG`e7qr;$Omm@`?2d= z0w)^k*vU^BtT0#(?=1XuZ)3JLt8|I34eNlHihSN{DD&G0`Fyn^ z@SSD?34Ld+sx~)E8f~440fn{sq}2!+W;TWm+^yNXV>Y2x4-?a60r&VX%a4|Q|IJjq z^e|Ws0_x3{hrVpJw4=12&h#%Hh*S_4%_hK5qtw~;Hq={e`s)48C-RQ*?@BU!xUsM% zs6^ik2Uy1BL{7_~zK~ZI$CHb7GHDHrh0AfzbFP%N0o)#5V{Y9yA2^CKzL<$5e}vEA zl!fS(Wr^N>O+cGWu&WKc3={Yjw^h4ad>QLEH)N#JZ)G;Xo%EQbuZhMnvOn_H=hpzY z1^iPT(kT&FXNF3rR%aYTzH0ki!>NjP?66#K3dx>mY*j*v1-X|6$KehB!G;Aez(&#l zZ|5l_T$lMxTM(Tf%!LvSroVt$fl-b9y}qz5r-7nUtVTZwdN7tuMQEn0{IX6iBJxFg z%SW*9M-vi%$wp{ArMjdl4$D}<2+rZP#=^y!Vj0uo$A_;m;C1})S_MlDCb66LXMh_S zwe|xd6>0M1IYw7N%IBaDF=wZFO^l}IeatmXX#@UPT*KE@;S1W0b)&H_{He_&pX9W znoGIzN{cuHe01gYT3ftY-iW#y5t}gU+=UZNqFx8#&Irx1xhRfmkNlAL^zDq<RZ$VFbA5Cs!n%4=6Z-xaOiHl(9md)MNzLK z{Xp|m)1R!^@-J+9cm3VsF-pJ3-~M93V~}{M-9~maKIQJ)zCF@ho|c|TaL$>_58-ZY z=If0I7taQo+}nT!t2lRr-;!Ik)p~z(42euf6xbTk z%*UC(Lx}+u`(J6^#t#$1wN1N_w`KnYNjCzUFLCg9tDns)6@|D(W65)yWZ{qh%LOn8 
zdXpab#*N=$$V|71e3rbuCGp_w1VP|^-d7=X4{QD(Fv!G;0Qs*VbL(ww3^ykM11+K- zcOl-ww?%MwMev&6x7t?_I%Y~d^w4?~>DBV;B<~SZg{)oeLm~vC^haUb*9mV)~Mhg04?to%WwF~ z=EVnY9LZpONu2l^9NidSJuztKm9Ircb-S#w1v@!3j=-r{{E?wxLT!ewQ2U8pc!DW$ z=4EZW28`Js!4JopV6coR3t5ihv#Ml&7hqG-ulJEuHg?tTa0%g3`hEk}h#L%Xe(e|N zd1;u6q^5^Aei>s3y+6BD5}@qMq4H8N%^})8hV@uMLg*`{prJRRLFO<{Py~WK*xWPhWF?5E=A49G` zv=-j^Fx51$hmRD@^F`Oj?@eW~_7dW;z$M5jH~xeVwFD0m3BKS=-@_P}l=v7p@c3J) zV3l$R1HrCOBg(r9eOE6R>sY!rFUrlOL?pH5jrZwWi&Sn&ir1e7p9)gkgZrH8Ej$ke zv!k3CSf0Whon+KOjjZ&buQ0E*#$Q5EzV0C}sYPi^&*~v$G{#SltDjko0|r?MJYGkr zDK?hBYA#uzM^B7!6ldk^;{lGr>z|XE`OhiQruyi&iOhQ^HAh)_N$NgT`{oskoBX-M z($Gvt^ULO^^;~JLGlgpm4`{%QYt43@MZ6HG0TsU!21qzW9l?HwMxi-ns$Lp%J5$>+ zcf4D^>{H*IYpz(q##bxY#silU044Xe|SPX=6B{fIg5;< zgDm0(MC*?n0Po|*1Y+<0Yc&D%M=~;Pr3b_m4!T^g86cMu#mu{vk1r{_R)3QnrxCUP zZlW~3Y^kbaFHKN2KOWhND+%uvz&M`IIH0xIx`LxEtFxAm1^scAyGwSbDd?`ej`Z}g zDH_1tdxp%)ho#*bvGWl>f$+=Yoz}GVkGe!svmg0c?|;1CdJ{;L=tH(ZLS%+o_9}3< zStV%4&qiEW>RapDhC~G3+_$A~50EE;eb|$E?-&%d8jeb>xv=ty!kC|A z5*}?j5!fVYlZDBx+U2Ti1ah#adddYmXo-YyKH+>)yGYn0n6^#E)N(s-(e&n)8yU6mv>Kg5w{u8*KzAxhf$^7mMa&?_Q0K`W~Mc*8O^zAUA_nE_3Rpn0|UZdJ@&C!+6lR$b<){Xryd|2OSzn5}yfV zZ4)@Ir{o1LtfqB%iN;gGlq8E)i+eBoIy4GdfzO(ZtEJ zo>Ob_kl?evcBY75@)%0p#mW^TfU~y5=GUq@x!af*SGHu=TyttpdOL=DfJC%Z z&fGkI{9TL&HNk%O8~n2(^>7*c@3fJf1#6b%6Ni+xNEE;teBeW4qDS^a51-ZWgp~J% zTs~IwL83EjsmM9+`%Tjah)Q*H#5~?xkAoi_^15++EGHi0;k<%Tx2vGjrkP+@ zl`W4~V!Rfz^VV^hWGC}dmk6}01@g-@DS#>0Px4ejtOp|F9EKtCi%y%mEAfb_XtVEk zK&Y|J*pSQfjd3Yf<w9g;`T^xnfb1V0Bx3x|VL@n9uci>XP1$^0LDd><6KjgaO{K^Of z`Wr~&y0A^Mwri{rBeX5>+aiZ9#Mf1x;*U3oeoh^TRv7JJjyU|epT=;&tf@5z14oe& ze6jEt;^W}gVnGf2V75O7PAr1U<8~t-R)m->e!zLdz8N%{5x<~rBcqA)G@&dlecGt- z>UeNiR_Obr#Ab%MhCdWLQz^{{F@|%$C|I2H!K_)y=S;z5?I8~*QEm@h)ocVfSATA@ zzcd#=ATJ6LyU5+vR2De)SMS*W3JtNord2HA0 zq*6TvWLUT=#a|dtiX`VlN(1`+N1guQC->)jh+Ci}`Cfb)j&#N%(8zX4#NI^#xzW|W zQsSbYgQrZc;-RjlT;dRxlnb%O#Y49??N2w=7C$HAuGT(JvaV`Oe6viI9lGh4EcGl@ z5bi<|n>DaMCnlkNh&@Q8mzUI6EN*p9v6MXi4a?gjrtZ!O?UjagUW&V^(gd)^*btU{B 
zJ5-ORc56+-)`J?G_(QnJ11OMUVCm{WBd-l(^~e{3Ei=IGd+hX&GZX5E{vj_sNXf8K z0icb}WlhZB@_vjdz%!ClZ%@`a1gmv!S!aP1-<{cmB{^b1c{n02wUI)YkaXZc@Ll(i zhD)S;h#cQaVU9n~n<#3WzUa5Y+4zocv^{Rtu7rVE5`ZBp@GpW%IyHybhoCwKFt9q6 zYJoWdMUxA_wdxk*P8LW>bfKpXU^R)K zr~A&$A8`PhF<&_6IFn4ou>k|`qRs7w(NG8{Poaxfwnj^jaH4u~6adMm7Uky@*q_b^ z$6j0XgGBH9t0>oYC?#~d%BR5^SdqEV1o7x@1AomibBq-mAW_u3+)wEWHa--7UmVD# zSk7B=+wIJprYJG(v1@s=#_<{U8rqq>9KY>B(gQizZM;lUlBVbXF{S7N@>;jXK1Grl z&akE!$<6(^GwaXAEYimN^88aVUU%0-x*nD)yKN7FLH6yfjL)6Bw zaJ$66xZ4;y;;7QZ^2jwgNn|${0QAx@q)zU*s<8NF?QvEx5dLtje*uGqa6FmQ-=$AD z{g)fy>LrQ8f$v4OM$Y*Ye#xfhhmE*uy`4ibVKV!!+$OL1(1?FcBcv@A(uY(!oy6}?zQ z@8UNzFn33&QrBt})vUhAartt^D!y9p4+#@1h$%KPii<+E7@AeYBr6Zb7gl-eWBV5E zgj~G)9y#pFULt|Ks{}^|NBDb8n9Cm~#9nm1Z`CCmQG^$_sh9UZ(?v8AdU6qtJ~K|# z+S(}FIGF3F5_+`LiURd5H8(?(X+UU$XSx%Eud$+iTlPy~8^i$LhXu%Ei2a_d7+_TF z{tz~i0#s8rublL1+XtNqbnGcY>lWIdi+%5k$?0`rTmCzM9UM1uxqGZ2pe?@`;+g#9ei2h5kcG@%8?ZEvx zN38nBp|wbj)pKCh)i})JIw?^$6 zm<(2v6t8vMay2D{+w@2N2P32C7J^V5IlShjJ+ET7pA+>ab8nF!mV90<4Qe@U1paPn zKW>0pJP$#lV{~2iny))fM17uie1U^A4@6gRzr3+U#~_j$~I6~%Y5c{0qkGPsTXuzby1Dt8DF@p@V42BcuOIj(6OF6(IES^8r0T=HVp%&h)Lz78VmmC$dd zt?MlFK=_2#r)^>75vRo$Li<5+@~HK3%q3wXW>D)qnZAAHwOnE1f*ZV5Uzr=)7-wwz|civ`n zl12)NX?S+qKI}K|%!=B*1V)KEWS%!%Q+i%pkQfU9Fa54%1}?@?+vpS|h273#i{G9H zmJB-^Day^$dPXEennOe7lATZqv9og+!1lkdiMX_14jEzz(b>5QV=9UzN5|UjKuEL- zaLS-UfCn^brZaqc$3b2@jEaMXjF%F>X7=`}D`^=8fEGN&Z&*adTg&m?a-5svP{sK7 z6=R$w=wQ<~=;zO+voEQBj3|wA>soL`(h!x>lcw(A>dBp2Q+DAV$J8;u@3KQVhjG4v zQlR!4+2;?u5H(Z1z%)!3h!#3@(!6FJucK$5ugYWVW@q*ZI#3FdDAG;6s!a6yv>(TWZB)ysgcS&^YvksJN(*$zi zC*fq(gz|QJt*q33KHoQUCzOzEmXxH-DAMR$!@cW06+L0=-QXlG2TPNE46So=FqB?}yPLh%jj`1C zqpcg>j3aIN`D;jWU`P!S?wB49V?|YijvH*tVMAT-avqk~={4l`^nCu)zCY=?{|D9c4LIiS znyWz_P$Mui`PYIKaer}!t_-rUyjZmCZhq2ZkjEv6d)>Bx@3{c@Jm=X7<*$ZUWIf#h2RPOZ;}pRee|xr6j_>X+eoF7*6nFe#q; zu?h$?G08t{jim)BJ^iqy{T^X!G(`PbaGHZLT^ep7wF-&&$&M%eCk{s0RCdZAKToz9 zq<}N?gZ?1|3N0%p9(}mdX=KHtE=S{ShW6(%!-atZF-&&eoQJ-o7xuhQyKSSs5ohP% 
zhFLvwQeod~SKlV_%{7pXqQENR(RBvLdRfG|Nawi-_o(7?Kp|!I~jC5jupH9H53%RqoNp2CMK?@O! zVz);9_Mf;aw_T(d2)XO%9;HvqJDuK{&qV^!=qp#ruq~~sOs}&+y|Pn7%&HcR$zzYT z`bj17D$nHukYmpM60E*h)Y@yUS3N4{`O=rF^DMF{`o!&q6zWOkJ>xbd1sMy0zv~)? z{!ECD#zwT{4Elv}%%Tcyr4+qsTwjxSLH@W;$zw*mVwI_d6>6=ymdyLkT*ua-rvuHu zbjz8lG)?V{=~85Gzmel6y#>*HqDIk?kDeX)H{_#Qb49 zWN)Hx_WzWL@#tN~oKX5chPy_duh`{ohkVZVJZ^ys_&^07{J}*_4gZM}QT!WtmGPuu z`QqbWMl8%#cwF|t;#I}zy$r1~wnbeFk!^=z9k@#0z@CdEc5QINKUkgrckY~P7SZ#D z_}dmAJUP?8={iLBi2*8q?GU1Gnm4xY{GoXYosCCQ4+X|*2 zF}E?-3}p!PcJ0meGP)b(GLaERE=ll}Ppiip_FiffFr zo_nFiz0-`Qc@&fA-bWvgBfaZZ^}yK&GdCJ?8i7E2Q_|v0hj4KXWBT;s(-#QwZ0EJF znnPwQbaLWN;dp4j(AQceP&?Xs?G4gB@hSDC7$YYoth&h&UuoLM>;BaT;vB3>#y!$Q zA#}=R3av$(QI=xacEeD)dE$g{74M7rhcyO1)qG77EuL159zZU;?aMGrPiStc)fE*l zu^}tq3E5MIU}&1pWt+*r8|VwdYGn zVAYcQ%mu@8YpJLjBPQ^**BB7Lgs^Aowg zK$jocg~4^mUMo5Z6z}r`EvrE>wx2`MrYVWF7IDNWhaTp*;akDV0s^>*={1Eqzdrny zAUr`ShBLlGUE%Wa`yLxi1{PD3dL2F$yJwgrONGT3}vPGDV~}I=8r)R5kI3B=f3GOR&2^fnLB*m%X!*^ z8H-;TKUJqzF{#h7I|a{U@Z}LYAbB*bs(igjotb@2u;mDGcJ5W0_C$@ znF*^qUA7fJMRMuMkt=^F15KwyXjg_0lwdQ(9>G?4kiNo2_EQWC zeC)~<2plr_{jSm0aN3|U^FYcKOiXY#IR3FOa{UVfJ{Y;_}fwhXGgx<*t zqp39gBIg`Z>uO8%J$Z1y;8?2?IxePC7j%IcxoOtjL#H~`0C(Xa-OOr5Lqr_$X(-?e z0nqS+NK@YFtYiamT%KZv6Z`$WSs3}J;vz-q&w^cqdT+EftdX;wS#RXj4Ow2TkkjK& zWNdAe1#`|!6mgH({Q+ube31W?Bm4SA%34H6OV4ut?VX`wYQ+~aObP$~hchcy(Aam!nlQwf_d6Dj)ssZ~E_97lz~ z%4KJ~!>dn6tI?e|5uJ8VX%H{7_Ge1lbN|5T=_O~7&`$JZrwmvW2y$GcRN6dv`WT%N zzO%>j5Jk@fBuLIaA&^o)yU4kZt&O9dWS3j3=3)x9NiU4e@4W^Sr1cT34#=aH!G+>G zB}%xQwfg~J-vbyjJ;h4LMia(~h{B5EP@8H@q=5yjA7slg;~QzpM_cKKWY~`^+Fbsq zw9B=>4huRO+;L3lVx6ycZ=^0Qu*i9bS@U7@=Zv1JHRPs;FNZqCYd6onCK~&ow*lMt z6cMVL(_Y#yXb!y<0YXJyAno&Y4E2pawMJ)Y=X#cQRgW1$)%*+K$%}XhtauoOt0HYV ztBtvHn_TUjsBsRTVKB!TGS3lFdZFdFuid}Da_{FPU4nW$5^z7^p@*?Vk?NB05TobU zEa!v0UVny?gluQ_nU|{skCet0mV!*AA?^Fs)oWyypu9+{(6Q{yn&EDTPzn#pn3@yJ zALrbE67cjGd4HLO^(0TBEeCJ2*B%TRbwy@b^)Q=Qd!on{GIJ8s{p4ruSFBmOcr!E^ zxyvN<;~FgHk_Nr)CHCp6QLs8+$TV%;5j_zCp4A&H7&sI*OL*YIz+pwv?#dCCpgr&Ew|E;OsW}nCgeh%?q$#4ro?6@ 
z2|t%8lxqW%WI1Uku=;9F&qLhT`bH&11RL!0_#Jl7(Q4FwzMK->q-!_mHYCDMb@J%v z^SkjFrA_F8lXG^LNemViNTUA_QZGZX3IEG{k+D&c=_ z;3He#$ZrH%UKh*3-)NTFYlhDhDFv$v{7gKXEHvZo;4aS!OfG6i9~LlSMYTjxtTqa} z2-3}Y#R66$MjFgl5r{dpSd+pLp!7?K*|vD^1}i zn@6|h_-qT_)=xTTZey=T3o)KdZM~C+yt?}w&3T1C&dDRNYxkYSXTERaqj^56KH(iY z>1TJ4>Tgn|Q5{CAyciNCiZVLQrw2t6+aOs)HLQm~Kwe3`GBtO%l#lXzBeJT^x1YZozjEZq&v7I zsCAVHgy>`vncp&@jU`)IND)@fhjrPubZH`uIBVLUxG_)u2Pp^-rVx2!VTvsXGV13R z6h%dQBp8ZT9)Ig{F6IW(@6R@%D#MTkRi^5ctGnW2kd~5GMVrFTM!21jN_k$vG?GsX zqRp!;DDb=U$%T#5KClv_Hwfioc=BE~1nzOsHR0rrss~g6;?T-Q!&Q z1ZL_Y!&x>m-jL)}`e0h zv_X*}Lbd&iZDqmf#4=;vB~%Y$`ab!zw#86@53KXytwx%*i;}!s64L^yxx8AGH0<6V z5d|d6@jP|Y6(YtjW;YH7-;3YWz>cwXj?Wrq8&}OL%(LCM*A^ajWyL*CU-Rq1YK#6& zHd@m8YZeI1I?p0@R#dHFbLblcJz|;*qk-cY6!*cCphb zXZq8KFYe@KzI>YHHlYj7a?Z3%NdTcSW$`Mh`D~!MlDY!kizDS%t;V$%_DY5{kVLdu zj7z7HbB|lVCp1J6h2$A`OJ|v4Ee@=(j2p$(5<7xaej>E8dc8G$U%>O~Lqw3POrT-M z;K*kKvn+n=-?xnQww%wtS1W)&wVRbEk)`Etcvvh~uM5jcuL(;U{0gYp*d%_vCkd7j20(();V{X@z3qA$Me+3oqq;cJ41 z_o%o>*HPO=J&!#K^J|xFSi(Lr)lMy|&7~zb*acd2z707DMe|F@oJPV#8kqCiMe;3N z8ht1J>@`twhah|p_YJK9epTr#eg~Qh;Rl(dgTPaZic;2B!(>;#-!}DyYX^zNUg^;N z#(?g+18MQ{ObWwmtO~!*IqL?0I_w+_5F92vm0TNGm)iTaJULhFb5gl%agwb|bv%C= zzWQ9dT?pp0oDqcKa?BnB7kyII#T8uMea2TqxIOI^f9O57pvzGv8FCXV1{if}+Fk69 z*Dp54_smAuv<`qyjL3)3tywv0JN$_`8empAUx~=v=gk+d`hw9~6qW{cJN3?`kBtS* z=eI20RjNGwxy(F)#ui_-6rAc$vMTx93gW3H#!!E%%ZiDqh9GYXaK$yf2qY7|3dMxKE_Am zARGoru3Nj;`lO&Y4^O{wtD@Ez*YZtn_s;6H7Vddc{BhDHN=HFi{INd9LXV86GR>1h+?`qU zVwdXLc^bzMp=`TwnuWMP$lK}w7y3)bPikU-ZWZ-2 z^{@i(zShD}+$=jiW#fMPf>|=pMqSwufy$s%APhfUNI+g2_Et4mPkKs!u!>i>JWQB= zqn=>EtIzXrcP;GVY&39hOC77=h~r3^1K2KZE%5vh;W^1KpCjt|Ze9W3o$A~u)qW%u zRHfY%%_>AogA~;Dn}!C@L(L!&mzJ{;_4E{6SIW8c@@Gt{DvNH3qk=-DQ-0c|-)?zVpsQE-2 zGDw-w(*B9V7w*qJ??M4Y3_&Ptm@4v0lkFRJADH3Fvwv&)bKlK?i3Lw8Q|J|Zc=O;x zZ;mv3;t{LS7H)WAu1RI@#*}GowQS_H^6@7Lz-(N9R`mF2MB@+A$QaT=-{DgLzhEk| zsMz;C`n^zGGg&N32Rcx%RETx|&b$(1$bbfIt6?wl^WDNsg*N^Zh}_nxb19U+1tvu! 
zx4W*M6{){iS(r}CzVfgw&=%u^xp3q000Te1BBvF13NO0kH|*@R&k@Q$HUwcW(HuwbbnP{K>h@Wh57RfNMqaa zHOH^3qlG;nKEeYPC?NQ%$7`I9eDK2n#m``B0r2aP?Lx&GN` zDI{l!%nVIqxs%k~9`b1#;Mf%SxOI+9PwT6+jtOY=qPayKoT2?`4CB25=px;MmEvRu z0oi9ei8}gJ>Tgy9yqZ)Iy|8y@Ly1*>TSV%fIIl*lXw zb|mzosT(10+XcgGym!0A^w)>c9mRHZMr#BEs40yB=SC07+BPuQ@J>p}Sv=sXG5XZO zP4Nm)Ddhu41VLw_xc1y8jU5N%8|&0Xfd^H0Ch#GLt|x!2L!{B~CQ7#)t^4b5O%W~T z@y_LUicBMRHU;+#h9-C<8b|v%y&z7bq;oWq`JaTnnL*9@1IKwKxxj%}g3y z=zPc&2uBuF%(6vaN5H^ZF(_1D?6TQpA~qYA*PvA+{DC*`!;p)rGA<>TgZt_ey#)lA zZ*c;ow>WuvAEEtyFUu3FII9i2Fssc9>$|6r{wPc{GDU`~(p?NX|BX!yTfegXEXWTB z2SGi;Pj0tNT4^h&OOU<`XwzN@Y7+To#fi2A^!9|h9?r$})ODcf_XjBSpVU@qM)z^e zuc3gJ7O`=0ujXfqJgqkdHgN--XNKo3zMUNME;+Yf;nH7>=Au=WAaMdwg4yUG> zjn)gUM`{9VgTgkQ7NsTzskJS7{`?R_5*oLrZcm*%qm4Taz57p0 z73B3<%v?JiCtEZFYOa)XMLjHJT;@HREta69oi14BtCTI6rj!jZG=#|jasc*A&N2ZX z+eRhnyXW+Rr+wTQcOTqne4wBx_H$~J9~{3gn&~-b_1f2v52Yh@Tyf#k8iD>oR~y!q z*3X7K)q-Q%N z)7!qZjFgX}9N;Vfv!veYdUIzmeYnz_aMlEsm~s86kxb{SL(;aUdC}1UpqD<>$AvD7 z)3g5;2UMK&=yut-siC$0&U}bt_?6|lwYiDQzP~OV%i%{HZkU5A zHhlIXPO_WN5+}thzqYG{$$K5HWLTuBSlQ|2^CXjUktmbv-2Nh^O1bgeBoF=NdCw96 zn#z^=D8b`%Yc(;e4&usD1z6|2@V;>yH+p034T>PnX$ad|8<%tW&?Y_Bp+Ku5IiB0s z+14FV15+dIG%d91vLr@hYIGFTj9WSYF6|D~kb1<02wU>g`QWbu#Tdk5DO1Mt;QFH) zpCu4C$Dq5WfZLQ_yf%sPUHLM#?Eait0EC5Hz{UxpVMa~_I{S%9UW73Z_|jgKPL;Fj z4)0$@nbSbpn+4)qjTdy-loNKlDYysctY{N=Fsz7p$^=yFadxiuzWjOC_d*+w7EkRf zysk>N=>i7pI_E*h=8Na;0+*xNX?OQ#_g+2V1a0$+epCk5!JisttP7#$W75fBy6JDp z_FD60s%-AYzdrQHDAn$q4XT+ zeCkVCmnhac_3* zAL8rL_3~~%0)pfPt`@{iIWkUBCbyV+jxMJxd`rxXGexh%ppRSWody|u_IkCn#`-$n zq|YY82sJB+tr*YiaUsO#>7coj1^v>rVc^p`$DEtyfBM0=V>*R^ojtIv)gp$`5{Y}&)LqV|l zf~QP`lKvAGjD@DSQ}qLqoXFMrI^G>cDH100`ZvCb#jKdi8(KTZ>O&+*ueq8H<5Tb zs;H!$T-mhwG`TPHToPTEoBvdwt3#x^o>;$Qy6BcjlqJyJZT~_@dr)K-m{1WnU^XwA zBDVOH-)URap_ZNG;c33GkxF&Wzk+5ya(ba4PoKe+&-asbXnt|7QG7!XxR(azg|@m; zLy0GZv(*sH*^BC)1XKGiy*8W`az-i&gStV$z9#U`f%n8fX z?|1AkKgAAI;SrR9ghs89 zFq5_&mgaJe@(%pB(x6S&s_)TBBT~I)8{o~Vkvg)Ns_;8zE4f_eXpAqZ*oF<}&h!?H 
z=H*>M-)*kFx_To_+#AZE3xhjlVbnKWD3w`Qx=_(|V70^6r|aGhJ4A$IeV8Phq+X?T zCg-u`r6`+;mm?ho3p%WvH*+BR-e765Ii_KC)omV+V`@LelJ?xYk;wst$!)PYWzB77 zAPNIMd$8tJ*Vd<-WU4DXQIM@@av0A~ktkyj&uaNv9WoXj2}A&iu@))@Hcz_ZqS$-Y zX!sW-s^jTb z0%4=Z?7ef5_BtW{nt+%^Fkghh1}V?V^0k=$ zrySIO9{t4we--yX7YPLLabpP=eI@h%&+UINUhsS$k?aD8ua$kW^WT#FaGI}h$6P38 zpM<_)2me!;0TODp?}_*Ia>`!_e2@J;>Q|`Re}zZ&x1@k?w0#M>5!>jDafovLx{Uvy z`ZqQ`2XnWZ(kHKBGBa)Z@a5k{oCSSnEo`m4S21X zByavl1%KcB??>%2FRkvBf%~fSul$vmf3-lqqtfxeTl!b>u#gvF!d#W6=>8*&EK)t5 z2SvB~^#37@ogy6#-d#GNVn6X;HR!xUsKLAYH1zR5y85pS{~bilfEU?0VqzTrGa$lg z?1tEy3bM1ao$imCl;fz%GX9ThXpmY1yCRR(CR>QUWlgg{_MQ3;9!9@05dNb~|H1IC zi@iw^F%%GV<@v=WC5nlKRRjbA?L!;^cjCuc-w^+|7%vT94B_MD>wh}9YnLA9`sz-Pn)^q@PWSFC zfQ!{-pSpJaG?k>{8_WOEXoEbu&yaT(#`o=YqRS&bks@f`Eu+y)oGF$m|Kfj111Fr& zd%(J#4TErRQkKHmu6g~fLuyd8Q5=+;%3=$^0MPK$wQNko$%XIeQIh~|8e+$pp!2D3(-Fv z`cK>bJNx**i-2=HOw|W_wPl;f3+UPC$nB=O$rdAK!qJ`4V`?;4`92=(y&O>;Y4rL( zg9?xS^XdUqOxZPb)_r+@1L%D}(fQc)%>VdzquygOV`Zx1M+P>knp~%dx5UkT=kxu= zMvckC@JI4ZR{srX?Hs!A*1Q*!a8>CDsNl`7YuInv%GRGBxQX=EIl^^Wqo}+%W6Cxl%Fq@Ls;lq{*Pg&V2_k4 zb#-;c?|2y3g?kQ(+Xa5hvZv2I_&I5)(cWUDrOWhysG70@5X*lyrAXDJb2IFyzoB($d`}CDJu? 
zNDV!NpOd+oK(b*^)*y-a-WyBiQ@A)W^oSabg@ z=(<0b{QlWZX!IMDx{$cC8=Zw?LOUEbBdbN&cww;6L3FoMxz7T1C+tN@XSV-;yp7=) z74C^iM=~dVRS1*B#>!wVMv5JQ?M&5+CsLmKR&Ar-!D0-XD-^^$Zca}FwLK6J*|38TquqRm5Za_glYHZ z$*_P=!cg;f<5d|6S?gS+YlxmQ#4Gosi(Ey0eTl82wGyD%&W(WseD?c<%Mjh)udc7> zl_d^v3~*qx!&%ig2p;W-mi8#=+#1jn<@{=}XB6>-Z{-ILPJ?1E8fL%v9PO%BFa_&M zji{XEn#nxncJx0{lx3T;!Zin4%2n z4uM?vLX;mo&Snp#$JS(D#+FFIa4k8ek%|iCX$(k5rH7Ve(&v%o&@cH_{q_y*TQs=m z$LI|>PImMq2dhHNS`(y3T|2=H>tC6T>Z9W#J!tHhJ8XqHzp@34%642-y!Fk##{i$r z@`j6M#stLY--t`773kbN4|Aga&luoW6eKqIZ0C~tC|R@`KefRidMrR(N>H>cDM+NL)L~L z_=bz(CZKrS&s?c64Nt*b%+?jqY&E`I!k`chbvm(6D)8rCeUr) z?**tAInj>oV;>F!zCiR_#{XT_zkVzE0{t~4%$@c`V#70lof2y-rCX2P&eVsPj=Spc zl&l&=dSv^Ry^M~FVD(-CU;90hqeI2-qxYMW4IkhV*q8>ei(9lQ_EU!)2gK9H{{3xz-GEN5hWoZ90lZ$*G;xmq|bDi+r05{Pk-QErI)@GdBBpi9$vX9*~oc2 zeWp>43Pq#^xmlmA3^AmQku%ZzdGQI=FU6D&UN(Uzd6m&Wl(vov@+RZlt;G6|tQjM_f|3oL@6e#Q};nD7t4iX!|gJ}LX^mOEyjlEI)e8&RH z0yg+Tm|A&Y4l-W+)L~BXg6b{@cB4pKt+fX(y`d)wCd9t<^PiL`IL0nUx*tW>r2p|g zNX&e6_ytWn;K)8an8N7lX%_WyU)-py$^7jocYv+wQ4_pK)?${h1EBLYjw__ zq`L(zhx&gI8XuvhAU?sudfs@1^`Z;*oR4{}|37U)Q4TeEb=f$x_qsfj7A%CPHCU9w zJ81>{s@v7M@8;6DLkC|Mz@+-*AJ*rT7xU1*>wzMiQC3cbIu6bl#p>{zlKF&7?2Wo2 zPG~fj)<6jsCVvi_MZ+UU*4ZAd;s!~}(G zJO@9v)XjToAc`ie_Sg9T^^D0Q=&zGHCyWNkQ%HM+Vk$Dc%pF*dsR~VgXv8yMQ92EO zDO0-w&4OZb#ucN;c)hC}j3D!4GQoULikgy8rFZiBNojI(q=! 
z!>Ox>INf1KDbJ0EB2B?u0)=(iO@CCyw2L}@htH$eehut|VYnguwA5vpOq>Uj`oU|#7TEGaHb0M|1Lti~#+~s61c)+9C0Fis zq6$c?xRFr>`ShZj;0!Vn5H!~S9R8UF0PGK-_@DI@L|vg!CeFNayl}(CpL95u^sTqw z)t(m~fIpd{kh!CdrxmrQiozjWbJo5e5x%J#AF*j5bVUQ~6!^De`t4KO17xzjTZx?{ zNywWSPr8!T;vjY$cs(d=8_`<<FS8k4Qa_0tN*Gn{j0i36Y&;NJ{20YbN$=K^ zBdc%IiDy+2&aw43TmA=>{&}i3$?QyqF^@6!w#a8Ad7W{TPCFw{yvHso2poujN3W5f z{ocLT?3HEWm`k5E%^f3HsjHH0WJx@`Iwy{zU;YCPWwU)Mx;rDaLu3V3<~9ZUI%EQJ z7yCm^ejVSU_5ce~rnDbu&yO@=iXlD$aQZR4a>*6P|8U^QO>Dq5Y$ayhh-43zp*WFV z`H?%xl>L&x)LE|;BtB!?h=ha>29v$w9--^@qM(o6CysN1M+cT;Y%?y}(%SsN;cv}CNx18$!tS#2h&DnmqXFIHhl>9G9Ww!uv?JO{_ylKK0VWwqKJ3a|ZR zB1kjhl90`0G|UTboxxcC_CGu!l6ycyITd2x2(C$Y6V56d7nce4@#I?&rgX(85wllJ zeo&t*Br8Ba6N7z2l|>uwxR;~x4?+xoj^e;}@RY{-{AeUk{3shst0{ur8kn{$OW{sO z&Sy_0eAxZ~37W@MbcLE|hN>~2-$e0Qtr)%+g-THH^mfY5XxEjdz4?8a45~AY~oCXVJ_B!C`cJ3sRGWfl&Ll0#*0CCeCXu0DLUZ+TBc09y?WGK}+?v!3Ki53@p1vSL zL0TR1rH@9!t^`=VT)DH59wf@>^z5YIpQ2%V*@u9nggU6oRy(D)q=|W+{4eMoPcw&DD*6UjCL2?nYhi@i&u}pnh$)AY=!bYj6VB=!sLv zBK7(v37ke35TCPZ*ed1SMKVaX=fBvM;W6zYQtV29vlVq~som`6ASom%=BkY+OwsZT zLzv=QvGSiaNVZ3BzM4N!&bnS>Kqlt!0(O9ol8D+nB@wjeJ){Z z){#Me>U95*NT6DF7+%bX9;(iVKVa_@?ov)vaozH_RdK14ANL#ng4KV=M!y)Ub|H?9 zSX@Ynk9p*Crrb`MJQHbPFPP&ghH&CmKj62it}X9gPZGeo-W&}Np349T^#n$x4T84A4ZqY+tUG?8zOOVoS%39 z7Yh(W#|pl8BEJ!91AbTTLXQp8C4jKYmYQ|WkUzPEkG`fM{U?T%Jw&B&Tk@;%y}M=K zd!rnHPs)x`}2?Ohp3Q@<3@_jj~{bRJ>i5mq*c8K=3pdqn`T+e`uC9! 
z(ZknumMh5VxD0V|XaqX6d6w_1J@_x{YbZh0zEY5^($gI0Zr+s}lh?GT{obw4THX7f zccRBdnMbe(Z8_o&2Y9^=Nf@c5tWlRhQ~e)j_0N7HPZP2z*Y##Ah0_87B8xYA0la3U zK7m~PkU0JvzTe}e$&@3mtNh2v3xtpXAR_os6Ge6EU0Z?-MF1DI`+vhDRH!na#jd@36FZC1XbBrF5MCF*|9)oypRowl>ju zq-w-uw&&ut*o%yn-iz#Z=*Q;&Uk{NvGRR<}FR$6;f&HafSZ^oNMHlK2&hxeEZ@niD z-;YHHl!r9tyoH5@GLWbU*;^~Gd*3?Ccw`cgUwxR0_qziac`*u3;njW<+v(|Pi@7Jq zzigWdwG7p7+J&|s^(8s%=y5H+M{~4B%ijaFjJrjDpIpkxOKj`uS6bxPwy#MnsbwMF zBepf#Qb7PHPXkHW?Sg)NTDOHsMCg`3M5k) zdXBvr?9f@AL=-r8E)&?Xi>-MJY?mX8>nWqf@evfsy zRfmTzSToW=8R58!Kt^fF9tYuqP?a%!G5sJHVp$&|lWG^FP z)_lENuU};<&smnls+a2F;ZY{d_mctXCiQ7ilG0>*$ZD{28yUe0b(M}ZIpD`yGk>c= z@`3F|YK3FJCKwd{nt=jCQX&V*AYlr`JNZK3+Mj@;TZ~@u#o_8~Z{NTJmJjb|D$n~> znm~s&TR<{VVT0ehik+Ph5pyZlOgNhrD|}bC3tAKh=Rmvd1CxZRQpL=^FYC{(=yS^7 zXU1rD3yojHGc{qlrIG20W;J^q4|T}*nDS0|HN8HV$s^T}tQo?t3+fs=mPcf>Uv}_C z-H+c3xNj>8I2;27tZq@Qa8wFSz$yhleh^q4PGcB!cYWWb+9^XNK>sKQTSp3cKNPFy zBQ2%gpveWG-R9GB6^KcsAKLdamvqV?K&@a^V`fw`7@K<)v7Tgb#y(WP(BM=UW!1wY zbuuB)8`CPHVCw<=sF4JUgjsw zvQTnCjt|xk$N)hS-9$`QRI-Mb<*(Fx9GdPbwMm0n*`SBC&R$n2s;TTl)A}tQ&R~b3 z@7BNkd;uYfI6e4=$pe|!xc`U4TPilV`G^==ydHw*vw2$$%5Py{ekP&+6;+ANP&G5I z*Wy$PEV8_Lgii)JVGWy^3T(}XpFMkj#_)xFeG&F(tSbLxe5~7PyyKzM7|nIZ{V*I& zFDe#&F5aQ6siX~TcwvJR3)kl_6@?51f@U8eHJbc#Bw4;R3~UIDlt8Vwz-WTfV_~f| zTktbJ(`weRYnf!uA>|Co{bGLlNU%zJRl)*zeDuevvLa>W;nELUSbI(`Y$WFga75xf z7;XIaLGXBHOYm6hO>hR4KfKfTByEul^h({>cDAPP#(7FF31mB;3bL7rwXI)%#LyVT z@-6&j@IAH=WcO{6xvuEl^~q4Zd0Kc(J%wijr`_#mF5hJo9JEIp1^1gC9XTC6*Z|b} zQtC_$ro?T)(|9GEK>&?$lXKLb5otl>fEcj;?erZS#>p(!Z@(V+?zvSWk%=vd*>nGJ zi+)Y91?eQ6Ynqq++VuRWeDF&)7uf(J;B|cqP+&b`pljU~ZGN zf3`V5j2ACN#caMr?Qb;ugo}j!hY5CbwTKT8477R@I%HVj4!mbj9*ln3!`m7|;SGNm zCFxeRedK8S1u31Fua$eisbk(MwY^1u`0==B(@b+kTzCYx3GPC2DMXhopX>)HCfA^s|{;+pp|Z* zH`e{dqTJH>PYfqHMT5hVSQWw&nUydjQ1xouZ&&nw~pF=L?iv@bV^A?ab$D+12RbcE)ZjD`+?eQ$bELB9-w~^<7RMV| zQ|X{nXR=gy<@%7kj179}7}#;xPGOeWb6h=SoV`WV(x}K2^kX-}@B8W7?6cT|J^|l) zzAqa=(*s|<2G^_RwME941B#Ck(g+lbn&7G 
z4nVe(!8X$DK&)u=IYssyRPCEt44H0UpHCTF%}#mMIu$~sacjZ=J}4hI&57bjaknAvVsdj+RR+-hQ4c8KiL_TV7r<^Rqhx(CFQ1QVHm;wCq~}&Udru zFLqlQip!_xI2fgtR*BfMA@7tGG?+p9cmd8c8tz+qX4AwmFq#ruTb(+{3c5q9)7GVSl^3{Cvfy3?nBu}&z)iADLXAY zpNy0tPzI#x0yb_jUJYqW&-C)}^jIlM8Z%j(OKd&r!UTO6u&eEGVM2Dt`ECVWY~LYA z(#I{_?7J2oTtprm3V(EZBA_gVI$eB;9F5W(wR3gQwx}@#s&gs>q*R)(7Sw#)Ypm%% z!!c+u&j3KG`pq6Z1Dc%F5gY?;g50mw!&26&V?x7gGs~iB`Kor*8#%C`ig7-XrjK`0!ZFte+J$R zNCRF^8+u$U2hZOPA6!RjDT+j14n6JR&J^DGVI&*mj>;9TP%DBJs9qxYrY?aLk;369)62ohPvGK+4&&^a(YTC= zDf(_bBasXvXBv(7m#4Y!tp=R8jOTR7CA-{OaTIpPnPKjeg%5JOI955+iP}7!SKoNn z^_Aj_)D;FSj-QypNhwR%%-A6G4TWj#jJ@#KMWb*cXl++t=ehfM5O5%3XI%93S~pR+ zkkaEK1wKLxDQNROEP|iPW;)@Y{2H8uPoYrmd9yQ)bKQ=U=X(pebJw;iHec_d^wVo{ zxlF!YV-QW5t+PxvDmI+5olWPlX%H!GvQ^M-gEewZfrgXWJJs=>GN!qoJj9&uTNfTO zWgc4xU-K=uqNEJ?#~n8rOO2qWIF5K zZPf~Bq4^z8h+iUAWU^o4;eQOq@XfETqBrU0&Zir&_HXIz=) zHn^{Lz{~1v-q?AsBH#s6r3w#js#$OZI-5BaB0ehY(*S4nUQUHDW%LuO$`Nn(zK>!E zqA)bl?d$k%quzni>Jr16gO>8;7guN8rlo}DXzWV!By(RDta^(RBBH(cEX&CRRwvg~swXNi$36FvDO5L5B!l0H zRoF5Qeq-PvbQ9I)$#=7w%f{D-F?z|=c02&{Y~A$!&CrhRL?kEfh@jY%|ERw`K_-Jw z+x*3u)a|C@%i8uLYc30P#yn^rFVyA4x?`)mMndKx0S{|yOvjH=E0BoGU@coJ2pt|ZeWQa02)W^A{s%Y zS>Q728~u9IJxyt&OVC3)#WEaMCS_{pvR7gLg&xXdgOxAlJPN{P=~prz^wG#)80B$T z>EByx8)ITK!Iv0LYGDFmUC%vBZ#HLoZUf5K$#PX9?PxS?G+zH+GpA47GNCS_b}E0h z+y!nuviE>fwq>^7;ut8EPrFcfkv_Vs7QL#5J)pc0Lv$NlcXvfm{HcZOKS2sn%Y&ki z|I*ZNsG=E!)V>z~5o2c4qFle8|1}F0p0&#~y~$Sbh9`fgTqph(<~pC;dbLoBY?A9% zYB0Yuw@{*DHxv_aFSK;j-{N~0Zd=bt6tB4U4&=h`ZKao!)FzXq5bu$z%%ID{pjf?F z!|*19HOfVEI=)jorYRg~C2KN>sS{%<8((x4=luQA)26M|uw8Y!0ce!6+{DAL*rs?4 zgI)Mz@yW-}PHH42bJK)v#I z4sxUWtBgL+(=Zp+(U@};1W#L2AD^<954+8s@15a5_i`1st(?*mUlZSR+A{{)If{mvflD?Lwh+s%iaamPf%I#)5W(Zz~u z&_@*Jem4y-J8{TA=N6^b-EClte^kCpha-4fXKtddyHsXw#{|bTDBO_sKi)gq78|Jh z_5Q)FEn*rhlIXL-;6rmWi$q4Lymk~raldJOpp+OLP*#)fZ|xN+D)L&2l1IGXX+tm} zap_N)C9SJ|ZqEx5Wag#(rulqf2nP1u8ipgF8qZ(%7*_;JQnP=T8@A52eyV7EBmfJg z{;c{vQ5<0!{FDP#!+Py~OnatOS)5s|d4g+b0;7t|&y>(_J^E?|V}R{q(>;$-iX@XJ 
zTiKVJ?zztBY1z+?(5bIN&|+&CRtN##9x}`kl9Cr)I^XImBm@qGK;hHCq{P zrrYXyZ>DitIE7Dkhv9<0hk@{;JH)5Bob$yySHfF&4=8kDvc~wU3?Zv-vdD2vl;Ru5 zz5BG7i^vHympBS%#3?>gb*+sp(M2@Z zSpT>+8Ni}cP23YG7;u(MA;TYFmC>b-SK6Sj=Bm*iVuCYa@jB0BvB6mY;i^6n9!7Kp zTh6rht7Jn?gKJE@f((VG}gC2cY~&k zj&+G7ngJQ3(dTBSh&TIlY(TA1C4=Lz2jfSStb;8Bt|;Lf3-_$~$}nObTKU?1GLjEJ zUnUe!|gdvb*UVClSJ*Mo%(=yjfoL37qW)6sj*T ztCq5Y(6*J{D9PB4=ihXj4xg0on0ywvUbKl>XQc_)b5LkvT+8o`cGQ~WDpaS|N03Bg z4`JI#hT<9whn8ET`j3PqjG8^;Y&7SK18h;%MfXU>pl9qdcwRgkeQN9(xw~}&uW=sO zhTq*T-f`Sqt9WT3F7Iy6#}<8#D&d*uUWGRoGqhG5xayR*hrtTI^eXLP`3Kj-2Q@6t z%6U6B#SwR>2G?@h6>e7gPj{yzD2T!-hhsg5-JS7hoHWo?%J5ACuWD~%S4jreG09Uy zA*y`kj2j;P8Zsv=kFnCQxXVF#S}BnA9(B?MjykoFx5YRq1ap%i%_ym7YnVSZ*mW8x zR8q;Yno3pCtWrCi6)oLweTLRrlN@`SlgY0a37+=diK^CRO;f4>%{V5`TRooGR?{s{fdwYhCfy535#`pLIGiArH?v$0jWVRM?j^ZEO=W3>`o3Md;&!5 zEnfIuzo3W2%2Ij*Ra$w)%D9wS-W51SlzPFpVFo`9QpC(cT9$33M46y5J6mbYlyZy~rNQ;RMZ9(RyiYufCj_{Yb|{T3wxb|2yuvgKvS z?yt5xrZN&%f0O#2C(rTniNg~!%$CsVr)FOU9Jp4@;;b}%iuwm)2coFVcb*2IuZ#9S z^H$LT>CM_ccu}YDe5K%EU1D+9a7t*h52)%RN8HG0m~-)U^(@pg<3-+Zg?I+2t5H6f zETxX&n8Q7sdG>yv3O`8tA%(VWJh%P z`FWv=*9w7J+2VEu^33OkwL=$g&)`4RRPvMbJ^Fm=BPO~C7KJa@;ud=Ae66piseVd^ zed?8x@~#om43!C9ac=w)oLQb-x~6ZEr{F$5B=weFmRfiO9U9FZAvrp(#k)fl29zv~ zr=gZ*fcN(QjKu70)mVMQ-=@`_yaQpX5B0QW8FYTuNoXoO@S5n%>6PoZVYcIRyikQs z)jTSoRjY!d;MWgB!)3|~Inf$|GEEO4VO|l7=}?z>iQZ9B1ALgFr%HMSr=9`vPJaIv zi#k6ME9+h9GnE1|1x6=#UJG|&(B{5p+s!HL#P_5k>KSit=Qmp1RYXsdzQq+YWk zEUJxaY}Q*y$&|@d7{eWK6@2Q7%bc-Z`eC@iz{fXx?E1h=_?@w&zwv_{X{Rg}V>4l8 zo>Q3+rGp1dDKAYw&xv1E%A-v`L-C#U`f>YK*mnV+FehuN$7jT5MttGnA_2+Dj7u0# z?MXVSZiCsyD~H(4QsjtENr3M-vyvn~*a?9cBQI|gY=N(Uo=#;#N5p^CC*M-P-G84T z^$y7_|2fHjg|20ff8|m6$U+UBx|9l`YtA+La+cRw5gdMkRyxxo!L?u8)hP$ajVno5 z{4)AjwU(}*W}mJt+{eV4FC*IGb5XT`z$v_FOMQrhz0OH8o+&BxXUy+nbG&HK#11Hrar4qA+ z7wFY!x>D-R23Cky7ri!2S&XOnz$4Dj+IAIoItWum!bgIHATEqw22@SeX~X$12`h#| z&e8GQ^NPCi%YwEO+-+7bX;p1xx407@HAdX&1YAI ztTS`|VtpO&LJGtucL~S#Af652Fl}4>dE)dB&MK_NZ8hJcbJ)=0Z4SKV=53CyEADjN 
zfX~z8yK7{QQf$EPc-V{=pTf6RazRLx7{oD{=b(>~rc#_?RAy#pPXD^FlE1V9l#@bHQF&U)1xQ_dqhpM6O9^YL*kYYQAeKHh$X1&rndp|nq&V&f52 z8p)&g(^D}@KbC)fwnwCIx`P$4&z?1~IkH;wOa`6jamJ*3eHfjeDb?buxY zn%7g8(=CiJ3u@g_ecD13s`~ z+tY#T$!+8rQt?r2P2(%Ruz(wl_ja|xn>!fJt$^_9Aa)X~HeWP0g@8-ZF=|4(u;-+l z2<>Kgc)CrQuOZbF3jXw4xXB&6@S#SQlAYI83oMeimH+e{a#Z$Ie|*ISdrvg(XF5bt zNI-m67b^qeBu5)CvQh4BI_131_u}x>IiXZ6w~-j-XQ&Mweb}3lctHgn7m(^cj{f8j zp;=Q+XhZMrvm%132J_yz^sI>3?9&3Ra7p1!gp<`a6(NX`=;%yPs(1AHaE_#o1g`{K zZvmZILWp+`^pnwHM`Q2Uopa*Na6`WCz6QIIY4thY*bJ8GuH!?pYC5Qlzms2tB-&YF z+vQH%o2S=&7a#SBdagmQXJguPg4~QxcH;R19AHiO3E)O zVPpc`@8F#7fv4GyiVRLvgxaW%-#~icT*YKc36_)*XDf-f%F?6j4XujZQa;6Uwm1mq z6m`K);4n_h*D&q)6Uzx+NYj|)gOF!*wiFzFdI6VANA3sO$4Ji(k4}q>Q;UxP;j)#Q z=F?qu5~2)DifKo#ONvdD6*+yo?p->9^PY>wid<b6qdjHr0kfVhlSaYuAOHu9K+%yLme$Gri7P5UQT`{#EG3^@mPa~5L@8RdCx7psdZ zzNdYn%Df8>8jb5Z3(;k5r)NJ!83@LF&KkV>U#t~7?sW|7pl)J`&iZlgUo)dL33<(F zrjJNtp`X`;KkMLkKSG&`T~F{1_Gfl}`LHtFLB}OcV~>UtTpKcY>-C~4wBVBWafqn_ z^VM6ir#ht7sk1uk{y%r!EFY`Kl%Jih6o^;9nY&uEYQ->2-GH&tKF}cogu9)0B-TQ% zRL2VT6Svz}6Zpmgr-s%QMEyRASO|7G-4G47F49s*D9kyBrrlmP&&_Ett06wa#L2E& zhZ7+pBQsp?$uE>x3QRHzJ%q^5PIlfQRU1O~uf6w}43rHT?+(UzZ;x`!p%KrRH1cu0 z4t*8Mcis4J`VQ`V%Zh(DZDkW6Gb_Oynw0-@#y_JYb7mHp;9@KPP4}G}&&@)XP-^R0 z?=alez-QF^je%U+D=bUy{sC_FBobw;qw77yM$Q z$7u~NRT05gxAc4Sr*AxVfB4!%9v_^h)fKStE~fG<0-dz$jvC|5*Pbmpo zJssKY*9RHw){DP-ZBXWEU={vEVUtoPPfA^!d~-r)i}tgl`oe1F)ixS%{>jU<9fyt? 
z4)0>D;*cz*0HIbxEC>4NB2j{&>0VKqWav`iH?GJt!IuCSrcJRi08W`!Q&S#YKx9s^ zGGid1wf3{p1Qsewsc?63C<$8*rci{4o`myqV)IUOpO7z}w8Bg6yo)EGaU!P-zGPZ| zgk znS+$rNjeHEsGm&cb@ECu$VOXyWjuyTexVuIA*{~eGiF-r8sg7M&DAQ#?Y8yT55Ccj zj1Oy;n|>vbFY110YWh|#Af^6wkE}R#=-l{(H zPv87j;*iEuBDaXV%T4ZXt!XiSzus9b#ss`=RTe6v7sD>`sTv}%G*f-$#T3aq<3c7_ zXQvQFf^pL|)s~38J0AxUK8e4&L*5(m@#+yo&YIVj!`MSX+jm|)%pggBUOkJ6~LD%5&rxU_t&^_Iz&QLYv?XmYnb!|(4<(g`uR!}^^d$)1d zBziQbXIobA^|9X>ve3WKBAqEPx?ZsP?1L^R= zC{Ojt^OXjhNNoNd%K;V;bC^`RQAX(UWIY7c?ad}jl*mwp-K*DM^!!~ak&5?SG{B?b zlhsf3*`&Q}hf3(3lLi!2Vz$L!TUgav?9X&PjIJ19Vl@F4~Y1!$&ktJa>) zi#b5m&dQLTr;f+Y-SKdZ5PbX z1-*ut!Ok%nz*1(Z$>I5{v2lovii$|f&~+H>h@mc@VUSYPVH;l|jA&TF&Zkoe23Yj4 z_5efo71MajxX2kfLdq;xUv$%L@u0W>N_dRjhYE&Z!lH+_HNKEC_^!z!lGV7BnmvQ7 z!7eI)z*njv^(wG7_Z{=0eWl{5QkCLE>#6*NiwWzo&)s3^OOeHe&VAhYOa@U6AK~2> z_8+j#49vq~*H<2==~lgPKIS?OtM7ZU5=wUT?b*^R({)!D`k}iQ0FCc3E;=VGKZ0?~ zr+Rhk7}oFN>*R{bYBhUCvWsYGrp3GULa92H^^0sSr)z~eN|I!~Y%!fby zd+O9)i6y!`yf;f%w!FT79nhZivTbX?Qil-w|v zqsMAvH>~kgVcg@F?jUxuTq}06Oe=k|Ognu8Qd)OPtZiq4M%bMa-^a{3$brF~ zE0u_*5@Xxi2gZDH<_yZ|ifk>-Of5Agt{x6o!|)Xa!H9H^7s71-S&!lCzPQ5R8Y=v1 z3y5OfTs`}|J2VmAC|Isi-eeutVxwuISNOy{>z?YX*I}PqvEvZyjiLA8OhB{M>p9JS z^qG86)9UuuAt{WUh-{WYersv5VMm7h?Z9_|ldtEh)pJ~8%zU5|~&j}!`YA<7@M=Ohej z6@S{;hjRa9GKsh&<8s)|?nGGLa>k`$SoW}e9qBVuQV-JaJFO}Mf3b?!)Kav}=YN!v zAszUbwf+&iZeuc3ECVnctk5(#*6J~XWdJ1PkJ}{62nV@byxOf&^=#xGFtV;_ijJJ& zA!*V_Di8MOQuN);WIfi`$*l_UAZ*ZvIU`Kg;S5qlX`m&Fr@!4o6*r(|IS9BqRF_9; z+t@w4p&SLNC0Q;geojh5hpWM1BXv;1i6+=KYewKt&XeZLd z;C+%J41;Z+6%=MIT9|IM0%t)V-8L>;_N#|-!0r!PtS0&cttP(gCb48>C8nL*_kFJM znn4L;Q#{!`)p!Vnei$2m9H?#UimUKseC?)az~ZKa=vbGB*$m~oyjI?po+_n{di;=p z)(hkfRA!;r4zD0$~%`Jb3>E?*qQA?LsOTb(a>TR}2!tPU0)SV0zU z5=Ek2_Ghhq3_Q#>Gl5G>F|Q}t(~WehZDuQ8>9v`)l-d|%*)|&a$3W7?Hino0P{W-< z4M1?vmbozfO>jkL9(n49bO07aO1z8|{e3j=yJ@KtUCwK~9d=`lv^$$hZIR#>UcnnFq=(s-dDrebm?=wPR~sGS?4p|{P@(xts!#kl??3X=5sll6Ig4@ZuG@aZBTd`1uU@+8Tr z2E#Ps(h2bBN6E~|Y)KjdLllG5Ow>{zOo~|`>p8*VVZm=IabV!b>v4t3g+ENd 
z<4dC^u&pI_+H5Qa+MGyjJ~!ZNauFfRX>uN4I&vYmX^eHr;3!QuzKp6WTIUn2@>*;4 zyeB*A=50P78f{m%Yr55W$_=bNz5Iz3QP5^vV{7Sf9Ww7=`0XaOHj|U0bY|Jg|Daq~(_qb1;Qo_$0JiszY)&~g z{WhuIb$i>={*Ddx<*LJLH7T7S(zB56xN;E@=k50b&d2dBbOIYvN9?3iCE7u{B0-|I zCfB`qq|o4Cgk=%qry|PAW%Ph21w-Xr8>17nMI?V!5ne_&o~$Bj5ob25-{2u`71Q&gVtRb-ww+9P53#ci%kYbF=|Msw+*@XBp0edOEM<34(cDJzfbY%@j3Sd%DJXnY)a*y0SUF z_*!E;gRhg)HoZCk2n+bYlvprPoS0(pansDbx=~2<&M?Uh8+2LEN>WgTPU1dq9Prjv z=6!JSIeB}H?i5~#+Pcy6!mjFOGAF~T3(S>D(|Q*XKVg}ER?b+b2KO4IfTrp2XSloE zmOn`Ke8kUt0|TdkR;GYL*o%=|1US-+SG=JDRi`ZZX8wWg_9MB8vkO}Q@T}S#0y+{L z0K>5c()x%Qm?<2EqgIIr+;zv#kmCE$FrFZ9FP+X~K1kJxNEsxrdUF_2FJf`^GK(GO zUAU;3Nvd6UI@ii&%_S3cG+XeG_I5FC$m=alve`SjOMD1J*1 z#Nvs%m2!33nEqm`Gc@$kOx+n${cJu@Yfso$gsFYAP&$loy5Am?2%-|k(lWB5U^C!< zAG*>-E|q%YM`vWcgXQ=mpnQRVa(F$KKZ(HKk)O?3{iVL+plKh_ynp54U61mp#)vTL z%A8w-bH|FAQhC-aeQ{{btY+A@o34oi01mAcv@n#^t~2s~1}gqxAT;w0bWU)aGBBn~ zf?ET&d3blUDhmj_8wI9M`~ZDMY~iWDqZQo`IK#kHNq_BoErktu$j%hj`S5b!%S!0( z))kpf_mh-epZz&{k)(Qeb#^vQ%S?L_2WUr^dLUbs-LzPVApd@H70pd^6jx-*`?GhM z!T%py-xyqJ+jJXDCKKDXZQGjIwr$VE&ct>y(T;6f6WiIb&69bb_xZk4r%rX%-c|el zclW;5)xElVb-l44|4E9WHp)OH5!R|zDl(8_gTV1e`L?`RqGeLuYyK|KMRHJ-AI2)< z^VIGJf4b>PSnpqLryx;6$q1-rPB!Vk!O5otS~q!+efhVVwy+>QZ%iv9b1|EK8NCE? zy|xMr28ayY)fCjmvL^S&%C$2#d#~W>Q5w5w{-2tT&WZ#@@pX4!XO6@*vg9Zfa%$$o zmvcDhBXZ2{^3Sxhpsci8Y)J30v|T64_UeF9{a0v{rml7IjFm}Ko)Gm_5$Ayhb*f2f zB~{7njc{Bv;+mEd$>4R-g4ao@+3aaWFE$x*@15^+i`h^C86Y5nAX1`2DpRAU{Hu0^ zp|=mnw9E}Act4=>jr|CXY7AnW$_yilYkX#ODvaDtB?}NN48BN((IsFdr;V4LX(qQ6 z^99?Cz6dDOuWVEF*fx)!!18ca@wd)E`2{d$+Ll{EQ55WEega(%4v(1IsL@g!N(!X! zo=q#ejEOIpm%)BZ?IfF{I^fh3J0{T9r_SlAqVn_1 z8oy*#>V>oyRlY|~Njfh4ef3V`J z?{Vc7{6+sQrN8=a5#gl&O_3mwEXWs>t1pgg|2{QvK?=wnCKVgAi*IS3#36@IQ}l?I zwaHaKX{2p3a`nTLAW2y25PziC0r50FjYJ`5WGS|GeCO5sBe&+N82;LO-~m<8lV1T; z#%P3^U^8`E){&ST9r_NN;i^~S5n$x0$2v+}DBJoe{7*gOe30#GL$#B=DIIKOC9f7;CI)5tl31$v_N`tMF?K%V>9H zJ8)hkmSIsma@9a5FP=Yj)!x^`#(U{2E*WE5j3=A)X#Khv_Vj1+7tb7Z?=K;3^Hn!) 
z-;Q>|>c)v`*a&U^+}axiIApVg$qp@khVb^O>gqkTgr%Q{#a->Zqp~*S15Y#JxOAcj zV3~-!ulxMX(tubCdZQ!se9eNlFjeam2>d>BTatY+9?aO&v(RrLuVhU@8}O0J#8r_8 z^eW<9uHHQ30Vlv(xn0>xcxr%=QwTk(>`YW!D-DN&A-9=hBO0Ss&=%#NO%rFumod=kilZC)6DIaOq2 z@h_XO!Q#vBMdD4O*l+mN2n;)UbDU* z-1Ax!pK<4vLy(BGz9bDlt>9w~C(Y}Vg$oY8C-^2>7e%6mZBD)BlCN9eHyK$K))+xJ zQM0?pa?fCLg3%M;Qy$%_@g!HNCt;ThOu(`r98AjQ&Tc^+TBB9L_Mk&D)e-uSj?3pP zcUTbqSIqGL@|spe)vhNCx1XC2>224V9j#n-I?z5(lM2$~Sl8>=?O!rV|1nq}Q?+f^ zc-%4sUX_Fc7>Fm$3=_(ej~bD7JnSMa!>u!-n9x+lKrSL!8`I?#eVaS~F1zGe%D77u zirC&du~>*S&w6Jxp(RNYpEUUOS~H7ZlwleHV{qk$S_BjBYlM-jT51a^Zu}pzB2rA< zFPt!+!;BR|ga;mz9k-Zd+@W-D&34CH+$nivbaY0F?6-0usktN6rg0LMJ7k*!CTZon znm?+^Y>mi&^2WTQ^dA1Ya8%zj@Iu@ka7s2wuv?Jal|2p>S3V&_;y6ODk+_LuJF%#X zNK;xswed=H(`sukHyvmkE9kiGku+;ciB@w&xkQLt8ZXX3gImPMtIzd<_0GV_ zTiwsqE0&*4Tm|MHIk-@+MtE6mb5T3#*SI61*ICl(niHw~5 zX6#K|pSVBehg*k7&Vr?`RVXzqj`AhilXM?I*;Zoh=&`L)3v5IL0wYfxx5<58$kkcoXi`;u_qEMA3a5s?xTv~7;PF`*)9-l z8^o3^^r5m)Z^kPeY5;92$uAnm!Ur@BP)Bth~B&JTEZH?o^I1rBdgIQ8gtN`JpoYHH4;y1Yp#4GW6t1R znJz5?+;E@%pXxlTY+=aOP(*=D7H>r3!)hO-mr2s3LYaJdo9UDld{0+??gko9{JE=X zA+*@rXuph{}f`pTkl4Zvg-leQEKb)fDu&*^n7RxR3Fk?Jg=x_FW@2qOILPg`B zicU=|JaqRSX}%zDvHWr|N;JVwCw__6bC4Wqef&xK-~#@#3t zI_xlDmYw^xSXyhli*SiY)NlYOU^08;MAcs`c|1m04i(zBn5b;UWSeI=FgWgL5pZ=OahC3l7sjINdTsY*0APP#TzBGBg0*2|Qg}=6mzqDAR;5HC zv_>DE4W@J@v{=B7Qn;FRYKy1aWi`=NB--myxznUt`dAS=nZjCIC2~j{ZB1S3(OR{9 zRnZy%Vce8CKU^j@AV>Zpp7TXBMWdgjUih4|>pllO(m`%H=^Vq9K1IQ&n4x4@X$O;J z>7e$~Q3KWkpM0(TOw)$jGZ^4pwvTR1R&I@`YV(L#FRF(YHRtSPK-H)eJBE`bTbrMw zz4vxUHdQPGrfo+IoYgB2AFZJ0i1Pw#rUg8kaqD$IpTG%k!kOS8k-6}}R5~V(yl{T% zbEqW~UToV)*=R~IRPSN>T3h7(S~n8GeekgNA!A#$T=Ewp;pmmANoh*D_sy#$fVZ5< zmx`hCbu}nJKaL_d*M8!cb{^i2LK=0&3G~(fh`cpNz^^7y=;%#X$dNzUxO=~iIk~>r ziE6%W@ja>%c-ou|q-1<)z1_I*HpWg%ay~lAdG_#fU?fZTe!gA11w8ocxBej)ork6% z_$;MTY;q|_{&Ug%w_z!$8%2o;4GFn4LDRBX^n)`~$y0 zW&8kxrY;W<0(D0>MLJeOvzphMu__8FtAw_y6g<(>@uM>+SE*Ua-%_eTtIYpVn44Q` zEQa`9Pz=T@0K`yg*uRs7;BYjNw+)b|ij* zQ63uhC>a*S0slSh1gn3yXX3TOL}6XTx_He^;D*o5E`*C|>vf{M%b7E{AsdUx>iSJw 
zTi*$RW(W{>fIl6jpQHnQN^O212%~#M=d^CEeISiCTShobTh8tlL&r|G3JdbNfc$A# zrPAtN%S^ttRc;kZE@_U~u(TO_#dc(JtRb2v)76g| zG3|*L2@gGELQk0^3MD18G@l}!xzV~<&kpsOPbDxvDYYd5gq4N&EL9v$p4I;d5+8)_ z0Ng9r(X*pM??+3g-DI$Z=BD{6tZtb4`DuhqJNFvP8_sK^@p_gcx{aGN#A7Lj)^EZ8 zu>ci=!6+<^P9M+TV*mb_4!QhB+9_5w&U@6yCd1Pd?@z%{pw7a@6hc6u6pJgK z=^IW-ODn3Vhy)G2_pRq5W8(y!6r@{wDOk?s|Hc3;DIr$t-swLHGTcPb(_oCeHfGSCAD3MbrE-?<$qz zOcDD3J!zX>5d1HvEH0-@NVY5WpM%Ukmz6IF@FEbxpLU-GjVVb4#ZhRckDq1V7sCIl zQK;+e!y!M5;wsfmGC6pWh3Dy4$9>!fOlV{VlZGaTl2K#fCLFd<9tw~y1IVKEoi$k9 zda(Nx5I7r;o;6W}79n{M2Z-_1VaM0Ur>RDOg&>^~FV?!|QeccZt-b!XPv#N_?6ezM zi?n#AD&mq1xR<5Hdn0ajzFuvlq!$R-S9h&JI_Ba!2!n zDt)buuE%#$f?#o@M)Eki&CEK(VwO&zpUYHRQ1aXxoh5nS&6M>+k}P+dL9tX)yj)sJ z@5r8yJ}6ZHJhL!CFM0pJf%*UK#Vfvx#m)U9_6SqGY83#yTUV0X+? zb`Lg$-aN)~(G8}&Gs<69eb5sb-4%C#ScFW*?N37ICL|Lq{D{sT@0>oU`|%>!hBlsw zwEs+mB+zeEDVtCLt0LifZ-Q#F6{dvU>6Ytm4q-(Qd$n27TTF~Y%7QWvV_x9Cs|Qu> zJ{?J5^gT9Y36(0s&x~%)9n-}SR1Jt>V6BE;OIfCe;c8v-69$IyWBw`}%at2Y6axcu zU8DDAp0D=S>HzTFkKN2TK$vNg1zCOFuF5sQYm^puwc?;8A#hCTf~|VdYs(35++WRI zX@b7RFAI1hT^E@VPW>MEuSshKeoAc|j_GwiAD2s~8%ols5Pq70f=a-p7%ZI+4-($@ zZSR|c!UP^&{i!SJrE>3+AutwGZwIgZJU97S0*`tL^1H08lhQ*1J4)UIRJ@p+l^5

-C@+gQV{#f`CN4|<_&2h-- z;*`XoJ@aV=%FY3_3GPl-3+!|~g|^fxr1#=}#++&AyPdVx;NP8?6*%nE@^V%qFk8(D z{Tz&*X%+%!jVsW@TUKvT@aJW+B*N{iq>tnuy8ZHX9Rm`O9mexqFBZ*xH`gfi)9ZKq z4ERG#PELI=$Wj9GAm{#JVl*#wit*Y{@ELZI0Q0NGW=Cb#r1n&W*hom@KYt!^sDfpN z3O)wjWJ9@8ITjFc;`d+ChJ> z3Nd7Y2+ip>F$_Bdp0I9Z3Xw8VCxlmbuRgdVq7J96{)+w++$2eiQ0q22I$R9LITd^Q z$HY}uAWj==!OBR`S`!)S8oIXE4zJ9b_{4Z5?>Q&kIn?uUnZ@y;4yFR?(r`k|)zS`d zIVOpQJMn}l>$1#&jP0&6Gf{iyT;o;>Pfng2XC-`~S4)}j!xFlenWmDYLGW}Ry5jw^ zgOoCo-eSI)^_Ihb+@QW|@0__#m(dUPd4%ce$`jMslV_gh`LzpkpVlczgAv`g)hmOE zC#|Bdvqiz%=P8y9)6Ea2j}F4%K|Iv#>ev3Ort#(OE`wCXR?HeyJjcBqSF!O<*~lT) zi)7Wa0ke2?bnTTYliM8S9_}*>teit%F0)FvFT7fajxhZHV!U@H*g3(?Il+rJT|6S^ z-y@s@5cJpdoo#csueUJtKUygJe`r^Oq%X&iL%6z|L?3(%n2J#ob7PSdorl+CAW~llEpj2BT;5S8k00lb0HXohI;dww8(mRrUdaib}-B1gdNzFkEzRc(!CDkxmMK5 z9`BCUsd}dW>b;8Ww0)F}j6Vzd8#U(Rm3hv1lUjV_2EtV|*MH4MH9Y`e_LZdXWG zPLMbt>oF*PuyVOSZ23uZLs(|8m{3PobpXw$#4@BqVIxIyk#C% z3cZb8p;SoY15Nd3l?BQ+Gut-{o3iDN6 z#^Qfnmj~7e1{wt4B>Wi|bpuqkKDRscM>2GVSilhbwG^xwWT5!A*l^@ZV(nX(w$UMZena>_mu-1p~P4usdY$tr1UWL zv;g$TMESe=PHAfIgcXpLR3_q%M#pj>UlVz_$~6^RxCGPU4KhO!)pV{wFO#*#!4~V7E%XryoNi z-<|5np}j6KuBJjqkC%sN;%ElFHW!OhD@=RkL1uW`7m<8YJ4AKN%t%z9zU6-9_hOBP z4!*w}e)71_8cN)0GzO5aroMTV?zI8J=QV=HbqZV`lVPU6j3`X{>P~fV$34{pbpd8O>5H zgXr!AP(uu1oVT#1C-Ri-S7`I4RV(cV zy*jODe)9Q&=fQr;mTc7o5_TC{l&e&B4-iiX-?({W7b}bA# zS-upmWs*HwGFtY1;~r0KrbYJ5ws86^8XDE_N2wGnc9w!o_IRNK#KuG&?M@q=#7?E5 zdOpB(UV(T)FjO49e{v`jYI2&|y$erfqZxt@Rg)KDJsB_PUXDT12J-qYT`oT}y68f# zNxb-EHm?X7CZ`>x&sSxcf(2^Z5!*M_0cE3{F__*zSqv_yX_%>aI*6u z?QjMl27P!Qlvf3d_KncRv>BQx^xwFsLl5*JQcwe`W065j@^3`6#iI$@i~4ocK`<1Y z-p#Ldu%w0|mP3E2q$wq^3OH-J{GB2zdrIb*GoXYvHpc$ega_m)Jfk2>fg^_P7wX+_ zWxcmTSIl7OQyqx4w?^j$DVp6#M%yy=8293$=R5=qvg9i$7bKkvR$CQA6jU|7KPm}B zbfAei$^60h3NqxFx}^B}*f((kyfZ0D+Op{+x@dq8J2J1zW8{`4KVeCSB+a0fpbXiB z;_MfapK(^=$+-L0^$v1bEkcJhLZ8V*XKYP4Z|m+OI=o)vkJDX7@$*2Y+R}mJqR1MD zW$E_oe*Mekl1Rq0l=iV3JQs!j@~M3}Tw|Tb{Tql&t!R=Rs^Fot zyOgl-zp&?B_lbEM8S@A=f71|Z1qfVLL*`me@$_3CLE>o6;3C0LZWfxiKiU+=CC!&1 
zw}X9j{<0Snn1!Pjz$@NAhc}rqQrWvaE4MM$aTFXzb}?r5Lr7yO$_-I1!_rc=RumMz z1rQoZ3x6QDrTn@W&1Q={{B=MJyC>CdH~ z)tf0eRX_R&m4S-A9Al+xx}1w8R%I9+e7~@9udqwjAWILJ*%9gP5?aqU^}X66G}Mu1 z`H{lFgFUF?5BMcu1%4fUm!E-5NlFl7xnPP^p_~C5=+Ms1C0>z|IUSKzy%OVlo|j__ z+LAUi)c~5AvTDxlj8CjR83`(>Mo#cHD$+(F^Iy31*zyz-u8LMi6EY8g;g==d+44f` z?~RsZ64v2Hl_(bhY;c4-XE_eoNlqf!ZD!ghzIz^sS{rgx6fP&?F~KopK1f5T`7*L# zag;!69cN?kw|%>z;EmecbkL>hUS4!^DJj6A(QNkdL*ANvL*p)ao~#Jenw-9Gu!cU> zxI`k~fBKy(ogh7#^Cq3D2o(nJTfjWV4VY~cvk+2FnDJO!cnpRYHEmipOOjzx8Tn6zm8q5+ zov*cQR%y1$=x=MzR)Y=3mmwnQ*5r0#)#$V5m~*~!MXTg+)f5bK#RFqv6Fx3PLg$&m z^kpr&^*n!xA-+vRA~2EX%6F8JKIXU@)aX6cMFsTc1K)W?Sg5ZhAY4eC(ytuq>(pJn zm2hfkMkv{F!%LquTNE-lJ6**rv~U=CJON>`xozIm--F053VqaEM*;)mbo+^AeJ^=k zI_NjndZmzk>xabb>#fc1ff`SnPf$r~b`m4xI3sG!K5EFn#p$^lDx!PIZW7AVya^)B zj4}mo`ue303Y(b4xg<*Is7Ln03C~Zh|%xD zT|om%zNEOtu*4KpuP!1b8S>~J-eq9z&{s)YZ06;|Y#iwOd=$Gg(wxXKTa*Q@s{N(! znYx)Ya;Av}q?JlyxNSevr6Um_@mED{hP?Gno_mF6<}`Fl$KTTAvTqfMry}{4T*&G2 zNrb!2uL=~n*AI^^l=T62>>%t-i+PpZGq)t*{F7L%r0HPG1X{0D8kK&*6w=bz>Q)K> zxt~h#(J{C1)iHt=NcPIC8NycQ%`MZDDROMgb87{Pg!g(7*G$e{dc;1P@)Bv4th7xA z@5V(Shn-Ea{YmDo5*ccAif=dmdAsqoC3V_aYaJBnG+%Xt2I}+`9{CplarbE~IXGts z@hpc!79_uS|7eb&jjem9+p_Sdv(V+mt_{_gNUGb=tyw*4YJLs?8IBR7n$ELm+o949 z)bnn>vf|EkFW#930%*~`ssFCk36GQbFAdhMS1Fjq?Rt4sl0(dLg^<;euJjH6Gc~J5 zP}@2#uNutca=!djStI7*agCnl=gjkEHWeo2UDB}^Z~HhMy0M6*&H9OkeB_w>{fjgV zVX|Pr?~_Dn?+7lx?(3EQg*H*k&P*B4>$M`u9uEv^!SBp^X*jcJt|M(@U8~e4_Hgq< z@;s&A`PU zUpF?>Wonuek7rnLD3wOTE%_YfQ<;|g{=rW39Xb(0o@-JcQa7k6=Si8^=mKe$>0l)W z$^OrwHVREb+XLrdXSD;`hW8B7thn!S#gwCI8+*L%%=K0C^+LOa>`wYk>u}3Af^i?7 z9)VrVI$LqIhP?wku2B4Nbh%eLHZ|kpAk8IW1J-I)KkPhh1rmlw6FVnYS33+==x^rv zlSz(<^opkK)I=r!9+eKIz`wC{QZtdo-w@lq1nhG$a^LZM`PB$T4k1P8ra7GL5ZV$G z>GhR&9i#Id`zgiu2=GZGRLN5`q(5w--N|&nmutcOrq2nXF!JMm3eMrk3^d z&Jg^<@F%Q_nVehA#K+9#2kK)N^6yGZc9%rTaLh_QRwLz56abVl;qiIp@la|bnJxkr zFTuX5Q-U+CK701R^oN`5#MH9h9KXxGe~dluq{}tp{S>CZk73KY!1Gj%-2w38vyC{c zSfepc#dnw^hn1xu4>)Oz-)eUO^g9LMA{SAAUjABz_~}VS*YUozRyN#`y?(2a>0u(m 
z(1REE>!s`|fyr~js{Yguo||0`OfU0$J3-^s-BTu2Y6mF+U0YVTTWG((WPMSbQ8ghs zhZf}k&c?M2spoNR()ZW|#q4QTG{2f`h{uvF+?8@r1@WQGC)QQ-E|t3h6x-SR1+!V#C{MW|IvHEBJqCkpB(Ym!M5dDu%2@&WG$VlDbH_euiozuJlmxG`>0A&pu9U zEe|ixeXRrVd9{7TS_()3oW$bnJsjtuw0RQN6HJxJw9&je;gfgPebKhU4<^W-Svajc z>Rdd}5dZzFebw?vWsT*qsrjMEgsD~E7vcT(MP`N^!k7ln6~RI09JFTzd|=}VadS}G zLeI**g(g!<9MlY!AZjQjX|widK2{Yr2z^n1>ILV#&754zjt}``{H%K2zz0CF;e&Oa zICRqVaU2hl=Wc+BW>m+!Xy5sQ)VMI*qgm243tX9}Y}nkF2!hW-$&r^z&OZxNuJ3cu zuj~fanNcWD9fhNt@c>CTadh2%B-*F7vm?oj0qE zHY&9fdeKeR*VssjBkB`=!M=4oSP>A1{(pQRN-6!UlsFud94*g}UV^KK=E|F-u#sx0 zrrW@eZN5JT!j5e?(h7IDkhjwnpf<+LKJh@aStZ%ff-2)@xyg`lDL=3pI<*<%Z0`j# z+=5fCfPu(ei9;;=)s$N6in1wGzmTkQ)rS( zU2>h&TCuhfbOME6-JgGpj%V`Q!hOSateg2ghXI97Xzv64JkH=qRSvr5E-H|A zWw%$v+doWIt2*TPwR=)O5zGLHk7Jt=zq~W#;V20I4687brxJ%7m3jS@-X5di7-tbc zTO=mlfPJ1JWjPSol0x&n)8FuHpH$@WUp)N3zb`Byq{|AXD!YUTJSQ*$3H)>gNQ`N+ zPu-CKzaB`lT$j;24rtdb?joH&)<@X>5q^=GTqqT`Ss1qo@$g1=6c;1_Q3a1lcC~CR2OiDwQe!R+6s0MYG&6A(GWK2as09+E+DOWmDhwq{g}e$qgf_|b z1T?kfqaLjKf{@3&vUR>1+v_log7$homY;&JzpU(5dnF+}9vJiseBDjZW}w=qD0w)0 zJQ?21H7@X=$*|{tA&r;Vl&>7;@K4N$-sm$k@@nImDcxL$;^R2vGaxwW~NN5Cqt7~8JUhy?;(AL0{^dFt^)$V>#*hdpd&1#@h?f71u76YsuFe->a57v3;&T_ul9``E122a2nbw^~XlcHd&v z4UNTJ>(5@nZ{Mzbxb5`g z6#saD(0Qhfdsu)Rp5lap?8oWFF6du=psU9rcTs{=S*gjlYQ8?z;A4|4!u5ZpLIK7(RX0y6LTtpR9d+owerT61u z!w)Bt|49o&=0d%MX~mTnY6x!;x*-eSTgH5Gyg&Ih_VT!yqxx|6!!V)SpprW~kRZnw z`FOYqi}`WvBVrRj6?EE4=b$uoez0X0}0)asxBUsg% zJ+Dwp#D({a_Lg&n-ms{$0ciGTJYGPH^yn}~Vhwig%)g8i%Xwji9B$6HH<;y{FBw44 zR)@b>H0G=%b@>n#_Qin2kIx@pZH(ZD)v(?P8-isFdh~diDep5Vyy3ZZCaq4CtL6(h&I0jCuGtQSUg>GQcBX*r`G}!iYVnl#&{$~xUI{pp|AuhIM)!xp zSj0j4-AukMJodaig7$))V@P2e+5M#zKaL~VR?QK5Zs{^Uyu;XOSr&1O+OqI- zW+AFNJXyy415hvmYRz|uObXz{4D`L^zZZKpx9_&3=7Vw3VMa$z;lROqf@j275eha0 z^xVS*^=IGBB%zE%(`@ux3Y=Zg$~-*4_4wNRgyrC6U2pGF9L+BL)KjObh|tjSWtZ!a zfh@%wLIBR9-5}RL$V1f5!{8$qFW>DWNTIDu)1bJX_*-L< z{(<_rt{h0+He`E={l8wbSKXRb5E#gE-w5CQyO~^VhiLVmJE21fv=@Fq5I<8-%*CCR z2sk?{)0cnSHTA{@mg~>Ir`FE*g!Jltd)HjuP~&@8rkGmC&v)(W_JMP`BOqZU`eDaD 
zeRJq_C-6Ol`+GZV*B!{A^*MeZ6v}&yVM-J8y@F0`wmXcdY9&3u=aGNSC2t>?8Ec5J z5o_LN2TZxpnqYhke-XQIthe7oT zx2MeeqYJcHR`~e$h&ri;Jy(uhK_Ud4`!D7&oewZIpCG@y@^gWhZ*oAamM!!#yx)QV z1UCJehn;;dss(mRm9dR7Dp-M5&hLC8vA=A4SIIYlTkhP_)z5ug;ZFOR{&@JF$CkVv z>qkibuWgZcPp+{w<&nx^Lm~Ma>_TnE{9`nA$l_?BuPN+qxT`~*ud(n>Z^Z;$rLy`& z)BkWTjbP{cjCq9r5-@pSn$e;b3m z`eMl%6Eeb`fUqFuvO0GOn^)!>s(yW^nmXNJ&abowP z>AqYvJmP`z!1sbQT|omFK8v(*)m*8%@&}RcxCC;leq{<9damqw)PCMp|6teZn6k5P z+PQptte>Gd$(-Ax9HAPczwClR+iGZ3C?t*%_V-4xcm@+|NkQo7Z z>|Je=loW%i#>ORORo;I{%a zs`E^VX(9C&U`}AHp4XCJT^?JAoDY-$Um5!H;YtmT7VfpO&aJ<_;Y^2u@kRs53~*=6 zfwXuxc6YBFW1@8%qpSJLylL!%ZAHP{+$n@@aoKFcjXv|o8|CE|)2zep5nTf=SxM87 zI*OA+XmdH$riZ~z&(?65&xi>f;4C**NFeAm0>nBYm5g6gP#?d;lub#cJu8w#rQ+4fu(v&lH|@V&gT`M!%et;TBAjQNPxu0IGr<<><_wE({Q@IHt(2Ny9K`3OF7t zt>w26<2v>7V%klVB5Dlw!bn{?RZ_z`Cc!30eJyp&n({5zZ&1zGLF}Jx&v4T*fS*^u z@|Zfm!>N`2Ah^^Ov4|Q%xUp|p-FG!oX^9z&@HdpRB8WaPXwx+06!%9s50LBv;(Cq6 z&(fSkp;LxV`?kVGnc#0xxq#-q%7Tt(wx8H^Q$6}0%+Lq|t}H7Hs0QkZjc9234(mf2 z)fOS)y=dKDn=|2t+d(~#p0RzfIb-o+8`~7>ySOy(M(buMk+aj{eRm!YX(g_6$04g% z37J~8Khiq)OBqTd@E2iwN2q=A>(BuS&^CR^bPIu}0$rNdl*t5i(bbV;^a$q=m7xcZ zXb0`K26vf(Q@evK`*qT4!7N_k-E0l%Rv^ox{IhmPhUzipm1!lbZRx3X#CxQP2dtGb z&$CqNG-RNe;|^FetV0}Job=x1yI*|5%L=l_)e&6&;fJIldZiB&e*_u+OBX%Vfx!nI z2(*j9{BzGPLq}n;d>u<+kf<}8a9bE~pPM8^J3!Mki`YY?cyiP7vTuO5i)nmvzS;x# z@8@w=Gg{Vmt}%cFoq9KfoKqDHUQYX~Pe}GW%eKx_%FzLPxAZicgHEO!6sXjhz4|I@KzF0cjf+_Z^sC`nU0{2Py~I zo8j0N?|pA%H_bZvmJW=kdw%V!kcoVAUd_Z6?bJ6vWUJTvO zmniqDtyNOfC<`(N4&Bv+$3Jm*j?c*3y|~zAmfcf*u6TiUv&gYR9Q3XQgHLgp&Ic5u zZ8?^vI$+e*~84%5*;N`uXTgC){EtQJam1j{F#cTn6Xy2 z6-Kg5eMsXxXRdc}_;V0B6Q*{NAp`Pe#u0yoZJWa~h{0`b%5!%V!GQ5_cBWC2ZdZXV z%6@%Iq?0Sp(1hL#@2ovhO*bP>If7gcoWz+JVhnFKy(aP;{A4wJZTcdM_V%B^2_;N0x|!xJ<@6U0p`6JBk)~{njn#u5I*-D96!{pva^#r`=5f(hZU0g4WeU z+;o7J3$VMy%16_!7!UHA4@ZJ42*{|ZK&Q|7PFwK$?_PD2-vB^^7n#d)n>{j&yj88| z(O?*ITZx8kW?C#Ng7}gQ*8Ov?Kn^D{@18%lfm!KcYY+Rgw^F6MHSYBhr{TN>8F3}b zcN55+Zof|lsH^OVH86c>&=}pps{%gtWU^XicJT3iHzpQvk;VoXimbnUVfKwu75aYt z_>ib1zb9Wn;Qo6J)JLCs 
zG)?eS<7Aj@vam)f8@|WVY`9mCTM9fT=jJ#>$=j?q67&Hhng z>ciQw^vC1wvG==fEZXotA?qA;;d?lrD*oXA75+p*b0V(1{kZH%bxAe(2RPHAIjf1I zhmFr9$3}E7(vqc$XXS=y#)OobEMyD|JyzE-c7l@O90e!dzN^kH(4`uJSj&9t zxFk%`^r$UcyA2cX0@PezbYF*gpoBsn*?cWM^((~g#;ty25v7;%-^n(2w77ky6e z6dw|Foyfd1#gM94pGyTZZKeP)(APwm8vl_xDSdLB^`x;(P0oFt#`RdyiAUSb9O(6~ zP4)cnOGnljC~BBLCi7OCadqHReBYThAeL0*7ZjZ5HXf9LG6b1z~_};T$*3`=c(ef&2AuRS`95Z&0Z2V}}k$_P4QyrK)4U-U%+Of5uH|CHoHvv;%k?C71#1i6$wadTB6pTrq0>FJhPWBxYt`v^RR5HT@HvZ$ zBw_Kbq^_iDh@L=+;Ed~>7P=ig5AoM$_hQKZ7kPOqK5^DvBR}_FPUORv#b9aW~A3b>-saCX(tYgbULYixD-ObrZ$iUIs;job{-k1`nY9!_LR4 zH`F2q8WP?7YEp4=bYHGOXH=D_sQ>DF^F-T893$#~c`}p_O5LXTLeyg;Iq;I(=-RA3dKVN@U zF+G&Vs&v$yI#1AXdSGTg(a@$^nXJoOkgvRoG9_$dyQ(8+l?a8;Bd|p!7ggl%N_6Nc z@i^Vcr(;U&SsJ#VR5faePxQgF`EjZiL!;(Z=Yl|Zq5g;!`*gH-8<-(aOouOo@aw-+ zrL-bwQ*Wts(&fXlw4;zP@?Tebmx77n;u2rk7R61L5aY0`aX$fu0JFT1wIU(sP25|z z1c4`E&fIJRKaR$NoX+T6?ENSh?{^j(%yqA4G$+?>C;3i((Cxe72-7wWr&aTO_TR$&1Pc?=_Tff zUai(BZK!cUjqY?tI~<`f1)AyZ5#^A3hGCKU~{yo&+PqD~0~m0?7Q_Uk&Cf z{ZT&sR<7;{6iC4$;cMzAsp5+oi>X2p`(B zXu1-bFDOIGPkF`c;K=onoe`*UFe9ZcEz{ZatrzM!?Y)w<7}jS{cb}9kF7Vr*t`$&p zkYMrUw-ly;FP|(mq#`%Y?iI}e6A&Fke#0!;Am~oW7#~|W2LYut;IYX-PNYmW;h2!Q z)=xVBB#!z^yx+QG)?S;vZZ9tnRzl>qy}r`F9%pJbf3$<<&i8NSIC7%=6s; z0_7z7Fy+0zARCmQqH|nyj@^|w+izLWF~NZIn-S7VmJOguVPS^vVUXyG1P_Jpmy3l# z<~xJnwN=?$!*b{4f$id~$txTMQh~oTaYD3I1}dTp>M{JvC%$QbwtB3*EJyQ@!CH6l z==)@FW4wZ&%1^0JoM&3M{eI>AAFkdqxUDAZ8a2jj$1%s0#LUdhiJ6%lGc$9{6f-lk zWoBk(W|B!}mf`dB&di;uTk1Ne>->B9MS+QsIK#OO3AgRRr?$i6-&9%5?v2 zELxyw7>c+18?7Ejm>0L+mo_s#Y&xB&-bxjP{QEbv*>?U~wn28UyMeD(SoP)SLU4Jl z0V^LY+WoGDY@cwI(}>^3zu8!=7m%Inw52ms^_j54Z*F##rONbQm$G;Ajkj_rX^@=n zt7AmacyJr)D4pPZVKFpSd8;%!u<;gZ70ONMew*05tJVE1J3^O0AA$}-uKn#ZQOA8v zlU?Pe;yfDknHZ)#5{ZS~OXUmUN#SF-JWYVor{N#CDskV9*jT)v<-@{HLeb#V&AU(- zWZdAq9WO+HlbKf^UW7_a^|mcgI;~DU_2zSSSJiNNASUfhOF_fnorqgzc*dJFgwgbo z*Q%hyn+afLxHg6Nc}elJ7xqAO6K%HVCKyv*+#7XSNzdc$QZc@&jM-gtxHzrB*#l1{ zk!lM?o>Rpu1sHx#X#GMhlFj{QVA1!kx*hQ{F+9GC=8R7q4sAK&bMZxgZIJD;tOkCi 
z0Znk@1e{8MD{@2C%c18J{(>?<83%m3Orn#gnA_KCpw7PdqMZ0Rskrem2MU;FwV%elVmd-? z$6#KiyU44K!|!_X1A0IbBeGGEe<+W}u7!KcMCinjyD`M(eaFw#qtKQpPYMg1pyk{L z9H5T#t!B0yfc%PQ@hbJZm@?-eRYVB4$ZIA9f=B-aFe(TIAV6 z&h^Y}$Y2wf^@y`|-^(bbc@} zw7Rioz=-gfEg7Cp6yez=1*%NdvnUgd?$aY^ai z4Ytq|(H5ScnyYtNXk;Wh0W!fx`qWb)Ml@arS0vsNHR7_9n^7ci{>#u<`UfcemI3y! zvE(fwBl9g8oC2_Idfk{$-an*R%^|C)O?LY3M< z4e=rmD^TMUpFcCtORiYQi)soc5ON4eWXl4a+__1`bdn1%5|sAvy-A{u;oBm{ z|H#{y@i{WP6z<@kd?}DIt>fbvon<@x7UDQmqi_SqPe^4Ia(U~44RD8WDFpB2>XWfVO zL~_UuQzztxJGP9T-eA2-w}!`^2t|Et-*40QWL)54=}l$XuAk3HekGZ4K@xAG|KiD3 zgIQ5H>26VK>;)=Kcrd#JhFY`YO$qk6&NZ8-5WdJN*C@mSQ-~MSOuXzCXJUJy;ixNO z)#4!gvA?eH8a~#rFYwp8Rg*LGmFM07+I#S(^*S-9za<=6cOZ~$;6vq%&>iFO$9ogu zTtJan(6w;i9jtnjLMM%KKI+%`>WFZ^kRaG6bv^k%CnzWRE@H?{THQauaRbiJC`H^2 z_&GMPnaLd0Hi4ZzW&YP88H#im3zk?r4LvA!J(tP`^lu(2#mXkQ#mauZ+{kqQNBsPU zHvPXZLF#WcPzrEjMCzZ#RA<%1udw|kF%!OAvWnAG@P7|?=*DEvs#=tK{0V6C7%WVT z9a;#~#SP#%AA+x_+AmGGABd~cO0+ycgy13a9?cb@aP6oJpP&fznNO~8OGu20RaM3r zC2Ksg+sGRU&~pSL{<1ucoW1Xje2g|RWQJQ&_B*9_$JJA2WoDb5ZBAiN8Wo#h%Y?Je z;6)_oJ(Y7d`%6s{LlR))>$1arZSy>;vZYvBW~OpRDb68q~O zfy|rkClZeqISOVHzJnZh&iH5u`>HbO6ZIM}=3mG95B71@EJ+M~l&`dY%+EJ-OW=hv zOr|={dmNb-N13WGR{#FNo#AQb*$Gvrm;D!>`)DB+0YWn)6(ww}LT5)-y)ixE0!b z#1Fdb&}=93B7uV_j1G&m>q4m~w}NYtjc$fMvq=4?&Hg{?xcOF1^tYRX>7ujNALRT0oDKeaKU>54mXq^SY?PH&?&=d4mGF0J80yiL z`S}w~+k<-P;zQ3%b3L_3OhAjSkYTb@nNGoPt1qU)bO8QicrzsyE_1mp$>r5%bp%rv zK|Z1^8Q~~9CHJ(u{Hp!_O*S4@Il3kTa1nqd9L-C zZ<3|cI;(JfPAZUwjYPxg^GKLf4%mDtdWsvcyO&TIu0yRgRfJv3575tT^L7IvcD zW_zgM@cmZv6lLM(A|G>&C>b!@({_g9DS( zcfU$yOl5?tDx9p4()}N*!hfr4@K|et-2YiitzF&W!r70o<~@6W*4<#wxCYkk&ytm`~=~IuVqksxMWXFZJumR4wy_#2-cy9rI8+_) zV@hjIMt>TgB817>-=3A4NaoKRzh!DR&g*w9V3^x|B$&go#*2ULoLj8W2s{`|Hri}= z2g}YLGdFCPc6v#gZ{9#DpBywSC3<2yS|TjnW+eg8PYYR+r%oaSsSK{Owk-@h&!S~X zlt!B+mrg`G=ab6452_f+2M6=R>RW$GUS(mnY>Llg#NnkZbkE*|0H^@=DucOc3gV7) z5|j(lz~r zjr{v_!7tXl^JLjR@i?z>yUDzd*8(546=U)j(pfo@)%b}gyQ(x6K6p6?6IAKH-d;MK z>}<1-^9BSbm(yIh_*|X>C)D@9_?|vfC5aeW%0DAoPpJw1HE}=i-@3K|1$tcnSdW0) 
zim=BhGCUsjdYX)^KNC0ma3nWds>-~#{<_d;0*mWt9|(e&20Xwv-S?&x28~iay2-+z zN?2An{*ZKP-7Zs7Qs@greP;UpK@OSWCd^%(N3d6DcAxxmwgU)k({PR2Ej^`>Ccjkq1)`cE=NoUsTjz!Xrrdg~8^2sW;Cwu>1_P})hxX!k0a0IEXXi9k-|Vx$5+2QGJspv!@@y1q zdaGkc-t=XOahmwms@kyKp#@tkTo@^J#HA7>;w*Pnf0p%i)V$2U)(c)bl8D8GPHDz8C&rtdz8%y$n)C zc7RQ^?M&3_t*jd7jsz}srBfPsRcq#F?K!sN5(YW~iLyhD%T*0FBS`+9ye3jbEL_Lq zU5Iz)IZq<}Jjn5MJ$`l(-W#`KwNAgi?5gGvb3}(>Mvo4V5a;bJl+$@;^fSL<)y<`Ly=z$?kkf? zglA+zOwihjMdQFm@4@1km)+HB4-(!J=EEiH3ws(={jHF-nA%Wp2_X9isp@bvKuHiFYPeax@=C!ydg31u%6de zU^FT*;KVVfFPKb)9?H=+(={?)0ib4}dLl(47-JR6-C5s8t`~db&*p+!5Svf+FX&n( zM`Lr__D~>b)MOY;gD*>ILN+!7%1xeLRAQsj%%wDkek{?^CmdI`0-U?eoT`^qCRFw{ za_H2)+(_slQr?rpjYRQ_`D<7({9+w1q2^Y9+N!Nu3jyEiB0f|5|n(2i{=`pY7oahtbiD z;wGdl&$EObm)UBX8Kj01=Ru3=)|yl=5;G(%X8q`IblG*W3PKD&P5GPV>(H$CQ@iXQkd!6t4wl6v&;9P zbcLPdxpPguWBuIS$hC=eTy*+(wzFbdlK5W+(Z@XG%N3c(UTRloPg zMLlL=zWRHMh;!kyh3SjBQoiA9I#7!g^9xMlL)E)2(<}UE zQ|ej9`zlqIkrYHx%>jgmV0K036c!Yf0!Q{3fSDZ_{oK8$>D#EUvSN;w*D;dB%1ut_ z8yTnhMqJ-@Vv#l}q9jtbmRkvmvx>0$iVQ#pooB0lCQOp|t;ALf11B&mH63aYCLb@E zTQ-KD)0;2?M+q^k$78zqs(+hF{_y2fL*>n|c%wFdXE#`PK=jte-iOABfUn$2)k zLzD0wKb&L(vkP(`%0jf$ZoUF3Fy&7j*i1*XPvx)wv_pp9w3IVCuj67iqZA_X|J(x@ zx+hB3W9Gxh!}y(JK^!S2u3(9Ol(7mudf5McLfYn0f_%rxycjR&75ygw?%iND>&^Uo zdxcr8mhEDOHt+mCrF@&$-~Q}DI&tX15~|9ddTx=20i3#XK}IjyCFr$R#I}DmzmT?eDKqF)*})2NK^Q+ zSo&bz2KG~F*rG&JAJYU{0+YJ~{I#)Q<`3G;H*AxNRqDnn_LJQCk^NPyA)+4)3SV%f zin`1}NAd1}Urg#~<{E4#Ilm%IIBVU#pGM1#S0{7i2}F^l@CAy_f6Sc<5?*Ik{ar<7 z*j40cUEJ6F6)>coDgC`%UCujr?!3aUP4o}Z-XNHXC>5+Ys7mJX=PY~^zF^}L?{G>Z z($F1{>aQ{JT5igeCq*O%b;i#BOv=h5pM}aWIk-SIPs8>2QrOs}W+mxvy}E%-X|pTjjSk6I0uuZ_{*zMHkD()@f$`VVa*dld<(cS> zLU3VcC^Ir6c!iwT-{h^BCL5s^e@o@hbn9?khfLP-z=gE9?xynihriHMFu@$p=a43@ zFBDD!c*QOP2q7T74WF;W6mmTd(rK(X_{{erH%dN!`KPH6=Fl(OaVTZ|%FQXX0sHCLDcEMcR&h~^9lle7xS+O#$4YMdFi{k4juN1qHR@C9q z;vJ;Hc?(F)0G&cuzfifdEZQauY&Uopc~e zE3x#Q=n(Hif2)cJVNK4U?j{fyj0yT@^|s*Xd0Im4ds0Ii7(Ix%+R4yx1B1fcTppL} zb8eSs+})0@lTz4@R}@D!Jw{*)-T3o&{Mi)hEtqs4Qilf^e3wak>;q5O;>Lo8x)`B@ zmKukew;mj+S7Qjg8>%MhHMuaRis&33 
zS#qF+LcSpV{erxz6E&kv0vgifed5yg0RIq*1RILE222~XY1;IDGyx6B8%oHyl_{UX(`=OCX3AA`5Pee4^H50eiX*wlN^s_9*6h@ zM38=1Vy}!0FGnJ9ynN*-c#G2suFkEN8v?*x6n8)T!S_*-ZM$K-3KTkWfHz=ZkqD3& z8O$Jv@#ap$jBfzj`r8-U^vfJh1yfD*0`B61*JsUmelx?bFX+WOOZC zblX+#s(#I^ymYPLa_s^f4qoEH*&=h$<#+;kT31`Xe15qxva&{P$Vi8>eO(go2uIU6 zBsOhdWcbD{4sK&{pMfocbC7OTbFrH~d7I(Z9YdshcVmnv99=3fmD5>hN`h2?4?_Lf_UoEqe$g(xwTi%)18Dsa5m<(n{ zDAk!@y9nGul$L$A9N>#GX>;18bGGAowO!|?L8n^g*;{=6>5~GgQoT-YVF*m8cSWdp ztLnr&^)RNUx9tMv>aVz7uJ^fH8(5Uow1T-dLf>NxrYf}XS8CxL^RYpd%v1_G1k6-Rke}T-S=dWq&K-i^1d9#$L+lgTs2y+PCk&8ot`GYbN>An-I^SdT|)k zV`~}$8N9Gxe+0ujzWa}z9gaRp2}nQF%V}~+e z5v+kU@`Kj3H>iQe|KAGV-Kh)t8gkpoANJSx<6*aMsi2DsyW3fOlzxB~RPj$}Jakrt z!30{UodOt|BR)rQr&#Ry3+{|bGIe;@qZmdB8G?e4q#(PIKnJ9+4(#+!L=vt zRB`d`+Un|=E6eR#fLe#~nZKT}19MKG6Kjaz+dX!Fl9;7S#CXz?Pi#GtR*h99;{vnyNrh(HknAA%ai{%;?- zRiaTB=7w74Su0iSr&VENWTObNWO z^5;99t7|{x(>>nMENxGpm{1%=Ry+?zXG3ex&_*G-{j!Ip$IoZ%D_f|w`|IFm@m2RR z@28zQmZ+YoaF)mCr`DBbh3iy9kRJx)HX2U82A#s_dgMd4FJ??fi)dlK4JZB+S@mx8 zCKM;}D)ws-xP&17xAP-8WbU1kAQ0J$Um%O&66VnR8lPa6RJSxk;RSCq(=1TehZ;56 zt4aBQcu?BXefC!^>Kf~zRhBBtWcSjrz!`+{K?wN++DIg*HPe~@q~#sfe)R{#ix-h@ zy20V)u1@mU=vlIzW3`9|uJdpuw=@b!{gjM=Dr+aD*+wvn$ibnofn1fz$uke?M*ME@ zYUw~M_nH4K*9L&jlrStF5e?jXSh^ipE*})n+oWg^0tH~DB;f!O+`OJcI8}&Rgl)jI zbW(48dx61XVqzsr1?<>-yknHhjvv|&Upx>D79q{fnH*{`V_vUTV1Ns2J}|CpRVgeJ z@d#9&>I`_ZSX%i7wJ=t4zk~8BJX+E2;A)}PZPzBo)FO|BrJMIcIByo)L-YSs_@RC% zK#=nATvxA8KEu2G2e@;gz+SMuqd@PsFTn`^B;-CQZQts@bq5P0pS>RDz-617NrsA3 z{*DbNw;f8>QY$`mU`AiS*9bFdKD4@3ykR*pMIj6|^%>N27ptN^L*R}#)@>;oBPJ0I z;)k^>y=HrR+U`=B$i(J-B!BblL*NWQii15F$NnY@-RyArdH;~pE(}n%z0=m&y%n+< zOhcUdJ$x$vgA_*cK0`dputJ_x#}9tktOC z0|*{N8EOQ9d!a6|EMdE#g#p2sEhUHKBj|6}{Sj^*lruz2rM`fBni8Pjm{RIa3x zr-SK%CBw%Jc>v4%9zCnB_l=@yMj&7kd;_Z4Y%bSS)_LZ)f)bJ-vGcJIxEXx0)J z)T-w`LNg7O&JjcBMtd|AbBqxXD#fb`K-_4puLF_DEpjJ6m&_gquLfUeRxQ{{s! 
z=Ir_}W@Vi^xCUfSb{)=(9>m`&mI7&_}%w6Hi z$gzP*)@y={hb14jDp+-@HvVVbfgi>AJfAijvsewkmiQOr@osdg>T=S?7wEi~bZW3T{k@PDCZ6irk5{n6 z!V@6#!nEk&a=#xJT_C%pVMi}3&DX`nldbl_>SnVv6j-rr((On{1Sax;e!KXrY&et7 z{_IwUlz8L%31KlaCq=Ue!>95S7V9rw+Xl6vW1;ykts4rmK)qH^Q<_cRrWWy>1EEyq!3E1LKNr${W%c1j?y`6u(^W8r6lP zsb#L2!-IHxi7fcShW34-m*dts`KD!v=13)JX!DRvFW61D{Uyn<$+O2HDM!{Alg!Eb z_R`182*Ziw9NUqPrZl-P`F&E=!ACtZw*~nnufP#nrr@ z_j}e!-49B~hl32ftp#^XW^&2iF*>ilYguscp>Uvl*3Y)yz{aHUyQpe8nyG%ViYG1- zO16kzZdchHRCj zA1_s_X0bP^HvOgn64hn*10_Yb(6>OXzK{rJB{N(Ncl#f0`F{H(na%n9Ao1=mr*GWq z>q5E_5?d+w0t}Xq``-jvCB*v-(Z57qyB`2h*tPQVBjW;hH`9@X9GV&U7*xjjMGZ5R zl*oSQ;G*$vRY2~YO`GCNiUAdLkeF426q}0?{+RdVT02$d+pg$do->a2{ahAh^SQ#t z>h2^EsIOmqem@XeU)*>jfiVa)Sb9)bKq%YEps9u+YDtyE1&@YsJ|s&}5oT8~Z0XRh zqZ)(0xT{mBlo|S(9v_w(2_!*gM5ZB~gQZVex!S-asHbbQ=>piAdu{5A#R#oa8J6=U z{&*}Vl%G4MNHgOog?2?F6hz00csrX=80lzTxi%z?+RBo|Ra8vQ;X5zo_(F48W>WA4 z2G^<;@HuZ^v@@3F83Pz|A}!%!VEBdo*FhV+_qJCs!LymTefxC97(wXd4gHX1654gtScKIPyUMlyPZ# zDfP)#)`Qs6caaFLfm@dS0>T8I@pTKWOR!qpgzYGptnMoA(tqsgQ9v_)3-%h1(Vm-s z(+={Z|HYPUK>d1BtIR*iG*<<>Q=AntH{U{msVOO`g~>J@MkdH4sr9{nIwOD0PJ*x& z#zEAN-TIH%U#>O_85RF_zr?!P#PJ_0*~*6Tx*~`p?vAvV+*F=L>&v1`)0;TtEGj6) zeDeMx`)@Zcnv0=G;yOY14p!As8SV_L$2h;EyIZ^u-eru8KvzxoLV;R)_5mkg&U4Ou z1jOWMf72g&I7eaFZ~1AA%z>{5uzkm?emEh^+J@$`OT?qyF^ojss~n+NiX}rplv$P0 zAet>X^+bw~bR;_VlotK6OoWY`q<9HnoEuCWHFgbocTcHZKn@P~aGiWH{m8?+Yl1g~FBCRT5Uo@I7tUPU*J z`?Su{;?e^B#A`dcEztyQAZwDKY!GGz*;Jz7@i&NVLmC|c;$f3b%mK3m4E}#WovfVC zNBJ`!R2pxp17=4$=b&M`iCK@KtJb!|*yslPS8*$P?U<-jq#6T0@Z<6i@aMVTd!vOZ z)+ipsQn!zhqE)w@Uy>Nt2|K%Boy}|Qb|hWGuvZW{phioD{IeO{XOZ>GA%BxjF-9rN zxm{P#x-rC9)S}#qUFRuU$V&|Jp6y8!w?4Nv)3?4{DqBEb$PQ89ij5NBBUHqXJ-Qe(dlahvkkaFrco&18dW^FDWSTy=<^X5eTb}N{rV7VdMXthn6T3L-V zhgqX-pE<$iY|7SnKkp165V&pYf{L2N4J~${`9c8~Blk|AMnthCKAjX*ckVXhb7~6q zad42>z3I}c0@GJ#?5SSir)LmZEBJyB&0Kn^3fUaad$u=aB(GSsUT_&u?Z*B}h4G1_ zipxF`Bsa0ZKThsr@n{7-=*m6B4LY?ihbvF|MK4d9kp4Pgr1reHlu!RJAYLfJ!VAl# zu_B~i=S5JT%W!NJL;Yz9{@2dd?YJ8of%SBr1adE?hAo+~WyNNgOp?R1}A&1iKKm4Tx{Da?T}4(j7n-0*9E~xhyT< 
zZrunlExF8Gl1!a#&ziSkEBASZSSZ#PHvTM2yVO2(_k)c zt88a)ySbEdV4Tb;qjCS-%v+=B7Bhqn+99zDo?H$|o92Av$n}l!Zx+?RBZ~tIn-Fr! zSAH%hQptSEK3`m1wU)Z39_fv!iF|-Hvg!^NknU!3!&Sv_Vd;Ot zBlrpRo%D~tpr7;?G^XFkBB4xu+kb05TqQ(}8vU#_HqZSZZmPV!xJ zlvO1)ohOan=^QU~JbW10bX+`amRCKWES3k>5OqCE`|4>i!eIr+D4c@1ZW4Ac+bfF3 z>%8v=Mt4<8NP>n>>X4ydYg(Ge)sOac#`t~&+m|h=A$a8PK17`p4=SnhnlgT^mXam; zSZG?k{AENkX?(zT_>rG=o4&KZ_uzzeg>gFW_)u2fdI&tC#GDz1WO`Jr=;>H-RJZq@ zdvAp`>OVrZJ%$K4WWPK8;BUs&{5{n5^&zga`wZ@JrfG@q!=k34Y4sfeW%a`!?v1@3 zrRkt2M`S_rhInO7mV4GH^6v7y`mJ}EpKqTrgDB+$8>0pR;W zlw{}W;s1Jf8dKl-y`ui!5kdlH3b@}Zn$@Q}^p?wDR5Rm<`8ZXKpcMh11&J$a9tS@T zn48)#3(2^4_naErGQx@GIzi+)J(FA3FwL-#@`iN+aW-$2-k0ieC0VvP&)I2Ms+LOaj|>P<8P*&;ZFon zqLymTOC3#fy;9l#4ww*|xW@9a&FdH1384zY!sH{bAT(;b!SE?Dt)Ef#xf`2#M=4&u zaW#_4Mxp0f|y`F%yH~K%XY7mTK(EWoL(`Pd{D9!r{d|^${J1B z=5ex}E;C9IE!Zi&97>#b4Je#?)Ut~}U$!6*jD+^~&bp~h6mgp-WNQ7gxOIx(* z74(?zh7>?`UxCx!cNX9XQrl`BUd*zumuWemUhfV(UU3clI8gVbG-GTe;DGBD?PC_4RE>B&oE{YNr#t0tp7OJr5z`&k?Mja)KY5#LpgV z+GeY|p7{GD(tpmiUm)IEdKmwnS8jDRiO(E9|qgjzu%)Mr%t{nc^hsm z*nj+Uv_M9mbNF7*vO&OdJ&R)6!?j{|p%g%k&?5CDK}avMmABH_{uVR%>z^YfSzdSn zr~xjO+33v2ov9Upu08#=*07u7zeUm^mzmbiU0Bs#bsfMbDNba?GvHyRCFt6hXQ)$M z^ddR?icRY#N^Q*xDAP6Db%OC)_UfO8h0Q;koTr;Ex5#2Lkm++Au^zF~wepF>R$6Z{ zkvz_i7?ywnG3l_^mOz7O$3aD8*{0~(5TF31H8eNji|sqj^V_I4!ki7F3uZ}ohwcG5;i%(s9Fm$2M9J?w$)5bfk{ z4hID@{wZPG+ncC9>5oBk-qKnKCBH-}(Rt88`<0dTef+7akQ>i*QGU?PRmT9GnEJ^O ze~Z8gT4hTW%+Legw-h7qh6xQ9(H4eVF_#-qbIjeK9+v+TDG&UX=r+NFx;-oDvEvn8 z>cvVFT1VfZwx>ov*I`hj6TKQXIP{O=Zno9?oajgq4tdJHf`@2=*K zDMv1d>oCNndUvK6zT|BJi=NN58+pLfnx~Xm$v9q0e>#c(hH@+GtQjep$O0*ue`OP6 z-qtRc&|}?bzh~GtpwVt=C_D1z> zP}Ut|$$2qOIzp;=^m*Yv@dx5m+DMCC^|HWhySXFdQYzCDD}VG0-cU#X=Hv4#QVc0< zEJ&uh5$E*^A9M%Y4Bm19H*D0uEx3%I0ibI2?hEBo)tIYV8^9_kh0~EUv~^E_=f)vL z?;i;Aoa4mtWA@jfg%E@w1M8$dfsP523-&FOT_S9H_~g@yYWPM>lBa54k5{Z0p2doW zSiT3!DOyaH9ifA|SoTd9zQsogkHu@g)8br1qj5KScxW-zcR3bm=(|D{0<$P_Vg#|5 zA!sCMb{1Qi%P33Gn}9a}q>^^@1zqZ}XK9arlBQ!Inrr?(c=~h(cxKDcU5&BZL<2f0 
zuWBNDc%AjWe}B(5T85VEYa=!*-gbD!UoShZrKtw+RfTg7Ww^of|1J~1b}NjSNEH|} z87BbXSpn^hphpurJBCl^*W|0KRu1mt)H>Fa45-?wkwY?N);Hd7ulLTZi#xVdb8o7v z)s2ejh8#6MMai3mEz?`{sZQs99tp|qAWMWSg0QFcYN%HH=mED3k|g@;^IBZXg;8SF2eUV{A=+RVTs z_$M7aji!Z+{z|jy99g#Up@BKbRQ1fH?pjFv{y5}vHPCz@j8oJc{moGETd=xz*z|{E zQb>;cECrE$mva`g+jhgjQu2kP+haopvfEdKyYlR>mv{D~aoYDl5MI&wowTR!)q8UF zdmCS`Bg3a*Yu9Bc{F-3mEF-_adHFip*I(-N?}Z-Zy>R21LyIyyF09ow-3Gj(Z=z((7+X;QmoN zf$Nk5b~GO%+ux0Z&w243J9=H$ruR$PQN-4sD}XWYET64|9Wj@E+dbXspV_bTA!qHr zS9u-VE8xLS{m)g`MYcr->p=HE0+&Zin1}xCZtwm825Ua zCLswHH4GJdO1vxw(LV>)GN_?k51#eWe4w ziayWK*gS32CTgSGDS&dQV+=#3BLZtIn_1}b|u&;#$tsH8~#r0+O?y;D1@ z?j){qS+zXdexfBYp~imf6+9_ot@9K9N-%iKyJOQ!N^0Q$+W$n9dWt6)+xK< zoFg0Ra7&o#89TIB8-4kq%DEHUM6yFUDd5086asA}(&Au)xqVUVWxh{+#7p!f5@{d5 z75dr6*jBygjgx|ycGtr)YzU3cP=6fKm_s6jiF`gKJ^TY0_~Zne_Ku#E1R?(UWd&j4 zx}BGFfDHF)73#6%2mc_)v~r8rf91>$t7qg$0!PR7x}eUjm}< z?Lvrp{*YdWpLSENGmWvJ2x$KwM?i^rCG-~h#d|}SX*vkw;k_huN?cjbvE7X zbiIRY*MzHzoosLmr5|?IY#YmM`rix%H1D}Q4Xfr{bgbac`&t%WOTH*PggtpciXARD zl1E6Kb;_tS@zLNV-1oW^tcDz2)MCe^B1s;PG5zJts-prJ*%8wdn4!fml4pOJ0@o7y zs8k%a)Ct(E){it=ubG@KRd1~nuiE>#lit8TuXOi8*(SL+xC?(*n{TZ@BE;{QnZ)XG z-eqE-$Sdk z`16NNua3$z?y2C$8GMYM7j5N>H470X?A5n1?3M0{zBAB;cUI<;nt)J(?J^$4hJo*8 zi$HBMVZ(UQwfUzW;?BJv;Oq|n5&yEd`1(e{nyJhyX&(-)_ zNavpn?HEFWi&nGXXzNy9)lr5697C6Ip>EuR%TxEL8amD2Ck5)d?65Wo_i<8facoWH;$`q?i8+x!NUbpc#O~V9VAd%<$79(u(G5`3cs3rF^*L5@w!q|m;z(1;V1%N=RQ&6l3($_o zB-U`E8Kqyoe#y%e;5`s) zPbU(S_0`eW)WP0crdw}Y)lDtS)mYU}j^N?dhvkqiC#tUu0J-;b-+H-xr+%iPqVXq_UH~6>&ElR6f{10`$wS9J5N1w>N?mzXc##9nx3h>iY zqm9>eJPfS+I+RvE_lmgy(_;8RNfgldjcjV5GDd5R*=q>i#2>Jqwe&M-XJ1jhZd zZ+0U8Ol<9J6R+|QPJRrJ;O@mrtth>gjT(bMzK_H!U)|QK!atE_M)sFkctk59C`mFO zUlb{vYDK$sF%Aah`!9D2`l&u_JsMm1yu?T$duVS9off9Wquqd>&vW8Uk=9)1$Go2V zNh&Tv_U?1G7a?2*()1{|#-kaIRwR8&pnkPC*{c_|w%dCY_JHbVhOa#BE7yG>x(oMe z9k(mjvyAu0$k2hu&WlL!PPU>>ou?QJ%D+im)&hh4?>4@jBRu5?1h^+~RWFP@C` zlulE7%FBIQK@8z~m}B2`xu7TGOVbzz4e|HdD!F_c?dfL-d?J-jh-h|O&44=TuRF!8 zR4^}px843Y;ZlgT%nDNkh3)zw4Skk%_qZwnpQu%M-HB*SV0ImasMPWnX`7F+#~|C+ 
z+84HQs9ni!0o}meYEumyZ!Rrta9ZchWnzB(M+2s-zTo1n`uygMmAyf^OM{+^J;@tW z=^pJ*w7!;hZJDSUVTto=jL1_PLY{9tQw=whZzMeJGsny?v-)&`#73sxhw1lCrtRHD z_Hy37H>|*$DO#S#eN9&Wo#8?^>YkUDRon+7BB-w19U-2h0uE&o0`DW;0aW70II`-Z zq3q}eonk}eY$@T@Ig?$F!_k8s>l6yt7-5#CT%YtVHz5SyEFGu5KJ|`h7QdT*B0H-t z2DTW}t83171~yb`Z}A!izcVoL)bVX0$L^zOttmo|^grRT3csC>AGce8MildgCj?qvMk&>F9NWiN8r^S9nm$amXPOm| z-|w5ctbv)XSsf=T)wM|UKTG#VCIw$VK7>55`@HSK=3&#bTE`RA;4&=)Co}k0&OOq# zeUI?|Y07t#|MA=iRMA9yu&W40piBMpSaCvti;=|q1oZa+N7M2B=^ojW23iLhVCv&r4i{AB$Q@o7($SS5$PO4KmC)NEJ&p`;U*|DZeSW+Vn&s-F2K`3(xK z=#kEWXXsepVR9aNh@i>^d5CEir25d0__V_r*l`G_(`q_OU6iPV|wjgfSOL{FOFmM_D z6PiIm9%cTqX-%RyWhaif)%U-ReOFnyJonb(ZwE-CLxXC zciURHWb^$MBZ={pPxExyO+@#RZ8{%{Uygvq@y^ov1c=b{ahmJ0MjWFyr>6pnYyghm z-u%K)4%MEX1?TQ`m>L!zzo`u7lP?4%->4~aYK3)?C3x_ZacCr&MI|WU>U{uHf^Xil z$zC>me|vV_FAO~8SjqU6Y?a6R>pRj2->)#msR z-rBL~KT7{;cViK3(+vSn#pDubYj&*Gv!Ef%O*k%*9B%e` zbeeq|Eo>Vzs~A8F_!N4b&3nm3KAOU3N7&HRh2?J2Bv{rpbI@Z(ddVsG73XY1-7iN1 zOGbj@NXO$_z8e|a)|2FD-F5K;ep)4;;##~IhqHrzdAB2RqtSpn=n?TslI}Rs>enBQ zd${1?GpVz!=n0>2Nie}nhEA^9QMCt|o-Z`%V<}^2rWGN$49hGRv0Ud8-T;fQ=dll} zx~)orV`GMXj17xxiJn(FktU1sLYIrU8wV3-ootF1bC?KEn$;pVLVxj>q^cy(<>B?A zl4Da#VRhA#hKr|)h-B@=yt5onX>+=_sti0Wh`?%L<(~aU2!9D=VAAZ)udL}b$$8J@ z0s&U@&HO0Y8}*3D^@U189z}`fQY+%bsALE;GaaZk^h_tB&Lo|CV1pegVGgc%LPPIw zxA;r)HvAVR!ep$b*)oKb}<^7hv1w` z%ns44NS9r;Le5@17&>pnHf#J=MxgCb!S~E;PuhUyD*mAq9W|pYRf_-HMYUNSztDpG zshPfG`RLHMy8WO->Y{|_bE29<5|ngObxogiAMZTrC3Qa1EiY2yfVCk66P}8L?*)XE zdSOz3wCs-=AH_#>e-@|Fxt12&VO-z;==Qi(bK*x#Zf&G|XL#EQ(;XM3J$@Ou*DtH} zXSb)OZzY*f_^m(xd}v@|#rPTO9mVRxj2p{)B|g9!JiF2x7(NE*;Wnz^0O@wGbLsTE z#)^$Fz7EkZ_A{xZyL@K`#p3(wED0S>O(9V%{J<)ZfL86uvZ5_Jo^9L-aH}pdtvu-i zS?cgwL^XZK=gG-{$zeEAMM@9o4 zc8IqY^6A@z!XusVo)YrO^Zr(rk3WX|xXC``P)46U&6z_rTqOium$#WXoBLim7F^bH zhEwyF@^l9Rp^L|frt=O5_q-*0uV2U;i$w?gib5PzfZ~nz+z@}A)4+Pn zHtKY?0m3TpRYG}2y-ok?RiCGh#z{RK1bq>E#I*?VGUzR;N`Kv62XrZwDc-pPjI%@# zwgFJ^iabpW15>90IXydV4FsRcFGYAqyvBJktL(0Jhrs;Hkkdn*5~+ujaCbs3L{cLV 
zdn7y9)u@5_ZuZz7ezPMC2p1!y?QM!}@<)-$U7lPXs_s?mjTFz!3fve8(N1ESh){>- zQz|rlsR?rk)H#$6vr%=)J&(Ue7m9F9Vabwy4M;aPN|k|TNC-@qdax3Twe2m$sM3&k zB-Gp^P2*Ond_&D!^Hn{ldi?bCRCv}fv)69mRBtOxtNN{Cv-qtk9F(hl{M}$S`1H6h zmH(Mrzhh#Bm_~|yvfHTlgrQqi2ewFmqMO-r=Hhnpm7|0N%&PP4A|}uGkf-J z+Yy0(3|9+@JjJiWHhhEu;r@me<Dmrdbf7Rs;aSP$rc>wos}>I;+iP+{D`0Ag#kvUgr_>!fkwzr>ETVZ`<_6?A7&V@pp5| z=8YjF(4!@$s?J(*d&~CcgEWAYtntglUX^f-Bm=XX0o}?+lPX1FY2su#NeN}C(q-D$ zU$A!v#2M04%Gu!SI#a=O;rQzV5sGV=yQ0*Qho-4=El+9)U=Pj@8~Qj}ycyG%%v9L| zM{#LbUc}G2hcNM%hj?+HUF`CyE!Bmt#7jBR$iBMt`j~coP(qwd7AECp@H4J9^rs&! zSrp214x;MAg?pW^p`pM}z>pc+2+5a&Fsg|nkO>X0wt;~SSs$wPAmoG4H7Bx^pek2- z!ju6NX0Xf$p)hl5Huk{o7JnFrPqEv$hDPAqIt)P>7PHXE-7d1p(@@Gw%=$9O>C_^U zRaD{PceF57G0*09$0T4H9|IQa{Tu zB)-#&@}Eo^thPRYGw0NPhQ8)N31%Uj>J*QoGOvMWianx=9u+`c?d3>rg1B`_lLeTD zV2(I{hq9Tui(93|szbBseb&3Mgt`}Qy% zAfgb>u&dY_f~OzCs(oWs`9!Jja?9R9&UE!1 zlNizI#N+t7i^M{1H_`MmCzSBRkRY+uky>SU>eb|M1Sp43LcUcVW2v9XiZoXZyb6~l zkRW{~p*OWRPaQx^O;GIp{PPBZPXrOj9lI1*7_?3=7JDZZO%O;}VGjZ8{l$SHt;q5r zOB}_3mn^^8F^1M*J;`#~Ji*X{MZ37Nt*wobz_k1H2Qn?@nP^U_+LJddiNY9A8z2#y zlgzoc9U;@SnO2!79ks$vZ#q?2(+B%Ng#`R|?Fa2zNgD6iC1i92-t}+jOfdPx&5y0Z z?WK0~Rp@;`mr_TnPu9r5S#8ZfrjIaorcvfcjU3My)kVBW_>g*RUs(s(9-v%XO@$Ba zX&NDZp%B34A_x%bZa=C`0fMIABIF_bEzEuxD=HZ#Fs_&Wyx@B{V0#;9iRxdLj^4u^ z**F~MfuEIdbiBuS!mZ$~8-1eKuAx$@U!68f6OEr905L#@{l||yO|6Xz8Xw{DNY~>k zl3?Vg3dX<=r9X|(TypNIw{2oMh@Y!+#jX50^I%_bf^ZXFsU1_`aD81!As4b$GC-$l zz`%@5q_{3m6Sp&&H!Y-)OPZnFh6OzNNPRwZ{hZ@el{o$9Vz)mq)H=GJYi1TA1lm}q zbbQzrr6biEOtstLelSGV8;rFh?R&}P7d{pd)AqSyYm}#VZ`*b!C{x~l~<>Ul2vhL zNF-UdL=1UOPs43KHif;KG3)v<*~^%E4;!mXN+OT}juDk`%StnIB_X>Y+AJExRe{Tu zE!;H(GA4;+cMjo^fC#(Wp6l4KB-mrL)`~mnO#ng!`NhbX5`1~sZ)6n*iB*a(p-Q#M$FREz+sJL`6+YDr!1s6 z{QLgf0&{|{2iRd>p<};<1l9-CH%fj_$}WsFV8Q2eHgNm-(w@BK7Qm3 z3^irPmkoH$%){IE!ymoL>W`GK?ohONY zzruUo`bJ9z9IF`*8pGHe-5X>DYDMQs62Su_G4y%rS{0<}<;K#Rr{b}4RF;B&%#8}b ztvn6D-|tlk@~F^jt;JOLS7=-cIJt$wX&V*viCCiCG*4Ed4?o9nkV}8FJ{{mfmQhy0 
z@E1+)_b@!)(xlFZwh%(3CD&;?58J-Khr?FrKpB?C^TDrM8mkdX&Aa<|uy0uZez2UCJpfzE)M+YkyRz}5+B_G7$oxs<#0ZCq; zTzyv~K9VS=5BInZ`Zhcs*i)aa#Tkil#m0W{fGe|qy^$*VmXr1gyDUBU+5xn5!G=TUyO;e>=FN9%BhhrZ>4rW+-MK=j?_^#EJrZG1mG%6F z^LNnw_p9mUJyR*X0PlTieJei)Ko(F8Q5W)ao(rK~B`JiO`x?ui0YttOUW*qmrxZpj z+k^G$6L};HT@FvTWHd0-uY;ckb8VD^a$|m{TlhL@dN=l6B9j#td*5+pno#vAJA;-C z66(v9nUND^3>0YG<`(JN?f3HA?ZPvd62CY3p8h!Pr|_Z1$N7=O9w)hQjI-It7P+^X z>1~Q$>#gIS_pLuVY+P?}%7N)MedpX|2Sl{+*klS-H(j1Lqg~v)ki3af(@lluEN_@h zS}TE;Q6sBXvY*_Wi5HQyMX;!bD?+PPVLX;#>(1`zpLng{&_cJ=-Lwe*I3$vF@iH|_ zJKEzuHm}Dbx6!0nqlDpwM(7Zr*sX=c6|bQQ7^>?yFx*eWa_Z!GoNEw7Q<(ko=z+5R<#djS`1u%+PO`cNA^<=H(LrUoG!l^n-R<{1} zT666A9`!QoxDz*LZmKO{I!Nr2Q>yTwaB}$9m*It#_Kh+`Av>-lQJ z&iOxyQ>De04_~(Q14UPTM0JPEV#L2SIB2d%Y8E{q*gO8AaNfYV9%*h9wDZw53QYOyvJQ(LbR>Uu3pS7r(789qs zB=buz8si588xHHFrSRBRi57xKPEkM7JJy8!5}3O`7Ub!?iUhk3gUvB|0j`%g*zM#6 zU&t^Gaa&u4{qE7e0kXTW#!Ja`^E{L_MgRFgXK?3ClD(kP44V?QtXJmIWKeTi`xNFA zb7^x}Hc^Ke>+wx)epy?;uR5O;V9?3YHuiIP1?EU3Sp*Y)>veFjk)`4}U=6%3b=Uq9x z`jgKY@e{zAs$CTBA;+sTr^_{3cEdS?%Ur>ogdm(iGy@IRqiPZMqa(?LW>K^wo^V`< z#&k|Nnja6MfUArRej1_BaI@AuE> zVEONjxkF95=Jrt|xhHIMBc+@KyuzMuC z2d{3XNUm$0nm(wCyqn52qaLw2p37{4l=8D7wvyG`_Y(; z-ewBIGMKHT4*5~SMQFChW4cIM&JgJ7=@3P00K2}3f#5uZ)v5&0dmT8NM37>3hs7<+ z8$&d{|1^Ge2QQ#>$hbaM?(*1v7nHv)RIL zG7CF!?RpmYu9vl#JaW24_yfZiFI=eeV;ql&eGA_n@hS!dA+XA6#Mu0)`?gu+kDG`A z-gwn%9{bDD@k&dgYPGhFh&NCxpt*6f>5wk@TI--f%!_*)G2&o(I{@V+jGvc#gak`7 zZY?)o&y>q3af+G-w+`!Ohsu^v0>}%Mg*beqV#}e_zIRy~72D2asUQI+Q8xEEnQW#@ zrlKeH?n|sqCmRetXeOhKOZ@^X%kwEnD~F8rS{?RkjvfUiyW|Ssn1c&;mG#K;m&ZAF3a+^d|pJGuDq zo2cX)hZDMIL3*06MxsAFKn2sa)S}B67eBJVMJJR6EeXq923eo~L|UkI5`HWx=F=Dn~cW#Ta|#q^j-(Xb1PR?aF{vVl2)G|8bLYFZ(4(I8>zG?LR+v7 z0|K2fpBprgh5COMCo*s@3 zh~F!gNYyvM8~FJZvP&9XkO?Z@Gq6~GFw&aXxYsly*V`x1vd^bPsC=C-C(_c?MnmwG zvB-dV2lgyX6kO*tIn(>W4*W|0$z|Y);%8EGr`u%}!SpT8aG0y?y)#;l+!m+5fbm#L1oomVd_KD6|Uvdq;_CeXpAyn z*xLB&9w*V`f~pbgJUwe__%!E8Yc*s|Fr1L+`uDj{kk$~TeT{=U=1nd+@6#V0RIuyB(gO}x`hYS0Wk*&O(s|7jd6bK#*xr0p_L@Yg)y 
zep68JC21VzAGG)24sU1l{3Z3womb=ZaAH2ZRfF~*uEe{`u3?v7zuIO><8ej6rPQm8 zBeg&@mG)0h!YJ7l)aU#%I-PR%zWVj^KO16F?9_W@Y`Tf%&Q!ov;3aMOq`pFE^-17M z|BB|JN;Fp&RHHEi)0}5sa$#{l*!xXRoccq!CL4qxbi=J%w3|J+ne*+ z$}Sw45v@ckPe$IfPcmxPWo@Jz!KPy5;|?(-E?Jbx@3#bTk!!a&b#a$Crf<{6nB8lD-ix1u2n8UT;V*Mm-qobd z!(P5#@ulENO3bRv8`1#R+{EX4y%;N$JTpRs2w5=&BU@3KuR)oe#b&Y;ULrHjvfZ`b zz|(JV>IoazgGXoAr=rk7n8x(xG9sj1jd(Oli_Yx@h0SJP@J=@~R_f4!n&=SH7Vf}` znDTJTF8HlV-9yghHt!wsyR*GZJ)NIK556wK9YsFw^817L$6utXC+=JIE!oE&O#cOqaa#d98 zBP3kocypLB*lI_^Q&V}Kn9tWHRNE_(@4-}ef+YO)k!A%$V|dPhVmO9X`Z6KcAxANh z(U@#3J7}dAx`qeC>{at2i%9^}I1ye|<8Y2r)~sY8Qyp|t3G8G?P#yODHfuSUsR=4t zt2U&#W!^=tu{gdhC_Z(<#Nq9fDV1Pfc9LRvuJ3IxY7?Oxe(MgHF+E~`SD|s@PuPnT zZ%;SYr3BE9U^sG-t8<$hfE7!SmsVv}gSMjr!Bzx~e1JSEk!7H6IR|-KQ1|x|pjOj* zT3yM~w%z^m9g5|rYIXFXG!pKkQ#Ff2Ka)@xAtM?+4di9NdG5{=Zw5{8QNt z?+2q^4=cNoXPT7#%2B>zkfg=POiEd1&Z@o1FIV8wEco@-bc`x4W_{}g)I=hI?Xw1U zW@2DOjuUv&DsO|JSFy>nt}`Ectm4hY!$ z#9;DtO=(NQbav8w@bafio@ne^Z|)nCoBv{&c%p8qHO7UR?Nwc$1&UQ9*V7h9^Q|`b z0>F;6ar)u@xAY~@{3ui~F_l4>6tJ*&b-rG5y<2#tLf{j#ohVxD^MHHc2T2IB#9+Jx zv(#vx<13vmu5hvo0UCE5kEyJ;??PB25v87F6PnZ#zdZIWF}e(PBcA#g$bTE8J27Jg z>zba6C5B7P1V2T#X=o$BIUhn`0r^1&A9i2_TDl1uj50%&4J6)NcJp6J^8p}uJY~nu z5IpgW)j|N;l{m}P3EQ&K6GG!Zs0;7n%wREfS2~qzn`s|2EG3<^y}4R@FY+- z%zB|0ketF}a$9SDAt}Z|k;^j4<4GULWYvLm8K}=s2NSQ&?0m~RJ*t!}ZS=PBbx)G^ zP|UFu1mE+UD9t%o&~#%-(0wp8`;TWt2gzPwSvl7~?3K>&6tCVAr^(gep-k&zH9M&7yq`KSHLJNRop z)|lx@PR^1x4k3XPiMUC@dK>5+9d3ZscQ`BGJa-5jZ|)idG3aLGkg_xX;#$ATWw0e$gka(-Z+0J ztTU$Kjdwcls@p0^aAYfsFNto<>GU5D=Ql7Ul%5OXMl(y|(mj|jlus&-P_GZ9x2>IT zRK%0nl$P%28RX^YfL27ApJnRj7tE75_Ez<|R(lZPI5Gg6OGk9B=-?wOTGaD6IN~U~ z2W$bd#LYhf(_NTvsxp~nOY-%IuIj&kQ2&<1PkwdJ>DGy6QfvK-U7;{IB-tz|^m-W5 zuy?bHTc&fOIhG?LHvI8=-SNKAs-xF)LHApHAvBYF48L6)0Z6@ytF?#qw|!{X>*3WW zpZ(qHQyruMsrzw1L}KCbmBmRt-AgrnD`pH_>Tg=Lm!p*x`r}E`_{;_0R&IVw1^Xy< zP4I}SfsUfB{Q9F=O8a_OK8PCgVH%%VQs&~bFq0;f;~T!Vz)bpn%5pPR6c!hR9|F%* zSUD7GNtCTHikkyd3q9|9DSIzGos1=UtGim<7NLO38pmMLn6&3C~O-J(+l8Kgu2OrHk7umzgsq|g+v160A;Gq@$*70lSoL@cJ+Zx+@J>Wt? 
ze9s2iAP>BaW?77fs0uvQ`(R%-{O8^608MzI+1zhY`HA#9vx?T2BFDrl?{5N4w(jKozQ{`ekzvk4)iv=0{ zUhvQeo;*?e&AH9ZuBt`kR?j4(@U#Bw7im&5)4IoRY}jA(PX8=frS&UK`|{FtUd6Z* z?#t#VWD+`(sz{cdtk3tz`x>!A8Dsit^*EfYl&X4N>x9j0O;8Bu%-p_HQpkd=ESoZV z%4naZ`~AW+v}k>JZ^DSCfM&O=hMah~ALk{Su<#yPt5oz{6^S1OoUy-;m^Ezu-XsS@HF!w3(n%i!?f$#gw z*wr6#b7;7)Ml%yps*E4Po%$VX^{@>ckEz!Vun`w{SvhBHROQCz+3t_c&goHlkCzrV zf9`Z#)c)E6wMz&OL*NupHhKpF^nJoFfT86G`NP;V?!itvTSuR)mV_xBiW>w`Uu`KO zsCIRc#-faZAHs+s{uS-iXnvvzR4k5p=JF!mvAhHpL6D+)e=WoiHf01W;<}pTl)KR0 zB@lMlqvr0G)bR?<^GsHxxx|chIV)F67BH!9-=( z85hOJQ#i;L%!oDcZOJX$(gNXcF5QrkWXTj!k5>L|h0kDo?0pbGQ^{vNK568&I~Ic| zNkjS{-M1dzS%AFGmx6c6hTNeVIQ*~hqJ+1Y5XX5Ewk@}a7JJ{*iG=Bv;>J1g^J4m) z^K4hQ`|Wv41u<>hYrT*%9ee!& zN;6M;@u!FPeVi?w^ogeI(Vv1!I4H(%AkJ{38;PhLtq?|s= zOGJpF*9tizyrRW^;bLiQ{;bFzRNxYhzDuxI9*k_{J*QP{@ZR`pnPfNR7GB}nnkhg1 zm;+R|+k7T2_(f2z*6w=rBp}SJ596lW5gU&T+&tkC+eE{v3oceS^Oc~odbgbErtb;& zCad-r7k3+;pz>CF)KApacxAW|NCJ74|LMND9j7M2;s=8TH1F__z?Jh^?|I6G)L_bR z^IO<1I`p*1`$=(YF76CW#Gi{(u*-nv!I*dp4Bd?#P}=`6?*nDjmtFDCUc6#+YSfcI zH8h!AoHIgA9d=sw4d2UXU4g#R$%xpW{y;ee-`2()1EZ%9hnWu@+z(!HrMT|2&_9K= zw|mt#^#|CdVqq^~X&aj_Tx})n##0pMb=}5OdaOnpjsFg&szmzD=2T={{s-9PxU_f;d*^CF|##RV0$)4yX^YPa{)1L8BW4c%`Hv|@Rx8dp_cwl)fdnlP7dv2R`)FXb4~aW z4LmE7LgP}VC?}nL>KE701uwb25*v}Axft-&ObC{A+M{8WrEP6o4O@@=W?_sGqNHmN zXOWr=icHyUkf@&?yuFk?jWM3x5wlQ&*wJ+Dvbhfelv({byq7TVLWZY;#gAzVgPMQU zw)Vysi+THb4T&}LJ4ZVX6S^U;udAwyXcj&;CtaL zI*qPBp&k0m&8x~NdNutA}G3I%&@Qa_)y<0@FjV%H|s<`#X04} z+qeBBj`=1xjjbP`rUKARG-@@;u^7zZrQYG@a>4Wza_kMFszp3A*;{YcdEMRA8lgJL zA)(Pd2ceq!0+KnSlog$5e>bT?p|UjoJ*U;26dscY+lT)5R)TAHZc4kk-beLWSx`(A zRVezLY9KbjJ|;CK^<7l1b1njih89HXT^x13$RCB@_ReE>U;RH({ePTx>8$?9{)&A_ z-P6C{lVnfAg#M$;_^dpVBJrK!l$hS(0nkg+L; zsAK4EgX%m*f|Cq7HSMR;ei^TdNd8_OTwr*6$JnQ106uXq8Jeu>6eLbgh=@?z%gX-w zxIXI2F7!kv?pe_HG2mh&Q_U)M7p0)qt_Ik^t1heK5rq6K$<`KBzBPB{FE;%e(Oku6 zId+ygm*8S2q~@sNIwkNtDKd`l(rLf$mR8u@Ommx*1=b2gHmuMd3 z7pos@P$eTR{kd@P2yqZBVbd4S*7v!W@$L2!D3ZJb&g-EhhG@~MQi`h#zePdwh_@ja0ROzpz^ 
zt~9h#ng$Gs8$3Q5cR1#z@;*8MzwSfh-=lt)^|E9HfQYjD&L*BvzqPUk43%G#)^{gK z5^CYbK~wAU!@G%4lvwvkRKP7s_&guwL@mdCxT0-(L7jCAa$B&~Ht;Mnbo?RPe3?{3 z^EO!s7~iyQXgQ5oxb>N_t(jBcFu&7`hLe!cZqVQzGUoNK>UyWWrqfH}Eone5V-Hhk zjW0(Nc4%&vXm-l`*ehs^ofGU&zlYoWFsj%MG&?I_iJI+*60=eDf{uEq3BPv8L668^ zXntTS$E4Oa@(IQSrH{pW&P>Oi8{14B9`Do##<%W7wb=Ks=B94Y!oVC$#Fa~KjvzFC zZ)v>mHycN>dR10NH(DF%iXli?r7*P*c>>&&u48_^&SimaX|*JdXf7A)HUl49M!V8L z;f&YA=ww`^0+C!CH65v@ulqyCQ}z41wv8g{ zltFHeo;U2}HZ(ENxFd^7H6KRaMUHQ{YCksr0ypoAzc{%%1pCeoik%rCjuIRZf%y0f4QJS3d&OUmM=7BwQ0kUNFNM*H-$d~|1Q zIdgViif~{#zc0GHwI|Oy-2}Bn!zkPcxenVVigfPOxe{dl$i)f6joG4Db_py`*(C4* zNq1@K4tJ-S^u*3h?@pzg(Y+%R(-Y)p9K5wTpu>!=1z)KO*ar(~^H_>$i+9@{xKg?p zb4Pl9UfOz3GYFdGw>g`wg4#9T=rtTB7wme@I<5G7?E|vqT|4pQCsXE+pL@Ym7Hr@L z_k(`t1c_3koI7wra;d=$-v7A!)1TaR^IOJh53!x89ta8w+8L3jDRQc|oo}47MK-A6 zwL|vvE4BAU%(%{bz%H~YIBFL^GV&B&A`K);-2SrR#8Y>n8xsb=$h**uFY=JYCA-?C zM+m4u^V@}sLARZ(aGy{Z*Zg#rN<)8&oxG@ZG?nKticR>=;u@ySLKpn$XSKzg(kD5A z9(XQU>HN^QWON}l@>4~G-zUdSIhyxAmu+gDnZ;ayESqr0^3(bA*>*Gn?as0QD^Jgy zc-0JgsjJdfCIGWj5*pB6VYr3YPx17%EPkRF;`BAm+u5pNzu4bj^y@8;Xr?#aP}bh^ z^IC@}Rl;O85-}_eQ)I8}EZN@oIcsq3BGRrDn4t_A_Tn(K@M52vsF^!6vVpsP;*2C{ zTDVVYkFhnY{#JYG~)EkD~^SX5NOy*Qde0u8#*TZ_ ze!%|{eEK0}W463EsriY_djKl?8^d7?+Q=$&xdq$F&)R0ct~cY$vXIgJ(BiJhxV+C! 
z-MCqrJx)&<9O4z{Hwb{oO$XP@Hq~}>76tp>t#B$!`FuWe=U%&4Eun;2rnv z45&Z?c2y2bpGYoUuJ7`MikmM#=-&WoO501Al@s> z+pRar#FFe>ydZ!H7H2r6AMy*#K61Dm>721IZH>4&l=j4x*pOFrNp8On)6ylx;u$x| zPD)6@>=M^fDss;3!ZIiuuue4P?LOYFJ5NyO#D>t(WluJmEz|GX_@c^3?9Jf2l%Um0|uo?$>z;)2Z&5yeB z=niC%A}q*?D)&+OM|PS#F)2(~mV@Z%K63m>#T*ySvA7C-8t-~E;y zRgP>eN#1Rvk^=jgcR4JLGlkr)*JplOmgt=Y(vUZvR+XGYvLTbLYmBeAw0)F~JvVcj zUth`b%(AL25jGdJ4IIR@;=;0-?0AC zRf?Y<93`~Oo>u`em;(X=44XKT8}Gz`(xdxfgjZqwHA;%*@U?yL%pLZp1i3n)CwEgYFSK+?&=xV8PJl!jBP?v_4ZX#b+~;% zx@t;q4KE=w%wWJgI$`YBN!SM6M(vlbrFKXG7Cq5PUXj=gQ({?Y(-D`}%LzX1l14Fm zZTxw=BRjn8{2Ib(Z~bXu&0g;j-axN$@kYDsBG3VzK4O^ubWxgO_Xcfzrs+%~!Zz(N z7-fGgh#MZfh|mc&o>|*6wj1(Z=h@8&z;=obGk>fBmMuZdTx7PtVoY1r-@}q{Sgz{F zNjj=u=NpKu!S&kkM}Qho&+ztQA00k$1+EX&Or1Zq#^#>~dSW*}7k9fU=BKzQ6soXU zxA8{j^Ge4WOpnyJlXN`Q_i%W@tM$T+jF*H1eu{Vf;B%_ukMpAEpU>dw(DQCDS-8To zSky|ls@ZcNu%`j{UUyV_7I?uo*`?)7v;1#<$&NrEX1`9WFv(aqlXJ4S}{45(du(bmWb@guP;#Q-fl?^1?AvcXI zQ7ZFUdc0~fp4{o<_l>H{e(J;N!nUosrd|8Izv_-^;&VPE2|br;V>16v2u{}M=MOQ* z4XarcxL<8Y1?7LI@6d;7r(g4A3MyYSnX9aCOQ6MMKOM-uMUiAbZ9~CtSM358{jg0l z^^kT;UBAYMzE|DX{rdUA$MZ8mnYHyooph{ddkEM!=Zo_^vlm>70)VV%Q z4$)k2vx_#KHar`nLJ1%F>un)E|c)8TezYnaM?ObWGt4K>`n@}>)~afx7!xd6s+J>90f>*>76aItPZ5E zA@*AqnBD}4PUt#~pe}U&;?Z|W@FU6- z+ZlJ&aTU6gbL9m15*UYJ0Tq*Y5}8oEXX><07_U3;wcj-7b|t6%)rheb7(R2@O|yU} z>(dIEaE90C7I_%{Oql3og&fmS){` zm#EZ<3*652P?f7_&iU|04@1W7#$q0rix|Oj<8L6n(~nfc_xt7Lfds#muf#Vf-VfS& zM@-wkBH0^x4tWBxTSKE&YU@;H{~FCF`H37JgZ+a;od%iA1SwwB%_{2&&m&$D03wBQRLX>t(a-*w5^JMAk4S+M>zmqlJclmx zG9f)4$~DFl9w5v4#$pdDMw)fPG2NJpJa=K-#b5v?3Hz~a<7sfoHub)Xvau0NX<)myX?C-oy5xd6>bJwy z@D3EzQ|s;~{05Az1kazUFFvN%KtoCveiVZr`852t0i))cVV$$*E8cg8usKz~lKrEA zjJ*YlUH`j_oH_1;YO=k+X%(Op=G> zKdLyXD#{gC-JPEBY2z@WK<7WVvqoNcHjOuN^T&c8@ycW{Rp`cgryge`G30N;B(ltn z{NiV6?9DpR_+2#A9dd4UzYCwAmmyO9VpXJ{p{P(dW3o^i>Nc7DgqW*uE z|Mv|^v$|WKR8GL{PaSf{LpoFV$Zn$S<06cqNG; zl@9)}k^di3NF%Q~cX&hi=aRhp{z!(M?&%5P`t70q?;##tUF?ccL2`OuXgdb!C!e#O{eABrn;T;aG#=+s&W398CwEx|4M#JOEmS7uSa2@@b3f*zk*OIb7+~CTLvg=#Q 
z=;XA^@u+;Tv8839CX~|n@0b6+OS;j+a~~%Ylhy^*b+7()YS(hU8OCx}JWsk!=IY<0 zmej^uq!E8WjcL>ml7~vH zHOL~ANRLzYjIP$-Y8U^WPX99WuZyX>h5I3A^+G=DdXLkd#r`vt4jhTu^&yD8VmvFE z8!>=g!1&(R$N=*sq_K$Ha{+W#^l-}&@)_lS)YC)`Rn!)pRj0IdVQAfJus=m4abM); zxREMCW)*bad)OoMP)q4A2W{}9T*cHPV~KnEu{x zIV}e%eU~KtlQ_5&HBwcUgi&q71_^T*#0B0L{E?^sl0a;V8V0!= zN+Js!{0pl53uYFL{czCcyD!n`Vo8;J@qgD3AxhfKMjjj7+kdzpE{ocs=9kN%Ee>4C znwqk?p$<3PD$GyXoIM{rivN0gz%F)wt@{$49dalaYeq?d${F#1C3 zqlX>f8T`|IZOMwkR8TESG6(-gpK9|Iltw^lDM@JNcjn!H3y=B$NsNx9yl<)d6uK~k z#5zD6)|7!vFP<7aOHeLL=BnlU1f#U%-gp_9{-l>&`mgMfe0n=6a64R)%JGO>RlnKc zC6`Ab(eQEJMyk+T#lUgEf9iPrBQWP={q{GL_2FW43pnNJ%TC1+>jEwd1Q^5d) zmn_ad|M1U&h?xI6qHEntE*_`}3>yL{zW<^Y?AGg?zwAUv>eEU5CK zP`62;MV(mn);~1+=RkzS-@Bxz5kLyRrrW-l=iCOUz(KS#73p-maEroO{b{GdO67O# z|JoRki0$cTj(beF0MA#9uw@?PVAZv`9TC>J-J3j`K}XJBuM9?x|`h^{JL^66}{&j6{eg}Cl3 zB;BIN1^+(#@YBz}J1HK`Aw8Ny7wvv782#S=ZwEKp3hvUhF+bg#7%GAm_rTHAlbla} zyZ9O37k|d2wl6>B)c5`~H;(~x!P%)7wmfzF-*fm=pNG)Ga+cCwP3Rbz!U1u{n!pK7 zg~sQ_Ic9Fw4Yx~|ue&t2j4AhHEK&uQEPA zWAaSzmU%w6q`aXS@)pO3RtDjCmx8UvuQiP>`z6lY_H3K03GQaTMp?%`%*4A*B>>8OD+gbFBRtEnT1J zm>2(gdg{fS$(7$u?<&F)(rObP8Gm0C2~1LNHLaHW2e0v)pOaj>X>yp8zgb}~tT^j; zY|!V?&@hizNZF}rVt%?`Fz75i>%bZQ%uF@;H|`Wo|75HD`sTb`Yv*-SFIXQAydVQF zaCR@avvS_HGs02Twr@{C+G@yE+&np!SxUy=HC_Wl?Yq(2g%GeOu=|Na<}74)ptCRZ z0y~1`j{`5y!HhgaWQ+zFJnN1I7`*X7FVdQ&MBb@0P{qT A!T +++++ + +link:introduction.html[Introducing Kudu]:: + Get familiar with what sets Kudu apart. + +link:release_notes.html[Kudu Beta Release Notes]:: + Find out what to expect in Kudu public beta releases, as well as known issues, workarounds, + and limitations. + +link:quickstart.html[Getting Started With Kudu]:: + Deploy a simple proof-of-concept Kudu cluster to try it out for yourself. + +link:installation.html[Installation Guide]:: + Read about all the different options for installing Kudu. + +link:configuration.html[Configuring Kudu]:: + Find out how to customize your Kudu cluster. 
+ +link:kudu_impala_integration.html[Using Kudu with Apache Impala (incubating)]:: + Learn about using Impala to create, query, and update your Kudu tables. + +link:administration.html[Administering Kudu]:: + Keep Kudu running smoothly. + +link:troubleshooting.html[Troubleshooting Kudu]:: + Find guidelines for solving problems with your Kudu cluster. + +link:developing.html[Developing Applications With Kudu]:: + Get information about developing with the Kudu APIs and links to working example code. + +link:schema_design.html[Kudu Schema Design]:: + Learn about designing Kudu table schemas. + +link:transaction_semantics.html[Kudu Transaction Semantics]:: + Information about transaction semantics in Kudu. + +link:contributing.html[Contributing to Kudu]:: + Get involved in the Kudu community. + +link:style_guide.html[Kudu Documentation Style Guide]:: + Get familiar with the guidelines for documentation contributions to the Kudu project. + +link:configuration_reference.html[Kudu Configuration Reference]:: + Find out about individual Kudu configuration options. + +++++ + +++++ diff --git a/docs/installation.adoc b/docs/installation.adoc new file mode 100644 index 000000000000..a695bcd7d5e3 --- /dev/null +++ b/docs/installation.adoc @@ -0,0 +1,691 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +[[installation]] += Installing Apache Kudu (incubating) + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +You can deploy Kudu on a cluster using packages or you can build Kudu +from source. To run Kudu without installing anything, use the link:quickstart.html#quickstart_vm[Kudu Quickstart VM]. + +NOTE: Kudu is currently easier to install and manage with link:http://www.cloudera.com/content/www/en-us/products/cloudera-manager.html[Cloudera Manager], +version 5.4.7 or newer. If you use Cloudera Manager, see also Cloudera's +link:http://www.cloudera.com/content/www/en-us/documentation/betas/kudu/latest/topics/kudu_installation.html[Kudu documentation]. + +.Upgrading Kudu +To upgrade Kudu from a previous version, see <>. + +== Prerequisites and Requirements +.Hardware +- A host to run the Kudu master. +- One or more hosts to run Kudu tablet servers. When using replication, a minimum of +three tablet servers is necessary. + +.Operating System Requirements +Linux:: + - RHEL 6, RHEL 7, Ubuntu 14.04 (Trusty), or SLES 12. + - A kernel and fileystem that support _hole punching_. Hole punching is the use of the + `fallocate(2)` system call with the `FALLOC_FL_PUNCH_HOLE` option set. See + link:troubleshooting.html#req_hole_punching[troubleshooting hole punching] for more + information. + - ntp. +OS X:: + - OS X 10.10 Yosemite or OS X 10.11 El Capitan. + - Prebuilt OS X packages are not provided. +Windows:: + - Microsoft Windows is unsupported. + +.Storage +- If solid state storage is available, storing Kudu WALs on such high-performance +media may significantly improve latency when Kudu is configured for its highest +durability levels. + +.Management +- If you use Cloudera Manager and CDH, Cloudera Manager 5.4.3 or newer is required. 
+Cloudera Manager 5.4.7 and newer provide better monitoring and administration options. + +[[install_packages]] +== Install Using Packages +You can install Kudu using packages managed by the operating system. + +[[kudu_package_locations]] +.Kudu Package Locations +[cols=">s,<,<",options="header"] +|=== +| OS | Repository | Individual Packages +| RHEL | link:http://archive.cloudera.com/beta/kudu/redhat/6/x86_64/kudu/cloudera-kudu.repo[RHEL 6] | link:http://archive.cloudera.com/beta/kudu/redhat/6/x86_64/kudu/0.5.0/RPMS/x86_64/[RHEL 6] +| Ubuntu | link:http://archive.cloudera.com/beta/kudu/ubuntu/trusty/amd64/kudu/cloudera.list[Trusty] | http://archive.cloudera.com/beta/kudu/ubuntu/trusty/amd64/kudu/pool/contrib/k/kudu/[Trusty] +|=== + +NOTE: For later versions of Ubuntu, the Ubuntu Trusty packages are reported to install, though they have not been extensively tested. + +NOTE: Packages are not yet provided for SLES. + +=== Install On RHEL Hosts + +. Download and configure the Kudu repositories for your operating system, or manually +download individual RPMs, using the appropriate link from <>. + +. If using a Yum repository, use the following commands to install Kudu packages on +each host. ++ +---- +sudo yum install kudu # Base Kudu files +sudo yum install kudu-master # Kudu master init.d service script and default configuration +sudo yum install kudu-tserver # Kudu tablet server init.d service script and default configuration +sudo yum install kudu-client0 # Kudu C++ client shared library +sudo yum install kudu-client-devel # Kudu C++ client SDK +---- + +. To manually install the Kudu RPMs, first download them, then use the command +`sudo rpm -ivh ` and install the +`kudu-master` and `kudu-tserver` packages on the appropriate hosts. These packages +provide the operating system commands to start and stop Kudu. + +=== Install On Ubuntu or Debian Hosts + +. If using an Ubuntu or Debian repository, use the following commands to install Kudu +packages on each host. 
++ +---- +sudo apt-get install kudu # Base Kudu files +sudo apt-get install kudu-master # Service scripts for managing kudu-master +sudo apt-get install kudu-tserver # Service scripts for managing kudu-tserver +sudo apt-get install libkuduclient0 # Kudu C++ client shared library +sudo apt-get install libkuduclient-dev # Kudu C++ client SDK +---- + +. To manually install individual DEBs, first download them, then use the command +`sudo dpkg -i ` to install them. + +=== Verify the Installation + +// tag::verify_install[] +. Verify that services are running using one of the following methods: + - Examine the output of the `ps` command on servers to verify one or both of `kudu-master` + or `kudu-tserver` processes is running. + - Access the Master or Tablet Server web UI by opening `\http://<_host_name_>:8051/` + for masters + or `\http://<_host_name_>:8050/` for tablet servers. +. If Kudu isn't running, have a look at the log files in '/var/log/kudu', and if there's a file + ending with '.FATAL' then it means Kudu wasn't able to start. + - If the error is 'Error during hole punch test', it might be a problem + link:troubleshooting.html#req_hole_punching[with your OS]. + - If the error is 'Couldn't get the current time', it's a + link:troubleshooting.html#ntp[problem with ntp]. + - If it's something else that doesn't seem obvious or if you've tried the above solutions without + luck, you can ask for help on the + link:https://groups.google.com/forum/#!forum/kudu-user[user mailing list]. + +// end::verify_install[] + +[[required_config_without_cm]] +=== Required Configuration + +Additional configuration steps are required on each host before you can start Kudu services. + +. The packages create a `kudu-conf` entry in the operating system's alternatives database, +and they ship the built-in `conf.dist` alternative. 
To adjust your configuration, +you can either edit the files in `/etc/kudu/conf/` directly, or create a new alternative +using the operating system utilities, make sure it is the link pointed to by `/etc/kudu/conf/`, +and create custom configuration files there. Some parts of the configuration are configured +in `/etc/default/kudu-master` and `/etc/default/kudu-tserver` files as well. You +should include or duplicate these configuration options if you create custom configuration files. ++ +Review the configuration, including the default WAL and data directory locations, +and adjust them according to your requirements. + +// tag::start_stop[] +. Start Kudu services using the following commands: ++ +[source,bash] +---- +$ sudo service kudu-master start +$ sudo service kudu-tserver start +---- + +. To stop Kudu services, use the following commands: ++ +[source,bash] +---- +$ sudo service kudu-master stop +$ sudo service kudu-tserver stop +---- +// end::start_stop[] + +. Configure the Kudu services to start automatically when the server starts, by adding +them to the default runlevel. ++ +[source,bash] +---- +$ sudo chkconfig kudu-master on # RHEL / CentOS +$ sudo chkconfig kudu-tserver on # RHEL / CentOS + +$ sudo update-rc.d kudu-master defaults # Debian / Ubuntu +$ sudo update-rc.d kudu-tserver defaults # Debian / Ubuntu +---- + +. For additional configuration of Kudu services, see link:configuration.html[Configuring +Kudu]. + +== Build From Source +If installing Kudu using parcels or packages does not provide the flexibility you +need, you can build Kudu from source. You can build from source on any supported operating system. + +[WARNING] +.Known Build Issues +==== +* It is not possible to build Kudu on Microsoft Windows. +* A C++11 capable compiler (GCC 4.8+) is required. +==== + +[[rhel_from_source]] +=== RHEL or CentOS +RHEL or CentOS 6.6 or later is required to build Kudu from source. 
To build +on a version older than 7.0, the Red Hat Developer Toolset must be installed +(in order to have access to a C++11 capable compiler). + +. Install the prerequisite libraries, if they are not installed. ++ +---- +$ sudo yum install gcc gcc-c++ autoconf automake libtool \ + boost-static boost-devel cyrus-sasl-devel \ + cyrus-sasl-plain patch pkgconfig make rsync vim-common gdb +---- + +. If building on RHEL or CentOS older than 7.0, install the Red Hat Developer +Toolset. ++ +---- +$ DTLS_RPM=rhscl-devtoolset-3-epel-6-x86_64.noarch.rpm +$ DTLS_RPM_URL=https://www.softwarecollections.org/en/scls/rhscl/devtoolset-3/epel-6-x86_64/download/${DTLS_RPM} +$ wget ${DTLS_RPM_URL} -O ${DTLS_RPM} +$ sudo yum install -y scl-utils ${DTLS_RPM} +$ sudo yum install -y devtoolset-3-toolchain +---- + +. Optional: Install the `asciidoctor` gem if you plan to build documentation. ++ +---- +$ sudo gem install asciidoctor +---- + +. Clone the Git repository and change to the new `kudu` directory. ++ +[source,bash] +---- +$ git clone https://github.com/cloudera/kudu +$ cd kudu +---- + +. Build any missing third-party requirements using the `build-if-necessary.sh` script. ++ +[source,bash] +---- +$ build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh +---- + +. Build Kudu, using the utilities installed in the previous step. Choose a build +directory for the intermediate output, which can be anywhere in your filesystem +except for the `kudu` directory itself. ++ +[source,bash] +---- +mkdir -p build/release +cd build/release +../../build-support/enable_devtoolset.sh \ + ../../thirdparty/installed/bin/cmake \ + -DCMAKE_BUILD_TYPE=release \ + ../.. +make -j4 +---- + +. Optional: Install Kudu binaries, libraries, and headers. +If you do not specify an installation directory through the `DESTDIR` +environment variable, `/usr/local/` is the default. ++ +[source,bash] +---- +sudo make DESTDIR=/opt/kudu install +---- + +. Optional: Build the documentation. 
NOTE: This command builds local documentation that +is not appropriate for uploading to the Kudu website. ++ +---- +$ make docs +---- + +.RHEL / CentOS Build Script +==== +This script provides an overview of the procedure to build Kudu on a +newly-installed RHEL or CentOS host, and can be used as the basis for an +automated deployment scenario. It skips the steps marked *Optional* above. + +[source,bash] +---- +#!/bin/bash + +sudo yum -y install gcc gcc-c++ autoconf automake libtool \ + boost-static boost-devel cyrus-sasl-devel \ + cyrus-sasl-plain patch pkgconfig make rsync vim-common gdb +DTLS_RPM=rhscl-devtoolset-3-epel-6-x86_64.noarch.rpm +DTLS_RPM_URL=https://www.softwarecollections.org/en/scls/rhscl/devtoolset-3/epel-6-x86_64/download/${DTLS_RPM} +wget ${DTLS_RPM_URL} -O ${DTLS_RPM} +sudo yum install -y scl-utils ${DTLS_RPM} +sudo yum install -y devtoolset-3-toolchain +cd kudu +build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh +mkdir -p build/release +cd build/release +../../build-support/enable_devtoolset.sh \ + ../../thirdparty/installed/bin/cmake \ + -DCMAKE_BUILD_TYPE=release \ + ../.. +make -j4 +---- +==== + +[[ubuntu_from_source]] +=== Ubuntu or Debian + +. Install the prerequisite libraries, if they are not installed. ++ +---- +$ sudo apt-get install git autoconf automake libboost-thread-dev \ + libboost-system-dev curl gcc g++ libsasl2-dev libsasl2-modules \ + libtool ntp patch pkg-config make rsync unzip vim-common gdb python +---- + +. Optional: Install the `asciidoctor` gem and xsltproc if you plan to build documentation. ++ +---- +$ sudo apt-get install asciidoctor xsltproc +---- + +. Clone the Git repository and change to the new `kudu` directory. ++ +[source,bash] +---- +$ git clone https://github.com/cloudera/kudu +$ cd kudu +---- + +. Build any missing third-party requirements using the `build-if-necessary.sh` script. ++ +[source,bash] +---- +$ thirdparty/build-if-necessary.sh +---- + +. 
Build Kudu, using the utilities installed in the previous step. Choose a build +directory for the intermediate output, which can be anywhere in your filesystem +except for the `kudu` directory itself. ++ +[source,bash] +---- +mkdir -p build/release +cd build/release +../../thirdparty/installed/bin/cmake -DCMAKE_BUILD_TYPE=release ../.. +make -j4 +---- + +. Optional: Install Kudu binaries, libraries, and headers. +If you do not specify an installation directory through the `DESTDIR` +environment variable, `/usr/local/` is the default. ++ +[source,bash] +---- +sudo make DESTDIR=/opt/kudu install +---- + +. Optional: Build the documentation. NOTE: This command builds local documentation that +is not appropriate for uploading to the Kudu website. ++ +---- +$ make docs +---- + +.Ubuntu / Debian Build Script +==== +This script provides an overview of the procedure to build Kudu on Ubuntu, and +can be used as the basis for an automated deployment scenario. It skips +the steps marked *Optional* above. + +[source,bash] +---- +#!/bin/bash + +sudo apt-get -y install git autoconf automake libboost-thread-dev \ + libboost-system-dev curl gcc g++ libsasl2-dev libsasl2-modules \ + libtool ntp patch pkg-config make rsync unzip vim-common gdb python +git clone https://github.com/cloudera/kudu +cd kudu +thirdparty/build-if-necessary.sh +mkdir -p build/release +cd build/release +../../thirdparty/installed/bin/cmake \ + -DCMAKE_BUILD_TYPE=release \ + ../.. +make -j4 +---- +==== + +[[sles_from_source]] +=== SUSE Linux Enterprise Server +Building Kudu on SLES requires building Boost from source, since SLES does not +have system packages containing Boost static libraries. Boost may be built +anywhere, provided that the `BOOST_ROOT` environment variable is set to the +location while invoking CMake to build Kudu. In the instructions below, Boost is +built alongside Kudu. + +. Install the prerequisite libraries, if they are not installed. 
++ +---- +$ sudo zypper install autoconf automake curl cyrus-sasl-devel gcc gcc-c++ \ + gdb git libtool make ntp patch pkg-config python rsync unzip vim +---- + +. Install Boost. ++ +---- +wget https://downloads.sourceforge.net/project/boost/boost/1.59.0/boost_1_59_0.tar.gz +tar xzf boost_1_59_0.tar.gz +pushd boost_1_59_0 +./bootstrap.sh +./b2 --with-system --with-threads +export BOOST_ROOT=$(pwd) +popd +---- + +. Clone the Git repository and change to the new `kudu` directory. ++ +[source,bash] +---- +$ git clone https://github.com/cloudera/kudu +$ cd kudu +---- + +. Build any missing third-party requirements using the `build-if-necessary.sh` script. ++ +[source,bash] +---- +$ thirdparty/build-if-necessary.sh +---- + +. Build Kudu, using the utilities installed in the previous step. Choose a build +directory for the intermediate output, which can be anywhere in your filesystem +except for the `kudu` directory itself. ++ +[source,bash] +---- +mkdir -p build/release +cd build/release +BOOST_ROOT=../../boost_1_59_0 \ + ../../thirdparty/installed/bin/cmake \ + -DCMAKE_BUILD_TYPE=release \ + ../.. +make -j4 +---- + +. Optional: Install Kudu binaries, libraries, and headers. +If you do not specify an installation directory through the `DESTDIR` +environment variable, `/usr/local/` is the default. ++ +[source,bash] +---- +sudo make DESTDIR=/opt/kudu install +---- + +.SLES Build Script +==== +This script provides an overview of the procedure to build Kudu on SLES, and +can be used as the basis for an automated deployment scenario. It skips +the steps marked *Optional* above. 
+ +[source,bash] +---- +#!/bin/bash + +sudo zypper install autoconf automake curl cyrus-sasl-devel gcc gcc-c++ \ + gdb git libtool make ntp patch pkg-config python rsync unzip vim +wget https://downloads.sourceforge.net/project/boost/boost/1.59.0/boost_1_59_0.tar.gz +tar xzf boost_1_59_0.tar.gz +pushd boost_1_59_0 +./bootstrap.sh +./b2 --with-system --with-threads +popd +git clone https://github.com/cloudera/kudu +cd kudu +thirdparty/build-if-necessary.sh +mkdir -p build/release +cd build/release +BOOST_ROOT=../../boost_1_59_0 \ + ../../thirdparty/installed/bin/cmake \ + -DCMAKE_BUILD_TYPE=release \ + ../.. +make -j4 +---- +==== + +[[osx_from_source]] +=== OS X +The Xcode toolchain is necessary for compiling Kudu. Use `xcode-select --install` +to install the Xcode Command Line Tools if Xcode is not already installed. These +instructions use link:http://brew.sh/[Homebrew] to install dependencies, but +manual dependency installation is possible. + +[WARNING] +.OS X Known Issues +==== +Kudu support for OS X is experimental, and should only be used for development. +See link:https://issues.cloudera.org/browse/KUDU-1219[OS X Limitations & Known Issues] +for more information. +==== + +. Install the prerequisite libraries, if they are not installed. ++ +---- +$ brew install autoconf automake cmake libtool pkg-config boost pstree +---- + +. Clone the Git repository and change to the new `kudu` directory. ++ +[source,bash] +---- +$ git clone https://github.com/cloudera/kudu +$ cd kudu +---- + +. Build any missing third-party requirements using the `build-if-necessary.sh` script. ++ +[source,bash] +---- +$ thirdparty/build-if-necessary.sh +---- + +. Build Kudu. Choose a build directory for the intermediate output, which can be +anywhere in your filesystem except for the `kudu` directory itself. ++ +[source,bash] +---- +mkdir -p build/release +cd build/release +../../thirdparty/installed/bin/cmake -DCMAKE_BUILD_TYPE=release ../.. 
+make -j4 +---- + +.OSX Build Script +==== +This script provides an overview of the procedure to build Kudu on OSX, and can +be used as the basis for an automated deployment scenario. It assumes Xcode and Homebrew +are installed. + +---- +#!/bin/bash + +brew install autoconf automake cmake libtool pkg-config boost pstree +git clone https://github.com/cloudera/kudu +cd kudu +thirdparty/build-if-necessary.sh +mkdir -p build/release +cd build/release +../../thirdparty/installed/bin/cmake -DCMAKE_BUILD_TYPE=release ../.. +make -j4 +---- +==== + +[[build_cpp_client]] +== Installing the C++ Client Libraries + +If you need access to the Kudu client libraries for development, +install the `kudu-client` and `kudu-client-devel` package for your platform. +See <>. + +WARNING: Only build against the client libraries and headers (`kudu_client.so` and `client.h`). +Other libraries and headers are internal to Kudu and have no stability guarantees. + +[[build_java_client]] +== Build the Java Client + +.Requirements +- JDK 7 +- Apache Maven 3.x +- `protoc` 2.6 or newer installed in your path, or built from the `thirdparty/` directory. +You can run the following commands to build `protoc` from the third-party dependencies: +[source,bash] +---- +$ thirdparty/download-thirdparty.sh +$ thirdparty/build-thirdparty.sh protobuf +---- + +To build the Java client, clone the Kudu Git +repository, change to the `java` directory, and issue the following command: + +[source,bash] +---- +$ mvn install -DskipTests +---- + +For more information about building the Java API, as well as Eclipse integration, +see `java/README.md`. + +[[view_api]] +== View API Documentation + +// tag::view_api[] +.C++ API Documentation +The documentation for the C++ client APIs is included in the header files in +`/usr/include/kudu/` if you installed Kudu using packages or subdirectories +of `src/kudu/client/` if you built Kudu from source. 
If you installed Kudu using parcels, +no headers are included in your installation, and you will need to <> in order to have access to the headers and shared libraries. + +The following command is a naive approach to finding relevant header files. Use +of any APIs other than the client APIs is unsupported. + +[source,bash] +---- +$ find /usr/include/kudu -type f -name *.h +---- + +.Java API Documentation +You can view the link:../apidocs/index.html[Java API documentation] online. Alternatively, +after <>, Java API documentation is available +in `java/kudu-client/target/apidocs/index.html`. +// end::view_api[] + +[[upgrade]] +== Upgrade from 0.5.0 to 0.6.0 + +Before upgrading, see <> and <>. +To upgrade from Kudu 0.5.0 to 0.6.0, perform the following high-level steps, which +are detailed in <>: + +. Shut down all Kudu services. +. Install the new Kudu packages or parcels, or install Kudu 0.6.0 from source. +. Restart all Kudu services. + +It is technically possible to upgrade Kudu using rolling restarts, but it has not +been tested and is not recommended. + +[[client_compatibility_0.6.0]] +=== Client compatibility + +Kudu 0.6.0 maintains wire compatibility with Kudu 0.5.0. This means that a Kudu 0.6.0 +client can communicate with a Kudu 0.5.0 cluster, and vice versa. For that reason, +you do not need to upgrade client JARs at the same time the cluster is upgraded. + +The same wire compatibility guarantees apply to the Impala_Kudu fork that was released +with Kudu 0.5.0. + +[[api_compatibility_0.6.0]] + +=== API Compatibility + +The Kudu 0.6.0 client API is not compatible with the Kudu 0.5.0 client API. See the +link:release_notes.html#0.6.0[Kudu 0.6.0 release notes] for details. + +[[upgrade_procedure_0.6.0]] +=== Upgrade procedure + +. Stop the Kudu master and tablet server services: ++ +[source,bash] +---- +$ sudo service kudu-master stop +$ sudo service kudu-tserver stop +---- + +. Upgrade the packages. 
+ - On RHEL or CentOS hosts: ++ +[source,bash] +---- +sudo yum clean all +sudo yum upgrade kudu +---- + - On Ubuntu or Debian hosts: ++ +[source,bash] +---- +sudo apt-get update +sudo apt-get install kudu +---- + +. Start the Kudu master and tablet server services: ++ +[source,bash] +---- +$ sudo service kudu-master start +$ sudo service kudu-tserver start +---- + +[[next_steps]] +== Next Steps +- link:configuration.html[Configuring Kudu] +- link:administration.html[Kudu Administration] + diff --git a/docs/introduction.adoc b/docs/introduction.adoc new file mode 100644 index 000000000000..20a5b08f5a50 --- /dev/null +++ b/docs/introduction.adoc @@ -0,0 +1,220 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[introduction]] += Introducing Apache Kudu (incubating) +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +Kudu is a columnar storage manager developed for the Hadoop platform. Kudu shares +the common technical properties of Hadoop ecosystem applications: it runs on commodity +hardware, is horizontally scalable, and supports highly available operation. + +Kudu's design sets it apart. 
Some of Kudu's benefits include: + +- Fast processing of OLAP workloads. +- Integration with MapReduce, Spark and other Hadoop ecosystem components. +- Tight integration with Cloudera Impala, making it a good, mutable alternative + to using HDFS with Parquet. +- Strong but flexible consistency model, allowing you to choose consistency + requirements on a per-request basis, including the option for strict-serializable consistency. +- Strong performance for running sequential and random workloads simultaneously. +- Easy to administer and manage with Cloudera Manager. +- High availability. Tablet Servers and Masters use the <>, which ensures that + as long as more than half the total number of replicas is available, the tablet is available for + reads and writes. For instance, if 2 out of 3 replicas or 3 out of 5 replicas are available, the tablet + is available. ++ +Reads can be serviced by read-only follower tablets, even in the event of a +leader tablet failure. +- Structured data model. + +By combining all of these properties, Kudu targets support for families of +applications that are difficult or impossible to implement on current generation +Hadoop storage technologies. A few examples of applications for which Kudu is a great +solution are: + +* Reporting applications where newly-arrived data needs to be immediately available for end users +* Time-series applications that must simultaneously support: + - queries across large amounts of historic data + - granular queries about an individual entity that must return very quickly +* Applications that use predictive models to make real-time decisions with periodic +refreshes of the predictive model based on all historic data + +For more information about these and other scenarios, see <>. + +== Concepts and Terms +[[kudu_columnar_data_store]] +.Columnar Data Store + +Kudu is a _columnar data store_. A columnar data store stores data in strongly-typed +columns. 
With a proper design, it is superior for analytical or data warehousing +workloads for several reasons. + +Read Efficiency:: For analytical queries, you can read a single column, or a portion +of that column, while ignoring other columns. This means you can fulfill your query +while reading a minimal number of blocks on disk. With a row-based store, you need +to read the entire row, even if you only return values from a few columns. + +Data Compression:: Because a given column contains only one type of data, pattern-based +compression can be orders of magnitude more efficient than compressing mixed data +types. Combined with the efficiencies of reading data from columns, compression allows +you to fulfill your query while reading even fewer blocks from disk. See +link:schema_design.html#encoding[Data Compression] + +.Table + +A _table_ is where your data is stored in Kudu. A table has a schema and +a totally ordered primary key. A table is split into segments called tablets. + +.Tablet + +A _tablet_ is a contiguous segment of a table. A given tablet is +replicated on multiple tablet servers, and one of these replicas is considered +the leader tablet. Any replica can service reads, and writes require consensus +among the set of tablet servers serving the tablet. + +.Tablet Server + +A _tablet server_ stores and serves tablets to clients. For a +given tablet, one tablet server serves the lead tablet, and the others serve +follower replicas of that tablet. Only leaders service write requests, while +leaders or followers each service read requests. Leaders are elected using +<>. One tablet server can serve multiple tablets, and one tablet can be served +by multiple tablet servers. + +.Master + +The _master_ keeps track of all the tablets, tablet servers, the +<>, and other metadata related to the cluster. At a given point +in time, there can only be one acting master (the leader). If the current leader +disappears, a new master is elected using <>. 
+ +The master also coordinates metadata operations for clients. For example, when +creating a new table, the client internally sends an RPC to the master. The +master writes the metadata for the new table into the catalog table, and +coordinates the process of creating tablets on the tablet servers. + +All the master's data is stored in a tablet, which can be replicated to all the +other candidate masters. + +Tablet servers heartbeat to the master at a set interval (the default is once +per second). + +[[raft]] +.Raft Consensus Algorithm + +Kudu uses the link:https://raft.github.io/[Raft consensus algorithm] as +a means to guarantee fault-tolerance and consistency, both for regular tablets and for master +data. Through Raft, multiple replicas of a tablet elect a _leader_, which is responsible +for accepting and replicating writes to _follower_ replicas. Once a write is persisted +in a majority of replicas it is acknowledged to the client. A given group of `N` replicas +(usually 3 or 5) is able to accept writes with at most `(N - 1)/2` faulty replicas. + +[[catalog_table]] +.Catalog Table + +The _catalog table_ is the central location for +metadata of Kudu. It stores information about tables and tablets. The catalog +table is accessible to clients via the master, using the client API. + +Tables:: table schemas, locations, and states + +Tablets:: the list of existing tablets, which tablet servers have replicas of +each tablet, the tablet's current state, and start and end keys. + +.Logical Replication + +Kudu replicates operations, not on-disk data. This is referred to as _logical +replication_, as opposed to _physical replication_. Physical operations, such as +compaction, do not need to transmit the data over the network. This results in a +substantial reduction in network traffic for heavy write scenarios. + +== Architectural Overview + +The following diagram shows a Kudu cluster with three masters and multiple tablet +servers, each serving multiple tablets. 
It illustrates how Raft consensus is used +to allow for both leaders and followers for both the masters and tablet servers. In +addition, a tablet server can be a leader for some tablets, and a follower for others. +Leaders are shown in gold, while followers are shown in blue. + +NOTE: Multiple masters are not supported during the Kudu beta period. + +image::kudu-architecture-2.png[Kudu Architecture, 800] + +[[kudu_use_cases]] +== Example Use Cases +.Streaming Input with Near Real Time Availability + +A common challenge in data analysis is one where new data arrives rapidly and constantly, +and the same data needs to be available in near real time for reads, scans, and +updates. Kudu offers the powerful combination of fast inserts and updates with +efficient columnar scans to enable real-time analytics use cases on a single storage layer. + +.Time-series application with widely varying access patterns + +A time-series schema is one in which data points are organized and keyed according +to the time at which they occurred. This can be useful for investigating the +performance of metrics over time or attempting to predict future behavior based +on past data. For instance, time-series customer data might be used both to store +purchase click-stream history and to predict future purchases, or for use by a +customer support representative. While these different types of analysis are occurring, +inserts and mutations may also be occurring individually and in bulk, and become available +immediately to read workloads. Kudu can handle all of these access patterns +simultaneously in a scalable and efficient manner. + +Kudu is a good fit for time-series workloads for several reasons. With Kudu's support for +hash-based partitioning, combined with its native support for compound row keys, it is +simple to set up a table spread across many servers without the risk of "hotspotting" +that is commonly observed when range partitioning is used. 
Kudu's columnar storage engine +is also beneficial in this context, because many time-series workloads read only a few columns, +as opposed to the whole row. + +In the past, you might have needed to use multiple data stores to handle different +data access patterns. This practice adds complexity to your application and operations, and +duplicates storage. Kudu can handle all of these access patterns natively and efficiently, +without the need to off-load work to other data stores. + +.Predictive Modeling + +Data analysts often develop predictive learning models from large sets of data. The +model and the data may need to be updated or modified often as the learning takes +place or as the situation being modeled changes. In addition, the scientist may want +to change one or more factors in the model to see what happens over time. Updating +a large set of data stored in files in HDFS is resource-intensive, as each file needs +to be completely rewritten. In Kudu, updates happen in near real time. The scientist +can tweak the value, re-run the query, and refresh the graph in seconds or minutes, +rather than hours or days. In addition, batch or incremental algorithms can be run +across the data at any time, with near-real-time results. + +.Combining Data In Kudu With Legacy Systems + +Companies generate data from multiple sources and store it in a variety of systems +and formats. For instance, some of your data may be stored in Kudu, some in a traditional +RDBMS, and some in files in HDFS. You can access and query all of these sources and +formats using Impala, without the need to change your legacy systems. 
+ +== Next Steps +- link:quickstart.html[Get Started With Kudu] +- link:installation.html[Installing Kudu] diff --git a/docs/kudu_impala_integration.adoc b/docs/kudu_impala_integration.adoc new file mode 100755 index 000000000000..af357c20eebd --- /dev/null +++ b/docs/kudu_impala_integration.adoc @@ -0,0 +1,995 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[kudu_impala]] += Using Apache Kudu (incubating) with Apache Impala (incubating) +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 2 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +Kudu has tight integration with Impala, allowing you to use Impala +to insert, query, update, and delete data from Kudu tablets using Impala's SQL +syntax, as an alternative to using the link:installation.html#view_api[Kudu APIs] +to build a custom Kudu application. In addition, you can use JDBC or ODBC to connect +existing or new applications written in any language, framework, or business intelligence +tool to your Kudu data, using Impala as the broker. + +NOTE: The following instructions assume a +link:http://www.cloudera.com/content/www/en-us/products/cloudera-manager.html[Cloudera Manager] +deployment. 
However, you can use Kudu with Impala without Cloudera Manager. + +== Requirements and Implications + +This integration relies on features that released versions of Impala do not have yet. +In the interim, you need +to install a fork of Impala, which this document will refer to as _Impala_Kudu_. + +* You can install Impala_Kudu using parcels or packages. + +* Kudu itself requires CDH 5.4.3 or later. To use Cloudera Manager with Impala_Kudu, +you need Cloudera Manager 5.4.3 or later. Cloudera Manager 5.4.7 is recommended, as +it adds support for collecting metrics from Kudu. + +* If you have an existing Impala instance on your cluster, you can install Impala_Kudu +alongside the existing Impala instance *if you use parcels*. The new instance does +not share configurations with the existing instance and is completely independent. +A script is provided to automate this type of installation. See <>. + +* It is especially important that the cluster has adequate +unreserved RAM for the Impala_Kudu instance. + +* Consider shutting down the original Impala service when testing Impala_Kudu if you +want to be sure it is not impacted. + +* Before installing Impala_Kudu, you must have already installed and configured +services for HDFS (though it is not used by Kudu), the Hive Metastore (where Impala +stores its metadata), and link:installation.html[Kudu]. You may need HBase, YARN, +Sentry, and ZooKeeper services as well. Meeting the Impala installation requirements +is out of the scope of this document. See +link:http://www.cloudera.com/content/www/en-us/documentation/enterprise/latest/topics/impala_prereqs.html[Impala Prequisites] +in the official Impala documentation for more information. + + +== Installing Impala_Kudu Using Cloudera Manager + +If you use Cloudera Manager, you can install Impala_Kudu using +<> or +<>. However, if you have an existing Impala +instance, you must use parcels and you should use the instructions provided in +<>, rather than these instructions. 
+ +[[install_impala_kudu_parcels]] +=== Installing the Impala_Kudu Service Using Parcels + +[[install_impala_kudu_parcels_side_by_side]] +==== Manual Installation + +NOTE: Manual installation of Impala_Kudu is only supported where there is no other Impala +service already running in the cluster, and when you use parcels. + +. Obtain the Impala_Kudu parcel either by using the parcel repository or downloading it manually. + * To use the parcel repository: + ** Go to *Hosts / Parcels*. + ** Click *Edit Settings*. Add http://archive.cloudera.com/beta/impala-kudu/parcels/latest/ + as a *Remote Parcel Repository URL*. Click *Save Changes*. + * To download the parcel manually: + ** Download the parcel for your operating system from + http://archive.cloudera.com/beta/impala-kudu/parcels/latest/ and upload + it to `/opt/cloudera/parcel-repo/` on the Cloudera Manager server. + ** Create a SHA1 file for the parcel. Cloudera Manager expects the SHA1 to be named + with the exact same name as the parcel, with a `.sha` ending added, and to only + contain the SHA1 itself, not the name of the parcel. ++ +---- +sha1sum | awk {'print $1'} > .sha +---- ++ +. Go to *Hosts / Parcels*. Click *Check for New Parcels.* Verify that *Impala_Kudu* +is in the list. +. Download (if necessary), distribute, and activate the *Impala_Kudu* parcel. +. Add a new Impala service. This service will use the Impala_Kudu parcel. + * Go to the cluster and click *Actions / Add a Service*. + * Choose one host to run the Catalog Server, one to run the StateServer, and one +or more to run Impala Daemon instances. Click *Continue*. + * Choose one or more Impala scratch directories. Click *Continue*. The Impala service + starts. *However, the features that Impala needs in order to work with Kudu are not + enabled yet.* +. Enable the features that allow Impala to work with Kudu. + * Go to the new Impala service. Click *Configuration*. 
+ * Search for the *Impala Service Environment Advanced Configuration Snippet (Safety + Valve)* configuration item. Add the following to the text field and save your changes: + `IMPALA_KUDU=1` + * Restart the Impala service. + * You can verify that the Kudu features are available to Impala by running the following + query in Impala Shell: ++ +[source,sql] +---- +select if(version() like '%KUDU%', "all set to go!", "check your configs") as s; + +Query: select if(version() like '%KUDU%', "all set to go!", "check your configs") as s ++----------------+ +| s | ++----------------+ +| all set to go! | ++----------------+ +Fetched 1 row(s) in 0.02s +---- ++ +If you do not 'all set to go!', carefully review the previous instructions to be sure +that you have not missed a step. + + +==== Installation using the `deploy.py` Script + +If you use parcels, Cloudera recommends using the included `deploy.py` script to +install and deploy the Impala_Kudu service into your cluster. If your cluster does +not have an existing Impala instance, the script is optional. However, if you do +have an existing Impala instance and want to install Impala_Kudu side-by-side, +you must use the script. + +.Prerequisites +* The script depends upon the Cloudera Manager API Python bindings. Install the bindings +using `sudo pip install cm-api` (or as an unprivileged user, with the `--user` +option to `pip`), or see http://cloudera.github.io/cm_api/docs/python-client/ +for more details. +* You need the following information to run the script: +** The IP address or fully-qualified domain name of the Cloudera Manager server. +** The IP address or fully-qualified domain name of the host that should run the Kudu +master process, if different from the Cloudera Manager server. +** The cluster name, if Cloudera Manager manages multiple clusters. +** If you have an existing Impala service and want to clone its configuration, you + need to know the name of the existing service. 
+** If your cluster has more than one instance of a HDFS, Hive, HBase, or other CDH + service that this Impala_Kudu service depends upon, the name of the service this new + Impala_Kudu service should use. +** A name for the new Impala service. +** A user name and password with *Full Administrator* privileges in Cloudera Manager. +** The IP address or host name of the host where the new Impala_Kudu service's master role + should be deployed, if not the Cloudera Manager server. +** A comma-separated list of local (not HDFS) scratch directories which the new +Impala_Kudu service should use, if you are not cloning an existing Impala service. +* Your Cloudera Manager server needs network access to reach the parcel repository +hosted on `cloudera.com`. + +.Procedure + +- Download the `deploy.py` from https://github.com/cloudera/impala-kudu/blob/feature/kudu/infra/deploy/deploy.py +using `curl` or another utility of your choice. ++ +[source,bash] +---- +$ curl -O https://raw.githubusercontent.com/cloudera/impala-kudu/feature/kudu/infra/deploy/deploy.py +---- ++ +- Run the `deploy.py` script. The syntax below creates a standalone IMPALA_KUDU +service called `IMPALA_KUDU-1` on a cluster called `Cluster 1`. Exactly one HDFS, Hive, +and HBase service exist in Cluster 1, so service dependencies are not required. +The cluster should not already have an Impala instance. ++ +[source,bash] +---- +$ python deploy.py create IMPALA_KUDU-1 --cluster 'Cluster 1' \ + --master_host \ + --host +---- + +NOTE: If you do not specify `--master_host`, the Kudu master is configured to run +on the Cloudera Manager server (the value specified by the `--host` parameter). + +- If two HDFS services are available, called `HDFS-1` and `HDFS-2`, use the following +syntax to create the same `IMPALA_KUDU-1` service using `HDFS-2`. You can specify +multiple types of dependencies; use the `deploy.py create -h` command for details. 
++ +[source,bash] +---- +$ python deploy.py create IMPALA_KUDU-1 --cluster 'Cluster 1' --hdfs_dependency HDFS-2 \ + --host +---- + +- Run the `deploy.py` script with the following syntax to clone an existing IMPALA +service called `IMPALA-1` to a new IMPALA_KUDU service called `IMPALA_KUDU-1`, where +Cloudera Manager only manages a single cluster. This new `IMPALA_KUDU-1` service +can run side by side with the `IMPALA-1` service if there is sufficient RAM for both. +`IMPALA_KUDU-1` should be given at least 16 GB of RAM and possibly more depending +on the complexity of the workload and the query concurrency level. ++ +[source,bash] +---- +$ python deploy.py clone IMPALA_KUDU-1 IMPALA-1 --host +---- + +- Additional parameters are available for `deploy.py`. To view them, use the `-h` +argument. You can also use commands such as `deploy.py create -h` or +`deploy.py clone -h` to get information about additional arguments for individual operations. + +- The service is created *but not started*. Review the configuration in Cloudera Manager +and start the service. + +[[install_impala_kudu_packages]] +=== Installing Impala_Kudu Using Packages + +Before installing Impala_Kudu packages, you need to uninstall any existing Impala +packages, using operating system utilities. For this reason, you cannot use Impala_Kudu +alongside another Impala instance if you use packages. 
+ +[[impala_kudu_package_locations]] +.Impala_Kudu Package Locations +[cols=">s,<,<",options="header"] +|=== +| OS | Repository | Individual Packages +| RHEL | link:http://archive.cloudera.com/beta/impala-kudu/redhat/6/x86_64/impala-kudu/cloudera-impala-kudu.repo[RHEL 6] | link:http://archive.cloudera.com/beta/impala-kudu/redhat/6/x86_64/impala-kudu/0.5.0/RPMS/x86_64/[RHEL 6] +| Ubuntu | link:http://archive.cloudera.com/beta/impala-kudu/ubuntu/trusty/amd64/impala-kudu/cloudera.list[Trusty] | http://archive.cloudera.com/beta/impala-kudu/ubuntu/trusty/amd64/impala-kudu/pool/contrib/i/impala-kudu/[Trusty] +|=== + +. Download and configure the Impala_Kudu repositories for your operating system, or manually +download individual RPMs, the appropriate link from <>. + +. An Impala cluster has at least one `impala-kudu-server` and at most one `impala-kudu-catalog` +and `impala-kudu-state-store`. To connect to Impala from the command line, install +the `impala-kudu-shell` package. + +=== Adding Impala service in Cloudera Manager +. Add a new Impala service in Cloudera Manager. +** Go to the cluster and click *Actions / Add a Service*. +** Choose one host to run the Catalog Server, one to run the Statestore, and at + least three to run Impala Daemon instances. Click *Continue*. +** Choose one or more Impala scratch directories. Click *Continue*. +. The Impala service starts. + +== Installing Impala_Kudu Without Cloudera Manager + +Before installing Impala_Kudu packages, you need to uninstall any existing Impala +packages, using operating system utilities. For this reason, you cannot use Impala_Kudu +alongside another Impala instance if you use packages. + +IMPORTANT: Do not use these command-line instructions if you use Cloudera Manager. +Instead, follow <>. 
+ +[[impala_kudu_non-cm_locations]] +.Impala_Kudu Package Locations +[cols=">s,<,<",options="header"] +|=== +| OS | Repository | Individual Packages +| RHEL | link:http://archive.cloudera.com/beta/impala-kudu/redhat/6/x86_64/impala-kudu/cloudera-impala-kudu.repo[RHEL 6] | link:http://archive.cloudera.com/beta/impala-kudu/redhat/6/x86_64/impala-kudu/0.5.0/RPMS/x86_64/[RHEL 6] +| Ubuntu | link:http://archive.cloudera.com/beta/impala-kudu/ubuntu/trusty/amd64/impala-kudu/cloudera.list[Trusty] | http://archive.cloudera.com/beta/impala-kudu/ubuntu/trusty/amd64/impala-kudu/pool/contrib/i/impala-kudu/[Trusty] +|=== + +. Download and configure the Impala_Kudu repositories for your operating system, or manually +download individual RPMs, the appropriate link from <>. + +. An Impala cluster has at least one `impala-kudu-server` and at most one `impala-kudu-catalog` +and `impala-kudu-state-store`. To connect to Impala from the command line, install +the `impala-kudu-shell` package. + +=== Starting Impala_Kudu Services +. Use the Impala start-up scripts to start each service on the relevant hosts: ++ +---- +$ sudo service impala-state-store start + +$ sudo service impala-catalog start + +$ sudo service impala-server start +---- + +== Using the Impala Shell + +NOTE: This is only a small sub-set of Impala Shell functionality. For more details, see the +link:http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/impala_impala_shell.html[Impala Shell] documentation. + +Neither Kudu nor Impala need special configuration in order for you to use the Impala +Shell or the Impala API to insert, update, delete, or query Kudu data using Impala. +However, you do need to create a mapping between the Impala and Kudu tables. Kudu +provides the Impala query to map to an existing Kudu table in the web UI. + +- Be sure you are using the `impala-shell` binary provided by the Impala_Kudu package, +rather than the default CDH Impala binary. 
The following shows how to verify this
+using the `alternatives` command on a RHEL 6 host.
++
+[source,bash]
+----
+$ sudo alternatives --display impala-shell

+impala-shell - status is auto.
+ link currently points to /opt/cloudera/parcels/CDH-5.5.0-1.cdh5.5.0.p0.1007/bin/impala-shell
+/opt/cloudera/parcels/CDH-5.5.0-1.cdh5.5.0.p0.1007/bin/impala-shell - priority 10
+/opt/cloudera/parcels/IMPALA_KUDU-2.3.0-1.cdh5.5.0.p0.119/bin/impala-shell - priority 5
+Current `best' version is /opt/cloudera/parcels/CDH-5.5.0-1.cdh5.5.0.p0.1007/bin/impala-shell.

+$ sudo alternatives --set impala-shell /opt/cloudera/parcels/IMPALA_KUDU-2.3.0-1.cdh5.5.0.p0.119/bin/impala-shell
+----
+- Start Impala Shell using the `impala-shell` command. By default, `impala-shell`
+attempts to connect to the Impala daemon on `localhost` on port 21000. To connect
+to a different host, use the `-i ` option. To automatically connect to
+a specific Impala database, use the `-d ` option. For instance, if all your
+Kudu tables are in Impala in the database `impala_kudu`, use `-d impala_kudu` to use
+this database.
+- To quit the Impala Shell, use the following command: `quit;`
+
+=== Internal and External Impala Tables
+When creating a new Kudu table using Impala, you can create the table as an internal
+table or an external table.
+
+Internal:: An internal table is managed by Impala, and when you drop it from Impala,
+the data and the table truly are dropped. When you create a new table using Impala,
+it is generally an internal table.
+
+External:: An external table (created by `CREATE EXTERNAL TABLE`) is not managed by
+Impala, and dropping such a table does not drop the table from its source location
+(here, Kudu). Instead, it only removes the mapping between Impala and Kudu. This is
+the mode used in the syntax provided by Kudu for mapping an existing table to Impala.
+ +See link:http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/impala_tables.html +for more information about internal and external tables. + +=== Querying an Existing Kudu Table In Impala +. Go to http://kudu-master.example.com:8051/tables/, where _kudu-master.example.com_ +is the address of your Kudu master. +. Click the table ID for the relevant table. +. Scroll to the bottom of the page, or search for `Impala CREATE TABLE statement`. +Copy the entire statement. +. Paste the statement into Impala. Impala now has a mapping to your Kudu table. + +[[kudu_impala_create_table]] +=== Creating a New Kudu Table From Impala +Creating a new table in Kudu from Impala is similar to mapping an existing Kudu table +to an Impala table, except that you need to write the `CREATE` statement yourself. +Use the following example as a guideline. Impala first creates the table, then creates +the mapping. + +IMPORTANT: When creating a new Kudu table, you are strongly encouraged to specify +a distribution scheme. If you do not, your table will consist of a single tablet, +and thus load will not be distributed across your cluster. See +<>. The table creation example below is distributed into 16 +buckets by hashing the `id` column, for simplicity. See +<> for guidelines on partitioning. + +[source,sql] +---- +CREATE TABLE my_first_table ( +id BIGINT, +name STRING +) +DISTRIBUTE BY HASH (id) INTO 16 BUCKETS +TBLPROPERTIES( + 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', + 'kudu.table_name' = 'my_first_table', + 'kudu.master_addresses' = 'kudu-master.example.com:7051', + 'kudu.key_columns' = 'id' +); +---- + +In the `CREATE TABLE` statement, the columns that comprise the primary key must +be listed first. Additionally, primary key columns are implicitly marked `NOT NULL`. + +The following table properties are required, and the `kudu.key_columns` property must +contain at least one column. 
+ +`storage_handler`:: the mechanism used by Impala to determine the type of data source. +For Kudu tables, this must be `com.cloudera.kudu.hive.KuduStorageHandler`. +`kudu.table_name`:: the name of the table that Impala will create (or map to) in Kudu. +`kudu.master_addresses`:: the list of Kudu masters Impala should communicate with. +`kudu.key_columns`:: the comma-separated list of primary key columns, whose contents +should not be nullable. + +==== `CREATE TABLE AS SELECT` +You can create a table by querying any other table or tables in Impala, using a `CREATE +TABLE ... AS SELECT` statement. The following example imports all rows from an existing table +`old_table` into a Kudu table `new_table`. The columns in `new_table` will have the +same names and types as the columns in `old_table`, but you need to populate the `kudu.key_columns` +property. In this example, the primary key columns are `ts` and `name`. + +[source,sql] +---- +CREATE TABLE new_table +TBLPROPERTIES( + 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', + 'kudu.table_name' = 'new_table', + 'kudu.master_addresses' = 'kudu-master.example.com:7051', + 'kudu.key_columns' = 'ts, name' +) AS SELECT * FROM old_table; +---- +[NOTE] +==== +For `CREATE TABLE ... AS SELECT` we currently require that the first columns that are +projected in the `SELECT` statement correspond to the Kudu table keys and are in the +same order (`ts` then `name` in the example above). If the default projection generated by `*` +does not meet this requirement, the user should avoid using `*` and explicitly mention +the columns to project, in the correct order. +==== + +You can refine the `SELECT` statement to only match the rows and columns you want +to be inserted into the new table. You can also rename the columns by using syntax +like `SELECT name as new_name`. + +==== Pre-Splitting Tables + +Tables are divided into tablets which are each served by one or more tablet +servers. 
Ideally, tablets should split a table's data relatively equally. Kudu currently
+has no mechanism for automatically (or manually) splitting a pre-existing tablet.
+Until this feature has been implemented, you must pre-split your table when you create
+it. When designing your table schema, consider primary keys that will allow you to
+pre-split your table into tablets which grow at similar rates. You can provide split
+points using a `DISTRIBUTE BY` clause when creating a table using Impala:
+
+NOTE: Impala keywords, such as `group`, are enclosed by back-tick characters when
+they are not used in their keyword sense.
+
+[source,sql]
+----
+CREATE TABLE cust_behavior (
+ _id BIGINT,
+ salary STRING,
+ edu_level INT,
+ usergender STRING,
+ `group` STRING,
+ city STRING,
+ postcode STRING,
+ last_purchase_price FLOAT,
+ last_purchase_date BIGINT,
+ category STRING,
+ sku STRING,
+ rating INT,
+ fulfilled_date BIGINT
+)
+DISTRIBUTE BY RANGE(_id)
+SPLIT ROWS((1439560049342), (1439566253755), (1439572458168), (1439578662581), (1439584866994), (1439591071407))
+TBLPROPERTIES(
+'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
+'kudu.table_name' = 'cust_behavior',
+'kudu.master_addresses' = 'a1216.halxg.cloudera.com:7051',
+'kudu.key_columns' = '_id',
+'kudu.num_tablet_replicas' = '3'
+);
+----
+
+If you have multiple primary key columns, you can specify split points by separating
+them with commas within the inner brackets: `[['va',1],['ab',2]]`. The expression
+must be valid JSON.
+
+==== Impala Databases and Kudu
+
+Impala uses a database containment model. In Impala, you can create a table within a specific
+scope, referred to as a _database_. To create the database, use a `CREATE DATABASE`
+statement. To use the database for further Impala operations such as `CREATE TABLE`,
+use the `USE` statement.
For example, to create a table in a database called `impala_kudu`,
+use the following statements:
+
+NOTE: Impala uses a namespace mechanism to allow for tables to be created within different
+scopes, called `databases`. To create a database, use a `CREATE DATABASE`
+statement. To use the database for further Impala operations such as `CREATE TABLE`,
+use the `USE` statement. For example, to create a table in a database called `impala_kudu`,
+use the following SQL:
++
+[source,sql]
+----
+CREATE DATABASE impala_kudu;
+USE impala_kudu;
+CREATE TABLE my_first_table (
+id BIGINT,
+name STRING
+)
+TBLPROPERTIES(
+ 'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
+ 'kudu.table_name' = 'my_first_table',
+ 'kudu.master_addresses' = 'kudu-master.example.com:7051',
+ 'kudu.key_columns' = 'id'
+);
+----
++
+The `my_first_table` table is created within the `impala_kudu` database. To refer
+to this database in the future, without using a specific `USE` statement, you can
+refer to the table using `.` syntax. For example, to specify the
+`my_first_table` table in database `impala_kudu`, as opposed to any other table with
+the same name in another database, use `impala_kudu.my_first_table`. This also applies
+to `INSERT`, `UPDATE`, `DELETE`, and `DROP` statements.
+
+WARNING: Currently, Kudu does not encode the Impala database into the table name
+in any way. This means that even though you can create Kudu tables within Impala databases,
+the actual Kudu tables need to be unique within Kudu. For example, if you create `database_1.my_kudu_table`
+and `database_2.my_kudu_table`, you will have a naming collision within Kudu, even
+though this would not cause a problem in Impala. This can be resolved by specifying
+a unique Kudu table name in the `kudu.table_name` property.
+ +==== Impala Keywords Not Supported for Kudu Tables + +The following Impala keywords are not supported when creating Kudu tables: +- `PARTITIONED` +- `STORED AS` +- `LOCATION` +- `ROWFORMAT` + +=== Optimizing Performance for Evaluating SQL Predicates + +If the `WHERE` clause of your query includes comparisons with the operators +`=`, `<=`, or `>=`, Kudu evaluates the condition directly and only returns the +relevant results. This provides optimum performance, because Kudu only returns the +relevant results to Impala. For predicates `<`, `>`, `!=`, or any other predicate +type supported by Impala, Kudu does not evaluate the predicates directly, but returns +all results to Impala and relies on Impala to evaluate the remaining predicates and +filter the results accordingly. This may cause differences in performance, depending +on the delta of the result set before and after evaluating the `WHERE` clause. + +In the `CREATE TABLE` statement, the first column must be the primary key. Additionally, +the primary key can never be NULL when inserting or updating a row. + +All properties in the `TBLPROPERTIES` statement are required, and the `kudu.key_columns` +must contain at least one column. + +[[partitioning_tables]] +=== Partitioning Tables + +Tables are partitioned into tablets according to a partition schema on the primary +key columns. Each tablet is served by at least one tablet server. Ideally, a table +should be split into tablets that are distributed across a number of tablet servers +to maximize parallel operations. The details of the partitioning schema you use +will depend entirely on the type of data you store and how you access it. For a full +discussion of schema design in Kudu, see link:schema_design.html[Schema Design]. + +Kudu currently has no mechanism for splitting or merging tablets after the table has +been created. Until this feature has been implemented, you must provide a partition +schema for your table when you create it. 
When designing your tables, consider using
+primary keys that will allow you to partition your table into tablets which grow
+at similar rates.
+
+You can partition your table using Impala's `DISTRIBUTE BY` keyword, which
+supports distribution by `RANGE` or `HASH`. The partition scheme can contain zero
+or more `HASH` definitions, followed by an optional `RANGE` definition. The `RANGE`
+definition can refer to one or more primary key columns.
+Examples of <<basic_partitioning,basic>> and <<advanced_partitioning,advanced>>
+partitioning are shown below.
+
+NOTE: Impala keywords, such as `group`, are enclosed by back-tick characters when
+they are used as identifiers, rather than as keywords.
+
+[[basic_partitioning]]
+==== Basic Partitioning
+
+.`DISTRIBUTE BY RANGE`
+You can specify split rows for one or more primary key columns that contain integer
+or string values. Range partitioning in Kudu allows splitting a table based
+on the lexicographic order of its primary keys. This allows you to balance parallelism
+in writes with scan efficiency.
+
+The split row does not need to exist. It defines an exclusive bound in the form of:
+`(START_KEY, SplitRow), [SplitRow, STOP_KEY)` In other words, the split row, if
+it exists, is included in the tablet after the split point. For instance, if you
+specify a split row `abc`, a row `abca` would be in the second tablet, while a row
+`abb` would be in the first.
+
+Suppose you have a table that has columns `state`, `name`, and `purchase_count`. The
+following example creates 50 tablets, one per US state.
+
+[NOTE]
+.Monotonically Increasing Values
+====
+If you partition by range on a column whose values are monotonically increasing,
+the last tablet will grow much larger than the others. Additionally, all data
+being inserted will be written to a single tablet at a time, limiting the scalability
+of data ingest. In that case, consider distributing by `HASH` instead of, or in
+addition to, `RANGE`.
+==== + +[source,sql] +---- +CREATE TABLE customers ( + state STRING, + name STRING, + purchase_count int32, +) DISTRIBUTE BY RANGE(state) +SPLIT ROWS(('al'), ('ak'), ('ar'), .., ('wv'), ('wy')) +TBLPROPERTIES( +'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', +'kudu.table_name' = 'customers', +'kudu.master_addresses' = 'kudu-master.example.com:7051', +'kudu.key_columns' = 'state, name' +); +---- + +[[distribute_by_hash]] +.`DISTRIBUTE BY HASH` + +Instead of distributing by an explicit range, or in combination with range distribution, +you can distribute into a specific number of 'buckets' by hash. You specify the primary +key columns you want to partition by, and the number of buckets you want to use. Rows are +distributed by hashing the specified key columns. Assuming that the values being +hashed do not themselves exhibit significant skew, this will serve to distribute +the data evenly across buckets. + +You can specify multiple definitions, and you can specify definitions which +use compound primary keys. However, one column cannot be mentioned in multiple hash +definitions. Consider two columns, `a` and `b`: +* icon:check[pro, role="green"] `HASH(a)`, `HASH(b)` +* icon:check[pro, role="green"] `HASH(a,b)` +* icon:times[pro, role="red"] `HASH(a), HASH(a,b)` + +NOTE: `DISTRIBUTE BY HASH` with no column specified is a shortcut to create the desired +number of buckets by hashing all primary key columns. + +Hash partitioning is a reasonable approach if primary key values are evenly +distributed in their domain and no data skew is apparent, such as timestamps or +serial IDs. + +The following example creates 16 tablets by hashing the `id` column. This spreads +writes across all 16 tablets. In this example, a query for a range of `sku` values +is likely to need to read all 16 tablets, so this may not be the optimum schema for +this table. See <> for an extended example. 
+ +[source,sql] +---- +CREATE TABLE cust_behavior ( + id BIGINT, + sku STRING, + salary STRING, + edu_level INT, + usergender STRING, + `group` STRING, + city STRING, + postcode STRING, + last_purchase_price FLOAT, + last_purchase_date BIGINT, + category STRING, + rating INT, + fulfilled_date BIGINT +) +DISTRIBUTE BY HASH (id) INTO 16 BUCKETS +TBLPROPERTIES( +'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler', +'kudu.table_name' = 'cust_behavior', +'kudu.master_addresses' = 'kudu-master.example.com:7051', +'kudu.key_columns' = 'id, sku' +); +---- + + +[[advanced_partitioning]] +==== Advanced Partitioning + +You can combine `HASH` and `RANGE` partitioning to create more complex partition schemas. +You can specify zero or more `HASH` definitions, followed by zero or one `RANGE` definitions. +Each definition can encompass one or more columns. While enumerating every possible distribution +schema is out of the scope of this document, a few examples illustrate some of the +possibilities. + +.`DISTRIBUTE BY RANGE` Using Compound Split Rows + +This example creates 100 tablets, two for each US state. Per state, the first tablet +holds names starting with characters before 'm', and the second tablet holds names +starting with 'm'-'z'. Writes are spread across at least 50 tablets, and possibly +up to 100. A query for a range of names in a given state is likely to only need to read from +one tablet, while a query for a range of names across every state will likely +read from at most 50 tablets. 
+
+[source,sql]
+----
+CREATE TABLE customers (
+ state STRING,
+ name STRING,
+ purchase_count int32
+) DISTRIBUTE BY RANGE(state, name)
+SPLIT ROWS(('al', ''), ('al', 'm'), ('ak', ''), ('ak', 'm'), .., ('wy', ''), ('wy', 'm'))
+TBLPROPERTIES(
+'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
+'kudu.table_name' = 'customers',
+'kudu.master_addresses' = 'kudu-master.example.com:7051',
+'kudu.key_columns' = 'state, name'
+);
+----
+
+==== `DISTRIBUTE BY HASH` and `RANGE`
+
+Consider the <<distribute_by_hash>> example above. If you often query for a range of `sku`
+values, you can optimize the example by combining hash partitioning with range partitioning.
+
+The following example still creates 16 tablets, by first hashing the `id` column into 4
+buckets, and then applying range partitioning to split each bucket into four tablets,
+based upon the value of the `sku` string. Writes are spread across at least four tablets
+(and possibly up to 16). When you query for a contiguous range of `sku` values, you have a
+good chance of only needing to read from a quarter of the tablets to fulfill the query.
+
+[source,sql]
+----
+CREATE TABLE cust_behavior (
+ id BIGINT,
+ sku STRING,
+ salary STRING,
+ edu_level INT,
+ usergender STRING,
+ `group` STRING,
+ city STRING,
+ postcode STRING,
+ last_purchase_price FLOAT,
+ last_purchase_date BIGINT,
+ category STRING,
+ rating INT,
+ fulfilled_date BIGINT
+)
+DISTRIBUTE BY HASH (id) INTO 4 BUCKETS,
+RANGE (sku) SPLIT ROWS(('g'), ('o'), ('u'))
+TBLPROPERTIES(
+'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
+'kudu.table_name' = 'cust_behavior',
+'kudu.master_addresses' = 'kudu-master.example.com:7051',
+'kudu.key_columns' = 'id, sku'
+);
+----
+
+.Multiple `DISTRIBUTE BY HASH` Definitions
+Again expanding the example above, suppose that the query pattern will be unpredictable,
+but you want to ensure that writes are spread across a large number of tablets.
+You can achieve maximum distribution across the entire primary key by hashing on
+both primary key columns.
+
+[source,sql]
+----
+CREATE TABLE cust_behavior (
+ id BIGINT,
+ sku STRING,
+ salary STRING,
+ edu_level INT,
+ usergender STRING,
+ `group` STRING,
+ city STRING,
+ postcode STRING,
+ last_purchase_price FLOAT,
+ last_purchase_date BIGINT,
+ category STRING,
+ rating INT,
+ fulfilled_date BIGINT
+)
+DISTRIBUTE BY HASH (id) INTO 4 BUCKETS, HASH (sku) INTO 4 BUCKETS
+TBLPROPERTIES(
+'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
+'kudu.table_name' = 'cust_behavior',
+'kudu.master_addresses' = 'kudu-master.example.com:7051',
+'kudu.key_columns' = 'id, sku'
+);
+----
+
+The example creates 16 buckets. You could also use `HASH (id, sku) INTO 16 BUCKETS`.
+However, a scan for `sku` values would almost always impact all 16 buckets, rather
+than possibly being limited to 4.
+
+[[partitioning_rules_of_thumb]]
+==== Partitioning Rules of Thumb
+
+- For large tables, such as fact tables, aim for as many tablets as you have
+ cores in the cluster.
+- For small tables, such as dimension tables, aim for a large enough number of tablets
+  that each tablet is at least 1 GB in size.
+
+In general, be mindful that the number of tablets limits the parallelism of reads,
+in the current implementation. Increasing the number of tablets significantly
+beyond the number of cores is likely to have diminishing returns.
+
+=== Inserting Data Into Kudu Tables
+
+Impala allows you to use standard SQL syntax to insert data into Kudu.
+
+==== Inserting Single Values
+
+This example inserts a single row.
+
+[source,sql]
+----
+INSERT INTO my_first_table VALUES (99, "sarah");
+----
+
+This example inserts three rows using a single statement.
+
+[source,sql]
+----
+INSERT INTO my_first_table VALUES (1, "john"), (2, "jane"), (3, "jim");
+----
+
+[[kudu_impala_insert_bulk]]
+==== Inserting In Bulk
+
+When inserting in bulk, there are at least three common choices. Each may have advantages
+and disadvantages, depending on your data and circumstances.
+
+Multiple single `INSERT` statements:: This approach has the advantage of being easy to
+understand and implement. This approach is likely to be inefficient because Impala
+has a high query start-up cost compared to Kudu's insertion performance. This will
+lead to relatively high latency and poor throughput.
+
+Single `INSERT` statement with multiple `VALUES`:: If you include more
+than 1024 `VALUES` statements, Impala batches them into groups of 1024 (or the value
+of `batch_size`) before sending the requests to Kudu. This approach may perform
+slightly better than multiple sequential `INSERT` statements by amortizing the query start-up
+penalties on the Impala side. To set the batch size for the current Impala
+Shell session, use the following syntax: `set batch_size=10000;`
++
+NOTE: Increasing the Impala batch size causes Impala to use more memory. You should
+verify the impact on your cluster and tune accordingly.
+
+Batch Insert:: The approach that usually performs best, from the standpoint of
+both Impala and Kudu, is to import the data using a `SELECT FROM` statement
+in Impala.
++
+. If your data is not already in Impala, one strategy is
+to link:http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/impala_txtfile.html
+[import it from a text file], such as a TSV or CSV file.
++
+. <>, being mindful that the columns
+designated as primary keys cannot have null values.
++
+. Insert values into the Kudu table by querying the table containing the original
+data, as in the following example:
++
+[source,sql]
+----
+INSERT INTO my_kudu_table
+  SELECT * FROM legacy_data_import_table;
+----
+
+Ingest using the C++ or Java API:: In many cases, the appropriate ingest path is to
+use the C++ or Java API to insert directly into Kudu tables. Unlike other Impala tables,
+data inserted into Kudu tables via the API becomes available for query in Impala without
+the need for any `INVALIDATE METADATA` statements or other statements needed for other
+Impala storage types.
+
+[[insert_ignore]]
+==== `INSERT` and the `IGNORE` Keyword
+
+Normally, if you try to insert a row that has already been inserted, the insertion
+will fail because the primary key would be duplicated. See <>.
+If an insert fails part of the way through, you can re-run the insert, using the
+`IGNORE` keyword, which will ignore only those errors returned from Kudu indicating
+a duplicate key.
+
+The first example will cause an error if a row with the primary key `99` already exists.
+The second example will still not insert the row, but will ignore any error and continue
+on to the next SQL statement.
+
+[source,sql]
+----
+INSERT INTO my_first_table VALUES (99, "sarah");
+INSERT IGNORE INTO my_first_table VALUES (99, "sarah");
+----
+
+=== Updating a Row
+
+[source,sql]
+----
+UPDATE my_first_table SET name="bob" where id = 3;
+----
+
+IMPORTANT: The `UPDATE` statement only works in Impala when the target table is in
+Kudu.
+
+==== Updating In Bulk
+
+You can update in bulk using the same approaches outlined in
+<>.
+
+==== `UPDATE` and the `IGNORE` Keyword
+
+Similar to <>, you can use the `IGNORE` operation to ignore an `UPDATE`
+which would otherwise fail. For instance, a row may be deleted while you are
+attempting to update it. In Impala, this would cause an error. The `IGNORE`
+keyword causes the error to be ignored.
+
+[source,sql]
+----
+UPDATE IGNORE my_first_table SET name="bob" where id = 3;
+----
+
+=== Deleting a Row
+
+[source,sql]
+----
+DELETE FROM my_first_table WHERE id < 3;
+----
+
+You can also delete using more complex syntax. A comma in the `FROM` sub-clause is
+one way that Impala specifies a join query. For more information about Impala joins,
+see http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/impala_joins.html.
+
+[source,sql]
+----
+DELETE c FROM my_second_table c, stock_symbols s WHERE c.name = s.symbol;
+----
+
+IMPORTANT: The `DELETE` statement only works in Impala when the target table is in
+Kudu.
+
+==== Deleting In Bulk
+
+You can delete in bulk using the same approaches outlined in
+<>.
+
+==== `DELETE` and the `IGNORE` Keyword
+
+Similar to <>, you can use the `IGNORE` operation to ignore a `DELETE`
+which would otherwise fail. For instance, a row may be deleted by another process
+while you are attempting to delete it. In Impala, this would cause an error. The
+`IGNORE` keyword causes the error to be ignored.
+ +[source,sql] +---- +DELETE IGNORE FROM my_first_table WHERE id < 3; +---- + +[[impala_insertion_caveat]] +=== Failures During `INSERT`, `UPDATE`, and `DELETE` Operations + +`INSERT`, `UPDATE`, and `DELETE` statements cannot be considered transactional as +a whole. If one of these operations fails part of the way through, the keys may +have already been created (in the case of `INSERT`) or the records may have already +been modified or removed by another process (in the case of `UPDATE` or `DELETE`). +You should design your application with this in mind. See <>. + +=== Altering Table Properties + +You can change Impala's metadata relating to a given Kudu table by altering the table's +properties. These properties include the table name, the list of Kudu master addresses, +and whether the table is managed by Impala (internal) or externally. You cannot modify +a table's split rows after table creation. + +IMPORTANT: Altering table properties only changes Impala's metadata about the table, +not the underlying table itself. These statements do not modify any table metadata +in Kudu. + +.Rename a Table +[source,sql] +---- +ALTER TABLE my_table RENAME TO my_new_table; +---- + +.Change the Kudu Master Address +[source,sql] +---- +ALTER TABLE my_table +SET TBLPROPERTIES('kudu.master_addresses' = 'kudu-new-master.example.com:7051'); +---- + +.Change an Internally-Managed Table to External +[source,sql] +---- +ALTER TABLE my_table SET TBLPROPERTIES('EXTERNAL' = 'TRUE'); +---- + +=== Dropping a Kudu Table Using Impala + +- If the table was created as an internal table in Impala, using `CREATE TABLE`, the +standard `DROP TABLE` syntax drops the underlying Kudu table and all its data. If +the table was created as an external table, using `CREATE EXTERNAL TABLE`, the mapping +between Impala and Kudu is dropped, but the Kudu table is left intact, with all its +data. ++ +[source,sql] +---- +DROP TABLE my_first_table; +---- + +== What's Next? 
+ +The examples above have only explored a fraction of what you can do with Impala Shell. + +- Learn about the link:http://impala.io[Impala project]. +- Read the link:http://www.cloudera.com/content/www/en-us/documentation/enterprise/latest/topics/impala.html[Impala documentation]. +- View the link:http://www.cloudera.com/content/www/en-us/documentation/enterprise/latest/topics/impala_langref.html[Impala SQL reference]. +- Read about Impala internals or learn how to contribute to Impala on the link:https://github.com/cloudera/Impala/wiki[Impala Wiki]. +- Read about the native link:installation.html#view_api[Kudu APIs]. diff --git a/docs/media-src/README b/docs/media-src/README new file mode 100644 index 000000000000..117927e01d32 --- /dev/null +++ b/docs/media-src/README @@ -0,0 +1,5 @@ +Source files (i.e. .graffle, .psd, .eps, .xcf files) for artifacts used in docs +or the web site. + +Don't put display-resolution images for the docs here, better to put those in +the images/ directory, or even on a CDN. 
diff --git a/docs/media-src/kudu-architecture-2.pdf b/docs/media-src/kudu-architecture-2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b88cffd7d352d85c32288681bd56de0230650211 GIT binary patch literal 178107 zcmeF4by!^6n&z?K5(or`!h^d7g1fszaJN9=?h>3pa3{FCI|L2x1b6oU0fyw}oW67V z&dlw5o<2`ccl|?ccI~~^Uf+^}@3-E1$>fAYXc_64;mNu;&o=k-&ND{4d*N9C3;-K_ zb9ioU0KJrvwF%G^!1{Db0YEQeW(hR1dwR9h0~!e#8Q2&a0eE=e?SXbidRFkx;KP~= zc6fED9_eN6B$w?|hz!h4=ca=)$|*T$n#Fc#9|<>Dm16|e8HLH(((P}d4*~`;$jB(; zhIg?un%?a$fM3;JSl!ZIeq*QS4BzKm(#boXDBInwauS?xmGt}WRww<@peG1+pw>NJ;<#{!Y_h2R4w6Or!k z57`(^1c$tohbh7vpVg5_%59}JiZj0*Cb3uEltkQC_(e)g!cvBFOCbn-Y~JGyKuO>b#&al5-6zFa!or)Bq~+fZjq z>0&q2nW+5G_^z<8?xGH zEq3Zs^gIP@-*(J}xy+y0#O;>cLdi}cy6$l9Fc~A?3>Nu=kI!hY9d8CkY*pq^J0})V z-!>F|+zkSr=3&Z?lU-RgI&Y&m%{&0_a_$(lJz`s1(LSPA;riD5M8f%EA0oJV-sdRt z*Nr^9eHevhF&d9m+L|KQc0VOm#@K0rIgoa4EJaLK?(oG4-&!{h?ys#Q z^u{K-Tjih__|Er^5tw)jDgufxN?u2*+z}Zeaec@YF~z=>5As%4z4m=W$7n=<#aV~K zLcxSD5*K~vn|l7q-Zi1#I0561N6LtGx48Xz1$38xYF$5?-?!#~0G}SY4Ft9(8Y^HG z<~Olc(_*CR7f9?oF06JXshUTOs8o6D%rok&QS_Ou4)z`n+@^T9iq>IuxGSq}bBeuI z3Fq)!L!zWg%;cHc{X3TQuL-BbV|yKq+5_dAO5O>M=AXX(YWvHLeP0XFR$X7R_AI~l^-}bCKm1Gc( z2Yd1Af99=gwO;O!{p>a#{0+_|atUIBdNCwrfuu0v{$N7g@-+feogS-HGRvHjzM>t~ zD)I7?_VTADOx`6e1LRUq&XXV)?C!0ID&dHkF9=y{f_+ z#_&WqwFX^(;8eXnBRIe1ui;HO2dnZcy14P4_~7zTGjfQMH=Rr!@w3 z3#2^Z7OT1OdM}?u^%fa-b_DFP<|}E0_Hgesj>TEfHE)}2o6hq6-QgkcID!x#uzt4H z)x)W!#VvAF7|l_M)(4kHHan@{S#AdHE9@m$XE9`M*BMHt)bh0h|nO+S?}|y+#*F!f2of^_>3g zR!?VKPh}Q;rw1-Rf5G@m?G#+W$icA;LmaVSyUPS63z`LNl=yyAd|7;(Zj9|u{QYE< zHPWR49+&biLiQ!sehN;cUw}?d+^>*MxS!PLtkNsBbVUr`ohegHf1^QCo`h?E$; z)r_z~%YxmJ3etFHm~)LA$W}x}%HLBND0c8Nss>q?>hrP6#N6Cx`~L2?gQOgtyJ^5W5<+`?iFBxWzXtLHXYF5%&F6BwC+{x&;)j5wj&! 
zB+cE3Br+*XvhfEAw(LpF%*yLX%-4~c;>HW}Z5ActF(=xDwz4JV>+wj>Y(-Zv_*rLy z%maXO)Idy>Dfqm}gS^~ju?Da=Tp1WOpKem)XuWFN+?vg{5s^ku|&WGjG9;-?XHYJhLZyWJdoLH9?C-zwYPE?!y)80_ocw9X!=#i@P$%wJ zd>!Pb+ghVsxW8N;&u>1K?DOVsnE>w>FS9qkiIro#b{r4A03h9^%7tWVR8qM^pc7xZ z)Jad&>sWcyo3-#%c==(&luR|ms-(aa)VN?a=iI)eRQVX>o~g}6Kj;@x*t{NdOlUb> zs3l9d0QJs)PT^GJ!vgQ1s% z8O{)BvdNc)x~-?J$v!sBKK7=ZrUAj>NFSH$-*)++ z;EqYyTCXrvAz&>{*YKr9ufJs0V3{SDd0L;al`9f+b>IA?=@y*I*}Zh8-({++ol!BN zlAkO>cqUIz>v{E6dH0)qm%SNEl0lc)Z`~!XqXtk4qDmMhFYS1y>131X^%NrqxpT&# zBio4-8zjCt0nLmz@M(Yp9Bj=qe9!AIkZhnl)2{J9tf~s#&KNWTM!&Rm-`sq>;d%V} zv4K4%5|2K#-fxmVHTKp46CBn#*Jq<;ttiMgJcX3`)EF(FqF;cBnjKz7fyvRz>_a-K zn{0|#ns4I$V*LiTzO%*<#T#~V-G~GH5ykY2MKw*k>m9vo@2V2H3dI5Cq3_)g-bpy9 z@EUD*67VyYP%lf8PWxH9P=n3LWeXE)bBq!FB6gU1X!x3aMdAlZCNth`(23?ps4z2mi9d`q@^UxoCCk@A%KTG7iQzYR+$zee1AW%`oPMjw07dvwn$bD(* zLHMzHUt%`+s5luxS@)4lt^}fEeTWpxH}#2D|4Fv!jV91j=WQ%k4sI@D=n?WlCZXRL)h8jBrs$WExD zW~~)sOfK{rWg0ivo-0>s%x?$ZOEqOJMJ#p~6Av`vzIzs>_vRMM>TZGeJ>O8Rhr>F8 z%;01qRK;u*;y_I(4-@nj?+LG5uNbo`Sy2r%MTKarUM}|SYd?C!z zRM>p=u3zQ6`=reE+@owU`n80MMgJA&Rw(#PfpAwzGHiS(TyM-b@5+gAiRB^>{TouD zWN>)F>d8oAny{*3Rv{c*nvjZ*FyFMQq==-+jtz1c+_mIbt7u~Cm~!UtyWX%2KwglT zOD}YiaO2aF%|#rEAO?X?wGoesabMzx&oLBjDAq{0l)1T7^H}E&ph+KA!6Ftx!RAL# z9_n{vS7{)o*N3g4v?Co)9hej5aYD1OD)g=hN*M63c4OhBl+$H&*M*%eh6%Rah&AU; zq@=*p<8x^$RV6OCuKTR)_93LIFsqoSF%7J=8pIO1j4S|n2yii2c44t5jt~Vyl%`NI z!@axpFA9ixu7ptuk6}ppqlMpx%V<~v#|=_Qc#=9o(cg!YFEhQJ8M))sHZVw@Bj81Y zin50~HPg>tI$Ej{FqziSbNb4P=qJj}_S&psY&0g=J|dohm`Ww9Vk{a>Auv{mFs~m? 
zXH5L5T-Ou?nP4AsetiS6EjJ%v)RqSFfrRE$W%>qDFIRnIyLqF*mj{Yp-t|UN=W4Ud z8u)o~uu+lw&6@UHV^@p~kMacwcEeotmxu0;cPue&^df3@_i+#q^BjtA!0CMCur5+% zcf5pLGYE2C>S_51jvE^<2leqYYZBU`y1^&``cfLH?)d0Nrbbn(>UubjNDPq%6#w#O^B;S6&E0Vtsh%0Jsu>`r~Xd+lw1qtBdW6V|FP_bv@ zSntE1e{A8mUdUK6*uLKj4M~6V4cYCH4KNKprS8Bykr}Ar>7%t_yxw>LWUt@7nvPqOwDu4 zpCTa0WjgPqoQIOTnD%;APshff#5~Hp@6Lt=DoD+R-x`Ks>scfGES2!A+^SwK(&01a z{ezP-J5l?bre><*acb@q@R+)j^2Hl?GrQII6*H@PX&LK99bID$tt+$0RO(Q=A^jAP zv6atmb<>4S0dn6Pn5{Tg^t4_$QJ(q7u|Ah* zt9P8Hug0j=Z;5;gAOBL=j917>Ki!*G=8(xf($BuhLcU|757+JW6I$7{S;q?=_!ja} zfWy5I4EYsI8dE8`8DW6U2eKp_N(ji{waA1;xtkFp^ zY(JEOF4h@*0Q^HO)brdg!yFgM1vqa4j@(J}&P<+V0jl}|1bkT$6r*`j`2!DaCb@)B z*PV6dUz6VztpmC&?G!qpVw$d4TzSSG2`^5X5k2b?r|!-xWxdkDVvxqa3AxNg-$8j< zO;p-ELWW;D>BAdY8~)0_JRSW=+x*DY{CH(yVdMPq?vG=pzf0*Tx!4*3=;icGe*UyG zvIYW}|CZiSFtWFCurn~S2eAF^grJQz@acVfz>k#BQ~pQV$k0qrz{VM%$?$Z7jgtw$ z!p^1*|C9pyRnL$6e$@9PbtGqJW1wgR1ZX}T3W)&dm5iK$0D5t&r-}uBy$b$%6$fYo z=ml*oZR`|n^$d&vKW-PaX9TeSs22X|@TsD|JrWPkQ}*fUseWFfq{IYZ{qZDEkEW!= z3}9va?U?cD!G1et0kHmP{?BVz;n{w_h7G{>xAQ;BRFl;=H!}EJvEo+j0H$BXid!)~ z)$uF+C2qyc0AT+WEO9GF26(1lfqMFd3Gnph&uYZ2*Z_Y-(p8DnAM%K>``E6+Z%1Sc* z)|Wp=^lx2Aujrr;{CVc*SOd_jni)Q&Fqs$_0rX-J+3Q3cF^_Hsscf;LvRHr78T4#z*tiT;=nzvn~`M;XDv zz`!~)E@iC{=!5z6+P%Rcp%JkdbpOX65I{nn2!MhA`z7pW$NpTmIGKK1T7FK#KMVlY zUnA>pOWOZq0I;+D-v6vm{q}$DevV%g<8R&1@oSv@E8U-)9ArPE>e%xb{bA0MJXw6Y z|KN3^wp$GCx+Pn(VJt^T6T1bjL3si6gfKHcqA3|96_z+G5g!^X94i!r5}3nSv}qRm zgqC9}Q=ge@%MoM7X%vV-B?}rXA@XS^kIJgKqx+sra6<%UY;Ziwm9~V;=e?1>N zYRsCyjd_EV>xNS|$!=9(F+X2k*Id!iT;QTP(RwI7yc@*6kd$jIjf$mTI1%id9;Y%WLK+PLR+R*V{|t#ZZ#+|fQk8$c2VthTRACp zo2kmzPrSHqHt%fOt})j?J=(4vn2?*?Uw&xdO=f%CW`6lFanSP^Q*#mEGWb@f{tWZI zN?R>(IC-$DuXq5v?zT(Zf*n@ZdUXg1cYDJsNmZ_LC3{>LmCqBkjm}LIqb=!Kd6=&@ ztAMKF_Rau~T2%(PUfM*RH#*m!8lE2YxShF#A=qME+fH<3i=!W zTm)9ePEnyAcCzD4)eV{LjBa;G**y&bI@dMf?37)3hE(YeA#J&n`wEdY|HQWfYwppn;^?%QU4L zGpFYT1(cSjzm9$#8=dZ$>AmV2%jP{RxvfR^48#Ycq4BG>TL$M4ZqdXIKzFIPCP^S- zL#1VybARQ0o4U)Jz@TW&Otx?HwMs0d=2b-ZTRI6byf)*doEjHn)_|(>P^nya!ERaD 
z1PSU8{*R_$Y6}1lwj(2PsE?G2J8D%WZ7mfo*#+w-U%Nz;BJq~`?5ml%MI+7vuch%t z`G%2C&ZDSvY(rhI2>~*wfnB68d6?K{R8^{=Yh4`@XARF!)4Q!cjodexAsFRVVK$|< z-ab%8b|-XyHkUIS`z}xbG<%1=V8o-C*yKU6n!e4brda193wdi;H%1}p3lXfFI-;G; z4>|LYoytv{stwcFi0tTa*R>6qs!f~fin6!WB{~Oh5`WKlDUg2vOCLICXz+&3$IK_> zQLMra7Y+k$yWh$~{t*$EkU809$F^^mqaPl!?;%BDVuIM4k9YQ+ z%1;oa90Tw98-w{9-`Kt?pPLT;kqbGp7n zA#)*wlHL?axNWrE&ggRpW0uZwcmPqL8!GKbiP`CURh4U53}Nr2Ek{#eXhHVQP#`Yb z7^ zmjd+T7>C^*o$o*C=G49ul}2zdipWDOTMQPmBfcRH5fWSc2v&1yUA6} zicgGbGAXN;?}^`6Y4+7do{5ru2CXoY!*425=(*)g7md4+Sq-Tp$g zlgbjG0rFDNkEGBjob5i~FBhpl@llvkd`aVEb-ub7T}iYTT*b!Oh@^n=aW`HGi@HkV z8xnK!2C%Y`X(U={!Yqh!>V<}LPv;}btLx;Rkt6pO_Y^L_r1_Xl2NzLT{VKE2SrRTT zr9M0*7%K{@6E<3H`4!srk%>VZC31*RCyY0pe3pVy7fl-CXtob+I=B<1?hDeNiSdkw}VAt%lw#S*$#w z2v}q~s#7`iOIz)F;OW{)?)xrGdZcrovX9R84{HIL=i4JGxEmBcb6y$@Wb+T;#5n5^ zpY9?R#lfX2R^`Q!LU>`(sMM#{g>|xOw)i31J#ibVJ zoQkoaiJKd;2W0kTxFA%Jdkgy6I<1aB<3oB&&}w&i7Ds&*hoVaVwv`D{q4inGbNHnZ zavVTBN~|KR6cv=b%WT0#zL1zR#Fb1bu)$7O3OB-}cZPbw^$REHD&ibNSJje{SNSRV4#Z6N(yh_M#Sy_m+BjkQq^tX-$8`iXaT4#bUov zPuL6Ay?|fzMHQ`S{?hOJK~F%Lw3Z@T460u&4qt{JCI>Kv1=z7(YIfYdy5y>}S#0s} zD|6o(4#!n3!^tcc_lJlb?(lbGAk0QPKh6Jeu15LZ|GSdFN_~@E;InL14oXF3xS)cF zO(9E%`t+B>wrunY;f2I0S*itX3MTnV3`ZodY&o59>CM!{p+o#k)M+b3cEG{C>uj_k z=UbZUj?UfPHQye8ZFJk-I@FxGq2q;|5lgbtvmfPD(&q^s_{u|^T8usz*O}nz9Ytg- zAf}=u8nW&|-_w=p+;83`cIBiv75&!1nIWabnge|mHtu#~?og=EK4Q3!)FbQS z{-na*q(s=E;8P9}j6Bxd@f-(}WwpJqic>DjC^%F$#9c&}=bGqYCGaAiTn^*}pIu#T zlNIySi1rO@im`p=NGknnM6ZwBZ&%j85PrS8%)U$KZM>fAigf961G=4m%HP<|` z2Fbvw4u(XW@ciRK3hoCvB$E9gp;vx986KzoevjV{R@=O8!E*Ql)w9oVfGBUVMM{sj z>tBAIa=~-TC4j|(d+c`1qS>9kjl)tkbjyVov5+NG#>1jiWK`xtRkcsZ5<>5Nbw`2K zkF$hsbFYns?6S<@%7CV?DxIIly|$)J(K*jaVg7?<0B>HI$=y6;*R=6eaeU zQxS;1w2LkyPxG_!D%iF)1iVwshF@l~q6@^+7tH~rLW8#$@sQS&HY`I?+^<6x3-GeQShF$L!{ zzF}PsSUr=Jm;l)poRpfM^Z|O&>MD{WaDTdHe+E5FiR%CnYmV${KoVcMWSWqV`PDYH zov4ErCkS&V=*&>9?Lk(se8@Kff%x6co;;N$|@vLNSC1sIaf(A9hJ&G09!043b%Y+nRGG->?COshbx}LJU_qX4FS7-Ox4sl>pM|8 zme&WCHM-dv5oyE$V~xt@5dI1jaN=z75&#}R5gOJ>=#hVV1U|WpfHuraQFuB6o>(o! 
zm1m3`IV7YN#bVqpXmzL7e6Kz)kWG?e;a{y zd&RD0U6D~SMao>S$wiThCgCIlic*P@j7_)XTKCG`tzmP-J}UASpPnSLH~0CYvf-jZ zUXX)rz9>ky4`(HMfwry_grGdUVWsRjfyy2=Au5LR`ION`BI2kpW~;CU!hdWdl(gCH z;4mUV;zCfV67gkM%;b28o`zyXq4%X9`X?}itZ%E4fiiPZIf8JRbU{g+M8|_NT*xFC z-@_N)kT9A7OGOw~5(dXlr|LT=WNdsbn zp;WafAx6%JId9>pdb+naL%3i4B3@KzD2u;WPBk0}GaslKl-aet#Xu?W66%7_wBreN z`|NKxUOUiy0Gsz&s$twy%GzE56I@Qb2z=Q8Nb3W&IvN*>lugWwv zRio#XIm-iNR)#nbsB!T^0_PH#5srkaR&|^lkPI!%e3o`P84{@D58yZt-ol}Vj({Ld zFiGD7$Q`Nlmg(S$xFBU~R6fOmn)M+Wiw3(I1R&jRm{5e+X{t=`rokjtxN!-$#`k%i#7ad`fKy0NBeqDc)wHrQoJC9aqI2`!-f3?JOVu zB&C8u3M?6O|W#MVd$_a78tx5 z#dVIrJozQWOLX*;_7K9a&z%fm9Q}guHf}e}UI&^IS5UWCV4)~r~e!pAAtQ9Y@!Qnh})9p{_E^@8UbeHn<1ngzNecC z5D!(FEHBTLU#KL(Gb9KXj3Muz&t62MZ0ENeyzH*6VzYy1GN^$?c{i8`mGK@{Q|0^hJ}@-a?uY^RkVg;*4VxJDL~ z>k?12#2#_Ojo=9B@(*`WaYB+NXb5Xp{4UH%unq(zmIWqjt@ULjlH)XN08LVp_qP`13J5IC`ksj0oA2sdr19k6(&L|wRA5z z-TAN}X=2VHThi6Kr___Mq(4Pbmd7wWE9FdnlX3j;nx3fhrRgi-w^pHH*T~`DWsn5Z zJ4)qP^^9&OhtU^yNbHdruCTcY(L%@%PW#!HhjW-%Utl>vyFU;m)`ntOkl4#Bue>ng zu_a5e<|kA!ed$|b{3d6_yE?v~Iy}C@vNQD34-;^L`^MFed=B0d`ba7<9M6g`h#}(( zz6fKz2gP~NqO=fN_@~cU@q^UFpIVvC&tR#%n2hKy!Bn@$@ZMW%#}+hX%tmKNI=a&Z z41e3AlX>+GjG9p(L{89r&!|I-Z9j7cweJq*STl=RD!M6;sfIPbBLwVu*OMjUc}&n*hv;n(0~kI`Vs*>|_9zw6Xu z3UC_3o8=M(C=y=!PcL1b2vkFKd{_j-BVv+k-fluhs>z1hjyPNc(%*7}SxrY%%U5Du zx5b#UOu_LLuZPS~KvPXX~smK6%QFZqW~Tdh+}{B z<8i|Bq@>$p+(Cq}?VLyQy0BSKbGGVq5|MVb12RQ@&lcB1WGb26-lok9R@S|(&I>AWA5MGqZ`uh zGw283N@9;^axLjM44Kw{F&5|_lLu~D%ktbDl`4m9b0_rP30yNDzDzI>@S`S=8XF(U zG_!7Or-hXWzUQxTWdnFF`g>!_<~Gds;org4rsBcy^;$o!Ox0zcEtE-%4N)jo@K9q# z#PfKhwokfC21_sXT4!kU!Ipx{{qj+&V@ z1QGtRmMFEv+>$L;l7g4j&8 zPdle-$xD~*hGHT7u3r+a(w|wzf1SM$x0C>cI;%(=<&fTJjdq^z!NX86EVxxH-I8M=qBp}^=%k6h`DXZI zUk-)^K{9FrTi1(+HZZn3%V~vqCt&ihFAKz;6O1L!s`wsIV96$$4(>x-JEdcHtzePg zrD@Nh$Gz#7iaS-&jb(wuaYOzJenp)X@oYhEk+-81C2Y0YV3=hhHf{Hd`8LfA^XNtG z-3E$2*uA^EvJw8+>d(oZ!}UDcLxrOPabK8=RCh#&PAm$rEq3T5%RN8Y!i2;&57zP~ zmF0&!zNCFMXkuS2%ttaF!WEDvP9_r60(Glmm@RwTWy#~`Ro(5i|5o&VPcZXGg7Z`E_6LIVq%Zra==!G! 
z4$G4i?q3yQas1Mk{VNOyB*Fs8&HiG|oiidLY4<-AxBZwff16oszoF5e($POmtS7DL zKR>ZJpA>Ds#Enndwg0=g?SI!ca{QXV|Jt@YONRA1TV}j(E zAUP&TjtP=ug5;PWIp+U(Ip$9=^8?oWZ*t5ZJpB&@=LyRE^Jwf(5FC&k6GU)81P4TL zKm-Rwa6kkHL~uX^2Sji{1P4TLKm-Rwa6kkHL~uX^2Sji{1P4TLKm-Rwa6kkHL~#Dc z6P%x5=8pvDr+wfb2oBRPwDM1BaQ=1G80Vi7WBylCjr|PRk0Ab?YV4;K;~xV1Z-R{EmVr@QU?@IRMt-MKzrri#= zY|zcQ&KhfdFM9Xzy77`Qdx4Jduq|tfx2@)WZ#u$4lgk=w!DW56G`ZK27H`30aP~;p z0J1I|Ws$LUd*t+uQ{-`rllyVZX~$h3%p(`$xAyK<7>cVmEEUJAA@`9|2VRq33|`!w zX9qnUuX7(Pe_!cd@j9e9X!+tbzqIalgtJ`bY`Jn(vGj<&;IThj>J@p|nE0ig!tAiV zwB|lZfX@c8DyX!}`$4h|3*V927>fNJvAU@4ALi>V0XZIjR zPx|0eeH=4Z;)T29SOA$aGw-Qv!G)-^JbzPGk*Yrmb`}{A{`Ghf4j&2*Cpc|Gion_2 zZf93!oAT?~@hVqGGKq)xW5p>y1$s0f&W9^X8aH_+tD+aj|F))g-iEEQZ6c(5*@!e= zDHK{`p^W8CMkoXTO&Ux)-5oI-#YiGPQg|Q@DECDQ`CVEn_qUYvZ3>jVZ-g$^K{n3w z9q^RZos@eRnHUKn!^s-t^ZS_@fC|m%Z`rkC=pmRQXu0F)fio^xbme^ISW;+6U?CZ> z&u^nP$Hjx0r@w80fsY}0>ZEXYyA*K1yiKX@YI6T#kIW`|s}^+x@JGwsVXL4_Kp4Y# zypfY5tJp3!-WCI*$?vKtxi-XNDedVwSM@X{O2zE7!1jY@$!ZB-2m^a@(zP;l(w>Sm zIvXug3|i;BDCAuwvS~r@fX*q^5y#|m4Sog7;Ad*YUswM~;|-xE=lRD=M?wx7-8n9pi zwNaYkm=zL5rs>t3rRG4pl_z8KspSFCS+Q~=MW}hQgq##XsZfd^%QcAsQyGyQdW@p< zEKGC}p|tm7DihD+zM|;Ygx6=E((67&C?S7Ywq03~7LzaQ!ZQylm~Ey$Cq!>r`CuJu zmvt~nQUMrinXJ?+Rp=az$>`J7FNZk6*p}G=yS$nl+f)>7T2p6q_>j$e_T38R!I;k+ z$8#m{qQ9pHE1zdln@f~P`LMs`{ z$*33GZp!O7Swd^61D72p#lQ|>#zfLFcs>ra*L)?3Jgw8R*B7V2>3MsN{y5DP~?J$ zzG9(?90p>jZK zHjHF?L!S_R!Yd_WDvY@I%_75(9DBKm+a`!^w9&j5NmsLOV?^=0-Yi%344LgQVyEPQ3#pn<=rzZN=pS9DaTvxM^_PIo~nW2Sgb|eTN<#V z1hbtnDT zVh5^3`gAQZsT^;UJYvv7_Lp>`ULguh%LuYJn>acjADrlNH-GIqPb43LC%4oW!X5G* z@Mp2rcMED`mj5R){MeBM8F}*o!tzB#xflep9g}EC@`YQ&TY#W>pf; zTt!Lsw5OsPE>=-FOm zWLn^-THsePyFxW1+cPMHJiemx)5?n=o-r4zHF{p9(uY;1lp1S5(<2xlXh7As^T}LQ zIMKYvmK%$fc0hhmz{)q8!M~db#@W*PpnFkmxD{uptBRyNME`n4j37?FRONAlNWC^y z?gV}MmX5LTCZ3_U88a4XlCwbR+K)WO}ln1}sq3a-UajW=AJE-6EZ z#75{jX_&uZ1nNSj#5E0eF{>=0Q;TGnv&_i4TQF)Sd69OA z%zYg8l9F^$s3_ug6buQ29)p>~XRExdsjG{3uQkYcTZ7UBk5+i%+>n*E3T?Bt)i!Ip znAdf_;Pl&>FsFVm7#B%ppcsqUM$_LLXbFJ8$>9r#=@u-z)SQm=s 
zYHlVXjl&C)N|`O-G#|j%f$LseC3AH^sEbVCQZUd;eU#f)Tj0G|Q;=*{fE?{I zqy8LJubhh;6gs68ZZfSQN12ciIRHilUL2St3P$PhfkQ?ca>F6Qpt^EVLoG9YUmVNq zL9wjWfYvFtv0&=G^4PMN>bk@&fV6>uoT@+ORlI&&J5<2{XRAiQx?`aRvgqNkm=^Pt z&Y*f*#A}K&)sOkoACcb4`}>Zd0CJRJ*J_H(Oy5YlsfNA{4=iT055d8K;47R#a7!a)H}oMb*G_iul(=9?gnk zM_!9uy<^&taW<}zs8kICEsk7o>vBpvv-A~UjiS7wlEAUeZIRMu~L25zJH z^H;}LJH!av?>G-zkhRMyD;Gt;`W;{E5NVgjSyajw1Y*c1hMq&cc$ByFWvS%~W945L z-`fMoFGV;k$qjtAQD(HLhKp-8Eih5mid`s++D(!YQ=xkO(V|B??0lg% zhlru#8=uMA%8@uXNlP_s52ZYHi;>dai{f}A)j|^w766*0I%K9k&gL8=E&i~w0d+fc zU!LhMu#xg(zLJH^z1M41xKBw%ETI(uVDfwnZypf{>II?<(xnRXj>x7NIAgoI7;+o;j=k8 zCX!*>nc_ImG7fT7Swmqvci})7wuxiBAeTg$7tQG z?@&qYjF$^C<0EO(qOqnSR5i$`SJKQ#_{l)3R8le5M6@`vA_*uBg!FGs~T9%va^X*F&t7q5hznl#BoZO zRez5Dw1JTT|HU6&2AsONy&nbTG@eEJs-Yo#!@XUP@$*anVGbIizAVOYef92RXXAe&*gi#o8uyvvz-$7wt6Q{ygI>O-Ol zwc%RJkC(BO$s!ckarQiM)4#sxMZCSGfK%wZhT2F4wElhY#7JsK)kQ9^!+Aa_>GKxl|ng~~5-gLj=_n~AvO?U5yPoU(s&$+=t+pwz+Gh7fwZ}H(!{8U@= zJa0!a=~bCVKzL;UL7*OI5N$!Zq>8#AywVn#YqR`3l#p^GdXX~qGh>aK&@5Wo!Awom zLTOH-0CeOf?vmKJHXJ(s31aLWbdNDT7J9mpA#A?IuVKDPvBIqnH9f5J2 zID?1O_BSHxBcB{0US!rztcmp$l*MDyuV5H{NFh{v@y?w>Q`wdRm?i5XM{W3C01?{z z01AF34%oI|#N&iURW(jm%p+h_dt0wu6bV0cv)RL`Yfeo`2lMjjFL4?^9EAnOs%(6* zXoPs6vwl4>O9_|pPH;duJQn!8eq1~Og7<;hq^d`4_`s0v`{dwhha5GGJ$NbVleIvT z0O<~?MrHrX?Si4TqO{r_;$}X!$+@u?Gy#w-t?5*g4a~qSdaJ&QaY{^Sw15tmtGQYo zjkh<6Vn){GW)od*F)AGr6-k+HsL751$@j(Wq--@pP6o67)s6T9H6yM~b>4UdB}NM9 z9mSuY2}4=a1Dz4H@LChw$d2lzIbLtVPMC1DQ$s}|czSH+4Jdu8?88(ac2`0(wcK_p ztscbr9z@m%pX^%@AiJy5q$V)%74VL#+@C6)n;%-t^WtT_5T0Zq##7vGFLJdUrSG=0 zi=dtd6_QFIY`Lql4%y^GKA5QQM2jXJy|BX|J61$wZA2z zKalYahj7a?HBAz99sOEYu+947ZyV2@z6*ajR_Utunm0C>G@*yRCyRsN!`s9$c|!It+q;neMYaDzO1+nm7P@eY;4^N8*3R}>vFueT=~ANBr-cDjDW-{@yjr zQ0ky<2n=<}(IHN_7QECQgxl&z$M0U}oI548-|uTS34uLm39qIh?Q7tvfB=bc1CARbtFl`(l-a|tBpja-PMKEK;XcmI-%tq z3O7UHp>eaL{hJ#3Y7X&{)+ZkIeKpM7jk7r!i-Au0gp zCO13ioO_OJn#pu4#r9)%6JXqlE*vLNQKX}{>Ji4?X{Y0dk%~Rg&PdPdH%pbDNvR)+ zDi&6D_TNr2|CTG{<_6G98CjbEO##e*jU5Oh!=L!lqr=hvN?YcCM(huHL6#rhAAiUT 
zviwIJZ2pWj6eGi*k|QvJKbhEcW?afzA87oegvc+0qIBno9Rb>CZ+zE8D{$*e<}UQKiG@S_M>$_!u@wU zM%jO4ixmL$|5Rr9PvixeKs!c3J4Qh}MnOA9K|4l4J4Qh}MnOA9K|4nO7kh6VRpq*` ze-jg!^rS}ABlul^`L;;Z&Bm^X+q`SKt>68vh$#>#f?!A6%tuxNv zXOF$dI_I7Iq0^!0c%I>XKhJev*Y|V55~HxhC@e7wON_!2qp-v%EHMg8jKUJ5KLcz( z4yAs7F39mGiP4{F!++@X>VMwE%JH-DRov3aQN`3j!p_Ft&eqh{3Cs^>dthk|jYmR% zzRgEUni|`gn4*b@p#4h9afAPBF&ci{vD;hwo5Uy=?;p zoX7voo#nl~y5F6+^4{L<|HO&wzmUX)wcNs5ZecCAu$Eg`%Pp+s7S?hLYq^EB+`?LJ zVJ)|?mRnfMEv)4h)^ZDLxrMdd!dh-&Ew`|iTUg62tmPKgatmv@{lC$2`@@;Jb!&LJ z__%+1kz9XbJ97PpKs7%>>Hqs}#~*j>ceW$PpV*E+@ge_^?f5T&YPkM|frOnw7#jY+ zK*Rsnpqiid=x;zZ9Dg$E^ApbW5B(4L7cBjsBSruF*Vd0a_Pe$9Cj_8>1_1Ogl`j0; z9sg%{7Phkfiy>UN;??3gV|(l zp=e)wyIZm^*vY}!6wLMyPTzokO8_p5pL(1q1`3?N$-(`+m+rqU^iftF-ydwClAL<$~qA^P!BdyY?vU20ge2FiGd&v-oq|TI! z5SIA%P&Yub8&Au(Y&t5&%mS54^g}j{(Q}J^uEF$@@|s7oCS^hfmi}SYl`8$p=-^k# zmS|f|pBp)!yrK(_U|4x^Btx|}jPPDuA|0nZB9ozW7(hUq%7hIL3YH{jAK7;P1b%~| z8=DqMA*HLQr>pDfi%R-IideI*gSjWN-pU`sXt@SYXAzmjPDmPbzYmQeBt2_tyWNd2Ykeb1i0&84c}AgWjWbZP5R(?$ zvpPFV{@fqgQih9&Zimj?GNGE813edvnmYDWip)?!&5)JZoYqB((1he^nO5OMs~D+) z^5@cLNL%vbxQXLi7Z;wxso^P7@$q}rlPaH$-=!3nt6<+lM=gMSvE~)0W;X4Ty(gdK z*t~Sis0p#*o!%iVEdx#upk1|J4(CCGQ?z(J8}it(<-8=78aY)&?>JT~{D z=tSb|Ih0uZHvOmmYoY~Xhxw1&yz!Zs^wAsg3)41?-k8qDwS|>5(!RNqV@k0b$&(z| zCXcUenDXH*QVpQxj#TaIFBs_K%gv{ELO+{~7+N3(Ef#Ef0;^Qzp*J@_D-5Ik;m!wQciw9j`~%~5X_6Rasp za&U+Ons?OK1PAPj~k$f|~>KFmbR_x}CUd zt<#1reY5*p63qw%8J*iUlQI~Hapb>5`sFDv>G8;fSUh3AW>(K9>k22u(ucYs@cv-D z%#Wl`Y#ZL0B1a3ywChMfqM}ydl3saSl|J?~ZRVCQ#;BvPOy_Rpu0*y0NxR>@#PpH$ z=p9gbRKzUrF2zawG(g(-zMg$Ebr7-tJ3wA)u7?8{`fZFD7#A{lnUqyQ2`LfB2!!A0!(lFk;Ca8M44Gm?O;>syq3@p=TO18t91kWST<_q6 z#FfOUHrLmY44*zhkVxiGrbA0yR8Q^1d2OZoP&%fYqUj;^!7XCpS4=we(~k+}ADj3u zpV)bSA{aRSHsSo6u}h90JotZfzyIB!_`i3*-%0qcq4or~1L=u4f*%A70s^2v0&wud zgENc802ox0r3~7UnNog5_dt)I_=AAK-Z9>m$r}l~DZ34}xx3zbt4Mz3*#ik&q6*0) zifjKO*4%%0xk0k6dUh#b#$(+Y1T=+{6(>qmEq&# z;#3#}jm6XAh_3raS`e?L`963PpnBK%8Smf2b-xH{tFM2K8`?TA3JD{W2^U5NoV8F~ zUV60|QP@R+EFj{8wD4)JuGoGMhHp3*a*gJx3WQ`-*qE4V49(P3Zw;bG_c^G9b<%+7 
zF+CFgredm%-^08m#r^12`(IT0dMd;7#{pyl{8!V$>`hJ7ox<<&Dv`fW9kReJla-a7 z2yK#h!^`n7LpDUCqb=7cy?~J#-|(A3!l*vg386UqSBQ?z>`YUgK19X-qqsYZR;JRR zMIn9mhdBlLhEFw)8ifO$z1`j2{nyCMLU--$`84`=h33-(@;i8$7}%It zN2kXp-vGgm$GSSR_oa+(z*SV7bgW`Y?;N;vvj=*+tKvLGZ^R6TBqRS@Bcxu9UZgv9a0+c57lI>77meQ4%YwT&DN;S=g8uSv?}- zqBE716~iK41Ea!%`hf_**DADGnztFFpPE66HIuPv=)=aYA(ro4v%+O74k(K!am`cp5f?XKXz z_0o7~ns4{?`K%d2p`GVNUw}jlbw-Bq@-kj@G~r9rFk)c^C8h9)h`KxEcS+axyhmmY zpIX{-Kfdp+6PZy{peoab;400iGgIef+I^<|1SxYM{8b;zPV)V_D?|zE1()G(&pp08 zO-}wiwI({bgO2|4kmoL%LU23jcYWsMtk`u}mUPoYIpEPW6@-({5R z=LZ(tP4_LD9NkX;_9ZSWMelI1v*w^HXM<$x-A;fvyfABFe!-_gJ8NrO8=FIGD=SB1 zV{aR0Q_G?2CSD;fp6|Am4WFmZW)bUnRp|(IR%4^{Aj~gE=U5mDtu{C!Ubjq?_xEL3 zIbqmVREX|w@9ysIP>_+4-M#BazT7dixVYHVbUnAb+p+WM)$!P>R88Mgk{uhK*wwi( z%xK^&fAb^1_c}3e)U>9*nYlTb^_%HyYU$}_);IASdpUV|c{zBTQ%}7Vg>A4ei%i~CAyXJVyXXhAjwKuKA!c?nV z5&|>RMcU^{Ob%+s{%v<(6H5*QU(KmlkrqwHCbT*xyl9QSjNCSWYrPQFlmMA}QALqG z*D1y-d$cyM+SQgq)}%}Gh|WxyGZ8s^O@j<)+b|5cumU(#M)b8C@G$qemRveh07=*6 zeWuRZ^e^SFD&!LD0m)!!Pg77L%NbCg9fblQS2uV&=9ggTn``L#ft#zdZD~dpyDrPA zBi$e|^C4uR8)CEo)HLn{RRi|&agY`9WBsLp$9VSIuT5n2ty=||Kk3Rel173CZ`ei` zP+P#6XU5{6y21gZfEx8GK`z;5ApZ>n9^s`u=!<1+eF!`-x}5g=#I9Jij_^P-@(Pd&I8>VBqQ++5?EijtsnR zYfX{YCqkr*BrDzuc_M@1_iB-7&Y`Xrt8$qvj{r*}93J&UcO_A$ds7})ewOH?jn9F) zZ)js71M1ZzG9a432612VjxoP3Tq3|{a(Jv!K6uyx10At1>advZLN9%n)GH+3x zqHN6mY~;K1UI>ocLjeQ>_a_IwzLQ^J2@7=We=LEifs%!4{Q9w3l_2H5HBvxsV?vX* zT(-Wl!91kaxQoQEe~64tfWJ;zIl?6s&!VuNJD5!$dwa2ua?+In$pH9acrcE#$%DiY z;RMk#eiF2A;Y{Z3n`#H#z299g83nnZrGfA{J|rhbI6bFid#G2@>V!~Akh+5`>1&QG z?fcsLOjukpeNJWMX%@gDu2Y=KoG%bUYV^Dzun*U`)Yu4>o*pF*?_2<(kX|GW@a5Yx zOgzpa>I_QuJ6K=rW8v<NAvdb zDyp+4PuE?g4ODzXJA-LY-Xu5Ti5Q*z`4v-nd#^mHEiNP;zB$0r_@nI(#d}>}J>zH{ z4>B7js#L?kBvi5z5E6UQCo$uh5k7`M`_hIwC)Bh|>|Jao8pT=$Fi@wc6AsV<<^y)n z2Gxm>k?+{BC}NjA(gh*nZkyXdxF6Ok6jy*TUQC*Qah6C}H0!`I8pv_tkpgg;>`|-qQBW)$y4J$oTAy{R9M6o@oWS8oq5owJr8tK&o;l$#-~)Y&Y#d z;6g!l4Se8u_a!>8&X|iLHg@rDw-Yx2o=r{j(Gh1LK~s#B5#{yb_oKiw>Bj5p{`Tez 
zc)5M&%{ppsVno~p|N0WNaLkVT{&T^dk4W>B7~qUBZ;)6g_E}C&PerKGf?6A?D;E_H zb|t{INh$QWaO{=r`v5Uy%s@tAk5!Dl-BD4U+=(4Ev*3kF)2dK}W**o5av?z*V|8Rg zKW~v9IJS6}bTrUI-{&ApxHfnd6qkyLZNkH?ucwQw-8GEWp_c>))y-e~?o+Gr3dvD} zP1u26vtj~n&p<%8)o@=*u>~JQavei_K25E`!R+aZ(>*fCQMqKfUuyXXpO@;NXJq z)Xc<1RyMj*tA;2yaUN3&@wd;zr5tIh6n=N;1$C;HHrE*6bm9*su5f!R>G`F7z*rsB zGWM1j!R`Zxl$Vw#DD0$$y0|r9~{d?U5L)o->8RD5n zZ1tobNCdi3@6`*!A6wbYRDT-i?t;Dx($4NNrQAn(Nc2cI(pi5L&-0;%Id^)Z_M`8$ zOP1fGhj9^s)u$y~PL_P7n+#I+zLPyqhNJ?sRhUsR9x_1Rd3a>Iax5$MRGuWR4@<@) znbB^gg?0K}blrYFiE- zZgfcLbM!Ljdj;kus)!GDeTYnsr&IIh73%z&J-puCJ5N-ljFY96eB!!F;1|W5u;E*w zxx|L~Ca9Aj47kc;CfwMIf?K9VjMMmnkPa~s5RdtBS)H9MBuqV+Nqm((c>k1o#P8(; zkW=$wmjb^51`cr`9#$2ZG}(4I;)^@%bE$`zE{!-0n30X_ZQ3i&Q-J}&;@RBnn;IQ~lT5p&)U>(Klf-S@#H}n0dyAhDUM3%ZXT|!k%K1e|VLt71J*G zs95R|h*4brYod?MT(qXc5qK>(;d|tT#Jf`g`hlG~Y>K&gJkY|#&3$+by-nDUyI>qO=h|PrED<}ft!1{(j;gdD>0f8^s3*a zxJp<=R1yrtLu;Xyq>9p{it>=9B77-{vnXz;`4Ix@#a?nSU|K-5w%2sRtGu5;Dr5ev z@-kAzFi?i-RoLs$SJ7Rpbe*fAXi}k(D<4qZ*H;#t0W@*_ZrnGR>;0eRG%C z$4LoMDJfa9b?No*@>TDxlvKuB8NUDO~l1%R6Z)0h>MANi@)Vmfo?}|`R3-ZjLhbN*3<4xnkwc8 zq?ACTAqGj|W`qE06^reK{f};KnM@mBX10)ms&T!g#_BG3u%B+-9pmFB#fV5sOiI$8 zK_F6$zzl9bKbJ1Uw#wbpcoZ8q$5`!7%|@^0_{9CdpZ{%#uM7~K*-v3PSZ+B+qtqR# zdbqo0tnB!eBFiV624~^D?M+@%{aWq>4-bzE;mw^LjsyiZed1TT$qlZh758myc)3T% zW(tRih}OP-pzXYSXm`oz=x6kTmT&yO!aLc;8& zshgXLlarIRwU@H7v4H@mb#W~hb{)Cb{lR%FP8XNbGF~3xT8+nUEQ$no`ZJ*T+E9@S=7|lcC1EXCBv$$EGed$O;-jR=Nlhb{keTOh^NvspG($g1Tv}<$<`d+!P$d~SZfdLw?vK^>c^;kRxMc75m zU2Uti_uRCu-K%H%Yib&wdiInpYQE#VgSje@DN0WdRd1s4a8kbX+Gmr7Y&T8ltq*)B zA9NIZe8ot42+{wpCO&}lH6WG-B8!`_5v_k>Z z(WTX!gv(o>hu$kvY+%R2J!@H3`zDv0l#--GuL);Bu3U6J3t+keHZ?CW_8Q~}6R1>b z0fLe)UpK>JxCS;Wraj~*NVWIV!)Jw)4xIFtaSuBoeq$jZhie!aqJ+D*8`TO1$YS_f zGDyQ&d=xyCB&~iS;r$_P?(7btHeC80lK1O~LqUG53Ql)6x3*qWR?lQe^1(rjIa?Qy z(c45+HXP#R;;M6;aNfu{2q#P)`QAnlE_#Sdfu9DKeSv=~uth>&IeEB|i~WKs|#jz!K|+T*6RAPiT}~+`bi!At<}Z(&+PaAVMPVA zx?olp%<6(!T`;Q)W_7`=E|}E?v$|kb7tHE{SzZ5ot*#$x>sFZkzgE|6=hN>G;y7JL!`;*QxzF&cP|4+wap7l41VkIX1o0Cq 
zArCJYam%O-Hn`9h+_gKc+VXb89&pVycXtz^Xxq3DB*UlLe-Gq$?`(L&pX?8od(*2- zBa08nJ{*WP(~v2VKVZ+wCB=UDs9iWrKGv*)`8`!gNpr2$mY;z|Kt!Ow-!TuHn#r^8 zxMDibtm;*0NT@uk`D#f)LqLk7bgkIn4igJ&vkV;TB=y2{}CJVhSf9<6bglOzIpHp|_mw1L3jTQd-S^F_yM(qlXR z=SlU4ksH%)lUB2DBe0KCtDR(I23DBLlzg}oo_l?ZjMy+4-5b64J^XG7R@B%lx3y9& zA2$zo@0-gd%J+MPY|7Zjo0fF-X#`Qjjq{Y8XEU;{;Ad|Wm;*0eA^XHDx}?AqRUn##K$y-W^PULQA| zU-!^OfUhdUu4xe@4a63R%8j)2|LAQ0Sj4xholk)4@5!-0X;S0h{Aa4v{=+^8>~$LU zIt_cBhP_V1UZ-KN(|@t#*pJ8N?_a0+|AceR|1WT^|Gh8Mu-q6dHwMd%!E$4;+}MAY z8~d?`|Jdxg{!T~`;QOOF0o6zUxHka{3B!0eFdhz!hXdo`{H1s}KlboHZ1#T^5oZ7E zM}++X0SJCYNC;vsKrsMPhM}c32tEMF%hF9~V)+o9PYTRq76SrW>bvVp`<;{{jBlLP zocBE0*nr&k9sA#XPuqI2QvB|`HrL}<77p8m&wy`qG*@w+0joj5yG7-F`Ezr6`fSlV zs2<4g7X}o;ApfeGmy%0rbhuSD%kuzR+x-YunHqd33da|U!Xfl@lcUkon^Vmh?dZ@5 zUXu>jIX^#dq^PZ{qrp|iE(YGBaFC!V91XrS6`GT`Zn90;`Vfdgq%c3fWEv0?UClp8 zt)-`EP$h_pMw1mA4P!qJzF&Tckq{M0M&Qm8U>8Z;p7l89J4$x|L0b2INax4(_{qvM zK#W|>M07U_a2FR=~yc`skic&##2e&B?ccx$q7q>dPzX^=0qCydplbc z#Q7?EFT-EUtK|HRg;V9Ik=OSt3kM6z!eQm&D*Ylvs`x7lr|cdp-92#!@DwLCmrQc^;1u>VL) zOZD4F&C{Zf`(n@MP6eJMxbj%$bjjvcezcd0?46caUkicr9hBIl$Pa&g%fd1F!NU2# z^cxmV{ZAH-@=q4d#_w4;inlBr4D>g$lCPmG9E_Jr3S%bT@AZ#9me!`1Xz6RrAC-J} zmd|S%9GaS%9XVY4gp*H%(J)`Xv9Yncxou%*0I6T3}(_qWxJ5kXtGSav(xZl~iJ~?^T8aQR0LW4|%hqr8O zoU!*H82REN+^$*(gQf4)d2V(Dx&!w_1_5X2;Fum7y@qQcoayDKeE(td^Y*palm5HL z`Zo*gmC5W4?-NTj&CEWHS7Gx`UtTshH&abEH(%EFnO(4(m-TCAW%uG4Q%fLv+f`%q zv89OiPtZxDT!hSS1I+n~?PL_5x;%8>k(Y=V;^x1!wKj5i`i0Z#${-0PLcWJ7&&q0R z3py{UEj&Flqp7d2t@}hzr^vggUqM-^7CO~tJ$|zf$-}~q1Qt3(M1NMn1r^k5tLRM( z8=+eCj(ci^)Vv8Z)}f zQNG#ZVAmEXBgIO#m^-L_H~AM9j)jrO_|qhP9jzy6KUg?!&~a5Shi_cBES&7<<&2D$ zt1I0F>-+7Dh8<~eD_sM5Ktj+eibA;pdR!NM67wnE%H>OW{#_PjazemEa;l}uW7 zeHm(xb$aSjK0(Ant&3?oPb&>@`-WN{22gxBX0;q|dUyBA$!NP`0Iu`zKOH67yDwh-18Me6YWPhw6KNgYw1hOpE~_hI@QC)Ze`=BduuY@or{1 z3S!JY0Pn|y80trD@QPAm9k=%L17hxnnrL?^C$&3S1m4iv6yMz{s`Oxmw}Oa6Zf-sx zYY-zdwo%|c#iXMd1W!uG+1Ku&0K<K_VsxPYJr1sKCGcWd3IG;HJ z-cX+i7~Q22KMSLTV3>nC6YCYAs2&;cqxLe$3A$eLK-X|4st}Dlv)^ux6UQK!J&X3C(w{qHf;_sbCSH 
z8Wc#pqDArC>grnLbwbz~BWmB@+47sM6#!gdl_7TSVtSL{G85snQ_6oZG?G}*d-mbW zA0Rk-{|^Lb673TCyI4!bQQ?QF1;LWE&p^XaEnUUDP~22qDs0@T8bf0$OpXTqX0uSz z5ss)}RA5KP#o(bXWPKS}n{pP93s$@gMkSeynv!S~r=TbXtbZpHg5TD*(z<9jFTw&L zMl`lUTs5=qG*0kvDB!FC=hIZyGcv9|(e{9M?y-TbBbgVU`Yr>@+m2=$S2moM!#;5w zz6p*7fU^Ad7kY_vwJ+Mu*3u>>!7uK5PKN<4h!;%|-2EAR{cJEvms4!zK+TJD<4Z_^ zv3mTX%+cHj>Av715(qMw7+FAU5}R@RU~%6bO0#)d^l9@QaioEeH$r;ol4;m|Bt!mI zk&%Ks-5B&juR)FvGV@59yK~m!bCBDcXwb!r@7%q!Zn+D@dM$JZT@8`Ar6RCKP)<{a zj(*UY0Xgno`3f1pTfD!M$m`vLB@UJE=godWkQUS7041?>wkR`elfc^z3m}5h%YF5s z7SLnBduCkp1MCU)<{HJ!!b zK~`sX_;p%zw|91)ONyh1`bpb2MK8_1fVlDTNYlU>*c&cj`D2g*)!dG{Nm6EWeE8K1 zr|1+hnx1z%;9Laf?k-rmnsj%6#9&6a@QQ`kW7n1>0H?!UfrsJ35L2>*6x!Am>Zc(@s-*u!THI=WcQNt!y^@GhiPVb&+!^5NS zn2&VC$T>)_{G}lGi@?I4J?Ay`b)UqO@Rg0ln$QJZ9IB&?no$*)Z>I2&giHKD;ZXmA z!kM=Jstrc`go9(z&nYrErT#Eg7P!hp-;bc&P!xDHGn`8&5n*X>;zWi7R*74Ke0ETn zLY7f@XX9HdiIheAz|cdK!)Vj%$)h*Xq%2?aMEeVja)j?aNz&JPa??GCL^|FG%7{>O z{6=iZ(kubNENBc-{OwuFgUq9^Oqxop;-T-Uc%_mwbo3rg zjLsOXukUi4_&4(jf3w9uy8qc7x3zM^5`7YbR1f;x&CZ%rPV8GY&H$ezc+9)FmexJ} zcwNk|5sgn7_w7+7Q@<_M9u!8mGX3P#Ia5?551H{`H7~aOq@NX^!Sg~P>)KBYgfI5c z#Kz6d!ouD@b>3u?2m1DbKz=Z$jCsBpMqhoI&+V*U)T_4M9Ezn;z+VGw9fp(oa8E!n zDOw|yiBUqJ^s)Z@;E@^^FdfQ}K$Z})NKe)sUC7PIm^HAlLSj5JRck@_<_SqNjR^bZ zSZdkz45x`RR7fAQ&OfYRQkzcWL3_?E3a9T^6b{AjQ8?KX8NWf{kpDp8a9i??KEnxv z=!e#1q`w6E$xt1mt1?gRnAd>_^lLN8DnOpQm zqdPTFa1q3ghXwuh_vJtd)(#vLq2-;SLVSA?$;hApX2M{n346vOwxzusbt5CDQUE{s=_UHy2UL1-eoMjoPjQO_vPAD& zIy9Dya1vWbS(5Ovwir(N@US>GULLpH&>-F*f1zP`@W(s@#}BP_tID7OVV<9f2L3-e zg5&>Zj^O^od4}K4ZB8QifkEXUNfma|*mh(b_7H=SbZfC#LZs$gh^8s3IGET4p|ig9 za$$R<-x)V|Ayq=i{03KGQdz$nW003)Mw-gm+xU@Coqw;N#-p za6T?G$aMzrvEAK8V8_NzQ*r7J6vxKCOsYw`b16;q3!wB?mEpR3gtpDii9i9R6BEi| zYs%rm=ik3KQ+RpRwi&T0{{kreMU~wGN^w2q8?${33{;AkX=v10vC*+Le5F)cI`)g$ ztAx2F>CmyzFd`B9wYO<;(Pp9?cw13tYS0_2T9rvbH0V(;7^$c-(qlM^pB7xq)Yulf+h4977c_k;ef%9e>h_*5D?p{nh)A4fH! 
z)l>_!Gv`O4>vYV_#0<(4HTOY@O~#B!Ako@kc|~`B=fP7shc{XTM0n&mKy=h^P(bOC zJ<2btY*xx>9xg$cN|0A9`JKak-Hbjcp)}f)??%sX$XoXy;LVF4gwlv#38eu>vsdZO zo=`&RS16%$`~#FwnzNKnz0#ygq3oALjR1$BEDH${Cwc$&O;%TDeY9(#jE3s@$IFB0 z?%d5pE$`XS=fUmmSy}Hxc>H*HM@92SCdSn@MBH`s9;@m&v9Zz1C#59fQxM!`CO{|0 z0f6ue6n<4@see;twckzuuF5u@*)%DLuf48vv$LTH$&9*BtEP@>61s|}Qtp<|3~giA zw8Tvqz&h?ouGQOgZvaxQ*)!dY4Bykz<@D!AFH8F1^z=_p?x4H~mVO}J(cVE%N3TdX zg3sX=|Lz?z1%)Sj#mlCIvsGiojGn(H^ited_4CVatM_+2JYGpq+qKhYXPNgp#=4Fp z?N`>8RF~J=ezvi;wLP$QbZjxxzLjNKrS+eC`xO-HNp|<3s?1(F?==t~oz=0Y>j@Yu zTxFj12ch&elu*h$@GGITe9rE_RAm>R6sONOzom~E@xD6wp~_kV8^vJp~dTKR(sxY4WeG-I<{L? zCVtOh^`!iz3N9&Bl~qE=Ho47_ej$|ZT(xvWG=MHpc)nd8ATE;a?(JAl$M$CYqRNmx z4p5(eyH#a9bI%SDyiP|#i+#M2q1;SUjJr|aqSDB=zvO3KuZ7Kwu7it{?juKW>tv3N zks>R^n$jKUtWh4&h|&lV!yQ~JkAoyhC}e0*_B47WY+}Ciz2z(xlC~$1B?u+KJ_>Hi z`A*;osEMaS;1_AX9D!Sy{yaDJzTpkO2yW|Q|K@f`S~ZyE@!F1z3KsykWo_Iu#1D58 zDB=Z%SGl*aK2~s7_7j=te2=x zEC;bnEZm}i64fbSKW$@5y}KDyW5t8W93|C`F7|wqeW&4b?Q?-DW^brt;VhR}=uULb ztBHYrD@Wq}9L{B$I;0bvc&VslR(4I}g|2tJPU>n5&D$E))#-1O+ z``1%eo-&-`2DK#3#YMtZBphxwf+{Dl-9x|G%J(Y56(Te&o}#pXQZdNjZB1ItahmbU zsuLem(>6MXGSfJ?NB~#FdmGpmO+!f?p@dSXE+e`nlCmMLql@%k288O2lObJa_v4N6O2X!op9dgRsm7KxPv*1&(k z$>-j=6EB=xU%50E?=CAr4>Fa@{=G;heOu0 z8YDhFS_uYWotDS4f=iFYZi>W*TVEphHzK{nfUsi>i%llsmwWRI&&=v1-hG-UNRLdT zsqFd-p;Q+XEU`^p=PXfyluRxiB9JjWhGBR-y83rnR*qSkB;?=z%-r}_LaDYilu-Hs zu}qAkQoWe=D|j8x;lcC94pf(yAo!zPAzpCl*Pu96{Q&742~I;K(~bE-1I)E zD$^hg4LW=AHQIQI0^ZYex;&@?@$JHn{VA4)A%=`EvbQM#F|Q|uSPkW{obebc5+r}I z0pf@`Z)9h8dLLIy21D2%-6gDI4{@>YNH_}4w<*Yv9>{3{R#+TjGM}tbyBSn3)Xoma zXqq#v3b|}nuPB-(PcCP_hxoG5^UwIC<>UEu%RSkyX+*2eVXRd8?zLI_mW~6RZ`;C# zfj|-r1btT(EIl#+f}Q}ifeh$uer`^uSmEf+bHVtD9fw57*M-0-qcYAA9tHf2SQTV7 zr={UV#7jhjJ4dg|qz)}ym$3yVs1V)BFHErqc~tk_ZfNU8#G9G1Nb)qX%dlABGb3#J zAuT4%UIg3NI6re~o_wtRCIKI(QH@+BDYP~*$8*?ZogCRRINFA7|34lu(Z9qN|6$oqOCP%vd)3^>a}7j9AI-;*y8bP#%Z|F(9f8W zaUUB?dl?k$=%rsJ59G-?sK#*CpD%CUd()X7UG_aQ=9mpbH`Ndmi|c$6gFXU>@S|}L zbUzGdFg=VFi0sNzShfpBIj`2f!$h%WsU8Mb@4vc21-j@lRx5RHTZK~4`&fcs@21R& 
zsGL1`Fdyz$Z_cUcN6g1%jGQFZf^yi~dvr9-7;S0fh0!WvJioR3x||4eOqB5J(>GM6 zxb zi};myU7<%cMB3{>Eug-7DT-B;Ns*kK>KM1uKD{MVzMplMN;XwkF6LcUUl-G0%-b|F zt-L}zi}M^3z)|Nn)whW~GdK@T*9nx07nVde2I^CAlJnaE#OTR(djk>MLD%s7i;-RStxf&> z_LcC4+tvbg!P4h-A5os=Z#xK$Z@EzTrJ z-wjtTzeUa#d-vM&8qsbz5RdX5ck*2x)G4tMKIKHRqU}-)WNj0z_E$ajnkR8trdaU} zKHuM)EHorsF>w4*ll@Ryze+Ohze%z`G0Ft~x<1*HACipF1Qiv4V2BRz-v;>u(H^u6 zPl6GUIeya_QQnCg2w~sO^UZ&Cf0k?6>8DR-U2eYCn?Oz8LNQDVC~bG;+aM@` znV*3jf@VKw#1^Hp@S5Rk0KcE7E3`A>Ocv4Kgz_7^lq9q?Lcv{w&Tl0dM#r|0 zKuz__)_QnOcS$Fm^n0|&X2Dv~G0C26d(WE!(wAO*c_W<};avTTQIFaWS#@qMZYM<0c6fp@JsGv4bNY@!p)}IPj zb#H|*_FCm(X5o+02)NtQh*!6z5m3bC-;FZtUyU+_774_wy5AaQj$;$XE2_nfV<}Lh z40{hE)%N`nh1 z4?E?Z8{f?QRQ}DP-va_1h99L7pCM$#Y1P`Aw?-NFt0pULv#*0Zg2)xx+7uP8E-o&j zJJ7E^3SaW24ya3ZJvWyowQ}eB#+LU*JizGGZwcx8-^JJ)a4J%L!fw~@U)TYV-TT{{H^@*_n9}9@HpX&S+@~sH@X$oYK@zs0cr)FBpFM z7A>bUW_p@cT-L(NqiH0ofbuXOvv!0JI;yjCwQ~4XKji8asr$8;xINa1$c}X(o0h_@ zB+Hb9N-~_wmC(&4XlaCTth?Lkh+lDo2;wi23}=-qproP3mg|&HVCad-|vm1Arw1b5pZc72Wu|^oK5zDjdK-s3enKfZ%8% z)6)mS9pAW?`YfP`QG#exL@9RjdlLnphH);Lio6EC4RFT-RdR!{$Pm_usRCJ>gscDo z$Y^R!62Jo-XlaC`tXg`9Fmr)kh_3(`fAB^$rwMNuoO329{;4Sh`1ZCmLYPZT9QmCO z1Rg-x{CZF+!zmaMte{EsJ&&My#TViaprqQnS?wYE))%!vjc?Z4)&C%;{1tc~4Jiqz zBiHF?;)metV-$#lIpTxvr+W5nRo@~I`Aupbs;c6+s;8pigG+$~GyZp<<0G3 z%7D}i2M0nR!k0%7$nQ_=)Dpz@D;Ph z2hRT_(YFv|qZ-Ws7m9-%s&cj*A*vMZE9pgUXlWAwZ}9S7g=(<@yO{WduP+FT<{BSC z#W($h_Nz}aCQR&}i_8dYcnz1$V4e6EydO#B@!{1P$Uz>X8iN92Sy=t!L>TMO4JgkQQAprWkX1cclbh!6le|Yt&;sGsjNBIJw`q zk}wSdz$>MqgGyC+7!a5rmk>>E^&o+C4AY$vdysQTxu`?zm2AewBfo@Q_(g)Ng87Su z5(O6Gm-=EJmt=0Fm~9SEfF2tvFH`mv8Q2=G5JDWAQm_{iiSGxNpBn&2$-e|5ua^t|ntIO}}^;KeD0 z8%6_4#gZN?D_f@8yH!?{nUJJcbjyP}9m=v3gA91*HeNF6y-JWE6-0%^BL|{vF0bRQpw^6vX5-Gfs4hf(i)y-3VuAp@_#&TzGLb&JBQVu`C$ZTyv=E_T$g&fuLB(O#+tf*2!P^I>neVP#~`G;{_ z&=IO7rT+dig-Z&!#<_k$sAwb9@f{QryfS=-dakmD(C<>Ef@H}h$qViYIz!RxkUqN} zycFNMu=Mo4zJin5cd7Jz!4!o^1qrCHMIhcf5%G7C_z;7vVIYka+(3B^Z*WRYRSLoq z1`uuI?}-2cUDX^V5#TUhBkz!OAz?z|TQfBJi(FA+7bp@pd8R)^i50m%VWnzdUnSpm 
zQB-^Uz|??pKe)@tmnfXjm!CKaS%nPF*}&Xt%#64G`Ev@7hkDuTRNrFgKfX-iI}KOR zvSE1T8`%U|U?CY6Q-`K+e0??gf$Si-o9XGKp78^{VP%xVYF#C~buzLFO&eZhBLc*> zkF@xR_fYAs@}uB(tOM$cs|`4up*ET)HD}}mv{3?1mwHDpp&r^O;qK8cr;hhQ%t7o) z5yb2OFoM`GFdadmjy%y&nbW#8KbouksNL>B;rhmAd{sM@(P$KvFw0xgZQH$^%u0e`v9rz?{f6ibmybrh@Fm${NvA z`5qOlPh%5&+*hL*{uc69Nlk-Ti5BKWJZ_uo7JS5Cw8YT7r$+b1^)ZbsrZSfNbre11 zyi7vWx0^I13RBaWzD0H8KRn32Q(P|S#x(u;rC_MxZi&48*-j*IsyMAO= zbh4yU2;l7dlJN%j(~053hlXO%v{c>9w!$mHE3?hx4}p^mQWl`x;jh5?@RD5$n&=nO zlI^W+^mMd%mRE#7^jdE^RIjbEc5YTc^;&`pV$gA5W&B4gv?^wmdz9VPTx_V}CS3|B z!fm+vmJov?1y*bg9piSyUN5k0raun6*GB`+RE5APh?foq1`5?}n;M8DoYKp?(`g|c z2as%E$@RlTLs;^wS>?0$yKBMpa zs4F!x#uK1_F$M@YiD?aQs-sx6A!E5CnlgS?&V=%xZ^C z?ZBpXU{gD=sU6tV&R=V4=XMXj-Rn7iGos1C{%6D+_P;Rk=GTN6Y-|TMwgVg6fsO6J z#&%$1JFu~xAKlfku^rgh4s2`(HnsyB+xcHLwsWhsez6>WGq&?5SbzS1hU(AFBk;F= zO%8U>e|N#=PBN4_>_gUmOC9boP7oCrD^bDR^yrLRG+ujYsaa%?fN_eDYa0_7o1Wd-;7l|`iMwGN?dN0>DJ}PF+ zjjF2M03<`Vl*fcKGB|e1O5n_BAr-x$;4(62#15QHno9&q>oeVpc3_UcdU?Q>*9q>Y1gbtCg7%k&!nd^6rd0d!G}* z{P=S0R7qN7)?W+~vxnQLQI^6B4*WFBf5>1`gZcAq8rseqZYn&uC*51M~ zE;IVDMbmVF^E4KWXx+=X<2GYWXQ|50oJg={))cjJ7M^uVoR-&CJ#D1dFWLPSr_vRk z8kC3Sy+fZQ7+b5HZ4Mm4Hu@c|+VK&RrO7t$b9=FUlM4eq&A0KbJ z@DW@dxLdHCyqQ|Ps*u*4DdL=}CYpankyiKej=t2)8&%z3=sjMFe7Gr)kNaVP0=Jf? 
z!ul95GxJ#w0fLK{dvWE1&bP==P4mR3D_)>tAsaFlg?s=%%_78^3^kbqAPB2`2`y)R zc4a91`UDEwrI0Npa-UU|;x0$BM8&s)|55jNmn$&d^3i=yYsX{pdg80>F=={-yZ2Oo zYCY=I%X$SD*)pQbl`?|xA$F%HXwHEF*P(gCza^rc-P>0iw)=;(WbLHWv9Skhg9!qJ z)KL#>%avJKO%pC7^NP2;|M<{1O-nb`mJS<-L3{*sSnw9g3U$Xi#_=(&e0{1fb}|i5 z!{*M7kLF;$d~FM6?kjsnWJwwh0Rvj|Jf_GZ^I*g@i^CO&AMILJG`Unt8aDyZ>{R_Lv zMy5}7B^Tswbcm8h_MD$Qzr1)F+gw3&Hds)gLefM|>?ls+bbih*Adz9J9@r-zv%qkl za`I#AvrEp&V^w73Ne8eXpMs*$^=F&d3JMHFe868H-%-d1==VPOf&WU4{(|3M<}(Jy ze{-*7{|lUE|2sJS-|JrarsaLp^1f+#UxVJfX?fqYyl-0GH!bg*miJA|`=;f6)AGJ) zdH>(j^1i-@{bpGEla`n6FW%Vb>Hf|u+nZ_a&9wGrTKm7-s`qAE`=4Z5`wf5py={%{ zFO+F)e@B@1rbB+yA;0O6-*m`tI^_RRI^t{6EmC_}8!a^#}har($}#znb6C z{hj&Uf3H*~<##_bbpaq>zPCu^Z;{B~B9XsEB7ciS{uYV+EfV=#B=WaN|8IUL-{sUaL+>%ti985F5I~)Z4HYY(KoO>5 zo*ti=LJa?gMlBrr5f&#eB3~n+heNqaSQ&v_KyoM5+j2Qh&F%t!yEx@G`d%>onw@O) zumv7(c?Ah!b>^8H5C&`?6)va~_#GT5E&sGVmVZaOBbklCSR5ZdqPQVGRD#5(MJR_s zd+B_I_p78KLLeh9v%_R$W7t7tK0U}3$YQyZ7*eQV!`n8#F3tG(Ja|u=r+mJhdev%F zr2aBr_z_TCT*GB%5q`p@#*rgLj2KomaPja8;X@!nbrJ~?#;6g-xQP>@h6%k}6VOvD z!%^@+X{gaR)|$7^2=AE zh(iwT5^CV$Ck=~=XhFasM+pni^IOHI3FG(g6JkPwL6s#IbYP(+rV_x$z@d~BQ!1n( z`BJFJY^FNN$WS|9dVsCg&~Renl&F%H5|f!(Al{PGnqI0vvH7(=*|bRP{i=Nm69t9~ z{t6GOD88VSRj?c)PSR&Nk`m_Kl(`1i8EfQ4Dyqh^CPgg;je?n^B-MK5@|mQBgm~nA zhdO9$l!w^o^IB|65si-n`NR!WBA5ga`s43~xV{7V6Umt#tezCQbmvp<&M)qReriPY z6rOImXNLc{k2}rEgb5Rsmin^L@FEbJbQEl0&)uD9ExdX0v8r-n(lU9YD-kW3qP@E7 zsUKT>ua6i6G?lN^Mxew-f@+N`Xye2{({$}+g!IcAhi#5r#|L{Xm^#f&DQ<3V_gs62 zhx92@v^wY!+8J%mHFeyUmMlzD(~A`oXlUCzc_f3mo&M-bghwZz3~ciFWK83=G8{K# zj2ttvx76KDuAvoIN_<)U1qF@6j9gp{Ulty=u<(#IFwkXZu&8Qggm1y|;2v8sXK-?= zsbyj2YF1Top_WF$8p(}=3}V-VL^cjqD(|0Eo@RdjarxC+*QSqwi{q*HzMs6r(aqS& z$;!%W8O&Tyuj6r&g_o6;larItveqlvgO{G=BYm6a(u#q{Q%#kIPV}u}cr@_47VKc( zMTiuRioAkb*HNtVzMFgaA1Uk$-+Q}dC7tfi1}LIC*VHsM?HfU9u;J?Kzkao-qxIC# zsvI7bk(HJS*H~_4;Bi-SWfUZUHIB9g_m)v(AR?AX$uDu$_h4jT=Qh#Q`dU%Skv=lI zkM*+dGtEq*=)(QHW|LO=a70AJ<>*N5^t93ay68fw~awkw{;QZh*nDKcS$O zj6ByCx!dSF*}6KyI)G`{wRwQf&&|z!x`ZWnwnSS5iSn3Ro(KA{FVh>k#A38-?*4f& 
zI?UBKz+W}uiya}&2Wq!59y%TqXRAid9mEF$o* z_v1+K8i@n1q1O81aAD>$AKo8_8FWt#(BEN|&&MyXcyWNt6A606GO8m6u9b?LhBM7+ zU(V#PY^t8FAW0CPSHZFDYpwTs?m4v0Nj}~lW4ob=z3cpM|z9-@%^}hg#g!aMm@}+T(uF=ITP}XAIkK8v2nxx?`u`D?hu&Agw38P*P=|1EO}|3ARZsZbm4=D!}yrJ7X>gal#E5jOZ2JM7f;zGp6YY$c zba}TXikMMBCX9XJNa)OumEx1W&>ps4lMDe7-lrABC#&g_nrARY1i0im` zwcN-!O3TXD9WQ&|-0YqwRjk0lQ;NQtK+v4>r- zwP3Q}kO4O$u#>-SW#5V8P~}`(mmG>p^b&N7Sb>HKS!_PJ1|aNx5~vboF;Hk~{JFNj z&;XK#PwhfXg7KTC;Z4)J>6fd!AMX4cNSrMgX%Y^euL^asD6X$|6@@73xEFw zs_FlVqN4wQlA`+6s@D$q6Gg@JSIuYoz4$Ssdj~rsJxeGTz)#96)(c9=JR#N!d8oYjYNA1*-4)7; znhjW(Wj$cxaruW5sQJ6I6vUextRY2M;0wFoie=q>S-~Cv`R0J(#kdGy!SvnduG(wF z6@w^>#l_rzcEJ;n9)c>bH96^%DRyaH-VL&^uokTbdV$OTXSqA1qWVR=VXN2d&V8EhHUeca{?wDkQ)M|yW` z^s4bolKtd9BDNVv6U6E~G@oOYo0?gJpD~4fie{^Ns~r-a#VIfuQ`P069~n|OKScGBTQ((pkH;H? zYjn!S7(|um8Q&%KE;DvBc{e``+LVblADy){ao_-k?mElc%t5;ihP|LYrJ(;Uw_Wt7 zkPwj!Dz*M}kpyR+TqZ+&SBnS>7YRdOz^BeZJ$gdf2}Fw}SN5ol-hlA0Xf4z^^7)*- z6*g`CRcgG7bAS(c`qc+?SSCn$m8`RA2N04S9360=PP?=R1rWgV4JIb`I{ z9{4=p2g{Gl9b?`NM?D3mR!UM{5Xp|wGie?o;e#WYI}9F6>iPElX11b=`oU5Gh*Q@sG%t7T_ca?7sD?`q;d} zd{u7#-mUK#5orf+AJ5Q4!`fK!Q_fS7RiREH+SCf?#{dk@0%A`JPs+#lo@Ac1k5Zuy z>Wzet#M%k1F*Km;O~)-+B(v@nI(kS;q1vX?=2^^SiZTQ4n`m3{56ur>xv(mc8+35v zzHGLf96(+T!#v`&pgTZSf0V9ur#=NiXM!gI%K)3{QR)sTaX?jH$cigaVCn0by&(GH zq?Nvv80x~28A>J=avUbx)w|9JRF~O&^WlcHvHRz>&% z${}rRSlbJ11ujRt#OVji0E+5dMR5f=q{l!#gZ8v&5XvwU%IMa67U%kw?_s&bt^S0O zKlzvMTZNsLU@VU`g@T^HbMC8{Y8E1S9wJ<}Va^Ah$aris=7fP?Hnyzqb%aV5nsV5X$&(C9e%=*un|KM@kIH;G0dzp`H$l?m(Fa}3NoR;psxo&``+MT^g!T&E>D zTabv>rE$?scgW(!+knqhBp+>0HkwkwkPk3dcgDpxGk zgqPtX#O95Ix+&;9NVHv0w*84RO4T=z+DCzCnOnq1^vF)^=y->nw!PbC`x1Pr64QAM zu&IHq!CP1C&W1#>UEARYvF(vv1{@`tY4#FUYWxa&09R!@MQRbwX9VSqRz!85ORVt7 zfdqqw1YzZMInVl$J;*p@Wr1l!R6dz|1Kq?Dke;;%r`-12nWzcs3TG!NfT?eelXFSi z?cz=0MoXW+I}_I}kF*Xt)2&Ab>IB?p8mmCo&n!E@lS5*;G7|AI!4T|v!kj*tzN)4l z3?^ER^ca#}#9ptj5d(8MOgQf1l3ga)KZ`>L-y*+dcU8hAlJ?@~A?Brx>f|8B3-}9< zgBNkn#&WoRq?`v*ha*vmbE55=`r#05%0db=m$k7KN%1%Xud&ry>zrZ|T7_{g4_1ox 
z`@#8zS>=kLmMY*hb#@6+0RN4Iyk{EaqcBlPF^a%dw>?!>W@RM~{dl!1w zeD~``zC!|&nXtwSz_lju#I=5(*{Lj-NQ%)so2r#n-?_k_ykckCK)lAfoLI?Kq>Tz}Az5_3Z&MoDb>UJ-AY0M>XGYO|JC_J2=Y`;$?xp=~ z1K3plZkCLQ8t9Y5Rjj64JnJKI!z4FQvcbgbJ|a8%qmEJ!0ByFp$*A1L{DA<5azk-N zumpM|A@%{b;sj15#VdU$n9&f!tKiJli}!wQvZfiG-eK=|ZlS;PEJ4&hHa>+Xc3V$O zgqE{&u2p?^lXS5W&Wl=~~Q@2_(e@M-@GGqj9EXC=is)S%H8)|N3uLVS`7 zOgBjU2~IH-5cv>sA331F08%F*P!Z&JUAau0qW-yPHdeVDSe>{7~5V8sqg8@JZPqT1&k)mw@ z#Vb~KV=z2}3|u6TGX&Bzpk_nnloD=qRajho*`{pjPfjA;%*Iq7!Dwd63~!QKP9<5( z4F}2AN+ww~$@Ks?YEcqT1=zZNKRJ1mE(Jh`oT4+dLz-<7!XS){aTIa}1OO4=#olZO z7k0~TnhD@iV6qts5+bf&@NOpAZ0l#A@!e^);;lYCWqhafTCj`M=tT`oBpsShg>|AZ zAc?wVifPPY8TR3!`j2hX+qStCTrb5(?moABo8={VFw2m_JCRDG?YJC@3Z`@2igMJ zuTut(?o~Jo(0F}`#8DWHgtrR56i5mU4Z*-AjT$~)BRp@kvzra~cT%NiwWFPW`l&4c zGafP^#I)%Xm#1P3*{6E?NO!>Nvfynr= z*nnw)sJ6q}K*RWm@<9c5NYVi=`V%7L?<@AacNLgwXcC`wTvY&jj`e=o#=X8E8@ub{629&s-%ZHiYQ+ z!Brs_lyq>k_JuWxdQ={`y6I zqIT%&d>>@XiL>x^NZ@06<>KFyBFjo(3k%H2uuC-Xx$?OR<}1J!T1}}HCpTtj$hHW1 zN_f(W#xW+HCvEhX#g)XEC+sJ=lgP%;BsnMP&}5P65O_;G6+F}f$#?K|DEj7Bs$`3- z)qLmvBrcqqlb|20pQ<0C@4wF6El<)M>l<5#C|kt0TRB0JPk<>fneRGLUn5<^F>f?a zha<|GcRv}o+kJuev*iNPh{Q;9>-`q?7UovJC$Hjj0B7 zEr`aJ#!l@Ax`dRYAthsy>M-`~vhu?()Z;vPOC{W828GpXCb=@zBS_k@G@7jZby{_b z&H}QMv}&|E74pZ~Jn~s(S$S=uZJOQ#?PK`!x?%ij9fjGU*=>56dbw)`M8z>B;}p3X zxtu+02H$K$PkA;pNh?D##AC%H`lS_TrVocSU$ldQT6|9pnnxzI3ZCumdD=yv0_YxUT6F*JrW;W{oh*;C332^$*w zcSF)M5j##@mR(D;+;flh_muFG@G+ucz42e-b}&bwKLjULsK%&P)Wu%wAAdal!o$H6 z!V|%R)auk);z@g>_XzapbhmUqbFy+5Dd~Z0y|Rm!i!%tdzOayBSCd#&-$b8i zSOlshs2G~0bc{gg_63oLkz^r#ZkmJfMoe=R~^6VocV4R#Ye~Ik~0>K7Y&;Q)ZEle z3Xtf9R#Iv+a+9^tb~5GC2{sOxO&=*E2lJ6RT-m1|! z`%#womSS@!gCc_wMp!FWq03Y9pXDoCbxo^}w|fK{1mxFRs7Ey6EV?aFEDo29tIIwv zBoTfXztkUG4_%ifS&ZFZQTJ56T^hOyIBdzk%D;DBb%JnST-t3sS4+`X|1dXJzN%fe zHFTD9R@bC@(5j-RX{!rw7G)S!!%+COMWW)P==7JpyggscsMPJ2J@>l9jQQS$nYI;6 z?@7Q2;BIgpTx9HK@4jm}^D=(RZoS2|($3NSoV~_26K@@Fp+m9Lm{HMBqWQhwqa3F> zr)F3SS>sv0N5uA2ZP@l&KdxR1x+9hB$}#3P+<>qVu%E8?-ryZ)!f?&GhI$t7l09Z! 
zeH+sV)yV#__#@<79*dc~l>5rA&twM92o5!G+Ox++_*F`3^?-@wXv_#odK!C{v)7qG z&!^L{5VRl;RR^BNn$3xf^2+k%LEof`52p=wMGGzW56Vd;CgtqfoSqq;Er)@R5kGL7 zG81(Uyv%lMk1bnvZK@v=H_9SgK|OcwwC`z`uNpnKBA+v`fWyGe?|Lpj zj)ND$f8ZT*V|z4xG&tDEA%l{o$Sme9dz`+RF)nJkIQTrHJ6auV5*1i4Nc)^{TOr#C zI$AVJDN-Y{Ct4;t8h#tT^y#4K8|Pb`NB4WD~L?c%3~UAI@AzEseKV zt^>~R|7v3U#~hY{g`VM$i7o3NviErgd|D|ZD-#FPmzVtC)$#v@ ziS3)|@y+!3W_o-xJ^rVr$5*KN_ol}`FzKJ6>oxUH(Di4#qmrx53o|08XJTZJPb+O? zXr?D%?SilQYQcPgM0{3u2JP3?u^g=&UJa6xW`_3on)tu*E84G&$?sBLdZs^127aql zP|rcn!rJ7I+J7?)(5e{O*_&Bg;nUL7F#V|l=09wvuW0()`6AXY7o}Bn)OYyx9hK}H zjeg5`HK_iUsUadPASOmBW@O=HA$l^|inM z<3DC*7W~)$|ELrT3oF!XJ}UzQ{>%PLFTYY)fBe0k&GyUji1qjDy_S0&i{DDI|FRGN zQJU^oO?1Dm`Nwtt*#1$5;n(?p95KGs%}hs!|Js^=8o&RK{{Kf;{?@fJdX_J@%U^W6 zg`SE1ukL#t0D)ik5H&k9Jw7!P8~w{;k%<|fo}Ph08;Vx^<)$z*;I}fdFnVbhEx)~i z(aSKivcBxUUhMUNnt}1hy9lF`h)rN1DpAGuJE@9_{&$NqkDOV|EK(5i-xzB zQg1CU zp~*Zm1%t{TC}rvfml%@8uF_bO+h3%;^XM&kEL~lPiOh12jySJr3y@)g4K#JKp0}~ zB>EhX&<+fW!!r!<5JUl9sYa2oVXfghlE8^5$qU%RB;~{Tv5mU7!s|iD`?>TnRi{Pp z2AeV7;4)t==G`#B7y=cZ$}u`^%t~iFc3@yB-H-rW_=Pqx7)QT< zC14T!EVC&J z?ftPx<-A)v^*nv>iVQzFcHpbgTYd`BME~U2clmya7r9l=C>FFX42+?+5wOqROEDv3 zl6-*Wz`YKEV zX^(BMm2%7LoWB;P_TnDo2JDu!R5j^{gvyS?|12%`r4L{ZKE?3)v-7uoP^opBsNsS4 z^EjkByeb))$FOk`zTfvQmN2Hhtb4GNSaT#ECGFjdY~5Kdo%}zd^3431_k?Q0)T~BP1iojcfPK}Grfk4r_cSirv_Sz8dD z0ZP?%JXRg^VCSbvqlO$K79@303+nWJa`x$+$4HJDY-~ba*a2~qsgGA2u1TFu|2V{Y zG@$a0QfI~q0HF5l;OO(2@yAVZQ19(Df+M!*rAHqra68qtUA@~NH0WA6iz&M;Mfkpx(7$QThZkD#AJ1fY+>> z&a@-nfJLs;@rc^|2wWv`QTJLVb3zXsMF7PI(LFoE zSQxu!#!x#6HZP(&iF*fjsHc!DWz13?%c8l zu|j1DUsW3_K}bVz?gZ+i=n9gqJFe+zcEn?|?(iS`3>r4%O@axmKOh_OuQjTE2Brd* zHdqJ@^EDgmHj|DwNM1qE(7l%;#A$&l%4AfJLd zk4?z0!Ns6D(<#j@jpJDq_4SjI0Yd~)kZCet1IkIl2I(5_lQLD(DHgwjnd3s~h@SR0 zqcUjj_>rG|*AUSWiY#yOoogs{CfIY+&^b^}#zn4rs`{eC24G$zPicR>2Vrs^Ifq(7 z^R86x>lecK=2PH7-q|W8n+3;D1bbx`md6z^pc=SBHYDGK*EQ-0R3}$^jCT_YdJyIq zC<2H#pyHJfSzF0mG*#*aC1e}E!YqG#U;%=n6ZU7)a0*+F}Gj4t$bYADz3(Nc50&$mw zOdxY(PRKSS6K`T1oHIhVjIfh2jk#xrj+C2ANTW3v8p@wPO=>N$C9T$$uj7m~g8XB~ 
zb@4H4)oN)S<7q#Fy)2g*;%fg|zR)-$-CkBIADMGB__rcF)BYJF4hD z_iY*)h!g}DEm}B}TxpT-hBmD%@tT{pHBBt}4s^L~p+nE#r=1~m1d4P)ZW~1T7Q~+# zx+_jf&&p&@!L|BL-=|f6Ldm7pr83zPuAIoTSd_xh@!VMKk&_jccrP~|Q0=ql{;ieX z0hPI~2-18jN-`ADdaovYK+A_=$3*{j(}nV9(}gaQ^y8yK8f*kEy~DRQ-h3SmYg-;E zU40QCK&OY4LV8Ouu2Y^icpds5yZps;o*vxv?MX8Hn`@;aUOjzy zzLmeYZ2K96vl`XOa=Tv)S+AHoNVZc*Kh;+R!rn|jCXkcvdkr@8crlX?s|zyuUI*e+ zB|1-6+)Ub($j%rISn_abJL)?77K$(dya&Y!ffBU)DZYFp#D^|MYX8{b@D)`^S3TVr z5N-5T!q_f@R9Cb~^mBAZbjp=aXq%^0eQto7>$?0u@*eu#8$Bn(_##6# zNxIK{mz{NRZpl07F>h_bVY6jE^uh00HM^gg zPItQoNPjBr3eg$()Om8?#L%D}`ELsdVvwh+usMc~#tl%qz!!7_`OkGxB8C7l3RSbY z4~Cf>r!iPx7sM&^T)7r_;D|q8Ji;dM?wM0+d^D(;VUR)WOyx8$U@2X!vDX||jx%KkNiu5=+!!4)>yWSy-7kg{+ZJ6EX+DzcDV&xga6Kq?hTg3F0Ko+422 zNVh#YIy&8ABNJb?urxK`OP6KcMpsF`b~adak3pV}3rD!V~etBFwOmba-4y&^rux zy>HJ}E`#n<9%ibWQ*jkzLA!+WXbnN?c1EO-q9sO5tx@$<9ZEpUdQ|ZBZkJ84HGFcM zNaSpBpad(X)g&&pDkX=~RZZ|)6<9?PQ@cE6IDucq)P*w5@x&+Ghj~CpPpL*v;&+fK zGUWKg5Wyt;8+}elxll}u*5Xap(^=i23)ZMM()SBRrxFTRUDkT`C(}gV}%3F^KkS_u0LBH^5|ka+y?rT zDfu3dS#zAxbU0E4LV}-SJ}Q$^ZuaM&c_b2PiSL?qrN;t8?vm@xE%@>$f<Imql!aq z6(WZwbQL7Js}yw_5?nnTy^psWtxtWj4$34Nnl`T(n;~@F*@}!5N|f;AiZm$e;9={7 zNB3oz>bCdV99 zcX-&baA#;S3c(A_q#TW@q?=exm(p@|ofWIU$ z*Wdu3k5POPK&UJ)fTAtdM1HKD`Z*`umq4kEo(`%boYKeW)X&8mpMOh#Dl|B7Ow4c4 zhn*~3m)WtR%5Zv@Se{l>G&JmV)J{ECLj(7;*7pjuLM@8a^t<6I8;O#ijc&9;K~l-^ z61pBM?4{i)n>Qj;MN3$zW$WO3ijZJZJBc-=bpm6>SoPyMLF`Qc*VI{=>hbNQoUnSv z#oZE~tBHh^q1g1WYpr43l>zdH;dt|vm_uRD1K1&BsYE+ z_55V=WNQgS1*`Q0r;l|du(#pEX8o#~I3so4D1+&kBI zIeP%(Vc9D}Mek4N&$LQ-S=Og=McJndBeJ#VTP4Wp7?o}K=yxw)3gds#P z*lUl2a*N|PoPGR)>+1Brk<#sEaZPk|RV3*w>F3Im9sAvV8&Y-R{px^h4qEQ0PH4yR z3Td_Y_P&twmb+mmnah$YCO4j^Be7e26V43M(ea|2deq~IX?r`_5aF~IlG!FlLw>gN83cTE_maI~cPMWbYHmn-!%!uaG+qnVXu$ zrM&b_La7B$TpuaS-Fnj2%h}XyrMsEg9yoXkEAx%s3BjrASMjvAh^bUzpqVBbZi@jP zp?PIla6#oY(SjZm)5ON7$6H)&#&WI4L32w@fBQ1C+)VY6Qx+aq>v3=I8z?k;raYFN zs1l}1CoT0dIyL*~>Q2 z>;Z9)rti=eGDqxfc?2~^Ss8sh^W5wGX=RMoCi{rZ-R_ZvCr7rPgqiU98Gn(zB7B zU)SqxRw!t`5k!cba#OYXlO#!4@mX6Q*;pUZ(adZ}GzTfIsrvKf_#zD0!IZ2f_$hqi 
z+-UU~X#Wah>@Tv^Bu>^G;Et6R)h`=6uu0UJSaUaT5j7N|cbR{>$W-1d#vSTS;GGb2YBjIKloUi3?J2a@ zOj)fnGQbzi6?`$AH6gnTD`k~^rl0cwJiva-ed_8lKNz(1d>LiE+pK=_mx6Ol_D+vehiXm z*i1amR<$+7G~~c3#rmYbAg+IcVKS`}(J7lGQ>kr~R=BlD5$X#@h?C_Kj&xIZq~EmV zE$rrvL-n>M79PbrW$=bJUe88%?XRjEQzpjusf=;)lU}yR<&(4ryWWoSSEaz5Lr$Mq zIx_3hOA)_Pvxi}A8?rn&cspJ$_u;xzbXttRAK>=gL*aGPonaKQQ=i{&GH5euS!5r} z6|EzUcK7lA@syY$j)IsG4=mUeCP1kl^upR<6Ac zn4|>Qq}_4rgz;+hSma{jPyM1W6fi>`U!%|yWONqoV|2g6D*1i?%qouoSWQNKR8KKf zu$kAtBr1_<)^KwBC5dH(_B{_@(&6ms*o^F5@rdmXp?6UR7S780 zPQPr*Ik`3T@%t5rmcv`RTfwNi=BH*~8dNrVYFnLHqB6CPbMc(ckrHh2kF;&QPo!}zDs^T4F@L%l;imbpLAca`k&WyF(P44v;x1))Z@evuy&-12*9ZMfcF zP;?j*H#EXbM1EJ-q@@)AbzyY} zmy)I4ua4b_yaXJDZE`Mydh}%oBM?^|YmTxk?QB+a2=W_PTj6%hVtZX!ynDk^V6T+> zusb8luGt6gY&CL~eC1oTJn9QtNzJ2$&vVh0{0oP>68mRPw{UFm##k8X0yIIx%zne^ zu@-g7_EHVIA$u~5!_X~LX{n#}tHTqs=5a(a9XF2#6?RS~$|BFA$+qXT5{bijUELa^ zuD>jt)zY+CaopIK#hrooURt>jD^Qtz$Wsd>YN8bUdlJS@%9FL0yw3S<$N9;>__N#Zwc3~&+EXSgd4aLCv*hcxU7H}v1zQ$xyH&U9_1t-FioHMoe`6*|~@4u`pu zb~g4IsoRpJal7>gM$9>{D7BdEZTgeJ;hd$MBc+qhNIh>0b2{0CI!Dt-d(@n9wH`y2-nM&Z9TJ*_@{Skk!A#F11pz}(}e9v%XhcPR#spZ#^i8wucP z6rde6uP%m1=nf_gC7McXS4>!JS}Z~gM{GX^mv0^m>v4^fE|b)>-)i%m>;x3)1_S9P z6w<>6NQd1wi_y0tqkVMg>V_HK{+*j4wgXjHm^yEe!vH zZ~UF+{0F=EH~2>O{~!3qKRCia7{J%Yu->@l-4y>gIHzi;th zx39U(FEN}Lm|vOE*OcG)Uef-ki;;~TpZSHpV|*En*Rpi@Z1hb3j(Pl7I{KeU+&?pq z|ItYN3k3Ps{bcG@;8~$?}8$Fd}g*6Z=(N{jx1Ay zw7{Omcv?y0J~(xA?scDcA9g!fx-7r8`SGFG1hrRG0LT_m-$DRF<^Vg|hB;f#H+U)! 
za8d&zuMk4v(g#Q(KRFT1gdZ>g6|hV~sffBPTuHtbJAp#kM+xQY?zQoc9yoH*Znfvu z=N8xF$7^~XhlMq>ujB7x3+wM!H>vY@RmcKWomRZWn71D&!b`3fzh3JaJy{Ye>L2@Z zUZbBEQxM)~drrnF_N4RlsLcj}8jQ@-vmA1|{Dd>Gas}I{1-Dw(cD42dK40wU=cKqW z1GK*^==Y!O@jAAq`?hR%J_Dz`?W~90vh{6uz>6jFR+MsQV!x|$FBeHk8oXafix-O& zzW`*@=+6CM*Yl3)(BZ*U0C16Ov2xJS@-bdI8T%(?FhC1G%6`5PAo0~=w-W_H z$D(SsX(0jxMX{drk##ZLI?*b_8CIJiOmaS&6Mt9|hK+C_i5fA;p@!@ug9{utfA8}Jny2ou#iwlK;_4&g-%Ae5F2RD^y zvRdHa0g=gfgglTZOsp~_{Gg%_ZDFZY%`K_9qEsk)3@e;E{iL*~weIjq0_EfYnWC(z zMxu69?q_UZSkl1MK=f7`&L5O`?lb~{kG^ptVA~L zmv=Q6D+Ek98nb%znYI(-WXHR$t<=A9dpc(Yi{ZI_nmPM@+9f$^H?=OT7fHC3s6Ex1X zE%jXPjFD~)mw@EDkB#pRhPBVrUVIM2(AHjz_LDu`mJiW;kDUrC^Pxx!D~v)u2~m0W^o(<; zttEYnbFix}OJ_8z8%`u?W9P#Ym*Iqvr9dpW@HVkO@nB-RXhN&0 zmTAO&?2nlnqhwIy8cqfcnqq?in~Omlh`B-AI(GBF;|^RXCNXx1Zo=v8o=VE zQ3gn`>0J;+6K!)hwQm&9LP^ag#snXxGQQjhT6;zD60qsxC7REA51>A2%ym8yxCk(v z(4QFQ-`Hh3?2)K-6fblD39A84uxso0VP8I6ZsLvb4hD@T-JFc)>ZZoIt$7~p&@eFw z`&}*CYz-1$=&d?{N6Sp0#t7}{g2AisBy&o z68;O*tCb)dkmG`O6nw4BV+qhLi<0M4#hiYks?ji8H9$&2P#7@+!N}KoUQ-0}5dM@T7KLnEa*PcS{{W$g;^o@|QUX~@x{3-U3nXi-NB>UF_#*kc zu}f-un(uv;RnIA6vW;BzQ4`EN<#DD9%xxH_;h>cA7kT*qYV9k)qHMbUMMb1RK)RK# zWnq`@4(TrGmhMtIB_&0=5s(H6NtJG-q@@L=1@V8cK0ZDy@B6*)_g#Ok^>R7P+%xyg znKN_F{LVR?LHxFu1>;gQHgzFjY);bJq-fH$7VE|(;xnUHEm2=)#cr7E?)2Zho}$lb z6}cU@AM`{uo~Odi#ANsJ^z(;QrXBM}Lxwosxv`I!m9Sp05}@(KK441rL;wjPu%Qv$ z*hZMeIJvuUbK&jd(Z@*uv&SU(Vy@U-Yq_D-8B)qpb|7qq`pxbM>TVw5%DA`09qEp} z9Fcw+BGfx8M762bzY8K zz7Orn7@zN81ba?X#q{odm+OK)=vP6>-)on@=}}%0`w+oZ#?Yqo1D#D>rYy5@4qB0^ z6tk!mp>l7GcJLT`ZX#vPUnCEupVaC4xoDm5^=gV%*FIA4iEuq7BvMb=5h z9g>oT5v8J=7z-H*jUL5sKgCoO(Zq#J5-}4X8V1=KZ@1NH$y3aIc;OdM3PGIQCR$G+ zdCH0Td6T%6(Yvlja9qb^?x|*u-QkU9Xw+)_Rzagz!9xv0*v$%F;sqRnwbqPH8;U%$78s1P7)kK8Q_WEjcQMNH1~6x^F_S z;c(rlUl;5Q^%YfpwYo|=)g+2a`T3cKnfYmLM~LL0UtqM1Zq41B5BY9dT!6P0zw$UM zsU585J?JkkP2YL`{e6bM=T~!)xD3~(eu;s^5B&b!wC6~PlVoi4TzHK+rAl^<=3Ryr z0U?wR0_Alp-s5H1;XJhtj5?2S^Og2;W~!3(L_1>3_z|jS67Ay8TDDdzu(vAG`F;A` zAaf=Q4_ybN7}1FF$8`>4E<`P?{lH|}PCm*{$RLGK50SoQ0|tP3pUFvvdc=tI)h$f0 
z6{A;%uJ@X#633$1H(R)FAuvB?dE@!!=0m}S2XA~Hhae~*3|!Mk91EY_DC|rRWe}7sJMU;dB7ulQrrw?H2);JNE3QotXn?AuO^rh_5{guhS_blo*Z?hk<|CL4H%gZ zUO)Jt6n2E67mbd`&oV)or9$oIVr96(asVDO=A&l*+FB}e+HRw?0f>{L8sm)#+Z)Vn zatilm(=m8_gmr9coM*>ITIo|rn7g|oqGmR)D}~UKx$;j5B$TSeiJs=y7S34JaG$dj z$R>&~H*kUqavJK3LBtMP>BXDHZknbFs`zt`i`2D2e&jDI4o3-1LGkWUFY0e9)NPD3 zP!ts{foP;?Wzui+AW0yp8sGYi(`+2_X;@Lu8HvH!HS{{kme9u$coUnOey0XhcvbmdS!2RG!VY+_YTu~f zEWAhPCH9c@W-Q&FUR2Mdu3V*}A@X?@%*v6pK7J@T{QC9DD{}t&_~^mNFX^;oUKY;{ zdiz(&E4O+CAM(a})h3ZkmJMg1MvpMuc)dOu(va&nYBuEOHbT+DA8Z*oF??%UN@Ij( zT29A;xN>0pM56W9Yr+=)#aVZz3Vz$E3R~~x#<9kY8+8ig)2&`#dp0^oH=d+n$_@J= zj1GEs{wssRZ%<}0#~)`Aj^Bvu{}paoT}xhGQtc0J`KK{K!N|ta!Nn8>4#NyUSz(Cc z{}YB81q7(Se`A<`qJIDA^#7}Ge<6*3*$!a#gI_iT_-FjWF8`+f6WF0td6cK=;jChMw{>18&e?ACzOAUNzGzxZ0t>w} zj@3r6&GnZFH?=KnQBCQsr!ZJZ+Jqx+zjX*ikVH%hhzy}$A67OC3_$gF?-WMDdrGao z$VFC1NWxd^kv@MM$C0&euZ=x%Mm zV$PIib9vk7X?}c80lxkO7dRHwLBmEP*0@Ssu|nIRIH7&tH%_uGKiq};PBQ4auH70&&>1lM z*z#j{_tC5(l1*%R5VOBe`#Gb9lXK4auvjeGcfY6J{X*NO0>y(bltY)G2xU^Vn zM`Lh=*rl^n!waM$AzTVEa;GM=s42l*JVRN=>o@NbIpJ*GtWK7}p^Ijx8*Yt}(t<{{ zU3hL6xQP(R9yUkBm#_^9Gwc{*s0&SMGOQ18rsLA9#i$7cG2a%gDmYU7Aa{)2O?G%t z(T3zz$go3DjgQ|)J}SyIV^pNCJKmovbbIO{7w$u&+Pr+7zS1HW9sWcX*~yg&7TKu0 z=7t)bo-u9bY8MYj<~h|h{m37$XJy_c?T?8Xq=7hzpPns{Jy zO}vup-27@Ec$vJc(q9wBF}wiX%gApQLH^>(1RN(~K4a}9ZdsO$y=6b({Xnu1t~*GS zD?r5fGK<>+nq}MGv|~iXf2~Q6OUzEnI+?ZjRPcI&^K4#d8Tak1AII2Oog?K;R3UrL zq1+b)SfE8*?2(SVu#xrkw0aPwDpOZ1wsTX4hA8L3H9hnFskC<7MS(@6S8}ryWQm^> z271-Liky}=D$n6GIuS4r>1*0w&(;jQX=`uCl8yW@Gzekf1{bLp+T2$c#Wx2;G}(gN z>~+dD#KX_61#ZcTKl3<8TSBAqNm(Os#%FtGSd7NR+Dh>ttP0^sX7C8Di8kX~Ua4g9 zjv$&=%=}6~jQC^po*UK?-t-_7+Z3=#ORZlO+raJR$Ym`5FUwnfz2-7ERhICu1s6SP zABmt^p9t~ujCidg9=aGtDt4KDSMGKk$E-N=rJB>+CJt_brsgBL)_!rZrQ|htlrXc; zv9}o$g;)e{-y#J+5x=%|AI$O4q5DoKB|}+zwunyRGh=ZgD`RO-Qr`_Z>+BRY!|;30 zi_BzG$sb5H880cEdBOcspsv=Z(IfD+iQ`U++DCPDs^`YTBlL$S zE8XJZ@4dA3GE$F+h|Jk>UOaC#)vTm@F&8so1=V;`2Xnf>ZH z4lhWMoWP@#Ol?`d&p)WPIDfC|u4kX278KyLZ0INw$k$gB)M)KBFPPEYk8}E*Zc~@1 
z^Y($}$@>{Ox8*dX2yoh_`3L2fINQ&5?9$czjJ}o3-etP6rsQvMsES+B#q4`nfS5s? zMS&r(O-XIKF<@QrU_a^sgE}o`@&J3jI(~1#*u{m+hMk;jb7a48`|4@8R5XcXPg1Pa z>652ok0QE7h8tQNUU%|S2Gw_G1o%7Pw@Hcxiais=swuoPv>d_k9qaDSTMF-{7m~O? zjKfpH-UJmGp`qK5l_ai8)P_BC-`OmvG|X$%`EK;e+iKyNU0Hb5-j5(IVr)o8UXjlY zq)dX+>^F%{Pb7w#9z*I(R7sQ6@U!LFeCFefybNhNU%1|LPhZm1zp%I){J|XMLcCUf zj?qSywes1S7OnMR%Nv?Py^x^YAk^aaiw)*4J=?9xw*{d>wem6jLZVX_cLnFEbcVh( zC(~KEjy=eJhZvePs+Hr`t$jGFeP~;IAWl#)cZS(U`T5Jox5r~veQMi;68BpzqlY3X zFuD>Ek};GLlLn8K4iwe27+MzPnG^3mC4Psqntg$ka*xdTYt)lFF4v@u1{Tamw2BZK zasz$wKiJ-b9!Z50Fc@GA*awP*M}T!(&P9bJ+B;j?vrT7jlWg54aWY^hLoSND9>RALl%Ij2Qwd2DA-~>!Lj0q($R-QZb+djU z7Kif+l%kH_Z@o9&b*i}CDAd!gMRpu3c zPga6iVl?6LQVZB=AIGp&KETH@%KOgRs0i=%Bh0d5%XIP;yL&J}RZ(^A4(rZh5I)j$^% zU5Wg>zQ?Af##x@9mTHMY_~pN+;#FZYTIKg?`F#m(WW7Iq{?_O7^Mr$`V^%+jcWUD~ zL5V4ty|m4f1yg#PT8A?Omakpq^nOSUDs`3!$p<9mx#c2;@&&2iw6T{(Lu7=uUjK5? zjzfrf&CiAHn)Iy?2v3N}h}jUU{mmdknT!aA9hi8S=z27*FX^7hWLbcOzeq6H-+whY z^`$=1gLx*9f66efD=k5Nt=LIJ6P!F=xwQAjqo&oHPiJpj)DAhiOxtU4Hw~&V_eHnM zB8j`a;n4E#d&lII)xA0ciJ7czyyvpm_)LBy4XeI)Cq9X-#v`WNazg?zhS>d}?9?y@9fclLg!np8r$-}kBXS<~yKZhtwk<(HxeX>aetAM`O(J~PR2QCsE{6;uD@cch_~ zuRYF+G}bn%KEc+nyyUo=mn-+KsBF6FEZ%&czA_Iyu_qA;E z9vUNSBEvU>B!*ijlEeZwdpw!wb%zFCci!My;tt5%6)?kHdZ8)<#4PllF3h%S3@{tA zDSkUsBAM2QrN+0Z_{d>5$8+#tBi@X6*FvYZ?4^@~?0#}MUaz+KY_rW52Ji7DbI@M> zlRZ17b1&>469-%i+;12kLlMZYx7^fydk;D08UZ2!YT@-+L<>ac>$Mo)$Ri6VpU0XK z@H-Ibfg;ODBoxRcCed>!JLJz62i$P_UC^FMR%wQMJB1uVF>`jwB&;t`Qd(^^e{32a zq#APAxWw~qaeWs|E}5P9SNipDhh{K{>+c+l5SV}F|N6pSMO#Gpf%+f)`qvfmA2ks0 z3x@eC|DSYf3LtpdC6)Sb)%dqM`{)0wZnjt5r2qY8_rG|HD|g%!I{)9?Z2zT7it-;+ zQV8s2FS*(Nr2L+--zq8OmpASol@tWa-SnSS(q>IRd)#r-A1&V<+@v#U5c=*ferRS% zn#cAhLe4Vb{wyLQg?4kD)PMGVL$VkZAzIObkR6jIcTDkZHG9=>9Vnqp_`#1J=Ee>y z6{rc9yc{FUyV!KjtyR~??LVsWdM{nbej4B~-}L>$d)MdVTC>+{FaDBAF$*T~u!8Qq z#m|t+GKV$FvBWZaxAp;&$4qt`JFz&mnnFO-wyl;tMnhh*donI_m5w|1UZdYtz3$DX zhK?r1tnS#~vvDO|Sve|u!F$K`li@oMqecx{7r6fmsoo-QaCbqSVQ*c~YpSqw2fx7Q zv(1}FFDTFG-}!$YkInMFd#yzytgUzSMuJFPTsoJGq}BtIA3DVmH^ut7%7~lt3`*;* 
zK5_ThYaWGv%h38duJGf_{C+I!{^(~nZw$#E`xbiJpn#yqeOQ8Nu72!BH62ltMMatg z+G7l3TS^fT=DhShFEY;*$t72d0&KHt3e%LxCGg^=^)iqm!jO0U$c**#iu43N`OI5#R%B^YY^2C;-Vhuj<&(pt;UQhPS zcdb#J=V_xL_OwQ5B=v~?q>Sa+z4V=MS z3|I!)`O>SSb0Yx4A&>sFF`$jbr9UaWM?aA!hjPTkc~c3T8& zw;!+a?dgOpERd`aloRh7?Xwe=SA7o|l}GQvPRpBecFkm6wGRGn#g!lxE< zGMxo33<9T$<>Q~@?9&K8t9geM_P5g67s^MSEHzt|8z7(g`}BVt@0EF;3^+GV|ftS%TWi+(|rA>U@d=+ZD!I_vL2SMSe7 zXiRb)6|NkPA3|L>^9&FMIxNfVn~*v6A8I6XizBPb}0@(nsC;}Pav$BGh8 zV@Z9y4*3L{Xj9LEj!G=!bEM>t&K;nsWlq{UPkgXGplpn}J1Q_8ebj1>;wD94&S%@C zfT8_Z07~W9lWa$lE5|^|%Gv%p_9+fgki%n-~I~;TV=}fkF0<0J}OvwQw1RadGtax;C2v z`Olb+zD_2+p}Fy$qcKW5Ihu3yL1{PH-FyO94aV0<-YzOnUOk_GGq{>>1SKipeWl|z z>^a@Xn#13naq4qGyPfcCon59CyJ_W)-_!AihdRUqw=53dY6~Qfu!7Q+vl!2WC{3szvt= zn^fr$WMQW;u+Ol(SH(%OVvo=31zVa~hHYz@?3iXyZP{%2Upw&fkx`r&(DrQ3n+Mn~ z>-C%k#aa4}fR-`kRJq!5?b~xo154yB4Pi+u%P8sSg0R)isGG@eQ{ece$WY@)sRvSKsU#JSfNiE=A7!>`G%~KsHx+m0hV;z z9+^5PQ4!yR_`n}E2r4{KzGD`Xx%PVqeI`Ohh!5l+y}G4z&9%`ZLasow*Lg3H@f}+~ z#zIPt!v}DaBo2R#A@ei`TFzY3HT=zkE|p5B(vTQv=(!@3&n5iF&k+g)-cZFO+YSTiA*B?$YL_#-8ezR z^em&J+|%*e?iH&B(lqHk%{xz|n79zpAc${pnA=0G*q}uc{%wTo=HXMAsTBNo#W2%F z`GvI0=nW_>t&qv`z4eVB3JXPibL1Sy?__v*hQV>G8ex0(A(ob@bgocSO9ywL2N7Em z9;L5sHh$AD=y0w|@CV z$Ki(h(8>Xi&Bh{WklRr&SocL|R3h}6Z9~`I0b{3qmj%HK5JNCyyp$vxA5Jo(e6Zi% z9Ei{QW`|mmxu*Qmt#?yGCE93S#97 z^$3+17He8)gRLTHlhMJ8!Ac-+<2xI-gNdJMPP!PU?fUu55amt`pGnV*Gw#dKL|k%l1*(;?N^9^e;c211VcNy2L6><;at`Vt z{+{49MFiG}5zEMD1y7B|y^E8dJ?qe>*TK@6W5)K5hkC33Pzw-TI*TdZ988pNUYthm zX;Ue9uq=v$1F<_#VavbpuU9V`e=1x5wNeDj2=#ZgB;ckI`0aPJq@TBWmu}|%jUM+$CHtkK z{faO2A36>ch3P>k+%Og5Pq-lv8!&diu)dr?0HJ?@^#uVK-oGyAu7G|2c{#@g0iOIn zgMEQ`N&ohu4oGtQ3kCjJ*9S zpweR(a~)jgR~50s;CEbjgOD4J`+9yx?`0|c zZhXbDMErJuh!?sR=H?kR3r z2yDq^tbQP9M7sOTGMZxfnD~C^n|FzNn`uyABbzyAtzE`CspZy(=1ZqeA$mqyYRu!| z7_YBUm@FobRNSfBETzsjA0%ijnl9$AvnZ+BR5l%s9eE-Crn3L}=8(Tq$C@E-DOe5RLjtS#kP>TGhDwx1U7qQ{jNA03OU;P#( zv{xj~=Rr0@IjWC^y6F63XB#90*ZUv4qIR$<+lSk3980CCfVnH%;aD3i%%i zNRvsnF9sZkhkc4ZK+P3yr7PM6>>4J8%v%)j3wHRl+wsn3J3qGg+*ee!s=!YB@>}tE zu^J<}7URSsX5vQ_jXV#(l~0 
z*8dseuuL8b8D5RL&`O+G6D1GyGDA{pCzDyy6BPNY-xC4_29@+!JxQXO{P)CKu9;`k)7-@K?i zboBO3<$b)hBkAV6rW(<|e-9B9>FhrYbGOZS(IUp5N9Hf^f>nsWwxwuwTEN$x*lMzo z(dye_2l>rii*SP(L4jrj{Z_t@2|`f`cG@&WD_~*1@Hbn(?*+%k*>PwL)47(?}8ckTl;$J`I+%u zM{K-A0(_tPf4rWJ7VPWTy~DDE@w)eoD0@+HOms|-UUg+m#x*So9Qx=c`zPy5?X>l2 zX6ZDiobrP5jtTi6;$puyuOkb{MOZ@gq)t~hpa)!C78?{{r+xu0N(kwa+zIT?fzh!d zC8+Yga+vc&3&<%PtwXs}bn!|ksB7+pQRLY(il0Sei=Nbt+)7}kbIYQmw_>el`8Zux z?=R_oV|CAe0ZUv5M5VKzi#e>%J~iwSeU!SE{%Am=u(^L~N0i?70nhx~V`XN`b6J!< z7ggcYrjne8>30LHSUGd5v)&!Wd)S>NhfRZ6vLCmvgP(TOd260!R%?2dk0sRW@xCx; zD11S3gT=jUV!2t;tKBQb70uKy!^!|bn`Gk1bJCKxJGrSu@AaHD`?vGb^2X_ivXGj5 z|8#BJqL&8MqX|cK)}xsQcqG~KDVw7-Ui0HJU*+%vSdUmnx|0M7qQoURMh!Wz`o<)@ z+$=`3h?ddU>8Aq@4>G`$-0hrwtFGzGJ@$F8by*7DM^_I{kjyoa zD`kDTz)0%IOFgFxl!2001R;^R(X+f4FT)1%wiMi+$EMlc6{BE(!nIvNAMzdvt5d|8 zqLAKS*(ONm4hNZgTqjG|-I%tzWQ=LPN|##mqYAB*(H_zHqtL0;C+3TFw3uE|>8K@C z-2!%F$#l1SiAgO!6H>I)JZc~+7Sl3ZY+ohj(M}`YYDf0t`h-cVXw*G1;Tb)w{&FjX zC@n|?y5jI^Gu2b+EsjK$AT~SEcgpXIC3CwRF6oMO-$X+SS(>OE3t5tH*VNt8>9}yG zn^w}R_1(#QRh!$bZE%Ec{0!N6t4nj-wRN;xn{jO1sI=ziXY*kVe`2zCeOB>y@m;Y3 zvPYVQ#HVWA=cXmp!DP|Fa=y3QhsODf)1DTej^%$+49hlL)UOKFm-Oh@o;qteL_3gg z{!wlAPE2peRjHiA$rW=>-jMn|!v~>ceR+dQ}vQ&iK&-q@K znEcY@BT=*O57kRt+C69DDnL*38CHl3OxL|2T#;*#ob;b|oRu+)?k{C~KU2rqC_ATcN54Ch3l?d+<40WorYyK&P;CL_KNgRCFnMVqNNvLo^CCl*UgH;242 zAe=_V0ZnTM`96NB454zODqW2fZ|?UIhYgP+;TF*rSzi?a`a$J;F45m!5ZRx<*P45s zcYOMhP-qJ*OM`z}!ERa|$dbopHY0~W6b*{F@eb*gmBR37=*`&qtl@KPpIf-0Ar2S{ zD_!qo-RUPCZp?Q`#zarGoa<~8D5oaSN5x6Vswvlu-K&u<&s^zQUPb(BbxQHI>TAhY z{CCCdy6=@g(eFRA9n0Mo@s;u*_|TLor(#|tJEmrp)+j!gKk<&WxV)H{bcS^2X*B5% z`xRb%?sEG_POj~BWMb__(pj&(u?p5!RYfVwKBaRc9os*T4=vMfBy{I=5_nky>DgaF z9uerAI1HW83r%INmiQ>@SAF!}PbjxEU>NcZ?-{-Rx1h~DV$18lS|do%?c&}Ud z?K8-*iA2!VCbJww{`A4CO88mq(e}gIeq4MP56Hlt;gMdKQ}|k_XYgRvtMZElTA!~; zo?<&wCu2H=_$&$tZ>}T!@IJ9z=RBhD<>FQ2yQ4y{1;*o|6ek?Dc`WFJjtrtkLgQo; zd{nK5Uav&H>BKTYV5QJI@M^HIl5YBK+~*BV`f9hc!xd?gPoN)?Jxq;^u)dHZF85I 
zgZ?GEW2lGMn2B|jG^HJWOWmq0>yt*QKyA_6%K`K({tufl$4X%JYCnr4C6OEOUkbV46?IzbSN+dz#r~G$f96Y41GJ&Md!uuw;cZtrJGi$}~8)OX! zQa!KvTHo`oGIaQE52t??ojStej0z#h6RTmEPWxFtU(nm7oW{Ymu)PLnzJvI0r3<6^ z=rq~yb1Ss)zMMtAFky$%f2J}y6 z*=uZ!{PW;=rv=*~iTZ7z@Ua{gMml!;VnRUx7E8#W;gh|lUChOB;oE~h9tGm(0%cJb zcdipJikl-ZcA5L{#eqfKEqg|VZ)>!#dsM5%i?ce`Bb6rEP1eFhKGUNUf} z9X}npJr*p&mHO2gtG#5=NtMFe0cG)FnQ+194%-6T*Ul5rPn-opPg(D~5C|}pw~ya4 z7+dm&5=5iahkANhu-9 z`8Lqj^9av_{f7L?lL(9xq}FHrJ$yYhB-3g^Hzv-aNENg+f?S@p-pxf7ZX_u4^Nz-H z4_vtYDdmCHrwU7&qP;eB@=t}hU4j+2a4mP9Q`T43JM=p(i>!5zkDz3ZR6HFzu;Qa40$VOZ~AMHW_7jqW^U}HGsAs1 z{mS!?5ublpr5p$y_DyF|_tF?l?Tw>ex2qHomtJr@mPhu~l@&K4nrQXJI1!gsXQof9 zYpS4nRmM3yqKLz)uiad>D06O!$BYzrg3UF(sUVV^O=ul`AQ<<3B;j9ad%r>0U^Xxa z_IH?;2ZH>DGtgPPn7A+-IvQJ8I-43hyEvM%m^&Jpnc0}q|0(<%7eEJu3!qR?q0j^( zQL_LaP6{O#Lq}&%3PmvqQNY$^>hAof77#T1mr6+|Z)t2{Xlg^DXzFO}@s9$(bxt}3 z+eel__DEQ(6r3y^EFAwR^c!M9C&t6h1_aXPW`_X(*x5O>+1O}-&!3|I)L<>0fk*v% zOTc6(?BZ-;=STti*(Kn2MMpam7h@pBE(r1$(n1tStqX)?2a5cK1^7t-LcxmxDR@yp z(7yt4%iEd22tcu4gg-VkcSln*0BZq)f>GH1{-J;Xx)>*g8O2W;2+9G5Av<6{6pwz< zfPmhdTz~@mPa5!&>|kJi{z-#D6#F2!Q>+M1wsK@C>*_V}r7T0ks;A z1_r@$-@$(V?jI)@3^V$|(YQF_+l7L-A%I=sQaQj2gB!jL2$&0qU2wS^gyUzr(aUu~ z9I&YGaI~Kh<}cBJH{*tHmz|9h2zPL~E*mF1H~f3BaY6yL{Bk)k7|IUcKQISS8?GD& z8w94FU+N178~2qqKwRA1@O44lTtI9FxVq4*ya7a~=Z5bKP#2cX;8MH57{DT4T%vJu zL7;%07mmiw4PO@uSXbcdLV@XkAA4YqU{>Nw&x3Gq!1Ef6o!Q{q;DCUDRdBg3K)b?2 z91w0;6pl-6a6s7rw-b087!2PAU{nVT=$Fd@ybH@~`_JbA49*Tq)qRNu0daxh$AA;e z1w`DqRE`Z8Q+PfA))trxjt@YbPzWd7ya9sd>Ns-%0YKpT2aF$->&n^(#uUDPAa*b; z70^H5gAL$Qm|VI<1Nan3Z*Z9gtdA?>#||(8TwOq10u5iT3mEcX*$^&24+{Qi-@DW< zlpA`b4WK7icmoPzld~qT&fF<70koy5)F_WY;f%Y`Iflh*9Xi;3kdacsVVhHMSDpvr2E(sk*x1AGt017yVEA@{xrE>2 z0p^4G%v~NAFa!cGH$WWhY%u@COY;UCL12-U;A!yt5wHV5uk7bsz_h`Q6&p9u8+hIT z!;WWgZ2*0SmvtautggNrlpQ#IT8!R>Y zr7-~37W`bo+JJe`UM>ezhHn?l2E8)Bz!wE-I|n78adpU1`x6oelupwG}N>j-*f z3;;m_G;;ZQATT%ERX$*Yz@KSgym6&3fK0ryCj)cN3AcV>eZF!|1m@_9i~x8D{(J?K zYgc$2pux**V4nuTpB-U)H~g6pCS!p-MVH1DNcY5c<$XDUGZOrm0+NXxUTy%Z1Ev%F zbBCA3c9jZ0os+d 
z1<>Hn%0S&KyauCPnM**o0y1L#Juc3UhCrqz$Di4hR4hGV8XMretZZioWD0}r0WfsE z^dmDnAaB^O?_xmaFg`BWjtybw;x;k`gN%R?s`OiguT~0bV X1A6gKoe=0P2Nx&G{rlny5-9% + + + + ActiveLayerIndex + 0 + ApplicationVersion + + com.omnigroup.OmniGraffle.MacAppStore + 139.18 + + AutoAdjust + + BackgroundGraphic + + Bounds + {{0, 0}, {756, 553}} + Class + SolidGraphic + ID + 2 + Style + + shadow + + Draws + NO + + stroke + + Draws + NO + + + + BaseZoom + 0 + CanvasOrigin + {0, 0} + ColumnAlign + 1 + ColumnSpacing + 36 + CreationDate + 2012-11-13 08:14:49 +0000 + Creator + Michael Percy + DisplayScale + 1 0/72 in = 1.0000 in + GraphDocumentVersion + 8 + GraphicsList + + + Bounds + {{647.625, 256}, {49, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 262 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Align + 0 + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Tablet\ +Server X} + VerticalPad + 0 + + Wrap + NO + + + Class + LineGraphic + ID + 261 + Points + + {507, 303} + {507, 243} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + ID + 260 + Points + + {389, 301} + {389, 241} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Bounds + {{410.75, 242}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 258 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 
0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet 2\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{293.875, 243}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 257 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet 1\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{275, 232}, {437, 80}} + Class + ShapedGraphic + ID + 256 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + VerticalPad + 0 + + + + Bounds + {{647.625, 350}, {49, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 255 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Align + 0 + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Tablet\ +Server Y} + 
VerticalPad + 0 + + Wrap + NO + + + Bounds + {{647.625, 442}, {49, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 254 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Align + 0 + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Tablet\ +Server Z} + VerticalPad + 0 + + Wrap + NO + + + Class + LineGraphic + ID + 253 + Points + + {507, 489} + {507, 429} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + ID + 252 + Points + + {507, 397} + {507, 337} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + ID + 251 + Points + + {389, 487} + {389, 427} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + ID + 250 + Points + + {389, 395} + {389, 335} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Bounds + {{528.75, 429}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 249 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} 
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet +\i \expnd0\expndtw0\kerning0 +n +\i0 \expnd0\expndtw0\kerning0 +\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{528.75, 337}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 248 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + b + 0.201172 + g + 0.760682 + r + 0.932896 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet +\i \expnd0\expndtw0\kerning0 +n +\i0 \expnd0\expndtw0\kerning0 +\ +LEADER} + VerticalPad + 0 + + + + Bounds + {{410.75, 429}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 247 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + b + 0.201172 + g + 0.760682 + r + 0.932896 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet 2\ +LEADER} + VerticalPad + 0 + + + + Bounds + {{410.75, 337}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + 
NSKern + 0.0 + Size + 10 + + ID + 246 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet 2\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{293.875, 337}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 244 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet 1\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{275, 418}, {437, 80}} + Class + ShapedGraphic + ID + 243 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + VerticalPad + 0 + + + + Bounds + {{275, 326}, {437, 80}} + Class + ShapedGraphic + ID + 242 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + VerticalPad + 0 + + + + Bounds + {{645.625, 164}, {53, 28}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + 
Helvetica-Oblique + Size + 12 + + ID + 220 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Align + 0 + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Tablet\ +Server W} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{499, 107}, {17, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 217 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 . . 
.} + VerticalPad + 0 + + Wrap + NO + + + Class + LineGraphic + ID + 213 + Points + + {507, 211} + {507, 151} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + ID + 210 + Points + + {389, 209} + {389, 149} + + Style + + stroke + + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Bounds + {{528.75, 151}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 207 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Tablet +\i \expnd0\expndtw0\kerning0 +n +\i0 \expnd0\expndtw0\kerning0 +\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{144.625, 336.75}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 200 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Master tablet\ +FOLLOWER} 
+ VerticalPad + 0 + + + + Bounds + {{60, 325.75}, {179, 80}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 199 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Master\ +Server C} + VerticalPad + 0 + + + + Bounds + {{144.625, 243.25}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 198 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + a + 0.75 + b + 0.380418 + g + 0.323697 + r + 0.101111 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Master tablet\ +FOLLOWER} + VerticalPad + 0 + + + + Bounds + {{60, 232.25}, {179, 80}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 197 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Master\ +Server B} + VerticalPad + 0 + + + + Bounds + {{293.875, 151}, {75.5, 58}} + Class + 
ShapedGraphic + ID + 188 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + b + 0.201172 + g + 0.760682 + r + 0.932896 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 Tablet 1\ +LEADER} + VerticalPad + 0 + + + + Bounds + {{256.5, 55}, {234, 27}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + HelveticaNeue-Bold + Size + 18 + + ID + 185 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 2 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs36 \cf0 Kudu network architecture} + VerticalPad + 2 + + Wrap + NO + + + Class + LineGraphic + ID + 184 + Points + + {60, 94} + {696, 94} + + Style + + stroke + + HeadArrow + 0 + Legacy + + TailArrow + 0 + + + + + Bounds + {{540, 106}, {53, 20}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + HelveticaNeue + Size + 13 + + ID + 167 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 2 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs26 \cf0 Tablet +\i n} + 
VerticalPad + 2 + + Wrap + NO + + + Bounds + {{422, 106}, {53, 20}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + HelveticaNeue + Size + 13 + + ID + 166 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 2 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs26 \cf0 Tablet 2} + VerticalPad + 2 + + Wrap + NO + + + Bounds + {{305.125, 106}, {53, 20}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + HelveticaNeue + Size + 13 + + ID + 165 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 2 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs26 \cf0 Tablet 1} + VerticalPad + 2 + + Wrap + NO + + + Bounds + {{138.875, 106}, {87, 20}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + HelveticaNeue + Size + 13 + + ID + 164 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 2 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs26 \cf0 Master tablet} + VerticalPad + 2 + + Wrap + NO + + + Class + LineGraphic + ID + 263 + Points + + {257.5, 498} + {256.5, 138} + + Style + + stroke + + HeadArrow + 0 
+ Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Bounds + {{144.625, 151}, {75.5, 58}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 10 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.953631 + g + 0.970953 + r + 1 + + FillType + 3 + GradientCenter + {0.1333333333, -0.11428571425714285} + GradientColor + + b + 0.201172 + g + 0.760682 + r + 0.932896 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc + +\f0\fs20 \cf0 \expnd0\expndtw0\kerning0 +Master tablet\ +LEADER} + VerticalPad + 0 + + + + Bounds + {{60, 140}, {179, 80}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 19 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\i\b\fs24 \cf0 Master\ +Server A} + VerticalPad + 0 + + + + Bounds + {{275, 140}, {437, 80}} + Class + ShapedGraphic + ID + 201 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + Text + + Align + 0 + VerticalPad + 0 + + + + GridInfo + + GuidesLocked + NO + GuidesVisible + YES + HPages + 1 + ImageCounter + 1 + KeepToScale + + Layers + + + Lock + NO + Name + Layer 1 + Print + YES + View + YES + + + LayoutInfo + + Animate + NO + circoMinDist + 18 + circoSeparation + 0.0 + layoutEngine + dot + neatoSeparation + 0.0 + twopiSeparation + 0.0 + + LinksVisible + NO + MagnetsVisible + NO + MasterSheets + + ModificationDate + 
2015-09-25 23:49:35 +0000 + Modifier + Michael Percy + NotesVisible + NO + Orientation + 2 + OriginVisible + NO + PageBreaks + YES + PrintInfo + + NSBottomMargin + + float + 41 + + NSHorizonalPagination + + coded + BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG + + NSLeftMargin + + float + 18 + + NSOrientation + + coded + BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwGG + + NSPaperSize + + size + {792, 612} + + NSPrintReverseOrientation + + int + 0 + + NSRightMargin + + float + 18 + + NSTopMargin + + float + 18 + + + PrintOnePage + + ReadOnly + NO + RowAlign + 1 + RowSpacing + 36 + SheetTitle + Canvas 1 + SmartAlignmentGuidesActive + YES + SmartDistanceGuidesActive + YES + UniqueID + 1 + UseEntirePage + + VPages + 1 + WindowInfo + + CurrentSheet + 0 + ExpandedCanvases + + + name + Canvas 1 + + + Frame + {{47, 80}, {1391, 947}} + ListView + + OutlineWidth + 142 + RightSidebar + + ShowRuler + + Sidebar + + SidebarWidth + 120 + VisibleRegion + {{-250, -126}, {1256, 805}} + Zoom + 1 + ZoomValues + + + Canvas 1 + 1 + 4 + + + + + diff --git a/docs/media-src/kudu-tablet-flush-6.graffle b/docs/media-src/kudu-tablet-flush-6.graffle new file mode 100644 index 000000000000..0a5c3bec4c34 --- /dev/null +++ b/docs/media-src/kudu-tablet-flush-6.graffle @@ -0,0 +1,4781 @@ + + + + + ActiveLayerIndex + 0 + ApplicationVersion + + com.omnigroup.OmniGraffle.MacAppStore + 139.18 + + AutoAdjust + + BackgroundGraphic + + Bounds + {{0, 0}, {756, 733}} + Class + SolidGraphic + FontInfo + + Font + Helvetica + Size + 7 + + ID + 2 + Style + + fill + + Color + + a + 0.61 + b + 1 + g + 1 + r + 1 + + + shadow + + Draws + NO + + stroke + + Draws + NO + + + + BaseZoom + 0 + CanvasOrigin + {0, 0} + ColumnAlign + 1 + ColumnSpacing + 36 + CreationDate + 2012-11-13 08:14:49 +0000 + Creator + Michael Percy + DisplayScale + 1 0/72 in = 1.0000 in + GraphDocumentVersion + 8 + GraphicsList + + + Bounds + {{590.31354949367233, 
527.7891845703125}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 442 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=5} + + + + Bounds + {{590.31352088344283, 505.56698608398438}, {126.38891410827637, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 439 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=5 (val=1)} + + + + Bounds + {{408.83619447277601, 504.64398193359375}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 432 + Shape + Rectangle + Style + + fill + + Color + + b + 
0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=3} + + + + Bounds + {{408.83619447277607, 482.42179042270982}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 431 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=4} + + + + Bounds + {{429.03063540606638, 464.71344148841285}, {92, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 430 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} 
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 3 UNDO} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{400.53063782557308, 460.25977013668307}, {143, 75.302757263183594}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 429 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.853841 + g + 0.560202 + r + 0.23762 + + + + + + Bounds + {{72.870914939013616, 515.66392968780144}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 428 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=1} + + + + Bounds + {{93.065355872303911, 497.95558075350459}, {92, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 427 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 
+\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 1 UNDO} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{64.565358291810625, 493.67780258829634}, {143, 52.000007629394531}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 426 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.853841 + g + 0.560202 + r + 0.23762 + + + + + + Bounds + {{239.16112667355992, 538.8125}, {126.19647121429429, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 425 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=2} + + + + Bounds + {{590.28745505702682, 483.34477242907275}, {126.41497993469238, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 421 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 
0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=6} + + + + Bounds + {{594.33953612428036, 458.8304985382091}, {118, 28.880979261988241}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica + Size + 12 + + ID + 417 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Future Rowset 4 UNDO} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{581.83952633675585, 461.25694833797661}, {143, 97.743049621582031}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 416 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.853841 + g + 0.560202 + r + 0.23762 + + Pattern + 1 + Width + 2 + + + + + Bounds + {{-106.67912496096972, 425.6421488592494}, {265.28924560546875, 28.880979261988241}} + Class + ShapedGraphic + ID + 415 + Rotation + 270 + Shape + Rectangle + Style + + fill + + Color 
+ + b + 0.952941 + g + 0.898039 + r + 0.788235 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 DISK} + + + + Bounds + {{-75.756295889511676, 132.52576077295058}, {203.44358825683594, 28.880979261988241}} + Class + ShapedGraphic + ID + 414 + Rotation + 270 + Shape + Rectangle + Style + + fill + + Color + + b + 0.764706 + g + 0.968627 + r + 0.796078 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 MEMORY} + + + + Bounds + {{631.67637159621972, 261.64463122384166}, {49, 46.941173396060925}} + Class + ShapedGraphic + FontInfo + + Color + + b + 0 + g + 0 + r + 0 + + + ID + 409 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 FLUSH\ +TO DISK} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{608.37446616712191, 237.52359553489259}, {94.214866638183594, 105.11808013916016}} + Class + ShapedGraphic + FontInfo + + Color + + b + 0.259317 + g + 0.366908 + r + 0.536095 + + Font + Helvetica-Bold + Size + 12 + + ID + 408 + Rotation + 90 + Shape + AdjustableArrow + ShapeData + + ratio + 0.50000017881393433 + width + 20.000001907348633 + + Style + + fill + + Color + + b + 0.692953 + g + 0.887133 + r + 0.853588 + + FillType + 3 
+ GradientCenter + {0, -0.22857142851428569} + GradientColor + + b + 0.303015 + g + 0.723636 + r + 0.933288 + + MiddleFraction + 0.4523809552192688 + + shadow + + Color + + a + 0.4 + b + 0 + g + 0 + r + 0 + + ShadowVector + {0, 2} + + + Text + + Pad + 0 + + TextRelativeArea + {{0.125, 0.25}, {0.75, 0.5}} + isConnectedShape + + + + Class + LineGraphic + FontInfo + + Font + Helvetica + Size + 13 + + ID + 422 + Points + + {52.325008607912196, 72.672246726576759} + {744.4749934219725, 72.672246726576759} + + Style + + stroke + + HeadArrow + 0 + Legacy + + TailArrow + 0 + + + + + Bounds + {{611.64866664024271, 350.13513272118695}, {85, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 398 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\i\b\fs24 \cf0 to be flushed...} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{590.14511678418546, 399.30488493311145}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 396 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.360784 + g + 0.254902 + r + 0.129412 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} 
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +k=6 (val=1)} + + + + Bounds + {{590.14511678418546, 421.61687860590104}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 395 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.360784 + g + 0.254902 + r + 0.129412 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +k=5 (val=2)} + + + + Bounds + {{594.322197851439, 373.13771853237409}, {118, 28.880979261988241}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica + Size + 12 + + ID + 390 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Future Rowset 4 BASE} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{581.82218806391438, 373.91127582226795}, {143, 78.154823303222656}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 389 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 
1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.362305 + g + 0.254092 + r + 0.130547 + + Pattern + 1 + Width + 3 + + + + + Bounds + {{408.63645209957929, 398.91456168798953}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 387 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.360784 + g + 0.254902 + r + 0.129412 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +k=4 (val=1)} + + + + Bounds + {{408.63645209957923, 421.22654415179437}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 386 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.360784 + g + 0.254902 + r + 0.129412 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 
\expnd0\expndtw0\kerning0 +k=3 (val=1)} + + + + Bounds + {{413.03065038578097, 371.94740724269724}, {118, 28.880979261988241}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica + Size + 12 + + ID + 378 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 3 BASE} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{400.53064059825635, 373.54741078752795}, {143, 78.518692016601562}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 377 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.362305 + g + 0.254092 + r + 0.130547 + + Width + 3 + + + + + Bounds + {{239.16112571988552, 516.59024908838239}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 370 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} 
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=2 (val=1)} + + + + Bounds + {{259.35556665317586, 498.88190015408577}, {92, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 369 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 2 UNDO} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{230.85556907268256, 493.55081624408638}, {143, 75.302757263183594}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 368 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.853841 + g + 0.560202 + r + 0.23762 + + + + + + Bounds + {{239.3535212862345, 394.98443806268727}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 367 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 
+\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=2 (val=3)} + + + + Bounds + {{260.04796221952483, 377.27608912839042}, {91, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 366 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 2 REDO} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{231.04796463903148, 372.99831096318206}, {143, 52.000007629394531}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 365 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.853841 + g + 0.560202 + r + 0.23762 + + + + + + Bounds + {{239.35351961840817, 455.38734597373895}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 364 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.360784 + g + 0.254902 + r 
+ 0.129412 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +k=2 (val=2)} + + + + Bounds + {{258.8950680418248, 437.67899703944215}, {90, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 363 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 2 BASE} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{231.04796297120515, 433.40121887423379}, {143, 52.000007629394531}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 362 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.362305 + g + 0.254092 + r + 0.130547 + + Width + 3 + + + + + Bounds + {{72.853493606278633, 395.33092836562167}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 358 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 
0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.85098 + g + 0.564706 + r + 0.239216 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=1 (val=2)} + + + + Bounds + {{93.547934539568956, 377.62257943132482}, {91, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 357 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 1 REDO} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{64.547936959075599, 373.34480126611646}, {143, 52.000007629394531}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 356 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.853841 + g + 0.560202 + r + 0.23762 + + + + + + Bounds + {{72.853491938452308, 454.98383627667334}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 348 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + 
GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.997488 + g + 0.816758 + r + 0.674172 + + + stroke + + Color + + b + 0.360784 + g + 0.254902 + r + 0.129412 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +k=1 (val=1)} + + + + Bounds + {{94.047932871742631, 437.27548734237655}, {90, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 354 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 1 BASE} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{64.547935291249274, 433.49770917716819}, {143, 51.903518676757812}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 355 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 3 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.915711 + g + 0.744622 + r + 0.515742 + + + stroke + + Color + + b + 0.362305 + g + 0.254092 + r + 0.130547 + + Width + 3 + + + + + Bounds + {{405.83619370825545, 144.14580997515895}, {137.6944468900009, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 341 + Shape + Rectangle + Style + + 
fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +REINSERT k=4 (val=4)} + + + + Class + LineGraphic + FontInfo + + Font + Helvetica + Size + 13 + + ID + 340 + Points + + {562.16943883112856, 337.56007107904719} + {563.16943894739666, 598.34708857427268} + + Style + + stroke + + CornerRadius + 7 + HeadArrow + 0 + Legacy + + TailArrow + 0 + + + + + Class + LineGraphic + FontInfo + + Font + Helvetica + Size + 13 + + ID + 339 + Points + + {218.87765945892409, 337.38344765471368} + {218.87765945892409, 597.9920781444772} + + Style + + stroke + + CornerRadius + 7 + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + FontInfo + + Font + Helvetica + Size + 13 + + ID + 338 + Points + + {385.4404080255236, 341.87068453618679} + {386.44040820324426, 602.47931984895695} + + Style + + stroke + + CornerRadius + 7 + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Bounds + {{405.83611574648387, 173.89581255030038}, {137.69452485177248, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 334 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 
Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=4} + + + + Bounds + {{405.83611574648376, 203.64581130853693}, {137.69452485177248, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 333 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=4 (val=2)} + + + + Bounds + {{240.80135243568887, 204.99304364809566}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 330 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=2 (val=4)} + + + + Bounds + {{72.870908462235647, 143.65973910490951}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 
0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 328 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +DELETE k=1 } + + + + Bounds + {{72.870905126583054, 173.89582208704346}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 327 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=1 (val=4)} + + + + Bounds + {{72.870905126582997, 203.64582084528001}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 326 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + 
{\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=1 (val=3)} + + + + Bounds + {{609.08223652234835, 84.815238166516338}, {94, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 325 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Future Rowset 4} + VerticalPad + 0 + + Wrap + NO + + + Class + LineGraphic + FontInfo + + Font + Helvetica + Size + 13 + + ID + 311 + Points + + {385.44040326156733, 72.672246726576759} + {385.44040326156733, 248.72918426990734} + + Style + + stroke + + CornerRadius + 7 + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Class + LineGraphic + FontInfo + + Font + Helvetica + Size + 13 + + ID + 309 + Points + + {219.53056795056918, 72.672246726576759} + {219.53056795056918, 248.72917230946223} + + Style + + stroke + + CornerRadius + 7 + HeadArrow + 0 + Legacy + + Pattern + 1 + TailArrow + 0 + + + + + Bounds + {{593.11383543180818, 143.9027586835428}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 303 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + 
{\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +INSERT k=6 (val=1)} + + + + Bounds + {{593.11383543180818, 173.65276865076407}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 302 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +UPDATE k=5 (val=2)} + + + + Bounds + {{593.11384902277086, 203.40276691229587}, {126.38888549804688, 22.222221374511719}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 12 + + ID + 301 + Shape + Rectangle + Style + + fill + + Color + + b + 0.973291 + g + 1 + r + 0.958452 + + FillType + 2 + GradientAngle + 90 + GradientCenter + {0.53333333319999998, -0.66666666649999995} + GradientColor + + b + 0.51482 + g + 0.874515 + r + 0.879605 + + + + Text + + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\fs24 \cf0 \expnd0\expndtw0\kerning0 +INSERT k=5 (val=1)} + + + + Bounds + {{417.03063678841681, 
120.44445357963095}, {110, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 300 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 DeltaMemRowset 3} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{84.06534605987332, 120.44445357963095}, {110, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 299 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 DeltaMemRowset 1} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{250.98548629648991, 120.44446883841992}, {110, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 298 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 DeltaMemRowset 2} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{620.80825447694724, 120.44445288755473}, {71, 14}} + Class + ShapedGraphic + 
FitText + YES + Flow + Resize + ID + 297 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 MemRowset} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{448.18338612174728, 85.141681916184325}, {53, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 293 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Align + 0 + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural + +\f0\b\fs24 \cf0 Rowset 3} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{279.48548208696121, 85.175689223740122}, {53, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 292 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 2} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{109.56535309542909, 85.14167595903109}, {53, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + Helvetica + Size + 12 + + ID + 291 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + 
stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Rowset 1} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{397.53058821283821, 110.61112341527746}, {153.37847900390625, 124.11109924316406}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 290 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.831922 + g + 1 + r + 0.855654 + + FillType + 3 + GradientCenter + {0.24761904755714284, -0.30476190468571424} + GradientColor + + b + 0.445857 + g + 0.846871 + r + 0.535721 + + + + Text + + VerticalPad + 0 + + TextPlacement + 0 + + + Bounds + {{231.04797132247799, 110.91669067321159}, {143, 124.11109924316406}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 289 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.831922 + g + 1 + r + 0.855654 + + FillType + 3 + GradientCenter + {0.24761904755714284, -0.30476190468571424} + GradientColor + + b + 0.445857 + g + 0.846871 + r + 0.535721 + + + + Text + + VerticalPad + 0 + + TextPlacement + 0 + + + Bounds + {{64.565348479379963, 110.61112007962494}, {143, 124.41667175292969}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 288 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.831922 + g + 1 + r + 0.855654 + + FillType + 3 + GradientCenter + {0.24761904755714284, -0.30476190468571424} + GradientColor + + b + 0.445857 + g + 0.846871 + r + 0.535721 + + + + Text + + VerticalPad + 0 + + TextPlacement + 0 + + + Bounds + {{207.49579036853535, 51.559797964384622}, {193, 14}} + Class 
+ ShapedGraphic + FitText + YES + Flow + Resize + ID + 269 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 Deltas to previously-flushed rows} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{625.32891214468816, 51.559798443429237}, {57, 14}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + ID + 267 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 0 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs24 \cf0 New rows} + VerticalPad + 0 + + Wrap + NO + + + Bounds + {{154.46113077126833, 9.2500136507899704}, {473, 27}} + Class + ShapedGraphic + FitText + YES + Flow + Resize + FontInfo + + Font + HelveticaNeue-Bold + Size + 18 + + ID + 185 + Shape + Rectangle + Style + + fill + + Draws + NO + + shadow + + Draws + NO + + stroke + + Draws + NO + + + Text + + Pad + 2 + Text + {\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +\cocoascreenfonts1{\fonttbl\f0\fnil\fcharset0 HelveticaNeue;} +{\colortbl;\red255\green255\blue255;} +\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardirnatural\qc + +\f0\b\fs36 \cf0 Kudu tablet internal storage design: flushes and deltas} + VerticalPad + 2 + + Wrap + NO + + + Bounds + {{584.80825969235366, 111.61111765018423}, {143, 123.11110687255859}} + Class + ShapedGraphic + FontInfo + + Font + Helvetica-Oblique + Size + 12 + + ID + 19 
+ Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Rectangle + Style + + fill + + Color + + b + 0.831922 + g + 1 + r + 0.855654 + + FillType + 3 + GradientCenter + {0.24761904755714284, -0.30476190468571424} + GradientColor + + b + 0.445857 + g + 0.846871 + r + 0.535721 + + + + Text + + VerticalPad + 0 + + TextPlacement + 0 + + + Bounds + {{50.725008607912173, 275.9297623696242}, {693.74998481406067, 326.19583129882812}} + Class + ShapedGraphic + FontInfo + + Color + + w + 0 + + Font + Helvetica + NSKern + 0.0 + Size + 10 + + ID + 305 + Magnets + + {0, 1} + {0, -1} + {1, 0} + {-1, 0} + + Shape + Cylinder + Style + + fill + + Color + + b + 0.883527 + g + 0.906504 + r + 0.944775 + + FillType + 3 + GradientCenter + {0.26666666659999999, -0.66666666649999995} + GradientColor + + b + 0.612047 + g + 0.612023 + r + 0.612064 + + + + Text + + VerticalPad + 0 + + + + Bounds + {{562.16943883112856, 45.244502403340448}, {182.30555459084394, 203.48471069335938}} + Class + ShapedGraphic + ID + 412 + Shape + Rectangle + Style + + fill + + Color + + b + 0.737928 + g + 1 + r + 0.873222 + + + shadow + + Draws + NO + + + + + Bounds + {{51.600006723741672, 45.244458564345052}, {692.14998481406076, 203.48471069335938}} + Class + ShapedGraphic + ID + 410 + Shape + Rectangle + Style + + fill + + Color + + b + 0.787239 + g + 0.944676 + r + 0.803297 + + + + + + GridInfo + + GuidesLocked + NO + GuidesVisible + YES + HPages + 1 + ImageCounter + 1 + KeepToScale + + Layers + + + Lock + NO + Name + Layer 1 + Print + YES + View + YES + + + LayoutInfo + + Animate + NO + circoMinDist + 18 + circoSeparation + 0.0 + layoutEngine + dot + neatoSeparation + 0.0 + twopiSeparation + 0.0 + + LinksVisible + NO + MagnetsVisible + NO + MasterSheets + + ModificationDate + 2015-09-25 23:38:18 +0000 + Modifier + Michael Percy + NotesVisible + NO + Orientation + 2 + OriginVisible + NO + PageBreaks + YES + PrintInfo + + NSBottomMargin + + float + 41 + + NSHorizonalPagination + + coded + 
BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG + + NSLeftMargin + + float + 18 + + NSPaperName + + string + 11x11 + + NSPaperSize + + size + {792, 792} + + NSPrintReverseOrientation + + int + 0 + + NSRightMargin + + float + 18 + + NSTopMargin + + float + 18 + + + PrintOnePage + + ReadOnly + NO + RowAlign + 1 + RowSpacing + 36 + SheetTitle + Canvas 1 + SmartAlignmentGuidesActive + YES + SmartDistanceGuidesActive + YES + UniqueID + 1 + UseEntirePage + + VPages + 1 + WindowInfo + + CurrentSheet + 0 + ExpandedCanvases + + + name + Canvas 1 + + + Frame + {{47, 80}, {1384, 947}} + ListView + + OutlineWidth + 142 + RightSidebar + + ShowRuler + + Sidebar + + SidebarWidth + 120 + VisibleRegion + {{-247, -36}, {1249, 805}} + Zoom + 1 + ZoomValues + + + Canvas 1 + 1 + 4 + + + + + diff --git a/docs/media-src/kudu-tablet-flush-6b.pdf b/docs/media-src/kudu-tablet-flush-6b.pdf new file mode 100644 index 0000000000000000000000000000000000000000..108fba9bcb6a4599fa62f1aacd1accadc24b1ab9 GIT binary patch literal 274809 zcmdS9bzGI*)-Fs-3DPMnx>>9Riw^1T7NkLt?vQQ-q#LA>PAOS*cZVR|At@lCo~4hE zzGt8P{ob?p+2{LypMSXSm@(#cjWOoUyat_$gcKWyofDm|XZw8n@a;v;L{A?&7k~p` zZ}I|NP!IrAFt>Z|VhMo29;pFGjo8jFuJpglew`ix(9q~ z)^a>~V_e9|A=81AN2k`j->lSLSNz_yCF?4-(NI(Is}KcMdbaplnIUwM3PbaDL;zm# z!ND5}C(Gdm-a!O(WOB1N_wYOYx7e34mvMeYB~|@yrK^Hv-*wt6MQ-kG)kS>tc-FHU zl@FD_;dL6W&RrN!INyIS^CEqDSK^iC>-CZRK`^j8=v()NX3NQevhVk!(dp-Ye%I4H zz(XG3jAz-HU-?@L)NZS#8j9CpJ061Et~?E4Zp5Jn`(?KYhOI9SfWD}kZ_+`6A1~#X zfng1gJyPM7`WrG1zc^T9^Oznr@9%paV=X%#AJ~y%T?hh8c8@==mr%5HJCeWeq*M~R zzFFsuD#|`+c!nh|6879vK82s+{sWW965Hnb!sCA1{(O-UcPrc5T#Do!4sLHv9_+A9 zaDB<<{#2dMH06%(i)sJe$nmRk^JDi9o^CI&mR(b%yaY+gx-u3F0b50%;6Dn!rD}Y6IPZq%Zxm@cpcq6d z`*`xl7*E8_U)QRDrTzOc-B`{fJX zK&hga#^Mj>E(UKQ9W{D8O_~|T{H)>vi#A7WMt+%Civ16GHeDGUk&{qa>YhsX$TdWe?e z#502~H!Kf3QkRVyzR#>?%Xv@28X8NEy>a%Ryp;5U@=?4Ng=A1PR{G$4S|@3`=^pvA zz`EjWAfdkX2}jb`%n!IH%__0_*tWC<`s(l|)k5njedz+r@(MNj6g(wyvTF2{%NArL 
zT!}Ey=k=9|tIeAW+#S)T(u~FKkI~Y4ox)W{2gDNY!??!%tc^Z%WM5a%(Pj7es~vkP z73%gMes-x%3F*5Go6No7ps*cj1sa`ISA%%gJnXRL!;TdbChriAg#;SA+9kDU6Z2t>8?kPRkh|4~C3lr9!;AKI$@7aFZTOD<+p_U> zI-Jp%jC93U{FFC-a3G^ZMDhK4Va)d?Dn9OT?DWQ{vR&+a3ZJH^&)ha5dEKGs zb@wPZPFqX8w=$*y(;%vlN=Lo&^UK892NU%&iE0q6p0Ht%RIToxv@9zVwLP#Qr7uvX zB;5lfS6?d=J;;nN_@ZF{9GpsXBK%3wklhJAin@lZue-LIM#m$?HF^e=5gwv6uKA1y zAa2z#>OkBn{U8pXp**N^R0JGSd>Mdqhu0OeGk#mFZv^xr0yR?~Huh!opl6(Dt181KX+IgbB3{Pr1qdJ2Ztk_HyNzeM?n+ zjshc%WWw21Hb&#`H5Q#5O44oe7ShUQ!McgsM6p9Kh^hBrNEI@9ZXnNa3Oc zp$mRy%~KN!8r}t$#@VJ-#_#CQI}*9W(VCOlZKzU zEU`Xi!_xvn1qR4E=HoVcnkvjkA_n>Oqpi{{xQiMgB*T3c%loRrn^B2{9&`O-Bc zUP|S)>mn{F#6c{5)_Mlk4Rj>iLf_wceWfUUDjd1eEq_+v!8zSA7F+H_G97DXgX8j5~O3L`kEYP6+<<*XA1vgLCS^w}v9EKHrXPzm6W-k!@2wMA zDX4H{Cf;5#4dZJag1-Aki8usnF+b$-yLvh^uipZP(Rco84g`dX_Dogm;$MxOi$6}sPg{qjHj#V$=A2_9`{wB2X-cJcRX(~C@&giAN8v*f8jl{71?dv zPQv}nkXiM}yW^&2dC4S_)=c{CUa$O!Y`oRmjdP5Km4;ZMaW(5vsfDAJIxV`BmGz0u zl#NxD+UqD+)5f>WA*Z3 z%&vLl=$tZgNN^6sT;(vkLZb2y&khbK@HBPcT`p6KNZjA0mn<`XThFrE*-_0RFtB2! 
zVi^`(8?y0T!G584vfB`aVFi&qH1E{qm0Y0#?em!#OW6JW7kVoYewiaRJJ`GA!# zz)7>oD={|LO#^sMMaqt#Iuefm(J|9;NQrQ_(Jsp11DN1Y6S+)Bhe{7fwwxyF+MvMA zm|U`m`AQKo#^x9?ExK>$VA67^A<|)$!2s6A9MLTG9j4r{1D#=DqLU_==jpJ30my zpL_`vc)QZJ_EBpe%@0Y)(hy*N{=_=A;V`IN)U__KVWtRL5<9!+D4R5kT0l;OJ^idj zq2QyLG_p9#J3Tv6$<7%I<74}7+p(pm;eHI)GRJ76x!)$z)=rpCrY=ZpCDp_Wa@I*4q#yYjv79-5 zuiN}&{w^(e5Gyp0lw78s&K2WAZfvj$*XL_iNpjVjCa30Sj;psjcghc5mipp%et(f45wC6@Y$@ak9BD*jXo3utu3KZ_TG#385a({X9$)c7i7@#x-x5+LZ-6Gk8mm8%r}<2_sQu zDEp`pETVEwB?ib7D9xX+40AwfbY+<3pe2#TERHajsKXm&COW>Pn|+v-E}j;o;M<)s zt)x+V2%o<-^gU#L0Gd!Vt7MaNRYBin4#Xh2@}bzi*(p-RcQYZ}`Nm#yymjPiHY|Sp zd9&DVRSzZaD;p}O;M@1M0c8|ROm|CgeIsYB@Q8|8 zLhBK|OPk?yH?6jeJ~w3DP9D_6A#WB#f>y*iJVyRXjP%U#rnR^n#8;q^1mlY1XBr57i@uC}Bc!48)nXQh>XTjKA8s*;Z7xs` z|Due8Y0Vuum2nZSZ_xmmxSWwDSBJKcNUOnM^6~w{cM}m6?s8wY+HGsHk7B!VIrtI_ z_-N;4dxFJk+H5r#`*z*&2yqrSsU<=A;bt^LM9%}quakd#tL%KcB9L}1+^{TROB1If z^W2lIQYxa~Z7ac1Po&aQs?4(-T*Xpa5ne;dos(%@NGzAWH}41w#hG zyHs=UiY(b`k}y+i+aDT531z-W6@aTNq9;0+JV|(eip*Xd9Mo-&Yy@kpNTY|AOpn1i z+l1%7Nk%B$FIBP@-drNy#C@p&X(daj(uRqLQBG znJ{`&&1cfD#*nl})3vD8RJTqIF5Q(q`ZwY#gbSnt>3Jg}WK;{k=8On*#(6zpj>zdW zSs_`WXIWPaF2JaK3d@RgQ0=eJ9@q!cYv3J&?2}TnBWjyhzioMcDR>WhVZL~Jr(C5G z`z@7eAXOAEP%MylrWG$nYxyM{&`Qrg#x%oh0#g%+L_#LnyOWnQ#53LSqLx()D3h+Y z8Wi<*kp}<5=_r(21K2Z`csdR_)@hUfvVV}RT-TCw&)cykQ}EENN{@PJW}}q)M(k_x zt4CdM#l8ZcFTxhq6hDlA9SR>6JE>U1uiZaYg%2ItTY=j)` zmV`KDI&s1Y=LLYJV?sn`U+(D2cTr5e&K8nm{Y2Dv(e9;I`jm>vGQ)u}knItcB<+3L&Dgo&9QX4gQtVei zJKqC_Zxm_CXezmN033Kq)~^alXP%ns&87RqlJj`%>>AO-a+-(%*yBm1^n(b(S6t31tdYBSx9>$K{SXC~m>gic-2xnGoMa2%=!! 
z`omX_(J)zKj4iJLDK%Bd5$30u*@x4M4r-1adg_Y?!Q5&4l=eQ7MofSb^EL7CY+DS9 z6@h(X^{IKQ{9F34$4d2aU9_1wWeAfvB~(XwG?7O_MBCu5z@3873B~8)puO7tPcJZS z3E~WLz+D#IFWIHbDPLdfNE+qv=5f9n3AybW@IvTb?~%hb58QlM7oH|N`%2PYPbhf% zvL~%eRU$Y2i6{RUCkki%E2%7F!P4$;Qv*X*xU3;qN(n|3VjSI9y0rYWqsV>vBZh+N zXd-COnIW?L&l8E2W0`zO%sD7XS+{Uh6<4^2gL9@St^qDWo2oIHn%fdt-*Vg;f-K^W zVg^0B_>5YiloR6Qak#HgwIjcor){Lu2Q5eSo6jKaV>()9n04mCe8n-67T-F6wyC%Q zCpgR*5+Puy?4jgPqwH~CQ^y=S!Q~ZF_b%j>V_aEkhg+Arn9PJ?z!5F%uC`Igd#1`A z;p47G)B^s;ezBz)+>>T4Y&lm2N|%KsWM+L!A|k6*+D~1wiKmGwojSV8aQG5v_^Nw~ zz$JQ333NPazV)LGCNy88_{~#}Pv?crs>)*tO$W3smp#RL3}3SPzVmandJnA$NOfG4YI%D8+mmR|e}?V}PRB zDPW5>tz1KmZ{?O9Ah|u&}nMw#KnJ0ctpemg8 z9=m}&84qith4ml@u(@`-5QFa`P4_NIlI`86vljq5h=fvAzDs;v&8B|o!B~u;(F6}QUg!-3=T1`q$r!P? z_G}-LsNjySqxe5mmm-VCgMJp$uVl_1a!NTwQhhA}#9E$|WAZF~Oo_G3V@Wn9)I&m3 zp2K-3i|dMuJNE3rsLiTiV$4j{`Q)=3B-U-2J-TWVM|fp=Zy8N-S_pCMWYZ08aD7># z9aU@NN;2JXW;k2gLO$wUY(pSkl{fmU)o&JKkbP5dG zEQ!o%N93W!8giOf`(EDf(qdf}H!0#PuiNo;NxBs|dzv1!3fvmb@c7+_bH44^7}mt<76O*~XtFW8 zw;CxZ^dU7zIl&{uoE^QXSFFvczV>3xalv&wZvJYn1y>qS;F#)p0wpE9@b0-mA{O|( z>(s+n#e^b0e;xeQj1i<8fJ^l@^0K_yHaUs0X6`zi_v9&?bi@mv;(Dh}U7(%9R z7SL6%NNd9=qcc1pN~^1>1F+W{aOv26*I(u;+aQ- z1pEH!8Rs`LmPcKm0^UD)oCfrS4mi`M)FbmFX`eZS=D>s0 z=We$xc%J&0BnnT-s_gEM+>oJ>gtpZSG4R%g+RZ6$Lpc|LCJJ4W4vRi$0D33?o_Mvf zl_WScNTI40LJz55v5>R>p3n@Tz>{x}Ri8fymkVm4A!;%AZz_gE%Ef;s>Q)+}S^>KmB}EXxDi!bB%}h;L^8f zm|fpVm9gLrj`V-cw30eJU~# z>+r`|`rXR1&*XM$O?EZR*%#M4--^FGmVl!jxyW3}2l=YEOwcf_NGOIa@DRqJ9~ z{4-w1l>M94fHV{_?U6`64?ui}OFNV7 zf^#%9QOuo<>lE_!= z`;hJtO4_J6UXYzYqupRqq(tCGH_7Cu^g2@sJ7m~C2oppFEjX7YPf?4d*$( z+dOW1nVjDIy-4-5e%p}ks{rajZM{JWssx)gHMB2rABBZ*{k$oTEuJ;%vhiP;J@Zd3 z#)+hkl6u-S8CE{$KyIP+wH%>s>s=u_wtC{ARheDXu)ZVqs4Pb(fzMEk28rKZz=$68 zQB#-tJkQ=cpHh2#o|F$G;8B}IQH)76eIvrw#)TQlxD5L{0p#q{ri8PwG4&nF^7lkr zk?D=b{?L%>_qh%tOBS!!4FWdpbQcQ|q>V+OPv3^A1>}Ex*DuBE4@nzP47lRQ4G6fWs`k$GV$C8^GPZhzN@8ve>PsC$5sQI%)*O?ne);0PYOl<_U8e_1yb7k} zXsi66DWlLo-awc&mbzxcgK^+Y!wVyPK}gHTJ75kYu;i4UU1jWOZ38_!n&%ZGrXQy9uGxYcA}RajNIb?M;p9cuG-zk3Q6&-DvQD4$ 
z24cfl9uQn$Y_gl3jf51K$E0xNnlv{(%48C9vh|q3KSY-pAIc^!HFG_<)5%H%jplY$ z%rfbWN6b#%L^!C2RN`Rnim8ioNn3RJ#EKS?1cmKgp?_V0XRpIuKWhkKr7Q`<^NR87 zp<8yO!C-TqEqn}>CbIhQl~1B^ochH{!RDRWPHxi~J@zv`S)Cq?JO~l{@a|FW))$C} z!ich6hItKuf266gZNz(O7SqNyzUHG?-6oot#Om~Cor*`qPgzKYqrz5!qTJfTZ*~`5 zWmRI04eC!^Ww8joI&l>lN$jm_bIkM)Do(1*pr8AlKSA^6trZ7#7OmfUV`2)2$JZ2i z@O~$@b1{CgWrK~h^h;Bgdd%Y$x*o$==w6mHpN75@JsfuAbo&y;%fo`XPt4#?4a_*4}TF3-hAc6C`3+og)gX2?st6G25l$~$F zm^cU3)4-v1NQHqdeLq;;I^iNGHWrU;#n#t$L>~EiUZU_ivW|Q7tZ~v~m=+^;VgCe| z7T;SnqKx=)*b6mo%aHKvDs0-?4_b#A;&i3Xxg!7vWzxLqa#V zGyA1V!>)da**_HQAHO`jAif`Oe_VtAi_op%>0k~3su(~2`M;C7oeO~TFVS1g+}YmM z$<*8#!2Q<)aeF%#*n4Nd51|~UiYuC%Ss9DjdjRw}U=MhBxB%Q>o@eMVVf@!TKl=SG zsH-^Ho2r|;0Q6v!Nk{>J8s;7@0HCZbY+|urzv91sWdY9sKyiB;dna`VV^ed$4?SMo z83f?@F)cdm5)1%w{8)n;OoRsjB|TiE)m>o1{LhBcKi=~GYzhEM!+L>ud4D&Rg00|Z zD-8|U3yz-)MF(nV{21`pqBJx(0U*#X2sAXf03hza-u#$KN7>|sx#?d+$l8K<0bGBX z0b2(c0)YI)2_1IB!wc*7%LZ6GJ^C?nCZXoU;y#`UKAH6fbX}p zFc3g61pi(yA3E=ERJeFxGyU$x$pPU0V<0z-&|mDx+VTOwzdOP32K@!|PX=HshWwf9 z2dBTs;D*3j{vIAi0BmKy%m9G^e{6>}``PE`cmVM4MnAme?+&o}|F8)I9K`*5QW&a# zz=N>~TL)~GAA><)rv5RU7d9TxuR;?T@w@=ogJ0%AU?%*Rc|icK-+*y*p@U#;e>DX2 z0JwkK1oK1CAGd-T1mgUI5{#WcgK)v_IDUl{%r#*I|AB%N1mONXJ{LED@AnN1l|M6c z!=U}+ov?{uPV;M-FxEhS0tChe2=v&$#HwcE>pLw|ee7`M%Q2_dbAuL)zf5r2U zXoLX)`eO#nL;hm<2bW+@z#l(@t?v)iutk7if%j`>FdyJoBKa{M4C%kz;b*LK17HaQ z*6zo6oIC(neE#YL3z6UUL0}#YbI-ruz%1hUYgw?w^e5&1#QX1@`>Tiv7Ir^-{gvwf z&S5}xR}+_?$^T~(1^~6K%wQEyAec`8Wz4OfTe<*X2?Zz%%lS?Y_CE@r&|REdVY|eC z$;=?$Ul~AM(b(A<0Qx)B#jISMRm`2l?QI?G?S5=2y#KU@>c>{`yN2r1L;xJ@Uw6)p zI^|Y~igAZOJR%AvF^AEA^5QKbqEZ4J{QtU5{>119ZXgiH-`m$u{{I0>Sm^!>Si&6d zZ!BTq^Dkog%T511Ea4x*u{5*CqX|*19L+D zCi1Vn_#c=A{hi(ZH^YMSU!d~utnznQ{K`N74wbE0`;E^eaeBXdNIp`PK*H(OMPg8K zL`vID3l3(YihZ)fv%M{JZ|SP-?yBjknRV=JuTwU!cXDU{=~;_gw)1h2z5aJuVG*AT z{>-oy=(oDYwWE9Tu_b|akC(eC4l5sS2r}K>eZL-$*{?wxFP6M1glq0vyeasaF?J^t zTH5_YvP3%g>hJp1Tl@l@y-vn&1g|<*HgE2?yM=iDzFMBx-w2+aQDnCJLBGc+Pu(zN za^8LCulsJ#`~3QQ!gxaVu}IRm-}vJW&)bikoY{pJteNB3Z|?5bJb8Dq4F-05F&+R7 
zn$D&|$bf!vV=RV67Y8Xk1}&*$hYsw^9-C9qJnQ65eVssjI9W`JT9({Z&KpXVXVw6XRc&mX2sW5n`4mGmDN|!C72Ut z9u5wH*4AP@(FNw#(z4fF_wEbQ2n2MZS z9<$zzbaNVR-sMKi>7Q4?p)x14X!%i&MMdtUdb<*R#FTm#{^M!YC?5O4q9eq(+C-Z~ zpIz~cKn9P&Iaj7uK^WDHt#<(q7f}IAb33N;HhN&ZQn*!m?J;7X5t)s=kl~?IMSY8@ zVhiGnKmyi__yJ-F-kel`gFK0;8H57+v^v_~M`I3p zHV`Tqa9W{(eg3&EkBVG3TDpO45AEiKpq}}H(m?53K=<{=Q*QA-3;}UXGC~ovZR6S= zW>5%bCp@c5*Cc=AII-(e9=|)Ssi0j&AI(zo2LWUy;+`l`N8K8Z$TE4-_iN_R#5RfM zI8W78y;$8DK^+!w-^UjUrjZS7y5`~W)!3)mb8xGC#*L4$HI@0gQ^dO8=s&AJd=*J_ z4iAKM74*od0_&x!rNDi!D1s{boK1%@PwSMvh^^uc4Vw2w`8u7w#CyYqV4yS5y&BW+ z;i$dEyg7qjE^Ion@N|e+<=~v6dxW^GKbd6h8AEw%rDrQa;%Vx7b5ksl8D@>Ml_MQz z=B{R?s%7w*MpZ1+3+%O#`065UQ&K$*+^3f)$ zlYf4Hx|WcxuVcB^8gauKee-4d%5&QY7>8z0n-J`W&+2Rw{gE^S-gf{krC}3b!D`_l zKse>vPJnS!r~K(4I4yTADchgfV($>6Drt$+rnSe$ERB@xg$M!}W*?!g-eq#TkQDJ} zCHys81{+W}G?F};Btg@dAcd7n!Qe&|VjD_!UK;mCk9xTjU%PyOKeuqVXbn()Rs&nT zBLi2X=_*=&2*wg7Tcv}Oac-xnw|(gS*;{|29x=iF$@EfSGEYyN0?kMwvGkleqC?N? zZt%|)woVu0UBygrF!V6E&Ps~#=f9I!qMjpep>x@rko&aubBiprLJPe{p646jYHg<|lkd>uD zw%*Bu8`(GWp>#HkKCZxA3Fz%|bEyJag-RJ8QtV{&cpw39?gzXs`SR^g@pkKrrA>MB zfw-(SG}0D~T~fYE;_XvOVe|M^_pmgO>}A2C0i)><=063JqAp-{92-ayk?yatHt4ea15J4l_oa=c!E zx^Yrt^ya5r8cYbcU-VUC&&cxGwaQ%5d+=Ym>zOrl285dsi%Ar-O?9avZ$8yioX zK>Iy~avnFOSxM{juZAl(6dn027AoT#`{e}*d!>kX&9T4%FtF9|*8G^v4TnC(q~3!Obe((^vUlYz*M_z-A~=WKOu24f+x4OllIs+YqBg$S6-96zEJZt_^jGdrWGEN zl9>7y{N@xsttfdLCS40)VVH(Gbgie>lr)*;PBJ_MBpIb`)Q&j3@Q85O+l!PUZu_DP zWps+A8>2ofA%R&uEAAfx?IU83BBAc~%i18_=}myPde*E%ddnoe%p(*`iddMF9PN?J z=pp!zztQPLf@mx`3BmVkTXGu*Bh`^-WU0oB(9cDJBN&vA=zMUhL?g~bwPI&uu?M_f z(6_uSt3_ue-I@a$?;Oxe2_HEw_SaJs=|Rz zHb+(Shq}HR9R(J*v??vv+t!&WB%FlywAzsvC(ZAxR%Qpy+qzV$=yyZ4R}#-ui+rYx!>da?LrBvxFeT>oV`{{+n9wt!0 z#%4@jH;)En7gYr^6SE|QP&FpXrtiTS2GV84MzZL{YQGwmBmFeBszCYylg40sTKVd9 zLwR5tG89f3-{vA=1AQu2U^Mve}R~Rlb^04iru&r>jihiV~H*KXP zXL;3Hb9Wi6Yy&!#K&B;e#8!hJp=N?p`^54IThOLCDpimldU2n@+px-OU>!-3_ig!3kPI`k<_?3`BBWrLlxa3%vm_ksO7onq&^r>ZJr^5W33ZS0`^K9&Id!lAJ5G zVFF`_^xCAps#J~y&@EQ#9KPlrSOoH}k&TvDl)?;DbO{II(dx7|1R3Y{-Gy 
z=pS~n_(8n}FjGQ_gnO({Z;QXLN8#@f0B7yr`5K%Zu&Tmn?dQw z{d_z&^puKy3`2%Mr>JS9Yl5e{mZ_!Nnvc8zJT#xO)=SZV#u`_&Kn=EwlCuN{=WI+d z?DgJIfBlH~m=@XZ#254d&~@Guf}H3$zTME`JC({K&SIlsaMS(GBGn#xe zsyX{&-!~p{bn!j@8hkg1OdTv@b0jnXb+N+|T_+mvLC!N;n9=u-J^A{^GjcPwZcfp= z*HMA9(gd&~4wYm}g&JIZ>DXQ|&f?isX7HKMNpBf@)x8Hw``O7VkL{c1*8H63Bx**|0&49}G!_J#yqfb2p0zXb(?FA3L|MWYiVrwM!rGg2 zq=JLa26I_NArI@l_iuW$k@h}sOTIQY&@~bC@E9mAMKJJ~s@h+_DgRndu}Jehqw?*( znHeh9_mz<9T9WrQ6AGjmX}QI@yujO%xqDUX{NyTQ-&ZL_I99=mGOQAW9{EK(!_k|Y z2jV(n`b8u|1)Y?oo%PnyBs-)p&c>|YXz%lA_{l(_ID}_TC}~={(Uj6R$<=vXNF zBAemM&&CZQ9=)tedK`x?x&Acl9|{O=9@_PjS>9}yg@eQTZMPx&yj;DsjKX9JJIlAc3&&mA4i_dL|DZ!E0??26BjJagEeRwyqAXn(Gi7IH~Vo50Z69e&-U@YfGd= z_ql-*vQEY=Ab|OX6DOpKo7W^>Mj?CzR`>OqXXJi+N~BO(vc4VKJ?b$lQ$_PYcO$B& zhptE^Ev$T}?@c-r4_w#-gb2^$k)g#v5_Y6Mmw2YL-x7eekZ9X!*Cz7>> zN?l#y{RVlhlI4?%0D%^4vF@N$XggE(Kw`X+P)griW>XVMEv-E^kilj_xyo}`dp9gP zWplY#uV&D&l;pyn;X?;GBPa3H0Lp1)7 zj1&4^FnfiC!N(KFF`Dk`gK*0R08GcFI2uQ`F7L!S{=p{@LRFc*b#sVm;&Ee)Xh=(B zoF96FH_vuhbX8c& zV?G-L+gBu=`IKhKL*(|XSC(p=9kZYZUlao24qs!+U&LEGxnzuZh<57n8+?Ok3qze2 z7O;T6+40&IwLQ{iTRAv$Z?W`DE<+2=CDqLd>pyqqE$RkLcvX0=>AT5rd{?Y)Y8o*c zh!^y;s}Oq5JC2e0Ncp>e@wvG3cV^!`=^iWj#1`U|)Ua||DGXhlbj1o-U_Ute(wW^3 z73#)?_RVkgj+Xg4s~P53bBKW3@qv@$7zP`Mj4%Qn6-+8rGsU?Tq6Z&#pt*sB{WU$( zEShx0*}|fyBy7`&+F^wTj809-mW5beL{ZIoLIVg%kX*hQg=X0@2{%&8M?KDWoZ6U< znw$z5=Ub;6hE~NFWj=0GFTeUlw=u)g9d4Cw1juYV9+`HhRN*z}6K=7JfCR0GBl5tP zt8&%U20jmaDMMg6i_CzU{G4NT`=PUM$#B)&xE@O#q>Eo(qa^5um#~8gMS>?#ic1t9 z#-6EnMZ{nmnH$VvdML z&Ie3w?5Zb*UyP$?eT#3!Q?$(@Dtk88IvxQvDN-wNWeHL}nP+m^U{@kNp(bLi6 z%wgc(Ap^P~;KfhPY+!uAL1$qm*(~t@<__s~B;`*LYoo3tum z-FznW9v&`6+aOu`bK#Y_Yqm+bkyCI;vJNdFa!f)?{=kqyQOKNn1~Sp3dyCLsk7z95 z^?p6aNY(DFW_W&b6C>>7FP-Ifz}x;^v1}O93l!c*%kjskhOnc*-mrO{1=MuLy8Y$g z6rQdk)=4Z1`2i0Tje!CoQWzbQN5rBW%IC`d=I33cLzNG2iZZ*`*|rO{I6CE{Im{z4 z)JL3p9nCy5D{_)MbYF?PRelDy*_-L4r*J}_F6?=%BdS_B^X9=e|KdB?*F5zr*9Sk(T_g0?J><=|9fcAw0ZXf1kYn6lDM5 z;2owa{d*D$OriT1C6r%c&A)T*{{M;`^)JgF|BqsrJ|<=y@;}o-1YC(-Z|%%8ABe(Q-pHTi!)9fsY%0Q&!dHVfwXr5*o! 
zp#OiV&HlvbCzgM-*`Jj%|A6J+H7);}g5JNcbA$h?90CCU|A*yIjQ#=3pOr@cfaUKB zsGk)A|6R?%f6=*r)Jgm|+2DUJ8UX()WB5M>b9D{r|5nw&e=be`hszoMgNAt^B9cD) zXB)^*Px#>-oSeT42!9s*{Da593RnJxOa`lf`Fs2LSF23Ge^o5~yV3W%GQt*%C;Z9( zoidZZcaQr|{rgktud2R(02TCCVGyib@_#E1VO1=DqYA4<`)|VV zf8LbAf0y?CJ2~yY6e-~2NZ{ij5fDg{Ka$k2Kz#&2qYZe3N<|aAwLOY>K6+4Dc~E*# zD(o}XYFuHUd-F~2<$UF`&+*ICis%QcD_{1P$V3F*jEVR2_W!0NHd)2r4Mu7J%kDs5{#o)A_@|zF0-Dwpf zU3@K)27Rt8;`q_-D%IO`zsM)+1>1HJCT&J zv3B+&!R+yNk)<2#%t(Rnqq`yF?gk^fh4L-H?q=xS(Dt(Sy8Z{BPeaFt4wc7kj;_}W z^)4&WhU=Zn*2c<{)`z>1t(%5NtrzC}C%(RnB1pMm`UZb&tTQAlXxt5ZUH_xdy*DsR-r))x zctU9q;OyW5$+iv)o5zZ&q@?h1OdLybXcK-PH?D0+>2ukhpW$NX;~e~aGoy+!6A{Cc zE$9-z)5}Ykbql!~w+f^ap&fcDg~W1!pmTy-j)%%}5nru@S6&CZK+xGmo%4Kn*rgOe z=Lyd`c(@4P!H=X<=izE3q%b7r2w%ci&8-S6SJ2hntbRjhZth5DuGl5!7yv74sI6r# zXJyyY!qwIyd{q__vVov8fRSi!l-q<0d#<8Ipsl4PtskLHQ;7wnsX=EEpjucHHw!{0 zyeCN7e5NO z2iu1n#K^v&qR7pf-ej;^4}FL<@0kENtA*L8I z(cN;Uhx&mbrYxcLqeEU4wy~L~A&~+|*3zBSjsXwTYc!4Gnh2vR+VF}7A@uChhB&|g z8~E~m4kML5RO2iqc*Lab=a`5r&oNPni&+^=k*W=#8b*>Nre_PJE@C1U9ve>aQUyR9 zzWAPVN2A9;t4FpIXu&9ROl7QOsN*OBDjjM89OJAH=0fl}lFgz9Ym%}aAy8RFy}|2C zdt@5QQW;5(ajiBID@&7>+aJjmr%nyvtk&=vhc_jk!!rvaBs9lVIx(?{F1nuOl;UbY zLG%`}XxeZxrFvD?A9?P*oaDQM}%9MXo{iO zJ(})j&2G)NCI(Hww`>YDhux&gu;v_>#GXJmg!Pdqcuf>$j8aU@;8akat@Q-jsGELo zQsd@~1=mI?hLgHw4vERzfNuFY((n9Dmin8YV)&E=mo*A@EVI) zxb)DCR^zLD!0h*o>rtbli-RxkVVMsiNX80|nJ~blqe>5^|NFF@4|ZcT6$Q$JuQOet}*hNqj~3MYcJhGa zEX7pCC1v1BbCA$?b#i1aC^(B_sC*HM1JoVHFnDZ+o%1jruVLROZ;asKx>;Z95oqQG zYhUu}p3{j2;(a2X8%&&aa3wTrbuMX8G01Z9NP?Yp%WX{z6o3LJ)^v2bU5~dGm|fWo zZ6GA3I}7RTXm@E7;lrPUSzgJnIFD}k(y)s4Jr&nc>O*~hJs1W0U@Gy0)txJ)A+(H7 z3q6?XUE>5q2697Y;oTSsf_4U#Z5nTgc#=BgsW)9QJvu$tMiWsU>v)@!n@8a}7z+BB zLp^K?^-Qa1HQ?W5#S&Jg1gbR>aTW)=_ii8Sd)NnBEGUPv1E)vo=+5WLW6kZ6UO3Q} z4@RwD6CV=mO%k+Yo#x{4FP#;-;8-=7x^&?3)9BqzJ=<*q#ovXtw_VK zD|SGAW-+Sl4U8QZy>mxWJWb)`E@IPo>0}86u(<}#L!>GoMIM6>$f^U+2vMN4uwzaY zYqs+Dq(BNix25zZ$ZG_srwC^p+92BVUd)lx<1I1Al18p~H2F>CBx>=>#^R@H7m-7Z zJH5%o)WdiWJHMtTZbw(R)Fc`3vytjeRyg9XJ;9#r;3^<_eU7^= 
z8suSXglCO$Bc7nIlIZ3soYH=;l4hrt@7T&ZSx+Rae8aoM`XYCuPc z3*_uVuY&T*Cz-!gsAReJh%X3kjKRsHMB+}w)ptq0Cia!ndJjH~5`t^Bs=Szl3JSUi#bk7a1cC(!!QI^{EI0%RZb1tuieQBYw?H6R;RJUH z8r%sG3U{|au;4Di9g^?ldHcQnjpz2gxBKmJzcKm;a7GPIo!V=!z3S|_=A5S+V$Q%X zgZ+jP>NS1Zt6Mn{r6k?vLFO#|uoRZP8y-B=9oe(S8w^(=lYupc>aka&r&h~puNt%Ok%r26%i!suvW7ezUl-f6a-YU|<=DP5Mj?2R{>ZIgBE;@8rSxJR1 z9_VbXDmv6QuycrhDOBn#^7vEY@K2GitAjDkdP5r*GlzcmBf~-!Tc1_3LUTWACJFg- zc$+C7{xX|G^;Vn3JO-F%OQNY+M3xcBVe6WY&k_A>UcJzbcwQIY3#a zc%+qPpK93nKfx_UQGA1fL3x}Q+o;#9%;yd3`k<~lQkS01zU$L!MFAN|{AQ^qsh|i# zTO!}%G&tE4(E+fkH2Y;c}}s#FGDD)PZy!j^MoZo!D%ic zvP??-w4ATzfvR93e!r`mpz6`4)8pN3Y$Ro z(z3=mAX@|R)_f{(YGUI0;5|*=Fy-_dRrtf|D`SkNTi!e8uybisE?#>6Jb_R;PrNd* zFS@Rsj-oT-Ir@z%!`DRJn+TnC?xt|kXIPaVh?81|hm=%Qs?qh8RRfxjW;xMxU4|sL zf>tk*oTZ_STiuaPz^ojfnyl85E`gvG-5VV1lT@CoiuP54ia`Q9L9dH#GaM6BK)`%$ zJ12NFFO9Q{y}+W(m|z&dg8z@NTsRWM;JHj2WdC`EevB?*6OS z$1JRk43Z&Za$#_`FRGeSwr8@q|uQdWNqSa@(%tOZgEAMm@P z1rJoE<%l9!D8AnEKER886ivN>t~9TVsxU)sbA5I~&l*rAnn0j6Bu$K&+>~lhXSeRe z_}(hgt&o%bjRN_U#1!{e){$~noB(BE4H+!XoMF(YOPqYe)Q?N7b8^vZ9EuqacYv>5 zc9TL~JezLD&qKsI!KSr9q8T_|;gz-Gr|k{gLu0~7=}DIelMSsJb;No-FYXnI;geXK z$8qVzOH4R>5uU}qfp<4HVREK)p0d4GL8#v#0BOWTc#RC)*qPGt(d}lNct_FXfvG4e z7-DrTd9H1C|A|xy3OQy1+jtgKO{^!LayS5xROQDdIC;Lq0>(IFcq)O7q+&sAgqL3g74)+oMf{)%r1s`y$`Msd1B= zU7B?6%_q@E9DO_mZugEm(=9G#1UiElZ}D7WtsNrUAB*;0$!HKoM3gX3!Or>aO`V$T zb6mz=p%5@d3B$l~2nwjltir0Jsjog1hM6wSWSNSaHXNm(%KA(;`D2g(L&eH13E%y@ z?oGz1EYeH(_j%80PHA!UsBWzr?&CKlk z&%%c6F@Jb-5hQ2xch3jIb}9!dNWYrUuQ8s&ePB~^vR}G&O%KAw&2Mb5fIOjd3_Z&F z9*4~Y_HVpK2G}#oJq4wnE%Y97a8A$ZBiaQ4{au-2}b#6ARu|v6o=(eB?YqmEUA9;OZ`fQlZ^=Q%`=F?-I%*K z5Br6!vC>VN4)=iWDD6Hku-MlGlroL!5j)s|dgsFtTBfnH6y{1+RP||c8cs;m&x6h$ z^x|vex(l)nzS!qH*JHNEN~0|w$c-_k=LWTXYgKt)soyb+N1Do3$X_{5!U{sFVYRtt z7QQra$D3(vT>Mg1OFt&q^+uHow)3n~|2Et!HBq>xyS1BvmsB$Dg?mw0TOX|t-dus- zBYn^2r zgK(HAGnxg;>O+AW7so^A;c>%w+w#@ZLBBB6E;)O_#m&+o<<>fXgb5v*p{cxS#^EN- zx`)=Mn&|NlX_%u_iTO%{wc1<`so^4qevDA+hGPcDgq{5d;5@^bx1Fc z5%iiD7;b+DeOY44et7gCFR#3nl~CzDByOgOYTpR}Yc 
zeJgF1r1JbKa6`aiB?PV)b*{{~MUSuSDsMGSOYX?kd}IN3>6um}%T&vA)wp3U>lz*3 zO!T{CnAZ4t9Jg9EG_;GeSxIIC#mAfKpqJBanb6=fka4v5N-J6}+MfIdJ7TCM`FsIb z+IAu`*6?4)X(P%PJ#v2Q#SklRW z$m%K=w}3S=UYTE+6@X7xT$>AG7(LJ#`QyE)MwlK=^>l111)hI|tcM;v7-r=TWUEIU zsv8;p0MAB!Dx7#)BOGv7efcHnXm4l>yPBIEY7>N^{0*-(0=&t%7oA&U7nyM!H5fS# zvC(y+5s^rqGNvJ{Qh+plP`ZMQOe->5KlxK3hTz}#5E;H%zkF8r@FHSVH6z+{elm14 zLt9xr3jiW_rwx20g6wAQ8rWc2{xvO4CV{`c{xn~5ys+?mMIWg}r(8C2sc9j&`hpGF zoU>l{?IW&C*TBL!jDS0?b~>m_eD5bxk<1)hxb0@}SS}W~i!rHwF6&-8YZ4^3)-8V@_H;AuC`G`4uv=sxuCde>#CLJrkZnXFbbZyxqRN(b9pX77w?XA+WUr=m~87a?(o(p6X8`9Bt z^}qJC4pSw|d&$I1MpOFDs?L8A(o=nYEcqMFnP7_aKj_B%g`)j|*7#otfc^=%{iOr^ z8{P)U68{Sz=6|Oh^G_+?zXG@YAL&j0KT75PiqSv7%m1sj^_TAWZ?P2qF9E{;tyuos zto_f65|T5)UtfSt1PBwDDw#%{E^R(P9#O1#<<^`H$>5Lc0}sin1YK9SF#B_+ilP%`edAce$zR_U9{! zBhK<)Fzbtlg8D1&^7oF`hh0NH5?>LQ3+%RSc`j~;S9;%nNo~*HQdgGy`DBhUfB*j5 z)$C!+^=-xP&8=JG-9>}n3*}L5%Ufr=hf#iubJxt7&E&toHlH|%TnS1vH=Nu&sEVK+ zIryHuIeVTNl5Q-qD0Uw@7Av1FPdzmUSpV!_$Cu-~X1&ey`RcYunmsl4WmIh2<+A5s zF=Eb3WT+N%s5XM#CrKBU%;dwRZmCze;961f*4XezYisXz@BHP6n?!@)7l&pSOPB2m z(W@`+Kc`X;#ZnH%@@repth~2Ro@M@E7HwAdxDi|U@%|TKHnH)U{;zi)o02|EKCNlL zzGpuWact>?^dPoAxV?T;o>Y~h7gS&P22?5XkC&e$trMDv!@$GJZ-k+^GiRIuJqQYp zRYDV0bfPmnaA(AuAJ4)5#2t9hClOx4;NhV^NBfE&E{6XcD4(PWL6?y^n7LlhK;N|J zXe9;qtl?P*JB8KALh?C?zSpyH5D~N6it7-)scuBnfFZdY?RZWo;PgIcFdbKbn?Zt( ze?7zC(=|Hq^W?|2^da>6>%Ih;Dl zIB=6^KMB#sB5^oLS9jpeSY8=cMOR@z>pE3yqyWTfxRFq%aqxE=db8#1u7D0jaaWa}ehABrlkrSF@m z=lOz~QN_j8g@q5T>JDy?i==%VnCKRbjGAx}{UkEHM&j!lpPWsg#ticdpI)9s6tJ(I zFgb@wB57BU4h$=jGZS>%J9D@a!W1Qhh(pj;fIHtErr-n>10zFIQ&qNj zJd?i7@|>s&LgT}Y5wzi7)xrB5uNW_BjHJPoh5>lT;Ob2z6AfLA2xJq9FKQPSSmIwq zGI%0LpX;UasTj-xl2AP**itq}!Ym~kDeEnH#kb$CSjo{Y-Ah9rRPzWE7{_#z z4dL#4b^qt2c@&0#j-xtJh?dvHepPc#Q&i6ezQQ=9HyEKU_%vxD!Ue2LQ}(mRX4>Yn zM+4avYNsd#4;d-4n!@9}SkR*gz4g&Y)EQMod}|x?8^Ox%=G5yhd_k=Y4xUgoQ)$(X zJq}RJ(g+^%u2C{cV&qG2-6k6oyVx?a-4+T%0-krga>8E*?Mu5bF5EN1O2IK^lSNqE z{Nx6v6`TEUqLw3wR5hgNk@o z?i$g0^RfkaGVF;!Ec3rA?k%?DdzfP>1x85;37UQC8!ynhnjIwjA=WPM6ueZt6{O?T 
zWRtwWtli8b!_Z~!IqiuOcGH^db?Dgsp$6Dv@3!zgPcqwIZ;gB%U2eDFd?A`~>^K7e z|KIw9xL}TXr5ROBT!fef;~F{$1{t;uT*0FFWW;0Tpl(fb0!b(?*k-!KIDusMy$_9S zX2f)3CTQGj)~xiM-XsTxts2P}vxKjiYntK+25cRvi9-x{@|unk$1A39k1$JTOp$4o znEgt1FoOC?283EA!lTd8{9#h zGLCUXmg^DYq7PB!@tGx<*^b(?qh2p!P&+^>h*#jtS8#;ZA<>4ZV~Wznm3p z(QAb!ouKw-Np{&l?cUEGfF3D-ZM9J7TzZkhfp`T8` z@*`_m6CY}??yWOwRSc-t!SH8z8(+H~FD+NJ6LDA;k5i{ETyKpxUar->Y1QKN>6x-V z9RsXHrD(!|oQPKrvFyQ}X`|Px!)&*l$T5RqyWrI7*po6rkX+)f6Zv9E!i#UXORl8y zxDFLT1HvbjjCIDN2QM>{XRw>C5IG;2VJ_&>x362wqcd}u_JW;AVLL?TM0zKetnYs% zNa&rd@*FE&aj&76pMDs5I8`R6fD@`=ut%3`h|D#_?3oT3L}4>)>)k5@JsK0zF?6(y zDj!85625U5i_`QwPuyf{_&ZNRlfW*MwUD@N@ybp-{IHYtE>T)8q>Og5bkOk%z6axU zb-|Juf!_V>7Y#f?B%gwBQ68}u!OE7`qXUdU@{dzN(~`4{cUvRT>|UF4v9sCOeSDS? zpaY9UiRMfQx|j0sNsJX<#r92w+4XV3IOHHb_~YkZ{{VFCV)#(eORwATpGlP-XK+p*57Opc%4@ydZ^x0WdI zDhX0^QJY$P`q?qfFx6&##p0vNweDB&%}Q}si1&jmFh1fC&0_2WsauuEA3#brz=J@c z{6>5HagdCEBzlBNM%2nJG znbv4iJx&13=iVg)B{c9>=?*)bBDFh#T{q@6dWVf%Ke}!u;JdFhE-J9JBgkuVfYIf? zc-_xs*jRRSAOvF|ZPkLDX!0tO*&IHlMq{ijJ6qlCGBGj+JR(9VR#3u(^)~Fd!7}qu!PeR%KMxJ8rHyQ@?Uf2T4O_MhUYluPNtL$l|cpk2~|y-a_rd zR$XX!O2av_EtmvM(7;jE^>3KVj2!fht6*!+%X1hU-p+){kdpBoMKa{B(!gY4K=KwL z9kG(3{SVl8U2fZf%KhmH9{mozSanC5Z3#)9jj(ZBjPt6c`IiB{g)FO&F_^ z3#xg^2Cb5%p_lAlU5PcvQH+c=K#HC{yfgKRRRb?Xp9Qn`0y5!aPQU)+nF2*FuB+o> zU8@uz^)=whY#R^9&cx7DCtJH*w%b8Z@$3r)j#1u-w3YSDcf5CMA1<6(FoxLZ7otV^ zOyq}2?D&Y7xEP{L5Apcy7;d1@f!};IljL|qfoM&pHj$(PXk4lvWjQ-bF4QR1gvL4v zO;IIGr%8REIZc}Z#d-=0Ekuu*yFOqO@9cN^%cRgBq!nPgsc!)~yk#^M{+bQHf|vq> z{RV`zZj$^|;0--G2avtP`1#l#KsZ_yLM#YG4C^#b;wj=A5o%Zmp_X}g+N6QV4|H7u zz0df%KqhfQrlB4R5D?LpXfhlByEwVN<3T}gDrg|0^B3cL?Tti3puEx9YH6+Tf(e;r z)r%y-3U5^s8VB)O5v!V@ye~BDPK=-|;*CanL&YvPvH@}D<>PRZyuf5qVWlu@b44@i z7LW@6qyg&-q^mddq|AG)b(Z)M7eN3Zg&BGg9T@IWoY0FtqA$ zQpL-zEW7uCT{kwkyQnC2<-?I-E{BjwDCsn`q3Eft-?#z3UitVA?)Ml{cdSZZsD73u z)CNT>sfdy2>$~5Th>h?j(7T-J;>6rL>Vhu9x&H`m`(s-5XD`er$p63b+x`%c|8G0w ze{(tdPXQaC{|4!t7jWVGmv_jA{fRJOHJTO{)*+llg|HPjrg}Dlbm*4ji6OYE`o81q8o-KteLHu?`E<)5Rev^zAM`dTs 
zU$bU32>M>m^p33}T22&1+B_}SlO5F@?)T2ODpxb+?nhEbmI3nX&Cc9N-%`)nb#r=~ zXvlV$i=jHA;o|P&75d<_UGEzG zzS>1B6W>;}vhuLi*X=5*wArEa#`(%zaH+}B#~DMSXxTs4nvg7y zR%%-U$5s!hePT!FJ38h&R;L5IxN%M(foIdJ7p`x8+&tGMJ>M3(?C;Bn>>E=f6+hk2 z4+uuUI^X#`CptZSW90Y-xAGas+O)r0Zo4!Tbw&U%;1UCzo8QoK14oaDPLROe>|JaK zE_+LGcZ3OT<-@S8J@p&PM`gHBsVL9-Dh-GtK!v5^9IUB|Ys|07VzTG~T6v!UA)$6s zfLk$JL^n!49hiYjgQ(L$#)1^gP=U37%fN>paIkYj1E+6;N=e&+SM;H*@Xf2ET4H`4@UN#P|z#2 z>(0$XBwNFdTf2~q=L#6$XnZ7AA~Qx#s8U$F9-7!NX&We0NH$7cL$7{Guv8tW9)hPK)CdH8x!4|xzol2zZbGaO$ zq=IHv;OBgcksVbuutw4@AzBy~mErfyNYH9fx|iL{$sP3y9&1^I?8eFpZU?Xbc&A8| zb&TCOP7uXh zXR#xBpDM=9#wshdifK2R@v6zpWQ2V6 z=z9Nr);Py7e0L?!pKb!&2!7mZjivPd(s)IH5l8}ct;+C-B?S1Us56K^JlBja!}6t` znVngKjUbMS8p`x6$5&1!hTsxUFE?ii>`EtK(<`W9J+c6|{s)pSUxF7&u4$KmgYzaR(w(*=6-NZHTp?aJVq|=wgl{svr*tC2E zwHqNde65(mO=l^1%!Xfs?>u~skIFDDw#$50`^}kbdy3kgF&P2K7HK<@o@ewN6$pnxnmm+kb_CPGA%wv zb_}R&fU2u)c5T{o_X{D)HNYm*Y-n(#EN;vv1lIg2;(QEbR1ob@iWE&1%^)FetwNR~;$JyP|)aX$z2szpV`j&~n+sF+C?J8OJK;seAE;<|M>>1((2- zr28Vl)UeMO@5cJ|tmr7{q?52a@)X=47D$}TG_v80iM?G*V+$7{{n};wAvrA6E{vlZ z*Zf-N^}vk^_8V5_$mOw=KZB~lPwESDr5tboto2;^5TZ@PUCcMF^FLx0A#+%-Iu@RB z?|Hh+#vm$=rk!ODjE9HRFLsGk==h|yF2NFL$Ua#63K@=j?}pkyw9Ewdy<$$OHp%W> zc*)K5W|7}vv(GAp5#0?&Vabx6vh0bBTeP;tijG&={ex*0yztE9ely?cUbPd42P9*0$Kp)4~62CB&8#q3v zEWNd(O@YL(1vnTyo~a@g=>q(g!b0PSn{xxp(8~)@S9TP@^1DIy7)M6(ntv#;GHYoH zWSb@XN;i*yoEAe6o$}I0a=xa^i-6s~Xpw&M{-i^mGQi+*vhDK^ZhiwK75KLcPB)7} zUqMcpj)jd(OvwvGnvr^nc@A*-j)<(s=8^>2%z53WdSz_;8dC?)b;%6>qUA#zQX_n8 z9a9}{M!K+Ex&-tfd*l_GJyRYJwZm1EP^eJGjGkYRBD7+VfcIk>!4Dj@#keYI=5s3i zOJXiPh}2V)#6K0Qq;a?<>>d}Dd>xr@SW)6VHA4uo8{)JJpAxX^(U)_|y7EE+C_(_k zG@b#tO;I+gr6gy@BCc2k))JRXpo($rWK`S2&$z}C(pwY<2T#?#p<`3?^B#6Xy3ev% zrm=$?LdfQ>dl2NN3H@bcu-u;uSL34|P03|ARs&q`h;BmWs731_fZYaAjq`bqc1kzA zHW+WQx29uf(Rq{8%=@ZThTnkene9qHl8WbgS10Gc>Ash}y; z&G=H6XUl>y3Gy0X|6mtQnB+|Owfc#%E2~v-R2055c+U3+-TI(8mW{}tNXsb`*Z^EX z5e8vVbnH2}OlcH0Dp9{=sv_~#`XY-3k^R;b@2dtNPQ`$S9J>mc$A@ImwjEgNMNbmA zVapX$mr>W3_GuwF>Z(|qUz*Q=-Z74{)6{81zmWwNslqkYRl4s>iTQ&qt6LzIzaNX0 
zRHSWOCk{279lKHx;0N!WnVR!=rXrr*L@4k*~^&p z)xMrYKWSCHlT3F#rl?(WAVj!=JBG9i6yY}(o{2+)1iL-zp7j7FTTmLieWIzdosO9w zWcCv7L2T~f+!QR=NMC6RXFqnn;=Xo5oWGH1OSOFeCnM+_zC8;QXca@92?ZSYNPVjt zZE`y7a|C71<|unSE#UVGfyPpa0eUGpN~gZFBvX2uo3Fa>yEL@$;@PzN$-L`UXyn#j z`s8y22;lL_5p6WBotpXFr80yMxsx5H`P3nXH4Ad4=nJq_(^Njns{Zm`t~Q** z5Qf$1tb*ZJOvy`4u7qt2)}Am0+ZqaJ!1pg@*q<*LuPSS}R96 zEN9+94-jaL|%~OE;i)<1JS`Q+E zQ7@rC7qE=kd0xC@jIpm4I)r$i3szW5Qz3IdTw9i*qK0B~-+FCCQU3hrEnLkVSwzHF9wluM>I5gO8UdP>sxQG&jUu3|xZW7jXdIwWrAd@Xpr$Z`u2=WZ zYsnLiPqr&}^Otu~fRbQupcT>nxY=8P<3(5VVLUnJ59z>eK*4hU#gW&#pBi{L2jYIf zNz$&Yot@02yqiMrTffMX!U1n1oN~SSN-1^X zUK|E&Yxqwc=Le7!yniP0NuU0~7 zy3#lSDaLiLHKW^N+M_Y2yZNiF}ugmDU+ zD0OEc{e{FTrtWDUQihW2(`M3F5U0r$Gx%4ai^jeYELoV-DW~5ma?6WQ$^Xbv%?%2R z#FrqUe`gkAfF6CEV~NBu5J9{#I=ZQQ!oWqi-jWX*!dh6J#w356lD(xPpHds(mDG<% zE>+w{c+%g@LAVU@CLM)M#o_z;UNqKF%??a)tn(6?E{4`O>jUSx=XO(_Tp*hScE|Dt+N^9<#if#cVl-OaZrm(|J4;U5zu zt&rMEVKZFQq+ek|?U)I#&?;JXM&JYFw3TGLC!5Acq$EvxeTv;fZjyW>2)NUCNB^lCgA3b zFTpQaW0(Emu|Ns7<&BBFannaOdaU0?F)M`nrfQU&HA5F;WJs0hspq{(X<>A-QNMe* z`Fon~64N=kX&B9NHc;}>iB!3!d%vsY*}`|xY&s3PD50BUMZqReE@Nlk0blxy*YITh zj>cD5j7;v99x=2vc#_|QrlgSpZHPqk5%7IA)Gdvtw>{aG%%5id7c(9e@qvJ{Q6|Th zz~)(|h49KWwaLX0fqhtp2qY|uh7Sdp6?BoC^xyvSlaqIDUK%&4^ zd5Sr3`s~$xy{_G}k6Fpp-6;K z%amib{#I|>j#bUap_m2#{yRvoSm<4k-Xc;K#|0JHv8)O z=ewK26*u4;u3##~0`BhV<`6$pxab>{yG0xlbv_jT^WOPRW`mKA?#xvAKq0bL+_LAT zIXz(J6+(%7FWAn8fH6n)Voj$ls-ONmKJA1Xw8Cz4)~01;sZ1gxeOZ@F4QnM?X~_eE zPO5mIo}ynCQPg0LGF}@8bf4e+2YrXXAi+O4A-}MI;6DNIe>}71@95z_pWyPJLl1#R zy!?|<2?Ef-{~~<&-=O9lW`=y{)bZ;5y={dz6&cp6pd~W7Ux2AV%&L^gG=db0PDi(b zO$XhrK(risj13DfdqlmSnYW!;Jj$xfI7{F58d|E|zSnDY|5a+-yfTf?<$2MKX>xy> zAK~kD+9Raav{7H#)p%a-(B62U_ZfOwI>puJbW2#7{dm29-?Fpu^t+|e&3 zjNAL8tr??N{K%KQx8-Qb=cu`1@yeT|!562`e%TMF_wZ`HasAiTPoEVWp4Ky(=w_+_ zr-No5Dhc-&k@C?$&ibiM7VnTruf%Wr?x^1X_U57LI(bj>alP^P367)KNe8l**mXA= zj)XAjvGGk0&M{6$_T{gK`>x&4wr74v)JJ!ZqrF#;I@x{>BEn3NY^4kGw9-pE22(=p&1U)F+9^b_iL zr(Ha#t+94*!u5uTKHXaqe{#&}B-b{rM`SZbo~u5q5=iIs9NRa*TK@sPa7GryAat$0aUP&Os1HvPkjiTg^@JvO$!Dm&T-Y 
zt9Pp*I3w^goyJ6KJXY58hhk|`mZ5k8wo~miSZy%V{6T@NmOYU*YC0$X2n=&1V%&O} z*M8JqH%Q;^vMMN5Ea`PxA5kFo&u|z?`Sf9(w9j=2H5ZmilsQ@<(`5cQs|MEdJTbI1 zX01g6d-s>M;}C4%eHpFNc!YtUh><^kHM@QK?A(>RT}|TpG`lIp+%YV|FdX24@pxw zXQ1>=f*o1xG{&6zPpS9@!+zl-rqarDcGdb%(JF7)SSZuwJ=zVCn0aH8w94Xu?`edFkinbqcfZ=4s#`C(Nol})o$ZF zJ+M;4pJ!*u`4NM3XkcwHUT5cSQwxFFn5+q=+o*8xHMJvy8E(jgg$u^EcJ(eoVks4i2(W?zqEvoh4h#t$wLsg%g7`VjxxfNq7iGe3qRus204-@{&nG z%d|jl;5h~5RNqJOX2F=6Jr7iVa0T*aTClh5(CILLfml}N@C$> zsp^~;wvT&0g*{Neef-1qs<}*6!NQ^S<5#cALBqPaltJh23{1Y9{0uZcQsf*fy1u^b z2a-M)k?U;Ngk(}JeCyXN$RobZFSEYwv-7`5W|qSjb8o_|n}1RDsVQd?vbR+}cL1{6 zbZUqUe5J}2xu%zE8* zmgj!w1Ncr)YOa;0tT7Y#az`B^#FU-co1Ty7`+Bno5{uS(Cop$?3)4v}z%=LA)B2?P z(q$Nimd;sRmg7T%UJD<3#^|gap#Ezo-f87G+ssq#^7LBBK`)|4rNEn*YJ@sxP;VwhjjkW9umSgY(`)+VIN)z)F6n%Fc#WLcm4-4Db| z{@Mr;!JFF-vc^ex5FDJ!W4_!t`ItY?r<(MtmeT4iw;)cnn7*o2$R$UC)Z(~uV&O1s z{ytd{+jKl~?%?jqy1P4U2>ix5@6!o5EYYq?{Sgg*9v*+fNq1F^0*aMFc-4Z`s~tHn zL0EI}mVbn*vXxide1`kME~mRc3(DH=u6H3nr`4%l;Y5P|Zp`mR^4hty-y0=CBRo&z zYjg#{-2@lc@#@*(>wVc`772;os7ffTzr$0-WX{}SU#?NN`SrFtRnn~oI@TE6GH1>& zPz{IT=g!Yl#@{{d=KM7BvUaxYbBzKYX1r-ZUgUGX)7`0)Y!sHh@i<)EaSHJQQHIQ* z>SK-^msBQI%}T+5pso6E&ddH=n$^}?%u6{x=yRg5TZBiLtfB=)pT@tA?oRy{KA=z zDxxBrDXUxQPXcMhnBN%d*jdNLtW0Gw`58@X>}z&V6UOe(waC$vjoZ1m9P%SsUX3-) z8Ned=X~%7i6?C!n>FQple`&NePIKb2^GlmSNh3^`jwg2Iob0}TY+(Memq1|~WXsMC zt9@IWb^n~^Spvs^W!CTW)a~CJfmL6mRI`@UAAK^~;ZH+Q4O@4TKdv<@XqufVRlb1t z7&VnATfa4&W1Pn-4|GdgHs0Nh71WYuk2rCh%>SGkz|?>#^m`(7mr5CRh$Z{|FLkFX zg_l`H<234d7*7*_J|DoB*#>`h4T~wLzn&q=xU9_D`VypTg_{1}PFgECti98I{N))u>Nfr#4 zr5ElKFbKf|G3kq1LA|xDRyFpCv$37J1#2~z`Lq+_@=3V+{T{zk z34h4UW8(C2_Ch9mRzd6VDeJ->@3T>=A5BfJlCcvIBVTK^W)XV9MHTI2&jIXov=rQ~ zg^u0}qcivC73inqHxqd|^iS$PKN1Lr%VodWY*&)k-Hf*wP1BKVn3JBGf9K=zc>yqY5&6O|3h-qNp9YcIjULii~^5vEa;@SSfZaxwWCZ=>iWRB?e$rh!T!{I8BLKKPL+%{yjWmpEJt%a& z%2o1?0ZBFg(`E07tKjnVLo}RI4ol`SNcqa1K7&pQYJ$*qXJxf6^d$x+l83&WrX5jw z9bz!TLu}R>Wh<8Q?ep4;y4u5MWjeODmucx5o|S?1tZK)Gt_Cd_J;~JH?$+uJBhE~1 zzVNt_%*5B7syzNi@KhPSsDMT#mW;DgHsBd9V-vj+0c;p%DTs-G61)|Cr#HK|Q(Ia1 
zz^?lnT=gU?kH|EnKTXI+c_jlJB(Oj$A3) zIMXvo>M2IV>(+km6|ASN>@bPrDd{K-EoKs~vpD3rEotWB_a2z(b5GJ7d!*IKJ?K}- zOJ6KDRf$At{$#7E6XeD=Fi^=&*ZWqddjyGT4tF;SnoGBDkV^Z&rg~yH>`*J+BF``v z8?ZirD|-4HsgJ5akFl8NgqIJ!wqed6Hh=ypeT~DqoRD>a_}#^;LC&ct3*F>zUz!K) zcns&&;klNvpFLKTMe3f^@e;ao$YI46$*-b!ZoKQ(*4KDTuti@7ik>T*JRm>HX=<2% zX22UWdyxi6 zFoC^OjA;Qac52j$s%T@;VwqVv=vH`Nv~+H-P~Az#-X)!6DTbWaFy*J+B|6z=6NMB; z5n;~S*D`^vX{y6$9$!W@B_oh@bq!{6eEN%ZccQr8?S_FU;glnZaq%80H?>Qw+|8cY z>`PPFO@j7DybG$QSOc7&Wt2%{krT=L@U*rBgx4ksN@P{yum=%uWq;~^SWGW1seuy> zs>orG>mRg5^z6si&Ad1rv~5{aQEG_j5pzuvpMwpxgwMeXW0pDEXw>y@p_4gvx0sr$ zis3X9ue_hcYx5%l`d#u>;%VF|K!GDA94u%bFQQ*YrqHO&K41C0{*2E$q{UJF?W_rR zdHJ;XoU4gqcorfexndw;e#(8itGO4eh~bs^)CV0weje<3jLGex57A$}4U6QHUIz#g z2hB|^ik92s(LT$J7ws8L3acHTn`XWH`D^2jQL{<>TUK4a`;#=m)7%QA$DAVAHWnj= zWcP|FZuAq0`>!WrLbU~`*Z-{EK zMdPngQ}$1T!oA7I31nVMyw3M8J|{(L%$E%*nz7j}RXBY^?$yy7X3KAD(0o=A=G654 z-A+EGs2UNr^iWlAF&Y<|LZ(%a9zCR0x#Xzf?s6pFN)pF#6QrFLe4+Ur`?yD1{C z`K`K|Jd{Iq(f-;(8Iw%TM?G8RuIrpV1^FyN6B}P=W!*A0uaQ-TkfG|!J7iZ#JBK6P(bzlb)f;PZ!>Cw ztdffT=SnFw1)a1YZrqLUF1{A~GAz22D*T6nSjq7-^m%OZ37Gs^bagqvVsL%7zjP;J zQ9JjJOZ!fmwyC_{a=(FHN}QWFR<3ZJX)?E2YOW2BW^m+)bz5InIDx@>-6};G>9dag zC|*v%cxqwqnt8ZhX2C;ibr12dd-h1X)?w=Bfqc=8o5NX}Q=*Dq4J0z7UX>FPS_Zi}OY!Ngf_o zD5-VLcXr8NytuSCckf>6coDT7Ud9Rr8)#}@d8(_t`dr2V(S4$#hl=P8pwDismy6ca zlcq41XQfi+cM!#~$`x}mo)Qu$T#%yVaZM5l9-iS_;LYY83x;G#6yW7rrZ4uohCU>n z8U9ek)x$n{8~%-E@F}(wC%28;Qx$yO`Vr~!d;B)cglOc7=lUNk<5c|de&IC^JMN6$ z=PQl+&4u5~w)^AYgvC8~`yNQ{qXwPWUl<0| z5+O#?ik+)?Z=(YnP(_*i1z%^KDiGuf8=mN7hx4X4ak&jr!GJwR@#Q^HCGlQX6rH!s z-1=*n6enw^er>PYp*3qv>Z3s+-kJ7@;vRZqWY>i3_JvKSD*;)>MDq|lNO0*;0NzsG z`1$Zm>1dY9L zJ|P6ghhgLmKO%-~^@hLH;SLr)Om&{gVPzuDhIPI6&fYev(KBgv)5XTZ%FW(hcaCY($9*a#2jz&r*hw^DWH!Zwq!_T88J$ z0Taxkp4CYeXZUk*jl+Gvf$;;I1~g;kslcb_AB6Dqb4#UO=i4AU69y;4r{9CUs?XPA zzx9#TIP-$ig+gHLnRC~~ZB3DA8HiElOHLJ>JUT|&d zFo&Pg=FAZDQ}ZLrS7a$ASI*ezOS2$eaBVym?$20jA%pK+&HLV9y~#IIJ7DRHPKq1q)9#_UPVUNZ5&0_o4THCV`JKI$+DX*=W=NygYBSAY2W{);eabdE7 
z(#<3#UauX0kWM{jk8`6Q?XSUFP2`_#-$j?iM)MTK9;^^>ilV45>i!ysZ1IxRim29T z*lyp|l2xo35_&7WjxXH2!&;u>Rgd?O#RxP)L91st?Z%HBU^O!|Ha;0 zM#r%&X`*6gEHN`PS{5^7iP>UiS}Ogg- zXUw!oC#IVb5N}*Mc6{d5*3z37pHcmKw5*r`T)k{5QQVreK{dCeSL6gnU|ZwlOQxoQ zsZ*RiB^V_f&0i@<*FXh47UCQ2rIEwNdWZ8Us8ZwwwV!A9Ji>u_A6D{USD>4;O|%rN zS{(35*=pvrX-QSPCz(y#wQXvEpFosp*DU38E)|5=G&#+=s1@}mb*}865~~g4##Xhu zNBz#6MjwQXp>e~M>OVrZu8Ts;SQ2(hTs`8$eeOp2#F=`zgqR%!=`jvIzV19qWU&Y_ z8wkP>UK2dWQKDIaAe?KL18_Lc#1Bl(sBJGV04!;+hz<{8S_j?3NVF2;Q0Nq}+xpcd zSTG(6Yvy!MyS`UFkh{Y*Hb{2zfEQ7=;0R%T-bZ zsc!hKRsk)>2zF8B%u}emK~>5{Zk7JBH8}NqwS(n%^lMWI%&;L1vJn#L0&=C~2sFku zC!cH6RI{nzl+;w&)HTL&bSia>GDaB~pY_8QKF~u7L#cevmr~H+3BYG)TzX}eEuIBZ zH6^;lDf{dsl1Qot2Iqba7(yCNOcjbSLQ-%EX7txUi@J^iS`zyU~$O#5(2W%+@-c6-PEv|t6{X6D_$7x+)Hb$pnlf`#Cf@&h=VzJWz`t_?k3o?efAgITjgJH8X<; z%`&0i?OtB7N;NtSR*|VcAjD2NjK&w(Q=%E|9c)$9Ks<+f72{jZ?m=P z{L$t9(tO=|WI@qR!(EEG#4ok3nZ!pRY16;GLQkw4mau`x`MQ5PzeVzwPU z^4bhBi$vXmp*68I8~`$9=uE22p*QBP5;KS?6sTq*qocm#dVaoch9Z=l^urlnF2N6t zQ3D|8M5@51Jss@MlS|&x_Hf`_wjqJ_)q6_xBeqX~egkjMnCvJr7Q2#Q-2w1XQ@SfCuRE&24va z!t)L4^M#s~k_qN}c>HIFztbifWo%tCsuImChult4+&-p)fO`AqX=5(1?i^V)6>atW zAwzOt!+c=zY$1pvj=;pq{skSwVA&u9Aj_dnbk`!)Y`koVf9?3mvuQym-c0*z1HZ#o zBK8K-P1f^;G)OTV`!|d_s$7g8nl)5nA7VWS_v1r;>hbN@NV1ix=;|p7bBlz(RdL{i zlt{T;zj#J1!+8!0O%xOV0?$BVcSXZWSl3V}V!*<#z?2WI*lvEZfz8v*$F>EpFqJWX zgUq{ZXITkG?!&@1Eoq<*ZW1IaPiC|17v(gO0}E!uio0MQhri0bx>y{Rg1{ae)uaoL z2d8inO}gqf*Kt{%(09xT+3V&9DZ5LyR~bFlP5UMm^^7Ak2AYkespuRS(dhoFZF@^*bDibeuN&`Up_C$1#pn@ zlIL;uWs7ByT9|5ea!FxYIm!A_nTW~xqZ-f-=q+T;DjXsC;lmGN#z;6AdzFL>7V^*8 zO|auowRmk`vJ=c7+1zPgP`%O?iw zNwrj~Ow4#IH3; zvNy@d=3nSMW+zzNv~%f}<@KQ;Yrtp${b3r}{9cx7l{`VL3@P1ai(xV89w`sLK+z<^K*27RMT+*LcAA8UYewcF{POQV!#LtGX{1hRozs-Qep{J< z6&GQdXFwsDVNoeYUnFz*Sb0$BQ#Pk@^4=w{ocHK=DGC-ShkOK#zoMv%Tlr|eL9eyj zUYyOt3BKd+g$`Kc<5K}OSzpIO$wXG&G42r`i0 z$L!YMZD@dCvuuVM=Ab_=}AYGIGcnE1|os0X0 z9_i1CPIDMoZTDs9e2y{_;GoiHFXHQK|j zr|X;}o9xZuIGDMx-Hu@{f54NitvPBJxXp<(w+o6`To#e^i#>!{Ivzv&4z(-{Lnf_a 
zq2D=)*0(O4KKVz0-y>}jVdYzGj=i7!qhtsLeSh^KTzv#;0XeyQnFMYa+7{Uy#>c&? zjMGlLM;AU1adzUh&m;{sne>}d*-CzeqGrvGAXUx!N7aq}sLQs8tqP(gz#PbSJw)~` z9vX4)L$_^^cz86PQGU;TU%R#O#DeLq5(IJ1NzZ54Wic^840~G|WGE`Ebn-dF6qDK| zV*VMOP7HdbX>pYFlviacKB!!p!%&xeBG;E6tdMo@_w;lif?7CwSf0c? z9(gPbi~`%HVCvIoUue(bxk;r-&O6J|vxz2%46}%~ zxk2;8A8N%V-iUnZ+}x}L)QU2b9dD-~$+@oei^Nw=B_gbM_8pBKzwJs9{HCz@JL+^o zr-Qp#b5g~aoa*i8^U{|KdcSEcI2y9~8n9Y2OAJdvm$rafl(T`d67fjVWCQ7~vug}S z>yE|K!Ow!yek{;OeMBzYxWRAoLAoM}1A8#__9kJ@L2hm&vsE|eFR=9n;gNQ2V9_{p zeSSuz)c1}q&y#+(*fbnC6_RW!7wgEftK;GgNZ2UQ4ISC*m<3b`*nWx*6#nM0yfKmiE zYSk!Kg$9#B`p;%EUowKk=eve@h^1P_aeLKpdMQ*gDYR|uA~^dn^iW+k)cYfeZzhQl5VD2j^xRfeYEQ!&l1vJ7vP^a67kBkWP2iEzDPjE z*j9Wy_|9{)>AL;6`Z=B74LVzAdV}+vx)|}qZXc!NT{BEmPwP?)-!~3X% zjId|3XC9sYBbR|fy&nbNBG?dmy2th}0sr*DZo%z0AJt61viNZpFvM_MZ1MF)>In84 z!_lp;TU)?n`Z3q|T5#nN+wOpKW+Wsq45p@ykaJbyjB>0PzmJ^Q-yAA0|CQcHo7cw- zU9O{tUe?k$qN8F~^X#W`Ix7$Y25w6`_ zBrd58r4oQcioY8Xe0`7IV{o&9CJ3+Ct*yvbpcP~bZMk5Ua>8loELX()X^l0SXTI%$ zWHvl4S?|KKUwN$+xl#O)4Ot>(zDB~3YljR`9;Twa0Kv4A1V84jIpZkBl0CxNoE2mivn22h3;N3-*(#idjbg&2;Hrnq=)wKn z#TA={B7CGVFToq@e%%R*S`@-HZo4r-)JYQd?|5_dJPtoXs98;db_n*?n&Wn%qWE%G zFb2N0L1#SmdLxD0lSFK|$^O{*B+IGaD}DaHQ1C9;YN>vMP)?cL7(~ysh#MY)BRql& zen_C~E%vcwvkgWhQ+c~0P^Rka;omCt_X?75R4lj^C0$+)k*P2$9&q*~EQ#y{gboP} z?moMc!}umQSdoCqP5v;2RZ-Vz+t{e{^&x`exLVW1 zLkXrZ`m846+c#WGendI)?ux$hkXX8A;wbv|YkXS;Dt(Z9j#u!)=T|Rq(S6DI!}COa zgN(dch;N8_JVWUr%N{*yxUz__x6RJd~CtHLo^~(hJ zD-zkVoC%wLQC&K5Pb*}LpyIi65Dfb+Eo2P8A0?=1PcmRUMO$TK<8C6QBTyLD+!AZ4 z(>zd;Z+|GtO{la!CI1(rMw8wan&%EWyolov{knMDq+4xa854Y*Z|!diYs%-+J$*uw z`Srf84h7HMTO4XEdLA;z;1p74wpq{5GLRWJq;8wyMcv?38zY+$&ITDD{I9e!v3CiK zK(V^jh^Ii8)?PQkbDi&mr4rA2@32oNk6JjcC^JEBCi{cO{ztTmIUJD-d2i#1EFlkF z8(M9W00Nzr=7~DV_BnSlmGQ5p7d53)L23xn^*K16(x?t}+AjxmdkAH3`Y&6~1z>mP z=0~1-s)Qia_TZd!%wqPxp~oE^%Zqvx2&bxS z%&JM+6j--u`=xY#)V|N#<7qD3v&1r@xmdg}7QYL@k*L-0{U9T}RP8blLz*(PUE_I# z9TJ)`BORd^dAA}HJ%9n<+J~L_O)z%*8>OrkJy4YEpq)}Vx=7Adbhu-XofALejdpoD z=n4Fr4cDGo<_rzq9IIh64UMg=0n~u+Coz#?Dc2v;&SB2g{T+Om<`Mi7!6_MH$4x63 
zT0e=38YtxKS7?;r3Ma~yFf7>hei9(cifl!GEIW;QN}^?Cbg_6PPK<}ILa&%lom8XJ zCIfTYU#}2-5G`XhWS;At_xu%I)@i;7S!X!>!6?~`8zuuokFj**?a0iB=qqDzb7tUX${T=M!pMbzW z8;_Zpx!C>{JjnS+m+{|%2mcfwQ~pP3{r(x!0@nWvNDG+%#lruNw1E4+H~N33+neiy z#PTn^ChG?Q!he9*{2K;Lw*TEZ3;sV;{bK%q6_otnSE=m3#{6Fkzz-Dszl`u-M)7P?)?8TcA`q8erAx6xzZ5>#WWlQsKE5}J_BZIgb;}KxdxdL-tm60pex*- z$Anu5s`n6>Re!rMMFMv(+)<=EJLpw^=8psCT67cW%K_8|@QYiJRZyZ|vjit@gcaa& z--TWCE;-`}%GMAXz>5_~`0OiJ;t1@Q;2OZ2b%fn|8R%7^YdHPS1(haA_~MS)Mv(w) z!RNh){Ql>?L_X{YLVstw@TXb-+%9nbE7cayUzsdFKQWW6sjZo_IWf<_B$NM``TrRb ze71jrrRaaHpunFt`qP&GMzzKB2QS>;?7MjWr-?KCrDgj+wv_)@YGwavqrb5w3(MaN z41e+A{bMWhpV`E+{ZpIReaErj;n=CbRQ)3?aPDAX75G=m zeS8$bqYCaJ5NO$(MxzRzdJ9qE{}~+KNA&MRg+Fcjr}a6w{wg7G|3!xQ@1g?tzxe7u zX8V7quFdryHXr`YW|sZ`p3STd0zy3G-@94x>&kC&2p_ z5j)d*v(M)0+iCS{#7yrVHmb4a{lFTq;^jHbk(=|CA?EhM%T*xiOrT@O6OgO#K5&fP z*Lr;$Y4^I9kZG^ud5fL8+4-|U`$5-dqMOb(xc>L}y8U_g+eGDMufu(|zMj*uztttL z{e>>{!$$Oe``34b{pYX0y}+Yhy4z8Q-}E<~e`|1kzQP8y@IR(%8}i@X+b|QpKf40H z{`MT%>e+gN`DpkIcDTA6vER7;-uIR0_h-PbPyYMwREGGTAj2x5ghAJL`Jp-QP{RW= zJKtaEV+UgSL-&;kXNY|6o(5t+{jQF}?)LV2i^~ll8X@UH@p)?G?_P!XO?tl@xWd0m z$M+1&1$>rrZ@$XfO&_=o)evGj&XEDZN-zSJ(pD{iV#T$iNwHKldIG<~EW3PAqrqch z#TfxpNvpiUA|-W6iXfJk;NT@`=t(K+!WO+Hf0Vq0C-pvO7?PBS_CP5koG}KuT-gRd z7G17JE$(*N<3%TpV1RzQ?r%}(qmRR+g*VKvvK7>Xt;&SO8vAiWT=}LE-vCzS(jaWgqlrV4uq2d%8BeMi zUm~gQBsbTw8ULer5%l?T4EqI@#M3e?O0}fHG(5jxq%)w^FSWk_@JNLX{<$9@InL|U zGAM`;hyiX^=ZXQYq>c;Tvj=F+u8IKRj`9(P+kT*=cFQH8EV!L_Q z3TO?Za6}5ELwMC36g3%5o)iEmM_0NNsV9 z(^~XN8tU4cBu#3Z{`qw`-TlzL2XEh%MCEvt2}(3#`7?*;1h%~gkxz7X~ zuu&PKl%3=r98^RTCMYX|aF#s$=en{RLq@@B6jf*546?eBncuA9y$oXfHyZq;A>a(R zYfntXvf*+ok}yorladc;5WOX(NptpZHY&G2|I*^vIlElXm}UQT3t4Ic-+#=yb-c0(amd1c zKry)qY+1R`;(G^>G4!k394@wQe*<8Yfl>;%00^U5U%hXung!I62C1Sa8G7#C|8&7$ zf`5YZi6zc(NH;K1iCfdOq>|=okym2e~nlEhK7MqWdz(vdhpO>Oh zt$^YM(rVxJ)Pc;lkFz!LLX}u2n+$(K@fYJ%l%GZy zG9iXIn1m;HS~`3}Hoe&$If_Ra-*pgsZS-$CU<`qk@g{ZyYcfTx7K`*(QctJ=sza7< z|Gsj>lhEh%s%Yqq)csW2FRk+he(4K_#hFk#@m444R?TRIS!in2j)*Z(EZ7CcEU0=! 
zKn@51?}EMS4lu!#R2meK;}Flm)@5sm!46U3z{}IYz;4NJotog#z=8G9OVFQo23bv0I zsHj}~x*N*u^Lv2qRX-?tfo4%bV3pF^ zeZ~gp-M4_xOu&gZ*GbJpQY)CVE4^S-^;uea3l)BOVN_?#7}#&5 z$pDBlK7*pFa}WQz+6U17Mb|VW2$m}|3PnSbmcrY1B!Ja;Wq_SQq^kdnICBQ0SL-&Y7SJILRH`eBo+YO<4KCJ zYQ%((DBc7QhmI5pH9e!$y!hlY!a4w0Lx8w)>S2oI#fhotU;7(kEt2bBLNq3})tjw(*_Bv(Evy-=8lQNK7ECJqtOJyLCE%G=fKbL4AGd5G1~r?KiZpLKn% zN~gJWo0+?j*I6slK7PHCm(xc7g~a7=_N$7$hSabTr~N=SE99EACqCXG!EtUTetgyP z@D=8=`4)a20n4tCFUN{`4Nd_eklDPYure|=6QEW;beUcHEo;Qxee>0rhLO4>DfNgc zRJ>>~?b#I2oQ47s$ug@DGs}tY$`^S{=A-5NG0}LNu9caT@Lwq95L|zDF3rhVsL}De zGypWXo+RD5=?85Zwd~p62AAW_#w-9#dE(7N%ae5)gAzYLJtc%nNFO6uwJC@mPl%xe zz7Mo}H5^|j2D=2J1HUD^ceDc^P!%X6UGyz`@ZD5#jhVkm!q}a2Ys_bbP%uS@5~eFR za#*yd)m4jBO1j9*RrzMhevR=*O0jt(v{R%2e!Is@*+=3klmD@o-RicPZP|hL#ez<} zKi*$L4?keMX@TE(-dze=npa6Q$e2vv<}+d<&OP^ftL@y*ZkVk^h*1>b9nBa8J_+ryju)&)o!6Fb^pO}FdGeH z&>Y0P>pl>kO4ASlV*gvBE_tG|6}pIIha~4Oz%+V@E_vta*B#ivsWb=?+K#x?LnbPg zEWtrgrRmsKSeWiyXbm-*XSE{bW=k0+Np)nw)-uDUH_gU&ph9n5J9my>{!1gLUQ#As zkmu`=ee{CCdZGT2XsMne*8moZ^hH<)hnKRPy&4J?OsWl;d7c|Y1fEn8IArAPE{)vkvOA=HWa-=8iHPVGfHEnWEYE zAyqL@6nVK*&RBQN5u64ZUJ70F>AQ1;!#2Pm9U2GzjwFm%8axozZ}de0 zM=zh%Z?HD4vy9zEVwn51kjq?n#&X4-S5U4USt$S?b}Y9M zQ{6j(h1DDp?1Y`TEo@QHk^6Sb|=?zpv_pXAnObAFEFyYCE3_aGc;yEZOlW0fDI_?yPJ8qkjO<&EU~X z?_Xcb)k|?l1c^cWcx~+fbq?5ZnH)mcyyx&Ef<@zjp|ZWGsD4HwlYxf9;z>Dm$Zhrq zc{UQk7Oepoqi_bgQK2H22UXQ^%_SGVtano7>+x)$6gc-?w>p2mT+uS>yB;2>okb!! 
z?=^K~soUG|Z&|iqINwGOod^Te(kA7BQXDH7PjEcevj$6q#`Xt0>^4p4Sy1uz&YW7j zgJZ7Ml$rUPQSE{ylb8X9_95m-r`{7|mB*$ulQQ%wPCK~RSW4gr#X+Z1%!jkbUAdOh za=~%)#{9bk?$@DIDe?AuEO(22t~Zvbu%;8V+=zDo&pj+c6OG(XS}NOUxEd2Spw6L3 z&N;glxTV*lWmu?|Z+WquS;?-q`Yq*_-$6-)@#Zd>*U)63E`+S6BJ)bg86EjEEDcQ+ zx0-*6!ORZCGUdnt<3|*t(lKSbba5KlpB$18G<#i`(?&yJ@Zj;L&}+bmi2Brof|(A- zpdy2EpF5cfM@?UNpy6+Q$`zO)sz@g}cG@Z!VOTAztz*N$3Cs1FrlkSr0=S>lHi)H8 zg$G?4KQj^iCD6U8sRJK#m2~Y#li#G$fNkJ60SGvWHms8qysPz{Qj_%;lfSgFp5l2VvPMs zi33Fe1)5@tyfEWD5mhwAkn1dr!ltP7b~_!E`JY$a@nI#F9L(5WL4Q_f_L+6AKsxd- zSfC+vB{EIsKO^dC1$Hu%bB0I9-7n%IF%fm73;wwz@D0MkDXbljrv`%X!Sx4afGO79}6E0P2{TI(h3(ySBWpn!cY*P!ON7mJCjOB@s|)&OTa}F zMTx>${*Dy*kgg^VDnb91ZC$Uz7oI_ypVdDeKRYFsgIChB7A4hyV>QI_1FOV95+0t% zqp5i~47<~{8}@4SryhK`o>71-I9Zdgkhzc*0X)&(Ytv~MBlT8721;U^KtD!;suUb0 zE=WoQ{EPe<|L%$PO#al>W|YgGBO+qcH47kwm51jk^n^`%XK9W~|C_sHCq()UYj|IS zNT)wLx1a+fdcMQiK>gV@%K}W;YoIv4;UcqI{?EBUAk}=_33SFD4GFXr25e%hHSE+0 zIL7D~Yl@m~?GtN$Xc0Z5Xsab<=5zPcERDO+Q5lMoTqY4b-J$PaP0}XZ^T36wP3^^* z^{;-N#V#<#yMc95Fn?M~y9_xVPNP@w?=e+IVPSmTX&J1%33?YS70R_YVH3}lN9#~1 zKZ-y^biw^ty-qDDsZSo-T_K~?Os#Qg@jUq)mN>b>h8Yh%o+N$&Nu-o0tpbVU)Xo?= zNrB1yhAhUyL~^XpK_w<^38DIml-FA8>D=TN=T z5cRx{yqS6pEFd!^ZyVDV2%-o8)5Cq{Ro=;y)a+UMmQ1?){mTxOPW;g*gy~M97R1yPV!1z|WE4M)$c3tomP+8P zvyO0mrCZ7DQ=?wdChgExo``_}Jw}&4Fm8v!l>HR&i^|M$My3%MT~!cLJ{7IU@$P%5 zB}%S|Mt~HF`*A1g%2QO4h9iB2{7@lPq6>tt>DzE<#>oty@I}(X&^TsCW8x`Xs$CFt z;yzsZ!L;U2N>ZN$6izWJ+_d~jNKh-rt9-yKo|cJ$KGYSLFRB<<#7_e}m;R7msM?X{ zG$wN>&X5^8Of@U=7@J)%FbhM}dn*%S!5hPet7Y{4Fk?z!oZ+h*8hm(_M9E@?3%h+K zldPy$JdVeN84fKhoE}(QfX0DAw9WA308^wDdg*T9ZXC8v%0bh$ynZ^iuPphGNOw(1 z_Ha$ryU|%6Zc8*m4$ECcX#btwi^?C!;Q={|il8c381c_PzC3ou!f*Z7 zCxTtViWv;nzHnbs*$(|_S);6Dv|pJ61ubHG)v#(leTDQ~S8dKo+1c6y${^}D6c{gM zVCa2M>vSqxCT0)GH>5;OJ z`39DErije=_c^Zcu0Tf`tOwlh(JJG4UiucL?JwT(!sf(~VGKOPr^W8lL=|Oe``6l9 zPgF`RwBst{&kWMC^XU>>b}@f2{UB<_XhzeVF!f^Vg4p0_@xvmDcIgOY=LEb0y zgpK|h0^1Kc$({B4y;dX4$`)S6!6N!xfPx=tfYF0Tuk&oMDWCFM@!myHfVNEY*?*9o z_RG({UsFdzdtElaHvF|8Nk76h+CxD~nxeG8iH9u8oBKUCzi=LiGC>u?Af~_s(CVN$ 
znhsLru$XA~Aj?b5L(J*qCiOY`QXfL0S#h zi3Z#bA@BQ}nXBKEQnj8j(BzB~hM-0*^UIf+mDO4{$8>&y+m1m~Hqs@)^-c$hl12BeTIBa|!e5?6?VqzOrbKM|cgt;r|F zxMhjyFfbAWG@LKL{%&k*b&T#*b$wc`aX1wFQs}}de9U~zw1-LALmr%rmPssPDL5Wm z#4!?M4ZHOOMmoRy+tD11Alik2PK?HXyD<-&*}udOAYEoZvQV@EI5+w)Z5R6ZrqqeA>#bbmrJm}tG*x5S^ zFR{4zQFkgg3fO0lc91krdB-fMH2m@^1{@A=S{(}J@cW%6KQ0-e296Foq7eWHiL2&c zx=$;wLdPqU4Hn|e(Wj{EgRIln6dgDs$%(=pwdUJ@1)3G1DmbZxo@%{=*itjhc>L@k zV5KYOHQWPbB9OAbL5B55RD`DuKK;`BfW&| zk%9_~OG!hPfKyWguCxqLK1f#6D_0-e#r5Dq>Sb%PUd1~za!1}n&LlP)V%>)@uqc)# z#A2(kv}!qwp|N?iutX8eV?%lh{6r4AZ@}N$xdW)*L|Vk7U2w?nsn=Qd=q=_f5I54l$yjsNRltmO)99R29e~ja90ROX-Jyl&kxQj(n1iU2eN4NC@Y5?yZk_6^#fQ zumHu#vZo+F6NZTpMI{+<(K?pAjBE}e}1RBStACN z90t=fB7_(>&bjNp!A^O2llmBkr`~bd-_U7^$`~dOE|V>^Pb!`DHYr*_=zh->Ta$|ubN_H3=@m0GsL*zg}Hwfd4*^0=QG4cO?>U1l#EZNik?$2W@#jQrM06P z(Wt0N&$y^Tgy3L#6Sy4}Nb(cS@;QJF#~kC+(_J|fSN&*sLvNe!@OTdL3 zL*+(xSg?YYbU6SBZLrZ8&Kh&DP(CP`b;H-;*Zvyrvrjxoo)kY?+WV+j4o_f#b4P}~ zj9XkKfFuG(jppd|O7t88WLa^kVC1n(diSD}V=ce~tcdqo9HJ0oPtKGD_6CEZGLmp9 zIols$P{D`Of4;wH4qDLwrn9^(IMqMW+v_LYKm2!iLOG4l2;JQy{7DoGd_PMIHb$6Y z`xve7Br-t!L=-YS3`0>;R!=$v*3jF5rgwY~uCFfZ;>Xv{tj>m2U|(ffDs-o#RUDlFO_(ps471*J8ny-zAh-Gn44d)oO^Al^Ak(u;kHVsg!P?<75<%f8k z9?&(`tLUxe+A)+1kgKC*R#vUX%A3nunLm15X?JaEBdI)p|E?yQVR`p*{p%{zpAe6D zR9Z>`6#Y&(95peZ(bKi-#cpx}azHBm92>_GZ#*-4>W;#U!{8n6`3HljIr_A9lKo!pJJQ|PLlk<$fzj#Au!=mXqiFO_Ybp&>#DE0cExBA zbd9L9l#DDfbXDo&rIf@+17>NfrLrc^5sC%bDca#*2;GEkgnnj2eVaJX4#dt*LRg5l z_t`(j{^|KkKO9@MpBIpl%&MSRBAp)VkJd=CpjQnfJYdRs?e|HVDos0_z2_{9qei<` zatNU3iA%uUZ1<8`AXw&`L9O!2dJ7RO1%YTQmx9GXr8I{TG~0=C1E|A!Ya8r^@J=;{5C7#0H{7bW?i+P{MLDg?2yaK1f}RLrvh{$kTW8ol~)a^cc5m+(nV&h1NY+Rwwxlm z$n1bG5K*!P89jyZw93_mnc0ZG#Ze!Nr2FuAG^Vx1xGAsFS`wzlu#|pPqYkYPv1bv_ z@EX@G5FF)1D6$qDl_2Y#jFXJZJ9WOo`CX8b_jU7W3?Gq5yK@{bAe10{6tT_~_l1oj zRwR7jZosN!Hg(I|?8PYj6!oT(-iKWG#BCh#OFe~&a#Hd4(Vt6YO^c*RT#4Xcp2FH{ z4f;gk7>~|a^MobxXbeS*rUTIl6u!RXqSab+Dr@^d*6tv$dxX5FS(WufpjBNvnt>(# zx79N=ytEjk>r5HG1NEDnT}goc3jc)rL`*!_uv~awCt&FstucQHJ3}O=@Xl#G(BhE4 
z5&v09i#1oaRFRZP#&AE*ye81n3>#B}SOt2ood{pq`Ylri?pq6m30GvwX@IfD@3ijf z=VK7XMbs-{8XQW!D#st|zehW%0nP5CHf6fsmnE~KjfX0myuw$AKgtneCg0_g9IXQ; zpI%9tlx=f}p=1epw$Yn zupao;MD0O;(&4{;bIJILPpu7DE>COIV%~tQJvgTEh1AmuXu>ENAVRVDf4OQ&5_ZaLz6VlM{Va0PcVAgdm&$>guwG+Y?7D@ zo<>e#2IUsC2#M^U^y;^m1z=xwv(m8qVK8~0vsAu)F`vSWS7mPwy)UOP6`Hz6hr9)Y zI;3=Jl%vYLWR_^M8`UswQg#{S@bdTzBSBqzOci5%$twVT(h0TkRX1t`v3UWXdzTnCXw#6Hpy!;&OqS?3=Wkb@SV zIrlDxC#zd`;K`ZJiQOGTOjbWR`InWshO{Lk6Ml6PI-^2)+lNKvICfY2c)MC@yZze0 zc)B?R5Kx*K7?wUi2ye9hH4>(lbLQaz+gLrg@M#M1a2F*O(w8!u06$SI$5jb4b2p{9 za@e`6@844Tl*+Q|Zt*WX#q;Nxe&@lC==a{1m*i!>q9}|UY5a33P5OjaU6t=?D5*^wgyZC%- z+s%3%K~u$1g0KqvzR$U4CZ?pS6Hd<^Hw)6oMYRcf`wGxwCsWjm8;02u33NGUd|aV? zl1y^)+c~{f8Il-;s0|nnhoQR`X7e3ye=UA1i&#-7nZps&lZnmk*e%@qh_8(8+4RrS zPgAFt;H3LPqW~R~k@V~W^xedct{dDgP=tEu_x^YFuSl2S;U9z4>E%Na9tpR5-;D9F z=;69MmYFbIpTws4qT~=mf5cixwCb-%r=fjm88UeuQ?cIoZOOr9uDaIyno1+9A(r<*M$Ce#6tQ z#S>T{)={M%z<%QzIrD>hhl||&gyZ2^MP>|v{c*>B{T#|^?e%w{f`3BI{s5)%aB{Hx zg-YY({O?h(oc}4Y-j8qp1>k03{tpq{e+w4;=OM&?d2i*|G(m9xhqvblf{!KsJ*eTI zll8}}vaEbM>Bm@Mr7DHQX+Q()%!4+GQRw3cxE z6Da1PEVopXCe)$Nozd9)9NarDy?%&kI}kD4y$#mW9gRH4>MjB}#jGe{N{E)ZBUg-c5d!Q1A9NIb16A(&OWO>Z>0^KE=;{J$XDBah8|W`^@q* zCM5tZFYn`N`OL;!TKf6(r#HxKwgs+fSznKonM|`%yX?$vZ`=Ee+sYo^o@XmwZ1wH7 zy5~zVfu9W3?_VtL--?@lF;#opTU-oGcnP$3KCIJO^%Od}T>R2lc^5ohdnT%__Vvo1 zp;}q_?QHxu<7`{=`|#*l|8zs^_j+}7&-T&n=|!~f#_A(gb_d??_Ks^i-d~&o9WB>S zZ*siQKka<34Q^4gLo@XSHuzt|X5u9>C5RSQc^^#99uuq4opWrnz1m%V;N&jMG*9y% zJ3S1YZ$z$o@J=+sPc%l-d8Md-Nu~5+R4~&hS$D3fvDMe!?Cu^u9$tHxauI0ZY_{ug zGIKhv;rrF>dbyB(!Jl@)U)0!jYyS248Yz2|imyY#?TLSV)B7DW7hC^U^WD?!@Us`C zS9iwyO70sL{gGPeQ0I}Ki;_WgN?n>pP*aHkOD*qzdmAEUAJg#D7vQ9f0goQ)@-0Ka zP$xe9E~cRz4Aw0gus_mZ69wpx-G`?4@mzriI63jh(V3Lr{p249GS?a0dfVXiRE$c? 
zdInu;Dk*_OduXOS4&e==Iz{wYD^0ZYSlDz{pVY7n>RUS-fI0>Ay=V@Qz|0XxASq*j zi&m1Fe-rsQ+#V?qB^A_)EVOB891Y0f09%VKuTNP^TV-q=cpemtD6IL%D;O4^p=BTs zy+c1PJ!4gp0R;Zz zI+ofjF;Be}{84ZU@wM&yck7e6NmvbSc9r)&HVl>_I}{Td>nE zCCgH`7TjM@{e1Y$xH#bQlAwk0EFh5@`_mvqS#?-!dk1R=!7{EUMEg!`LEYqb?t1b@ z!g75-=vm^nrqIAobyES!Q9@tU+bj*OQi$?p|$q;9ZdJ z9_tjdt744CnrJba2wn1rEsNTver$9VxtSnH2Sy8Wa~kK3%oeNs`aX`k$=@sC5WH1> z6r|?RW|_K9rP{$PLOx*PzT^%W{?wi7abe%<*8pNwf7-={l45+e-yNm&L65fXc+Z!0 zWxssl_;YbIi1CYkVP#ewB_k$$@vNd+C;5k5FDhqQVrnO-OwgdR3C3qVRG{Tjg?!@>)w*z zl7~{j9~T;R!BU57Y@P2zbQn~>h=}Ml1#7jZT+h5A`CLTu1(0aS(uwi$yBv)Kv>`kS zx`gqQ&Krsu`lxCRY*d&D!a_FVBx9>-8#<`R4Ok-kH^#blqhWUkj>))CjS8kzu0@zi z67&!LC(~hbXyGLJKU{LaQqS7^|6%XF!{KhXNADmy5#8urf?zOeLZXGJkzll8^cqH& zDA7jmy@!YdgF*DsM(-jBk?1XYl<4wKa?bPH``Rbhd-k()o%eS==Y9Sg!}!j3-D|D; zUhBKoXH9Mbuc*e}+KBKEru68Jg2Lt-T5RD!2Tl~{a1d=FID-FBDG-;3Q=5DuN2F_# z`Q__(#wf}aAP(&XuwCA$cj(oOm2%YZz;&j_d&=@(ADGw|6ycO9_ppPkV{anrMew?G zPhIekalkGo!|jL`kybrnHh~xzZFr6l46O^-TM$lEYWAp~x5T7NRQ1_8t%}v&ymMlp z5mrLYdJ1OS>2>MwILs#VPY~Kq!Lw^jmCmT@9VdlzL!Zddi(Q2yAO^8 z7jP~1o)2E^sWLFasWk}sVoJ57rfTEX&H4?ZNqHXYovZp-GC~2v1o&A<9~B8YkrAsK zu<@r}X^z5ocxWcT$7|;hdo3$a$0`ap=2lYh z`CU(+QonR7hi(BYekG$$%8>(V8*?veKRWNJlr%NqP6SqZz7p~Q&!$mye?esVm5SmgRha4oPh|;*FJ!>~5%AN|2N9JRT$Y+R?xW8U31pvM|9D#3a1J988F! z-@F4tF_WhER;sGrS%Sag^K8Zfx{O<>iKl^ntt9(M=q}Njuo75dWrmTWoGLvZufEBr zhmY8tSYy)fc4Bntqw_gR^Ahqo+KpgK!dNnzNx}_`>LQax0|E5CLMU9OG1{Gem4eU? 
zgPtsLUOQ6oo7raq>|0l6FUvT+Rq9gHplW5i+Ir0Xyf90_c_@5dgz`Pf_!05)uHA0L z1$>eEQIl$TyxyMr18z~z*qaX(Op`I!RxU6=O5ymtrlgsMd7EoT0zBZMb#RZe{Lj(Z z@H?mk`UhxuZ@*#wS$@EQr7}OOy%tX6J#e&ja5|+FgHamg3NdZ7Kv8E@Mx9;72o+{M z*S~QxL4r;noRLN_rgtRq>Bc=IXtPu5A0g@A`O~Z6BCAR61IDMI_?oLtV8T< zN?36MnjaZ`)fQHvx7#9O!RIPR?9VT^Lyb+o)Eb8|&f$hj>Mc14-WRjcPG^KCXZezX zeLW3Nqo0)aA+wOx&dUofEQ&6fKslp{A?h6KW2PxJ`UH~2UG&S0CzwI;^R8@bB@z6& zmfRpHJ~+Cf+LQ-j?4)mkv|4bPog(Ca=0crnR6Me#LXXp45|k>5?qZFUi=4~SBtkfK z5xO5P$wox!K)s4GJEF)JD$RYA>EQsl1(>A#wu7j5Z=-!Damu99N)wPkU6M0^;d!+< zj}M0rUD-~I;A5ExwU1E-I$=k-4~+a)iE=B8R{^}XX;wFI%UK}vypkm40ub$Qybz=U zD~D{y{9L>RnMzc=0d~yf)|r`iyasqWW?k)eG-HBH!0s=?7MY=X=7}eK; zr}8dc?7GlG_F(o7`MjrnUUza2mHCH+BQxd}v!4o|X*@r4xk1>^%P}1zEdsqiK<6Mr z!!5uWZMH=r;=p+Vf%JYCVI5_l=nulLH?xZZfba#>V-;?piw`xJG$e-6)Mj`xX5#?A zYp`*1jI(}Ef*-2KBUl}{OtJsH^j=EXXFwU4{ry;=jxd6?EI_;VGMGCk#J^YK(MgKG z8oahsXVYkXEAeA|C&{42n^0OYjKoOx5@d)_V>Ec-ZPbq}A~Hq|q&^e;nEyy zgX)2p%NIarCQ_$UqkG6@4tc%rnUPvV{P^0K;mH~jNN28vd_2g5V+R-gRBjM#N}9Gq zC;aP$)s1$(3Hr|61oiqvy^k#U3tAY}oKPIRFu(G!{lr6S;sxcm?x>T_WLo<5%*Lmq zbVBkfd+XR-mzsK$Nz&dbEU&yQQ__?N6jF#v)zu3b6uObv<-SZB;7p_Isn3+#zq9Gf z+)N_L4dL71Uz;Yz|J-*qM@Q4*NNB|5`gR#>o~qNziPz5h*peW4Nq8wdDe z0Zm!mMk1?gR7yuCLgIf}L?jQC*V%m_Jn)34be=jh`Wj?~phN}ZOc80tBwvFb7X~Yh ziPl}`zJmxt(M$6%l8f{m<$gV#ytrrox^CClU(rZ^RA>e02 zoKH7Cq~pz-^ttCdhWY#w&@jL_r1lMwz5j>-rC#aC8u^DffCovrA4EUL8e)h0DCG?o zO~cdg<*22Idf?N%@uKAXGnQBF)KkC6-2DuDe+0-PVxkg%WbS~VUo@-(F@f&SEWp2+ zZ}>+)skr2yY$yHWO8-g6ItDOa3sEEw@kgfD>xLxExtXy)N5o#$7$#p8nuD8{lfb<0fVOWIJ1Z^|Rr# z)j7mYQ*HB+OjdLgnb@~^pSpT~ne5n_Z-NE;U$=ZEaxpWG)K*7_u(JB4@pHEq>aneN z=Bk>4hUYg~eBy^bwzYk1n;#Eq7bN>?6tq7+f9Uqa$K7jD*6T^3>&Aw>)P@NQb`jA= zL0|~V3jNgQI?djmsj;&udHFT|h4BFQ{8l*#-UKK(1a%uTYuc1e5cBE^%~x!22VXlc zO2E+?+z|<7E58`9cVsbTx`H5w+>Q3CMrs+U=r1CfZ}nAT3grqabGR)#k;+Cd5S#*! 
zQv%(Kcq2P-3)sP&0$Ma^19?jTn6r#z<5Ym}$Jf@3JLZ{kFDxUY2O1dMyinHY5v3RY2W=Tj7H&uD0Cr31EgXd9qFu1`il!z&a;N}_JT4;PY2Fn>^jX~ZSj zkl^hPz)iGwbNDRBuqmC#C*bL#A`*Pk+1|gqK+YYnj?S;ZE#QV0MZF8bRIQ*N02Zh6 z`NIHW<~HE#BA=q#kgQg9fOZKsD-veI57|+``QS`%hkK)Inq@+^3Mhr8xjFJS;eZje zRJ3iJ!w6Y0_Jjqi1vnBX&qYq~`n8x+oh&3r8KFoL>VeEaN?Ke+pD?26UDL*-9-cj>UE(Y=x?3$> zxh8tUfzCJmwjl3K1*J|Zf~_`j%)puJ&Dp9vLv-Hb3vg#xSK1~J-mUxAG5v~DR$MX9 zOw8FGsF-{??|>s4ExmfweG$fauG1R}T#w;nyok;K9Lsn8}H?FUUk8>l5Df&kC#~O28;xYQNVZR15Ad+oK%O;zGS69NwDuEvB%WQVW}#+4AhlVIirAJKDJtmA zlBfdV2hu^@kXfGb)J|9J@Fq}Ez$x>r zcEi${&WW{2gQZ~pvP!1R9A`xs)DT?kL3R^HU*5z>OO&Qgr-`WC}T+E@Xc;3!l^?p4lu$i{yDBY`x zD>Qo#Y7M9-dLgmopOcRdpPe(ye{y`qcR9yCDv{S0Jmlfi-SH|BF52$wm8?FsYsU|C zg0`x%Q4;3H0TI2-?JbiF<6d8CsBw>*DKoTdYn_>j>Iwh?^E%%wSGYI}IT0xZjpUnYzkXjEl8+YY1w>lH@htZb%__ppm!3MP|v61c{zn2Q>S zAd4wAVoJB4MZMt^rH#M85bQDk0BGLXYw2`;LNNtuuk=3j>P(?y4GML{f2>U|e4J`p zDq6GPXfM>t!1Gv;P?9nwt!*dUh=;LvH?Iyav5OzF@P!^F7Cf&Q%FO6ej1QqmKTvua zsXWK*#e{>`7V$$JNTuc}+s@K4H*=(Mkp>hfvsx{7vOTAp1!0mN`@0bbue?dwR6p=N zAJD=9cMT59@iI>AAtZ{0>PX(;WsN%8EB za_=+VemTA)v#Vtxn8@*H0*rf%tMP{A^KZ1CtRC&}~;*I4NB;Cync3l#FI#)OO~3N%nP8Kio?X{aV({ zj#1txzOVRrEpCd^I}=29>r&3`Q~P^s-8m_e8z#)$nly6q-CxKr0NTN4cJrmF+WR+6 z_<>J(V^I+uoWe0jya90&L=u6PUApe*C(lh*f>+YF*Ma6UOT5y16e3yuB>?ETyL^}$ zrxLRrLV~bgg3a%0M^mDgmSfgP2g&8NH#@;!(bOGLd*IsJLA0sdgG(;Nq^nh|_HZdc zL%aR+)bKEeaQ+H%i({RKy(emt3#taj3GLaTobblYxHC!!I;y>>_fHzJX=%aePQNl(U@IpZ2!^ z*?6cNyKckM?NU7iC{i0a%%eV7O*65~;io zloRh}@?yB$Cw);$h23V=~ z*Ppxl_tKS7o=(F~K1nnHVcE`wb==&k(=^&qdMYoS;P=}ibFeIAsd9NH|+Z#OP(I-SLK}UbTZgZ;@S?vL^KWgH<49eN@7+ z8C0LiG(IID<#-O5DZkMQ=o#H5f+qj)SjBY%P2uO{cnpg;A_FQizAT)fwC}ck`BXK^_{$2P8eA9o z5u-lW9?nG-yvxW#fRvBUlkH zB$7{!-t&2?bjzAm`eA1ZxOT;jSeHxJkBz7h9DP*uSX55LfWtX~32g=&)UUf?6{RNd zzP)6_kBRs*Z%zj&U9^YfCP1oXL?;0+f{(OZjIrT#ec#MN6rGmJ0gOgyHwNf9tKJHF zAAtkD!^01hR}(mIV%@(NmpR|vnd~p8uJ<+DLytT9iv>nR(--CP7T6ntDhbYx?l$*i%c3qAHk71|-Q<;`@Yu_Il z!R|D6EgaN*8*d`4j(Gy$S5aq|*XEcfCci($)ahjjKUWSixtkoQmzMWn&u^0Mz25S& zV#j&A)+35U-bX!{)~GpFrGT)M^C| 
zj6>k^yHc5wp%$y4e)(*+!f=XE((0|wJQ#k&ojOzRM%}_`>t&^EU&#mw{7{1+eSOGv zR9-1~A2l$+&n0FXrDT(0aC#zm#d^}Wy!EuJV?6w8qKbS5bhU;U3R01=%a>po-Ki_S zP`!L%AHf*U6>(EHsG9(D!Iu(do{pF>8v<&7_Mv+PR^7;I2!k1dCj`X1juPNP?GXIs zYvm`XkJb$9_+B8v8if=({7zIPmtIm&{#ERrwd+M|CWIRrVf`rIe6aFDh8n%c*21hj zGYf=N@YH)LhJ_x&JzB-(!6_Djc&ih~VCiPzd_(Ha+?A9iYMqdzd1F|81+pe2x=sq1 zlh73hh#|u`stlQ88N4(e;PL#%K@~&N?$_1wF9ow-a4{~y(cn>JD^}w(5WjeP<5nUA z@n^Z9j^*v1gBoXH+a8t?n?%%kFHF_}(aJ{;E$v{`8|#spQQj1;gEK8Q9)TXyL~MA4 zhb)hF7{|$DZuCvbu|`y_3J17xDTCiWTFgOVu*+-4Cb#0@`W2Capi|)@c~2&p%8cEO z)l~#YWDKzud~js~bFNjVKh7M#*+s@lY75`iak&8cWBhC3<_6#Qb1&nR&pu%9ULtai zCa3VS4$ew?S;WW#ENw=LH;Qv%tnC@M1O!BnyJV030y@T3=@*;?k}PiI1a)PzMX2ua z`uhinTQ>TD{A|4DRT&Y~43DCeq2qXJ9%?`kvzup)&EFeIyEHVktoxNyfO@g1 z0N775JwHy&@FXpFMdyB6RiJlD4+VqV#O6Bz5TE#!`_}5ESB`AJH_Bd&IJ3|sU{-a8 zJKA)s&)!7@^|+f4PJ3s8i73<<@};tJFG2}?-^_=J8$VsND`v!?G+t9_bK9FbxmwLg ztZP`AzK`SrG^M^5NE_FO>@yi8Xg|k>?0UW!jpbP-x8f`eGD`q6TTrLRumJSQGb-*U z7=|+4`*E%}$J;S7hPiuK@Ys@M|kzUn|FgJjtQGxV?_ii!3Ry86(vzJ}F*% zjKgMk9vA8_Fb5|txOiBsj?QY*Tfk$JWNolpN~|UX#sLjh60O8Ziuh$sYlHA!2DWng zFAS=i4;(1;@7vrR#QgyE^wie9ZBS$~$Cc}tL$%RFxhw}5~~U0}THgT_mH z_!oRTv_U%)WO?{a2oxe<;^Noq$8q=&o~qwgr%1xZ?P2X1$5u%pJ0|g74#!TQPNs1O zvtRXh$WiF6j<%$VW>~D1_m%g4n6sqC(0TT0H2vp^5~sT{+zq>RF&Sw+U8 zCK#PflY4IkFtNljp!YhKYZ|}R8;sbFd<5CS1o>%KST#L=YfOBwrw9%>I z?+6C9gJcN|PoZ+$-v>!$1603bBRI><;3!uR&`YyFihx#A7>FZQn{#A;Pbc1OM#=lfpT7%ntYJeVQfQ!^_<%aj@`{ zUvT~kZD{mCf8zGJ%bENV7d!jDnd+u;RMj2p&U+RdV^@qqnaIz@T6s}cJkebw;~Aec z_qKQU&~SrhtJA!NaRmitXjGJ+Yx|5bUKu!UN`Hdf;Fa!*L4kys9${hhU35mWsD*GA zB=Iw6i&S}n_@|2!mMPwUzrPfr@rO)*RyX~F&F?^MzpI=6u5S9fy6NxgroXG3{;qEN zySnM`>Zbp!y6H!&&A(PRfqoGn|Iw54Z^p-%f~NmDSO!x+^}j*o|4UmU=r7w#^;1Sa zvkm{~rTW#Boc6=!?T?v9z<LbZ4UfHPlu3 z7v4|FSjw(H#Gu zl)Y|^0I_~r;Gw1?DcRo^_FJQ zE7+A0az6PLm+EIo_A?9?mK6S@XA$%ZsLvmjK>z2ndSZXwtlp0Z{$H|sK(Sx-fBumm z`uC*yI~82wzm?Z}hlkd>@_hctKP_yHot?cQOgOm+Ljg{27fYz)AKD81Da@Z@2mRN`ZO|_!8~>3m{`Z9bJIPe>-^%~}caDkvDWiW; zVUb^?azy@2DhK#mvi!@Y#eT}@Uu5}X9`OG{p9B6m{M>K3gWo7A+#2~S6?FasSNBf^ 
zotQD%|Eu)Hf7ICie?nZubnpL0iuW(!jDHIAU!1YTFSfv+?8|>!=)dDYBz|iL{O@$e z|HTLW=e+u_9#R1R+*69*aUg%J+kci=|4TWLAA0jYG+F$Qvxev|jujZ){1fiupE$hY zfA+EB&w2G2oB7XV|CfBIL5qWvQt_+6<)_qtuFgN=7NE#4mIo;Er`G4UY~!~!*S~~& z|0&FWaqnWkI3NB|PVn!Ywg1T|{yd!ek0*csnN|Gs@Be>?Q~%-V{uxd!{EJN+DEz0m zMZaT9lE1ZU|E2bsA2R)yeFiABdl^nWXW{cCu#KSF_jLW2W;wZH#} zUjM`f|4%q;%vtz9fIBUt@h;^H?F*HO)^3LyGid5tZRODBwsTCQoD?YX&xXwz* z8E~C|jU#wzc@SrBaJ{T-y?DJ?#&@U*S_;uWxiozCq3pBo_Osp6X#Clk<1f<;veG$2 z^4;QomqU&cWG${|v^$0SJ;)y;K%M@;V%wDwXw38+y<3nur*9_1j^DQy-EvO6 z&URMcdw)CsCM%<|nw$GXHg#yIMPT!G?r@9j^a)v3r1bZ})sSIN$iV6=)keYgdem9p z@@Ktyqbc95zU>X?vh8M=`_W{z+YGAaXyve}wrr<~bTzVR(PXn}-%@JFul{Hz+D+rr zr1Z(SOx7du_uk)EF7azeC4KF`4=71li+__vk*?0Y9CRCUWcEQN51o_-Y1X_yp&A-$ z3&{=|_8^-#`gG^qM=)E}`5T$w4htl})!7S_Yabf=;aY4N3$rgQX(-jXS*9U=SkHvT z_ps$}@D2rnl7MeN z0jWWWsKUI!2CouKc$03RH3GQ2vG{v8K4P^>T?JQpxtrcm>yv|F6-gq+9_p(Y>0{Vk zIb4>OFfL1Vv>Ys;GNG=rk{7`*psP!vr%Mx25)!h24elXMvNX-BqrkkcrAw`+i)pwH z*W)O=Cdg4i#3zjbamiZ*;nGOvg}9Ga+;fvdwk-uu;qf6hgH40v^bLFQswe|VW#q-c zrqnzRroOt4*R!uHtD~jx#Jp}zy}*gg7UJt~)z(N$0<8q&EMUu8h@f%?M~>O**jv#- zWyh42_ZSVUOy?&9(?iUOm807qm~R*bhM4ogs|Wj@GCRg)?S@23U$wvA#tsW0&8W~Z zjjyALDs84L>;-WP+&3W?3~<0gbPJhkbrC?bHL!4!m#s-~_^e3?=-%-2m|sOgP>)R& z0p?#P>D}aHt-Kao5|j!A$t4py_F$$=g0|NkM+k%QCT^DSGY|~p1!%Qu2M7)G<4=T8 z38h#>^;RTjU&m&%iprtvO22Ln=PQe3BtFs}h*RN6&+Cr775|8xRRsC?=`fZ#;{>He z5DkqbiN=nZLv-QMxQLQy4IYk7&B{cu02_)34JkWf7{w$vSjAE$Th$Pt728g0F9>acs2HYhwOf=#b7^&;xsZAPP$d zkC(WZgd{i(nD1ymLOAGQ)R|nnIA$ffP)zLd$mRvW>~%o9>I6NaJcGFQ=c9SBCq6q+jO-*rgR0^WfA*Opyo#Fe&@udl6YMhqnq4&42LL@8gs3ly%42RP@z{Z zJ_9oFD-9lsQ&_TJ%l;OkfnJc=&|^BI4itlp@|G2eC+sD(?j#x5-CmM|71fHq<#<_# z0B9#@Lgjb0_apmmtYGk3?ERFaZ5L@t%h9E770D2(Tl9wSN?}xU_}R#2&(Y}$n4Gv( zRAfn_XaOK*k78NrX;nPSDJ9uZ>t(g)WC7hO}RI_F=CC?EYr5x zZRaUlIBZ(thj6h_2#TPI%25)k$(GO5%5IRM4Q|)uJMt0jzB@~|)UbJ?MibuetsPcz zy%VF&#)*x7;C=Y8s;XKsA`j1X>-tAUT3yi^Zl~z%gx#Qn(VIlO7uLCL&v@QZx*U$( z_RioD@H;LrM=iY)xX7HRK87w>ezSz6M|K!laEi*4nsPzB$M?E)%il#mkZbXzcagi8 z3D5l!5z^lg)wv)X0#~D#x2g-%PpH$u#!a=l- 
zd6yb2y?gwiS*nTGZTd62fTvgV*WwGMn6LrW2WsqzUvc8*Rh+77`S_)43Ll`~c>Zv4 zlzTYG6(PjtU11osQU7)ij#U_jPgH&rk{_#`A`!s<%v^Qr(tHZ9O1*Dp8(`0^Iv zD&Fx;o`c#&ePH`rfNSS+_winZ3P|24ak<0`ASQ)+EK}gQ7n16QYKs)v)$UP1-?vN1 z4z|pWd-{6q@azTUwI|&RU7TS6pLS}W>q#^IV&eid^w~`M8F>x&Qx+&PQ(0C*Sd>t2 zJ9?;XF({F!iRbjK4J>DQ?Z9Mg0xqjVMp z>mvZyNXnnnrZf%oKTuPvAkbG;4{Y3-yoImp+Aq5jJb#$tA_u8k>4<`1R%Le&I{y|h z1B8@yEb%XnGJDOdI3mSVf>!fdJt3B1tf8sh$(X+1kBO zugy^I(OL{MX+qoF2bV@VPtWo=9XS;fO_>qWPfL5}2g3`4<&!fPwtXye0H>YNFq?cc zsMOxy8=Aip%}?&7V&U_dgttVlzn=UJd0m3il@u7dP0hwk>A$tIDskaOjBP&>DS+rg_o@Aa2N$qECM)^*S>q|JZswe%DyTL@y3+!S@g59vi_`DqniF76%4iCsv*O6b>(rC0aUlG8jCf3TZwfvA`k zelV@F4C-`gQ%|^i$Z2)H9(I|85huc(;0xb601#KN`je4^(A#LRS=A57nDY?@0>kQC z?>&dag#$PMGt{YuHg!6;ySxt1RcPU(7&K-=ChZJ28D^q)kzdf+iCwso8M~K4mrXF< zn2~eqHK^nQJ-7~?))wo2vP!$A0=?jV6CGk?b1ZxC*!=tj-~q&Fmm6&7^$PiJF?m04 zCRIeR0pZ{+NkJT!4%2<}&`sforGN~L7Q#OgEodKcGr^Jz+Kv(e03SgZo?EhadQSzT z{>)25yDMduo;w@05tITEfZ4#hPuN7A7)FdaYE-l2U3tlr)y#X^ePKfpL$d_BdNU@A z{Hu88h#6%JmZvoaDyh*!Oxqpx1kOU9+UQ!Zb8az8FzD>#1v*{dMQ2)mlLw)LxlSou z<87UyTCt?Nj^r`Z6(fsz-di1roSVVw9rIj=6>+J!qb04t2`EMgbW(ZV+00KLVnv)O z$8DWXo-q)utbX&FLTYRXbO%)y_MMjFw| zCnt7~wAIo>+YOiIYq29-k2-@VWJSrPtZuy>&D}+o!B*D052rm3w7<;HTMV`Po{xb! 
zjcZg;+b;UeO^2@8X2czf6FZWhY@@K9oDT`4+~9z^V;q4ffNJNf60ar>&jk1_ukjtWX^{RU zuM4RC1eMhJRL&@!#_`P@taA}=^|`-YMyF9o~@jK8I)0{R=v@Mj;TY_Y3dnM zVjwf^9{UEAf;{ZP<+^6$)W#Ok;xx~w#@IWt8dCW&|_I}#5r-FJF0?5l&cobM4e`j3SNuoFJ1WT zLE9M;f^lJZ?Zo2cwvgQ7*$BaTp)`{eTAhPJMK`XlAhlvOj+Y~0+}=$-lmV!LmM?7L zOwX(FV9RE7OSTE&$rVUA#cUrM6z-A4fjuwf7J1AIfmuzXrr8kA}*?eLPuE zuD><&chS3-sc`y1kjO_)X}CL2$MRN>q%BD%w0>(nGw+gZLl|T13!n$tCLE|WQZc=Q zfyhT@@so@e4{VsLGnBMo*eutB_b>DkE0sE4a8(P#yPYcLF@SYW*-pNTtZE{Lcotnq zgRE%0JlKq{g3aROW(>xQSGmYL2a|C#Wi^5j)nS-}i}RMtz=&a@ed+vOpMN-ByON{W zr{$7<)#kSWC@4F=q1k=2tgU6%MbAesDq}{TXAlp;`b%&pj9}?{joGy!l|qc7^65)_ zl;QVi$+~!#*kxS%sBgoQDHE%oP-lWg}M&weid zqyT1u)gKu#!JwmB*>Ff>0id`gb0uS*uKeIAXbEIF7Yf&lK2YUa;h`<#-XI!^iE5} zK;GH1flazpx;535G_t=b^?@|Rsi2&#^ zgEdytIfE9zbGDlT`-~Q z7)41Wc$sTGCcn}lDr+~oFKWcdP8Y^1C6oHzgq0e@zOH}%;K*ojT!qK>>JJam5C5=^ z%JR#(^qF7BFe=rxBU|&fIolc`%{RSXiOF}N?cjrv^e0otm;lTA=d#p8gDxAkEoPh# zExi;uq8_oUA_l){TZ+w#^Ws?Cs@8pSMIhTPs4#&r@Js+RfZR2)>jgk6JI@|&zZ^20 zPa^1Q0?^ODxt@75#VEeYEEvJ@mcXdz8lhh^2Xuak$o%%5Q94k21NSqlwY>)|cVuY9 z@D8Z=_LR~Dfg+uccj*+Z-t^|d$Ba6$BxDqy0CfB9FRbV|PdvPqKRSaT^|Q{WkjB8N zlpt@|x@q`Ly-f_r(lOY8I(r8D`X^6FGa)87K~JnR>RjHi)Z`CpUvWgK!K)`;+|P4# zw8`&<2Oa-MA;q7}xc$Z>o8|1#TBgqU zuJ5bH&w@Os)~b5};WpO+@l^yAfcTdcz1%$HQNS>t8W%ksy`zC9&EUhF_p3SY-uoUk ziMN~-Hy3Yzk=!{nAAYu2H#;wJc`mLY@qLg|z-zzbj#JLnMRcu3^KR)Q0W@sA(<@~+^%Y$b11&;m>66m*MI^hgD>}WbRMvehh;ATC#;ugIG~RFS>|oMMyV1hK#EN6&zqml!DQLn6b9pP!*A^p zEi~*@`K~rS0q%Om1myDvz5)uhq3M+Sq!Q6{`p;(Ffg|l7x$zqTN|Y&Vt?B$ko9N zop%Ry(~w3aqWVC@!x~}j+HY9MSH(%MgC7(4dOS6(BX(o?u*NE=2bzq58Qqxv4u=+^Wj(xxCa-UWw9+D`{p z-~SwPa`$auGTB=?0&e768txK?e8Q~$V%)pjT;DXKO(Oi*--O-7IX#_Ays+84o~6x^ zTslV`#Fp89odhs(vNWyg34?jiCI-7}IMbF;=za@0>v^jObY7YaBDB!2C^s#iVKk+^ zGWC!@fnBYDHGhBX1DSfLoPsl)Uxj3geS>eQJOx#mXw5K-VATOL)DssWQH^2Ly3{z2 zLPC&uGI4z+0(cSTEMvNCR~|r^q16pIKr ziZfqL35g7tC{M%a#fyZ%s!$}SruV(@4D1XiX756JP5=Ec|4epDO6|R;R0~ky(9$Ys zn^s4%sXAir+xEisBY^>9aH{QMaP>Ws1MZQ2qILSH02N(66)vS%nf+*%Eq~JVy{BPZ z6JLu1q5VeqW@N=THv%j9l(z$?X(_>DmLErr`MD7G^(=G-YHZMSvbQNAy+qT`KjCmv 
z(pBo@7QY!0Wl{Tv&t znQ;osU0qtvW|NBM7KjY?6h?3?O^YD=1;US|^@xp-WN zrhjxaU?YIiuzH5DR)P zIy_jl*cc971T138llbgy$is`*A<2;l_h#RBM0=Y$g!WlTPCP07J93>#X%SdOufHEN zkzKX!SsQe?%}@7Wx%xWcYgM!wE2)|T{KPTpC!AZ z1D~Fr?Bw`eJ_Ep`yz5PEkatZ+qp%^7wj~F2vIL0nL=IUx#1N5P=4X%qqLVZGZYr{_ zh8+ynzIUaAt|uTSpC$3>%E`{>yWthhBkrwdlCGV~Vbu&5tZIg1zLjsOZ+FcyB1UZT zxvVE_J^~Oa$WyvEm2&EL_R&Yfqox)VWa&kUa>Yrv97{uRVJ~*f5sxQ4qT9Bfs*isZ zNoOq#v93`NYK@4*!9fH#u^8S91Sx|FBC%QHgu%~AydMfB4J5O(iND~DeaL6egA@R) zB2;LPoTzE;vgw1sjAZ_o$ZreiA&xPZ*&b3F;$gpYp&pbU1nQiRz@Au_+Q1i~6O+tu zDjX!~Y9F)7pd*P1ej~^<(;C!x++r@|r-ZCqjuFwn|Eb}T(80tV@ICh+1_RG*&`dl- z!5Mjs7rwc)C9Ki5@miPJLlV9lt;Vjzo^!y~^eP@I>z)`UNOjKwH|AFZ_Tw*t0@O*K z6yLQ3epxPa3-a0T@j&{$n3=muBPh!!P3O!z7hLC&xIy}4;t5fY(N3TWEMBLdC~{%) z$$BBHYh&*r-UqkOTN=5f4D~MiJ+V_g%ZAT-ngj=Kf7@oV-Y?L$ z!x(37QHnOaiFmfTRmZNLo%tKy#W}}Kds2Q#Oyv91#1q7K>qneIuecy+A;8V7kdC77 z&5SFM#ICzwYA4`zr)!nTblXotsvmK7=HJ6i>1|A3=#_fg>obupo#Eraj8iEV(gx!^ z=S3t!71#FYzF2XF>!hIy-*Bxp1oHFQ?5UO)+0@TIY9LgS4$zSK#baOCIY-4zejckA3;1&eN!GZ{}6Qu623JKEEIQJ!E` z^M=9DCfK;&2mKzPpd#KJNwn z8e62R2REJG_9QwTF%Q@=(Yd+pw1?kHpB-)+Ilzq+yA>j7!7n~ckVr+uyHu1h)@tC~ za);kB@a{dR^Oo;vV%M2jw{6kbLgv9dy!+jq*u-(0_P5I%^9glfsRC}V^)FLU^n!0v z8HBAqC=}da;LdIk*53Yh^^h7S=5KLJ2--%SHqoHx8ILfA-^=8k%t&u@jD0_D^I840 z#?86Tfjg;dtI%m<%^*?88|^^+KAI5S-zmTC;7kY?VTz|;SANy8gViDBbFhzv9M>JH zpslPti&O_(#(Uk5VD3+aMA8)nfhCq@Xqp;R ziMbHJm-5^pRreQT9)^#dim2UqBYCrCLxDH_Zl7-MELMRjua7fM-88YyXJc7n=5{B+ zDt5;cbs@2+V8?gTy4~&pB&kmtE_gm6c)woH&#z*{0@OeWiJU9iOp^2z)Luf_cl2zE zIXYvQo@GLGiR??+jIsikqBFUf*eVT$6Ln3>3 z8PZhxh57jPCyT&_t>^19c&|&rj-)$UH)<4Ks;gS4iq<;y8o=D!gi?&7Q%^`$q0Ub; z`;rWbayhCa^#XQYX(B-Zb$nFPMDHJ4^158Ps((-58WL~o(~$XqSUP}zuvGk` z0Xd=gM;pE=^f-fKy_YhIGosCW*KByK&s04Vf z*B}i)Iys}b)$c$Xab1iYsT-kn7kG}bxG3U;BZerkpd*GXYXJ}X0o5!eH~54IrwEI8 zc*l!+BA)us@0>R83O-J%*UNwJ){%5FU8FL7)iPt@N$Ju}&he);yCp6MI3@v?Vw>W0 zqZ3Ox7c;G~wE}s@n^vkt_v5un7I8E446mI|Z&onaCF=tTtQfUJ<&4iP^BC#JDS z%85941_)7r0Nm6`hK|zgxH7trsGY8b*De%SVk4rzrO&_ZJd>vx*ERei8WD^NPoy@Q 
z7*qE}BV{=?@9)JJ3b>AIa3uFK34RSlm#C=4>5M7BNQnrM7nG~g4j)tHS?Xob7KdYY z+I1qFQLa!P%^e`k=tS{W&G}G?F(;!p`t5$PBD!WeYT6q#CATf}b)}@YD^kR0xLS%~U8(Mimg$qj za>w?zQEUYjAwJVFsdcg4%^VoPt#mA_pRYp`F zWDbJOh*gXl^enEY&VEWuq?PubipqGMP~vq1mDRx?Ise76CoTJnrQ3_JJ1+~lS5FPr z&a$?yid~s{bGFu9VA^Q)e3=6}(`xaawA~kf#wn$-VleC(T(1$SI}MLb&>JM)q!A|@ zbRiH7j~JY$6fWorI@E6d`0T(iD}%jIL(5z{7s?K|DX?b zE9BH8OGR7$l$V~Kjqbse4qp3XZ*su_lP=ZQO%8?HT!kT3WKli|FoqjfI2B%aQ%yVV^<6E|C&7GLP)8cVWh|m*&9&VBhr9O-YC8Pd z21!UDB$Na~Z=n~FCMZQ9bP$j#O-cYo6qT-Yk`Q`VDI!W21eD$sA#@M~L<9tt-a%gko^qJ#_{oy2I`E3Eo`b4|@ zu6g%K8C%rTz|{uN?JSmo2ht*fhviqc7G2n$iN3+xuvDfIu5zk6|9J+LoxyrHw)~4V zSWw!;Nb+`k_~hMHslg!mTHcSJ;dz!vj`ff4;+5u*v4gZYYL<-Hk5;4i>$|%A;xb_u z%odh`!4ko?BdXPBwwam2ZdIwXxtoWC*P^l^YQ}#S?5>){_QqU(wbrUu*4ddpbc;$( zs>}{P`DdczE~TIr7A$`E8x!=R)zG%^Grn~$wy5~<0$Au_{n){nQ4Y%7c>U7(PT$9} zi!8j?bdzqm%#p2m($!>EDe$dCgNc*z%4%t3yOG;~$pRZJe;2Gj z_dYkdAMSL0DL<-{>DTln(+85lD971atsY&H$HKo7++Z%<7VLu+1Gi=O`ZC?!w}YPp zuV`S8@9_5)8186_eRFc1uo3C4dB%D85;E!4Cpw$_py!5G?Yo*iKE|&$oSK*m4a&BA zinTd-a)t zTjOR+9e2@=8qGVjT9*p+8+etM_d#PJNjkkuBfs2_ZTl~>4H$9sM(#c1~4r_^v0 z8d0_Jcsoo5D2N{hmDg5_JyPzxmLdN=xg~@2X}EFnZVzT!ta^GfKGVNYZtuWW?!`(x z>r6B82h=}59Zbjv6)Dc{}cJ1Mg>{Yur5XZMTlD!Db5gzU>} zZvM}yrsS}1-^|ln6B)l6mMN<)@W&fiQ-^*zT-ryZz2oi%(JzY094d%M?tgh+em)ZO za5wT_&L6m7eC2&Fr1E}48Lrb}X!+OustL}?q4B)OS;bGeJZqU9;Esl7#FRa-8u@bQ z?e(3^4+i4*>Q|PkKwGm`3eHS(_rqcKz{u_sg{4 z0x8p~tXEG8a*$+8pm&&GOc|lFCHA^*w43D9n>V{}xUg5t3S8D=xxw}Gg-n)*q=Fzj z{lLW9R}eRyWUEgv=7BGqmcP5sx1Zj9Ncm$?21dK{c3;kXrU?0=%_NUsOrug=yIS*N ztHCP$hq73=N3m~BA3aDpBpi$M42ez_73s#}EO!jqa__yRy#rzpuA5)zaN+A-yJhf8 zM_#I%;P;0uIV?5-<6L`cPW{;0;yd^*f31ioMB&^jo$l-v`$Ll%T~grN8)Nx!d2nM(Z^XrKd|ATb`_iCnSc zE6B+VcOP$&xaXg9BrGl(qaAr%N#h;U(!ehGAbdkfp6lMeG~O)_{T(1TmB#9#4;^AP z8OS@s_}WLl?=;RIPu%PRDQXd$qQ^2SCN(GH)Sp-lnF%~)I6qScv9(DUWxs^4q@9MF zIlLC{-0bOf?Zlgx*+0|COxJwa72lJe9O@c!(1Rwt?74P}t*~&?-K7$Or`cMVlj$ep zs!-o7O-w~YV4=RKbA!<@sY&~|^^MAtRN1)boZ_U$^efSAkx|>dDg5z3(nG%oo{?}C 
zUbzdM{zi-r!$s~m83>cf-J5nk2rJ~3-s}y@+^hsfw$Q7}EH7iY-0?#!`-X(~NtrrDDz*=%=Ok0A`YDoo$E<|xrXjq!* zHvQUO)|Y|UluohKA?h}ub97O#qFPkUA7N~Uk`x6;4aGsvQytgS*uGB~72orx#(cWa z%+$*_*{gU?#Be_71V1Q_wNttkklzJA@}Qk8@T%xvq-gY6hrmF z^K8d|vZ9^|-<@*IT-|re*nq4{LUzjeY9A+N9abV#>wYsXc4?pT9S?qzG+}T8QA0~! zr&*XKC)kE7$SkiYa!w7r`7m@@6V%rdI%JEUsmWe7=tUJ9n73Aa=rd7D`#iI zq|3pner^Q)%QtLYUhMmymG>lBamDKCzHWP8KAr9+`@&kpccL|MdzVl>-|G_)sHd$_ z;CM|#92L}Em^2!^q6Oig*R5(*G0&dCvetszsWNoq zlQQF41yf==yNdd(i^xjt$4h*0l>2g|(?2u4Ysf=;vt$MVLB?;NHmb?$U1~a=zJ_hN zT0OUUU(4+rH*xeUiB{~PZ;|N7t)FNp21_S8HIqqnQSW&aHS@#wF0@N^J~7E+iPC(A zqr{E-rCX&jJl&@cyRK{BQ~8L;WJF#i#OHF0)h|=mca&>wPOw^7!VM1VTeq&VD*7cS zCKIw6`y?*a2PCGWCcT>4tRw9V$U=el9`jG7Pcl)6>d85D#NRBJ5dDM%%R@`c?Fi;%;f7Z;pgi?iS^(`+tser5| zS=DA$nm0^Vix6|^PWMV-mOFR)#h)4^tqWD}nRx^F@}~ovuAG$>vjN_^_vFm82-zFH zG-%7hFANWseY&u>_+j7y&+Q(QQ_HIk#%-ZPh0b&HYBuf{GJ2`%}H{#E` z|F%_u>XEiFdI9{mkZ}W!7(&NPexiX{gDoCbv^5e=bUgFXU0R2u%#5uK@CngZJ9GCI zU%7oh4@m4%2?N#DDro%*B@e ze)B8rWre_LY7W(Pq~IM=iVew-O$$3 z1i`xO8EtLM23~x@oORAi0~CJn7P$nxlVmlH%N8W-m$i}Q=qgQJv;^ex%o)V&xmmAh zuhNQ)Xm+FW=Loh|O9AT`?nZuoC8gLm?#uFDijZABoC||;LNZIeoxTZM_Z{HEF%3F=&;>m6%uKIOVi((%|B7@XC=JELo7-~+$ZF< zN10^1p29iyik5?*SW`m!sZ1Ixy3fvgIC~xnDAxInn@~;L{N`#<=Bcq!BB=>;4v;75}Hd3-ci@&;iJ+ZUqcnn8{r^TjS zDgcO(*u1ti_w>&>d$8yQ{P*(2YO|2m6XX)RbNb3^^ znME1VBL~xawSYQ4t{YwnjwbiQi{y<~RR*^~=Wa_^>5~}U6&-7RRksphD}z76B0O-V zM~JG_>>Q>Yy$n@84%csQnU-1n9Bh7S4Y!%P`_6JQ2k_g*ha`k7A<_qn`LNlB;%Szfc&?r*^VXpo z660{$BT=QSRd2QO-ZMX&<`{6gpJ!?9>)_WnA{fu;ki7S;<3V^{kyEvZTS*RSU!4}C z@6sXtIW|{UwtaHX|4jF5_5(estfjw8GBKZ>42RseGEM?xbt>O1JmY53^jOGzb-58b zmjjb5*Ruv*OP+tsVLJc1;H0nSI&t_Le&HEs`l66`kpV^COco|LwP@k;U@v@$z0O{c z-Q?B{*|6DH(-)$G49^qJ-a(=E9Bbfrc4{eSY>7}(5Z8Km%OwL}n&kU@)Gn=OZ7o*U zS1tZFKC#;x{lH;ZcX&~BL~Qb2?G^J$y(X;8>r14kDcSs@t$W<{0rB*Qd{!ERMV$Qma? 
z^Pet%E#jl{@NJQ+KXtUVzY6^U>o{z?7N9a_H{$Ai^%bP9tfh~pDko6Kh&sJ;$(p85>o1y%PMixRP)J-8P6EBL#HUZ=3< zkdV+9CwB>jo3diReg%~W`q1wRygY-v_R~-vX1>EX-T1ZAol%MX8nubcP71H)f1Hey zWp*Pxu*%g@`yJy-wm(<+F%hzS7*Z1vI+|plkTp8K=QZnh>z3xrH)C`Y^JFJ{@ygzB zuIqn7<8|T+#;&AlsTR3e!J&#GsNUN{wUD8`P)l;(``}>YTh+iJ=9^KvKIM#J2k}#; z?lF4>KA=CQ4;dw52SNm7lv9wxVq!A$Urj`}53h+pzWIS1im75m#;edS=vv zzz&sPR~g(is7X7)jTD2?N@GT!(@m@Cq8>*ZKCwDW%PbEO%YcU+Wpwk3>YH8*CndE4 z+9%gf_%1U%!;W3F7x#xQ`et)G9PDrrZns5#-=BIPtTO-2k|uUNT{Ah^TZWl!xhn<8*yhfMLbVme>#PoF zTiC^1_}z^Uil5)Pynpp`Wk>F>B(5Z0nR>sXHtmL*wl=Yx7SwU+`Wug=(-bxeCy!@M z>;J$Q`!5>Uf4E_)%KsO}*#9(gK;eIF=D`0}XYBv~C0zb{IKhAGXdnVd z4VVdRaWX@)4tnDo&70%7(516xlhgT~E7oBWjV#t-TiuVXPs(>$XLO=_Ml42?#TDjEbk{%!YHYf|^N#v}jz zJIHURESC9LowhdmbdXPG<)ZHHe5j5r-8rqM{5zX^dvXec{9eF~@?+B+>Pee-YObJr{Zn02ZX40sXt@F>6 zAhozTpR!b~b++cCdwSAAx!gph%*s*E9#2x%{&bWarMcccjiQnoo1-ZIsFV}C8Om{f z)VJ9V%D;}Pv!f2G6>rq@y@!Wv0Udu>e}0?juO@cb{mYFy)fd1~$vbyK3n)v?f$RPk z`q`Le_p?U&asneVRTT_U6!@q5-OR8!>g3|C}OlRuHzDv=A9b2K=2L@ zcju+zVlVIrtNlVq61|XDbkrt<{202tM-^s3A>)l{tZ^5lXt0(sm`^~0H+*(v=yeUf zB6K-BgBrLyx?6n{#Y_ETL8Xmgq0Zxx3@vdSM%U2I==qp(Fpnrr@>nVe%b9C|O7r|6 zJtl?P9vc9fF-v0k^Oo}0lNvR7ciETiMvH|6?~nc-$bV_RpVdK0C#U(~WaDX?Kze&o zufgM1=ro!x;F+2|>N+kLSOP+Ga9Lq@@QUDjP^?M;J+1~})n;)A$LB+DNHC9)8{ZvX zjNZc|4%D_U>}$WZ*){2Ax%s+@{t0p>5~RhqaQvEdEJ6&E0VY9+Vg&{mUaAYoS7xkP z19%74<21I1Y?B0h&<_oNQ&{(hzcZNj=S$e$d`5 z7f*inDASGFVKs9bXBQgD5IN>RPrZr2IuwjPbi+ME4p*WT!HhRh8TgKc8wJlnjIu@< zSYLR14p6Hy{$5Zf74fBI;j1W6OuuUGm-U8edi6xe6Zke>3mc^YOqX7V+&k#-%Q^o64LQ1a`Pr;g6$8AWW`m-LbEOacq2U$#fnt65HS z*AkM+^Wjzl|4LJ-Yx*>9xI+gTUg`&q^)Wr-AnQE+!Z%cn8K4GKVNNGAxLMu>yoh>1 z^UBhIn#O+ge$?=wI`eoh%y_h(hJvR~@xyF9J3SGY4rpiPkl{E`j&|^GyggjyI*wL`E5TyyLdD?-X%z5>>F?Va+JmWkBcFNyy1APA z4-#+QHy1eZ>pJ_`Y(VZXx7Ai5Ng(DQXi_=?3c+}GM4<5YB`1ibaLoq};dFeXPPrT) z^5S6uEY})S1z~3&Tf|C$5El!gFMS2$9&A+~@()MiKO+3q_xsh%d5)TiEEmkxC}$A) zdUT-B*o@xpTbZ%r8V;{IJ}+nBcp~Jn+ORWHfeBbjFSnM3d&il!G%frK?&shA z+J0AG0NuV3G9b!1Qc^-RF^D`reZP{amcVtLmZckGZ>7iBrAUkn_EjL%seYF9U)Yb2H4f|awQ`>c1fK;Zxm 
zFJJBZmgP>!{yPL`PCY!)4O*Tsy{jX{VWY2-31iru9>%2RZ>62wnvVO1A|8(q96ogX zHJ?nTp&+O*j`f|iUn|~)aS8LKjlM59%HPGW4lT%<11uTjy|u%g;f3f-8V6?o8wGOd zaO5sqClChVl;(%`wI6JF(GSzp%(-}c;2+vKXt+sqy_!~|0g1Gaxp|g$2iH_l&@XY$ z1G@DZb6eKl0r5T$h-BcJ)MwES&OwOF8p*|dLe&@p#1MAS^>1I6^OHpCy!9f`4@qrR z(dti+p~op26{^jsjxYh{$nsZzZ8`EJ$ZIC@!X&LX(B*gFnP*sk{b3C5#|qjJYX4Ez z;2PFf`${oUMLkQZMi%tg6eM^QK71$EkpJlR%S+Z6P!tb!j*uaE(*+XRA(MU)kna#P z)(Rq65gRxvtkIMB_ptG3n!zr0Y5D6+HgY8}?Qg)_J4(EsJ|&ReWl58%b&`2sIMa#y z^c1lgB}imZrcb2SOW@NSP%X0lD@6kyteAI@&Ft7t<3riNu?j_;&)p8W7w zON6rUK3EOmdn~2Gsq=DcaWsoYm0w&G{aD-kse2t?3Aa!YFeqir2w(^qwis(9w$ud! zZSL8m9!E@%W{k#0@RKdywI$=!#IM^~9rN6W=7PhI4w$;-R(R(tZ_MY4 zpAAxJIPE(}hAZZeh?e~Jm2Ou$YUoAjWNISVu^ia>6W5ne z1aMTE;Q&;EE{B2M8q@qx)-OkAVe6&j4|MW?qDY&I@?I=^_#?mc zD{-N*WX<{P3Qa~*#PH*E^VkS>GDfkGo%|T4lu2{;S`s#-{a7}!0aoZPd*2j5et%9D zsd0pTG)Imtp-mwwFlj{Y*BbRa$jT$DO5P8pmRd%w%1E(hj_12TbU*Zod-+?xL50s^e=8-vGz3|`&*4Rxtgrvoc*Olh>5!gW>s z&0BMmVmQ|IeA(UbF6`qODRkRY8CtwY3-XtD?~pshC2Q0Wmz&Xxi8Pv=eA0w#4Ds*c zd98>jKzE=$L)TKWZw-ei9&3+sUW;L|!1e-MA%Pqa;RXu_b*{??9-xn5)f+i?js%W{ zp65+s;!J2|>kU4=;5(^U<0ksdN8Q6IG5Gp0sW~Cwt_c1P?e%mZ9jD%J#62VIM@g?* z4pB)Ip16U;6y}Hmdzldz;WBW4?5`KT07MvYhMoAKMld4MU%jenv~%u4UQ0=O*eicm z+nh&a1lgRMnu&D!lT_)7xcoubGYg+)Ox*mBVaPzLTML%7QAU79*?Yl4vLGm|g~K!+ zz4#jo<0&@$eh|5u)nXEFORhEt1+&WkKsieh ze!T^mI*mEP$3AImLqVSXSRfv}9&{>F&Fpuv{0llqSc#V@rR`Gh!66trzaU}|_q7S~ zv>N4l&l`fCg>KLojR5((O{%$nF7X&&o>;6?1Y13$FJ%c z%F9u++k3GeQ^}DZux=WVSgjF-UR+vW7b-ZggN?TV{x%NM#e zX|?!Vh+2ijEU=>t;gu*r6tX60GCrw)jJj${KkQ^75Amv8TE4A_jmiYhZ2Okdyx?Dw zk?eeHjeZuJ!*A7qzNEjN3J;#~_-n-9&~*2>;9=3n2(N>?CEXkso3XU67q8qtcLgj> zWt3Ylh*6nXduS?$PXA3^zUzRmCf&>*3ht^lc)Gpi3seQ`=)f_!97R;R_XcXZ5X3;q zrYQbvE&0>^O7zD@ju!KOc3*`u_n2Y2BIY_?K0y1LYTz~8KbX6r6kHDKCp7JjlcjGG zR#Ix(I|(lPSoY(KXxU1GkPvsy92etOR2V(KtWVL4d8VZcQ+EBuVKT zez`>6fzwSk{hc=U_5wS*g%o^OCmHw?(YS7>cDtAJea!x}Q_nQczeqCArK7j!YTQ}K zK-`|7sd#6H1LUtPl-Fu^*Q0YpqbLq3?}}&;0&K||J=5(+Vcw;FxCrJGviATuu4e!n 
z-utt2UL`J3Y{JqAV&blT>LWSkKO=dM04MTo=&-dEyBiEBPW{Q&nsz=(M+m^3?qVM}W-{kKn#FzFb+wk4BB)J)|CrF6|4$JBOj~6g{|JvB?edmJ z*(NUfJn-Jqmq4!DvGJx3cy@9iIrJ*1$0FRW z)AJ6#RIHyGRxro>VIqZw&?n_dtz9Ovyli&8LrUY+S$a%=es%G(V&0=3;ieZ*>Gd@? znV}WfYXD_td!#P`?a}F#$b~SKl?+tTfdc-r>i*LKA70+6&E;b{x<#DR*W|m&bxWG^ zfWbZowsm1~TPK}VN^=El!!WN+N{2Z@fWj~zE9{j1*J=hYr2HcGB1jXar)Fy}&)T^J z92&Wt!&~u`WsG%8c!Z5vyawq3N@?c0QmSn9sJC z{G35A5JnCpF7Uqzt}<+`sn=pm-|VMGw#3r=G;K5EW`@tD#uI8rz`T7A81o-N=H2qH z#D-$eg!dnGe@#!*atAI^Pgo1yLfmE7&mNQh#3ZmI)>;ePoYQyk_uz0Omw0bG)!!KW zk4GpIc!#0W2BxGcpHSdi2nd4S-2(6M2TOSHB4h>KSd54CY*k-f5zqf)A5nghWnOMc z%K}?Ae;|f=bT=mo7apX&;@PSXeB?aVh(ui|Jzm0!fZp*7%UFyn2pc}7?a3j`wSeQ+ zMbczvX8RGtcfx#5^U`N=-NG5>{Z#dI?8bX9J0SD>0#$H3&4KQ0i&B?0P7GRpi~y1) zpriz1gD!%DZ%v1BkZC+Jj#i+bJm|$Tkk3qdv+W@y2xT4{qn7kP6~&atL>|ii;Fp|3 zdC3C0z!8#v5DKuaMvgplyG!zmTLp5SO79t;M9sPc4Oqx(IpY0tukuX!+4DS1kC6 zY`ImzlIy#q>5;Cb608oO6^-W-Qb+t@=v0K6y6Qy)?HLL1t2}&Tl6HJ)X0Lv|Z1!vn zD~P7p8M_1&lRAumw)YBkfJL&B4YFQOVxm_HB*D`$-O0d7#f0~_Y#4gm`9I0f6!!a# zhTMN8^!P9|1KKeXPd9Thue#vljtlb0WWbQz@%P__XTUyx$Za*124OE<;Ij=Zn6t;u z@WazC2Vft{R{;!xwbQ^Rcs}6kptw$a$HG=c097>ESD0pZh8Q+Vqi|GCQw?1?mzNk0J6OUcK&yPjWrk{d*)Ger!;E2uNt-bEY4 zOmfI!|G>k&t9>D9R4_II6xBub7NLx0(-`9xJX@`R9L{j5xDh-yBgPOK$WXJ|TLBM0 zk($skwtmOCuCx1h4TE#6w6l05A}VR&nYb6uBvE_y^EXDdMX9r^e??Hp#Tbh?MDq(d zNZM|Avqa5#6Bj*Mb^l9%I2i^dSQD4WbKU{}w1YX;vt+$^K=vHA^+Zi@RguL6>_IKc zj00^=G{3?jtoIht|7wI&U$HJ#kK(hd{3iBKE+2W)Z8_>Ykh4>Zq3gF3)<0-h32ek* z|5&>ltoRN5{1T8CNqRjD8_3Uey@3NmYb;E!>2lT;coK{TB_Us6E?EkK*a*Rc0AuTl zTLJ1?(WQj@`i`AcCQ-B%rW?HH?^Q~?@2tlFyvC6w4KJk0=czQ1%!t60ITpx(j#AzHZOzSL>Bv|KNIOUCq7O)iX!hWT~ zRwJsK@187^+DoAO^Stc%lN+=zfo-ySgn1bbq;&%tgaPHIWJ1R|COXajj^<7`^?m)Z z7r~rL!++vFY`C08;$)QvEj<3rt2cr4O@TSSL|(4nML@j{faD5eLvWzngaev&&j>Kb z3~of-NNkEnTN>6nmLsHU7rh}PdlBpEai`uj=CtjatB=W1W_?urxz!vc#z*49OR;`w z@dAX$PGk1W&J4gdzU3QwgT3w%K_CZIk7NxW{lvIe8QLB)|AR^G)hqwPD`sD$^dQ|q zca54|oD_T}17@;s(`)mUriKJ+%GZQ537?+4pb48H1@58%2SHzkSm6QZz)uA|E=H!x z44QWhaDf>j7svw}2nS#b0o~hhOZ!VkoYq*kUq?rzk6ha^gFc!aOC7@Jk;H8S9D(}6 
z1X~3jRQ*tY~2**BUs>23<%ThA{|n@mn%I4Ax=4X#XVIfNdm(_bZMj zQgej2M)2(j^28TY`0<3NpDF`pWR-_?ZC`hwcTVR3>2Zi_l}gv|mLe^ZboTcLuUcoV z4cHQhcW@6AkK(b(MuF=Yg$rN(r8;+ZB7!AQH!{oI8|*NGdXb&fN^I11+!;GU8BA~< zn@OHN9n!|60qZ1rYycVYA1&^S0sEO#Wac-fZ{bin1N=7z6i#h|3oW8`)js#^#S$E` zYG*&prlW2Bo1=5e(tLoFOJF@{ODzmahYK^oIS5a-SrlTWK%c@DUVV7)D=gL$nAj?{ zA5XjlTuc<+d39(=^dNG+Y5Lx=2N9jlCM5MIy0;&8-i!Pj(ye6zc)Wra*2ZYS^4|gv ziqdy=075DwcxB<_8^v_3>V@dj!&;GhIZOwMqYR8B1~T`F*6Phx4zjBdU>y9dv`_;w zJO!4j)XhW{+*xp8eoqTcX1k>c9(Hotv9N#zs`|16U=`sN&p2%WPuHHfgHCq+j#R2W z+KG{u=L43zaWMva9Tx!mn>jq7uCM3l!2F10jiPcvFCZW6Df_C9Cbsbl@~%Ht6USWW z8AV(UGXIHa-}$xfJGRe|PfM>|tH!gBp!yV0tEI4(Xpmc!+I)%QSO$vtNjoTL?7MV< zEXU^{OaCix1?ZJHN@nkImkQZmi--rC=t0);c@JMm#%MN4q;ECm#`=E$qEMZ3bZI(z z&A9mEeF2IIRE2|(0OOei2XTr@#OP|HKX=`+Mt^j{TB6|eNV#QLCjoqexgt~fTo?=T zo#%T|4T|ZVR)&14j{=}WZRROpPKUx0k_RBuOQ!!u*yXk=%{*dKDBXJlx`|EF$jJtl*P+#DBDk%@-|@mKpAC z+uJmG0yJqH3%Jws1C?I4yDOr&-{wfaCh%b!~rVz6|U_-$oNGkq&pN2k3{L=%DS1?_TSX130`gt-A1Vgq~@^ zUhBA!I6O|py*MH@L~|yrrg|XmzWzHKGehO2sF%R9mVz&kkAD}qKmy^=X`O=42vc1^ zUR?$FGVGpOJS!m1T9=2>CGf{gg|=(=zeA18cEz?kdz-V-#TuV(Yr3^*KL=;{LXwC_&WtCLvFiMJ50{Nmz;Eh+9PZ=4cO1VZlE>ETlik>2w+g*kU$AnXy|o z@U{W*HTq-p``nnn#yGYc=+g5VfTvCkCQe5tX8<6fL*!S!JHMHpclr1iWMV0eG_sKq z8Cae#@}QS&Ki-L%73AK=0naOKLKGIuf(d+4Hp2_;c^6CU4_oJA>_&_XhrJGQ2Mz;U za}ZTt|BOcLz#~k#;bY^objzDjo!hN@T);(#Zh}Rf4bqu-f+62zvj=rxNXz z+z0iA?ZWxQNWNVJRlW%mcDBN42-R~=Z>RS?~#`wqc-0J56bu`fQHyJW48X;URY?z;(xre{`1kFvuzYYwTHUcmh>d zcHI}gD=F_H?%NI>o-|TH!d(U!dW+#z^=Q{?SL!hO2p%FUKN6?%5(!-M8s;z{8Q{Jw zuNP}z1r5>wC&5X7lqe?u0mfe-cpv7ucx!vY=V2vV%D^hVlIzgJ5I0+9tJPjcc6fI3 zo947#N#YXGQi#ie$jK~q+iC#pq2#-Qwl5B`e<;JH7K0R*W0XIx5^}NdWBzkB+?KSV zGV!9ue4yGkE^s&ab^jEc`UH6MJW`%l-qo7^6~RVkX`0f%XjDjS;0p|c5m~%p|50Bo&4<+QRo4FNDBz%o2a513%sHZ~*rerSs6AU&r;j*MipJ zuz9_&2QH_#itZRdW0*}vq&3sdY4cZBiPw*DR^PU_=5k-~u^&Mwmno0ggIDtW*Oegg zGP|)32#q-icanaunW#82CvH&>X9>+UaR*l^E1s`-}bpcqBoH71?(Ao3BEf= zX#V6td$9XgRvxu$Whfby`V@HmZ7oij^AO@ReFHi`y={l&Oc2m4{z z96xwsXblH65T2Agf6rv_1{8STQ8IoR?^sqQ_BJqWPk}h{c+O@qp9rs 
zgtLx-U3PDJ#4K4e1e`?zU?9Z&wjo-$6wql9FJGiUy-WM8JIZqI@&lVO0j}lQY-fA~ zT~viWErkuN!-~qH((&GEdFa=T!RK^#s^G3hn+mG57qZUf?WbrvhsT z@W!KHxVlFTkj@hsNq?F$R{kmqL>~UO)3bGnaU?9t5pYt|ydPG2@#xDy4t?cQgHyWG zK(^=~zT|9K2(ps}c$^5_&sOFF-?NMtT@;`vz0`++58tONSSOE<8BhwJ{wGC~SDlss z>Ua^qeT=A})g**4hY_Pp~=Vi`;J!9N~4beq>S=?QNxkabI+0022 zgEX3IERZXW4k*bHj)t&V%3rmjosK9#&vJkCNIQ5vh^V38U9#lgjZy?JXDXHohiZVc z5_M$3GD5rCV}2Z7)^rNOJ$i+OM!}-&L7clfwRbiR)%)nEHakp{0TtcamEDG6(9=Zf z@dhdXw2DL887{-t$Qci1hghM1XkV9wTh(LR?p0ON6w(~%{I!)G_3>D#swaf7AGwAz#vPu?X@*i)IIYg&nhN=( zr7ZS*vZBS=EpLFCtoI1`m*Wcp3TH(LlL9i;r>+4+IgATV10`s{@rbS2>&h57x+(p37G+`1roFGrG~6VXv`iB0xzv_FFL8CDYpGio?V=dz2p z-5#d|mSNj&QF5Rlw%WD?f(h;mETwQ7iSL%F@z&G%&%5>zVzXiet33ew?{{P;{Oo4! z;4jri0*2LVMoDqBr!GfYG?DkMz*$mgJXVtaCjoemZ=LPdheCT!;2g7ABWeJz7~#p< z$qr6~(TJvjDX!5@yK_C1N}AS%2cT7v8pCD=T{Hd2&JHD=XxNjLg2u#kXRCr=PIiT8 zoAY9uWXGqMOEj_PSa+M(*rhs+v>%{|z8D66c+hD{&E9s~8slbsxZ;4Ehd|Loo*mw8 z%UcV3rE9bEQG!w^x#yNCW1T8Qamb-wx{2=Tqk#_vG++G^=`N|l^>&c}Zb0cbqD?P49kiVG;qu z17(Gguid@fsrR2nTG!E?Fp8tQN}ne|c@x3+r{SjvKj4p5-?>`TYU7vo&h{V-+1$(m zg9Dn$tpx7=t=%$9fWm$IE#W&-94fW2G|XiEy>T zmyj(dv{#B}Lh;^s$NTA5X$Nk@-5I8~Z1m5Qsp@4||Aqg6H8S=uMCu7xP9t-7Z6flm zG>|crhjTT=4chepYC5NS1YCV!D?f@!>t_?VK!JgSRUxdMZNi~X5ct`O(Q1C%>f8ls zUm0eVQlTeFs<3~(e`Dc2d|&SWYk8@--t*SdIx3o3LS8ubW)MyOL%uJmb43wv?90-Q z4!PW^y~|OKCJtC@^kCP&UiZzmX-AG4N%?o#wjV9dGR24;x`4$PQmBx3ZWXlCPaWy0 zZO<%?Saow)Bz!4+QJUCnarbuOuTsv2wAD|UF{Zz%;U+;UtOtRx`FxA>ss@5x-p=rx zdQ@wmQ%b{$XQj_-OV2!K670+EJ=cNP>OKdDOb)lz6%6wbuDw3A@wA*?0J`73r59jf zuX;r=BAv}<=&_fewpsut?E56|=0pmXGk7Wmz)2CEauKsC0o#Q#Z3RGtHhZ zocG+o5#(L_-Kd+;!`uFMEar1K6UBh9k}CNt3g?j=G6S9B2sj-J55Ho7ho_6bVAJv! 
zfz+H|F5$nQhoNyXnf?x7orGyN;A>z4aMDzP4L{^i^Ne|uK?9pYmvgF}2n?Ej-sBO+ zzv(NpymXXIy$0X_4IAUKPyRlUw{E*bXo1=z;rfN&^s1cI3t|@KVFPIyICzy;Cs+Cd z>jr&7W|t-0E{5}`HvgU_DXN}($cO7k-2Bkuc?cCf+jFGDL#@q3q3g~Qu=P#IEATQ; zp$O7q2~C9US!{~}Em1)gy6Q4HhP0T@d+73_^ySF*NlD1tu+uuRVe9F#McsQov=q75 zG^c*0{LpFr!X>1ND5#71L}tF;^9Bq8b5g}5!eK)qoHQ4YIR-oVZ-TGp=3PGbo@+QU zCONsQFLtB&!OPpe>A=GGxk$x#~!~xu;6VwUk&Ct zS^mEGQtOz}7d4d!&jb;*d;Lmt6OLisY-*J`^t;`i_unpQnF89;M$RJim8*TQI1OmJ zGV6if{a(NP^@r_l=!+yJLA-Lg%o%(3TyGB1Ov8%iN zY3Jll{%_=fc7Z3r94rdHWqIfX?O7sb>JS}vk9g(PQ@!i}h%Vh46vCR3&INMGCB|`FSaiQ3& zzxqd;*~UbnCJPCCL`yL*#)f{x?&|p0u9ryO>IF=}J`;#?OhM|)1mK-Xl`Y^GUV!oy=~WXJZNrgzQvVn+cVC%-O4ZJ2Y9OpOT;uiC#v90l7f?48^hCq=!6N+Ka< zaJKW3eian(q29faxS|7h$7R9#*pWO4&2V}G%}rLr#qcACUuxVx#~UY_4xkks3Ny?Frx_5?ekD=tPzt9)?;pw8*f4N zH@q*6`#=D>J!Sst#$DOOmeij4@8Y<@>Vs!~F1_JEExz#|P|BZppt^oxgj_=oBze;U z0gM4(<>ZoyALwL7%-^?NuH06GXE4Y|Je3pI|1 zt&d}XZIXJD@(Hve;MGEVHau9pfrx)21^ongRh5fa{at=?GaFXU>4c)2V9L3Awz2E? ze^B?gS!WJhvF0~?(Wh;ad(#@!L>k(ltO87X`w|5r4-re z`z<-wUhlQfTIb#0zV`VC9ul79xo74c`OVxj$2FV|&uo6Bs8p@J5Sxo!=fDJ|+6K*D zvk_MtT~8;ZJr=juh9{svF-l8T)ls1s%!Ey=Dui{zB9JGg$v-Ji(?mV>JI1T`5Jt6Ykc>zekHw`XM zvwj_O9?5Fr(;kbA=Tn_mRfvH9wdsY0wDjPm2(4u#%lFY5q*uINV$xbXImE!&T^hzj zTNSyVMOlq)mS=fh`mlVZ@{R8~m(4G}bI|IDubN07e5Wn$LA)ZYUyV^v9Nwoz2slFt zPKzsa;vbJN+#>Z>SqypE$`P@EsPY}>N_mBNq|tbIqfD8DpUBagt_*rE0m{`}^v&u1 z$3^+}9kMFL$}6gUC-l?MB*52>k+s5#y=Qvo%UQgWiE+y!3twb^$rh`VYDS0cMZCeS z@R67!a|ADZsd!G^ja>3l#%%FGj?pSsA%Vo)e_?qz?miIhyi*lqQ65ncNOkxQMTke& zb)DdW7N|tcXOLikRsBfYs2Cndy-AEOyXi8-CGO`q9s!N=K~Y${DLka7>^>?c3g;Lm z;6dWUSj)Y^iOu!P@v9kJMG8i}z`Xse z*368q7-xy;PxD0z9|W@#u!auFBYUP$q9piA#T6t9NNO;9C5pOeHB zy)Dnccf7y>7U4t(v$kv3`HU{3Pb>2_K?>f6RZx`~-*sE%wpU%fBbHPgkwCzesMp83 z2#=Km15CwW0!-1i90JjhL^uixV?k#Lj*RisBlCH!q2wC`1$#iA%|x4|h0U%Sz-JSn zFi&U)#s1M_z9Qt?PZ@yN9k>p=Iex@?vvhjo8!}(UJgb25Xc{p=G@4C$q**yhH_bBI z@zS2y0QROW0$p0<2Xyvk@00Q?Z<_;u22~7F*eRudiEZq6M`~}V?Y65jE-|AI&#FVS zI**~zx2?usEFfVv*I3HLWy-3#uO98XQWkg-0N-%{xrrAy;4Bmw*h$zzi|)hglY>v5 
zX*U7G7m$zP^@`^sh}Nbgu3Cb71Fq-MI(t>!VwJ15CDaR-NXI0SI2(j+E!4v!J*5ko z+GNjS@Lk*Bz1@{+rI+yn3jkf*oVW#fhEbrjyqwaJj~RX3XDWUCkCtV<0s&>WSOGKG zr}WPQ3MNVhtX}$<2kqAkk`uGbGkAxi=95{@MCA~+5TaA_wsWJpE-ysTFqXidLi2%s z(zJV)kr=(~_1-cBaWRb=$W#f2gOw*qho}oFJf0B&|~9e>E?8wrm(*zhxxg^DlC$Wg72C zbI9|2pAdB+RYj<$HmWk@R5cj+oP0gbm9=pHMXqJh%oS$W>-!wN2tlx|$_^mIu50#`|5MMOw0C72l;SW5(Nfwk*(4`O| z6A;E|unmtlKV*5@x@sXrU>E+_#h}CTNrG_PW;~$atKa8KjVhE(DOsGpJB zUqnho3|hVP!+&rg)*?&f$AbC3$>i`*!}y@ZXNL@tcOvLmHV*~DJb(nJ%&k@pfyS-c z4&miflOUvz!j6UyOz5It<3GsRBtnMY$;p&7UQykvW4XRgabwxeC*vm!*QZ=-ma`Ew zw+g37OQ)0}Jxtgfi}Ew>09F&yYAay->td8{37@xa6&k*rZer@xvsuJVd=oDBoN_F* zR5~6-0#u+FPKo{mjG92>RoXHSctroI(s7CmnSh_{R4{$wt68#ui8+3m+{ipDuff8r z?*wiW_J!=5Tg}9?Ht1Lc3ok>?<1aW6+U))r@+1J{xfzD<2s)r<5XlFQ63g$>?Q90V8BOdta*e#dkUuSzMXgf0D z>S2Lntl0=ez=yf3iH?0`z(Sya=&Ba#B0cvaa!m_65lJkT%-sDV|I!P74L+&O&_sZN znI11zjVJwPG$+VB%(&Who+>wE=uZDbK7TdtPXVAqOEJ-qxzq>Q*ZTDrdtRSVvmQj9eOCPvg0H}Nl) z$vA|+syq}SkZ_VtYbQ7l|4I#zV34hy5xTA8v?Pt5W3+Od9N{h8Cl}^sA+Q!M#96_J6YB$ipacTs z8H8|6nNV|^;hGZ)f*6xSXapKtfC%PykJ|;^w%o9mPRL+IlbtpttdA9BB>TKZB?{|H zc0!L5MO6h&Q7gMc9?>CsS(pNuG5gc=95$l{A6uf=rix7Qn zUR>&P*#!Khw+nX;GU<_g1XmxlZK5<{XUHMpEHs&#$DsC|t|#bLE8$x}li|=xcxDGa zbkDMc-6w}k%mkx=(n$%_uEOCBRR$pt&fk`I)LHR!z(Eo{<(YKaaF|#y$K}i#71jO) zOQVOxq8T#i9t2Y1UaEq7Q8%lUje_Lm&KmJWN;WYN=;2M1FVcNWKXP-f+Z1IuBcC?A zDeP1Ug%g!2LvF62GTuaoiGkW=J-J2TIP|(EAt10WIOc&F^%o*?2{Wd|2`CX_5OIv{ zn~Hb|!saG7!dF9AO2A<(n2APK&BkB?oSfkFw-Y^lJI)@NtC!M8Dw+sL2v~anrQCjp zXoRnJ2v7q3&EhbIK;M6K(_kHA43#T<*33r+i4mBG5b!k#4`eYOCt|zo?%5FoO(?IH zt!R9)N8nk>Pgn1F#Wu}m5w+{9vM7k-Q7~?-ctk`Hj>GHYZsehc`#sDuQ8o)yCg#)T zqpNVd{eBBKT&(<^>*f&?4gRs@@WA8yy|W2Y_G0cF-5u#wkSBxX-~iMo#+D4o6h8HGy2;KZy#Gdm za6u{MT81MWEKr%E5TV9Tp=^g1_zNMM%0nGSpU83#t9)Em4q|65LE2rwGp;ve816Q& z9_Q(@Y5t=AoD2Y7v{P%Jx@%i<5SVxt2QfIL%o0bW1Lq&3fD~K$uc@n%2mn&Vc>+KJ z?ny8o4^Ld1r80p;&Hapv~T9-mKRYT>zA6-(0L_@G@UV`0{_HZEi3V$;k zbdn}}fcahOL{5Pgv;}O z_=6I%D#lW_tN|l#P)WsS$1B-he0IadL%%qA8w=2fo*kcT zbtvvObae+%@LH6n*Wz4IHu{4+sy#c1zTQ2yh9rb+K>3 
zilKWRETR$>#tbz{-lss%0a45#R-Jt?6bb6aEhVqpbz)c&N+o{Wt0Ypa zxX+H7i0qE000X%<#6gBTjZoYthZNjgZ2Bw!hk(^b2>?-gSX-%7es8VAFc=iB`HjnhV4Z+5(CBo{1(Tn48sv(Mx^=tFNjr}7y+J&H_td@1eA(8 zb(@~92yOF;yi@l6$PtPjt(C)XT$OIHSuAUkKpD3#qLQR*_H=d56+>^m(m zV>PxJ%3wUEDBtS)0wve-`Q|^wSbIql<4zg)9Abgv&{~Z1kOb&p7sLwBkMqUKGgktQ z889CD9a2amd_gHZp6U)eZY}Ec2>>rW3IZ>}uPCA)g2p83oG3PRF4? zCkA|8+}e#My8racSA0b=vEv!POgrrmW@Wa-<(6m6u)@I|!}DvE(bqIrhuMlf?S z;BfTim~#Am_BWDjgT*KVc;_hlM~f4*%J;FI z%G9f8vR`C3iIs#H4iT^&;A>n;n{;O6V@7IaETt&fSRJF+N+J|yT@!5&@z*q&MVKXp zjC8+JADAq=(WX&b+TBL@z^;y=eA62tKCccw|?(3 zMY^8??NDSMQ~{F1iZ>N+@%!6IPhM0o$YB(8$W63|H8BAci3R1&TMjV@^4QCDbFk;* zxrgGlQC{B2ef1693-ZF6J8E&xYbqoj#ed)g!ilFu;3{QQ=8l>e!KB~mm=1p%z{<+W z%t(Dks12ZUzwU@We|g+Qh6wjvI`qndtNE;OkL_h=#daU|9DUZ#wkz>D-z` zRJl5=Sl8aD({^KAe(b{lZ6?AVX@_tQ^kK5sh_ptT5rMBrJnGbZ&lTar5)3j-XqT!rrtXMwcImr<@?DZNM{^WDanZNo_1B znG1YQ?P1XlMKvKtq&8JY!bmg8o;miLiM|qGA=R^g0T7H*h!x(C$5t|?Y8WmSWiXbo=B@YIAoER}dJ&DP3OOYSNDd1Bi zyzAJ=_r^321P|GO9I|b1+i)!jP1Ig|kww(g3sgULj225adMYmueuDP3^zmqN8v7)C z>o=BuzpMtI34Reg4=lu7Lx;2N?l8Pg4ZYWu3;9y889he`xbri)2GP6%!!RZjs0*N| zTPacOtEOQ@iVA$kc&%S0ed{Cc*cj=sOM})QOPKuPn+W)|a(=BfyC}J7Q6^0ikw=Dw z87^RSylefmbvAW;fK<&0Zl6ZNSe{frYO70-)Lc(sl?nbrwsUC_*>$~<$Jk<6Cy6Igw9M}q|%(m1(RDf-~H6YJ3!QJ5=9^?*^J;k?EB`E z{cj9NBE)UNX>jbln$=Lf?I~bk4KK_5TALTgdEQ#R>`d{pwvkJG(}UZ4_Vh{`_mvxC z@O38U^_Hap;($3AOT=8s2&~Pq2?~Fa$ORhzREf$f*d~YGVQGXrr-e#CTmXs@guiD7 z7vCY{wN;@Z3h??4L*->Y4iMjMS4eFZJa%K6ZJWoxCBkwvcXWy1pez#xG03d!zJBVLzsS(n7~KYL+l7LvqP-ph)K&C z#^cb+k%ScRDIb8MguyL8T07k@G%!As*AQ;T8k`mx6$h4#X$7a?v-?%YM3}T#PLY{O zqw*;=DjHYHjbOfu7KM*~!8r}`k9lZ4li3M%vS3)Fcq17`_wxQ|OG*b(PPAYrD_x1n zV>JY(gsP>};9te@5k-@eT~Wwv`Cw^veTw=z700|gua6B2W6jX z4%WP3b41_m(ys6!yr;pE_>fS#%HeHm%zhOuJwHa}cZdOEwJXh|Hibs{n!yED727b# zZ(NvjzFv@`tlxb}BfAcBAodqaeNYOY1uTO$G%$k2p|V3AV2pB=FdCtTe@m>jk%4=@Xk#L>l)Y70LW3SEl2o33hRC@6kWG}eR40w~I!lIaa#wTN z*MM7Ac>4u|3Gf-lfnzirH5A<*Nw{#(I7V=7O2H4mvMUnGPaBAEB>FL}zW?nG-NUJe z^Lt>&#LKuYRk_%c1soVYOg#!3R~}i4ai=niNsz}=?=e&^r$>({VQ~JCM^D+gZvMs$ 
z=Q~b2$~o0$Ebiu~QpAzk9qV{yCr2LCQn69lp=Z{}9t=uREKiA*UKfKZ1^s=bCE^yU zx+8wNP_k+sp~59LDty}=PIrdDsiC)W!-I1G1F~Ky2DSJoO!~=VbfLE`YP}htS&qon z?YArzQE2g>8XS4B)X1`&oyDK+=f%iTX9u-Ojja|A!qEbL>5QcDyzFFR71Mm2f12v1 zk1-C#Cjz)_rHw=cQkmQROq-9m3!@g6j|a7cHRlp5nDOm+i#HG{fKRQZH%P$bl8mQ% zNG5j!CytIZ5-~ps@Z2Y)h$0}k_`Jk~fI8{VYb8#}-#fwG&oOLtPIUxc)@xHr_&PdR z0ONaFaB82vjb|wM8vN_IacnV-z3m#LyiltBtf(4p%!oJwmCvEtvuYmw6>oCea_mCV9KySky50K01bo7P zYVKRTUa|v@gGL^SVI`qO1FvvY_lPe_#W`z?wq%tk*Yas7TE;qc;y*p(#wszprYJ8oFV6?$!6Jq!2C%bHb8 zrPVxSyR}gzOrumC%?8vV>WG)@Tx7Dj3@v)_C7}nel0;O~^wNR56>!A}NH+*{%Fqv< z@w5xjGL3~bu4h`WBJViq_g|gh`n+|${}%d?CWqW5SYA(wz*pzT7Pa^|@PDSnIfjR1 zzh)5KM7B~U-Ul@wy!8MuvD2W>`mmw?&bT0!2&L z;BLr~)=v&e5_CdKw3&*AdI^tg#jg3j*fOx2tsM?N)P!<7(jk3HOD+5)^)X$Uya&?1+UQtMa6WZwqJ<@7(M{a`dWd?1g6Fkzl2`!?PBLzTz#3aZ^~u(c`Uj<_mESYXq|m&wfku9Gink+WDu}Rr1-4mV z<|j8)@jKb(1l|1Vs7~A~Sg@5v3A)9VP021Hw#~q9y?6&WcG-MvNvm#8qko>9brJ4% zJ&`gUQH}5VdsCdMX`5+PHI!+=Jk${G*r5nnuh@q)0!fRnpDZXgJ+a@=(%o!)qYcC` zeyGyfq-Hpk8_EXfJ!g!@5Tfi#a%J9gO@_j731mF16e&;JNeA`FJq5KsNlD=Ch(4Px z#+{HE>FI!MS!f|mdW0Gkzl&4iY3`sOj%|C-qIt*U|6u9o*XXe_`D!UJPX6; zBR?t%Ir zr;A0}&mgarzoDeiFLN`4H7kF=#jScN^(+j2d|VsvecSro5%p&9S6Aq-jc>)t4|)tD zL%3$c07*}Kg~)1-X)g~OMq9Im1SM%ySDz*0FW#iEYft*8Gg3yc9zDUcTo+UMjr4pv zADjim0tWEw*Gs+_@d?Qjh*}Of$`I6wuMtNp+d5Xwiw(N0+Z>Z`J$>=ui?n+}-#$`- zK=n`kF-d2Hvb|)oIYX(_=s0etow5qTUX(Ckhtdey6_bx zPqe(dZxxKeA24DeLK3y9!nZaPL=i-ln434#fv>pNJR3UGwX|8Pt!I)HEGTDxh$fhqm3@2|BHAMv8>N@wTP4qx;@o|d- z+VP-iMMh(}ju@DbBe{&Bxv%J7BjD2At`9b0~Za!V{g(BVJd$B9qFV~q3L_RN{ z6bASDex@i|obQ~IIPkt2JN}UPK0s`y4K&l1$Q7JpV4uqx%%kaGSh?=g*!aZ4WMg1p z{9t_T>b0+AmvE>1D{lwygGTWmoqk^z^G_u5P9)0P`Y#-V4$g23H`v5qX+HfavA*%_ zo}`G(;=<@Y@adjZFl+F@^ZR#2cckEb{n)X-{V-o`%Z8lhJj1AtN=rzS*nho@nzKV< z_1GRZt7<7~POx&p9WmBN4c;QLQpX^@AcPGkT5jOO!pVjR%~2BrL}9ZtzpuV1hh5J8 zzEC}TE@J)(1j+|?BQx~RXv26EZ z!(>b^M6HMoonnBzEl_wy3rTkNNJa~bfa*AD`pvl2v%rZ|?hKMF7MsK@-_vk96J{=U z)lvCr&IwYkndF8xriqmfzD~4)W*(jqY$EgK(P(%^5(9E%wr>gwTGnL>YEnY0Wbd|s zf^($S3frULalULIEW}lm3T%~4F%qS3kDbI_2)P+wIoXVQYUdg~6`g0rnk>?n(Z|pT 
z8xB0l!;7jxtPEX4UDXbG<;4LzK54TleK0R()NXk55I;U)4Pnkad6zXkGvX7YTlN4} zS_YVjC{~i48kct1k2}Q*_!24b4f6#F1yFhCs4)3seDli0UGbZP>3sz1qV`ZqUKp`} zi#~45W_*7+EKARfp;>8YYxkwA!bEcZB*96i$v$EPTxHt`tVYRML1;J0$y(%IG#H*}8KWSKFXPSg{| z%2`kJh8fV;e;spPcCLEbMfn45Se%-Vk^)6T3ArpCf|O{uGlxW1&`c_@y}Py>t?Fk> zx8p4kHNfodZ?0i2t3LD@3`zU=nh@p4ESEexNhZjk$Jxp?y`J)@pW1|&FOXkBv~$|6 zZUpnvuOPk-mS*#|8k<*;%GkPbZ_+aPQzBW$M{yJ@!|}K}PjKOM3y@7MC<%iL?{qS% zn%@~x#?$C#<``u=Il}cOw-u_SX3T+*b!JO;n2iCqEp{#X4_b_L$Rt`2<*=8~eP2K2 zs|*hjxQPbrdz9HW|7$)!trWvGXa%7hS2lu{<-N+EKlGPBwZ&G6OqLcAwke)isL=hf zI!(DD0ax^l{#dggrSI9}oV(7Z_li%BdBoO#*&i7Hb09b1!~-7IisaIA)GvUaWAkNa zAW1u=!FAp1QoP{QW9886+tOqdkG)4(T|qM|4+*GZK})}nS&nm;K)5SA+`B?pgxdpT@5Ks)01LX%D)&6GB-B_VWsQ7ub2IKi(mUAA>v_&zAzlrz|qXIeq} zh!OX_G|V6zqhjI8m_F<~XPE~lH5if+V9d~#3D{)WxXih6pY2B}bgiU7xKULy%NcxxXZYMiKcC{}sM|-8D`B{7# zy>lhGGZhP$@*0#zrAbC1hg6RG9s#s*gBvfj<}}E-yUEvtK>@fSKLbySCt3VUKF?u; z7{hJ;Ooaizo-Mm`tP52mMrQk3wdyzSRHXb0eV?B47CIY_>$ACj$_B4sx~zK+c#P$} zA9ifKj~+rqA(ie$$OQYuLRG+Jvq|dp%?7k?zy#m1Tf<-j zTaUv$DeMnPZAlH!t~j3EXGt1fZ1J6{{NUXN+Mb8KzB^Z?qK+Zaz~oA4&=Py!mG;?s z#yA;=P0#RFHQ0{YieerEy`VW*S%h3*!Jgf7InbXV*C{5@pTsJ1#A-V>m}^7(Ozovw`b4$Y%S1um9QJ*TrIGYtKUGZ^d}6?_uJau*ufsq*^W~w#?1yv zKl*?mBjVB8paXRC`sy_wsFrFZNx^H9-Q0~sn zvr=n3)iI@lcrG>KJpx@5A*Zk4bm^w__1|W{Lq_(=#1@WlHB0EIM?SruvJm zy%8%ji=!^HIOE3cxkqfS0`Om@LD9N4p`CO64X8m5~Inmau9w!P@&ylz}`6>E_?pO+%3g{=m5`)wKrl)h1Yp zGKM0?7KYs<;VKv~^~z-ol(VX^-w(l17iA8o3hQ{p7R(e(-uNwrS5GoQZ7s=IKki47 zifPCJR#7XW8ylxno;^FTiOzhBQxv<2$6V)%)4Ul<&m=XvwV7^u*fD1~nbuRp1*{Zj$z26VS$UX~txpIAT$z8Gn@>Fm(7&Z3 zoQXv5v35?P5JKlsPla+1*IsJSXo$=YlUSokS}#+C;Ce3GAe8kYHhQcf8*fL%9^u79 zgG^4`21O%`ad9a^pTC~IF(d50U8FCOm&@EVhARKx|k=WPp$)ZqsG+r%I*}(tYtA5^vATP;w zyO~UsM|>%#9(#Q{O|UA9%%u=jiFdM9wxR2Xs0qF`=6D47k-_f5!$#fsG9j--;8w}= zEamtr)@ItIv6c162YUGny^2=&h#(Vftv_G@!D+X;|?PuR&@LS(#0WqRcfsy;Qele?N!d}DdW5%ucaHb1Pd z-VWLEHp&H=bPor~K6@uM*qtCdqTW~^Ci@lxPbL{}{6eS)g82G!tB)(W>$%C31d^AW z0W#J&wh6xKBpK3 ze(VSR)zv-`1aq-|A-rDWnkJsjwgT}dS8Pj 
z2a(f1RpFm~!q%`w@*)}6d>;eR!M`!CKCc%0EK;vBN?}Q`8=v)YePkb{I2$#A!1Rxz zvzygB8JDW=Q?pQcQ=_+hubZFxznGDwdet**}B4EttM3Kp@7 zqgXcYs>X5&T`(pxY*;uXc$Y@uhusunZuHW@+!?5wQ_Vu!8~D)Fx0}!d2`pKz$u9jx zS205J{$Jx`$RAQ{>9_d{bz+CW|# z<%XY>N+!f7ep!lE8Ed`-XVxr58~?@ZY}u|}iWXx;D8#pV z3&pubj+RDD2tgoge*>xhhOzw@G5;X@KW7#|Ab+w4AdvrL6Z|!@|Aj{TCku7B!+c0F zZ>lCNwel#i(IWHnhb;Nnv13qs-?XYuh)oXzqE5*Ne`Z{uvfamEEW;l`WH@zMDSc^rBuxfiIl;Zi7|5a^#!{|}n}7xf?g ze*5?3qd$@PNB^0~BOpp8J0}N6A9@Jn(SN9c{?6wAZ@A|_M@DP`qP6}zanHY~hyLB$ z@8L*IVSlQxezW8^zxet2{?0m)KLJ-o{xfX#FAV!Yu$qrvho1MZa&Gs(i*Z#bC>oi6 zM$12F^v5>%=dcfm$Zvh~`$Y)}dR{fVCk{T2^pHO&j9BCU0p0(XEb;#@bpJoZ74lEr zFZd@SEcky+g!%uH!TqQ5`9Ckhf6(ZE7GZ?@_pc(1a7g~xrvIJ@Ba&c$*X8{GX|e@z z!(UJc|BfA{9K8OU9ep3}9;Zd)A51xk``sP=eK;TD!l#MI7X9l8K`YA11=4$jAzh$3 zd`er3qP{BYSNv5tji!1HyA`EElR@00?p+#<$1+yyZ%g_V&JET>Zxjwcme!tS5KlW0 zn8YO)h(3nzNrWH6=|TjtW&XsJ{z2A17$E#lef%dG#J@Y-{~q~;h5kW)#Pp&6%$Mi? z-;+W7zt@-lcWvmO1|p~7$o(OIY7BlW-rtPm7ZCcpVTJq&BO!lN(ElDIAN_Ok69lnW z|3gmxCCmDcsLTxy{r}Ra>@RWhzagppA3V>gC?BuXpUKf5^6^`a1cV`fmm@*Gzsc7B z!>17Zo6Y(6bJf6)97Wi9#-Weq{-+JCi{!oN-Ezw7Xc{!?2+`2JtH^FNJu6Zi|h$5_Epmz%!l z7^2@wHH4MfT2D_&FM>IuRGT<{)v~FWJi9nN?;7$QLx~sTZU6qj{o~$=hcqLhI%oZ5 z^Pr-OO}TNHP3qm;<8K9RAy4OwI#xa&ybSujctG2K`=RW?=PM+MYfkb$zb(}w;OZ-j(Ceg_GhYI z_FOFqScqQE8fxBQ?)2)0-mKL)4EmkF|Dq}R%bow_uig|Y4ateK#S9?UF~n{JOU{h0#90~VX6(bG~rxg|4#`GO^am3~%Rz$aGlQaOS(Dw#5SKOYkn zb>v%5^PkNlMHD0950!c-?L*jXJR-2S>eG=)S!`fYN!Hr@Ku|tvntS1OTk64}uk3i2 z?_t2LlE(3%o=P}|yHEcG1LJv*Uv}z{axiMjc>wqtX>M<6R+M4K11c)t zCTCJx35R`KJw7J=Vg5O&6Lsj8_=Lsv*L{yCD0&#m_%(0XHB{J5z}Mgl#j!SQI->b( zBzCjBSz`{5rcgWde6OcYnPy1CkbQZ4On zu#qdYeIM_2G~T>NPVh%39%g|UMVxM@wWVt1m|i4)y%55rIA$M0!GX}*F>)!9d+Q{B ze72ii8RZVjFQeR+{^Rb9NR2O zp{A;Rr1#$Y(0m zdLJuowd171D@h9uSH0>?Myf`Tj$13wBf&*fxo`NO@DD!6doNOGw!43>mXP0IgfK_W zMDcsL)YLC=V=J?26C5Gu%J+dUS?5p%JT_%ZqY9hU^B z`8MKtw1PM5^wVDN$XbmaWi3_GIptT{lk-aHql=PSoqtnwyWKuaCqJhTVP3#O?8f(z z#@9)~Pqt=H)6I&p55&IyJdD8;F01vYq^{E)czf)ta@CsQ+Q=neyWc#6#+aL8Xp*(X zx|jsRw1u)oPrb3AbsLdR*P%B)RJ>q2leuVPcxm*$wV@f7FwTniW2pPB?ItuxK=_7Q 
zBXv7Nc;y?!t+s?Q1f8OGj|_9NDFt!2Cnk%2x-$DkIBQ)G>vleiW{TYgC-tu=&X)w@ zn%%Ca{vW&EpA!eY*s6(lm2f6~j!e;!suYbVwGV72S9HHDy3d#=%N3Gb0Sb9@kB+pm ztk-I{F$Y|Eub>rdb8L`aVQFY8zl#NW@#q){AKNlI zvdM*Xd;p@E3L0a>CL|!;YKto|^TP`LK~QJr_3~KcXlAQ^&~QbY@#}5eTS33zrf#@XvL~pZZ`>Bx)!xEh zSyldvru7Xft*st|B_S-tdl!Zsfdy@(+t=jJO_<1F8{@xTf{{9OJkQP{?3{D<=uWAA zb$HpDSn0{N&Fzu%vXxQau1woTRPHr1c;ewgi3*@XK=Xn11~W9jhjcr=b?-5QjZwka zlhYTSdf!@fJ9FokPGs!QDSD>)RI-J}!{QSqTZ49f9egOB?59JC^>set;L!F4x>=;awA3U#7$3u_0cTPQ%*3s$Ik4CGQ z=Vy(${R{*s$h5xq?mDx&!_ROPU4@(;Y{?#NmVeaTToPmPa9)eE)f;u*OKg?-ZS6I=fUq>z8RV7BdERpTy1MOHg(yRM$VEHca zlg@y6T9vw`dB(w)r(`RrH)|#JwO&DB;Fmbxu+v;zu5HXY#|$DraoFcjfG_KerLq;FL)G&8nLR+WIu|Ui}#ZPuahY=UUTt}Pk)|f zzuyq|`;y{*L@W|`*LC94(LvI9+ZoOFdEalScdYZ?x~=qbs#T{rq1$;A>-)P8L*z6m zuBPC1oBmD~vaGU{x&~WbmZpZaVIt5)Z)dGxEJ@0TQM+fGE49?YYdK@I;%2@9(fhXF zr+PDYzQn=8y&!Wo<{7E_Jx@it4s(8aSyeoxnKa{a)Bos~v7Q6Gp|*e_%l4RgU>zjB zrM$%1)oyN%WPV3FgT**1P!Nslc5IxrD>HGZx-nQV zlrbVKjtO~-$@6IFLpyyL)npB%(v@7qjg3us^&LR*DPU$fHn616_MF*o*2pVVp6jsX zDZ!L3&*uQ6RF}d;Rl`1J?Udq)wOvtxbun7`+N$K?sa)C{`>{s&*)L6DVIGMbwz)NK zS}4np0a+;iX)#jZw`LF#&*0;t0+`Ic$D6mr#WdaVqcQlUZA=!U#-7mdwMiULuTLD z5;>G8bO?V|vls01uD>JtXP`x<7CjaNFhN8+#yG<@)E{H~2{5am#`YAVvjlaIWUP2c zxf=vaoWZ&HwVL~b;g^^T-5Dxfg5s?H3UOeh%;{O20j87IqGR#s(A!xXi8vymbSumu zrM4I3&MI;_YN2(v$l53man`p>te82DrKk zMRSuJmXIAhVJ}-LuP&f>tewRuWQzj6Z^(ZxQO58=&A!G|SF(*)X0_u@R5b<%eLE39 zq=OlJyMj%M5rCZC`&`C;QOmsCV+4L~!f@69?i?qs{cdk+F;jHbF1ygpd+mh6f-oMj z#Hp_ZQ1;525@d#&oKf)1P|^0}l3gEuV)n6*``e794JuK;+k}#Fej-&irpVgSE7oTe zI5+vvS{!?xiT#pXD@a{=3cg`1ZHUH&Don}P8OV-=DY0O&QmA5#?~uutd=H|s4PJC7 zpWcG)D9V{>4wxgJ8ue>v zZz?M5C}D)MK9vRPh~o6#lwjt~7N6dv!emlY$q`xgy8*ovN9U;m&fNBBUv*BzrcTm? 
zB@qr2mo9KIP9>-zx`k<6tBGz)S?hs$@;n8~^-fsw1BLnwtFuP->h^Ym_HM@7GiF!A zn@lLygZl!;lfP>ZtI49IQ*uXTN#kJXDXqlw=i6uxDc$bNKkcnh7i^QUcVd6|>=JzR zanMY1y7z_Z&&`1*&oA#V4jPMms&s$F*eEi~0xE#)&P*v+1-tY@={54r%tJ|T7ti7O zYPefBH0VNEx$gI(5i|3T&uhGoQVGEsqK1teMd?yIF8d?aEz?@L%9dW-XbjOgd=|Jj zEU#EYMh^tMF6CKaA5=VrY7f*H1ZuSG+3$PMewyGkVHAv`&rR?5;!Bo|E?TQGFm4;W zs!Sd5jM_))?Xl~;F==yT)KBEb8CD6nB*yH+%eSxA>_4Am%f@^7pm@KCQ*CNf#}@5< z$ab#y z@syd#!#{WV3~xoV#=2tOT_=snC(wWL^Mj5#GBQ&rY0sSGg-C6AAOFm-q1HxJOQo=& z?l2I6qReb{magff0-{=fDw1gm=ic|^-TfS$i@Dn%uR8}p8Oj%QW(_?Fp403o;zluM zeT}xkQSEqpZ@bGo5@(+#;E`$hb404|TE6oYs_WDVudacvuy12Y*Nvbbz~r! zR(ANcF!)=QQ2Ylgnu`NyGgs>$_PDD||kFI6x#6slZo~bZ$uTLiC4J z=Y0moa|YwGESYHX6%8n}bDAhkDJ{-Q-$=ARZPO_T)*bS%%AHz$+nP4?UC1^}-tK~L6O~S{k$O(M zFXx`I2xh@v#90Dl48)Pe;zri9mPff_DjF%fw;(sjNM?Ew2Wz#g&F-9ZTq37{;^6qc zufwyb^~g%icfpe^3pLM7>j5()i0G&otWDN#sjx>eJ9-eaa+px7YUUZV+G%r?xm0Gf zR1>`JyE$bO{uwn`9+Ewldp1S5O{(Zefv?VCrv=$NAtF$Iy>1!N)ePe5`&RA4&q|&l z^WvcJW3_sb-=iUsN(vV5n&F?LT$OZD-o#A{^OST7KBgB~m@H6hL3`~IRMDS%`;fi7 zU+1HPVet=RLxvz$H8V)MwQ@uY7nqaWh~`)h4W9=EpW$L*;>X_3-Tz?kt)t@Hwsy}D z+=9EiOBD_Y4#C~M@F2kxGz6Ez-3e|56cC)??yd>$Zb`5}*WTUxoYSZG7=6y!_ulV* zJ^C-GVo);Qx#nDJz0dDid5{iQbS;K$31b`&g*4q!_ld-33F!`I8#$}93R4s8e2If7 zS+R~+7kwdz14L(Y=B#-!;rbNe=l0sIBt`5cFOQ<#2xuN#G1^IEwb0#1;4*dDrMT78 z77t{^@ZQ>4idH%6+$22?+}mvRpyUTPDP+?bps?)ry(umro(z+a6XOs+ryY}^=IPR? 
zU63sP(TNKjO=n40)VY!N`}sJ@FJO&J%4r|YUZrVDpE!+^^y6j3Q@C7&mJ6dX2fMsd z8Ygk|+VX@P&z4Y6DUOui@f@<)-Jw*rYMH5&7+-!xum+9AUDvVj4SKQ#b1swa!2;7j z62G`U}8-4rM%5WbVb~#ADxsPct9*) z=8L>O(Ux#q10{x8?5=Jgb)K5Kr8)|WW=8GsSN?H$zK^46>k1N@V0sKv;u8==&*#UB zE^+ut42ZOV1b1=z0#(xu--eX*U6Zu?d0t-wLvZ7GcmvCYZdUP|>E-WodFme1>Fe;@ zv)S?TONN6egbBCU(6+};WZfHVb!qHdh+-+3qq3rwsLlJl*YBf?6lPG@T!>OQVV5p_ zv=6Q1;tftOYc2z?%=XTx7QP9O0^zr5I;*w%+qX}eJ!?^T@<&RM2A{XS{ZOx&er?2^ zCc;->dHU4408kPmYVAl;Ei4n9BQwo!)#N&^~*y3ne zIym+Fq~{zfnNwfRr4+F0Scnt4uElKa;8N~66KH(H<^0Q^X%Z9!YjAPwuvf_85YV79 z%xECbNzXMoKFsV$WUpI&=eUkOe@AYKtI>sQXy%1&r;mp*#2kIk>Alt7(N*e8EMbqE zL95{67*6{+dit5wg2NqA$*hK7D$ExjgUL54ceKl_FrRbCoB zR~`*BRlZqOA3xkk&RFqbCoN?3T#jrhcikJ!|Y&thmAg66>%*Ca=2B_*RDFjN5mj9B}vwMP*;)^ zcU)v#fWxyeU(ru@`tg&$+|!9M7wvVpLM-1I4o0{Q)wNE#m8f4zLtY)v6a~gI`59{~ zvdSxSOCsqdjlMebhqs^xg2xz=u#4Cui;j{s;EVEW6;iDg;2g+h$TYx-_CV}5bo}ms z*seT#S$$fspR|nSdi)4(6ZaEqSEtoK(fC>Kmh06I71O5!V3R^~c!*v@D1~oSwz>J>? zFd7lNPF0Qpx{oU^dR6*5EeY|A9Lz1`|I=q;4O2AGmX?Ogl9i@G%07IanNKY_nIuoy zYbzIDmV8doUP+cA!yGOP#bm++4`Z-R75ep8&Zv#4bV1pVNb3roUH(_6QygeN_04Dr zQ2<@&!WRo0_QO-Fia{Y~lx4U|zBFcbOZ~W(t3HpMoiw>LdCJln~ z*j~|u2FR_Stgvv##!n6i^NVI=#ETa=ULz5WZ> zUbbUoXdIBim+~nNdV8}CM6^#IK7P>US~|73L-67j?2YJ1tqeUsB{w3}7ttlRwDOKX z)X@M2B5q|nJ#wwb#VXh`(K)1!@u<)s6YfCk*iDk>8E@ms`IeH$x#AoFx%<6TX_wrZk<9c)ZlgE;lYw$u2kixnosn zaEAUcziN|f?6EfTTIYkS`%cMYo2$dloJED0$mzKKfUf^zrAUWh&7GpZiCDAu`SxM8 zO68Q-*6eb%YU@=dkG)fl{{2aoh|^f*uT%Ey?~N0cPpi4_$E!PhKK#NW`s#JoeMhzT z&Fg8iKlkBweY!s^mH*LqDLab%rx`g-YjoZEquG(?%&RR^ljn3 zbpTyfXJ)#Kb~N9f8J_yO_*{uS`g+-1RfuFheNnVIn*3H-eb(%D^EfZ?voU7U3+i#* zHD43v-ViJH_3=l%2ot4Ku5X)Poj^O6MquA4Qo0}kvTwFrQxv;=qIFQ#k z^Z77L7A0lKo*fdDQp*deORrsZWId8n)OXa&<=7r>NoE-VE3%4K zt@}6xU8Q3-fc!@J(_k+sYC{I;6z=8soLDjo+=0=n9$AjqD*MTODuS<72BGXsZTMNv zKNZ}&pP|E^<=YR2nJ7qwSFHD?>?aXq$Jr3Za%8|hq*Hi0q7@wML5}{*hH~h`Vt)Gf z_%wYrLbHp-&2;VV&6++Q^x@|a?gmPFCSH)0ie4DO!2DZM!)z5jqEt|5AQ8I@vCb$l zIrobk3^;fg3ukNX(Y+bf1sB948@L1Pr|-HX*-A_Uh<*WFv;|_i_TSK2V+0cL^rwsv 
ztHV&!RM;(jE9hmEx-cjOzzTgU%hWbz!KT95i^21zaD(9zsWm?^L_95NE|lSUUeM0- zjk^^9c0w&SN)e1S4VVlLi2`-A!f&KDy6IJl3%+bvCZUU}%)pCOvQ@20Qu4Pzk5+?*U)%SUOz~vK_Gxei=}r2(=(vDJQPZ zEJl*hg^!fHN9fL};p;|agF#<$7Hu%3j$9~Q7wKppEon!G;{~y#{zur)9DPD|h{2g2 z6l&+^3<#Susb?jjgBb7(6Q|?d)r2@!xJ!!Q833dr6iG{TDD73~=U22dauf}mM z+ZoE_03PA(nUz3=5mbdQFo2zC79;PQy}1ZUf?ABe*pZM(6~1S)C=?e2M+Hz8*gJer z4z5?yizdM)dRH}#5s%S#uF*D0o-wGPM{Fps|D3R2N94uA*=l;DBOsHynhNMD<5 zup?NGLGL>y3`}-6#`P}o)D2_}f<4p3d7+I|r8wPPr0{JKZ@#x^nW_ZgLQ<{w z_G4;Nn$)fhNzb9uS#|)nh2pARGA78TUpyA@vLUpA36|go9q^+s|Bi26oaDF@G z|K8`-oW!|i4}%$#lY_m^3VY|mN$&a!X{+>XgURGoNLfe- zd%);kH#B>5`%R$5;Qgk}1abB&S=a6+(y$r^(r@Z2wy7<*eE<;< zXBBU=7JF|rJs3Qu#?ZstR@m;l`V2@3;lYxX-Y_7 zgPzy|>05G=Dv&s(X@8*_Xs{V+EmufP2xfzs0n&5zAa%@q@-3w`MG~gwZWeZgM!>IUglI7sagRIF8cLnJ2GbcfshONgmbx|*1U(vE;%tZ=pSg
@X^LEY9nAbMz@ldOTB#8#%sHN*P}mT8tpx#H-wB*EoAbvYi+mBpWD^ zPGQU}hcwz<#D`*#p^3K%(Y(2eu*5<=gEMsFmqo(}J|nwWx2V}2pg z7sq>9u$O%q94_x<)wPJKENiTkJ)7MLhGAWeBvP9SBD#--aeY=9dH&{(Sk>4sIQ+_R zkRs9t11Ayq%ztizoPvt*B4yuuI%vO*R>u`3V#43dM?N1yWpi4;4Yf~dQsUta)urTD zGAUadWazKNzepG?oJ^2XIfUL--PGgd$mg>5roOR18BnPjwzZbL4rb|DnbRCT=X04e z&%@PC;WaOrlDmBC$>(*^e(cqZqoXh?zB`bf-#@&mwAjAcNVQ)skULfbqu~#ODHaS% za|{3lrbc}>jEA^O5o~)hKNEcUhOE=NWkXVr1;TZ;%^x%IQ}h^l$7ymXx~45E{Q8`X zrTL#fB&q0uNiL3Xe?eLX&$V54LXg@~S@I=nSCSKEkXxol4i0&5(+XU@O%{DxUsGMFP_DkF29=SF}t?)`F#6OHS+H zyiP>~nY~EHT1k6kTF5l2=>a)xj0i$0j$)6!6+GYaTq-6WZYu~3n!upA(li<@E}o&Q z-MRI?d)H_pdPd~*{FFCiLH3>94Rd(wuLVIis{7f{wiqQqkIhX+Q*E{)KB>CVQj9|e z4N+SVsArqu9`jsMp%lWxhd}pu>c&*0-WcZ9QUMJ%0NwNt%ZLmHG-Xg4eX-!P>A*yLM z(b(c7(pr$a?b5l_GaUy91}QRz5V~1}-zlARE}-fNHOPv9_TG_aV>3LMjh04K;G%~# zL5L@;OXoRlLgR@KI1W)avVpZ=)Phs<> zsBCD1vp%3;?&t>?_ASdfv;a9{&Yqrqd!Mq~0(&3e8#*hX(&q`digly;{Wrv4xgjP@ zQBN)F@NMxMTYLRos0B^o_&uZ?Rm0;l{@v>!M3=I{Dej*6bF^5DD{iib$*AifTsli_ zJ3>KZw#bTaoj|x#`vmPrhJDSuE1NCRe?wRW0U^y4<@GxFG3BtsYZ<(Jhd^<>rH$sR) zT|9@Zd#bXKS`ccup6c~HM4q^iMM&`sb+LD`KDx*$yzouQ-b z&QrD))*3_ArciX`1v}>q#y2rh8IzlS(-&MX$bh4nADHi%_tD#^Yxsc-ym;hWJ{yy+ z*r|&#Hnq-kj0>Nd+}R=FiYJ4=tOw;q@54CR92VQ8b7K9p2yqlGAGk~+bl*k+wp*V* zvy?I9Uh7(;>TCqx5h&Y?4*Gmk2vrqGkiED3<-fdNsDAKCaEL5(LC!>08b75}BP>ZT zpV69xMsrrlSuARp^j&w8^9FokSK1U?6)hpFOW7XpQpLAO5?30k#~V&6lil5&P+fs) zN=0^t-MHH$?#<*l0!hW$uek?^eKn`#kU_;C!9gmEEgOe;x*UYoAxH_b4`F!8yh=O} zEUt>s$R5Se9)}M*w{cYAtS0FUtP!{0^j`oCZ3+2wV23tCSb9U4?7nQ`I`PyZlK7M3 z_i29)>3c==ZhKSpjpvCS1{Vc?Z}I3@KQ$MH&;yEQ3UOp_JSfKn8=+!wQ zsRp68M^VnC4*>5WL)Zm3W&s61_zNBfBxq_^$+8({|p)XZvi>+ z{}#;O!~S>33BdPfU=H8E1?K#vu>U0?_ZjD`Edb>Hq)X zZ2vok_V;Y(|C8NM{(o=t^Dh+aKWP2`Ea?B01^f3H{Zqk)?fw2g>NU&+@!v?`e;~?W zmIwbHK*IaKd-MAr=W%I-ff0}RXXwc9(f<9a2>#6{g6B{C3jUZV{EJx6UvLWjqk#Qq z8UA-V?7z?{^bhg|0Qj2^3PA9W#`zEPJb>VD`TwutIDg3m0yatf{hj`^sQ$O;k-rD? 
zpL!(!pQQ5dVAQ`H^uIE@hI!Zgg`oeJ7VE#K@ZX6to!{)`@Nz3uPI}HiP!$GEMtFby8pOI@Fy+d50{>Q zt%&{ODEhbIcfP;Wi2pW2IW7_ z-T?p0W^cdmy1>Tz|Gd`szm0|0A1gut-`^~}0Nj7fM*cDSFe3fmm^=T2;0OS768V4W zeE+04`eS+jpM-xWJFJnSFTVe2&)6sE{nCE@5J`kBHc5OP3NA@}LCp{&6HXK$+_%A1 zS6lbz=oi)C?>RGjIa4!!Kfefc+?KbOpB@OEeYcqK*=kzb5PWV4&u zgRs@zsM3C3sma;xa@gmTi?o%{sM@i8!o%r@++Q`ay5w;y(#qxkbCk_FtN!PgI*`%r zs*rB``%~?+9>KH2pZ>^O8Cc_CE+@!DM)zGgM=wsAA16Z=s9N@sh^b6`x5838n+|#u zo$mIhGsRktcVjs_5c>yozeZY|wZ~R}{W+A}=;u1;m}}%G5@UaQ!xLqH_HgCF{lnk; zRTO;gs`L5`+1VX>ROV^F%hjvguBM&UiMK^HXYC!`O(!478^xY&U5QZkx-dlr<(`S% zAV$GsZ8RN$D8tS^NcX>rC2!wTC;z#5gF9ehy4v{Rym7FRs$aFh85(5nI)bJ!cplLJyDUx?k@}q6{bS^agm?srkZW3df&mq|&g2dZcte z*MOVnLjcPeJ5#`ZXJDu9gH}=N`H&FCw?xVrR~BN=KdR!u9&A)09a#r@fV&RmaUe1p zFEq#Y9|t-hzkL=}I&K*|rI$1XBSG;T7u^*xpfZS>J|^LCWU|Mhj8j3aoK6W>cMDoF zVy&c=@m2c}YgbEJa!Bo6oRtU3@F5*jMXbIKhJ6u)ylmnei(WY%vhPY^7g_ag8b1-2 zPlm$dW4rgdix_Q}^jI8X1jjkrX3JTf-`++H?iEXaANnmzR#Km zV6tbK5h)TPiIMbdaR?25JsS5J0ErX=eD(sSHI)B~jxzz!;8peg-f7G0J3Jn4Xok`D z!U7@Yu6qry+k*<@KG>rRM_G@oPkc-#4QhnJs}F#vHvlynf51VD%9BchwNU)tc^fvN zyHomZR@{mwNmfdSs!x@0?}jY7WwibH4z)ggUfJi`Jxv^bnjctCi{xz|P^)1tv1BK9 zhc={RqVqGM{-ZOFnFn0ljIeMd2@8Q%5BDNvQE;1D7EmEii<-$HK1KL;jbeI`a>gnlMutRTxr%^E336lY}kp&D%( z`GK}HoCW^wZawjr?J;(i23>OHIzbRkW)C(7$^7m1isE}0mv=;o!5%8kL>0I?KLYLt zKkEXVw-FYZ1n`<6Ay6dIxZw&;q(}>@Ssj{Z3ZP_58HQv#v~hw|^{kTo zVl`69!=oV84V1K;$2*YN^i%h}x_2-nL2q~Guo@pDvk5;XsVBLW8M1CJ^s)+i7i?7z z?d#M*w{I|b!_mJ~9+%wW{{2Hl66&zUrMv8as^2eYfTjsDUUgE3oN1ok?(j(J5Ooy+ zRPszx`M#9gq#O!$sfmN~M+W?iv)D@s0SC-ir@`{#Q$k=}7?fVsOCloUd4?0ES23xo z|5dnuCM_PG#-%s@CI}c-SqJJ;>rFOOhOGZM-NgRMIcltz`erM*UIOEqadHIhkTfbl zUWZMdK{i(ODw^uVA2a>ZJDg$uqC5~ZVuWmtRnBr0Sj#4N8n{A)r#EZ0JZ;R*0I_eT zA~sN>0i|PoP6-)8Tlx4Eo*s|5Ry()6bbRM!$U*2tUG_UB283$|z9b3_o@DK|L8ovr zUX!k&I(rW*qDXXuKx5rWU!OJG485)vf5qZ2r)wHwP&cBgZUY6qO?cg2aiyj4stK#P zMBor{!9dP0Yu@u$($R78sOl!GPT{$G+dJ7b!qJSJ;NbT_2;KGy4`djv%7sV=q|ag0 z1lB{hh<$Y<9vy%zp>KM;SCLdKIeEQ&ym$KLdA#Hly)JB` zxT(M&WXb2Sp%Muh8XaV1ODmDNQ)%t0MFW!(d0;p|AGG8M-~|$sWyR*Th6b+!Eu?g= 
zT!-xGIqO1}de9-tk&qL=`&qQ{VUQS;M$aK$%55h({1;S+Wyp_M&AISdw>8nnuZ^nL zJ*d$PI+u}Mli;KCFUP(}@z`#Jpe!hN^Ds&!>swikw|&GK9+5s=MU(wCFZrZ^m7A zSL&BO6!EWAzK2^Xm(%0r`r3+~jk8J(%CoS+8)V?lRbr~hNCGGzPx#%_YaV$r)PXh) z$WWVw8TAr>$b9503zyP`jtS91`j$3M%tBljRo9Nb{-Osp@8Y)DGW%nvG#=^eq@@uD zKleV2U8}zcJIk4_MmDh|Wgtd!spp@CCysQ9TzZdw|2ahcS4l`R7~;|HH-&b2tc_}) z1*Jz4)_*10my{L>S61?LLxF{0Gq^wI@RFTaVW%D&wd9RG2c#$+a*|X}r}x$|>h?zj zf!55bfzp(*T*Z7>EpeJ%= zq|szr8i7x&vFdF0Oi)geJ%~KfPbUIGaTdwv_Ye@KzWyV?GZHa=HlDKVrsY6q3=iw?%z&Saxux8U9SdXmj-tXeFnd9jlTSG zN&!Zh_S8^CK2TJ%_{*W#g`Uh4>B1mTclK8-yK0-UKt+eAZhT;HvQ5yozw(1js~Ka{=zMH zywwbE>6H8p<=&ojzqa>!iWV(nVP9P4J}l@D0=Q(5)-j^R#6O)=k`8HgPQS>cM$+?B z8Dn+}P~-Fm!gm4}et%02ZcrCN+I)-d`*?_P zs6-~~Vn%wGW9-bF?KkDS26yKs2gaoSlVYc%eS<`9uWkqQnP}ARkv^$Y2WQk!9*TI< zL%E{fGq_%1-|H(l=$y`Y4N-0FJy=oCC*|}pC&{1ys~gz5V^h~6m}F4# zwm7rWWtc*;`959}1)xL+V~{#i&Qj-tj0gmkO~o97(gLix2DMuE7Mf=s$k{t{ILQDZETGDY!BH8i8*o;n3gruEV#60fo6)jjb$y1QFTwc& zigfaI++vhJorHa?QefEt2UTF%mTjqfj?A&yQ?89Zr0d(#yK;CS|Lx2|l>86bEs{9G zU(Z>Z_CgvK6w4yz1KuAtKn<6joZA?cUx8W?U5y+GT0u>}kQ`$X+t3q!l11?c{Upn> z;e1Crs+c9~9y-rM&qL)E(fg?xjj8$Dl+&>|msV1P%dIlhzo-aRXco?rRrYHC6{(My~)7`KVwc^FM zBzY=&v0E|J59TqN7*QOpV7FIx=Ei?QyNx+mJVHs#h3hIutD+@W*Cc-yfPY*(2sMAx zmejz%$k-Sjyp8I0HTSstOUv2H<=d&_hGofDsQUJEuB=ZO_d9k&QMghCrGpuh#9yb` z+Byb;G0|{m`wLO#0YP$$XK!8_#n#}XB2(P7Ro2KzzIi2smFqwQu5c&C$5^;T`<|-I zHp3Evrx^VLElO!DK#Fh&`hFA6@44@b=#N9k!JZvu#mKZU<4C}!NdGuXe+Izixnwp4n*KTv4a3d7InK&zdWGnhSo$g55%tu~*aY1%^kYH+tT9&Jf6#Z0_GO=I!>kP2Vih$Dja(3Is1*~A&UtWh zfa>OOEBtq#P^`P)A?w-A%azdk`L}z#pFqJ@Na5mdMHR7etJ_17Y%8KymELo2|Q`?w9@O zfv9Bb#8~mVNqSQAsLGvh8d*DOdCkZMg{VytdV}CFxrHPStnZJ1ao-7PJUhx-h~;;w zXUr2gXQKsWt=2mBSmbg$4T|9tReQ6NlF|?>ENLV4XnEmqjhgl=mV9w2)?g?Osl$r$ zO>iM&esxPr$hc{Gg1HQKP8&=?c8f{N_>>~omD5S7vlg&Mk6U~~o7mLy+W3Z~$^qSY ztZ3!(wGA<2eB(|s`{oJ?>2ON?cAsR>dN~S~=RT%nu8Y*o?Or#ZXV;-lbl?4KeVl?d=*UV(bzLfOv}uBuL|+!D zg;fz7{E|q>_$&#HbNQovebZg2>;~_l88cZ&>#%)y#Vb_iVr~}-)btdKG|jI|j1{UV 
zlUK1}PbusSUz_Dx_Rpm#7}@^FQbalmzSUx>~D^fjg?-V7?SpdR5o{VqiFwr`&Nl06h1cd3M}vkaH_leUg<8rU`|z?asp($p$r}{}D&~0Q zO}weiq|KA<|GZ-u(snrXeM57sFI~~W*GOe6*S`yMb&*HOk=c{lV5LT^SL()M0E3+hF$<8^a(5CwP(|)?*wLz1qK%)>fInBvSTuzGG=qP zw0o;2!bH=WEeg;HhL@^;$%5(+n`%is1T3zJkfBKR0CmB;6_TDCs4*y z8<&j6UebPjGFooMZaBtQ@fp4FbhxfrIf4fTIvTfxaOe-dE`@mRrKqO(o8i&=`8U00 ziwxC^^Wn*^d0ZD)I$W7M;kb$HyHUurZZ%Wrm~Lp8)iKEfmtV*o4w4JET@%q1woDk9 zX+tO)>K?DfG+l<2AuX@I8yS`Dn!D_fE(!Gr6Sn}DbWOU*Bh#^N*=9|L(H~`-heoaf zQ7lpfY3cHY@CUER=efj*ts?0` zK;C3vzXmXVxSUEah0t?epMSVZJHw_5uW+!%=ezt|640eKvc2j6nI)PZna+BjQjkb& zUGqX8+u*fwXY>W4@B_cM)&5Cxt1P2k-Oe@xnoX6C_P7g9 z=DQhSwmBm_m*iKNR2WB0Y8E>?qoGF*2qse#jaFoZT&NK(i?StY2SaBWg9!S#%O5j5 zp9$acNj!4lsk~qB$&U5>#G(XA$w~N>hX0`c+}_6}q$IeaDO5!&2*jLdXxO7|LQu^~ z$uEYbP5dpBJ;j0@IR^Nhs)DXr?+}H z5Kw7r*Ec#$$b0rIWmhG+S$cw9&hVmqLL&NCo*?s?Ghf~Tu@;WcM(#W-kwgxT<5%KZ%6_L9U>=h=BI6}vQ&}7$ zV(ini0dzn0k9>p+k#9kecnj5_pgiH8e5I7_~C!Y<|$keJpddO0`jE&m_V@T|a(AIOt3-0-m?h$=yw216p1xviV zym@nz3Y5TSM;mMG0}lZZ>N#~qC3-J(Z)a*)!pIatW4|2DJ&}Sd>Nky}jf62*BSD)O z66OuQ?+3weT^y}HU}KSSv30r1Vd#(Mn+LF?U@E>b)OEtqz!&ICJz&mAiNL_~e#wWG z9YSm#ID&Nv?36SSb(`B{ZE0Pe`DCUiB!w9&>gBRC=Xv5s^z-GEp;0pjl^3EI*DuT? zDzs{5fPOblLB6+7tkN;^-bKpPU}$s~L(It2Fje;$m zX8K2KkWhFSO!OI+U{~KT(mKnn>1m5S;62gZYHi( zhc?6>%NKOjwEn2oSLvw+cbxRr5lcpC%BJS$C3w$qw`g?va)rOluITUh3DurrY<5fB zlm8qVW>!QsLU{5nERI!Ej*8In1+Z`L0Erd`%UVbHMG<>Ru`oWaf zOQRb=t~D`)WEaI@&%9o9LcFbwwoQce;MWbqbk3*2srZbhN*|?P?_NJ=G~Whv$xdSP zit8&IaS#`ua&qY^VoDJbiNKOzU|U&lA-sj}zE-mrulYzo3Mrx0qJn)3)ju;9dvJ|9Qm@GF4w1<(IXqw$tE0lErzEf|I zoh^c@%emtk28>$qqQ-JoH>;r@lZ%E5`tAs9Rrm$YDu^=3>vXdX&{FO)Qj{|GBVfG< z{p2DHA~l}tk>JVPpu-Vf7`zAtk8=xqnR@+#w!H*q z>7lCEsF4^|cpnV&VeFM-M#d2cimaH>?bLkXA;+dR-~M29r!1|2f32q+g-S_*Qwx;Q%jv`GvUU4f3>R!YiOBYot-*!wr&u3n@@E*Y|?0 zouw;5F$K$CG%4UQY}zGGBNjFB64^&ZO0T!#9jW3tx1g63f0M?ll4;eWLYonK4SIw& zp{AwfOI)B!RAf|eN_{^QPn}z29QC|yL!0IY-Pd6nsXBi<& zWQEh(+t=NwtRF)PfxhN#FNz$Wl9BOq2xC=!*W{Wfu%fMDbM8*v8s{jkbkI}k*(8y? 
z1BpHtZ_zw$;m!-R!Iwu{ZyK}S-nUd4O$m-_N;-XnR&YwN;V=4|nOi)p?n zrW=nLB?F)1vGraQqU#Lo^Z9}kvHT(7a+(@5IC>~I-vl==*m$~iBgsnIeYFU6Q9@;; z+97SYT&rb_={?`uAP9B%Iw8_-6qfj5g9c{79m~0qvdW)z0@3r2#(g^temO~P?PF6b z%WKL8Mg+a=%DC9f0WHBMB)TzSd|Ve~FR!#`Oje+v^sH(aESJ$vZ&;M#Phx1)jlvR% zN6lc>BunJAj=CZa?zVN1raT_vX6B0&y9imhyjFWUp651Tjp=|?st1{pwg)bx3fRf) zlw4V=s0?}}xB?Qp@O0y%J=D2t!8#gSZu@2=^%NAmyrE_0+uT$6_}#rk8$(<)>|1?Z z&T3vYOY22yUxdEOnNF{9kGM=3#&sOx7uT725EhgQ>XOhZO0t#lIn$`L9GR+K23tr- zcc4Fk_o4McRC`D!GUarF*ep{{oEy(Kkva|5cUY@kS+uyr98*KTewF)RvT)sC$TBvq zOH(swZelJP-`BZMQGhmP^*R3Q4eusWM_cJLRaXhe(9n($(JqPfXXPXT#h%eEHKQ%q|2%B5oxEo*B&1?4A*V&UD=m`NcJ*zp}2&MtA~^0Da6?E!VUPqm**;x zUC^JC0Z-Vmp|L!l;OK7fY>!1RWYh@3=s_ESOQn(3%3jU4_^@+s(+mkFVQG_A(Wmk= z0wT%i;T{Xg54Bf5K=8KQsS%H2zX9*u?|>gRBqTm}&as3CQ3ksp(WkA$={e)%+-!<5 zm@GLI!nSX`niKTdH`(cb=quUPz}1oyYuiQ`DBAhvbIa`t4jm)F%9a%GFawf+q^67z zsE}4e_KcQ}6r1-Q4kRWu4f8}gUWlB~bgmL}8_Pvs>zmkUo07Tn3egKU3Y~DT2(K-i zxS;~_kEyR2Yide#hQ~NB0U|+7LlWZ>WBk%MaEwh=Q8?t(j3}hsDxMcq=|qSfFOn7s2aO!E)HR*;UHZqV!ZGxtF6KHMeY5h7 zs_8(5X*ZW5@Mbb59js%a8B(0>^B1naq-IhbrCCz8zHhOIj2XQiF+YmGbq|%SDBDOP z!4kFDNY3G}M_WrnXD*f2e6A3+{+>{2z4+5@KUAf2bS`isi70N9#y(&6kz1J)ooiuJ z&BXD-cZ;A_kBUIiQk&CfC1;V%-{UoG!Pz%N)1IdJe2L(xh$ys>2JAt+@7u&9>-;Rr zf&8gkEUB$YLuFU(dwqiL(L^iZ>)LA$`f-MNyBY!2dFh6a?8Q7ciP5PP^vxF}b#CFv z*W?rMf1&=1lk9%zPKHQIyuLwjN~oGL<-=bKd^1R|*- z<8wGFue#`V^W4PknSAmmQu3pR>fw&I_8jl8=pEN>3T}dvy5pwCFOweI`g$C2q~x*` z*k$_|EVW)MHkSAGXOJSKKPQ({U5rMxMj#x1ka!^1bQjr&OOHvyu|*Scn)IjBTe(vB zuYJOH0x0vPq=b z^#`NCf}M*~VwE%RaLKSwlL9efLir#KY&0Mpb)VHp4a&%cxB5eWi>D`krI7o`3qyY? 
zr&0`ttMCOS^N@>ArGi@e&eRdhKta&}ifYth%*TrSAa%~?xD+G>=sOaMOjP#|YiCKi z$FG}jlVDbf+-MQIxNH{HBSod(YF?!C+|uwr2j&&yOsm^Rj-bAEe*0RXw>q}H=-wMk zj<_a#KiS{Wy+d&IB`WCE6Zpil{!w-UcDAe2{h`KUey@QRIYX_wdWr>|y4%~15~OvY z$f51VZD9?i#I4FLIX`OB$4L%7Mpv{==Ts8?s}tg2tF{kImkTYPg6nez*_i1aHPA*X{DA9nrvZWg)B&F9m5JCqo}_MCX;Z% zSD=V!I(^H#&@C|n(F@Qx6-G#S%IW6_*b#hzA&L{lP#;&(C zHRT2SxccHc=o}f?Zw^|4V^tGgPce^Y+`r;1rACkH22O=pmW6&jbc z)w?$qRWtB54N6z~p{(RL z+xn%9+%iajK=EV{47v}lri^IkYigRj3$a35f31{$}XdA)cKHQ5uffa;i#r( z?aKzHCvLkDh#PGqv>i!AGm@4 z(RaM-5>$5auBFed(?0Ad5QineGDi)kRP1&CG^j zs;ZZxdAUXsb#u^|fH~ep(OidsT3vfB_tl;gMg72WI+{9y#i*NZuy+{aUg`q>qr~WL z2^M&Q3d87?rh(_j0$eI(joJ2>^x4ZcLpGcVDt`TGehX^}n$O-6OsEO+3pGBtSl|K3 zj@VqY4?Aj0Jl{sRB9c~P#dLVq)i%Ur%TXmP8}+a#a}d6)?w9EThZhl}JA#sKo3Hf=kl|j}3o86IAW$Ln;+d*ose>o>t zjSV`)yf-0CJw-sDoWbfbvOGgGysS)=Hl)o z`DbnU|Kzgs^+YkzhZy<%&!?-usvO_;+CvLzCtWYayOWaWYTpYzpxY>s$@OlIX76*IqW zxYxwafSl^m=(7hK+Y`3fii-nWUDQI9 zEtPZ|eKz7E6LJ`)kU}1`+^LZCk|jzW(iYMuMVv|nb5U%{M32C@!on0-se73H98B;K zpe~W(3lPy4gv25X_L}Ym$e7q;h04W^tXN+KoZd_qhEHBLFct0|g5vu5or;`qEhFrZ zvG)&T5F&)V?|Ne#A2J#{bC){FLr|ZfPFz$bW{KaN&@a-?b#GT5T>P+h=#*d25T4Mp z4rQF#Pd+|HR)}mhm8+96VM_P>j2{YordIgv%1E#G*>Qaj#g5E;)C6?z$jaN8fvJr3 zxA-?#t*#7HtoyUY6$&(xHwGSIq5nhLfx7lCmZFdW8nVy|2!S7(W6*Z9; zvI@>M>{_EKV{M&!BT7Se^!Lm!U1*PAxz)%|kx_`?e`eP|2}>IqL&g%NEHvBO$Hsi< zW$xm?saMVy-Wsxq5k}d0=-&#v9Q4LJ(mV9T(?oR<4tD@JaE6yaH{lC5ylTJ^oCI44 z$D4nl7K*p)X)cIh4o)e6VAooW=f~iMiU(Q0s2tuN-wGZ6LG_SaaJhdGOz<t|kg$=kzspmNf5(xiULVKFp8mp0+d2Ch zbw|d&ia~8%?|=`2m7Jl#fjgeX!juf~a4VbJa7Vaf5+hukG^pQM1U75hAEDbYkY}Ur zCq6*73SIo5kv!pj`?*i&R;XHQj3Kz$k~yN6h00~8HN7@-ww!f`wU@k~&B?;pp=F1L zj&jrzy?cBJ)1TlVgJvW|Uvc9|gus^Y6^yO@B%bKXcygVc+wjyBB8Gowgm!h8;>NiG z4HhQUxec@UONsH2=3K`h9m~2qSFfdf?g?MTjq9&thZ@NXdldrcO>U(knaiMsI#r~J z^U!RKuDc^M)~z$Sll|dx2JiA@jKoIKb@JzI`Iu%}+4Z9U6!o(72FgW26?G$Ug?w-CSqW7h@(u3`61CUGgpJG% z*#q6)dPoxPpmQA$lii$nPjYvQl}t8@82 z*ZjY^bJ=6T1NfE#G!jpT6_AKEJ8W6$;g&5FQiP4f6joOF==9K0h@YAIQ zg)ZBX>A1}QD}t$bH%d-tbQn?v}xF9|SHMz%PC7S|Ry78Y)Ic!91z 
zp>=ANPm!VcXo|d}iVnE;2E3vuzfTXxYHyjuNT5XW=iy3yAM)%FjA9)%_@o8{r4y8E za2phzsfI{~1X`X5X{n5g_}!-fmAX|}il`5u(3TXhwG+YwVLvn8!cFNSr~3H?jqxCU zP!fe3ic=te&ru_k8|uf)2$5ssET|6>WEDj~K}`OZ6!4au?iCJl!vwI7` zW6t{k>RD2(4@aK5B&IzV)zDA@{!lPr*S9XN9;qcY85I>7pPZ7Ct=ZTl^W{E*Vf2~m z!Ua%D8jYACzY<-ap5D&8A$WfyJ636Xc4q6gNT9uym=MUw1@B+Zip#|HD$Jq0U1s8P z*O8xJ*%N~=-hN~kle`m0|8iCl4ZH#louM#wPMdF$on$9koQYYHDqdo5L zfqTCme%YLy+``I z#Na{@j13#zLL+=}_;Klo`b)3ZPHRf%jn^-GXQsTPqSi8b-)F8xja8nVRmyH(&Gyx- zb)+9R=x`Aneq8mFg=FgNn$rddzpgTkH-&D|o0*>in@mkICv&|_ug)9}ErLWXn-x38 z%WI!K&Tcf{xSy|NdAWOj=URW)VhNI0>uPVUX;spgMFikmg70p}-_CH6wtl$j5Wn#i zgKYwG_%tF2&oQYM>o}*VoPg{smJum1E?@CNgu-kpk^kOjq_mtFFsulj@B+2&LNE}U zF9St}y<8#21o(}Wv#!&n;~DHL^x78J=bu5#Hf_9WT^)>cZ*YJTSWC#2A!hPk@XWyK> z3EO~rjW9oRgU84g|4cQ#e5XE`^8S7fgUREI-@Tr+np6fT2b=wfBCfZbJo+ZC_d6X8 zCP>x&_tS)_90~7kY};OPR=i^oW7dGMCcx6r$Cxw>^Me?zy$R5^-)l3FSM}R>X<-!O zB*y_W-Me`Re8y8W7;HBdvt6Mt1@|n5hhWVkhaSEKSNvE9=zrZ;GU>KC+z-)?ir+E( zLt{}6I1HLcw5LQ8UTK}W=uzD)k+|F1SjV%k+;BY+sPW))P#olGtdMj&pE)B5DJ!~s zgXSr)C6Z2C+gC!EQ)4PT?B;@wGm3Y%*G|-)f%inHVVuzKt`U&bacuQd6|acdxDV$u zG?~7~?i;!jTu`xbomOsncR}cShJfYt_=&nJE3>Ei$JC8EyaXP{Vf&F3fJD-VX)Z0e zH9>?d&g>Lc$V%^aoSq0g5ic=LM+75izsK((_z67G^B8Xw1;Oj9ujr;D!z;p}6ElRO z)~v_tt-brudLndGy!s!nUx$jNjwI}^0gv}O#_L0-(IyfPw}heJNMluewr#1x%U#4p z)eMq=f|`29NFAGEVBlahRrN5X+HL1>`D!uLYJYM@#p2Fb%Pm~VstSvcl_0&OuAhdw zx~UPO2N?Eca4*n>%4BxJ!PGc=@C9x3f|!h;Ly{AQgjDF0(58Qy^7A@wAu$7&*!xtM9LqaL2p#J=QzSXYV-O z_6sq9I4oS}3wHxSgQRd*sl-XM4Q;>!2wy~z%1V~XuONg=NsSDTa3=d@e7L;4{J^vA zC}gFT5G^6Z%h)@j<%!a+V|mr8m8+84kS2vFIfO21g9M-J|M8>5m4}z!8xq|bo)6`0 z`J(BV7kW#c^JJ}WQW3r$@({%I>PY$VV345Pc>4;rw}N1O-BdG`7bFku1{F19#Grt! 
z2(|>a30tzPRY}5TOonYr=mjR%88BoWcbuj8XgTW*LAwlm=YU3W8Gv2A!bB|S)A-(w z4=9P;Z7Hdk|DcH);vU$B+W6#0vkv%!kBge&Pog;*rYB;Zzu9$~|LAo5uXR5EjV|gR zHtS#iSGV(hu0Evrf9iH_an-mGQDgg2^zZUK2??NoI{S1)^T+7wzqF6iJh~;n00#c+ z3;s=>|13Cw>Z#ZKZ%-UR{VRe$GMM}aPd^RK;|S<~_=(3oNKflwbVY=QhQi^5zEAnW zP??s4Vp~gk#k;nGklh`^NQxFO&fsY4vP}Px3{?65|nxg^G`+L9AQ@7NLN@ z9$IENm;_1G3vOvV4{bD@&pMzTIkZY@UtjkOL9=FS8v2Ju8BxWmv|2euOc~jaMy`}7 z_VIa0`^SZMXnx?z1SlH6j$>0(XDZw$&=1Pw>XrIH3*JF2r*Bo$a9^aMo`a1yIx{jg zKdr%F6qP{>{uXUu)8HJUEzE#SO>DQV^`dS@r3wm06iMDEtU$x|ikthWw7p}BDKs|M zZ%{jIo?f4>I1*LfN76{Ukb#jlye3j4(;PTsgkgPyfc+k4(GpEYMiN)rLNA-c<&CB8 z(!h4(i!Xzu)#fVqB;yNm=F51HVmoA_s$oOI(7ikHwWjY2XZiHDu6MJaVN#UKyv(%CH!|wylxlwN=Dllle^5}4mShhlm&AOuE)DfE#+-qzVW0Jgz20qFMI%=bBL@g zj;2gE8}qG*@4dR}m7DJ)6CQhofJyZmD)FgU-E1vQwVgNKv9ns+EZ1_sU(2)9U z?sW5zSUpjaJnEE6NVlZgrqSK(G`^O!n`XT^QPc$m#-4WSl%$LiECNuMhdBT4vhOOa z^oXVj;|`Dh)GQ{uNUB#rRzPM~Y&(C4jj*0O8v^_-j@PrLruPe~`Y4a1iVEn`XqeK6 z;*C|4s;3vW$kJDI>Idf}v{6PYMvZvjUSy-unU^tcZ@)%Y1quopxK?tm^pgkRsWl*j za#DL{$YPjN#gXLq;XE2%D7L}UiX5UMVV6FM(iD<&X~ig|wy1Q;d<8yEKjbl~jxav5 zTm-A|jC)wY9zS_H%@97nx{3Fm&w^oF7#>d4DIGHuepK7vmC}=@qNe&Y5~cZBBuevZ zk?6m^;{L-}|HWec!Ls>Ri}j6o@V^{_|BbimA6DGI(^tRKSHII&|2F9>nmX`j~HZr}2u~ewBOd}O>(T1^N!?dKz4@%kOIKBM~NtF(FRx^85&U>!*gKWvw zJmKpPLhqnL`y----aO}nLt=am!X0Ykl|$s4VkN=}hnEWqAs7zop>2gUukpN$y7guB5BedTN2Kax!-D1`OQ zNG+<{w&>o+WEIvFSbpoXDuii%5((xR!&S&I18kL^S)03GGZ+=0`{ru~b{5S#qpOE? 
zYVbgk01`&n{634R{2&Ht)d{oiF>o7WzakqlOzI4*mRH+bpjYUdJ7&!))?Z?^Q&#;@ z;Uo>pj~fXo?N}XHnW>q;f*Fd{*@J8=lL{?1Vn-E*Lo~|lF$*AVJNXFp*H?gIZ*7rr zl2I2zU~Rnfgm-78ZY9F!_DNu?`1!MzN%I$4kM?$kXWJw8jkATJ$|@LdcOb&gXO`az zZd7-q(ASbBE)cYu9G!B+VRID}4z|XJg)tn7=P$IBUazeb)ZCk@Bv0HX++V+D<3=nG zAWeQOPjkOXYc9WvWZ*b^@PK13ZUL|FzD^Ie;%GE3^LxXoZwxo`?D zS~1dGYFS-lGR}*MvhFlbj5~~K_X`|(S;ACXo9k-#&TY0FPFh<82EAn4hGdwyQbkE8 zspasduIyeWVRKDA8@StSLu14`KeO}Z4kA29{iH}z+w8lF6mLq&EM?tPTe&iB`u@X= zauMh=x9TYEkE=jr12J(g0@7<5^5`9kwm{y+q3`6A!a5%JgVQA?P`2f$BvfQO(k*MBTS@KKE=Z@7lAPv~qlWgZCKl;BX0v z3O!c9gw2wWF&rJEF|6glg-I=u!~!=E{E)bG7;}fI3R(?7{KLiowD{xl&1c4GI1Osz zB*gwwz&s6Xf<#nP*kG9kMD#gCBoW=KK=9@vY3`&B6a1NHS{NYRgGTogv(fBXh(L)g zuu{d6JU?;o{w&g=`6JIj^GL(=Lv;ESWd1@}{ufrKN7}r9THX95MFR@~PxJzfN~7iH z;eAYgEj9xHd=s4 zHc+xh)nyY;(v>v!$ePiwcn>yjs>^B3or<{MY@ z(}#Zzxxew#|JOtAKPv_vxr+aBYxE~N4#n}G(D9$zqzwGGscL@NZ^M$6tS`UeRx|s2sB--v=QLS#!j&xJGGFwBcf4Y zGq~PXN)m%(dydW@N_oOU5_@BGkicXjMJeD*peyCTTQfdP3u0Ljn_!=hP}DTXeD(yx zZZ^EBi0S4z=7;`dxp8r=2jCrg(UufK^+Hrq?_{bj$md1x5gU*~A@06HA<*d|MlY?C z-9&~#G(>pgL~hOO<%(27qaEZS+1y-s8oB4DMQ+?g_RSIhg(KbA8XOq3s4Wq(T&T|2 z3$Mph>( zj6^ZK4e3ijvWN(O8|T~#kjag+bV81U@8_c_8RiogTNrz}Wc}gXDF=ry`5#rX@yt(7 zUk^=n#h#fImH4rEtJtxfIM5rNI-9W0UH7Jf`O?EBseNhaL9MnIswpp5?Q8BK6Y+(h z<%b<1=|25}TFiqWK7Mjw#}NZ-k$vlUq3U)gh-^qH@k+$a2#9>UVU)=%o<7S0U;z#f>;Dv+b-d*RsYIhV zRrapfW0UJv&&QmfTk02|x{=nOZth>|7FM4y$F1)fEgG3nG&Y3IklcH*bQI2ZS~*eJ z(WOj#pR-u|N6a=Y_Q$Sj>8*9_z|1<{Y2lZDN~nExTAMkbec8F^S#Ok6DP+Hz)76^2`X%G|tEZCQ? 
zCq6TDT5j-N#!<8882iI0Y>uxOMEGejYFIl}u)8k~1Kq1YUEa`!jDRy{-BpSbSf^*= zOzU>+Nj`kW9$S&qsVAakw!?`hH;hJnFB@(`&$2&h60QRVVt*c7==f|XXNpm&LZRja zF2Gy&bxgoqU?EG;`Wm4v`*6W<9XX?~h9GyKZ|@|I}1}df_zRre6Hr zC;)yfMETFdUjAV({5KB!8;AXk!~Xl=u-|ma-&|3^w=r^m(>j31X#N*s|9|t_Ho%`XDbkEpYNuA#nx4G#6=e6)Ys37mUhN??4Q_8aI7k~w%Ru_<}WBL zf7PwBO%S7qR=G}_4iOPX#Yz-*s-cq>EU5uHG6hCj_VrY-^UxE_3A#`C^WDV$P_J9- z2Ky!JHPpq48?M`8kh+>Qb`v>D&x2A+5A#Ezd8JiC(1!O3d{VV#!r1h+%87TwR|01I z?zM2muC;Ky>`8*db(Z+#?kP}7sc9CO;=w_7_TuTGB7rmwV>f*Lau%Ix@f|3&p|;(0 zbx1oy_Uhe|nHq3>LPn$zl)%&W>lMgzzl$y>T+aS<#t-0`QK-5hS^O*AOQQmMk+w8% zbC+~j^Ng_+S9hAD5^!ptIg!Pp4~nhBoOg@sbxC)B!m<$Nx;t5))_Kem_lcmDdtY_n zk1Hln_Ms!j>d2lW1Rh%>7@k@q)SsFp{(pR=A*jZzT1$;GIM&mj zHdJS2ef`BF?AGO~ZGQrr<3TGinWJgnjc9O#A3QGG!NJ-wmxE5x;Yu(2dhbwtoat6> z&hey-Dt6mkQOzo^b+T~R#PIBHAf}N+LI0q4;AU*t)Qdm=Fw^@pemg@Sd`eo>j~?Xw$7aD|Pug#+2S zb%MOB&gs1Atxri~DrS`dF9pMOyIB~1I}1~bZ@^a(I;(7P0jE}VH|+!CrB%R>Q|7~r)rG<(6`@? z|6%|3>$hn#?{VC>QHei5{qM~G_yy$tmtg*Xm5U{w*~h#*{_$1#{(Qd~UaCJPY|t?N zq*9QE=~pTRsp%R1coV1}U#x%H^3FUwTHeRb#YA%!eHIpL{e6>!(z%EkndsMbOq77w z$5t+Jc4+>FkO?+>F2o33=&z!zz~VXKWQ7 zC>hDtYTzQ37%OW0rW~-X!-{_TC9Skua2rf;D}Z2BhM>MAk~=xm+k_yKAkEbuZ}YWy z;wrO_I%ARAVA1LX)ay-Yh2CPF;{2l$@%=?~Z7J}fwGcHC-Gf4NvjdbNeS2r?J2iS+ z>TP>J@a@z52pT(*)=zPYiVM@En9)NUB^A|`P2rN-oS7q&Xc6nCs?UyMP28*xQltt~ z#Vf5!s$&NS+9B#pSY7i3Pr?~k+3QQHRP!2GnE8^^?W2azr$6sHOp;ax8(#1s~YZCmhlpT0*oA3V^0|kV+BOHm!rOPoFpZ51(2i=+ih_;$dl;f zAEKU}X^MtInZOu-?(w+(@44;tgQE0>w!3tIan`~GLwtS5S>{|P{jU5Z|5@^xE0|f^ zw!!>9rbr(-`z({g&C&?!>DB4dUh6tjalg`q-u@RM(goCvRvY#ammitw=l7C+FvicS{Ty5AMdZ->Mm`? 
zk|$hXHO4gP&Aha}{#k13Qka4X zlC4&0*ix!Q%8Y6C;q^Lsd~k+hQ`@>s1KWK$F!^dhb5}m2>0|8!!+hLXzIwQ3kf(w5 z7qJ*+XiFwtUs0q$vJpEC;ZbvArni)z$kXVxsG(i%45G~9k8hA|?<8V}$`j;1Z^vGj z(hOZb)&_`P>&r-S0|!lFpfrC5L5IJ5Er}UY@uRx@t^~hZPU=6?vyTbJA9B|Kx?j?l z|N0@?H+lXZqWxry>0{&Nuf(Rmle52*v%izGzmu~+PtJbRB~MD{FUeWJ&v1CaZyf$N zozHJNpWk#o{~dHbKf~c^zD=TkQbm6)38DR;g~QW)ljmdeX-@gm~ZTj?oxfbSM z-n*swW199~5)jls0mNvie+3Yu{_WlV_HKWBx4*sHpL@68bjjbmTNlM;_e-Ye-=P8En?&DUi@!7g{0w7a_$7?#-@yI&k3r{ux*veYZ~fyl zr}-s^li|;N9QC9B`cG$JW*5Yz?a{z3oSej*r$?}qoD5PGh!Rlbcv331{a;uLz#G|o z{$eo(o@Ph3(}@;Iy-ua`4tXOat*hy9UW4WYQ^&b2hkU`ZyYI; zTuzqo8l8`iiSQ1Yfniu2`vG+5!6U~FEh-L$(JQnwYDtb{l$*Tic(EdSc3L!o;X^pv zpnFxQ)=Hd_I>H`O2KH3Q^;+dU1YW@l#~Nb!fKP>X*}bZ%P73~cb;80J^oqmKDg%VA zkl<76HqQCRcOE#>M-yW*vb!b9c#U=^ug4G5rq*SQK~mSpEpFcUAh1skS7aCEjV77* z0k4C0D;13UD7{r}96}RjvTWEexKZ{XiP0K2F2@GVWlWi-MjG!}{ zQdPO8RbstZjgi5nu0FHqv)Qx@rFlivp3y!tfZ5gd$jm7=akkms+3h?(Bz-BRT3}q) z)lrbR*!`xly*;zzv}YTg+woMx(99`=vaGs?c*lBPIfuT2mm{&xPJb;XduMlbDk6E9 zLBE;NHDhR_Z_u?^6{ac~LySB_KKcIMGnLi-_HM6Nf8}5|pQ78bSi{P}(^sqegduuj zT>Z9N?EZ4EyJtAc&qG_3#a^L(my)ItbxWP6Ng(tc%oPZgfc`PBC#caE)c2gs5tImc z65LHdFNR}X!f-s$kHIuT2UY_>AxA?gfxK)8)bRvTd|ZIhG5}Y&oFH_o&@DKL-KdZV zId9Z~z_&#H&2M?aakQ2h#8x`_Jl|Oh+hhiD#=)GMk(t)^ZXY%znN6Q?r@3fey@Anf z9oyMdsJ6M1!E*n&@HE7jqdcOLAGW!kJ)&=Ssj*ZSb}*;L_`<%UOK;t5PT8$RVpw>= z4M2@%3M;q14`a5=B*%ai?0T%b-E-T|>r+O>z!DqS?3|+3KWDwl6Hrd$P_i+)IJw&A zt>_beeZ715j&``FoXnmwVx-s3UR0Z3e4%CWa1)K?ik0bfDEA_yiq@feKTPjjEitLi z)s){Ut+dI_XV^aNLV;iStGp6XK)iEtESst#@zL(|oJ~q$I^eSyGt1|~e)DTr-Iv1q zNoH7tXu2JWYO>PUlZ`Q%yO7t$u2?Z`BwJUz3EFB{T|0qR9_bksDra&u$RXQy$fPKO zNLCO9bg>Zn=T(J5scn)nePaIp7{0z7G%#5)4R?Yb^u4-wtoB}*#`ZO0*+dB=k z6hugb7vMffv1GES3W?Jx+!?7pt*ZfTHyGb&P+IW)}Uc;_D!I*zM0*QnlqbxlQ-LPk(vm+Q_l*ls1Z? 
zC*_x!j8>{s*OV)>wS_&)O3PXnYb(tX7m zY%RBs#|)oVq2{_fvs*pc#c{%#jWoWzRja34WTdAu-X5FH`6%6(!F+D%4V4!K;FTni*X6_m^&)9UkwbaX%!l-sf_e|CYtAtPUwl zfaw@Qn-l+CkC0DoED&)#W}s~p5Habibv=*_6FaJLS&)nZolo(Qp=DF83i6M?!ZpnZ z-u37~XsiDIqu&eh`;GX$HU5Ql1n!znRpw(Qm7Si|_4(|?$;0ZCro8fXZ)7=Xt$oWv zZ&T^R*?(~`6M5qco7Cj6cG7%%H^lLzC>Mvkre;Q(olg!oVWlZ-PT3xYyX`re-0aWq z7rvdppG`JXX*%qjtkP$WnUHsnitlYyKF8vCxR2tPt-Zj1GO4rm&el#w%V#l3>hi?v z(NbbM=rhjrd(1A^iLmUkwQgQd`oCyUCHA`ia)?dYsn^>?wr9&_X8Y{zO-N$w<(_X0 z`})&Y7M6}UNO0X--0a*x&oVi_aX%YtAJHrwu6Nw#a>-STd+BXP!;9>yjn046qtWQ~VpM#AJ^cgtDtaB9QhIoflR=^D4W>7wIf z7mCC{a&W@K!SdH=5W)Yyk8UZw$YA2PHh6Wzb zYVa!ODj2*d-@=hRlf185(+M5W*=f^-al`_72?C?}sW(Hq`*N6a?Z8dHeffw7F2l%p zgcXz()JU4M1nyXSVfEqGAej8LkGq8*=X`-i?V1qBoeKyU6z<*kA07vw_b}%PRTDqP zr8M8%zHhN8fZpmJ&_g~`unegEkIPe)Lj8-NH6?w2QkZE06l<$(e^?vNi*fOFLO$W z`1NFKdT1M(55V;whZp3;DB(uE^pt|!lakf((>EM!eHIPPFSAFIz7gmGr?Yz67nQ~L z8N<2dRT>kqRZ<`NM73P1%(;4|r>(Q8PQ(VhAIlzVZRVODtm+f1PBfJH zl-QQesSVYdzI?K$+o)wg3+}u(ro}i-N}NuNtqpZ_y4cF_28cG7zTTVf)nSgmDW0-9 z9yVs!eXYuZcuxI_I_`@iayH;o14T6HL#0ZC7&vCO)*6Cj*wl~?556u^UjM#DorAjh z?ZmY!HKtwsm}i(&1GRRpOX;{3fBD+A^-eSUMDNrW(=q=mnO*)qsktf^pydqm+JBW84(&HV%C7At;V9!sq?Y9l}wDag0{^*OLX8K7+hWe4t zQSe)D9tK)iOqzX{1>j1I|c0gX|aN za=tIc{GaxoWf>fXO5{si8Ha~yL`QV>1y|U88<>M6!DfraLqQv+uBDXgVo0itGyCfo zZDkmPUVCtZ)?~d@&0VT^s~*x8+(iBehJLso%vvEYc?bGGkwJiNPR(3g-vS>W-!$Af zV-W$F0q}eGYVfA8q4nsMsP&lx0(u!mhhqh z-P!18##hHH+PWH!I678(Q#gUTk{dZu>CjHdR2cf%j4HZs6+6(VaNZtJbWtx(eVqJPMNOKBx+%6uYlrnj7%64W^SYqpvSa?(y>>!vmH z+rq64gRi+9s6`Yq$u;o9#aX9XwNrFmBgIUHuA*If)ru6`#AD{(pjtR_b;PntjY~T8 z51dLTQ#zBzvO)~G2Jb-f-L26P9%?1RFZ;f}6hi0+dk_*_25_?w3DTn8lH&UfzSv8_ z8&Wwej>KaJRCtB+ShqGy{rbhPBUV#kND$_+bR%GlSj^IaFikjtncpF&RON(Esb}8a zp+p^DEC^qnHKf*jVu%zheoeC0(U5w%(EwaiZ9}wN<%3)GX_hg1C37)4D8{q*XxNBx z(PB~yyh@<<&Z`bg*f@?Te8MTtfNgh;@O2<-Gk%sjF|g3$0;n5EOb%|$kK6M5W_#L} z41Z$tfS=WA0e-Dc>)*f?cuMyF%@uegRsAt3@!047EBfgFY@eR;<7_7V*B#ztU>|Cr z(um-BgC<}(3w(0?5U58l4f{Kw!;JFmL`10_{Km$8K57Yy6u_uOI6RJ*mk5M6XS|ze 
zvWEw|Q_$BTfacge@p`$T+Xc{%*>J^X^5uFHAcOM-`MC!UNf3CwKVLeR#25IbcT4PR zo;c_jK3`+|EF?r8Gr%8OsbG)-TVHru;vmSGrLul_LlQ`(08ixmx_^b2-@o;^NdWsh>|RB43jV2q57T2JrZO?mcqB(ty4^x;%>U_QT~m z$`Lq42!(|7&JE=kh5~(C@MZEt|%J1tofqn z#mN^)@3I=mSg@YB&hg-ZqJ4_Qb4%wL?x4D+q9V0I$I0<~h5Vguf|teh>L=<^X#&x2 zR(Yvdt+>8qPU;vMic0ezEQSslV9;+uKwLZ7Br2I^7iF7kNSLF6Q)x0v1(^B_O2EuBg(sFwt|gZ8KXad7OizC{NZaEZiH7ypv5A50e7A9AMs0 zd8W!Ci>#{}tmQDYN+8%fn#hd^ogW{Zi0P|+vooIMsy(Ntyij6p<~nBw6#4G9+*Wg# z_Ha|pPvMK^FsqRjZ*31mU6^2Ro%X_(4c6|#-X2Jm!rt!w-X1nk=)2qeyOo>fvuU6B zaJ+rPIH^qZ^x7p0EG^si?Zw59$RS$wS!c~w7<13C1TQh%Z5Vso$=b-Zbx zUv#(f}kFJ$MHbsOJM?9S8H#L4Y{TyX*SsYCcq>`Z%b^AzGnY|)VUqH27-3u5K~ zNgq;TP){(GI-PTcbzcD3ogTPf{GbzjI8Rf3QnZI0b#$DI@_k@#rn z2o$1rB1Ue!(#3|IMzEge!PXmT75Ax$9A|On$V9R5}ksbJ9kG!WN^*w^uEfG zwoFu!-Y&r-+i~*-#}_kHXD!Zb8s=nPg4Ir_bzv*VWW3{EI z;~UsQmECQ)SaEqJFtD}Z=TraqNzL$O>AS~UfAn#NWXfI(=8#}W*a&mTPMsC_DQQqjhR?aWV?Bf04_^QVv7kW}8xzX*W z)rYBU!sBG>UKiS${a_y?<|tU(c?Wvkj{tn(T1z?BmFep_w`Nv#RESTlrtm%+I+bWs zwSq?k#;E_<%C>$Ve^c$&u}M9~I_Q#d{YT=IXka*h5pR(9R#5)FAO3M$>YcM{(H2Q8N`Iqb*}t zMX{-&W;N?LIxlV6S=Lc;>&zH*8RyKFo1#}Nc%=j`u1gQ9)rQ`eQbe%#OOHr%I4($3 zLR=C)xM0=6@yW@F&Tzu+3tUS-o`l%AFB>IWSk%NwAJK*M(}zkeK|F9?;-S{!m_e#N zkKiHBO_2)c<@e)DJ8Z{;OIzqKH$~qvIj3z(`9#K=0GF zQg@vCVF`mG2GLvVig&tO&acDr-PZ}^qRbIal~W=J41ZmtHbpY}nU0yLst&p5t)&fQ zn2?!w>Mba6-Z=^$$h)bw_>rm=T&2lHyEK}N(VxzPg|Q*x58Zrgc6P6;JBcZB2kvE$ z>2lw}ksNX;VkS`i=r_`QbBdn4CmL$HKl^s+eiF|C=zb}l`#)UZKO>DGiP5ELzFXG2 z+*79Vf>$1%G8fb+l$pRU@>7-`ilAcFm_ZhIfg~sdiNU?3T84wxA;?S+Eo((`Xi?yqN({nnm*c4s7)qTWmo2^=;n8+-eaVPS zN@$6MFpKj6p@J)4TyT&kmVA*~S0CLG6O~&ibk822&mxwqS3j`#}8tZ>$aEUp*zY@wQddXMEpL=d07A{mp4i!d%aE>agetZTSBtF;&tapCa8`gDCU)>g-mR0&u z`gu3OdSrsp8N?%xeT@9Oa#{|i+ZN|}TPj)7nCCpiSI;u{cU;?~1F$FPfy(c4?FKeOD%muuEy62XsB)j(bYY~0A2+h>a%sz}*AGxanSYOwg z_g2{Z1J7Qg*{M)PS3675-k_Gl-qMUwSIozU^f83wsR zDOWF1|MrWiD_%Q&#}3ODnVMi`f75lF#<(}7Ul{Qb+CIq?&30w$Vt(nbo(9rp`=sC9 zO@){93W~pq*AIt>YZ&TrSm|ZK8!Ej-kBgSX^GJVQ!7Xjz0juvw%y-<%IRn2$(@uX^)sQ={F`B|g}_{B)=&q9vh$kA`) 
z=r?loOUTjp&Gt9sh=%$n4gd}HuQ>q!o;2UzgnU|#KWV;cn0^K)G5r#p^gp5j=bOmi zUX?!^Z~#ARzybVH1I}O8=8p^J|S+ zkBx5sL6k`SI9=f%R|bDhhk$|of1eHk{it^};D@a^Wx9TF@bF=->bSVLLjD4F zl0w}m@Ka^_g!N^5uV$#$j~=rW6K8b`2P2UE7K4F(CaG7P??vvQ= zJbTcdUv}S;btOQF2@+2M6&fD^uvPg{&ii9yLBP?vF{ zzI?u;0utD zmk$oF*1?7vAh4#d$~x*$y)`M|>FA3-Hv=5O_uF*GoJ?FRlN3XB?coblD4mf|U*>%< zN#@v~J0C4~yLz%ayK>WGc&Lyn^nCY>`-+G3Ajg(fHsf9OKTw zR{;G15n5xEagzc(_kPHNT29i#80*_);bn?_`VcSv_LVq7fko>j;zq(QCIa!KK7eQf z6)*$_gWc{Y>U92)&!uaaKsUI_>zr7N+FCr$?&pn!m;+$}M5wD&1B)(8paYKHm;=-u zn=eI(gl=Z$jW``L?F?gvdLnVk3t&;rG7Y;?j3kA$#zT3_)8vTIY*WosFTM z9?C?SZf~@Gpm_SV z?&-kFM|oVzp`f6uw2*s9NJyrAWOoY@6lda9))#I#`V$?1P6S-~JV6eaj*o0yIPT*938b0x@0UuN4tB$TKP z!RXBF9$G1|JPK0N@PM!>Z_ow5Fez6NF|}Sl*Li@*pg%%roNDcB$a2n|prs@L4~9-s zwKKyGthl3k`;KB8#Ue<#1QS()1x8P?`0;5)ype19Y|LCbOp>#{3PchMk_+c8_L&%` z2Umk^N=-?rPJ5$BwchSvN?J~?%Ko6qbxm)ti@8O1sn81o7VC4~>9}UWTNVCht8Q59 zn1|;MRIKZq4@XxjF2i^8z~_52^{D*Q<@yZ4)zB{=t=if_W`^coGv(yv494}GRS-ALtJ6Sy z56kWa8E%c;9&6A|X)$lTS_7ufcA@6r$yCAu>gde83v+b4LCCL>b>!2OC*TSk=0plx zgz`e6vg=ck1Oz{@8bXkel4`=g2*E7Q%VV}YUL`IV$sYW|9AGOZXRDFeRrlVOx`e_~ zm42rOo8km;d=djPbEScLnd6b2LqFK9&#=Mh8kf!NxC|QXaEtcR`GYxV!?R0tqU2)3 z<^&@? z><6~DPZ6nn5fxP>Gb=BP_9Ay4&jC|9-=U8tZATz$^p`7z%KhNvDFR090fmwGfmi^v zHDA71wJR^T)^ep&wA@T{j($h%YhACc`1l=P{<*k(I(9Q9?3)mj(@pyZ{f2)1i4h z-QC@SySoI3;7*Xn-QC^2aqmO&yL)E#OsdYwtv$Q;xi|a+_0iqc&)Z+GcfF6Sb+RgP z81R{3oO>tB%{7RawR3FqV5St@s=EFLa~VF=qL$TqgOSW!_wLH5rxPLY7H z`M;Z~MKy-on=WEvBv=v8nok}JbrAA5`%ra4P&K=#b-mBkXoe}!62XHh82c1+A+M^c zcnx;2)*%!@$jx#Pq%8R;zv9GwK5gsSd{eXqy0G>1+z=|c&NwP`b~vGmAmVnpIejgu zq^PJmfVR@;cwO2AGk!h!iK_|ksGLULGV}S?%$|tA-XsYp?OCm?vU9bzmR?f;HXC>N z+wWTLYQ)zt=?bgNXUPqc_Sw=Uy*TQNAq^)o)a&W`S_Rjb!Bks+jHwl7DL6WvZZa_? 
z9-4OiXiul@@rZ(J0_C`NL+LdJz$b+1sdN|md@I3!edi+G)S40n-bgw7om~B^D!I*E zU4g2#Xf3(Q1=S=3b^jjMew^N1F+{w3e~Q1n2u6X{dj>E5R&0La%ZCPXx7)KVTEN3u zzh>FvW!{*}eb?o2$}|?yx_NfK^9Jv|DTs@0ftFS?`Fg;%39acGO!MX3fT*uiN7`(d z{l^}>8~^+V23+;K2mEcUw+}aC*#Wto$k@wScyC4co}LR{Vd>_W*QjFe;9t9z_NI3a zUuhwu%|HT69MNH-(z#tfs|NUKf8V}#H*bRb0vR|UFhuLqj7zE=OXsgO*qZp-+$-0!CkzX~cq(mfQF$tGDRN%Q2i!Iv9$5)C*1kuIFC7)wAjE7qRJAspwIR+_ zK`s$e2#%|JueMdoU1Vn>_X2gJn^hnXpwo<8jNEomM>^93s84oVUGd4_wJN$kaTdX! z=B?k+)QE24SaAhw{A~DU*FKixJ%L|1D|_I5J9|~2WyEz*(MGB?lJ*A*S|NMa+B%;r zQXG-d`N*{4;rRaaAnh-uJ>fXa3_Pv4vM*XNzM6y+-}Jti%q!F{aOkfl-lkieXFPlr z-fCFS1Kwt&+B4Rq0kyuQq0Ktd{9Q?Dm!~7}xb-X=>^wk;qrW>pI!(jBg$X z;_F0%u|0BwWGXjJYZogXK6E(6)|tRQOMRDv{ubXS*XvS(1$tY{p_S0#?&>sOW!F8_ z`fYLsyqFhClGqXSBpz%n7&ugnpEEuqGpFEc`hJ$S%`uP!Ng9TNtw*BZyyM97Zxn{V zZo{UhgZV_q;lu+hSP**|Sg*cxtnWkqdd*xwwb*zqE8}f2&)~eZMs{c?w2}h3`S+{rai*d`o zwl$&2AknM50&Nba*s6fnkB_Rj!M9y1ln9;pq#S^KZZuM#7ucz^4pW1X ztPh(z%Nxvvwg4hqEb13{j?6u+cELh0IMcJgI)=nrd4cmtFgiqmUq81+xSXZ zPHspc_S=ep#0@LWi*kA{gT*hxviTz@?IGg~3<+hYP_|yP!mYb2ZyC7C-D%6+`mQna z{QbAzX(VB4*wkO{Wm&zmCo1)%6jxtd=~(Y^iOs7#t0J+=cwW^ciOyQ|I>m)_Gdq@^ zS8^-?udLncWt4Q=X=vKfD~%!LvcDzdt` z3;^c#w4Up=(s*M7z7=wutHDAm1cO{=IFo&l`?xi5Q_<>vqIljV*4)IsNcC!jMbL7Z zMHR1-JuZi7*u&TNHQSr@4-pa))%+27QTnJjX>{bA_&BUY|~5NLKmz%7P496Me z4St6>cDT#pApqhgLRh;o97fk$Y@lpR43_T7?hX|~!|N1H_!=-{IbUh7+83^_Pq-$V zKi8H3)ISMJsXNK^c5gk@1P&CkCR)n6fL!C%T)Q$BSRCJ>1=+FiiizloYFCx{F}EzY ze40w-8_w8(`Z#%ieKh9>%+@;Zw*O}P>^MFyqdY;vZJ@^`;_?C162$yM1#2 zy%b6WR0*iQ#J)cE4SIta+>;e-IGjE}s!V1{s~`>c-+F zt{Cd|sqpcbZU&v9)~iMIRnL*IJz)T;cXuhIIdu`9)Xa?Tlao|Hmj9lqB8_?h%$_cB zv*k#5bk;-D`6QcJU=QD22V_*$J zCk4BF)^EGT5;@))d%VXkTP|-Xcj_Uuqesu+_5}9VB&YZ!Y8Gh6o+8+W)7e^W%$(B# z@VQ&`A;Qpa3dChQn0P37$~@mQi;pYPJ6SysPS4ggR@W0-x*rD*R{Ql;q2+}V5g?Da zyV{=wyFB3z4<&;Z-$R8hQopIx9E`jfL(3aZjj41pW|^-qcQTxN`Xn^lg_RpjFJYDJ z6#izL>UGuWjz2Zzs5!3&M8?-{si7!bZfa)4abzN`)7C}4F|r7vQu385$jq4%mGlZD z4^*xQTue^lB38CKOFW+wmZ!{AiU#pU33XSnv(Z-@q+}$yIr^naRrH>!d+BGsT5iYf 
zx!-nm-h23*yc_L=q3_miS@S`@-%$5!$pTP%YmwP)#0ZK4oPeS4#K&gJ%Cq2H9F5oF z5gX=kLbiv}d5py#BE6-XzdP<>#B#%->F9g^h;4q^^=97}<8|alSCq{6Tvbo_i zc5RumS}Zk3&|r2I=y+6I;j9A=F84PcPT`pNR78>q)E$X1+%Hja@07NlD#zbAX|(9I zrQ*c&boK~d2kp)ms>>z=y=imQm7K~veElPmq7$3Q84Z`>%{f==_w6pbzA@JwdjZjs z$oVU9f_%fzJA}$;Pcwvxd!?j=0enKd-&ixpMt39#$`%3n)pV9e!J(54a4~{CUHq4a z{T5G6$Ex^3C-IJM{MP%Ssm?*^Qw_ z7SFyXr8lwE@+=U^wi;aB^;!Xun}}1c#dHDFN+XiVe(^N1AdrS&zS?B$AqLdC^so~- zT7K-VyG}Nkn9!o5?$MVHDI}5F>~slpaI?qg*5V4;ogsk^DyhDldOC+=3cT7e0#^rZ zN-uFcwOHj9p{AZsQ)zNJ-qmYEzU|=jt9@#qH;}Tx!W`qZFl%oLO$`%hydq*(cHg@z zakaM#dTY`W9 zZ>m|==o@BR23Sgs5drQrBZr5KjLLA{dgUj1Lt1r!JE(2Lh2v_U?eqq;6nGiuIRXW- z`r_{K-qCXJ7U=7`>b9-ab)1DbR~WRhac6%rS*veF&uTeWD;u$UL(!LH8q)t>G7aEx z@Z?K2qS5k^Xh3O=_wd?{;!Z8O6?FUjs<&A?tYy8o$3v7;^01B=n>517*jQ7|pFW?) zj=jnGVo4pv%RHXOK^i5lm}|Hn_(g{Vrq;Hx8_V83Ax;XFJquV=K%JI3}y7 z8}Fls2jQh7%KY~t)6~NF%*0u`pF>&|afVcD;7_nJB5O;lEAIrCCa}s@T0ohR7}2V5 zPU&;QfUMMvMS#P19o6IZzEAgdaOOBS7c*Qrx+96!eUvYeQwyC>52sWWDp@L3Sqt;| zAi#%-mI3bao}m4{%Aus7x7i&FtOz*N^Irsz(o(-$%m68`Gn%!@QpA%pNLN)L1h;!4 zh&G;rU`(D5r+0ftwy z-KFC&(PA9sb-8f4&!g z@Zk?m{@D)xZ`~Fj3b*F5PI#Uy?svaSqTP7FF=^lKjGpe&+uGC;HD|>GR`honA9J7S z#F6-8+J3N4F5~*WgYm=znPuS?AhyWjiHPL%z6V)cg)ptUa&uRIPoIF{3Dz-N@>U$~ z;aGKeh-G66iVl57%N-yx&*G=g15!<@@6~9-Pu@n<4wm$h;y3|7WF%%XKp6*lbfCJ zzBDM=wnS9&eBylfp%38saI_QxJ0o$Zk-{awroy|>rq^@#@ew?+?>mH6V$sIj#pRc0lNDV`=!AdV=@A(zY3w(p(h^27Y- zkjn51u;`-ji5wggnXSdwDyT|1cp* z{gK1&gw@TYKX|C|y`*M~Gf4X}MA0(#{JNgRJ)BoOvyC42_Hls~z~y_| zj&Miz+xCyDH7`O35}A|h5R6xHXS~+XP_Yb3`c^a?3ib!PzOkN6^0=+LZr|XO8$mB^ zLhk}@I8k314D$HvqMvQRUY|^=v3uSrb)~O2Er#C0ZLHt<+8ku~n&5Pa4W_-l>k3ci za@96C&TCKGijy}S)+}tWFFSt4t-E|9&w~WxtuW@Gx zqol#T?09YCZiiN**#mx5(5^$aXjjnjt7o}5aiAQ^hNi}|G2RW|(UR&+nHrnzx{-jg z@mk)ZzHuKTX?W<-$~V&_jd0j;4Uityj9h)cpMaiDWV`U+k9KQ?a+EfD?vUbVa62DL-5y8# zgf$FI$QVcx5D_f^9G#RJBjc%Y5wVvYZBrRPa@y??&Uv1m(#N1iW9HBfES2jogjYhw z^+=?%d+w)gWS|dX(-=*#>p3;>64`VGGu!60hextf&KBMT3I1fx@q^hzV$g))kZNoc;)p=5 zYdRF1CSM|8_*(vC2Rc{ zBZO>nQV~X*#%sHOvli^$Sc_$$0Wi6Zp4{v#L=!~B!zaKQi;Gre0m)F{jSVH&8O?qF 
zk;nOd9gK^@X|{Po0VjOE=d8Xwy`H9jkNnDxSuh;0?s%~*d)JVb&b~YcAf_S?x}gKC zVR)Eei;H#2bOD;2G(Z;>2Bd* zFyFqZ3>VWAhD*pUmJA;zqBZC?>4jtPiT_|DUJSz5QgAM0c zWolcC)>ythZzrqihvq&>ZKy`jW=@~*v;=no4g*?5h&v8j|HxsYDBmIYSG)<8(M#@{G)p%qVHG+WV2hWEx6;wupX8 zLazzHeakFQdEkjN!pm);3%CKfIc4mK!KGl5uI*eVz9ZyD=^)n$MZ+5HBn#Kg{D6^w ziM72`qL~(xmXw~%;5Z#xsKxb$hJ}>leWl^GMcP`u`De;-OwEPf3^BaZMUEVEqX3Dt zMu&)9#8LVRYhZS_L@I;3fsBoa$=&!CrR|LQVw3#m?A!?aZyfgBi7g(!Rj|^cvKyb? zbR$b2+1q>BZ{R?+Js3OPL4wYqT6PmOKcke}(e1oILQV7^YbRV=U^Ok(%w4O>msB9& zcH8xeaO#*}#28>0&vd#9HI5+*#R*~|-OzZomPk1tQ0u*6NwX=Ke7ho*ldp>ES*JnI@K8-i?(lYxKBWI2TSL4wyLgaeN_A6f?}AE+$7ULR`=n9 zJa1Gfk^!~(QnhK^Bd_5B!K!R?p2rqEGx);+CsR4{EY^cm-Wq4!m(yJMshykvWTKC+ zPL>08IEdffC`*szeYJQKaT=WgdoU4qQ2?zCyiR(sslvi$&kA^a37+Ly2)%bPU}o!Sb_6Yu@rIS*7G}*I^ljIC zb>ImDkobE4c%up*Oc%AqWh?rXTjl<2RKG9AmMfnVC(Qa#%5>Qfvp?2g3MZa8m$MHs zAp;xfcTG*{K?aM6wR+~fV8FLw32pW7_9t!8FnevEM#r;zvhs{w zj+=fI?2^b$5QtJ(cCaicfpSE}Ts9XsL9+RXrnUA%;@`W`Wfs6VKi7T@d#$Rn@Xeg) z+OVqD5skOD5pcKf(*X0>!dI%@SY-m{{gRBMU|^Xw@sRxw(W8B=LEF*6k|z@iFF7&;+m~=?!#o-WT(1-6{Z%Gtzez zUhnQY5O8GW_}S^Lc#J{n>KMDAySSW9?lB_lckSUQ&rvFtU{0xq1lOI+)&%Igm!Hwi z&)d3tyPIpR#CKom>vop1R^P4xr%Y!JEUW4vzq7|VX-1xY^f}&{Sb36%0p&erv|MHK zx|*@mXld(2Mwk^ZX?@4XKfQ!3Qat7HAaNOMkpbWnj{>lY=s%|N*k|oy^9;zNoF1#^ z?yf>eo(BJ{Q3|bHmanaI1A&XZ-Zc$G$Z;ttPkhD6SQ{a66m3Fy zozu17DEle2uT{%6fUEVJ=7O*WL<-!-E)fBbrk388PFa~m=5EKW2lTWf7gPpN026jHD;bm;|B{{Br@aX>@P-;f^}%Kx zte77kVmiCu8AdZK!n|+&G_$x-#ypN|h&n=PN z$PsAm#cRFKJu9tNJLM7&Mgb!ZKXxZhG4wW@lm~9%s@yIn|2+o+I&nt>X-MW|jw0Uf zh_(u;p;AItk8!=&%h1j_;l6Vj%)Go@L#iQ_mN#Sx<*Zxge7dq%65($S4}GbQbmhnA z4To``y95U31+S?c99!!OPkJPYQxU_#{fH=`8_0zsbb$ukyE)ov8spgfg1^wpwqONMfNw_ zrdUy6G1B_OjOzp2X-GB2km3T6bR1fDShF2f*QVXh)Bz5cMX85d%AyIaCm&hLVV%}* zDs%;L+VDwc+nZelBdrqX+&ISwxe+>UjIMbt*Gak0HKD}Nm<^E8nWd@<**yYH94<4J zI$Y4_U3_+2Fz;M=v}h$%qob1Ij{24ZhhdK1h`r?;oi9IHi(lulK126MT^l;fvlJtR zc-W?X`!KBTP#=5a8}Kj;1<(3&2Zke8Gy~vtiyCK9-QgX0+6x~A=GFn@qbD5$aMRWV zNqUlj_P8^hZ(7T?8}K4sy57qm$`3-f_Vlnr=o1$yCluSASdZ9|1N)LGt|0o5UK 
z6&GJFPUI6~F`m2{Eh%90TF+G%LwZ-Nu{yL6c*g9)$jH^wXUoPXmcgN4X{?3ts$jK^ z9uju-T#e|3AY6c>z;nHsK{T>lRp22Yi+e)mxz&eW8gDACvGQ9R2de&3ggH04!;1r# zFBg~mt?3-x+N8J(y&AFc67vJ}VVk9Vrj~=B%+cy7j+b27Vk21x@LHa`49~nXxC5SV z^HeFQ25F>itKs)TNh{4{lcooB5xQ?YZ+fk;PW`!@vB}RUXJ2^kxm-0M@XP#K6dapDu}JQ)IP+ZcFa_F#v}_J*rh z90@!=)OrBUwsjd$rDRsA?WBW&SCF6pPdiE)FlQ#)+e`+yv}QC8d5!j=UU-(I6m|GRR3CH zgU4o?EqIZwjK}i{Vjh3eI4#VhId8ig(QS1T0xr4-^-_@M2#r^fS|N?91<_}^gIAr8 zcW@N#&17vfNC}bi~pO`JEd14dCRYP@PRCHq$Er21-T zyUJp&t{pQ>v@Qt(D>r01ZDc+zjSa1tGt3qeCcs)Z$0f+|^P$#Q$W@mi+axW_i;FUJ zI-Wjeux*l1Bo-e}l6K7xD^#uX^+?+4DJ@spRTbQGIbq}7Og_^F=C1WZy3^f?Z(~Sf zt}mbl-<@&?#K3uH11M5E=`;1p_=6yCIV7Ezw}({eiURFq^6hnma!)8IvmL>a&+fgK z%VFJmVcm;6R*^wUJ>}sGPd%8cDo3*>L!;t| zZ#r{9bCmVmv!!(r_VK#^6s%d}{w(B01TAkb4uAo6!?lJ8{}fB?yo_tc;3d6oAP&D? z>+!6xK@^QTkeVpptkxlCwYz{Dtg3dGrAc}DlzSMAwVltcd$3cvDu&K|11}~PT25ly zc<&3%c!Kv`a2z{(LnqSFYPf<>iS1ZZLBPFFiK_Hj!EKiLC^G2Xy9FTT>b!RinyF&P z{L-Y!%`2(Sb&YfKg}MZCH^s&J1Kvarip8CHV5U<= z@(q;=x+QHY_wpgUeFO%ygkK_RY_?w3cdE1?uzY2-rEvd99+)Ltfk~_JMMoT3E|$_C zV%k(@+-&S+Y4^b-MJft{vfL55gLzYC7+!x1L8!5JnCW7ct-jvswE?0XcO%2y%VjSO zMjj@`7IF5c!xNz3PQtX(jU_$4pxXkW1;sHE0r&C^v9TMx)ded&KBB}r-7sndPA6^z zflgu%A|4C=xX_0?@{8NV3bPBiNt)&Sz?(ukUh_Mk;`L&TaLUAS_=c8wz0au_i6W1Ymex& z#*1w#sh4?13dL={2=Yo{$0Pn-N(avGl!7n&bi{V&yE6S$BB=2aEZFI+^yqAEx5^|F z957DmWNT($b_JkMCRB}JrCBxI$A|KY0jNh)^jhv z(|HBIo4qZ-3nhTuiW}%x$ESt(QpJ~7nGY0<1Q`<1*0ZjQN@{e4mbxjNIbOhp5IbJ@ z;Rxr|7^E!U_35ca=X@(h*)Z0^SND4drUfsSK=ZeCf#O-wWLS_=eA7ph<$= zji(N{-XmxV)c+>^{v?(tmX}awkftZ zfHQ)*WEt)vpE#A~iukgyfu?OwHYInb9aOq&!o|IA-`hBedMPV|q?Y>&#k0Hk%RP^C zLlFC*?E1`V4Aa9hZ-uS&zV}la+)d_F_;cPIZB*9uABM($j*ztiUVDRJPfMISe=&gf zbFA*?03IVdJJa7r@K}E(9L@IsG2!T+@%-oMO?vhpG5)yr4?fQH>_60`|BvmA|C>dB zjwI2P(YyBs)Aqzp?D|Ld&|%>HyyG$BRaCnaND$RkHwB~>{Gh*QzAz!CQo-$lwol0W ztmV!J;e*Wt?N?B)gQC{^WXgH^6`}l?%r-0!Ajx_?;Cz;vVmrc4K_T2i2`d+POZ3M* z{*Qb7ANTnG0{8g8u;d58{G*c-%P;gJ7+8KwKjPVK?r$^W{~czYYofBQ*s!NZhpEoB zRkk+AW&>}hr{_ZTXzXJKs6th2=d-Y4qUgBDqb_vxvXCg~eD)4~FqDn%NOhgsAWB-B 
zNESVh2%G6fzT{(#q-0oUoP0a-rggnZHK!#?2|M$|Dz6%*CG=Vvwnj?F8Wwa~ZnJi{ zFLWQmaQ-T6CmP-Jwko{mt*}Lf@am8N-qj|)4>`pUbrCO$UWS*Zxv>I?v=vivAA2gz z9Ny^sr-X|a_qz!nk5pPI8&i>nefr>Ny7K+9(tzpw(e9|+z+t1$=M*$;KE^u&bNPD} z{3gWQi3B=nhgvSgae;ful9;zKmZW5#mle3vMRv}z1$^Cuvv1WnFlJPnqpi+*8%XDf zf^#e1cIz3tIE339UX-aO5IT0s#rBfi_Fkn%`_RdXfU@t(#3ag!(>YsJiLs#ZC_ zwgzZW@j%BrYClG%TpK%SbzO3w>lYU`KYUDhaCkh)qJSxKZD}7MFU(dE``}`kcau|j zXT7uV1b%Snxo}&r;w&?GvF{OKKj4)<7(OI7W72rfi>$n}^JHOO!oe6Xz7IYS??Z+y zjD%x5aIoMC8vrUFUTa&vx;smyj}3*=|N2xXq56gpLHW(7*GpvxxfB)5QY2GI) zl;9aU$7p|RIL&5%yJIx4+_rAFwKTnR!uw>9Mzq3f+5Alm3)U6y7!9~BP?UaQw0HY$ zXg;C2thnkrxAu15*_df*!YQi1#DAUjEaqI)l9ksS)Wxxo!LmX#B)M(b`Kf5@`^;Ew z_3%?;_lDur%n7eq`EbB>*|!lwJjZ)ZJepi;x6x;G(UwqRzcPVbrd)!_eYv5~i-khfRP9hwSaTTR6Y!fsrOJ)IDnBX#Ta~q%S z8 z^Soz4q`x69-81_Bn>8jz|EAKhqmB4mDjjZGRsM7r|GF}Nh5KVKV)?uH2GcK`Qy7?j z%Q@wbi@_fkgFh|?|8HFk{`#{0_*VX-ivja5zLt#7LScW-r3fA1#4Si*J@jHVLg~dX zP)tN1Uw0V2>Yzh0F$yL*<*uYl%_L4q?#6?9>jI@QQh^}uqh3Upnp2ZXoIKbdLufZ<=E+M63d>hyaab5-&>e2k})Q z&AnFv%%Y>l=nC3Un+!QR5rBO&cik84;)pvEun~>Lq#B}YK@Zv{C2eY(aMOPCoUZN7 zfpD59E%PdxN4DER_=pbt(%(bf5&)$&j0|+xcpt^6T*}Jwo4=0hj;z0qa*|^kN-|3? z+8%D%-JR_uG3kCDhOx~q8Q$r|sOHS_63lq~G&eU8wfpB>`sZBwUvMt{3rqg_T>3Nd z_2c3{WWyNPe+zZ~=d0KKjORy?e^;+#VE6^xGBEsB;D6G9f8bUJ!`{S?+}CcGf8*I- zqAKJ>aR%ia*wJ`1v8I|J>1?i3Boj;A;5)>sg9gl<0FvIf1H@KaT95OO} z8E&7IyZ@#5z`*)@61&f0hoAM04UO!GpH;>G-DlFoKgEYXB<~OE!1nQT#$~k&hYeeO zsKg-RP+JGS>9)NH9FR8ZW3fSicHqy|`hckH{=JV%xfq(I6J3egnO!-H*z`@TDHaCI z=j5?mVD|;WD{dS??;WMjeRxHkUQ|SKXFevThI#tR3IW*cIr0A$sPhv`{t0y$f0glJ z{Jo6N|9F!3S3Lhj@|b^Bb$Qm_{hc%Q-<7JLOXmM>xcx2FZmm`?aL9ku>aEAYwywC= z|JOVq!_Sz1EDr16+;14zeziDkzqL63`b_avN?tgHMCZmIZd@>Q6SgUp`nfEy~FMq5efi)){Oo)3)~! 
zU!$ZuLlJ~vUCNke@G1KhYC95>nYYms6g29cO2}-!Av3dfs2_>fcqF`wnIMOOg~-!a zN$sIDIrvnPFfV1hmyqFEL_=F)2w$wz866-U&#H|7PX8&mC|NeJu#gAm)LGrZmgE1H zP53V?`KL{o;a7(B48Lbs|0m(_C*kl1b^f4Ed?fBKINnbz`6tw2{FTEI<8L|~{Yg0d zsVe4W)m;9E>-^z5f4I&cuJdcI^Ak({33V9he&H&` zNcTIgQuP1m(f6GA`)_ObGknSWzi;^4@dkfQ?}_uT)bFn#e=Ns8xHYmcqeRAa<+R!w2_80I+W8?ZMl`~bIiU$e>}>ear=6}3+4AXv)1$FRA* zlXUXh6{+zbAFq}u9;=QYFNh1iBwGDIs#w5OolakgM_N(ahKs{3UWk4chZ9*sDHj_~ zuWW{tFhD)5Mia1P!mv-lOh?Qy7-AXQCm)+$!^YjB#r9^poez{Z(Z5eI)elun$;2_e zkHo*xC+47R*wTq zQWLLo@T5{LVoB~Xw{}=Gn-p2F2Izju*PsVu@Kdq0@ssU(+&0c%Kr>t#HZ}#s2GtL_ zgz3~k#ZoZA3~WCu4eb^$5FvEUt>!r{7jNroP=2CN?8?VZvHT`w>1cV}jb_c#(s-1P zkB;WiXsz$|PQUsVEw}sj(ug4~sA_Ms)YC&w0-B)MVd@gHg*$SQdw+3JGDEX-?%d98 z!T31W6H0S{!>7=u%9hi^<@Ql1^wGoF1zWB5{a$2DuA&(jL+%$^AnjI$oTm&Z>aL8* zGw1f68*=>V)4Zqfy`(XSI*m8yJi+cs?Fc&6?kRqm_wxx+&NcG*J?!42@5K|J3+VK5 z?|y`6?}0iEkb7SD{FvdDZyA6Y#m6Vvn$BkdDp8|Uk030rLVLVnDVH+mhRvt^g(AHU znixV7I@@k=s#qxEa(sJ<6EVbnWJwZLV|&;EchI8wOl@|G!rKy$Yc2__nhyzIc33c+hg9`** zrphvdO17pbIpFGrNj=n)8N+uT)7Ww8j0qDLcXU%GhEQ-lY23K8Y^KoO;%W|m<|JyC zl%=+mfsF_fo49QOkOc$U<0I%0Sh`{?f8J-LHLxYTZ4;k&`jnPI9Tms z0BNs4ST=Hk3^mq1Rw7sGW;Nd2?5PIf$w7D_fw$$xL5LW$1NGpMF|*+?Pn9_^{yYhC z`u@CSi=EVZsj~PG=T_v~MHaMAraYaVh-X9)T1*Q|Eml-XOBAnx zV4iaaD@Z3qgc(-1S}PV_z`TTnIW#)D??;;IZ(sxL<#M977KT0pWD-H9Zs1;!G(>b> z#)`aV;7{_Nbhwgvfs*()^N{>PZX{Z6to02(*!V=i({h(Dn$-`?yPl(-)h^k6`(N;9 zMa;4KYXlFV@au7W&R*iTf>~X`!9B?HlbwG0!s|`4NVz?(8r7ON;XhTE`M`%1h(tbx zZmu=NvSUDewhx}O!Yt@@y=sfaW~a4>)9dN>;fn8tqub}v4X@+MBxT?5_9x0+o>bgd zs&qgwHe!1)<8oDQqZaGa7Z7xlTGYgA=6iupFD8*KOB;8}uT&aZp$Vslag9U}n@@DF zceWzgzQm$H8+90@ghWPC9v^)o{D1)lSEh{;d5Yx5@mlocQQd`U*M5!~Z`SEG5VI`J z5v;v6#;YWeDI1EMmhv+D3a5|10O~cIZPQ3zy=av>h93XBglAL*YEA-#99!<#;hX@1y%T&|O?pxee5B#1^b zeR>?d4WYOZCXLD7P8xjMIDi-0JwbxhlTTm-`<9PkICn8~#_)&i;o8D+N1p* zL~I!kDk>dL1u+3(={qfk+6rRW&k#hgrStW%kjs*hjx969Co8#&I}%Or-YQ0+z_yTC zVe~c9rD%V1-5@9`i4@f?(XF8uJ)^NR;K8|Ni`3)d*;sFd``B}x%?H|#yB2$cd(l9U zcmis@hef{8%Rz)sh7j`~Gh*tbX|yYZ_vQgTU$tyenQ+fVtvI-lR#79lu$ 
zOpyWM%Q)%tAmvbW3d^#6xpfOZ$2jLcN6w|`ynSFSgf)6eN4T5}F}SPJjFb|hF0)VU z3<~)9BtIr-{3ry|HoZ`@ciIWHkerlqjpx+y-JY&D^vlpRD5%THJ`3=T8%k6#oQwLZ z#j6dXZksVtOoo()ldR1tgb5q4BI-@ZD6rTQH?*&v6e_F?gNev5^|@C4a}@Gt@DN-G zygZqwx_Ffy($wC!TFd*w_P-NMaSA*+=Eq|5ZFp}}?r}Y+UuJuP7XL`O=Mm%$LWBf? zLyv9W{wt~Yi!1z~63@eYe;X)aVEYBXWMKO(e))ev!SqKwKUoyZfAA@K_Fo<6>3-`t z|35$QpYi;2;6IAOKd${lcPu^okGlE)@vQhijLNV-bIO17iDF^F6ipQ!=2VX+Dd~8S zWKR#3p{I>021eD?B`L652mMn_JSK9^$G0`o6fi_Slgp7_Wm|PrYa*X0(qnl?6>lbe}WKx?Q;7+NQmZGz$eRi!1 zlV|epfkM~L%u`z^(D35+3#s^QciuXSWvORC| zWY%PaA%gAQk){1_Myfq2DKOc%T9-Kyo450^Bpw{=mw ztxi)-&cL3)FjW*zAPlB0MtA5J$_MP%pC4QlG|4I{2L{+F=98G(-+ziJtQi{Tt5Ex}@w56`q6pW+ z-^G5;N@-&aBw2)Z7aJ@(;w7rkq6iueZ3kuL*P*pBfAChALLreqMwcN{n0 z&+Y1?`Ts>Ps}>DDr07{V^>NSiQs_~0u@F??L5-mxYpD6iEQ(tJJ8&DJi6m*lYn|_4 zbF-!EamzQj>yg|G*2Rh|&f|LSaI&xlggeG3@>p=a-&SVe*q8mTnZ?oEQFp@a-f*i8 zcxd+FTp740V$tMjc66OxxKxYBbg3EGa3rGEuUk&Iy0SJbyYQfFyx2Xy(J!ex=Sbc@ zGFde-qiJr6T6q8D%h^@3*lpuN@i)~>N@@b>YtfZV~<7Ft*}U5y0l?d zZ9bUh8}@`{SvKtsEreX#emWzOJlm9o5O2{$_(Tkez3nq}Lg@3XrW>7;3L%Wjc6@iM zg-=F-e59{Dj8k5ma3w_&E{PwEhCX!j&_oQApY}lt-3bu!m7?7xKP0mWpH^h&Z6&HL zL@+)(Z4lQ^?u!B69*TLa$TwK)eX+^b5GL<&04PxqJju8RcCVPQ*wE7C%haY`p2MFe z(Hbe16;zEef`=0xbW->yWc8&wR5j%&rg^o(_v1 zOvT;V!444kpaV{+ry16W+)eY5b+3?^b=|~Kd zDDz1ei(Xz{4O@89eHqCFG)`!g(Hc`IYxU`-kWjui;nSR(J-z~jA+0jrO`lj;7VkpK zzbY|=i4&`+oZr8I$vz1)jBmZ&W;HTvCWpmPVH|!3EDIm5zI|lpXE9HPEP0(mOzB|Qo2PD@ z$*qT`*L93AS8a+^9bo2DJe&49#bL<%DpEWWqmfZ>qLh7zoN$~7ER&AfJgxoG$ zuDf0XrxKQ3Ke)XUjA(Byu>c(y5%FY~JswX>V`Hbifb>g>b;!TUGIu-lBy8QW1!a$U zNH=EZ8c>ZD;m8G`-Hvw%o`(Uo>tHjMkh@b1Hnb@+Ua0!)gLFXx%~#d8E(hwcJSVjp ztQpf6U464FRMKB`_DpkKYr4-9%AWj3z9o#{)wt%G?kV-kWVJ4(qO`r4z__f@I-kQDS37m-8o!wd*E`W5vFj09cn;n3{ftMbI zK}J0CWyBwf*H8h8TApW+O3(%S7@hZBpbiR87P%NOpRJowIg}~n`}S9h`GIl?Z{WDa z`R8u~*{tuXs_!*TXK-rN&ylLC?hm4RPV9EkaI8YTJ;7PzdBOh`v;G32ADG3?_IIxS zTt502=Em@x#QF{9_NM~%rvmk-0`;c?_3H}MPb~R~It+g+JTTDxs->Flw_2+Ipw1uE z`GY!tQ0LdE^Ak({33V7*el^d)@>}x^|Krnhj6dV~5&1vNaI^m^A^V(V`yG<^_wgQj 
zhJXJ!<=-WK4F9>t{hvp9=>BE6=YM-8-y1xYBf@PRnKC1$NJKgzS)I$~&Y>*yrZYxMkTQXa#Ar8PV7T&skk z_f}Nm!9GD?xi9i2lc7>3HZE@m`oU!sTry{m%Fn38d!s#2L`DVMj)C&2#U;o-IbTb^ z?_2d96L5)Ye_nsheZ8*JYI3?IlmEO-Ac|am6Wv zbnX)&Gu_<+NKVDY3lFBlU29;9lewCL49dTn^8o8NMg6SW@x$EQ``-QNWX5-{&s^c@ z(!aDUn}(5_qy5nt1wRvQEgIe_s6_H;N#Rj0cIAW;zFt(cV3VqJt@Fq6;pB2p+=*qO zB-EE$_Qyl;*z2WDBJsV3-YnK8>hv|zPOBf6d?|PWW#wa?sWCBZKV(bSI;!t?w#pix zt+g`e9@C!hv3LZLY9gpI0l#4TJmj?3+{Lr;UA_1raMpCIdX$&?SXrEO6SAXjGF6J) z+h}vj8*OLd$Ks}n60uzzj*iypkR&fCI{?*D;N(JwJ{tb*J zBaN>2F3o^l^3?-ORawLh_Higz^btHli?uV4U?=Q2QwVApnNqtzuMX|R#pBw)yQcrx zVcx<0bp;=@5hD^6MeR@Vg9R`m1^yOn=wg_`B@x-{p<|uD9{O zA@lv0EcqAJn33hL%z&?I%YULF{+<5vclyiU=`Vk$zx-AD3*%2&@~={d>926EX274a8unKyBaHu1%E)W|-rpl! z-hx8^Plvdq9x7s*BKPSp7`ra)jD{nGbwCk)hVZ8`D5*mf%AHbjB+4g{woaRslWZY2 zua>c_QY&Qs7?uzg_AMx{uQ)79u@*M@*;0EdaA&1J_r5&+;sL0po9Q&DHO=&zn{mMh zjAMvzOp~^s9`qwpB~U*KI%%R=(ApM=crkjg@eBm@Fk@smAtZJ;!n*=|a*6Zj(iE`DTiDP~A}@PB$Qw+Ye5`*TH)WtFZjTYYeU>eQw&XNi=9`AEovx7VH1eQhZcVWm1##y|5 zY))5un;(7Yo4|n&y?D^7reu8F=LK^nStxWb(rFAFk1J$~Om|qlQT>|a(YUJ=)JD7< zvol(@O0T^ccoC%YYUXTgxhakbkK*fR=TO_9nOBb+Qj;jfVb?IZcw#gucxicPavz@$ zcUKM<#$<3T?3KhJW|zkoQ`1uDxaSvXUi{9=v_2?lWi)>Z1xv#YXxD-k)9YUk^g9H# zL9=T>b_KviJu3@T0Hh=#N6wbLB%%21o^9<)@AfvXvtfOowr1oF4qO@kI)Z z70?Q39;6ELjWCo8fEH2WWl9%d#+vWQ3SKAtvrC%e}t zw&Yw(SPh9L{hRNIP5hbCYDf_A9=?D7mdToFmHT+^XwSmO^r2-qBv@U_MR%?`nlH1x zO+{<%y<4NVt@A;u@)4LhtgW+84f~;;AWv z$B;?&mCBnmY>doN+U<7HK_9MEad^CF5N*_Q^p{StsD*`P1_V4AX{}KyA|uAcKGCs^ zeao9g4ixc1=^)1FTWpQVY;s-}wzZX|TXYh@eNrbXGFg9>h32S6MjTib$o;(BL_mm-444AQ3=)sjYhn^?EG z6f<|aaM3K6&n5gZq+rC@l$AZhNn~2Db|YS`F_I?VKPiDd*ghCUj3v%gCXzezU9MIm zD#h4LVW3)B<$UgdIyBHNT|xXACqEWHFfv`}OshU|ty?F^?y=YSy#3)8=`QCn;xEi^ zKc&_$`Ns6meqVnT!^ila#qj+&Ui-l|Hv8te>nu`P*&m1t-8#?1_FoOF)9#^TMZq! 
z4w1M^7{shGMMFV0l{bu8-cs;+CLAnbbP|)g%$Vq4ciJjEVzwRi9ths%x`d~^Z*OJ6 z-JbU$!*uN8u?(;e*Jg zR57hr!mE01{k6`Ha~+@{%uz_n;AW~#@y>rDxmQp|Hm+f24_vymp{dij6qQ{i=tKxK zJHC&2k$d&}qn;}S`%35N`(cIcLbmQ!IE*b|@M#f;^j-je?JoxzQP|8e()nq^+M)qSo4Gcbr4PG)RT*QAA~&gimDJ zC44k=07uUEZZ2Lce_!5e&2SSXPDVJ>tgz)e+VXj6j{%u@9S=7PS>-i+qrp&bisyH_ z`w(mpc6ok(gRR$QTV|vn+kY>MSI$svrhg-3-Zr!YAJr2VLA9_f+p*2s?wypG0Rhgf z!W5B^cm1(FetLBQxt)SUIjw(H_C-MfqOQ(8IM{ITh6<|*&v(#Rm8rP+$4ctn=lGgB z5BhtDjbRNJB>Dwg$%E}JO|N4ciuT;oWkhnX{BEokZy(aS#@er7Ni+E)sSEph1G?>p z?{}%McSorVtDEKyMnpS|CT(?ilOsW7XTw^Kz5|ukc6A27KaCo^y%4g%r7zw-ihB@z z!s$O2PUKD^XOgV=?6~2#1iz&?<(vEgyOqi5s;fA+&R@j^C)n&Y z?W+dxQ2bzQBE&wiI`2$1%hFO;4@`h9`?T}8I6x@DI@C)=O45&1UJp(QQ{l?8@QN) zjn!H7&NGe7B!Y5#ucl|+YgTr$$%FOT&FHDu%y*&tx<=&KU+R17Dd#=!Ad-$Z8^rQ1 z5hLzO;7t8Yxd6%Fuf)>{@WM9^xCKLr=eR0 z|24LX-^$6@!RF^|LdGuUhQzxu_ zCCR!e_A0x9E5kJ!$s<5S(MIis-?n|hefet3TZ%xxO>(j^<)EHRs#@Aq6!R}uq^Qe7 zKQShA=|m4uST`qh#p4abeRavk7)F)on>Zl#DL46G`fhO^v^fK9@l)o?Zinc;kPwj!DviNRu>@D1Tn1x&Ppb$kHwoiFU{Lq4J_DibYk$EN zcaG?-{=mpDXst9k^7&l-m3Hle)oOf-3xJP!1~o_YSf)sMRc!OAM-Y;ooSkr>P!86d zD3nuP!Afi>Tb9=BOqIz8H7M6Es|C~K^=F$b} zMufwo1vo_lduY3^KE9~1Se={y{myTkh_sV$fOllFX=A+PIp?|9rbxF4ZF-H%A`pYC zfY^)Di|UEoi_DAeNh;h){VU-Uv35dR3@s=}^J!})$-GCUu0GOAxVG7hWhM)mqRf!T zHrh`7W6R?gZmcTgCSBaP!tK_xBgpG-Fi$wG=uS{I7SeSdG(SMlnc+#m(!u8XlzIb8 zolw=6GUEyqSO@y%FNq3Ww9fDl66#!p7$05;k za;{rbOOXsPZKiYDs|a60Ii-$&)Aj~igUb;wb(v)yLQ!3)EUBb`^c-qr)SeLyLHWjv zGPXm`>e|@a6_HEa7C`uMpMUkCP1t1x#`;84DCDJ!>rll^vk1xS7~!fNb20c##&ee` zCj$KHYwPA-uGltMj3Q^%+5^sk zJf5_klq-hQuwC1itv1oLGqO<8cWQK=S@>~?#!rUD>rbH($gdq%MQ6Zz^_>FqjhCr7 zbYudQR@1#U$!*Y*oG04coW;Qtc7yN0tIE`e z!3X-7Z7wAR3=dfOu(2VL8Yk0`!n9D&!=r>IYGUy4L!`9BEs4CfcD|?$rUmvk$D_oF zdbcu({{-aBuxia(O?VYPN^H?2q?eN3lSJDMWjBB*y-a-zsbdU?j-^$6RG;k3fu8T1 z%dSuR{6K1X!6~lgQ=RJ zo^W=e0+{;t1O>OW!vWqj?kDMscjw}I6(4Ow&h;A6fw}akwkt5=ndW z^AhvXMR#+O;zfFbc%mWHX#!Tdfy*z&tY)dm@|F&81V#(;1#AgpfDNqM3Ym$F1Y)AO z)PZ@vv^FI6Sk}8grSl~rvC#J%R#A`J+x=+4f>`PGM>`eRBC=2Ue$A<9H$^L9!R*BkSxlmb 
z4sKB{u1ldgy(@>=hOjC8y{zd`wa{nB>sZZqc(y0vMu{Il$%a4Q3=lamoOG6X0%)@@ zOhxA|=MM!kRv3vZf+a8*3vmpwmn3j0DP9}6z>I|%U593@Uy}Q`%bKNk`$UlM-9dNp zu0Yg1eSMDn+-o~I8D7D`orm!;S6I=Hw5$X1Ld81i7H`B`$fphcNmFsDGKGd=tzMqe zIci_`{%eTGUkSP2wK4{J_J5=rz51?xlV5)pb8k}ZPh#$`Fv9PZIR6(kwDixeN{S1p zA!DscJ<_#~H@A0Y83xx`RFxY%3Gb{#T0)+dvs|pEJoT%uE$lVSEtsBq&3y%sd-MnCa#92IWzJSZJ~`~v zLPQ$wxdB2)0EANm?AS0d0TI0w0-AnoqEKB4tBYF?slmm_tYeesDQWD_I? z1Ar2qVdeHFMcV<2SFGv9V0-}?x=f&83}#?N&4$b=Bi!n#w7M?brD`5bN+jLR#(YiO zZeh-dY?fP1A=$`{1j*J)B3U=h^#nI=RT57D*tsE}nz~Js0-#4u)*aa+&9(|-6h_84 z2|EV@fQawmXmNmxxZ^O(0B|cX-3|u{6E`S$H!Y?R5K;scz60@SXiUEl^=>BRPI!jM5n935FX0%ls$ zQth)nWcu3HE2uPnbG4;dI{)*Qs`r}l}~2rAHLX2S=al?)uiJKYl5%vl$}D~Lm`Q5<~0 zmIpwP2$IK3=e>J8XsvLY4Jm#?#ocR|Xd?%b9BPc* zwLq&*AjD7LPFYaQezf#J)OxVkzD#z&tM~wczG4W#pMhz`fXv^6eBtMv2Y}*dmv-0Lx^?_uM4@Mq=BP#ENw_MqVmEubfIlvodHk>>2}3IydQu7o)A%k!1^I1 z@JWpr@Ew6=3}GHnv2e;KxIC<;D6ts(ZD6pFZ2cs3)IN-@5cS9n0~P&Q0}1+>B*P>X zDYRWy69gu|JO6xxnd$;HvNApwNY;0$Q8Ya&deM5CRf*NW^ZG|z*l_(F7@OF3-i_#5 zj5aJ4APw+ykaMz;o`r64pL27_l`oj}^ z#wWHSRzkpp*n@ZqeAmS+?^h)2L>P?V-lJteQT=hAzeOCIR3;8tJeU|wftVt>)=x>? 
zf>4{3hqRR-ZjjSR)B#=ny^d@JaVEYl34AP{Ts%1`vaAHQu)u;0heXqR_xJ9C`3kT_ zHq&Y)Nng`7WLt&2B)sTEYB?!Q81CQUL3s*Slp>z{rn` zi)@aJzF^*M8rvFMVy!gSd1;|YxTCb8^h_X3Fi8MRh%S**ww*zqNiVT3k)5WuAhpo9 zSg?4Qp)HLXD;#sp7)#e>e%XEu`Cj!k>_g+d#Z~urp6_v(DVS}T7?{eKiOhX07%4U> zgDE>Hd(8c{W*X3SAR0Rwdv!YW39of{luSr!A~>?kD~=0kCV2B!N_ol+i)z$Nb7g8q zk+frJHQD$Zv>Fs$1!N`Z)aZ08RBKT7~i?YMB+x0W_ zb2ki$N@7YUD04M(x%$`*>+QpT@NQ|6R)wXD$BIV{N-NG!uPW(%9jhIKvD`HyWiDlD z?$4=OR#z_*na$$p<{t3Obx*ycg64@(i%3UZLDi)`rdFgmq{dUHPzP1BuKiwh-@VwA zVySKD)gNrJWmq#>u(DfMUQDy*(thm4m7-m>g4!(8Ja!FtP540izyQt@;ug{hcQfzg zLf#zcHS_Lpf1`2qN8}bbnFm?*lz8DKzEC@SD8RR}x9*78t3OFw` zBP1fIGe~zQU}v*G9^IravDP#ykies$OAt3C3ye0(y(2m;HFFv1k$a=Eo41=Q2#tZT zkYG=uwh_elSa>*V2rpqsUBx?66_>hAt~(%e zam^tQv3q?~dr?2MZ;c;gKoSGL1WpJg_GpJ0idTrEh;ISO^4G09f6+bn+b)j(6rW4M zR6J4q%{;L7wsuN@L_fTWN}Gv?td*{tIhS7W>yY`(i84~o`-vDSaz}NO=A@6$QdeS; z#Hl0!>d-Y7Usg>;jWdVjN8asTZl`S94C4%53_f_Xn6S7yo3^<;2S0YQoLIt@2hCP! zEj7uwYIlC!xN19$w#K&>TR0mQ8ICf>TC)jXotDp%uWHjXt2y266KE2U-)N;7)r7O^ zwL-BvUNNaDw^&Lf)S0+47~TxulqFe?JzP`wQoUOlxeh#T&A-lna9wwSa9v(G_6%5Ay@VJG1DvDtr%cbWmi zz2F}1RdPV~lzCl0t`V-0J-a*`R-eae?jhx|cHldejx&lw!18wi|S;c#KMAx#(ErUF6D#>dW4<;mE|Pt)O;_=lYw3iPB7O*N0Y9 zgTbCN%ZbBeK8e+$Hjew*YwJCu{p~4N^V5RD@YT{bzH7gy><7ck{+NR!*<0C!%qTur zPsqn}H&Sbp-L;#*i-*6^Z2wWknUR&@ABl+!?7yq5f7Wc@M*pbUvi*gljfw3yN8A5` zX8X78@o(GX-?qoUZIAz*?eV9m`B&Q`1IJsjkGIi339p~B>W{+fH==*~S>*IhjUDmn zq>YWt^#yF*@HOAuc&}0spN)f2`z@H%*~aP3C@E=fIznnLM z`cC>*wx;-WzqtQaw^GH}!O`5-2A__BmiZ417=DHdza9ENk4nV$byhk>X9K67t|KJ} zXX9U-->j;?-V{@nlGIS45;L}PF?KRH)Ta@!wK9@5urjxIHin{mqY*N8G;}bxbFy{7 zXJY%U7K*UVtIx;W=Jn8|^$itlt@UmGIU-{2;OHc1rvFOLh)*Y_|7!<|@vQ{Cs=1Mq z+0P5W_GT6NuYMT+NAE)YSD*Ti$6qWI(5 z`fEW6{JcSE80cRcdD1Yy2HY^Pyy^_EW%^lO?-cRZC2wxXZ)0j@j8F4gX^`L1(D-!` zv%TsXbZ=9??a;gi3PRBd>f8O+N1N@{X-=o;WNfYSdd29}ej8%D>m08I zVt*d+tMk&B@pUP``92l?a0C4s`TYixGPW^wGJ7Ta%^Uh_p8=nV+V_Q%Lr=gTXG7~;CRF$*~QWoQXe2>((7%Ro8HZ5e0 zSZ?0O8Y@#cnADpWr`#uC$Yo(gYSK5GSC>30)?Zs!YizUMm1v$s0e>gf6@wkT1kWDQ}yE$EW03FVubq^X`qJlYzGLBHm;6P0|%3V&*i 
z@x$yWIbEKC=iKZ^T*KC75hIReR2Vy+M+|WHMf`CNgXkQuyz-idWO6#*==zr2(q8n$ z!;U2`;4$>mLag-!Yrj!FCvQew-L_UsRo8ggd$z^!W3GTc&t2dbD27MP;t<55^btLL=eXJqeEzlf+ z^Uk<-{vt6vNYNgxics)*I+!ARm2F@i`k6k$lTAw5aTKU^{&D+}z8X=u*rpPyT3Rm< z5h7gavIjArlz4ZB`8LfvU-!VLrB1h4ei5jgfI`-m0paK-k%GYDvTn;KN)}f69w%-k zYposq191hoBz@}|Jgh7R=J@xVFOX8Pk`@V{l5|fskD;$rI0O=uy+cYUXR>2H=$hpX zW#2y~8K=}DqdH#7r+%wiPztn{RFcf4Q#yc%X>ZRww7TS&k+bz_&&6Yh;guqq0*Neu zri2CL2E;iNa!$IM6~w?S{JsuU_+3fAupR1DhP>GNQn_1UhA(qF!bw0a(5or*Lw;&4 z+BDH|j@z8E94U`o)nt zcR)8%&L)8Sflid%&)JX9iIUw4-A_|$YY{Y;pj2JMM-1P9REdR9-i4`U6oh}Lr+Ns# zW}uq%E;O9SYND7MATsLy$c3z#JfKS2w-7%BTFh)XDF%(E)@ajh5d*yrVdD%}U2SZV z^^G5LBpc|{e2-+J7zdMeL{~i2d-Vx%R3{TZD1Ogal*+-bymQbIq0wdkN1&P^rHQSp zMs3zziUHT%6loL1f~B1pYuw}WtG1M!E{;Xrc>&hu(%rBtSzWwooo_NOkL$|1<}Ra) zr&bR!as}q5+!9YW+c9C)Uj-7c{jP6LzIKOLWb&f(=00SRohLtHdVZ2{+!q@Ge?dt! z`bf_)3aq0yxtY+eoebiG%s58t&a)fK0k{AMHD=H(7H(mcHNu`+L9`L-XrCWVgz0P~jVZbwt@mKuJ@&NjE~c18W<)&d=#YYL zTv0EgN5n*7Swz$<4}s1W?XaR#_Q9a&7eNRLuRN*1Y@8!WPlA|9y*IMMGZUCS*a#&> zR|KgHQR+2ejbZ-|bF_5xS@C6Vb=*eVzIi^5I@Q=`-NCXjun9OI02tVG)a;wpar%-~ zz;)b`Yb{&TapWSQ&!^ygfD_p@&!U~6<#PcW4mxkb5*>m_W*5}-#cc{upTat6_a6Rt>5;1&?xI%7l-zUomVvMQu6$lxv=AEaJV&=c85Ebuul|UpF93_8^k}WnhPLY*a@OH6}7@<{^33(^I56io8Vlv8nR(e$|6jkExb5J?D5lHSkD-^kSN_v zBf^gh;JrW$Yf=qp4IP&9>LT?G1S)KU8Jt-Ejd(_6w@NIQRcn{<4?&j&^hloyy_7sX zF>2_rsBxE5kJK`m$?~0E5_;V+=)%)g)tX$chI$5|5~|mRb}MP-3Dc&JaU(rxRo79I zrEwUVqXNvoBhY^gqu}R+H~65n*>l?OltkA*lBnMXb77q119wHAzHPP{T8-H;MybTZ zMCXYeQ1$ZuD~sXYgtBn*n#p$>WbUQ`owbHto8;Cr-&)U{(&93ES>xua`)4|tfr{Gj z*s0XuK%x%9vxfvvwP*B`^eD&4fe*mnrkgB{p5i-3C+4?gI*Wm4#^2!$61R}6M-%g1 zaEzNx$G+pi>WT|weWFB!>Bax11*KGg+((pOz|m@h%!6&0tiYC9u01UR-#B@WLnWb@ zm%W+oxVYaG*UKk0XsP4+?HQf#Vuvb|>AI)na(GL$r(&3;aBfzTQzks}ov^==ry@V< z%ULB-jEMq^SAxo>Vy@rp#E0_EN&Oo@`Z@lK z9_nrT*9>nQ{yf*)QM`@3@%ZER@8|LMpZ%xd?v(=S*X&HM`){${4uoX7he7s9Q+61^IxqzEB#l*^9F$5KKh1upfq0* z&ui!2Ux&=wYZC_$fWbRXCT??D1#P=khVg0ZLygz{m+>(gl z19pLc+=cSL$3{khc<#dhiSbByCoJD0EoWu4&Y;zj)4`35je>z8Dug|& 
z7zg2gVbV0~_)YWhYLHj?K`FgN+v9Yx&T?+>+}O322uC9VeVV@2dXPKs5vmq~fy5ED zjwWpLhb~cI!{~w5N#v#$X}oPx!!}w(PU9_GIrdaFglKXk!O-Y_-fWgGd)&1{iGmaZ|uo0`{*Kb!Z zEc=yxPq~dh+VXT`pT2LM>Ye$hn48_QWZ#8H_^IU>vRW$EkHJRmhNeGwKAN5=W<1VT5_54Aw3ILXV3xk-%J9F&(=9}{`q<`X%dGB@IXSWE zCE>8thT#6%B`$_+h5<{bC$?{5Q++>a1CpII8cyG~AvdF~eocOlYo{n*+m$QNOjLMc z7McgD*L}iM_-=X80_K z4rqFkKr+eXE>!&rdf@>y#y}?mWjn}00e=KKOyO2^A`9U<^6-{{a3LWBPx5n9iU11* z?#e>Qu}QAd?W@4u4KP=sDzR~Pl5%UefEW*g*wBd$Hp_fg_Gv?%Pzc zpXRKn1uYkW?`6Z=BVOmie~RQ4@Zoo&HlY|pVg^YW zaLBm(2{IvzjQm zcf#-j7Gc9l+~OMa7|~6xTzGd>KfsV>2z5CZp3wzk*Bel{-sA?k8lcL1 ze_Aa$kMn*BxfbyFeaLv~KFmGRSN{*=EXaM-0oJ3V{|?(%}l2;gHKx2a6f zFBqt*QX3zd0^YHn>uh6=r$s)Rfs9;+6+zZ@v%s$epS*7uy3q%;=J%NJ0jfhA2QBOS zkQNZKVZn{D=5O0eHBosc^T^2?O}8m|3!O##kv%kZ`;*N6sm_;}BZ8G}fCGJ)5eE}k z$H2R!yN@i@G)M0mY3nClB<4 zeluX->yh^)KvIM?hfsYRby7s7>-B3SF-3kh!$6DcGh(P4$s1Tp<{8o1#hM+KF-`nZ zx5Vc)kKh;FCY+}Rb>iU*z!a3_%CE^#rFB4$M*E;;fqfO+l^twYN}5)+jJ zm4l697#G~RUG6cZ<0`ZRmS?e#2j*=`mHp|P4xF)Zh)74QrM*_+(M*_^)eM~FBISsn z8HO1C@+SGsB{LXO*05iSPf}8-!<(1Z`IlhfH3DSZl$;&Yv*Oz3z=YXm;-BU0W4w6{PLB$l>KVm-yN9gKK&zk$VCWE*hruLqrz`A(Kwo|}`|2#T^o znX0XxIFy?0ZJFrJ4uSCfaAmZ@_MO=5%Eit%*~Rl})gGnP0+hljWlPi5VU^#qmil1I-$AIi+q~$Wax|+EF2H359!dCShFI zR2vx_-hCgq_=O*0f-=6FprsygHK^xULQd_(q1m_0WN6A$ZXFO5sNp%&vC;06jav;w z5sebN)+3XB<94AP`&k@2x96wHRCaD4Gh7)Ct^3NYvw_#W@2})mpKS0IfGmcQ;i8|D z*|nY(*|~Sg>>CZ<$h9DuRpsDRpAOlzypTdHpE*MM;U-~)J_1|xQ({*uyD!Na z+#T^t*c#knXSuT)-z~p{P}{cq?Sy(DSrY+5s2_=N93AiL13+el0>z^M90AOaN(+yD z$rQjW_rdd$vG-%X(kvcsgyIw99AuP?`Fp}WpVn0Hu3z_lJ4u&eATkX+y7-9{# z?rA6rr?$@Lv@1nBdZVjZsrU6km=|a^4Cw#+~nmgV3hV^eo zrB57X{H(3z9V$r!q>H{UgVh!kA0<~eCP~S@NJUnSc{T&bC2>@{a=095yM5nvqSl(dARAc!tlvZAafOq>udL-{-g~Ul z`(b=QUDCA7f=b<4Zy_Yc)wL{{+15Z?SVdQfRXFBisL@e=&pQFhR=Tr^fLbCy`Lj`_ z>cxt|F(}Q11=CiZ(Z>WvXPwF{-6O==d`mp$vFhcZ-MrJuz1`i|LdSrnLdW(|W5 z&`6Mp5{{DjW>&|!(jbz7=2qTO;;i}(t;;z!eBwKZ%lC~W&k&h53%forxukg213qnls|sWt8C>mc>6W+d7i7s5KQ9rEP)_ zcyi7{4V0$+%w%}@w7sM)-A$tMi?_KOEyo~k0{lnoyP#o2Z$%ybu@yV6Gu5Hoffjn<1`#senpZlJG`>-FlceTe2r|b6 
z)x5LNmN0AiNocA4P81F~m6*{+ zBdm?5Do+L4Z3(c?9q-VGyN?Vv(;F6j@^ou|1W&sG zaYg=UMxTa?AF84F+)n(o$=oUZC{X}y)eX;WYAEB_GlX0ttOcw1O~vus2&u?6yACoa zioO_T_cVrIa?blnp#*>ez;KXIYDY>P5{E$L8W>UwKQwneKMDO#qzaKLh;`A9_SYFSBR!atUfFF1hVdJ~8+KV`Z5>Ftn;s;sS$eeTA_iY=oT8_b;YH z*W$8+gIo{k<{GqcZ^FK1Nk7=YF;a2szw#lvv5wgSbJAmrj1xRipe8h8CS!E*1CVdA zt2y$cR8qmC#rkybqV<0C%Bvgaxb$-+C1*3JOyhwPQx4H&E+uhjPB>GNa4IFvT@Oq` z|2i8gbdW)DNlzm{9hOQC7Yx<}aIp2hefPyaM|bLe$!#lT4AKl`jhS+_{{-#)3@QU)}pQF6gm=BPe~Z(teIR`Xv8}RLnUabNWd8Mn0H(JCxOPEJ?Nzo zKYAB8i@g2n@<(Pe`D|emzoYyRO~&4_2S5Zyr>I3n)ve0RngtcEs$X}j2`gbYLE8v8 zClh)W@Rn^8*p@weSC$WlbD`cB_uK|^&GQn6F^A-fLsiWC`b>j5f$eR`B;5%$!XyYp zt6FC>kkTub3xMkwtk8&ZkGW-UMzx9=Dwue-jRRES|4u@_hyYPlg}Xmj3FsGqwFlG+ zI<#&JjVowTEy#=E4QG}Ht8`htTqj?ae+rEcWFqXb?R8SQteQ3Elt`~}o?K^=OS5XS zX0X>d5r!(l#Ho@X7LK%qcr<8WpGHG*=YJ^CUVNBcFp97*pO&D|%l`u%8ak9KC}}h5 z4%g4s58PlpfM~Zi2{x(e?D?Ux)#YT_a(h2W`<&r!zJl+?9gVg(znL~rG`*jJ&okO^ z$JYCn|Jh_FIZ=l*0Iw@mw&ijX@=gVemw;d&V|_(ooQNfDm08@okgL;T)6>4V3W9{d zhw}$XnD}Ts`)X_#nkHhw!Ey4yo^*||QLx=P>~-?}))iWGP;@6$aU)`WcrFT@FCFBC zvE7|LS~88(PWGWJqH{AfscH!w3%DuOA%fivv&{i{(nK2DO|Evgqa$&iPE z8%?OgDA-+*XKkVh0n#TQ8crgU-Cw$>JN7dJwNA*&lRRl9u>E36d*nTPAwH& z6u2@~_GmM_Ah>>e6jZT-VKh@HgR2z2Ea)*)v%Fq_Ib7(O;P?T<#VMFz=*8WymyFMw zVS=|y&L|})yimWa59MOfveL3DY3HVLh67zqQO&nw zt6q-QPu2qHGY9=NLoZPx5{f|!Cuyt8OHQ~4vH8}uq;zUEl4e_nq3|8Bz5cgyD%^^p z`??m(A+K}DyVyrlcfRD%HLJ1a-jr4os}#BgZF+~*ke8slQq>c>AN4Zn>ov`y1yKiX zTh@1))LI5p9jwiIE1ZC7$Z}zMy)C-m_uzKlGPAeH2-zc&>=m*lTbUUR zB9WDl$R5cENlLctEh~Jl`<%}?@8fl!&*$^~egFA6>fAl=>vfIi^?W{_5T=N|=Yv9>%-rar@GBlMkd2Kk+{t%J`7Pif3Y zoZ*JjPF!=3BiX3Na@ZRDhWwP(o6t?CjoiE+ZZ|F8kUxl?e!qM0KsW39dnA&1ZciIhrqq>&* zTjKUtE+lKws{pw{y66ik{kzwjYIXF-PixhQ`lpg)$IPoRu9mX=MoY>E@G7UJRFvq# zdCp3P2(s4JzkDF!5EEJ2hVec%X|+UtC!sreacI8aMYJY)*oE;QQLynBHRN}RV5#oE zp3zyq>5bGOmhvGL7tft2Q>SX#Q{X%Oy z^_P4TqV{UW)WW}sj7!}8zg~!A&NgzMZ4fFBUUW0RE6Hr#6B4S^8`U=?;Nt(A?l-|= zQpx?vt&G~_mwj;_Qh_UR*H^TWwyjilvE>8zJZ8i@4foG%oauDm_xXKxU!m`3_e;$K 
zv`Ox0eNlgTNz1U{wAQ^e+H!PwWTeWNB)@QjU7`)`Sv`tWX!v}9$RzoeOzq*(bA|J6yn-KGyGqKHg_Uyh>aYG?>$h$NYU z*5(x8JC95DlF5D+{PM(x@)six}~_~kayHsdr3Oki}@{_ z-A+;c`)5j|F^dkf^e?EQpVm+tZpftHSG@3&nfQSsjf~&bLZW-zFsm$~kM~oK9xfWc z-X!1U)MVcJ=i_har*uFHxdlXGvr~#-32l8-xS#$#k}xx&Obh@qsmgy zgzP=DlVN*NUBguS!kbi()Kz!hKB1x-813ZJkoxemQV6rmNsrg7PP=$F67U55(z%Og zFR8cHkvO@(?`AKiQKHm$8}QtP5wsIMDvBHSxFIs%I{9;2%Z2J4!AddVUH40uSnA?2 z;%`}LLlnZaX2zU|R`EVYC@(dt6A=aF;(Z+Ib`V`Vn=HaKpP68qUwh39fbk{fu zhW5dik$zHA9;SB53WwZ3kav?t@zhbYGwN?|YaE=n>&ST?wz!#CP@gkFI38(zl>hqe zS3;|%3+=6MK9W$Er8lV=&b{DgeSv=eMD)UGMN?Kxo0HwQpmTH#{ z6BNI>D8bU?Q@9q~O8uWSJm7#SQRpuJUGgs}Xx49I_bIY{Uf%iHU3200_8U346DvPN2TJkC>P1VZg$a(ooYiPnt#lM>E9pO*A3oi71)Ad#gZkjLu zYQk@PoeW?9aY}m79%~EUc{-O~_QruHYL`V&b~*hcnRsWi$SBe{B;y%Jx;#W_`4{GvW5QRVAZGw~VRjt1c9xrp$6FnIaow zy2vc`UF@SHzsjSbs=dlQDZea>d_VPs_lcLP$xFB`+X3S^sLmqxGhtHyep9{KOcP$&wp=0 zHM@K!L_ITy?J6A>BdgjS)Rcd_bUfbZW5}n-*7Ig1 zuJ4l-qNw|nk{b{xbN!@hI{gZ4KUq?rAe;JwfnlD{$MlX-_3j3_AtvVZ|4r@dj+O%eVuJ^1LH+p$V3_fr;Z={fnykJ%{yL(35=HKNEzk!s6?(6+) zk9swCZ`Hut9r};xV_wP_5@vjDsGl#Gu?m6fwfDoDIPg8J$M0LcqjWan$WAb@oYQ^t zZUtr>a3B%QMKnl9A1xP!w|Uy1<}x$Ey!;8K5~~vfnBou>{c9)oyrrH`%GdFh zU@5Dh;=Jwvve((@u9e4-Y2}f+pTN9cspYCug?S;$JZ2Oy?rYew>xd8yw-4FS#mIsI(L)p@{wg zYocm8$LN8GrrwXQixQjnb0J@_;w28RKe2!PrDDCrn#TIO6Jc)lMM0x#@h1P$_^TF}B0&jWj>6tu8G5FVR;!dwt>xikP2poca;v{N&>6^)fx}r2?_J7wmc*tQ zf?}rNFJ~HPE62lu(b*$IA4tDA*JdnDz$Q(Q4I{YC9IV9zT<%~k#$YjuhHv3xkLATa z<<0JpZF-&I&z2@ILlc;Z5SU32m>~$vPz3M&iR=7`1KKFxxe?b{5|f`CKiBp}>HOg+ zJ#X>Aziqht>%~kA9ESRP!yO#=y`w+h4Ei5m8&Z^)m%n=LKO63F_u8ScySa40d3FDv zUljryqrirvzg`vkm%Vn_9clmBW%t*vG3>6ifBW;-_b{-BOcagXUWonguRU^q?V$UU z`qzHAKXqcAbH1KH+c#;yUTE_iX;}cAc@K0P}Emoso(H0s0?a zv-#sZ-D9uW{M!k;$iGh5{d?CLObom{_rLBstM$F(ceUl~5z?*utD9%J%CF>Fp08@C zJ1VH{`1trF=F6{SI4PS2snqd;0@=@U5DG9D2~%-OcWRi*G`tbOL#frjm042=Vo@|pD zno2)^R_kFZ$mV2e_&taOo)z5ytF`J_PlB6zF*n7wsQo0LUbx6%_AbX`T6^F5tWVoP zk=n4;bnPj)`ijZk*g8jmK)6O`=WxV_2fPnggxFIZ!frhd-48rP%JlBh$sOr@ouC#I zezxLL+OJ<$G|re17O`(^4<4~${1&3#|32va_2@?x8}U&+!F18OPJa76>l67TGZf7S 
z=4hn=3bsL*$X=6n#MRiVp;OyhFgYl-;7}km|6LGMawBZj-IV=q>C351f9zXM%Mykc%&x4wrS2Nj~gK1yv z%xf*FAw0kSobKG!^$eRE54giW(?c`4*utZE=gK_KomXOllvCxpc|*vJ(UQE$D8r9I zNpBU3BzZh){F!aP>b!h68LycjXMNE+vG!nD<<|p-mSxtLo#(&O@{UUCFLZiEbU2h& z4OU)wHn`B8YUyQn^-e8A_Ab-=toIS%{GHW+A8mKM`LaKrBglMwmWrU_F%>}&#RIN` zY0nmZSB4K~lbhNI!s^31v|tI63Rx-C*>|s)6AA>H6D!?~XjQAHk>x@rLyrJAX2MVp?KvKo^lC5wR1 z(a)c4wEX&TC|vsT!)GH#i9!9+LLYhKM-ktQ^w&gh=_w}(Js5U)FnaE@e)+=@-iptH zfBf>k>A0nmS!^_9xHjhcT2twBV@GU!G5o67Eq&egPfUep9@xQ3T?w7-WSTOG*uIeI zqv#h;e4>b#df3>0{Zr?6pI-T7`J>i{Pjs}D<+-L$qVeMzO=s~5nVYL9X6l`)Wa!3+ zvdG%T{VhY7Gwxg?Zldwdlz+BT@3#X_HX|>t*F=*#_}P(gTP*8O#J*BAZT3D_7cYF4)&k(!aeg2>Ar z<2|>W_w|mfN~3tjVjhlo=)|@dkvC?ugWP4surA&6R2Yp72!8GmPIIIA4jD#M%3^OQ z=9B*?X+^Gkp%jSH&vZlN(v5rjt5PAlH)*Tm1o^&oXbSYLe41P&>ZW0oXGT0GS}`&t zjbq2-8x!<9dBlD<*H*Df(5_n{>gmyG7Z~Fm{;eFAx2r!wxAhnM`Jb};SaYVS4#u-s z^ZBIM_SM{=J|MrYauG@P(b1N6LwKM4ThEqj4Xq0}eMa}w#;5Iif)~ydf*}<{H3nn1 zs_?#$gm~mqit%&NT9f}6vA!rNo5jgDd(}1kod!ihTu0ti@f6W?emd_J>Fb)6{Y{&+ zd%WfGUps!E+JL(yvGiP7=3hpJJv!3J5h}y1+rAduQ`uC@uq1STYAf*gRD#_DK3JTgh zpLM7;p2WP45q>hVDLLH5jZFKL7{ztMIqRvuZRAiCG!sC?djmr)DjK9Dz z>spdrC##8^c)xVrn(CJ)?8J1lvbfm)+Kk-o1`vk@a?I=dRtqR1nt4? 
z8Oq;GLN;*yuVg)CwDBJ#V~2NHr=HM~jaVw$hW9MEUnPyTIUiIgVBitOHDymy%4%@y)|=)UBlwf8H=%UQA7Zt)+7TQPo2D0RW6twi zcI-Y=yHvTe3aGE-oN{{2^%Q69L z7YpVM1~0vZcfAAl8hS|`ns$`^FneF>Hwl>)uu#`Vx-E$>?}RMQ4jbhdE-XH$>}NXj ziSC7Pk+ij|Uby;8Y2@9OANTdPek3zKU7jEOl{FtfR7Tj4G(;D9b4kI;^%?%l+;hPi z9Ay^D#44={=U;F;Ichjp3c7|qd_`)Y)bo^%fd0AIma)Q1iQRB>2i^AyuiMo<2T3-D z5q!T6<$r}PlIR%Dh!n?!y*~;NJV>$2>3#QWaRrtk;m>e_lVhwv4kH&^?aC!g^G4qH zb&5>ScdBNRV)h0nVR;|k@bklGCdplv-wtYfO$x!q1*{q?1Pc}oj0)E|ubypQ7m`eO z$>$d=cER|*ns6AIVA|Pee>nJrCd}B4Hb-6MQ_~wN&Z5eR>wT7Ut)^b7*I5z+YOFIk zR}cmx$${?(Vz}Mf15fV^@Y=PXZ6?^{E$5Ap#Z=6l=P(Yjpcqzd9(yfqKl0}6()N5;gU}?VWbFC^Bk@Z5OKmGDZQU^yV;+~Pr zh9t6r-t{^5t*@QZOrdTeRZWeReNbm}Ww!k2W@uEzbgNj~ z)7LGCMxIfs#@tbYahFlz@iGe)C3a?+v`+5Om7Q%4i2#De=_4i!7egl+S3K1VABYY7 z)QGiy^AZ~baD~1M=V1wQ7X4tt!(pE~R z3PYUY30+}SG*l;VSlV#j&bOzp+luu0bu0c^Uh~}J;N*$cT}nrV0&BL=a%D~(sXm4r z(j4Dk#2;sDgUPmSwXB9BR!r(Q{o7X_c%O;08fpw6)r@9qd^cC6vcmh*0vYZyG*E=( z6kRrHY`-+-WZx+@@pN`6?w#RSiA3C;6Vvz3e5U04937$L?MhukHN}25+gtpE5jV?h zL*Ra0@%w1A2ab&_!$)_og(u#B!r&!sH-@;Tc*`A*|512`$CK(#=s^Z`m4p+Q*A+M$ zg@VyeNF!qW?!!V?iaHgSBfQ0LRVy|nV$@c4Mwgu!47p`P0^h}OuQAvOTQbYB$`Q#; zE1)B9pZvI)-JFg-v$1}13(Xxre2=mNo)Xo8!!ThO0wri&XM0>SAoBA+* z=-k*Ko~AX5>B#q*rYvq-;O~nT8tlhtO%1AF7^-tH_E3z3zp7rO**2XjY73y=n;p7dqD!@vsO-&rkCUG;ieql2 zdwpfX;y!mrF_F|;4f0wi$5Q>tEm#=JvMhm{tk~S)*UzhJI*Vyz9i6GVo(Mz)&FuqG z7pk8>FWp-k4q3YSN`i8CMS3y%w|?@tpuo7K(>cyDae;UkKh0;s&X?B$=uZ_ePiY~nM66nt~uQD zO?vI4nu}Wj-yFXAmPL}LH41k6Cr1cUXOH17<0qlx%eHJNm+j6HAsI@`#&d5lp#JVk^*jc;__{#Ex?RoFmMvcuZR zu3aB$gGl~Dg{48?33n1HZ6D6 zmD8{L(X?OhUuQ*JTYdR4Ku%&K&QI=&gwv+UQ(5C2AGDN|boKen$*A?ou9|mA!?3Jg z-U3zEWSu+VRxFdbVWNbUPlAVJc4c-?1iRX-mf6a1jdNLW$?0S~vReyG+{k&`FF|VV zSC6>N+L$m=wqHMw+9y`?4!uCnhO$Fg&NQ3~p59h7x5v|Q91(Ah7Ab$7-+Kc=N7o#v zyQ+ZFF?2o`ZN<&tA{ImZ^a@kGLUi_l`qIk)mxToStiI|kepajfP@9@>aRxqB0c!z? 
zn~!>KmKCshqt{zfOf7wHsqNy|7%b0nT1(5Z@WhQ&`q{M1<7XDv`dS`rHeTKTv@FJX zHk4LLB5SR@`nyzgOOqp9Lq+eh#FI^)%*zeBgyY^Pd(6FQub*u@NV>8iNX`44zWtT= z_&ovj^jMcjQ4&P3K`{Z*r-Bb~;u~r<1C&*L8khjjm>Cn!Sg)eQ8<`3@+Qz9%_`%01{m`0A`)qCd^Ra7L_?nT2Mk8FFMg^}1 zM~loJ+qIF*d$SG{G>3-Lir1Q^#LxE5`Sd=z`ka$SP~&y0bVhu9!MT+7GKJWU7i&g! z&2MQ`ne9cpQ(KG4GZbtn9#TAgbMbn0uDw0ODZwZEG^}*H>Qps5Sn|-XxIL2bMf9@JLL+G=p)!tk2@HPl}3uz?BltfBAow`Yo)W0C8stkku z+;n1J(E zJOA*?{Zu^WoExN%)c-j^Mi#{#QX2mPGg9oLVR+>I-r;vfT2w5x$CMU5Cp}*whwh-{ z+h1wZ{`+jxS-;!%YSwKP4sTD*NPeyBOfijlr?$RkD7?&_=5_Gw==YCvQXfv%My#Wt zynGL@r(u&|u$S;-kTwej*?gQN@Uv&(%k#w z@t4lePfWVROlEOw6$sAnKH#MsODH+&$`__m+6-%MK{L`@ya|vf z8K+L)9Pe(>-8ajN`*f}Egzb!JwsK>F5o1i=*_3LAD5W^Npc|t(Be~BAFJ}lP1b1;0 z@A|$J{gG*2YB=iUc`H`vV+dLL%`G%OMKae6dPbr---FMGMxw1;qDePPH(%uPNe)H7 z^ByjQp<-kFc@j0Y6XrKPCkX^b6qx2+9T@YPtOuC9(`9GBEm0?H36%2&tX8%ke!0Qb z;qPhJ|D(ONp-xWaM{a{qjHtYttJC7l=KBDi^=>vTr9B(txu$BD6Z@1Arsg53-QNsR zv8CgD9J%JYEhG*F`nuYJIX0>{@wloQsIn<}UD`4(-S#-ozBT)Shd_irPtm@GOxcR0 zhOY~6n1j>bak*#RmC!WijYsxK8lrOWPV(1&FP1(Q2Thw#=Kaq%3Ie`i2 z_@Yo5J2NxA@F}~>c|3Tkz=Bs?YSCJ-N475i)CcWzog~pod;?uU3PGdYOmUbR?w9se zoN3A-MAs4g!A^{Wp_WURw2qwJnDn<5*o;tvZ8nzvs;}3CKf#h8o`{p-Je_A%c7k~7Ft6LnIL zwf=lm@>~0^>faLfwesLHNH&tgUk8zf!0l4ixs8x7!u9%JT@o zJ9b1>nFpReqUnD^Bzmuad-lBO!nCMS(BKSS#s}@(s>j=(GVP@A9#tj;Z>Vp7xPOYP zQ)Q*quDYMC){|SjDpL~P&1#V>$=ju{qmpRMKRF-9@-#NCn5)7hUc}{ewNH3bhPgx- zn~q?-x=u4`O8>b1C&|ag7Uu4#8V;UFm*Hrl!Dnh--(+5>&)jkuc#tTF{;uL#nb4tS zlIqw+DdAN7VWQJ}wA$@(>Bbi`S-u?u^Z?#TpM$35hYnUwk44l(p7H+fE8$DL!?_*h zLZ@uEpNO>Nt9qQ4*n6vor|)-+znz2c-Pen;yJr`Cd{@U3E0bS$y?Zp7hQ8iSo0P7% zv7u_9^r0KcpZP88e1*c*%@@BhAqy0AZ<9T;A0=G52 zD;#k1-aR)2RjqJb9{Nf9wMqMpLd_k`x9d_38RoA2=9iTkpNJ?X!Gwh*HCjseXJW#? 
zo80|?#&3$Pu>gJw(KuZ3(6f1ssZ+7g(fHyC$H=5%2F_b9$%25vZ$MhV5rsZGVFoWDe#s5QIQ zu1z1KlJYKmsNjV%A~m{9IPpwj=lE7MPjhQr+MQZA&!Z`RX^Erx*QW050b0CSsvqC~ zZ0gOrW^?;}wVToHsYq#s!!oanatj?1H<{jGTGPDO9Y$qa&#B-7QCWmW-w^|M6A+N1$ApNCce{TTAAdn zRt>klt;?*AG~8Lgd!-WIcW!P(TZzv#@3ydX$!z_G+b?9TSdY@-8zI-2{5J{m&uQ$w z&dqwMnoUuy8Q(l1MWu$=xTi{;QmDB`H|RhoKySaJBb%p>=pgiai_m+fDYVg_0j_rG zym3wTxpCe&hNI7_BT}IX;Zlk<(0k>t@md5)1Hc;-H2R?a(GzU3LPKTO_ZwG%LqE@WXuwj z3wO`Rl`Rh)-zemZ@j zo|o$01+vO{oiTnUT?u_Hs$|jc=e|!bo;U1E9#xKw)6%(nck7vr=4aXE6>0I2ipDAZ zZ?Ds?WzDERb6tEyu_WQYzLr&KYx7xhBP54L;j*9VGG3WsJHoEVDI@Xg&PHuTMa?NT zEfuR&G6#h)wZiM0ja--R7V@QV?=L9Ysil9J9J}UX8lUKbSE}83LH&MBHQ{%G7S#Z1 zui)x>zxI5zX3$;AD+qVV5!t)Zr?vUt8VW|RanY)|OT_u?TuanFPZ(Qv!=@@mH{Ux< z)gq>CM2o2TiSFkW$4j!&@LS46%q3$?ky5nXDM79(rwQzJ{bS7xpZPG-3zWZ4Gk%77 zb`~#q@nQphSEn-%PnhZi=exnr2NFITuHdR;UrgW8&c+SDh>qR%z6Ml!A9Xcb+lipA zo%r9%LAxx5ld%h8~p*=v4ES;t7-k~194HZ*6J@ePv#%MkvX+}GmL7UB}5 z!gyxl4wt>;SYL#)_~w*KCcS#7G%EtOGw&LXWlP*jntjeF9&W*bKSJzubvi#^ioC9% z^0j-(Ymu1+@-H9Pn4}M#>AX*jd_J|Q8oVq}t?bx*SGa6P-I~z%T1{H)xfS-ySJS;$ z%{Z2JM#GA03RBzL1G3%g?3#~wQyYT!d)>cV?5>(IR(q3QAy)7<6{9;Ey%KrERz)4` zsz9Jc$g_Rs$d5OZTHiolznggP<~Mq;ii>9GO4@#_YMn^UoQQbegq)@VE@f%CL{AU; zgu`feYBO!FTy`4{!uaRxU5ZB0o0K{i`=oNPMSNN2IQy~_oD?I33r~kn66_-q+m1+M z+uM%1V#YR<)(D^NG+vR&x)AOC{sxVi#QpbHG}0C{W|;fMOf6qxCi0>u&MB_xDy{V^ zt;IasVSKhzqqvr6d%JDB?#l0pD}E`>N76Ug2+#h@GV$Mc)uAwAn7@~aVgFCd#8Ju6%>} z|MOhRzt3M06$3}b{I3h)BZfB}jqMl%-n;o_Of{GKPuz55+eh=2vW%W<({w9(G~M@# zMdM<}L~LTNhA-n1D#p{XUYi$W5Zf=CVfIV-~G%AMk79mOM^HsZv zvL!a7g!o}`iP6$b)GdC(C3+>_p6!6qUE!7OcLw`Ax_mS3m-$q3CW5SxT8V1|<6Yg% z-g{O0bEoh0I!WwIJ+bI;@EyfAbps@SCQ# zSZUq4N24+|2eZ_t3Pu-t9FIx6bi`Ju@cUj5=BQq} za^u&1zr%SCHw+XeefxxAQjUMp_1XDBJKU z@2#n7t{>w(iPQ`>Re@n@Jo>4X_01)Er>*UCW}1x+KPw85>nbZ||7c8P_N1_CcTlJ7 zy~!i3U9n2i$!!r6{20F*Sz7F=O1E*z7GY&61L>lV!E5@F=Sf=#pY$C(G zh=xd&TkcmJ*+xWjQ(pVr3RqZthbmynO1mynaZX6;V^<1dm~$yhJa{|n#j0iLl$q5} z)m+m>tBHMclIJ}sB~4bJS$dh|($1xF^HkbLSz7x!{2Z9*usE^=FOyS5R3x;hP3lIa 
zgddFSkd%MCnRwaqRq*|f9`Rq9+qU;U2-lRJdUL+3XG8K6HRpJX+Q!DA&-Z4APfcpk z1^89`*<5ep1WPA&XpXK9GzCaKj@W`tohP$e37>j1qRSP}{gEeOCGcVR?V#5#8rmN# zWZo&__dV|+)>$}^aj3bdymT}1Rmjz#kDlcA8zUT_S5D3{HMZ2S`;`T+cGR#h8b*|C z{2;CwoC=S3QeHq%{_;~KzO!;_JbC}}P$lo2c<4d=*S53dg&ZcE7yHsb;f+72OcOjX zK6hxwVQ@Pr<2p}R_1W+d5%%A)ilMT7q4-x%*pKPGK5M~8n<`VAf$nJ=;(slaQ>N5m z!Xig}Ia*Gl&58s0Na^LL-1&QOztLjVfa)*qYo=_s_iZoV;6YGpwQSuBEL*ERd;7E~ z+}5V$t)mT0wS@OV*XE%^s{2)dt4`m-8^r6=ufPiUMOb}kT0 z7^!bm9eNT<>-qNg*$pY3ZsL9$D%R>!y&ZmAVv)e@Q;Tn1emqyzr*+|Nx`x*tx`FQL znHK6E_Xw2tOYG>Tf9waNvlc_IzZIBTXUn+M_A6U*zx!+Xj(D)#4<#wboB@PsQb){? zO4a2dsxOHSUpfVwgiEsfzK7I4G^zWRaMN5|SFwBPV6ynb3Zmy?C)??i+cbxJ%DbG# z)^+Rqf~_l$ivk3^2~Q<@B(UG7{_;cc>0mzJ8wz!?Inn3D;jpWZrJQ4B;a?bOOz!tD zed-r}}C6K82QUxrD%#%%tTZQw>G~1 zvK(X^E_s1YC4(8O4V~>tPd@kiiGB;~x?z#2Mr7Uy)ukZ}jV@C~min%4?TyOUB6C4A z%C`l-_7}9lcAS+Pb9h| zm);Vab%8x&iih4_Tg65s{1XFBt6g(L8?yH)bwBsOmG9)-L-Lmj<2IJYY8g^W{1`6} z<&X3Vxfun$3KDT9BR%uBn=tL93$6Eci(=ytexZltMnw(o%4&o~)4F+tJwLK$e+4O9 zo8J*JUMErS-Rlpy`_}BVfzhv!b74>9AEDfDI4<_a53*YEet+S>H75`*eeEfMyeGb< z@bgEbgz{Bdc(*PL>j$*a%G?Oocq@BW`=khIIsprTB-tMCEKl#v#~Y6;Wi|F6-y03t zJ*|8q%g^hg=a!{G_kR68+LhUyq{W>KQ*l3objok+bIruG%QVgPpqBX}{U??gVNSpN z)!3g!AUSS6NdLU`+TpC`MdBYLb6Fj%dP`~ZT}zG@yhYN~+Q5*$vU^KfPLVr2$$WwK*C5f|0ws^=)ZW0$DW``{bbO`Vk-k{A8%V@|OXl7NYW24P z;ZL8(H%`8|a8Fv?3*mLWs$O#bd6l@=8{+-$QNw4_pNEt7sV`ZOIu1(@h#{Hm*?vgF zwkpQlJVGK5ITtDvj3m#QE#CD*(2t6L`zBq(BVO0@h@o&+v`m~`p5tYR?_8zYz$uwb zy;^aO%WTRuczEkN3fWYf{IdW3UpgAt?~3O7!>?Bhk9_{6f1v-4Ksnc$NQs-!{WpxL zd%uC&FLZ<8N0iIm&vL^tvz79Mjv0i$OO@V{r%flXxPQ6j+f)7BmYwl)NWG7?`6uVT z(V-mPU|(_bQo??=EIL1mY0?_80--R;+BA~5`nUUBwd0HYo%77uq%NHJL76j!yUExRQ)4W z_j4Ui(8Yq8M!oZR>fw6lr+svrAESd#LGm;6C#T<84vA0H_2~t+RH1bv8{g^}&dztW z&^_E4%o<{S-~Ql4`4Yp|tAfQP4U4hM+DiA=#PPPy{#cPgWl7hpT*zKK<8H?)r>^tk znjs~xW(LE17c%L2=aoly_I-k1eK&4C$VqY#eiofj{qP5&FlTeUZd2#UeAc}lZ<4@> z$)80_yJnV2?+ITHaF--Hnbfen6fzMG;#}Q0+2r+f^O35_O8s1^XbdAJtW5?nz$|-u zq{)t0N^X@gxQEg&d<;pD-7rc9r1>+0@+u>97{gG|^^kU*dj%3|9ql?2Xr^nbiZTfa 
z?AF%&jNoH(VnQzTSV;;g7S*n6la(jV&oMm&YW3fi-B=507-*&^Oup?V7s-k_Fq>O2H^pX%aYpOaAU${8lT%C4_7;VWNyySMpwbw7K zoRuV`OrzH+zVW3WSUkGG(lCijoRW+m5K=Nh};ibr)Vlw7jDvefQ$f9TdyOZT99 z`xUz{B1%z3nx0`XFCQ5cmtJptSrR(p<$I^d!Nb6z%Vsj0l{^t&toB;VL~>y0B^HW# zlnhtUwh3IoC?(eMCWH2qOSid;Bt9f*(ed*6zo;v@@Kvxz=+=*>;LY{U{^{+f*UUoL zrG$xppcMPRD~?$Ogix?akP>aa4>pd-?i0dN}Nv&)v)wE(8;Gm*LdJmHv`! zn7|1>ir+rnt9!3|Gu=oI9|e_7{GNWl0qYq4Agz>ord7w-S&;Qf)4frtz4@YqXnGdb zNf=wsSV9b!@(q>>hmq4!Z<`2z4DGl79GGZ(uEF%0;7IGVJ^y)O3 z`&lbDAIYA{TE3o6#_|vD^c1z!r}wuj^AD3}?oEz&ko;Q5yHoTor544~_j|%hJ%cGy zqx_9(Xwjk0b^kE&-L7SCW`?ul%=S#J#R>4?%M^X6Va-c7_?DiM$d^Uf-wz1CcIGZi zqs3~fh1P4g-{ZAbHB`nku~L#=zuSY-db%=G)yO1W+>;Lpvk#@3{w-zj*FEA$I2!&} z$^eP}>wwquH$AL9gxoDG9c|o&Y#lw^ZU~`-?OZKvZ5?g+|5N<0V>i!(Ls{9iblDBr z;V@zFE6RSw!@||wmt9+4Q4XA@Y2)qwkEgJKG&Y7huVHU>!@|aqUE9Xh%I6;i{@V3^ zUh9^Vz0#kpvWp6f2#fru5C$7W|51y)I06Rl3cw&x;1Adbz>Bv8>aR(*jTx~4=xJToUqgMYSfXh7O z!9D7v$Uk}ks2aDeu>^3}0JdBI3>m#$ZEQiN!8VSoRyqeMlKqU^Tpf6x$MzdE>B z;g4_Go&Kc3U}6|h;usAf0tc@F9H(ImVE3tGfByL_9P%s-hK3;lU;kMSffPl9D?X0V zU?QR-*n3v~Sq|LvjD|s;g^7uZVsBnKR!$59!?NiZ4K9Yl9$I>whDLzPP#7o=#6%GykUql^fW?r$z!6}FI&{3?Srp`1?AWlmAMDTHJcWZP z!lfY*(7vExe@N4@XHhUQP}A}H5NHexsL!!-2q3?ZK7;WhvDd^LD+hRhz2xE;4Gp>n zeHIN9K|$pTOu8seyFeiz&%#Au;1t>8Z2+1W5<&y#<03?$V}qkbQ4ktG~1pqe-|OMvc zg~J0F0-FrR{`@V^Vi+{YmK~=783Nfc2n~Tfx)?%3!m&7q{rRgtI1+;+lYoZ7nO}fm z&^EB=72~uaiV=nK3MPudVDqd0Y!`;XfLX_(iQwQB9EdIsU&K(@tnNSSg99Xi${L{k zOF1MAn|43`EK&qo9}LuofbtL2hlI8(0!Ksp2QU!{l_4w*Dt}n)#K8zOfI3JUFf?3L z6f$ox5v<)jJ`O-b;q(Rc8H(e8FM#{Up9Q%4$3@x4`GOLK+8&q)8Ue)@Km%ZWygn=q z+801WL-`B`?<3*pG|*@y4(|axLuDMy5sv-E0NsL=!(te2Uoa>feFL%!)k_3Y3_mu&0AqHX2LKRJNGP90k$-qG|Ku~ctQ>=*GZ-|k?Lmr&;n+|SQDACt+C_ka z#E;83=nGW05MYL&GLDr#QHZW$bqiP`sLu*)&%f{m1DwL+ZD4tTy?f#KoTD*N z{e=NN!leNh4#Gbe=nIbR1~M)R(JdJ8L2zUjj>55TU@iebAMYO+8v^^*>9IZ|Fen_| z0xZSRX`q(i<7eSpM}aK2$|3*X#ap+h3Yx*fkkoW z1BvUi0{JP81LgefQtcV3L^L5YDOTu$IBsr5`x#skLepQ$f8gj$GXZ;dkNYQ zh4K_kE&>{>A&_tpC|}@cF)^q;2Q+}F$NK`9A_|QI01cPVAkPC<*dgP90pop~2HZ6q 
zTMO>GhC%HSfE*m$05pJh(E2b)1TGDQGjD(gP+JT5B8KB5iJ<}FL)!q>5rTgJaDeGQ zUJlUyxO)2-ZUE_n>I^V5fD4dk5x|v#>MsJg@(|g8qk(yY;0vH(aK;8h;n*ky4EQ;a zHbC@&!N~!Lq7b-kfUpOG8`zWruACw;(AXZZ1xy`uya-WfJ`Ag?P#FUE@#6S|fCc~% zQXjbZ6gqDJ!eBT!1dtP&uY;ptxbY=VAgQ(6d?wB2^|EY=8!FBSfA7SI|)12k0yY^??ys0EF-vU>L4W1J?qI zYj6-9;m8#VILpw!z+p%z76WGk1hL0?ij^Up*dAaa)Gxt~4M#R$A~-ova9bLfq~mRX z$Q}b72Udn~>?%0>1C67D*eLsWeOQ|Z#cW{TM4|JBwRun*3P1=r)sXtc{)JayWKhgT zU~}$}u>s5mo(`lwfbBT=2Sgm20|l53GJwaQ1$GdGNH{dyTpF-Z*gLt8VIr`NFi2kj zpOMgf7}hr8$UP8A9GL{Xg6a$$=oXHx05%0jUVx0_j16Er4qt%HgZe|@;xP;qv%$&@ zNZ3Kf3;a{4t$+jH2ih*6A#waVAo$RE!^#lUE?|8I967*--OxD#*$wFYVtqI$?jk_0 z9;Yt=>u~r2EG^Xc#_B#)Hn5lt^~d4BHbQ+LfG;Q|gJL@zWd6bVP{;KWxOGt6!1_o~*~MZfjx7@dAd6!wU?^z2 zSRVvB=ODQO5_XWjh=I@#mxhGeEjaL1aC8u09j?DC3Y;!T8)y`+KM8n+gD==+HAr8u zF$B~`VKEWM?*u*^G*^K2(V?*cHl~BlC2&eW_WyXFu{0d}2J)CVHVP>QBnJ8{1}Are zjT@nPE^M5KGe^ME;@Iwg$&X{xwm3KhA{-pN2blt>UyK0x1E>svLu(6Y(V)3XgFg-fjm0Irv{mSQK+l|8V+`1Y5$CS z++8i~9c^4mv3IuW+WTTxuE7PBI=64Tvm>!K6nhSwij(bab|4dfe3oZ7W|y$Gg`;eN zf+H<$tpR3(AOt9wsI3SRVJn6}TEL}9|Mwz)?j(0}2m1s5SQ`gD11dzy!=s?3Ncw*O DzJOxC literal 0 HcmV?d00001 diff --git a/docs/quickstart.adoc b/docs/quickstart.adoc new file mode 100644 index 000000000000..06f7f8f8ed00 --- /dev/null +++ b/docs/quickstart.adoc @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +[[quickstart]] += Apache Kudu (incubating) Quickstart +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 2 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +Follow these instructions to set up and run the Kudu VM, and start with Kudu, Kudu_Impala, +and CDH in minutes. + + +[[quickstart_vm]] +== Get The Kudu Quickstart VM + +=== Prerequisites + +1. Install https://www.virtualbox.org/[Oracle Virtualbox]. The VM has been tested to work +with VirtualBox version 4.3 on Ubuntu 14.04 and VirtualBox version 5 on OSX +10.9. VirtualBox is also included in most package managers: apt-get, brew, etc. + +2. After the installation, make sure that `VBoxManage` is in your `PATH` by using the +`which VBoxManage` command. + +=== Installation + +To download and start the VM, execute the following command in a terminal window. + +[source,bash] +---- +$ curl -s https://raw.githubusercontent.com/cloudera/kudu-examples/master/demo-vm-setup/bootstrap.sh | bash +---- + +This command downloads a shell script which clones the `kudu-examples` Git repository and +then downloads a VM image of about 1.2GB size into the current working +directory.footnote:[In addition, the script will create a host-only network between host +and guest and setup an enty in the `/etc/hosts` file with the name `quickstart.cloudera` +and the guest's IP address.] You can examine the script after downloading it by removing +the `| bash` component of the command above. Once the setup is complete, you can verify +that everything works by connecting to the guest via SSH: + +[source,bash] +---- +$ ssh demo@quickstart.cloudera +---- + +The username and password for the demo account are both `demo`. In addition, the `demo` +user has password-less `sudo` privileges so that you can install additional software or +manage the guest OS. 
You can also access the `kudu-examples` as a shared folder in +`/home/demo/kudu-examples/` on the guest or from your VirtualBox shared folder location on +the host. This is a quick way to make scripts or data visible to the guest. + +You can quickly verify if Kudu and Impala are running by executing the following commands: + +[source,bash] +---- +$ ps aux | grep kudu +$ ps aux | grep impalad +---- + +If you have issues connecting to the VM or one of the processes is not running, make sure +to consult the <> section. + +== Load Data + +To perform some typical operations with Kudu and Impala, you can load the +http://www.flysfo.com/media/facts-statistics/air-traffic-statistics[SFO Passenger Data] +into Impala and then load it into Kudu. + +1. Upload the sample data from the home directory to HDFS. ++ +[source,bash] +---- +$ hdfs dfs -mkdir /data +$ hdfs dfs -put examples/SFO_Passenger_Data/MonthlyPassengerData_200507_to_201506.csv /data +---- +2. Create a new external Impala table to access the plain text data. To connect to Impala +in the virtual machine issue the following command: ++ +[source,bash] +---- +ssh demo@quickstart.cloudera -t impala-shell +---- ++ +Now, you can execute the following commands: ++ +[source,sql] +---- +CREATE EXTERNAL TABLE passenger_data_raw ( + id int, + activity_period int, + operating_airline string, + airline_iata_code string, + published_airline string, + published_airline_iata_code string, + geo_summary string, + geo_region string, + activity_type_code string, + price_category_code string, + terminal string, + boarding_area string, + passenger_count bigint +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY ',' +LOCATION '/data/'; +---- ++ +3. Validate if the data was actually loaded run the following command: ++ +[source,sql] +---- +SELECT count(*) FROM passenger_data_raw; + ++----------+ +| count(*) | ++----------+ +| 13901 | ++----------+ +---- ++ +4. 
It's easy to convert data from any Hadoop file format and store it in Kudu using the
+ +[source,sql] +---- +SELECT sum(passenger_count) AS total, operating_airline FROM passenger_data + GROUP BY operating_airline + HAVING total IS NOT null + ORDER BY total DESC LIMIT 10; + ++-----------+----------------------------------+ +| total | operating_airline | ++-----------+----------------------------------+ +| 105363917 | United Airlines - Pre 07/01/2013 | +| 51319845 | United Airlines | +| 32657456 | SkyWest Airlines | +| 31727343 | American Airlines | +| 23801507 | Delta Air Lines | +| 23685267 | Virgin America | +| 22507320 | Southwest Airlines | +| 16235520 | US Airways | +| 11860630 | Alaska Airlines | +| 6706438 | JetBlue Airways | ++-----------+----------------------------------+ +---- + +Looking at the result, you can already see a problem with the dataset. There is a +duplicate airline name. Since the data is stored in Kudu rather than HDFS, you can quickly +change any individual record and fix the problem without having to rewrite the entire +table. + +[source,sql] +---- +UPDATE passenger_data + SET operating_airline="United Airlines" + WHERE operating_airline LIKE "United Airlines - Pre%"; + +SELECT sum(passenger_count) AS total, operating_airline FROM passenger_data + GROUP BY operating_airline + HAVING total IS NOT null + ORDER BY total DESC LIMIT 10; + ++-----------+--------------------+ +| total | operating_airline | ++-----------+--------------------+ +| 156683762 | United Airlines | +| 32657456 | SkyWest Airlines | +| 31727343 | American Airlines | +| 23801507 | Delta Air Lines | +| 23685267 | Virgin America | +| 22507320 | Southwest Airlines | +| 16235520 | US Airways | +| 11860630 | Alaska Airlines | +| 6706438 | JetBlue Airways | +| 6266220 | Northwest Airlines | ++-----------+--------------------+ +---- + +[[trouble]] +=== Troubleshooting + +==== Problems accessing the VM via SSH + +* Make sure the host has a SSH client installed. 
+* Make sure the VM is running, by running the following command and checking for a VM called `kudu-demo`: ++ +[source,bash] +---- +$ VBoxManage list runningvms +---- + * Verify that the VM's IP address is included in the host's `/etc/hosts` file. You should + see a line that includes an IP address followed by the hostname + `quickstart.cloudera`. To check the running VM's IP address, use the `VBoxManage` + command below. ++ +[source,bash] +---- +$ VBoxManage guestproperty get kudu-demo /VirtualBox/GuestInfo/Net/0/V4/IP +Value: 192.168.56.100 +---- + * If you've used a Cloudera Quickstart VM before, your `.ssh/known_hosts` file may + contain references to the previous VM's SSH credentials. Remove any references to + `quickstart.cloudera` from this file. + +== Next Steps +- link:installation.html[Installing Kudu] +- link:configuration.html[Configuring Kudu] diff --git a/docs/release_notes.adoc b/docs/release_notes.adoc new file mode 100644 index 000000000000..e8430dd48ba0 --- /dev/null +++ b/docs/release_notes.adoc @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +[[release_notes]] += Apache Kudu (incubating) Release Notes + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +== Introducing Kudu + +Kudu is a columnar storage manager developed for the Hadoop platform. Kudu shares +the common technical properties of Hadoop ecosystem applications: it runs on +commodity hardware, is horizontally scalable, and supports highly available operation. + +Kudu’s design sets it apart. Some of Kudu’s benefits include: + +* Fast processing of OLAP workloads. +* Integration with MapReduce, Spark, and other Hadoop ecosystem components. +* Tight integration with Apache Impala (incubating), making it a good, mutable alternative to +using HDFS with Parquet. See link:kudu_impala_integration.html[Kudu Impala Integration]. +* Strong but flexible consistency model. +* Strong performance for running sequential and random workloads simultaneously. +* Efficient utilization of hardware resources. +* High availability. Tablet Servers and Masters use the Raft Consensus Algorithm. +Given a replication factor of `2f+1`, if `f` tablet servers serving a given tablet +fail, the tablet is still available. ++ +NOTE: High availability for masters is not supported during the public beta. + +By combining all of these properties, Kudu targets support for families of +applications that are difficult or impossible to implement on current-generation +Hadoop storage technologies. + +[0.6.0] +=== Release notes specific to 0.6.0 + +The 0.6.0 release contains incremental improvements and bug fixes. The most notable +changes are: + +. The Java client's CreateTableBuilder and AlterTableBuilder classes have been renamed +to CreateTableOptions and AlterTableOptions. Their methods now also return `this` objects, +allowing them to be used as builders. +. 
The Java client's AbstractKuduScannerBuilder#maxNumBytes() setter is now called +batchSizeBytes as is the corresponding property in AsyncKuduScanner. This makes it +consistent with the C++ client. +. The "kudu-admin" tool can now list and delete tables via its new subcommands +"list_tables" and "delete_table ". +. OSX is now supported for single-host development. Please consult its specific installation +instructions in link:installation.html#osx_from_source[OS X]. + +The upgrade instructions are situated at link:installation.html#upgrade[Upgrade from 0.5.0 to 0.6.0]. + +=== Kudu-Impala Integration Features +`CREATE TABLE`:: + Impala supports creating and dropping tables using Kudu as the persistence layer. + The tables follow the same internal / external approach as other tables in Impala, + allowing for flexible data ingestion and querying. +`INSERT`:: + Data can be inserted into Kudu tables in Impala using the same mechanisms as + any other table with HDFS or HBase persistence. +`UPDATE` / `DELETE`:: + Impala supports the `UPDATE` and `DELETE` SQL commands to modify existing data in + a Kudu table row-by-row or as a batch. The syntax of the SQL commands is chosen + to be as compatible as possible to existing solutions. In addition to simple `DELETE` + or `UPDATE` commands, you can specify complex joins in the `FROM` clause of the query + using the same syntax as a regular `SELECT` statement. +Flexible Partitioning:: + Similar to partitioning of tables in Hive, Kudu allows you to dynamically + pre-split tables by hash or range into a predefined number of tablets, in order + to distribute writes and queries evenly across your cluster. You can partition by + any number of primary key columns, by any number of hashes and an optional list of + split rows. See link:schema_design.html[Schema Design]. +Parallel Scan:: + To achieve the highest possible performance on modern hardware, the Kudu client + within Impala parallelizes scans to multiple tablets. 
+High-efficiency queries:: + Where possible, Impala pushes down predicate evaluation to Kudu, so that predicates + are evaluated as close as possible to the data. Query performance is comparable + to Parquet in many workloads. + +== About the Kudu Public Beta + +This release of Kudu is a public beta. Do not run this beta release on production clusters. +During the public beta period, Kudu will be supported via a +link:https://issues.cloudera.org/projects/KUDU[public JIRA] and a public +link:https://groups.google.com/forum/#!forum/kudu-user[mailing list], which will be +monitored by the Kudu development team and community members. Commercial support +is not available at this time. + +* You can submit any issues or feedback related to your Kudu experience via either +the JIRA system or the mailing list. The Kudu development team and community members +will respond and assist as quickly as possible. +* The Kudu team will work with early adopters to fix bugs and release new binary drops +when fixes or features are ready. However, we cannot commit to issue resolution or +bug fix delivery times during the public beta period, and it is possible that some +fixes or enhancements will not be selected for a release. +* We can't guarantee time frames or contents for future beta code drops. However, +they will be announced to the user group when they occur. +* No guarantees are made regarding upgrades from this release to follow-on releases. +While multiple drops of beta code are planned, we can't guarantee their schedules +or contents. + +== Disclaimer on Apache Incubation + +Apache Kudu is an effort undergoing incubation at The Apache Software +Foundation (ASF), sponsored by the Apache Incubator PMC. Incubation is +required of all newly accepted projects until a further review +indicates that the infrastructure, communications, and decision making +process have stabilized in a manner consistent with other successful +ASF projects. 
While incubation status is not necessarily a reflection +of the completeness or stability of the code, it does indicate that +the project has yet to be fully endorsed by the ASF. + + +== Resources + +- link:http://getkudu.io[Kudu Website] +- link:http://github.com/cloudera/kudu[Kudu Github Repository] +- link:index.html[Kudu Documentation] + +== Installation Options +* A Quickstart VM is provided to get you up and running quickly. +* You can install Kudu using provided deb/yum packages. +* You can install Kudu, in clusters managed by Cloudera Manager, using parcels or deb/yum packages. +* You can build Kudu from source. + +For full installation details, see link:installation.html[Kudu Installation]. + +== Limitations of the Public Beta + +=== Operating System Limitations +* RHEL 6.4 or newer, CentOS 6.4 or newer, and Ubuntu Trusty are are the only +operating systems supported for installation in the public beta. Others may work +but have not been tested. + +=== Storage Limitations +* Kudu has been tested with up to 4 TB of data per tablet server. More testing +is needed for denser storage configurations. + +=== Schema Limitations +* Testing with more than 20 columns has been limited. +* Kudu is primarily designed for analytic use cases and, in the beta release, +you are likely to encounter issues if a single row contains multiple kilobytes of data. +* The columns which make up the primary key must be listed first in the schema. +* Key columns cannot be altered. You must drop and recreate a table to change its keys. +* Key columns must not be null. +* Columns with `DOUBLE`, `FLOAT`, or `BOOL` types are not allowed as part of a +primary key definition. +* Type and nullability of existing columns cannot be changed by altering the table. +* A table’s primary key cannot be changed. +* Dropping a column does not immediately reclaim space. Compaction must run first. +There is no way to run compaction manually, but dropping the table will reclaim the +space immediately. 
+ +=== Ingest Limitations +* Ingest via Sqoop or Flume is not supported in the public beta. The recommended +approach for bulk ingest is to use Impala’s `CREATE TABLE AS SELECT` functionality +or use the Kudu Java or C++ API. +* Tables must be manually pre-split into tablets using simple or compound primary +keys. Automatic splitting is not yet possible. See +link:schema_design.html[Schema Design]. +* Tablets cannot currently be merged. Instead, create a new table with the contents +of the old tables to be merged. + +=== Replication and Backup Limitations +* Replication and failover of Kudu masters is considered experimental. It is +recommended to run a single master and periodically perform a manual backup of +its data directories. + +=== Impala Limitations +* To use Kudu with Impala, you must install a special release of Impala called +Impala_Kudu. Obtaining and installing a compatible Impala release is detailed in Kudu's +link:kudu_impala_integration.html[Impala Integration] documentation. +* To use Impala_Kudu alongside an existing Impala instance, you must install using parcels. +* Updates, inserts, and deletes via Impala are non-transactional. If a query +fails part of the way through, its partial effects will not be rolled back. +* All queries will be distributed across all Impala hosts which host a replica +of the target table(s), even if a predicate on a primary key could correctly +restrict the query to a single tablet. This limits the maximum concurrency of +short queries made via Impala. +* No timestamp and decimal type support. +* The maximum parallelism of a single query is limited to the number of tablets +in a table. For good analytic performance, aim for 10 or more tablets per host +or use large tables. +* Impala is only able to push down predicates involving `=`, `<=`, `>=`, +or `BETWEEN` comparisons between any column and a literal value, and `<` and `>` +for integer columns only. 
For example, for a table with an integer key `ts`, and +a string key `name`, the predicate `WHERE ts >= 12345` will convert into an +efficient range scan, whereas `where name > 'lipcon'` will currently fetch all +data from the table and evaluate the predicate within Impala. + +=== Security Limitations +* Authentication and authorization are not included in the public beta. +* Data encryption is not included in the public beta. + +=== Client and API Limitations +* Potentially-incompatible C++ and Java API changes may be required during the +public beta. +* `ALTER TABLE` is not yet fully supported via the client APIs. More `ALTER TABLE` +operations will become available in future betas. +* The Python API is experimental and not supported. + +=== Application Integration Limitations +* The Spark DataFrame implementation is not yet complete. + +=== Other Known Issues +The following are known bugs and issues with the current beta release. They will +be addressed in later beta releases. + +* Building Kudu from source using `gcc` 4.6 causes runtime and test failures. Be sure +you are using a different version of `gcc` if you build Kudu from source. +* If the Kudu master is configured with the `-log_fsync_all` option, tablet servers +and clients will experience frequent timeouts, and the cluster may become unusable. +* If a tablet server has a very large number of tablets, it may take several minutes +to start up. It is recommended to limit the number of tablets per server to 100 or fewer. +Consider this limitation when pre-splitting your tables. If you notice slow start-up times, +you can monitor the number of tablets per server in the web UI. 
+ +== Next Steps +- link:quickstart.html[Kudu Quickstart] +- link:installation.html[Installing Kudu] +- link:configuration.html[Configuring Kudu] + diff --git a/docs/schema_design.adoc b/docs/schema_design.adoc new file mode 100644 index 000000000000..0a852b910add --- /dev/null +++ b/docs/schema_design.adoc @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[schema_design]] += Apache Kudu (incubating) Schema Design +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +Kudu tables have a structured data model similar to tables in a traditional +RDBMS. Schema design is critical for achieving the best performance and operational +stability from Kudu. Every workload is unique, and there is no single schema design +that is best for every table. This document outlines effective schema design +philosophies for Kudu, paying particular attention to where they differ from +approaches used for traditional RDBMS schemas. + +At a high level, there are three concerns in Kudu schema design: +<>, <>, and +<>. Of these, only data distribution will +be a new concept for those familiar with traditional relational databases. 
The +next sections discuss <> of an existing table, +and <> with regard to schema design. + +[[column-design]] +== Column Design + +A Kudu Table consists of one or more columns, each with a predefined type. +Columns that are not part of the primary key may optionally be nullable. +Supported column types include: + +* boolean +* 8 bit signed integer +* 16 bit signed integer +* 32 bit signed integer +* 64 bit signed integer +* timestamp +* single-precision (32 bit) IEEE-754 floating-point number +* double-precision (64 bit) IEEE-754 floating-point number +* UTF-8 encoded string +* binary + +Kudu takes advantage of strongly-typed columns and a columnar on-disk storage +format to provide efficient encoding and serialization. To make the most of these +features, columns must be specified as the appropriate type, rather than +simulating a 'schemaless' table using string or binary columns for data which +may otherwise be structured. In addition to encoding, Kudu optionally allows +compression to be specified on a per-column basis. + +[[encoding]] +=== Column Encoding + +Each column in a Kudu table can be created with an encoding, based on the type +of the column. Columns use plain encoding by default. + +.Encoding Types +|=== +| Column Type | Encoding +| integer, timestamp | plain, bitshuffle, run length +| float | plain, bitshuffle +| bool | plain, dictionary, run length +| string, binary | plain, prefix, dictionary +|=== + +[[plain]] +Plain Encoding:: Data is stored in its natural format. For example, `int32` values +are stored as fixed-size 32-bit little-endian integers. + +[[bitshuffle]] +Bitshuffle Encoding:: Data is rearranged to store the most significant bit of +every value, followed by the second most significant bit of every value, and so +on. Finally, the result is LZ4 compressed. Bitshuffle encoding is a good choice for +columns that have many repeated values, or values that change by small amounts +when sorted by primary key. 
The +https://github.com/kiyo-masui/bitshuffle[bitshuffle] project has a good +overview of performance and use cases. + +[[run-length]] +Run Length Encoding:: _Runs_ (consecutive repeated values), are compressed in a +column by storing only the value and the count. Run length encoding is effective +for columns with many consecutive repeated values when sorted by primary key. + +[[dictionary]] +Dictionary Encoding:: A dictionary of unique values is built, and each column value +is encoded as its corresponding index in the dictionary. Dictionary encoding +is effective for columns with low cardinality. If the column values of a given row set +are unable to be compressed because the number of unique values is too high, Kudu will +transparently fall back to plain encoding for that row set. This is evaluated during +flush. + +[[prefix]] +Prefix Encoding:: Common prefixes are compressed in consecutive column values. Prefix +encoding can be effective for values that share common prefixes, or the first +column of the primary key, since rows are sorted by primary key within tablets. + +[[compression]] +=== Column Compression + +Kudu allows per-column compression using LZ4, `snappy`, or `zlib` compression +codecs. By default, columns are stored uncompressed. Consider using compression +if reducing storage space is more important than raw scan performance. ++ +Every data set will compress differently, but in general LZ4 has the least effect on +performance, while `zlib` will compress to the smallest data sizes. +Bitshuffle-encoded columns are inherently compressed using LZ4, so it is not +typically beneficial to apply additional compression on top of this encoding. + +[[primary-keys]] +== Primary Keys + +Each Kudu table must declare a primary key comprised of one or more columns. +Primary key columns must be non-nullable, and may not be a boolean or +floating-point type. Every row in a table must have a unique set of values for +its primary key columns. 
As with a traditional RDBMS, primary key +selection is critical to ensuring performant database operations. + +Unlike an RDBMS, Kudu does not provide an auto-incrementing column feature, so +the application must always provide the full primary key during insert or +ingestion. In addition, Kudu does not allow the primary key values of a row to +be updated. + +Within a tablet, rows are stored sorted lexicographically by primary key. Advanced +schema designs can take advantage of this ordering to achieve good distribution of +data among tablets, while retaining consistent ordering in intra-tablet scans. See +<> for more information. + +[[data-distribution]] +== Data Distribution + +Kudu tables, unlike traditional relational tables, are partitioned into tablets +and distributed across many tablet servers. A row always belongs to a single +tablet (and its replicas). The method of assigning rows to tablets is specified +in a configurable _partition schema_ for each table, during table creation. + +Choosing a data distribution strategy requires you to understand the data model and +expected workload of a table. For write-heavy workloads, it is important to +design the distribution such that writes are spread across tablets in order to +avoid overloading a single tablet. For workloads involving many short scans, performance +can be improved if all of the data for the scan is located in the same +tablet. Understanding these fundamental trade-offs is central to designing an effective +partition schema. + +Kudu provides two types of partition schema: <> and +<>. These schema types can be <> or independently. Kudu does not yet allow tablets to be split after +creation, so you must design your partition schema ahead of time to ensure that +a sufficient number of tablets are created. + +[[range-partitioning]] +=== Range Partitioning + +With range partitioning, rows are distributed into tablets using a totally-ordered +distribution key. 
Each tablet is assigned a contiguous segment of the table's +distribution keyspace. By default, the distribution key uses all of the columns of the +primary key, but it may be configured to use any subset of the primary key +columns. + +During table creation, tablet boundaries are specified as a sequence of _split +rows_. Consider the following table schema (using SQL syntax for clarity): + +[source,sql] +---- +CREATE TABLE customers ( + first_name STRING NOT NULL, + last_name STRING NOT NULL, + order_count INT32, + PRIMARY KEY (last_name, first_name), +) +---- + +Specifying the split rows as `\(("b", ""), ("c", ""), ("d", ""), .., ("z", ""))` +(25 split rows total) will result in the creation of 26 tablets, with each +tablet containing a range of customer surnames all beginning with a given letter. +This is an effective partition schema for a workload where customers are inserted +and updated uniformly by last name, and scans are typically performed over a range +of surnames. + +It may make sense to partition a table by range using only a subset of the +primary key columns, or with a different ordering than the primary key. For +instance, you can change the above example to specify that the range partition +should only include the `last_name` column. In that case, Kudu would guarantee that all +customers with the same last name would fall into the same tablet, regardless of +the provided split rows. + +[[hash-bucketing]] +=== Hash Bucketing + +Hash bucketing distributes rows by hash value into one of many buckets. Each +tablet is responsible for the rows falling into a single bucket. The number of +buckets (and therefore tablets), is specified during table creation. Typically, +all of the primary key columns are used as the columns to hash, but as with range +partitioning, any subset of the primary key columns can be used. 
+ +Hash partitioning is an effective strategy to increase the amount of parallelism +for workloads that would otherwise skew writes into a small number of tablets. +Consider the following table schema. + +[source,sql] +---- +CREATE TABLE metrics ( + host STRING NOT NULL, + metric STRING, + time TIMESTAMP NOT NULL, + measurement DOUBLE, + PRIMARY KEY (time, metric, host), +) +---- + +If you use the default range partitioning over the primary key columns, inserts will +tend to only go to the tablet covering the current time, which limits the +maximum write throughput to the throughput of a single tablet. If you use hash +partitioning, you can guarantee a number of parallel writes equal to the number +of buckets specified when defining the partition schema. The trade-off is that a +scan over a single time range now must touch each of these tablets, instead of +(possibly) a single tablet. Hash bucketing can be an effective tool for mitigating +other types of write skew as well, such as monotonically increasing values. + +As an advanced optimization, you can create a table with more than one +hash bucket component, as long as the column sets included in each are disjoint, +and all hashed columns are part of the primary key. The total number of tablets +created will be the product of the hash bucket counts. For example, the above +`metrics` table could be created with two hash bucket components, one over the +`time` column with 4 buckets, and one over the `metric` and `host` columns with +8 buckets. The total number of tablets will be 32. The advantage of using two +separate hash bucket components is that scans which specify equality constraints +on the `metric` and `host` columns will be able to skip 7/8 of the total +tablets, leaving a total of just 4 tablets to scan. + +WARNING: This optimization is not yet implemented. See +<> for details. + +[[hash-and-range]] +=== Hash Bucketing and Range Partitioning + +Hash bucketing can be combined with range partitioning. 
Adding hash bucketing to +a range partitioned table has the effect of parallelizing operations that would +otherwise operate sequentially over the range. The total number of tablets is +the product of the number of hash buckets and the number of split rows plus one. + +[[alter-schema]] +== Schema Alterations + +You can alter a table's schema in the following ways: + +- Rename the table +- Rename, add, or drop columns +- Rename (but not drop) primary key columns + +You cannot modify the partition schema after table creation. + +[[known-limitations]] +== Known Limitations + +Kudu currently has some known limitations that may factor into schema design: + +Immutable Primary Keys:: Kudu does not allow you to update the primary key of a + row after insertion. + +Non-alterable Primary Key:: Kudu does not allow you to alter the primary key + columns after table creation. + +Non-alterable Partition Schema:: Kudu does not allow you to alter the + partition schema after table creation. + +Partition Pruning:: When tables use hash buckets, the Java and C++ clients do +not yet use scan predicates to prune tablets for scans over these tables. In the +future, specifying an equality predicate on all columns in the hash bucket +component will limit the scan to only the tablets corresponding to the hash +bucket. + +Tablet Splitting:: You currently cannot split or merge tablets after table +creation. You must create the appropriate number of tablets in the +partition schema at table creation. As a workaround, you can copy the contents +of one table to another by using a `CREATE TABLE AS SELECT` statement or creating +an empty table and using an `INSERT` query with `SELECT` in the predicate to +populate the new table. 
diff --git a/docs/style_guide.adoc b/docs/style_guide.adoc new file mode 100644 index 000000000000..bab3842f79e3 --- /dev/null +++ b/docs/style_guide.adoc @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +[[documentation_style_guide]] += Apache Kudu (incubating) Documentation Style Guide + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +This document gives you the information you need to get started contributing to Kudu +documentation. For code contribution guidelines, see +link:contributing.html[Contributing to Kudu]. + +== Asciidoc +Kudu documentation is written in link:https://en.wikipedia.org/wiki/AsciiDoc[Asciidoc] +and compiled into HTML and output using the link:http://asciidoctor.org/[Asciidoctor] +toolchain. This provides several advantages. Among them: + +- Asciidoc is a superset of Markdown, so if you already know Markdown you can get +started right away. +- Github includes support for Asciidoc in its Atom editor, as well as real-time +simplified HTML rendering. +- Patch submissions are small and easy to review. 
+ +== Code Standards +Within reason, try to adhere to these standards: + +- 100 or fewer columns per line +- 2 spaces rather than tabs for indentation +- No more than 4 nested levels in the documentation if possible: `(Document -> Chapter +-> Section -> Subsection)` +- When possible, provide the language that a code listing is in, using the +`[source,]` macro. for example, `[source,sql]` +- In general, do not indent Asciidoc, as indentation is significant. Code listings +are one exception. + +== Building Documentation +To build the documentation locally, you need the Asciidoctor Ruby application. To build the +entire Kudu documentation set, change to the `docs/` directory and run: +[source,bash] +---- +asciidoctor -d book -D docs *.adoc +---- +This builds the HTML output in a new _docs/_ directory within the current directory. +Some content, such as the per-daemon configuration reference files, is not populated +during a local build. + +To view the HTML, open _docs/index.html_ in your local browser. + +You can also build only a single chapter. such as _introduction.adoc_, by passing its name instead. + +== Asciidoc Style Guide +Asciidoc supports a lot of syntax that we do not need to use. When possible, stick +with the following, adapted from the +link:https://hbase.apache.org/book.html#_hbase_reference_guide_style_guide_and_cheat_sheet[HBase Reference Guide]: + +.AsciiDoc Cheat Sheet +[cols="1,1,a",options="header"] +|=== +| Element Type | Desired Rendering | How to do it +| A paragraph | a paragraph | Just type some text with a blank line at the top and bottom. +| Add line breaks within a paragraph without adding blank lines | Manual line breaks | This will break + at the plus sign. 
Or prefix the whole paragraph with a line containing '[%hardbreaks]' +| Give a title to anything | Colored italic bold differently-sized text | .MyTitle (no space between the period and the words) on the line before the thing to be titled +| In-Line Code or commands | monospace | \`text` +| In-line literal content (things to be typed exactly as shown) | bold mono | \*\`typethis`* +| In-line replaceable content (things to substitute with your own values) | bold italic mono | \*\_typesomething_* +| Code blocks with highlighting | monospace, highlighted, preserve space | +........ +[source,java] +---- + myAwesomeCode() { +} +---- +........ +| Code block included from a separate file | included just as though it were part of the main file | +................ +[source,ruby] +---- +\include::path/to/app.rb[] +---- +................ +| Include only part of a separate file | Similar to Javadoc | See http://asciidoctor.org/docs/user-manual/#by-tagged-regions +| File names, directory names, new terms | italic | \_hbase-default.xml_ +| External naked URLs | A link with the URL as link text | +---- +http://www.google.com +---- + +| External URLs with text | A link with arbitrary link text | +---- +link:http://www.google.com[Google] +---- + +| Create an internal anchor to cross-reference | not rendered | +---- +[[anchor_name]] +---- +| Cross-reference an existing anchor using its default title| an internal hyperlink using the element title if available, otherwise using the anchor name | +---- +<> +---- +| Cross-reference an existing anchor using custom text | an internal hyperlink using arbitrary text | +---- +<> +---- +| A block image | The image with alt text | +---- +image::sunset.jpg[Alt Text] +---- +(put the image in the src/main/site/resources/images directory) +| An inline image | The image with alt text, as part of the text flow | +---- +image:sunset.jpg [Alt Text] +---- +(only one colon) +| Link to a remote image | show an image hosted elsewhere | +---- 
+image::http://inkscape.org/doc/examples/tux.svg[Tux,250,350] +---- +(or `image:`) +| Add dimensions or a URL to the image | depends | inside the brackets after the alt text, specify width, height and/or link="http://my_link.com" +| A footnote | subscript link which takes you to the footnote | +---- +Some text.footnote:[The footnote text.] +---- +| A note or warning with no title | The admonition image followed by the admonition | +---- +NOTE: My note here +---- + +---- +WARNING: My warning here +---- +| A complex note | The note has a title and/or multiple paragraphs and/or code blocks or lists, etc | +........ +.The Title +[NOTE] +==== +Here is the note text. +Everything until the second set of four equals signs +is part of the note. +---- +some source code +---- +==== +........ +| Bullet lists | bullet lists | +---- +* list item 1 +---- +(see http://asciidoctor.org/docs/user-manual/#unordered-lists) +| Numbered lists | numbered list | +---- +. list item 2 +---- +(see http://asciidoctor.org/docs/user-manual/#ordered-lists) +| Checklists | Checked or unchecked boxes | +Checked: +---- +- [*] +---- +Unchecked: +---- +- [ ] +---- +| Multiple levels of lists | bulleted or numbered or combo | +---- +. Numbered (1), at top level +* Bullet (2), nested under 1 +* Bullet (3), nested under 1 +. Numbered (4), at top level +* Bullet (5), nested under 4 +** Bullet (6), nested under 5 +- [x] Checked (7), at top level +---- +| Labelled lists / variablelists | a list item title or summary followed by content | +---- +Title:: content + +Title:: + content +---- +|GUI menu cascades | bold text with arrows to show levels | Use an ASCII arrow. +........ +*File -> Print* +........ +renders like *File -> Print* +| Sidebars, quotes, or other blocks of text | a block of text, formatted differently from the default | Delimited using different delimiters, see http://asciidoctor.org/docs/user-manual/#built-in-blocks-summary. Some of the examples above use delimiters like \...., ----,====. 
+........ +[example] +==== +This is an example block. +==== + +[source] +---- +This is a source block. +---- + +[note] +==== +This is a note block. +==== + +[quote] +____ +This is a quote block. +____ +........ + +*If you want to insert literal Asciidoc content that keeps being interpreted, when in doubt, use eight dots as the delimiter at the top and bottom.* +| Nested Sections | chapter, section, sub-section, etc | +---- += Book (or chapter if the chapter can be built alone, see leveloffset below) + +== Chapter (or section if the chapter is standalone) + +=== Section (or subsection, etc) + +==== Subsection +---- + +and so on up to 6 levels (think carefully about going deeper than 4 levels, maybe you can just titled paragraphs or lists instead). Note that you can include a book inside another book by adding the `:leveloffset:+1` macro directive directly before your include, and resetting it to 0 directly after. See the _book.adoc_ source for examples, as this is how this guide handles chapters. *Don't do it for prefaces, glossaries, appendixes, or other special types of chapters.* + +| Include one file from another | Content is included as though it were inline | + +---- +include::[/path/to/file.adoc] +---- + +For plenty of examples. see _docs/docs.adoc_. +| A table | a table | See http://asciidoctor.org/docs/user-manual/#tables. Generally rows are separated by newlines and columns by pipes +| Comment out a single line | A line is skipped during rendering | +`+//+ This line won't show up` +| Comment out a block | A section of the file is skipped during rendering | +---- +//// +Nothing between the slashes will show up. +//// +---- +| Highlight text for review | text shows up with yellow background | +---- +Test between #hash marks# is highlighted yellow. 
+---- +|=== + diff --git a/docs/support/jekyll-templates/document.html.erb b/docs/support/jekyll-templates/document.html.erb new file mode 100644 index 000000000000..c5e9ed3f9902 --- /dev/null +++ b/docs/support/jekyll-templates/document.html.erb @@ -0,0 +1,133 @@ +<% +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +%> +--- +title: <%= doctitle(:sanitize => true) || (attr 'untitled-label') %> +layout: default +active_nav: docs +last_updated: '<%= %(#{attr 'last-update-label'} #{attr :docdatetime}) %>' +--- + +<% +case attr 'source-highlighter' +when 'coderay' + if (attr 'coderay-css', 'class') == 'class' + if @safe >= Asciidoctor::SafeMode::SECURE || (attr? :linkcss) %> +<% + else %> +<%= Asciidoctor::Stylesheets.embed_coderay_stylesheet %> +<% + end + end +when 'pygments' + if (attr 'pygments-css', 'class') == 'class' + if @safe >= Asciidoctor::SafeMode::SECURE || (attr? :linkcss) %> +<% + else %> +<%= Asciidoctor::Stylesheets.embed_pygments_stylesheet(attr 'pygments-style') %> +<% + end + end +when 'highlightjs' %> + + + +<% +when 'prettify' %> + + +<% +end %> + + + +<% +unless footnotes.empty? || !footnotes? || attr?(:nofootnotes) %> +
+
<% + footnotes.each do |fn| %> +
+ <%= fn.index %>. <%= fn.text %> +
<% + end %> +
<% +end #footnotes +%> + diff --git a/docs/support/scripts/Gemfile b/docs/support/scripts/Gemfile new file mode 100644 index 000000000000..cc67e629fc63 --- /dev/null +++ b/docs/support/scripts/Gemfile @@ -0,0 +1,13 @@ +source 'https://rubygems.org' + +# We need to use versions that can run on CentOS 6, which ships Ruby 1.8.7. +gem 'thread_safe' +gem 'redcarpet', '~> 2.3.0' +gem 'jekyll', '~> 1.5.1' +gem 'therubyracer' # V8 runtime installer for Jekyll +gem 'ref', '~> 1.0' +gem 'json' +gem 'rdoc' + +gem 'asciidoctor' +gem 'tilt' diff --git a/docs/support/scripts/Gemfile.lock b/docs/support/scripts/Gemfile.lock new file mode 100644 index 000000000000..0d43a614ee5e --- /dev/null +++ b/docs/support/scripts/Gemfile.lock @@ -0,0 +1,73 @@ +GEM + remote: https://rubygems.org/ + specs: + asciidoctor (1.5.3) + blankslate (2.1.2.4) + classifier (1.3.4) + fast-stemmer (>= 1.0.0) + colorator (0.1) + commander (4.1.6) + highline (~> 1.6.11) + fast-stemmer (1.0.2) + ffi (1.9.10) + highline (1.6.21) + jekyll (1.5.1) + classifier (~> 1.3) + colorator (~> 0.1) + commander (~> 4.1.3) + liquid (~> 2.5.5) + listen (~> 1.3) + maruku (= 0.7.0) + pygments.rb (~> 0.5.0) + redcarpet (~> 2.3.0) + safe_yaml (~> 1.0) + toml (~> 0.1.0) + json (1.8.3) + libv8 (3.16.14.13) + liquid (2.5.5) + listen (1.3.1) + rb-fsevent (>= 0.9.3) + rb-inotify (>= 0.9) + rb-kqueue (>= 0.2) + maruku (0.7.0) + parslet (1.5.0) + blankslate (~> 2.0) + posix-spawn (0.3.11) + pygments.rb (0.5.4) + posix-spawn (~> 0.3.6) + yajl-ruby (~> 1.1.0) + rb-fsevent (0.9.6) + rb-inotify (0.9.5) + ffi (>= 0.5.0) + rb-kqueue (0.2.4) + ffi (>= 0.5.0) + rdoc (4.2.0) + json (~> 1.4) + redcarpet (2.3.0) + ref (1.0.5) + safe_yaml (1.0.4) + therubyracer (0.12.2) + libv8 (~> 3.16.14.0) + ref + thread_safe (0.3.5) + tilt (2.0.1) + toml (0.1.2) + parslet (~> 1.5.0) + yajl-ruby (1.1.0) + +PLATFORMS + ruby + +DEPENDENCIES + asciidoctor + jekyll (~> 1.5.1) + json + rdoc + redcarpet (~> 2.3.0) + ref (~> 1.0) + therubyracer + thread_safe + tilt + 
+BUNDLED WITH + 1.10.6 diff --git a/docs/support/scripts/make_docs.sh b/docs/support/scripts/make_docs.sh new file mode 100755 index 000000000000..8c9480458ba6 --- /dev/null +++ b/docs/support/scripts/make_docs.sh @@ -0,0 +1,208 @@ +#!/bin/bash +######################################################################## +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This script runs after Kudu binaries are built. It does: +# 1. For each binary, run $binary --helpxml and save the output to an XML file +# 2. For each generated XML file, run it through xsltproc two times: +# a. Once to capture "supported" (stable) flags +# b. Once to capture "unsupported" (evolving or experimental) flags +# 3. For each generated Asciidoc file, include it in either configuration_reference.adoc +# or configuration_reference_unsupported.adoc +# +# Usage: make_docs.sh +######################################################################## +set -e + +usage() { + echo usage: "$0 --build_root [--site ]" +} + +while [[ $# > 0 ]] ; do + arg=$1 + case $arg in + --help) + usage + exit 1 + ;; + --build_root) + BUILD_ROOT=$2 + shift + shift + ;; + --site|-s) + SITE=$2 + if [ -z "$SITE" ]; then + usage + exit 1 + fi + shift + shift + if [ ! 
-d "$SITE"/.git ] || [ ! -d "$SITE/_layouts/" ]; then + echo "path $SITE doesn't appear to be the root of a git checkout " + echo "of the Kudu site. Expected to find .git/ and _layouts/ directories" + exit 1 + fi + SITE=$(cd $SITE && pwd) + OUTPUT_DIR=$SITE/docs + ;; + --no-jekyll) + NO_JEKYLL=1 + shift + ;; + *) + echo unknown argument: $arg + exit 1 + esac +done + +if [ -z "$BUILD_ROOT" ]; then + usage + exit 1 +fi + +if [ -z "$SITE" ]; then + OUTPUT_DIR=$BUILD_ROOT/docs +fi + +GEN_DOC_DIR=$BUILD_ROOT/gen-docs +SOURCE_ROOT=$(cd $(dirname $0)/../../..; pwd) + +if ! which ruby > /dev/null; then + echo "ruby must be installed in order to build the docs." + exit 1 +fi + +DOCS_SCRIPTS="$SOURCE_ROOT/docs/support/scripts" + +# We must set GEM_PATH because bundler depends on it to find its own libraries. +export GEM_PATH="$BUILD_ROOT/gems" +echo GEM_PATH=$GEM_PATH + +export PATH="$GEM_PATH/bin:$PATH" +echo PATH="$GEM_PATH/bin:$PATH" + +BUNDLE="$GEM_PATH/bin/bundle" + +echo "Locally installing ruby gems needed to build docs." +if [ ! -x "$BUNDLE" ]; then + set -x + gem install --no-ri --no-rdoc -q --install-dir "$GEM_PATH" bundler + set +x +fi + +set -x +cd "$BUILD_ROOT" +cp $DOCS_SCRIPTS/Gemfile . +cp $DOCS_SCRIPTS/Gemfile.lock . +$BUNDLE install --no-color --path "$GEM_PATH" +set +x + +# We need the xsltproc package. +for requirement in "xsltproc"; do + if ! which $requirement > /dev/null; then + echo "$requirement is required, but cannot be found. Make sure it is in your path." + exit 1 + fi +done + +mkdir -p "$OUTPUT_DIR" "$GEN_DOC_DIR" + +# Create config flag references for each of the binaries below +binaries=("kudu-master" \ + "kudu-tserver") + +for binary in ${binaries[@]}; do + echo "Running $binary --helpxml" + + ( + # Reset environment to avoid affecting the default flag values. + for var in $(env | awk -F= '{print $1}' | egrep -i 'KUDU|GLOG'); do + echo "unset $var" + eval "unset $var" + done + + # Create the XML file.
+ # This command exits with a nonzero value. + $BUILD_ROOT/bin/$binary --helpxml > ${GEN_DOC_DIR}/$(basename $binary).xml || true + ) + + # Create the supported config reference + xsltproc \ + --stringparam binary $binary \ + --stringparam support-level stable \ + -o $GEN_DOC_DIR/${binary}_configuration_reference.adoc \ + $SOURCE_ROOT/docs/support/xsl/gflags_to_asciidoc.xsl \ + ${GEN_DOC_DIR}/$binary.xml + INCLUSIONS_SUPPORTED+="include::${binary}_configuration_reference.adoc[leveloffset=+1]\n" + + # Create the unsupported config reference + xsltproc \ + --stringparam binary $binary \ + --stringparam support-level unsupported \ + -o $GEN_DOC_DIR/${binary}_configuration_reference_unsupported.adoc \ + $SOURCE_ROOT/docs/support/xsl/gflags_to_asciidoc.xsl \ + ${GEN_DOC_DIR}/$binary.xml + INCLUSIONS_UNSUPPORTED+="include::${binary}_configuration_reference_unsupported.adoc[leveloffset=+1]\n" +done + +# Add the includes to the configuration reference files, replacing the template lines +cp $SOURCE_ROOT/docs/configuration_reference* $GEN_DOC_DIR/ +sed -i "s#@@CONFIGURATION_REFERENCE@@#${INCLUSIONS_SUPPORTED}#" ${GEN_DOC_DIR}/configuration_reference.adoc +sed -i "s#@@CONFIGURATION_REFERENCE@@#${INCLUSIONS_UNSUPPORTED}#" ${GEN_DOC_DIR}/configuration_reference_unsupported.adoc + +# If we're generating the web site, pass the template which causes us +# to generate Jekyll templates instead of full HTML. +if [ -n "$SITE" ]; then + TEMPLATE_FLAG="-T $SOURCE_ROOT/docs/support/jekyll-templates" +else + TEMPLATE_FLAG="" +fi + +bundle exec asciidoctor -d book $TEMPLATE_FLAG \ + $SOURCE_ROOT/docs/*.adoc ${GEN_DOC_DIR}/*.adoc -D "$OUTPUT_DIR" + +mkdir -p "$OUTPUT_DIR/images" +cp $SOURCE_ROOT/docs/images/* "$OUTPUT_DIR/images/" + + +echo +echo ---------------------------- +echo "Docs built in $OUTPUT_DIR." + +# If we're building the site, try to run Jekyll for them to make +# it a bit easier to quickly preview the results. 
+if [ -n "$SITE" ] && [ -z "$NO_JEKYLL" ]; then + # We need to generate a config file which fakes the "github.url" property + # so that relative links within the site work. + BASE_URL="file://$SITE/_site/" + TMP_CONFIG=$(mktemp --suffix=.yml) + trap "rm $TMP_CONFIG" EXIT + printf "github:\n url: %s" "$BASE_URL" > $TMP_CONFIG + + # Now rebuild the site itself. + echo Attempting to re-build via Jekyll... + bundle exec jekyll build --source "$SITE" --config "$TMP_CONFIG" + + # Output the URL so it's easy to click on from the terminal. + echo ---------------------- + echo Rebuild successful. View your site at + echo $BASE_URL/index.html + echo ---------------------- +fi diff --git a/docs/support/scripts/make_site.sh b/docs/support/scripts/make_site.sh new file mode 100755 index 000000000000..486ce11e9719 --- /dev/null +++ b/docs/support/scripts/make_site.sh @@ -0,0 +1,82 @@ +#!/bin/bash +######################################################################## +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This script generates site documentation and Javadocs. 
+# +# Usage: make_site.sh +######################################################################## +set -e + +BUILD_TYPE=release +SOURCE_ROOT=$(cd $(dirname $0)/../../..; pwd) +BUILD_ROOT="$SOURCE_ROOT/build/$BUILD_TYPE" +SITE_OUTPUT_DIR="$BUILD_ROOT/site" +set -x + +cd "$SOURCE_ROOT" + +# Build Kudu thirdparty +$SOURCE_ROOT/build-support/enable_devtoolset.sh $SOURCE_ROOT/thirdparty/build-if-necessary.sh +echo "Successfully built third-party dependencies." + +# Build the binaries so we can auto-generate the command-line references +mkdir -p "$BUILD_ROOT" +cd "$BUILD_ROOT" +rm -rf CMakeCache CMakeFiles/ +$SOURCE_ROOT/build-support/enable_devtoolset.sh \ + $SOURCE_ROOT/thirdparty/installed/bin/cmake \ + -DNO_TESTS=1 \ + -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ + $SOURCE_ROOT +make -j$(getconf _NPROCESSORS_ONLN) + +# Check out the gh-pages repo into $SITE_OUTPUT_DIR +git clone -q $(git config --get remote.origin.url) --reference $SOURCE_ROOT -b gh-pages --depth 1 "$SITE_OUTPUT_DIR" + +# Build the docs using the styles from the Jekyll site +rm -Rf "$SITE_OUTPUT_DIR/docs" +$SOURCE_ROOT/docs/support/scripts/make_docs.sh --build_root $BUILD_ROOT --site "$SITE_OUTPUT_DIR" +if [ -f "$SITE_OUTPUT_DIR/docs/index.html" ]; then + echo "Successfully built docs." +else + echo "Docs failed to build." + exit 1 +fi + +cd "$SOURCE_ROOT/java" +mvn clean install -DskipTests +mvn clean javadoc:aggregate + +if [ -f "$SOURCE_ROOT/java/target/site/apidocs/index.html" ]; then + echo "Successfully built Javadocs." +else + echo "Javadocs failed to build." 
+ exit 1 +fi + +rm -Rf "$SITE_OUTPUT_DIR/apidocs" +cp -au "$SOURCE_ROOT/java/target/site/apidocs" "$SITE_OUTPUT_DIR/" + +cd "$SITE_OUTPUT_DIR" +SITE_ARCHIVE="$SITE_OUTPUT_DIR/website_archive.zip" +zip -rq "$SITE_ARCHIVE" docs apidocs + +echo "Generated web site at $SITE_OUTPUT_DIR" +echo "Docs zip generated at $SITE_ARCHIVE" diff --git a/docs/support/xsl/gflags_to_asciidoc.xsl b/docs/support/xsl/gflags_to_asciidoc.xsl new file mode 100644 index 000000000000..ef79f88a20ed --- /dev/null +++ b/docs/support/xsl/gflags_to_asciidoc.xsl @@ -0,0 +1,141 @@ + + + + + + + + + + + + + + + + + + +//// +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +//// + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 2 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + + +[[_stable]] += `` Flags + +== Stable Flags + +Flags tagged `stable` and not `advanced` are safe to use for common +configuration tasks. 
+ + +[[_]] +=== `--` + + + +[cols="1h,3d", width="50%"] +|=== +| Type | +| Default | ``none +| Tags | +|=== +{nbsp} + + + + + +[[_stable_advanced]] +== Stable, Advanced Flags + +Flags tagged `stable` and `advanced` are supported, but should be considered +"expert" options and should be used carefully and after thorough testing. + + +[[_]] +=== `--` + + + +[cols="1h,3d", width="50%"] +|=== +| Type | +| Default | ``none +| Tags | +|=== +{nbsp} + + + +''' + + + + + +[[_unsupported]] += `` Unsupported Flags + +Flags marked `advanced` or `experimental` and not marked `stable`, or flags with no stability tag, are *unsupported* and are included +for informational purposes only. They are subject to change or be removed without notice. + + +[[_]] +== `--` + + + +[cols="1h,3d", width="50%"] +|=== +| Type | +| Default | ``none +| Tags | +|=== + + +''' + + + + diff --git a/docs/transaction_semantics.adoc b/docs/transaction_semantics.adoc new file mode 100644 index 000000000000..b2c51c4a049b --- /dev/null +++ b/docs/transaction_semantics.adoc @@ -0,0 +1,240 @@ +[[installation]] += Transaction Semantics in Apache Kudu (incubating) + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 3 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +**** +This is a brief introduction to Kudu's transaction and consistency semantics. For an +in-depth technical exposition of most of what is mentioned here, and why it is correct, +see the technical report <<1>>. +**** + +Kudu's transactional semantics and architecture are inspired by state-of-the-art +systems such as Spanner <<2>> and Calvin <<3>>. Kudu builds upon decades of database +research. The core philosophy is to make the lives of developers easier by providing transactions with +simple, strong semantics, without sacrificing performance or the ability to tune to different +requirements. + +Kudu is designed to eventually be fully ACID, however, multi-tablet transactions are not +yet implemented. 
As such, this discussion focuses on single-tablet write operations, and only +briefly touches multi-tablet reads. Eventually Kudu will support fully strict-serializable +semantics. In fact it already does in a limited context, but not all corner cases are covered +as this is still a work in progress. + +Kudu currently allows the following operations: + +* *Write operations* are sets of rows to be inserted, updated, or deleted in the storage +engine, in a single tablet with multiple replicas. Write operations do not have separate +"read sets" i.e. they do not scan existing data before performing the write. Each write +is only concerned with previous state of the rows that are about to change. +Writes are not "committed" explicitly by the user. Instead, they are committed automatically +by the system, after completion. + +* *Scans* are read operations that can traverse multiple tablets and read information +with some consistency or correctness guarantees. Scans can perform time-travel reads, i.e. +the user is able to set a scan timestamp in the past and get back results that reflect +the state of the storage engine at that point in time. + +[NOTE] +.Before We Begin +==== +* The term _timestamp_ is mentioned several times to illustrate the +functionality, but _timestamp_ is an internal concept mostly invisible to users, +except when setting timestamp on a `KuduScanner`. + +* We generally refer to methods and classes of the _async java_ client. While the {cpp} +client mostly has analogous methods and classes, parity between the APIs is still +a work in progress. At times, we may refer specifically to the {cpp} client. +==== + +== Single tablet write operations + +Kudu employs _Multiversion Concurrency Control (MVCC)_ and the _Raft consensus_ algorithm <<4>>. +Each write operation in Kudu must go through the tablet's leader. + +. The leader acquires all locks for the rows that it will change. +. 
The leader assigns the write a timestamp before the write is submitted for +replication. This timestamp will be the write's "tag" in MVCC. +. After a majority of replicas acknowledges the change, the actual rows are changed. +. After the changes are complete, they are made visible to concurrent writes +and reads, atomically. + +All replicas of a tablet observe the same order of operations and if a write +operation is assigned timestamp _n_ and changes row _x_, a second write operation +at timestamp _m > n_ is guaranteed to see the new value of _x_. + +This strict ordering of lock acquisition and timestamp assignment is enforced to be +consistent across all replicas of a tablet through consensus. Therefore, write operations +are totally ordered with regard to clock-assigned timestamps, relative to other writes +in the same tablet. In other words, writes have strict-serializable semantics, +though in an admittedly limited context. See this +link:http://www.bailis.org/blog/linearizability-versus-serializability[blog post] +for a little more context regarding what these semantics mean. + +While Isolated and Durable in an ACID sense, write operations are not yet fully Atomic. +The failure of a single write in a batch operation does not roll back the operation, +but produces per-row errors. + +== Writing to multiple tablets + +Kudu does not yet support transactions that span multiple tablets. However, +consistent snapshot reads are possible (with caveats in the current implementation) +as explained below. + +Writes to a Kudu client are optionally buffered in memory until they are flushed and sent +to the server. During the client's session flush, the rows for each tablet are batched +together, and sent to the tablet server which hosts the leader replica of the tablet. +Since there are no inter-tablet transactions, each of these batches represents a single, +independent write operation with its own timestamp. 
+However you have the option to impose some constraints on the assigned timestamps +and on how writes to different tablets can be observed by clients. + +Kudu, like Spanner, was designed to be externally consistent <<5>>, preserving consistency +even when operations span multiple tablets and even multiple data centers. In practice this +means that, if a write operation changes item _x_ at tablet _A_, and a following write +operation changes item _y_ at tablet _B_, you might want to enforce that if +the change to _y_ is observed, the change to _x_ must also be observed. There +are many examples where this can be important. For example, if Kudu is +storing clickstreams for further analysis, and two clicks follow each other but +are stored in different tablets, subsequent clicks should be assigned subsequent +timestamps so that the causal relationship between them is captured. + +.`CLIENT_PROPAGATED` Consistency +Kudu's default external consistency mode is called `CLIENT_PROPAGATED`. +See <<1>> for an extensive explanation on how it works. In brief, this mode causes writes +from _a single client_ to be automatically externally consistent. In this mode, writes are only externally +consistent from the perspective of a single client. In the clickstream scenario above, +if the two clicks are submitted by different client instances, the application must +manually propagate timestamps from one client to the other for the causal relationship +to be captured. + +`CLIENT_PROPAGATED` consistency is currently only available on the java client +and is exposed through the `AsyncKuduClient#getLastPropagatedTimestamp()` and +`AsyncKuduClient#setLastPropagatedTimestamp()` methods. + +.`Commit Wait` Consistency +Kudu also implements an experimental implementation of an external consistency +model used in Google's Spanner , called `Commit Wait`. `Commit Wait` works +by tightly synchronizing the clocks on all machines in the cluster. 
Then, when a +write occurs, timestamps are assigned and the results of the write are not made +visible until enough time has passed so that no other machine in the cluster could +possibly assign a lower timestamp to a following write. + +For the moment, Kudu's experimental implementation of `Commit Wait` is only available +in the java client, by setting `KuduSession#setExternalConsistencyMode()` +to `COMMIT_WAIT`. When using this mode, the latency of writes is tightly +tied to the accuracy of clocks on all the cluster hosts, and using this mode +with loose clock synchronization causes writes to take a long time to complete or even time +out. See <>. + +== Read Operations (Scans) + +Scans are read operations performed by clients that may span one or more rows across +one or more tablets. When a server receives a scan, it takes a snapshot of the MVCC +state and then proceeds in one of two ways depending on the read mode selected by +the user by means of the `KuduScanner::SetReadMode()` method. + +`READ_LATEST`:: This is the default read mode. The server takes a snapshot of +the MVCC state and proceeds with the read immediately. Reads in this mode only yield +'Read Committed' isolation. + +`READ_AT_SNAPSHOT`:: In this read mode, scans are consistent and repeatable. A +timestamp for the snapshot is selected either by the server, or set +explicitly by the user through `KuduScanner::SetSnapshotMicros()`. Explicitly setting +the timestamp is recommended; see <>. The server waits until this +timestamp is 'safe' (until all write operations that have a lower timestamp have +completed and are visible). This delay, coupled with an external consistency method, +will eventually allow Kudu to have full `strict-serializable` semantics for reads +and writes. This is still a work in progress and some anomalies are still possible +(see <>). Only scans in this mode can be fault-tolerant. 
+ +Selecting between read modes requires balancing the trade-offs and making a choice +that fits your workload. For instance, a reporting application that needs to +scan the entire database might need to perform careful accounting operations, so that +scan may need to be fault-tolerant, but probably doesn't require a to-the-microsecond +up-to-date view of the database. In that case, you might choose 'READ_AT_SNAPSHOT' +and select a timestamp that is a few seconds in the past when the scan starts. On +the other hand, a machine learning workload that is not ingesting the whole data +set and is already statistical in nature might not require the scan to be repeatable, +so you might choose `READ_LATEST` instead. + +[[known_issues]] +== Known Issues and Limitations + +We plan to fix the following issues. Monitor the linked JIRAs for progress. + +== Serialization +There are several gaps and corner cases that prevent Kudu from being fully strictly-serializable +in some situations, at the moment. Below are the details and next, some recommendations. + +[[known_issues_scans]] +=== Scans +* Support for `COMMIT_WAIT` is experimental and requires careful tuning of the + time-synchronization protocol, such as NTP (Network Time Protocol). +* Support for externally-consistent write modes is only fully available in the Java + API at this time. (see link:https://issues.cloudera.org/browse/KUDU-1187[KUDU-1187]) +* In some rare circumstances, the `READ_AT_SNAPSHOT` scan mode may yield anomalous, + non-repeatable reads. + ** When scanning a replica at a snapshot, the replica may not have received all the writes + from the leader and might reply immediately, yielding a non-repeatable read (see link:https://issues.cloudera.org/browse/KUDU-798[KUDU-798]). + ** On a leader change, scans at a snapshot whose timestamp is beyond the last + write may also yield non-repeatable reads (see link:https://issues.cloudera.org/browse/KUDU-1188[KUDU-1188]). See <> for a workaround. 
+ ** When performing multi-tablet scans without selecting a snapshot timestamp (see link:https://issues.cloudera.org/browse/KUDU-1189[KUDU-1189]). +* Impala scans are currently performed as `READ_LATEST` and have no consistency + guarantees. + + +=== Writes +* When a write fails with a timeout or is aborted, it is possible that it may + actually be committed. Kudu is currently missing a way to determine if a particular + timed-out write ever actually succeeded. On a retry, the write may succeed but + may also generate errors if some rows have already been inserted, or deleted (see link:https://issues.cloudera.org/browse/KUDU-568[KUDU-568]). + +* When a delete is performed to a row that has already been flushed, and the row is reinserted + all history is reset (see link:https://issues.cloudera.org/browse/KUDU-237[KUDU-237]). + This is not the case for rows that haven't been flushed yet and still reside in memory. + +[[recommendations]] +== Recommendations + +* If repeatable snapshot reads are a requirement, use `READ_AT_SNAPSHOT` + with a timestamp that is slightly in the past (between 2-5 seconds, ideally). + This will circumvent the anomalies described in <>. Even when the + anomalies have been addressed, back-dating the timestamp will always make scans + faster, since they are unlikely to block. + +* If external consistency is a requirement and you decide to use `Commit Wait`, the + time-synchronization protocol needs to be tuned carefully. Each transaction will wait + 2x the maximum clock error at the time of execution, which is usually in the 100 msec. + to 1 sec. range with the default settings, maybe more. Thus, transactions would take at least + 200 msec. to 2 sec. to complete when using the default settings and may even time out. + + ** A local server should be used as a time server. 
+ We've performed experiments using the default + NTP time source available in a Google Compute Engine data center and were able to obtain + a reasonably tight max error bound, usually varying between 12-17 milliseconds. + + ** The following parameters should be adjusted in `/etc/ntp.conf` to tighten the maximum error: + - `server my_server.org iburst minpoll 1 maxpoll 8` + - `tinker dispersion 500` + - `tinker allan 0` + +IMPORTANT: The above parameters minimize `maximum error` at the expense of `estimated error`, +the latter might be orders of magnitude above its "normal" value. These parameters also +may place a greater load on the time server, since they make the servers poll much more +frequently. + +[bibliography] +.References +- [[[1]]] David Alves, Todd Lipcon and Vijay Garg. Technical Report: HybridTime - Accessible Global Consistency with High Clock Uncertainty. April, 2014. http://pdsl.ece.utexas.edu/david/hybrid-time-tech-report-01.pdf +- [[[2]]] James C. Corbett, Jeffrey Dean, Michael Epstein, Andrew Fikes, Christopher Frost, J. J. Furman, Sanjay Ghemawat, Andrey Gubarev, Christopher Heiser, Peter Hochschild, Wilson Hsieh, Sebastian Kanthak, Eugene Kogan, Hongyi Li, Alexander Lloyd, Sergey Melnik, David Mwaura, David Nagle, Sean Quinlan, Rajesh Rao, Lindsay Rolig, Yasushi Saito, Michal Szymaniak, Christopher Taylor, Ruth Wang, and Dale Woodford. 2012. Spanner: Google's globally-distributed database. In Proceedings of the 10th USENIX conference on Operating Systems Design and Implementation (OSDI'12). USENIX Association, Berkeley, CA, USA, 251-264. +- [[[3]]] Alexander Thomson, Thaddeus Diamond, Shu-Chun Weng, Kun Ren, Philip Shao, and Daniel J. Abadi. 2012. Calvin: fast distributed transactions for partitioned database systems. In Proceedings of the 2012 ACM SIGMOD International Conference on Management of Data (SIGMOD '12). ACM, New York, NY, USA, 1-12.
DOI=10.1145/2213836.2213838 http://doi.acm.org/10.1145/2213836.2213838 +- [[[4]]] Diego Ongaro and John Ousterhout. 2014. In search of an understandable consensus algorithm. In Proceedings of the 2014 USENIX conference on USENIX Annual Technical Conference (USENIX ATC'14), Garth Gibson and Nickolai Zeldovich (Eds.). USENIX Association, Berkeley, CA, USA, 305-320. +- [[[5]]] Kwei-Jay Lin, "Consistency issues in real-time database systems," in System Sciences, 1989. Vol.II: Software Track, Proceedings of the Twenty-Second Annual Hawaii International Conference on , vol.2, no., pp.654-661 vol.2, 3-6 Jan 1989 doi: 10.1109/HICSS.1989.48069 diff --git a/docs/troubleshooting.adoc b/docs/troubleshooting.adoc new file mode 100644 index 000000000000..23d026779124 --- /dev/null +++ b/docs/troubleshooting.adoc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +[[troubleshooting]] += Apache Kudu (incubating) Troubleshooting + +:author: Kudu Team +:imagesdir: ./images +:icons: font +:toc: left +:toclevels: 2 +:doctype: book +:backend: html5 +:sectlinks: +:experimental: + +== Issues starting the Master or Tablet Server + +[[req_hole_punching]] +=== Errors During Hole Punching Test +Kudu requires hole punching capabilities in order to be efficient. Hole punching support +depends upon your operating system kernel version and local filesystem implementation. + + - RHEL or CentOS 6.4 or later, patched to kernel version of 2.6.32-358 or later. + Unpatched RHEL or CentOS 6.4 does not include a kernel with support for hole punching. + - Ubuntu 14.04 includes version 3.13 of the Linux kernel, which supports hole punching. + - Newer versions of the EXT4 or XFS file systems support hole punching, but EXT3 does + not. Older versions of XFS that do not support hole punching return an `EOPNOTSUPP` + (operation not supported) error. Older versions of either EXT4 or XFS that do + not support hole punching cause Kudu to emit an error message such as the following + and to fail to start: ++ +---- +Error during hole punch test. The log block manager requires a +filesystem with hole punching support such as ext4 or xfs. On el6, +kernel version 2.6.32-358 or newer is required. To run without hole +punching (at the cost of some efficiency and scalability), reconfigure +Kudu with --block_manager=file. Refer to the Kudu documentation for more +details. Raw error message follows. +---- + +Without hole punching support, the log block manager is unsafe to use. It won't +ever delete blocks, and will consume ever more space on disk. + +If you can't use hole punching in your environment, you can still +try Kudu. Enable the file block manager instead of the log block manager by +adding the `--block_manager=file` flag to the commands you use to start the master +and tablet servers.
The file block manager does not scale as well as the log block +manager. + +WARNING: The file block manager is known to scale and perform poorly, and should +only be used for small-scale evaluation and development. + +[[ntp]] +=== NTP clock synchronization +For the master and tablet server daemons, the server's clock must be synchronized using NTP. +In addition, the *maximum clock error* (not to be mistaken for the estimated error) +must be below a configurable threshold. The default value is 10 seconds, but it can be set with the flag +`--max_clock_sync_error_usec`. + +If NTP is not installed, or if the clock is reported as unsynchronized, Kudu will not +start, and will emit a message such as: +---- +F0924 20:24:36.336809 14550 hybrid_clock.cc:191 Couldn't get the current time: Clock unsynchronized. Status: Service unavailable: Error reading clock. Clock considered unsynchronized. +---- +If NTP is installed and synchronized, but the maximum clock error is too high, +the user will see a message such as: +---- +Sep 17, 8:13:09.873 PM FATAL hybrid_clock.cc:196 Couldn't get the current time: Clock synchronized, but error: 11130000, is past the maximum allowable error: 10000000 +---- +or +---- +Sep 17, 8:32:31.135 PM FATAL tablet_server_main.cc:38 Check failed: _s.ok() Bad status: Service unavailable: Cannot initialize clock: Cannot initialize HybridClock. Clock synchronized but error was too high (11711000 us). +---- + +TIP: If NTP is installed, the user can monitor the synchronization status by running +`ntptime`. The relevant value is what is reported for `maximum error`.
+ +To install NTP, use the appropriate command for your operating system: +[cols="1,1", options="header"] +|=== +| OS | Command +| Debian/Ubuntu | `sudo apt-get install ntp` +| RHEL/CentOS | `sudo yum install ntp` +|=== + +If NTP is installed but not running, start it using one of these commands: +[cols="1,1", options="header"] +|=== +| OS | Command +| Debian/Ubuntu | `sudo service ntp restart` +| RHEL/CentOS | `sudo /etc/init.d/ntpd restart` +|=== + +TIP: NTP requires a network connection and may take a few minutes to synchronize the clock. +In some cases a spotty network connection may make NTP report the clock as unsynchronized. +A common, though temporary, workaround for this is to restart NTP with one of the commands above. + +If the clock is being reported as synchronized by NTP, but the maximum error is too high, +the user can increase the threshold to a higher value by setting the above +mentioned flag. For example to increase the possible maximum error to +20 seconds the flag should be set like: `--max_clock_sync_error_usec=20000000` + +== Troubleshooting Performance Issues + +[[kudu_tracing]] +=== Kudu Tracing + +The `kudu-master` and `kudu-tserver` daemons include built-in tracing support +based on the open source +link:https://www.chromium.org/developers/how-tos/trace-event-profiling-tool[Chromium Tracing] +framework. You can use tracing to help diagnose latency issues or other problems +on Kudu servers. + +==== Accessing the tracing interface +The tracing interface is accessed via a web browser as part of the +embedded web server in each of the Kudu daemons. + +.Tracing Interface URLs +|=== +| Daemon | URL +| Tablet Server | http://tablet-server-1.example.com:8050/tracing.html +| Master | http://master-1.example.com:8051/tracing.html +|=== + +WARNING: The tracing interface is known to work in recent versions of Google Chrome. +Other browsers may not work as expected. 
+ +==== Collecting a trace + +After navigating to the tracing interface, click the *Record* button on the top left corner +of the screen. When beginning to diagnose a problem, start by selecting all categories. +Click *Record* to begin recording a trace. + +During the trace collection, events are collected into an in-memory ring buffer. +This ring buffer is fixed in size, so it will eventually fill up to 100%. However, new events +are still being collected while older events are being removed. While recording the trace, +trigger the behavior or workload you are interested in exploring. + +After collecting for several seconds, click *Stop*. The collected trace will be +downloaded and displayed. Use the *?* key to display help text about using the tracing +interface to explore the trace. + +==== Saving a trace + +You can save collected traces as JSON files for later analysis by clicking *Save* +after collecting the trace. To load and analyze a saved JSON file, click *Load* +and choose the file. + +=== RPC Timeout Traces + +If client applications are experiencing RPC timeouts, the Kudu tablet server +`WARNING` level logs should contain a log entry which includes an RPC-level trace. For example: + +---- +W0922 00:56:52.313848 10858 inbound_call.cc:193] Call kudu.consensus.ConsensusService.UpdateConsensus +from 192.168.1.102:43499 (request call id 3555909) took 1464ms (client timeout 1000). 
+W0922 00:56:52.314888 10858 inbound_call.cc:197] Trace: +0922 00:56:50.849505 (+ 0us) service_pool.cc:97] Inserting onto call queue +0922 00:56:50.849527 (+ 22us) service_pool.cc:158] Handling call +0922 00:56:50.849574 (+ 47us) raft_consensus.cc:1008] Updating replica for 2 ops +0922 00:56:50.849628 (+ 54us) raft_consensus.cc:1050] Early marking committed up to term: 8 index: 880241 +0922 00:56:50.849968 (+ 340us) raft_consensus.cc:1056] Triggering prepare for 2 ops +0922 00:56:50.850119 (+ 151us) log.cc:420] Serialized 1555 byte log entry +0922 00:56:50.850213 (+ 94us) raft_consensus.cc:1131] Marking committed up to term: 8 index: 880241 +0922 00:56:50.850218 (+ 5us) raft_consensus.cc:1148] Updating last received op as term: 8 index: 880243 +0922 00:56:50.850219 (+ 1us) raft_consensus.cc:1195] Filling consensus response to leader. +0922 00:56:50.850221 (+ 2us) raft_consensus.cc:1169] Waiting on the replicates to finish logging +0922 00:56:52.313763 (+1463542us) raft_consensus.cc:1182] finished +0922 00:56:52.313764 (+ 1us) raft_consensus.cc:1190] UpdateReplicas() finished +0922 00:56:52.313788 (+ 24us) inbound_call.cc:114] Queueing success response +---- + +These traces can give an indication of which part of the request was slow. Please +include them in bug reports related to RPC latency outliers. + +=== Kernel Stack Watchdog Traces + +Each Kudu server process has a background thread called the Stack Watchdog, which +monitors the other threads in the server in case they have blocked for +longer-than-expected periods of time. These traces can indicate operating system issues +or bottlenecked storage. 
+ +When the watchdog thread identifies a case of thread blockage, it logs an entry +in the `WARNING` log like the following: + +---- +W0921 23:51:54.306350 10912 kernel_stack_watchdog.cc:111] Thread 10937 stuck at /data/kudu/consensus/log.cc:505 for 537ms: +Kernel stack: +[] do_get_write_access+0x29d/0x520 [jbd2] +[] jbd2_journal_get_write_access+0x31/0x50 [jbd2] +[] __ext4_journal_get_write_access+0x38/0x80 [ext4] +[] ext4_reserve_inode_write+0x73/0xa0 [ext4] +[] ext4_mark_inode_dirty+0x4c/0x1d0 [ext4] +[] ext4_dirty_inode+0x40/0x60 [ext4] +[] __mark_inode_dirty+0x3b/0x160 +[] file_update_time+0xf2/0x170 +[] __generic_file_aio_write+0x230/0x490 +[] generic_file_aio_write+0x88/0x100 +[] ext4_file_write+0x61/0x1e0 [ext4] +[] do_sync_readv_writev+0xfb/0x140 +[] do_readv_writev+0xd6/0x1f0 +[] vfs_writev+0x46/0x60 +[] sys_pwritev+0xa2/0xc0 +[] system_call_fastpath+0x16/0x1b +[] 0xffffffffffffffff + +User stack: + @ 0x3a1ace10c4 (unknown) + @ 0x1262103 (unknown) + @ 0x12622d4 (unknown) + @ 0x12603df (unknown) + @ 0x8e7bfb (unknown) + @ 0x8f478b (unknown) + @ 0x8f55db (unknown) + @ 0x12a7b6f (unknown) + @ 0x3a1b007851 (unknown) + @ 0x3a1ace894d (unknown) + @ (nil) (unknown) +---- + +These traces can be useful for diagnosing root-cause latency issues when they are caused by systems +below Kudu, such as disk controllers or file systems. 
diff --git a/docs/whitepaper/.gitignore b/docs/whitepaper/.gitignore new file mode 100644 index 000000000000..3eec47da1211 --- /dev/null +++ b/docs/whitepaper/.gitignore @@ -0,0 +1,3 @@ +*.aux +*.log +*.pdf diff --git a/docs/whitepaper/kudu.bib b/docs/whitepaper/kudu.bib new file mode 100644 index 000000000000..14df4c268333 --- /dev/null +++ b/docs/whitepaper/kudu.bib @@ -0,0 +1,384 @@ +@inproceedings{impala, + author = {Marcel Kornacker and + Alexander Behm and + Victor Bittorf and + Taras Bobrovytsky and + Casey Ching and + Alan Choi and + Justin Erickson and + Martin Grund and + Daniel Hecht and + Matthew Jacobs and + Ishaan Joshi and + Lenni Kuff and + Dileep Kumar and + Alex Leblang and + Nong Li and + Ippokratis Pandis and + Henry Robinson and + David Rorke and + Silvius Rus and + John Russell and + Dimitris Tsirogiannis and + Skye Wanderman{-}Milne and + Michael Yoder}, + title = {Impala: {A} Modern, Open-Source {SQL} Engine for {Hadoop}}, + booktitle = {{CIDR} 2015, Seventh Biennial Conference on Innovative Data Systems + Research, Asilomar, CA, USA, January 4-7, 2015, Online Proceedings}, + year = {2015}, + crossref = {DBLP:conf/cidr/2015}, + url = {http://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper28.pdf}, + timestamp = {Mon, 23 Mar 2015 09:07:18 +0100}, + biburl = {http://dblp.uni-trier.de/rec/bib/conf/cidr/KornackerBBBCCE15}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@proceedings{DBLP:conf/cidr/2015, + title = {{CIDR} 2015, Seventh Biennial Conference on Innovative Data Systems + Research, Asilomar, CA, USA, January 4-7, 2015, Online Proceedings}, + publisher = {www.cidrdb.org}, + year = {2015}, + timestamp = {Mon, 23 Mar 2015 08:57:30 +0100}, + biburl = {http://dblp.uni-trier.de/rec/bib/conf/cidr/2015}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@inproceedings{spark, + author = {Zaharia, Matei and Chowdhury, Mosharaf and Franklin, Michael J. 
and Shenker, Scott and Stoica, Ion}, + title = {Spark: Cluster Computing with Working Sets}, + booktitle = {Proceedings of the 2Nd USENIX Conference on Hot Topics in Cloud Computing}, + series = {HotCloud'10}, + year = {2010}, + location = {Boston, MA}, + pages = {10--10}, + numpages = {1}, + url = {http://dl.acm.org/citation.cfm?id=1863103.1863113}, + acmid = {1863113}, + publisher = {USENIX Association}, + address = {Berkeley, CA, USA}, +} + +@article{mapreduce, + author = {Dean, Jeffrey and Ghemawat, Sanjay}, + title = {MapReduce: Simplified Data Processing on Large Clusters}, + journal = {Commun. ACM}, + issue_date = {January 2008}, + volume = {51}, + number = {1}, + month = jan, + year = {2008}, + issn = {0001-0782}, + pages = {107--113}, + numpages = {7}, + url = {http://doi.acm.org/10.1145/1327452.1327492}, + doi = {10.1145/1327452.1327492}, + acmid = {1327492}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@article{tail_at_scale, + author = {Dean, Jeffrey and Barroso, Luiz Andr{\'e}}, + title = {The Tail at Scale}, + journal = {Commun. 
ACM}, + issue_date = {February 2013}, + volume = {56}, + number = {2}, + month = feb, + year = {2013}, + issn = {0001-0782}, + pages = {74--80}, + numpages = {7}, + url = {http://doi.acm.org/10.1145/2408776.2408794}, + doi = {10.1145/2408776.2408794}, + acmid = {2408794}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@inproceedings{hdfs, + author = {Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert}, + title = {The {Hadoop Distributed File System}}, + booktitle = {Proceedings of the 2010 IEEE 26th Symposium on Mass Storage Systems and Technologies (MSST)}, + series = {MSST '10}, + year = {2010}, + isbn = {978-1-4244-7152-2}, + pages = {1--10}, + numpages = {10}, + url = {http://dx.doi.org/10.1109/MSST.2010.5496972}, + doi = {10.1109/MSST.2010.5496972}, + acmid = {1914427}, + publisher = {IEEE Computer Society}, + address = {Washington, DC, USA}, +} + + +@article{cassandra, + author = {Lakshman, Avinash and Malik, Prashant}, + title = {Cassandra: A Decentralized Structured Storage System}, + journal = {SIGOPS Oper. Syst. Rev.}, + issue_date = {April 2010}, + volume = {44}, + number = {2}, + month = apr, + year = {2010}, + issn = {0163-5980}, + pages = {35--40}, + numpages = {6}, + url = {http://doi.acm.org/10.1145/1773912.1773922}, + doi = {10.1145/1773912.1773922}, + acmid = {1773922}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@article{bigtable, + author = {Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C. and Wallach, Deborah A. and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E.}, + title = {Bigtable: A Distributed Storage System for Structured Data}, + journal = {ACM Trans. Comput. 
Syst.}, + issue_date = {June 2008}, + volume = {26}, + number = {2}, + month = jun, + year = {2008}, + issn = {0734-2071}, + pages = {4:1--4:26}, + articleno = {4}, + numpages = {26}, + url = {http://doi.acm.org/10.1145/1365815.1365816}, + doi = {10.1145/1365815.1365816}, + acmid = {1365816}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {Large-Scale Distributed Storage}, +} + +@inproceedings{spanner, + author = {Corbett, James C. and Dean, Jeffrey and Epstein, Michael and Fikes, Andrew and Frost, Christopher and Furman, J. J. and Ghemawat, Sanjay and Gubarev, Andrey and Heiser, Christopher and Hochschild, Peter and Hsieh, Wilson and Kanthak, Sebastian and Kogan, Eugene and Li, Hongyi and Lloyd, Alexander and Melnik, Sergey and Mwaura, David and Nagle, David and Quinlan, Sean and Rao, Rajesh and Rolig, Lindsay and Saito, Yasushi and Szymaniak, Michal and Taylor, Christopher and Wang, Ruth and Woodford, Dale}, + title = {Spanner: Google's Globally-distributed Database}, + booktitle = {Proceedings of the 10th USENIX Conference on Operating Systems Design and Implementation}, + series = {OSDI'12}, + year = {2012}, + isbn = {978-1-931971-96-6}, + location = {Hollywood, CA, USA}, + pages = {251--264}, + numpages = {14}, + url = {http://dl.acm.org/citation.cfm?id=2387880.2387905}, + acmid = {2387905}, + publisher = {USENIX Association}, + address = {Berkeley, CA, USA}, +} + +@inproceedings{raft, + author = {Ongaro, Diego and Ousterhout, John}, + title = {In Search of an Understandable Consensus Algorithm}, + booktitle = {Proceedings of the 2014 USENIX Conference on USENIX Annual Technical Conference}, + series = {USENIX ATC'14}, + year = {2014}, + isbn = {978-1-931971-10-2}, + location = {Philadelphia, PA}, + pages = {305--320}, + numpages = {16}, + url = {http://dl.acm.org/citation.cfm?id=2643634.2643666}, + acmid = {2643666}, + publisher = {USENIX Association}, + address = {Berkeley, CA, USA}, +} + +@article{raft_refloated, + author = {Howard, 
Heidi and Schwarzkopf, Malte and Madhavapeddy, Anil and Crowcroft, Jon}, + title = {Raft Refloated: Do We Have Consensus?}, + journal = {SIGOPS Oper. Syst. Rev.}, + issue_date = {January 2015}, + volume = {49}, + number = {1}, + month = jan, + year = {2015}, + issn = {0163-5980}, + pages = {12--21}, + numpages = {10}, + url = {http://doi.acm.org/10.1145/2723872.2723876}, + doi = {10.1145/2723872.2723876}, + acmid = {2723876}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@phdthesis(diego_thesis, +author = {Diego Ongaro}, +title = {Consensus: Bridging Theory and Practice}, +year = 2014, +school = {Stanford University}, +url = {https://ramcloud.stanford.edu/~ongaro/thesis.pdf} +) + +@article{bloom_filter, + author = {Bloom, Burton H.}, + title = {Space/Time Trade-offs in Hash Coding with Allowable Errors}, + journal = {Commun. ACM}, + issue_date = {July 1970}, + volume = {13}, + number = {7}, + month = jul, + year = {1970}, + issn = {0001-0782}, + pages = {422--426}, + numpages = {5}, + url = {http://doi.acm.org/10.1145/362686.362692}, + doi = {10.1145/362686.362692}, + acmid = {362692}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {hash addressing, hash coding, retrieval efficiency, retrieval trade-offs, scatter storage, searching, storage efficiency, storage layout}, +} + +@INPROCEEDINGS{x100, + author = {Peter Boncz and Marcin Zukowski and Niels Nes}, + title = {MonetDB/X100: Hyper-pipelining query execution}, + booktitle = {In CIDR}, + year = {2005} +} + +@INPROCEEDINGS{abadi, +author={Abadi, D.J. and Myers, D.S. and DeWitt, D.J. and Madden, S.R.}, +booktitle={Data Engineering, 2007. ICDE 2007. 
IEEE 23rd International Conference on}, +title={Materialization Strategies in a Column-Oriented DBMS}, +year={2007}, +pages={466-475}, +keywords={database management systems;column-oriented DBMS;column-stores;database architecture;materialization strategy;read-mostly query workload;row-stores;Data structures;Data warehouses;Database systems;Relational databases}, +doi={10.1109/ICDE.2007.367892}, +month={April},} + +@inproceedings{masstree, + author = {Mao, Yandong and Kohler, Eddie and Morris, Robert Tappan}, + title = {Cache Craftiness for Fast Multicore Key-value Storage}, + booktitle = {Proceedings of the 7th ACM European Conference on Computer Systems}, + series = {EuroSys '12}, + year = {2012}, + isbn = {978-1-4503-1223-3}, + location = {Bern, Switzerland}, + pages = {183--196}, + numpages = {14}, + url = {http://doi.acm.org/10.1145/2168836.2168855}, + doi = {10.1145/2168836.2168855}, + acmid = {2168855}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {in-memory, key-value, multicore, persistent}, +} + +@article{fractured_mirrors, + author = {Ramamurthy, Ravishankar and DeWitt, David J. and Su, Qi}, + title = {A Case for Fractured Mirrors}, + journal = {The VLDB Journal}, + issue_date = {August 2003}, + volume = {12}, + number = {2}, + month = aug, + year = {2003}, + issn = {1066-8888}, + pages = {89--101}, + numpages = {13}, + url = {http://dx.doi.org/10.1007/s00778-003-0093-1}, + doi = {10.1007/s00778-003-0093-1}, + acmid = {950483}, + publisher = {Springer-Verlag New York, Inc.}, + address = {Secaucus, NJ, USA}, + keywords = {Data placement, Disk mirroring, Vertical partitioning}, +} + +@article{bplus_tree, + author = {Comer, Douglas}, + title = {Ubiquitous B-Tree}, + journal = {ACM Comput. 
Surv.}, + issue_date = {June 1979}, + volume = {11}, + number = {2}, + month = jun, + year = {1979}, + issn = {0360-0300}, + pages = {121--137}, + numpages = {17}, + url = {http://doi.acm.org/10.1145/356770.356776}, + doi = {10.1145/356770.356776}, + acmid = {356776}, + publisher = {ACM}, + address = {New York, NY, USA}, +} + +@article{gfs, + author = {Ghemawat, Sanjay and Gobioff, Howard and Leung, Shun-Tak}, + title = {The Google File System}, + journal = {SIGOPS Oper. Syst. Rev.}, + issue_date = {December 2003}, + volume = {37}, + number = {5}, + month = oct, + year = {2003}, + issn = {0163-5980}, + pages = {29--43}, + numpages = {15}, + url = {http://doi.acm.org/10.1145/1165389.945450}, + doi = {10.1145/1165389.945450}, + acmid = {945450}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {clustered storage, data storage, fault tolerance, scalability}, +} + +@TECHREPORT{hybridtime, +AUTHOR = {David Alves, Todd Lipcon and Vijay Garg}, +TITLE = {HybridTime - Accessible Global Consistency with High Clock Uncertainty}, +INSTITUTION = {UT Austin, Cloudera Inc.}, +MONTH = {April}, +YEAR = {2014}, +PAGES = {}, +URL = {http://pdsl.ece.utexas.edu/david/hybrid-time-tech-report-01.pdf}, +CONTACT = {dralves@utexas.edu} +} + +@misc{parquet, + title = {Apache {Parquet}}, + howpublished = {http://parquet.apache.org}, +} + +@misc{avro, + title = {Apache {Avro}}, + howpublished = {http://avro.apache.org}, +} + +@misc{hbase, + title = {Apache {HBase}}, + howpublished = {http://hbase.apache.org}, +} + +@misc{phoenix, + title = {Apache {Phoenix}}, + howpublished = {http://phoenix.apache.org}, +} + +@misc{riak, + title = {Riak}, + howpublished = {https://github.com/basho/riak} +} + +@misc(llvm, + title = {{LLVM}}, + howpublished = {http://www.llvm.org} +) + +@misc(mongodb, + title = {{MongoDB}}, + howpublished = {http://www.mongodb.org} +) + +@misc(bitshuffle, + title = {Bitshuffle}, + howpublished = {https://github.com/kiyo-masui/bitshuffle}, + author = {Kiyoshi 
Masui} +) + +@misc(orcfile, + title = {ORCFile}, + howpublished = {https://cwiki.apache.org/confluence/display/Hive/LanguageManual+ORC} +) \ No newline at end of file diff --git a/docs/whitepaper/kudu.tex b/docs/whitepaper/kudu.tex new file mode 100644 index 000000000000..750bf0f4f7ca --- /dev/null +++ b/docs/whitepaper/kudu.tex @@ -0,0 +1,1069 @@ +% Confidential Cloudera Information, covered by NDA. +\documentclass{vldb} +\usepackage{graphicx} +\usepackage{verbatim} +\begin{document} + +\title{Kudu: Storage for Fast Analytics on Fast Data +\titlenote{\bf +This document is a draft. Edits will be made and re-published to the Kudu +open source project web site on a rolling basis.} +} +% Authors +%----------------- +\numberofauthors{1} +\author{ + \alignauthor Todd Lipcon, David Alves, Dan Burkert, Jean-Daniel Cryans, Adar Dembo, Mike Percy, Silvius Rus, Dave Wang, + Matteo Bertozzi, Colin Patrick McCabe, Andrew Wang\\ + \affaddr{Cloudera, inc.}\\ +} + +\date{28 September 2015} + +\maketitle + +\begin{abstract} +Kudu is an open source storage engine for structured data which supports low-latency random access +together with efficient analytical access patterns. Kudu distributes data using +horizontal partitioning and replicates each partition using Raft consensus, providing low +mean-time-to-recovery and low tail latencies. Kudu is designed within the context of the Hadoop +ecosystem and supports many modes of access via tools such as Cloudera Impala\cite{impala}, +Apache Spark\cite{spark}, and MapReduce\cite{mapreduce}. +\end{abstract} + +\section{Introduction} +\label{sec:introduction} +In recent years, explosive growth in the amount of data being generated and captured by +enterprises has resulted in the rapid adoption of open source technology which is able to +store massive data sets at scale and at low cost. 
In particular, the Hadoop ecosystem has become a focal +point for such ``big data'' workloads, because many traditional open source database systems have +lagged in offering a scalable alternative. + +Structured storage in the Hadoop ecosystem has typically been achieved in two ways: for static data sets, +data is typically stored on HDFS using binary data formats such as Apache Avro\cite{avro} or +Apache Parquet\cite{parquet}. However, neither HDFS nor these formats has any provision for updating +individual records, or for efficient random access. Mutable data sets are typically stored in +semi-structured stores such as Apache HBase\cite{hbase} or Apache Cassandra\cite{cassandra}. These systems allow for low-latency +record-level reads and writes, but lag far behind the static file formats in terms of sequential +read throughput for applications such as SQL-based analytics or machine learning. + +The gap between the analytic performance offered by static data sets on HDFS and the +low-latency row-level random access capabilities of HBase and Cassandra has required +practitioners to develop complex architectures when the need for both access patterns +arises in a single application. In particular, many of Cloudera's customers +have developed data pipelines which involve streaming ingest and updates in HBase, followed +by periodic jobs to export tables to Parquet for later analysis. Such architectures +suffer several downsides: + +\begin{enumerate} +\item Application architects must write complex code to manage the + flow and synchronization of data between the two systems. +\item Operators must manage consistent backups, security policies, + and monitoring across multiple distinct systems. +\item The resulting architecture may exhibit significant lag between the arrival + of new data into the HBase ``staging area'' and the time when the new data + is available for analytics. 
+\item In the real world, systems often need to accommodate late-arriving data, corrections + on past records, or privacy-related deletions on data that has already been + migrated to the immutable store. Achieving this may involve expensive rewriting + and swapping of partitions and manual intervention. +\end{enumerate} + +Kudu is a new storage system designed and implemented from the ground up to fill this gap between +high-throughput sequential-access storage systems such as HDFS\cite{hdfs} and low-latency random-access +systems such as HBase or Cassandra. While these existing systems continue to hold advantages in some +situations, Kudu offers a ``happy medium'' alternative that can dramatically simplify the +architecture of many common workloads. In particular, Kudu offers a simple API for row-level +inserts, updates, and deletes, while providing table scans at throughputs similar to Parquet, +a commonly-used columnar format for static data. + +This paper introduces the architecture of Kudu. Section \ref{sec:high-level} describes the system +from a user's point of view, introducing the data model, APIs, and operator-visible constructs. +Section \ref{sec:architecture} describes the architecture of Kudu, including how it partitions and +replicates data across nodes, recovers from faults, and performs common operations. Section +\ref{sec:storage} explains how Kudu stores its data on disk in order to combine fast random +access with efficient analytics. Section \ref{sec:integration} discusses integrations between Kudu +and other Hadoop ecosystem projects. Section \ref{sec:benchmarks} presents preliminary performance +results in synthetic workloads. + +\section{Kudu at a high level} +\label{sec:high-level} + +\subsection{Tables and schemas} +From the perspective of a user, Kudu is a storage system for tables of structured data. +A Kudu cluster may have any number of tables, each of which has a well-defined {\em schema} +consisting of a finite number of columns.
Each such column has a name, type (e.g.\ {\tt INT32} +or {\tt STRING}) and optional nullability. Some ordered subset of those columns +is specified to be the table's {\em primary key}. The primary key enforces a uniqueness constraint +(at most one row may have a given primary key tuple) and acts as the sole index by which +rows may be efficiently updated or deleted. This data model is familiar to users of relational +databases, but differs from many other distributed datastores such as Cassandra, +MongoDB\cite{mongodb}, Riak\cite{riak}, BigTable\cite{bigtable}, etc. + +As with a relational database, the user must define the schema of a table at the time of creation. +Attempts to insert data into undefined columns result in errors, as do violations of the primary +key uniqueness constraint. The user may at any time issue an {\em alter table} command to add +or drop columns, with the restriction that primary key columns cannot be dropped. + +Our decision to explicitly specify types for columns instead of using a NoSQL-style ``everything +is bytes'' is motivated by two factors: +\begin{enumerate} +\item Explicit types allow us to use type-specific columnar encodings such as bit-packing for + integers. +\item Explicit types allow us to expose SQL-like metadata to other systems such as commonly used + business intelligence or data exploration tools. +\end{enumerate} + +Unlike most relational databases, Kudu does not currently offer secondary indexes or uniqueness constraints +other than the primary key. Currently, Kudu requires that every table has a primary key defined, +though we anticipate that a future version will add automatic generation of surrogate keys. + +\subsection{Write operations} +After creating a table, the user mutates the table using {\tt Insert}, {\tt Update}, and {\tt Delete} +APIs.
In all cases, the user must fully specify a primary key -- predicate-based deletions or updates +must be handled by a higher-level access mechanism (see section {\ref{sec:integration}}). + +Kudu offers APIs in Java and C++, with experimental support for Python. The APIs allow +precise control over batching and asynchronous error handling to amortize the cost of round trips +when performing bulk data operations (such as data loads or large updates). Currently, Kudu does +not offer any multi-row transactional APIs: each mutation conceptually executes as its own transaction, +despite being automatically batched with other mutations for better performance. Modifications within +a single row are always executed atomically across columns. + +\subsection{Read operations} +Kudu offers only a {\tt Scan} operation to retrieve data from a table. On a scan, the user may add +any number of predicates to filter the results. Currently, we offer only two types of predicates: +comparisons between a column and a constant value, and composite primary key ranges. These +predicates are interpreted both by the client API and the server to efficiently cull the amount of +data transferred from the disk and over the network. +% TODO: provide an example schema and some API usage + +In addition to applying predicates, the user may specify a projection for a scan. A projection +consists of a subset of columns to be retrieved. Because Kudu's on-disk storage is columnar, +specifying such a subset can substantially improve performance for typical analytic workloads. + +\subsection{Other APIs} +In addition to data path APIs, the Kudu client library offers other useful functionality. In particular, +the Hadoop ecosystem gains much of its performance by scheduling for data locality. Kudu provides +APIs for callers to determine the mapping of data ranges to particular servers to aid distributed +execution frameworks such as Spark, MapReduce, or Impala in scheduling. 
+ +\subsection{Consistency Model} +Kudu provides clients the choice between two consistency modes. The default consistency mode is snapshot +consistency. A scan is guaranteed to yield a snapshot with no anomalies in which causality would be +violated\footnote{In the current beta release of Kudu, this consistency support is not yet fully implemented. +However, this paper describes the architecture and design of the system, despite the presence of +some known consistency-related bugs.}. As such, it also guarantees read-your-writes consistency from a single client. + +By default, Kudu does not provide an {\em external consistency} guarantee. That is to say, if a client +performs a write, then communicates with a different client via an external mechanism (e.g.\ a message +bus) and the other performs a write, the causal dependence between the two writes is not captured. +A third reader may see a snapshot which contains the second write without the first. + +Based on our experiences supporting other systems such as HBase that also do not offer external consistency +guarantees, this is sufficient for many use cases. However, for users who require a stronger +guarantee, Kudu offers the option to manually propagate timestamps between clients: after performing +a write, the user may ask the client library for a timestamp token. This token may be propagated to +another client through the external channel, and passed to the Kudu API on the other side, thus preserving +the causal relationship between writes made across the two clients. + +If propagating tokens is too complex, Kudu optionally uses {\em commit-wait} as in Spanner\cite{spanner}. After +performing a write with commit-wait enabled, the client may be delayed for a period of time to +ensure that any later write will be causally ordered correctly. 
Absent specialized time-keeping +hardware, this can introduce significant latencies in writes (100-1000ms with default NTP configurations), +so we anticipate that a minority of users will take advantage of this option. We also note +that, since the publication of Spanner, several data stores have started to take advantage +of real-time clocks. Given this, it is plausible that within a few years, cloud providers +will offer tight global time synchronization as a +differentiating service. + +The assignment of operation timestamps is based on a clock algorithm termed {\em HybridTime}\cite{hybridtime}. +Please refer to the cited article for details. + +\subsection{Timestamps} + +Although Kudu uses timestamps internally to implement concurrency control, Kudu does not allow +the user to manually set the timestamp of a write operation. This differs from systems such as +Cassandra and HBase, which treat the timestamp of a cell as a first-class part of the data model. +In our experiences supporting users of these other systems, we have found that, while advanced users +can make effective use of the timestamp dimension, the vast majority of users find this aspect of the data model +confusing and a source of user error, especially with regard to the semantics of back-dated insertions and deletions. + +We do, however, allow the user to specify a timestamp for a read operation. This allows the user to perform +point-in-time queries in the past, as well as to ensure that different distributed tasks that together +make up a single ``query'' (e.g. as in Spark or Impala) read a consistent snapshot. + +\section{Architecture} +\label{sec:architecture} + +\subsection{Cluster roles} + +Following the design of BigTable and GFS\cite{gfs} (and their open-source analogues HBase and HDFS), Kudu +relies on a single Master server, responsible for metadata, and an arbitrary number of Tablet +Servers, responsible for data. 
The master server can be replicated for fault tolerance, supporting +very fast failover of all responsibilities in the event of an outage. Typically, all roles are deployed +on commodity hardware, with no extra requirements for master nodes. + +\subsection{Partitioning} +\label{sec:partitioning} + +As in most distributed database systems, tables in Kudu are horizontally partitioned. Kudu, like +BigTable, calls these horizontal partitions {\em tablets}. Any row may be mapped to exactly one +tablet based on the value of its primary key, thus ensuring that random access operations such as +inserts or updates affect only a single tablet. For large tables where throughput is important, we +recommend on the order of 10-100 tablets per machine. Each tablet can be tens of gigabytes. + +Unlike BigTable, which offers only key-range-based partitioning, and unlike Cassandra, which is +nearly always deployed with hash-based partitioning, Kudu supports a flexible array of partitioning +schemes. When creating a table, the user specifies a partition schema for that table. The partition schema +acts as a function which can map from a primary key tuple into a binary {\em partition key}. Each +tablet covers a contiguous range of these partition keys. Thus, a client, when performing a read or +write, can easily determine which tablet should hold the given key and route the request +accordingly. + +The partition schema is made up of zero or more {\em hash-partitioning} rules followed by an +optional {\em range-partitioning} rule: +\begin{itemize} +\item A hash-partitioning rule consists of a subset of the primary key columns and a number of + buckets. For example, as expressed in our SQL dialect, {\tt DISTRIBUTE BY HASH(hostname, ts) INTO + 16 BUCKETS}. These rules convert tuples into binary keys by first concatenating the values + of the specified columns, and then computing the hash code of the resulting string + modulo the requested number of buckets. 
This resulting bucket number is encoded as a 32-bit + big-endian integer in the resulting partition key. + +\item A range-partitioning rule consists of an ordered subset of the primary key columns. This + rule maps tuples into binary strings by concatenating the values of the specified columns + using an order-preserving encoding. +\end{itemize} + +\begin{comment} +When concatenating multiple values to form a compound partition key, it is important to ensure +that tuples are encoded uniquely and in such a way as to preserve their original lexicographic +sort order. For example, the tuple {\tt ('foo', 'bar')} must be distinguished from {\tt ('foob', 'ar')}. +To achieve this, while still allowing arbitrary binary values in user-supplied data, we insert two null +bytes after each variable-length component in the compound key, and encode null bytes in the original values +as {\tt \textbackslash x00\textbackslash x01}. +\end{comment} + +By employing these partitioning rules, users can easily trade off between query parallelism and +query concurrency based on their particular workload. For example, consider a time series +application which stores rows of the form {\tt (host, metric, time, value)} and in which inserts +are almost always done with monotonically increasing {\tt time} values. Choosing to +hash-partition by timestamp optimally spreads the insert load across all servers; however, a query +for a specific metric on a specific host during a short time range must scan all tablets, limiting +concurrency. A user might instead choose to range-partition by timestamp while adding separate +hash partitioning rules for the metric name and hostname, which would provide a good trade-off +of parallelism on write and concurrency on read. + +Though users must understand the concept of partitioning to optimally use Kudu, the details +of partition key encoding are fully transparent to the user: encoded partition keys are not exposed +in the API. 
Users always specify rows, partition split points, and key ranges using structured +row objects or SQL tuple syntax. Although this flexibility in partitioning is relatively unique +in the ``NoSQL'' space, it should be quite familiar to users and administrators of +analytic MPP database management systems. + +\subsection{Replication} +\label{sec:replication} + +In order to provide high availability and durability while running on large commodity clusters, +Kudu replicates all of its table data across multiple machines. When creating a table, +the user specifies a replication factor, typically 3 or 5, depending on the application's +availability SLAs. Kudu's master strives to ensure that the requested number of replicas are maintained +at all times (see Section \ref{sec:cluster_coordination}). + +Kudu employs the Raft\cite{raft} consensus algorithm to replicate its tablets. +In particular, Kudu uses Raft to agree upon a logical log of operations (e.g. insert/update/delete) +for each tablet. When a client wishes to perform a write, it first locates the leader replica (see +Section \ref{tablet_directory}) and sends a {\tt Write} RPC to this replica. If the client's information +was stale and the replica is no longer the leader, it rejects the request, causing the client +to invalidate and refresh its metadata cache and resend the request to the new leader. If +the replica is in fact still acting as the leader, it employs a local lock manager to serialize +the operation against other concurrent operations, picks an MVCC timestamp, and proposes +the operation via Raft to its followers. If a majority of replicas accept the write and log +it to their own local write-ahead logs\footnote{Kudu gives administrators the option of considering +a write-ahead log entry committed either after it has been written to the operating system buffer +cache, or only after an explicit {\tt fsync} operation has been performed. 
The latter provides +durability even in the event of a full datacenter outage, but decreases write performance +substantially on spinning hard disks.}, the write is considered +durably replicated and thus can be committed on all replicas. Note that there is no restriction +that the leader must write an operation to its local log before it may be committed: +this provides good latency-smoothing properties even if the leader's disk is performing poorly. + +In the case of a failure of a minority of replicas, the leader can continue to propose +and commit operations to the tablet's replicated log. If the leader itself fails, +the Raft algorithm quickly elects a new leader. By default, Kudu uses a 500-millisecond heartbeat +interval and a 1500-millisecond election timeout; thus, after a leader fails, a new leader is +typically elected within a few seconds. + +Kudu implements some minor improvements on the Raft algorithm. In particular: +\begin{enumerate} +\item As proposed in \cite{raft_refloated} we employ an exponential back-off algorithm + after a failed leader election. We found that, as we typically commit Raft's + persistent metadata to contended hard disk drives, such an extension was + necessary to ensure election convergence on busy clusters. +\item When a new leader contacts a follower whose log diverges from its own, + Raft proposes marching backward one operation at a time until discovering + the point where they diverged. Kudu instead immediately jumps back to + the last known {\em committedIndex}, which is always guaranteed to be + present on any divergent follower. This minimizes the potential number + of round trips at the cost of potentially sending redundant operations + over the network. We found this simple to implement, and it ensures that + divergent operations are aborted after a single round-trip. +% TODO: Mike -- are there any other improvements? talk about integrating +% WAL and Raft log? 
+\end{enumerate} + +Kudu does not replicate the on-disk storage of a tablet, but rather just +its operation log. The physical storage of each replica of a tablet is fully decoupled. +This yields several advantages: + +\begin{itemize} + +\item When one replica is undergoing physical-layer background operations such as flushes or compactions +(see Section \ref{sec:storage}), it is unlikely that other nodes are operating on the same tablet at the same time. Because +Raft may commit after an acknowledgment by a majority of replicas, this reduces the impact +of such physical-layer operations on the tail latencies experienced by clients for writes. +In the future, we anticipate implementing techniques such as the speculative read requests +described in \cite{tail_at_scale} to further decrease tail latencies for reads in concurrent read/write +workloads. + +\item During development, we discovered some rare race conditions in the physical storage +layer of the Kudu tablet. Because the storage layer is decoupled across replicas, none of these +race conditions resulted in unrecoverable data loss: in all cases, we were able to detect that one +replica had become corrupt (or silently diverged from the majority) and repair it. +\end{itemize} + +\subsubsection{Configuration Change} + +Kudu implements Raft configuration change following the {\em one-by-one} algorithm proposed in +\cite{diego_thesis}. In this approach, the number of voters in the Raft configuration may change +by at most one in each configuration change. In order to grow a 3-replica configuration to 5 +replicas, two separate configuration changes (3$\rightarrow$4, 4$\rightarrow$5) must be proposed +and committed. + +Kudu implements the addition of new servers through a process called {\em remote bootstrap}. +In our design, in order to add a new replica, we first add it as a new member in the +Raft configuration, even before notifying the destination server that a new replica will +be copied to it. 
When this configuration change has been committed, the current Raft leader
+replica triggers a {\tt StartRemoteBootstrap} RPC, which causes the destination server to pull a
+snapshot of the tablet data and log from the current leader. When the transfer
+is complete, the new server opens the tablet following the same process as after
+a server restart. When the new server has opened the tablet data and replayed any necessary
+write-ahead logs, it has fully replicated the state of the leader at the time it began the transfer,
+and may begin responding to Raft RPCs as a fully-functional replica.
+
+In our current implementation, new servers are added immediately as {\tt VOTER} replicas. This has
+the disadvantage that, after moving from a 3-server configuration to a 4-server configuration, three
+out of the four servers must acknowledge each operation. Because the new server is in the process of
+copying, it is unable to acknowledge operations. If another server were to crash during the
+snapshot-transfer process, the tablet would become unavailable for writes until the remote bootstrap
+finished.
+
+To address this issue, we plan to implement a {\tt PRE\_VOTER} replica state. In this
+state, the leader will send Raft updates and trigger remote bootstrap on the
+target replica, but not count it as a voter when calculating the size of the configuration's
+majority. Upon detecting that the {\tt PRE\_VOTER} replica has fully caught up to
+the current logs, the leader will automatically propose and commit another configuration change to
+transition the new replica to a full {\tt VOTER}.
+
+When removing replicas from a tablet, we follow a similar approach: the current Raft leader
+proposes an operation to change the configuration to one that does not include the node
+to be evicted. If this is committed, then the remaining nodes will no longer send messages
+to the evicted node, though the evicted node will not know that it has been removed.
When the
+configuration change is committed, the remaining nodes report the configuration
+change to the Master, which is responsible for cleaning up the orphaned replica (see
+Section \ref{sec:cluster_coordination}).
+
+\begin{comment}
+% Commented out since this section's a bit too detailed compared to the rest of the paper
+One subtlety that arises during the deletion of replicas is that, even after a replica
+has been removed from the tablet configuration, the tablet server must retain a
+tombstone record with a small amount of metadata. This is necessary to prevent a
+split-brain scenario in the following sequence:
+
+\begin{enumerate}
+\item A raft configuration is operating with nodes {\tt (A, B, C)}.
+\item Node {\tt C}, the leader, pauses for a lengthy amount of time.
+\item Node {\tt A} is elected leader, and adds a new server {\tt D}, then adds another server {\tt E}.
+\item Node {\tt D} is elected leader and removes {\tt A}, then {\tt B}, then adds {\tt F}.
+\item Node {\tt C} recovers from its pause.
+\end{enumerate}
+
+In this case, node {\tt C} may still believe itself to be leader for a short period of time, and
+send update messages to {\tt A} and {\tt B}. Upon receiving {\tt TABLET\_NOT\_FOUND} responses,
+{\tt C} may proceed to bootstrap the target nodes from itself and form a split-brain
+configuration. To combat this, upon deletion of any replica, we leave a tombstone metadata record
+which includes the Raft operation index at which the replica was removed from the configuration.
+Such a tombstoned replica will refuse to bootstrap from any leader whose log is older than
+this index.
+\end{comment}
+
+\subsection{The Kudu Master}
+
+Kudu's central master process has several key responsibilities:
+\begin{enumerate}
+\item Act as a {\em catalog manager}, keeping track of which tables and tablets exist, as
+well as their schemas, desired replication levels, and other metadata.
When tables are created, +altered, or deleted, the Master coordinates these actions across the tablets and ensures +their eventual completion. +\item Act as a {\em cluster coordinator}, keeping track of which servers in the cluster +are alive and coordinating redistribution of data after server failures. +\item Act as a {\em tablet directory}, keeping track of which tablet servers are +hosting replicas of each tablet. +\end{enumerate} + +We chose a centralized, replicated master design over a fully peer-to-peer design for simplicity of implementation, +debugging, and operations. + +\subsubsection{Catalog Manager} + +The master itself hosts a single-tablet table which is restricted from direct access by users. +The master internally writes catalog information to this tablet, while keeping a +full write-through cache of the catalog in memory at all times. Given the large amounts of +memory available on current commodity hardware, and the small amount of metadata stored per tablet, we do not +anticipate this becoming a scalability issue in the near term. If scalability becomes an issue, moving to a +paged cache implementation would be a straightforward evolution of the architecture. + +The catalog table maintains a small amount of state for each table in the system. In particular, it keeps +the current version of the table schema, the state of the table (creating, running, deleting, etc), +and the set of tablets which comprise the table. The master services a request to create +a table by first writing a table record to the catalog table indicating a {\tt CREATING} +state. Asynchronously, it selects tablet servers to host tablet replicas, creates the Master-side +tablet metadata, and sends asynchronous requests to create the replicas on the tablet servers. +If the replica creation fails or times out on a majority of replicas, the tablet can be safely deleted +and a new tablet created with a new set of replicas. 
If the Master fails in the middle +of this operation, the table record indicates that a roll-forward is necessary and the +master can resume where it left off. A similar approach is used for other operations such +as schema changes and deletion, where the Master ensures that the change is propagated to +the relevant tablet servers before writing the new state to its own storage. In all cases, the +messages from the Master to the tablet servers are designed to be idempotent, such that on +a crash and restart, they can be safely resent. + +Because the catalog table is itself persisted in a Kudu tablet, the Master supports using +Raft to replicate its persistent state to backup master processes. Currently, the +backup masters act only as Raft followers and do not serve client requests. Upon becoming +elected leader by the Raft algorithm, a backup master scans its catalog table, loads +its in-memory cache, and begins acting as an active master following the same process +as a master restart. + +\subsubsection{Cluster Coordination} +\label{sec:cluster_coordination} + +Each of the tablet servers in a Kudu cluster is statically configured with a list of host names +for the Kudu masters. Upon startup, the tablet servers register with the Masters and proceed to send +{\em tablet reports} indicating the total set of tablets which they are hosting. +The first such tablet report contains information about all tablets. All future tablet +reports are {\em incremental}, only containing reports for tablets that have been +newly created, deleted, or modified (e.g. processed a schema change or Raft configuration +change). + +A critical design point of Kudu is that, while the Master is the source of truth about +catalog information, it is only an observer of the dynamic cluster state. +The tablet servers themselves are always authoritative about the location of tablet +replicas, the current Raft configuration, the current schema version of a tablet, etc. 
+Because tablet replicas agree on all state changes via Raft, every such change +can be mapped to a specific Raft operation index in which it was committed. This allows +the Master to ensure that all tablet state updates are idempotent +and resilient to transmission delays: the Master simply compares the Raft operation index +of a tablet state update and discards it if the index is not newer than the Master's current +view of the world. + +This design choice leaves much responsibility in the hands of the tablet servers themselves. +For example, rather than detecting tablet server crashes from the Master, Kudu instead +delegates that responsibility to the Raft {\tt LEADER} replicas of any tablets with replicas +on the crashed machine. The leader keeps track +of the last time it successfully communicated with each follower, and if it has failed +to communicate for a significant period of time, it declares the follower dead and proposes +a Raft configuration change to evict the follower from the Raft configuration. When this +configuration change is successfully committed, the remaining tablet servers will +issue a tablet report to the Master to advise it of the decision made by the leader. + +In order to regain the desired replication count for the tablet, the Master selects +a tablet server to host a new replica based on its global view of the cluster. +After selecting a server, the Master {\em suggests} a configuration change to the current +leader replica for the tablet. However, the Master itself is powerless to change +a tablet configuration -- it must wait for the leader replica to propose and commit +the configuration change operation, at which point the Master is notified of the configuration +change's success via a tablet report. If the Master's suggestion failed (e.g. because the message was lost) +it will stubbornly retry periodically until successful. 
Because these operations +are tagged with the unique index of the degraded configuration, they are fully +idempotent and conflict-free, even if the Master issues several conflicting +suggestions, as might happen soon after a master fail-over. + +The master responds similarly to extra replicas of tablets. If the Master receives a tablet +report which indicates that a replica has been removed from a tablet configuration, it stubbornly +sends {\tt DeleteTablet} RPCs to the removed node until the RPC succeeds. To ensure eventual cleanup +even in the case of a master crash, the Master also sends such RPCs in response to a tablet report +which identifies that a tablet server is hosting a replica which is not in the newest committed Raft +configuration. + + +\subsubsection{Tablet Directory} +\label{tablet_directory} + +In order to efficiently perform read and write operations without intermediate network hops, +clients query the Master for tablet location information. Clients are ``thick'' and maintain +a local metadata cache which includes their most recent information about each tablet they +have previously accessed, including the tablet's partition key range and its Raft configuration. +At any point in time, the client's cache may be stale; if the client attempts to send a write +to a server which is no longer the leader for a tablet, the server will reject the request. +The client then contacts the Master to learn about the new leader. In the case that the +client receives a network error communicating with its presumed leader, it follows the same +strategy, assuming that the tablet has likely elected a new leader. + +In the future, we plan to piggy-back the {\em current} Raft configuration on the error response +if a client contacts a non-leader replica. This will prevent extra round-trips to the +master after leader elections, since typically the followers will have up-to-date information. 
+ +Because the Master maintains all tablet partition range information in memory, it scales +to a high number of requests per second, and responds with very low latency. In a 270-node +cluster running a benchmark workload with thousands of tablets, we measured the 99.99th percentile +latency of tablet location lookup RPCs at 3.2ms, with the 95th percentile at 374 microseconds +and 75th percentile at 91 microseconds. Thus, we do not anticipate that the tablet directory +lookups will become a scalability bottleneck at current target cluster sizes. If they do become a +bottleneck, we note that it is always safe to serve stale location information, and thus this +portion of the Master can be trivially partitioned and replicated across any number of machines. + +\section{Tablet storage} +\label{sec:storage} + +Within a tablet server, each tablet replica operates as an entirely separate entity, +significantly decoupled from the partitioning and replication systems described in +sections \ref{sec:partitioning} and \ref{sec:replication}. During development of +Kudu, we found that it was convenient to develop the storage layer somewhat independently +from the higher-level distributed system, and in fact many of our functional and unit +tests operate entirely within the confines of the tablet implementation. + +Due to this decoupling, we are exploring the idea of providing the ability to select +an underlying {\em storage layout} on a per-table, per-tablet or even per-replica basis -- a distributed analogue +of Fractured Mirrors, as proposed in \cite{fractured_mirrors}. However, we currently +offer only a single storage layout, described in this section. 
+ +\subsection{Overview} + +The implementation of tablet storage in Kudu addresses several goals: + +\begin{enumerate} +\item {\bf Fast columnar scans} - In order to provide analytic performance comparable to + best-of-breed immutable data formats such as Parquet and ORCFile\cite{orcfile}, it's critical + that the majority of scans can be serviced from efficiently encoded columnar data files. +\item {\bf Low-latency random updates} - In order to provide fast access to update or read + arbitrary rows, we require $O(\lg n)$ lookup complexity for random access. +\item {\bf Consistency of performance} - Based on our experiences supporting other + data storage systems, we have found that users are willing to trade off peak performance + in order to achieve predictability. +\end{enumerate} + +In order to provide these characteristics simultaneously, Kudu does not reuse any +pre-existing storage engine, but rather chooses to implement a new hybrid columnar store +architecture. + +\subsection{RowSets} + +Tablets in Kudu are themselves subdivided into smaller units called {\em RowSets}. +Some RowSets exist in memory only, termed {\em MemRowSets}, while others +exist in a combination of disk and memory, termed {\em DiskRowSets}. Any given +live (not deleted) row exists in exactly one RowSet; thus, RowSets form disjoint +sets of rows. However, note that the primary key {\em intervals} of different RowSets +may intersect. + +At any point in time, a tablet has a single MemRowSet which stores all recently-inserted +rows. Because these stores are entirely in-memory, a background thread periodically +flushes MemRowSets to disk. The scheduling of these flushes is described in further +detail in Section \ref{sec:maintenance}. + +When a MemRowSet has been selected to be flushed, a new, empty MemRowSet is swapped in to +replace it. The previous MemRowSet is written to disk, and becomes one or more DiskRowSets. 
This flush +process is fully concurrent: readers can continue to access the old MemRowSet while it is being +flushed, and updates and deletes of rows in the flushing MemRowSet are carefully tracked and rolled +forward into the on-disk data upon completion of the flush process. + +\subsection{MemRowSet Implementation} + +MemRowSets are implemented by an in-memory concurrent B-tree with optimistic +locking, broadly based off the design of MassTree\cite{masstree}, with the following +changes: +\begin{enumerate} +\item We do not support removal of elements from the tree. Instead, we use MVCC + records to represent deletions. MemRowSets eventually flush to other storage, + so we can defer removal of these records to other parts of the system. +\item Similarly, we do not support arbitrary in-place updates of records in the tree. + Instead, we allow only modifications which do not change the value's size: + this permits atomic compare-and-swap operations to append mutations to a + per-record linked list. +\item We link together leaf nodes with a {\tt next} pointer, as in the B+-tree\cite{bplus_tree}. + This improves our sequential scan performance, a critical operation. +\item We do not implement the full ``trie of trees'', but rather just a single + tree, since we are less concerned about extremely high random access throughput + compared to the original application. +\end{enumerate} + +In order to optimize for scan performance over random access, we use slightly larger internal and +leaf nodes sized at four cache-lines (256 bytes) each. + +Unlike most data in Kudu, MemRowSets store rows in a row-wise layout. This still +provides acceptable performance, since the data is always in memory. To maximize +throughput despite the choice of row storage, we utilize +SSE2 memory prefetch instructions to prefetch one leaf node ahead of our scanner, +and JIT-compile record projection operations using LLVM\cite{llvm}. 
+
+These optimizations provide significant performance boosts relative to the
+naive implementation.
+
+In order to form the {\em key} for insertion into the B-tree, we encode
+each row's primary key using an order-preserving encoding as described in
+Section \ref{sec:partitioning}. This allows efficient tree traversal using
+only {\tt memcmp} operations for comparison, and the sorted nature of the
+MemRowSet allows for efficient scans over primary key ranges or individual
+key lookups.
+
+\subsection{DiskRowSet Implementation}
+
+When MemRowSets flush to disk, they become DiskRowSets. While flushing a
+MemRowSet, we {\em roll} the DiskRowSet after each 32 MB of IO. This ensures
+that no DiskRowSet is too large, thus allowing efficient incremental compaction
+as described later in Section \ref{sec:rowset_compaction}. Because a MemRowSet
+is in sorted order, the flushed DiskRowSets will themselves also be in
+sorted order, and each rolled segment will have a disjoint interval
+of primary keys.
+
+A DiskRowSet is made up of two main components: {\em base data} and {\em delta stores}. The base
+data is a column-organized representation of the rows in the DiskRowSet. Each column is separately
+written to disk in a single contiguous block of data. The column itself is subdivided into small
+pages to allow for granular random reads, and an embedded B-tree index allows efficient seeking to
+each page based on its ordinal offset within the rowset. Column pages are encoded using a variety of
+encodings, such as dictionary encoding, bitshuffle\cite{bitshuffle}, or front coding, and are optionally
+compressed using generic binary compression schemes such as {\em LZ4},
+{\em gzip}, or {\em bzip2}. These encodings and compression options may be specified
+explicitly by the user on a per-column basis, for example to designate that a large
+infrequently-accessed text column should be gzipped, while a column that typically stores small
+integers should be bit-packed.
Several of the page formats supported by Kudu are common
+with those supported by Parquet, and our implementation shares much code with Impala's Parquet
+library.
+
+In addition to flushing columns for each of the user-specified columns in the table, we also write a
+primary key index column, which stores the encoded primary key for each row. We also flush a
+chunked Bloom filter\cite{bloom_filter} which can be used to test for the possible presence of a row based on its
+encoded primary key.
+
+Because columnar encodings are difficult to update in place, the columns within the base data
+are considered immutable once flushed. Instead, updates and deletes are tracked through
+structures termed {\em delta stores}. Delta stores are either in-memory {\em DeltaMemStores},
+or on-disk {\em DeltaFiles}. A DeltaMemStore is a concurrent B-tree which shares the implementation
+described above. A DeltaFile is a binary-typed column block. In both cases, delta stores
+maintain a mapping from {\tt (row\_offset, timestamp)} tuples to {\em RowChangeList} records.
+The row offset is simply the ordinal index of a row within the RowSet -- for example, the row with the
+lowest primary key has offset 0. The timestamp is the MVCC timestamp assigned when the operation
+was originally written. The RowChangeList is a binary-encoded list of changes to a row, for example
+indicating {\tt SET column id 3 = `foo'} or {\tt DELETE}.
+
+When servicing an update to data within a DiskRowSet, we first consult the primary key index column.
+By using its embedded B-tree index, we can efficiently seek to the page containing the target
+row. Using page-level metadata, we can determine the row offset for the first cell within that
+page. By searching within the page (e.g.\ via in-memory binary search) we can then calculate the
+target row's offset within the entire DiskRowSet. Upon determining this offset, we insert a new
+delta record into the rowset's DeltaMemStore.
+
+\subsection{Delta Flushes}
+
+Because the DeltaMemStore is an in-memory store, it has finite capacity. The same background
+process which schedules flushes of MemRowSets also schedules flushes of DeltaMemStores.
+When flushing a DeltaMemStore, a new empty store is swapped in while the existing one
+is written to disk and becomes a DeltaFile. A DeltaFile is a simple binary column
+which contains an immutable copy of the data that was previously in memory.
+
+\subsection{INSERT path}
+
+As described previously, each tablet has a single MemRowSet which holds
+recently inserted data; however, it is not sufficient to simply write all inserts directly
+to the current MemRowSet, since Kudu enforces a primary key uniqueness constraint. In other
+words, unlike many NoSQL stores, Kudu differentiates {\tt INSERT} from {\tt UPSERT}.
+
+In order to enforce the uniqueness constraint, Kudu must consult all of the existing DiskRowSets
+before inserting the new row. Because there may be hundreds or thousands of DiskRowSets per
+tablet, it is important that this be done efficiently, both by culling the number of DiskRowSets
+to consult and by making the lookup within a DiskRowSet efficient.
+
+In order to cull the set of DiskRowSets to consult on an {\tt INSERT} operation, each DiskRowSet
+stores a Bloom filter of the set of keys present. Because new keys are
+never inserted into an existing DiskRowSet, this Bloom filter is static data. We chunk the Bloom
+filter into 4KB pages, each corresponding to a small range of keys, and index those pages using
+an immutable B-tree structure. These pages as well as their index are cached in a server-wide
+LRU page cache, ensuring that most Bloom filter accesses do not require a physical disk seek.
+
+Additionally, for each DiskRowSet, we store the minimum and maximum primary key, and use these
+key bounds to index the DiskRowSets in an interval tree. This further culls
+the set of DiskRowSets to consult on any given key lookup.
A background compaction process, +described in Section \ref{sec:rowset_compaction} reorganizes DiskRowSets to improve the effectiveness +of the interval tree-based culling. + +For any DiskRowSets that are not able to be culled, we must fall back to looking up the +key to be inserted within its encoded primary key column. This is done via the embedded +B-tree index in that column, which ensures a logarithmic number of disk seeks in the worst +case. Again, this data access is performed through the page cache, ensuring that for hot +areas of key space, no physical disk seeks are needed. + + +\subsection{Read path} + +Similar to systems like X100\cite{x100}, Kudu's read path always operates in batches of rows in order to +amortize function call cost and provide better opportunities for loop unrolling and SIMD +instructions. Kudu's in-memory batch format consists of a top-level structure which contains +pointers to smaller blocks for each column being read. Thus, the batch itself is columnar in +memory, which avoids any offset calculation cost when copying from columnar on-disk stores +into the batch. + +When reading data from a DiskRowSet, Kudu first determines if a range predicate on the scan +can be used to cull the range of rows within this DiskRowSet. For example, if the scan has +set a primary key lower bound, we perform a seek within the primary key column in order +to determine a lower bound row offset; we do the same with any upper bound key. This converts +the key range predicate into a row offset range predicate, which is simpler to satisfy as it +requires no expensive string comparisons. + +Next, Kudu performs the scan one column at a time. First, it seeks the target column to the +correct row offset (0, if no predicate was provided, or the start row, if it previously +determined a lower bound). Next, it copies cells from the source column into our row batch +using the page-encoding specific decoder. 
Last, it consults the delta stores to see if +any later updates have replaced cells with newer versions, based on the current scan's +MVCC snapshot, applying those changes to our in-memory batch as necessary. Because deltas +are stored based on numerical row offsets rather than primary keys, this delta application +process is extremely efficient: it does not require any per-row branching or expensive +string comparisons. + +After performing this process for each column in the projection, it returns the batch results, +which will likely be copied into an RPC response and sent back to the client. The tablet +server maintains stateful iterators on the server side for each scanner so that successive +requests do not need to re-seek, but rather can continue from the previous point in each column file. + +\subsection{Lazy Materialization} + +If predicates have been specified for the scanner, we perform lazy +materialization\cite{abadi} of column data. In particular, we prefer +to read columns which have associated range predicates before reading any other +columns. After reading each such column, we evaluate the associated predicate. +In the case that the predicate filters all rows in this batch, we short circuit +the reading of other columns. This provides a significant speed boost when applying +selective predicates, as the majority of data from the other selected columns +will never be read from disk. + +\subsection{Delta Compaction} + +Because deltas are not stored in a columnar format, the scan speed of a tablet will +degrade as ever more deltas are applied to the base data. Thus, Kudu's background +maintenance manager periodically scans DiskRowSets to find any cases where a large number +of deltas (as identified by the ratio between base data row count and delta count) have accumulated, +and schedules a delta compaction operation which merges those +deltas back into the base data columns. 
+ +In particular, the delta compaction operation identifies the common case where the +majority of deltas only apply to a subset of columns: for example, it is common for a SQL +batch operation to update just one column out of a wide table. In this case, the delta +compaction will only rewrite that single column, avoiding IO on the other unmodified columns. + +\subsection{RowSet Compaction} +\label{sec:rowset_compaction} + +In addition to compacting deltas into base data, Kudu also periodically compacts different +DiskRowSets together in a process called RowSet compaction. This process performs a +key-based merge of two or more DiskRowSets, resulting in a sorted stream of output rows. +The output is written back to new DiskRowSets, again rolling every 32 MB, to ensure that no +DiskRowSet in the system is too large. + +RowSet compaction has two goals: +\begin{enumerate} +\item We take this opportunity to remove deleted rows. +\item This process reduces the number of DiskRowSets that overlap in key range. +By reducing the amount by which RowSets overlap, we reduce the number of RowSets +which are expected to contain a randomly selected key in the tablet. This value acts +as an upper bound for the number of Bloom filter lookups, and thus disk seeks, expected +to service a write operation within the tablet. +\end{enumerate} + +\subsection{Scheduling maintenance} +\label{sec:maintenance} + +As described in the sections above, Kudu has several different background maintenance operations +that it performs to reduce memory usage and improve performance of the on-disk layout. These +operations are performed by a pool of maintenance threads that run within the tablet server +process. Toward the design goal of consistent performance, these +threads run all the time, rather than being triggered by specific events or conditions. 
+Upon the completion of one maintenance operation, a scheduler process evaluates the +state of the on-disk storage and picks the next operation to perform based on a set +of heuristics meant to balance memory usage, write-ahead log retention, and the +performance of future read and write operations. + +In order to select DiskRowSets to compact, the maintenance scheduler solves an optimization +problem: given an IO budget (typically 128 MB), select a set of DiskRowSets +such that compacting them would reduce the expected number of seeks, as described above. +This optimization turns out to be a series of instances of the well-known integer knapsack +problem, and is able to be solved efficiently in a few milliseconds. + +Because the maintenance threads are always running small units of work, the operations +can react quickly to changes in workload behavior. For example, when insertion workload +increases, the scheduler quickly reacts and flushes in-memory stores to disk. When +the insertion workload reduces, the server performs compactions in the background +to increase performance for future writes. This provides smooth transitions in performance, +making it easier for developers and operators to perform capacity planning and +estimate the latency profile of their workloads. + +\section{Hadoop Integration} +\label{sec:integration} + +\subsection{MapReduce and Spark} + +Kudu was built in the context of the Hadoop ecosystem, and we have prioritized several +key integrations with other Hadoop components. In particular, we provide bindings for +MapReduce jobs to either input or output data to Kudu tables. These bindings can be +easily used in Spark\cite{spark} as well. A small glue layer binds Kudu tables to higher-level +Spark concepts such as DataFrames and Spark SQL tables. 
+ +These bindings offer native support for several key features: +\begin{itemize} +\item {\bf Locality} - internally, the input format queries the Kudu master process to determine +the current locations for each tablet, allowing for data-local processing. +\item {\bf Columnar Projection} - the input format provides a simple API allowing the user to +select which columns are required for their job, thus minimizing the amount of IO required. +\item {\bf Predicate pushdown} - the input format offers a simple API to specify predicates +which will be evaluated server-side before rows are passed to the job. This predicate push-down +increases performance and can be easily accessed through higher-level interfaces such as +SparkSQL. +\end{itemize} + +\subsection{Impala} + +Kudu is also deeply integrated with Cloudera Impala\cite{impala}. In fact, Kudu provides +no shell or SQL parser of its own: the only support for SQL operations is via its integration +with Impala. The Impala integration includes several key features: +\begin{itemize} +\item {\bf Locality} - the Impala planner uses the Kudu Java API to inspect tablet location +information and distributes backend query processing tasks to the same nodes which store the +data. In typical queries, no data is transferred over the network from Kudu to Impala. We +are currently investigating further optimizations based on shared memory transport to +make the data transfer even more efficient. + +\item {\bf Predicate pushdown support} - the Impala planner has been modified to identify +predicates which are able to be pushed down to Kudu. In many cases, pushing a predicate +allows significant reduction in IO, because Kudu lazily materializes columns only after predicates +have been passed. + +\item {\bf DDL extensions } - Impala's DDL statements such as {\tt CREATE TABLE} have been +extended to support specifying Kudu partitioning schemas, replication factors, and primary +key definitions. 
+ +\item {\bf DML extensions } - Because Kudu is the first mutable store in the Hadoop ecosystem that +is suitable for fast analytics, Impala previously did not support mutation statements such as +{\tt UPDATE} and {\tt DELETE}. These statements have been implemented for Kudu tables. +\end{itemize} + +Impala's modular architecture allows a single query to transparently join data from +multiple different storage components. For example, a text log file on HDFS can be joined against +a large dimension table stored in Kudu. + +\section{Performance evaluation} +\label{sec:benchmarks} + +\subsection{Comparison with Parquet} + +\begin{table} + \begin{tabular}{rllllllllllllll} +& Q1 & Q2 & Q3 & Q4 & Q5 & Q6 & Q7 & Q8\\\hline +HDFS & 4.1 & 10.4 & 7.6 & 9.2 & 17.5 & 3.5 & 12.7 & 31.5\\ +Kudu & 4.3 & 9.1 & 6.1 & 7.5 & 16.0 & 1.4 & 13.8 & 10.5 \\\\ + +& Q9 & Q10 & Q11 & Q12 & Q13 & Q14 & Q15 & Q16\\\hline +HDFS & 49.7 & 6.9 & 3.3 & 8.5 & 6.1 & 3.3 & 4.2 & 2.8\\ +Kudu & 47.7 & 3.8 & 3.4 & 3.0 & 5.5 & 1.4 & 3.9 & 2.4\\\\ + +& Q17 & Q18 & Q19 & Q20 & Q21 & Q22 & \multicolumn{3}{l}{\bf Geomean}\\\hline +HDFS & 23.4 & 14.8 & 19.4 & 6.1 & 22.4 & 3.6 & \multicolumn{3}{l}{\bf 8.8} \\ +Kudu & 17.7 & 19.0 & 17.8 & 7.0 & 12.0 & 3.6 & \multicolumn{3}{l}{\bf 6.7} + \end{tabular} + \caption{TPC-H query times: Impala on Kudu vs Impala on Parquet/HDFS (seconds, lower is better)} + \label{fig:parquet_vs_kudu} +\end{table} + +To evaluate the performance of Kudu for analytic workloads, we loaded the industry-standard TPC-H +data set at scale factor 100 on a cluster of 75 nodes, each with 64GB of memory, 12 spinning +disks, and dual 6-core Xeon E5-2630L processors running at 2GHz. Because the total memory on the +cluster is much larger than the data to be queried, all queries operate fully against cached +data; however, all data is fully persisted in the columnar DiskRowSet storage of Kudu rather than being +left in memory stores. 
+ +We used Impala 2.2 to run the full set of 22 TPC-H queries against the same data set stored in Parquet +as well as on Kudu. For the Kudu tables, we hash-partitioned each table by its primary key into +256 buckets, with the exception of the very small {\tt nation} and {\tt region} dimension tables, +which were stored in a single tablet each. All data was loaded using {\tt CREATE +TABLE AS SELECT} statements from within Impala. + +While we have not yet performed an in-depth benchmark including concurrent workloads, we compared +the wall time of each TPC-H query between the two systems. The results are summarized in Table \ref{fig:parquet_vs_kudu}. +Across the set of queries, Kudu performed on average 31\% faster than Parquet. We believe that Kudu's performance +advantage is due to two factors: +\begin{enumerate} +\item {\bf Lazy materialization} - Several of the queries in TPC-H include a restrictive predicate + on larger tables such as {\tt lineitem}. Kudu supports lazy materialization, avoiding IO and + CPU costs on other columns in the cases where the predicate does not match. The current + implementation of Parquet in Impala does not support this feature. +\item {\bf CPU efficiency} - The Parquet reader in Impala has not been fully optimized, + and currently invokes many per-row function calls. These branches limit its CPU efficiency. +\end{enumerate} + +We expect that our advantage over Parquet will eventually be eroded as the Parquet implementation +continues to be optimized. Additionally, we expect that Parquet will perform better on disk-resident +workloads as it issues large 8MB IO accesses, as opposed to the smaller page-level accesses performed by Kudu. + +While the performance of Kudu compared with columnar formats warrants further investigation, it is clear that +Kudu is able to achieve similar scan speeds to immutable storage while providing mutable characteristics. 
+ +\subsection{Comparison with Phoenix} +\label{sec:phoenix} + +Another implementation of SQL in the Hadoop ecosystem is Apache Phoenix\cite{phoenix}. Phoenix +provides a SQL query layer on top of HBase. Although Phoenix is not primarily +targeted at analytic workloads, we performed a small number of comparisons to illustrate the +order-of-magnitude difference in performance between Kudu and HBase for scan-heavy analytic +workloads. + +To eliminate scalability effects and compare raw scan performance, we ran these comparisons on +a smaller cluster, consisting of 9 worker nodes plus one master node, each with +48GB of RAM, 3 data disks, and dual 4-core Xeon L5630 processors at 2.13GHz. +We used Phoenix 4.3 and HBase 1.0. + +In this benchmark, we loaded the same TPC-H {\tt lineitem} table (62GB in CSV format) into Phoenix +using the provided {\tt CsvBulkLoadTool} MapReduce job. We configured the Phoenix table to use 100 +hash partitions, and created an equal number of tablets within Kudu. In both Kudu and Phoenix, +we used the {\tt DOUBLE} type for non-integer numeric columns, since Kudu does not currently +support the {\tt DECIMAL} type. We configured HBase with default block cache settings, +resulting in 9.6GB of on-heap cache per server. Kudu was configured with only 1GB of in-process +block cache, instead relying on the OS-based buffer cache to avoid physical disk IO. +We used the default HBase table attributes provided by Phoenix: {\tt FAST\_DIFF} encoding, +no compression, and one historical version per cell. On Impala, we used a per-query option +to disable runtime code generation in queries where it was not beneficial, eliminating a source +of constant overhead unrelated to the storage engine. + +After loading the data, we performed explicit major compactions to ensure 100\% HDFS block locality, +and ensured that the table's regions (analogous to Kudu tablets) were equally spread across +the 9 worker nodes. 
The 62GB data set expanded to approximately 570GB post-replication +in HBase, whereas the data in Kudu was 227GB post-replication\footnote{In fact, our current +implementation of {\tt CREATE TABLE AS SELECT} does not enable dictionary compression. With +this compression enabled, the Kudu table size is cut in half again.}. +HBase region servers and Kudu tablet servers were allocated 24GB of RAM, and we ran each +service alone in the cluster for its benchmark. We verified during both workloads that +no hard disk reads were generated, to focus on CPU efficiency, though we project that +on a disk-resident workload, Kudu will increase its performance edge due to its columnar +layout and better storage efficiency. + +\begin{table} +\begin{tabular}{r|l|l} + Q1 & scan 6 columns & [TPC-H Q1]\\\hline + Q2 & scan no columns & {\tt SELECT COUNT(*) FROM lineitem;}\\\hline + Q3 & non-key predicate & \parbox{2in}{\tt \vspace{0.2em}SELECT COUNT(*) FROM lineitem WHERE l\_quantity = 48\vspace{0.2em}}\\\hline + Q4 & key lookup & \parbox{2in}{\tt \vspace{0.2em} SELECT COUNT(*) FROM lineitem WHERE l\_orderkey = 2000} +\end{tabular} +\caption{Queries used to compare Impala-Kudu vs Phoenix-HBase} +\label{tab:phoenix_queries} +\end{table} + +\begin{table} +\begin{tabular}{r|lllll} + & Load & Q1 & Q2 & Q3 & Q4\\\hline +Phoenix-HBase & 2152s* & 219 & 76 & 131 & 0.04s\\ +Impala-Kudu & 1918s & 13.2 & 1.7 & 0.7 & 0.15s\\ +Impala-Parquet& 155s & 9.3 & 1.4 & 1.5s & 1.37s +\end{tabular} +% https://docs.google.com/spreadsheets/d/1woDcR5Sot4cRKKS0Kdnz_xwxCrajnGiSoR0ZJqgHYhw/edit#gid=0 +\caption{Phoenix-HBase vs Impala-Kudu. Load time for Phoenix does not include the time required for a major compaction to +ensure data locality, which required an additional 20 minutes to complete.} +\label{tab:phoenix_results} +\end{table} + +In order to focus on scan speed rather than join performance, we focused only on TPC-H Q1, which +reads only the {\tt lineitem} table. 
We also ran several other simple queries, listed in +Table \ref{tab:phoenix_queries}, in order to quantify the performance difference between +the Impala-Kudu system and the Phoenix-HBase system on the same hardware. We ran each +query 10 times and reported the median runtime. Across the analytic queries, Impala-Kudu +outperformed Phoenix-HBase by between 16x and 187x. For short scans of primary key +ranges, both Impala-Kudu and Phoenix-HBase returned sub-second results, with Phoenix winning +out due to lower constant factors during query planning. The results are summarized in +Table \ref{tab:phoenix_results}. + +\subsection{Random access performance} + +Although Kudu is not designed to be an OLTP store, one of its key goals is to be suitable for +lighter random-access workloads. To evaluate Kudu's random-access performance, we used +the Yahoo Cloud Serving Benchmark (YCSB)\cite{ycsb} on the same 10-node cluster used in +Section \ref{sec:phoenix}. We built YCSB from its master +branch\footnote{git hash {\tt 1f8cc5abdcad206c37039d9fbaea7cbf76089b48}} and added a binding to +run against Kudu. For these benchmarks, we configured both Kudu and HBase to use up to +24 GB of RAM. HBase automatically allocated 9.6 GB for the block cache and the remainder of the +heap for its in-memory stores. For Kudu, we allocated only 1GB for the block cache, preferring +to rely on Linux buffer caching. We performed no other tuning. For both Kudu and HBase, we pre-split +the table into 100 tablets or regions, and ensured that they were spread evenly across +the nodes. + +We configured YCSB to load a data set with 100 million rows, each row holding 10 data columns with 100 +bytes each. Because Kudu does not have the concept of a special row key column, we added an explicit +key column in the Kudu schema. 
For this benchmark, the data set fits entirely in RAM; in the +future we hope to do further benchmarks on flash-resident or disk-resident workloads, but +we assume that, given the increasing capacity of inexpensive RAM, most latency-sensitive +online workloads will primarily fit in memory. + +\begin{table} + \begin{tabular}{r|l} + Workload & Description\\\hline + Load & Load the table\\ + A & 50\% random-read, 50\% update\\ + B & 95\% random-read, 5\% update\\ + C & 100\% random read\\ + D & 95\% random read, 5\% insert + \end{tabular} + \caption{YCSB Workloads} + \label{tab:ycsb_workloads} +\end{table} + +The five YCSB workloads are summarized in Table +\ref{tab:ycsb_workloads}. We ran the workloads in sequence by first loading the table +with data, then running workloads $A$ through $D$ in that order, with no pause in between. +Each workload ran for 10 million operations. For loading data, we used 16 client threads and enabled +client-side buffering to send larger batches of data to the backend storage engines. For all other +workloads, we used 64 client threads and disabled client-side buffering. + +We ran this full sequence two times for each storage engine, deleting and reloading the +table in between. In the second run, we substituted a uniform access distribution for +workloads $A$ through $C$ instead of the default Zipfian (power-law) distribution. +Workload $D$ uses a special access distribution which inserts rows randomly, and random-reads +those which have been recently inserted. + +We did not run workload $E$, which performs short range scans, because the Kudu client +currently lacks the ability to specify a limit on the number of rows returned. We did not +run workload $F$, because it relies on an atomic compare-and-swap primitive which Kudu does +not yet support. When these features are added to Kudu, we plan to run these workloads +as well. 
+ +\begin{figure} + \includegraphics[width=3in]{ycsb-results.pdf} + \caption{Operation throughput of YCSB random-access workloads, comparing Kudu vs.\ HBase} + \label{fig:ycsb_throughput} +\end{figure} + +Figure \ref{fig:ycsb_throughput} presents the throughput reported by YCSB for each of the +workloads. In nearly all workloads, HBase out-performs Kudu in terms of throughput. In +particular, Kudu performs poorly in the Zipfian update workloads, where the CPU time +spent in reads is dominated by applying long chains of mutations stored in delta stores +\footnote{We have identified several potential optimizations in this code path, tracked +in {\tt KUDU-749}.}. HBase, on the other hand, has long targeted this type of online workload +and performs comparably in both access distributions. + +Due to time limitations in preparing this paper for the first Kudu beta release, we do +not have sufficient data to report on longer-running workloads, or to include a summary +of latency percentiles. We anticipate updating this paper as results become available. + +% \section{Conclusion} +% TODO + +\section{Acknowledgements} +Kudu has benefited from many contributors outside of the authors of this paper. In particular, thanks to Chris Leroy, Binglin Chang, Guangxiang Du, Martin Grund, Eli Collins, Vladimir Feinberg, Alex Feinberg, Sarah Jelinek, Misty Stanley-Jones, Brock Noland, Michael Crutcher, Justin Erickson, and Nong Li. + +\bibliographystyle{abbrv} +\bibliography{kudu} + +\end{document} diff --git a/docs/whitepaper/vldb.cls b/docs/whitepaper/vldb.cls new file mode 100644 index 000000000000..ee70e0837405 --- /dev/null +++ b/docs/whitepaper/vldb.cls @@ -0,0 +1,1401 @@ +% VLDB.CLS - Version 1.8c +% +% based on: +% SIG-ALTERNATE.CLS - VERSION 1.8 +% "COMPATIBLE" WITH THE "ACM_PROC_ARTICLE-SP.CLS" V2.7SP +% Gerald Murray July 26th. 
2005 +% +% ---- Start of 'updates' ---- +% +% Allowance made to switch default fonts between those systems using +% METAFONT and those using 'Type 1' or 'Truetype' fonts. +% See LINE NUMBER 252 for details. +% Also provided for enumerated/annotated Corollaries 'surrounded' by +% enumerated Theorems (line 841). +% Gerry November 11th. 1999 +% +% Made the Permission Statement / Conference Info / Copyright Info +% 'user definable' in the source .tex file OR automatic if +% not specified. +% +% Georgia fixed bug in sub-sub-section numbering in paragraphs (July 29th. 2002) +% JS/GM fix to vertical spacing before Proofs (July 30th. 2002) +% +% Footnotes inside table cells using \minipage (Oct. 2002) +% +% Enforced 'US letter' page size and updated PVLDB copyright (UR, Oct 2010) +% +% Made the file template-able, so vol/no information can be dynmacally generated. (AhmetSacan, Sep 2012) +% +% Added widow line penalties. (AhmetSacan, Sep 2012) +% ---- End of 'updates' ---- +% +\def\fileversion{v1.8c} % for VLDB's and ACM's tracking purposes +\def\filedate{July 26, 2005} % Gerry Murray's tracking data +\def\docdate {Tuesday 26th. July 2005} % Gerry Murray (with deltas to doc} +\usepackage[pdftex,letterpaper]{geometry} % fixed to US letter size for output (since version 1.8c - UR 2010) +\usepackage{epsfig} +\usepackage{amssymb} +\usepackage{amsmath} +\usepackage{amsfonts} +% +% VLDB DOCUMENT STYLE +% based on ACM's sig-alternate.cls, modified 31 Oct 2010 for PVLDB, +% with VLDB-specific copyright notice and fixed US letter paper size. +% +% SIG-ALTERNATE DOCUMENT STYLE +% G.K.M. 
Tobin August-October 1999 +% adapted from ARTICLE document style by Ken Traub, Olin Shivers +% also using elements of esub2acm.cls +% HEAVILY MODIFIED, SUBSEQUENTLY, BY GERRY MURRAY 2000 +% ARTICLE DOCUMENT STYLE -- Released 16 March 1988 +% for LaTeX version 2.09 +% Copyright (C) 1988 by Leslie Lamport +% +% +%%% sig-alternate.cls is an 'ALTERNATE' document style for producing +%%% two-column camera-ready pages for ACM conferences. +%%% THIS FILE DOES NOT STRICTLY ADHERE TO THE SIGS (BOARD-ENDORSED) +%%% PROCEEDINGS STYLE. It has been designed to produce a 'tighter' +%%% paper in response to concerns over page budgets. +%%% The main features of this style are: +%%% +%%% 1) Two columns. +%%% 2) Side and top margins of 4.5pc, bottom margin of 6pc, column gutter of +%%% 2pc, hence columns are 20pc wide and 55.5pc tall. (6pc =3D 1in, approx) +%%% 3) First page has title information, and an extra 6pc of space at the +%%% bottom of the first column for the ACM copyright notice. +%%% 4) Text is 9pt on 10pt baselines; titles (except main) are 9pt bold. +%%% 5) US letter paper size (since v1.8c) +%%% +%%% +%%% There are a few restrictions you must observe: +%%% +%%% 1) You cannot change the font size; ACM wants you to use 9pt. +%%% 3) You must start your paper with the \maketitle command. Prior to the +%%% \maketitle you must have \title and \author commands. If you have a +%%% \date command it will be ignored; no date appears on the paper, since +%%% the proceedings will have a date on the front cover. +%%% 4) Marginal paragraphs, tables of contents, lists of figures and tables, +%%% and page headings are all forbidden. +%%% 5) The `figure' environment will produce a figure one column wide; if you +%%% want one that is two columns wide, use `figure*'. +%%% +% +%%% Copyright Space: +%%% This style automatically reserves 1" blank space at the bottom of page 1/ +%%% column 1. This space can optionally be filled with some text using the +%%% \toappear{...} command. 
If used, this command must be BEFORE the \maketitle +%%% command. If this command is defined AND [preprint] is on, then the +%%% space is filled with the {...} text (at the bottom); otherwise, it is +%%% blank. If you use \toappearbox{...} instead of \toappear{...} then a +%%% box will be drawn around the text (if [preprint] is on). +%%% +%%% A typical usage looks like this: +%%% \toappear{To appear in the Ninth AES Conference on Medievil Lithuanian +%%% Embalming Technique, June 1991, Alfaretta, Georgia.} +%%% This will be included in the preprint, and left out of the conference +%%% version. +%%% +%%% WARNING: +%%% Some dvi-ps converters heuristically allow chars to drift from their +%%% true positions a few pixels. This may be noticeable with the 9pt sans-serif +%%% bold font used for section headers. +%%% You may turn this hackery off via the -e option: +%%% dvips -e 0 foo.dvi >foo.ps +%%% +\typeout{Document Class 'vldb' - based on 'sig-alternate' <26th. July '05>. Modified by G.K.M. Tobin/Gerry Murray} +\typeout{Based in part upon document Style `acmconf' <22 May 89>. Hacked 4/91 by} +\typeout{shivers@cs.cmu.edu, 4/93 by theobald@cs.mcgill.ca} +\typeout{Excerpts were taken from (Journal Style) 'esub2acm.cls'.} +\typeout{****** Bugs/comments/suggestions/technicalities to Gerry Murray -- murray@hq.acm.org ******} +\typeout{Questions on the style, SIGS policies, etc. to Adrienne Griscti griscti@acm.org} +\oddsidemargin 4.5pc +\evensidemargin 4.5pc +\advance\oddsidemargin by -1in % Correct for LaTeX gratuitousness +\advance\evensidemargin by -1in % Correct for LaTeX gratuitousness +\marginparwidth 0pt % Margin pars are not allowed. +\marginparsep 11pt % Horizontal space between outer margin and + % marginal note + + % Top of page: +\topmargin 4.5pc % Nominal distance from top of page to top of + % box containing running head. +\advance\topmargin by -1in % Correct for LaTeX gratuitousness +\headheight 0pt % Height of box containing running head. 
+\headsep 0pt % Space between running head and text. + % Bottom of page: +\footskip 30pt % Distance from baseline of box containing foot + % to baseline of last line of text. +\@ifundefined{footheight}{\newdimen\footheight}{}% this is for LaTeX2e +\footheight 12pt % Height of box containing running foot. + +%% Must redefine the top margin so there's room for headers and +%% page numbers if you are using the preprint option. Footers +%% are OK as is. Olin. +\advance\topmargin by -37pt % Leave 37pt above text for headers +\headheight 12pt % Height of box containing running head. +\headsep 25pt % Space between running head and text. + +\textheight 666pt % 9 1/4 column height +\textwidth 42pc % Width of text line. + % For two-column mode: +\columnsep 2pc % Space between columns +\columnseprule 0pt % Width of rule between columns. +\hfuzz 1pt % Allow some variation in column width, otherwise it's + % too hard to typeset in narrow columns. + +\footnotesep 5.6pt % Height of strut placed at the beginning of every + % footnote =3D height of normal \footnotesize strut, + % so no extra space between footnotes. + +\skip\footins 8.1pt plus 4pt minus 2pt % Space between last line of text and + % top of first footnote. +\floatsep 11pt plus 2pt minus 2pt % Space between adjacent floats moved + % to top or bottom of text page. +\textfloatsep 18pt plus 2pt minus 4pt % Space between main text and floats + % at top or bottom of page. +\intextsep 11pt plus 2pt minus 2pt % Space between in-text figures and + % text. +\@ifundefined{@maxsep}{\newdimen\@maxsep}{}% this is for LaTeX2e +\@maxsep 18pt % The maximum of \floatsep, + % \textfloatsep and \intextsep (minus + % the stretch and shrink). +\dblfloatsep 11pt plus 2pt minus 2pt % Same as \floatsep for double-column + % figures in two-column mode. +\dbltextfloatsep 18pt plus 2pt minus 4pt% \textfloatsep for double-column + % floats. 
+\@ifundefined{@dblmaxsep}{\newdimen\@dblmaxsep}{}% this is for LaTeX2e +\@dblmaxsep 18pt % The maximum of \dblfloatsep and + % \dbltexfloatsep. +\@fptop 0pt plus 1fil % Stretch at top of float page/column. (Must be + % 0pt plus ...) +\@fpsep 8pt plus 2fil % Space between floats on float page/column. +\@fpbot 0pt plus 1fil % Stretch at bottom of float page/column. (Must be + % 0pt plus ... ) +\@dblfptop 0pt plus 1fil % Stretch at top of float page. (Must be 0pt plus ...) +\@dblfpsep 8pt plus 2fil % Space between floats on float page. +\@dblfpbot 0pt plus 1fil % Stretch at bottom of float page. (Must be + % 0pt plus ... ) +\marginparpush 5pt % Minimum vertical separation between two marginal + % notes. + +\parskip 0pt plus 1pt % Extra vertical space between paragraphs. +\parindent 9pt % GM July 2000 / was 0pt - width of paragraph indentation. +\partopsep 2pt plus 1pt minus 1pt% Extra vertical space, in addition to + % \parskip and \topsep, added when user + % leaves blank line before environment. + +\@lowpenalty 51 % Produced by \nopagebreak[1] or \nolinebreak[1] +\@medpenalty 151 % Produced by \nopagebreak[2] or \nolinebreak[2] +\@highpenalty 301 % Produced by \nopagebreak[3] or \nolinebreak[3] + +\@beginparpenalty -\@lowpenalty % Before a list or paragraph environment. +\@endparpenalty -\@lowpenalty % After a list or paragraph environment. +\@itempenalty -\@lowpenalty % Between list items. + +% Try to prevent widow lines. +\clubpenalty=9996 +\widowpenalty=9999 +\brokenpenalty=4991 +\predisplaypenalty=10000 +\postdisplaypenalty=1549 +\displaywidowpenalty=1602 + +\@namedef{ds@10pt}{\@latexerr{The `10pt' option is not allowed in the `acmconf' + document style.}\@eha} +\@namedef{ds@11pt}{\@latexerr{The `11pt' option is not allowed in the `acmconf' + document style.}\@eha} +\@namedef{ds@12pt}{\@latexerr{The `12pt' option is not allowed in the `acmconf' + document style.}\@eha} + +\@options + +\lineskip 2pt % \lineskip is 1pt for all font sizes. 
+\normallineskip 2pt +\def\baselinestretch{1} + +\abovedisplayskip 9pt plus2pt minus4.5pt% +\belowdisplayskip \abovedisplayskip +\abovedisplayshortskip \z@ plus3pt% +\belowdisplayshortskip 5.4pt plus3pt minus3pt% +\let\@listi\@listI % Setting of \@listi added 9 Jun 87 + +\def\small{\@setsize\small{9pt}\viiipt\@viiipt +\abovedisplayskip 7.6pt plus 3pt minus 4pt% +\belowdisplayskip \abovedisplayskip +\abovedisplayshortskip \z@ plus2pt% +\belowdisplayshortskip 3.6pt plus2pt minus 2pt +\def\@listi{\leftmargin\leftmargini %% Added 22 Dec 87 +\topsep 4pt plus 2pt minus 2pt\parsep 2pt plus 1pt minus 1pt +\itemsep \parsep}} + +\def\footnotesize{\@setsize\footnotesize{9pt}\ixpt\@ixpt +\abovedisplayskip 6.4pt plus 2pt minus 4pt% +\belowdisplayskip \abovedisplayskip +\abovedisplayshortskip \z@ plus 1pt% +\belowdisplayshortskip 2.7pt plus 1pt minus 2pt +\def\@listi{\leftmargin\leftmargini %% Added 22 Dec 87 +\topsep 3pt plus 1pt minus 1pt\parsep 2pt plus 1pt minus 1pt +\itemsep \parsep}} + +\newcount\aucount +\newcount\originalaucount +\newdimen\auwidth +\auwidth=\textwidth +\newdimen\auskip +\newcount\auskipcount +\newdimen\auskip +\global\auskip=1pc +\newdimen\allauboxes +\allauboxes=\auwidth +\newtoks\addauthors +\newcount\addauflag +\global\addauflag=0 %Haven't shown additional authors yet + +\newtoks\subtitletext +\gdef\subtitle#1{\subtitletext={#1}} + +\gdef\additionalauthors#1{\addauthors={#1}} + +\gdef\numberofauthors#1{\global\aucount=#1 +\ifnum\aucount>3\global\originalaucount=\aucount \global\aucount=3\fi %g} +\global\auskipcount=\aucount\global\advance\auskipcount by 1 +\global\multiply\auskipcount by 2 +\global\multiply\auskip by \auskipcount +\global\advance\auwidth by -\auskip +\global\divide\auwidth by \aucount} + +% \and was modified to count the number of authors. 
GKMT 12 Aug 1999 +\def\alignauthor{% % \begin{tabular} +\end{tabular}% + \begin{tabular}[t]{p{\auwidth}}\centering}% + +% *** NOTE *** NOTE *** NOTE *** NOTE *** +% If you have 'font problems' then you may need +% to change these, e.g. 'arialb' instead of "arialbd". +% Gerry Murray 11/11/1999 +% *** OR ** comment out block A and activate block B or vice versa. +% ********************************************** +% +% -- Start of block A -- (Type 1 or Truetype fonts) +%\newfont{\secfnt}{timesbd at 12pt} % was timenrb originally - now is timesbd +%\newfont{\secit}{timesbi at 12pt} %13 Jan 00 gkmt +%\newfont{\subsecfnt}{timesi at 11pt} % was timenrri originally - now is timesi +%\newfont{\subsecit}{timesbi at 11pt} % 13 Jan 00 gkmt -- was times changed to timesbi gm 2/4/2000 +% % because "normal" is italic, "italic" is Roman +%\newfont{\ttlfnt}{arialbd at 18pt} % was arialb originally - now is arialbd +%\newfont{\ttlit}{arialbi at 18pt} % 13 Jan 00 gkmt +%\newfont{\subttlfnt}{arial at 14pt} % was arialr originally - now is arial +%\newfont{\subttlit}{ariali at 14pt} % 13 Jan 00 gkmt +%\newfont{\subttlbf}{arialbd at 14pt} % 13 Jan 00 gkmt +%\newfont{\aufnt}{arial at 12pt} % was arialr originally - now is arial +%\newfont{\auit}{ariali at 12pt} % 13 Jan 00 gkmt +%\newfont{\affaddr}{arial at 10pt} % was arialr originally - now is arial +%\newfont{\affaddrit}{ariali at 10pt} %13 Jan 00 gkmt +%\newfont{\eaddfnt}{arial at 12pt} % was arialr originally - now is arial +%\newfont{\ixpt}{times at 9pt} % was timenrr originally - now is times +%\newfont{\confname}{timesi at 8pt} % was timenrri - now is timesi +%\newfont{\crnotice}{times at 8pt} % was timenrr originally - now is times +%\newfont{\ninept}{times at 9pt} % was timenrr originally - now is times + +% ********************************************* +% -- End of block A -- +% +% +% -- Start of block B -- METAFONT +% +++++++++++++++++++++++++++++++++++++++++++++ +% Next (default) block for those using Metafont +% Gerry Murray 
11/11/1999 +% *** THIS BLOCK FOR THOSE USING METAFONT ***** +% ********************************************* +\newfont{\secfnt}{ptmb at 12pt} +\newfont{\secit}{ptmbi at 12pt} %13 Jan 00 gkmt +\newfont{\subsecfnt}{ptmri at 11pt} +\newfont{\subsecit}{ptmbi at 11pt} % 13 Jan 00 gkmt -- was ptmr changed to ptmbi gm 2/4/2000 + % because "normal" is italic, "italic" is Roman +\newfont{\ttlfnt}{phvb at 18pt} +\newfont{\ttlit}{phvbo at 18pt} % GM 2/4/2000 +\newfont{\subttlfnt}{phvr at 14pt} +\newfont{\subttlit}{phvro at 14pt} % GM 2/4/2000 +\newfont{\subttlbf}{phvb at 14pt} % 13 Jan 00 gkmt +\newfont{\aufnt}{phvr at 12pt} +\newfont{\auit}{phvro at 12pt} % GM 2/4/2000 +\newfont{\affaddr}{phvr at 10pt} +\newfont{\affaddrit}{phvro at 10pt} % GM 2/4/2000 +\newfont{\eaddfnt}{phvr at 12pt} +\newfont{\ixpt}{ptmr at 9pt} +\newfont{\confname}{ptmri at 8pt} +\newfont{\crnotice}{ptmr at 8pt} +\newfont{\ninept}{ptmr at 9pt} +% +++++++++++++++++++++++++++++++++++++++++++++ +% -- End of block B -- + + +\def\email#1{{{\eaddfnt{\vskip 4pt#1}}}} + +\def\addauthorsection{\ifnum\originalaucount>3 + \section{Additional Authors}\the\addauthors + \fi} + +\newcount\savesection +\newcount\sectioncntr +\global\sectioncntr=1 + +\setcounter{secnumdepth}{3} + +\def\appendix{\par +\section*{APPENDIX} +\setcounter{section}{0} + \setcounter{subsection}{0} + \def\thesection{\Alph{section}} } + +\leftmargini 22.5pt +\leftmarginii 19.8pt % > \labelsep + width of '(m)' +\leftmarginiii 16.8pt % > \labelsep + width of 'vii.' +\leftmarginiv 15.3pt % > \labelsep + width of 'M.' 
+\leftmarginv 9pt +\leftmarginvi 9pt + +\leftmargin\leftmargini +\labelsep 4.5pt +\labelwidth\leftmargini\advance\labelwidth-\labelsep + +\def\@listI{\leftmargin\leftmargini \parsep 3.6pt plus 2pt minus 1pt% +\topsep 7.2pt plus 2pt minus 4pt% +\itemsep 3.6pt plus 2pt minus 1pt} + +\let\@listi\@listI +\@listi + +\def\@listii{\leftmargin\leftmarginii + \labelwidth\leftmarginii\advance\labelwidth-\labelsep + \topsep 3.6pt plus 2pt minus 1pt + \parsep 1.8pt plus 0.9pt minus 0.9pt + \itemsep \parsep} + +\def\@listiii{\leftmargin\leftmarginiii + \labelwidth\leftmarginiii\advance\labelwidth-\labelsep + \topsep 1.8pt plus 0.9pt minus 0.9pt + \parsep \z@ \partopsep 1pt plus 0pt minus 1pt + \itemsep \topsep} + +\def\@listiv{\leftmargin\leftmarginiv + \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} + +\def\@listv{\leftmargin\leftmarginv + \labelwidth\leftmarginv\advance\labelwidth-\labelsep} + +\def\@listvi{\leftmargin\leftmarginvi + \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} + +\def\labelenumi{\theenumi.} +\def\theenumi{\arabic{enumi}} + +\def\labelenumii{(\theenumii)} +\def\theenumii{\alph{enumii}} +\def\p@enumii{\theenumi} + +\def\labelenumiii{\theenumiii.} +\def\theenumiii{\roman{enumiii}} +\def\p@enumiii{\theenumi(\theenumii)} + +\def\labelenumiv{\theenumiv.} +\def\theenumiv{\Alph{enumiv}} +\def\p@enumiv{\p@enumiii\theenumiii} + +\def\labelitemi{$\bullet$} +\def\labelitemii{\bf --} +\def\labelitemiii{$\ast$} +\def\labelitemiv{$\cdot$} + +\def\verse{\let\\=\@centercr + \list{}{\itemsep\z@ \itemindent -1.5em\listparindent \itemindent + \rightmargin\leftmargin\advance\leftmargin 1.5em}\item[]} +\let\endverse\endlist + +\def\quotation{\list{}{\listparindent 1.5em + \itemindent\listparindent + \rightmargin\leftmargin \parsep 0pt plus 1pt}\item[]} +\let\endquotation=\endlist + +\def\quote{\list{}{\rightmargin\leftmargin}\item[]} +\let\endquote=\endlist + +\def\descriptionlabel#1{\hspace\labelsep \bf #1} +\def\description{\list{}{\labelwidth\z@ 
\itemindent-\leftmargin + \let\makelabel\descriptionlabel}} + +\let\enddescription\endlist + +\def\theequation{\arabic{equation}} + +\arraycolsep 4.5pt % Half the space between columns in an array environment. +\tabcolsep 5.4pt % Half the space between columns in a tabular environment. +\arrayrulewidth .4pt % Width of rules in array and tabular environment. +\doublerulesep 1.8pt % Space between adjacent rules in array or tabular env. + +\tabbingsep \labelsep % Space used by the \' command. (See LaTeX manual.) + +\skip\@mpfootins =\skip\footins + +\fboxsep =2.7pt % Space left between box and text by \fbox and \framebox. +\fboxrule =.4pt % Width of rules in box made by \fbox and \framebox. + +\def\thepart{\Roman{part}} % Roman numeral part numbers. +\def\thesection {\arabic{section}} +\def\thesubsection {\thesection.\arabic{subsection}} +%\def\thesubsubsection {\thesubsection.\arabic{subsubsection}} % GM 7/30/2002 +%\def\theparagraph {\thesubsubsection.\arabic{paragraph}} % GM 7/30/2002 +\def\thesubparagraph {\theparagraph.\arabic{subparagraph}} + +\def\@pnumwidth{1.55em} +\def\@tocrmarg {2.55em} +\def\@dotsep{4.5} +\setcounter{tocdepth}{3} + +\def\tableofcontents{\@latexerr{\tableofcontents: Tables of contents are not + allowed in the `acmconf' document style.}\@eha} + +\def\l@part#1#2{\addpenalty{\@secpenalty} + \addvspace{2.25em plus 1pt} % space above part line + \begingroup + \@tempdima 3em % width of box holding part number, used by + \parindent \z@ \rightskip \@pnumwidth %% \numberline + \parfillskip -\@pnumwidth + {\large \bf % set line in \large boldface + \leavevmode % TeX command to enter horizontal mode. 
+ #1\hfil \hbox to\@pnumwidth{\hss #2}}\par + \nobreak % Never break after part entry + \endgroup} + +\def\l@section#1#2{\addpenalty{\@secpenalty} % good place for page break + \addvspace{1.0em plus 1pt} % space above toc entry + \@tempdima 1.5em % width of box holding section number + \begingroup + \parindent \z@ \rightskip \@pnumwidth + \parfillskip -\@pnumwidth + \bf % Boldface. + \leavevmode % TeX command to enter horizontal mode. + \advance\leftskip\@tempdima %% added 5 Feb 88 to conform to + \hskip -\leftskip %% 25 Jan 88 change to \numberline + #1\nobreak\hfil \nobreak\hbox to\@pnumwidth{\hss #2}\par + \endgroup} + + +\def\l@subsection{\@dottedtocline{2}{1.5em}{2.3em}} +\def\l@subsubsection{\@dottedtocline{3}{3.8em}{3.2em}} +\def\l@paragraph{\@dottedtocline{4}{7.0em}{4.1em}} +\def\l@subparagraph{\@dottedtocline{5}{10em}{5em}} + +\def\listoffigures{\@latexerr{\listoffigures: Lists of figures are not + allowed in the `acmconf' document style.}\@eha} + +\def\l@figure{\@dottedtocline{1}{1.5em}{2.3em}} + +\def\listoftables{\@latexerr{\listoftables: Lists of tables are not + allowed in the `acmconf' document style.}\@eha} +\let\l@table\l@figure + +\def\footnoterule{\kern-3\p@ + \hrule width .4\columnwidth + \kern 2.6\p@} % The \hrule has default height of .4pt . +% ------ +\long\def\@makefntext#1{\noindent +%\hbox to .5em{\hss$^{\@thefnmark}$}#1} % original +\hbox to .5em{\hss\textsuperscript{\@thefnmark}}#1} % C. Clifton / GM Oct. 2nd. 
2002 +% ------- + +\long\def\@maketntext#1{\noindent +#1} + +\long\def\@maketitlenotetext#1#2{\noindent + \hbox to 1.8em{\hss$^{#1}$}#2} + +\setcounter{topnumber}{2} +\def\topfraction{.7} +\setcounter{bottomnumber}{1} +\def\bottomfraction{.3} +\setcounter{totalnumber}{3} +\def\textfraction{.2} +\def\floatpagefraction{.5} +\setcounter{dbltopnumber}{2} +\def\dbltopfraction{.7} +\def\dblfloatpagefraction{.5} + +% +\long\def\@makecaption#1#2{ + \vskip \baselineskip + \setbox\@tempboxa\hbox{\textbf{#1: #2}} + \ifdim \wd\@tempboxa >\hsize % IF longer than one line: + \textbf{#1: #2}\par % THEN set as ordinary paragraph. + \else % ELSE center. + \hbox to\hsize{\hfil\box\@tempboxa\hfil}\par + \fi} + +% + +\long\def\@makecaption#1#2{ + \vskip 10pt + \setbox\@tempboxa\hbox{\textbf{#1: #2}} + \ifdim \wd\@tempboxa >\hsize % IF longer than one line: + \textbf{#1: #2}\par % THEN set as ordinary paragraph. + \else % ELSE center. + \hbox to\hsize{\hfil\box\@tempboxa\hfil} + \fi} + +\@ifundefined{figure}{\newcounter {figure}} % this is for LaTeX2e + +\def\fps@figure{tbp} +\def\ftype@figure{1} +\def\ext@figure{lof} +\def\fnum@figure{Figure \thefigure} +\def\figure{\@float{figure}} +\let\endfigure\end@float +\@namedef{figure*}{\@dblfloat{figure}} +\@namedef{endfigure*}{\end@dblfloat} + +\@ifundefined{table}{\newcounter {table}} % this is for LaTeX2e + +\def\fps@table{tbp} +\def\ftype@table{2} +\def\ext@table{lot} +\def\fnum@table{Table \thetable} +\def\table{\@float{table}} +\let\endtable\end@float +\@namedef{table*}{\@dblfloat{table}} +\@namedef{endtable*}{\end@dblfloat} + +\newtoks\titleboxnotes +\newcount\titleboxnoteflag + +\def\maketitle{\par + \begingroup + \def\thefootnote{\fnsymbol{footnote}} + \def\@makefnmark{\hbox + to 0pt{$^{\@thefnmark}$\hss}} + \twocolumn[\@maketitle] +\@thanks + \endgroup + \setcounter{footnote}{0} + \let\maketitle\relax + \let\@maketitle\relax + \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\gdef\@subtitle{}\let\thanks\relax + \@copyrightspace} + +%% 
CHANGES ON NEXT LINES +\newif\if@ll % to record which version of LaTeX is in use + +\expandafter\ifx\csname LaTeXe\endcsname\relax % LaTeX2.09 is used +\else% LaTeX2e is used, so set ll to true +\global\@lltrue +\fi + +\if@ll + \NeedsTeXFormat{LaTeX2e} + \ProvidesClass{vldb} [2010/10/19 - V1.8b - based on sig-alternate V1.8 - based on acmproc.cls V1.3 ] + \RequirePackage{latexsym}% QUERY: are these two really needed? + \let\dooptions\ProcessOptions +\else + \let\dooptions\@options +\fi +%% END CHANGES + +\def\@height{height} +\def\@width{width} +\def\@minus{minus} +\def\@plus{plus} +\def\hb@xt@{\hbox to} +\newif\if@faircopy +\@faircopyfalse +\def\ds@faircopy{\@faircopytrue} + +\def\ds@preprint{\@faircopyfalse} + +\@twosidetrue +\@mparswitchtrue +\def\ds@draft{\overfullrule 5\p@} +%% CHANGE ON NEXT LINE +\dooptions + +\lineskip \p@ +\normallineskip \p@ +\def\baselinestretch{1} +\def\@ptsize{0} %needed for amssymbols.sty + +%% CHANGES ON NEXT LINES +\if@ll% allow use of old-style font change commands in LaTeX2e +\@maxdepth\maxdepth +% +\DeclareOldFontCommand{\rm}{\ninept\rmfamily}{\mathrm} +\DeclareOldFontCommand{\sf}{\normalfont\sffamily}{\mathsf} +\DeclareOldFontCommand{\tt}{\normalfont\ttfamily}{\mathtt} +\DeclareOldFontCommand{\bf}{\normalfont\bfseries}{\mathbf} +\DeclareOldFontCommand{\it}{\normalfont\itshape}{\mathit} +\DeclareOldFontCommand{\sl}{\normalfont\slshape}{\@nomath\sl} +\DeclareOldFontCommand{\sc}{\normalfont\scshape}{\@nomath\sc} +\DeclareRobustCommand*{\cal}{\@fontswitch{\relax}{\mathcal}} +\DeclareRobustCommand*{\mit}{\@fontswitch{\relax}{\mathnormal}} +\fi +% +\if@ll + \renewcommand{\rmdefault}{cmr} % was 'ttm' +% Note! I have also found 'mvr' to work ESPECIALLY well. +% Gerry - October 1999 +% You may need to change your LV1times.fd file so that sc is +% mapped to cmcsc - -for smallcaps -- that is if you decide +% to change {cmr} to {times} above. 
(Not recommended) + \renewcommand{\@ptsize}{} + \renewcommand{\normalsize}{% + \@setfontsize\normalsize\@ixpt{10.5\p@}%\ninept% + \abovedisplayskip 6\p@ \@plus2\p@ \@minus\p@ + \belowdisplayskip \abovedisplayskip + \abovedisplayshortskip 6\p@ \@minus 3\p@ + \belowdisplayshortskip 6\p@ \@minus 3\p@ + \let\@listi\@listI + } +\else + \def\@normalsize{%changed next to 9 from 10 + \@setsize\normalsize{9\p@}\ixpt\@ixpt + \abovedisplayskip 6\p@ \@plus2\p@ \@minus\p@ + \belowdisplayskip \abovedisplayskip + \abovedisplayshortskip 6\p@ \@minus 3\p@ + \belowdisplayshortskip 6\p@ \@minus 3\p@ + \let\@listi\@listI + }% +\fi +\if@ll + \newcommand\scriptsize{\@setfontsize\scriptsize\@viipt{8\p@}} + \newcommand\tiny{\@setfontsize\tiny\@vpt{6\p@}} + \newcommand\large{\@setfontsize\large\@xiipt{14\p@}} + \newcommand\Large{\@setfontsize\Large\@xivpt{18\p@}} + \newcommand\LARGE{\@setfontsize\LARGE\@xviipt{20\p@}} + \newcommand\huge{\@setfontsize\huge\@xxpt{25\p@}} + \newcommand\Huge{\@setfontsize\Huge\@xxvpt{30\p@}} +\else + \def\scriptsize{\@setsize\scriptsize{8\p@}\viipt\@viipt} + \def\tiny{\@setsize\tiny{6\p@}\vpt\@vpt} + \def\large{\@setsize\large{14\p@}\xiipt\@xiipt} + \def\Large{\@setsize\Large{18\p@}\xivpt\@xivpt} + \def\LARGE{\@setsize\LARGE{20\p@}\xviipt\@xviipt} + \def\huge{\@setsize\huge{25\p@}\xxpt\@xxpt} + \def\Huge{\@setsize\Huge{30\p@}\xxvpt\@xxvpt} +\fi +\normalsize + +% make aubox hsize/number of authors up to 3, less gutter +% then showbox gutter showbox gutter showbox -- GKMT Aug 99 +\newbox\@acmtitlebox +\def\@maketitle{\newpage + \null + \setbox\@acmtitlebox\vbox{% +\baselineskip 20pt +\vskip 2em % Vertical space above title. + \begin{center} + {\ttlfnt \@title\par} % Title set in 18pt Helvetica (Arial) bold size. + \vskip 1.5em % Vertical space after title. +%This should be the subtitle. 
+{\subttlfnt \the\subtitletext\par}\vskip 1.25em%\fi + {\baselineskip 16pt\aufnt % each author set in \12 pt Arial, in a + \lineskip .5em % tabular environment + \begin{tabular}[t]{c}\@author + \end{tabular}\par} + \vskip 1.5em % Vertical space after author. + \end{center}} + \dimen0=\ht\@acmtitlebox + \advance\dimen0 by -12.75pc\relax % Increased space for title box -- KBT + \unvbox\@acmtitlebox + \ifdim\dimen0<0.0pt\relax\vskip-\dimen0\fi} + + +\newcount\titlenotecount +\global\titlenotecount=0 +\newtoks\tntoks +\newtoks\tntokstwo +\newtoks\tntoksthree +\newtoks\tntoksfour +\newtoks\tntoksfive + +\def\abstract{ +\ifnum\titlenotecount>0 % was =1 + \insert\footins{% + \reset@font\footnotesize + \interlinepenalty\interfootnotelinepenalty + \splittopskip\footnotesep + \splitmaxdepth \dp\strutbox \floatingpenalty \@MM + \hsize\columnwidth \@parboxrestore + \protected@edef\@currentlabel{% + }% + \color@begingroup +\ifnum\titlenotecount=1 + \@maketntext{% + \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\@finalstrut\strutbox}% +\fi +\ifnum\titlenotecount=2 + \@maketntext{% + \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\@finalstrut\strutbox}% +\fi +\ifnum\titlenotecount=3 + \@maketntext{% + \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\ddagger$}\rule\z@\footnotesep\ignorespaces\the\tntoksthree\@finalstrut\strutbox}% +\fi +\ifnum\titlenotecount=4 + \@maketntext{% + \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\par\@finalstrut\strutbox}% +\@maketntext{% + 
\raisebox{4pt}{$\ddagger$}\rule\z@\footnotesep\ignorespaces\the\tntoksthree\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\S$}\rule\z@\footnotesep\ignorespaces\the\tntoksfour\@finalstrut\strutbox}% +\fi +\ifnum\titlenotecount=5 + \@maketntext{% + \raisebox{4pt}{$\ast$}\rule\z@\footnotesep\ignorespaces\the\tntoks\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\dagger$}\rule\z@\footnotesep\ignorespaces\the\tntokstwo\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\ddagger$}\rule\z@\footnotesep\ignorespaces\the\tntoksthree\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\S$}\rule\z@\footnotesep\ignorespaces\the\tntoksfour\par\@finalstrut\strutbox}% +\@maketntext{% + \raisebox{4pt}{$\P$}\rule\z@\footnotesep\ignorespaces\the\tntoksfive\@finalstrut\strutbox}% +\fi + \color@endgroup} %g} +\fi +\setcounter{footnote}{0} +\section*{ABSTRACT}\normalsize%\ninept +} + +\def\endabstract{\if@twocolumn\else\endquotation\fi} + +\def\keywords{\if@twocolumn +\section*{Keywords} +\else \small +\quotation +\fi} + +\def\terms{\if@twocolumn +\section*{General Terms} +\else \small +\quotation +\fi} + +% -- Classification needs to be a bit smart due to optionals - Gerry/Georgia November 2nd. 
1999 +\newcount\catcount +\global\catcount=1 + +\def\category#1#2#3{% +\ifnum\catcount=1 +\section*{Categories and Subject Descriptors} +\advance\catcount by 1\else{\unskip; }\fi + \@ifnextchar [{\@category{#1}{#2}{#3}}{\@category{#1}{#2}{#3}[]}% +} + +\def\@category#1#2#3[#4]{% + \begingroup + \let\and\relax + #1 [\textbf{#2}]% + \if!#4!% + \if!#3!\else : #3\fi + \else + :\space + \if!#3!\else #3\kern\z@---\hskip\z@\fi + \textit{#4}% + \fi + \endgroup +} +% + +%%% This section (written by KBT) handles the 1" box in the lower left +%%% corner of the left column of the first page by creating a picture, +%%% and inserting the predefined string at the bottom (with a negative +%%% displacement to offset the space allocated for a non-existent +%%% caption). +%%% +\newtoks\copyrightnotice +\def\ftype@copyrightbox{8} +\def\@copyrightspace{ +\@float{copyrightbox}[b] +\begin{center} +\setlength{\unitlength}{1pc} +\begin{picture}(20,6) %Space for copyright notice +\put(0,-0.95){\crnotice{\@toappear}} +\end{picture} +\end{center} +\end@float} + +\def\@toappear{} % Default setting blank - commands below change this. 
+\long\def\toappear#1{\def\@toappear{\parbox[b]{20pc}{\baselineskip 9pt#1}}} +\def\toappearbox#1{\def\@toappear{\raisebox{5pt}{\framebox[20pc]{\parbox[b]{19pc}{#1}}}}} + +\newtoks\conf +\newtoks\confinfo +\def\conferenceinfo#1#2{\global\conf={#1}\global\confinfo{#2}} + + +\def\marginpar{\@latexerr{The \marginpar command is not allowed in the + `acmconf' document style.}\@eha} + +\mark{{}{}} % Initializes TeX's marks + +\def\today{\ifcase\month\or + January\or February\or March\or April\or May\or June\or + July\or August\or September\or October\or November\or December\fi + \space\number\day, \number\year} + +\def\@begintheorem#1#2{% + \parskip 0pt % GM July 2000 (for tighter spacing) + \trivlist + \item[% + \hskip 10\p@ + \hskip \labelsep + {{\sc #1}\hskip 5\p@\relax#2.}% + ] + \it +} +\def\@opargbegintheorem#1#2#3{% + \parskip 0pt % GM July 2000 (for tighter spacing) + \trivlist + \item[% + \hskip 10\p@ + \hskip \labelsep + {\sc #1\ #2\ % This mod by Gerry to enumerate corollaries + \setbox\@tempboxa\hbox{(#3)} % and bracket the 'corollary title' + \ifdim \wd\@tempboxa>\z@ % and retain the correct numbering of e.g. theorems + \hskip 5\p@\relax % if they occur 'around' said corollaries. + \box\@tempboxa % Gerry - Nov. 1999. + \fi.}% + ] + \it +} +\newif\if@qeded +\global\@qededfalse + +% -- original +%\def\proof{% +% \vspace{-\parskip} % GM July 2000 (for tighter spacing) +% \global\@qededfalse +% \@ifnextchar[{\@xproof}{\@proof}% +%} +% -- end of original + +% (JSS) Fix for vertical spacing bug - Gerry Murray July 30th. 
2002 +\def\proof{% +\vspace{-\lastskip}\vspace{-\parsep}\penalty-51% +\global\@qededfalse +\@ifnextchar[{\@xproof}{\@proof}% +} + +\def\endproof{% + \if@qeded\else\qed\fi + \endtrivlist +} +\def\@proof{% + \trivlist + \item[% + \hskip 10\p@ + \hskip \labelsep + {\sc Proof.}% + ] + \ignorespaces +} +\def\@xproof[#1]{% + \trivlist + \item[\hskip 10\p@\hskip \labelsep{\sc Proof #1.}]% + \ignorespaces +} +\def\qed{% + \unskip + \kern 10\p@ + \begingroup + \unitlength\p@ + \linethickness{.4\p@}% + \framebox(6,6){}% + \endgroup + \global\@qededtrue +} + +\def\newdef#1#2{% + \expandafter\@ifdefinable\csname #1\endcsname + {\@definecounter{#1}% + \expandafter\xdef\csname the#1\endcsname{\@thmcounter{#1}}% + \global\@namedef{#1}{\@defthm{#1}{#2}}% + \global\@namedef{end#1}{\@endtheorem}% + }% +} +\def\@defthm#1#2{% + \refstepcounter{#1}% + \@ifnextchar[{\@ydefthm{#1}{#2}}{\@xdefthm{#1}{#2}}% +} +\def\@xdefthm#1#2{% + \@begindef{#2}{\csname the#1\endcsname}% + \ignorespaces +} +\def\@ydefthm#1#2[#3]{% + \trivlist + \item[% + \hskip 10\p@ + \hskip \labelsep + {\it #2% + \savebox\@tempboxa{#3}% + \ifdim \wd\@tempboxa>\z@ + \ \box\@tempboxa + \fi.% + }]% + \ignorespaces +} +\def\@begindef#1#2{% + \trivlist + \item[% + \hskip 10\p@ + \hskip \labelsep + {\it #1\ \rm #2.}% + ]% +} +\def\theequation{\arabic{equation}} + +\newcounter{part} +\newcounter{section} +\newcounter{subsection}[section] +\newcounter{subsubsection}[subsection] +\newcounter{paragraph}[subsubsection] +\def\thepart{\Roman{part}} +\def\thesection{\arabic{section}} +\def\thesubsection{\thesection.\arabic{subsection}} +\def\thesubsubsection{\thesubsection.\arabic{subsubsection}} %removed \subsecfnt 29 July 2002 gkmt +\def\theparagraph{\thesubsubsection.\arabic{paragraph}} %removed \subsecfnt 29 July 2002 gkmt +\newif\if@uchead +\@ucheadfalse + +%% CHANGES: NEW NOTE +%% NOTE: OK to use old-style font commands below, since they were +%% suitably redefined for LaTeX2e +%% END CHANGES +\setcounter{secnumdepth}{3} 
+\def\part{% + \@startsection{part}{9}{\z@}{-10\p@ \@plus -4\p@ \@minus -2\p@} + {4\p@}{\normalsize\@ucheadtrue}% +} +\def\section{% + \@startsection{section}{1}{\z@}{-10\p@ \@plus -4\p@ \@minus -2\p@}% GM + {4\p@}{\baselineskip 14pt\secfnt\@ucheadtrue}% +} + +\def\subsection{% + \@startsection{subsection}{2}{\z@}{-8\p@ \@plus -2\p@ \@minus -\p@} + {4\p@}{\secfnt}% +} +\def\subsubsection{% + \@startsection{subsubsection}{3}{\z@}{-8\p@ \@plus -2\p@ \@minus -\p@}% + {4\p@}{\subsecfnt}% +} +\def\paragraph{% + \vskip 12pt\@startsection{paragraph}{3}{\z@}{6\p@ \@plus \p@}% + {-5\p@}{\subsecfnt}% +} +\let\@period=. +\def\@startsection#1#2#3#4#5#6{% + \if@noskipsec %gkmt, 11 aug 99 + \global\let\@period\@empty + \leavevmode + \global\let\@period.% + \fi + \par % + \@tempskipa #4\relax + \@afterindenttrue + \ifdim \@tempskipa <\z@ + \@tempskipa -\@tempskipa + \@afterindentfalse + \fi + \if@nobreak + \everypar{}% + \else + \addpenalty\@secpenalty + \addvspace\@tempskipa + \fi +\parskip=0pt % GM July 2000 (non numbered) section heads + \@ifstar + {\@ssect{#3}{#4}{#5}{#6}} + {\@dblarg{\@sect{#1}{#2}{#3}{#4}{#5}{#6}}}% +} +\def\@sect#1#2#3#4#5#6[#7]#8{% + \ifnum #2>\c@secnumdepth + \let\@svsec\@empty + \else + \refstepcounter{#1}% + \edef\@svsec{% + \begingroup + %\ifnum#2>2 \noexpand\rm \fi % changed to next 29 July 2002 gkmt + \ifnum#2>2 \noexpand#6 \fi + \csname the#1\endcsname + \endgroup + \ifnum #2=1\relax .\fi + \hskip 1em + }% + \fi + \@tempskipa #5\relax + \ifdim \@tempskipa>\z@ + \begingroup + #6\relax + \@hangfrom{\hskip #3\relax\@svsec}% + \begingroup + \interlinepenalty \@M + \if@uchead + \uppercase{#8}% + \else + #8% + \fi + \par + \endgroup + \endgroup + \csname #1mark\endcsname{#7}% + \vskip -12pt %gkmt, 11 aug 99 and GM July 2000 (was -14) - numbered section head spacing +\addcontentsline{toc}{#1}{% + \ifnum #2>\c@secnumdepth \else + \protect\numberline{\csname the#1\endcsname}% + \fi + #7% + }% + \else + \def\@svsechd{% + #6% + \hskip #3\relax + \@svsec + 
\if@uchead + \uppercase{#8}% + \else + #8% + \fi + \csname #1mark\endcsname{#7}% + \addcontentsline{toc}{#1}{% + \ifnum #2>\c@secnumdepth \else + \protect\numberline{\csname the#1\endcsname}% + \fi + #7% + }% + }% + \fi + \@xsect{#5}\hskip 1pt + \par +} +\def\@xsect#1{% + \@tempskipa #1\relax + \ifdim \@tempskipa>\z@ + \par + \nobreak + \vskip \@tempskipa + \@afterheading + \else + \global\@nobreakfalse + \global\@noskipsectrue + \everypar{% + \if@noskipsec + \global\@noskipsecfalse + \clubpenalty\@M + \hskip -\parindent + \begingroup + \@svsechd + \@period + \endgroup + \unskip + \@tempskipa #1\relax + \hskip -\@tempskipa + \else + \clubpenalty \@clubpenalty + \everypar{}% + \fi + }% + \fi + \ignorespaces +} +\def\@trivlist{% + \@topsepadd\topsep + \if@noskipsec + \global\let\@period\@empty + \leavevmode + \global\let\@period.% + \fi + \ifvmode + \advance\@topsepadd\partopsep + \else + \unskip + \par + \fi + \if@inlabel + \@noparitemtrue + \@noparlisttrue + \else + \@noparlistfalse + \@topsep\@topsepadd + \fi + \advance\@topsep \parskip + \leftskip\z@skip + \rightskip\@rightskip + \parfillskip\@flushglue + \@setpar{\if@newlist\else{\@@par}\fi} + \global\@newlisttrue + \@outerparskip\parskip +} + +%%% Actually, 'abbrev' works just fine as the default +%%% Bibliography style. 
+ +\typeout{Using 'Abbrev' bibliography style} +\newcommand\bibyear[2]{% + \unskip\quad\ignorespaces#1\unskip + \if#2..\quad \else \quad#2 \fi +} +\newcommand{\bibemph}[1]{{\em#1}} +\newcommand{\bibemphic}[1]{{\em#1\/}} +\newcommand{\bibsc}[1]{{\sc#1}} +\def\@normalcite{% + \def\@cite##1##2{[##1\if@tempswa , ##2\fi]}% +} +\def\@citeNB{% + \def\@cite##1##2{##1\if@tempswa , ##2\fi}% +} +\def\@citeRB{% + \def\@cite##1##2{##1\if@tempswa , ##2\fi]}% +} +\def\start@cite#1#2{% + \edef\citeauthoryear##1##2##3{% + ###1% + \ifnum#2=\z@ \else\ ###2\fi + }% + \ifnum#1=\thr@@ + \let\@@cite\@citeyear + \else + \let\@@cite\@citenormal + \fi + \@ifstar{\@citeNB\@@cite}{\@normalcite\@@cite}% +} +\def\cite{\start@cite23} +\def\citeNP{\cite*} +\def\citeA{\start@cite10} +\def\citeANP{\citeA*} +\def\shortcite{\start@cite23} +\def\shortciteNP{\shortcite*} +\def\shortciteA{\start@cite20} +\def\shortciteANP{\shortciteA*} +\def\citeyear{\start@cite30} +\def\citeyearNP{\citeyear*} +\def\citeN{% + \@citeRB + \def\citeauthoryear##1##2##3{##1\ [##3% + \def\reserved@a{##1}% + \def\citeauthoryear####1####2####3{% + \def\reserved@b{####1}% + \ifx\reserved@a\reserved@b + ####3% + \else + \errmessage{Package acmart Error: author mismatch + in \string\citeN^^J^^J% + See the acmart package documentation for explanation}% + \fi + }% + }% + \@ifstar\@citeyear\@citeyear +} +\def\shortciteN{% + \@citeRB + \def\citeauthoryear##1##2##3{##2\ [##3% + \def\reserved@a{##2}% + \def\citeauthoryear####1####2####3{% + \def\reserved@b{####2}% + \ifx\reserved@a\reserved@b + ####3% + \else + \errmessage{Package acmart Error: author mismatch + in \string\shortciteN^^J^^J% + See the acmart package documentation for explanation}% + \fi + }% + }% + \@ifstar\@citeyear\@citeyear % GM July 2000 +} +\def\@citenormal{% + \@ifnextchar [{\@tempswatrue\@citex;} + {\@tempswafalse\@citex,[]}% % GM July 2000 +} +\def\@citeyear{% + \@ifnextchar [{\@tempswatrue\@citex,}% + {\@tempswafalse\@citex,[]}% +} +\def\@citex#1[#2]#3{% + 
\let\@citea\@empty + \@cite{% + \@for\@citeb:=#3\do{% + \@citea + \def\@citea{#1 }% + \edef\@citeb{\expandafter\@iden\@citeb}% + \if@filesw + \immediate\write\@auxout{\string\citation{\@citeb}}% + \fi + \@ifundefined{b@\@citeb}{% + {\bf ?}% + \@warning{% + Citation `\@citeb' on page \thepage\space undefined% + }% + }% + {\csname b@\@citeb\endcsname}% + }% + }{#2}% +} +\let\@biblabel\@gobble +\newdimen\bibindent +\setcounter{enumi}{1} +\bibindent=0em +\def\thebibliography#1{% +\ifnum\addauflag=0\addauthorsection\global\addauflag=1\fi + \section[References]{% <=== OPTIONAL ARGUMENT ADDED HERE + {References} % was uppercased but this affects pdf bookmarks (SP/GM October 2004) + \vskip -9pt % GM July 2000 (for tighter spacing) + \@mkboth{{\refname}}{{\refname}}% + }% + \list{[\arabic{enumi}]}{% + \settowidth\labelwidth{[#1]}% + \leftmargin\labelwidth + \advance\leftmargin\labelsep + \advance\leftmargin\bibindent + \parsep=0pt\itemsep=1pt % GM July 2000 + \itemindent -\bibindent + \listparindent \itemindent + \usecounter{enumi} + }% + \let\newblock\@empty + \raggedright % GM July 2000 + \sloppy + \sfcode`\.=1000\relax +} + + +\gdef\balancecolumns +{\vfill\eject +\global\@colht=\textheight +\global\ht\@cclv=\textheight +} + +\newcount\colcntr +\global\colcntr=0 +\newbox\savebox + +\gdef \@makecol {% +\global\advance\colcntr by 1 +\ifnum\colcntr>2 \global\colcntr=1\fi + \ifvoid\footins + \setbox\@outputbox \box\@cclv + \else + \setbox\@outputbox \vbox{% +\boxmaxdepth \@maxdepth + \@tempdima\dp\@cclv + \unvbox \@cclv + \vskip-\@tempdima + \vskip \skip\footins + \color@begingroup + \normalcolor + \footnoterule + \unvbox \footins + \color@endgroup + }% + \fi + \xdef\@freelist{\@freelist\@midlist}% + \global \let \@midlist \@empty + \@combinefloats + \ifvbox\@kludgeins + \@makespecialcolbox + \else + \setbox\@outputbox \vbox to\@colht {% +\@texttop + \dimen@ \dp\@outputbox + \unvbox \@outputbox + \vskip -\dimen@ + \@textbottom + }% + \fi + \global \maxdepth \@maxdepth +} 
+\def\titlenote{\@ifnextchar[\@xtitlenote{\stepcounter\@mpfn +\global\advance\titlenotecount by 1 +\ifnum\titlenotecount=1 + \raisebox{9pt}{$\ast$} +\fi +\ifnum\titlenotecount=2 + \raisebox{9pt}{$\dagger$} +\fi +\ifnum\titlenotecount=3 + \raisebox{9pt}{$\ddagger$} +\fi +\ifnum\titlenotecount=4 +\raisebox{9pt}{$\S$} +\fi +\ifnum\titlenotecount=5 +\raisebox{9pt}{$\P$} +\fi + \@titlenotetext +}} + +\long\def\@titlenotetext#1{\insert\footins{% +\ifnum\titlenotecount=1\global\tntoks={#1}\fi +\ifnum\titlenotecount=2\global\tntokstwo={#1}\fi +\ifnum\titlenotecount=3\global\tntoksthree={#1}\fi +\ifnum\titlenotecount=4\global\tntoksfour={#1}\fi +\ifnum\titlenotecount=5\global\tntoksfive={#1}\fi + \reset@font\footnotesize + \interlinepenalty\interfootnotelinepenalty + \splittopskip\footnotesep + \splitmaxdepth \dp\strutbox \floatingpenalty \@MM + \hsize\columnwidth \@parboxrestore + \protected@edef\@currentlabel{% + }% + \color@begingroup + \color@endgroup}} + +%%%%%%%%%%%%%%%%%%%%%%%%% +\ps@plain +\baselineskip=11pt +\let\thepage\relax % For NO page numbers - GM Nov. 30th. 1999 and July 2000 +\def\setpagenumber#1{\global\setcounter{page}{#1}} +\pagenumbering{arabic} % Arabic page numbers GM July 2000 +\twocolumn % Double column. +\flushbottom % Even bottom -- alas, does not balance columns at end of document +\pagestyle{plain} + +% Need Copyright Year and Copyright Data to be user definable (in .tex file). +% Gerry Nov. 30th. 
1999 +\newtoks\copyrtyr +\newtoks\acmcopyr +\newtoks\boilerplate +\global\acmcopyr={X-XXXXX-XX-X/XX/XX} % Default - 5/11/2001 *** Gerry +\global\copyrtyr={200X} % Default - 3/3/2003 *** Gerry +\def\CopyrightYear#1{\global\copyrtyr{#1}} +\def\crdata#1{\global\acmcopyr{#1}} +\def\permission#1{\global\boilerplate{#1}} +% +%\global\boilerplate={Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee.} +%\newtoks\copyrightetc +%\global\copyrightetc{Copyright \the\copyrtyr\ ACM \the\acmcopyr\ ...\$5.00} + +% Future proceedings chair, please update the following: 1) conference sequence number, 2) date, +% 3) location, 4) volume number, 5) Copyright year. +\global\boilerplate={Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. Articles from this volume were invited to present their results at The 41st International Conference on Very Large Data Bases, August 31st - September 4th 2015, Kohala Coast, Hawai'i.} + +\global\conf{Proceedings of the VLDB Endowment,} +\global\confinfo{Vol. 8, No. X} +\newtoks\copyrightetc +\global\copyrightetc{Copyright 201X VLDB Endowment 2150-8097/YY/MM... 
\$ 10.00} + +%% Uncomment for camera ready +%\toappear{\the\boilerplate\par +%{\confname{\the\conf}} \the\confinfo\par \the\copyrightetc.} + +%% End of vldb.cls -- V1.8c - 05/15/2011 -- +%% Ahmet Sacan -- December 2011 (volume, issue, and dates are dynamically updated) +%% Uwe Roehm -- Oct-Dec 2010 & Jan-Apr 2011 +%% Gerry Murray -- Wednesday July 26th. 2005 diff --git a/docs/whitepaper/ycsb-data/combine-data.sh b/docs/whitepaper/ycsb-data/combine-data.sh new file mode 100755 index 000000000000..087ac7b2ca7c --- /dev/null +++ b/docs/whitepaper/ycsb-data/combine-data.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +prepend_lines() { + tail --lines=+2 $1 | perl -p -e "s,^,$2,g;" +} + +printf "dist\tsys\tworkload\ttime\ttput\n" +for dist in zipfian uniform ; do + for sys in hbase kudu ; do + prepend_lines $dist-$sys/load-100M.log.tsv "${dist}\t${sys}\tload\t" + prepend_lines $dist-$sys/run-workloada.log.tsv "${dist}\t${sys}\ta\t" + prepend_lines $dist-$sys/run-workloadb.log.tsv "${dist}\t${sys}\tb\t" + prepend_lines $dist-$sys/run-workloadc.log.tsv "${dist}\t${sys}\tc\t" + prepend_lines $dist-$sys/run-workloadd.log.tsv "latest\t${sys}\td\t" + done +done diff --git a/docs/whitepaper/ycsb-data/log-to-tsv.pl b/docs/whitepaper/ycsb-data/log-to-tsv.pl new file mode 100755 index 000000000000..feff455f7970 --- /dev/null +++ b/docs/whitepaper/ycsb-data/log-to-tsv.pl @@ -0,0 +1,10 @@ +#!/usr/bin/env perl + +print "time\ttput\n"; +while (<>) { + next unless /sec;/; + next if /CLEANUP/; + if (/(\d+) sec.*?([\d.]+) current ops./) { + print "$1\t$2\n"; + } +} diff --git a/docs/whitepaper/ycsb-data/plots.R b/docs/whitepaper/ycsb-data/plots.R new file mode 100644 index 000000000000..aeda345502f8 --- /dev/null +++ b/docs/whitepaper/ycsb-data/plots.R @@ -0,0 +1,25 @@ +library(ggplot2) +library(scales) + +data <- read.table(file="data.tsv",header=T) +systems <- levels(data$sys) +workloads <- levels(data$workload) + +for (w in workloads) { + cat("iterating for workload ", w, "\n") + s <- 
subset(data, workload==w) + dists <- unique(s$dist) + for (d in dists) { + s2 <- subset(s, dist==d) + cat("Plotting", nrow(s2), "points for workload", w, "dist", d, "\n") + filename = paste(d, "-", w, ".png", sep="") + cat("into filename", filename, "\n") + png(filename) + print(qplot(time, tput, data=s2, colour=sys, + main=paste("Workload '", w, "'\n", d, " distribution", sep=""), + geom="line", xlab="Time (sec)", ylab="Throughput\n(ops/sec)") + + scale_y_continuous(labels=comma) + + theme(legend.position="bottom")) + dev.off() + } +} diff --git a/docs/whitepaper/ycsb-data/uniform-hbase.sh b/docs/whitepaper/ycsb-data/uniform-hbase.sh new file mode 100755 index 000000000000..68444c4fd0a0 --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-hbase.sh @@ -0,0 +1,22 @@ +#/bin/bash + +EXPORTER="com.yahoo.ycsb.measurements.exporter.JSONArrayMeasurementsExporter" +COMMON_FLAGS="-p recordcount=100000000 -p columnfamily=family -cp /etc/hbase/conf -p exporter=$EXPORTER -p table=ycsb_100m" +OUT_DIR=uniform-hbase + +mkdir -p $OUT_DIR +if true ; then + ./bin/ycsb load hbase10 $COMMON_FLAGS -p exportfile=$OUT_DIR/load.json -p clientbuffering=true \ + -P workloads/workloada -p recordcount=100000000 -threads 16 -s 2>&1 | tee $OUT_DIR/load-100M.log +fi +for x in a b c d ; do + dist_param= + if [ "$x" != "d" ]; then + dist_param="-p requestdistribution=uniform" + fi + ./bin/ycsb run hbase10 -P workloads/workload$x -p recordcount=100000000 -p operationcount=10000000 \ + $COMMON_FLAGS -p exportfile=$OUT_DIR/$x.json \ + $dist_param \ + -threads 64 -s 2>&1 | tee $OUT_DIR/run-workload$x.log +done + diff --git a/docs/whitepaper/ycsb-data/uniform-hbase/a.json b/docs/whitepaper/ycsb-data/uniform-hbase/a.json new file mode 100644 index 000000000000..32ff63631cfc --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-hbase/a.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1505342.0 +}, { + "metric" : "OVERALL", + "measurement" : 
"Throughput(ops/sec)", + "value" : 6643.008698355589 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 307190.3515625 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 4.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 1738751.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 1149951.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 1676287.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 5003017.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 11691.531354580646 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 295.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 8691711.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 28367.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 132735.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 5003017 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 4996983.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 7439.936314572213 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 959.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 3923967.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2423.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 106623.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 4996983 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-hbase/b.json b/docs/whitepaper/ycsb-data/uniform-hbase/b.json new file mode 100644 
index 000000000000..547134c6b6d8 --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-hbase/b.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 384296.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 26021.6083435685 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 331885.0859375 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 3.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 1917951.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 1064959.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 1778687.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9499613.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 2259.664387801903 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 286.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 906751.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 1161.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 33951.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 9499613 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 500387.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 4927.060950824062 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 1111.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 781823.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2323.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 113855.0 +}, 
{ + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 500387 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-hbase/c.json b/docs/whitepaper/ycsb-data/uniform-hbase/c.json new file mode 100644 index 000000000000..099e7eb3442d --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-hbase/c.json @@ -0,0 +1,61 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 236765.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 42235.972377674065 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 57533.625 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 2.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 216319.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 122815.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 171135.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 1.0E7 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 1429.0645596 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 276.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 514047.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 1094.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 14503.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 10000000 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-hbase/d.json b/docs/whitepaper/ycsb-data/uniform-hbase/d.json new file mode 100644 index 000000000000..8d8c62ffc584 --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-hbase/d.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" 
: "RunTime(ms)", + "value" : 224182.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 44606.61426876377 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 481497.0078125 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 3.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 3438591.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 1411071.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 2785279.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 499699.0 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 5777.362734366088 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 1297.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 1222655.0 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 2845.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 136831.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 499699 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9500301.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 1101.9676368148757 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 261.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 1221631.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 1197.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 3707.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 9500301 +} ] \ No newline at end of file diff --git 
a/docs/whitepaper/ycsb-data/uniform-hbase/load.json b/docs/whitepaper/ycsb-data/uniform-hbase/load.json new file mode 100644 index 000000000000..2ec1157ef9ce --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-hbase/load.json @@ -0,0 +1,85 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 948063.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 105478.22243880418 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 16.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 1721112.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 276480.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 4624383.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 3839999.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 4624383.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 1.0E8 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 142.72419053 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 8.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 2.9786111E7 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 15.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 34.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 100000000 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 16.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 1598572.0 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 135296.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 4517887.0 +}, { + "metric" : "UPDATE", + "measurement" : 
"95thPercentileLatency(us)", + "value" : 3721215.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 4517887.0 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-kudu.sh b/docs/whitepaper/ycsb-data/uniform-kudu.sh new file mode 100755 index 000000000000..cacd919a93f5 --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-kudu.sh @@ -0,0 +1,23 @@ +#/bin/bash + +EXPORTER="com.yahoo.ycsb.measurements.exporter.JSONArrayMeasurementsExporter" +COMMON_FLAGS="-p recordcount=100000000 -p exporter=$EXPORTER -p table_name=ycsb_100m -p masterQuorum=a1216" +OUT_DIR=uniform-kudu + +mkdir -p $OUT_DIR +if true ; then + ./bin/ycsb load kudu $COMMON_FLAGS -p exportfile=$OUT_DIR/load.json -p sync_ops=false \ + -p pre_split_num_tablets=100 \ + -P workloads/workloada -p recordcount=100000000 -threads 16 -s 2>&1 | tee $OUT_DIR/load-100M.log +fi +for x in a b c d ; do + dist_param= + if [ "$x" != "d" ]; then + dist_param="-p requestdistribution=uniform" + fi + ./bin/ycsb run kudu -P workloads/workload$x -p recordcount=100000000 -p operationcount=10000000 -p sync_ops=true \ + $COMMON_FLAGS -p exportfile=$OUT_DIR/$x.json \ + $dist_param \ + -threads 64 -s 2>&1 | tee $OUT_DIR/run-workload$x.log +done + diff --git a/docs/whitepaper/ycsb-data/uniform-kudu/a.json b/docs/whitepaper/ycsb-data/uniform-kudu/a.json new file mode 100644 index 000000000000..9fbb52eed69e --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-kudu/a.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1201638.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 8321.97383904304 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 102.96875 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 26.0 +}, { + "metric" : "CLEANUP", + "measurement" : 
"MaxLatency(us)", + "value" : 2989.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 69.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 268.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 5002940.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 10737.369901897684 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 567.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 4448255.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 5711.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 192767.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 5002940 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 4997060.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 4537.363224976286 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 751.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 6832127.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2985.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 88127.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 4997060 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-kudu/b.json b/docs/whitepaper/ycsb-data/uniform-kudu/b.json new file mode 100644 index 000000000000..b08acd3b5abc --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-kudu/b.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 391477.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 25544.284849429212 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 
+}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 80.53125 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 20.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2689.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 57.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 208.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9499398.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 2513.110372046734 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 686.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 258047.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 4651.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 6731.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 9499398 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 500602.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 1766.2885885393985 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 830.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 149503.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2501.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 13191.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 500602 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-kudu/c.json b/docs/whitepaper/ycsb-data/uniform-kudu/c.json new file mode 100644 index 000000000000..f9280322b9e5 --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-kudu/c.json @@ -0,0 +1,61 @@ +[ { + "metric" : "OVERALL", + "measurement" : 
"RunTime(ms)", + "value" : 382028.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 26176.09180478918 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 80.03125 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 23.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2201.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 66.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 178.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 1.0E7 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 2415.2632466 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 686.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 101759.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 4531.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 6539.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 10000000 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-kudu/d.json b/docs/whitepaper/ycsb-data/uniform-kudu/d.json new file mode 100644 index 000000000000..bd57bfd2510c --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-kudu/d.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 328111.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 30477.490849133374 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 81.53125 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 18.0 +}, { + "metric" : 
"CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2613.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 59.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 185.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 501108.0 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 1842.4402025112352 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 827.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 408063.0 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 2221.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 13295.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 501108 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9498892.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 2081.6849507289903 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 543.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 332287.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 3617.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 5755.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 9498892 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/uniform-kudu/load.json b/docs/whitepaper/ycsb-data/uniform-kudu/load.json new file mode 100644 index 000000000000..8466f93a100b --- /dev/null +++ b/docs/whitepaper/ycsb-data/uniform-kudu/load.json @@ -0,0 +1,61 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1536188.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 65096.19916312327 +}, { + "metric" : "CLEANUP", + "measurement" 
: "Operations", + "value" : 16.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 1861495.5 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 30848.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 5206015.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 3123199.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 5206015.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 1.0E8 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 235.68800275 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 9.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 3.1309823E7 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 19.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 67.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 100000000 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipf-hbase.sh b/docs/whitepaper/ycsb-data/zipf-hbase.sh new file mode 100755 index 000000000000..ea358df69356 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipf-hbase.sh @@ -0,0 +1,22 @@ +#/bin/bash + +EXPORTER="com.yahoo.ycsb.measurements.exporter.JSONArrayMeasurementsExporter" +COMMON_FLAGS="-p recordcount=100000000 -p columnfamily=family -cp /etc/hbase/conf -p exporter=$EXPORTER -p table=ycsb_100m" +OUT_DIR=zipfian-hbase + +mkdir -p $OUT_DIR +if true ; then + ./bin/ycsb load hbase10 $COMMON_FLAGS -p exportfile=$OUT_DIR/load.json -p clientbuffering=true \ + -P workloads/workloada -p recordcount=100000000 -threads 16 -s 2>&1 | tee $OUT_DIR/load-100M.log +fi +for x in a b c d ; do + dist_param= + if [ "$x" != "d" ]; then + dist_param="-p requestdistribution=zipfian" + fi + ./bin/ycsb run hbase10 -P 
workloads/workload$x -p recordcount=100000000 -p operationcount=10000000 \ + $COMMON_FLAGS -p exportfile=$OUT_DIR/$x.json \ + $dist_param \ + -threads 64 -s 2>&1 | tee $OUT_DIR/run-workload$x.log +done + diff --git a/docs/whitepaper/ycsb-data/zipfian-hbase/a.json b/docs/whitepaper/ycsb-data/zipfian-hbase/a.json new file mode 100644 index 000000000000..bc1db4d5c903 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-hbase/a.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1222059.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 8182.910972383494 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 699485.7578125 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 3.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 3940351.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 2410495.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 3332095.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 4998331.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 8727.320922323872 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 294.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 4448255.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 21583.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 109823.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 4998331 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 5001669.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 6768.774347922664 +}, { + "metric" : 
"UPDATE", + "measurement" : "MinLatency(us)", + "value" : 856.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 4190207.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2517.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 103295.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 5001669 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-hbase/b.json b/docs/whitepaper/ycsb-data/zipfian-hbase/b.json new file mode 100644 index 000000000000..d683b8b2c46c --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-hbase/b.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 391659.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 25532.414677053253 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 447502.40625 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 4.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2725887.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 1494015.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 2543615.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9500310.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 2273.3790924717196 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 289.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 1138687.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 1106.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 33279.0 +}, { + "metric" : "READ", + 
"measurement" : "Return=0", + "value" : 9500310 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 499690.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 5413.400896555865 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 1030.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 1395711.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2267.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 121343.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 499690 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-hbase/c.json b/docs/whitepaper/ycsb-data/zipfian-hbase/c.json new file mode 100644 index 000000000000..a1dfbde3af6a --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-hbase/c.json @@ -0,0 +1,61 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 212892.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 46972.17368430942 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 57716.21875 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 2.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 175359.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 119423.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 166783.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 1.0E7 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 1271.0970615 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 261.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 
905215.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 1057.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 13279.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 10000000 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-hbase/d.json b/docs/whitepaper/ycsb-data/zipfian-hbase/d.json new file mode 100644 index 000000000000..15f656f01360 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-hbase/d.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 228833.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 43699.99082300193 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 128.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 364305.6875 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 4.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2537471.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 1204223.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 2279423.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 500637.0 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 5841.371750390003 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 1296.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 969215.0 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 2881.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 139391.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 500637 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9499363.0 +}, { + "metric" 
: "READ", + "measurement" : "AverageLatency(us)", + "value" : 1119.9024302998 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 266.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 1174527.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 1216.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 4103.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 9499363 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-hbase/load.json b/docs/whitepaper/ycsb-data/zipfian-hbase/load.json new file mode 100644 index 000000000000..8722f71323f1 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-hbase/load.json @@ -0,0 +1,85 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 883569.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 113177.35230638468 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 16.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 1493992.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 248832.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 5152767.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 2791423.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 5152767.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 1.0E8 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 130.42025215 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 8.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 4.0566783E7 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 15.0 +}, { + "metric" : "INSERT", + 
"measurement" : "99thPercentileLatency(us)", + "value" : 35.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 100000000 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 16.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 1374304.0 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 131456.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 5029887.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 2664447.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 5029887.0 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-kudu.sh b/docs/whitepaper/ycsb-data/zipfian-kudu.sh new file mode 100755 index 000000000000..527304fefc94 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-kudu.sh @@ -0,0 +1,23 @@ +#/bin/bash + +EXPORTER="com.yahoo.ycsb.measurements.exporter.JSONArrayMeasurementsExporter" +COMMON_FLAGS="-p recordcount=100000000 -p exporter=$EXPORTER -p table_name=ycsb_100m -p masterQuorum=a1216" +OUT_DIR=zipfian-kudu + +mkdir -p $OUT_DIR +if true ; then + ./bin/ycsb load kudu $COMMON_FLAGS -p exportfile=$OUT_DIR/load.json -p sync_ops=false \ + -p pre_split_num_tablets=100 \ + -P workloads/workloada -p recordcount=100000000 -threads 16 -s 2>&1 | tee $OUT_DIR/load-100M.log +fi +for x in a b c d ; do + dist_param= + if [ "$x" != "d" ]; then + dist_param="-p requestdistribution=zipfian" + fi + ./bin/ycsb run kudu -P workloads/workload$x -p recordcount=100000000 -p operationcount=10000000 -p sync_ops=true \ + $COMMON_FLAGS -p exportfile=$OUT_DIR/$x.json \ + $dist_param \ + -threads 64 -s 2>&1 | tee $OUT_DIR/run-workload$x.log +done + diff --git a/docs/whitepaper/ycsb-data/zipfian-kudu/a.json b/docs/whitepaper/ycsb-data/zipfian-kudu/a.json new file mode 100644 index 000000000000..6f1b79c96a43 --- /dev/null +++ 
b/docs/whitepaper/ycsb-data/zipfian-kudu/a.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1374928.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 7273.108119116056 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 105.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 27.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2657.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 66.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 456.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 4999360.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 7044.319521298727 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 476.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 4061183.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 9623.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 108543.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 4999360 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 5000640.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 10279.158677489282 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 743.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 7700479.0 +}, { + "metric" : "UPDATE", + "measurement" : "95thPercentileLatency(us)", + "value" : 3355.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 278527.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 
5000640 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-kudu/b.json b/docs/whitepaper/ycsb-data/zipfian-kudu/b.json new file mode 100644 index 000000000000..5286087fb6d5 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-kudu/b.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1727133.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 5789.942060049805 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 102.234375 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 26.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2447.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 62.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 442.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9500948.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 11121.853540194094 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 674.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 507135.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 32335.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 150271.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 9500948 +}, { + "metric" : "UPDATE", + "measurement" : "Operations", + "value" : 499052.0 +}, { + "metric" : "UPDATE", + "measurement" : "AverageLatency(us)", + "value" : 8145.531078925643 +}, { + "metric" : "UPDATE", + "measurement" : "MinLatency(us)", + "value" : 954.0 +}, { + "metric" : "UPDATE", + "measurement" : "MaxLatency(us)", + "value" : 469759.0 +}, { + "metric" : "UPDATE", + 
"measurement" : "95thPercentileLatency(us)", + "value" : 23439.0 +}, { + "metric" : "UPDATE", + "measurement" : "99thPercentileLatency(us)", + "value" : 108415.0 +}, { + "metric" : "UPDATE", + "measurement" : "Return=0", + "value" : 499052 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-kudu/c.json b/docs/whitepaper/ycsb-data/zipfian-kudu/c.json new file mode 100644 index 000000000000..d3888caa8a16 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-kudu/c.json @@ -0,0 +1,61 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1291418.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 7743.4262183119645 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 101.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 25.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2855.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 68.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 276.0 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 1.0E7 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 8214.7128077 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 639.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 282879.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 27663.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 105087.0 +}, { + "metric" : "READ", + "measurement" : "Return=0", + "value" : 10000000 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-kudu/d.json b/docs/whitepaper/ycsb-data/zipfian-kudu/d.json new file mode 100644 index 
000000000000..f586fd498682 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-kudu/d.json @@ -0,0 +1,89 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 280893.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 35600.74476758054 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 64.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 84.640625 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 24.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 2577.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 70.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 229.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 500538.0 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 1608.465431196033 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 838.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 462591.0 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 1610.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 10823.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 500538 +}, { + "metric" : "READ", + "measurement" : "Operations", + "value" : 9499462.0 +}, { + "metric" : "READ", + "measurement" : "AverageLatency(us)", + "value" : 1777.9812548331684 +}, { + "metric" : "READ", + "measurement" : "MinLatency(us)", + "value" : 514.0 +}, { + "metric" : "READ", + "measurement" : "MaxLatency(us)", + "value" : 463871.0 +}, { + "metric" : "READ", + "measurement" : "95thPercentileLatency(us)", + "value" : 2515.0 +}, { + "metric" : "READ", + "measurement" : "99thPercentileLatency(us)", + "value" : 4063.0 +}, { + "metric" : "READ", + 
"measurement" : "Return=0", + "value" : 9499462 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-data/zipfian-kudu/load.json b/docs/whitepaper/ycsb-data/zipfian-kudu/load.json new file mode 100644 index 000000000000..2482de5b9bb0 --- /dev/null +++ b/docs/whitepaper/ycsb-data/zipfian-kudu/load.json @@ -0,0 +1,61 @@ +[ { + "metric" : "OVERALL", + "measurement" : "RunTime(ms)", + "value" : 1786257.0 +}, { + "metric" : "OVERALL", + "measurement" : "Throughput(ops/sec)", + "value" : 55982.98565100095 +}, { + "metric" : "CLEANUP", + "measurement" : "Operations", + "value" : 16.0 +}, { + "metric" : "CLEANUP", + "measurement" : "AverageLatency(us)", + "value" : 2622248.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MinLatency(us)", + "value" : 463872.0 +}, { + "metric" : "CLEANUP", + "measurement" : "MaxLatency(us)", + "value" : 7688191.0 +}, { + "metric" : "CLEANUP", + "measurement" : "95thPercentileLatency(us)", + "value" : 4214783.0 +}, { + "metric" : "CLEANUP", + "measurement" : "99thPercentileLatency(us)", + "value" : 7688191.0 +}, { + "metric" : "INSERT", + "measurement" : "Operations", + "value" : 1.0E8 +}, { + "metric" : "INSERT", + "measurement" : "AverageLatency(us)", + "value" : 267.57111786 +}, { + "metric" : "INSERT", + "measurement" : "MinLatency(us)", + "value" : 9.0 +}, { + "metric" : "INSERT", + "measurement" : "MaxLatency(us)", + "value" : 4.3384831E7 +}, { + "metric" : "INSERT", + "measurement" : "95thPercentileLatency(us)", + "value" : 18.0 +}, { + "metric" : "INSERT", + "measurement" : "99thPercentileLatency(us)", + "value" : 65.0 +}, { + "metric" : "INSERT", + "measurement" : "Return=0", + "value" : 100000000 +} ] \ No newline at end of file diff --git a/docs/whitepaper/ycsb-results.pdf b/docs/whitepaper/ycsb-results.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1570b19b9f2d343f2e5dc66115f327fb5b057c30 GIT binary patch literal 66741 zcmeHQX>?OZmY&n6=Z~i6oL-ppIrB@WC+Q{&$U@#AkPt9tGfOa&K+_4xw!DCCIkFs< 
z4o+Ihuq7m9Lefc^bdsL1BrJijr?UWI$2R+7;{|NM#w#{nWJ|hlzA7!&YD>0k87O#u zs#mX;Tle0&@7AqbRgX-Mi5;RG8ZMV9IhlI7(kUx@<(ZY$ ztJ5HPJ)Y;BG%5s)yLW=%_&uSwJ+ELTj-2$M(i;sK6gx=x!IuhGjR9WhfhNonda z+C}m?VTcQlj+CpSlnfG`Hp^SIHdBv!v*qEe1hhCcCnG_wWEh#@1T}kWhdNG^nW9gX zN7$2O>vbA+hU_I7tBh5O3i)qhhKz$7qm@0&FlS3>!y(Jhic~4>SY^3c%pAD_q>(FT zY8L5PgF%t8K&CBUlr0qy(@7A95j%WVg4l2-Q^e;a=s6jAK{CZubp~h(G|CjQY3X{6 z4)jvY$jP)eD@K#3P0}#ZJ)f3@R-#QZ#RN@SN~)fz zx<)rvn~|l>WJHWcty9$53*<^Gk`y+S4d>|rwG9&nN&FWW5EvME7|55#tCI7Aav7v* zX?*eUUkBy##8mBr2x*f<=VEm5lFsxD9pwOwpmbFB92KJS)u?&`062g*&6Hw_qt_6- zlcrJ}&8AE4^ehWDJ<;OPbkVux2p^%R=?)QP_bpByMnw{>r<8%|@oAedE1+JKY@O2taf1(el1mX8dWt>dWbgQrNp`d&U#~*8dEI>TM zv9O%u=nM}#;)C6PVmL-dqv`^h4vY+;Y5);UAJZY)%R94?exzj-f%|);LyC&N=MM(X zcJ!YXDg~$1keL>9EF6c7smHrq{^azoFR?NhJRza=ak#Ic4BUzSjNT;CXOsi%%?GXu zfH%f5yXZ+W_cgbSMy091h(s&FtZ|UDlM(Ydje&4m%EONMzQoH|N~0v2N7n$z4SJSl zAp8j_sEzL8O)O1;G;YSbE$CMhA#X0Y<0XhPS7=pY?0C;&pjaXm>l5_&0!n>Lg5Mz4kYS!;5?B}O2BUgjpuVLUfyUpM^O(+)m zX&ZEu`!@X#?1{tZD#~`A0dq{mu@ZBG;-}E_LUM!TDbZ61a)aZj(bMXGE%4uC)CDDD zsPUiG@BDwnW=D0%iPcjV|rT1iQ?fxPJ~uaKIr%#0CM64oHLl7vI9;D zW;ag>C&vvTErhfXT3x??)?KurP)iGppf_S<9ZR$55+sJJ_mGn-laLicRtQ<~pvnrX zzf-73V1oZb%YOp@p~ZjbbOq7>8RZ}Jk*(?@=?ysY)O|5irbe=V}8M)l4m&LHh|BfU3P5Y@PN zBn=Rxt>e3N+>+=G8j8iiqYp`B!b0aGR}U$t&`bP(lQdrR=~;I43DZCVBW&NP@F#YQ=KY~@C< zDeA6gT5E(ioP@uRXmnrH*?aX8Uvl7uLumn&v-<+d-fNXsC`}3|(u3)6-)ogFSlJDI z=qA$?LT!yvJ6VQ;qUL%Q0@QXPw1TGga+ZUt@C@x0hy}ufppSEuLLWbX9#STdYDH@A zhYCQe@d*qrcQK*!zzYE9F@->e(~$0gM}~dWBM@O51Pesi5Fy6FU8z*|esgbN$)Xw^ z;5gWGA+$mIwo4_`iV;zh%QTJyjROaJpK{xA=adfCj+(I>ZY)-nkEB)9!A6pO&n_|x{n-OyfG&8U$Gp%>Qw_Mm1Q&{@p?y3_E2v#? 
zL+DqJ^z>Wtfz7Z|Exz|5UjZa&*n1D~P_N?ZDGlwm388blskr_8DCpNagx~A^@7}Nt z`eOsIOBB(f?rp*rRcxZzP}*&-szZ0SOm_&t={H9&!3o&w$<7GMCZP~h%3Mm zYSZW`?6VpNf8syFi+Bf)SBs-1u5R}sYlXwF-r5RanE5xm?h4CJc`U@qj?4?c5b3gU z9-A2_J4R3`?4|*G<9lGz=4tN&LFfLl1@t7KI!Eku#nt@K!tqpqdR@HoqkH@TH%PD( zb>Wf(XdTBRatJ#;?BUsN!&m1nmIeG&L#r?NYFdW(fYz5UexE|^bbz5xxLtSE^9nos z*gZV)`ws4JosJ#!3AO9~UcR7q@xkvqi0vAOsG+M~)PGnKnx8*(xHqsA`g1e!#yy=YQ5@@K z9(4S8jKz96s>Qg>W3`XKF~-y5F%mq*50-aY%kDDb^?S^X5gBkC*jgnII`?4AZLQa~ zl54=MSxV~8A)v>W0q2Y*U217Tv=Q*$GjJ9z4G`=G&z8aQi@rEuRxu3(3>|=iWueVuMF)MDzj{Wyu#!~1lJBfX6 z@q~r6Yjt{{7;`3M{7;5v=z(A72dj{S+9_tqc zikJhNtC6nvhETWVmIBG1NbaTg5iF8^uMYJfaL+8DVwET#I_7vAKJ~pN)*xNC8Dd?P zeCB>iu`uf$v6B(l&63a9OC-%Ek%p2|SqlERK@crI_G$pXvC zd3`4}s1aDcgthntHyS_oMKwrH)VWMg6p3BWLcZ(S&oGO|&ua6kg*(w}J;e)pg?^_& z|LBG94B;%xDB34zc5l$k^aO=rCzl-nPD~5JI!@12L+7;jofCLeLRMG z-bnPL&J{pjC-x92Xw;RCZk2UnMyb0QC5EB<=l?}^R(%rAefbMD^HUYKo7A+F8sK{9 zi1#OKkgiDBOHYfkGWfoR0p`BSJYVM`_Z!scPkqj53=(p`LmOd5*l9`x48_iG{RuOA z{vgpEil-YG^L#>Y^Ih}`abiXjp8tx{UEcct#9_OOa?c{qZot;#a!MtfH$Mw|8M1P6 zdtMG4PH)mexSwqVW{OZ^1Iqe==JmWpj=S$@3CgR6Ya4Og8Bv}I=L-A{zfSM`2^*wR zdma-%mnh);XyaG33h*>yw?d;obvmapNH-)n=FAd=O9>Bmc@*TYbKB-}xW{`HVT~SWs*$BRA1h+Zf`4i?) 
zeN78F-^D_*1)Lvz(ngiYUyc1M4+@0R~|HP$o70N9~o}cJN@J3DL@`w8;B1o^ZW^2s?@1x}m{rzMciM&vE!rF53=nyVc0b_SsNCu;l*bvmAr zs|j#U#k&5&do4YcV6zir-mu&`T@dr3|!kGRM~l=p_srKjEH7Pm=|$ zT14ycG=|ROPB?w5%<(1`yMnvs96^_mV*xZZ*BlbyJc{OfVwPfc!$FZmR5R7FMEBb8ycSVfHH z<+Mc2jPYX}uave|BS*>L@QE ziOEFp{Tk^v#tzGZHAPWr`6Ct-;6mW7FR9{R%IPwObnK&bjhcUE{XD<3a$B`0^Mbf$#FK$X%v+cw+M!vs{%E6-fWu z(cW&cmEB& zN6*(HSKiMHo2nZ+Fv(B&NC8kZ$^;SC!4(V(ex+PvTT9B)4dH18QR!y_y&@N!iNY0l zMwW|bjQsh6Gg}YUdO&y?oN*Q^-ZA!J%WZ3GedD$F|NPPyx%v%(UdXjHdwrm}kbY_Q zhF5ef<5G3@h9h6@GoCqO>jKt}(B=Kku^;?TY^tfmSJq0J3g`v-X!r?c%gAjp8n?{< zWATXJl#Y5R1ebc$LnXM-ZT-HA{ZIMipvJF1wKn{SgpW74)PyFtw(VH?TGfiI>Q&hR zT{Ww+Z@iIFwLIXAsO9!bU8OanI>1_yxpJA-UqDytT2CLbHW~ZTG(-4V&?)awJMI>0 z;{wFdQFls%AU?kSX7A0Kw#vG8T<11*1gV4=04V%Wj ze15{Bi{lqwcyUG1?S_snODZ;=g2X+FG3*7D_9H~OG|1W!mt>(!p>u3_bLpm<%Z0v6 z!5c^&Iti(KnvOtrpTpeG0oJ7lxp8*Mh+o&d^|J4hE=#6%h--HGk_qi26TVo3+*xHs z*4=Nn`YyvCj8pT!@7cS*)o8|NsJ#OfUGj-+sE6R!U$wQiELruHdgbNBRaajAsHCou ziCZ5O#hW_x>znv$oWi{l8hR_fTIlH!j6S*l7n6Q0HuJ;_ub!@|YLwdB$k^r8!IWPg zRdLjxz;O2)>>)OfMRZmz?z$E-v6EVa81X{Gp}*4CC4Yq!4fkFsU!ue0lm z>-j}@eX||jn4tre!c2c0UEvnp1{1V`8Zg?hZkh7R`tX#zsQCrBBGL-B9;)fGETd@; zj5i-+!1)yh<%dyDJ_el9+}5_Xb&Hqmo|mvMQG;u5qUPe_^qN&Vs7l>lH(pP#f)MkS za=G^I{w)~uct|J!YC?l5smN3Xer6-?HjENm01n8$?WKh4af2(SJ#M|OPkpRx%4663 zxJoAvE{l8ASIV+Uzpwqk;0>Ie(%iuL!p=Tb74=1zTFOgIRrVNn&MLPw)_bqk&Nx52 zbEnalET@*=tpBQ06NIh}joU)SKKT*IWrDl|NUaB&JNN*erzU90Z7r#}8=-#<&pd}K za{lT0?-W61!86_md*h86%yI;RU_UhEL$C;K$NeHk*RUr2wzaMG-Q1NQr!QNVxnf<$ z^3UJde&EaO=#+?7+oyZEY=!mTh?PwF33)XZ*S58BXipy--u#By|`Y zs;i7mkY64;hpD5&i|_}JHos|$~|iNk$KpucuT6RM+SJE9wQ8tNO?x(Zt9Yk08~V}S zueS!GH`}$LF7+cz%U$mkqLti2M(sZ{Ti96F#Xx3|+HC`W{!z7sAy~NOky#(vXTl`4 z5O}cxhC(Br%@BI!7)KTvuTnK^aJQh<6vBX52t}lXED#WN_Ga+?|t3w>KDiZRyQ=O>+>Faq) zGh&#?x&g1sdi>QfgQ)BOEv#G&N?GhlHlJ|}kXHj6)yP+a)?Ps97UC;V%60rP-TXG< zN)f7tjt-0Mdkn@?v=8I(-4NS9V7_k~G{L}&x&Z3R`FtpeY#p5tqybM@2_C z%;zeb`P^oHSBBf3Bb*aO*dm#YedgjUja)HnTEcuyqF$zm&q>hpn28zc6pc(VRh@y? 
zQ`BL8S>wZ_JFW4mC|7g5&DK9&r(WE(O+K=RHhFKD6LA6q0s{gA0s{gA0s{gA0s{gA z_Xh^P@H!@7x-%OS42up|I>rQ1&gy=eK3yXpoS3TC=^ur=419I;Y1+&fwH~oCBZh@3 zBf^Fci&931DOKS}Fh!f>^WAivHYq0&4myMRU}1_nB`tA*JaTB%(5NABzCl^xL&8); z!Xo9*DkJ5odVSUig<|2tg+tTW*aCk@K2)nqc~k}m9NU0pkxr8=3zI9OWnqqg^2mq? zRfIfQ?ucWqJ?x)6vokINu10QgN@cW1oC>+z)2X5{Y;wyR9tKY&_c)bj9pRDgT*0i&TLCo^dMB)jeGVe3#tgqTJgN8L13&hXrxNoM6%G)M@D& zos10zdYV~sFmoProperties->Maven->Lifecycle Mapping). +Nevertheless, maven-protoc-plugin isn't being run correctly. + +To work around this, you can download, build, and install a +user-made m2e extension for maven-protoc-plugin: + + http://www.masterzen.fr/2011/12/25/protobuf-maven-m2e-and-eclipse-are-on-a-boat + +See http://wiki.eclipse.org/M2E_plugin_execution_not_covered +for far more excruciating detail. + +## maven-eclipse-plugin (Maven to Eclipse) + +The maven-eclipse-plugin approach, despite being old +fashioned and largely unsupported, is easier to use. The +very first time you want to use it, run the following: + +$ mvn -Declipse.workspace= eclipse:configure-workspace + +This will add the M2_REPO classpath variable to Eclipse. You +can verify this in +Preferences->Java->Build Path->Classpath Variables. It +should be set to `/home//.m2/repository`. + +To generate the Eclipse project files, run: + +$ mvn eclipse:eclipse + +If you want to look at Javadoc/source in Eclipse for +dependent artifacts, run: + +$ mvn eclipse:eclipse -DdownloadJavadocs=true -DdownloadSources=true + +So what's the problem with maven-eclipse-plugin? The issue +lies with maven-protoc-plugin. Because all of our .proto +files are in src/kudu, the "resource path" in +maven-protoc-plugin must be absolute and prefixed with +${project.baseDir). This absolute path is copied verbatim +to an Eclipse .classpath , and Eclipse +doesn't know what to do with it, causing it avoid building +kudu-client altogether. Other plugins (like +maven-avro-plugin) don't seem to have this problem, so it's +likely a bug in maven-protoc-plugin. 
+ +There's a simple workaround: delete the errant folder within +Eclipse and refresh the kudu-client project. diff --git a/java/assembly.xml b/java/assembly.xml new file mode 100644 index 000000000000..08e1df05e69a --- /dev/null +++ b/java/assembly.xml @@ -0,0 +1,34 @@ + + + assemble + + jar + + false + + + ${project.basedir}/src + / + + + ${project.build.directory}/descriptor + /descriptor + + service.mdl + + + + diff --git a/java/interface-annotations/pom.xml b/java/interface-annotations/pom.xml new file mode 100644 index 000000000000..db0e8e72a7d3 --- /dev/null +++ b/java/interface-annotations/pom.xml @@ -0,0 +1,71 @@ + + + + 4.0.0 + + org.kududb + kudu-parent + 0.8.0-SNAPSHOT + + + interface-annotations + Kudu Annotations + + + + jdiff + jdiff + 1.0.9 + + + + + + jdk1.7 + + 1.7 + + + + jdk.tools + jdk.tools + 1.7 + system + ${java.home}/../lib/tools.jar + + + + + jdk1.8 + + 1.8 + + + + jdk.tools + jdk.tools + 1.8 + system + ${java.home}/../lib/tools.jar + + + + + + diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceAudience.java b/java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceAudience.java new file mode 100644 index 000000000000..b834947b4b28 --- /dev/null +++ b/java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceAudience.java @@ -0,0 +1,74 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.kududb.annotations; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +/** + * Annotation to inform users of a package, class or method's intended audience. + * Currently the audience can be {@link Public}, {@link LimitedPrivate} or + * {@link Private}.
+ * All public classes must have InterfaceAudience annotation.
+ *
    + *
  • Public classes that are not marked with this annotation must be + * considered by default as {@link Private}.
  • + * + *
  • External applications must only use classes that are marked + * {@link Public}. Avoid using non public classes as these classes + * could be removed or change in incompatible ways.
  • + * + *
  • Hadoop projects must only use classes that are marked + * {@link LimitedPrivate} or {@link Public}
  • + * + *
  • Methods may have a different annotation that it is more restrictive + * compared to the audience classification of the class. Example: A class + * might be {@link Public}, but a method may be {@link LimitedPrivate} + *
+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class InterfaceAudience { + /** + * Intended for use by any project or application. + */ + @Documented + @Retention(RetentionPolicy.RUNTIME) + public @interface Public {}; + + /** + * Intended only for the project(s) specified in the annotation. + * For example, "Common", "HDFS", "MapReduce", "ZooKeeper", "HBase". + */ + @Documented + @Retention(RetentionPolicy.RUNTIME) + public @interface LimitedPrivate { + String[] value(); + }; + + /** + * Intended for use only within Kudu itself. + */ + @Documented + @Retention(RetentionPolicy.RUNTIME) + public @interface Private {}; + + private InterfaceAudience() {} // Audience can't exist on its own +} diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceStability.java b/java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceStability.java new file mode 100644 index 000000000000..84950e6f8c65 --- /dev/null +++ b/java/interface-annotations/src/main/java/org/kududb/annotations/InterfaceStability.java @@ -0,0 +1,66 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.kududb.annotations; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +import org.kududb.annotations.InterfaceAudience.LimitedPrivate; +import org.kududb.annotations.InterfaceAudience.Private; +import org.kududb.annotations.InterfaceAudience.Public; + +/** + * Annotation to inform users of how much to rely on a particular package, + * class or method not changing over time. Currently the stability can be + * {@link Stable}, {@link Evolving} or {@link Unstable}.
+ * + *
  • All classes that are annotated with {@link Public} or + * {@link LimitedPrivate} must have InterfaceStability annotation.
  • + *
  • Classes that are {@link Private} are to be considered unstable unless + * a different InterfaceStability annotation states otherwise.
  • + *
  • Incompatible changes must not be made to classes marked as stable.
  • + *
+ */ +@Public +@InterfaceStability.Evolving +public class InterfaceStability { + /** + * Can evolve while retaining compatibility for minor release boundaries.; + * can break compatibility only at major release (ie. at m.0). + */ + @Documented + @Retention(RetentionPolicy.RUNTIME) + public @interface Stable {}; + + /** + * Evolving, but can break compatibility at minor release (i.e. m.x) + */ + @Documented + @Retention(RetentionPolicy.RUNTIME) + public @interface Evolving {}; + + /** + * No guarantee is provided as to reliability or stability across any + * level of release granularity. + */ + @Documented + @Retention(RetentionPolicy.RUNTIME) + public @interface Unstable {}; +} diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsJDiffDoclet.java b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsJDiffDoclet.java new file mode 100644 index 000000000000..5c0c7b8ee4b0 --- /dev/null +++ b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsJDiffDoclet.java @@ -0,0 +1,61 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.kududb.annotations.tools; + +import com.sun.javadoc.DocErrorReporter; +import com.sun.javadoc.LanguageVersion; +import com.sun.javadoc.RootDoc; + +import jdiff.JDiff; +import org.kududb.annotations.InterfaceAudience; + +/** + * A Doclet + * for excluding elements that are annotated with + * {@link InterfaceAudience.Private} or + * {@link InterfaceAudience.LimitedPrivate}. + * It delegates to the JDiff Doclet, and takes the same options. + */ +public class ExcludePrivateAnnotationsJDiffDoclet { + + public static LanguageVersion languageVersion() { + return LanguageVersion.JAVA_1_5; + } + + public static boolean start(RootDoc root) { + System.out.println( + ExcludePrivateAnnotationsJDiffDoclet.class.getSimpleName()); + return JDiff.start(RootDocProcessor.process(root)); + } + + public static int optionLength(String option) { + Integer length = StabilityOptions.optionLength(option); + if (length != null) { + return length; + } + return JDiff.optionLength(option); + } + + public static boolean validOptions(String[][] options, + DocErrorReporter reporter) { + StabilityOptions.validOptions(options, reporter); + String[][] filteredOptions = StabilityOptions.filterOptions(options); + return JDiff.validOptions(filteredOptions, reporter); + } +} diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsStandardDoclet.java b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsStandardDoclet.java new file mode 100644 index 000000000000..af8b088bf509 --- /dev/null +++ b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/ExcludePrivateAnnotationsStandardDoclet.java @@ -0,0 +1,60 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.kududb.annotations.tools; + +import com.sun.javadoc.DocErrorReporter; +import com.sun.javadoc.LanguageVersion; +import com.sun.javadoc.RootDoc; +import com.sun.tools.doclets.standard.Standard; +import org.kududb.annotations.InterfaceAudience; + +/** + * A Doclet + * for excluding elements that are annotated with + * {@link InterfaceAudience.Private} or + * {@link InterfaceAudience.LimitedPrivate}. + * It delegates to the Standard Doclet, and takes the same options. 
+ */ +public class ExcludePrivateAnnotationsStandardDoclet { + + public static LanguageVersion languageVersion() { + return LanguageVersion.JAVA_1_5; + } + + public static boolean start(RootDoc root) { + System.out.println( + ExcludePrivateAnnotationsStandardDoclet.class.getSimpleName()); + return Standard.start(RootDocProcessor.process(root)); + } + + public static int optionLength(String option) { + Integer length = StabilityOptions.optionLength(option); + if (length != null) { + return length; + } + return Standard.optionLength(option); + } + + public static boolean validOptions(String[][] options, + DocErrorReporter reporter) { + StabilityOptions.validOptions(options, reporter); + String[][] filteredOptions = StabilityOptions.filterOptions(options); + return Standard.validOptions(filteredOptions, reporter); + } +} diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/tools/IncludePublicAnnotationsStandardDoclet.java b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/IncludePublicAnnotationsStandardDoclet.java new file mode 100644 index 000000000000..b5a67b246143 --- /dev/null +++ b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/IncludePublicAnnotationsStandardDoclet.java @@ -0,0 +1,65 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.kududb.annotations.tools; + +import com.sun.javadoc.DocErrorReporter; +import com.sun.javadoc.LanguageVersion; +import com.sun.javadoc.RootDoc; +import com.sun.tools.doclets.standard.Standard; +import org.kududb.annotations.InterfaceAudience; + +/** + * A Doclet + * that only includes class-level elements that are annotated with + * {@link InterfaceAudience.Public}. + * Class-level elements with no annotation are excluded. + * In addition, all elements that are annotated with + * {@link InterfaceAudience.Private} or + * {@link InterfaceAudience.LimitedPrivate} + * are also excluded. + * It delegates to the Standard Doclet, and takes the same options. + */ +public class IncludePublicAnnotationsStandardDoclet { + + public static LanguageVersion languageVersion() { + return LanguageVersion.JAVA_1_5; + } + + public static boolean start(RootDoc root) { + System.out.println( + IncludePublicAnnotationsStandardDoclet.class.getSimpleName()); + RootDocProcessor.treatUnannotatedClassesAsPrivate = true; + return Standard.start(RootDocProcessor.process(root)); + } + + public static int optionLength(String option) { + Integer length = StabilityOptions.optionLength(option); + if (length != null) { + return length; + } + return Standard.optionLength(option); + } + + public static boolean validOptions(String[][] options, + DocErrorReporter reporter) { + StabilityOptions.validOptions(options, reporter); + String[][] filteredOptions = StabilityOptions.filterOptions(options); + return Standard.validOptions(filteredOptions, reporter); + } +} diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/tools/RootDocProcessor.java b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/RootDocProcessor.java new file mode 100644 index 000000000000..c4f19fbbb7fc --- /dev/null +++ 
package org.kududb.annotations.tools;

import com.sun.javadoc.AnnotationDesc;
import com.sun.javadoc.AnnotationTypeDoc;
import com.sun.javadoc.ClassDoc;
import com.sun.javadoc.ConstructorDoc;
import com.sun.javadoc.Doc;
import com.sun.javadoc.FieldDoc;
import com.sun.javadoc.MethodDoc;
import com.sun.javadoc.PackageDoc;
import com.sun.javadoc.ProgramElementDoc;
import com.sun.javadoc.RootDoc;

import java.lang.reflect.Array;
import java.lang.reflect.InvocationHandler;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.lang.reflect.Proxy;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;

import org.kududb.annotations.InterfaceAudience;
import org.kududb.annotations.InterfaceStability;

/**
 * Process the {@link RootDoc} by substituting with (nested) proxy objects that
 * exclude elements with Private or LimitedPrivate annotations.
 * <p>
 * Based on code from http://www.sixlegs.com/blog/java/exclude-javadoc-tag.html.
 */
class RootDocProcessor {

  // Stability level being documented; set by StabilityOptions.validOptions()
  // from the -stable/-evolving/-unstable flags. Controls which
  // InterfaceStability annotations cause exclusion in exclude().
  static String stability = StabilityOptions.UNSTABLE_OPTION;
  // When true (set by IncludePublicAnnotationsStandardDoclet.start()), classes,
  // interfaces and annotation types carrying no audience annotation at all
  // are excluded as if they were private.
  static boolean treatUnannotatedClassesAsPrivate = false;

  /**
   * Wraps the given root doc in a filtering proxy tree.
   *
   * @param root the javadoc root to filter
   * @return a RootDoc whose traversal skips excluded elements
   */
  public static RootDoc process(RootDoc root) {
    return (RootDoc) process(root, RootDoc.class);
  }

  /**
   * Recursively proxies any javadoc object (package com.sun.*) and any array
   * of such objects; everything else is passed through unchanged.
   *
   * @param obj value returned by a javadoc API call (may be null)
   * @param type static type of that value, used to pick array component types
   */
  private static Object process(Object obj, Class type) {
    if (obj == null) {
      return null;
    }
    Class cls = obj.getClass();
    if (cls.getName().startsWith("com.sun.")) {
      // A javadoc model object: substitute the caching dynamic proxy.
      return getProxy(obj);
    } else if (obj instanceof Object[]) {
      // Prefer the declared component type; fall back to the runtime one.
      Class componentType = type.isArray() ? type.getComponentType()
          : cls.getComponentType();
      Object[] array = (Object[]) obj;
      Object[] newArray = (Object[]) Array.newInstance(componentType,
          array.length);
      for (int i = 0; i < array.length; ++i) {
        newArray[i] = process(array[i], componentType);
      }
      return newArray;
    }
    return obj;
  }

  // Cache of proxies, one per underlying javadoc object. WeakHashMap keys
  // let entries disappear when javadoc releases the originals.
  private static Map proxies =
      new WeakHashMap();

  /**
   * Returns the (possibly cached) dynamic proxy for a javadoc object. The
   * proxy implements all of the object's interfaces via ExcludeHandler.
   */
  private static Object getProxy(Object obj) {
    Object proxy = proxies.get(obj);
    if (proxy == null) {
      proxy = Proxy.newProxyInstance(obj.getClass().getClassLoader(),
          obj.getClass().getInterfaces(), new ExcludeHandler(obj));
      proxies.put(obj, proxy);
    }
    return proxy;
  }

  /**
   * Invocation handler that filters excluded elements out of the javadoc
   * object graph, and proxies every value flowing back to the caller.
   */
  private static class ExcludeHandler implements InvocationHandler {
    // The real javadoc object this proxy stands in for.
    private Object target;

    public ExcludeHandler(Object target) {
      this.target = target;
    }

    @Override
    public Object invoke(Object proxy, Method method, Object[] args)
        throws Throwable {
      String methodName = method.getName();
      if (target instanceof Doc) {
        if (methodName.equals("isIncluded")) {
          Doc doc = (Doc) target;
          // Excluded elements report themselves as not included.
          return !exclude(doc) && doc.isIncluded();
        }
        // Intercept the enumeration methods of each Doc subtype so that
        // excluded members never appear in the returned arrays.
        if (target instanceof RootDoc) {
          if (methodName.equals("classes")) {
            return filter(((RootDoc) target).classes(), ClassDoc.class);
          } else if (methodName.equals("specifiedClasses")) {
            return filter(((RootDoc) target).specifiedClasses(), ClassDoc.class);
          } else if (methodName.equals("specifiedPackages")) {
            return filter(((RootDoc) target).specifiedPackages(), PackageDoc.class);
          }
        } else if (target instanceof ClassDoc) {
          if (isFiltered(args)) {
            // The boolean arg requests "filtered" members; always answer
            // with the filter applied to the full (true) member set.
            if (methodName.equals("methods")) {
              return filter(((ClassDoc) target).methods(true), MethodDoc.class);
            } else if (methodName.equals("fields")) {
              return filter(((ClassDoc) target).fields(true), FieldDoc.class);
            } else if (methodName.equals("innerClasses")) {
              return filter(((ClassDoc) target).innerClasses(true),
                  ClassDoc.class);
            } else if (methodName.equals("constructors")) {
              return filter(((ClassDoc) target).constructors(true),
                  ConstructorDoc.class);
            }
          }
        } else if (target instanceof PackageDoc) {
          if (methodName.equals("allClasses")) {
            if (isFiltered(args)) {
              return filter(((PackageDoc) target).allClasses(true),
                  ClassDoc.class);
            } else {
              return filter(((PackageDoc) target).allClasses(), ClassDoc.class);
            }
          } else if (methodName.equals("annotationTypes")) {
            return filter(((PackageDoc) target).annotationTypes(),
                AnnotationTypeDoc.class);
          } else if (methodName.equals("enums")) {
            return filter(((PackageDoc) target).enums(),
                ClassDoc.class);
          } else if (methodName.equals("errors")) {
            return filter(((PackageDoc) target).errors(),
                ClassDoc.class);
          } else if (methodName.equals("exceptions")) {
            return filter(((PackageDoc) target).exceptions(),
                ClassDoc.class);
          } else if (methodName.equals("interfaces")) {
            return filter(((PackageDoc) target).interfaces(),
                ClassDoc.class);
          } else if (methodName.equals("ordinaryClasses")) {
            return filter(((PackageDoc) target).ordinaryClasses(),
                ClassDoc.class);
          }
        }
      }

      if (args != null) {
        // Comparison-style methods receive other (possibly proxied) docs;
        // unwrap so the underlying javadoc implementation sees real objects.
        if (methodName.equals("compareTo") || methodName.equals("equals")
            || methodName.equals("overrides")
            || methodName.equals("subclassOf")) {
          args[0] = unwrap(args[0]);
        }
      }
      try {
        // Delegate, then proxy/filter whatever comes back.
        return process(method.invoke(target, args), method.getReturnType());
      } catch (InvocationTargetException e) {
        // Re-throw the real cause rather than the reflection wrapper.
        throw e.getTargetException();
      }
    }

    /**
     * Decides whether a doc element should be hidden, based on its audience
     * annotations and the configured stability level.
     */
    private static boolean exclude(Doc doc) {
      AnnotationDesc[] annotations = null;
      if (doc instanceof ProgramElementDoc) {
        annotations = ((ProgramElementDoc) doc).annotations();
      } else if (doc instanceof PackageDoc) {
        annotations = ((PackageDoc) doc).annotations();
      }
      if (annotations != null) {
        // First pass: any exclusion-triggering annotation wins immediately.
        for (AnnotationDesc annotation : annotations) {
          String qualifiedTypeName = annotation.annotationType().qualifiedTypeName();
          if (qualifiedTypeName.equals(
              InterfaceAudience.Private.class.getCanonicalName())
              || qualifiedTypeName.equals(
              InterfaceAudience.LimitedPrivate.class.getCanonicalName())) {
            return true;
          }
          if (stability.equals(StabilityOptions.EVOLVING_OPTION)) {
            // Documenting "evolving": hide only Unstable elements.
            if (qualifiedTypeName.equals(
                InterfaceStability.Unstable.class.getCanonicalName())) {
              return true;
            }
          }
          if (stability.equals(StabilityOptions.STABLE_OPTION)) {
            // Documenting "stable": hide Unstable and Evolving elements.
            if (qualifiedTypeName.equals(
                InterfaceStability.Unstable.class.getCanonicalName())
                || qualifiedTypeName.equals(
                InterfaceStability.Evolving.class.getCanonicalName())) {
              return true;
            }
          }
        }
        // Second pass: an explicit Public annotation keeps the element even
        // when unannotated classes are being treated as private below.
        for (AnnotationDesc annotation : annotations) {
          String qualifiedTypeName =
              annotation.annotationType().qualifiedTypeName();
          if (qualifiedTypeName.equals(
              InterfaceAudience.Public.class.getCanonicalName())) {
            return false;
          }
        }
      }
      if (treatUnannotatedClassesAsPrivate) {
        // Only class-level elements are hidden by default; members fall through.
        return doc.isClass() || doc.isInterface() || doc.isAnnotationType();
      }
      return false;
    }

    /**
     * Returns a new array of the given component type containing proxies for
     * the non-excluded entries of the input array.
     */
    private static Object[] filter(Doc[] array, Class componentType) {
      if (array == null || array.length == 0) {
        return array;
      }
      List list = new ArrayList(array.length);
      for (Doc entry : array) {
        if (!exclude(entry)) {
          list.add(process(entry, componentType));
        }
      }
      return list.toArray((Object[]) Array.newInstance(componentType, list
          .size()));
    }

    /** Replaces one of our proxies with the javadoc object it wraps. */
    private Object unwrap(Object proxy) {
      if (proxy instanceof Proxy)
        return ((ExcludeHandler) Proxy.getInvocationHandler(proxy)).target;
      return proxy;
    }

    /** True when the first argument is the boolean flag Boolean.TRUE. */
    private boolean isFiltered(Object[] args) {
      return args != null && Boolean.TRUE.equals(args[0]);
    }

  }

}
+ */ +package org.kududb.annotations.tools; + +import com.sun.javadoc.DocErrorReporter; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +class StabilityOptions { + public static final String STABLE_OPTION = "-stable"; + public static final String EVOLVING_OPTION = "-evolving"; + public static final String UNSTABLE_OPTION = "-unstable"; + + public static Integer optionLength(String option) { + String opt = option.toLowerCase(Locale.ENGLISH); + if (opt.equals(UNSTABLE_OPTION)) return 1; + if (opt.equals(EVOLVING_OPTION)) return 1; + if (opt.equals(STABLE_OPTION)) return 1; + return null; + } + + public static void validOptions(String[][] options, + DocErrorReporter reporter) { + for (int i = 0; i < options.length; i++) { + String opt = options[i][0].toLowerCase(Locale.ENGLISH); + if (opt.equals(UNSTABLE_OPTION)) { + RootDocProcessor.stability = UNSTABLE_OPTION; + } else if (opt.equals(EVOLVING_OPTION)) { + RootDocProcessor.stability = EVOLVING_OPTION; + } else if (opt.equals(STABLE_OPTION)) { + RootDocProcessor.stability = STABLE_OPTION; + } + } + } + + public static String[][] filterOptions(String[][] options) { + List optionsList = new ArrayList(); + for (int i = 0; i < options.length; i++) { + if (!options[i][0].equalsIgnoreCase(UNSTABLE_OPTION) + && !options[i][0].equalsIgnoreCase(EVOLVING_OPTION) + && !options[i][0].equalsIgnoreCase(STABLE_OPTION)) { + optionsList.add(options[i]); + } + } + String[][] filteredOptions = new String[optionsList.size()][]; + int i = 0; + for (String[] option : optionsList) { + filteredOptions[i++] = option; + } + return filteredOptions; + } + +} diff --git a/java/interface-annotations/src/main/java/org/kududb/annotations/tools/package-info.java b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/package-info.java new file mode 100644 index 000000000000..ec0103b19ad6 --- /dev/null +++ b/java/interface-annotations/src/main/java/org/kududb/annotations/tools/package-info.java @@ 
-0,0 +1,22 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +@InterfaceAudience.Private +package org.kududb.annotations.tools; + +import org.kududb.annotations.InterfaceAudience; diff --git a/java/kudu-client-tools/pom.xml b/java/kudu-client-tools/pom.xml new file mode 100644 index 000000000000..00a4d8736494 --- /dev/null +++ b/java/kudu-client-tools/pom.xml @@ -0,0 +1,107 @@ + + + + 4.0.0 + + org.kududb + kudu-parent + 0.8.0-SNAPSHOT + + + kudu-client-tools + Collection of tools that interact directly with Kudu + + + + org.kududb + kudu-client + ${project.version} + test-jar + test + + + org.kududb + kudu-mapreduce + ${project.version} + + + org.kududb + kudu-mapreduce + ${project.version} + test-jar + test + + + log4j + log4j + ${log4j.version} + + + com.stumbleupon + async + ${async.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + org.slf4j + slf4j-api + ${slf4j.version} + + + junit + junit + ${junit.version} + test + + + + + + + org.apache.maven.plugins + maven-failsafe-plugin + + + org.apache.maven.plugins + maven-assembly-plugin + ${maven-assembly-plugin.version} + + + jar-with-dependencies + + true + + + + package + + single + + + + + + + diff --git 
a/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/CsvParser.java b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/CsvParser.java new file mode 100644 index 000000000000..945b82ca0c87 --- /dev/null +++ b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/CsvParser.java @@ -0,0 +1,162 @@ +/** + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. + */ +package org.kududb.mapreduce.tools; + +import com.google.common.base.Preconditions; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.Bytes; + +import java.util.ArrayList; +import java.util.List; + +/** + * Column-separated values parser that gives access to the different columns inside each line of + * data. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class CsvParser { + + private final byte separatorByte; + + private final int maxColumnCount; + + private final List columnNames; + + /** + * @param columnsSpecification the list of columns to parse out, comma separated. + * @param separatorStr The 1 byte separator. 
+ */ + public CsvParser(String columnsSpecification, String separatorStr) { + // Configure separator + byte[] separator = Bytes.fromString(separatorStr); + Preconditions.checkArgument(separator.length == 1, "CsvParser only supports single-byte " + + "separators"); + separatorByte = separator[0]; + + // Configure columns + columnNames = Lists.newArrayList(Splitter.on(',').trimResults().split(columnsSpecification)); + + maxColumnCount = columnNames.size(); + } + + /** + * Creates a ParsedLine of a line of data. + * @param lineBytes Whole line as a byte array. + * @param length How long the line really is in the byte array + * @return A parsed line of CSV. + * @throws BadCsvLineException + */ + public ParsedLine parse(byte[] lineBytes, int length) throws BadCsvLineException { + // Enumerate separator offsets + List tabOffsets = new ArrayList(maxColumnCount); + for (int i = 0; i < length; i++) { + if (lineBytes[i] == separatorByte) { + tabOffsets.add(i); + } + } + if (tabOffsets.isEmpty()) { + throw new BadCsvLineException("No delimiter"); + } + + // trailing separator shouldn't count as a column + if (lineBytes[length - 1] != separatorByte) { + tabOffsets.add(length); + } + + if (tabOffsets.size() > maxColumnCount) { + throw new BadCsvLineException("Excessive columns"); + } + + if (tabOffsets.size() < maxColumnCount) { + throw new BadCsvLineException("Not enough columns"); + } + + return new ParsedLine(tabOffsets, lineBytes); + } + + /** + * Helper class that knows where the columns are situated in the line. + */ + class ParsedLine { + private final List tabOffsets; + private final byte[] lineBytes; + + ParsedLine(List tabOffsets, byte[] lineBytes) { + this.tabOffsets = tabOffsets; + this.lineBytes = lineBytes; + } + + /** + * Get the position for the given column. + * @param idx Column to lookup. + * @return Offset in the line. 
+ */ + public int getColumnOffset(int idx) { + if (idx > 0) { + return tabOffsets.get(idx - 1) + 1; + } else { + return 0; + } + } + + /** + * Get how many bytes the given column occupies. + * @param idx Column to lookup. + * @return Column's length. + */ + public int getColumnLength(int idx) { + return tabOffsets.get(idx) - getColumnOffset(idx); + } + + /** + * Get the number of columns in this file. + * @return Number of columns. + */ + public int getColumnCount() { + return tabOffsets.size(); + } + + /** + * Get the bytes originally given for this line. + * @return Original byte array. + */ + public byte[] getLineBytes() { + return lineBytes; + } + + /** + * Get the given column's name. + * @param idx Column to lookup. + * @return Column's name. + */ + public String getColumnName(int idx) { + return columnNames.get(idx); + } + } + + /** + * Exception used when the CsvParser is unable to parse a line. + */ + @SuppressWarnings("serial") + public static class BadCsvLineException extends Exception { + public BadCsvLineException(String err) { + super(err); + } + } +} diff --git a/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsv.java b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsv.java new file mode 100644 index 000000000000..f407c5cff64c --- /dev/null +++ b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsv.java @@ -0,0 +1,116 @@ +/** + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
See accompanying LICENSE file. + */ +package org.kududb.mapreduce.tools; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.mapreduce.CommandLineParser; +import org.kududb.mapreduce.KuduTableMapReduceUtil; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import java.io.IOException; + +/** + * Map-only job that reads CSV files and inserts them into a single Kudu table. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class ImportCsv extends Configured implements Tool { + + public static enum Counters { BAD_LINES }; + + static final String NAME = "importcsv"; + static final String DEFAULT_SEPARATOR = "\t"; + static final String SEPARATOR_CONF_KEY = "importcsv.separator"; + static final String JOB_NAME_CONF_KEY = "importcsv.job.name"; + static final String SKIP_LINES_CONF_KEY = "importcsv.skip.bad.lines"; + static final String COLUMNS_NAMES_KEY = "importcsv.column.names"; + + /** + * Sets up the actual job. + * + * @param conf The current configuration. + * @param args The command line parameters. + * @return The newly created job. + * @throws java.io.IOException When setting up the job fails. 
+ */ + @SuppressWarnings("deprecation") + public static Job createSubmittableJob(Configuration conf, String[] args) + throws IOException, ClassNotFoundException { + + Class mapperClass = ImportCsvMapper.class; + conf.set(COLUMNS_NAMES_KEY, args[0]); + String tableName = args[1]; + Path inputDir = new Path(args[2]); + + String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName); + Job job = new Job(conf, jobName); + job.setJarByClass(mapperClass); + FileInputFormat.setInputPaths(job, inputDir); + job.setInputFormatClass(TextInputFormat.class); + job.setMapperClass(mapperClass); + job.setNumReduceTasks(0); + new KuduTableMapReduceUtil.TableOutputFormatConfiguratorWithCommandLineParser( + job, + tableName) + .configure(); + return job; + } + + /* + * @param errorMsg Error message. Can be null. + */ + private static void usage(final String errorMsg) { + if (errorMsg != null && errorMsg.length() > 0) { + System.err.println("ERROR: " + errorMsg); + } + String usage = + "Usage: " + NAME + " \n\n" + + "Imports the given input directory of CSV data into the specified table.\n" + + "\n" + + "The column names of the CSV data must be specified in the form of " + + "comma-separated column names.\n" + + "Other options that may be specified with -D include:\n" + + " -D" + SKIP_LINES_CONF_KEY + "=false - fail if encountering an invalid line\n" + + " '-D" + SEPARATOR_CONF_KEY + "=|' - eg separate on pipes instead of tabs\n" + + " -D" + JOB_NAME_CONF_KEY + "=jobName - use the specified mapreduce job name for the" + + " import.\n" + + CommandLineParser.getHelpSnippet(); + + System.err.println(usage); + } + + @Override + public int run(String[] otherArgs) throws Exception { + if (otherArgs.length < 3) { + usage("Wrong number of arguments: " + otherArgs.length); + return -1; + } + Job job = createSubmittableJob(getConf(), otherArgs); + return job.waitForCompletion(true) ? 
0 : 1; + } + + public static void main(String[] args) throws Exception { + int status = ToolRunner.run(new ImportCsv(), args); + System.exit(status); + } +} diff --git a/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsvMapper.java b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsvMapper.java new file mode 100644 index 000000000000..21f43b59ed2b --- /dev/null +++ b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/ImportCsvMapper.java @@ -0,0 +1,143 @@ +/** + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. + */ +package org.kududb.mapreduce.tools; + +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.*; +import org.kududb.mapreduce.KuduTableMapReduceUtil; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.Mapper; + +import java.io.IOException; + +/** + * Mapper that ingests CSV lines and turns them into Kudu Inserts. 
/**
 * Mapper that ingests CSV lines and turns them into Kudu Inserts.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ImportCsvMapper extends Mapper {

  // Output key is never read; only the Insert value matters downstream.
  private static final NullWritable NULL_KEY = NullWritable.get();

  /** Column separator. */
  private String separator;

  /** Should skip bad lines (count them) rather than failing the task. */
  private boolean skipBadLines;
  private Counter badLineCount;

  private CsvParser parser;

  private KuduTable table;
  private Schema schema;

  /**
   * Handles initializing this class with objects specific to it (i.e., the parser).
   */
  @Override
  protected void setup(Context context) {
    Configuration conf = context.getConfiguration();

    this.separator = conf.get(ImportCsv.SEPARATOR_CONF_KEY);
    if (this.separator == null) {
      this.separator = ImportCsv.DEFAULT_SEPARATOR;
    }

    // Bad lines are skipped by default; set importcsv.skip.bad.lines=false to fail fast.
    this.skipBadLines = conf.getBoolean(ImportCsv.SKIP_LINES_CONF_KEY, true);
    this.badLineCount = context.getCounter(ImportCsv.Counters.BAD_LINES);

    this.parser = new CsvParser(conf.get(ImportCsv.COLUMNS_NAMES_KEY), this.separator);

    this.table = KuduTableMapReduceUtil.getTableFromContext(context);
    this.schema = this.table.getSchema();
  }

  /**
   * Convert a line of CSV text into a Kudu Insert.
   *
   * @param offset byte offset of the line in the input file (used in error messages)
   * @param value the line's text; getBytes() may return a buffer longer than
   *        the logical line, hence the explicit getLength() passed to parse()
   * @param context MapReduce context the Insert is written to
   */
  @Override
  public void map(LongWritable offset, Text value,
                  Context context)
      throws IOException {
    byte[] lineBytes = value.getBytes();

    try {
      CsvParser.ParsedLine parsed = this.parser.parse(lineBytes, value.getLength());

      Insert insert = this.table.newInsert();
      PartialRow row = insert.getRow();
      // Convert each CSV column to the table column's declared type.
      for (int i = 0; i < parsed.getColumnCount(); i++) {
        String colName = parsed.getColumnName(i);
        ColumnSchema col = this.schema.getColumn(colName);
        String colValue = Bytes.getString(parsed.getLineBytes(), parsed.getColumnOffset(i),
            parsed.getColumnLength(i));
        switch (col.getType()) {
          case BOOL:
            row.addBoolean(colName, Boolean.parseBoolean(colValue));
            break;
          case INT8:
            row.addByte(colName, Byte.parseByte(colValue));
            break;
          case INT16:
            row.addShort(colName, Short.parseShort(colValue));
            break;
          case INT32:
            row.addInt(colName, Integer.parseInt(colValue));
            break;
          case INT64:
            row.addLong(colName, Long.parseLong(colValue));
            break;
          case STRING:
            row.addString(colName, colValue);
            break;
          case FLOAT:
            row.addFloat(colName, Float.parseFloat(colValue));
            break;
          case DOUBLE:
            row.addDouble(colName, Double.parseDouble(colValue));
            break;
          default:
            throw new IllegalArgumentException("Type " + col.getType() + " not recognized");
        }
      }
      context.write(NULL_KEY, insert);
    } catch (CsvParser.BadCsvLineException badLine) {
      // Unparseable line: count and skip, or fail the task, per configuration.
      if (this.skipBadLines) {
        System.err.println("Bad line at offset: " + offset.get() + ":\n" + badLine.getMessage());
        this.badLineCount.increment(1);
        return;
      } else {
        throw new IOException("Failing task because of a bad line", badLine);
      }
    } catch (IllegalArgumentException e) {
      // Covers number-format failures and unknown column types above.
      if (this.skipBadLines) {
        System.err.println("Bad line at offset: " + offset.get() + ":\n" + e.getMessage());
        this.badLineCount.increment(1);
        return;
      } else {
        throw new IOException("Failing task because of an illegal argument", e);
      }
    } catch (InterruptedException e) {
      throw new IOException("Failing task since it was interrupted", e);
    }
  }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. + */ +package org.kududb.mapreduce.tools; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.*; +import org.kududb.mapreduce.CommandLineParser; +import org.kududb.mapreduce.KuduTableMapReduceUtil; +import org.kududb.util.Pair; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.CounterGroup; +import org.apache.hadoop.mapreduce.Counters; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.RecordReader; +import 
org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.math.BigInteger; +import java.security.SecureRandom; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; + +/** + *

+ * This is an integration test borrowed from goraci, written by Keith Turner, + * which is in turn inspired by the Accumulo test called continuous ingest (ci). + * The original source code can be found here: + *

+ * + * + *

+ * Apache Accumulo has a simple test suite that verifies that data is not + * lost at scale. This test suite is called continuous ingest. This test runs + * many ingest clients that continually create linked lists containing 25 + * million nodes. At some point the clients are stopped and a map reduce job is + * run to ensure no linked list has a hole. A hole indicates data was lost. + *

+ * + *

+ * The nodes in the linked list are random. This causes each linked list to + * spread across the table. Therefore if one part of a table loses data, then it + * will be detected by references in another part of the table. + *

+ * + *

+ * THE ANATOMY OF THE TEST + *

+ * + *

+ * Below is rough sketch of how data is written. For specific details look at + * the Generator code. + *

+ *
    + *
  1. + * Write out 1 million nodes + *
  2. + *
  3. + * Flush the client + *
  4. + *
  5. + * Write out 1 million that reference previous million + *
  6. + *
  7. + * If this is the 25th set of 1 million nodes, then update 1st set of million to point to last + *
  8. + *
  9. + * Goto 1 + *
  10. + *
+ * + *

+ * The key is that nodes only reference flushed nodes. Therefore a node should + * never reference a missing node, even if the ingest client is killed at any + * point in time. + *

+ * + *

+ * When running this test suite w/ Accumulo there is a script running in + * parallel called the Agitator that randomly and continuously kills server + * processes. The outcome was that many data loss bugs were found in Accumulo + * by doing this. This test suite can also help find bugs that impact uptime + * and stability when run for days or weeks. + *

+ * + *

+ * This test suite consists the following: + *

+ *
    + *
  • + * A few Java programs + *
  • + *
  • + * A little helper script to run the java programs + *
  • + *
  • + * A maven script to build it. + *
  • + *
+ * + *

+ * When generating data, it's best to have each map task generate a multiple of + * 25 million. The reason for this is that circular linked lists are generated + * every 25M. Not generating a multiple of 25M will result in some nodes in the + * linked list not having references. The loss of an unreferenced node cannot + * be detected. + *

+ * + *

+ * Below is a description of the Java programs + *

+ * + *
    + *
  • + * Generator - A map only job that generates data. As stated previously, + * its best to generate data in multiples of 25M. + *
  • + *
  • + * Verify - A map reduce job that looks for holes. Look at the counts after running. REFERENCED and + * UNREFERENCED are ok, any UNDEFINED counts are bad. Do not run at the same + * time as the Generator. + *
  • + *
  • + * Print - A standalone program that prints nodes in the linked list + *
  • + *
  • + * Delete - Disabled. A standalone program that deletes a single node + *
  • + *
  • + * Walker - Disabled. A standalone program that starts following a linked list and emits timing + * info. + *
  • + *
+ * + *

+ * KUDU-SPECIFIC CHANGES + *

+ * + *
    + *
  • + * The 16 bytes row key is divided into two 8 byte long since we don't have a "bytes" type in + * Kudu. Note that the C++ client can store bytes directly in string columns. Using longs + * enables us to pretty print human readable keys than can then be passed back just as easily. + *
  • + *
  • + * The table can be pre-split when running the Generator. The row keys' first component will be + * spread over the Long.MIN_VALUE - Long.MAX_VALUE keyspace. + *
  • + *
  • + * The Walker and Deleter programs were disabled to save some time but they can be re-enabled then + * ported to Kudu without too much effort. + *
  • + *
+ */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class IntegrationTestBigLinkedList extends Configured implements Tool { + private static final byte[] NO_KEY = new byte[1]; + + protected static final String TABLE_NAME_KEY = "IntegrationTestBigLinkedList.table"; + + protected static final String DEFAULT_TABLE_NAME = "IntegrationTestBigLinkedList"; + + protected static final String HEADS_TABLE_NAME_KEY = "IntegrationTestBigLinkedList.heads_table"; + + protected static final String DEFAULT_HEADS_TABLE_NAME = "IntegrationTestBigLinkedListHeads"; + + /** Row key, two times 8 bytes. */ + private static final String COLUMN_KEY_ONE = "key1"; + private static final String COLUMN_KEY_TWO = "key2"; + + /** Link to the id of the prev node in the linked list, two times 8 bytes. */ + private static final String COLUMN_PREV_ONE = "prev1"; + private static final String COLUMN_PREV_TWO = "prev2"; + + /** identifier of the mapred task that generated this row. */ + private static final String COLUMN_CLIENT = "client"; + + /** the id of the row within the same client. */ + private static final String COLUMN_ROW_ID = "row_id"; + + /** The number of times this row was updated. */ + private static final String COLUMN_UPDATE_COUNT = "update_count"; + + /** How many rows to write per map task. This has to be a multiple of 25M. 
*/ + private static final String GENERATOR_NUM_ROWS_PER_MAP_KEY + = "IntegrationTestBigLinkedList.generator.num_rows"; + + private static final String GENERATOR_NUM_MAPPERS_KEY + = "IntegrationTestBigLinkedList.generator.map.tasks"; + + private static final String GENERATOR_WIDTH_KEY + = "IntegrationTestBigLinkedList.generator.width"; + + private static final String GENERATOR_WRAP_KEY + = "IntegrationTestBigLinkedList.generator.wrap"; + + private static final int WIDTH_DEFAULT = 1000000; + private static final int WRAP_DEFAULT = 25; + private static final int ROWKEY_LENGTH = 16; + + private String toRun; + private String[] otherArgs; + + static class CINode { + String key; + String prev; + String client; + long rowId; + int updateCount; + } + + static Schema getTableSchema() { + List columns = new ArrayList(7); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_KEY_ONE, Type.INT64) + .key(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_KEY_TWO, Type.INT64) + .key(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_PREV_ONE, Type.INT64) + .nullable(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_PREV_TWO, Type.INT64) + .nullable(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_ROW_ID, Type.INT64) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_CLIENT, Type.STRING) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_UPDATE_COUNT, Type.INT32) + .build()); + return new Schema(columns); + } + + static Schema getHeadsTableSchema() { + List columns = new ArrayList(2); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_KEY_ONE, Type.INT64) + .key(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder(COLUMN_KEY_TWO, Type.INT64) + .key(true) + .build()); + return new Schema(columns); + } + + /** + * A Map only job that generates random linked list and stores them. 
+ */ + static class Generator extends Configured implements Tool { + + private static final Log LOG = LogFactory.getLog(Generator.class); + + static class GeneratorInputFormat extends InputFormat { + static class GeneratorInputSplit extends InputSplit implements Writable { + @Override + public long getLength() throws IOException, InterruptedException { + return 1; + } + @Override + public String[] getLocations() throws IOException, InterruptedException { + return new String[0]; + } + @Override + public void readFields(DataInput arg0) throws IOException { + } + @Override + public void write(DataOutput arg0) throws IOException { + } + } + + static class GeneratorRecordReader extends RecordReader { + private long count; + private long numNodes; + private Random rand; + + @Override + public void close() throws IOException { + } + + @Override + public BytesWritable getCurrentKey() throws IOException, InterruptedException { + byte[] bytes = new byte[ROWKEY_LENGTH]; + rand.nextBytes(bytes); + return new BytesWritable(bytes); + } + + @Override + public NullWritable getCurrentValue() throws IOException, InterruptedException { + return NullWritable.get(); + } + + @Override + public float getProgress() throws IOException, InterruptedException { + return (float)(count / (double)numNodes); + } + + @Override + public void initialize(InputSplit arg0, TaskAttemptContext context) + throws IOException, InterruptedException { + numNodes = context.getConfiguration().getLong(GENERATOR_NUM_ROWS_PER_MAP_KEY, 25000000); + // Use SecureRandom to avoid issue described in HBASE-13382. 
+ rand = new SecureRandom(); + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + return count++ < numNodes; + } + + } + + @Override + public RecordReader createRecordReader( + InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + GeneratorRecordReader rr = new GeneratorRecordReader(); + rr.initialize(split, context); + return rr; + } + + @Override + public List getSplits(JobContext job) throws IOException, InterruptedException { + int numMappers = job.getConfiguration().getInt(GENERATOR_NUM_MAPPERS_KEY, 1); + + ArrayList splits = new ArrayList(numMappers); + + for (int i = 0; i < numMappers; i++) { + splits.add(new GeneratorInputSplit()); + } + + return splits; + } + } + + /** Ensure output files from prev-job go to map inputs for current job */ + static class OneFilePerMapperSFIF extends SequenceFileInputFormat { + @Override + protected boolean isSplitable(JobContext context, Path filename) { + return false; + } + } + + /** + * Some ASCII art time: + * [ . . . ] represents one batch of random longs of length WIDTH + * + * _________________________ + * | ______ | + * | | || + * __+_________________+_____ || + * v v v ||| + * first = [ . . . . . . . . . . . ] ||| + * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ||| + * | | | | | | | | | | | ||| + * prev = [ . . . . . . . . . . . ] ||| + * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ||| + * | | | | | | | | | | | ||| + * current = [ . . . . . . . . . . . ] ||| + * ||| + * ... ||| + * ||| + * last = [ . . . . . . . . . . . 
] ||| + * | | | | | | | | | | |-----||| + * | |--------|| + * |___________________________| + */ + static class GeneratorMapper + extends Mapper { + + private byte[][] first = null; + private byte[][] prev = null; + private byte[][] current = null; + private String id; + private long rowId = 0; + private int i; + private KuduClient client; + private KuduTable table; + private KuduSession session; + private KuduTable headsTable; + private long numNodes; + private long wrap; + private int width; + + @Override + protected void setup(Context context) throws IOException, InterruptedException { + id = "Job: " + context.getJobID() + " Task: " + context.getTaskAttemptID(); + Configuration conf = context.getConfiguration(); + CommandLineParser parser = new CommandLineParser(conf); + client = parser.getClient(); + try { + table = client.openTable(getTableName(conf)); + headsTable = client.openTable(getHeadsTable(conf)); + } catch (Exception e) { + throw new IOException(e); + } + session = client.newSession(); + session.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH); + session.setMutationBufferSpace(WIDTH_DEFAULT); + session.setIgnoreAllDuplicateRows(true); + + this.width = context.getConfiguration().getInt(GENERATOR_WIDTH_KEY, WIDTH_DEFAULT); + current = new byte[this.width][]; + int wrapMultiplier = context.getConfiguration().getInt(GENERATOR_WRAP_KEY, WRAP_DEFAULT); + this.wrap = (long)wrapMultiplier * width; + this.numNodes = context.getConfiguration().getLong( + GENERATOR_NUM_ROWS_PER_MAP_KEY, (long)WIDTH_DEFAULT * WRAP_DEFAULT); + if (this.numNodes < this.wrap) { + this.wrap = this.numNodes; + } + } + + @Override + protected void cleanup(Context context) throws IOException, InterruptedException { + try { + session.close(); + client.shutdown(); + } catch (Exception ex) { + // ugh. 
+ throw new IOException(ex); + } + } + + @Override + protected void map(BytesWritable key, NullWritable value, Context output) throws IOException { + current[i] = new byte[key.getLength()]; + System.arraycopy(key.getBytes(), 0, current[i], 0, key.getLength()); + if (++i == current.length) { + persist(output, current, false); + i = 0; + + // Keep track of the first row so that we can point to it at the end. + if (first == null) { + first = current; + } + prev = current; + current = new byte[this.width][]; + + rowId += current.length; + output.setStatus("Count " + rowId); + + // Check if it's time to wrap up this batch. + if (rowId % wrap == 0) { + // this block of code turns the 1 million linked list of length 25 into one giant + // circular linked list of 25 million. + circularLeftShift(first); + + persist(output, first, true); + + Operation insert = headsTable.newInsert(); + PartialRow row = insert.getRow(); + row.addLong(COLUMN_KEY_ONE, Bytes.getLong(first[0])); + row.addLong(COLUMN_KEY_TWO, Bytes.getLong(first[0], 8)); + try { + session.apply(insert); + session.flush(); + } catch (Exception e) { + throw new IOException("Couldn't flush the head row, " + insert, e); + } + + first = null; + prev = null; + } + } + } + + private static void circularLeftShift(T[] first) { + T ez = first[0]; + for (int i = 0; i < first.length - 1; i++) + first[i] = first[i + 1]; + first[first.length - 1] = ez; + } + + private void persist(Context output, byte[][] data, boolean update) + throws IOException { + try { + for (int i = 0; i < data.length; i++) { + Operation put = update ? table.newUpdate() : table.newInsert(); + PartialRow row = put.getRow(); + + long keyOne = Bytes.getLong(data[i]); + long keyTwo = Bytes.getLong(data[i], 8); + + row.addLong(COLUMN_KEY_ONE, keyOne); + row.addLong(COLUMN_KEY_TWO, keyTwo); + + // prev is null for the first line, we'll update it at the end. 
+ if (prev == null) { + row.setNull(COLUMN_PREV_ONE); + row.setNull(COLUMN_PREV_TWO); + } else { + row.addLong(COLUMN_PREV_ONE, Bytes.getLong(prev[i])); + row.addLong(COLUMN_PREV_TWO, Bytes.getLong(prev[i], 8)); + } + + if (!update) { + // We only add those for new inserts, we don't update the heads with a new row, etc. + row.addLong(COLUMN_ROW_ID, rowId + i); + row.addString(COLUMN_CLIENT, id); + row.addInt(COLUMN_UPDATE_COUNT, 0); + } + session.apply(put); + + if (i % 1000 == 0) { + // Tickle progress every so often else maprunner will think us hung + output.progress(); + } + } + + session.flush(); + } catch (Exception ex) { + throw new IOException(ex); + } + } + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 4) { + System.out.println("Usage : " + Generator.class.getSimpleName() + + " [ ]"); + System.out.println(" where should be a multiple of " + + " width*wrap multiplier, 25M by default"); + return 0; + } + + int numMappers = Integer.parseInt(args[0]); + long numNodes = Long.parseLong(args[1]); + int numTablets = Integer.parseInt(args[2]); + Path tmpOutput = new Path(args[3]); + Integer width = (args.length < 5) ? null : Integer.parseInt(args[4]); + Integer wrapMuplitplier = (args.length < 6) ? 
null : Integer.parseInt(args[5]); + return run(numMappers, numNodes, numTablets, tmpOutput, width, wrapMuplitplier); + } + + protected void createTables(int numTablets) throws Exception { + + createSchema(getTableName(getConf()), getTableSchema(), numTablets); + createSchema(getHeadsTable(getConf()), getHeadsTableSchema(), numTablets); + } + + protected void createSchema(String tableName, Schema schema, int numTablets) throws Exception { + CommandLineParser parser = new CommandLineParser(getConf()); + KuduClient client = parser.getClient(); + try { + if (numTablets < 1) { + numTablets = 1; + } + + if (client.tableExists(tableName)) { + return; + } + + CreateTableOptions builder = + new CreateTableOptions().setNumReplicas(parser.getNumReplicas()); + if (numTablets > 1) { + BigInteger min = BigInteger.valueOf(Long.MIN_VALUE); + BigInteger max = BigInteger.valueOf(Long.MAX_VALUE); + BigInteger step = max.multiply(BigInteger.valueOf(2)).divide(BigInteger.valueOf + (numTablets)); + LOG.info(min.longValue()); + LOG.info(max.longValue()); + LOG.info(step.longValue()); + PartialRow splitRow = schema.newPartialRow(); + splitRow.addLong("key2", Long.MIN_VALUE); + for (int i = 1; i < numTablets; i++) { + long key = min.add(step.multiply(BigInteger.valueOf(i))).longValue(); + LOG.info("key " + key); + splitRow.addLong("key1", key); + builder.addSplitRow(splitRow); + } + } + + client.createTable(tableName, schema, builder); + } finally { + // Done with this client. 
+ client.shutdown(); + } + } + + public int runRandomInputGenerator(int numMappers, long numNodes, Path tmpOutput, + Integer width, Integer wrapMuplitplier) throws Exception { + LOG.info("Running RandomInputGenerator with numMappers=" + numMappers + + ", numNodes=" + numNodes); + Job job = new Job(getConf()); + + job.setJobName("Random Input Generator"); + job.setNumReduceTasks(0); + job.setJarByClass(getClass()); + + job.setInputFormatClass(GeneratorInputFormat.class); + job.setOutputKeyClass(BytesWritable.class); + job.setOutputValueClass(NullWritable.class); + + setJobConf(job, numMappers, numNodes, width, wrapMuplitplier); + + job.setMapperClass(Mapper.class); //identity mapper + + FileOutputFormat.setOutputPath(job, tmpOutput); + job.setOutputFormatClass(SequenceFileOutputFormat.class); + + boolean success = job.waitForCompletion(true); + + return success ? 0 : 1; + } + + public int runGenerator(int numMappers, long numNodes, int numTablets, Path tmpOutput, + Integer width, Integer wrapMuplitplier) throws Exception { + LOG.info("Running Generator with numMappers=" + numMappers +", numNodes=" + numNodes); + createTables(numTablets); + + Job job = new Job(getConf()); + + job.setJobName("Link Generator"); + job.setNumReduceTasks(0); + job.setJarByClass(getClass()); + + FileInputFormat.setInputPaths(job, tmpOutput); + job.setInputFormatClass(OneFilePerMapperSFIF.class); + job.setOutputKeyClass(NullWritable.class); + job.setOutputValueClass(NullWritable.class); + + setJobConf(job, numMappers, numNodes, width, wrapMuplitplier); + + job.setMapperClass(GeneratorMapper.class); + + job.setOutputFormatClass(NullOutputFormat.class); + + job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); + // If we fail, retrying will fail again in case we were able to flush at least once since + // we'll be creating duplicate rows. Better to just have one try. 
+ job.getConfiguration().setInt("mapreduce.map.maxattempts", 1); + // Lack of YARN-445 means we can't auto-jstack on timeout, so disabling the timeout gives + // us a chance to do it manually. + job.getConfiguration().setInt("mapreduce.task.timeout", 0); + KuduTableMapReduceUtil.addDependencyJars(job); + + boolean success = job.waitForCompletion(true); + + return success ? 0 : 1; + } + + public int run(int numMappers, long numNodes, int numTablets, Path tmpOutput, + Integer width, Integer wrapMuplitplier) throws Exception { + int ret = runRandomInputGenerator(numMappers, numNodes, tmpOutput, width, wrapMuplitplier); + if (ret > 0) { + return ret; + } + return runGenerator(numMappers, numNodes, numTablets, tmpOutput, width, wrapMuplitplier); + } + } + + /** + * A Map Reduce job that verifies that the linked lists generated by + * {@link Generator} do not have any holes. + */ + static class Verify extends Configured implements Tool { + + private static final Log LOG = LogFactory.getLog(Verify.class); + private static final BytesWritable DEF = new BytesWritable(NO_KEY); + private static final Joiner COMMA_JOINER = Joiner.on(","); + private static final byte[] rowKey = new byte[ROWKEY_LENGTH]; + private static final byte[] prev = new byte[ROWKEY_LENGTH]; + + private Job job; + + public static class VerifyMapper extends Mapper { + private BytesWritable row = new BytesWritable(); + private BytesWritable ref = new BytesWritable(); + + @Override + protected void map(NullWritable key, RowResult value, Mapper.Context context) + throws IOException ,InterruptedException { + Bytes.setLong(rowKey, value.getLong(0)); + Bytes.setLong(rowKey, value.getLong(1), 8); + + row.set(rowKey, 0, rowKey.length); + // Emit that the row is defined + context.write(row, DEF); + if (value.isNull(2)) { + LOG.warn(String.format("Prev is not set for: %s", Bytes.pretty(rowKey))); + } else { + Bytes.setLong(prev, value.getLong(2)); + Bytes.setLong(prev, value.getLong(3), 8); + ref.set(prev, 0, 
prev.length); + // Emit which row is referenced by this row. + context.write(ref, row); + } + } + } + + public enum Counts { + UNREFERENCED, UNDEFINED, REFERENCED, EXTRAREFERENCES + } + + public static class VerifyReducer extends Reducer { + private ArrayList refs = new ArrayList(); + + private AtomicInteger rows = new AtomicInteger(0); + + @Override + public void reduce(BytesWritable key, Iterable values, Context context) + throws IOException, InterruptedException { + + int defCount = 0; + + refs.clear(); + // We only expect two values, a DEF and a reference, but there might be more. + for (BytesWritable type : values) { + if (type.getLength() == DEF.getLength()) { + defCount++; + } else { + byte[] bytes = new byte[type.getLength()]; + System.arraycopy(type.getBytes(), 0, bytes, 0, type.getLength()); + refs.add(bytes); + } + } + + // TODO check for more than one def, should not happen + + List refsList = new ArrayList<>(refs.size()); + String keyString = null; + if (defCount == 0 || refs.size() != 1) { + for (byte[] ref : refs) { + refsList.add(COMMA_JOINER.join(Bytes.getLong(ref), Bytes.getLong(ref, 8))); + } + keyString = COMMA_JOINER.join(Bytes.getLong(key.getBytes()), + Bytes.getLong(key.getBytes(), 8)); + + LOG.error("Linked List error: Key = " + keyString + " References = " + refsList); + } + + if (defCount == 0 && refs.size() > 0) { + // this is bad, found a node that is referenced but not defined. It must have been + // lost, emit some info about this node for debugging purposes. 
+ context.write(new Text(keyString), new Text(refsList.toString())); + context.getCounter(Counts.UNDEFINED).increment(1); + } else if (defCount > 0 && refs.size() == 0) { + // node is defined but not referenced + context.write(new Text(keyString), new Text("none")); + context.getCounter(Counts.UNREFERENCED).increment(1); + } else { + if (refs.size() > 1) { + if (refsList != null) { + context.write(new Text(keyString), new Text(refsList.toString())); + } + context.getCounter(Counts.EXTRAREFERENCES).increment(refs.size() - 1); + } + // node is defined and referenced + context.getCounter(Counts.REFERENCED).increment(1); + } + + } + } + + @Override + public int run(String[] args) throws Exception { + + if (args.length != 2) { + System.out.println("Usage : " + Verify.class.getSimpleName() + " "); + return 0; + } + + String outputDir = args[0]; + int numReducers = Integer.parseInt(args[1]); + + return run(outputDir, numReducers); + } + + public int run(String outputDir, int numReducers) throws Exception { + return run(new Path(outputDir), numReducers); + } + + public int run(Path outputDir, int numReducers) throws Exception { + LOG.info("Running Verify with outputDir=" + outputDir +", numReducers=" + numReducers); + + job = new Job(getConf()); + + job.setJobName("Link Verifier"); + job.setNumReduceTasks(numReducers); + job.setJarByClass(getClass()); + + Joiner columnsToQuery = Joiner.on(","); + + new KuduTableMapReduceUtil.TableInputFormatConfiguratorWithCommandLineParser( + job, getTableName(getConf()), + columnsToQuery.join(COLUMN_KEY_ONE, COLUMN_KEY_TWO, COLUMN_PREV_ONE, COLUMN_PREV_TWO)) + .configure(); + job.setMapperClass(VerifyMapper.class); + job.setMapOutputKeyClass(BytesWritable.class); + job.setMapOutputValueClass(BytesWritable.class); + job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); + + job.setReducerClass(VerifyReducer.class); + job.setOutputFormatClass(TextOutputFormat.class); + TextOutputFormat.setOutputPath(job, 
outputDir); + + boolean success = job.waitForCompletion(true); + + return success ? 0 : 1; + } + + @SuppressWarnings("deprecation") + public boolean verify(long expectedReferenced) throws Exception { + if (job == null) { + throw new IllegalStateException("You should call run() first"); + } + + Counters counters = job.getCounters(); + + Counter referenced = counters.findCounter(Counts.REFERENCED); + Counter unreferenced = counters.findCounter(Counts.UNREFERENCED); + Counter undefined = counters.findCounter(Counts.UNDEFINED); + Counter multiref = counters.findCounter(Counts.EXTRAREFERENCES); + + boolean success = true; + //assert + if (expectedReferenced != referenced.getValue()) { + LOG.error("Expected referenced count does not match with actual referenced count. " + + "Expected referenced=" + expectedReferenced + ", actual=" + referenced.getValue()); + success = false; + } + + if (unreferenced.getValue() > 0) { + boolean couldBeMultiRef = (multiref.getValue() == unreferenced.getValue()); + LOG.error("Unreferenced nodes were not expected. Unreferenced count=" + unreferenced.getValue() + + (couldBeMultiRef ? "; could be due to duplicate random numbers" : "")); + success = false; + } + + if (undefined.getValue() > 0) { + LOG.error("Found an undefined node. Undefined count=" + undefined.getValue()); + success = false; + } + + // TODO Add the rows' location on failure. 
+ if (!success) { + //Configuration conf = job.getConfiguration(); + //HConnection conn = HConnectionManager.getConnection(conf); + //TableName tableName = getTableName(conf); + CounterGroup g = counters.getGroup("undef"); + Iterator it = g.iterator(); + while (it.hasNext()) { + String keyString = it.next().getName(); + //byte[] key = Bytes.toBytes(keyString); + //HRegionLocation loc = conn.relocateRegion(tableName, key); + LOG.error("undefined row " + keyString /*+ ", " + loc*/); + } + g = counters.getGroup("unref"); + it = g.iterator(); + while (it.hasNext()) { + String keyString = it.next().getName(); + //byte[] key = Bytes.toBytes(keyString); + //HRegionLocation loc = conn.relocateRegion(tableName, key); + LOG.error("unreferred row " + keyString /*+ ", " + loc*/); + } + } + return success; + } + } + + /** + * Executes Generate and Verify in a loop. Data is not cleaned between runs, so each iteration + * adds more data. + */ + static class Loop extends Configured implements Tool { + + private static final Log LOG = LogFactory.getLog(Loop.class); + + IntegrationTestBigLinkedList it; + + FileSystem fs; + + protected void runGenerator(int numMappers, long numNodes, int numTablets, + String outputDir, Integer width, Integer wrapMuplitplier) throws Exception { + Path outputPath = new Path(outputDir); + UUID uuid = UUID.randomUUID(); //create a random UUID. 
+ Path generatorOutput = new Path(outputPath, uuid.toString()); + + Generator generator = new Generator(); + generator.setConf(getConf()); + int retCode = generator.run(numMappers, numNodes, numTablets, generatorOutput, width, + wrapMuplitplier); + if (retCode > 0) { + throw new RuntimeException("Generator failed with return code: " + retCode); + } + fs.delete(generatorOutput, true); + } + + protected void runVerify(String outputDir, + int numReducers, + long expectedNumNodes, + int retries) throws Exception { + // Kudu doesn't fully support snapshot consistency so we might start reading from a node that + // doesn't have all the data. This happens often with under "chaos monkey"-type of setups. + for (int i = 0; i < retries; i++) { + if (i > 0) { + long sleep = 60 * 1000; + LOG.info("Retrying in " + sleep + "ms"); + Thread.sleep(sleep); + } + + Path outputPath = new Path(outputDir); + UUID uuid = UUID.randomUUID(); //create a random UUID. + Path iterationOutput = new Path(outputPath, uuid.toString()); + + Verify verify = new Verify(); + verify.setConf(getConf()); + int retCode = verify.run(iterationOutput, numReducers); + if (retCode > 0) { + LOG.warn("Verify.run failed with return code: " + retCode); + } else if (!verify.verify(expectedNumNodes)) { + LOG.warn("Verify.verify failed"); + } else { + fs.delete(iterationOutput, true); + LOG.info("Verify finished with success. 
Total nodes=" + expectedNumNodes); + } + } + throw new RuntimeException("Ran out of retries to verify"); + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 6) { + System.err.println("Usage: Loop " + + " [ " + + "]"); + return 1; + } + LOG.info("Running Loop with args:" + Arrays.deepToString(args)); + + int numIterations = Integer.parseInt(args[0]); + int numMappers = Integer.parseInt(args[1]); + long numNodes = Long.parseLong(args[2]); + int numTablets = Integer.parseInt(args[3]); + String outputDir = args[4]; + int numReducers = Integer.parseInt(args[5]); + int width = (args.length < 6) ? null : Integer.parseInt(args[6]); + int wrapMuplitplier = (args.length < 8) ? null : Integer.parseInt(args[7]); + long expectedNumNodes = (args.length < 9) ? 0 : Long.parseLong(args[8]); + int numVerifyRetries = (args.length < 10) ? 3 : Integer.parseInt(args[9]); + + if (numIterations < 0) { + numIterations = Integer.MAX_VALUE; // run indefinitely (kind of) + } + + fs = FileSystem.get(getConf()); + + for (int i = 0; i < numIterations; i++) { + LOG.info("Starting iteration = " + i); + runGenerator(numMappers, numNodes, numTablets, outputDir, width, wrapMuplitplier); + expectedNumNodes += numMappers * numNodes; + + runVerify(outputDir, numReducers, expectedNumNodes, numVerifyRetries); + } + + return 0; + } + } + + /** + * A stand alone program that prints out portions of a list created by {@link Generator} + */ + private static class Print extends Configured implements Tool { + @Override + public int run(String[] args) throws Exception { + Options options = new Options(); + options.addOption("s", "start", true, "start key, only the first component"); + options.addOption("e", "end", true, "end key (exclusive), only the first component"); + options.addOption("l", "limit", true, "number to print"); + + GnuParser parser = new GnuParser(); + CommandLine cmd = null; + try { + cmd = parser.parse(options, args); + if (cmd.getArgs().length != 0) { + 
throw new ParseException("Command takes no arguments"); + } + } catch (ParseException e) { + System.err.println("Failed to parse command line " + e.getMessage()); + System.err.println(); + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(getClass().getSimpleName(), options); + System.exit(-1); + } + + CommandLineParser cmdLineParser = new CommandLineParser(getConf()); + long timeout = cmdLineParser.getOperationTimeoutMs(); + KuduClient client = cmdLineParser.getClient(); + + KuduTable table = client.openTable(getTableName(getConf())); + KuduScanner.KuduScannerBuilder builder = + client.newScannerBuilder(table) + .scanRequestTimeout(timeout); + + + if (cmd.hasOption("s")) { + PartialRow row = table.getSchema().newPartialRow(); + row.addLong(0, Long.parseLong(cmd.getOptionValue("s"))); + builder.lowerBound(row); + } + if (cmd.hasOption("e")) { + PartialRow row = table.getSchema().newPartialRow(); + row.addLong(0, Long.parseLong(cmd.getOptionValue("e"))); + builder.exclusiveUpperBound(row); + } + + int limit = cmd.hasOption("l") ? Integer.parseInt(cmd.getOptionValue("l")) : 100; + + int count = 0; + + KuduScanner scanner = builder.build(); + while (scanner.hasMoreRows() && count < limit) { + RowResultIterator rowResults = scanner.nextRows(); + count = printNodesAndGetNewCount(count, limit, rowResults); + } + RowResultIterator rowResults = scanner.close(); + printNodesAndGetNewCount(count, limit, rowResults); + + client.shutdown(); + + return 0; + } + + private static int printNodesAndGetNewCount(int oldCount, int limit, + RowResultIterator rowResults) { + int newCount = oldCount; + if (rowResults == null) { + return newCount; + } + + CINode node = new CINode(); + for (RowResult result : rowResults) { + newCount++; + node = getCINode(result, node); + printCINodeString(node); + if (newCount == limit) { + break; + } + } + return newCount; + } + } + + /** + * This tool needs to be run separately from the Generator-Verify loop. 
It can run while the + * other two are running or in between loops. + * + * Each mapper scans a "heads" table and, for each row, follows the circular linked list and + * updates their counter until it reaches the head of the list again. + */ + private static class Updater extends Configured implements Tool { + + private static final Log LOG = LogFactory.getLog(Updater.class); + + private static final String MAX_LINK_UPDATES_PER_MAPPER = "kudu.updates.per.mapper"; + + public enum Counts { + // Stats on what we're updating. + UPDATED_LINKS, + UPDATED_NODES, + FIRST_UPDATE, + SECOND_UPDATE, + THIRD_UPDATE, + FOURTH_UPDATE, + MORE_THAN_FOUR_UPDATES, + // Stats on what's broken. + BROKEN_LINKS, + BAD_UPDATE_COUNTS + } + + public static class UpdaterMapper extends Mapper { + private KuduClient client; + private KuduTable table; + private KuduSession session; + + /** + * Schema we use when getting rows from the linked list, we only need the reference and + * its update count. + */ + private final List SCAN_COLUMN_NAMES = ImmutableList.of( + COLUMN_PREV_ONE, COLUMN_PREV_TWO, COLUMN_UPDATE_COUNT, COLUMN_CLIENT); + + private long numUpdatesPerMapper; + + /** + * Processing each linked list takes minutes, meaning that it's easily possible for our + * scanner to timeout. Instead, we gather all the linked list heads that we need and + * process them all at once in the first map invocation. 
+ */ + private List> headsCache; + + @Override + protected void setup(Context context) throws IOException, InterruptedException { + Configuration conf = context.getConfiguration(); + CommandLineParser parser = new CommandLineParser(conf); + client = parser.getClient(); + try { + table = client.openTable(getTableName(conf)); + } catch (Exception e) { + throw new IOException("Couldn't open the linked list table", e); + } + session = client.newSession(); + + Schema tableSchema = table.getSchema(); + + + numUpdatesPerMapper = conf.getLong(MAX_LINK_UPDATES_PER_MAPPER, 1); + headsCache = new ArrayList>((int)numUpdatesPerMapper); + } + + @Override + protected void map(NullWritable key, RowResult value, Mapper.Context context) + throws IOException, InterruptedException { + // Add as many heads as we need, then we skip the rest. + do { + if (headsCache.size() < numUpdatesPerMapper) { + value = (RowResult)context.getCurrentValue(); + headsCache.add(new Pair(value.getLong(0), value.getLong(1))); + } + } while (context.nextKeyValue()); + + // At this point we've exhausted the scanner and hopefully gathered all the linked list + // heads we needed. + LOG.info("Processing " + headsCache.size() + + " linked lists, out of " + numUpdatesPerMapper); + processAllHeads(context); + } + + private void processAllHeads(Mapper.Context context) throws IOException { + for (Pair value : headsCache) { + processHead(value, context); + } + } + + private void processHead(Pair head, Mapper.Context context) throws IOException { + long headKeyOne = head.getFirst(); + long headKeyTwo = head.getSecond(); + long prevKeyOne = headKeyOne; + long prevKeyTwo = headKeyTwo; + int currentCount = -1; + int newCount = -1; + String client = null; + + // Always printing this out, really useful when debugging. 
+ LOG.info("Head: " + getStringFromKeys(headKeyOne, headKeyTwo)); + + do { + RowResult prev = nextNode(prevKeyOne, prevKeyTwo); + if (prev == null) { + context.getCounter(Counts.BROKEN_LINKS).increment(1); + LOG.warn(getStringFromKeys(prevKeyOne, prevKeyTwo) + " doesn't exist"); + break; + } + + // It's possible those columns are null, let's not break trying to read them. + if (prev.isNull(0) || prev.isNull(1)) { + context.getCounter(Counts.BROKEN_LINKS).increment(1); + LOG.warn(getStringFromKeys(prevKeyOne, prevKeyTwo) + " isn't referencing anywhere"); + break; + } + + int prevCount = prev.getInt(2); + String prevClient = prev.getString(3); + if (currentCount == -1) { + // First time we loop we discover what the count was and set the new one. + currentCount = prevCount; + newCount = currentCount + 1; + client = prevClient; + } + + if (prevCount != currentCount) { + context.getCounter(Counts.BAD_UPDATE_COUNTS).increment(1); + LOG.warn(getStringFromKeys(prevKeyOne, prevKeyTwo) + " has a wrong updateCount, " + + prevCount + " instead of " + currentCount); + // Game over, there's corruption. + break; + } + + if (!prevClient.equals(client)) { + context.getCounter(Counts.BROKEN_LINKS).increment(1); + LOG.warn(getStringFromKeys(prevKeyOne, prevKeyTwo) + " has the wrong client, " + + "bad reference? Bad client= " + prevClient); + break; + } + + updateRow(prevKeyOne, prevKeyTwo, newCount); + context.getCounter(Counts.UPDATED_NODES).increment(1); + if (prevKeyOne % 10 == 0) { + context.progress(); + } + prevKeyOne = prev.getLong(0); + prevKeyTwo = prev.getLong(1); + } while (headKeyOne != prevKeyOne && headKeyTwo != prevKeyTwo); + + updateStatCounters(context, newCount); + context.getCounter(Counts.UPDATED_LINKS).increment(1); + } + + /** + * Finds the next node in the linked list. 
+ */ + private RowResult nextNode(long prevKeyOne, long prevKeyTwo) throws IOException { + KuduScanner.KuduScannerBuilder builder = client.newScannerBuilder(table) + .setProjectedColumnNames(SCAN_COLUMN_NAMES); + + configureScannerForRandomRead(builder, table, prevKeyOne, prevKeyTwo); + + try { + return getOneRowResult(builder.build()); + } catch (Exception e) { + // Goes right out and fails the job. + throw new IOException("Couldn't read the following row: " + + getStringFromKeys(prevKeyOne, prevKeyTwo), e); + } + } + + private void updateRow(long keyOne, long keyTwo, int newCount) throws IOException { + Update update = table.newUpdate(); + PartialRow row = update.getRow(); + row.addLong(COLUMN_KEY_ONE, keyOne); + row.addLong(COLUMN_KEY_TWO, keyTwo); + row.addInt(COLUMN_UPDATE_COUNT, newCount); + try { + session.apply(update); + } catch (Exception e) { + // Goes right out and fails the job. + throw new IOException("Couldn't update the following row: " + + getStringFromKeys(keyOne, keyTwo), e); + } + } + + /** + * We keep some statistics about the linked list we update so that we can get a feel of + * what's being updated. + */ + private void updateStatCounters(Mapper.Context context, int newCount) { + switch (newCount) { + case -1: + case 0: + // TODO We didn't event get the first node? + break; + case 1: + context.getCounter(Counts.FIRST_UPDATE).increment(1); + break; + case 2: + context.getCounter(Counts.SECOND_UPDATE).increment(1); + break; + case 3: + context.getCounter(Counts.THIRD_UPDATE).increment(1); + break; + case 4: + context.getCounter(Counts.FOURTH_UPDATE).increment(1); + break; + default: + context.getCounter(Counts.MORE_THAN_FOUR_UPDATES).increment(1); + break; + } + } + + @Override + protected void cleanup(Context context) throws IOException, InterruptedException { + try { + session.close(); + client.shutdown(); + } catch (Exception ex) { + // Goes right out and fails the job. 
+ throw new IOException("Coulnd't close the scanner after the task completed", ex); + } + } + } + + public int run(long maxLinkUpdatesPerMapper) throws Exception { + LOG.info("Running Updater with maxLinkUpdatesPerMapper=" + maxLinkUpdatesPerMapper); + + Job job = new Job(getConf()); + + job.setJobName("Link Updater"); + job.setNumReduceTasks(0); + job.setJarByClass(getClass()); + + Joiner columnsToQuery = Joiner.on(","); + + new KuduTableMapReduceUtil.TableInputFormatConfiguratorWithCommandLineParser( + job, getHeadsTable(getConf()), + columnsToQuery.join(COLUMN_KEY_ONE, COLUMN_KEY_TWO)) + .configure(); + + job.setMapperClass(UpdaterMapper.class); + job.setMapOutputKeyClass(BytesWritable.class); + job.setMapOutputValueClass(BytesWritable.class); + job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); + // If something fails we want to exit ASAP. + job.getConfiguration().setInt("mapreduce.map.maxattempts", 1); + // Lack of YARN-445 means we can't auto-jstack on timeout, so disabling the timeout gives + // us a chance to do it manually. + job.getConfiguration().setInt("mapreduce.task.timeout", 0); + job.getConfiguration().setLong(MAX_LINK_UPDATES_PER_MAPPER, maxLinkUpdatesPerMapper); + + job.setOutputKeyClass(NullWritable.class); + job.setOutputValueClass(NullWritable.class); + job.setOutputFormatClass(NullOutputFormat.class); + + KuduTableMapReduceUtil.addDependencyJars(job); + + boolean success = job.waitForCompletion(true); + + Counters counters = job.getCounters(); + + if (success) { + // Let's not continue looping if we have broken linked lists. + Counter brokenLinks = counters.findCounter(Counts.BROKEN_LINKS); + Counter badUpdates = counters.findCounter(Counts.BAD_UPDATE_COUNTS); + if (brokenLinks.getValue() > 0 || badUpdates.getValue() > 0) { + LOG.error("Corruption was detected, see the job's counters. Ending the update loop."); + success = false; + } + } + return success ? 
0 : 1; + } + + @Override + public int run(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: Update "); + System.err.println(" where will be 'infinite' if passed a negative value" + + " or zero"); + return 1; + } + LOG.info("Running Loop with args:" + Arrays.deepToString(args)); + + int numIterations = Integer.parseInt(args[0]); + long maxUpdates = Long.parseLong(args[1]); + + if (numIterations <= 0) { + numIterations = Integer.MAX_VALUE; + } + + if (maxUpdates < 1) { + maxUpdates = 1; + } + + for (int i = 0; i < numIterations; i++) { + LOG.info("Starting iteration = " + i); + int ret = run(maxUpdates); + if (ret != 0) { + LOG.error("Can't continue updating, last run failed."); + return ret; + } + } + + return 0; + } + } + + /** + * A stand alone program that deletes a single node. + * TODO + */ + /*private static class Delete extends Configured implements Tool { + @Override + public int run(String[] args) throws Exception { + if (args.length != 1) { + System.out.println("Usage : " + Delete.class.getSimpleName() + " "); + return 0; + } + byte[] val = Bytes.toBytesBinary(args[0]); + + org.apache.hadoop.hbase.client.Delete delete + = new org.apache.hadoop.hbase.client.Delete(val); + + HTable table = new HTable(getConf(), getTableName(getConf())); + + table.delete(delete); + table.flushCommits(); + table.close(); + + System.out.println("Delete successful"); + return 0; + } + }*/ + + /** + * A stand alone program that follows a linked list created by {@link Generator} + * and prints timing info. 
+ * + */ + private static class Walker extends Configured implements Tool { + + private KuduClient client; + private KuduTable table; + + @Override + public int run(String[] args) throws IOException { + if (args.length < 1) { + System.err.println("Usage: Walker []"); + System.err.println(" where defaults to 100 nodes that will be printed out"); + return 1; + } + int maxNumNodes = 100; + if (args.length == 2) { + maxNumNodes = Integer.parseInt(args[1]); + } + System.out.println("Running Walker with args:" + Arrays.deepToString(args)); + + String[] keys = args[0].split(","); + if (keys.length != 2) { + System.err.println("The row key must be formatted like key1,key2"); + return 1; + } + + long keyOne = Long.parseLong(keys[0]); + long keyTwo = Long.parseLong(keys[1]); + + System.out.println("Walking with " + getStringFromKeys(keyOne, keyTwo)); + + try { + walk(keyOne, keyTwo, maxNumNodes); + } catch (Exception e) { + throw new IOException(e); + } + return 0; + } + + private void walk(long headKeyOne, long headKeyTwo, int maxNumNodes) throws Exception { + CommandLineParser parser = new CommandLineParser(getConf()); + client = parser.getClient(); + table = client.openTable(getTableName(getConf())); + + long prevKeyOne = headKeyOne; + long prevKeyTwo = headKeyTwo; + CINode node = new CINode(); + int nodesCount = 0; + + do { + RowResult rr = nextNode(prevKeyOne, prevKeyTwo); + if (rr == null) { + System.err.println(getStringFromKeys(prevKeyOne, prevKeyTwo) + " doesn't exist!"); + break; + } + getCINode(rr, node); + printCINodeString(node); + if (rr.isNull(2) || rr.isNull(3)) { + System.err.println("Last node didn't have a reference, breaking"); + break; + } + prevKeyOne = rr.getLong(2); + prevKeyTwo = rr.getLong(3); + nodesCount++; + } while ((headKeyOne != prevKeyOne && headKeyTwo != prevKeyTwo) && (nodesCount < + maxNumNodes)); + } + + private RowResult nextNode(long keyOne, long keyTwo) throws Exception { + KuduScanner.KuduScannerBuilder builder = 
client.newScannerBuilder(table); + configureScannerForRandomRead(builder, table, keyOne, keyTwo); + + return getOneRowResult(builder.build()); + } + } + + private static void configureScannerForRandomRead(AbstractKuduScannerBuilder builder, + KuduTable table, + long keyOne, + long keyTwo) { + PartialRow lowerBound = table.getSchema().newPartialRow(); + lowerBound.addLong(0, keyOne); + lowerBound.addLong(1, keyTwo); + builder.lowerBound(lowerBound); + + PartialRow upperBound = table.getSchema().newPartialRow(); + // Adding 1 since we want a single row, and the upper bound is exclusive. + upperBound.addLong(0, keyOne + 1); + upperBound.addLong(1, keyTwo + 1); + builder.exclusiveUpperBound(upperBound); + } + + private static String getTableName(Configuration conf) { + return conf.get(TABLE_NAME_KEY, DEFAULT_TABLE_NAME); + } + + private static String getHeadsTable(Configuration conf) { + return conf.get(HEADS_TABLE_NAME_KEY, DEFAULT_HEADS_TABLE_NAME); + } + + private static CINode getCINode(RowResult result, CINode node) { + + node.key = getStringFromKeys(result.getLong(0), result.getLong(1)); + if (result.isNull(2) || result.isNull(3)) { + node.prev = "NO_REFERENCE"; + } else { + node.prev = getStringFromKeys(result.getLong(2), result.getLong(3)); + } + node.rowId = result.getInt(4); + node.client = result.getString(5); + node.updateCount = result.getInt(6); + return node; + } + + private static void printCINodeString(CINode node) { + System.out.printf("%s:%s:%012d:%s:%s\n", node.key, node.prev, node.rowId, node.client, + node.updateCount); + } + + private static String getStringFromKeys(long key1, long key2) { + return new StringBuilder().append(key1).append(",").append(key2).toString(); + } + + private static RowResult getOneRowResult(KuduScanner scanner) throws Exception { + RowResultIterator rowResults; + rowResults = scanner.nextRows(); + if (rowResults.getNumRows() == 0) { + return null; + } + if (rowResults.getNumRows() > 1) { + throw new Exception("Received 
too many rows from scanner " + scanner); + } + return rowResults.next(); + } + + private void usage() { + System.err.println("Usage: " + this.getClass().getSimpleName() + " COMMAND [COMMAND options]"); + System.err.println(" where COMMAND is one of:"); + System.err.println(""); + System.err.println(" Generator A map only job that generates data."); + System.err.println(" Verify A map reduce job that looks for holes"); + System.err.println(" Look at the counts after running"); + System.err.println(" REFERENCED and UNREFERENCED are ok"); + System.err.println(" any UNDEFINED counts are bad. Do not"); + System.err.println(" run at the same time as the Generator."); + System.err.println(" Print A standalone program that prints nodes"); + System.err.println(" in the linked list."); + System.err.println(" Loop A program to Loop through Generator and"); + System.err.println(" Verify steps"); + System.err.println(" Update A program to updade the nodes"); + /* System.err.println(" Delete A standalone program that deletes a"); + System.err.println(" single node.");*/ + System.err.println(" Walker A standalong program that starts "); + System.err.println(" following a linked list"); + System.err.println("\t "); + System.err.flush(); + } + + protected void processOptions(String[] args) { + //get the class, run with the conf + if (args.length < 1) { + usage(); + throw new RuntimeException("Incorrect Number of args."); + } + toRun = args[0]; + otherArgs = Arrays.copyOfRange(args, 1, args.length); + } + + @Override + public int run(String[] args) throws Exception { + Tool tool = null; + processOptions(args); + if (toRun.equals("Generator")) { + tool = new Generator(); + } else if (toRun.equals("Verify")) { + tool = new Verify(); + } else if (toRun.equals("Loop")) { + Loop loop = new Loop(); + loop.it = this; + tool = loop; + + } else if (toRun.equals("Print")) { + tool = new Print(); + } else if (toRun.equals("Update")) { + tool = new Updater(); + } else if 
(toRun.equals("Walker")) { + tool = new Walker(); + } /*else if (toRun.equals("Delete")) { + tool = new Delete(); + }*/ else { + usage(); + throw new RuntimeException("Unknown arg"); + } + + return ToolRunner.run(getConf(), tool, otherArgs); + } + + private static void setJobConf(Job job, int numMappers, long numNodes, + Integer width, Integer wrapMultiplier) { + job.getConfiguration().setInt(GENERATOR_NUM_MAPPERS_KEY, numMappers); + job.getConfiguration().setLong(GENERATOR_NUM_ROWS_PER_MAP_KEY, numNodes); + if (width != null) { + job.getConfiguration().setInt(GENERATOR_WIDTH_KEY, width); + } + if (wrapMultiplier != null) { + job.getConfiguration().setInt(GENERATOR_WRAP_KEY, wrapMultiplier); + } + } + + public static void main(String[] args) throws Exception { + int ret = ToolRunner.run(new IntegrationTestBigLinkedList(), args); + System.exit(ret); + } +} diff --git a/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/RowCounter.java b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/RowCounter.java new file mode 100644 index 000000000000..45e7837bd5ed --- /dev/null +++ b/java/kudu-client-tools/src/main/java/org/kududb/mapreduce/tools/RowCounter.java @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.mapreduce.tools; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.mapreduce.CommandLineParser; +import org.kududb.mapreduce.KuduTableMapReduceUtil; +import org.kududb.client.RowResult; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; + +import java.io.IOException; + +/** + * Map-only job that counts all the rows in the provided table. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class RowCounter extends Configured implements Tool { + + static final String NAME = "rowcounter"; + static final String COLUMN_PROJECTION_KEY = "rowcounter.column.projection"; + + /** Counter enumeration to count the actual rows. */ + public static enum Counters { ROWS } + + /** + * Simple row counter + */ + static class RowCounterMapper extends + Mapper { + + @Override + protected void map(NullWritable key, RowResult value, Context context) throws IOException, + InterruptedException { + context.getCounter(Counters.ROWS).increment(1); + } + } + + /** + * Sets up the actual job. + * + * @param conf The current configuration. + * @param args The command line parameters. + * @return The newly created job. + * @throws java.io.IOException When setting up the job fails. 
+ */ + @SuppressWarnings("deprecation") + public static Job createSubmittableJob(Configuration conf, String[] args) + throws IOException, ClassNotFoundException { + + String columnProjection = conf.get(COLUMN_PROJECTION_KEY); + + Class mapperClass = RowCounterMapper.class; + String tableName = args[0]; + + String jobName = NAME + "_" + tableName; + Job job = new Job(conf, jobName); + job.setJarByClass(mapperClass); + job.setMapperClass(mapperClass); + job.setNumReduceTasks(0); + job.setOutputFormatClass(NullOutputFormat.class); + new KuduTableMapReduceUtil.TableInputFormatConfiguratorWithCommandLineParser( + job, + tableName, + columnProjection) + .configure(); + return job; + } + + /* + * @param errorMsg Error message. Can be null. + */ + private static void usage(final String errorMsg) { + if (errorMsg != null && errorMsg.length() > 0) { + System.err.println("ERROR: " + errorMsg); + } + String usage = + "Usage: " + NAME + " \n\n" + + "Counts all the rows in the given table.\n" + + "\n" + + "Other options that may be specified with -D include:\n" + + " -D" + COLUMN_PROJECTION_KEY + "=a,b,c - comma-separated list of columns to read " + + "as part of the row count. By default, none are read so that the count is as fast " + + "as possible. When specifying columns that are keys, they must be at the beginning" + + ".\n" + + CommandLineParser.getHelpSnippet(); + + System.err.println(usage); + } + + @Override + public int run(String[] otherArgs) throws Exception { + if (otherArgs.length != 1) { + usage("Wrong number of arguments: " + otherArgs.length); + return -1; + } + Job job = createSubmittableJob(getConf(), otherArgs); + return job.waitForCompletion(true) ? 
0 : 1; + } + + public static void main(String[] args) throws Exception { + int status = ToolRunner.run(new RowCounter(), args); + System.exit(status); + } +} diff --git a/java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITImportCsv.java b/java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITImportCsv.java new file mode 100644 index 000000000000..a06511222d4b --- /dev/null +++ b/java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITImportCsv.java @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.mapreduce.tools; + +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.mapreduce.CommandLineParser; +import org.kududb.mapreduce.HadoopTestingUtility; +import org.kududb.client.BaseKuduTest; +import org.kududb.client.CreateTableOptions; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.util.GenericOptionsParser; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class ITImportCsv extends BaseKuduTest { + + private static final String TABLE_NAME = + ITImportCsv.class.getName() + "-" + System.currentTimeMillis(); + + private static final HadoopTestingUtility HADOOP_UTIL = new HadoopTestingUtility(); + + private static Schema schema; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + + ArrayList columns = new ArrayList(4); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key", Type.INT32) + .key(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column1_i", Type.INT32) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column2_d", Type.DOUBLE) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column3_s", Type.STRING) + .nullable(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column4_b", Type.BOOL) + .build()); + schema = new Schema(columns); + + createTable(TABLE_NAME, schema, new CreateTableOptions()); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + try { + BaseKuduTest.tearDownAfterClass(); + } finally { + HADOOP_UTIL.cleanup(); + } + } + + @Test + public void test() throws Exception { + Configuration conf = new 
Configuration(); + String testHome = + HADOOP_UTIL.setupAndGetTestDir(ITImportCsv.class.getName(), conf).getAbsolutePath(); + + // Create a 2 lines input file + File data = new File(testHome, "data.csv"); + writeCsvFile(data); + + StringBuilder sb = new StringBuilder(); + for (ColumnSchema col : schema.getColumns()) { + sb.append(col.getName()); + sb.append(","); + } + sb.deleteCharAt(sb.length() - 1); + String[] args = new String[] { + "-D" + CommandLineParser.MASTER_ADDRESSES_KEY + "=" + getMasterAddresses(), + sb.toString(), TABLE_NAME, data.toString()}; + + GenericOptionsParser parser = new GenericOptionsParser(conf, args); + Job job = ImportCsv.createSubmittableJob(parser.getConfiguration(), parser.getRemainingArgs()); + assertTrue("Test job did not end properly", job.waitForCompletion(true)); + + assertEquals(1, job.getCounters().findCounter(ImportCsv.Counters.BAD_LINES).getValue()); + + assertEquals(3, countRowsInScan( + client.newScannerBuilder(openTable(TABLE_NAME)).build())); + // TODO: should verify the actual returned rows, not just the count! + } + + private void writeCsvFile(File data) throws IOException { + FileOutputStream fos = new FileOutputStream(data); + fos.write("1\t3\t2.3\tsome string\ttrue\n".getBytes()); + fos.write("2\t5\t4.5\tsome more\tfalse\n".getBytes()); + fos.write("3\t7\twait this is not a double\tbad row\ttrue\n".getBytes()); + fos.write("4\t9\t10\ttrailing separator isn't bad mkay?\ttrue\t\n".getBytes()); + fos.close(); + } +} diff --git a/java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITRowCounter.java b/java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITRowCounter.java new file mode 100644 index 000000000000..52984bb72605 --- /dev/null +++ b/java/kudu-client-tools/src/test/java/org/kududb/mapreduce/tools/ITRowCounter.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.mapreduce.tools; + +import org.kududb.mapreduce.CommandLineParser; +import org.kududb.mapreduce.HadoopTestingUtility; +import org.kududb.client.BaseKuduTest; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.util.GenericOptionsParser; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class ITRowCounter extends BaseKuduTest { + + private static final String TABLE_NAME = + ITRowCounter.class.getName() + "-" + System.currentTimeMillis(); + + private static final HadoopTestingUtility HADOOP_UTIL = new HadoopTestingUtility(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + try { + BaseKuduTest.tearDownAfterClass(); + } finally { + HADOOP_UTIL.cleanup(); + } + } + + @Test + public void test() throws Exception { + Configuration conf = new Configuration(); + HADOOP_UTIL.setupAndGetTestDir(ITRowCounter.class.getName(), conf).getAbsolutePath(); + + createFourTabletsTableWithNineRows(TABLE_NAME); + + String[] args = new 
String[] { + "-D" + CommandLineParser.MASTER_ADDRESSES_KEY + "=" + getMasterAddresses(), TABLE_NAME}; + GenericOptionsParser parser = new GenericOptionsParser(conf, args); + Job job = RowCounter.createSubmittableJob(parser.getConfiguration(), parser.getRemainingArgs()); + assertTrue("Job did not end properly", job.waitForCompletion(true)); + + assertEquals(9, job.getCounters().findCounter(RowCounter.Counters.ROWS).getValue()); + } +} diff --git a/java/kudu-client/.gitignore b/java/kudu-client/.gitignore new file mode 100644 index 000000000000..4bc0ed38b939 --- /dev/null +++ b/java/kudu-client/.gitignore @@ -0,0 +1,3 @@ + +# Maven build artifacts +dependency-reduced-pom.xml diff --git a/java/kudu-client/dev-support/build-proto.sh b/java/kudu-client/dev-support/build-proto.sh new file mode 100755 index 000000000000..9a03a0b79cee --- /dev/null +++ b/java/kudu-client/dev-support/build-proto.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +## +# Script to find and run protoc to generate protocol buf files. +# Should be used exclusively by Maven. +# + +KUDU_DIR=`dirname $0`/../../.. +PROTOC_BIN=$KUDU_DIR/thirdparty/installed-deps/bin/protoc +if [ ! 
-f "$PROTOC_BIN" ] ; then + if which protoc > /dev/null; then + PROTOC_BIN=`which protoc` + else + echo 'Error: protoc is missing from the 3rd party folder and on the PATH' + exit 1 + fi +fi + +$PROTOC_BIN "$@" diff --git a/java/kudu-client/pom.xml b/java/kudu-client/pom.xml new file mode 100644 index 000000000000..666f49470d46 --- /dev/null +++ b/java/kudu-client/pom.xml @@ -0,0 +1,302 @@ + + + + 4.0.0 + + org.kududb + kudu-parent + 0.8.0-SNAPSHOT + + + kudu-client + Kudu Java Client + + + + org.kududb + interface-annotations + ${project.version} + + + com.google.protobuf + protobuf-java + ${protobuf.version} + + + com.google.guava + guava + ${guava.version} + + + io.netty + netty + ${netty.version} + + + log4j + log4j + ${log4j.version} + provided + + + org.mockito + mockito-all + ${mockito-all.version} + test + + + junit + junit + ${junit.version} + test + + + + com.stumbleupon + async + ${async.version} + + + com.sangupta + murmur + ${murmur.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + provided + + + org.slf4j + slf4j-api + ${slf4j.version} + + + commons-io + commons-io + ${commons-io.version} + test + + + + + + + com.google.protobuf.tools + maven-protoc-plugin + ${maven-protoc-plugin.version} + + + true + ${project.basedir}/../../src + ${project.basedir}/dev-support/build-proto.sh + + + ${project.basedir}/../../thirdparty/installed-deps/include + + + + **/*test*.proto + + + + + + compile + + generate-sources + + + + + org.apache.maven.plugins + maven-clean-plugin + 2.5 + + + delete-test-protos + + process-resources + + clean + + + true + + + ${project.build.directory}/classes + + **/*test*.proto + + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.3 + + + + com.google.guava:guava + com.google.protobuf:protobuf-java + com.sangupta:murmur + io.netty:netty + + + + + com.google.common + org.kududb.client.shaded.com.google.common + + + com.google.protobuf + org.kududb.client.shaded.com.google.protobuf + + + com.sangupta + 
org.kududb.client.shaded.com.sangupta + + + org.jboss.netty + org.kududb.client.shaded.org.jboss.netty + + + + + + package + + shade + + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.4 + + + attach-sources + + jar + + + + attach-test-sources + + test-jar + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.3 + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.6 + + + + test-jar + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.maven.plugins + maven-antrun-plugin + [${maven-antrun-plugin.version},) + + run + + + + + false + + + + + + com.google.protobuf.tools + maven-protoc-plugin + [${maven-protoc-plugin.version},) + + compile + + + + + false + + + + + + + + + + + diff --git a/java/kudu-client/src/main/java/com/google/protobuf/ZeroCopyLiteralByteString.java b/java/kudu-client/src/main/java/com/google/protobuf/ZeroCopyLiteralByteString.java new file mode 100644 index 000000000000..5bd933d5f0a4 --- /dev/null +++ b/java/kudu-client/src/main/java/com/google/protobuf/ZeroCopyLiteralByteString.java @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2013 The Async HBase Authors. All rights reserved. + * This file is part of Async HBase. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package com.google.protobuf; // This is a lie. + +/** + * Helper class to extract byte arrays from {@link ByteString} without copy. + *

+ * Without this protobufs would force us to copy every single byte array out + * of the objects de-serialized from the wire (which already do one copy, on + * top of the copies the JVM does to go from kernel buffer to C buffer and + * from C buffer to JVM buffer). + *

+ * This class isn't part of the public API. + */ +public final class ZeroCopyLiteralByteString extends LiteralByteString { + + /** Private constructor so this class cannot be instantiated. */ + private ZeroCopyLiteralByteString() { + super(null); + throw new UnsupportedOperationException("Should never be here."); + } + + /** + * Wraps a byte array in a {@link ByteString} without copying it. + */ + public static ByteString wrap(final byte[] array) { + return new LiteralByteString(array); + } + + /** + * Extracts the byte array from the given {@link ByteString} without copy. + * @param buf A buffer from which to extract the array. This buffer must be + * actually an instance of a {@code LiteralByteString}. + */ + public static byte[] zeroCopyGetBytes(final ByteString buf) { + if (buf instanceof LiteralByteString) { + return ((LiteralByteString) buf).bytes; + } + throw new UnsupportedOperationException("Need a LiteralByteString, got a " + + buf.getClass().getName()); + } + +} \ No newline at end of file diff --git a/java/kudu-client/src/main/java/org/kududb/ColumnSchema.java b/java/kudu-client/src/main/java/org/kududb/ColumnSchema.java new file mode 100644 index 000000000000..c3c617d4c01a --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/ColumnSchema.java @@ -0,0 +1,301 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb; + +import org.kududb.Common.CompressionType; +import org.kududb.Common.EncodingType; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Represents a Kudu Table column. Use {@link ColumnSchema.ColumnSchemaBuilder} in order to + * create columns. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class ColumnSchema { + + private final String name; + private final Type type; + private final boolean key; + private final boolean nullable; + private final Object defaultValue; + private final int desiredBlockSize; + private final Encoding encoding; + private final CompressionAlgorithm compressionAlgorithm; + + /** + * Specifies the encoding of data for a column on disk. + * Not all encodings are available for all data types. + * Refer to the Kudu documentation for more information on each encoding. 
+ */ + public enum Encoding { + UNKNOWN(EncodingType.UNKNOWN_ENCODING), + AUTO_ENCODING(EncodingType.AUTO_ENCODING), + PLAIN_ENCODING(EncodingType.PLAIN_ENCODING), + PREFIX_ENCODING(EncodingType.PREFIX_ENCODING), + GROUP_VARINT(EncodingType.GROUP_VARINT), + RLE(EncodingType.RLE), + DICT_ENCODING(EncodingType.DICT_ENCODING), + BIT_SHUFFLE(EncodingType.BIT_SHUFFLE); + + final EncodingType internalPbType; + + Encoding(EncodingType internalPbType) { + this.internalPbType = internalPbType; + } + + @InterfaceAudience.Private + public EncodingType getInternalPbType() { + return internalPbType; + } + }; + + /** + * Specifies the compression algorithm of data for a column on disk. + */ + public enum CompressionAlgorithm { + UNKNOWN(CompressionType.UNKNOWN_COMPRESSION), + DEFAULT_COMPRESSION(CompressionType.DEFAULT_COMPRESSION), + NO_COMPRESSION(CompressionType.NO_COMPRESSION), + SNAPPY(CompressionType.SNAPPY), + LZ4(CompressionType.LZ4), + ZLIB(CompressionType.ZLIB); + + final CompressionType internalPbType; + + CompressionAlgorithm(CompressionType internalPbType) { + this.internalPbType = internalPbType; + } + + @InterfaceAudience.Private + public CompressionType getInternalPbType() { + return internalPbType; + } + }; + + private ColumnSchema(String name, Type type, boolean key, boolean nullable, + Object defaultValue, int desiredBlockSize, Encoding encoding, + CompressionAlgorithm compressionAlgorithm) { + this.name = name; + this.type = type; + this.key = key; + this.nullable = nullable; + this.defaultValue = defaultValue; + this.desiredBlockSize = desiredBlockSize; + this.encoding = encoding; + this.compressionAlgorithm = compressionAlgorithm; + } + + /** + * Get the column's Type + * @return the type + */ + public Type getType() { + return type; + } + + /** + * Get the column's name + * @return A string representation of the name + */ + public String getName() { + return name; + } + + /** + * Answers if the column part of the key + * @return true if the column is part 
of the key, else false + */ + public boolean isKey() { + return key; + } + + /** + * Answers if the column can be set to null + * @return true if it can be set to null, else false + */ + public boolean isNullable() { + return nullable; + } + + /** + * The Java object representation of the default value that's read + * @return the default read value + */ + public Object getDefaultValue() { + return defaultValue; + } + + /** + * Gets the desired block size for this column. + * If no block size has been explicitly specified for this column, + * returns 0 to indicate that the server-side default will be used. + * + * @return the block size, in bytes, or 0 if none has been configured. + */ + public int getDesiredBlockSize() { + return desiredBlockSize; + } + + /** + * Return the encoding of this column, or null if it is not known. + */ + public Encoding getEncoding() { + return encoding; + } + + /** + * Return the compression algorithm of this column, or null if it is not known. + */ + public CompressionAlgorithm getCompressionAlgorithm() { + return compressionAlgorithm; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ColumnSchema that = (ColumnSchema) o; + + if (key != that.key) return false; + if (!name.equals(that.name)) return false; + if (!type.equals(that.type)) return false; + + return true; + } + + @Override + public int hashCode() { + int result = name.hashCode(); + result = 31 * result + type.hashCode(); + result = 31 * result + (key ? 1 : 0); + return result; + } + + @Override + public String toString() { + return "Column name: " + name + ", type: " + type.getName(); + } + + /** + * Builder for ColumnSchema. 
+ */ + public static class ColumnSchemaBuilder { + private final String name; + private final Type type; + private boolean key = false; + private boolean nullable = false; + private Object defaultValue = null; + private int blockSize = 0; + private Encoding encoding = null; + private CompressionAlgorithm compressionAlgorithm = null; + + /** + * Constructor for the required parameters. + * @param name column's name + * @param type column's type + */ + public ColumnSchemaBuilder(String name, Type type) { + this.name = name; + this.type = type; + } + + /** + * Sets if the column is part of the row key. False by default. + * @param key a boolean that indicates if the column is part of the key + * @return this instance + */ + public ColumnSchemaBuilder key(boolean key) { + this.key = key; + return this; + } + + /** + * Marks the column as allowing null values. False by default. + * @param nullable a boolean that indicates if the column allows null values + * @return this instance + */ + public ColumnSchemaBuilder nullable(boolean nullable) { + this.nullable = nullable; + return this; + } + + /** + * Sets the default value that will be read from the column. Null by default. + * @param defaultValue a Java object representation of the default value that's read + * @return this instance + */ + public ColumnSchemaBuilder defaultValue(Object defaultValue) { + this.defaultValue = defaultValue; + return this; + } + + /** + * Set the desired block size for this column. + * + * This is the number of bytes of user data packed per block on disk, and + * represents the unit of IO when reading this column. Larger values + * may improve scan performance, particularly on spinning media. Smaller + * values may improve random access performance, particularly for workloads + * that have high cache hit rates or operate on fast storage such as SSD. + * + * Note that the block size specified here corresponds to uncompressed data. 
+ * The actual size of the unit read from disk may be smaller if + * compression is enabled. + * + * It's recommended that this not be set any lower than 4096 (4KB) or higher + * than 1048576 (1MB). + * @param blockSize the desired block size, in bytes + * @return this instance + * + */ + public ColumnSchemaBuilder desiredBlockSize(int blockSize) { + this.blockSize = blockSize; + return this; + } + + /** + * Set the block encoding for this column. See the documentation for the list + * of valid options. + */ + public ColumnSchemaBuilder encoding(Encoding encoding) { + this.encoding = encoding; + return this; + } + + /** + * Set the compression algorithm for this column. See the documentation for the list + * of valid options. + */ + public ColumnSchemaBuilder compressionAlgorithm(CompressionAlgorithm compressionAlgorithm) { + this.compressionAlgorithm = compressionAlgorithm; + return this; + } + + /** + * Builds a {@link ColumnSchema} using the passed parameters. + * @return a new {@link ColumnSchema} + */ + public ColumnSchema build() { + return new ColumnSchema(name, type, + key, nullable, defaultValue, + blockSize, encoding, compressionAlgorithm); + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/Schema.java b/java/kudu-client/src/main/java/org/kududb/Schema.java new file mode 100644 index 000000000000..9e4192b27ab0 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/Schema.java @@ -0,0 +1,294 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb; + +import com.google.common.collect.ImmutableList; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.Bytes; +import org.kududb.client.PartialRow; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Represents table's schema which is essentially a list of columns. + * This class offers a few utility methods for querying it. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class Schema { + + /** + * Mapping of column index to column. + */ + private final List columnsByIndex; + + /** + * The primary key columns. + */ + private final List primaryKeyColumns; + + /** + * Mapping of column name to index. + */ + private final Map columnsByName; + + /** + * Mapping of column ID to index, or null if the schema does not have assigned column IDs. + */ + private final Map columnsById; + + /** + * Mapping of column index to backing byte array offset. 
+ */ + private final int[] columnOffsets; + + private final int varLengthColumnCount; + private final int rowSize; + private final boolean hasNullableColumns; + + /** + * Constructs a schema using the specified columns and does some internal accounting + * @param columns the columns in index order + * @throws IllegalArgumentException If the key columns aren't specified first + * + * See {@code ColumnPBsToSchema()} in {@code src/kudu/common/wire_protocol.cc} + */ + public Schema(List columns) { + this(columns, null); + } + + /** + * Constructs a schema using the specified columns and IDs. + * + * This is not a stable API, prefer using {@link Schema#Schema(List)} to create a new schema. + * + * @param columns the columns in index order + * @param columnIds the column ids of the provided columns, or null + * @throws IllegalArgumentException If the primary key columns aren't specified first + * @throws IllegalArgumentException If the column ids length does not match the columns length + * + * See {@code ColumnPBsToSchema()} in {@code src/kudu/common/wire_protocol.cc} + */ + public Schema(List columns, List columnIds) { + boolean hasColumnIds = columnIds != null; + if (hasColumnIds && columns.size() != columnIds.size()) { + throw new IllegalArgumentException( + "Schema must be constructed with all column IDs, or none."); + } + + this.columnsByIndex = ImmutableList.copyOf(columns); + int varLenCnt = 0; + this.columnOffsets = new int[columns.size()]; + this.columnsByName = new HashMap<>(columns.size()); + this.columnsById = hasColumnIds ? 
new HashMap(columnIds.size()) : null; + int offset = 0; + boolean hasNulls = false; + int primaryKeyCount = 0; + // pre-compute a few counts and offsets + for (int index = 0; index < columns.size(); index++) { + final ColumnSchema column = columns.get(index); + if (column.isKey()) { + primaryKeyCount += 1; + if (primaryKeyCount != index + 1) + throw new IllegalArgumentException("Got out-of-order primary key column: " + column); + } + + hasNulls |= column.isNullable(); + columnOffsets[index] = offset; + offset += column.getType().getSize(); + if (this.columnsByName.put(column.getName(), index) != null) { + throw new IllegalArgumentException( + String.format("Column names must be unique: %s", columns)); + } + if (column.getType() == Type.STRING || column.getType() == Type.BINARY) { + varLenCnt++; + } + + if (hasColumnIds) { + if (this.columnsById.put(columnIds.get(index), index) != null) { + throw new IllegalArgumentException( + String.format("Column IDs must be unique: %s", columnIds)); + } + } + } + + this.hasNullableColumns = hasNulls; + this.varLengthColumnCount = varLenCnt; + this.primaryKeyColumns = columns.subList(0, primaryKeyCount); + this.rowSize = getRowSize(this.columnsByIndex); + } + + /** + * Get the list of columns used to create this schema + * @return list of columns + */ + public List getColumns() { + return this.columnsByIndex; + } + + /** + * Get the count of columns with variable length (BINARY/STRING) in + * this schema. 
+ * @return strings count + */ + public int getVarLengthColumnCount() { + return this.varLengthColumnCount; + } + + /** + * Get the size a row built using this schema would be + * @return size in bytes + */ + public int getRowSize() { + return this.rowSize; + } + + /** + * Get the index at which this column can be found in the backing byte array + * @param idx column's index + * @return column's offset + */ + public int getColumnOffset(int idx) { + return this.columnOffsets[idx]; + } + + /** + * Get the index for the provided column name. + * @param columnName column to search for + * @return an index in the schema + */ + public int getColumnIndex(String columnName) { + Integer index = this.columnsByName.get(columnName); + if (index == null) { + throw new IllegalArgumentException( + String.format("Unknown column: %s", columnName)); + } + return index; + } + + /** + * Get the column at the specified index in the original list + * @param idx column's index + * @return the column + */ + public ColumnSchema getColumnByIndex(int idx) { + return this.columnsByIndex.get(idx); + } + + /** + * Get the column index of the column with the provided ID. + * This method is not part of the stable API. + * @param columnId the column id of the column + * @return the column index of the column. 
+ */ + public int getColumnIndex(int columnId) { + if (!hasColumnIds()) throw new IllegalStateException("Schema does not have Column IDs"); + Integer index = this.columnsById.get(columnId); + if (index == null) throw new IllegalArgumentException( + String.format("Unknown column id: %s", columnId)); + return index; + } + + /** + * Get the column associated with the specified name + * @param columnName column's name + * @return the column + */ + public ColumnSchema getColumn(String columnName) { + return columnsByIndex.get(getColumnIndex(columnName)); + } + + /** + * Get the count of columns in this schema + * @return count of columns + */ + public int getColumnCount() { + return this.columnsByIndex.size(); + } + + /** + * Get the count of columns that are part of the primary key. + * @return count of primary key columns. + */ + public int getPrimaryKeyColumnCount() { + return this.primaryKeyColumns.size(); + } + + /** + * Get the primary key columns. + * @return the primary key columns. + */ + public List getPrimaryKeyColumns() { + return primaryKeyColumns; + } + + /** + * Get a schema that only contains the columns which are part of the key + * @return new schema with only the keys + */ + public Schema getRowKeyProjection() { + return new Schema(primaryKeyColumns); + } + + /** + * Tells if there's at least one nullable column + * @return true if at least one column is nullable, else false. + */ + public boolean hasNullableColumns() { + return this.hasNullableColumns; + } + + /** + * Tells whether this schema includes IDs for columns. A schema created by a client as part of + * table creation will not include IDs, but schemas for open tables will include IDs. + * This method is not part of the stable API. + * + * @return whether this schema includes column IDs. 
+ */ + public boolean hasColumnIds() { + return columnsById != null; + } + + /** + * Gives the size in bytes for a single row given the specified schema + * @param columns the row's columns + * @return row size in bytes + */ + private static int getRowSize(List columns) { + int totalSize = 0; + boolean hasNullables = false; + for (ColumnSchema column : columns) { + totalSize += column.getType().getSize(); + hasNullables |= column.isNullable(); + } + if (hasNullables) { + totalSize += Bytes.getBitSetSize(columns.size()); + } + return totalSize; + } + + /** + * Creates a new partial row for the schema. + * @return a new partial row + */ + public PartialRow newPartialRow() { + return new PartialRow(this); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/Type.java b/java/kudu-client/src/main/java/org/kududb/Type.java new file mode 100644 index 000000000000..fd238358e73e --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/Type.java @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb; + +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; +import com.google.common.primitives.Shorts; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import static org.kududb.Common.DataType; + +/** + * Describes all the types available to build table schemas. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public enum Type { + + INT8 (DataType.INT8, "int8"), + INT16 (DataType.INT16, "int16"), + INT32 (DataType.INT32, "int32"), + INT64 (DataType.INT64, "int64"), + BINARY (DataType.BINARY, "binary"), + STRING (DataType.STRING, "string"), + BOOL (DataType.BOOL, "bool"), + FLOAT (DataType.FLOAT, "float"), + DOUBLE (DataType.DOUBLE, "double"), + TIMESTAMP (DataType.TIMESTAMP, "timestamp"); + + + private final DataType dataType; + private final String name; + private final int size; + + /** + * Private constructor used to pre-create the types + * @param dataType DataType from the common's pb + * @param name string representation of the type + */ + private Type(DataType dataType, String name) { + this.dataType = dataType; + this.name = name; + this.size = getTypeSize(this.dataType); + } + + /** + * Get the data type from the common's pb + * @return A DataType + */ + public DataType getDataType() { + return this.dataType; + } + + /** + * Get the string representation of this type + * @return The type's name + */ + public String getName() { + return this.name; + } + + /** + * The size of this type on the wire + * @return A size + */ + public int getSize() { + return this.size; + } + + @Override + public String toString() { + return "Type: " + this.name + ", size: " + this.size; + } + + /** + * Gives the size in bytes for a given DataType, as per the pb specification + * @param type pb type + * @return size in bytes + */ + static int getTypeSize(DataType type) { + switch (type) { + case STRING: + case BINARY: return 8 + 8; // offset then string length 
+ case BOOL: + case INT8: return 1; + case INT16: return Shorts.BYTES; + case INT32: + case FLOAT: return Ints.BYTES; + case INT64: + case DOUBLE: + case TIMESTAMP: return Longs.BYTES; + default: throw new IllegalArgumentException("The provided data type doesn't map" + + " to know any known one."); + } + } + + /** + * Convert the pb DataType to a Type + * @param type DataType to convert + * @return a matching Type + */ + public static Type getTypeForDataType(DataType type) { + switch (type) { + case STRING: return STRING; + case BINARY: return BINARY; + case BOOL: return BOOL; + case INT8: return INT8; + case INT16: return INT16; + case INT32: return INT32; + case INT64: return INT64; + case TIMESTAMP: return TIMESTAMP; + case FLOAT: return FLOAT; + case DOUBLE: return DOUBLE; + default: + throw new IllegalArgumentException("The provided data type doesn't map" + + " to know any known one: " + type.getDescriptorForType().getFullName()); + + } + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/AbstractKuduScannerBuilder.java b/java/kudu-client/src/main/java/org/kududb/client/AbstractKuduScannerBuilder.java new file mode 100644 index 000000000000..c4c17ef51566 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AbstractKuduScannerBuilder.java @@ -0,0 +1,295 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import java.util.ArrayList; +import java.util.List; + +import com.google.common.collect.ImmutableList; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.tserver.Tserver; +import org.kududb.util.HybridTimeUtil; + +/** + * Abstract class to extend in order to create builders for scanners. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public abstract class AbstractKuduScannerBuilder + , T> { + final AsyncKuduClient client; + final KuduTable table; + final List columnRangePredicates; + + AsyncKuduScanner.ReadMode readMode = AsyncKuduScanner.ReadMode.READ_LATEST; + int batchSizeBytes = 1024*1024; + long limit = Long.MAX_VALUE; + boolean prefetching = false; + boolean cacheBlocks = true; + long htTimestamp = AsyncKuduClient.NO_TIMESTAMP; + byte[] lowerBoundPrimaryKey = AsyncKuduClient.EMPTY_ARRAY; + byte[] upperBoundPrimaryKey = AsyncKuduClient.EMPTY_ARRAY; + byte[] lowerBoundPartitionKey = AsyncKuduClient.EMPTY_ARRAY; + byte[] upperBoundPartitionKey = AsyncKuduClient.EMPTY_ARRAY; + List projectedColumnNames = null; + List projectedColumnIndexes = null; + long scanRequestTimeout; + + AbstractKuduScannerBuilder(AsyncKuduClient client, KuduTable table) { + this.client = client; + this.table = table; + this.columnRangePredicates = new ArrayList<>(); + this.scanRequestTimeout = client.getDefaultOperationTimeoutMs(); + } + + /** + * Sets the read mode, the default is to read the latest values. 
+ * @param readMode a read mode for the scanner + * @return this instance + */ + public S readMode(AsyncKuduScanner.ReadMode readMode) { + this.readMode = readMode; + return (S) this; + } + + /** + * Adds a predicate for a column. + * @param predicate predicate for a column to add + * @return this instance + */ + public S addColumnRangePredicate(ColumnRangePredicate predicate) { + columnRangePredicates.add(predicate.getPb()); + return (S) this; + } + + /** + * Adds a list of predicates in their raw format, + * as given by {@link ColumnRangePredicate#toByteArray(List)}. + * @param predicateBytes predicates to add + * @return this instance + * @throws IllegalArgumentException thrown when the passed bytes aren't valid + */ + public S addColumnRangePredicatesRaw(byte[] predicateBytes) { + List predicates = + ColumnRangePredicate.fromByteArray(predicateBytes); + columnRangePredicates.addAll(predicates); + return (S) this; + } + + /** + * Set which columns will be read by the Scanner. + * Calling this method after {@link #setProjectedColumnIndexes(List)} will reset the projected + * columns to those specified in {@code columnNames}. + * @param columnNames the names of columns to read, or 'null' to read all columns + * (the default) + */ + public S setProjectedColumnNames(List columnNames) { + projectedColumnIndexes = null; + if (columnNames != null) { + projectedColumnNames = ImmutableList.copyOf(columnNames); + } else { + projectedColumnNames = null; + } + return (S) this; + } + + /** + * Set which columns will be read by the Scanner. + * Calling this method after {@link #setProjectedColumnNames(List)} will reset the projected + * columns to those specified in {@code columnIndexes}. 
+ * @param columnIndexes the indexes of columns to read, or 'null' to read all columns + * (the default) + */ + public S setProjectedColumnIndexes(List columnIndexes) { + projectedColumnNames = null; + if (columnIndexes != null) { + projectedColumnIndexes = ImmutableList.copyOf(columnIndexes); + } else { + projectedColumnIndexes = null; + } + return (S) this; + } + + /** + * Sets the maximum number of bytes returned by the scanner, on each batch. The default is 1MB. + *

+ * Kudu may actually return more than this many bytes because it will not + * truncate a rowResult in the middle. + * @param batchSizeBytes a strictly positive number of bytes + * @return this instance + */ + public S batchSizeBytes(int batchSizeBytes) { + this.batchSizeBytes = batchSizeBytes; + return (S) this; + } + + /** + * Sets a limit on the number of rows that will be returned by the scanner. There's no limit + * by default. + * @param limit a positive long + * @return this instance + */ + public S limit(long limit) { + this.limit = limit; + return (S) this; + } + + /** + * Enables prefetching of rows for the scanner, i.e. whether to send a request for more data + * to the server immediately after we receive a response (instead of waiting for the user + * to call {@code nextRows()}). Disabled by default. + * NOTE: This is risky until KUDU-1260 is resolved. + * @param prefetching a boolean that indicates if the scanner should prefetch rows + * @return this instance + */ + public S prefetching(boolean prefetching) { + this.prefetching = prefetching; + return (S) this; + } + + /** + * Sets the block caching policy for the scanner. If true, scanned data blocks will be cached + * in memory and made available for future scans. Enabled by default. + * @param cacheBlocks a boolean that indicates if data blocks should be cached or not + * @return this instance + */ + public S cacheBlocks(boolean cacheBlocks) { + this.cacheBlocks = cacheBlocks; + return (S) this; + } + + /** + * Sets a previously encoded HT timestamp as a snapshot timestamp, for tests. None is used by + * default. + * Requires that the ReadMode is READ_AT_SNAPSHOT. 
+ * @param htTimestamp a long representing a HybridClock-encoded timestamp + * @return this instance + * @throws IllegalArgumentException on build(), if the timestamp is less than 0 or if the + * read mode was not set to READ_AT_SNAPSHOT + */ + @InterfaceAudience.Private + public S snapshotTimestampRaw(long htTimestamp) { + this.htTimestamp = htTimestamp; + return (S) this; + } + + /** + * Sets the timestamp the scan must be executed at, in microseconds since the Unix epoch. None is + * used by default. + * Requires that the ReadMode is READ_AT_SNAPSHOT. + * @param timestamp a long representing an instant in microseconds since the unix epoch. + * @return this instance + * @throws IllegalArgumentException on build(), if the timestamp is less than 0 or if the + * read mode was not set to READ_AT_SNAPSHOT + */ + public S snapshotTimestampMicros(long timestamp) { + this.htTimestamp = HybridTimeUtil.physicalAndLogicalToHTTimestamp(timestamp, 0); + return (S) this; + } + + /** + * Sets how long each scan request to a server can last. + * Defaults to {@link KuduClient#getDefaultOperationTimeoutMs()}. + * @param scanRequestTimeout a long representing time in milliseconds + * @return this instance + */ + public S scanRequestTimeout(long scanRequestTimeout) { + this.scanRequestTimeout = scanRequestTimeout; + return (S) this; + } + + /** + * Add a lower bound (inclusive) primary key for the scan. + * If any bound is already added, this bound is intersected with that one. + * @param partialRow a partial row with specified key columns + * @return this instance + */ + public S lowerBound(PartialRow partialRow) { + return lowerBoundRaw(partialRow.encodePrimaryKey()); + } + + /** + * Like lowerBoundPrimaryKey() but the encoded primary key is an opaque byte array obtained elsewhere. 
+ * @param startPrimaryKey bytes containing an encoded start key + * @return this instance + * @deprecated use {@link #lowerBound(PartialRow)} + */ + @Deprecated + public S lowerBoundRaw(byte[] startPrimaryKey) { + if (lowerBoundPrimaryKey == AsyncKuduClient.EMPTY_ARRAY || + Bytes.memcmp(startPrimaryKey, lowerBoundPrimaryKey) > 0) { + this.lowerBoundPrimaryKey = startPrimaryKey; + } + return (S) this; + } + + /** + * Add an upper bound (exclusive) primary key for the scan. + * If any bound is already added, this bound is intersected with that one. + * @param partialRow a partial row with specified key columns + * @return this instance + */ + public S exclusiveUpperBound(PartialRow partialRow) { + return exclusiveUpperBoundRaw(partialRow.encodePrimaryKey()); + } + + /** + * Like exclusiveUpperBound() but the encoded primary key is an opaque byte array obtained elsewhere. + * @param endPrimaryKey bytes containing an encoded end key + * @return this instance + * @deprecated use {@link #exclusiveUpperBound(PartialRow)} + */ + @Deprecated + public S exclusiveUpperBoundRaw(byte[] endPrimaryKey) { + if (upperBoundPrimaryKey == AsyncKuduClient.EMPTY_ARRAY || + Bytes.memcmp(endPrimaryKey, upperBoundPrimaryKey) < 0) { + this.upperBoundPrimaryKey = endPrimaryKey; + } + return (S) this; + } + + /** + * Set an encoded (inclusive) start partition key for the scan. + * + * @param partitionKey the encoded partition key + * @return this instance + */ + @InterfaceAudience.LimitedPrivate("Impala") + public S lowerBoundPartitionKeyRaw(byte[] partitionKey) { + if (Bytes.memcmp(partitionKey, lowerBoundPartitionKey) > 0) { + this.lowerBoundPartitionKey = partitionKey; + } + return (S) this; + } + + /** + * Set an encoded (exclusive) end partition key for the scan. 
+ * + * @param partitionKey the encoded partition key + * @return this instance + */ + @InterfaceAudience.LimitedPrivate("Impala") + public S exclusiveUpperBoundPartitionKeyRaw(byte[] partitionKey) { + if (upperBoundPartitionKey.length == 0 || Bytes.memcmp(partitionKey, upperBoundPartitionKey) < 0) { + this.upperBoundPartitionKey = partitionKey; + } + return (S) this; + } + + public abstract T build(); +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/AlterTableOptions.java b/java/kudu-client/src/main/java/org/kududb/client/AlterTableOptions.java new file mode 100644 index 000000000000..ecffb5e080f6 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AlterTableOptions.java @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.ColumnSchema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import static org.kududb.master.Master.AlterTableRequestPB; + +/** + * This builder must be used to alter a table. At least one change must be specified. 
+ */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class AlterTableOptions { + + AlterTableRequestPB.Builder pb = AlterTableRequestPB.newBuilder(); + + /** + * Change a table's name. + * @param newName new table's name, must be used to check progress + * @return this instance + */ + public AlterTableOptions renameTable(String newName) { + pb.setNewTableName(newName); + return this; + } + + /** + * Add a new column that's not nullable. + * @param name name of the new column + * @param type type of the new column + * @param defaultVal default value used for the currently existing rows + * @return this instance + */ + public AlterTableOptions addColumn(String name, Type type, Object defaultVal) { + if (defaultVal == null) { + throw new IllegalArgumentException("A new column must have a default value, " + + "use addNullableColumn() to add a NULLABLE column"); + } + AlterTableRequestPB.Step.Builder step = pb.addAlterSchemaStepsBuilder(); + step.setType(AlterTableRequestPB.StepType.ADD_COLUMN); + step.setAddColumn(AlterTableRequestPB.AddColumn.newBuilder().setSchema(ProtobufHelper + .columnToPb(new ColumnSchema.ColumnSchemaBuilder(name, type) + .defaultValue(defaultVal) + .build()))); + return this; + } + + /** + * Add a new column that's nullable, thus has no default value. + * @param name name of the new column + * @param type type of the new column + * @return this instance + */ + public AlterTableOptions addNullableColumn(String name, Type type) { + AlterTableRequestPB.Step.Builder step = pb.addAlterSchemaStepsBuilder(); + step.setType(AlterTableRequestPB.StepType.ADD_COLUMN); + step.setAddColumn(AlterTableRequestPB.AddColumn.newBuilder().setSchema(ProtobufHelper + .columnToPb(new ColumnSchema.ColumnSchemaBuilder(name, type) + .nullable(true) + .build()))); + return this; + } + + /** + * Drop a column. 
+ * @param name name of the column + * @return this instance + */ + public AlterTableOptions dropColumn(String name) { + AlterTableRequestPB.Step.Builder step = pb.addAlterSchemaStepsBuilder(); + step.setType(AlterTableRequestPB.StepType.DROP_COLUMN); + step.setDropColumn(AlterTableRequestPB.DropColumn.newBuilder().setName(name)); + return this; + } + + /** + * Change the name of a column. + * @param oldName old column's name, must exist + * @param newName new name to use + * @return this instance + */ + public AlterTableOptions renameColumn(String oldName, String newName) { + AlterTableRequestPB.Step.Builder step = pb.addAlterSchemaStepsBuilder(); + step.setType(AlterTableRequestPB.StepType.RENAME_COLUMN); + step.setRenameColumn(AlterTableRequestPB.RenameColumn.newBuilder().setOldName(oldName) + .setNewName(newName)); + return this; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/AlterTableRequest.java b/java/kudu-client/src/main/java/org/kududb/client/AlterTableRequest.java new file mode 100644 index 000000000000..751290c09a3a --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AlterTableRequest.java @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +import static org.kududb.master.Master.*; + +/** + * RPC used to alter a table. When it returns it doesn't mean that the table is altered, + * a success just means that the master accepted it. + */ +@InterfaceAudience.Private +class AlterTableRequest extends KuduRpc { + + static final String ALTER_TABLE = "AlterTable"; + private final String name; + private final AlterTableRequestPB.Builder builder; + + AlterTableRequest(KuduTable masterTable, String name, AlterTableOptions ato) { + super(masterTable); + this.name = name; + this.builder = ato.pb; + } + + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + TableIdentifierPB tableID = + TableIdentifierPB.newBuilder().setTableName(name).build(); + this.builder.setTable(tableID); + return toChannelBuffer(header, this.builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return ALTER_TABLE; + } + + @Override + Pair deserialize(final CallResponse callResponse, + String tsUUID) throws Exception { + final AlterTableResponsePB.Builder respBuilder = AlterTableResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), respBuilder); + AlterTableResponse response = new AlterTableResponse(deadlineTracker.getElapsedMillis(), + tsUUID); + return new Pair( + response, respBuilder.hasError() ? 
respBuilder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/AlterTableResponse.java b/java/kudu-client/src/main/java/org/kududb/client/AlterTableResponse.java new file mode 100644 index 000000000000..7d1d581b3a82 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AlterTableResponse.java @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class AlterTableResponse extends KuduRpcResponse { + + /** + * @param ellapsedMillis Time in milliseconds since RPC creation to now. 
+ */ + AlterTableResponse(long ellapsedMillis, String tsUUID) { + super(ellapsedMillis, tsUUID); + } +} \ No newline at end of file diff --git a/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduClient.java new file mode 100644 index 000000000000..1a396b173611 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduClient.java @@ -0,0 +1,2133 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package org.kududb.client; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Lists; +import com.google.common.net.HostAndPort; +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import com.google.protobuf.Message; +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; + +import org.jboss.netty.buffer.ChannelBuffer; +import org.kududb.Common; +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.consensus.Metadata; +import org.kududb.master.Master; +import org.kududb.master.Master.GetTableLocationsResponsePB; +import org.kududb.util.AsyncUtil; +import org.kududb.util.NetUtil; +import org.kududb.util.Pair; +import org.kududb.util.Slice; +import org.jboss.netty.channel.ChannelEvent; +import org.jboss.netty.channel.ChannelStateEvent; +import org.jboss.netty.channel.DefaultChannelPipeline; +import org.jboss.netty.channel.socket.ClientSocketChannelFactory; +import org.jboss.netty.channel.socket.SocketChannel; +import org.jboss.netty.channel.socket.SocketChannelConfig; +import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory; +import org.jboss.netty.handler.timeout.ReadTimeoutHandler; +import 
org.jboss.netty.util.HashedWheelTimer; +import org.jboss.netty.util.Timeout; +import org.jboss.netty.util.TimerTask; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.concurrent.GuardedBy; + +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.SocketAddress; +import java.net.UnknownHostException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.*; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static org.kududb.client.ExternalConsistencyMode.CLIENT_PROPAGATED; + +/** + * A fully asynchronous and thread-safe client for Kudu. + *

+ * This client should be + * instantiated only once. You can use it with any number of tables at the + * same time. The only case where you should have multiple instances is when + * you want to use multiple different clusters at the same time. + *

+ * If you play by the rules, this client is completely + * thread-safe. Read the documentation carefully to know what the requirements + * are for this guarantee to apply. + *

+ * This client is fully non-blocking, any blocking operation will return a + * {@link Deferred} instance to which you can attach a {@link Callback} chain + * that will execute when the asynchronous operation completes. + * + *

Note regarding {@code KuduRpc} instances passed to this class

+ * Every {@link KuduRpc} passed to a method of this class should not be + * changed or re-used until the {@code Deferred} returned by that method + * calls you back. Changing or re-using any {@link KuduRpc} for + * an RPC in flight will lead to unpredictable results and voids + * your warranty. + * + *

{@code throws} clauses

+ * None of the asynchronous methods in this API are expected to throw an + * exception. But the {@link Deferred} object they return to you can carry an + * exception that you should handle (using "errbacks", see the javadoc of + * {@link Deferred}). In order to be able to do proper asynchronous error + * handling, you need to know what types of exceptions you're expected to face + * in your errbacks. In order to document that, the methods of this API use + * javadoc's {@code @throws} to spell out the exception types you should + * handle in your errback. Asynchronous exceptions will be indicated as such + * in the javadoc with "(deferred)". + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class AsyncKuduClient implements AutoCloseable { + + public static final Logger LOG = LoggerFactory.getLogger(AsyncKuduClient.class); + public static final int SLEEP_TIME = 500; + public static final byte[] EMPTY_ARRAY = new byte[0]; + public static final long NO_TIMESTAMP = -1; + public static final long DEFAULT_OPERATION_TIMEOUT_MS = 10000; + public static final long DEFAULT_SOCKET_READ_TIMEOUT_MS = 5000; + + private final ClientSocketChannelFactory channelFactory; + + /** + * This map and the next 2 maps contain the same data, but indexed + * differently. There is no consistency guarantee across the maps. + * They are not updated all at the same time atomically. This map + * is always the first to be updated, because that's the map from + * which all the lookups are done in the fast-path of the requests + * that need to locate a tablet. The second map to be updated is + * tablet2client, because it comes second in the fast-path + * of every requests that need to locate a tablet. The third map + * is only used to handle TabletServer disconnections gracefully. + * + * This map is keyed by table ID. 
+ */ + private final ConcurrentHashMap> tabletsCache = new ConcurrentHashMap<>(); + + /** + * Maps a tablet ID to the RemoteTablet that knows where all the replicas are served. + */ + private final ConcurrentHashMap tablet2client = new ConcurrentHashMap<>(); + + /** + * Maps a client connected to a TabletServer to the list of tablets we know + * it's serving so far. + */ + private final ConcurrentHashMap> client2tablets = + new ConcurrentHashMap<>(); + + /** + * Cache that maps a TabletServer address ("ip:port") to the clients + * connected to it. + *

+ * Access to this map must be synchronized by locking its monitor. + * Lock ordering: when locking both this map and a TabletClient, the + * TabletClient must always be locked first to avoid deadlocks. Logging + * the contents of this map (or calling toString) requires copying it first. + *

+ * This isn't a {@link ConcurrentHashMap} because we don't use it frequently + * (just when connecting to / disconnecting from TabletClients) and when we + * add something to it, we want to do an atomic get-and-put, but + * {@code putIfAbsent} isn't a good fit for us since it requires to create + * an object that may be "wasted" in case another thread wins the insertion + * race, and we don't want to create unnecessary connections. + *

+ * Upon disconnection, clients are automatically removed from this map. + * We don't use a {@code ChannelGroup} because a {@code ChannelGroup} does + * the clean-up on the {@code channelClosed} event, which is actually the + * 3rd and last event to be fired when a channel gets disconnected. The + * first one to get fired is, {@code channelDisconnected}. This matters to + * us because we want to purge disconnected clients from the cache as + * quickly as possible after the disconnection, to avoid handing out clients + * that are going to cause unnecessary errors. + * @see TabletClientPipeline#handleDisconnect + */ + private final HashMap ip2client = + new HashMap(); + + @GuardedBy("sessions") + private final Set sessions = new HashSet(); + + // Since the masters also go through TabletClient, we need to treat them as if they were a normal + // table. We'll use the following fake table name to identify places where we need special + // handling. + static final String MASTER_TABLE_NAME_PLACEHOLDER = "Kudu Master"; + final KuduTable masterTable; + private final List masterAddresses; + + private final HashedWheelTimer timer = new HashedWheelTimer(20, MILLISECONDS); + + /** + * Timestamp required for HybridTime external consistency through timestamp + * propagation. + * @see src/kudu/common/common.proto + */ + private long lastPropagatedTimestamp = NO_TIMESTAMP; + + // A table is considered not served when we get an empty list of locations but know + // that a tablet exists. This is currently only used for new tables. The objects stored are + // table IDs. + private final Set tablesNotServed = Collections.newSetFromMap(new + ConcurrentHashMap()); + + /** + * Semaphore used to rate-limit master lookups + * Once we have more than this number of concurrent master lookups, we'll + * start to throttle ourselves slightly. 
+ * @see #acquireMasterLookupPermit + */ + private final Semaphore masterLookups = new Semaphore(50); + + private final Random sleepRandomizer = new Random(); + + private final long defaultOperationTimeoutMs; + + private final long defaultAdminOperationTimeoutMs; + + private final long defaultSocketReadTimeoutMs; + + private volatile boolean closed; + + private AsyncKuduClient(AsyncKuduClientBuilder b) { + this.channelFactory = b.createChannelFactory(); + this.masterAddresses = b.masterAddresses; + this.masterTable = new KuduTable(this, MASTER_TABLE_NAME_PLACEHOLDER, + MASTER_TABLE_NAME_PLACEHOLDER, null, null); + this.defaultOperationTimeoutMs = b.defaultOperationTimeoutMs; + this.defaultAdminOperationTimeoutMs = b.defaultAdminOperationTimeoutMs; + this.defaultSocketReadTimeoutMs = b.defaultSocketReadTimeoutMs; + } + + /** + * Updates the last timestamp received from a server. Used for CLIENT_PROPAGATED + * external consistency. This is only publicly visible so that it can be set + * on tests, users should generally disregard this method. + * + * @param lastPropagatedTimestamp the last timestamp received from a server + */ + @VisibleForTesting + public synchronized void updateLastPropagatedTimestamp(long lastPropagatedTimestamp) { + if (this.lastPropagatedTimestamp == -1 || + this.lastPropagatedTimestamp < lastPropagatedTimestamp) { + this.lastPropagatedTimestamp = lastPropagatedTimestamp; + } + } + + @VisibleForTesting + public synchronized long getLastPropagatedTimestamp() { + return lastPropagatedTimestamp; + } + + /** + * Create a table on the cluster with the specified name and schema. Default table + * configurations are used, mainly the table will have one tablet. 
+ * @param name the table's name + * @param schema the table's schema + * @return a deferred object to track the progress of the createTable command that gives + * an object to communicate with the created table + */ + public Deferred createTable(String name, Schema schema) { + return this.createTable(name, schema, new CreateTableOptions()); + } + + /** + * Create a table on the cluster with the specified name, schema, and table configurations. + * @param name the table's name + * @param schema the table's schema + * @param builder a builder containing the table's configurations + * @return a deferred object to track the progress of the createTable command that gives + * an object to communicate with the created table + */ + public Deferred createTable(final String name, Schema schema, + CreateTableOptions builder) { + checkIsClosed(); + if (builder == null) { + builder = new CreateTableOptions(); + } + CreateTableRequest create = new CreateTableRequest(this.masterTable, name, schema, + builder); + create.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(create).addCallbackDeferring( + new Callback, CreateTableResponse>() { + @Override + public Deferred call(CreateTableResponse createTableResponse) throws Exception { + return openTable(name); + } + }); + } + + /** + * Delete a table on the cluster with the specified name. + * @param name the table's name + * @return a deferred object to track the progress of the deleteTable command + */ + public Deferred deleteTable(String name) { + checkIsClosed(); + DeleteTableRequest delete = new DeleteTableRequest(this.masterTable, name); + delete.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(delete); + } + + /** + * Alter a table on the cluster as specified by the builder. + * + * When the returned deferred completes it only indicates that the master accepted the alter + * command, use {@link AsyncKuduClient#isAlterTableDone(String)} to know when the alter finishes. 
+ * @param name the table's name, if this is a table rename then the old table name must be passed + * @param ato the alter table builder + * @return a deferred object to track the progress of the alter command + */ + public Deferred alterTable(String name, AlterTableOptions ato) { + checkIsClosed(); + AlterTableRequest alter = new AlterTableRequest(this.masterTable, name, ato); + alter.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(alter); + } + + /** + * Helper method that checks and waits until the completion of an alter command. + * It will block until the alter command is done or the deadline is reached. + * @param name the table's name, if the table was renamed then that name must be checked against + * @return a deferred object to track the progress of the isAlterTableDone command + */ + public Deferred isAlterTableDone(String name) throws Exception { + checkIsClosed(); + IsAlterTableDoneRequest request = new IsAlterTableDoneRequest(this.masterTable, name); + request.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(request); + } + + /** + * Get the list of running tablet servers. + * @return a deferred object that yields a list of tablet servers + */ + public Deferred listTabletServers() { + checkIsClosed(); + ListTabletServersRequest rpc = new ListTabletServersRequest(this.masterTable); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(rpc); + } + + Deferred getTableSchema(String name) { + GetTableSchemaRequest rpc = new GetTableSchemaRequest(this.masterTable, name); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(rpc); + } + + /** + * Get the list of all the tables. + * @return a deferred object that yields a list of all the tables + */ + public Deferred getTablesList() { + return getTablesList(null); + } + + /** + * Get a list of table names. Passing a null filter returns all the tables. 
When a filter is + * specified, it only returns tables that satisfy a substring match. + * @param nameFilter an optional table name filter + * @return a deferred that yields the list of table names + */ + public Deferred getTablesList(String nameFilter) { + ListTablesRequest rpc = new ListTablesRequest(this.masterTable, nameFilter); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + return sendRpcToTablet(rpc); + } + + /** + * Test if a table exists. + * @param name a non-null table name + * @return true if the table exists, else false + */ + public Deferred tableExists(final String name) { + if (name == null) { + throw new IllegalArgumentException("The table name cannot be null"); + } + return getTablesList().addCallbackDeferring(new Callback, + ListTablesResponse>() { + @Override + public Deferred call(ListTablesResponse listTablesResponse) throws Exception { + for (String tableName : listTablesResponse.getTablesList()) { + if (name.equals(tableName)) { + return Deferred.fromResult(true); + } + } + return Deferred.fromResult(false); + } + }); + } + + /** + * Open the table with the given name. If the table was just created, the Deferred will only get + * called back when all the tablets have been successfully created. + * @param name table to open + * @return a KuduTable if the table exists, else a MasterErrorException + */ + public Deferred openTable(final String name) { + checkIsClosed(); + + // We create an RPC that we're never going to send, and will instead use it to keep track of + // timeouts and use its Deferred. 
+ final KuduRpc fakeRpc = new KuduRpc(null) { + @Override + ChannelBuffer serialize(Message header) { return null; } + + @Override + String serviceName() { return null; } + + @Override + String method() { + return "IsCreateTableDone"; + } + + @Override + Pair deserialize(CallResponse callResponse, String tsUUID) + throws Exception { return null; } + }; + fakeRpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + + return getTableSchema(name).addCallbackDeferring(new Callback, + GetTableSchemaResponse>() { + @Override + public Deferred call(GetTableSchemaResponse response) throws Exception { + KuduTable table = new KuduTable(AsyncKuduClient.this, + name, + response.getTableId(), + response.getSchema(), + response.getPartitionSchema()); + // We grab the Deferred first because calling callback on the RPC will reset it and we'd + // return a different, non-triggered Deferred. + Deferred d = fakeRpc.getDeferred(); + if (response.isCreateTableDone()) { + LOG.debug("Opened table {}", name); + fakeRpc.callback(table); + } else { + LOG.debug("Delaying opening table {}, its tablets aren't fully created", name); + fakeRpc.attempt++; + delayedIsCreateTableDone( + table, + fakeRpc, + getOpenTableCB(fakeRpc, table), + getDelayedIsCreateTableDoneErrback(fakeRpc)); + } + return d; + } + }); + } + + /** + * This callback will be repeatedly used when opening a table until it is done being created. 
+ */ + Callback, Master.IsCreateTableDoneResponsePB> getOpenTableCB( + final KuduRpc rpc, final KuduTable table) { + return new Callback, Master.IsCreateTableDoneResponsePB>() { + @Override + public Deferred call( + Master.IsCreateTableDoneResponsePB isCreateTableDoneResponsePB) throws Exception { + String tableName = table.getName(); + Deferred d = rpc.getDeferred(); + if (isCreateTableDoneResponsePB.getDone()) { + LOG.debug("Table {}'s tablets are now created", tableName); + rpc.callback(table); + } else { + rpc.attempt++; + LOG.debug("Table {}'s tablets are still not created, further delaying opening it", + tableName); + + delayedIsCreateTableDone( + table, + rpc, + getOpenTableCB(rpc, table), + getDelayedIsCreateTableDoneErrback(rpc)); + } + return d; + } + }; + } + + /** + * Get the timeout used for operations on sessions and scanners. + * @return a timeout in milliseconds + */ + public long getDefaultOperationTimeoutMs() { + return defaultOperationTimeoutMs; + } + + /** + * Get the timeout used for admin operations. + * @return a timeout in milliseconds + */ + public long getDefaultAdminOperationTimeoutMs() { + return defaultAdminOperationTimeoutMs; + } + + /** + * Get the timeout used when waiting to read data from a socket. Will be triggered when nothing + * has been read on a socket connected to a tablet server for {@code timeout} milliseconds. + * @return a timeout in milliseconds + */ + public long getDefaultSocketReadTimeoutMs() { + return defaultSocketReadTimeoutMs; + } + + /** + * Creates a new {@link AsyncKuduScanner.AsyncKuduScannerBuilder} for a particular table. + * @param table the name of the table you intend to scan. + * The string is assumed to use the platform's default charset. 
+ * @return a new scanner builder for this table + */ + public AsyncKuduScanner.AsyncKuduScannerBuilder newScannerBuilder(KuduTable table) { + checkIsClosed(); + return new AsyncKuduScanner.AsyncKuduScannerBuilder(this, table); + } + + /** + * Package-private access point for {@link AsyncKuduScanner}s to open themselves. + * @param scanner The scanner to open. + * @return A deferred {@link AsyncKuduScanner.Response} + */ + Deferred openScanner(final AsyncKuduScanner scanner) { + return sendRpcToTablet(scanner.getOpenRequest()).addErrback( + new Callback() { + public Exception call(final Exception e) { + String message = "Cannot openScanner because: "; + LOG.warn(message, e); + // Don't let the scanner think it's opened on this tablet. + scanner.invalidate(); + return e; // Let the error propagate. + } + public String toString() { + return "openScanner errback"; + } + }); + } + + /** + * Create a new session for interacting with the cluster. + * User is responsible for destroying the session object. + * This is a fully local operation (no RPCs or blocking). + * @return a new AsyncKuduSession + */ + public AsyncKuduSession newSession() { + checkIsClosed(); + AsyncKuduSession session = new AsyncKuduSession(this); + synchronized (sessions) { + sessions.add(session); + } + return session; + } + + /** + * This method is for KuduSessions so that they can remove themselves as part of closing down. + * @param session Session to remove + */ + void removeSession(AsyncKuduSession session) { + synchronized (sessions) { + boolean removed = sessions.remove(session); + assert removed == true; + } + } + + /** + * Package-private access point for {@link AsyncKuduScanner}s to scan more rows. + * @param scanner The scanner to use. + * @return A deferred row. 
+ */ + Deferred scanNextRows(final AsyncKuduScanner scanner) { + final RemoteTablet tablet = scanner.currentTablet(); + final TabletClient client = clientFor(tablet); + final KuduRpc next_request = scanner.getNextRowsRequest(); + final Deferred d = next_request.getDeferred(); + if (client == null) { + // Oops, we no longer know anything about this client or tabletSlice. Our + // cache was probably invalidated while the client was scanning. This + // means that we lost the connection to that TabletServer, so we have to + // try to re-connect and check if the scanner is still good. + return sendRpcToTablet(next_request); + } + next_request.attempt++; + client.sendRpc(next_request); + return d; + } + + /** + * Package-private access point for {@link AsyncKuduScanner}s to close themselves. + * @param scanner The scanner to close. + * @return A deferred object that indicates the completion of the request. + * The {@link AsyncKuduScanner.Response} can contain rows that were left to scan. + */ + Deferred closeScanner(final AsyncKuduScanner scanner) { + final RemoteTablet tablet = scanner.currentTablet(); + // Getting a null tablet here without being in a closed state means we were in between tablets. + if (tablet == null) { + return Deferred.fromResult(null); + } + + final TabletClient client = clientFor(tablet); + if (client == null) { + // Oops, we no longer know anything about this client or tabletSlice. Our + // cache was probably invalidated while the client was scanning. So + // we can't close this scanner properly. + LOG.warn("Cannot close " + scanner + " properly, no connection open for " + + (tablet == null ? 
null : tablet)); + return Deferred.fromResult(null); + } + final KuduRpc close_request = scanner.getCloseRequest(); + final Deferred d = close_request.getDeferred(); + close_request.attempt++; + client.sendRpc(close_request); + return d; + } + + Deferred sendRpcToTablet(final KuduRpc request) { + if (cannotRetryRequest(request)) { + return tooManyAttemptsOrTimeout(request, null); + } + request.attempt++; + final String tableId = request.getTable().getTableId(); + byte[] partitionKey = null; + if (request instanceof KuduRpc.HasKey) { + partitionKey = ((KuduRpc.HasKey)request).partitionKey(); + } + final RemoteTablet tablet = getTablet(tableId, partitionKey); + + // Set the propagated timestamp so that the next time we send a message to + // the server the message includes the last propagated timestamp. + long lastPropagatedTs = getLastPropagatedTimestamp(); + if (request.getExternalConsistencyMode() == CLIENT_PROPAGATED && + lastPropagatedTs != NO_TIMESTAMP) { + request.setPropagatedTimestamp(lastPropagatedTs); + } + + if (tablet != null) { + TabletClient tabletClient = clientFor(tablet); + if (tabletClient != null) { + request.setTablet(tablet); + final Deferred d = request.getDeferred(); + tabletClient.sendRpc(request); + return d; + } + } + + // Right after creating a table a request will fall into locateTablet since we don't know yet + // if the table is ready or not. If discoverTablets() didn't get any tablets back, + // then on retry we'll fall into the following block. It will sleep, then call the master to + // see if the table was created. We'll spin like this until the table is created and then + // we'll try to locate the tablet again. 
+ if (tablesNotServed.contains(tableId)) { + return delayedIsCreateTableDone(request.getTable(), request, + new RetryRpcCB(request), + getDelayedIsCreateTableDoneErrback(request)); + } + Callback, Master.GetTableLocationsResponsePB> cb = new RetryRpcCB<>(request); + Callback, Exception> eb = new RetryRpcErrback<>(request); + Deferred returnedD = + locateTablet(request.getTable(), partitionKey); + return AsyncUtil.addCallbacksDeferring(returnedD, cb, eb); + } + + /** + * Callback used to retry a RPC after another query finished, like looking up where that RPC + * should go. + *

+ * Use {@code AsyncUtil.addCallbacksDeferring} to add this as the callback and + * {@link AsyncKuduClient.RetryRpcErrback} as the "errback" to the {@code Deferred} + * returned by {@link #locateTablet(String, byte[])}. + * @param RPC's return type. + * @param Previous query's return type, which we don't use, but need to specify in order to + * tie it all together. + */ + final class RetryRpcCB implements Callback, D> { + private final KuduRpc request; + RetryRpcCB(KuduRpc request) { + this.request = request; + } + public Deferred call(final D arg) { + return sendRpcToTablet(request); // Retry the RPC. + } + public String toString() { + return "retry RPC"; + } + } + + /** + * "Errback" used to delayed-retry a RPC if it fails due to no leader master being found. + * Other exceptions are passed through to be handled by the caller. + *

+ * Use {@code AsyncUtil.addCallbacksDeferring} to add this as the "errback" and + * {@link RetryRpcCB} as the callback to the {@code Deferred} returned by + * {@link #locateTablet(String, byte[])}. + * @see #delayedSendRpcToTablet(KuduRpc, KuduException) + * @param The type of the original RPC. + */ + final class RetryRpcErrback implements Callback, Exception> { + private final KuduRpc request; + + public RetryRpcErrback(KuduRpc request) { + this.request = request; + } + + @Override + public Deferred call(Exception arg) { + if (arg instanceof NoLeaderMasterFoundException) { + // If we could not find the leader master, try looking up the leader master + // again. + Deferred d = request.getDeferred(); + // TODO: Handle the situation when multiple in-flight RPCs are queued waiting + // for the leader master to be determined (either after a failure or at initialization + // time). This could re-use some of the existing piping in place for non-master tablets. + delayedSendRpcToTablet(request, (NoLeaderMasterFoundException) arg); + return d; + } + // Pass all other exceptions through. + return Deferred.fromError(arg); + } + + @Override + public String toString() { + return "retry RPC after error"; + } + } + + /** + * This errback ensures that if the delayed call to IsCreateTableDone throws an Exception that + * it will be propagated back to the user. + * @param request Request to errback if there's a problem with the delayed call. + * @param Request's return type. + * @return An errback. + */ + Callback getDelayedIsCreateTableDoneErrback(final KuduRpc request) { + return new Callback() { + @Override + public Exception call(Exception e) throws Exception { + // TODO maybe we can retry it? + request.errback(e); + return e; + } + }; + } + + /** + * This method will call IsCreateTableDone on the master after sleeping for + * getSleepTimeForRpc() based on the provided KuduRpc's number of attempts. Once this is done, + * the provided callback will be called. 
+ * @param table the table to lookup + * @param rpc the original KuduRpc that needs to access the table + * @param retryCB the callback to call on completion + * @param errback the errback to call if something goes wrong when calling IsCreateTableDone + * @return Deferred used to track the provided KuduRpc + */ + Deferred delayedIsCreateTableDone(final KuduTable table, final KuduRpc rpc, + final Callback, + Master.IsCreateTableDoneResponsePB> retryCB, + final Callback errback) { + + final class RetryTimer implements TimerTask { + public void run(final Timeout timeout) { + String tableId = table.getTableId(); + final boolean has_permit = acquireMasterLookupPermit(); + if (!has_permit) { + // If we failed to acquire a permit, it's worth checking if someone + // looked up the tablet we're interested in. Every once in a while + // this will save us a Master lookup. + if (!tablesNotServed.contains(tableId)) { + try { + retryCB.call(null); + return; + } catch (Exception e) { + // we're calling RetryRpcCB which doesn't throw exceptions, ignore + } + } + } + IsCreateTableDoneRequest rpc = new IsCreateTableDoneRequest(masterTable, tableId); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + final Deferred d = + sendRpcToTablet(rpc).addCallback(new IsCreateTableDoneCB(tableId)); + if (has_permit) { + // The errback is needed here to release the lookup permit + d.addCallbacks(new ReleaseMasterLookupPermit(), + new ReleaseMasterLookupPermit()); + } + d.addCallbacks(retryCB, errback); + } + } + long sleepTime = getSleepTimeForRpc(rpc); + if (rpc.deadlineTracker.wouldSleepingTimeout(sleepTime)) { + return tooManyAttemptsOrTimeout(rpc, null); + } + + newTimeout(new RetryTimer(), sleepTime); + return rpc.getDeferred(); + } + + private final class ReleaseMasterLookupPermit implements Callback { + public T call(final T arg) { + releaseMasterLookupPermit(); + return arg; + } + public String toString() { + return "release master lookup permit"; + } + } + + /** Callback executed 
when IsCreateTableDone completes. */ + private final class IsCreateTableDoneCB implements Callback { + final String tableName; + IsCreateTableDoneCB(String tableName) { + this.tableName = tableName; + } + public Master.IsCreateTableDoneResponsePB call(final Master.IsCreateTableDoneResponsePB response) { + if (response.getDone()) { + LOG.debug("Table {} was created", tableName); + tablesNotServed.remove(tableName); + } else { + LOG.debug("Table {} is still being created", tableName); + } + return response; + } + public String toString() { + return "ask the master if " + tableName + " was created"; + } + } + + boolean isTableNotServed(String tableId) { + return tablesNotServed.contains(tableId); + } + + + long getSleepTimeForRpc(KuduRpc rpc) { + byte attemptCount = rpc.attempt; + assert (attemptCount > 0); + if (attemptCount == 0) { + LOG.warn("Possible bug: attempting to retry an RPC with no attempts. RPC: " + rpc, + new Exception("Exception created to collect stack trace")); + attemptCount = 1; + } + // TODO backoffs? Sleep in increments of 500 ms, plus some random time up to 50 + long sleepTime = (attemptCount * SLEEP_TIME) + sleepRandomizer.nextInt(50); + if (LOG.isDebugEnabled()) { + LOG.debug("Going to sleep for " + sleepTime + " at retry " + rpc.attempt); + } + return sleepTime; + } + + /** + * Modifying the list returned by this method won't change how AsyncKuduClient behaves, + * but calling certain methods on the returned TabletClients can. For example, + * it's possible to forcefully shutdown a connection to a tablet server by calling {@link + * TabletClient#shutdown()}. + * @return Copy of the current TabletClients list + */ + @VisibleForTesting + List getTableClients() { + synchronized (ip2client) { + return new ArrayList(ip2client.values()); + } + } + + /** + * This method first clears tabletsCache and then tablet2client without any regards for + * calls to {@link #discoverTablets}. Call only when AsyncKuduClient is in a steady state. 
+ * @param tableId table for which we remove all the RemoteTablet entries + */ + @VisibleForTesting + void emptyTabletsCacheForTable(String tableId) { + tabletsCache.remove(tableId); + Set> tablets = tablet2client.entrySet(); + for (Map.Entry entry : tablets) { + if (entry.getValue().getTableId().equals(tableId)) { + tablets.remove(entry); + } + } + } + + TabletClient clientFor(RemoteTablet tablet) { + if (tablet == null) { + return null; + } + + synchronized (tablet.tabletServers) { + if (tablet.tabletServers.isEmpty()) { + return null; + } + if (tablet.leaderIndex == RemoteTablet.NO_LEADER_INDEX) { + // TODO we don't know where the leader is, either because one wasn't provided or because + // we couldn't resolve its IP. We'll just send the client back so it retries and probably + // dies after too many attempts. + return null; + } else { + // TODO we currently always hit the leader, we probably don't need to except for writes + // and some reads. + return tablet.tabletServers.get(tablet.leaderIndex); + } + } + } + + /** + * Checks whether or not an RPC can be retried once more. + * @param rpc The RPC we're going to attempt to execute. + * @return {@code true} if this RPC already had too many attempts, + * {@code false} otherwise (in which case it's OK to retry once more). + * @throws NonRecoverableException if the request has had too many attempts + * already. + */ + static boolean cannotRetryRequest(final KuduRpc rpc) { + return rpc.deadlineTracker.timedOut() || rpc.attempt > 100; // TODO Don't hardcode. + } + + /** + * Returns a {@link Deferred} containing an exception when an RPC couldn't + * succeed after too many attempts or if it already timed out. + * @param request The RPC that was retried too many times or timed out. + * @param cause What was cause of the last failed attempt, if known. + * You can pass {@code null} if the cause is unknown. 
+ */ + static Deferred tooManyAttemptsOrTimeout(final KuduRpc request, + final KuduException cause) { + String message; + if (request.deadlineTracker.timedOut()) { + message = "Time out: "; + } else { + message = "Too many attempts: "; + } + final Exception e = new NonRecoverableException(message + request, cause); + request.errback(e); + return Deferred.fromError(e); + } + + /** + * Sends a getTableLocations RPC to the master to find the table's tablets. + * @param table table to lookup + * @param partitionKey can be null, if not we'll find the exact tablet that contains it + * @return Deferred to track the progress + */ + Deferred locateTablet(KuduTable table, byte[] partitionKey) { + final boolean has_permit = acquireMasterLookupPermit(); + String tableId = table.getTableId(); + if (!has_permit) { + // If we failed to acquire a permit, it's worth checking if someone + // looked up the tablet we're interested in. Every once in a while + // this will save us a Master lookup. + RemoteTablet tablet = getTablet(tableId, partitionKey); + if (tablet != null && clientFor(tablet) != null) { + return Deferred.fromResult(null); // Looks like no lookup needed. + } + } + GetTableLocationsRequest rpc = + new GetTableLocationsRequest(masterTable, partitionKey, partitionKey, tableId); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + final Deferred d; + + // If we know this is going to the master, check the master consensus configuration (as specified by + // 'masterAddresses' field) to determine and cache the current leader. + if (isMasterTable(tableId)) { + d = getMasterTableLocationsPB(); + } else { + d = sendRpcToTablet(rpc); + } + d.addCallback(new MasterLookupCB(table)); + if (has_permit) { + d.addBoth(new ReleaseMasterLookupPermit()); + } + return d; + } + + /** + * Update the master config: send RPCs to all config members, use the returned data to + * fill a {@link Master.GetTabletLocationsResponsePB} object. 
+ * @return An initialized Deferred object to hold the response. + */ + Deferred getMasterTableLocationsPB() { + final Deferred responseD = + new Deferred(); + final GetMasterRegistrationReceived received = + new GetMasterRegistrationReceived(masterAddresses, responseD); + for (HostAndPort hostAndPort : masterAddresses) { + Deferred d; + // Note: we need to create a client for that host first, as there's a + // chicken and egg problem: since there is no source of truth beyond + // the master, the only way to get information about a master host is + // by making an RPC to that host. + TabletClient clientForHostAndPort = newMasterClient(hostAndPort); + if (clientForHostAndPort == null) { + String message = "Couldn't resolve this master's address " + hostAndPort.toString(); + LOG.warn(message); + d = Deferred.fromError(new NonRecoverableException(message)); + } else { + d = getMasterRegistration(clientForHostAndPort); + } + d.addCallbacks(received.callbackForNode(hostAndPort), received.errbackForNode(hostAndPort)); + } + return responseD; + } + + + /** + * Get all or some tablets for a given table. This may query the master multiple times if there + * are a lot of tablets. + * This method blocks until it gets all the tablets. 
+ * @param tableId the table to locate tablets from + * @param startPartitionKey where to start in the table, pass null to start at the beginning + * @param endPartitionKey where to stop in the table, pass null to get all the tablets until the + * end of the table + * @param deadline deadline in milliseconds for this method to finish + * @return a list of the tablets in the table, which can be queried for metadata about + * each tablet + * @throws Exception MasterErrorException if the table doesn't exist + */ + List syncLocateTable(String tableId, + byte[] startPartitionKey, + byte[] endPartitionKey, + long deadline) throws Exception { + return locateTable(tableId, startPartitionKey, endPartitionKey, deadline).join(); + } + + private Deferred> loopLocateTable(final String tableId, + final byte[] startPartitionKey, final byte[] endPartitionKey, final List ret, + final DeadlineTracker deadlineTracker) { + if (deadlineTracker.timedOut()) { + return Deferred.fromError(new NonRecoverableException( + "Took too long getting the list of tablets, " + deadlineTracker)); + } + GetTableLocationsRequest rpc = new GetTableLocationsRequest(masterTable, startPartitionKey, + endPartitionKey, tableId); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + final Deferred d = sendRpcToTablet(rpc); + return d.addCallbackDeferring( + new Callback>, Master.GetTableLocationsResponsePB>() { + @Override + public Deferred> call(GetTableLocationsResponsePB response) { + // Table doesn't exist or is being created. 
+ if (response.getTabletLocationsCount() == 0) { + Deferred.fromResult(ret); + } + byte[] lastEndPartition = startPartitionKey; + for (Master.TabletLocationsPB tabletPb : response.getTabletLocationsList()) { + LocatedTablet locs = new LocatedTablet(tabletPb); + ret.add(locs); + Partition partition = locs.getPartition(); + if (lastEndPartition != null && !partition.isEndPartition() + && Bytes.memcmp(partition.getPartitionKeyEnd(), lastEndPartition) < 0) { + return Deferred.fromError(new IllegalStateException( + "Server returned tablets out of order: " + "end partition key '" + + Bytes.pretty(partition.getPartitionKeyEnd()) + "' followed " + + "end partition key '" + Bytes.pretty(lastEndPartition) + "'")); + } + lastEndPartition = partition.getPartitionKeyEnd(); + } + // If true, we're done, else we have to go back to the master with the last end key + if (lastEndPartition.length == 0 + || (endPartitionKey != null && Bytes.memcmp(lastEndPartition, endPartitionKey) > 0)) { + return Deferred.fromResult(ret); + } else { + return loopLocateTable(tableId, lastEndPartition, endPartitionKey, ret, + deadlineTracker); + } + } + }); + } + + /** + * Get all or some tablets for a given table. This may query the master multiple times if there + * are a lot of tablets. 
+ * @param tableId the table to locate tablets from + * @param startPartitionKey where to start in the table, pass null to start at the beginning + * @param endPartitionKey where to stop in the table, pass null to get all the tablets until the + * end of the table + * @param deadline max time spent in milliseconds for the deferred result of this method to + * get called back, if deadline is reached, the deferred result will get erred back + * @return a deferred object that yields a list of the tablets in the table, which can be queried + * for metadata about each tablet + * @throws Exception MasterErrorException if the table doesn't exist + */ + Deferred> locateTable(final String tableId, + final byte[] startPartitionKey, final byte[] endPartitionKey, long deadline) { + final List ret = Lists.newArrayList(); + final DeadlineTracker deadlineTracker = new DeadlineTracker(); + deadlineTracker.setDeadline(deadline); + return loopLocateTable(tableId, startPartitionKey, endPartitionKey, ret, deadlineTracker); + } + + /** + * We're handling a tablet server that's telling us it doesn't have the tablet we're asking for. + * We're in the context of decode() meaning we need to either callback or retry later. + */ + void handleTabletNotFound(final KuduRpc rpc, KuduException ex, TabletClient server) { + invalidateTabletCache(rpc.getTablet(), server); + handleRetryableError(rpc, ex); + } + + /** + * A tablet server is letting us know that it isn't the specified tablet's leader in response + * a RPC, so we need to demote it and retry. + */ + void handleNotLeader(final KuduRpc rpc, KuduException ex, TabletClient server) { + rpc.getTablet().demoteLeader(server); + handleRetryableError(rpc, ex); + } + + void handleRetryableError(final KuduRpc rpc, KuduException ex) { + // TODO we don't always need to sleep, maybe another replica can serve this RPC. 
+ delayedSendRpcToTablet(rpc, ex); + } + + private void delayedSendRpcToTablet(final KuduRpc rpc, KuduException ex) { + // Here we simply retry the RPC later. We might be doing this along with a lot of other RPCs + // in parallel. Asynchbase does some hacking with a "probe" RPC while putting the other ones + // on hold but we won't be doing this for the moment. Regions in HBase can move a lot, + // we're not expecting this in Kudu. + final class RetryTimer implements TimerTask { + public void run(final Timeout timeout) { + sendRpcToTablet(rpc); + } + } + long sleepTime = getSleepTimeForRpc(rpc); + if (cannotRetryRequest(rpc) || rpc.deadlineTracker.wouldSleepingTimeout(sleepTime)) { + tooManyAttemptsOrTimeout(rpc, ex); + // Don't let it retry. + return; + } + newTimeout(new RetryTimer(), sleepTime); + } + + /** + * Remove the tablet server from the RemoteTablet's locations. Right now nothing is removing + * the tablet itself from the caches. + */ + private void invalidateTabletCache(RemoteTablet tablet, TabletClient server) { + LOG.info("Removing server " + server.getUuid() + " from this tablet's cache " + + tablet.getTabletIdAsString()); + tablet.removeTabletServer(server); + } + + /** Callback executed when a master lookup completes. */ + private final class MasterLookupCB implements Callback { + final KuduTable table; + MasterLookupCB(KuduTable table) { + this.table = table; + } + public Object call(final Master.GetTableLocationsResponsePB arg) { + try { + discoverTablets(table, arg); + } catch (NonRecoverableException e) { + // Returning the exception means we early out and errback to the user. + return e; + } + return null; + } + public String toString() { + return "get tablet locations from the master for table " + table.getName(); + } + }; + + boolean acquireMasterLookupPermit() { + try { + // With such a low timeout, the JVM may choose to spin-wait instead of + // de-scheduling the thread (and causing context switches and whatnot). 
+ return masterLookups.tryAcquire(5, MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); // Make this someone else's problem. + return false; + } + } + + /** + * Releases a master lookup permit that was acquired. + * @see #acquireMasterLookupPermit + */ + void releaseMasterLookupPermit() { + masterLookups.release(); + } + + @VisibleForTesting + void discoverTablets(KuduTable table, Master.GetTableLocationsResponsePB response) + throws NonRecoverableException { + String tableId = table.getTableId(); + String tableName = table.getName(); + if (response.getTabletLocationsCount() == 0) { + // Keep a note that the table exists but it's not served yet, we'll retry. + if (LOG.isDebugEnabled()) { + LOG.debug("Table {} has not been created yet", tableName); + } + tablesNotServed.add(tableId); + return; + } + // Doing a get first instead of putIfAbsent to avoid creating unnecessary CSLMs because in + // the most common case the table should already be present + ConcurrentSkipListMap tablets = tabletsCache.get(tableId); + if (tablets == null) { + tablets = new ConcurrentSkipListMap<>(Bytes.MEMCMP); + ConcurrentSkipListMap oldTablets = tabletsCache.putIfAbsent + (tableId, tablets); + if (oldTablets != null) { + tablets = oldTablets; + } + } + + for (Master.TabletLocationsPB tabletPb : response.getTabletLocationsList()) { + // Early creating the tablet so that it parses out the pb + RemoteTablet rt = createTabletFromPb(tableId, tabletPb); + Slice tabletId = rt.tabletId; + + // If we already know about this one, just refresh the locations + RemoteTablet currentTablet = tablet2client.get(tabletId); + if (currentTablet != null) { + currentTablet.refreshServers(tabletPb); + continue; + } + + // Putting it here first doesn't make it visible because tabletsCache is always looked up + // first. 
+ RemoteTablet oldRt = tablet2client.putIfAbsent(tabletId, rt); + if (oldRt != null) { + // someone beat us to it + continue; + } + LOG.info("Discovered tablet {} for table {} with partition {}", + tabletId.toString(Charset.defaultCharset()), tableName, rt.getPartition()); + rt.refreshServers(tabletPb); + // This is making this tablet available + // Even if two clients were racing in this method they are putting the same RemoteTablet + // with the same start key in the CSLM in the end + tablets.put(rt.getPartition().getPartitionKeyStart(), rt); + } + } + + RemoteTablet createTabletFromPb(String tableId, Master.TabletLocationsPB tabletPb) { + Partition partition = ProtobufHelper.pbToPartition(tabletPb.getPartition()); + Slice tabletId = new Slice(tabletPb.getTabletId().toByteArray()); + return new RemoteTablet(tableId, tabletId, partition); + } + + /** + * Gives the tablet's ID for the table ID and partition key. + * In the future there will be multiple tablets and this method will find the right one. + * @param tableId table to find the tablet for + * @return a tablet ID as a slice or null if not found + */ + RemoteTablet getTablet(String tableId, byte[] partitionKey) { + ConcurrentSkipListMap tablets = tabletsCache.get(tableId); + + if (tablets == null) { + return null; + } + + // We currently only have one master tablet. + if (isMasterTable(tableId)) { + if (tablets.firstEntry() == null) { + return null; + } + return tablets.firstEntry().getValue(); + } + + Map.Entry tabletPair = tablets.floorEntry(partitionKey); + + if (tabletPair == null) { + return null; + } + + Partition partition = tabletPair.getValue().getPartition(); + + // If the partition is not the end partition, but it doesn't include the key + // we are looking for, then we have not yet found the correct tablet. 
+ if (!partition.isEndPartition() + && Bytes.memcmp(partitionKey, partition.getPartitionKeyEnd()) >= 0) { + return null; + } + + return tabletPair.getValue(); + } + + /** + * Retrieve the master registration (see {@link GetMasterRegistrationResponse} + * for a replica. + * @param masterClient An initialized client for the master replica. + * @return A Deferred object for the master replica's current registration. + */ + Deferred getMasterRegistration(TabletClient masterClient) { + GetMasterRegistrationRequest rpc = new GetMasterRegistrationRequest(masterTable); + rpc.setTimeoutMillis(defaultAdminOperationTimeoutMs); + Deferred d = rpc.getDeferred(); + rpc.attempt++; + masterClient.sendRpc(rpc); + return d; + } + + /** + * If a live client already exists for the specified master server, returns that client; + * otherwise, creates a new client for the specified master server. + * @param masterHostPort The RPC host and port for the master server. + * @return A live and initialized client for the specified master server. + */ + TabletClient newMasterClient(HostAndPort masterHostPort) { + String ip = getIP(masterHostPort.getHostText()); + if (ip == null) { + return null; + } + // We should pass a UUID here but we have a chicken and egg problem, we first need to + // communicate with the masters to find out about them, and that's what we're trying to do. + // The UUID is used for logging, so instead we're passing the "master table name" followed by + // host and port which is enough to identify the node we're connecting to. 
+ return newClient(MASTER_TABLE_NAME_PLACEHOLDER + " - " + masterHostPort.toString(), + ip, masterHostPort.getPort()); + } + + TabletClient newClient(String uuid, final String host, final int port) { + final String hostport = host + ':' + port; + TabletClient client; + SocketChannel chan; + synchronized (ip2client) { + client = ip2client.get(hostport); + if (client != null && client.isAlive()) { + return client; + } + final TabletClientPipeline pipeline = new TabletClientPipeline(); + client = pipeline.init(uuid); + chan = channelFactory.newChannel(pipeline); + ip2client.put(hostport, client); // This is guaranteed to return null. + } + this.client2tablets.put(client, new ArrayList()); + final SocketChannelConfig config = chan.getConfig(); + config.setConnectTimeoutMillis(5000); + config.setTcpNoDelay(true); + // Unfortunately there is no way to override the keep-alive timeout in + // Java since the JRE doesn't expose any way to call setsockopt() with + // TCP_KEEPIDLE. And of course the default timeout is >2h. Sigh. + config.setKeepAlive(true); + chan.connect(new InetSocketAddress(host, port)); // Won't block. + return client; + } + + /** + * Invokes {@link #shutdown()} and waits for the configured admin timeout. This method returns + * void, so consider invoking shutdown directly if there's a need to handle dangling RPCs. + * @throws Exception if an error happens while closing the connections + */ + @Override + public void close() throws Exception { + shutdown().join(defaultAdminOperationTimeoutMs); + } + + /** + * Performs a graceful shutdown of this instance. + *

+ *

     * <p>
     * <ul>
     *   <li>{@link AsyncKuduSession#flush Flushes} all buffered edits.</li>
     *   <li>Cancels all the other requests.</li>
     *   <li>Terminates all connections.</li>
     *   <li>Releases all other resources.</li>
     * </ul>
+ * Not calling this method before losing the last reference to this + * instance may result in data loss and other unwanted side effects + * @return A {@link Deferred}, whose callback chain will be invoked once all + * of the above have been done. If this callback chain doesn't fail, then + * the clean shutdown will be successful, and all the data will be safe on + * the Kudu side. In case of a failure (the "errback" is invoked) you will have + * to open a new AsyncKuduClient if you want to retry those operations. + * The Deferred doesn't actually hold any content. + */ + public Deferred> shutdown() { + checkIsClosed(); + closed = true; + // This is part of step 3. We need to execute this in its own thread + // because Netty gets stuck in an infinite loop if you try to shut it + // down from within a thread of its own thread pool. They don't want + // to fix this so as a workaround we always shut Netty's thread pool + // down from another thread. + final class ShutdownThread extends Thread { + ShutdownThread() { + super("AsyncKuduClient@" + AsyncKuduClient.super.hashCode() + " shutdown"); + } + public void run() { + // This terminates the Executor. + channelFactory.releaseExternalResources(); + } + } + + // 3. Release all other resources. + final class ReleaseResourcesCB implements Callback, ArrayList> { + public ArrayList call(final ArrayList arg) { + LOG.debug("Releasing all remaining resources"); + timer.stop(); + new ShutdownThread().start(); + return arg; + } + public String toString() { + return "release resources callback"; + } + } + + // 2. Terminate all connections. + final class DisconnectCB implements Callback>, + ArrayList>> { + public Deferred> call(final ArrayList> arg) { + return disconnectEverything().addCallback(new ReleaseResourcesCB()); + } + public String toString() { + return "disconnect callback"; + } + } + // 1. Flush everything. 
+ return closeAllSessions().addBothDeferring(new DisconnectCB()); + } + + private void checkIsClosed() { + if (closed) { + throw new IllegalStateException("Cannot proceed, the client has already been closed"); + } + } + + private Deferred>> closeAllSessions() { + // We create a copy because AsyncKuduSession.close will call removeSession which would get us a + // concurrent modification during the iteration. + Set copyOfSessions; + synchronized (sessions) { + copyOfSessions = new HashSet(sessions); + } + if (sessions.isEmpty()) { + return Deferred.fromResult(null); + } + // Guaranteed that we'll have at least one session to close. + List>> deferreds = new ArrayList<>(copyOfSessions.size()); + for (AsyncKuduSession session : copyOfSessions ) { + deferreds.add(session.close()); + } + + return Deferred.group(deferreds); + } + + /** + * Closes every socket, which will also cancel all the RPCs in flight. + */ + private Deferred> disconnectEverything() { + ArrayList> deferreds = + new ArrayList>(2); + HashMap ip2client_copy; + synchronized (ip2client) { + // Make a local copy so we can shutdown every Tablet Server clients + // without hold the lock while we iterate over the data structure. + ip2client_copy = new HashMap(ip2client); + } + + for (TabletClient ts : ip2client_copy.values()) { + deferreds.add(ts.shutdown()); + } + final int size = deferreds.size(); + return Deferred.group(deferreds).addCallback( + new Callback, ArrayList>() { + public ArrayList call(final ArrayList arg) { + // Normally, now that we've shutdown() every client, all our caches should + // be empty since each shutdown() generates a DISCONNECTED event, which + // causes TabletClientPipeline to call removeClientFromCache(). 
+ HashMap logme = null; + synchronized (ip2client) { + if (!ip2client.isEmpty()) { + logme = new HashMap(ip2client); + } + } + if (logme != null) { + // Putting this logging statement inside the synchronized block + // can lead to a deadlock, since HashMap.toString() is going to + // call TabletClient.toString() on each entry, and this locks the + // client briefly. Other parts of the code lock clients first and + // the ip2client HashMap second, so this can easily deadlock. + LOG.error("Some clients are left in the client cache and haven't" + + " been cleaned up: " + logme); + } + return arg; + } + + public String toString() { + return "wait " + size + " TabletClient.shutdown()"; + } + }); + } + + /** + * Blocking call. + * Performs a slow search of the IP used by the given client. + *
     * <p>
     * This is needed when we're trying to find the IP of the client before its
     * channel has successfully connected, because Netty's API offers no way of
     * retrieving the IP of the remote peer until we're connected to it.
     * @param client The client we want the IP of.
     * @return The IP of the client, or {@code null} if we couldn't find it.
     */
    private InetSocketAddress slowSearchClientIP(final TabletClient client) {
      String hostport = null;
      // Linear scan: ip2client is keyed by "host:port", not by client, so a
      // reverse lookup has to walk the whole map.
      synchronized (ip2client) {
        for (final Map.Entry<String, TabletClient> e : ip2client.entrySet()) {
          if (e.getValue() == client) {
            hostport = e.getKey();
            break;
          }
        }
      }

      if (hostport == null) {
        HashMap<String, TabletClient> copy;
        synchronized (ip2client) {
          copy = new HashMap<String, TabletClient>(ip2client);
        }
        LOG.error("WTF? Should never happen! Couldn't find " + client
            + " in " + copy);
        return null;
      }
      // Start the search at index 1 so a leading ':' can never be the separator.
      final int colon = hostport.indexOf(':', 1);
      if (colon < 1) {
        LOG.error("WTF? Should never happen! No `:' found in " + hostport);
        return null;
      }
      final String host = getIP(hostport.substring(0, colon));
      if (host == null) {
        // getIP will print the reason why, there's nothing else we can do.
        return null;
      }

      int port;
      try {
        port = parsePortNumber(hostport.substring(colon + 1,
            hostport.length()));
      } catch (NumberFormatException e) {
        LOG.error("WTF? Should never happen! Bad port in " + hostport, e);
        return null;
      }
      return new InetSocketAddress(host, port);
    }

    /**
     * Removes all the cache entries referred to the given client.
     * @param client The client for which we must invalidate everything.
     * @param remote The address of the remote peer, if known, or null.
     */
    private void removeClientFromCache(final TabletClient client,
                                       final SocketAddress remote) {

      if (remote == null) {
        return;  // Can't continue without knowing the remote address.
      }

      String hostport;
      if (remote instanceof InetSocketAddress) {
        final InetSocketAddress sock = (InetSocketAddress) remote;
        final InetAddress addr = sock.getAddress();
        if (addr == null) {
          LOG.error("WTF? Unresolved IP for " + remote +
              ". This shouldn't happen.");
          return;
        } else {
          hostport = addr.getHostAddress() + ':' + sock.getPort();
        }
      } else {
        LOG.error("WTF? Found a non-InetSocketAddress remote: " + remote +
            ". This shouldn't happen.");
        return;
      }

      TabletClient old;
      synchronized (ip2client) {
        old = ip2client.remove(hostport);
      }
      LOG.debug("Removed from IP cache: {" + hostport + "} -> {" + client + "}");
      if (old == null) {
        // Currently we're seeing this message when masters are disconnected and the hostport we
        // got above is different than the one the user passes (that we use to populate ip2client).
        // At worst this doubles the entries for masters, which has an insignificant impact.
        // TODO When fixed, make this a WARN again.
        LOG.trace("When expiring " + client + " from the client cache (host:port=" +
            hostport + "), it was found that there was no entry" +
            " corresponding to " + remote + ". This shouldn't happen.");
      }

      ArrayList<RemoteTablet> tablets = client2tablets.remove(client);
      if (tablets != null) {
        // Make a copy so we don't need to synchronize on it while iterating.
        RemoteTablet[] tablets_copy;
        synchronized (tablets) {
          tablets_copy = tablets.toArray(new RemoteTablet[tablets.size()]);
          tablets = null;
          // If any other thread still has a reference to `tablets', their
          // updates will be lost (and we don't care).
        }
        for (final RemoteTablet remoteTablet : tablets_copy) {
          remoteTablet.removeTabletServer(client);
        }
      }
    }

    /**
     * Returns whether the given table ID designates the fake "master" table.
     * @param tableId table identifier to test
     * @return true only for the placeholder master table instance
     */
    private boolean isMasterTable(String tableId) {
      // Checking that it's the same instance so there's absolutely no chance of confusing the
      // master 'table' for a user one.
      return MASTER_TABLE_NAME_PLACEHOLDER == tableId;
    }

    private final class TabletClientPipeline extends DefaultChannelPipeline {

      private final Logger log = LoggerFactory.getLogger(TabletClientPipeline.class);
      /**
       * Have we already disconnected?
       * We use this to avoid doing the cleanup work for the same client more
       * than once, even if we get multiple events indicating that the client
       * is no longer connected to the TabletServer (e.g. DISCONNECTED, CLOSED).
       * No synchronization needed as this is always accessed from only one
       * thread at a time (equivalent to a non-shared state in a Netty handler).
       */
      private boolean disconnected = false;

      /**
       * Creates the TabletClient handler for this pipeline and installs it,
       * preceded by a read-timeout handler when a socket read timeout is set.
       * @param uuid identifier of the remote peer, used for logging
       * @return the newly created client handler
       */
      TabletClient init(String uuid) {
        final TabletClient client = new TabletClient(AsyncKuduClient.this, uuid);
        if (defaultSocketReadTimeoutMs > 0) {
          super.addLast("timeout-handler",
              new ReadTimeoutHandler(timer,
                  defaultSocketReadTimeoutMs,
                  TimeUnit.MILLISECONDS));
        }
        super.addLast("kudu-handler", client);

        return client;
      }

      @Override
      public void sendDownstream(final ChannelEvent event) {
        if (event instanceof ChannelStateEvent) {
          handleDisconnect((ChannelStateEvent) event);
        }
        super.sendDownstream(event);
      }

      @Override
      public void sendUpstream(final ChannelEvent event) {
        if (event instanceof ChannelStateEvent) {
          handleDisconnect((ChannelStateEvent) event);
        }
        super.sendUpstream(event);
      }

      // Detects CLOSED / DISCONNECTED channel-state transitions and evicts the
      // client from the caches exactly once.
      private void handleDisconnect(final ChannelStateEvent state_event) {
        if (disconnected) {
          return;
        }
        switch (state_event.getState()) {
          case OPEN:
            if (state_event.getValue() == Boolean.FALSE) {
              break;  // CLOSED
            }
            return;
          case CONNECTED:
            if (state_event.getValue() == null) {
              break;  // DISCONNECTED
            }
            return;
          default:
            return;  // Not an event we're interested in, ignore it.
        }

        disconnected = true;  // So we don't clean up the same client twice.
+ try { + final TabletClient client = super.get(TabletClient.class); + SocketAddress remote = super.getChannel().getRemoteAddress(); + // At this point Netty gives us no easy way to access the + // SocketAddress of the peer we tried to connect to. This + // kinda sucks but I couldn't find an easier way. + if (remote == null) { + remote = slowSearchClientIP(client); + } + + // Prevent the client from buffering requests while we invalidate + // everything we have about it. + synchronized (client) { + removeClientFromCache(client, remote); + } + } catch (Exception e) { + log.error("Uncaught exception when handling a disconnection of " + getChannel(), e); + } + } + + } + + /** + * Gets a hostname or an IP address and returns the textual representation + * of the IP address. + *

+ * This method can block as there is no API for + * asynchronous DNS resolution in the JDK. + * @param host The hostname to resolve. + * @return The IP address associated with the given hostname, + * or {@code null} if the address couldn't be resolved. + */ + private static String getIP(final String host) { + final long start = System.nanoTime(); + try { + final String ip = InetAddress.getByName(host).getHostAddress(); + final long latency = System.nanoTime() - start; + if (latency > 500000/*ns*/ && LOG.isDebugEnabled()) { + LOG.debug("Resolved IP of `" + host + "' to " + + ip + " in " + latency + "ns"); + } else if (latency >= 3000000/*ns*/) { + LOG.warn("Slow DNS lookup! Resolved IP of `" + host + "' to " + + ip + " in " + latency + "ns"); + } + return ip; + } catch (UnknownHostException e) { + LOG.error("Failed to resolve the IP of `" + host + "' in " + + (System.nanoTime() - start) + "ns"); + return null; + } + } + + /** + * Parses a TCP port number from a string. + * @param portnum The string to parse. + * @return A strictly positive, validated port number. + * @throws NumberFormatException if the string couldn't be parsed as an + * integer or if the value was outside of the range allowed for TCP ports. + */ + private static int parsePortNumber(final String portnum) + throws NumberFormatException { + final int port = Integer.parseInt(portnum); + if (port <= 0 || port > 65535) { + throw new NumberFormatException(port == 0 ? "port is zero" : + (port < 0 ? "port is negative: " + : "port is too large: ") + port); + } + return port; + } + + void newTimeout(final TimerTask task, final long timeout_ms) { + try { + timer.newTimeout(task, timeout_ms, MILLISECONDS); + } catch (IllegalStateException e) { + // This can happen if the timer fires just before shutdown() + // is called from another thread, and due to how threads get + // scheduled we tried to call newTimeout() after timer.stop(). + LOG.warn("Failed to schedule timer." 
+ + " Ignore this if we're shutting down.", e); + } + } + + /** + * This class encapsulates the information regarding a tablet and its locations. + * + * Leader failover mechanism: + * When we get a complete peer list from the master, we place the leader in the first + * position of the tabletServers array. When we detect that it isn't the leader anymore (in + * TabletClient), we demote it and set the next TS in the array as the leader. When the RPC + * gets retried, it will use that TS since we always pick the leader. + * + * If that TS turns out to not be the leader, we will demote it and promote the next one, retry. + * When we hit the end of the list, we set the leaderIndex to NO_LEADER_INDEX which forces us + * to fetch the tablet locations from the master. We'll repeat this whole process until a RPC + * succeeds. + * + * Subtleties: + * We don't keep track of a TS after it disconnects (via removeTabletServer), so if we + * haven't contacted one for 10 seconds (socket timeout), it will be removed from the list of + * tabletServers. This means that if the leader fails, we only have one other TS to "promote" + * or maybe none at all. This is partly why we then set leaderIndex to NO_LEADER_INDEX. + * + * The effect of treating a TS as the new leader means that the Scanner will also try to hit it + * with requests. It's currently unclear if that's a good or a bad thing. + * + * Unlike the C++ client, we don't short-circuit the call to the master if it isn't available. + * This means that after trying all the peers to find the leader, we might get stuck waiting on + * a reachable master. 
+ */ + public class RemoteTablet implements Comparable { + + private static final int NO_LEADER_INDEX = -1; + private final String tableId; + private final Slice tabletId; + private final ArrayList tabletServers = new ArrayList(); + private final Partition partition; + private int leaderIndex = NO_LEADER_INDEX; + + RemoteTablet(String tableId, Slice tabletId, Partition partition) { + this.tabletId = tabletId; + this.tableId = tableId; + this.partition = partition; + } + + void refreshServers(Master.TabletLocationsPB tabletLocations) throws NonRecoverableException { + + synchronized (tabletServers) { // TODO not a fat lock with IP resolving in it + tabletServers.clear(); + leaderIndex = NO_LEADER_INDEX; + List lookupExceptions = + new ArrayList<>(tabletLocations.getReplicasCount()); + for (Master.TabletLocationsPB.ReplicaPB replica : tabletLocations.getReplicasList()) { + + List addresses = replica.getTsInfo().getRpcAddressesList(); + if (addresses.isEmpty()) { + LOG.warn("Tablet server for tablet " + getTabletIdAsString() + " doesn't have any " + + "address"); + continue; + } + byte[] buf = Bytes.get(replica.getTsInfo().getPermanentUuid()); + String uuid = Bytes.getString(buf); + // from meta_cache.cc + // TODO: if the TS advertises multiple host/ports, pick the right one + // based on some kind of policy. For now just use the first always. + try { + addTabletClient(uuid, addresses.get(0).getHost(), addresses.get(0).getPort(), + replica.getRole().equals(Metadata.RaftPeerPB.Role.LEADER)); + } catch (UnknownHostException ex) { + lookupExceptions.add(ex); + } + } + leaderIndex = 0; + if (leaderIndex == NO_LEADER_INDEX) { + LOG.warn("No leader provided for tablet " + getTabletIdAsString()); + } + + // If we found a tablet that doesn't contain a single location that we can resolve, there's + // no point in retrying. 
+ if (!lookupExceptions.isEmpty() && + lookupExceptions.size() == tabletLocations.getReplicasCount()) { + throw new NonRecoverableException("Couldn't find any valid locations, exceptions: " + + lookupExceptions); + } + } + } + + // Must be called with tabletServers synchronized + void addTabletClient(String uuid, String host, int port, boolean isLeader) + throws UnknownHostException { + String ip = getIP(host); + if (ip == null) { + throw new UnknownHostException("Failed to resolve the IP of `" + host + "'"); + } + TabletClient client = newClient(uuid, ip, port); + + final ArrayList tablets = client2tablets.get(client); + + if (tablets == null) { + // We raced with removeClientFromCache and lost. The client we got was just disconnected. + // Reconnect. + addTabletClient(uuid, host, port, isLeader); + } else { + synchronized (tablets) { + if (isLeader) { + tabletServers.add(0, client); + } else { + tabletServers.add(client); + } + tablets.add(this); + } + } + } + + @Override + public String toString() { + return getTabletIdAsString(); + } + + /** + * Removes the passed TabletClient from this tablet's list of tablet servers. If it was the + * leader, then we "promote" the next one unless it was the last one in the list. + * @param ts A TabletClient that was disconnected. + * @return True if this method removed ts from the list, else false. + */ + boolean removeTabletServer(TabletClient ts) { + synchronized (tabletServers) { + // TODO unit test for this once we have the infra + int index = tabletServers.indexOf(ts); + if (index == -1) { + return false; // we removed it already + } + + tabletServers.remove(index); + if (leaderIndex == index && leaderIndex == tabletServers.size()) { + leaderIndex = NO_LEADER_INDEX; + } else if (leaderIndex > index) { + leaderIndex--; // leader moved down the list + } + + return true; + // TODO if we reach 0 TS, maybe we should remove ourselves? 
+ } + } + + /** + * If the passed TabletClient is the current leader, then the next one in the list will be + * "promoted" unless we're at the end of the list, in which case we set the leaderIndex to + * NO_LEADER_INDEX which will force a call to the master. + * @param ts A TabletClient that gave a sign that it isn't this tablet's leader. + */ + void demoteLeader(TabletClient ts) { + synchronized (tabletServers) { + int index = tabletServers.indexOf(ts); + // If this TS was removed or we're already forcing a call to the master (meaning someone + // else beat us to it), then we just noop. + if (index == -1 || leaderIndex == NO_LEADER_INDEX) { + return; + } + + if (leaderIndex == index) { + if (leaderIndex + 1 == tabletServers.size()) { + leaderIndex = NO_LEADER_INDEX; + } else { + leaderIndex++; + } + } + } + } + + public String getTableId() { + return tableId; + } + + Slice getTabletId() { + return tabletId; + } + + public Partition getPartition() { + return partition; + } + + byte[] getTabletIdAsBytes() { + return tabletId.getBytes(); + } + + String getTabletIdAsString() { + return tabletId.toString(Charset.defaultCharset()); + } + + List getAddressesFromPb(Master.TabletLocationsPB tabletLocations) { + List addresses = new ArrayList(tabletLocations + .getReplicasCount()); + for (Master.TabletLocationsPB.ReplicaPB replica : tabletLocations.getReplicasList()) { + addresses.add(replica.getTsInfo().getRpcAddresses(0)); + } + return addresses; + } + + @Override + public int compareTo(RemoteTablet remoteTablet) { + if (remoteTablet == null) { + return 1; + } + + return ComparisonChain.start() + .compare(this.tableId, remoteTablet.tableId) + .compare(this.partition, remoteTablet.partition).result(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + RemoteTablet that = (RemoteTablet) o; + + return this.compareTo(that) == 0; + } + + @Override + public int hashCode() { + 
return Objects.hashCode(tableId, partition); + } + } + + /** + * Builder class to use in order to connect to Kudu. + * All the parameters beyond those in the constructors are optional. + */ + public final static class AsyncKuduClientBuilder { + private static final int DEFAULT_MASTER_PORT = 7051; + private static final int DEFAULT_BOSS_COUNT = 1; + private static final int DEFAULT_WORKER_COUNT = 2 * Runtime.getRuntime().availableProcessors(); + + private final List masterAddresses; + private long defaultAdminOperationTimeoutMs = DEFAULT_OPERATION_TIMEOUT_MS; + private long defaultOperationTimeoutMs = DEFAULT_OPERATION_TIMEOUT_MS; + private long defaultSocketReadTimeoutMs = DEFAULT_SOCKET_READ_TIMEOUT_MS; + + private Executor bossExecutor; + private Executor workerExecutor; + private int bossCount = DEFAULT_BOSS_COUNT; + private int workerCount = DEFAULT_WORKER_COUNT; + + /** + * Creates a new builder for a client that will connect to the specified masters. + * @param masterAddresses comma-separated list of "host:port" pairs of the masters + */ + public AsyncKuduClientBuilder(String masterAddresses) { + this.masterAddresses = + NetUtil.parseStrings(masterAddresses, DEFAULT_MASTER_PORT); + } + + /** + * Creates a new builder for a client that will connect to the specified masters. + * + *

       * <p>Here are some examples of recognized formats:
       * <ul>
       *   <li>example.com
       *   <li>example.com:80
       *   <li>192.0.2.1
       *   <li>192.0.2.1:80
       *   <li>[2001:db8::1]
       *   <li>[2001:db8::1]:80
       *   <li>2001:db8::1
       * </ul>
       *
       * @param masterAddresses list of master addresses
       */
      public AsyncKuduClientBuilder(List<String> masterAddresses) {
        this.masterAddresses =
            Lists.newArrayListWithCapacity(masterAddresses.size());
        for (String address : masterAddresses) {
          this.masterAddresses.add(
              NetUtil.parseString(address, DEFAULT_MASTER_PORT));
        }
      }

      /**
       * Sets the default timeout used for administrative operations (e.g. createTable,
       * deleteTable, etc).
       * Optional.
       * If not provided, defaults to 10s.
       * A value of 0 disables the timeout.
       * @param timeoutMs a timeout in milliseconds
       * @return this builder
       */
      public AsyncKuduClientBuilder defaultAdminOperationTimeoutMs(long timeoutMs) {
        this.defaultAdminOperationTimeoutMs = timeoutMs;
        return this;
      }

      /**
       * Sets the default timeout used for user operations (using sessions and scanners).
       * Optional.
       * If not provided, defaults to 10s.
       * A value of 0 disables the timeout.
       * @param timeoutMs a timeout in milliseconds
       * @return this builder
       */
      public AsyncKuduClientBuilder defaultOperationTimeoutMs(long timeoutMs) {
        this.defaultOperationTimeoutMs = timeoutMs;
        return this;
      }

      /**
       * Sets the default timeout to use when waiting on data from a socket.
       * Optional.
       * If not provided, defaults to 5s.
       * A value of 0 disables the timeout.
       * @param timeoutMs a timeout in milliseconds
       * @return this builder
       */
      public AsyncKuduClientBuilder defaultSocketReadTimeoutMs(long timeoutMs) {
        this.defaultSocketReadTimeoutMs = timeoutMs;
        return this;
      }

      /**
       * Set the executors which will be used for the embedded Netty boss and workers.
       * Optional.
       * If not provided, uses a simple cached threadpool. If either argument is null,
       * then such a thread pool will be used in place of that argument.
       * Note: executor's max thread number must be greater or equal to corresponding
       * worker count, or netty cannot start enough threads, and client will get stuck.
       * If not sure, please just use CachedThreadPool.
       * @param bossExecutor executor for Netty boss threads, or null for the default pool
       * @param workerExecutor executor for Netty worker threads, or null for the default pool
       * @return this builder
       */
      public AsyncKuduClientBuilder nioExecutors(Executor bossExecutor, Executor workerExecutor) {
        this.bossExecutor = bossExecutor;
        this.workerExecutor = workerExecutor;
        return this;
      }

      /**
       * Set the maximum number of boss threads.
       * Optional.
       * If not provided, 1 is used.
       * @param bossCount a strictly positive thread count
       * @return this builder
       */
      public AsyncKuduClientBuilder bossCount(int bossCount) {
        Preconditions.checkArgument(bossCount > 0, "bossCount should be greater than 0");
        this.bossCount = bossCount;
        return this;
      }

      /**
       * Set the maximum number of worker threads.
       * Optional.
       * If not provided, (2 * the number of available processors) is used.
       * @param workerCount a strictly positive thread count
       * @return this builder
       */
      public AsyncKuduClientBuilder workerCount(int workerCount) {
        Preconditions.checkArgument(workerCount > 0, "workerCount should be greater than 0");
        this.workerCount = workerCount;
        return this;
      }

      /**
       * Creates the channel factory for Netty. The user can specify the executors, but
       * if they don't, we'll use a simple thread pool.
       */
      private NioClientSocketChannelFactory createChannelFactory() {
        Executor boss = bossExecutor;
        Executor worker = workerExecutor;
        if (boss == null || worker == null) {
          // A single daemonized cached pool backs whichever roles weren't supplied.
          Executor defaultExec = Executors.newCachedThreadPool(
              new ThreadFactoryBuilder()
                  .setNameFormat("kudu-nio-%d")
                  .setDaemon(true)
                  .build());
          if (boss == null) boss = defaultExec;
          if (worker == null) worker = defaultExec;
        }
        return new NioClientSocketChannelFactory(boss, worker, bossCount, workerCount);
      }

      /**
       * Creates a new client that connects to the masters.
       * Doesn't block and won't throw an exception if the masters don't exist.
+ * @return a new asynchronous Kudu client + */ + public AsyncKuduClient build() { + return new AsyncKuduClient(this); + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduScanner.java b/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduScanner.java new file mode 100644 index 000000000000..ecb916ac9bd0 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduScanner.java @@ -0,0 +1,799 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import java.util.ArrayList; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.List; + +import com.google.protobuf.Message; +import com.google.protobuf.ZeroCopyLiteralByteString; +import org.kududb.ColumnSchema; +import org.kududb.Common; +import org.kududb.Schema; +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.tserver.Tserver; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.google.common.base.Preconditions.checkArgument; +import static org.kududb.tserver.Tserver.*; + +/** + * Creates a scanner to read data from Kudu. + *

+ * This class is not synchronized as it's expected to be + * used from a single thread at a time. It's rarely (if ever?) useful to + * scan concurrently from a shared scanner using multiple threads. If you + * want to optimize large table scans using extra parallelism, create a few + * scanners and give each of them a partition of the table to scan. Or use + * MapReduce. + *

+ * There's no method in this class to explicitly open the scanner. It will open + * itself automatically when you start scanning by calling {@link #nextRows()}. + * Also, the scanner will automatically call {@link #close} when it reaches the + * end key. If, however, you would like to stop scanning before reaching the + * end key, you must call {@link #close} before disposing of the scanner. + * Note that it's always safe to call {@link #close} on a scanner. + *

+ * A {@code AsyncKuduScanner} is not re-usable. Should you want to scan the same rows + * or the same table again, you must create a new one. + * + *

 * <h1>A note on passing {@code byte} arrays in argument</h1>
 * None of the methods that receive a {@code byte[]} in argument will copy it.
 * For more info, please refer to the documentation of {@link KuduRpc}.
 * <h1>A note on passing {@code String}s in argument</h1>

+ * All strings are assumed to use the platform's default charset. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class AsyncKuduScanner { + + private static final Logger LOG = LoggerFactory.getLogger(AsyncKuduScanner.class); + + /** + * The possible read modes for scanners. + */ + @InterfaceAudience.Public + @InterfaceStability.Evolving + public enum ReadMode { + /** + * When READ_LATEST is specified the server will always return committed writes at + * the time the request was received. This type of read does not return a snapshot + * timestamp and is not repeatable. + * + * In ACID terms this corresponds to Isolation mode: "Read Committed" + * + * This is the default mode. + */ + READ_LATEST(Common.ReadMode.READ_LATEST), + + /** + * When READ_AT_SNAPSHOT is specified the server will attempt to perform a read + * at the provided timestamp. If no timestamp is provided the server will take the + * current time as the snapshot timestamp. In this mode reads are repeatable, i.e. + * all future reads at the same timestamp will yield the same data. This is + * performed at the expense of waiting for in-flight transactions whose timestamp + * is lower than the snapshot's timestamp to complete, so it might incur a latency + * penalty. + * + * In ACID terms this, by itself, corresponds to Isolation mode "Repeatable + * Read". If all writes to the scanned tablet are made externally consistent, + * then this corresponds to Isolation mode "Strict-Serializable". + * + * Note: there currently "holes", which happen in rare edge conditions, by which writes + * are sometimes not externally consistent even when action was taken to make them so. + * In these cases Isolation may degenerate to mode "Read Committed". See KUDU-430. 
+ */ + READ_AT_SNAPSHOT(Common.ReadMode.READ_AT_SNAPSHOT); + + private Common.ReadMode pbVersion; + private ReadMode(Common.ReadMode pbVersion) { + this.pbVersion = pbVersion; + } + + @InterfaceAudience.Private + public Common.ReadMode pbVersion() { + return this.pbVersion; + } + } + + ////////////////////////// + // Initial configurations. + ////////////////////////// + + private final AsyncKuduClient client; + private final KuduTable table; + private final Schema schema; + private final List columnRangePredicates; + + /** + * Maximum number of bytes returned by the scanner, on each batch. + */ + private final int batchSizeBytes; + + /** + * The maximum number of rows to scan. + */ + private final long limit; + + /** + * The start partition key of the next tablet to scan. + * + * Each time the scan exhausts a tablet, this is updated to that tablet's end partition key. + */ + private byte[] nextPartitionKey; + + /** + * The end partition key of the last tablet to scan. + */ + private final byte[] endPartitionKey; + + /** + * Set in the builder. If it's not set by the user, it will default to EMPTY_ARRAY. + * It is then reset to the new start primary key of each tablet we open a scanner on as the scan + * moves from one tablet to the next. + */ + private final byte[] startPrimaryKey; + + /** + * Set in the builder. If it's not set by the user, it will default to EMPTY_ARRAY. + * It's never modified after that. + */ + private final byte[] endPrimaryKey; + + private final boolean prefetching; + + private final boolean cacheBlocks; + + private final ReadMode readMode; + + private final long htTimestamp; + + ///////////////////// + // Runtime variables. + ///////////////////// + + private boolean closed = false; + + private boolean hasMore = true; + + /** + * The tabletSlice currently being scanned. + * If null, we haven't started scanning. + * If == DONE, then we're done scanning. + * Otherwise it contains a proper tabletSlice name, and we're currently scanning. 
+ */ + private AsyncKuduClient.RemoteTablet tablet; + + /** + * This is the scanner ID we got from the TabletServer. + * It's generated randomly so any value is possible. + */ + private byte[] scannerId; + + /** + * The sequence ID of this call. The sequence ID should start at 0 + * with the request for a new scanner, and after each successful request, + * the client should increment it by 1. When retrying a request, the client + * should _not_ increment this value. If the server detects that the client + * missed a chunk of rows from the middle of a scan, it will respond with an + * error. + */ + private int sequenceId; + + private Deferred prefetcherDeferred; + + private boolean inFirstTablet = true; + + final long scanRequestTimeout; + + private static final AtomicBoolean PARTITION_PRUNE_WARN = new AtomicBoolean(true); + + AsyncKuduScanner(AsyncKuduClient client, KuduTable table, List projectedNames, + List projectedIndexes, ReadMode readMode, long scanRequestTimeout, + List columnRangePredicates, long limit, + boolean cacheBlocks, boolean prefetching, + byte[] startPrimaryKey, byte[] endPrimaryKey, + byte[] startPartitionKey, byte[] endPartitionKey, + long htTimestamp, int batchSizeBytes) { + checkArgument(batchSizeBytes > 0, "Need a strictly positive number of bytes, " + + "got %s", batchSizeBytes); + checkArgument(limit > 0, "Need a strictly positive number for the limit, " + + "got %s", limit); + if (htTimestamp != AsyncKuduClient.NO_TIMESTAMP) { + checkArgument(htTimestamp >= 0, "Need non-negative number for the scan, " + + " timestamp got %s", htTimestamp); + checkArgument(readMode == ReadMode.READ_AT_SNAPSHOT, "When specifying a " + + "HybridClock timestamp, the read mode needs to be set to READ_AT_SNAPSHOT"); + } + + this.client = client; + this.table = table; + this.readMode = readMode; + this.scanRequestTimeout = scanRequestTimeout; + this.columnRangePredicates = columnRangePredicates; + this.limit = limit; + this.cacheBlocks = cacheBlocks; + 
this.prefetching = prefetching; + this.startPrimaryKey = startPrimaryKey; + this.endPrimaryKey = endPrimaryKey; + this.htTimestamp = htTimestamp; + this.batchSizeBytes = batchSizeBytes; + + if (!table.getPartitionSchema().isSimpleRangePartitioning() && + (startPrimaryKey != AsyncKuduClient.EMPTY_ARRAY || + endPrimaryKey != AsyncKuduClient.EMPTY_ARRAY) && + PARTITION_PRUNE_WARN.getAndSet(false)) { + LOG.warn("Starting full table scan. " + + "In the future this scan may be automatically optimized with partition pruning."); + } + + if (table.getPartitionSchema().isSimpleRangePartitioning()) { + // If the table is simple range partitioned, then the partition key space + // is isomorphic to the primary key space. We can potentially reduce the + // scan length by only scanning the intersection of the primary key range + // and the partition key range. This is a stop-gap until real partition + // pruning is in place that can work across any partitioning type. + + if ((endPartitionKey.length != 0 && Bytes.memcmp(startPrimaryKey, endPartitionKey) >= 0) || + (endPrimaryKey.length != 0 && Bytes.memcmp(startPartitionKey, endPrimaryKey) >= 0)) { + // The primary key range and the partition key range do not intersect; + // the scan will be empty. + this.nextPartitionKey = startPartitionKey; + this.endPartitionKey = endPartitionKey; + } else { + // Assign the scan's partition key range to the intersection of the + // primary key and partition key ranges. + if (Bytes.memcmp(startPartitionKey, startPrimaryKey) < 0) { + this.nextPartitionKey = startPrimaryKey; + } else { + this.nextPartitionKey = startPartitionKey; + } + if (endPrimaryKey.length != 0 && Bytes.memcmp(endPartitionKey, endPrimaryKey) > 0) { + this.endPartitionKey = endPrimaryKey; + } else { + this.endPartitionKey = endPartitionKey; + } + } + } else { + this.nextPartitionKey = startPartitionKey; + this.endPartitionKey = endPartitionKey; + } + + // Map the column names to actual columns in the table schema. 
+ // If the user set this to 'null', we scan all columns. + if (projectedNames != null) { + List columns = new ArrayList(); + for (String columnName : projectedNames) { + ColumnSchema columnSchema = table.getSchema().getColumn(columnName); + if (columnSchema == null) { + throw new IllegalArgumentException("Unknown column " + columnName); + } + columns.add(columnSchema); + } + this.schema = new Schema(columns); + } else if (projectedIndexes != null) { + List columns = new ArrayList(); + for (Integer columnIndex : projectedIndexes) { + ColumnSchema columnSchema = table.getSchema().getColumnByIndex(columnIndex); + if (columnSchema == null) { + throw new IllegalArgumentException("Unknown column index " + columnIndex); + } + columns.add(columnSchema); + } + this.schema = new Schema(columns); + } else { + this.schema = table.getSchema(); + } + } + + /** + * Returns the maximum number of rows that this scanner was configured to return. + * @return a long representing the maximum number of rows that can be returned + */ + public long getLimit() { + return this.limit; + } + + /** + * Tells if the last rpc returned that there might be more rows to scan. + * @return true if there might be more data to scan, else false + */ + public boolean hasMoreRows() { + return this.hasMore; + } + + /** + * Returns if this scanner was configured to cache data blocks or not. + * @return true if this scanner will cache blocks, else else. + */ + public boolean getCacheBlocks() { + return this.cacheBlocks; + } + + /** + * Returns the maximum number of bytes returned by the scanner, on each batch. + * @return a long representing the maximum number of bytes that a scanner can receive at once + * from a tablet server + */ + public long getBatchSizeBytes() { + return this.batchSizeBytes; + } + + /** + * Returns the ReadMode for this scanner. 
+ * @return the configured read mode for this scanner + */ + public ReadMode getReadMode() { + return this.readMode; + } + + /** + * Returns the projection schema of this scanner. If specific columns were + * not specified during scanner creation, the table schema is returned. + * @return the projection schema for this scanner + */ + public Schema getProjectionSchema() { + return this.schema; + } + + long getSnapshotTimestamp() { + return this.htTimestamp; + } + + /** + * Scans a number of rows. + *

+ * Once this method returns {@code null} once (which indicates that this + * {@code Scanner} is done scanning), calling it again leads to an undefined + * behavior. + * @return a deferred list of rows. + */ + public Deferred nextRows() { + if (closed) { // We're already done scanning. + return Deferred.fromResult(null); + } else if (tablet == null) { + + // We need to open the scanner first. + return client.openScanner(this).addCallbackDeferring( + new Callback, AsyncKuduScanner.Response>() { + public Deferred call(final AsyncKuduScanner.Response resp) { + if (!resp.more || resp.scanner_id == null) { + scanFinished(); + return Deferred.fromResult(resp.data); // there might be data to return + } + scannerId = resp.scanner_id; + sequenceId++; + hasMore = resp.more; + if (LOG.isDebugEnabled()) { + LOG.debug("Scanner " + Bytes.pretty(scannerId) + " opened on " + tablet); + } + //LOG.info("Scan.open is returning rows: " + resp.data.getNumRows()); + return Deferred.fromResult(resp.data); + } + public String toString() { + return "scanner opened"; + } + }); + } else if (prefetching && prefetcherDeferred != null) { + // TODO KUDU-1260 - Check if this works and add a test + prefetcherDeferred.chain(new Deferred().addCallback(prefetch)); + return prefetcherDeferred; + } + final Deferred d = + client.scanNextRows(this).addCallbacks(got_next_row, nextRowErrback()); + if (prefetching) { + d.chain(new Deferred().addCallback(prefetch)); + } + return d; + } + + private final Callback prefetch = + new Callback() { + @Override + public RowResultIterator call(RowResultIterator arg) throws Exception { + if (hasMoreRows()) { + prefetcherDeferred = client.scanNextRows(AsyncKuduScanner.this).addCallbacks + (got_next_row, nextRowErrback()); + } + return null; + } + }; + + /** + * Singleton callback to handle responses of "next" RPCs. + * This returns an {@code ArrayList>} (possibly inside a + * deferred one). 
+ */ + private final Callback got_next_row = + new Callback() { + public RowResultIterator call(final Response resp) { + if (!resp.more) { // We're done scanning this tablet. + scanFinished(); + return resp.data; + } + sequenceId++; + hasMore = resp.more; + //LOG.info("Scan.next is returning rows: " + resp.data.getNumRows()); + return resp.data; + } + public String toString() { + return "get nextRows response"; + } + }; + + /** + * Creates a new errback to handle errors while trying to get more rows. + */ + private final Callback nextRowErrback() { + return new Callback() { + public Exception call(final Exception error) { + final AsyncKuduClient.RemoteTablet old_tablet = tablet; // Save before invalidate(). + String message = old_tablet + " pretends to not know " + AsyncKuduScanner.this; + LOG.warn(message, error); + invalidate(); // If there was an error, don't assume we're still OK. + return error; // Let the error propagate. + } + public String toString() { + return "NextRow errback"; + } + }; + } + + void scanFinished() { + Partition partition = tablet.getPartition(); + // Stop scanning if we have scanned until or past the end partition key. + if (partition.isEndPartition() + || (this.endPartitionKey != AsyncKuduClient.EMPTY_ARRAY + && Bytes.memcmp(this.endPartitionKey, partition.getPartitionKeyEnd()) <= 0)) { + hasMore = false; + closed = true; // the scanner is closed on the other side at this point + return; + } + if (LOG.isDebugEnabled()) { + LOG.debug("Done scanning tablet {} for partition {} with scanner id {}", + tablet.getTabletIdAsString(), tablet.getPartition(), Bytes.pretty(scannerId)); + } + nextPartitionKey = partition.getPartitionKeyEnd(); + scannerId = null; + invalidate(); + } + + /** + * Closes this scanner (don't forget to call this when you're done with it!). + *

+ * Closing a scanner already closed has no effect. The deferred returned + * will be called back immediately. + * @return A deferred object that indicates the completion of the request. + * The {@link Object} can be null, a RowResultIterator if there was data left + * in the scanner, or an Exception. + */ + public Deferred close() { + if (closed) { + return Deferred.fromResult(null); + } + final Deferred d = + client.closeScanner(this).addCallback(closedCallback()); // TODO errBack ? + return d; + } + + /** Callback+Errback invoked when the TabletServer closed our scanner. */ + private Callback closedCallback() { + return new Callback() { + public RowResultIterator call(Response response) { + closed = true; + if (LOG.isDebugEnabled()) { + LOG.debug("Scanner " + Bytes.pretty(scannerId) + " closed on " + + tablet); + } + tablet = null; + scannerId = "client debug closed".getBytes(); // Make debugging easier. + return response == null ? null : response.data; + } + public String toString() { + return "scanner closed"; + } + }; + } + + public String toString() { + final String tablet = this.tablet == null ? "null" : this.tablet.getTabletIdAsString(); + final StringBuilder buf = new StringBuilder(); + buf.append("KuduScanner(table="); + buf.append(table.getName()); + buf.append(", tablet=").append(tablet); + buf.append(", scannerId=").append(Bytes.pretty(scannerId)); + buf.append(", scanRequestTimeout=").append(scanRequestTimeout); + buf.append(')'); + return buf.toString(); + } + + // ---------------------- // + // Package private stuff. // + // ---------------------- // + + KuduTable table() { + return table; + } + + /** + * Sets the name of the tabletSlice that's hosting {@code this.start_key}. + * @param tablet The tabletSlice we're currently supposed to be scanning. + */ + void setTablet(final AsyncKuduClient.RemoteTablet tablet) { + this.tablet = tablet; + } + + /** + * Invalidates this scanner and makes it assume it's no longer opened. 
+ * When a TabletServer goes away while we're scanning it, or some other type + * of access problem happens, this method should be called so that the + * scanner will have to re-locate the TabletServer and re-open itself. + */ + void invalidate() { + tablet = null; + } + + /** + * Returns the tabletSlice currently being scanned, if any. + */ + AsyncKuduClient.RemoteTablet currentTablet() { + return tablet; + } + + /** + * Returns an RPC to open this scanner. + */ + KuduRpc getOpenRequest() { + checkScanningNotStarted(); + // This is the only point where we know we haven't started scanning and where the scanner + // should be fully configured + if (this.inFirstTablet) { + this.inFirstTablet = false; + } + return new ScanRequest(table, State.OPENING); + } + + /** + * Returns an RPC to fetch the next rows. + */ + KuduRpc getNextRowsRequest() { + return new ScanRequest(table, State.NEXT); + } + + /** + * Returns an RPC to close this scanner. + */ + KuduRpc getCloseRequest() { + return new ScanRequest(table, State.CLOSING); + } + + /** + * Throws an exception if scanning already started. + * @throws IllegalStateException if scanning already started. + */ + private void checkScanningNotStarted() { + if (tablet != null) { + throw new IllegalStateException("scanning already started"); + } + } + + /** + * Helper object that contains all the info sent by a TS afer a Scan request + */ + static final class Response { + /** The ID associated with the scanner that issued the request. */ + private final byte[] scanner_id; + /** The actual payload of the response. */ + private final RowResultIterator data; + + /** + * If false, the filter we use decided there was no more data to scan. + * In this case, the server has automatically closed the scanner for us, + * so we don't need to explicitly close it. 
+ */ + private final boolean more; + + Response(final byte[] scanner_id, + final RowResultIterator data, + final boolean more) { + this.scanner_id = scanner_id; + this.data = data; + this.more = more; + } + + public String toString() { + return "AsyncKuduScanner$Response(scannerId=" + Bytes.pretty(scanner_id) + + ", data=" + data + ", more=" + more + ") "; + } + } + + private enum State { + OPENING, + NEXT, + CLOSING + } + + /** + * RPC sent out to fetch the next rows from the TabletServer. + */ + private final class ScanRequest extends KuduRpc implements KuduRpc.HasKey { + + State state; + + ScanRequest(KuduTable table, State state) { + super(table); + this.state = state; + this.setTimeoutMillis(scanRequestTimeout); + } + + @Override + String serviceName() { return TABLET_SERVER_SERVICE_NAME; } + + @Override + String method() { + return "Scan"; + } + + /** Serializes this request. */ + ChannelBuffer serialize(Message header) { + final ScanRequestPB.Builder builder = ScanRequestPB.newBuilder(); + switch (state) { + case OPENING: + // Save the tablet in the AsyncKuduScanner. This kind of a kludge but it really + // is the easiest way. 
+ AsyncKuduScanner.this.tablet = super.getTablet(); + NewScanRequestPB.Builder newBuilder = NewScanRequestPB.newBuilder(); + newBuilder.setLimit(limit); // currently ignored + newBuilder.addAllProjectedColumns(ProtobufHelper.schemaToListPb(schema)); + newBuilder.setTabletId(ZeroCopyLiteralByteString.wrap(tablet.getTabletIdAsBytes())); + newBuilder.setReadMode(AsyncKuduScanner.this.getReadMode().pbVersion()); + newBuilder.setCacheBlocks(cacheBlocks); + // if the last propagated timestamp is set send it with the scan + if (table.getAsyncClient().getLastPropagatedTimestamp() != AsyncKuduClient.NO_TIMESTAMP) { + newBuilder.setPropagatedTimestamp(table.getAsyncClient().getLastPropagatedTimestamp()); + } + newBuilder.setReadMode(AsyncKuduScanner.this.getReadMode().pbVersion()); + + // if the mode is set to read on snapshot sent the snapshot timestamp + if (AsyncKuduScanner.this.getReadMode() == ReadMode.READ_AT_SNAPSHOT && + AsyncKuduScanner.this.getSnapshotTimestamp() != AsyncKuduClient.NO_TIMESTAMP) { + newBuilder.setSnapTimestamp(AsyncKuduScanner.this.getSnapshotTimestamp()); + } + + if (AsyncKuduScanner.this.startPrimaryKey != AsyncKuduClient.EMPTY_ARRAY && + AsyncKuduScanner.this.startPrimaryKey.length > 0) { + newBuilder.setStartPrimaryKey(ZeroCopyLiteralByteString.copyFrom(startPrimaryKey)); + } + + if (AsyncKuduScanner.this.endPrimaryKey != AsyncKuduClient.EMPTY_ARRAY && + AsyncKuduScanner.this.endPrimaryKey.length > 0) { + newBuilder.setStopPrimaryKey(ZeroCopyLiteralByteString.copyFrom(endPrimaryKey)); + } + + if (!columnRangePredicates.isEmpty()) { + newBuilder.addAllRangePredicates(columnRangePredicates); + } + builder.setNewScanRequest(newBuilder.build()) + .setBatchSizeBytes(batchSizeBytes); + break; + case NEXT: + builder.setScannerId(ZeroCopyLiteralByteString.wrap(scannerId)) + .setCallSeqId(sequenceId) + .setBatchSizeBytes(batchSizeBytes); + break; + case CLOSING: + builder.setScannerId(ZeroCopyLiteralByteString.wrap(scannerId)) + .setBatchSizeBytes(0) + 
.setCloseScanner(true); + } + + ScanRequestPB request = builder.build(); + if (LOG.isDebugEnabled()) { + LOG.debug("Sending scan req: " + request.toString()); + } + + return toChannelBuffer(header, request); + } + + @Override + Pair deserialize(final CallResponse callResponse, + String tsUUID) throws Exception { + ScanResponsePB.Builder builder = ScanResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), builder); + ScanResponsePB resp = builder.build(); + final byte[] id = resp.getScannerId().toByteArray(); + TabletServerErrorPB error = resp.hasError() ? resp.getError() : null; + if (error != null && error.getCode().equals(TabletServerErrorPB.Code.TABLET_NOT_FOUND)) { + if (state == State.OPENING) { + // Doing this will trigger finding the new location. + return new Pair(null, error); + } else { + throw new NonRecoverableException("Cannot continue scanning, " + + "the tablet has moved and this isn't a fault tolerant scan"); + } + } + RowResultIterator iterator = new RowResultIterator( + deadlineTracker.getElapsedMillis(), tsUUID, schema, resp.getData(), + callResponse); + + boolean hasMore = resp.getHasMoreResults(); + if (id.length != 0 && scannerId != null && !Bytes.equals(scannerId, id)) { + throw new InvalidResponseException("Scan RPC response was for scanner" + + " ID " + Bytes.pretty(id) + " but we expected " + + Bytes.pretty(scannerId), resp); + } + Response response = new Response(id, iterator, hasMore); + if (LOG.isDebugEnabled()) { + LOG.debug(response.toString()); + } + return new Pair(response, error); + } + + public String toString() { + return "ScanRequest(scannerId=" + Bytes.pretty(scannerId) + + (tablet != null? ", tabletSlice=" + tablet.getTabletIdAsString() : "") + + ", attempt=" + attempt + ')'; + } + + @Override + public byte[] partitionKey() { + // This key is used to lookup where the request needs to go + return nextPartitionKey; + } + } + + /** + * A Builder class to build {@link AsyncKuduScanner}. 
+ * Use {@link AsyncKuduClient#newScannerBuilder} in order to get a builder instance. + */ + public static class AsyncKuduScannerBuilder + extends AbstractKuduScannerBuilder { + + AsyncKuduScannerBuilder(AsyncKuduClient client, KuduTable table) { + super(client, table); + } + + /** + * Builds an {@link AsyncKuduScanner} using the passed configurations. + * @return a new {@link AsyncKuduScanner} + */ + public AsyncKuduScanner build() { + return new AsyncKuduScanner( + client, table, projectedColumnNames, projectedColumnIndexes, readMode, + scanRequestTimeout, columnRangePredicates, limit, cacheBlocks, + prefetching, lowerBoundPrimaryKey, upperBoundPrimaryKey, + lowerBoundPartitionKey, upperBoundPartitionKey, + htTimestamp, batchSizeBytes); + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduSession.java b/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduSession.java new file mode 100644 index 000000000000..e5fdb4260337 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/AsyncKuduSession.java @@ -0,0 +1,816 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Range; +import com.google.common.collect.Ranges; +import com.google.common.collect.Sets; +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.master.Master; +import org.kududb.util.Slice; +import org.jboss.netty.util.Timeout; +import org.jboss.netty.util.TimerTask; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.concurrent.GuardedBy; +import java.util.*; + +import static org.kududb.client.ExternalConsistencyMode.CLIENT_PROPAGATED; + +/** + * A AsyncKuduSession belongs to a specific AsyncKuduClient, and represents a context in + * which all read/write data access should take place. Within a session, + * multiple operations may be accumulated and batched together for better + * efficiency. Settings like timeouts, priorities, and trace IDs are also set + * per session.

+ * + * AsyncKuduSession is separate from KuduTable because a given batch or transaction + * may span multiple tables. This is particularly important in the future when + * we add ACID support, but even in the context of batching, we may be able to + * coalesce writes to different tables hosted on the same server into the same + * RPC.

+ * + * AsyncKuduSession is separate from AsyncKuduClient because, in a multi-threaded + * application, different threads may need to concurrently execute + * transactions. Similar to a JDBC "session", transaction boundaries will be + * delineated on a per-session basis -- in between a "BeginTransaction" and + * "Commit" call on a given session, all operations will be part of the same + * transaction. Meanwhile another concurrent Session object can safely run + * non-transactional work or other transactions without interfering.

+ * + * Therefore, this class is not thread-safe.

+ * + * Additionally, there is a guarantee that writes from different sessions do not + * get batched together into the same RPCs -- this means that latency-sensitive + * clients can run through the same AsyncKuduClient object as throughput-oriented + * clients, perhaps by setting the latency-sensitive session's timeouts low and + * priorities high. Without the separation of batches, a latency-sensitive + * single-row insert might get batched along with 10MB worth of inserts from the + * batch writer, thus delaying the response significantly.

+ * + * Though we currently do not have transactional support, users will be forced + * to use a AsyncKuduSession to instantiate reads as well as writes. This will make + * it more straight-forward to add RW transactions in the future without + * significant modifications to the API.

+ * + * Timeouts are handled differently depending on the flush mode. + * With AUTO_FLUSH_SYNC, the timeout is set on each apply()'d operation. + * With AUTO_FLUSH_BACKGROUND and MANUAL_FLUSH, the timeout is assigned to a whole batch of + * operations upon flush()'ing. It means that in a situation with a timeout of 500ms and a flush + * interval of 1000ms, an operation can be oustanding for up to 1500ms before being timed out. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class AsyncKuduSession implements SessionConfiguration { + + public static final Logger LOG = LoggerFactory.getLogger(AsyncKuduSession.class); + private static final Range PERCENTAGE_RANGE = Ranges.closed(0.0f, 1.0f); + + private final AsyncKuduClient client; + private final Random randomizer = new Random(); + private final ErrorCollector errorCollector; + private int interval = 1000; + private int mutationBufferSpace = 1000; // TODO express this in terms of data size. + private float mutationBufferLowWatermarkPercentage = 0.5f; + private int mutationBufferLowWatermark; + private FlushMode flushMode; + private ExternalConsistencyMode consistencyMode; + private long timeoutMs; + + // We assign a number to each operation that we batch, so that the batch can sort itself before + // being sent to the server. We never reset this number. + private long nextSequenceNumber = 0; + + /** + * The following two maps are to be handled together when batching is enabled. The first one is + * where the batching happens, the second one is where we keep track of what's been sent but + * hasn't come back yet. A batch cannot be in both maps at the same time. A batch cannot be + * added to the in flight map if there's already another batch for the same tablet. If this + * happens, and the batch in the first map is full, then we fail fast and send it back to the + * client. 
+ * The second map stores Deferreds because KuduRpc.callback clears out the Deferred it contains + * (as a way to reset the RPC), so we want to store the Deferred that's with the RPC that's + * sent out. + */ + @GuardedBy("this") + private final Map operations = new HashMap<>(); + + @GuardedBy("this") + private final Map> operationsInFlight = new HashMap<>(); + + /** + * This Set is used when not in AUTO_FLUSH_SYNC mode in order to keep track of the operations + * that are looking up their tablet, meaning that they aren't in any of the maps above. This is + * not expected to grow a lot except when a client starts and only for a short amount of time. + */ + @GuardedBy("this") + private final Set operationsInLookup = Sets.newIdentityHashSet(); + // Only populated when we're waiting to flush and there are operations in lookup + private Deferred lookupsDone; + + /** + * Tracks whether the session has been closed. + */ + volatile boolean closed; + + private boolean ignoreAllDuplicateRows = false; + + /** + * Package-private constructor meant to be used via AsyncKuduClient + * @param client client that creates this session + */ + AsyncKuduSession(AsyncKuduClient client) { + this.client = client; + this.flushMode = FlushMode.AUTO_FLUSH_SYNC; + this.consistencyMode = CLIENT_PROPAGATED; + this.timeoutMs = client.getDefaultOperationTimeoutMs(); + setMutationBufferLowWatermark(this.mutationBufferLowWatermarkPercentage); + errorCollector = new ErrorCollector(mutationBufferSpace); + } + + @Override + public FlushMode getFlushMode() { + return this.flushMode; + } + + @Override + public void setFlushMode(FlushMode flushMode) { + if (hasPendingOperations()) { + throw new IllegalArgumentException("Cannot change flush mode when writes are buffered"); + } + this.flushMode = flushMode; + } + + @Override + public void setExternalConsistencyMode(ExternalConsistencyMode consistencyMode) { + if (hasPendingOperations()) { + throw new IllegalArgumentException("Cannot change consistency mode 
" + + "when writes are buffered"); + } + this.consistencyMode = consistencyMode; + } + + @Override + public void setMutationBufferSpace(int size) { + if (hasPendingOperations()) { + throw new IllegalArgumentException("Cannot change the buffer" + + " size when operations are buffered"); + } + this.mutationBufferSpace = size; + // Reset the low watermark, using the same percentage as before. + setMutationBufferLowWatermark(mutationBufferLowWatermarkPercentage); + } + + @Override + public void setMutationBufferLowWatermark(float mutationBufferLowWatermarkPercentage) { + if (hasPendingOperations()) { + throw new IllegalArgumentException("Cannot change the buffer" + + " low watermark when operations are buffered"); + } else if (!PERCENTAGE_RANGE.contains(mutationBufferLowWatermarkPercentage)) { + throw new IllegalArgumentException("The low watermark must be between 0 and 1 inclusively"); + } + this.mutationBufferLowWatermarkPercentage = mutationBufferLowWatermarkPercentage; + this.mutationBufferLowWatermark = + (int)(this.mutationBufferLowWatermarkPercentage * mutationBufferSpace); + } + + /** + * Lets us set a specific seed for tests + * @param seed + */ + @VisibleForTesting + void setRandomSeed(long seed) { + this.randomizer.setSeed(seed); + } + + @Override + public void setFlushInterval(int interval) { + this.interval = interval; + } + + @Override + public void setTimeoutMillis(long timeout) { + this.timeoutMs = timeout; + } + + @Override + public long getTimeoutMillis() { + return this.timeoutMs; + } + + @Override + public boolean isClosed() { + return closed; + } + + @Override + public boolean isIgnoreAllDuplicateRows() { + return ignoreAllDuplicateRows; + } + + @Override + public void setIgnoreAllDuplicateRows(boolean ignoreAllDuplicateRows) { + this.ignoreAllDuplicateRows = ignoreAllDuplicateRows; + } + + @Override + public int countPendingErrors() { + return errorCollector.countErrors(); + } + + @Override + public RowErrorsAndOverflowStatus getPendingErrors() { 
+ return errorCollector.getErrors(); + } + + /** + * Flushes the buffered operations and marks this sessions as closed. + * See the javadoc on {@link #flush()} on how to deal with exceptions coming out of this method. + * @return a Deferred whose callback chain will be invoked when. + * everything that was buffered at the time of the call has been flushed. + */ + public Deferred> close() { + closed = true; + client.removeSession(this); + return flush(); + } + + /** + * Flushes the buffered operations. + * @return a Deferred whose callback chain will be invoked when + * everything that was buffered at the time of the call has been flushed. + */ + public Deferred> flush() { + LOG.trace("Flushing all tablets"); + synchronized (this) { + if (!operationsInLookup.isEmpty()) { + lookupsDone = new Deferred<>(); + return lookupsDone + .addCallbackDeferring(new OperationsInLookupDoneCB()) + .addCallbackDeferring(new ConvertBatchToListOfResponsesCB()); + } + } + return flushAllBatches().addCallbackDeferring(new ConvertBatchToListOfResponsesCB()); + } + + class OperationsInLookupDoneCB implements + Callback>, Void> { + @Override + public Deferred> + call(Void nothing) throws Exception { + return flushAllBatches(); + } + } + + /** + * Deferring callback used to send a list of OperationResponse instead of BatchResponse since the + * latter is an implementation detail. + */ + class ConvertBatchToListOfResponsesCB implements + Callback>, ArrayList> { + @Override + public Deferred> call(ArrayList batchResponsesList) + throws Exception { + Deferred> deferred = new Deferred<>(); + if (batchResponsesList == null) { + deferred.callback(null); + return deferred; + } + + // flushTablet() can return null when a tablet we wanted to flush was already flushed. Those + // nulls along with BatchResponses are then put in a list by Deferred.group(). We first need + // to filter the nulls out. 
+ batchResponsesList.removeAll(Collections.singleton(null)); + if (batchResponsesList.isEmpty()) { + deferred.callback(null); + return deferred; + } + + // First compute the size of the union of all the lists so that we don't trigger expensive + // list growths while adding responses to it. + int size = 0; + for (BatchResponse batchResponse : batchResponsesList) { + size += batchResponse.getIndividualResponses().size(); + } + + ArrayList responsesList = new ArrayList<>(size); + for (BatchResponse batchResponse : batchResponsesList) { + responsesList.addAll(batchResponse.getIndividualResponses()); + } + deferred.callback(responsesList); + return deferred; + } + } + + /** + * This will flush all the batches but not the operations that are currently in lookup. + */ + private Deferred> flushAllBatches() { + HashMap copyOfOps; + final ArrayList> d = new ArrayList<>(operations.size()); + synchronized (this) { + copyOfOps = new HashMap<>(operations); + } + for (Map.Entry entry: copyOfOps.entrySet()) { + d.add(flushTablet(entry.getKey(), entry.getValue())); + } + return Deferred.group(d); + } + + @Override + public boolean hasPendingOperations() { + synchronized (this) { + return !this.operations.isEmpty() || !this.operationsInFlight.isEmpty() || + !this.operationsInLookup.isEmpty(); + } + } + + /** + * Apply the given operation. + * The behavior of this function depends on the current flush mode. Regardless + * of flush mode, however, Apply may begin to perform processing in the background + * for the call (e.g looking up the tablet, etc). 
+ * @param operation operation to apply + * @return a Deferred to track this operation + */ + public Deferred apply(final Operation operation) { + if (operation == null) { + throw new NullPointerException("Cannot apply a null operation"); + } + + if (AsyncKuduClient.cannotRetryRequest(operation)) { + return AsyncKuduClient.tooManyAttemptsOrTimeout(operation, null); + } + + // This can be called multiple times but it's fine, we don't allow "thawing". + operation.getRow().freeze(); + + // If we autoflush, just send it to the TS + if (flushMode == FlushMode.AUTO_FLUSH_SYNC) { + if (timeoutMs != 0) { + operation.setTimeoutMillis(timeoutMs); + } + operation.setExternalConsistencyMode(this.consistencyMode); + return client.sendRpcToTablet(operation); + } + + // We need this protection because apply() can be called multiple times for the same operations + // due to retries, but we only want to set the sequence number once. Since a session isn't + // thread-safe, it means that we'll always set the sequence number from the user's thread, and + // we'll read later from other threads. 
+ if (operation.getSequenceNumber() == -1) { + operation.setSequenceNumber(nextSequenceNumber++); + } + + String tableId = operation.getTable().getTableId(); + byte[] partitionKey = operation.partitionKey(); + AsyncKuduClient.RemoteTablet tablet = client.getTablet(tableId, partitionKey); + // We go straight to the buffer if we know the tabletSlice + if (tablet != null) { + operation.setTablet(tablet); + // Handles the difference between manual and auto flush + return addToBuffer(tablet.getTabletId(), operation); + } + + synchronized (this) { + operationsInLookup.add(operation); + } + // TODO starts looking a lot like sendRpcToTablet + operation.attempt++; + if (client.isTableNotServed(tableId)) { + Callback, Master.IsCreateTableDoneResponsePB> cb = + new TabletLookupCB<>(operation); + return client.delayedIsCreateTableDone(operation.getTable(), operation, + cb, getOpInLookupErrback(operation)); + } + + Deferred d = + client.locateTablet(operation.getTable(), partitionKey); + d.addErrback(getOpInLookupErrback(operation)); + return d.addCallbackDeferring( + new TabletLookupCB(operation)); + } + + /** + * This errback is different from the one in AsyncKuduClient because we need to be able to remove + * the operation from operationsInLookup if whatever master query we issue throws an Exception. + * @param operation Operation to errback to. + * @return An errback. + */ + Callback getOpInLookupErrback(final Operation operation) { + return new Callback() { + @Override + public Exception call(Exception e) throws Exception { + // TODO maybe we can retry it? 
+ synchronized (this) { + operationsInLookup.remove(operation); + } + operation.errback(e); + return e; + } + }; + } + + final class TabletLookupCB implements Callback, D> { + final Operation operation; + TabletLookupCB(Operation operation) { + this.operation = operation; + } + public Deferred call(final D arg) { + return handleOperationInLookup(operation); + } + public String toString() { + return "retry RPC after lookup"; + } + } + + // This method takes an Object since we use it for both callback and errback. + // The actual type doesn't matter, we just want to be called back in order to retry. + Callback, Object> + getRetryOpInLookupCB(final Operation operation) { + final class RetryOpInFlightCB implements Callback, Object> { + public Deferred call(final Object arg) { + return handleOperationInLookup(operation); + } + + public String toString() { + return "retry RPC after PleaseThrottleException"; + } + } + return new RetryOpInFlightCB(); + } + + private Deferred handleOperationInLookup(Operation operation) { + try { + return apply(operation); // Retry the RPC. + } catch (PleaseThrottleException pte) { + return pte.getDeferred().addBothDeferring(getRetryOpInLookupCB(operation)); + } + } + + /** + * For manual and background flushing, this will batch the given operation + * with the others, if any, for the specified tablet. + * @param tablet tablet used to for batching + * @param operation operation to batch + * @return Defered to track the operation + */ + private Deferred addToBuffer(Slice tablet, Operation operation) { + boolean scheduleFlush = false; + boolean batchIsFull = false; + Batch batch; + + // First check if we need to flush the current batch. 
+ synchronized (this) { + batch = operations.get(tablet); + if (batch != null && batch.ops.size() + 1 > mutationBufferSpace) { + if (flushMode == FlushMode.MANUAL_FLUSH) { + throw new NonRecoverableException("MANUAL_FLUSH is enabled but the buffer is too big"); + } + if (operationsInFlight.containsKey(tablet)) { + // There's is already another batch in flight for this tablet. + // We cannot continue here, we have to send this back to the client. + // This is our high watermark. + throw new PleaseThrottleException("The RPC cannot be buffered because the current " + + "buffer is full and the previous buffer hasn't been flushed yet", null, + operation, operationsInFlight.get(tablet)); + } + batchIsFull = true; + } + } + + // We're doing this out of the synchronized block because flushTablet can take some time + // encoding all the data. + if (batchIsFull) { + flushTablet(tablet, batch); + } + + Deferred lookupsDoneCopy = null; + synchronized (this) { + // We need to get the batch again since we went out of the synchronized block. We can get a + // new one, the same one, or null. + batch = operations.get(tablet); + + if (mutationBufferLowWatermark < mutationBufferSpace && // look if it's enabled + batch != null && // and if we have a batch + operationsInFlight.containsKey(tablet) && // and if there's another batch outstanding + batch.ops.size() + 1 > mutationBufferLowWatermark) { // and if we'll be over the mark + + // This is our low watermark, we throw PleaseThrottleException before hitting the high + // mark. As we get fuller past the watermark it becomes likelier to trigger it. 
+ int randomWatermark = batch.ops.size() + 1 + randomizer.nextInt(mutationBufferSpace - + mutationBufferLowWatermark); + if (randomWatermark > mutationBufferSpace) { + throw new PleaseThrottleException("The previous buffer hasn't been flushed and the " + + "current one is over the low watermark, please retry later", null, operation, + operationsInFlight.get(tablet)); + } + } + if (batch == null) { + // We found a tablet that needs batching, this is the only place where + // we schedule a flush. + batch = new Batch(operation.getTable(), ignoreAllDuplicateRows); + batch.setExternalConsistencyMode(this.consistencyMode); + Batch oldBatch = operations.put(tablet, batch); + assert (oldBatch == null); + addBatchCallbacks(batch); + scheduleFlush = true; + } + batch.ops.add(operation); + if (!operationsInLookup.isEmpty()) { + + boolean operationWasLookingUpTablet = operationsInLookup.remove(operation); + if (operationWasLookingUpTablet) { + // We know that the operation we just added was in the 'operationsInLookup' list so we're + // very likely adding it out of order from a different thread. + // We'll need to sort the whole list later. + batch.needsSorting = true; + } + + if (lookupsDone != null && operationsInLookup.isEmpty()) { + lookupsDoneCopy = lookupsDone; + lookupsDone = null; + } + } + if (flushMode == FlushMode.AUTO_FLUSH_BACKGROUND && scheduleFlush) { + // Accumulated a first insert but we're not in manual mode, + // schedule the flush. + LOG.trace("Scheduling a flush"); + scheduleNextPeriodicFlush(tablet, batch); + } + } + + // We do this outside of the synchronized block because we might end up calling flushTablet. + if (lookupsDoneCopy != null) { + lookupsDoneCopy.callback(null); + } + + // Get here if we accumulated an insert, regardless of if it scheduled + // a flush. + return operation.getDeferred(); + } + + /** + * Creates callbacks to handle a multi-put and adds them to the request. 
+ * @param request the request for which we must handle the response + */ + private void addBatchCallbacks(final Batch request) { + final class BatchCallback implements + Callback { + public BatchResponse call(final BatchResponse response) { + LOG.trace("Got a Batch response for " + request.ops.size() + " rows"); + if (response.getWriteTimestamp() != 0) { + AsyncKuduSession.this.client.updateLastPropagatedTimestamp(response.getWriteTimestamp()); + } + + // Send individualized responses to all the operations in this batch. + for (OperationResponse operationResponse : response.getIndividualResponses()) { + operationResponse.getOperation().callback(operationResponse); + if (flushMode == FlushMode.AUTO_FLUSH_BACKGROUND && operationResponse.hasRowError()) { + errorCollector.addError(operationResponse.getRowError()); + } + } + + return response; + } + + @Override + public String toString() { + return "apply batch response"; + } + } + + final class BatchErrCallback implements Callback { + @Override + public Exception call(Exception e) throws Exception { + // Send the same exception to all the operations. + for (int i = 0; i < request.ops.size(); i++) { + request.ops.get(i).errback(e); + } + return e; + } + + @Override + public String toString() { + return "apply batch error response"; + } + } + + request.getDeferred().addCallbacks(new BatchCallback(), new BatchErrCallback()); + } + + /** + * Schedules the next periodic flush of buffered edits. + */ + private void scheduleNextPeriodicFlush(Slice tablet, Batch batch) { + client.newTimeout(new FlusherTask(tablet, batch), interval); + } + + /** + * Flushes the edits for the given tablet. It will also check that the Batch we're flushing is + * the one that was requested. This is mostly done so that the FlusherTask doesn't trigger + * lots of small flushes under a write-heavy scenario where we're able to fill a Batch multiple + * times per interval. 
+ * + * Also, if there's already a Batch in flight for the given tablet, + * the flush will be delayed and the returned Deferred will be chained to it. + * + * This method should not be called within a synchronized block because we can spend a lot of + * time encoding the batch. + */ + private Deferred flushTablet(Slice tablet, Batch expectedBatch) { + assert (expectedBatch != null); + assert (!Thread.holdsLock(this)); + Batch batch; + synchronized (this) { + // Check this first, no need to wait after anyone if the batch we were supposed to flush + // was already flushed. + if (operations.get(tablet) != expectedBatch) { + LOG.trace("Had to flush a tablet but it was already flushed: " + Bytes.getString(tablet)); + // It is OK to return null here, since we currently do not use the returned value + // when doing background flush or auto flushing when buffer is full. + // The returned value is used when doing manual flush, but it will not run into this + // condition, or there is a bug. + return Deferred.fromResult(null); + } + + if (operationsInFlight.containsKey(tablet)) { + if (LOG.isTraceEnabled()) { + LOG.trace("Tablet " + Bytes.getString(tablet) + + " is already in flight, attaching a callback to retry " + + expectedBatch.toDebugString() + " later."); + } + // No matter previous batch get error or not, we still have to flush this batch. + FlushRetryCallback retryCallback = new FlushRetryCallback(tablet, operations.get(tablet)); + FlushRetryErrback retryErrback = new FlushRetryErrback(tablet, operations.get(tablet)); + // Note that if we do manual flushing multiple times when previous batch is still inflight, + // we may add the same callback multiple times, later retry of flushTablet will return null + // immediately. Since it is an illegal use case, we do not handle this currently. 
+ operationsInFlight.get(tablet).addCallbacks(retryCallback, retryErrback); + return expectedBatch.getDeferred(); + } + + batch = operations.remove(tablet); + if (batch == null) { + LOG.trace("Had to flush a tablet but there was nothing to flush: " + + Bytes.getString(tablet)); + return Deferred.fromResult(null); + } + Deferred batchDeferred = batch.getDeferred(); + batchDeferred.addCallbacks(getOpInFlightCallback(tablet), getOpInFlightErrback(tablet)); + Deferred oldBatch = operationsInFlight.put(tablet, batchDeferred); + assert (oldBatch == null); + if (timeoutMs != 0) { + batch.deadlineTracker.reset(); + batch.setTimeoutMillis(timeoutMs); + } + } + return client.sendRpcToTablet(batch); + } + + + /** + * Simple callback so that we try to flush this tablet again if we were waiting on the previous + * Batch to finish. + */ + class FlushRetryCallback implements Callback { + private final Slice tablet; + private final Batch expectedBatch; + public FlushRetryCallback(Slice tablet, Batch expectedBatch) { + this.tablet = tablet; + this.expectedBatch = expectedBatch; + } + + @Override + public BatchResponse call(BatchResponse o) throws Exception { + if (LOG.isTraceEnabled()) { + LOG.trace("Previous batch in flight is done. " + toString()); + } + flushTablet(tablet, expectedBatch); + return o; + } + + @Override + public String toString() { + return String.format("FlushRetryCallback: retry flush tablet %s %s", Bytes.getString(tablet), + expectedBatch.toDebugString()); + } + } + + /** + * Same callback as above FlushRetryCallback, for the case that previous batch has error. + */ + class FlushRetryErrback implements Callback { + private final Slice tablet; + private final Batch expectedBatch; + public FlushRetryErrback(Slice tablet, Batch expectedBatch) { + this.tablet = tablet; + this.expectedBatch = expectedBatch; + } + + @Override + public Exception call(Exception e) throws Exception { + if (LOG.isTraceEnabled()) { + LOG.trace("Previous batch ended with an error. 
" + toString()); + } + flushTablet(tablet, expectedBatch); + return e; + } + + @Override + public String toString() { + return String.format("FlushRetryErrback: retry flush tablet %s %s", Bytes.getString(tablet), + expectedBatch.toDebugString()); + } + } + + /** + * Simple callback that removes the tablet from the in flight operations map once it completed. + */ + private Callback + getOpInFlightCallback(final Slice tablet) { + return new Callback() { + @Override + public BatchResponse call(BatchResponse o) throws Exception { + tabletInFlightDone(tablet); + return o; + } + + @Override + public String toString() { + return "callback: mark tablet " + Bytes.getString(tablet) + " inflight done"; + } + }; + } + + /** + * We need a separate callback for errors since the generics are different. We still remove the + * tablet from the in flight operations since there's nothing we can do about it, + * and by returning the Exception we will bubble it up to the user. + */ + private Callback getOpInFlightErrback(final Slice tablet) { + return new Callback() { + @Override + public Exception call(Exception e) throws Exception { + tabletInFlightDone(tablet); + return e; + } + + @Override + public String toString() { + return "errback: mark tablet " + Bytes.getString(tablet) + " inflight done"; + } + }; + } + + private void tabletInFlightDone(Slice tablet) { + synchronized (AsyncKuduSession.this) { + LOG.trace("Unmarking this tablet as in flight: " + Bytes.getString(tablet)); + operationsInFlight.remove(tablet); + } + } + + /** + * A FlusherTask is created for each scheduled flush per tabletSlice. 
+ */ + class FlusherTask implements TimerTask { + final Slice tabletSlice; + final Batch expectedBatch; + + FlusherTask(Slice tabletSlice, Batch expectedBatch) { + this.tabletSlice = tabletSlice; + this.expectedBatch = expectedBatch; + } + + public void run(final Timeout timeout) { + if (isClosed()) { + return; // we ran too late, no-op + } + LOG.trace("Timed flushing: " + Bytes.getString(tabletSlice)); + flushTablet(this.tabletSlice, this.expectedBatch); + } + public String toString() { + return "flush commits of session " + AsyncKuduSession.this + + " for tabletSlice " + Bytes.getString(tabletSlice); + } + }; +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/Batch.java b/java/kudu-client/src/main/java/org/kududb/client/Batch.java new file mode 100644 index 000000000000..4ab82e6a6179 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Batch.java @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.common.annotations.VisibleForTesting; +import com.google.protobuf.Message; +import com.google.protobuf.ZeroCopyLiteralByteString; + +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.tserver.Tserver; +import org.kududb.tserver.Tserver.TabletServerErrorPB; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +/** + * Used internally to batch Operations together before sending to the cluster + */ +@InterfaceAudience.Private +class Batch extends KuduRpc implements KuduRpc.HasKey { + + private static final OperationsComparatorBySequenceNumber SEQUENCE_NUMBER_COMPARATOR = + new OperationsComparatorBySequenceNumber(); + + final List ops; + + // Operations can be added out of order to 'ops' if the tablet had to be looked up. We can detect + // this situation in AsyncKuduSession and set this to true. + boolean needsSorting = false; + + /** See {@link SessionConfiguration#setIgnoreAllDuplicateRows(boolean)} */ + final boolean ignoreAllDuplicateRows; + + Batch(KuduTable table, boolean ignoreAllDuplicateRows) { + this(table, ignoreAllDuplicateRows, 1000); + } + + Batch(KuduTable table, boolean ignoreAllDuplicateRows, int estimatedBatchSize) { + super(table); + this.ops = new ArrayList(estimatedBatchSize); + this.ignoreAllDuplicateRows = ignoreAllDuplicateRows; + } + + @Override + ChannelBuffer serialize(Message header) { + + // This should only happen if at least one operation triggered a tablet lookup, which is rare + // on a long-running client. 
+ if (needsSorting) { + Collections.sort(ops, SEQUENCE_NUMBER_COMPARATOR); + } + + final Tserver.WriteRequestPB.Builder builder = + Operation.createAndFillWriteRequestPB(ops.toArray(new Operation[ops.size()])); + builder.setTabletId(ZeroCopyLiteralByteString.wrap(getTablet().getTabletIdAsBytes())); + builder.setExternalConsistencyMode(this.externalConsistencyMode.pbVersion()); + return toChannelBuffer(header, builder.build()); + } + + @Override + String serviceName() { return TABLET_SERVER_SERVICE_NAME; } + + @Override + String method() { + return Operation.METHOD; + } + + @Override + Pair deserialize(final CallResponse callResponse, + String tsUUID) throws Exception { + Tserver.WriteResponsePB.Builder builder = Tserver.WriteResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), builder); + + List errorsPB = builder.getPerRowErrorsList(); + if (ignoreAllDuplicateRows) { + boolean allAlreadyPresent = true; + for (Tserver.WriteResponsePB.PerRowErrorPB errorPB : errorsPB) { + if (errorPB.getError().getCode() != WireProtocol.AppStatusPB.ErrorCode.ALREADY_PRESENT) { + allAlreadyPresent = false; + break; + } + } + if (allAlreadyPresent) { + errorsPB = Collections.emptyList(); + } + } + + BatchResponse response = new BatchResponse(deadlineTracker.getElapsedMillis(), tsUUID, + builder.getTimestamp(), errorsPB, ops); + + if (injectedError != null) { + if (injectedlatencyMs > 0) { + try { + Thread.sleep(injectedlatencyMs); + } catch (InterruptedException e) { + } + } + return new Pair(response, injectedError); + } + + return new Pair(response, builder.hasError() ? builder.getError() : null); + } + + @Override + public byte[] partitionKey() { + assert this.ops.size() > 0; + return this.ops.get(0).partitionKey(); + } + + public String toDebugString() { + return "Batch(" + ops.size() + " ops)@" + Integer.toHexString(hashCode()); + } + + /** + * Sorts the Operations by their sequence number. 
+ */ + private static class OperationsComparatorBySequenceNumber implements Comparator { + @Override + public int compare(Operation o1, Operation o2) { + return Long.compare(o1.getSequenceNumber(), o2.getSequenceNumber()); + } + } + + private static TabletServerErrorPB injectedError; + private static int injectedlatencyMs; + + /** + * Inject tablet server side error for Batch rpc related tests. + * @param error error response from tablet server + * @param latencyMs blocks response handling thread for some time to simulate + * write latency + */ + @VisibleForTesting + static void injectTabletServerErrorAndLatency(TabletServerErrorPB error, int latencyMs) { + injectedError = error; + injectedlatencyMs = latencyMs; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/BatchResponse.java b/java/kudu-client/src/main/java/org/kududb/client/BatchResponse.java new file mode 100644 index 000000000000..ee9cc66ef63a --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/BatchResponse.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.tserver.Tserver; + +/** + * Response type for Batch (which is used internally by AsyncKuduSession). + * Provides the Hybrid Time write timestamp returned by the Tablet Server. + */ +@InterfaceAudience.Private +public class BatchResponse extends KuduRpcResponse { + + private final long writeTimestamp; + private final List rowErrors; + private final List individualResponses; + + /** + * Package-private constructor to be used by the RPCs. + * @param elapsedMillis time in milliseconds since RPC creation to now + * @param writeTimestamp HT's write timestamp + * @param errorsPB a list of row errors, can be empty + * @param operations the list of operations which created this response + */ + BatchResponse(long elapsedMillis, String tsUUID, long writeTimestamp, + List errorsPB, + List operations) { + super(elapsedMillis, tsUUID); + this.writeTimestamp = writeTimestamp; + individualResponses = new ArrayList<>(operations.size()); + if (errorsPB.isEmpty()) { + rowErrors = Collections.emptyList(); + } else { + rowErrors = new ArrayList<>(errorsPB.size()); + } + + // Populate the list of individual row responses and the list of row errors. Not all the rows + // maybe have errors, but 'errorsPB' contains them in the same order as the operations that + // were sent. 
+ int currentErrorIndex = 0; + Operation currentOperation; + for (int i = 0; i < operations.size(); i++) { + RowError rowError = null; + currentOperation = operations.get(i); + if (currentErrorIndex < errorsPB.size() && + errorsPB.get(currentErrorIndex).getRowIndex() == i) { + rowError = RowError.fromRowErrorPb(errorsPB.get(currentErrorIndex), + currentOperation, tsUUID); + rowErrors.add(rowError); + currentErrorIndex++; + } + individualResponses.add( + new OperationResponse(currentOperation.deadlineTracker.getElapsedMillis(), tsUUID, + writeTimestamp, currentOperation, rowError)); + } + assert (rowErrors.size() == errorsPB.size()); + assert (individualResponses.size() == operations.size()); + } + + /** + * Gives the write timestamp that was returned by the Tablet Server. + * @return a timestamp in milliseconds, 0 if the external consistency mode set in AsyncKuduSession + * wasn't CLIENT_PROPAGATED + */ + public long getWriteTimestamp() { + return writeTimestamp; + } + + /** + * Package-private method to get the individual responses. + * @return a list of OperationResponses + */ + List getIndividualResponses() { + return individualResponses; + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/Bytes.java b/java/kudu-client/src/main/java/org/kududb/client/Bytes.java new file mode 100644 index 000000000000..e327b9b908f9 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Bytes.java @@ -0,0 +1,1148 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import com.google.protobuf.ByteString; +import com.google.protobuf.ZeroCopyLiteralByteString; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.util.Slice; +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.util.CharsetUtil; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +/** + * Helper functions to manipulate byte arrays. 
+ */ +@InterfaceAudience.Private +public final class Bytes { + + // Two's complement reference: 2^n . + // In this case, 2^64 (so as to emulate a unsigned long) + // from http://stackoverflow.com/questions/10886962/interpret-a-negative-number-as-unsigned-with- + // biginteger-java + private static final BigInteger TWO_COMPL_REF = BigInteger.ONE.shiftLeft(64); + + private Bytes() { // Can't instantiate. + } + + // -------------------------------- // + // Byte array conversion utilities. // + // -------------------------------- // + + /** + * Reads a boolean from the beginning of the given array. + * @param b The array to read from. + * @return A boolean + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static boolean getBoolean(final byte[] b) { + byte v = getByte(b, 0); + return v == 1; + } + + /** + * Reads a boolean from an offset in the given array. + * @param b The array to read from. + * @param offset The offset into the array. + * @return A boolean + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static boolean getBoolean(final byte[] b, final int offset) { + byte v = getByte(b, offset); + return v == 1; + } + + /** + * Reads a byte from the beginning of the given array. + * @param b The array to read from. + * @return A byte + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static byte getByte(final byte[] b) { + return getByte(b, 0); + } + + /** + * Reads a byte from an offset in the given array. + * @param b The array to read from. + * @return A byte + * @return + */ + public static byte getByte(final byte[] b, final int offset) { + return b[offset]; + } + + /** + * Reads an unsigned byte from the beginning of the given array. + * @param b The array to read from. + * @return A positive byte + */ + public static short getUnsignedByte(final byte[] b) { + return getUnsignedByte(b, 0); + } + + /** + * Reads an unsigned byte from an offset in the given array. 
+ * @param b The array to read from. + * @return A positive byte + */ + public static short getUnsignedByte(final byte[] b, final int offset) { + return (short) (b[offset] & 0x00FF); + } + + /** + * Writes an unsigned byte at the beginning of the given array. + * @param b The array to write to. + * @param n An unsigned byte. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedByte(final byte[] b, final short n) { + setUnsignedByte(b, n, 0); + } + + /** + * Writes an unsigned byte at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n An unsigned byte. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedByte(final byte[] b, final short n, + final int offset) { + b[offset] = (byte) n; + } + + /** + * Creates a new byte array containing an unsigned byte. + * @param n An unsigned byte. + * @return A new byte array containing the given value. + */ + public static byte[] fromUnsignedByte(final short n) { + final byte[] b = new byte[1]; + setUnsignedByte(b, n); + return b; + } + + /** + * Reads a little-endian 2-byte short from the beginning of the given array. + * @param b The array to read from. + * @return A short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static short getShort(final byte[] b) { + return getShort(b, 0); + } + + /** + * Reads a little-endian 2-byte short from an offset in the given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return A short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. 
+ */ + public static short getShort(final byte[] b, final int offset) { + return (short) (b[offset] & 0xFF | b[offset + 1] << 8 ); + } + + /** + * Reads a little-endian 2-byte unsigned short from the beginning of the + * given array. + * @param b The array to read from. + * @return A positive short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static int getUnsignedShort(final byte[] b) { + return getUnsignedShort(b, 0); + } + + /** + * Reads a little-endian 2-byte unsigned short from an offset in the + * given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return A positive short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static int getUnsignedShort(final byte[] b, final int offset) { + return getShort(b, offset) & 0x0000FFFF; + } + + /** + * Writes a little-endian 2-byte short at the beginning of the given array. + * @param b The array to write to. + * @param n A short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setShort(final byte[] b, final short n) { + setShort(b, n, 0); + } + + /** + * Writes a little-endian 2-byte short at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n A short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setShort(final byte[] b, final short n, + final int offset) { + b[offset + 0] = (byte) (n >>> 0); + b[offset + 1] = (byte) (n >>> 8); + } + + /** + * Writes a little-endian 2-byte unsigned short at the beginning of the given array. + * @param b The array to write to. + * @param n An unsigned short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. 
+ */ + public static void setUnsignedShort(final byte[] b, final int n) { + setUnsignedShort(b, n, 0); + } + + /** + * Writes a little-endian 2-byte unsigned short at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n An unsigned short integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedShort(final byte[] b, final int n, + final int offset) { + b[offset + 0] = (byte) (n >>> 0); + b[offset + 1] = (byte) (n >>> 8); + } + + /** + * Creates a new byte array containing a little-endian 2-byte short integer. + * @param n A short integer. + * @return A new byte array containing the given value. + */ + public static byte[] fromShort(final short n) { + final byte[] b = new byte[2]; + setShort(b, n); + return b; + } + + /** + * Creates a new byte array containing a little-endian 2-byte unsigned short integer. + * @param n An unsigned short integer. + * @return A new byte array containing the given value. + */ + public static byte[] fromUnsignedShort(final int n) { + final byte[] b = new byte[2]; + setUnsignedShort(b, n); + return b; + } + + /** + * Reads a little-endian 4-byte integer from the beginning of the given array. + * @param b The array to read from. + * @return An integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static int getInt(final byte[] b) { + return getInt(b, 0); + } + + /** + * Reads a little-endian 4-byte integer from an offset in the given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return An integer. + * @throws IndexOutOfBoundsException if the byte array is too small. 
+ */ + public static int getInt(final byte[] b, final int offset) { + return (b[offset + 0] & 0xFF) << 0 + | (b[offset + 1] & 0xFF) << 8 + | (b[offset + 2] & 0xFF) << 16 + | (b[offset + 3] & 0xFF) << 24; + } + + /** + * Reads a little-endian 4-byte unsigned integer from the beginning of the + * given array. + * @param b The array to read from. + * @return A positive integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static long getUnsignedInt(final byte[] b) { + return getUnsignedInt(b, 0); + } + + /** + * Reads a little-endian 4-byte unsigned integer from an offset in the + * given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return A positive integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static long getUnsignedInt(final byte[] b, final int offset) { + return getInt(b, offset) & 0x00000000FFFFFFFFL; + } + + /** + * Writes a little-endian 4-byte int at the beginning of the given array. + * @param b The array to write to. + * @param n An integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setInt(final byte[] b, final int n) { + setInt(b, n, 0); + } + + /** + * Writes a little-endian 4-byte int at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n An integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setInt(final byte[] b, final int n, final int offset) { + b[offset + 0] = (byte) (n >>> 0); + b[offset + 1] = (byte) (n >>> 8); + b[offset + 2] = (byte) (n >>> 16); + b[offset + 3] = (byte) (n >>> 24); + } + + /** + * Writes a little-endian 4-byte unsigned int at the beginning of the given array. + * @param b The array to write to. + * @param n An unsigned integer. 
+ * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedInt(final byte[] b, final long n) { + setUnsignedInt(b, n, 0); + } + + /** + * Writes a little-endian 4-byte unsigned int at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n An unsigned integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedInt(final byte[] b, final long n, final int offset) { + b[offset + 0] = (byte) (n >>> 0); + b[offset + 1] = (byte) (n >>> 8); + b[offset + 2] = (byte) (n >>> 16); + b[offset + 3] = (byte) (n >>> 24); + } + + public static void putVarInt32(final ByteBuffer b, final int v) { + int B = 128; + if (v < (1<<7)) { + b.put((byte)v); + } else if (v < (1<<14)) { + b.put((byte)(v | B)); + b.put((byte)((v>>7) | B)); + } else if (v < (1<<21)) { + b.put((byte)(v | B)); + b.put((byte)((v>>7) | B)); + b.put((byte)(v>>14)); + } else if (v < (1<<28)) { + b.put((byte)(v | B)); + b.put((byte)((v>>7) | B)); + b.put((byte)((v>>14) | B)); + b.put((byte)(v>>21)); + } else { + b.put((byte)(v | B)); + b.put((byte)((v>>7) | B)); + b.put((byte)((v>>14) | B)); + b.put((byte)((v>>21) | B)); + b.put((byte)(v>>28)); + } + } + + /** + * Reads a 32-bit variable-length integer value as used in Protocol Buffers. + * @param buf The buffer to read from. + * @return The integer read. 
+ */ + static int readVarInt32(final ChannelBuffer buf) { + int result = buf.readByte(); + if (result >= 0) { + return result; + } + result &= 0x7F; + result |= buf.readByte() << 7; + if (result >= 0) { + return result; + } + result &= 0x3FFF; + result |= buf.readByte() << 14; + if (result >= 0) { + return result; + } + result &= 0x1FFFFF; + result |= buf.readByte() << 21; + if (result >= 0) { + return result; + } + result &= 0x0FFFFFFF; + final byte b = buf.readByte(); + result |= b << 28; + if (b >= 0) { + return result; + } + throw new IllegalArgumentException("Not a 32 bit varint: " + result + + " (5th byte: " + b + ")"); + } + + public static byte[] fromBoolean(final boolean n) { + final byte[] b = new byte[1]; + b[0] = (byte) (n ? 1 : 0); + return b; + } + + /** + * Creates a new byte array containing a little-endian 4-byte integer. + * @param n An integer. + * @return A new byte array containing the given value. + */ + public static byte[] fromInt(final int n) { + final byte[] b = new byte[4]; + setInt(b, n); + return b; + } + + /** + * Creates a new byte array containing a little-endian 4-byte unsigned integer. + * @param n An unsigned integer. + * @return A new byte array containing the given value. + */ + public static byte[] fromUnsignedInt(final long n) { + final byte[] b = new byte[4]; + setUnsignedInt(b, n); + return b; + } + + /** + * Reads a little-endian 8-byte unsigned long from the beginning of the given array. + * @param b The array to read from. + * @return A long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static BigInteger getUnsignedLong(final byte[] b) { + return getUnsignedLong(b, 0); + } + + /** + * Reads a little-endian 8-byte unsigned long from an offset in the given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return A long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. 
+ */ + public static BigInteger getUnsignedLong(final byte[] b, final int offset) { + long l = getLong(b, offset); + BigInteger bi = new BigInteger(l+""); + if (bi.compareTo(BigInteger.ZERO) < 0) { + bi = bi.add(TWO_COMPL_REF); + } + return bi; + } + + /** + * Reads a little-endian 8-byte long from the beginning of the given array. + * @param b The array to read from. + * @return A long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static long getLong(final byte[] b) { + return getLong(b, 0); + } + + /** + * Reads a little-endian 8-byte long from an offset in the given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return A long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static long getLong(final byte[] b, final int offset) { + return (b[offset + 0] & 0xFFL) << 0 + | (b[offset + 1] & 0xFFL) << 8 + | (b[offset + 2] & 0xFFL) << 16 + | (b[offset + 3] & 0xFFL) << 24 + | (b[offset + 4] & 0xFFL) << 32 + | (b[offset + 5] & 0xFFL) << 40 + | (b[offset + 6] & 0xFFL) << 48 + | (b[offset + 7] & 0xFFL) << 56; + } + + /** + * Writes a little-endian 8-byte long at the beginning of the given array. + * @param b The array to write to. + * @param n A long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setLong(final byte[] b, final long n) { + setLong(b, n, 0); + } + + /** + * Writes a little-endian 8-byte long at an offset in the given array. + * @param b The array to write to. + * @param n A long integer. + * @param offset The offset in the array to start writing at. + * @throws IndexOutOfBoundsException if the byte array is too small. 
+ */ + public static void setLong(final byte[] b, final long n, final int offset) { + b[offset + 0] = (byte) (n >>> 0); + b[offset + 1] = (byte) (n >>> 8); + b[offset + 2] = (byte) (n >>> 16); + b[offset + 3] = (byte) (n >>> 24); + b[offset + 4] = (byte) (n >>> 32); + b[offset + 5] = (byte) (n >>> 40); + b[offset + 6] = (byte) (n >>> 48); + b[offset + 7] = (byte) (n >>> 56); + } + + /** + * Writes a little-endian 8-byte unsigned long at the beginning of the given array. + * @param b The array to write to. + * @param n An unsigned long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedLong(final byte[] b, final BigInteger n) { + setUnsignedLong(b, n, 0); + } + + /** + * Writes a little-endian 8-byte unsigned long at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n An unsigned long integer. + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setUnsignedLong(final byte[] b, final BigInteger n, final int offset) { + setLong(b, n.longValue(), offset); + } + + /** + * Creates a new byte array containing a little-endian 8-byte long integer. + * @param n A long integer. + * @return A new byte array containing the given value. + */ + public static byte[] fromLong(final long n) { + final byte[] b = new byte[8]; + setLong(b, n); + return b; + } + + /** + * Creates a new byte array containing a little-endian 8-byte unsigned long integer. + * @param n An unsigned long integer. + * @return A new byte array containing the given value. + */ + public static byte[] fromUnsignedLong(final BigInteger n) { + final byte[] b = new byte[8]; + setUnsignedLong(b, n); + return b; + } + + /** + * Reads a little-endian 4-byte float from the beginning of the given array. + * @param b The array to read from. + * @return a float + * @throws IndexOutOfBoundsException if the byte array is too small. 
+ */ + public static float getFloat(final byte[] b) { + return getFloat(b, 0); + } + + /** + * Reads a little-endian 4-byte float from an offset in the given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. + * @return a float + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static float getFloat(final byte[] b, final int offset) { + return Float.intBitsToFloat(getInt(b, offset)); + } + + /** + * Writes a little-endian 4-byte float at the beginning of the given array. + * @param b The array to write to. + * @param n a float + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setFloat(final byte[] b, final float n) { + setFloat(b, n, 0); + } + + /** + * Writes a little-endian 4-byte float at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n a float + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setFloat(final byte[] b, final float n, final int offset) { + setInt(b, Float.floatToIntBits(n), offset); + } + + /** + * Creates a new byte array containing a little-endian 4-byte float. + * @param n A float + * @return A new byte array containing the given value. + */ + public static byte[] fromFloat(float n) { + byte[] b = new byte[4]; + setFloat(b, n); + return b; + } + + /** + * Reads a little-endian 8-byte double from the beginning of the given array. + * @param b The array to read from. + * @return a double + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static double getDouble(final byte[] b) { + return getDouble(b, 0); + } + + /** + * Reads a little-endian 8-byte double from an offset in the given array. + * @param b The array to read from. + * @param offset The offset in the array to start reading from. 
+ * @return a double + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static double getDouble(final byte[] b, final int offset) { + return Double.longBitsToDouble(getLong(b, offset)); + } + + /** + * Writes a little-endian 8-byte double at the beginning of the given array. + * @param b The array to write to. + * @param n a double + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setDouble(final byte[] b, final double n) { + setDouble(b, n, 0); + } + + /** + * Writes a little-endian 8-byte double at an offset in the given array. + * @param b The array to write to. + * @param offset The offset in the array to start writing at. + * @param n a double + * @throws IndexOutOfBoundsException if the byte array is too small. + */ + public static void setDouble(final byte[] b, final double n, final int offset) { + setLong(b, Double.doubleToLongBits(n), offset); + } + + /** + * Creates a new byte array containing a little-endian 8-byte double. + * @param n A double + * @return A new byte array containing the given value. + */ + public static byte[] fromDouble(double n) { + byte[] b = new byte[8]; + setDouble(b, n); + return b; + } + + /** + * Extracts the byte array from the given {@link ByteString} without copy. + * @param buf A buffer from which to extract the array. This buffer must be + * actually an instance of a {@code LiteralByteString}. + * @since 1.5 + */ + public static byte[] get(final ByteString buf) { + return ZeroCopyLiteralByteString.zeroCopyGetBytes(buf); + } + + /** Transforms a string into an UTF-8 encoded byte array. */ + public static byte[] UTF8(final String s) { + return s.getBytes(CharsetUtil.UTF_8); + } + + /** Transforms a string into an ISO-8859-1 encoded byte array. */ + public static byte[] ISO88591(final String s) { + return s.getBytes(CharsetUtil.ISO_8859_1); + } + + // ---------------------------- // + // Pretty-printing byte arrays. 
// + // ---------------------------- // + + private static final byte[] HEX = { + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', + 'A', 'B', 'C', 'D', 'E', 'F' + }; + + /** + * Pretty-prints a byte array into a human-readable output buffer. + * @param outbuf The buffer where to write the output. + * @param array The (possibly {@code null}) array to pretty-print. + */ + public static void pretty(final StringBuilder outbuf, final byte[] array) { + if (array == null) { + outbuf.append("null"); + return; + } + int ascii = 0; + final int start_length = outbuf.length(); + final int n = array.length; + outbuf.ensureCapacity(start_length + 1 + n + 1); + outbuf.append('"'); + for (int i = 0; i < n; i++) { + final byte b = array[i]; + if (' ' <= b && b <= '~') { + ascii++; + outbuf.append((char) b); + } else if (b == '\n') { + outbuf.append('\\').append('n'); + } else if (b == '\t') { + outbuf.append('\\').append('t'); + } else { + outbuf.append("\\x") + .append((char) HEX[(b >>> 4) & 0x0F]) + .append((char) HEX[b & 0x0F]); + } + } + if (ascii < n / 2) { + outbuf.setLength(start_length); + outbuf.append(Arrays.toString(array)); + } else { + outbuf.append('"'); + } + } + + /** + * Pretty-prints an array of byte arrays into a human-readable output buffer. + * @param outbuf The buffer where to write the output. + * @param arrays The (possibly {@code null}) array of arrays to pretty-print. + * @since 1.3 + */ + public static void pretty(final StringBuilder outbuf, final byte[][] arrays) { + if (arrays == null) { + outbuf.append("null"); + return; + } else { // Do some right-sizing. 
+ int size = 2; + for (int i = 0; i < arrays.length; i++) { + size += 2 + 2 + arrays[i].length; + } + outbuf.ensureCapacity(outbuf.length() + size); + } + outbuf.append('['); + for (int i = 0; i < arrays.length; i++) { + Bytes.pretty(outbuf, arrays[i]); + outbuf.append(", "); + } + outbuf.setLength(outbuf.length() - 2); // Remove the last ", " + outbuf.append(']'); + } + + /** + * Pretty-prints a byte array into a human-readable string. + * @param array The (possibly {@code null}) array to pretty-print. + * @return The array in a pretty-printed string. + */ + public static String pretty(final byte[] array) { + if (array == null) { + return "null"; + } + final StringBuilder buf = new StringBuilder(1 + array.length + 1); + pretty(buf, array); + return buf.toString(); + } + + // This doesn't really belong here but it doesn't belong anywhere else + // either, so let's put it close to the other pretty-printing functions. + /** + * Pretty-prints a {@code long} into a fixed-width hexadecimal number. + * @return A string of the form {@code 0x0123456789ABCDEF}. + */ + public static String hex(long v) { + final byte[] buf = new byte[2 + 16]; + buf[0] = '0'; + buf[1] = 'x'; + int i = 2 + 16; + do { + buf[--i] = HEX[(int) v & 0x0F]; + v >>>= 4; + } while (v != 0); + for (/**/; i > 1; i--) { + buf[i] = '0'; + } + return new String(buf); + } + + // Ugly stuff + // ---------- + // Background: when using ReplayingDecoder (which makes it easy to deal with + // unframed RPC responses), the ChannelBuffer we manipulate is in fact a + // ReplayingDecoderBuffer, a package-private class that Netty uses. This + // class, for some reason, throws UnsupportedOperationException on its + // array() method. This method is unfortunately the only way to easily dump + // the contents of a ChannelBuffer, which is useful for debugging or logging + // unexpected buffers. 
An issue (NETTY-346) has been filed to get access to + // the buffer, but the resolution was useless: instead of making the array() + // method work, a new internalBuffer() method was added on ReplayingDecoder, + // which would require that we keep a reference on the ReplayingDecoder all + // along in order to properly convert the buffer to a string. + // So we instead use ugly reflection to gain access to the underlying buffer + // while taking into account that the implementation of Netty has changed + // over time, so depending which version of Netty we're working with, we do + // a different hack. Yes this is horrible, but it's for the greater good as + // this is what allows us to debug unexpected buffers when deserializing RPCs + // and what's more important than being able to debug unexpected stuff? + private static final Class ReplayingDecoderBuffer; + private static final Field RDB_buffer; // For Netty 3.5.0 and before. + private static final Method RDB_buf; // For Netty 3.5.1 and above. + static { + try { + ReplayingDecoderBuffer = Class.forName("org.jboss.netty.handler.codec." + + "replay.ReplayingDecoderBuffer"); + Field field = null; + try { + field = ReplayingDecoderBuffer.getDeclaredField("buffer"); + field.setAccessible(true); + } catch (NoSuchFieldException e) { + // Ignore. Field has been removed in Netty 3.5.1. + } + RDB_buffer = field; + if (field != null) { // Netty 3.5.0 or before. + RDB_buf = null; + } else { + RDB_buf = ReplayingDecoderBuffer.getDeclaredMethod("buf"); + RDB_buf.setAccessible(true); + } + } catch (Exception e) { + throw new RuntimeException("static initializer failed", e); + } + } + + /** + * Pretty-prints all the bytes of a buffer into a human-readable string. + * @param buf The (possibly {@code null}) buffer to pretty-print. + * @return The buffer in a pretty-printed string. 
+ */ + public static String pretty(final ChannelBuffer buf) { + if (buf == null) { + return "null"; + } + byte[] array; + try { + if (buf.getClass() != ReplayingDecoderBuffer) { + array = buf.array(); + } else if (RDB_buf != null) { // Netty 3.5.1 and above. + array = ((ChannelBuffer) RDB_buf.invoke(buf)).array(); + } else { // Netty 3.5.0 and before. + final ChannelBuffer wrapped_buf = (ChannelBuffer) RDB_buffer.get(buf); + array = wrapped_buf.array(); + } + } catch (UnsupportedOperationException e) { + return "(failed to extract content of buffer of type " + + buf.getClass().getName() + ')'; + } catch (IllegalAccessException e) { + throw new AssertionError("Should not happen: " + e); + } catch (InvocationTargetException e) { + throw new AssertionError("Should not happen: " + e); + } + return pretty(array); + } + + // ---------------------- // + // Comparing byte arrays. // + // ---------------------- // + // Don't ask me why this isn't in java.util.Arrays. + + /** + * A singleton {@link Comparator} for non-{@code null} byte arrays. + * @see #memcmp + */ + public static final MemCmp MEMCMP = new MemCmp(); + + /** {@link Comparator} for non-{@code null} byte arrays. */ + private static final class MemCmp implements Comparator { + + private MemCmp() { // Can't instantiate outside of this class. + } + + @Override + public int compare(final byte[] a, final byte[] b) { + return memcmp(a, b); + } + + } + + /** + * {@code memcmp} in Java, hooray. + * @param a First non-{@code null} byte array to compare. + * @param b Second non-{@code null} byte array to compare. + * @return 0 if the two arrays are identical, otherwise the difference + * between the first two different bytes, otherwise the different between + * their lengths. + */ + public static int memcmp(final byte[] a, final byte[] b) { + final int length = Math.min(a.length, b.length); + if (a == b) { // Do this after accessing a.length and b.length + return 0; // in order to NPE if either a or b is null. 
+ } + for (int i = 0; i < length; i++) { + if (a[i] != b[i]) { + return (a[i] & 0xFF) - (b[i] & 0xFF); // "promote" to unsigned. + } + } + return a.length - b.length; + } + + /** + * {@code memcmp(3)} with a given offset and length. + * @param a First non-{@code null} byte array to compare. + * @param b Second non-{@code null} byte array to compare. + * @param offset The offset at which to start comparing both arrays. + * @param length The number of bytes to compare. + * @return 0 if the two arrays are identical, otherwise the difference + * between the first two different bytes (treated as unsigned), otherwise + * the different between their lengths. + * @throws IndexOutOfBoundsException if either array isn't large enough. + */ + public static int memcmp(final byte[] a, final byte[] b, + final int offset, int length) { + if (a == b && a != null) { + return 0; + } + length += offset; + for (int i = offset; i < length; i++) { + if (a[i] != b[i]) { + return (a[i] & 0xFF) - (b[i] & 0xFF); // "promote" to unsigned. + } + } + return 0; + } + + /** + * De-duplicates two byte arrays. + *

+ * If two byte arrays have the same contents but are different, this + * function helps to re-use the old one and discard the new copy. + * @param old The existing byte array. + * @param neww The new byte array we're trying to de-duplicate. + * @return {@code old} if {@code neww} is a different array with the same + * contents, otherwise {@code neww}. + */ + public static byte[] deDup(final byte[] old, final byte[] neww) { + return memcmp(old, neww) == 0 ? old : neww; + } + + /** + * Tests whether two byte arrays have the same contents. + * @param a First non-{@code null} byte array to compare. + * @param b Second non-{@code null} byte array to compare. + * @return {@code true} if the two arrays are identical, + * {@code false} otherwise. + */ + public static boolean equals(final byte[] a, final byte[] b) { + return memcmp(a, b) == 0; + } + + /** + * {@code memcmp(3)} in Java for possibly {@code null} arrays, hooray. + * @param a First possibly {@code null} byte array to compare. + * @param b Second possibly {@code null} byte array to compare. + * @return 0 if the two arrays are identical (or both are {@code null}), + * otherwise the difference between the first two different bytes (treated + * as unsigned), otherwise the different between their lengths (a {@code + * null} byte array is considered shorter than an empty byte array). 
+ */ + public static int memcmpMaybeNull(final byte[] a, final byte[] b) { + if (a == null) { + if (b == null) { + return 0; + } + return -1; + } else if (b == null) { + return 1; + } + return memcmp(a, b); + } + + public static int getBitSetSize(int items) { + return (items + 7) / 8; + } + + public static byte[] fromBitSet(BitSet bits, int colCount) { + byte[] bytes = new byte[getBitSetSize(colCount)]; + for (int i = 0; i < bits.length(); i++) { + if (bits.get(i)) { + bytes[i / 8] |= 1 << (i % 8); + } + } + return bytes; + } + + public static BitSet toBitSet(byte[] b, int offset, int colCount) { + BitSet bs = new BitSet(colCount); + for (int i = 0; i < colCount; i++) { + if ((b[offset + (i / 8)] >> (i % 8) & 1) == 1) { + bs.set(i); + } + } + return bs; + } + + /** + * This method will apply xor on the left most bit of the provided byte. This is used in Kudu to + * have unsigned data types sorting correctly. + * @param value byte whose left most bit will be xor'd + * @return same byte with xor applied on the left most bit + */ + public static byte xorLeftMostBit(byte value) { + value ^= (1 << 7); + return value; + } + + /** + * Get the byte array representation of this string, with UTF8 encoding + * @param data String get the byte array from + * @return UTF8 byte array + */ + public static byte[] fromString(String data) { + return UTF8(data); + } + + /** + * Get a string from the passed byte array, with UTF8 encoding + * @param b byte array to convert to string, possibly coming from {@link #fromString(String)} + * @return A new string built with the byte array + */ + public static String getString(byte[] b) { + return getString(b, 0, b.length); + } + + public static String getString(Slice slice) { + return slice.toString(CharsetUtil.UTF_8); + } + + /** + * Get a string from the passed byte array, at the specified offset and for the specified + * length, with UTF8 encoding + * @param b byte array to convert to string, possibly coming from {@link #fromString(String)} 
+ * @param offset where to start reading from in the byte array + * @param len how many bytes we should read + * @return A new string built with the byte array + */ + public static String getString(byte[] b, int offset, int len) { + if (len == 0) { + return ""; + } + return new String(b, offset, len, CharsetUtil.UTF_8); + } + + /** + * Utility methd to write a byte array to a data output. Equivalent of doing a writeInt of the + * length followed by a write of the byte array. Convert back with {@link #readByteArray} + * @param dataOutput + * @param b + * @throws IOException + */ + public static void writeByteArray(DataOutput dataOutput, byte[] b) throws IOException { + dataOutput.writeInt(b.length); + dataOutput.write(b); + } + + /** + * Utility method to read a byte array written the way {@link #writeByteArray} does it. + * @param dataInput + * @return + * @throws IOException + */ + public static byte[] readByteArray(DataInput dataInput) throws IOException { + int len = dataInput.readInt(); + byte[] data = new byte[len]; + dataInput.readFully(data); + return data; + } + + /** A convenient map keyed with a byte array. */ + public static final class ByteMap extends TreeMap + implements Iterable> { + + public ByteMap() { + super(MEMCMP); + } + + /** Returns an iterator that goes through all the entries in this map. */ + @Override + public Iterator> iterator() { + return super.entrySet().iterator(); + } + + /** {@code byte[]} friendly implementation. */ + @Override + public String toString() { + final int size = size(); + if (size == 0) { + return "{}"; + } + final StringBuilder buf = new StringBuilder(size << 4); + buf.append('{'); + for (final Map.Entry e : this) { + Bytes.pretty(buf, e.getKey()); + buf.append('='); + final V value = e.getValue(); + if (value instanceof byte[]) { + Bytes.pretty(buf, (byte[]) value); + } else { + buf.append(value == this ? "(this map)" : value); + } + buf.append(", "); + } + buf.setLength(buf.length() - 2); // Remove the extra ", ". 
+ buf.append('}'); + return buf.toString(); + } + + private static final long serialVersionUID = 1280744742; + + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/CallResponse.java b/java/kudu-client/src/main/java/org/kududb/client/CallResponse.java new file mode 100644 index 000000000000..77cbd5eaf1c4 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/CallResponse.java @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import java.util.List; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.rpc.RpcHeader; +import org.kududb.util.Slice; + +import org.jboss.netty.buffer.ChannelBuffer; + +/** + * This class handles information received from an RPC response, providing + * access to sidecars and decoded protobufs from the message. + */ +@InterfaceAudience.Private +final class CallResponse { + private final ChannelBuffer buf; + private final RpcHeader.ResponseHeader header; + private final int totalResponseSize; + + // Non-header main message slice is generated upon request and cached. 
+ private Slice message = null; + + /** + * Performs some sanity checks on the sizes recorded in the packet + * referred to by {@code buf}. Assumes that {@code buf} has not been + * read from yet, and will only be accessed by this class. + * + * Afterwards, this constructs the RpcHeader from the buffer. + * @param buf Channel buffer which call response reads from. + * @throws IllegalArgumentException If either the entire recorded packet + * size or recorded response header PB size are not within reasonable + * limits as defined by {@link KuduRpc#checkArrayLength(ChannelBuffer, long)}. + * @throws IndexOutOfBoundsException if the ChannelBuffer does not contain + * the amount of bytes specified by its length prefix. + */ + public CallResponse(final ChannelBuffer buf) { + this.buf = buf; + + this.totalResponseSize = buf.readInt(); + KuduRpc.checkArrayLength(buf, this.totalResponseSize); + TabletClient.ensureReadable(buf, this.totalResponseSize); + + final int headerSize = Bytes.readVarInt32(buf); + final Slice headerSlice = nextBytes(buf, headerSize); + RpcHeader.ResponseHeader.Builder builder = RpcHeader.ResponseHeader.newBuilder(); + KuduRpc.readProtobuf(headerSlice, builder); + this.header = builder.build(); + } + + /** + * @return the parsed header + */ + public RpcHeader.ResponseHeader getHeader() { + return this.header; + } + + /** + * @return the total response size + */ + public int getTotalResponseSize() { return this.totalResponseSize; } + + /** + * @return A slice pointing to the section of the packet reserved for the main + * protobuf message. + * @throws IllegalArgumentException If the recorded size for the main message + * is not within reasonable limits as defined by + * {@link KuduRpc#checkArrayLength(ChannelBuffer, long)}. + * @throws IllegalStateException If the offset for the main protobuf message + * is not valid. + */ + public Slice getPBMessage() { + cacheMessage(); + final int mainLength = this.header.getSidecarOffsetsCount() == 0 ? 
+ this.message.length() : this.header.getSidecarOffsets(0); + if (mainLength < 0 || mainLength > this.message.length()) { + throw new IllegalStateException("Main protobuf message invalid. " + + "Length is " + mainLength + " while the size of the message " + + "excluding the header is " + this.message.length()); + } + return subslice(this.message, 0, mainLength); + } + + /** + * @param sidecar The index of the sidecar to retrieve. + * @return A slice pointing to the desired sidecar. + * @throws IllegalStateException If the sidecar offsets specified in the + * header response PB are not valid offsets for the array. + * @throws IllegalArgumentException If the sidecar with the specified index + * does not exist. + * @throws IllegalArgumentException If the recorded size for the main message + * is not within reasonable limits as defined by + * {@link KuduRpc#checkArrayLength(ChannelBuffer, long)}. + */ + public Slice getSidecar(int sidecar) { + cacheMessage(); + + List sidecarList = this.header.getSidecarOffsetsList(); + if (sidecar < 0 || sidecar > sidecarList.size()) { + throw new IllegalArgumentException("Sidecar " + sidecar + + " not valid, response has " + sidecarList.size() + " sidecars"); + } + + final int prevOffset = sidecarList.get(sidecar); + final int nextOffset = sidecar + 1 == sidecarList.size() ? + this.message.length() : sidecarList.get(sidecar + 1); + final int length = nextOffset - prevOffset; + + if (prevOffset < 0 || length < 0 || prevOffset + length > this.message.length()) { + throw new IllegalStateException("Sidecar " + sidecar + " invalid " + + "(offset = " + prevOffset + ", length = " + length + "). 
The size " + + "of the message " + "excluding the header is " + this.message.length()); + } + + return subslice(this.message, prevOffset, length); + } + + // Reads the message after the header if not read yet + private void cacheMessage() { + if (this.message != null) return; + final int length = Bytes.readVarInt32(buf); + this.message = nextBytes(buf, length); + } + + // Accounts for a parent slice's offset when making a new one with relative offsets. + private static Slice subslice(Slice parent, int offset, int length) { + return new Slice(parent.getRawArray(), parent.getRawOffset() + offset, length); + } + + // After checking the length, generates a slice for the next 'length' + // bytes of 'buf'. + private static Slice nextBytes(final ChannelBuffer buf, final int length) { + KuduRpc.checkArrayLength(buf, length); + byte[] payload; + int offset; + if (buf.hasArray()) { // Zero copy. + payload = buf.array(); + offset = buf.arrayOffset() + buf.readerIndex(); + } else { // We have to copy the entire payload out of the buffer :( + payload = new byte[length]; + buf.readBytes(payload); + offset = 0; + } + return new Slice(payload, offset, length); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ColumnRangePredicate.java b/java/kudu-client/src/main/java/org/kududb/client/ColumnRangePredicate.java new file mode 100644 index 000000000000..bf46306a4507 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ColumnRangePredicate.java @@ -0,0 +1,325 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.protobuf.InvalidProtocolBufferException; +import com.google.protobuf.ZeroCopyLiteralByteString; +import org.kududb.ColumnSchema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.tserver.Tserver; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * A range predicate on one of the columns in the underlying data. + * Both boundaries are inclusive. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class ColumnRangePredicate { + + private final Tserver.ColumnRangePredicatePB.Builder pb = Tserver.ColumnRangePredicatePB + .newBuilder(); + private final ColumnSchema column; + private byte[] lowerBound = null; + private byte[] upperBound = null; + + /** + * Create the predicate on the specified column + * @param column + */ + public ColumnRangePredicate(ColumnSchema column) { + this.column = column; + this.pb.setColumn(ProtobufHelper.columnToPb(column)); + } + + private void setLowerBoundInternal(byte[] value) { + this.lowerBound = value; + pb.setLowerBound(ZeroCopyLiteralByteString.wrap(this.lowerBound)); + } + + private void setUpperBoundInternal(byte[] value) { + this.upperBound = value; + pb.setUpperBound(ZeroCopyLiteralByteString.wrap(this.upperBound)); + } + + /** + * Set a boolean for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(boolean lowerBound) { + 
checkColumn(Type.BOOL); + setLowerBoundInternal(Bytes.fromBoolean(lowerBound)); + } + + /** + * Set a byte for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(byte lowerBound) { + checkColumn(Type.INT8); + setLowerBoundInternal(new byte[] {lowerBound}); + } + + /** + * Set a short for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(short lowerBound) { + checkColumn(Type.INT16); + setLowerBoundInternal(Bytes.fromShort(lowerBound)); + } + + /** + * Set an int for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(int lowerBound) { + checkColumn(Type.INT32); + setLowerBoundInternal(Bytes.fromInt(lowerBound)); + } + + /** + * Set a long for the lower bound + * + * If 'lowerBound' is a timestamp see {@link PartialRow#addLong(String, long)} for the + * format. + * + * @param lowerBound value for the lower bound + */ + public void setLowerBound(long lowerBound) { + checkColumn(Type.INT64, Type.TIMESTAMP); + setLowerBoundInternal(Bytes.fromLong(lowerBound)); + } + + /** + * Set a string for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(String lowerBound) { + checkColumn(Type.STRING); + setLowerBoundInternal(lowerBound.getBytes()); + } + + /** + * Set a binary value for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(byte[] lowerBound) { + checkColumn(Type.BINARY); + setLowerBoundInternal(lowerBound); + } + + /** + * Set a float for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(float lowerBound) { + checkColumn(Type.FLOAT); + setLowerBoundInternal(Bytes.fromFloat(lowerBound)); + } + + /** + * Set a double for the lower bound + * @param lowerBound value for the lower bound + */ + public void setLowerBound(double lowerBound) { + checkColumn(Type.DOUBLE); + 
setLowerBoundInternal(Bytes.fromDouble(lowerBound)); + } + + /** + * Set a boolean for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(boolean upperBound) { + checkColumn(Type.BOOL); + setUpperBoundInternal(Bytes.fromBoolean(upperBound)); + } + + /** + * Set a byte for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(byte upperBound) { + checkColumn(Type.INT8); + setUpperBoundInternal(new byte[] {upperBound}); + } + + /** + * Set a short for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(short upperBound) { + checkColumn(Type.INT16); + setUpperBoundInternal(Bytes.fromShort(upperBound)); + } + + /** + * Set an int for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(int upperBound) { + checkColumn(Type.INT32); + setUpperBoundInternal(Bytes.fromInt(upperBound)); + } + + /** + * Set a long for the upper bound + * + * If 'upperBound' is a timestamp see {@link PartialRow#addLong(String, long)} for the + * format. 
+ * + * @param upperBound value for the upper bound + */ + public void setUpperBound(long upperBound) { + checkColumn(Type.INT64, Type.TIMESTAMP); + setUpperBoundInternal(Bytes.fromLong(upperBound)); + } + + /** + * Set a string for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(String upperBound) { + checkColumn(Type.STRING); + setUpperBoundInternal(upperBound.getBytes()); + } + + /** + * Set a binary value for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(byte[] upperBound) { + checkColumn(Type.BINARY); + setUpperBoundInternal(upperBound); + } + + /** + * Set a float for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(float upperBound) { + checkColumn(Type.FLOAT); + setUpperBoundInternal(Bytes.fromFloat(upperBound)); + } + + /** + * Set a double for the upper bound + * @param upperBound value for the upper bound + */ + public void setUpperBound(double upperBound) { + checkColumn(Type.DOUBLE); + setUpperBoundInternal(Bytes.fromDouble(upperBound)); + } + + /** + * Get the column used by this predicate + * @return the column + */ + public ColumnSchema getColumn() { + return column; + } + + /** + * Get the lower bound in its raw representation + * @return lower bound as a byte array + */ + public byte[] getLowerBound() { + return lowerBound; + } + + /** + * Get the upper bound in its raw representation + * @return upper bound as a byte array + */ + public byte[] getUpperBound() { + return upperBound; + } + + /** + * Converts a list of predicates into an opaque byte array. This is a convenience method for use + * cases that require passing predicates as messages. 
+ * @param predicates a list of predicates + * @return an opaque byte array, or null if the list was empty + */ + public static byte[] toByteArray(List predicates) { + if (predicates.isEmpty()) { + return null; + } + + Tserver.ColumnRangePredicateListPB.Builder predicateListBuilder = + Tserver.ColumnRangePredicateListPB.newBuilder(); + + for (ColumnRangePredicate crp : predicates) { + predicateListBuilder.addRangePredicates(crp.getPb()); + } + + return predicateListBuilder.build().toByteArray(); + } + + /** + * Converts a given byte array to a list of predicates in their pb format. + * @param listBytes bytes obtained from {@link #toByteArray(List)} + * @return a list of predicates + * @throws IllegalArgumentException thrown when the passed bytes aren't valid + */ + static List fromByteArray(byte[] listBytes) { + List predicates = new ArrayList<>(); + if (listBytes == null || listBytes.length == 0) { + return predicates; + } + Tserver.ColumnRangePredicateListPB list = ColumnRangePredicate.getPbFromBytes(listBytes); + return list.getRangePredicatesList(); + } + + /** + * Get the predicate in its protobuf form. + * @return this predicate in protobuf + */ + Tserver.ColumnRangePredicatePB getPb() { + return pb.build(); + } + + /** + * Convert a list of predicates given in bytes back to its pb format. It also hides the + * InvalidProtocolBufferException. + */ + private static Tserver.ColumnRangePredicateListPB getPbFromBytes(byte[] listBytes) { + try { + return Tserver.ColumnRangePredicateListPB.parseFrom(listBytes); + } catch (InvalidProtocolBufferException e) { + // We shade our pb dependency so we can't send out the exception above since other modules + // won't know what to expect. + throw new IllegalArgumentException("Encountered an invalid column range predicate list: " + + Bytes.pretty(listBytes), e); + } + } + + private void checkColumn(Type... 
passedTypes) { + for (Type type : passedTypes) { + if (this.column.getType().equals(type)) return; + } + throw new IllegalArgumentException(String.format("%s's type isn't %s, it's %s", + column.getName(), Arrays.toString(passedTypes), column.getType().getName())); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ConnectionResetException.java b/java/kudu-client/src/main/java/org/kududb/client/ConnectionResetException.java new file mode 100644 index 000000000000..2fa66a4bad9d --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ConnectionResetException.java @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Exception thrown when an RPC was in flight while we got disconnected. + */ +@SuppressWarnings("serial") +@InterfaceAudience.Public +@InterfaceStability.Evolving +public final class ConnectionResetException extends RecoverableException { + + ConnectionResetException(final String msg) { + super(msg); + } + + /** + * Constructor. + */ + ConnectionResetException(final String msg, final Exception cause) { + super(msg, cause); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/CreateTableOptions.java b/java/kudu-client/src/main/java/org/kududb/client/CreateTableOptions.java new file mode 100644 index 000000000000..da8c2bc0cb4c --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/CreateTableOptions.java @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.Common; +import com.google.common.collect.Lists; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.master.Master; + +import java.util.List; + +/** + * This is a builder class for all the options that can be provided while creating a table. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class CreateTableOptions { + + private Master.CreateTableRequestPB.Builder pb = Master.CreateTableRequestPB.newBuilder(); + private final List splitRows = Lists.newArrayList(); + + /** + * Add a split point for the table. The table in the end will have splits + 1 tablets. + * The row may be reused or modified safely after this call without changing the split point. + * + * @param row a key row for the split point + * @return this instance + */ + public CreateTableOptions addSplitRow(PartialRow row) { + splitRows.add(new PartialRow(row)); + return this; + } + + /** + * Add a set of hash partitions to the table. + * + * Each column must be a part of the table's primary key, and an individual + * column may only appear in a single hash component. + * + * For each set of hash partitions added to the table, the total number of + * table partitions is multiplied by the number of buckets. For example, if a + * table is created with 3 split rows, and two hash partitions with 4 and 5 + * buckets respectively, the total number of table partitions will be 80 + * (4 range partitions * 4 hash buckets * 5 hash buckets). 
+ * + * @param columns the columns to hash + * @param buckets the number of buckets to hash into + * @return this instance + */ + public CreateTableOptions addHashPartitions(List columns, int buckets) { + addHashPartitions(columns, buckets, 0); + return this; + } + + /** + * Add a set of hash partitions to the table. + * + * This constructor takes a seed value, which can be used to randomize the + * mapping of rows to hash buckets. Setting the seed may provide some + * amount of protection against denial of service attacks when the hashed + * columns contain user provided values. + * + * @param columns the columns to hash + * @param buckets the number of buckets to hash into + * @param seed a hash seed + * @return this instance + */ + public CreateTableOptions addHashPartitions(List columns, int buckets, int seed) { + Common.PartitionSchemaPB.HashBucketSchemaPB.Builder hashBucket = + pb.getPartitionSchemaBuilder().addHashBucketSchemasBuilder(); + for (String column : columns) { + hashBucket.addColumnsBuilder().setName(column); + } + hashBucket.setNumBuckets(buckets); + hashBucket.setSeed(seed); + return this; + } + + /** + * Set the columns on which the table will be range-partitioned. + * + * Every column must be a part of the table's primary key. If not set, the + * table will be created with the primary-key columns as the range-partition + * columns. If called with an empty vector, the table will be created without + * range partitioning. + * + * @param columns the range partitioned columns + * @return this instance + */ + public CreateTableOptions setRangePartitionColumns(List columns) { + Common.PartitionSchemaPB.RangeSchemaPB.Builder rangePartition = + pb.getPartitionSchemaBuilder().getRangeSchemaBuilder(); + for (String column : columns) { + rangePartition.addColumnsBuilder().setName(column); + } + return this; + } + + /** + * Sets the number of replicas that each tablet will have. 
If not specified, it uses the + * server-side default which is usually 3 unless changed by an administrator. + * + * @param numReplicas the number of replicas to use + * @return this instance + */ + public CreateTableOptions setNumReplicas(int numReplicas) { + pb.setNumReplicas(numReplicas); + return this; + } + + Master.CreateTableRequestPB.Builder getBuilder() { + if (!splitRows.isEmpty()) { + pb.setSplitRows(new Operation.OperationsEncoder().encodeSplitRows(splitRows)); + } + return pb; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/CreateTableRequest.java b/java/kudu-client/src/main/java/org/kududb/client/CreateTableRequest.java new file mode 100644 index 000000000000..b62d252e378f --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/CreateTableRequest.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.master.Master; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +/** + * RPC to create new tables + */ +@InterfaceAudience.Private +class CreateTableRequest extends KuduRpc { + + static final String CREATE_TABLE = "CreateTable"; + + private final Schema schema; + private final String name; + private final Master.CreateTableRequestPB.Builder builder; + + CreateTableRequest(KuduTable masterTable, String name, Schema schema, + CreateTableOptions builder) { + super(masterTable); + this.schema = schema; + this.name = name; + this.builder = builder.getBuilder(); + } + + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + this.builder.setName(this.name); + this.builder.setSchema(ProtobufHelper.schemaToPb(this.schema)); + return toChannelBuffer(header, this.builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return CREATE_TABLE; + } + + @Override + Pair deserialize(final CallResponse callResponse, + String tsUUID) throws Exception { + final Master.CreateTableResponsePB.Builder builder = Master.CreateTableResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), builder); + CreateTableResponse response = + new CreateTableResponse(deadlineTracker.getElapsedMillis(), tsUUID); + return new Pair( + response, builder.hasError() ? 
builder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/CreateTableResponse.java b/java/kudu-client/src/main/java/org/kududb/client/CreateTableResponse.java new file mode 100644 index 000000000000..7906c5f5ae58 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/CreateTableResponse.java @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +@InterfaceAudience.Private +public class CreateTableResponse extends KuduRpcResponse { + + /** + * @param ellapsedMillis Time in milliseconds since RPC creation to now. 
+ */ + CreateTableResponse(long ellapsedMillis, String tsUUID) { + super(ellapsedMillis, tsUUID); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/DeadlineTracker.java b/java/kudu-client/src/main/java/org/kududb/client/DeadlineTracker.java new file mode 100644 index 000000000000..633c64255ba0 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/DeadlineTracker.java @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.base.Stopwatch; + +/** + * This is a wrapper class around {@link com.google.common.base.Stopwatch} used to track a relative + * deadline in the future. + *

+ * The watch starts as soon as this object is created with a deadline of 0, + * meaning that there's no deadline. + * The deadline has been reached once the stopwatch's elapsed time is equal or greater than the + * provided deadline. + */ +public class DeadlineTracker { + private final Stopwatch stopwatch; + /** relative deadline in milliseconds **/ + private long deadline = 0; + + /** + * Creates a new tracker, which starts the stopwatch right now. + */ + public DeadlineTracker() { + this(new Stopwatch()); + } + + /** + * Creates a new tracker, using the specified stopwatch, and starts it right now. + * The stopwatch is reset if it was already running. + * @param stopwatch Specific Stopwatch to use + */ + public DeadlineTracker(Stopwatch stopwatch) { + if (stopwatch.isRunning()) { + stopwatch.reset(); + } + this.stopwatch = stopwatch.start(); + } + + /** + * Check if we're already past the deadline. + * @return true if we're past the deadline, otherwise false. Also returns false if no deadline + * was specified + */ + public boolean timedOut() { + if (!hasDeadline()) { + return false; + } + return deadline - stopwatch.elapsedMillis() <= 0; + } + + /** + * Get the number of milliseconds before the deadline is reached. + *

+ * This method is used to pass down the remaining deadline to the RPCs, so has special semantics. + * A deadline of 0 is used to indicate an infinite deadline, and negative deadlines are invalid. + * Thus, if the deadline has passed (i.e. deadline - stopwatch.elapsedMillis() <= 0), + * the returned value is floored at 1. + *

+ * Callers who care about this behavior should first check {@link #timedOut()}. + * + * @return the remaining millis before the deadline is reached, or 1 if the remaining time is + * lesser or equal to 0, or Long.MAX_VALUE if no deadline was specified (in which case it + * should never be called). + * @throws IllegalStateException if this method is called and no deadline was set + */ + public long getMillisBeforeDeadline() { + if (!hasDeadline()) { + throw new IllegalStateException("This tracker doesn't have a deadline set so it cannot " + + "answer getMillisBeforeDeadline()"); + } + long millisBeforeDeadline = deadline - stopwatch.elapsedMillis(); + millisBeforeDeadline = millisBeforeDeadline <= 0 ? 1 : millisBeforeDeadline; + return millisBeforeDeadline; + } + + public long getElapsedMillis() { + return this.stopwatch.elapsedMillis(); + } + + /** + * Tells if a non-zero deadline was set. + * @return true if the deadline is greater than 0, false otherwise. + */ + public boolean hasDeadline() { + return deadline != 0; + } + + /** + * Utility method to check if sleeping for a specified amount of time would put us past the + * deadline. + * @param plannedSleepTime number of milliseconds for a planned sleep + * @return if the planned sleeps goes past the deadline. + */ + public boolean wouldSleepingTimeout(long plannedSleepTime) { + if (!hasDeadline()) { + return false; + } + return getMillisBeforeDeadline() - plannedSleepTime <= 0; + } + + /** + * Sets the deadline to 0 (no deadline) and restarts the stopwatch from scratch. + */ + public void reset() { + deadline = 0; + stopwatch.reset(); + stopwatch.start(); + } + + /** + * Get the deadline (in milliseconds). + * @return the current deadline + */ + public long getDeadline() { + return deadline; + } + + /** + * Set a new deadline for this tracker. It cannot be smaller than 0, + * and if it is 0 then it means that there is no deadline (which is the default behavior). + * This method won't call reset(). 
+ * @param deadline a number of milliseconds greater or equal to 0 + * @throws IllegalArgumentException if the deadline is lesser than 0 + */ + public void setDeadline(long deadline) { + if (deadline < 0) { + throw new IllegalArgumentException("The deadline must be greater or equal to 0, " + + "the passed value is " + deadline); + } + this.deadline = deadline; + } + + public String toString() { + StringBuffer buf = new StringBuffer("DeadlineTracker(timeout="); + buf.append(deadline); + buf.append(", elapsed=").append(stopwatch.elapsedMillis()); + buf.append(")"); + return buf.toString(); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/Delete.java b/java/kudu-client/src/main/java/org/kududb/client/Delete.java new file mode 100644 index 000000000000..7a068bc5a1eb --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Delete.java @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.ColumnSchema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Class of Operation for whole row removals. + * Only columns which are part of the key can be set. 
+ * Instances of this class should not be reused. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class Delete extends Operation { + + Delete(KuduTable table) { + super(table); + } + + @Override + ChangeType getChangeType() { + return ChangeType.DELETE; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/DeleteTableRequest.java b/java/kudu-client/src/main/java/org/kududb/client/DeleteTableRequest.java new file mode 100644 index 000000000000..7f8fa519fde9 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/DeleteTableRequest.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.master.Master; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +/** + * RPC to delete tables + */ +@InterfaceAudience.Private +class DeleteTableRequest extends KuduRpc { + + static final String DELETE_TABLE = "DeleteTable"; + + private final String name; + + DeleteTableRequest(KuduTable table, String name) { + super(table); + this.name = name; + } + + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + final Master.DeleteTableRequestPB.Builder builder = Master.DeleteTableRequestPB.newBuilder(); + Master.TableIdentifierPB tableID = + Master.TableIdentifierPB.newBuilder().setTableName(name).build(); + builder.setTable(tableID); + return toChannelBuffer(header, builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return DELETE_TABLE; + } + + @Override + Pair deserialize(CallResponse callResponse, + String tsUUID) throws Exception { + final Master.DeleteTableResponsePB.Builder builder = Master.DeleteTableResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), builder); + DeleteTableResponse response = + new DeleteTableResponse(deadlineTracker.getElapsedMillis(), tsUUID); + return new Pair( + response, builder.hasError() ? builder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/DeleteTableResponse.java b/java/kudu-client/src/main/java/org/kududb/client/DeleteTableResponse.java new file mode 100644 index 000000000000..51f3ba776b74 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/DeleteTableResponse.java @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class DeleteTableResponse extends KuduRpcResponse { + + /** + * @param ellapsedMillis Time in milliseconds since RPC creation to now. + */ + DeleteTableResponse(long ellapsedMillis, String tsUUID) { + super(ellapsedMillis, tsUUID); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ErrorCollector.java b/java/kudu-client/src/main/java/org/kududb/client/ErrorCollector.java new file mode 100644 index 000000000000..db2952cbc69d --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ErrorCollector.java @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.base.Preconditions; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import java.util.ArrayDeque; +import java.util.Queue; + +/** + * Class that helps tracking row errors. All methods are thread-safe. + */ +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class ErrorCollector { + private final Queue errorQueue; + private final int maxCapacity; + private boolean overflowed; + + /** + * Create a new error collector with a maximum capacity. + * @param maxCapacity how many errors can be stored, has to be higher than 0 + */ + public ErrorCollector(int maxCapacity) { + Preconditions.checkArgument(maxCapacity > 0, "Need to be able to store at least one row error"); + this.maxCapacity = maxCapacity; + this.errorQueue = new ArrayDeque<>(maxCapacity); + } + + /** + * Add a new error to this collector. If it is already at max capacity, the oldest error will be + * discarded before the new one is added. + * @param rowError a row error to collect + */ + public synchronized void addError(RowError rowError) { + if (errorQueue.size() >= maxCapacity) { + errorQueue.poll(); + overflowed = true; + } + errorQueue.add(rowError); + } + + /** + * Get the current count of collected row errors. Cannot be greater than the max capacity this + * instance was configured with. 
+ * @return the count of errors + */ + public synchronized int countErrors() { + return errorQueue.size(); + } + + /** + * Get all the errors that have been collected and an indication if the list overflowed. + * The list of errors cleared and the overflow state is reset. + * @return an object that contains both the list of row errors and the overflow status + */ + public synchronized RowErrorsAndOverflowStatus getErrors() { + RowError[] returnedErrors = new RowError[errorQueue.size()]; + errorQueue.toArray(returnedErrors); + errorQueue.clear(); + + RowErrorsAndOverflowStatus returnObject = + new RowErrorsAndOverflowStatus(returnedErrors, overflowed); + overflowed = false; + return returnObject; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ExternalConsistencyMode.java b/java/kudu-client/src/main/java/org/kududb/client/ExternalConsistencyMode.java new file mode 100644 index 000000000000..adfb6248e3dc --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ExternalConsistencyMode.java @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.kududb.Common; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * The possible external consistency modes on which Kudu operates. + * See {@code src/kudu/common/common.proto} for a detailed explanations on the + * meaning and implications of each mode. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public enum ExternalConsistencyMode { + CLIENT_PROPAGATED(Common.ExternalConsistencyMode.CLIENT_PROPAGATED), + COMMIT_WAIT(Common.ExternalConsistencyMode.COMMIT_WAIT); + + private Common.ExternalConsistencyMode pbVersion; + private ExternalConsistencyMode(Common.ExternalConsistencyMode pbVersion) { + this.pbVersion = pbVersion; + } + @InterfaceAudience.Private + public Common.ExternalConsistencyMode pbVersion() { + return pbVersion; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationReceived.java b/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationReceived.java new file mode 100644 index 000000000000..3e17f3855a9c --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationReceived.java @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.net.HostAndPort; +import com.google.protobuf.ByteString; +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.Common; +import org.kududb.consensus.Metadata; +import org.kududb.master.Master; +import org.kududb.util.NetUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Class grouping the callback and the errback for GetMasterRegistration calls + * made in getMasterTableLocationsPB. + */ +@InterfaceAudience.Private +final class GetMasterRegistrationReceived { + + private static final Logger LOG = LoggerFactory.getLogger(GetMasterRegistrationReceived.class); + + private final List masterAddrs; + private final Deferred responseD; + private final int numMasters; + + // Used to avoid calling 'responseD' twice. + private final AtomicBoolean responseDCalled = new AtomicBoolean(false); + + // Number of responses we've receives: used to tell whether or not we've received + // errors/replies from all of the masters, or if there are any + // GetMasterRegistrationRequests still pending. + private final AtomicInteger countResponsesReceived = new AtomicInteger(0); + + // Exceptions received so far: kept for debugging purposes. + // (see: NoLeaderMasterFoundException#create() for how this is used). + private final List exceptionsReceived = + Collections.synchronizedList(new ArrayList()); + + /** + * Creates an object that holds the state needed to retrieve master table's location. + * @param masterAddrs Addresses of all master replicas that we want to retrieve the + * registration from. 
+ * @param responseD Deferred object that will hold the GetTableLocationsResponsePB object for + * the master table. + */ + public GetMasterRegistrationReceived(List masterAddrs, + Deferred responseD) { + this.masterAddrs = masterAddrs; + this.responseD = responseD; + this.numMasters = masterAddrs.size(); + } + + /** + * Creates a callback for a GetMasterRegistrationRequest that was sent to 'hostAndPort'. + * @see GetMasterRegistrationCB + * @param hostAndPort Host and part for the RPC we're attaching this to. Host and port must + * be valid. + * @return The callback object that can be added to the RPC request. + */ + public Callback callbackForNode(HostAndPort hostAndPort) { + return new GetMasterRegistrationCB(hostAndPort); + } + + /** + * Creates an errback for a GetMasterRegistrationRequest that was sent to 'hostAndPort'. + * @see GetMasterRegistrationErrCB + * @param hostAndPort Host and port for the RPC we're attaching this to. Used for debugging + * purposes. + * @return The errback object that can be added to the RPC request. + */ + public Callback errbackForNode(HostAndPort hostAndPort) { + return new GetMasterRegistrationErrCB(hostAndPort); + } + + /** + * Checks if we've already received a response or an exception from every master that + * we've sent a GetMasterRegistrationRequest to. If so -- and no leader has been found + * (that is, 'responseD' was never called) -- pass a {@link NoLeaderMasterFoundException} + * to responseD. + */ + private void incrementCountAndCheckExhausted() { + if (countResponsesReceived.incrementAndGet() == numMasters) { + if (responseDCalled.compareAndSet(false, true)) { + boolean allUnrecoverable = true; + for (Exception ex : exceptionsReceived) { + if (!(ex instanceof NonRecoverableException)) { + allUnrecoverable = false; + break; + } + } + String allHosts = NetUtil.hostsAndPortsToString(masterAddrs); + // Doing a negative check because allUnrecoverable stays true if there are no exceptions. 
+ if (!allUnrecoverable) { + if (exceptionsReceived.isEmpty()) { + LOG.warn("None of the provided masters (" + allHosts + ") is a leader, will retry."); + } else { + LOG.warn("Unable to find the leader master (" + allHosts + "), will retry"); + } + responseD.callback(NoLeaderMasterFoundException.create( + "Master config (" + allHosts + ") has no leader.", + exceptionsReceived)); + } else { + // This will stop retries. + responseD.callback(new NonRecoverableException("Couldn't find a valid master in (" + + allHosts + "), exceptions: " + exceptionsReceived)); + } + } + } + } + + /** + * Callback for each GetMasterRegistrationRequest sent in getMasterTableLocations() above. + * If a request (paired to a specific master) returns a reply that indicates it's a leader, + * the callback in 'responseD' is invoked with an initialized GetTableLocationResponsePB + * object containing the leader's RPC address. + * If the master is not a leader, increment 'countResponsesReceived': if the count equals to + * the number of masters, pass {@link NoLeaderMasterFoundException} into + * 'responseD' if no one else had called 'responseD' before; otherwise, do nothing. 
+ */ + final class GetMasterRegistrationCB implements Callback { + private final HostAndPort hostAndPort; + + public GetMasterRegistrationCB(HostAndPort hostAndPort) { + this.hostAndPort = hostAndPort; + } + + @Override + public Void call(GetMasterRegistrationResponse r) throws Exception { + Master.TabletLocationsPB.ReplicaPB.Builder replicaBuilder = + Master.TabletLocationsPB.ReplicaPB.newBuilder(); + + Master.TSInfoPB.Builder tsInfoBuilder = Master.TSInfoPB.newBuilder(); + tsInfoBuilder.addRpcAddresses(ProtobufHelper.hostAndPortToPB(hostAndPort)); + tsInfoBuilder.setPermanentUuid(r.getInstanceId().getPermanentUuid()); + replicaBuilder.setTsInfo(tsInfoBuilder); + if (r.getRole().equals(Metadata.RaftPeerPB.Role.LEADER)) { + replicaBuilder.setRole(r.getRole()); + Master.TabletLocationsPB.Builder locationBuilder = Master.TabletLocationsPB.newBuilder(); + locationBuilder.setPartition( + Common.PartitionPB.newBuilder().setPartitionKeyStart(ByteString.EMPTY) + .setPartitionKeyEnd(ByteString.EMPTY)); + locationBuilder.setTabletId( + ByteString.copyFromUtf8(AsyncKuduClient.MASTER_TABLE_NAME_PLACEHOLDER)); + locationBuilder.setStale(false); + locationBuilder.addReplicas(replicaBuilder); + // No one else has called this before us. + if (responseDCalled.compareAndSet(false, true)) { + responseD.callback( + Master.GetTableLocationsResponsePB.newBuilder().addTabletLocations( + locationBuilder.build()).build() + ); + } else { + LOG.debug("Callback already invoked, discarding response(" + r.toString() + ") from " + + hostAndPort.toString()); + } + } else { + incrementCountAndCheckExhausted(); + } + return null; + } + + @Override + public String toString() { + return "get master registration for " + hostAndPort.toString(); + } + } + + /** + * Errback for each GetMasterRegistrationRequest sent in getMasterTableLocations() above. + * Stores each exception in 'exceptionsReceived'. 
Increments 'countResponseReceived': if + * the count is equal to the number of masters and no one else had called 'responseD' before, + * pass a {@link NoLeaderMasterFoundException} into 'responseD'; otherwise, do + * nothing. + */ + final class GetMasterRegistrationErrCB implements Callback { + private final HostAndPort hostAndPort; + + public GetMasterRegistrationErrCB(HostAndPort hostAndPort) { + this.hostAndPort = hostAndPort; + } + + @Override + public Void call(Exception e) throws Exception { + LOG.warn("Error receiving a response from: " + hostAndPort, e); + exceptionsReceived.add(e); + incrementCountAndCheckExhausted(); + return null; + } + + @Override + public String toString() { + return "get master registration errback for " + hostAndPort.toString(); + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationRequest.java b/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationRequest.java new file mode 100644 index 000000000000..bc3d81eed1b0 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationRequest.java @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import static org.kududb.consensus.Metadata.*; +import static org.kududb.master.Master.*; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +/** + * Package-private RPC that can only go to master. + */ +@InterfaceAudience.Private +public class GetMasterRegistrationRequest extends KuduRpc { + private static final String GET_MASTER_REGISTRATION = "GetMasterRegistration"; + + public GetMasterRegistrationRequest(KuduTable masterTable) { + super(masterTable); + } + + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + final GetMasterRegistrationRequestPB.Builder builder = + GetMasterRegistrationRequestPB.newBuilder(); + return toChannelBuffer(header, builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return GET_MASTER_REGISTRATION; + } + + @Override + Pair deserialize(CallResponse callResponse, + String tsUUID) throws Exception { + final GetMasterRegistrationResponsePB.Builder respBuilder = + GetMasterRegistrationResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), respBuilder); + RaftPeerPB.Role role = RaftPeerPB.Role.FOLLOWER; + if (!respBuilder.hasError() || respBuilder.getError().getCode() != + MasterErrorPB.Code.CATALOG_MANAGER_NOT_INITIALIZED) { + role = respBuilder.getRole(); + } + GetMasterRegistrationResponse response = new GetMasterRegistrationResponse( + deadlineTracker.getElapsedMillis(), + tsUUID, + role, + respBuilder.getRegistration(), + respBuilder.getInstanceId()); + return new Pair( + response, respBuilder.hasError() ? 
respBuilder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationResponse.java b/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationResponse.java new file mode 100644 index 000000000000..292710ce2dd2 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/GetMasterRegistrationResponse.java @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.consensus.Metadata; +import org.kududb.master.Master; + +/** + * Response for {@link GetMasterRegistrationRequest}. + */ +@InterfaceAudience.Private +public class GetMasterRegistrationResponse extends KuduRpcResponse { + + private final Metadata.RaftPeerPB.Role role; + private final WireProtocol.ServerRegistrationPB serverRegistration; + private final WireProtocol.NodeInstancePB instanceId; + + /** + * Describes a response to a {@link GetMasterRegistrationRequest}, built from + * {@link Master.GetMasterRegistrationResponsePB}. + * + * @param role Master's role in the config. 
+ * @param serverRegistration server registration (RPC and HTTP addresses) for this master. + * @param instanceId Node instance (permanent uuid and + */ + public GetMasterRegistrationResponse(long elapsedMillis, String tsUUID, + Metadata.RaftPeerPB.Role role, + WireProtocol.ServerRegistrationPB serverRegistration, + WireProtocol.NodeInstancePB instanceId) { + super(elapsedMillis, tsUUID); + this.role = role; + this.serverRegistration = serverRegistration; + this.instanceId = instanceId; + } + + /** + * Returns this master's role in the config. + * + * @see Metadata.RaftPeerPB.Role + * @return Node's role in the cluster, or FOLLOWER if the node is not initialized. + */ + public Metadata.RaftPeerPB.Role getRole() { + return role; + } + + /** + * Returns the server registration (list of RPC and HTTP ports) for this master. + * + * @return The {@link WireProtocol.ServerRegistrationPB} object for this master. + */ + public WireProtocol.ServerRegistrationPB getServerRegistration() { + return serverRegistration; + } + + /** + * The node instance (initial sequence number and permanent uuid) for this master. + * + * @return The {@link WireProtocol.NodeInstancePB} object for this master. + */ + public WireProtocol.NodeInstancePB getInstanceId() { + return instanceId; + } + + @Override + public String toString() { + return "GetMasterRegistrationResponse{" + + "role=" + role + + ", serverRegistration=" + serverRegistration + + ", instanceId=" + instanceId + + '}'; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/GetTableLocationsRequest.java b/java/kudu-client/src/main/java/org/kududb/client/GetTableLocationsRequest.java new file mode 100644 index 000000000000..616b52306687 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/GetTableLocationsRequest.java @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.protobuf.ByteString; +import com.google.protobuf.Message; +import com.google.protobuf.ZeroCopyLiteralByteString; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.master.Master; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +/** + * Package-private RPC that can only go to a master. 
+ */ +@InterfaceAudience.Private +class GetTableLocationsRequest extends KuduRpc { + + private final byte[] startPartitionKey; + private final byte[] endKey; + private final String tableId; + + GetTableLocationsRequest(KuduTable table, byte[] startPartitionKey, + byte[] endPartitionKey, String tableId) { + super(table); + if (startPartitionKey != null && endPartitionKey != null + && Bytes.memcmp(startPartitionKey, endPartitionKey) > 0) { + throw new IllegalArgumentException( + "The start partition key must be smaller or equal to the end partition key"); + } + this.startPartitionKey = startPartitionKey; + this.endKey = endPartitionKey; + this.tableId = tableId; + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return "GetTableLocations"; + } + + @Override + Pair deserialize( + final CallResponse callResponse, String tsUUID) + throws Exception { + Master.GetTableLocationsResponsePB.Builder builder = Master.GetTableLocationsResponsePB + .newBuilder(); + readProtobuf(callResponse.getPBMessage(), builder); + Master.GetTableLocationsResponsePB resp = builder.build(); + return new Pair( + resp, builder.hasError() ? builder.getError() : null); + } + + @Override + ChannelBuffer serialize(Message header) { + final Master.GetTableLocationsRequestPB.Builder builder = Master + .GetTableLocationsRequestPB.newBuilder(); + builder.setTable(Master.TableIdentifierPB.newBuilder(). 
+ setTableId(ByteString.copyFromUtf8(tableId))); + if (startPartitionKey != null) { + builder.setPartitionKeyStart(ZeroCopyLiteralByteString.wrap(startPartitionKey)); + } + if (endKey != null) { + builder.setPartitionKeyEnd(ZeroCopyLiteralByteString.wrap(endKey)); + } + return toChannelBuffer(header, builder.build()); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaRequest.java b/java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaRequest.java new file mode 100644 index 000000000000..bb17816c5c6a --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaRequest.java @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import static org.kududb.master.Master.*; + +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +/** + * RPC to fetch a table's schema + */ +@InterfaceAudience.Private +public class GetTableSchemaRequest extends KuduRpc { + static final String GET_TABLE_SCHEMA = "GetTableSchema"; + private final String name; + + + GetTableSchemaRequest(KuduTable masterTable, String name) { + super(masterTable); + this.name = name; + } + + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + final GetTableSchemaRequestPB.Builder builder = GetTableSchemaRequestPB.newBuilder(); + TableIdentifierPB tableID = + TableIdentifierPB.newBuilder().setTableName(name).build(); + builder.setTable(tableID); + return toChannelBuffer(header, builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return GET_TABLE_SCHEMA; + } + + @Override + Pair deserialize(CallResponse callResponse, + String tsUUID) throws Exception { + final GetTableSchemaResponsePB.Builder respBuilder = GetTableSchemaResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), respBuilder); + Schema schema = ProtobufHelper.pbToSchema(respBuilder.getSchema()); + GetTableSchemaResponse response = new GetTableSchemaResponse( + deadlineTracker.getElapsedMillis(), + tsUUID, + schema, + respBuilder.getTableId().toStringUtf8(), + ProtobufHelper.pbToPartitionSchema(respBuilder.getPartitionSchema(), schema), + respBuilder.getCreateTableDone()); + return new Pair( + response, respBuilder.hasError() ? 
respBuilder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaResponse.java b/java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaResponse.java new file mode 100644 index 000000000000..72ac68e0a348 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/GetTableSchemaResponse.java @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; + +@InterfaceAudience.Private +public class GetTableSchemaResponse extends KuduRpcResponse { + + private final Schema schema; + private final PartitionSchema partitionSchema; + private final boolean createTableDone; + private final String tableId; + + /** + * @param ellapsedMillis Time in milliseconds since RPC creation to now + * @param schema the table's schema + * @param partitionSchema the table's partition schema + */ + GetTableSchemaResponse(long ellapsedMillis, + String tsUUID, + Schema schema, + String tableId, + PartitionSchema partitionSchema, + boolean createTableDone) { + super(ellapsedMillis, tsUUID); + this.schema = schema; + this.partitionSchema = partitionSchema; + this.createTableDone = createTableDone; + this.tableId = tableId; + } + + /** + * Get the table's schema. + * @return Table's schema + */ + public Schema getSchema() { + return schema; + } + + /** + * Get the table's partition schema. + * @return the table's partition schema + */ + public PartitionSchema getPartitionSchema() { + return partitionSchema; + } + + /** + * Tells if the original CreateTable call has completed and the tablets are ready. + * @return true if the table is created, otherwise false + */ + public boolean isCreateTableDone() { + return createTableDone; + } + + /** + * Get the table's unique identifier. + * @return the table's tableId + */ + public String getTableId() { + return tableId; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/HasFailedRpcException.java b/java/kudu-client/src/main/java/org/kududb/client/HasFailedRpcException.java new file mode 100644 index 000000000000..08dda52f46e7 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/HasFailedRpcException.java @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Interface implemented by {@link KuduException}s that can tell you which + * RPC failed. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public interface HasFailedRpcException { + + /** + * Returns the RPC that caused this exception. 
+ */ + KuduRpc getFailedRpc(); + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/IPCUtil.java b/java/kudu-client/src/main/java/org/kududb/client/IPCUtil.java new file mode 100644 index 000000000000..45240dda1bf1 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/IPCUtil.java @@ -0,0 +1,83 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.kududb.client; + +import com.google.protobuf.CodedOutputStream; +import com.google.protobuf.Message; +import org.kududb.annotations.InterfaceAudience; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Helper methods for RPCs. + */ +@InterfaceAudience.Private +public class IPCUtil { + /** + * Write out header, param, and cell block if there is one. + * @param dos + * @param header + * @param param + * @return Total number of bytes written. + * @throws java.io.IOException + */ + public static int write(final OutputStream dos, final Message header, final Message param) + throws IOException { + // Must calculate total size and write that first so other side can read it all in in one + // swoop. This is dictated by how the server is currently written. 
Server needs to change + // if we are to be able to write without the length prefixing. + int totalSize = IPCUtil.getTotalSizeWhenWrittenDelimited(header, param); + return write(dos, header, param, totalSize); + } + + private static int write(final OutputStream dos, final Message header, final Message param, + final int totalSize) + throws IOException { + // I confirmed toBytes does same as say DataOutputStream#writeInt. + dos.write(toBytes(totalSize)); + header.writeDelimitedTo(dos); + if (param != null) param.writeDelimitedTo(dos); + dos.flush(); + return totalSize; + } + + /** + * @return Size on the wire when the two messages are written with writeDelimitedTo + */ + public static int getTotalSizeWhenWrittenDelimited(Message ... messages) { + int totalSize = 0; + for (Message m: messages) { + if (m == null) continue; + totalSize += m.getSerializedSize(); + totalSize += CodedOutputStream.computeRawVarint32Size(m.getSerializedSize()); + } + return totalSize; + } + + public static byte[] toBytes(int val) { + byte [] b = new byte[4]; + for(int i = 3; i > 0; i--) { + b[i] = (byte) val; + val >>>= 8; + } + b[0] = (byte) val; + return b; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/Insert.java b/java/kudu-client/src/main/java/org/kududb/client/Insert.java new file mode 100644 index 000000000000..67b389ff3348 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Insert.java @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Represents a single row insert. Instances of this class should not be reused. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class Insert extends Operation { + + Insert(KuduTable table) { + super(table); + } + + @Override + ChangeType getChangeType() { + return ChangeType.INSERT; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/InvalidResponseException.java b/java/kudu-client/src/main/java/org/kududb/client/InvalidResponseException.java new file mode 100644 index 000000000000..4221075cf481 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/InvalidResponseException.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Exception used when the server sends an invalid response to an RPC. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public final class InvalidResponseException extends NonRecoverableException { + + private final Object response; + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + * @param response The response that was received from the server. + */ + InvalidResponseException(final String msg, final Object response) { + super(msg); + this.response = response; + } + + /** + * Constructor. + * @param msg The message of the exception. + * @param cause The exception explaining why the response is invalid. 
+ */ + InvalidResponseException(final String msg, final Exception cause) { + super(msg, cause); + this.response = null; + } + + /** + * Constructor for unexpected response types. + * @param expected The type of the response that was expected. + * @param response The response that was received from the server. + */ + InvalidResponseException(final Class expected, final Object response) { + super("Unexpected response type. Expected: " + expected.getName() + + ", got: " + (response == null ? "null" + : response.getClass() + ", value=" + response)); + this.response = response; + } + + /** + * Returns the possibly {@code null} response received from the server. + */ + public Object getResponse() { + return response; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/IsAlterTableDoneRequest.java b/java/kudu-client/src/main/java/org/kududb/client/IsAlterTableDoneRequest.java new file mode 100644 index 000000000000..ca161f5c2c38 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/IsAlterTableDoneRequest.java @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
package org.kududb.client;

import com.google.protobuf.Message;
import static org.kududb.master.Master.*;

import org.kududb.annotations.InterfaceAudience;
import org.kududb.util.Pair;
import org.jboss.netty.buffer.ChannelBuffer;

/**
 * RPC sent to the master to check whether an alter operation is still running
 * on the specified table.
 */
@InterfaceAudience.Private
class IsAlterTableDoneRequest extends KuduRpc {

  static final String IS_ALTER_TABLE_DONE = "IsAlterTableDone";

  /** Name of the table whose alter status is being checked. */
  private final String name;

  IsAlterTableDoneRequest(KuduTable masterTable, String name) {
    super(masterTable);
    this.name = name;
  }

  @Override
  String serviceName() {
    return MASTER_SERVICE_NAME;
  }

  @Override
  String method() {
    return IS_ALTER_TABLE_DONE;
  }

  @Override
  ChannelBuffer serialize(Message header) {
    assert header.isInitialized();
    TableIdentifierPB tableIdPB =
        TableIdentifierPB.newBuilder().setTableName(name).build();
    IsAlterTableDoneRequestPB request =
        IsAlterTableDoneRequestPB.newBuilder().setTable(tableIdPB).build();
    return toChannelBuffer(header, request);
  }

  @Override
  Pair deserialize(final CallResponse callResponse,
                   String tsUUID) throws Exception {
    IsAlterTableDoneResponsePB.Builder builder = IsAlterTableDoneResponsePB.newBuilder();
    readProtobuf(callResponse.getPBMessage(), builder);
    IsAlterTableDoneResponse response = new IsAlterTableDoneResponse(
        deadlineTracker.getElapsedMillis(), tsUUID, builder.getDone());
    return new Pair(response, builder.hasError() ? builder.getError() : null);
  }
}

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.kududb.client;

import org.kududb.annotations.InterfaceAudience;
import org.kududb.annotations.InterfaceStability;

/**
 * Response to an isAlterTableDone command, reporting whether an alter table
 * operation is still running on the specified table.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class IsAlterTableDoneResponse extends KuduRpcResponse {

  private final boolean done;

  IsAlterTableDoneResponse(long elapsedMillis, String tsUUID, boolean done) {
    super(elapsedMillis, tsUUID);
    this.done = done;
  }

  /**
   * Tells if the table is done being altered or not.
   * @return whether the table alter is done
   */
  public boolean isDone() {
    return done;
  }
}

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.kududb.client;

import com.google.protobuf.ByteString;
import com.google.protobuf.Message;
import org.kududb.annotations.InterfaceAudience;
import org.kududb.master.Master;
import org.kududb.util.Pair;
import org.jboss.netty.buffer.ChannelBuffer;

/**
 * Package-private RPC that can only go to a master, used to poll whether a
 * CreateTable operation has finished.
 */
@InterfaceAudience.Private
class IsCreateTableDoneRequest extends KuduRpc {

  /** Identifier of the table whose creation status is being polled. */
  private final String tableId;

  IsCreateTableDoneRequest(KuduTable table, String tableId) {
    super(table);
    this.tableId = tableId;
  }

  @Override
  String serviceName() {
    return MASTER_SERVICE_NAME;
  }

  @Override
  String method() {
    return "IsCreateTableDone";
  }

  @Override
  ChannelBuffer serialize(Message header) {
    Master.TableIdentifierPB.Builder identifier =
        Master.TableIdentifierPB.newBuilder().setTableId(ByteString.copyFromUtf8(tableId));
    Master.IsCreateTableDoneRequestPB.Builder request =
        Master.IsCreateTableDoneRequestPB.newBuilder().setTable(identifier);
    return toChannelBuffer(header, request.build());
  }

  @Override
  Pair deserialize(final CallResponse callResponse, String tsUUID) throws Exception {
    Master.IsCreateTableDoneResponsePB.Builder builder =
        Master.IsCreateTableDoneResponsePB.newBuilder();
    readProtobuf(callResponse.getPBMessage(), builder);
    Master.IsCreateTableDoneResponsePB resp = builder.build();
    return new Pair(resp, builder.hasError() ? builder.getError() : null);
  }
}
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.kududb.client;

import com.google.common.primitives.UnsignedLongs;
import com.sangupta.murmur.Murmur2;
import org.kududb.ColumnSchema;
import org.kududb.Schema;
import org.kududb.Type;
import org.kududb.annotations.InterfaceAudience;
import org.kududb.client.PartitionSchema.HashBucketSchema;

import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.List;

/**
 * Utility class for encoding rows into primary and partition keys.
 *
 * NOTE(review): not thread-safe — a single internal buffer is reused across
 * calls; one KeyEncoder instance must not be shared between threads.
 */
@InterfaceAudience.Private
class KeyEncoder {

  // Scratch buffer reused across calls; reset at the start of each encode and
  // drained by extractByteArray().
  private final ByteArrayOutputStream buf = new ByteArrayOutputStream();

  /**
   * Encodes the primary key of the row.
   *
   * @param row the row to encode
   * @return the encoded primary key of the row
   */
  public byte[] encodePrimaryKey(final PartialRow row) {
    buf.reset();

    final Schema schema = row.getSchema();
    for (int columnIdx = 0; columnIdx < schema.getPrimaryKeyColumnCount(); columnIdx++) {
      // Only the last key component omits the separator/terminator encoding.
      final boolean isLast = columnIdx + 1 == schema.getPrimaryKeyColumnCount();
      encodeColumn(row, columnIdx, isLast);
    }
    return extractByteArray();
  }

  /**
   * Encodes the provided row into a partition key according to the partition schema.
   * The key is the concatenation of one 4-byte big-endian bucket number per hash
   * bucket schema, followed by the encoded range-partition columns.
   *
   * @param row the row to encode
   * @param partitionSchema the partition schema describing the table's partitioning
   * @return an encoded partition key
   */
  public byte[] encodePartitionKey(PartialRow row, PartitionSchema partitionSchema) {
    buf.reset();
    if (!partitionSchema.getHashBucketSchemas().isEmpty()) {
      // One 4-byte bucket index per hash bucket schema, in schema order.
      ByteBuffer bucketBuf = ByteBuffer.allocate(4 * partitionSchema.getHashBucketSchemas().size());
      bucketBuf.order(ByteOrder.BIG_ENDIAN);

      for (final HashBucketSchema hashBucketSchema : partitionSchema.getHashBucketSchemas()) {
        // Bucket = murmur2_64(encoded columns, seed) mod numBuckets, using
        // unsigned remainder so negative hashes still map into range.
        encodeColumns(row, hashBucketSchema.getColumnIds());
        byte[] encodedColumns = extractByteArray();
        long hash = Murmur2.hash64(encodedColumns,
                                   encodedColumns.length,
                                   hashBucketSchema.getSeed());
        int bucket = (int) UnsignedLongs.remainder(hash, hashBucketSchema.getNumBuckets());
        bucketBuf.putInt(bucket);
      }

      assert bucketBuf.arrayOffset() == 0;
      buf.write(bucketBuf.array(), 0, bucketBuf.position());
    }

    encodeColumns(row, partitionSchema.getRangeSchema().getColumns());
    return extractByteArray();
  }

  /**
   * Encodes a sequence of columns from the row.
   * @param row the row containing the columns to encode
   * @param columnIds the IDs of each column to encode
   */
  private void encodeColumns(PartialRow row, List columnIds) {
    for (int i = 0; i < columnIds.size(); i++) {
      boolean isLast = i + 1 == columnIds.size();
      // Column IDs must be translated to column indexes before encoding.
      encodeColumn(row, row.getSchema().getColumnIndex(columnIds.get(i)), isLast);
    }
  }

  /**
   * Encodes a single column of a row.
   * @param row the row being encoded
   * @param columnIdx the column index of the column to encode
   * @param isLast whether the column is the last component of the key
   * @throws IllegalStateException if the column has no value set in the row
   */
  private void encodeColumn(PartialRow row, int columnIdx, boolean isLast) {
    final Schema schema = row.getSchema();
    final ColumnSchema column = schema.getColumnByIndex(columnIdx);
    if (!row.isSet(columnIdx)) {
      throw new IllegalStateException(String.format("Primary key column %s is not set",
                                                    column.getName()));
    }
    final Type type = column.getType();

    if (type == Type.STRING || type == Type.BINARY) {
      // Variable-length values live in a side buffer, not the row allocation.
      addBinaryComponent(row.getVarLengthData().get(columnIdx), isLast);
    } else {
      // Fixed-length values are read in place from the row allocation.
      addComponent(row.getRowAlloc(),
                   schema.getColumnOffset(columnIdx),
                   type.getSize(),
                   type);
    }
  }

  /**
   * Encodes a byte buffer into the key.
   * @param value the value to encode
   * @param isLast whether the value is the final component in the key
   */
  private void addBinaryComponent(ByteBuffer value, boolean isLast) {
    value.reset();

    // TODO find a way to not have to read byte-by-byte that doesn't require extra copies. This is
    // especially slow now that users can pass direct byte buffers.
    while (value.hasRemaining()) {
      byte currentByte = value.get();
      buf.write(currentByte);
      if (!isLast && currentByte == 0x00) {
        // If we're a middle component of a composite key, we need to add a \x00
        // at the end in order to separate this component from the next one. However,
        // if we just did that, we'd have issues where a key that actually has
        // \x00 in it would compare wrong, so we have to instead add \x00\x00, and
        // encode \x00 as \x00\x01. -- key_encoder.h
        buf.write(0x01);
      }
    }

    if (!isLast) {
      // \x00\x00 terminator separating this component from the next one.
      buf.write(0x00);
      buf.write(0x00);
    }
  }

  /**
   * Encodes a value of the given type into the key.
   * @param value the value to encode
   * @param offset the offset into the {@code value} buffer that the value begins
   * @param len the length of the value
   * @param type the type of the value to encode
   * @throws IllegalArgumentException if the type is not a valid key component type
   */
  private void addComponent(byte[] value, int offset, int len, Type type) {
    switch (type) {
      case INT8:
      case INT16:
      case INT32:
      case INT64:
      case TIMESTAMP:
        // Emit the bytes in reverse order (the row buffer presumably stores the
        // value little-endian — verify against PartialRow), flipping the sign bit
        // of the most significant byte so that signed values sort correctly as
        // unsigned byte strings.
        byte lastByte = value[offset + (len - 1)];
        lastByte = Bytes.xorLeftMostBit(lastByte);
        buf.write(lastByte);
        if (len > 1) {
          for (int i = len - 2; i >= 0; i--) {
            buf.write(value[offset + i]);
          }
        }
        break;
      default:
        throw new IllegalArgumentException(String.format(
            "The column type %s is not a valid key component type", type));
    }
  }

  /**
   * Returns the encoded key, and resets the key encoder to be used for another key.
   * @return the encoded key which has been built through calls to {@link #addComponent}
   */
  private byte[] extractByteArray() {
    byte[] bytes = buf.toByteArray();
    buf.reset();
    return bytes;
  }
}
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.stumbleupon.async.Deferred; +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executor; + +/** + * A synchronous and thread-safe client for Kudu. + *

+ * This class acts as a wrapper around {@link AsyncKuduClient}. The {@link Deferred} objects are + * joined against using the default admin operation timeout + * (see {@link org.kududb.client.KuduClient.KuduClientBuilder#defaultAdminOperationTimeoutMs(long)} (long)}). + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduClient implements AutoCloseable { + + public static final Logger LOG = LoggerFactory.getLogger(AsyncKuduClient.class); + + private final AsyncKuduClient asyncClient; + + KuduClient(AsyncKuduClient asyncClient) { + this.asyncClient = asyncClient; + } + + /** + * Create a table on the cluster with the specified name and schema. Default table + * configurations are used, mainly the table will have one tablet. + * @param name Table's name + * @param schema Table's schema + * @return an object to communicate with the created table + */ + public KuduTable createTable(String name, Schema schema) throws Exception { + return createTable(name, schema, new CreateTableOptions()); + } + + /** + * Create a table on the cluster with the specified name, schema, and table configurations. + * @param name the table's name + * @param schema the table's schema + * @param builder a builder containing the table's configurations + * @return an object to communicate with the created table + */ + public KuduTable createTable(String name, Schema schema, CreateTableOptions builder) + throws Exception { + Deferred d = asyncClient.createTable(name, schema, builder); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Delete a table on the cluster with the specified name. + * @param name the table's name + * @return an rpc response object + */ + public DeleteTableResponse deleteTable(String name) throws Exception { + Deferred d = asyncClient.deleteTable(name); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Alter a table on the cluster as specified by the builder. 
+ * + * When the method returns it only indicates that the master accepted the alter + * command, use {@link KuduClient#isAlterTableDone(String)} to know when the alter finishes. + * @param name the table's name, if this is a table rename then the old table name must be passed + * @param ato the alter table builder + * @return an rpc response object + */ + public AlterTableResponse alterTable(String name, AlterTableOptions ato) throws Exception { + Deferred d = asyncClient.alterTable(name, ato); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Helper method that checks and waits until the completion of an alter command. + * It will block until the alter command is done or the timeout is reached. + * @param name Table's name, if the table was renamed then that name must be checked against + * @return a boolean indicating if the table is done being altered + */ + public boolean isAlterTableDone(String name) throws Exception { + long totalSleepTime = 0; + while (totalSleepTime < getDefaultAdminOperationTimeoutMs()) { + long start = System.currentTimeMillis(); + + Deferred d = asyncClient.isAlterTableDone(name); + IsAlterTableDoneResponse response; + try { + response = d.join(AsyncKuduClient.SLEEP_TIME); + } catch (Exception ex) { + throw ex; + } + + if (response.isDone()) { + return true; + } + + // Count time that was slept and see if we need to wait a little more. + long elapsed = System.currentTimeMillis() - start; + // Don't oversleep the deadline. 
+ if (totalSleepTime + AsyncKuduClient.SLEEP_TIME > getDefaultAdminOperationTimeoutMs()) { + return false; + } + // elapsed can be bigger if we slept about 500ms + if (elapsed <= AsyncKuduClient.SLEEP_TIME) { + LOG.debug("Alter not done, sleep " + (AsyncKuduClient.SLEEP_TIME - elapsed) + + " and slept " + totalSleepTime); + Thread.sleep(AsyncKuduClient.SLEEP_TIME - elapsed); + totalSleepTime += AsyncKuduClient.SLEEP_TIME; + } else { + totalSleepTime += elapsed; + } + } + return false; + } + + /** + * Get the list of running tablet servers. + * @return a list of tablet servers + */ + public ListTabletServersResponse listTabletServers() throws Exception { + Deferred d = asyncClient.listTabletServers(); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Get the list of all the tables. + * @return a list of all the tables + */ + public ListTablesResponse getTablesList() throws Exception { + return getTablesList(null); + } + + /** + * Get a list of table names. Passing a null filter returns all the tables. When a filter is + * specified, it only returns tables that satisfy a substring match. + * @param nameFilter an optional table name filter + * @return a deferred that contains the list of table names + */ + public ListTablesResponse getTablesList(String nameFilter) throws Exception { + Deferred d = asyncClient.getTablesList(nameFilter); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Test if a table exists. + * @param name a non-null table name + * @return true if the table exists, else false + */ + public boolean tableExists(String name) throws Exception { + Deferred d = asyncClient.tableExists(name); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Open the table with the given name. If the table was just created, this method will block until + * all its tablets have also been created. 
+ * @param name table to open + * @return a KuduTable if the table exists, else a MasterErrorException + */ + public KuduTable openTable(final String name) throws Exception { + Deferred d = asyncClient.openTable(name); + return d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Create a new session for interacting with the cluster. + * User is responsible for destroying the session object. + * This is a fully local operation (no RPCs or blocking). + * @return a synchronous wrapper around KuduSession. + */ + public KuduSession newSession() { + AsyncKuduSession session = asyncClient.newSession(); + return new KuduSession(session); + } + + /** + * Creates a new {@link KuduScanner.KuduScannerBuilder} for a particular table. + * @param table the name of the table you intend to scan. + * The string is assumed to use the platform's default charset. + * @return a new scanner builder for this table + */ + public KuduScanner.KuduScannerBuilder newScannerBuilder(KuduTable table) { + return new KuduScanner.KuduScannerBuilder(asyncClient, table); + } + + /** + * Analogous to {@link #shutdown()}. + * @throws Exception if an error happens while closing the connections + */ + @Override + public void close() throws Exception { + asyncClient.close(); + } + + /** + * Performs a graceful shutdown of this instance. + * @throws Exception + */ + public void shutdown() throws Exception { + Deferred> d = asyncClient.shutdown(); + d.join(getDefaultAdminOperationTimeoutMs()); + } + + /** + * Get the timeout used for operations on sessions and scanners. + * @return a timeout in milliseconds + */ + public long getDefaultOperationTimeoutMs() { + return asyncClient.getDefaultOperationTimeoutMs(); + } + + /** + * Get the timeout used for admin operations. + * @return a timeout in milliseconds + */ + public long getDefaultAdminOperationTimeoutMs() { + return asyncClient.getDefaultAdminOperationTimeoutMs(); + } + + /** + * Builder class to use in order to connect to Kudu. 
+ * All the parameters beyond those in the constructors are optional. + */ + public final static class KuduClientBuilder { + private AsyncKuduClient.AsyncKuduClientBuilder clientBuilder; + + /** + * Creates a new builder for a client that will connect to the specified masters. + * @param masterAddresses comma-separated list of "host:port" pairs of the masters + */ + public KuduClientBuilder(String masterAddresses) { + clientBuilder = new AsyncKuduClient.AsyncKuduClientBuilder(masterAddresses); + } + + /** + * Creates a new builder for a client that will connect to the specified masters. + * + *

Here are some examples of recognized formats: + *

    + *
  • example.com + *
  • example.com:80 + *
  • 192.0.2.1 + *
  • 192.0.2.1:80 + *
  • [2001:db8::1] + *
  • [2001:db8::1]:80 + *
  • 2001:db8::1 + *
+ * + * @param masterAddresses list of master addresses + */ + public KuduClientBuilder(List masterAddresses) { + clientBuilder = new AsyncKuduClient.AsyncKuduClientBuilder(masterAddresses); + } + + /** + * Sets the default timeout used for administrative operations (e.g. createTable, deleteTable, + * etc). + * Optional. + * If not provided, defaults to 10s. + * A value of 0 disables the timeout. + * @param timeoutMs a timeout in milliseconds + * @return this builder + */ + public KuduClientBuilder defaultAdminOperationTimeoutMs(long timeoutMs) { + clientBuilder.defaultAdminOperationTimeoutMs(timeoutMs); + return this; + } + + /** + * Sets the default timeout used for user operations (using sessions and scanners). + * Optional. + * If not provided, defaults to 10s. + * A value of 0 disables the timeout. + * @param timeoutMs a timeout in milliseconds + * @return this builder + */ + public KuduClientBuilder defaultOperationTimeoutMs(long timeoutMs) { + clientBuilder.defaultOperationTimeoutMs(timeoutMs); + return this; + } + + /** + * Sets the default timeout to use when waiting on data from a socket. + * Optional. + * If not provided, defaults to 5s. + * A value of 0 disables the timeout. + * @param timeoutMs a timeout in milliseconds + * @return this builder + */ + public KuduClientBuilder defaultSocketReadTimeoutMs(long timeoutMs) { + clientBuilder.defaultSocketReadTimeoutMs(timeoutMs); + return this; + } + + /** + * Set the executors which will be used for the embedded Netty boss and workers. + * Optional. + * If not provided, uses a simple cached threadpool. If either argument is null, + * then such a thread pool will be used in place of that argument. + * Note: executor's max thread number must be greater or equal to corresponding + * worker count, or netty cannot start enough threads, and client will get stuck. + * If not sure, please just use CachedThreadPool. 
+ */ + public KuduClientBuilder nioExecutors(Executor bossExecutor, Executor workerExecutor) { + clientBuilder.nioExecutors(bossExecutor, workerExecutor); + return this; + } + + /** + * Set the maximum number of boss threads. + * Optional. + * If not provided, 1 is used. + */ + public KuduClientBuilder bossCount(int bossCount) { + clientBuilder.bossCount(bossCount); + return this; + } + + /** + * Set the maximum number of worker threads. + * Optional. + * If not provided, (2 * the number of available processors) is used. + */ + public KuduClientBuilder workerCount(int workerCount) { + clientBuilder.workerCount(workerCount); + return this; + } + + /** + * Creates a new client that connects to the masters. + * Doesn't block and won't throw an exception if the masters don't exist. + * @return a new asynchronous Kudu client + */ + public KuduClient build() { + AsyncKuduClient client = clientBuilder.build(); + return new KuduClient(client); + } + + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduException.java b/java/kudu-client/src/main/java/org/kududb/client/KuduException.java new file mode 100644 index 000000000000..726649b257fe --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduException.java @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * The parent class of all {@link RuntimeException} created by this package. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public abstract class KuduException extends RuntimeException { + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + */ + KuduException(final String msg) { + super(msg); + } + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + * @param cause The exception that caused this one to be thrown. + */ + KuduException(final String msg, final Throwable cause) { + super(msg, cause); + } + + /** + * Factory method to make it possible to create an exception from another + * one without having to resort to reflection, which is annoying to use. 
+ * Sub-classes that want to provide this internal functionality should + * implement this method. + * @param arg Some arbitrary parameter to help build the new instance. + * @param rpc The RPC that failed, if any. Can be {@code null}. + */ + KuduException make(final Object arg, final KuduRpc rpc) { + throw new AssertionError("Must not be used."); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduRpc.java b/java/kudu-client/src/main/java/org/kududb/client/KuduRpc.java new file mode 100644 index 000000000000..9ef81b88e651 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduRpc.java @@ -0,0 +1,306 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import com.google.protobuf.CodedOutputStream; +import com.google.protobuf.InvalidProtocolBufferException; +import com.google.protobuf.Message; +import com.stumbleupon.async.Deferred; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.util.Pair; +import org.kududb.util.Slice; +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.buffer.ChannelBuffers; + +import java.io.IOException; + +import static org.kududb.client.ExternalConsistencyMode.CLIENT_PROPAGATED; + +/** + * Abstract base class for all RPC requests going out to Kudu. + *

+ * Implementations of this class are not expected to be synchronized. + * + *

A note on passing {@code byte} arrays as arguments

+ * None of the method that receive a {@code byte[]} in argument will copy it. + * If you change the contents of any byte array you give to an instance of + * this class, you may affect the behavior of the request in an + * unpredictable way. If you need to change the byte array, + * {@link Object#clone() clone} it before giving it to this class. For those + * familiar with the term "defensive copy", we don't do it in order to avoid + * unnecessary memory copies when you know you won't be changing (or event + * holding a reference to) the byte array, which is frequently the case. + */ +@InterfaceAudience.Private +public abstract class KuduRpc { + + // Service names. + protected static final String MASTER_SERVICE_NAME = "kudu.master.MasterService"; + protected static final String TABLET_SERVER_SERVICE_NAME = "kudu.tserver.TabletServerService"; + + public interface HasKey { + /** + * Returns the partition key this RPC is for. + *

+ * DO NOT MODIFY THE CONTENTS OF THE ARRAY RETURNED. + */ + byte[] partitionKey(); + } + + /** + * The Deferred that will be invoked when this RPC completes or fails. + * In case of a successful completion, this Deferred's first callback + * will be invoked with an {@link Object} containing the de-serialized + * RPC response in argument. + * Once an RPC has been used, we create a new Deferred for it, in case + * the user wants to re-use it. + */ + private Deferred deferred; + + private AsyncKuduClient.RemoteTablet tablet; + + final KuduTable table; + + final DeadlineTracker deadlineTracker; + + protected long propagatedTimestamp = -1; + protected ExternalConsistencyMode externalConsistencyMode = CLIENT_PROPAGATED; + + /** + * How many times have we retried this RPC?. + * Proper synchronization is required, although in practice most of the code + * that access this attribute will have a happens-before relationship with + * the rest of the code, due to other existing synchronization. + */ + byte attempt; // package-private for TabletClient and AsyncKuduClient only. + + KuduRpc(KuduTable table) { + this.table = table; + this.deadlineTracker = new DeadlineTracker(); + } + + /** + * To be implemented by the concrete sub-type. + * + * Notice that this method is package-private, so only classes within this + * package can use this as a base class. + */ + abstract ChannelBuffer serialize(Message header); + + /** + * Package private way of getting the name of the RPC service. + */ + abstract String serviceName(); + + /** + * Package private way of getting the name of the RPC method. + */ + abstract String method(); + + /** + * To be implemented by the concrete sub-type. + * This method is expected to de-serialize a response received for the + * current RPC. + * + * Notice that this method is package-private, so only classes within this + * package can use this as a base class. + * + * @param callResponse The call response from which to deserialize. 
+ * @param tsUUID A string that contains the UUID of the server that answered the RPC. + * @return An Object of type R that will be sent to callback and an Object that will be an Error + * of type TabletServerErrorPB or MasterErrorPB that will be converted into an exception and + * sent to errback. + * @throws Exception An exception that will be sent to errback. + */ + abstract Pair deserialize(CallResponse callResponse, String tsUUID) throws Exception; + + /** + * Sets the external consistency mode for this RPC. + * TODO make this cover most if not all RPCs (right now only scans and writes use this). + * @param externalConsistencyMode the mode to set + */ + public void setExternalConsistencyMode(ExternalConsistencyMode externalConsistencyMode) { + this.externalConsistencyMode = externalConsistencyMode; + } + + public ExternalConsistencyMode getExternalConsistencyMode() { + return this.externalConsistencyMode; + } + + /** + * Sets the propagated timestamp for this RPC. + * @param propagatedTimestamp the timestamp to propagate + */ + public void setPropagatedTimestamp(long propagatedTimestamp) { + this.propagatedTimestamp = propagatedTimestamp; + } + + private void handleCallback(final Object result) { + final Deferred d = deferred; + if (d == null) { + return; + } + deferred = null; + attempt = 0; + deadlineTracker.reset(); + d.callback(result); + } + + /** + * Package private way of making an RPC complete by giving it its result. + * If this RPC has no {@link Deferred} associated to it, nothing will + * happen. This may happen if the RPC was already called back. + *

+ * Once this call to this method completes, this object can be re-used to + * re-send the same RPC, provided that no other thread still believes this + * RPC to be in-flight (guaranteeing this may be hard in error cases). + */ + final void callback(final R result) { + handleCallback(result); + } + + /** + * Same as callback, except that it accepts an Exception. + */ + final void errback(final Exception e) { + handleCallback(e); + } + + /** Package private way of accessing / creating the Deferred of this RPC. */ + final Deferred getDeferred() { + if (deferred == null) { + deferred = new Deferred(); + } + return deferred; + } + + AsyncKuduClient.RemoteTablet getTablet() { + return this.tablet; + } + + void setTablet(AsyncKuduClient.RemoteTablet tablet) { + this.tablet = tablet; + } + + public KuduTable getTable() { + return table; + } + + void setTimeoutMillis(long timeout) { + deadlineTracker.setDeadline(timeout); + } + + public String toString() { + + final StringBuilder buf = new StringBuilder(); + buf.append("KuduRpc(method="); + buf.append(method()); + buf.append(", tablet="); + if (tablet == null) { + buf.append("null"); + } else { + buf.append(tablet.getTabletIdAsString()); + } + buf.append(", attempt=").append(attempt); + buf.append(", ").append(deadlineTracker); + buf.append(", ").append(deferred); + buf.append(')'); + return buf.toString(); + } + + static void readProtobuf(final Slice slice, + final com.google.protobuf.GeneratedMessage.Builder builder) { + final int length = slice.length(); + final byte[] payload = slice.getRawArray(); + final int offset = slice.getRawOffset(); + try { + builder.mergeFrom(payload, offset, length); + if (!builder.isInitialized()) { + throw new InvalidResponseException("Could not deserialize the response," + + " incompatible RPC? 
Error is: " + builder.getInitializationErrorString(), null); + } + } catch (InvalidProtocolBufferException e) { + final String msg = "Invalid RPC response: length=" + length + + ", payload=" + Bytes.pretty(payload); + throw new InvalidResponseException(msg, e); + } + } + + static ChannelBuffer toChannelBuffer(Message header, Message pb) { + int totalSize = IPCUtil.getTotalSizeWhenWrittenDelimited(header, pb); + byte[] buf = new byte[totalSize+4]; + ChannelBuffer chanBuf = ChannelBuffers.wrappedBuffer(buf); + chanBuf.clear(); + chanBuf.writeInt(totalSize); + final CodedOutputStream out = CodedOutputStream.newInstance(buf, 4, totalSize); + try { + out.writeRawVarint32(header.getSerializedSize()); + header.writeTo(out); + + out.writeRawVarint32(pb.getSerializedSize()); + pb.writeTo(out); + out.checkNoSpaceLeft(); + } catch (IOException e) { + throw new NonRecoverableException("Cannot serialize the following message " + pb, e); + } + chanBuf.writerIndex(buf.length); + return chanBuf; + } + + /** + * Upper bound on the size of a byte array we de-serialize. + * This is to prevent Kudu from OOM'ing us, should there be a bug or + * undetected corruption of an RPC on the network, which would turn a + * an innocuous RPC into something allocating a ton of memory. + * The Hadoop RPC protocol doesn't do any checksumming as they probably + * assumed that TCP checksums would be sufficient (they're not). + */ + static final long MAX_BYTE_ARRAY_MASK = + 0xFFFFFFFFF0000000L; // => max = 256MB + + /** + * Verifies that the given length looks like a reasonable array length. + * This method accepts 0 as a valid length. + * @param buf The buffer from which the length was read. + * @param length The length to validate. + * @throws IllegalArgumentException if the length is negative or + * suspiciously large. + */ + static void checkArrayLength(final ChannelBuffer buf, final long length) { + // 2 checks in 1. 
If any of the high bits are set, we know the value is + // either too large, or is negative (if the most-significant bit is set). + if ((length & MAX_BYTE_ARRAY_MASK) != 0) { + if (length < 0) { + throw new IllegalArgumentException("Read negative byte array length: " + + length + " in buf=" + buf + '=' + Bytes.pretty(buf)); + } else { + throw new IllegalArgumentException("Read byte array length that's too" + + " large: " + length + " > " + ~MAX_BYTE_ARRAY_MASK + " in buf=" + + buf + '=' + Bytes.pretty(buf)); + } + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduRpcResponse.java b/java/kudu-client/src/main/java/org/kududb/client/KuduRpcResponse.java new file mode 100644 index 000000000000..ff10469574ae --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduRpcResponse.java @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; + +/** + * Base class for RPC responses. + */ +@InterfaceAudience.Private +abstract class KuduRpcResponse { + private final long elapsedMillis; + private final String tsUUID; + + /** + * Constructor with information common to all RPCs. 
+ * @param elapsedMillis Time in milliseconds since RPC creation to now. + * @param tsUUID A string that contains the UUID of the server that answered the RPC. + */ + KuduRpcResponse(long elapsedMillis, String tsUUID) { + this.elapsedMillis = elapsedMillis; + this.tsUUID = tsUUID; + } + + /** + * Get the number of milliseconds elapsed since the RPC was created up to the moment when this + * response was created. + * @return Elapsed time in milliseconds. + */ + public long getElapsedMillis() { + return elapsedMillis; + } + + /** + * Get the identifier of the tablet server that sent the response. + * @return A string containing a UUID. + */ + public String getTsUUID() { + return tsUUID; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduScanner.java b/java/kudu-client/src/main/java/org/kududb/client/KuduScanner.java new file mode 100644 index 000000000000..cdaaa4696922 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduScanner.java @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.stumbleupon.async.Deferred; + +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.AsyncKuduScanner.ReadMode; + +/** + * Synchronous version of {@link AsyncKuduScanner}. Offers the same API but with blocking methods. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduScanner { + + private final AsyncKuduScanner asyncScanner; + + KuduScanner(AsyncKuduScanner asyncScanner) { + this.asyncScanner = asyncScanner; + } + + /** + * Tells if the last rpc returned that there might be more rows to scan. + * @return true if there might be more data to scan, else false + */ + public boolean hasMoreRows() { + return asyncScanner.hasMoreRows(); + } + + /** + * Scans a number of rows. + *

+ * Once this method returns {@code null} once (which indicates that this + * {@code Scanner} is done scanning), calling it again leads to an undefined + * behavior. + * @return a list of rows. + */ + public RowResultIterator nextRows() throws Exception { + Deferred d = asyncScanner.nextRows(); + return d.join(asyncScanner.scanRequestTimeout); + } + + /** + * Closes this scanner (don't forget to call this when you're done with it!). + *

+ * Closing a scanner already closed has no effect. + * @return a deferred object that indicates the completion of the request + */ + public RowResultIterator close() throws Exception { + Deferred d = asyncScanner.close(); + return d.join(asyncScanner.scanRequestTimeout); + } + + /** + * Returns the maximum number of rows that this scanner was configured to return. + * @return a long representing the maximum number of rows that can be returned + */ + public long getLimit() { + return asyncScanner.getLimit(); + } + + /** + * Returns if this scanner was configured to cache data blocks or not. + * @return true if this scanner will cache blocks, else else. + */ + public boolean getCacheBlocks() { + return asyncScanner.getCacheBlocks(); + } + + /** + * Returns the maximum number of bytes returned by the scanner, on each batch. + * @return a long representing the maximum number of bytes that a scanner can receive at once + * from a tablet server + */ + public long getBatchSizeBytes() { + return asyncScanner.getBatchSizeBytes(); + } + + /** + * Returns the ReadMode for this scanner. + * @return the configured read mode for this scanner + */ + public ReadMode getReadMode() { + return asyncScanner.getReadMode(); + } + + /** + * Returns the projection schema of this scanner. If specific columns were + * not specified during scanner creation, the table schema is returned. + * @return the projection schema for this scanner + */ + public Schema getProjectionSchema() { + return asyncScanner.getProjectionSchema(); + } + + /** + * A Builder class to build {@link KuduScanner}. + * Use {@link KuduClient#newScannerBuilder} in order to get a builder instance. + */ + @InterfaceAudience.Public + @InterfaceStability.Evolving + public static class KuduScannerBuilder + extends AbstractKuduScannerBuilder { + + KuduScannerBuilder(AsyncKuduClient client, KuduTable table) { + super(client, table); + } + + /** + * Builds a {@link KuduScanner} using the passed configurations. 
+ * @return a new {@link KuduScanner} + */ + public KuduScanner build() { + return new KuduScanner(new AsyncKuduScanner( + client, table, projectedColumnNames, projectedColumnIndexes, readMode, + scanRequestTimeout, columnRangePredicates, limit, cacheBlocks, + prefetching, lowerBoundPrimaryKey, upperBoundPrimaryKey, + lowerBoundPartitionKey, upperBoundPartitionKey, + htTimestamp, batchSizeBytes)); + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduServerException.java b/java/kudu-client/src/main/java/org/kududb/client/KuduServerException.java new file mode 100644 index 000000000000..db12c94cf9e6 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduServerException.java @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.rpc.RpcHeader; + +/** + * This class is used for errors sent in response to a RPC. 
+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public class KuduServerException extends KuduException { + + KuduServerException(String serverUuid, RpcHeader.ErrorStatusPB errorStatus) { + this(serverUuid, errorStatus.getMessage(), errorStatus.getCode().toString(), + errorStatus.getCode().getNumber(), null); + } + + KuduServerException(String serverUuid, WireProtocol.AppStatusPB appStatus) { + this(serverUuid, appStatus.getMessage(), appStatus.getCode().toString(), + appStatus.getCode().getNumber(), null); + } + + KuduServerException(String serverUuid, String message, String errorDesc, + int errCode, Throwable cause) { + super("Server[" + serverUuid + "] " + + errorDesc + "[code " + errCode + "]: " + message, cause); + } +} \ No newline at end of file diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduSession.java b/java/kudu-client/src/main/java/org/kududb/client/KuduSession.java new file mode 100644 index 000000000000..f13f9c647a5c --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduSession.java @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.kududb.annotations.*; + +import com.stumbleupon.async.Deferred; +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Synchronous version of {@link AsyncKuduSession}. + * Offers the same API but with blocking methods.

+ * + * This class is not thread-safe.

+ * + * A major difference with {@link AsyncKuduSession} is that the time spent waiting on operations is + * defined by {@link #setTimeoutMillis(long)} which defaults to getting it from + * {@link KuduClient#getDefaultOperationTimeoutMs()}. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduSession implements SessionConfiguration { + + public static final Logger LOG = LoggerFactory.getLogger(KuduSession.class); + + private final AsyncKuduSession session; + + KuduSession(AsyncKuduSession session) { + this.session = session; + } + + /** + * Blocking call with a different behavior based on the flush mode. PleaseThrottleException is + * managed by this method and will not be thrown, unlike {@link AsyncKuduSession#apply}. + *

+ *

    + *
  • AUTO_FLUSH_SYNC: the call returns when the operation is persisted, + * else it throws an exception. + *
  • AUTO_FLUSH_BACKGROUND: the call returns when the operation has been added to the buffer. + * This call should normally perform only fast in-memory operations but + * it may have to wait when the buffer is full and there's another buffer being flushed. Row + * errors can be checked by calling {@link #countPendingErrors()} and can be retrieved by calling + * {@link #getPendingErrors()}. + *
  • MANUAL_FLUSH: the call returns when the operation has been added to the buffer, + * else it throws an exception such as a NonRecoverableException if the buffer is full. + *
+ * + * @param operation operation to apply + * @return an OperationResponse for the applied Operation + * @throws Exception if anything went wrong + */ + public OperationResponse apply(Operation operation) throws Exception { + while (true) { + try { + Deferred d = session.apply(operation); + if (getFlushMode() == FlushMode.AUTO_FLUSH_SYNC) { + return d.join(getTimeoutMillis()); + } + break; + } catch (PleaseThrottleException ex) { + try { + ex.getDeferred().join(getTimeoutMillis()); + } catch (Exception e) { + // This is the error response from the buffer that was flushing, + // we can't do much with it at this point. + LOG.error("Previous batch had this exception", e); + } + } catch (Exception e) { + throw e; + } + } + return null; + } + + /** + * Blocking call that force flushes this session's buffers. Data is persisted when this call + * returns, else it will throw an exception. + * @return a list of OperationResponse, one per operation that was flushed + * @throws Exception if anything went wrong. If it's an issue with some or all batches, + * it will be of type DeferredGroupException. + */ + public List flush() throws Exception { + return session.flush().join(getTimeoutMillis()); + } + + /** + * Blocking call that flushes the buffers (see {@link #flush()} and closes the sessions. + * @return List of OperationResponse, one per operation that was flushed + * @throws Exception if anything went wrong. If it's an issue with some or all batches, + * it will be of type DeferredGroupException. 
+ */ + public List close() throws Exception { + return session.close().join(getTimeoutMillis()); + } + + @Override + public FlushMode getFlushMode() { + return session.getFlushMode(); + } + + @Override + public void setFlushMode(AsyncKuduSession.FlushMode flushMode) { + session.setFlushMode(flushMode); + } + + @Override + public void setMutationBufferSpace(int size) { + session.setMutationBufferSpace(size); + } + + @Override + public void setMutationBufferLowWatermark(float mutationBufferLowWatermarkPercentage) { + session.setMutationBufferLowWatermark(mutationBufferLowWatermarkPercentage); + } + + @Override + public void setFlushInterval(int interval) { + session.setFlushInterval(interval); + } + + @Override + public long getTimeoutMillis() { + return session.getTimeoutMillis(); + } + + @Override + public void setTimeoutMillis(long timeout) { + session.setTimeoutMillis(timeout); + } + + @Override + public boolean isClosed() { + return session.isClosed(); + } + + @Override + public boolean hasPendingOperations() { + return session.hasPendingOperations(); + } + + @Override + public void setExternalConsistencyMode(ExternalConsistencyMode consistencyMode) { + session.setExternalConsistencyMode(consistencyMode); + } + + @Override + public boolean isIgnoreAllDuplicateRows() { + return session.isIgnoreAllDuplicateRows(); + } + + @Override + public void setIgnoreAllDuplicateRows(boolean ignoreAllDuplicateRows) { + session.setIgnoreAllDuplicateRows(ignoreAllDuplicateRows); + } + + @Override + public int countPendingErrors() { + return session.countPendingErrors(); + } + + @Override + public RowErrorsAndOverflowStatus getPendingErrors() { + return session.getPendingErrors(); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/KuduTable.java b/java/kudu-client/src/main/java/org/kududb/client/KuduTable.java new file mode 100644 index 000000000000..b0dd4e7f4b79 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/KuduTable.java @@ -0,0 +1,188 
@@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import com.stumbleupon.async.Deferred; + +import java.util.List; + +/** + * A KuduTable represents a table on a particular cluster. It holds the current + * schema of the table. Any given KuduTable instance belongs to a specific AsyncKuduClient + * instance. + * + * Upon construction, the table is looked up in the catalog (or catalog cache), + * and the schema fetched for introspection. The schema is not kept in sync with the master. + * + * This class is thread-safe. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduTable { + + private final Schema schema; + private final PartitionSchema partitionSchema; + private final AsyncKuduClient client; + private final String name; + private final String tableId; + + /** + * Package-private constructor, use {@link KuduClient#openTable(String)} to get an instance. 
+ * @param client the client this instance belongs to + * @param name this table's name + * @param schema this table's schema + */ + KuduTable(AsyncKuduClient client, String name, String tableId, + Schema schema, PartitionSchema partitionSchema) { + this.schema = schema; + this.partitionSchema = partitionSchema; + this.client = client; + this.name = name; + this.tableId = tableId; + } + + /** + * Get this table's schema, as of the moment this instance was created. + * @return this table's schema + */ + public Schema getSchema() { + return this.schema; + } + + /** + * Gets the table's partition schema. + * + * This method is new, and not considered stable or suitable for public use. + * + * @return the table's partition schema. + */ + @InterfaceAudience.LimitedPrivate("Impala") + @InterfaceStability.Unstable + public PartitionSchema getPartitionSchema() { + return partitionSchema; + } + + /** + * Get this table's name. + * @return this table's name + */ + public String getName() { + return this.name; + } + + /** + * Get this table's unique identifier. + * @return this table's tableId + */ + public String getTableId() { + return tableId; + } + + /** + * Get the async client that created this instance. + * @return an async kudu client + */ + public AsyncKuduClient getAsyncClient() { + return this.client; + } + + /** + * Get a new insert configured with this table's schema. The returned object should not be reused. + * @return an insert with this table's schema + */ + public Insert newInsert() { + return new Insert(this); + } + + /** + * Get a new update configured with this table's schema. The returned object should not be reused. + * @return an update with this table's schema + */ + public Update newUpdate() { + return new Update(this); + } + + /** + * Get a new delete configured with this table's schema. The returned object should not be reused. 
+ * @return a delete with this table's schema + */ + public Delete newDelete() { + return new Delete(this); + } + + /** + * Get all the tablets for this table. This may query the master multiple times if there + * are a lot of tablets. + * @param deadline deadline in milliseconds for this method to finish + * @return a list containing the metadata and locations for each of the tablets in the + * table + * @throws Exception + */ + public List getTabletsLocations( + long deadline) throws Exception { + return getTabletsLocations(null, null, deadline); + } + + /** + * Asynchronously get all the tablets for this table. + * @param deadline max time spent in milliseconds for the deferred result of this method to + * get called back, if deadline is reached, the deferred result will get erred back + * @return a {@link Deferred} object that yields a list containing the metadata and + * locations for each of the tablets in the table + */ + public Deferred> asyncGetTabletsLocations( + long deadline) throws Exception { + return asyncGetTabletsLocations(null, null, deadline); + } + + /** + * Get all or some tablets for this table. This may query the master multiple times if there + * are a lot of tablets. + * This method blocks until it gets all the tablets. + * @param startKey where to start in the table, pass null to start at the beginning + * @param endKey where to stop in the table, pass null to get all the tablets until the end of + * the table + * @param deadline deadline in milliseconds for this method to finish + * @return a list containing the metadata and locations for each of the tablets in the + * table + * @throws Exception + */ + public List getTabletsLocations( + byte[] startKey, byte[] endKey, long deadline) throws Exception { + return client.syncLocateTable(tableId, startKey, endKey, deadline); + } + + /** + * Asynchronously get all or some tablets for this table. 
+ * @param startKey where to start in the table, pass null to start at the beginning + * @param endKey where to stop in the table, pass null to get all the tablets until the end of + * the table + * @param deadline max time spent in milliseconds for the deferred result of this method to + * get called back, if deadline is reached, the deferred result will get erred back + * @return a {@link Deferred} object that yields a list containing the metadata and locations + * for each of the tablets in the table + */ + public Deferred> asyncGetTabletsLocations( + byte[] startKey, byte[] endKey, long deadline) throws Exception { + return client.locateTable(tableId, startKey, endKey, deadline); + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ListTablesRequest.java b/java/kudu-client/src/main/java/org/kududb/client/ListTablesRequest.java new file mode 100644 index 000000000000..78725f073d77 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ListTablesRequest.java @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.master.Master; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +import java.util.ArrayList; +import java.util.List; + +@InterfaceAudience.Private +class ListTablesRequest extends KuduRpc { + + private final String nameFilter; + + ListTablesRequest(KuduTable masterTable, String nameFilter) { + super(masterTable); + this.nameFilter = nameFilter; + } + + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + final Master.ListTablesRequestPB.Builder builder = + Master.ListTablesRequestPB.newBuilder(); + if (nameFilter != null) { + builder.setNameFilter(nameFilter); + } + return toChannelBuffer(header, builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return "ListTables"; + } + + @Override + Pair deserialize(CallResponse callResponse, + String tsUUID) throws Exception { + final Master.ListTablesResponsePB.Builder respBuilder = + Master.ListTablesResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), respBuilder); + int serversCount = respBuilder.getTablesCount(); + List tables = new ArrayList(serversCount); + for (Master.ListTablesResponsePB.TableInfo info : respBuilder.getTablesList()) { + tables.add(info.getName()); + } + ListTablesResponse response = new ListTablesResponse(deadlineTracker.getElapsedMillis(), + tsUUID, tables); + return new Pair( + response, respBuilder.hasError() ? 
respBuilder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ListTablesResponse.java b/java/kudu-client/src/main/java/org/kududb/client/ListTablesResponse.java new file mode 100644 index 000000000000..70daee23254a --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ListTablesResponse.java @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import java.util.List; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class ListTablesResponse extends KuduRpcResponse { + + private final List tablesList; + + ListTablesResponse(long ellapsedMillis, String tsUUID, List tablesList) { + super(ellapsedMillis, tsUUID); + this.tablesList = tablesList; + } + + /** + * Get the list of tables as specified in the request. 
+ * @return a list of table names + */ + public List getTablesList() { + return tablesList; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ListTabletServersRequest.java b/java/kudu-client/src/main/java/org/kududb/client/ListTabletServersRequest.java new file mode 100644 index 000000000000..bf26626e1dbd --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ListTabletServersRequest.java @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.protobuf.Message; +import static org.kududb.master.Master.*; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +import java.util.ArrayList; +import java.util.List; + +@InterfaceAudience.Private +public class ListTabletServersRequest extends KuduRpc { + + public ListTabletServersRequest(KuduTable masterTable) { + super(masterTable); + } + @Override + ChannelBuffer serialize(Message header) { + assert header.isInitialized(); + final ListTabletServersRequestPB.Builder builder = + ListTabletServersRequestPB.newBuilder(); + return toChannelBuffer(header, builder.build()); + } + + @Override + String serviceName() { return MASTER_SERVICE_NAME; } + + @Override + String method() { + return "ListTabletServers"; + } + + @Override + Pair deserialize(CallResponse callResponse, + String tsUUID) throws Exception { + final ListTabletServersResponsePB.Builder respBuilder = + ListTabletServersResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), respBuilder); + int serversCount = respBuilder.getServersCount(); + List servers = new ArrayList(serversCount); + for (ListTabletServersResponsePB.Entry entry : respBuilder.getServersList()) { + servers.add(entry.getRegistration().getRpcAddresses(0).getHost()); + } + ListTabletServersResponse response = new ListTabletServersResponse(deadlineTracker + .getElapsedMillis(), tsUUID, serversCount, servers); + return new Pair( + response, respBuilder.hasError() ? 
respBuilder.getError() : null); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ListTabletServersResponse.java b/java/kudu-client/src/main/java/org/kududb/client/ListTabletServersResponse.java new file mode 100644 index 000000000000..373a14d08e1a --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ListTabletServersResponse.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import java.util.List; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class ListTabletServersResponse extends KuduRpcResponse { + + private final int tabletServersCount; + private final List tabletServersList; + + /** + * @param ellapsedMillis Time in milliseconds since RPC creation to now. + * @param tabletServersCount How many tablet servers the master is reporting. + * @param tabletServersList List of tablet servers. 
+ */ + ListTabletServersResponse(long ellapsedMillis, String tsUUID, + int tabletServersCount, List tabletServersList) { + super(ellapsedMillis, tsUUID); + this.tabletServersCount = tabletServersCount; + this.tabletServersList = tabletServersList; + } + + /** + * Get the count of tablet servers as reported by the master. + * @return TS count. + */ + public int getTabletServersCount() { + return tabletServersCount; + } + + /** + * Get the list of tablet servers, as represented by their hostname. + * @return List of hostnames, one per TS. + */ + public List getTabletServersList() { + return tabletServersList; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/LocatedTablet.java b/java/kudu-client/src/main/java/org/kududb/client/LocatedTablet.java new file mode 100644 index 000000000000..28caa77a7b69 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/LocatedTablet.java @@ -0,0 +1,141 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.kududb.client; + +import java.util.List; + +import com.google.common.collect.Lists; +import com.google.common.collect.ImmutableList; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.consensus.Metadata.RaftPeerPB.Role; +import org.kududb.master.Master.TabletLocationsPB; +import org.kududb.master.Master.TabletLocationsPB.ReplicaPB; + +/** + * Information about the locations of tablets in a Kudu table. + * This should be treated as immutable data (it does not reflect + * any updates the client may have heard since being constructed). + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class LocatedTablet { + private final Partition partition; + private final byte[] tabletId; + + private final List replicas; + + LocatedTablet(TabletLocationsPB pb) { + this.partition = ProtobufHelper.pbToPartition(pb.getPartition()); + this.tabletId = pb.getTabletId().toByteArray(); + + List reps = Lists.newArrayList(); + for (ReplicaPB repPb : pb.getReplicasList()) { + reps.add(new Replica(repPb)); + } + this.replicas = ImmutableList.copyOf(reps); + } + + public List getReplicas() { + return replicas; + } + + public Partition getPartition() { + return partition; + } + + /** + * DEPRECATED: use {@link #getPartition()} + */ + @Deprecated + public byte[] getStartKey() { + return getPartition().getPartitionKeyStart(); + } + + /** + * DEPRECATED: use {@link #getPartition()} + */ + @Deprecated() + public byte[] getEndKey() { + return getPartition().getPartitionKeyEnd(); + } + + public byte[] getTabletId() { + return tabletId; + } + + /** + * Return the current leader, or null if there is none. + */ + public Replica getLeaderReplica() { + return getOneOfRoleOrNull(Role.LEADER); + } + + /** + * Return the first occurrence for the given role, or null if there is none. 
+ */ + private Replica getOneOfRoleOrNull(Role role) { + for (Replica r : replicas) { + if (r.getRole() == role.toString()) return r; + } + return null; + } + + @Override + public String toString() { + return Bytes.pretty(tabletId) + " " + partition.toString(); + } + + /** + * One of the replicas of the tablet. + */ + @InterfaceAudience.Public + @InterfaceStability.Evolving + public static class Replica { + private final ReplicaPB pb; + + private Replica(ReplicaPB pb) { + this.pb = pb; + } + + public String getRpcHost() { + if (pb.getTsInfo().getRpcAddressesList().isEmpty()) { + return null; + } + return pb.getTsInfo().getRpcAddressesList().get(0).getHost(); + } + + public Integer getRpcPort() { + if (pb.getTsInfo().getRpcAddressesList().isEmpty()) { + return null; + } + return pb.getTsInfo().getRpcAddressesList().get(0).getPort(); + } + + public String getRole() { + return pb.getRole().toString(); + } + + public String toString() { + return pb.toString(); + } + } + +}; diff --git a/java/kudu-client/src/main/java/org/kududb/client/MasterErrorException.java b/java/kudu-client/src/main/java/org/kududb/client/MasterErrorException.java new file mode 100644 index 000000000000..2bbb7a50a568 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/MasterErrorException.java @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.master.Master; +import org.kududb.rpc.RpcHeader; + +/** + * This exception is thrown when a Master responds to an RPC with an error message + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public class MasterErrorException extends KuduServerException { + + MasterErrorException(String serverUuid, RpcHeader.ErrorStatusPB errorStatus) { + super(serverUuid, errorStatus); + } + + MasterErrorException(String serverUuid, Master.MasterErrorPB error) { + super(serverUuid, error.getStatus()); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/NoLeaderMasterFoundException.java b/java/kudu-client/src/main/java/org/kududb/client/NoLeaderMasterFoundException.java new file mode 100644 index 000000000000..3cc502e8a85f --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/NoLeaderMasterFoundException.java @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+package org.kududb.client;
+
+import com.google.common.base.Functions;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+import org.kududb.annotations.InterfaceAudience;
+import org.kududb.annotations.InterfaceStability;
+
+import java.util.List;
+
+/**
+ * Indicates that the request failed because we couldn't find a leader master server.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public final class NoLeaderMasterFoundException extends RecoverableException {
+
+  NoLeaderMasterFoundException(final String msg) {
+    super(msg);
+  }
+  NoLeaderMasterFoundException(final String msg, final Exception cause) {
+    super(msg, cause);
+  }
+
+  /**
+   * Factory method that creates a NoLeaderMasterFoundException given a message and a list
+   * (which may be empty, but must be initialized) of exceptions encountered: they indicate
+   * why {@link GetMasterRegistrationRequest} calls to the masters in the config
+   * have failed, to aid in debugging the issue. If the list is non-empty, each exception's
+   * 'toString()' message is appended to 'msg' and the last exception is used as the
+   * cause for the exception.
+   * @param msg A message detailing why this exception occurred.
+   * @param causes List of exceptions encountered when retrieving registration from individual
+   * masters.
+   * @return An instantiated NoLeaderMasterFoundException which can be thrown.
+ */ + static NoLeaderMasterFoundException create(String msg, List causes) { + if (causes.isEmpty()) { + return new NoLeaderMasterFoundException(msg); + } + String joinedMsg = msg + ". Exceptions received: " + + Joiner.on(",").join(Lists.transform(causes, Functions.toStringFunction())); + return new NoLeaderMasterFoundException(joinedMsg, causes.get(causes.size() - 1)); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/NonRecoverableException.java b/java/kudu-client/src/main/java/org/kududb/client/NonRecoverableException.java new file mode 100644 index 000000000000..a11a05e546ce --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/NonRecoverableException.java @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public class NonRecoverableException extends KuduException { + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + */ + NonRecoverableException(final String msg) { + super(msg); + } + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + * @param cause The exception that caused this one to be thrown. + */ + NonRecoverableException(final String msg, final Throwable cause) { + super(msg, cause); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/Operation.java b/java/kudu-client/src/main/java/org/kududb/client/Operation.java new file mode 100644 index 000000000000..61b788968b23 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Operation.java @@ -0,0 +1,289 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.protobuf.ByteString; +import com.google.protobuf.Message; +import com.google.protobuf.ZeroCopyLiteralByteString; + +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.WireProtocol.RowOperationsPB; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.tserver.Tserver; +import org.kududb.util.Pair; +import org.jboss.netty.buffer.ChannelBuffer; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.ArrayList; +import java.util.List; + +/** + * Base class for the RPCs that related to WriteRequestPB. It contains almost all the logic + * and knows how to serialize its child classes. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public abstract class Operation extends KuduRpc implements KuduRpc.HasKey { + + // Number given by the session when apply()'d for the first time. Necessary to retain operations + // in their original order even after tablet lookup. 
+ private long sequenceNumber = -1; + + enum ChangeType { + INSERT((byte)RowOperationsPB.Type.INSERT.getNumber()), + UPDATE((byte)RowOperationsPB.Type.UPDATE.getNumber()), + DELETE((byte)RowOperationsPB.Type.DELETE.getNumber()), + SPLIT_ROWS((byte)RowOperationsPB.Type.SPLIT_ROW.getNumber()); + + ChangeType(byte encodedByte) { + this.encodedByte = encodedByte; + } + + byte toEncodedByte() { + return encodedByte; + } + + /** The byte used to encode this in a RowOperationsPB */ + private byte encodedByte; + } + + static final String METHOD = "Write"; + + private final PartialRow row; + + /** + * Package-private constructor. Subclasses need to be instantiated via AsyncKuduSession + * @param table table with the schema to use for this operation + */ + Operation(KuduTable table) { + super(table); + this.row = table.getSchema().newPartialRow(); + } + + /** + * Classes extending Operation need to have a specific ChangeType + * @return Operation's ChangeType + */ + abstract ChangeType getChangeType(); + + + /** + * Sets the sequence number used when batching operations. Should only be called once. + * @param sequenceNumber a new sequence number + */ + void setSequenceNumber(long sequenceNumber) { + assert (this.sequenceNumber == -1); + this.sequenceNumber = sequenceNumber; + } + + /** + * Returns the sequence number given to this operation. 
+ * @return a long representing the sequence number given to this operation after it was applied, + * can be -1 if it wasn't set + */ + long getSequenceNumber() { + return this.sequenceNumber; + } + + @Override + String serviceName() { return TABLET_SERVER_SERVICE_NAME; } + + @Override + String method() { + return METHOD; + } + + @Override + ChannelBuffer serialize(Message header) { + final Tserver.WriteRequestPB.Builder builder = createAndFillWriteRequestPB(this); + builder.setTabletId(ZeroCopyLiteralByteString.wrap(getTablet().getTabletIdAsBytes())); + builder.setExternalConsistencyMode(this.externalConsistencyMode.pbVersion()); + if (this.propagatedTimestamp != AsyncKuduClient.NO_TIMESTAMP) { + builder.setPropagatedTimestamp(this.propagatedTimestamp); + } + return toChannelBuffer(header, builder.build()); + } + + @Override + Pair deserialize(CallResponse callResponse, + String tsUUID) throws Exception { + Tserver.WriteResponsePB.Builder builder = Tserver.WriteResponsePB.newBuilder(); + readProtobuf(callResponse.getPBMessage(), builder); + Tserver.WriteResponsePB.PerRowErrorPB error = null; + if (builder.getPerRowErrorsCount() != 0) { + error = builder.getPerRowErrors(0); + } + OperationResponse response = new OperationResponse(deadlineTracker.getElapsedMillis(), tsUUID, + builder.getTimestamp(), this, error); + return new Pair( + response, builder.hasError() ? builder.getError() : null); + } + + @Override + public byte[] partitionKey() { + return this.getTable().getPartitionSchema().encodePartitionKey(row); + } + + /** + * Get the underlying row to modify. + * @return a partial row that will be sent with this Operation + */ + public PartialRow getRow() { + return this.row; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(super.toString()); + sb.append(" row_key="); + sb.append(row.stringifyRowKey()); + return sb.toString(); + } + + /** + * Helper method that puts a list of Operations together into a WriteRequestPB. 
+ * @param operations The list of ops to put together in a WriteRequestPB + * @return A fully constructed WriteRequestPB containing the passed rows, or + * null if no rows were passed. + */ + static Tserver.WriteRequestPB.Builder createAndFillWriteRequestPB(Operation... operations) { + if (operations == null || operations.length == 0) return null; + Schema schema = operations[0].table.getSchema(); + RowOperationsPB rowOps = new OperationsEncoder().encodeOperations(operations); + if (rowOps == null) return null; + + Tserver.WriteRequestPB.Builder requestBuilder = Tserver.WriteRequestPB.newBuilder(); + requestBuilder.setSchema(ProtobufHelper.schemaToPb(schema)); + requestBuilder.setRowOperations(rowOps); + return requestBuilder; + } + + static class OperationsEncoder { + private Schema schema; + private ByteBuffer rows; + // We're filling this list as we go through the operations in encodeRow() and at the same time + // compute the total size, which will be used to right-size the array in toPB(). + private List indirect; + private long indirectWrittenBytes; + + /** + * Initializes the state of the encoder based on the schema and number of operations to encode. + * + * @param schema the schema of the table which the operations belong to. + * @param numOperations the number of operations. + */ + private void init(Schema schema, int numOperations) { + this.schema = schema; + + // Set up the encoded data. + // Estimate a maximum size for the data. This is conservative, but avoids + // having to loop through all the operations twice. + final int columnBitSetSize = Bytes.getBitSetSize(schema.getColumnCount()); + int sizePerRow = 1 /* for the op type */ + schema.getRowSize() + columnBitSetSize; + if (schema.hasNullableColumns()) { + // nullsBitSet is the same size as the columnBitSet + sizePerRow += columnBitSetSize; + } + + // TODO: would be more efficient to use a buffer which "chains" smaller allocations + // instead of a doubling buffer like BAOS. 
+ this.rows = ByteBuffer.allocate(sizePerRow * numOperations) + .order(ByteOrder.LITTLE_ENDIAN); + this.indirect = new ArrayList<>(schema.getVarLengthColumnCount() * numOperations); + } + + /** + * Builds the row operations protobuf message with encoded operations. + * @return the row operations protobuf message. + */ + private RowOperationsPB toPB() { + RowOperationsPB.Builder rowOpsBuilder = RowOperationsPB.newBuilder(); + + // TODO: we could implement a ZeroCopy approach here by subclassing LiteralByteString. + // We have ZeroCopyLiteralByteString, but that only supports an entire array. Here + // we've only partially filled in rows.array(), so we have to make the extra copy. + rows.limit(rows.position()); + rows.flip(); + rowOpsBuilder.setRows(ByteString.copyFrom(rows)); + if (indirect.size() > 0) { + // TODO: same as above, we could avoid a copy here by using an implementation that allows + // zero-copy on a slice of an array. + byte[] indirectData = new byte[(int)indirectWrittenBytes]; + int offset = 0; + for (ByteBuffer bb : indirect) { + int bbSize = bb.remaining(); + bb.get(indirectData, offset, bbSize); + offset += bbSize; + } + rowOpsBuilder.setIndirectData(ZeroCopyLiteralByteString.wrap(indirectData)); + } + return rowOpsBuilder.build(); + } + + private void encodeRow(PartialRow row, ChangeType type) { + rows.put(type.toEncodedByte()); + rows.put(Bytes.fromBitSet(row.getColumnsBitSet(), schema.getColumnCount())); + if (schema.hasNullableColumns()) { + rows.put(Bytes.fromBitSet(row.getNullsBitSet(), schema.getColumnCount())); + } + int colIdx = 0; + byte[] rowData = row.getRowAlloc(); + int currentRowOffset = 0; + for (ColumnSchema col : row.getSchema().getColumns()) { + // Keys should always be specified, maybe check? 
+ if (row.isSet(colIdx) && !row.isSetToNull(colIdx)) { + if (col.getType() == Type.STRING || col.getType() == Type.BINARY) { + ByteBuffer varLengthData = row.getVarLengthData().get(colIdx); + varLengthData.reset(); + rows.putLong(indirectWrittenBytes); + int bbSize = varLengthData.remaining(); + rows.putLong(bbSize); + indirect.add(varLengthData); + indirectWrittenBytes += bbSize; + } else { + // This is for cols other than strings + rows.put(rowData, currentRowOffset, col.getType().getSize()); + } + } + currentRowOffset += col.getType().getSize(); + colIdx++; + } + } + + public RowOperationsPB encodeOperations(Operation... operations) { + if (operations == null || operations.length == 0) return null; + init(operations[0].table.getSchema(), operations.length); + for (Operation operation : operations) { + encodeRow(operation.row, operation.getChangeType()); + } + return toPB(); + } + + public RowOperationsPB encodeSplitRows(List rows) { + if (rows == null || rows.isEmpty()) return null; + init(rows.get(0).getSchema(), rows.size()); + for (PartialRow row : rows) { + encodeRow(row, ChangeType.SPLIT_ROWS); + } + return toPB(); + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/OperationResponse.java b/java/kudu-client/src/main/java/org/kududb/client/OperationResponse.java new file mode 100644 index 000000000000..3e3e0e957b95 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/OperationResponse.java @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.tserver.Tserver; + +import java.util.ArrayList; +import java.util.List; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class OperationResponse extends KuduRpcResponse { + + private final long writeTimestamp; + private final RowError rowError; + private final Operation operation; + + /** + * Package-private constructor to build an OperationResponse with a row error in the pb format. + * @param elapsedMillis time in milliseconds since RPC creation to now + * @param writeTimestamp HT's write timestamp + * @param operation the operation that created this response + * @param errorPB a row error in pb format, can be null + */ + OperationResponse(long elapsedMillis, String tsUUID, long writeTimestamp, + Operation operation, Tserver.WriteResponsePB.PerRowErrorPB errorPB) { + super(elapsedMillis, tsUUID); + this.writeTimestamp = writeTimestamp; + this.rowError = errorPB == null ? null : RowError.fromRowErrorPb(errorPB, operation, tsUUID); + this.operation = operation; + } + + /** + * Package-private constructor to build an OperationResponse with a row error. 
+ * @param elapsedMillis time in milliseconds since RPC creation to now + * @param writeTimestamp HT's write timestamp + * @param operation the operation that created this response + * @param rowError a parsed row error, can be null + */ + OperationResponse(long elapsedMillis, String tsUUID, long writeTimestamp, + Operation operation, RowError rowError) { + super(elapsedMillis, tsUUID); + this.writeTimestamp = writeTimestamp; + this.rowError = rowError; + this.operation = operation; + } + + /** + * Utility method that collects all the row errors from the given list of responses. + * @param responses a list of operation responses to collect the row errors from + * @return a combined list of row errors + */ + public static List collectErrors(List responses) { + List errors = new ArrayList<>(responses.size()); + for (OperationResponse resp : responses) { + if (resp.hasRowError()) { + errors.add(resp.getRowError()); + } + } + return errors; + } + + /** + * Gives the write timestamp that was returned by the Tablet Server. + * @return a timestamp in milliseconds, 0 if the external consistency mode set in AsyncKuduSession + * wasn't CLIENT_PROPAGATED + */ + public long getWriteTimestamp() { + return writeTimestamp; + } + + /** + * Returns a row error. If {@link #hasRowError()} returns false, then this method returns null. + * @return a row error, or null if the operation was successful + */ + public RowError getRowError() { + return rowError; + } + + /** + * Tells if this operation response contains a row error. + * @return true if this operation response has errors, else false + */ + public boolean hasRowError() { + return rowError != null; + } + + /** + * Returns the operation associated with this response. 
+ * @return an operation, cannot be null + */ + Operation getOperation() { + return operation; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/PartialRow.java b/java/kudu-client/src/main/java/org/kududb/client/PartialRow.java new file mode 100644 index 000000000000..b5f30694ce98 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/PartialRow.java @@ -0,0 +1,626 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Class used to represent parts of a row along with its schema.

+ * + * Values can be replaced as often as needed, but once the enclosing {@link Operation} is applied + * then they cannot be changed again. This means that a PartialRow cannot be reused.

+ * + * Each PartialRow is backed by an byte array where all the cells (except strings and binary data) + * are written. The others are kept in a List.

+ * + * This class isn't thread-safe. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class PartialRow { + + private final Schema schema; + + // Variable length data. If string, will be UTF-8 encoded. Elements of this list _must_ have a + // mark that we can reset() to. Readers of these fields (encoders, etc) must call reset() before + // attempting to read these values. + private final List varLengthData; + private final byte[] rowAlloc; + + private final BitSet columnsBitSet; + private final BitSet nullsBitSet; + + private boolean frozen = false; + + /** + * This is not a stable API, prefer using {@link Schema#newPartialRow()} + * to create a new partial row. + * @param schema the schema to use for this row + */ + public PartialRow(Schema schema) { + this.schema = schema; + this.columnsBitSet = new BitSet(this.schema.getColumnCount()); + this.nullsBitSet = schema.hasNullableColumns() ? + new BitSet(this.schema.getColumnCount()) : null; + this.rowAlloc = new byte[schema.getRowSize()]; + // Pre-fill the array with nulls. We'll only replace cells that have varlen values. + this.varLengthData = Arrays.asList(new ByteBuffer[this.schema.getColumnCount()]); + } + + /** + * Creates a new partial row by deep-copying the data-fields of the provided partial row. + * @param row the partial row to copy + */ + PartialRow(PartialRow row) { + this.schema = row.schema; + + this.varLengthData = Lists.newArrayListWithCapacity(row.varLengthData.size()); + for (ByteBuffer data: row.varLengthData) { + if (data == null) { + this.varLengthData.add(null); + } else { + data.reset(); + // Deep copy the ByteBuffer. + ByteBuffer clone = ByteBuffer.allocate(data.remaining()); + clone.put(data); + clone.flip(); + + clone.mark(); // We always expect a mark. + this.varLengthData.add(clone); + } + } + + this.rowAlloc = row.rowAlloc.clone(); + this.columnsBitSet = (BitSet) row.columnsBitSet.clone(); + this.nullsBitSet = row.nullsBitSet == null ? 
null : (BitSet) row.nullsBitSet.clone(); + } + + /** + * Add a boolean for the specified column. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addBoolean(int columnIndex, boolean val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.BOOL); + rowAlloc[getPositionInRowAllocAndSetBitSet(columnIndex)] = (byte) (val ? 1 : 0); + } + + /** + * Add a boolean for the specified column. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addBoolean(String columnName, boolean val) { + addBoolean(schema.getColumnIndex(columnName), val); + } + + /** + * Add a byte for the specified column. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addByte(int columnIndex, byte val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.INT8); + rowAlloc[getPositionInRowAllocAndSetBitSet(columnIndex)] = val; + } + + /** + * Add a byte for the specified column. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addByte(String columnName, byte val) { + addByte(schema.getColumnIndex(columnName), val); + } + + /** + * Add a short for the specified column. 
+ * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addShort(int columnIndex, short val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.INT16); + Bytes.setShort(rowAlloc, val, getPositionInRowAllocAndSetBitSet(columnIndex)); + } + + /** + * Add a short for the specified column. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addShort(String columnName, short val) { + addShort(schema.getColumnIndex(columnName), val); + } + + /** + * Add an int for the specified column. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addInt(int columnIndex, int val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.INT32); + Bytes.setInt(rowAlloc, val, getPositionInRowAllocAndSetBitSet(columnIndex)); + } + + /** + * Add an int for the specified column. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addInt(String columnName, int val) { + addInt(schema.getColumnIndex(columnName), val); + } + + /** + * Add an long for the specified column. 
+ * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addLong(int columnIndex, long val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.INT64, Type.TIMESTAMP); + Bytes.setLong(rowAlloc, val, getPositionInRowAllocAndSetBitSet(columnIndex)); + } + + /** + * Add an long for the specified column. + * + * If this is a TIMESTAMP column, the long value provided should be the number of microseconds + * between a given time and January 1, 1970 UTC. + * For example, to encode the current time, use setLong(System.currentTimeMillis() * 1000); + * + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addLong(String columnName, long val) { + addLong(schema.getColumnIndex(columnName), val); + } + + /** + * Add an float for the specified column. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addFloat(int columnIndex, float val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.FLOAT); + Bytes.setFloat(rowAlloc, val, getPositionInRowAllocAndSetBitSet(columnIndex)); + } + + /** + * Add an float for the specified column. 
+ * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addFloat(String columnName, float val) { + addFloat(schema.getColumnIndex(columnName), val); + } + + /** + * Add an double for the specified column. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addDouble(int columnIndex, double val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.DOUBLE); + Bytes.setDouble(rowAlloc, val, getPositionInRowAllocAndSetBitSet(columnIndex)); + } + + /** + * Add an double for the specified column. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addDouble(String columnName, double val) { + addDouble(schema.getColumnIndex(columnName), val); + } + + /** + * Add a String for the specified column. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addString(int columnIndex, String val) { + addStringUtf8(columnIndex, Bytes.fromString(val)); + } + + /** + * Add a String for the specified column. 
+ * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addString(String columnName, String val) { + addStringUtf8(columnName, Bytes.fromString(val)); + } + + /** + * Add a String for the specified value, encoded as UTF8. + * Note that the provided value must not be mutated after this. + * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addStringUtf8(int columnIndex, byte[] val) { + // TODO: use Utf8.isWellFormed from Guava 16 to verify that + // the user isn't putting in any garbage data. + checkColumn(schema.getColumnByIndex(columnIndex), Type.STRING); + addVarLengthData(columnIndex, val); + } + + /** + * Add a String for the specified value, encoded as UTF8. + * Note that the provided value must not be mutated after this. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + * + */ + public void addStringUtf8(String columnName, byte[] val) { + addStringUtf8(schema.getColumnIndex(columnName), val); + } + + /** + * Add binary data with the specified value. + * Note that the provided value must not be mutated after this. 
+ * @param columnIndex the column's index in the schema + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addBinary(int columnIndex, byte[] val) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.BINARY); + addVarLengthData(columnIndex, val); + } + + /** + * Add binary data with the specified value, from the current ByteBuffer's position to its limit. + * This method duplicates the ByteBuffer but doesn't copy the data. This means that the wrapped + * data must not be mutated after this. + * @param columnIndex the column's index in the schema + * @param value byte buffer to get the value from + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addBinary(int columnIndex, ByteBuffer value) { + checkColumn(schema.getColumnByIndex(columnIndex), Type.BINARY); + addVarLengthData(columnIndex, value); + } + + /** + * Add binary data with the specified value. + * Note that the provided value must not be mutated after this. + * @param columnName Name of the column + * @param val value to add + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addBinary(String columnName, byte[] val) { + addBinary(schema.getColumnIndex(columnName), val); + } + + /** + * Add binary data with the specified value, from the current ByteBuffer's position to its limit. + * This method duplicates the ByteBuffer but doesn't copy the data. This means that the wrapped + * data must not be mutated after this. 
+ * @param columnName Name of the column + * @param value byte buffer to get the value from + * @throws IllegalArgumentException if the column doesn't exist or if the value doesn't match + * the column's type + * @throws IllegalStateException if the row was already applied + */ + public void addBinary(String columnName, ByteBuffer value) { + addBinary(schema.getColumnIndex(columnName), value); + } + + private void addVarLengthData(int columnIndex, byte[] val) { + addVarLengthData(columnIndex, ByteBuffer.wrap(val)); + } + + private void addVarLengthData(int columnIndex, ByteBuffer val) { + // A duplicate will copy all the original's metadata but still point to the same content. + ByteBuffer duplicate = val.duplicate(); + // Mark the current position so we can reset to it. + duplicate.mark(); + + varLengthData.set(columnIndex, duplicate); + // Set the usage bit but we don't care where it is. + getPositionInRowAllocAndSetBitSet(columnIndex); + // We don't set anything in row alloc, it will be managed at encoding time. 
+ } + + /** + * Set the specified column to null + * @param columnIndex the column's index in the schema + * @throws IllegalArgumentException if the column doesn't exist or cannot be set to null + * @throws IllegalStateException if the row was already applied + */ + public void setNull(int columnIndex) { + setNull(this.schema.getColumnByIndex(columnIndex)); + } + + /** + * Set the specified column to null + * @param columnName Name of the column + * @throws IllegalArgumentException if the column doesn't exist or cannot be set to null + * @throws IllegalStateException if the row was already applied + */ + public void setNull(String columnName) { + setNull(this.schema.getColumn(columnName)); + } + + private void setNull(ColumnSchema column) { + assert nullsBitSet != null; + checkNotFrozen(); + checkColumnExists(column); + if (!column.isNullable()) { + throw new IllegalArgumentException(column.getName() + " cannot be set to null"); + } + int idx = schema.getColumns().indexOf(column); + columnsBitSet.set(idx); + nullsBitSet.set(idx); + } + + /** + * Verifies if the column exists and belongs to one of the specified types + * It also does some internal accounting + * @param column column the user wants to set + * @param types types we expect + * @throws IllegalArgumentException if the column or type was invalid + * @throws IllegalStateException if the row was already applied + */ + private void checkColumn(ColumnSchema column, Type... 
types) { + checkNotFrozen(); + checkColumnExists(column); + for(Type type : types) { + if (column.getType().equals(type)) return; + } + throw new IllegalArgumentException(String.format("%s isn't %s, it's %s", column.getName(), + Arrays.toString(types), column.getType().getName())); + } + + /** + * @param column column the user wants to set + * @throws IllegalArgumentException if the column doesn't exist + */ + private void checkColumnExists(ColumnSchema column) { + if (column == null) + throw new IllegalArgumentException("Column name isn't present in the table's schema"); + } + + /** + * @throws IllegalStateException if the row was already applied + */ + private void checkNotFrozen() { + if (frozen) { + throw new IllegalStateException("This row was already applied and cannot be modified."); + } + } + + /** + * Sets the column bit set for the column index, and returns the column's offset. + * @param columnIndex the index of the column to get the position for and mark as set + * @return the offset in rowAlloc for the column + */ + private int getPositionInRowAllocAndSetBitSet(int columnIndex) { + columnsBitSet.set(columnIndex); + return schema.getColumnOffset(columnIndex); + } + + /** + * Tells if the specified column was set by the user + * @param column column's index in the schema + * @return true if it was set, else false + */ + boolean isSet(int column) { + return this.columnsBitSet.get(column); + } + + /** + * Tells if the specified column was set to null by the user + * @param column column's index in the schema + * @return true if it was set, else false + */ + boolean isSetToNull(int column) { + if (this.nullsBitSet == null) { + return false; + } + return this.nullsBitSet.get(column); + } + + /** + * Returns the encoded primary key of the row. 
+ * @return a byte array containing an encoded primary key + */ + public byte[] encodePrimaryKey() { + return new KeyEncoder().encodePrimaryKey(this); + } + + /** + * Transforms the row key into a string representation where each column is in the format: + * "type col_name=value". + * @return a string representation of the operation's row key + */ + public String stringifyRowKey() { + int numRowKeys = schema.getPrimaryKeyColumnCount(); + StringBuilder sb = new StringBuilder(); + sb.append("("); + for (int i = 0; i < numRowKeys; i++) { + if (i > 0) { + sb.append(", "); + } + + ColumnSchema col = schema.getColumnByIndex(i); + assert !col.isNullable(); + Preconditions.checkState(columnsBitSet.get(i), + "Full row key not specified, missing at least col: " + col.getName()); + Type type = col.getType(); + sb.append(type.getName()); + sb.append(" "); + sb.append(col.getName()); + sb.append("="); + + if (type == Type.STRING || type == Type.BINARY) { + ByteBuffer value = getVarLengthData().get(i).duplicate(); + value.reset(); // Make sure we start at the beginning. + byte[] data = new byte[value.limit()]; + value.get(data); + if (type == Type.STRING) { + sb.append(Bytes.getString(data)); + } else { + sb.append(Bytes.pretty(data)); + } + } else { + switch (type) { + case INT8: + sb.append(Bytes.getByte(rowAlloc, schema.getColumnOffset(i))); + break; + case INT16: + sb.append(Bytes.getShort(rowAlloc, schema.getColumnOffset(i))); + break; + case INT32: + sb.append(Bytes.getInt(rowAlloc, schema.getColumnOffset(i))); + break; + case INT64: + sb.append(Bytes.getLong(rowAlloc, schema.getColumnOffset(i))); + break; + case TIMESTAMP: + sb.append(Bytes.getLong(rowAlloc, schema.getColumnOffset(i))); + break; + default: + throw new IllegalArgumentException(String.format( + "The column type %s is not a valid key component type", type)); + } + } + } + sb.append(")"); + + return sb.toString(); + } + + /** + * Get the schema used for this row. 
+ * @return a schema that came from KuduTable + */ + Schema getSchema() { + return schema; + } + + /** + * Get the list variable length data cells that were added to this row. + * @return a list of binary data, may be empty + */ + List getVarLengthData() { + return varLengthData; + } + + /** + * Get the byte array that contains all the data added to this partial row. Variable length data + * is contained separately, see {@link #getVarLengthData()}. In their place you'll find their + * index in that list and their size. + * @return a byte array containing the data for this row, except strings + */ + byte[] getRowAlloc() { + return rowAlloc; + } + + /** + * Get the bit set that indicates which columns were set. + * @return a bit set for columns with data + */ + BitSet getColumnsBitSet() { + return columnsBitSet; + } + + /** + * Get the bit set for the columns that were specifically set to null + * @return a bit set for null columns + */ + BitSet getNullsBitSet() { + return nullsBitSet; + } + + /** + * Prevents this PartialRow from being modified again. Can be called multiple times. + */ + void freeze() { + this.frozen = true; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/Partition.java b/java/kudu-client/src/main/java/org/kududb/client/Partition.java new file mode 100644 index 000000000000..6e8951e041c7 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Partition.java @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.base.Objects; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import java.util.Arrays; +import java.util.List; + +/** + * A Partition describes the set of rows that a Tablet is responsible for + * serving. Each tablet is assigned a single Partition.

+ * + * Partitions consist primarily of a start and end partition key. Every row with + * a partition key that falls in a Tablet's Partition will be served by that + * tablet.

+ * + * In addition to the start and end partition keys, a Partition holds metadata + * to determine if a scan can prune, or skip, a partition based on the scan's + * start and end primary keys, and predicates. + * + * This class is new, and not considered stable or suitable for public use. + */ +@InterfaceAudience.LimitedPrivate("Impala") +@InterfaceStability.Unstable +public class Partition implements Comparable { + final byte[] partitionKeyStart; + final byte[] partitionKeyEnd; + + final byte[] rangeKeyStart; + final byte[] rangeKeyEnd; + + final List hashBuckets; + + /** + * Size of an encoded hash bucket component in a partition key. + */ + private static final int ENCODED_BUCKET_SIZE = 4; + + /** + * Creates a new partition with the provided start and end keys, and hash buckets. + * @param partitionKeyStart the start partition key + * @param partitionKeyEnd the end partition key + * @param hashBuckets the partition hash buckets + */ + Partition(byte[] partitionKeyStart, + byte[] partitionKeyEnd, + List hashBuckets) { + this.partitionKeyStart = partitionKeyStart; + this.partitionKeyEnd = partitionKeyEnd; + this.hashBuckets = hashBuckets; + this.rangeKeyStart = rangeKey(partitionKeyStart, hashBuckets.size()); + this.rangeKeyEnd = rangeKey(partitionKeyEnd, hashBuckets.size()); + } + + /** + * Gets the start partition key. + * @return the start partition key + */ + public byte[] getPartitionKeyStart() { + return partitionKeyStart; + } + + /** + * Gets the end partition key. + * @return the end partition key + */ + public byte[] getPartitionKeyEnd() { + return partitionKeyEnd; + } + + /** + * Gets the start range key. + * @return the start range key + */ + public byte[] getRangeKeyStart() { + return rangeKeyStart; + } + + /** + * Gets the end range key. + * @return the end range key + */ + public byte[] getRangeKeyEnd() { + return rangeKeyEnd; + } + + /** + * Gets the partition hash buckets. 
+ * @return the partition hash buckets + */ + public List getHashBuckets() { + return hashBuckets; + } + + /** + * @return true if the partition is the absolute end partition + */ + public boolean isEndPartition() { + return partitionKeyEnd.length == 0; + } + + /** + * Equality only holds for partitions from the same table. Partition equality only takes into + * account the partition keys, since there is a 1 to 1 correspondence between partition keys and + * the hash buckets and range keys. + * + * @return the hash code + */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Partition partition = (Partition) o; + return Arrays.equals(partitionKeyStart, partition.partitionKeyStart) + && Arrays.equals(partitionKeyEnd, partition.partitionKeyEnd); + } + + /** + * The hash code only takes into account the partition keys, since there is a 1 to 1 + * correspondence between partition keys and the hash buckets and range keys. + * + * @return the hash code + */ + @Override + public int hashCode() { + return Objects.hashCode(Arrays.hashCode(partitionKeyStart), Arrays.hashCode(partitionKeyEnd)); + } + + /** + * Partition comparison is only reasonable when comparing partitions from the same table, and + * since Kudu does not yet allow partition splitting, no two distinct partitions can have the + * same start partition key. Accordingly, partitions are compared strictly by the start partition + * key. + * + * @param other the other partition of the same table + * @return the comparison of the partitions + */ + @Override + public int compareTo(Partition other) { + return Bytes.memcmp(this.partitionKeyStart, other.partitionKeyStart); + } + + /** + * Returns the range key portion of a partition key given the number of buckets in the partition + * schema. 
+ * @param partitionKey the partition key containing the range key + * @param numHashBuckets the number of hash bucket components of the table + * @return the range key + */ + private static byte[] rangeKey(byte[] partitionKey, int numHashBuckets) { + int bucketsLen = numHashBuckets * ENCODED_BUCKET_SIZE; + if (partitionKey.length > bucketsLen) { + return Arrays.copyOfRange(partitionKey, bucketsLen, partitionKey.length); + } else { + return AsyncKuduClient.EMPTY_ARRAY; + } + } + + @Override + public String toString() { + return String.format("[%s, %s)", + Bytes.pretty(partitionKeyStart), + Bytes.pretty(partitionKeyEnd)); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/PartitionSchema.java b/java/kudu-client/src/main/java/org/kududb/client/PartitionSchema.java new file mode 100644 index 000000000000..fdee32ed4030 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/PartitionSchema.java @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import java.util.List; + +/** + * A partition schema describes how the rows of a table are distributed among + * tablets. + * + * Primarily, a table's partition schema is responsible for translating the + * primary key column values of a row into a partition key that can be used to + * find the tablet containing the key. + * + * The partition schema is made up of zero or more hash bucket components, + * followed by a single range component. + * + * Each hash bucket component includes one or more columns from the primary key + * column set, with the restriction that an individual primary key column may + * only be included in a single hash component. + * + * This class is new, and not considered stable or suitable for public use. + */ +@InterfaceAudience.LimitedPrivate("Impala") +@InterfaceStability.Unstable +public class PartitionSchema { + + private final RangeSchema rangeSchema; + private final List hashBucketSchemas; + private final boolean isSimple; + + /** + * Creates a new partition schema from the range and hash bucket schemas. + * + * @param rangeSchema the range schema + * @param hashBucketSchemas the hash bucket schemas + * @param schema the table schema + */ + PartitionSchema(RangeSchema rangeSchema, + List hashBucketSchemas, + Schema schema) { + this.rangeSchema = rangeSchema; + this.hashBucketSchemas = hashBucketSchemas; + + boolean isSimple = hashBucketSchemas.isEmpty() + && rangeSchema.columns.size() == schema.getPrimaryKeyColumnCount(); + if (isSimple) { + int i = 0; + for (Integer id : rangeSchema.columns) { + if (schema.getColumnIndex(id) != i++) { + isSimple = false; + break; + } + } + } + this.isSimple = isSimple; + } + + /** + * Returns the encoded partition key of the row. 
+ * @return a byte array containing the encoded partition key of the row + */ + public byte[] encodePartitionKey(PartialRow row) { + return new KeyEncoder().encodePartitionKey(row, this); + } + + public RangeSchema getRangeSchema() { + return rangeSchema; + } + + public List getHashBucketSchemas() { + return hashBucketSchemas; + } + + /** + * Returns true if the partition schema if the partition schema does not include any hash + * components, and the range columns match the table's primary key columns. + * + * @return whether the partition schema is the default simple range partitioning. + */ + boolean isSimpleRangePartitioning() { + return isSimple; + } + + public static class RangeSchema { + private final List columns; + + RangeSchema(List columns) { + this.columns = columns; + } + + public List getColumns() { + return columns; + } + } + + public static class HashBucketSchema { + private final List columnIds; + private int numBuckets; + private int seed; + + HashBucketSchema(List columnIds, int numBuckets, int seed) { + this.columnIds = columnIds; + this.numBuckets = numBuckets; + this.seed = seed; + } + + /** + * Gets the column IDs of the columns in the hash partition. + * @return the column IDs of the columns in the has partition + */ + public List getColumnIds() { + return columnIds; + } + + public int getNumBuckets() { + return numBuckets; + } + + public int getSeed() { + return seed; + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/PleaseThrottleException.java b/java/kudu-client/src/main/java/org/kududb/client/PleaseThrottleException.java new file mode 100644 index 000000000000..dc84e6091781 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/PleaseThrottleException.java @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import com.stumbleupon.async.Deferred; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * This exception notifies the application to throttle its use of Kudu. + *

+ * Since all APIs of {@link AsyncKuduSession} are asynchronous and non-blocking, + * it's possible that the application would produce RPCs at a rate higher + * than Kudu is able to handle. When this happens, {@link AsyncKuduSession} + * will typically do some buffering up to a certain point beyond which RPCs + * will fail-fast with this exception, to prevent the application from + * running itself out of memory. + *

+ * This exception is expected to be handled by having the application + * throttle or pause itself for a short period of time before retrying the + * RPC that failed with this exception as well as before sending other RPCs. + * The reason this exception inherits from {@link NonRecoverableException} + * instead of {@link RecoverableException} is that the usual course of action + * when handling a {@link RecoverableException} is to retry right away, which + * would defeat the whole purpose of this exception. Here, we want the + * application to retry after a reasonable delay as well as throttle + * the pace of creation of new RPCs. What constitutes a "reasonable + * delay" depends on the nature of RPCs and rate at which they're produced. + *

+ * One effective strategy to handle this exception is to set a flag to true + * when this exception is first emitted that causes the application to pause + * or throttle its use of Kudu. Then you can retry the RPC that failed + * (which is accessible through {@link #getFailedRpc}) and add a callback to + * it in order to unset the flag once the RPC completes successfully. + * Note that low-throughput applications will typically rarely (if ever) + * hit this exception, so they don't need complex throttling logic. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public final class PleaseThrottleException extends NonRecoverableException + implements HasFailedRpcException { + + /** The RPC that was failed with this exception. */ + private final Operation rpc; + + /** A deferred one can wait on before retrying the failed RPC. */ + private final Deferred deferred; + + /** + * Constructor. + * @param msg A message explaining why the application has to throttle. + * @param cause The exception that requires the application to throttle + * itself (can be {@code null}). + * @param rpc The RPC that was made to fail with this exception. + * @param deferred A deferred one can wait on before retrying the failed RPC. + */ + PleaseThrottleException(final String msg, + final KuduException cause, + final Operation rpc, + final Deferred deferred) { + super(msg, cause); + this.rpc = rpc; + this.deferred = deferred; + } + + /** + * The RPC that was made to fail with this exception. + */ + public Operation getFailedRpc() { + return rpc; + } + + /** + * Returns a deferred one can wait on before retrying the failed RPC. 
+ * @since 1.3 + */ + public Deferred getDeferred() { + return deferred; + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/ProtobufHelper.java b/java/kudu-client/src/main/java/org/kududb/client/ProtobufHelper.java new file mode 100644 index 000000000000..4f91bd2e348f --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/ProtobufHelper.java @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.base.Charsets; +import com.google.common.collect.ImmutableList; +import com.google.common.net.HostAndPort; +import com.google.protobuf.ByteString; +import com.google.protobuf.ZeroCopyLiteralByteString; +import org.kududb.ColumnSchema; +import org.kududb.Common; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; + +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; + +@InterfaceAudience.Private +public class ProtobufHelper { + + /** + * Utility method to convert a Schema to its wire format. 
+ * @param schema Schema to convert + * @return a list of ColumnSchemaPB + */ + public static List schemaToListPb(Schema schema) { + ArrayList columns = + new ArrayList(schema.getColumnCount()); + Common.ColumnSchemaPB.Builder schemaBuilder = Common.ColumnSchemaPB.newBuilder(); + for (ColumnSchema col : schema.getColumns()) { + columns.add(columnToPb(schemaBuilder, col)); + schemaBuilder.clear(); + } + return columns; + } + + public static Common.SchemaPB schemaToPb(Schema schema) { + Common.SchemaPB.Builder builder = Common.SchemaPB.newBuilder(); + builder.addAllColumns(schemaToListPb(schema)); + return builder.build(); + } + + public static Common.ColumnSchemaPB columnToPb(ColumnSchema column) { + return columnToPb(Common.ColumnSchemaPB.newBuilder(), column); + } + + public static Common.ColumnSchemaPB + columnToPb(Common.ColumnSchemaPB.Builder schemaBuilder, ColumnSchema column) { + schemaBuilder + .setName(column.getName()) + .setType(column.getType().getDataType()) + .setIsKey(column.isKey()) + .setIsNullable(column.isNullable()) + .setCfileBlockSize(column.getDesiredBlockSize()); + if (column.getEncoding() != null) { + schemaBuilder.setEncoding(column.getEncoding().getInternalPbType()); + } + if (column.getCompressionAlgorithm() != null) { + schemaBuilder.setCompression(column.getCompressionAlgorithm().getInternalPbType()); + } + if (column.getDefaultValue() != null) schemaBuilder.setReadDefaultValue + (ZeroCopyLiteralByteString.wrap(objectToWireFormat(column, column.getDefaultValue()))); + return schemaBuilder.build(); + } + + public static Schema pbToSchema(Common.SchemaPB schema) { + List columns = new ArrayList<>(schema.getColumnsCount()); + List columnIds = new ArrayList<>(schema.getColumnsCount()); + for (Common.ColumnSchemaPB columnPb : schema.getColumnsList()) { + Type type = Type.getTypeForDataType(columnPb.getType()); + Object defaultValue = columnPb.hasReadDefaultValue() ? 
byteStringToObject(type, + columnPb.getReadDefaultValue()) : null; + ColumnSchema.Encoding encoding = ColumnSchema.Encoding.valueOf(columnPb.getEncoding().name()); + ColumnSchema.CompressionAlgorithm compressionAlgorithm = + ColumnSchema.CompressionAlgorithm.valueOf(columnPb.getCompression().name()); + ColumnSchema column = new ColumnSchema.ColumnSchemaBuilder(columnPb.getName(), type) + .key(columnPb.getIsKey()) + .nullable(columnPb.getIsNullable()) + .defaultValue(defaultValue) + .encoding(encoding) + .compressionAlgorithm(compressionAlgorithm) + .build(); + columns.add(column); + int id = columnPb.getId(); + if (id < 0) { + throw new IllegalArgumentException("Illegal column ID: " + id); + } + columnIds.add(id); + } + return new Schema(columns, columnIds); + } + + /** + * Factory method for creating a {@code PartitionSchema} from a protobuf message. + * + * @param pb the partition schema protobuf message + * @return a partition instance + */ + static PartitionSchema pbToPartitionSchema(Common.PartitionSchemaPB pb, Schema schema) { + List rangeColumns = pbToIds(pb.getRangeSchema().getColumnsList()); + PartitionSchema.RangeSchema rangeSchema = new PartitionSchema.RangeSchema(rangeColumns); + + ImmutableList.Builder hashSchemas = ImmutableList.builder(); + + for (Common.PartitionSchemaPB.HashBucketSchemaPB hashBucketSchemaPB + : pb.getHashBucketSchemasList()) { + List hashColumnIds = pbToIds(hashBucketSchemaPB.getColumnsList()); + + PartitionSchema.HashBucketSchema hashSchema = + new PartitionSchema.HashBucketSchema(hashColumnIds, + hashBucketSchemaPB.getNumBuckets(), + hashBucketSchemaPB.getSeed()); + + hashSchemas.add(hashSchema); + } + + return new PartitionSchema(rangeSchema, hashSchemas.build(), schema); + } + + /** + * Constructs a new {@code Partition} instance from the a protobuf message. 
+ * @param pb the protobuf message + * @return the {@code Partition} corresponding to the message + */ + static Partition pbToPartition(Common.PartitionPB pb) { + return new Partition(pb.getPartitionKeyStart().toByteArray(), + pb.getPartitionKeyEnd().toByteArray(), + pb.getHashBucketsList()); + } + + /** + * Deserializes a list of column identifier protobufs into a list of column IDs. This method + * relies on the fact that the master will aways send a partition schema with column IDs, and not + * column names (column names are only used when the client is sending the partition schema to + * the master as part of the create table process). + * + * @param columnIdentifiers the column identifiers + * @return the column IDs + */ + private static List pbToIds( + List columnIdentifiers) { + ImmutableList.Builder columnIds = ImmutableList.builder(); + for (Common.PartitionSchemaPB.ColumnIdentifierPB column : columnIdentifiers) { + switch (column.getIdentifierCase()) { + case ID: + columnIds.add(column.getId()); + break; + case NAME: + throw new IllegalArgumentException( + String.format("Expected column ID from master: %s", column)); + case IDENTIFIER_NOT_SET: + throw new IllegalArgumentException("Unknown column: " + column); + } + } + return columnIds.build(); + } + + private static byte[] objectToWireFormat(ColumnSchema col, Object value) { + switch (col.getType()) { + case BOOL: + return Bytes.fromBoolean((Boolean) value); + case INT8: + return new byte[] {(Byte) value}; + case INT16: + return Bytes.fromShort((Short) value); + case INT32: + return Bytes.fromInt((Integer) value); + case INT64: + case TIMESTAMP: + return Bytes.fromLong((Long) value); + case STRING: + return ((String) value).getBytes(Charsets.UTF_8); + case BINARY: + return (byte[]) value; + case FLOAT: + return Bytes.fromFloat((Float) value); + case DOUBLE: + return Bytes.fromDouble((Double) value); + default: + throw new IllegalArgumentException("The column " + col.getName() + " is of type " + col + 
.getType() + " which is unknown"); + } + } + + private static Object byteStringToObject(Type type, ByteString value) { + byte[] buf = ZeroCopyLiteralByteString.zeroCopyGetBytes(value); + switch (type) { + case BOOL: + return Bytes.getBoolean(buf); + case INT8: + return Bytes.getByte(buf); + case INT16: + return Bytes.getShort(buf); + case INT32: + return Bytes.getInt(buf); + case INT64: + case TIMESTAMP: + return Bytes.getLong(buf); + case FLOAT: + return Bytes.getFloat(buf); + case DOUBLE: + return Bytes.getDouble(buf); + case STRING: + return new String(buf, Charsets.UTF_8); + case BINARY: + return buf; + default: + throw new IllegalArgumentException("This type is unknown: " + type); + } + } + + /** + * Convert a {@link com.google.common.net.HostAndPort} to {@link org.kududb.Common.HostPortPB} + * protobuf message for serialization. + * @param hostAndPort The host and port object. Both host and port must be specified. + * @return An initialized HostPortPB object. + */ + public static Common.HostPortPB hostAndPortToPB(HostAndPort hostAndPort) { + return Common.HostPortPB.newBuilder() + .setHost(hostAndPort.getHostText()) + .setPort(hostAndPort.getPort()) + .build(); + } + + /** + * Convert a {@link org.kududb.Common.HostPortPB} to {@link com.google.common.net.HostAndPort}. + * @param hostPortPB The fully initialized HostPortPB object. Must have both host and port + * specified. + * @return An initialized initialized HostAndPort object. 
+ */ + public static HostAndPort hostAndPortFromPB(Common.HostPortPB hostPortPB) { + return HostAndPort.fromParts(hostPortPB.getHost(), hostPortPB.getPort()); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/RecoverableException.java b/java/kudu-client/src/main/java/org/kududb/client/RecoverableException.java new file mode 100644 index 000000000000..50ff2ac35812 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/RecoverableException.java @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * An exception for which it's typically useful to retry + *

+ * The retry strategy is up to you, but it's typically recommended to put an + * upper bound on the number of retries and to use some kind of an exponential + * backoff. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public abstract class RecoverableException extends KuduException { + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + */ + RecoverableException(final String msg) { + super(msg); + } + + /** + * Constructor. + * @param msg The message of the exception, potentially including a stack + * trace. + * @param cause The exception that caused this one to be thrown. + */ + RecoverableException(final String msg, final Exception cause) { + super(msg, cause); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/RowError.java b/java/kudu-client/src/main/java/org/kududb/client/RowError.java new file mode 100644 index 000000000000..c2e0b5929cfd --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/RowError.java @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.tserver.Tserver; + +/** + * Wrapper class for a single row error. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class RowError { + private final String status; + private final String message; + private final Operation operation; + private final String tsUUID; + + /** + * Package-private for unit tests. + */ + RowError(String errorStatus, String errorMessage, Operation operation, String tsUUID) { + this.status = errorStatus; + this.message = errorMessage; + this.operation = operation; + this.tsUUID = tsUUID; + } + + /** + * Get the string-representation of the error code that the tablet server returned. + * @return A short string representation of the error. + */ + public String getStatus() { + return status; + } + + /** + * Get the error message the tablet server sent. + * @return The error message. + */ + public String getMessage() { + return message; + } + + /** + * Get the Operation that failed. + * @return The same Operation instance that failed. + */ + public Operation getOperation() { + return operation; + } + + /** + * Get the identifier of the tablet server that sent the error. + * @return A string containing a UUID. + */ + public String getTsUUID() { + return tsUUID; + } + + @Override + public String toString() { + return "Row error for primary key=" + Bytes.pretty(operation.getRow().encodePrimaryKey()) + + ", tablet=" + operation.getTablet().getTabletIdAsString() + + ", server=" + tsUUID + + ", status=" + status + + ", message=" + message; + } + + /** + * Converts a PerRowErrorPB into a RowError. 
+ * @param errorPB a row error in its pb format + * @param operation the original operation + * @param tsUUID a string containing the originating TS's UUID + * @return a row error + */ + static RowError fromRowErrorPb(Tserver.WriteResponsePB.PerRowErrorPB errorPB, + Operation operation, String tsUUID) { + WireProtocol.AppStatusPB statusPB = errorPB.getError(); + return new RowError(statusPB.getCode().toString(), + statusPB.getMessage(), operation, tsUUID); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/RowErrorsAndOverflowStatus.java b/java/kudu-client/src/main/java/org/kududb/client/RowErrorsAndOverflowStatus.java new file mode 100644 index 000000000000..17a477842af6 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/RowErrorsAndOverflowStatus.java @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Container class used as a response when retrieving pending row errors. 
+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class RowErrorsAndOverflowStatus { + private final RowError[] rowErrors; + private final boolean overflowed; + + RowErrorsAndOverflowStatus(RowError[] rowErrors, boolean overflowed) { + this.rowErrors = rowErrors; + this.overflowed = overflowed; + } + + /** + * Get the collected row errors. + * @return an array of row errors, may be empty + */ + public RowError[] getRowErrors() { + return rowErrors; + } + + /** + * Check if the error collector had an overflow and had to discard row errors. + * @return true if row errors were discarded, false otherwise + */ + public boolean isOverflowed() { + return overflowed; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/RowResult.java b/java/kudu-client/src/main/java/org/kududb/client/RowResult.java new file mode 100644 index 000000000000..2caa1f99f3a4 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/RowResult.java @@ -0,0 +1,549 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.util.Slice; + +import java.nio.ByteBuffer; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.BitSet; +import java.util.Date; +import java.util.TimeZone; + +/** + * RowResult represents one row from a scanner. Do not reuse or store the objects. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class RowResult { + + private static final int INDEX_RESET_LOCATION = -1; + private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + { + DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("GMT")); + } + private static final long MS_IN_S = 1000L; + private static final long US_IN_S = 1000L * 1000L; + private int index = INDEX_RESET_LOCATION; + private int offset; + private BitSet nullsBitSet; + private final int rowSize; + private final int[] columnOffsets; + private final Schema schema; + private final Slice rowData; + private final Slice indirectData; + + /** + * Prepares the row representation using the provided data. Doesn't copy data + * out of the byte arrays. Package private. 
+ * @param schema Schema used to build the rowData + * @param rowData The Slice of data returned by the tablet server + * @param indirectData The full indirect data that contains the strings + */ + RowResult(Schema schema, Slice rowData, Slice indirectData) { + this.schema = schema; + this.rowData = rowData; + this.indirectData = indirectData; + int columnOffsetsSize = schema.getColumnCount(); + if (schema.hasNullableColumns()) { + columnOffsetsSize++; + } + this.rowSize = this.schema.getRowSize(); + columnOffsets = new int[columnOffsetsSize]; + // Empty projection, usually used for quick row counting + if (columnOffsetsSize == 0) { + return; + } + int currentOffset = 0; + columnOffsets[0] = currentOffset; + // Pre-compute the columns offsets in rowData for easier lookups later + // If the schema has nullables, we also add the offset for the null bitmap at the end + for (int i = 1; i < columnOffsetsSize; i++) { + int previousSize = schema.getColumnByIndex(i - 1).getType().getSize(); + columnOffsets[i] = previousSize + currentOffset; + currentOffset += previousSize; + } + } + + /** + * Package-protected, only meant to be used by the RowResultIterator + */ + void advancePointer() { + advancePointerTo(this.index + 1); + } + + void resetPointer() { + advancePointerTo(INDEX_RESET_LOCATION); + } + + void advancePointerTo(int rowIndex) { + this.index = rowIndex; + this.offset = this.rowSize * this.index; + if (schema.hasNullableColumns() && this.index != INDEX_RESET_LOCATION) { + this.nullsBitSet = Bytes.toBitSet( + this.rowData.getRawArray(), + this.rowData.getRawOffset() + + getCurrentRowDataOffsetForColumn(schema.getColumnCount()), + schema.getColumnCount()); + } + } + + int getCurrentRowDataOffsetForColumn(int columnIndex) { + return this.offset + this.columnOffsets[columnIndex]; + } + + /** + * Get the specified column's integer + * @param columnName name of the column to get data for + * @return An integer + * @throws IllegalArgumentException if the column is null + 
*/ + public int getInt(String columnName) { + return getInt(this.schema.getColumnIndex(columnName)); + } + + /** + * Get the specified column's integer + * @param columnIndex Column index in the schema + * @return An integer + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public int getInt(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + return Bytes.getInt(this.rowData.getRawArray(), + this.rowData.getRawOffset() + getCurrentRowDataOffsetForColumn(columnIndex)); + } + + /** + * Get the specified column's short + * @param columnName name of the column to get data for + * @return A short + * @throws IllegalArgumentException if the column is null + */ + public short getShort(String columnName) { + return getShort(this.schema.getColumnIndex(columnName)); + } + + /** + * Get the specified column's short + * @param columnIndex Column index in the schema + * @return A short + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public short getShort(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + return Bytes.getShort(this.rowData.getRawArray(), + this.rowData.getRawOffset() + getCurrentRowDataOffsetForColumn(columnIndex)); + } + + /** + * Get the specified column's boolean + * @param columnName name of the column to get data for + * @return A boolean + * @throws IllegalArgumentException if the column is null + */ + public boolean getBoolean(String columnName) { + return getBoolean(this.schema.getColumnIndex(columnName)); + } + + /** + * Get the specified column's boolean + * @param columnIndex Column index in the schema + * @return A boolean + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public boolean getBoolean(int columnIndex) { + checkValidColumn(columnIndex); + 
checkNull(columnIndex); + byte b = Bytes.getByte(this.rowData.getRawArray(), + this.rowData.getRawOffset() + + getCurrentRowDataOffsetForColumn(columnIndex)); + return b == 1; + } + + /** + * Get the specified column's byte + * @param columnName name of the column to get data for + * @return A byte + * @throws IllegalArgumentException if the column is null + */ + public byte getByte(String columnName) { + return getByte(this.schema.getColumnIndex(columnName)); + + } + + /** + * Get the specified column's byte + * @param columnIndex Column index in the schema + * @return A byte + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public byte getByte(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + return Bytes.getByte(this.rowData.getRawArray(), + this.rowData.getRawOffset() + getCurrentRowDataOffsetForColumn(columnIndex)); + } + + /** + * Get the specified column's long + * + * If this is a TIMESTAMP column, the long value corresponds to a number of microseconds + * since midnight, January 1, 1970 UTC. + * + * @param columnName name of the column to get data for + * @return A positive long + * @throws IllegalArgumentException if the column is null + */ + public long getLong(String columnName) { + return getLong(this.schema.getColumnIndex(columnName)); + } + + /** + * Get the specified column's long + * + * If this is a TIMESTAMP column, the long value corresponds to a number of microseconds + * since midnight, January 1, 1970 UTC.
+ * + * @param columnIndex Column index in the schema + * @return A positive long + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public long getLong(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + return Bytes.getLong(this.rowData.getRawArray(), + this.rowData.getRawOffset() + + getCurrentRowDataOffsetForColumn(columnIndex)); + } + + /** + * Get the specified column's float + * @param columnName name of the column to get data for + * @return A float + */ + public float getFloat(String columnName) { + return getFloat(this.schema.getColumnIndex(columnName)); + + } + + /** + * Get the specified column's float + * @param columnIndex Column index in the schema + * @return A float + */ + public float getFloat(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + return Bytes.getFloat(this.rowData.getRawArray(), + this.rowData.getRawOffset() + + getCurrentRowDataOffsetForColumn(columnIndex)); + } + + /** + * Get the specified column's double + * @param columnName name of the column to get data for + * @return A double + */ + public double getDouble(String columnName) { + return getDouble(this.schema.getColumnIndex(columnName)); + + } + + /** + * Get the specified column's double + * @param columnIndex Column index in the schema + * @return A double + */ + public double getDouble(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + return Bytes.getDouble(this.rowData.getRawArray(), + this.rowData.getRawOffset() + + getCurrentRowDataOffsetForColumn(columnIndex)); + } + + /** + * Get the schema used for this scanner's column projection. + * @return A column projection as a schema. + */ + public Schema getColumnProjection() { + return this.schema; + } + + /** + * Get the specified column's string. 
+ * @param columnName name of the column to get data for + * @return A string + * @throws IllegalArgumentException if the column is null + */ + public String getString(String columnName) { + return getString(this.schema.getColumnIndex(columnName)); + + } + + /** + * Get the specified column's string. + * @param columnIndex Column index in the schema + * @return A string + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public String getString(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + // C++ puts a Slice in rowData which is 16 bytes long for simplicity, but we only support ints + long offset = getLong(columnIndex); + long length = rowData.getLong(getCurrentRowDataOffsetForColumn(columnIndex) + 8); + assert offset < Integer.MAX_VALUE; + assert length < Integer.MAX_VALUE; + return Bytes.getString(indirectData.getRawArray(), + indirectData.getRawOffset() + (int)offset, + (int)length); + } + + /** + * Get a copy of the specified column's binary data. + * @param columnName name of the column to get data for + * @return a byte[] with the binary data. + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public byte[] getBinaryCopy(String columnName) { + return getBinaryCopy(this.schema.getColumnIndex(columnName)); + + } + + /** + * Get a copy of the specified column's binary data. + * @param columnIndex Column index in the schema + * @return a byte[] with the binary data.
+ * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public byte[] getBinaryCopy(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + // C++ puts a Slice in rowData which is 16 bytes long for simplicity, + // but we only support ints + long offset = getLong(columnIndex); + long length = rowData.getLong(getCurrentRowDataOffsetForColumn(columnIndex) + 8); + assert offset < Integer.MAX_VALUE; + assert length < Integer.MAX_VALUE; + byte[] ret = new byte[(int)length]; + System.arraycopy(indirectData.getRawArray(), indirectData.getRawOffset() + (int) offset, + ret, 0, (int) length); + return ret; + } + + /** + * Get the specified column's binary data. + * + * This doesn't copy the data and instead returns a ByteBuffer that wraps it. + * + * @param columnName name of the column to get data for + * @return a byte[] with the binary data. + * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public ByteBuffer getBinary(String columnName) { + return getBinary(this.schema.getColumnIndex(columnName)); + } + + /** + * Get the specified column's binary data. + * + * This doesn't copy the data and instead returns a ByteBuffer that wraps it. + * + * @param columnIndex Column index in the schema + * @return a byte[] with the binary data. 
+ * @throws IllegalArgumentException if the column is null + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public ByteBuffer getBinary(int columnIndex) { + checkValidColumn(columnIndex); + checkNull(columnIndex); + // C++ puts a Slice in rowData which is 16 bytes long for simplicity, + // but we only support ints + long offset = getLong(columnIndex); + long length = rowData.getLong(getCurrentRowDataOffsetForColumn(columnIndex) + 8); + assert offset < Integer.MAX_VALUE; + assert length < Integer.MAX_VALUE; + return ByteBuffer.wrap(indirectData.getRawArray(), indirectData.getRawOffset() + (int) offset, + (int) length); + } + + /** + * Get if the specified column is NULL + * @param columnName name of the column to get data for + * @return true if the column cell is null and the column is nullable, + * false otherwise + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public boolean isNull(String columnName) { + return isNull(this.schema.getColumnIndex(columnName)); + } + + /** + * Get if the specified column is NULL + * @param columnIndex Column index in the schema + * @return true if the column cell is null and the column is nullable, + * false otherwise + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public boolean isNull(int columnIndex) { + checkValidColumn(columnIndex); + if (nullsBitSet == null) { + return false; + } + return schema.getColumnByIndex(columnIndex).isNullable() + && nullsBitSet.get(columnIndex); + } + + /** + * Get the type of a column in this result. + * @param columnName name of the column + * @return a type + */ + public Type getColumnType(String columnName) { + return this.schema.getColumn(columnName).getType(); + } + + /** + * Get the type of a column in this result. 
+ * @param columnIndex column index in the schema + * @return a type + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + public Type getColumnType(int columnIndex) { + return this.schema.getColumnByIndex(columnIndex).getType(); + } + + /** + * Get the schema associated with this result. + * @return a schema + */ + public Schema getSchema() { + return schema; + } + + /** + * @throws IndexOutOfBoundsException if the column doesn't exist + */ + private void checkValidColumn(int columnIndex) { + if (columnIndex >= schema.getColumnCount()) { + throw new IndexOutOfBoundsException("Requested column is out of range, " + + columnIndex + " out of " + schema.getColumnCount()); + } + } + + /** + * @throws IllegalArgumentException if the column is null + */ + private void checkNull(int columnIndex) { + if (!schema.hasNullableColumns()) { + return; + } + if (isNull(columnIndex)) { + throw new IllegalArgumentException("The requested column (" + columnIndex + ") is null"); + } + } + + @Override + public String toString() { + return "RowResult index: " + this.index + ", size: " + this.rowSize + ", " + + "schema: " + this.schema; + } + + /** + * Transforms a timestamp into a string, whose formatting and timezone is consistent + * across kudu. + * @param timestamp the timestamp, in microseconds + * @return a string, in the format: YYYY-MM-DD HH:MM:SS.ssssss GMT + */ + static String timestampToString(long timestamp) { + long tsMillis = timestamp / MS_IN_S; + long tsMicros = timestamp % US_IN_S; + StringBuffer formattedTs = new StringBuffer(); + formattedTs.append(DATE_FORMAT.format(new Date(tsMillis))); + formattedTs.append(String.format(".%06d GMT", tsMicros)); + return formattedTs.toString(); + } + + /** + * Return the actual data from this row in a stringified key=value + * form. 
+ */ + public String rowToString() { + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < schema.getColumnCount(); i++) { + ColumnSchema col = schema.getColumnByIndex(i); + if (i != 0) { + buf.append(", "); + } + buf.append(col.getType().name()); + buf.append(" ").append(col.getName()).append("="); + if (isNull(i)) { + buf.append("NULL"); + } else { + switch (col.getType()) { + case INT8: buf.append(getByte(i)); break; + case INT16: buf.append(getShort(i)); + break; + case INT32: buf.append(getInt(i)); break; + case INT64: buf.append(getLong(i)); break; + case TIMESTAMP: { + buf.append(timestampToString(getLong(i))); + } break; + case STRING: buf.append(getString(i)); break; + case BINARY: buf.append(Bytes.pretty(getBinaryCopy(i))); break; + case FLOAT: buf.append(getFloat(i)); break; + case DOUBLE: buf.append(getDouble(i)); break; + default: buf.append(""); break; + } + } + } + return buf.toString(); + } + + /** + * @return a string describing the location of this row result within + * the iterator as well as its data. + */ + public String toStringLongFormat() { + StringBuffer buf = new StringBuffer(this.rowSize); // super rough estimation + buf.append(this.toString()); + buf.append("{"); + buf.append(rowToString()); + buf.append("}"); + return buf.toString(); + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/RowResultIterator.java b/java/kudu-client/src/main/java/org/kududb/client/RowResultIterator.java new file mode 100644 index 000000000000..3ed082287282 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/RowResultIterator.java @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import java.util.Iterator; +import org.kududb.Schema; +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.util.Slice; + +/** + * Class that contains the rows sent by a tablet server, exhausting this iterator only means + * that all the rows from the last server response were read. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class RowResultIterator extends KuduRpcResponse implements Iterator, + Iterable { + + private final Schema schema; + private final Slice bs; + private final Slice indirectBs; + private final int numRows; + private final RowResult rowResult; + private int currentRow = 0; + + /** + * Package private constructor, only meant to be instantiated from AsyncKuduScanner. + * @param ellapsedMillis Time in milliseconds since RPC creation to now. + * @param schema Schema used to parse the rows + * @param data PB containing the data + * @param callResponse the call response received from the server for this + * RPC. 
+ */ + RowResultIterator(long ellapsedMillis, String tsUUID, Schema schema, + WireProtocol.RowwiseRowBlockPB data, + final CallResponse callResponse) { + super(ellapsedMillis, tsUUID); + this.schema = schema; + if (data == null || data.getNumRows() == 0) { + this.bs = this.indirectBs = null; + this.rowResult = null; + this.numRows = 0; + return; + } + this.bs = callResponse.getSidecar(data.getRowsSidecar()); + this.indirectBs = callResponse.getSidecar(data.getIndirectDataSidecar()); + this.numRows = data.getNumRows(); + + // Integrity check + int rowSize = schema.getRowSize(); + int expectedSize = numRows * rowSize; + if (expectedSize != bs.length()) { + throw new NonRecoverableException("RowResult block has " + bs.length() + " bytes of data " + + "but expected " + expectedSize + " for " + numRows + " rows"); + } + this.rowResult = new RowResult(this.schema, this.bs, this.indirectBs); + } + + @Override + public boolean hasNext() { + return this.currentRow < numRows; + } + + @Override + public RowResult next() { + // The rowResult keeps track of where it is internally + this.rowResult.advancePointer(); + this.currentRow++; + return rowResult; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Get the number of rows in this iterator. If all you want is to count + * rows, call this and skip the rest. 
+ * @return number of rows in this iterator + */ + public int getNumRows() { + return this.numRows; + } + + @Override + public String toString() { + return "RowResultIterator for " + this.numRows + " rows"; + } + + @Override + public Iterator iterator() { + return this; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/SecureRpcHelper.java b/java/kudu-client/src/main/java/org/kududb/client/SecureRpcHelper.java new file mode 100644 index 000000000000..421f618b3e01 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/SecureRpcHelper.java @@ -0,0 +1,256 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +package org.kududb.client; + +import com.google.protobuf.ByteString; +import com.google.protobuf.ZeroCopyLiteralByteString; +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.buffer.ChannelBuffers; +import org.jboss.netty.channel.Channel; +import org.jboss.netty.channel.Channels; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.rpc.RpcHeader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.security.auth.callback.Callback; +import javax.security.auth.callback.CallbackHandler; +import javax.security.auth.callback.NameCallback; +import javax.security.auth.callback.PasswordCallback; +import javax.security.auth.callback.UnsupportedCallbackException; +import javax.security.sasl.RealmCallback; +import javax.security.sasl.RealmChoiceCallback; +import javax.security.sasl.Sasl; +import javax.security.sasl.SaslClient; +import javax.security.sasl.SaslException; +import java.util.Map; +import java.util.TreeMap; + +@InterfaceAudience.Private +public class SecureRpcHelper { + + public static final Logger LOG = LoggerFactory.getLogger(TabletClient.class); + + private final TabletClient client; + private SaslClient saslClient; + public static final String SASL_DEFAULT_REALM = "default"; + public static final Map SASL_PROPS = + new TreeMap(); + private static final int SASL_CALL_ID = -33; + private volatile boolean negoUnderway = true; + private boolean useWrap = false; // no QOP 
at the moment + + public static final String USER_AND_PASSWORD = "java_client"; + + public SecureRpcHelper(TabletClient client) { + this.client = client; + try { + saslClient = Sasl.createSaslClient(new String[]{"PLAIN" + }, null, null, SASL_DEFAULT_REALM, + SASL_PROPS, new SaslClientCallbackHandler(USER_AND_PASSWORD, USER_AND_PASSWORD)); + } catch (SaslException e) { + throw new RuntimeException("Could not create the SASL client", e); + } + } + + public void sendHello(Channel channel) { + sendNegotiateMessage(channel); + } + + private void sendNegotiateMessage(Channel channel) { + RpcHeader.SaslMessagePB.Builder builder = RpcHeader.SaslMessagePB.newBuilder(); + builder.setState(RpcHeader.SaslMessagePB.SaslState.NEGOTIATE); + sendSaslMessage(channel, builder.build()); + } + + private void sendSaslMessage(Channel channel, RpcHeader.SaslMessagePB msg) { + RpcHeader.RequestHeader.Builder builder = RpcHeader.RequestHeader.newBuilder(); + builder.setCallId(SASL_CALL_ID); + RpcHeader.RequestHeader header = builder.build(); + + ChannelBuffer buffer = KuduRpc.toChannelBuffer(header, msg); + Channels.write(channel, buffer); + } + + public ChannelBuffer handleResponse(ChannelBuffer buf, Channel chan) throws SaslException { + if (!saslClient.isComplete() || negoUnderway) { + RpcHeader.SaslMessagePB response = parseSaslMsgResponse(buf); + switch (response.getState()) { + case NEGOTIATE: + handleNegotiateResponse(chan, response); + break; + case CHALLENGE: + handleChallengeResponse(chan, response); + break; + case SUCCESS: + handleSuccessResponse(chan, response); + break; + default: + System.out.println("Wrong sasl state"); + } + return null; + } + return unwrap(buf); + } + + /** + * When QOP of auth-int or auth-conf is selected + * This is used to unwrap the contents from the passed + * buffer payload. 
+ */ + public ChannelBuffer unwrap(ChannelBuffer payload) { + if(!useWrap) { + return payload; + } + int len = payload.readInt(); + try { + payload = + ChannelBuffers.wrappedBuffer(saslClient.unwrap(payload.readBytes(len).array(), 0, len)); + return payload; + } catch (SaslException e) { + throw new IllegalStateException("Failed to unwrap payload", e); + } + } + + /** + * When QOP of auth-int or auth-conf is selected + * This is used to wrap the contents + * into the proper payload (ie encryption, signature, etc) + */ + public ChannelBuffer wrap(ChannelBuffer content) { + if(!useWrap) { + return content; + } + try { + byte[] payload = new byte[content.writerIndex()]; + content.readBytes(payload); + byte[] wrapped = saslClient.wrap(payload, 0, payload.length); + ChannelBuffer ret = ChannelBuffers.wrappedBuffer(new byte[4 + wrapped.length]); + ret.clear(); + ret.writeInt(wrapped.length); + ret.writeBytes(wrapped); + if (LOG.isDebugEnabled()) { + LOG.debug("Wrapped payload: "+Bytes.pretty(ret)); + } + return ret; + } catch (SaslException e) { + throw new IllegalStateException("Failed to wrap payload", e); + } + } + + private RpcHeader.SaslMessagePB parseSaslMsgResponse(ChannelBuffer buf) { + CallResponse response = new CallResponse(buf); + RpcHeader.ResponseHeader responseHeader = response.getHeader(); + int id = responseHeader.getCallId(); + if (id != SASL_CALL_ID) { + throw new IllegalStateException("Received a call that wasn't for SASL"); + } + + RpcHeader.SaslMessagePB.Builder saslBuilder = RpcHeader.SaslMessagePB.newBuilder(); + KuduRpc.readProtobuf(response.getPBMessage(), saslBuilder); + return saslBuilder.build(); + } + + + private void handleNegotiateResponse(Channel chan, RpcHeader.SaslMessagePB response) throws + SaslException { + RpcHeader.SaslMessagePB.SaslAuth negotiatedAuth = null; + for (RpcHeader.SaslMessagePB.SaslAuth auth : response.getAuthsList()) { + negotiatedAuth = auth; + } + byte[] saslToken = new byte[0]; + if 
(saslClient.hasInitialResponse()) + saslToken = saslClient.evaluateChallenge(saslToken); + + RpcHeader.SaslMessagePB.Builder builder = RpcHeader.SaslMessagePB.newBuilder(); + if (saslToken != null) { + builder.setToken(ZeroCopyLiteralByteString.wrap(saslToken)); + } + builder.setState(RpcHeader.SaslMessagePB.SaslState.INITIATE); + builder.addAuths(negotiatedAuth); + sendSaslMessage(chan, builder.build()); + + } + + private void handleChallengeResponse(Channel chan, RpcHeader.SaslMessagePB response) throws + SaslException { + ByteString bs = response.getToken(); + byte[] saslToken = saslClient.evaluateChallenge(bs.toByteArray()); + if (saslToken == null) { + throw new IllegalStateException("Not expecting an empty token"); + } + RpcHeader.SaslMessagePB.Builder builder = RpcHeader.SaslMessagePB.newBuilder(); + builder.setToken(ZeroCopyLiteralByteString.wrap(saslToken)); + builder.setState(RpcHeader.SaslMessagePB.SaslState.RESPONSE); + sendSaslMessage(chan, builder.build()); + } + + private void handleSuccessResponse(Channel chan, RpcHeader.SaslMessagePB response) { + LOG.debug("nego finished"); + negoUnderway = false; + client.sendContext(chan); + } + + private static class SaslClientCallbackHandler implements CallbackHandler { + private final String userName; + private final char[] userPassword; + + public SaslClientCallbackHandler(String user, String password) { + this.userName = user; + this.userPassword = password.toCharArray(); + } + + public void handle(Callback[] callbacks) + throws UnsupportedCallbackException { + NameCallback nc = null; + PasswordCallback pc = null; + RealmCallback rc = null; + for (Callback callback : callbacks) { + if (callback instanceof RealmChoiceCallback) { + continue; + } else if (callback instanceof NameCallback) { + nc = (NameCallback) callback; + } else if (callback instanceof PasswordCallback) { + pc = (PasswordCallback) callback; + } else if (callback instanceof RealmCallback) { + rc = (RealmCallback) callback; + } else { + throw 
new UnsupportedCallbackException(callback, + "Unrecognized SASL client callback"); + } + } + if (nc != null) { + nc.setName(userName); + } + if (pc != null) { + pc.setPassword(userPassword); + } + if (rc != null) { + rc.setText(rc.getDefaultText()); + } + } + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/SessionConfiguration.java b/java/kudu-client/src/main/java/org/kududb/client/SessionConfiguration.java new file mode 100644 index 000000000000..94e0a66b33e1 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/SessionConfiguration.java @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Interface that defines the methods used to configure a session. It also exposes ways to + * query its state. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public interface SessionConfiguration { + + @InterfaceAudience.Public + @InterfaceStability.Evolving + enum FlushMode { + // Every write will be sent to the server in-band with the Apply() + // call. No batching will occur. This is the default flush mode. 
In this + // mode, the Flush() call never has any effect, since each Apply() call + // has already flushed the buffer. + AUTO_FLUSH_SYNC, + + // Apply() calls will return immediately, but the writes will be sent in + // the background, potentially batched together with other writes from + // the same session. If there is not sufficient buffer space, then Apply() + // may block for buffer space to be available. + // + // Because writes are applied in the background, any errors will be stored + // in a session-local buffer. Call CountPendingErrors() or GetPendingErrors() + // to retrieve them. + // + // The Flush() call can be used to block until the buffer is empty. + AUTO_FLUSH_BACKGROUND, + + // Apply() calls will return immediately, and the writes will not be + // sent until the user calls Flush(). If the buffer runs past the + // configured space limit, then Apply() will return an error. + MANUAL_FLUSH + } + + /** + * Get the current flush mode. + * @return flush mode, AUTO_FLUSH_SYNC by default + */ + FlushMode getFlushMode(); + + /** + * Set the new flush mode for this session. + * @param flushMode new flush mode, can be the same as the previous one. + * @throws IllegalArgumentException if the buffer isn't empty. + */ + void setFlushMode(FlushMode flushMode); + + /** + * Set the number of operations that can be buffered. + * @param size number of ops. + * @throws IllegalArgumentException if the buffer isn't empty. + */ + void setMutationBufferSpace(int size); + + /** + * Set the low watermark for this session. The default is set to half the mutation buffer space. + * For example, a buffer space of 1000 with a low watermark set to 50% (0.5) will start randomly + * sending PleaseRetryExceptions once there's an outstanding flush and the buffer is over 500. + * As the buffer gets fuller, it becomes likelier to hit the exception. + * @param mutationBufferLowWatermarkPercentage a new low watermark as a percentage, + * has to be between 0 and 1 (inclusive). 
A value of 1 disables + * the low watermark since it's the same as the high one + * @throws IllegalArgumentException if the buffer isn't empty or if the watermark isn't between + * 0 and 1 + */ + void setMutationBufferLowWatermark(float mutationBufferLowWatermarkPercentage); + + /** + * Set the flush interval, which will be used for the next scheduling decision. + * @param interval interval in milliseconds. + */ + void setFlushInterval(int interval); + + /** + * Get the current timeout. + * @return operation timeout in milliseconds, 0 if none was configured. + */ + long getTimeoutMillis(); + + /** + * Sets the timeout for the next applied operations. + * The default timeout is 0, which disables the timeout functionality. + * @param timeout Timeout in milliseconds. + */ + void setTimeoutMillis(long timeout); + + /** + * Returns true if this session has already been closed. + */ + boolean isClosed(); + + /** + * Check if there are operations that haven't been completely applied. + * @return true if operations are pending, else false. + */ + boolean hasPendingOperations(); + + /** + * Set the new external consistency mode for this session. + * @param consistencyMode new external consistency mode, can the same as the previous one. + * @throws IllegalArgumentException if the buffer isn't empty. + */ + void setExternalConsistencyMode(ExternalConsistencyMode consistencyMode); + + /** + * Tells if the session is currently ignoring row errors when the whole list returned by a tablet + * server is of the AlreadyPresent type. + * @return true if the session is enforcing this, else false + */ + boolean isIgnoreAllDuplicateRows(); + + /** + * Configures the option to ignore all the row errors if they are all of the AlreadyPresent type. + * This can be needed when facing KUDU-568. The effect of enabling this is that operation + * responses that match this pattern will be cleared of their row errors, meaning that we consider + * them successful. + * This is disabled by default. 
+ * @param ignoreAllDuplicateRows true if this session should enforce this, else false + */ + void setIgnoreAllDuplicateRows(boolean ignoreAllDuplicateRows); + + /** + * Return the number of errors which are pending. Errors may accumulate when + * using the AUTO_FLUSH_BACKGROUND mode. + * @return a count of errors + */ + int countPendingErrors(); + + /** + * Return any errors from previous calls. If there were more errors + * than could be held in the session's error storage, the overflow state is set to true. + * Resets the pending errors. + * @return an object that contains the errors and the overflow status + */ + RowErrorsAndOverflowStatus getPendingErrors(); +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/TabletClient.java b/java/kudu-client/src/main/java/org/kududb/client/TabletClient.java new file mode 100644 index 000000000000..4556b52a1a81 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/TabletClient.java @@ -0,0 +1,771 @@ +/* + * Copyright (C) 2010-2012 The Async HBase Authors. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * - Neither the name of the StumbleUpon nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package org.kududb.client; + +import com.stumbleupon.async.Deferred; + +import org.jboss.netty.handler.timeout.ReadTimeoutException; +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.master.Master; +import org.kududb.rpc.RpcHeader; +import org.kududb.tserver.Tserver; +import org.kududb.util.Pair; + +import org.jboss.netty.buffer.ChannelBuffer; +import org.jboss.netty.buffer.ChannelBuffers; +import org.jboss.netty.channel.Channel; +import org.jboss.netty.channel.ChannelEvent; +import org.jboss.netty.channel.ChannelFuture; +import org.jboss.netty.channel.ChannelFutureListener; +import org.jboss.netty.channel.ChannelHandlerContext; +import org.jboss.netty.channel.ChannelStateEvent; +import org.jboss.netty.channel.Channels; +import org.jboss.netty.channel.ExceptionEvent; +import org.jboss.netty.handler.codec.replay.ReplayingDecoder; +import org.jboss.netty.handler.codec.replay.VoidEnum; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.security.sasl.SaslException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.concurrent.ConcurrentHashMap; 
+import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * Stateful handler that manages a connection to a specific TabletServer. + *

+ * This handler manages the RPC IDs, the serialization and de-serialization of + * RPC requests and responses, and keeps track of the RPC in flights for which + * a response is currently awaited, as well as temporarily buffered RPCs that + * are awaiting to be sent to the network. + *

+ * This class needs careful synchronization. It's a non-sharable handler, + * meaning there is one instance of it per Netty {@link Channel} and each + * instance is only used by one Netty IO thread at a time. At the same time, + * {@link AsyncKuduClient} calls methods of this class from random threads at + * random times. The bottom line is that any data only used in the Netty IO + * threads doesn't require synchronization, everything else does. + *

+ * Acquiring the monitor on an object of this class will prevent it from + * accepting write requests as well as buffering requests if the underlying + * channel isn't connected. + */ +@InterfaceAudience.Private +public class TabletClient extends ReplayingDecoder { + + public static final Logger LOG = LoggerFactory.getLogger(TabletClient.class); + + private ArrayList> pending_rpcs; + + public static final byte RPC_CURRENT_VERSION = 9; + /** Initial part of the header for 0.95 and up. */ + private static final byte[] RPC_HEADER = new byte[] { 'h', 'r', 'p', 'c', + RPC_CURRENT_VERSION, // RPC version. + 0, + 0 + }; + public static final int CONNECTION_CTX_CALL_ID = -3; + + /** + * A monotonically increasing counter for RPC IDs. + * RPCs can be sent out from any thread, so we need an atomic integer. + * RPC IDs can be arbitrary. So it's fine if this integer wraps around and + * becomes negative. They don't even have to start at 0, but we do it for + * simplicity and ease of debugging. + */ + private final AtomicInteger rpcid = new AtomicInteger(-1); + + /** + * The channel we're connected to. + * This will be {@code null} while we're not connected to the TabletServer. + * This attribute is volatile because {@link #shutdown} may access it from a + * different thread, and because while we connect various user threads will + * test whether it's {@code null}. Once we're connected and we know what + * protocol version the server speaks, we'll set this reference. + */ + private volatile Channel chan; + + /** + * Set to {@code true} once we've disconnected from the server. + * This way, if any thread is still trying to use this client after it's + * been removed from the caches in the {@link AsyncKuduClient}, we will + * immediately fail / reschedule its requests. + *

+ * Manipulating this value requires synchronizing on `this'. + */ + private boolean dead = false; + + /** + * Maps an RPC ID to the in-flight RPC that was given this ID. + * RPCs can be sent out from any thread, so we need a concurrent map. + */ + private final ConcurrentHashMap> rpcs_inflight = + new ConcurrentHashMap>(); + + private final AsyncKuduClient kuduClient; + + private final String uuid; + + private final long socketReadTimeoutMs; + + private SecureRpcHelper secureRpcHelper; + + public TabletClient(AsyncKuduClient client, String uuid) { + this.kuduClient = client; + this.uuid = uuid; + this.socketReadTimeoutMs = client.getDefaultSocketReadTimeoutMs(); + } + + void sendRpc(KuduRpc rpc) { + if (!rpc.deadlineTracker.hasDeadline()) { + LOG.warn(getPeerUuidLoggingString() + " sending an rpc without a timeout " + rpc); + } + if (chan != null) { + final ChannelBuffer serialized = encode(rpc); + if (serialized == null) { // Error during encoding. + return; // Stop here. RPC has been failed already. + } + + final Channel chan = this.chan; // Volatile read. + if (chan != null) { // Double check if we disconnected during encode(). + Channels.write(chan, serialized); + return; + } + } + boolean tryagain = false; + boolean copyOfDead; + synchronized (this) { + copyOfDead = this.dead; + // Check if we got connected while entering this synchronized block. + if (chan != null) { + tryagain = true; + } else if (!copyOfDead) { + if (pending_rpcs == null) { + pending_rpcs = new ArrayList>(); + } + pending_rpcs.add(rpc); + } + } + if (copyOfDead) { + failOrRetryRpc(rpc, new ConnectionResetException(null)); + return; + } else if (tryagain) { + // This recursion will not lead to a loop because we only get here if we + // connected while entering the synchronized block above. So when trying + // a second time, we will either succeed to send the RPC if we're still + // connected, or fail through to the code below if we got disconnected + // in the mean time. 
+ sendRpc(rpc); + return; + } + } + + private ChannelBuffer encode(final KuduRpc rpc) { + final int rpcid = this.rpcid.incrementAndGet(); + ChannelBuffer payload; + final String service = rpc.serviceName(); + final String method = rpc.method(); + try { + final RpcHeader.RequestHeader.Builder headerBuilder = RpcHeader.RequestHeader.newBuilder() + .setCallId(rpcid) + .setRemoteMethod( + RpcHeader.RemoteMethodPB.newBuilder().setServiceName(service).setMethodName(method)); + + // If any timeout is set, find the lowest non-zero one, since this will be the deadline that + // the server must respect. + if (rpc.deadlineTracker.hasDeadline() || socketReadTimeoutMs > 0) { + long millisBeforeDeadline = Long.MAX_VALUE; + if (rpc.deadlineTracker.hasDeadline()) { + millisBeforeDeadline = rpc.deadlineTracker.getMillisBeforeDeadline(); + } + + long localRpcTimeoutMs = Long.MAX_VALUE; + if (socketReadTimeoutMs > 0) { + localRpcTimeoutMs = socketReadTimeoutMs; + } + + headerBuilder.setTimeoutMillis((int) Math.min(millisBeforeDeadline, localRpcTimeoutMs)); + } + + payload = rpc.serialize(headerBuilder.build()); + } catch (Exception e) { + LOG.error("Uncaught exception while serializing RPC: " + rpc, e); + rpc.errback(e); // Make the RPC fail with the exception. + return null; + } + final KuduRpc oldrpc = rpcs_inflight.put(rpcid, rpc); + if (oldrpc != null) { + final String wtf = getPeerUuidLoggingString() + + "WTF? There was already an RPC in flight with" + + " rpcid=" + rpcid + ": " + oldrpc + + ". This happened when sending out: " + rpc; + LOG.error(wtf); + // Make it fail. This isn't an expected failure mode. 
+ oldrpc.errback(new NonRecoverableException(wtf)); + } + + if (LOG.isDebugEnabled()) { + LOG.debug(getPeerUuidLoggingString() + chan + " Sending RPC #" + rpcid + + ", payload=" + payload + ' ' + Bytes.pretty(payload)); + } + + payload = secureRpcHelper.wrap(payload); + + return payload; + } + + public Deferred shutdown() { + // First, check whether we have RPCs in flight and cancel them. + for (Iterator> ite = rpcs_inflight.values().iterator(); ite + .hasNext();) { + KuduRpc rpc = ite.next(); + rpc.errback(new ConnectionResetException(null)); + ite.remove(); + } + + // Same for the pending RPCs. + synchronized (this) { + if (pending_rpcs != null) { + for (Iterator> ite = pending_rpcs.iterator(); ite.hasNext();) { + ite.next().errback(new ConnectionResetException(null)); + ite.remove(); + } + } + } + + final Channel chancopy = chan; + if (chancopy == null) { + return Deferred.fromResult(null); + } + if (chancopy.isConnected()) { + Channels.disconnect(chancopy); // ... this is going to set it to null. + // At this point, all in-flight RPCs are going to be failed. + } + if (chancopy.isBound()) { + Channels.unbind(chancopy); + } + // It's OK to call close() on a Channel if it's already closed. + final ChannelFuture future = Channels.close(chancopy); + // Now wrap the ChannelFuture in a Deferred. + final Deferred d = new Deferred(); + // Opportunistically check if it's already completed successfully. + if (future.isSuccess()) { + d.callback(null); + } else { + // If we get here, either the future failed (yeah, that sounds weird) + // or the future hasn't completed yet (heh). + future.addListener(new ChannelFutureListener() { + public void operationComplete(final ChannelFuture future) { + if (future.isSuccess()) { + d.callback(null); + return; + } + final Throwable t = future.getCause(); + if (t instanceof Exception) { + d.callback(t); + } else { + // Wrap the Throwable because Deferred doesn't handle Throwables, + // it only uses Exception. 
+ d.callback(new NonRecoverableException("Failed to shutdown: " + + TabletClient.this, t)); + } + } + }); + } + return d; + } + + /** + * The reason we are suppressing the unchecked conversions is because the KuduRpc is coming + * from a collection that has RPCs with different generics, and there's no way to get "decoded" + * casted correctly. The best we can do is to rely on the RPC to decode correctly, + * and to not pass an Exception in the callback. + */ + @Override + @SuppressWarnings("unchecked") + protected Object decode(ChannelHandlerContext ctx, Channel chan, ChannelBuffer buf, + VoidEnum voidEnum) { + final long start = System.nanoTime(); + final int rdx = buf.readerIndex(); + LOG.debug("------------------>> ENTERING DECODE >>------------------"); + + try { + buf = secureRpcHelper.handleResponse(buf, chan); + } catch (SaslException e) { + String message = getPeerUuidLoggingString() + "Couldn't complete the SASL handshake"; + LOG.error(message); + throw new NonRecoverableException(message, e); + } + if (buf == null) { + return null; + } + + CallResponse response = new CallResponse(buf); + + RpcHeader.ResponseHeader header = response.getHeader(); + if (!header.hasCallId()) { + final int size = response.getTotalResponseSize(); + final String msg = getPeerUuidLoggingString() + "RPC response (size: " + size + ") doesn't" + + " have a call ID: " + header + ", buf=" + Bytes.pretty(buf); + LOG.error(msg); + throw new NonRecoverableException(msg); + } + final int rpcid = header.getCallId(); + + @SuppressWarnings("rawtypes") + final KuduRpc rpc = rpcs_inflight.get(rpcid); + + if (rpc == null) { + final String msg = getPeerUuidLoggingString() + "Invalid rpcid: " + rpcid + " found in " + + buf + '=' + Bytes.pretty(buf); + LOG.error(msg); + // The problem here is that we don't know which Deferred corresponds to + // this RPC, since we don't have a valid ID. 
So we're hopeless, we'll + // never be able to recover because responses are not framed, we don't + // know where the next response will start... We have to give up here + // and throw this outside of our Netty handler, so Netty will call our + // exception handler where we'll close this channel, which will cause + // all RPCs in flight to be failed. + throw new NonRecoverableException(msg); + } + + Pair decoded = null; + Exception exception = null; + KuduException retryableHeaderException = null; + if (header.hasIsError() && header.getIsError()) { + RpcHeader.ErrorStatusPB.Builder errorBuilder = RpcHeader.ErrorStatusPB.newBuilder(); + KuduRpc.readProtobuf(response.getPBMessage(), errorBuilder); + RpcHeader.ErrorStatusPB error = errorBuilder.build(); + if (error.getCode().equals(RpcHeader.ErrorStatusPB.RpcErrorCodePB.ERROR_SERVER_TOO_BUSY)) { + // We can't return right away, we still need to remove ourselves from 'rpcs_inflight', so we + // populate 'retryableHeaderException'. + retryableHeaderException = new TabletServerErrorException(uuid, error); + } else { + String message = getPeerUuidLoggingString() + + "Tablet server sent error " + error.getMessage(); + exception = new NonRecoverableException(message); + LOG.error(message); // can be useful + } + } else { + try { + decoded = rpc.deserialize(response, this.uuid); + } catch (Exception ex) { + exception = ex; + } + } + if (LOG.isDebugEnabled()) { + LOG.debug(getPeerUuidLoggingString() + "rpcid=" + rpcid + + ", response size=" + (buf.readerIndex() - rdx) + " bytes" + + ", " + actualReadableBytes() + " readable bytes left" + + ", rpc=" + rpc); + } + + { + final KuduRpc removed = rpcs_inflight.remove(rpcid); + if (removed == null) { + // The RPC we were decoding was cleaned up already, give up. + throw new NonRecoverableException("RPC not found"); + } + } + + // This check is specifically for the ERROR_SERVER_TOO_BUSY case above. 
+ if (retryableHeaderException != null) { + kuduClient.handleRetryableError(rpc, retryableHeaderException); + return null; + } + + // We can get this Message from within the RPC's expected type, + // so convert it into an exception and nullify decoded so that we use the errback route. + // Have to do it for both TS and Master errors. + if (decoded != null) { + if (decoded.getSecond() instanceof Tserver.TabletServerErrorPB) { + Tserver.TabletServerErrorPB error = (Tserver.TabletServerErrorPB) decoded.getSecond(); + exception = dispatchTSErrorOrReturnException(rpc, error); + if (exception == null) { + // It was taken care of. + return null; + } else { + // We're going to errback. + decoded = null; + } + + } else if (decoded.getSecond() instanceof Master.MasterErrorPB) { + Master.MasterErrorPB error = (Master.MasterErrorPB) decoded.getSecond(); + exception = dispatchMasterErrorOrReturnException(rpc, error); + if (exception == null) { + // Exception was taken care of. + return null; + } else { + decoded = null; + } + } + } + + try { + if (decoded != null) { + assert !(decoded.getFirst() instanceof Exception); + rpc.callback(decoded.getFirst()); + } else { + rpc.errback(exception); + } + } catch (Exception e) { + LOG.debug(getPeerUuidLoggingString() + "Unexpected exception while handling RPC #" + rpcid + + ", rpc=" + rpc + ", buf=" + Bytes.pretty(buf), e); + } + if (LOG.isDebugEnabled()) { + LOG.debug("------------------<< LEAVING DECODE <<------------------" + + " time elapsed: " + ((System.nanoTime() - start) / 1000) + "us"); + } + return null; // Stop processing here. The Deferred does everything else. + } + + /** + * Takes care of a few kinds of TS errors that we handle differently, like tablets or leaders + * moving. Builds and returns an exception if we don't know what to do with it. + * @param rpc The original RPC call that triggered the error. + * @param error The error the TS sent. + * @return An exception if we couldn't dispatch the error, or null. 
+ */ + private Exception dispatchTSErrorOrReturnException(KuduRpc rpc, + Tserver.TabletServerErrorPB error) { + WireProtocol.AppStatusPB.ErrorCode code = error.getStatus().getCode(); + TabletServerErrorException ex = new TabletServerErrorException(uuid, error.getStatus()); + if (error.getCode() == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND) { + kuduClient.handleTabletNotFound(rpc, ex, this); + // we're not calling rpc.callback() so we rely on the client to retry that RPC + } else if (code == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) { + kuduClient.handleRetryableError(rpc, ex); + // The following two error codes are an indication that the tablet isn't a leader. + } else if (code == WireProtocol.AppStatusPB.ErrorCode.ILLEGAL_STATE || + code == WireProtocol.AppStatusPB.ErrorCode.ABORTED) { + kuduClient.handleNotLeader(rpc, ex, this); + } else { + return ex; + } + return null; + } + + /** + * Provides different handling for various kinds of master errors: re-uses the + * mechanisms already in place for handling tablet server errors as much as possible. + * @param rpc The original RPC call that triggered the error. + * @param error The error the master sent. + * @return An exception if we couldn't dispatch the error, or null. + */ + private Exception dispatchMasterErrorOrReturnException(KuduRpc rpc, + Master.MasterErrorPB error) { + WireProtocol.AppStatusPB.ErrorCode code = error.getStatus().getCode(); + MasterErrorException ex = new MasterErrorException(uuid, error); + if (error.getCode() == Master.MasterErrorPB.Code.NOT_THE_LEADER) { + kuduClient.handleNotLeader(rpc, ex, this); + } else if (code == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE && + (!(rpc instanceof GetMasterRegistrationRequest))) { + // TODO: This is a crutch until we either don't have to retry RPCs going to the + // same server or use retry policies. 
+ kuduClient.handleRetryableError(rpc, ex); + } else { + return ex; + } + return null; + } + + /** + * Decodes the response of an RPC and triggers its {@link Deferred}. + *

+ * This method is used by FrameDecoder when the channel gets + * disconnected. The buffer for that channel is passed to this method in + * case there's anything left in it. + * @param ctx Unused. + * @param chan The channel on which the response came. + * @param buf The buffer containing the raw RPC response. + * @return {@code null}, always. + */ + @Override + protected Object decodeLast(final ChannelHandlerContext ctx, + final Channel chan, + final ChannelBuffer buf, + final VoidEnum unused) { + // When we disconnect, decodeLast is called instead of decode. + // We simply check whether there's any data left in the buffer, in which + // case we attempt to process it. But if there's no data left, then we + // don't even bother calling decode() as it'll complain that the buffer + // doesn't contain enough data, which unnecessarily pollutes the logs. + if (buf.readable()) { + try { + return decode(ctx, chan, buf, unused); + } finally { + if (buf.readable()) { + LOG.error(getPeerUuidLoggingString() + "After decoding the last message on " + chan + + ", there was still some undecoded bytes in the channel's" + + " buffer (which are going to be lost): " + + buf + '=' + Bytes.pretty(buf)); + } + } + } else { + return null; + } + } + + /** + * Tells whether or not this handler should be used. + *

+ * This method is not synchronized. You need to synchronize on this + * instance if you need a memory visibility guarantee. You may not need + * this guarantee if you're OK with the RPC finding out that the connection + * has been reset "the hard way" and you can retry the RPC. In this case, + * you can call this method as a hint. After getting the initial exception + * back, this thread is guaranteed to see this method return {@code false} + * without synchronization needed. + * @return {@code false} if this handler is known to have been disconnected + * from the server and sending an RPC (via {@link #sendRpc} or any other + * indirect means such as {@code GetTableLocations()}) will fail immediately + * by having the RPC's {@link Deferred} called back immediately with a + * {@link ConnectionResetException}. This typically means that you got a + * stale reference (or that the reference to this instance is just about to + * be invalidated) and that you shouldn't use this instance. + */ + public boolean isAlive() { + return !dead; + } + + /** + * Ensures that at least a {@code nbytes} are readable from the given buffer. + * If there aren't enough bytes in the buffer this will raise an exception + * and cause the {@link ReplayingDecoder} to undo whatever we did thus far + * so we can wait until we read more from the socket. + * @param buf Buffer to check. + * @param nbytes Number of bytes desired. 
+ */ + static void ensureReadable(final ChannelBuffer buf, final int nbytes) { + buf.markReaderIndex(); + buf.skipBytes(nbytes); // can puke with Throwable + buf.resetReaderIndex(); + } + + @Override + public void channelConnected(final ChannelHandlerContext ctx, + final ChannelStateEvent e) { + final Channel chan = e.getChannel(); + ChannelBuffer header = connectionHeaderPreamble(); + header.writerIndex(RPC_HEADER.length); + Channels.write(chan, header); + + secureRpcHelper = new SecureRpcHelper(this); + secureRpcHelper.sendHello(chan); + } + + @Override + public void handleUpstream(final ChannelHandlerContext ctx, + final ChannelEvent e) throws Exception { + if (LOG.isDebugEnabled()) { + LOG.debug(getPeerUuidLoggingString() + e.toString()); + } + super.handleUpstream(ctx, e); + } + + @Override + public void channelDisconnected(final ChannelHandlerContext ctx, + final ChannelStateEvent e) throws Exception { + chan = null; + super.channelDisconnected(ctx, e); // Let the ReplayingDecoder cleanup. + cleanup(e.getChannel()); + } + + @Override + public void channelClosed(final ChannelHandlerContext ctx, + final ChannelStateEvent e) { + chan = null; + // No need to call super.channelClosed() because we already called + // super.channelDisconnected(). If we get here without getting a + // DISCONNECTED event, then we were never connected in the first place so + // the ReplayingDecoder has nothing to cleanup. + cleanup(e.getChannel()); + } + + /** + * Cleans up any outstanding or lingering RPC (used when shutting down). + *

+ * All RPCs in flight will fail with a {@link ConnectionResetException} and + * all edits buffered will be re-scheduled. + */ + private void cleanup(final Channel chan) { + final ConnectionResetException exception = + new ConnectionResetException(getPeerUuidLoggingString() + "Connection reset on " + chan); + for (Iterator> ite = rpcs_inflight.values().iterator(); ite + .hasNext();) { + KuduRpc rpc = ite.next(); + failOrRetryRpc(rpc, exception); + ite.remove(); + } + + final ArrayList> rpcs; + synchronized (this) { + dead = true; + rpcs = pending_rpcs; + pending_rpcs = null; + } + if (rpcs != null) { + failOrRetryRpcs(rpcs, exception); + } + } + + /** + * Retry all the given RPCs. + * @param rpcs a possibly empty but non-{@code null} collection of RPCs to retry or fail + * @param exception an exception to propagate with the RPCs + */ + private void failOrRetryRpcs(final Collection> rpcs, + final ConnectionResetException exception) { + for (final KuduRpc rpc : rpcs) { + failOrRetryRpc(rpc, exception); + } + } + + /** + * Retry the given RPC. + * @param rpc an RPC to retry or fail + * @param exception an exception to propagate with the RPC + */ + private void failOrRetryRpc(final KuduRpc rpc, + final ConnectionResetException exception) { + AsyncKuduClient.RemoteTablet tablet = rpc.getTablet(); + if (tablet == null) { // Can't retry, dunno where this RPC should go. 
+ rpc.errback(exception); + } else { + kuduClient.handleTabletNotFound(rpc, exception, this); + } + } + + + @Override + public void exceptionCaught(final ChannelHandlerContext ctx, + final ExceptionEvent event) { + final Throwable e = event.getCause(); + final Channel c = event.getChannel(); + + if (e instanceof RejectedExecutionException) { + LOG.warn(getPeerUuidLoggingString() + "RPC rejected by the executor," + + " ignore this if we're shutting down", e); + } else if (e instanceof ReadTimeoutException) { + LOG.debug(getPeerUuidLoggingString() + "Encountered a read timeout"); + // Doing the cleanup here since we want to invalidate all the RPCs right _now_, and not let + // the ReplayingDecoder continue decoding through Channels.close() below. + cleanup(c); + } else { + LOG.error(getPeerUuidLoggingString() + "Unexpected exception from downstream on " + c, e); + } + if (c.isOpen()) { + Channels.close(c); // Will trigger channelClosed(), which will cleanup() + } else { // else: presumably a connection timeout. + cleanup(c); // => need to cleanup() from here directly. + } + } + + + private ChannelBuffer connectionHeaderPreamble() { + return ChannelBuffers.wrappedBuffer(RPC_HEADER); + } + + public void becomeReady(Channel chan) { + this.chan = chan; + sendQueuedRpcs(); + } + + /** + * Sends the queued RPCs to the server, once we're connected to it. 
+ * This gets called after {@link #channelConnected}, once we were able to + * handshake with the server + */ + private void sendQueuedRpcs() { + ArrayList> rpcs; + synchronized (this) { + rpcs = pending_rpcs; + pending_rpcs = null; + } + if (rpcs != null) { + for (final KuduRpc rpc : rpcs) { + LOG.debug(getPeerUuidLoggingString() + "Executing RPC queued: " + rpc); + sendRpc(rpc); + } + } + } + + void sendContext(Channel channel) { + Channels.write(channel, header()); + becomeReady(channel); + } + + private ChannelBuffer header() { + RpcHeader.ConnectionContextPB.Builder builder = RpcHeader.ConnectionContextPB.newBuilder(); + RpcHeader.UserInformationPB.Builder userBuilder = RpcHeader.UserInformationPB.newBuilder(); + userBuilder.setEffectiveUser(SecureRpcHelper.USER_AND_PASSWORD); // TODO set real user + userBuilder.setRealUser(SecureRpcHelper.USER_AND_PASSWORD); + builder.setUserInfo(userBuilder.build()); + RpcHeader.ConnectionContextPB pb = builder.build(); + RpcHeader.RequestHeader header = RpcHeader.RequestHeader.newBuilder().setCallId + (CONNECTION_CTX_CALL_ID).build(); + return KuduRpc.toChannelBuffer(header, pb); + } + + private String getPeerUuidLoggingString() { + return "[Peer " + uuid + "] "; + } + + /** + * Returns this tablet server's uuid. + * @return a string that contains this tablet server's uuid + */ + String getUuid() { + return uuid; + } + + public String toString() { + final StringBuilder buf = new StringBuilder(13 + 10 + 6 + 64 + 7 + 32 + 16 + 1 + 17 + 2 + 1); + buf.append("TabletClient@") // =13 + .append(hashCode()) // ~10 + .append("(chan=") // = 6 + .append(chan) // ~64 (up to 66 when using IPv4) + .append(", uuid=") // = 7 + .append(uuid) // = 32 + .append(", #pending_rpcs="); // =16 + int npending_rpcs; + synchronized (this) { + npending_rpcs = pending_rpcs == null ? 
0 : pending_rpcs.size(); + } + buf.append(npending_rpcs); // = 1 + buf.append(", #rpcs_inflight=") // =17 + .append(rpcs_inflight.size()) // ~ 2 + .append(')'); // = 1 + return buf.toString(); + } + +} diff --git a/java/kudu-client/src/main/java/org/kududb/client/TabletServerErrorException.java b/java/kudu-client/src/main/java/org/kududb/client/TabletServerErrorException.java new file mode 100644 index 000000000000..68fc54b90adf --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/TabletServerErrorException.java @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.WireProtocol; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.rpc.RpcHeader; + +/** + * This exception is thrown by Tablet Servers when something goes wrong processing a request. 
+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +@SuppressWarnings("serial") +public class TabletServerErrorException extends KuduServerException { + + TabletServerErrorException(String serverUuid, WireProtocol.AppStatusPB appStatus) { + super(serverUuid, appStatus); + } + + TabletServerErrorException(String serverUuid, RpcHeader.ErrorStatusPB errorStatus) { + super(serverUuid, errorStatus); + } +} + diff --git a/java/kudu-client/src/main/java/org/kududb/client/Update.java b/java/kudu-client/src/main/java/org/kududb/client/Update.java new file mode 100644 index 000000000000..3db202698a39 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/client/Update.java @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +/** + * Operation to update columns on an existing row. Instances of this class should not be reused. 
+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class Update extends Operation { + + Update(KuduTable table) { + super(table); + } + + @Override + ChangeType getChangeType() { + return ChangeType.UPDATE; + } +} \ No newline at end of file diff --git a/java/kudu-client/src/main/java/org/kududb/util/AsyncUtil.java b/java/kudu-client/src/main/java/org/kududb/util/AsyncUtil.java new file mode 100644 index 000000000000..e05c5dab84dd --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/util/AsyncUtil.java @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; +import org.kududb.annotations.InterfaceAudience; + +/** + * Utility methods for various parts of async, such as Deferred. + * TODO (KUDU-602): Some of these methods could eventually be contributed back to async or to a + * custom fork/derivative of async. + */ +@InterfaceAudience.Private +public class AsyncUtil { + + /** + * Register a callback and an "errback". + *

+ * This has the exact same effect as {@link Deferred#addCallbacks(Callback, Callback)} + * keeps the type information "correct" when the callback and errback return a + * {@code Deferred}. + * @param d The {@code Deferred} we want to add the callback and errback to. + * @param cb The callback to register. + * @param eb The errback to register. + * @return {@code d} with an "updated" type. + */ + @SuppressWarnings("unchecked") + public static , E> + Deferred addCallbacksDeferring(final Deferred d, + final Callback cb, + final Callback eb) { + return d.addCallbacks((Callback) ((Object) cb), + (Callback) ((Object) eb)); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/util/HybridTimeUtil.java b/java/kudu-client/src/main/java/org/kududb/util/HybridTimeUtil.java new file mode 100644 index 000000000000..31436e787d62 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/util/HybridTimeUtil.java @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import org.kududb.annotations.InterfaceAudience; + +import java.util.concurrent.TimeUnit; + +/** + * Set of common utility methods to handle HybridTime and related timestamps. 
+ */ +@InterfaceAudience.Private +public class HybridTimeUtil { + + public static final int hybridTimeNumBitsToShift = 12; + public static final int hybridTimeLogicalBitsMask = (1 << hybridTimeNumBitsToShift) - 1; + + /** + * Converts the provided timestamp, in the provided unit, to the HybridTime timestamp + * format. Logical bits are set to 0. + * + * @param timestamp the value of the timestamp, must be greater than 0 + * @param timeUnit the time unit of the timestamp + * @throws IllegalArgumentException if the timestamp is less than 0 + */ + public static long clockTimestampToHTTimestamp(long timestamp, TimeUnit timeUnit) { + if (timestamp < 0) { + throw new IllegalArgumentException("Timestamp cannot be less than 0"); + } + long timestampInMicros = TimeUnit.MICROSECONDS.convert(timestamp, timeUnit); + return timestampInMicros << hybridTimeNumBitsToShift; + } + + /** + * Extracts the physical and logical values from an HT timestamp. + * + * @param htTimestamp the encoded HT timestamp + * @return a pair of {physical, logical} long values in an array + */ + public static long[] HTTimestampToPhysicalAndLogical(long htTimestamp) { + long timestampInMicros = htTimestamp >> hybridTimeNumBitsToShift; + long logicalValues = htTimestamp & hybridTimeLogicalBitsMask; + return new long[] {timestampInMicros, logicalValues}; + } + + /** + * Encodes separate physical and logical components into a single HT timestamp + * + * @param physical the physical component, in microseconds + * @param logical the logical component + * @return an encoded HT timestamp + */ + public static long physicalAndLogicalToHTTimestamp(long physical, long logical) { + return (physical << hybridTimeNumBitsToShift) + logical; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/util/NetUtil.java b/java/kudu-client/src/main/java/org/kududb/util/NetUtil.java new file mode 100644 index 000000000000..1ff77a2ea026 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/util/NetUtil.java @@ -0,0 
+1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import com.google.common.base.Functions; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.net.HostAndPort; +import org.kududb.annotations.InterfaceAudience; + +import java.util.List; + +/** + * Networking related methods. + */ +@InterfaceAudience.Private +public class NetUtil { + + /** + * Convert a list of {@link HostAndPort} objects to a comma separate string. + * The inverse of {@link #parseStrings(String, int)}. + * + * @param hostsAndPorts A list of {@link HostAndPort} objects. + * @return Comma separate list of "host:port" pairs. + */ + public static String hostsAndPortsToString(List hostsAndPorts) { + return Joiner.on(",").join(Lists.transform(hostsAndPorts, Functions.toStringFunction())); + } + + /** + * Parse a "host:port" pair into a {@link HostAndPort} object. If there is no + * port specified in the string, then 'defaultPort' is used. + * + * @param addrString A host or a "host:port" pair. + * @param defaultPort Default port to use if no port is specified in addrString. 
+ * @return The HostAndPort object constructed from addrString. + */ + public static HostAndPort parseString(String addrString, int defaultPort) { + return addrString.indexOf(':') == -1 ? HostAndPort.fromParts(addrString, defaultPort) : + HostAndPort.fromString(addrString); + } + + /** + * Parse a comma separated list of "host:port" pairs into a list of + * {@link HostAndPort} objects. If no port is specified for an entry in + * the comma separated list, then a default port is used. + * The inverse of {@link #hostsAndPortsToString(List)}. + * + * @param commaSepAddrs The comma separated list of "host:port" pairs. + * @param defaultPort The default port to use if no port is specified. + * @return A list of HostAndPort objects constructed from commaSepAddrs. + */ + public static List parseStrings(final String commaSepAddrs, int defaultPort) { + Iterable addrStrings = Splitter.on(',').trimResults().split(commaSepAddrs); + List hostsAndPorts = Lists.newArrayListWithCapacity(Iterables.size(addrStrings)); + for (String addrString : addrStrings) { + HostAndPort hostAndPort = parseString(addrString, defaultPort); + hostsAndPorts.add(hostAndPort); + } + return hostsAndPorts; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/util/Pair.java b/java/kudu-client/src/main/java/org/kududb/util/Pair.java new file mode 100644 index 000000000000..341ec10d05e5 --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/util/Pair.java @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import com.google.common.base.Objects; +import org.kududb.annotations.InterfaceAudience; + +@InterfaceAudience.Private +public class Pair { + private final A first; + private final B second; + + public Pair(A first, B second) { + this.first = first; + this.second = second; + } + + public A getFirst() { + return first; + } + + public B getSecond() { + return second; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Pair pair = (Pair) o; + + if (first != null ? !first.equals(pair.first) : pair.first != null) return false; + if (second != null ? !second.equals(pair.second) : pair.second != null) return false; + + return true; + } + + @Override + public int hashCode() { + return Objects.hashCode(first, second); + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/util/Slice.java b/java/kudu-client/src/main/java/org/kududb/util/Slice.java new file mode 100644 index 000000000000..a2d5ad1b8d2e --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/util/Slice.java @@ -0,0 +1,699 @@ +/* + * Copyright 2009 Red Hat, Inc. + * + * Red Hat licenses this file to you under the Apache License, version 2.0 + * (the "License"); you may not use this file except in compliance with the + * License. 
You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + */ +package org.kududb.util; + +import com.google.common.base.Preconditions; +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; +import com.google.common.primitives.Shorts; +import org.kududb.annotations.InterfaceAudience; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.channels.ClosedChannelException; +import java.nio.channels.FileChannel; +import java.nio.channels.GatheringByteChannel; +import java.nio.channels.ScatteringByteChannel; +import java.nio.charset.Charset; +import java.util.Arrays; + +import static java.nio.ByteOrder.LITTLE_ENDIAN; + +/** + * Little Endian slice of a byte array. + */ +@InterfaceAudience.Private +public final class Slice implements Comparable +{ + private final byte[] data; + private final int offset; + private final int length; + + private int hash; + + public Slice(int length) + { + data = new byte[length]; + this.offset = 0; + this.length = length; + } + + public Slice(byte[] data) + { + Preconditions.checkNotNull(data, "array is null"); + this.data = data; + this.offset = 0; + this.length = data.length; + } + + public Slice(byte[] data, int offset, int length) + { + Preconditions.checkNotNull(data, "array is null"); + this.data = data; + this.offset = offset; + this.length = length; + } + + /** + * Length of this slice. + */ + public int length() + { + return length; + } + + /** + * Gets the array underlying this slice. 
+ */ + public byte[] getRawArray() + { + return data; + } + + /** + * Gets the offset of this slice in the underlying array. + */ + public int getRawOffset() + { + return offset; + } + + /** + * Gets a byte at the specified absolute {@code index} in this buffer. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 1} is greater than {@code this.capacity} + */ + public byte getByte(int index) + { + Preconditions.checkPositionIndexes(index, index + 1, this.length); + index += offset; + return data[index]; + } + + /** + * Gets an unsigned byte at the specified absolute {@code index} in this + * buffer. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 1} is greater than {@code this.capacity} + */ + public short getUnsignedByte(int index) + { + return (short) (getByte(index) & 0xFF); + } + + /** + * Gets a 16-bit short integer at the specified absolute {@code index} in + * this slice. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 2} is greater than {@code this.capacity} + */ + public short getShort(int index) + { + Preconditions.checkPositionIndexes(index, index + Shorts.BYTES, this.length); + index += offset; + return (short) (data[index] & 0xFF | data[index + 1] << 8); + } + + /** + * Gets a 32-bit integer at the specified absolute {@code index} in + * this buffer. 
+ * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 4} is greater than {@code this.capacity} + */ + public int getInt(int index) + { + Preconditions.checkPositionIndexes(index, index + Ints.BYTES, this.length); + index += offset; + return (data[index] & 0xff) | + (data[index + 1] & 0xff) << 8 | + (data[index + 2] & 0xff) << 16 | + (data[index + 3] & 0xff) << 24; + } + + /** + * Gets a 64-bit long integer at the specified absolute {@code index} in + * this buffer. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 8} is greater than {@code this.capacity} + */ + public long getLong(int index) + { + Preconditions.checkPositionIndexes(index, index + Longs.BYTES, this.length); + index += offset; + return ((long) data[index] & 0xff) | + ((long) data[index + 1] & 0xff) << 8 | + ((long) data[index + 2] & 0xff) << 16 | + ((long) data[index + 3] & 0xff) << 24 | + ((long) data[index + 4] & 0xff) << 32 | + ((long) data[index + 5] & 0xff) << 40 | + ((long) data[index + 6] & 0xff) << 48 | + ((long) data[index + 7] & 0xff) << 56; + } + + /** + * Transfers this buffer's data to the specified destination starting at + * the specified absolute {@code index}. + * + * @param dstIndex the first index of the destination + * @param length the number of bytes to transfer + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0}, + * if the specified {@code dstIndex} is less than {@code 0}, + * if {@code index + length} is greater than + * {@code this.capacity}, or + * if {@code dstIndex + length} is greater than + * {@code dst.capacity} + */ + public void getBytes(int index, Slice dst, int dstIndex, int length) + { + getBytes(index, dst.data, dstIndex, length); + } + + /** + * Transfers this buffer's data to the specified destination starting at + * the specified absolute {@code index}. 
+ * + * @param destinationIndex the first index of the destination + * @param length the number of bytes to transfer + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0}, + * if the specified {@code dstIndex} is less than {@code 0}, + * if {@code index + length} is greater than + * {@code this.capacity}, or + * if {@code dstIndex + length} is greater than + * {@code dst.length} + */ + public void getBytes(int index, byte[] destination, int destinationIndex, int length) + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + Preconditions.checkPositionIndexes(destinationIndex, destinationIndex + length, destination.length); + index += offset; + System.arraycopy(data, index, destination, destinationIndex, length); + } + + public byte[] getBytes() + { + return getBytes(0, length); + } + + public byte[] getBytes(int index, int length) + { + index += offset; + if (index == 0) { + return Arrays.copyOf(data, length); + } else { + byte[] value = new byte[length]; + System.arraycopy(data, index, value, 0, length); + return value; + } + } + + /** + * Transfers this buffer's data to the specified destination starting at + * the specified absolute {@code index} until the destination's position + * reaches its limit. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * if {@code index + dst.remaining()} is greater than + * {@code this.capacity} + */ + public void getBytes(int index, ByteBuffer destination) + { + Preconditions.checkPositionIndex(index, this.length); + index += offset; + destination.put(data, index, Math.min(length, destination.remaining())); + } + + /** + * Transfers this buffer's data to the specified stream starting at the + * specified absolute {@code index}. 
+ * + * @param length the number of bytes to transfer + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * if {@code index + length} is greater than + * {@code this.capacity} + * @throws java.io.IOException if the specified stream threw an exception during I/O + */ + public void getBytes(int index, OutputStream out, int length) + throws IOException + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + out.write(data, index, length); + } + + /** + * Transfers this buffer's data to the specified channel starting at the + * specified absolute {@code index}. + * + * @param length the maximum number of bytes to transfer + * @return the actual number of bytes written out to the specified channel + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * if {@code index + length} is greater than + * {@code this.capacity} + * @throws java.io.IOException if the specified channel threw an exception during I/O + */ + public int getBytes(int index, GatheringByteChannel out, int length) + throws IOException + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + return out.write(ByteBuffer.wrap(data, index, length)); + } + + /** + * Sets the specified 16-bit short integer at the specified absolute + * {@code index} in this buffer. The 16 high-order bits of the specified + * value are ignored. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 2} is greater than {@code this.capacity} + */ + public void setShort(int index, int value) + { + Preconditions.checkPositionIndexes(index, index + Shorts.BYTES, this.length); + index += offset; + data[index] = (byte) (value); + data[index + 1] = (byte) (value >>> 8); + } + + /** + * Sets the specified 32-bit integer at the specified absolute + * {@code index} in this buffer. 
+ * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 4} is greater than {@code this.capacity} + */ + public void setInt(int index, int value) + { + Preconditions.checkPositionIndexes(index, index + Ints.BYTES, this.length); + index += offset; + data[index] = (byte) (value); + data[index + 1] = (byte) (value >>> 8); + data[index + 2] = (byte) (value >>> 16); + data[index + 3] = (byte) (value >>> 24); + } + + /** + * Sets the specified 64-bit long integer at the specified absolute + * {@code index} in this buffer. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 8} is greater than {@code this.capacity} + */ + public void setLong(int index, long value) + { + Preconditions.checkPositionIndexes(index, index + Longs.BYTES, this.length); + index += offset; + data[index] = (byte) (value); + data[index + 1] = (byte) (value >>> 8); + data[index + 2] = (byte) (value >>> 16); + data[index + 3] = (byte) (value >>> 24); + data[index + 4] = (byte) (value >>> 32); + data[index + 5] = (byte) (value >>> 40); + data[index + 6] = (byte) (value >>> 48); + data[index + 7] = (byte) (value >>> 56); + } + + /** + * Sets the specified byte at the specified absolute {@code index} in this + * buffer. The 24 high-order bits of the specified value are ignored. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * {@code index + 1} is greater than {@code this.capacity} + */ + public void setByte(int index, int value) + { + Preconditions.checkPositionIndexes(index, index + 1, this.length); + index += offset; + data[index] = (byte) value; + } + + /** + * Transfers the specified source buffer's data to this buffer starting at + * the specified absolute {@code index}. 
+ * + * @param srcIndex the first index of the source + * @param length the number of bytes to transfer + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0}, + * if the specified {@code srcIndex} is less than {@code 0}, + * if {@code index + length} is greater than + * {@code this.capacity}, or + * if {@code srcIndex + length} is greater than + * {@code src.capacity} + */ + public void setBytes(int index, Slice src, int srcIndex, int length) + { + setBytes(index, src.data, src.offset + srcIndex, length); + } + + /** + * Transfers the specified source array's data to this buffer starting at + * the specified absolute {@code index}. + * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0}, + * if the specified {@code srcIndex} is less than {@code 0}, + * if {@code index + length} is greater than + * {@code this.capacity}, or + * if {@code srcIndex + length} is greater than {@code src.length} + */ + public void setBytes(int index, byte[] source, int sourceIndex, int length) + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + Preconditions.checkPositionIndexes(sourceIndex, sourceIndex + length, source.length); + index += offset; + System.arraycopy(source, sourceIndex, data, index, length); + } + + /** + * Transfers the specified source buffer's data to this buffer starting at + * the specified absolute {@code index} until the source buffer's position + * reaches its limit. 
+ * + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * if {@code index + src.remaining()} is greater than + * {@code this.capacity} + */ + public void setBytes(int index, ByteBuffer source) + { + Preconditions.checkPositionIndexes(index, index + source.remaining(), this.length); + index += offset; + source.get(data, index, source.remaining()); + } + + /** + * Transfers the content of the specified source stream to this buffer + * starting at the specified absolute {@code index}. + * + * @param length the number of bytes to transfer + * @return the actual number of bytes read in from the specified channel. + * {@code -1} if the specified channel is closed. + * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * if {@code index + length} is greater than {@code this.capacity} + * @throws java.io.IOException if the specified stream threw an exception during I/O + */ + public int setBytes(int index, InputStream in, int length) + throws IOException + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + int readBytes = 0; + do { + int localReadBytes = in.read(data, index, length); + if (localReadBytes < 0) { + if (readBytes == 0) { + return -1; + } + else { + break; + } + } + readBytes += localReadBytes; + index += localReadBytes; + length -= localReadBytes; + } while (length > 0); + + return readBytes; + } + + /** + * Transfers the content of the specified source channel to this buffer + * starting at the specified absolute {@code index}. + * + * @param length the maximum number of bytes to transfer + * @return the actual number of bytes read in from the specified channel. + * {@code -1} if the specified channel is closed. 
+ * @throws IndexOutOfBoundsException if the specified {@code index} is less than {@code 0} or + * if {@code index + length} is greater than {@code this.capacity} + * @throws java.io.IOException if the specified channel threw an exception during I/O + */ + public int setBytes(int index, ScatteringByteChannel in, int length) + throws IOException + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + ByteBuffer buf = ByteBuffer.wrap(data, index, length); + int readBytes = 0; + + do { + int localReadBytes; + try { + localReadBytes = in.read(buf); + } + catch (ClosedChannelException e) { + localReadBytes = -1; + } + if (localReadBytes < 0) { + if (readBytes == 0) { + return -1; + } + else { + break; + } + } + else if (localReadBytes == 0) { + break; + } + readBytes += localReadBytes; + } while (readBytes < length); + + return readBytes; + } + + public int setBytes(int index, FileChannel in, int position, int length) + throws IOException + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + ByteBuffer buf = ByteBuffer.wrap(data, index, length); + int readBytes = 0; + + do { + int localReadBytes; + try { + localReadBytes = in.read(buf, position + readBytes); + } + catch (ClosedChannelException e) { + localReadBytes = -1; + } + if (localReadBytes < 0) { + if (readBytes == 0) { + return -1; + } + else { + break; + } + } + else if (localReadBytes == 0) { + break; + } + readBytes += localReadBytes; + } while (readBytes < length); + + return readBytes; + } + + public Slice copySlice() + { + return copySlice(0, length); + } + + /** + * Returns a copy of this buffer's sub-region. Modifying the content of + * the returned buffer or this buffer does not affect each other at all. 
+ */ + public Slice copySlice(int index, int length) + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + + index += offset; + byte[] copiedArray = new byte[length]; + System.arraycopy(data, index, copiedArray, 0, length); + return new Slice(copiedArray); + } + + public byte[] copyBytes() + { + return copyBytes(0, length); + } + + public byte[] copyBytes(int index, int length) + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + if (index == 0) { + return Arrays.copyOf(data, length); + } else { + byte[] value = new byte[length]; + System.arraycopy(data, index, value, 0, length); + return value; + } + } + + /** + * Returns a slice of this buffer's readable bytes. Modifying the content + * of the returned buffer or this buffer affects each other's content + * while they maintain separate indexes and marks. + */ + public Slice slice() + { + return slice(0, length); + } + + /** + * Returns a slice of this buffer's sub-region. Modifying the content of + * the returned buffer or this buffer affects each other's content while + * they maintain separate indexes and marks. + */ + public Slice slice(int index, int length) + { + if (index == 0 && length == this.length) { + return this; + } + + Preconditions.checkPositionIndexes(index, index + length, this.length); + if (index >= 0 && length == 0) { + return Slices.EMPTY_SLICE; + } + return new Slice(data, offset + index, length); + } + + /** + * Converts this buffer's readable bytes into a NIO buffer. The returned + * buffer shares the content with this buffer. + */ + public ByteBuffer toByteBuffer() + { + return toByteBuffer(0, length); + } + + /** + * Converts this buffer's sub-region into a NIO buffer. The returned + * buffer shares the content with this buffer. 
+ */ + public ByteBuffer toByteBuffer(int index, int length) + { + Preconditions.checkPositionIndexes(index, index + length, this.length); + index += offset; + return ByteBuffer.wrap(data, index, length).order(LITTLE_ENDIAN); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + Slice slice = (Slice) o; + + // do lengths match + if (length != slice.length) { + return false; + } + + // if arrays have same base offset, some optimizations can be taken... + if (offset == slice.offset && data == slice.data) { + return true; + } + for (int i = 0; i < length; i++) { + if (data[offset + i] != slice.data[slice.offset + i]) { + return false; + } + } + return true; + } + + @Override + public int hashCode() + { + if (hash != 0) { + return hash; + } + + int result = length; + for (int i = offset; i < offset + length; i++) { + result = 31 * result + data[i]; + } + if (result == 0) { + result = 1; + } + hash = result; + return hash; + } + + /** + * Compares the content of the specified buffer to the content of this + * buffer. This comparison is performed byte by byte using an unsigned + * comparison. + */ + public int compareTo(Slice that) + { + if (this == that) { + return 0; + } + if (this.data == that.data && length == that.length && offset == that.offset) { + return 0; + } + + int minLength = Math.min(this.length, that.length); + for (int i = 0; i < minLength; i++) { + int thisByte = 0xFF & this.data[this.offset + i]; + int thatByte = 0xFF & that.data[that.offset + i]; + if (thisByte != thatByte) { + return (thisByte) - (thatByte); + } + } + return this.length - that.length; + } + + /** + * Decodes this buffer's readable bytes into a string with the specified + * character set name. 
+ */ + public String toString(Charset charset) + { + return toString(0, length, charset); + } + + /** + * Decodes this buffer's sub-region into a string with the specified + * character set. + */ + public String toString(int index, int length, Charset charset) + { + if (length == 0) { + return ""; + } + + return Slices.decodeString(toByteBuffer(index, length), charset); + } + + public String toString() + { + return getClass().getSimpleName() + '(' + + "length=" + length() + + ')'; + } +} diff --git a/java/kudu-client/src/main/java/org/kududb/util/Slices.java b/java/kudu-client/src/main/java/org/kududb/util/Slices.java new file mode 100644 index 000000000000..7fb9f17e861e --- /dev/null +++ b/java/kudu-client/src/main/java/org/kududb/util/Slices.java @@ -0,0 +1,261 @@ +/** + * Copyright (C) 2011 the original author or authors. + * + * See the LICENSE.txt file distributed with this work for additional + * information regarding copyright ownership. + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.kududb.util; + +import com.google.common.base.Preconditions; +import org.kududb.annotations.InterfaceAudience; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.util.IdentityHashMap; +import java.util.Map; + +@InterfaceAudience.Private +public final class Slices +{ + /** + * A buffer whose capacity is {@code 0}. + */ + public static final Slice EMPTY_SLICE = new Slice(0); + + private Slices() + { + } + + public static Slice ensureSize(Slice existingSlice, int minWritableBytes) + { + if (existingSlice == null) { + existingSlice = EMPTY_SLICE; + } + + if (minWritableBytes <= existingSlice.length()) { + return existingSlice; + } + + int newCapacity; + if (existingSlice.length() == 0) { + newCapacity = 1; + } + else { + newCapacity = existingSlice.length(); + } + int minNewCapacity = existingSlice.length() + minWritableBytes; + while (newCapacity < minNewCapacity) { + newCapacity <<= 1; + } + + Slice newSlice = Slices.allocate(newCapacity); + newSlice.setBytes(0, existingSlice, 0, existingSlice.length()); + return newSlice; + } + + public static Slice allocate(int capacity) + { + if (capacity == 0) { + return EMPTY_SLICE; + } + return new Slice(capacity); + } + + public static Slice wrappedBuffer(byte[] array) + { + if (array.length == 0) { + return EMPTY_SLICE; + } + return new Slice(array); + } + + public static Slice copiedBuffer(ByteBuffer source, int sourceOffset, int length) + { + Preconditions.checkNotNull(source, "source is null"); + int newPosition = source.position() + sourceOffset; + return copiedBuffer((ByteBuffer) source.duplicate().order(ByteOrder.LITTLE_ENDIAN).clear().limit(newPosition + length).position(newPosition)); + } + + public static 
Slice copiedBuffer(ByteBuffer source) + { + Preconditions.checkNotNull(source, "source is null"); + Slice copy = allocate(source.limit() - source.position()); + copy.setBytes(0, source.duplicate().order(ByteOrder.LITTLE_ENDIAN)); + return copy; + } + + public static Slice copiedBuffer(String string, Charset charset) + { + Preconditions.checkNotNull(string, "string is null"); + Preconditions.checkNotNull(charset, "charset is null"); + + return wrappedBuffer(string.getBytes(charset)); + } + + public static ByteBuffer encodeString(CharBuffer src, Charset charset) + { + final CharsetEncoder encoder = getEncoder(charset); + final ByteBuffer dst = ByteBuffer.allocate( + (int) ((double) src.remaining() * encoder.maxBytesPerChar())); + try { + CoderResult cr = encoder.encode(src, dst, true); + if (!cr.isUnderflow()) { + cr.throwException(); + } + cr = encoder.flush(dst); + if (!cr.isUnderflow()) { + cr.throwException(); + } + } + catch (CharacterCodingException x) { + throw new IllegalStateException(x); + } + dst.flip(); + return dst; + } + + public static String decodeString(ByteBuffer src, Charset charset) + { + final CharsetDecoder decoder = getDecoder(charset); + final CharBuffer dst = CharBuffer.allocate( + (int) ((double) src.remaining() * decoder.maxCharsPerByte())); + try { + CoderResult cr = decoder.decode(src, dst, true); + if (!cr.isUnderflow()) { + cr.throwException(); + } + cr = decoder.flush(dst); + if (!cr.isUnderflow()) { + cr.throwException(); + } + } + catch (CharacterCodingException x) { + throw new IllegalStateException(x); + } + return dst.flip().toString(); + } + + /** + * Toggles the endianness of the specified 16-bit short integer. + */ + public static short swapShort(short value) + { + return (short) (value << 8 | value >>> 8 & 0xff); + } + + /** + * Toggles the endianness of the specified 32-bit integer. 
+ */ + public static int swapInt(int value) + { + return swapShort((short) value) << 16 | + swapShort((short) (value >>> 16)) & 0xffff; + } + + /** + * Toggles the endianness of the specified 64-bit long integer. + */ + public static long swapLong(long value) + { + return (long) swapInt((int) value) << 32 | + swapInt((int) (value >>> 32)) & 0xffffffffL; + } + + private static final ThreadLocal> encoders = + new ThreadLocal>() + { + @Override + protected Map initialValue() + { + return new IdentityHashMap(); + } + }; + + private static final ThreadLocal> decoders = + new ThreadLocal>() + { + @Override + protected Map initialValue() + { + return new IdentityHashMap(); + } + }; + + /** + * Returns a cached thread-local {@link CharsetEncoder} for the specified + * charset. + */ + private static CharsetEncoder getEncoder(Charset charset) + { + if (charset == null) { + throw new NullPointerException("charset"); + } + + Map map = encoders.get(); + CharsetEncoder e = map.get(charset); + if (e != null) { + e.reset(); + e.onMalformedInput(CodingErrorAction.REPLACE); + e.onUnmappableCharacter(CodingErrorAction.REPLACE); + return e; + } + + e = charset.newEncoder(); + e.onMalformedInput(CodingErrorAction.REPLACE); + e.onUnmappableCharacter(CodingErrorAction.REPLACE); + map.put(charset, e); + return e; + } + + + /** + * Returns a cached thread-local {@link CharsetDecoder} for the specified + * charset. 
+ */ + private static CharsetDecoder getDecoder(Charset charset) + { + if (charset == null) { + throw new NullPointerException("charset"); + } + + Map map = decoders.get(); + CharsetDecoder d = map.get(charset); + if (d != null) { + d.reset(); + d.onMalformedInput(CodingErrorAction.REPLACE); + d.onUnmappableCharacter(CodingErrorAction.REPLACE); + return d; + } + + d = charset.newDecoder(); + d.onMalformedInput(CodingErrorAction.REPLACE); + d.onUnmappableCharacter(CodingErrorAction.REPLACE); + map.put(charset, d); + return d; + } + +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/BaseKuduTest.java b/java/kudu-client/src/test/java/org/kududb/client/BaseKuduTest.java new file mode 100644 index 000000000000..c6f06b50a950 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/BaseKuduTest.java @@ -0,0 +1,356 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import com.google.common.collect.ImmutableList; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.master.Master; +import org.kududb.util.NetUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Stopwatch; +import com.google.common.collect.Lists; +import com.google.common.net.HostAndPort; +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; + +public class BaseKuduTest { + + private static final Logger LOG = LoggerFactory.getLogger(BaseKuduTest.class); + + private static final String NUM_MASTERS_PROP = "NUM_MASTERS"; + private static final int NUM_TABLET_SERVERS = 3; + private static final int DEFAULT_NUM_MASTERS = 1; + + // Number of masters that will be started for this test if we're starting + // a cluster. + private static final int NUM_MASTERS = + Integer.getInteger(NUM_MASTERS_PROP, DEFAULT_NUM_MASTERS); + + private static MiniKuduCluster miniCluster; + + // Comma separate describing the master addresses and ports. + protected static String masterAddresses; + protected static List masterHostPorts; + + protected static final int DEFAULT_SLEEP = 50000; + + // We create both versions of the client for ease of use. 
+ protected static AsyncKuduClient client; + protected static KuduClient syncClient; + protected static Schema basicSchema = getBasicSchema(); + protected static Schema allTypesSchema = getSchemaWithAllTypes(); + + private static List tableNames = new ArrayList<>(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + LOG.info("Setting up before class..."); + + miniCluster = new MiniKuduCluster.MiniKuduClusterBuilder() + .numMasters(NUM_MASTERS) + .numTservers(NUM_TABLET_SERVERS) + .defaultTimeoutMs(DEFAULT_SLEEP) + .build(); + masterAddresses = miniCluster.getMasterAddresses(); + masterHostPorts = miniCluster.getMasterHostPorts(); + + LOG.info("Creating new Kudu client..."); + client = new AsyncKuduClient.AsyncKuduClientBuilder(masterAddresses).build(); + syncClient = new KuduClient(client); + LOG.info("Waiting for tablet servers..."); + if (!miniCluster.waitForTabletServers(NUM_TABLET_SERVERS)) { + fail("Couldn't get " + NUM_TABLET_SERVERS + " tablet servers running, aborting"); + } + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + try { + if (client != null) { + Deferred> d = client.shutdown(); + d.addErrback(defaultErrorCB); + d.join(DEFAULT_SLEEP); + // No need to explicitly shutdown the sync client, + // shutting down the async client effectively does that. 
+ } + } finally { + miniCluster.shutdown(); + } + } + + protected static KuduTable createTable(String tableName, Schema schema, + CreateTableOptions builder) { + LOG.info("Creating table: {}", tableName); + Deferred d = client.createTable(tableName, schema, builder); + final AtomicBoolean gotError = new AtomicBoolean(false); + d.addErrback(new Callback() { + @Override + public Object call(Object arg) throws Exception { + gotError.set(true); + LOG.error("Error : " + arg); + return null; + } + }); + KuduTable table = null; + try { + table = d.join(DEFAULT_SLEEP); + } catch (Exception e) { + fail("Timed out"); + } + if (gotError.get()) { + fail("Got error during table creation, is the Kudu master running at " + + masterAddresses + "?"); + } + tableNames.add(tableName); + return table; + } + + /** + * Counts the rows from the {@code scanner} until exhaustion. It doesn't require the scanner to + * be new, so it can be used to finish scanning a previously-started scan. + */ + protected static int countRowsInScan(AsyncKuduScanner scanner) + throws Exception { + final AtomicInteger counter = new AtomicInteger(); + + Callback cb = new Callback() { + @Override + public Object call(RowResultIterator arg) throws Exception { + if (arg == null) return null; + counter.addAndGet(arg.getNumRows()); + return null; + } + }; + + while (scanner.hasMoreRows()) { + Deferred data = scanner.nextRows(); + data.addCallbacks(cb, defaultErrorCB); + data.join(DEFAULT_SLEEP); + } + + Deferred closer = scanner.close(); + closer.addCallbacks(cb, defaultErrorCB); + closer.join(DEFAULT_SLEEP); + return counter.get(); + } + + protected List scanTableToStrings(KuduTable table) throws Exception { + List rowStrings = Lists.newArrayList(); + KuduScanner scanner = syncClient.newScannerBuilder(table).build(); + while (scanner.hasMoreRows()) { + RowResultIterator rows = scanner.nextRows(); + for (RowResult r : rows) { + rowStrings.add(r.rowToString()); + } + } + Collections.sort(rowStrings); + return 
rowStrings; + } + + private static final int[] KEYS = new int[] {10, 20, 30}; + protected static KuduTable createFourTabletsTableWithNineRows(String tableName) throws + Exception { + CreateTableOptions builder = new CreateTableOptions(); + for (int i : KEYS) { + PartialRow splitRow = basicSchema.newPartialRow(); + splitRow.addInt(0, i); + builder.addSplitRow(splitRow); + } + KuduTable table = createTable(tableName, basicSchema, builder); + AsyncKuduSession session = client.newSession(); + + // create a table with on empty tablet and 3 tablets of 3 rows each + for (int key1 : KEYS) { + for (int key2 = 1; key2 <= 3; key2++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addInt(0, key1 + key2); + row.addInt(1, key1); + row.addInt(2, key2); + row.addString(3, "a string"); + row.addBoolean(4, true); + session.apply(insert).join(DEFAULT_SLEEP); + } + } + session.close().join(DEFAULT_SLEEP); + return table; + } + + public static Schema getSchemaWithAllTypes() { + List columns = + ImmutableList.of( + new ColumnSchema.ColumnSchemaBuilder("int8", Type.INT8).key(true).build(), + new ColumnSchema.ColumnSchemaBuilder("int16", Type.INT16).build(), + new ColumnSchema.ColumnSchemaBuilder("int32", Type.INT32).build(), + new ColumnSchema.ColumnSchemaBuilder("int64", Type.INT64).build(), + new ColumnSchema.ColumnSchemaBuilder("bool", Type.BOOL).build(), + new ColumnSchema.ColumnSchemaBuilder("float", Type.FLOAT).build(), + new ColumnSchema.ColumnSchemaBuilder("double", Type.DOUBLE).build(), + new ColumnSchema.ColumnSchemaBuilder("string", Type.STRING).build(), + new ColumnSchema.ColumnSchemaBuilder("binary-array", Type.BINARY).build(), + new ColumnSchema.ColumnSchemaBuilder("binary-bytebuffer", Type.BINARY).build(), + new ColumnSchema.ColumnSchemaBuilder("null", Type.STRING).nullable(true).build(), + new ColumnSchema.ColumnSchemaBuilder("timestamp", Type.TIMESTAMP).build()); + + return new Schema(columns); + } + + public static Schema 
getBasicSchema() { + ArrayList columns = new ArrayList(5); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key", Type.INT32).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column1_i", Type.INT32).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column2_i", Type.INT32).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column3_s", Type.STRING) + .nullable(true) + .desiredBlockSize(4096) + .encoding(ColumnSchema.Encoding.DICT_ENCODING) + .compressionAlgorithm(ColumnSchema.CompressionAlgorithm.LZ4) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("column4_b", Type.BOOL).build()); + return new Schema(columns); + } + + protected Insert createBasicSchemaInsert(KuduTable table, int key) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addInt(0, key); + row.addInt(1, 2); + row.addInt(2, 3); + row.addString(3, "a string"); + row.addBoolean(4, true); + return insert; + } + + static Callback defaultErrorCB = new Callback() { + @Override + public Object call(Object arg) throws Exception { + if (arg == null) return null; + if (arg instanceof Exception) { + LOG.warn("Got exception", (Exception) arg); + } else { + LOG.warn("Got an error response back " + arg); + } + return new Exception("Can't recover from error, see previous WARN"); + } + }; + + /** + * Helper method to open a table. It sets the default sleep time when joining on the Deferred. + * @param name Name of the table + * @return A KuduTable + * @throws Exception MasterErrorException if the table doesn't exist + */ + protected static KuduTable openTable(String name) throws Exception { + Deferred d = client.openTable(name); + return d.join(DEFAULT_SLEEP); + } + + /** + * Helper method to easily kill a tablet server that serves the given table's only tablet's + * leader. 
The currently running test case will be failed if there's more than one tablet, + * if the tablet has no leader after some retries, or if the tablet server was already killed. + * + * This method is thread-safe. + * @param table a KuduTable which will get its single tablet's leader killed. + * @throws Exception + */ + protected static void killTabletLeader(KuduTable table) throws Exception { + LocatedTablet.Replica leader = null; + DeadlineTracker deadlineTracker = new DeadlineTracker(); + deadlineTracker.setDeadline(DEFAULT_SLEEP); + while (leader == null) { + if (deadlineTracker.timedOut()) { + fail("Timed out while trying to find a leader for this table: " + table.getName()); + } + List tablets = table.getTabletsLocations(DEFAULT_SLEEP); + if (tablets.isEmpty() || tablets.size() > 1) { + fail("Currently only support killing leaders for tables containing 1 tablet, table " + + table.getName() + " has " + tablets.size()); + } + LocatedTablet tablet = tablets.get(0); + if (tablet.getReplicas().size() == 1) { + fail("Table " + table.getName() + " only has 1 tablet, please enable replication"); + } + leader = tablet.getLeaderReplica(); + if (leader == null) { + LOG.info("Sleeping while waiting for a tablet LEADER to arise, currently slept " + + deadlineTracker.getElapsedMillis() + "ms"); + Thread.sleep(50); + } + } + + Integer port = leader.getRpcPort(); + miniCluster.killTabletServerOnPort(port); + } + + /** + * Helper method to easily kill the leader master. + * + * This method is thread-safe. + * @throws Exception If there is an error finding or killing the leader master. + */ + protected static void killMasterLeader() throws Exception { + int leaderPort = findLeaderMasterPort(); + miniCluster.killMasterOnPort(leaderPort); + } + + /** + * Find the port of the leader master in order to retrieve it from the port to process map. + * @return The port of the leader master. + * @throws Exception If we are unable to find the leader master. 
+ */ + protected static int findLeaderMasterPort() throws Exception { + Stopwatch sw = new Stopwatch().start(); + int leaderPort = -1; + while (leaderPort == -1 && sw.elapsedMillis() < DEFAULT_SLEEP) { + Deferred masterLocD = client.getMasterTableLocationsPB(); + Master.GetTableLocationsResponsePB r = masterLocD.join(DEFAULT_SLEEP); + leaderPort = r.getTabletLocations(0) + .getReplicas(0) + .getTsInfo() + .getRpcAddresses(0) + .getPort(); + } + if (leaderPort == -1) { + fail("No leader master found after " + DEFAULT_SLEEP + " ms."); + } + return leaderPort; + } + + /** + * Return the comma-separated list of "host:port" pairs that describes the master + * config for this cluster. + * @return The master config string. + */ + protected static String getMasterAddresses() { + return masterAddresses; + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/MiniKuduCluster.java b/java/kudu-client/src/test/java/org/kududb/client/MiniKuduCluster.java new file mode 100644 index 000000000000..9bd5442c6814 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/MiniKuduCluster.java @@ -0,0 +1,427 @@ +/** + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. 
+ */ +package org.kududb.client; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.base.Stopwatch; +import com.google.common.collect.Lists; +import com.google.common.net.HostAndPort; +import org.apache.commons.io.FileUtils; +import org.kududb.util.NetUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.File; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import sun.management.VMManagement; + +/** + * Utility class to start and manipulate Kudu clusters. Relies on being IN the Kudu source code with + * both the kudu-master and kudu-tserver binaries already compiled. {@link BaseKuduTest} should be + * extended instead of directly using this class in almost all cases. + */ +public class MiniKuduCluster implements AutoCloseable { + + private static final Logger LOG = LoggerFactory.getLogger(MiniKuduCluster.class); + + // TS and Master ports will be assigned starting with this one. + private static final int PORT_START = 64030; + + // List of threads that print + private final List PROCESS_INPUT_PRINTERS = new ArrayList<>(); + + // Map of ports to master servers. + private final Map masterProcesses = new ConcurrentHashMap<>(); + + // Map of ports to tablet servers. + private final Map tserverProcesses = new ConcurrentHashMap<>(); + + private final List pathsToDelete = new ArrayList<>(); + private final List masterHostPorts = new ArrayList<>(); + + // Client we can use for common operations. 
+ private final KuduClient syncClient; + private final int defaultTimeoutMs; + + private String masterAddresses; + + private MiniKuduCluster(int numMasters, int numTservers, int defaultTimeoutMs) throws Exception { + this.defaultTimeoutMs = defaultTimeoutMs; + + startCluster(numMasters, numTservers); + + syncClient = new KuduClient.KuduClientBuilder(getMasterAddresses()) + .defaultAdminOperationTimeoutMs(defaultTimeoutMs) + .defaultOperationTimeoutMs(defaultTimeoutMs) + .build(); + } + + /** + * Wait up to this instance's "default timeout" for an expected count of TS to + * connect to the master. + * @param expected How many TS are expected + * @return true if there are at least as many TS as expected, otherwise false + */ + public boolean waitForTabletServers(int expected) throws Exception { + int count = 0; + Stopwatch stopwatch = new Stopwatch().start(); + while (count < expected && stopwatch.elapsedMillis() < defaultTimeoutMs) { + Thread.sleep(200); + count = syncClient.listTabletServers().getTabletServersCount(); + } + return count >= expected; + } + + /** + * @return the local PID of this process. + * This is used to generate unique loopback IPs for parallel test running. + */ + private static int getPid() { + try { + RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean(); + java.lang.reflect.Field jvm = runtime.getClass().getDeclaredField("jvm"); + jvm.setAccessible(true); + VMManagement mgmt = (VMManagement)jvm.get(runtime); + Method pid_method = mgmt.getClass().getDeclaredMethod("getProcessId"); + pid_method.setAccessible(true); + + return (Integer)pid_method.invoke(mgmt); + } catch (Exception e) { + LOG.warn("Cannot get PID", e); + return 1; + } + } + + /** + * @return a unique loopback IP address for this PID. This allows running + * tests in parallel, since 127.0.0.0/8 all act as loopbacks on Linux. + * + * The generated IP is based on pid, so this requires that the parallel tests + * run in separate VMs. 
+ * + * On OSX, the above trick doesn't work, so we can't run parallel tests on OSX. + * Given that, we just return the normal localhost IP. + */ + private static String getUniqueLocalhost() { + if ("Mac OS X".equals(System.getProperty("os.name"))) { + return "127.0.0.1"; + } + + int pid = getPid(); + return "127." + ((pid & 0xff00) >> 8) + "." + (pid & 0xff) + ".1"; + } + + /** + * Starts a Kudu cluster composed of the provided masters and tablet servers. + * @param numMasters how many masters to start + * @param numTservers how many tablet servers to start + * @throws Exception + */ + private void startCluster(int numMasters, int numTservers) throws Exception { + Preconditions.checkArgument(numMasters > 0, "Need at least one master"); + Preconditions.checkArgument(numTservers > 0, "Need at least one tablet server"); + // The following props are set via kudu-client's pom. + String baseDirPath = TestUtils.getBaseDir(); + String localhost = getUniqueLocalhost(); + + long now = System.currentTimeMillis(); + LOG.info("Starting {} masters...", numMasters); + int port = startMasters(PORT_START, numMasters, baseDirPath); + LOG.info("Starting {} tablet servers...", numTservers); + for (int i = 0; i < numTservers; i++) { + port = TestUtils.findFreePort(port); + String dataDirPath = baseDirPath + "/ts-" + i + "-" + now; + String flagsPath = TestUtils.getFlagsPath(); + String[] tsCmdLine = { + TestUtils.findBinary("kudu-tserver"), + "--flagfile=" + flagsPath, + "--fs_wal_dir=" + dataDirPath, + "--fs_data_dirs=" + dataDirPath, + "--tserver_master_addrs=" + masterAddresses, + "--webserver_interface=" + localhost, + "--local_ip_for_outbound_sockets=" + localhost, + "--rpc_bind_addresses=" + localhost + ":" + port}; + tserverProcesses.put(port, configureAndStartProcess(tsCmdLine)); + port++; + + if (flagsPath.startsWith(baseDirPath)) { + // We made a temporary copy of the flags; delete them later. 
+ pathsToDelete.add(flagsPath); + } + pathsToDelete.add(dataDirPath); + } + } + + /** + * Start the specified number of master servers with ports starting from a specified + * number. Finds free web and RPC ports up front for all of the masters first, then + * starts them on those ports, populating 'masters' map. + * @param masterStartPort the starting point of the port range for the masters + * @param numMasters number of masters to start + * @param baseDirPath the base directory where the mini cluster stores its data + * @return the next free port + * @throws Exception if we are unable to start the masters + */ + private int startMasters(int masterStartPort, int numMasters, + String baseDirPath) throws Exception { + LOG.info("Starting {} masters...", numMasters); + // Get the list of web and RPC ports to use for the master consensus configuration: + // request NUM_MASTERS * 2 free ports as we want to also reserve the web + // ports for the consensus configuration. + String localhost = getUniqueLocalhost(); + List ports = TestUtils.findFreePorts(masterStartPort, numMasters * 2); + int lastFreePort = ports.get(ports.size() - 1); + List masterRpcPorts = Lists.newArrayListWithCapacity(numMasters); + List masterWebPorts = Lists.newArrayListWithCapacity(numMasters); + for (int i = 0; i < numMasters * 2; i++) { + if (i % 2 == 0) { + masterRpcPorts.add(ports.get(i)); + masterHostPorts.add(HostAndPort.fromParts(localhost, ports.get(i))); + } else { + masterWebPorts.add(ports.get(i)); + } + } + masterAddresses = NetUtil.hostsAndPortsToString(masterHostPorts); + for (int i = 0; i < numMasters; i++) { + long now = System.currentTimeMillis(); + String dataDirPath = baseDirPath + "/master-" + i + "-" + now; + String flagsPath = TestUtils.getFlagsPath(); + // The web port must be reserved in the call to findFreePorts above and specified + // to avoid the scenario where: + // 1) findFreePorts finds RPC ports a, b, c for the 3 masters. 
+ // 2) start master 1 with RPC port and let it bind to any (specified as 0) web port. + // 3) master 1 happens to bind to port b for the web port, as master 2 hasn't been + // started yet and findFreePort(s) is "check-time-of-use" (it does not reserve the + // ports, only checks that when it was last called, these ports could be used). + List masterCmdLine = Lists.newArrayList( + TestUtils.findBinary("kudu-master"), + "--flagfile=" + flagsPath, + "--fs_wal_dir=" + dataDirPath, + "--fs_data_dirs=" + dataDirPath, + "--webserver_interface=" + localhost, + "--local_ip_for_outbound_sockets=" + localhost, + "--rpc_bind_addresses=" + localhost + ":" + masterRpcPorts.get(i), + "--webserver_port=" + masterWebPorts.get(i)); + if (numMasters > 1) { + masterCmdLine.add("--master_addresses=" + masterAddresses); + } + masterProcesses.put(masterRpcPorts.get(i), + configureAndStartProcess(masterCmdLine.toArray(new String[masterCmdLine.size()]))); + + if (flagsPath.startsWith(baseDirPath)) { + // We made a temporary copy of the flags; delete them later. + pathsToDelete.add(flagsPath); + } + pathsToDelete.add(dataDirPath); + } + return lastFreePort + 1; + } + + /** + * Starts a process using the provided command and configures it to be daemon, + * redirects the stderr to stdout, and starts a thread that will read from the process' input + * stream and redirect that to LOG. + * @param command Process and options + * @return The started process + * @throws Exception Exception if an error prevents us from starting the process, + * or if we were able to start the process but noticed that it was then killed (in which case + * we'll log the exit value). 
+ */ + private Process configureAndStartProcess(String[] command) throws Exception { + LOG.info("Starting process: {}", Joiner.on(" ").join(command)); + ProcessBuilder processBuilder = new ProcessBuilder(command); + processBuilder.redirectErrorStream(true); + Process proc = processBuilder.start(); + ProcessInputStreamLogPrinterRunnable printer = + new ProcessInputStreamLogPrinterRunnable(proc.getInputStream()); + Thread thread = new Thread(printer); + thread.setDaemon(true); + thread.setName(command[0]); + PROCESS_INPUT_PRINTERS.add(thread); + thread.start(); + + Thread.sleep(300); + try { + int ev = proc.exitValue(); + throw new Exception("We tried starting a process (" + command[0] + ") but it exited with " + + "value=" + ev); + } catch (IllegalThreadStateException ex) { + // This means the process is still alive, it's like reverse psychology. + } + return proc; + } + + /** + * Kills the TS listening on the provided port. Doesn't do anything if the TS was already killed. + * @param port port on which the tablet server is listening on + * @throws InterruptedException + */ + public void killTabletServerOnPort(int port) throws InterruptedException { + Process ts = tserverProcesses.remove(port); + if (ts == null) { + // The TS is already dead, good. + return; + } + LOG.info("Killing server at port " + port); + ts.destroy(); + ts.waitFor(); + } + + /** + * Kills the master listening on the provided port. Doesn't do anything if the master was + * already killed. + * @param port port on which the master is listening on + * @throws InterruptedException + */ + public void killMasterOnPort(int port) throws InterruptedException { + Process master = masterProcesses.remove(port); + if (master == null) { + // The master is already dead, good. + return; + } + LOG.info("Killing master at port " + port); + master.destroy(); + master.waitFor(); + } + + /** + * See {@link #shutdown()}. 
+   * @throws Exception never thrown, exceptions are logged
+   */
+  @Override
+  public void close() throws Exception {
+    shutdown();
+  }
+
+  /**
+   * Stops all the processes and deletes the folders used to store data and the flagfile.
+   */
+  public void shutdown() {
+    // Iterate explicitly so each entry can be removed from the map as it is destroyed.
+    for (Iterator<Process> masterIter = masterProcesses.values().iterator(); masterIter.hasNext(); ) {
+      masterIter.next().destroy();
+      masterIter.remove();
+    }
+    for (Iterator<Process> tsIter = tserverProcesses.values().iterator(); tsIter.hasNext(); ) {
+      tsIter.next().destroy();
+      tsIter.remove();
+    }
+    // NOTE(review): destroy() is asynchronous and the processes are not waitFor()'d here,
+    // so they may still be exiting while the paths below are deleted -- confirm intended.
+    for (Thread thread : PROCESS_INPUT_PRINTERS) {
+      thread.interrupt();
+    }
+
+    // Best-effort cleanup: a failure to delete is only logged, never propagated.
+    for (String path : pathsToDelete) {
+      try {
+        File f = new File(path);
+        if (f.isDirectory()) {
+          FileUtils.deleteDirectory(f);
+        } else {
+          f.delete();
+        }
+      } catch (Exception e) {
+        LOG.warn("Could not delete path {}", path, e);
+      }
+    }
+  }
+
+  /**
+   * Returns the comma-separated list of master addresses.
+   * @return master addresses
+   */
+  public String getMasterAddresses() {
+    return masterAddresses;
+  }
+
+  /**
+   * Returns a list of master addresses.
+   * @return master addresses
+   */
+  public List getMasterHostPorts() {
+    return masterHostPorts;
+  }
+
+  /**
+   * Helper runnable that can log what the processes are sending on their stdout and stderr that
+   * we'd otherwise miss.
+ */ + static class ProcessInputStreamLogPrinterRunnable implements Runnable { + + private final InputStream is; + + public ProcessInputStreamLogPrinterRunnable(InputStream is) { + this.is = is; + } + + @Override + public void run() { + try { + String line; + BufferedReader in = new BufferedReader(new InputStreamReader(is)); + while ((line = in.readLine()) != null) { + LOG.info(line); + } + in.close(); + } + catch (Exception e) { + if (!e.getMessage().contains("Stream closed")) { + LOG.error("Caught error while reading a process' output", e); + } + } + } + } + + public static class MiniKuduClusterBuilder { + + private int numMasters = 1; + private int numTservers = 3; + private int defaultTimeoutMs = 50000; + + public MiniKuduClusterBuilder numMasters(int numMasters) { + this.numMasters = numMasters; + return this; + } + + public MiniKuduClusterBuilder numTservers(int numTservers) { + this.numTservers = numTservers; + return this; + } + + /** + * Configures the internal client to use the given timeout for all operations. Also uses the + * timeout for tasks like waiting for tablet servers to check in with the master. + * @param defaultTimeoutMs timeout in milliseconds + * @return this instance + */ + public MiniKuduClusterBuilder defaultTimeoutMs(int defaultTimeoutMs) { + this.defaultTimeoutMs = defaultTimeoutMs; + return this; + } + + public MiniKuduCluster build() throws Exception { + return new MiniKuduCluster(numMasters, numTservers, defaultTimeoutMs); + } + } + +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduClient.java b/java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduClient.java new file mode 100644 index 000000000000..7b4c932bb18d --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduClient.java @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+package org.kududb.client;
+
+import com.google.common.base.Charsets;
+import com.google.protobuf.ByteString;
+import com.stumbleupon.async.Deferred;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.kududb.Common;
+import org.kududb.consensus.Metadata;
+import org.kududb.master.Master;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests client-level resilience of {@link AsyncKuduClient}: reconnecting to a
+ * tablet server after a disconnection (including in the middle of a scan) and
+ * failing fast -- without retrying -- on unresolvable hostnames.
+ */
+public class TestAsyncKuduClient extends BaseKuduTest {
+
+  // Unique-ish name so runs against a shared cluster don't collide.
+  private static final String TABLE_NAME =
+      TestAsyncKuduClient.class.getName() + "-" + System.currentTimeMillis();
+  private static KuduTable table;
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    BaseKuduTest.setUpBeforeClass();
+    // Set to 1 for testDisconnect to always test disconnecting the right server.
+    CreateTableOptions options = new CreateTableOptions().setNumReplicas(1);
+    table = createTable(TABLE_NAME, basicSchema, options);
+  }
+
+  @Test(timeout = 100000)
+  public void testDisconnect() throws Exception {
+    // Test that we can reconnect to a TS after a disconnection.
+    // 1. Warm up the cache.
+    assertEquals(0, countRowsInScan(client.newScannerBuilder(table).build()));
+
+    // 2. Disconnect the TabletClient.
+    client.getTableClients().get(0).shutdown().join(DEFAULT_SLEEP);
+
+    // 3. Count again, it will trigger a re-connection and we should not hang or fail to scan.
+    assertEquals(0, countRowsInScan(client.newScannerBuilder(table).build()));
+
+
+    // Test that we can reconnect to a TS while scanning.
+    // 1. Insert enough rows to have to call next() multiple times.
+    KuduSession session = syncClient.newSession();
+    session.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_BACKGROUND);
+    int rowCount = 200;
+    for (int i = 0; i < rowCount; i++) {
+      session.apply(createBasicSchemaInsert(table, i));
+    }
+    session.flush();
+
+    // 2. Start a scanner with a small max num bytes, so the rows cannot all come
+    // back in a single batch.
+    AsyncKuduScanner scanner = client.newScannerBuilder(table)
+        .batchSizeBytes(1)
+        .build();
+    Deferred<RowResultIterator> rri = scanner.nextRows();
+    // 3. Register the number of rows we get back. We have no control over how many rows are
+    // returned. When this test was written we were getting 100 rows back.
+    int numRows = rri.join(DEFAULT_SLEEP).getNumRows();
+    assertNotEquals("The TS sent all the rows back, we can't properly test disconnection",
+        rowCount, numRows);
+
+    // 4. Disconnect the TS.
+    client.getTableClients().get(0).shutdown().join(DEFAULT_SLEEP);
+    // 5. Make sure that we can continue scanning and that we get the remaining rows back.
+    assertEquals(rowCount - numRows, countRowsInScan(scanner));
+  }
+
+  @Test
+  public void testBadHostnames() throws Exception {
+    String badHostname = "some-unknown-host-hopefully";
+
+    // Test that a bad hostname for the master makes us error out quickly.
+    AsyncKuduClient invalidClient = new AsyncKuduClient.AsyncKuduClientBuilder(badHostname).build();
+    try {
+      invalidClient.listTabletServers().join(1000);
+      fail("This should have failed quickly");
+    } catch (Exception ex) {
+      assertTrue(ex instanceof NonRecoverableException);
+      assertTrue(ex.getMessage().contains(badHostname));
+    }
+
+    Master.GetTableLocationsResponsePB.Builder builder =
+        Master.GetTableLocationsResponsePB.newBuilder();
+
+    // Build three bad locations: each tablet's only replica lives on a host that
+    // cannot be resolved.
+    Master.TabletLocationsPB.Builder tabletPb = Master.TabletLocationsPB.newBuilder();
+    for (int i = 0; i < 3; i++) {
+      Common.PartitionPB.Builder partition = Common.PartitionPB.newBuilder();
+      partition.setPartitionKeyStart(ByteString.copyFrom("a" + i, Charsets.UTF_8.name()));
+      partition.setPartitionKeyEnd(ByteString.copyFrom("b" + i, Charsets.UTF_8.name()));
+      tabletPb.setPartition(partition);
+      tabletPb.setStale(false);
+      tabletPb.setTabletId(ByteString.copyFromUtf8("some id " + i));
+      Master.TSInfoPB.Builder tsInfoBuilder = Master.TSInfoPB.newBuilder();
+      Common.HostPortPB.Builder hostBuilder = Common.HostPortPB.newBuilder();
+      hostBuilder.setHost(badHostname + i);
+      hostBuilder.setPort(i);
+      tsInfoBuilder.addRpcAddresses(hostBuilder);
+      tsInfoBuilder.setPermanentUuid(ByteString.copyFromUtf8("some uuid"));
+      Master.TabletLocationsPB.ReplicaPB.Builder replicaBuilder =
+          Master.TabletLocationsPB.ReplicaPB.newBuilder();
+      replicaBuilder.setTsInfo(tsInfoBuilder);
+      replicaBuilder.setRole(Metadata.RaftPeerPB.Role.FOLLOWER);
+      tabletPb.addReplicas(replicaBuilder);
+      builder.addTabletLocations(tabletPb);
+    }
+
+    // Test that a tablet full of unreachable replicas won't make us retry.
+    try {
+      KuduTable badTable = new KuduTable(client, "Invalid table name",
+          "Invalid table ID", null, null);
+      client.discoverTablets(badTable, builder.build());
+      fail("This should have failed quickly");
+    } catch (Exception ex) {
+      assertTrue(ex instanceof NonRecoverableException);
+      assertTrue(ex.getMessage().contains(badHostname));
+    }
+  }
+}
diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduSession.java b/java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduSession.java
new file mode 100644
index 000000000000..6e47df893a5b
--- /dev/null
+++ b/java/kudu-client/src/test/java/org/kududb/client/TestAsyncKuduSession.java
@@ -0,0 +1,448 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+package org.kududb.client;
+
+import org.kududb.Schema;
+import org.kududb.WireProtocol.AppStatusPB;
+import org.kududb.tserver.Tserver.TabletServerErrorPB;
+import com.stumbleupon.async.Callback;
+import com.stumbleupon.async.Deferred;
+import com.stumbleupon.async.TimeoutException;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.util.Collections;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.List;
+
+import static org.junit.Assert.*;
+
+/**
+ * This class can either start its own cluster or rely on an existing one.
+ * By default it assumes that the master is at localhost:64000.
+ * The cluster's configuration flags are found at flagsPath as defined in the pom file.
+ * Set startCluster to true in order have the test start the cluster for you.
+ * All those properties are set via surefire's systemPropertyVariables, meaning this:
+ * $ mvn test -DstartCluster=false
+ * will use an existing cluster at default address found above.
+ *
+ * The test creates a table with a unique(ish) name which it deletes at the end.
+ */
+public class TestAsyncKuduSession extends BaseKuduTest {
+  // Generate a unique table name
+  private static final String TABLE_NAME =
+      TestAsyncKuduSession.class.getName()+"-"+System.currentTimeMillis();
+
+  private static Schema schema = getBasicSchema();
+  private static KuduTable table;
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    BaseKuduTest.setUpBeforeClass();
+    table = createTable(TABLE_NAME, schema, new CreateTableOptions());
+  }
+
+  /**
+   * Regression test for case where an error in the previous batch could cause the next
+   * batch to hang in flush()
+   */
+  @Test(timeout = 100000)
+  public void testBatchErrorCauseSessionStuck() throws Exception {
+    try {
+      AsyncKuduSession session = client.newSession();
+      session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND);
+      session.setFlushInterval(100);
+      TabletServerErrorPB error = TabletServerErrorPB.newBuilder()
+          .setCode(TabletServerErrorPB.Code.UNKNOWN_ERROR)
+          .setStatus(AppStatusPB.newBuilder()
+              .setCode(AppStatusPB.ErrorCode.UNKNOWN_ERROR)
+              .setMessage("injected error for test")
+              .build())
+          .build();
+      Batch.injectTabletServerErrorAndLatency(error, 200);
+      // 0ms: insert first row, which will be the first batch.
+      Deferred<OperationResponse> resp1 = session.apply(createInsert(1));
+      Thread.sleep(120);
+      // 100ms: start to send first batch.
+      // 100ms+: first batch got response from ts,
+      //         will wait 200ms and throw an error.
+      // 120ms: insert another row, which will be the second batch.
+      Deferred<OperationResponse> resp2 = session.apply(createInsert(2));
+      // 220ms: start to send the second batch, but first batch is inflight,
+      //        so add callback to retry after first batch finishes.
+      // 300ms: first batch's callback handles error, retry second batch.
+      try {
+        resp1.join(2000);
+      } catch (TimeoutException e) {
+        fail("First batch should not timeout in case of tablet server error");
+      } catch (TabletServerErrorException e) {
+        // Expected.
+        assertTrue(e.getMessage().contains("injected error for test"));
+      }
+      try {
+        resp2.join(2000);
+      } catch (TimeoutException e) {
+        fail("Second batch should not timeout in case of tablet server error");
+      } catch (TabletServerErrorException e) {
+        // expected
+        assertTrue(e.getMessage().contains("injected error for test"));
+      }
+    } finally {
+      // Always clear the injected fault so later tests see a healthy path.
+      Batch.injectTabletServerErrorAndLatency(null, 0);
+    }
+  }
+
+  @Test(timeout = 100000)
+  public void test() throws Exception {
+
+    AsyncKuduSession session = client.newSession();
+    // disable the low watermark until we need it
+    session.setMutationBufferLowWatermark(1f);
+
+    // First testing KUDU-232, the cache is empty and we want to force flush. We force the flush
+    // interval to be higher than the sleep time so that we don't background flush while waiting.
+    // If our subsequent manual flush throws, it means the logic to block on in-flight tablet
+    // lookups in flush isn't working properly.
+    session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND);
+    session.setFlushInterval(DEFAULT_SLEEP + 1000);
+    Deferred<OperationResponse> d = session.apply(createInsert(0));
+    session.flush().join(DEFAULT_SLEEP);
+    assertTrue(exists(0));
+    // set back to default
+    session.setFlushInterval(1000);
+
+    session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_SYNC);
+    for (int i = 1; i < 10; i++) {
+      session.apply(createInsert(i)).join(DEFAULT_SLEEP);
+    }
+
+    assertEquals(10, countInRange(0, 10));
+
+    session.setFlushMode(AsyncKuduSession.FlushMode.MANUAL_FLUSH);
+    session.setMutationBufferSpace(10);
+
+    session.apply(createInsert(10));
+
+    // Changing the flush mode while a buffer holds operations must be rejected.
+    try {
+      session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_SYNC);
+    } catch (IllegalArgumentException ex) {
+      /* expected, flush mode remains manual */
+    }
+
+    assertFalse(exists(10));
+
+    for (int i = 11; i < 20; i++) {
+      session.apply(createInsert(i));
+    }
+
+    assertEquals(0, countInRange(10, 20));
+    try {
+      session.apply(createInsert(20));
+    } catch (NonRecoverableException ex) {
+      /* expected, buffer would be too big */
+    }
+    assertEquals(0, countInRange(10, 20)); // the buffer should still be full
+
+    session.flush().join(DEFAULT_SLEEP);
+    assertEquals(10, countInRange(10, 20)); // now everything should be there
+
+    session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND);
+
+    d = session.apply(createInsert(20));
+    Thread.sleep(50); // waiting a minimal amount of time to make sure the interval is in effect
+    assertFalse(exists(20));
+    // Add 10 items, the last one will stay in the buffer
+    for (int i = 21; i < 30; i++) {
+      d = session.apply(createInsert(i));
+    }
+    Deferred<OperationResponse> buffered = session.apply(createInsert(30));
+    long now = System.currentTimeMillis();
+    d.join(DEFAULT_SLEEP); // Ok to use the last d, everything is going to the buffer
+    // auto flush will force flush if the buffer is full as it should be now
+    // so we check that we didn't wait the full interval
+    long elapsed = System.currentTimeMillis() - now;
+    assertTrue(elapsed < 950);
+    assertEquals(10, countInRange(20, 31));
+    buffered.join();
+    assertEquals(11, countInRange(20, 31));
+
+    session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_SYNC);
+    Update update = createUpdate(30);
+    PartialRow row = update.getRow();
+    row.addInt(2, 999);
+    row.addString(3, "updated data");
+    d = session.apply(update);
+    d.addErrback(defaultErrorCB);
+    d.join(DEFAULT_SLEEP);
+    assertEquals(31, countInRange(0, 31));
+
+    Delete del = createDelete(30);
+    d = session.apply(del);
+    d.addErrback(defaultErrorCB);
+    d.join(DEFAULT_SLEEP);
+    assertEquals(30, countInRange(0, 31));
+
+    session.setFlushMode(AsyncKuduSession.FlushMode.MANUAL_FLUSH);
+    session.setMutationBufferSpace(35);
+    for (int i = 0; i < 20; i++) {
+      buffered = session.apply(createDelete(i));
+    }
+    assertEquals(30, countInRange(0, 31));
+    session.flush();
+    buffered.join(DEFAULT_SLEEP);
+    assertEquals(10, countInRange(0, 31));
+
+    for (int i = 30; i < 40; i++) {
+      session.apply(createInsert(i));
+    }
+
+    for (int i = 20; i < 30; i++) {
+      buffered = session.apply(createDelete(i));
+    }
+
+    assertEquals(10, countInRange(0, 40));
+    session.flush();
+    buffered.join(DEFAULT_SLEEP);
+    assertEquals(10, countInRange(0, 40));
+
+    // Test nulls
+    // add 10 rows with the nullable column set to null
+    session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_SYNC);
+    for (int i = 40; i < 50; i++) {
+      session.apply(createInsertWithNull(i)).join(DEFAULT_SLEEP);
+    }
+
+    // now scan those rows and make sure the column is null
+    assertEquals(10, countNullColumns(40, 50));
+
+    // Test sending edits too fast
+    session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND);
+    session.setMutationBufferSpace(10);
+
+    // The buffer has a capacity of 10, we insert 21 rows, meaning we fill the first one,
+    // force flush, fill a second one before the first one could come back,
+    // and the 21st row will be sent back.
+    boolean gotException = false;
+    for (int i = 50; i < 71; i++) {
+      try {
+        session.apply(createInsert(i));
+      } catch (PleaseThrottleException ex) {
+        gotException = true;
+        assertEquals(70, i);
+        // Wait for the buffer to clear
+        ex.getDeferred().join(DEFAULT_SLEEP);
+        session.apply(ex.getFailedRpc());
+        session.flush().join(DEFAULT_SLEEP);
+      }
+    }
+    assertTrue("Expected PleaseThrottleException", gotException);
+    assertEquals(21, countInRange(50, 71));
+
+    // Now test a more subtle issue, basically the race where we call flush from the client when
+    // there's a batch already in flight. We need to finish joining only when all the data is
+    // flushed.
+    for (int i = 71; i < 91; i++) {
+      session.apply(createInsert(i));
+    }
+    session.flush().join(DEFAULT_SLEEP);
+    // If we only waited after the in flight batch, there would be 10 rows here.
+    assertEquals(20, countInRange(71, 91));
+
+    // Test empty scanner projection
+    AsyncKuduScanner scanner = getScanner(71, 91, Collections.<String>emptyList());
+    assertEquals(20, countRowsInScan(scanner));
+
+    // Test removing the connection and then do a rapid set of inserts
+    client.getTableClients().get(0).shutdown().join(DEFAULT_SLEEP);
+    session.setMutationBufferSpace(1);
+    for (int i = 91; i < 101; i++) {
+      try {
+        session.apply(createInsert(i));
+      } catch (PleaseThrottleException ex) {
+        // Wait for the buffer to clear
+        ex.getDeferred().join(DEFAULT_SLEEP);
+        session.apply(ex.getFailedRpc());
+      }
+    }
+    session.flush().join(DEFAULT_SLEEP);
+    assertEquals(10, countInRange(91, 101));
+
+    // Test a tablet going missing or encountering a new tablet while inserting a lot
+    // of data. This code used to fail in many different ways.
+    client.emptyTabletsCacheForTable(table.getTableId());
+    for (int i = 101; i < 151; i++) {
+      Insert insert = createInsert(i);
+      while (true) {
+        try {
+          session.apply(insert);
+          break;
+        } catch (PleaseThrottleException ex) {
+          // Wait for the buffer to clear
+          ex.getDeferred().join(DEFAULT_SLEEP);
+        }
+      }
+    }
+    session.flush().join(DEFAULT_SLEEP);
+    assertEquals(50, countInRange(101, 151));
+
+    // Test the low watermark.
+    // Before the fix for KUDU-804, a change to the buffer space did not result in a change to the
+    // low watermark causing this test to fail.
+    session.setMutationBufferLowWatermark(0.1f);
+    session.setMutationBufferSpace(10);
+    session.setRandomSeed(12345); // Will make us hit the exception after 6 tries
+    gotException = false;
+    for (int i = 151; i < 171; i++) {
+      try {
+        session.apply(createInsert(i));
+      } catch (PleaseThrottleException ex) {
+        // We're going to hit the exception after filling up the buffer a first time then trying
+        // to insert 6 more rows.
+        assertEquals(167, i);
+        gotException = true;
+        assertTrue(ex.getMessage().contains("watermark"));
+        // Once we hit the exception we wait on the batch to finish flushing and then insert the
+        // rest of the data.
+        ex.getDeferred().join(DEFAULT_SLEEP);
+        session.apply(ex.getFailedRpc());
+      }
+    }
+    session.flush().join(DEFAULT_SLEEP);
+    assertEquals(20, countInRange(151, 171));
+    assertTrue(gotException);
+  }
+
+  private Insert createInsert(int key) {
+    return createBasicSchemaInsert(table, key);
+  }
+
+  // Same as createInsert() but leaves the nullable column (index 3) set to null.
+  private Insert createInsertWithNull(int key) {
+    Insert insert = table.newInsert();
+    PartialRow row = insert.getRow();
+    row.addInt(0, key);
+    row.addInt(1, 2);
+    row.addInt(2, 3);
+    row.setNull(3);
+    row.addBoolean(4, false);
+    return insert;
+  }
+
+  private Update createUpdate(int key) {
+    Update update = table.newUpdate();
+    PartialRow row = update.getRow();
+    row.addInt(0, key);
+    return update;
+  }
+
+  private Delete createDelete(int key) {
+    Delete delete = table.newDelete();
+    PartialRow row = delete.getRow();
+    row.addInt(0, key);
+    return delete;
+  }
+
+  /**
+   * Scans [key, key + 1) and returns true if a row with that key was seen.
+   */
+  public static boolean exists(final int key) throws Exception {
+
+    AsyncKuduScanner scanner = getScanner(key, key + 1);
+    final AtomicBoolean exists = new AtomicBoolean(false);
+
+    Callback<Object, RowResultIterator> cb =
+        new Callback<Object, RowResultIterator>() {
+          @Override
+          public Object call(RowResultIterator arg) throws Exception {
+            if (arg == null) return null;
+            for (RowResult row : arg) {
+              if (row.getInt(0) == key) {
+                exists.set(true);
+                break;
+              }
+            }
+            return null;
+          }
+        };
+
+    while (scanner.hasMoreRows()) {
+      Deferred<RowResultIterator> data = scanner.nextRows();
+      data.addCallbacks(cb, defaultErrorCB);
+      data.join(DEFAULT_SLEEP);
+      if (exists.get()) {
+        break;
+      }
+    }
+
+    Deferred<RowResultIterator> closer = scanner.close();
+    closer.join(DEFAULT_SLEEP);
+    return exists.get();
+  }
+
+  /**
+   * Scans [startKey, endKey) and counts the rows whose nullable column (index 3) is null.
+   */
+  public static int countNullColumns(final int startKey, final int endKey) throws Exception {
+
+    AsyncKuduScanner scanner = getScanner(startKey, endKey);
+    final AtomicInteger ai = new AtomicInteger();
+
+    Callback<Object, RowResultIterator> cb = new Callback<Object, RowResultIterator>() {
+      @Override
+      public Object call(RowResultIterator arg) throws Exception {
+        if (arg == null) return null;
+        for (RowResult row : arg) {
+          if (row.isNull(3)) {
+            ai.incrementAndGet();
+          }
+        }
+        return null;
+      }
+    };
+
+    while (scanner.hasMoreRows()) {
+      Deferred<RowResultIterator> data = scanner.nextRows();
+      data.addCallbacks(cb, defaultErrorCB);
+      data.join(DEFAULT_SLEEP);
+    }
+
+    Deferred<RowResultIterator> closer = scanner.close();
+    closer.join(DEFAULT_SLEEP);
+    return ai.get();
+  }
+
+  public static int countInRange(final int start, final int exclusiveEnd) throws Exception {
+
+    AsyncKuduScanner scanner = getScanner(start, exclusiveEnd);
+    return countRowsInScan(scanner);
+  }
+
+  private static AsyncKuduScanner getScanner(int start, int exclusiveEnd) {
+    return getScanner(start, exclusiveEnd, null);
+  }
+
+  // Builds a scanner over [start, exclusiveEnd) on the key column; a null
+  // columnNames list means "project all columns".
+  private static AsyncKuduScanner getScanner(int start, int exclusiveEnd,
+                                             List<String> columnNames) {
+
+    PartialRow lowerBound = schema.newPartialRow();
+    lowerBound.addInt(schema.getColumnByIndex(0).getName(), start);
+
+    PartialRow upperBound = schema.newPartialRow();
+    upperBound.addInt(schema.getColumnByIndex(0).getName(), exclusiveEnd);
+
+    AsyncKuduScanner scanner = client.newScannerBuilder(table)
+        .lowerBound(lowerBound)
+        .exclusiveUpperBound(upperBound)
+        .setProjectedColumnNames(columnNames)
+        .build();
+    return scanner;
+  }
+}
diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestBitSet.java b/java/kudu-client/src/test/java/org/kududb/client/TestBitSet.java
new file mode 100644
index 000000000000..ab27e63afbc0
--- /dev/null
+++ b/java/kudu-client/src/test/java/org/kududb/client/TestBitSet.java
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; + +import java.util.BitSet; + +public class TestBitSet { + + /** + * Test out BitSet-related operations + */ + @Test + public void test() { + int colCount = 1; + BitSet bs = new BitSet(colCount); + bs.set(0); + int size = Bytes.getBitSetSize(colCount); + byte[] result = Bytes.fromBitSet(bs, colCount); + assertEquals(size, result.length); + BitSet newBs = Bytes.toBitSet(result, 0, colCount); + assertTrue(newBs.get(0)); + + colCount = 7; + bs = new BitSet(colCount); + bs.set(0); + bs.set(5); + size = Bytes.getBitSetSize(colCount); + result = Bytes.fromBitSet(bs, colCount); + assertEquals(size, result.length); + newBs = Bytes.toBitSet(result, 0, colCount); + assertTrue(newBs.get(0)); + assertFalse(newBs.get(1)); + assertFalse(newBs.get(2)); + assertFalse(newBs.get(3)); + assertFalse(newBs.get(4)); + assertTrue(newBs.get(5)); + assertFalse(newBs.get(6)); + + colCount = 8; + bs = new BitSet(colCount); + bs.set(0); + bs.set(5); + bs.set(7); + size = Bytes.getBitSetSize(colCount); + result = Bytes.fromBitSet(bs, colCount); + assertEquals(size, result.length); + newBs = Bytes.toBitSet(result, 0, colCount); + assertTrue(newBs.get(0)); + assertFalse(newBs.get(1)); + assertFalse(newBs.get(2)); + 
assertFalse(newBs.get(3)); + assertFalse(newBs.get(4)); + assertTrue(newBs.get(5)); + assertFalse(newBs.get(6)); + assertTrue(newBs.get(7)); + + colCount = 11; + bs = new BitSet(colCount); + bs.set(0); + bs.set(5); + bs.set(7); + bs.set(9); + size = Bytes.getBitSetSize(colCount); + result = Bytes.fromBitSet(bs, colCount); + assertEquals(size, result.length); + newBs = Bytes.toBitSet(result, 0, colCount); + assertTrue(newBs.get(0)); + assertFalse(newBs.get(1)); + assertFalse(newBs.get(2)); + assertFalse(newBs.get(3)); + assertFalse(newBs.get(4)); + assertTrue(newBs.get(5)); + assertFalse(newBs.get(6)); + assertTrue(newBs.get(7)); + assertFalse(newBs.get(8)); + assertTrue(newBs.get(9)); + assertFalse(newBs.get(10)); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestBytes.java b/java/kudu-client/src/test/java/org/kududb/client/TestBytes.java new file mode 100644 index 000000000000..57a93da712d7 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestBytes.java @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import java.math.BigInteger; + +public class TestBytes { + + @Test + public void test() { + byte[] bytes = new byte[8]; + + // Boolean + Bytes.setUnsignedByte(bytes, (short) 1); + assert(Bytes.getBoolean(bytes)); + Bytes.setUnsignedByte(bytes, (short) 0); + assert(!Bytes.getBoolean(bytes)); + + // BYTES + short smallUbyte = 120; + Bytes.setUnsignedByte(bytes, smallUbyte); + assertEquals(smallUbyte, Bytes.getUnsignedByte(bytes)); + short largeUbyte = 250; + Bytes.setUnsignedByte(bytes, largeUbyte); + assertEquals(largeUbyte, Bytes.getUnsignedByte(bytes)); + + // SHORTS + short nshort = -300; + Bytes.setShort(bytes, nshort); + assertEquals(nshort, Bytes.getShort(bytes)); + short pshort = 300; + Bytes.setShort(bytes, pshort); + assertEquals(pshort, Bytes.getShort(bytes)); + int smallUshort = 300; + Bytes.setUnsignedShort(bytes, smallUshort); + assertEquals(smallUshort, Bytes.getUnsignedShort(bytes)); + int largeUshort = 60000; + Bytes.setUnsignedShort(bytes, largeUshort); + assertEquals(largeUshort, Bytes.getUnsignedShort(bytes)); + + // INTS + int nint = -60000; + Bytes.setInt(bytes, nint); + assertEquals(nint, Bytes.getInt(bytes)); + int pint = 60000; + Bytes.setInt(bytes, pint); + assertEquals(pint, Bytes.getInt(bytes)); + long smallUint = 60000; + Bytes.setUnsignedInt(bytes, smallUint); + assertEquals(smallUint, Bytes.getUnsignedInt(bytes)); + long largeUint = 4000000000L; + Bytes.setUnsignedInt(bytes, largeUint); + assertEquals(largeUint, Bytes.getUnsignedInt(bytes)); + + // LONGS + long nlong = -4000000000L; + Bytes.setLong(bytes, nlong); + assertEquals(nlong, Bytes.getLong(bytes)); + long plong = 4000000000L; + Bytes.setLong(bytes, plong); + assertEquals(plong, Bytes.getLong(bytes)); + BigInteger smallUlong = new BigInteger("4000000000"); + Bytes.setUnsignedLong(bytes, smallUlong); + assertEquals(smallUlong, Bytes.getUnsignedLong(bytes)); + BigInteger 
largeUlong = new BigInteger("10000000000000000000"); + Bytes.setUnsignedLong(bytes, largeUlong); + assertEquals(largeUlong, Bytes.getUnsignedLong(bytes)); + + // FLOAT + float aFloat = 123.456f; + Bytes.setFloat(bytes, aFloat); + assertEquals(aFloat, Bytes.getFloat(bytes), 0.001); + + // DOUBLE + double aDouble = 123.456; + Bytes.setDouble(bytes, aDouble); + assertEquals(aDouble, Bytes.getDouble(bytes), 0.001); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestColumnRangePredicate.java b/java/kudu-client/src/test/java/org/kududb/client/TestColumnRangePredicate.java new file mode 100644 index 000000000000..b5af961ef79f --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestColumnRangePredicate.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.common.collect.Lists; +import org.junit.Test; +import org.kududb.ColumnSchema; +import org.kududb.Type; +import org.kududb.tserver.Tserver; + +import java.io.IOException; +import java.util.List; + +import static org.junit.Assert.*; + +public class TestColumnRangePredicate { + + @Test + public void testRawLists() { + ColumnSchema col1 = new ColumnSchema.ColumnSchemaBuilder("col1", Type.INT32).build(); + ColumnSchema col2 = new ColumnSchema.ColumnSchemaBuilder("col2", Type.STRING).build(); + + ColumnRangePredicate pred1 = new ColumnRangePredicate(col1); + pred1.setLowerBound(1); + + ColumnRangePredicate pred2 = new ColumnRangePredicate(col1); + pred2.setUpperBound(2); + + ColumnRangePredicate pred3 = new ColumnRangePredicate(col2); + pred3.setLowerBound("aaa"); + pred3.setUpperBound("bbb"); + + List preds = Lists.newArrayList(pred1, pred2, pred3); + + byte[] rawPreds = ColumnRangePredicate.toByteArray(preds); + + List decodedPreds = null; + try { + decodedPreds = ColumnRangePredicate.fromByteArray(rawPreds); + } catch (IllegalArgumentException e) { + fail("Couldn't decode: " + e.getMessage()); + } + + assertEquals(3, decodedPreds.size()); + + assertEquals(col1.getName(), decodedPreds.get(0).getColumn().getName()); + assertEquals(1, Bytes.getInt(Bytes.get(decodedPreds.get(0).getLowerBound()))); + assertFalse(decodedPreds.get(0).hasUpperBound()); + + assertEquals(col1.getName(), decodedPreds.get(1).getColumn().getName()); + assertEquals(2, Bytes.getInt(Bytes.get(decodedPreds.get(1).getUpperBound()))); + assertFalse(decodedPreds.get(1).hasLowerBound()); + + assertEquals(col2.getName(), decodedPreds.get(2).getColumn().getName()); + assertEquals("aaa", Bytes.getString(Bytes.get(decodedPreds.get(2).getLowerBound()))); + assertEquals("bbb", Bytes.getString(Bytes.get(decodedPreds.get(2).getUpperBound()))); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestDeadlineTracker.java 
b/java/kudu-client/src/test/java/org/kududb/client/TestDeadlineTracker.java new file mode 100644 index 000000000000..c6d3a19981d5 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestDeadlineTracker.java @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.*; + +import com.google.common.base.Stopwatch; +import com.google.common.base.Ticker; +import org.junit.Test; + +import java.util.concurrent.atomic.AtomicLong; + +public class TestDeadlineTracker { + + @Test + public void testTimeout() { + final AtomicLong timeToReturn = new AtomicLong(); + Ticker ticker = new Ticker() { + @Override + public long read() { + return timeToReturn.get(); + } + }; + Stopwatch stopwatch = new Stopwatch(ticker); + + // no timeout set + DeadlineTracker tracker = new DeadlineTracker(stopwatch); + tracker.setDeadline(0); + assertFalse(tracker.hasDeadline()); + assertFalse(tracker.timedOut()); + + // 500ms timeout set + tracker.reset(); + tracker.setDeadline(500); + assertTrue(tracker.hasDeadline()); + assertFalse(tracker.timedOut()); + assertFalse(tracker.wouldSleepingTimeout(499)); + assertTrue(tracker.wouldSleepingTimeout(500)); + assertTrue(tracker.wouldSleepingTimeout(501)); + assertEquals(500, tracker.getMillisBeforeDeadline()); + + // fast forward 200ms + timeToReturn.set(200 * 1000000); + assertTrue(tracker.hasDeadline()); + assertFalse(tracker.timedOut()); + assertFalse(tracker.wouldSleepingTimeout(299)); + assertTrue(tracker.wouldSleepingTimeout(300)); + assertTrue(tracker.wouldSleepingTimeout(301)); + assertEquals(300, tracker.getMillisBeforeDeadline()); + + // fast forward another 400ms, so the RPC timed out + timeToReturn.set(600 * 1000000); + assertTrue(tracker.hasDeadline()); + assertTrue(tracker.timedOut()); + assertTrue(tracker.wouldSleepingTimeout(299)); + assertTrue(tracker.wouldSleepingTimeout(300)); + assertTrue(tracker.wouldSleepingTimeout(301)); + assertEquals(1, tracker.getMillisBeforeDeadline()); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestErrorCollector.java b/java/kudu-client/src/test/java/org/kududb/client/TestErrorCollector.java new file mode 100644 index 000000000000..01be75c8ec44 --- /dev/null +++ 
b/java/kudu-client/src/test/java/org/kududb/client/TestErrorCollector.java @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.Assert; +import org.junit.Test; + +public class TestErrorCollector { + + @Test + public void testErrorCollector() { + int maxErrors = 10; + ErrorCollector collector = new ErrorCollector(maxErrors); + + // Test with no errors. + int countToTest = 0; + Assert.assertEquals(countToTest, collector.countErrors()); + RowErrorsAndOverflowStatus reos = collector.getErrors(); + Assert.assertEquals(0, collector.countErrors()); + Assert.assertFalse(reos.isOverflowed()); + Assert.assertEquals(countToTest, reos.getRowErrors().length); + + // Test a single row error. + countToTest = 1; + collector.addError(createRowError(countToTest)); + Assert.assertEquals(countToTest, collector.countErrors()); + reos = collector.getErrors(); + Assert.assertEquals(0, collector.countErrors()); + Assert.assertFalse(reos.isOverflowed()); + Assert.assertEquals(countToTest, reos.getRowErrors().length); + Assert.assertEquals(countToTest + "", reos.getRowErrors()[0].getStatus()); + + // Test filling the collector to the max. 
+ countToTest = maxErrors; + fillCollectorWith(collector, countToTest); + Assert.assertEquals(countToTest, collector.countErrors()); + reos = collector.getErrors(); + Assert.assertEquals(0, collector.countErrors()); + Assert.assertFalse(reos.isOverflowed()); + Assert.assertEquals(countToTest, reos.getRowErrors().length); + Assert.assertEquals((countToTest - 1) + "", reos.getRowErrors()[9].getStatus()); + + // Test overflowing. + countToTest = 95; + fillCollectorWith(collector, countToTest); + Assert.assertEquals(maxErrors, collector.countErrors()); + reos = collector.getErrors(); + Assert.assertEquals(0, collector.countErrors()); + Assert.assertTrue(reos.isOverflowed()); + Assert.assertEquals(maxErrors, reos.getRowErrors().length); + Assert.assertEquals((countToTest - 1) + "", reos.getRowErrors()[9].getStatus()); + + // Test overflowing on a newly created collector. + countToTest = 95; + collector = new ErrorCollector(maxErrors); + fillCollectorWith(collector, countToTest); + Assert.assertEquals(maxErrors, collector.countErrors()); + reos = collector.getErrors(); + Assert.assertEquals(0, collector.countErrors()); + Assert.assertTrue(reos.isOverflowed()); + Assert.assertEquals(maxErrors, reos.getRowErrors().length); + Assert.assertEquals((countToTest - 1) + "", reos.getRowErrors()[9].getStatus()); + } + + private void fillCollectorWith(ErrorCollector collector, int errorsToAdd) { + for (int i = 0; i < errorsToAdd; i++) { + collector.addError(createRowError(i)); + } + } + + private RowError createRowError(int id) { + // Use the error status as a way to message pass and so that we can test we're getting the right + // messages on the other end. 
+ return new RowError(id + "", "test", null, "test"); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestFlexiblePartitioning.java b/java/kudu-client/src/test/java/org/kududb/client/TestFlexiblePartitioning.java new file mode 100644 index 000000000000..2fc2656579d9 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestFlexiblePartitioning.java @@ -0,0 +1,350 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import com.google.common.base.Predicate; +import com.google.common.base.Predicates; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.junit.Before; +import org.junit.Test; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +public class TestFlexiblePartitioning extends BaseKuduTest { + private String tableName; + + @Before + public void setTableName() { + tableName = TestKuduClient.class.getName() + "-" + System.currentTimeMillis(); + } + + private static Schema createSchema() { + ArrayList columns = new ArrayList<>(3); + columns.add(new ColumnSchema.ColumnSchemaBuilder("a", Type.STRING).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("b", Type.STRING).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c", Type.STRING).key(true).build()); + return new Schema(columns); + } + + private static Set rows() throws Exception { + Set rows = new HashSet<>(); + for (int a = 0; a < 6; a++) { + for (int b = 0; b < 6; b++) { + for (int c = 0; c < 6; c++) { + rows.add(new Row(String.format("%s", a), + String.format("%s", b), + String.format("%s", c))); + } + } + } + return rows; + } + + private void insertRows(KuduTable table, Set rows) throws Exception { + KuduSession session = syncClient.newSession(); + try { + for (Row row : rows) { + Insert insert = table.newInsert(); + PartialRow insertRow = insert.getRow(); + row.fillPartialRow(insertRow); + session.apply(insert); + } + } finally { + session.close(); + } + } + + private Set collectRows(KuduScanner scanner) throws Exception { + Set rows = new HashSet<>(); + while (scanner.hasMoreRows()) { + for 
(RowResult result : scanner.nextRows()) { + rows.add(Row.fromResult(result)); + } + } + return rows; + } + + private void testPartitionSchema(CreateTableOptions tableBuilder) throws Exception { + Schema schema = createSchema(); + + syncClient.createTable(tableName, schema, tableBuilder); + + KuduTable table = syncClient.openTable(tableName); + + Set rows = rows(); + insertRows(table, rows); + + // Full table scan + assertEquals(rows, collectRows(syncClient.newScannerBuilder(table).build())); + + { // Lower bound + Row minRow = new Row("1", "3", "5"); + PartialRow lowerBound = schema.newPartialRow(); + minRow.fillPartialRow(lowerBound); + + Set expected = Sets.filter(rows, minRow.gtePred()); + + KuduScanner scanner = syncClient.newScannerBuilder(table).lowerBound(lowerBound).build(); + Set results = collectRows(scanner); + + assertEquals(expected, results); + } + + { // Upper bound + Row maxRow = new Row("1", "3", "5"); + PartialRow upperBound = schema.newPartialRow(); + maxRow.fillPartialRow(upperBound); + + Set expected = Sets.filter(rows, maxRow.ltPred()); + + KuduScanner scanner = syncClient.newScannerBuilder(table) + .exclusiveUpperBound(upperBound) + .build(); + Set results = collectRows(scanner); + + assertEquals(expected, results); + } + + { // Lower & Upper bounds + Row minRow = new Row("1", "3", "5"); + Row maxRow = new Row("2", "4", ""); + PartialRow lowerBound = schema.newPartialRow(); + minRow.fillPartialRow(lowerBound); + PartialRow upperBound = schema.newPartialRow(); + maxRow.fillPartialRow(upperBound); + + Set expected = Sets.filter(rows, Predicates.and(minRow.gtePred(), maxRow.ltPred())); + + KuduScanner scanner = syncClient.newScannerBuilder(table) + .lowerBound(lowerBound) + .exclusiveUpperBound(upperBound) + .build(); + Set results = collectRows(scanner); + + assertEquals(expected, results); + } + + List tablets = table.getTabletsLocations(TestTimeouts.DEFAULT_SLEEP); + + { // Per-tablet scan + Set results = new HashSet<>(); + + for 
(LocatedTablet tablet : tablets) { + KuduScanner scanner = syncClient.newScannerBuilder(table) + .lowerBoundPartitionKeyRaw(tablet.getPartition().getPartitionKeyStart()) + .exclusiveUpperBoundPartitionKeyRaw(tablet.getPartition().getPartitionKeyEnd()) + .build(); + Set tabletResults = collectRows(scanner); + Set intersection = Sets.intersection(results, tabletResults); + assertEquals(new HashSet<>(), intersection); + results.addAll(tabletResults); + } + + assertEquals(rows, results); + } + + { // Per-tablet scan with lower & upper bounds + Row minRow = new Row("1", "3", "5"); + Row maxRow = new Row("2", "4", ""); + PartialRow lowerBound = schema.newPartialRow(); + minRow.fillPartialRow(lowerBound); + PartialRow upperBound = schema.newPartialRow(); + maxRow.fillPartialRow(upperBound); + + Set expected = Sets.filter(rows, Predicates.and(minRow.gtePred(), maxRow.ltPred())); + Set results = new HashSet<>(); + + for (LocatedTablet tablet : tablets) { + KuduScanner scanner = syncClient.newScannerBuilder(table) + .lowerBound(lowerBound) + .exclusiveUpperBound(upperBound) + .lowerBoundPartitionKeyRaw(tablet.getPartition().getPartitionKeyStart()) + .exclusiveUpperBoundPartitionKeyRaw(tablet.getPartition().getPartitionKeyEnd()) + .build(); + Set tabletResults = collectRows(scanner); + Set intersection = Sets.intersection(results, tabletResults); + assertEquals(new HashSet<>(), intersection); + results.addAll(tabletResults); + } + + assertEquals(expected, results); + } + } + + @Test + public void testHashBucketedTable() throws Exception { + CreateTableOptions tableBuilder = new CreateTableOptions(); + tableBuilder.addHashPartitions(ImmutableList.of("a"), 3); + tableBuilder.addHashPartitions(ImmutableList.of("b", "c"), 3, 42); + tableBuilder.setRangePartitionColumns(ImmutableList.of()); + testPartitionSchema(tableBuilder); + } + + @Test + public void testNonDefaultRangePartitionedTable() throws Exception { + Schema schema = createSchema(); + CreateTableOptions tableBuilder = 
new CreateTableOptions(); + tableBuilder.setRangePartitionColumns(ImmutableList.of("c", "b")); + + PartialRow split = schema.newPartialRow(); + split.addString("c", "3"); + tableBuilder.addSplitRow(split); + + split = schema.newPartialRow(); + split.addString("c", "3"); + split.addString("b", "3"); + tableBuilder.addSplitRow(split); + + testPartitionSchema(tableBuilder); + } + + @Test + public void testHashBucketedAndRangePartitionedTable() throws Exception { + Schema schema = createSchema(); + CreateTableOptions tableBuilder = new CreateTableOptions(); + tableBuilder.addHashPartitions(ImmutableList.of("a"), 3); + tableBuilder.addHashPartitions(ImmutableList.of("b", "c"), 3, 42); + tableBuilder.setRangePartitionColumns(ImmutableList.of("c", "b")); + + PartialRow split = schema.newPartialRow(); + split.addString("c", "3"); + tableBuilder.addSplitRow(split); + + split = schema.newPartialRow(); + split.addString("c", "3"); + split.addString("b", "3"); + tableBuilder.addSplitRow(split); + + testPartitionSchema(tableBuilder); + } + + @Test + public void testSimplePartitionedTable() throws Exception { + Schema schema = createSchema(); + CreateTableOptions tableBuilder = new CreateTableOptions(); + + PartialRow split = schema.newPartialRow(); + split.addString("c", "3"); + tableBuilder.addSplitRow(split); + + split = schema.newPartialRow(); + split.addString("c", "3"); + split.addString("b", "3"); + tableBuilder.addSplitRow(split); + + testPartitionSchema(tableBuilder); + } + + public static class Row implements Comparable { + private final String a; + private final String b; + private final String c; + + public Row(String a, String b, String c) { + this.a = a; + this.b = b; + this.c = c; + } + + public String getA() { + return a; + } + + public String getB() { + return b; + } + + public String getC() { + return c; + } + + public void fillPartialRow(PartialRow row) { + row.addString("a", a); + row.addString("b", b); + row.addString("c", c); + } + + private static Row 
fromResult(RowResult result) { + return new Row(result.getString("a"), + result.getString("b"), + result.getString("c")); + } + + public Predicate gtePred() { + return new Predicate() { + @Override + public boolean apply(Row other) { + return other.compareTo(Row.this) >= 0; + } + }; + } + + public Predicate ltPred() { + return new Predicate() { + @Override + public boolean apply(Row other) { + return other.compareTo(Row.this) < 0; + } + }; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Row row = (Row) o; + return Objects.equals(a, row.a) + && Objects.equals(b, row.b) + && Objects.equals(c, row.c); + } + + @Override + public int hashCode() { + return Objects.hash(a, b, c); + } + + @Override + public int compareTo(Row other) { + return ComparisonChain.start() + .compare(a, other.a) + .compare(b, other.b) + .compare(c, other.c) + .result(); + } + + @Override + public String toString() { + return com.google.common.base.Objects.toStringHelper(this) + .add("a", a) + .add("b", b) + .add("c", c) + .toString(); + } + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestHybridTime.java b/java/kudu-client/src/test/java/org/kududb/client/TestHybridTime.java new file mode 100644 index 000000000000..2ce3cde1429e --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestHybridTime.java @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.stumbleupon.async.Deferred; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static org.kududb.Type.STRING; +import static org.kududb.client.ExternalConsistencyMode.CLIENT_PROPAGATED; +import static org.kududb.util.HybridTimeUtil.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * This only tests client propagation since it's the only thing that is client-specific. + * All the work for commit wait is done and tested on the server-side. + */ +public class TestHybridTime extends BaseKuduTest { + private static final Logger LOG = LoggerFactory.getLogger(TestHybridTime.class); + + // Generate a unique table name + protected static final String TABLE_NAME = + TestHybridTime.class.getName() + "-" + System.currentTimeMillis(); + + protected static Schema schema = getSchema(); + protected static KuduTable table; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + + // Using multiple tablets doesn't work with the current way this test works since we could + // jump from one TS to another which changes the logical clock. 
+ CreateTableOptions builder = new CreateTableOptions(); + table = createTable(TABLE_NAME, schema, builder); + } + + private static Schema getSchema() { + ArrayList columns = new ArrayList(1); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key", STRING) + .key(true) + .build()); + return new Schema(columns); + } + + /** + * We write three rows. We increment the timestamp we get back from the first write + * by some amount. The remaining writes should force an update to the server's clock and + * only increment the logical values. + * + * @throws Exception + */ + @Test(timeout = 100000) + public void test() throws Exception { + AsyncKuduSession session = client.newSession(); + session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_SYNC); + session.setExternalConsistencyMode(CLIENT_PROPAGATED); + long[] clockValues; + long previousLogicalValue = 0; + long previousPhysicalValue = 0; + + // Test timestamp propagation with single operations + String[] keys = new String[] {"1", "2", "3"}; + for (int i = 0; i < keys.length; i++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addString(schema.getColumnByIndex(0).getName(), keys[i]); + Deferred d = session.apply(insert); + OperationResponse response = d.join(DEFAULT_SLEEP); + assertTrue(response.getWriteTimestamp() != 0); + clockValues = HTTimestampToPhysicalAndLogical(response.getWriteTimestamp()); + LOG.debug("Clock value after write[" + i + "]: " + new Date(clockValues[0] / 1000).toString() + + " Logical value: " + clockValues[1]); + // on the very first write we update the clock into the future + // so that remaining writes only update logical values + if (i == 0) { + assertEquals(clockValues[1], 0); + long toUpdateTs = clockValues[0] + 5000000; + previousPhysicalValue = toUpdateTs; + // After the first write we fake-update the clock into the future. Following writes + // should force the servers to update their clocks to this value. 
+ client.updateLastPropagatedTimestamp( + clockTimestampToHTTimestamp(toUpdateTs, TimeUnit.MICROSECONDS)); + } else { + assertEquals(clockValues[0], previousPhysicalValue); + assertTrue(clockValues[1] > previousLogicalValue); + previousLogicalValue = clockValues[1]; + } + } + + // Test timestamp propagation with Batches + session.setFlushMode(AsyncKuduSession.FlushMode.MANUAL_FLUSH); + keys = new String[] {"11", "22", "33"}; + for (int i = 0; i < keys.length; i++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addString(schema.getColumnByIndex(0).getName(), keys[i]); + session.apply(insert); + Deferred> d = session.flush(); + List responses = d.join(DEFAULT_SLEEP); + assertEquals("Response was not of the expected size: " + responses.size(), + 1, responses.size()); + + OperationResponse response = responses.get(0); + assertTrue(response.getWriteTimestamp() != 0); + clockValues = HTTimestampToPhysicalAndLogical(response.getWriteTimestamp()); + LOG.debug("Clock value after write[" + i + "]: " + new Date(clockValues[0] / 1000).toString() + + " Logical value: " + clockValues[1]); + assertEquals(clockValues[0], previousPhysicalValue); + assertTrue(clockValues[1] > previousLogicalValue); + previousLogicalValue = clockValues[1]; + } + + // Scan all rows with READ_LATEST (the default) we should get 6 rows back + assertEquals(6, countRowsInScan(client.newScannerBuilder(table).build())); + + // Now scan at multiple instances with READ_AT_SNAPSHOT we should get different + // counts depending on the scan timestamp. + long snapTime = physicalAndLogicalToHTTimestamp(previousPhysicalValue, 0); + assertEquals(1, scanAtSnapshot(snapTime)); + snapTime = physicalAndLogicalToHTTimestamp(previousPhysicalValue, 5); + assertEquals(4, scanAtSnapshot(snapTime)); + // Our last snap time needs to one one into the future w.r.t. the last write's timestamp + // for us to be able to get all rows, but the snap timestamp can't be bigger than the prop. 
+ // timestamp so we increase both. + client.updateLastPropagatedTimestamp(client.getLastPropagatedTimestamp() + 1); + snapTime = physicalAndLogicalToHTTimestamp(previousPhysicalValue, previousLogicalValue + 1); + assertEquals(6, scanAtSnapshot(snapTime)); + } + + private int scanAtSnapshot(long time) throws Exception { + AsyncKuduScanner.AsyncKuduScannerBuilder builder = client.newScannerBuilder(table) + .snapshotTimestampRaw(time) + .readMode(AsyncKuduScanner.ReadMode.READ_AT_SNAPSHOT); + return countRowsInScan(builder.build()); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestKeyEncoding.java b/java/kudu-client/src/test/java/org/kududb/client/TestKeyEncoding.java new file mode 100644 index 000000000000..e44644555664 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestKeyEncoding.java @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import com.google.common.base.Charsets; +import com.google.common.collect.ImmutableList; +import org.junit.Test; +import org.kududb.ColumnSchema; +import org.kududb.ColumnSchema.ColumnSchemaBuilder; +import org.kududb.Common; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.client.PartitionSchema.HashBucketSchema; +import org.kududb.client.PartitionSchema.RangeSchema; + +import java.util.ArrayList; +import java.util.List; + +public class TestKeyEncoding { + + private static Schema buildSchema(ColumnSchemaBuilder... columns) { + int i = 0; + Common.SchemaPB.Builder pb = Common.SchemaPB.newBuilder(); + for (ColumnSchemaBuilder column : columns) { + Common.ColumnSchemaPB.Builder columnPb = + ProtobufHelper.columnToPb(column.build()).toBuilder(); + columnPb.setId(i++); + pb.addColumns(columnPb); + } + return ProtobufHelper.pbToSchema(pb.build()); + } + + private static void assertBytesEquals(byte[] actual, byte[] expected) { + assertTrue(String.format("expected: '%s', got '%s'", + Bytes.pretty(expected), + Bytes.pretty(actual)), + Bytes.equals(expected, actual)); + } + + private static void assertBytesEquals(byte[] actual, String expected) { + assertBytesEquals(actual, expected.getBytes(Charsets.UTF_8)); + } + + /** + * Builds the default partition schema for a schema. + * @param schema the schema + * @return a default partition schema + */ + private PartitionSchema defaultPartitionSchema(Schema schema) { + List columnIds = new ArrayList<>(); + for (int i = 0; i < schema.getPrimaryKeyColumnCount(); i++) { + // Schema does not provide a way to lookup a column ID by column index, + // so instead we assume that the IDs for the primary key columns match + // their respective index, which holds up when the schema is created + // with buildSchema. 
+ columnIds.add(i); + } + return new PartitionSchema( + new PartitionSchema.RangeSchema(columnIds), + ImmutableList.of(), schema); + } + + @Test + public void testPrimaryKeys() { + Schema schemaOneString = + buildSchema(new ColumnSchema.ColumnSchemaBuilder("key", Type.STRING).key(true)); + KuduTable table = new KuduTable(null, "one", "one", schemaOneString, + defaultPartitionSchema(schemaOneString)); + Insert oneKeyInsert = new Insert(table); + PartialRow row = oneKeyInsert.getRow(); + row.addString("key", "foo"); + assertBytesEquals(row.encodePrimaryKey(), "foo"); + + Schema schemaTwoString = buildSchema( + new ColumnSchema.ColumnSchemaBuilder("key", Type.STRING).key(true), + new ColumnSchema.ColumnSchemaBuilder("key2", Type.STRING).key(true)); + KuduTable table2 = new KuduTable(null, "two", "two", schemaTwoString, + defaultPartitionSchema(schemaTwoString)); + Insert twoKeyInsert = new Insert(table2); + row = twoKeyInsert.getRow(); + row.addString("key", "foo"); + row.addString("key2", "bar"); + assertBytesEquals(row.encodePrimaryKey(), "foo\0\0bar"); + + Insert twoKeyInsertWithNull = new Insert(table2); + row = twoKeyInsertWithNull.getRow(); + row.addString("key", "xxx\0yyy"); + row.addString("key2", "bar"); + assertBytesEquals(row.encodePrimaryKey(), "xxx\0\1yyy\0\0bar"); + + // test that we get the correct memcmp result, the bytes are in big-endian order in a key + Schema schemaIntString = buildSchema( + new ColumnSchema.ColumnSchemaBuilder("key", Type.INT32).key(true), + new ColumnSchema.ColumnSchemaBuilder("key2", Type.STRING).key(true)); + PartitionSchema partitionSchemaIntString = defaultPartitionSchema(schemaIntString); + KuduTable table3 = new KuduTable(null, "three", "three", + schemaIntString, partitionSchemaIntString); + Insert small = new Insert(table3); + row = small.getRow(); + row.addInt("key", 20); + row.addString("key2", "data"); + byte[] smallPK = small.getRow().encodePrimaryKey(); + assertEquals(0, Bytes.memcmp(smallPK, smallPK)); + + Insert 
big = new Insert(table3); + row = big.getRow(); + row.addInt("key", 10000); + row.addString("key2", "data"); + byte[] bigPK = big.getRow().encodePrimaryKey(); + assertTrue(Bytes.memcmp(smallPK, bigPK) < 0); + assertTrue(Bytes.memcmp(bigPK, smallPK) > 0); + + // The following tests test our assumptions on unsigned data types sorting from KeyEncoder + byte four = 4; + byte onHundredTwentyFour = -4; + four = Bytes.xorLeftMostBit(four); + onHundredTwentyFour = Bytes.xorLeftMostBit(onHundredTwentyFour); + assertTrue(four < onHundredTwentyFour); + + byte[] threeHundred = Bytes.fromInt(300); + byte[] reallyBigNumber = Bytes.fromInt(-300); + threeHundred[0] = Bytes.xorLeftMostBit(threeHundred[0]); + reallyBigNumber[3] = Bytes.xorLeftMostBit(reallyBigNumber[3]); + assertTrue(Bytes.memcmp(threeHundred, reallyBigNumber) < 0); + } + + @Test + public void testPrimaryKeyEncoding() { + Schema schema = buildSchema( + new ColumnSchemaBuilder("int8", Type.INT8).key(true), + new ColumnSchemaBuilder("int16", Type.INT16).key(true), + new ColumnSchemaBuilder("int32", Type.INT32).key(true), + new ColumnSchemaBuilder("int64", Type.INT64).key(true), + new ColumnSchemaBuilder("string", Type.STRING).key(true), + new ColumnSchemaBuilder("binary", Type.BINARY).key(true)); + + PartialRow rowA = schema.newPartialRow(); + rowA.addByte("int8", Byte.MIN_VALUE); + rowA.addShort("int16", Short.MIN_VALUE); + rowA.addInt("int32", Integer.MIN_VALUE); + rowA.addLong("int64", Long.MIN_VALUE); + rowA.addString("string", ""); + rowA.addBinary("binary", "".getBytes(Charsets.UTF_8)); + + assertBytesEquals(rowA.encodePrimaryKey(), + "\0" + + "\0\0" + + "\0\0\0\0" + + "\0\0\0\0\0\0\0\0" + + "\0\0" + + ""); + + PartialRow rowB = schema.newPartialRow(); + rowB.addByte("int8", Byte.MAX_VALUE); + rowB.addShort("int16", Short.MAX_VALUE); + rowB.addInt("int32", Integer.MAX_VALUE); + rowB.addLong("int64", Long.MAX_VALUE); + rowB.addString("string", "abc\1\0def"); + rowB.addBinary("binary", 
"\0\1binary".getBytes(Charsets.UTF_8)); + + assertBytesEquals(rowB.encodePrimaryKey(), + new byte[] { + -1, + -1, -1, + -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + 'a', 'b', 'c', 1, 0, 1, 'd', 'e', 'f', 0, 0, + 0, 1, 'b', 'i', 'n', 'a', 'r', 'y', + }); + + PartialRow rowC = schema.newPartialRow(); + rowC.addByte("int8", (byte) 1); + rowC.addShort("int16", (short) 2); + rowC.addInt("int32", 3); + rowC.addLong("int64", 4); + rowC.addString("string", "abc\n123"); + rowC.addBinary("binary", "\0\1\2\3\4\5".getBytes(Charsets.UTF_8)); + + assertBytesEquals(rowC.encodePrimaryKey(), + new byte[] { + (byte) 0x81, + (byte) 0x80, 2, + (byte) 0x80, 0, 0, 3, + (byte) 0x80, 0, 0, 0, 0, 0, 0, 4, + 'a', 'b', 'c', '\n', '1', '2', '3', 0, 0, + 0, 1, 2, 3, 4, 5, + }); + } + + @Test + public void testPartitionKeyEncoding() { + KeyEncoder encoder = new KeyEncoder(); + Schema schema = buildSchema( + new ColumnSchemaBuilder("a", Type.INT32).key(true), + new ColumnSchemaBuilder("b", Type.STRING).key(true), + new ColumnSchemaBuilder("c", Type.STRING).key(true)); + + PartitionSchema partitionSchema = + new PartitionSchema(new RangeSchema(ImmutableList.of(0, 1, 2)), + ImmutableList.of( + new HashBucketSchema(ImmutableList.of(0, 1), 32, 0), + new HashBucketSchema(ImmutableList.of(2), 32, 42)), + schema); + + PartialRow rowA = schema.newPartialRow(); + rowA.addInt("a", 0); + rowA.addString("b", ""); + rowA.addString("c", ""); + assertBytesEquals(encoder.encodePartitionKey(rowA, partitionSchema), + new byte[]{ + 0, 0, 0, 0, // hash(0, "") + 0, 0, 0, 0x14, // hash("") + (byte) 0x80, 0, 0, 0, // a = 0 + 0, 0, // b = ""; c is elided + }); + + PartialRow rowB = schema.newPartialRow(); + rowB.addInt("a", 1); + rowB.addString("b", ""); + rowB.addString("c", ""); + assertBytesEquals(encoder.encodePartitionKey(rowB, partitionSchema), + new byte[]{ + 0, 0, 0, 0x5, // hash(1, "") + 0, 0, 0, 0x14, // hash("") + (byte) 0x80, 0, 0, 1, // a = 0 + 0, 0, // b = ""; c is elided + }); + + PartialRow rowC 
= schema.newPartialRow(); + rowC.addInt("a", 0); + rowC.addString("b", "b"); + rowC.addString("c", "c"); + assertBytesEquals(encoder.encodePartitionKey(rowC, partitionSchema), + new byte[]{ + 0, 0, 0, 0x1A, // hash(0, "b") + 0, 0, 0, 0x1D, // hash("c") + (byte) 0x80, 0, 0, 0, // a = 1 + 'b', 0, 0, // b = "b" + 'c' // b = "c" + }); + + PartialRow rowD = schema.newPartialRow(); + rowD.addInt("a", 1); + rowD.addString("b", "b"); + rowD.addString("c", "c"); + assertBytesEquals(encoder.encodePartitionKey(rowD, partitionSchema), + new byte[]{ + 0, 0, 0, 0, // hash(1, "b") + 0, 0, 0, 0x1D, // hash("c") + (byte) 0x80, 0, 0, 1, // a = 0 + 'b', 0, 0, // b = "b" + 'c' // b = "c" + }); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestKuduClient.java b/java/kudu-client/src/test/java/org/kududb/client/TestKuduClient.java new file mode 100644 index 000000000000..c99238c953ed --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestKuduClient.java @@ -0,0 +1,319 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.kududb.client.RowResult.timestampToString; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executors; + +import org.junit.Before; +import org.junit.Test; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; + +public class TestKuduClient extends BaseKuduTest { + private String tableName; + + @Before + public void setTableName() { + tableName = TestKuduClient.class.getName() + "-" + System.currentTimeMillis(); + } + + private Schema createManyStringsSchema() { + ArrayList columns = new ArrayList(4); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key", Type.STRING).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c1", Type.STRING).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c2", Type.STRING).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c3", Type.STRING).nullable(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c4", Type.STRING).nullable(true).build()); + return new Schema(columns); + } + + private Schema createSchemaWithBinaryColumns() { + ArrayList columns = new ArrayList(); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key", Type.BINARY).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c1", Type.STRING).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c2", Type.DOUBLE).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c3", Type.BINARY).nullable(true).build()); + return new Schema(columns); + } + + private Schema createSchemaWithTimestampColumns() { + ArrayList columns = new ArrayList(); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key", Type.TIMESTAMP).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c1", 
Type.TIMESTAMP).nullable(true).build()); + return new Schema(columns); + } + + /** + * Test creating and deleting a table through a KuduClient. + */ + @Test(timeout = 100000) + public void testCreateDeleteTable() throws Exception { + // Check that we can create a table. + syncClient.createTable(tableName, basicSchema); + assertFalse(syncClient.getTablesList().getTablesList().isEmpty()); + assertTrue(syncClient.getTablesList().getTablesList().contains(tableName)); + + // Check that we can delete it. + syncClient.deleteTable(tableName); + assertFalse(syncClient.getTablesList().getTablesList().contains(tableName)); + + // Check that we can re-recreate it, with a different schema. + List columns = new ArrayList<>(basicSchema.getColumns()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("one more", Type.STRING).build()); + Schema newSchema = new Schema(columns); + syncClient.createTable(tableName, newSchema); + + // Check that we can open a table and see that it has the new schema. + KuduTable table = syncClient.openTable(tableName); + assertEquals(newSchema.getColumnCount(), table.getSchema().getColumnCount()); + assertTrue(table.getPartitionSchema().isSimpleRangePartitioning()); + + // Check that the block size parameter we specified in the schema is respected. + assertEquals(4096, newSchema.getColumn("column3_s").getDesiredBlockSize()); + assertEquals(ColumnSchema.Encoding.DICT_ENCODING, + newSchema.getColumn("column3_s").getEncoding()); + assertEquals(ColumnSchema.CompressionAlgorithm.LZ4, + newSchema.getColumn("column3_s").getCompressionAlgorithm()); + } + + /** + * Test inserting and retrieving string columns. 
+ */ + @Test(timeout = 100000) + public void testStrings() throws Exception { + Schema schema = createManyStringsSchema(); + syncClient.createTable(tableName, schema); + + KuduSession session = syncClient.newSession(); + KuduTable table = syncClient.openTable(tableName); + for (int i = 0; i < 100; i++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addString("key", String.format("key_%02d", i)); + row.addString("c2", "c2_" + i); + if (i % 2 == 1) { + row.addString("c3", "c3_" + i); + } + row.addString("c4", "c4_" + i); + // NOTE: we purposefully add the strings in a non-left-to-right + // order to verify that we still place them in the right position in + // the row. + row.addString("c1", "c1_" + i); + session.apply(insert); + if (i % 50 == 0) { + session.flush(); + } + } + session.flush(); + + List rowStrings = scanTableToStrings(table); + assertEquals(100, rowStrings.size()); + assertEquals( + "STRING key=key_03, STRING c1=c1_3, STRING c2=c2_3, STRING c3=c3_3, STRING c4=c4_3", + rowStrings.get(3)); + assertEquals( + "STRING key=key_04, STRING c1=c1_4, STRING c2=c2_4, STRING c3=NULL, STRING c4=c4_4", + rowStrings.get(4)); + } + + /** + * Test to verify that we can write in and read back UTF8. 
+ */ + @Test(timeout = 100000) + public void testUTF8() throws Exception { + Schema schema = createManyStringsSchema(); + syncClient.createTable(tableName, schema); + + KuduSession session = syncClient.newSession(); + KuduTable table = syncClient.openTable(tableName); + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addString("key", "กขฃคฅฆง"); // some thai + row.addString("c1", "✁✂✃✄✆"); // some icons + + row.addString("c2", "hello"); // some normal chars + row.addString("c4", "🐱"); // supplemental plane + session.apply(insert); + session.flush(); + + List rowStrings = scanTableToStrings(table); + assertEquals(1, rowStrings.size()); + assertEquals( + "STRING key=กขฃคฅฆง, STRING c1=✁✂✃✄✆, STRING c2=hello, STRING c3=NULL, STRING c4=🐱", + rowStrings.get(0)); + } + + /** + * Test inserting and retrieving binary columns. + */ + @Test(timeout = 100000) + public void testBinaryColumns() throws Exception { + Schema schema = createSchemaWithBinaryColumns(); + syncClient.createTable(tableName, schema); + + byte[] testArray = new byte[] {1, 2, 3, 4, 5, 6 ,7, 8, 9}; + + KuduSession session = syncClient.newSession(); + KuduTable table = syncClient.openTable(tableName); + for (int i = 0; i < 100; i++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addBinary("key", String.format("key_%02d", i).getBytes()); + row.addString("c1", "✁✂✃✄✆"); + row.addDouble("c2", i); + if (i % 2 == 1) { + row.addBinary("c3", testArray); + } + session.apply(insert); + if (i % 50 == 0) { + session.flush(); + } + } + session.flush(); + + List rowStrings = scanTableToStrings(table); + assertEquals(100, rowStrings.size()); + for (int i = 0; i < rowStrings.size(); i++) { + StringBuilder expectedRow = new StringBuilder(); + expectedRow.append(String.format("BINARY key=\"key_%02d\", STRING c1=✁✂✃✄✆, DOUBLE c2=%.1f," + + " BINARY c3=", i, (double) i)); + if (i % 2 == 1) { + expectedRow.append(Bytes.pretty(testArray)); + } else { + 
expectedRow.append("NULL"); + } + assertEquals(expectedRow.toString(), rowStrings.get(i)); + } + } + + /** + * Test inserting and retrieving timestamp columns. + */ + @Test(timeout = 100000) + public void testTimestampColumns() throws Exception { + Schema schema = createSchemaWithTimestampColumns(); + syncClient.createTable(tableName, schema); + + List timestamps = new ArrayList<>(); + + KuduSession session = syncClient.newSession(); + KuduTable table = syncClient.openTable(tableName); + long lastTimestamp = 0; + for (int i = 0; i < 100; i++) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + long timestamp = System.currentTimeMillis() * 1000; + while(timestamp == lastTimestamp) { + timestamp = System.currentTimeMillis() * 1000; + } + timestamps.add(timestamp); + row.addLong("key", timestamp); + if (i % 2 == 1) { + row.addLong("c1", timestamp); + } + session.apply(insert); + if (i % 50 == 0) { + session.flush(); + } + lastTimestamp = timestamp; + } + session.flush(); + + List rowStrings = scanTableToStrings(table); + assertEquals(100, rowStrings.size()); + for (int i = 0; i < rowStrings.size(); i++) { + StringBuilder expectedRow = new StringBuilder(); + expectedRow.append(String.format("TIMESTAMP key=%s, TIMESTAMP c1=", + timestampToString(timestamps.get(i)))); + if (i % 2 == 1) { + expectedRow.append(timestampToString(timestamps.get(i))); + } else { + expectedRow.append("NULL"); + } + assertEquals(expectedRow.toString(), rowStrings.get(i)); + } + } + + /** + * Creates a local client that we auto-close while buffering one row, then makes sure that after + * closing that we can read the row. 
+ */ + @Test(timeout = 100000) + public void testAutoClose() throws Exception { + try (KuduClient localClient = new KuduClient.KuduClientBuilder(masterAddresses).build()) { + localClient.createTable(tableName, basicSchema); + KuduTable table = localClient.openTable(tableName); + KuduSession session = localClient.newSession(); + + session.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH); + Insert insert = createBasicSchemaInsert(table, 0); + session.apply(insert); + } + + KuduTable table = syncClient.openTable(tableName); + AsyncKuduScanner scanner = new AsyncKuduScanner.AsyncKuduScannerBuilder(client, table).build(); + assertEquals(1, countRowsInScan(scanner)); + } + + @Test(timeout = 100000) + public void testCustomNioExecutor() throws Exception { + long startTime = System.nanoTime(); + final KuduClient localClient = new KuduClient.KuduClientBuilder(masterAddresses) + .nioExecutors(Executors.newFixedThreadPool(1), Executors.newFixedThreadPool(2)) + .bossCount(1) + .workerCount(2) + .build(); + long buildTime = (System.nanoTime() - startTime) / 1000000000L; + assertTrue("Building KuduClient is slow, maybe netty get stuck", buildTime < 3); + localClient.createTable(tableName, basicSchema); + Thread[] threads = new Thread[4]; + for (int t = 0; t < 4; t++) { + final int id = t; + threads[t] = new Thread(new Runnable() { + @Override + public void run() { + try { + KuduTable table = localClient.openTable(tableName); + KuduSession session = localClient.newSession(); + session.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_SYNC); + for (int i = 0; i < 100; i++) { + Insert insert = createBasicSchemaInsert(table, id * 100 + i); + session.apply(insert); + } + session.close(); + } catch (Exception e) { + fail("insert thread should not throw exception: " + e); + } + } + }); + threads[t].start(); + } + for (int t = 0; t< 4;t++) { + threads[t].join(); + } + localClient.shutdown(); + } +} diff --git 
a/java/kudu-client/src/test/java/org/kududb/client/TestKuduSession.java b/java/kudu-client/src/test/java/org/kududb/client/TestKuduSession.java new file mode 100644 index 000000000000..293c968c581f --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestKuduSession.java @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestKuduSession extends BaseKuduTest { + // Generate a unique table name + private static final String TABLE_NAME_PREFIX = + TestKuduSession.class.getName()+"-"+System.currentTimeMillis(); + + private KuduTable table; + + @Test(timeout = 100000) + public void testBasicOps() throws Exception { + String tableName = TABLE_NAME_PREFIX + "-testBasicOps"; + table = createTable(tableName, basicSchema, new CreateTableOptions()); + + KuduSession session = syncClient.newSession(); + for (int i = 0; i < 10; i++) { + session.apply(createInsert(i)); + } + assertEquals(10, countRowsInScan(client.newScannerBuilder(table).build())); + + OperationResponse resp = session.apply(createInsert(0)); + assertTrue(resp.hasRowError()); + + session.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH); + + for (int i = 10; i < 20; i++) { + session.apply(createInsert(i)); + } + session.flush(); + assertEquals(20, countRowsInScan(client.newScannerBuilder(table).build())); + } + + @Test(timeout = 100000) + public void testBatchWithSameRow() throws Exception { + String tableName = TABLE_NAME_PREFIX + "-testBatchWithSameRow"; + table = createTable(tableName, basicSchema, new CreateTableOptions()); + + KuduSession session = syncClient.newSession(); + session.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH); + + // Insert 25 rows, one per batch, along with 50 updates for each, and a delete at the end, + // while also clearing the cache between each batch half the time. The delete is added here + // so that a misplaced update would fail if it happens later than its delete. 
+ for (int i = 0; i < 25; i++) { + session.apply(createInsert(i)); + for (int j = 0; j < 50; j++) { + Update update = table.newUpdate(); + PartialRow row = update.getRow(); + row.addInt(basicSchema.getColumnByIndex(0).getName(), i); + row.addInt(basicSchema.getColumnByIndex(1).getName(), 1000); + session.apply(update); + } + Delete del = table.newDelete(); + PartialRow row = del.getRow(); + row.addInt(basicSchema.getColumnByIndex(0).getName(), i); + session.apply(del); + session.flush(); + if (i % 2 == 0) { + client.emptyTabletsCacheForTable(table.getTableId()); + } + } + assertEquals(0, countRowsInScan(client.newScannerBuilder(table).build())); + } + + /** + * Regression test for KUDU-1226. Calls to session.flush() concurrent with AUTO_FLUSH_BACKGROUND + * can end up giving ConvertBatchToListOfResponsesCB a list with nulls if a tablet was already + * flushed. Only happens with multiple tablets. + */ + @Test(timeout = 10000) + public void testConcurrentFlushes() throws Exception { + String tableName = TABLE_NAME_PREFIX + "-testConcurrentFlushes"; + CreateTableOptions builder = new CreateTableOptions(); + int numTablets = 4; + int numRowsPerTablet = 100; + + // Create a 4 tablets table split on 1000, 2000, and 3000. + for (int i = 1; i < numTablets; i++) { + PartialRow split = basicSchema.newPartialRow(); + split.addInt(0, i * numRowsPerTablet); + builder.addSplitRow(split); + } + table = createTable(tableName, basicSchema, builder); + + // Configure the session to background flush as often as it can (every 1ms). + KuduSession session = syncClient.newSession(); + session.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_BACKGROUND); + session.setFlushInterval(1); + + // Fill each tablet in parallel 1 by 1 then flush. Without the fix this would quickly get an + // NPE. 
+ for (int i = 0; i < numRowsPerTablet; i++) { + for (int j = 0; j < numTablets; j++) { + session.apply(createInsert(i + (numRowsPerTablet * j))); + } + session.flush(); + } + } + + @Test(timeout = 10000) + public void testOverWritingValues() throws Exception { + String tableName = TABLE_NAME_PREFIX + "-OverridingValues"; + table = createTable(tableName, basicSchema, null); + KuduSession session = syncClient.newSession(); + Insert insert = createInsert(0); + PartialRow row = insert.getRow(); + + // Overwrite all the normal columns. + int magicNumber = 9999; + row.addInt(1, magicNumber); + row.addInt(2, magicNumber); + row.addBoolean(4, false); + // Spam the string column since it's backed by an array. + for (int i = 0; i <= magicNumber; i++) { + row.addString(3, i + ""); + } + // We're supposed to keep a constant size. + assertEquals(5, row.getVarLengthData().size()); + session.apply(insert); + + KuduScanner scanner = syncClient.newScannerBuilder(table).build(); + RowResult rr = scanner.nextRows().next(); + assertEquals(magicNumber, rr.getInt(1)); + assertEquals(magicNumber, rr.getInt(2)); + assertEquals(magicNumber + "", rr.getString(3)); + assertEquals(false, rr.getBoolean(4)); + + // Test setting a value post-apply. + try { + row.addInt(1, 0); + fail("Row should be frozen and throw"); + } catch (IllegalStateException ex) { + // Ok. + } + } + + private Insert createInsert(int key) { + return createBasicSchemaInsert(table, key); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestKuduTable.java b/java/kudu-client/src/test/java/org/kududb/client/TestKuduTable.java new file mode 100644 index 000000000000..b158aa76d061 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestKuduTable.java @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.*; + +public class TestKuduTable extends BaseKuduTest { + + private static final Logger LOG = LoggerFactory.getLogger(TestKuduTable.class); + + private static final String BASE_TABLE_NAME = TestKuduTable.class.getName(); + + private static Schema schema = getBasicSchema(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + } + + @Test(expected = IllegalArgumentException.class) + public void testBadSchema() { + // Test creating a table with keys in the wrong order + List badColumns = new ArrayList(2); + badColumns.add(new ColumnSchema.ColumnSchemaBuilder("not_key", Type.STRING).build()); + badColumns.add(new ColumnSchema.ColumnSchemaBuilder("key", Type.STRING) + .key(true) + .build()); + new Schema(badColumns); + } + + @Test(timeout = 100000) + public void testAlterTable() throws Exception { + String tableName = BASE_TABLE_NAME + System.currentTimeMillis(); + createTable(tableName, basicSchema, null); + + // Add a col. 
+ AlterTableOptions ato = new AlterTableOptions().addColumn("testaddint", Type.INT32, 4); + submitAlterAndCheck(ato, tableName); + + // Rename that col. + ato = new AlterTableOptions().renameColumn("testaddint", "newtestaddint"); + submitAlterAndCheck(ato, tableName); + + // Delete it. + ato = new AlterTableOptions().dropColumn("newtestaddint"); + submitAlterAndCheck(ato, tableName); + + String newTableName = tableName +"new"; + + // Rename our table. + ato = new AlterTableOptions().renameTable(newTableName); + submitAlterAndCheck(ato, tableName, newTableName); + + // Rename it back. + ato = new AlterTableOptions().renameTable(tableName); + submitAlterAndCheck(ato, newTableName, tableName); + + // Try adding two columns, where one is nullable. + ato = new AlterTableOptions() + .addColumn("testaddmulticolnotnull", Type.INT32, 4) + .addNullableColumn("testaddmulticolnull", Type.STRING); + submitAlterAndCheck(ato, tableName); + } + + /** + * Helper method to submit an Alter and wait for it to happen, using the default table name to + * check. + */ + private void submitAlterAndCheck(AlterTableOptions ato, String tableToAlter) + throws Exception { + submitAlterAndCheck(ato, tableToAlter, tableToAlter); + } + + private void submitAlterAndCheck(AlterTableOptions ato, + String tableToAlter, String tableToCheck) throws + Exception { + if (masterHostPorts.size() > 1) { + LOG.info("Alter table is not yet supported with multiple masters. 
Specify " + + "-DnumMasters=1 on the command line to start a single-master cluster to run this test."); + return; + } + AlterTableResponse alterResponse = syncClient.alterTable(tableToAlter, ato); + boolean done = syncClient.isAlterTableDone(tableToCheck); + assertTrue(done); + } + + /** + * Test creating tables of different sizes and see that we get the correct number of tablets back + * @throws Exception + */ + @Test + public void testGetLocations() throws Exception { + String table1 = BASE_TABLE_NAME + System.currentTimeMillis(); + + // Test a non-existing table + try { + openTable(table1); + fail("Should receive an exception since the table doesn't exist"); + } catch (Exception ex) { + // expected + } + // Test with defaults + String tableWithDefault = BASE_TABLE_NAME + "WithDefault" + System.currentTimeMillis(); + CreateTableOptions builder = new CreateTableOptions(); + List columns = new ArrayList(schema.getColumnCount()); + int defaultInt = 30; + String defaultString = "data"; + for (ColumnSchema columnSchema : schema.getColumns()) { + + Object defaultValue; + + if (columnSchema.getType() == Type.INT32) { + defaultValue = defaultInt; + } else if (columnSchema.getType() == Type.BOOL) { + defaultValue = true; + } else { + defaultValue = defaultString; + } + columns.add( + new ColumnSchema.ColumnSchemaBuilder(columnSchema.getName(), columnSchema.getType()) + .key(columnSchema.isKey()) + .nullable(columnSchema.isNullable()) + .defaultValue(defaultValue).build()); + } + Schema schemaWithDefault = new Schema(columns); + KuduTable kuduTable = createTable(tableWithDefault, schemaWithDefault, builder); + assertEquals(defaultInt, kuduTable.getSchema().getColumnByIndex(0).getDefaultValue()); + assertEquals(defaultString, + kuduTable.getSchema().getColumnByIndex(columns.size() - 2).getDefaultValue()); + assertEquals(true, + kuduTable.getSchema().getColumnByIndex(columns.size() - 1).getDefaultValue()); + + // Make sure the table's schema includes column IDs. 
+ assertTrue(kuduTable.getSchema().hasColumnIds()); + + // Test we can open a table that was already created. + openTable(tableWithDefault); + + // Test splitting and reading those splits + KuduTable kuduTableWithoutDefaults = createTableWithSplitsAndTest(0); + // finish testing read defaults + assertNull(kuduTableWithoutDefaults.getSchema().getColumnByIndex(0).getDefaultValue()); + createTableWithSplitsAndTest(3); + createTableWithSplitsAndTest(10); + + KuduTable table = createTableWithSplitsAndTest(30); + + Listtablets = table.getTabletsLocations(null, getKeyInBytes(9), DEFAULT_SLEEP); + assertEquals(10, tablets.size()); + assertEquals(10, table.asyncGetTabletsLocations(null, getKeyInBytes(9), DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(getKeyInBytes(0), getKeyInBytes(9), DEFAULT_SLEEP); + assertEquals(10, tablets.size()); + assertEquals(10, table.asyncGetTabletsLocations(getKeyInBytes(0), getKeyInBytes(9), DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(getKeyInBytes(5), getKeyInBytes(9), DEFAULT_SLEEP); + assertEquals(5, tablets.size()); + assertEquals(5, table.asyncGetTabletsLocations(getKeyInBytes(5), getKeyInBytes(9), DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(getKeyInBytes(5), getKeyInBytes(14), DEFAULT_SLEEP); + assertEquals(10, tablets.size()); + assertEquals(10, table.asyncGetTabletsLocations(getKeyInBytes(5), getKeyInBytes(14), DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(getKeyInBytes(5), getKeyInBytes(31), DEFAULT_SLEEP); + assertEquals(26, tablets.size()); + assertEquals(26, table.asyncGetTabletsLocations(getKeyInBytes(5), getKeyInBytes(31), DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(getKeyInBytes(5), null, DEFAULT_SLEEP); + assertEquals(26, tablets.size()); + assertEquals(26, table.asyncGetTabletsLocations(getKeyInBytes(5), null, DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(null, 
getKeyInBytes(10000), DEFAULT_SLEEP); + assertEquals(31, tablets.size()); + assertEquals(31, table.asyncGetTabletsLocations(null, getKeyInBytes(10000), DEFAULT_SLEEP).join().size()); + + tablets = table.getTabletsLocations(getKeyInBytes(20), getKeyInBytes(10000), DEFAULT_SLEEP); + assertEquals(11, tablets.size()); + assertEquals(11, table.asyncGetTabletsLocations(getKeyInBytes(20), getKeyInBytes(10000), DEFAULT_SLEEP).join().size()); + + // Test listing tables. + assertEquals(0, client.getTablesList(table1).join(DEFAULT_SLEEP).getTablesList().size()); + assertEquals(1, client.getTablesList(tableWithDefault) + .join(DEFAULT_SLEEP).getTablesList().size()); + assertEquals(6, client.getTablesList().join(DEFAULT_SLEEP).getTablesList().size()); + assertFalse(client.getTablesList(tableWithDefault). + join(DEFAULT_SLEEP).getTablesList().isEmpty()); + + assertFalse(client.tableExists(table1).join(DEFAULT_SLEEP)); + assertTrue(client.tableExists(tableWithDefault).join(DEFAULT_SLEEP)); + } + + public byte[] getKeyInBytes(int i) { + PartialRow row = schema.newPartialRow(); + row.addInt(0, i); + return row.encodePrimaryKey(); + } + + public KuduTable createTableWithSplitsAndTest(int splitsCount) throws Exception { + String tableName = BASE_TABLE_NAME + System.currentTimeMillis(); + CreateTableOptions builder = new CreateTableOptions(); + + if (splitsCount != 0) { + for (int i = 1; i <= splitsCount; i++) { + PartialRow row = schema.newPartialRow(); + row.addInt(0, i); + builder.addSplitRow(row); + } + } + KuduTable table = createTable(tableName, schema, builder); + + // calling getTabletsLocation won't wait on the table to be assigned so we trigger the wait + // by scanning + countRowsInScan(client.newScannerBuilder(table).build()); + + List tablets = table.getTabletsLocations(DEFAULT_SLEEP); + assertEquals(splitsCount + 1, tablets.size()); + assertEquals(splitsCount + 1, table.asyncGetTabletsLocations(DEFAULT_SLEEP).join().size()); + for (LocatedTablet tablet : tablets) { + 
assertEquals(3, tablet.getReplicas().size()); + } + return table; + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestLeaderFailover.java b/java/kudu-client/src/test/java/org/kududb/client/TestLeaderFailover.java new file mode 100644 index 000000000000..544feab67468 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestLeaderFailover.java @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.BeforeClass; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestLeaderFailover extends BaseKuduTest { + + private static final String TABLE_NAME = + TestLeaderFailover.class.getName() + "-" + System.currentTimeMillis(); + private static KuduTable table; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + + CreateTableOptions builder = new CreateTableOptions().setNumReplicas(3); + createTable(TABLE_NAME, basicSchema, builder); + + table = openTable(TABLE_NAME); + } + + /** + * This test writes 3 rows, kills the leader, then tries to write another 3 rows. Finally it + * counts to make sure we have 6 of them. 
+ * + * This test won't run if we didn't start the cluster. + */ + @Test(timeout = 100000) + public void testFailover() throws Exception { + KuduSession session = syncClient.newSession(); + session.setIgnoreAllDuplicateRows(true); + for (int i = 0; i < 3; i++) { + session.apply(createBasicSchemaInsert(table, i)); + } + + // Make sure the rows are in there before messing things up. + AsyncKuduScanner scanner = client.newScannerBuilder(table).build(); + assertEquals(3, countRowsInScan(scanner)); + + killTabletLeader(table); + + for (int i = 3; i < 6; i++) { + OperationResponse resp = session.apply(createBasicSchemaInsert(table, i)); + if (resp.hasRowError()) { + fail("Encountered a row error " + resp.getRowError()); + } + } + + scanner = client.newScannerBuilder(table).build(); + assertEquals(6, countRowsInScan(scanner)); + } +} \ No newline at end of file diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestMasterFailover.java b/java/kudu-client/src/test/java/org/kududb/client/TestMasterFailover.java new file mode 100644 index 000000000000..edbfdc737e5e --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestMasterFailover.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.junit.Assert.assertEquals; + + +/** + * Tests {@link AsyncKuduClient} with multiple masters. + */ +public class TestMasterFailover extends BaseKuduTest { + private static final Logger LOG = LoggerFactory.getLogger(TestMasterFailover.class); + private static final String TABLE_NAME = + TestMasterFailover.class.getName() + "-" + System.currentTimeMillis(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + createTable(TABLE_NAME, basicSchema, new CreateTableOptions()); + } + + /** + * This test is disabled as we're not supporting multi-master just yet. + */ + @Test(timeout = 30000) + @Ignore + public void testKillLeader() throws Exception { + int countMasters = masterHostPorts.size(); + if (countMasters < 3) { + LOG.info("This test requires at least 3 master servers, but only " + countMasters + + " are specified."); + return; + } + killMasterLeader(); + + // Test that we can open a previously created table after killing the leader master. + KuduTable table = openTable(TABLE_NAME); + assertEquals(0, countRowsInScan(client.newScannerBuilder(table).build())); + + // Test that we can create a new table when one of the masters is down. + String newTableName = TABLE_NAME + "-afterLeaderIsDead"; + createTable(newTableName, basicSchema, new CreateTableOptions()); + table = openTable(newTableName); + assertEquals(0, countRowsInScan(client.newScannerBuilder(table).build())); + + // Test that we can initialize a client when one of the masters specified in the + // connection string is down. 
+ AsyncKuduClient newClient = new AsyncKuduClient.AsyncKuduClientBuilder(masterAddresses).build(); + table = newClient.openTable(newTableName).join(DEFAULT_SLEEP); + assertEquals(0, countRowsInScan(newClient.newScannerBuilder(table).build())); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestOperation.java b/java/kudu-client/src/test/java/org/kududb/client/TestOperation.java new file mode 100644 index 000000000000..b84dc361b1e0 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestOperation.java @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.ArrayList; + +import org.junit.Test; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.kududb.Type; +import org.kududb.WireProtocol.RowOperationsPB; +import org.kududb.client.Operation.ChangeType; +import org.kududb.tserver.Tserver.WriteRequestPBOrBuilder; +import org.mockito.Mockito; + +import com.google.common.primitives.Longs; + +/** + * Unit tests for Operation + */ +public class TestOperation { + + private Schema createManyStringsSchema() { + ArrayList columns = new ArrayList(4); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c0", Type.STRING).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c1", Type.STRING).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c2", Type.STRING).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c3", Type.STRING).nullable(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c4", Type.STRING).nullable(true).build()); + return new Schema(columns); + } + + @Test + public void testSetStrings() { + KuduTable table = Mockito.mock(KuduTable.class); + Mockito.doReturn(createManyStringsSchema()).when(table).getSchema(); + Insert insert = new Insert(table); + PartialRow row = insert.getRow(); + row.addString("c0", "c0_val"); + row.addString("c2", "c2_val"); + row.addString("c1", "c1_val"); + row.addString("c3", "c3_val"); + row.addString("c4", "c4_val"); + + { + WriteRequestPBOrBuilder pb = Operation.createAndFillWriteRequestPB(insert); + RowOperationsPB rowOps = pb.getRowOperations(); + assertEquals(6 * 5, rowOps.getIndirectData().size()); + assertEquals("c0_valc1_valc2_valc3_valc4_val", rowOps.getIndirectData().toStringUtf8()); + byte[] rows = rowOps.getRows().toByteArray(); + assertEquals(ChangeType.INSERT.toEncodedByte(), rows[0]); + // The "isset" bitset should have 5 bits set + assertEquals(0x1f, rows[1]); + // The 
"null" bitset should have no bits set + assertEquals(0, rows[2]); + + // Check the strings. + int offset = 3; + for (int i = 0; i <= 4; i++) { + // The offset into the indirect buffer + assertEquals(6 * i, Bytes.getLong(rows, offset)); + offset += Longs.BYTES; + // The length of the pointed-to string. + assertEquals(6, Bytes.getLong(rows, offset)); + offset += Longs.BYTES; + } + + // Should have used up whole buffer. + assertEquals(rows.length, offset); + } + + // Setting a field to NULL should add to the null bitmap and remove + // the old value from the indirect buffer. + row.setNull("c3"); + { + WriteRequestPBOrBuilder pb = Operation.createAndFillWriteRequestPB(insert); + RowOperationsPB rowOps = pb.getRowOperations(); + assertEquals(6 * 4, rowOps.getIndirectData().size()); + assertEquals("c0_valc1_valc2_valc4_val", rowOps.getIndirectData().toStringUtf8()); + byte[] rows = rowOps.getRows().toByteArray(); + assertEquals(ChangeType.INSERT.toEncodedByte(), rows[0]); + // The "isset" bitset should have 5 bits set + assertEquals(0x1f, rows[1]); + // The "null" bitset should have 1 bit set for the null column + assertEquals(1 << 3, rows[2]); + + // Check the strings. + int offset = 3; + int indirOffset = 0; + for (int i = 0; i <= 4; i++) { + if (i == 3) continue; + // The offset into the indirect buffer + assertEquals(indirOffset, Bytes.getLong(rows, offset)); + indirOffset += 6; + offset += Longs.BYTES; + // The length of the pointed-to string. + assertEquals(6, Bytes.getLong(rows, offset)); + offset += Longs.BYTES; + } + // Should have used up whole buffer. 
+ assertEquals(rows.length, offset); + } + } + + private Schema createAllTypesKeySchema() { + ArrayList columns = new ArrayList(7); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c0", Type.INT8).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c1", Type.INT16).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c2", Type.INT32).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c3", Type.INT64).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c4", Type.TIMESTAMP).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c5", Type.STRING).key(true).build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("c6", Type.BINARY).key(true).build()); + return new Schema(columns); + } + + @Test + public void testRowKeyStringify() { + KuduTable table = Mockito.mock(KuduTable.class); + Mockito.doReturn(createAllTypesKeySchema()).when(table).getSchema(); + Insert insert = new Insert(table); + PartialRow row = insert.getRow(); + row.addByte("c0", (byte) 1); + row.addShort("c1", (short) 2); + row.addInt("c2", 3); + row.addLong("c3", 4); + row.addLong("c4", 5); + row.addString("c5", "c5_val"); + row.addBinary("c6", Bytes.fromString("c6_val")); + + assertEquals("(int8 c0=1, int16 c1=2, int32 c2=3, int64 c3=4, timestamp c4=5, string" + + " c5=c5_val, binary c6=\"c6_val\")", + insert.getRow().stringifyRowKey()); + + // Test an incomplete row key. + insert = new Insert(table); + row = insert.getRow(); + row.addByte("c0", (byte) 1); + try { + row.stringifyRowKey(); + fail("Should not be able to stringifyRowKey when not all keys are specified"); + } catch (IllegalStateException ise) { + // Expected. 
+ } + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestRowErrors.java b/java/kudu-client/src/test/java/org/kududb/client/TestRowErrors.java new file mode 100644 index 000000000000..39e977dfb546 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestRowErrors.java @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.*; + +public class TestRowErrors extends BaseKuduTest { + + private static KuduTable table; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + + } + + + @Test(timeout = 100000) + public void singleTabletTest() throws Exception { + String tableName = TestRowErrors.class.getName() + "-" + System.currentTimeMillis(); + createTable(tableName, basicSchema, new CreateTableOptions()); + table = openTable(tableName); + AsyncKuduSession session = client.newSession(); + + // Insert 3 rows to play with. + for (int i = 0; i < 3; i++) { + session.apply(createInsert(i)).join(DEFAULT_SLEEP); + } + + // Try a single dupe row insert with AUTO_FLUSH_SYNC. 
+ Insert dupeForZero = createInsert(0); + OperationResponse resp = session.apply(dupeForZero).join(DEFAULT_SLEEP); + assertTrue(resp.hasRowError()); + assertTrue(resp.getRowError().getOperation() == dupeForZero); + + // Now try inserting two dupes and one good row, make sure we get only two errors back. + dupeForZero = createInsert(0); + Insert dupeForTwo = createInsert(2); + session.setFlushMode(AsyncKuduSession.FlushMode.MANUAL_FLUSH); + session.apply(dupeForZero); + session.apply(dupeForTwo); + session.apply(createInsert(4)); + + List responses = session.flush().join(DEFAULT_SLEEP); + List errors = OperationResponse.collectErrors(responses); + assertEquals(2, errors.size()); + assertTrue(errors.get(0).getOperation() == dupeForZero); + assertTrue(errors.get(1).getOperation() == dupeForTwo); + } + + /** + * Test collecting errors from multiple tablets. + * @throws Exception + */ + @Test(timeout = 100000) + public void multiTabletTest() throws Exception { + String tableName = TestRowErrors.class.getName() + "-" + System.currentTimeMillis(); + createFourTabletsTableWithNineRows(tableName); + table = openTable(tableName); + KuduSession session = syncClient.newSession(); + session.setFlushMode(KuduSession.FlushMode.AUTO_FLUSH_BACKGROUND); + + int dupRows = 3; + session.apply(createInsert(12)); + session.apply(createInsert(22)); + session.apply(createInsert(32)); + + session.flush(); + + RowErrorsAndOverflowStatus reos = session.getPendingErrors(); + assertEquals(dupRows, reos.getRowErrors().length); + assertEquals(0, session.countPendingErrors()); + } + + private Insert createInsert(int key) { + return createBasicSchemaInsert(table, key); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestRowResult.java b/java/kudu-client/src/test/java/org/kududb/client/TestRowResult.java new file mode 100644 index 000000000000..5febcbd9ac00 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestRowResult.java @@ -0,0 +1,130 @@ +// Licensed to 
the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.kududb.Type; + +import java.nio.ByteBuffer; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestRowResult extends BaseKuduTest { + + // Generate a unique table name + private static final String TABLE_NAME = + TestRowResult.class.getName() + "-" + System.currentTimeMillis(); + + private static KuduTable table; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + createTable(TABLE_NAME, allTypesSchema, new CreateTableOptions()); + + table = openTable(TABLE_NAME); + } + + @Test(timeout = 10000) + public void test() throws Exception { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + + row.addByte(0, (byte) 1); + row.addShort(1, (short) 2); + row.addInt(2, 3); + row.addLong(3, 4l); + row.addBoolean(4, true); + row.addFloat(5, 5.6f); + row.addDouble(6, 7.8); + row.addString(7, "string-value"); + row.addBinary(8, "binary-array".getBytes()); + ByteBuffer bb = 
ByteBuffer.wrap("binary-bytebuffer".getBytes()); + bb.position(7); // We're only inserting the bytebuffer part of the original array. + row.addBinary(9, bb); + row.setNull(10); + row.addLong(11, 11l); + + KuduSession session = syncClient.newSession(); + session.apply(insert); + + KuduScanner scanner = syncClient.newScannerBuilder(table).build(); + while (scanner.hasMoreRows()) { + RowResultIterator it = scanner.nextRows(); + assertTrue(it.hasNext()); + RowResult rr = it.next(); + + assertEquals((byte) 1, rr.getByte(0)); + assertEquals((byte) 1, rr.getByte(allTypesSchema.getColumnByIndex(0).getName())); + + assertEquals((short) 2, rr.getShort(1)); + assertEquals((short) 2, rr.getShort(allTypesSchema.getColumnByIndex(1).getName())); + + assertEquals(3, rr.getInt(2)); + assertEquals(3, rr.getInt(allTypesSchema.getColumnByIndex(2).getName())); + + assertEquals(4, rr.getLong(3)); + assertEquals(4, rr.getLong(allTypesSchema.getColumnByIndex(3).getName())); + + assertEquals(true, rr.getBoolean(4)); + assertEquals(true, rr.getBoolean(allTypesSchema.getColumnByIndex(4).getName())); + + assertEquals(5.6f, rr.getFloat(5), .001f); + assertEquals(5.6f, rr.getFloat(allTypesSchema.getColumnByIndex(5).getName()), .001f); + + assertEquals(7.8, rr.getDouble(6), .001); + assertEquals(7.8, rr.getDouble(allTypesSchema.getColumnByIndex(6).getName()), .001f); + + assertEquals("string-value", rr.getString(7)); + assertEquals("string-value", rr.getString(allTypesSchema.getColumnByIndex(7).getName())); + + assertArrayEquals("binary-array".getBytes(), rr.getBinaryCopy(8)); + assertArrayEquals("binary-array".getBytes(), + rr.getBinaryCopy(allTypesSchema.getColumnByIndex(8).getName())); + + ByteBuffer buffer = rr.getBinary(8); + assertEquals(buffer, rr.getBinary(allTypesSchema.getColumnByIndex(8).getName())); + byte[] binaryValue = new byte[buffer.remaining()]; + buffer.get(binaryValue); + assertArrayEquals("binary-array".getBytes(), binaryValue); + + assertArrayEquals("bytebuffer".getBytes(), 
rr.getBinaryCopy(9)); + + assertEquals(true, rr.isNull(10)); + assertEquals(true, rr.isNull(allTypesSchema.getColumnByIndex(10).getName())); + + assertEquals(11, rr.getLong(11)); + assertEquals(11, rr.getLong(allTypesSchema.getColumnByIndex(11).getName())); + + // We test with the column name once since it's the same method for all types, unlike above. + assertEquals(Type.INT8, rr.getColumnType(allTypesSchema.getColumnByIndex(0).getName())); + assertEquals(Type.INT8, rr.getColumnType(0)); + assertEquals(Type.INT16, rr.getColumnType(1)); + assertEquals(Type.INT32, rr.getColumnType(2)); + assertEquals(Type.INT64, rr.getColumnType(3)); + assertEquals(Type.BOOL, rr.getColumnType(4)); + assertEquals(Type.FLOAT, rr.getColumnType(5)); + assertEquals(Type.DOUBLE, rr.getColumnType(6)); + assertEquals(Type.STRING, rr.getColumnType(7)); + assertEquals(Type.BINARY, rr.getColumnType(8)); + assertEquals(Type.TIMESTAMP, rr.getColumnType(11)); + } + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestScannerMultiTablet.java b/java/kudu-client/src/test/java/org/kududb/client/TestScannerMultiTablet.java new file mode 100644 index 000000000000..85e45ac0cca0 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestScannerMultiTablet.java @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.collect.Lists; +import com.stumbleupon.async.Deferred; +import org.kududb.ColumnSchema; +import org.kududb.Schema; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.ArrayList; + +import static org.junit.Assert.assertNull; +import static org.kududb.Type.STRING; +import static org.junit.Assert.assertEquals; + +public class TestScannerMultiTablet extends BaseKuduTest { + // Generate a unique table name + private static final String TABLE_NAME = + TestScannerMultiTablet.class.getName()+"-"+System.currentTimeMillis(); + + private static Schema schema = getSchema(); + private static KuduTable table; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + // create a 4-tablets table for scanning + CreateTableOptions builder = new CreateTableOptions(); + + for (int i = 1; i < 4; i++){ + PartialRow splitRow = schema.newPartialRow(); + splitRow.addString("key1", "" + i); + splitRow.addString("key2", ""); + builder.addSplitRow(splitRow); + } + + createTable(TABLE_NAME, schema, builder); + + table = openTable(TABLE_NAME); + + AsyncKuduSession session = client.newSession(); + session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_SYNC); + + // The data layout ends up like this: + // tablet '', '1': no rows + // tablet '1', '2': '111', '122', '133' + // tablet '2', '3': '211', '222', '233' + // tablet '3', '': '311', '322', '333' + String[] keys = new String[] {"1", "2", "3"}; + for (String key1 : keys) { + for (String key2 : keys) { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addString(0, key1); + row.addString(1, key2); + row.addString(2, key2); + Deferred d = session.apply(insert); + d.join(DEFAULT_SLEEP); + } + } + } + + // Test various combinations of start/end row keys. 
+ @Test(timeout = 100000) + public void testKeyStartEnd() throws Exception { + assertEquals(0, + countRowsInScan(getScanner("", "", "1", ""))); // There's nothing in the 1st tablet + assertEquals(1, countRowsInScan(getScanner("", "", "1", "2"))); // Grab the very first row + assertEquals(3, countRowsInScan(getScanner("1", "1", "1", "4"))); // Grab the whole 2nd tablet + assertEquals(3, countRowsInScan(getScanner("1", "1", "2", ""))); // Same, and peek at the 3rd + assertEquals(3, countRowsInScan(getScanner("1", "1", "2", "0"))); // Same, different peek + assertEquals(4, + countRowsInScan(getScanner("1", "2", "2", "3"))); // Middle of 2nd to middle of 3rd + assertEquals(3, + countRowsInScan(getScanner("1", "4", "2", "4"))); // Peek at the 2nd then whole 3rd + assertEquals(6, countRowsInScan(getScanner("1", "5", "3", "4"))); // Whole 3rd and 4th + assertEquals(9, countRowsInScan(getScanner("", "", "4", ""))); // Full table scan + + assertEquals(9, + countRowsInScan(getScanner("", "", null, null))); // Full table scan with empty upper + assertEquals(9, + countRowsInScan(getScanner(null, null, "4", ""))); // Full table scan with empty lower + assertEquals(9, + countRowsInScan(getScanner(null, null, null, null))); // Full table scan with empty bounds + + // Test that we can close a scanner while in between two tablets. We start on the second + // tablet and our first nextRows() will get 3 rows. At that moment we want to close the scanner + // before getting on the 3rd tablet. + AsyncKuduScanner scanner = getScanner("1", "", null, null); + Deferred d = scanner.nextRows(); + RowResultIterator rri = d.join(DEFAULT_SLEEP); + assertEquals(3, rri.getNumRows()); + d = scanner.close(); + rri = d.join(DEFAULT_SLEEP); + assertNull(rri); + } + + // Test mixing start/end row keys with predicates. + @Test(timeout = 100000) + public void testKeysAndPredicates() throws Exception { + // First row from the 2nd tablet. 
+ ColumnRangePredicate predicate = new ColumnRangePredicate(schema.getColumnByIndex(2)); + predicate.setLowerBound("1"); + predicate.setUpperBound("1"); + assertEquals(1, countRowsInScan(getScanner("1", "", "2", "", predicate))); + + // All the 2nd tablet. + predicate = new ColumnRangePredicate(schema.getColumnByIndex(2)); + predicate.setLowerBound("1"); + predicate.setUpperBound("3"); + assertEquals(3, countRowsInScan(getScanner("1", "", "2", "", predicate))); + + // Value that doesn't exist. + predicate = new ColumnRangePredicate(schema.getColumnByIndex(2)); + predicate.setLowerBound("4"); + assertEquals(0, countRowsInScan(getScanner("1", "", "2", "", predicate))); + + // First row from every tablet. + predicate = new ColumnRangePredicate(schema.getColumnByIndex(2)); + predicate.setLowerBound("1"); + predicate.setUpperBound("1"); + assertEquals(3, countRowsInScan(getScanner(null, null, null, null, predicate))); + + // All the rows. + predicate = new ColumnRangePredicate(schema.getColumnByIndex(2)); + predicate.setLowerBound("1"); + assertEquals(9, countRowsInScan(getScanner(null, null, null, null, predicate))); + } + + @Test(timeout = 100000) + public void testProjections() throws Exception { + // Test with column names. + AsyncKuduScanner.AsyncKuduScannerBuilder builder = client.newScannerBuilder(table); + builder.setProjectedColumnNames(Lists.newArrayList(schema.getColumnByIndex(0).getName(), + schema.getColumnByIndex(1).getName())); + buildScannerAndCheckColumnsCount(builder, 2); + + // Test with column indexes. + builder = client.newScannerBuilder(table); + builder.setProjectedColumnIndexes(Lists.newArrayList(0, 1)); + buildScannerAndCheckColumnsCount(builder, 2); + + // Test with column names overriding indexes. 
+ builder = client.newScannerBuilder(table); + builder.setProjectedColumnIndexes(Lists.newArrayList(0, 1)); + builder.setProjectedColumnNames(Lists.newArrayList(schema.getColumnByIndex(0).getName())); + buildScannerAndCheckColumnsCount(builder, 1); + } + + private AsyncKuduScanner getScanner(String lowerBoundKeyOne, + String lowerBoundKeyTwo, + String exclusiveUpperBoundKeyOne, + String exclusiveUpperBoundKeyTwo) { + return getScanner(lowerBoundKeyOne, lowerBoundKeyTwo, + exclusiveUpperBoundKeyOne, exclusiveUpperBoundKeyTwo, null); + } + + private AsyncKuduScanner getScanner(String lowerBoundKeyOne, + String lowerBoundKeyTwo, + String exclusiveUpperBoundKeyOne, + String exclusiveUpperBoundKeyTwo, + ColumnRangePredicate predicate) { + AsyncKuduScanner.AsyncKuduScannerBuilder builder = client.newScannerBuilder(table); + + if (lowerBoundKeyOne != null) { + PartialRow lowerBoundRow = schema.newPartialRow(); + lowerBoundRow.addString(0, lowerBoundKeyOne); + lowerBoundRow.addString(1, lowerBoundKeyTwo); + builder.lowerBound(lowerBoundRow); + } + + if (exclusiveUpperBoundKeyOne != null) { + PartialRow upperBoundRow = schema.newPartialRow(); + upperBoundRow.addString(0, exclusiveUpperBoundKeyOne); + upperBoundRow.addString(1, exclusiveUpperBoundKeyTwo); + builder.exclusiveUpperBound(upperBoundRow); + } + + if (predicate != null) { + builder.addColumnRangePredicate(predicate); + } + + return builder.build(); + } + + private void buildScannerAndCheckColumnsCount(AsyncKuduScanner.AsyncKuduScannerBuilder builder, + int count) throws Exception { + AsyncKuduScanner scanner = builder.build(); + scanner.nextRows().join(DEFAULT_SLEEP); + RowResultIterator rri = scanner.nextRows().join(DEFAULT_SLEEP); + assertEquals(count, rri.next().getSchema().getColumns().size()); + } + + private static Schema getSchema() { + ArrayList columns = new ArrayList<>(3); + columns.add(new ColumnSchema.ColumnSchemaBuilder("key1", STRING) + .key(true) + .build()); + columns.add(new 
ColumnSchema.ColumnSchemaBuilder("key2", STRING) + .key(true) + .build()); + columns.add(new ColumnSchema.ColumnSchemaBuilder("val", STRING) + .build()); + return new Schema(columns); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestTestUtils.java b/java/kudu-client/src/test/java/org/kududb/client/TestTestUtils.java new file mode 100644 index 000000000000..b150f8e4627d --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestTestUtils.java @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import org.junit.After; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.concurrent.atomic.AtomicLong; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Tests for non-trivial helper methods in TestUtils. 
+ */ +public class TestTestUtils { + + public static final Logger LOG = LoggerFactory.getLogger(TestUtils.class); + + private Process proc; + + @After + public void tearDown() { + if (proc != null) { + proc.destroy(); + } + } + + /** + * Starts a process that executes the "yes" command (which prints 'y' in a loop), + * sends a SIGSTOP to the process, and ensures that SIGSTOP does indeed pause the process. + * Afterwards, sends a SIGCONT to the process and ensures that the process resumes. + */ + @Test(timeout = 2000) + public void testPauseAndResume() throws Exception { + ProcessBuilder processBuilder = new ProcessBuilder("yes"); + proc = processBuilder.start(); + LineCounterRunnable lineCounter = new LineCounterRunnable(proc.getInputStream()); + Thread thread = new Thread(lineCounter); + thread.setDaemon(true); + thread.start(); + TestUtils.pauseProcess(proc); + long prevCount; + do { + prevCount = lineCounter.getCount(); + Thread.sleep(10); + } while (prevCount != lineCounter.getCount()); + assertEquals(prevCount, lineCounter.getCount()); + TestUtils.resumeProcess(proc); + do { + prevCount = lineCounter.getCount(); + Thread.sleep(10); + } while (prevCount == lineCounter.getCount()); + assertTrue(lineCounter.getCount() > prevCount); + } + + /** + * Counts the number of lines in a specified input stream. 
+ */ + static class LineCounterRunnable implements Runnable { + private final AtomicLong counter; + private final InputStream is; + + public LineCounterRunnable(InputStream is) { + this.is = is; + counter = new AtomicLong(0); + } + + @Override + public void run() { + BufferedReader in = null; + try { + in = new BufferedReader(new InputStreamReader(is)); + while (in.readLine() != null) { + counter.incrementAndGet(); + } + } catch (Exception e) { + LOG.error("Error while reading from the process", e); + } finally { + if (in != null) { + try { + in.close(); + } catch (IOException e) { + LOG.error("Error closing the stream", e); + } + } + } + } + + public long getCount() { + return counter.get(); + } + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestTimeouts.java b/java/kudu-client/src/test/java/org/kududb/client/TestTimeouts.java new file mode 100644 index 000000000000..57494c3b945f --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestTimeouts.java @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.client; + +import static org.junit.Assert.fail; + +import com.stumbleupon.async.TimeoutException; +import org.junit.Test; + +public class TestTimeouts extends BaseKuduTest { + + private static final String TABLE_NAME = + TestTimeouts.class.getName() + "-" + System.currentTimeMillis(); + + /** + * This test case tries different methods that should all timeout, while relying on the client to + * pass down the timeouts to the session and scanner. + */ + @Test(timeout = 100000) + public void testLowTimeouts() throws Exception { + KuduClient lowTimeoutsClient = new KuduClient.KuduClientBuilder(masterAddresses) + .defaultAdminOperationTimeoutMs(1) + .defaultOperationTimeoutMs(1) + .build(); + + try { + lowTimeoutsClient.listTabletServers(); + fail("Should have timed out"); + } catch (TimeoutException ex) { + // Expected. + } + + createTable(TABLE_NAME, basicSchema, new CreateTableOptions()); + KuduTable table = openTable(TABLE_NAME); + + KuduSession lowTimeoutSession = lowTimeoutsClient.newSession(); + + try { + lowTimeoutSession.apply(createBasicSchemaInsert(table, 1)); + fail("Should have timed out"); + } catch (TimeoutException ex) { // If we timeout on the Deferred + // Expected. + } catch (NonRecoverableException ex) { // If we timeout when doing an internal deadline check + // Expected. + } + + KuduScanner lowTimeoutScanner = lowTimeoutsClient.newScannerBuilder(table).build(); + try { + lowTimeoutScanner.nextRows(); + fail("Should have timed out"); + } catch (TimeoutException ex) { + // Expected. + } catch (NonRecoverableException ex) { + // Expected. 
+ } + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/client/TestUtils.java b/java/kudu-client/src/test/java/org/kududb/client/TestUtils.java new file mode 100644 index 000000000000..c40cda4f70f3 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/client/TestUtils.java @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.client; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import com.sun.security.auth.module.UnixSystem; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.Field; +import java.net.ServerSocket; +import java.net.URL; +import java.net.URLDecoder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.List; +import java.util.Set; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A grouping of methods that help unit testing. 
+ */ +public class TestUtils { + private static final Logger LOG = LoggerFactory.getLogger(TestUtils.class); + + // Used by pidOfProcess() + private static String UNIX_PROCESS_CLS_NAME = "java.lang.UNIXProcess"; + private static Set VALID_SIGNALS = ImmutableSet.of("STOP", "CONT", "TERM", "KILL"); + + private static final String BIN_DIR_PROP = "binDir"; + + /** + * @return the path of the flags file to pass to daemon processes + * started by the tests + */ + public static String getFlagsPath() { + URL u = BaseKuduTest.class.getResource("/flags"); + if (u == null) { + throw new RuntimeException("Unable to find 'flags' file"); + } + if (u.getProtocol() == "file") { + return urlToPath(u); + } + // If the flags are inside a JAR, extract them into our temporary + // test directory. + try { + // Somewhat unintuitively, createTempFile() actually creates the file, + // not just the path, so we have to use REPLACE_EXISTING below. + Path tmpFile = Files.createTempFile( + Paths.get(getBaseDir()), "kudu-flags", ".flags"); + Files.copy(BaseKuduTest.class.getResourceAsStream("/flags"), tmpFile, + StandardCopyOption.REPLACE_EXISTING); + return tmpFile.toAbsolutePath().toString(); + } catch (IOException e) { + throw new RuntimeException("Unable to extract flags file into tmp", e); + } + } + + /** + * Return the path portion of a file URL, after decoding the escaped + * components. This fixes issues when trying to build within a + * working directory with special characters. 
+ */ + private static String urlToPath(URL u) { + try { + return URLDecoder.decode(u.getPath(), "UTF-8"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e); + } + } + + private static String findBuildDir() { + URL myUrl = BaseKuduTest.class.getProtectionDomain().getCodeSource().getLocation(); + File myPath = new File(urlToPath(myUrl)); + while (myPath != null) { + if (new File(myPath, ".git").isDirectory()) { + return new File(myPath, "build/latest/bin").getAbsolutePath(); + } + myPath = myPath.getParentFile(); + } + LOG.warn("Unable to find build dir! myUrl={}", myUrl); + return null; + } + + /** + * @param binName the binary to look for (eg 'kudu-tserver') + * @return the absolute path of that binary + * @throws FileNotFoundException if no such binary is found + */ + public static String findBinary(String binName) throws FileNotFoundException { + String binDir = System.getProperty(BIN_DIR_PROP); + if (binDir != null) { + LOG.info("Using binary directory specified by property: {}", + binDir); + } else { + binDir = findBuildDir(); + } + + File candidate = new File(binDir, binName); + if (candidate.canExecute()) { + return candidate.getAbsolutePath(); + } + throw new FileNotFoundException("Cannot find binary " + binName + + " in binary directory " + binDir); + } + + /** + * @return the base directory within which we will store server data + */ + public static String getBaseDir() { + String s = System.getenv("TEST_TMPDIR"); + if (s == null) { + s = String.format("/tmp/kudutest-%d", new UnixSystem().getUid()); + } + File f = new File(s); + f.mkdirs(); + return f.getAbsolutePath(); + } + + /** + * Finds the next free port, starting with the one passed. Keep in mind the + * time-of-check-time-of-use nature of this method, the returned port might become occupied + * after it was checked for availability. + * @param startPort First port to be probed. + * @return A currently usable port. 
+ * @throws IOException IOE is thrown if we can't close a socket we tried to open or if we run + * out of ports to try. + */ + public static int findFreePort(int startPort) throws IOException { + ServerSocket ss; + for(int i = startPort; i < 65536; i++) { + try { + ss = new ServerSocket(i); + } catch (IOException e) { + continue; + } + ss.close(); + return i; + } + throw new IOException("Ran out of ports."); + } + + /** + * Finds a specified number of parts, starting with one passed. Keep in mind the + * time-of-check-time-of-use nature of this method. + * @see {@link #findFreePort(int)} + * @param startPort First port to be probed. + * @param numPorts Number of ports to reserve. + * @return A list of currently usable ports. + * @throws IOException IOE Is thrown if we can't close a socket we tried to open or if run + * out of ports to try. + */ + public static List findFreePorts(int startPort, int numPorts) throws IOException { + List ports = Lists.newArrayListWithCapacity(numPorts); + for (int i = 0; i < numPorts; i++) { + startPort = findFreePort(startPort); + ports.add(startPort++); + } + return ports; + } + + /** + * Gets the pid of a specified process. Relies on reflection and only works on + * UNIX process, not guaranteed to work on JDKs other than Oracle and OpenJDK. + * @param proc The specified process. + * @return The process UNIX pid. + * @throws IllegalArgumentException If the process is not a UNIXProcess. + * @throws Exception If there are other getting the pid via reflection. 
+ */ + static int pidOfProcess(Process proc) throws Exception { + Class procCls = proc.getClass(); + if (!procCls.getName().equals(UNIX_PROCESS_CLS_NAME)) { + throw new IllegalArgumentException("stopProcess() expects objects of class " + + UNIX_PROCESS_CLS_NAME + ", but " + procCls.getName() + " was passed in instead!"); + } + Field pidField = procCls.getDeclaredField("pid"); + pidField.setAccessible(true); + return (Integer) pidField.get(proc); + } + + /** + * Send a code specified by its string representation to the specified process. + * TODO: Use a JNR/JNR-Posix instead of forking the JVM to exec "kill". + * @param proc The specified process. + * @param sig The string representation of the process (e.g., STOP for SIGSTOP). + * @throws IllegalArgumentException If the signal type is not supported. + * @throws IllegalStateException If we are unable to send the specified signal. + */ + static void signalProcess(Process proc, String sig) throws Exception { + if (!VALID_SIGNALS.contains(sig)) { + throw new IllegalArgumentException(sig + " is not a supported signal, only " + + Joiner.on(",").join(VALID_SIGNALS) + " are supported"); + } + int pid = pidOfProcess(proc); + int rv = Runtime.getRuntime() + .exec(String.format("kill -%s %d", sig, pid)) + .waitFor(); + if (rv != 0) { + throw new IllegalStateException(String.format("unable to send SIG%s to process %s(pid=%d): " + + "expected return code from kill, but got %d instead", sig, proc, pid, rv)); + } + } + + /** + * Pause the specified process by sending a SIGSTOP using the kill command. + * @param proc The specified process. + * @throws Exception If error prevents us from pausing the process. + */ + static void pauseProcess(Process proc) throws Exception { + signalProcess(proc, "STOP"); + } + + /** + * Resumes the specified process by sending a SIGCONT using the kill command. + * @param proc The specified process. + * @throws Exception If error prevents us from resuming the process. 
+ */ + static void resumeProcess(Process proc) throws Exception { + signalProcess(proc, "CONT"); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/util/TestAsyncUtil.java b/java/kudu-client/src/test/java/org/kududb/util/TestAsyncUtil.java new file mode 100644 index 000000000000..fce7ddc47018 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/util/TestAsyncUtil.java @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import com.stumbleupon.async.Callback; +import com.stumbleupon.async.Deferred; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import static org.junit.Assert.assertEquals; + +/** + * Test for {@link AsyncUtil}. + */ +public class TestAsyncUtil { + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void testAddCallbacksDeferring() throws Exception { + Deferred d = new Deferred(); + TestCallback cb = new TestCallback(); + TestErrback eb = new TestErrback(); + + // Test normal callbacks. 
+ AsyncUtil.addCallbacksDeferring(d, cb, eb); + final String testStr = "hello world"; + d.callback(testStr); + assertEquals(d.join(), "callback: " + testStr); + + d = new Deferred(); + AsyncUtil.addCallbacksDeferring(d, cb, eb); + d.callback(new IllegalArgumentException()); + assertEquals(d.join(), "illegal arg"); + + d = new Deferred(); + AsyncUtil.addCallbacksDeferring(d, cb, eb); + d.callback(new IllegalStateException()); + exception.expect(IllegalStateException.class); + d.join(); + } + + final class TestCallback implements Callback, String> { + @Override + public Deferred call(String arg) throws Exception { + return Deferred.fromResult("callback: " + arg); + } + } + + final class TestErrback implements Callback, Exception> { + @Override + public Deferred call(Exception arg) { + if (arg instanceof IllegalArgumentException) { + return Deferred.fromResult("illegal arg"); + } + return Deferred.fromError(arg); + } + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/util/TestMurmurHash.java b/java/kudu-client/src/test/java/org/kududb/util/TestMurmurHash.java new file mode 100644 index 000000000000..051107c42f91 --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/util/TestMurmurHash.java @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import com.google.common.primitives.UnsignedLongs; +import com.sangupta.murmur.Murmur2; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * Test Murmur2 Hash64 returns the expected values for inputs. + * + * These tests are duplicated on the C++ side to ensure that hash computations + * are stable across both platforms. + */ +public class TestMurmurHash { + + @Test + public void testMurmur2Hash64() throws Exception { + long hash; + + hash = Murmur2.hash64("ab".getBytes("UTF-8"), 2, 0); + assertEquals(UnsignedLongs.parseUnsignedLong("7115271465109541368"), hash); + + hash = Murmur2.hash64("abcdefg".getBytes("UTF-8"), 7, 0); + assertEquals(UnsignedLongs.parseUnsignedLong("2601573339036254301"), hash); + + hash = Murmur2.hash64("quick brown fox".getBytes("UTF-8"), 15, 42); + assertEquals(UnsignedLongs.parseUnsignedLong("3575930248840144026"), hash); + } +} diff --git a/java/kudu-client/src/test/java/org/kududb/util/TestNetUtil.java b/java/kudu-client/src/test/java/org/kududb/util/TestNetUtil.java new file mode 100644 index 000000000000..5c003ae8863f --- /dev/null +++ b/java/kudu-client/src/test/java/org/kududb/util/TestNetUtil.java @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.util; + +import com.google.common.net.HostAndPort; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +/** + * Test for {@link NetUtil}. + */ +public class TestNetUtil { + + /** + * Tests parsing strings into {@link HostAndPort} objects with and without specifying + * the port in the string. + */ + @Test + public void testParseString() { + String aStringWithPort = "1.2.3.4:1234"; + HostAndPort hostAndPortForAStringWithPort = NetUtil.parseString(aStringWithPort, 0); + assertEquals(hostAndPortForAStringWithPort.getHostText(), "1.2.3.4"); + assertEquals(hostAndPortForAStringWithPort.getPort(), 1234); + + String aStringWithoutPort = "1.2.3.4"; + HostAndPort hostAndPortForAStringWithoutPort = NetUtil.parseString(aStringWithoutPort, 12345); + assertEquals(hostAndPortForAStringWithoutPort.getHostText(), aStringWithoutPort); + assertEquals(hostAndPortForAStringWithoutPort.getPort(), 12345); + } + + /** + * Tests parsing comma separated list of "host:port" pairs and hosts into a list of + * {@link HostAndPort} objects. 
+ */ + @Test + public void testParseStrings() { + String testAddrs = "1.2.3.4.5,10.0.0.1:5555,127.0.0.1:7777"; + List hostsAndPorts = NetUtil.parseStrings(testAddrs, 3333); + assertArrayEquals(hostsAndPorts.toArray(), + new HostAndPort[] { HostAndPort.fromParts("1.2.3.4.5", 3333), + HostAndPort.fromParts("10.0.0.1", 5555), + HostAndPort.fromParts("127.0.0.1", 7777) } + ); + } + + @Test + public void testHostsAndPortsToString() { + List hostsAndPorts = Arrays.asList( + HostAndPort.fromParts("127.0.0.1", 1111), + HostAndPort.fromParts("1.2.3.4.5", 0) + ); + assertEquals(NetUtil.hostsAndPortsToString(hostsAndPorts), "127.0.0.1:1111,1.2.3.4.5:0"); + } +} diff --git a/java/kudu-client/src/test/resources/flags b/java/kudu-client/src/test/resources/flags new file mode 100644 index 000000000000..8432b1fbda8c --- /dev/null +++ b/java/kudu-client/src/test/resources/flags @@ -0,0 +1,9 @@ +--webserver_port=0 +--max_message_size=67108864 + +--minloglevel=0 +--v=2 +--log_dir=target/testdata +--enable_data_block_fsync=false +--enable_leader_failure_detection=true + diff --git a/java/kudu-client/src/test/resources/log4j.properties b/java/kudu-client/src/test/resources/log4j.properties new file mode 100644 index 000000000000..2d7619c6c9f6 --- /dev/null +++ b/java/kudu-client/src/test/resources/log4j.properties @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +log4j.rootLogger = DEBUG, out +log4j.appender.out = org.apache.log4j.ConsoleAppender +log4j.appender.out.layout = org.apache.log4j.PatternLayout +log4j.appender.out.layout.ConversionPattern = %d (%t) [%p - %l] %m%n + +log4j.logger.org.kududb = DEBUG diff --git a/java/kudu-csd/generate_mdl.py b/java/kudu-csd/generate_mdl.py new file mode 100755 index 000000000000..2d34aa19169b --- /dev/null +++ b/java/kudu-csd/generate_mdl.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Script to generate a CM-compatible MDL file from the metrics +# metadata dumped by our daemon processes. +# +# Requires that the daemon processes have already been built and available +# in the build/latest/bin directory. 
+# +# Outputs the MDL file on stdout by default or to a file specified in the first +# argument. + +import collections +try: + import simplejson as json +except: + import json +import os +import subprocess +import sys + +BINARIES=["kudu-master", "kudu-tserver"] + +RELATIVE_BUILD_DIR="../../build/latest/bin" + +def find_binary(bin_name): + dirname, _ = os.path.split(os.path.abspath(__file__)) + build_dir = os.path.join(dirname, RELATIVE_BUILD_DIR) + path = os.path.join(build_dir, bin_name) + if os.path.exists(path): + return path + raise Exception("Cannot find %s in build dir %s" % (bin_name, build_dir)) + +def load_all_metrics(): + """ + For each binary, dump and parse its metrics schema by running it with + the --dump_metrics_json flag. + """ + all_metrics = [] + for binary in BINARIES: + binary = find_binary(binary) + p = subprocess.Popen([binary, "--dump_metrics_json"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + rc = p.returncode + + if rc != 0: + print >>sys.stderr, "error: %s exited with return code %d:\n" % (binary, rc) + print >>sys.stderr, stderr + sys.exit(1) + + metrics_dump = json.loads(stdout) + all_metrics.extend(m for m in metrics_dump['metrics']) + return all_metrics + +def append_sentence(a, b): + if not a.endswith("."): + a += "." 
+ return a + " " + b + +def metric_to_mdl_entries(m): + if m['type'] == 'histogram': + return [ + dict( + context=(m['name'] + "::total_count"), + name=('kudu_' + m['name'].lower()) + '_count', + counter=True, + numeratorUnit='samples', + description=append_sentence(m['description'], + "This is the total number of recorded samples."), + label=m['label'] + ": Samples"), + dict( + context=(m['name'] + "::total_sum"), + name=('kudu_' + m['name'].lower()) + '_sum', + counter=True, + numeratorUnit=m['unit'], + description=append_sentence(m['description'], + "This is the total sum of recorded samples."), + label=m['label'] + ": Total") + ] + + return [dict( + context=(m['name'] + "::value"), + name=('kudu_' + m['name'].lower()), + counter=(m['type'] == 'counter'), + numeratorUnit=m['unit'], + description=m['description'], + label=m['label'])] + +def metrics_to_mdl(metrics): + """ + For each metric returned by the daemon, convert it to the MDL-compatible dictionary. + Returns a map of entity_type_name -> [metric dicts]. + """ + seen = set() + + by_entity = collections.defaultdict(lambda: []) + for m in metrics: + # Don't process any metric more than once. Some metrics show up + # in both daemons. + key = (m['entity_type'], m['name']) + if key in seen: + continue + seen.add(key) + + # Convert to the format that CM expects. 
+ by_entity[m['entity_type']].extend(metric_to_mdl_entries(m)) + return by_entity + +def main(): + all_metrics = load_all_metrics() + metrics_by_entity = metrics_to_mdl(all_metrics) + server_metrics = metrics_by_entity['server'] + tablet_metrics = metrics_by_entity['tablet'] + + output = dict( + name="KUDU", + version="0.6.0", + metricDefinitions=[], + nameForCrossEntityAggregateMetrics="kudus", + roles=[ + dict(name="KUDU_TSERVER", + nameForCrossEntityAggregateMetrics="kudu_tservers", + metricDefinitions=server_metrics), + dict(name="KUDU_MASTER", + nameForCrossEntityAggregateMetrics="kudu_masters", + metricDefinitions=server_metrics) + ], + metricEntityAttributeDefinitions=[ + dict(name="kuduTableId", + label="Table ID", + description="UUID for Kudu Table.", + valueCaseSensitive=False), + dict(name="kuduTableName", + label="Table Name", + description="Name for Kudu Table.", + valueCaseSensitive=True), + dict(name="kuduTableState", + label="Table State", + description="State for Kudu Table.", + valueCaseSensitive=False), + dict(name="kuduTabletId", + label="Tablet ID", + description="UUID for Kudu Tablet.", + valueCaseSensitive=False), + dict(name="kuduTabletState", + label="Tablet State", + description="State for Kudu Tablet.", + valueCaseSensitive=False) + # TODO: add the role's persistent UUID after discussing with + # Chris on how to inject it into their CM entity. 
+ ], + metricEntityTypeDefinitions=[ + dict(name="KUDU_TABLE", + nameForCrossEntityAggregateMetrics="kudu_tables", + immutableAttributeNames=["serviceName", "kuduTableId"], + mutableAttributeNames=["kuduTableName", "kuduTableState"], + entityNameFormat=["serviceName", "kuduTableId"], + description="A Kudu table.", + label="Kudu Table", + labelPlural="Kudu Tables", + entityLabelFormat="$kuduTableName ($serviceDisplayName)", + parentMetricEntityTypeNames=["KUDU"], + metricDefinitions=[]), + dict(name="KUDU_TABLET", + nameForCrossEntityAggregateMetrics="kudu_tablets", + immutableAttributeNames=["serviceName", "kuduTableId", "kuduTabletId"], + mutableAttributeNames=["kuduTabletState"], + entityNameFormat=["serviceName", "kuduTabletId"], + description="A Kudu tablet.", + label="Kudu Tablet", + labelPlural="Kudu Tablets", + entityLabelFormat="$kuduTabletId ($kuduTableName) ($serviceDisplayName)", + parentMetricEntityTypeNames=["KUDU_TABLE"], + metricDefinitions=[]), + dict(name="KUDU_REPLICA", + nameForCrossEntityAggregateMetrics="kudu_replicas", + immutableAttributeNames=["kuduTabletId", "serviceName", "roleName"], + entityNameFormat=["roleName","kuduTabletId"], + description="A Kudu replica.", + label="Kudu Replica", + labelPlural="Kudu Replicas", + entityLabelFormat="$kuduTabletId ($kuduTableName) ($hostname)", + parentMetricEntityTypeNames=["KUDU_TABLET","KUDU-KUDU_TSERVER"], + metricDefinitions=tablet_metrics), + ]) + + + f = sys.stdout + if len(sys.argv) > 1: + f = open(sys.argv[1], 'w') + f.write(json.dumps(output, indent=4)) + +if __name__ == "__main__": + main() diff --git a/java/kudu-csd/pom.xml b/java/kudu-csd/pom.xml new file mode 100644 index 000000000000..ac1eb27b7898 --- /dev/null +++ b/java/kudu-csd/pom.xml @@ -0,0 +1,129 @@ + + + 4.0.0 + + + org.kududb + kudu-parent + 0.8.0-SNAPSHOT + + + com.cloudera.csd + KUDU + 0.8.0 + The Kudu CSD + pom + + + + + org.codehaus.mojo + exec-maven-plugin + 1.2 + + + + + mkdir + + -p + 
${project.build.directory}/descriptor + + + generate-target-descriptor-directory + compile + + exec + + + + + + ${basedir}/generate_mdl.py + + ${project.build.directory}/descriptor/service.mdl + + + generate-mdl + compile + + exec + + + + + + + org.apache.maven.plugins + maven-assembly-plugin + + false + + ../assembly.xml + + + + + make-assembly + package + + single + + + + + + + + + + + validateCSD + + + + com.cloudera.enterprise + schema-validator-maven-plugin + ${schema-validator-maven-plugin.version} + + + validate-descriptors + test + + validate + + + + ${project.basedir} + + true + + + + + + + + + diff --git a/java/kudu-csd/src/descriptor/service.sdl b/java/kudu-csd/src/descriptor/service.sdl new file mode 100644 index 000000000000..d07e57225e9a --- /dev/null +++ b/java/kudu-csd/src/descriptor/service.sdl @@ -0,0 +1,311 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +{ + "name" : "KUDU", + "label" : "Kudu (Beta)", + "description" : "Kudu is a true column store for the Hadoop ecosystem. Kudu is currently in Beta and is not supported. 
Before adding this service, ensure that you have installed the Kudu binaries, which are not included in CDH.", + "version" : "0.6.0", + "compatibility" : { + "generation" : 1 + }, + "runAs" : { + "user" : "kudu", + "group" : "kudu" + }, + "parcel" : { + "repoUrl" : "http://archive.cloudera.com/beta/kudu/parcels/latest/", + "requiredTags" : [ "kudu" ] + }, + "icon" : "images/icon.png", + "parameters" : [ + { + "name" : "enable_core_dump", + "label" : "Enable Core Dump", + "description" : "Used to generate a core dump to get more information about a Kudu crash. Unless otherwise configured systemwide using /proc/sys/kernel/core_pattern, the dump is generated in the configured core dump directory. The core file can be very large.", + "type" : "boolean", + "required" : "true", + "default" : "false" + }, + { + "name" : "core_dump_directory", + "label" : "Core Dump Directory", + "description" : "If Enable Core Dump is set, Kudu processes will dump cores to this location.", + "type" : "path", + "pathType" : "serviceSpecific", + "required" : "true", + "default" : "/var/log/kudu" + } + ], + "rolesWithExternalLinks" : [ + "KUDU_MASTER" + ], + "roles" : [ + { + "name" : "KUDU_MASTER", + "label" : "Master", + "pluralLabel" : "Masters", + "startRunner" : { + "program" : "scripts/kudu.sh", + "args" : [ + "master" + ], + "environmentVariables" : { + "ENABLE_CORE_DUMP" : "${enable_core_dump}", + "CORE_DUMP_DIRECTORY" : "${core_dump_directory}" + } + }, + "logging" : { + "dir" : "/var/log/kudu", + "filename" : "kudu-master.INFO", + "modifiable" : true, + "loggingType" : "glog" + }, + "externalLink" : { + "name" : "kudu_master_web_ui", + "label" : "Kudu Master Web UI", + "url" : "http://${host}:${webserver_port}" + }, + "topology" : { "minInstances" : 1 }, + "parameters" : [ + { + "name" : "webserver_port", + "label" : "Kudu Master Web UI Port", + "description" : "The port of the Kudu Master Web UI.", + "type" : "port", + "required" : "true", + "default" : 8051 + }, + { + "name" : 
"webserver_interface", + "label" : "Kudu Master Web UI Interface", + "description" : "The interface of the Kudu Master Web UI. If blank, binds to 0.0.0.0.", + "type" : "string" + }, + { + "name" : "master_address", + "label" : "Master Address", + "description" : "Configuration that's automatically set by Cloudera Manager to propagate the Master's address to the Tablet Servers.", + "configName" : "server.address", + "required" : "false", + "type" : "string", + "default" : "" + }, + { + "name" : "default_num_replicas", + "label" : "Default Number of Replicas", + "description" : "Default number of replicas for each tablet.", + "required" : "true", + "type" : "long", + "min" : "1", + "softMin" : "3", + "default" : "3" + }, + { + "name" : "fs_wal_dir", + "label" : "Kudu Master WAL Directory", + "description" : "Directory where Kudu masters will store write-ahead logs. It can be the same as one of the data directories, but not a sub-directory of a data directory. Master and tablet servers must use different directories when co-located on the same machine.", + "required" : "true", + "configurableInWizard" : true, + "type" : "path", + "pathType" : "localDataDir" + }, + { + "name" : "fs_data_dirs", + "label" : "Kudu Master Data Directories", + "description" : "Directories where Kudu masters will store data blocks.", + "required" : "true", + "configurableInWizard" : true, + "type" : "path_array", + "pathType" : "localDataDir" + }, + { + "name" : "log_force_fsync_all", + "label" : "Kudu Master WAL Fsyncs All Entries", + "description" : "If true, the Master will use the fsync system call to ensure that all modifications to the catalog table are durably written to disk. 
WARNING: In this release, enabling this option can cause serious issues.", + "required" : "true", + "default" : "false", + "type" : "boolean" + } + ], + "configWriter" : { + "generators" : [ + { + "filename" : "gflagfile", + "configFormat" : "gflags", + "excludedParams" : [ + "master_address", + "enable_core_dump", + "core_dump_directory" + ] + }, + { + "filename" : "kudu-monitoring.properties", + "configFormat" : "properties", + "includedParams" : [ + "webserver_interface", + "webserver_port" + ] + } + ], + "peerConfigGenerators" : [ + { + "filename" : "master.properties", + "params" : [ "master_address" ], + "roleName" : "KUDU_MASTER" + } + ] + } + }, + { + "name" : "KUDU_TSERVER", + "label" : "Tablet Server", + "pluralLabel" : "Tablet Servers", + "startRunner" : { + "program" : "scripts/kudu.sh", + "args" : [ + "tserver" + ], + "environmentVariables" : { + "ENABLE_CORE_DUMP" : "${enable_core_dump}", + "CORE_DUMP_DIRECTORY" : "${core_dump_directory}" + } + }, + "logging" : { + "dir" : "/var/log/kudu", + "filename" : "kudu-tserver.INFO", + "modifiable" : true, + "loggingType" : "glog" + }, + "externalLink" : { + "name" : "kudu_ts_web_ui", + "label" : "Kudu Tablet Server Web UI", + "url" : "http://${host}:${webserver_port}" + }, + "topology" : { "minInstances" : 1 }, + "parameters" : [ + { + "name" : "webserver_interface", + "label" : "Kudu Tablet Server Web UI Interface", + "description" : "The interface of the Kudu Tablet Server Web UI. If blank, binds to 0.0.0.0.", + "type" : "string" + }, + { + "name" : "webserver_port", + "label" : "Kudu Tablet Server Web UI Port", + "description" : "The port of the Kudu Tablet Server Web UI.", + "type" : "port", + "required" : "true", + "default" : 8050 + }, + { + "name" : "fs_wal_dir", + "label" : "Kudu Tablet Server WAL Directory", + "description" : "Directory where Kudu tablet servers will store write-ahead logs. It can be the same as one of the data directories, but not a sub-directory of a data directory. 
Master and tablet servers must use different directories when co-located on the same machine.", + "required" : "true", + "configurableInWizard" : true, + "type" : "path", + "pathType" : "localDataDir" + }, + { + "name" : "fs_data_dirs", + "label" : "Kudu Tablet Server Data Directories", + "description" : "Directories where Kudu tablet servers will store data blocks.", + "required" : "true", + "configurableInWizard" : true, + "type" : "path_array", + "pathType" : "localDataDir" + }, + { + "name" : "memory_limit_hard_bytes", + "label" : "Kudu Tablet Server Hard Memory Limit", + "description" : "Maximum amount of memory that the Kudu Tablet Server will use before it starts rejecting all incoming writes.", + "required" : "true", + "type" : "memory", + "unit" : "bytes", + "min" : 1073741824, + "default" : 4294967296, + "scaleFactor" : 1.3 + }, + { + "name" : "block_cache_capacity_mb", + "label" : "Kudu Tablet Server Block Cache Capacity", + "description" : "Maximum amount of memory allocated to the Kudu Tablet Server's block cache.", + "required" : "true", + "type" : "long", + "unit" : "megabytes", + "softMin" : 256, + "min" : 16, + "default" : 512 + }, + { + "name" : "log_force_fsync_all", + "label" : "Kudu Tablet Server WAL Fsyncs All Entries", + "description" : "If true, the Tablet Server will use the fsync system call to ensure that all writes are durably written to to the write-ahead log (WAL) before responding. If false, edits will be written to the Linux buffer cache on a majority of replicas before responding.", + "required" : "true", + "type" : "boolean", + "default" : "false" + }, + { + "name" : "maintenance_manager_num_threads", + "label" : "Kudu Tablet Server Maintenance Threads", + "description" : "The number of threads devoted to background maintenance operations such as flushes and compactions. 
If the tablet server appears to be falling behind on write operations (inserts, updates, and deletes) but CPU and disk resources are not saturated, increasing this thread count will devote more resources to these background operations.", + "required" : "true", + "default" : 1, + "type" : "long", + "min" : 1, + "softMax" : 8 + } + + ], + "configWriter" : { + "generators" : [ + { + "filename" : "gflagfile", + "configFormat" : "gflags", + "excludedParams" : [ + "enable_core_dump", + "core_dump_directory" + ] + }, + { + "filename" : "kudu-monitoring.properties", + "configFormat" : "properties", + "includedParams" : [ + "webserver_interface", + "webserver_port" + ] + } + ], + "peerConfigGenerators" : [ + { + "filename" : "master.properties", + "params" : [ "master_address" ], + "roleName" : "KUDU_MASTER" + } + ] + }, + "cgroup" : { + "cpu" : { + "autoConfigured" : true + }, + "blkio" : { + "autoConfigured" : true + } + } + } + ] +} diff --git a/java/kudu-csd/src/images/icon.png b/java/kudu-csd/src/images/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..9100401e18c84ee8a4b2d05a952c82c091172f4a GIT binary patch literal 693 zcmV;m0!safP)aB^>EX>4U6ba`-PAVE-2F#rH~oK?uN4S+BV0KwiP|x=MXxWzVegh&fyR-_XW#eK=et4K5L4)&Acd2+Wa?ch=m7pVoNM$FPP_3rU_y@ zn=(LzoGLYz0?Lz7p&4#R&K`)x!g@h)7OBXl+R4o`?dvAXfI==<=SuADmEZ*-5`;*4 zvv!vP&hKf?UfT88+GI+dxZo|wJBZk379g{Obr+xZ!+w7doEsn~FkW8LvbE1U1^_y$ z!UGTn2`>c8r{50KLT3?vsyo08Vg|%CnRHe5qhElov3uL1Gxv+XRVJrwKx|-$yGe0l z{jGPE{z|Q<9Y6rkb+-By1m}b&;y72aId@89KWq{Y_VOnf@Js_nULPq=Z@Kev;|j(Q z0YHANdYqt_Ba!ZW>_L%V42iex002P$1H4Q*$N3;}30JjSB53henb{{KPru}n1 z8exMPjM-|eDf|@!s3$^IHuQ)x(PcF_2L+L?zV>QmoU;cF^AIU|g29xkt-LwB=KVjd b{m=XWOCy|Gt1^rN00000NkvXXu0mjfLG~a# literal 0 HcmV?d00001 diff --git a/java/kudu-csd/src/scripts/kudu.sh b/java/kudu-csd/src/scripts/kudu.sh new file mode 100644 index 000000000000..f3b2b9487eb6 --- /dev/null +++ b/java/kudu-csd/src/scripts/kudu.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# +# Licensed to the Apache Software 
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -x + +# Time marker for both stderr and stdout +date 1>&2 + +export KUDU_HOME=${KUDU_HOME:-/usr/lib/kudu} + +CMD=$1 +shift 2 + +function log { + timestamp=$(date) + echo "$timestamp: $1" #stdout + echo "$timestamp: $1" 1>&2; #stderr +} + +# Reads a line in the format "$host:$key=$value", setting those variables. +function readconf { + local conf + IFS=':' read host conf <<< "$1" + IFS='=' read key value <<< "$conf" +} + +log "KUDU_HOME: $KUDU_HOME" +log "CONF_DIR: $CONF_DIR" +log "CMD: $CMD" + +# Make sure we've got the main gflagfile. +GFLAG_FILE="$CONF_DIR/gflagfile" +if [ ! -r "$GFLAG_FILE" ]; then + log "Could not find $GFLAG_FILE, exiting" + exit 1 +fi + +# Make sure we've got a file describing the master config. +MASTER_FILE="$CONF_DIR/master.properties" +if [ ! -r "$MASTER_FILE" ]; then + log "Could not find $MASTER_FILE, exiting" + exit 1 +fi + +# Parse the master config. +MASTER_IPS= +for line in $(cat "$MASTER_FILE") +do + readconf "$line" + case $key in + server.address) + # Fall back to the host only if there's no defined value. + if [ -n "$value" ]; then + actual_value="$value" + else + actual_value="$host" + fi + + # Append to comma-separated MASTER_IPS. 
+ if [ -n "$MASTER_IPS" ]; then + MASTER_IPS="${MASTER_IPS}," + fi + MASTER_IPS="${MASTER_IPS}${actual_value}" + ;; + esac +done +log "Found master(s) on $MASTER_IPS" + +# Enable core dumping if requested. +if [ "$ENABLE_CORE_DUMP" == "true" ]; then + # The core dump directory should already exist. + if [ -z "$CORE_DUMP_DIRECTORY" -o ! -d "$CORE_DUMP_DIRECTORY" ]; then + log "Could not find core dump directory $CORE_DUMP_DIRECTORY, exiting" + exit 1 + fi + # It should also be writable. + if [ ! -w "$CORE_DUMP_DIRECTORY" ]; then + log "Core dump directory $CORE_DUMP_DIRECTORY is not writable, exiting" + exit 1 + fi + + ulimit -c unlimited + cd "$CORE_DUMP_DIRECTORY" + STATUS=$? + if [ $STATUS != 0 ]; then + log "Could not change to core dump directory to $CORE_DUMP_DIRECTORY, exiting" + exit $STATUS + fi +fi + +if [ "$CMD" = "master" ]; then + # Only pass --master_addresses if there's more than one master. + # + # Need to use [[ ]] for regex support. + if [[ "$MASTER_IPS" =~ , ]]; then + MASTER_ADDRESSES="--master_addresses=$MASTER_IPS" + fi + + exec "$KUDU_HOME/sbin/kudu-master" \ + $MASTER_ADDRESSES \ + --flagfile="$GFLAG_FILE" +elif [ "$CMD" = "tserver" ]; then + exec "$KUDU_HOME/sbin/kudu-tserver" \ + --tserver_master_addrs="$MASTER_IPS" \ + --flagfile="$GFLAG_FILE" +else + log "Unknown command: $CMD" + exit 2 +fi diff --git a/java/kudu-mapreduce/pom.xml b/java/kudu-mapreduce/pom.xml new file mode 100644 index 000000000000..4f2535ca7969 --- /dev/null +++ b/java/kudu-mapreduce/pom.xml @@ -0,0 +1,113 @@ + + + + 4.0.0 + + org.kududb + kudu-parent + 0.8.0-SNAPSHOT + + + kudu-mapreduce + Kudu's MapReduce bindings + + + + org.kududb + kudu-client + ${project.version} + + + org.kududb + kudu-client + ${project.version} + test-jar + test + + + log4j + log4j + ${log4j.version} + provided + + + com.stumbleupon + async + ${async.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + provided + + + org.slf4j + slf4j-api + ${slf4j.version} + + + org.apache.hadoop + 
hadoop-client + ${hadoop.version} + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + test-jar + + + junit + junit + ${junit.version} + test + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + org.apache.maven.plugins + maven-assembly-plugin + ${maven-assembly-plugin.version} + + + jar-with-dependencies + + true + + + + package + + single + + + + + + + diff --git a/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/CommandLineParser.java b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/CommandLineParser.java new file mode 100644 index 000000000000..05c18f23e0fa --- /dev/null +++ b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/CommandLineParser.java @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.mapreduce; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.AsyncKuduClient; +import org.apache.hadoop.conf.Configuration; +import org.kududb.client.KuduClient; + +/** + * Utility class that manages common configurations to all MR jobs. 
For example, + * any job that uses {#KuduTableMapReduceUtil} to setup an input or output format + * and that has parsed the command line arguments with + * {@link org.apache.hadoop.util.GenericOptionsParser} can simply be passed: + * + * -Dmaster.address=ADDRESS + * + * in order to specify where the master is. + * Use {@link CommandLineParser#getHelpSnippet()} to provide usage text for the configurations + * managed by this class. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class CommandLineParser { + private final Configuration conf; + public static final String MASTER_ADDRESSES_KEY = "kudu.master.addresses"; + public static final String MASTER_ADDRESSES_DEFAULT = "127.0.0.1"; + public static final String OPERATION_TIMEOUT_MS_KEY = "kudu.operation.timeout.ms"; + public static final long OPERATION_TIMEOUT_MS_DEFAULT = + AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS; + public static final String ADMIN_OPERATION_TIMEOUT_MS_KEY = "kudu.admin.operation.timeout.ms"; + public static final String SOCKET_READ_TIMEOUT_MS_KEY = "kudu.socket.read.timeout.ms"; + public static final long SOCKET_READ_TIMEOUT_MS_DEFAULT = + AsyncKuduClient.DEFAULT_SOCKET_READ_TIMEOUT_MS; + public static final String NUM_REPLICAS_KEY = "kudu.num.replicas"; + public static final int NUM_REPLICAS_DEFAULT = 3; + + /** + * Constructor that uses a Configuration that has already been through + * {@link org.apache.hadoop.util.GenericOptionsParser}'s command line parsing. + * @param conf the configuration from which job configurations will be extracted + */ + public CommandLineParser(Configuration conf) { + this.conf = conf; + } + + /** + * Get the configured master's config. + * @return a string that contains the passed config, or the default value + */ + public String getMasterAddresses() { + return conf.get(MASTER_ADDRESSES_KEY, MASTER_ADDRESSES_DEFAULT); + } + + /** + * Get the configured timeout for operations on sessions and scanners. 
+ * @return a long that represents the passed timeout, or the default value + */ + public long getOperationTimeoutMs() { + return conf.getLong(OPERATION_TIMEOUT_MS_KEY, OPERATION_TIMEOUT_MS_DEFAULT); + } + + /** + * Get the configured timeout for admin operations. + * @return a long that represents the passed timeout, or the default value + */ + public long getAdminOperationTimeoutMs() { + return conf.getLong(ADMIN_OPERATION_TIMEOUT_MS_KEY, OPERATION_TIMEOUT_MS_DEFAULT); + } + + /** + * Get the configured timeout for socket reads. + * @return a long that represents the passed timeout, or the default value + */ + public long getSocketReadTimeoutMs() { + return conf.getLong(SOCKET_READ_TIMEOUT_MS_KEY, SOCKET_READ_TIMEOUT_MS_DEFAULT); + } + + /** + * Get the number of replicas to use when configuring a new table. + * @return an int that represents the passed number of replicas to use, or the default value. + */ + public int getNumReplicas() { + return conf.getInt(NUM_REPLICAS_KEY, NUM_REPLICAS_DEFAULT); + } + + /** + * Get an async client connected to the configured Master(s). + * @return an async kudu client + */ + public AsyncKuduClient getAsyncClient() { + return new AsyncKuduClient.AsyncKuduClientBuilder(getMasterAddresses()) + .defaultOperationTimeoutMs(getOperationTimeoutMs()) + .defaultAdminOperationTimeoutMs(getAdminOperationTimeoutMs()) + .defaultSocketReadTimeoutMs(getSocketReadTimeoutMs()) + .build(); + } + + /** + * Get a client connected to the configured Master(s). + * @return a kudu client + */ + public KuduClient getClient() { + return new KuduClient.KuduClientBuilder(getMasterAddresses()) + .defaultOperationTimeoutMs(getOperationTimeoutMs()) + .defaultAdminOperationTimeoutMs(getAdminOperationTimeoutMs()) + .defaultSocketReadTimeoutMs(getSocketReadTimeoutMs()) + .build(); + } + + /** + * This method returns a single multi-line string that contains the help snippet to append to + * the tail of a usage() or help() type of method. 
+ * @return a string with all the available configurations and their defaults + */ + public static String getHelpSnippet() { + return "\nAdditionally, the following options are available:" + + " -D" + OPERATION_TIMEOUT_MS_KEY + "=TIME - timeout for read and write " + + "operations, defaults to " + OPERATION_TIMEOUT_MS_DEFAULT + " \n"+ + " -D" + ADMIN_OPERATION_TIMEOUT_MS_KEY + "=TIME - timeout for admin operations " + + ", defaults to " + OPERATION_TIMEOUT_MS_DEFAULT + " \n"+ + " -D" + SOCKET_READ_TIMEOUT_MS_KEY + "=TIME - timeout for socket reads " + + ", defaults to " + SOCKET_READ_TIMEOUT_MS_DEFAULT + " \n"+ + " -D" + MASTER_ADDRESSES_KEY + "=ADDRESSES - addresses to reach the Masters, " + + "defaults to " + MASTER_ADDRESSES_DEFAULT + " which is usually wrong.\n" + + " -D " + NUM_REPLICAS_KEY + "=NUM - number of replicas to use when configuring a new " + + "table, defaults to " + NUM_REPLICAS_DEFAULT; + } +} diff --git a/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableInputFormat.java b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableInputFormat.java new file mode 100644 index 000000000000..4b6187c058dc --- /dev/null +++ b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableInputFormat.java @@ -0,0 +1,452 @@ +/** + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. 
+ */ +package org.kududb.mapreduce; + +import com.google.common.base.Objects; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import org.apache.commons.net.util.Base64; +import org.kududb.Schema; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.*; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.net.DNS; + +import javax.naming.NamingException; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + *

+ * This input format generates one split per tablet and the only location for each split is that + * tablet's leader. + *

+ * + *

+ * Hadoop doesn't have the concept of "closing" the input format so in order to release the + * resources we assume that once either {@link #getSplits(org.apache.hadoop.mapreduce.JobContext)} + * or {@link KuduTableInputFormat.TableRecordReader#close()} have been called that + * the object won't be used again and the AsyncKuduClient is shut down. + *

+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduTableInputFormat extends InputFormat + implements Configurable { + + private static final Log LOG = LogFactory.getLog(KuduTableInputFormat.class); + + private static final long SLEEP_TIME_FOR_RETRIES_MS = 1000; + + /** Job parameter that specifies the input table. */ + static final String INPUT_TABLE_KEY = "kudu.mapreduce.input.table"; + + /** Job parameter that specifies if the scanner should cache blocks or not (default: false). */ + static final String SCAN_CACHE_BLOCKS = "kudu.mapreduce.input.scan.cache.blocks"; + + /** Job parameter that specifies where the masters are. */ + static final String MASTER_ADDRESSES_KEY = "kudu.mapreduce.master.address"; + + /** Job parameter that specifies how long we wait for operations to complete (default: 10s). */ + static final String OPERATION_TIMEOUT_MS_KEY = "kudu.mapreduce.operation.timeout.ms"; + + /** Job parameter that specifies the address for the name server. */ + static final String NAME_SERVER_KEY = "kudu.mapreduce.name.server"; + + /** Job parameter that specifies the encoded column range predicates (may be empty). */ + static final String ENCODED_COLUMN_RANGE_PREDICATES_KEY = + "kudu.mapreduce.encoded.column.range.predicates"; + + /** + * Job parameter that specifies the column projection as a comma-separated list of column names. + * + * Not specifying this at all (i.e. setting to null) or setting to the special string + * '*' means to project all columns. + * + * Specifying the empty string means to project no columns (i.e just count the rows). + */ + static final String COLUMN_PROJECTION_KEY = "kudu.mapreduce.column.projection"; + + /** + * The reverse DNS lookup cache mapping: address from Kudu => hostname for Hadoop. This cache is + * used in order to not do DNS lookups multiple times for each tablet server. 
+ */ + private final Map reverseDNSCacheMap = new HashMap(); + + private Configuration conf; + private KuduClient client; + private KuduTable table; + private long operationTimeoutMs; + private String nameServer; + private boolean cacheBlocks; + private List projectedCols; + private byte[] rawPredicates; + + @Override + public List getSplits(JobContext jobContext) + throws IOException, InterruptedException { + try { + if (table == null) { + throw new IOException("No table was provided"); + } + List splits; + DeadlineTracker deadline = new DeadlineTracker(); + deadline.setDeadline(operationTimeoutMs); + // If the job is started while a leader election is running, we might not be able to find a + // leader right away. We'll wait as long as the user is willing to wait with the operation + // timeout, and once we've waited long enough we just start picking the first replica we see + // for those tablets that don't have a leader. The client will later try to find the leader + // and it might fail, in which case the task will get retried. + retryloop: + while (true) { + List locations; + try { + locations = table.getTabletsLocations(operationTimeoutMs); + } catch (Exception e) { + throw new IOException("Could not get the tablets locations", e); + } + + if (locations.isEmpty()) { + throw new IOException("The requested table has 0 tablets, cannot continue"); + } + + // For the moment we only pass the leader since that's who we read from. + // If we've been trying to get a leader for each tablet for too long, we stop looping + // and just finish with what we have. 
+ splits = new ArrayList(locations.size()); + for (LocatedTablet locatedTablet : locations) { + List addresses = Lists.newArrayList(); + LocatedTablet.Replica replica = locatedTablet.getLeaderReplica(); + if (replica == null) { + if (deadline.wouldSleepingTimeout(SLEEP_TIME_FOR_RETRIES_MS)) { + LOG.debug("We ran out of retries, picking a non-leader replica for this tablet: " + + locatedTablet.toString()); + // We already checked it's not empty. + replica = locatedTablet.getReplicas().get(0); + } else { + LOG.debug("Retrying creating the splits because this tablet is missing a leader: " + + locatedTablet.toString()); + Thread.sleep(SLEEP_TIME_FOR_RETRIES_MS); + continue retryloop; + } + } + addresses.add(reverseDNS(replica.getRpcHost(), replica.getRpcPort())); + String[] addressesArray = addresses.toArray(new String[addresses.size()]); + Partition partition = locatedTablet.getPartition(); + TableSplit split = new TableSplit(partition.getPartitionKeyStart(), + partition.getPartitionKeyEnd(), + addressesArray); + splits.add(split); + } + return splits; + } + } finally { + shutdownClient(); + } + } + + private void shutdownClient() throws IOException { + try { + client.shutdown(); + } catch (Exception e) { + throw new IOException(e); + } + } + + /** + * This method might seem alien, but we do this in order to resolve the hostnames the same way + * Hadoop does. This ensures we get locality if Kudu is running along MR/YARN. + * @param host hostname we got from the master + * @param port port we got from the master + * @return reverse DNS'd address + */ + private String reverseDNS(String host, Integer port) { + String location = this.reverseDNSCacheMap.get(host); + if (location != null) { + return location; + } + // The below InetSocketAddress creation does a name resolution. 
+ InetSocketAddress isa = new InetSocketAddress(host, port); + if (isa.isUnresolved()) { + LOG.warn("Failed address resolve for: " + isa); + } + InetAddress tabletInetAddress = isa.getAddress(); + try { + location = domainNamePointerToHostName( + DNS.reverseDns(tabletInetAddress, this.nameServer)); + this.reverseDNSCacheMap.put(host, location); + } catch (NamingException e) { + LOG.warn("Cannot resolve the host name for " + tabletInetAddress + " because of " + e); + location = host; + } + return location; + } + + @Override + public RecordReader createRecordReader(InputSplit inputSplit, + TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { + return new TableRecordReader(); + } + + @Override + public void setConf(Configuration entries) { + this.conf = new Configuration(entries); + + String tableName = conf.get(INPUT_TABLE_KEY); + String masterAddresses = conf.get(MASTER_ADDRESSES_KEY); + this.operationTimeoutMs = conf.getLong(OPERATION_TIMEOUT_MS_KEY, + AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS); + this.nameServer = conf.get(NAME_SERVER_KEY); + this.cacheBlocks = conf.getBoolean(SCAN_CACHE_BLOCKS, false); + + this.client = new KuduClient.KuduClientBuilder(masterAddresses) + .defaultOperationTimeoutMs(operationTimeoutMs) + .build(); + try { + this.table = client.openTable(tableName); + } catch (Exception ex) { + throw new RuntimeException("Could not obtain the table from the master, " + + "is the master running and is this table created? 
tablename=" + tableName + " and " + + "master address= " + masterAddresses, ex); + } + + String projectionConfig = conf.get(COLUMN_PROJECTION_KEY); + if (projectionConfig == null || projectionConfig.equals("*")) { + this.projectedCols = null; // project the whole table + } else if ("".equals(projectionConfig)) { + this.projectedCols = new ArrayList<>(); + } else { + this.projectedCols = Lists.newArrayList(Splitter.on(',').split(projectionConfig)); + + // Verify that the column names are valid -- better to fail with an exception + // before we submit the job. + Schema tableSchema = table.getSchema(); + for (String columnName : projectedCols) { + if (tableSchema.getColumn(columnName) == null) { + throw new IllegalArgumentException("Unknown column " + columnName); + } + } + } + + String encodedPredicates = conf.get(ENCODED_COLUMN_RANGE_PREDICATES_KEY, ""); + rawPredicates = Base64.decodeBase64(encodedPredicates); + } + + /** + * Given a PTR string generated via reverse DNS lookup, return everything + * except the trailing period. Example for host.example.com., return + * host.example.com + * @param dnPtr a domain name pointer (PTR) string. + * @return Sanitized hostname with last period stripped off. + * + */ + private static String domainNamePointerToHostName(String dnPtr) { + if (dnPtr == null) + return null; + return dnPtr.endsWith(".") ? 
dnPtr.substring(0, dnPtr.length() - 1) : dnPtr; + } + + @Override + public Configuration getConf() { + return conf; + } + + static class TableSplit extends InputSplit implements Writable, Comparable { + + private byte[] startPartitionKey; + private byte[] endPartitionKey; + private String[] locations; + + public TableSplit() { } // Writable + + public TableSplit(byte[] startPartitionKey, byte[] endPartitionKey, String[] locations) { + this.startPartitionKey = startPartitionKey; + this.endPartitionKey = endPartitionKey; + this.locations = locations; + } + + @Override + public long getLength() throws IOException, InterruptedException { + // TODO Guesstimate a size + return 0; + } + + @Override + public String[] getLocations() throws IOException, InterruptedException { + return locations; + } + + public byte[] getStartPartitionKey() { + return startPartitionKey; + } + + public byte[] getEndPartitionKey() { + return endPartitionKey; + } + + @Override + public int compareTo(TableSplit tableSplit) { + return Bytes.memcmp(startPartitionKey, tableSplit.getStartPartitionKey()); + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + Bytes.writeByteArray(dataOutput, startPartitionKey); + Bytes.writeByteArray(dataOutput, endPartitionKey); + dataOutput.writeInt(locations.length); + for (String location : locations) { + byte[] str = Bytes.fromString(location); + Bytes.writeByteArray(dataOutput, str); + } + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + startPartitionKey = Bytes.readByteArray(dataInput); + endPartitionKey = Bytes.readByteArray(dataInput); + locations = new String[dataInput.readInt()]; + for (int i = 0; i < locations.length; i++) { + byte[] str = Bytes.readByteArray(dataInput); + locations[i] = Bytes.getString(str); + } + } + + @Override + public int hashCode() { + // We currently just care about the row key since we're within the same table + return Arrays.hashCode(startPartitionKey); + } + + 
@Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + TableSplit that = (TableSplit) o; + + return this.compareTo(that) == 0; + } + + @Override + public String toString() { + return Objects.toStringHelper(this) + .add("startPartitionKey", Bytes.pretty(startPartitionKey)) + .add("endPartitionKey", Bytes.pretty(endPartitionKey)) + .add("locations", Arrays.toString(locations)) + .toString(); + } + } + + class TableRecordReader extends RecordReader { + + private final NullWritable currentKey = NullWritable.get(); + private RowResult currentValue; + private RowResultIterator iterator; + private KuduScanner scanner; + private TableSplit split; + + @Override + public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { + if (!(inputSplit instanceof TableSplit)) { + throw new IllegalArgumentException("TableSplit is the only accepted input split"); + } + + split = (TableSplit) inputSplit; + scanner = client.newScannerBuilder(table) + .setProjectedColumnNames(projectedCols) + .lowerBoundPartitionKeyRaw(split.getStartPartitionKey()) + .exclusiveUpperBoundPartitionKeyRaw(split.getEndPartitionKey()) + .cacheBlocks(cacheBlocks) + .addColumnRangePredicatesRaw(rawPredicates) + .build(); + + // Calling this now to set iterator. + tryRefreshIterator(); + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + if (!iterator.hasNext()) { + tryRefreshIterator(); + if (!iterator.hasNext()) { + // Means we still have the same iterator, we're done + return false; + } + } + currentValue = iterator.next(); + return true; + } + + /** + * If the scanner has more rows, get a new iterator else don't do anything. 
+ * @throws IOException + */ + private void tryRefreshIterator() throws IOException { + if (!scanner.hasMoreRows()) { + return; + } + try { + iterator = scanner.nextRows(); + } catch (Exception e) { + throw new IOException("Couldn't get scan data", e); + } + } + + @Override + public NullWritable getCurrentKey() throws IOException, InterruptedException { + return currentKey; + } + + @Override + public RowResult getCurrentValue() throws IOException, InterruptedException { + return currentValue; + } + + @Override + public float getProgress() throws IOException, InterruptedException { + // TODO Guesstimate progress + return 0; + } + + @Override + public void close() throws IOException { + try { + scanner.close(); + } catch (Exception e) { + throw new IOException(e); + } + shutdownClient(); + } + } +} diff --git a/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableMapReduceUtil.java b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableMapReduceUtil.java new file mode 100644 index 000000000000..c8fa5e9e29fd --- /dev/null +++ b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableMapReduceUtil.java @@ -0,0 +1,519 @@ +/** + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. See accompanying LICENSE file. 
+ */ +package org.kududb.mapreduce; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.commons.net.util.Base64; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.TaskInputOutputContext; +import org.apache.hadoop.util.JarFinder; +import org.apache.hadoop.util.StringUtils; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.AsyncKuduClient; +import org.kududb.client.ColumnRangePredicate; +import org.kududb.client.KuduTable; +import org.kududb.client.Operation; + +import java.io.IOException; +import java.net.URL; +import java.net.URLDecoder; +import java.util.*; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +/** + * Utility class to setup MR jobs that use Kudu as an input and/or output. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduTableMapReduceUtil { + // Mostly lifted from HBase's TableMapReduceUtil + + private static final Log LOG = LogFactory.getLog(KuduTableMapReduceUtil.class); + + /** + * Doesn't need instantiation + */ + private KuduTableMapReduceUtil() { } + + + /** + * Base class for MR I/O formats, contains the common configurations. + */ + private static abstract class AbstractMapReduceConfigurator { + protected final Job job; + protected final String table; + + protected boolean addDependencies = true; + + /** + * Constructor for the required fields to configure. + * @param job a job to configure + * @param table a string that contains the name of the table to read from + */ + private AbstractMapReduceConfigurator(Job job, String table) { + this.job = job; + this.table = table; + } + + /** + * Sets whether this job should add Kudu's dependencies to the distributed cache. 
Turned on + * by default. + * @param addDependencies a boolean that says if we should add the dependencies + * @return this instance + */ + @SuppressWarnings("unchecked") + public S addDependencies(boolean addDependencies) { + this.addDependencies = addDependencies; + return (S) this; + } + + /** + * Configures the job using the passed parameters. + * @throws IOException If addDependencies is enabled and a problem is encountered reading + * files on the filesystem + */ + public abstract void configure() throws IOException; + } + + /** + * Builder-like class that sets up the required configurations and classes to write to Kudu. + *

+ * Use either child classes when configuring the table output format. + */ + private static abstract class AbstractTableOutputFormatConfigurator + > + extends AbstractMapReduceConfigurator { + + protected String masterAddresses; + protected long operationTimeoutMs = AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS; + + /** + * {@inheritDoc} + */ + private AbstractTableOutputFormatConfigurator(Job job, String table) { + super(job, table); + } + + /** + * {@inheritDoc} + */ + public void configure() throws IOException { + job.setOutputFormatClass(KuduTableOutputFormat.class); + job.setOutputKeyClass(NullWritable.class); + job.setOutputValueClass(Operation.class); + + Configuration conf = job.getConfiguration(); + conf.set(KuduTableOutputFormat.MASTER_ADDRESSES_KEY, masterAddresses); + conf.set(KuduTableOutputFormat.OUTPUT_TABLE_KEY, table); + conf.setLong(KuduTableOutputFormat.OPERATION_TIMEOUT_MS_KEY, operationTimeoutMs); + if (addDependencies) { + addDependencyJars(job); + } + } + } + + /** + * Builder-like class that sets up the required configurations and classes to read from Kudu. + * By default, block caching is disabled. + *

+ * Use either child classes when configuring the table input format. + */ + private static abstract class AbstractTableInputFormatConfigurator + > + extends AbstractMapReduceConfigurator { + + protected String masterAddresses; + protected long operationTimeoutMs = AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS; + protected final String columnProjection; + protected boolean cacheBlocks; + protected List columnRangePredicates = new ArrayList<>(); + + /** + * Constructor for the required fields to configure. + * @param job a job to configure + * @param table a string that contains the name of the table to read from + * @param columnProjection a string containing a comma-separated list of columns to read. + * It can be null in which case we read empty rows + */ + private AbstractTableInputFormatConfigurator(Job job, String table, String columnProjection) { + super(job, table); + this.columnProjection = columnProjection; + } + + /** + * Sets the block caching configuration for the scanners. Turned off by default. + * @param cacheBlocks whether the job should use scanners that cache blocks. + * @return this instance + */ + public S cacheBlocks(boolean cacheBlocks) { + this.cacheBlocks = cacheBlocks; + return (S) this; + } + + /** + * Configures the job with all the passed parameters. 
+ * @throws IOException If addDependencies is enabled and a problem is encountered reading + * files on the filesystem + */ + public void configure() throws IOException { + job.setInputFormatClass(KuduTableInputFormat.class); + + Configuration conf = job.getConfiguration(); + + conf.set(KuduTableInputFormat.MASTER_ADDRESSES_KEY, masterAddresses); + conf.set(KuduTableInputFormat.INPUT_TABLE_KEY, table); + conf.setLong(KuduTableInputFormat.OPERATION_TIMEOUT_MS_KEY, operationTimeoutMs); + conf.setBoolean(KuduTableInputFormat.SCAN_CACHE_BLOCKS, cacheBlocks); + + if (columnProjection != null) { + conf.set(KuduTableInputFormat.COLUMN_PROJECTION_KEY, columnProjection); + } + + if (!columnRangePredicates.isEmpty()) { + conf.set(KuduTableInputFormat.ENCODED_COLUMN_RANGE_PREDICATES_KEY, + base64EncodePredicates(columnRangePredicates)); + } + + if (addDependencies) { + addDependencyJars(job); + } + } + } + + static String base64EncodePredicates(List predicates) { + byte[] predicateBytes = ColumnRangePredicate.toByteArray(predicates); + return Base64.encodeBase64String(predicateBytes); + } + + + /** + * Table output format configurator to use to specify the parameters directly. + */ + public static class TableOutputFormatConfigurator + extends AbstractTableOutputFormatConfigurator { + + /** + * Constructor for the required fields to configure. + * @param job a job to configure + * @param table a string that contains the name of the table to read from + * @param masterAddresses a comma-separated list of masters' hosts and ports + */ + public TableOutputFormatConfigurator(Job job, String table, String masterAddresses) { + super(job, table); + this.masterAddresses = masterAddresses; + } + + /** + * Sets the timeout for all the operations. The default is 10 seconds. 
+ * @param operationTimeoutMs a long that represents the timeout for operations to complete, + * must be a positive value or 0 + * @return this instance + * @throws IllegalArgumentException if the operation timeout is lower than 0 + */ + public TableOutputFormatConfigurator operationTimeoutMs(long operationTimeoutMs) { + if (operationTimeoutMs < 0) { + throw new IllegalArgumentException("The operation timeout must be => 0, " + + "passed value is: " + operationTimeoutMs); + } + this.operationTimeoutMs = operationTimeoutMs; + return this; + } + } + + /** + * Table output format that uses a {@link CommandLineParser} in order to set the + * master config and the operation timeout. + */ + public static class TableOutputFormatConfiguratorWithCommandLineParser extends + AbstractTableOutputFormatConfigurator { + + /** + * {@inheritDoc} + */ + public TableOutputFormatConfiguratorWithCommandLineParser(Job job, String table) { + super(job, table); + CommandLineParser parser = new CommandLineParser(job.getConfiguration()); + this.masterAddresses = parser.getMasterAddresses(); + this.operationTimeoutMs = parser.getOperationTimeoutMs(); + } + } + + /** + * Table input format configurator to use to specify the parameters directly. + */ + public static class TableInputFormatConfigurator + extends AbstractTableInputFormatConfigurator { + + /** + * Constructor for the required fields to configure. + * @param job a job to configure + * @param table a string that contains the name of the table to read from + * @param columnProjection a string containing a comma-separated list of columns to read. + * It can be null in which case we read empty rows + * @param masterAddresses a comma-separated list of masters' hosts and ports + */ + public TableInputFormatConfigurator(Job job, String table, String columnProjection, + String masterAddresses) { + super(job, table, columnProjection); + this.masterAddresses = masterAddresses; + } + + /** + * Sets the timeout for all the operations. 
The default is 10 seconds. + * @param operationTimeoutMs a long that represents the timeout for operations to complete, + * must be a positive value or 0 + * @return this instance + * @throws IllegalArgumentException if the operation timeout is lower than 0 + */ + public TableInputFormatConfigurator operationTimeoutMs(long operationTimeoutMs) { + if (operationTimeoutMs < 0) { + throw new IllegalArgumentException("The operation timeout must be => 0, " + + "passed value is: " + operationTimeoutMs); + } + this.operationTimeoutMs = operationTimeoutMs; + return this; + } + + /** + * Adds a new predicate that will be pushed down to all the tablets. + * @param predicate a predicate to add + * @return this instance + */ + public TableInputFormatConfigurator addColumnRangePredicate(ColumnRangePredicate predicate) { + this.columnRangePredicates.add(predicate); + return this; + } + } + + /** + * Table input format that uses a {@link CommandLineParser} in order to set the + * master config and the operation timeout. + * This version cannot set column range predicates. + */ + public static class TableInputFormatConfiguratorWithCommandLineParser extends + AbstractTableInputFormatConfigurator { + + /** + * {@inheritDoc} + */ + public TableInputFormatConfiguratorWithCommandLineParser(Job job, + String table, + String columnProjection) { + super(job, table, columnProjection); + CommandLineParser parser = new CommandLineParser(job.getConfiguration()); + this.masterAddresses = parser.getMasterAddresses(); + this.operationTimeoutMs = parser.getOperationTimeoutMs(); + } + } + + /** + * Use this method when setting up a task to get access to the KuduTable in order to create + * Inserts, Updates, and Deletes. 
+ * @param context Map context + * @return The kudu table object as setup by the output format + */ + @SuppressWarnings("rawtypes") + public static KuduTable getTableFromContext(TaskInputOutputContext context) { + String multitonKey = context.getConfiguration().get(KuduTableOutputFormat.MULTITON_KEY); + return KuduTableOutputFormat.getKuduTable(multitonKey); + } + + /** + * Add the Kudu dependency jars as well as jars for any of the configured + * job classes to the job configuration, so that JobClient will ship them + * to the cluster and add them to the DistributedCache. + */ + public static void addDependencyJars(Job job) throws IOException { + addKuduDependencyJars(job.getConfiguration()); + try { + addDependencyJars(job.getConfiguration(), + // when making changes here, consider also mapred.TableMapReduceUtil + // pull job classes + job.getMapOutputKeyClass(), + job.getMapOutputValueClass(), + job.getInputFormatClass(), + job.getOutputKeyClass(), + job.getOutputValueClass(), + job.getOutputFormatClass(), + job.getPartitionerClass(), + job.getCombinerClass()); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + /** + * Add the jars containing the given classes to the job's configuration + * such that JobClient will ship them to the cluster and add them to + * the DistributedCache. + */ + public static void addDependencyJars(Configuration conf, + Class... classes) throws IOException { + + FileSystem localFs = FileSystem.getLocal(conf); + Set jars = new HashSet(); + // Add jars that are already in the tmpjars variable + jars.addAll(conf.getStringCollection("tmpjars")); + + // add jars as we find them to a map of contents jar name so that we can avoid + // creating new jars for classes that have already been packaged. 
+ Map packagedClasses = new HashMap(); + + // Add jars containing the specified classes + for (Class clazz : classes) { + if (clazz == null) continue; + + Path path = findOrCreateJar(clazz, localFs, packagedClasses); + if (path == null) { + LOG.warn("Could not find jar for class " + clazz + + " in order to ship it to the cluster."); + continue; + } + if (!localFs.exists(path)) { + LOG.warn("Could not validate jar file " + path + " for class " + + clazz); + continue; + } + jars.add(path.toString()); + } + if (jars.isEmpty()) return; + + conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()]))); + } + + /** + * Add Kudu and its dependencies (only) to the job configuration. + *

+ * This is intended as a low-level API, facilitating code reuse between this + * class and its mapred counterpart. It also of use to external tools that + * need to build a MapReduce job that interacts with Kudu but want + * fine-grained control over the jars shipped to the cluster. + *

+ * @param conf The Configuration object to extend with dependencies. + * @see KuduTableMapReduceUtil + * @see PIG-3285 + */ + public static void addKuduDependencyJars(Configuration conf) throws IOException { + addDependencyJars(conf, + // explicitly pull a class from each module + Operation.class, // kudu-client + KuduTableMapReduceUtil.class, // kudu-mapreduce + // pull necessary dependencies + com.stumbleupon.async.Deferred.class); + } + + /** + * If org.apache.hadoop.util.JarFinder is available (0.23+ hadoop), finds + * the Jar for a class or creates it if it doesn't exist. If the class is in + * a directory in the classpath, it creates a Jar on the fly with the + * contents of the directory and returns the path to that Jar. If a Jar is + * created, it is created in the system temporary directory. Otherwise, + * returns an existing jar that contains a class of the same name. Maintains + * a mapping from jar contents to the tmp jar created. + * @param my_class the class to find. + * @param fs the FileSystem with which to qualify the returned path. + * @param packagedClasses a map of class name to path. + * @return a jar file that contains the class. + * @throws IOException + */ + @SuppressWarnings("deprecation") + private static Path findOrCreateJar(Class my_class, FileSystem fs, + Map packagedClasses) + throws IOException { + // attempt to locate an existing jar for the class. + String jar = findContainingJar(my_class, packagedClasses); + if (null == jar || jar.isEmpty()) { + jar = JarFinder.getJar(my_class); + updateMap(jar, packagedClasses); + } + + if (null == jar || jar.isEmpty()) { + return null; + } + + LOG.debug(String.format("For class %s, using jar %s", my_class.getName(), jar)); + return new Path(jar).makeQualified(fs); + } + + /** + * Find a jar that contains a class of the same name, if any. It will return + * a jar file, even if that is not the first thing on the class path that + * has a class with the same name. 
Looks first on the classpath and then in + * the packagedClasses map. + * @param my_class the class to find. + * @return a jar file that contains the class, or null. + * @throws IOException + */ + private static String findContainingJar(Class my_class, Map packagedClasses) + throws IOException { + ClassLoader loader = my_class.getClassLoader(); + String class_file = my_class.getName().replaceAll("\\.", "/") + ".class"; + + // first search the classpath + for (Enumeration itr = loader.getResources(class_file); itr.hasMoreElements();) { + URL url = itr.nextElement(); + if ("jar".equals(url.getProtocol())) { + String toReturn = url.getPath(); + if (toReturn.startsWith("file:")) { + toReturn = toReturn.substring("file:".length()); + } + // URLDecoder is a misnamed class, since it actually decodes + // x-www-form-urlencoded MIME type rather than actual + // URL encoding (which the file path has). Therefore it would + // decode +s to ' 's which is incorrect (spaces are actually + // either unencoded or encoded as "%20"). Replace +s first, so + // that they are kept sacred during the decoding process. + toReturn = toReturn.replaceAll("\\+", "%2B"); + toReturn = URLDecoder.decode(toReturn, "UTF-8"); + return toReturn.replaceAll("!.*$", ""); + } + } + + // now look in any jars we've packaged using JarFinder. Returns null when + // no jar is found. + return packagedClasses.get(class_file); + } + + /** + * Add entries to packagedClasses corresponding to class files + * contained in jar. + * @param jar The jar who's content to list. 
+ * @param packagedClasses map[class -> jar] + */ + private static void updateMap(String jar, Map packagedClasses) throws IOException { + if (null == jar || jar.isEmpty()) { + return; + } + ZipFile zip = null; + try { + zip = new ZipFile(jar); + for (Enumeration iter = zip.entries(); iter.hasMoreElements();) { + ZipEntry entry = iter.nextElement(); + if (entry.getName().endsWith("class")) { + packagedClasses.put(entry.getName(), jar); + } + } + } finally { + if (null != zip) zip.close(); + } + } +} diff --git a/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputCommitter.java b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputCommitter.java new file mode 100644 index 000000000000..8af750bde1f3 --- /dev/null +++ b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputCommitter.java @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.mapreduce; + +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.OutputCommitter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; + +import java.io.IOException; + +/** + * Small committer class that does not do anything. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduTableOutputCommitter extends OutputCommitter { + @Override + public void setupJob(JobContext jobContext) throws IOException { + + } + + @Override + public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException { + + } + + @Override + public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException { + return false; + } + + @Override + public void commitTask(TaskAttemptContext taskAttemptContext) throws IOException { + + } + + @Override + public void abortTask(TaskAttemptContext taskAttemptContext) throws IOException { + + } +} diff --git a/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputFormat.java b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputFormat.java new file mode 100644 index 000000000000..e80b73f4ac7b --- /dev/null +++ b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/KuduTableOutputFormat.java @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.mapreduce; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.*; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.OutputCommitter; +import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; + +/** + *

+ * Use {@link + * KuduTableMapReduceUtil.TableOutputFormatConfigurator} + * to correctly setup this output format, then {@link + * KuduTableMapReduceUtil#getTableFromContext(org.apache.hadoop.mapreduce.TaskInputOutputContext)} + * to get a KuduTable. + *

+ * + *

+ * Hadoop doesn't have the concept of "closing" the output format so in order to release the + * resources we assume that once either + * {@link #checkOutputSpecs(org.apache.hadoop.mapreduce.JobContext)} + * or {@link TableRecordWriter#close(org.apache.hadoop.mapreduce.TaskAttemptContext)} + * have been called that the object won't be used again and the KuduClient is shut down. + *

+ */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public class KuduTableOutputFormat extends OutputFormat + implements Configurable { + + private static final Logger LOG = LoggerFactory.getLogger(KuduTableOutputFormat.class); + + /** Job parameter that specifies the output table. */ + static final String OUTPUT_TABLE_KEY = "kudu.mapreduce.output.table"; + + /** Job parameter that specifies where the masters are */ + static final String MASTER_ADDRESSES_KEY = "kudu.mapreduce.master.addresses"; + + /** Job parameter that specifies how long we wait for operations to complete */ + static final String OPERATION_TIMEOUT_MS_KEY = "kudu.mapreduce.operation.timeout.ms"; + + /** Number of rows that are buffered before flushing to the tablet server */ + static final String BUFFER_ROW_COUNT_KEY = "kudu.mapreduce.buffer.row.count"; + + /** + * Job parameter that specifies which key is to be used to reach the KuduTableOutputFormat + * belonging to the caller + */ + static final String MULTITON_KEY = "kudu.mapreduce.multitonkey"; + + /** + * This multiton is used so that the tasks using this output format/record writer can find + * their KuduTable without having a direct dependency on this class, + * with the additional complexity that the output format cannot be shared between threads. + */ + private static final ConcurrentHashMap MULTITON = new + ConcurrentHashMap(); + + /** + * This counter helps indicate which task log to look at since rows that weren't applied will + * increment this counter. 
+ */ + public enum Counters { ROWS_WITH_ERRORS } + + private Configuration conf = null; + + private KuduClient client; + private KuduTable table; + private KuduSession session; + private long operationTimeoutMs; + + @Override + public void setConf(Configuration entries) { + this.conf = new Configuration(entries); + + String masterAddress = this.conf.get(MASTER_ADDRESSES_KEY); + String tableName = this.conf.get(OUTPUT_TABLE_KEY); + this.operationTimeoutMs = this.conf.getLong(OPERATION_TIMEOUT_MS_KEY, + AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS); + int bufferSpace = this.conf.getInt(BUFFER_ROW_COUNT_KEY, 1000); + + this.client = new KuduClient.KuduClientBuilder(masterAddress) + .defaultOperationTimeoutMs(operationTimeoutMs) + .build(); + try { + this.table = client.openTable(tableName); + } catch (Exception ex) { + throw new RuntimeException("Could not obtain the table from the master, " + + "is the master running and is this table created? tablename=" + tableName + " and " + + "master address= " + masterAddress, ex); + } + this.session = client.newSession(); + this.session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND); + this.session.setMutationBufferSpace(bufferSpace); + this.session.setIgnoreAllDuplicateRows(true); + String multitonKey = String.valueOf(Thread.currentThread().getId()); + assert(MULTITON.get(multitonKey) == null); + MULTITON.put(multitonKey, this); + entries.set(MULTITON_KEY, multitonKey); + } + + private void shutdownClient() throws IOException { + try { + client.shutdown(); + } catch (Exception e) { + throw new IOException(e); + } + } + + public static KuduTable getKuduTable(String multitonKey) { + return MULTITON.get(multitonKey).getKuduTable(); + } + + private KuduTable getKuduTable() { + return this.table; + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + return new 
TableRecordWriter(this.session); + } + + @Override + public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException { + shutdownClient(); + } + + @Override + public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws + IOException, InterruptedException { + return new KuduTableOutputCommitter(); + } + + protected class TableRecordWriter extends RecordWriter { + + private final AtomicLong rowsWithErrors = new AtomicLong(); + private final KuduSession session; + + public TableRecordWriter(KuduSession session) { + this.session = session; + } + + @Override + public void write(NullWritable key, Operation operation) + throws IOException, InterruptedException { + try { + session.apply(operation); + } catch (Exception e) { + throw new IOException("Encountered an error while writing", e); + } + } + + @Override + public void close(TaskAttemptContext taskAttemptContext) throws IOException, + InterruptedException { + try { + processRowErrors(session.close()); + shutdownClient(); + } catch (Exception e) { + throw new IOException("Encountered an error while closing this task", e); + } finally { + if (taskAttemptContext != null) { + // This is the only place where we have access to the context in the record writer, + // so set the counter here. 
+ taskAttemptContext.getCounter(Counters.ROWS_WITH_ERRORS).setValue(rowsWithErrors.get()); + } + } + } + + private void processRowErrors(List responses) { + List errors = OperationResponse.collectErrors(responses); + if (!errors.isEmpty()) { + int rowErrorsCount = errors.size(); + rowsWithErrors.addAndGet(rowErrorsCount); + LOG.warn("Got per errors for " + rowErrorsCount + " rows, " + + "the first one being " + errors.get(0).getStatus()); + } + } + } +} diff --git a/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/TableReducer.java b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/TableReducer.java new file mode 100644 index 000000000000..7cf3ada0840e --- /dev/null +++ b/java/kudu-mapreduce/src/main/java/org/kududb/mapreduce/TableReducer.java @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.mapreduce; + +import org.kududb.annotations.InterfaceAudience; +import org.kududb.annotations.InterfaceStability; +import org.kududb.client.Operation; +import org.apache.hadoop.mapreduce.Reducer; + +@InterfaceAudience.Public +@InterfaceStability.Evolving +public abstract class TableReducer + extends Reducer { +} diff --git a/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/HadoopTestingUtility.java b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/HadoopTestingUtility.java new file mode 100644 index 000000000000..1e2cb4164b8f --- /dev/null +++ b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/HadoopTestingUtility.java @@ -0,0 +1,101 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.kududb.mapreduce; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.File; +import java.io.IOException; + +/** + * This class is analog to HBaseTestingUtility except that we only need it for the MR tests. 
+ */ +public class HadoopTestingUtility { + + private static final Log LOG = LogFactory.getLog(HadoopTestingUtility.class); + + private File testDir; + + private Configuration conf = new Configuration(); + + /** + * System property key to get base test directory value + */ + public static final String BASE_TEST_DIRECTORY_KEY = + "test.build.data.basedirectory"; + + /** + * Default base directory for test output. + */ + private static final String DEFAULT_BASE_TEST_DIRECTORY = "target/mr-data"; + + public Configuration getConfiguration() { + return this.conf; + } + + /** + * Sets up a temporary directory for the test to run in. Call cleanup() at the end of your + * tests to remove it. + * @param testName Will be used to build a part of the directory name for the test + * @return Where the test is homed + */ + public File setupAndGetTestDir(String testName, Configuration conf) { + if (this.testDir != null) { + return this.testDir; + } + Path testPath = new Path(getBaseTestDir(), testName + System.currentTimeMillis()); + this.testDir = new File(testPath.toString()).getAbsoluteFile(); + this.testDir.mkdirs(); + // Set this property so when mapreduce jobs run, they will use this as their home dir. 
+ System.setProperty("test.build.dir", this.testDir.toString()); + System.setProperty("hadoop.home.dir", this.testDir.toString()); + conf.set("hadoop.tmp.dir", this.testDir.toString() + "/mapred"); + + LOG.info("Test configured to write to " + this.testDir); + return this.testDir; + } + + private Path getBaseTestDir() { + String pathName = System.getProperty(BASE_TEST_DIRECTORY_KEY, DEFAULT_BASE_TEST_DIRECTORY); + return new Path(pathName); + } + + public void cleanup() throws IOException { + FileSystem.closeAll(); + if (this.testDir != null) { + delete(this.testDir); + } + } + + private void delete(File dir) throws IOException { + if (dir == null || !dir.exists()) { + return; + } + try { + FileUtils.deleteDirectory(dir); + } catch (IOException ex) { + LOG.warn("Failed to delete " + dir.getAbsolutePath()); + } + } +} diff --git a/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestInputFormatJob.java b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestInputFormatJob.java new file mode 100644 index 000000000000..9f416fc56413 --- /dev/null +++ b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestInputFormatJob.java @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.mapreduce; + +import com.google.common.collect.Lists; +import org.kududb.client.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.*; + +public class TestInputFormatJob extends BaseKuduTest { + private static final Logger LOG = LoggerFactory.getLogger(TestInputFormatJob.class); + + private static final String TABLE_NAME = + TestInputFormatJob.class.getName() + "-" + System.currentTimeMillis(); + + private static final HadoopTestingUtility HADOOP_UTIL = new HadoopTestingUtility(); + + /** Counter enumeration to count the actual rows. 
*/ + private static enum Counters { ROWS } + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + try { + BaseKuduTest.tearDownAfterClass(); + } finally { + HADOOP_UTIL.cleanup(); + } + } + + @Test + @SuppressWarnings("deprecation") + public void test() throws Exception { + + createFourTabletsTableWithNineRows(TABLE_NAME); + + Configuration conf = new Configuration(); + HADOOP_UTIL.setupAndGetTestDir(TestInputFormatJob.class.getName(), conf).getAbsolutePath(); + + createAndTestJob(conf, new ArrayList(), 9); + + ColumnRangePredicate pred1 = new ColumnRangePredicate(basicSchema.getColumnByIndex(0)); + pred1.setLowerBound(20); + createAndTestJob(conf, Lists.newArrayList(pred1), 6); + + ColumnRangePredicate pred2 = new ColumnRangePredicate(basicSchema.getColumnByIndex(2)); + pred2.setUpperBound(1); + createAndTestJob(conf, Lists.newArrayList(pred1, pred2), 2); + } + + private void createAndTestJob(Configuration conf, + List predicates, int expectedCount) + throws Exception { + String jobName = TestInputFormatJob.class.getName(); + Job job = new Job(conf, jobName); + + Class mapperClass = TestMapperTableInput.class; + job.setJarByClass(mapperClass); + job.setMapperClass(mapperClass); + job.setNumReduceTasks(0); + job.setOutputFormatClass(NullOutputFormat.class); + KuduTableMapReduceUtil.TableInputFormatConfigurator configurator = + new KuduTableMapReduceUtil.TableInputFormatConfigurator( + job, + TABLE_NAME, + "*", + getMasterAddresses()) + .operationTimeoutMs(DEFAULT_SLEEP) + .addDependencies(false) + .cacheBlocks(false); + for (ColumnRangePredicate predicate : predicates) { + configurator.addColumnRangePredicate(predicate); + } + configurator.configure(); + + assertTrue("Test job did not end properly", job.waitForCompletion(true)); + + assertEquals(expectedCount, job.getCounters().findCounter(Counters.ROWS).getValue()); + } + + /** 
+ * Simple row counter and printer + */ + static class TestMapperTableInput extends + Mapper { + + @Override + protected void map(NullWritable key, RowResult value, Context context) throws IOException, + InterruptedException { + context.getCounter(Counters.ROWS).increment(1); + LOG.info(value.toStringLongFormat()); // useful to visual debugging + } + } + +} diff --git a/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableInputFormat.java b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableInputFormat.java new file mode 100644 index 000000000000..d7942c0f1c24 --- /dev/null +++ b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableInputFormat.java @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.mapreduce; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import org.kududb.Schema; +import org.kududb.client.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; + +import static org.junit.Assert.*; + +public class TestKuduTableInputFormat extends BaseKuduTest { + + private static final String TABLE_NAME = + TestKuduTableInputFormat.class.getName() + "-" + System.currentTimeMillis(); + + @Test + public void test() throws Exception { + createTable(TABLE_NAME, getBasicSchema(), new CreateTableOptions()); + + KuduTable table = openTable(TABLE_NAME); + Schema schema = getBasicSchema(); + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addInt(0, 1); + row.addInt(1, 2); + row.addInt(2, 3); + row.addString(3, "a string"); + row.addBoolean(4, true); + AsyncKuduSession session = client.newSession(); + session.apply(insert).join(DEFAULT_SLEEP); + session.close().join(DEFAULT_SLEEP); + + // Test getting all the columns back + RecordReader reader = createRecordReader("*", null); + assertTrue(reader.nextKeyValue()); + assertEquals(5, reader.getCurrentValue().getColumnProjection().getColumnCount()); + assertFalse(reader.nextKeyValue()); + + // Test getting two columns back + reader = createRecordReader(schema.getColumnByIndex(3).getName() + "," + + schema.getColumnByIndex(2).getName(), null); + assertTrue(reader.nextKeyValue()); + assertEquals(2, reader.getCurrentValue().getColumnProjection().getColumnCount()); + assertEquals("a string", reader.getCurrentValue().getString(0)); + assertEquals(3, reader.getCurrentValue().getInt(1)); + try { + reader.getCurrentValue().getString(2); + fail("Should only be getting 2 columns back"); + } catch (IndexOutOfBoundsException e) { + // 
expected + } + + // Test getting one column back + reader = createRecordReader(schema.getColumnByIndex(1).getName(), null); + assertTrue(reader.nextKeyValue()); + assertEquals(1, reader.getCurrentValue().getColumnProjection().getColumnCount()); + assertEquals(2, reader.getCurrentValue().getInt(0)); + try { + reader.getCurrentValue().getString(1); + fail("Should only be getting 1 column back"); + } catch (IndexOutOfBoundsException e) { + // expected + } + + // Test getting empty rows back + reader = createRecordReader("", null); + assertTrue(reader.nextKeyValue()); + assertEquals(0, reader.getCurrentValue().getColumnProjection().getColumnCount()); + assertFalse(reader.nextKeyValue()); + + // Test getting an unknown table, will not work + try { + createRecordReader("unknown", null); + fail("Should not be able to scan a column that doesn't exist"); + } catch (IllegalArgumentException e) { + // expected + } + + // Test using a predicate that filters the row out. + ColumnRangePredicate pred1 = new ColumnRangePredicate(schema.getColumnByIndex(1)); + pred1.setLowerBound(3); + reader = createRecordReader("*", Lists.newArrayList(pred1)); + assertFalse(reader.nextKeyValue()); + } + + private RecordReader createRecordReader(String columnProjection, + List predicates) throws IOException, InterruptedException { + KuduTableInputFormat input = new KuduTableInputFormat(); + Configuration conf = new Configuration(); + conf.set(KuduTableInputFormat.MASTER_ADDRESSES_KEY, getMasterAddresses()); + conf.set(KuduTableInputFormat.INPUT_TABLE_KEY, TABLE_NAME); + if (columnProjection != null) { + conf.set(KuduTableInputFormat.COLUMN_PROJECTION_KEY, columnProjection); + } + if (predicates != null) { + String encodedPredicates = KuduTableMapReduceUtil.base64EncodePredicates(predicates); + conf.set(KuduTableInputFormat.ENCODED_COLUMN_RANGE_PREDICATES_KEY, encodedPredicates); + } + input.setConf(conf); + List splits = input.getSplits(null); + + // We need to re-create the input format to 
reconnect the client. + input = new KuduTableInputFormat(); + input.setConf(conf); + RecordReader reader = input.createRecordReader(null, null); + reader.initialize(Iterables.getOnlyElement(splits), null); + return reader; + } +} diff --git a/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableOutputFormat.java b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableOutputFormat.java new file mode 100644 index 000000000000..16c9cf595922 --- /dev/null +++ b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestKuduTableOutputFormat.java @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package org.kududb.mapreduce; + +import org.kududb.client.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.junit.BeforeClass; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestKuduTableOutputFormat extends BaseKuduTest { + + private static final String TABLE_NAME = + TestKuduTableOutputFormat.class.getName() + "-" + System.currentTimeMillis(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + } + + @Test + public void test() throws Exception { + createTable(TABLE_NAME, getBasicSchema(), new CreateTableOptions()); + + KuduTableOutputFormat output = new KuduTableOutputFormat(); + Configuration conf = new Configuration(); + conf.set(KuduTableOutputFormat.MASTER_ADDRESSES_KEY, getMasterAddresses()); + conf.set(KuduTableOutputFormat.OUTPUT_TABLE_KEY, TABLE_NAME); + output.setConf(conf); + + String multitonKey = conf.get(KuduTableOutputFormat.MULTITON_KEY); + KuduTable table = KuduTableOutputFormat.getKuduTable(multitonKey); + assertNotNull(table); + + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addInt(0, 1); + row.addInt(1, 2); + row.addInt(2, 3); + row.addString(3, "a string"); + row.addBoolean(4, true); + + RecordWriter rw = output.getRecordWriter(null); + rw.write(NullWritable.get(), insert); + rw.close(null); + AsyncKuduScanner.AsyncKuduScannerBuilder builder = client.newScannerBuilder(table); + assertEquals(1, countRowsInScan(builder.build())); + } +} diff --git a/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestOutputFormatJob.java b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestOutputFormatJob.java new file mode 100644 index 000000000000..71889111d10a --- /dev/null +++ b/java/kudu-mapreduce/src/test/java/org/kududb/mapreduce/TestOutputFormatJob.java @@ -0,0 +1,131 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.kududb.mapreduce; + +import org.kududb.client.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +import static org.junit.Assert.*; + +public class TestOutputFormatJob extends BaseKuduTest { + + private static final String TABLE_NAME = + TestOutputFormatJob.class.getName() + "-" + System.currentTimeMillis(); + + private static final HadoopTestingUtility HADOOP_UTIL = new HadoopTestingUtility(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + BaseKuduTest.setUpBeforeClass(); + createTable(TABLE_NAME, getBasicSchema(), new CreateTableOptions()); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + try { + BaseKuduTest.tearDownAfterClass(); + 
} finally { + HADOOP_UTIL.cleanup(); + } + } + + @Test + @SuppressWarnings("deprecation") + public void test() throws Exception { + Configuration conf = new Configuration(); + String testHome = + HADOOP_UTIL.setupAndGetTestDir(TestOutputFormatJob.class.getName(), conf).getAbsolutePath(); + String jobName = TestOutputFormatJob.class.getName(); + Job job = new Job(conf, jobName); + + + // Create a 2 lines input file + File data = new File(testHome, "data.txt"); + writeDataFile(data); + FileInputFormat.setInputPaths(job, data.toString()); + + // Configure the job to map the file and write to kudu, without reducers + Class mapperClass = TestMapperTableOutput.class; + job.setJarByClass(mapperClass); + job.setMapperClass(mapperClass); + job.setInputFormatClass(TextInputFormat.class); + job.setNumReduceTasks(0); + new KuduTableMapReduceUtil.TableOutputFormatConfigurator( + job, + TABLE_NAME, + getMasterAddresses()) + .operationTimeoutMs(DEFAULT_SLEEP) + .addDependencies(false) + .configure(); + + assertTrue("Test job did not end properly", job.waitForCompletion(true)); + + // Make sure the data's there + KuduTable table = openTable(TABLE_NAME); + AsyncKuduScanner.AsyncKuduScannerBuilder builder = + client.newScannerBuilder(table); + assertEquals(2, countRowsInScan(builder.build())); + } + + /** + * Simple Mapper that writes one row per line, the key is the line number and the STRING column + * is the data from that line + */ + static class TestMapperTableOutput extends + Mapper { + + private KuduTable table; + @Override + protected void map(LongWritable key, Text value, Context context) throws IOException, + InterruptedException { + Insert insert = table.newInsert(); + PartialRow row = insert.getRow(); + row.addInt(0, (int) key.get()); + row.addInt(1, 1); + row.addInt(2, 2); + row.addString(3, value.toString()); + row.addBoolean(4, true); + context.write(NullWritable.get(), insert); + } + + @Override + protected void setup(Context context) throws IOException, 
InterruptedException { + super.setup(context); + table = KuduTableMapReduceUtil.getTableFromContext(context); + } + } + + private void writeDataFile(File data) throws IOException { + FileOutputStream fos = new FileOutputStream(data); + fos.write("VALUE1\nVALUE2\n".getBytes()); + fos.close(); + } +} diff --git a/java/kudu-spark/pom.xml b/java/kudu-spark/pom.xml new file mode 100644 index 000000000000..cc33f40d3c14 --- /dev/null +++ b/java/kudu-spark/pom.xml @@ -0,0 +1,169 @@ + + + + 4.0.0 + + org.kududb + kudu-parent + 0.8.0-SNAPSHOT + + + kudu-spark + Kudu Spark Bindings + + + 1.3.0 + 2.10.4 + 2.10 + ${project.basedir}/.. + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + provided + + + + org.scala-lang + scala-library + + + + org.scala-lang + scalap + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + provided + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${spark.version} + test-jar + tests + test + + + + org.kududb + kudu-client + ${project.version} + + + org.kududb + kudu-client + ${project.version} + test-jar + test + + + org.kududb + interface-annotations + ${project.version} + + + + org.kududb + kudu-mapreduce + ${project.version} + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + + org.scala-lang + scala-library + ${scala.version} + provided + + + junit + junit + ${junit.version} + test + + + + org.scalatest + scalatest_${scala.binary.version} + 2.2.4 + test + + + + org.scalamock + scalamock-scalatest-support_${scala.binary.version} + 3.1.4 + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + net.alchim31.maven + scala-maven-plugin + 3.2.0 + + ${project.build.sourceEncoding} + ${scala.version} + + + + scala-compile-first + process-resources + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + + + 
org.apache.maven.plugins + maven-surefire-plugin + + + + diff --git a/java/kudu-spark/src/main/scala/org/kududb/spark/DefaultSource.scala b/java/kudu-spark/src/main/scala/org/kududb/spark/DefaultSource.scala new file mode 100644 index 000000000000..50cb9c66a492 --- /dev/null +++ b/java/kudu-spark/src/main/scala/org/kududb/spark/DefaultSource.scala @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kududb.spark + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Row, SQLContext} +import org.kududb.Type +import org.kududb.annotations.InterfaceStability +import org.kududb.client.RowResult + +import scala.collection.JavaConverters._ +import scala.collection.immutable.HashMap + +/** + * DefaultSource for integration with Spark's dataframe datasources. + * This class will produce a relationProvider based on input given to it from spark. + */ +@InterfaceStability.Unstable +class DefaultSource extends RelationProvider { + + val TABLE_KEY = "kudu.table" + val KUDU_MASTER = "kudu.master" + + /** + * Construct a BaseRelation using the provided context and parameters. 
+ * + * @param sqlContext SparkSQL context + * @param parameters parameters given to us from SparkSQL + * @return a BaseRelation Object + */ + override def createRelation(sqlContext: SQLContext, + parameters: Map[String, String]): + BaseRelation = { + val tableName = parameters.get(TABLE_KEY) + if (tableName.isEmpty) { + throw new IllegalArgumentException(s"Invalid value for $TABLE_KEY '$tableName'") + } + + val kuduMaster = parameters.getOrElse(KUDU_MASTER, "localhost") + + new KuduRelation(tableName.get, kuduMaster)(sqlContext) + } +} + +/** + * Implementation of Spark BaseRelation. + * + * @param tableName Kudu table that we plan to read from + * @param kuduMaster Kudu master addresses + * @param sqlContext SparkSQL context + */ +@InterfaceStability.Unstable +class KuduRelation(val tableName: String, + val kuduMaster: String)( + @transient val sqlContext: SQLContext) + extends BaseRelation with PrunedFilteredScan with Serializable { + + val typesMapping = HashMap[Type, DataType]( + Type.INT16 -> IntegerType, + Type.INT32 -> IntegerType, + Type.INT64 -> LongType, + Type.FLOAT -> FloatType, + Type.DOUBLE -> DoubleType, + Type.STRING -> StringType, + Type.TIMESTAMP -> TimestampType, + Type.BINARY -> BinaryType + ) + + // Using lazy val for the following because we can't serialize them but we need them once we + // deserialize them. + @transient lazy val kuduContext = new KuduContext(kuduMaster) + @transient lazy val kuduTable = kuduContext.syncClient.openTable(tableName) + @transient lazy val tableColumns = kuduTable.getSchema.getColumns.asScala + @transient lazy val kuduSchemaColumnMap = tableColumns.map(c => (c.getName, c)).toMap + + /** + * Generates a SparkSQL schema object so SparkSQL knows what is being + * provided by this BaseRelation. 
+ * + * @return schema generated from the Kudu table's schema + */ + override def schema: StructType = { + val metadataBuilder = new MetadataBuilder() + + val structFieldArray: Array[StructField] = + tableColumns.map { columnSchema => + val columnSparkSqlType = typesMapping.getOrElse( + columnSchema.getType, + throw new IllegalArgumentException(s"Unsupported column type: ${columnSchema.getType}")) + + val metadata = metadataBuilder.putString("name", columnSchema.getName).build() + new StructField(columnSchema.getName, columnSparkSqlType, + nullable = columnSchema.isNullable, metadata) + }.toArray + + new StructType(structFieldArray) + } + + /** + * Build the RDD to scan rows. + * + * @param requiredColumns clumns that are being requested by the requesting query + * @param filters filters that are being applied by the requesting query + * @return RDD will all the results from Kudu + */ + override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + kuduContext.kuduRDD(sqlContext.sparkContext, tableName, requiredColumns).map { row => + // TODO use indexes instead of column names since it requires one less mapping. 
+ Row.fromSeq(requiredColumns.map(column => getKuduValue(row, column))) + } + } + + private def getKuduValue(row: RowResult, columnName: String): Any = { + val columnType = kuduSchemaColumnMap.getOrElse(columnName, + throw new IllegalArgumentException(s"Couldn't find column '$columnName'")).getType + + columnType match { + case Type.BINARY => row.getBinary(columnName) + case Type.BOOL => row.getBoolean(columnName) + case Type.DOUBLE => row.getDouble(columnName) + case Type.FLOAT => row.getFloat(columnName) + case Type.INT16 => row.getShort(columnName) + case Type.INT32 => row.getInt(columnName) + case Type.INT64 => row.getLong(columnName) + case Type.INT8 => row.getByte(columnName) + case Type.TIMESTAMP => row.getLong(columnName) + case Type.STRING => row.getString(columnName) + case _ => throw new IllegalArgumentException(s"Type not supported: '${columnType.getName}'") + } + } +} \ No newline at end of file diff --git a/java/kudu-spark/src/main/scala/org/kududb/spark/KuduContext.scala b/java/kudu-spark/src/main/scala/org/kududb/spark/KuduContext.scala new file mode 100644 index 000000000000..a0340990db52 --- /dev/null +++ b/java/kudu-spark/src/main/scala/org/kududb/spark/KuduContext.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.kududb.spark + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.io.NullWritable +import org.apache.spark.rdd.RDD +import org.apache.spark.SparkContext +import org.kududb.annotations.InterfaceStability +import org.kududb.client.{AsyncKuduClient, KuduClient, RowResult} +import org.kududb.mapreduce.KuduTableInputFormat + +/** + * KuduContext is a façade for Kudu operations. + * + * If a Kudu client connection is needed as part of a Spark application, a + * [[KuduContext]] should used as a broadcast variable in the job in order to + * share connections among the tasks in a JVM. + */ +@InterfaceStability.Unstable +class KuduContext(kuduMaster: String) extends Serializable { + @transient lazy val syncClient = new KuduClient.KuduClientBuilder(kuduMaster).build() + @transient lazy val asyncClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build() + + /** + * Create an RDD from a Kudu table. + * + * @param tableName table to read from + * @param columnProjection list of columns to read + * + * Not specifying this at all (i.e. setting to null) or setting to the special string + * '*' means to project all columns. 
+ * @return a new RDD that maps over the given table for the selected columns + */ + def kuduRDD(sc: SparkContext, + tableName: String, + columnProjection: Seq[String] = Nil): RDD[RowResult] = { + + val conf = new Configuration + conf.set("kudu.mapreduce.master.address", kuduMaster) + conf.set("kudu.mapreduce.input.table", tableName) + if (columnProjection.nonEmpty) { + conf.set("kudu.mapreduce.column.projection", columnProjection.mkString(",")) + } + + val rdd = sc.newAPIHadoopRDD(conf, classOf[KuduTableInputFormat], + classOf[NullWritable], classOf[RowResult]) + + val columnNames = if (columnProjection.nonEmpty) columnProjection.mkString(", ") else "(*)" + rdd.values.setName(s"KuduRDD { table=$tableName, columnProjection=$columnNames }") + } +} diff --git a/java/kudu-spark/src/test/resources/log4j.properties b/java/kudu-spark/src/test/resources/log4j.properties new file mode 100644 index 000000000000..cb277ed13efd --- /dev/null +++ b/java/kudu-spark/src/test/resources/log4j.properties @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +log4j.rootLogger = DEBUG, out +log4j.appender.out = org.apache.log4j.ConsoleAppender +log4j.appender.out.layout = org.apache.log4j.PatternLayout +log4j.appender.out.layout.ConversionPattern = %d (%t) [%p - %l] %m%n + +log4j.logger.org.kududb = DEBUG \ No newline at end of file diff --git a/java/kudu-spark/src/test/scala/org/kududb/spark/DefaultSourceTest.scala b/java/kudu-spark/src/test/scala/org/kududb/spark/DefaultSourceTest.scala new file mode 100644 index 000000000000..17cea07e6bcc --- /dev/null +++ b/java/kudu-spark/src/test/scala/org/kududb/spark/DefaultSourceTest.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.kududb.spark + +import org.apache.spark.sql.SQLContext +import org.junit.runner.RunWith +import org.scalatest.FunSuite +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class DefaultSourceTest extends FunSuite with TestContext { + + test("Test basic SparkSQL") { + val rowCount = 10 + + insertRows(rowCount) + + val sqlContext = new SQLContext(sc) + + sqlContext.load("org.kududb.spark", + Map("kudu.table" -> tableName, "kudu.master" -> miniCluster.getMasterAddresses)) + .registerTempTable(tableName) + + assert(sqlContext.sql("SELECT * FROM " + tableName).collectAsList().size() == rowCount) + } +} \ No newline at end of file diff --git a/java/kudu-spark/src/test/scala/org/kududb/spark/KuduContextTest.scala b/java/kudu-spark/src/test/scala/org/kududb/spark/KuduContextTest.scala new file mode 100644 index 000000000000..67aad7bf7d79 --- /dev/null +++ b/java/kudu-spark/src/test/scala/org/kududb/spark/KuduContextTest.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.kududb.spark + +import org.junit.runner.RunWith +import org.scalatest.FunSuite +import org.scalatest.junit.JUnitRunner + +@RunWith(classOf[JUnitRunner]) +class KuduContextTest extends FunSuite with TestContext { + test("Test basic kuduRDD") { + val rowCount = 10 + + insertRows(rowCount) + + val scanRdd = kuduContext.kuduRDD(sc, "test") + + val scanList = scanRdd.map(r => r.getInt(0)).collect() + assert(scanList.length == rowCount) + } +} \ No newline at end of file diff --git a/java/kudu-spark/src/test/scala/org/kududb/spark/TestContext.scala b/java/kudu-spark/src/test/scala/org/kududb/spark/TestContext.scala new file mode 100644 index 000000000000..fa24f5290a1c --- /dev/null +++ b/java/kudu-spark/src/test/scala/org/kududb/spark/TestContext.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */
package org.kududb.spark

import com.google.common.collect.ImmutableList
import org.apache.spark.SparkContext
import org.kududb.ColumnSchema.ColumnSchemaBuilder
import org.kududb.client.KuduClient.KuduClientBuilder
import org.kududb.client.MiniKuduCluster.MiniKuduClusterBuilder
import org.kududb.client.{CreateTableOptions, KuduClient, KuduTable, MiniKuduCluster}
import org.kududb.{Schema, Type}
import org.scalatest.{BeforeAndAfterAll, Suite}

/**
 * Mixin giving a ScalaTest suite a single-master / single-tserver Kudu mini
 * cluster, a local[2] SparkContext, a synchronous Kudu client, a KuduContext,
 * and a pre-created single-replica test table.
 */
trait TestContext extends BeforeAndAfterAll { self: Suite =>

  var sc: SparkContext = null
  var miniCluster: MiniKuduCluster = null
  var kuduClient: KuduClient = null
  var table: KuduTable = null
  var kuduContext: KuduContext = null

  // Name of the table created in beforeAll(); shared with the mixing suites.
  val tableName = "test"

  // Three-column schema: INT32 primary key plus an INT32 and a STRING column.
  lazy val schema: Schema = {
    val columns = ImmutableList.of(
      new ColumnSchemaBuilder("key", Type.INT32).key(true).build(),
      new ColumnSchemaBuilder("c1_i", Type.INT32).build(),
      new ColumnSchemaBuilder("c2_s", Type.STRING).build())
    new Schema(columns)
  }

  override def beforeAll() {
    miniCluster = new MiniKuduClusterBuilder()
      .numMasters(1)
      .numTservers(1)
      .build()
    // NOTE(review): passed as the executor environment map; presumably meant
    // to cap executor heap at 512m — confirm the "Xmx" key is actually honored
    // by the local-mode backend.
    val envMap = Map[String,String](("Xmx", "512m"))

    sc = new SparkContext("local[2]", "test", null, Nil, envMap)

    kuduClient = new KuduClientBuilder(miniCluster.getMasterAddresses).build()
    // Fail fast if the single tablet server never registers with the master.
    assert(miniCluster.waitForTabletServers(1))

    kuduContext = new KuduContext(miniCluster.getMasterAddresses)

    // Single replica: the mini cluster only runs one tablet server.
    val tableOptions = new CreateTableOptions().setNumReplicas(1)
    table = kuduClient.createTable(tableName, schema, tableOptions)
  }

  override def afterAll() {
    // Tear down in reverse order of creation; null guards tolerate a
    // partially-failed beforeAll().
    if (kuduClient != null) kuduClient.shutdown()
    if (miniCluster != null) miniCluster.shutdown()
    if (sc != null) sc.stop()
  }

  /**
   * Insert rowCount rows (keys 1..rowCount, c1_i = key, c2_s = key.toString)
   * through a fresh session on the shared client.
   */
  def insertRows(rowCount: Integer) {
    val kuduSession = kuduClient.newSession()

    for (i <- 1 to rowCount) {
      val insert = table.newInsert
      val row = insert.getRow
      row.addInt(0, i)
      row.addInt(1, i)
      row.addString(2, i.toString)
      kuduSession.apply(insert)
    }
  }
}
com.google:org.kududb.cfile:org.kududb.consensus:org.kududb.log:org.kududb.master:org.kududb.rpc:org.kududb.server:org.kududb.tablet:org.kududb.tserver + + false + + + + org.kududb:interface-annotations + + + + + + + + + + + protoc-plugin + http://maven.davidtrott.com/repository + + + + + + cdh.repo + https://repository.cloudera.com/artifactory/cloudera-repos + Cloudera Repositories + + false + + + + + + + cdh.releases.repo + ${deploy.maven} + Releases Repository + + + cdh.snapshots.repo + http://maven.jenkins.cloudera.com:8081/artifactory/libs-snapshot-local + Snapshots Repository + + + + + + + buildCSD + + kudu-csd + + + + deploy-local + + true + + + http://maven.jenkins.cloudera.com:8081/artifactory/libs-release-local + + + + deploy-remote + + https://repository.cloudera.com/cloudera/libs-release-local + + + + diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 000000000000..921484a67577 --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,43 @@ +# Copyright 2016 Cloudera, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Editor temporary/working/backup files +*flymake* + +# Compiled source and in-place build files +*.py[ocd] +*.so +.build_cache_dir +.cache +.eggs +MANIFEST + +# Generated sources +*.c +*.cpp +*.cmake +# Python files + +# setup.py working directory +build +# setup.py dist directory +dist +# Egg metadata +*.egg-info +# coverage +.coverage +coverage.xml + +# automatically generated during local development +kudu/version.py \ No newline at end of file diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 000000000000..a6e88d6729c7 --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,13 @@ +include MANIFEST.in +include ../LICENSE.txt +include README.md +include setup.py + +graft kudu + +global-exclude *.so +global-exclude *.pyc +global-exclude *~ +global-exclude \#* +global-exclude .git* +global-exclude .DS_Store diff --git a/python/Makefile b/python/Makefile new file mode 100644 index 000000000000..3fdd0b6e57eb --- /dev/null +++ b/python/Makefile @@ -0,0 +1,21 @@ +# Copyright 2016 Cloudera, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +develop: + python setup.py build_ext --inplace + +all: develop + +clean-pyc: + find . 
-name "*.pyc" -exec rm -rf {} \; diff --git a/python/README.md b/python/README.md new file mode 100644 index 000000000000..54791c2485d8 --- /dev/null +++ b/python/README.md @@ -0,0 +1,12 @@ +## kudu-python: Python interface to the Apache Kudu (incubating) C++ Client API + +Using this package requires that you install the Kudu C++ client libraries and +headers. See http://getkudu.io for more. + +To install from PyPI, run + +``` +pip install kudu-python +``` + +Installation from source requires Cython. \ No newline at end of file diff --git a/python/kudu/__init__.pxd b/python/kudu/__init__.pxd new file mode 100644 index 000000000000..217e5db96078 --- /dev/null +++ b/python/kudu/__init__.pxd @@ -0,0 +1,17 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/kudu/__init__.py b/python/kudu/__init__.py new file mode 100644 index 000000000000..e51f90f9b49f --- /dev/null +++ b/python/kudu/__init__.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
from kudu.client import (Client, Table, Scanner, Session,  # noqa
                         Insert, Update, Delete, Predicate,
                         TimeDelta, KuduError,
                         FLUSH_AUTO_BACKGROUND,
                         FLUSH_AUTO_SYNC,
                         FLUSH_MANUAL)

from kudu.errors import (KuduException, KuduBadStatus, KuduNotFound,  # noqa
                         KuduNotSupported,
                         KuduInvalidArgument)

from kudu.schema import (int8, int16, int32, int64, string_ as string,  # noqa
                         double_ as double, float_, binary,
                         timestamp,
                         KuduType,
                         SchemaBuilder, ColumnSpec, Schema, ColumnSchema,
                         COMPRESSION_DEFAULT,
                         COMPRESSION_NONE,
                         COMPRESSION_SNAPPY,
                         COMPRESSION_LZ4,
                         COMPRESSION_ZLIB,
                         ENCODING_AUTO,
                         ENCODING_PLAIN,
                         ENCODING_PREFIX,
                         ENCODING_GROUP_VARINT,
                         ENCODING_RLE)


def connect(host, port, admin_timeout_ms=None, rpc_timeout_ms=None):
    """
    Connect to a Kudu master server and return a client handle.

    Parameters
    ----------
    host : string
        Server address of master
    port : int
        Server port
    admin_timeout_ms : int, optional
        Admin timeout in milliseconds
    rpc_timeout_ms : int, optional
        RPC timeout in milliseconds

    Returns
    -------
    client : kudu.Client
    """
    # The Client constructor accepts a "host:port" master address string.
    master_addr = '{0}:{1}'.format(host, port)
    return Client(master_addr,
                  admin_timeout_ms=admin_timeout_ms,
                  rpc_timeout_ms=rpc_timeout_ms)


def timedelta(seconds=0, millis=0, micros=0, nanos=0):
    """
    Construct a Kudu TimeDelta to set timeouts, etc. Use this function instead
    of interacting with the TimeDelta class yourself.

    Returns
    -------
    delta : kudu.client.TimeDelta
    """
    from kudu.compat import long
    # TimeDelta wraps kudu::MonoDelta; all components are normalized to
    # nanoseconds. The leading long(0) forces long arithmetic on Python 2.
    total = (long(0) + seconds * long(1000000000) +
             millis * long(1000000) + micros * long(1000) + nanos)
    return TimeDelta.from_nanos(total)


def schema_builder():
    """
    Create a fresh kudu.SchemaBuilder instance.

    Examples
    --------
    builder = kudu.schema_builder()
    builder.add_column('key1', kudu.int64, nullable=False)
    builder.add_column('key2', kudu.int32, nullable=False)

    (builder.add_column('name', kudu.string)
     .nullable()
     .compression('lz4'))

    builder.add_column('value1', kudu.double)
    builder.add_column('value2', kudu.int8, encoding='rle')
    builder.set_primary_keys(['key1', 'key2'])

    schema = builder.build()

    Returns
    -------
    builder : SchemaBuilder
    """
    return SchemaBuilder()


from .version import version as __version__  # noqa
# distutils: language = c++
# cython: embedsignature = True

from libcpp.string cimport string
from libcpp cimport bool as c_bool

cimport cpython
from cython.operator cimport dereference as deref

from libkudu_client cimport *

from kudu.compat import tobytes, frombytes
from kudu.schema cimport Schema, ColumnSchema
from kudu.errors cimport check_status
from errors import KuduException

import six


cdef class TimeDelta:
    """
    Wrapper interface for kudu MonoDelta class, which is used to specify
    timedeltas for timeouts and other uses.
    """

    cdef:
        # The wrapped C++ value; set via init() by the from_* constructors.
        MonoDelta delta

    def __cinit__(self):
        pass

    @staticmethod
    def from_seconds(seconds):
        """
        Construct a new TimeDelta from fractional seconds.

        Parameters
        ----------
        seconds : double

        Returns
        -------
        delta : TimeDelta
        """
        cdef TimeDelta result = TimeDelta()
        result.init(MonoDelta.FromSeconds(seconds))
        return result

    @staticmethod
    def from_millis(int64_t ms):
        """
        Construct a new TimeDelta from integer milliseconds.

        Parameters
        ----------
        ms : int

        Returns
        -------
        delta : TimeDelta
        """
        cdef TimeDelta result = TimeDelta()
        result.init(MonoDelta.FromMilliseconds(ms))
        return result

    @staticmethod
    def from_micros(int64_t us):
        """
        Construct a new TimeDelta from integer microseconds.

        Parameters
        ----------
        us : int

        Returns
        -------
        delta : TimeDelta
        """
        cdef TimeDelta result = TimeDelta()
        result.init(MonoDelta.FromMicroseconds(us))
        return result

    @staticmethod
    def from_nanos(seconds):
        """
        Construct a new TimeDelta from integer nanoseconds.

        Parameters
        ----------
        ns : int

        Returns
        -------
        delta : TimeDelta
        """
        # NOTE(review): the parameter is a nanosecond count despite being
        # named 'seconds'; callers (e.g. kudu.timedelta) pass nanoseconds.
        cdef TimeDelta result = TimeDelta()
        result.init(MonoDelta.FromNanoseconds(seconds))
        return result

    cpdef double to_seconds(self):
        """
        Return timedelta as fractional seconds.
        """
        return self.delta.ToSeconds()

    cpdef int64_t to_millis(self):
        """
        Return timedelta as exact milliseconds.
        """
        return self.delta.ToMilliseconds()

    cpdef int64_t to_micros(self):
        """
        Return timedelta as exact microseconds.
        """
        return self.delta.ToMicroseconds()

    cpdef int64_t to_nanos(self):
        """
        Return timedelta as exact nanoseconds.
        """
        return self.delta.ToNanoseconds()

    cdef init(self, const MonoDelta& val):
        # Copy-assign the wrapped MonoDelta; used by the from_* constructors.
        self.delta = val

    def __repr__(self):
        cdef object as_string

        # An uninitialized MonoDelta has no printable value.
        if self.delta.Initialized():
            as_string = self.delta.ToString()
            return 'kudu.TimeDelta({0})'.format(as_string)
        else:
            return 'kudu.TimeDelta()'

    def __richcmp__(TimeDelta self, TimeDelta other, int op):
        # Map Python rich-comparison opcodes onto MonoDelta comparisons;
        # >= and <= are expressed as negations of the strict comparisons.
        if op == cpython.Py_EQ:
            return self.delta.Equals(other.delta)
        elif op == cpython.Py_NE:
            return not self.delta.Equals(other.delta)
        elif op == cpython.Py_LT:
            return self.delta.LessThan(other.delta)
        elif op == cpython.Py_LE:
            return not self.delta.MoreThan(other.delta)
        elif op == cpython.Py_GT:
            return self.delta.MoreThan(other.delta)
        elif op == cpython.Py_GE:
            return not self.delta.LessThan(other.delta)
        else:
            raise ValueError('invalid operation: {0}'.format(op))


cdef class Client:

    """
    The primary class for interacting with a Kudu cluster. Can connect to one
    or more Kudu master servers. Do not instantiate this class directly; use
    kudu.connect instead.
    """

    cdef:
        # Owning reference to the C++ client.
        shared_ptr[KuduClient] client
        # Raw pointer convenience alias into 'client' (not owning).
        KuduClient* cp

    cdef readonly:
        # Master addresses this client was constructed with.
        list master_addrs

    def __cinit__(self, addr_or_addrs, admin_timeout_ms=None,
                  rpc_timeout_ms=None):
        cdef:
            string c_addr
            vector[string] c_addrs
            KuduClientBuilder builder
            TimeDelta timeout

        # Normalize the input to a list of address strings.
        if isinstance(addr_or_addrs, six.string_types):
            addr_or_addrs = [addr_or_addrs]
        elif not isinstance(addr_or_addrs, list):
            addr_or_addrs = list(addr_or_addrs)

        self.master_addrs = addr_or_addrs
        for addr in addr_or_addrs:
            c_addrs.push_back(tobytes(addr))

        builder.master_server_addrs(c_addrs)

        if admin_timeout_ms is not None:
            timeout = TimeDelta.from_millis(admin_timeout_ms)
            builder.default_admin_operation_timeout(timeout.delta)

        if rpc_timeout_ms is not None:
            timeout = TimeDelta.from_millis(rpc_timeout_ms)
            builder.default_rpc_timeout(timeout.delta)

        check_status(builder.Build(&self.client))

        # A convenience
        self.cp = self.client.get()

    def __dealloc__(self):
        self.close()

    property is_multimaster:

        def __get__(self):
            return self.cp.IsMultiMaster()

    cpdef close(self):
        # Nothing yet to clean up here
        pass

    def create_table(self, table_name, Schema schema):
        """
        Creates a new Kudu table from the passed Schema and options.

        Parameters
        ----------
        table_name : string
        schema : kudu.Schema
          Create using kudu.schema_builder
        """
        cdef:
            KuduTableCreator* c
            Status s
        c = self.cp.NewTableCreator()
        try:
            s = (c.table_name(tobytes(table_name))
                 .schema(schema.schema)
                 .Create())
            check_status(s)
        finally:
            # The creator is heap-allocated by the C++ API; always free it.
            del c

    def delete_table(self, table_name):
        """
        Delete a Kudu table. Raises KuduNotFound if the table does not exist.

        Parameters
        ----------
        table_name : string
        """
        check_status(self.cp.DeleteTable(tobytes(table_name)))

    def table_exists(self, table_name):
        """Return True if the indicated table exists in the Kudu cluster.

        Parameters
        ----------
        table_name : string

        Returns
        -------
        exists : bool

        """
        cdef:
            string c_name = tobytes(table_name)
            c_bool exists

        check_status(self.cp.TableExists(c_name, &exists))
        return exists

    def table(self, table_name):
        """
        Construct a kudu.Table and retrieve its schema from the cluster.

        Raises KuduNotFound if the table does not exist.

        Parameters
        ----------
        table_name : string

        Returns
        -------
        table : kudu.Table
        """
        table_name = tobytes(table_name)
        cdef Table table = Table(table_name, self)

        check_status(self.cp.OpenTable(table_name, &table.table))
        # init() wires the Table's Schema to the now-populated C++ table.
        table.init()
        return table

    def list_tables(self, match_substring=None):
        """
        Retrieve a list of table names in the Kudu cluster with an optional
        substring filter.

        Parameters
        ----------
        match_substring : string, optional
          If passed, the string must be exactly contained in the table names

        Returns
        -------
        tables : list[string]
          Table names returned from Kudu
        """
        cdef:
            vector[string] tables
            string c_match
            size_t i

        if match_substring is not None:
            c_match = tobytes(match_substring)
            check_status(self.cp.ListTables(&tables, c_match))
        else:
            check_status(self.cp.ListTables(&tables))

        result = []
        for i in range(tables.size()):
            result.append(frombytes(tables[i]))
        return result

    def new_session(self, flush_mode='manual', timeout_ms=5000):
        """
        Create a new KuduSession for applying write operations.

        Parameters
        ----------
        flush_mode : {'manual', 'sync', 'background'}, default 'manual'
          See Session.set_flush_mode
        timeout_ms : int, default 5000
          Timeout in milliseconds

        Returns
        -------
        session : kudu.Session
        """
        cdef Session result = Session()
        result.s = self.cp.NewSession()

        result.set_flush_mode(flush_mode)
        result.set_timeout_ms(timeout_ms)

        return result



#----------------------------------------------------------------------
# Handle marshalling Python values to raw values. Since range predicates
# require a const void*, this is one valid (though a bit verbose)
# approach. Note that later versions of Cython handle many Python -> C type
# casting problems (and integer overflows), but these should all be tested
# rigorously in our test suite


cdef class RawValue:
    # Base class: 'data' points at the C storage of the boxed value held
    # by the concrete subclass.
    cdef:
        void* data

    def __cinit__(self):
        self.data = NULL


cdef class Int8Val(RawValue):
    cdef:
        int8_t val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val


cdef class Int16Val(RawValue):
    cdef:
        int16_t val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val


cdef class Int32Val(RawValue):
    cdef:
        int32_t val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val


cdef class Int64Val(RawValue):
    cdef:
        int64_t val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val


cdef class DoubleVal(RawValue):
    cdef:
        double val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val


cdef class FloatVal(RawValue):
    cdef:
        float val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val


cdef class BoolVal(RawValue):
    cdef:
        c_bool val

    def __cinit__(self, obj):
        self.val = obj
        self.data = &self.val
cdef class Table:

    """
    Represents a Kudu table, containing the schema and other tools. Create by
    using the kudu.Client.table method after connecting to a cluster.
    """

    cdef:
        # Owning reference to the C++ table; populated by Client.table().
        shared_ptr[KuduTable] table

    cdef readonly:
        object _name
        Schema schema
        Client parent

    def __cinit__(self, name, Client client):
        self._name = name
        self.parent = client

        # Users should not instantiate directly
        self.schema = Schema()

    cdef init(self):
        # Called after the refptr has been populated
        self.schema.schema = &self.ptr().schema()
        # The schema is owned by the C++ KuduTable, not by this wrapper.
        self.schema.own_schema = 0
        self.schema.parent = self

    def __len__(self):
        # TODO: is this cheaply knowable?
        raise NotImplementedError

    def __getitem__(self, key):
        # table['col'] yields a Column usable for building predicates.
        spec = self.schema[key]
        return Column(self, key, spec)

    property name:

        def __get__(self):
            return frombytes(self.ptr().name())

    # XXX: don't love this name
    property num_columns:

        def __get__(self):
            return len(self.schema)

    def rename(self, new_name):
        raise NotImplementedError

    def drop(self):
        raise NotImplementedError

    def new_insert(self):
        """
        Create a new Insert operation. Pass the completed Insert to a Session.

        Returns
        -------
        insert : Insert
        """
        return Insert(self)

    def new_update(self):
        """
        Create a new Update operation. Pass the completed Update to a Session.

        Returns
        -------
        update : Update
        """
        return Update(self)

    def new_delete(self):
        """
        Create a new Delete operation. Pass the completed Delete to a Session.

        Returns
        -------
        delete : Delete
        """
        return Delete(self)

    def scanner(self):
        """
        Create a new scanner for this table for retrieving a selection of table
        rows.

        Examples
        --------
        scanner = table.scanner()
        scanner.add_predicate(table['key'] > 10)
        scanner.open()
        batch = scanner.read_all()
        tuples = batch.as_tuples()

        Returns
        -------
        scanner : kudu.Scanner
        """
        cdef Scanner result = Scanner(self)
        result.scanner = new KuduScanner(self.ptr())
        return result

    cdef inline KuduTable* ptr(self):
        return self.table.get()


cdef class Column:

    """
    A reference to a Kudu table column intended to simplify creating predicates
    and other column-specific operations.

    Write arithmetic comparisons to create new Predicate objects that can be
    passed to a Scanner.

    Examples
    --------
    scanner.add_predicate(table[col_name] <= 10)
    """
    cdef readonly:
        object name
        Table parent
        ColumnSchema spec

    def __cinit__(self, Table parent, object name, ColumnSchema spec):
        self.name = tobytes(name)
        self.parent = parent
        self.spec = spec

    def __repr__(self):
        result = ('Column({0}, parent={1}, type={2})'
                  .format(frombytes(self.name),
                          self.parent.name,
                          self.spec.type.name))
        return result

    cdef KuduValue* box_value(self, object obj) except NULL:
        # Convert a Python value into a heap-allocated KuduValue for use in
        # a comparison predicate. Caller takes ownership of the result.
        cdef:
            KuduValue* val
            Slice* slc

        if isinstance(obj, unicode):
            obj = obj.encode('utf8')

        if isinstance(obj, bytes):
            # KuduValue.CopyString copies the bytes, so the temporary Slice
            # can be freed immediately.
            slc = new Slice(<char*> obj, len(obj))
            val = KuduValue.CopyString(deref(slc))
            del slc
        elif isinstance(obj, int):
            val = KuduValue.FromInt(obj)
        elif isinstance(obj, float):
            val = KuduValue.FromDouble(obj)
        else:
            raise TypeError(obj)

        return val

    def __richcmp__(Column self, value, int op):
        # Only <=, ==, and >= are supported by the underlying C++
        # NewComparisonPredicate API.
        cdef:
            KuduPredicate* pred
            KuduValue* val
            Slice* col_name_slice
            ComparisonOp cmp_op
            Predicate result

        col_name_slice = new Slice(<char*> self.name,
                                   len(self.name))

        try:
            if op == 1: # <=
                cmp_op = KUDU_LESS_EQUAL
            elif op == 2: # ==
                cmp_op = KUDU_EQUAL
            elif op == 5: # >=
                cmp_op = KUDU_GREATER_EQUAL
            else:
                raise NotImplementedError

            val = self.box_value(value)
            pred = (self.parent.ptr()
                    .NewComparisonPredicate(deref(col_name_slice),
                                            cmp_op, val))
        finally:
            del col_name_slice

        result = Predicate()
        result.init(pred)

        return result


cdef class Predicate:

    """
    Wrapper for a KuduPredicate. Pass to Scanner.add_predicates
    """

    cdef:
        KuduPredicate* pred

    def __cinit__(self):
        self.pred = NULL

    def __dealloc__(self):
        # This wrapper owns the predicate until it is handed to a scanner.
        if self.pred != NULL:
            del self.pred

    cdef init(self, KuduPredicate* pred):
        self.pred = pred


# Python-visible aliases for the C++ flush-mode enum values.
FLUSH_AUTO_SYNC = FlushMode_AutoSync
FLUSH_AUTO_BACKGROUND = FlushMode_AutoBackground
FLUSH_MANUAL = FlushMode_Manual

# String spellings accepted by Session.set_flush_mode.
cdef dict _flush_modes = {
    'manual': FlushMode_Manual,
    'sync': FlushMode_AutoSync,
    'background': FlushMode_AutoBackground
}
+ """ + cdef: + shared_ptr[KuduSession] s + + def __cinit__(self): + pass + + def set_flush_mode(self, flush_mode='manual'): + """ + Set the session operation flush mode + + Parameters + ---------- + flush_mode : {'manual', 'sync', 'background'}, default 'manual' + You can also use the constants FLUSH_MANUAL, FLUSH_AUTO_SYNC, + and FLUSH_AUTO_BACKGROUND + """ + cdef Status status + cdef FlushMode fmode + + if isinstance(flush_mode, int): + # todo: validation + fmode = flush_mode + else: + try: + fmode = _flush_modes[flush_mode.lower()] + except KeyError: + raise ValueError('Invalid flush mode: {0}' + .format(flush_mode)) + + status = self.s.get().SetFlushMode(fmode) + + check_status(status) + + def set_timeout_ms(self, int64_t ms): + """ + Set the session timeout in milliseconds + """ + self.s.get().SetTimeoutMillis(ms) + + def apply(self, WriteOperation op): + """ + Apply the indicated write operation + + Examples + -------- + # Executes a single Insert operation + session = client.new_session() + op = table.new_insert() + op['key'] = 0 + op['value1'] = 5 + op['value2'] = 3.5 + session.apply(op) + session.flush() + """ + return op.add_to_session(self) + + def flush(self): + """ + Flush pending operations + """ + check_status(self.s.get().Flush()) + + def get_pending_errors(self): + """ + Returns a list of buffered Kudu errors. A second value is returned + indicating if there were more errors than could be stored in the + session's error buffer (i.e. 
False means there was no error overflow) + + Returns + ------- + errors, overflowed : list, bool + """ + cdef: + KuduError error + vector[C_KuduError*] v_errors + c_bool overflowed + size_t i + + self.s.get().GetPendingErrors(&v_errors, &overflowed) + + result = [] + for i in range(v_errors.size()): + error = KuduError() + error.error = v_errors[i] + result.append(error) + + return result, overflowed + + +cdef class Row: + + """ + A single row from a row batch + """ + + cdef: + # So we can access the schema information + Table table + + RowBatch parent + + # This object is owned by the parent RowBatch + KuduRowResult* row + + def __cinit__(self, batch, table): + self.parent = batch + self.table = table + self.row = NULL + + def __dealloc__(self): + pass + + cdef tuple as_tuple(self): + """ + Return the row as a Python tuple + """ + cdef: + int i, k + tuple tup + + k = self.table.num_columns + tup = cpython.PyTuple_New(k) + for i in range(k): + val = None + + if not self.is_null(i): + val = self.get_slot(i) + + cpython.Py_INCREF(val) + cpython.PyTuple_SET_ITEM(tup, i, val) + + return tup + + cdef inline get_bool(self, int i): + cdef c_bool val + check_status(self.row.GetBool(i, &val)) + # The built-in bool is masked by the libcpp typedef + return bool(val) + + cdef inline get_int8(self, int i): + cdef int8_t val + check_status(self.row.GetInt8(i, &val)) + return val + + cdef inline get_int16(self, int i): + cdef int16_t val + check_status(self.row.GetInt16(i, &val)) + return val + + cdef inline get_int32(self, int i): + cdef int32_t val + check_status(self.row.GetInt32(i, &val)) + return val + + cdef inline get_int64(self, int i): + cdef int64_t val + check_status(self.row.GetInt64(i, &val)) + return val + + cdef inline get_double(self, int i): + cdef double val + check_status(self.row.GetDouble(i, &val)) + return val + + cdef inline get_float(self, int i): + cdef float val + check_status(self.row.GetFloat(i, &val)) + return val + + cdef inline get_string(self, int 
i): + cdef Slice val + check_status(self.row.GetString(i, &val)) + return cpython.PyBytes_FromStringAndSize( val.mutable_data(), + val.size()) + + cdef inline get_slot(self, int i): + cdef: + Status s + DataType t = self.table.schema.loc_type(i) + + if t == KUDU_BOOL: + return self.get_bool(i) + elif t == KUDU_INT8: + return self.get_int8(i) + elif t == KUDU_INT16: + return self.get_int16(i) + elif t == KUDU_INT32: + return self.get_int32(i) + elif t == KUDU_INT64: + return self.get_int64(i) + elif t == KUDU_DOUBLE: + return self.get_double(i) + elif t == KUDU_FLOAT: + return self.get_float(i) + elif t == KUDU_STRING: + return frombytes(self.get_string(i)) + else: + raise TypeError(t) + + cdef inline bint is_null(self, int i): + return self.row.IsNull(i) + + +cdef class RowBatch: + """ + Class holding a batch of rows from a Scanner + """ + # This class owns the KuduRowResult data + cdef: + Table table + vector[KuduRowResult] rows + + def __cinit__(self, Table table): + self.table = table + + def __len__(self): + return self.rows.size() + + def __getitem__(self, i): + return self.get_row(i).as_tuple() + + def __iter__(self): + cdef int i = 0 + for i in range(len(self)): + yield self.get_row(i).as_tuple() + + def as_tuples(self): + """ + Return RowBatch as a list of Python tuples + + To simplify testing for the moment. + """ + cdef list tuples = [] + for i in range(self.rows.size()): + tuples.append(self.get_row(i).as_tuple()) + return tuples + + cdef Row get_row(self, i): + # TODO: boundscheck + + # For safety, we need to increment the parent reference count and hold + # on to a reference internally so that if the RowBatch goes out of + # scope we won't end up with orphaned Row objects. This isn't the best, + # but an intermediate solution until we can do something better.. + # + # One alternative is to copy the KuduRowResult into the Row, but that + # doesn't feel right. 
+ cdef Row row = Row(self, self.table) + row.row = &self.rows[i] + + return row + + +cdef class Scanner: + """ + A class for defining a selection of data we wish to scan out of a Kudu + table. Create a scanner using Table.scanner. + """ + + cdef: + Table table + KuduScanner* scanner + bint is_open + + def __cinit__(self, Table table): + self.table = table + self.scanner = NULL + self.is_open = 0 + + def __dealloc__(self): + # We own this one + if self.scanner != NULL: + del self.scanner + + cdef inline ensure_open(self): + if not self.is_open: + self.open() + + def add_predicates(self, preds): + """ + Add a list of scan predicates to the scanner. Select columns from the + parent table and make comparisons to create predicates. + + Examples + -------- + c = table[col_name] + preds = [c >= 0, c <= 10] + scanner.add_predicates(preds) + + Parameters + ---------- + preds : list of Predicate + """ + for pred in preds: + self.add_predicate(pred) + + cpdef add_predicate(self, Predicate pred): + """ + Add a scan predicates to the scanner. Select columns from the + parent table and make comparisons to create predicates. + + Examples + -------- + pred = table[col_name] <= 10 + scanner.add_predicate(pred) + + Parameters + ---------- + pred : kudu.Predicate + """ + cdef KuduPredicate* clone + + # We clone the KuduPredicate so that the Predicate wrapper class can be + # reused + clone = pred.pred.Clone() + check_status(self.scanner.AddConjunctPredicate(clone)) + + def set_fault_tolerant(self): + """ + Makes the underlying KuduScanner fault tolerant. + Returns a reference to itself to facilitate chaining. 
+ + Returns + ------- + self : Scanner + """ + check_status(self.scanner.SetFaultTolerant()) + return self + + def open(self): + """ + Returns a reference to itself to facilitate chaining + + Returns + ------- + self : Scanner + """ + if not self.is_open: + check_status(self.scanner.Open()) + self.is_open = 1 + return self + + def has_more_rows(self): + """ + Returns True if there are more rows to be read. + """ + return self.scanner.HasMoreRows() + + def read_all_tuples(self): + """ + Compute a RowBatch containing all rows from the scan operation (which + hopefully fit into memory, probably not handled gracefully at the + moment). + """ + cdef list tuples = [] + cdef RowBatch batch + + self.ensure_open() + + while self.has_more_rows(): + batch = self.next_batch() + tuples.extend(batch.as_tuples()) + + return tuples + + def read_next_batch_tuples(self): + return self.next_batch().as_tuples() + + cdef RowBatch next_batch(self): + """ + Retrieve the next batch of rows from the scanner. + + Returns + ------- + batch : RowBatch + """ + if not self.has_more_rows(): + raise StopIteration + + cdef RowBatch batch = RowBatch(self.table) + check_status(self.scanner.NextBatch(&batch.rows)) + return batch + + +cdef class KuduError: + + """ + Wrapper for a C++ KuduError indicating a client error resulting from + applying operations in a session. 
+ """ + + cdef: + C_KuduError* error + + def __cinit__(self): + self.error = NULL + + def __dealloc__(self): + # We own this object + if self.error != NULL: + del self.error + + def failed_op(self): + raise NotImplementedError + + def __repr__(self): + return "KuduError('%s')" % (self.error.status().ToString()) + + +cdef class WriteOperation: + cdef: + Table table + KuduPartialRow* row + bint applied + + def __cinit__(self, Table table): + # This gets called before any subclass cinit methods + self.table = table + self.applied = 0 + + def __setitem__(self, key, value): + if isinstance(key, basestring): + self.set_field(key, value) + else: + self.set_loc(key, value) + + cpdef set_field(self, key, value): + cdef: + int i = self.table.schema.get_loc(key) + DataType t = self.table.schema.loc_type(i) + cdef Slice* slc + + # Leave it to Cython to do the coercion and complain if it doesn't + # work. Cython will catch many casting problems but we should verify + # with unit tests. + if t == KUDU_BOOL: + self.row.SetBool(i, value) + elif t == KUDU_INT8: + self.row.SetInt8(i, value) + elif t == KUDU_INT16: + self.row.SetInt16(i, value) + elif t == KUDU_INT32: + self.row.SetInt32(i, value) + elif t == KUDU_INT64: + self.row.SetInt64(i, value) + elif t == KUDU_FLOAT: + self.row.SetFloat(i, value) + elif t == KUDU_DOUBLE: + self.row.SetDouble(i, value) + elif t == KUDU_STRING: + if not cpython.PyBytes_Check(value): + value = value.encode('utf8') + + # TODO: It would be much better not to heap-allocate a Slice object + slc = new Slice(cpython.PyBytes_AsString(value)) + + # Not safe to take a reference to PyBytes data for now + self.row.SetStringCopy(i, deref(slc)) + del slc + + cpdef set_loc(self, int i, value): + pass + + cpdef set_field_null(self, key): + pass + + cpdef set_loc_null(self, int i): + pass + + cdef add_to_session(self, Session s): + pass + + +cdef class Insert(WriteOperation): + cdef: + KuduInsert* op + + def __cinit__(self, Table table): + self.op = 
self.table.ptr().NewInsert() + self.row = self.op.mutable_row() + + def __dealloc__(self): + del self.op + + cdef add_to_session(self, Session s): + if self.applied: + raise Exception + + check_status(s.s.get().Apply(self.op)) + self.op = NULL + self.applied = 1 + + +cdef class Update(WriteOperation): + cdef: + KuduUpdate* op + + def __cinit__(self, Table table): + self.table = table + self.op = table.ptr().NewUpdate() + self.row = self.op.mutable_row() + + def __dealloc__(self): + del self.op + + cdef add_to_session(self, Session s): + pass + + +cdef class Delete(WriteOperation): + cdef: + KuduDelete* op + + def __cinit__(self, Table table): + self.table = table + self.op = table.ptr().NewDelete() + self.row = self.op.mutable_row() + + def __dealloc__(self): + del self.op + + cdef add_to_session(self, Session s): + if self.applied: + raise Exception + + check_status(s.s.get().Apply(self.op)) + self.applied = 1 + self.op = NULL + + + +cdef inline cast_pyvalue(DataType t, object o): + if t == KUDU_BOOL: + return BoolVal(o) + elif t == KUDU_INT8: + return Int8Val(o) + elif t == KUDU_INT16: + return Int16Val(o) + elif t == KUDU_INT32: + return Int32Val(o) + elif t == KUDU_INT64: + return Int64Val(o) + elif t == KUDU_DOUBLE: + return DoubleVal(o) + elif t == KUDU_FLOAT: + return FloatVal(o) + elif t == KUDU_STRING: + return StringVal(o) + else: + raise TypeError(t) diff --git a/python/kudu/compat.py b/python/kudu/compat.py new file mode 100644 index 000000000000..2ac41ac8abf8 --- /dev/null +++ b/python/kudu/compat.py @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +import itertools + +import numpy as np + +import sys +import six +from six import BytesIO, StringIO, string_types as py_string + + +PY26 = sys.version_info[:2] == (2, 6) +PY2 = sys.version_info[0] == 2 + + +if PY26: + import unittest2 as unittest +else: + import unittest + + +if PY2: + import cPickle + + try: + from cdecimal import Decimal + except ImportError: + from decimal import Decimal + + unicode_type = unicode + lzip = zip + zip = itertools.izip + + def dict_values(x): + return x.values() + + range = xrange + long = long + + def tobytes(o): + if isinstance(o, unicode): + return o.encode('utf8') + else: + return o + + def frombytes(o): + return o +else: + unicode_type = str + def lzip(*x): + return list(zip(*x)) + long = int + zip = zip + def dict_values(x): + return list(x.values()) + from decimal import Decimal + range = range + + def tobytes(o): + if isinstance(o, str): + return o.encode('utf8') + else: + return o + + def frombytes(o): + return o.decode('utf8') + + +integer_types = six.integer_types + (np.integer,) diff --git a/python/kudu/errors.pxd b/python/kudu/errors.pxd new file mode 100644 index 000000000000..12cf13be43d7 --- /dev/null +++ b/python/kudu/errors.pxd @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from libkudu_client cimport * + +cdef check_status(const Status& status) diff --git a/python/kudu/errors.pyx b/python/kudu/errors.pyx new file mode 100644 index 000000000000..747d6200a295 --- /dev/null +++ b/python/kudu/errors.pyx @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class KuduException(Exception): + pass + + +class KuduBadStatus(KuduException): + """ + A Kudu C++ client operation returned an error Status + """ + pass + + +class KuduNotFound(KuduBadStatus): + pass + + +class KuduNotSupported(KuduBadStatus): + pass + + +class KuduInvalidArgument(KuduBadStatus): + pass + + +class KuduNotAuthorized(KuduBadStatus): + pass + + +class KuduAborted(KuduBadStatus): + pass + + +cdef check_status(const Status& status): + if status.ok(): + return + + cdef string c_message = status.message().ToString() + + if status.IsNotFound(): + raise KuduNotFound(c_message) + elif status.IsNotSupported(): + raise KuduNotSupported(c_message) + elif status.IsInvalidArgument(): + raise KuduInvalidArgument(c_message) + else: + raise KuduBadStatus(status.ToString()) diff --git a/python/kudu/libkudu_client.pxd b/python/kudu/libkudu_client.pxd new file mode 100644 index 000000000000..43557c13340a --- /dev/null +++ b/python/kudu/libkudu_client.pxd @@ -0,0 +1,607 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# distutils: language = c++ + +from libc.stdint cimport * +from libcpp cimport bool as c_bool +from libcpp.string cimport string +from libcpp.vector cimport vector + +# This must be included for cerr and other things to work +cdef extern from "": + pass + +#---------------------------------------------------------------------- +# Smart pointers and such + +cdef extern from "kudu/client/shared_ptr.h" namespace "kudu::client::sp" nogil: + + cdef cppclass shared_ptr[T]: + T* get() + void reset() + void reset(T* p) + +cdef extern from "kudu/util/status.h" namespace "kudu" nogil: + + # We can later add more of the common status factory methods as needed + cdef Status Status_OK "Status::OK"() + + cdef cppclass Status: + Status() + + string ToString() + + Slice message() + + c_bool ok() + c_bool IsNotFound() + c_bool IsCorruption() + c_bool IsNotSupported() + c_bool IsIOError() + c_bool IsInvalidArgument() + c_bool IsAlreadyPresent() + c_bool IsRuntimeError() + c_bool IsNetworkError() + c_bool IsIllegalState() + c_bool IsNotAuthorized() + c_bool IsAborted() + + +cdef extern from "kudu/util/monotime.h" namespace "kudu" nogil: + + # These classes are not yet needed directly but will need to be completed + # from the C++ API + cdef cppclass MonoDelta: + MonoDelta() + + @staticmethod + MonoDelta FromSeconds(double seconds) + + @staticmethod + MonoDelta FromMilliseconds(int64_t ms) + + @staticmethod + MonoDelta FromMicroseconds(int64_t us) + + @staticmethod + MonoDelta FromNanoseconds(int64_t ns) + + c_bool Initialized() + c_bool LessThan(const MonoDelta& other) + c_bool MoreThan(const MonoDelta& other) + c_bool Equals(const MonoDelta& other) + + string ToString() + + double ToSeconds() + int64_t ToMilliseconds() + int64_t ToMicroseconds() + int64_t ToNanoseconds() + + # TODO, when needed + # void ToTimeVal(struct timeval *tv) + # void ToTimeSpec(struct timespec *ts) + + # @staticmethod + # void NanosToTimeSpec(int64_t nanos, struct timespec* ts); + + + cdef cppclass 
MonoTime: + pass + + +cdef extern from "kudu/client/schema.h" namespace "kudu::client" nogil: + + enum DataType" kudu::client::KuduColumnSchema::DataType": + KUDU_INT8 " kudu::client::KuduColumnSchema::INT8" + KUDU_INT16 " kudu::client::KuduColumnSchema::INT16" + KUDU_INT32 " kudu::client::KuduColumnSchema::INT32" + KUDU_INT64 " kudu::client::KuduColumnSchema::INT64" + KUDU_STRING " kudu::client::KuduColumnSchema::STRING" + KUDU_BOOL " kudu::client::KuduColumnSchema::BOOL" + KUDU_FLOAT " kudu::client::KuduColumnSchema::FLOAT" + KUDU_DOUBLE " kudu::client::KuduColumnSchema::DOUBLE" + KUDU_BINARY " kudu::client::KuduColumnSchema::BINARY" + KUDU_TIMESTAMP " kudu::client::KuduColumnSchema::TIMESTAMP" + + enum EncodingType" kudu::client::KuduColumnStorageAttributes::EncodingType": + EncodingType_AUTO " kudu::client::KuduColumnStorageAttributes::AUTO_ENCODING" + EncodingType_PLAIN " kudu::client::KuduColumnStorageAttributes::PLAIN_ENCODING" + EncodingType_PREFIX " kudu::client::KuduColumnStorageAttributes::PREFIX_ENCODING" + EncodingType_GROUP_VARINT " kudu::client::KuduColumnStorageAttributes::GROUP_VARINT" + EncodingType_RLE " kudu::client::KuduColumnStorageAttributes::RLE" + + enum CompressionType" kudu::client::KuduColumnStorageAttributes::CompressionType": + CompressionType_DEFAULT " kudu::client::KuduColumnStorageAttributes::DEFAULT_COMPRESSION" + CompressionType_NONE " kudu::client::KuduColumnStorageAttributes::NO_COMPRESSION" + CompressionType_SNAPPY " kudu::client::KuduColumnStorageAttributes::SNAPPY" + CompressionType_LZ4 " kudu::client::KuduColumnStorageAttributes::LZ4" + CompressionType_ZLIB " kudu::client::KuduColumnStorageAttributes::ZLIB" + + cdef struct KuduColumnStorageAttributes: + KuduColumnStorageAttributes() + + EncodingType encoding + CompressionType compression + string ToString() + + cdef cppclass KuduColumnSchema: + KuduColumnSchema(const KuduColumnSchema& other) + KuduColumnSchema(const string& name, DataType type) + KuduColumnSchema(const 
string& name, DataType type, c_bool is_nullable) + KuduColumnSchema(const string& name, DataType type, c_bool is_nullable, + const void* default_value) + + string& name() + c_bool is_nullable() + DataType type() + + c_bool Equals(KuduColumnSchema& other) + void CopyFrom(KuduColumnSchema& other) + + cdef cppclass KuduSchema: + KuduSchema() + KuduSchema(vector[KuduColumnSchema]& columns, int key_columns) + + c_bool Equals(const KuduSchema& other) + KuduColumnSchema Column(size_t idx) + size_t num_columns() + + void GetPrimaryKeyColumnIndexes(vector[int]* indexes) + + KuduPartialRow* NewRow() + + cdef cppclass KuduColumnSpec: + + KuduColumnSpec* Default(KuduValue* value) + KuduColumnSpec* RemoveDefault() + + KuduColumnSpec* Compression(CompressionType compression) + KuduColumnSpec* Encoding(EncodingType encoding) + KuduColumnSpec* BlockSize(int32_t block_size) + + KuduColumnSpec* PrimaryKey() + KuduColumnSpec* NotNull() + KuduColumnSpec* Nullable() + KuduColumnSpec* Type(DataType type_) + + KuduColumnSpec* RenameTo(string& new_name) + + + cdef cppclass KuduSchemaBuilder: + + KuduColumnSpec* AddColumn(string& name) + KuduSchemaBuilder* SetPrimaryKey(vector[string]& key_col_names); + + Status Build(KuduSchema* schema) + + +cdef extern from "kudu/client/row_result.h" namespace "kudu::client" nogil: + + cdef cppclass KuduRowResult: + c_bool IsNull(Slice& col_name) + c_bool IsNull(int col_idx) + + # These getters return a bad Status if the type does not match, + # the value is unset, or the value is NULL. Otherwise they return + # the current set value in *val. 
+ Status GetBool(Slice& col_name, c_bool* val) + + Status GetInt8(Slice& col_name, int8_t* val) + Status GetInt16(Slice& col_name, int16_t* val) + Status GetInt32(Slice& col_name, int32_t* val) + Status GetInt64(Slice& col_name, int64_t* val) + + Status GetTimestamp(const Slice& col_name, + int64_t* micros_since_utc_epoch) + + Status GetBool(int col_idx, c_bool* val) + + Status GetInt8(int col_idx, int8_t* val) + Status GetInt16(int col_idx, int16_t* val) + Status GetInt32(int col_idx, int32_t* val) + Status GetInt64(int col_idx, int64_t* val) + + Status GetString(Slice& col_name, Slice* val) + Status GetString(int col_idx, Slice* val) + + Status GetFloat(Slice& col_name, float* val) + Status GetFloat(int col_idx, float* val) + + Status GetDouble(Slice& col_name, double* val) + Status GetDouble(int col_idx, double* val) + + Status GetBinary(const Slice& col_name, Slice* val) + Status GetBinary(int col_idx, Slice* val) + + const void* cell(int col_idx) + string ToString() + + +cdef extern from "kudu/util/slice.h" namespace "kudu" nogil: + + cdef cppclass Slice: + Slice() + Slice(const uint8_t* data, size_t n) + Slice(const char* data, size_t n) + + Slice(string& s) + Slice(const char* s) + + # Many other constructors have been omitted; we can return and add them + # as needed for the code generation. 
+ + const uint8_t* data() + uint8_t* mutable_data() + size_t size() + c_bool empty() + + uint8_t operator[](size_t n) + + void clear() + void remove_prefix(size_t n) + void truncate(size_t n) + + Status check_size(size_t expected_size) + + string ToString() + + string ToDebugString() + string ToDebugString(size_t max_len) + + int compare(Slice& b) + + c_bool starts_with(Slice& x) + + void relocate(uint8_t* d) + + # Many other API methods omitted + + +cdef extern from "kudu/common/partial_row.h" namespace "kudu" nogil: + + cdef cppclass KuduPartialRow: + # Schema must not be garbage-collected + # KuduPartialRow(const Schema* schema) + + #---------------------------------------------------------------------- + # Setters + + # Slice setters + Status SetBool(Slice& col_name, c_bool val) + + Status SetInt8(Slice& col_name, int8_t val) + Status SetInt16(Slice& col_name, int16_t val) + Status SetInt32(Slice& col_name, int32_t val) + Status SetInt64(Slice& col_name, int64_t val) + + Status SetTimestamp(const Slice& col_name, + int64_t micros_since_utc_epoch) + Status SetTimestamp(int col_idx, int64_t micros_since_utc_epoch) + + Status SetDouble(Slice& col_name, double val) + Status SetFloat(Slice& col_name, float val) + + # Integer setters + Status SetBool(int col_idx, c_bool val) + + Status SetInt8(int col_idx, int8_t val) + Status SetInt16(int col_idx, int16_t val) + Status SetInt32(int col_idx, int32_t val) + Status SetInt64(int col_idx, int64_t val) + + Status SetDouble(int col_idx, double val) + Status SetFloat(int col_idx, float val) + + # Set, but does not copy string + Status SetString(Slice& col_name, Slice& val) + Status SetString(int col_idx, Slice& val) + + Status SetStringCopy(Slice& col_name, Slice& val) + Status SetStringCopy(int col_idx, Slice& val) + + Status SetBinaryCopy(const Slice& col_name, const Slice& val) + Status SetBinaryCopy(int col_idx, const Slice& val) + + Status SetNull(Slice& col_name) + Status SetNull(int col_idx) + + Status Unset(Slice& 
col_name) + Status Unset(int col_idx) + + #---------------------------------------------------------------------- + # Getters + + c_bool IsColumnSet(Slice& col_name) + c_bool IsColumnSet(int col_idx) + + c_bool IsNull(Slice& col_name) + c_bool IsNull(int col_idx) + + Status GetBool(Slice& col_name, c_bool* val) + Status GetBool(int col_idx, c_bool* val) + + Status GetInt8(Slice& col_name, int8_t* val) + Status GetInt8(int col_idx, int8_t* val) + + Status GetInt16(Slice& col_name, int16_t* val) + Status GetInt16(int col_idx, int16_t* val) + + Status GetInt32(Slice& col_name, int32_t* val) + Status GetInt32(int col_idx, int32_t* val) + + Status GetInt64(Slice& col_name, int64_t* val) + Status GetInt64(int col_idx, int64_t* val) + + Status GetTimestamp(const Slice& col_name, + int64_t* micros_since_utc_epoch) + Status GetTimestamp(int col_idx, int64_t* micros_since_utc_epoch) + + Status GetDouble(Slice& col_name, double* val) + Status GetDouble(int col_idx, double* val) + + Status GetFloat(Slice& col_name, float* val) + Status GetFloat(int col_idx, float* val) + + # Gets the string but does not copy the value. Callers should + # copy the resulting Slice if necessary. + Status GetString(Slice& col_name, Slice* val) + Status GetString(int col_idx, Slice* val) + + Status GetBinary(const Slice& col_name, Slice* val) + Status GetBinary(int col_idx, Slice* val) + + Status EncodeRowKey(string* encoded_key) + string ToEncodedRowKeyOrDie() + + # Return true if all of the key columns have been specified + # for this mutation. + c_bool IsKeySet() + + # Return true if all columns have been specified. 
+ c_bool AllColumnsSet() + string ToString() + + # const Schema* schema() + + +cdef extern from "kudu/client/write_op.h" namespace "kudu::client" nogil: + + enum WriteType" kudu::client::KuduWriteOperation::Type": + INSERT " kudu::client::KuduWriteOperation::INSERT" + UPDATE " kudu::client::KuduWriteOperation::UPDATE" + DELETE " kudu::client::KuduWriteOperation::DELETE" + + cdef cppclass KuduWriteOperation: + KuduPartialRow& row() + KuduPartialRow* mutable_row() + + # This is a pure virtual function implemented on each of the cppclass + # subclasses + string ToString() + + # Also a pure virtual + WriteType type() + + cdef cppclass KuduInsert(KuduWriteOperation): + pass + + cdef cppclass KuduDelete(KuduWriteOperation): + pass + + cdef cppclass KuduUpdate(KuduWriteOperation): + pass + + +cdef extern from "kudu/client/scan_predicate.h" namespace "kudu::client" nogil: + enum ComparisonOp" kudu::client::KuduPredicate::ComparisonOp": + KUDU_LESS_EQUAL " kudu::client::KuduPredicate::LESS_EQUAL" + KUDU_GREATER_EQUAL " kudu::client::KuduPredicate::GREATER_EQUAL" + KUDU_EQUAL " kudu::client::KuduPredicate::EQUAL" + + cdef cppclass KuduPredicate: + KuduPredicate* Clone() + + +cdef extern from "kudu/client/value.h" namespace "kudu::client" nogil: + + cdef cppclass KuduValue: + @staticmethod + KuduValue* FromInt(int64_t val); + + @staticmethod + KuduValue* FromFloat(float val); + + @staticmethod + KuduValue* FromDouble(double val); + + @staticmethod + KuduValue* FromBool(c_bool val); + + @staticmethod + KuduValue* CopyString(const Slice& s); + + +cdef extern from "kudu/client/client.h" namespace "kudu::client" nogil: + + # Omitted KuduClient::ReplicaSelection enum + + cdef cppclass KuduClient: + + Status DeleteTable(const string& table_name) + Status OpenTable(const string& table_name, + shared_ptr[KuduTable]* table) + Status GetTableSchema(const string& table_name, KuduSchema* schema) + + KuduTableCreator* NewTableCreator() + Status IsCreateTableInProgress(const string& 
table_name, + c_bool* create_in_progress) + + c_bool IsMultiMaster() + + Status ListTables(vector[string]* tables) + Status ListTables(vector[string]* tables, const string& filter) + + Status TableExists(const string& table_name, c_bool* exists) + + KuduTableAlterer* NewTableAlterer() + Status IsAlterTableInProgress(const string& table_name, + c_bool* alter_in_progress) + + shared_ptr[KuduSession] NewSession() + + cdef cppclass KuduClientBuilder: + KuduClientBuilder() + KuduClientBuilder& master_server_addrs(const vector[string]& addrs) + KuduClientBuilder& add_master_server_addr(const string& addr) + + KuduClientBuilder& default_admin_operation_timeout( + const MonoDelta& timeout) + + KuduClientBuilder& default_rpc_timeout(const MonoDelta& timeout) + + Status Build(shared_ptr[KuduClient]* client) + + cdef cppclass KuduTableCreator: + KuduTableCreator& table_name(string& name) + KuduTableCreator& schema(KuduSchema* schema) + KuduTableCreator& split_keys(vector[string]& keys) + KuduTableCreator& num_replicas(int n_replicas) + KuduTableCreator& wait(c_bool wait) + + Status Create() + + cdef cppclass KuduTableAlterer: + # The name of the existing table to alter + KuduTableAlterer& table_name(string& name) + + KuduTableAlterer& rename_table(string& name) + + KuduTableAlterer& add_column(string& name, DataType type, + const void *default_value) + KuduTableAlterer& add_column(string& name, DataType type, + const void *default_value, + KuduColumnStorageAttributes attr) + + KuduTableAlterer& add_nullable_column(string& name, DataType type) + + KuduTableAlterer& drop_column(string& name) + + KuduTableAlterer& rename_column(string& old_name, string& new_name) + + KuduTableAlterer& wait(c_bool wait) + + Status Alter() + + # Instances of KuduTable are not directly instantiated by users of the + # client. 
+ cdef cppclass KuduTable: + + string& name() + KuduSchema& schema() + + KuduInsert* NewInsert() + KuduUpdate* NewUpdate() + KuduDelete* NewDelete() + + KuduPredicate* NewComparisonPredicate(const Slice& col_name, + ComparisonOp op, + KuduValue* value); + + KuduClient* client() + # const PartitionSchema& partition_schema() + + enum FlushMode" kudu::client::KuduSession::FlushMode": + FlushMode_AutoSync " kudu::client::KuduSession::AUTO_FLUSH_SYNC" + FlushMode_AutoBackground " kudu::client::KuduSession::AUTO_FLUSH_BACKGROUND" + FlushMode_Manual " kudu::client::KuduSession::MANUAL_FLUSH" + + cdef cppclass KuduSession: + + Status SetFlushMode(FlushMode m) + + void SetMutationBufferSpace(size_t size) + void SetTimeoutMillis(int millis) + + void SetPriority(int priority) + + Status Apply(KuduWriteOperation* write_op) + Status Apply(KuduInsert* write_op) + Status Apply(KuduUpdate* write_op) + Status Apply(KuduDelete* write_op) + + # This is thread-safe + Status Flush() + + # TODO: Will need to decide on a strategy for exposing the session's + # async API to Python + + # Status ApplyAsync(KuduWriteOperation* write_op, + # KuduStatusCallback cb) + # Status ApplyAsync(KuduInsert* write_op, + # KuduStatusCallback cb) + # Status ApplyAsync(KuduUpdate* write_op, + # KuduStatusCallback cb) + # Status ApplyAsync(KuduDelete* write_op, + # KuduStatusCallback cb) + # void FlushAsync(KuduStatusCallback& cb) + + + Status Close() + c_bool HasPendingOperations() + int CountBufferedOperations() + + int CountPendingErrors() + void GetPendingErrors(vector[C_KuduError*]* errors, c_bool* overflowed) + + KuduClient* client() + + enum ReadMode" kudu::client::KuduScanner::ReadMode": + READ_LATEST " kudu::client::KuduScanner::READ_LATEST" + READ_AT_SNAPSHOT " kudu::client::KuduScanner::READ_AT_SNAPSHOT" + + cdef cppclass KuduScanner: + KuduScanner(KuduTable* table) + + Status AddConjunctPredicate(KuduPredicate* pred) + + Status Open() + void Close() + + c_bool HasMoreRows() + Status 
NextBatch(vector[KuduRowResult]* rows) + Status SetBatchSizeBytes(uint32_t batch_size) + + # Pending definition of ReplicaSelection enum + # Status SetSelection(ReplicaSelection selection) + + Status SetReadMode(ReadMode read_mode) + Status SetSnapshot(uint64_t snapshot_timestamp_micros) + Status SetTimeoutMillis(int millis) + Status SetFaultTolerant() + + string ToString() + + cdef cppclass C_KuduError " kudu::client::KuduError": + + Status& status() + + KuduWriteOperation& failed_op() + KuduWriteOperation* release_failed_op() + + c_bool was_possibly_successful() diff --git a/python/kudu/schema.pxd b/python/kudu/schema.pxd new file mode 100644 index 000000000000..b70f8ad738ed --- /dev/null +++ b/python/kudu/schema.pxd @@ -0,0 +1,59 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from libcpp.map cimport map + +from libkudu_client cimport * + + +cdef class KuduType(object): + cdef readonly: + DataType type + + +cdef class ColumnSchema: + """ + Wraps a Kudu client ColumnSchema object + """ + cdef: + KuduColumnSchema* schema + KuduType _type + + +cdef class ColumnSpec: + cdef: + KuduColumnSpec* spec + + +cdef class SchemaBuilder: + cdef: + KuduSchemaBuilder builder + + +cdef class Schema: + cdef: + const KuduSchema* schema + object parent + bint own_schema + map[string, int] _col_mapping + bint _mapping_initialized + + cdef int get_loc(self, name) except -1 + + cdef inline DataType loc_type(self, int i): + return self.schema.Column(i).type() diff --git a/python/kudu/schema.pyx b/python/kudu/schema.pyx new file mode 100644 index 000000000000..f02d0e43259b --- /dev/null +++ b/python/kudu/schema.pyx @@ -0,0 +1,545 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ +# cython: embedsignature = True + +from cython.operator cimport dereference as deref + +from kudu.compat import tobytes, frombytes +from kudu.schema cimport * +from kudu.errors cimport check_status + +import six + +from . 
import util + +BOOL = KUDU_BOOL +STRING = KUDU_STRING + +INT8 = KUDU_INT8 +INT16 = KUDU_INT16 +INT32 = KUDU_INT32 +INT64 = KUDU_INT64 + +FLOAT = KUDU_FLOAT +DOUBLE = KUDU_DOUBLE + +TIMESTAMP = KUDU_TIMESTAMP +BINARY = KUDU_BINARY + + +cdef dict _reverse_dict(d): + return dict((v, k) for k, v in d.items()) + + +# CompressionType enums +COMPRESSION_DEFAULT = CompressionType_DEFAULT +COMPRESSION_NONE = CompressionType_NONE +COMPRESSION_SNAPPY = CompressionType_SNAPPY +COMPRESSION_LZ4 = CompressionType_LZ4 +COMPRESSION_ZLIB = CompressionType_ZLIB + +cdef dict _compression_types = { + 'default': COMPRESSION_DEFAULT, + 'none': COMPRESSION_NONE, + 'snappy': COMPRESSION_SNAPPY, + 'lz4': COMPRESSION_LZ4, + 'zlib': COMPRESSION_ZLIB, +} + +cdef dict _compression_type_to_name = _reverse_dict(_compression_types) + + +# EncodingType enums +ENCODING_AUTO = EncodingType_AUTO +ENCODING_PLAIN = EncodingType_PLAIN +ENCODING_PREFIX = EncodingType_PREFIX +ENCODING_GROUP_VARINT = EncodingType_GROUP_VARINT +ENCODING_RLE = EncodingType_RLE + +cdef dict _encoding_types = { + 'auto': ENCODING_AUTO, + 'plain': ENCODING_PLAIN, + 'prefix': ENCODING_PREFIX, + 'group_varint': ENCODING_GROUP_VARINT, + 'rle': ENCODING_RLE, +} + +cdef dict _encoding_type_to_name = _reverse_dict(_encoding_types) + + +cdef class KuduType(object): + + """ + Usability wrapper for Kudu data type enum + """ + + def __cinit__(self, DataType type): + self.type = type + + property name: + + def __get__(self): + return _type_names[self.type] + + def __repr__(self): + return 'KuduType({0})'.format(self.name) + + +int8 = KuduType(KUDU_INT8) +int16 = KuduType(KUDU_INT16) +int32 = KuduType(KUDU_INT32) +int64 = KuduType(KUDU_INT64) +string_ = KuduType(KUDU_STRING) +bool_ = KuduType(KUDU_BOOL) +float_ = KuduType(KUDU_FLOAT) +double_ = KuduType(KUDU_DOUBLE) +binary = KuduType(KUDU_BINARY) +timestamp = KuduType(KUDU_TIMESTAMP) + + +cdef dict _type_names = { + INT8: 'int8', + INT16: 'int16', + INT32: 'int32', + INT64: 'int64', + 
STRING: 'string', + BOOL: 'bool', + FLOAT: 'float', + DOUBLE: 'double', + BINARY: 'binary', + TIMESTAMP: 'timestamp' +} + + +cdef dict _type_name_to_number = _reverse_dict(_type_names) + +cdef dict _type_to_obj = { + INT8: int8, + INT16: int16, + INT32: int32, + INT64: int64, + STRING: string_, + BOOL: bool_, + FLOAT: float_, + DOUBLE: double_, + BINARY: binary, + TIMESTAMP: timestamp +} + + +cdef KuduType to_data_type(object obj): + if isinstance(obj, KuduType): + return obj + elif isinstance(obj, six.string_types): + return _type_to_obj[_type_name_to_number[obj]] + elif obj in _type_to_obj: + return _type_to_obj[obj] + else: + raise ValueError('Invalid type: {0}'.format(obj)) + + +cdef class ColumnSchema: + """ + Wraps a Kudu client ColumnSchema object. Use schema.at(i) or schema[i] to + construct one. + """ + + def __cinit__(self): + self.schema = NULL + self._type = None + + def __dealloc__(self): + if self.schema is not NULL: + del self.schema + + property name: + def __get__(self): + return frombytes(self.schema.name()) + + property type: + def __get__(self): + if self._type is None: + self._type = _type_to_obj[self.schema.type()] + return self._type + + property nullable: + def __get__(self): + return self.schema.is_nullable() + + def equals(self, other): + if not isinstance(other, ColumnSchema): + return False + return self.schema.Equals(deref(( other).schema)) + + def __repr__(self): + return ('ColumnSchema(name=%s, type=%s, nullable=%s)' + % (self.name, self.type.name, + self.nullable)) + + +#---------------------------------------------------------------------- + +cdef class ColumnSpec: + + """ + Helper class for configuring a column's settings while using the + SchemaBuilder. + """ + + def type(self, type_): + self.spec.Type(to_data_type(type_).type) + return self + + def default(self, value): + """ + Set a default value for the column + """ + raise NotImplementedError + + def clear_default(self): + """ + Remove a default value set. 
+ """ + raise NotImplementedError + + def compression(self, compression): + """ + Set the compression type + + Parameters + ---------- + compression : string or int + One of {'default', 'none', 'snappy', 'lz4', 'zlib'} + Or see kudu.COMPRESSION_* constants + + Returns + ------- + self + """ + cdef CompressionType type + if isinstance(compression, int): + # todo: validation + type = compression + else: + if compression is None: + type = CompressionType_NONE + else: + try: + type = _compression_types[compression.lower()] + except KeyError: + raise ValueError('Invalid compression type: {0}' + .format(compression)) + + self.spec.Compression(type) + return self + + def encoding(self, encoding): + """ + Set the encoding type + + Parameters + ---------- + encoding : string or int + One of {'auto', 'plain', 'prefix', 'group_varint', 'rle'} + Or see kudu.ENCODING_* constants + + Returns + ------- + self + """ + cdef EncodingType type + if isinstance(encoding, six.string_types): + try: + type = _encoding_types[encoding.lower()] + except KeyError: + raise ValueError('Invalid encoding type: {0}' + .format(encoding)) + else: + # todo: validation + type = encoding + + self.spec.Encoding(type) + return self + + def primary_key(self): + """ + Make this column a primary key. If you use this method, it will be the + only primary key. Otherwise see set_primary_keys method on + SchemaBuilder. + + Returns + ------- + self + """ + self.spec.PrimaryKey() + return self + + def nullable(self, bint is_nullable=True): + """ + Set nullable (True) or not nullable (False) + + Parameters + ---------- + is_nullable : boolean, default True + + Returns + ------- + self + """ + if is_nullable: + self.spec.Nullable() + else: + self.spec.NotNull() + return self + + def rename(self, new_name): + """ + Change the column name. 
+ + TODO: Not implemented for table creation + """ + self.spec.RenameTo(new_name) + return self + + +cdef class SchemaBuilder: + + def add_column(self, name, type_=None, nullable=None, compression=None, + encoding=None, primary_key=False): + """ + Add a new column to the schema. Returns a ColumnSpec object for further + configuration and use in a fluid programming style. + + Parameters + ---------- + name : string + type_ : string or KuduType + Data type e.g. 'int32' or kudu.int32 + nullable : boolean, default None + New columns are nullable by default. Set boolean value for explicit + nullable / not-nullable + compression : string or int + One of {'default', 'none', 'snappy', 'lz4', 'zlib'} + Or see kudu.COMPRESSION_* constants + encoding : string or int + One of {'auto', 'plain', 'prefix', 'group_varint', 'rle'} + Or see kudu.ENCODING_* constants + primary_key : boolean, default False + Use this column as the table primary key + + Examples + -------- + (builder.add_column('foo') + .nullable(True) + .compression('lz4')) + + Returns + ------- + spec : ColumnSpec + """ + cdef: + ColumnSpec result = ColumnSpec() + string c_name = tobytes(name) + + result.spec = self.builder.AddColumn(c_name) + + if type_ is not None: + result.type(type_) + + if nullable is not None: + result.nullable(nullable) + + if compression is not None: + result.compression(compression) + + if encoding is not None: + result.encoding(encoding) + + if primary_key: + result.primary_key() + + return result + + def set_primary_keys(self, key_names): + """ + Set indicated columns (by name) to be the primary keys of the table + schema + + Parameters + ---------- + key_names : list of Python strings + + Returns + ------- + None + """ + cdef: + vector[string] key_col_names + + for name in key_names: + key_col_names.push_back(tobytes(name)) + + self.builder.SetPrimaryKey(key_col_names) + + def build(self): + """ + Creates an immutable Schema object after the user has finished adding + and onfiguring 
columns + + Returns + ------- + schema : Schema + """ + cdef Schema result = Schema() + cdef KuduSchema* schema = new KuduSchema() + check_status(self.builder.Build(schema)) + + result.schema = schema + return result + + +cdef class Schema: + + """ + Container for a Kudu table schema. Obtain from Table instances or create + new ones using kudu.SchemaBuilder + """ + + def __cinit__(self): + # Users should not call this directly + self.schema = NULL + self.own_schema = 1 + self._col_mapping.clear() + self._mapping_initialized = 0 + + def __dealloc__(self): + if self.schema is not NULL and self.own_schema: + del self.schema + + property names: + + def __get__(self): + result = [] + for i in range(self.schema.num_columns()): + name = frombytes(self.schema.Column(i).name()) + result.append(name) + + return result + + def __repr__(self): + # Got to be careful with huge schemas, maybe some kind of summary repr + # when more than 20-30 columns? + buf = six.StringIO() + + col_names = self.names + space = 2 + max(len(x) for x in col_names) + + for i in range(len(self)): + col = self.at(i) + not_null = '' if col.nullable else ' NOT NULL' + + buf.write('\n{0}{1}{2}' + .format(col.name.ljust(space), + col.type.name, not_null)) + + pk_string = ', '.join(col_names[i] for i in self.primary_key_indices()) + buf.write('\nPRIMARY KEY ({0})'.format(pk_string)) + + return "kudu.Schema {{{0}\n}}".format(util.indent(buf.getvalue(), 2)) + + def __len__(self): + return self.schema.num_columns() + + def __getitem__(self, key): + if isinstance(key, six.string_types): + key = self.get_loc(key) + + if key < 0: + key += len(self) + return self.at(key) + + def equals(self, Schema other): + """ + Returns True if the table schemas are equal + """ + return self.schema.Equals(deref(other.schema)) + + cdef int get_loc(self, name) except -1: + if not self._mapping_initialized: + for i in range(self.schema.num_columns()): + self._col_mapping[self.schema.Column(i).name()] = i + self._mapping_initialized 
= 1 + + name = tobytes(name) + + # TODO: std::map is slightly verbose and inefficient here (O(lg n) + # lookups), may consider replacing with a better / different hash table + # should it become a performance bottleneck + cdef map[string, int].iterator it = self._col_mapping.find(name) + if it == self._col_mapping.end(): + raise KeyError(name) + return self._col_mapping[name] + + def at(self, size_t i): + """ + Return the ColumnSchema for a column index. Analogous to schema[i]. + + Returns + ------- + col_schema : ColumnSchema + """ + cdef ColumnSchema result = ColumnSchema() + + if i < 0 or i >= self.schema.num_columns(): + raise IndexError('Column index {0} is not in range' + .format(i)) + + result.schema = new KuduColumnSchema(self.schema.Column(i)) + + return result + + def primary_key_indices(self): + """ + Return the indices of the columns used as primary keys + + Returns + ------- + key_indices : list[int] + """ + cdef: + vector[int] indices + size_t i + + self.schema.GetPrimaryKeyColumnIndexes(&indices) + + result = [] + for i in range(indices.size()): + result.append(indices[i]) + return result + + def primary_keys(self): + """ + Return the names of the columns used as primary keys + + Returns + ------- + key_names : list[str] + """ + indices = self.primary_key_indices() + return [self.at(i).name for i in indices] diff --git a/python/kudu/tests/__init__.py b/python/kudu/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/kudu/tests/common.py b/python/kudu/tests/common.py new file mode 100644 index 000000000000..ba012609aa47 --- /dev/null +++ b/python/kudu/tests/common.py @@ -0,0 +1,153 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import division + +import json +import fnmatch +import os +import shutil +import subprocess +import tempfile +import time + +import kudu + + +class KuduTestBase(object): + + """ + Base test class that will start a configurable number of master and + tablet servers. + """ + + BASE_PORT = 37000 + NUM_TABLET_SERVERS = 3 + + @classmethod + def start_cluster(cls): + local_path = tempfile.mkdtemp(dir=os.getenv("TEST_TMPDIR")) + kudu_build = os.getenv("KUDU_BUILD") + if not kudu_build: + kudu_build = os.path.join(os.getenv("KUDU_HOME"), "build", "latest") + bin_path = "{0}/bin".format(kudu_build) + + os.makedirs("{0}/master/".format(local_path)) + os.makedirs("{0}/master/data".format(local_path)) + os.makedirs("{0}/master/logs".format(local_path)) + + path = [ + "{0}/kudu-master".format(bin_path), + "-rpc_server_allow_ephemeral_ports", + "-rpc_bind_addresses=0.0.0.0:0", + "-fs_wal_dir={0}/master/data".format(local_path), + "-fs_data_dirs={0}/master/data".format(local_path), + "-log_dir={0}/master/logs".format(local_path), + "-logtostderr", + "-webserver_port=0", + # Only make one replica so that our tests don't need to worry about + # setting consistency modes. 
+ "-default_num_replicas=1", + "-server_dump_info_path={0}/master/config.json".format(local_path) + ] + + p = subprocess.Popen(path, shell=False) + fid = open("{0}/master/kudu-master.pid".format(local_path), "w+") + fid.write("{0}".format(p.pid)) + fid.close() + + # We have to wait for the master to settle before the config file + # appears + config_file = "{0}/master/config.json".format(local_path) + for i in range(30): + if os.path.exists(config_file): + break + time.sleep(0.1 * (i + 1)) + else: + raise Exception("Could not find kudu-master config file") + + # If the server was started get the bind port from the config dump + master_config = json.load(open("{0}/master/config.json" + .format(local_path), "r")) + # One master bound on local host + master_port = master_config["bound_rpc_addresses"][0]["port"] + + for m in range(cls.NUM_TABLET_SERVERS): + os.makedirs("{0}/ts/{1}".format(local_path, m)) + os.makedirs("{0}/ts/{1}/logs".format(local_path, m)) + + path = [ + "{0}/kudu-tserver".format(bin_path), + "-rpc_server_allow_ephemeral_ports", + "-rpc_bind_addresses=0.0.0.0:0", + "-tserver_master_addrs=127.0.0.1:{0}".format(master_port), + "-webserver_port=0", + "-log_dir={0}/master/logs".format(local_path), + "-logtostderr", + "-fs_data_dirs={0}/ts/{1}/data".format(local_path, m), + "-fs_wal_dir={0}/ts/{1}/data".format(local_path, m), + ] + p = subprocess.Popen(path, shell=False) + tserver_pid = "{0}/ts/{1}/kudu-tserver.pid".format(local_path, m) + fid = open(tserver_pid, "w+") + fid.write("{0}".format(p.pid)) + fid.close() + + return local_path, master_port + + @classmethod + def stop_cluster(cls, path): + for root, dirnames, filenames in os.walk('{0}/..'.format(path)): + for filename in fnmatch.filter(filenames, '*.pid'): + with open(os.path.join(root, filename)) as fid: + a = fid.read() + r = subprocess.Popen(["kill", "{0}".format(a)]) + r.wait() + os.remove(os.path.join(root, filename)) + shutil.rmtree(path, True) + + @classmethod + def setUpClass(cls): + 
cls.cluster_path, master_port = cls.start_cluster() + time.sleep(1) + + cls.master_host = '127.0.0.1' + cls.master_port = master_port + + cls.client = kudu.connect(cls.master_host, cls.master_port) + + cls.schema = cls.example_schema() + + cls.ex_table = 'example-table' + if cls.client.table_exists(cls.ex_table): + cls.client.delete_table(cls.ex_table) + cls.client.create_table(cls.ex_table, cls.schema) + + @classmethod + def tearDownClass(cls): + cls.stop_cluster(cls.cluster_path) + + @classmethod + def example_schema(cls): + builder = kudu.schema_builder() + builder.add_column('key', kudu.int32, nullable=False) + builder.add_column('int_val', kudu.int32) + builder.add_column('string_val', kudu.string) + builder.set_primary_keys(['key']) + + return builder.build() diff --git a/python/kudu/tests/test_client.py b/python/kudu/tests/test_client.py new file mode 100644 index 000000000000..4636b3fb4710 --- /dev/null +++ b/python/kudu/tests/test_client.py @@ -0,0 +1,189 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from kudu.compat import unittest, long +from kudu.tests.common import KuduTestBase +import kudu + + +class TestClient(KuduTestBase, unittest.TestCase): + + def setUp(self): + pass + + def test_table_basics(self): + table = self.client.table(self.ex_table) + + self.assertEqual(table.name, self.ex_table) + self.assertEqual(table.num_columns, len(self.schema)) + + def test_table_column(self): + table = self.client.table(self.ex_table) + col = table['key'] + + assert col.name == b'key' + assert col.parent is table + + result_repr = repr(col) + expected_repr = ('Column(key, parent={0}, type=int32)' + .format(self.ex_table)) + assert result_repr == expected_repr + + def test_table_schema_retains_reference(self): + import gc + + table = self.client.table(self.ex_table) + schema = table.schema + table = None + + gc.collect() + repr(schema) + + def test_table_exists(self): + self.assertFalse(self.client.table_exists('nonexistent-table')) + self.assertTrue(self.client.table_exists(self.ex_table)) + + def test_list_tables(self): + schema = self.example_schema() + + to_create = ['foo1', 'foo2', 'foo3'] + for name in to_create: + self.client.create_table(name, schema) + + result = self.client.list_tables() + expected = [self.ex_table] + to_create + assert sorted(result) == expected + + result = self.client.list_tables('foo') + assert sorted(result) == to_create + + for name in to_create: + self.client.delete_table(name) + + def test_is_multimaster(self): + assert not self.client.is_multimaster + + def test_delete_table(self): + name = "peekaboo" + self.client.create_table(name, self.schema) + self.client.delete_table(name) + assert not self.client.table_exists(name) + + # Should raise a more meaningful exception at some point + with self.assertRaises(kudu.KuduNotFound): + self.client.delete_table(name) + + def test_table_nonexistent(self): + self.assertRaises(kudu.KuduNotFound, self.client.table, + '__donotexist__') + + def test_insert_nonexistent_field(self): + table = 
self.client.table(self.ex_table) + op = table.new_insert() + self.assertRaises(KeyError, op.__setitem__, 'doesntexist', 12) + + def test_insert_rows_and_delete(self): + nrows = 100 + table = self.client.table(self.ex_table) + session = self.client.new_session() + for i in range(nrows): + op = table.new_insert() + op['key'] = i + op['int_val'] = i * 2 + op['string_val'] = 'hello_%d' % i + session.apply(op) + + # Cannot apply the same insert twice, C++ client does not indicate an + # error + self.assertRaises(Exception, session.apply, op) + + # synchronous + session.flush() + + scanner = table.scanner().open() + assert len(scanner.read_all_tuples()) == nrows + + # Delete the rows we just wrote + for i in range(nrows): + op = table.new_delete() + op['key'] = i + session.apply(op) + session.flush() + + scanner = table.scanner().open() + assert len(scanner.read_all_tuples()) == 0 + + def test_session_auto_open(self): + table = self.client.table(self.ex_table) + scanner = table.scanner() + result = scanner.read_all_tuples() + assert len(result) == 0 + + def test_session_open_idempotent(self): + table = self.client.table(self.ex_table) + scanner = table.scanner().open().open() + result = scanner.read_all_tuples() + assert len(result) == 0 + + def test_session_flush_modes(self): + self.client.new_session(flush_mode=kudu.FLUSH_MANUAL) + self.client.new_session(flush_mode=kudu.FLUSH_AUTO_SYNC) + + self.client.new_session(flush_mode='manual') + self.client.new_session(flush_mode='sync') + + with self.assertRaises(kudu.KuduNotSupported): + self.client.new_session(flush_mode=kudu.FLUSH_AUTO_BACKGROUND) + + with self.assertRaises(kudu.KuduNotSupported): + self.client.new_session(flush_mode='background') + + with self.assertRaises(ValueError): + self.client.new_session(flush_mode='foo') + + def test_connect_timeouts(self): + # it works! 
is there any other way to check this?
+ +from __future__ import division + +from kudu.compat import unittest +from kudu.tests.common import KuduTestBase +import kudu + + +class TestScanner(KuduTestBase, unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(TestScanner, cls).setUpClass() + + cls.nrows = 100 + table = cls.client.table(cls.ex_table) + session = cls.client.new_session() + + tuples = [] + for i in range(cls.nrows): + op = table.new_insert() + tup = i, i * 2, 'hello_%d' % i if i % 2 == 0 else None + op['key'] = tup[0] + op['int_val'] = tup[1] + if i % 2 == 0: + op['string_val'] = tup[2] + session.apply(op) + tuples.append(tup) + session.flush() + + cls.table = table + cls.tuples = tuples + + @classmethod + def tearDownClass(cls): + pass + + def setUp(self): + pass + + def test_scan_rows_basic(self): + # Let's scan with no predicates + scanner = self.table.scanner().open() + + tuples = scanner.read_all_tuples() + self.assertEqual(sorted(tuples), self.tuples) + + def test_scan_rows_simple_predicate(self): + key = self.table['key'] + preds = [key >= 20, key <= 49] + + def _read_predicates(preds): + scanner = self.table.scanner() + scanner.add_predicates(preds) + scanner.open() + return scanner.read_all_tuples() + + tuples = _read_predicates(preds) + self.assertEqual(sorted(tuples), self.tuples[20:50]) + + # verify predicates reusable + tuples = _read_predicates(preds) + self.assertEqual(sorted(tuples), self.tuples[20:50]) + + def test_scan_rows_string_predicate(self): + scanner = self.table.scanner() + + sv = self.table['string_val'] + + scanner.add_predicates([sv >= 'hello_20', + sv <= 'hello_22']) + + scanner.set_fault_tolerant() + scanner.open() + + tuples = scanner.read_all_tuples() + + self.assertEqual(sorted(tuples), [(20, 40, 'hello_20'), (22, 44, 'hello_22')]) + + def test_scan_invalid_predicates(self): + scanner = self.table.scanner() + sv = self.table['string_val'] + + with self.assertRaises(TypeError): + scanner.add_predicates([sv >= None]) + + with 
self.assertRaises(kudu.KuduInvalidArgument): + scanner.add_predicates([sv >= 1]) diff --git a/python/kudu/tests/test_schema.py b/python/kudu/tests/test_schema.py new file mode 100644 index 000000000000..b75c61db646d --- /dev/null +++ b/python/kudu/tests/test_schema.py @@ -0,0 +1,182 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import division + +from kudu.compat import unittest +import kudu + + +class TestSchema(unittest.TestCase): + + def setUp(self): + self.columns = [('one', 'int32', False), + ('two', 'int8', False), + ('three', 'double', True), + ('four', 'string', False)] + + self.primary_keys = ['one', 'two'] + + self.builder = kudu.schema_builder() + for name, typename, nullable in self.columns: + self.builder.add_column(name, typename, nullable=nullable) + + self.builder.set_primary_keys(self.primary_keys) + self.schema = self.builder.build() + + def test_repr(self): + result = repr(self.schema) + for name, _, _ in self.columns: + assert name in result + + assert 'PRIMARY KEY (one, two)' in result + + def test_schema_length(self): + assert len(self.schema) == 4 + + def test_names(self): + assert self.schema.names == ['one', 'two', 'three', 'four'] + + def test_primary_keys(self): + assert self.schema.primary_key_indices() == [0, 1] + assert self.schema.primary_keys() == ['one', 'two'] + + def test_getitem_boundschecking(self): + with self.assertRaises(IndexError): + self.schema[4] + + def test_getitem_wraparound(self): + # wraparound + result = self.schema[-1] + expected = self.schema[3] + + assert result.equals(expected) + + def test_getitem_string(self): + result = self.schema['three'] + expected = self.schema[2] + + assert result.equals(expected) + + with self.assertRaises(KeyError): + self.schema['not_found'] + + def test_schema_equals(self): + assert self.schema.equals(self.schema) + + builder = kudu.schema_builder() + builder.add_column('key', 'int64', nullable=False, primary_key=True) + schema = builder.build() + + assert not self.schema.equals(schema) + + def test_column_equals(self): + assert not self.schema[0].equals(self.schema[1]) + + def test_type(self): + builder = kudu.schema_builder() + (builder.add_column('key') + .type('int32') + .primary_key() + .nullable(False)) + schema = builder.build() + + tp = schema[0].type + assert tp.name == 'int32' + 
assert tp.type == kudu.schema.INT32 + + def test_compression(self): + builder = kudu.schema_builder() + builder.add_column('key', 'int64', nullable=False) + + foo = builder.add_column('foo', 'string').compression('lz4') + assert foo is not None + + bar = builder.add_column('bar', 'string') + bar.compression(kudu.COMPRESSION_ZLIB) + + with self.assertRaises(ValueError): + bar = builder.add_column('qux', 'string', compression='unknown') + + builder.set_primary_keys(['key']) + builder.build() + + # TODO; The C++ client does not give us an API to see the storage + # attributes of a column + + def test_encoding(self): + builder = kudu.schema_builder() + builder.add_column('key', 'int64', nullable=False) + + foo = builder.add_column('foo', 'string').encoding('rle') + assert foo is not None + + bar = builder.add_column('bar', 'string') + bar.encoding(kudu.ENCODING_PLAIN) + + with self.assertRaises(ValueError): + builder.add_column('qux', 'string', encoding='unknown') + + builder.set_primary_keys(['key']) + builder.build() + # TODO(wesm): The C++ client does not give us an API to see the storage + # attributes of a column + + def test_set_column_spec_pk(self): + builder = kudu.schema_builder() + key = (builder.add_column('key', 'int64', nullable=False) + .primary_key()) + assert key is not None + schema = builder.build() + assert 'key' in schema.primary_keys() + + builder = kudu.schema_builder() + key = (builder.add_column('key', 'int64', nullable=False, + primary_key=True)) + schema = builder.build() + assert 'key' in schema.primary_keys() + + def test_partition_schema(self): + pass + + def test_nullable_not_null(self): + builder = kudu.schema_builder() + (builder.add_column('key', 'int64', nullable=False) + .primary_key()) + + builder.add_column('data1', 'double').nullable(True) + builder.add_column('data2', 'double').nullable(False) + builder.add_column('data3', 'double', nullable=True) + builder.add_column('data4', 'double', nullable=False) + + schema = builder.build() 
+ + assert not schema[0].nullable + assert schema[1].nullable + assert not schema[2].nullable + + assert schema[3].nullable + assert not schema[4].nullable + + def test_default_value(self): + pass + + def test_column_schema_repr(self): + result = repr(self.schema[0]) + expected = 'ColumnSchema(name=one, type=int32, nullable=False)' + self.assertEqual(result, expected) diff --git a/python/kudu/util.py b/python/kudu/util.py new file mode 100644 index 000000000000..a2b65cffad07 --- /dev/null +++ b/python/kudu/util.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +def indent(text, spaces): + block = ' ' * spaces + return '\n'.join(block + x for x in text.split('\n')) diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 000000000000..ef646fa036df --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,6 @@ +pytest +numpy>=1.7.0 +cython >= 0.21 +setuptools >= 0.8 +six +unittest2 diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 000000000000..9af7e6f11bb0 --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,2 @@ +[aliases] +test=pytest \ No newline at end of file diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 000000000000..993dace37b71 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from Cython.Distutils import build_ext +from Cython.Build import cythonize +import Cython + +import sys +from setuptools import setup +from distutils.command.clean import clean as _clean +from distutils.extension import Extension +import os + +if Cython.__version__ < '0.19.1': + raise Exception('Please upgrade to Cython 0.19.1 or newer') + +MAJOR = 0 +MINOR = 1 +MICRO = 1 +VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) +ISRELEASED = True + +setup_dir = os.path.abspath(os.path.dirname(__file__)) + + +def write_version_py(filename=os.path.join(setup_dir, 'kudu/version.py')): + version = VERSION + if not ISRELEASED: + version += '.dev' + + a = open(filename, 'w') + file_content = "\n".join(["", + "# THIS FILE IS GENERATED FROM SETUP.PY", + "version = '%(version)s'", + "isrelease = '%(isrelease)s'"]) + + a.write(file_content % {'version': VERSION, + 'isrelease': str(ISRELEASED)}) + a.close() + + +class clean(_clean): + def run(self): + _clean.run(self) + for x in ['kudu/client.cpp', 'kudu/schema.cpp', + 'kudu/errors.cpp']: + try: + os.remove(x) + except OSError: + pass + + +# If we're in the context of the Kudu git repository, build against the +# latest in-tree build artifacts +if 'KUDU_HOME' in os.environ: + kudu_home = os.environ['KUDU_HOME'] + sys.stderr.write("Using KUDU_HOME directory: %s\n" % (kudu_home,)) + if not os.path.isdir(kudu_home): + sys.stderr.write("%s is not a valid KUDU_HOME directory" % (kudu_home,)) + sys.exit(1) + + kudu_include_dirs = [os.path.join(kudu_home, 'src')] + + if 'KUDU_BUILD' in os.environ: + kudu_build = os.environ['KUDU_BUILD'] + sys.stderr.write("Using KUDU_BUILD directory: %s\n" % (kudu_build,)) + else: + kudu_build = os.path.join(kudu_home, 'build', 'latest') + sys.stderr.write("Using inferred KUDU_BUILD directory: %s/\n" % (kudu_build,)) + if not os.path.isdir(kudu_build): + sys.stderr.write("%s is not a valid KUDU_BUILD directory" % (kudu_build,)) + sys.exit(1) + + kudu_include_dirs.append(os.path.join(kudu_build, 'src')) + 
kudu_lib_dir = os.path.join(kudu_build, 'lib', 'exported') +else: + if os.path.exists("/usr/local/include/kudu"): + prefix = "/usr/local" + elif os.path.exists("/usr/include/kudu"): + prefix = "/usr" + else: + sys.stderr.write("Cannot find installed kudu client.\n") + sys.exit(1) + sys.stderr.write("Building from system prefix {0}\n".format(prefix)) + kudu_include_dirs = [prefix + "/include"] + kudu_lib_dir = prefix + "/lib" + +INCLUDE_PATHS = kudu_include_dirs +LIBRARY_DIRS = [kudu_lib_dir] +RT_LIBRARY_DIRS = LIBRARY_DIRS + +ext_submodules = ['client', 'errors', 'schema'] + +extensions = [] + +for submodule_name in ext_submodules: + ext = Extension('kudu.{0}'.format(submodule_name), + ['kudu/{0}.pyx'.format(submodule_name)], + libraries=['kudu_client'], + # Disable the 'new' gcc5 ABI; see the top-level + # CMakeLists.txt for details. + define_macros=[('_GLIBCXX_USE_CXX11_ABI', '0')], + include_dirs=INCLUDE_PATHS, + library_dirs=LIBRARY_DIRS, + runtime_library_dirs=RT_LIBRARY_DIRS) + extensions.append(ext) + +extensions = cythonize(extensions) + +write_version_py() + +LONG_DESCRIPTION = open(os.path.join(setup_dir, "README.md")).read() +DESCRIPTION = "Python interface to the Apache Kudu (incubating) C++ Client API" + +CLASSIFIERS = [ + 'Development Status :: 3 - Alpha', + 'Environment :: Console', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Cython' +] + +URL = 'http://getkudu.io' + +setup( + name="kudu-python", + packages=['kudu', 'kudu.tests'], + version=VERSION, + package_data={'kudu': ['*.pxd', '*.pyx']}, + ext_modules=extensions, + cmdclass={ + 'clean': clean, + 'build_ext': build_ext + }, + setup_requires=['pytest-runner'], + tests_require=['pytest'], + install_requires=['cython >= 0.21'], + description=DESCRIPTION, + 
long_description=LONG_DESCRIPTION, + license='Apache License, Version 2.0', + classifiers=CLASSIFIERS, + maintainer="Apache Kudu (incubating) team", + maintainer_email="dev@kudu.incubator.apache.org", + url=URL, + test_suite="kudu.tests" +) diff --git a/src/kudu/benchmarks/CMakeLists.txt b/src/kudu/benchmarks/CMakeLists.txt new file mode 100644 index 000000000000..dce704707d75 --- /dev/null +++ b/src/kudu/benchmarks/CMakeLists.txt @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(TPCH_SRCS + tpch/rpc_line_item_dao.cc +) + +add_library(tpch ${TPCH_SRCS}) +target_link_libraries(tpch + kudu_client + integration-tests) + +# tpch1 +add_executable(tpch1 tpch/tpch1.cc) +target_link_libraries(tpch1 + tpch + ${KUDU_TEST_LINK_LIBS}) + +# tpch_real_world +add_executable(tpch_real_world tpch/tpch_real_world.cc) +target_link_libraries(tpch_real_world + tpch + ${KUDU_TEST_LINK_LIBS}) + +# rle +add_executable(rle rle.cc) +target_link_libraries(rle + kudu_util + ${KUDU_TEST_LINK_LIBS}) + +# wal_hiccup +# Disabled on OS X since it relies on fdatasync and sync_file_range. 
+if(NOT APPLE) + add_executable(wal_hiccup wal_hiccup.cc) + target_link_libraries(wal_hiccup + kudu_util + ${KUDU_TEST_LINK_LIBS}) +endif() + +# Tests +set(KUDU_TEST_LINK_LIBS tpch ${KUDU_TEST_LINK_LIBS}) +ADD_KUDU_TEST(tpch/rpc_line_item_dao-test) diff --git a/src/kudu/benchmarks/bin/parse_rpc_bench.sh b/src/kudu/benchmarks/bin/parse_rpc_bench.sh new file mode 100755 index 000000000000..63b1fdaab3cc --- /dev/null +++ b/src/kudu/benchmarks/bin/parse_rpc_bench.sh @@ -0,0 +1,47 @@ +#!/bin/bash -e +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Little script to parse the output from rpc-bench. 
+ + +FILE=$1 +if [[ -z $FILE || $FILE == "-h" || $FILE == "--help" ]]; then + echo "Usage: $0 rpc-bench-output.log" + echo + echo 'Example:' + echo + echo '$ cd $KUDU_HOME' + echo '$ BUILD_TYPE=RELEASE ./build-support/jenkins/build-and-test.sh' + echo '$ KUDU_ALLOW_SLOW_TESTS=1 ./build/latest/bin/rpc-bench --gtest_repeat=10 2>&1 | tee rpc-bench-output.log' + echo '$ ./src/kudu/benchmarks/bin/parse_rpc_bench.sh rpc-bench-output.log' + echo + echo 'Example output:' + echo + echo 'Reqs/sec: runs=10, avg=146661.6, max=147649' + echo 'User CPU per req: runs=10, avg=16.3004, max=16.4745' + echo 'Sys CPU per req: runs=10, avg=29.25029, max=29.5273' + echo + exit 1 +fi + +# Just some hacky one-liners to parse and summarize the output files. +# Don't forget to redirect stderr to stdout when teeing the rpc-bench output to the log file! +perl -ne '/ (Reqs\/sec):\s+(\d+(?:\.(?:\d+)?)?)/ or next; $lab = $1; $m = $2 if $2 > $m; $v += $2; $ct++; END { print "$lab: runs=$ct, avg=" . $v/$ct . ", max=$m\n"; }' < $FILE +perl -ne '/ (User CPU per req):\s+(\d+(?:\.(?:\d+)?)?)/ or next; $lab = $1; $m = $2 if $2 > $m; $v += $2; $ct++; END { print "$lab: runs=$ct, avg=" . $v/$ct . ", max=$m\n"; }' < $FILE +perl -ne '/ (Sys CPU per req):\s+(\d+(?:\.(?:\d+)?)?)/ or next; $lab = $1; $m = $2 if $2 > $m; $v += $2; $ct++; END { print "$lab: runs=$ct, avg=" . $v/$ct . ", max=$m\n"; }' < $FILE diff --git a/src/kudu/benchmarks/rle.cc b/src/kudu/benchmarks/rle.cc new file mode 100644 index 000000000000..cb2c6f273b5b --- /dev/null +++ b/src/kudu/benchmarks/rle.cc @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Micro benchmark for writing/reading bit streams and Kudu specific +// run-length encoding (RLE) APIs. Currently only covers booleans and +// the most performance sensitive APIs. NB: Impala contains a RLE +// micro benchmark (rle-benchmark.cc). +// + +#include +#include + +#include "kudu/gutil/mathlimits.h" +#include "kudu/util/bit-stream-utils.h" +#include "kudu/util/logging.h" +#include "kudu/util/rle-encoding.h" +#include "kudu/util/stopwatch.h" + +DEFINE_int32(bitstream_num_bytes, 1 * 1024 * 1024, + "Number of bytes worth of bits to write and read from the bitstream"); + +namespace kudu { + +// Measure writing and reading single-bit streams +void BooleanBitStream() { + faststring buffer(FLAGS_bitstream_num_bytes); + BitWriter writer(&buffer); + + // Write alternating strings of repeating 0's and 1's + for (int i = 0; i < FLAGS_bitstream_num_bytes; ++i) { + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + writer.PutValue(i % 2, 1); + } + writer.Flush(); + + LOG(INFO) << "Wrote " << writer.bytes_written() << " bytes"; + + BitReader reader(buffer.data(), writer.bytes_written()); + for (int i = 0; i < FLAGS_bitstream_num_bytes; ++i) { + bool val; + reader.GetValue(1, &val); + reader.GetValue(1, &val); + reader.GetValue(1, &val); + reader.GetValue(1, &val); + reader.GetValue(1, &val); + reader.GetValue(1, &val); + reader.GetValue(1, &val); + reader.GetValue(1, &val); + } +} + +// Measure 
bulk puts and decoding runs of RLE bools +void BooleanRLE() { + const int num_iters = 3 * 1024; + + faststring buffer(45 * 1024); + RleEncoder encoder(&buffer, 1); + + for (int i = 0; i < num_iters; i++) { + encoder.Put(false, 100 * 1024); + encoder.Put(true, 3); + encoder.Put(false, 3); + encoder.Put(true, 213 * 1024); + encoder.Put(false, 300); + encoder.Put(true, 8); + encoder.Put(false, 4); + } + + LOG(INFO) << "Wrote " << encoder.len() << " bytes"; + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + bool val = false; + size_t run_length; + for (int i = 0; i < num_iters; i++) { + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + } +} + +} // namespace kudu + +int main(int argc, char **argv) { + FLAGS_logtostderr = 1; + google::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + + LOG_TIMING(INFO, "BooleanBitStream") { + kudu::BooleanBitStream(); + } + + LOG_TIMING(INFO, "BooleanRLE") { + kudu::BooleanRLE(); + } + + return 0; +} diff --git a/src/kudu/benchmarks/tpch/line_item_tsv_importer.h b/src/kudu/benchmarks/tpch/line_item_tsv_importer.h new file mode 100644 index 000000000000..6045286f9527 --- /dev/null +++ b/src/kudu/benchmarks/tpch/line_item_tsv_importer.h @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TPCH_LINE_ITEM_TSV_IMPORTER_H +#define KUDU_TPCH_LINE_ITEM_TSV_IMPORTER_H + +#include +#include +#include +#include + +#include "kudu/benchmarks/tpch/tpch-schemas.h" +#include "kudu/common/partial_row.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/util/status.h" + +namespace kudu { + +static const char* const kPipeSeparator = "|"; + +// Utility class used to parse the lineitem tsv file +class LineItemTsvImporter { + public: + explicit LineItemTsvImporter(const string &path) : in_(path.c_str()), + updated_(false) { + CHECK(in_.is_open()) << "not able to open input file: " << path; + } + + bool HasNextLine() { + if (!updated_) { + done_ = !getline(in_, line_); + updated_ = true; + } + return !done_; + } + + // Fills the row builder with a single line item from the file. + // It returns 0 if it's done or the order number if it got a line + int GetNextLine(KuduPartialRow* row) { + if (!HasNextLine()) return 0; + columns_.clear(); + + // grab all the columns_ individually + // Note that columns_ refers, and does not copy, the data in line_ + columns_ = strings::Split(line_, kPipeSeparator); + + // The row copies all indirect data from columns_. This must be done + // because callers expect to retrieve lines repeatedly before flushing + // the accumulated rows in a batch. 
+ int i = 0; + int order_number = ConvertToInt64AndPopulate(columns_[i++], row, tpch::kOrderKeyColIdx); + ConvertToIntAndPopulate(columns_[i++], row, tpch::kPartKeyColIdx); + ConvertToIntAndPopulate(columns_[i++], row, tpch::kSuppKeyColIdx); + ConvertToIntAndPopulate(columns_[i++], row, tpch::kLineNumberColIdx); + ConvertToIntAndPopulate(columns_[i++], row, tpch::kQuantityColIdx); + ConvertToDoubleAndPopulate(columns_[i++], row, tpch::kExtendedPriceColIdx); + ConvertToDoubleAndPopulate(columns_[i++], row, tpch::kDiscountColIdx); + ConvertToDoubleAndPopulate(columns_[i++], row, tpch::kTaxColIdx); + CHECK_OK(row->SetStringCopy(tpch::kReturnFlagColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kLineStatusColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kShipDateColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kCommitDateColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kReceiptDateColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kShipInstructColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kShipModeColIdx, columns_[i++])); + CHECK_OK(row->SetStringCopy(tpch::kCommentColIdx, columns_[i++])); + + updated_ = false; + + return order_number; + } + + private: + int ConvertToInt64AndPopulate(const StringPiece &chars, KuduPartialRow* row, + int col_idx) { + // TODO: extra copy here, since we don't have a way to parse StringPiece + // into ints. + chars.CopyToString(&tmp_); + int64_t number; + bool ok_parse = safe_strto64(tmp_.c_str(), &number); + CHECK(ok_parse) << "Bad integer in column " << col_idx + << ": '" << tmp_ << "'"; + CHECK_OK(row->SetInt64(col_idx, number)); + return number; + } + + int ConvertToIntAndPopulate(const StringPiece &chars, KuduPartialRow* row, + int col_idx) { + // TODO: extra copy here, since we don't have a way to parse StringPiece + // into ints. 
+ chars.CopyToString(&tmp_); + int number; + bool ok_parse = SimpleAtoi(tmp_.c_str(), &number); + CHECK(ok_parse) << "Bad integer in column " << col_idx + << ": '" << tmp_ << "'"; + CHECK_OK(row->SetInt32(col_idx, number)); + return number; + } + + void ConvertToDoubleAndPopulate(const StringPiece &chars, KuduPartialRow* row, + int col_idx) { + // TODO: extra copy here, since we don't have a way to parse StringPiece + // into ints. + chars.CopyToString(&tmp_); + char *error = NULL; + errno = 0; + const char *cstr = tmp_.c_str(); + double number = strtod(cstr, &error); + CHECK(errno == 0 && // overflow/underflow happened + error != cstr) << "Bad double in column " << col_idx + << ": '" << tmp_ << "': errno=" << errno; + CHECK_OK(row->SetDouble(col_idx, number)); + } + std::ifstream in_; + vector columns_; + string line_, tmp_; + bool updated_, done_; +}; +} // namespace kudu +#endif diff --git a/src/kudu/benchmarks/tpch/rpc_line_item_dao-test.cc b/src/kudu/benchmarks/tpch/rpc_line_item_dao-test.cc new file mode 100644 index 000000000000..b6a2f8e575c8 --- /dev/null +++ b/src/kudu/benchmarks/tpch/rpc_line_item_dao-test.cc @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/benchmarks/tpch/rpc_line_item_dao.h" +#include "kudu/benchmarks/tpch/tpch-schemas.h" +#include "kudu/common/partial_row.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +using client::KuduRowResult; +using client::KuduSchema; +using std::string; +using std::vector; + +class RpcLineItemDAOTest : public KuduTest { + + public: + RpcLineItemDAOTest() {} + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + // Start minicluster + cluster_.reset(new MiniCluster(env_.get(), MiniClusterOptions())); + ASSERT_OK(cluster_->Start()); + + const char *kTableName = "tpch1"; + + // Create the table and Connect to it. + string master_address(cluster_->mini_master()->bound_rpc_addr_str()); + dao_.reset(new kudu::RpcLineItemDAO(master_address, kTableName, 5)); + dao_->Init(); + } + + virtual void TearDown() OVERRIDE { + cluster_->Shutdown(); + KuduTest::TearDown(); + } + + protected: + gscoped_ptr cluster_; + gscoped_ptr dao_; + + // Builds a test row to be inserted into the lineitem table. + // The row's ship_date is set such that it matches the TPCH Q1 predicate. 
+ static void BuildTestRow(int order, int line, KuduPartialRow* row) { + CHECK_OK(row->SetInt64(tpch::kOrderKeyColIdx, order)); + CHECK_OK(row->SetInt32(tpch::kLineNumberColIdx, line)); + CHECK_OK(row->SetInt32(tpch::kPartKeyColIdx, 12345)); + CHECK_OK(row->SetInt32(tpch::kSuppKeyColIdx, 12345)); + CHECK_OK(row->SetInt32(tpch::kQuantityColIdx, 12345)); + CHECK_OK(row->SetDouble(tpch::kExtendedPriceColIdx, 123.45)); + CHECK_OK(row->SetDouble(tpch::kDiscountColIdx, 123.45)); + CHECK_OK(row->SetDouble(tpch::kTaxColIdx, 123.45)); + CHECK_OK(row->SetStringCopy(tpch::kReturnFlagColIdx, StringPrintf("hello %d", line))); + CHECK_OK(row->SetStringCopy(tpch::kLineStatusColIdx, StringPrintf("hello %d", line))); + CHECK_OK(row->SetStringCopy(tpch::kShipDateColIdx, Slice("1985-07-15"))); + CHECK_OK(row->SetStringCopy(tpch::kCommitDateColIdx, Slice("1985-11-13"))); + CHECK_OK(row->SetStringCopy(tpch::kReceiptDateColIdx, Slice("1985-11-13"))); + CHECK_OK(row->SetStringCopy(tpch::kShipInstructColIdx, StringPrintf("hello %d", line))); + CHECK_OK(row->SetStringCopy(tpch::kShipModeColIdx, StringPrintf("hello %d", line))); + CHECK_OK(row->SetStringCopy(tpch::kCommentColIdx, StringPrintf("hello %d", line))); + } + + static void UpdateTestRow(int key, int line_number, int quantity, KuduPartialRow* row) { + CHECK_OK(row->SetInt64(tpch::kOrderKeyColIdx, key)); + CHECK_OK(row->SetInt32(tpch::kLineNumberColIdx, line_number)); + CHECK_OK(row->SetInt32(tpch::kQuantityColIdx, quantity)); + } + + int CountRows() { + gscoped_ptr scanner; + dao_->OpenScanner(vector(), &scanner); + vector rows; + int count = 0; + while (scanner->HasMore()) { + scanner->GetNext(&rows); + count += rows.size(); + } + return count; + } + + void ScanTpch1RangeToStrings(int64_t min_orderkey, int64_t max_orderkey, + vector* str_rows) { + str_rows->clear(); + gscoped_ptr scanner; + dao_->OpenTpch1ScannerForOrderKeyRange(min_orderkey, max_orderkey, + &scanner); + vector rows; + while (scanner->HasMore()) { + 
scanner->GetNext(&rows); + for (const KuduRowResult& row : rows) { + str_rows->push_back(row.ToString()); + } + } + std::sort(str_rows->begin(), str_rows->end()); + } +}; // class RpcLineItemDAOTest + +TEST_F(RpcLineItemDAOTest, TestInsert) { + dao_->WriteLine(boost::bind(BuildTestRow, 1, 1, _1)); + dao_->FinishWriting(); + ASSERT_EQ(1, CountRows()); + for (int i = 2; i < 10; i++) { + for (int y = 0; y < 5; y++) { + dao_->WriteLine(boost::bind(BuildTestRow, i, y, _1)); + } + } + dao_->FinishWriting(); + ASSERT_EQ(41, CountRows()); + + vector rows; + ScanTpch1RangeToStrings(7, 7, &rows); + ASSERT_EQ(5, rows.size()); + ScanTpch1RangeToStrings(5, 7, &rows); + ASSERT_EQ(15, rows.size()); +} + +TEST_F(RpcLineItemDAOTest, TestUpdate) { + dao_->WriteLine(boost::bind(BuildTestRow, 1, 1, _1)); + dao_->FinishWriting(); + ASSERT_EQ(1, CountRows()); + + dao_->MutateLine(boost::bind(UpdateTestRow, 1, 1, 12345, _1)); + dao_->FinishWriting(); + gscoped_ptr scanner; + dao_->OpenScanner({ tpch::kQuantityColName }, &scanner); + vector rows; + while (scanner->HasMore()) { + scanner->GetNext(&rows); + for (const KuduRowResult& row : rows) { + int32_t l_quantity; + ASSERT_OK(row.GetInt32(0, &l_quantity)); + ASSERT_EQ(12345, l_quantity); + } + } +} + +} // namespace kudu diff --git a/src/kudu/benchmarks/tpch/rpc_line_item_dao.cc b/src/kudu/benchmarks/tpch/rpc_line_item_dao.cc new file mode 100644 index 000000000000..12d1c3f0a32c --- /dev/null +++ b/src/kudu/benchmarks/tpch/rpc_line_item_dao.cc @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/benchmarks/tpch/rpc_line_item_dao.h" +#include "kudu/client/callbacks.h" +#include "kudu/client/client.h" +#include "kudu/client/meta_cache.h" +#include "kudu/client/value.h" +#include "kudu/client/write_op.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/util/coding.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +DEFINE_bool(tpch_cache_blocks_when_scanning, true, + "Whether the scanners should cache the blocks that are read or not"); + +namespace kudu { + +using client::KuduInsert; +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduError; +using client::KuduPredicate; +using client::KuduRowResult; +using client::KuduScanner; +using client::KuduSchema; +using client::KuduSession; +using client::KuduStatusCallback; +using client::KuduStatusMemberCallback; +using client::KuduTableCreator; +using client::KuduUpdate; +using client::KuduValue; +using std::vector; + +namespace { + +class FlushCallback : public KuduStatusCallback { + public: + FlushCallback(client::sp::shared_ptr session, Semaphore* sem) + : session_(std::move(session)), + sem_(sem) { + sem_->Acquire(); + } + + virtual void Run(const Status& s) OVERRIDE { + BatchFinished(); + CHECK_OK(s); + sem_->Release(); + delete this; + } + + private: + void BatchFinished() { + int nerrs = session_->CountPendingErrors(); + if (nerrs) { + LOG(WARNING) << nerrs << " errors occured during 
last batch."; + vector errors; + ElementDeleter d(&errors); + bool overflow; + session_->GetPendingErrors(&errors, &overflow); + if (overflow) { + LOG(WARNING) << "Error overflow occured"; + } + for (KuduError* error : errors) { + LOG(WARNING) << "FAILED: " << error->failed_op().ToString(); + } + } + } + + client::sp::shared_ptr session_; + Semaphore *sem_; +}; + +} // anonymous namespace + +const Slice RpcLineItemDAO::kScanUpperBound = Slice("1998-09-02"); + +void RpcLineItemDAO::Init() { + const KuduSchema schema = tpch::CreateLineItemSchema(); + + CHECK_OK(KuduClientBuilder() + .add_master_server_addr(master_address_) + .default_rpc_timeout(timeout_) + .Build(&client_)); + Status s = client_->OpenTable(table_name_, &client_table_); + if (s.IsNotFound()) { + gscoped_ptr table_creator(client_->NewTableCreator()); + CHECK_OK(table_creator->table_name(table_name_) + .schema(&schema) + .num_replicas(1) + .split_rows(tablet_splits_) + .Create()); + CHECK_OK(client_->OpenTable(table_name_, &client_table_)); + } else { + CHECK_OK(s); + } + + session_ = client_->NewSession(); + session_->SetTimeoutMillis(timeout_.ToMilliseconds()); + CHECK_OK(session_->SetFlushMode(KuduSession::MANUAL_FLUSH)); +} + +void RpcLineItemDAO::WriteLine(boost::function f) { + gscoped_ptr insert(client_table_->NewInsert()); + f(insert->mutable_row()); + CHECK_OK(session_->Apply(insert.release())); + ++batch_size_; + FlushIfBufferFull(); +} + +void RpcLineItemDAO::FlushIfBufferFull() { + if (batch_size_ < batch_max_) return; + + batch_size_ = 0; + + // The callback object frees itself after it is invoked. 
+ session_->FlushAsync(new FlushCallback(session_, &semaphore_)); +} + +void RpcLineItemDAO::MutateLine(boost::function f) { + gscoped_ptr update(client_table_->NewUpdate()); + f(update->mutable_row()); + CHECK_OK(session_->Apply(update.release())); + ++batch_size_; + FlushIfBufferFull(); +} + +void RpcLineItemDAO::FinishWriting() { + FlushCallback* cb = new FlushCallback(session_, &semaphore_); + Status s = session_->Flush(); + + // Also deletes 'cb'. + cb->Run(s); +} + +void RpcLineItemDAO::OpenScanner(const vector& columns, + gscoped_ptr* out_scanner) { + vector preds; + OpenScanner(columns, preds, out_scanner); +} + +void RpcLineItemDAO::OpenScanner(const vector& columns, + const vector& preds, + gscoped_ptr* out_scanner) { + gscoped_ptr ret(new Scanner); + ret->scanner_.reset(new KuduScanner(client_table_.get())); + ret->scanner_->SetCacheBlocks(FLAGS_tpch_cache_blocks_when_scanning); + CHECK_OK(ret->scanner_->SetProjectedColumns(columns)); + for (KuduPredicate* pred : preds) { + CHECK_OK(ret->scanner_->AddConjunctPredicate(pred)); + } + CHECK_OK(ret->scanner_->Open()); + out_scanner->swap(ret); +} + +void RpcLineItemDAO::OpenTpch1Scanner(gscoped_ptr* out_scanner) { + vector preds; + preds.push_back(client_table_->NewComparisonPredicate( + tpch::kShipDateColName, KuduPredicate::LESS_EQUAL, + KuduValue::CopyString(kScanUpperBound))); + OpenScanner(tpch::GetTpchQ1QueryColumns(), preds, out_scanner); +} + +void RpcLineItemDAO::OpenTpch1ScannerForOrderKeyRange(int64_t min_key, int64_t max_key, + gscoped_ptr* out_scanner) { + vector preds; + preds.push_back(client_table_->NewComparisonPredicate( + tpch::kShipDateColName, KuduPredicate::LESS_EQUAL, + KuduValue::CopyString(kScanUpperBound))); + preds.push_back(client_table_->NewComparisonPredicate( + tpch::kOrderKeyColName, KuduPredicate::GREATER_EQUAL, + KuduValue::FromInt(min_key))); + preds.push_back(client_table_->NewComparisonPredicate( + tpch::kOrderKeyColName, KuduPredicate::LESS_EQUAL, + 
KuduValue::FromInt(max_key))); + OpenScanner(tpch::GetTpchQ1QueryColumns(), preds, out_scanner); +} + +bool RpcLineItemDAO::Scanner::HasMore() { + bool has_more = scanner_->HasMoreRows(); + if (!has_more) { + scanner_->Close(); + } + return has_more; +} + +void RpcLineItemDAO::Scanner::GetNext(vector *rows) { + CHECK_OK(scanner_->NextBatch(rows)); +} + +bool RpcLineItemDAO::IsTableEmpty() { + KuduScanner scanner(client_table_.get()); + CHECK_OK(scanner.Open()); + return !scanner.HasMoreRows(); +} + +RpcLineItemDAO::~RpcLineItemDAO() { + FinishWriting(); +} + +RpcLineItemDAO::RpcLineItemDAO(string master_address, string table_name, + int batch_size, int mstimeout, + vector tablet_splits) + : master_address_(std::move(master_address)), + table_name_(std::move(table_name)), + timeout_(MonoDelta::FromMilliseconds(mstimeout)), + batch_max_(batch_size), + tablet_splits_(std::move(tablet_splits)), + batch_size_(0), + semaphore_(1) { +} + +} // namespace kudu diff --git a/src/kudu/benchmarks/tpch/rpc_line_item_dao.h b/src/kudu/benchmarks/tpch/rpc_line_item_dao.h new file mode 100644 index 000000000000..b29eef2225fb --- /dev/null +++ b/src/kudu/benchmarks/tpch/rpc_line_item_dao.h @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TPCH_RPC_LINE_ITEM_DAO_H +#define KUDU_TPCH_RPC_LINE_ITEM_DAO_H + +#include +#include +#include +#include +#include + +#include "kudu/benchmarks/tpch/tpch-schemas.h" +#include "kudu/client/client.h" +#include "kudu/client/row_result.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/semaphore.h" + +namespace kudu { + +class RpcLineItemDAO { + public: + class Scanner; + + RpcLineItemDAO(std::string master_address, + std::string table_name, + int batch_size, + int mstimeout = 5000, + std::vector tablet_splits = {}); + ~RpcLineItemDAO(); + void WriteLine(boost::function f); + void MutateLine(boost::function f); + void Init(); + void FinishWriting(); + + // Deletes previous scanner if one is open. + // Projects only those column names listed in 'columns'. + void OpenScanner(const std::vector& columns, + gscoped_ptr* scanner); + // Calls OpenScanner with the tpch1 query parameters. + void OpenTpch1Scanner(gscoped_ptr* scanner); + + // Opens a scanner with the TPCH Q1 projection and filter, plus range filter to only + // select rows in the given order key range. + void OpenTpch1ScannerForOrderKeyRange(int64_t min_orderkey, int64_t max_orderkey, + gscoped_ptr* scanner); + bool IsTableEmpty(); + + // TODO: this wrapper class is of limited utility now that we only have a single + // "DAO" implementation -- we could just return the KuduScanner to users directly. + class Scanner { + public: + ~Scanner() {} + + // Return true if there are more rows left in the scanner. + bool HasMore(); + + // Return the next batch of rows into '*rows'. Any existing data is cleared. 
+ void GetNext(std::vector *rows); + + private: + friend class RpcLineItemDAO; + Scanner() {} + + gscoped_ptr scanner_; + }; + + private: + static const Slice kScanUpperBound; + + void FlushIfBufferFull(); + void OpenScanner(const std::vector& columns, + const std::vector& preds, + gscoped_ptr* scanner); + + simple_spinlock lock_; + client::sp::shared_ptr client_; + client::sp::shared_ptr session_; + client::sp::shared_ptr client_table_; + const std::string master_address_; + const std::string table_name_; + const MonoDelta timeout_; + const int batch_max_; + const std::vector tablet_splits_; + int batch_size_; + + // Semaphore which restricts us to one batch at a time. + Semaphore semaphore_; +}; + +} //namespace kudu +#endif diff --git a/src/kudu/benchmarks/tpch/tpch-schemas.h b/src/kudu/benchmarks/tpch/tpch-schemas.h new file mode 100644 index 000000000000..20dfe6c2cef4 --- /dev/null +++ b/src/kudu/benchmarks/tpch/tpch-schemas.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Inline functions to create the TPC-H schemas +#ifndef KUDU_BENCHMARKS_TPCH_SCHEMAS_H +#define KUDU_BENCHMARKS_TPCH_SCHEMAS_H + +#include +#include + +#include "kudu/client/schema.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tpch { + +static const char* const kOrderKeyColName = "l_orderkey"; +static const char* const kLineNumberColName = "l_linenumber"; +static const char* const kPartKeyColName = "l_partkey"; +static const char* const kSuppKeyColName = "l_suppkey"; +static const char* const kQuantityColName = "l_quantity"; +static const char* const kExtendedPriceColName = "l_extendedprice"; +static const char* const kDiscountColName = "l_discount"; +static const char* const kTaxColName = "l_tax"; +static const char* const kReturnFlagColName = "l_returnflag"; +static const char* const kLineStatusColName = "l_linestatus"; +static const char* const kShipDateColName = "l_shipdate"; +static const char* const kCommitDateColName = "l_commitdate"; +static const char* const kReceiptDateColName = "l_receiptdate"; +static const char* const kShipInstructColName = "l_shipinstruct"; +static const char* const kShipModeColName = "l_shipmode"; +static const char* const kCommentColName = "l_comment"; + +static const client::KuduColumnStorageAttributes::EncodingType kPlainEncoding = + client::KuduColumnStorageAttributes::PLAIN_ENCODING; + +static const client::KuduColumnSchema::DataType kInt64 = + client::KuduColumnSchema::INT64; +static const client::KuduColumnSchema::DataType kInt32 = + client::KuduColumnSchema::INT32; +static const client::KuduColumnSchema::DataType kString = + client::KuduColumnSchema::STRING; +static const client::KuduColumnSchema::DataType kDouble = + client::KuduColumnSchema::DOUBLE; + +enum { + kOrderKeyColIdx = 0, + kLineNumberColIdx, + kPartKeyColIdx, + kSuppKeyColIdx, + kQuantityColIdx, + kExtendedPriceColIdx, + kDiscountColIdx, + kTaxColIdx, + kReturnFlagColIdx, + kLineStatusColIdx, + kShipDateColIdx, + kCommitDateColIdx, + 
kReceiptDateColIdx, + kShipInstructColIdx, + kShipModeColIdx, + kCommentColIdx +}; + +inline client::KuduSchema CreateLineItemSchema() { + client::KuduSchemaBuilder b; + client::KuduSchema s; + + b.AddColumn(kOrderKeyColName)->Type(kInt64)->NotNull(); + b.AddColumn(kLineNumberColName)->Type(kInt32)->NotNull(); + b.AddColumn(kPartKeyColName)->Type(kInt32)->NotNull(); + b.AddColumn(kSuppKeyColName)->Type(kInt32)->NotNull(); + b.AddColumn(kQuantityColName)->Type(kInt32)->NotNull(); // decimal? + b.AddColumn(kExtendedPriceColName)->Type(kDouble)->NotNull(); + b.AddColumn(kDiscountColName)->Type(kDouble)->NotNull(); + b.AddColumn(kTaxColName)->Type(kDouble)->NotNull(); + b.AddColumn(kReturnFlagColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kLineStatusColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kShipDateColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kCommitDateColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kReceiptDateColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kShipInstructColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kShipModeColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + b.AddColumn(kCommentColName)->Type(kString)->NotNull()->Encoding(kPlainEncoding); + + b.SetPrimaryKey({ kOrderKeyColName, kLineNumberColName }); + + CHECK_OK(b.Build(&s)); + return s; +} + +inline std::vector GetTpchQ1QueryColumns() { + return { kShipDateColName, + kReturnFlagColName, + kLineStatusColName, + kQuantityColName, + kExtendedPriceColName, + kDiscountColName, + kTaxColName }; +} + +} // namespace tpch +} // namespace kudu +#endif diff --git a/src/kudu/benchmarks/tpch/tpch1.cc b/src/kudu/benchmarks/tpch/tpch1.cc new file mode 100644 index 000000000000..73fb29542ac8 --- /dev/null +++ b/src/kudu/benchmarks/tpch/tpch1.cc @@ -0,0 +1,283 @@ +// Licensed to the Apache Software Foundation (ASF) under one 
+// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This utility will first try to load the data from the given path if the +// tablet doesn't already exist at the given location. It will then run +// the tpch1 query, as described below, up to tpch_num_query_iterations times. +// +// The input data must be in the tpch format, separated by "|". 
+// +// Usage: +// tpch1 -tpch_path_to_data=/home/jdcryans/lineitem.tbl +// -tpch_num_query_iterations=1 +// -tpch_expected_matching_rows=12345 +// +// From Impala: +// ==== +// ---- QUERY : TPCH-Q1 +// # Q1 - Pricing Summary Report Query +// # Modifications: Remove ORDER BY +// select +// l_returnflag, +// l_linestatus, +// (sum(l_quantity), 1), +// (sum(l_extendedprice), 1), +// (sum(l_extendedprice * (1 - l_discount)), 1), +// (sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)), 1), +// (avg(l_quantity), 1), +// (avg(l_extendedprice), 1), +// (avg(l_discount), 1), count(1) +// from +// lineitem +// where +// l_shipdate<='1998-09-02' +// group by +// l_returnflag, +// l_linestatus +// ---- TYPES +// string, string, double, double, double, double, double, double, double, bigint +// ---- RESULTS +// 'A','F',37734107,56586554400.73,53758257134.9,55909065222.8,25.5,38273.1,0,1478493 +// 'N','F',991417,1487504710.38,1413082168.1,1469649223.2,25.5,38284.5,0.1,38854 +// 'N','O',74476040,111701729697.74,106118230307.6,110367043872.5,25.5,38249.1,0,2920374 +// 'R','F',37719753,56568041380.90,53741292684.6,55889619119.8,25.5,38250.9,0.1,1478870 +// ==== +#include +#include +#include + +#include + +#include "kudu/benchmarks/tpch/tpch-schemas.h" +#include "kudu/benchmarks/tpch/rpc_line_item_dao.h" +#include "kudu/benchmarks/tpch/line_item_tsv_importer.h" +#include "kudu/codegen/compilation_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/util/env.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/slice.h" +#include "kudu/util/stopwatch.h" + +DEFINE_string(tpch_path_to_data, "/tmp/lineitem.tbl", + "The full path to the '|' separated file containing the lineitem table."); +DEFINE_int32(tpch_num_query_iterations, 1, "Number of times the query will be run."); 
+DEFINE_int32(tpch_expected_matching_rows, 5916591, "Number of rows that should match the query."); +DEFINE_bool(use_mini_cluster, true, + "Create a mini cluster for the work to be performed against."); +DEFINE_string(mini_cluster_base_dir, "/tmp/tpch", + "If using a mini cluster, directory for master/ts data."); +DEFINE_string(master_address, "localhost", + "Address of master for the cluster to operate on"); +DEFINE_int32(tpch_max_batch_size, 1000, + "Maximum number of inserts/updates to batch at once"); +DEFINE_string(table_name, "lineitem", + "The table name to write/read"); + +namespace kudu { + +using client::KuduColumnSchema; +using client::KuduRowResult; +using client::KuduSchema; + +using std::unordered_map; + +struct Result { + int32_t l_quantity; + double l_extendedprice; + double l_discount; + double l_tax; + int count; + Result() + : l_quantity(0), l_extendedprice(0), l_discount(0), l_tax(0), count(0) { + } +}; + +// This struct is used for the keys while running the GROUP BY instead of manipulating strings +struct SliceMapKey { + Slice slice; + + // This copies the string out of the result buffer + void RelocateSlice() { + auto buf = new uint8_t[slice.size()]; + slice.relocate(buf); + } + + bool operator==(const SliceMapKey &other_key) const { + return slice == other_key.slice; + } +}; + +struct hash { + size_t operator()(const SliceMapKey &key) const { + return util_hash::CityHash64( + reinterpret_cast(key.slice.data()), key.slice.size()); + } +}; + +void LoadLineItems(const string &path, RpcLineItemDAO *dao) { + LineItemTsvImporter importer(path); + + while (importer.HasNextLine()) { + dao->WriteLine(boost::bind(&LineItemTsvImporter::GetNextLine, + &importer, _1)); + } + dao->FinishWriting(); +} + +void WarmupScanCache(RpcLineItemDAO* dao) { + // Warms up cache for the tpch1 query. 
+ gscoped_ptr scanner; + dao->OpenTpch1Scanner(&scanner); + codegen::CompilationManager::GetSingleton()->Wait(); +} + +void Tpch1(RpcLineItemDAO *dao) { + typedef unordered_map slice_map; + typedef unordered_map slice_map_map; + + gscoped_ptr scanner; + dao->OpenTpch1Scanner(&scanner); + + int matching_rows = 0; + slice_map_map results; + Result *r; + vector rows; + while (scanner->HasMore()) { + scanner->GetNext(&rows); + for (const KuduRowResult& row : rows) { + matching_rows++; + + SliceMapKey l_returnflag; + CHECK_OK(row.GetString(1, &l_returnflag.slice)); + SliceMapKey l_linestatus; + CHECK_OK(row.GetString(2, &l_linestatus.slice)); + int32_t l_quantity; + CHECK_OK(row.GetInt32(3, &l_quantity)); + double l_extendedprice; + CHECK_OK(row.GetDouble(4, &l_extendedprice)); + double l_discount; + CHECK_OK(row.GetDouble(5, &l_discount)); + double l_tax; + CHECK_OK(row.GetDouble(6, &l_tax)); + + slice_map *linestatus_map; + auto it = results.find(l_returnflag); + if (it == results.end()) { + linestatus_map = new slice_map; + l_returnflag.RelocateSlice(); + results[l_returnflag] = linestatus_map; + } else { + linestatus_map = it->second; + } + + auto inner_it = linestatus_map->find(l_linestatus); + if (inner_it == linestatus_map->end()) { + r = new Result(); + l_linestatus.RelocateSlice(); + (*linestatus_map)[l_linestatus] = r; + } else { + r = inner_it->second; + } + r->l_quantity += l_quantity; + r->l_extendedprice += l_extendedprice; + r->l_discount += l_discount; + r->l_tax += l_tax; + r->count++; + } + } + LOG(INFO) << "Result: "; + for (const auto& result : results) { + const SliceMapKey returnflag = result.first; + const auto* maps = result.second; + for (const auto& map : *maps) { + const SliceMapKey linestatus = map.first; + Result* r = map.second; + double avg_q = static_cast(r->l_quantity) / r->count; + double avg_ext_p = r->l_extendedprice / r->count; + double avg_discount = r->l_discount / r->count; + LOG(INFO) << returnflag.slice.ToString() << ", " << + 
linestatus.slice.ToString() << ", " << + r->l_quantity << ", " << + StringPrintf("%.2f", r->l_extendedprice) << ", " << + // TODO those two are missing at the moment, might want to change Result + // sum(l_extendedprice * (1 - l_discount)) + // sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) + StringPrintf("%.2f", avg_q) << ", " << + StringPrintf("%.2f", avg_ext_p) << ", " << + StringPrintf("%.2f", avg_discount) << ", " << + r->count; + delete r; + delete linestatus.slice.data(); + } + delete maps; + delete returnflag.slice.data(); + } + CHECK_EQ(matching_rows, FLAGS_tpch_expected_matching_rows) << "Wrong number of rows returned"; +} + +} // namespace kudu + +int main(int argc, char **argv) { + kudu::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + + gscoped_ptr env; + gscoped_ptr cluster; + string master_address; + if (FLAGS_use_mini_cluster) { + env.reset(new kudu::EnvWrapper(kudu::Env::Default())); + kudu::Status s = env->CreateDir(FLAGS_mini_cluster_base_dir); + CHECK(s.IsAlreadyPresent() || s.ok()); + kudu::MiniClusterOptions options; + options.data_root = FLAGS_mini_cluster_base_dir; + cluster.reset(new kudu::MiniCluster(env.get(), options)); + CHECK_OK(cluster->StartSync()); + master_address = cluster->mini_master()->bound_rpc_addr_str(); + } else { + master_address = FLAGS_master_address; + } + + gscoped_ptr dao(new kudu::RpcLineItemDAO(master_address, FLAGS_table_name, + FLAGS_tpch_max_batch_size)); + dao->Init(); + + kudu::WarmupScanCache(dao.get()); + + bool needs_loading = dao->IsTableEmpty(); + if (needs_loading) { + LOG_TIMING(INFO, "loading") { + kudu::LoadLineItems(FLAGS_tpch_path_to_data, dao.get()); + } + } else { + LOG(INFO) << "Data already in place"; + } + for (int i = 0; i < FLAGS_tpch_num_query_iterations; i++) { + LOG_TIMING(INFO, StringPrintf("querying for iteration # %d", i)) { + kudu::Tpch1(dao.get()); + } + } + + if (cluster) { + cluster->Shutdown(); + } + return 0; +} diff --git 
a/src/kudu/benchmarks/tpch/tpch_real_world.cc b/src/kudu/benchmarks/tpch/tpch_real_world.cc new file mode 100644 index 000000000000..4e6a2e5ad75e --- /dev/null +++ b/src/kudu/benchmarks/tpch/tpch_real_world.cc @@ -0,0 +1,396 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Benchmarking tool to run tpch1 concurrently with inserts. +// +// Requirements: +// - TPC-H's dbgen tool, compiled. +// - Optionally, a running cluster. By default it starts its own external cluster. +// +// This tool has three main configurations: +// - tpch_test_runtime_sec: the longest this test can run for, in seconds, excluding startup time. +// By default, it runs until there's no more rows to insert. +// - tpch_scaling_factor: the dbgen scaling factor to generate the data. The test will end if +// dbgen is done generating data, even if there's still time left. The default is 1. +// - tpch_path_to_dbgen: where to find dbgen, by default it assumes it can be found in the +// current directory. +// +// This tool has three threads: +// - One that runs "dbgen -T L" with the configured scale factor. +// - One that reads from the "lineitem.tbl" named pipe and inserts the rows. +// - One that runs tpch1 continuously and outputs the timings. 
This thread won't start until at +// least some rows have been written, because dbgen takes some seconds to startup. It also +// stops as soon as it gets the signal that we ran out of time or that there are no more rows to +// insert, so the last timing shouldn't be used. +// +// TODO Make the inserts multi-threaded. See Kudu-629 for the technique. +#include + +#include +#include +#include +#include + +#include "kudu/benchmarks/tpch/line_item_tsv_importer.h" +#include "kudu/benchmarks/tpch/rpc_line_item_dao.h" +#include "kudu/benchmarks/tpch/tpch-schemas.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/util/atomic.h" +#include "kudu/util/env.h" +#include "kudu/util/errno.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/monotime.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/subprocess.h" +#include "kudu/util/thread.h" + +DEFINE_bool(tpch_use_mini_cluster, true, + "Create a mini cluster for the work to be performed against"); +DEFINE_bool(tpch_load_data, true, + "Load dbgen data"); +DEFINE_bool(tpch_run_queries, true, + "Query dbgen data as it is inserted"); +DEFINE_int32(tpch_max_batch_size, 1000, + "Maximum number of inserts to batch at once"); +DEFINE_int32(tpch_test_client_timeout_msec, 10000, + "Timeout that will be used for all operations and RPCs"); +DEFINE_int32(tpch_test_runtime_sec, 0, + "How long this test should run for excluding startup time (note that it will also " + "stop if dbgen finished generating all its data)"); +DEFINE_int32(tpch_scaling_factor, 1, + "Scaling factor to use with dbgen, the default is 1"); +DEFINE_int32(tpch_num_inserters, 1, + "Number of data inserters to run in parallel. 
Each inserter implies a new tablet " + "in the table"); +DEFINE_string(tpch_master_addresses, "localhost", + "Addresses of masters for the cluster to operate on if not using a mini cluster"); +DEFINE_string(tpch_mini_cluster_base_dir, "/tmp/tpch", + "If using a mini cluster, directory for master/ts data"); +DEFINE_string(tpch_path_to_dbgen_dir, ".", + "Path to the directory where the dbgen executable can be found"); +DEFINE_string(tpch_path_to_ts_flags_file, "", + "Path to the file that contains extra flags for the tablet servers if using " + "a mini cluster. Doesn't use one by default."); +DEFINE_string(tpch_table_name, "tpch_real_world", + "Table name to use during the test"); + +namespace kudu { + +using client::KuduRowResult; +using client::KuduSchema; +using strings::Substitute; + +class TpchRealWorld { + public: + TpchRealWorld() + : rows_inserted_(0), + stop_threads_(false), + dbgen_processes_finished_(FLAGS_tpch_num_inserters) { + } + + ~TpchRealWorld() { + STLDeleteElements(&dbgen_processes_); + } + + Status Init(); + + gscoped_ptr GetInittedDAO(); + + void LoadLineItemsThread(int i); + + void MonitorDbgenThread(int i); + + void RunQueriesThread(); + + void WaitForRowCount(int64_t row_count); + + Status Run(); + + private: + static const char* kLineItemBase; + + Status CreateFifos(); + Status StartDbgens(); + string GetNthLineItemFileName(int i) const { + // This is dbgen's naming convention; we're just following it. + return FLAGS_tpch_num_inserters > 1 + ? 
Substitute("$0.$1", kLineItemBase, i + 1) : kLineItemBase; + } + + gscoped_ptr cluster_; + AtomicInt rows_inserted_; + string master_addresses_; + AtomicBool stop_threads_; + CountDownLatch dbgen_processes_finished_; + + vector dbgen_processes_; +}; + +const char* TpchRealWorld::kLineItemBase = "lineitem.tbl"; + +Status TpchRealWorld::Init() { + Env* env = Env::Default(); + if (FLAGS_tpch_use_mini_cluster) { + if (env->FileExists(FLAGS_tpch_mini_cluster_base_dir)) { + RETURN_NOT_OK(env->DeleteRecursively(FLAGS_tpch_mini_cluster_base_dir)); + } + RETURN_NOT_OK(env->CreateDir(FLAGS_tpch_mini_cluster_base_dir)); + + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = 1; + opts.data_root = FLAGS_tpch_mini_cluster_base_dir; + if (!FLAGS_tpch_path_to_ts_flags_file.empty()) { + opts.extra_tserver_flags.push_back("--flagfile=" + FLAGS_tpch_path_to_ts_flags_file); + } + + cluster_.reset(new ExternalMiniCluster(opts)); + RETURN_NOT_OK(cluster_->Start()); + master_addresses_ = cluster_->leader_master()->bound_rpc_hostport().ToString(); + } else { + master_addresses_ = FLAGS_tpch_master_addresses; + } + + // Create the table before any other DAOs are constructed. 
+ GetInittedDAO(); + + if (FLAGS_tpch_load_data) { + RETURN_NOT_OK(CreateFifos()); + RETURN_NOT_OK(StartDbgens()); + } + + return Status::OK(); +} + +Status TpchRealWorld::CreateFifos() { + for (int i = 0; i < FLAGS_tpch_num_inserters; i++) { + string path_to_lineitem = GetNthLineItemFileName(i); + struct stat sbuf; + if (stat(path_to_lineitem.c_str(), &sbuf) != 0) { + if (errno == ENOENT) { + if (mkfifo(path_to_lineitem.c_str(), 0644) != 0) { + string msg = Substitute("Could not create the named pipe for the dbgen output at $0", + path_to_lineitem); + return Status::InvalidArgument(msg); + } + } else { + return Status::IOError(path_to_lineitem, ErrnoToString(errno), errno); + } + } else { + if (!S_ISFIFO(sbuf.st_mode)) { + string msg = Substitute("Please remove the current lineitem file at $0", + path_to_lineitem); + return Status::InvalidArgument(msg); + } + } + // We get here if the file was already a fifo or if we created it. + } + return Status::OK(); +} + +Status TpchRealWorld::StartDbgens() { + for (int i = 1; i <= FLAGS_tpch_num_inserters; i++) { + // This environment variable is necessary if dbgen isn't in the current dir. 
+ setenv("DSS_CONFIG", FLAGS_tpch_path_to_dbgen_dir.c_str(), 1); + string path_to_dbgen = Substitute("$0/dbgen", FLAGS_tpch_path_to_dbgen_dir); + vector argv; + argv.push_back(path_to_dbgen); + argv.push_back("-q"); + argv.push_back("-T"); + argv.push_back("L"); + argv.push_back("-s"); + argv.push_back(Substitute("$0", FLAGS_tpch_scaling_factor)); + if (FLAGS_tpch_num_inserters > 1) { + argv.push_back("-C"); + argv.push_back(Substitute("$0", FLAGS_tpch_num_inserters)); + argv.push_back("-S"); + argv.push_back(Substitute("$0", i)); + } + gscoped_ptr dbgen_proc(new Subprocess(path_to_dbgen, argv)); + LOG(INFO) << "Running " << JoinStrings(argv, " "); + RETURN_NOT_OK(dbgen_proc->Start()); + dbgen_processes_.push_back(dbgen_proc.release()); + } + return Status::OK(); +} + +gscoped_ptr TpchRealWorld::GetInittedDAO() { + // When chunking, dbgen will begin the nth chunk on the order key: + // + // 6000000 * SF * n / num_chunks + // + // For example, when run with SF=2 and three chunks, the first keys for each + // chunk are 1, 4000001, and 8000001. 
+ int64_t increment = 6000000L * FLAGS_tpch_scaling_factor / + FLAGS_tpch_num_inserters; + + KuduSchema schema(tpch::CreateLineItemSchema()); + vector split_rows; + for (int64_t i = 1; i < FLAGS_tpch_num_inserters; i++) { + KuduPartialRow* row = schema.NewRow(); + CHECK_OK(row->SetInt64(tpch::kOrderKeyColName, i * increment)); + CHECK_OK(row->SetInt32(tpch::kLineNumberColName, 0)); + split_rows.push_back(row); + } + + gscoped_ptr dao(new RpcLineItemDAO(master_addresses_, + FLAGS_tpch_table_name, + FLAGS_tpch_max_batch_size, + FLAGS_tpch_test_client_timeout_msec, + split_rows)); + dao->Init(); + return dao.Pass(); +} + +void TpchRealWorld::LoadLineItemsThread(int i) { + LOG(INFO) << "Connecting to cluster at " << master_addresses_; + gscoped_ptr dao = GetInittedDAO(); + LineItemTsvImporter importer(GetNthLineItemFileName(i)); + + boost::function f = + boost::bind(&LineItemTsvImporter::GetNextLine, &importer, _1); + while (importer.HasNextLine() && !stop_threads_.Load()) { + dao->WriteLine(f); + int64_t current_count = rows_inserted_.Increment(); + if (current_count % 250000 == 0) { + LOG(INFO) << "Inserted " << current_count << " rows"; + } + } + dao->FinishWriting(); +} + +void TpchRealWorld::MonitorDbgenThread(int i) { + Subprocess* dbgen_proc = dbgen_processes_[i]; + while (!stop_threads_.Load()) { + int ret; + Status s = dbgen_proc->WaitNoBlock(&ret); + if (s.ok()) { + CHECK(ret == 0) << "dbgen exited with a non-zero return code: " << ret; + LOG(INFO) << "dbgen finished inserting data"; + dbgen_processes_finished_.CountDown(); + return; + } else { + SleepFor(MonoDelta::FromMilliseconds(100)); + } + } + dbgen_proc->Kill(9); + int ret; + dbgen_proc->Wait(&ret); +} + +void TpchRealWorld::RunQueriesThread() { + gscoped_ptr dao = GetInittedDAO(); + while (!stop_threads_.Load()) { + string log; + if (FLAGS_tpch_load_data) { + log = StringPrintf("querying %" PRId64 " rows", rows_inserted_.Load()); + } else { + log = "querying data in cluster"; + } + LOG_TIMING(INFO, 
log) { + gscoped_ptr scanner; + dao->OpenTpch1Scanner(&scanner); + vector rows; + // We check stop_threads_ even while scanning since it can take tens of seconds to query. + // This means that the last timing cannot be used for reporting. + while (scanner->HasMore() && !stop_threads_.Load()) { + scanner->GetNext(&rows); + } + } + } +} + +void TpchRealWorld::WaitForRowCount(int64_t row_count) { + while (rows_inserted_.Load() < row_count) { + SleepFor(MonoDelta::FromMilliseconds(100)); + } +} + +Status TpchRealWorld::Run() { + vector > threads; + if (FLAGS_tpch_load_data) { + for (int i = 0; i < FLAGS_tpch_num_inserters; i++) { + scoped_refptr thr; + RETURN_NOT_OK(kudu::Thread::Create("test", Substitute("lineitem-gen$0", i), + &TpchRealWorld::MonitorDbgenThread, this, i, + &thr)); + threads.push_back(thr); + RETURN_NOT_OK(kudu::Thread::Create("test", Substitute("lineitem-load$0", i), + &TpchRealWorld::LoadLineItemsThread, this, i, + &thr)); + threads.push_back(thr); + } + + // It takes some time for dbgen to start outputting rows so there's no need to query yet. + LOG(INFO) << "Waiting for dbgen to start..."; + WaitForRowCount(10000); + } + + if (FLAGS_tpch_run_queries) { + scoped_refptr thr; + RETURN_NOT_OK(kudu::Thread::Create("test", "lineitem-query", + &TpchRealWorld::RunQueriesThread, this, + &thr)); + threads.push_back(thr); + } + + // We'll wait until all the dbgens finish or after tpch_test_runtime_sec, + // whichever comes first. 
+ if (FLAGS_tpch_test_runtime_sec > 0) { + if (!dbgen_processes_finished_.WaitFor( + MonoDelta::FromSeconds(FLAGS_tpch_test_runtime_sec))) { + LOG(WARNING) << FLAGS_tpch_test_runtime_sec + << " seconds expired, killing test"; + } + } else { + dbgen_processes_finished_.Wait(); + } + + if (!FLAGS_tpch_load_data) { + SleepFor(MonoDelta::FromSeconds(100)); + } + + stop_threads_.Store(true); + + for (scoped_refptr thr : threads) { + RETURN_NOT_OK(ThreadJoiner(thr.get()).Join()); + } + return Status::OK(); +} + +} // namespace kudu + +int main(int argc, char* argv[]) { + kudu::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + + kudu::TpchRealWorld benchmarker; + kudu::Status s = benchmarker.Init(); + if (!s.ok()) { + std::cerr << "Couldn't initialize the benchmarking tool, reason: "<< s.ToString() << std::endl; + return 1; + } + s = benchmarker.Run(); + if (!s.ok()) { + std::cerr << "Couldn't run the benchmarking tool, reason: "<< s.ToString() << std::endl; + return 1; + } + return 0; +} diff --git a/src/kudu/benchmarks/wal_hiccup-parser.py b/src/kudu/benchmarks/wal_hiccup-parser.py new file mode 100755 index 000000000000..7ba3d5f9ac82 --- /dev/null +++ b/src/kudu/benchmarks/wal_hiccup-parser.py @@ -0,0 +1,71 @@ +#!/usr/bin/python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Converts glog output from wal_hiccup into a CSV of all test runs. +# +# The output looks like this: +# +# var1,var2,...,varN,throughput,p95,p99,p99.99,max +# 0,0,...,68.5702,29,8544,22263,29108 +# +# Each variable represents a different facet of the test setup. +# The throughput is in MB/s and the latency figures are in us. + +import re +import sys + +def main(): + if len(sys.argv) != 2: + print "Usage: %s " % (sys.argv[0],) + return + cols = list() + cols_printed = False + with open(sys.argv[1], "r") as f: + vals = None + for line in f: + line = line.strip() + + # Beginning of a test result. + if "Test results for setup" in line: + vals = list() + + # End of a test result. + elif "-------" in line and vals is not None: + if not cols_printed: + print ",".join(cols) + cols_printed = True + print ",".join(vals) + vals = None + + # Entry in a test result. + elif vals is not None: + m = re.match(".*\] ([\w\.]+): ([\d\.]+)", line) + if not m: + continue + col = m.group(1) + val = m.group(2) + if cols_printed: + if col not in cols: + raise Exception("Unexpected column %s" % (col,)) + else: + cols.append(col) + vals.append(val) + +if __name__ == '__main__': + main() diff --git a/src/kudu/benchmarks/wal_hiccup.cc b/src/kudu/benchmarks/wal_hiccup.cc new file mode 100644 index 000000000000..e1525c8bea3c --- /dev/null +++ b/src/kudu/benchmarks/wal_hiccup.cc @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include +#include +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/logging.h" +#include "kudu/util/path_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/thread.h" + +DEFINE_int32(num_files, 40, "number of files to write"); +DEFINE_int32(file_size_mb, 16, "size of each file"); +DEFINE_int32(num_rounds, 1, "number of times to try each setup"); +DEFINE_int32(wal_interval_us, 1000, "number of microseconds to sleep between WAL writes"); +DEFINE_string(file_path, "", "path where all files are written; defaults to cwd"); + +DEFINE_bool(initiate_writeback_each_file, false, + "SYNC_FILE_RANGE_WRITE each file before closing it"); +DEFINE_bool(await_writeback_each_file, false, + "SYNC_FILE_RANGE_WAIT_BEFORE each file before closing it"); +DEFINE_bool(fdatasync_each_file, false, + "fdatasync() each file before closing it"); + +DEFINE_bool(initiate_writeback_at_end, false, + "SYNC_FILE_RANGE_WRITE each file after writing all files"); +DEFINE_bool(await_writeback_at_end, false, + "SYNC_FILE_RANGE_WAIT_BEFORE each file after writing all files"); +DEFINE_bool(fdatasync_at_end, true, + "fdatasync each file after writing all files"); + +DEFINE_bool(page_align_wal_writes, false, + "write to the fake WAL with exactly 4KB writes to never cross pages"); + +using std::string; + +namespace kudu { + +class WalHiccupBenchmarker { + public: + WalHiccupBenchmarker() 
+ : finished_(1), + cur_histo_(NULL) { + } + ~WalHiccupBenchmarker() { + STLDeleteElements(&wal_histos_); + } + + void WALThread(); + void PrintConfig(); + void RunOnce(); + void Run(); + protected: + CountDownLatch finished_; + std::vector wal_histos_; + HdrHistogram* cur_histo_; +}; + + +void WalHiccupBenchmarker::WALThread() { + string name = "wal"; + if (!FLAGS_file_path.empty()) { + name = JoinPathSegments(FLAGS_file_path, name); + } + int fd = open(name.c_str(), O_WRONLY | O_TRUNC | O_CREAT, 0666); + PCHECK(fd >= 0) << "open() failed"; + char buf[4096]; + memset(buf, 0xFF, sizeof(buf)); + const MonoDelta sleepDelta = MonoDelta::FromMicroseconds(FLAGS_wal_interval_us); + while (finished_.count() > 0) { + SleepFor(sleepDelta); + MicrosecondsInt64 st = GetCurrentTimeMicros(); + size_t num_bytes = FLAGS_page_align_wal_writes ? sizeof(buf) : sizeof(buf) - 1; + PCHECK(write(fd, buf, num_bytes) == num_bytes); + PCHECK(fdatasync(fd) == 0); + MicrosecondsInt64 et = GetCurrentTimeMicros(); + MicrosecondsInt64 value = et - st; + cur_histo_->IncrementWithExpectedInterval(value, FLAGS_wal_interval_us); + if (value > FLAGS_wal_interval_us) { + LOG(WARNING) << "slow wal write: " << value << "us"; + } + } +} + +void WriteFile(const string& name, + bool initiate_writeback, + bool wait_writeback, + bool datasync, + int* fd) { + string full_name = name; + if (!FLAGS_file_path.empty()) { + full_name = JoinPathSegments(FLAGS_file_path, full_name); + } + *fd = open(full_name.c_str(), O_WRONLY|O_TRUNC|O_CREAT, 0666); + PCHECK(*fd >= 0) << "open() failed"; + + char buf[1024]; + memset(buf, 0xFF, sizeof(buf)); + for (int i = 0; i < FLAGS_file_size_mb; i++) { + for (int j = 0; j < 1024; j++) { + PCHECK(write(*fd, buf, sizeof(buf)) > 0); + } + } + + if (initiate_writeback) { + PCHECK(sync_file_range(*fd, 0, 0, SYNC_FILE_RANGE_WRITE) == 0); + } + + if (wait_writeback) { + PCHECK(sync_file_range(*fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE) == 0); + } + + if (datasync) { + 
PCHECK(fdatasync(*fd) == 0); + } +} + +void SetFlags(uint32_t setup) { + FLAGS_initiate_writeback_each_file = setup & (1 << 0); + FLAGS_await_writeback_each_file = setup & (1 << 1); + FLAGS_fdatasync_each_file = setup & (1 << 2); + + FLAGS_initiate_writeback_at_end = setup & (1 << 3); + FLAGS_await_writeback_at_end = setup & (1 << 4); + FLAGS_fdatasync_at_end = setup & (1 << 5); + + FLAGS_page_align_wal_writes = setup & (1 << 6); +} + +void WalHiccupBenchmarker::Run() { + int num_setups = 1 << 7; + wal_histos_.resize(num_setups); + + vector total_time; + total_time.resize(num_setups); + + vector setups; + setups.reserve(num_setups); + for (uint32_t setup = 0; setup < num_setups; setup++) { + setups.push_back(setup); + } + + for (int round = 0; round < FLAGS_num_rounds; round++) { + // Randomize the order of setups in each round. + std::random_shuffle(setups.begin(), setups.end()); + + for (uint32_t setup : setups) { + SetFlags(setup); + if (!FLAGS_fdatasync_each_file && !FLAGS_fdatasync_at_end) { + // Skip non-durable configuration + continue; + } + + LOG(INFO) << "----------------------------------------------------------------------"; + LOG(INFO) << "Continuing setup " << setup << ":"; + PrintConfig(); + LOG(INFO) << "----------------------------------------------------------------------"; + + sync(); + if (wal_histos_[setup] == NULL) { + wal_histos_[setup] = new HdrHistogram(1000000, 4); + } + cur_histo_ = wal_histos_[setup]; + + Stopwatch s; + s.start(); + RunOnce(); + s.stop(); + total_time[setup] += s.elapsed().wall_seconds(); + } + LOG(INFO) << "----------------------------------------------------------------------"; + LOG(INFO) << "Ran " << setups.size() << " setups"; + LOG(INFO) << "----------------------------------------------------------------------"; + } + + for (uint32_t setup : setups) { + SetFlags(setup); + if (!FLAGS_fdatasync_each_file && !FLAGS_fdatasync_at_end) { + // Skip non-durable configuration + continue; + } + + cur_histo_ = 
wal_histos_[setup]; + + double throughput = (FLAGS_num_rounds * FLAGS_num_files * FLAGS_file_size_mb) / + total_time[setup]; + + LOG(INFO) << "----------------------------------------------------------------------"; + LOG(INFO) << "Test results for setup " << setup << ":"; + PrintConfig(); + LOG(INFO); + LOG(INFO) << "throughput: " << throughput; + LOG(INFO) << "p95: " << cur_histo_->ValueAtPercentile(95.0); + LOG(INFO) << "p99: " << cur_histo_->ValueAtPercentile(99.0); + LOG(INFO) << "p99.99: " << cur_histo_->ValueAtPercentile(99.99); + LOG(INFO) << "max: " << cur_histo_->MaxValue(); + LOG(INFO) << "----------------------------------------------------------------------"; + } + +} + +void WalHiccupBenchmarker::PrintConfig() { + LOG(INFO) << "initiate_writeback_each_file: " << FLAGS_initiate_writeback_each_file; + LOG(INFO) << "await_writeback_each_file: " << FLAGS_await_writeback_each_file; + LOG(INFO) << "fdatasync_each_file: " << FLAGS_fdatasync_each_file; + LOG(INFO) << "initiate_writeback_at_end: " << FLAGS_initiate_writeback_at_end; + LOG(INFO) << "await_writeback_at_end: " << FLAGS_await_writeback_at_end; + LOG(INFO) << "fdatasync_at_end: " << FLAGS_fdatasync_at_end; + LOG(INFO) << "page_align_wal_writes: " << FLAGS_page_align_wal_writes; +} + +void WalHiccupBenchmarker::RunOnce() { + finished_.Reset(1); + scoped_refptr thr; + CHECK_OK(Thread::Create("test", "wal", &WalHiccupBenchmarker::WALThread, this, &thr)); + + + int fds[FLAGS_num_files]; + for (int i = 0; i < FLAGS_num_files; i++) { + WriteFile(strings::Substitute("file-$0", i), + FLAGS_initiate_writeback_each_file, + FLAGS_await_writeback_each_file, + FLAGS_fdatasync_each_file, + &fds[i]); + } + + LOG(INFO) << "Done writing..."; + if (FLAGS_initiate_writeback_at_end) { + LOG(INFO) << "Post-write initiating writeback..."; + for (int i = 0; i < FLAGS_num_files; i++) { + PCHECK(sync_file_range(fds[i], 0, 0, SYNC_FILE_RANGE_WRITE) == 0); + } + } + if (FLAGS_await_writeback_at_end) { + LOG(INFO) << 
"Post-write awaiting writeback..."; + for (int i = 0; i < FLAGS_num_files; i++) { + PCHECK(sync_file_range(fds[i], 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE) == 0); + } + } + if (FLAGS_fdatasync_at_end) { + LOG(INFO) << "Post-write fdatasync..."; + for (int i = 0; i < FLAGS_num_files; i++) { + PCHECK(fdatasync(fds[i]) == 0); + } + } + for (int i = 0; i < FLAGS_num_files; i++) { + PCHECK(close(fds[i]) == 0); + } + + LOG(INFO) << "Done closing..."; + finished_.CountDown(); + thr->Join(); +} + +} // namespace kudu + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + + kudu::WalHiccupBenchmarker benchmarker; + benchmarker.Run(); + + return 0; +} diff --git a/src/kudu/benchmarks/ycsb-schema.h b/src/kudu/benchmarks/ycsb-schema.h new file mode 100644 index 000000000000..5a5386a0c63c --- /dev/null +++ b/src/kudu/benchmarks/ycsb-schema.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Inline function to create the YCSB schema +#ifndef KUDU_BENCHMARKS_YCSB_SCHEMA_H +#define KUDU_BENCHMARKS_YCSB_SCHEMA_H + +#include "kudu/client/schema.h" + +namespace kudu { + +static const client::KuduColumnSchema::DataType kString = + client::KuduColumnSchema::STRING; + +inline client::KuduSchema CreateYCSBSchema() { + client::KuduSchema s; + client::KuduSchemaBuilder b; + + b.AddColumn("key")->Type(kString)->NotNull()->PrimaryKey(); + b.AddColumn("field0")->Type(kString)->NotNull(); + b.AddColumn("field1")->Type(kString)->NotNull(); + b.AddColumn("field2")->Type(kString)->NotNull(); + b.AddColumn("field3")->Type(kString)->NotNull(); + b.AddColumn("field4")->Type(kString)->NotNull(); + b.AddColumn("field5")->Type(kString)->NotNull(); + b.AddColumn("field6")->Type(kString)->NotNull(); + b.AddColumn("field7")->Type(kString)->NotNull(); + b.AddColumn("field8")->Type(kString)->NotNull(); + b.AddColumn("field9")->Type(kString)->NotNull(); + CHECK_OK(b.Build(&s)); + return s; +} + +} // namespace kudu +#endif + diff --git a/src/kudu/cfile/CMakeLists.txt b/src/kudu/cfile/CMakeLists.txt new file mode 100644 index 000000000000..2bd5ca45f643 --- /dev/null +++ b/src/kudu/cfile/CMakeLists.txt @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +PROTOBUF_GENERATE_CPP( + CFILE_PROTO_SRCS CFILE_PROTO_HDRS CFILE_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES cfile.proto) +set(CFILE_PROTO_LIBS + kudu_common_proto + protobuf) +ADD_EXPORTABLE_LIBRARY(cfile_proto + SRCS ${CFILE_PROTO_SRCS} + DEPS ${CFILE_PROTO_LIBS} + NONLINK_DEPS ${CFILE_PROTO_TGTS}) + +add_library(cfile + binary_dict_block.cc + binary_plain_block.cc + binary_prefix_block.cc + block_cache.cc + block_compression.cc + bloomfile.cc + bshuf_block.cc + cfile_reader.cc + cfile_util.cc + cfile_writer.cc + compression_codec.cc + gvint_block.cc + index_block.cc + index_btree.cc + type_encodings.cc) + +target_link_libraries(cfile + kudu_common + kudu_fs + kudu_util + gutil + cfile_proto + lz4 + bitshuffle + snappy + zlib) + +# Tests +set(KUDU_TEST_LINK_LIBS cfile ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(index-test) +ADD_KUDU_TEST(cfile-test) +ADD_KUDU_TEST(encoding-test LABELS no_tsan) +ADD_KUDU_TEST(bloomfile-test) +ADD_KUDU_TEST(mt-bloomfile-test) +ADD_KUDU_TEST(block_cache-test) +ADD_KUDU_TEST(compression-test) + +# Tools +add_executable(cfile-dump cfile-dump.cc) +target_link_libraries(cfile-dump cfile ${KUDU_BASE_LIBS}) diff --git a/src/kudu/cfile/README b/src/kudu/cfile/README new file mode 100644 index 000000000000..cda598934fe0 --- /dev/null +++ b/src/kudu/cfile/README @@ -0,0 +1,186 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + +CFile is a simple columnar format which stores multiple related B-Trees. + + +File format +----------------- + +
+ + +
+EOF + + +Header +------ + +: the string 'kuducfil' +
: 32-bit unsigned integer length delimiter +
: CFileHeaderPB protobuf + + +Footer +------ + +
: CFileFooterPB protobuf +: the string 'kuducfil' +
(length of protobuf) + + +============================== + +Data blocks: + +Data blocks are stored with various types of encodings. + +* Prefix Encoding + +Currently used for STRING blocks. This is based on the encoding used +by LevelDB for its data blocks, more or less. + +Starts with a header of four uint32s, group-varint coded: + \ + | + | group varint 32 + / + +Followed by prefix-compressed values. Each value is stored relative +to the value preceding it using the following format: + + shared_bytes: varint32 + unshared_bytes: varint32 + delta: char[unshared_bytes] + +Periodically there will be a "restart point" which is necessary for +faster binary searching. At a "restart point", shared_bytes is +0 but otherwise the encoding is the same. + +At the end of the block is a trailer with the offsets of the +restart points: + + restart_points[num_restarts]: uint32 + num_restarts: uint32 + +The restart points are offsets relative to the start of the block, +including the header. + + +* Group Varint Frame-Of-Reference Encoding + +Used for uint32 blocks. + +Starts with a header: + + \ + | + | group varint 32 + / + +The ordinal position is the ordinal position of the first item in the +file. For example, the first data block in the file has ordinal position +0. If that block had 400 data entries, then the second data block would +have ordinal position 400. + +Followed by the actual data, each set of 4 integers using group-varint. +The last group is padded with 0s. +Each integer is relative to the min element in the header. + +============================== + +Nullable Columns + +If a column is marked as nullable in the schema, a bitmap is used to keep track +of the null and not null rows. + +The bitmap is added the begininning of the data block, and it uses RLE. + + : vint + : vint + : RLE encoding + : encoded data + +Data Block Example - 4 items, the first and last are nulls. 
+ 4 Num Elements in the block + 1 Null Bitmap Size + 0110 Null Bitmap + v2 Value of row 2 + v3 Value of row 3 + +============================== + +Index blocks: + +The index blocks are organized in a B-Tree. As data blocks are written, +they are appended to the end of a leaf index block. When a leaf index +block reaches the configured block size, it is added to another index +block higher up the tree, and a new leaf is started. If the intermediate +index block fills, it will start a new intermediate block and spill into +an even higher-layer internal block. + +For example: + + [Int 0] + ------------------------------ + | | + [Int 1] [Int 2] + ----------------- -------------- + | | | | | +[Leaf 0] ... [Leaf N] [Leaf N+1] [Leaf N+2] + + +In this case, we wrote N leaf blocks, which filled up the node labeled +Int 1. At this point, the writer would create Int 0 with one entry pointing +to Int 1. Further leaf blocks (N+1 and N+2) would be written to a new +internal node (Int 2). When the file is completed, Int 2 will spill, +adding its entry into Int 0 as well. + +Note that this strategy doesn't result in a fully balanced b-tree, but instead +results in a 100% "fill factor" on all nodes in each level except for the last +one written. + +There are two types of indexes: + +- Positional indexes: map ordinal position -> data block offset + +These are used to satisfy queries like: "seek to the Nth entry in this file" + +- Value-based indexes: reponsible for mapping value -> data block offset + +These are only present in files which contain data stored in sorted order +(e.g key columns). They can satisfy seeks by value. + + +An index block is encoded similarly for both types of indexes: + + + +... + key: vint64 for positional, otherwise varint-length-prefixed string + offset: vint64 + block size: vint32 + + (fixed32) + (fixed32) +... + These offsets are relative to the start of the block. 
+ + + A IndexBlockTrailerPB protobuf + + +The trailer protobuf includes a field which designates whether the block +is a leaf node or internal node of the B-Tree, allowing a reader to know +whether the pointer is to another index block or to a data block. diff --git a/src/kudu/cfile/binary_dict_block.cc b/src/kudu/cfile/binary_dict_block.cc new file mode 100644 index 000000000000..731e16db08ac --- /dev/null +++ b/src/kudu/cfile/binary_dict_block.cc @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/cfile/binary_dict_block.h" + +#include +#include + +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/bshuf_block.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/group_varint-inl.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { +namespace cfile { + +BinaryDictBlockBuilder::BinaryDictBlockBuilder(const WriterOptions* options) + : options_(options), + dict_block_(options_), + dictionary_strings_arena_(1024, 32*1024*1024), + mode_(kCodeWordMode) { + data_builder_.reset(new BShufBlockBuilder(options_)); + Reset(); +} + +void BinaryDictBlockBuilder::Reset() { + buffer_.clear(); + buffer_.resize(kMaxHeaderSize); + buffer_.reserve(options_->storage_attributes.cfile_block_size); + + if (mode_ == kCodeWordMode && + dict_block_.IsBlockFull(options_->storage_attributes.cfile_block_size)) { + mode_ = kPlainBinaryMode; + data_builder_.reset(new BinaryPlainBlockBuilder(options_)); + } else { + data_builder_->Reset(); + } + + finished_ = false; +} + +Slice BinaryDictBlockBuilder::Finish(rowid_t ordinal_pos) { + finished_ = true; + + InlineEncodeFixed32(&buffer_[0], mode_); + + // TODO: if we could modify the the Finish() API a little bit, we can + // avoid an extra memory copy (buffer_.append(..)) + Slice data_slice = data_builder_->Finish(ordinal_pos); + buffer_.append(data_slice.data(), data_slice.size()); + + return Slice(buffer_); +} + +// The current block is considered full when the the size of data block +// exceeds limit or when the size of dictionary block exceeds the +// CFile block size. +// +// If it is the latter case, all the subsequent data blocks will switch to +// StringPlainBlock automatically. 
+bool BinaryDictBlockBuilder::IsBlockFull(size_t limit) const { + int block_size = options_->storage_attributes.cfile_block_size; + if (data_builder_->IsBlockFull(block_size)) return true; + if (dict_block_.IsBlockFull(block_size) && (mode_ == kCodeWordMode)) return true; + return false; +} + +int BinaryDictBlockBuilder::AddCodeWords(const uint8_t* vals, size_t count) { + DCHECK(!finished_); + DCHECK_GT(count, 0); + size_t i; + + if (data_builder_->Count() == 0) { + const Slice* first = reinterpret_cast(vals); + first_key_.assign_copy(first->data(), first->size()); + } + + for (i = 0; i < count; i++) { + const Slice* src = reinterpret_cast(vals); + const char* c_str = reinterpret_cast(src->data()); + StringPiece current_item(c_str, src->size()); + uint32_t codeword; + + if (!FindCopy(dictionary_, current_item, &codeword)) { + // The dictionary block is full + if (dict_block_.Add(vals, 1) == 0) { + break; + } + const uint8_t* s_ptr = dictionary_strings_arena_.AddSlice(*src); + if (s_ptr == nullptr) { + // Arena does not have enough space for string content + // Ideally, it should not happen. 
+ LOG(ERROR) << "Arena of Dictionary Encoder does not have enough memory for strings"; + break; + } + const char* s_content = reinterpret_cast(s_ptr); + codeword = dict_block_.Count() - 1; + InsertOrDie(&dictionary_, StringPiece(s_content, src->size()), codeword); + } + // The data block is full + if (data_builder_->Add(reinterpret_cast(&codeword), 1) == 0) { + break; + } + vals += sizeof(Slice); + } + return i; +} + +int BinaryDictBlockBuilder::Add(const uint8_t* vals, size_t count) { + if (mode_ == kCodeWordMode) { + return AddCodeWords(vals, count); + } else { + DCHECK_EQ(mode_, kPlainBinaryMode); + return data_builder_->Add(vals, count); + } +} + +Status BinaryDictBlockBuilder::AppendExtraInfo(CFileWriter* c_writer, CFileFooterPB* footer) { + Slice dict_slice = dict_block_.Finish(0); + + std::vector dict_v; + dict_v.push_back(dict_slice); + + BlockPointer ptr; + Status s = c_writer->AppendDictBlock(dict_v, &ptr, "Append dictionary block"); + if (!s.ok()) { + LOG(WARNING) << "Unable to append block to file: " << s.ToString(); + return s; + } + ptr.CopyToPB(footer->mutable_dict_block_ptr()); + return Status::OK(); +} + +size_t BinaryDictBlockBuilder::Count() const { + return data_builder_->Count(); +} + +Status BinaryDictBlockBuilder::GetFirstKey(void* key_void) const { + if (mode_ == kCodeWordMode) { + CHECK(finished_); + Slice* slice = reinterpret_cast(key_void); + *slice = Slice(first_key_); + return Status::OK(); + } else { + DCHECK_EQ(mode_, kPlainBinaryMode); + return data_builder_->GetFirstKey(key_void); + } +} + +//////////////////////////////////////////////////////////// +// Decoding +//////////////////////////////////////////////////////////// + +BinaryDictBlockDecoder::BinaryDictBlockDecoder(Slice slice, CFileIterator* iter) + : data_(std::move(slice)), + parsed_(false) { + dict_decoder_ = iter->GetDictDecoder(); +} + +Status BinaryDictBlockDecoder::ParseHeader() { + CHECK(!parsed_); + + if (data_.size() < kMinHeaderSize) { + return 
Status::Corruption( + strings::Substitute("not enough bytes for header: dictionary block header " + "size ($0) less than minimum possible header length ($1)", + data_.size(), kMinHeaderSize)); + } + + bool valid = tight_enum_test_cast(DecodeFixed32(&data_[0]), &mode_); + if (PREDICT_FALSE(!valid)) { + return Status::Corruption("header Mode information corrupted"); + } + Slice content(data_.data() + 4, data_.size() - 4); + + if (mode_ == kCodeWordMode) { + data_decoder_.reset(new BShufBlockDecoder(content)); + } else { + if (mode_ != kPlainBinaryMode) { + return Status::Corruption("Unrecognized Dictionary encoded data block header"); + } + data_decoder_.reset(new BinaryPlainBlockDecoder(content)); + } + + RETURN_NOT_OK(data_decoder_->ParseHeader()); + parsed_ = true; + return Status::OK(); +} + +void BinaryDictBlockDecoder::SeekToPositionInBlock(uint pos) { + data_decoder_->SeekToPositionInBlock(pos); +} + +Status BinaryDictBlockDecoder::SeekAtOrAfterValue(const void* value_void, bool* exact) { + if (mode_ == kCodeWordMode) { + DCHECK(value_void != nullptr); + Status s = dict_decoder_->SeekAtOrAfterValue(value_void, exact); + if (!s.ok()) { + // This case means the value_void is larger that the largest key + // in the dictionary block. 
Therefore, it is impossible to be in + // the current data block, and we adjust the index to be the end + // of the block + data_decoder_->SeekToPositionInBlock(data_decoder_->Count() - 1); + return s; + } + + size_t index = dict_decoder_->GetCurrentIndex(); + bool tmp; + return data_decoder_->SeekAtOrAfterValue(&index, &tmp); + } else { + DCHECK_EQ(mode_, kPlainBinaryMode); + return data_decoder_->SeekAtOrAfterValue(value_void, exact); + } +} + +Status BinaryDictBlockDecoder::CopyNextDecodeStrings(size_t* n, ColumnDataView* dst) { + DCHECK(parsed_); + CHECK_EQ(dst->type_info()->physical_type(), BINARY); + DCHECK_LE(*n, dst->nrows()); + DCHECK_EQ(dst->stride(), sizeof(Slice)); + + Arena* out_arena = dst->arena(); + Slice* out = reinterpret_cast(dst->data()); + + codeword_buf_.resize((*n)*sizeof(uint32_t)); + + // Copy the codewords into a temporary buffer first. + // And then Copy the strings corresponding to the codewords to the destination buffer. + BShufBlockDecoder* d_bptr = down_cast*>(data_decoder_.get()); + RETURN_NOT_OK(d_bptr->CopyNextValuesToArray(n, codeword_buf_.data())); + + for (int i = 0; i < *n; i++) { + uint32_t codeword = *reinterpret_cast(&codeword_buf_[i*sizeof(uint32_t)]); + Slice elem = dict_decoder_->string_at_index(codeword); + CHECK(out_arena->RelocateSlice(elem, out)); + out++; + } + return Status::OK(); +} + +Status BinaryDictBlockDecoder::CopyNextValues(size_t* n, ColumnDataView* dst) { + if (mode_ == kCodeWordMode) { + return CopyNextDecodeStrings(n, dst); + } else { + DCHECK_EQ(mode_, kPlainBinaryMode); + return data_decoder_->CopyNextValues(n, dst); + } +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/binary_dict_block.h b/src/kudu/cfile/binary_dict_block.h new file mode 100644 index 000000000000..77407958d3d0 --- /dev/null +++ b/src/kudu/cfile/binary_dict_block.h @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Dictionary encoding for strings. There is only one dictionary block +// for all the data blocks within a cfile. +// layout for dictionary encoded block: +// Either header + embedded codeword block, which can be encoded with any +// int blockbuilder, when mode_ = kCodeWordMode. +// Or header + embedded StringPlainBlock, when mode_ = kPlainStringMode. +// Data blocks start with mode_ = kCodeWordMode, when the the size of dictionary +// block go beyond the option_->block_size, the subsequent data blocks will switch +// to string plain block automatically. + +// You can embed any int block builder encoding formats, such as group-varint, +// bitshuffle. Currently, we use bitshuffle builder for codewords. +// +// To use other block builder/decoder, just make sure that BlockDecoder has +// interface CopyNextValuesToArray(size_t*, uint8_t*). To do that, just replace +// BShufBuilder/Decoder is ok. 
+// +// +#ifndef KUDU_CFILE_BINARY_DICT_BLOCK_H +#define KUDU_CFILE_BINARY_DICT_BLOCK_H + +#include +#include +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/block_pointer.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/binary_plain_block.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/util/faststring.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { +class Arena; +namespace cfile { + +struct WriterOptions; + +// Header Mode type +enum DictEncodingMode { + DictEncodingMode_min = 1, + kCodeWordMode = 1, + kPlainBinaryMode = 2, + DictEncodingMode_max = 2 +}; + +class BinaryDictBlockBuilder : public BlockBuilder { + public: + explicit BinaryDictBlockBuilder(const WriterOptions* options); + + bool IsBlockFull(size_t limit) const OVERRIDE; + + // Append the dictionary block for the current cfile to the end of the cfile and set the footer + // accordingly. + Status AppendExtraInfo(CFileWriter* c_writer, CFileFooterPB* footer) OVERRIDE; + + int Add(const uint8_t* vals, size_t count) OVERRIDE; + + Slice Finish(rowid_t ordinal_pos) OVERRIDE; + + void Reset() OVERRIDE; + + size_t Count() const OVERRIDE; + + Status GetFirstKey(void* key) const OVERRIDE; + + static const size_t kMaxHeaderSize = sizeof(uint32_t) * 1; + + private: + int AddCodeWords(const uint8_t* vals, size_t count); + + faststring buffer_; + bool finished_; + const WriterOptions* options_; + + gscoped_ptr data_builder_; + + // dict_block_, dictionary_, dictionary_strings_arena_ + // is related to the dictionary block (one per cfile). + // They should NOT be clear in the Reset() method. + BinaryPlainBlockBuilder dict_block_; + + std::unordered_map > dictionary_; + // Memory to hold the actual content for strings in the dictionary_. + // + // The size of it should be bigger than the size limit for dictionary block + // (e.g option_->block_size). 
+ // + // Currently, it can hold at most 64MB content. + Arena dictionary_strings_arena_; + + DictEncodingMode mode_; + + // First key when mode_ = kCodeWodeMode + faststring first_key_; +}; + +class CFileIterator; + +class BinaryDictBlockDecoder : public BlockDecoder { + public: + explicit BinaryDictBlockDecoder(Slice slice, CFileIterator* iter); + + virtual Status ParseHeader() OVERRIDE; + virtual void SeekToPositionInBlock(uint pos) OVERRIDE; + virtual Status SeekAtOrAfterValue(const void* value, bool* exact_match) OVERRIDE; + Status CopyNextValues(size_t* n, ColumnDataView* dst) OVERRIDE; + + virtual bool HasNext() const OVERRIDE { + return data_decoder_->HasNext(); + } + + virtual size_t Count() const OVERRIDE { + return data_decoder_->Count(); + } + + virtual size_t GetCurrentIndex() const OVERRIDE { + return data_decoder_->GetCurrentIndex(); + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + return data_decoder_->GetFirstRowId(); + } + + static const size_t kMinHeaderSize = sizeof(uint32_t) * 1; + + private: + Status CopyNextDecodeStrings(size_t* n, ColumnDataView* dst); + + Slice data_; + bool parsed_; + + // Dictionary block decoder + BinaryPlainBlockDecoder* dict_decoder_; + + gscoped_ptr data_decoder_; + + DictEncodingMode mode_; + + // buffer to hold the codewords, needed by CopyNextDecodeStrings() + faststring codeword_buf_; + +}; + +} // namespace cfile +} // namespace kudu + +// Defined for tight_enum_test_cast<> -- has to be defined outside of any namespace. +MAKE_ENUM_LIMITS(kudu::cfile::DictEncodingMode, kudu::cfile::DictEncodingMode_min, + kudu::cfile::DictEncodingMode_max); + +#endif // KUDU_CFILE_BINARY_DICT_BLOCK_H diff --git a/src/kudu/cfile/binary_plain_block.cc b/src/kudu/cfile/binary_plain_block.cc new file mode 100644 index 000000000000..324815800848 --- /dev/null +++ b/src/kudu/cfile/binary_plain_block.cc @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/cfile/binary_plain_block.h" + +#include +#include + +#include "kudu/cfile/cfile_util.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/group_varint-inl.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { +namespace cfile { + +BinaryPlainBlockBuilder::BinaryPlainBlockBuilder(const WriterOptions *options) + : end_of_data_offset_(0), + size_estimate_(0), + options_(options) { + Reset(); +} + +void BinaryPlainBlockBuilder::Reset() { + offsets_.clear(); + buffer_.clear(); + buffer_.resize(kMaxHeaderSize); + buffer_.reserve(options_->storage_attributes.cfile_block_size); + + size_estimate_ = kMaxHeaderSize; + end_of_data_offset_ = kMaxHeaderSize; + finished_ = false; +} + +bool BinaryPlainBlockBuilder::IsBlockFull(size_t limit) const { + return size_estimate_ > limit; +} + +Slice BinaryPlainBlockBuilder::Finish(rowid_t ordinal_pos) { + finished_ = true; + + size_t offsets_pos = buffer_.size(); + + // Set up the header + InlineEncodeFixed32(&buffer_[0], ordinal_pos); + InlineEncodeFixed32(&buffer_[4], offsets_.size()); + InlineEncodeFixed32(&buffer_[8], offsets_pos); + + // 
append the offsets, if non-empty + if (!offsets_.empty()) { + coding::AppendGroupVarInt32Sequence(&buffer_, 0, &offsets_[0], offsets_.size()); + } + + return Slice(buffer_); +} + +int BinaryPlainBlockBuilder::Add(const uint8_t *vals, size_t count) { + DCHECK(!finished_); + DCHECK_GT(count, 0); + size_t i = 0; + + // If the block is full, should stop adding more items. + while (!IsBlockFull(options_->storage_attributes.cfile_block_size) && i < count) { + + // Every fourth entry needs a gvint selector byte + // TODO: does it cost a lot to account these things specifically? + // maybe cheaper to just over-estimate - allocation is cheaper than math? + if (offsets_.size() % 4 == 0) { + size_estimate_++; + } + + const Slice *src = reinterpret_cast(vals); + size_t offset = buffer_.size(); + offsets_.push_back(offset); + size_estimate_ += coding::CalcRequiredBytes32(offset); + + buffer_.append(src->data(), src->size()); + size_estimate_ += src->size(); + + i++; + vals += sizeof(Slice); + } + + end_of_data_offset_ = buffer_.size(); + + return i; +} + + +size_t BinaryPlainBlockBuilder::Count() const { + return offsets_.size(); +} + +Status BinaryPlainBlockBuilder::GetFirstKey(void *key_void) const { + CHECK(finished_); + + Slice *slice = reinterpret_cast(key_void); + + if (offsets_.empty()) { + return Status::NotFound("no keys in data block"); + } + + if (PREDICT_FALSE(offsets_.size() == 1)) { + *slice = Slice(&buffer_[kMaxHeaderSize], + end_of_data_offset_ - kMaxHeaderSize); + } else { + *slice = Slice(&buffer_[kMaxHeaderSize], + offsets_[1] - offsets_[0]); + } + return Status::OK(); +} + +//////////////////////////////////////////////////////////// +// Decoding +//////////////////////////////////////////////////////////// + +BinaryPlainBlockDecoder::BinaryPlainBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + num_elems_(0), + ordinal_pos_base_(0), + cur_idx_(0) { +} + +Status BinaryPlainBlockDecoder::ParseHeader() { + CHECK(!parsed_); + + if 
(data_.size() < kMinHeaderSize) { + return Status::Corruption( + strings::Substitute("not enough bytes for header: string block header " + "size ($0) less than minimum possible header length ($1)", + data_.size(), kMinHeaderSize)); + } + + // Decode header. + ordinal_pos_base_ = DecodeFixed32(&data_[0]); + num_elems_ = DecodeFixed32(&data_[4]); + size_t offsets_pos = DecodeFixed32(&data_[8]); + + // Sanity check. + if (offsets_pos > data_.size()) { + return Status::Corruption( + StringPrintf("offsets_pos %ld > block size %ld in plain string block", + offsets_pos, data_.size())); + } + + // Decode the string offsets themselves + const uint8_t *p = data_.data() + offsets_pos; + const uint8_t *limit = data_.data() + data_.size(); + + offsets_.clear(); + offsets_.reserve(num_elems_); + + size_t rem = num_elems_; + while (rem >= 4) { + uint32_t ints[4]; + if (p + 16 < limit) { + p = coding::DecodeGroupVarInt32_SSE(p, &ints[0], &ints[1], &ints[2], &ints[3]); + } else { + p = coding::DecodeGroupVarInt32_SlowButSafe(p, &ints[0], &ints[1], &ints[2], &ints[3]); + } + if (p > limit) { + LOG(WARNING) << "bad block: " << HexDump(data_); + return Status::Corruption( + StringPrintf("unable to decode offsets in block")); + } + + offsets_.push_back(ints[0]); + offsets_.push_back(ints[1]); + offsets_.push_back(ints[2]); + offsets_.push_back(ints[3]); + rem -= 4; + } + + if (rem > 0) { + uint32_t ints[4]; + p = coding::DecodeGroupVarInt32_SlowButSafe(p, &ints[0], &ints[1], &ints[2], &ints[3]); + if (p > limit) { + LOG(WARNING) << "bad block: " << HexDump(data_); + return Status::Corruption( + StringPrintf("unable to decode offsets in block")); + } + + for (int i = 0; i < rem; i++) { + offsets_.push_back(ints[i]); + } + } + + // Add one extra entry pointing after the last item to make the indexing easier. 
+ offsets_.push_back(offsets_pos); + + parsed_ = true; + + return Status::OK(); +} + +void BinaryPlainBlockDecoder::SeekToPositionInBlock(uint pos) { + if (PREDICT_FALSE(num_elems_ == 0)) { + DCHECK_EQ(0, pos); + return; + } + + DCHECK_LE(pos, num_elems_); + cur_idx_ = pos; +} + +Status BinaryPlainBlockDecoder::SeekAtOrAfterValue(const void *value_void, bool *exact) { + DCHECK(value_void != nullptr); + + const Slice &target = *reinterpret_cast(value_void); + + // Binary search in restart array to find the first restart point + // with a key >= target + int32_t left = 0; + int32_t right = num_elems_; + while (left != right) { + uint32_t mid = (left + right) / 2; + Slice mid_key(string_at_index(mid)); + int c = mid_key.compare(target); + if (c < 0) { + left = mid + 1; + } else if (c > 0) { + right = mid; + } else { + cur_idx_ = mid; + *exact = true; + return Status::OK(); + } + } + *exact = false; + cur_idx_ = left; + if (cur_idx_ == num_elems_) { + return Status::NotFound("after last key in block"); + } + + return Status::OK(); +} + +Status BinaryPlainBlockDecoder::CopyNextValues(size_t *n, ColumnDataView *dst) { + DCHECK(parsed_); + CHECK_EQ(dst->type_info()->physical_type(), BINARY); + DCHECK_LE(*n, dst->nrows()); + DCHECK_EQ(dst->stride(), sizeof(Slice)); + + Arena *out_arena = dst->arena(); + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t max_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + + Slice *out = reinterpret_cast(dst->data()); + size_t i; + for (i = 0; i < max_fetch; i++) { + Slice elem(string_at_index(cur_idx_)); + + // TODO: in a lot of cases, we might be able to get away with the decoder + // owning it and not truly copying. But, we should extend the CopyNextValues + // API so that the caller can specify if they truly _need_ copies or not. 
+ CHECK(out_arena->RelocateSlice(elem, out)); + out++; + cur_idx_++; + } + + *n = i; + return Status::OK(); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/binary_plain_block.h b/src/kudu/cfile/binary_plain_block.h new file mode 100644 index 000000000000..97a9e396d201 --- /dev/null +++ b/src/kudu/cfile/binary_plain_block.h @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Simplistic block encoding for strings. 
+// +// The block consists of: +// Header: +// ordinal_pos (32-bit fixed) +// num_elems (32-bit fixed) +// offsets_pos (32-bit fixed): position of the first offset, relative to block start +// Strings: +// raw strings that were written +// Offsets: [pointed to by offsets_pos] +// gvint-encoded offsets pointing to the beginning of each string +#ifndef KUDU_CFILE_BINARY_PLAIN_BLOCK_H +#define KUDU_CFILE_BINARY_PLAIN_BLOCK_H + +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/util/faststring.h" + +namespace kudu { +namespace cfile { + +struct WriterOptions; + +class BinaryPlainBlockBuilder : public BlockBuilder { + public: + explicit BinaryPlainBlockBuilder(const WriterOptions *options); + + bool IsBlockFull(size_t limit) const OVERRIDE; + + int Add(const uint8_t *vals, size_t count) OVERRIDE; + + // Return a Slice which represents the encoded data. + // + // This Slice points to internal data of this class + // and becomes invalid after the builder is destroyed + // or after Finish() is called again. + Slice Finish(rowid_t ordinal_pos) OVERRIDE; + + void Reset() OVERRIDE; + + size_t Count() const OVERRIDE; + + // Return the first added key. + // key should be a Slice * + Status GetFirstKey(void *key) const OVERRIDE; + + // Length of a header. 
+ static const size_t kMaxHeaderSize = sizeof(uint32_t) * 3; + + private: + faststring buffer_; + + size_t end_of_data_offset_; + size_t size_estimate_; + + // Offsets of each entry, relative to the start of the block + std::vector offsets_; + + bool finished_; + + const WriterOptions *options_; + +}; + +class BinaryPlainBlockDecoder : public BlockDecoder { + public: + explicit BinaryPlainBlockDecoder(Slice slice); + + virtual Status ParseHeader() OVERRIDE; + virtual void SeekToPositionInBlock(uint pos) OVERRIDE; + virtual Status SeekAtOrAfterValue(const void *value, + bool *exact_match) OVERRIDE; + Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE; + + virtual bool HasNext() const OVERRIDE { + DCHECK(parsed_); + return cur_idx_ < num_elems_; + } + + virtual size_t Count() const OVERRIDE { + DCHECK(parsed_); + return num_elems_; + } + + virtual size_t GetCurrentIndex() const OVERRIDE { + DCHECK(parsed_); + return cur_idx_; + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + return ordinal_pos_base_; + } + + Slice string_at_index(size_t idx) const { + const uint32_t offset = offsets_[idx]; + uint32_t len = offsets_[idx + 1] - offset; + return Slice(&data_[offset], len); + } + + // Minimum length of a header. + static const size_t kMinHeaderSize = sizeof(uint32_t) * 3; + + private: + Slice data_; + bool parsed_; + + // The parsed offsets. + // This array also contains one extra offset at the end, pointing + // _after_ the last entry. This makes the code much simpler. + std::vector offsets_; + + uint32_t num_elems_; + rowid_t ordinal_pos_base_; + + // Index of the currently seeked element in the block. 
+ uint32_t cur_idx_; +}; + +} // namespace cfile +} // namespace kudu + +#endif // KUDU_CFILE_BINARY_PREFIX_BLOCK_H diff --git a/src/kudu/cfile/binary_prefix_block.cc b/src/kudu/cfile/binary_prefix_block.cc new file mode 100644 index 000000000000..e316aa682408 --- /dev/null +++ b/src/kudu/cfile/binary_prefix_block.cc @@ -0,0 +1,569 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/cfile/binary_prefix_block.h" + +#include +#include + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/group_varint-inl.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/slice.h" + +namespace kudu { +namespace cfile { + +using kudu::coding::AppendGroupVarInt32; +using strings::Substitute; + +//////////////////////////////////////////////////////////// +// Utility code used by both encoding and decoding +//////////////////////////////////////////////////////////// + +static const uint8_t *DecodeEntryLengths( + const uint8_t *ptr, const uint8_t *limit, + uint32_t *shared, uint32_t *non_shared) { + + if ((ptr = GetVarint32Ptr(ptr, limit, shared)) == nullptr) return nullptr; + if ((ptr = GetVarint32Ptr(ptr, limit, non_shared)) == nullptr) return nullptr; + if (limit - ptr < *non_shared) { + return nullptr; + } + + return ptr; +} + +//////////////////////////////////////////////////////////// +// StringPrefixBlockBuilder encoding +//////////////////////////////////////////////////////////// + +BinaryPrefixBlockBuilder::BinaryPrefixBlockBuilder(const WriterOptions *options) + : val_count_(0), + vals_since_restart_(0), + finished_(false), + options_(options) { + Reset(); +} + +void BinaryPrefixBlockBuilder::Reset() { + finished_ = false; + val_count_ = 0; + vals_since_restart_ = 0; + + buffer_.clear(); + buffer_.resize(kHeaderReservedLength); + buffer_.reserve(options_->storage_attributes.cfile_block_size); + + restarts_.clear(); + last_val_.clear(); +} + +bool BinaryPrefixBlockBuilder::IsBlockFull(size_t limit) const { + // TODO: take restarts size into account + return buffer_.size() > limit; +} + +Slice BinaryPrefixBlockBuilder::Finish(rowid_t ordinal_pos) { + CHECK(!finished_) << "already finished"; + DCHECK_GE(buffer_.size(), kHeaderReservedLength); + + 
faststring header(kHeaderReservedLength); + + AppendGroupVarInt32(&header, val_count_, ordinal_pos, + options_->block_restart_interval, 0); + + int header_encoded_len = header.size(); + + // Copy the header into the buffer at the right spot. + // Since the header is likely shorter than the amount of space + // reserved for it, need to find where it fits: + int header_offset = kHeaderReservedLength - header_encoded_len; + DCHECK_GE(header_offset, 0); + uint8_t *header_dst = buffer_.data() + header_offset; + strings::memcpy_inlined(header_dst, header.data(), header_encoded_len); + + // Serialize the restart points. + // Note that the values stored in restarts_ are relative to the + // start of the *buffer*, which is not the same as the start of + // the block. So, we must subtract the header offset from each. + buffer_.reserve(buffer_.size() + + restarts_.size() * sizeof(uint32_t) // the data + + sizeof(uint32_t)); // the restart count); + for (uint32_t restart : restarts_) { + DCHECK_GE(static_cast(restart), header_offset); + uint32_t relative_to_block = restart - header_offset; + VLOG(2) << "appending restart " << relative_to_block; + InlinePutFixed32(&buffer_, relative_to_block); + } + InlinePutFixed32(&buffer_, restarts_.size()); + + finished_ = true; + return Slice(&buffer_[header_offset], buffer_.size() - header_offset); +} + +int BinaryPrefixBlockBuilder::Add(const uint8_t *vals, size_t count) { + DCHECK_GT(count, 0); + DCHECK(!finished_); + DCHECK_LE(vals_since_restart_, options_->block_restart_interval); + + int added = 0; + const Slice* slices = reinterpret_cast(vals); + Slice prev_val(last_val_); + // We generate a static call to IsBlockFull() to avoid the vtable lookup + // in this hot path. 
+ while (!BinaryPrefixBlockBuilder::IsBlockFull(options_->storage_attributes.cfile_block_size) && + added < count) { + const Slice val = slices[added]; + + int old_size = buffer_.size(); + buffer_.resize(old_size + 5 * 2 + val.size()); + uint8_t* dst_p = &buffer_[old_size]; + + size_t shared = 0; + if (vals_since_restart_ < options_->block_restart_interval) { + // See how much sharing to do with previous string + shared = CommonPrefixLength(prev_val, val); + } else { + // Restart compression + restarts_.push_back(old_size); + vals_since_restart_ = 0; + } + const size_t non_shared = val.size() - shared; + + // Add "" to buffer_ + dst_p = InlineEncodeVarint32(dst_p, shared); + dst_p = InlineEncodeVarint32(dst_p, non_shared); + + // Add string delta to buffer_ + memcpy(dst_p, val.data() + shared, non_shared); + dst_p += non_shared; + + // Chop off the extra size on the end of the buffer. + buffer_.resize(dst_p - &buffer_[0]); + + // Update state + prev_val = val; + added++; + vals_since_restart_++; + } + + last_val_.assign_copy(prev_val.data(), prev_val.size()); + val_count_ += added; + + return added; +} + +size_t BinaryPrefixBlockBuilder::Count() const { + return val_count_; +} + + +Status BinaryPrefixBlockBuilder::GetFirstKey(void *key) const { + if (val_count_ == 0) { + return Status::NotFound("no keys in data block"); + } + + const uint8_t *p = &buffer_[kHeaderReservedLength]; + uint32_t shared, non_shared; + p = DecodeEntryLengths(p, &buffer_[buffer_.size()], &shared, &non_shared); + if (p == nullptr) { + return Status::Corruption("Could not decode first entry in string block"); + } + + CHECK(shared == 0) << "first entry in string block had a non-zero 'shared': " + << shared; + + *reinterpret_cast(key) = Slice(p, non_shared); + return Status::OK(); +} + +size_t BinaryPrefixBlockBuilder::CommonPrefixLength(const Slice& slice_a, + const Slice& slice_b) { + // This implementation is modeled after strings::fastmemcmp_inlined(). 
+ int len = std::min(slice_a.size(), slice_b.size()); + const uint8_t* a = slice_a.data(); + const uint8_t* b = slice_b.data(); + const uint8_t* a_limit = a + len; + + const size_t sizeof_uint64 = sizeof(uint64_t); + // Move forward 8 bytes at a time until finding an unequal portion. + while (a + sizeof_uint64 <= a_limit && + UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { + a += sizeof_uint64; + b += sizeof_uint64; + } + + // Same, 4 bytes at a time. + const size_t sizeof_uint32 = sizeof(uint32_t); + while (a + sizeof_uint32 <= a_limit && + UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { + a += sizeof_uint32; + b += sizeof_uint32; + } + + // Now one byte at a time. We could do a 2-bytes-at-a-time loop, + // but we're following the example of fastmemcmp_inlined(). The benefit of + // 2-at-a-time likely doesn't outweigh the cost of added code size. + while (a < a_limit && + *a == *b) { + a++; + b++; + } + + return a - slice_a.data(); +} + +//////////////////////////////////////////////////////////// +// StringPrefixBlockDecoder +//////////////////////////////////////////////////////////// + +BinaryPrefixBlockDecoder::BinaryPrefixBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + num_elems_(0), + ordinal_pos_base_(0), + num_restarts_(0), + restarts_(nullptr), + data_start_(nullptr), + cur_idx_(0), + next_ptr_(nullptr) { +} + +Status BinaryPrefixBlockDecoder::ParseHeader() { + // First parse the actual header. 
+ uint32_t unused; + + // Make sure the Slice we are referring to is at least the size of the + // minimum possible header + if (PREDICT_FALSE(data_.size() < kMinHeaderSize)) { + return Status::Corruption( + strings::Substitute("not enough bytes for header: string block header " + "size ($0) less than minimum possible header length ($1)", + data_.size(), kMinHeaderSize)); + // TODO include hexdump + } + + // Make sure the actual size of the group varints in the Slice we are + // referring to is as big as it claims to be + size_t header_size = coding::DecodeGroupVarInt32_GetGroupSize(data_.data()); + if (PREDICT_FALSE(data_.size() < header_size)) { + return Status::Corruption( + strings::Substitute("string block header size ($0) less than length " + "from in header ($1)", data_.size(), header_size)); + // TODO include hexdump + } + + // We should have enough space in the Slice to decode the group varints + // safely now + data_start_ = + coding::DecodeGroupVarInt32_SlowButSafe( + data_.data(), + &num_elems_, &ordinal_pos_base_, + &restart_interval_, &unused); + + // Then the footer, which points us to the restarts array + num_restarts_ = DecodeFixed32( + data_.data() + data_.size() - sizeof(uint32_t)); + + // sanity check the restarts size + uint32_t restarts_size = num_restarts_ * sizeof(uint32_t); + if (restarts_size > data_.size()) { + return Status::Corruption( + StringPrintf("restart count %d too big to fit in block size %d", + num_restarts_, static_cast(data_.size()))); + } + + // TODO: check relationship between num_elems, num_restarts_, + // and restart_interval_ + + restarts_ = reinterpret_cast( + data_.data() + data_.size() + - sizeof(uint32_t) // rewind before the restart length + - restarts_size); + + SeekToStart(); + parsed_ = true; + return Status::OK(); +} + +void BinaryPrefixBlockDecoder::SeekToStart() { + SeekToRestartPoint(0); +} + +void BinaryPrefixBlockDecoder::SeekToPositionInBlock(uint pos) { + if (PREDICT_FALSE(num_elems_ == 0)) { + 
DCHECK_EQ(0, pos); + return; + } + + DCHECK_LE(pos, num_elems_); + + int target_restart = pos/restart_interval_; + SeekToRestartPoint(target_restart); + + // Seek forward to the right index + + // TODO: Seek calls should return a Status + CHECK_OK(SkipForward(pos - cur_idx_)); + DCHECK_EQ(cur_idx_, pos); +} + +// Get the pointer to the entry corresponding to the given restart +// point. Note that the restart points in the file do not include +// the '0' restart point, since that is simply the beginning of +// the data and hence a waste of space. So, 'idx' may range from +// 0 (first record) through num_restarts_ (last recorded restart point) +const uint8_t * BinaryPrefixBlockDecoder::GetRestartPoint(uint32_t idx) const { + DCHECK_LE(idx, num_restarts_); + + if (PREDICT_TRUE(idx > 0)) { + return data_.data() + restarts_[idx - 1]; + } else { + return data_start_; + } +} + +// Note: see GetRestartPoint() for 'idx' semantics +void BinaryPrefixBlockDecoder::SeekToRestartPoint(uint32_t idx) { + if (PREDICT_FALSE(num_elems_ == 0)) { + DCHECK_EQ(0, idx); + return; + } + + next_ptr_ = GetRestartPoint(idx); + cur_idx_ = idx * restart_interval_; + CHECK_OK(ParseNextValue()); // TODO: handle corrupted blocks +} + +Status BinaryPrefixBlockDecoder::SeekAtOrAfterValue(const void *value_void, + bool *exact_match) { + DCHECK(value_void != nullptr); + + const Slice &target = *reinterpret_cast(value_void); + + // Binary search in restart array to find the first restart point + // with a key >= target + int32_t left = 0; + int32_t right = num_restarts_; + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + const uint8_t *entry = GetRestartPoint(mid); + uint32_t shared, non_shared; + const uint8_t *key_ptr = DecodeEntryLengths(entry, &shared, &non_shared); + if (key_ptr == nullptr || (shared != 0)) { + string err = + StringPrintf("bad entry restart=%d shared=%d\n", mid, shared) + + HexDump(Slice(entry, 16)); + return Status::Corruption(err); + } + Slice mid_key(key_ptr, 
non_shared); + if (mid_key.compare(target) < 0) { + // Key at "mid" is smaller than "target". Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + // Linear search (within restart block) for first key >= target + SeekToRestartPoint(left); + + while (true) { +#ifndef NDEBUG + VLOG(3) << "loop iter:\n" + << "cur_idx = " << cur_idx_ << "\n" + << "target =" << target.ToString() << "\n" + << "cur_val_=" << Slice(cur_val_).ToString(); +#endif + int cmp = Slice(cur_val_).compare(target); + if (cmp >= 0) { + *exact_match = (cmp == 0); + return Status::OK(); + } + RETURN_NOT_OK(ParseNextValue()); + cur_idx_++; + } +} + +Status BinaryPrefixBlockDecoder::CopyNextValues(size_t *n, ColumnDataView *dst) { + DCHECK(parsed_); + CHECK_EQ(dst->type_info()->physical_type(), BINARY); + + DCHECK_EQ(dst->stride(), sizeof(Slice)); + DCHECK_LE(*n, dst->nrows()); + + Arena *out_arena = dst->arena(); + Slice *out = reinterpret_cast(dst->data()); + + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t i = 0; + size_t max_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + + // Grab the first row, which we've cached from the last call or seek. + const uint8_t *out_data = out_arena->AddSlice(cur_val_); + if (PREDICT_FALSE(out_data == nullptr)) { + return Status::IOError( + "Out of memory", + StringPrintf("Failed to allocate %d bytes in output arena", + static_cast(cur_val_.size()))); + } + + // Put a slice to it in the output array + Slice prev_val(out_data, cur_val_.size()); + *out++ = prev_val; + i++; + cur_idx_++; + + #ifndef NDEBUG + cur_val_.assign_copy("INVALID"); + #endif + + // Now iterate pulling more rows from the block, decoding relative + // to the previous value. 
+ + for (; i < max_fetch; i++) { + Slice copied; + RETURN_NOT_OK(ParseNextIntoArena(prev_val, dst->arena(), &copied)); + *out++ = copied; + prev_val = copied; + cur_idx_++; + } + + // Fetch the next value to be returned, using the last value we fetched + // for the delta. + cur_val_.assign_copy(prev_val.data(), prev_val.size()); + if (cur_idx_ < num_elems_) { + RETURN_NOT_OK(ParseNextValue()); + } else { + next_ptr_ = nullptr; + } + + *n = i; + return Status::OK(); +} + +// Decode the lengths pointed to by 'ptr', doing bounds checking. +// +// Returns a pointer to where the value itself starts. +// Returns NULL if the varints themselves, or the value that +// they prefix extend past the end of the block data. +const uint8_t *BinaryPrefixBlockDecoder::DecodeEntryLengths( + const uint8_t *ptr, uint32_t *shared, uint32_t *non_shared) const { + + // data ends where the restart info begins + const uint8_t *limit = reinterpret_cast(restarts_); + return kudu::cfile::DecodeEntryLengths(ptr, limit, shared, non_shared); +} + +Status BinaryPrefixBlockDecoder::SkipForward(int n) { + DCHECK_LE(cur_idx_ + n, num_elems_) << + "skip(" << n << ") curidx=" << cur_idx_ + << " num_elems=" << num_elems_; + + // If we're seeking exactly to the end of the data, we don't + // need to actually prepare the next value (in fact, it would + // crash). So, short-circuit here. 
+ if (PREDICT_FALSE(cur_idx_ + n == num_elems_)) { + cur_idx_ += n; + return Status::OK(); + } + + // Probably a faster way to implement this using restarts, + for (int i = 0; i < n; i++) { + RETURN_NOT_OK(ParseNextValue()); + cur_idx_++; + } + return Status::OK(); +} + +Status BinaryPrefixBlockDecoder::CheckNextPtr() { + DCHECK(next_ptr_ != nullptr); + + if (PREDICT_FALSE(next_ptr_ == reinterpret_cast(restarts_))) { + DCHECK_EQ(cur_idx_, num_elems_ - 1); + return Status::NotFound("Trying to parse past end of array"); + } + return Status::OK(); +} + +inline Status BinaryPrefixBlockDecoder::ParseNextIntoArena(Slice prev_val, + Arena *dst, + Slice *copied) { + RETURN_NOT_OK(CheckNextPtr()); + uint32_t shared, non_shared; + const uint8_t *val_delta = DecodeEntryLengths(next_ptr_, &shared, &non_shared); + if (val_delta == nullptr) { + return Status::Corruption( + StringPrintf("Could not decode value length data at idx %d", + cur_idx_)); + } + + DCHECK_LE(shared, prev_val.size()) + << "Spcified longer shared amount than previous key length"; + + uint8_t *buf = reinterpret_cast(dst->AllocateBytes(non_shared + shared)); + strings::memcpy_inlined(buf, prev_val.data(), shared); + strings::memcpy_inlined(buf + shared, val_delta, non_shared); + + *copied = Slice(buf, non_shared + shared); + next_ptr_ = val_delta + non_shared; + return Status::OK(); +} + +// Parses the data pointed to by next_ptr_ and stores it in cur_val_ +// Advances next_ptr_ to point to the following values. +// Does not modify cur_idx_ +inline Status BinaryPrefixBlockDecoder::ParseNextValue() { + RETURN_NOT_OK(CheckNextPtr()); + + uint32_t shared, non_shared; + const uint8_t *val_delta = DecodeEntryLengths(next_ptr_, &shared, &non_shared); + if (val_delta == nullptr) { + return Status::Corruption( + StringPrintf("Could not decode value length data at idx %d", + cur_idx_)); + } + + // Chop the current key to the length that is shared with the next + // key, then append the delta portion. 
+ DCHECK_LE(shared, cur_val_.size()) + << "Specified longer shared amount than previous key length"; + + cur_val_.resize(shared); + cur_val_.append(val_delta, non_shared); + + DCHECK_EQ(cur_val_.size(), shared + non_shared); + + next_ptr_ = val_delta + non_shared; + return Status::OK(); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/binary_prefix_block.h b/src/kudu/cfile/binary_prefix_block.h new file mode 100644 index 000000000000..ecee9c655c8c --- /dev/null +++ b/src/kudu/cfile/binary_prefix_block.h @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_BINARY_PREFIX_BLOCK_H +#define KUDU_CFILE_BINARY_PREFIX_BLOCK_H + +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/common/rowid.h" + +namespace kudu { + +class Arena; +class ColumnDataView; + +namespace cfile { + +struct WriterOptions; + +// Encoding for data blocks of binary data that have common prefixes. 
+// This encodes in a manner similar to LevelDB (prefix coding) +class BinaryPrefixBlockBuilder : public BlockBuilder { + public: + explicit BinaryPrefixBlockBuilder(const WriterOptions *options); + + bool IsBlockFull(size_t limit) const OVERRIDE; + + int Add(const uint8_t *vals, size_t count) OVERRIDE; + + // Return a Slice which represents the encoded data. + // + // This Slice points to internal data of this class + // and becomes invalid after the builder is destroyed + // or after Finish() is called again. + Slice Finish(rowid_t ordinal_pos) OVERRIDE; + + void Reset() OVERRIDE; + + size_t Count() const OVERRIDE; + + // Return the first added key. + // key should be a Slice * + Status GetFirstKey(void *key) const OVERRIDE; + + private: + // Return the length of the common prefix shared by the two strings. + static size_t CommonPrefixLength(const Slice& a, const Slice& b); + + faststring buffer_; + faststring last_val_; + + // Restart points, offsets relative to start of block + std::vector restarts_; + + int val_count_; + int vals_since_restart_; + bool finished_; + + const WriterOptions *options_; + + // Maximum length of a header. + // We leave this much space at the start of the buffer before + // accumulating any data, so we can later fill in the variable-length + // header. 
+ // Currently four varints, so maximum is 20 bytes + static const size_t kHeaderReservedLength = 20; +}; + +// Decoder for BINARY type, PREFIX encoding +class BinaryPrefixBlockDecoder : public BlockDecoder { + public: + explicit BinaryPrefixBlockDecoder(Slice slice); + + virtual Status ParseHeader() OVERRIDE; + virtual void SeekToPositionInBlock(uint pos) OVERRIDE; + virtual Status SeekAtOrAfterValue(const void *value, + bool *exact_match) OVERRIDE; + Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE; + + virtual bool HasNext() const OVERRIDE { + DCHECK(parsed_); + return cur_idx_ < num_elems_; + } + + virtual size_t Count() const OVERRIDE { + DCHECK(parsed_); + return num_elems_; + } + + virtual size_t GetCurrentIndex() const OVERRIDE { + DCHECK(parsed_); + return cur_idx_; + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + DCHECK(parsed_); + return ordinal_pos_base_; + } + + // Minimum length of a header. + // Currently one group of varints for an empty block, so minimum is 5 bytes + static const size_t kMinHeaderSize = 5; + + private: + Status SkipForward(int n); + Status CheckNextPtr(); + Status ParseNextValue(); + Status ParseNextIntoArena(Slice prev_val, Arena *dst, Slice *copied); + + const uint8_t *DecodeEntryLengths(const uint8_t *ptr, + uint32_t *shared, + uint32_t *non_shared) const; + + const uint8_t *GetRestartPoint(uint32_t idx) const; + void SeekToRestartPoint(uint32_t idx); + + void SeekToStart(); + + Slice data_; + + bool parsed_; + + uint32_t num_elems_; + rowid_t ordinal_pos_base_; + + uint32_t num_restarts_; + const uint32_t *restarts_; + uint32_t restart_interval_; + + const uint8_t *data_start_; + + // Index of the next row to be returned by CopyNextValues, relative to + // the block's base offset. + // When the block is exhausted, cur_idx_ == num_elems_ + uint32_t cur_idx_; + + // The first value to be returned by the next CopyNextValues(). + faststring cur_val_; + + // The ptr pointing to the next element to parse. 
This is for the entry + // following cur_val_ + // This is advanced by ParseNextValue() + const uint8_t *next_ptr_; +}; + +} // namespace cfile +} // namespace kudu + +#endif // KUDU_CFILE_BINARY_PREFIX_BLOCK_H diff --git a/src/kudu/cfile/block_cache-test.cc b/src/kudu/cfile/block_cache-test.cc new file mode 100644 index 000000000000..e155dd52fa88 --- /dev/null +++ b/src/kudu/cfile/block_cache-test.cc @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/cfile/block_cache.h" +#include "kudu/util/cache.h" +#include "kudu/util/slice.h" + +namespace kudu { +namespace cfile { + +static const char *DATA_TO_CACHE = "hello world"; + +TEST(TestBlockCache, TestBasics) { + size_t data_size = strlen(DATA_TO_CACHE) + 1; + BlockCache cache(512 * 1024 * 1024); + BlockCache::FileId id(1234); + + uint8_t* data = cache.Allocate(data_size); + memcpy(data, DATA_TO_CACHE, data_size); + + // Lookup something missing from cache + { + BlockCacheHandle handle; + ASSERT_FALSE(cache.Lookup(id, 1, Cache::EXPECT_IN_CACHE, &handle)); + ASSERT_FALSE(handle.valid()); + } + + // Insert and re-lookup + BlockCacheHandle inserted_handle; + cache.Insert(id, 1, Slice(data, data_size), &inserted_handle); + ASSERT_TRUE(inserted_handle.valid()); + + BlockCacheHandle retrieved_handle; + ASSERT_TRUE(cache.Lookup(id, 1, Cache::EXPECT_IN_CACHE, &retrieved_handle)); + ASSERT_TRUE(retrieved_handle.valid()); + ASSERT_EQ(retrieved_handle.data().data(), data); + + // Ensure that a lookup for a different offset doesn't + // return this data. + ASSERT_FALSE(cache.Lookup(id, 3, Cache::EXPECT_IN_CACHE, &retrieved_handle)); + +} + + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/block_cache.cc b/src/kudu/cfile/block_cache.cc new file mode 100644 index 000000000000..2bf25bb78c2a --- /dev/null +++ b/src/kudu/cfile/block_cache.cc @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/cfile/block_cache.h" +#include "kudu/gutil/port.h" +#include "kudu/util/cache.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/slice.h" +#include "kudu/util/string_case.h" + +DEFINE_int64(block_cache_capacity_mb, 512, "block cache capacity in MB"); +TAG_FLAG(block_cache_capacity_mb, stable); + +DEFINE_string(block_cache_type, "DRAM", + "Which type of block cache to use for caching data. " + "Valid choices are 'DRAM' or 'NVM'. DRAM, the default, " + "caches data in regular memory. 'NVM' caches data " + "in a memory-mapped file using the NVML library."); +TAG_FLAG(block_cache_type, experimental); + +namespace kudu { + +class MetricEntity; + +namespace cfile { + +struct CacheKey { + CacheKey(BlockCache::FileId file_id, uint64_t offset) : + file_id_(file_id.id()), + offset_(offset) + {} + + const Slice slice() const { + return Slice(reinterpret_cast(this), sizeof(*this)); + } + + uint64_t file_id_; + uint64_t offset_; +} PACKED; + +namespace { +class Deleter : public CacheDeleter { + public: + explicit Deleter(Cache* cache) : cache_(cache) { + } + virtual void Delete(const Slice& slice, void* value) OVERRIDE { + Slice *value_slice = reinterpret_cast(value); + + // The actual data was allocated from the cache's memory + // (i.e. 
it may be in nvm) + cache_->Free(value_slice->mutable_data()); + delete value_slice; + } + private: + Cache* cache_; + DISALLOW_COPY_AND_ASSIGN(Deleter); +}; + +Cache* CreateCache(int64_t capacity) { + CacheType t; + ToUpperCase(FLAGS_block_cache_type, &FLAGS_block_cache_type); + if (FLAGS_block_cache_type == "NVM") { + t = NVM_CACHE; + } else if (FLAGS_block_cache_type == "DRAM") { + t = DRAM_CACHE; + } else { + LOG(FATAL) << "Unknown block cache type: '" << FLAGS_block_cache_type + << "' (expected 'DRAM' or 'NVM')"; + } + return NewLRUCache(t, capacity, "block_cache"); +} + +} // anonymous namespace + +BlockCache::BlockCache() + : cache_(CreateCache(FLAGS_block_cache_capacity_mb * 1024 * 1024)) { + deleter_.reset(new Deleter(cache_.get())); +} + +BlockCache::BlockCache(size_t capacity) + : cache_(CreateCache(capacity)) { + deleter_.reset(new Deleter(cache_.get())); +} + +uint8_t* BlockCache::Allocate(size_t size) { + return cache_->Allocate(size); +} + +void BlockCache::Free(uint8_t* p) { + cache_->Free(p); +} + +uint8_t* BlockCache::MoveToHeap(uint8_t* p, size_t size) { + return cache_->MoveToHeap(p, size); +} + +bool BlockCache::Lookup(FileId file_id, uint64_t offset, Cache::CacheBehavior behavior, + BlockCacheHandle *handle) { + CacheKey key(file_id, offset); + Cache::Handle *h = cache_->Lookup(key.slice(), behavior); + if (h != nullptr) { + handle->SetHandle(cache_.get(), h); + } + return h != nullptr; +} + +bool BlockCache::Insert(FileId file_id, uint64_t offset, const Slice &block_data, + BlockCacheHandle *inserted) { + CacheKey key(file_id, offset); + // Allocate a copy of the value Slice (not the referred-to-data!) + // for insertion in the cache. 
+ gscoped_ptr value(new Slice(block_data)); + Cache::Handle *h = cache_->Insert(key.slice(), value.get(), value->size(), + deleter_.get()); + if (h != nullptr) { + inserted->SetHandle(cache_.get(), h); + ignore_result(value.release()); + } + return h != nullptr; +} + +void BlockCache::StartInstrumentation(const scoped_refptr& metric_entity) { + cache_->SetMetrics(metric_entity); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/block_cache.h b/src/kudu/cfile/block_cache.h new file mode 100644 index 000000000000..6040999472f6 --- /dev/null +++ b/src/kudu/cfile/block_cache.h @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_BLOCK_CACHE_H +#define KUDU_CFILE_BLOCK_CACHE_H + +#include +#include + +#include "kudu/fs/block_id.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/singleton.h" +#include "kudu/util/cache.h" + +DECLARE_string(block_cache_type); + +namespace kudu { + +class MetricRegistry; + +namespace cfile { + +class BlockCacheHandle; + +// Wrapper around kudu::Cache specifically for caching blocks of CFiles. +// Provides a singleton and LRU cache for CFile blocks. 
+class BlockCache { + public: + // BlockId refers to the unique identifier for a Kudu block, that is, for an + // entire CFile. This is different than the block cache's notion of a block, + // which is just a portion of a CFile. + typedef BlockId FileId; + + static BlockCache *GetSingleton() { + return Singleton::get(); + } + + explicit BlockCache(size_t capacity); + + // Lookup the given block in the cache. + // + // If the entry is found, then sets *handle to refer to the entry. + // This object's destructor will release the cache entry so it may be freed again. + // Alternatively, handle->Release() may be used to explicitly release it. + // + // Returns true to indicate that the entry was found, false otherwise. + bool Lookup(FileId file_id, uint64_t offset, + Cache::CacheBehavior behavior, BlockCacheHandle *handle); + + // Insert the given block into the cache. + // + // The data pointed to by Slice should have been allocated using Allocate(). + // After insertion, the block cache owns this pointer and will free it upon + // eviction. + // + // The inserted entry is returned in *inserted. + bool Insert(FileId file_id, uint64_t offset, const Slice &block_data, + BlockCacheHandle *inserted); + + // Pass a metric entity to the cache to start recording metrics. + // This should be called before the block cache starts serving blocks. + // Not calling StartInstrumentation will simply result in no block cache-related metrics. + // Calling StartInstrumentation multiple times will reset the metrics each time. + void StartInstrumentation(const scoped_refptr& metric_entity); + + // Allocate a chunk of memory to hold a value in this cache. + // + // Some cache implementations may allocate the buffer outside of the normal + // heap area. + // + // NOTE: The returned pointer may either be passed to Insert(), MoveToHeap(), or + // Free(). It must NOT be freed using free() or delete[]. 
+ uint8_t* Allocate(size_t size); + + // Move a pointer previously allocated using Allocate() onto the normal heap. + // This is a no-op for a DRAM-based cache, but in other cases may relocate the + // data. + uint8_t* MoveToHeap(uint8_t* p, size_t size); + + // Free a pointer previously allocated using Allocate(). + void Free(uint8_t *p); + + private: + friend class Singleton; + BlockCache(); + + DISALLOW_COPY_AND_ASSIGN(BlockCache); + + // Deleter must be defined before cache_ so that cache_ destructs first. + // (the Cache needs to use the Deleter during destruction) + gscoped_ptr deleter_; + gscoped_ptr cache_; +}; + +// Scoped reference to a block from the block cache. +class BlockCacheHandle { + public: + BlockCacheHandle() : + handle_(NULL) + {} + + ~BlockCacheHandle() { + if (handle_ != NULL) { + Release(); + } + } + + void Release() { + CHECK_NOTNULL(cache_)->Release(CHECK_NOTNULL(handle_)); + handle_ = NULL; + } + + // Swap this handle with another handle. + // This can be useful to transfer ownership of a handle by swapping + // with an empty BlockCacheHandle. + void swap(BlockCacheHandle *dst) { + std::swap(this->cache_, dst->cache_); + std::swap(this->handle_, dst->handle_); + } + + // Return the data in the cached block. + // + // NOTE: this slice is only valid until the block cache handle is + // destructed or explicitly Released(). 
+ const Slice &data() const { + const Slice *slice = reinterpret_cast(cache_->Value(handle_)); + return *slice; + } + + bool valid() const { + return handle_ != NULL; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BlockCacheHandle); + friend class BlockCache; + + void SetHandle(Cache *cache, Cache::Handle *handle) { + if (handle_ != NULL) Release(); + + cache_ = cache; + handle_ = handle; + } + + Cache::Handle *handle_; + Cache *cache_; +}; + + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/block_compression.cc b/src/kudu/cfile/block_compression.cc new file mode 100644 index 000000000000..61792b7e5d8d --- /dev/null +++ b/src/kudu/cfile/block_compression.cc @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/cfile/block_compression.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" + +namespace kudu { +namespace cfile { + +using std::vector; + +CompressedBlockBuilder::CompressedBlockBuilder(const CompressionCodec* codec, + size_t size_limit) + : codec_(DCHECK_NOTNULL(codec)), + compressed_size_limit_(size_limit) { +} + +Status CompressedBlockBuilder::Compress(const Slice& data, Slice *result) { + vector v; + v.push_back(data); + return Compress(v, result); +} + +Status CompressedBlockBuilder::Compress(const vector &data_slices, Slice *result) { + size_t data_size = 0; + for (const Slice& data : data_slices) { + data_size += data.size(); + } + + // Ensure that the buffer for header + compressed data is large enough + size_t max_compressed_size = codec_->MaxCompressedLength(data_size); + if (max_compressed_size > compressed_size_limit_) { + return Status::InvalidArgument( + StringPrintf("estimated max size %lu is greater than the expected %lu", + max_compressed_size, compressed_size_limit_)); + } + + buffer_.resize(kHeaderReservedLength + max_compressed_size); + + // Compress + size_t compressed_size; + RETURN_NOT_OK(codec_->Compress(data_slices, + buffer_.data() + kHeaderReservedLength, &compressed_size)); + + // Set up the header + InlineEncodeFixed32(&buffer_[0], compressed_size); + InlineEncodeFixed32(&buffer_[4], data_size); + *result = Slice(buffer_.data(), compressed_size + kHeaderReservedLength); + + return Status::OK(); +} + +CompressedBlockDecoder::CompressedBlockDecoder(const CompressionCodec* codec, + size_t size_limit) + : codec_(DCHECK_NOTNULL(codec)), + uncompressed_size_limit_(size_limit) { +} + +Status CompressedBlockDecoder::ValidateHeader(const Slice& data, uint32_t *uncompressed_size) { + // Check if the on-disk size is correct. 
+ if (data.size() < CompressedBlockBuilder::kHeaderReservedLength) { + return Status::Corruption( + StringPrintf("data size %lu is not enough to contains the header. " + "required %lu, buffer", + data.size(), CompressedBlockBuilder::kHeaderReservedLength), + data.ToDebugString(50)); + } + + // Decode the header + uint32_t compressed_size = DecodeFixed32(data.data()); + *uncompressed_size = DecodeFixed32(data.data() + 4); + + // Check if the on-disk data size matches with the buffer + if (data.size() != (CompressedBlockBuilder::kHeaderReservedLength + compressed_size)) { + return Status::Corruption( + StringPrintf("compressed size %u does not match remaining length in buffer %lu, buffer", + compressed_size, data.size() - CompressedBlockBuilder::kHeaderReservedLength), + data.ToDebugString(50)); + } + + // Check if uncompressed size seems to be reasonable + if (*uncompressed_size > uncompressed_size_limit_) { + return Status::Corruption( + StringPrintf("uncompressed size %u overflows the maximum length %lu, buffer", + compressed_size, uncompressed_size_limit_), data.ToDebugString(50)); + } + + return Status::OK(); +} + +Status CompressedBlockDecoder::UncompressIntoBuffer(const Slice& data, uint8_t* dst, + uint32_t uncompressed_size) { + Slice compressed = data; + compressed.remove_prefix(CompressedBlockBuilder::kHeaderReservedLength); + RETURN_NOT_OK(codec_->Uncompress(compressed, dst, uncompressed_size)); + + return Status::OK(); +} + +Status CompressedBlockDecoder::Uncompress(const Slice& data, Slice *result) { + // Decode the header + uint32_t uncompressed_size; + RETURN_NOT_OK(ValidateHeader(data, &uncompressed_size)); + + // Allocate the buffer for the uncompressed data and uncompress + ::gscoped_array buffer(new uint8_t[uncompressed_size]); + RETURN_NOT_OK(UncompressIntoBuffer(data, buffer.get(), uncompressed_size)); + *result = Slice(buffer.release(), uncompressed_size); + + return Status::OK(); +} + +} // namespace cfile +} // namespace kudu diff --git 
a/src/kudu/cfile/block_compression.h b/src/kudu/cfile/block_compression.h new file mode 100644 index 000000000000..14c7bff809ba --- /dev/null +++ b/src/kudu/cfile/block_compression.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_BLOCK_COMPRESSION_H +#define KUDU_CFILE_BLOCK_COMPRESSION_H + +#include +#include + +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/compression_codec.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace cfile { + +class CompressedBlockBuilder { + public: + // 'codec' is expected to remain alive for the lifetime of this object. + CompressedBlockBuilder(const CompressionCodec* codec, size_t size_limit); + + // Sets "*result" to the compressed version of the "data". + // The data inside the result is owned by the CompressedBlockBuilder class + // and valid until the class is destructed or until Compress() is called again. + // + // If an error was encountered, returns a non-OK status. 
+ Status Compress(const Slice& data, Slice *result); + Status Compress(const std::vector &data_slices, Slice *result); + + // header includes a 32-bit compressed length, 32-bit uncompressed length + static const size_t kHeaderReservedLength = (2 * sizeof(uint32_t)); + + private: + DISALLOW_COPY_AND_ASSIGN(CompressedBlockBuilder); + const CompressionCodec* codec_; + faststring buffer_; + size_t compressed_size_limit_; +}; + +class CompressedBlockDecoder { + public: + // 'codec' is expected to remain alive for the lifetime of this object. + CompressedBlockDecoder(const CompressionCodec* codec, size_t size_limit); + + // Sets "*result" to the uncompressed version of the "data". + // It is the caller's responsibility to free the result data. + // + // If an error was encountered, returns a non-OK status. + Status Uncompress(const Slice& data, Slice *result); + + // Validates the header in the data block 'data'. + // Sets '*uncompressed_size' to the uncompressed size of the data block + // (i.e. the size of buffer that's required for a later call for UncompressIntoBuffer()). + // + // Returns Corruption if the data block header indicates a compressed size + // that is different than the amount of remaining data in the block, or if the + // uncompressed size is greater than the 'size_limit' provided in this class's constructor. + // + // In the case that this doesn't return OK, the output parameter may still + // be modified. + Status ValidateHeader(const Slice& data, uint32_t *uncompressed_size); + + // Uncompress into the provided 'dst' buffer, which must be at least as + // large as 'uncompressed_size'. It's assumed that this length has already + // been determined by calling Uncompress_Validate(). 
+ Status UncompressIntoBuffer(const Slice& data, uint8_t* dst, + uint32_t uncompressed_size); + private: + DISALLOW_COPY_AND_ASSIGN(CompressedBlockDecoder); + const CompressionCodec* codec_; + size_t uncompressed_size_limit_; +}; + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/block_encodings.h b/src/kudu/cfile/block_encodings.h new file mode 100644 index 000000000000..13c41047f426 --- /dev/null +++ b/src/kudu/cfile/block_encodings.h @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_BLOCK_ENCODINGS_H +#define KUDU_CFILE_BLOCK_ENCODINGS_H + +#include + +#include + +#include "kudu/common/rowid.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +class ColumnDataView; + +namespace cfile { +class CFileWriter; + +class BlockBuilder { + public: + BlockBuilder() { } + + // Append extra information to the end of the current cfile, for example: + // append the dictionary block for under dictionary encoding mode. 
+ virtual Status AppendExtraInfo(CFileWriter *c_writer, CFileFooterPB* footer) { + return Status::OK(); + } + + // Used by the cfile writer to determine whether the current block is full. + // If it is full, the cfile writer will call FinishCurDataBlock(). + virtual bool IsBlockFull(size_t limit) const = 0; + + // Add a sequence of values to the block. + // Returns the number of values actually added, which may be less + // than requested if the block is full. + virtual int Add(const uint8_t *vals, size_t count) = 0; + + // Return a Slice which represents the encoded data. + // + // This Slice points to internal data of this class + // and becomes invalid after the builder is destroyed + // or after Finish() is called again. + virtual Slice Finish(rowid_t ordinal_pos) = 0; + + // Reset the internal state of the encoder. + // + // Any data previously returned by Finish or by GetFirstKey + // may be invalidated by this call. + // + // Postcondition: Count() == 0 + virtual void Reset() = 0; + + // Return the number of entries that have been added to the + // block. + virtual size_t Count() const = 0; + + // Return the key of the first entry in this index block. + // For pointer-based types (such as strings), the pointed-to + // data is only valid until the next call to Reset(). + // + // If no keys have been added, returns Status::NotFound + virtual Status GetFirstKey(void *key) const = 0; + + virtual ~BlockBuilder() {} + private: + DISALLOW_COPY_AND_ASSIGN(BlockBuilder); +}; + + +class BlockDecoder { + public: + BlockDecoder() { } + + virtual Status ParseHeader() = 0; + + // Seek the decoder to the given positional index of the block. + // For example, SeekToPositionInBlock(0) seeks to the first + // stored entry. + // + // It is an error to call this with a value larger than Count(). + // Doing so has undefined results. 
+ // + // TODO: Since we know the actual file position, maybe we + // should just take the actual ordinal in the file + // instead of the position in the block? + virtual void SeekToPositionInBlock(uint pos) = 0; + + // Seek the decoder to the given value in the block, or the + // lowest value which is greater than the given value. + // + // If the decoder was able to locate an exact match, then + // sets *exact_match to true. Otherwise sets *exact_match to + // false, to indicate that the seeked value is _after_ the + // requested value. + // + // If the given value is less than the lowest value in the block, + // seeks to the start of the block. If it is higher than the highest + // value in the block, then returns Status::NotFound + // + // This will only return valid results when the data block + // consists of values in sorted order. + virtual Status SeekAtOrAfterValue(const void *value, + bool *exact_match) = 0; + + // Fetch the next set of values from the block into 'dst'. + // The output block must have space for up to n cells. + // + // Modifies *n to contain the number of values fetched. + // + // In the case that the values are themselves references + // to other memory (eg Slices), the referred-to memory is + // allocated in the dst block's arena. + virtual Status CopyNextValues(size_t *n, ColumnDataView *dst) = 0; + + // Return true if there are more values remaining to be iterated. + // (i.e that the next call to CopyNextValues will return at least 1 + // element) + // TODO: change this to a Remaining() call? + virtual bool HasNext() const = 0; + + // Return the number of elements in this block. + virtual size_t Count() const = 0; + + // Return the position within the block of the currently seeked + // entry (ie the entry that will next be returned by CopyNextValues()) + virtual size_t GetCurrentIndex() const = 0; + + // Return the first rowid stored in this block. 
+ // TODO: get rid of this from the block decoder, and put it in a generic + // header which is shared by all data blocks. + virtual rowid_t GetFirstRowId() const = 0; + + virtual ~BlockDecoder() {} + private: + DISALLOW_COPY_AND_ASSIGN(BlockDecoder); +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/block_handle.h b/src/kudu/cfile/block_handle.h new file mode 100644 index 000000000000..0c34e41803e6 --- /dev/null +++ b/src/kudu/cfile/block_handle.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_BLOCK_HANDLE_H +#define KUDU_CFILE_BLOCK_HANDLE_H + +#include "kudu/cfile/block_cache.h" + +namespace kudu { + +namespace cfile { + +// When blocks are read, they are sometimes resident in the block cache, and sometimes skip the +// block cache. In the case that they came from the cache, we just need to dereference them when +// they stop being used. In the case that they didn't come from cache, we need to actually free +// the underlying data. 
+class BlockHandle { + MOVE_ONLY_TYPE_FOR_CPP_03(BlockHandle, RValue); + public: + static BlockHandle WithOwnedData(const Slice& data) { + return BlockHandle(data); + } + + static BlockHandle WithDataFromCache(BlockCacheHandle *handle) { + return BlockHandle(handle); + } + + // Constructor to use to Pass to. + BlockHandle() + : is_data_owner_(false) { } + + // Emulated Move constructor + BlockHandle(RValue other) { // NOLINT(runtime/explicit) + TakeState(other.object); + } + BlockHandle& operator=(RValue other) { + TakeState(other.object); + return *this; + } + + ~BlockHandle() { + if (is_data_owner_) { + delete [] data_.data(); + } + } + + const Slice &data() const { + if (is_data_owner_) { + return data_; + } else { + return dblk_data_.data(); + } + } + + private: + BlockCacheHandle dblk_data_; + Slice data_; + bool is_data_owner_; + + explicit BlockHandle(Slice data) + : data_(std::move(data)), + is_data_owner_(true) { + } + + explicit BlockHandle(BlockCacheHandle *dblk_data) + : is_data_owner_(false) { + dblk_data_.swap(dblk_data); + } + + void TakeState(BlockHandle* other) { + is_data_owner_ = other->is_data_owner_; + if (is_data_owner_) { + data_ = other->data_; + other->is_data_owner_ = false; + } else { + dblk_data_.swap(&other->dblk_data_); + } + } +}; + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/block_pointer.h b/src/kudu/cfile/block_pointer.h new file mode 100644 index 000000000000..670c9da1235e --- /dev/null +++ b/src/kudu/cfile/block_pointer.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_BLOCK_POINTER_H +#define KUDU_CFILE_BLOCK_POINTER_H + +#include +#include + +#include "kudu/cfile/cfile.pb.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/coding.h" +#include "kudu/util/status.h" + +namespace kudu { namespace cfile { + +using std::string; + +class BlockPointer { + public: + BlockPointer() {} + BlockPointer(const BlockPointer &from) : + offset_(from.offset_), + size_(from.size_) {} + + explicit BlockPointer(const BlockPointerPB &from) : + offset_(from.offset()), + size_(from.size()) { + } + + BlockPointer(uint64_t offset, uint64_t size) : + offset_(offset), + size_(size) {} + + string ToString() const { + return strings::Substitute("offset=$0 size=$1", offset_, size_); + } + + template + void EncodeTo(StrType *s) const { + PutVarint64(s, offset_); + InlinePutVarint32(s, size_); + } + + Status DecodeFrom(const uint8_t *data, const uint8_t *limit) { + data = GetVarint64Ptr(data, limit, &offset_); + if (!data) { + return Status::Corruption("bad block pointer"); + } + + data = GetVarint32Ptr(data, limit, &size_); + if (!data) { + return Status::Corruption("bad block pointer"); + } + + return Status::OK(); + } + + void CopyToPB(BlockPointerPB *pb) const { + pb->set_offset(offset_); + pb->set_size(size_); + } + + uint64_t offset() const { + return offset_; + } + + uint32_t size() const { + return size_; + } + + private: + uint64_t offset_; + uint32_t size_; +}; + + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/bloomfile-test-base.h 
b/src/kudu/cfile/bloomfile-test-base.h new file mode 100644 index 000000000000..61385b5bdf85 --- /dev/null +++ b/src/kudu/cfile/bloomfile-test-base.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_BLOOMFILE_TEST_BASE_H +#define KUDU_CFILE_BLOOMFILE_TEST_BASE_H + +#include +#include + +#include "kudu/cfile/bloomfile.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/endian.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DEFINE_int32(bloom_size_bytes, 4*1024, "Size of each bloom filter"); +DEFINE_int32(n_keys, 10*1000, "Number of keys to insert into the file"); +DEFINE_double(fp_rate, 0.01f, "False positive rate to aim for"); + +DEFINE_int64(benchmark_queries, 1000000, "Number of probes to benchmark"); +DEFINE_bool(benchmark_should_hit, false, "Set to true for the benchmark to query rows which match"); + +namespace kudu { +namespace cfile { + +using fs::ReadableBlock; +using fs::WritableBlock; + +static const int kKeyShift = 2; + +class BloomFileTestBase : public KuduTest { + public: + void SetUp() OVERRIDE { 
+ KuduTest::SetUp(); + + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + } + + void AppendBlooms(BloomFileWriter *bfw) { + uint64_t key_buf; + Slice key_slice(reinterpret_cast(&key_buf), + sizeof(key_buf)); + + for (uint64_t i = 0; i < FLAGS_n_keys; i++) { + // Shift the key left a bit so that while querying, we can + // get a good mix of hits and misses while still staying within + // the real key range. + key_buf = BigEndian::FromHost64(i << kKeyShift); + ASSERT_OK_FAST(bfw->AppendKeys(&key_slice, 1)); + } + } + + void WriteTestBloomFile() { + gscoped_ptr sink; + ASSERT_OK(fs_manager_->CreateNewBlock(&sink)); + block_id_ = sink->id(); + + // Set sizing based on flags + BloomFilterSizing sizing = BloomFilterSizing::BySizeAndFPRate( + FLAGS_bloom_size_bytes, FLAGS_fp_rate); + ASSERT_NEAR(sizing.n_bytes(), FLAGS_bloom_size_bytes, FLAGS_bloom_size_bytes * 0.05); + ASSERT_GT(FLAGS_n_keys, sizing.expected_count()) + << "Invalid parameters: --n_keys isn't set large enough to fill even " + << "one bloom filter of the requested --bloom_size_bytes"; + + BloomFileWriter bfw(sink.Pass(), sizing); + + ASSERT_OK(bfw.Start()); + AppendBlooms(&bfw); + ASSERT_OK(bfw.Finish()); + } + + Status OpenBloomFile() { + gscoped_ptr source; + RETURN_NOT_OK(fs_manager_->OpenBlock(block_id_, &source)); + + return BloomFileReader::Open(source.Pass(), ReaderOptions(), &bfr_); + } + + uint64_t ReadBenchmark() { + Random rng(GetRandomSeed32()); + uint64_t count_present = 0; + LOG_TIMING(INFO, strings::Substitute("Running $0 queries", FLAGS_benchmark_queries)) { + + for (uint64_t i = 0; i < FLAGS_benchmark_queries; i++) { + uint64_t key = rng.Uniform(FLAGS_n_keys); + key <<= kKeyShift; + if (!FLAGS_benchmark_should_hit) { + // Since the keys are bitshifted, setting the last bit + // ensures that none of the queries will match. 
+ key |= 1; + } + + key = BigEndian::FromHost64(key); + + Slice s(reinterpret_cast(&key), sizeof(key)); + bool present; + CHECK_OK(bfr_->CheckKeyPresent(BloomKeyProbe(s), &present)); + if (present) count_present++; + } + } + return count_present; + } + + protected: + gscoped_ptr fs_manager_; + gscoped_ptr bfr_; + BlockId block_id_; +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/bloomfile-test.cc b/src/kudu/cfile/bloomfile-test.cc new file mode 100644 index 000000000000..185a174b729f --- /dev/null +++ b/src/kudu/cfile/bloomfile-test.cc @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/cfile/bloomfile-test-base.h" +#include "kudu/fs/fs-test-util.h" + +using std::shared_ptr; + +namespace kudu { +namespace cfile { + +using fs::CountingReadableBlock; + +class BloomFileTest : public BloomFileTestBase { + + protected: + void VerifyBloomFile() { + // Verify all the keys that we inserted probe as present. 
+ for (uint64_t i = 0; i < FLAGS_n_keys; i++) { + uint64_t i_byteswapped = BigEndian::FromHost64(i << kKeyShift); + Slice s(reinterpret_cast(&i_byteswapped), sizeof(i)); + + bool present = false; + ASSERT_OK_FAST(bfr_->CheckKeyPresent(BloomKeyProbe(s), &present)); + ASSERT_TRUE(present); + } + + int positive_count = 0; + // Check that the FP rate for keys we didn't insert is what we expect. + for (uint64 i = 0; i < FLAGS_n_keys; i++) { + uint64_t key = random(); + Slice s(reinterpret_cast(&key), sizeof(key)); + + bool present = false; + ASSERT_OK_FAST(bfr_->CheckKeyPresent(BloomKeyProbe(s), &present)); + if (present) { + positive_count++; + } + } + + double fp_rate = static_cast(positive_count) / FLAGS_n_keys; + LOG(INFO) << "fp_rate: " << fp_rate << "(" << positive_count << "/" << FLAGS_n_keys << ")"; + ASSERT_LT(fp_rate, FLAGS_fp_rate + FLAGS_fp_rate * 0.20f) + << "Should be no more than 1.2x the expected FP rate"; + } +}; + + +TEST_F(BloomFileTest, TestWriteAndRead) { + ASSERT_NO_FATAL_FAILURE(WriteTestBloomFile()); + ASSERT_OK(OpenBloomFile()); + VerifyBloomFile(); +} + +#ifdef NDEBUG +TEST_F(BloomFileTest, Benchmark) { + ASSERT_NO_FATAL_FAILURE(WriteTestBloomFile()); + ASSERT_OK(OpenBloomFile()); + + uint64_t count_present = ReadBenchmark(); + + double hit_rate = static_cast(count_present) / + static_cast(FLAGS_benchmark_queries); + LOG(INFO) << "Hit Rate: " << hit_rate << + "(" << count_present << "/" << FLAGS_benchmark_queries << ")"; + + if (FLAGS_benchmark_should_hit) { + ASSERT_EQ(count_present, FLAGS_benchmark_queries); + } else { + ASSERT_LT(hit_rate, FLAGS_fp_rate + FLAGS_fp_rate * 0.20f) + << "Should be no more than 1.2x the expected FP rate"; + } +} +#endif + +TEST_F(BloomFileTest, TestLazyInit) { + ASSERT_NO_FATAL_FAILURE(WriteTestBloomFile()); + + shared_ptr tracker = MemTracker::CreateTracker(-1, "test"); + int64_t initial_mem_usage = tracker->consumption(); + + // Open the bloom file using a "counting" readable block. 
+ gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(block_id_, &block)); + size_t bytes_read = 0; + gscoped_ptr count_block( + new CountingReadableBlock(block.Pass(), &bytes_read)); + + // Lazily opening the bloom file should not trigger any reads. + gscoped_ptr reader; + ReaderOptions opts; + opts.parent_mem_tracker = tracker; + ASSERT_OK(BloomFileReader::OpenNoInit(count_block.Pass(), opts, &reader)); + ASSERT_EQ(0, bytes_read); + int64_t lazy_mem_usage = tracker->consumption(); + ASSERT_GT(lazy_mem_usage, initial_mem_usage); + + // But initializing it should (only the first time), and the bloom's + // memory usage should increase. + ASSERT_OK(reader->Init()); + ASSERT_GT(bytes_read, 0); + size_t bytes_read_after_init = bytes_read; + ASSERT_OK(reader->Init()); + ASSERT_EQ(bytes_read_after_init, bytes_read); + ASSERT_GT(tracker->consumption(), lazy_mem_usage); + + // And let's test non-lazy open for good measure; it should yield the + // same number of bytes read. + ASSERT_OK(fs_manager_->OpenBlock(block_id_, &block)); + bytes_read = 0; + count_block.reset(new CountingReadableBlock(block.Pass(), &bytes_read)); + ASSERT_OK(BloomFileReader::Open(count_block.Pass(), ReaderOptions(), &reader)); + ASSERT_EQ(bytes_read_after_init, bytes_read); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/bloomfile.cc b/src/kudu/cfile/bloomfile.cc new file mode 100644 index 000000000000..b3059bcb592b --- /dev/null +++ b/src/kudu/cfile/bloomfile.cc @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "kudu/cfile/bloomfile.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/util/coding.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/malloc.h" +#include "kudu/util/pb_util.h" + +DECLARE_bool(cfile_lazy_open); + +namespace kudu { +namespace cfile { + +using fs::ReadableBlock; +using fs::ScopedWritableBlockCloser; +using fs::WritableBlock; + +//////////////////////////////////////////////////////////// +// Writer +//////////////////////////////////////////////////////////// + +BloomFileWriter::BloomFileWriter(gscoped_ptr block, + const BloomFilterSizing &sizing) + : bloom_builder_(sizing) { + cfile::WriterOptions opts; + opts.write_posidx = false; + opts.write_validx = true; + // Never use compression, regardless of the default settings, since + // bloom filters are high-entropy data structures by their nature. 
+ opts.storage_attributes.encoding = PLAIN_ENCODING; + opts.storage_attributes.compression = NO_COMPRESSION; + writer_.reset(new cfile::CFileWriter(opts, GetTypeInfo(BINARY), false, block.Pass())); +} + +Status BloomFileWriter::Start() { + return writer_->Start(); +} + +Status BloomFileWriter::Finish() { + ScopedWritableBlockCloser closer; + RETURN_NOT_OK(FinishAndReleaseBlock(&closer)); + return closer.CloseBlocks(); +} + +Status BloomFileWriter::FinishAndReleaseBlock(ScopedWritableBlockCloser* closer) { + if (bloom_builder_.count() > 0) { + RETURN_NOT_OK(FinishCurrentBloomBlock()); + } + return writer_->FinishAndReleaseBlock(closer); +} + +size_t BloomFileWriter::written_size() const { + return writer_->written_size(); +} + +Status BloomFileWriter::AppendKeys( + const Slice *keys, size_t n_keys) { + + // If this is the call on a new bloom, copy the first key. + if (bloom_builder_.count() == 0 && n_keys > 0) { + first_key_.assign_copy(keys[0].data(), keys[0].size()); + } + + for (size_t i = 0; i < n_keys; i++) { + + bloom_builder_.AddKey(BloomKeyProbe(keys[i])); + + // Bloom has reached optimal occupancy: flush it to the file + if (PREDICT_FALSE(bloom_builder_.count() >= bloom_builder_.expected_count())) { + RETURN_NOT_OK(FinishCurrentBloomBlock()); + + // Copy the next key as the first key of the next block. + // Doing this here avoids having to do it in normal code path of the loop. + if (i < n_keys - 1) { + first_key_.assign_copy(keys[i + 1].data(), keys[i + 1].size()); + } + } + } + + return Status::OK(); +} + +Status BloomFileWriter::FinishCurrentBloomBlock() { + VLOG(1) << "Appending a new bloom block, first_key=" << Slice(first_key_).ToDebugString(); + + // Encode the header. + BloomBlockHeaderPB hdr; + hdr.set_num_hash_functions(bloom_builder_.n_hashes()); + faststring hdr_str; + PutFixed32(&hdr_str, hdr.ByteSize()); + CHECK(pb_util::AppendToString(hdr, &hdr_str)); + + // The data is the concatenation of the header and the bloom itself. 
+ vector slices; + slices.push_back(Slice(hdr_str)); + slices.push_back(bloom_builder_.slice()); + + // Append to the file. + Slice start_key(first_key_); + RETURN_NOT_OK(writer_->AppendRawBlock(slices, 0, &start_key, "bloom block")); + + bloom_builder_.Clear(); + + #ifndef NDEBUG + first_key_.assign_copy("POST_RESET"); + #endif + + return Status::OK(); +} + +//////////////////////////////////////////////////////////// +// Reader +//////////////////////////////////////////////////////////// + +Status BloomFileReader::Open(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr *reader) { + gscoped_ptr bf_reader; + RETURN_NOT_OK(OpenNoInit(block.Pass(), options, &bf_reader)); + RETURN_NOT_OK(bf_reader->Init()); + + *reader = bf_reader.Pass(); + return Status::OK(); +} + +Status BloomFileReader::OpenNoInit(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr *reader) { + gscoped_ptr cf_reader; + RETURN_NOT_OK(CFileReader::OpenNoInit(block.Pass(), options, &cf_reader)); + gscoped_ptr bf_reader(new BloomFileReader( + cf_reader.Pass(), options)); + if (!FLAGS_cfile_lazy_open) { + RETURN_NOT_OK(bf_reader->Init()); + } + + *reader = bf_reader.Pass(); + return Status::OK(); +} + +BloomFileReader::BloomFileReader(gscoped_ptr reader, + const ReaderOptions& options) + : reader_(reader.Pass()), + mem_consumption_(options.parent_mem_tracker, + memory_footprint_excluding_reader()) { +} + +Status BloomFileReader::Init() { + return init_once_.Init(&BloomFileReader::InitOnce, this); +} + +Status BloomFileReader::InitOnce() { + // Fully open the CFileReader if it was lazily opened earlier. + // + // If it's already initialized, this is a no-op. 
+ RETURN_NOT_OK(reader_->Init()); + + if (reader_->is_compressed()) { + return Status::Corruption("bloom file is compressed (compression not supported)", + reader_->ToString()); + } + if (!reader_->has_validx()) { + return Status::Corruption("bloom file missing value index", + reader_->ToString()); + } + + BlockPointer validx_root = reader_->validx_root(); + + // Ugly hack: create a per-cpu iterator. + // Instead this should be threadlocal, or allow us to just + // stack-allocate these things more smartly! + int n_cpus = base::MaxCPUIndex() + 1; + for (int i = 0; i < n_cpus; i++) { + index_iters_.push_back( + IndexTreeIterator::Create(reader_.get(), validx_root)); + } + iter_locks_.reset(new padded_spinlock[n_cpus]); + + // The memory footprint has changed. + mem_consumption_.Reset(memory_footprint_excluding_reader()); + + return Status::OK(); +} + +Status BloomFileReader::ParseBlockHeader(const Slice &block, + BloomBlockHeaderPB *hdr, + Slice *bloom_data) const { + Slice data(block); + if (PREDICT_FALSE(data.size() < 4)) { + return Status::Corruption("Invalid bloom block header: not enough bytes"); + } + + uint32_t header_len = DecodeFixed32(data.data()); + data.remove_prefix(sizeof(header_len)); + + if (header_len > data.size()) { + return Status::Corruption( + StringPrintf("Header length %d doesn't fit in buffer of size %ld", + header_len, data.size())); + } + + if (!hdr->ParseFromArray(data.data(), header_len)) { + return Status::Corruption( + string("Invalid bloom block header: ") + + hdr->InitializationErrorString() + + "\nHeader:" + HexDump(Slice(data.data(), header_len))); + } + + data.remove_prefix(header_len); + *bloom_data = data; + return Status::OK(); +} + +Status BloomFileReader::CheckKeyPresent(const BloomKeyProbe &probe, + bool *maybe_present) { + DCHECK(init_once_.initted()); + +#if defined(__linux__) + int cpu = sched_getcpu(); +#else + // Use just one lock if on OS X. 
+ int cpu = 0; +#endif + BlockPointer bblk_ptr; + { + std::unique_lock lock; + while (true) { + std::unique_lock l(iter_locks_[cpu], std::try_to_lock); + if (l.owns_lock()) { + lock.swap(l); + break; + } + cpu = (cpu + 1) % index_iters_.size(); + } + + cfile::IndexTreeIterator *index_iter = &index_iters_[cpu]; + + Status s = index_iter->SeekAtOrBefore(probe.key()); + if (PREDICT_FALSE(s.IsNotFound())) { + // Seek to before the first entry in the file. + *maybe_present = false; + return Status::OK(); + } + RETURN_NOT_OK(s); + + // Successfully found the pointer to the bloom block. Read it. + bblk_ptr = index_iter->GetCurrentBlockPointer(); + } + + BlockHandle dblk_data; + RETURN_NOT_OK(reader_->ReadBlock(bblk_ptr, CFileReader::CACHE_BLOCK, &dblk_data)); + + // Parse the header in the block. + BloomBlockHeaderPB hdr; + Slice bloom_data; + RETURN_NOT_OK(ParseBlockHeader(dblk_data.data(), &hdr, &bloom_data)); + + // Actually check the bloom filter. + BloomFilter bf(bloom_data, hdr.num_hash_functions()); + *maybe_present = bf.MayContainKey(probe); + return Status::OK(); +} + +size_t BloomFileReader::memory_footprint_excluding_reader() const { + size_t size = kudu_malloc_usable_size(this); + + size += init_once_.memory_footprint_excluding_this(); + + // This seems to be the easiest way to get a heap pointer to the ptr_vector. + // + // TODO: Track the iterators' memory footprint? May change with every seek; + // not clear if it's worth doing. 
+ size += kudu_malloc_usable_size( + const_cast(this)->index_iters_.c_array()); + for (int i = 0; i < index_iters_.size(); i++) { + size += kudu_malloc_usable_size(&index_iters_[i]); + } + + if (iter_locks_) { + size += kudu_malloc_usable_size(iter_locks_.get()); + } + + return size; +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/bloomfile.h b/src/kudu/cfile/bloomfile.h new file mode 100644 index 000000000000..540ad55a83af --- /dev/null +++ b/src/kudu/cfile/bloomfile.h @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CFILE_BLOOMFILE_H +#define KUDU_CFILE_BLOOMFILE_H + +#include +#include +#include + +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/bloom_filter.h" +#include "kudu/util/faststring.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/once.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace cfile { + +class BloomFileWriter { + public: + BloomFileWriter(gscoped_ptr block, + const BloomFilterSizing &sizing); + + Status Start(); + Status AppendKeys(const Slice *keys, size_t n_keys); + + // Close the bloom's CFile, closing the underlying writable block. + Status Finish(); + + // Close the bloom's CFile, releasing the underlying block to 'closer'. + Status FinishAndReleaseBlock(fs::ScopedWritableBlockCloser* closer); + + // Estimate the amount of data already written to this file. + size_t written_size() const; + + private: + DISALLOW_COPY_AND_ASSIGN(BloomFileWriter); + + Status FinishCurrentBloomBlock(); + + gscoped_ptr writer_; + + BloomFilterBuilder bloom_builder_; + + // first key inserted in the current block. + faststring first_key_; +}; + +// Reader for a bloom file. +// NB: this is not currently thread-safe. +// When making it thread-safe, should make sure that the threads +// share a single CFileReader, or else the cache keys won't end up +// shared! +class BloomFileReader { + public: + + // Fully open a bloom file using a previously opened block. + // + // After this call, the bloom reader is safe for use. + static Status Open(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr *reader); + + // Lazily opens a bloom file using a previously opened block. A lazy open + // does not incur additional I/O, nor does it validate the contents of + // the bloom file. + // + // Init() must be called before using CheckKeyPresent(). 
+ static Status OpenNoInit(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr *reader); + + // Fully opens a previously lazily opened bloom file, parsing and + // validating its contents. + // + // May be called multiple times; subsequent calls will no-op. + Status Init(); + + // Check if the given key may be present in the file. + // + // Sets *maybe_present to false if the key is definitely not + // present, otherwise sets it to true to indicate maybe present. + Status CheckKeyPresent(const BloomKeyProbe &probe, + bool *maybe_present); + + private: + DISALLOW_COPY_AND_ASSIGN(BloomFileReader); + + BloomFileReader(gscoped_ptr reader, const ReaderOptions& options); + + // Parse the header present in the given block. + // + // Returns the parsed header inside *hdr, and returns + // a Slice to the true bloom filter data inside + // *bloom_data. + Status ParseBlockHeader(const Slice &block, + BloomBlockHeaderPB *hdr, + Slice *bloom_data) const; + + // Callback used in 'init_once_' to initialize this bloom file. + Status InitOnce(); + + // Returns the memory usage of this object including the object itself but + // excluding the CFileReader, which is tracked independently. + size_t memory_footprint_excluding_reader() const; + + gscoped_ptr reader_; + + // TODO: temporary workaround for the fact that + // the index tree iterator is a member of the Reader object. + // We need a big per-thread object which gets passed around so as + // to avoid this... Instead we'll use a per-CPU iterator as a + // lame hack. 
+ boost::ptr_vector index_iters_; + gscoped_ptr iter_locks_; + + KuduOnceDynamic init_once_; + + ScopedTrackedConsumption mem_consumption_; +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/bshuf_block.cc b/src/kudu/cfile/bshuf_block.cc new file mode 100644 index 000000000000..019d141726f2 --- /dev/null +++ b/src/kudu/cfile/bshuf_block.cc @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/cfile/bshuf_block.h" + +namespace kudu { +namespace cfile { + +void AbortWithBitShuffleError(int64_t val) { + switch (val) { + case -1: + LOG(FATAL) << "Failed to allocate memory"; + break; + case -11: + LOG(FATAL) << "Missing SSE"; + break; + case -12: + LOG(FATAL) << "Missing AVX"; + break; + case -80: + LOG(FATAL) << "Input size not a multiple of 8"; + break; + case -81: + LOG(FATAL) << "block_size not multiple of 8"; + break; + case -91: + LOG(FATAL) << "Decompression error, wrong number of bytes processed"; + break; + default: + LOG(FATAL) << "Error internal to compression routine"; + } +} + +// Template specialization for UINT32, which is used by dictionary encoding. 
+// It dynamically switch to block of UINT16 or UINT8 depending on the values +// in the current block. +template<> +Slice BShufBlockBuilder::Finish(rowid_t ordinal_pos) { + uint32_t max_value = 0; + for (int i = 0; i < count_; i++) { + uint32_t value = *reinterpret_cast(&data_[i * sizeof(uint32_t)]); + max_value = (max_value < value)? value : max_value; + } + + // Shrink the block of UINT32 to block of UINT8 or UINT16 whenever possible and + // set the header information accordingly, so that the decoder can recover the + // encoded data. + Slice ret; + if (max_value < 256) { + for (int i = 0; i < count_; i++) { + uint32_t value = *reinterpret_cast(&data_[i * sizeof(uint32_t)]); + uint8_t converted_value = (uint8_t)(value); + *reinterpret_cast(&data_[i * sizeof(uint8_t)]) = converted_value; + } + ret = Finish(ordinal_pos, sizeof(uint8_t)); + InlineEncodeFixed32(ret.mutable_data() + 16, sizeof(uint8_t)); + } else if (max_value < 65536) { + for (int i = 0; i < count_; i++) { + uint32_t value = *reinterpret_cast(&data_[i * sizeof(uint32_t)]); + uint16_t converted_value = (uint16_t)(value); + *reinterpret_cast(&data_[i * sizeof(uint16_t)]) = converted_value; + } + ret = Finish(ordinal_pos, sizeof(uint16_t)); + InlineEncodeFixed32(ret.mutable_data() + 16, sizeof(uint16_t)); + } else { + ret = Finish(ordinal_pos, sizeof(uint32_t)); + InlineEncodeFixed32(ret.mutable_data() + 16, sizeof(uint32_t)); + } + return ret; +} + +// Template specialization for UINT32, dynamically decoded blocks of +// bitshuffled UINT16 OR UINT8 to UINT32. +template<> +Status BShufBlockDecoder::Expand() { + if (num_elems_ > 0) { + int64_t bytes; + decoded_.resize(num_elems_after_padding_ * size_of_elem_); + uint8_t* in = const_cast(&data_[kHeaderSize]); + + bytes = bshuf_decompress_lz4(in, decoded_.data(), num_elems_after_padding_, size_of_elem_, 0); + if (PREDICT_FALSE(bytes < 0)) { + // Ideally, this should not happen. 
+ AbortWithBitShuffleError(bytes); + return Status::RuntimeError("Unshuffle Process failed"); + } + } + + return Status::OK(); +} + +// Template specialization for UINT32. +template<> +Status BShufBlockDecoder::SeekAtOrAfterValue(const void* value_void, bool* exact) { + uint32_t target = *reinterpret_cast(value_void); + int32_t left = 0; + int32_t right = num_elems_; + + while (left != right) { + uint32_t mid = (left + right) / 2; + uint32_t mid_key; + switch (size_of_elem_) { + case 1: { + mid_key = Decode(&decoded_[mid * size_of_elem_]); + break; + } + case 2: { + mid_key = Decode(&decoded_[mid * size_of_elem_]); + break; + } + case 4: { + mid_key = Decode(&decoded_[mid * size_of_elem_]); + break; + } + } + if (mid_key == target) { + cur_idx_ = mid; + *exact = true; + return Status::OK(); + } else if (mid_key > target) { + right = mid; + } else { + left = mid + 1; + } + } + + *exact = false; + cur_idx_ = left; + if (cur_idx_ == num_elems_) { + return Status::NotFound("after last key in block"); + } + return Status::OK(); +} + +// Template specialization for UINT32, expand blocks of UINT8 or UINT16 to UINT32. +template<> +Status BShufBlockDecoder::CopyNextValuesToArray(size_t* n, uint8_t* array) { + DCHECK(parsed_); + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + // First, copy it to the destination array without any "expansion". + size_t max_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + memcpy(array, &decoded_[cur_idx_ * size_of_elem_], max_fetch * size_of_elem_); + + *n = max_fetch; + cur_idx_ += max_fetch; + + // Then, "expand" it out to the correct output size. We only need to do + // the expansion for size = 1 or size = 2. 
+ if (size_of_elem_ == 1) { + for (int i = max_fetch - 1; i >= 0; i--) { + uint8_t value = *reinterpret_cast(&array[i * sizeof(uint8_t)]); + uint32_t convert_value = (uint32_t)(value); + *reinterpret_cast(&array[i * sizeof(uint32_t)]) = convert_value; + } + } else if (size_of_elem_ == 2) { + for (int i = max_fetch - 1; i >= 0; i--) { + uint16_t value = *reinterpret_cast(&array[i * sizeof(uint16_t)]); + uint32_t convert_value = (uint32_t)(value); + *reinterpret_cast(&array[i * sizeof(uint32_t)]) = convert_value; + } + } + + return Status::OK(); +} + + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/bshuf_block.h b/src/kudu/cfile/bshuf_block.h new file mode 100644 index 000000000000..1058eb0998f2 --- /dev/null +++ b/src/kudu/cfile/bshuf_block.h @@ -0,0 +1,376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Make use of bitshuffle and lz4 to encode the fixed size +// type blocks, such as UINT8, INT8, UINT16, INT16, +// UINT32, INT32, FLOAT, DOUBLE. 
+// Reference: +// https://github.com/kiyo-masui/bitshuffle.git +#ifndef KUDU_CFILE_BSHUF_BLOCK_H +#define KUDU_CFILE_BSHUF_BLOCK_H + +#include +#include +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/hexdump.h" + +namespace kudu { +namespace cfile { + + +// Log a FATAL error message and exit. +void AbortWithBitShuffleError(int64_t val) ATTRIBUTE_NORETURN; + +// BshufBlockBuilder bitshuffles and compresses the bits of fixed +// size type blocks with lz4. +// +// Header includes: +// 1. ordinal of the first element within the block (uint32_t, little endian). +// 2. num of element within the block (uint32_t, little endian). +// 3. compressed_size, including the header size (uint32_t, little endian). +// 4. number of element after padding, padding is needed to meet the requirement +// by bitshuffle library that the number of element in the block must be +// multiple of 8. That means some psudo elements are appended at the end of the +// block if necessary (uint32_t, little endian). +// 5. the size of the elements in bytes, as actually encoded. In the case that all of the +// data in a block can fit into a smaller integer type, then we may choose to encode +// that smaller type to save CPU costs. This is currently done only for the UINT32 +// block type. (uint32_t, little endian). 
+template +class BShufBlockBuilder : public BlockBuilder { + public: + explicit BShufBlockBuilder(const WriterOptions* options) + : count_(0), + options_(options) { + Reset(); + } + + void Reset() OVERRIDE { + count_ = 0; + data_.clear(); + data_.reserve(options_->storage_attributes.cfile_block_size); + buffer_.clear(); + buffer_.resize(kMaxHeaderSize); + } + + bool IsBlockFull(size_t limit) const OVERRIDE { + return EstimateEncodedSize() > limit; + } + + int Add(const uint8_t* vals_void, size_t count) OVERRIDE { + const CppType* vals = reinterpret_cast(vals_void); + int added = 0; + // If the current block is full, stop adding more items. + while (!IsBlockFull(options_->storage_attributes.cfile_block_size) && added < count) { + const uint8_t* ptr = reinterpret_cast(vals); + data_.append(ptr, size_of_type); + vals++; + added++; + count_++; + } + return added; + } + + size_t Count() const OVERRIDE { + return count_; + } + + Status GetFirstKey(void* key) const OVERRIDE { + if (count_ == 0) { + return Status::NotFound("no keys in data block"); + } + memcpy(key, &data_[0], size_of_type); + return Status::OK(); + } + + Slice Finish(rowid_t ordinal_pos) OVERRIDE { + return Finish(ordinal_pos, size_of_type); + } + + private: + size_t EstimateEncodedSize() const { + int num = count_ + NumOfPaddingNeeded(); + // The result of bshuf_compress_lz4_bound(num, size_of_type, 0) + // is always bigger than the original size (num * size_of_type). + // However, the compression ratio in most cases is larger than 1, + // Therefore, using the original size may be more accurate and + // cause less overhead. + return kMaxHeaderSize + num * size_of_type; + } + + uint32_t NumOfPaddingNeeded() const { + return (count_ % 8 == 0) ? 0 : 8 - (count_ % 8); + } + + Slice Finish(rowid_t ordinal_pos, int final_size_of_type) { + data_.resize(kMaxHeaderSize + final_size_of_type * count_); + + // Do padding so that the input num of element is multiple of 8. 
+ uint32_t num_of_padding = NumOfPaddingNeeded() * final_size_of_type; + for (int i = 0; i < num_of_padding; i++) { + data_.push_back(0); + } + + int num_elems_after_padding = count_ + NumOfPaddingNeeded(); + buffer_.resize(kMaxHeaderSize + + bshuf_compress_lz4_bound(num_elems_after_padding, final_size_of_type, 0)); + + InlineEncodeFixed32(&buffer_[0], ordinal_pos); + InlineEncodeFixed32(&buffer_[4], count_); + int64_t bytes = bshuf_compress_lz4(data_.data(), &buffer_[kMaxHeaderSize], + num_elems_after_padding, final_size_of_type, 0); + if (PREDICT_FALSE(bytes < 0)) { + // This means the bitshuffle function fails. + // Ideally, this should not happen. + AbortWithBitShuffleError(bytes); + // It does not matter what will be returned here, + // since we have logged fatal in AbortWithBitShuffleError(). + return Slice(); + } + InlineEncodeFixed32(&buffer_[8], kMaxHeaderSize + bytes); + InlineEncodeFixed32(&buffer_[12], num_elems_after_padding); + InlineEncodeFixed32(&buffer_[16], final_size_of_type); + return Slice(buffer_.data(), kMaxHeaderSize + bytes); + } + + // Length of a header. 
+ static const size_t kMaxHeaderSize = sizeof(uint32_t) * 5; + typedef typename TypeTraits::cpp_type CppType; + enum { + size_of_type = TypeTraits::size + }; + + faststring data_; + faststring buffer_; + uint32_t count_; + const WriterOptions* options_; +}; + +template<> +Slice BShufBlockBuilder::Finish(rowid_t ordinal_pos); + +template +class BShufBlockDecoder : public BlockDecoder { + public: + explicit BShufBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + ordinal_pos_base_(0), + num_elems_(0), + compressed_size_(0), + num_elems_after_padding_(0), + cur_idx_(0) { + } + + Status ParseHeader() OVERRIDE { + CHECK(!parsed_); + if (data_.size() < kHeaderSize) { + return Status::Corruption( + strings::Substitute("not enough bytes for header: bitshuffle block header " + "size ($0) less than expected header length ($1)", + data_.size(), kHeaderSize)); + } + + ordinal_pos_base_ = DecodeFixed32(&data_[0]); + num_elems_ = DecodeFixed32(&data_[4]); + compressed_size_ = DecodeFixed32(&data_[8]); + if (compressed_size_ != data_.size()) { + return Status::Corruption("Size Information unmatched"); + } + num_elems_after_padding_ = DecodeFixed32(&data_[12]); + if (num_elems_after_padding_ != num_elems_ + NumOfPaddingNeeded()) { + return Status::Corruption("num of element information corrupted"); + } + size_of_elem_ = DecodeFixed32(&data_[16]); + switch (size_of_elem_) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return Status::Corruption(strings::Substitute("invalid size_of_elem: $0", size_of_elem_)); + } + + // Currently, only the UINT32 block encoder supports expanding size: + if (PREDICT_FALSE(Type != UINT32 && size_of_elem_ != size_of_type)) { + return Status::Corruption(strings::Substitute("size_of_elem $0 != size_of_type $1", + size_of_elem_, size_of_type)); + } + if (PREDICT_FALSE(size_of_elem_ > size_of_type)) { + return Status::Corruption(strings::Substitute("size_of_elem $0 > size_of_type $1", + size_of_elem_, size_of_type)); 
+ } + + RETURN_NOT_OK(Expand()); + + parsed_ = true; + return Status::OK(); + } + + void SeekToPositionInBlock(uint pos) OVERRIDE { + CHECK(parsed_) << "Must call ParseHeader()"; + if (PREDICT_FALSE(num_elems_ == 0)) { + DCHECK_EQ(0, pos); + return; + } + + DCHECK_LE(pos, num_elems_); + cur_idx_ = pos; + } + + Status SeekAtOrAfterValue(const void* value_void, bool* exact) OVERRIDE { + CppType target = *reinterpret_cast(value_void); + int32_t left = 0; + int32_t right = num_elems_; + while (left != right) { + uint32_t mid = (left + right) / 2; + CppType mid_key = Decode( + &decoded_[mid * size_of_type]); + if (mid_key == target) { + cur_idx_ = mid; + *exact = true; + return Status::OK(); + } else if (mid_key > target) { + right = mid; + } else { + left = mid + 1; + } + } + + *exact = false; + cur_idx_ = left; + if (cur_idx_ == num_elems_) { + return Status::NotFound("after last key in block"); + } + return Status::OK(); + } + + Status CopyNextValues(size_t* n, ColumnDataView* dst) OVERRIDE { + DCHECK_EQ(dst->stride(), sizeof(CppType)); + return CopyNextValuesToArray(n, dst->data()); + } + + // Copy the codewords to a temporary buffer. + // This API provides a more convenient way for the dictionary decoder to copy out + // integer codewords and then look up the strings. If we use the CopyNextValuesToArray() + // instead of CopyNextValues(), we do not need to create ColumnDataView and ColumnBlock + // object to wrap around the uint8_t pointer. 
+ Status CopyNextValuesToArray(size_t* n, uint8_t* array) { + DCHECK(parsed_); + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t max_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + memcpy(array, &decoded_[cur_idx_ * size_of_type], max_fetch * size_of_type); + + *n = max_fetch; + cur_idx_ += max_fetch; + + return Status::OK(); + } + + size_t GetCurrentIndex() const OVERRIDE { + DCHECK(parsed_) << "must parse header first"; + return cur_idx_; + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + return ordinal_pos_base_; + } + + size_t Count() const OVERRIDE { + return num_elems_; + } + + bool HasNext() const OVERRIDE { + return (num_elems_ - cur_idx_) > 0; + } + + private: + template + static T Decode(const uint8_t* ptr) { + T result; + memcpy(&result, ptr, sizeof(result)); + return result; + } + + // Return the number of padding elements needed to ensure that the + // number of elements is a multiple of 8. + uint32_t NumOfPaddingNeeded() const { + return KUDU_ALIGN_UP(num_elems_, 8) - num_elems_; + } + + Status Expand() { + if (num_elems_ > 0) { + int64_t bytes; + decoded_.resize(num_elems_after_padding_ * size_of_type); + uint8_t* in = const_cast(&data_[kHeaderSize]); + bytes = bshuf_decompress_lz4(in, decoded_.data(), num_elems_after_padding_, size_of_type, 0); + if (PREDICT_FALSE(bytes < 0)) { + // Ideally, this should not happen. + AbortWithBitShuffleError(bytes); + return Status::RuntimeError("Unshuffle Process failed"); + } + } + return Status::OK(); + } + + // Min Length of a header. + static const size_t kHeaderSize = sizeof(uint32_t) * 5; + typedef typename TypeTraits::cpp_type CppType; + enum { + size_of_type = TypeTraits::size + }; + + Slice data_; + bool parsed_; + + rowid_t ordinal_pos_base_; + uint32_t num_elems_; + uint32_t compressed_size_; + uint32_t num_elems_after_padding_; + + // The size of each decoded element. 
In the case that the input range was + // smaller than the type, this may be smaller than 'size_of_type'. + // Currently, this is always 1, 2, 4, or 8. + int size_of_elem_; + + size_t cur_idx_; + faststring decoded_; +}; + +template<> +Status BShufBlockDecoder::Expand(); +template<> +Status BShufBlockDecoder::SeekAtOrAfterValue(const void* value_void, bool* exact); +template<> +Status BShufBlockDecoder::CopyNextValuesToArray(size_t* n, uint8_t* array); + + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/cfile-dump.cc b/src/kudu/cfile/cfile-dump.cc new file mode 100644 index 000000000000..ed33d1c6718b --- /dev/null +++ b/src/kudu/cfile/cfile-dump.cc @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/util/logging.h" +#include "kudu/util/flags.h" + +DEFINE_bool(print_meta, true, "print the header and footer from the file"); +DEFINE_bool(iterate_rows, true, "iterate each row in the file"); +DEFINE_bool(print_rows, true, "print each row in the file"); +DEFINE_int32(num_iterations, 1, "number of times to iterate the file"); + +namespace kudu { +namespace cfile { + +using std::string; +using std::cout; +using std::endl; + +Status DumpFile(const string& block_id_str) { + // Allow read-only access to live blocks. + FsManagerOpts fs_opts; + fs_opts.read_only = true; + FsManager fs_manager(Env::Default(), fs_opts); + RETURN_NOT_OK(fs_manager.Open()); + + uint64_t numeric_id; + CHECK(safe_strtou64_base(block_id_str, &numeric_id, 16)); + BlockId block_id(numeric_id); + gscoped_ptr block; + RETURN_NOT_OK(fs_manager.OpenBlock(block_id, &block)); + + gscoped_ptr reader; + RETURN_NOT_OK(CFileReader::Open(block.Pass(), ReaderOptions(), &reader)); + + if (FLAGS_print_meta) { + cout << "Header:\n" << reader->header().DebugString() << endl; + cout << "Footer:\n" << reader->footer().DebugString() << endl; + } + + if (FLAGS_iterate_rows) { + gscoped_ptr it; + RETURN_NOT_OK(reader->NewIterator(&it, CFileReader::DONT_CACHE_BLOCK)); + + DumpIteratorOptions opts; + opts.print_rows = FLAGS_print_rows; + for (int i = 0; i < FLAGS_num_iterations; i++) { + RETURN_NOT_OK(it->SeekToFirst()); + RETURN_NOT_OK(DumpIterator(*reader, it.get(), &cout, opts, 0)); + } + } + + return Status::OK(); +} + +} // namespace cfile +} // namespace kudu + +int main(int argc, char **argv) { + kudu::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + if (argc != 2) { + std::cerr << "usage: " << argv[0] + << " -fs_wal_dir -fs_data_dirs " << std::endl; + return 1; + } + + if (!FLAGS_iterate_rows) 
{ + FLAGS_print_rows = false; + } + + kudu::Status s = kudu::cfile::DumpFile(argv[1]); + if (!s.ok()) { + std::cerr << "Error: " << s.ToString() << std::endl; + return 1; + } + + return 0; +} diff --git a/src/kudu/cfile/cfile-test-base.h b/src/kudu/cfile/cfile-test-base.h new file mode 100644 index 000000000000..54199932c1ac --- /dev/null +++ b/src/kudu/cfile/cfile-test-base.h @@ -0,0 +1,500 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_TEST_BASE_H +#define KUDU_CFILE_TEST_BASE_H + +#include +#include +#include +#include + +#include "kudu/cfile/cfile-test-base.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/common/columnblock.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/status.h" + +DEFINE_int32(cfile_test_block_size, 1024, + "Block size to use for testing cfiles. 
" + "Default is low to stress code, but can be set higher for " + "performance testing"); + +using kudu::fs::ReadableBlock; +using kudu::fs::WritableBlock; + +namespace kudu { +namespace cfile { + +// Abstract test data generator. +// You must implement BuildTestValue() to return your test value. +// Usage example: +// StringDataGenerator datagen; +// datagen.Build(10); +// for (int i = 0; i < datagen.block_entries(); ++i) { +// bool is_null = BitmpTest(datagen.null_bitmap(), i); +// Slice& v = datagen[i]; +// } +template +class DataGenerator { + public: + static bool has_nulls() { + return HAS_NULLS; + } + + static const DataType kDataType; + + typedef typename DataTypeTraits::cpp_type cpp_type; + + DataGenerator() : + values_(NULL), + null_bitmap_(NULL), + block_entries_(0), + total_entries_(0) + {} + + void Reset() { + block_entries_ = 0; + total_entries_ = 0; + } + + void Build(size_t num_entries) { + Build(total_entries_, num_entries); + total_entries_ += num_entries; + } + + // Build "num_entries" using (offset + i) as value + // You can get the data values and the null bitmap using values() and null_bitmap() + // both are valid until the class is destructed or until Build() is called again. + void Build(size_t offset, size_t num_entries) { + Resize(num_entries); + + for (size_t i = 0; i < num_entries; ++i) { + if (HAS_NULLS) { + BitmapChange(null_bitmap_.get(), i, !TestValueShouldBeNull(offset + i)); + } + values_[i] = BuildTestValue(i, offset + i); + } + } + + virtual cpp_type BuildTestValue(size_t block_index, size_t value) = 0; + + bool TestValueShouldBeNull(size_t n) { + if (!HAS_NULLS) { + return false; + } + + // The NULL pattern alternates every 32 rows, cycling between: + // 32 NULL + // 32 alternating NULL/NOTNULL + // 32 NOT-NULL + // 32 alternating NULL/NOTNULL + // This is to ensure that we stress the run-length coding for + // NULL value. 
+ switch ((n >> 6) & 3) { + case 0: + return true; + case 1: + case 3: + return n & 1; + case 2: + return false; + default: + LOG(FATAL); + } + } + + virtual void Resize(size_t num_entries) { + if (block_entries_ >= num_entries) { + block_entries_ = num_entries; + return; + } + + values_.reset(new cpp_type[num_entries]); + null_bitmap_.reset(new uint8_t[BitmapSize(num_entries)]); + block_entries_ = num_entries; + } + + size_t block_entries() const { return block_entries_; } + size_t total_entries() const { return total_entries_; } + + const cpp_type *values() const { return values_.get(); } + const uint8_t *null_bitmap() const { return null_bitmap_.get(); } + + const cpp_type& operator[](size_t index) const { + return values_[index]; + } + + virtual ~DataGenerator() {} + + private: + gscoped_array values_; + gscoped_array null_bitmap_; + size_t block_entries_; + size_t total_entries_; +}; + +template +const DataType DataGenerator::kDataType = DATA_TYPE; + +template +class UInt8DataGenerator : public DataGenerator { + public: + UInt8DataGenerator() {} + uint8_t BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return (value * 10) % 256; + } +}; + +template +class Int8DataGenerator : public DataGenerator { + public: + Int8DataGenerator() {} + int8_t BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return ((value * 10) % 128) * (value % 2 == 0 ? -1 : 1); + } +}; + +template +class UInt16DataGenerator : public DataGenerator { + public: + UInt16DataGenerator() {} + uint16_t BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return (value * 10) % 65536; + } +}; + +template +class Int16DataGenerator : public DataGenerator { + public: + Int16DataGenerator() {} + int16_t BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return ((value * 10) % 32768) * (value % 2 == 0 ? 
-1 : 1); + } +}; + +template +class UInt32DataGenerator : public DataGenerator { + public: + UInt32DataGenerator() {} + uint32_t BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return value * 10; + } +}; + +template +class Int32DataGenerator : public DataGenerator { + public: + Int32DataGenerator() {} + int32_t BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return (value * 10) *(value % 2 == 0 ? -1 : 1); + } +}; + +// Floating-point data generator. +// This works for both floats and doubles. +template +class FPDataGenerator : public DataGenerator { + public: + typedef typename DataTypeTraits::cpp_type cpp_type; + + FPDataGenerator() {} + cpp_type BuildTestValue(size_t block_index, size_t value) OVERRIDE { + return static_cast(value) * 1.0001; + } +}; + +template +class StringDataGenerator : public DataGenerator { + public: + explicit StringDataGenerator(const char* format) + : format_(format) { + } + + Slice BuildTestValue(size_t block_index, size_t value) OVERRIDE { + char *buf = data_buffer_[block_index].data; + int len = snprintf(buf, kItemBufferSize - 1, format_, value); + DCHECK_LT(len, kItemBufferSize); + return Slice(buf, len); + } + + void Resize(size_t num_entries) OVERRIDE { + if (num_entries > this->block_entries()) { + data_buffer_.reset(new Buffer[num_entries]); + } + DataGenerator::Resize(num_entries); + } + + private: + static const int kItemBufferSize = 16; + + struct Buffer { + char data[kItemBufferSize]; + }; + + gscoped_array data_buffer_; + const char* format_; +}; + +// Class for generating strings that contain duplicate +template +class DuplicateStringDataGenerator : public DataGenerator { + public: + + // num specify number of possible unique strings that can be generated + explicit DuplicateStringDataGenerator(const char* format, int num) + : format_(format), + num_(num) { + } + + Slice BuildTestValue(size_t block_index, size_t value) OVERRIDE { + // random number from 0 ~ num_-1 + value = random() % num_; + char 
*buf = data_buffer_[block_index].data; + int len = snprintf(buf, kItemBufferSize - 1, format_, value); + DCHECK_LT(len, kItemBufferSize); + return Slice(buf, len); + } + + void Resize(size_t num_entries) OVERRIDE { + if (num_entries > this->block_entries()) { + data_buffer_.reset(new Buffer[num_entries]); + } + DataGenerator::Resize(num_entries); + } + + private: + static const int kItemBufferSize = 16; + + struct Buffer { + char data[kItemBufferSize]; + }; + + gscoped_array data_buffer_; + const char* format_; + int num_; +}; + +class CFileTestBase : public KuduTest { + public: + void SetUp() OVERRIDE { + KuduTest::SetUp(); + + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + } + + protected: + enum Flags { + NO_FLAGS = 0, + WRITE_VALIDX = 1, + SMALL_BLOCKSIZE = 1 << 1 + }; + + template + void WriteTestFile(DataGeneratorType* data_generator, + EncodingType encoding, + CompressionType compression, + int num_entries, + uint32_t flags, + BlockId* block_id) { + gscoped_ptr sink; + ASSERT_OK(fs_manager_->CreateNewBlock(&sink)); + *block_id = sink->id(); + WriterOptions opts; + opts.write_posidx = true; + + if (flags & WRITE_VALIDX) { + opts.write_validx = true; + } + if (flags & SMALL_BLOCKSIZE) { + // Use a smaller block size to exercise multi-level indexing. 
+ opts.storage_attributes.cfile_block_size = 1024; + } + + opts.storage_attributes.encoding = encoding; + opts.storage_attributes.compression = compression; + CFileWriter w(opts, GetTypeInfo(DataGeneratorType::kDataType), + DataGeneratorType::has_nulls(), sink.Pass()); + + ASSERT_OK(w.Start()); + + // Append given number of values to the test tree + const size_t kBufferSize = 8192; + size_t i = 0; + while (i < num_entries) { + int towrite = std::min(num_entries - i, kBufferSize); + + data_generator->Build(towrite); + DCHECK_EQ(towrite, data_generator->block_entries()); + + if (DataGeneratorType::has_nulls()) { + ASSERT_OK_FAST(w.AppendNullableEntries(data_generator->null_bitmap(), + data_generator->values(), + towrite)); + } else { + ASSERT_OK_FAST(w.AppendEntries(data_generator->values(), towrite)); + } + i += towrite; + } + + ASSERT_OK(w.Finish()); + } + + gscoped_ptr fs_manager_; + +}; + +// Fast unrolled summing of a vector. +// GCC's auto-vectorization doesn't work here, because there isn't +// enough guarantees on alignment and it can't seem to decode the +// constant stride. 
+template +SumType FastSum(const Indexable &data, size_t n) { + SumType sums[4] = {0, 0, 0, 0}; + int rem = n; + int i = 0; + while (rem >= 4) { + sums[0] += data[i]; + sums[1] += data[i+1]; + sums[2] += data[i+2]; + sums[3] += data[i+3]; + i += 4; + rem -= 4; + } + while (rem > 0) { + sums[3] += data[i++]; + rem--; + } + return sums[0] + sums[1] + sums[2] + sums[3]; +} + +template +static void TimeReadFileForDataType(gscoped_ptr &iter, int &count) { + ScopedColumnBlock cb(8192); + + SumType sum = 0; + while (iter->HasNext()) { + size_t n = cb.nrows(); + ASSERT_OK_FAST(iter->CopyNextValues(&n, &cb)); + sum += FastSum, SumType>(cb, n); + count += n; + cb.arena()->Reset(); + } + LOG(INFO)<< "Sum: " << sum; + LOG(INFO)<< "Count: " << count; +} + +template +static void ReadBinaryFile(CFileIterator* iter, int* count) { + ScopedColumnBlock cb(100); + uint64_t sum_lens = 0; + while (iter->HasNext()) { + size_t n = cb.nrows(); + ASSERT_OK_FAST(iter->CopyNextValues(&n, &cb)); + for (int i = 0; i < n; i++) { + sum_lens += cb[i].size(); + } + *count += n; + cb.arena()->Reset(); + } + LOG(INFO) << "Sum of value lengths: " << sum_lens; + LOG(INFO) << "Count: " << *count; +} + +static void TimeReadFile(FsManager* fs_manager, const BlockId& block_id, size_t *count_ret) { + Status s; + + gscoped_ptr source; + ASSERT_OK(fs_manager->OpenBlock(block_id, &source)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(source.Pass(), ReaderOptions(), &reader)); + + gscoped_ptr iter; + ASSERT_OK(reader->NewIterator(&iter, CFileReader::CACHE_BLOCK)); + ASSERT_OK(iter->SeekToOrdinal(0)); + + Arena arena(8192, 8*1024*1024); + int count = 0; + switch (reader->type_info()->physical_type()) { + case UINT8: + { + TimeReadFileForDataType(iter, count); + break; + } + case INT8: + { + TimeReadFileForDataType(iter, count); + break; + } + case UINT16: + { + TimeReadFileForDataType(iter, count); + break; + } + case INT16: + { + TimeReadFileForDataType(iter, count); + break; + } + case UINT32: + { + 
TimeReadFileForDataType(iter, count); + break; + } + case INT32: + { + TimeReadFileForDataType(iter, count); + break; + } + case FLOAT: + { + TimeReadFileForDataType(iter, count); + break; + } + case DOUBLE: + { + TimeReadFileForDataType(iter, count); + break; + } + case STRING: + { + ReadBinaryFile(iter.get(), &count); + break; + } + case BINARY: + { + ReadBinaryFile(iter.get(), &count); + break; + } + default: + FAIL() << "Unknown type: " << reader->type_info()->physical_type(); + } + *count_ret = count; +} + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/cfile-test.cc b/src/kudu/cfile/cfile-test.cc new file mode 100644 index 000000000000..87a6bef1aa45 --- /dev/null +++ b/src/kudu/cfile/cfile-test.cc @@ -0,0 +1,837 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/cfile/cfile-test-base.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/index_block.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/common/columnblock.h" +#include "kudu/fs/fs-test-util.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/stopwatch.h" + +DECLARE_string(block_cache_type); +DECLARE_string(cfile_do_on_finish); + +#if defined(__linux__) +DECLARE_string(nvm_cache_path); +DECLARE_bool(nvm_cache_simulate_allocation_failure); +#endif + +METRIC_DECLARE_counter(block_cache_hits_caching); + +METRIC_DECLARE_entity(server); + +using std::shared_ptr; + +namespace kudu { +namespace cfile { + +using fs::CountingReadableBlock; +using fs::ReadableBlock; +using fs::WritableBlock; + +class TestCFile : public CFileTestBase { + protected: + template + void TestReadWriteFixedSizeTypes(EncodingType encoding) { + BlockId block_id; + DataGeneratorType generator; + + WriteTestFile(&generator, encoding, NO_COMPRESSION, 10000, SMALL_BLOCKSIZE, &block_id); + + gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &block)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(block.Pass(), ReaderOptions(), &reader)); + + BlockPointer ptr; + + gscoped_ptr iter; + ASSERT_OK(reader->NewIterator(&iter, CFileReader::CACHE_BLOCK)); + + ASSERT_OK(iter->SeekToOrdinal(5000)); + ASSERT_EQ(5000u, iter->GetCurrentOrdinal()); + + // Seek to last key exactly, should succeed + ASSERT_OK(iter->SeekToOrdinal(9999)); + ASSERT_EQ(9999u, iter->GetCurrentOrdinal()); + + // Seek to after last key. Should result in not found. + ASSERT_TRUE(iter->SeekToOrdinal(10000).IsNotFound()); + + // Seek to start of file + ASSERT_OK(iter->SeekToOrdinal(0)); + ASSERT_EQ(0u, iter->GetCurrentOrdinal()); + + // Fetch all data. 
+ ScopedColumnBlock out(10000); + size_t n = 10000; + ASSERT_OK(iter->CopyNextValues(&n, &out)); + ASSERT_EQ(10000, n); + + DataGeneratorType data_generator_pre; + + for (int i = 0; i < 10000; i++) { + if (out[i] != data_generator_pre.BuildTestValue(0,i)) { + FAIL() << "mismatch at index " << i + << " expected: " << data_generator_pre.BuildTestValue(0,i) + << " got: " << out[i]; + } + out[i] = 0; + } + + // Fetch all data using small batches of only a few rows. + // This should catch edge conditions like a batch lining up exactly + // with the end of a block. + unsigned int seed = time(nullptr); + LOG(INFO) << "Using random seed: " << seed; + srand(seed); + ASSERT_OK(iter->SeekToOrdinal(0)); + size_t fetched = 0; + while (fetched < 10000) { + ColumnBlock advancing_block(out.type_info(), nullptr, + out.data() + (fetched * out.stride()), + out.nrows() - fetched, out.arena()); + ASSERT_TRUE(iter->HasNext()); + size_t batch_size = random() % 5 + 1; + size_t n = batch_size; + ASSERT_OK(iter->CopyNextValues(&n, &advancing_block)); + ASSERT_LE(n, batch_size); + fetched += n; + } + ASSERT_FALSE(iter->HasNext()); + + DataGeneratorType data_generator_post; + + // Re-verify + for (int i = 0; i < 10000; i++) { + if (out[i] != data_generator_post.BuildTestValue(0,i)) { + FAIL() << "mismatch at index " << i + << " expected: " << data_generator_post.BuildTestValue(0,i) + << " got: " << out[i]; + } + out[i] = 0; + } + + TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(10000, n); + } + + template + void TimeSeekAndReadFileWithNulls(DataGeneratorType* generator, + const BlockId& block_id, size_t num_entries) { + gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &block)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(block.Pass(), ReaderOptions(), &reader)); + ASSERT_EQ(DataGeneratorType::kDataType, reader->type_info()->type()); + + gscoped_ptr iter; + ASSERT_OK(reader->NewIterator(&iter, CFileReader::CACHE_BLOCK)); + + Arena arena(8192, 8*1024*1024); + 
ScopedColumnBlock cb(10); + + const int kNumLoops = AllowSlowTests() ? num_entries : 10; + for (int loop = 0; loop < kNumLoops; loop++) { + // Seek to a random point in the file, + // or just try each entry as starting point if you're running SlowTests + int target = AllowSlowTests() ? loop : (random() % (num_entries - 1)); + SCOPED_TRACE(target); + ASSERT_OK(iter->SeekToOrdinal(target)); + ASSERT_TRUE(iter->HasNext()); + + // Read and verify several ColumnBlocks from this point in the file. + int read_offset = target; + for (int block = 0; block < 3 && iter->HasNext(); block++) { + SCOPED_TRACE(block); + size_t n = cb.nrows(); + ASSERT_OK_FAST(iter->CopyNextValues(&n, &cb)); + ASSERT_EQ(n, std::min(num_entries - read_offset, cb.nrows())); + + // Verify that the block data is correct. + generator->Build(read_offset, n); + for (size_t j = 0; j < n; ++j) { + SCOPED_TRACE(j); + bool expected_null = generator->TestValueShouldBeNull(read_offset + j); + ASSERT_EQ(expected_null, cb.is_null(j)); + if (!expected_null) { + ASSERT_EQ((*generator)[j], cb[j]); + } + } + cb.arena()->Reset(); + read_offset += n; + } + } + } + + template + void TestNullTypes(DataGeneratorType* generator, EncodingType encoding, + CompressionType compression) { + BlockId block_id; + WriteTestFile(generator, encoding, compression, 10000, SMALL_BLOCKSIZE, &block_id); + + size_t n; + TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(n, 10000); + + generator->Reset(); + TimeSeekAndReadFileWithNulls(generator, block_id, n); + } + + + void TestReadWriteRawBlocks(CompressionType compression, int num_entries) { + // Test Write + gscoped_ptr sink; + ASSERT_OK(fs_manager_->CreateNewBlock(&sink)); + BlockId id = sink->id(); + WriterOptions opts; + opts.write_posidx = true; + opts.write_validx = false; + opts.storage_attributes.cfile_block_size = FLAGS_cfile_test_block_size; + opts.storage_attributes.encoding = PLAIN_ENCODING; + CFileWriter w(opts, GetTypeInfo(STRING), false, sink.Pass()); + 
ASSERT_OK(w.Start()); + for (uint32_t i = 0; i < num_entries; i++) { + vector slices; + slices.push_back(Slice("Head")); + slices.push_back(Slice("Body")); + slices.push_back(Slice("Tail")); + slices.push_back(Slice(reinterpret_cast(&i), 4)); + ASSERT_OK(w.AppendRawBlock(slices, i, nullptr, "raw-data")); + } + ASSERT_OK(w.Finish()); + + // Test Read + gscoped_ptr source; + ASSERT_OK(fs_manager_->OpenBlock(id, &source)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(source.Pass(), ReaderOptions(), &reader)); + + gscoped_ptr iter; + iter.reset(IndexTreeIterator::Create(reader.get(), reader->posidx_root())); + ASSERT_OK(iter->SeekToFirst()); + + uint8_t data[16]; + Slice expected_data(data, 16); + memcpy(data, "HeadBodyTail", 12); + + uint32_t count = 0; + do { + BlockHandle dblk_data; + BlockPointer blk_ptr = iter->GetCurrentBlockPointer(); + ASSERT_OK(reader->ReadBlock(blk_ptr, CFileReader::CACHE_BLOCK, &dblk_data)); + + memcpy(data + 12, &count, 4); + ASSERT_EQ(expected_data, dblk_data.data()); + + count++; + } while (iter->Next().ok()); + ASSERT_EQ(num_entries, count); + } + + void TestReadWriteStrings(EncodingType encoding); + +#ifdef NDEBUG + void TestWrite100MFileStrings(EncodingType encoding) { + BlockId block_id; + LOG_TIMING(INFO, "writing 100M strings") { + LOG(INFO) << "Starting writefile"; + StringDataGenerator generator("hello %zu"); + WriteTestFile(&generator, encoding, NO_COMPRESSION, 100000000, NO_FLAGS, &block_id); + LOG(INFO) << "Done writing"; + } + + LOG_TIMING(INFO, "reading 100M strings") { + LOG(INFO) << "Starting readfile"; + size_t n; + TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(100000000, n); + LOG(INFO) << "End readfile"; + } + } +#endif + +}; + +// Subclass of TestCFile which is parameterized on the block cache type. +// Tests that use TEST_P(TestCFileBothCacheTypes, ...) will run twice -- +// once for each cache type (DRAM, NVM). 
+class TestCFileBothCacheTypes : public TestCFile, + public ::testing::WithParamInterface { + public: + void SetUp() OVERRIDE { +#if defined(__linux__) + // The NVM cache can run using any directory as its path -- it doesn't have + // a lot of practical use outside of an actual NVM device, but for testing + // purposes, we'll point it at our test dir, unless otherwise specified. + if (google::GetCommandLineFlagInfoOrDie("nvm_cache_path").is_default) { + FLAGS_nvm_cache_path = GetTestPath("nvm-cache"); + ASSERT_OK(Env::Default()->CreateDir(FLAGS_nvm_cache_path)); + } +#endif + switch (GetParam()) { + case DRAM_CACHE: + FLAGS_block_cache_type = "DRAM"; + break; +#if defined(__linux__) + case NVM_CACHE: + FLAGS_block_cache_type = "NVM"; + break; +#endif + default: + LOG(FATAL) << "Unknown block cache type: '" << GetParam(); + } + CFileTestBase::SetUp(); + } + + void TearDown() OVERRIDE { + Singleton::UnsafeReset(); + } +}; + +#if defined(__linux__) +INSTANTIATE_TEST_CASE_P(CacheTypes, TestCFileBothCacheTypes, + ::testing::Values(DRAM_CACHE, NVM_CACHE)); +#else +INSTANTIATE_TEST_CASE_P(CacheTypes, TestCFileBothCacheTypes, ::testing::Values(DRAM_CACHE)); +#endif + +template +void CopyOne(CFileIterator *it, + typename TypeTraits::cpp_type *ret, + Arena *arena) { + ColumnBlock cb(GetTypeInfo(type), nullptr, ret, 1, arena); + size_t n = 1; + ASSERT_OK(it->CopyNextValues(&n, &cb)); + ASSERT_EQ(1, n); +} + +#ifdef NDEBUG +// Only run the 100M entry tests in non-debug mode. +// They take way too long with debugging enabled. 
+ +TEST_P(TestCFileBothCacheTypes, TestWrite100MFileInts) { + BlockId block_id; + LOG_TIMING(INFO, "writing 100m ints") { + LOG(INFO) << "Starting writefile"; + UInt32DataGenerator generator; + WriteTestFile(&generator, GROUP_VARINT, NO_COMPRESSION, 100000000, NO_FLAGS, &block_id); + LOG(INFO) << "Done writing"; + } + + LOG_TIMING(INFO, "reading 100M ints") { + LOG(INFO) << "Starting readfile"; + size_t n; + TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(100000000, n); + LOG(INFO) << "End readfile"; + } +} + +TEST_P(TestCFileBothCacheTypes, TestWrite100MFileNullableInts) { + BlockId block_id; + LOG_TIMING(INFO, "writing 100m nullable ints") { + LOG(INFO) << "Starting writefile"; + UInt32DataGenerator generator; + WriteTestFile(&generator, PLAIN_ENCODING, NO_COMPRESSION, 100000000, NO_FLAGS, &block_id); + LOG(INFO) << "Done writing"; + } + + LOG_TIMING(INFO, "reading 100M nullable ints") { + LOG(INFO) << "Starting readfile"; + size_t n; + TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(100000000, n); + LOG(INFO) << "End readfile"; + } +} + +TEST_P(TestCFileBothCacheTypes, TestWrite100MFileStringsPrefixEncoding) { + TestWrite100MFileStrings(PREFIX_ENCODING); +} + +TEST_P(TestCFileBothCacheTypes, TestWrite100MFileStringsDictEncoding) { + TestWrite100MFileStrings(DICT_ENCODING); +} + +TEST_P(TestCFileBothCacheTypes, TestWrite100MFileStringsPlainEncoding) { + TestWrite100MFileStrings(PLAIN_ENCODING); +} + +#endif + +// Write and Read 1 million unique strings with dictionary encoding +TEST_P(TestCFileBothCacheTypes, TestWrite1MUniqueFileStringsDictEncoding) { + BlockId block_id; + LOG_TIMING(INFO, "writing 1M unique strings") { + LOG(INFO) << "Starting writefile"; + StringDataGenerator generator("hello %zu"); + WriteTestFile(&generator, DICT_ENCODING, NO_COMPRESSION, 1000000, NO_FLAGS, &block_id); + LOG(INFO) << "Done writing"; + } + + LOG_TIMING(INFO, "reading 1M strings") { + LOG(INFO) << "Starting readfile"; + size_t n; + 
TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(1000000, n); + LOG(INFO) << "End readfile"; + } +} + +// Write and Read 1 million strings, which contains duplicates with dictionary encoding +TEST_P(TestCFileBothCacheTypes, TestWrite1MDuplicateFileStringsDictEncoding) { + BlockId block_id; + LOG_TIMING(INFO, "writing 1M duplicate strings") { + LOG(INFO) << "Starting writefile"; + + // The second parameter specify how many distinct strings are there + DuplicateStringDataGenerator generator("hello %zu", 256); + WriteTestFile(&generator, DICT_ENCODING, NO_COMPRESSION, 1000000, NO_FLAGS, &block_id); + LOG(INFO) << "Done writing"; + } + + LOG_TIMING(INFO, "reading 1M strings") { + LOG(INFO) << "Starting readfile"; + size_t n; + TimeReadFile(fs_manager_.get(), block_id, &n); + ASSERT_EQ(1000000, n); + LOG(INFO) << "End readfile"; + } +} + +TEST_P(TestCFileBothCacheTypes, TestFixedSizeReadWritePlainEncodingUInt32) { + TestReadWriteFixedSizeTypes >(GROUP_VARINT); + TestReadWriteFixedSizeTypes >(PLAIN_ENCODING); +} + +TEST_P(TestCFileBothCacheTypes, TestFixedSizeReadWritePlainEncodingInt32) { + TestReadWriteFixedSizeTypes >(PLAIN_ENCODING); +} + +TEST_P(TestCFileBothCacheTypes, TestFixedSizeReadWritePlainEncodingFloat) { + TestReadWriteFixedSizeTypes >(PLAIN_ENCODING); +} +TEST_P(TestCFileBothCacheTypes, TestFixedSizeReadWritePlainEncodingDouble) { + TestReadWriteFixedSizeTypes >(PLAIN_ENCODING); +} + +// Test for BitShuffle builder for UINT8, INT8, UINT16, INT16, UINT32, INT32, FLOAT, DOUBLE +template +class BitShuffleTest : public TestCFile { + public: + void TestBitShuffle() { + TestReadWriteFixedSizeTypes(BIT_SHUFFLE); + } +}; +typedef ::testing::Types, + Int8DataGenerator, + UInt16DataGenerator, + Int16DataGenerator, + UInt32DataGenerator, + Int32DataGenerator, + FPDataGenerator, + FPDataGenerator > MyTypes; +TYPED_TEST_CASE(BitShuffleTest, MyTypes); +TYPED_TEST(BitShuffleTest, TestFixedSizeReadWriteBitShuffle) { + this->TestBitShuffle(); +} + +void 
EncodeStringKey(const Schema &schema, const Slice& key, + gscoped_ptr *encoded_key) { + EncodedKeyBuilder kb(&schema); + kb.AddColumnKey(&key); + encoded_key->reset(kb.BuildEncodedKey()); +} + +void TestCFile::TestReadWriteStrings(EncodingType encoding) { + Schema schema({ ColumnSchema("key", STRING) }, 1); + + const int nrows = 10000; + BlockId block_id; + StringDataGenerator generator("hello %04d"); + WriteTestFile(&generator, encoding, NO_COMPRESSION, nrows, + SMALL_BLOCKSIZE | WRITE_VALIDX, &block_id); + + gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &block)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(block.Pass(), ReaderOptions(), &reader)); + + rowid_t reader_nrows; + ASSERT_OK(reader->CountRows(&reader_nrows)); + ASSERT_EQ(nrows, reader_nrows); + + BlockPointer ptr; + + gscoped_ptr iter; + ASSERT_OK(reader->NewIterator(&iter, CFileReader::CACHE_BLOCK)); + + Arena arena(1024, 1024*1024); + + ASSERT_OK(iter->SeekToOrdinal(5000)); + ASSERT_EQ(5000u, iter->GetCurrentOrdinal()); + Slice s; + + CopyOne(iter.get(), &s, &arena); + ASSERT_EQ(string("hello 5000"), s.ToString()); + + // Seek to last key exactly, should succeed + ASSERT_OK(iter->SeekToOrdinal(9999)); + ASSERT_EQ(9999u, iter->GetCurrentOrdinal()); + + // Seek to after last key. Should result in not found. 
+ ASSERT_TRUE(iter->SeekToOrdinal(10000).IsNotFound()); + + + //////// + // Now try some seeks by the value instead of position + ///////// + + gscoped_ptr encoded_key; + bool exact; + + // Seek in between each key + for (int i = 1; i < 10000; i++) { + SCOPED_TRACE(i); + char buf[100]; + snprintf(buf, sizeof(buf), "hello %04d.5", i - 1); + s = Slice(buf); + EncodeStringKey(schema, s, &encoded_key); + ASSERT_OK(iter->SeekAtOrAfter(*encoded_key, &exact)); + ASSERT_FALSE(exact); + ASSERT_EQ(i, iter->GetCurrentOrdinal()); + CopyOne(iter.get(), &s, &arena); + ASSERT_EQ(StringPrintf("hello %04d", i), s.ToString()); + } + + // Seek exactly to each key + for (int i = 0; i < 9999; i++) { + SCOPED_TRACE(i); + char buf[100]; + snprintf(buf, sizeof(buf), "hello %04d", i); + s = Slice(buf); + EncodeStringKey(schema, s, &encoded_key); + ASSERT_OK(iter->SeekAtOrAfter(*encoded_key, &exact)); + ASSERT_TRUE(exact); + ASSERT_EQ(i, iter->GetCurrentOrdinal()); + Slice read_back; + CopyOne(iter.get(), &read_back, &arena); + ASSERT_EQ(read_back.ToString(), s.ToString()); + } + + // after last entry + s = "hello 9999x"; + EncodeStringKey(schema, s, &encoded_key); + EXPECT_TRUE(iter->SeekAtOrAfter(*encoded_key, &exact).IsNotFound()); + + // before first entry + s = "hello"; + EncodeStringKey(schema, s, &encoded_key); + ASSERT_OK(iter->SeekAtOrAfter(*encoded_key, &exact)); + ASSERT_FALSE(exact); + ASSERT_EQ(0u, iter->GetCurrentOrdinal()); + CopyOne(iter.get(), &s, &arena); + ASSERT_EQ(string("hello 0000"), s.ToString()); + + // Seek to start of file by ordinal + ASSERT_OK(iter->SeekToFirst()); + ASSERT_EQ(0u, iter->GetCurrentOrdinal()); + CopyOne(iter.get(), &s, &arena); + ASSERT_EQ(string("hello 0000"), s.ToString()); + + // Reseek to start and fetch all data. 
+ ASSERT_OK(iter->SeekToFirst()); + + ScopedColumnBlock cb(10000); + size_t n = 10000; + ASSERT_OK(iter->CopyNextValues(&n, &cb)); + ASSERT_EQ(10000, n); +} + + +TEST_P(TestCFileBothCacheTypes, TestReadWriteStringsPrefixEncoding) { + TestReadWriteStrings(PREFIX_ENCODING); +} + +// Read/Write test for dictionary encoded blocks +TEST_P(TestCFileBothCacheTypes, TestReadWriteStringsDictEncoding) { + TestReadWriteStrings(DICT_ENCODING); +} + +// Test that metadata entries stored in the cfile are persisted. +TEST_P(TestCFileBothCacheTypes, TestMetadata) { + BlockId block_id; + + // Write the file. + { + gscoped_ptr sink; + ASSERT_OK(fs_manager_->CreateNewBlock(&sink)); + block_id = sink->id(); + WriterOptions opts; + CFileWriter w(opts, GetTypeInfo(INT32), false, sink.Pass()); + + w.AddMetadataPair("key_in_header", "header value"); + ASSERT_OK(w.Start()); + + uint32_t val = 1; + ASSERT_OK(w.AppendEntries(&val, 1)); + + w.AddMetadataPair("key_in_footer", "footer value"); + ASSERT_OK(w.Finish()); + } + + // Read the file and ensure metadata is present. + { + gscoped_ptr source; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &source)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(source.Pass(), ReaderOptions(), &reader)); + string val; + ASSERT_TRUE(reader->GetMetadataEntry("key_in_header", &val)); + ASSERT_EQ(val, "header value"); + ASSERT_TRUE(reader->GetMetadataEntry("key_in_footer", &val)); + ASSERT_EQ(val, "footer value"); + ASSERT_FALSE(reader->GetMetadataEntry("not a key", &val)); + + // Test that, even though we didn't specify an encoding or compression, the + // resulting file has them explicitly set. 
+ ASSERT_EQ(PLAIN_ENCODING, reader->type_encoding_info()->encoding_type()); + ASSERT_EQ(NO_COMPRESSION, reader->footer().compression()); + } +} + +TEST_P(TestCFileBothCacheTypes, TestDefaultColumnIter) { + const int kNumItems = 64; + uint8_t null_bitmap[BitmapSize(kNumItems)]; + uint32_t data[kNumItems]; + + // Test Int Default Value + uint32_t int_value = 15; + DefaultColumnValueIterator iter(GetTypeInfo(UINT32), &int_value); + ColumnBlock int_col(GetTypeInfo(UINT32), nullptr, data, kNumItems, nullptr); + ASSERT_OK(iter.Scan(&int_col)); + for (size_t i = 0; i < int_col.nrows(); ++i) { + ASSERT_EQ(int_value, *reinterpret_cast(int_col.cell_ptr(i))); + } + + // Test Int Nullable Default Value + int_value = 321; + DefaultColumnValueIterator nullable_iter(GetTypeInfo(UINT32), &int_value); + ColumnBlock nullable_col(GetTypeInfo(UINT32), null_bitmap, data, kNumItems, nullptr); + ASSERT_OK(nullable_iter.Scan(&nullable_col)); + for (size_t i = 0; i < nullable_col.nrows(); ++i) { + ASSERT_FALSE(nullable_col.is_null(i)); + ASSERT_EQ(int_value, *reinterpret_cast(nullable_col.cell_ptr(i))); + } + + // Test NULL Default Value + DefaultColumnValueIterator null_iter(GetTypeInfo(UINT32), nullptr); + ColumnBlock null_col(GetTypeInfo(UINT32), null_bitmap, data, kNumItems, nullptr); + ASSERT_OK(null_iter.Scan(&null_col)); + for (size_t i = 0; i < null_col.nrows(); ++i) { + ASSERT_TRUE(null_col.is_null(i)); + } + + // Test String Default Value + Slice str_data[kNumItems]; + Slice str_value("Hello"); + Arena arena(32*1024, 256*1024); + DefaultColumnValueIterator str_iter(GetTypeInfo(STRING), &str_value); + ColumnBlock str_col(GetTypeInfo(STRING), nullptr, str_data, kNumItems, &arena); + ASSERT_OK(str_iter.Scan(&str_col)); + for (size_t i = 0; i < str_col.nrows(); ++i) { + ASSERT_EQ(str_value, *reinterpret_cast(str_col.cell_ptr(i))); + } +} + +TEST_P(TestCFileBothCacheTypes, TestAppendRaw) { + TestReadWriteRawBlocks(NO_COMPRESSION, 1000); + TestReadWriteRawBlocks(SNAPPY, 1000); + 
TestReadWriteRawBlocks(LZ4, 1000); + TestReadWriteRawBlocks(ZLIB, 1000); +} + +TEST_P(TestCFileBothCacheTypes, TestNullInts) { + UInt32DataGenerator generator; + TestNullTypes(&generator, GROUP_VARINT, NO_COMPRESSION); + TestNullTypes(&generator, GROUP_VARINT, LZ4); +} + +TEST_P(TestCFileBothCacheTypes, TestNullFloats) { + FPDataGenerator generator; + TestNullTypes(&generator, PLAIN_ENCODING, NO_COMPRESSION); +} + +TEST_P(TestCFileBothCacheTypes, TestNullPrefixStrings) { + StringDataGenerator generator("hello %zu"); + TestNullTypes(&generator, PLAIN_ENCODING, NO_COMPRESSION); + TestNullTypes(&generator, PLAIN_ENCODING, LZ4); +} + +TEST_P(TestCFileBothCacheTypes, TestNullPlainStrings) { + StringDataGenerator generator("hello %zu"); + TestNullTypes(&generator, PREFIX_ENCODING, NO_COMPRESSION); + TestNullTypes(&generator, PREFIX_ENCODING, LZ4); +} + +// Test for dictionary encoding +TEST_P(TestCFileBothCacheTypes, TestNullDictStrings) { + StringDataGenerator generator("hello %zu"); + TestNullTypes(&generator, DICT_ENCODING, NO_COMPRESSION); + TestNullTypes(&generator, DICT_ENCODING, LZ4); +} + +TEST_P(TestCFileBothCacheTypes, TestReleaseBlock) { + gscoped_ptr sink; + ASSERT_OK(fs_manager_->CreateNewBlock(&sink)); + ASSERT_EQ(WritableBlock::CLEAN, sink->state()); + WriterOptions opts; + CFileWriter w(opts, GetTypeInfo(STRING), false, sink.Pass()); + ASSERT_OK(w.Start()); + fs::ScopedWritableBlockCloser closer; + ASSERT_OK(w.FinishAndReleaseBlock(&closer)); + if (FLAGS_cfile_do_on_finish == "flush") { + ASSERT_EQ(1, closer.blocks().size()); + ASSERT_EQ(WritableBlock::FLUSHING, closer.blocks()[0]->state()); + } else if (FLAGS_cfile_do_on_finish == "close") { + ASSERT_EQ(0, closer.blocks().size()); + } else if (FLAGS_cfile_do_on_finish == "nothing") { + ASSERT_EQ(1, closer.blocks().size()); + ASSERT_EQ(WritableBlock::DIRTY, closer.blocks()[0]->state()); + } else { + LOG(FATAL) << "Unknown value for cfile_do_on_finish: " + << FLAGS_cfile_do_on_finish; + } + 
ASSERT_OK(closer.CloseBlocks()); + ASSERT_EQ(0, closer.blocks().size()); +} + +TEST_P(TestCFileBothCacheTypes, TestLazyInit) { + // Create a small test file. + BlockId block_id; + { + const int nrows = 1000; + StringDataGenerator generator("hello %04d"); + WriteTestFile(&generator, PREFIX_ENCODING, NO_COMPRESSION, nrows, + SMALL_BLOCKSIZE | WRITE_VALIDX, &block_id); + } + + shared_ptr tracker = MemTracker::CreateTracker(-1, "test"); + int64_t initial_mem_usage = tracker->consumption(); + + // Open it using a "counting" readable block. + gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &block)); + size_t bytes_read = 0; + gscoped_ptr count_block( + new CountingReadableBlock(block.Pass(), &bytes_read)); + ASSERT_EQ(initial_mem_usage, tracker->consumption()); + + // Lazily opening the cfile should not trigger any reads. + ReaderOptions opts; + opts.parent_mem_tracker = tracker; + gscoped_ptr reader; + ASSERT_OK(CFileReader::OpenNoInit(count_block.Pass(), opts, &reader)); + ASSERT_EQ(0, bytes_read); + int64_t lazy_mem_usage = tracker->consumption(); + ASSERT_GT(lazy_mem_usage, initial_mem_usage); + + // But initializing it should (only the first time), and the reader's + // memory usage should increase. + ASSERT_OK(reader->Init()); + ASSERT_GT(bytes_read, 0); + size_t bytes_read_after_init = bytes_read; + ASSERT_OK(reader->Init()); + ASSERT_EQ(bytes_read_after_init, bytes_read); + ASSERT_GT(tracker->consumption(), lazy_mem_usage); + + // And let's test non-lazy open for good measure; it should yield the + // same number of bytes read. + ASSERT_OK(fs_manager_->OpenBlock(block_id, &block)); + bytes_read = 0; + count_block.reset(new CountingReadableBlock(block.Pass(), &bytes_read)); + ASSERT_OK(CFileReader::Open(count_block.Pass(), ReaderOptions(), &reader)); + ASSERT_EQ(bytes_read_after_init, bytes_read); +} + +// Tests that the block cache keys used by CFileReaders are stable. 
That is, +// different reader instances operating on the same block should use the same +// block cache keys. +TEST_P(TestCFileBothCacheTypes, TestCacheKeysAreStable) { + // Set up block cache instrumentation. + MetricRegistry registry; + scoped_refptr entity(METRIC_ENTITY_server.Instantiate(®istry, "test_entity")); + BlockCache* cache = BlockCache::GetSingleton(); + cache->StartInstrumentation(entity); + + // Create a small test file. + BlockId block_id; + { + const int nrows = 1000; + StringDataGenerator generator("hello %04d"); + WriteTestFile(&generator, PREFIX_ENCODING, NO_COMPRESSION, nrows, + SMALL_BLOCKSIZE | WRITE_VALIDX, &block_id); + } + + // Open and read from it twice, checking the block cache statistics. + for (int i = 0; i < 2; i++) { + gscoped_ptr source; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &source)); + gscoped_ptr reader; + ASSERT_OK(CFileReader::Open(source.Pass(), ReaderOptions(), &reader)); + + gscoped_ptr iter; + iter.reset(IndexTreeIterator::Create(reader.get(), reader->posidx_root())); + ASSERT_OK(iter->SeekToFirst()); + + BlockHandle bh; + ASSERT_OK(reader->ReadBlock(iter->GetCurrentBlockPointer(), + CFileReader::CACHE_BLOCK, + &bh)); + + // The first time through, we miss in the seek and in the ReadBlock(). + // But the second time through, both are hits, because we've got the same + // cache keys as before. + ASSERT_EQ(i * 2, down_cast( + entity->FindOrNull(METRIC_block_cache_hits_caching).get())->value()); + } +} + +#if defined(__linux__) +// Inject failures in nvm allocation and ensure that we can still read a file. 
+TEST_P(TestCFileBothCacheTypes, TestNvmAllocationFailure) { + if (GetParam() != NVM_CACHE) return; + FLAGS_nvm_cache_simulate_allocation_failure = true; + TestReadWriteFixedSizeTypes >(PLAIN_ENCODING); +} +#endif + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/cfile.proto b/src/kudu/cfile/cfile.proto new file mode 100644 index 000000000000..d9a07760454f --- /dev/null +++ b/src/kudu/cfile/cfile.proto @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+package kudu.cfile; + +option java_package = "org.kududb.cfile"; + +import "kudu/common/common.proto"; + +message FileMetadataPairPB { + required string key = 1; + required bytes value = 2; +} + +message CFileHeaderPB { + required int32 major_version = 1; + required int32 minor_version = 2; + + repeated FileMetadataPairPB metadata = 3; +} + +message BlockPointerPB { + required int64 offset = 1; + required int32 size = 2; +} + +message BTreeInfoPB { + required BlockPointerPB root_block = 1; +} + +message IndexBlockTrailerPB { + required int32 num_entries = 1; + + enum BlockType { + UNKNOWN = 999; + LEAF = 0; + INTERNAL = 1; + }; + required BlockType type = 2; +} +// TODO: name all the PBs with *PB convention + +message CFileFooterPB { + required kudu.DataType data_type = 1; + required EncodingType encoding = 2; + + // Total number of values in the file. + required int64 num_values = 3; + + optional BTreeInfoPB posidx_info = 4; + optional BTreeInfoPB validx_info = 5; + + optional CompressionType compression = 6 [default=NO_COMPRESSION]; + + repeated FileMetadataPairPB metadata = 7; + + optional bool is_type_nullable = 8 [default=false]; // TODO use enum with encoding? + + // Block pointer for dictionary block if the cfile is dictionary encoded. + // Only for dictionary encoding. + optional BlockPointerPB dict_block_ptr = 9; +} + + +message BloomBlockHeaderPB { + required int32 num_hash_functions = 1; +} diff --git a/src/kudu/cfile/cfile_reader.cc b/src/kudu/cfile/cfile_reader.cc new file mode 100644 index 000000000000..ebb107f503c7 --- /dev/null +++ b/src/kudu/cfile/cfile_reader.cc @@ -0,0 +1,990 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/cfile/cfile_reader.h" + +#include + +#include + +#include "kudu/cfile/block_cache.h" +#include "kudu/cfile/block_handle.h" +#include "kudu/cfile/block_pointer.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/cfile_writer.h" // for kMagicString +#include "kudu/cfile/gvint_block.h" +#include "kudu/cfile/index_block.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/cfile/binary_plain_block.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/malloc.h" +#include "kudu/util/object_pool.h" +#include "kudu/util/rle-encoding.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +DEFINE_bool(cfile_lazy_open, true, + "Allow lazily opening of cfiles"); +TAG_FLAG(cfile_lazy_open, hidden); + +using kudu::fs::ReadableBlock; +using strings::Substitute; + +namespace kudu { +namespace cfile { + +// Magic+Length: 8-byte magic, followed by 4-byte header size +static const size_t kMagicAndLengthSize = 12; +static const size_t kMaxHeaderFooterPBSize = 64*1024; + +static const size_t kBlockSizeLimit = 16 * 1024 * 1024; // 16MB + +static Status ParseMagicAndLength(const Slice &data, + uint32_t *parsed_len) { + if (data.size() != kMagicAndLengthSize) { + return 
Status::Corruption("Bad size data"); + } + + if (memcmp(kMagicString, data.data(), strlen(kMagicString)) != 0) { + return Status::Corruption("bad magic"); + } + + *parsed_len = DecodeFixed32(data.data() + strlen(kMagicString)); + if (*parsed_len <= 0 || *parsed_len > kMaxHeaderFooterPBSize) { + return Status::Corruption("invalid data size"); + } + + return Status::OK(); +} + +CFileReader::CFileReader(const ReaderOptions &options, + const uint64_t file_size, + gscoped_ptr block) : + block_(block.Pass()), + file_size_(file_size), + mem_consumption_(options.parent_mem_tracker, memory_footprint()) { +} + +Status CFileReader::Open(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr *reader) { + gscoped_ptr reader_local; + RETURN_NOT_OK(OpenNoInit(block.Pass(), options, &reader_local)); + RETURN_NOT_OK(reader_local->Init()); + + reader->reset(reader_local.release()); + return Status::OK(); +} + +Status CFileReader::OpenNoInit(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr *reader) { + uint64_t block_size; + RETURN_NOT_OK(block->Size(&block_size)); + gscoped_ptr reader_local( + new CFileReader(options, block_size, block.Pass())); + if (!FLAGS_cfile_lazy_open) { + RETURN_NOT_OK(reader_local->Init()); + } + + reader->reset(reader_local.release()); + return Status::OK(); +} + +Status CFileReader::ReadMagicAndLength(uint64_t offset, uint32_t *len) { + TRACE_EVENT1("io", "CFileReader::ReadMagicAndLength", + "cfile", ToString()); + uint8_t scratch[kMagicAndLengthSize]; + Slice slice; + + RETURN_NOT_OK(block_->Read(offset, kMagicAndLengthSize, + &slice, scratch)); + + return ParseMagicAndLength(slice, len); +} + +Status CFileReader::InitOnce() { + VLOG(1) << "Initializing CFile with ID " << block_->id().ToString(); + + RETURN_NOT_OK(ReadAndParseHeader()); + + RETURN_NOT_OK(ReadAndParseFooter()); + + type_info_ = GetTypeInfo(footer_->data_type()); + + RETURN_NOT_OK(TypeEncodingInfo::Get(type_info_, + footer_->encoding(), + &type_encoding_info_)); + 
+ VLOG(2) << "Initialized CFile reader. " + << "Header: " << header_->DebugString() + << " Footer: " << footer_->DebugString() + << " Type: " << type_info_->name(); + + // The header/footer have been allocated; memory consumption has changed. + mem_consumption_.Reset(memory_footprint()); + + return Status::OK(); +} + +Status CFileReader::Init() { + return init_once_.Init(&CFileReader::InitOnce, this); +} + +Status CFileReader::ReadAndParseHeader() { + TRACE_EVENT1("io", "CFileReader::ReadAndParseHeader", + "cfile", ToString()); + DCHECK(!init_once_.initted()); + + // First read and parse the "pre-header", which lets us know + // that it is indeed a CFile and tells us the length of the + // proper protobuf header. + uint32_t header_size; + RETURN_NOT_OK(ReadMagicAndLength(0, &header_size)); + + // Now read the protobuf header. + uint8_t header_space[header_size]; + Slice header_slice; + header_.reset(new CFileHeaderPB()); + RETURN_NOT_OK(block_->Read(kMagicAndLengthSize, header_size, + &header_slice, header_space)); + if (!header_->ParseFromArray(header_slice.data(), header_size)) { + return Status::Corruption("Invalid cfile pb header"); + } + + VLOG(2) << "Read header: " << header_->DebugString(); + + return Status::OK(); +} + + +Status CFileReader::ReadAndParseFooter() { + TRACE_EVENT1("io", "CFileReader::ReadAndParseFooter", + "cfile", ToString()); + DCHECK(!init_once_.initted()); + CHECK_GT(file_size_, kMagicAndLengthSize) << + "file too short: " << file_size_; + + // First read and parse the "post-footer", which has magic + // and the length of the actual protobuf footer + uint32_t footer_size; + RETURN_NOT_OK_PREPEND(ReadMagicAndLength(file_size_ - kMagicAndLengthSize, &footer_size), + "Failed to read magic and length from end of file"); + + // Now read the protobuf footer. 
+ footer_.reset(new CFileFooterPB()); + uint8_t footer_space[footer_size]; + Slice footer_slice; + uint64_t off = file_size_ - kMagicAndLengthSize - footer_size; + RETURN_NOT_OK(block_->Read(off, footer_size, + &footer_slice, footer_space)); + if (!footer_->ParseFromArray(footer_slice.data(), footer_size)) { + return Status::Corruption("Invalid cfile pb footer"); + } + + // Verify if the compression codec is available + if (footer_->compression() != NO_COMPRESSION) { + const CompressionCodec* codec; + RETURN_NOT_OK(GetCompressionCodec(footer_->compression(), &codec)); + block_uncompressor_.reset(new CompressedBlockDecoder(codec, kBlockSizeLimit)); + } + + VLOG(2) << "Read footer: " << footer_->DebugString(); + + return Status::OK(); +} + +namespace { + +// ScratchMemory owns a memory buffer which could either be allocated on-heap +// or allocated by a Cache instance. In the case of the default DRAM-based cache, +// these two are equivalent, but we still make a distinction between "cache-managed" +// memory and "on-heap" memory. In the case of the NVM-based cache, this is a more +// important distinction: we would like to read (or decompress) blocks directly into NVM. +// +// This class tracks the block of memory, its size, and whether it came from the heap +// or the cache. In its destructor, the memory is freed, either via 'delete[]', if +// it's heap memory, or via Cache::Free(), if it came from the cache. Alternatively, +// the memory can be released using 'release()'. +class ScratchMemory { + public: + ScratchMemory() : cache_(nullptr), ptr_(nullptr), size_(-1) {} + ~ScratchMemory() { + if (!ptr_) return; + if (cache_) { + cache_->Free(ptr_); + } else { + delete[] ptr_; + } + } + + // Try to allocate 'size' bytes from the cache. If the cache has + // no capacity and cannot evict to make room, this will fall back + // to allocating from the heap. In that case, IsFromCache() will + // return false. 
+ void TryAllocateFromCache(BlockCache* cache, int size) { + DCHECK(!ptr_); + cache_ = DCHECK_NOTNULL(cache); + ptr_ = cache->Allocate(size); + if (!ptr_) { + AllocateFromHeap(size); + return; + } + size_ = size; + } + + void AllocateFromHeap(int size) { + DCHECK(!ptr_); + cache_ = nullptr; + ptr_ = new uint8_t[size]; + size_ = size; + } + + // If the current memory was allocated by the cache, this moves it to normal + // heap memory. In the case of the DRAM cache, the cache implements this as + // a no-op. In the case of NVM, we actually allocate on-heap memory and + // memcpy the data. + void EnsureOnHeap() { + DCHECK(ptr_); + if (cache_) { + ptr_ = cache_->MoveToHeap(ptr_, size_); + } + cache_ = nullptr; + } + + // Return true if the current scratch memory was allocated from the cache. + bool IsFromCache() const { + return cache_ != nullptr; + } + + uint8_t* get() { + return DCHECK_NOTNULL(ptr_); + } + + uint8_t* release() { + uint8_t* ret = ptr_; + ptr_ = nullptr; + size_ = -1; + return ret; + } + + // Swap the contents of this instance with another. + void Swap(ScratchMemory* other) { + std::swap(cache_, other->cache_); + std::swap(ptr_, other->ptr_); + std::swap(size_, other->size_); + } + + private: + BlockCache* cache_; + uint8_t* ptr_; + int size_; + DISALLOW_COPY_AND_ASSIGN(ScratchMemory); +}; +} // anonymous namespace + +Status CFileReader::ReadBlock(const BlockPointer &ptr, CacheControl cache_control, + BlockHandle *ret) const { + DCHECK(init_once_.initted()); + CHECK(ptr.offset() > 0 && + ptr.offset() + ptr.size() < file_size_) << + "bad offset " << ptr.ToString() << " in file of size " + << file_size_; + BlockCacheHandle bc_handle; + Cache::CacheBehavior cache_behavior = cache_control == CACHE_BLOCK ? 
+ Cache::EXPECT_IN_CACHE : Cache::NO_EXPECT_IN_CACHE; + BlockCache* cache = BlockCache::GetSingleton(); + if (cache->Lookup(block_->id(), ptr.offset(), cache_behavior, &bc_handle)) { + *ret = BlockHandle::WithDataFromCache(&bc_handle); + // Cache hit + return Status::OK(); + } + + // Cache miss: need to read ourselves. + // We issue trace events only in the cache miss case since we expect the + // tracing overhead to be small compared to the IO (even if it's a memcpy + // from the Linux cache). + TRACE_EVENT1("io", "CFileReader::ReadBlock(cache miss)", + "cfile", ToString()); + Slice block; + + // If we are reading uncompressed data and plan to cache the result, + // then we should allocate our scratch memory directly from the cache. + // This avoids an extra memory copy in the case of an NVM cache. + ScratchMemory scratch; + if (block_uncompressor_ == nullptr && cache_control == CACHE_BLOCK) { + scratch.TryAllocateFromCache(cache, ptr.size()); + } else { + scratch.AllocateFromHeap(ptr.size()); + } + + RETURN_NOT_OK(block_->Read(ptr.offset(), ptr.size(), &block, scratch.get())); + + if (block.size() != ptr.size()) { + return Status::IOError("Could not read full block length"); + } + + // Decompress the block + if (block_uncompressor_ != nullptr) { + // Get the size required for the uncompressed buffer + uint32_t uncompressed_size; + Status s = block_uncompressor_->ValidateHeader(block, &uncompressed_size); + if (!s.ok()) { + LOG(WARNING) << "Unable to get uncompressed size at " + << ptr.offset() << " of size " << ptr.size() << ": " + << s.ToString(); + return s; + } + + // If we plan to put the uncompressed block in the cache, we should + // decompress directly into the cache's memory (to avoid a memcpy for NVM). 
+ ScratchMemory decompressed_scratch; + if (cache_control == CACHE_BLOCK) { + decompressed_scratch.TryAllocateFromCache(cache, uncompressed_size); + } else { + decompressed_scratch.AllocateFromHeap(uncompressed_size); + } + + s = block_uncompressor_->UncompressIntoBuffer(block, decompressed_scratch.get(), + uncompressed_size); + if (!s.ok()) { + LOG(WARNING) << "Unable to uncompress block at " << ptr.offset() + << " of size " << ptr.size() << ": " << s.ToString(); + return s; + } + + // Now that we've decompressed, we don't need to keep holding onto the original + // scratch buffer. Instead, we have to start holding onto our decompression + // output buffer. + scratch.Swap(&decompressed_scratch); + + // Set the result block to our decompressed data. + block = Slice(scratch.get(), uncompressed_size); + } else { + // Some of the File implementations from LevelDB attempt to be tricky + // and just return a Slice into an mmapped region (or in-memory region). + // But, this is hard to program against in terms of cache management, etc, + // so we memcpy into our scratch buffer if necessary. + block.relocate(scratch.get()); + } + + // It's possible that one of the TryAllocateFromCache() calls above + // failed, in which case we don't insert it into the cache regardless + // of what the user requested. + if (cache_control == CACHE_BLOCK && scratch.IsFromCache()) { + if (cache->Insert(block_->id(), ptr.offset(), block, &bc_handle)) { + *ret = BlockHandle::WithDataFromCache(&bc_handle); + } else { + // If we failed to insert in the cache, but we'd already read into + // cache-managed memory, we need to ensure that we end up with a + // heap-allocated block in the BlockHandle. + scratch.EnsureOnHeap(); + block = Slice(scratch.get(), block.size()); + *ret = BlockHandle::WithOwnedData(block); + } + } else { + // If we never intended to cache the block, then the scratch space + // should not be owned by the cache. 
+ DCHECK_EQ(block.data(), scratch.get()); + DCHECK(!scratch.IsFromCache()); + *ret = BlockHandle::WithOwnedData(block); + } + + // The cache or the BlockHandle now has ownership over the memory, so release + // the scoped pointer. + ignore_result(scratch.release()); + + return Status::OK(); +} + +Status CFileReader::CountRows(rowid_t *count) const { + *count = footer().num_values(); + return Status::OK(); +} + +bool CFileReader::GetMetadataEntry(const string &key, string *val) { + for (const FileMetadataPairPB &pair : header().metadata()) { + if (pair.key() == key) { + *val = pair.value(); + return true; + } + } + for (const FileMetadataPairPB &pair : footer().metadata()) { + if (pair.key() == key) { + *val = pair.value(); + return true; + } + } + return false; +} + +Status CFileReader::NewIterator(CFileIterator **iter, CacheControl cache_control) { + *iter = new CFileIterator(this, cache_control); + return Status::OK(); +} + +size_t CFileReader::memory_footprint() const { + size_t size = kudu_malloc_usable_size(this); + size += block_->memory_footprint(); + size += init_once_.memory_footprint_excluding_this(); + + // SpaceUsed() uses sizeof() instead of malloc_usable_size() to account for + // the size of base objects (recursively too), thus not accounting for + // malloc "slop". 
+ if (header_) { + size += header_->SpaceUsed(); + } + if (footer_) { + size += footer_->SpaceUsed(); + } + if (block_uncompressor_) { + size += kudu_malloc_usable_size(block_uncompressor_.get()); + } + return size; +} + +//////////////////////////////////////////////////////////// +// Default Column Value Iterator +//////////////////////////////////////////////////////////// +Status DefaultColumnValueIterator::SeekToOrdinal(rowid_t ord_idx) { + ordinal_ = ord_idx; + return Status::OK(); +} + +Status DefaultColumnValueIterator::PrepareBatch(size_t *n) { + batch_ = *n; + return Status::OK(); +} + +Status DefaultColumnValueIterator::Scan(ColumnBlock *dst) { + if (dst->is_nullable()) { + ColumnDataView dst_view(dst); + dst_view.SetNullBits(dst->nrows(), value_ != nullptr); + } + if (value_ != nullptr) { + if (typeinfo_->physical_type() == BINARY) { + const Slice *src_slice = reinterpret_cast(value_); + Slice dst_slice; + if (PREDICT_FALSE(!dst->arena()->RelocateSlice(*src_slice, &dst_slice))) { + return Status::IOError("out of memory copying slice", src_slice->ToString()); + } + for (size_t i = 0; i < dst->nrows(); ++i) { + dst->SetCellValue(i, &dst_slice); + } + } else { + for (size_t i = 0; i < dst->nrows(); ++i) { + dst->SetCellValue(i, value_); + } + } + } + return Status::OK(); +} + +Status DefaultColumnValueIterator::FinishBatch() { + ordinal_ += batch_; + return Status::OK(); +} + +//////////////////////////////////////////////////////////// +// Iterator +//////////////////////////////////////////////////////////// +CFileIterator::CFileIterator(CFileReader* reader, + CFileReader::CacheControl cache_control) + : reader_(reader), + seeked_(nullptr), + prepared_(false), + cache_control_(cache_control), + last_prepare_idx_(-1), + last_prepare_count_(-1) { +} + +CFileIterator::~CFileIterator() { +} + +Status CFileIterator::SeekToOrdinal(rowid_t ord_idx) { + RETURN_NOT_OK(PrepareForNewSeek()); + if (PREDICT_FALSE(posidx_iter_ == nullptr)) { + return 
Status::NotSupported("no positional index in file"); + } + + tmp_buf_.clear(); + KeyEncoderTraits::Encode(ord_idx, &tmp_buf_); + RETURN_NOT_OK(posidx_iter_->SeekAtOrBefore(Slice(tmp_buf_))); + + // TODO: fast seek within block (without reseeking index) + pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr( + prepared_block_pool_.Construct()); + RETURN_NOT_OK(ReadCurrentDataBlock(*posidx_iter_, b.get())); + + // If the data block doesn't actually contain the data + // we're looking for, then we're probably in the last + // block in the file. + // TODO: could assert that each of the index layers is + // at its last entry (ie HasNext() is false for each) + if (PREDICT_FALSE(ord_idx > b->last_row_idx())) { + return Status::NotFound("trying to seek past highest ordinal in file"); + } + + // Seek data block to correct index + DCHECK(ord_idx >= b->first_row_idx() && + ord_idx <= b->last_row_idx()) + << "got wrong data block. looking for ord_idx=" << ord_idx + << " but got dblk " << b->ToString(); + SeekToPositionInBlock(b.get(), ord_idx - b->first_row_idx()); + + prepared_blocks_.push_back(b.release()); + last_prepare_idx_ = ord_idx; + last_prepare_count_ = 0; + seeked_ = posidx_iter_.get(); + + CHECK_EQ(ord_idx, GetCurrentOrdinal()); + return Status::OK(); +} + +void CFileIterator::SeekToPositionInBlock(PreparedBlock *pb, uint32_t idx_in_block) { + // Since the data block only holds the non-null values, + // we need to translate from 'ord_idx' (the absolute row id) + // to the index within the non-null entries. + uint32_t index_within_nonnulls; + if (reader_->is_nullable()) { + if (PREDICT_TRUE(pb->idx_in_block_ <= idx_in_block)) { + // We are seeking forward. Skip from the current position in the RLE decoder + // instead of going back to the beginning of the block. 
+ uint32_t nskip = idx_in_block - pb->idx_in_block_; + size_t cur_blk_idx = pb->dblk_->GetCurrentIndex(); + index_within_nonnulls = cur_blk_idx + pb->rle_decoder_.Skip(nskip); + } else { + // Seek backward - have to start from the start of the block. + pb->rle_decoder_ = RleDecoder(pb->rle_bitmap.data(), pb->rle_bitmap.size(), 1); + index_within_nonnulls = pb->rle_decoder_.Skip(idx_in_block); + } + } else { + index_within_nonnulls = idx_in_block; + } + + pb->dblk_->SeekToPositionInBlock(index_within_nonnulls); + DCHECK_EQ(index_within_nonnulls, pb->dblk_->GetCurrentIndex()) << "failed seek"; + pb->idx_in_block_ = idx_in_block; +} + +Status CFileIterator::SeekToFirst() { + RETURN_NOT_OK(PrepareForNewSeek()); + IndexTreeIterator *idx_iter; + if (PREDICT_TRUE(posidx_iter_ != nullptr)) { + RETURN_NOT_OK(posidx_iter_->SeekToFirst()); + idx_iter = posidx_iter_.get(); + } else if (PREDICT_TRUE(validx_iter_ != nullptr)) { + RETURN_NOT_OK(validx_iter_->SeekToFirst()); + idx_iter = validx_iter_.get(); + } else { + return Status::NotSupported("no value or positional index present"); + } + + pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr( + prepared_block_pool_.Construct()); + RETURN_NOT_OK(ReadCurrentDataBlock(*idx_iter, b.get())); + b->dblk_->SeekToPositionInBlock(0); + last_prepare_idx_ = 0; + last_prepare_count_ = 0; + + prepared_blocks_.push_back(b.release()); + + seeked_ = idx_iter; + return Status::OK(); +} + + + +Status CFileIterator::SeekAtOrAfter(const EncodedKey &key, + bool *exact_match) { + RETURN_NOT_OK(PrepareForNewSeek()); + DCHECK_EQ(reader_->is_nullable(), false); + + if (PREDICT_FALSE(validx_iter_ == nullptr)) { + return Status::NotSupported("no value index present"); + } + + Status s = validx_iter_->SeekAtOrBefore(key.encoded_key()); + if (PREDICT_FALSE(s.IsNotFound())) { + // Seeking to a value before the first value in the file + // will return NotFound, due to the way the index seek + // works. 
We need to special-case this and have the + // iterator seek all the way down its leftmost branches + // to get the correct reslt. + s = validx_iter_->SeekToFirst(); + } + RETURN_NOT_OK(s); + + pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr( + prepared_block_pool_.Construct()); + RETURN_NOT_OK(ReadCurrentDataBlock(*validx_iter_, b.get())); + + Status dblk_seek_status; + if (key.num_key_columns() > 1) { + Slice slice = key.encoded_key(); + dblk_seek_status = b->dblk_->SeekAtOrAfterValue(&slice, exact_match); + } else { + dblk_seek_status = b->dblk_->SeekAtOrAfterValue(key.raw_keys()[0], + exact_match); + } + + // If seeking within the data block results in NotFound, then that indicates that the + // value we're looking for fell after all the data in that block. + // If this is not the last block, then the search key was 'in the cracks' between + // two consecutive blocks, so we need to advance to the next one. If it was the + // last block in the file, then we just return NotFound(), since there is no + // value "at or after". + if (PREDICT_FALSE(dblk_seek_status.IsNotFound())) { + *exact_match = false; + if (PREDICT_FALSE(!validx_iter_->HasNext())) { + return Status::NotFound("key after last block in file", + key.encoded_key().ToDebugString()); + } + RETURN_NOT_OK(validx_iter_->Next()); + RETURN_NOT_OK(ReadCurrentDataBlock(*validx_iter_, b.get())); + SeekToPositionInBlock(b.get(), 0); + } else { + // It's possible we got some other error seeking in our data block -- + // still need to propagate those. + RETURN_NOT_OK(dblk_seek_status); + } + + last_prepare_idx_ = b->first_row_idx() + b->dblk_->GetCurrentIndex(); + last_prepare_count_ = 0; + + prepared_blocks_.push_back(b.release()); + + seeked_ = validx_iter_.get(); + return Status::OK(); +} + +Status CFileIterator::PrepareForNewSeek() { + // Fully open the CFileReader if it was lazily opened earlier. + // + // If it's already initialized, this is a no-op. 
+ RETURN_NOT_OK(reader_->Init()); + + // Create the index tree iterators if we haven't already done so. + if (!posidx_iter_ && reader_->footer().has_posidx_info()) { + BlockPointer bp(reader_->footer().posidx_info().root_block()); + posidx_iter_.reset(IndexTreeIterator::Create(reader_, bp)); + } + if (!validx_iter_ && reader_->footer().has_validx_info()) { + BlockPointer bp(reader_->footer().validx_info().root_block()); + validx_iter_.reset(IndexTreeIterator::Create(reader_, bp)); + } + + // Initialize the decoder for the dictionary block + // in dictionary encoding mode. + if (!dict_decoder_ && reader_->footer().has_dict_block_ptr()) { + BlockPointer bp(reader_->footer().dict_block_ptr()); + + // Cache the dictionary for performance + RETURN_NOT_OK_PREPEND(reader_->ReadBlock(bp, CFileReader::CACHE_BLOCK, &dict_block_handle_), + "Couldn't read dictionary block"); + + dict_decoder_.reset(new BinaryPlainBlockDecoder(dict_block_handle_.data())); + RETURN_NOT_OK_PREPEND(dict_decoder_->ParseHeader(), "Couldn't parse dictionary block header"); + } + + seeked_ = nullptr; + for (PreparedBlock *pb : prepared_blocks_) { + prepared_block_pool_.Destroy(pb); + } + prepared_blocks_.clear(); + + return Status::OK(); +} + +rowid_t CFileIterator::GetCurrentOrdinal() const { + CHECK(seeked_) << "not seeked"; + return last_prepare_idx_; +} + +string CFileIterator::PreparedBlock::ToString() const { + return StringPrintf("dblk(%s, rows=%d-%d)", + dblk_ptr_.ToString().c_str(), + first_row_idx(), + last_row_idx()); +} + +// Decode the null header in the beginning of the data block +Status DecodeNullInfo(Slice *data_block, uint32_t *num_rows_in_block, Slice *null_bitmap) { + if (!GetVarint32(data_block, num_rows_in_block)) { + return Status::Corruption("bad null header, num elements in block"); + } + + uint32_t null_bitmap_size; + if (!GetVarint32(data_block, &null_bitmap_size)) { + return Status::Corruption("bad null header, bitmap size"); + } + + *null_bitmap = Slice(data_block->data(), 
null_bitmap_size); + data_block->remove_prefix(null_bitmap_size); + return Status::OK(); +} + +Status CFileIterator::ReadCurrentDataBlock(const IndexTreeIterator &idx_iter, + PreparedBlock *prep_block) { + prep_block->dblk_ptr_ = idx_iter.GetCurrentBlockPointer(); + RETURN_NOT_OK(reader_->ReadBlock(prep_block->dblk_ptr_, cache_control_, &prep_block->dblk_data_)); + + uint32_t num_rows_in_block = 0; + Slice data_block = prep_block->dblk_data_.data(); + if (reader_->is_nullable()) { + RETURN_NOT_OK(DecodeNullInfo(&data_block, &num_rows_in_block, &(prep_block->rle_bitmap))); + prep_block->rle_decoder_ = RleDecoder(prep_block->rle_bitmap.data(), + prep_block->rle_bitmap.size(), 1); + } + + BlockDecoder *bd; + RETURN_NOT_OK(reader_->type_encoding_info()->CreateBlockDecoder(&bd, data_block, this)); + prep_block->dblk_.reset(bd); + RETURN_NOT_OK(prep_block->dblk_->ParseHeader()); + + // For nullable blocks, we filled in the row count from the null information above, + // since the data block decoder only knows about the non-null values. + // For non-nullable ones, we use the information from the block decoder. 
+ if (!reader_->is_nullable()) { + num_rows_in_block = bd->Count(); + } + + io_stats_.cells_read_from_disk += num_rows_in_block; + io_stats_.data_blocks_read_from_disk++; + io_stats_.bytes_read_from_disk += data_block.size(); + + prep_block->idx_in_block_ = 0; + prep_block->num_rows_in_block_ = num_rows_in_block; + prep_block->needs_rewind_ = false; + prep_block->rewind_idx_ = 0; + + DVLOG(2) << "Read dblk " << prep_block->ToString(); + return Status::OK(); +} + +Status CFileIterator::QueueCurrentDataBlock(const IndexTreeIterator &idx_iter) { + pblock_pool_scoped_ptr b = prepared_block_pool_.make_scoped_ptr( + prepared_block_pool_.Construct()); + RETURN_NOT_OK(ReadCurrentDataBlock(idx_iter, b.get())); + prepared_blocks_.push_back(b.release()); + return Status::OK(); +} + +bool CFileIterator::HasNext() const { + CHECK(seeked_) << "not seeked"; + CHECK(!prepared_) << "Cannot call HasNext() mid-batch"; + + return !prepared_blocks_.empty() || seeked_->HasNext(); +} + +Status CFileIterator::PrepareBatch(size_t *n) { + CHECK(!prepared_) << "Should call FinishBatch() first"; + CHECK(seeked_ != nullptr) << "must be seeked"; + + CHECK(!prepared_blocks_.empty()); + + rowid_t start_idx = last_prepare_idx_; + rowid_t end_idx = start_idx + *n; + + // Read blocks until all blocks covering the requested range are in the + // prepared_blocks_ queue. 
+ while (prepared_blocks_.back()->last_row_idx() < end_idx) { + Status s = seeked_->Next(); + if (PREDICT_FALSE(s.IsNotFound())) { + VLOG(1) << "Reached EOF"; + break; + } else if (!s.ok()) { + return s; + } + RETURN_NOT_OK(QueueCurrentDataBlock(*seeked_)); + } + + // Seek the first block in the queue such that the first value to be read + // corresponds to start_idx + { + PreparedBlock *front = prepared_blocks_.front(); + front->rewind_idx_ = start_idx - front->first_row_idx(); + front->needs_rewind_ = true; + } + + uint32_t size_covered_by_prep_blocks = prepared_blocks_.back()->last_row_idx() - start_idx + 1; + if (PREDICT_FALSE(size_covered_by_prep_blocks < *n)) { + *n = size_covered_by_prep_blocks; + } + + last_prepare_idx_ = start_idx; + last_prepare_count_ = *n; + prepared_ = true; + + if (PREDICT_FALSE(VLOG_IS_ON(1))) { + VLOG(1) << "Prepared for " << (*n) << " rows" + << " (" << start_idx << "-" << (start_idx + *n - 1) << ")"; + for (PreparedBlock *b : prepared_blocks_) { + VLOG(1) << " " << b->ToString(); + } + VLOG(1) << "-------------"; + } + + return Status::OK(); +} + +Status CFileIterator::FinishBatch() { + CHECK(prepared_) << "no batch prepared"; + prepared_ = false; + + DVLOG(1) << "Finishing batch " << last_prepare_idx_ << "-" + << (last_prepare_idx_ + last_prepare_count_ - 1); + + // Release all blocks except for the last one, which may still contain + // relevent data for the next batch. 
+ for (int i = 0; i < prepared_blocks_.size() - 1; i++) { + PreparedBlock *b = prepared_blocks_[i]; + prepared_block_pool_.Destroy(b); + } + + PreparedBlock *back = prepared_blocks_.back(); + DVLOG(1) << "checking last block " << back->ToString() << " vs " + << last_prepare_idx_ << " + " << last_prepare_count_ + << " (" << (last_prepare_idx_ + last_prepare_count_) << ")"; + if (back->last_row_idx() < last_prepare_idx_ + last_prepare_count_) { + // Last block is irrelevant + prepared_block_pool_.Destroy(back); + prepared_blocks_.clear(); + } else { + prepared_blocks_[0] = back; + prepared_blocks_.resize(1); + } + + #ifndef NDEBUG + if (VLOG_IS_ON(1)) { + VLOG(1) << "Left around following blocks:"; + for (PreparedBlock *b : prepared_blocks_) { + VLOG(1) << " " << b->ToString(); + } + VLOG(1) << "-------------"; + } + #endif + + last_prepare_idx_ += last_prepare_count_; + last_prepare_count_ = 0; + return Status::OK(); +} + + +Status CFileIterator::Scan(ColumnBlock *dst) { + CHECK(seeked_) << "not seeked"; + + // Use a column data view to been able to advance it as we read into it. + ColumnDataView remaining_dst(dst); + + uint32_t rem = last_prepare_count_; + DCHECK_LE(rem, dst->nrows()); + + for (PreparedBlock *pb : prepared_blocks_) { + if (pb->needs_rewind_) { + // Seek back to the saved position. 
+ SeekToPositionInBlock(pb, pb->rewind_idx_); + // TODO: we could add a mark/reset like interface in BlockDecoder interface + // that might be more efficient (allowing the decoder to save internal state + // instead of having to reconstruct it) + } + + if (reader_->is_nullable()) { + DCHECK(dst->is_nullable()); + + size_t nrows = std::min(rem, pb->num_rows_in_block_ - pb->idx_in_block_); + + // Fill column bitmap + size_t count = nrows; + while (count > 0) { + bool not_null = false; + size_t nblock = pb->rle_decoder_.GetNextRun(¬_null, count); + DCHECK_LE(nblock, count); + if (PREDICT_FALSE(nblock == 0)) { + return Status::Corruption( + Substitute("Unexpected EOF on NULL bitmap read. Expected at least $0 more rows", + count)); + } + + size_t this_batch = nblock; + if (not_null) { + // TODO: Maybe copy all and shift later? + RETURN_NOT_OK(pb->dblk_->CopyNextValues(&this_batch, &remaining_dst)); + DCHECK_EQ(nblock, this_batch); + pb->needs_rewind_ = true; + } else { +#ifndef NDEBUG + kudu::OverwriteWithPattern(reinterpret_cast(remaining_dst.data()), + remaining_dst.stride() * nblock, + "NULLNULLNULLNULLNULL"); +#endif + } + + // Set the ColumnBlock bitmap + remaining_dst.SetNullBits(this_batch, not_null); + + rem -= this_batch; + count -= this_batch; + pb->idx_in_block_ += this_batch; + remaining_dst.Advance(this_batch); + } + } else { + // Fetch as many as we can from the current datablock. + size_t this_batch = rem; + RETURN_NOT_OK(pb->dblk_->CopyNextValues(&this_batch, &remaining_dst)); + pb->needs_rewind_ = true; + DCHECK_LE(this_batch, rem); + + // If the column is nullable, set all bits to true + if (dst->is_nullable()) { + remaining_dst.SetNullBits(this_batch, true); + } + + rem -= this_batch; + pb->idx_in_block_ += this_batch; + remaining_dst.Advance(this_batch); + } + + // If we didn't fetch as many as requested, then it should + // be because the current data block ran out. 
+ if (rem > 0) { + DCHECK_EQ(pb->dblk_->Count(), pb->dblk_->GetCurrentIndex()) << + "dblk stopped yielding values before it was empty."; + } else { + break; + } + } + + DCHECK_EQ(rem, 0) << "Should have fetched exactly the number of prepared rows"; + return Status::OK(); +} + +Status CFileIterator::CopyNextValues(size_t *n, ColumnBlock *cb) { + RETURN_NOT_OK(PrepareBatch(n)); + RETURN_NOT_OK(Scan(cb)); + RETURN_NOT_OK(FinishBatch()); + return Status::OK(); +} + + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/cfile_reader.h b/src/kudu/cfile/cfile_reader.h new file mode 100644 index 000000000000..f1db14559041 --- /dev/null +++ b/src/kudu/cfile/cfile_reader.h @@ -0,0 +1,476 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_CFILE_CFILE_READER_H +#define KUDU_CFILE_CFILE_READER_H + +#include +#include + +#include "kudu/common/columnblock.h" +#include "kudu/common/types.h" +#include "kudu/cfile/block_cache.h" +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/block_handle.h" +#include "kudu/cfile/block_compression.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/cfile/type_encodings.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/block_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/object_pool.h" +#include "kudu/util/once.h" +#include "kudu/util/rle-encoding.h" +#include "kudu/util/status.h" +#include "kudu/common/iterator_stats.h" +#include "kudu/common/key_encoder.h" + +namespace kudu { +namespace cfile { + +class BlockCache; +class BlockDecoder; +class BlockPointer; +class CFileHeaderPB; +class CFileFooterPB; +class CFileIterator; +class BinaryPlainBlockDecoder; + +class CFileReader { + public: + // Fully open a cfile using a previously opened block. + // + // After this call, the reader is safe for use. + static Status Open(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr* reader); + + // Lazily open a cfile using a previously opened block. A lazy open does + // not incur additional I/O, nor does it validate the contents of the + // cfile. + // + // Init() must be called before using most methods. Exceptions include + // NewIterator() and file_size(). + static Status OpenNoInit(gscoped_ptr block, + const ReaderOptions& options, + gscoped_ptr* reader); + + // Fully opens a previously lazily opened cfile, parsing and validating + // its contents. + // + // May be called multiple times; subsequent calls will no-op. 
+ Status Init(); + + enum CacheControl { + CACHE_BLOCK, + DONT_CACHE_BLOCK + }; + + Status NewIterator(CFileIterator **iter, CacheControl cache_control); + Status NewIterator(gscoped_ptr *iter, + CacheControl cache_control) { + CFileIterator *iter_ptr; + RETURN_NOT_OK(NewIterator(&iter_ptr, cache_control)); + (*iter).reset(iter_ptr); + return Status::OK(); + } + + // TODO: make this private? should only be used + // by the iterator and index tree readers, I think. + Status ReadBlock(const BlockPointer &ptr, CacheControl cache_control, + BlockHandle *ret) const; + + // Return the number of rows in this cfile. + // This is assumed to be reasonably fast (i.e does not scan + // the data) + Status CountRows(rowid_t *count) const; + + // Retrieve the given metadata entry into 'val'. + // Returns true if the entry was found, otherwise returns false. + // + // Note that this implementation is currently O(n), so should not be used + // in a hot path. + bool GetMetadataEntry(const string &key, string *val); + + // Can be called before Init(). + uint64_t file_size() const { + return file_size_; + } + + const TypeInfo *type_info() const { + DCHECK(init_once_.initted()); + return type_info_; + } + + const TypeEncodingInfo *type_encoding_info() const { + DCHECK(init_once_.initted()); + return type_encoding_info_; + } + + bool is_nullable() const { + return footer().is_type_nullable(); + } + + const CFileHeaderPB &header() const { + DCHECK(init_once_.initted()); + return *DCHECK_NOTNULL(header_.get()); + } + + const CFileFooterPB &footer() const { + DCHECK(init_once_.initted()); + return *DCHECK_NOTNULL(footer_.get()); + } + + bool is_compressed() const { + return footer().compression() != NO_COMPRESSION; + } + + // Advanced access to the cfile. This is used by the + // delta reader code. TODO: think about reorganizing this: + // delta files can probably be done more cleanly. + + // Return true if there is a position-based index on this file. 
+ bool has_posidx() const { return footer().has_posidx_info(); } + BlockPointer posidx_root() const { + DCHECK(has_posidx()); + return BlockPointer(footer().posidx_info().root_block()); + } + + // Return true if there is a value-based index on this file. + bool has_validx() const { return footer().has_validx_info(); } + BlockPointer validx_root() const { + DCHECK(has_validx()); + return BlockPointer(footer().validx_info().root_block()); + } + + std::string ToString() const { return block_->id().ToString(); } + + private: + DISALLOW_COPY_AND_ASSIGN(CFileReader); + + CFileReader(const ReaderOptions &options, + const uint64_t file_size, + gscoped_ptr block); + + // Callback used in 'init_once_' to initialize this cfile. + Status InitOnce(); + + Status ReadMagicAndLength(uint64_t offset, uint32_t *len); + Status ReadAndParseHeader(); + Status ReadAndParseFooter(); + + // Returns the memory usage of the object including the object itself. + size_t memory_footprint() const; + +#ifdef __clang__ + __attribute__((__unused__)) +#endif + const gscoped_ptr block_; + const uint64_t file_size_; + + gscoped_ptr header_; + gscoped_ptr footer_; + + gscoped_ptr block_uncompressor_; + + const TypeInfo *type_info_; + const TypeEncodingInfo *type_encoding_info_; + + KuduOnceDynamic init_once_; + + ScopedTrackedConsumption mem_consumption_; +}; + +// Column Iterator interface used by the CFileSet. +// Implemented by the CFileIterator, DefaultColumnValueIterator +// and the ColumnValueTypeAdaptorIterator. +// It is used to fill the data requested by the projection. +class ColumnIterator { + public: + virtual ~ColumnIterator() {} + + // Seek to the given ordinal entry in the file. + // Entry 0 is the first entry written to the file. + // If provided seek point is past the end of the file, + // then returns a NotFound Status. + // TODO: do we ever want to be able to seek to the end of the file? 
+ virtual Status SeekToOrdinal(rowid_t ord_idx) = 0; + + // Return true if this reader is currently seeked. + // If the iterator is not seeked, it is an error to call any functions except + // for seek (including GetCurrentOrdinal). + virtual bool seeked() const = 0; + + // Get the ordinal index that the iterator is currently pointed to. + // + // Prior to calling PrepareBatch(), this returns the position after the last + // seek. PrepareBatch() and Scan() do not change the position returned by this + // function. FinishBatch() advances the ordinal to the position of the next + // block to be prepared. + virtual rowid_t GetCurrentOrdinal() const = 0; + + // Prepare to read up to *n into the given column block. + // On return sets *n to the number of prepared rows, which is always + // <= the requested value. + // + // This assumes that dst->size() >= *n on input. + // + // If there are at least dst->size() values remaining in the underlying file, + // this will always return *n == dst->size(). In other words, this does not + // ever result in a "short read". + virtual Status PrepareBatch(size_t *n) = 0; + + // Copy values into the prepared column block. + // Any indirected values (eg strings) are copied into the dst block's + // arena. + // This does _not_ advance the position in the underlying file. Multiple + // calls to Scan() will re-read the same values. + virtual Status Scan(ColumnBlock *dst) = 0; + + // Finish processing the current batch, advancing the iterators + // such that the next call to PrepareBatch() will start where the previous + // batch left off. + virtual Status FinishBatch() = 0; + + virtual const IteratorStats& io_statistics() const = 0; +}; + +// ColumnIterator that fills the ColumnBlock with the specified value. +// It is used by the CFileSet to handle the case of a column present +// in the projection schema but not in the base data. 
+// +// Example: +// DefaultColumnValueIterator iter; +// iter.Scan(&column_block); +class DefaultColumnValueIterator : public ColumnIterator { + public: + DefaultColumnValueIterator(const TypeInfo* typeinfo, const void *value) + : typeinfo_(typeinfo), value_(value), ordinal_(0) { + } + + Status SeekToOrdinal(rowid_t ord_idx) OVERRIDE; + + bool seeked() const OVERRIDE { return true; } + + rowid_t GetCurrentOrdinal() const OVERRIDE { return ordinal_; } + + Status PrepareBatch(size_t *n) OVERRIDE; + Status Scan(ColumnBlock *dst) OVERRIDE; + Status FinishBatch() OVERRIDE; + + const IteratorStats& io_statistics() const OVERRIDE { return io_stats_; } + + private: + const TypeInfo* typeinfo_; + const void *value_; + + size_t batch_; + rowid_t ordinal_; + IteratorStats io_stats_; +}; + + +class CFileIterator : public ColumnIterator { + public: + CFileIterator(CFileReader* reader, + CFileReader::CacheControl cache_control); + ~CFileIterator(); + + // Seek to the first entry in the file. This works for both + // ordinal-indexed and value-indexed files. + Status SeekToFirst(); + + // Seek to the given ordinal entry in the file. + // Entry 0 is the first entry written to the file. + // If provided seek point is past the end of the file, + // then returns a NotFound Status. + // TODO: do we ever want to be able to seek to the end of the file? + Status SeekToOrdinal(rowid_t ord_idx) OVERRIDE; + + // Seek the index to the given row_key, or to the index entry immediately + // before it. Then (if the index is sparse) seek the data block to the + // value matching value or to the value immediately after it. + // + // Sets *exact_match to indicate whether the seek found the exact + // key requested. + // + // If this iterator was constructed without no value index, + // then this will return a NotSupported status. + Status SeekAtOrAfter(const EncodedKey &encoded_key, + bool *exact_match); + + // Return true if this reader is currently seeked. 
+ // If the iterator is not seeked, it is an error to call any functions except + // for seek (including GetCurrentOrdinal). + bool seeked() const OVERRIDE { return seeked_; } + + // Get the ordinal index that the iterator is currently pointed to. + // + // Prior to calling PrepareBatch(), this returns the position after the last + // seek. PrepareBatch() and Scan() do not change the position returned by this + // function. FinishBatch() advances the ordinal to the position of the next + // block to be prepared. + rowid_t GetCurrentOrdinal() const OVERRIDE; + + // Prepare to read up to *n into the given column block. + // On return sets *n to the number of prepared rows, which is always + // <= the requested value. + // + // This assumes that dst->size() >= *n on input. + // + // If there are at least dst->size() values remaining in the underlying file, + // this will always return *n == dst->size(). In other words, this does not + // ever result in a "short read". + Status PrepareBatch(size_t *n) OVERRIDE; + + // Copy values into the prepared column block. + // Any indirected values (eg strings) are copied into the dst block's + // arena. + // This does _not_ advance the position in the underlying file. Multiple + // calls to Scan() will re-read the same values. + Status Scan(ColumnBlock *dst) OVERRIDE; + + // Finish processing the current batch, advancing the iterators + // such that the next call to PrepareBatch() will start where the previous + // batch left off. + Status FinishBatch() OVERRIDE; + + // Return true if the next call to PrepareBatch will return at least one row. + bool HasNext() const; + + // Convenience method to prepare a batch, scan it, and finish it. + Status CopyNextValues(size_t *n, ColumnBlock *dst); + + const IteratorStats &io_statistics() const OVERRIDE { + return io_stats_; + } + + // It the column is dictionary-coded, returns the decoder + // for the cfile's dictionary block. This is called by the + // StringDictBlockDecoder. 
+ BinaryPlainBlockDecoder* GetDictDecoder() { return dict_decoder_.get();} + + private: + DISALLOW_COPY_AND_ASSIGN(CFileIterator); + + struct PreparedBlock { + BlockPointer dblk_ptr_; + BlockHandle dblk_data_; + gscoped_ptr dblk_; + + // The rowid of the first row in this block. + rowid_t first_row_idx() const { + return dblk_->GetFirstRowId(); + } + + // The index of the seeked position, relative to the start of the block. + // In case of null bitmap present, dblk_->GetCurrentIndex() is not aligned + // with the row number, since null values are not written to the data block. + // check CFileIterator::SeekToPositionInBlock() + uint32_t idx_in_block_; + + // When the block is first read, it is seeked to the proper position + // and rewind_idx_ is set to that offset in the block. needs_rewind_ + // is initially false, but after any values are read from the block, + // it becomes true. This indicates that dblk_ is pointed at a later + // position in the block, and should be rewound if a second call to + // Scan() is made. + // rewind_idx is relative to the first entry in the block (i.e. not a rowid) + bool needs_rewind_; + uint32_t rewind_idx_; + + // Total number of rows in the block (nulls + not nulls) + uint32_t num_rows_in_block_; + + // Null bitmap and bitmap (RLE) decoder + Slice rle_bitmap; + RleDecoder rle_decoder_; + + rowid_t last_row_idx() const { + return first_row_idx() + num_rows_in_block_ - 1; + } + + string ToString() const; + }; + + // Seek the given PreparedBlock to the given index within it. + void SeekToPositionInBlock(PreparedBlock *pb, uint32_t idx_in_block); + + // Read the data block currently pointed to by idx_iter_ + // into the given PreparedBlock structure. + // + // This does not advance the iterator. + Status ReadCurrentDataBlock(const IndexTreeIterator &idx_iter, + PreparedBlock *prep_block); + + // Read the data block currently pointed to by idx_iter_, and enqueue + // it onto the end of the prepared_blocks_ deque. 
+ Status QueueCurrentDataBlock(const IndexTreeIterator &idx_iter); + + // Fully initialize the underlying cfile reader if needed, and clear any + // seek-related state. + Status PrepareForNewSeek(); + + CFileReader* reader_; + + gscoped_ptr posidx_iter_; + gscoped_ptr validx_iter_; + + // Decoder for the dictionary block + gscoped_ptr dict_decoder_; + BlockHandle dict_block_handle_; + + // The currently in-use index iterator. This is equal to either + // posidx_iter_.get(), validx_iter_.get(), or NULL if not seeked. + IndexTreeIterator *seeked_; + + // Data blocks that contain data relevant to the currently Prepared + // batch of rows. + // These pointers are allocated from the prepared_block_pool_ below. + vector prepared_blocks_; + + ObjectPool prepared_block_pool_; + typedef ObjectPool::scoped_ptr pblock_pool_scoped_ptr; + + // True if PrepareBatch() has been called more recently than FinishBatch(). + bool prepared_; + + // Whether this iterator will ask the cfile to cache the blocks it requests or not. + const CFileReader::CacheControl cache_control_; + + // RowID of the current prepared batch, if prepared_ is true. + // Otherwise, the RowID of the next batch that will be prepared. + rowid_t last_prepare_idx_; + + // Number of rows in the current batch, if prepared_ is true. + // Otherwise, 0. + uint32_t last_prepare_count_; + + IteratorStats io_stats_; + + // a temporary buffer for encoding + faststring tmp_buf_; +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/cfile_util.cc b/src/kudu/cfile/cfile_util.cc new file mode 100644 index 000000000000..ef9ca3032be3 --- /dev/null +++ b/src/kudu/cfile/cfile_util.cc @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/cfile/cfile_util.h" + +#include +#include +#include + +#include "kudu/cfile/cfile_reader.h" +#include "kudu/util/env.h" +#include "kudu/util/mem_tracker.h" + +namespace kudu { +namespace cfile { + +using std::string; + +static const int kBufSize = 1024*1024; + +Status DumpIterator(const CFileReader& reader, + CFileIterator* it, + std::ostream* out, + const DumpIteratorOptions& opts, + int indent) { + + Arena arena(8192, 8*1024*1024); + uint8_t buf[kBufSize]; + const TypeInfo *type = reader.type_info(); + size_t max_rows = kBufSize/type->size(); + uint8_t nulls[BitmapSize(max_rows)]; + ColumnBlock cb(type, reader.is_nullable() ? nulls : nullptr, buf, max_rows, &arena); + + string strbuf; + size_t count = 0; + while (it->HasNext()) { + size_t n = opts.nrows == 0 ? 
max_rows : std::min(max_rows, opts.nrows - count); + if (n == 0) break; + + RETURN_NOT_OK(it->CopyNextValues(&n, &cb)); + + if (opts.print_rows) { + if (reader.is_nullable()) { + for (size_t i = 0; i < n; i++) { + strbuf.append(indent, ' '); + const void *ptr = cb.nullable_cell_ptr(i); + if (ptr != nullptr) { + type->AppendDebugStringForValue(ptr, &strbuf); + } else { + strbuf.append("NULL"); + } + strbuf.push_back('\n'); + } + } else { + for (size_t i = 0; i < n; i++) { + strbuf.append(indent, ' '); + type->AppendDebugStringForValue(cb.cell_ptr(i), &strbuf); + strbuf.push_back('\n'); + } + } + *out << strbuf; + strbuf.clear(); + } + arena.Reset(); + count += n; + } + + VLOG(1) << "Dumped " << count << " rows"; + + return Status::OK(); +} + +ReaderOptions::ReaderOptions() + : parent_mem_tracker(MemTracker::GetRootTracker()) { +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/cfile_util.h b/src/kudu/cfile/cfile_util.h new file mode 100644 index 000000000000..cf647a6071e0 --- /dev/null +++ b/src/kudu/cfile/cfile_util.h @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef CFILE_UTIL_H_ +#define CFILE_UTIL_H_ + +#include +#include + +#include "kudu/cfile/cfile.pb.h" + +#include "kudu/common/schema.h" +#include "kudu/common/row.h" +#include "kudu/common/scan_predicate.h" +#include "kudu/common/encoded_key.h" +#include "kudu/util/bloom_filter.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace cfile { + +class CFileReader; +class CFileIterator; + +struct WriterOptions { + // Approximate size of index blocks. + // + // Default: 32KB. + size_t index_block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. Most clients should + // leave this parameter alone. + // + // This is currently only used by StringPrefixBlockBuilder + // + // Default: 16 + int block_restart_interval; + + // Whether the file needs a positional index. + bool write_posidx; + + // Whether the file needs a value index + bool write_validx; + + // Column storage attributes. + // + // Default: all default values as specified in the constructor in + // schema.h + ColumnStorageAttributes storage_attributes; + + WriterOptions(); +}; + +struct ReaderOptions { + ReaderOptions(); + + // The MemTracker that should account for this reader's memory consumption. + // + // Default: the root tracker. + std::shared_ptr parent_mem_tracker; +}; + +struct DumpIteratorOptions { + // If true, print values of rows, otherwise only print aggregate + // information. + bool print_rows; + + // Number of rows to iterate over. If 0, will iterate over all rows. + size_t nrows; + + DumpIteratorOptions() + : print_rows(false), nrows(0) { + } +}; + +// Dumps the contents of a cfile to 'out'; 'reader' and 'iterator' +// must be initialized. See cfile/cfile-dump.cc and tools/fs_tool.cc +// for sample usage. 
+// +// See also: DumpIteratorOptions +Status DumpIterator(const CFileReader& reader, + CFileIterator* it, + std::ostream* out, + const DumpIteratorOptions& opts, + int indent); + +} // namespace cfile +} // namespace kudu + +#endif /* CFILE_UTIL_H_ */ diff --git a/src/kudu/cfile/cfile_writer.cc b/src/kudu/cfile/cfile_writer.cc new file mode 100644 index 000000000000..db71e3ce9318 --- /dev/null +++ b/src/kudu/cfile/cfile_writer.cc @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/cfile/cfile_writer.h" + +#include +#include +#include + +#include "kudu/cfile/block_pointer.h" +#include "kudu/cfile/index_block.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/cfile/type_encodings.h" +#include "kudu/common/key_encoder.h" +#include "kudu/gutil/endian.h" +#include "kudu/util/coding.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/pb_util.h" + +using google::protobuf::RepeatedPtrField; +using kudu::fs::ScopedWritableBlockCloser; +using kudu::fs::WritableBlock; +using std::string; + +DEFINE_int32(cfile_default_block_size, 256*1024, "The default block size to use in cfiles"); +TAG_FLAG(cfile_default_block_size, advanced); + +DEFINE_string(cfile_default_compression_codec, "none", + "Default cfile block compression codec."); +TAG_FLAG(cfile_default_compression_codec, advanced); + +// The default value is optimized for the case where: +// 1. the cfile blocks are colocated with the WALs. +// 2. The underlying hardware is a spinning disk. +// 3. The underlying filesystem is either XFS or EXT4. +// 4. block_coalesce_close is false (see fs/block_manager.cc). +// +// When all conditions hold, this value ensures low latency for WAL writes. +DEFINE_string(cfile_do_on_finish, "close", + "What to do to cfile blocks when writing is finished. 
" + "Possible values are 'close', 'flush', or 'nothing'."); +TAG_FLAG(cfile_do_on_finish, experimental); + +namespace kudu { +namespace cfile { + +const char kMagicString[] = "kuducfil"; + +static const size_t kBlockSizeLimit = 16 * 1024 * 1024; // 16MB +static const size_t kMinBlockSize = 512; + +static CompressionType GetDefaultCompressionCodec() { + return GetCompressionCodecType(FLAGS_cfile_default_compression_codec); +} + +//////////////////////////////////////////////////////////// +// Options +//////////////////////////////////////////////////////////// +WriterOptions::WriterOptions() + : index_block_size(32*1024), + block_restart_interval(16), + write_posidx(false), + write_validx(false) { +} + + +//////////////////////////////////////////////////////////// +// CFileWriter +//////////////////////////////////////////////////////////// + + +CFileWriter::CFileWriter(const WriterOptions &options, + const TypeInfo* typeinfo, + bool is_nullable, + gscoped_ptr block) + : block_(block.Pass()), + off_(0), + value_count_(0), + options_(options), + is_nullable_(is_nullable), + typeinfo_(typeinfo), + key_encoder_(nullptr), + state_(kWriterInitialized) { + EncodingType encoding = options_.storage_attributes.encoding; + Status s = TypeEncodingInfo::Get(typeinfo_, encoding, &type_encoding_info_); + if (!s.ok()) { + // TODO: we should somehow pass some contextual info about the + // tablet here. 
+ WARN_NOT_OK(s, "Falling back to default encoding"); + s = TypeEncodingInfo::Get(typeinfo, + TypeEncodingInfo::GetDefaultEncoding(typeinfo_), + &type_encoding_info_); + CHECK_OK(s); + } + + compression_ = options_.storage_attributes.compression; + if (compression_ == DEFAULT_COMPRESSION) { + compression_ = GetDefaultCompressionCodec(); + } + + if (options_.storage_attributes.cfile_block_size <= 0) { + options_.storage_attributes.cfile_block_size = FLAGS_cfile_default_block_size; + } + if (options_.storage_attributes.cfile_block_size < kMinBlockSize) { + LOG(WARNING) << "Configured block size " << options_.storage_attributes.cfile_block_size + << " smaller than minimum allowed value " << kMinBlockSize + << ": using minimum."; + options_.storage_attributes.cfile_block_size = kMinBlockSize; + } + + if (options.write_posidx) { + posidx_builder_.reset(new IndexTreeBuilder(&options_, + this)); + } + + if (options.write_validx) { + key_encoder_ = &GetKeyEncoder(typeinfo_); + validx_builder_.reset(new IndexTreeBuilder(&options_, + this)); + } +} + +CFileWriter::~CFileWriter() { +} + +Status CFileWriter::Start() { + TRACE_EVENT0("cfile", "CFileWriter::Start"); + CHECK(state_ == kWriterInitialized) << + "bad state for Start(): " << state_; + + if (compression_ != NO_COMPRESSION) { + const CompressionCodec* codec; + RETURN_NOT_OK(GetCompressionCodec(compression_, &codec)); + block_compressor_ .reset(new CompressedBlockBuilder(codec, kBlockSizeLimit)); + } + + CFileHeaderPB header; + header.set_major_version(kCFileMajorVersion); + header.set_minor_version(kCFileMinorVersion); + FlushMetadataToPB(header.mutable_metadata()); + + uint32_t pb_size = header.ByteSize(); + + faststring buf; + // First the magic. + buf.append(kMagicString); + // Then Length-prefixed header. 
+ PutFixed32(&buf, pb_size); + if (!pb_util::AppendToString(header, &buf)) { + return Status::Corruption("unable to encode header"); + } + + RETURN_NOT_OK_PREPEND(block_->Append(Slice(buf)), "Couldn't write header"); + off_ += buf.size(); + + BlockBuilder *bb; + RETURN_NOT_OK(type_encoding_info_->CreateBlockBuilder(&bb, &options_)); + data_block_.reset(bb); + + if (is_nullable_) { + size_t nrows = ((options_.storage_attributes.cfile_block_size + typeinfo_->size() - 1) / + typeinfo_->size()); + null_bitmap_builder_.reset(new NullBitmapBuilder(nrows * 8)); + } + + state_ = kWriterWriting; + + return Status::OK(); +} + +Status CFileWriter::Finish() { + TRACE_EVENT0("cfile", "CFileWriter::Finish"); + ScopedWritableBlockCloser closer; + RETURN_NOT_OK(FinishAndReleaseBlock(&closer)); + return closer.CloseBlocks(); +} + +Status CFileWriter::FinishAndReleaseBlock(ScopedWritableBlockCloser* closer) { + TRACE_EVENT0("cfile", "CFileWriter::FinishAndReleaseBlock"); + CHECK(state_ == kWriterWriting) << + "Bad state for Finish(): " << state_; + + // Write out any pending values as the last data block. + RETURN_NOT_OK(FinishCurDataBlock()); + + state_ = kWriterFinished; + + // Start preparing the footer. + CFileFooterPB footer; + footer.set_data_type(typeinfo_->type()); + footer.set_is_type_nullable(is_nullable_); + footer.set_encoding(type_encoding_info_->encoding_type()); + footer.set_num_values(value_count_); + footer.set_compression(compression_); + + // Write out any pending positional index blocks. 
+ if (options_.write_posidx) { + BTreeInfoPB posidx_info; + RETURN_NOT_OK_PREPEND(posidx_builder_->Finish(&posidx_info), + "Couldn't write positional index"); + footer.mutable_posidx_info()->CopyFrom(posidx_info); + } + + if (options_.write_validx) { + BTreeInfoPB validx_info; + RETURN_NOT_OK_PREPEND(validx_builder_->Finish(&validx_info), "Couldn't write value index"); + footer.mutable_validx_info()->CopyFrom(validx_info); + } + + // Optionally append extra information to the end of cfile. + // Example: dictionary block for dictionary encoding + RETURN_NOT_OK(data_block_->AppendExtraInfo(this, &footer)); + + // Flush metadata. + FlushMetadataToPB(footer.mutable_metadata()); + + faststring footer_str; + if (!pb_util::SerializeToString(footer, &footer_str)) { + return Status::Corruption("unable to serialize footer"); + } + + footer_str.append(kMagicString); + PutFixed32(&footer_str, footer.GetCachedSize()); + + RETURN_NOT_OK(block_->Append(footer_str)); + + // Done with this block. + if (FLAGS_cfile_do_on_finish == "flush") { + RETURN_NOT_OK(block_->FlushDataAsync()); + closer->AddBlock(block_.Pass()); + } else if (FLAGS_cfile_do_on_finish == "close") { + RETURN_NOT_OK(block_->Close()); + } else if (FLAGS_cfile_do_on_finish == "nothing") { + closer->AddBlock(block_.Pass()); + } else { + LOG(FATAL) << "Unknown value for cfile_do_on_finish: " + << FLAGS_cfile_do_on_finish; + } + return Status::OK(); +} + +void CFileWriter::AddMetadataPair(const Slice &key, const Slice &value) { + CHECK_NE(state_, kWriterFinished); + + unflushed_metadata_.push_back(make_pair(key.ToString(), value.ToString())); +} + +string CFileWriter::GetMetaValueOrDie(Slice key) const { + typedef pair ss_pair; + for (const ss_pair& entry : unflushed_metadata_) { + if (Slice(entry.first) == key) { + return entry.second; + } + } + LOG(FATAL) << "Missing metadata entry: " << key.ToDebugString(); +} + +void CFileWriter::FlushMetadataToPB(RepeatedPtrField *field) { + typedef pair ss_pair; + for (const 
ss_pair &entry : unflushed_metadata_) { + FileMetadataPairPB *pb = field->Add(); + pb->set_key(entry.first); + pb->set_value(entry.second); + } + unflushed_metadata_.clear(); +} + +Status CFileWriter::AppendEntries(const void *entries, size_t count) { + DCHECK(!is_nullable_); + + int rem = count; + + const uint8_t *ptr = reinterpret_cast(entries); + + while (rem > 0) { + int n = data_block_->Add(ptr, rem); + DCHECK_GE(n, 0); + + ptr += typeinfo_->size() * n; + rem -= n; + value_count_ += n; + + if (data_block_->IsBlockFull(options_.storage_attributes.cfile_block_size)) { + RETURN_NOT_OK(FinishCurDataBlock()); + } + } + + DCHECK_EQ(rem, 0); + return Status::OK(); +} + +Status CFileWriter::AppendNullableEntries(const uint8_t *bitmap, + const void *entries, + size_t count) { + DCHECK(is_nullable_ && bitmap != nullptr); + + const uint8_t *ptr = reinterpret_cast(entries); + + size_t nblock; + bool not_null = false; + BitmapIterator bmap_iter(bitmap, count); + while ((nblock = bmap_iter.Next(¬_null)) > 0) { + if (not_null) { + size_t rem = nblock; + do { + int n = data_block_->Add(ptr, rem); + DCHECK_GE(n, 0); + + null_bitmap_builder_->AddRun(true, n); + ptr += n * typeinfo_->size(); + value_count_ += n; + rem -= n; + + if (data_block_->IsBlockFull(options_.storage_attributes.cfile_block_size)) { + RETURN_NOT_OK(FinishCurDataBlock()); + } + + } while (rem > 0); + } else { + null_bitmap_builder_->AddRun(false, nblock); + ptr += nblock * typeinfo_->size(); + value_count_ += nblock; + } + } + + return Status::OK(); +} + +Status CFileWriter::FinishCurDataBlock() { + uint32_t num_elems_in_block = data_block_->Count(); + if (is_nullable_) { + num_elems_in_block = null_bitmap_builder_->nitems(); + } + + if (PREDICT_FALSE(num_elems_in_block == 0)) { + return Status::OK(); + } + + rowid_t first_elem_ord = value_count_ - num_elems_in_block; + VLOG(1) << "Appending data block for values " << + first_elem_ord << "-" << (first_elem_ord + num_elems_in_block); + + // The current data 
block is full, need to push it + // into the file, and add to index + Slice data = data_block_->Finish(first_elem_ord); + VLOG(2) << " actual size=" << data.size(); + + uint8_t key_tmp_space[typeinfo_->size()]; + + if (validx_builder_ != nullptr) { + // If we're building an index, we need to copy the first + // key from the block locally, so we can write it into that index. + RETURN_NOT_OK(data_block_->GetFirstKey(key_tmp_space)); + VLOG(1) << "Appending validx entry\n" << + kudu::HexDump(Slice(key_tmp_space, typeinfo_->size())); + } + + vector v; + faststring null_headers; + if (is_nullable_) { + Slice null_bitmap = null_bitmap_builder_->Finish(); + PutVarint32(&null_headers, num_elems_in_block); + PutVarint32(&null_headers, null_bitmap.size()); + v.push_back(Slice(null_headers.data(), null_headers.size())); + v.push_back(null_bitmap); + } + v.push_back(data); + Status s = AppendRawBlock(v, first_elem_ord, + reinterpret_cast(key_tmp_space), + "data block"); + + if (is_nullable_) { + null_bitmap_builder_->Reset(); + } + data_block_->Reset(); + + return s; +} + +Status CFileWriter::AppendRawBlock(const vector &data_slices, + size_t ordinal_pos, + const void *validx_key, + const char *name_for_log) { + CHECK_EQ(state_, kWriterWriting); + + BlockPointer ptr; + Status s = AddBlock(data_slices, &ptr, name_for_log); + if (!s.ok()) { + LOG(WARNING) << "Unable to append block to file: " << s.ToString(); + return s; + } + + // Now add to the index blocks + if (posidx_builder_ != nullptr) { + tmp_buf_.clear(); + KeyEncoderTraits::Encode(ordinal_pos, &tmp_buf_); + RETURN_NOT_OK(posidx_builder_->Append(Slice(tmp_buf_), ptr)); + } + + if (validx_builder_ != nullptr) { + CHECK(validx_key != nullptr) << + "must pass a key for raw block if validx is configured"; + VLOG(1) << "Appending validx entry\n" << + kudu::HexDump(Slice(reinterpret_cast(validx_key), + typeinfo_->size())); + key_encoder_->ResetAndEncode(validx_key, &tmp_buf_); + s = validx_builder_->Append(Slice(tmp_buf_), 
ptr); + if (!s.ok()) { + LOG(WARNING) << "Unable to append to value index: " << s.ToString(); + return s; + } + } + + return s; +} + +size_t CFileWriter::written_size() const { + // This is a low estimate, but that's OK -- this is checked after every block + // write during flush/compact, so better to give a fast slightly-inaccurate result + // than spend a lot of effort trying to improve accuracy by a few KB. + return off_; +} + +Status CFileWriter::AddBlock(const vector &data_slices, + BlockPointer *block_ptr, + const char *name_for_log) { + uint64_t start_offset = off_; + + if (block_compressor_ != nullptr) { + // Write compressed block + Slice cdata; + Status s = block_compressor_->Compress(data_slices, &cdata); + if (!s.ok()) { + LOG(WARNING) << "Unable to compress slice of size " + << cdata.size() << " at offset " << off_ + << ": " << s.ToString(); + return(s); + } + + RETURN_NOT_OK(WriteRawData(cdata)); + } else { + // Write uncompressed block + for (const Slice &data : data_slices) { + RETURN_NOT_OK(WriteRawData(data)); + } + } + + uint64_t total_size = off_ - start_offset; + + *block_ptr = BlockPointer(start_offset, total_size); + VLOG(1) << "Appended " << name_for_log + << " with " << total_size << " bytes at " << start_offset; + return Status::OK(); +} + +Status CFileWriter::WriteRawData(const Slice& data) { + Status s = block_->Append(data); + if (!s.ok()) { + LOG(WARNING) << "Unable to append slice of size " + << data.size() << " at offset " << off_ + << ": " << s.ToString(); + } + off_ += data.size(); + return s; +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/cfile_writer.h b/src/kudu/cfile/cfile_writer.h new file mode 100644 index 000000000000..a16b66ac1e7c --- /dev/null +++ b/src/kudu/cfile/cfile_writer.h @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_CFILE_WRITER_H +#define KUDU_CFILE_CFILE_WRITER_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/block_compression.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/cfile/type_encodings.h" +#include "kudu/common/key_encoder.h" +#include "kudu/common/types.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/block_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/util/env.h" +#include "kudu/util/rle-encoding.h" +#include "kudu/util/status.h" + +namespace kudu { +class Arena; + +namespace cfile { +using std::unordered_map; + +class BlockPointer; +class BTreeInfoPB; +class GVIntBlockBuilder; +class BinaryPrefixBlockBuilder; +class IndexTreeBuilder; + +// Magic used in header/footer +extern const char kMagicString[]; + +const int kCFileMajorVersion = 1; +const int kCFileMinorVersion = 0; + +class NullBitmapBuilder { + public: + explicit NullBitmapBuilder(size_t initial_row_capacity) + : nitems_(0), + bitmap_(BitmapSize(initial_row_capacity)), + rle_encoder_(&bitmap_, 1) { + } + + size_t nitems() const { + return nitems_; + } + + void AddRun(bool 
value, size_t run_length = 1) { + nitems_ += run_length; + rle_encoder_.Put(value, run_length); + } + + // the returned Slice is only valid until this Builder is destroyed or Reset + Slice Finish() { + int len = rle_encoder_.Flush(); + return Slice(bitmap_.data(), len); + } + + void Reset() { + nitems_ = 0; + rle_encoder_.Clear(); + } + + private: + size_t nitems_; + faststring bitmap_; + RleEncoder rle_encoder_; +}; + +// Main class used to write a CFile. +class CFileWriter { + public: + explicit CFileWriter(const WriterOptions &options, + const TypeInfo* typeinfo, + bool is_nullable, + gscoped_ptr block); + ~CFileWriter(); + + Status Start(); + + // Close the CFile and close the underlying writable block. + Status Finish(); + + // Close the CFile and release the underlying writable block to 'closer'. + Status FinishAndReleaseBlock(fs::ScopedWritableBlockCloser* closer); + + bool finished() { + return state_ == kWriterFinished; + } + + // Add a key-value pair of metadata to the file. Keys should be human-readable, + // values may be arbitrary binary. + // + // If this is called prior to Start(), then the metadata pairs will be added in + // the header. Otherwise, the pairs will be added in the footer during Finish(). + void AddMetadataPair(const Slice &key, const Slice &value); + + // Return the metadata value associated with the given key. + // + // If no such metadata has been added yet, logs a FATAL error. + std::string GetMetaValueOrDie(Slice key) const; + + // Append a set of values to the file. + Status AppendEntries(const void *entries, size_t count); + + // Append a set of values to the file with the relative null bitmap. + // "entries" is not "compact" - ie if you're appending 10 rows, and 9 are NULL, + // 'entries' still will have 10 elements in it + Status AppendNullableEntries(const uint8_t *bitmap, const void *entries, size_t count); + + // Append a raw block to the file, adding it to the various indexes. 
+ // + // The Slices in 'data_slices' are concatenated to form the block. + // + // validx_key may be NULL if this file writer has not been configured with + // value indexing. + Status AppendRawBlock(const vector &data_slices, + size_t ordinal_pos, + const void *validx_key, + const char *name_for_log); + + + // Return the amount of data written so far to this CFile. + // More data may be written by Finish(), but this is an approximation. + size_t written_size() const; + + std::string ToString() const { return block_->id().ToString(); } + + // Wrapper for AddBlock() to append the dictionary block to the end of a Cfile. + Status AppendDictBlock(const vector &data_slices, BlockPointer *block_ptr, + const char *name_for_log) { + return AddBlock(data_slices, block_ptr, name_for_log); + } + + private: + DISALLOW_COPY_AND_ASSIGN(CFileWriter); + + friend class IndexTreeBuilder; + + // Append the given block into the file. + // + // Sets *block_ptr to correspond to the newly inserted block. + Status AddBlock(const vector &data_slices, + BlockPointer *block_ptr, + const char *name_for_log); + + Status WriteRawData(const Slice& data); + + Status FinishCurDataBlock(); + + // Flush the current unflushed_metadata_ entries into the given protobuf + // field, clearing the buffer. + void FlushMetadataToPB(google::protobuf::RepeatedPtrField *field); + + // Block being written. + gscoped_ptr block_; + + // Current file offset. + uint64_t off_; + + // Current number of values that have been appended. + rowid_t value_count_; + + WriterOptions options_; + + // Type of data being written + bool is_nullable_; + CompressionType compression_; + const TypeInfo* typeinfo_; + const TypeEncodingInfo* type_encoding_info_; + + // The key-encoder. Only set if the writer is writing an embedded + // value index. + const KeyEncoder* key_encoder_; + + // a temporary buffer for encoding + faststring tmp_buf_; + + // Metadata which has been added to the writer but not yet flushed. 
+ vector > unflushed_metadata_; + + gscoped_ptr data_block_; + gscoped_ptr posidx_builder_; + gscoped_ptr validx_builder_; + gscoped_ptr null_bitmap_builder_; + gscoped_ptr block_compressor_; + + enum State { + kWriterInitialized, + kWriterWriting, + kWriterFinished + }; + State state_; +}; + + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/compression-test.cc b/src/kudu/cfile/compression-test.cc new file mode 100644 index 000000000000..a03275f77933 --- /dev/null +++ b/src/kudu/cfile/compression-test.cc @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include +#include +#include +#include + +#include "kudu/cfile/cfile-test-base.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/compression_codec.h" +#include "kudu/cfile/index_block.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace cfile { + +static void TestCompressionCodec(CompressionType compression) { + const int kInputSize = 64; + + const CompressionCodec* codec; + uint8_t ibuffer[kInputSize]; + uint8_t ubuffer[kInputSize]; + size_t compressed; + + // Fill the test input buffer + memset(ibuffer, 'Z', kInputSize); + + // Get the specified compression codec + ASSERT_OK(GetCompressionCodec(compression, &codec)); + + // Allocate the compression buffer + size_t max_compressed = codec->MaxCompressedLength(kInputSize); + ASSERT_LT(max_compressed, (kInputSize * 2)); + gscoped_array cbuffer(new uint8_t[max_compressed]); + + // Compress and uncompress + ASSERT_OK(codec->Compress(Slice(ibuffer, kInputSize), cbuffer.get(), &compressed)); + ASSERT_OK(codec->Uncompress(Slice(cbuffer.get(), compressed), ubuffer, kInputSize)); + ASSERT_EQ(0, memcmp(ibuffer, ubuffer, kInputSize)); + + // Compress slices and uncompress + vector v; + v.push_back(Slice(ibuffer, 1)); + for (int i = 1; i <= kInputSize; i += 7) + v.push_back(Slice(ibuffer + i, 7)); + ASSERT_OK(codec->Compress(Slice(ibuffer, kInputSize), cbuffer.get(), &compressed)); + ASSERT_OK(codec->Uncompress(Slice(cbuffer.get(), compressed), ubuffer, kInputSize)); + ASSERT_EQ(0, memcmp(ibuffer, ubuffer, kInputSize)); +} + +class TestCompression : public CFileTestBase { + protected: + void TestReadWriteCompressed(CompressionType compression) { + const size_t nrows = 10000; + BlockId block_id; + size_t rdrows; + + { + StringDataGenerator string_gen("hello %04d"); + WriteTestFile(&string_gen, PREFIX_ENCODING, compression, 
nrows, + WRITE_VALIDX, &block_id); + + TimeReadFile(fs_manager_.get(), block_id, &rdrows); + ASSERT_EQ(nrows, rdrows); + } + + { + UInt32DataGenerator int_gen; + WriteTestFile(&int_gen, GROUP_VARINT, compression, nrows, + NO_FLAGS, &block_id); + TimeReadFile(fs_manager_.get(), block_id, &rdrows); + ASSERT_EQ(nrows, rdrows); + } + } +}; + +TEST_F(TestCompression, TestNoCompressionCodec) { + const CompressionCodec* codec; + ASSERT_OK(GetCompressionCodec(NO_COMPRESSION, &codec)); + ASSERT_EQ(nullptr, codec); +} + +TEST_F(TestCompression, TestSnappyCompressionCodec) { + TestCompressionCodec(SNAPPY); +} + +TEST_F(TestCompression, TestLz4CompressionCodec) { + TestCompressionCodec(LZ4); +} + +TEST_F(TestCompression, TestZlibCompressionCodec) { + TestCompressionCodec(ZLIB); +} + +TEST_F(TestCompression, TestCFileNoCompressionReadWrite) { + TestReadWriteCompressed(NO_COMPRESSION); +} + +TEST_F(TestCompression, TestCFileSnappyReadWrite) { + TestReadWriteCompressed(SNAPPY); +} + +TEST_F(TestCompression, TestCFileLZ4ReadWrite) { + TestReadWriteCompressed(SNAPPY); +} + +TEST_F(TestCompression, TestCFileZlibReadWrite) { + TestReadWriteCompressed(ZLIB); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/compression_codec.cc b/src/kudu/cfile/compression_codec.cc new file mode 100644 index 000000000000..2594f7976db0 --- /dev/null +++ b/src/kudu/cfile/compression_codec.cc @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/cfile/compression_codec.h" +#include "kudu/gutil/singleton.h" +#include "kudu/gutil/stringprintf.h" + +namespace kudu { +namespace cfile { + +using std::vector; + +CompressionCodec::CompressionCodec() { +} +CompressionCodec::~CompressionCodec() { +} + +class SlicesSource : public snappy::Source { + public: + explicit SlicesSource(const std::vector& slices) + : slice_index_(0), + slice_offset_(0), + slices_(slices) { + available_ = TotalSize(); + } + + size_t Available() const OVERRIDE { + return available_; + } + + const char* Peek(size_t* len) OVERRIDE { + if (available_ == 0) { + *len = 0; + return nullptr; + } + + const Slice& data = slices_[slice_index_]; + *len = data.size() - slice_offset_; + return reinterpret_cast(data.data()) + slice_offset_; + } + + void Skip(size_t n) OVERRIDE { + DCHECK_LE(n, Available()); + if (n == 0) return; + + available_ -= n; + if ((n + slice_offset_) < slices_[slice_index_].size()) { + slice_offset_ += n; + } else { + n -= slices_[slice_index_].size() - slice_offset_; + slice_index_++; + while (n > 0 && n >= slices_[slice_index_].size()) { + n -= slices_[slice_index_].size(); + slice_index_++; + } + slice_offset_ = n; + } + } + + void Dump(faststring *buffer) { + buffer->reserve(buffer->size() + TotalSize()); + for (const Slice& block : slices_) { + buffer->append(block.data(), block.size()); + } + } + + private: + size_t TotalSize(void) const { + size_t size = 0; + for (const Slice& data : slices_) { + size += data.size(); 
+ } + return size; + } + + private: + size_t available_; + size_t slice_index_; + size_t slice_offset_; + const vector& slices_; +}; + +class SnappyCodec : public CompressionCodec { + public: + static SnappyCodec *GetSingleton() { + return Singleton::get(); + } + + Status Compress(const Slice& input, + uint8_t *compressed, size_t *compressed_length) const OVERRIDE { + snappy::RawCompress(reinterpret_cast(input.data()), input.size(), + reinterpret_cast(compressed), compressed_length); + return Status::OK(); + } + + Status Compress(const vector& input_slices, + uint8_t *compressed, size_t *compressed_length) const OVERRIDE { + SlicesSource source(input_slices); + snappy::UncheckedByteArraySink sink(reinterpret_cast(compressed)); + if ((*compressed_length = snappy::Compress(&source, &sink)) <= 0) { + return Status::Corruption("unable to compress the buffer"); + } + return Status::OK(); + } + + Status Uncompress(const Slice& compressed, + uint8_t *uncompressed, + size_t uncompressed_length) const OVERRIDE { + bool success = snappy::RawUncompress(reinterpret_cast(compressed.data()), + compressed.size(), reinterpret_cast(uncompressed)); + return success ? 
Status::OK() : Status::Corruption("unable to uncompress the buffer"); + } + + size_t MaxCompressedLength(size_t source_bytes) const OVERRIDE { + return snappy::MaxCompressedLength(source_bytes); + } +}; + +class Lz4Codec : public CompressionCodec { + public: + static Lz4Codec *GetSingleton() { + return Singleton::get(); + } + + Status Compress(const Slice& input, + uint8_t *compressed, size_t *compressed_length) const OVERRIDE { + int n = LZ4_compress(reinterpret_cast(input.data()), + reinterpret_cast(compressed), input.size()); + *compressed_length = n; + return Status::OK(); + } + + Status Compress(const vector& input_slices, + uint8_t *compressed, size_t *compressed_length) const OVERRIDE { + if (input_slices.size() == 1) { + return Compress(input_slices[0], compressed, compressed_length); + } + + SlicesSource source(input_slices); + faststring buffer; + source.Dump(&buffer); + return Compress(Slice(buffer.data(), buffer.size()), compressed, compressed_length); + } + + Status Uncompress(const Slice& compressed, + uint8_t *uncompressed, + size_t uncompressed_length) const OVERRIDE { + int n = LZ4_decompress_fast(reinterpret_cast(compressed.data()), + reinterpret_cast(uncompressed), uncompressed_length); + if (n != compressed.size()) { + return Status::Corruption( + StringPrintf("unable to uncompress the buffer. error near %d, buffer", -n), + compressed.ToDebugString(100)); + } + return Status::OK(); + } + + size_t MaxCompressedLength(size_t source_bytes) const OVERRIDE { + return LZ4_compressBound(source_bytes); + } +}; + +/** + * TODO: use a instance-local Arena and pass alloc/free into zlib + * so that it allocates from the arena. 
+ */ +class ZlibCodec : public CompressionCodec { + public: + static ZlibCodec *GetSingleton() { + return Singleton::get(); + } + + Status Compress(const Slice& input, + uint8_t *compressed, size_t *compressed_length) const OVERRIDE { + *compressed_length = MaxCompressedLength(input.size()); + int err = ::compress(compressed, compressed_length, input.data(), input.size()); + return err == Z_OK ? Status::OK() : Status::IOError("unable to compress the buffer"); + } + + Status Compress(const vector& input_slices, + uint8_t *compressed, size_t *compressed_length) const OVERRIDE { + if (input_slices.size() == 1) { + return Compress(input_slices[0], compressed, compressed_length); + } + + // TODO: use z_stream + SlicesSource source(input_slices); + faststring buffer; + source.Dump(&buffer); + return Compress(Slice(buffer.data(), buffer.size()), compressed, compressed_length); + } + + Status Uncompress(const Slice& compressed, + uint8_t *uncompressed, size_t uncompressed_length) const OVERRIDE { + int err = ::uncompress(uncompressed, &uncompressed_length, + compressed.data(), compressed.size()); + return err == Z_OK ? 
Status::OK() : Status::Corruption("unable to uncompress the buffer"); + } + + size_t MaxCompressedLength(size_t source_bytes) const OVERRIDE { + // one-time overhead of six bytes for the entire stream plus five bytes per 16 KB block + return source_bytes + (6 + (5 * ((source_bytes + 16383) >> 14))); + } +}; + +Status GetCompressionCodec(CompressionType compression, + const CompressionCodec** codec) { + switch (compression) { + case NO_COMPRESSION: + *codec = nullptr; + break; + case SNAPPY: + *codec = SnappyCodec::GetSingleton(); + break; + case LZ4: + *codec = Lz4Codec::GetSingleton(); + break; + case ZLIB: + *codec = ZlibCodec::GetSingleton(); + break; + default: + return Status::NotFound("bad compression type"); + } + return Status::OK(); +} + +CompressionType GetCompressionCodecType(const std::string& name) { + if (name.compare("snappy") == 0) + return SNAPPY; + if (name.compare("lz4") == 0) + return LZ4; + if (name.compare("zlib") == 0) + return ZLIB; + if (name.compare("none") == 0) + return NO_COMPRESSION; + + LOG(WARNING) << "Unable to recognize the compression codec '" << name + << "' using no compression as default."; + return NO_COMPRESSION; +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/compression_codec.h b/src/kudu/cfile/compression_codec.h new file mode 100644 index 000000000000..8ac84bfedcac --- /dev/null +++ b/src/kudu/cfile/compression_codec.h @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_COMPRESSION_CODEC_H +#define KUDU_CFILE_COMPRESSION_CODEC_H + +#include +#include + +#include "kudu/cfile/cfile.pb.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace cfile { + +class CompressionCodec { + public: + CompressionCodec(); + virtual ~CompressionCodec(); + + // REQUIRES: "compressed" must point to an area of memory that is at + // least "MaxCompressedLength(input_length)" bytes in length. + // + // Takes the data stored in "input[0..input_length]" and stores + // it in the array pointed to by "compressed". + // + // returns the length of the compressed output. + virtual Status Compress(const Slice& input, + uint8_t *compressed, size_t *compressed_length) const = 0; + + virtual Status Compress(const std::vector& input_slices, + uint8_t *compressed, size_t *compressed_length) const = 0; + + // Given data in "compressed[0..compressed_length-1]" generated by + // calling the Compress routine, this routine stores the uncompressed data + // to uncompressed[0..uncompressed_length-1] + // returns false if the message is corrupted and could not be uncompressed + virtual Status Uncompress(const Slice& compressed, + uint8_t *uncompressed, size_t uncompressed_length) const = 0; + + // Returns the maximal size of the compressed representation of + // input data that is "source_bytes" bytes in length. 
+ virtual size_t MaxCompressedLength(size_t source_bytes) const = 0; + private: + DISALLOW_COPY_AND_ASSIGN(CompressionCodec); +}; + +// Returns the compression codec for the specified type. +// +// The returned codec is a singleton and should be not be destroyed. +Status GetCompressionCodec(CompressionType compression, + const CompressionCodec** codec); + +// Returns the compression codec type given the name +CompressionType GetCompressionCodecType(const std::string& name); + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/encoding-test.cc b/src/kudu/cfile/encoding-test.cc new file mode 100644 index 000000000000..10e8eef9b76b --- /dev/null +++ b/src/kudu/cfile/encoding-test.cc @@ -0,0 +1,879 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/bshuf_block.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/gvint_block.h" +#include "kudu/cfile/plain_bitmap_block.h" +#include "kudu/cfile/plain_block.h" +#include "kudu/cfile/rle_block.h" +#include "kudu/cfile/binary_plain_block.h" +#include "kudu/cfile/binary_prefix_block.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/group_varint-inl.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/stopwatch.h" + +namespace kudu { namespace cfile { + +extern void DumpSSETable(); + +class TestEncoding : public ::testing::Test { + public: + TestEncoding() + : ::testing::Test(), + arena_(1024, 1024*1024) { + } + + protected: + virtual void SetUp() OVERRIDE { + arena_.Reset(); + } + + template + void CopyOne(BlockDecoder *decoder, + typename TypeTraits::cpp_type *ret) { + ColumnBlock cb(GetTypeInfo(type), nullptr, ret, 1, &arena_); + ColumnDataView cdv(&cb); + size_t n = 1; + ASSERT_OK(decoder->CopyNextValues(&n, &cdv)); + ASSERT_EQ(1, n); + } + + // Insert a given number of strings into the provided + // BinaryPrefixBlockBuilder. 
+ template + static Slice CreateBinaryBlock(BuilderType *sbb, + int num_items, + const char *fmt_str) { + boost::ptr_vector to_insert; + std::vector slices; + + for (uint i = 0; i < num_items; i++) { + string *val = new string(StringPrintf(fmt_str, i)); + to_insert.push_back(val); + slices.push_back(Slice(*val)); + } + + + int rem = slices.size(); + Slice *ptr = &slices[0]; + while (rem > 0) { + int added = sbb->Add(reinterpret_cast(ptr), + rem); + CHECK(added > 0); + rem -= added; + ptr += added; + } + + CHECK_EQ(slices.size(), sbb->Count()); + return sbb->Finish(12345L); + } + + WriterOptions* NewWriterOptions() { + auto ret = new WriterOptions(); + ret->storage_attributes.cfile_block_size = 256 * 1024; + return ret; + } + + template + void TestBinarySeekByValueSmallBlock() { + gscoped_ptr opts(NewWriterOptions()); + BuilderType sbb(opts.get()); + // Insert "hello 0" through "hello 9" + const uint kCount = 10; + Slice s = CreateBinaryBlock(&sbb, kCount, "hello %d"); + DecoderType sbd(s); + ASSERT_OK(sbd.ParseHeader()); + + // Seeking to just after a key should return the + // next key ('hello 4x' falls between 'hello 4' and 'hello 5') + Slice q = "hello 4x"; + bool exact; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + ASSERT_FALSE(exact); + + Slice ret; + ASSERT_EQ(5u, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 5"), ret.ToString()); + + sbd.SeekToPositionInBlock(0); + + // Seeking to an exact key should return that key + q = "hello 4"; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + ASSERT_EQ(4u, sbd.GetCurrentIndex()); + ASSERT_TRUE(exact); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 4"), ret.ToString()); + + // Seeking to before the first key should return first key + q = "hello"; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + ASSERT_EQ(0, sbd.GetCurrentIndex()); + ASSERT_FALSE(exact); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 0"), ret.ToString()); + + // Seeking after the last key should return not found + q 
= "zzzz"; + ASSERT_TRUE(sbd.SeekAtOrAfterValue(&q, &exact).IsNotFound()); + + // Seeking to the last key should succeed + q = "hello 9"; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + ASSERT_EQ(9u, sbd.GetCurrentIndex()); + ASSERT_TRUE(exact); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 9"), ret.ToString()); + } + + template + void TestStringSeekByValueLargeBlock() { + Arena arena(1024, 1024*1024); // TODO: move to fixture? + gscoped_ptr opts(NewWriterOptions()); + BinaryPrefixBlockBuilder sbb(opts.get()); + const uint kCount = 1000; + // Insert 'hello 000' through 'hello 999' + Slice s = CreateBinaryBlock(&sbb, kCount, "hello %03d"); + BinaryPrefixBlockDecoder sbd(s); + ASSERT_OK(sbd.ParseHeader()); + + // Seeking to just after a key should return the + // next key ('hello 444x' falls between 'hello 444' and 'hello 445') + Slice q = "hello 444x"; + bool exact; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + ASSERT_FALSE(exact); + + Slice ret; + ASSERT_EQ(445u, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 445"), ret.ToString()); + + sbd.SeekToPositionInBlock(0); + + // Seeking to an exact key should return that key + q = "hello 004"; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + EXPECT_TRUE(exact); + EXPECT_EQ(4u, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 004"), ret.ToString()); + + // Seeking to before the first key should return first key + q = "hello"; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + EXPECT_FALSE(exact); + EXPECT_EQ(0, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + ASSERT_EQ(string("hello 000"), ret.ToString()); + + // Seeking after the last key should return not found + q = "zzzz"; + ASSERT_TRUE(sbd.SeekAtOrAfterValue(&q, &exact).IsNotFound()); + + // Seeking to the last key should succeed + q = "hello 999"; + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + EXPECT_TRUE(exact); + EXPECT_EQ(999u, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + 
ASSERT_EQ(string("hello 999"), ret.ToString()); + + // Randomized seek + char target[20]; + char before_target[20]; + for (int i = 0; i < 1000; i++) { + int ord = random() % kCount; + int len = snprintf(target, sizeof(target), "hello %03d", ord); + q = Slice(target, len); + + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + EXPECT_TRUE(exact); + EXPECT_EQ(ord, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + ASSERT_EQ(string(target), ret.ToString()); + + // Seek before this key + len = snprintf(before_target, sizeof(target), "hello %03d.before", ord-1); + q = Slice(before_target, len); + ASSERT_OK(sbd.SeekAtOrAfterValue(&q, &exact)); + EXPECT_FALSE(exact); + EXPECT_EQ(ord, sbd.GetCurrentIndex()); + CopyOne(&sbd, &ret); + ASSERT_EQ(string(target), ret.ToString()); + } + } + + template + void TestBinaryBlockRoundTrip() { + gscoped_ptr opts(NewWriterOptions()); + BuilderType sbb(opts.get()); + const uint kCount = 10; + Slice s = CreateBinaryBlock(&sbb, kCount, "hello %d"); + + LOG(INFO) << "Block: " << HexDump(s); + + // the slice should take at least a few bytes per entry + ASSERT_GT(s.size(), kCount * 2u); + + DecoderType sbd(s); + ASSERT_OK(sbd.ParseHeader()); + ASSERT_EQ(kCount, sbd.Count()); + ASSERT_EQ(12345u, sbd.GetFirstRowId()); + ASSERT_TRUE(sbd.HasNext()); + + // Iterate one by one through data, verifying that it matches + // what we put in. 
+ for (uint i = 0; i < kCount; i++) { + ASSERT_EQ(i, sbd.GetCurrentIndex()); + ASSERT_TRUE(sbd.HasNext()) << "Failed on iter " << i; + Slice s; + CopyOne(&sbd, &s); + string expected = StringPrintf("hello %d", i); + ASSERT_EQ(expected, s.ToString()) << "failed at iter " << i; + } + ASSERT_FALSE(sbd.HasNext()); + + // Now iterate backwards using positional seeking + for (int i = kCount - 1; i >= 0; i--) { + sbd.SeekToPositionInBlock(i); + ASSERT_EQ(i, sbd.GetCurrentIndex()); + } + + // Try to request a bunch of data in one go + ScopedColumnBlock cb(kCount + 10); + ColumnDataView cdv(&cb); + sbd.SeekToPositionInBlock(0); + size_t n = kCount + 10; + ASSERT_OK(sbd.CopyNextValues(&n, &cdv)); + ASSERT_EQ(kCount, n); + ASSERT_FALSE(sbd.HasNext()); + + for (uint i = 0; i < kCount; i++) { + string expected = StringPrintf("hello %d", i); + ASSERT_EQ(expected, cb[i].ToString()); + } + } + + template + void DoSeekTest(BlockBuilderType* ibb, int num_ints, int num_queries, bool verify) { + // TODO : handle and verify seeking inside a run for testing RLE + typedef typename TypeTraits::cpp_type CppType; + + const CppType kBase = 6; + + CppType data[num_ints]; + for (CppType i = 0; i < num_ints; i++) { + data[i] = kBase + i * 2; + } + + CHECK_EQ(num_ints, ibb->Add(reinterpret_cast(&data[0]), + num_ints)); + + Slice s = ibb->Finish(0); + BlockDecoderType ibd(s); + ASSERT_OK(ibd.ParseHeader()); + + // Benchmark seeking + LOG_TIMING(INFO, strings::Substitute("Seeking in $0 block", TypeTraits::name())) { + for (int i = 0; i < num_queries; i++) { + bool exact = false; + CppType target = random() % (num_ints * 2 + kBase); + Status s = ibd.SeekAtOrAfterValue(&target, &exact); + if (verify) { + SCOPED_TRACE(target); + if (s.IsNotFound()) { + ASSERT_EQ(kBase + num_ints * 2 - 1, target); + continue; + } + ASSERT_OK_FAST(s); + + CppType got; + CopyOne(&ibd, &got); + + if (target < kBase) { + ASSERT_EQ(kBase, got); + ASSERT_FALSE(exact); + } else if (target % 2 == 0) { + // Was inserted + 
ASSERT_EQ(target, got); + ASSERT_TRUE(exact); + } else { + ASSERT_EQ(target + 1, got); + ASSERT_FALSE(exact); + } + } + } + } + } + + + template + void TestEmptyBlockEncodeDecode() { + gscoped_ptr opts(NewWriterOptions()); + BlockBuilderType bb(opts.get()); + Slice s = bb.Finish(0); + ASSERT_GT(s.size(), 0); + LOG(INFO) << "Encoded size for 0 items: " << s.size(); + + BlockDecoderType bd(s); + ASSERT_OK(bd.ParseHeader()); + ASSERT_EQ(0, bd.Count()); + ASSERT_FALSE(bd.HasNext()); + } + + template + void TestEncodeDecodeTemplateBlockEncoder(typename TypeTraits::cpp_type* src, + uint32_t size) { + typedef typename TypeTraits::cpp_type CppType; + const uint32_t kOrdinalPosBase = 12345; + gscoped_ptr opts(NewWriterOptions()); + BlockBuilder pbb(opts.get()); + + pbb.Add(reinterpret_cast(src), size); + Slice s = pbb.Finish(kOrdinalPosBase); + + LOG(INFO)<< "Encoded size for 10k elems: " << s.size(); + + BlockDecoder pbd(s); + ASSERT_OK(pbd.ParseHeader()); + ASSERT_EQ(kOrdinalPosBase, pbd.GetFirstRowId()); + ASSERT_EQ(0, pbd.GetCurrentIndex()); + + std::vector decoded; + decoded.resize(size); + + ColumnBlock dst_block(GetTypeInfo(Type), nullptr, &decoded[0], size, &arena_); + ColumnDataView view(&dst_block); + int dec_count = 0; + while (pbd.HasNext()) { + ASSERT_EQ((int32_t )(dec_count), pbd.GetCurrentIndex()); + + size_t to_decode = (random() % 30) + 1; + size_t n = to_decode > view.nrows() ? 
view.nrows() : to_decode; + ASSERT_OK_FAST(pbd.CopyNextValues(&n, &view)); + ASSERT_GE(to_decode, n); + view.Advance(n); + dec_count += n; + } + + ASSERT_EQ(0, view.nrows())<< "Should have no space left in the buffer after " + << "decoding all rows"; + + for (uint i = 0; i < size; i++) { + if (src[i] != decoded[i]) { + FAIL()<< "Fail at index " << i << + " inserted=" << src[i] << " got=" << decoded[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % decoded.size(); + pbd.SeekToPositionInBlock(seek_off); + + EXPECT_EQ((int32_t )(seek_off), pbd.GetCurrentIndex()); + CppType ret; + CopyOne(&pbd, &ret); + EXPECT_EQ(decoded[seek_off], ret); + } + } + + // Test truncation of blocks + template + void TestBinaryBlockTruncation() { + gscoped_ptr opts(NewWriterOptions()); + BuilderType sbb(opts.get()); + const uint kCount = 10; + size_t sbsize; + + Slice s = CreateBinaryBlock(&sbb, kCount, "hello %d"); + do { + sbsize = s.size(); + + LOG(INFO) << "Block: " << HexDump(s); + + DecoderType sbd(s); + Status st = sbd.ParseHeader(); + + if (sbsize < DecoderType::kMinHeaderSize) { + ASSERT_TRUE(st.IsCorruption()); + ASSERT_STR_CONTAINS(st.ToString(), "not enough bytes for header"); + } else if (sbsize < coding::DecodeGroupVarInt32_GetGroupSize(s.data())) { + ASSERT_TRUE(st.IsCorruption()); + ASSERT_STR_CONTAINS(st.ToString(), "less than length"); + } + if (sbsize > 0) { + s.truncate(sbsize - 1); + } + } while (sbsize > 0); + } + + // Test encoding and decoding of integer datatypes + template + void TestIntBlockRoundTrip(BuilderType* ibb) { + typedef typename DataTypeTraits::cpp_type CppType; + + LOG(INFO) << "Testing with IntType = " << DataTypeTraits::name(); + + const uint32_t kOrdinalPosBase = 12345; + + srand(123); + + std::vector to_insert; + for (int i = 0; i < 10003; i++) { + to_insert.push_back(random() % std::numeric_limits::max()); + } + + ibb->Add(reinterpret_cast(&to_insert[0]), + to_insert.size()); + Slice 
s = ibb->Finish(kOrdinalPosBase); + + DecoderType ibd(s); + ASSERT_OK(ibd.ParseHeader()); + + ASSERT_EQ(kOrdinalPosBase, ibd.GetFirstRowId()); + + std::vector decoded; + decoded.resize(to_insert.size()); + + ColumnBlock dst_block(GetTypeInfo(IntType), nullptr, + &decoded[0], + to_insert.size(), + &arena_); + int dec_count = 0; + while (ibd.HasNext()) { + ASSERT_EQ((uint32_t)(dec_count), ibd.GetCurrentIndex()); + + size_t to_decode = std::min(to_insert.size() - dec_count, + static_cast((random() % 30) + 1)); + size_t n = to_decode; + ColumnDataView dst_data(&dst_block, dec_count); + DCHECK_EQ((unsigned char *)(&decoded[dec_count]), dst_data.data()); + ASSERT_OK_FAST(ibd.CopyNextValues(&n, &dst_data)); + ASSERT_GE(to_decode, n); + dec_count += n; + } + + ASSERT_EQ(dec_count, dst_block.nrows()) + << "Should have decoded all rows to fill the buffer"; + + for (uint i = 0; i < to_insert.size(); i++) { + if (to_insert[i] != decoded[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << to_insert[i] << " got=" << decoded[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % decoded.size(); + ibd.SeekToPositionInBlock(seek_off); + + EXPECT_EQ((uint32_t)(seek_off), ibd.GetCurrentIndex()); + CppType ret; + CopyOne(&ibd, &ret); + EXPECT_EQ(decoded[seek_off], ret); + } + } + + template + void TestRleIntBlockRoundTrip() { + gscoped_ptr > ibb(new RleIntBlockBuilder()); + TestIntBlockRoundTrip, RleIntBlockDecoder, IntType>( + ibb.get()); + } + + // Test encoding and decoding BOOL datatypes + template + void TestBoolBlockRoundTrip() { + const uint32_t kOrdinalPosBase = 12345; + + srand(123); + + std::vector to_insert; + for (int i = 0; i < 10003; ) { + int run_size = random() % 100; + bool val = random() % 2; + for (int j = 0; j < run_size; j++) { + to_insert.push_back(val); + } + i += run_size; + } + + BuilderType bb; + bb.Add(reinterpret_cast(&to_insert[0]), + to_insert.size()); + Slice s = 
bb.Finish(kOrdinalPosBase); + + DecoderType bd(s); + ASSERT_OK(bd.ParseHeader()); + + ASSERT_EQ(kOrdinalPosBase, bd.GetFirstRowId()); + + std::vector decoded; + decoded.resize(to_insert.size()); + + ColumnBlock dst_block(GetTypeInfo(BOOL), nullptr, + &decoded[0], + to_insert.size(), + &arena_); + + int dec_count = 0; + while (bd.HasNext()) { + ASSERT_EQ((uint32_t)(dec_count), bd.GetCurrentIndex()); + + size_t to_decode = std::min(to_insert.size() - dec_count, + static_cast((random() % 30) + 1)); + size_t n = to_decode; + ColumnDataView dst_data(&dst_block, dec_count); + DCHECK_EQ((unsigned char *)(&decoded[dec_count]), dst_data.data()); + ASSERT_OK_FAST(bd.CopyNextValues(&n, &dst_data)); + ASSERT_GE(to_decode, n); + dec_count += n; + } + + ASSERT_EQ(dec_count, dst_block.nrows()) + << "Should have decoded all rows to fill the buffer"; + + for (uint i = 0; i < to_insert.size(); i++) { + if (to_insert[i] != decoded[i]) { + FAIL() << "Fail at index " << i << + " inserted=" << to_insert[i] << " got=" << decoded[i]; + } + } + + // Test Seek within block by ordinal + for (int i = 0; i < 100; i++) { + int seek_off = random() % decoded.size(); + bd.SeekToPositionInBlock(seek_off); + + EXPECT_EQ((uint32_t)(seek_off), bd.GetCurrentIndex()); + bool ret; + CopyOne(&bd, &ret); + EXPECT_EQ(static_cast(decoded[seek_off]), ret); + } + } + + Arena arena_; +}; + +TEST_F(TestEncoding, TestPlainBlockEncoder) { + const uint32_t kSize = 10000; + + gscoped_ptr ints(new int32_t[kSize]); + for (int i = 0; i < kSize; i++) { + ints.get()[i] = random(); + } + + TestEncodeDecodeTemplateBlockEncoder, + PlainBlockDecoder >(ints.get(), kSize); +} + +// Test for bitshuffle block, for INT32, FLOAT, DOUBLE +TEST_F(TestEncoding, TestBShufIntBlockEncoder) { + const uint32_t kSize = 10000; + + gscoped_ptr ints(new int32_t[kSize]); + for (int i = 0; i < kSize; i++) { + ints.get()[i] = random(); + } + + TestEncodeDecodeTemplateBlockEncoder, + BShufBlockDecoder >(ints.get(), kSize); +} + 
+TEST_F(TestEncoding, TestBShufFloatBlockEncoder) { + const uint32_t kSize = 10000; + + gscoped_ptr floats(new float[kSize]); + for (int i = 0; i < kSize; i++) { + floats.get()[i] = random() + static_cast(random())/INT_MAX; + } + + TestEncodeDecodeTemplateBlockEncoder, + BShufBlockDecoder >(floats.get(), kSize); +} + +TEST_F(TestEncoding, TestBShufDoubleBlockEncoder) { + const uint32_t kSize = 10000; + + gscoped_ptr doubles(new double[kSize]); + for (int i = 0; i < kSize; i++) { + doubles.get()[i] = random() + + static_cast(random())/INT_MAX; + } + + TestEncodeDecodeTemplateBlockEncoder, + BShufBlockDecoder >(doubles.get(), kSize); +} + +TEST_F(TestEncoding, TestIntBlockEncoder) { + gscoped_ptr opts(NewWriterOptions()); + GVIntBlockBuilder ibb(opts.get()); + + auto ints = new int[10000]; + for (int i = 0; i < 10000; i++) { + ints[i] = random(); + } + ibb.Add(reinterpret_cast(ints), 10000); + delete[] ints; + + Slice s = ibb.Finish(12345); + LOG(INFO) << "Encoded size for 10k ints: " << s.size(); + + // Test empty case -- should be 5 bytes for just the + // header word (all zeros) + ibb.Reset(); + s = ibb.Finish(0); + ASSERT_EQ(5UL, s.size()); +} + +TEST_F(TestEncoding, TestRleIntBlockEncoder) { + RleIntBlockBuilder ibb; + gscoped_ptr ints(new int[10000]); + for (int i = 0; i < 10000; i++) { + ints[i] = random(); + } + ibb.Add(reinterpret_cast(ints.get()), 10000); + + Slice s = ibb.Finish(12345); + LOG(INFO) << "RLE Encoded size for 10k ints: " << s.size(); + + ibb.Reset(); + ints.reset(new int[100]); + for (int i = 0; i < 100; i++) { + ints[i] = 0; + } + ibb.Add(reinterpret_cast(ints.get()), 100); + s = ibb.Finish(12345); + ASSERT_EQ(14UL, s.size()); +} + +TEST_F(TestEncoding, TestPlainBitMapRoundTrip) { + TestBoolBlockRoundTrip(); +} + +TEST_F(TestEncoding, TestRleBitMapRoundTrip) { + TestBoolBlockRoundTrip(); +} + +TEST_F(TestEncoding, TestGVIntBlockRoundTrip) { + gscoped_ptr opts(NewWriterOptions()); + gscoped_ptr ibb(new GVIntBlockBuilder(opts.get())); + 
TestIntBlockRoundTrip(ibb.get()); +} + +TEST_F(TestEncoding, TestRleIntBlockRoundTripAllTypes) { + LOG(INFO) << "Testing all integer types with RLE block encoding"; + + TestRleIntBlockRoundTrip(); + TestRleIntBlockRoundTrip(); + TestRleIntBlockRoundTrip(); + TestRleIntBlockRoundTrip(); + TestRleIntBlockRoundTrip(); + TestRleIntBlockRoundTrip(); +} + + +TEST_F(TestEncoding, TestGVIntEmptyBlockEncodeDecode) { + TestEmptyBlockEncodeDecode(); +} + +// Test seeking to a value in a small block. +// Regression test for a bug seen in development where this would +// infinite loop when there are no 'restarts' in a given block. +TEST_F(TestEncoding, TestBinaryPrefixBlockBuilderSeekByValueSmallBlock) { + TestBinarySeekByValueSmallBlock(); +} + +TEST_F(TestEncoding, TestBinaryPlainBlockBuilderSeekByValueSmallBlock) { + TestBinarySeekByValueSmallBlock(); +} + +// Test seeking to a value in a large block which contains +// many 'restarts' +TEST_F(TestEncoding, TestBinaryPrefixBlockBuilderSeekByValueLargeBlock) { + TestStringSeekByValueLargeBlock(); +} + +TEST_F(TestEncoding, TestBinaryPlainBlockBuilderSeekByValueLargeBlock) { + TestStringSeekByValueLargeBlock(); +} + +// Test round-trip encode/decode of a binary block. +TEST_F(TestEncoding, TestBinaryPrefixBlockBuilderRoundTrip) { + TestBinaryBlockRoundTrip(); +} + +TEST_F(TestEncoding, TestBinaryPlainBlockBuilderRoundTrip) { + TestBinaryBlockRoundTrip(); +} + +// Test empty block encode/decode +TEST_F(TestEncoding, TestBinaryPlainEmptyBlockEncodeDecode) { + TestEmptyBlockEncodeDecode(); +} + +TEST_F(TestEncoding, TestBinaryPrefixEmptyBlockEncodeDecode) { + TestEmptyBlockEncodeDecode(); +} + +// Test encode/decode of a binary block with various-sized truncations. 
+TEST_F(TestEncoding, TestBinaryPlainBlockBuilderTruncation) { + TestBinaryBlockTruncation(); +} + +TEST_F(TestEncoding, TestBinaryPrefixBlockBuilderTruncation) { + TestBinaryBlockTruncation(); +} + +#ifdef NDEBUG +TEST_F(TestEncoding, GVIntSeekBenchmark) { + gscoped_ptr opts(NewWriterOptions()); + gscoped_ptr ibb(new GVIntBlockBuilder(opts.get())); + DoSeekTest(ibb.get(), 32768, 100000, false); +} +#endif + +TEST_F(TestEncoding, GVIntSeekTest) { + gscoped_ptr opts(NewWriterOptions()); + gscoped_ptr ibb(new GVIntBlockBuilder(opts.get())); + DoSeekTest(ibb.get(), 64, 1000, true); +} + +TEST_F(TestEncoding, GVIntSeekTestTinyBlock) { + gscoped_ptr opts(NewWriterOptions()); + for (int block_size = 1; block_size < 16; block_size++) { + gscoped_ptr ibb(new GVIntBlockBuilder(opts.get())); + DoSeekTest(ibb.get(), block_size, 1000, true); + } +} + + +// We have several different encodings for INT blocks. +// The following tests use GTest's TypedTest functionality to run the tests +// for each of the encodings. +// +// Beware ugly template magic below. 
+struct PlainTestTraits { + template + struct Classes { + typedef PlainBlockBuilder encoder_type; + typedef PlainBlockDecoder decoder_type; + }; +}; + +struct RleTestTraits { + template + struct Classes { + typedef RleIntBlockBuilder encoder_type; + typedef RleIntBlockDecoder decoder_type; + }; +}; + +struct BitshuffleTestTraits { + template + struct Classes { + typedef BShufBlockBuilder encoder_type; + typedef BShufBlockDecoder decoder_type; + }; +}; +typedef testing::Types MyTestFixtures; +TYPED_TEST_CASE(IntEncodingTest, MyTestFixtures); + +template +class IntEncodingTest : public TestEncoding { + public: + template + void DoIntSeekTest(int num_ints, int num_queries, bool verify) { + typedef typename TestTraits::template Classes::encoder_type encoder_type; + typedef typename TestTraits::template Classes::decoder_type decoder_type; + + gscoped_ptr opts(NewWriterOptions()); + gscoped_ptr ibb(new encoder_type(opts.get())); + DoSeekTest(ibb.get(), num_ints, num_queries, verify); + } + + template + void DoIntSeekTestTinyBlock() { + for (int block_size = 1; block_size < 16; block_size++) { + DoIntSeekTest(block_size, 1000, true); + } + } + + template + void DoIntRoundTripTest() { + typedef typename TestTraits::template Classes::encoder_type encoder_type; + typedef typename TestTraits::template Classes::decoder_type decoder_type; + + gscoped_ptr opts(NewWriterOptions()); + gscoped_ptr ibb(new encoder_type(opts.get())); + TestIntBlockRoundTrip(ibb.get()); + } +}; + + +TYPED_TEST(IntEncodingTest, TestSeekAllTypes) { + this->template DoIntSeekTest(32, 1000, true); + this->template DoIntSeekTest(32, 1000, true); + this->template DoIntSeekTest(64, 1000, true); + this->template DoIntSeekTest(64, 1000, true); + this->template DoIntSeekTest(64, 1000, true); + this->template DoIntSeekTest(64, 1000, true); +} + +TYPED_TEST(IntEncodingTest, IntSeekTestTinyBlockAllTypes) { + this->template DoIntSeekTestTinyBlock(); + this->template DoIntSeekTestTinyBlock(); + this->template 
DoIntSeekTestTinyBlock(); + this->template DoIntSeekTestTinyBlock(); + this->template DoIntSeekTestTinyBlock(); + this->template DoIntSeekTestTinyBlock(); +} + +TYPED_TEST(IntEncodingTest, TestRoundTrip) { + this->template DoIntRoundTripTest(); + this->template DoIntRoundTripTest(); + this->template DoIntRoundTripTest(); + this->template DoIntRoundTripTest(); + this->template DoIntRoundTripTest(); + this->template DoIntRoundTripTest(); +} + +#ifdef NDEBUG +TYPED_TEST(IntEncodingTest, IntSeekBenchmark) { + this->template DoIntSeekTest(32768, 10000, false); +} +#endif + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/gvint_block.cc b/src/kudu/cfile/gvint_block.cc new file mode 100644 index 000000000000..6f3ecc79ad94 --- /dev/null +++ b/src/kudu/cfile/gvint_block.cc @@ -0,0 +1,369 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/gvint_block.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/util/group_varint-inl.h" + +namespace kudu { namespace cfile { + +using kudu::coding::AppendGroupVarInt32; +using kudu::coding::CalcRequiredBytes32; +using kudu::coding::DecodeGroupVarInt32; +using kudu::coding::DecodeGroupVarInt32_SlowButSafe; +using kudu::coding::DecodeGroupVarInt32_SSE_Add; +using kudu::coding::AppendGroupVarInt32Sequence; + +GVIntBlockBuilder::GVIntBlockBuilder(const WriterOptions *options) + : estimated_raw_size_(0), + options_(options) { + Reset(); +} + + +void GVIntBlockBuilder::Reset() { + ints_.clear(); + buffer_.clear(); + ints_.reserve(options_->storage_attributes.cfile_block_size / sizeof(uint32_t)); + estimated_raw_size_ = 0; +} + +bool GVIntBlockBuilder::IsBlockFull(size_t limit) const { + return EstimateEncodedSize() > limit; +} + +int GVIntBlockBuilder::Add(const uint8_t *vals_void, size_t count) { + const uint32_t *vals = reinterpret_cast(vals_void); + + int added = 0; + + // If the block is full, should stop adding more items. 
+ while (!IsBlockFull(options_->storage_attributes.cfile_block_size) && added < count) { + uint32_t val = *vals++; + estimated_raw_size_ += CalcRequiredBytes32(val); + ints_.push_back(val); + added++; + } + + return added; +} + +size_t GVIntBlockBuilder::Count() const { + return ints_.size(); +} + +Status GVIntBlockBuilder::GetFirstKey(void *key) const { + if (ints_.empty()) { + return Status::NotFound("no keys in data block"); + } + + *reinterpret_cast(key) = ints_[0]; + return Status::OK(); +} + +Slice GVIntBlockBuilder::Finish(rowid_t ordinal_pos) { + int size_estimate = EstimateEncodedSize(); + buffer_.reserve(size_estimate); + // TODO: negatives and big ints + + IntType min = 0; + size_t size = ints_.size(); + + if (size > 0) { + min = *std::min_element(ints_.begin(), ints_.end()); + } + + CHECK_LT(ordinal_pos, MathLimits::kMax) << + "TODO: support large files"; + + buffer_.clear(); + AppendGroupVarInt32(&buffer_, + implicit_cast(size), + implicit_cast(min), + implicit_cast(ordinal_pos), 0); + + if (size > 0) { + AppendGroupVarInt32Sequence(&buffer_, min, &ints_[0], size); + } + + // Our estimate should always be an upper bound, or else there's a bunch of + // extra copies due to resizes here. 
+ DCHECK_GE(size_estimate, buffer_.size()); + + return Slice(buffer_.data(), buffer_.size()); +} + +//////////////////////////////////////////////////////////// +// Decoder +//////////////////////////////////////////////////////////// + +GVIntBlockDecoder::GVIntBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + cur_pos_(nullptr), + cur_idx_(0) { +} + +Status GVIntBlockDecoder::ParseHeader() { + // TODO: better range check + CHECK_GE(data_.size(), kMinHeaderSize); + + uint32_t unused; + ints_start_ = DecodeGroupVarInt32_SlowButSafe( + (const uint8_t *)data_.data(), &num_elems_, &min_elem_, + &ordinal_pos_base_, &unused); + + if (num_elems_ > 0 && num_elems_ * 5 / 4 > data_.size()) { + return Status::Corruption("bad number of elems in int block"); + } + + parsed_ = true; + SeekToStart(); + + return Status::OK(); +} + +class NullSink { + public: + template + void push_back(T t) {} +}; + +template +class PtrSink { + public: + explicit PtrSink(uint8_t *ptr) + : ptr_(reinterpret_cast(ptr)) + {} + + void push_back(const T &t) { + *ptr_++ = t; + } + + private: + T *ptr_; +}; + +void GVIntBlockDecoder::SeekToPositionInBlock(uint pos) { + CHECK(parsed_) << "Must call ParseHeader()"; + + // no-op if seeking to current position + if (cur_idx_ == pos && cur_pos_ != nullptr) return; + + // Reset to start of block + cur_pos_ = ints_start_; + cur_idx_ = 0; + pending_.clear(); + + NullSink null; + // TODO: should this return Status? + size_t n = pos; + CHECK_OK(DoGetNextValues(&n, &null)); +} + +Status GVIntBlockDecoder::SeekAtOrAfterValue(const void *value_void, + bool *exact_match) { + + // for now, use a linear search. + // TODO: evaluate adding a few pointers at the end of the block back + // into every 16th group or so? + SeekToPositionInBlock(0); + + // Stop here if the target is < the first elem of the block. 
+ uint32_t target = *reinterpret_cast(value_void); + if (target < min_elem_) { + *exact_match = false; + return Status::OK(); + } + + // Put target into this block's frame of reference + uint32_t rel_target = target - min_elem_; + + const uint8_t *prev_group_pos = cur_pos_; + + // Search for the group which contains the target + while (cur_idx_ < num_elems_) { + uint8_t tag = *cur_pos_++; + uint8_t a_sel = (tag & BOOST_BINARY(11 00 00 00)) >> 6; + + // Determine length of first in this block + uint32_t first_elem = *reinterpret_cast(cur_pos_) + & coding::MASKS[a_sel]; + if (rel_target < first_elem) { + // target fell in previous group + DCHECK_GE(cur_idx_, 4); + cur_idx_ -= 4; + cur_pos_ = prev_group_pos; + break; + } + + // Skip group; + uint8_t group_len = coding::VARINT_SELECTOR_LENGTHS[tag]; + prev_group_pos = cur_pos_ - 1; + cur_pos_ += group_len; + cur_idx_ += 4; + } + + if (cur_idx_ >= num_elems_) { + // target may be in the last group in the block + DCHECK_GE(cur_idx_, 4); + cur_idx_ -= 4; + cur_pos_ = prev_group_pos; + } + + // We're now pointed at the correct group. Decode it + + uint32_t chunk[4]; + PtrSink sink(reinterpret_cast(chunk)); + size_t count = 4; + RETURN_NOT_OK(DoGetNextValues(&count, &sink)); + + // Reset the index back to the start of this block + cur_idx_ -= count; + + for (int i = 0; i < count; i++) { + if (chunk[i] >= target) { + *exact_match = chunk[i] == target; + cur_idx_ += i; + + int rem = count; // convert to signed + + while (rem-- > i) { + pending_.push_back(chunk[rem]); + } + + return Status::OK(); + } + } + + // If it wasn't in this block, then it falls between this block + // and the following one. So, we are positioned correctly. 
+ cur_idx_ += count; + *exact_match = false; + + if (cur_idx_ == num_elems_) { + // If it wasn't in the block, and this was the last block, + // mark as not found + return Status::NotFound("not in block"); + } + + return Status::OK(); +} + +Status GVIntBlockDecoder::CopyNextValues(size_t *n, ColumnDataView *dst) { + DCHECK_EQ(dst->type_info()->physical_type(), UINT32); + DCHECK_EQ(dst->stride(), sizeof(uint32_t)); + + PtrSink sink(dst->data()); + return DoGetNextValues(n, &sink); +} + +Status GVIntBlockDecoder::CopyNextValuesToArray(size_t *n, uint8_t* array) { + PtrSink sink(array); + return DoGetNextValues(n, &sink); +} + +template +inline Status GVIntBlockDecoder::DoGetNextValues(size_t *n_param, IntSink *sink) { + size_t n = *n_param; + int start_idx = cur_idx_; + size_t rem = num_elems_ - cur_idx_; + assert(rem >= 0); + + // Only fetch up to remaining amount + n = std::min(rem, n); + + float min_elem_f = bit_cast(min_elem_); + __m128i min_elem_xmm = (__m128i)_mm_set_ps( + min_elem_f, min_elem_f, min_elem_f, min_elem_f); + + // First drain pending_ + while (n > 0 && !pending_.empty()) { + sink->push_back(pending_.back()); + pending_.pop_back(); + n--; + cur_idx_++; + } + + const uint8_t *sse_safe_pos = data_.data() + data_.size() - 17; + if (n == 0) goto ret; + + // Now grab groups of 4 and append to vector + while (n >= 4) { + uint32_t ints[4]; + if (cur_pos_ < sse_safe_pos) { + cur_pos_ = DecodeGroupVarInt32_SSE_Add( + cur_pos_, ints, min_elem_xmm); + } else { + cur_pos_ = DecodeGroupVarInt32_SlowButSafe( + cur_pos_, &ints[0], &ints[1], &ints[2], &ints[3]); + ints[0] += min_elem_; + ints[1] += min_elem_; + ints[2] += min_elem_; + ints[3] += min_elem_; + } + cur_idx_ += 4; + + sink->push_back(ints[0]); + sink->push_back(ints[1]); + sink->push_back(ints[2]); + sink->push_back(ints[3]); + n -= 4; + } + + if (n == 0) goto ret; + + // Grab next batch into pending_ + // Note that this does _not_ increment cur_idx_ + uint32_t ints[4]; + cur_pos_ = 
DecodeGroupVarInt32_SlowButSafe( + cur_pos_, &ints[0], &ints[1], &ints[2], &ints[3]); + + DCHECK_LE(cur_pos_, &data_[0] + data_.size()) + << "Overflowed end of buffer! cur_pos=" << cur_pos_ + << " data=" << data_.data() << " size=" << data_.size(); + + ints[0] += min_elem_; + ints[1] += min_elem_; + ints[2] += min_elem_; + ints[3] += min_elem_; + // pending_ acts like a stack, so push in reverse order. + pending_.push_back(ints[3]); + pending_.push_back(ints[2]); + pending_.push_back(ints[1]); + pending_.push_back(ints[0]); + + while (n > 0 && !pending_.empty()) { + sink->push_back(pending_.back()); + pending_.pop_back(); + n--; + cur_idx_++; + } + + ret: + CHECK_EQ(n, 0); + *n_param = cur_idx_ - start_idx; + return Status::OK(); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/gvint_block.h b/src/kudu/cfile/gvint_block.h new file mode 100644 index 000000000000..def0e3e0244b --- /dev/null +++ b/src/kudu/cfile/gvint_block.h @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CFILE_GVINT_BLOCK_H +#define KUDU_CFILE_GVINT_BLOCK_H + +#include + +#include + +#include + +#include "kudu/cfile/block_encodings.h" + +namespace kudu { +namespace cfile { + +struct WriterOptions; +typedef uint32_t IntType; + +using std::vector; + +// Builder for an encoded block of ints. +// The encoding is group-varint plus frame-of-reference: +// +// Header group (gvint): ints_; + faststring buffer_; + uint64_t estimated_raw_size_; + + const WriterOptions *options_; + + enum { + kEstimatedHeaderSizeBytes = 10, + + // Up to 3 "0s" can be tacked on the end of the block to round out + // the groups of 4 + kTrailerExtraPaddingBytes = 3 + }; +}; + +// Decoder for UINT32 type, GROUP_VARINT coding +class GVIntBlockDecoder : public BlockDecoder { + public: + explicit GVIntBlockDecoder(Slice slice); + + Status ParseHeader() OVERRIDE; + void SeekToStart() { + SeekToPositionInBlock(0); + } + + void SeekToPositionInBlock(uint pos) OVERRIDE; + + Status SeekAtOrAfterValue(const void *value, bool *exact_match) OVERRIDE; + + Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE; + + // Copy the integers to a temporary buffer, it is used by StringDictDecoder + // in its CopyNextValues() method. + Status CopyNextValuesToArray(size_t *n, uint8_t* array); + + size_t GetCurrentIndex() const OVERRIDE { + DCHECK(parsed_) << "must parse header first"; + return cur_idx_; + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + return ordinal_pos_base_; + } + + size_t Count() const OVERRIDE { + return num_elems_; + } + + bool HasNext() const OVERRIDE { + return (num_elems_ - cur_idx_) > 0; + } + + private: + friend class TestEncoding; + + template + Status DoGetNextValues(size_t *n, IntSink *sink); + + Slice data_; + + bool parsed_; + const uint8_t *ints_start_; + uint32_t num_elems_; + uint32_t min_elem_; + rowid_t ordinal_pos_base_; + + const uint8_t *cur_pos_; + size_t cur_idx_; + + // Items that have been decoded but not yet yielded + // to the user. 
The next one to be yielded is at the + // *end* of the vector! + std::vector pending_; + + // Min Length of a header. (prefix + 4 tags) + static const size_t kMinHeaderSize = 5; +}; + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/index-test.cc b/src/kudu/cfile/index-test.cc new file mode 100644 index 000000000000..d52e920c9e0f --- /dev/null +++ b/src/kudu/cfile/index-test.cc @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/gutil/endian.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/status.h" +#include "kudu/util/test_macros.h" + +namespace kudu { namespace cfile { + +Status SearchInReaderString(const IndexBlockReader &reader, + string search_key, + BlockPointer *ptr, Slice *match) { + + static faststring dst; + + gscoped_ptr iter(reader.NewIterator()); + dst.clear(); + KeyEncoderTraits::Encode(search_key, &dst); + Status s = iter->SeekAtOrBefore(Slice(dst)); + RETURN_NOT_OK(s); + + *ptr = iter->GetCurrentBlockPointer(); + *match = iter->GetCurrentKey(); + return Status::OK(); +} + + +Status SearchInReaderUint32(const IndexBlockReader &reader, + uint32_t search_key, + BlockPointer *ptr, Slice *match) { + + static faststring dst; + + gscoped_ptr iter(reader.NewIterator()); + dst.clear(); + KeyEncoderTraits::Encode(search_key, &dst); + Status s = iter->SeekAtOrBefore(Slice(dst)); + RETURN_NOT_OK(s); + + *ptr = iter->GetCurrentBlockPointer(); + *match = iter->GetCurrentKey(); + return Status::OK(); +} + +// Expects a Slice containing a big endian encoded int +static uint32_t SliceAsUInt32(const Slice &slice) { + CHECK_EQ(slice.size(), 4); + uint32_t val; + memcpy(&val, slice.data(), slice.size()); + val = BigEndian::FromHost32(val); + return val; +} + +static void AddToIndex(IndexBlockBuilder *idx, uint32_t val, + const BlockPointer &block_pointer) { + + static faststring dst; + dst.clear(); + KeyEncoderTraits::Encode(val, &dst); + idx->Add(Slice(dst), block_pointer); +} + + +// Test IndexBlockBuilder and IndexReader with integers +TEST(TestIndexBuilder, TestIndexWithInts) { + + // Encode an index block. 
+ WriterOptions opts; + IndexBlockBuilder idx(&opts, true); + + const int EXPECTED_NUM_ENTRIES = 4; + + uint32_t i; + + i = 10; + AddToIndex(&idx, i, BlockPointer(90010, 64 * 1024)); + + i = 20; + AddToIndex(&idx, i, BlockPointer(90020, 64 * 1024)); + + i = 30; + AddToIndex(&idx, i, BlockPointer(90030, 64 * 1024)); + + i = 40; + AddToIndex(&idx, i, BlockPointer(90040, 64 * 1024)); + + size_t est_size = idx.EstimateEncodedSize(); + Slice s = idx.Finish(); + + // Estimated size should be between 75-100% + // of actual size. + EXPECT_LT(s.size(), est_size); + EXPECT_GT(s.size(), est_size * 3 /4); + + // Open the encoded block in a reader. + IndexBlockReader reader; + ASSERT_OK(reader.Parse(s)); + + // Should have all the entries we inserted. + ASSERT_EQ(EXPECTED_NUM_ENTRIES, static_cast(reader.Count())); + + // Search for a value prior to first entry + BlockPointer ptr; + Slice match; + Status status = SearchInReaderUint32(reader, 0, &ptr, &match); + EXPECT_TRUE(status.IsNotFound()); + + // Search for a value equal to first entry + status = SearchInReaderUint32(reader, 10, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90010, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(10, SliceAsUInt32(match)); + + // Search for a value between 1st and 2nd entries. + // Should return 1st. + status = SearchInReaderUint32(reader, 15, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90010, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(10, SliceAsUInt32(match)); + + // Search for a value equal to 2nd + // Should return 2nd. + status = SearchInReaderUint32(reader, 20, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90020, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(20, SliceAsUInt32(match)); + + // Between 2nd and 3rd. 
+ // Should return 2nd + status = SearchInReaderUint32(reader, 25, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90020, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(20, SliceAsUInt32(match)); + + // Equal 3rd + status = SearchInReaderUint32(reader, 30, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90030, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(30, SliceAsUInt32(match)); + + // Between 3rd and 4th + status = SearchInReaderUint32(reader, 35, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90030, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(30, SliceAsUInt32(match)); + + // Equal 4th (last) + status = SearchInReaderUint32(reader, 40, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90040, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(40, SliceAsUInt32(match)); + + // Greater than 4th (last) + status = SearchInReaderUint32(reader, 45, &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90040, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ(40, SliceAsUInt32(match)); + + idx.Reset(); +} + +TEST(TestIndexBlock, TestIndexBlockWithStrings) { + WriterOptions opts; + IndexBlockBuilder idx(&opts, true); + + // Insert data for "hello-10" through "hello-40" by 10s + const int EXPECTED_NUM_ENTRIES = 4; + char data[20]; + for (int i = 1; i <= EXPECTED_NUM_ENTRIES; i++) { + int len = snprintf(data, sizeof(data), "hello-%d", i * 10); + Slice s(data, len); + + idx.Add(s, BlockPointer(90000 + i*10, 64 * 1024)); + } + size_t est_size = idx.EstimateEncodedSize(); + Slice s = idx.Finish(); + + // Estimated size should be between 75-100% + // of actual size. + EXPECT_LT(s.size(), est_size); + EXPECT_GT(s.size(), est_size * 3 /4); + + VLOG(1) << kudu::HexDump(s); + + // Open the encoded block in a reader. 
+ IndexBlockReader reader; + ASSERT_OK(reader.Parse(s)); + + // Should have all the entries we inserted. + ASSERT_EQ(EXPECTED_NUM_ENTRIES, static_cast(reader.Count())); + + // Search for a value prior to first entry + BlockPointer ptr; + Slice match; + Status status = SearchInReaderString(reader, "hello", &ptr, &match); + EXPECT_TRUE(status.IsNotFound()); + + // Search for a value equal to first entry + status = SearchInReaderString(reader, "hello-10", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90010, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-10", match); + + // Search for a value between 1st and 2nd entries. + // Should return 1st. + status = SearchInReaderString(reader, "hello-15", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90010, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-10", match); + + // Search for a value equal to 2nd + // Should return 2nd. + status = SearchInReaderString(reader, "hello-20", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90020, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-20", match); + + // Between 2nd and 3rd. 
+ // Should return 2nd + status = SearchInReaderString(reader, "hello-25", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90020, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-20", match); + + // Equal 3rd + status = SearchInReaderString(reader, "hello-30", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90030, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-30", match); + + // Between 3rd and 4th + status = SearchInReaderString(reader, "hello-35", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90030, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-30", match); + + // Equal 4th (last) + status = SearchInReaderString(reader, "hello-40", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90040, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-40", match); + + // Greater than 4th (last) + status = SearchInReaderString(reader, "hello-45", &ptr, &match); + ASSERT_OK(status); + EXPECT_EQ(90040, static_cast(ptr.offset())); + EXPECT_EQ(64 * 1024, static_cast(ptr.size())); + EXPECT_EQ("hello-40", match); +} + +// Test seeking around using the IndexBlockIterator class +TEST(TestIndexBlock, TestIterator) { + // Encode an index block with 1000 entries. 
+ WriterOptions opts; + IndexBlockBuilder idx(&opts, true); + + for (int i = 0; i < 1000; i++) { + uint32_t key = i * 10; + AddToIndex(&idx, key, BlockPointer(100000 + i, 64 * 1024)); + } + + Slice s = idx.Finish(); + + IndexBlockReader reader; + ASSERT_OK(reader.Parse(s)); + gscoped_ptr iter(reader.NewIterator()); + ASSERT_OK(iter->SeekToIndex(0)); + ASSERT_EQ(0U, SliceAsUInt32(iter->GetCurrentKey())); + ASSERT_EQ(100000U, iter->GetCurrentBlockPointer().offset()); + + ASSERT_OK(iter->SeekToIndex(50)); + ASSERT_EQ(500U, SliceAsUInt32(iter->GetCurrentKey())); + ASSERT_EQ(100050U, iter->GetCurrentBlockPointer().offset()); + + ASSERT_TRUE(iter->HasNext()); + ASSERT_OK(iter->Next()); + ASSERT_EQ(510U, SliceAsUInt32(iter->GetCurrentKey())); + ASSERT_EQ(100051U, iter->GetCurrentBlockPointer().offset()); + + ASSERT_OK(iter->SeekToIndex(999)); + ASSERT_EQ(9990U, SliceAsUInt32(iter->GetCurrentKey())); + ASSERT_EQ(100999U, iter->GetCurrentBlockPointer().offset()); + ASSERT_FALSE(iter->HasNext()); + ASSERT_TRUE(iter->Next().IsNotFound()); + + ASSERT_OK(iter->SeekToIndex(0)); + ASSERT_EQ(0U, SliceAsUInt32(iter->GetCurrentKey())); + ASSERT_EQ(100000U, iter->GetCurrentBlockPointer().offset()); + ASSERT_TRUE(iter->HasNext()); +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/index_block.cc b/src/kudu/cfile/index_block.cc new file mode 100644 index 000000000000..b5908d291e6f --- /dev/null +++ b/src/kudu/cfile/index_block.cc @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/index_block.h" +#include "kudu/util/protobuf_util.h" + +namespace kudu { +namespace cfile { + +inline void SliceEncode(const Slice &key, faststring *buf) { + InlinePutVarint32(buf, key.size()); + buf->append(key.data(), key.size()); +} + +inline const uint8_t *SliceDecode(const uint8_t *encoded_ptr, const uint8_t *limit, + Slice *retptr) { + uint32_t len; + const uint8_t *data_start = GetVarint32Ptr(encoded_ptr, limit, &len); + if (data_start == nullptr) { + // bad varint + return nullptr; + } + + if (data_start + len > limit) { + // length extends past end of valid area + return nullptr; + } + + *retptr = Slice(data_start, len); + return data_start + len; +} + +IndexBlockBuilder::IndexBlockBuilder( + const WriterOptions *options, + bool is_leaf) + : options_(options), + finished_(false), + is_leaf_(is_leaf) { +} + + +void IndexBlockBuilder::Add(const Slice &keyptr, + const BlockPointer &ptr) { + DCHECK(!finished_) << + "Must Reset() after Finish() before more Add()"; + + size_t entry_offset = buffer_.size(); + SliceEncode(keyptr, &buffer_); + ptr.EncodeTo(&buffer_); + entry_offsets_.push_back(entry_offset); +} + +Slice IndexBlockBuilder::Finish() { + CHECK(!finished_) << "already called Finish()"; + + for (uint32_t off : entry_offsets_) { + InlinePutFixed32(&buffer_, off); + } + + IndexBlockTrailerPB trailer; + trailer.set_num_entries(entry_offsets_.size()); + trailer.set_type( + is_leaf_ ? 
IndexBlockTrailerPB::LEAF : IndexBlockTrailerPB::INTERNAL); + AppendPBToString(trailer, &buffer_); + + InlinePutFixed32(&buffer_, trailer.GetCachedSize()); + + finished_ = true; + return Slice(buffer_); +} + + +// Return the key of the first entry in this index block. +Status IndexBlockBuilder::GetFirstKey(Slice *key) const { + // TODO: going to need to be able to pass an arena or something + // for slices, which need to copy + + if (entry_offsets_.empty()) { + return Status::NotFound("no keys in builder"); + } + + bool success = nullptr != SliceDecode(buffer_.data(),buffer_.data() + buffer_.size(),key); + + if (success) { + return Status::OK(); + } else { + return Status::Corruption("Unable to decode first key"); + } +} + +size_t IndexBlockBuilder::EstimateEncodedSize() const { + // the actual encoded index entries + int size = buffer_.size(); + + // entry offsets + size += sizeof(uint32_t) * entry_offsets_.size(); + + // estimate trailer cheaply -- not worth actually constructing + // a trailer to determine the size. + size += 16; + + return size; +} + +// Construct a reader. 
+// After construtoin, call +IndexBlockReader::IndexBlockReader() + : parsed_(false) { +} + +void IndexBlockReader::Reset() { + data_ = Slice(); + parsed_ = false; +} + +Status IndexBlockReader::Parse(const Slice &data) { + parsed_ = false; + data_ = data; + + + if (data_.size() < sizeof(uint32_t)) { + return Status::Corruption("index block too small"); + } + + const uint8_t *trailer_size_ptr = + data_.data() + data_.size() - sizeof(uint32_t); + uint32_t trailer_size = DecodeFixed32(trailer_size_ptr); + + size_t max_size = trailer_size_ptr - data_.data(); + if (trailer_size <= 0 || + trailer_size > max_size) { + string err = "invalid index block trailer size: " + + boost::lexical_cast(trailer_size); + return Status::Corruption(err); + } + + const uint8_t *trailer_ptr = trailer_size_ptr - trailer_size; + + bool success = trailer_.ParseFromArray(trailer_ptr, trailer_size); + if (!success) { + return Status::Corruption( + "unable to parse trailer", + trailer_.InitializationErrorString()); + } + + key_offsets_ = trailer_ptr - sizeof(uint32_t) * trailer_.num_entries(); + CHECK(trailer_ptr >= data_.data()); + + VLOG(2) << "Parsed index trailer: " << trailer_.DebugString(); + + parsed_ = true; + return Status::OK(); +} + +size_t IndexBlockReader::Count() const { + CHECK(parsed_) << "not parsed"; + return trailer_.num_entries(); +} + +IndexBlockIterator *IndexBlockReader::NewIterator() const { + CHECK(parsed_) << "not parsed"; + return new IndexBlockIterator(this); +} + +bool IndexBlockReader::IsLeaf() { + return trailer_.type() == IndexBlockTrailerPB::LEAF; +} + +int IndexBlockReader::CompareKey(int idx_in_block, + const Slice &search_key) const { + const uint8_t *key_ptr, *limit; + GetKeyPointer(idx_in_block, &key_ptr, &limit); + Slice this_slice; + if (PREDICT_FALSE(SliceDecode(key_ptr, limit, &this_slice) == nullptr)) { + LOG(WARNING)<< "Invalid data in block!"; + return 0; + } + + return this_slice.compare(search_key); +} + +Status IndexBlockReader::ReadEntry(size_t 
idx, Slice *key, BlockPointer *block_ptr) const { + if (idx >= trailer_.num_entries()) { + return Status::NotFound("Invalid index"); + } + + // At 'ptr', data is encoded as follows: + // + + const uint8_t *ptr, *limit; + GetKeyPointer(idx, &ptr, &limit); + + ptr = SliceDecode(ptr, limit, key); + if (ptr == nullptr) { + return Status::Corruption("Invalid key in index"); + } + + return block_ptr->DecodeFrom(ptr, data_.data() + data_.size()); +} + +void IndexBlockReader::GetKeyPointer(int idx_in_block, const uint8_t **ptr, + const uint8_t **limit) const { + size_t offset_in_block = DecodeFixed32( + &key_offsets_[idx_in_block * sizeof(uint32_t)]); + *ptr = data_.data() + offset_in_block; + + int next_idx = idx_in_block + 1; + + if (PREDICT_FALSE(next_idx >= trailer_.num_entries())) { + DCHECK(next_idx == Count()) << "Bad index: " << idx_in_block + << " Count: " << Count(); + // last key in block: limit is the beginning of the offsets array + *limit = key_offsets_; + } else { + // otherwise limit is the beginning of the next key + offset_in_block = DecodeFixed32( + &key_offsets_[next_idx * sizeof(uint32_t)]); + *limit = data_.data() + offset_in_block; + } +} + +size_t IndexBlockBuilder::Count() const { + return entry_offsets_.size(); +} + +void IndexBlockBuilder::Reset() { + buffer_.clear(); + entry_offsets_.clear(); + finished_ = false; +} + +IndexBlockIterator::IndexBlockIterator(const IndexBlockReader *reader) + : reader_(reader), + cur_idx_(-1), + seeked_(false) { +} + +void IndexBlockIterator::Reset() { + seeked_ = false; + cur_idx_ = -1; +} + +Status IndexBlockIterator::SeekAtOrBefore(const Slice &search_key) { + size_t left = 0; + size_t right = reader_->Count() - 1; + while (left < right) { + int mid = (left + right + 1) / 2; + + int compare = reader_->CompareKey(mid, search_key); + if (compare < 0) { // mid < search + left = mid; + } else if (compare > 0) { // mid > search + right = mid - 1; + } else { // mid == search + left = mid; + break; + } + } + + // 
closest is now 'left' + int compare = reader_->CompareKey(left, search_key); + if (compare > 0) { + // The last midpoint was still greather then the + // provided key, which implies that the key is + // lower than the lowest in the block. + return Status::NotFound("key not present"); + } + + return SeekToIndex(left); +} + +Status IndexBlockIterator::SeekToIndex(size_t idx) { + cur_idx_ = idx; + Status s = reader_->ReadEntry(idx, &cur_key_, &cur_ptr_); + seeked_ = s.ok(); + return s; +} + +bool IndexBlockIterator::HasNext() const { + return cur_idx_ + 1 < reader_->Count(); +} + +Status IndexBlockIterator::Next() { + return SeekToIndex(cur_idx_ + 1); +} + +const BlockPointer &IndexBlockIterator::GetCurrentBlockPointer() const { + CHECK(seeked_) << "not seeked"; + return cur_ptr_; +} + +const Slice IndexBlockIterator::GetCurrentKey() const { + CHECK(seeked_) << "not seeked"; + return cur_key_; +} + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/index_block.h b/src/kudu/cfile/index_block.h new file mode 100644 index 000000000000..7f7fe5a4a2ff --- /dev/null +++ b/src/kudu/cfile/index_block.h @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_CFILE_INDEX_BLOCK_H +#define KUDU_CFILE_INDEX_BLOCK_H + +#include +#include +#include +#include +#include + +#include "kudu/common/types.h" +#include "kudu/cfile/block_pointer.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/util/coding-inl.h" + +namespace kudu { +namespace cfile { + +using std::string; +using std::vector; +using kudu::DataTypeTraits; + +// Forward decl. +class IndexBlockIterator; + +struct WriterOptions; + +// Index Block Builder for a particular key type. +// This works like the rest of the builders in the cfile package. +// After repeatedly calling Add(), call Finish() to encode it +// into a Slice, then you may Reset to re-use buffers. +class IndexBlockBuilder { + public: + explicit IndexBlockBuilder(const WriterOptions *options, + bool is_leaf); + + // Append an entry into the index. + void Add(const Slice &key, const BlockPointer &ptr); + + // Finish the current index block. + // Returns a fully encoded Slice including the data + // as well as any necessary footer. + // The Slice is only valid until the next call to + // Reset(). + Slice Finish(); + + // Return the key of the first entry in this index block. + // The pointed-to data is only valid until the next call to this builder. + Status GetFirstKey(Slice *key) const; + + size_t Count() const; + + // Return an estimate of the post-encoding size of this + // index block. This estimate should be conservative -- + // it will over-estimate rather than under-estimate, and + // should be accurate to within a reasonable threshold, + // but is not exact. + size_t EstimateEncodedSize() const; + + void Reset(); + + private: + DISALLOW_COPY_AND_ASSIGN(IndexBlockBuilder); + +#ifdef __clang__ + __attribute__((__unused__)) +#endif + const WriterOptions *options_; + + // Is the builder currently between Finish() and Reset() + bool finished_; + + // Is this a leaf block? 
+ bool is_leaf_; + + faststring buffer_; + vector entry_offsets_; +}; + +class IndexBlockReader { + public: + IndexBlockReader(); + + void Reset(); + + // Parse the given index block. + // + // This function may be called repeatedly to "reset" the reader to process + // a new block. + // + // Note: this does not copy the data, so the slice must + // remain valid for the lifetime of the reader (or until the next Parse call) + Status Parse(const Slice &data); + + size_t Count() const; + + IndexBlockIterator *NewIterator() const; + + bool IsLeaf(); + + private: + friend class IndexBlockIterator; + + int CompareKey(int idx_in_block, const Slice &search_key) const; + + Status ReadEntry(size_t idx, Slice *key, BlockPointer *block_ptr) const; + + // Set *ptr to the beginning of the index data for the given index + // entry. + // Set *limit to the 'limit' pointer for that entry (i.e a pointer + // beyond which the data no longer is part of that entry). + // - *limit can be used to prevent overrunning in the case of a + // corrupted length varint or length prefix + void GetKeyPointer(int idx_in_block, const uint8_t **ptr, const uint8_t **limit) const; + + static const int kMaxTrailerSize = 64*1024; + Slice data_; + + IndexBlockTrailerPB trailer_; + const uint8_t *key_offsets_; + bool parsed_; + + DISALLOW_COPY_AND_ASSIGN(IndexBlockReader); +}; + +class IndexBlockIterator { + public: + explicit IndexBlockIterator(const IndexBlockReader *reader); + + // Reset the state of this iterator. This should be used + // after the associated 'reader' object parses a different block. + void Reset(); + + // Find the highest block pointer in this index + // block which has a value <= the given key. + // If such a block is found, returns OK status. + // If no such block is found (i.e the smallest key in the + // index is still larger than the provided key), then + // Status::NotFound is returned. 
+ // + // If this function returns an error, then the state of this + // iterator is undefined (i.e it may or may not have moved + // since the previous call) + Status SeekAtOrBefore(const Slice &search_key); + + Status SeekToIndex(size_t idx); + + bool HasNext() const; + + Status Next(); + + const BlockPointer &GetCurrentBlockPointer() const; + + const Slice GetCurrentKey() const; + + private: + const IndexBlockReader *reader_; + size_t cur_idx_; + Slice cur_key_; + BlockPointer cur_ptr_; + bool seeked_; + + DISALLOW_COPY_AND_ASSIGN(IndexBlockIterator); +}; + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/index_btree.cc b/src/kudu/cfile/index_btree.cc new file mode 100644 index 000000000000..1265695911be --- /dev/null +++ b/src/kudu/cfile/index_btree.cc @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/cfile/block_cache.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/common/key_encoder.h" +#include "kudu/util/debug-util.h" + +namespace kudu { +namespace cfile { + +IndexTreeBuilder::IndexTreeBuilder( + const WriterOptions *options, + CFileWriter *writer) : + options_(options), + writer_(writer) { + idx_blocks_.push_back(CreateBlockBuilder(true)); +} + + +IndexBlockBuilder *IndexTreeBuilder::CreateBlockBuilder(bool is_leaf) { + return new IndexBlockBuilder(options_, is_leaf); +} + +Status IndexTreeBuilder::Append(const Slice &key, + const BlockPointer &block) { + return Append(key, block, 0); +} + +Status IndexTreeBuilder::Append( + const Slice &key, const BlockPointer &block_ptr, + size_t level) { + if (level >= idx_blocks_.size()) { + // Need to create a new level + CHECK(level == idx_blocks_.size()) << + "trying to create level " << level << " but size is only " + << idx_blocks_.size(); + VLOG(1) << "Creating level-" << level << " in index b-tree"; + idx_blocks_.push_back(CreateBlockBuilder(false)); + } + + IndexBlockBuilder &idx_block = idx_blocks_[level]; + idx_block.Add(key, block_ptr); + + size_t est_size = idx_block.EstimateEncodedSize(); + if (est_size > options_->index_block_size) { + DCHECK(idx_block.Count() > 1) + << "Index block full with only one entry - this would create " + << "an infinite loop"; + // This index block is full, flush it. + BlockPointer index_block_ptr; + RETURN_NOT_OK(FinishBlockAndPropagate(level)); + } + + return Status::OK(); +} + + +Status IndexTreeBuilder::Finish(BTreeInfoPB *info) { + // Now do the same for the positional index blocks, starting + // with leaf + VLOG(1) << "flushing tree, b-tree has " << + idx_blocks_.size() << " levels"; + + // Flush all but the root of the index. 
+ for (size_t i = 0; i < idx_blocks_.size() - 1; i++) { + RETURN_NOT_OK(FinishBlockAndPropagate(i)); + } + + // Flush the root + int root_level = idx_blocks_.size() - 1; + BlockPointer ptr; + Status s = FinishAndWriteBlock(root_level, &ptr); + if (!s.ok()) { + LOG(ERROR) << "Unable to flush root index block"; + return s; + } + + VLOG(1) << "Flushed root index block: " << ptr.ToString(); + + ptr.CopyToPB(info->mutable_root_block()); + return Status::OK(); +} + +Status IndexTreeBuilder::FinishBlockAndPropagate(size_t level) { + IndexBlockBuilder &idx_block = idx_blocks_[level]; + + // If the block doesn't have any data in it, we don't need to + // write it out. + // This happens if a lower-level block fills up exactly, + // and then the file completes. + // + // TODO: add a test case which exercises this explicitly. + if (idx_block.Count() == 0) { + return Status::OK(); + } + + // Write to file. + BlockPointer idx_block_ptr; + RETURN_NOT_OK(FinishAndWriteBlock(level, &idx_block_ptr)); + + // Get the first key of the finished block. + Slice first_in_idx_block; + Status s = idx_block.GetFirstKey(&first_in_idx_block); + + if (!s.ok()) { + LOG(ERROR) << "Unable to get first key of level-" << level + << " index block: " << s.ToString() << std::endl + << GetStackTrace(); + return s; + } + + // Add to higher-level index. + RETURN_NOT_OK(Append(first_in_idx_block, idx_block_ptr, + level + 1)); + + // Finally, reset the block we just wrote. It's important to wait until + // here to do this, since the first_in_idx_block data may point to internal + // storage of the index block. + idx_block.Reset(); + + return Status::OK(); +} + +// Finish the current block at the given level, writing it +// to the file. Return the location of the written block +// in 'written'. 
+Status IndexTreeBuilder::FinishAndWriteBlock(size_t level, BlockPointer *written) { + IndexBlockBuilder &idx_block = idx_blocks_[level]; + Slice data = idx_block.Finish(); + + vector v; + v.push_back(data); + Status s = writer_->AddBlock(v, written, "index block"); + if (!s.ok()) { + LOG(ERROR) << "Unable to append level-" << level << " index " + << "block to file"; + return s; + } + + return Status::OK(); +} + +//////////////////////////////////////////////////////////// + + +IndexTreeIterator::IndexTreeIterator(const CFileReader *reader, + const BlockPointer &root_blockptr) + : reader_(reader), + root_block_(root_blockptr) { +} + +Status IndexTreeIterator::SeekAtOrBefore(const Slice &search_key) { + return SeekDownward(search_key, root_block_, 0); +} + +Status IndexTreeIterator::SeekToFirst() { + return SeekToFirstDownward(root_block_, 0); +} + +bool IndexTreeIterator::HasNext() { + for (int i = seeked_indexes_.size() - 1; i >= 0; i--) { + if (seeked_indexes_[i].iter.HasNext()) + return true; + } + return false; +} + +Status IndexTreeIterator::Next() { + CHECK(!seeked_indexes_.empty()) << "not seeked"; + + // Start at the bottom level of the BTree, calling Next(), + // until one succeeds. If any does not succeed, then + // that block is exhausted, and gets removed. + while (!seeked_indexes_.empty()) { + Status s = BottomIter()->Next(); + if (s.IsNotFound()) { + seeked_indexes_.pop_back(); + } else if (s.ok()) { + break; + } else { + // error + return s; + } + } + + // If we're now empty, then the root block was exhausted, + // so we're entirely out of data. + if (seeked_indexes_.empty()) { + return Status::NotFound("end of iterator"); + } + + // Otherwise, the last layer points to the valid + // next block. Propagate downward if it is not a leaf. 
+ while (!BottomReader()->IsLeaf()) { + RETURN_NOT_OK( + LoadBlock(BottomIter()->GetCurrentBlockPointer(), + seeked_indexes_.size())); + RETURN_NOT_OK(BottomIter()->SeekToIndex(0)); + } + + return Status::OK(); +} + +const Slice IndexTreeIterator::GetCurrentKey() const { + return seeked_indexes_.back().iter.GetCurrentKey(); +} + +const BlockPointer &IndexTreeIterator::GetCurrentBlockPointer() const { + return seeked_indexes_.back().iter.GetCurrentBlockPointer(); +} + +IndexBlockIterator *IndexTreeIterator::BottomIter() { + return &seeked_indexes_.back().iter; +} + +IndexBlockReader *IndexTreeIterator::BottomReader() { + return &seeked_indexes_.back().reader; +} + +IndexBlockIterator *IndexTreeIterator::seeked_iter(int depth) { + return &seeked_indexes_[depth].iter; +} + +IndexBlockReader *IndexTreeIterator::seeked_reader(int depth) { + return &seeked_indexes_[depth].reader; +} + +Status IndexTreeIterator::LoadBlock(const BlockPointer &block, int depth) { + + SeekedIndex *seeked; + if (depth < seeked_indexes_.size()) { + // We have a cached instance from previous seek. + seeked = &seeked_indexes_[depth]; + + if (seeked->block_ptr.offset() == block.offset()) { + // We're already seeked to this block - no need to re-parse it. + // This is handy on the root block as well as for the case + // when a lot of requests are traversing down the same part of + // the tree. + return Status::OK(); + } + + // Seeked to a different block: reset the reader + seeked->reader.Reset(); + seeked->iter.Reset(); + } else { + // No cached instance, make a new one. + seeked_indexes_.push_back(new SeekedIndex()); + seeked = &seeked_indexes_.back(); + } + + RETURN_NOT_OK(reader_->ReadBlock(block, CFileReader::CACHE_BLOCK, &seeked->data)); + seeked->block_ptr = block; + + // Parse the new block. 
+ RETURN_NOT_OK(seeked->reader.Parse(seeked->data.data())); + + return Status::OK(); +} + +Status IndexTreeIterator::SeekDownward(const Slice &search_key, const BlockPointer &in_block, + int cur_depth) { + + // Read the block. + RETURN_NOT_OK(LoadBlock(in_block, cur_depth)); + IndexBlockIterator *iter = seeked_iter(cur_depth); + + RETURN_NOT_OK(iter->SeekAtOrBefore(search_key)); + + // If the block is a leaf block, we're done, + // otherwise recurse downward into next layer + // of B-Tree + if (seeked_reader(cur_depth)->IsLeaf()) { + seeked_indexes_.resize(cur_depth + 1); + return Status::OK(); + } else { + return SeekDownward(search_key, iter->GetCurrentBlockPointer(), + cur_depth + 1); + } +} + +Status IndexTreeIterator::SeekToFirstDownward(const BlockPointer &in_block, int cur_depth) { + // Read the block. + RETURN_NOT_OK(LoadBlock(in_block, cur_depth)); + IndexBlockIterator *iter = seeked_iter(cur_depth); + + RETURN_NOT_OK(iter->SeekToIndex(0)); + + // If the block is a leaf block, we're done, + // otherwise recurse downward into next layer + // of B-Tree + if (seeked_reader(cur_depth)->IsLeaf()) { + seeked_indexes_.resize(cur_depth + 1); + return Status::OK(); + } else { + return SeekToFirstDownward(iter->GetCurrentBlockPointer(), cur_depth + 1); + } +} + +IndexTreeIterator *IndexTreeIterator::IndexTreeIterator::Create( + const CFileReader *reader, + const BlockPointer &root_blockptr) { + return new IndexTreeIterator(reader, root_blockptr); +} + + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/index_btree.h b/src/kudu/cfile/index_btree.h new file mode 100644 index 000000000000..fbe48fb735bf --- /dev/null +++ b/src/kudu/cfile/index_btree.h @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_INDEX_BTREE_H +#define KUDU_CFILE_INDEX_BTREE_H + +#include +#include + +#include "kudu/cfile/block_handle.h" +#include "kudu/cfile/cfile.pb.h" +#include "kudu/cfile/index_block.h" +#include "kudu/gutil/macros.h" + +namespace kudu { +namespace cfile { + +using boost::ptr_vector; + +class CFileReader; +class CFileWriter; + +class IndexTreeBuilder { + public: + explicit IndexTreeBuilder( + const WriterOptions *options, + CFileWriter *writer); + + // Append the given key into the index. + // The key is copied into the builder's internal + // memory. + Status Append(const Slice &key, const BlockPointer &block); + Status Finish(BTreeInfoPB *info); + private: + IndexBlockBuilder *CreateBlockBuilder(bool is_leaf); + Status Append(const Slice &key, const BlockPointer &block_ptr, + size_t level); + + // Finish the current block at the given index level, and then + // propagate by inserting this block into the next higher-up + // level index. + Status FinishBlockAndPropagate(size_t level); + + // Finish the current block at the given level, writing it + // to the file. Return the location of the written block + // in 'written'. 
+ Status FinishAndWriteBlock(size_t level, BlockPointer *written); + + const WriterOptions *options_; + CFileWriter *writer_; + + ptr_vector idx_blocks_; + + DISALLOW_COPY_AND_ASSIGN(IndexTreeBuilder); +}; + +class IndexTreeIterator { + public: + explicit IndexTreeIterator( + const CFileReader *reader, + const BlockPointer &root_blockptr); + + Status SeekToFirst(); + Status SeekAtOrBefore(const Slice &search_key); + bool HasNext(); + Status Next(); + + // The slice key at which the iterator + // is currently seeked to. + const Slice GetCurrentKey() const; + const BlockPointer &GetCurrentBlockPointer() const; + + static IndexTreeIterator *Create( + const CFileReader *reader, + const BlockPointer &idx_root); + + private: + IndexBlockIterator *BottomIter(); + IndexBlockReader *BottomReader(); + IndexBlockIterator *seeked_iter(int depth); + IndexBlockReader *seeked_reader(int depth); + Status LoadBlock(const BlockPointer &block, int dept); + Status SeekDownward(const Slice &search_key, const BlockPointer &in_block, + int cur_depth); + Status SeekToFirstDownward(const BlockPointer &in_block, int cur_depth); + + struct SeekedIndex { + SeekedIndex() : + iter(&reader) + {} + + // Hold a copy of the underlying block data, which would + // otherwise go out of scope. The reader and iter + // do not themselves retain the data. + BlockPointer block_ptr; + BlockHandle data; + IndexBlockReader reader; + IndexBlockIterator iter; + }; + + const CFileReader *reader_; + + BlockPointer root_block_; + + ptr_vector seeked_indexes_; + + DISALLOW_COPY_AND_ASSIGN(IndexTreeIterator); +}; + +} // namespace cfile +} // namespace kudu +#endif diff --git a/src/kudu/cfile/mt-bloomfile-test.cc b/src/kudu/cfile/mt-bloomfile-test.cc new file mode 100644 index 000000000000..aafbd7090f85 --- /dev/null +++ b/src/kudu/cfile/mt-bloomfile-test.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/cfile/bloomfile-test-base.h" + +#include + +#include "kudu/util/thread.h" + +DEFINE_int32(benchmark_num_threads, 8, "Number of threads to use for the benchmark"); + +namespace kudu { +namespace cfile { + +class MTBloomFileTest : public BloomFileTestBase { +}; + +#ifdef NDEBUG +TEST_F(MTBloomFileTest, Benchmark) { + ASSERT_NO_FATAL_FAILURE(WriteTestBloomFile()); + ASSERT_OK(OpenBloomFile()); + + vector > threads; + + for (int i = 0; i < FLAGS_benchmark_num_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(Thread::Create("test", strings::Substitute("t$0", i), + boost::bind(&BloomFileTestBase::ReadBenchmark, this), + &new_thread)); + threads.push_back(new_thread); + } + for (scoped_refptr& t : threads) { + t->Join(); + } +} +#endif + +} // namespace cfile +} // namespace kudu diff --git a/src/kudu/cfile/plain_bitmap_block.h b/src/kudu/cfile/plain_bitmap_block.h new file mode 100644 index 000000000000..5ad976202077 --- /dev/null +++ b/src/kudu/cfile/plain_bitmap_block.h @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_PLAIN_BITMAP_BLOCK_H_ +#define KUDU_CFILE_PLAIN_BITMAP_BLOCK_H_ + +#include +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/common/columnblock.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/bit-stream-utils.inline.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/hexdump.h" + +namespace kudu { +namespace cfile { + +struct WriterOptions; + +// +// A plain encoder for the BOOL datatype: stores a column of BOOL values +// as a packed bitmap. +// +class PlainBitMapBlockBuilder : public BlockBuilder { + public: + PlainBitMapBlockBuilder() + : writer_(&buf_) { + Reset(); + } + + virtual bool IsBlockFull(size_t limit) const OVERRIDE { + return writer_.bytes_written() > limit; + } + + virtual int Add(const uint8_t* vals, size_t count) OVERRIDE { + for (const uint8_t* val = vals; + val < vals + count; + ++val) { + // TODO (perf) : doing this one bit a time is probably + // inefficient. 
+ writer_.PutValue(*val, 1); + } + count_ += count; + return count; + } + + virtual Slice Finish(rowid_t ordinal_pos) OVERRIDE { + InlineEncodeFixed32(&buf_[0], count_); + InlineEncodeFixed32(&buf_[4], ordinal_pos); + writer_.Flush(false); + return Slice(buf_); + } + + virtual void Reset() OVERRIDE { + count_ = 0; + writer_.Clear(); + // Reserve space for a header + writer_.PutValue(0xdeadbeef, 32); + writer_.PutValue(0xdeadbeef, 32); + } + + virtual size_t Count() const OVERRIDE { + return count_; + } + + // TODO Implement this method + virtual Status GetFirstKey(void* key) const OVERRIDE { + return Status::NotSupported("BOOL keys not supported"); + } + + private: + faststring buf_; + BitWriter writer_; + size_t count_; +}; + + +// +// Plain decoder for the BOOL datatype +// +class PlainBitMapBlockDecoder : public BlockDecoder { + public: + explicit PlainBitMapBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + num_elems_(0), + ordinal_pos_base_(0), + cur_idx_(0) { + } + + virtual Status ParseHeader() OVERRIDE { + CHECK(!parsed_); + + if (data_.size() < kHeaderSize) { + return Status::Corruption( + "not enough bytes for header in PlainBitMapBlockDecoder"); + } + + num_elems_ = DecodeFixed32(&data_[0]); + ordinal_pos_base_ = DecodeFixed32(&data_[4]); + + if (data_.size() != kHeaderSize + BitmapSize(num_elems_)) { + return Status::Corruption( + strings::Substitute( + "unexpected data size (expected $0 bytes got $1 bytes).\100 bytes: $2", + data_.size(), kHeaderSize + BitmapSize(num_elems_), + HexDump(Slice(data_.data(), + data_.size() < 100 ? 
data_.size() : 100)))); + } + + parsed_ = true; + + reader_ = BitReader(data_.data() + kHeaderSize, data_.size() - kHeaderSize); + + SeekToPositionInBlock(0); + + return Status::OK(); + } + + virtual void SeekToPositionInBlock(uint pos) OVERRIDE { + CHECK(parsed_) << "Must call ParseHeader()"; + + if (PREDICT_FALSE(num_elems_ == 0)) { + DCHECK_EQ(0, pos); + return; + } + + DCHECK_LT(pos, num_elems_); + + reader_.SeekToBit(pos); + + cur_idx_ = pos; + } + + // TODO : Support BOOL keys + virtual Status SeekAtOrAfterValue(const void *value, + bool *exact_match) OVERRIDE { + return Status::NotSupported("BOOL keys are not supported!"); + } + + virtual Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE { + DCHECK(parsed_); + DCHECK_LE(*n, dst->nrows()); + DCHECK_EQ(dst->stride(), sizeof(bool)); + + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t bits_to_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + size_t remaining = bits_to_fetch; + uint8_t* data_ptr = dst->data(); + // TODO : do this a word/byte at a time as opposed bit at a time + while (remaining > 0) { + bool result = reader_.GetValue(1, data_ptr); + DCHECK(result); + remaining--; + data_ptr++; + } + + cur_idx_ += bits_to_fetch; + *n = bits_to_fetch; + + return Status::OK(); + } + + virtual bool HasNext() const OVERRIDE { return cur_idx_ < num_elems_; } + + virtual size_t Count() const OVERRIDE { return num_elems_; } + + virtual size_t GetCurrentIndex() const OVERRIDE { return cur_idx_; } + + virtual rowid_t GetFirstRowId() const OVERRIDE { return ordinal_pos_base_; } + + private: + enum { + kHeaderSize = 8 + }; + + Slice data_; + bool parsed_; + uint32_t num_elems_; + rowid_t ordinal_pos_base_; + uint32_t cur_idx_; + BitReader reader_; +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/plain_block.h b/src/kudu/cfile/plain_block.h new file mode 100644 index 000000000000..7b40bd59f79d --- /dev/null +++ 
b/src/kudu/cfile/plain_block.h @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_PLAIN_BLOCK_H +#define KUDU_CFILE_PLAIN_BLOCK_H + +#include +#include + +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/common/columnblock.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/hexdump.h" + +namespace kudu { +namespace cfile { + +template +inline Type Decode(const uint8_t *ptr) { + Type result; + memcpy(&result, ptr, sizeof(result)); + return result; +} + +static const size_t kPlainBlockHeaderSize = sizeof(uint32_t) * 2; + +// +// A plain encoder for generic fixed size data types. +// +template +class PlainBlockBuilder : public BlockBuilder { + public: + explicit PlainBlockBuilder(const WriterOptions *options) + : options_(options) { + // Reserve enough space for the block, plus a bit of slop since + // we often overrun the block by a few values. 
+ buffer_.reserve(kPlainBlockHeaderSize + options_->storage_attributes.cfile_block_size + 1024); + Reset(); + } + + virtual int Add(const uint8_t *vals_void, size_t count) OVERRIDE { + int old_size = buffer_.size(); + buffer_.resize(old_size + count * kCppTypeSize); + memcpy(&buffer_[old_size], vals_void, count * kCppTypeSize); + count_ += count; + return count; + } + + virtual bool IsBlockFull(size_t limit) const OVERRIDE { + return buffer_.size() > limit; + } + + virtual Slice Finish(rowid_t ordinal_pos) OVERRIDE { + InlineEncodeFixed32(&buffer_[0], count_); + InlineEncodeFixed32(&buffer_[4], ordinal_pos); + return Slice(buffer_); + } + + virtual void Reset() OVERRIDE { + count_ = 0; + buffer_.clear(); + buffer_.resize(kPlainBlockHeaderSize); + } + + virtual size_t Count() const OVERRIDE { + return count_; + } + + virtual Status GetFirstKey(void *key) const OVERRIDE { + DCHECK_GT(count_, 0); + *reinterpret_cast(key) = Decode(&buffer_[kPlainBlockHeaderSize]); + return Status::OK(); + } + + private: + faststring buffer_; + const WriterOptions *options_; + size_t count_; + typedef typename TypeTraits::cpp_type CppType; + enum { + kCppTypeSize = TypeTraits::size + }; + +}; + +// +// A plain decoder for generic fixed size data types. +// +template +class PlainBlockDecoder : public BlockDecoder { + public: + explicit PlainBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + num_elems_(0), + ordinal_pos_base_(0), + cur_idx_(0) { + } + + virtual Status ParseHeader() OVERRIDE { + CHECK(!parsed_); + + if (data_.size() < kPlainBlockHeaderSize) { + return Status::Corruption( + "not enough bytes for header in PlainBlockDecoder"); + } + + num_elems_ = DecodeFixed32(&data_[0]); + ordinal_pos_base_ = DecodeFixed32(&data_[4]); + + if (data_.size() != kPlainBlockHeaderSize + num_elems_ * size_of_type) { + return Status::Corruption( + string("unexpected data size. ") + "\nFirst 100 bytes: " + + HexDump( + Slice(data_.data(), + (data_.size() < 100 ? 
data_.size() : 100)))); + } + + parsed_ = true; + + SeekToPositionInBlock(0); + + return Status::OK(); + } + + virtual void SeekToPositionInBlock(uint pos) OVERRIDE { + CHECK(parsed_) << "Must call ParseHeader()"; + + if (PREDICT_FALSE(num_elems_ == 0)) { + DCHECK_EQ(0, pos); + return; + } + + DCHECK_LE(pos, num_elems_); + cur_idx_ = pos; + } + + virtual Status SeekAtOrAfterValue(const void *value, bool *exact_match) OVERRIDE { + DCHECK(value != NULL); + + const CppType &target = *reinterpret_cast(value); + + uint32_t left = 0; + uint32_t right = num_elems_; + while (left != right) { + uint32_t mid = (left + right) / 2; + CppType mid_key = Decode( + &data_[kPlainBlockHeaderSize + mid * size_of_type]); + // assumes CppType has an implementation of operator<() + if (mid_key < target) { + left = mid + 1; + } else if (mid_key > target) { + right = mid; + } else { + cur_idx_ = mid; + *exact_match = true; + return Status::OK(); + } + } + + *exact_match = false; + cur_idx_ = left; + if (cur_idx_ == num_elems_) { + return Status::NotFound("after last key in block"); + } + + return Status::OK(); + } + + virtual Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE { + DCHECK(parsed_); + DCHECK_LE(*n, dst->nrows()); + DCHECK_EQ(dst->stride(), sizeof(CppType)); + + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t max_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + memcpy(dst->data(), + &data_[kPlainBlockHeaderSize + cur_idx_ * size_of_type], + max_fetch * size_of_type); + cur_idx_ += max_fetch; + *n = max_fetch; + return Status::OK(); + } + + virtual bool HasNext() const OVERRIDE { + return cur_idx_ < num_elems_; + } + + virtual size_t Count() const OVERRIDE { + return num_elems_; + } + + virtual size_t GetCurrentIndex() const OVERRIDE { + return cur_idx_; + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + return ordinal_pos_base_; + } + + private: + + Slice data_; + bool parsed_; + uint32_t 
num_elems_; + rowid_t ordinal_pos_base_; + uint32_t cur_idx_; + typedef typename TypeTraits::cpp_type CppType; + enum { + size_of_type = TypeTraits::size + }; + +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/rle_block.h b/src/kudu/cfile/rle_block.h new file mode 100644 index 000000000000..0a4ac0d2ecef --- /dev/null +++ b/src/kudu/cfile/rle_block.h @@ -0,0 +1,423 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CFILE_RLE_BLOCK_H +#define KUDU_CFILE_RLE_BLOCK_H + +#include +#include + +#include "kudu/gutil/port.h" +#include "kudu/cfile/block_encodings.h" +#include "kudu/common/columnblock.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/bit-stream-utils.inline.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/rle-encoding.h" + + +namespace kudu { +namespace cfile { + +struct WriterOptions; + +enum { + kRleBitmapBlockHeaderSize = 8 +}; + +// +// RLE encoder for the BOOL datatype: uses an RLE-encoded bitmap to +// represent a bool column. 
+// +class RleBitMapBlockBuilder : public BlockBuilder { + public: + RleBitMapBlockBuilder() + : encoder_(&buf_, 1) { + Reset(); + } + + virtual int Add(const uint8_t* vals, size_t count) OVERRIDE { + for (const uint8_t* val = vals; + val < vals + count; + ++val) { + // TODO (perf) : doing this one bit a time is probably + // inefficient. + encoder_.Put(*val, 1); + } + count_ += count; + return count; + } + + virtual bool IsBlockFull(size_t limit) const OVERRIDE { + return encoder_.len() > limit; + } + + virtual Slice Finish(rowid_t ordinal_pos) OVERRIDE { + InlineEncodeFixed32(&buf_[0], count_); + InlineEncodeFixed32(&buf_[4], ordinal_pos); + encoder_.Flush(); + return Slice(buf_); + } + + virtual void Reset() OVERRIDE { + count_ = 0; + encoder_.Clear(); + encoder_.Reserve(kRleBitmapBlockHeaderSize, 0); + } + + virtual size_t Count() const OVERRIDE { + return count_; + } + + // TODO Implement this method + virtual Status GetFirstKey(void* key) const OVERRIDE { + return Status::NotSupported("BOOL keys not supported"); + } + + private: + faststring buf_; + RleEncoder encoder_; + size_t count_; +}; + +// +// RLE decoder for bool datatype +// +class RleBitMapBlockDecoder : public BlockDecoder { + public: + explicit RleBitMapBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + num_elems_(0), + ordinal_pos_base_(0), + cur_idx_(0) { + } + + virtual Status ParseHeader() OVERRIDE { + CHECK(!parsed_); + + if (data_.size() < kRleBitmapBlockHeaderSize) { + return Status::Corruption( + "not enough bytes for header in RleBitMapBlockDecoder"); + } + + num_elems_ = DecodeFixed32(&data_[0]); + ordinal_pos_base_ = DecodeFixed32(&data_[4]); + + parsed_ = true; + + rle_decoder_ = RleDecoder(data_.data() + kRleBitmapBlockHeaderSize, + data_.size() - kRleBitmapBlockHeaderSize, 1); + + SeekToPositionInBlock(0); + + return Status::OK(); + } + + virtual void SeekToPositionInBlock(uint pos) OVERRIDE { + CHECK(parsed_) << "Must call ParseHeader()"; + + if (cur_idx_ == 
pos) { + // No need to seek. + return; + } else if (cur_idx_ < pos) { + uint nskip = pos - cur_idx_; + rle_decoder_.Skip(nskip); + } else { + // This approach is also used by CFileReader to + // seek backwards in an RLE encoded block + rle_decoder_ = RleDecoder(data_.data() + kRleBitmapBlockHeaderSize, + data_.size() - kRleBitmapBlockHeaderSize, 1); + rle_decoder_.Skip(pos); + } + cur_idx_ = pos; + } + + virtual Status CopyNextValues(size_t *n, ColumnDataView* dst) OVERRIDE { + DCHECK(parsed_); + + DCHECK_LE(*n, dst->nrows()); + DCHECK_EQ(dst->stride(), sizeof(bool)); + + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t bits_to_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + size_t remaining = bits_to_fetch; + uint8_t* data_ptr = dst->data(); + // TODO : do this a word/byte at a time as opposed bit at a time + while (remaining > 0) { + bool result = rle_decoder_.Get(reinterpret_cast(data_ptr)); + DCHECK(result); + remaining--; + data_ptr++; + } + + cur_idx_ += bits_to_fetch; + *n = bits_to_fetch; + + return Status::OK(); + } + + virtual Status SeekAtOrAfterValue(const void *value, + bool *exact_match) OVERRIDE { + return Status::NotSupported("BOOL keys are not supported!"); + } + + virtual bool HasNext() const OVERRIDE { return cur_idx_ < num_elems_; } + + virtual size_t Count() const OVERRIDE { return num_elems_; } + + virtual size_t GetCurrentIndex() const OVERRIDE { return cur_idx_; } + + virtual rowid_t GetFirstRowId() const OVERRIDE { return ordinal_pos_base_; } + + private: + Slice data_; + bool parsed_; + uint32_t num_elems_; + rowid_t ordinal_pos_base_; + uint32_t cur_idx_; + RleDecoder rle_decoder_; +}; + +// +// RLE builder for generic integer types. What is missing is some way +// to enforce that this can only be instantiated for INT types. 
+// TODO : consider if this can also be used for BOOL with only minor +// alterations +template +class RleIntBlockBuilder : public BlockBuilder { + public: + explicit RleIntBlockBuilder(const WriterOptions* opts = NULL) + : rle_encoder_(&buf_, kCppTypeSize * 8) { + Reset(); + } + + virtual bool IsBlockFull(size_t limit) const OVERRIDE { + return rle_encoder_.len() > limit; + } + + virtual int Add(const uint8_t* vals_void, size_t count) OVERRIDE { + if (PREDICT_FALSE(count_ == 0)) { + first_key_ = *reinterpret_cast(vals_void); + } + const CppType* vals = reinterpret_cast(vals_void); + for (size_t i = 0; i < count; ++i) { + rle_encoder_.Put(vals[i], 1); + } + count_ += count; + return count; + } + + virtual Slice Finish(rowid_t ordinal_pos) OVERRIDE { + InlineEncodeFixed32(&buf_[0], count_); + InlineEncodeFixed32(&buf_[4], ordinal_pos); + rle_encoder_.Flush(); + return Slice(buf_); + } + + virtual void Reset() OVERRIDE { + count_ = 0; + rle_encoder_.Clear(); + rle_encoder_.Reserve(kRleBitmapBlockHeaderSize, 0); + } + + virtual size_t Count() const OVERRIDE { + return count_; + } + + virtual Status GetFirstKey(void* key) const OVERRIDE { + if (count_ > 0) { + *reinterpret_cast(key) = first_key_; + return Status::OK(); + } + return Status::NotFound("No keys in the block"); + } + + private: + typedef typename TypeTraits::cpp_type CppType; + + enum { + kCppTypeSize = TypeTraits::size + }; + + CppType first_key_; + faststring buf_; + size_t count_; + RleEncoder rle_encoder_; +}; + +// +// RLE decoder for generic integer types. +// +// TODO : as with the matching BlockBuilder above (see comments for +// that class), it may be be possible to re-use most of the +// code here for the BOOL type. 
+// +template +class RleIntBlockDecoder : public BlockDecoder { + public: + explicit RleIntBlockDecoder(Slice slice) + : data_(std::move(slice)), + parsed_(false), + num_elems_(0), + ordinal_pos_base_(0), + cur_idx_(0) { + } + + virtual Status ParseHeader() OVERRIDE { + CHECK(!parsed_); + + if (data_.size() < kRleBitmapBlockHeaderSize) { + return Status::Corruption( + "not enough bytes for header in RleIntBlockDecoder"); + } + + num_elems_ = DecodeFixed32(&data_[0]); + ordinal_pos_base_ = DecodeFixed32(&data_[4]); + + parsed_ = true; + + rle_decoder_ = RleDecoder(data_.data() + kRleBitmapBlockHeaderSize, + data_.size() - kRleBitmapBlockHeaderSize, + kCppTypeSize * 8); + + SeekToPositionInBlock(0); + + return Status::OK(); + } + + virtual void SeekToPositionInBlock(uint pos) OVERRIDE { + CHECK(parsed_) << "Must call ParseHeader()"; + CHECK_LT(pos, num_elems_) + << "Tried to seek to " << pos << " which is >= number of elements (" + << num_elems_ << ") in the block!."; + + if (cur_idx_ == pos) { + // No need to seek. + return; + } else if (cur_idx_ < pos) { + uint nskip = pos - cur_idx_; + rle_decoder_.Skip(nskip); + } else { + rle_decoder_ = RleDecoder(data_.data() + kRleBitmapBlockHeaderSize, + data_.size() - kRleBitmapBlockHeaderSize, + kCppTypeSize * 8); + rle_decoder_.Skip(pos); + } + cur_idx_ = pos; + } + + virtual Status SeekAtOrAfterValue(const void *value_void, bool *exact_match) OVERRIDE { + // Currently using linear search as we do not check whether a + // mid-point of a buffer will fall on a literal or not. + // + // TODO (perf): make this faster by moving forward a 'run at a time' + // by perhaps pushing this loop down into RleDecoder itself + // TODO (perf): investigate placing pointers somewhere in either the + // header or the tail to speed up search. 
+ + SeekToPositionInBlock(0); + + CppType target = *reinterpret_cast(value_void); + + while (cur_idx_ < num_elems_) { + CppType cur_elem; + if (!rle_decoder_.Get(&cur_elem)) { + break; + } + if (cur_elem == target) { + rle_decoder_.RewindOne(); + *exact_match = true; + return Status::OK(); + } + if (cur_elem > target) { + rle_decoder_.RewindOne(); + *exact_match = false; + return Status::OK(); + } + cur_idx_++; + } + + return Status::NotFound("not in block"); + } + + virtual Status CopyNextValues(size_t *n, ColumnDataView *dst) OVERRIDE { + DCHECK(parsed_); + + DCHECK_LE(*n, dst->nrows()); + DCHECK_EQ(dst->stride(), sizeof(CppType)); + + if (PREDICT_FALSE(*n == 0 || cur_idx_ >= num_elems_)) { + *n = 0; + return Status::OK(); + } + + size_t to_fetch = std::min(*n, static_cast(num_elems_ - cur_idx_)); + size_t remaining = to_fetch; + uint8_t* data_ptr = dst->data(); + while (remaining > 0) { + bool result = rle_decoder_.Get(reinterpret_cast(data_ptr)); + DCHECK(result); + remaining--; + data_ptr += kCppTypeSize; + } + + cur_idx_ += to_fetch; + *n = to_fetch; + return Status::OK(); + } + + virtual bool HasNext() const OVERRIDE { + return cur_idx_ < num_elems_; + } + + virtual size_t Count() const OVERRIDE { + return num_elems_; + } + + virtual size_t GetCurrentIndex() const OVERRIDE { + return cur_idx_; + } + + virtual rowid_t GetFirstRowId() const OVERRIDE { + return ordinal_pos_base_; + }; + private: + typedef typename TypeTraits::cpp_type CppType; + + enum { + kCppTypeSize = TypeTraits::size + }; + + Slice data_; + bool parsed_; + uint32_t num_elems_; + rowid_t ordinal_pos_base_; + size_t cur_idx_; + RleDecoder rle_decoder_; +}; + +} // namespace cfile +} // namespace kudu + +#endif diff --git a/src/kudu/cfile/type_encodings.cc b/src/kudu/cfile/type_encodings.cc new file mode 100644 index 000000000000..63bb0ed05184 --- /dev/null +++ b/src/kudu/cfile/type_encodings.cc @@ -0,0 +1,331 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/cfile/type_encodings.h" + +#include +#include +#include + +#include + +#include "kudu/cfile/bshuf_block.h" +#include "kudu/cfile/gvint_block.h" +#include "kudu/cfile/plain_bitmap_block.h" +#include "kudu/cfile/plain_block.h" +#include "kudu/cfile/rle_block.h" +#include "kudu/cfile/binary_dict_block.h" +#include "kudu/cfile/binary_plain_block.h" +#include "kudu/cfile/binary_prefix_block.h" +#include "kudu/common/types.h" +#include "kudu/gutil/strings/substitute.h" + +namespace kudu { +namespace cfile { + +using std::unordered_map; +using std::shared_ptr; + + +template +struct DataTypeEncodingTraits {}; + +// Instantiate this template to get static access to the type traits. +template struct TypeEncodingTraits + : public DataTypeEncodingTraits { + + static const DataType type = Type; + static const EncodingType encoding_type = Encoding; +}; + +// Generic, fallback, partial specialization that should work for all +// fixed size types. 
+template +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new PlainBlockBuilder(options); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new PlainBlockDecoder(slice); + return Status::OK(); + } +}; + +// Generic, fallback, partial specialization that should work for all +// fixed size types. +template +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new BShufBlockBuilder(options); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new BShufBlockDecoder(slice); + return Status::OK(); + } +}; + +// Template specialization for plain encoded string as they require a +// specific encoder/decoder. +template<> +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new BinaryPlainBlockBuilder(options); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new BinaryPlainBlockDecoder(slice); + return Status::OK(); + } +}; + +// Template specialization for packed bitmaps +template<> +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new PlainBitMapBlockBuilder(); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new PlainBitMapBlockDecoder(slice); + return Status::OK(); + } +}; + + +// Template specialization for RLE encoded bitmaps +template<> +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder** bb, const WriterOptions *options) { + *bb = new RleBitMapBlockBuilder(); + return Status::OK(); + } + + 
static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new RleBitMapBlockDecoder(slice); + return Status::OK(); + } +}; + +// Template specialization for plain encoded string as they require a +// specific encoder \/decoder. +template<> +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new BinaryPrefixBlockBuilder(options); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new BinaryPrefixBlockDecoder(slice); + return Status::OK(); + } +}; + +// Template for dictionary encoding +template<> +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new BinaryDictBlockBuilder(options); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new BinaryDictBlockDecoder(slice, iter); + return Status::OK(); + } +}; + + +// Optimized grouping variable encoding for 32bit unsigned integers +template<> +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) { + *bb = new GVIntBlockBuilder(options); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) { + *bd = new GVIntBlockDecoder(slice); + return Status::OK(); + } +}; + +template +struct DataTypeEncodingTraits { + + static Status CreateBlockBuilder(BlockBuilder** bb, const WriterOptions *options) { + *bb = new RleIntBlockBuilder(); + return Status::OK(); + } + + static Status CreateBlockDecoder(BlockDecoder** bd, const Slice& slice, + CFileIterator *iter) { + *bd = new RleIntBlockDecoder(slice); + return Status::OK(); + } +}; + + +template +TypeEncodingInfo::TypeEncodingInfo(TypeEncodingTraitsClass t) + : 
encoding_type_(TypeEncodingTraitsClass::encoding_type), + create_builder_func_(TypeEncodingTraitsClass::CreateBlockBuilder), + create_decoder_func_(TypeEncodingTraitsClass::CreateBlockDecoder) { +} + +Status TypeEncodingInfo::CreateBlockDecoder(BlockDecoder **bd, + const Slice &slice, + CFileIterator *iter) const { + return create_decoder_func_(bd, slice, iter); +} + +Status TypeEncodingInfo::CreateBlockBuilder( + BlockBuilder **bb, const WriterOptions *options) const { + return create_builder_func_(bb, options); +} + +struct EncodingMapHash { + size_t operator()(pair pair) const { + return (pair.first + 31) ^ pair.second; + } +}; + +// A resolver for encodings, keeps all the allowed type<->encoding +// combinations. The first combination to be added to the map +// becomes the default encoding for the type. +class TypeEncodingResolver { + public: + Status GetTypeEncodingInfo(DataType t, EncodingType e, + const TypeEncodingInfo** out) { + if (e == AUTO_ENCODING) { + e = GetDefaultEncoding(t); + } + const TypeEncodingInfo *type_info = mapping_[make_pair(t, e)].get(); + if (PREDICT_FALSE(type_info == nullptr)) { + return Status::NotSupported( + strings::Substitute("Unsupported type/encoding pair: $0, $1", + DataType_Name(t), + EncodingType_Name(e))); + } + *out = type_info; + return Status::OK(); + } + + const EncodingType GetDefaultEncoding(DataType t) { + return default_mapping_[t]; + } + + // Add the encoding mappings + // the first encoder/decoder to be + // added to the mapping becomes the default + // + // TODO: Fix/work around the issue with RLE/BitWriter which + // (currently) makes it impossible to use RLE with + // 64-bit int types. 
+ private: + TypeEncodingResolver() { + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + } + + template void AddMapping() { + TypeEncodingTraits traits; + pair encoding_for_type = make_pair(type, encoding); + if (mapping_.find(encoding_for_type) == mapping_.end()) { + default_mapping_.insert(make_pair(type, encoding)); + } + mapping_.insert( + make_pair(make_pair(type, encoding), + shared_ptr(new TypeEncodingInfo(traits)))); + } + + unordered_map, + shared_ptr, + EncodingMapHash > mapping_; + + unordered_map > default_mapping_; + + friend class Singleton; + DISALLOW_COPY_AND_ASSIGN(TypeEncodingResolver); +}; + +Status TypeEncodingInfo::Get(const TypeInfo* typeinfo, + EncodingType encoding, + const TypeEncodingInfo** out) { + return Singleton::get()->GetTypeEncodingInfo(typeinfo->physical_type(), + encoding, + out); +} + +const EncodingType TypeEncodingInfo::GetDefaultEncoding(const TypeInfo* typeinfo) { + return Singleton::get()->GetDefaultEncoding(typeinfo->physical_type()); +} + +} // namespace cfile +} // namespace kudu + diff --git a/src/kudu/cfile/type_encodings.h b/src/kudu/cfile/type_encodings.h new file mode 100644 index 000000000000..3a712d74bddf --- /dev/null +++ b/src/kudu/cfile/type_encodings.h @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CFILE_TYPE_ENCODINGS_H_ +#define KUDU_CFILE_TYPE_ENCODINGS_H_ + +#include "kudu/common/common.pb.h" +#include "kudu/util/status.h" + +namespace kudu { +class TypeInfo; + +namespace cfile { +class BlockBuilder; +class BlockDecoder; +class CFileReader; +class CFileIterator; +struct WriterOptions; + +// Runtime Information for type encoding/decoding +// including the ability to build BlockDecoders and BlockBuilders +// for each supported encoding +// Mimicked after common::TypeInfo et al +class TypeEncodingInfo { + public: + + static Status Get(const TypeInfo* typeinfo, EncodingType encoding, const TypeEncodingInfo** out); + + static const EncodingType GetDefaultEncoding(const TypeInfo* typeinfo); + + EncodingType encoding_type() const { return encoding_type_; } + + Status CreateBlockBuilder(BlockBuilder **bb, const WriterOptions *options) const; + + // Create a BlockDecoder. Sets *bd to the newly created decoder, + // if successful, otherwise returns a non-OK Status. 
+ // + // iter parameter will only be used when it is dictionary encoding + Status CreateBlockDecoder(BlockDecoder **bd, const Slice &slice, + CFileIterator *iter) const; + private: + + friend class TypeEncodingResolver; + template TypeEncodingInfo(TypeEncodingTraitsClass t); + + EncodingType encoding_type_; + + typedef Status (*CreateBlockBuilderFunc)(BlockBuilder **, const WriterOptions *); + const CreateBlockBuilderFunc create_builder_func_; + + typedef Status (*CreateBlockDecoderFunc)(BlockDecoder **, const Slice &, + CFileIterator *); + const CreateBlockDecoderFunc create_decoder_func_; + + DISALLOW_COPY_AND_ASSIGN(TypeEncodingInfo); +}; + + +} // namespace cfile +} // namespace kudu + +#endif /* KUDU_CFILE_TYPE_ENCODINGS_H_ */ diff --git a/src/kudu/client/CMakeLists.txt b/src/kudu/client/CMakeLists.txt new file mode 100644 index 000000000000..4ebae41358f8 --- /dev/null +++ b/src/kudu/client/CMakeLists.txt @@ -0,0 +1,238 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set(CLIENT_SRCS + batcher.cc + client.cc + client_builder-internal.cc + client-internal.cc + error_collector.cc + error-internal.cc + meta_cache.cc + scan_batch.cc + scan_predicate.cc + scanner-internal.cc + session-internal.cc + schema.cc + table-internal.cc + table_alterer-internal.cc + table_creator-internal.cc + tablet_server-internal.cc + value.cc + write_op.cc +) + +set(CLIENT_LIBS + kudu_common + master_proto + master_rpc + tserver_proto + tserver_service_proto + kudu_util + gutil + krpc) + +# Make sure we exclude tcmalloc from the exported library; we want the library +# code to use the linking application's malloc implementation. +set(EXPORTED_CLIENT_LIBS + ${CLIENT_LIBS} + ${KUDU_BASE_LIBS}) +list(REMOVE_ITEM EXPORTED_CLIENT_LIBS tcmalloc profiler) + +# We customize the output name/directory of the exported library so that we can +# call it "kudu_client" without it colliding with the regular library. +# +# Unfortunately, this doesn't extend to the autogenerated cmake files that ship +# with the exported library; they still hew to the original target name of +# "kudu_client_exported". +ADD_EXPORTABLE_LIBRARY(kudu_client + SRCS ${CLIENT_SRCS} + DEPS ${CLIENT_LIBS} + EXPORTED_SHARED + EXPORTED_OUTPUT_NAME kudu_client + EXPORTED_OUTPUT_DIRECTORY "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/exported" + EXPORTED_DEPS ${EXPORTED_CLIENT_LIBS}) + +################################################################################ +# LIBRARY VERSIONS +################################################################################ +# +# The external version of the exported Kudu client library. It is a separate +# entity from the main Kudu version so that the library and backends can evolve +# independently. +# +# Library versions affect packaging and control how the runtime linker finds the +# actual library file. The library's SOVERSION (i.e. major version) is typically +# appended to its package name and to the library filename, which means: +# +# 1. 
Multiple packages of the different major versions may be installed. +# 2. An application must link against a specific major version of the library. +# +# The minor and patch versions are purely informational; changes to them have no +# effect on runtime linking or on packages. +# +# The major version should only be incremented when an incompatible change is +# made to the library's ABI (i.e. interface to the application). It is not to be +# taken lightly because it has far reaching effects. Besides forcing an +# application to recompile (or worse, rewrite some code), it also affects +# packages. For example, package names typically change after a major version +# bump (e.g. libkuduclient0 becomes libkuduclient1). +# +# The minor and patch versions can be incremented freely, and should be +# incremented when significant (but not incompatible) or insignificant changes +# are made respectively. +# +# For a detailed explanation of the kinds of C++ changes that would break ABI +# compatibility, see: +# +# https://techbase.kde.org/Policies/Binary_Compatibility_Issues_With_C++ +# +# For more background on library versions, see: +# +# https://autotools.io/libtool/version.html +set(CLIENT_VERSION_MAJOR 0) +set(CLIENT_VERSION_MINOR 1) +set(CLIENT_VERSION_PATCH 0) + +if(NOT APPLE) + # Localize thirdparty symbols using a linker version script. This hides them + # from the client application. The OS X linker does not support the + # version-script option. + set(LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map") +endif() + +set_target_properties(kudu_client_exported + PROPERTIES + LINK_FLAGS "${LINK_FLAGS}" + # This version is appended to the filename. + # + # For example: libkudu_client.so.1.2.3 + VERSION "${CLIENT_VERSION_MAJOR}.${CLIENT_VERSION_MINOR}.${CLIENT_VERSION_PATCH}" + # This version is used to create two library symlinks. 
+ # + # For example: libkudu_client.so -> libkudu_client.so.1 + # libkudu_client.so.1 -> libkudu_client.so.1.2.3 + # + # The runtime linker is expected to look up a library by name and SOVERSION. + # In the previous example, that means it'll look up a file with name + # libkudu_client.so.1. If the library is installed, that file should be a + # symlink to the actual library file libkudu_client.so.1.2.3. + SOVERSION "${CLIENT_VERSION_MAJOR}") + +# Generate kudu_export.h. +generate_export_header(kudu_client_exported + BASE_NAME kudu + EXPORT_FILE_NAME ${CMAKE_BINARY_DIR}/src/kudu/util/kudu_export.h) + +# "make install" invocations to generate a directory tree containing the +# exported client library and all of its headers. + +# For CMAKE_INSTALL_ variables. +include(GNUInstallDirs) + +# Shared library. +install(TARGETS kudu_client_exported + EXPORT kudu_client_export_set + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +# Headers: client +install(FILES + callbacks.h + client.h + row_result.h + scan_batch.h + scan_predicate.h + schema.h + shared_ptr.h + stubs.h + value.h + write_op.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/kudu/client) + +# Headers: common +install(FILES + ../common/partial_row.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/kudu/common) + +# Headers: util +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/../util/kudu_export.h + ../util/monotime.h + ../util/slice.h + ../util/status.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/kudu/util) + +# Client sample code. +# +# Can't use CMAKE_INSTALL_DOCDIR because we don't ever call project(). +install(FILES + samples/CMakeLists.txt + samples/sample.cc + DESTINATION ${CMAKE_INSTALL_DATADIR}/doc/kuduClient/samples) + +# Exported cmake file for just the library's targets. +# +# Should not be included directly by users. +install(EXPORT kudu_client_export_set + FILE kuduClientTargets.cmake + DESTINATION ${CMAKE_INSTALL_DATADIR}/kuduClient/cmake) + +# Exported cmake file for the library. 
+# +# This is the main cmake entry point and should be included directly. +include(CMakePackageConfigHelpers) +configure_package_config_file(clientConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/clientConfig.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_DATADIR}/kuduClient/cmake + PATH_VARS CMAKE_INSTALL_INCLUDEDIR) +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/clientConfig.cmake + DESTINATION ${CMAKE_INSTALL_DATADIR}/kuduClient/cmake + RENAME kuduClientConfig.cmake) + +# There's no way to rename a logical cmake exported target, so we use this +# script to forcefully change the kudu_client_exported target to kudu_client. +install(SCRIPT MungeExportedInstallTargets.cmake) + +# Test utility library + +# This code is useful for other tests which use the client, but isn't +# part of the client itself (ie we don't want to ship it to customers, +# and therefore don't need to worry about export strictness) +add_library(kudu_client_test_util + client-test-util.cc) +target_link_libraries(kudu_client_test_util + gmock + kudu_client) + +# Tests + +# The OS X system compiler does not support source symbol maps, so the client +# leaks internal symbols globally. +# +# Coverage builds insert gcov-related symbols into the client library. We +# don't ship such builds, so there's no point in checking symbol visibility. +if (NOT APPLE AND NOT "${KUDU_GENERATE_COVERAGE}") + ADD_KUDU_TEST(client_symbol-test.sh LABELS no_dist_test) +endif() + +# The samples are never built with ASAN/TSAN. 
+if(NOT "${KUDU_USE_ASAN}" AND NOT "${KUDU_USE_TSAN}") + ADD_KUDU_TEST(client_samples-test.sh RUN_SERIAL true LABELS no_dist_test) +endif() +set(KUDU_TEST_LINK_LIBS kudu_client integration-tests ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(client-test) +ADD_KUDU_TEST(client-unittest) diff --git a/src/kudu/client/MungeExportedInstallTargets.cmake b/src/kudu/client/MungeExportedInstallTargets.cmake new file mode 100644 index 000000000000..b5e4060d79c4 --- /dev/null +++ b/src/kudu/client/MungeExportedInstallTargets.cmake @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Finds all Kudu client cmake installation files and replaces all references +# to kudu_client_exported with kudu_client, thus renaming the targets. 
+ +set(CMAKE_FILES_DIR "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/share/kuduClient/cmake") +if(NOT EXISTS ${CMAKE_FILES_DIR}) + message(FATAL_ERROR "Cannot find cmake installation directory ${CMAKE_FILES_DIR}") +endif() +file(GLOB CMAKE_FILES "${CMAKE_FILES_DIR}/*.cmake") +foreach(CMAKE_FILE ${CMAKE_FILES}) + message(STATUS "Munging kudu client targets in ${CMAKE_FILE}") + execute_process(COMMAND sed s/kudu_client_exported/kudu_client/g ${CMAKE_FILE} + OUTPUT_FILE ${CMAKE_FILE}.new) + execute_process(COMMAND mv -f ${CMAKE_FILE}.new ${CMAKE_FILE}) +endforeach() diff --git a/src/kudu/client/README b/src/kudu/client/README new file mode 100644 index 000000000000..33bcf50ed8c5 --- /dev/null +++ b/src/kudu/client/README @@ -0,0 +1,132 @@ +// -*- mode: c++ -*- +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +/* + +This file contains some example code for the C++ client. It will +probably be eventually removed in favor of actual runnable examples, +but serves as a guide/docs for the client API design for now. + +See class docs for KuduClient, KuduSession, KuduTable for proper docs. +*/ + +// This is an example of explicit batching done by the client. 
+// This would be used in contexts like interactive webapps, where +// you are likely going to set a short timeout. +void ExplicitBatchingExample() { + // Get a reference to the tablet we want to insert into. + // Note that this may be done without a session, either before or + // after creating a session, since a session isn't tied to any + // particular table or set of tables. + scoped_refptr t; + CHECK_OK(client_->OpenTable("my_table", &t)); + + // Create a new session. All data-access operations must happen through + // a session. + shared_ptr session(client->NewSession()); + + // Setting flush mode to MANUAL_FLUSH makes the session accumulate + // all operations until the next Flush() call. This is sort of like + // TCP_CORK. + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // Insert 100 rows. + for (int i = 0; i < 100; i++) { + gscoped_ptr ins = t->NewInsert(); + ins->mutable_row()->SetInt64("key", i); + ins->mutable_row()->SetInt64("val", i * 2); + // The insert should return immediately after moving the insert + // into the appropriate buffers. This always returns OK unless the + // Insert itself is invalid (eg missing a key column). + CHECK_OK(session->Apply(ins.Pass())); + } + + // Update a row. + gscoped_ptr upd = t->NewUpdate(); + upd->mutable_row()->SetInt64("key", 1); + upd->mutable_row()->SetInt64("val", 1 * 2 + 1); + + // Delete a row. + gscoped_ptr del = t->NewDelete(); + del->mutable_row()->SetInt64("key", 2); // only specify key. + + // Setting a timeout on the session applies to the next Flush call. + session->SetTimeoutMillis(300); + + // After accumulating all of the stuff in the batch, call Flush() + // to send the updates in one go. This may be done either sync or async. + // Sync API example: + { + // Returns an Error if any insert in the batch had an issue. + CHECK_OK(session->Flush()); + // Call session->GetPendingErrors() to get errors. 
+ } + + // Async API example: + { + // Returns immediately, calls Callback when either success or failure. + CHECK_OK(session->FlushAsync(MyCallback)); + // TBD: should you be able to use the same session before the Callback has + // been called? Or require that you do nothing with this session while + // in-flight (which is more like what JDBC does I think) + } +} + +// This is an example of how a "bulk ingest" program might work -- one in +// which the client just wants to shove a bunch of data in, and perhaps +// fail if it ever gets an error. +void BulkIngestExample() { + scoped_refptr t; + CHECK_OK(client_->OpenTable("my_table", &t)); + shared_ptr session(client->NewSession()); + + // If the amount of buffered data in RAM is larger than this amount, + // blocks the writer from performing more inserts until memory has + // been freed (either by inserts succeeding or timing out). + session->SetBufferSpace(32 * 1024 * 1024); + + // Set a long timeout for this kind of usecase. This determines how long + // Flush() may block for, as well as how long Apply() may block due to + // the buffer being full. + session->SetTimeoutMillis(60 * 1000); + + // In AUTO_FLUSH_BACKGROUND mode, the session will try to accumulate batches + // for optimal efficiency, rather than flushing each operation. + CHECK_OK(session->SetFlushMode(KuduSession::AUTO_FLUSH_BACKGROUND)); + + for (int i = 0; i < 10000; i++) { + gscoped_ptr ins = t->NewInsert(); + ins->SetInt64("key", i); + ins->SetInt64("val", i * 2); + // This will start getting written in the background. + // If there are any pending errors, it will return a bad Status, + // and the user should call GetPendingErrors() + // This may block if the buffer is full. + CHECK_OK(session->Apply(&ins)); + if (session->HasErrors()) { + LOG(FATAL) << "Failed to insert some rows: " << DumpErrors(session); + } + } + // Blocks until remaining buffered operations have been flushed. + // May also use the async API per above.
+ Status s = session->Flush(); + if (!s.ok()) { + LOG(FATAL) << "Failed to insert some rows: " << DumpErrors(session); + } +} + diff --git a/src/kudu/client/batcher.cc b/src/kudu/client/batcher.cc new file mode 100644 index 000000000000..f5ad0bb51294 --- /dev/null +++ b/src/kudu/client/batcher.cc @@ -0,0 +1,868 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include "kudu/client/batcher.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/client/callbacks.h" +#include "kudu/client/client.h" +#include "kudu/client/client-internal.h" +#include "kudu/client/error_collector.h" +#include "kudu/client/meta_cache.h" +#include "kudu/client/session-internal.h" +#include "kudu/client/write_op.h" +#include "kudu/client/write_op-internal.h" +#include "kudu/common/encoded_key.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/logging.h" + +using std::pair; +using std::set; +using std::shared_ptr; +using std::unordered_map; +using strings::Substitute; + +namespace kudu { + +using rpc::ErrorStatusPB; +using rpc::Messenger; +using rpc::Rpc; +using rpc::RpcController; +using tserver::WriteRequestPB; +using tserver::WriteResponsePB; +using tserver::WriteResponsePB_PerRowErrorPB; + +namespace client { + +namespace internal { + +// About lock ordering in this file: +// ------------------------------ +// The locks must be acquired in the following order: +// - Batcher::lock_ +// - InFlightOp::lock_ +// +// It's generally important to release all the locks before either calling +// a user callback, or chaining to another async function, since that function +// may also chain directly to the callback. Without releasing locks first, +// the lock ordering may be violated, or a lock may deadlock on itself (these +// locks are non-reentrant). +// ------------------------------------------------------------ + +// An operation which has been submitted to the batcher and not yet completed. 
+// The operation goes through a state machine as it progress through the +// various stages of a request. See the State enum for details. +// +// Note that in-flight ops *conceptually* hold a reference to the Batcher object. +// However, since there might be millions of these objects floating around, +// we can save a pointer per object by manually incrementing the Batcher ref-count +// when we create the object, and decrementing when we delete it. +struct InFlightOp { + InFlightOp() : state(kNew) { + } + + // Lock protecting the internal state of the op. + // This is necessary since callbacks may fire from IO threads + // concurrent with the user trying to abort/delete the batch. + // See comment above about lock ordering. + simple_spinlock lock_; + + enum State { + // Newly created op. + // + // OWNERSHIP: The op is only in this state when in local function scope (Batcher::Add) + kNew = 0, + + // Waiting for the MetaCache to determine which tablet ID hosts the row associated + // with this operation. In the case that the relevant tablet's key range was + // already cached, this state will be passed through immediately. Otherwise, + // the op may sit in this state for some amount of time while waiting on the + // MetaCache to perform an RPC to the master and find the correct tablet. + // + // OWNERSHIP: the op is present in the 'ops_' set, and also referenced by the + // in-flight callback provided to MetaCache. + kLookingUpTablet, + + // Once the correct tablet has been determined, and the tablet locations have been + // refreshed, we are ready to send the operation to the server. + // + // In MANUAL_FLUSH mode, the operations wait in this state until Flush has been called. + // + // In AUTO_FLUSH_BACKGROUND mode, the operations may wait in this state for one of + // two reasons: + // + // 1) There are already too many outstanding RPCs to the given tablet server. 
+ // + // We restrict the number of concurrent RPCs from one client to a given TS + // to achieve better batching and throughput. + // TODO: not implemented yet + // + // 2) Batching delay. + // + // In order to achieve better batching, we do not immediately send a request + // to a TS as soon as we have one pending. Instead, we can wait for a configurable + // number of milliseconds for more requests to enter the queue for the same TS. + // This makes it likely that if a caller simply issues a small number of requests + // to the same tablet in AUTO_FLUSH_BACKGROUND mode that we'll batch all of the + // requests together in a single RPC. + // TODO: not implemented yet + // + // OWNERSHIP: When the operation is in this state, it is present in the 'ops_' set + // and also in the 'per_tablet_ops' map. + kBufferedToTabletServer, + + // Once the operation has been flushed (either due to explicit Flush() or background flush) + // it will enter this state. + // + // OWNERSHIP: when entering this state, the op is removed from 'per_tablet_ops' map + // and ownership is transfered to a WriteRPC's 'ops_' vector. The op still + // remains in the 'ops_' set. + kRequestSent + }; + State state; + + // The actual operation. + gscoped_ptr write_op; + + string partition_key; + + // The tablet the operation is destined for. + // This is only filled in after passing through the kLookingUpTablet state. + scoped_refptr tablet; + + // Each operation has a unique sequence number which preserves the user's intended + // order of operations. This is important when multiple operations act on the same row. + int sequence_number_; + + string ToString() const { + return strings::Substitute("op[state=$0, write_op=$1]", + state, write_op->ToString()); + } +}; + +// A Write RPC which is in-flight to a tablet. Initially, the RPC is sent +// to the leader replica, but it may be retried with another replica if the +// leader fails. +// +// Keeps a reference on the owning batcher while alive. 
+class WriteRpc : public Rpc { + public: + WriteRpc(const scoped_refptr& batcher, + RemoteTablet* const tablet, + vector ops, + const MonoTime& deadline, + const shared_ptr& messenger); + virtual ~WriteRpc(); + virtual void SendRpc() OVERRIDE; + virtual string ToString() const OVERRIDE; + + const KuduTable* table() const { + // All of the ops for a given tablet obviously correspond to the same table, + // so we'll just grab the table from the first. + return ops_[0]->write_op->table(); + } + const RemoteTablet* tablet() const { return tablet_; } + const vector& ops() const { return ops_; } + const WriteResponsePB& resp() const { return resp_; } + + private: + // Called when we finish a lookup (to find the new consensus leader). Retries + // the rpc after a short delay. + void LookupTabletCb(const Status& status); + + // Called when we finish initializing a TS proxy. + // Sends the RPC, provided there was no error. + void InitTSProxyCb(const Status& status); + + // Marks all replicas on current_ts_ as failed and retries the write on a + // new replica. + void FailToNewReplica(const Status& reason); + + virtual void SendRpcCb(const Status& status) OVERRIDE; + + // Pointer back to the batcher. Processes the write response when it + // completes, regardless of success or failure. + scoped_refptr batcher_; + + // The tablet that should receive this write. + RemoteTablet* const tablet_; + + // The TS receiving the write. May change if the write is retried. + RemoteTabletServer* current_ts_; + + // TSes that refused the write because they were followers at the time. + // Cleared when new consensus configuration information arrives from the master. + set followers_; + + // Request body. + WriteRequestPB req_; + + // Response body. + WriteResponsePB resp_; + + // Operations which were batched into this RPC. + // These operations are in kRequestSent state. 
+ vector ops_; +}; + +WriteRpc::WriteRpc(const scoped_refptr& batcher, + RemoteTablet* const tablet, + vector ops, + const MonoTime& deadline, + const shared_ptr& messenger) + : Rpc(deadline, messenger), + batcher_(batcher), + tablet_(tablet), + current_ts_(NULL), + ops_(std::move(ops)) { + const Schema* schema = table()->schema().schema_; + + req_.set_tablet_id(tablet->tablet_id()); + switch (batcher->external_consistency_mode()) { + case kudu::client::KuduSession::CLIENT_PROPAGATED: + req_.set_external_consistency_mode(kudu::CLIENT_PROPAGATED); + break; + case kudu::client::KuduSession::COMMIT_WAIT: + req_.set_external_consistency_mode(kudu::COMMIT_WAIT); + break; + default: + LOG(FATAL) << "Unsupported consistency mode: " << batcher->external_consistency_mode(); + + } + + // Set up schema + CHECK_OK(SchemaToPB(*schema, req_.mutable_schema(), + SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES | SCHEMA_PB_WITHOUT_IDS)); + + RowOperationsPB* requested = req_.mutable_row_operations(); + + // Add the rows + int ctr = 0; + RowOperationsPBEncoder enc(requested); + for (InFlightOp* op : ops_) { + const Partition& partition = op->tablet->partition(); + const PartitionSchema& partition_schema = table()->partition_schema(); + const KuduPartialRow& row = op->write_op->row(); + +#ifndef NDEBUG + bool partition_contains_row; + CHECK(partition_schema.PartitionContainsRow(partition, row, &partition_contains_row).ok()); + CHECK(partition_contains_row) + << "Row " << partition_schema.RowDebugString(row) + << "not in partition " << partition_schema.PartitionDebugString(partition, *schema); +#endif + + enc.Add(ToInternalWriteType(op->write_op->type()), op->write_op->row()); + + // Set the state now, even though we haven't yet sent it -- at this point + // there is no return, and we're definitely going to send it. If we waited + // until after we sent it, the RPC callback could fire before we got a chance + // to change its state to 'sent'. 
+ op->state = InFlightOp::kRequestSent; + VLOG(4) << ++ctr << ". Encoded row " << op->write_op->ToString(); + } + + if (VLOG_IS_ON(3)) { + VLOG(3) << "Created batch for " << tablet->tablet_id() << ":\n" + << req_.ShortDebugString(); + } +} + +WriteRpc::~WriteRpc() { + STLDeleteElements(&ops_); +} + +void WriteRpc::SendRpc() { + // Choose a destination TS according to the following algorithm: + // 1. Select the leader, provided: + // a. One exists, and + // b. It hasn't failed, and + // c. It isn't currently marked as a follower. + // 2. If there's no good leader select another replica, provided: + // a. It hasn't failed, and + // b. It hasn't rejected our write due to being a follower. + // 3. Preemptively mark the replica we selected in step 2 as "leader" in the + // meta cache, so that our selection remains sticky until the next Master + // metadata refresh. + // 4. If we're out of appropriate replicas, force a lookup to the master + // to fetch new consensus configuration information. + // 5. When the lookup finishes, forget which replicas were followers and + // retry the write (i.e. goto 1). + // 6. If we issue the write and it fails because the destination was a + // follower, remember that fact and retry the write (i.e. goto 1). + // 7. Repeat steps 1-6 until the write succeeds, fails for other reasons, + // or the write's deadline expires. + current_ts_ = tablet_->LeaderTServer(); + if (current_ts_ && ContainsKey(followers_, current_ts_)) { + VLOG(2) << "Tablet " << tablet_->tablet_id() << ": We have a follower for a leader: " + << current_ts_->ToString(); + + // Mark the node as a follower in the cache so that on the next go-round, + // LeaderTServer() will not return it as a leader unless a full metadata + // refresh has occurred. This also avoids LookupTabletByKey() going into + // "fast path" mode and not actually performing a metadata refresh from the + // Master when it needs to. 
+ tablet_->MarkTServerAsFollower(current_ts_); + current_ts_ = NULL; + } + if (!current_ts_) { + // Try to "guess" the next leader. + vector replicas; + tablet_->GetRemoteTabletServers(&replicas); + for (RemoteTabletServer* ts : replicas) { + if (!ContainsKey(followers_, ts)) { + current_ts_ = ts; + break; + } + } + if (current_ts_) { + // Mark this next replica "preemptively" as the leader in the meta cache, + // so we go to it first on the next write if writing was successful. + VLOG(1) << "Tablet " << tablet_->tablet_id() << ": Previous leader failed. " + << "Preemptively marking tserver " << current_ts_->ToString() + << " as leader in the meta cache."; + tablet_->MarkTServerAsLeader(current_ts_); + } + } + + // If we've tried all replicas, force a lookup to the master to find the + // new leader. This relies on some properties of LookupTabletByKey(): + // 1. The fast path only works when there's a non-failed leader (which we + // know is untrue here). + // 2. The slow path always fetches consensus configuration information and updates the + // looked-up tablet. + // Put another way, we don't care about the lookup results at all; we're + // just using it to fetch the latest consensus configuration information. + // + // TODO: When we support tablet splits, we should let the lookup shift + // the write to another tablet (i.e. if it's since been split). + if (!current_ts_) { + batcher_->client_->data_->meta_cache_->LookupTabletByKey(table(), + tablet_->partition() + .partition_key_start(), + retrier().deadline(), + NULL, + Bind(&WriteRpc::LookupTabletCb, + Unretained(this))); + return; + } + + // Make sure we have a working proxy before sending out the RPC. 
+ current_ts_->InitProxy(batcher_->client_, + Bind(&WriteRpc::InitTSProxyCb, Unretained(this))); +} + +string WriteRpc::ToString() const { + return Substitute("Write(tablet: $0, num_ops: $1, num_attempts: $2)", + tablet_->tablet_id(), ops_.size(), num_attempts()); +} + +void WriteRpc::LookupTabletCb(const Status& status) { + // We should retry the RPC regardless of the outcome of the lookup, as + // leader election doesn't depend on the existence of a master at all. + // + // Retry() imposes a slight delay, which is desirable in a lookup loop, + // but unnecessary the first time through. Seeing as leader failures are + // rare, perhaps this doesn't matter. + followers_.clear(); + mutable_retrier()->DelayedRetry(this, status); +} + +void WriteRpc::InitTSProxyCb(const Status& status) { + // Fail to a replica in the event of a DNS resolution failure. + if (!status.ok()) { + FailToNewReplica(status); + return; + } + + VLOG(2) << "Tablet " << tablet_->tablet_id() << ": Writing batch to replica " + << current_ts_->ToString(); + current_ts_->proxy()->WriteAsync(req_, &resp_, + mutable_retrier()->mutable_controller(), + boost::bind(&WriteRpc::SendRpcCb, this, Status::OK())); +} + +void WriteRpc::FailToNewReplica(const Status& reason) { + VLOG(1) << "Failing " << ToString() << " to a new replica: " + << reason.ToString(); + bool found = tablet_->MarkReplicaFailed(current_ts_, reason); + DCHECK(found) + << "Tablet " << tablet_->tablet_id() << ": Unable to mark replica " << current_ts_->ToString() + << " as failed. Replicas: " << tablet_->ReplicasAsString(); + + mutable_retrier()->DelayedRetry(this, reason); +} + +void WriteRpc::SendRpcCb(const Status& status) { + // Prefer early failures over controller failures. + Status new_status = status; + if (new_status.ok() && mutable_retrier()->HandleResponse(this, &new_status)) { + return; + } + + // Failover to a replica in the event of any network failure. 
+ // + // TODO: This is probably too harsh; some network failures should be + // retried on the current replica. + if (new_status.IsNetworkError()) { + FailToNewReplica(new_status); + return; + } + + // Prefer controller failures over response failures. + if (new_status.ok() && resp_.has_error()) { + new_status = StatusFromPB(resp_.error().status()); + } + + // Oops, we failed over to a replica that wasn't a LEADER. Unlikely as + // we're using consensus configuration information from the master, but still possible + // (e.g. leader restarted and became a FOLLOWER). Try again. + // + // TODO: IllegalState is obviously way too broad an error category for + // this case. + if (new_status.IsIllegalState() || new_status.IsAborted()) { + followers_.insert(current_ts_); + mutable_retrier()->DelayedRetry(this, new_status); + return; + } + + if (!new_status.ok()) { + string current_ts_string; + if (current_ts_) { + current_ts_string = Substitute("on tablet server $0", current_ts_->ToString()); + } else { + current_ts_string = "(no tablet server available)"; + } + new_status = new_status.CloneAndPrepend( + Substitute("Failed to write batch of $0 ops to tablet $1 " + "$2 after $3 attempt(s)", + ops_.size(), tablet_->tablet_id(), + current_ts_string, num_attempts())); + LOG(WARNING) << new_status.ToString(); + } + batcher_->ProcessWriteResponse(*this, new_status); + delete this; +} + +Batcher::Batcher(KuduClient* client, + ErrorCollector* error_collector, + const sp::shared_ptr& session, + kudu::client::KuduSession::ExternalConsistencyMode consistency_mode) + : state_(kGatheringOps), + client_(client), + weak_session_(session), + consistency_mode_(consistency_mode), + error_collector_(error_collector), + had_errors_(false), + flush_callback_(NULL), + next_op_sequence_number_(0), + outstanding_lookups_(0), + max_buffer_size_(7 * 1024 * 1024), + buffer_bytes_used_(0) { +} + +void Batcher::Abort() { + unique_lock l(&lock_); + state_ = kAborted; + + vector to_abort; + for 
(InFlightOp* op : ops_) { + lock_guard l(&op->lock_); + if (op->state == InFlightOp::kBufferedToTabletServer) { + to_abort.push_back(op); + } + } + + for (InFlightOp* op : to_abort) { + VLOG(1) << "Aborting op: " << op->ToString(); + MarkInFlightOpFailedUnlocked(op, Status::Aborted("Batch aborted")); + } + + if (flush_callback_) { + l.unlock(); + + flush_callback_->Run(Status::Aborted("")); + } +} + +Batcher::~Batcher() { + if (PREDICT_FALSE(!ops_.empty())) { + for (InFlightOp* op : ops_) { + LOG(ERROR) << "Orphaned op: " << op->ToString(); + } + LOG(FATAL) << "ops_ not empty"; + } + CHECK(state_ == kFlushed || state_ == kAborted) << "Bad state: " << state_; +} + +void Batcher::SetTimeoutMillis(int millis) { + CHECK_GE(millis, 0); + lock_guard l(&lock_); + timeout_ = MonoDelta::FromMilliseconds(millis); +} + + +bool Batcher::HasPendingOperations() const { + lock_guard l(&lock_); + return !ops_.empty(); +} + +int Batcher::CountBufferedOperations() const { + lock_guard l(&lock_); + if (state_ == kGatheringOps) { + return ops_.size(); + } else { + // If we've already started to flush, then the ops aren't + // considered "buffered". + return 0; + } +} + +void Batcher::CheckForFinishedFlush() { + sp::shared_ptr session; + { + lock_guard l(&lock_); + if (state_ != kFlushing || !ops_.empty()) { + return; + } + + session = weak_session_.lock(); + state_ = kFlushed; + } + + if (session) { + // Important to do this outside of the lock so that we don't have + // a lock inversion deadlock -- the session lock should always + // come before the batcher lock. + session->data_->FlushFinished(this); + } + + Status s; + if (had_errors_) { + // User is responsible for fetching errors from the error collector. 
+ s = Status::IOError("Some errors occurred"); + } + + flush_callback_->Run(s); +} + +MonoTime Batcher::ComputeDeadlineUnlocked() const { + MonoDelta timeout = timeout_; + if (PREDICT_FALSE(!timeout.Initialized())) { + KLOG_EVERY_N(WARNING, 1000) << "Client writing with no timeout set, using 60 seconds.\n" + << GetStackTrace(); + timeout = MonoDelta::FromSeconds(60); + } + MonoTime ret = MonoTime::Now(MonoTime::FINE); + ret.AddDelta(timeout); + return ret; +} + +void Batcher::FlushAsync(KuduStatusCallback* cb) { + { + lock_guard l(&lock_); + CHECK_EQ(state_, kGatheringOps); + state_ = kFlushing; + flush_callback_ = cb; + deadline_ = ComputeDeadlineUnlocked(); + } + + // In the case that we have nothing buffered, just call the callback + // immediately. Otherwise, the callback will be called by the last callback + // when it sees that the ops_ list has drained. + CheckForFinishedFlush(); + + // Trigger flushing of all of the buffers. Some of these may already have + // been flushed through an async path, but it's idempotent - a second call + // to flush would just be a no-op. + // + // If some of the operations are still in-flight, then they'll get sent + // when they hit 'per_tablet_ops', since our state is now kFlushing. + FlushBuffersIfReady(); +} + +Status Batcher::Add(KuduWriteOperation* write_op) { + int64_t required_size = write_op->SizeInBuffer(); + int64_t size_after_adding = buffer_bytes_used_.IncrementBy(required_size); + if (PREDICT_FALSE(size_after_adding > max_buffer_size_)) { + buffer_bytes_used_.IncrementBy(-required_size); + int64_t size_before_adding = size_after_adding - required_size; + return Status::Incomplete(Substitute( + "not enough space remaining in buffer for op (required $0, " + "$1 already used", + HumanReadableNumBytes::ToString(required_size), + HumanReadableNumBytes::ToString(size_before_adding))); + } + + + // As soon as we get the op, start looking up where it belongs, + // so that when the user calls Flush, we are ready to go. 
+ gscoped_ptr op(new InFlightOp()); + RETURN_NOT_OK(write_op->table_->partition_schema() + .EncodeKey(write_op->row(), &op->partition_key)); + op->write_op.reset(write_op); + op->state = InFlightOp::kLookingUpTablet; + + AddInFlightOp(op.get()); + VLOG(3) << "Looking up tablet for " << op->write_op->ToString(); + + // Increment our reference count for the outstanding callback. + // + // deadline_ is set in FlushAsync(), after all Add() calls are done, so + // here we're forced to create a new deadline. + MonoTime deadline = ComputeDeadlineUnlocked(); + base::RefCountInc(&outstanding_lookups_); + client_->data_->meta_cache_->LookupTabletByKey( + op->write_op->table(), + op->partition_key, + deadline, + &op->tablet, + Bind(&Batcher::TabletLookupFinished, this, op.get())); + IgnoreResult(op.release()); + return Status::OK(); +} + +void Batcher::AddInFlightOp(InFlightOp* op) { + DCHECK_EQ(op->state, InFlightOp::kLookingUpTablet); + + lock_guard l(&lock_); + CHECK_EQ(state_, kGatheringOps); + InsertOrDie(&ops_, op); + op->sequence_number_ = next_op_sequence_number_++; +} + +bool Batcher::IsAbortedUnlocked() const { + return state_ == kAborted; +} + +void Batcher::MarkHadErrors() { + lock_guard l(&lock_); + had_errors_ = true; +} + +void Batcher::MarkInFlightOpFailed(InFlightOp* op, const Status& s) { + lock_guard l(&lock_); + MarkInFlightOpFailedUnlocked(op, s); +} + +void Batcher::MarkInFlightOpFailedUnlocked(InFlightOp* op, const Status& s) { + CHECK_EQ(1, ops_.erase(op)) + << "Could not remove op " << op->ToString() << " from in-flight list"; + gscoped_ptr error(new KuduError(op->write_op.release(), s)); + error_collector_->AddError(error.Pass()); + had_errors_ = true; + delete op; +} + +void Batcher::TabletLookupFinished(InFlightOp* op, const Status& s) { + base::RefCountDec(&outstanding_lookups_); + + // Acquire the batcher lock early to atomically: + // 1. Test if the batcher was aborted, and + // 2. Change the op state. 
+ unique_lock l(&lock_); + + if (IsAbortedUnlocked()) { + VLOG(1) << "Aborted batch: TabletLookupFinished for " << op->write_op->ToString(); + MarkInFlightOpFailedUnlocked(op, Status::Aborted("Batch aborted")); + // 'op' is deleted by above function. + return; + } + + if (VLOG_IS_ON(3)) { + VLOG(3) << "TabletLookupFinished for " << op->write_op->ToString() + << ": " << s.ToString(); + if (s.ok()) { + VLOG(3) << "Result: tablet_id = " << op->tablet->tablet_id(); + } + } + + if (!s.ok()) { + MarkInFlightOpFailedUnlocked(op, s); + l.unlock(); + CheckForFinishedFlush(); + + // Even if we failed our lookup, it's possible that other requests were still + // pending waiting for our pending lookup to complete. So, we have to let them + // proceed. + FlushBuffersIfReady(); + return; + } + + { + lock_guard l2(&op->lock_); + CHECK_EQ(op->state, InFlightOp::kLookingUpTablet); + CHECK(op->tablet != NULL); + + op->state = InFlightOp::kBufferedToTabletServer; + + vector& to_ts = per_tablet_ops_[op->tablet.get()]; + to_ts.push_back(op); + + // "Reverse bubble sort" the operation into the right spot in the tablet server's + // buffer, based on the sequence numbers of the ops. + // + // There is a rare race (KUDU-743) where two operations in the same batch can get + // their order inverted with respect to the order that the user originally performed + // the operations. This loop re-sequences them back into the correct order. In + // the common case, it will break on the first iteration, so we expect the loop to be + // constant time, with worst case O(n). This is usually much better than something + // like a priority queue which would have O(lg n) in every case and a more complex + // code path. 
+ for (int i = to_ts.size() - 1; i > 0; --i) { + if (to_ts[i]->sequence_number_ < to_ts[i - 1]->sequence_number_) { + std::swap(to_ts[i], to_ts[i - 1]); + } else { + break; + } + } + } + + l.unlock(); + + FlushBuffersIfReady(); +} + +void Batcher::FlushBuffersIfReady() { + unordered_map > ops_copy; + + // We're only ready to flush if: + // 1. The batcher is in the flushing state (i.e. FlushAsync was called). + // 2. All outstanding ops have finished lookup. Why? To avoid a situation + // where ops are flushed one by one as they finish lookup. + { + lock_guard l(&lock_); + if (state_ != kFlushing) { + VLOG(3) << "FlushBuffersIfReady: batcher not yet in flushing state"; + return; + } + if (!base::RefCountIsZero(&outstanding_lookups_)) { + VLOG(3) << "FlushBuffersIfReady: " + << base::subtle::NoBarrier_Load(&outstanding_lookups_) + << " ops still in lookup"; + return; + } + // Take ownership of the ops while we're under the lock. + ops_copy.swap(per_tablet_ops_); + } + + // Now flush the ops for each tablet. + for (const OpsMap::value_type& e : ops_copy) { + RemoteTablet* tablet = e.first; + const vector& ops = e.second; + + VLOG(3) << "FlushBuffersIfReady: already in flushing state, immediately flushing to " + << tablet->tablet_id(); + FlushBuffer(tablet, ops); + } +} + +void Batcher::FlushBuffer(RemoteTablet* tablet, const vector& ops) { + CHECK(!ops.empty()); + + // Create and send an RPC that aggregates the ops. The RPC is freed when + // its callback completes. + // + // The RPC object takes ownership of the ops. + WriteRpc* rpc = new WriteRpc(this, + tablet, + ops, + deadline_, + client_->data_->messenger_); + rpc->SendRpc(); +} + +void Batcher::ProcessWriteResponse(const WriteRpc& rpc, + const Status& s) { + // TODO: there is a potential race here -- if the Batcher gets destructed while + // RPCs are in-flight, then accessing state_ will crash. 
We probably need to keep + // track of the in-flight RPCs, and in the destructor, change each of them to an + // "aborted" state. + CHECK_EQ(state_, kFlushing); + + if (s.ok()) { + if (rpc.resp().has_timestamp()) { + client_->data_->UpdateLatestObservedTimestamp(rpc.resp().timestamp()); + } + } else { + // Mark each of the rows in the write op as failed, since the whole RPC failed. + for (InFlightOp* op : rpc.ops()) { + gscoped_ptr error(new KuduError(op->write_op.release(), s)); + error_collector_->AddError(error.Pass()); + } + + MarkHadErrors(); + } + + + // Remove all the ops from the "in-flight" list. + { + lock_guard l(&lock_); + for (InFlightOp* op : rpc.ops()) { + CHECK_EQ(1, ops_.erase(op)) + << "Could not remove op " << op->ToString() + << " from in-flight list"; + } + } + + // Check individual row errors. + for (const WriteResponsePB_PerRowErrorPB& err_pb : rpc.resp().per_row_errors()) { + // TODO: handle case where we get one of the more specific TS errors + // like the tablet not being hosted? 
+ + if (err_pb.row_index() >= rpc.ops().size()) { + LOG(ERROR) << "Received a per_row_error for an out-of-bound op index " + << err_pb.row_index() << " (sent only " + << rpc.ops().size() << " ops)"; + LOG(ERROR) << "Response from tablet " << rpc.tablet()->tablet_id() << ":\n" + << rpc.resp().DebugString(); + continue; + } + gscoped_ptr op = rpc.ops()[err_pb.row_index()]->write_op.Pass(); + VLOG(1) << "Error on op " << op->ToString() << ": " + << err_pb.error().ShortDebugString(); + Status op_status = StatusFromPB(err_pb.error()); + gscoped_ptr error(new KuduError(op.release(), op_status)); + error_collector_->AddError(error.Pass()); + MarkHadErrors(); + } + + CheckForFinishedFlush(); +} + +} // namespace internal +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/batcher.h b/src/kudu/client/batcher.h new file mode 100644 index 000000000000..f79324b87a1c --- /dev/null +++ b/src/kudu/client/batcher.h @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CLIENT_BATCHER_H +#define KUDU_CLIENT_BATCHER_H + +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/async_util.h" +#include "kudu/util/atomic.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace client { + +class KuduClient; +class KuduSession; +class KuduStatusCallback; +class KuduWriteOperation; + +namespace internal { + +struct InFlightOp; + +class ErrorCollector; +class RemoteTablet; +class WriteRpc; + +// A Batcher is the class responsible for collecting row operations, routing them to the +// correct tablet server, and possibly batching them together for better efficiency. +// +// It is a reference-counted class: the client session creating the batch holds one +// reference, and all of the in-flight operations hold others. This allows the client +// session to be destructed while ops are still in-flight, without the async callbacks +// attempting to access a destructed Batcher. +class Batcher : public RefCountedThreadSafe { + public: + // Create a new batcher associated with the given session. + // + // Any errors which come back from operations performed by this batcher are posted to + // the provided ErrorCollector. + // + // Takes a reference on error_collector. Creates a weak_ptr to 'session'. + Batcher(KuduClient* client, + ErrorCollector* error_collector, + const client::sp::shared_ptr& session, + kudu::client::KuduSession::ExternalConsistencyMode consistency_mode); + + // Abort the current batch. Any writes that were buffered and not yet sent are + // discarded. Those that were sent may still be delivered. If there is a pending Flush + // callback, it will be called immediately with an error status. + void Abort(); + + // Set the timeout for this batcher. 
+ // + // The timeout is currently set on all of the RPCs, but in the future will be relative + // to when the Flush call is made (eg even if the lookup of the TS takes a long time, it + // may time out before even sending an op). TODO: implement that + void SetTimeoutMillis(int millis); + + // Add a new operation to the batch. Requires that the batch has not yet been flushed. + // TODO: in other flush modes, this may not be the case -- need to + // update this when they're implemented. + // + // NOTE: If this returns not-OK, does not take ownership of 'write_op'. + Status Add(KuduWriteOperation* write_op) WARN_UNUSED_RESULT; + + // Return true if any operations are still pending. An operation is no longer considered + // pending once it has either errored or succeeded. Operations are considering pending + // as soon as they are added, even if Flush has not been called. + bool HasPendingOperations() const; + + // Return the number of buffered operations. These are only those operations which are + // "corked" (i.e not yet flushed). Once Flush has been called, this returns 0. + int CountBufferedOperations() const; + + // Flush any buffered operations. The callback will be called once there are no + // more pending operations from this Batcher. If all of the operations succeeded, + // then the callback will receive Status::OK. Otherwise, it will receive IOError, + // and the caller must inspect the ErrorCollector to retrieve more detailed + // information on which operations failed. + void FlushAsync(KuduStatusCallback* cb); + + // Returns the consistency mode set on the batcher by the session when it was initially + // created. + kudu::client::KuduSession::ExternalConsistencyMode external_consistency_mode() const { + return consistency_mode_; + } + + private: + friend class RefCountedThreadSafe; + friend class WriteRpc; + + ~Batcher(); + + // Add an op to the in-flight set and increment the ref-count. 
+ void AddInFlightOp(InFlightOp* op); + + void RemoveInFlightOp(InFlightOp* op); + + // Return true if the batch has been aborted, and any in-flight ops should stop + // processing wherever they are. + bool IsAbortedUnlocked() const; + + // Mark the fact that errors have occurred with this batch. This ensures that + // the flush callback will get a bad Status. + void MarkHadErrors(); + + // Remove an op from the in-flight op list, and delete the op itself. + // The operation is reported to the ErrorReporter as having failed with the + // given status. + void MarkInFlightOpFailed(InFlightOp* op, const Status& s); + void MarkInFlightOpFailedUnlocked(InFlightOp* op, const Status& s); + + void CheckForFinishedFlush(); + void FlushBuffersIfReady(); + void FlushBuffer(RemoteTablet* tablet, const std::vector& ops); + + // Cleans up an RPC response, scooping out any errors and passing them up + // to the batcher. + void ProcessWriteResponse(const WriteRpc& rpc, const Status& s); + + // Async Callbacks. + void TabletLookupFinished(InFlightOp* op, const Status& s); + + // Compute a new deadline based on timeout_. If no timeout_ has been set, + // uses a hard-coded default and issues periodic warnings. + MonoTime ComputeDeadlineUnlocked() const; + + // See note about lock ordering in batcher.cc + mutable simple_spinlock lock_; + + enum State { + kGatheringOps, + kFlushing, + kFlushed, + kAborted + }; + State state_; + + KuduClient* const client_; + client::sp::weak_ptr weak_session_; + + // The consistency mode set in the session. + kudu::client::KuduSession::ExternalConsistencyMode consistency_mode_; + + // Errors are reported into this error collector. + scoped_refptr const error_collector_; + + // Set to true if there was at least one error from this Batcher. + // Protected by lock_ + bool had_errors_; + + // If state is kFlushing, this member will be set to the user-provided + // callback. 
Once there are no more in-flight operations, the callback + // will be called exactly once (and the state changed to kFlushed). + KuduStatusCallback* flush_callback_; + + // All buffered or in-flight ops. + std::unordered_set ops_; + // Each tablet's buffered ops. + typedef std::unordered_map > OpsMap; + OpsMap per_tablet_ops_; + + // When each operation is added to the batcher, it is assigned a sequence number + // which preserves the user's intended order. Preserving order is critical when + // a batch contains multiple operations against the same row key. This member + // assigns the sequence numbers. + // Protected by lock_. + int next_op_sequence_number_; + + // Amount of time to wait for a given op, from start to finish. + // + // Set by SetTimeoutMillis. + MonoDelta timeout_; + + // After flushing, the absolute deadline for all in-flight ops. + MonoTime deadline_; + + // Number of outstanding lookups across all in-flight ops. + // + // Note: _not_ protected by lock_! + Atomic32 outstanding_lookups_; + + // The maximum number of bytes of encoded operations which will be allowed to + // be buffered. + int64_t max_buffer_size_; + + // The number of bytes used in the buffer for pending operations. + AtomicInt buffer_bytes_used_; + + DISALLOW_COPY_AND_ASSIGN(Batcher); +}; + +} // namespace internal +} // namespace client +} // namespace kudu +#endif /* KUDU_CLIENT_BATCHER_H */ diff --git a/src/kudu/client/callbacks.h b/src/kudu/client/callbacks.h new file mode 100644 index 000000000000..5c65d3b5f756 --- /dev/null +++ b/src/kudu/client/callbacks.h @@ -0,0 +1,185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_CALLBACKS_H +#define KUDU_CLIENT_CALLBACKS_H + +#ifdef KUDU_HEADERS_NO_STUBS +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#else +#include "kudu/client/stubs.h" +#endif +#include "kudu/util/kudu_export.h" + +namespace kudu { + +class Status; + +namespace client { + +// All possible log levels. +enum KuduLogSeverity { + SEVERITY_INFO, + SEVERITY_WARNING, + SEVERITY_ERROR, + SEVERITY_FATAL +}; + +// Interface for all logging callbacks. +class KUDU_EXPORT KuduLoggingCallback { + public: + KuduLoggingCallback() { + } + + virtual ~KuduLoggingCallback() { + } + + // 'message' is NOT terminated with an endline. + virtual void Run(KuduLogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len) = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(KuduLoggingCallback); +}; + +// Logging callback that invokes a member function pointer. 
+template +class KUDU_EXPORT KuduLoggingMemberCallback : public KuduLoggingCallback { + public: + typedef void (T::*MemberType)( + KuduLogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len); + + KuduLoggingMemberCallback(T* object, MemberType member) + : object_(object), + member_(member) { + } + + virtual void Run(KuduLogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len) OVERRIDE { + (object_->*member_)(severity, filename, line_number, time, + message, message_len); + } + + private: + T* object_; + MemberType member_; +}; + +// Logging callback that invokes a function pointer with a single argument. +template +class KUDU_EXPORT KuduLoggingFunctionCallback : public KuduLoggingCallback { + public: + typedef void (*FunctionType)(T arg, + KuduLogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len); + + KuduLoggingFunctionCallback(FunctionType function, T arg) + : function_(function), + arg_(arg) { + } + + virtual void Run(KuduLogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len) OVERRIDE { + function_(arg_, severity, filename, line_number, time, + message, message_len); + } + + private: + FunctionType function_; + T arg_; +}; + +// Interface for all status callbacks. +class KUDU_EXPORT KuduStatusCallback { + public: + KuduStatusCallback() { + } + + virtual ~KuduStatusCallback() { + } + + virtual void Run(const Status& s) = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(KuduStatusCallback); +}; + +// Status callback that invokes a member function pointer. 
+template +class KUDU_EXPORT KuduStatusMemberCallback : public KuduStatusCallback { + public: + typedef void (T::*MemberType)(const Status& s); + + KuduStatusMemberCallback(T* object, MemberType member) + : object_(object), + member_(member) { + } + + virtual void Run(const Status& s) OVERRIDE { + (object_->*member_)(s); + } + + private: + T* object_; + MemberType member_; +}; + +// Status callback that invokes a function pointer with a single argument. +template +class KUDU_EXPORT KuduStatusFunctionCallback : public KuduStatusCallback { + public: + typedef void (*FunctionType)(T arg, const Status& s); + + KuduStatusFunctionCallback(FunctionType function, T arg) + : function_(function), + arg_(arg) { + } + + virtual void Run(const Status& s) OVERRIDE { + function_(arg_, s); + } + + private: + FunctionType function_; + T arg_; +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/client-internal.cc b/src/kudu/client/client-internal.cc new file mode 100644 index 000000000000..4ed4a6cc00f4 --- /dev/null +++ b/src/kudu/client/client-internal.cc @@ -0,0 +1,857 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/client-internal.h" + +#include +#include +#include +#include + +#include "kudu/client/meta_cache.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/master/master.h" +#include "kudu/master/master_rpc.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/rpc/rpc.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/util/net/dns_resolver.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/thread_restrictions.h" + +using std::set; +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { + +using consensus::RaftPeerPB; +using master::AlterTableRequestPB; +using master::AlterTableResponsePB; +using master::CreateTableRequestPB; +using master::CreateTableResponsePB; +using master::DeleteTableRequestPB; +using master::DeleteTableResponsePB; +using master::GetLeaderMasterRpc; +using master::GetTableSchemaRequestPB; +using master::GetTableSchemaResponsePB; +using master::IsAlterTableDoneRequestPB; +using master::IsAlterTableDoneResponsePB; +using master::IsCreateTableDoneRequestPB; +using master::IsCreateTableDoneResponsePB; +using master::ListTablesRequestPB; +using master::ListTablesResponsePB; +using master::ListTabletServersRequestPB; +using master::ListTabletServersResponsePB; +using master::MasterServiceProxy; +using master::MasterErrorPB; +using rpc::Rpc; +using rpc::RpcController; +using strings::Substitute; + +namespace client { + +using internal::GetTableSchemaRpc; +using internal::RemoteTablet; +using internal::RemoteTabletServer; + +Status RetryFunc(const MonoTime& deadline, + const string& retry_msg, + const string& timeout_msg, + const boost::function& func) { + DCHECK(deadline.Initialized()); + + MonoTime now = 
MonoTime::Now(MonoTime::FINE); + if (deadline.ComesBefore(now)) { + return Status::TimedOut(timeout_msg); + } + + double wait_secs = 0.001; + const double kMaxSleepSecs = 2; + while (1) { + MonoTime func_stime = now; + bool retry = true; + Status s = func(deadline, &retry); + if (!retry) { + return s; + } + now = MonoTime::Now(MonoTime::FINE); + MonoDelta func_time = now.GetDeltaSince(func_stime); + + VLOG(1) << retry_msg << " status=" << s.ToString(); + double secs_remaining = std::numeric_limits::max(); + if (deadline.Initialized()) { + secs_remaining = deadline.GetDeltaSince(now).ToSeconds(); + } + wait_secs = std::min(wait_secs * 1.25, kMaxSleepSecs); + + // We assume that the function will take the same amount of time to run + // as it did in the previous attempt. If we don't have enough time left + // to sleep and run it again, we don't bother sleeping and retrying. + if (wait_secs + func_time.ToSeconds() > secs_remaining) { + break; + } + + VLOG(1) << "Waiting for " << HumanReadableElapsedTime::ToShortString(wait_secs) + << " before retrying..."; + SleepFor(MonoDelta::FromSeconds(wait_secs)); + now = MonoTime::Now(MonoTime::FINE); + + } + + return Status::TimedOut(timeout_msg); +} + +template +Status KuduClient::Data::SyncLeaderMasterRpc( + const MonoTime& deadline, + KuduClient* client, + const ReqClass& req, + RespClass* resp, + int* num_attempts, + const char* func_name, + const boost::function& func) { + DCHECK(deadline.Initialized()); + + while (true) { + RpcController rpc; + + // Have we already exceeded our deadline? + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (deadline.ComesBefore(now)) { + return Status::TimedOut(Substitute("$0 timed out after deadline expired", + func_name)); + } + + // The RPC's deadline is intentionally earlier than the overall + // deadline so that we reserve some time with which to find a new + // leader master and retry before the overall deadline expires. + // + // TODO: KUDU-683 tracks cleanup for this. 
+ MonoTime rpc_deadline = now; + rpc_deadline.AddDelta(client->default_rpc_timeout()); + rpc.set_deadline(MonoTime::Earliest(rpc_deadline, deadline)); + + if (num_attempts != nullptr) { + ++*num_attempts; + } + Status s = func(master_proxy_.get(), req, resp, &rpc); + if (s.IsNetworkError()) { + LOG(WARNING) << "Unable to send the request (" << req.ShortDebugString() + << ") to leader Master (" << leader_master_hostport().ToString() + << "): " << s.ToString(); + if (client->IsMultiMaster()) { + LOG(INFO) << "Determining the new leader Master and retrying..."; + WARN_NOT_OK(SetMasterServerProxy(client, deadline), + "Unable to determine the new leader Master"); + continue; + } + } + + if (s.IsTimedOut()) { + if (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + LOG(WARNING) << "Unable to send the request (" << req.ShortDebugString() + << ") to leader Master (" << leader_master_hostport().ToString() + << "): " << s.ToString(); + if (client->IsMultiMaster()) { + LOG(INFO) << "Determining the new leader Master and retrying..."; + WARN_NOT_OK(SetMasterServerProxy(client, deadline), + "Unable to determine the new leader Master"); + continue; + } + } else { + // Operation deadline expired during this latest RPC. + s = s.CloneAndPrepend(Substitute("$0 timed out after deadline expired", + func_name)); + } + } + + if (s.ok() && resp->has_error()) { + if (resp->error().code() == MasterErrorPB::NOT_THE_LEADER || + resp->error().code() == MasterErrorPB::CATALOG_MANAGER_NOT_INITIALIZED) { + if (client->IsMultiMaster()) { + LOG(INFO) << "Determining the new leader Master and retrying..."; + WARN_NOT_OK(SetMasterServerProxy(client, deadline), + "Unable to determine the new leader Master"); + continue; + } + } + } + return s; + } +} + +// Explicit specialization for callers outside this compilation unit. 
+template +Status KuduClient::Data::SyncLeaderMasterRpc( + const MonoTime& deadline, + KuduClient* client, + const ListTablesRequestPB& req, + ListTablesResponsePB* resp, + int* num_attempts, + const char* func_name, + const boost::function& func); +template +Status KuduClient::Data::SyncLeaderMasterRpc( + const MonoTime& deadline, + KuduClient* client, + const ListTabletServersRequestPB& req, + ListTabletServersResponsePB* resp, + int* num_attempts, + const char* func_name, + const boost::function& func); + +KuduClient::Data::Data() + : latest_observed_timestamp_(KuduClient::kNoTimestamp) { +} + +KuduClient::Data::~Data() { + // Workaround for KUDU-956: the user may close a KuduClient while a flush + // is still outstanding. In that case, the flush's callback will be the last + // holder of the client reference, causing it to shut down on the reactor + // thread. This triggers a ThreadRestrictions crash. It's not critical to + // fix urgently, because typically once a client is shutting down, latency + // jitter on the reactor is not a big deal (and DNS resolutions are not in flight). + ThreadRestrictions::ScopedAllowWait allow_wait; + dns_resolver_.reset(); +} + +RemoteTabletServer* KuduClient::Data::SelectTServer(const scoped_refptr& rt, + const ReplicaSelection selection, + const set& blacklist, + vector* candidates) const { + RemoteTabletServer* ret = nullptr; + candidates->clear(); + switch (selection) { + case LEADER_ONLY: { + ret = rt->LeaderTServer(); + if (ret != nullptr) { + candidates->push_back(ret); + if (ContainsKey(blacklist, ret->permanent_uuid())) { + ret = nullptr; + } + } + break; + } + case CLOSEST_REPLICA: + case FIRST_REPLICA: { + rt->GetRemoteTabletServers(candidates); + // Filter out all the blacklisted candidates. 
+ vector filtered; + for (RemoteTabletServer* rts : *candidates) { + if (!ContainsKey(blacklist, rts->permanent_uuid())) { + filtered.push_back(rts); + } else { + VLOG(1) << "Excluding blacklisted tserver " << rts->permanent_uuid(); + } + } + if (selection == FIRST_REPLICA) { + if (!filtered.empty()) { + ret = filtered[0]; + } + } else if (selection == CLOSEST_REPLICA) { + // Choose a local replica. + for (RemoteTabletServer* rts : filtered) { + if (IsTabletServerLocal(*rts)) { + ret = rts; + break; + } + } + // Fallback to a random replica if none are local. + if (ret == nullptr && !filtered.empty()) { + ret = filtered[rand() % filtered.size()]; + } + } + break; + } + default: { + LOG(FATAL) << "Unknown ProxySelection value " << selection; + break; + } + } + + return ret; +} + +Status KuduClient::Data::GetTabletServer(KuduClient* client, + const scoped_refptr& rt, + ReplicaSelection selection, + const set& blacklist, + vector* candidates, + RemoteTabletServer** ts) { + // TODO: write a proper async version of this for async client. + RemoteTabletServer* ret = SelectTServer(rt, selection, blacklist, candidates); + if (PREDICT_FALSE(ret == nullptr)) { + // Construct a blacklist string if applicable. + string blacklist_string = ""; + if (!blacklist.empty()) { + blacklist_string = Substitute("(blacklist replicas $0)", JoinStrings(blacklist, ", ")); + } + return Status::ServiceUnavailable( + Substitute("No $0 for tablet $1 $2", + selection == LEADER_ONLY ? 
"LEADER" : "replicas", + rt->tablet_id(), + blacklist_string)); + } + Synchronizer s; + ret->InitProxy(client, s.AsStatusCallback()); + RETURN_NOT_OK(s.Wait()); + + *ts = ret; + return Status::OK(); +} + +Status KuduClient::Data::CreateTable(KuduClient* client, + const CreateTableRequestPB& req, + const KuduSchema& schema, + const MonoTime& deadline) { + CreateTableResponsePB resp; + + int attempts = 0; + Status s = SyncLeaderMasterRpc( + deadline, client, req, &resp, &attempts, "CreateTable", &MasterServiceProxy::CreateTable); + RETURN_NOT_OK(s); + if (resp.has_error()) { + if (resp.error().code() == MasterErrorPB::TABLE_ALREADY_PRESENT && attempts > 1) { + // If the table already exists and the number of attempts is > + // 1, then it means we may have succeeded in creating the + // table, but client didn't receive the successful + // response (e.g., due to failure before the successful + // response could be sent back, or due to a I/O pause or a + // network blip leading to a timeout, etc...) + KuduSchema actual_schema; + string table_id; + PartitionSchema actual_partition_schema; + RETURN_NOT_OK_PREPEND( + GetTableSchema(client, req.name(), deadline, &actual_schema, + &actual_partition_schema, &table_id), + Substitute("Unable to check the schema of table $0", req.name())); + if (!schema.Equals(actual_schema)) { + string msg = Substitute("Table $0 already exists with a different " + "schema. Requested schema was: $1, actual schema is: $2", + req.name(), schema.schema_->ToString(), actual_schema.schema_->ToString()); + LOG(ERROR) << msg; + return Status::AlreadyPresent(msg); + } else { + PartitionSchema partition_schema; + RETURN_NOT_OK(PartitionSchema::FromPB(req.partition_schema(), + *schema.schema_, &partition_schema)); + if (!partition_schema.Equals(actual_partition_schema)) { + string msg = Substitute("Table $0 already exists with a different partition schema. 
" + "Requested partition schema was: $1, actual partition schema is: $2", + req.name(), partition_schema.DebugString(*schema.schema_), + actual_partition_schema.DebugString(*actual_schema.schema_)); + LOG(ERROR) << msg; + return Status::AlreadyPresent(msg); + } else { + return Status::OK(); + } + } + } + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status KuduClient::Data::IsCreateTableInProgress(KuduClient* client, + const string& table_name, + const MonoTime& deadline, + bool *create_in_progress) { + IsCreateTableDoneRequestPB req; + IsCreateTableDoneResponsePB resp; + req.mutable_table()->set_table_name(table_name); + + // TODO: Add client rpc timeout and use 'default_admin_operation_timeout_' as + // the default timeout for all admin operations. + Status s = + SyncLeaderMasterRpc( + deadline, + client, + req, + &resp, + nullptr, + "IsCreateTableDone", + &MasterServiceProxy::IsCreateTableDone); + // RETURN_NOT_OK macro can't take templated function call as param, + // and SyncLeaderMasterRpc must be explicitly instantiated, else the + // compiler complains. 
+ RETURN_NOT_OK(s); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + *create_in_progress = !resp.done(); + return Status::OK(); +} + +Status KuduClient::Data::WaitForCreateTableToFinish(KuduClient* client, + const string& table_name, + const MonoTime& deadline) { + return RetryFunc(deadline, + "Waiting on Create Table to be completed", + "Timed out waiting for Table Creation", + boost::bind(&KuduClient::Data::IsCreateTableInProgress, + this, client, table_name, _1, _2)); +} + +Status KuduClient::Data::DeleteTable(KuduClient* client, + const string& table_name, + const MonoTime& deadline) { + DeleteTableRequestPB req; + DeleteTableResponsePB resp; + int attempts = 0; + + req.mutable_table()->set_table_name(table_name); + Status s = SyncLeaderMasterRpc( + deadline, client, req, &resp, + &attempts, "DeleteTable", &MasterServiceProxy::DeleteTable); + RETURN_NOT_OK(s); + if (resp.has_error()) { + if (resp.error().code() == MasterErrorPB::TABLE_NOT_FOUND && attempts > 1) { + // A prior attempt to delete the table has succeeded, but + // appeared as a failure to the client due to, e.g., an I/O or + // network issue. + return Status::OK(); + } + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status KuduClient::Data::AlterTable(KuduClient* client, + const AlterTableRequestPB& req, + const MonoTime& deadline) { + AlterTableResponsePB resp; + Status s = + SyncLeaderMasterRpc( + deadline, + client, + req, + &resp, + nullptr, + "AlterTable", + &MasterServiceProxy::AlterTable); + RETURN_NOT_OK(s); + // TODO: Consider the situation where the request is sent to the + // server, gets executed on the server and written to the server, + // but is seen as failed by the client, and is then retried (in which + // case the retry will fail due to original table being removed, a + // column being already added, etc...) 
+ if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status KuduClient::Data::IsAlterTableInProgress(KuduClient* client, + const string& table_name, + const MonoTime& deadline, + bool *alter_in_progress) { + IsAlterTableDoneRequestPB req; + IsAlterTableDoneResponsePB resp; + + req.mutable_table()->set_table_name(table_name); + Status s = + SyncLeaderMasterRpc( + deadline, + client, + req, + &resp, + nullptr, + "IsAlterTableDone", + &MasterServiceProxy::IsAlterTableDone); + RETURN_NOT_OK(s); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + *alter_in_progress = !resp.done(); + return Status::OK(); +} + +Status KuduClient::Data::WaitForAlterTableToFinish(KuduClient* client, + const string& alter_name, + const MonoTime& deadline) { + return RetryFunc(deadline, + "Waiting on Alter Table to be completed", + "Timed out waiting for AlterTable", + boost::bind(&KuduClient::Data::IsAlterTableInProgress, + this, + client, alter_name, _1, _2)); +} + +Status KuduClient::Data::InitLocalHostNames() { + // Currently, we just use our configured hostname, and resolve it to come up with + // a list of potentially local hosts. It would be better to iterate over all of + // the local network adapters. See KUDU-327. + string hostname; + RETURN_NOT_OK(GetFQDN(&hostname)); + + // We don't want to consider 'localhost' to be local - otherwise if a misconfigured + // server reports its own name as localhost, all clients will hammer it. + if (hostname != "localhost" && hostname != "localhost.localdomain") { + local_host_names_.insert(hostname); + VLOG(1) << "Considering host " << hostname << " local"; + } + + vector addresses; + RETURN_NOT_OK_PREPEND(HostPort(hostname, 0).ResolveAddresses(&addresses), + Substitute("Could not resolve local host name '$0'", hostname)); + + for (const Sockaddr& addr : addresses) { + // Similar to above, ignore local or wildcard addresses. 
+ if (addr.IsWildcard()) continue; + if (addr.IsAnyLocalAddress()) continue; + + VLOG(1) << "Considering host " << addr.host() << " local"; + local_host_names_.insert(addr.host()); + } + + return Status::OK(); +} + +bool KuduClient::Data::IsLocalHostPort(const HostPort& hp) const { + return ContainsKey(local_host_names_, hp.host()); +} + +bool KuduClient::Data::IsTabletServerLocal(const RemoteTabletServer& rts) const { + vector host_ports; + rts.GetHostPorts(&host_ports); + for (const HostPort& hp : host_ports) { + if (IsLocalHostPort(hp)) return true; + } + return false; +} + +namespace internal { + +// Gets a table's schema from the leader master. If the leader master +// is down, waits for a new master to become the leader, and then gets +// the table schema from the new leader master. +// +// TODO: When we implement the next fault tolerant client-master RPC +// call (e.g., CreateTable/AlterTable), we should generalize this +// method as to enable code sharing. +class GetTableSchemaRpc : public Rpc { + public: + GetTableSchemaRpc(KuduClient* client, + StatusCallback user_cb, + string table_name, + KuduSchema* out_schema, + PartitionSchema* out_partition_schema, + string* out_id, + const MonoTime& deadline, + const shared_ptr& messenger); + + virtual void SendRpc() OVERRIDE; + + virtual string ToString() const OVERRIDE; + + virtual ~GetTableSchemaRpc(); + + private: + virtual void SendRpcCb(const Status& status) OVERRIDE; + + void ResetLeaderMasterAndRetry(); + + void NewLeaderMasterDeterminedCb(const Status& status); + + KuduClient* client_; + StatusCallback user_cb_; + const string table_name_; + KuduSchema* out_schema_; + PartitionSchema* out_partition_schema_; + string* out_id_; + GetTableSchemaResponsePB resp_; +}; + +GetTableSchemaRpc::GetTableSchemaRpc(KuduClient* client, + StatusCallback user_cb, + string table_name, + KuduSchema* out_schema, + PartitionSchema* out_partition_schema, + string* out_id, + const MonoTime& deadline, + const shared_ptr& 
messenger) + : Rpc(deadline, messenger), + client_(DCHECK_NOTNULL(client)), + user_cb_(std::move(user_cb)), + table_name_(std::move(table_name)), + out_schema_(DCHECK_NOTNULL(out_schema)), + out_partition_schema_(DCHECK_NOTNULL(out_partition_schema)), + out_id_(DCHECK_NOTNULL(out_id)) { +} + +GetTableSchemaRpc::~GetTableSchemaRpc() { +} + +void GetTableSchemaRpc::SendRpc() { + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (retrier().deadline().ComesBefore(now)) { + SendRpcCb(Status::TimedOut("GetTableSchema timed out after deadline expired")); + return; + } + + // See KuduClient::Data::SyncLeaderMasterRpc(). + MonoTime rpc_deadline = now; + rpc_deadline.AddDelta(client_->default_rpc_timeout()); + mutable_retrier()->mutable_controller()->set_deadline( + MonoTime::Earliest(rpc_deadline, retrier().deadline())); + + GetTableSchemaRequestPB req; + req.mutable_table()->set_table_name(table_name_); + client_->data_->master_proxy()->GetTableSchemaAsync( + req, &resp_, + mutable_retrier()->mutable_controller(), + boost::bind(&GetTableSchemaRpc::SendRpcCb, this, Status::OK())); +} + +string GetTableSchemaRpc::ToString() const { + return Substitute("GetTableSchemaRpc(table_name: $0, num_attempts: $1)", + table_name_, num_attempts()); +} + +void GetTableSchemaRpc::ResetLeaderMasterAndRetry() { + client_->data_->SetMasterServerProxyAsync( + client_, + retrier().deadline(), + Bind(&GetTableSchemaRpc::NewLeaderMasterDeterminedCb, + Unretained(this))); +} + +void GetTableSchemaRpc::NewLeaderMasterDeterminedCb(const Status& status) { + if (status.ok()) { + mutable_retrier()->mutable_controller()->Reset(); + SendRpc(); + } else { + LOG(WARNING) << "Failed to determine new Master: " << status.ToString(); + mutable_retrier()->DelayedRetry(this, status); + } +} + +void GetTableSchemaRpc::SendRpcCb(const Status& status) { + Status new_status = status; + if (new_status.ok() && mutable_retrier()->HandleResponse(this, &new_status)) { + return; + } + + if (new_status.ok() && 
resp_.has_error()) { + if (resp_.error().code() == MasterErrorPB::NOT_THE_LEADER || + resp_.error().code() == MasterErrorPB::CATALOG_MANAGER_NOT_INITIALIZED) { + if (client_->IsMultiMaster()) { + LOG(WARNING) << "Leader Master has changed (" + << client_->data_->leader_master_hostport().ToString() + << " is no longer the leader), re-trying..."; + ResetLeaderMasterAndRetry(); + return; + } + } + new_status = StatusFromPB(resp_.error().status()); + } + + if (new_status.IsTimedOut()) { + if (MonoTime::Now(MonoTime::FINE).ComesBefore(retrier().deadline())) { + if (client_->IsMultiMaster()) { + LOG(WARNING) << "Leader Master (" + << client_->data_->leader_master_hostport().ToString() + << ") timed out, re-trying..."; + ResetLeaderMasterAndRetry(); + return; + } + } else { + // Operation deadline expired during this latest RPC. + new_status = new_status.CloneAndPrepend( + "GetTableSchema timed out after deadline expired"); + } + } + + if (new_status.IsNetworkError()) { + if (client_->IsMultiMaster()) { + LOG(WARNING) << "Encountered a network error from the Master(" + << client_->data_->leader_master_hostport().ToString() << "): " + << new_status.ToString() << ", retrying..."; + ResetLeaderMasterAndRetry(); + return; + } + } + + if (new_status.ok()) { + gscoped_ptr schema(new Schema()); + new_status = SchemaFromPB(resp_.schema(), schema.get()); + if (new_status.ok()) { + delete out_schema_->schema_; + out_schema_->schema_ = schema.release(); + new_status = PartitionSchema::FromPB(resp_.partition_schema(), + *out_schema_->schema_, + out_partition_schema_); + + *out_id_ = resp_.table_id(); + CHECK_GT(out_id_->size(), 0) << "Running against a too-old master"; + } + } + if (!new_status.ok()) { + LOG(WARNING) << ToString() << " failed: " << new_status.ToString(); + } + user_cb_.Run(new_status); +} + +} // namespace internal + +Status KuduClient::Data::GetTableSchema(KuduClient* client, + const string& table_name, + const MonoTime& deadline, + KuduSchema* schema, + 
PartitionSchema* partition_schema, + string* table_id) { + Synchronizer sync; + GetTableSchemaRpc rpc(client, + sync.AsStatusCallback(), + table_name, + schema, + partition_schema, + table_id, + deadline, + messenger_); + rpc.SendRpc(); + return sync.Wait(); +} + +void KuduClient::Data::LeaderMasterDetermined(const Status& status, + const HostPort& host_port) { + Sockaddr leader_sock_addr; + Status new_status = status; + if (new_status.ok()) { + new_status = SockaddrFromHostPort(host_port, &leader_sock_addr); + } + + vector cbs; + { + lock_guard l(&leader_master_lock_); + cbs.swap(leader_master_callbacks_); + leader_master_rpc_.reset(); + + if (new_status.ok()) { + leader_master_hostport_ = host_port; + master_proxy_.reset(new MasterServiceProxy(messenger_, leader_sock_addr)); + } + } + + for (const StatusCallback& cb : cbs) { + cb.Run(new_status); + } +} + +Status KuduClient::Data::SetMasterServerProxy(KuduClient* client, + const MonoTime& deadline) { + Synchronizer sync; + SetMasterServerProxyAsync(client, deadline, sync.AsStatusCallback()); + return sync.Wait(); +} + +void KuduClient::Data::SetMasterServerProxyAsync(KuduClient* client, + const MonoTime& deadline, + const StatusCallback& cb) { + DCHECK(deadline.Initialized()); + + vector master_sockaddrs; + for (const string& master_server_addr : master_server_addrs_) { + vector addrs; + Status s; + // TODO: Do address resolution asynchronously as well. + s = ParseAddressList(master_server_addr, master::Master::kDefaultPort, &addrs); + if (!s.ok()) { + cb.Run(s); + return; + } + if (addrs.empty()) { + cb.Run(Status::InvalidArgument(Substitute("No master address specified by '$0'", + master_server_addr))); + return; + } + if (addrs.size() > 1) { + LOG(WARNING) << "Specified master server address '" << master_server_addr << "' " + << "resolved to multiple IPs. Using " << addrs[0].ToString(); + } + master_sockaddrs.push_back(addrs[0]); + } + + // Finding a new master involves a fan-out RPC to each master. 
A single + // RPC timeout's worth of time should be sufficient, though we'll use + // the provided deadline if it's sooner. + MonoTime leader_master_deadline = MonoTime::Now(MonoTime::FINE); + leader_master_deadline.AddDelta(client->default_rpc_timeout()); + MonoTime actual_deadline = MonoTime::Earliest(deadline, leader_master_deadline); + + // This ensures that no more than one GetLeaderMasterRpc is in + // flight at a time -- there isn't much sense in requesting this information + // in parallel, since the requests should end up with the same result. + // Instead, we simply piggy-back onto the existing request by adding our own + // callback to leader_master_callbacks_. + unique_lock l(&leader_master_lock_); + leader_master_callbacks_.push_back(cb); + if (!leader_master_rpc_) { + // No one is sending a request yet - we need to be the one to do it. + leader_master_rpc_.reset(new GetLeaderMasterRpc( + Bind(&KuduClient::Data::LeaderMasterDetermined, + Unretained(this)), + master_sockaddrs, + actual_deadline, + messenger_)); + l.unlock(); + leader_master_rpc_->SendRpc(); + } + + +} + +HostPort KuduClient::Data::leader_master_hostport() const { + lock_guard l(&leader_master_lock_); + return leader_master_hostport_; +} + +shared_ptr KuduClient::Data::master_proxy() const { + lock_guard l(&leader_master_lock_); + return master_proxy_; +} + +uint64_t KuduClient::Data::GetLatestObservedTimestamp() const { + return latest_observed_timestamp_.Load(); +} + +void KuduClient::Data::UpdateLatestObservedTimestamp(uint64_t timestamp) { + latest_observed_timestamp_.StoreMax(timestamp); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/client-internal.h b/src/kudu/client/client-internal.h new file mode 100644 index 000000000000..95716161b160 --- /dev/null +++ b/src/kudu/client/client-internal.h @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_CLIENT_INTERNAL_H +#define KUDU_CLIENT_CLIENT_INTERNAL_H + +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/util/atomic.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/net_util.h" + +namespace kudu { + +class DnsResolver; +class HostPort; + +namespace master { +class AlterTableRequestPB; +class CreateTableRequestPB; +class GetLeaderMasterRpc; +class MasterServiceProxy; +} // namespace master + +namespace rpc { +class Messenger; +class RpcController; +} // namespace rpc + +namespace client { + +class KuduClient::Data { + public: + Data(); + ~Data(); + + // Selects a TS replica from the given RemoteTablet subject + // to liveness and the provided selection criteria and blacklist. + // + // If no appropriate replica can be found, a non-OK status is returned and 'ts' is untouched. + // + // The 'candidates' return parameter indicates tservers that are live and meet the selection + // criteria, but are possibly filtered by the blacklist. This is useful for implementing + // retry logic. 
+ Status GetTabletServer(KuduClient* client, + const scoped_refptr& rt, + ReplicaSelection selection, + const std::set& blacklist, + std::vector* candidates, + internal::RemoteTabletServer** ts); + + Status CreateTable(KuduClient* client, + const master::CreateTableRequestPB& req, + const KuduSchema& schema, + const MonoTime& deadline); + + Status IsCreateTableInProgress(KuduClient* client, + const std::string& table_name, + const MonoTime& deadline, + bool *create_in_progress); + + Status WaitForCreateTableToFinish(KuduClient* client, + const std::string& table_name, + const MonoTime& deadline); + + Status DeleteTable(KuduClient* client, + const std::string& table_name, + const MonoTime& deadline); + + Status AlterTable(KuduClient* client, + const master::AlterTableRequestPB& req, + const MonoTime& deadline); + + Status IsAlterTableInProgress(KuduClient* client, + const std::string& table_name, + const MonoTime& deadline, + bool *alter_in_progress); + + Status WaitForAlterTableToFinish(KuduClient* client, + const std::string& alter_name, + const MonoTime& deadline); + + Status GetTableSchema(KuduClient* client, + const std::string& table_name, + const MonoTime& deadline, + KuduSchema* schema, + PartitionSchema* partition_schema, + std::string* table_id); + + Status InitLocalHostNames(); + + bool IsLocalHostPort(const HostPort& hp) const; + + bool IsTabletServerLocal(const internal::RemoteTabletServer& rts) const; + + // Returns a non-failed replica of the specified tablet based on the provided selection criteria + // and tablet server blacklist. + // + // Returns NULL if there are no valid tablet servers. + internal::RemoteTabletServer* SelectTServer( + const scoped_refptr& rt, + const ReplicaSelection selection, + const std::set& blacklist, + std::vector* candidates) const; + + // Sets 'master_proxy_' from the address specified by + // 'leader_master_hostport_'. Called by + // GetLeaderMasterRpc::SendRpcCb() upon successful completion. 
+ // + // See also: SetMasterServerProxyAsync. + void LeaderMasterDetermined(const Status& status, + const HostPort& host_port); + + // Asynchronously sets 'master_proxy_' to the leader master by + // cycling through servers listed in 'master_server_addrs_' until + // one responds with a Raft configuration that contains the leader + // master or 'deadline' expires. + // + // Invokes 'cb' with the appropriate status when finished. + // + // Works with both a distributed and non-distributed configuration. + void SetMasterServerProxyAsync(KuduClient* client, + const MonoTime& deadline, + const StatusCallback& cb); + + // Synchronous version of SetMasterServerProxyAsync method above. + // + // NOTE: since this uses a Synchronizer, this may not be invoked by + // a method that's on a reactor thread. + // + // TODO (KUDU-492): Get rid of this method and re-factor the client + // to lazily initialize 'master_proxy_'. + Status SetMasterServerProxy(KuduClient* client, + const MonoTime& deadline); + + std::shared_ptr master_proxy() const; + + HostPort leader_master_hostport() const; + + uint64_t GetLatestObservedTimestamp() const; + + void UpdateLatestObservedTimestamp(uint64_t timestamp); + + // Retry 'func' until either: + // + // 1) Methods succeeds on a leader master. + // 2) Method fails for a reason that is not related to network + // errors, timeouts, or leadership issues. + // 3) 'deadline' (if initialized) elapses. + // + // If 'num_attempts' is not NULL, it will be incremented on every + // attempt (successful or not) to call 'func'. + // + // NOTE: 'rpc_timeout' is a per-call timeout, while 'deadline' is a + // per operation deadline. If 'deadline' is not initialized, 'func' is + // retried forever. If 'deadline' expires, 'func_name' is included in + // the resulting Status. 
+ template + Status SyncLeaderMasterRpc( + const MonoTime& deadline, + KuduClient* client, + const ReqClass& req, + RespClass* resp, + int* num_attempts, + const char* func_name, + const boost::function& func); + + std::shared_ptr messenger_; + gscoped_ptr dns_resolver_; + scoped_refptr meta_cache_; + + // Set of hostnames and IPs on the local host. + // This is initialized at client startup. + std::unordered_set local_host_names_; + + // Options the client was built with. + std::vector master_server_addrs_; + MonoDelta default_admin_operation_timeout_; + MonoDelta default_rpc_timeout_; + + // The host port of the leader master. This is set in + // LeaderMasterDetermined, which is invoked as a callback by + // SetMasterServerProxyAsync. + HostPort leader_master_hostport_; + + // Proxy to the leader master. + std::shared_ptr master_proxy_; + + // Ref-counted RPC instance: since 'SetMasterServerProxyAsync' call + // is asynchronous, we need to hold a reference in this class + // itself, as to avoid a "use-after-free" scenario. + scoped_refptr leader_master_rpc_; + std::vector leader_master_callbacks_; + + // Protects 'leader_master_rpc_', 'leader_master_hostport_', + // and master_proxy_ + // + // See: KuduClient::Data::SetMasterServerProxyAsync for a more + // in-depth explanation of why this is needed and how it works. + mutable simple_spinlock leader_master_lock_; + + AtomicInt latest_observed_timestamp_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +// Retry helper, takes a function like: Status funcName(const MonoTime& deadline, bool *retry, ...) +// The function should set the retry flag (default true) if the function should +// be retried again. On retry == false the return status of the function will be +// returned to the caller, otherwise a Status::Timeout() will be returned. +// If the deadline is already expired, no attempt will be made. 
+Status RetryFunc(const MonoTime& deadline, + const std::string& retry_msg, + const std::string& timeout_msg, + const boost::function& func); + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/client-test-util.cc b/src/kudu/client/client-test-util.cc new file mode 100644 index 000000000000..631b73550e44 --- /dev/null +++ b/src/kudu/client/client-test-util.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/client-test-util.h" + +#include + +#include "kudu/gutil/stl_util.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace client { + +void LogSessionErrorsAndDie(const sp::shared_ptr& session, + const Status& s) { + CHECK(!s.ok()); + std::vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + CHECK(!overflow); + + // Log only the first 10 errors. + LOG(INFO) << errors.size() << " failed ops. 
First 10 errors follow"; + int i = 0; + for (const KuduError* e : errors) { + if (i == 10) { + break; + } + LOG(INFO) << "Op " << e->failed_op().ToString() + << " had status " << e->status().ToString(); + i++; + } + CHECK_OK(s); // will fail +} + +void ScanTableToStrings(KuduTable* table, vector* row_strings) { + row_strings->clear(); + KuduScanner scanner(table); + ASSERT_OK(scanner.SetSelection(KuduClient::LEADER_ONLY)); + scanner.SetTimeoutMillis(60000); + ScanToStrings(&scanner, row_strings); +} + +int64_t CountTableRows(KuduTable* table) { + vector rows; + client::ScanTableToStrings(table, &rows); + return rows.size(); +} + +void ScanToStrings(KuduScanner* scanner, vector* row_strings) { + ASSERT_OK(scanner->Open()); + vector rows; + while (scanner->HasMoreRows()) { + ASSERT_OK(scanner->NextBatch(&rows)); + for (const KuduRowResult& row : rows) { + row_strings->push_back(row.ToString()); + } + } +} + +KuduSchema KuduSchemaFromSchema(const Schema& schema) { + return KuduSchema(schema); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/client-test-util.h b/src/kudu/client/client-test-util.h new file mode 100644 index 000000000000..882b9c4e228f --- /dev/null +++ b/src/kudu/client/client-test-util.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_CLIENT_TEST_UTIL_H +#define KUDU_CLIENT_CLIENT_TEST_UTIL_H + +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { +class Schema; + +namespace client { +class KuduSchema; + +// Log any pending errors in the given session, and then crash the current +// process. +void LogSessionErrorsAndDie(const sp::shared_ptr& session, + const Status& s); + +// Flush the given session. If any errors occur, log them and crash +// the process. +inline void FlushSessionOrDie(const sp::shared_ptr& session) { + Status s = session->Flush(); + if (PREDICT_FALSE(!s.ok())) { + LogSessionErrorsAndDie(session, s); + } +} + +// Scans in LEADER_ONLY mode, returning stringified rows in the given vector. +void ScanTableToStrings(KuduTable* table, std::vector* row_strings); + +// Count the number of rows in the table in LEADER_ONLY mode. +int64_t CountTableRows(KuduTable* table); + +void ScanToStrings(KuduScanner* scanner, std::vector* row_strings); + +// Convert a kudu::Schema to a kudu::client::KuduSchema. +KuduSchema KuduSchemaFromSchema(const Schema& schema); + +} // namespace client +} // namespace kudu + +#endif /* KUDU_CLIENT_CLIENT_TEST_UTIL_H */ diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc new file mode 100644 index 000000000000..15d42661a318 --- /dev/null +++ b/src/kudu/client/client-test.cc @@ -0,0 +1,2718 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include + +#include "kudu/client/callbacks.h" +#include "kudu/client/client.h" +#include "kudu/client/client-internal.h" +#include "kudu/client/client-test-util.h" +#include "kudu/client/meta_cache.h" +#include "kudu/client/row_result.h" +#include "kudu/client/scanner-internal.h" +#include "kudu/client/value.h" +#include "kudu/client/write_op.h" +#include "kudu/common/partial_row.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master-test-util.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/scanners.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/metrics.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DECLARE_bool(enable_data_block_fsync); +DECLARE_bool(log_inject_latency); +DECLARE_int32(heartbeat_interval_ms); 
+DECLARE_int32(log_inject_latency_ms_mean); +DECLARE_int32(log_inject_latency_ms_stddev); +DECLARE_int32(master_inject_latency_on_tablet_lookups_ms); +DECLARE_int32(max_create_tablets_per_ts); +DECLARE_int32(scanner_gc_check_interval_us); +DECLARE_int32(scanner_inject_latency_on_each_batch_ms); +DECLARE_int32(scanner_max_batch_size_bytes); +DECLARE_int32(scanner_ttl_ms); +DEFINE_int32(test_scan_num_rows, 1000, "Number of rows to insert and scan"); + +METRIC_DECLARE_counter(rpcs_queue_overflow); + +using std::string; +using std::set; +using std::vector; + +namespace kudu { +namespace client { + +using base::subtle::Atomic32; +using base::subtle::NoBarrier_AtomicIncrement; +using base::subtle::NoBarrier_Load; +using base::subtle::NoBarrier_Store; +using master::CatalogManager; +using master::GetTableLocationsRequestPB; +using master::GetTableLocationsResponsePB; +using master::TabletLocationsPB; +using sp::shared_ptr; +using tablet::TabletPeer; +using tserver::MiniTabletServer; + +class ClientTest : public KuduTest { + public: + ClientTest() { + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("int_val")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("string_val")->Type(KuduColumnSchema::STRING)->Nullable(); + b.AddColumn("non_null_with_default")->Type(KuduColumnSchema::INT32)->NotNull() + ->Default(KuduValue::FromInt(12345)); + CHECK_OK(b.Build(&schema_)); + + FLAGS_enable_data_block_fsync = false; // Keep unit tests fast. + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + // Reduce the TS<->Master heartbeat interval + FLAGS_heartbeat_interval_ms = 10; + FLAGS_scanner_gc_check_interval_us = 50 * 1000; // 50 milliseconds. + + // Start minicluster and wait for tablet servers to connect to master. + cluster_.reset(new MiniCluster(env_.get(), MiniClusterOptions())); + ASSERT_OK(cluster_->Start()); + + // Connect to the cluster. 
+ ASSERT_OK(KuduClientBuilder() + .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr().ToString()) + .Build(&client_)); + + ASSERT_NO_FATAL_FAILURE(CreateTable(kTableName, 1, GenerateSplitRows(), &client_table_)); + ASSERT_NO_FATAL_FAILURE(CreateTable(kTable2Name, 1, vector(), + &client_table2_)); + } + + // Generate a set of split rows for tablets used in this test. + vector GenerateSplitRows() { + vector rows; + KuduPartialRow* row = schema_.NewRow(); + CHECK_OK(row->SetInt32(0, 9)); + rows.push_back(row); + return rows; + } + + virtual void TearDown() OVERRIDE { + if (cluster_) { + cluster_->Shutdown(); + cluster_.reset(); + } + KuduTest::TearDown(); + } + + // Count the rows of a table, checking that the operation succeeds. + // + // Must be public to use as a thread closure. + void CheckRowCount(KuduTable* table) { + CountRowsFromClient(table); + } + + protected: + + static const char *kTableName; + static const char *kTable2Name; + static const int32_t kNoBound; + + string GetFirstTabletId(KuduTable* table) { + GetTableLocationsRequestPB req; + GetTableLocationsResponsePB resp; + req.mutable_table()->set_table_name(table->name()); + CHECK_OK(cluster_->mini_master()->master()->catalog_manager()->GetTableLocations( + &req, &resp)); + CHECK(resp.tablet_locations_size() > 0); + return resp.tablet_locations(0).tablet_id(); + } + + void CheckNoRpcOverflow() { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* server = cluster_->mini_tablet_server(i); + if (server->is_started()) { + ASSERT_EQ(0, server->server()->rpc_server()-> + service_pool("kudu.tserver.TabletServerService")-> + RpcsQueueOverflowMetric()->value()); + } + } + } + + // Inserts 'num_rows' test rows using 'client' + void InsertTestRows(KuduClient* client, KuduTable* table, int num_rows, int first_row = 0) { + shared_ptr session = client->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(10000); + for 
(int i = first_row; i < num_rows + first_row; i++) { + gscoped_ptr insert(BuildTestRow(table, i)); + ASSERT_OK(session->Apply(insert.release())); + } + FlushSessionOrDie(session); + ASSERT_NO_FATAL_FAILURE(CheckNoRpcOverflow()); + } + + // Inserts 'num_rows' using the default client. + void InsertTestRows(KuduTable* table, int num_rows, int first_row = 0) { + InsertTestRows(client_.get(), table, num_rows, first_row); + } + + void UpdateTestRows(KuduTable* table, int lo, int hi) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(10000); + for (int i = lo; i < hi; i++) { + gscoped_ptr update(UpdateTestRow(table, i)); + ASSERT_OK(session->Apply(update.release())); + } + FlushSessionOrDie(session); + ASSERT_NO_FATAL_FAILURE(CheckNoRpcOverflow()); + } + + void DeleteTestRows(KuduTable* table, int lo, int hi) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(10000); + for (int i = lo; i < hi; i++) { + gscoped_ptr del(DeleteTestRow(table, i)); + ASSERT_OK(session->Apply(del.release())) + } + FlushSessionOrDie(session); + ASSERT_NO_FATAL_FAILURE(CheckNoRpcOverflow()); + } + + gscoped_ptr BuildTestRow(KuduTable* table, int index) { + gscoped_ptr insert(table->NewInsert()); + KuduPartialRow* row = insert->mutable_row(); + CHECK_OK(row->SetInt32(0, index)); + CHECK_OK(row->SetInt32(1, index * 2)); + CHECK_OK(row->SetStringCopy(2, Slice(StringPrintf("hello %d", index)))); + CHECK_OK(row->SetInt32(3, index * 3)); + return insert.Pass(); + } + + gscoped_ptr UpdateTestRow(KuduTable* table, int index) { + gscoped_ptr update(table->NewUpdate()); + KuduPartialRow* row = update->mutable_row(); + CHECK_OK(row->SetInt32(0, index)); + CHECK_OK(row->SetInt32(1, index * 2 + 1)); + CHECK_OK(row->SetStringCopy(2, Slice(StringPrintf("hello again %d", index)))); + return update.Pass(); + } + + gscoped_ptr 
DeleteTestRow(KuduTable* table, int index) { + gscoped_ptr del(table->NewDelete()); + KuduPartialRow* row = del->mutable_row(); + CHECK_OK(row->SetInt32(0, index)); + return del.Pass(); + } + + void DoTestScanWithoutPredicates() { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetProjectedColumns({ "key" })); + LOG_TIMING(INFO, "Scanning with no predicates") { + ASSERT_OK(scanner.Open()); + + ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + uint64_t sum = 0; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& row : batch) { + int32_t value; + ASSERT_OK(row.GetInt32(0, &value)); + sum += value; + } + } + // The sum should be the sum of the arithmetic series from + // 0..FLAGS_test_scan_num_rows-1 + uint64_t expected = implicit_cast(FLAGS_test_scan_num_rows) * + (0 + (FLAGS_test_scan_num_rows - 1)) / 2; + ASSERT_EQ(expected, sum); + } + } + + void DoTestScanWithStringPredicate() { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("string_val", KuduPredicate::GREATER_EQUAL, + KuduValue::CopyString("hello 2")))); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("string_val", KuduPredicate::LESS_EQUAL, + KuduValue::CopyString("hello 3")))); + + LOG_TIMING(INFO, "Scanning with string predicate") { + ASSERT_OK(scanner.Open()); + + ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& row : batch) { + Slice s; + ASSERT_OK(row.GetString(2, &s)); + if (!s.starts_with("hello 2") && !s.starts_with("hello 3")) { + FAIL() << row.ToString(); + } + } + } + } + } + + void DoTestScanWithKeyPredicate() { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("key", KuduPredicate::GREATER_EQUAL, + 
KuduValue::FromInt(5)))); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("key", KuduPredicate::LESS_EQUAL, + KuduValue::FromInt(10)))); + + LOG_TIMING(INFO, "Scanning with key predicate") { + ASSERT_OK(scanner.Open()); + + ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& row : batch) { + int32_t k; + ASSERT_OK(row.GetInt32(0, &k)); + if (k < 5 || k > 10) { + FAIL() << row.ToString(); + } + } + } + } + } + + int CountRowsFromClient(KuduTable* table) { + return CountRowsFromClient(table, kNoBound, kNoBound); + } + + int CountRowsFromClient(KuduTable* table, int32_t lower_bound, int32_t upper_bound) { + return CountRowsFromClient(table, KuduClient::LEADER_ONLY, lower_bound, upper_bound); + } + + int CountRowsFromClient(KuduTable* table, KuduClient::ReplicaSelection selection, + int32_t lower_bound, int32_t upper_bound) { + KuduScanner scanner(table); + CHECK_OK(scanner.SetSelection(selection)); + CHECK_OK(scanner.SetProjectedColumns(vector())); + if (lower_bound != kNoBound) { + CHECK_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("key", KuduPredicate::GREATER_EQUAL, + KuduValue::FromInt(lower_bound)))); + } + if (upper_bound != kNoBound) { + CHECK_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("key", KuduPredicate::LESS_EQUAL, + KuduValue::FromInt(upper_bound)))); + } + + CHECK_OK(scanner.Open()); + + int count = 0; + KuduScanBatch batch; + while (scanner.HasMoreRows()) { + CHECK_OK(scanner.NextBatch(&batch)); + count += batch.NumRows(); + } + return count; + } + + // Creates a table with 'num_replicas', split into tablets based on 'split_rows' + // (or single tablet if 'split_rows' is empty). 
+ void CreateTable(const string& table_name, + int num_replicas, + const vector& split_rows, + shared_ptr* table) { + + bool added_replicas = false; + // Add more tablet servers to satisfy all replicas, if necessary. + while (cluster_->num_tablet_servers() < num_replicas) { + ASSERT_OK(cluster_->AddTabletServer()); + added_replicas = true; + } + + if (added_replicas) { + ASSERT_OK(cluster_->WaitForTabletServerCount(num_replicas)); + } + + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(table_name) + .schema(&schema_) + .num_replicas(num_replicas) + .split_rows(split_rows) + .Create()); + + ASSERT_OK(client_->OpenTable(table_name, table)); + } + + // Kills a tablet server. + // Boolean flags control whether to restart the tserver, and if so, whether to wait for it to + // finish bootstrapping. + Status KillTServerImpl(const string& uuid, const bool restart, const bool wait_started) { + bool ts_found = false; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(i); + if (ts->server()->instance_pb().permanent_uuid() == uuid) { + if (restart) { + LOG(INFO) << "Restarting TS at " << ts->bound_rpc_addr().ToString(); + RETURN_NOT_OK(ts->Restart()); + if (wait_started) { + LOG(INFO) << "Waiting for TS " << ts->bound_rpc_addr().ToString() + << " to finish bootstrapping"; + RETURN_NOT_OK(ts->WaitStarted()); + } + } else { + LOG(INFO) << "Killing TS " << uuid << " at " << ts->bound_rpc_addr().ToString(); + ts->Shutdown(); + } + ts_found = true; + break; + } + } + if (!ts_found) { + return Status::InvalidArgument(strings::Substitute("Could not find tablet server $1", uuid)); + } + + return Status::OK(); + } + + Status RestartTServerAndWait(const string& uuid) { + return KillTServerImpl(uuid, true, true); + } + + Status RestartTServerAsync(const string& uuid) { + return KillTServerImpl(uuid, true, false); + } + + Status KillTServer(const string& uuid) { + return 
KillTServerImpl(uuid, false, false); + } + + void DoApplyWithoutFlushTest(int sleep_micros); + + enum WhichServerToKill { + DEAD_MASTER, + DEAD_TSERVER + }; + void DoTestWriteWithDeadServer(WhichServerToKill which); + + KuduSchema schema_; + + gscoped_ptr cluster_; + shared_ptr client_; + shared_ptr client_table_; + shared_ptr client_table2_; +}; + +const char *ClientTest::kTableName = "client-testtb"; +const char *ClientTest::kTable2Name = "client-testtb2"; +const int32_t ClientTest::kNoBound = kint32max; + +TEST_F(ClientTest, TestListTables) { + vector tables; + ASSERT_OK(client_->ListTables(&tables)); + std::sort(tables.begin(), tables.end()); + ASSERT_EQ(string(kTableName), tables[0]); + ASSERT_EQ(string(kTable2Name), tables[1]); + tables.clear(); + ASSERT_OK(client_->ListTables(&tables, "testtb2")); + ASSERT_EQ(1, tables.size()); + ASSERT_EQ(string(kTable2Name), tables[0]); +} + +TEST_F(ClientTest, TestListTabletServers) { + vector tss; + ElementDeleter deleter(&tss); + ASSERT_OK(client_->ListTabletServers(&tss)); + ASSERT_EQ(1, tss.size()); + ASSERT_EQ(cluster_->mini_tablet_server(0)->server()->instance_pb().permanent_uuid(), + tss[0]->uuid()); + ASSERT_EQ(cluster_->mini_tablet_server(0)->server()->first_rpc_address().host(), + tss[0]->hostname()); +} + +TEST_F(ClientTest, TestBadTable) { + shared_ptr t; + Status s = client_->OpenTable("xxx-does-not-exist", &t); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_STR_CONTAINS(s.ToString(), "Not found: The table does not exist"); +} + +// Test that, if the master is down, we experience a network error talking +// to it (no "find the new leader master" since there's only one master). 
+TEST_F(ClientTest, TestMasterDown) { + cluster_->mini_master()->Shutdown(); + shared_ptr t; + client_->data_->default_admin_operation_timeout_ = MonoDelta::FromSeconds(1); + Status s = client_->OpenTable("other-tablet", &t); + ASSERT_TRUE(s.IsNetworkError()); +} + +TEST_F(ClientTest, TestScan) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows( + client_table_.get(), FLAGS_test_scan_num_rows)); + + ASSERT_EQ(FLAGS_test_scan_num_rows, CountRowsFromClient(client_table_.get())); + + // Scan after insert + DoTestScanWithoutPredicates(); + DoTestScanWithStringPredicate(); + DoTestScanWithKeyPredicate(); + + // Scan after update + UpdateTestRows(client_table_.get(), 0, FLAGS_test_scan_num_rows); + DoTestScanWithKeyPredicate(); + + // Scan after delete half + DeleteTestRows(client_table_.get(), 0, FLAGS_test_scan_num_rows / 2); + DoTestScanWithKeyPredicate(); + + // Scan after delete all + DeleteTestRows(client_table_.get(), FLAGS_test_scan_num_rows / 2 + 1, + FLAGS_test_scan_num_rows); + DoTestScanWithKeyPredicate(); + + // Scan after re-insert + InsertTestRows(client_table_.get(), 1); + DoTestScanWithKeyPredicate(); +} + +TEST_F(ClientTest, TestScanAtSnapshot) { + int half_the_rows = FLAGS_test_scan_num_rows / 2; + + // Insert half the rows + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + half_the_rows)); + + // get the time from the server and transform to micros disregarding any + // logical values (we shouldn't have any with a single server anyway); + int64_t ts = server::HybridClock::GetPhysicalValueMicros( + cluster_->mini_tablet_server(0)->server()->clock()->Now()); + + // Insert the second half of the rows + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + half_the_rows, half_the_rows)); + + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.Open()); + uint64_t count = 0; + + // Do a "normal", READ_LATEST scan + KuduScanBatch batch; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + count += 
batch.NumRows(); + } + + ASSERT_EQ(FLAGS_test_scan_num_rows, count); + + // Now close the scanner and perform a scan at 'ts' + scanner.Close(); + ASSERT_OK(scanner.SetReadMode(KuduScanner::READ_AT_SNAPSHOT)); + ASSERT_OK(scanner.SetSnapshotMicros(ts)); + ASSERT_OK(scanner.Open()); + + count = 0; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + count += batch.NumRows(); + } + + ASSERT_EQ(half_the_rows, count); +} + +// Test scanning at a timestamp in the future compared to the +// local clock. If we are within the clock error, this should wait. +// If we are far in the future, we should get an error. +TEST_F(ClientTest, TestScanAtFutureTimestamp) { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetReadMode(KuduScanner::READ_AT_SNAPSHOT)); + + // Try to perform a scan at NowLatest(). This is in the future, + // but the server should wait until it's in the past. + int64_t ts = server::HybridClock::GetPhysicalValueMicros( + cluster_->mini_tablet_server(0)->server()->clock()->NowLatest()); + ASSERT_OK(scanner.SetSnapshotMicros(ts)); + ASSERT_OK(scanner.Open()); + scanner.Close(); + + // Try to perform a scan far in the future (60s -- higher than max clock error). + // This should return an error. + ts += 60 * 1000000; + ASSERT_OK(scanner.SetSnapshotMicros(ts)); + Status s = scanner.Open(); + EXPECT_TRUE(s.IsInvalidArgument()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "in the future."); +} + +TEST_F(ClientTest, TestScanMultiTablet) { + // 5 tablets, each with 10 rows worth of space. 
+ gscoped_ptr row(schema_.NewRow()); + vector rows; + for (int i = 1; i < 5; i++) { + KuduPartialRow* row = schema_.NewRow(); + CHECK_OK(row->SetInt32(0, i * 10)); + rows.push_back(row); + } + gscoped_ptr table_creator(client_->NewTableCreator()); + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable("TestScanMultiTablet", 1, rows, &table)); + + // Insert rows with keys 12, 13, 15, 17, 22, 23, 25, 27...47 into each + // tablet, except the first which is empty. + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(5000); + for (int i = 1; i < 5; i++) { + gscoped_ptr insert; + insert = BuildTestRow(table.get(), 2 + (i * 10)); + ASSERT_OK(session->Apply(insert.release())); + insert = BuildTestRow(table.get(), 3 + (i * 10)); + ASSERT_OK(session->Apply(insert.release())); + insert = BuildTestRow(table.get(), 5 + (i * 10)); + ASSERT_OK(session->Apply(insert.release())); + insert = BuildTestRow(table.get(), 7 + (i * 10)); + ASSERT_OK(session->Apply(insert.release())); + } + FlushSessionOrDie(session); + + // Run through various scans. 
+ ASSERT_EQ(16, CountRowsFromClient(table.get(), kNoBound, kNoBound)); + ASSERT_EQ(3, CountRowsFromClient(table.get(), kNoBound, 15)); + ASSERT_EQ(9, CountRowsFromClient(table.get(), 27, kNoBound)); + ASSERT_EQ(3, CountRowsFromClient(table.get(), 0, 15)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 10)); + ASSERT_EQ(4, CountRowsFromClient(table.get(), 0, 20)); + ASSERT_EQ(8, CountRowsFromClient(table.get(), 0, 30)); + ASSERT_EQ(6, CountRowsFromClient(table.get(), 14, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 30, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 50, kNoBound)); + + // Update every other row + for (int i = 1; i < 5; ++i) { + gscoped_ptr update; + update = UpdateTestRow(table.get(), 2 + i * 10); + ASSERT_OK(session->Apply(update.release())); + update = UpdateTestRow(table.get(), 5 + i * 10); + ASSERT_OK(session->Apply(update.release())); + } + FlushSessionOrDie(session); + + // Check all counts the same (make sure updates don't change # of rows) + ASSERT_EQ(16, CountRowsFromClient(table.get(), kNoBound, kNoBound)); + ASSERT_EQ(3, CountRowsFromClient(table.get(), kNoBound, 15)); + ASSERT_EQ(9, CountRowsFromClient(table.get(), 27, kNoBound)); + ASSERT_EQ(3, CountRowsFromClient(table.get(), 0, 15)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 10)); + ASSERT_EQ(4, CountRowsFromClient(table.get(), 0, 20)); + ASSERT_EQ(8, CountRowsFromClient(table.get(), 0, 30)); + ASSERT_EQ(6, CountRowsFromClient(table.get(), 14, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 30, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 50, kNoBound)); + + // Delete half the rows + for (int i = 1; i < 5; ++i) { + gscoped_ptr del; + del = DeleteTestRow(table.get(), 5 + i*10); + ASSERT_OK(session->Apply(del.release())); + del = DeleteTestRow(table.get(), 7 + i*10); + ASSERT_OK(session->Apply(del.release())); + } + FlushSessionOrDie(session); + + // Check counts changed accordingly + ASSERT_EQ(8, CountRowsFromClient(table.get(), kNoBound, 
kNoBound)); + ASSERT_EQ(2, CountRowsFromClient(table.get(), kNoBound, 15)); + ASSERT_EQ(4, CountRowsFromClient(table.get(), 27, kNoBound)); + ASSERT_EQ(2, CountRowsFromClient(table.get(), 0, 15)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 10)); + ASSERT_EQ(2, CountRowsFromClient(table.get(), 0, 20)); + ASSERT_EQ(4, CountRowsFromClient(table.get(), 0, 30)); + ASSERT_EQ(2, CountRowsFromClient(table.get(), 14, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 30, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 50, kNoBound)); + + // Delete rest of rows + for (int i = 1; i < 5; ++i) { + gscoped_ptr del; + del = DeleteTestRow(table.get(), 2 + i*10); + ASSERT_OK(session->Apply(del.release())); + del = DeleteTestRow(table.get(), 3 + i*10); + ASSERT_OK(session->Apply(del.release())); + } + FlushSessionOrDie(session); + + // Check counts changed accordingly + ASSERT_EQ(0, CountRowsFromClient(table.get(), kNoBound, kNoBound)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), kNoBound, 15)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 27, kNoBound)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 15)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 10)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 20)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 0, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 14, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 30, 30)); + ASSERT_EQ(0, CountRowsFromClient(table.get(), 50, kNoBound)); +} + +TEST_F(ClientTest, TestScanEmptyTable) { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetProjectedColumns(vector())); + ASSERT_OK(scanner.Open()); + + // There are two tablets in the table, both empty. Until we scan to + // the last tablet, HasMoreRows will return true (because it doesn't + // know whether there's data in subsequent tablets). 
+ ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + ASSERT_OK(scanner.NextBatch(&batch)); + ASSERT_EQ(0, batch.NumRows()); + ASSERT_FALSE(scanner.HasMoreRows()); +} + +// Test scanning with an empty projection. This should yield an empty +// row block with the proper number of rows filled in. Impala issues +// scans like this in order to implement COUNT(*). +TEST_F(ClientTest, TestScanEmptyProjection) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + FLAGS_test_scan_num_rows)); + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetProjectedColumns(vector())); + ASSERT_EQ(scanner.GetProjectionSchema().num_columns(), 0); + LOG_TIMING(INFO, "Scanning with no projected columns") { + ASSERT_OK(scanner.Open()); + + ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + uint64_t count = 0; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + count += batch.NumRows(); + } + ASSERT_EQ(FLAGS_test_scan_num_rows, count); + } +} + +TEST_F(ClientTest, TestProjectInvalidColumn) { + KuduScanner scanner(client_table_.get()); + Status s = scanner.SetProjectedColumns({ "column-doesnt-exist" }); + ASSERT_EQ("Not found: Column: \"column-doesnt-exist\" was not found in the table schema.", + s.ToString()); + + // Test trying to use a projection where a column is used multiple times. + // TODO: consider fixing this to support returning the column multiple + // times, even though it's not very useful. + s = scanner.SetProjectedColumns({ "key", "key" }); + ASSERT_EQ("Invalid argument: Duplicate column name: key", s.ToString()); +} + +// Test a scan where we have a predicate on a key column that is not +// in the projection. 
+TEST_F(ClientTest, TestScanPredicateKeyColNotProjected) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + FLAGS_test_scan_num_rows)); + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetProjectedColumns({ "int_val" })); + ASSERT_EQ(scanner.GetProjectionSchema().num_columns(), 1); + ASSERT_EQ(scanner.GetProjectionSchema().Column(0).type(), KuduColumnSchema::INT32); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("key", KuduPredicate::GREATER_EQUAL, + KuduValue::FromInt(5)))); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("key", KuduPredicate::LESS_EQUAL, + KuduValue::FromInt(10)))); + + size_t nrows = 0; + int32_t curr_key = 5; + LOG_TIMING(INFO, "Scanning with predicate columns not projected") { + ASSERT_OK(scanner.Open()); + + ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& row : batch) { + int32_t val; + ASSERT_OK(row.GetInt32(0, &val)); + ASSERT_EQ(curr_key * 2, val); + nrows++; + curr_key++; + } + } + } + ASSERT_EQ(nrows, 6); +} + +// Test a scan where we have a predicate on a non-key column that is +// not in the projection. 
+TEST_F(ClientTest, TestScanPredicateNonKeyColNotProjected) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + FLAGS_test_scan_num_rows)); + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("int_val", KuduPredicate::GREATER_EQUAL, + KuduValue::FromInt(10)))); + ASSERT_OK(scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("int_val", KuduPredicate::LESS_EQUAL, + KuduValue::FromInt(20)))); + + size_t nrows = 0; + int32_t curr_key = 10; + + ASSERT_OK(scanner.SetProjectedColumns({ "key" })); + + LOG_TIMING(INFO, "Scanning with predicate columns not projected") { + ASSERT_OK(scanner.Open()); + + ASSERT_TRUE(scanner.HasMoreRows()); + KuduScanBatch batch; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& row : batch) { + int32_t val; + ASSERT_OK(row.GetInt32(0, &val)); + ASSERT_EQ(curr_key / 2, val); + nrows++; + curr_key += 2; + } + } + } + ASSERT_EQ(nrows, 6); +} + +// Test adding various sorts of invalid binary predicates. +TEST_F(ClientTest, TestInvalidPredicates) { + KuduScanner scanner(client_table_.get()); + + // Predicate on a column that does not exist. + Status s = scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("this-does-not-exist", + KuduPredicate::EQUAL, KuduValue::FromInt(5))); + EXPECT_EQ("Not found: column not found: this-does-not-exist", s.ToString()); + + // Int predicate on a string column. + s = scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("string_val", + KuduPredicate::EQUAL, KuduValue::FromInt(5))); + EXPECT_EQ("Invalid argument: non-string value for string column string_val", + s.ToString()); + + // String predicate on an int column. 
+ s = scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate("int_val", + KuduPredicate::EQUAL, KuduValue::CopyString("x"))); + EXPECT_EQ("Invalid argument: non-int value for int column int_val", + s.ToString()); + + // Out-of-range int predicate on an int column. + s = scanner.AddConjunctPredicate( + client_table_->NewComparisonPredicate( + "int_val", + KuduPredicate::EQUAL, + KuduValue::FromInt(static_cast(MathLimits::kMax) + 10))); + EXPECT_EQ("Invalid argument: value 2147483657 out of range for " + "32-bit signed integer column 'int_val'", s.ToString()); +} + + +// Check that the tserver proxy is reset on close, even for empty tables. +TEST_F(ClientTest, TestScanCloseProxy) { + const string kEmptyTable = "TestScanCloseProxy"; + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable(kEmptyTable, 3, GenerateSplitRows(), &table)); + + { + // Open and close an empty scanner. + KuduScanner scanner(table.get()); + ASSERT_OK(scanner.Open()); + scanner.Close(); + CHECK_EQ(0, scanner.data_->proxy_.use_count()) << "Proxy was not reset!"; + } + + // Insert some test rows. + ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), + FLAGS_test_scan_num_rows)); + { + // Open and close a scanner with rows. + KuduScanner scanner(table.get()); + ASSERT_OK(scanner.Open()); + scanner.Close(); + CHECK_EQ(0, scanner.data_->proxy_.use_count()) << "Proxy was not reset!"; + } +} + +namespace internal { + +static void ReadBatchToStrings(KuduScanner* scanner, vector* rows) { + KuduScanBatch batch; + ASSERT_OK(scanner->NextBatch(&batch)); + for (int i = 0; i < batch.NumRows(); i++) { + rows->push_back(batch.Row(i).ToString()); + } +} + +static void DoScanWithCallback(KuduTable* table, + const vector& expected_rows, + const boost::function& cb) { + // Initialize fault-tolerant snapshot scanner. + KuduScanner scanner(table); + ASSERT_OK(scanner.SetFaultTolerant()); + // Set a small batch size so it reads in multiple batches. 
+ ASSERT_OK(scanner.SetBatchSizeBytes(1)); + + ASSERT_OK(scanner.Open()); + vector rows; + + // Do a first scan to get us started. + { + LOG(INFO) << "Setting up scanner."; + ASSERT_TRUE(scanner.HasMoreRows()); + NO_FATALS(ReadBatchToStrings(&scanner, &rows)); + ASSERT_GT(rows.size(), 0); + ASSERT_TRUE(scanner.HasMoreRows()); + } + + // Call the callback on the tserver serving the scan. + LOG(INFO) << "Calling callback."; + { + KuduTabletServer* kts_ptr; + ASSERT_OK(scanner.GetCurrentServer(&kts_ptr)); + gscoped_ptr kts(kts_ptr); + ASSERT_OK(cb(kts->uuid())); + } + + // Check that we can still read the next batch. + LOG(INFO) << "Checking that we can still read the next batch."; + ASSERT_TRUE(scanner.HasMoreRows()); + ASSERT_OK(scanner.SetBatchSizeBytes(1024*1024)); + while (scanner.HasMoreRows()) { + NO_FATALS(ReadBatchToStrings(&scanner, &rows)); + } + scanner.Close(); + + // Verify results from the scan. + LOG(INFO) << "Verifying results from scan."; + for (int i = 0; i < rows.size(); i++) { + EXPECT_EQ(expected_rows[i], rows[i]); + } + ASSERT_EQ(expected_rows.size(), rows.size()); +} + +} // namespace internal + +// Test that ordered snapshot scans can be resumed in the case of different tablet server failures. +TEST_F(ClientTest, TestScanFaultTolerance) { + // Create test table and insert test rows. + const string kScanTable = "TestScanFaultTolerance"; + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable(kScanTable, 3, vector(), &table)); + ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), FLAGS_test_scan_num_rows)); + + // Do an initial scan to determine the expected rows for later verification. + vector expected_rows; + ScanTableToStrings(table.get(), &expected_rows); + + for (int with_flush = 0; with_flush <= 1; with_flush++) { + SCOPED_TRACE((with_flush == 1) ? "with flush" : "without flush"); + // The second time through, flush to ensure that we test both against MRS and + // disk. 
+ if (with_flush) { + string tablet_id = GetFirstTabletId(table.get()); + for (int i = 0; i < 3; i++) { + scoped_refptr tablet_peer; + ASSERT_TRUE(cluster_->mini_tablet_server(i)->server()->tablet_manager()->LookupTablet( + tablet_id, &tablet_peer)); + ASSERT_OK(tablet_peer->tablet()->Flush()); + } + } + + // Test a few different recoverable server-side error conditions. + // Since these are recoverable, the scan will succeed when retried elsewhere. + + // Restarting and waiting should result in a SCANNER_EXPIRED error. + LOG(INFO) << "Doing a scan while restarting a tserver and waiting for it to come up..."; + ASSERT_NO_FATAL_FAILURE(internal::DoScanWithCallback(table.get(), expected_rows, + boost::bind(&ClientTest_TestScanFaultTolerance_Test::RestartTServerAndWait, + this, _1))); + + // Restarting and not waiting means the tserver is hopefully bootstrapping, leading to + // a TABLET_NOT_RUNNING error. + LOG(INFO) << "Doing a scan while restarting a tserver..."; + ASSERT_NO_FATAL_FAILURE(internal::DoScanWithCallback(table.get(), expected_rows, + boost::bind(&ClientTest_TestScanFaultTolerance_Test::RestartTServerAsync, + this, _1))); + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(i); + ASSERT_OK(ts->WaitStarted()); + } + + // Killing the tserver should lead to an RPC timeout. + LOG(INFO) << "Doing a scan while killing a tserver..."; + ASSERT_NO_FATAL_FAILURE(internal::DoScanWithCallback(table.get(), expected_rows, + boost::bind(&ClientTest_TestScanFaultTolerance_Test::KillTServer, + this, _1))); + + // Restart the server that we killed. 
+ for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(i); + if (!ts->is_started()) { + ASSERT_OK(ts->Start()); + ASSERT_OK(ts->WaitStarted()); + } + } + } +} + +TEST_F(ClientTest, TestGetTabletServerBlacklist) { + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable("blacklist", + 3, + GenerateSplitRows(), + &table)); + InsertTestRows(table.get(), 1, 0); + + // Look up the tablet and its replicas into the metadata cache. + // We have to loop since some replicas may have been created slowly. + scoped_refptr rt; + while (true) { + Synchronizer sync; + client_->data_->meta_cache_->LookupTabletByKey(table.get(), "", MonoTime::Max(), &rt, + sync.AsStatusCallback()); + ASSERT_OK(sync.Wait()); + ASSERT_TRUE(rt.get() != nullptr); + vector tservers; + rt->GetRemoteTabletServers(&tservers); + if (tservers.size() == 3) { + break; + } + rt->MarkStale(); + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + // Get the Leader. + internal::RemoteTabletServer *rts; + set blacklist; + vector candidates; + vector tservers; + ASSERT_OK(client_->data_->GetTabletServer(client_.get(), rt, + KuduClient::LEADER_ONLY, + blacklist, &candidates, &rts)); + tservers.push_back(rts); + // Blacklist the leader, should not work. + blacklist.insert(rts->permanent_uuid()); + { + Status s = client_->data_->GetTabletServer(client_.get(), rt, + KuduClient::LEADER_ONLY, + blacklist, &candidates, &rts); + ASSERT_TRUE(s.IsServiceUnavailable()); + } + // Keep blacklisting replicas until we run out. 
+ ASSERT_OK(client_->data_->GetTabletServer(client_.get(), rt, + KuduClient::CLOSEST_REPLICA, + blacklist, &candidates, &rts)); + tservers.push_back(rts); + blacklist.insert(rts->permanent_uuid()); + ASSERT_OK(client_->data_->GetTabletServer(client_.get(), rt, + KuduClient::FIRST_REPLICA, + blacklist, &candidates, &rts)); + tservers.push_back(rts); + blacklist.insert(rts->permanent_uuid()); + + // Make sure none of the three modes work when all nodes are blacklisted. + vector selections; + selections.push_back(KuduClient::LEADER_ONLY); + selections.push_back(KuduClient::CLOSEST_REPLICA); + selections.push_back(KuduClient::FIRST_REPLICA); + for (KuduClient::ReplicaSelection selection : selections) { + Status s = client_->data_->GetTabletServer(client_.get(), rt, selection, + blacklist, &candidates, &rts); + ASSERT_TRUE(s.IsServiceUnavailable()); + } + + // Make sure none of the modes work when all nodes are dead. + for (internal::RemoteTabletServer* rt : tservers) { + client_->data_->meta_cache_->MarkTSFailed(rt, Status::NetworkError("test")); + } + blacklist.clear(); + for (KuduClient::ReplicaSelection selection : selections) { + Status s = client_->data_->GetTabletServer(client_.get(), rt, + selection, + blacklist, &candidates, &rts); + ASSERT_TRUE(s.IsServiceUnavailable()); + } +} + +TEST_F(ClientTest, TestScanWithEncodedRangePredicate) { + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable("split-table", + 1, /* replicas */ + GenerateSplitRows(), + &table)); + + ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), 100)); + + vector all_rows; + ASSERT_NO_FATAL_FAILURE(ScanTableToStrings(table.get(), &all_rows)); + ASSERT_EQ(100, all_rows.size()); + + gscoped_ptr row(table->schema().NewRow()); + + // Test a double-sided range within first tablet + { + KuduScanner scanner(table.get()); + CHECK_OK(row->SetInt32(0, 5)); + ASSERT_OK(scanner.AddLowerBound(*row)); + CHECK_OK(row->SetInt32(0, 8)); + ASSERT_OK(scanner.AddExclusiveUpperBound(*row)); + vector rows; + 
ASSERT_NO_FATAL_FAILURE(ScanToStrings(&scanner, &rows)); + ASSERT_EQ(8 - 5, rows.size()); + EXPECT_EQ(all_rows[5], rows.front()); + EXPECT_EQ(all_rows[7], rows.back()); + } + + // Test a double-sided range spanning tablets + { + KuduScanner scanner(table.get()); + CHECK_OK(row->SetInt32(0, 5)); + ASSERT_OK(scanner.AddLowerBound(*row)); + CHECK_OK(row->SetInt32(0, 15)); + ASSERT_OK(scanner.AddExclusiveUpperBound(*row)); + vector rows; + ASSERT_NO_FATAL_FAILURE(ScanToStrings(&scanner, &rows)); + ASSERT_EQ(15 - 5, rows.size()); + EXPECT_EQ(all_rows[5], rows.front()); + EXPECT_EQ(all_rows[14], rows.back()); + } + + // Test a double-sided range within second tablet + { + KuduScanner scanner(table.get()); + CHECK_OK(row->SetInt32(0, 15)); + ASSERT_OK(scanner.AddLowerBound(*row)); + CHECK_OK(row->SetInt32(0, 20)); + ASSERT_OK(scanner.AddExclusiveUpperBound(*row)); + vector rows; + ASSERT_NO_FATAL_FAILURE(ScanToStrings(&scanner, &rows)); + ASSERT_EQ(20 - 15, rows.size()); + EXPECT_EQ(all_rows[15], rows.front()); + EXPECT_EQ(all_rows[19], rows.back()); + } + + // Test a lower-bound only range. + { + KuduScanner scanner(table.get()); + CHECK_OK(row->SetInt32(0, 5)); + ASSERT_OK(scanner.AddLowerBound(*row)); + vector rows; + ASSERT_NO_FATAL_FAILURE(ScanToStrings(&scanner, &rows)); + ASSERT_EQ(95, rows.size()); + EXPECT_EQ(all_rows[5], rows.front()); + EXPECT_EQ(all_rows[99], rows.back()); + } + + // Test an upper-bound only range in first tablet. + { + KuduScanner scanner(table.get()); + CHECK_OK(row->SetInt32(0, 5)); + ASSERT_OK(scanner.AddExclusiveUpperBound(*row)); + vector rows; + ASSERT_NO_FATAL_FAILURE(ScanToStrings(&scanner, &rows)); + ASSERT_EQ(5, rows.size()); + EXPECT_EQ(all_rows[0], rows.front()); + EXPECT_EQ(all_rows[4], rows.back()); + } + + // Test an upper-bound only range in second tablet. 
+ { + KuduScanner scanner(table.get()); + CHECK_OK(row->SetInt32(0, 15)); + ASSERT_OK(scanner.AddExclusiveUpperBound(*row)); + vector rows; + ASSERT_NO_FATAL_FAILURE(ScanToStrings(&scanner, &rows)); + ASSERT_EQ(15, rows.size()); + EXPECT_EQ(all_rows[0], rows.front()); + EXPECT_EQ(all_rows[14], rows.back()); + } + +} + +static void AssertScannersDisappear(const tserver::ScannerManager* manager) { + // The Close call is async, so we may have to loop a bit until we see it disappear. + // This loops for ~10sec. Typically it succeeds in only a few milliseconds. + int i = 0; + for (i = 0; i < 500; i++) { + if (manager->CountActiveScanners() == 0) { + LOG(INFO) << "Successfully saw scanner close on iteration " << i; + return; + } + // Sleep 2ms on first few times through, then longer on later iterations. + SleepFor(MonoDelta::FromMilliseconds(i < 10 ? 2 : 20)); + } + FAIL() << "Waited too long for the scanner to close"; +} + +namespace { + +int64_t SumResults(const KuduScanBatch& batch) { + int64_t sum = 0; + for (const KuduScanBatch::RowPtr& row : batch) { + int32_t val; + CHECK_OK(row.GetInt32(0, &val)); + sum += val; + } + return sum; +} + +} // anonymous namespace + +TEST_F(ClientTest, TestScannerKeepAlive) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), 1000)); + // Set the scanner ttl really low + ANNOTATE_BENIGN_RACE(&FLAGS_scanner_ttl_ms, "Set at runtime, for tests."); + FLAGS_scanner_ttl_ms = 100; // 100 milliseconds + // Start a scan but don't get the whole data back + KuduScanner scanner(client_table_.get()); + // This will make sure we have to do multiple NextBatch calls to the second tablet. + ASSERT_OK(scanner.SetBatchSizeBytes(100)); + ASSERT_OK(scanner.Open()); + + KuduScanBatch batch; + int64_t sum = 0; + + ASSERT_TRUE(scanner.HasMoreRows()); + ASSERT_OK(scanner.NextBatch(&batch)); + + // We should get only nine rows back (from the first tablet). 
+ ASSERT_EQ(batch.NumRows(), 9); + sum += SumResults(batch); + + ASSERT_TRUE(scanner.HasMoreRows()); + + // We're in between tablets but even if there isn't a live scanner the client should + // still return OK to the keep alive call. + ASSERT_OK(scanner.KeepAlive()); + + // Start scanning the second tablet, but break as soon as we have some data so that + // we have a live remote scanner on the second tablet. + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + if (batch.NumRows() > 0) break; + } + sum += SumResults(batch); + ASSERT_TRUE(scanner.HasMoreRows()); + + // Now loop while keeping the scanner alive. Each time we loop we sleep 1/2 a scanner + // ttl interval (the garbage collector is running each 50 msecs too.). + for (int i = 0; i < 5; i++) { + SleepFor(MonoDelta::FromMilliseconds(50)); + ASSERT_OK(scanner.KeepAlive()); + } + + // Get a second batch before sleeping/keeping alive some more. This is test for a bug + // where we would only actually perform a KeepAlive() rpc after the first request and + // not on subsequent ones. + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + if (batch.NumRows() > 0) break; + } + + ASSERT_TRUE(scanner.HasMoreRows()); + for (int i = 0; i < 5; i++) { + SleepFor(MonoDelta::FromMilliseconds(50)); + ASSERT_OK(scanner.KeepAlive()); + } + sum += SumResults(batch); + + // Loop to get the remaining rows. + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&batch)); + sum += SumResults(batch); + } + ASSERT_FALSE(scanner.HasMoreRows()); + ASSERT_EQ(sum, 499500); +} + +// Test cleanup of scanners on the server side when closed. 
+TEST_F(ClientTest, TestCloseScanner) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), 10)); + + const tserver::ScannerManager* manager = + cluster_->mini_tablet_server(0)->server()->scanner_manager(); + // Open the scanner, make sure it gets closed right away + { + SCOPED_TRACE("Implicit close"); + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.Open()); + ASSERT_EQ(0, manager->CountActiveScanners()); + scanner.Close(); + AssertScannersDisappear(manager); + } + + // Open the scanner, make sure we see 1 registered scanner. + { + SCOPED_TRACE("Explicit close"); + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetBatchSizeBytes(0)); // won't return data on open + ASSERT_OK(scanner.Open()); + ASSERT_EQ(1, manager->CountActiveScanners()); + scanner.Close(); + AssertScannersDisappear(manager); + } + + { + SCOPED_TRACE("Close when out of scope"); + { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetBatchSizeBytes(0)); + ASSERT_OK(scanner.Open()); + ASSERT_EQ(1, manager->CountActiveScanners()); + } + // Above scanner went out of scope, so the destructor should close asynchronously. + AssertScannersDisappear(manager); + } +} + +TEST_F(ClientTest, TestScanTimeout) { + // If we set the RPC timeout to be 0, we'll time out in the GetTableLocations + // code path and not even discover where the tablet is hosted. + { + client_->data_->default_rpc_timeout_ = MonoDelta::FromSeconds(0); + KuduScanner scanner(client_table_.get()); + Status s = scanner.Open(); + EXPECT_TRUE(s.IsTimedOut()) << s.ToString(); + EXPECT_FALSE(scanner.data_->remote_) << "should not have located any tablet"; + client_->data_->default_rpc_timeout_ = MonoDelta::FromSeconds(5); + } + + // Warm the cache so that the subsequent timeout occurs within the scan, + // not the lookup. + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), 1)); + + // The "overall operation" timed out; no replicas failed. 
+ { + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetTimeoutMillis(0)); + ASSERT_TRUE(scanner.Open().IsTimedOut()); + ASSERT_TRUE(scanner.data_->remote_) << "We should have located a tablet"; + ASSERT_EQ(0, scanner.data_->remote_->GetNumFailedReplicas()); + } + + // Insert some more rows so that the scan takes multiple batches, instead of + // fetching all the data on the 'Open()' call. + client_->data_->default_rpc_timeout_ = MonoDelta::FromSeconds(5); + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), 1000, 1)); + { + google::FlagSaver saver; + FLAGS_scanner_max_batch_size_bytes = 100; + KuduScanner scanner(client_table_.get()); + + // Set the single-RPC timeout low. Since we only have a single replica of this + // table, we'll ignore this timeout for the actual scan calls, and use the + // scanner timeout instead. + FLAGS_scanner_inject_latency_on_each_batch_ms = 50; + client_->data_->default_rpc_timeout_ = MonoDelta::FromMilliseconds(1); + scanner.SetTimeoutMillis(5000); + + // Should successfully scan. + ASSERT_OK(scanner.Open()); + ASSERT_TRUE(scanner.HasMoreRows()); + while (scanner.HasMoreRows()) { + KuduScanBatch batch; + ASSERT_OK(scanner.NextBatch(&batch)); + } + } +} + +static gscoped_ptr GetSingleErrorFromSession(KuduSession* session) { + CHECK_EQ(1, session->CountPendingErrors()); + vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + CHECK(!overflow); + CHECK_EQ(1, errors.size()); + KuduError* error = errors[0]; + errors.clear(); + return gscoped_ptr(error); +} + +// Simplest case of inserting through the client API: a single row +// with manual batching. 
+TEST_F(ClientTest, TestInsertSingleRowManualBatch) { + shared_ptr session = client_->NewSession(); + ASSERT_FALSE(session->HasPendingOperations()); + + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + gscoped_ptr insert(client_table_->NewInsert()); + // Try inserting without specifying a key: should fail. + ASSERT_OK(insert->mutable_row()->SetInt32("int_val", 54321)); + ASSERT_OK(insert->mutable_row()->SetStringCopy("string_val", "hello world")); + + KuduInsert* ptr = insert.get(); + Status s = session->Apply(insert.release()); + ASSERT_EQ("Illegal state: Key not specified: " + "INSERT int32 int_val=54321, string string_val=hello world", + s.ToString()); + + // Get error + ASSERT_EQ(session->CountPendingErrors(), 1) << "Should report bad key to error container"; + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + KuduWriteOperation* failed_op = error->release_failed_op(); + ASSERT_EQ(failed_op, ptr) << "Should be able to retrieve failed operation"; + insert.reset(ptr); + + // Retry + ASSERT_OK(insert->mutable_row()->SetInt32("key", 12345)); + ASSERT_OK(session->Apply(insert.release())); + ASSERT_TRUE(insert == nullptr) << "Successful insert should take ownership"; + ASSERT_TRUE(session->HasPendingOperations()) << "Should be pending until we Flush"; + + FlushSessionOrDie(session); +} + +static Status ApplyInsertToSession(KuduSession* session, + const shared_ptr& table, + int row_key, + int int_val, + const char* string_val) { + gscoped_ptr insert(table->NewInsert()); + RETURN_NOT_OK(insert->mutable_row()->SetInt32("key", row_key)); + RETURN_NOT_OK(insert->mutable_row()->SetInt32("int_val", int_val)); + RETURN_NOT_OK(insert->mutable_row()->SetStringCopy("string_val", string_val)); + return session->Apply(insert.release()); +} + +static Status ApplyUpdateToSession(KuduSession* session, + const shared_ptr& table, + int row_key, + int int_val) { + gscoped_ptr update(table->NewUpdate()); + 
RETURN_NOT_OK(update->mutable_row()->SetInt32("key", row_key)); + RETURN_NOT_OK(update->mutable_row()->SetInt32("int_val", int_val)); + return session->Apply(update.release()); +} + +static Status ApplyDeleteToSession(KuduSession* session, + const shared_ptr& table, + int row_key) { + gscoped_ptr del(table->NewDelete()); + RETURN_NOT_OK(del->mutable_row()->SetInt32("key", row_key)); + return session->Apply(del.release()); +} + +TEST_F(ClientTest, TestWriteTimeout) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // First time out the lookup on the master side. + { + google::FlagSaver saver; + FLAGS_master_inject_latency_on_tablet_lookups_ms = 110; + session->SetTimeoutMillis(100); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row")); + Status s = session->Flush(); + ASSERT_TRUE(s.IsIOError()) << "unexpected status: " << s.ToString(); + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString(); + ASSERT_STR_CONTAINS(error->status().ToString(), + "GetTableLocations(client-testtb, int32 key=1, 1) " + "failed: timed out after deadline expired"); + } + + // Next time out the actual write on the tablet server. 
+ { + google::FlagSaver saver; + FLAGS_log_inject_latency = true; + FLAGS_log_inject_latency_ms_mean = 110; + FLAGS_log_inject_latency_ms_stddev = 0; + + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row")); + Status s = session->Flush(); + ASSERT_TRUE(s.IsIOError()); + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + ASSERT_TRUE(error->status().IsTimedOut()) << error->status().ToString(); + ASSERT_STR_CONTAINS(error->status().ToString(), + "Failed to write batch of 1 ops to tablet"); + ASSERT_STR_CONTAINS(error->status().ToString(), "Write RPC to 127.0.0.1:"); + ASSERT_STR_CONTAINS(error->status().ToString(), "after 1 attempt"); + } +} + +// Test which does an async flush and then drops the reference +// to the Session. This should still call the callback. +TEST_F(ClientTest, TestAsyncFlushResponseAfterSessionDropped) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row")); + Synchronizer s; + KuduStatusMemberCallback cb(&s, &Synchronizer::StatusCB); + session->FlushAsync(&cb); + session.reset(); + ASSERT_OK(s.Wait()); + + // Try again, this time with an error response (trying to re-insert the same row). + s.Reset(); + session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row")); + ASSERT_EQ(1, session->CountBufferedOperations()); + session->FlushAsync(&cb); + ASSERT_EQ(0, session->CountBufferedOperations()); + session.reset(); + ASSERT_FALSE(s.Wait().ok()); +} + +TEST_F(ClientTest, TestSessionClose) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "row")); + // Closing the session now should return Status::IllegalState since we + // have a pending operation. 
+ ASSERT_TRUE(session->Close().IsIllegalState()); + + Synchronizer s; + KuduStatusMemberCallback cb(&s, &Synchronizer::StatusCB); + session->FlushAsync(&cb); + ASSERT_OK(s.Wait()); + + ASSERT_OK(session->Close()); +} + +// Test which sends multiple batches through the same session, each of which +// contains multiple rows spread across multiple tablets. +TEST_F(ClientTest, TestMultipleMultiRowManualBatches) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + const int kNumBatches = 5; + const int kRowsPerBatch = 10; + + int row_key = 0; + + for (int batch_num = 0; batch_num < kNumBatches; batch_num++) { + for (int i = 0; i < kRowsPerBatch; i++) { + ASSERT_OK(ApplyInsertToSession( + session.get(), + (row_key % 2 == 0) ? client_table_ : client_table2_, + row_key, row_key * 10, "hello world")); + row_key++; + } + ASSERT_TRUE(session->HasPendingOperations()) << "Should be pending until we Flush"; + FlushSessionOrDie(session); + ASSERT_FALSE(session->HasPendingOperations()) << "Should have no more pending ops after flush"; + } + + const int kNumRowsPerTablet = kNumBatches * kRowsPerBatch / 2; + ASSERT_EQ(kNumRowsPerTablet, CountRowsFromClient(client_table_.get())); + ASSERT_EQ(kNumRowsPerTablet, CountRowsFromClient(client_table2_.get())); + + // Verify the data looks right. + vector rows; + ScanTableToStrings(client_table_.get(), &rows); + std::sort(rows.begin(), rows.end()); + ASSERT_EQ(kNumRowsPerTablet, rows.size()); + ASSERT_EQ("(int32 key=0, int32 int_val=0, string string_val=hello world, " + "int32 non_null_with_default=12345)" + , rows[0]); +} + +// Test a batch where one of the inserted rows succeeds while another +// fails. 
+TEST_F(ClientTest, TestBatchWithPartialError) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // Insert a row with key "1" + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "original row")); + FlushSessionOrDie(session); + + // Now make a batch that has key "1" (which will fail) along with + // key "2" which will succeed. Flushing should return an error. + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "Attempted dup")); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 2, 1, "Should succeed")); + Status s = session->Flush(); + ASSERT_FALSE(s.ok()); + ASSERT_STR_CONTAINS(s.ToString(), "Some errors occurred"); + + // Fetch and verify the reported error. + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + ASSERT_TRUE(error->status().IsAlreadyPresent()); + ASSERT_EQ(error->failed_op().ToString(), + "INSERT int32 key=1, int32 int_val=1, string string_val=Attempted dup"); + + // Verify that the other row was successfully inserted + vector rows; + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(2, rows.size()); + std::sort(rows.begin(), rows.end()); + ASSERT_EQ("(int32 key=1, int32 int_val=1, string string_val=original row, " + "int32 non_null_with_default=12345)", rows[0]); + ASSERT_EQ("(int32 key=2, int32 int_val=1, string string_val=Should succeed, " + "int32 non_null_with_default=12345)", rows[1]); +} + +// Test flushing an empty batch (should be a no-op). +TEST_F(ClientTest, TestEmptyBatch) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + FlushSessionOrDie(session); +} + +void ClientTest::DoTestWriteWithDeadServer(WhichServerToKill which) { + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(1000); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // Shut down the server. 
+ switch (which) { + case DEAD_MASTER: + cluster_->mini_master()->Shutdown(); + break; + case DEAD_TSERVER: + cluster_->mini_tablet_server(0)->Shutdown(); + break; + } + + // Try a write. + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "x")); + Status s = session->Flush(); + ASSERT_TRUE(s.IsIOError()) << s.ToString(); + + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + switch (which) { + case DEAD_MASTER: + // Only one master, so no retry for finding the new leader master. + ASSERT_TRUE(error->status().IsNetworkError()); + break; + case DEAD_TSERVER: + ASSERT_TRUE(error->status().IsTimedOut()); + ASSERT_STR_CONTAINS(error->status().ToString(), "Connection refused"); + break; + } + + ASSERT_EQ(error->failed_op().ToString(), + "INSERT int32 key=1, int32 int_val=1, string string_val=x"); +} + +// Test error handling cases where the master is down (tablet resolution fails) +TEST_F(ClientTest, TestWriteWithDeadMaster) { + client_->data_->default_admin_operation_timeout_ = MonoDelta::FromSeconds(1); + DoTestWriteWithDeadServer(DEAD_MASTER); +} + +// Test error handling when the TS is down (actual write fails its RPC) +TEST_F(ClientTest, TestWriteWithDeadTabletServer) { + DoTestWriteWithDeadServer(DEAD_TSERVER); +} + +void ClientTest::DoApplyWithoutFlushTest(int sleep_micros) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "x")); + SleepFor(MonoDelta::FromMicroseconds(sleep_micros)); + session.reset(); // should not crash! + + // Should have no rows. + vector rows; + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); +} + + +// Applies some updates to the session, and then drops the reference to the +// Session before flushing. Makes sure that the tablet resolution callbacks +// properly deal with the session disappearing underneath. 
+// +// This test doesn't sleep between applying the operations and dropping the +// reference, in hopes that the reference will be dropped while DNS is still +// in-flight, etc. +TEST_F(ClientTest, TestApplyToSessionWithoutFlushing_OpsInFlight) { + DoApplyWithoutFlushTest(0); +} + +// Same as the above, but sleeps a little bit after applying the operations, +// so that the operations are already in the per-TS-buffer. +TEST_F(ClientTest, TestApplyToSessionWithoutFlushing_OpsBuffered) { + DoApplyWithoutFlushTest(10000); +} + +// Apply a large amount of data without calling Flush(), and ensure +// that we get an error on Apply() rather than sending a too-large +// RPC to the server. +TEST_F(ClientTest, TestApplyTooMuchWithoutFlushing) { + + // Applying a bunch of small rows without a flush should result + // in an error. + { + bool got_expected_error = false; + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + for (int i = 0; i < 1000000; i++) { + Status s = ApplyInsertToSession(session.get(), client_table_, 1, 1, "x"); + if (s.IsIncomplete()) { + ASSERT_STR_CONTAINS(s.ToString(), "not enough space remaining in buffer"); + got_expected_error = true; + break; + } else { + ASSERT_OK(s); + } + } + ASSERT_TRUE(got_expected_error); + } + + // Writing a single very large row should also result in an error. 
+ { + string huge_string(10 * 1024 * 1024, 'x'); + + shared_ptr session = client_->NewSession(); + Status s = ApplyInsertToSession(session.get(), client_table_, 1, 1, huge_string.c_str()); + ASSERT_TRUE(s.IsIncomplete()) << "got unexpected status: " << s.ToString(); + } +} + +// Test that update updates and delete deletes with expected use +TEST_F(ClientTest, TestMutationsWork) { + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "original row")); + FlushSessionOrDie(session); + + ASSERT_OK(ApplyUpdateToSession(session.get(), client_table_, 1, 2)); + FlushSessionOrDie(session); + vector rows; + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 key=1, int32 int_val=2, string string_val=original row, " + "int32 non_null_with_default=12345)", rows[0]); + rows.clear(); + + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 1)); + FlushSessionOrDie(session); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); +} + +TEST_F(ClientTest, TestMutateDeletedRow) { + vector rows; + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "original row")); + FlushSessionOrDie(session); + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 1)); + FlushSessionOrDie(session); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); + + // Attempt update deleted row + ASSERT_OK(ApplyUpdateToSession(session.get(), client_table_, 1, 2)); + Status s = session->Flush(); + ASSERT_FALSE(s.ok()); + ASSERT_STR_CONTAINS(s.ToString(), "Some errors occurred"); + // verify error + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + ASSERT_EQ(error->failed_op().ToString(), + "UPDATE int32 key=1, int32 int_val=2"); + 
ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); + + // Attempt delete deleted row + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 1)); + s = session->Flush(); + ASSERT_FALSE(s.ok()); + ASSERT_STR_CONTAINS(s.ToString(), "Some errors occurred"); + // verify error + error = GetSingleErrorFromSession(session.get()); + ASSERT_EQ(error->failed_op().ToString(), + "DELETE int32 key=1"); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); +} + +TEST_F(ClientTest, TestMutateNonexistentRow) { + vector rows; + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // Attempt update nonexistent row + ASSERT_OK(ApplyUpdateToSession(session.get(), client_table_, 1, 2)); + Status s = session->Flush(); + ASSERT_FALSE(s.ok()); + ASSERT_STR_CONTAINS(s.ToString(), "Some errors occurred"); + // verify error + gscoped_ptr error = GetSingleErrorFromSession(session.get()); + ASSERT_EQ(error->failed_op().ToString(), + "UPDATE int32 key=1, int32 int_val=2"); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); + + // Attempt delete nonexistent row + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 1)); + s = session->Flush(); + ASSERT_FALSE(s.ok()); + ASSERT_STR_CONTAINS(s.ToString(), "Some errors occurred"); + // verify error + error = GetSingleErrorFromSession(session.get()); + ASSERT_EQ(error->failed_op().ToString(), + "DELETE int32 key=1"); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); +} + +TEST_F(ClientTest, TestWriteWithBadColumn) { + shared_ptr table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + + // Try to do a write with the bad schema. 
+ shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + gscoped_ptr insert(table->NewInsert()); + ASSERT_OK(insert->mutable_row()->SetInt32("key", 12345)); + Status s = insert->mutable_row()->SetInt32("bad_col", 12345); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_STR_CONTAINS(s.ToString(), "No such column: bad_col"); +} + +// Do a write with a bad schema on the client side. This should make the Prepare +// phase of the write fail, which will result in an error on the RPC response. +TEST_F(ClientTest, TestWriteWithBadSchema) { + shared_ptr table; + ASSERT_OK(client_->OpenTable(kTableName, &table)); + + // Remove the 'int_val' column. + // Now the schema on the client is "old" + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer + ->DropColumn("int_val") + ->Alter()); + + // Try to do a write with the bad schema. + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, + 12345, 12345, "x")); + Status s = session->Flush(); + ASSERT_FALSE(s.ok()); + + // Verify the specific error. 
+ gscoped_ptr error = GetSingleErrorFromSession(session.get()); + ASSERT_TRUE(error->status().IsInvalidArgument()); + ASSERT_STR_CONTAINS(error->status().ToString(), + "Client provided column int_val[int32 NOT NULL] " + "not present in tablet"); + ASSERT_EQ(error->failed_op().ToString(), + "INSERT int32 key=12345, int32 int_val=12345, string string_val=x"); +} + +TEST_F(ClientTest, TestBasicAlterOperations) { + // test that having no steps throws an error + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + Status s = table_alterer->Alter(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "No alter steps provided"); + } + + // test that adding a non-nullable column with no default value throws an error + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull(); + Status s = table_alterer->Alter(); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "column `key`: NOT NULL columns must have a default"); + } + + // test that remove key should throws an error + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + Status s = table_alterer + ->DropColumn("key") + ->Alter(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "cannot remove a key column"); + } + + // test that renaming a key should throws an error + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->AlterColumn("key")->RenameTo("key2"); + Status s = table_alterer->Alter(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "cannot rename a key column"); + } + + // test that renaming to an already-existing name throws an error + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->AlterColumn("int_val")->RenameTo("string_val"); + Status s = table_alterer->Alter(); + ASSERT_TRUE(s.IsAlreadyPresent()); + 
ASSERT_STR_CONTAINS(s.ToString(), "The column already exists: string_val"); + } + + // Need a tablet peer for the next set of tests. + string tablet_id = GetFirstTabletId(client_table_.get()); + scoped_refptr tablet_peer; + ASSERT_TRUE(cluster_->mini_tablet_server(0)->server()->tablet_manager()->LookupTablet( + tablet_id, &tablet_peer)); + + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->DropColumn("int_val") + ->AddColumn("new_col")->Type(KuduColumnSchema::INT32); + ASSERT_OK(table_alterer->Alter()); + ASSERT_EQ(1, tablet_peer->tablet()->metadata()->schema_version()); + } + + // test that specifying an encoding incompatible with the column's + // type throws an error + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->AddColumn("new_string_val")->Type(KuduColumnSchema::STRING) + ->Encoding(KuduColumnStorageAttributes::GROUP_VARINT); + Status s = table_alterer->Alter(); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_STR_CONTAINS(s.ToString(), "Unsupported type/encoding pair"); + ASSERT_EQ(1, tablet_peer->tablet()->metadata()->schema_version()); + } + + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->AddColumn("new_string_val")->Type(KuduColumnSchema::STRING) + ->Encoding(KuduColumnStorageAttributes::PREFIX_ENCODING); + ASSERT_OK(table_alterer->Alter()); + ASSERT_EQ(2, tablet_peer->tablet()->metadata()->schema_version()); + } + + { + const char *kRenamedTableName = "RenamedTable"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer + ->RenameTo(kRenamedTableName) + ->Alter()); + ASSERT_EQ(3, tablet_peer->tablet()->metadata()->schema_version()); + ASSERT_EQ(kRenamedTableName, tablet_peer->tablet()->metadata()->table_name()); + + CatalogManager *catalog_manager = cluster_->mini_master()->master()->catalog_manager(); + ASSERT_TRUE(catalog_manager->TableNameExists(kRenamedTableName)); + 
ASSERT_FALSE(catalog_manager->TableNameExists(kTableName)); + } +} + +TEST_F(ClientTest, TestDeleteTable) { + // Open the table before deleting it. + ASSERT_OK(client_->OpenTable(kTableName, &client_table_)); + + // Insert a few rows, and scan them back. This is to populate the MetaCache. + NO_FATALS(InsertTestRows(client_.get(), client_table_.get(), 10)); + vector rows; + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(10, rows.size()); + + // Remove the table + // NOTE that it returns when the operation is completed on the master side + string tablet_id = GetFirstTabletId(client_table_.get()); + ASSERT_OK(client_->DeleteTable(kTableName)); + CatalogManager *catalog_manager = cluster_->mini_master()->master()->catalog_manager(); + ASSERT_FALSE(catalog_manager->TableNameExists(kTableName)); + + // Wait until the table is removed from the TS + int wait_time = 1000; + bool tablet_found = true; + for (int i = 0; i < 80 && tablet_found; ++i) { + scoped_refptr tablet_peer; + tablet_found = cluster_->mini_tablet_server(0)->server()->tablet_manager()->LookupTablet( + tablet_id, &tablet_peer); + SleepFor(MonoDelta::FromMicroseconds(wait_time)); + wait_time = std::min(wait_time * 5 / 4, 1000000); + } + ASSERT_FALSE(tablet_found); + + // Try to open the deleted table + Status s = client_->OpenTable(kTableName, &client_table_); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_STR_CONTAINS(s.ToString(), "The table does not exist"); + + // Create a new table with the same name. This is to ensure that the client + // doesn't cache anything inappropriately by table name (see KUDU-1055). + NO_FATALS(CreateTable(kTableName, 1, GenerateSplitRows(), &client_table_)); + + // Should be able to insert successfully into the new table. 
+ NO_FATALS(InsertTestRows(client_.get(), client_table_.get(), 10)); +} + +TEST_F(ClientTest, TestGetTableSchema) { + KuduSchema schema; + + // Verify the schema for the current table + ASSERT_OK(client_->GetTableSchema(kTableName, &schema)); + ASSERT_TRUE(schema_.Equals(schema)); + + // Verify that a get schema request for a missing table throws not found + Status s = client_->GetTableSchema("MissingTableName", &schema); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_STR_CONTAINS(s.ToString(), "The table does not exist"); +} + +TEST_F(ClientTest, TestStaleLocations) { + string tablet_id = GetFirstTabletId(client_table2_.get()); + + // The Tablet is up and running the location should not be stale + master::TabletLocationsPB locs_pb; + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTabletLocations( + tablet_id, &locs_pb)); + ASSERT_FALSE(locs_pb.stale()); + + // On Master restart and no tablet report we expect the locations to be stale + cluster_->mini_tablet_server(0)->Shutdown(); + ASSERT_OK(cluster_->mini_master()->Restart()); + ASSERT_OK(cluster_->mini_master()->master()-> + WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5))); + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTabletLocations( + tablet_id, &locs_pb)); + ASSERT_TRUE(locs_pb.stale()); + + // Restart the TS and Wait for the tablets to be reported to the master. + ASSERT_OK(cluster_->mini_tablet_server(0)->Start()); + ASSERT_OK(cluster_->WaitForTabletServerCount(1)); + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTabletLocations( + tablet_id, &locs_pb)); + + // It may take a while to bootstrap the tablet and send the location report + // so spin until we get a non-stale location. 
+ int wait_time = 1000; + for (int i = 0; i < 80; ++i) { + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTabletLocations( + tablet_id, &locs_pb)); + if (!locs_pb.stale()) { + break; + } + SleepFor(MonoDelta::FromMicroseconds(wait_time)); + wait_time = std::min(wait_time * 5 / 4, 1000000); + } + ASSERT_FALSE(locs_pb.stale()); +} + +// Test creating and accessing a table which has multiple tablets, +// each of which is replicated. +// +// TODO: this should probably be the default for _all_ of the tests +// in this file. However, some things like alter table are not yet +// working on replicated tables - see KUDU-304 +TEST_F(ClientTest, TestReplicatedMultiTabletTable) { + const string kReplicatedTable = "replicated"; + const int kNumRowsToWrite = 100; + const int kNumReplicas = 3; + + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable(kReplicatedTable, + kNumReplicas, + GenerateSplitRows(), + &table)); + + // Should have no rows to begin with. + ASSERT_EQ(0, CountRowsFromClient(table.get())); + + // Insert some data. + ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), kNumRowsToWrite)); + + // Should now see the data. + ASSERT_EQ(kNumRowsToWrite, CountRowsFromClient(table.get())); + + // TODO: once leader re-election is in, should somehow force a re-election + // and ensure that the client handles refreshing the leader. +} + +TEST_F(ClientTest, TestReplicatedMultiTabletTableFailover) { + const string kReplicatedTable = "replicated_failover_on_reads"; + const int kNumRowsToWrite = 100; + const int kNumReplicas = 3; + const int kNumTries = 100; + + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable(kReplicatedTable, + kNumReplicas, + GenerateSplitRows(), + &table)); + + // Insert some data. + ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), kNumRowsToWrite)); + + // Find the leader of the first tablet. 
+ Synchronizer sync; + scoped_refptr rt; + client_->data_->meta_cache_->LookupTabletByKey(table.get(), "", + MonoTime::Max(), + &rt, sync.AsStatusCallback()); + ASSERT_OK(sync.Wait()); + internal::RemoteTabletServer *rts = rt->LeaderTServer(); + + // Kill the leader of the first tablet. + ASSERT_OK(KillTServer(rts->permanent_uuid())); + + // We wait until we fail over to the new leader(s). + int tries = 0; + for (;;) { + tries++; + int num_rows = CountRowsFromClient(table.get(), + KuduClient::LEADER_ONLY, + kNoBound, kNoBound); + if (num_rows == kNumRowsToWrite) { + LOG(INFO) << "Found expected number of rows: " << num_rows; + break; + } else { + LOG(INFO) << "Only found " << num_rows << " rows on try " + << tries << ", retrying"; + ASSERT_LE(tries, kNumTries); + SleepFor(MonoDelta::FromMilliseconds(10 * tries)); // sleep a bit more with each attempt. + } + } +} + +// This test that we can keep writing to a tablet when the leader +// tablet dies. +// This currently forces leader promotion through RPC and creates +// a new client afterwards. +// TODO Remove the leader promotion part when we have automated +// leader election. +TEST_F(ClientTest, TestReplicatedTabletWritesWithLeaderElection) { + const string kReplicatedTable = "replicated_failover_on_writes"; + const int kNumRowsToWrite = 100; + const int kNumReplicas = 3; + + shared_ptr table; + ASSERT_NO_FATAL_FAILURE(CreateTable(kReplicatedTable, + kNumReplicas, + vector(), + &table)); + + // Insert some data. + ASSERT_NO_FATAL_FAILURE(InsertTestRows(table.get(), kNumRowsToWrite)); + + // TODO: we have to sleep here to make sure that the leader has time to + // propagate the writes to the followers. We can remove this once the + // followers run a leader election on their own and handle advancing + // the commit index. 
+ SleepFor(MonoDelta::FromMilliseconds(1500)); + + // Find the leader replica + Synchronizer sync; + scoped_refptr rt; + client_->data_->meta_cache_->LookupTabletByKey(table.get(), "", + MonoTime::Max(), + &rt, sync.AsStatusCallback()); + ASSERT_OK(sync.Wait()); + internal::RemoteTabletServer *rts; + set blacklist; + vector candidates; + ASSERT_OK(client_->data_->GetTabletServer(client_.get(), + rt, + KuduClient::LEADER_ONLY, + blacklist, + &candidates, + &rts)); + + string killed_uuid = rts->permanent_uuid(); + // Kill the tserver that is serving the leader tablet. + ASSERT_OK(KillTServer(killed_uuid)); + + // Since we waited before, hopefully all replicas will be up to date + // and we can just promote another replica. + std::shared_ptr client_messenger; + rpc::MessengerBuilder bld("client"); + ASSERT_OK(bld.Build(&client_messenger)); + gscoped_ptr new_leader_proxy; + + int new_leader_idx = -1; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(i); + if (ts->is_started()) { + const string& uuid = ts->server()->instance_pb().permanent_uuid(); + if (uuid != killed_uuid) { + new_leader_idx = i; + break; + } + } + } + ASSERT_NE(-1, new_leader_idx); + + MiniTabletServer* new_leader = cluster_->mini_tablet_server(new_leader_idx); + ASSERT_TRUE(new_leader != nullptr); + new_leader_proxy.reset( + new consensus::ConsensusServiceProxy(client_messenger, + new_leader->bound_rpc_addr())); + + consensus::RunLeaderElectionRequestPB req; + consensus::RunLeaderElectionResponsePB resp; + rpc::RpcController controller; + + LOG(INFO) << "Promoting server at index " << new_leader_idx << " listening at " + << new_leader->bound_rpc_addr().ToString() << " ..."; + req.set_dest_uuid(new_leader->server()->fs_manager()->uuid()); + req.set_tablet_id(rt->tablet_id()); + ASSERT_OK(new_leader_proxy->RunLeaderElection(req, &resp, &controller)); + ASSERT_FALSE(resp.has_error()) << "Got error. 
Response: " << resp.ShortDebugString(); + + LOG(INFO) << "Inserting additional rows..."; + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_.get(), + table.get(), + kNumRowsToWrite, + kNumRowsToWrite)); + + // TODO: we have to sleep here to make sure that the leader has time to + // propagate the writes to the followers. We can remove this once the + // followers run a leader election on their own and handle advancing + // the commit index. + SleepFor(MonoDelta::FromMilliseconds(1500)); + + LOG(INFO) << "Counting rows..."; + ASSERT_EQ(2 * kNumRowsToWrite, CountRowsFromClient(table.get(), + KuduClient::FIRST_REPLICA, + kNoBound, kNoBound)); +} + +namespace { + +void CheckCorrectness(KuduScanner* scanner, int expected[], int nrows) { + scanner->Open(); + int readrows = 0; + KuduScanBatch batch; + if (nrows) { + ASSERT_TRUE(scanner->HasMoreRows()); + } else { + ASSERT_FALSE(scanner->HasMoreRows()); + } + + while (scanner->HasMoreRows()) { + ASSERT_OK(scanner->NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& r : batch) { + int32_t key; + int32_t val; + Slice strval; + ASSERT_OK(r.GetInt32(0, &key)); + ASSERT_OK(r.GetInt32(1, &val)); + ASSERT_OK(r.GetString(2, &strval)); + ASSERT_NE(expected[key], -1) << "Deleted key found in table in table " << key; + ASSERT_EQ(expected[key], val) << "Incorrect int value for key " << key; + ASSERT_EQ(strval.size(), 0) << "Incorrect string value for key " << key; + ++readrows; + } + } + ASSERT_EQ(readrows, nrows); + scanner->Close(); +} + +} // anonymous namespace + +// Randomized mutations accuracy testing +TEST_F(ClientTest, TestRandomWriteOperation) { + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(5000); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + int row[FLAGS_test_scan_num_rows]; // -1 indicates empty + int nrows; + KuduScanner scanner(client_table_.get()); + + // First half-fill + for (int i = 0; i < FLAGS_test_scan_num_rows/2; ++i) { + 
ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, i, i, "")); + row[i] = i; + } + for (int i = FLAGS_test_scan_num_rows/2; i < FLAGS_test_scan_num_rows; ++i) { + row[i] = -1; + } + nrows = FLAGS_test_scan_num_rows/2; + + // Randomized testing + LOG(INFO) << "Randomized mutations testing."; + SeedRandom(); + for (int i = 0; i <= 1000; ++i) { + // Test correctness every so often + if (i % 50 == 0) { + LOG(INFO) << "Correctness test " << i; + FlushSessionOrDie(session); + ASSERT_NO_FATAL_FAILURE(CheckCorrectness(&scanner, row, nrows)); + LOG(INFO) << "...complete"; + } + + int change = rand() % FLAGS_test_scan_num_rows; + // Insert if empty + if (row[change] == -1) { + ASSERT_OK(ApplyInsertToSession(session.get(), + client_table_, + change, + change, + "")); + row[change] = change; + ++nrows; + VLOG(1) << "Insert " << change; + } else { + // Update or delete otherwise + int update = rand() & 1; + if (update) { + ASSERT_OK(ApplyUpdateToSession(session.get(), + client_table_, + change, + ++row[change])); + VLOG(1) << "Update " << change; + } else { + ASSERT_OK(ApplyDeleteToSession(session.get(), + client_table_, + change)); + row[change] = -1; + --nrows; + VLOG(1) << "Delete " << change; + } + } + } + + // And one more time for the last batch. 
+ FlushSessionOrDie(session); + ASSERT_NO_FATAL_FAILURE(CheckCorrectness(&scanner, row, nrows)); +} + +// Test whether a batch can handle several mutations in a batch +TEST_F(ClientTest, TestSeveralRowMutatesPerBatch) { + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(5000); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // Test insert/update + LOG(INFO) << "Testing insert/update in same batch, key " << 1 << "."; + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "")); + ASSERT_OK(ApplyUpdateToSession(session.get(), client_table_, 1, 2)); + FlushSessionOrDie(session); + vector rows; + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 key=1, int32 int_val=2, string string_val=, " + "int32 non_null_with_default=12345)", rows[0]); + rows.clear(); + + + LOG(INFO) << "Testing insert/delete in same batch, key " << 2 << "."; + // Test insert/delete + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 2, 1, "")); + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 2)); + FlushSessionOrDie(session); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 key=1, int32 int_val=2, string string_val=, " + "int32 non_null_with_default=12345)", rows[0]); + rows.clear(); + + // Test update/delete + LOG(INFO) << "Testing update/delete in same batch, key " << 1 << "."; + ASSERT_OK(ApplyUpdateToSession(session.get(), client_table_, 1, 1)); + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 1)); + FlushSessionOrDie(session); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(0, rows.size()); + + // Test delete/insert (insert a row first) + LOG(INFO) << "Inserting row for delete/insert test, key " << 1 << "."; + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 1, "")); + FlushSessionOrDie(session); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(1, rows.size()); + 
ASSERT_EQ("(int32 key=1, int32 int_val=1, string string_val=, " + "int32 non_null_with_default=12345)", rows[0]); + rows.clear(); + LOG(INFO) << "Testing delete/insert in same batch, key " << 1 << "."; + ASSERT_OK(ApplyDeleteToSession(session.get(), client_table_, 1)); + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, 1, 2, "")); + FlushSessionOrDie(session); + ScanTableToStrings(client_table_.get(), &rows); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 key=1, int32 int_val=2, string string_val=, " + "int32 non_null_with_default=12345)", rows[0]); + rows.clear(); +} + +// Tests that master permits are properly released after a whole bunch of +// rows are inserted. +TEST_F(ClientTest, TestMasterLookupPermits) { + int initial_value = client_->data_->meta_cache_->master_lookup_sem_.GetValue(); + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + FLAGS_test_scan_num_rows)); + ASSERT_EQ(initial_value, + client_->data_->meta_cache_->master_lookup_sem_.GetValue()); +} + +// Define callback for deadlock simulation, as well as various helper methods. +namespace { +class DLSCallback : public KuduStatusCallback { + public: + explicit DLSCallback(Atomic32* i) : i(i) { + } + + virtual void Run(const Status& s) OVERRIDE { + CHECK_OK(s); + NoBarrier_AtomicIncrement(i, 1); + delete this; + } + private: + Atomic32* const i; +}; + +// Returns col1 value of first row. +int32_t ReadFirstRowKeyFirstCol(const shared_ptr& tbl) { + KuduScanner scanner(tbl.get()); + + scanner.Open(); + KuduScanBatch batch; + CHECK(scanner.HasMoreRows()); + CHECK_OK(scanner.NextBatch(&batch)); + KuduRowResult row = batch.Row(0); + int32_t val; + CHECK_OK(row.GetInt32(1, &val)); + return val; +} + +// Checks that all rows have value equal to expected, return number of rows. 
+int CheckRowsEqual(const shared_ptr& tbl, int32_t expected) { + KuduScanner scanner(tbl.get()); + scanner.Open(); + KuduScanBatch batch; + int cnt = 0; + while (scanner.HasMoreRows()) { + CHECK_OK(scanner.NextBatch(&batch)); + for (const KuduScanBatch::RowPtr& row : batch) { + // Check that for every key: + // 1. Column 1 int32_t value == expected + // 2. Column 2 string value is empty + // 3. Column 3 int32_t value is default, 12345 + int32_t key; + int32_t val; + Slice strval; + int32_t val2; + CHECK_OK(row.GetInt32(0, &key)); + CHECK_OK(row.GetInt32(1, &val)); + CHECK_OK(row.GetString(2, &strval)); + CHECK_OK(row.GetInt32(3, &val2)); + CHECK_EQ(expected, val) << "Incorrect int value for key " << key; + CHECK_EQ(strval.size(), 0) << "Incorrect string value for key " << key; + CHECK_EQ(12345, val2); + ++cnt; + } + } + return cnt; +} + +// Return a session "loaded" with updates. Sets the session timeout +// to the parameter value. Larger timeouts decrease false positives. +shared_ptr LoadedSession(const shared_ptr& client, + const shared_ptr& tbl, + bool fwd, int max, int timeout) { + shared_ptr session = client->NewSession(); + session->SetTimeoutMillis(timeout); + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + for (int i = 0; i < max; ++i) { + int key = fwd ? i : max - i; + CHECK_OK(ApplyUpdateToSession(session.get(), tbl, key, fwd)); + } + return session; +} +} // anonymous namespace + +// Starts many clients which update a table in parallel. +// Half of the clients update rows in ascending order while the other +// half update rows in descending order. +// This ensures that we don't hit a deadlock in such a situation. +TEST_F(ClientTest, TestDeadlockSimulation) { + if (!AllowSlowTests()) { + LOG(WARNING) << "TestDeadlockSimulation disabled since slow."; + return; + } + + // Make reverse client who will make batches that update rows + // in reverse order. Separate client used so rpc calls come in at same time. 
+ shared_ptr rev_client; + ASSERT_OK(KuduClientBuilder() + .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr().ToString()) + .Build(&rev_client)); + shared_ptr rev_table; + ASSERT_OK(client_->OpenTable(kTableName, &rev_table)); + + // Load up some rows + const int kNumRows = 300; + const int kTimeoutMillis = 60000; + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(kTimeoutMillis); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + for (int i = 0; i < kNumRows; ++i) + ASSERT_OK(ApplyInsertToSession(session.get(), client_table_, i, i, "")); + FlushSessionOrDie(session); + + // Check both clients see rows + int fwd = CountRowsFromClient(client_table_.get()); + ASSERT_EQ(kNumRows, fwd); + int rev = CountRowsFromClient(rev_table.get()); + ASSERT_EQ(kNumRows, rev); + + // Generate sessions + const int kNumSessions = 100; + shared_ptr fwd_sessions[kNumSessions]; + shared_ptr rev_sessions[kNumSessions]; + for (int i = 0; i < kNumSessions; ++i) { + fwd_sessions[i] = LoadedSession(client_, client_table_, true, kNumRows, kTimeoutMillis); + rev_sessions[i] = LoadedSession(rev_client, rev_table, true, kNumRows, kTimeoutMillis); + } + + // Run async calls - one thread updates sequentially, another in reverse. + Atomic32 ctr1, ctr2; + NoBarrier_Store(&ctr1, 0); + NoBarrier_Store(&ctr2, 0); + for (int i = 0; i < kNumSessions; ++i) { + // The callbacks are freed after they are invoked. + fwd_sessions[i]->FlushAsync(new DLSCallback(&ctr1)); + rev_sessions[i]->FlushAsync(new DLSCallback(&ctr2)); + } + + // Spin while waiting for ops to complete. + int lctr1, lctr2, prev1 = 0, prev2 = 0; + do { + lctr1 = NoBarrier_Load(&ctr1); + lctr2 = NoBarrier_Load(&ctr2); + // Display progress in 10% increments. 
+ if (prev1 == 0 || lctr1 + lctr2 - prev1 - prev2 > kNumSessions / 10) { + LOG(INFO) << "# updates: " << lctr1 << " fwd, " << lctr2 << " rev"; + prev1 = lctr1; + prev2 = lctr2; + } + SleepFor(MonoDelta::FromMilliseconds(100)); + } while (lctr1 != kNumSessions|| lctr2 != kNumSessions); + int32_t expected = ReadFirstRowKeyFirstCol(client_table_); + + // Check transaction from forward client. + fwd = CheckRowsEqual(client_table_, expected); + ASSERT_EQ(fwd, kNumRows); + + // Check from reverse client side. + rev = CheckRowsEqual(rev_table, expected); + ASSERT_EQ(rev, kNumRows); +} + +TEST_F(ClientTest, TestCreateDuplicateTable) { + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_TRUE(table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(1) + .Create().IsAlreadyPresent()); +} + +TEST_F(ClientTest, TestCreateTableWithTooManyTablets) { + FLAGS_max_create_tablets_per_ts = 1; + + KuduPartialRow* split1 = schema_.NewRow(); + ASSERT_OK(split1->SetInt32("key", 1)); + + KuduPartialRow* split2 = schema_.NewRow(); + ASSERT_OK(split2->SetInt32("key", 2)); + + gscoped_ptr table_creator(client_->NewTableCreator()); + Status s = table_creator->table_name("foobar") + .schema(&schema_) + .split_rows({ split1, split2 }) + .num_replicas(3) + .Create(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), + "The requested number of tablets is over the permitted maximum (1)"); +} + +TEST_F(ClientTest, TestCreateTableWithTooManyReplicas) { + KuduPartialRow* split1 = schema_.NewRow(); + ASSERT_OK(split1->SetInt32("key", 1)); + + KuduPartialRow* split2 = schema_.NewRow(); + ASSERT_OK(split2->SetInt32("key", 2)); + + gscoped_ptr table_creator(client_->NewTableCreator()); + Status s = table_creator->table_name("foobar") + .schema(&schema_) + .split_rows({ split1, split2 }) + .num_replicas(3) + .Create(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), + "Not enough live tablet servers to create a table with the 
requested " + "replication factor 3. 1 tablet servers are alive"); +} + +TEST_F(ClientTest, TestLatestObservedTimestamp) { + // Check that a write updates the latest observed timestamp. + uint64_t ts0 = client_->GetLatestObservedTimestamp(); + ASSERT_EQ(ts0, KuduClient::kNoTimestamp); + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), 1, 0)); + uint64_t ts1 = client_->GetLatestObservedTimestamp(); + ASSERT_NE(ts0, ts1); + + // Check that the timestamp of the previous write will be observed by another + // client performing a snapshot scan at that timestamp. + shared_ptr client; + shared_ptr table; + ASSERT_OK(KuduClientBuilder() + .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr().ToString()) + .Build(&client)); + ASSERT_EQ(client->GetLatestObservedTimestamp(), KuduClient::kNoTimestamp); + ASSERT_OK(client->OpenTable(client_table_->name(), &table)); + KuduScanner scanner(table.get()); + ASSERT_OK(scanner.SetReadMode(KuduScanner::READ_AT_SNAPSHOT)); + ASSERT_OK(scanner.SetSnapshotRaw(ts1)); + ASSERT_OK(scanner.Open()); + scanner.Close(); + uint64_t ts2 = client->GetLatestObservedTimestamp(); + ASSERT_EQ(ts1, ts2); +} + +TEST_F(ClientTest, TestClonePredicates) { + ASSERT_NO_FATAL_FAILURE(InsertTestRows(client_table_.get(), + 2, 0)); + gscoped_ptr predicate(client_table_->NewComparisonPredicate( + "key", + KuduPredicate::EQUAL, + KuduValue::FromInt(1))); + + gscoped_ptr scanner(new KuduScanner(client_table_.get())); + ASSERT_OK(scanner->AddConjunctPredicate(predicate->Clone())); + ASSERT_OK(scanner->Open()); + + int count = 0; + KuduScanBatch batch; + while (scanner->HasMoreRows()) { + ASSERT_OK(scanner->NextBatch(&batch)); + count += batch.NumRows(); + } + + ASSERT_EQ(count, 1); + + scanner.reset(new KuduScanner(client_table_.get())); + ASSERT_OK(scanner->AddConjunctPredicate(predicate->Clone())); + ASSERT_OK(scanner->Open()); + + count = 0; + while (scanner->HasMoreRows()) { + ASSERT_OK(scanner->NextBatch(&batch)); + count += 
batch.NumRows(); + } + + ASSERT_EQ(count, 1); +} + +// Test that scanners will retry after receiving ERROR_SERVER_TOO_BUSY from an +// overloaded tablet server. Regression test for KUDU-1079. +TEST_F(ClientTest, TestServerTooBusyRetry) { + NO_FATALS(InsertTestRows(client_table_.get(), FLAGS_test_scan_num_rows)); + + // Introduce latency in each scan to increase the likelihood of + // ERROR_SERVER_TOO_BUSY. + FLAGS_scanner_inject_latency_on_each_batch_ms = 10; + + // Reduce the service queue length of each tablet server in order to increase + // the likelihood of ERROR_SERVER_TOO_BUSY. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(i); + ts->options()->rpc_opts.service_queue_length = 1; + ASSERT_OK(ts->Restart()); + ASSERT_OK(ts->WaitStarted()); + } + + bool stop = false; + vector > threads; + int t = 0; + while (!stop) { + scoped_refptr thread; + ASSERT_OK(kudu::Thread::Create("test", strings::Substitute("t$0", t++), + &ClientTest::CheckRowCount, this, client_table_.get(), + &thread)); + threads.push_back(thread); + + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + scoped_refptr counter = METRIC_rpcs_queue_overflow.Instantiate( + cluster_->mini_tablet_server(i)->server()->metric_entity()); + stop = counter->value() > 0; + } + } + + for (const scoped_refptr& thread : threads) { + thread->Join(); + } +} + +TEST_F(ClientTest, TestLastErrorEmbeddedInScanTimeoutStatus) { + // For the random() calls that take place during scan retries. + SeedRandom(); + + NO_FATALS(InsertTestRows(client_table_.get(), FLAGS_test_scan_num_rows)); + + { + // Revert the latency injection flags at the end so the test exits faster. + google::FlagSaver saver; + + // Restart, but inject latency so that startup is very slow. 
+ FLAGS_log_inject_latency = true; + FLAGS_log_inject_latency_ms_mean = 1000; + FLAGS_log_inject_latency_ms_stddev = 0; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(i); + ASSERT_OK(ts->Restart()); + } + + // As the tservers are still starting up, the scan will retry until it + // times out. The actual error should be embedded in the returned status. + KuduScanner scan(client_table_.get()); + ASSERT_OK(scan.SetTimeoutMillis(1000)); + Status s = scan.Open(); + SCOPED_TRACE(s.ToString()); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_STR_CONTAINS(s.ToString(), "Illegal state: Tablet not RUNNING"); + } +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/client-unittest.cc b/src/kudu/client/client-unittest.cc new file mode 100644 index 000000000000..6c0033dccd19 --- /dev/null +++ b/src/kudu/client/client-unittest.cc @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Tests for the client which are true unit tests and don't require a cluster, etc. 
+ +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/client-internal.h" + +using std::string; +using std::vector; + +namespace kudu { +namespace client { + +TEST(ClientUnitTest, TestSchemaBuilder_EmptySchema) { + KuduSchema s; + KuduSchemaBuilder b; + ASSERT_EQ("Invalid argument: no primary key specified", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_KeyNotSpecified) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("b")->Type(KuduColumnSchema::INT32)->NotNull(); + ASSERT_EQ("Invalid argument: no primary key specified", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_DuplicateColumn) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("x")->Type(KuduColumnSchema::INT32); + b.AddColumn("x")->Type(KuduColumnSchema::INT32); + ASSERT_EQ("Invalid argument: Duplicate column name: x", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_KeyNotFirstColumn) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32); + b.AddColumn("x")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey();; + b.AddColumn("x")->Type(KuduColumnSchema::INT32); + ASSERT_EQ("Invalid argument: primary key column must be the first column", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_TwoPrimaryKeys) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->PrimaryKey(); + b.AddColumn("b")->Type(KuduColumnSchema::INT32)->PrimaryKey(); + ASSERT_EQ("Invalid argument: multiple columns specified for primary key: a, b", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_PrimaryKeyOnColumnAndSet) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->PrimaryKey(); + 
b.AddColumn("b")->Type(KuduColumnSchema::INT32); + b.SetPrimaryKey({ "a", "b" }); + ASSERT_EQ("Invalid argument: primary key specified by both " + "SetPrimaryKey() and on a specific column: a", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_SingleKey_GoodSchema) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("b")->Type(KuduColumnSchema::INT32); + b.AddColumn("c")->Type(KuduColumnSchema::INT32)->NotNull(); + ASSERT_EQ("OK", b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_CompoundKey_GoodSchema) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("b")->Type(KuduColumnSchema::INT32)->NotNull(); + b.SetPrimaryKey({ "a", "b" }); + ASSERT_EQ("OK", b.Build(&s).ToString()); + + vector key_columns; + s.GetPrimaryKeyColumnIndexes(&key_columns); + ASSERT_EQ(vector({ 0, 1 }), key_columns); +} + +TEST(ClientUnitTest, TestSchemaBuilder_DefaultValues) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("b")->Type(KuduColumnSchema::INT32)->NotNull() + ->Default(KuduValue::FromInt(12345)); + ASSERT_EQ("OK", b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_DefaultValueString) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("b")->Type(KuduColumnSchema::STRING)->NotNull() + ->Default(KuduValue::CopyString("abc")); + b.AddColumn("c")->Type(KuduColumnSchema::BINARY)->NotNull() + ->Default(KuduValue::CopyString("def")); + ASSERT_EQ("OK", b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_CompoundKey_KeyNotFirst) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("x")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull(); + 
b.AddColumn("b")->Type(KuduColumnSchema::INT32)->NotNull(); + b.SetPrimaryKey({ "a", "b" }); + ASSERT_EQ("Invalid argument: primary key columns must be listed " + "first in the schema: a", + b.Build(&s).ToString()); +} + +TEST(ClientUnitTest, TestSchemaBuilder_CompoundKey_BadColumnName) { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("a")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("b")->Type(KuduColumnSchema::INT32)->NotNull(); + b.SetPrimaryKey({ "foo" }); + ASSERT_EQ("Invalid argument: primary key column not defined: foo", + b.Build(&s).ToString()); +} + +namespace { +Status TestFunc(const MonoTime& deadline, bool* retry, int* counter) { + (*counter)++; + *retry = true; + return Status::RuntimeError("x"); +} +} // anonymous namespace + +TEST(ClientUnitTest, TestRetryFunc) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(100)); + int counter = 0; + Status s = RetryFunc(deadline, "retrying test func", "timed out", + boost::bind(TestFunc, _1, _2, &counter)); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_GT(counter, 5); + ASSERT_LT(counter, 20); +} + +} // namespace client +} // namespace kudu + diff --git a/src/kudu/client/client.cc b/src/kudu/client/client.cc new file mode 100644 index 000000000000..f2c889477bb7 --- /dev/null +++ b/src/kudu/client/client.cc @@ -0,0 +1,1280 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/client.h" + +#include +#include +#include +#include +#include + +#include "kudu/client/batcher.h" +#include "kudu/client/callbacks.h" +#include "kudu/client/client-internal.h" +#include "kudu/client/client_builder-internal.h" +#include "kudu/client/error-internal.h" +#include "kudu/client/error_collector.h" +#include "kudu/client/meta_cache.h" +#include "kudu/client/row_result.h" +#include "kudu/client/scan_predicate-internal.h" +#include "kudu/client/scanner-internal.h" +#include "kudu/client/schema-internal.h" +#include "kudu/client/session-internal.h" +#include "kudu/client/table-internal.h" +#include "kudu/client/table_alterer-internal.h" +#include "kudu/client/table_creator-internal.h" +#include "kudu/client/tablet_server-internal.h" +#include "kudu/client/write_op.h" +#include "kudu/common/common.pb.h" +#include "kudu/common/partition.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.h" // TODO: remove this include - just needed for default port +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/rpc/messenger.h" +#include "kudu/util/init.h" +#include "kudu/util/logging.h" +#include "kudu/util/net/dns_resolver.h" + +using kudu::master::AlterTableRequestPB; +using kudu::master::AlterTableRequestPB_Step; +using kudu::master::AlterTableResponsePB; +using kudu::master::CreateTableRequestPB; +using kudu::master::CreateTableResponsePB; +using 
kudu::master::DeleteTableRequestPB; +using kudu::master::DeleteTableResponsePB; +using kudu::master::GetTableSchemaRequestPB; +using kudu::master::GetTableSchemaResponsePB; +using kudu::master::ListTablesRequestPB; +using kudu::master::ListTablesResponsePB; +using kudu::master::ListTabletServersRequestPB; +using kudu::master::ListTabletServersResponsePB; +using kudu::master::ListTabletServersResponsePB_Entry; +using kudu::master::MasterServiceProxy; +using kudu::master::TabletLocationsPB; +using kudu::rpc::Messenger; +using kudu::rpc::MessengerBuilder; +using kudu::rpc::RpcController; +using kudu::tserver::ScanResponsePB; +using std::set; +using std::string; +using std::vector; + +MAKE_ENUM_LIMITS(kudu::client::KuduSession::FlushMode, + kudu::client::KuduSession::AUTO_FLUSH_SYNC, + kudu::client::KuduSession::MANUAL_FLUSH); + +MAKE_ENUM_LIMITS(kudu::client::KuduSession::ExternalConsistencyMode, + kudu::client::KuduSession::CLIENT_PROPAGATED, + kudu::client::KuduSession::COMMIT_WAIT); + +MAKE_ENUM_LIMITS(kudu::client::KuduScanner::ReadMode, + kudu::client::KuduScanner::READ_LATEST, + kudu::client::KuduScanner::READ_AT_SNAPSHOT); + +MAKE_ENUM_LIMITS(kudu::client::KuduScanner::OrderMode, + kudu::client::KuduScanner::UNORDERED, + kudu::client::KuduScanner::ORDERED); + +namespace kudu { +namespace client { + +using internal::Batcher; +using internal::ErrorCollector; +using internal::MetaCache; +using sp::shared_ptr; + +static const int kHtTimestampBitsToShift = 12; +static const char* kProgName = "kudu_client"; + +// We need to reroute all logging to stderr when the client library is +// loaded. GoogleOnceInit() can do that, but there are multiple entry +// points into the client code, and it'd need to be called in each one. +// So instead, let's use a constructor function. +// +// Should this be restricted to just the exported client build? Probably +// not, as any application using the library probably wants stderr logging +// more than file logging. 
+__attribute__((constructor)) +static void InitializeBasicLogging() { + InitGoogleLoggingSafeBasic(kProgName); +} + +// Adapts between the internal LogSeverity and the client's KuduLogSeverity. +static void LoggingAdapterCB(KuduLoggingCallback* user_cb, + LogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len) { + KuduLogSeverity client_severity; + switch (severity) { + case kudu::SEVERITY_INFO: + client_severity = SEVERITY_INFO; + break; + case kudu::SEVERITY_WARNING: + client_severity = SEVERITY_WARNING; + break; + case kudu::SEVERITY_ERROR: + client_severity = SEVERITY_ERROR; + break; + case kudu::SEVERITY_FATAL: + client_severity = SEVERITY_FATAL; + break; + default: + LOG(FATAL) << "Unknown Kudu log severity: " << severity; + } + user_cb->Run(client_severity, filename, line_number, time, + message, message_len); +} + +void InstallLoggingCallback(KuduLoggingCallback* cb) { + RegisterLoggingCallback(Bind(&LoggingAdapterCB, Unretained(cb))); +} + +void UninstallLoggingCallback() { + UnregisterLoggingCallback(); +} + +void SetVerboseLogLevel(int level) { + FLAGS_v = level; +} + +Status SetInternalSignalNumber(int signum) { + return SetStackTraceSignal(signum); +} + +KuduClientBuilder::KuduClientBuilder() + : data_(new KuduClientBuilder::Data()) { +} + +KuduClientBuilder::~KuduClientBuilder() { + delete data_; +} + +KuduClientBuilder& KuduClientBuilder::clear_master_server_addrs() { + data_->master_server_addrs_.clear(); + return *this; +} + +KuduClientBuilder& KuduClientBuilder::master_server_addrs(const vector& addrs) { + for (const string& addr : addrs) { + data_->master_server_addrs_.push_back(addr); + } + return *this; +} + +KuduClientBuilder& KuduClientBuilder::add_master_server_addr(const string& addr) { + data_->master_server_addrs_.push_back(addr); + return *this; +} + +KuduClientBuilder& KuduClientBuilder::default_admin_operation_timeout(const MonoDelta& timeout) { + 
data_->default_admin_operation_timeout_ = timeout; + return *this; +} + +KuduClientBuilder& KuduClientBuilder::default_rpc_timeout(const MonoDelta& timeout) { + data_->default_rpc_timeout_ = timeout; + return *this; +} + +Status KuduClientBuilder::Build(shared_ptr* client) { + RETURN_NOT_OK(CheckCPUFlags()); + + shared_ptr c(new KuduClient()); + + // Init messenger. + MessengerBuilder builder("client"); + RETURN_NOT_OK(builder.Build(&c->data_->messenger_)); + + c->data_->master_server_addrs_ = data_->master_server_addrs_; + c->data_->default_admin_operation_timeout_ = data_->default_admin_operation_timeout_; + c->data_->default_rpc_timeout_ = data_->default_rpc_timeout_; + + // Let's allow for plenty of time for discovering the master the first + // time around. + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(c->default_admin_operation_timeout()); + RETURN_NOT_OK_PREPEND(c->data_->SetMasterServerProxy(c.get(), deadline), + "Could not locate the leader master"); + + c->data_->meta_cache_.reset(new MetaCache(c.get())); + c->data_->dns_resolver_.reset(new DnsResolver()); + + // Init local host names used for locality decisions. 
+ RETURN_NOT_OK_PREPEND(c->data_->InitLocalHostNames(), + "Could not determine local host names"); + + client->swap(c); + return Status::OK(); +} + +KuduClient::KuduClient() + : data_(new KuduClient::Data()) { +} + +KuduClient::~KuduClient() { + delete data_; +} + +KuduTableCreator* KuduClient::NewTableCreator() { + return new KuduTableCreator(this); +} + +Status KuduClient::IsCreateTableInProgress(const string& table_name, + bool *create_in_progress) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + return data_->IsCreateTableInProgress(this, table_name, deadline, create_in_progress); +} + +Status KuduClient::DeleteTable(const string& table_name) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + return data_->DeleteTable(this, table_name, deadline); +} + +KuduTableAlterer* KuduClient::NewTableAlterer(const string& name) { + return new KuduTableAlterer(this, name); +} + +Status KuduClient::IsAlterTableInProgress(const string& table_name, + bool *alter_in_progress) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + return data_->IsAlterTableInProgress(this, table_name, deadline, alter_in_progress); +} + +Status KuduClient::GetTableSchema(const string& table_name, + KuduSchema* schema) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + string table_id_ignored; + PartitionSchema partition_schema; + return data_->GetTableSchema(this, + table_name, + deadline, + schema, + &partition_schema, + &table_id_ignored); +} + +Status KuduClient::ListTabletServers(vector* tablet_servers) { + ListTabletServersRequestPB req; + ListTabletServersResponsePB resp; + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + Status s = + data_->SyncLeaderMasterRpc( + deadline, + this, + 
req, + &resp, + nullptr, + "ListTabletServers", + &MasterServiceProxy::ListTabletServers); + RETURN_NOT_OK(s); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + for (int i = 0; i < resp.servers_size(); i++) { + const ListTabletServersResponsePB_Entry& e = resp.servers(i); + auto ts = new KuduTabletServer(); + ts->data_ = new KuduTabletServer::Data(e.instance_id().permanent_uuid(), + e.registration().rpc_addresses(0).host()); + tablet_servers->push_back(ts); + } + return Status::OK(); +} + +Status KuduClient::ListTables(vector* tables, + const string& filter) { + ListTablesRequestPB req; + ListTablesResponsePB resp; + + if (!filter.empty()) { + req.set_name_filter(filter); + } + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + Status s = + data_->SyncLeaderMasterRpc( + deadline, + this, + req, + &resp, + nullptr, + "ListTables", + &MasterServiceProxy::ListTables); + RETURN_NOT_OK(s); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + for (int i = 0; i < resp.tables_size(); i++) { + tables->push_back(resp.tables(i).name()); + } + return Status::OK(); +} + +Status KuduClient::TableExists(const string& table_name, bool* exists) { + std::vector tables; + RETURN_NOT_OK(ListTables(&tables, table_name)); + for (const string& table : tables) { + if (table == table_name) { + *exists = true; + return Status::OK(); + } + } + *exists = false; + return Status::OK(); +} + +Status KuduClient::OpenTable(const string& table_name, + shared_ptr* table) { + KuduSchema schema; + string table_id; + PartitionSchema partition_schema; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(default_admin_operation_timeout()); + RETURN_NOT_OK(data_->GetTableSchema(this, + table_name, + deadline, + &schema, + &partition_schema, + &table_id)); + + // In the future, probably will look up the table in some map to reuse KuduTable + // instances. 
+ shared_ptr ret(new KuduTable(shared_from_this(), table_name, table_id, + schema, partition_schema)); + RETURN_NOT_OK(ret->data_->Open()); + table->swap(ret); + + return Status::OK(); +} + +shared_ptr KuduClient::NewSession() { + shared_ptr ret(new KuduSession(shared_from_this())); + ret->data_->Init(ret); + return ret; +} + +bool KuduClient::IsMultiMaster() const { + return data_->master_server_addrs_.size() > 1; +} + +const MonoDelta& KuduClient::default_admin_operation_timeout() const { + return data_->default_admin_operation_timeout_; +} + +const MonoDelta& KuduClient::default_rpc_timeout() const { + return data_->default_rpc_timeout_; +} + +const uint64_t KuduClient::kNoTimestamp = 0; + +uint64_t KuduClient::GetLatestObservedTimestamp() const { + return data_->GetLatestObservedTimestamp(); +} + +void KuduClient::SetLatestObservedTimestamp(uint64_t ht_timestamp) { + data_->UpdateLatestObservedTimestamp(ht_timestamp); +} + +//////////////////////////////////////////////////////////// +// KuduTableCreator +//////////////////////////////////////////////////////////// + +KuduTableCreator::KuduTableCreator(KuduClient* client) + : data_(new KuduTableCreator::Data(client)) { +} + +KuduTableCreator::~KuduTableCreator() { + delete data_; +} + +KuduTableCreator& KuduTableCreator::table_name(const string& name) { + data_->table_name_ = name; + return *this; +} + +KuduTableCreator& KuduTableCreator::schema(const KuduSchema* schema) { + data_->schema_ = schema; + return *this; +} + +KuduTableCreator& KuduTableCreator::add_hash_partitions(const std::vector& columns, + int32_t num_buckets) { + return add_hash_partitions(columns, num_buckets, 0); +} + +KuduTableCreator& KuduTableCreator::add_hash_partitions(const std::vector& columns, + int32_t num_buckets, int32_t seed) { + PartitionSchemaPB::HashBucketSchemaPB* bucket_schema = + data_->partition_schema_.add_hash_bucket_schemas(); + for (const string& col_name : columns) { + bucket_schema->add_columns()->set_name(col_name); 
+ } + bucket_schema->set_num_buckets(num_buckets); + bucket_schema->set_seed(seed); + return *this; +} + +KuduTableCreator& KuduTableCreator::set_range_partition_columns( + const std::vector& columns) { + PartitionSchemaPB::RangeSchemaPB* range_schema = + data_->partition_schema_.mutable_range_schema(); + range_schema->Clear(); + for (const string& col_name : columns) { + range_schema->add_columns()->set_name(col_name); + } + + return *this; +} + +KuduTableCreator& KuduTableCreator::split_rows(const vector& rows) { + data_->split_rows_ = rows; + return *this; +} + +KuduTableCreator& KuduTableCreator::num_replicas(int num_replicas) { + data_->num_replicas_ = num_replicas; + return *this; +} + +KuduTableCreator& KuduTableCreator::timeout(const MonoDelta& timeout) { + data_->timeout_ = timeout; + return *this; +} + +KuduTableCreator& KuduTableCreator::wait(bool wait) { + data_->wait_ = wait; + return *this; +} + +Status KuduTableCreator::Create() { + if (!data_->table_name_.length()) { + return Status::InvalidArgument("Missing table name"); + } + if (!data_->schema_) { + return Status::InvalidArgument("Missing schema"); + } + + // Build request. 
+ CreateTableRequestPB req; + req.set_name(data_->table_name_); + if (data_->num_replicas_ >= 1) { + req.set_num_replicas(data_->num_replicas_); + } + RETURN_NOT_OK_PREPEND(SchemaToPB(*data_->schema_->schema_, req.mutable_schema()), + "Invalid schema"); + + RowOperationsPBEncoder encoder(req.mutable_split_rows()); + + for (const KuduPartialRow* row : data_->split_rows_) { + encoder.Add(RowOperationsPB::SPLIT_ROW, *row); + } + req.mutable_partition_schema()->CopyFrom(data_->partition_schema_); + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + if (data_->timeout_.Initialized()) { + deadline.AddDelta(data_->timeout_); + } else { + deadline.AddDelta(data_->client_->default_admin_operation_timeout()); + } + + RETURN_NOT_OK_PREPEND(data_->client_->data_->CreateTable(data_->client_, + req, + *data_->schema_, + deadline), + strings::Substitute("Error creating table $0 on the master", + data_->table_name_)); + + // Spin until the table is fully created, if requested. + if (data_->wait_) { + RETURN_NOT_OK(data_->client_->data_->WaitForCreateTableToFinish(data_->client_, + data_->table_name_, + deadline)); + } + + return Status::OK(); +} + +//////////////////////////////////////////////////////////// +// KuduTable +//////////////////////////////////////////////////////////// + +KuduTable::KuduTable(const shared_ptr& client, + const string& name, + const string& table_id, + const KuduSchema& schema, + const PartitionSchema& partition_schema) + : data_(new KuduTable::Data(client, name, table_id, schema, partition_schema)) { +} + +KuduTable::~KuduTable() { + delete data_; +} + +const string& KuduTable::name() const { + return data_->name_; +} + +const string& KuduTable::id() const { + return data_->id_; +} + +const KuduSchema& KuduTable::schema() const { + return data_->schema_; +} + +KuduInsert* KuduTable::NewInsert() { + return new KuduInsert(shared_from_this()); +} + +KuduUpdate* KuduTable::NewUpdate() { + return new KuduUpdate(shared_from_this()); +} + +KuduDelete* 
KuduTable::NewDelete() { + return new KuduDelete(shared_from_this()); +} + +KuduClient* KuduTable::client() const { + return data_->client_.get(); +} + +const PartitionSchema& KuduTable::partition_schema() const { + return data_->partition_schema_; +} + +KuduPredicate* KuduTable::NewComparisonPredicate(const Slice& col_name, + KuduPredicate::ComparisonOp op, + KuduValue* value) { + StringPiece name_sp(reinterpret_cast(col_name.data()), col_name.size()); + const Schema* s = data_->schema_.schema_; + int col_idx = s->find_column(name_sp); + if (col_idx == Schema::kColumnNotFound) { + // Since this function doesn't return an error, instead we create a special + // predicate that just returns the errors when we add it to the scanner. + // + // This makes the API more "fluent". + delete value; // we always take ownership of 'value'. + return new KuduPredicate(new ErrorPredicateData( + Status::NotFound("column not found", col_name))); + } + + return new KuduPredicate(new ComparisonPredicateData(s->column(col_idx), op, value)); +} + +//////////////////////////////////////////////////////////// +// Error +//////////////////////////////////////////////////////////// + +const Status& KuduError::status() const { + return data_->status_; +} + +const KuduWriteOperation& KuduError::failed_op() const { + return *data_->failed_op_; +} + +KuduWriteOperation* KuduError::release_failed_op() { + CHECK_NOTNULL(data_->failed_op_.get()); + return data_->failed_op_.release(); +} + +bool KuduError::was_possibly_successful() const { + // TODO: implement me - right now be conservative. 
+ return true; +} + +KuduError::KuduError(KuduWriteOperation* failed_op, + const Status& status) + : data_(new KuduError::Data(gscoped_ptr(failed_op), + status)) { +} + +KuduError::~KuduError() { + delete data_; +} + +//////////////////////////////////////////////////////////// +// KuduSession +//////////////////////////////////////////////////////////// + +KuduSession::KuduSession(const shared_ptr& client) + : data_(new KuduSession::Data(client)) { +} + +KuduSession::~KuduSession() { + WARN_NOT_OK(data_->Close(true), "Closed Session with pending operations."); + delete data_; +} + +Status KuduSession::Close() { + return data_->Close(false); +} + +Status KuduSession::SetFlushMode(FlushMode m) { + if (m == AUTO_FLUSH_BACKGROUND) { + return Status::NotSupported("AUTO_FLUSH_BACKGROUND has not been implemented in the" + " c++ client (see KUDU-456)."); + } + if (data_->batcher_->HasPendingOperations()) { + // TODO: there may be a more reasonable behavior here. + return Status::IllegalState("Cannot change flush mode when writes are buffered"); + } + if (!tight_enum_test(m)) { + // Be paranoid in client code. + return Status::InvalidArgument("Bad flush mode"); + } + + data_->flush_mode_ = m; + return Status::OK(); +} + +Status KuduSession::SetExternalConsistencyMode(ExternalConsistencyMode m) { + if (data_->batcher_->HasPendingOperations()) { + // TODO: there may be a more reasonable behavior here. + return Status::IllegalState("Cannot change external consistency mode when writes are " + "buffered"); + } + if (!tight_enum_test(m)) { + // Be paranoid in client code. 
+ return Status::InvalidArgument("Bad external consistency mode"); + } + + data_->external_consistency_mode_ = m; + return Status::OK(); +} + +void KuduSession::SetTimeoutMillis(int millis) { + CHECK_GE(millis, 0); + data_->timeout_ms_ = millis; + data_->batcher_->SetTimeoutMillis(millis); +} + +Status KuduSession::Flush() { + Synchronizer s; + KuduStatusMemberCallback ksmcb(&s, &Synchronizer::StatusCB); + FlushAsync(&ksmcb); + return s.Wait(); +} + +void KuduSession::FlushAsync(KuduStatusCallback* user_callback) { + CHECK_EQ(data_->flush_mode_, MANUAL_FLUSH) << "TODO: handle other flush modes"; + + // Swap in a new batcher to start building the next batch. + // Save off the old batcher. + scoped_refptr old_batcher; + { + lock_guard l(&data_->lock_); + data_->NewBatcher(shared_from_this(), &old_batcher); + InsertOrDie(&data_->flushed_batchers_, old_batcher.get()); + } + + // Send off any buffered data. Important to do this outside of the lock + // since the callback may itself try to take the lock, in the case that + // the batch fails "inline" on the same thread. 
+ old_batcher->FlushAsync(user_callback); +} + +bool KuduSession::HasPendingOperations() const { + lock_guard l(&data_->lock_); + if (data_->batcher_->HasPendingOperations()) { + return true; + } + for (Batcher* b : data_->flushed_batchers_) { + if (b->HasPendingOperations()) { + return true; + } + } + return false; +} + +Status KuduSession::Apply(KuduWriteOperation* write_op) { + if (!write_op->row().IsKeySet()) { + Status status = Status::IllegalState("Key not specified", write_op->ToString()); + data_->error_collector_->AddError(gscoped_ptr( + new KuduError(write_op, status))); + return status; + } + + Status s = data_->batcher_->Add(write_op); + if (!PREDICT_FALSE(s.ok())) { + data_->error_collector_->AddError(gscoped_ptr( + new KuduError(write_op, s))); + return s; + } + + if (data_->flush_mode_ == AUTO_FLUSH_SYNC) { + return Flush(); + } + + return Status::OK(); +} + +int KuduSession::CountBufferedOperations() const { + lock_guard l(&data_->lock_); + CHECK_EQ(data_->flush_mode_, MANUAL_FLUSH); + + return data_->batcher_->CountBufferedOperations(); +} + +int KuduSession::CountPendingErrors() const { + return data_->error_collector_->CountErrors(); +} + +void KuduSession::GetPendingErrors(vector* errors, bool* overflowed) { + data_->error_collector_->GetErrors(errors, overflowed); +} + +KuduClient* KuduSession::client() const { + return data_->client_.get(); +} + +//////////////////////////////////////////////////////////// +// KuduTableAlterer +//////////////////////////////////////////////////////////// +KuduTableAlterer::KuduTableAlterer(KuduClient* client, const string& name) + : data_(new Data(client, name)) { +} + +KuduTableAlterer::~KuduTableAlterer() { + delete data_; +} + +KuduTableAlterer* KuduTableAlterer::RenameTo(const string& new_name) { + data_->rename_to_ = new_name; + return this; +} + +KuduColumnSpec* KuduTableAlterer::AddColumn(const string& name) { + Data::Step s = {AlterTableRequestPB::ADD_COLUMN, + new KuduColumnSpec(name)}; + 
data_->steps_.push_back(s); + return s.spec; +} + +KuduColumnSpec* KuduTableAlterer::AlterColumn(const string& name) { + Data::Step s = {AlterTableRequestPB::ALTER_COLUMN, + new KuduColumnSpec(name)}; + data_->steps_.push_back(s); + return s.spec; +} + +KuduTableAlterer* KuduTableAlterer::DropColumn(const string& name) { + Data::Step s = {AlterTableRequestPB::DROP_COLUMN, + new KuduColumnSpec(name)}; + data_->steps_.push_back(s); + return this; +} + +KuduTableAlterer* KuduTableAlterer::timeout(const MonoDelta& timeout) { + data_->timeout_ = timeout; + return this; +} + +KuduTableAlterer* KuduTableAlterer::wait(bool wait) { + data_->wait_ = wait; + return this; +} + +Status KuduTableAlterer::Alter() { + AlterTableRequestPB req; + RETURN_NOT_OK(data_->ToRequest(&req)); + + MonoDelta timeout = data_->timeout_.Initialized() ? + data_->timeout_ : + data_->client_->default_admin_operation_timeout(); + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + RETURN_NOT_OK(data_->client_->data_->AlterTable(data_->client_, req, deadline)); + if (data_->wait_) { + string alter_name = data_->rename_to_.get_value_or(data_->table_name_); + RETURN_NOT_OK(data_->client_->data_->WaitForAlterTableToFinish( + data_->client_, alter_name, deadline)); + } + + return Status::OK(); +} + +//////////////////////////////////////////////////////////// +// KuduScanner +//////////////////////////////////////////////////////////// + +KuduScanner::KuduScanner(KuduTable* table) + : data_(new KuduScanner::Data(table)) { +} + +KuduScanner::~KuduScanner() { + Close(); + delete data_; +} + +Status KuduScanner::SetProjectedColumns(const vector& col_names) { + return SetProjectedColumnNames(col_names); +} + +Status KuduScanner::SetProjectedColumnNames(const vector& col_names) { + if (data_->open_) { + return Status::IllegalState("Projection must be set before Open()"); + } + + const Schema* table_schema = data_->table_->schema().schema_; + vector col_indexes; + 
col_indexes.reserve(col_names.size()); + for (const string& col_name : col_names) { + int idx = table_schema->find_column(col_name); + if (idx == Schema::kColumnNotFound) { + return Status::NotFound(strings::Substitute("Column: \"$0\" was not found in the " + "table schema.", col_name)); + } + col_indexes.push_back(idx); + } + + return SetProjectedColumnIndexes(col_indexes); +} + +Status KuduScanner::SetProjectedColumnIndexes(const vector& col_indexes) { + if (data_->open_) { + return Status::IllegalState("Projection must be set before Open()"); + } + + const Schema* table_schema = data_->table_->schema().schema_; + vector cols; + cols.reserve(col_indexes.size()); + for (const int col_index : col_indexes) { + if (col_index >= table_schema->columns().size()) { + return Status::NotFound(strings::Substitute("Column: \"$0\" was not found in the " + "table schema.", col_index)); + } + cols.push_back(table_schema->column(col_index)); + } + + gscoped_ptr s(new Schema()); + RETURN_NOT_OK(s->Reset(cols, 0)); + data_->SetProjectionSchema(data_->pool_.Add(s.release())); + return Status::OK(); +} + +Status KuduScanner::SetBatchSizeBytes(uint32_t batch_size) { + data_->has_batch_size_bytes_ = true; + data_->batch_size_bytes_ = batch_size; + return Status::OK(); +} + +Status KuduScanner::SetReadMode(ReadMode read_mode) { + if (data_->open_) { + return Status::IllegalState("Read mode must be set before Open()"); + } + if (!tight_enum_test(read_mode)) { + return Status::InvalidArgument("Bad read mode"); + } + data_->read_mode_ = read_mode; + return Status::OK(); +} + +Status KuduScanner::SetOrderMode(OrderMode order_mode) { + if (data_->open_) { + return Status::IllegalState("Order mode must be set before Open()"); + } + if (!tight_enum_test(order_mode)) { + return Status::InvalidArgument("Bad order mode"); + } + data_->is_fault_tolerant_ = order_mode == ORDERED; + return Status::OK(); +} + +Status KuduScanner::SetFaultTolerant() { + if (data_->open_) { + return 
Status::IllegalState("Fault-tolerance must be set before Open()"); + } + RETURN_NOT_OK(SetReadMode(READ_AT_SNAPSHOT)); + data_->is_fault_tolerant_ = true; + return Status::OK(); +} + +Status KuduScanner::SetSnapshotMicros(uint64_t snapshot_timestamp_micros) { + if (data_->open_) { + return Status::IllegalState("Snapshot timestamp must be set before Open()"); + } + // Shift the HT timestamp bits to get well-formed HT timestamp with the logical + // bits zeroed out. + data_->snapshot_timestamp_ = snapshot_timestamp_micros << kHtTimestampBitsToShift; + return Status::OK(); +} + +Status KuduScanner::SetSnapshotRaw(uint64_t snapshot_timestamp) { + if (data_->open_) { + return Status::IllegalState("Snapshot timestamp must be set before Open()"); + } + data_->snapshot_timestamp_ = snapshot_timestamp; + return Status::OK(); +} + +Status KuduScanner::SetSelection(KuduClient::ReplicaSelection selection) { + if (data_->open_) { + return Status::IllegalState("Replica selection must be set before Open()"); + } + data_->selection_ = selection; + return Status::OK(); +} + +Status KuduScanner::SetTimeoutMillis(int millis) { + if (data_->open_) { + return Status::IllegalState("Timeout must be set before Open()"); + } + data_->timeout_ = MonoDelta::FromMilliseconds(millis); + return Status::OK(); +} + +Status KuduScanner::AddConjunctPredicate(KuduPredicate* pred) { + // Take ownership even if we return a bad status. + data_->pool_.Add(pred); + if (data_->open_) { + return Status::IllegalState("Predicate must be set before Open()"); + } + return pred->data_->AddToScanSpec(&data_->spec_); +} + +Status KuduScanner::AddLowerBound(const KuduPartialRow& key) { + gscoped_ptr enc(new string()); + RETURN_NOT_OK(key.EncodeRowKey(enc.get())); + RETURN_NOT_OK(AddLowerBoundRaw(Slice(*enc))); + data_->pool_.Add(enc.release()); + return Status::OK(); +} + +Status KuduScanner::AddLowerBoundRaw(const Slice& key) { + // Make a copy of the key. 
+ gscoped_ptr enc_key; + RETURN_NOT_OK(EncodedKey::DecodeEncodedString( + *data_->table_->schema().schema_, &data_->arena_, key, &enc_key)); + data_->spec_.SetLowerBoundKey(enc_key.get()); + data_->pool_.Add(enc_key.release()); + return Status::OK(); +} + +Status KuduScanner::AddExclusiveUpperBound(const KuduPartialRow& key) { + gscoped_ptr enc(new string()); + RETURN_NOT_OK(key.EncodeRowKey(enc.get())); + RETURN_NOT_OK(AddExclusiveUpperBoundRaw(Slice(*enc))); + data_->pool_.Add(enc.release()); + return Status::OK(); +} + +Status KuduScanner::AddExclusiveUpperBoundRaw(const Slice& key) { + // Make a copy of the key. + gscoped_ptr enc_key; + RETURN_NOT_OK(EncodedKey::DecodeEncodedString( + *data_->table_->schema().schema_, &data_->arena_, key, &enc_key)); + data_->spec_.SetExclusiveUpperBoundKey(enc_key.get()); + data_->pool_.Add(enc_key.release()); + return Status::OK(); +} + +Status KuduScanner::AddLowerBoundPartitionKeyRaw(const Slice& partition_key) { + data_->spec_.SetLowerBoundPartitionKey(partition_key); + return Status::OK(); +} + +Status KuduScanner::AddExclusiveUpperBoundPartitionKeyRaw(const Slice& partition_key) { + data_->spec_.SetExclusiveUpperBoundPartitionKey(partition_key); + return Status::OK(); +} + +Status KuduScanner::SetCacheBlocks(bool cache_blocks) { + if (data_->open_) { + return Status::IllegalState("Block caching must be set before Open()"); + } + data_->spec_.set_cache_blocks(cache_blocks); + return Status::OK(); +} + +KuduSchema KuduScanner::GetProjectionSchema() const { + return data_->client_projection_; +} + +namespace { +// Callback for the RPC sent by Close(). +// We can't use the KuduScanner response and RPC controller members for this +// call, because the scanner object may be destructed while the call is still +// being processed. 
+struct CloseCallback { + RpcController controller; + ScanResponsePB response; + string scanner_id; + void Callback() { + if (!controller.status().ok()) { + LOG(WARNING) << "Couldn't close scanner " << scanner_id << ": " + << controller.status().ToString(); + } + delete this; + } +}; +} // anonymous namespace + +string KuduScanner::ToString() const { + Slice start_key = data_->spec_.lower_bound_key() ? + data_->spec_.lower_bound_key()->encoded_key() : Slice("INF"); + Slice end_key = data_->spec_.exclusive_upper_bound_key() ? + data_->spec_.exclusive_upper_bound_key()->encoded_key() : Slice("INF"); + return strings::Substitute("$0: [$1,$2)", data_->table_->name(), + start_key.ToDebugString(), end_key.ToDebugString()); +} + +Status KuduScanner::Open() { + CHECK(!data_->open_) << "Scanner already open"; + CHECK(data_->projection_ != nullptr) << "No projection provided"; + + // Find the first tablet. + data_->spec_encoder_.EncodeRangePredicates(&data_->spec_, false); + + VLOG(1) << "Beginning scan " << ToString(); + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(data_->timeout_); + set blacklist; + + bool is_simple_range_partitioned = + data_->table_->partition_schema().IsSimplePKRangePartitioning(*data_->table_->schema().schema_); + + if (!is_simple_range_partitioned && + (data_->spec_.lower_bound_key() != nullptr || + data_->spec_.exclusive_upper_bound_key() != nullptr || + !data_->spec_.predicates().empty())) { + KLOG_FIRST_N(WARNING, 1) << "Starting full table scan. In the future this scan may be " + "automatically optimized with partition pruning."; + } + + if (is_simple_range_partitioned) { + // If the table is simple range partitioned, then the partition key space is + // isomorphic to the primary key space. We can potentially reduce the scan + // length by only scanning the intersection of the primary key range and the + // partition key range. 
This is a stop-gap until real partition pruning is + // in place that will work across any partition type. + Slice start_primary_key = data_->spec_.lower_bound_key() == nullptr ? Slice() + : data_->spec_.lower_bound_key()->encoded_key(); + Slice end_primary_key = data_->spec_.exclusive_upper_bound_key() == nullptr ? Slice() + : data_->spec_.exclusive_upper_bound_key()->encoded_key(); + Slice start_partition_key = data_->spec_.lower_bound_partition_key(); + Slice end_partition_key = data_->spec_.exclusive_upper_bound_partition_key(); + + if ((!end_partition_key.empty() && start_primary_key.compare(end_partition_key) >= 0) || + (!end_primary_key.empty() && start_partition_key.compare(end_primary_key) >= 0)) { + // The primary key range and the partition key range do not intersect; + // the scan will be empty. Keep the existing partition key range. + } else { + // Assign the scan's partition key range to the intersection of the + // primary key and partition key ranges. + data_->spec_.SetLowerBoundPartitionKey(start_primary_key); + data_->spec_.SetExclusiveUpperBoundPartitionKey(end_primary_key); + } + } + + RETURN_NOT_OK(data_->OpenTablet(data_->spec_.lower_bound_partition_key(), deadline, &blacklist)); + + data_->open_ = true; + return Status::OK(); +} + +Status KuduScanner::KeepAlive() { + return data_->KeepAlive(); +} + +void KuduScanner::Close() { + if (!data_->open_) return; + CHECK(data_->proxy_); + + VLOG(1) << "Ending scan " << ToString(); + + // Close the scanner on the server-side, if necessary. + // + // If the scan did not match any rows, the tserver will not assign a scanner ID. + // This is reflected in the Open() response. In this case, there is no server-side state + // to clean up. 
+ if (!data_->next_req_.scanner_id().empty()) { + gscoped_ptr closer(new CloseCallback); + closer->scanner_id = data_->next_req_.scanner_id(); + data_->PrepareRequest(KuduScanner::Data::CLOSE); + data_->next_req_.set_close_scanner(true); + closer->controller.set_timeout(data_->timeout_); + data_->proxy_->ScanAsync(data_->next_req_, &closer->response, &closer->controller, + boost::bind(&CloseCallback::Callback, closer.get())); + ignore_result(closer.release()); + } + data_->proxy_.reset(); + data_->open_ = false; + return; +} + +bool KuduScanner::HasMoreRows() const { + CHECK(data_->open_); + return data_->data_in_open_ || // more data in hand + data_->last_response_.has_more_results() || // more data in this tablet + data_->MoreTablets(); // more tablets to scan, possibly with more data +} + +Status KuduScanner::NextBatch(vector* rows) { + RETURN_NOT_OK(NextBatch(&data_->batch_for_old_api_)); + data_->batch_for_old_api_.data_->ExtractRows(rows); + return Status::OK(); +} + +Status KuduScanner::NextBatch(KuduScanBatch* result) { + // TODO: do some double-buffering here -- when we return this batch + // we should already have fired off the RPC for the next batch, but + // need to do some swapping of the response objects around to avoid + // stomping on the memory the user is looking at. + CHECK(data_->open_); + CHECK(data_->proxy_); + + result->data_->Clear(); + + if (data_->data_in_open_) { + // We have data from a previous scan. + VLOG(1) << "Extracting data from scan " << ToString(); + data_->data_in_open_ = false; + return result->data_->Reset(&data_->controller_, + data_->projection_, + &data_->client_projection_, + make_gscoped_ptr(data_->last_response_.release_data())); + } else if (data_->last_response_.has_more_results()) { + // More data is available in this tablet. + VLOG(1) << "Continuing scan " << ToString(); + + // The user has specified a timeout 'data_->timeout_' which should + // apply to the total time for each call to NextBatch(). 
However, + // if this is a fault-tolerant scan, it's preferable to set a shorter + // timeout (the "default RPC timeout" for each individual RPC call -- + // so that if the server is hung we have time to fail over and try a + // different server. + MonoTime now = MonoTime::Now(MonoTime::FINE); + + MonoTime batch_deadline = now; + batch_deadline.AddDelta(data_->timeout_); + + MonoTime rpc_deadline; + if (data_->is_fault_tolerant_) { + rpc_deadline = now; + rpc_deadline.AddDelta(data_->table_->client()->default_rpc_timeout()); + rpc_deadline = MonoTime::Earliest(batch_deadline, rpc_deadline); + } else { + rpc_deadline = batch_deadline; + } + + data_->controller_.Reset(); + data_->controller_.set_deadline(rpc_deadline); + data_->PrepareRequest(KuduScanner::Data::CONTINUE); + Status rpc_status = data_->proxy_->Scan(data_->next_req_, + &data_->last_response_, + &data_->controller_); + const Status server_status = data_->CheckForErrors(); + + // Success case. + if (rpc_status.ok() && server_status.ok()) { + if (data_->last_response_.has_last_primary_key()) { + data_->last_primary_key_ = data_->last_response_.last_primary_key(); + } + data_->scan_attempts_ = 0; + return result->data_->Reset(&data_->controller_, + data_->projection_, + &data_->client_projection_, + make_gscoped_ptr(data_->last_response_.release_data())); + } + + data_->scan_attempts_++; + + // Error handling. + LOG(WARNING) << "Scan at tablet server " << data_->ts_->ToString() << " of tablet " + << ToString() << " failed: " + << (!rpc_status.ok() ? rpc_status.ToString() : server_status.ToString()); + set blacklist; + vector candidates; + RETURN_NOT_OK(data_->CanBeRetried(false, rpc_status, server_status, rpc_deadline, + batch_deadline, candidates, &blacklist)); + + LOG(WARNING) << "Attempting to retry scan of tablet " << ToString() << " elsewhere."; + // Use the start partition key of the current tablet as the start partition key. 
+ const string& partition_key_start = data_->remote_->partition().partition_key_start(); + return data_->OpenTablet(partition_key_start, batch_deadline, &blacklist); + } else if (data_->MoreTablets()) { + // More data may be available in other tablets. + // No need to close the current tablet; we scanned all the data so the + // server closed it for us. + VLOG(1) << "Scanning next tablet " << ToString(); + data_->last_primary_key_.clear(); + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(data_->timeout_); + set blacklist; + RETURN_NOT_OK(data_->OpenTablet(data_->remote_->partition().partition_key_end(), + deadline, &blacklist)); + // No rows written, the next invocation will pick them up. + return Status::OK(); + } else { + // No more data anywhere. + return Status::OK(); + } +} + +Status KuduScanner::GetCurrentServer(KuduTabletServer** server) { + CHECK(data_->open_); + internal::RemoteTabletServer* rts = data_->ts_; + CHECK(rts); + vector host_ports; + rts->GetHostPorts(&host_ports); + if (host_ports.empty()) { + return Status::IllegalState(strings::Substitute("No HostPort found for RemoteTabletServer $0", + rts->ToString())); + } + *server = new KuduTabletServer(); + (*server)->data_ = new KuduTabletServer::Data(rts->permanent_uuid(), + host_ports[0].host()); + return Status::OK(); +} + +//////////////////////////////////////////////////////////// +// KuduTabletServer +//////////////////////////////////////////////////////////// + +KuduTabletServer::KuduTabletServer() + : data_(nullptr) { +} + +KuduTabletServer::~KuduTabletServer() { + delete data_; +} + +const string& KuduTabletServer::uuid() const { + return data_->uuid_; +} + +const string& KuduTabletServer::hostname() const { + return data_->hostname_; +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/client.h b/src/kudu/client/client.h new file mode 100644 index 000000000000..42463402afb7 --- /dev/null +++ b/src/kudu/client/client.h @@ -0,0 +1,1076 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_CLIENT_H +#define KUDU_CLIENT_CLIENT_H + +#include +#include +#include + +#include "kudu/client/row_result.h" +#include "kudu/client/scan_batch.h" +#include "kudu/client/scan_predicate.h" +#include "kudu/client/schema.h" +#include "kudu/client/shared_ptr.h" +#ifdef KUDU_HEADERS_NO_STUBS +#include +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#else +#include "kudu/client/stubs.h" +#endif +#include "kudu/client/write_op.h" +#include "kudu/util/kudu_export.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { + +class LinkedListTester; +class PartitionSchema; + +namespace client { + +class KuduLoggingCallback; +class KuduSession; +class KuduStatusCallback; +class KuduTable; +class KuduTableAlterer; +class KuduTableCreator; +class KuduTabletServer; +class KuduValue; +class KuduWriteOperation; + +namespace internal { +class Batcher; +class GetTableSchemaRpc; +class LookupRpc; +class MetaCache; +class RemoteTablet; +class RemoteTabletServer; +class WriteRpc; +} // namespace internal + +// Installs a callback for internal client logging. 
It is invoked for a +// log event of any severity, across any KuduClient instance. +// +// Only the first invocation has any effect; subsequent invocations are +// a no-op. The caller must ensure that 'cb' stays alive until +// UninstallLoggingCallback() is called. +// +// Before a callback is registered, all internal client log events are +// logged to stderr. +void KUDU_EXPORT InstallLoggingCallback(KuduLoggingCallback* cb); + +// Removes a callback installed via InstallLoggingCallback(). +// +// Only the first invocation has any effect; subsequent invocations are +// a no-op. +// +// Should be called before unloading the client library. +void KUDU_EXPORT UninstallLoggingCallback(); + +// Set the logging verbosity of the client library. By default, this is 0. Logs become +// progressively more verbose as the level is increased. Empirically, the highest +// verbosity level used in Kudu is 6, which includes very fine-grained tracing +// information. Most useful logging is enabled at level 1 or 2, with the higher levels +// used only in rare circumstances. +// +// Logs are emitted to stderr, or to the configured log callback at SEVERITY_INFO. +// +// This may be called safely at any point during usage of the library. +void KUDU_EXPORT SetVerboseLogLevel(int level); + +// The Kudu client library uses signals internally in some cases. By default, it uses +// SIGUSR2. If your application makes use of SIGUSR2, this advanced API can help +// workaround conflicts. +Status KUDU_EXPORT SetInternalSignalNumber(int signum); + +// Creates a new KuduClient with the desired options. +// +// Note that KuduClients are shared amongst multiple threads and, as such, +// are stored in shared pointers. +class KUDU_EXPORT KuduClientBuilder { + public: + KuduClientBuilder(); + ~KuduClientBuilder(); + + KuduClientBuilder& clear_master_server_addrs(); + + // Add RPC addresses of multiple masters. 
+  KuduClientBuilder& master_server_addrs(const std::vector<std::string>& addrs);
+
+  // Add an RPC address of a master. At least one master is required.
+  KuduClientBuilder& add_master_server_addr(const std::string& addr);
+
+  // The default timeout used for administrative operations (e.g. CreateTable,
+  // AlterTable, ...). Optional.
+  //
+  // If not provided, defaults to 10s.
+  KuduClientBuilder& default_admin_operation_timeout(const MonoDelta& timeout);
+
+  // The default timeout for individual RPCs. Optional.
+  //
+  // If not provided, defaults to 5s.
+  KuduClientBuilder& default_rpc_timeout(const MonoDelta& timeout);
+
+  // Creates the client.
+  //
+  // The return value may indicate an error in the create operation, or a
+  // misuse of the builder; in the latter case, only the last error is
+  // returned.
+  Status Build(sp::shared_ptr<KuduClient>* client);
+ private:
+  class KUDU_NO_EXPORT Data;
+
+  // Owned.
+  Data* data_;
+
+  DISALLOW_COPY_AND_ASSIGN(KuduClientBuilder);
+};
+
+// The KuduClient represents a connection to a cluster. From the user
+// perspective, they should only need to create one of these in their
+// application, likely a singleton -- but it's not a singleton in Kudu in any
+// way. Different Client objects do not interact with each other -- no
+// connection pooling, etc. Each KuduClient instance is sandboxed with no
+// global cross-client state.
+//
+// In the implementation, the client holds various pieces of common
+// infrastructure which is not table-specific:
+//
+// - RPC messenger: reactor threads and RPC connections are pooled here
+// - Authentication: the client is initialized with some credentials, and
+//   all accesses through it share those credentials.
+// - Caches: caches of table schemas, tablet locations, tablet server IP
+//   addresses, etc are shared per-client.
+//
+// In order to actually access data on the cluster, callers must first
+// create a KuduSession object using NewSession(). A KuduClient may
+// have several associated sessions.
+//
+// TODO: Cluster administration functions are likely to be in this class
+// as well.
+//
+// This class is thread-safe.
+class KUDU_EXPORT KuduClient : public sp::enable_shared_from_this<KuduClient> {
+ public:
+  ~KuduClient();
+
+  // Creates a KuduTableCreator; it is the caller's responsibility to free it.
+  KuduTableCreator* NewTableCreator();
+
+  // set 'create_in_progress' to true if a CreateTable operation is in-progress
+  Status IsCreateTableInProgress(const std::string& table_name,
+                                 bool *create_in_progress);
+
+  Status DeleteTable(const std::string& table_name);
+
+  // Creates a KuduTableAlterer; it is the caller's responsibility to free it.
+  KuduTableAlterer* NewTableAlterer(const std::string& table_name);
+
+  // set 'alter_in_progress' to true if an AlterTable operation is in-progress
+  Status IsAlterTableInProgress(const std::string& table_name,
+                                bool *alter_in_progress);
+
+  Status GetTableSchema(const std::string& table_name,
+                        KuduSchema* schema);
+
+  Status ListTabletServers(std::vector<KuduTabletServer*>* tablet_servers);
+
+  // List only those tables whose names pass a substring match on 'filter'.
+  //
+  // 'tables' is appended to only on success.
+  Status ListTables(std::vector<std::string>* tables,
+                    const std::string& filter = "");
+
+  // Check if the table given by 'table_name' exists.
+  //
+  // 'exists' is set only on success.
+  Status TableExists(const std::string& table_name, bool* exists);
+
+  // Open the table with the given name. If the table has not been opened before
+  // in this client, this will do an RPC to ensure that the table exists and
+  // look up its schema.
+  //
+  // TODO: should we offer an async version of this as well?
+  // TODO: probably should have a configurable timeout in KuduClientBuilder?
+  Status OpenTable(const std::string& table_name,
+                   sp::shared_ptr<KuduTable>* table);
+
+  // Create a new session for interacting with the cluster.
+  // User is responsible for destroying the session object.
+  // This is a fully local operation (no RPCs or blocking).
+  sp::shared_ptr<KuduSession> NewSession();
+
+  // Policy with which to choose amongst multiple replicas.
+  enum ReplicaSelection {
+    // Select the LEADER replica.
+    LEADER_ONLY,
+
+    // Select the closest replica to the client, or a random one if all
+    // replicas are equidistant.
+    CLOSEST_REPLICA,
+
+    // Select the first replica in the list.
+    FIRST_REPLICA
+  };
+
+  bool IsMultiMaster() const;
+
+  const MonoDelta& default_admin_operation_timeout() const;
+  const MonoDelta& default_rpc_timeout() const;
+
+  // Value for the latest observed timestamp when none has been observed or set.
+  static const uint64_t kNoTimestamp;
+
+  // Returns highest HybridTime timestamp observed by the client.
+  // The latest observed timestamp can be used to start a snapshot scan on a
+  // table which is guaranteed to contain all data written or previously read by
+  // this client. See KuduScanner for more details on timestamps.
+  uint64_t GetLatestObservedTimestamp() const;
+
+  // Sets the latest observed HybridTime timestamp, encoded in the HybridTime format.
+  // This is only useful when forwarding timestamps between clients to enforce
+  // external consistency when using KuduSession::CLIENT_PROPAGATED external consistency
+  // mode.
+  // To use this the user must obtain the HybridTime encoded timestamp from the first
+  // client with KuduClient::GetLatestObservedTimestamp() and then set it in the new
+  // client with this method.
+ void SetLatestObservedTimestamp(uint64_t ht_timestamp); + + private: + class KUDU_NO_EXPORT Data; + + friend class KuduClientBuilder; + friend class KuduScanner; + friend class KuduTable; + friend class KuduTableAlterer; + friend class KuduTableCreator; + friend class internal::Batcher; + friend class internal::GetTableSchemaRpc; + friend class internal::LookupRpc; + friend class internal::MetaCache; + friend class internal::RemoteTablet; + friend class internal::RemoteTabletServer; + friend class internal::WriteRpc; + + FRIEND_TEST(ClientTest, TestGetTabletServerBlacklist); + FRIEND_TEST(ClientTest, TestMasterDown); + FRIEND_TEST(ClientTest, TestMasterLookupPermits); + FRIEND_TEST(ClientTest, TestReplicatedMultiTabletTableFailover); + FRIEND_TEST(ClientTest, TestReplicatedTabletWritesWithLeaderElection); + FRIEND_TEST(ClientTest, TestScanFaultTolerance); + FRIEND_TEST(ClientTest, TestScanTimeout); + FRIEND_TEST(ClientTest, TestWriteWithDeadMaster); + FRIEND_TEST(MasterFailoverTest, DISABLED_TestPauseAfterCreateTableIssued); + + KuduClient(); + + // Owned. + Data* data_; + + DISALLOW_COPY_AND_ASSIGN(KuduClient); +}; + +// Creates a new table with the desired options. +class KUDU_EXPORT KuduTableCreator { + public: + ~KuduTableCreator(); + + // Sets the name to give the table. It is copied. Required. + KuduTableCreator& table_name(const std::string& name); + + // Sets the schema with which to create the table. Must remain valid for + // the lifetime of the builder. Required. + KuduTableCreator& schema(const KuduSchema* schema); + + // Adds a set of hash partitions to the table. + // + // For each set of hash partitions added to the table, the total number of + // table partitions is multiplied by the number of buckets. For example, if a + // table is created with 3 split rows, and two hash partitions with 4 and 5 + // buckets respectively, the total number of table partitions will be 80 + // (4 range partitions * 4 hash buckets * 5 hash buckets). 
+  KuduTableCreator& add_hash_partitions(const std::vector<std::string>& columns,
+                                        int32_t num_buckets);
+
+  // Adds a set of hash partitions to the table.
+  //
+  // This constructor takes a seed value, which can be used to randomize the
+  // mapping of rows to hash buckets. Setting the seed may provide some
+  // amount of protection against denial of service attacks when the hashed
+  // columns contain user provided values.
+  KuduTableCreator& add_hash_partitions(const std::vector<std::string>& columns,
+                                        int32_t num_buckets, int32_t seed);
+
+  // Sets the columns on which the table will be range-partitioned.
+  //
+  // Every column must be a part of the table's primary key. If not set, the
+  // table will be created with the primary-key columns as the range-partition
+  // columns. If called with an empty vector, the table will be created without
+  // range partitioning.
+  //
+  // Optional.
+  KuduTableCreator& set_range_partition_columns(const std::vector<std::string>& columns);
+
+  // Sets the rows on which to pre-split the table.
+  // The table creator takes ownership of the rows.
+  //
+  // If any provided row is missing a value for any of the range partition
+  // columns, the logical minimum value for that column type will be used by
+  // default.
+  //
+  // If not provided, no range-based pre-splitting is performed.
+  //
+  // Optional.
+  KuduTableCreator& split_rows(const std::vector<const KuduPartialRow*>& split_rows);
+
+  // Sets the number of replicas for each tablet in the table.
+  // This should be an odd number. Optional.
+  //
+  // If not provided (or if <= 0), falls back to the server-side default.
+  KuduTableCreator& num_replicas(int n_replicas);
+
+  // Set the timeout for the operation. This includes any waiting
+  // after the create has been submitted (i.e if the create is slow
+  // to be performed for a large table, it may time out and then
+  // later be successful).
+  KuduTableCreator& timeout(const MonoDelta& timeout);
+
+  // Wait for the table to be fully created before returning.
+  // Optional.
+  //
+  // If not provided, defaults to true.
+  KuduTableCreator& wait(bool wait);
+
+  // Creates the table.
+  //
+  // The return value may indicate an error in the create table operation,
+  // or a misuse of the builder; in the latter case, only the last error is
+  // returned.
+  Status Create();
+ private:
+  class KUDU_NO_EXPORT Data;
+
+  friend class KuduClient;
+
+  explicit KuduTableCreator(KuduClient* client);
+
+  // Owned.
+  Data* data_;
+
+  DISALLOW_COPY_AND_ASSIGN(KuduTableCreator);
+};
+
+// A KuduTable represents a table on a particular cluster. It holds the current
+// schema of the table. Any given KuduTable instance belongs to a specific KuduClient
+// instance.
+//
+// Upon construction, the table is looked up in the catalog (or catalog cache),
+// and the schema fetched for introspection.
+//
+// This class is thread-safe.
+class KUDU_EXPORT KuduTable : public sp::enable_shared_from_this<KuduTable> {
+ public:
+  ~KuduTable();
+
+  const std::string& name() const;
+
+  // Return the table's ID. This is an internal identifier which uniquely
+  // identifies a table. If the table is deleted and recreated with the same
+  // name, the ID will distinguish the old table from the new.
+  const std::string& id() const;
+
+  const KuduSchema& schema() const;
+
+  // Create a new write operation for this table. It is the caller's
+  // responsibility to free it, unless it is passed to KuduSession::Apply().
+  KuduInsert* NewInsert();
+  KuduUpdate* NewUpdate();
+  KuduDelete* NewDelete();
+
+  // Create a new comparison predicate which can be used for scanners
+  // on this table.
+  //
+  // The type of 'value' must correspond to the type of the column to which
+  // the predicate is to be applied. For example, if the given column is
+  // any type of integer, the KuduValue should also be an integer, with its
+  // value in the valid range for the column type. No attempt is made to cast
+  // between floating point and integer values, or numeric and string values.
+  //
+  // The caller owns the result until it is passed into KuduScanner::AddConjunctPredicate().
+  // The returned predicate takes ownership of 'value'.
+  //
+  // In the case of an error (e.g. an invalid column name), a non-NULL value
+  // is still returned. The error will be returned when attempting to add this
+  // predicate to a KuduScanner.
+  KuduPredicate* NewComparisonPredicate(const Slice& col_name,
+                                        KuduPredicate::ComparisonOp op,
+                                        KuduValue* value);
+
+  KuduClient* client() const;
+
+  const PartitionSchema& partition_schema() const;
+
+ private:
+  class KUDU_NO_EXPORT Data;
+
+  friend class KuduClient;
+
+  KuduTable(const sp::shared_ptr<KuduClient>& client,
+            const std::string& name,
+            const std::string& table_id,
+            const KuduSchema& schema,
+            const PartitionSchema& partition_schema);
+
+  // Owned.
+  Data* data_;
+
+  DISALLOW_COPY_AND_ASSIGN(KuduTable);
+};
+
+// Alters an existing table based on the provided steps.
+//
+// Sample usage:
+//   KuduTableAlterer* alterer = client->NewTableAlterer("table-name");
+//   alterer->AddColumn("foo")->Type(KuduColumnSchema::INT32)->NotNull();
+//   alterer->AlterColumn("bar")->Compression(KuduColumnStorageAttributes::LZ4);
+//   Status s = alterer->Alter();
+//   delete alterer;
+class KUDU_EXPORT KuduTableAlterer {
+ public:
+  ~KuduTableAlterer();
+
+  // Renames the table.
+  KuduTableAlterer* RenameTo(const std::string& new_name);
+
+  // Adds a new column to the table.
+  //
+  // When adding a column, you must specify the default value of the new
+  // column using KuduColumnSpec::DefaultValue(...).
+  KuduColumnSpec* AddColumn(const std::string& name);
+
+  // Alter an existing column.
+  KuduColumnSpec* AlterColumn(const std::string& name);
+
+  // Drops an existing column from the table.
+  KuduTableAlterer* DropColumn(const std::string& name);
+
+  // Set the timeout for the operation.
This includes any waiting + // after the alter has been submitted (i.e if the alter is slow + // to be performed on a large table, it may time out and then + // later be successful). + KuduTableAlterer* timeout(const MonoDelta& timeout); + + // Wait for the table to be fully altered before returning. + // + // If not provided, defaults to true. + KuduTableAlterer* wait(bool wait); + + // Alters the table. + // + // The return value may indicate an error in the alter operation, or a + // misuse of the builder (e.g. add_column() with default_value=NULL); in + // the latter case, only the last error is returned. + Status Alter(); + + private: + class KUDU_NO_EXPORT Data; + friend class KuduClient; + + KuduTableAlterer(KuduClient* client, + const std::string& name); + + // Owned. + Data* data_; + + DISALLOW_COPY_AND_ASSIGN(KuduTableAlterer); +}; + +// An error which occurred in a given operation. This tracks the operation +// which caused the error, along with whatever the actual error was. +class KUDU_EXPORT KuduError { + public: + ~KuduError(); + + // Return the actual error which occurred. + const Status& status() const; + + // Return the operation which failed. + const KuduWriteOperation& failed_op() const; + + // Release the operation that failed. The caller takes ownership. Must only + // be called once. + KuduWriteOperation* release_failed_op(); + + // In some cases, it's possible that the server did receive and successfully + // perform the requested operation, but the client can't tell whether or not + // it was successful. For example, if the call times out, the server may still + // succeed in processing at a later time. + // + // This function returns true if there is some chance that the server did + // process the operation, and false if it can guarantee that the operation + // did not succeed. 
+ bool was_possibly_successful() const; + + private: + class KUDU_NO_EXPORT Data; + + friend class internal::Batcher; + friend class KuduSession; + + KuduError(KuduWriteOperation* failed_op, const Status& error); + + // Owned. + Data* data_; + + DISALLOW_COPY_AND_ASSIGN(KuduError); +}; + + +// A KuduSession belongs to a specific KuduClient, and represents a context in +// which all read/write data access should take place. Within a session, +// multiple operations may be accumulated and batched together for better +// efficiency. Settings like timeouts, priorities, and trace IDs are also set +// per session. +// +// A KuduSession's main purpose is for grouping together multiple data-access +// operations together into batches or transactions. It is important to note +// the distinction between these two: +// +// * A batch is a set of operations which are grouped together in order to +// amortize fixed costs such as RPC call overhead and round trip times. +// A batch DOES NOT imply any ACID-like guarantees. Within a batch, some +// operations may succeed while others fail, and concurrent readers may see +// partial results. If the client crashes mid-batch, it is possible that some +// of the operations will be made durable while others were lost. +// +// * In contrast, a transaction is a set of operations which are treated as an +// indivisible semantic unit, per the usual definitions of database transactions +// and isolation levels. +// +// NOTE: Kudu does not currently support transactions! They are only mentioned +// in the above documentation to clarify that batches are not transactional and +// should only be used for efficiency. +// +// KuduSession is separate from KuduTable because a given batch or transaction +// may span multiple tables. This is particularly important in the future when +// we add ACID support, but even in the context of batching, we may be able to +// coalesce writes to different tables hosted on the same server into the same +// RPC. 
+//
+// KuduSession is separate from KuduClient because, in a multi-threaded
+// application, different threads may need to concurrently execute
+// transactions. Similar to a JDBC "session", transaction boundaries will be
+// delineated on a per-session basis -- in between a "BeginTransaction" and
+// "Commit" call on a given session, all operations will be part of the same
+// transaction. Meanwhile another concurrent Session object can safely run
+// non-transactional work or other transactions without interfering.
+//
+// Additionally, there is a guarantee that writes from different sessions do not
+// get batched together into the same RPCs -- this means that latency-sensitive
+// clients can run through the same KuduClient object as throughput-oriented
+// clients, perhaps by setting the latency-sensitive session's timeouts low and
+// priorities high. Without the separation of batches, a latency-sensitive
+// single-row insert might get batched along with 10MB worth of inserts from the
+// batch writer, thus delaying the response significantly.
+//
+// Though we currently do not have transactional support, users will be forced
+// to use a KuduSession to instantiate reads as well as writes. This will make
+// it more straight-forward to add RW transactions in the future without
+// significant modifications to the API.
+//
+// Users who are familiar with the Hibernate ORM framework should find this
+// concept of a Session familiar.
+//
+// This class is not thread-safe except where otherwise specified.
+class KUDU_EXPORT KuduSession : public sp::enable_shared_from_this<KuduSession> {
+ public:
+  ~KuduSession();
+
+  enum FlushMode {
+    // Every write will be sent to the server in-band with the Apply()
+    // call. No batching will occur. This is the default flush mode. In this
+    // mode, the Flush() call never has any effect, since each Apply() call
+    // has already flushed the buffer.
+ AUTO_FLUSH_SYNC, + + // Apply() calls will return immediately, but the writes will be sent in + // the background, potentially batched together with other writes from + // the same session. If there is not sufficient buffer space, then Apply() + // may block for buffer space to be available. + // + // Because writes are applied in the background, any errors will be stored + // in a session-local buffer. Call CountPendingErrors() or GetPendingErrors() + // to retrieve them. + // TODO: provide an API for the user to specify a callback to do their own + // error reporting. + // TODO: specify which threads the background activity runs on (probably the + // messenger IO threads?) + // + // NOTE: This is not implemented yet, see KUDU-456. + // + // The Flush() call can be used to block until the buffer is empty. + AUTO_FLUSH_BACKGROUND, + + // Apply() calls will return immediately, and the writes will not be + // sent until the user calls Flush(). If the buffer runs past the + // configured space limit, then Apply() will return an error. + MANUAL_FLUSH + }; + + // Set the flush mode. + // REQUIRES: there should be no pending writes -- call Flush() first to ensure. + Status SetFlushMode(FlushMode m) WARN_UNUSED_RESULT; + + // The possible external consistency modes on which Kudu operates. + enum ExternalConsistencyMode { + // The response to any write will contain a timestamp. Any further calls from the same + // client to other servers will update those servers with that timestamp. Following + // write operations from the same client will be assigned timestamps that are strictly + // higher, enforcing external consistency without having to wait or incur any latency + // penalties. + // + // In order to maintain external consistency for writes between two different clients + // in this mode, the user must forward the timestamp from the first client to the + // second by using KuduClient::GetLatestObservedTimestamp() and + // KuduClient::SetLatestObservedTimestamp(). 
+ // + // WARNING: Failure to propagate timestamp information through back-channels between + // two different clients will negate any external consistency guarantee under this + // mode. + // + // This is the default mode. + CLIENT_PROPAGATED, + + // The server will guarantee that write operations from the same or from other client + // are externally consistent, without the need to propagate timestamps across clients. + // This is done by making write operations wait until there is certainty that all + // follow up write operations (operations that start after the previous one finishes) + // will be assigned a timestamp that is strictly higher, enforcing external consistency. + // + // WARNING: Depending on the clock synchronization state of TabletServers this may + // imply considerable latency. Moreover operations in COMMIT_WAIT external consistency + // mode will outright fail if TabletServer clocks are either unsynchronized or + // synchronized but with a maximum error which surpasses a pre-configured threshold. + COMMIT_WAIT + }; + + // Set the new external consistency mode for this session. + Status SetExternalConsistencyMode(ExternalConsistencyMode m) WARN_UNUSED_RESULT; + + // Set the amount of buffer space used by this session for outbound writes. + // The effect of the buffer size varies based on the flush mode of the + // session: + // + // AUTO_FLUSH_SYNC: + // since no buffering is done, this has no effect + // AUTO_FLUSH_BACKGROUND: + // if the buffer space is exhausted, then write calls will block until there + // is space available in the buffer. + // MANUAL_FLUSH: + // if the buffer space is exhausted, then write calls will return an error. + Status SetMutationBufferSpace(size_t size) WARN_UNUSED_RESULT; + + // Set the timeout for writes made in this session. + void SetTimeoutMillis(int millis); + + // TODO: add "doAs" ability here for proxy servers to be able to act on behalf of + // other users, assuming access rights. 
+ + // Apply the write operation. Transfers the write_op's ownership to the KuduSession. + // + // The behavior of this function depends on the current flush mode. Regardless + // of flush mode, however, Apply may begin to perform processing in the background + // for the call (e.g looking up the tablet, etc). Given that, an error may be + // queued into the PendingErrors structure prior to flushing, even in MANUAL_FLUSH + // mode. + // + // In case of any error, which may occur during flushing or because the write_op + // is malformed, the write_op is stored in the session's error collector which + // may be retrieved at any time. + // + // This is thread safe. + Status Apply(KuduWriteOperation* write_op) WARN_UNUSED_RESULT; + + // Similar to the above, except never blocks. Even in the flush modes that + // return immediately, 'cb' is triggered with the result. The callback may be + // called by a reactor thread, or in some cases may be called inline by the + // same thread which calls ApplyAsync(). 'cb' must remain valid until it called. + // + // TODO: not yet implemented. + void ApplyAsync(KuduWriteOperation* write_op, KuduStatusCallback* cb); + + // Flush any pending writes. + // + // Returns a bad status if there are any pending errors after the rows have + // been flushed. Callers should then use GetPendingErrors to determine which + // specific operations failed. + // + // In AUTO_FLUSH_SYNC mode, this has no effect, since every Apply() call flushes + // itself inline. + // + // In the case that the async version of this method is used, then the callback + // will be called upon completion of the operations which were buffered since the + // last flush. In other words, in the following sequence: + // + // session->Insert(a); + // session->FlushAsync(callback_1); + // session->Insert(b); + // session->FlushAsync(callback_2); + // + // ... 'callback_2' will be triggered once 'b' has been inserted, regardless of whether + // 'a' has completed or not. 
+ // + // Note that this also means that, if FlushAsync is called twice in succession, with + // no intervening operations, the second flush will return immediately. For example: + // + // session->Insert(a); + // session->FlushAsync(callback_1); // called when 'a' is inserted + // session->FlushAsync(callback_2); // called immediately! + // + // Note that, as in all other async functions in Kudu, the callback may be called + // either from an IO thread or the same thread which calls FlushAsync. The callback + // should not block. + // + // For FlushAsync, 'cb' must remain valid until it is invoked. + // + // This function is thread-safe. + Status Flush() WARN_UNUSED_RESULT; + void FlushAsync(KuduStatusCallback* cb); + + // Close the session. + // Returns an error if there are unflushed or in-flight operations. + Status Close() WARN_UNUSED_RESULT; + + // Return true if there are operations which have not yet been delivered to the + // cluster. This may include buffered operations (i.e those that have not yet been + // flushed) as well as in-flight operations (i.e those that are in the process of + // being sent to the servers). + // TODO: maybe "incomplete" or "undelivered" is clearer? + // + // This function is thread-safe. + bool HasPendingOperations() const; + + // Return the number of buffered operations. These are operations that have + // not yet been flushed - i.e they are not en-route yet. + // + // Note that this is different than HasPendingOperations() above, which includes + // operations which have been sent and not yet responded to. + // + // This is only relevant in MANUAL_FLUSH mode, where the result will not + // decrease except for after a manual Flush, after which point it will be 0. + // In the other flush modes, data is immediately put en-route to the destination, + // so this will return 0. + // + // This function is thread-safe. + int CountBufferedOperations() const; + + // Return the number of errors which are pending. 
Errors may accumulate when
+  // using the AUTO_FLUSH_BACKGROUND mode.
+  //
+  // This function is thread-safe.
+  int CountPendingErrors() const;
+
+  // Return any errors from previous calls. If there were more errors
+  // than could be held in the session's error storage, then sets *overflowed to true.
+  //
+  // Caller takes ownership of the returned errors.
+  //
+  // This function is thread-safe.
+  void GetPendingErrors(std::vector<KuduError*>* errors, bool* overflowed);
+
+  KuduClient* client() const;
+
+ private:
+  class KUDU_NO_EXPORT Data;
+
+  friend class KuduClient;
+  friend class internal::Batcher;
+  explicit KuduSession(const sp::shared_ptr<KuduClient>& client);
+
+  // Owned.
+  Data* data_;
+
+  DISALLOW_COPY_AND_ASSIGN(KuduSession);
+};
+
+
+// A single scanner. This class is not thread-safe, though different
+// scanners on different threads may share a single KuduTable object.
+class KUDU_EXPORT KuduScanner {
+ public:
+  // The possible read modes for scanners.
+  enum ReadMode {
+    // When READ_LATEST is specified the server will always return committed writes at
+    // the time the request was received. This type of read does not return a snapshot
+    // timestamp and is not repeatable.
+    //
+    // In ACID terms this corresponds to Isolation mode: "Read Committed"
+    //
+    // This is the default mode.
+    READ_LATEST,
+
+    // When READ_AT_SNAPSHOT is specified the server will attempt to perform a read
+    // at the provided timestamp. If no timestamp is provided the server will take the
+    // current time as the snapshot timestamp. In this mode reads are repeatable, i.e.
+    // all future reads at the same timestamp will yield the same data. This is
+    // performed at the expense of waiting for in-flight transactions whose timestamp
+    // is lower than the snapshot's timestamp to complete, so it might incur a latency
+    // penalty.
+    //
+    // In ACID terms this, by itself, corresponds to Isolation mode "Repeatable
+    // Read".
If all writes to the scanned tablet are made externally consistent,
+    // then this corresponds to Isolation mode "Strict-Serializable".
+    //
+    // Note: there are currently "holes", which happen in rare edge conditions, by which writes
+    // are sometimes not externally consistent even when action was taken to make them so.
+    // In these cases Isolation may degenerate to mode "Read Committed". See KUDU-430.
+    READ_AT_SNAPSHOT
+  };
+
+  // Whether the rows should be returned in order. This affects the fault-tolerance properties
+  // of a scanner.
+  enum OrderMode {
+    // Rows will be returned in an arbitrary order determined by the tablet server.
+    // This is efficient, but unordered scans are not fault-tolerant and cannot be resumed
+    // in the case of tablet server failure.
+    //
+    // This is the default mode.
+    UNORDERED,
+    // Rows will be returned ordered by primary key. Sorting the rows imposes additional overhead
+    // on the tablet server, but means that scans are fault-tolerant and will be resumed at
+    // another tablet server in the case of failure.
+    ORDERED
+  };
+
+  // Default scanner timeout.
+  // This is set to 3x the default RPC timeout (see KuduClientBuilder::default_rpc_timeout()).
+  enum { kScanTimeoutMillis = 15000 };
+
+  // Initialize the scanner. The given 'table' object must remain valid
+  // for the lifetime of this scanner object.
+  // TODO: should table be a const pointer?
+  explicit KuduScanner(KuduTable* table);
+  ~KuduScanner();
+
+  // Set the projection used for this scanner by passing the column names to read.
+  //
+  // This overrides any previous call to SetProjectedColumns.
+  Status SetProjectedColumnNames(const std::vector<std::string>& col_names) WARN_UNUSED_RESULT;
+
+  // Set the projection used for this scanner by passing the column indexes to read.
+  //
+  // This overrides any previous call to SetProjectedColumns/SetProjectedColumnIndexes.
+  Status SetProjectedColumnIndexes(const std::vector<int>& col_indexes) WARN_UNUSED_RESULT;
+
+  // DEPRECATED: See SetProjectedColumnNames
+  Status SetProjectedColumns(const std::vector<std::string>& col_names) WARN_UNUSED_RESULT;
+
+  // Add a predicate to this scanner.
+  //
+  // The predicates act as conjunctions -- i.e, they all must pass for
+  // a row to be returned.
+  //
+  // The Scanner takes ownership of 'pred', even if a bad Status is returned.
+  Status AddConjunctPredicate(KuduPredicate* pred) WARN_UNUSED_RESULT;
+
+  // Add a lower bound (inclusive) primary key for the scan.
+  // If any bound is already added, this bound is intersected with that one.
+  //
+  // The scanner does not take ownership of 'key'; the caller may free it afterward.
+  Status AddLowerBound(const KuduPartialRow& key);
+
+  // Like AddLowerBound(), but the encoded primary key is an opaque slice of data
+  // obtained elsewhere.
+  //
+  // DEPRECATED: use AddLowerBound
+  Status AddLowerBoundRaw(const Slice& key);
+
+  // Add an upper bound (exclusive) primary key for the scan.
+  // If any bound is already added, this bound is intersected with that one.
+  //
+  // The scanner makes a copy of 'key'; the caller may free it afterward.
+  Status AddExclusiveUpperBound(const KuduPartialRow& key);
+
+  // Like AddExclusiveUpperBound(), but the encoded primary key is an opaque slice of data
+  // obtained elsewhere.
+  //
+  // DEPRECATED: use AddExclusiveUpperBound
+  Status AddExclusiveUpperBoundRaw(const Slice& key);
+
+  // Add a lower bound (inclusive) partition key for the scan.
+  //
+  // The scanner makes a copy of 'partition_key'; the caller may free it afterward.
+  //
+  // This method is unstable, and for internal use only.
+  Status AddLowerBoundPartitionKeyRaw(const Slice& partition_key);
+
+  // Add an upper bound (exclusive) partition key for the scan.
+  //
+  // The scanner makes a copy of 'partition_key'; the caller may free it afterward.
+  //
+  // This method is unstable, and for internal use only.
+ Status AddExclusiveUpperBoundPartitionKeyRaw(const Slice& partition_key); + + // Set the block caching policy for this scanner. If true, scanned data blocks will be cached + // in memory and made available for future scans. Default is true. + Status SetCacheBlocks(bool cache_blocks); + + // Begin scanning. + Status Open(); + + // Keeps the current remote scanner alive on the Tablet server for an additional + // time-to-live (set by a configuration flag on the tablet server). + // This is useful if the interval in between NextBatch() calls is big enough that the + // remote scanner might be garbage collected (default ttl is set to 60 secs.). + // This does not invalidate any previously fetched results. + // This returns a non-OK status if the scanner was already garbage collected or if + // the TabletServer was unreachable, for any reason. + // + // NOTE: A non-OK status returned by this method should not be taken as indication that + // the scan has failed. Subsequent calls to NextBatch() might still be successful, + // particularly if SetFaultTolerant() was called. + Status KeepAlive(); + + // Close the scanner. + // This releases resources on the server. + // + // This call does not block, and will not ever fail, even if the server + // cannot be contacted. + // + // NOTE: the scanner is reset to its initial state by this function. + // You'll have to re-add any projection, predicates, etc if you want + // to reuse this Scanner object. + void Close(); + + // Return true if there may be rows to be fetched from this scanner. + // + // Note: will be true provided there's at least one more tablet left to + // scan, even if that tablet has no data (we'll only know once we scan it). + // It will also be true after the initially opening the scanner before + // NextBatch is called for the first time. + bool HasMoreRows() const; + + // Clears 'rows' and populates it with the next batch of rows from the tablet server. 
+ // A call to NextBatch() invalidates all previously fetched results which might + // now be pointing to garbage memory. + // + // DEPRECATED: Use NextBatch(KuduScanBatch*) instead. + Status NextBatch(std::vector* rows); + + // Fetches the next batch of results for this scanner. + // + // A single KuduScanBatch instance may be reused. Each subsequent call replaces the data + // from the previous call, and invalidates any KuduScanBatch::RowPtr objects previously + // obtained from the batch. + Status NextBatch(KuduScanBatch* batch); + + // Get the KuduTabletServer that is currently handling the scan. + // More concretely, this is the server that handled the most recent Open or NextBatch + // RPC made by the server. + Status GetCurrentServer(KuduTabletServer** server); + + // Set the hint for the size of the next batch in bytes. + // If setting to 0 before calling Open(), it means that the first call + // to the tablet server won't return data. + Status SetBatchSizeBytes(uint32_t batch_size); + + // Sets the replica selection policy while scanning. + // + // TODO: kill this in favor of a consistency-level-based API + Status SetSelection(KuduClient::ReplicaSelection selection) WARN_UNUSED_RESULT; + + // Sets the ReadMode. Default is READ_LATEST. + Status SetReadMode(ReadMode read_mode) WARN_UNUSED_RESULT; + + // DEPRECATED: use SetFaultTolerant. + Status SetOrderMode(OrderMode order_mode) WARN_UNUSED_RESULT; + + // Scans are by default non fault-tolerant, and scans will fail if scanning an + // individual tablet fails (for example, if a tablet server crashes in the + // middle of a tablet scan). + // + // If this method is called, the scan will be resumed at another tablet server + // in the case of failure. + // + // Fault tolerant scans typically have lower throughput than non + // fault-tolerant scans. Fault tolerant scans use READ_AT_SNAPSHOT mode, + // if no snapshot timestamp is provided, the server will pick one. 
+ Status SetFaultTolerant() WARN_UNUSED_RESULT; + + // Sets the snapshot timestamp, in microseconds since the epoch, for scans in + // READ_AT_SNAPSHOT mode. + Status SetSnapshotMicros(uint64_t snapshot_timestamp_micros) WARN_UNUSED_RESULT; + + // Sets the snapshot timestamp in raw encoded form (i.e. as returned by a + // previous call to a server), for scans in READ_AT_SNAPSHOT mode. + Status SetSnapshotRaw(uint64_t snapshot_timestamp) WARN_UNUSED_RESULT; + + // Sets the maximum time that Open() and NextBatch() are allowed to take. + Status SetTimeoutMillis(int millis); + + // Returns the schema of the projection being scanned. + KuduSchema GetProjectionSchema() const; + + // Returns a string representation of this scan. + std::string ToString() const; + private: + class KUDU_NO_EXPORT Data; + + FRIEND_TEST(ClientTest, TestScanCloseProxy); + FRIEND_TEST(ClientTest, TestScanFaultTolerance); + FRIEND_TEST(ClientTest, TestScanNoBlockCaching); + FRIEND_TEST(ClientTest, TestScanTimeout); + + // Owned. + Data* data_; + + DISALLOW_COPY_AND_ASSIGN(KuduScanner); +}; + +// In-memory representation of a remote tablet server. +class KUDU_EXPORT KuduTabletServer { + public: + ~KuduTabletServer(); + + // Returns the UUID of this tablet server. Is globally unique and + // guaranteed not to change for the lifetime of the tablet server. + const std::string& uuid() const; + + // Returns the hostname of the first RPC address that this tablet server + // is listening on. + const std::string& hostname() const; + + private: + class KUDU_NO_EXPORT Data; + + friend class KuduClient; + friend class KuduScanner; + + KuduTabletServer(); + + // Owned. 
+ Data* data_; + + DISALLOW_COPY_AND_ASSIGN(KuduTabletServer); +}; + +} // namespace client +} // namespace kudu +#endif diff --git a/src/kudu/client/clientConfig.cmake.in b/src/kudu/client/clientConfig.cmake.in new file mode 100644 index 000000000000..9eaf00c3c097 --- /dev/null +++ b/src/kudu/client/clientConfig.cmake.in @@ -0,0 +1,11 @@ +# Workaround so the generated cmake file works in older versions of +# cmake +if(NOT CMAKE_CURRENT_LIST_DIR) + get_filename_component(CMAKE_CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) +endif() + +@PACKAGE_INIT@ + +set_and_check(KUDU_CLIENT_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@") + +include("${CMAKE_CURRENT_LIST_DIR}/kuduClientTargets.cmake") diff --git a/src/kudu/client/client_builder-internal.cc b/src/kudu/client/client_builder-internal.cc new file mode 100644 index 000000000000..8e6dd034fd87 --- /dev/null +++ b/src/kudu/client/client_builder-internal.cc @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/client_builder-internal.h" + +namespace kudu { + +namespace client { + +KuduClientBuilder::Data::Data() + : default_admin_operation_timeout_(MonoDelta::FromSeconds(10)), + default_rpc_timeout_(MonoDelta::FromSeconds(5)) { +} + +KuduClientBuilder::Data::~Data() { +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/client_builder-internal.h b/src/kudu/client/client_builder-internal.h new file mode 100644 index 000000000000..3136b79243fa --- /dev/null +++ b/src/kudu/client/client_builder-internal.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CLIENT_CLIENT_BUILDER_INTERNAL_H +#define KUDU_CLIENT_CLIENT_BUILDER_INTERNAL_H + +#include +#include + +#include "kudu/client/client.h" + +namespace kudu { + +namespace client { + +class KuduClientBuilder::Data { + public: + Data(); + ~Data(); + + std::vector master_server_addrs_; + MonoDelta default_admin_operation_timeout_; + MonoDelta default_rpc_timeout_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/client_samples-test.sh b/src/kudu/client/client_samples-test.sh new file mode 100755 index 000000000000..5f0d9fcf5bad --- /dev/null +++ b/src/kudu/client/client_samples-test.sh @@ -0,0 +1,128 @@ +#!/bin/bash -xe +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Tests that the Kudu client library can be installed outside +# the build tree, that the installed headers are sane, and that +# the sample code can be built and runs correctly. + +# Clean up after the test. Must be idempotent. 
+cleanup() { + if [ -n "$TS_PID" ]; then + kill -9 "$TS_PID" || : + wait $TS_PID || : + fi + if [ -n "$MASTER_PID" ]; then + kill -9 "$MASTER_PID" || : + wait $MASTER_PID || : + fi + if [ -n "$BASE_DIR" -a -d "$BASE_DIR" ]; then + rm -rf "$BASE_DIR" + fi + if [ -n "$LIBRARY_DIR" -a -d "$LIBRARY_DIR" ]; then + rm -rf "$LIBRARY_DIR" + fi +} +trap cleanup EXIT + +OUTPUT_DIR=$(cd $(dirname "$BASH_SOURCE"); pwd) + +# Install the client library to a temporary directory. +# Try to detect whether we're building using Ninja or Make. +LIBRARY_DIR=$(mktemp -d -t kudu-samples-test.XXXXXXXXXXXXX) +PREFIX_DIR=$LIBRARY_DIR/usr/local +SAMPLES_DIR=$PREFIX_DIR/share/doc/kuduClient/samples +pushd $OUTPUT_DIR/.. +NINJA=$(which ninja 2>/dev/null) || NINJA="" +if [ -r build.ninja -a -n "$NINJA" ]; then + DESTDIR=$LIBRARY_DIR ninja install +else + make -j$(getconf _NPROCESSORS_ONLN) DESTDIR=$LIBRARY_DIR install +fi +popd + +# Test that all of the installed headers can be compiled on their own. +# This catches bugs where we've made a mistake in 'include-what-you-use' +# within the library. +for include_file in $(find $LIBRARY_DIR -name \*.h) ; do + echo Checking standalone compilation of $include_file... + if ! ${CXX:-g++} -o /dev/null -I$LIBRARY_DIR/usr/local/include $include_file ; then + set +x + echo + echo ----------------------------------------- + echo $include_file fails to build on its own. + echo See log above for details. + echo ----------------------------------------- + exit 1 + fi +done +# Prefer the cmake on the system path, since we expect our client library +# to be usable with older versions of cmake. But if it isn't there, +# use the one from thirdparty. +CMAKE=$(which cmake || :) +if [ -z "$CMAKE" ]; then + # TODO: temporary hack which assumes this script is in src/build//bin + CMAKE=$OUTPUT_DIR/../../../thirdparty/installed/bin/cmake +fi + +# Build the client samples using the client library. +# We can just always use Make here, since we're calling cmake ourselves. 
+pushd $SAMPLES_DIR +CMAKE_PREFIX_PATH=$PREFIX_DIR $CMAKE . +make -j$(getconf _NPROCESSORS_ONLN) +popd + +# Pick a unique localhost IP address so this can run in parallel with other +# tests. This only works on Linux. +LOCALHOST_IP=127.0.0.1 +if [ "$(uname)" == "Linux" ]; then + LOCALHOST_IP=127.$[($$ >> 8) & 0xff].$[$$ & 0xff].1 + echo Using unique localhost IP $LOCALHOST_IP +fi + + +# Start master+ts +export TMPDIR=${TMPDIR:-/tmp} +export TEST_TMPDIR=${TEST_TMPDIR:-$TMPDIR/kudutest-$UID} +mkdir -p $TEST_TMPDIR +BASE_DIR=$(mktemp -d $TEST_TMPDIR/client_samples-test.XXXXXXXX) +$OUTPUT_DIR/kudu-master \ + --default_num_replicas=1 \ + --log_dir=$BASE_DIR \ + --fs_wal_dir=$BASE_DIR/master \ + --fs_data_dirs=$BASE_DIR/master \ + --webserver_interface=localhost \ + --webserver_port=0 \ + --rpc_bind_addresses=$LOCALHOST_IP & +MASTER_PID=$! +$OUTPUT_DIR/kudu-tserver \ + --log_dir=$BASE_DIR \ + --fs_wal_dir=$BASE_DIR/ts \ + --fs_data_dirs=$BASE_DIR/ts \ + --rpc_bind_addresses=$LOCALHOST_IP \ + --local_ip_for_outbound_sockets=$LOCALHOST_IP \ + --webserver_interface=localhost \ + --webserver_port=0 \ + --tserver_master_addrs=$LOCALHOST_IP & +TS_PID=$! + +# Let them run for a bit. +sleep 5 + +# Run the samples. +$SAMPLES_DIR/sample $LOCALHOST_IP diff --git a/src/kudu/client/client_symbol-test.sh b/src/kudu/client/client_symbol-test.sh new file mode 100755 index 000000000000..2b5d0bcf0f7a --- /dev/null +++ b/src/kudu/client/client_symbol-test.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Tests that the dynamic symbols visible in the public client library +# (i.e. those available for runtime linking) are all approved Kudu symbols. + +NM=`which nm` +if [ -n $NM ]; then + echo "Found nm: $NM" +else + echo "Cannot find nm on PATH: $PATH" + exit 1 +fi + +if [[ "$OSTYPE" =~ ^linux ]]; then + DYLIB_SUFFIX="so" +elif [[ "$OSTYPE" == "darwin"* ]]; then + DYLIB_SUFFIX="dylib" +fi + +LIB=$(dirname "$BASH_SOURCE")/../lib/exported/libkudu_client.$DYLIB_SUFFIX +if [ -r $LIB ]; then + echo "Found kudu client library: $LIB" +else + echo "Can't read kudu client library at $LIB" + exit 1 +fi + +if [[ "$OSTYPE" =~ ^linux ]]; then + NM_COMMAND="${NM} -D --defined-only --demangle ${LIB}" +elif [[ "$OSTYPE" == "darwin"* ]]; then + NM_COMMAND="${NM} -U ${LIB} | c++filt" +fi + +NUM_BAD_SYMS=0 +while read ADDR TYPE SYMBOL; do + # Skip all symbols that aren't strong and global. + if [ "$TYPE" != "T" ]; then + echo "Skipping non-strong and non-global symbol '$SYMBOL'" + continue + fi + + # Skip special symbols. + if [ "$SYMBOL" = "_init" -o "$SYMBOL" = "_fini" ]; then + echo "Skipping special symbol '$SYMBOL'" + continue + fi + + # Skip Kudu symbols. Using [[ ]] for regex support. + if [[ "$SYMBOL" =~ ^kudu:: ]]; then + echo "Skipping kudu symbol '$SYMBOL'" + continue; + fi + + # KUDU-455: skip bizarro global symbol that remains when compiling with old gcc. + if [ "$SYMBOL" = "__gnu_cxx::hash::operator()(StringPiece) const" ]; then + echo "Skipping KUDU-455 symbol '$SYMBOL'" + continue + fi + + # Any left over symbol is bad. 
+ echo "Found bad symbol '$SYMBOL'" + NUM_BAD_SYMS=$((NUM_BAD_SYMS + 1)) +done < <($NM_COMMAND) + +if [ $NUM_BAD_SYMS -gt 0 ]; then + echo "Kudu client library contains $NUM_BAD_SYMS bad symbols" + exit 1 +fi + +exit 0 diff --git a/src/kudu/client/error-internal.cc b/src/kudu/client/error-internal.cc new file mode 100644 index 000000000000..959d2e0b3ceb --- /dev/null +++ b/src/kudu/client/error-internal.cc @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/error-internal.h" + +namespace kudu { + +namespace client { + +KuduError::Data::Data(gscoped_ptr failed_op, + const Status& status) : + failed_op_(failed_op.Pass()), + status_(status) { +} + +KuduError::Data::~Data() { +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/error-internal.h b/src/kudu/client/error-internal.h new file mode 100644 index 000000000000..78ccad4999a7 --- /dev/null +++ b/src/kudu/client/error-internal.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_ERROR_INTERNAL_H +#define KUDU_CLIENT_ERROR_INTERNAL_H + +#include "kudu/client/client.h" +#include "kudu/gutil/gscoped_ptr.h" + +namespace kudu { + +namespace client { + +class KuduError::Data { + public: + Data(gscoped_ptr failed_op, const Status& error); + ~Data(); + + gscoped_ptr failed_op_; + Status status_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/error_collector.cc b/src/kudu/client/error_collector.cc new file mode 100644 index 000000000000..987372f2a9f2 --- /dev/null +++ b/src/kudu/client/error_collector.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/client.h" +#include "kudu/client/error_collector.h" + +#include + +#include "kudu/gutil/stl_util.h" + +namespace kudu { +namespace client { +namespace internal { + +ErrorCollector::ErrorCollector() { +} + +ErrorCollector::~ErrorCollector() { + STLDeleteElements(&errors_); +} + +void ErrorCollector::AddError(gscoped_ptr error) { + lock_guard l(&lock_); + errors_.push_back(error.release()); +} + +int ErrorCollector::CountErrors() const { + lock_guard l(&lock_); + return errors_.size(); +} + +void ErrorCollector::GetErrors(std::vector* errors, bool* overflowed) { + lock_guard l(&lock_); + errors->swap(errors_); + *overflowed = false; +} + + +} // namespace internal +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/error_collector.h b/src/kudu/client/error_collector.h new file mode 100644 index 000000000000..c1ef640a8e28 --- /dev/null +++ b/src/kudu/client/error_collector.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CLIENT_ERROR_COLLECTOR_H +#define KUDU_CLIENT_ERROR_COLLECTOR_H + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace client { + +class KuduError; +class KuduInsert; + +namespace internal { + +class ErrorCollector : public RefCountedThreadSafe { + public: + ErrorCollector(); + + void AddError(gscoped_ptr error); + + // See KuduSession for details. + int CountErrors() const; + + // See KuduSession for details. + void GetErrors(std::vector* errors, bool* overflowed); + + private: + friend class RefCountedThreadSafe; + virtual ~ErrorCollector(); + + mutable simple_spinlock lock_; + std::vector errors_; + + DISALLOW_COPY_AND_ASSIGN(ErrorCollector); +}; + +} // namespace internal +} // namespace client +} // namespace kudu +#endif /* KUDU_CLIENT_ERROR_COLLECTOR_H */ diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc new file mode 100644 index 000000000000..61ae145d32af --- /dev/null +++ b/src/kudu/client/meta_cache.cc @@ -0,0 +1,658 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/meta_cache.h" + +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/client-internal.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/util/net/dns_resolver.h" +#include "kudu/util/net/net_util.h" + +using std::string; +using std::map; +using std::shared_ptr; +using strings::Substitute; + +namespace kudu { + +using consensus::RaftPeerPB; +using master::GetTableLocationsRequestPB; +using master::GetTableLocationsResponsePB; +using master::MasterServiceProxy; +using master::TabletLocationsPB; +using master::TabletLocationsPB_ReplicaPB; +using master::TSInfoPB; +using rpc::Messenger; +using rpc::Rpc; +using tserver::TabletServerServiceProxy; + +namespace client { + +namespace internal { + +//////////////////////////////////////////////////////////// + +RemoteTabletServer::RemoteTabletServer(const master::TSInfoPB& pb) + : uuid_(pb.permanent_uuid()) { + + Update(pb); +} + +void RemoteTabletServer::DnsResolutionFinished(const HostPort& hp, + vector* addrs, + KuduClient* client, + const StatusCallback& user_callback, + const Status &result_status) { + gscoped_ptr > scoped_addrs(addrs); + + Status s = result_status; + + if (s.ok() && addrs->empty()) { + s = Status::NotFound("No addresses for " + hp.ToString()); + } + + if (!s.ok()) { + s = s.CloneAndPrepend("Failed to resolve address for TS " + uuid_); + user_callback.Run(s); + return; + } + + VLOG(1) << "Successfully resolved " << hp.ToString() << ": " + << (*addrs)[0].ToString(); + + { + lock_guard l(&lock_); + proxy_.reset(new TabletServerServiceProxy(client->data_->messenger_, (*addrs)[0])); + } + user_callback.Run(s); +} + +void 
RemoteTabletServer::InitProxy(KuduClient* client, const StatusCallback& cb) { + HostPort hp; + { + unique_lock l(&lock_); + + if (proxy_) { + // Already have a proxy created. + l.unlock(); + cb.Run(Status::OK()); + return; + } + + CHECK(!rpc_hostports_.empty()); + // TODO: if the TS advertises multiple host/ports, pick the right one + // based on some kind of policy. For now just use the first always. + hp = rpc_hostports_[0]; + } + + auto addrs = new vector(); + client->data_->dns_resolver_->ResolveAddresses( + hp, addrs, Bind(&RemoteTabletServer::DnsResolutionFinished, + Unretained(this), hp, addrs, client, cb)); +} + +void RemoteTabletServer::Update(const master::TSInfoPB& pb) { + CHECK_EQ(pb.permanent_uuid(), uuid_); + + lock_guard l(&lock_); + + rpc_hostports_.clear(); + for (const HostPortPB& hostport_pb : pb.rpc_addresses()) { + rpc_hostports_.push_back(HostPort(hostport_pb.host(), hostport_pb.port())); + } +} + +string RemoteTabletServer::permanent_uuid() const { + return uuid_; +} + +shared_ptr RemoteTabletServer::proxy() const { + lock_guard l(&lock_); + CHECK(proxy_); + return proxy_; +} + +string RemoteTabletServer::ToString() const { + string ret = uuid_; + lock_guard l(&lock_); + if (!rpc_hostports_.empty()) { + strings::SubstituteAndAppend(&ret, " ($0)", rpc_hostports_[0].ToString()); + } + return ret; +} + +void RemoteTabletServer::GetHostPorts(vector* host_ports) const { + lock_guard l(&lock_); + *host_ports = rpc_hostports_; +} + +//////////////////////////////////////////////////////////// + + +void RemoteTablet::Refresh(const TabletServerMap& tservers, + const google::protobuf::RepeatedPtrField + & replicas) { + // Adopt the data from the successful response. 
+ lock_guard l(&lock_); + replicas_.clear(); + for (const TabletLocationsPB_ReplicaPB& r : replicas) { + RemoteReplica rep; + rep.ts = FindOrDie(tservers, r.ts_info().permanent_uuid()); + rep.role = r.role(); + rep.failed = false; + replicas_.push_back(rep); + } + stale_ = false; +} + +void RemoteTablet::MarkStale() { + lock_guard l(&lock_); + stale_ = true; +} + +bool RemoteTablet::stale() const { + lock_guard l(&lock_); + return stale_; +} + +bool RemoteTablet::MarkReplicaFailed(RemoteTabletServer *ts, + const Status& status) { + bool found = false; + lock_guard l(&lock_); + VLOG(2) << "Tablet " << tablet_id_ << ": Current remote replicas in meta cache: " + << ReplicasAsStringUnlocked(); + LOG(WARNING) << "Tablet " << tablet_id_ << ": Replica " << ts->ToString() + << " has failed: " << status.ToString(); + for (RemoteReplica& rep : replicas_) { + if (rep.ts == ts) { + rep.failed = true; + found = true; + } + } + return found; +} + +int RemoteTablet::GetNumFailedReplicas() const { + int failed = 0; + lock_guard l(&lock_); + for (const RemoteReplica& rep : replicas_) { + if (rep.failed) { + failed++; + } + } + return failed; +} + +RemoteTabletServer* RemoteTablet::LeaderTServer() const { + lock_guard l(&lock_); + for (const RemoteReplica& replica : replicas_) { + if (!replica.failed && replica.role == RaftPeerPB::LEADER) { + return replica.ts; + } + } + return nullptr; +} + +bool RemoteTablet::HasLeader() const { + return LeaderTServer() != nullptr; +} + +void RemoteTablet::GetRemoteTabletServers(vector* servers) const { + lock_guard l(&lock_); + for (const RemoteReplica& replica : replicas_) { + if (replica.failed) { + continue; + } + servers->push_back(replica.ts); + } +} + +void RemoteTablet::MarkTServerAsLeader(const RemoteTabletServer* server) { + bool found = false; + lock_guard l(&lock_); + for (RemoteReplica& replica : replicas_) { + if (replica.ts == server) { + replica.role = RaftPeerPB::LEADER; + found = true; + } else if (replica.role == 
RaftPeerPB::LEADER) { + replica.role = RaftPeerPB::FOLLOWER; + } + } + VLOG(3) << "Latest replicas: " << ReplicasAsStringUnlocked(); + DCHECK(found) << "Tablet " << tablet_id_ << ": Specified server not found: " + << server->ToString() << ". Replicas: " << ReplicasAsStringUnlocked(); +} + +void RemoteTablet::MarkTServerAsFollower(const RemoteTabletServer* server) { + bool found = false; + lock_guard l(&lock_); + for (RemoteReplica& replica : replicas_) { + if (replica.ts == server) { + replica.role = RaftPeerPB::FOLLOWER; + found = true; + } + } + VLOG(3) << "Latest replicas: " << ReplicasAsStringUnlocked(); + DCHECK(found) << "Tablet " << tablet_id_ << ": Specified server not found: " + << server->ToString() << ". Replicas: " << ReplicasAsStringUnlocked(); +} + +std::string RemoteTablet::ReplicasAsString() const { + lock_guard l(&lock_); + return ReplicasAsStringUnlocked(); +} + +std::string RemoteTablet::ReplicasAsStringUnlocked() const { + DCHECK(lock_.is_locked()); + string replicas_str; + for (const RemoteReplica& rep : replicas_) { + if (!replicas_str.empty()) replicas_str += ", "; + strings::SubstituteAndAppend(&replicas_str, "$0 ($1, $2)", + rep.ts->permanent_uuid(), + RaftPeerPB::Role_Name(rep.role), + rep.failed ? "FAILED" : "OK"); + } + return replicas_str; +} + +//////////////////////////////////////////////////////////// + +MetaCache::MetaCache(KuduClient* client) + : client_(client), + master_lookup_sem_(50) { +} + +MetaCache::~MetaCache() { + STLDeleteValues(&ts_cache_); +} + +void MetaCache::UpdateTabletServer(const TSInfoPB& pb) { + DCHECK(lock_.is_write_locked()); + RemoteTabletServer* ts = FindPtrOrNull(ts_cache_, pb.permanent_uuid()); + if (ts) { + ts->Update(pb); + return; + } + + VLOG(1) << "Client caching new TabletServer " << pb.permanent_uuid(); + InsertOrDie(&ts_cache_, pb.permanent_uuid(), new RemoteTabletServer(pb)); +} + +// A (table, partition_key) --> tablet lookup. May be in-flight to a master, or +// may be handled locally. 
+// +// Keeps a reference on the owning metacache while alive. +class LookupRpc : public Rpc { + public: + LookupRpc(const scoped_refptr& meta_cache, + StatusCallback user_cb, + const KuduTable* table, + string partition_key, + scoped_refptr* remote_tablet, + const MonoTime& deadline, + const shared_ptr& messenger); + virtual ~LookupRpc(); + virtual void SendRpc() OVERRIDE; + virtual string ToString() const OVERRIDE; + + const GetTableLocationsResponsePB& resp() const { return resp_; } + const string& table_name() const { return table_->name(); } + const string& table_id() const { return table_->id(); } + + private: + virtual void SendRpcCb(const Status& status) OVERRIDE; + + std::shared_ptr master_proxy() const { + return table_->client()->data_->master_proxy(); + } + + void ResetMasterLeaderAndRetry(); + + void NewLeaderMasterDeterminedCb(const Status& status); + + // Pointer back to the tablet cache. Populated with location information + // if the lookup finishes successfully. + // + // When the RPC is destroyed, a master lookup permit is returned to the + // cache if one was acquired in the first place. + scoped_refptr meta_cache_; + + // Request body. + GetTableLocationsRequestPB req_; + + // Response body. + GetTableLocationsResponsePB resp_; + + // User-specified callback to invoke when the lookup finishes. + // + // Always invoked, regardless of success or failure. + StatusCallback user_cb_; + + // Table to lookup. + const KuduTable* table_; + + // Encoded partition key to lookup. + string partition_key_; + + // When lookup finishes successfully, the selected tablet is + // written here prior to invoking 'user_cb_'. + scoped_refptr *remote_tablet_; + + // Whether this lookup has acquired a master lookup permit. 
+ bool has_permit_; +}; + +LookupRpc::LookupRpc(const scoped_refptr& meta_cache, + StatusCallback user_cb, const KuduTable* table, + string partition_key, + scoped_refptr* remote_tablet, + const MonoTime& deadline, + const shared_ptr& messenger) + : Rpc(deadline, messenger), + meta_cache_(meta_cache), + user_cb_(std::move(user_cb)), + table_(table), + partition_key_(std::move(partition_key)), + remote_tablet_(remote_tablet), + has_permit_(false) { + DCHECK(deadline.Initialized()); +} + +LookupRpc::~LookupRpc() { + if (has_permit_) { + meta_cache_->ReleaseMasterLookupPermit(); + } +} + +void LookupRpc::SendRpc() { + // Fast path: lookup in the cache. + scoped_refptr result; + if (PREDICT_TRUE(meta_cache_->LookupTabletByKeyFastPath(table_, partition_key_, &result)) && + result->HasLeader()) { + VLOG(3) << "Fast lookup: found tablet " << result->tablet_id() + << " for " << table_->partition_schema() + .PartitionKeyDebugString(partition_key_, *table_->schema().schema_) + << " of " << table_->name(); + if (remote_tablet_) { + *remote_tablet_ = result; + } + user_cb_.Run(Status::OK()); + delete this; + return; + } + + // Slow path: must lookup the tablet in the master. + VLOG(3) << "Fast lookup: no known tablet" + << " for " << table_->partition_schema() + .PartitionKeyDebugString(partition_key_, *table_->schema().schema_) + << " of " << table_->name() + << ": refreshing our metadata from the Master"; + + if (!has_permit_) { + has_permit_ = meta_cache_->AcquireMasterLookupPermit(); + } + if (!has_permit_) { + // Couldn't get a permit, try again in a little while. + mutable_retrier()->DelayedRetry(this, Status::TimedOut( + "client has too many outstanding requests to the master")); + return; + } + + // Fill out the request. + req_.mutable_table()->set_table_id(table_->id()); + req_.set_partition_key_start(partition_key_); + + // The end partition key is left unset intentionally so that we'll prefetch + // some additional tablets. 
+ + // See KuduClient::Data::SyncLeaderMasterRpc(). + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (retrier().deadline().ComesBefore(now)) { + SendRpcCb(Status::TimedOut("timed out after deadline expired")); + return; + } + MonoTime rpc_deadline = now; + rpc_deadline.AddDelta(meta_cache_->client_->default_rpc_timeout()); + mutable_retrier()->mutable_controller()->set_deadline( + MonoTime::Earliest(rpc_deadline, retrier().deadline())); + + master_proxy()->GetTableLocationsAsync(req_, &resp_, + mutable_retrier()->mutable_controller(), + boost::bind(&LookupRpc::SendRpcCb, this, Status::OK())); +} + +string LookupRpc::ToString() const { + return Substitute("GetTableLocations($0, $1, $2)", + table_->name(), + table_->partition_schema() + .PartitionKeyDebugString(partition_key_, *table_->schema().schema_), + num_attempts()); +} + +void LookupRpc::ResetMasterLeaderAndRetry() { + table_->client()->data_->SetMasterServerProxyAsync( + table_->client(), + retrier().deadline(), + Bind(&LookupRpc::NewLeaderMasterDeterminedCb, + Unretained(this))); +} + +void LookupRpc::NewLeaderMasterDeterminedCb(const Status& status) { + if (status.ok()) { + mutable_retrier()->mutable_controller()->Reset(); + SendRpc(); + } else { + LOG(WARNING) << "Failed to determine new Master: " << status.ToString(); + mutable_retrier()->DelayedRetry(this, status); + } +} + +void LookupRpc::SendRpcCb(const Status& status) { + gscoped_ptr delete_me(this); // delete on scope exit + + // Prefer early failures over controller failures. + Status new_status = status; + if (new_status.ok() && mutable_retrier()->HandleResponse(this, &new_status)) { + ignore_result(delete_me.release()); + return; + } + + // Prefer controller failures over response failures. 
+ if (new_status.ok() && resp_.has_error()) { + if (resp_.error().code() == master::MasterErrorPB::NOT_THE_LEADER || + resp_.error().code() == master::MasterErrorPB::CATALOG_MANAGER_NOT_INITIALIZED) { + if (meta_cache_->client_->IsMultiMaster()) { + LOG(WARNING) << "Leader Master has changed, re-trying..."; + ResetMasterLeaderAndRetry(); + ignore_result(delete_me.release()); + return; + } + } + new_status = StatusFromPB(resp_.error().status()); + } + + if (new_status.IsTimedOut()) { + if (MonoTime::Now(MonoTime::FINE).ComesBefore(retrier().deadline())) { + if (meta_cache_->client_->IsMultiMaster()) { + LOG(WARNING) << "Leader Master timed out, re-trying..."; + ResetMasterLeaderAndRetry(); + ignore_result(delete_me.release()); + return; + } + } else { + // Operation deadline expired during this latest RPC. + new_status = new_status.CloneAndPrepend( + "timed out after deadline expired"); + } + } + + if (new_status.IsNetworkError()) { + if (meta_cache_->client_->IsMultiMaster()) { + LOG(WARNING) << "Encountered a network error from the Master: " << new_status.ToString() + << ", retrying..."; + ResetMasterLeaderAndRetry(); + ignore_result(delete_me.release()); + return; + } + } + + // Prefer response failures over no tablets found. + if (new_status.ok() && resp_.tablet_locations_size() == 0) { + new_status = Status::NotFound("No such tablet found"); + } + + if (new_status.ok()) { + const scoped_refptr& result = + meta_cache_->ProcessLookupResponse(*this); + if (remote_tablet_) { + *remote_tablet_ = result; + } + } else { + new_status = new_status.CloneAndPrepend(Substitute("$0 failed", ToString())); + LOG(WARNING) << new_status.ToString(); + } + user_cb_.Run(new_status); +} + +const scoped_refptr& MetaCache::ProcessLookupResponse(const LookupRpc& rpc) { + VLOG(2) << "Processing master response for " << rpc.ToString() + << ". 
Response: " << rpc.resp().ShortDebugString(); + + lock_guard l(&lock_); + TabletMap& tablets_by_key = LookupOrInsert(&tablets_by_table_and_key_, + rpc.table_id(), TabletMap()); + for (const TabletLocationsPB& loc : rpc.resp().tablet_locations()) { + // First, update the tserver cache, needed for the Refresh calls below. + for (const TabletLocationsPB_ReplicaPB& r : loc.replicas()) { + UpdateTabletServer(r.ts_info()); + } + + // Next, update the tablet caches. + string tablet_id = loc.tablet_id(); + scoped_refptr remote = FindPtrOrNull(tablets_by_id_, tablet_id); + if (remote.get() != nullptr) { + // Partition should not have changed. + DCHECK_EQ(loc.partition().partition_key_start(), remote->partition().partition_key_start()); + DCHECK_EQ(loc.partition().partition_key_end(), remote->partition().partition_key_end()); + + VLOG(3) << "Refreshing tablet " << tablet_id << ": " + << loc.ShortDebugString(); + remote->Refresh(ts_cache_, loc.replicas()); + continue; + } + + VLOG(3) << "Caching tablet " << tablet_id << " for (" << rpc.table_name() + << "): " << loc.ShortDebugString(); + + Partition partition; + Partition::FromPB(loc.partition(), &partition); + remote = new RemoteTablet(tablet_id, partition); + remote->Refresh(ts_cache_, loc.replicas()); + + InsertOrDie(&tablets_by_id_, tablet_id, remote); + InsertOrDie(&tablets_by_key, partition.partition_key_start(), remote); + } + + // Always return the first tablet. + return FindOrDie(tablets_by_id_, rpc.resp().tablet_locations(0).tablet_id()); +} + +bool MetaCache::LookupTabletByKeyFastPath(const KuduTable* table, + const string& partition_key, + scoped_refptr* remote_tablet) { + shared_lock l(&lock_); + const TabletMap* tablets = FindOrNull(tablets_by_table_and_key_, table->id()); + if (PREDICT_FALSE(!tablets)) { + // No cache available for this table. 
+ return false; + } + + const scoped_refptr* r = FindFloorOrNull(*tablets, partition_key); + if (PREDICT_FALSE(!r)) { + // No tablets with a start partition key lower than 'partition_key'. + return false; + } + + // Stale entries must be re-fetched. + if ((*r)->stale()) { + return false; + } + + if ((*r)->partition().partition_key_end().compare(partition_key) > 0 || + (*r)->partition().partition_key_end().empty()) { + // partition_key < partition.end OR tablet doesn't end. + *remote_tablet = *r; + return true; + } + + return false; +} + +void MetaCache::LookupTabletByKey(const KuduTable* table, + const string& partition_key, + const MonoTime& deadline, + scoped_refptr* remote_tablet, + const StatusCallback& callback) { + LookupRpc* rpc = new LookupRpc(this, + callback, + table, + partition_key, + remote_tablet, + deadline, + client_->data_->messenger_); + rpc->SendRpc(); +} + +void MetaCache::MarkTSFailed(RemoteTabletServer* ts, + const Status& status) { + LOG(INFO) << "Marking tablet server " << ts->ToString() << " as failed."; + shared_lock l(&lock_); + + Status ts_status = status.CloneAndPrepend("TS failed"); + + // TODO: replace with a ts->tablet multimap for faster lookup? + for (const TabletMap::value_type& tablet : tablets_by_id_) { + // We just loop on all tablets; if a tablet does not have a replica on this + // TS, MarkReplicaFailed() returns false and we ignore the return value. 
+ tablet.second->MarkReplicaFailed(ts, ts_status); + } +} + +bool MetaCache::AcquireMasterLookupPermit() { + return master_lookup_sem_.TryAcquire(); +} + +void MetaCache::ReleaseMasterLookupPermit() { + master_lookup_sem_.Release(); +} + +} // namespace internal +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/meta_cache.h b/src/kudu/client/meta_cache.h new file mode 100644 index 000000000000..f99df73afa5e --- /dev/null +++ b/src/kudu/client/meta_cache.h @@ -0,0 +1,294 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This module is internal to the client and not a public API. 
+#ifndef KUDU_CLIENT_META_CACHE_H +#define KUDU_CLIENT_META_CACHE_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/partition.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/async_util.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/semaphore.h" +#include "kudu/util/status.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/net/net_util.h" + +namespace kudu { + +class KuduPartialRow; + +namespace tserver { +class TabletServerServiceProxy; +} // namespace tserver + +namespace master { +class MasterServiceProxy; +class TabletLocationsPB_ReplicaPB; +class TSInfoPB; +} // namespace master + +namespace client { + +class ClientTest_TestMasterLookupPermits_Test; +class KuduClient; +class KuduTable; + +namespace internal { + +class LookupRpc; + +// The information cached about a given tablet server in the cluster. +// +// This class is thread-safe. +class RemoteTabletServer { + public: + explicit RemoteTabletServer(const master::TSInfoPB& pb); + + // Initialize the RPC proxy to this tablet server, if it is not already set up. + // This will involve a DNS lookup if there is not already an active proxy. + // If there is an active proxy, does nothing. + void InitProxy(KuduClient* client, const StatusCallback& cb); + + // Update information from the given pb. + // Requires that 'pb''s UUID matches this server. + void Update(const master::TSInfoPB& pb); + + // Return the current proxy to this tablet server. Requires that InitProxy() + // be called prior to this. + std::shared_ptr proxy() const; + + std::string ToString() const; + + void GetHostPorts(std::vector* host_ports) const; + + // Returns the remote server's uuid. + std::string permanent_uuid() const; + + private: + // Internal callback for DNS resolution. 
+ void DnsResolutionFinished(const HostPort& hp, + std::vector* addrs, + KuduClient* client, + const StatusCallback& user_callback, + const Status &result_status); + + mutable simple_spinlock lock_; + const std::string uuid_; + + std::vector rpc_hostports_; + std::shared_ptr proxy_; + + DISALLOW_COPY_AND_ASSIGN(RemoteTabletServer); +}; + +struct RemoteReplica { + RemoteTabletServer* ts; + consensus::RaftPeerPB::Role role; + bool failed; +}; + +typedef std::unordered_map TabletServerMap; + +// The client's view of a given tablet. This object manages lookups of +// the tablet's locations, status, etc. +// +// This class is thread-safe. +class RemoteTablet : public RefCountedThreadSafe { + public: + RemoteTablet(std::string tablet_id, + Partition partition) + : tablet_id_(std::move(tablet_id)), + partition_(std::move(partition)), + stale_(false) { + } + + // Updates this tablet's replica locations. + void Refresh(const TabletServerMap& tservers, + const google::protobuf::RepeatedPtrField + & replicas); + + // Mark this tablet as stale, indicating that the cached tablet metadata is + // out of date. Staleness is checked by the MetaCache when + // LookupTabletByKey() is called to determine whether the fast (non-network) + // path can be used or whether the metadata must be refreshed from the Master. + void MarkStale(); + + // Whether the tablet has been marked as stale. + bool stale() const; + + // Mark any replicas of this tablet hosted by 'ts' as failed. They will + // not be returned in future cache lookups. + // + // The provided status is used for logging. + // Returns true if 'ts' was found among this tablet's replicas, false if not. + bool MarkReplicaFailed(RemoteTabletServer *ts, + const Status& status); + + // Return the number of failed replicas for this tablet. + int GetNumFailedReplicas() const; + + // Return the tablet server which is acting as the current LEADER for + // this tablet, provided it hasn't failed. 
+ // + // Returns NULL if there is currently no leader, or if the leader has + // failed. Given that the replica list may change at any time, + // callers should always check the result against NULL. + RemoteTabletServer* LeaderTServer() const; + + // Writes this tablet's TSes (across all replicas) to 'servers'. Skips + // failed replicas. + void GetRemoteTabletServers(std::vector* servers) const; + + // Return true if the tablet currently has a known LEADER replica + // (i.e the next call to LeaderTServer() is likely to return non-NULL) + bool HasLeader() const; + + const std::string& tablet_id() const { return tablet_id_; } + + const Partition& partition() const { + return partition_; + } + + // Mark the specified tablet server as the leader of the consensus configuration in the cache. + void MarkTServerAsLeader(const RemoteTabletServer* server); + + // Mark the specified tablet server as a follower in the cache. + void MarkTServerAsFollower(const RemoteTabletServer* server); + + // Return stringified representation of the list of replicas for this tablet. + std::string ReplicasAsString() const; + + private: + // Same as ReplicasAsString(), except that the caller must hold lock_. + std::string ReplicasAsStringUnlocked() const; + + const std::string tablet_id_; + const Partition partition_; + + // All non-const members are protected by 'lock_'. + mutable simple_spinlock lock_; + bool stale_; + std::vector replicas_; + + DISALLOW_COPY_AND_ASSIGN(RemoteTablet); +}; + +// Manager of RemoteTablets and RemoteTabletServers. The client consults +// this class to look up a given tablet or server. +// +// This class will also be responsible for cache eviction policies, etc. +class MetaCache : public RefCountedThreadSafe { + public: + // The passed 'client' object must remain valid as long as MetaCache is alive. + explicit MetaCache(KuduClient* client); + ~MetaCache(); + + // Look up which tablet hosts the given partition key for a table. 
When it is + // available, the tablet is stored in 'remote_tablet' (if not NULL) and the + // callback is fired. Only tablets with non-failed LEADERs are considered. + // + // NOTE: the callback may be called from an IO thread or inline with this + // call if the cached data is already available. + // + // NOTE: the memory referenced by 'table' must remain valid until 'callback' + // is invoked. + void LookupTabletByKey(const KuduTable* table, + const std::string& partition_key, + const MonoTime& deadline, + scoped_refptr* remote_tablet, + const StatusCallback& callback); + + // Mark any replicas of any tablets hosted by 'ts' as failed. They will + // not be returned in future cache lookups. + void MarkTSFailed(RemoteTabletServer* ts, const Status& status); + + // Acquire or release a permit to perform a (slow) master lookup. + // + // If acquisition fails, caller may still do the lookup, but is first + // blocked for a short time to prevent lookup storms. + bool AcquireMasterLookupPermit(); + void ReleaseMasterLookupPermit(); + + private: + friend class LookupRpc; + + FRIEND_TEST(client::ClientTest, TestMasterLookupPermits); + + // Called on the slow LookupTablet path when the master responds. Populates + // the tablet caches and returns a reference to the first one. + const scoped_refptr& ProcessLookupResponse(const LookupRpc& rpc); + + // Lookup the given tablet by key, only consulting local information. + // Returns true and sets *remote_tablet if successful. + bool LookupTabletByKeyFastPath(const KuduTable* table, + const std::string& partition_key, + scoped_refptr* remote_tablet); + + // Update our information about the given tablet server. + // + // This is called when we get some response from the master which contains + // the latest host/port info for a server. + // + // NOTE: Must be called with lock_ held. 
+ void UpdateTabletServer(const master::TSInfoPB& pb); + + KuduClient* client_; + + rw_spinlock lock_; + + // Cache of Tablet Server locations: TS UUID -> RemoteTabletServer*. + // + // Given that the set of tablet servers is bounded by physical machines, we never + // evict entries from this map until the MetaCache is destructed. So, no need to use + // shared_ptr, etc. + // + // Protected by lock_. + TabletServerMap ts_cache_; + + // Cache of tablets, keyed by table ID, then by start partition key. + // + // Protected by lock_. + typedef std::map > TabletMap; + std::unordered_map tablets_by_table_and_key_; + + // Cache of tablets, keyed by tablet ID. + // + // Protected by lock_ + std::unordered_map > tablets_by_id_; + + // Prevents master lookup "storms" by delaying master lookups when all + // permits have been acquired. + Semaphore master_lookup_sem_; + + DISALLOW_COPY_AND_ASSIGN(MetaCache); +}; + +} // namespace internal +} // namespace client +} // namespace kudu +#endif /* KUDU_CLIENT_META_CACHE_H */ diff --git a/src/kudu/client/row_result.h b/src/kudu/client/row_result.h new file mode 100644 index 000000000000..f1f29ee9b325 --- /dev/null +++ b/src/kudu/client/row_result.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_ROW_RESULT_H +#define KUDU_CLIENT_ROW_RESULT_H + +#include "kudu/client/scan_batch.h" + +namespace kudu { +namespace client { + +// DEPRECATED: Kudu 0.7.0 renamed KuduRowResult to KuduScanBatch::RowPtr. +// The newer name is clearer that the row result's lifetime is tied to the +// lifetime of a batch. +typedef KuduScanBatch::RowPtr KuduRowResult; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/samples/CMakeLists.txt b/src/kudu/client/samples/CMakeLists.txt new file mode 100644 index 000000000000..f4c499053689 --- /dev/null +++ b/src/kudu/client/samples/CMakeLists.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Client API sample executables + +cmake_minimum_required(VERSION 2.8) + +find_package(kuduClient REQUIRED) +include_directories(${KUDU_CLIENT_INCLUDE_DIR}) + +# The Kudu client library always uses the old gcc ABI. 
+add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + +add_executable(sample sample.cc) +target_link_libraries(sample kudu_client) diff --git a/src/kudu/client/samples/sample.cc b/src/kudu/client/samples/sample.cc new file mode 100644 index 000000000000..340d01c68240 --- /dev/null +++ b/src/kudu/client/samples/sample.cc @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/client/callbacks.h" +#include "kudu/client/client.h" +#include "kudu/client/row_result.h" +#include "kudu/client/stubs.h" +#include "kudu/client/value.h" +#include "kudu/common/partial_row.h" + +using kudu::client::KuduClient; +using kudu::client::KuduClientBuilder; +using kudu::client::KuduColumnSchema; +using kudu::client::KuduError; +using kudu::client::KuduInsert; +using kudu::client::KuduPredicate; +using kudu::client::KuduRowResult; +using kudu::client::KuduScanner; +using kudu::client::KuduSchema; +using kudu::client::KuduSchemaBuilder; +using kudu::client::KuduSession; +using kudu::client::KuduStatusFunctionCallback; +using kudu::client::KuduTable; +using kudu::client::KuduTableAlterer; +using kudu::client::KuduTableCreator; +using kudu::client::KuduValue; +using kudu::client::sp::shared_ptr; +using kudu::KuduPartialRow; +using kudu::MonoDelta; +using kudu::Status; + +using std::string; +using std::stringstream; +using std::vector; + +static Status CreateClient(const string& addr, + shared_ptr* client) { + return KuduClientBuilder() + .add_master_server_addr(addr) + .default_admin_operation_timeout(MonoDelta::FromSeconds(20)) + .Build(client); +} + +static KuduSchema CreateSchema() { + KuduSchema schema; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("int_val")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("string_val")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("non_null_with_default")->Type(KuduColumnSchema::INT32)->NotNull() + ->Default(KuduValue::FromInt(12345)); + KUDU_CHECK_OK(b.Build(&schema)); + return schema; +} + +static Status DoesTableExist(const shared_ptr& client, + const string& table_name, + bool *exists) { + shared_ptr table; + Status s = client->OpenTable(table_name, &table); + if (s.ok()) { + *exists = true; + } else if (s.IsNotFound()) { + *exists = false; + s = Status::OK(); + } + return s; +} + 
+static Status CreateTable(const shared_ptr& client, + const string& table_name, + const KuduSchema& schema, + int num_tablets) { + // Generate the split keys for the table. + vector splits; + int32_t increment = 1000 / num_tablets; + for (int32_t i = 1; i < num_tablets; i++) { + KuduPartialRow* row = schema.NewRow(); + KUDU_CHECK_OK(row->SetInt32(0, i * increment)); + splits.push_back(row); + } + + // Create the table. + KuduTableCreator* table_creator = client->NewTableCreator(); + Status s = table_creator->table_name(table_name) + .schema(&schema) + .split_rows(splits) + .Create(); + delete table_creator; + return s; +} + +static Status AlterTable(const shared_ptr& client, + const string& table_name) { + KuduTableAlterer* table_alterer = client->NewTableAlterer(table_name); + table_alterer->AlterColumn("int_val")->RenameTo("integer_val"); + table_alterer->AddColumn("another_val")->Type(KuduColumnSchema::BOOL); + table_alterer->DropColumn("string_val"); + Status s = table_alterer->Alter(); + delete table_alterer; + return s; +} + +static void StatusCB(void* unused, const Status& status) { + KUDU_LOG(INFO) << "Asynchronous flush finished with status: " + << status.ToString(); +} + +static Status InsertRows(const shared_ptr& table, int num_rows) { + shared_ptr session = table->client()->NewSession(); + KUDU_RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(5000); + + for (int i = 0; i < num_rows; i++) { + KuduInsert* insert = table->NewInsert(); + KuduPartialRow* row = insert->mutable_row(); + KUDU_CHECK_OK(row->SetInt32("key", i)); + KUDU_CHECK_OK(row->SetInt32("integer_val", i * 2)); + KUDU_CHECK_OK(row->SetInt32("non_null_with_default", i * 5)); + KUDU_CHECK_OK(session->Apply(insert)); + } + Status s = session->Flush(); + if (s.ok()) { + return s; + } + + // Test asynchronous flush. + KuduStatusFunctionCallback status_cb(&StatusCB, NULL); + session->FlushAsync(&status_cb); + + // Look at the session's errors. 
+ vector errors; + bool overflow; + session->GetPendingErrors(&errors, &overflow); + s = overflow ? Status::IOError("Overflowed pending errors in session") : + errors.front()->status(); + while (!errors.empty()) { + delete errors.back(); + errors.pop_back(); + } + KUDU_RETURN_NOT_OK(s); + + // Close the session. + return session->Close(); +} + +static Status ScanRows(const shared_ptr& table) { + const int kLowerBound = 5; + const int kUpperBound = 600; + + KuduScanner scanner(table.get()); + + // Add a predicate: WHERE key >= 5 + KuduPredicate* p = table->NewComparisonPredicate( + "key", KuduPredicate::GREATER_EQUAL, KuduValue::FromInt(kLowerBound)); + KUDU_RETURN_NOT_OK(scanner.AddConjunctPredicate(p)); + + // Add a predicate: WHERE key <= 600 + p = table->NewComparisonPredicate( + "key", KuduPredicate::LESS_EQUAL, KuduValue::FromInt(kUpperBound)); + KUDU_RETURN_NOT_OK(scanner.AddConjunctPredicate(p)); + + KUDU_RETURN_NOT_OK(scanner.Open()); + vector results; + + int next_row = kLowerBound; + while (scanner.HasMoreRows()) { + KUDU_RETURN_NOT_OK(scanner.NextBatch(&results)); + for (vector::iterator iter = results.begin(); + iter != results.end(); + iter++, next_row++) { + const KuduRowResult& result = *iter; + int32_t val; + KUDU_RETURN_NOT_OK(result.GetInt32("key", &val)); + if (val != next_row) { + stringstream out; + out << "Scan returned the wrong results. Expected key " + << next_row << " but got " << val; + return Status::IOError(out.str()); + } + } + results.clear(); + } + + // next_row is now one past the last row we read. + int last_row_seen = next_row - 1; + + if (last_row_seen != kUpperBound) { + stringstream out; + out << "Scan returned the wrong results. 
Expected last row to be " + << kUpperBound << " rows but got " << last_row_seen; + return Status::IOError(out.str()); + } + return Status::OK(); +} + +static void LogCb(void* unused, + kudu::client::KuduLogSeverity severity, + const char* filename, + int line_number, + const struct ::tm* time, + const char* message, + size_t message_len) { + KUDU_LOG(INFO) << "Received log message from Kudu client library"; + KUDU_LOG(INFO) << " Severity: " << severity; + KUDU_LOG(INFO) << " Filename: " << filename; + KUDU_LOG(INFO) << " Line number: " << line_number; + char time_buf[32]; + // Example: Tue Mar 24 11:46:43 2015. + KUDU_CHECK(strftime(time_buf, sizeof(time_buf), "%a %b %d %T %Y", time)); + KUDU_LOG(INFO) << " Time: " << time_buf; + KUDU_LOG(INFO) << " Message: " << string(message, message_len); +} + +int main(int argc, char* argv[]) { + kudu::client::KuduLoggingFunctionCallback log_cb(&LogCb, NULL); + kudu::client::InstallLoggingCallback(&log_cb); + + if (argc != 2) { + KUDU_LOG(FATAL) << "usage: " << argv[0] << " "; + } + const string master_host = argv[1]; + + const string kTableName = "test_table"; + + // Enable verbose debugging for the client library. + kudu::client::SetVerboseLogLevel(2); + + // Create and connect a client. + shared_ptr client; + KUDU_CHECK_OK(CreateClient(master_host, &client)); + KUDU_LOG(INFO) << "Created a client connection"; + + // Disable the verbose logging. + kudu::client::SetVerboseLogLevel(0); + + // Create a schema. + KuduSchema schema(CreateSchema()); + KUDU_LOG(INFO) << "Created a schema"; + + // Create a table with that schema. + bool exists; + KUDU_CHECK_OK(DoesTableExist(client, kTableName, &exists)); + if (exists) { + client->DeleteTable(kTableName); + KUDU_LOG(INFO) << "Deleting old table before creating new one"; + } + KUDU_CHECK_OK(CreateTable(client, kTableName, schema, 10)); + KUDU_LOG(INFO) << "Created a table"; + + // Alter the table. 
+ KUDU_CHECK_OK(AlterTable(client, kTableName)); + KUDU_LOG(INFO) << "Altered a table"; + + // Insert some rows into the table. + shared_ptr table; + KUDU_CHECK_OK(client->OpenTable(kTableName, &table)); + KUDU_CHECK_OK(InsertRows(table, 1000)); + KUDU_LOG(INFO) << "Inserted some rows into a table"; + + // Scan some rows. + KUDU_CHECK_OK(ScanRows(table)); + KUDU_LOG(INFO) << "Scanned some rows out of a table"; + + // Delete the table. + KUDU_CHECK_OK(client->DeleteTable(kTableName)); + KUDU_LOG(INFO) << "Deleted a table"; + + // Done! + KUDU_LOG(INFO) << "Done"; + return 0; +} diff --git a/src/kudu/client/scan_batch.cc b/src/kudu/client/scan_batch.cc new file mode 100644 index 000000000000..fef9c9834c5c --- /dev/null +++ b/src/kudu/client/scan_batch.cc @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/row_result.h" +#include "kudu/client/scan_batch.h" +#include "kudu/client/scanner-internal.h" +#include "kudu/client/schema.h" + +#include + +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/bitmap.h" + +using std::string; +using strings::Substitute; + +namespace kudu { +namespace client { + +//////////////////////////////////////////////////////////// +// KuduScanBatch +//////////////////////////////////////////////////////////// + +KuduScanBatch::KuduScanBatch() : data_(new Data()) {} + +KuduScanBatch::~KuduScanBatch() { + delete data_; +} + +int KuduScanBatch::NumRows() const { + return data_->num_rows(); +} + +KuduRowResult KuduScanBatch::Row(int idx) const { + return data_->row(idx); +} + +const KuduSchema* KuduScanBatch::projection_schema() const { + return data_->client_projection_; +} + +//////////////////////////////////////////////////////////// +// KuduScanBatch::RowPtr +//////////////////////////////////////////////////////////// + +namespace { + +inline Status FindColumn(const Schema& schema, const Slice& col_name, int* idx) { + StringPiece sp(reinterpret_cast(col_name.data()), col_name.size()); + *idx = schema.find_column(sp); + if (PREDICT_FALSE(*idx == -1)) { + return Status::NotFound("No such column", col_name); + } + return Status::OK(); +} + +// Just enough of a "cell" to support the Schema::DebugCellAppend calls +// made by KuduScanBatch::RowPtr::ToString. 
+class RowCell { + public: + RowCell(const KuduScanBatch::RowPtr* row, int idx) + : row_(row), + col_idx_(idx) { + } + + bool is_null() const { + return row_->IsNull(col_idx_); + } + const void* ptr() const { + return row_->cell(col_idx_); + } + + private: + const KuduScanBatch::RowPtr* row_; + const int col_idx_; +}; + +} // anonymous namespace + +bool KuduScanBatch::RowPtr::IsNull(int col_idx) const { + const ColumnSchema& col = schema_->column(col_idx); + if (!col.is_nullable()) { + return false; + } + + return BitmapTest(row_data_ + schema_->byte_size(), col_idx); +} + +bool KuduScanBatch::RowPtr::IsNull(const Slice& col_name) const { + int col_idx; + CHECK_OK(FindColumn(*schema_, col_name, &col_idx)); + return IsNull(col_idx); +} + +Status KuduScanBatch::RowPtr::GetBool(const Slice& col_name, bool* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetInt8(const Slice& col_name, int8_t* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetInt16(const Slice& col_name, int16_t* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetInt32(const Slice& col_name, int32_t* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetInt64(const Slice& col_name, int64_t* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetTimestamp(const Slice& col_name, int64_t* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetFloat(const Slice& col_name, float* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetDouble(const Slice& col_name, double* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetString(const Slice& col_name, Slice* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetBinary(const Slice& col_name, Slice* val) const { + return Get >(col_name, val); +} + +Status KuduScanBatch::RowPtr::GetBool(int 
col_idx, bool* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetInt8(int col_idx, int8_t* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetInt16(int col_idx, int16_t* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetInt32(int col_idx, int32_t* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetInt64(int col_idx, int64_t* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetTimestamp(int col_idx, int64_t* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetFloat(int col_idx, float* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetDouble(int col_idx, double* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetString(int col_idx, Slice* val) const { + return Get >(col_idx, val); +} + +Status KuduScanBatch::RowPtr::GetBinary(int col_idx, Slice* val) const { + return Get >(col_idx, val); +} + +template +Status KuduScanBatch::RowPtr::Get(const Slice& col_name, typename T::cpp_type* val) const { + int col_idx; + RETURN_NOT_OK(FindColumn(*schema_, col_name, &col_idx)); + return Get(col_idx, val); +} + +template +Status KuduScanBatch::RowPtr::Get(int col_idx, typename T::cpp_type* val) const { + const ColumnSchema& col = schema_->column(col_idx); + if (PREDICT_FALSE(col.type_info()->type() != T::type)) { + // TODO: at some point we could allow type coercion here. 
+ return Status::InvalidArgument( + Substitute("invalid type $0 provided for column '$1' (expected $2)", + T::name(), + col.name(), col.type_info()->name())); + } + + if (col.is_nullable() && IsNull(col_idx)) { + return Status::NotFound("column is NULL"); + } + + memcpy(val, row_data_ + schema_->column_offset(col_idx), sizeof(*val)); + return Status::OK(); +} + +const void* KuduScanBatch::RowPtr::cell(int col_idx) const { + return row_data_ + schema_->column_offset(col_idx); +} + +const KuduSchema* KuduScanBatch::RowPtr::row_schema() const { + return client_schema_; +} + +//------------------------------------------------------------ +// Template instantiations: We instantiate all possible templates to avoid linker issues. +// see: https://isocpp.org/wiki/faq/templates#separate-template-fn-defn-from-decl +// TODO We can probably remove this when we move to c++11 and can use "extern template" +//------------------------------------------------------------ + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, bool* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, int8_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, int16_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, int32_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, int64_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >( + const Slice& col_name, int64_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, float* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, double* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, Slice* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(const Slice& col_name, Slice* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, bool* val) const; + +template +Status 
KuduScanBatch::RowPtr::Get >(int col_idx, int8_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, int16_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, int32_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, int64_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, int64_t* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, float* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, double* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, Slice* val) const; + +template +Status KuduScanBatch::RowPtr::Get >(int col_idx, Slice* val) const; + +string KuduScanBatch::RowPtr::ToString() const { + string ret; + ret.append("("); + bool first = true; + for (int i = 0; i < schema_->num_columns(); i++) { + if (!first) { + ret.append(", "); + } + RowCell cell(this, i); + schema_->column(i).DebugCellAppend(cell, &ret); + first = false; + } + ret.append(")"); + return ret; +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/scan_batch.h b/src/kudu/client/scan_batch.h new file mode 100644 index 000000000000..073b5e307311 --- /dev/null +++ b/src/kudu/client/scan_batch.h @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_SCAN_BATCH_H +#define KUDU_CLIENT_SCAN_BATCH_H + +#include + +#ifdef KUDU_HEADERS_NO_STUBS +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#else +#include "kudu/client/stubs.h" +#endif + +#include "kudu/util/kudu_export.h" +#include "kudu/util/slice.h" + +namespace kudu { +class Schema; + +namespace tools { +class TsAdminClient; +} // namespace tools + +namespace client { +class KuduSchema; + +// A batch of zero or more rows returned from a KuduScanner. +// +// With C++11, you can iterate over the rows in the batch using a +// range-foreach loop: +// +// for (KuduScanBatch::RowPtr row : batch) { +// ... row.GetInt(1, ...) +// ... +// } +// +// In C++03, you'll need to use a regular for loop: +// +// for (int i = 0, num_rows = batch.NumRows(); +// i < num_rows; +// i++) { +// KuduScanBatch::RowPtr row = batch.Row(i); +// ... +// } +// +// Note that, in the above example, NumRows() is only called once at the +// beginning of the loop to avoid extra calls to the non-inlined method. +class KUDU_EXPORT KuduScanBatch { + public: + class RowPtr; + class const_iterator; + typedef RowPtr value_type; + + KuduScanBatch(); + ~KuduScanBatch(); + + // Return the number of rows in this batch. + int NumRows() const; + + // Return a reference to one of the rows in this batch. + // The returned object is only valid for as long as this KuduScanBatch. + KuduScanBatch::RowPtr Row(int idx) const; + + const_iterator begin() const; + const_iterator end() const; + + // Returns the projection schema for this batch. + // All KuduScanBatch::RowPtr returned by this batch are guaranteed to have this schema. 
+ const KuduSchema* projection_schema() const; + + private: + class KUDU_NO_EXPORT Data; + friend class KuduScanner; + friend class kudu::tools::TsAdminClient; + + Data* data_; + DISALLOW_COPY_AND_ASSIGN(KuduScanBatch); +}; + +// A single row result from a scan. Note that this object acts as a pointer into +// a KuduScanBatch, and therefore is valid only as long as the batch it was constructed +// from. +class KUDU_EXPORT KuduScanBatch::RowPtr { + public: + // Construct an invalid RowPtr. Before use, you must assign + // a properly-initialized value. + RowPtr() : schema_(NULL), row_data_(NULL) {} + + bool IsNull(const Slice& col_name) const; + bool IsNull(int col_idx) const; + + // These getters return a bad Status if the type does not match, + // the value is unset, or the value is NULL. Otherwise they return + // the current set value in *val. + Status GetBool(const Slice& col_name, bool* val) const WARN_UNUSED_RESULT; + + Status GetInt8(const Slice& col_name, int8_t* val) const WARN_UNUSED_RESULT; + Status GetInt16(const Slice& col_name, int16_t* val) const WARN_UNUSED_RESULT; + Status GetInt32(const Slice& col_name, int32_t* val) const WARN_UNUSED_RESULT; + Status GetInt64(const Slice& col_name, int64_t* val) const WARN_UNUSED_RESULT; + Status GetTimestamp(const Slice& col_name, int64_t* micros_since_utc_epoch) + const WARN_UNUSED_RESULT; + + Status GetFloat(const Slice& col_name, float* val) const WARN_UNUSED_RESULT; + Status GetDouble(const Slice& col_name, double* val) const WARN_UNUSED_RESULT; + + // Same as above getters, but with numeric column indexes. + // These are faster since they avoid a hashmap lookup, so should + // be preferred in performance-sensitive code. 
+ Status GetBool(int col_idx, bool* val) const WARN_UNUSED_RESULT; + + Status GetInt8(int col_idx, int8_t* val) const WARN_UNUSED_RESULT; + Status GetInt16(int col_idx, int16_t* val) const WARN_UNUSED_RESULT; + Status GetInt32(int col_idx, int32_t* val) const WARN_UNUSED_RESULT; + Status GetInt64(int col_idx, int64_t* val) const WARN_UNUSED_RESULT; + Status GetTimestamp(int col_idx, int64_t* micros_since_utc_epoch) const WARN_UNUSED_RESULT; + + Status GetFloat(int col_idx, float* val) const WARN_UNUSED_RESULT; + Status GetDouble(int col_idx, double* val) const WARN_UNUSED_RESULT; + + // Gets the string/binary value but does not copy the value. Callers should + // copy the resulting Slice if necessary. + Status GetString(const Slice& col_name, Slice* val) const WARN_UNUSED_RESULT; + Status GetString(int col_idx, Slice* val) const WARN_UNUSED_RESULT; + Status GetBinary(const Slice& col_name, Slice* val) const WARN_UNUSED_RESULT; + Status GetBinary(int col_idx, Slice* val) const WARN_UNUSED_RESULT; + + // Raw cell access. Should be avoided unless absolutely necessary. + const void* cell(int col_idx) const; + + const KuduSchema* row_schema() const; + + std::string ToString() const; + + private: + friend class KuduScanBatch; + template friend struct SliceKeysTestSetup; + template friend struct IntKeysTestSetup; + + // Only invoked by KuduScanner. + RowPtr(const Schema* schema, + const KuduSchema* client_projection, + const uint8_t* row_data) + : schema_(schema), + client_schema_(client_projection), + row_data_(row_data) { + } + + template + Status Get(const Slice& col_name, typename T::cpp_type* val) const; + + template + Status Get(int col_idx, typename T::cpp_type* val) const; + + const Schema* schema_; + const KuduSchema* client_schema_; + const uint8_t* row_data_; +}; + +// C++ forward iterator over the rows in a KuduScanBatch. +// +// This iterator yields KuduScanBatch::RowPtr objects which point inside the row batch +// itself. 
Thus, the iterator and any objects obtained from it are invalidated if the +// KuduScanBatch is destroyed or used for a new NextBatch() call. +class KUDU_EXPORT KuduScanBatch::const_iterator + : public std::iterator { + public: + ~const_iterator() {} + + KuduScanBatch::RowPtr operator*() const { + return batch_->Row(idx_); + } + + void operator++() { + idx_++; + } + + bool operator==(const const_iterator& other) { + return idx_ == other.idx_; + } + bool operator!=(const const_iterator& other) { + return idx_ != other.idx_; + } + + private: + friend class KuduScanBatch; + const_iterator(const KuduScanBatch* b, int idx) + : batch_(b), + idx_(idx) { + } + + const KuduScanBatch* batch_; + int idx_; +}; + + +inline KuduScanBatch::const_iterator KuduScanBatch::begin() const { + return const_iterator(this, 0); +} + +inline KuduScanBatch::const_iterator KuduScanBatch::end() const { + return const_iterator(this, NumRows()); +} + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/scan_predicate-internal.h b/src/kudu/client/scan_predicate-internal.h new file mode 100644 index 000000000000..32ce5bd33482 --- /dev/null +++ b/src/kudu/client/scan_predicate-internal.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_SCAN_PREDICATE_INTERNAL_H +#define KUDU_CLIENT_SCAN_PREDICATE_INTERNAL_H + +#include "kudu/client/value.h" +#include "kudu/client/value-internal.h" +#include "kudu/common/scan_spec.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace client { + +class KuduPredicate::Data { + public: + Data(); + virtual ~Data(); + virtual Status AddToScanSpec(ScanSpec* spec) = 0; + virtual Data* Clone() const = 0; +}; + +// A predicate implementation which represents an error constructing +// some other predicate. +// +// This allows us to provide a simple API -- if a predicate fails to +// construct, we return an instance of this class instead of the requested +// predicate implementation. Then, when the caller adds it to a scanner, +// the error is returned. +class ErrorPredicateData : public KuduPredicate::Data { + public: + explicit ErrorPredicateData(const Status& s) + : status_(s) { + } + + virtual ~ErrorPredicateData() { + } + + virtual Status AddToScanSpec(ScanSpec* spec) OVERRIDE { + return status_; + } + + virtual ErrorPredicateData* Clone() const OVERRIDE { + return new ErrorPredicateData(status_); + } + + private: + Status status_; +}; + + +// A simple binary comparison predicate between a column and +// a constant. +class ComparisonPredicateData : public KuduPredicate::Data { + public: + ComparisonPredicateData(ColumnSchema col, + KuduPredicate::ComparisonOp op, + KuduValue* value); + virtual ~ComparisonPredicateData(); + + virtual Status AddToScanSpec(ScanSpec* spec) OVERRIDE; + + virtual ComparisonPredicateData* Clone() const OVERRIDE { + return new ComparisonPredicateData(col_, op_, val_->Clone()); + } + + private: + friend class KuduScanner; + + ColumnSchema col_; + KuduPredicate::ComparisonOp op_; + gscoped_ptr val_; + + // Owned. 
+ ColumnRangePredicate* pred_; +}; + +} // namespace client +} // namespace kudu +#endif /* KUDU_CLIENT_SCAN_PREDICATE_INTERNAL_H */ diff --git a/src/kudu/client/scan_predicate.cc b/src/kudu/client/scan_predicate.cc new file mode 100644 index 000000000000..4a4df8b87278 --- /dev/null +++ b/src/kudu/client/scan_predicate.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/scan_predicate.h" +#include "kudu/client/scan_predicate-internal.h" +#include "kudu/client/value.h" +#include "kudu/client/value-internal.h" + +#include "kudu/common/scan_spec.h" +#include "kudu/common/scan_predicate.h" + +#include "kudu/gutil/strings/substitute.h" + +using strings::Substitute; + +namespace kudu { +namespace client { + +KuduPredicate::KuduPredicate(Data* d) + : data_(d) { +} + +KuduPredicate::~KuduPredicate() { + delete data_; +} + +KuduPredicate::Data::Data() { +} + +KuduPredicate::Data::~Data() { +} + +KuduPredicate* KuduPredicate::Clone() const { + return new KuduPredicate(data_->Clone()); +} + +ComparisonPredicateData::ComparisonPredicateData(ColumnSchema col, + KuduPredicate::ComparisonOp op, + KuduValue* val) + : col_(std::move(col)), + op_(op), + val_(val) { +} +ComparisonPredicateData::~ComparisonPredicateData() { +} + + +Status ComparisonPredicateData::AddToScanSpec(ScanSpec* spec) { + void* val_void; + RETURN_NOT_OK(val_->data_->CheckTypeAndGetPointer(col_.name(), + col_.type_info()->physical_type(), + &val_void)); + + void* lower_bound = nullptr; + void* upper_bound = nullptr; + switch (op_) { + case KuduPredicate::LESS_EQUAL: + upper_bound = val_void; + break; + case KuduPredicate::GREATER_EQUAL: + lower_bound = val_void; + break; + case KuduPredicate::EQUAL: + lower_bound = upper_bound = val_void; + break; + default: + return Status::InvalidArgument(Substitute("invalid comparison op: $0", op_)); + } + + ColumnRangePredicate p(col_, lower_bound, upper_bound); + spec->AddPredicate(p); + + return Status::OK(); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/scan_predicate.h b/src/kudu/client/scan_predicate.h new file mode 100644 index 000000000000..604d0969d680 --- /dev/null +++ b/src/kudu/client/scan_predicate.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_SCAN_PREDICATE_H +#define KUDU_CLIENT_SCAN_PREDICATE_H + +#ifdef KUDU_HEADERS_NO_STUBS +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#else +#include "kudu/client/stubs.h" +#endif + +#include "kudu/client/schema.h" +#include "kudu/util/kudu_export.h" + +namespace kudu { +namespace client { + +class KUDU_EXPORT KuduPredicate { + public: + enum ComparisonOp { + LESS_EQUAL, + GREATER_EQUAL, + EQUAL + }; + + ~KuduPredicate(); + + // Returns a new, identical, KuduPredicate. + KuduPredicate* Clone() const; + + // The PIMPL class has to be public since it's actually just an interface, + // and gcc gives an error trying to derive from a private nested class. 
+ class KUDU_NO_EXPORT Data; + private: + friend class KuduScanner; + friend class KuduTable; + friend class ComparisonPredicateData; + friend class ErrorPredicateData; + + explicit KuduPredicate(Data* d); + + Data* data_; + DISALLOW_COPY_AND_ASSIGN(KuduPredicate); +}; + +} // namespace client +} // namespace kudu +#endif // KUDU_CLIENT_SCAN_PREDICATE_H diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc new file mode 100644 index 000000000000..68b1ca8df6d8 --- /dev/null +++ b/src/kudu/client/scanner-internal.cc @@ -0,0 +1,545 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/scanner-internal.h" + +#include +#include +#include +#include +#include + +#include "kudu/client/client-internal.h" +#include "kudu/client/meta_cache.h" +#include "kudu/client/row_result.h" +#include "kudu/client/table-internal.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/util/hexdump.h" + +using std::set; +using std::string; + +namespace kudu { + +using rpc::RpcController; +using strings::Substitute; +using strings::SubstituteAndAppend; +using tserver::ColumnRangePredicatePB; +using tserver::NewScanRequestPB; +using tserver::ScanResponsePB; + +namespace client { + +using internal::RemoteTabletServer; + +static const int64_t kNoTimestamp = -1; + +KuduScanner::Data::Data(KuduTable* table) + : open_(false), + data_in_open_(false), + has_batch_size_bytes_(false), + batch_size_bytes_(0), + selection_(KuduClient::CLOSEST_REPLICA), + read_mode_(READ_LATEST), + is_fault_tolerant_(false), + snapshot_timestamp_(kNoTimestamp), + table_(DCHECK_NOTNULL(table)), + arena_(1024, 1024*1024), + spec_encoder_(table->schema().schema_, &arena_), + timeout_(MonoDelta::FromMilliseconds(kScanTimeoutMillis)), + scan_attempts_(0) { + SetProjectionSchema(table->schema().schema_); +} + +KuduScanner::Data::~Data() { +} + +Status KuduScanner::Data::CheckForErrors() { + if (PREDICT_TRUE(!last_response_.has_error())) { + return Status::OK(); + } + + return StatusFromPB(last_response_.error().status()); +} + +void KuduScanner::Data::CopyPredicateBound(const ColumnSchema& col, + const void* bound_src, + string* bound_dst) { + const void* src; + size_t size; + if (col.type_info()->physical_type() == BINARY) { + // Copying a string involves an extra level of indirection through its + // owning slice. 
+ const Slice* s = reinterpret_cast(bound_src); + src = s->data(); + size = s->size(); + } else { + src = bound_src; + size = col.type_info()->size(); + } + bound_dst->assign(reinterpret_cast(src), size); +} + +Status KuduScanner::Data::CanBeRetried(const bool isNewScan, + const Status& rpc_status, const Status& server_status, + const MonoTime& actual_deadline, const MonoTime& deadline, + const vector& candidates, + set* blacklist) { + CHECK(!rpc_status.ok() || !server_status.ok()); + + // Check for ERROR_SERVER_TOO_BUSY, which should result in a retry after a delay. + if (server_status.ok() && + !rpc_status.ok() && + controller_.error_response() && + controller_.error_response()->code() == rpc::ErrorStatusPB::ERROR_SERVER_TOO_BUSY) { + UpdateLastError(rpc_status); + + // Exponential backoff with jitter anchored between 10ms and 20ms, and an + // upper bound between 2.5s and 5s. + MonoDelta sleep = MonoDelta::FromMilliseconds( + (10 + rand() % 10) * static_cast(std::pow(2.0, std::min(8, scan_attempts_ - 1)))); + MonoTime now = MonoTime::Now(MonoTime::FINE); + now.AddDelta(sleep); + if (deadline.ComesBefore(now)) { + Status ret = Status::TimedOut("unable to retry before timeout", + rpc_status.ToString()); + return last_error_.ok() ? + ret : ret.CloneAndAppend(last_error_.ToString()); + } + LOG(INFO) << "Retrying scan to busy tablet server " << ts_->ToString() + << " after " << sleep.ToString() << "; attempt " << scan_attempts_; + SleepFor(sleep); + return Status::OK(); + } + + // Start by checking network errors. + if (!rpc_status.ok()) { + if (rpc_status.IsTimedOut() && actual_deadline.Equals(deadline)) { + // If we ended because of the overall deadline, we're done. + // We didn't wait a full RPC timeout though, so don't mark the tserver as failed. + LOG(INFO) << "Scan of tablet " << remote_->tablet_id() << " at " + << ts_->ToString() << " deadline expired."; + return last_error_.ok() + ? 
rpc_status : rpc_status.CloneAndAppend(last_error_.ToString()); + } else { + // All other types of network errors are retriable, and also indicate the tserver is failed. + UpdateLastError(rpc_status); + table_->client()->data_->meta_cache_->MarkTSFailed(ts_, rpc_status); + } + } + + // If we're in the middle of a batch and doing a non fault-tolerant scan, then + // we cannot retry. Non fault-tolerant scans can still be retried on a tablet + // boundary (i.e. an OpenTablet call). + if (!isNewScan && !is_fault_tolerant_) { + return !rpc_status.ok() ? rpc_status : server_status; + } + + // For retries, the correct action depends on the particular failure condition. + // + // On an RPC error, we retry at a different tablet server. + // + // If the server returned an error code, it depends: + // + // - SCANNER_EXPIRED : The scan can be retried at the same tablet server. + // + // - TABLET_NOT_RUNNING : The scan can be retried at a different tablet server, subject + // to the client's specified selection criteria. + // + // - TABLET_NOT_FOUND : The scan can be retried at a different tablet server, subject + // to the client's specified selection criteria. + // The metadata for this tablet should be refreshed. + // + // - Any other error : Fatal. This indicates an unexpected error while processing the scan + // request. + if (rpc_status.ok() && !server_status.ok()) { + UpdateLastError(server_status); + + const tserver::TabletServerErrorPB& error = last_response_.error(); + switch (error.code()) { + case tserver::TabletServerErrorPB::SCANNER_EXPIRED: + VLOG(1) << "Got SCANNER_EXPIRED error code, non-fatal error."; + break; + case tserver::TabletServerErrorPB::TABLET_NOT_RUNNING: + VLOG(1) << "Got error code " << tserver::TabletServerErrorPB::Code_Name(error.code()) + << ": temporarily blacklisting node " << ts_->permanent_uuid(); + blacklist->insert(ts_->permanent_uuid()); + // We've blacklisted all the live candidate tservers. 
+ // Do a short random sleep, clear the temp blacklist, then do another round of retries. + if (!candidates.empty() && candidates.size() == blacklist->size()) { + MonoDelta sleep_delta = MonoDelta::FromMilliseconds((random() % 5000) + 1000); + LOG(INFO) << "All live candidate nodes are unavailable because of transient errors." + << " Sleeping for " << sleep_delta.ToMilliseconds() << " ms before trying again."; + SleepFor(sleep_delta); + blacklist->clear(); + } + break; + case tserver::TabletServerErrorPB::TABLET_NOT_FOUND: { + // There was either a tablet configuration change or the table was + // deleted, since at the time of this writing we don't support splits. + // Backoff, then force a re-fetch of the tablet metadata. + remote_->MarkStale(); + // TODO: Only backoff on the second time we hit TABLET_NOT_FOUND on the + // same tablet (see KUDU-1314). + MonoDelta backoff_time = MonoDelta::FromMilliseconds((random() % 1000) + 500); + SleepFor(backoff_time); + break; + } + default: + // All other server errors are fatal. Usually indicates a malformed request, e.g. a bad scan + // specification. 
+ return server_status; + } + } + + return Status::OK(); +} + +Status KuduScanner::Data::OpenTablet(const string& partition_key, + const MonoTime& deadline, + set* blacklist) { + + PrepareRequest(KuduScanner::Data::NEW); + next_req_.clear_scanner_id(); + NewScanRequestPB* scan = next_req_.mutable_new_scan_request(); + switch (read_mode_) { + case READ_LATEST: scan->set_read_mode(kudu::READ_LATEST); break; + case READ_AT_SNAPSHOT: scan->set_read_mode(kudu::READ_AT_SNAPSHOT); break; + default: LOG(FATAL) << "Unexpected read mode."; + } + + if (is_fault_tolerant_) { + scan->set_order_mode(kudu::ORDERED); + } else { + scan->set_order_mode(kudu::UNORDERED); + } + + if (last_primary_key_.length() > 0) { + VLOG(1) << "Setting NewScanRequestPB last_primary_key to hex value " + << HexDump(last_primary_key_); + scan->set_last_primary_key(last_primary_key_); + } + + scan->set_cache_blocks(spec_.cache_blocks()); + + if (snapshot_timestamp_ != kNoTimestamp) { + if (PREDICT_FALSE(read_mode_ != READ_AT_SNAPSHOT)) { + LOG(WARNING) << "Scan snapshot timestamp set but read mode was READ_LATEST." + " Ignoring timestamp."; + } else { + scan->set_snap_timestamp(snapshot_timestamp_); + } + } + + // Set up the predicates. 
+ scan->clear_range_predicates(); + for (const ColumnRangePredicate& pred : spec_.predicates()) { + const ColumnSchema& col = pred.column(); + const ValueRange& range = pred.range(); + ColumnRangePredicatePB* pb = scan->add_range_predicates(); + if (range.has_lower_bound()) { + CopyPredicateBound(col, range.lower_bound(), + pb->mutable_lower_bound()); + } + if (range.has_upper_bound()) { + CopyPredicateBound(col, range.upper_bound(), + pb->mutable_upper_bound()); + } + ColumnSchemaToPB(col, pb->mutable_column()); + } + + if (spec_.lower_bound_key()) { + scan->mutable_start_primary_key()->assign( + reinterpret_cast(spec_.lower_bound_key()->encoded_key().data()), + spec_.lower_bound_key()->encoded_key().size()); + } else { + scan->clear_start_primary_key(); + } + if (spec_.exclusive_upper_bound_key()) { + scan->mutable_stop_primary_key()->assign( + reinterpret_cast(spec_.exclusive_upper_bound_key()->encoded_key().data()), + spec_.exclusive_upper_bound_key()->encoded_key().size()); + } else { + scan->clear_stop_primary_key(); + } + RETURN_NOT_OK(SchemaToColumnPBs(*projection_, scan->mutable_projected_columns(), + SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES | SCHEMA_PB_WITHOUT_IDS)); + + for (int attempt = 1;; attempt++) { + Synchronizer sync; + table_->client()->data_->meta_cache_->LookupTabletByKey(table_, + partition_key, + deadline, + &remote_, + sync.AsStatusCallback()); + RETURN_NOT_OK(sync.Wait()); + + scan->set_tablet_id(remote_->tablet_id()); + + RemoteTabletServer *ts; + vector candidates; + Status lookup_status = table_->client()->data_->GetTabletServer( + table_->client(), + remote_, + selection_, + *blacklist, + &candidates, + &ts); + // If we get ServiceUnavailable, this indicates that the tablet doesn't + // currently have any known leader. We should sleep and retry, since + // it's likely that the tablet is undergoing a leader election and will + // soon have one. 
+ if (lookup_status.IsServiceUnavailable() && + MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + int sleep_ms = attempt * 100; + VLOG(1) << "Tablet " << remote_->tablet_id() << " current unavailable: " + << lookup_status.ToString() << ". Sleeping for " << sleep_ms << "ms " + << "and retrying..."; + SleepFor(MonoDelta::FromMilliseconds(sleep_ms)); + continue; + } + RETURN_NOT_OK(lookup_status); + + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (deadline.ComesBefore(now)) { + Status ret = Status::TimedOut("Scan timed out, deadline expired"); + return last_error_.ok() ? + ret : ret.CloneAndAppend(last_error_.ToString()); + } + + // Recalculate the deadlines. + // If we have other replicas beyond this one to try, then we'll try to + // open the scanner with the default RPC timeout. That gives us time to + // try other replicas later. Otherwise, we open the scanner using the + // full remaining deadline for the user's call. + MonoTime rpc_deadline; + if (static_cast(candidates.size()) - blacklist->size() > 1) { + rpc_deadline = now; + rpc_deadline.AddDelta(table_->client()->default_rpc_timeout()); + rpc_deadline = MonoTime::Earliest(deadline, rpc_deadline); + } else { + rpc_deadline = deadline; + } + + controller_.Reset(); + controller_.set_deadline(rpc_deadline); + + CHECK(ts->proxy()); + ts_ = CHECK_NOTNULL(ts); + proxy_ = ts->proxy(); + const Status rpc_status = proxy_->Scan(next_req_, &last_response_, &controller_); + const Status server_status = CheckForErrors(); + if (rpc_status.ok() && server_status.ok()) { + scan_attempts_ = 0; + break; + } + scan_attempts_++; + RETURN_NOT_OK(CanBeRetried(true, rpc_status, server_status, rpc_deadline, deadline, + candidates, blacklist)); + } + + next_req_.clear_new_scan_request(); + data_in_open_ = last_response_.has_data(); + if (last_response_.has_more_results()) { + next_req_.set_scanner_id(last_response_.scanner_id()); + VLOG(1) << "Opened tablet " << remote_->tablet_id() + << ", scanner ID " << 
last_response_.scanner_id(); + } else if (last_response_.has_data()) { + VLOG(1) << "Opened tablet " << remote_->tablet_id() << ", no scanner ID assigned"; + } else { + VLOG(1) << "Opened tablet " << remote_->tablet_id() << " (no rows), no scanner ID assigned"; + } + + // If present in the response, set the snapshot timestamp and the encoded last + // primary key. This is used when retrying the scan elsewhere. The last + // primary key is also updated on each scan response. + if (is_fault_tolerant_) { + CHECK(last_response_.has_snap_timestamp()); + snapshot_timestamp_ = last_response_.snap_timestamp(); + if (last_response_.has_last_primary_key()) { + last_primary_key_ = last_response_.last_primary_key(); + } + } + + if (last_response_.has_snap_timestamp()) { + table_->client()->data_->UpdateLatestObservedTimestamp(last_response_.snap_timestamp()); + } + + return Status::OK(); +} + +Status KuduScanner::Data::KeepAlive() { + if (!open_) return Status::IllegalState("Scanner was not open."); + // If there is no scanner to keep alive, we still return Status::OK(). + if (!last_response_.IsInitialized() || !last_response_.has_more_results() || + !next_req_.has_scanner_id()) { + return Status::OK(); + } + + RpcController controller; + controller.set_timeout(timeout_); + tserver::ScannerKeepAliveRequestPB request; + request.set_scanner_id(next_req_.scanner_id()); + tserver::ScannerKeepAliveResponsePB response; + RETURN_NOT_OK(proxy_->ScannerKeepAlive(request, &response, &controller)); + if (response.has_error()) { + return StatusFromPB(response.error().status()); + } + return Status::OK(); +} + +bool KuduScanner::Data::MoreTablets() const { + CHECK(open_); + // TODO(KUDU-565): add a test which has a scan end on a tablet boundary + + if (remote_->partition().partition_key_end().empty()) { + // Last tablet -- nothing more to scan. 
+ return false; + } + + if (!spec_.exclusive_upper_bound_partition_key().empty() && + spec_.exclusive_upper_bound_partition_key() <= remote_->partition().partition_key_end()) { + // We are not past the scan's upper bound partition key. + return false; + } + + if (!table_->partition_schema().IsSimplePKRangePartitioning(*table_->schema().schema_)) { + // We can't do culling yet if the partitioning isn't simple. + return true; + } + + if (spec_.exclusive_upper_bound_key() == nullptr) { + // No upper bound - keep going! + return true; + } + + // Otherwise, we have to compare the upper bound. + return spec_.exclusive_upper_bound_key()->encoded_key() + .compare(remote_->partition().partition_key_end()) > 0; +} + +void KuduScanner::Data::PrepareRequest(RequestType state) { + if (state == KuduScanner::Data::CLOSE) { + next_req_.set_batch_size_bytes(0); + } else if (has_batch_size_bytes_) { + next_req_.set_batch_size_bytes(batch_size_bytes_); + } else { + next_req_.clear_batch_size_bytes(); + } + + if (state == KuduScanner::Data::NEW) { + next_req_.set_call_seq_id(0); + } else { + next_req_.set_call_seq_id(next_req_.call_seq_id() + 1); + } +} + +void KuduScanner::Data::UpdateLastError(const Status& error) { + if (last_error_.ok() || last_error_.IsTimedOut()) { + last_error_ = error; + } +} + +void KuduScanner::Data::SetProjectionSchema(const Schema* schema) { + projection_ = schema; + client_projection_ = KuduSchema(*schema); +} + + + +//////////////////////////////////////////////////////////// +// KuduScanBatch +//////////////////////////////////////////////////////////// + +KuduScanBatch::Data::Data() : projection_(NULL) {} + +KuduScanBatch::Data::~Data() {} + +size_t KuduScanBatch::Data::CalculateProjectedRowSize(const Schema& proj) { + return proj.byte_size() + + (proj.has_nullables() ? 
BitmapSize(proj.num_columns()) : 0); +} + +Status KuduScanBatch::Data::Reset(RpcController* controller, + const Schema* projection, + const KuduSchema* client_projection, + gscoped_ptr data) { + CHECK(controller->finished()); + controller_.Swap(controller); + projection_ = projection; + client_projection_ = client_projection; + resp_data_.Swap(data.get()); + + // First, rewrite the relative addresses into absolute ones. + if (PREDICT_FALSE(!resp_data_.has_rows_sidecar())) { + return Status::Corruption("Server sent invalid response: no row data"); + } else { + Status s = controller_.GetSidecar(resp_data_.rows_sidecar(), &direct_data_); + if (!s.ok()) { + return Status::Corruption("Server sent invalid response: row data " + "sidecar index corrupt", s.ToString()); + } + } + + if (resp_data_.has_indirect_data_sidecar()) { + Status s = controller_.GetSidecar(resp_data_.indirect_data_sidecar(), + &indirect_data_); + if (!s.ok()) { + return Status::Corruption("Server sent invalid response: indirect data " + "sidecar index corrupt", s.ToString()); + } + } + + RETURN_NOT_OK(RewriteRowBlockPointers(*projection_, resp_data_, indirect_data_, &direct_data_)); + projected_row_size_ = CalculateProjectedRowSize(*projection_); + return Status::OK(); +} + +void KuduScanBatch::Data::ExtractRows(vector* rows) { + int n_rows = resp_data_.num_rows(); + rows->resize(n_rows); + + if (PREDICT_FALSE(n_rows == 0)) { + // Early-out here to avoid a UBSAN failure. + VLOG(1) << "Extracted 0 rows"; + return; + } + + // Initialize each RowPtr with data from the response. + // + // Doing this resize and array indexing turns out to be noticeably faster + // than using reserve and push_back. 
+ const uint8_t* src = direct_data_.data(); + KuduScanBatch::RowPtr* dst = &(*rows)[0]; + while (n_rows > 0) { + *dst = KuduScanBatch::RowPtr(projection_, client_projection_,src); + dst++; + src += projected_row_size_; + n_rows--; + } + VLOG(1) << "Extracted " << rows->size() << " rows"; +} + +void KuduScanBatch::Data::Clear() { + resp_data_.Clear(); + controller_.Reset(); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/scanner-internal.h b/src/kudu/client/scanner-internal.h new file mode 100644 index 000000000000..0c530050f2ec --- /dev/null +++ b/src/kudu/client/scanner-internal.h @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CLIENT_SCANNER_INTERNAL_H +#define KUDU_CLIENT_SCANNER_INTERNAL_H + +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/client/client.h" +#include "kudu/client/row_result.h" +#include "kudu/common/scan_spec.h" +#include "kudu/common/predicate_encoder.h" +#include "kudu/tserver/tserver_service.proxy.h" + +namespace kudu { + +namespace client { + +class KuduScanner::Data { + public: + explicit Data(KuduTable* table); + ~Data(); + + Status CheckForErrors(); + + // Copies a predicate lower or upper bound from 'bound_src' into + // 'bound_dst'. + void CopyPredicateBound(const ColumnSchema& col, + const void* bound_src, std::string* bound_dst); + + // Called when KuduScanner::NextBatch or KuduScanner::Data::OpenTablet result in an RPC or + // server error. Returns the error status if the call cannot be retried. + // + // The number of parameters reflects the complexity of handling retries. + // We must respect the overall scan 'deadline', as well as the 'blacklist' of servers + // experiencing transient failures. See the implementation for more details. + Status CanBeRetried(const bool isNewScan, + const Status& rpc_status, + const Status& server_status, + const MonoTime& actual_deadline, + const MonoTime& deadline, + const std::vector& candidates, + std::set* blacklist); + + // Open a tablet. + // The deadline is the time budget for this operation. + // The blacklist is used to temporarily filter out nodes that are experiencing transient errors. + // This blacklist may be modified by the callee. + Status OpenTablet(const std::string& partition_key, + const MonoTime& deadline, + std::set* blacklist); + + Status KeepAlive(); + + // Returns whether there exist more tablets we should scan. + // + // Note: there may not be any actual matching rows in subsequent tablets, + // but we won't know until we scan them. + bool MoreTablets() const; + + // Possible scan requests. + enum RequestType { + // A new scan of a particular tablet. 
+ NEW, + + // A continuation of an existing scan (to read more rows). + CONTINUE, + + // A close of a partially-completed scan. Complete scans are closed + // automatically by the tablet server. + CLOSE + }; + + // Modifies fields in 'next_req_' in preparation for a new request. + void PrepareRequest(RequestType state); + + // Update 'last_error_' if need be. Should be invoked whenever a + // non-fatal (i.e. retriable) scan error is encountered. + void UpdateLastError(const Status& error); + + // Sets the projection schema. + void SetProjectionSchema(const Schema* schema); + + bool open_; + bool data_in_open_; + bool has_batch_size_bytes_; + uint32 batch_size_bytes_; + KuduClient::ReplicaSelection selection_; + + ReadMode read_mode_; + bool is_fault_tolerant_; + int64_t snapshot_timestamp_; + + // The encoded last primary key from the most recent tablet scan response. + std::string last_primary_key_; + + internal::RemoteTabletServer* ts_; + // The proxy can be derived from the RemoteTabletServer, but this involves retaking the + // meta cache lock. Keeping our own shared_ptr avoids this overhead. + std::shared_ptr proxy_; + + // The next scan request to be sent. This is cached as a field + // since most scan requests will share the scanner ID with the previous + // request. + tserver::ScanRequestPB next_req_; + + // The last response received from the server. Cached for buffer reuse. + tserver::ScanResponsePB last_response_; + + // RPC controller for the last in-flight RPC. + rpc::RpcController controller_; + + // The table we're scanning. + KuduTable* table_; + + // The projection schema used in the scan. + const Schema* projection_; + + // 'projection_' after it is converted to KuduSchema, so that users can obtain + // the projection without having to include common/schema.h. + KuduSchema client_projection_; + + Arena arena_; + AutoReleasePool pool_; + + // Machinery to store and encode raw column range predicates into + // encoded keys. 
+ ScanSpec spec_; + RangePredicateEncoder spec_encoder_; + + // The tablet we're scanning. + scoped_refptr remote_; + + // Timeout for scanner RPCs. + MonoDelta timeout_; + + // Number of attempts since the last successful scan. + int scan_attempts_; + + // The deprecated "NextBatch(vector*) API requires some local + // storage for the actual row data. If that API is used, this member keeps the + // actual storage for the batch that is returned. + KuduScanBatch batch_for_old_api_; + + // The latest error experienced by this scan that provoked a retry. If the + // scan times out, this error will be incorporated into the status that is + // passed back to the client. + // + // TODO: This and the overall scan retry logic duplicates much of RpcRetrier. + Status last_error_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +class KuduScanBatch::Data { + public: + Data(); + ~Data(); + + Status Reset(rpc::RpcController* controller, + const Schema* projection, + const KuduSchema* client_projection, + gscoped_ptr resp_data); + + int num_rows() const { + return resp_data_.num_rows(); + } + + KuduRowResult row(int idx) { + DCHECK_GE(idx, 0); + DCHECK_LT(idx, num_rows()); + int offset = idx * projected_row_size_; + return KuduRowResult(projection_, client_projection_, &direct_data_[offset]); + } + + void ExtractRows(vector* rows); + + void Clear(); + + // Returns the size of a row for the given projection 'proj'. + static size_t CalculateProjectedRowSize(const Schema& proj); + + // The RPC controller for the RPC which returned this batch. + // Holding on to the controller ensures we hold on to the indirect data + // which contains the rows. + rpc::RpcController controller_; + + // The PB which contains the "direct data" slice. + RowwiseRowBlockPB resp_data_; + + // Slices into the direct and indirect row data, whose lifetime is ensured + // by the members above. + Slice direct_data_, indirect_data_; + + // The projection being scanned. 
+ const Schema* projection_; + // The KuduSchema version of 'projection_' + const KuduSchema* client_projection_; + + // The number of bytes of direct data for each row. + size_t projected_row_size_; +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/schema-internal.h b/src/kudu/client/schema-internal.h new file mode 100644 index 000000000000..873519c5fb78 --- /dev/null +++ b/src/kudu/client/schema-internal.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_SCHEMA_INTERNAL_H +#define KUDU_CLIENT_SCHEMA_INTERNAL_H + +#include + +#include "kudu/client/schema.h" +#include "kudu/common/common.pb.h" + +namespace kudu { +namespace client { + +// Helper functions that convert between client-facing and internal PB enums. 
+ +kudu::EncodingType ToInternalEncodingType( + KuduColumnStorageAttributes::EncodingType type); +KuduColumnStorageAttributes::EncodingType FromInternalEncodingType( + kudu::EncodingType type); + +kudu::CompressionType ToInternalCompressionType( + KuduColumnStorageAttributes::CompressionType type); +KuduColumnStorageAttributes::CompressionType FromInternalCompressionType( + kudu::CompressionType type); + +kudu::DataType ToInternalDataType( + KuduColumnSchema::DataType type); +KuduColumnSchema::DataType FromInternalDataType( + kudu::DataType type); + + +class KuduColumnSpec::Data { + public: + explicit Data(std::string name) + : name(std::move(name)), + has_type(false), + has_encoding(false), + has_compression(false), + has_block_size(false), + has_nullable(false), + primary_key(false), + has_default(false), + default_val(NULL), + remove_default(false), + has_rename_to(false) { + } + + ~Data() { + delete default_val; + } + + const std::string name; + + bool has_type; + KuduColumnSchema::DataType type; + + bool has_encoding; + KuduColumnStorageAttributes::EncodingType encoding; + + bool has_compression; + KuduColumnStorageAttributes::CompressionType compression; + + bool has_block_size; + int32_t block_size; + + bool has_nullable; + bool nullable; + + bool primary_key; + + bool has_default; + KuduValue* default_val; // Owned. + + // For ALTER + bool remove_default; + + // For ALTER + bool has_rename_to; + std::string rename_to; +}; + +} // namespace client +} // namespace kudu +#endif // KUDU_CLIENT_SCHEMA_INTERNAL_H diff --git a/src/kudu/client/schema.cc b/src/kudu/client/schema.cc new file mode 100644 index 000000000000..ef5f5e36c698 --- /dev/null +++ b/src/kudu/client/schema.cc @@ -0,0 +1,534 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/schema.h" + +#include +#include + +#include "kudu/client/schema-internal.h" +#include "kudu/client/value-internal.h" +#include "kudu/common/partial_row.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" + +MAKE_ENUM_LIMITS(kudu::client::KuduColumnStorageAttributes::EncodingType, + kudu::client::KuduColumnStorageAttributes::AUTO_ENCODING, + kudu::client::KuduColumnStorageAttributes::RLE); + +MAKE_ENUM_LIMITS(kudu::client::KuduColumnStorageAttributes::CompressionType, + kudu::client::KuduColumnStorageAttributes::DEFAULT_COMPRESSION, + kudu::client::KuduColumnStorageAttributes::ZLIB); + +MAKE_ENUM_LIMITS(kudu::client::KuduColumnSchema::DataType, + kudu::client::KuduColumnSchema::INT8, + kudu::client::KuduColumnSchema::BOOL); + +using std::unordered_map; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace client { + +kudu::EncodingType ToInternalEncodingType(KuduColumnStorageAttributes::EncodingType type) { + switch (type) { + case KuduColumnStorageAttributes::AUTO_ENCODING: return kudu::AUTO_ENCODING; + case KuduColumnStorageAttributes::PLAIN_ENCODING: return kudu::PLAIN_ENCODING; + case KuduColumnStorageAttributes::PREFIX_ENCODING: return kudu::PREFIX_ENCODING; + case KuduColumnStorageAttributes::DICT_ENCODING: return kudu::DICT_ENCODING; + case 
KuduColumnStorageAttributes::GROUP_VARINT: return kudu::GROUP_VARINT; + case KuduColumnStorageAttributes::RLE: return kudu::RLE; + case KuduColumnStorageAttributes::BIT_SHUFFLE: return kudu::BIT_SHUFFLE; + default: LOG(FATAL) << "Unexpected encoding type: " << type; + } +} + +KuduColumnStorageAttributes::EncodingType FromInternalEncodingType(kudu::EncodingType type) { + switch (type) { + case kudu::AUTO_ENCODING: return KuduColumnStorageAttributes::AUTO_ENCODING; + case kudu::PLAIN_ENCODING: return KuduColumnStorageAttributes::PLAIN_ENCODING; + case kudu::PREFIX_ENCODING: return KuduColumnStorageAttributes::PREFIX_ENCODING; + case kudu::DICT_ENCODING: return KuduColumnStorageAttributes::DICT_ENCODING; + case kudu::GROUP_VARINT: return KuduColumnStorageAttributes::GROUP_VARINT; + case kudu::RLE: return KuduColumnStorageAttributes::RLE; + case kudu::BIT_SHUFFLE: return KuduColumnStorageAttributes::BIT_SHUFFLE; + default: LOG(FATAL) << "Unexpected internal encoding type: " << type; + } +} + +kudu::CompressionType ToInternalCompressionType(KuduColumnStorageAttributes::CompressionType type) { + switch (type) { + case KuduColumnStorageAttributes::DEFAULT_COMPRESSION: return kudu::DEFAULT_COMPRESSION; + case KuduColumnStorageAttributes::NO_COMPRESSION: return kudu::NO_COMPRESSION; + case KuduColumnStorageAttributes::SNAPPY: return kudu::SNAPPY; + case KuduColumnStorageAttributes::LZ4: return kudu::LZ4; + case KuduColumnStorageAttributes::ZLIB: return kudu::ZLIB; + default: LOG(FATAL) << "Unexpected compression type" << type; + } +} + +KuduColumnStorageAttributes::CompressionType FromInternalCompressionType( + kudu::CompressionType type) { + switch (type) { + case kudu::DEFAULT_COMPRESSION: return KuduColumnStorageAttributes::DEFAULT_COMPRESSION; + case kudu::NO_COMPRESSION: return KuduColumnStorageAttributes::NO_COMPRESSION; + case kudu::SNAPPY: return KuduColumnStorageAttributes::SNAPPY; + case kudu::LZ4: return KuduColumnStorageAttributes::LZ4; + case kudu::ZLIB: return 
KuduColumnStorageAttributes::ZLIB; + default: LOG(FATAL) << "Unexpected internal compression type: " << type; + } +} + +kudu::DataType ToInternalDataType(KuduColumnSchema::DataType type) { + switch (type) { + case KuduColumnSchema::INT8: return kudu::INT8; + case KuduColumnSchema::INT16: return kudu::INT16; + case KuduColumnSchema::INT32: return kudu::INT32; + case KuduColumnSchema::INT64: return kudu::INT64; + case KuduColumnSchema::TIMESTAMP: return kudu::TIMESTAMP; + case KuduColumnSchema::FLOAT: return kudu::FLOAT; + case KuduColumnSchema::DOUBLE: return kudu::DOUBLE; + case KuduColumnSchema::STRING: return kudu::STRING; + case KuduColumnSchema::BINARY: return kudu::BINARY; + case KuduColumnSchema::BOOL: return kudu::BOOL; + default: LOG(FATAL) << "Unexpected data type: " << type; + } +} + +KuduColumnSchema::DataType FromInternalDataType(kudu::DataType type) { + switch (type) { + case kudu::INT8: return KuduColumnSchema::INT8; + case kudu::INT16: return KuduColumnSchema::INT16; + case kudu::INT32: return KuduColumnSchema::INT32; + case kudu::INT64: return KuduColumnSchema::INT64; + case kudu::TIMESTAMP: return KuduColumnSchema::TIMESTAMP; + case kudu::FLOAT: return KuduColumnSchema::FLOAT; + case kudu::DOUBLE: return KuduColumnSchema::DOUBLE; + case kudu::STRING: return KuduColumnSchema::STRING; + case kudu::BINARY: return KuduColumnSchema::BINARY; + case kudu::BOOL: return KuduColumnSchema::BOOL; + default: LOG(FATAL) << "Unexpected internal data type: " << type; + } +} + +//////////////////////////////////////////////////////////// +// KuduColumnSpec +//////////////////////////////////////////////////////////// + +KuduColumnSpec::KuduColumnSpec(const std::string& name) + : data_(new Data(name)) { +} + +KuduColumnSpec::~KuduColumnSpec() { + delete data_; +} + +KuduColumnSpec* KuduColumnSpec::Type(KuduColumnSchema::DataType type) { + data_->has_type = true; + data_->type = type; + return this; +} + +KuduColumnSpec* KuduColumnSpec::Default(KuduValue* v) { + 
data_->has_default = true; + delete data_->default_val; + data_->default_val = v; + return this; +} + +KuduColumnSpec* KuduColumnSpec::Compression( + KuduColumnStorageAttributes::CompressionType compression) { + data_->has_compression = true; + data_->compression = compression; + return this; +} + +KuduColumnSpec* KuduColumnSpec::Encoding( + KuduColumnStorageAttributes::EncodingType encoding) { + data_->has_encoding = true; + data_->encoding = encoding; + return this; +} + +KuduColumnSpec* KuduColumnSpec::BlockSize(int32_t block_size) { + data_->has_block_size = true; + data_->block_size = block_size; + return this; +} + +KuduColumnSpec* KuduColumnSpec::PrimaryKey() { + data_->primary_key = true; + return this; +} + +KuduColumnSpec* KuduColumnSpec::NotNull() { + data_->has_nullable = true; + data_->nullable = false; + return this; +} + +KuduColumnSpec* KuduColumnSpec::Nullable() { + data_->has_nullable = true; + data_->nullable = true; + return this; +} + +KuduColumnSpec* KuduColumnSpec::RemoveDefault() { + data_->remove_default = true; + return this; +} + +KuduColumnSpec* KuduColumnSpec::RenameTo(const std::string& new_name) { + data_->has_rename_to = true; + data_->rename_to = new_name; + return this; +} + +Status KuduColumnSpec::ToColumnSchema(KuduColumnSchema* col) const { + // Verify that the user isn't trying to use any methods that + // don't make sense for CREATE. + if (data_->has_rename_to) { + // TODO(KUDU-861): adjust these errors as this method will also be used for + // ALTER TABLE ADD COLUMN support. + return Status::NotSupported("cannot rename a column during CreateTable", + data_->name); + } + if (data_->remove_default) { + return Status::NotSupported("cannot remove default during CreateTable", + data_->name); + } + + if (!data_->has_type) { + return Status::InvalidArgument("no type provided for column", data_->name); + } + DataType internal_type = ToInternalDataType(data_->type); + + bool nullable = data_->has_nullable ? 
data_->nullable : true; + + void* default_val = nullptr; + // TODO: distinguish between DEFAULT NULL and no default? + if (data_->has_default) { + RETURN_NOT_OK(data_->default_val->data_->CheckTypeAndGetPointer( + data_->name, internal_type, &default_val)); + } + + + // Encoding and compression + KuduColumnStorageAttributes::EncodingType encoding = + KuduColumnStorageAttributes::AUTO_ENCODING; + if (data_->has_encoding) { + encoding = data_->encoding; + } + + KuduColumnStorageAttributes::CompressionType compression = + KuduColumnStorageAttributes::DEFAULT_COMPRESSION; + if (data_->has_compression) { + compression = data_->compression; + } + + int32_t block_size = 0; // '0' signifies server-side default + if (data_->has_block_size) { + block_size = data_->block_size; + } + + *col = KuduColumnSchema(data_->name, data_->type, nullable, + default_val, + KuduColumnStorageAttributes(encoding, compression, block_size)); + + return Status::OK(); +} + + +//////////////////////////////////////////////////////////// +// KuduSchemaBuilder +//////////////////////////////////////////////////////////// + +class KUDU_NO_EXPORT KuduSchemaBuilder::Data { + public: + Data() : has_key_col_names(false) { + } + + ~Data() { + // Rather than delete the specs here, we have to do it in + // ~KuduSchemaBuilder(), to avoid a circular dependency in the + // headers declaring friend classes with nested classes. + } + + bool has_key_col_names; + vector key_col_names; + + vector specs; +}; + +KuduSchemaBuilder::KuduSchemaBuilder() + : data_(new Data()) { +} + +KuduSchemaBuilder::~KuduSchemaBuilder() { + for (KuduColumnSpec* spec : data_->specs) { + // Can't use STLDeleteElements because KuduSchemaBuilder + // is a friend of KuduColumnSpec in order to access its destructor. + // STLDeleteElements is a free function and therefore can't access it. 
+ delete spec; + } + delete data_; +} + +KuduColumnSpec* KuduSchemaBuilder::AddColumn(const std::string& name) { + auto c = new KuduColumnSpec(name); + data_->specs.push_back(c); + return c; +} + +KuduSchemaBuilder* KuduSchemaBuilder::SetPrimaryKey( + const std::vector& key_col_names) { + data_->has_key_col_names = true; + data_->key_col_names = key_col_names; + return this; +} + +Status KuduSchemaBuilder::Build(KuduSchema* schema) { + vector cols; + cols.resize(data_->specs.size(), KuduColumnSchema()); + for (int i = 0; i < cols.size(); i++) { + RETURN_NOT_OK(data_->specs[i]->ToColumnSchema(&cols[i])); + } + + int num_key_cols; + + if (!data_->has_key_col_names) { + // If they didn't explicitly pass the column names for key, + // then they should have set it on exactly one column. + int single_key_col_idx = -1; + for (int i = 0; i < cols.size(); i++) { + if (data_->specs[i]->data_->primary_key) { + if (single_key_col_idx != -1) { + return Status::InvalidArgument("multiple columns specified for primary key", + Substitute("$0, $1", + cols[single_key_col_idx].name(), + cols[i].name())); + } + single_key_col_idx = i; + } + } + + if (single_key_col_idx == -1) { + return Status::InvalidArgument("no primary key specified"); + } + + // TODO: eventually allow primary keys which aren't the first column + if (single_key_col_idx != 0) { + return Status::InvalidArgument("primary key column must be the first column"); + } + + num_key_cols = 1; + } else { + // Build a map from name to index of all of the columns. + unordered_map name_to_idx_map; + int i = 0; + for (KuduColumnSpec* spec : data_->specs) { + // If they did pass the key column names, then we should not have explicitly + // set it on any columns. + if (spec->data_->primary_key) { + return Status::InvalidArgument("primary key specified by both SetPrimaryKey() and on a " + "specific column", spec->data_->name); + } + // If we have a duplicate column name, the Schema::Reset() will catch it later, + // anyway. 
+ name_to_idx_map[spec->data_->name] = i++; + } + + // Convert the key column names to a set of indexes. + vector key_col_indexes; + for (const string& key_col_name : data_->key_col_names) { + int idx; + if (!FindCopy(name_to_idx_map, key_col_name, &idx)) { + return Status::InvalidArgument("primary key column not defined", key_col_name); + } + key_col_indexes.push_back(idx); + } + + // Currently we require that the key columns be contiguous at the front + // of the schema. We'll lift this restriction later -- hence the more + // flexible user-facing API. + for (int i = 0; i < key_col_indexes.size(); i++) { + if (key_col_indexes[i] != i) { + return Status::InvalidArgument("primary key columns must be listed first in the schema", + data_->key_col_names[i]); + } + } + + num_key_cols = key_col_indexes.size(); + } + + RETURN_NOT_OK(schema->Reset(cols, num_key_cols)); + + return Status::OK(); +} + + +//////////////////////////////////////////////////////////// +// KuduColumnSchema +//////////////////////////////////////////////////////////// + +std::string KuduColumnSchema::DataTypeToString(DataType type) { + return DataType_Name(ToInternalDataType(type)); +} + +KuduColumnSchema::KuduColumnSchema(const std::string &name, + DataType type, + bool is_nullable, + const void* default_value, + KuduColumnStorageAttributes attributes) { + ColumnStorageAttributes attr_private; + attr_private.encoding = ToInternalEncodingType(attributes.encoding()); + attr_private.compression = ToInternalCompressionType(attributes.compression()); + col_ = new ColumnSchema(name, ToInternalDataType(type), is_nullable, + default_value, default_value, attr_private); +} + +KuduColumnSchema::KuduColumnSchema(const KuduColumnSchema& other) + : col_(nullptr) { + CopyFrom(other); +} + +KuduColumnSchema::KuduColumnSchema() : col_(nullptr) { +} + +KuduColumnSchema::~KuduColumnSchema() { + delete col_; +} + +KuduColumnSchema& KuduColumnSchema::operator=(const KuduColumnSchema& other) { + if (&other != this) { 
+ CopyFrom(other); + } + return *this; +} + +void KuduColumnSchema::CopyFrom(const KuduColumnSchema& other) { + delete col_; + if (other.col_) { + col_ = new ColumnSchema(*other.col_); + } else { + col_ = nullptr; + } +} + +bool KuduColumnSchema::Equals(const KuduColumnSchema& other) const { + return this == &other || + col_ == other.col_ || + (col_ != nullptr && col_->Equals(*other.col_, true)); +} + +const std::string& KuduColumnSchema::name() const { + return DCHECK_NOTNULL(col_)->name(); +} + +bool KuduColumnSchema::is_nullable() const { + return DCHECK_NOTNULL(col_)->is_nullable(); +} + +KuduColumnSchema::DataType KuduColumnSchema::type() const { + return FromInternalDataType(DCHECK_NOTNULL(col_)->type_info()->type()); +} + + +//////////////////////////////////////////////////////////// +// KuduSchema +//////////////////////////////////////////////////////////// + +KuduSchema::KuduSchema() + : schema_(nullptr) { +} + +KuduSchema::KuduSchema(const KuduSchema& other) + : schema_(nullptr) { + CopyFrom(other); +} + +KuduSchema::KuduSchema(const Schema& schema) + : schema_(new Schema(schema)) { +} + +KuduSchema::~KuduSchema() { + delete schema_; +} + +KuduSchema& KuduSchema::operator=(const KuduSchema& other) { + if (&other != this) { + CopyFrom(other); + } + return *this; +} + +void KuduSchema::CopyFrom(const KuduSchema& other) { + delete schema_; + schema_ = new Schema(*other.schema_); +} + +Status KuduSchema::Reset(const vector& columns, int key_columns) { + vector cols_private; + for (const KuduColumnSchema& col : columns) { + cols_private.push_back(*col.col_); + } + gscoped_ptr new_schema(new Schema()); + RETURN_NOT_OK(new_schema->Reset(cols_private, key_columns)); + + delete schema_; + schema_ = new_schema.release(); + return Status::OK(); +} + +bool KuduSchema::Equals(const KuduSchema& other) const { + return this == &other || + (schema_ && other.schema_ && schema_->Equals(*other.schema_)); +} + +KuduColumnSchema KuduSchema::Column(size_t idx) const { + 
ColumnSchema col(schema_->column(idx)); + KuduColumnStorageAttributes attrs(FromInternalEncodingType(col.attributes().encoding), + FromInternalCompressionType(col.attributes().compression)); + return KuduColumnSchema(col.name(), FromInternalDataType(col.type_info()->type()), + col.is_nullable(), col.read_default_value(), + attrs); +} + +KuduPartialRow* KuduSchema::NewRow() const { + return new KuduPartialRow(schema_); +} + +size_t KuduSchema::num_columns() const { + return schema_->num_columns(); +} + +size_t KuduSchema::num_key_columns() const { + return schema_->num_key_columns(); +} + +void KuduSchema::GetPrimaryKeyColumnIndexes(vector* indexes) const { + indexes->clear(); + indexes->resize(num_key_columns()); + for (int i = 0; i < num_key_columns(); i++) { + (*indexes)[i] = i; + } +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/schema.h b/src/kudu/client/schema.h new file mode 100644 index 000000000000..6bdeb0c55f35 --- /dev/null +++ b/src/kudu/client/schema.h @@ -0,0 +1,351 @@ + +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CLIENT_SCHEMA_H +#define KUDU_CLIENT_SCHEMA_H + +#include +#include + +#include "kudu/client/value.h" +#include "kudu/util/kudu_export.h" + +namespace kudu { + +class ColumnSchema; +class KuduPartialRow; +class Schema; +class TestWorkload; + +namespace tools { +class TsAdminClient; +} + +namespace client { + +namespace internal { +class GetTableSchemaRpc; +class LookupRpc; +class WriteRpc; +} // namespace internal + +class KuduClient; +class KuduSchema; +class KuduSchemaBuilder; +class KuduWriteOperation; + +class KUDU_EXPORT KuduColumnStorageAttributes { + public: + enum EncodingType { + AUTO_ENCODING = 0, + PLAIN_ENCODING = 1, + PREFIX_ENCODING = 2, + GROUP_VARINT = 3, + RLE = 4, + DICT_ENCODING = 5, + BIT_SHUFFLE = 6 + }; + + enum CompressionType { + DEFAULT_COMPRESSION = 0, + NO_COMPRESSION = 1, + SNAPPY = 2, + LZ4 = 3, + ZLIB = 4, + }; + + + // NOTE: this constructor is deprecated for external use, and will + // be made private in a future release. + KuduColumnStorageAttributes(EncodingType encoding = AUTO_ENCODING, + CompressionType compression = DEFAULT_COMPRESSION, + int32_t block_size = 0) + : encoding_(encoding), + compression_(compression), + block_size_(block_size) { + } + + const EncodingType encoding() const { + return encoding_; + } + + const CompressionType compression() const { + return compression_; + } + + std::string ToString() const; + + private: + EncodingType encoding_; + CompressionType compression_; + int32_t block_size_; +}; + +class KUDU_EXPORT KuduColumnSchema { + public: + enum DataType { + INT8 = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + STRING = 4, + BOOL = 5, + FLOAT = 6, + DOUBLE = 7, + BINARY = 8, + TIMESTAMP = 9 + }; + + static std::string DataTypeToString(DataType type); + + // DEPRECATED: use KuduSchemaBuilder instead. + // TODO(KUDU-809): make this hard-to-use constructor private. Clients should use + // the Builder API. Currently only the Python API uses this old API. 
+ KuduColumnSchema(const std::string &name, + DataType type, + bool is_nullable = false, + const void* default_value = NULL, + KuduColumnStorageAttributes attributes = KuduColumnStorageAttributes()); + KuduColumnSchema(const KuduColumnSchema& other); + ~KuduColumnSchema(); + + KuduColumnSchema& operator=(const KuduColumnSchema& other); + + void CopyFrom(const KuduColumnSchema& other); + + bool Equals(const KuduColumnSchema& other) const; + + // Getters to expose column schema information. + const std::string& name() const; + DataType type() const; + bool is_nullable() const; + + // TODO: Expose default column value and attributes? + + private: + friend class KuduColumnSpec; + friend class KuduSchema; + friend class KuduSchemaBuilder; + // KuduTableAlterer::Data needs to be a friend. Friending the parent class + // is transitive to nested classes. See http://tiny.cloudera.com/jwtui + friend class KuduTableAlterer; + + KuduColumnSchema(); + + // Owned. + ColumnSchema* col_; +}; + +// Builder API for specifying or altering a column within a table schema. +// This cannot be constructed directly, but rather is returned from +// KuduSchemaBuilder::AddColumn() to specify a column within a Schema. +// +// TODO(KUDU-861): this API will also be used for an improved AlterTable API. +class KUDU_EXPORT KuduColumnSpec { + public: + // Set the default value for this column. + // + // When adding a new column to a table, this default value will be used to + // fill the new column in all existing rows. + // + // When a user inserts data, if the user does not specify any value for + // this column, the default will also be used. + // + // The KuduColumnSpec takes ownership over 'value'. + KuduColumnSpec* Default(KuduValue* value); + + // Set the preferred compression for this column. + KuduColumnSpec* Compression(KuduColumnStorageAttributes::CompressionType compression); + + // Set the preferred encoding for this column. 
+ // Note that not all encodings are supported for all column types. + KuduColumnSpec* Encoding(KuduColumnStorageAttributes::EncodingType encoding); + + // Set the target block size for this column. + // + // This is the number of bytes of user data packed per block on disk, and + // represents the unit of IO when reading this column. Larger values + // may improve scan performance, particularly on spinning media. Smaller + // values may improve random access performance, particularly for workloads + // that have high cache hit rates or operate on fast storage such as SSD. + // + // Note that the block size specified here corresponds to uncompressed data. + // The actual size of the unit read from disk may be smaller if + // compression is enabled. + // + // It's recommended that this not be set any lower than 4096 (4KB) or higher + // than 1048576 (1MB). + // TODO(KUDU-1107): move above info to docs + KuduColumnSpec* BlockSize(int32_t block_size); + + // Operations only relevant for Create Table + // ------------------------------------------------------------ + + // Set this column to be the primary key of the table. + // + // This may only be used to set non-composite primary keys. If a composite + // key is desired, use KuduSchemaBuilder::SetPrimaryKey(). This may not be + // used in conjunction with KuduSchemaBuilder::SetPrimaryKey(). + // + // Only relevant for a CreateTable operation. Primary keys may not be changed + // after a table is created. + KuduColumnSpec* PrimaryKey(); + + // Set this column to be not nullable. + // Column nullability may not be changed once a table is created. + KuduColumnSpec* NotNull(); + + // Set this column to be nullable (the default). + // Column nullability may not be changed once a table is created. + KuduColumnSpec* Nullable(); + + // Set the type of this column. + // Column types may not be changed once a table is created. 
+ KuduColumnSpec* Type(KuduColumnSchema::DataType type); + + // Operations only relevant for Alter Table + // ------------------------------------------------------------ + + // Remove the default value for this column. Without a default, clients must + // always specify a value for this column when inserting data. + KuduColumnSpec* RemoveDefault(); + + // Rename this column. + KuduColumnSpec* RenameTo(const std::string& new_name); + + private: + class KUDU_NO_EXPORT Data; + friend class KuduSchemaBuilder; + friend class KuduTableAlterer; + + // This class should always be owned and deleted by one of its friends, + // not the user. + ~KuduColumnSpec(); + + explicit KuduColumnSpec(const std::string& col_name); + + Status ToColumnSchema(KuduColumnSchema* col) const; + + // Owned. + Data* data_; +}; + +// Builder API for constructing a KuduSchema object. +// The API here is a "fluent" style of programming, such that the resulting code +// looks somewhat like a SQL "CREATE TABLE" statement. For example: +// +// SQL: +// CREATE TABLE t ( +// my_key int not null primary key, +// a float default 1.5 +// ); +// +// is represented as: +// +// KuduSchemaBuilder t; +// t.AddColumn("my_key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); +// t.AddColumn("a")->Type(KuduColumnSchema::FLOAT)->Default(KuduValue::FromFloat(1.5)); +// KuduSchema schema; +// t.Build(&schema); +// +class KUDU_EXPORT KuduSchemaBuilder { + public: + KuduSchemaBuilder(); + ~KuduSchemaBuilder(); + + // Return a KuduColumnSpec for a new column within the Schema. + // The returned object is owned by the KuduSchemaBuilder. + KuduColumnSpec* AddColumn(const std::string& name); + + // Set the primary key of the new Schema based on the given column names. + // This may be used to specify a compound primary key. + KuduSchemaBuilder* SetPrimaryKey(const std::vector& key_col_names); + + // Resets 'schema' to the result of this builder. 
+ // + // If the Schema is invalid for any reason (eg missing types, duplicate column names, etc) + // a bad Status will be returned. + Status Build(KuduSchema* schema); + + private: + class KUDU_NO_EXPORT Data; + // Owned. + Data* data_; +}; + +class KUDU_EXPORT KuduSchema { + public: + KuduSchema(); + + KuduSchema(const KuduSchema& other); + ~KuduSchema(); + + KuduSchema& operator=(const KuduSchema& other); + void CopyFrom(const KuduSchema& other); + + // DEPRECATED: will be removed soon. + Status Reset(const std::vector& columns, int key_columns) + WARN_UNUSED_RESULT; + + bool Equals(const KuduSchema& other) const; + KuduColumnSchema Column(size_t idx) const; + size_t num_columns() const; + + // Get the indexes of the primary key columns within this Schema. + // In current versions of Kudu, these will always be contiguous column + // indexes starting with 0. However, in future versions this assumption + // may not hold, so callers should not assume it is the case. + void GetPrimaryKeyColumnIndexes(std::vector* indexes) const; + + // Create a new row corresponding to this schema. + // + // The new row refers to this KuduSchema object, so must be destroyed before + // the KuduSchema object. + // + // The caller takes ownership of the created row. + KuduPartialRow* NewRow() const; + + private: + friend class KuduClient; + friend class KuduScanner; + friend class KuduSchemaBuilder; + friend class KuduTable; + friend class KuduTableCreator; + friend class KuduWriteOperation; + friend class internal::GetTableSchemaRpc; + friend class internal::LookupRpc; + friend class internal::WriteRpc; + friend class kudu::tools::TsAdminClient; + + friend KuduSchema KuduSchemaFromSchema(const Schema& schema); + + + // For use by kudu tests. + explicit KuduSchema(const Schema& schema); + + // Private since we don't want users to rely on the first N columns + // being the keys. + size_t num_key_columns() const; + + // Owned. 
+ Schema* schema_; +}; + +} // namespace client +} // namespace kudu +#endif // KUDU_CLIENT_SCHEMA_H diff --git a/src/kudu/client/session-internal.cc b/src/kudu/client/session-internal.cc new file mode 100644 index 000000000000..062615f60eb1 --- /dev/null +++ b/src/kudu/client/session-internal.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/session-internal.h" + +#include "kudu/client/batcher.h" +#include "kudu/client/error_collector.h" +#include "kudu/client/shared_ptr.h" + +namespace kudu { + +namespace client { + +using internal::Batcher; +using internal::ErrorCollector; + +using sp::shared_ptr; + +KuduSession::Data::Data(shared_ptr client) + : client_(std::move(client)), + error_collector_(new ErrorCollector()), + flush_mode_(AUTO_FLUSH_SYNC), + external_consistency_mode_(CLIENT_PROPAGATED), + timeout_ms_(-1) { +} + +KuduSession::Data::~Data() { +} + +void KuduSession::Data::Init(const shared_ptr& session) { + lock_guard l(&lock_); + CHECK(!batcher_); + NewBatcher(session, NULL); +} + +void KuduSession::Data::NewBatcher(const shared_ptr& session, + scoped_refptr* old_batcher) { + DCHECK(lock_.is_locked()); + + scoped_refptr batcher( + new Batcher(client_.get(), error_collector_.get(), session, + external_consistency_mode_)); + if (timeout_ms_ != -1) { + batcher->SetTimeoutMillis(timeout_ms_); + } + batcher.swap(batcher_); + + if (old_batcher) { + old_batcher->swap(batcher); + } +} + +void KuduSession::Data::FlushFinished(Batcher* batcher) { + lock_guard l(&lock_); + CHECK_EQ(flushed_batchers_.erase(batcher), 1); +} + +Status KuduSession::Data::Close(bool force) { + if (batcher_->HasPendingOperations() && !force) { + return Status::IllegalState("Could not close. There are pending operations."); + } + batcher_->Abort(); + return Status::OK(); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/session-internal.h b/src/kudu/client/session-internal.h new file mode 100644 index 000000000000..d8393143cafd --- /dev/null +++ b/src/kudu/client/session-internal.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_SESSION_INTERNAL_H +#define KUDU_CLIENT_SESSION_INTERNAL_H + +#include + +#include "kudu/client/client.h" +#include "kudu/util/locks.h" + +namespace kudu { + +namespace client { + +namespace internal { +class Batcher; +class ErrorCollector; +} // internal + +class KuduSession::Data { + public: + explicit Data(sp::shared_ptr client); + ~Data(); + + void Init(const sp::shared_ptr& session); + + // Swap in a new Batcher instance, returning the old one in '*old_batcher', unless it is + // NULL. + void NewBatcher(const sp::shared_ptr& session, + scoped_refptr* old_batcher); + + // Called by Batcher when a flush has finished. + void FlushFinished(internal::Batcher* b); + + // Returns Status::IllegalState() if 'force' is false and there are still pending + // operations. If 'force' is true batcher_ is aborted even if there are pending + // operations. + Status Close(bool force); + + // The client that this session is associated with. + const sp::shared_ptr client_; + + // Lock protecting internal state. + // Note that this lock should not be taken if the thread is already holding + // a Batcher lock. This must be acquired first. + mutable simple_spinlock lock_; + + // Buffer for errors. + scoped_refptr error_collector_; + + // The current batcher being prepared. + scoped_refptr batcher_; + + // Any batchers which have been flushed but not yet finished. 
+ // + // Upon a batch finishing, it will call FlushFinished(), which removes the batcher from + // this set. This set does not hold any reference count to the Batcher, since, while + // the flush is active, the batcher manages its own refcount. The Batcher will always + // call FlushFinished() before it destructs itself, so we're guaranteed that these + // pointers stay valid. + std::unordered_set flushed_batchers_; + + FlushMode flush_mode_; + kudu::client::KuduSession::ExternalConsistencyMode external_consistency_mode_; + + // Timeout for the next batch. + int timeout_ms_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/shared_ptr.h b/src/kudu/client/shared_ptr.h new file mode 100644 index 000000000000..2cc8c167e96b --- /dev/null +++ b/src/kudu/client/shared_ptr.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CLIENT_SHARED_PTR_H +#define KUDU_CLIENT_SHARED_PTR_H + +// Kudu uses c++11 features internally, but provides a client interface which +// does not require c++11. We use std::tr1::shared_ptr in our public interface +// to hold shared instances of KuduClient, KuduSession, and KuduTable. 
+// +// Unfortunately, on OS X, libc++ is the default c++ standard library +// implementation and is required when compiling with c++11, but it does not +// include the tr1 APIs. As a workaround, we use std::shared_ptr on OS X, since +// OS X is for development only, and it is acceptable to require clients to +// compile with c++11. +// +// In order to allow applications to compile against Kudu on both Linux and OS +// X, we provide this typedef which resolves to std::tr1::shared_ptr on Linux +// and std::shared_ptr on OS X. Clients are encouraged to use these typedefs in +// order to ensure that applications will compile on both Linux and OS X. + +#if defined(__APPLE__) +#include + +namespace kudu { +namespace client { +namespace sp { + using std::shared_ptr; + using std::weak_ptr; + using std::enable_shared_from_this; +} +} +} + +#else +#include + +namespace kudu { +namespace client { +namespace sp { + using std::tr1::shared_ptr; + using std::tr1::weak_ptr; + using std::tr1::enable_shared_from_this; +} +} +} +#endif + +#endif // KUDU_CLIENT_SHARED_PTR_H diff --git a/src/kudu/client/stubs.h b/src/kudu/client/stubs.h new file mode 100644 index 000000000000..52acc6f2733a --- /dev/null +++ b/src/kudu/client/stubs.h @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_STUBS_H +#define KUDU_CLIENT_STUBS_H + +#include // for exit() + +#include + +// +// GCC can be told that a certain branch is not likely to be taken (for +// instance, a CHECK failure), and use that information in static analysis. +// Giving it this information can help it optimize for the common case in +// the absence of better information (ie. -fprofile-arcs). +// +#ifndef PREDICT_FALSE +#if defined(__GNUC__) +#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#else +#define PREDICT_FALSE(x) x +#endif +#endif +#ifndef PREDICT_TRUE +#if defined(__GNUC__) +#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#else +#define PREDICT_TRUE(x) x +#endif +#endif + +// Annotate a function indicating the caller must examine the return value. +// Use like: +// int foo() WARN_UNUSED_RESULT; +// To explicitly ignore a result, see |ignore_result()| in . +#ifndef WARN_UNUSED_RESULT +#if defined(__GNUC__) +#define WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#else +#define WARN_UNUSED_RESULT +#endif +#endif + +#if (defined(__GNUC__) || defined(__APPLE__)) && !defined(SWIG) +#undef ATTRIBUTE_UNUSED +#define ATTRIBUTE_UNUSED __attribute__ ((unused)) +#else +#ifndef ATTRIBUTE_UNUSED +#define ATTRIBUTE_UNUSED +#endif +#endif + +#ifndef COMPILE_ASSERT +// The COMPILE_ASSERT macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// COMPILE_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. 
+ +template +struct StubsCompileAssert { +}; + +#define COMPILE_ASSERT(expr, msg) \ + typedef StubsCompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] ATTRIBUTE_UNUSED // NOLINT(*) +#endif + +// Annotate a virtual method indicating it must be overriding a virtual +// method in the parent class. +// Use like: +// virtual void foo() OVERRIDE; +#ifndef OVERRIDE +#if defined(COMPILER_MSVC) +#define OVERRIDE override +#elif defined(__clang__) +#define OVERRIDE override +#elif defined(COMPILER_GCC) && __cplusplus >= 201103 && \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) >= 40700 +// GCC 4.7 supports explicit virtual overrides when C++11 support is enabled. +#define OVERRIDE override +#else +#define OVERRIDE +#endif +#endif + +#ifndef DISALLOW_COPY_AND_ASSIGN +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) +#endif + +#ifndef FRIEND_TEST +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test +#endif + +// Stubbed versions of macros defined in glog/logging.h, intended for +// environments where glog headers aren't available. +// +// Add more as needed. 
+ +#define KUDU_DCHECK(condition) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_EQ(val1, val2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_NE(val1, val2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_LE(val1, val2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_LT(val1, val2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_GE(val1, val2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_GT(val1, val2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_NOTNULL(val) (val) +#define KUDU_DCHECK_STREQ(str1, str2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_STRCASEEQ(str1, str2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_STRNE(str1, str2) while (false) kudu::internal_logging::NullLog() +#define KUDU_DCHECK_STRCASENE(str1, str2) while (false) kudu::internal_logging::NullLog() + +// Log levels. LOG ignores them, so their values are arbitrary. + +#define KUDU_INFO 0 +#define KUDU_WARNING 1 +#define KUDU_ERROR 2 +#define KUDU_FATAL 3 + +#ifdef NDEBUG +#define KUDU_DFATAL KUDU_WARNING +#else +#define KUDU_DFATAL KUDU_FATAL +#endif // NDEBUG + +#define KUDU_LOG_INTERNAL(level) kudu::internal_logging::CerrLog(level) +#define KUDU_LOG(level) KUDU_LOG_INTERNAL(KUDU_##level) + +#define KUDU_CHECK(condition) \ + (condition) ? 
0 : KUDU_LOG(FATAL) << "Check failed: " #condition " " + +namespace kudu { + +namespace internal_logging { + +class NullLog { + public: + template + NullLog& operator<<(const T& t) { + return *this; + } +}; + +class CerrLog { + public: + CerrLog(int severity) // NOLINT(runtime/explicit) + : severity_(severity), + has_logged_(false) { + } + + ~CerrLog() { + if (has_logged_) { + std::cerr << std::endl; + } + if (severity_ == KUDU_FATAL) { + exit(1); + } + } + + template + CerrLog& operator<<(const T& t) { + has_logged_ = true; + std::cerr << t; + return *this; + } + + private: + const int severity_; + bool has_logged_; +}; + +} // namespace internal_logging +} // namespace kudu + +#endif diff --git a/src/kudu/client/symbols.map b/src/kudu/client/symbols.map new file mode 100644 index 000000000000..0f089213a9fe --- /dev/null +++ b/src/kudu/client/symbols.map @@ -0,0 +1,37 @@ +{ + # Symbols marked as 'local' are not exported by the DSO and thus may not + # be used by client applications. + local: + # libunwind + _ULx86_64_*; + _Ux86_64_*; + _U_dyn_info_list_addr; + unw_backtrace; + + # libev + ev_*; + + # zlib + adler32*; + crc32*; + get_crc_table; + inflate*; + zError; + zlib*; + + # devtoolset + __cxa_throw_bad_array*; + + extern "C++" { + # glog, gflags, and protobuf + *google::*; + fL*::FLAGS_*; + gflags_mutex_namespace::*; + glog_internal_namespace_::*; + + # devtoolset - the Red Hat devtoolset statically links c++11 symbols + # into binaries so that the result may be executed on a system with an + # older libstdc++ which doesn't include the necessary c++11 symbols. + std::*; + }; +}; diff --git a/src/kudu/client/table-internal.cc b/src/kudu/client/table-internal.cc new file mode 100644 index 000000000000..ca97480a7c2d --- /dev/null +++ b/src/kudu/client/table-internal.cc @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/table-internal.h" + +#include + +#include "kudu/client/client-internal.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/util/monotime.h" + +namespace kudu { + +using master::GetTableLocationsRequestPB; +using master::GetTableLocationsResponsePB; +using rpc::RpcController; +using std::string; + +namespace client { + +using sp::shared_ptr; + +KuduTable::Data::Data(shared_ptr client, + string name, + string id, + const KuduSchema& schema, + PartitionSchema partition_schema) + : client_(std::move(client)), + name_(std::move(name)), + id_(std::move(id)), + schema_(schema), + partition_schema_(std::move(partition_schema)) { +} + +KuduTable::Data::~Data() { +} + +Status KuduTable::Data::Open() { + // TODO: fetch the schema from the master here once catalog is available. 
+ GetTableLocationsRequestPB req; + GetTableLocationsResponsePB resp; + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(client_->default_admin_operation_timeout()); + + req.mutable_table()->set_table_id(id_); + Status s; + // TODO: replace this with Async RPC-retrier based RPC in the next revision, + // adding exponential backoff and allowing this to be used safely in a + // reactor thread. + while (true) { + RpcController rpc; + + // Have we already exceeded our deadline? + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (deadline.ComesBefore(now)) { + const char* msg = "OpenTable timed out after deadline expired"; + LOG(ERROR) << msg; + return Status::TimedOut(msg); + } + + // See KuduClient::Data::SyncLeaderMasterRpc(). + MonoTime rpc_deadline = now; + rpc_deadline.AddDelta(client_->default_rpc_timeout()); + rpc.set_deadline(MonoTime::Earliest(rpc_deadline, deadline)); + + s = client_->data_->master_proxy()->GetTableLocations(req, &resp, &rpc); + if (!s.ok()) { + // Various conditions cause us to look for the leader master again. + // It's ok if that eventually fails; we'll retry over and over until + // the deadline is reached. + + if (s.IsNetworkError()) { + LOG(WARNING) << "Network error talking to the leader master (" + << client_->data_->leader_master_hostport().ToString() << "): " + << s.ToString(); + if (client_->IsMultiMaster()) { + LOG(INFO) << "Determining the leader master again and retrying."; + WARN_NOT_OK(client_->data_->SetMasterServerProxy(client_.get(), deadline), + "Failed to determine new Master"); + continue; + } + } + + if (s.IsTimedOut() + && MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + // If the RPC timed out and the operation deadline expired, we'll loop + // again and time out for good above. 
+ LOG(WARNING) << "Timed out talking to the leader master (" + << client_->data_->leader_master_hostport().ToString() << "): " + << s.ToString(); + if (client_->IsMultiMaster()) { + LOG(INFO) << "Determining the leader master again and retrying."; + WARN_NOT_OK(client_->data_->SetMasterServerProxy(client_.get(), deadline), + "Failed to determine new Master"); + continue; + } + } + } + if (s.ok() && resp.has_error()) { + if (resp.error().code() == master::MasterErrorPB::NOT_THE_LEADER || + resp.error().code() == master::MasterErrorPB::CATALOG_MANAGER_NOT_INITIALIZED) { + LOG(WARNING) << "Master " << client_->data_->leader_master_hostport().ToString() + << " is no longer the leader master."; + if (client_->IsMultiMaster()) { + LOG(INFO) << "Determining the leader master again and retrying."; + WARN_NOT_OK(client_->data_->SetMasterServerProxy(client_.get(), deadline), + "Failed to determine new Master"); + continue; + } + } + if (s.ok()) { + s = StatusFromPB(resp.error().status()); + } + } + if (!s.ok()) { + LOG(WARNING) << "Error getting table locations: " << s.ToString() << ", retrying."; + continue; + } + if (resp.tablet_locations_size() > 0) { + break; + } + + /* TODO: Use exponential backoff instead */ + base::SleepForMilliseconds(100); + } + + VLOG(1) << "Open Table " << name_ << ", found " << resp.tablet_locations_size() << " tablets"; + return Status::OK(); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/table-internal.h b/src/kudu/client/table-internal.h new file mode 100644 index 000000000000..0a56f0de1665 --- /dev/null +++ b/src/kudu/client/table-internal.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_TABLE_INTERNAL_H +#define KUDU_CLIENT_TABLE_INTERNAL_H + +#include + +#include "kudu/common/partition.h" +#include "kudu/client/client.h" + +namespace kudu { + +namespace client { + +class KuduTable::Data { + public: + Data(sp::shared_ptr client, + std::string name, + std::string table_id, + const KuduSchema& schema, + PartitionSchema partition_schema); + ~Data(); + + Status Open(); + + sp::shared_ptr client_; + + std::string name_; + const std::string id_; + + // TODO: figure out how we deal with a schema change from the client perspective. + // Do we make them call a RefreshSchema() method? Or maybe reopen the table and get + // a new KuduTable instance (which would simplify the object lifecycle a little?) + const KuduSchema schema_; + const PartitionSchema partition_schema_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/table_alterer-internal.cc b/src/kudu/client/table_alterer-internal.cc new file mode 100644 index 000000000000..178bb8fc268f --- /dev/null +++ b/src/kudu/client/table_alterer-internal.cc @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/table_alterer-internal.h" + +#include + +#include "kudu/client/schema.h" +#include "kudu/client/schema-internal.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/master/master.pb.h" + +using std::string; + +namespace kudu { +namespace client { + +using master::AlterTableRequestPB; +using master::AlterTableRequestPB_AlterColumn; + +KuduTableAlterer::Data::Data(KuduClient* client, string name) + : client_(client), + table_name_(std::move(name)), + wait_(true) { +} + +KuduTableAlterer::Data::~Data() { + for (Step& s : steps_) { + delete s.spec; + } +} + +Status KuduTableAlterer::Data::ToRequest(AlterTableRequestPB* req) { + if (!status_.ok()) { + return status_; + } + + if (!rename_to_.is_initialized() && + steps_.empty()) { + return Status::InvalidArgument("No alter steps provided"); + } + + req->Clear(); + req->mutable_table()->set_table_name(table_name_); + if (rename_to_.is_initialized()) { + req->set_new_table_name(rename_to_.get()); + } + + for (const Step& s : steps_) { + AlterTableRequestPB::Step* pb_step = req->add_alter_schema_steps(); + pb_step->set_type(s.step_type); + + switch (s.step_type) { + case AlterTableRequestPB::ADD_COLUMN: + { + KuduColumnSchema col; + RETURN_NOT_OK(s.spec->ToColumnSchema(&col)); + ColumnSchemaToPB(*col.col_, + pb_step->mutable_add_column()->mutable_schema()); + break; + } + case AlterTableRequestPB::DROP_COLUMN: + { 
+ pb_step->mutable_drop_column()->set_name(s.spec->data_->name); + break; + } + case AlterTableRequestPB::ALTER_COLUMN: + // TODO(KUDU-861): support altering a column in the wire protocol. + // For now, we just give an error if the caller tries to do + // any operation other than rename. + if (s.spec->data_->has_type || + s.spec->data_->has_encoding || + s.spec->data_->has_compression || + s.spec->data_->has_nullable || + s.spec->data_->primary_key || + s.spec->data_->has_default || + s.spec->data_->default_val || + s.spec->data_->remove_default) { + return Status::NotSupported("cannot support AlterColumn of this type", + s.spec->data_->name); + } + // We only support rename column + if (!s.spec->data_->has_rename_to) { + return Status::InvalidArgument("no alter operation specified", + s.spec->data_->name); + } + pb_step->mutable_rename_column()->set_old_name(s.spec->data_->name); + pb_step->mutable_rename_column()->set_new_name(s.spec->data_->rename_to); + pb_step->set_type(AlterTableRequestPB::RENAME_COLUMN); + break; + default: + LOG(FATAL) << "unknown step type " << s.step_type; + } + } + + return Status::OK(); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/table_alterer-internal.h b/src/kudu/client/table_alterer-internal.h new file mode 100644 index 000000000000..aa0c641afa97 --- /dev/null +++ b/src/kudu/client/table_alterer-internal.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_TABLE_ALTERER_INTERNAL_H +#define KUDU_CLIENT_TABLE_ALTERER_INTERNAL_H + +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/master/master.pb.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace master { +class AlterTableRequestPB_AlterColumn; +} // namespace master +namespace client { + +class KuduColumnSpec; + +class KuduTableAlterer::Data { + public: + Data(KuduClient* client, std::string name); + ~Data(); + Status ToRequest(master::AlterTableRequestPB* req); + + + KuduClient* const client_; + const std::string table_name_; + + Status status_; + + struct Step { + master::AlterTableRequestPB::StepType step_type; + + // Owned by KuduTableAlterer::Data. + KuduColumnSpec *spec; + }; + std::vector steps_; + + MonoDelta timeout_; + + bool wait_; + + boost::optional rename_to_; + + private: + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/table_creator-internal.cc b/src/kudu/client/table_creator-internal.cc new file mode 100644 index 000000000000..f1b8b428af19 --- /dev/null +++ b/src/kudu/client/table_creator-internal.cc @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/client/table_creator-internal.h" + + +#include "kudu/gutil/stl_util.h" + +namespace kudu { + +namespace client { + +KuduTableCreator::Data::Data(KuduClient* client) + : client_(client), + schema_(nullptr), + num_replicas_(0), + wait_(true) { +} + +KuduTableCreator::Data::~Data() { + STLDeleteElements(&split_rows_); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/table_creator-internal.h b/src/kudu/client/table_creator-internal.h new file mode 100644 index 000000000000..7aff47a417d7 --- /dev/null +++ b/src/kudu/client/table_creator-internal.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CLIENT_TABLE_CREATOR_INTERNAL_H +#define KUDU_CLIENT_TABLE_CREATOR_INTERNAL_H + +#include +#include + +#include "kudu/client/client.h" +#include "kudu/common/common.pb.h" + +namespace kudu { + +namespace client { + +class KuduTableCreator::Data { + public: + explicit Data(KuduClient* client); + ~Data(); + + KuduClient* client_; + + std::string table_name_; + + const KuduSchema* schema_; + + std::vector split_rows_; + + PartitionSchemaPB partition_schema_; + + int num_replicas_; + + MonoDelta timeout_; + + bool wait_; + + DISALLOW_COPY_AND_ASSIGN(Data); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/client/tablet_server-internal.cc b/src/kudu/client/tablet_server-internal.cc new file mode 100644 index 000000000000..d5ad1c73b86d --- /dev/null +++ b/src/kudu/client/tablet_server-internal.cc @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/client/tablet_server-internal.h" + +using std::string; + +namespace kudu { +namespace client { + +KuduTabletServer::Data::Data(string uuid, string hostname) + : uuid_(std::move(uuid)), + hostname_(std::move(hostname)) { +} + +KuduTabletServer::Data::~Data() { +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/tablet_server-internal.h b/src/kudu/client/tablet_server-internal.h new file mode 100644 index 000000000000..edc903fa914d --- /dev/null +++ b/src/kudu/client/tablet_server-internal.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// Pimpl state for KuduTabletServer: the immutable identity of a single
// tablet server.
class KuduTabletServer::Data {
 public:
  Data(std::string uuid, std::string hostname);
  ~Data();

  // Permanent UUID of the tablet server.
  const std::string uuid_;
  // Hostname of the tablet server.
  const std::string hostname_;

  DISALLOW_COPY_AND_ASSIGN(Data);
};
// Pimpl state for KuduValue: a tagged union holding one typed constant.
// SLICE values point at a heap buffer owned by the enclosing KuduValue
// (allocated in CopyString, freed in ~KuduValue).
class KuduValue::Data {
 public:
  // Storage class of the constant. Note that booleans are stored in the
  // INT slot as 0/1 (see KuduValue::FromBool).
  enum Type {
    INT,
    FLOAT,
    DOUBLE,
    SLICE
  };
  Type type_;
  // Active member is selected by type_; SLICE uses slice_val_ below.
  union {
    int64_t int_val_;
    float float_val_;
    double double_val_;
  };
  Slice slice_val_;

  // Check that this value can be converted to the given datatype 't',
  // and return a pointer to the underlying value in '*val_void'.
  //
  // 'col_name' is used to generate reasonable error messages in the case
  // that the type cannot be coerced.
  //
  // The returned pointer in *val_void is only guaranteed to live as long
  // as this KuduValue object.
  Status CheckTypeAndGetPointer(const std::string& col_name,
                                DataType t,
                                void** val_void);

 private:
  // Check that this value has the expected type 'type', returning
  // a nice error Status if not.
  Status CheckValType(const std::string& col_name,
                      KuduValue::Data::Type type,
                      const char* type_str) const;

  // Check that this value is a boolean constant, and set *val_void to
  // point to it if so.
  Status CheckAndPointToBool(const std::string& col_name, void** val_void);

  // Check that this value is an integer constant within the valid range,
  // and set *val_void to point to it if so.
  Status CheckAndPointToInt(const std::string& col_name,
                            size_t int_size, void** val_void);

  // Check that this value is a string constant, and set *val_void to
  // point to it if so.
  Status CheckAndPointToString(const std::string& col_name,
                               void** val_void);
};
+ +#include "kudu/client/value.h" +#include "kudu/client/value-internal.h" +#include "kudu/gutil/strings/substitute.h" +#include + +using std::string; +using strings::Substitute; + +namespace kudu { +namespace client { + +KuduValue::KuduValue(Data* d) +: data_(d) { +} + +KuduValue::~KuduValue() { + if (data_->type_ == Data::SLICE) { + delete[] data_->slice_val_.data(); + } + delete data_; +} + +KuduValue* KuduValue::Clone() const { + switch (data_->type_) { + case Data::INT: + return KuduValue::FromInt(data_->int_val_); + case Data::DOUBLE: + return KuduValue::FromDouble(data_->double_val_); + case Data::FLOAT: + return KuduValue::FromFloat(data_->float_val_); + case Data::SLICE: + return KuduValue::CopyString(data_->slice_val_); + } + LOG(FATAL); +} + +KuduValue* KuduValue::FromInt(int64_t v) { + auto d = new Data; + d->type_ = Data::INT; + d->int_val_ = v; + + return new KuduValue(d); +} + +KuduValue* KuduValue::FromDouble(double v) { + auto d = new Data; + d->type_ = Data::DOUBLE; + d->double_val_ = v; + + return new KuduValue(d); +} + + +KuduValue* KuduValue::FromFloat(float v) { + auto d = new Data; + d->type_ = Data::FLOAT; + d->float_val_ = v; + + return new KuduValue(d); +} + +KuduValue* KuduValue::FromBool(bool v) { + auto d = new Data; + d->type_ = Data::INT; + d->int_val_ = v ? 
1 : 0; + + return new KuduValue(d); +} + +KuduValue* KuduValue::CopyString(Slice s) { + auto copy = new uint8_t[s.size()]; + memcpy(copy, s.data(), s.size()); + + auto d = new Data; + d->type_ = Data::SLICE; + d->slice_val_ = Slice(copy, s.size()); + + return new KuduValue(d); +} + +Status KuduValue::Data::CheckTypeAndGetPointer(const string& col_name, + DataType t, + void** val_void) { + const TypeInfo* ti = GetTypeInfo(t); + switch (ti->physical_type()) { + case kudu::INT8: + case kudu::INT16: + case kudu::INT32: + case kudu::INT64: + RETURN_NOT_OK(CheckAndPointToInt(col_name, ti->size(), val_void)); + break; + + case kudu::BOOL: + RETURN_NOT_OK(CheckAndPointToBool(col_name, val_void)); + break; + + case kudu::FLOAT: + RETURN_NOT_OK(CheckValType(col_name, KuduValue::Data::FLOAT, "float")); + *val_void = &float_val_; + break; + + case kudu::DOUBLE: + RETURN_NOT_OK(CheckValType(col_name, KuduValue::Data::DOUBLE, "double")); + *val_void = &double_val_; + break; + + case kudu::BINARY: + RETURN_NOT_OK(CheckAndPointToString(col_name, val_void)); + break; + + default: + return Status::InvalidArgument(Substitute("cannot determine value for column $0 (type $1)", + col_name, ti->name())); + } + return Status::OK(); +} + +Status KuduValue::Data::CheckValType(const string& col_name, + KuduValue::Data::Type type, + const char* type_str) const { + if (type_ != type) { + return Status::InvalidArgument( + Substitute("non-$0 value for $0 column $1", type_str, col_name)); + } + return Status::OK(); +} + +Status KuduValue::Data::CheckAndPointToBool(const string& col_name, + void** val_void) { + RETURN_NOT_OK(CheckValType(col_name, KuduValue::Data::INT, "bool")); + int64_t int_val = int_val_; + if (int_val != 0 && int_val != 1) { + return Status::InvalidArgument( + Substitute("value $0 out of range for boolean column '$1'", + int_val, col_name)); + } + *val_void = &int_val_; + return Status::OK(); +} + +Status KuduValue::Data::CheckAndPointToInt(const string& col_name, + size_t 
int_size, + void** val_void) { + RETURN_NOT_OK(CheckValType(col_name, KuduValue::Data::INT, "int")); + + int64_t int_min, int_max; + if (int_size == 8) { + int_min = MathLimits::kMin; + int_max = MathLimits::kMax; + } else { + size_t int_bits = int_size * 8 - 1; + int_max = (1LL << int_bits) - 1; + int_min = -int_max - 1; + } + + int64_t int_val = int_val_; + if (int_val < int_min || int_val > int_max) { + return Status::InvalidArgument( + Substitute("value $0 out of range for $1-bit signed integer column '$2'", + int_val, int_size * 8, col_name)); + } + + *val_void = &int_val_; + return Status::OK(); +} + +Status KuduValue::Data::CheckAndPointToString(const string& col_name, + void** val_void) { + RETURN_NOT_OK(CheckValType(col_name, KuduValue::Data::SLICE, "string")); + *val_void = &slice_val_; + return Status::OK(); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/value.h b/src/kudu/client/value.h new file mode 100644 index 000000000000..5ec4031e8fdf --- /dev/null +++ b/src/kudu/client/value.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// A constant cell value with a specific type.
// All factory functions return a heap-allocated object which the caller owns.
class KUDU_EXPORT KuduValue {
 public:
  // Return a new identical KuduValue object.
  KuduValue* Clone() const;

  // Construct a KuduValue from the given integer.
  static KuduValue* FromInt(int64_t v);

  // Construct a KuduValue from the given float.
  static KuduValue* FromFloat(float f);

  // Construct a KuduValue from the given double.
  static KuduValue* FromDouble(double d);

  // Construct a KuduValue from the given bool.
  static KuduValue* FromBool(bool b);

  // Construct a KuduValue by copying the value of the given Slice.
  // The KuduValue's lifetime is independent of the caller's buffer.
  static KuduValue* CopyString(Slice s);

  ~KuduValue();
 private:
  friend class ComparisonPredicateData;
  friend class KuduColumnSpec;

  // Hidden pimpl type; not part of the exported ABI.
  class KUDU_NO_EXPORT Data;
  explicit KuduValue(Data* d);

  // Owned.
  Data* data_;

  DISALLOW_COPY_AND_ASSIGN(KuduValue);
};
// Map a client-facing KuduWriteOperation::Type to the equivalent
// RowOperationsPB type used on the wire. Crashes (LOG(FATAL)) on an
// unknown type.
RowOperationsPB_Type ToInternalWriteType(KuduWriteOperation::Type type);
+ +#include "kudu/client/write_op.h" + +#include "kudu/client/client.h" +#include "kudu/common/encoded_key.h" +#include "kudu/common/row.h" +#include "kudu/common/wire_protocol.pb.h" + +namespace kudu { +namespace client { + +using sp::shared_ptr; + +RowOperationsPB_Type ToInternalWriteType(KuduWriteOperation::Type type) { + switch (type) { + case KuduWriteOperation::INSERT: return RowOperationsPB_Type_INSERT; + case KuduWriteOperation::UPDATE: return RowOperationsPB_Type_UPDATE; + case KuduWriteOperation::DELETE: return RowOperationsPB_Type_DELETE; + default: LOG(FATAL) << "Unexpected write operation type: " << type; + } +} + +// WriteOperation -------------------------------------------------------------- + +KuduWriteOperation::KuduWriteOperation(const shared_ptr& table) + : table_(table), + row_(table->schema().schema_) { +} + +KuduWriteOperation::~KuduWriteOperation() {} + +EncodedKey* KuduWriteOperation::CreateKey() const { + CHECK(row_.IsKeySet()) << "key must be set"; + + ConstContiguousRow row(row_.schema(), row_.row_data_); + EncodedKeyBuilder kb(row.schema()); + for (int i = 0; i < row.schema()->num_key_columns(); i++) { + kb.AddColumnKey(row.cell_ptr(i)); + } + gscoped_ptr key(kb.BuildEncodedKey()); + return key.release(); +} + +int64_t KuduWriteOperation::SizeInBuffer() const { + const Schema* schema = row_.schema(); + int size = 1; // for the operation type + + // Add size of isset bitmap (always present). 
+ size += BitmapSize(schema->num_columns()); + // Add size of null bitmap (present if the schema has nullables) + size += ContiguousRowHelper::null_bitmap_size(*schema); + // The column data itself: + for (int i = 0; i < schema->num_columns(); i++) { + if (row_.IsColumnSet(i) && !row_.IsNull(i)) { + size += schema->column(i).type_info()->size(); + if (schema->column(i).type_info()->physical_type() == BINARY) { + ContiguousRow row(schema, row_.row_data_); + Slice bin; + memcpy(&bin, row.cell_ptr(i), sizeof(bin)); + size += bin.size(); + } + } + } + return size; +} + +// Insert ----------------------------------------------------------------------- + +KuduInsert::KuduInsert(const shared_ptr& table) + : KuduWriteOperation(table) { +} + +KuduInsert::~KuduInsert() {} + +// Update ----------------------------------------------------------------------- + +KuduUpdate::KuduUpdate(const shared_ptr& table) + : KuduWriteOperation(table) { +} + +KuduUpdate::~KuduUpdate() {} + +// Delete ----------------------------------------------------------------------- + +KuduDelete::KuduDelete(const shared_ptr& table) + : KuduWriteOperation(table) { +} + +KuduDelete::~KuduDelete() {} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/client/write_op.h b/src/kudu/client/write_op.h new file mode 100644 index 000000000000..4340eb395019 --- /dev/null +++ b/src/kudu/client/write_op.h @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CLIENT_WRITE_OP_H +#define KUDU_CLIENT_WRITE_OP_H + +#include + +#include "kudu/client/shared_ptr.h" +#include "kudu/common/partial_row.h" +#include "kudu/util/kudu_export.h" + +namespace kudu { + +class EncodedKey; + +namespace client { + +namespace internal { +class Batcher; +class WriteRpc; +} // namespace internal + +class KuduTable; + +// A write operation operates on a single table and partial row. +// The KuduWriteOperation class itself allows the batcher to get to the +// generic information that it needs to process all write operations. +// +// On its own, the class does not represent any specific change and thus cannot +// be constructed independently. +// +// KuduWriteOperation also holds shared ownership of its KuduTable to allow client's +// scope to end while the KuduWriteOperation is still alive. +class KUDU_EXPORT KuduWriteOperation { + public: + enum Type { + INSERT = 1, + UPDATE = 2, + DELETE = 3, + }; + virtual ~KuduWriteOperation(); + + // See KuduPartialRow API for field setters, etc. + const KuduPartialRow& row() const { return row_; } + KuduPartialRow* mutable_row() { return &row_; } + + virtual std::string ToString() const = 0; + protected: + explicit KuduWriteOperation(const sp::shared_ptr& table); + virtual Type type() const = 0; + + sp::shared_ptr const table_; + KuduPartialRow row_; + + private: + friend class internal::Batcher; + friend class internal::WriteRpc; + + // Create and encode the key for this write (key must be set) + // + // Caller takes ownership of the allocated memory. 
+ EncodedKey* CreateKey() const; + + const KuduTable* table() const { return table_.get(); } + + // Return the number of bytes required to buffer this operation, + // including direct and indirect data. + int64_t SizeInBuffer() const; + + DISALLOW_COPY_AND_ASSIGN(KuduWriteOperation); +}; + +// A single row insert to be sent to the cluster. +// Row operation is defined by what's in the PartialRow instance here. +// Use mutable_row() to change the row being inserted +// An insert requires all key columns from the table schema to be defined. +class KUDU_EXPORT KuduInsert : public KuduWriteOperation { + public: + virtual ~KuduInsert(); + + virtual std::string ToString() const OVERRIDE { return "INSERT " + row_.ToString(); } + + protected: + virtual Type type() const OVERRIDE { + return INSERT; + } + + private: + friend class KuduTable; + explicit KuduInsert(const sp::shared_ptr& table); +}; + + +// A single row update to be sent to the cluster. +// Row operation is defined by what's in the PartialRow instance here. +// Use mutable_row() to change the row being updated. +// An update requires the key columns and at least one other column +// in the schema to be defined. +class KUDU_EXPORT KuduUpdate : public KuduWriteOperation { + public: + virtual ~KuduUpdate(); + + virtual std::string ToString() const OVERRIDE { return "UPDATE " + row_.ToString(); } + + protected: + virtual Type type() const OVERRIDE { + return UPDATE; + } + + private: + friend class KuduTable; + explicit KuduUpdate(const sp::shared_ptr& table); +}; + + +// A single row delete to be sent to the cluster. +// Row operation is defined by what's in the PartialRow instance here. +// Use mutable_row() to change the row being deleted +// A delete requires just the key columns to be defined. 
+class KUDU_EXPORT KuduDelete : public KuduWriteOperation { + public: + virtual ~KuduDelete(); + + virtual std::string ToString() const OVERRIDE { return "DELETE " + row_.ToString(); } + + protected: + virtual Type type() const OVERRIDE { + return DELETE; + } + + private: + friend class KuduTable; + explicit KuduDelete(const sp::shared_ptr& table); +}; + +} // namespace client +} // namespace kudu + +#endif diff --git a/src/kudu/codegen/CMakeLists.txt b/src/kudu/codegen/CMakeLists.txt new file mode 100644 index 000000000000..f29a0d09d0d5 --- /dev/null +++ b/src/kudu/codegen/CMakeLists.txt @@ -0,0 +1,174 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +################################################################################ +# This target relies on special LLVM-configured cmake functions +# see http://llvm.org/docs/CMake.html#embedding-llvm-in-your-project +################################################################################ + +####################################### +# Configure LLVM-specific dependencies +####################################### + +set(LLVM_REQ_COMPONENTS + analysis + irreader + instrumentation + ipo + mcdisassembler + mcjit + native +) + +## Add preprocessor defs and include directories +include_directories(SYSTEM ${LLVM_INCLUDE_DIRS}) +add_definitions(${LLVM_DEFINITIONS}) + +# Workaround for a conflict between LLVM's Support/Valgrind.h file +# and our dynamic_annotations.h. Defining this prevents the LLVM +# header from getting included. +add_definitions(-DLLVM_SUPPORT_VALGRIND_H) + +## Get the required libraries to link to in llvm +llvm_map_components_to_libnames(llvm_LIBRARIES "${LLVM_REQ_COMPONENTS}") + +####################################### +# Precompiling to LLVM bytecode +####################################### + +## Create .ll file for precompiled functions (and their dependencies) +set(CLANG_EXEC ${THIRDPARTY_PREFIX}/bin/clang++) +set(IR_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/precompiled.cc) +set(IR_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/precompiled.ll) +set(IR_OUTPUT_CC ${IR_OUTPUT}.cc) + +# Retrieve all includes directories needed for precompilation +get_directory_property(IR_INCLUDES + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + INCLUDE_DIRECTORIES) +foreach(noprefix ${IR_INCLUDES}) + set(PREFIXED_IR_INCLUDES ${PREFIXED_IR_INCLUDES} -I${noprefix}) +endforeach() + +if (APPLE) + # OS X keeps the libc++ headers in a non-standard location that the thirdparty + # Clang does not know about by default. 
+ set(PREFIXED_IR_INCLUDES + ${PREFIXED_IR_INCLUDES} + -cxx-isystem "/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include/c++/v1") +endif() + +# Get preprocessing definitions, which enable directives for glog and gtest +get_directory_property(IR_PP_DEFINITIONS + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMPILE_DEFINITIONS) +foreach(noprefix ${IR_PP_DEFINITIONS}) + set(PREFIXED_IR_PP_DEFS ${PREFIXED_IR_PP_DEFS} -D${noprefix}) +endforeach() + +# Get flags related to actually compiling the source +set(IR_FLAGS + -S -emit-llvm + -DIR_BUILD + ${CMAKE_CXX_FLAGS} + ${PREFIXED_IR_PP_DEFS} + ${PREFIXED_IR_INCLUDES}) +separate_arguments(IR_FLAGS) + +# Avoid enabling ASAN in the precompiled IR. +# +# This avoids an issue when the main code is compiled with a different version +# of LLVM than we are using for JIT, and ASAN is enabled. In that case, +# the IR code will try to call __asan_init_v while our runtime code will +# only have defined __asan_init_v (where X != Y). +# +# Disabling -fsanitize-address will prevent the instrumentation, so it doesn't +# try to link against these symbols. +# +# NOTE: we leave "-DADDRESS_SANITIZER" because this enables ASAN annotations +# from dynamic_annotations.h. These annotations are just extern function +# declarations which will link fine against the ASAN in our executable, even +# if the JIT code is not instrumented. +list(REMOVE_ITEM IR_FLAGS "-fsanitize=address") + +# Disable TSAN in precompiled IR. +# +# Protobuf 2.6.1's atomicops-internals-tsan.h relies on +# , which is not provided by the LLVM version +# we use for codegen. +list(REMOVE_ITEM IR_FLAGS "-fsanitize=thread" "-DTHREAD_SANITIZER") + +# Remove any optimization flags from the generated IR. +# Optimizing during the precompilation limits the ability to optimize +# again at runtime. 
+list(REMOVE_ITEM IR_FLAGS "-O3" "-O2" "-O1" "-Os") + +# We need a library which depends on the IR source, because CMake+Ninja +# doesn't support IMPLICIT_DEPENDS in ADD_CUSTOM_COMMAND. +# +# Using a fake target like this allows us to pick up the dependencies +# of precompiled.ll, and then we make the IR generation depend on the fake +# target. We end up doing one extra compilation, but that's better than +# having stale IR. +# +# See: http://www.cmake.org/Bug/bug_relationship_graph.php?bug_id=13234 +add_library(ir_fake_target ${IR_SOURCE}) +# The IR uses protobufs from kudu_common, so we have to generate that code first. +target_link_libraries(ir_fake_target kudu_common_proto kudu_util ${KUDU_BASE_LIBS}) + +add_custom_command( + OUTPUT ${IR_OUTPUT} + COMMAND ${CLANG_EXEC} + ${IR_FLAGS} + ${IR_SOURCE} + -o ${IR_OUTPUT} + DEPENDS ir_fake_target) + +# Use 'xxd' to create a cc file containing the precompiled bitcode as a literal array. +# See http://stackoverflow.com/questions/4158900/embedding-resources-in-executable-using-gcc +add_custom_command( + OUTPUT ${IR_OUTPUT_CC} + COMMAND ${CMAKE_SOURCE_DIR}/build-support/generate_precompiled_xxd.sh ${IR_OUTPUT} ${IR_OUTPUT_CC} + DEPENDS ${IR_OUTPUT}) + +####################################### +# codegen +####################################### + +add_library(codegen + code_cache.cc + code_generator.cc + compilation_manager.cc + jit_wrapper.cc + module_builder.cc + row_projector.cc + ${IR_OUTPUT_CC}) + +target_link_libraries(codegen + ${llvm_LIBRARIES} + kudu_common + kudu_util + gutil + kudu_common_proto) + +####################################### +# Unit tests +####################################### + +set(KUDU_TEST_LINK_LIBS codegen ${KUDU_MIN_TEST_LIBS}) + +ADD_KUDU_TEST(codegen-test) diff --git a/src/kudu/codegen/README b/src/kudu/codegen/README new file mode 100644 index 000000000000..fb0a1e13f3db --- /dev/null +++ b/src/kudu/codegen/README @@ -0,0 +1,247 @@ + +Licensed under the Apache License, Version 2.0 (the 
"License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=============================================================================== +Code Generation Interface +=============================================================================== + +The codegen directory houses code which is compiled with LLVM code generation +utilities. The point of code generation is to have code that is generated at +run time which is optimized to run on data specific to usage that can only be +described at run time. For instance, code which projects rows during a scan +relies on the types of the data stored in each of the columns, but these are +only determined by a run time schema. To alleviate this issue, a row projector +can be compiled with schema-specific machine code to run on the current rows. + +Note the following classes, whose headers are LLVM-independent and thus intended +to be used by the rest of project without introducing additional dependencies: + +CompilationManager (compilation_manager.h) +RowProjector (row_projector.h) + +(Other classes also avoid LLVM headers, but they have little external use). + +CompilationManager +------------------ + +The compilation manager takes care of asynchronous compilation tasks. It +accepts requests to compile new objects. If the requested object is already +cached, then the compiled object is returned. Otherwise, the compilation request +is enqueued and eventually carried out. + +The manager can be accessed (and thus compiled code requests can be made) +by using the GetSingleton() method. 
Yes - there's a universal singleton for +compilation management. See the header for details. + +The manager allows for waiting for all current compilations to finish, and can +register its metrics (which include code cache performance) upon request. + +No cleanup is necessary for the CompilationManager. It registers a shutdown method +with the exit handler. + +Generated objects +----------------- + +* codegen::RowProjector - A row projector has the same interface as a +common::RowProjector, but supports a narrower scope of row types and arenas. +It does not allow its schema to be reset (indeed, that's the point of compiling +to a specific schema). The row projector's behavior is fully determined by +the base and projection schemas. As such, the compilation manager expects those +two items when retrieving a row projector. + +================================================================================ +Code Generation Implementation Details +================================================================================ + +Code generation works by creating what is essentially an assembly language +file for the desired object, then handing off that assembly to the LLVM +MCJIT compiler. The LLVM backend handles generating target-dependent machine +code. After code generation, the machine code, which is represented as a +shared object in memory, is dynamically linked to the invoking application +(i.e., this one), and the newly generated code becomes available. + +Overview of LLVM-interfacing classes +------------------------------------ + +Most of the interfacing with LLVM is handled by the CodeGenerator +(code_generator.h) and ModuleBuilder (module_builder.h) classes. The CodeGenerator +takes care of setting up static intializations that LLVM is dependent on and +provides an interface which wraps around various calls to LLVM compilation +functions. + +The ModuleBuilder takes care of the one-time construction of a module, which is +LLVM's unit of code. 
A module is its own namespace containing functions that +are compiled together. Currently, LLVM does not support having multiple +modules per execution engine so the code is coupled with an ExecutionEngine +instance which owns the generated code behind the scenes (the ExecutionEngine is +the LLVM class responsible for actual compilation and running of the dynamically +linked code). Note throughout the directory the execution engine is referred to +(actually typedef-ed as) a JITCodeOwner, because to every single class except +the ModuleBuilder that is all the execution engine is good for. Once the +destructor to a JITCodeOwner object is called, the associated data is deleted. + +In turn, the ModuleBuilder provides a minimal interface to code-generating +classes (classes that accept data specific to a certain request and create the +LLVM IR - the assembly that was mentioned earlier - that is appropriate for +the specific data). The classes fill up the module with the desired assembly. + +Sequence of operation +--------------------- + +The parts come together as follows (in the case that the code cache is empty). + +1. External component requests some compiled object for certain runtime- +dependent data (e.g. a row projector for a base and projection schemas). +2. The CompilationManager accepts the request, but finds no such object +is cached. +3. The CompilationManager enqueues a request to compile said object to its +own threadpool, and responds with failure to the external component. +4. Eventually, a thread becomes available to take on the compilation task. The +task is dequeued and the CodeGenerator's compilation method for the request is +called. +5. The code generator checks that code generation is enabled, and makes a call +to the appropriate code-generating classes. +6. The classes rely on the ModuleBuilder to compile their code, after which +they return pointers to the requested functions. 
+ +Code-generating classes +----------------------- + +As mentioned in steps (5) and (6), the code-generating classes are responsible +for generating the LLVM IR which is compiled at run time for whatever specific +requests the external components have. + +The "code-generating classes" implement the JITWrapper (jit_wrapper.h) interface. +The base class requires an owning reference to a JITCodeOwner, intended to be the +owner of the JIT-compiled code that the JITWrapper derived class refers to. + +On top of containing the JITCodeOwner and pointers to JIT-compiled functions, +the JITWrapper also provides methods which enable code caching. Caching compiled +code is essential because compilation times are prohibitively slow, so satisfying +any single request with freshly compiled code is not an option. As such, each +piece of compiled code should be associated with some run time determined data. + +In the case of a row projector, this data is a pair of schemas, for the base +and the projection. In order to work for arbitrary types (so we do not need +multiple code caches for each different compiled object), the JITWrapper +implementation must be able to provide a byte string key encoding of its +associated data. This provides the key for the aforementioned cache. Similarly, +there should be a static method which allows encoding such a key without +generating a new instance (every time there is a request made to the manager, +the manager needs to generate the byte string key to look it up in the cache). + +For instance, the JITWrapper for RowProjector code, RowProjectorFunctions, has +the following method: + +static Status EncodeKey(const Schema& base, const Schema& proj, + faststring* out); + +For any given input (pair of schemas), the JITWrapper generates a unique key +so that the cache can be looked up for the generated row projector in later +requests (the manager handles the cache lookups). 
+ +In order to keep one homogeneous cache of all the generated code, the keys +need to be unique across classes, which is difficult to maintain because the +encodings could conflict by accident. For this reason, a type identifier should +be prefixed to the beginning of every key. This identifier is an enum, with +values for each JITWrapper derived type, thus guaranteeing uniqueness between +classes. + +Guide to creating new codegenned classes +---------------------------------------- + +To add new classes with code generation, one needs to generate the appropriate +JITWrapper and update the higher-level classes. + +First, the inputs to code generation need to be established (henceforth referred +to as just "inputs"). + +1. Making a new JITWrapper + +A new JITWrapper should derive from the JITWrapper class and expose a static +key-generation method which returns a key given the inputs for the class. To +satisfy the prefix condition, a new enum value must be added in +JITWrapper::JITWrapperType. + +The JITWrapper derived class should have a creation method that generates +a shared reference to an instance of itself. The JITWrappers should only +be handled through shared references because this ensures that the code owner +within the class is kept alive exactly as long as references to code pointing with +it exist (the derived class is the only class that should contain members which +are pointers to the desired compiled functions for the given input). + +The actual creation of the compiled code is perhaps the hardest part. See the +section below. + +2. Updating top-level classes + +On top of adding the new enum value in the JITWrapper enumeration, several other +top-level classes should provide the interfaces necessary to use the new +codegen class (the layer of interface classes enables separate components +of kudu to be independent of LLVM headers). 
+ +In the CodeGenerator, there should be a Compile...(inputs) function which +creates a scoped_refptr to the derived JITWrapper class by invoking the +class' creation method. Note that the CodeGenerator should also print +the appropriate LLVM disassembly if the flag is activated. + +The compilation manager should likewise offer a Request...(inputs) function +that returns the requested compiled functions by looking up the cache for the +inputs by generating a key with the static encoding method mentioned above. If the +cache lookup fails, the manager should submit a new compilation request. The +cache hit metrics should be incremented appropriately. + +Guide to code generation +------------------------ + +The resources at the bottom of this document provide a good reference for +LLVM IR. However, there should be little need to use much LLVM IR because the +majority of the LLVM code can be precompiled. + +If you wish to execute certain functions A, B, or C based on the input data which +takes on values 1, 2, or 3, then do the following: + +1. Write A, B, and C in an extern "C" namespace (to avoid name mangling) in +codegen/precompiled.cc. +2. When creating your derived JITWrapper class, create a ModuleBuilder. The +builder should load your functions A, B, and C automatically. +3. Create an LLVM IR function dependent on the inputs. I.e., if the input +for code generation is 1, then the desired function would be A. In that case, +request the module builder for a function called "A". The builder, when compiled, +will offer a pointer to the compiled function. + +Note in the above example the only utility of code generation is avoiding +a couple of branches which decide on A, B, or C based on input data 1, 2, or 3. + +Code generation gets much more mileage from constant propagation. 
To utilize this, +one needs to generate a new function in LLVM IR at run time which passes +arguments to the precompiled functions, with hopefully some relevant constants +based on the input data. When LLVM compiles the module, it will propagate those +constants, creating more efficient machine code. + +To create a function in a module at run time, you need to use a +ModuleBuilder::LLVMBuilder. The builder emits LLVM IR dynamically. It is an +alias for the llvm::IRBuilder<> class, whose API is available in the links at +the bottom of this document. A worked example is available in row_projector.cc. + +Useful resources +---------------- +http://llvm.org/docs/doxygen/html/index.html +http://llvm.org/docs/tutorial/ +http://llvm.org/docs/LangRef.html + +Debugging +--------- + +Debug info is available by printing the generated code. See the flags declared +in code_generator.cc for further details. diff --git a/src/kudu/codegen/code_cache.cc b/src/kudu/codegen/code_cache.cc new file mode 100644 index 000000000000..91727173671f --- /dev/null +++ b/src/kudu/codegen/code_cache.cc @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/codegen/code_cache.h" + +#include "kudu/codegen/jit_wrapper.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/cache.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" + +namespace kudu { +namespace codegen { + +namespace { + +class Deleter : public CacheDeleter { + public: + Deleter() {} + virtual void Delete(const Slice& key, void* value) OVERRIDE { + // The Cache from cache.h deletes the memory that it allocates for its + // own copy of key, but it expects its users to delete their own + // void* values. To delete, we just release our shared ownership. + static_cast(value)->Release(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(Deleter); +}; + +} // anonymous namespace + +CodeCache::CodeCache(size_t capacity) + : cache_(NewLRUCache(DRAM_CACHE, capacity, "code_cache")) { + deleter_.reset(new Deleter()); +} + +CodeCache::~CodeCache() {} + +Status CodeCache::AddEntry(const scoped_refptr& value) { + // Get the key + faststring key; + RETURN_NOT_OK(value->EncodeOwnKey(&key)); + + // Because Cache only accepts void* values, we store just the JITWrapper* + // and increase its ref count. + value->AddRef(); + + // Insert into cache and release the handle (we have a local copy of a refptr). + // We CHECK_NOTNULL because this is always a DRAM-based cache, and if allocation + // failed, we'd just crash the process. + Cache::Handle* inserted = CHECK_NOTNULL(cache_->Insert(key, value.get(), 1, deleter_.get())); + cache_->Release(inserted); + return Status::OK(); +} + +scoped_refptr CodeCache::Lookup(const Slice& key) { + // Look up in Cache after generating key, returning NULL if not found. 
+ Cache::Handle* found = cache_->Lookup(key, Cache::EXPECT_IN_CACHE); + if (!found) return scoped_refptr(); + + // Retrieve the value + scoped_refptr value = + static_cast(cache_->Value(found)); + + // No need to hold on to handle after we have our copy + cache_->Release(found); + + return value; +} + +} // namespace codegen +} // namespace kudu diff --git a/src/kudu/codegen/code_cache.h b/src/kudu/codegen/code_cache.h new file mode 100644 index 000000000000..ef21463e3b2d --- /dev/null +++ b/src/kudu/codegen/code_cache.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CODEGEN_CODE_CACHE_H +#define KUDU_CODEGEN_CODE_CACHE_H + +#include "kudu/codegen/row_projector.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/faststring.h" + +namespace kudu { + +class Cache; +class CacheDeleter; +class Schema; + +namespace codegen { + +class JITWrapper; + +// A code cache is a specialized LRU cache with the following services: +// 1. It supports only one writer at a time, but multiple concurrent +// readers. +// 2. If its items are taking too much space, it evicts the least- +// recently-used member of the cache. 
+// +// The cache takes shared ownership of its entry values, the JITWrappers, +// by incrementing their reference count. +// +// LRU eviction does not guarantee that a JITWrapper is deleted, only that +// the cache releases its shared ownership (by decrementing the reference +// count) of the jit code. +class CodeCache { + public: + // TODO: currently CodeCache is implemented using the Cache in + // kudu/util/cache.h, which requires some transformation to nongeneric + // Slice-type keys, and void* values. Furthermore, the Cache implementation + // provides concurrent write guarantees (thus relies on locks heavily), which + // is unnecessary for the CodeCache. A potential improvement would be to + // implement a single-writer multi-reader LRU cache with proper generics. + + // TODO: a potential improvment would be for the cache to monitor its memory + // consumption explicity and keep its usage under a size limit specified at + // construction time. In order to do this, the cache would have to inject + // a custom memory manager into the CodeGenerator's execution engine which + // intercepts allocation calls and tracks code size. + + // Generates an empty code cache which stores at most 'capacity' JITWrappers. + // A JIT payload is defined to be the combination of objects which rely on jitted + // code and the classes which own the jitted code. + explicit CodeCache(size_t capacity); + ~CodeCache(); + + // This function is NOT thread safe (only one writer may call this at + // a time). Attempts to add a new entry 'wrapper' to the cache, using + // wrapper->EncodeOwnKey() as the key. Overwrites the previous value + // if one exists. If insertion results in excess capacity, LRU eviction + // occurs. Returns Status::OK() upon success. + Status AddEntry(const scoped_refptr& wrapper); + + // This function may be called from any thread concurrently with other + // writes and reads to the cache. Looks in the cache for the specified key. 
+ // Returns a reference to the associated payload, or NULL if no such entry + // exists in the cache. + scoped_refptr Lookup(const Slice& key); + + private: + + gscoped_ptr deleter_; + gscoped_ptr cache_; + + DISALLOW_COPY_AND_ASSIGN(CodeCache); +}; + +} // namespace codegen +} // namespace kudu + +#endif diff --git a/src/kudu/codegen/code_generator.cc b/src/kudu/codegen/code_generator.cc new file mode 100644 index 000000000000..923752a554bc --- /dev/null +++ b/src/kudu/codegen/code_generator.cc @@ -0,0 +1,233 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/codegen/code_generator.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/codegen/jit_wrapper.h" +#include "kudu/codegen/module_builder.h" +#include "kudu/codegen/row_projector.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/once.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/status.h" + +DEFINE_bool(codegen_dump_functions, false, "Whether to print the LLVM IR" + " for generated functions"); +TAG_FLAG(codegen_dump_functions, experimental); +TAG_FLAG(codegen_dump_functions, runtime); +DEFINE_bool(codegen_dump_mc, false, "Whether to dump the disassembly of the" + " machine code for generated functions."); +TAG_FLAG(codegen_dump_mc, experimental); +TAG_FLAG(codegen_dump_mc, runtime); + +namespace llvm { +class MCAsmInfo; +class MCInstrInfo; +class MCRegisterInfo; +} // namespace llvm + +using llvm::ArrayRef; +using llvm::ExecutionEngine; +using llvm::MCAsmInfo; +using llvm::MCContext; +using llvm::MCDisassembler; +using llvm::MCInst; +using llvm::MCInstPrinter; +using llvm::MCInstrInfo; +using llvm::MCRegisterInfo; +using llvm::MCSubtargetInfo; +using llvm::Module; +using llvm::raw_os_ostream; +using llvm::StringRef; +using llvm::Target; +using llvm::TargetMachine; +using llvm::Triple; +using std::string; + +namespace kudu { + +class Schema; + +namespace codegen { + +namespace { + +// Returns Status::OK() if codegen is not disabled and an error status indicating +// that codegen has been disabled otherwise. 
+Status CheckCodegenEnabled() { +#ifdef KUDU_DISABLE_CODEGEN + return Status::NotSupported("Code generation has been disabled at compile time."); +#else + return Status::OK(); +#endif +} + +const uint8_t* ptr_from_i64(uint64_t addr) { + COMPILE_ASSERT(sizeof(uint64_t) <= sizeof(uintptr_t), cannot_represent_address_as_pointer); + uintptr_t iptr = addr; + return reinterpret_cast(iptr); +} + +template +uint64_t i64_from_ptr(FuncPtr ptr) { + COMPILE_ASSERT(sizeof(uintptr_t) <= sizeof(uint64_t), + cannot_represent_pointer_as_address); + // This cast is undefined prior to C++11 and only optionally supported even + // with it. However, we must use this because of the LLVM interface. + uintptr_t iptr = reinterpret_cast(reinterpret_cast(ptr)); + return iptr; +} + +// Prints assembly for a function pointed to by 'fptr' given a target +// machine 'tm'. Method is more or less platform-independent, but relies +// on the return instruction containing the "RET" string to terminate in +// the right place. Prints at most 'max_instr' instructions. +// +// Returns number of lines printed. +template +int DumpAsm(FuncPtr fptr, const TargetMachine& tm, std::ostream* out, int max_instr) { + uint64_t base_addr = i64_from_ptr(fptr); + + const MCInstrInfo& instr_info = *CHECK_NOTNULL(tm.getMCInstrInfo()); + const MCRegisterInfo* register_info = CHECK_NOTNULL(tm.getMCRegisterInfo()); + const MCAsmInfo* asm_info = CHECK_NOTNULL(tm.getMCAsmInfo()); + const MCSubtargetInfo subtarget_info = *CHECK_NOTNULL(tm.getMCSubtargetInfo()); + const Triple& triple = tm.getTargetTriple(); + + MCContext context(asm_info, register_info, nullptr); + + gscoped_ptr disas( + CHECK_NOTNULL(tm.getTarget().createMCDisassembler(subtarget_info, context))); + + // LLVM uses these completely undocumented magic syntax constants which had + // to be found in lib/Target/$ARCH/MCTargetDesc/$(ARCH)TargetDesc.cpp. 
+ // Apparently this controls stuff like AT&T vs Intel syntax for x86, but + // there aren't always multiple values to choose from on different architectures. + // It seems that there's an unspoken rule to implement SyntaxVariant = 0. + // This only has meaning for a *given* target, but at least the 0th syntax + // will always be defined, so that's what we use. + static const unsigned kSyntaxVariant = 0; + gscoped_ptr printer( + CHECK_NOTNULL(tm.getTarget().createMCInstPrinter(triple, kSyntaxVariant, *asm_info, + instr_info, *register_info))); + + // Make a memory object referring to the bytes with addresses ranging from + // base_addr to base_addr + (maximum number of bytes instructions take). + const size_t kInstrSizeMax = 16; // max on x86 is 15 bytes + ArrayRef mem_obj(ptr_from_i64(base_addr), max_instr * kInstrSizeMax); + uint64_t addr = 0; + + for (int i = 0; i < max_instr; ++i) { + raw_os_ostream os(*out); + MCInst inst; + uint64_t size; + MCDisassembler::DecodeStatus stat = + disas->getInstruction(inst, size, mem_obj.slice(addr), addr, llvm::nulls(), llvm::nulls()); + if (stat != MCDisassembler::Success) { + *out << "\n" << std::dec; + } else { + string annotations; + printer->printInst(&inst, os, annotations, subtarget_info); + os << " " << annotations << "\n"; + // We need to check the opcode name for "RET" instead of comparing + // the opcode to llvm::ReturnInst::getOpcode() because the native + // opcode may be different, there may different types of returns, etc. + // TODO: this may fail if there are multiple 'ret' instructions in one + // function (in separate branches). In order to avoid this problem, + // we need to offer the execution engine a custom memory manager + // which tracks the exact sizes of the desired emitted functions. 
+ // In order to make a custom memory manager, we require enabling + // LLVM RTTI, since subclassing an LLVM interface would require + // identical RTTI settings between LLVM and Kudu (see: + // http://llvm.org/docs/Packaging.html#c-features). + string opname = printer->getOpcodeName(inst.getOpcode()); + std::transform(opname.begin(), opname.end(), opname.begin(), ::toupper); + if (opname.find("RET") != string::npos) return i + 1; + } + addr += size; + } + + return max_instr; +} + +} // anonymous namespace + +void CodeGenerator::GlobalInit() { + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + llvm::InitializeNativeTargetAsmParser(); + llvm::InitializeNativeTargetDisassembler(); + // TODO would be nice to just initialize the TargetMachine here, just once, + // instead of constantly retrieving it from the codegen classes' expired + // ModuleBuilders. +} + +CodeGenerator::CodeGenerator() { + static GoogleOnceType once = GOOGLE_ONCE_INIT; + GoogleOnceInit(&once, &CodeGenerator::GlobalInit); +} + +CodeGenerator::~CodeGenerator() {} + + +Status CodeGenerator::CompileRowProjector(const Schema& base, const Schema& proj, + scoped_refptr* out) { + RETURN_NOT_OK(CheckCodegenEnabled()); + + TargetMachine* tm; + RETURN_NOT_OK(RowProjectorFunctions::Create(base, proj, out, &tm)); + + if (FLAGS_codegen_dump_mc) { + static const int kInstrMax = 500; + std::stringstream sstr; + sstr << "Printing read projection function:\n"; + int instrs = DumpAsm((*out)->read(), *tm, &sstr, kInstrMax); + sstr << "Printed " << instrs << " instructions."; + LOG(INFO) << sstr.str(); + } + + return Status::OK(); +} + +} // namespace codegen +} // namespace kudu diff --git a/src/kudu/codegen/code_generator.h b/src/kudu/codegen/code_generator.h new file mode 100644 index 000000000000..d8e1a2c6f258 --- /dev/null +++ b/src/kudu/codegen/code_generator.h @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CODEGEN_CODE_GENERATOR_H +#define KUDU_CODEGEN_CODE_GENERATOR_H + +#include "kudu/codegen/row_projector.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" + +namespace llvm { +class LLVMContext; +} // namespace llvm + +namespace kudu { + +class Schema; + +namespace codegen { + +class RowProjectorFunctions; + +// CodeGenerator is a top-level class that manages a per-module +// LLVM context, ExecutionEngine initialization, native target loading, +// and memory management. +// +// This generator is intended for JIT compilation of functions that +// are generated at runtime. These functions can make calls to pre-compiled +// C++ functions, which must be loaded from their *.ll files. +// +// Since the CodeGenerator is the owner of most of the LLVM compilation +// mechanisms (which in turn own most of the LLVM generated code), it also +// functions as a factory for the classes that use LLVM constructs dependent +// on the CodeGenerator's information. +// +// This class is thread-safe. +// +// The execution engine has a global lock for compilations. When a function +// is compiling, other threads will be blocked on their own compilations or +// runs. 
Because of this, a CodeGenerator should be assigned to a single +// compilation thread (See CompilationManager class). Threads may run +// codegen'd functions concurrently. +// +// Code generation may be disabled globally at compile time by defining +// the preprocessor macro KUDU_DISABLE_CODEGEN. +class CodeGenerator { + public: + // The constructor makes all the appropriate static LLVM initialization + // calls exactly once. + CodeGenerator(); + ~CodeGenerator(); + + // Attempts to initialize row projector functions by compiling code + // for the parameter schemas. Writes to 'out' upon success. + Status CompileRowProjector(const Schema& base, const Schema& proj, + scoped_refptr* out); + + private: + static void GlobalInit(); + + // TODO static ObjectCache shared b/w engines + + DISALLOW_COPY_AND_ASSIGN(CodeGenerator); +}; + +} // namespace codegen +} // namespace kudu + +#endif diff --git a/src/kudu/codegen/codegen-test.cc b/src/kudu/codegen/codegen-test.cc new file mode 100644 index 000000000000..71f255a6218a --- /dev/null +++ b/src/kudu/codegen/codegen-test.cc @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include + +#include "kudu/codegen/code_generator.h" +#include "kudu/codegen/row_projector.h" +#include "kudu/common/row.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/logging_test_util.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::vector; + +DECLARE_bool(codegen_dump_mc); + +namespace kudu { + +typedef RowProjector NoCodegenRP; +typedef codegen::RowProjector CodegenRP; + +class CodegenTest : public KuduTest { + public: + CodegenTest() + : random_(SeedRandom()), + // Set the arena size as small as possible to catch errors during relocation, + // for its initial size and its eventual max size. + projections_arena_(16, kIndirectPerProjection * 2) { + // Create the base schema. + vector cols = { ColumnSchema("key ", UINT64, false), + ColumnSchema("int32 ", INT32, false), + ColumnSchema("int32-null-val", INT32, true), + ColumnSchema("int32-null ", INT32, true), + ColumnSchema("str32 ", STRING, false), + ColumnSchema("str32-null-val", STRING, true), + ColumnSchema("str32-null ", STRING, true) }; + base_.Reset(cols, 1); + base_ = SchemaBuilder(base_).Build(); // add IDs + + // Create an extended default schema + cols.push_back(ColumnSchema("int32-R ", INT32, false, kI32R, nullptr)); + cols.push_back(ColumnSchema("int32-RW", INT32, false, kI32R, kI32W)); + cols.push_back(ColumnSchema("str32-R ", STRING, false, kStrR, nullptr)); + cols.push_back(ColumnSchema("str32-RW", STRING, false, kStrR, kStrW)); + defaults_.Reset(cols, 1); + defaults_ = SchemaBuilder(defaults_).Build(); // add IDs + + test_rows_arena_.reset(new Arena(2 * 1024, 1024 * 1024)); + RowBuilder rb(base_); + for (int i = 0; i < kNumTestRows; ++i) { + rb.AddUint64(i); + rb.AddInt32(random_.Next32()); + rb.AddInt32(random_.Next32()); + 
rb.AddNull(); + AddRandomString(&rb); + AddRandomString(&rb); + rb.AddNull(); + + void* arena_data = test_rows_arena_->AllocateBytes( + ContiguousRowHelper::row_size(base_)); + ContiguousRow dst(&base_, static_cast(arena_data)); + CHECK_OK(CopyRow(rb.row(), &dst, test_rows_arena_.get())); + test_rows_[i].reset(new ConstContiguousRow(dst)); + rb.Reset(); + } + } + + protected: + Schema base_; + Schema defaults_; + + // Compares the projection-for-read and projection-for-write results + // of the codegen projection and the non-codegen projection + template + void TestProjection(const Schema* proj); + // Generates a new row projector for the given projection schema. + Status Generate(const Schema* proj, gscoped_ptr* out); + + enum { + // Base schema column indices + kKeyCol, + kI32Col, + kI32NullValCol, + kI32NullCol, + kStrCol, + kStrNullValCol, + kStrNullCol, + // Extended default projection schema column indices + kI32RCol, + kI32RWCol, + kStrRCol, + kStrRWCol + }; + + Status CreatePartialSchema(const vector& col_indexes, + Schema* out); + + private: + // Projects the test rows into parameter rowblock using projector and + // member projections_arena_ (should be Reset() manually). 
+ template + void ProjectTestRows(RowProjectorType* rp, RowBlock* rb); + void AddRandomString(RowBuilder* rb); + + static const int kRandomStringMaxLength = 32; + static const int kNumTestRows = 10; + static const size_t kIndirectPerRow = 4 * kRandomStringMaxLength; + static const size_t kIndirectPerProjection = kIndirectPerRow * kNumTestRows; + typedef const void* DefaultValueType; + static const DefaultValueType kI32R, kI32W, kStrR, kStrW; + + codegen::CodeGenerator generator_; + Random random_; + gscoped_ptr test_rows_[kNumTestRows]; + Arena projections_arena_; + gscoped_ptr test_rows_arena_; +}; + +namespace { + +const int32_t kI32RValue = 0xFFFF0000; +const int32_t kI32WValue = 0x0000FFFF; +const Slice kStrRValue = "RRRRR STRING DEFAULT READ"; +const Slice kStrWValue = "WWWWW STRING DEFAULT WRITE"; + +// Assumes all rows are selected +// Also assumes schemas are the same. +void CheckRowBlocksEqual(const RowBlock* rb1, const RowBlock* rb2, + const string& name1, const string& name2) { + CHECK_EQ(rb1->nrows(), rb2->nrows()); + const Schema& schema = rb1->schema(); + for (int i = 0; i < rb1->nrows(); ++i) { + RowBlockRow row1 = rb1->row(i); + RowBlockRow row2 = rb2->row(i); + CHECK_EQ(schema.Compare(row1, row2), 0) + << "Rows unequal (failed at row " << i << "):\n" + << "\t(" << name1 << ") = " << schema.DebugRow(row1) << "\n" + << "\t(" << name2 << ") = " << schema.DebugRow(row2); + } +} + +} // anonymous namespace + +const CodegenTest::DefaultValueType CodegenTest::kI32R = &kI32RValue; +const CodegenTest::DefaultValueType CodegenTest::kI32W = &kI32WValue; +const CodegenTest::DefaultValueType CodegenTest::kStrR = &kStrRValue; +const CodegenTest::DefaultValueType CodegenTest::kStrW = &kStrWValue; + +void CodegenTest::AddRandomString(RowBuilder* rb) { + static char buf[kRandomStringMaxLength]; + int size = random_.Uniform(kRandomStringMaxLength); + RandomString(buf, size, &random_); + rb->AddString(Slice(buf, size)); +} + +template +void 
CodegenTest::ProjectTestRows(RowProjectorType* rp, RowBlock* rb) { + // Even though we can test two rows at a time, without using up the + // extra memory for keeping an entire row block around, this tests + // what the actual use case will be. + for (int i = 0; i < kNumTestRows; ++i) { + ConstContiguousRow src = *test_rows_[i]; + RowBlockRow dst = rb->row(i); + if (READ) { + CHECK_OK(rp->ProjectRowForRead(src, &dst, &projections_arena_)); + } else { + CHECK_OK(rp->ProjectRowForWrite(src, &dst, &projections_arena_)); + } + } +} + +template +void CodegenTest::TestProjection(const Schema* proj) { + gscoped_ptr with; + ASSERT_OK(Generate(proj, &with)); + NoCodegenRP without(&base_, proj); + ASSERT_OK(without.Init()); + + CHECK_EQ(with->base_schema(), &base_); + CHECK_EQ(with->projection(), proj); + + RowBlock rb_with(*proj, kNumTestRows, &projections_arena_); + RowBlock rb_without(*proj, kNumTestRows, &projections_arena_); + + projections_arena_.Reset(); + ProjectTestRows(with.get(), &rb_with); + ProjectTestRows(&without, &rb_without); + CheckRowBlocksEqual(&rb_with, &rb_without, "Codegen", "Expected"); +} + +Status CodegenTest::Generate(const Schema* proj, gscoped_ptr* out) { + scoped_refptr functions; + RETURN_NOT_OK(generator_.CompileRowProjector(base_, *proj, &functions)); + out->reset(new CodegenRP(&base_, proj, functions)); + return Status::OK(); +} + +Status CodegenTest::CreatePartialSchema(const vector& col_indexes, + Schema* out) { + vector col_ids; + for (size_t col_idx : col_indexes) { + col_ids.push_back(defaults_.column_id(col_idx)); + } + return defaults_.CreateProjectionByIdsIgnoreMissing(col_ids, out); +} + +TEST_F(CodegenTest, ObservablesTest) { + // Test when not identity + Schema proj = base_.CreateKeyProjection(); + gscoped_ptr with; + CHECK_OK(Generate(&proj, &with)); + NoCodegenRP without(&base_, &proj); + ASSERT_OK(without.Init()); + ASSERT_EQ(with->base_schema(), without.base_schema()); + ASSERT_EQ(with->projection(), without.projection()); + 
ASSERT_EQ(with->is_identity(), without.is_identity()); + ASSERT_FALSE(with->is_identity()); + + // Test when identity + Schema iproj = *&base_; + gscoped_ptr iwith; + CHECK_OK(Generate(&iproj, &iwith)) + NoCodegenRP iwithout(&base_, &iproj); + ASSERT_OK(iwithout.Init()); + ASSERT_EQ(iwith->base_schema(), iwithout.base_schema()); + ASSERT_EQ(iwith->projection(), iwithout.projection()); + ASSERT_EQ(iwith->is_identity(), iwithout.is_identity()); + ASSERT_TRUE(iwith->is_identity()); +} +// Test key projection +TEST_F(CodegenTest, TestKey) { + Schema key = base_.CreateKeyProjection(); + TestProjection(&key); + TestProjection(&key); +} + +// Test int projection +TEST_F(CodegenTest, TestInts) { + Schema ints; + vector part_cols = { kI32Col, kI32NullValCol, kI32NullCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &ints)); + + TestProjection(&ints); + TestProjection(&ints); +} + +// Test string projection +TEST_F(CodegenTest, TestStrings) { + Schema strs; + vector part_cols = { kStrCol, kStrNullValCol, kStrNullCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &strs)); + + TestProjection(&strs); + TestProjection(&strs); +} + +// Tests the projection of every non-nullable column +TEST_F(CodegenTest, TestNonNullables) { + Schema non_null; + vector part_cols = { kKeyCol, kI32Col, kStrCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &non_null)); + + TestProjection(&non_null); + TestProjection(&non_null); +} + +// Tests the projection of every nullable column +TEST_F(CodegenTest, TestNullables) { + Schema nullables; + vector part_cols = { kI32NullValCol, kI32NullCol, kStrNullValCol, kStrNullCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &nullables)); + + TestProjection(&nullables); + TestProjection(&nullables); +} + +// Test full schema projection +TEST_F(CodegenTest, TestFullSchema) { + TestProjection(&base_); + TestProjection(&base_); +} + +// Tests just the default projection +TEST_F(CodegenTest, TestDefaultsOnly) { + Schema pure_defaults; + + // Default read projections 
+ vector part_cols = { kI32RCol, kI32RWCol, kStrRCol, kStrRWCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &pure_defaults)); + + TestProjection(&pure_defaults); + + // Default write projections + part_cols = { kI32RWCol, kStrRWCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &pure_defaults)); + + TestProjection(&pure_defaults); +} + +// Test full defaults projection +TEST_F(CodegenTest, TestFullSchemaWithDefaults) { + TestProjection(&defaults_); + + // Default write projection + Schema full_write; + vector part_cols = { kKeyCol, + kI32Col, + kI32NullValCol, + kI32NullCol, + kStrCol, + kStrNullValCol, + kStrNullCol, + kI32RWCol, + kStrRWCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &full_write)); + + TestProjection(&full_write); +} + +// Test the codegen_dump_mc flag works properly. +TEST_F(CodegenTest, TestDumpMC) { + FLAGS_codegen_dump_mc = true; + + StringVectorSink sink; + ScopedRegisterSink srs(&sink); + + Schema ints; + vector part_cols = { kI32Col, kI32NullValCol, kI32NullCol }; + ASSERT_OK(CreatePartialSchema(part_cols, &ints)); + TestProjection(&ints); + + const vector& msgs = sink.logged_msgs(); + ASSERT_EQ(msgs.size(), 1); + EXPECT_THAT(msgs[0], testing::ContainsRegex("retq")); +} + +} // namespace kudu diff --git a/src/kudu/codegen/compilation_manager.cc b/src/kudu/codegen/compilation_manager.cc new file mode 100644 index 000000000000..f70415279ce0 --- /dev/null +++ b/src/kudu/codegen/compilation_manager.cc @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/codegen/compilation_manager.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/codegen/code_cache.h" +#include "kudu/codegen/code_generator.h" +#include "kudu/codegen/jit_wrapper.h" +#include "kudu/codegen/row_projector.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/faststring.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/threadpool.h" + +using std::shared_ptr; + +DEFINE_bool(codegen_time_compilation, false, "Whether to print time that each code " + "generation request took."); +TAG_FLAG(codegen_time_compilation, experimental); +TAG_FLAG(codegen_time_compilation, runtime); + +METRIC_DEFINE_gauge_int64(server, code_cache_hits, "Codegen Cache Hits", + kudu::MetricUnit::kCacheHits, + "Number of codegen cache hits since start", + kudu::EXPOSE_AS_COUNTER); +METRIC_DEFINE_gauge_int64(server, code_cache_queries, "Codegen Cache Queries", + kudu::MetricUnit::kCacheQueries, + "Number of codegen cache queries (hits + misses) " + "since start", + kudu::EXPOSE_AS_COUNTER); +namespace kudu { +namespace codegen { + +namespace { + +// A CompilationTask is a ThreadPool's Runnable which, given a +// pair of schemas and a cache to refer to, will generate code pertaining +// to the two schemas and store it 
in the cache when run. +class CompilationTask : public Runnable { + public: + // Requires that the cache and generator are valid for the lifetime + // of this object. + CompilationTask(const Schema& base, const Schema& proj, CodeCache* cache, + CodeGenerator* generator) + : base_(base), + proj_(proj), + cache_(cache), + generator_(generator) {} + + // Can only be run once. + void Run() override { + // We need to fail softly because the user could have just given + // a malformed projection schema pair, but could be long gone by + // now so there's nowhere to return the status to. + WARN_NOT_OK(RunWithStatus(), + "Failed compilation of row projector from base schema " + + base_.ToString() + " to projection schema " + + proj_.ToString()); + } + + private: + Status RunWithStatus() { + faststring key; + RETURN_NOT_OK(RowProjectorFunctions::EncodeKey(base_, proj_, &key)); + + // Check again to make sure we didn't compile it already. + // This can occur if we request the same schema pair while the + // first one's compiling. 
+ if (cache_->Lookup(key)) return Status::OK(); + + scoped_refptr functions; + LOG_TIMING_IF(INFO, FLAGS_codegen_time_compilation, "code-generating row projector") { + RETURN_NOT_OK(generator_->CompileRowProjector(base_, proj_, &functions)); + } + + RETURN_NOT_OK(cache_->AddEntry(functions)); + return Status::OK(); + } + + Schema base_; + Schema proj_; + CodeCache* const cache_; + CodeGenerator* const generator_; + + DISALLOW_COPY_AND_ASSIGN(CompilationTask); +}; + +} // anonymous namespace + +CompilationManager::CompilationManager() + : cache_(kDefaultCacheCapacity), + hit_counter_(0), + query_counter_(0) { + CHECK_OK(ThreadPoolBuilder("compiler_manager_pool") + .set_min_threads(0) + .set_max_threads(1) + .set_idle_timeout(MonoDelta::FromMilliseconds(kThreadTimeoutMs)) + .Build(&pool_)); + // We call std::atexit after the implicit default construction of + // generator_ to ensure static LLVM constants would not have been destructed + // when the registered function is called (since this object is a singleton, + // atexit will only be called once). + CHECK(std::atexit(&CompilationManager::Shutdown) == 0) + << "Compilation manager shutdown must be registered successfully with " + << "std::atexit to be used."; +} + +CompilationManager::~CompilationManager() {} + +void CompilationManager::Wait() { + pool_->Wait(); +} + +void CompilationManager::Shutdown() { + GetSingleton()->pool_->Shutdown(); +} + +Status CompilationManager::StartInstrumentation(const scoped_refptr& metric_entity) { + // Even though these function as counters, we use gauges instead, because + // this is a singleton that is shared across multiple TS instances in a + // minicluster setup. If we were to use counters, then we could not properly + // register the same metric in multiple registries. Using a gauge which loads + // an atomic int is a suitable workaround: each TS's registry ends up with a + // unique gauge which reads the value of the singleton's integer. 
+ Callback hits = Bind(&AtomicInt::Load, + Unretained(&hit_counter_), + kMemOrderNoBarrier); + Callback queries = Bind(&AtomicInt::Load, + Unretained(&query_counter_), + kMemOrderNoBarrier); + metric_entity->NeverRetire( + METRIC_code_cache_hits.InstantiateFunctionGauge(metric_entity, hits)); + metric_entity->NeverRetire( + METRIC_code_cache_queries.InstantiateFunctionGauge(metric_entity, queries)); + return Status::OK(); +} + +bool CompilationManager::RequestRowProjector(const Schema* base_schema, + const Schema* projection, + gscoped_ptr* out) { + faststring key; + Status s = RowProjectorFunctions::EncodeKey(*base_schema, *projection, &key); + WARN_NOT_OK(s, "RowProjector compilation request failed"); + if (!s.ok()) return false; + query_counter_.Increment(); + + scoped_refptr cached( + down_cast(cache_.Lookup(key).get())); + + // If not cached, add a request to compilation pool + if (!cached) { + shared_ptr task( + new CompilationTask(*base_schema, *projection, &cache_, &generator_)); + WARN_NOT_OK(pool_->Submit(task), + "RowProjector compilation request failed"); + return false; + } + + hit_counter_.Increment(); + + out->reset(new RowProjector(base_schema, projection, cached)); + return true; +} + +} // namespace codegen +} // namespace kudu diff --git a/src/kudu/codegen/compilation_manager.h b/src/kudu/codegen/compilation_manager.h new file mode 100644 index 000000000000..0533c5822cb3 --- /dev/null +++ b/src/kudu/codegen/compilation_manager.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CODEGEN_COMPILATION_MANAGER_H +#define KUDU_CODEGEN_COMPILATION_MANAGER_H + +#include "kudu/codegen/code_generator.h" +#include "kudu/codegen/code_cache.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/singleton.h" +#include "kudu/util/atomic.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Counter; +class MetricEntity; +class MetricRegistry; +class ThreadPool; + +namespace codegen { + +class RowProjector; + +// The compilation manager is a top-level class which manages the actual +// delivery of a code generator's output by maintaining its own +// threadpool and code cache. It accepts requests to compile various classes +// (all the ones that the CodeGenerator offers) and attempts to retrieve a +// cached copy. If no such copy exists, it adds a request to generate it. +// +// Class is thread safe. +// +// The compilation manager is available as a global singleton only because +// it is intended to be used on a per-tablet-server basis. While in +// certain unit tests (that don't depend on compilation performance), +// there may be multiple TSs per processes, this will not occur in a +// distributed enviornment, where each TS has its own process. +// Furthermore, using a singleton ensures that lower-level classes which +// depend on code generation need not depend on a top-level class which +// instantiates them to provide a compilation manager. 
This avoids many +// unnecessary dependencies on state and lifetime of top-level and +// intermediary classes which should not be aware of the code generation's +// use in the first place. +class CompilationManager { + public: + // Waits for all async tasks to finish. + ~CompilationManager(); + + static CompilationManager* GetSingleton() { + return Singleton::get(); + } + + // If a codegenned row projector with compatible schemas (see + // codegen::JITSchemaPair::ProjectionsCompatible) is ready, + // then it is written to 'out' and true is returned. + // Otherwise, this enqueues a compilation task for the parameter + // schemas in the CompilationManager's thread pool and returns + // false. Upon any failure, false is returned. + // Does not write to 'out' if false is returned. + bool RequestRowProjector(const Schema* base_schema, + const Schema* projection, + gscoped_ptr* out); + + // Waits for all asynchronous compilation tasks to finish. + void Wait(); + + // Sets up a metric registry to observe the compilation manager's metrics. + // This method is used instead of registering a counter with a given + // registry because the CompilationManager is a singleton and there would + // be lifetime issues if the manager was dependent on a single registry. 
+ Status StartInstrumentation(const scoped_refptr& metric_entity); + + private: + friend class Singleton; + CompilationManager(); + + static void Shutdown(); + + CodeGenerator generator_; + CodeCache cache_; + gscoped_ptr pool_; + + AtomicInt hit_counter_; + AtomicInt query_counter_; + + static const int kDefaultCacheCapacity = 100; + static const int kThreadTimeoutMs = 100; + + DISALLOW_COPY_AND_ASSIGN(CompilationManager); +}; + +} // namespace codegen +} // namespace kudu + +#endif diff --git a/src/kudu/codegen/jit_wrapper.cc b/src/kudu/codegen/jit_wrapper.cc new file mode 100644 index 000000000000..056baac8df88 --- /dev/null +++ b/src/kudu/codegen/jit_wrapper.cc @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/codegen/jit_wrapper.h" + +#include + +#include "kudu/util/faststring.h" + +using llvm::ExecutionEngine; +using std::unique_ptr; + +namespace kudu { +namespace codegen { + +JITWrapper::JITWrapper(unique_ptr owner) + : owner_(std::move(owner)) {} + +JITWrapper::~JITWrapper() {} + +} // namespace codegen +} // namespace kudu diff --git a/src/kudu/codegen/jit_wrapper.h b/src/kudu/codegen/jit_wrapper.h new file mode 100644 index 000000000000..19981673a571 --- /dev/null +++ b/src/kudu/codegen/jit_wrapper.h @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CODEGEN_JIT_WRAPPER_H +#define KUDU_CODEGEN_JIT_WRAPPER_H + +#include + +#include "kudu/gutil/casts.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" + +namespace llvm { +class ExecutionEngine; +} // namespace llvm + +namespace kudu { + +class faststring; + +namespace codegen { + +typedef llvm::ExecutionEngine JITCodeOwner; + +// A JITWrapper is the combination of a jitted code and a pointer +// (or pointers) to function(s) within that jitted code. Holding +// a ref-counted pointer to a JITWrapper ensures the validity of +// the codegenned function. 
A JITWrapper owns its code uniquely. +// +// All independent units which should be codegenned should derive +// from this type and update the JITWrapperType enum below so that there is +// a consistent unique identifier among jitted cache keys (each class may +// have its own different key encodings after those bytes). +class JITWrapper : public RefCountedThreadSafe { + public: + enum JITWrapperType { + ROW_PROJECTOR + }; + + // Returns the key encoding (for the code cache) for this upon success. + // If two JITWrapper instances of the same type have the same key, then + // their codegenned code should be functionally equivalent. + // Appends key to 'out' upon success. + // The key must be unique amongst all derived types of JITWrapper. + // To do this, the type's enum value from JITWrapper::JITWrapperType + // should be prefixed to out. + virtual Status EncodeOwnKey(faststring* out) = 0; + + protected: + explicit JITWrapper(std::unique_ptr owner); + virtual ~JITWrapper(); + + private: + friend class RefCountedThreadSafe; + + std::unique_ptr owner_; + + DISALLOW_COPY_AND_ASSIGN(JITWrapper); +}; + +} // namespace codegen +} // namespace kudu + +#endif diff --git a/src/kudu/codegen/module_builder.cc b/src/kudu/codegen/module_builder.cc new file mode 100644 index 000000000000..f43f439deea7 --- /dev/null +++ b/src/kudu/codegen/module_builder.cc @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/codegen/module_builder.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/codegen/precompiled.ll.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" + +#ifndef CODEGEN_MODULE_BUILDER_DO_OPTIMIZATIONS +#if NDEBUG +#define CODEGEN_MODULE_BUILDER_DO_OPTIMIZATIONS 1 +#else +#define CODEGEN_MODULE_BUILDER_DO_OPTIMIZATIONS 0 +#endif +#endif + +using llvm::CodeGenOpt::Level; +using llvm::ConstantExpr; +using llvm::ConstantInt; +using llvm::EngineBuilder; +using llvm::ExecutionEngine; +using llvm::Function; +using llvm::FunctionType; +using llvm::IntegerType; +using llvm::legacy::FunctionPassManager; +using llvm::legacy::PassManager; +using llvm::LLVMContext; +using llvm::Module; +using llvm::PassManagerBuilder; +using llvm::PointerType; +using llvm::raw_os_ostream; +using llvm::SMDiagnostic; +using llvm::TargetMachine; +using llvm::Type; +using llvm::Value; +using std::move; +using std::ostream; +using std::string; +using std::stringstream; +using std::unique_ptr; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace codegen { + +namespace { + +string ToString(const SMDiagnostic& err) { + stringstream sstr; + raw_os_ostream os(sstr); + err.print("precompiled.ll", os); + os.flush(); + return Substitute("line $0 col $1: $2", + err.getLineNo(), err.getColumnNo(), + 
sstr.str()); +} + +string ToString(const Module& m) { + stringstream sstr; + raw_os_ostream os(sstr); + os << m; + return sstr.str(); +} + +// This method is needed for the implicit conversion from +// llvm::StringRef to std::string +string ToString(const Function* f) { + return f->getName(); +} + +bool ModuleContains(const Module& m, const Function* fptr) { + for (const auto& function : m) { + if (&function == fptr) return true; + } + return false; +} + +} // anonymous namespace + +ModuleBuilder::ModuleBuilder() + : state_(kUninitialized), + context_(new LLVMContext()), + builder_(*context_) {} + +ModuleBuilder::~ModuleBuilder() {} + +Status ModuleBuilder::Init() { + CHECK_EQ(state_, kUninitialized) << "Cannot Init() twice"; + + // Even though the LLVM API takes an explicit length for the input IR, + // it appears to actually depend on NULL termination. We assert for it + // here because otherwise we end up with very strange LLVM errors which + // are tough to debug. + CHECK_EQ('\0', precompiled_ll_data[precompiled_ll_len]) << "IR not properly NULL-terminated"; + + // However, despite depending on the buffer being null terminated, it doesn't + // expect the null terminator to be included in the length of the buffer. + // Per http://llvm.org/docs/doxygen/html/classllvm_1_1MemoryBuffer.html : + // > In addition to basic access to the characters in the file, this interface + // > guarantees you can read one character past the end of the file, and that this + // > character will read as '\0'. + llvm::StringRef ir_data(precompiled_ll_data, precompiled_ll_len); + CHECK_GT(ir_data.size(), 0) << "IR not properly linked"; + + // Parse IR. 
+ SMDiagnostic err; + unique_ptr ir_buf(llvm::MemoryBuffer::getMemBuffer(ir_data)); + module_ = llvm::parseIR(ir_buf->getMemBufferRef(), err, *context_); + if (!module_) { + return Status::ConfigurationError("Could not parse IR", ToString(err)); + } + VLOG(3) << "Successfully parsed IR:\n" << ToString(*module_); + + // TODO: consider parsing this module once instead of on each invocation. + state_ = kBuilding; + return Status::OK(); +} + +Function* ModuleBuilder::Create(FunctionType* fty, const string& name) { + CHECK_EQ(state_, kBuilding); + return Function::Create(fty, Function::ExternalLinkage, name, module_.get()); +} + +Function* ModuleBuilder::GetFunction(const string& name) { + CHECK_EQ(state_, kBuilding); + // All extern "C" functions are guaranteed to have the same + // exact name as declared in the source file. + return CHECK_NOTNULL(module_->getFunction(name)); +} + +Type* ModuleBuilder::GetType(const string& name) { + CHECK_EQ(state_, kBuilding); + // Technically clang is not obligated to name every + // class as "class.kudu::ClassName" but so long as there + // are no naming conflicts in the LLVM context it appears + // to do so (naming conflicts are avoided by having 1 context + // per module) + return CHECK_NOTNULL(module_->getTypeByName(name)); +} + +Value* ModuleBuilder::GetPointerValue(void* ptr) const { + CHECK_EQ(state_, kBuilding); + // No direct way of creating constant pointer values in LLVM, so + // first a constant int has to be created and then casted to a pointer + IntegerType* llvm_uintptr_t = Type::getIntNTy(*context_, 8 * sizeof(ptr)); + uintptr_t int_value = reinterpret_cast(ptr); + ConstantInt* llvm_int_value = ConstantInt::get(llvm_uintptr_t, + int_value, false); + Type* llvm_ptr_t = Type::getInt8PtrTy(*context_); + return ConstantExpr::getIntToPtr(llvm_int_value, llvm_ptr_t); +} + + +void ModuleBuilder::AddJITPromise(llvm::Function* llvm_f, + FunctionAddress* actual_f) { + CHECK_EQ(state_, kBuilding); + 
DCHECK(ModuleContains(*module_, llvm_f)) + << "Function " << ToString(llvm_f) << " does not belong to ModuleBuilder."; + JITFuture fut; + fut.llvm_f_ = llvm_f; + fut.actual_f_ = actual_f; + futures_.push_back(fut); +} + +namespace { + +#if CODEGEN_MODULE_BUILDER_DO_OPTIMIZATIONS + +void DoOptimizations(ExecutionEngine* engine, + Module* module, + const vector& external_functions) { + PassManagerBuilder pass_builder; + pass_builder.OptLevel = 2; + // Don't optimize for code size (this corresponds to -O2/-O3) + pass_builder.SizeLevel = 0; + pass_builder.Inliner = llvm::createFunctionInliningPass(); + + FunctionPassManager fpm(module); + pass_builder.populateFunctionPassManager(fpm); + fpm.doInitialization(); + + // For each function in the module, optimize it + for (Function& f : *module) { + // The bool return value here just indicates whether the passes did anything. + // We can safely expect that many functions are too small to do any optimization. + ignore_result(fpm.run(f)); + } + fpm.doFinalization(); + + PassManager module_passes; + + // Internalize all functions that aren't explicitly specified with external linkage. + module_passes.add(llvm::createInternalizePass(external_functions)); + pass_builder.populateModulePassManager(module_passes); + + // Same as above, the result here just indicates whether optimization made any changes. + // Don't need to check it. 
+ ignore_result(module_passes.run(*module)); +} + +#endif + +} // anonymous namespace + +Status ModuleBuilder::Compile(unique_ptr* out) { + CHECK_EQ(state_, kBuilding); + + // Attempt to generate the engine + string str; +#ifdef NDEBUG + Level opt_level = llvm::CodeGenOpt::Aggressive; +#else + Level opt_level = llvm::CodeGenOpt::None; +#endif + Module* module = module_.get(); + EngineBuilder ebuilder(move(module_)); + ebuilder.setErrorStr(&str); + ebuilder.setOptLevel(opt_level); + target_ = ebuilder.selectTarget(); + unique_ptr local_engine(ebuilder.create(target_)); + if (!local_engine) { + return Status::ConfigurationError("Code generation for module failed. " + "Could not start ExecutionEngine", + str); + } + module->setDataLayout(target_->createDataLayout()); + +#if CODEGEN_MODULE_BUILDER_DO_OPTIMIZATIONS + DoOptimizations(local_engine.get(), module, GetFunctionNames()); +#endif + + // Compile the module + local_engine->finalizeObject(); + + // Satisfy the promises + for (JITFuture& fut : futures_) { + *fut.actual_f_ = local_engine->getPointerToFunction(fut.llvm_f_); + if (*fut.actual_f_ == nullptr) { + return Status::NotFound( + "Code generation for module failed. Could not find function \"" + + ToString(fut.llvm_f_) + "\"."); + } + } + + // For LLVM 3.7, generated code lasts exactly as long as the execution engine + // that created it does. Furthermore, if the module is removed from the + // engine's ownership, neither the context nor the module have to stick + // around for the jitted code to run. 
+ CHECK(local_engine->removeModule(module)); // releases ownership + module_.reset(module); + + // Upon success write to the output parameter + out->swap(local_engine); + state_ = kCompiled; + return Status::OK(); +} + +TargetMachine* ModuleBuilder::GetTargetMachine() const { + CHECK_EQ(state_, kCompiled); + return CHECK_NOTNULL(target_); +} + +vector ModuleBuilder::GetFunctionNames() const { + vector ret; + for (const JITFuture& fut : futures_) { + const char* name = CHECK_NOTNULL(fut.llvm_f_)->getName().data(); + ret.push_back(name); + } + return ret; +} + +} // namespace codegen +} // namespace kudu diff --git a/src/kudu/codegen/module_builder.h b/src/kudu/codegen/module_builder.h new file mode 100644 index 000000000000..a5661b0601e9 --- /dev/null +++ b/src/kudu/codegen/module_builder.h @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_CODEGEN_FUNCTION_BUILDER_H +#define KUDU_CODEGEN_FUNCTION_BUILDER_H + +#include +#include +#include + +#include +#include + +#include "kudu/util/status.h" + +namespace llvm { + +class ExecutionEngine; +class Function; +class FunctionType; +class LLVMContext; +class Module; +class TargetMachine; +class Type; +class Value; + +} // namespace llvm + +namespace kudu { +namespace codegen { + +// A ModuleBuilder provides an interface to generate code for procedures +// given a CodeGenerator to refer to. Builder can be used to create multiple +// functions. It is intended to make building functions easier than using +// LLVM's IRBuilder<> directly. Finally, a builder also provides an interface +// to precompiled functions and makes sure that the bytecode is linked to +// the working module. +// +// This class is not thread-safe. It is intended to be used by a single +// thread to build a set of functions. +// +// This class is just a helper for other classes within the codegen +// directory. It is intended to be used within *.cc files, and not to be +// included in outward-facing classes so other directories do not have a +// dependency on LLVM (this class is necessary because the templated +// IRBuilder<> cannot be forward-declared since it has default arguments). +// This class, however, can easily be forward-declared. +class ModuleBuilder { + private: + typedef void* FunctionAddress; + + public: + // Provide alias so template arguments can be changed in one place + typedef llvm::IRBuilder<> LLVMBuilder; + + // Creates a builder with a fresh module and context. + ModuleBuilder(); + + // Deletes own module and context if they have not been compiled. + ~ModuleBuilder(); + + // Inits a new module with parsed precompiled IR from precompiled.cc. 
+ // TODO: with multiple *.ll files, each file should be loaded on demand + Status Init(); + + // Create a new, empty function in the module with external linkage + llvm::Function* Create(llvm::FunctionType* fty, const std::string& name); + // Retrieve a precompiled type + llvm::Type* GetType(const std::string& name); + // Retrieve a precompiled function + llvm::Function* GetFunction(const std::string& name); + // Get the LLVM wrapper for a constant pointer value of type i8* + llvm::Value* GetPointerValue(void* ptr) const; + + LLVMBuilder* builder() { return &builder_; } + + // Once a function is complete, it may be offered to the module builder + // along with the location of the function pointer to be written to + // with the value of the JIT-compiled function pointer. Once the module + // builder's Compile() method is called, these value are filled. + // Requires that llvm::Function belong to this ModuleBuilder's module. + template + void AddJITPromise(llvm::Function* llvm_f, FuncPtr* actual_f) { + // The below cast is technically yields undefined behavior for + // versions of the standard prior to C++0x. However, the llvm + // interface forces us to use object-pointer to function-pointer + // casting. + AddJITPromise(llvm_f, reinterpret_cast(actual_f)); + } + + // Compiles all promised functions. Builder may not be used after + // this method, only destructed. Upon success, releases ownership + // of the execution engine through the 'out' parameter. + // + // After this method has been called, the jit-compiled code may be + // called as long as 'out' remains alive. Once 'out' destructs, + // the code will be freed. + Status Compile(std::unique_ptr* out); + + // Retrieves the TargetMachine that the engine builder guessed was + // the native target. Requires compilation is complete. + // Pointer is valid while Compile()'s ExecutionEngine is. + llvm::TargetMachine* GetTargetMachine() const; + + private: + // The different states a ModuleBuilder can be in. 
+ enum MBState { + kUninitialized, + kBuilding, + kCompiled + }; + // Basic POD which associates an llvm::Function to the location where its + // function pointer should be written to after compilation. + struct JITFuture { + llvm::Function* llvm_f_; + FunctionAddress* actual_f_; + }; + + void AddJITPromise(llvm::Function* llvm_f, FunctionAddress* actual_f); + // Returns a vector of the function names for the functions stored in the + // JITFutures. The pointers are valid so long as the futures_ vector's + // elements have valid llvm::Function* values. + std::vector GetFunctionNames() const; + + MBState state_; + std::vector futures_; + std::unique_ptr context_; + std::unique_ptr module_; + LLVMBuilder builder_; + llvm::TargetMachine* target_; // not owned + + DISALLOW_COPY_AND_ASSIGN(ModuleBuilder); +}; + +} // namespace codegen +} // namespace kudu + +#endif diff --git a/src/kudu/codegen/precompiled.cc b/src/kudu/codegen/precompiled.cc new file mode 100644 index 000000000000..b073176f6aff --- /dev/null +++ b/src/kudu/codegen/precompiled.cc @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// This file contains all of the functions that must be precompiled +// to an LLVM IR format (note: not bitcode to preserve function +// names for retrieval later). +// +// Note namespace scope is just for convenient symbol resolution. +// To preserve function names, extern "C" linkage is used, so these +// functions (1) must not be duplicated in any of the above headers +// and (2) do not belong to namespace kudu. +// +// NOTE: This file may rely on external definitions from any part of Kudu +// because the code generator will resolve external symbols at load time. +// However, the code generator relies on the fact that our Kudu binaries +// are built with unstripped visible symbols, so this style of code generation +// cannot be used in builds with settings that conflict with the required +// visibility (e.g., the client library). +// NOTE: This file is NOT compiled with ASAN annotations, even if Kudu +// is being built with ASAN. + +#include +#include + +#include "kudu/common/rowblock.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/memory/arena.h" + +// Even though this file is only needed for IR purposes, we need to check for +// IR_BUILD because we use a fake static library target to workaround a cmake +// dependencies bug. See 'ir_fake_target' in CMakeLists.txt. +#ifdef IR_BUILD + +// This file uses the 'always_inline' attribute on a bunch of functions to force +// the LLVM optimizer at runtime to inline them where it otherwise might not. +// Because the functions themselves aren't marked 'inline', gcc is unhappy with this. +// But, we can't mark them 'inline' or else they'll get optimized away and not even +// included in the .ll file. So, instead, we just mark them as always_inline in +// the IR_BUILD context. +#define IR_ALWAYS_INLINE __attribute__((always_inline)) + +// Workaround for an MCJIT deficiency where we see a link error when trying +// to load the JITted library. See the following LLVM bug and suggested workaround. 
+// https://llvm.org/bugs/show_bug.cgi?id=18062 +extern "C" void *__dso_handle __attribute__((__visibility__("hidden"))) = NULL; + +#else +#define IR_ALWAYS_INLINE +#endif + +namespace kudu { + +// Returns whether copy was successful (fails iff slice relocation fails, +// which can only occur if is_string is true). +// If arena is NULL, then no relocation occurs. +IR_ALWAYS_INLINE static bool BasicCopyCell( + uint64_t size, uint8_t* src, uint8_t* dst, bool is_string, Arena* arena) { + // Relocate indirect data + if (is_string) { + if (PREDICT_TRUE(arena != nullptr)) { + return PREDICT_TRUE(arena->RelocateSlice(*reinterpret_cast(src), + reinterpret_cast(dst))); + } + // If arena is NULL, don't relocate, but do copy the pointers to the raw + // data (callers that pass arena as NULL should be sure that the indirect + // data will stay alive after the projections) + } + + // Copy direct data + memcpy(dst, src, size); + return true; +} + +extern "C" { + +// Preface all used functions with _Precompiled to avoid the possibility +// of name clashes. Notice all the nontrivial types must be passed as +// void* parameters, otherwise LLVM will complain that the type does not match +// (and it is not possible to consistently extract the llvm::Type* from a +// parsed module which has the same signature as the one that would be passed +// as a parameter for the below functions if the did not use void* types). +// +// Note that: +// (1) There is no void* type in LLVM, instead i8* is used. +// (2) The functions below are all prefixed with _Precompiled to avoid +// any potential naming conflicts. + + +// declare i1 @_PrecompiledCopyCellToRowBlock( +// i64 size, i8* src, RowBlockRow* dst, i64 col, i1 is_string, Arena* arena) +// +// Performs the same function as CopyCell, copying size bytes of the +// cell pointed to by src to the cell of column col in the row pointed +// to by dst, copying indirect data to the parameter arena if is_string +// is true. 
Will hard crash if insufficient memory is available for +// relocation. Copies size bytes directly from the src cell. +// If arena is NULL then only the direct copy will occur. +// Returns whether successful. If not, out-of-memory during relocation of +// slices has occured, which can only happen if is_string is true. +IR_ALWAYS_INLINE bool _PrecompiledCopyCellToRowBlock( + uint64_t size, uint8_t* src, RowBlockRow* dst, + uint64_t col, bool is_string, Arena* arena) { + + // We manually compute the destination cell pointer here, rather than + // using dst->cell_ptr(), since we statically know the size of the column + // type. Using the normal access path would generate an 'imul' instruction, + // since it would be loading the column type info from the RowBlock object + // instead of our static parameter here. + size_t idx = dst->row_index(); + const RowBlock* block = dst->row_block(); + uint8_t* dst_cell = block->column_data_base_ptr(col) + idx * size; + return BasicCopyCell(size, src, dst_cell, is_string, arena); +} + +// declare i1 @_PrecompiledCopyCellToRowBlockNullable( +// i64 size, i8* src, RowBlockRow* dst, i64 col, i1 is_string, Arena* arena, +// i8* src_bitmap, i64 bitmap_idx) +// +// Performs the same function as _PrecompiledCopyCellToRowBlock but for nullable +// columns. Checks the parameter bitmap at the specified index and updates +// The row's bitmap accordingly. Then goes on to copy the cell over if it +// is not null. +// If arena is NULL then only the direct copy will occur (if the source +// bitmap indicates the cell itself is non-null). +// Returns whether successful. If not, out-of-memory during relocation of +// slices has occured, which can only happen if is_string is true. +IR_ALWAYS_INLINE bool _PrecompiledCopyCellToRowBlockNullable( + uint64_t size, uint8_t* src, RowBlockRow* dst, uint64_t col, bool is_string, + Arena* arena, uint8_t* src_bitmap, uint64_t bitmap_idx) { + // Using this method implies the nullablity of the column. 
+ // Write whether the column is nullable to the RowBlock's ColumnBlock's bitmap + bool is_null = BitmapTest(src_bitmap, bitmap_idx); + dst->cell(col).set_null(is_null); + // No more copies necessary if null + if (is_null) return true; + return _PrecompiledCopyCellToRowBlock(size, src, dst, col, is_string, arena); +} + +// declare void @_PrecompiledSetRowBlockCellSetNull +// RowBlockRow* %dst, i64 , i1 %is_null) +// +// Sets the cell at column 'col' for destination RowBlockRow 'dst' +// to be marked as 'is_null' (requires the column is nullable). +IR_ALWAYS_INLINE void _PrecompiledCopyCellToRowBlockSetNull( + RowBlockRow* dst, uint64_t col, bool is_null) { + dst->cell(col).set_null(is_null); +} + +} // extern "C" +} // namespace kudu diff --git a/src/kudu/codegen/precompiled.ll.h b/src/kudu/codegen/precompiled.ll.h new file mode 100644 index 000000000000..91c06dd006f0 --- /dev/null +++ b/src/kudu/codegen/precompiled.ll.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CODEGEN_PRECOMPILED_LL_H +#define KUDU_CODEGEN_PRECOMPILED_LL_H + +namespace kudu { +namespace codegen { + +// Declare the precompiled LLVM bitcode data. 
The actual data is provided by a +// cc file generated at build time using xxd. See codegen/CMakeLists.txt. +extern const char precompiled_ll_data[]; +extern const unsigned int precompiled_ll_len; + +} // namespace codegen +} // namespace kudu + +#endif // KUDU_CODEGEN_PRECOMPILED_LL_H diff --git a/src/kudu/codegen/row_projector.cc b/src/kudu/codegen/row_projector.cc new file mode 100644 index 000000000000..dd162859df05 --- /dev/null +++ b/src/kudu/codegen/row_projector.cc @@ -0,0 +1,488 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/codegen/row_projector.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/codegen/jit_wrapper.h" +#include "kudu/codegen/module_builder.h" +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/util/faststring.h" +#include "kudu/util/status.h" + +namespace llvm { +class LLVMContext; +} // namespace llvm + +using llvm::Argument; +using llvm::BasicBlock; +using llvm::ConstantInt; +using llvm::ExecutionEngine; +using llvm::Function; +using llvm::FunctionType; +using llvm::GenericValue; +using llvm::LLVMContext; +using llvm::Module; +using llvm::PointerType; +using llvm::Type; +using llvm::Value; +using std::ostream; +using std::string; +using std::unique_ptr; +using std::vector; + +DECLARE_bool(codegen_dump_functions); + +namespace kudu { +namespace codegen { + +namespace { + +// Generates a schema-to-schema projection function of the form: +// bool(int8_t* src, RowBlockRow* row, Arena* arena) +// Requires src is a contiguous row of the base schema. +// Returns a boolean indicating success. Failure can only occur if a string +// relocation fails. +// +// Uses CHECKs to make sure projection is well-formed. Use +// kudu::RowProjector::Init() to return an error status instead. 
+template +llvm::Function* MakeProjection(const string& name, + ModuleBuilder* mbuilder, + const kudu::RowProjector& proj) { + // Get the IRBuilder + ModuleBuilder::LLVMBuilder* builder = mbuilder->builder(); + LLVMContext& context = builder->getContext(); + + // Extract schema information from projector + const Schema& base_schema = *proj.base_schema(); + const Schema& projection = *proj.projection(); + + // Create the function after providing a declaration + vector argtypes = { Type::getInt8PtrTy(context), + PointerType::getUnqual(mbuilder->GetType("class.kudu::RowBlockRow")), + PointerType::getUnqual(mbuilder->GetType("class.kudu::Arena")) }; + FunctionType* fty = + FunctionType::get(Type::getInt1Ty(context), argtypes, false); + Function* f = mbuilder->Create(fty, name); + + // Get the function's Arguments + Function::arg_iterator it = f->arg_begin(); + Argument* src = &*it++; + Argument* rbrow = &*it++; + Argument* arena = &*it++; + DCHECK(it == f->arg_end()); + + // Give names to the arguments for debugging IR. + src->setName("src"); + rbrow->setName("rbrow"); + arena->setName("arena"); + + // Mark our arguments as not aliasing. This eliminates a redundant + // load of rbrow->row_block_ and rbrow->row_index_ for each column. + // Note that these arguments are 1-based indexes. + f->setDoesNotAlias(1); + f->setDoesNotAlias(2); + f->setDoesNotAlias(3); + + // Project row function in IR (note: values in angle brackets are + // constants whose values are determined right now, at JIT time). 
+ // + // define i1 @name(i8* noalias %src, RowBlockRow* noalias %rbrow, Arena* noalias %arena) + // entry: + // %src_bitmap = getelementptr i8* %src, i64 + // + // %src_cell = getelementptr i8* %src, i64 + // %result = call i1 @CopyCellToRowBlock( + // i64 , i8* %src_cell, RowBlockRow* %rbrow, + // i64 , i1 , Arena* %arena)** + // %success = and %success, %result*** + // + // + // + // call void @CopyCellToRowBlockNullDefault( + // RowBlockRow* %rbrow, i64 , i1 ) + // + // + // %src_cell = inttoptr i64 to i8* + // %result = call i1 @CopyCellToRowBlock( + // i64 , i8* %src_cell, RowBlockRow* %rbrow, + // i64 , i1 , Arena* %arena) + // %success = and %success, %result*** + // + // + // ret i1 %success + // + // **If the column is nullable, then the call is replaced with + // call i1 @CopyCellToRowBlockNullable( + // i64 , i8* %src_cell, RowBlockRow* %rbrow, i64 , + // i1 , Arena* %arena, i8* src_bitmap, i64 ) + // ***If the column is nullable and the default value is NULL, then the + // call is replaced with + // call void @CopyCellToRowBlockSetNull( + // RowBlockRow* %rbrow, i64 ) + // ****Technically, llvm ir does not support mutable registers. Thus, + // this is implemented by having "success" be the most recent result + // register of the last "and" instruction. The different "success" values + // can be differentiated by using a success_update_number. 
+ + // Retrieve appropriate precompiled rowblock cell functions + Function* copy_cell_not_null = + mbuilder->GetFunction("_PrecompiledCopyCellToRowBlock"); + Function* copy_cell_nullable = + mbuilder->GetFunction("_PrecompiledCopyCellToRowBlockNullable"); + Function* row_block_set_null = + mbuilder->GetFunction("_PrecompiledCopyCellToRowBlockSetNull"); + + // The bitmap for a contiguous row goes after the row data + // See common/row.h ContiguousRowHelper class + builder->SetInsertPoint(BasicBlock::Create(context, "entry", f)); + Value* src_bitmap = builder->CreateConstGEP1_64(src, base_schema.byte_size()); + src_bitmap->setName("src_bitmap"); + Value* success = builder->getInt1(true); + int success_update_number = 0; + + // Copy base data + for (const kudu::RowProjector::ProjectionIdxMapping& pmap : proj.base_cols_mapping()) { + // Retrieve information regarding this column-to-column transformation + size_t proj_idx = pmap.first; + size_t base_idx = pmap.second; + size_t src_offset = base_schema.column_offset(base_idx); + const ColumnSchema& col = base_schema.column(base_idx); + + // Create the common values between the nullable and nonnullable calls + Value* size = builder->getInt64(col.type_info()->size()); + Value* src_cell = builder->CreateConstGEP1_64(src, src_offset); + src_cell->setName(StrCat("src_cell_base_", base_idx)); + Value* col_idx = builder->getInt64(proj_idx); + ConstantInt* is_binary = builder->getInt1(col.type_info()->physical_type() == BINARY); + vector args = { size, src_cell, rbrow, col_idx, is_binary, arena }; + + // Add additional arguments if nullable + Function* to_call = copy_cell_not_null; + if (col.is_nullable()) { + args.push_back(src_bitmap); + args.push_back(builder->getInt64(base_idx)); + to_call = copy_cell_nullable; + } + + // Make the call and check the return value + Value* result = builder->CreateCall(to_call, args); + result->setName(StrCat("result_b", base_idx, "_p", proj_idx)); + success = builder->CreateAnd(success, 
result); + success->setName(StrCat("success", success_update_number++)); + } + + // TODO: Copy adapted base data + DCHECK(proj.adapter_cols_mapping().size() == 0) + << "Value Adapter not supported yet"; + + // Fill defaults + for (size_t dfl_idx : proj.projection_defaults()) { + // Retrieve mapping information + const ColumnSchema& col = projection.column(dfl_idx); + const void* dfl = READ ? col.read_default_value() : + col.write_default_value(); + + // Generate arguments + Value* size = builder->getInt64(col.type_info()->size()); + Value* src_cell = mbuilder->GetPointerValue(const_cast(dfl)); + Value* col_idx = builder->getInt64(dfl_idx); + ConstantInt* is_binary = builder->getInt1(col.type_info()->physical_type() == BINARY); + + // Handle default columns that are nullable + if (col.is_nullable()) { + Value* is_null = builder->getInt1(dfl == nullptr); + vector args = { rbrow, col_idx, is_null }; + builder->CreateCall(row_block_set_null, args); + // If dfl was NULL, we're done + if (dfl == nullptr) continue; + } + + // Make the copy cell call and check the return value + vector args = { size, src_cell, rbrow, col_idx, is_binary, arena }; + Value* result = builder->CreateCall(copy_cell_not_null, args); + result->setName(StrCat("result_dfl", dfl_idx)); + success = builder->CreateAnd(success, result); + success->setName(StrCat("success", success_update_number++)); + } + + // Return + builder->CreateRet(success); + + if (FLAGS_codegen_dump_functions) { + LOG(INFO) << "Dumping " << (READ? 
"read" : "write") << " projection:"; + f->dump(); + } + + return f; +} + +} // anonymous namespace + +RowProjectorFunctions::RowProjectorFunctions(const Schema& base_schema, + const Schema& projection, + ProjectionFunction read_f, + ProjectionFunction write_f, + unique_ptr owner) + : JITWrapper(std::move(owner)), + base_schema_(base_schema), + projection_(projection), + read_f_(read_f), + write_f_(write_f) { + CHECK(read_f != nullptr) + << "Promise to compile read function not fulfilled by ModuleBuilder"; + CHECK(write_f != nullptr) + << "Promise to compile write function not fulfilled by ModuleBuilder"; +} + +Status RowProjectorFunctions::Create(const Schema& base_schema, + const Schema& projection, + scoped_refptr* out, + llvm::TargetMachine** tm) { + ModuleBuilder builder; + RETURN_NOT_OK(builder.Init()); + + // Use a no-codegen row projector to check validity and to build + // the codegen functions. + kudu::RowProjector no_codegen(&base_schema, &projection); + RETURN_NOT_OK(no_codegen.Init()); + + // Build the functions for code gen. No need to mangle for uniqueness; + // in the rare case we have two projectors in one module, LLVM takes + // care of uniquifying when making a GlobalValue. 
+ Function* read = MakeProjection("ProjRead", &builder, no_codegen); + Function* write = MakeProjection("ProjWrite", &builder, no_codegen); + + // Have the ModuleBuilder accept promises to compile the functions + ProjectionFunction read_f, write_f; + builder.AddJITPromise(read, &read_f); + builder.AddJITPromise(write, &write_f); + + unique_ptr owner; + RETURN_NOT_OK(builder.Compile(&owner)); + + if (tm) { + *tm = builder.GetTargetMachine(); + } + out->reset(new RowProjectorFunctions(base_schema, projection, read_f, + write_f, std::move(owner))); + return Status::OK(); +} + +namespace { +// Convenience method which appends to a faststring +template +void AddNext(faststring* fs, const T& val) { + fs->append(&val, sizeof(T)); +} +} // anonymous namespace + +// Allocates space for and generates a key for a pair of schemas. The key +// is unique according to the criteria defined in the CodeCache class' +// block comment. In order to achieve this, we encode the schemas into +// a contiguous array of bytes as follows, in sequence. +// +// (1 byte) unique type identifier for RowProjectorFunctions +// (8 bytes) number, as unsigned long, of base columns +// (5 bytes each) base column types, in order +// 4 bytes for enum type +// 1 byte for nullability +// (8 bytes) number, as unsigned long, of projection columns +// (5 bytes each) projection column types, in order +// 4 bytes for enum type +// 1 byte for nullablility +// (8 bytes) number, as unsigned long, of base projection mappings +// (16 bytes each) base projection mappings, in order +// (24 bytes each) default projection columns, in order +// 8 bytes for the index +// 8 bytes for the read default +// 8 bytes for the write default +// +// This could be made more efficient by removing unnecessary information +// such as the top bits for many numbers, and using a thread-local buffer +// (the code cache copies its own key anyway). +// Respects IsCompatbile below. +// +// Writes to 'out' upon success. 
+Status RowProjectorFunctions::EncodeKey(const Schema& base, const Schema& proj, + faststring* out) { + kudu::RowProjector projector(&base, &proj); + RETURN_NOT_OK(projector.Init()); + + AddNext(out, JITWrapper::ROW_PROJECTOR); + AddNext(out, base.num_columns()); + for (const ColumnSchema& col : base.columns()) { + AddNext(out, col.type_info()->physical_type()); + AddNext(out, col.is_nullable()); + } + AddNext(out, proj.num_columns()); + for (const ColumnSchema& col : proj.columns()) { + AddNext(out, col.type_info()->physical_type()); + AddNext(out, col.is_nullable()); + } + AddNext(out, projector.base_cols_mapping().size()); + for (const kudu::RowProjector::ProjectionIdxMapping& map : projector.base_cols_mapping()) { + AddNext(out, map); + } + for (size_t dfl_idx : projector.projection_defaults()) { + const ColumnSchema& col = proj.column(dfl_idx); + AddNext(out, dfl_idx); + AddNext(out, col.read_default_value()); + AddNext(out, col.write_default_value()); + } + + return Status::OK(); +} + +RowProjector::RowProjector(const Schema* base_schema, const Schema* projection, + const scoped_refptr& functions) + : projector_(base_schema, projection), + functions_(functions) {} + +namespace { + +struct DefaultEquals { + template + bool operator()(const T& t1, const T& t2) { return t1 == t2; } +}; + +struct ColumnSchemaEqualsType { + bool operator()(const ColumnSchema& s1, const ColumnSchema& s2) { + return s1.EqualsType(s2); + } +}; + +template +bool ContainerEquals(const T& t1, const T& t2, const Equals& equals) { + if (t1.size() != t2.size()) return false; + if (!std::equal(t1.begin(), t1.end(), t2.begin(), equals)) return false; + return true; +} + +template +bool ContainerEquals(const T& t1, const T& t2) { + return ContainerEquals(t1, t2, DefaultEquals()); +} + +// This method defines what makes (base, projection) schema pairs compatible. 
+// In other words, this method can be thought of as the equivalence relation +// on the set of all well-formed (base, projection) schema pairs that +// partitions the set into equivalence classes which will have the exact +// same projection function code. +// +// This equivalence relation can be decomposed as: +// +// ProjectionsCompatible((base1, proj1), (base2, proj2)) := +// WELLFORMED(base1, proj1) && +// WELLFORMED(base2, proj2) && +// PROJEQUALS(base1, base2) && +// PROJEQUALS(proj1, proj2) && +// MAP(base1, proj1) == MAP(base2, proj2) +// +// where WELLFORMED checks that a projection is well-formed (i.e., a +// kudu::RowProjector can be initialized with the schema pair), PROJEQUAL +// is a relaxed version of the Schema::Equals() operator that is +// independent of column names and column IDs, and MAP addresses +// the actual dependency on column identification - which is the effect +// that those attributes have on the RowProjector's mapping (i.e., different +// names and IDs are ok, so long as the mapping is the same). Note that +// key columns are not given any special meaning in projection. Types +// and nullability of columns must be exactly equal between the two +// schema pairs. +// +// Status::OK corresponds to true in the equivalence relation and other +// statuses correspond to false, explaining why the projections are +// incompatible. 
+Status ProjectionsCompatible(const Schema& base1, const Schema& proj1, + const Schema& base2, const Schema& proj2) { + kudu::RowProjector rp1(&base1, &proj1), rp2(&base2, &proj2); + RETURN_NOT_OK_PREPEND(rp1.Init(), "(base1, proj1) projection " + "schema pair not well formed: "); + RETURN_NOT_OK_PREPEND(rp2.Init(), "(base2, proj2) projection " + "schema pair not well formed: "); + + if (!ContainerEquals(base1.columns(), base2.columns(), + ColumnSchemaEqualsType())) { + return Status::IllegalState("base schema types unequal"); + } + if (!ContainerEquals(proj1.columns(), proj2.columns(), + ColumnSchemaEqualsType())) { + return Status::IllegalState("projection schema types unequal"); + } + + if (!ContainerEquals(rp1.base_cols_mapping(), rp2.base_cols_mapping())) { + return Status::IllegalState("base column mappings do not match"); + } + if (!ContainerEquals(rp1.adapter_cols_mapping(), rp2.adapter_cols_mapping())) { + return Status::IllegalState("adapter column mappings do not match"); + } + if (!ContainerEquals(rp1.projection_defaults(), rp2.projection_defaults())) { + return Status::IllegalState("projection default indices do not match"); + } + + return Status::OK(); +} + +} // anonymous namespace + +Status RowProjector::Init() { + RETURN_NOT_OK(projector_.Init()); +#ifndef NDEBUG + RETURN_NOT_OK_PREPEND(ProjectionsCompatible( + *projector_.base_schema(), *projector_.projection(), + functions_->base_schema(), functions_->projection()), + "Codegenned row projector's schemas incompatible " + "with its functions' schemas:" + "\n projector base = " + + projector_.base_schema()->ToString() + + "\n projector proj = " + + projector_.projection()->ToString() + + "\n functions base = " + + functions_->base_schema().ToString() + + "\n functions proj = " + + functions_->projection().ToString()); +#endif + return Status::OK(); +} + +ostream& operator<<(ostream& o, const RowProjector& rp) { + o << "Row Projector s1->s2 with:\n" + << "\ts1 = " << rp.base_schema()->ToString() << 
"\n" + << "\ts2 = " << rp.projection()->ToString(); + return o; +} + +} // namespace codegen +} // namespace kudu diff --git a/src/kudu/codegen/row_projector.h b/src/kudu/codegen/row_projector.h new file mode 100644 index 000000000000..724f61ac0657 --- /dev/null +++ b/src/kudu/codegen/row_projector.h @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CODEGEN_ROW_PROJECTOR_H +#define KUDU_CODEGEN_ROW_PROJECTOR_H + +#include +#include +#include + +#include "kudu/codegen/jit_wrapper.h" +#include "kudu/common/row.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace llvm { +class TargetMachine; +} // namespace llvm + +namespace kudu { + +class Arena; + +namespace codegen { + +// The JITWrapper for codegen::RowProjector functions. Contains +// the compiled functions themselves as well as the schemas used +// to generate them. +class RowProjectorFunctions : public JITWrapper { + public: + // Compiles the row projector functions for the given base + // and projection. 
+ // Writes the llvm::TargetMachine* used to 'tm' (if not NULL) + // and the functions to 'out' upon success. + static Status Create(const Schema& base_schema, const Schema& projection, + scoped_refptr* out, + llvm::TargetMachine** tm = NULL); + + const Schema& base_schema() { return base_schema_; } + const Schema& projection() { return projection_; } + + typedef bool(*ProjectionFunction)(const uint8_t*, RowBlockRow*, Arena*); + ProjectionFunction read() const { return read_f_; } + ProjectionFunction write() const { return write_f_; } + + virtual Status EncodeOwnKey(faststring* out) OVERRIDE { + return EncodeKey(base_schema_, projection_, out); + } + + static Status EncodeKey(const Schema& base, const Schema& proj, + faststring* out); + + private: + RowProjectorFunctions(const Schema& base_schema, const Schema& projection, + ProjectionFunction read_f, ProjectionFunction write_f, + std::unique_ptr owner); + + const Schema base_schema_, projection_; + const ProjectionFunction read_f_, write_f_; +}; + +// This projector behaves the almost the same way as a tablet/RowProjector except that +// it only supports certain row types, and expects a regular Arena. Furthermore, +// the Reset() public method is unsupported. +// +// See documentation for RowProjector. Any differences in the API will be explained +// in this class. +class RowProjector { + public: + typedef kudu::RowProjector::ProjectionIdxMapping ProjectionIdxMapping; + + // Requires that both schemas remain valid for the lifetime of this + // object. Also requires that the schemas are compatible with + // the schemas used to create 'functions'. 
+ RowProjector(const Schema* base_schema, const Schema* projection, + const scoped_refptr& code); + + Status Init(); + + // Ignores relocations if dst_arena == NULL + template + Status ProjectRowForRead(const ContiguousRowType& src_row, + RowBlockRow* dst_row, + Arena* dst_arena) const { + DCHECK_SCHEMA_EQ(*base_schema(), *src_row.schema()); + DCHECK_SCHEMA_EQ(*projection(), *dst_row->schema()); + RowProjectorFunctions::ProjectionFunction f = functions_->read(); + if (PREDICT_TRUE(f(src_row.row_data(), dst_row, dst_arena))) { + return Status::OK(); + } + return Status::IOError("out of memory copying slice during projection. " + "Base schema row: ", base_schema()->DebugRow(src_row)); + } + + // Warning: the projection schema should have write-defaults defined + // if it has default columns. There was no check for default write + // columns during this class' initialization. + // Ignores relocations if dst_arena == NULL + template + Status ProjectRowForWrite(const ContiguousRowType& src_row, + RowBlockRow* dst_row, + Arena* dst_arena) const { + DCHECK_SCHEMA_EQ(*base_schema(), *src_row.schema()); + DCHECK_SCHEMA_EQ(*projection(), *dst_row->schema()); + RowProjectorFunctions::ProjectionFunction f = functions_->write(); + if (PREDICT_TRUE(f(src_row.row_data(), dst_row, dst_arena))) { + return Status::OK(); + } + return Status::IOError("out of memory copying slice during projection. 
" + "Base schema row: ", base_schema()->DebugRow(src_row)); + } + + const vector& base_cols_mapping() const { + return projector_.base_cols_mapping(); + } + const vector& adapter_cols_mapping() const { + return projector_.adapter_cols_mapping(); + } + const vector& projection_defaults() const { + return projector_.projection_defaults(); + } + bool is_identity() const { return projector_.is_identity(); } + const Schema* projection() const { return projector_.projection(); } + const Schema* base_schema() const { return projector_.base_schema(); } + + private: + kudu::RowProjector projector_; + scoped_refptr functions_; + + DISALLOW_COPY_AND_ASSIGN(RowProjector); +}; + +extern std::ostream& operator<<(std::ostream& o, const RowProjector& rp); + +} // namespace codegen +} // namespace kudu + +#endif diff --git a/src/kudu/common/CMakeLists.txt b/src/kudu/common/CMakeLists.txt new file mode 100644 index 000000000000..7cf94957270e --- /dev/null +++ b/src/kudu/common/CMakeLists.txt @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +PROTOBUF_GENERATE_CPP( + COMMON_PROTO_SRCS COMMON_PROTO_HDRS COMMON_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES common.proto) +ADD_EXPORTABLE_LIBRARY(kudu_common_proto + SRCS ${COMMON_PROTO_SRCS} + DEPS protobuf + NONLINK_DEPS ${COMMON_PROTO_TGTS}) + +PROTOBUF_GENERATE_CPP( + WIRE_PROTOCOL_PROTO_SRCS WIRE_PROTOCOL_PROTO_HDRS WIRE_PROTOCOL_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES wire_protocol.proto) +set(WIRE_PROTOCOL_PROTO_LIBS + kudu_common_proto + consensus_metadata_proto + protobuf) +ADD_EXPORTABLE_LIBRARY(wire_protocol_proto + SRCS ${WIRE_PROTOCOL_PROTO_SRCS} + DEPS ${WIRE_PROTOCOL_PROTO_LIBS} + NONLINK_DEPS ${WIRE_PROTOCOL_PROTO_TGTS}) + +set(COMMON_SRCS + encoded_key.cc + generic_iterators.cc + id_mapping.cc + iterator_stats.cc + key_encoder.cc + partial_row.cc + partition.cc + predicate_encoder.cc + rowblock.cc + row_changelist.cc + row_key-util.cc + row_operations.cc + scan_predicate.cc + scan_spec.cc + schema.cc + timestamp.cc + types.cc + wire_protocol.cc) + +# Workaround for clang bug https://llvm.org/bugs/show_bug.cgi?id=23757 +# in which it incorrectly optimizes row_key-util.cc and causes incorrect results. 
+if ("${COMPILER_FAMILY}" STREQUAL "clang") + set_source_files_properties(row_key-util.cc PROPERTIES COMPILE_FLAGS -fwrapv) +endif() + +set(COMMON_LIBS + kudu_common_proto + consensus_metadata_proto + wire_protocol_proto + kudu_util + gutil) + +ADD_EXPORTABLE_LIBRARY(kudu_common + SRCS ${COMMON_SRCS} + DEPS ${COMMON_LIBS}) + +set(KUDU_TEST_LINK_LIBS kudu_common ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(encoded_key-test) +ADD_KUDU_TEST(generic_iterators-test) +ADD_KUDU_TEST(id_mapping-test) +ADD_KUDU_TEST(partial_row-test) +ADD_KUDU_TEST(partition-test) +ADD_KUDU_TEST(predicate-test) +ADD_KUDU_TEST(predicate_encoder-test) +ADD_KUDU_TEST(row_changelist-test) +ADD_KUDU_TEST(row_key-util-test) +ADD_KUDU_TEST(row_operations-test) +ADD_KUDU_TEST(schema-test) +ADD_KUDU_TEST(types-test) +ADD_KUDU_TEST(wire_protocol-test) diff --git a/src/kudu/common/README b/src/kudu/common/README new file mode 100644 index 000000000000..d2a294cbac70 --- /dev/null +++ b/src/kudu/common/README @@ -0,0 +1,16 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This module contains utilities and protobuf message definitions +related to the Kudu data model and the Kudu wire protocol that are to +be shared between client, tserver, and master. 
diff --git a/src/kudu/common/columnblock.h b/src/kudu/common/columnblock.h new file mode 100644 index 000000000000..d6cf44d9782f --- /dev/null +++ b/src/kudu/common/columnblock.h @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_COLUMNBLOCK_H +#define KUDU_COMMON_COLUMNBLOCK_H + +#include "kudu/common/types.h" +#include "kudu/common/row.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Arena; +class ColumnBlockCell; + +// A block of data all belonging to a single column. +// This is simply a view into a buffer - it does not have any associated +// storage in and of itself. It does, however, maintain its type +// information, which can be used for extra type safety in debug mode. 
+class ColumnBlock { + public: + typedef ColumnBlockCell Cell; + + ColumnBlock(const TypeInfo* type, + uint8_t *null_bitmap, + void *data, + size_t nrows, + Arena *arena) + : type_(type), + null_bitmap_(null_bitmap), + data_(reinterpret_cast(data)), + nrows_(nrows), + arena_(arena) { + DCHECK(data_) << "null data"; + } + + void SetCellIsNull(size_t idx, bool is_null) { + DCHECK(is_nullable()); + BitmapChange(null_bitmap_, idx, !is_null); + } + + void SetCellValue(size_t idx, const void *new_val) { + strings::memcpy_inlined(mutable_cell_ptr(idx), new_val, type_->size()); + } + +#ifndef NDEBUG + void OverwriteWithPattern(size_t idx, StringPiece pattern) { + char *col_data = reinterpret_cast(mutable_cell_ptr(idx)); + kudu::OverwriteWithPattern(col_data, type_->size(), pattern); + } +#endif + + // Return a pointer to the given cell. + const uint8_t *cell_ptr(size_t idx) const { + DCHECK_LT(idx, nrows_); + return data_ + type_->size() * idx; + } + + // Returns a pointer to the given cell or NULL. + const uint8_t *nullable_cell_ptr(size_t idx) const { + return is_null(idx) ? NULL : cell_ptr(idx); + } + + Cell cell(size_t idx) const; + + uint8_t *null_bitmap() const { + return null_bitmap_; + } + + bool is_nullable() const { + return null_bitmap_ != NULL; + } + + bool is_null(size_t idx) const { + DCHECK(is_nullable()); + DCHECK_LT(idx, nrows_); + return !BitmapTest(null_bitmap_, idx); + } + + const size_t stride() const { return type_->size(); } + const uint8_t * data() const { return data_; } + uint8_t *data() { return data_; } + const size_t nrows() const { return nrows_; } + + Arena *arena() { return arena_; } + + const TypeInfo* type_info() const { + return type_; + } + + private: + friend class ColumnBlockCell; + friend class ColumnDataView; + + // Return a pointer to the given cell. 
+ uint8_t *mutable_cell_ptr(size_t idx) { + DCHECK_LT(idx, nrows_); + return data_ + type_->size() * idx; + } + + const TypeInfo *type_; + uint8_t *null_bitmap_; + + uint8_t *data_; + size_t nrows_; + + Arena *arena_; +}; + +// One of the cells in a ColumnBlock. +class ColumnBlockCell { + public: + ColumnBlockCell(ColumnBlock block, size_t row_idx) + : block_(std::move(block)), row_idx_(row_idx) {} + + const TypeInfo* typeinfo() const { return block_.type_info(); } + size_t size() const { return block_.type_info()->size(); } + const void* ptr() const { + return is_nullable() ? block_.nullable_cell_ptr(row_idx_) + : block_.cell_ptr(row_idx_); + } + void* mutable_ptr() { return block_.mutable_cell_ptr(row_idx_); } + bool is_nullable() const { return block_.is_nullable(); } + bool is_null() const { return block_.is_null(row_idx_); } + void set_null(bool is_null) { block_.SetCellIsNull(row_idx_, is_null); } + protected: + ColumnBlock block_; + size_t row_idx_; +}; + +inline ColumnBlockCell ColumnBlock::cell(size_t idx) const { + return ColumnBlockCell(*this, idx); +} + +// Wrap the ColumnBlock to expose a directly raw block at the specified offset. +// Used by the reader and block encoders to read/write raw data. +class ColumnDataView { + public: + explicit ColumnDataView(ColumnBlock *column_block, size_t first_row_idx = 0) + : column_block_(column_block), row_offset_(0) { + Advance(first_row_idx); + } + + void Advance(size_t skip) { + // Check <= here, not <, since you can skip to + // the very end of the data (leaving an empty block) + DCHECK_LE(skip, column_block_->nrows()); + row_offset_ += skip; + } + + size_t first_row_index() const { + return row_offset_; + } + + // Set 'nrows' bits of the the null-bitmap to "value" + // true if not null, false if null. 
+ void SetNullBits(size_t nrows, bool value) { + BitmapChangeBits(column_block_->null_bitmap(), row_offset_, nrows, value); + } + + uint8_t *data() { + return column_block_->mutable_cell_ptr(row_offset_); + } + + const uint8_t *data() const { + return column_block_->cell_ptr(row_offset_); + } + + Arena *arena() { return column_block_->arena(); } + + size_t nrows() const { + return column_block_->nrows() - row_offset_; + } + + const size_t stride() const { + return column_block_->stride(); + } + + const TypeInfo* type_info() const { + return column_block_->type_info(); + } + + private: + ColumnBlock *column_block_; + size_t row_offset_; +}; + +// Utility class which allocates temporary storage for a +// dense block of column data, freeing it when it goes +// out of scope. +// +// This is more useful in test code than production code, +// since it doesn't allocate from an arena, etc. +template +class ScopedColumnBlock : public ColumnBlock { + public: + typedef typename TypeTraits::cpp_type cpp_type; + + explicit ScopedColumnBlock(size_t n_rows) + : ColumnBlock(GetTypeInfo(type), + new uint8_t[BitmapSize(n_rows)], + new cpp_type[n_rows], + n_rows, + new Arena(1024, 1*1024*1024)), + null_bitmap_(null_bitmap()), + data_(reinterpret_cast(data())), + arena_(arena()) { + } + + const cpp_type &operator[](size_t idx) const { + return data_[idx]; + } + + cpp_type &operator[](size_t idx) { + return data_[idx]; + } + + private: + gscoped_array null_bitmap_; + gscoped_array data_; + gscoped_ptr arena_; + +}; + +} // namespace kudu +#endif diff --git a/src/kudu/common/common.proto b/src/kudu/common/common.proto new file mode 100644 index 000000000000..964629f4d05c --- /dev/null +++ b/src/kudu/common/common.proto @@ -0,0 +1,270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Protobufs which are common throughout Kudu. +// +// This file may contain protobufs which are persisted on disk +// as well as sent on the wire. If a particular protobuf is only +// used as part of the client-server wire protocol, it should go +// in common/wire_protocol.proto instead. If it is only used within +// the server(s), it should go in cfile/cfile.proto, server/metadata.proto, +// etc, as appropriate. +package kudu; + +option java_package = "org.kududb"; + +// If you add a new type keep in mind to add it to the end +// or update AddMapping() functions like the one in key_encoder.cc +// that have a vector that maps the protobuf tag with the index. 
+enum DataType { + UNKNOWN_DATA = 999; + UINT8 = 0; + INT8 = 1; + UINT16 = 2; + INT16 = 3; + UINT32 = 4; + INT32 = 5; + UINT64 = 6; + INT64 = 7; + STRING = 8; + BOOL = 9; + FLOAT = 10; + DOUBLE = 11; + BINARY = 12; + TIMESTAMP = 13; +} + +enum EncodingType { + UNKNOWN_ENCODING = 999; + AUTO_ENCODING = 0; + PLAIN_ENCODING = 1; + PREFIX_ENCODING = 2; + GROUP_VARINT = 3; + RLE = 4; + DICT_ENCODING = 5; + BIT_SHUFFLE = 6; +} + +enum CompressionType { + UNKNOWN_COMPRESSION = 999; + DEFAULT_COMPRESSION = 0; + NO_COMPRESSION = 1; + SNAPPY = 2; + LZ4 = 3; + ZLIB = 4; +} + +// TODO: Differentiate between the schema attributes +// that are only relevant to the server (e.g., +// encoding and compression) and those that also +// matter to the client. +message ColumnSchemaPB { + optional uint32 id = 1; + required string name = 2; + required DataType type = 3; + optional bool is_key = 4 [default = false]; + optional bool is_nullable = 5 [default = false]; + optional bytes read_default_value = 6; + optional bytes write_default_value = 7; + + // The following attributes refer to the on-disk storage of the column. + // They won't always be set, depending on context. + optional EncodingType encoding = 8 [default=AUTO_ENCODING]; + optional CompressionType compression = 9 [default=DEFAULT_COMPRESSION]; + optional int32 cfile_block_size = 10 [default=0]; +} + +message SchemaPB { + repeated ColumnSchemaPB columns = 1; +} + +message HostPortPB { + required string host = 1; + required uint32 port = 2; +} + +// The external consistency mode for client requests. +// This defines how transactions and/or sequences of operations that touch +// several TabletServers, in different machines, can be observed by external +// clients. +// +// Note that ExternalConsistencyMode makes no guarantee on atomicity, i.e. +// no sequence of operations is made atomic (or transactional) just because +// an external consistency mode is set. 
+// Note also that ExternalConsistencyMode has no implication on the +// consistency between replicas of the same tablet. +enum ExternalConsistencyMode { + UNKNOWN_EXTERNAL_CONSISTENCY_MODE = 0; + + // The response to any write will contain a timestamp. + // Any further calls from the same client to other servers will update + // those servers with that timestamp. The user will make sure that the + // timestamp is propagated through back-channels to other + // KuduClient's. + // + // WARNING: Failure to propagate timestamp information through + // back-channels will negate any external consistency guarantee under this + // mode. + // + // Example: + // 1 - Client A executes operation X in Tablet A + // 2 - Afterwards, Client A executes operation Y in Tablet B + // + // + // Client B may observe the following operation sequences: + // {}, {X}, {X Y} + // + // This is the default mode. + CLIENT_PROPAGATED = 1; + + // The server will guarantee that each transaction is externally + // consistent by making sure that none of its results are visible + // until every Kudu server agrees that the transaction is in the past. + // The client is not obligated to forward timestamp information + // through back-channels. + // + // WARNING: Depending on the clock synchronization state of TabletServers + // this may imply considerable latency. Moreover operations with + // COMMIT_WAIT requested external consistency will outright fail if + // TabletServer clocks are either unsynchronized or synchronized but + // with a maximum error which surpasses a pre-configured one. + // + // Example: + // - Client A executes operation X in Tablet A + // - Afterwards, Client A executes operation Y in Tablet B + // + // + // Client B may observe the following operation sequences: + // {}, {X}, {X Y} + COMMIT_WAIT = 2; +}; + +// The possible read modes for clients. +// Clients set these in Scan requests. 
+// The server keeps 2 snapshot boundaries: +// - The earliest snapshot: this corresponds to the earliest kept undo records +// in the tablet, meaning the current state (Base) can be undone up to +// this snapshot. +// - The latest snapshot: This corresponds to the instant beyond which no +// no transaction will have an earlier timestamp. Usually this corresponds +// to whatever clock->Now() returns, but can be higher if the client propagates +// a timestamp (see below). +enum ReadMode { + UNKNOWN_READ_MODE = 0; + + // When READ_LATEST is specified the server will execute the read independently + // of the clock and will always return all visible writes at the time the request + // was received. This type of read does not return a snapshot timestamp since + // it might not be repeatable, i.e. a later read executed at the same snapshot + // timestamp might yield rows that were committed by in-flight transactions. + // + // This is the default mode. + READ_LATEST = 1; + + // When READ_AT_SNAPSHOT is specified the server will attempt to perform a read + // at the required snapshot. If no snapshot is defined the server will take the + // current time as the snapshot timestamp. Snapshot reads are repeatable, i.e. + // all future reads at the same timestamp will yield the same rows. This is + // performed at the expense of waiting for in-flight transactions whose timestamp + // is lower than the snapshot's timestamp to complete. + // + // When mixing reads and writes clients that specify COMMIT_WAIT as their + // external consistency mode and then use the returned write_timestamp to + // to perform snapshot reads are guaranteed that that snapshot time is + // considered in the past by all servers and no additional action is + // necessary. Clients using CLIENT_PROPAGATED however must forcibly propagate + // the timestamps even at read time, so that the server will not generate + // any more transactions before the snapshot requested by the client. 
+ // The latter option is implemented by allowing the client to specify one or + // two timestamps, the first one obtained from the previous CLIENT_PROPAGATED + // write, directly or through back-channels, must be signed and will be + // checked by the server. The second one, if defined, is the actual snapshot + // read time. When selecting both the latter must be lower than or equal to + // the former. + // TODO implement actually signing the propagated timestamp. + READ_AT_SNAPSHOT = 2; +} + +// The possible order modes for clients. +// Clients specify these in new scan requests. +// Ordered scans are fault-tolerant, and can be retried elsewhere in the case +// of tablet server failure. However, ordered scans impose additional overhead +// since the tablet server needs to sort the result rows. +enum OrderMode { + UNKNOWN_ORDER_MODE = 0; + // This is the default order mode. + UNORDERED = 1; + ORDERED = 2; +} + +// The serialized format of a Kudu table partition schema. +message PartitionSchemaPB { + + // A column identifier for partition schemas. In general, the name will be + // used when a client creates the table since column IDs are assigned by the + // master. All other uses of partition schemas will use the numeric column ID. + message ColumnIdentifierPB { + oneof identifier { + int32 id = 1; + string name = 2; + } + } + + message RangeSchemaPB { + // Column identifiers of columns included in the range. All columns must be + // a component of the primary key. + repeated ColumnIdentifierPB columns = 1; + } + + message HashBucketSchemaPB { + // Column identifiers of columns included in the hash. Every column must be + // a component of the primary key. + repeated ColumnIdentifierPB columns = 1; + + // Number of buckets into which columns will be hashed. Must be at least 2. + required int32 num_buckets = 2; + + // Seed value for hash calculation. 
Administrators may set a seed value + // on a per-table basis in order to randomize the mapping of rows to + // buckets. Setting a seed provides some amount of protection against denial + // of service attacks when the hash bucket columns contain user provided + // input. + optional uint32 seed = 3; + + enum HashAlgorithm { + UNKNOWN = 0; + MURMUR_HASH_2 = 1; + } + + // The hash algorithm to use for calculating the hash bucket. + optional HashAlgorithm hash_algorithm = 4; + } + + repeated HashBucketSchemaPB hash_bucket_schemas = 1; + optional RangeSchemaPB range_schema = 2; +} + +// The serialized format of a Kudu table partition. +message PartitionPB { + // The hash buckets of the partition. The number of hash buckets must match + // the number of hash bucket components in the partition's schema. + repeated int32 hash_buckets = 1 [packed = true]; + // The encoded start partition key (inclusive). + optional bytes partition_key_start = 2; + // The encoded end partition key (exclusive). + optional bytes partition_key_end = 3; +} diff --git a/src/kudu/common/encoded_key-test.cc b/src/kudu/common/encoded_key-test.cc new file mode 100644 index 000000000000..3b31218d2c64 --- /dev/null +++ b/src/kudu/common/encoded_key-test.cc @@ -0,0 +1,301 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/encoded_key.h" + +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/slice.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/test_util.h" +#include "kudu/util/test_macros.h" + +namespace kudu { + +#define EXPECT_ROWKEY_EQ(schema, expected, enc_key) \ + do { \ + SCOPED_TRACE(""); \ + EXPECT_NO_FATAL_FAILURE(ExpectRowKeyEq((schema), (expected), (enc_key))); \ + } while (0) + +#define EXPECT_DECODED_KEY_EQ(type, expected, encoded_form, val) \ + do { \ + SCOPED_TRACE(""); \ + EXPECT_NO_FATAL_FAILURE(ExpectDecodedKeyEq<(type)>((expected), (encoded_form), (val))); \ + } while (0) + +class EncodedKeyTest : public ::testing::Test { + public: + EncodedKeyTest() + : schema_(CreateSchema()), + key_builder_(&schema_) { + } + + static Schema CreateSchema() { + return Schema({ ColumnSchema("key", UINT32) }, 1); + } + + EncodedKey* BuildEncodedKey(EncodedKeyBuilder& key_builder, int val) { + key_builder.Reset(); + key_builder.AddColumnKey(&val); + return key_builder.BuildEncodedKey(); + } + + // Test whether target lies within the numerical key ranges given by + // start and end. If -1, an empty slice is used instead. + bool InRange(int start, int end, int target) { + gscoped_ptr start_key(BuildEncodedKey(key_builder_, start)); + gscoped_ptr end_key(BuildEncodedKey(key_builder_, end)); + gscoped_ptr target_key(BuildEncodedKey(key_builder_, target)); + return target_key->InRange(start != -1 ? start_key->encoded_key() : Slice(), + end != -1 ? 
end_key->encoded_key() : Slice()); + } + + void ExpectRowKeyEq(const Schema& schema, + const string& exp_str, + const EncodedKey& key) { + EXPECT_EQ(exp_str, schema.DebugEncodedRowKey(key.encoded_key(), Schema::START_KEY)); + } + + template + void ExpectDecodedKeyEq(const string& expected, + const Slice& encoded_form, + void* val) { + Schema schema({ ColumnSchema("key", Type) }, 1); + EncodedKeyBuilder builder(&schema); + builder.AddColumnKey(val); + gscoped_ptr key(builder.BuildEncodedKey()); + EXPECT_ROWKEY_EQ(schema, expected, *key); + EXPECT_EQ(encoded_form, key->encoded_key()); + } + + private: + Schema schema_; + EncodedKeyBuilder key_builder_; +}; + +TEST_F(EncodedKeyTest, TestKeyInRange) { + ASSERT_TRUE(InRange(-1, -1, 0)); + ASSERT_TRUE(InRange(-1, -1, 50)); + + ASSERT_TRUE(InRange(-1, 30, 0)); + ASSERT_TRUE(InRange(-1, 30, 29)); + ASSERT_FALSE(InRange(-1, 30, 30)); + ASSERT_FALSE(InRange(-1, 30, 31)); + + ASSERT_FALSE(InRange(10, -1, 9)); + ASSERT_TRUE(InRange(10, -1, 10)); + ASSERT_TRUE(InRange(10, -1, 11)); + ASSERT_TRUE(InRange(10, -1, 31)); + + ASSERT_FALSE(InRange(10, 20, 9)); + ASSERT_TRUE(InRange(10, 20, 10)); + ASSERT_TRUE(InRange(10, 20, 19)); + ASSERT_FALSE(InRange(10, 20, 20)); + ASSERT_FALSE(InRange(10, 20, 21)); +} + +TEST_F(EncodedKeyTest, TestDecodeSimpleKeys) { + { + uint8_t val = 123; + EXPECT_DECODED_KEY_EQ(UINT8, "(uint8 key=123)", "\x7b", &val); + } + + { + int8_t val = -123; + EXPECT_DECODED_KEY_EQ(INT8, "(int8 key=-123)", "\x05", &val); + } + + { + uint16_t val = 12345; + EXPECT_DECODED_KEY_EQ(UINT16, "(uint16 key=12345)", "\x30\x39", &val); + } + + { + int16_t val = 12345; + EXPECT_DECODED_KEY_EQ(INT16, "(int16 key=12345)", "\xb0\x39", &val); + } + + { + int16_t val = -12345; + EXPECT_DECODED_KEY_EQ(INT16, "(int16 key=-12345)", "\x4f\xc7", &val); + } + + { + uint32_t val = 123456; + EXPECT_DECODED_KEY_EQ(UINT32, "(uint32 key=123456)", + Slice("\x00\x01\xe2\x40", 4), &val); + } + + { + int32_t val = -123456; + 
EXPECT_DECODED_KEY_EQ(INT32, "(int32 key=-123456)", "\x7f\xfe\x1d\xc0", &val); + } + + { + uint64_t val = 1234567891011121314; + EXPECT_DECODED_KEY_EQ(UINT64, "(uint64 key=1234567891011121314)", + "\x11\x22\x10\xf4\xb2\xd2\x30\xa2", &val); + } + + { + int64_t val = -1234567891011121314; + EXPECT_DECODED_KEY_EQ(INT64, "(int64 key=-1234567891011121314)", + "\x6e\xdd\xef\x0b\x4d\x2d\xcf\x5e", &val); + } + + { + Slice val("aKey"); + EXPECT_DECODED_KEY_EQ(STRING, "(string key=aKey)", "aKey", &val); + } +} + +TEST_F(EncodedKeyTest, TestDecodeCompoundKeys) { + gscoped_ptr key; + { + // Integer type compound key. + Schema schema({ ColumnSchema("key0", UINT16), + ColumnSchema("key1", UINT32), + ColumnSchema("key2", UINT64) }, 3); + + EncodedKeyBuilder builder(&schema); + uint16_t key0 = 12345; + uint32_t key1 = 123456; + uint64_t key2 = 1234567891011121314; + builder.AddColumnKey(&key0); + builder.AddColumnKey(&key1); + builder.AddColumnKey(&key2); + key.reset(builder.BuildEncodedKey()); + + EXPECT_ROWKEY_EQ(schema, + "(uint16 key0=12345, uint32 key1=123456, uint64 key2=1234567891011121314)", + *key); + } + + { + // Mixed type compound key with STRING last. 
+ Schema schema({ ColumnSchema("key0", UINT16), + ColumnSchema("key1", STRING) }, 2); + EncodedKeyBuilder builder(&schema); + uint16_t key0 = 12345; + Slice key1("aKey"); + builder.AddColumnKey(&key0); + builder.AddColumnKey(&key1); + key.reset(builder.BuildEncodedKey()); + + EXPECT_ROWKEY_EQ(schema, "(uint16 key0=12345, string key1=aKey)", *key); + } + + { + // Mixed type compound key with STRING in the middle + Schema schema({ ColumnSchema("key0", UINT16), + ColumnSchema("key1", STRING), + ColumnSchema("key2", UINT8) }, 3); + EncodedKeyBuilder builder(&schema); + uint16_t key0 = 12345; + Slice key1("aKey"); + uint8_t key2 = 123; + builder.AddColumnKey(&key0); + builder.AddColumnKey(&key1); + builder.AddColumnKey(&key2); + key.reset(builder.BuildEncodedKey()); + + EXPECT_ROWKEY_EQ(schema, "(uint16 key0=12345, string key1=aKey, uint8 key2=123)", *key); + } +} + +TEST_F(EncodedKeyTest, TestConstructFromEncodedString) { + gscoped_ptr key; + Arena arena(1024, 1024*1024); + + { + // Integer type compound key. + Schema schema({ ColumnSchema("key0", UINT16), + ColumnSchema("key1", UINT32), + ColumnSchema("key2", UINT64) }, 3); + + // Prefix with only one full column specified + ASSERT_OK(EncodedKey::DecodeEncodedString( + schema, &arena, + Slice("\x00\x01" + "\x00\x00\x00\x02" + "\x00\x00\x00\x00\x00\x00\x00\x03", 14), + &key)); + EXPECT_ROWKEY_EQ(schema, "(uint16 key0=1, uint32 key1=2, uint64 key2=3)", *key); + } +} + +// Test encoding random strings and ensure that the decoded string +// matches the input. 
+TEST_F(EncodedKeyTest, TestRandomStringEncoding) { + Random r(SeedRandom()); + char buf[80]; + faststring encoded; + Arena arena(1024, 1024); + for (int i = 0; i < 10000; i++) { + encoded.clear(); + arena.Reset(); + + int len = r.Uniform(sizeof(buf)); + RandomString(buf, len, &r); + + Slice in_slice(buf, len); + KeyEncoderTraits::EncodeWithSeparators(&in_slice, false, &encoded); + + Slice to_decode(encoded); + Slice decoded_slice; + // C++ does not allow commas in macro invocations without being wrapped in parenthesis. + ASSERT_OK((KeyEncoderTraits::DecodeKeyPortion( + &to_decode, false, &arena, + reinterpret_cast(&decoded_slice)))); + + ASSERT_EQ(decoded_slice.ToDebugString(), in_slice.ToDebugString()) + << "encoded: " << Slice(encoded).ToDebugString(); + } +} + +#ifdef NDEBUG + +// Without this wrapper function, small changes to the code size of +// EncodeWithSeparators would cause the benchmark to either inline or not +// inline the function under test, making the benchmark unreliable. +ATTRIBUTE_NOINLINE +static void NoInlineDoEncode(Slice s, bool is_last, faststring* dst) { + KeyEncoderTraits::EncodeWithSeparators(s, is_last, dst); +} + +TEST_F(EncodedKeyTest, BenchmarkStringEncoding) { + string data; + for (int i = 0; i < 100; i++) { + data += "abcdefghijklmnopqrstuvwxyz"; + } + + for (int size = 0; size < 32; size++) { + LOG_TIMING(INFO, strings::Substitute("1M strings: size=$0", size)) { + faststring dst; + for (int i = 0; i < 1000000; i++) { + dst.clear(); + NoInlineDoEncode(Slice(data.c_str(), size), false, &dst); + } + } + } +} +#endif +} // namespace kudu diff --git a/src/kudu/common/encoded_key.cc b/src/kudu/common/encoded_key.cc new file mode 100644 index 000000000000..e4ea0273b49d --- /dev/null +++ b/src/kudu/common/encoded_key.cc @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/common/encoded_key.h" +#include "kudu/common/key_encoder.h" +#include "kudu/common/row.h" +#include "kudu/common/row_key-util.h" + +namespace kudu { + +using std::string; + + +EncodedKey::EncodedKey(faststring* data, + vector *raw_keys, + size_t num_key_cols) + : num_key_cols_(num_key_cols) { + int len = data->size(); + data_.reset(data->release()); + encoded_key_ = Slice(data_.get(), len); + + DCHECK_LE(raw_keys->size(), num_key_cols); + + raw_keys_.swap(*raw_keys); +} + +gscoped_ptr EncodedKey::FromContiguousRow(const ConstContiguousRow& row) { + EncodedKeyBuilder kb(row.schema()); + for (int i = 0; i < row.schema()->num_key_columns(); i++) { + kb.AddColumnKey(row.cell_ptr(i)); + } + return make_gscoped_ptr(kb.BuildEncodedKey()); + +} + +Status EncodedKey::DecodeEncodedString(const Schema& schema, + Arena* arena, + const Slice& encoded, + gscoped_ptr* result) { + uint8_t* raw_key_buf = static_cast(arena->AllocateBytes(schema.key_byte_size())); + if (PREDICT_FALSE(!raw_key_buf)) { + return Status::RuntimeError("OOM"); + } + + RETURN_NOT_OK(schema.DecodeRowKey(encoded, raw_key_buf, arena)); + + vector raw_keys(schema.num_key_columns()); + for (int i = 0; i < schema.num_key_columns(); i++) { + raw_keys[i] = raw_key_buf + schema.column_offset(i); 
+ } + + faststring data_copy; + data_copy.assign_copy(encoded.data(), encoded.size()); + + result->reset(new EncodedKey(&data_copy, &raw_keys, schema.num_key_columns())); + return Status::OK(); +} + +Status EncodedKey::IncrementEncodedKey(const Schema& tablet_schema, + gscoped_ptr *key, + Arena* arena) { + // Copy the row itself to the Arena. + uint8_t* new_row_key = static_cast( + arena->AllocateBytes(tablet_schema.key_byte_size())); + if (PREDICT_FALSE(!new_row_key)) { + return Status::RuntimeError("Out of memory allocating row key"); + } + + vector new_raw_keys(tablet_schema.num_key_columns()); + for (int i = 0; i < tablet_schema.num_key_columns(); i++) { + int size = tablet_schema.column(i).type_info()->size(); + + void* dst = new_row_key + tablet_schema.column_offset(i); + new_raw_keys[i] = dst; + memcpy(dst, + (*key)->raw_keys()[i], + size); + } + + // Increment the new key + ContiguousRow new_row(&tablet_schema, new_row_key); + if (!row_key_util::IncrementKey(&new_row, arena)) { + return Status::IllegalState("No lexicographically greater key exists"); + } + + // Re-encode it. 
+ faststring buf; + tablet_schema.EncodeComparableKey(new_row, &buf); + + key->reset(new EncodedKey(&buf, &new_raw_keys, tablet_schema.num_key_columns())); + return Status::OK(); +} + +string EncodedKey::Stringify(const Schema &schema) const { + if (num_key_cols_ == 1) { + return schema.column(0).Stringify(raw_keys_.front()); + } + + faststring s; + s.append("("); + for (int i = 0; i < num_key_cols_; i++) { + if (i > 0) { + s.append(","); + } + if (i < raw_keys_.size()) { + s.append(schema.column(i).Stringify(raw_keys_[i])); + } else { + s.append("*"); + } + } + s.append(")"); + return s.ToString(); +} + +//////////////////////////////////////////////////////////// + +EncodedKeyBuilder::EncodedKeyBuilder(const Schema* schema) + : schema_(schema), + encoded_key_(schema->key_byte_size()), + num_key_cols_(schema->num_key_columns()), + idx_(0) { +} + +void EncodedKeyBuilder::Reset() { + encoded_key_.clear(); + idx_ = 0; + raw_keys_.clear(); + encoded_key_.reserve(schema_->key_byte_size()); +} + +void EncodedKeyBuilder::AddColumnKey(const void *raw_key) { + DCHECK_LT(idx_, num_key_cols_); + + const ColumnSchema &col = schema_->column(idx_); + DCHECK(!col.is_nullable()); + + const TypeInfo* ti = col.type_info(); + bool is_last = idx_ == num_key_cols_ - 1; + GetKeyEncoder(ti).Encode(raw_key, is_last, &encoded_key_); + raw_keys_.push_back(raw_key); + + ++idx_; +} + +EncodedKey *EncodedKeyBuilder::BuildEncodedKey() { + if (idx_ == 0) { + return nullptr; + } + auto ret = new EncodedKey(&encoded_key_, &raw_keys_, num_key_cols_); + idx_ = 0; + return ret; +} + +void EncodedKeyBuilder::AssignCopy(const EncodedKeyBuilder &other) { + DCHECK_SCHEMA_EQ(*schema_, *other.schema_); + + encoded_key_.assign_copy(other.encoded_key_.data(), + other.encoded_key_.length()); + idx_ = other.idx_; + raw_keys_.assign(other.raw_keys_.begin(), other.raw_keys_.end()); +} + +string EncodedKey::RangeToString(const EncodedKey* lower, const EncodedKey* upper) { + string ret; + if (lower && upper) { + 
ret.append("encoded key BETWEEN "); + ret.append(lower->encoded_key().ToDebugString()); + ret.append(" AND "); + ret.append(upper->encoded_key().ToDebugString()); + return ret; + } else if (lower) { + ret.append("encoded key >= "); + ret.append(lower->encoded_key().ToDebugString()); + return ret; + } else if (upper) { + ret.append("encoded key <= "); + ret.append(upper->encoded_key().ToDebugString()); + } else { + LOG(DFATAL) << "Invalid key!"; + ret = "invalid key range"; + } + return ret; +} + +string EncodedKey::RangeToStringWithSchema(const EncodedKey* lower, const EncodedKey* upper, + const Schema& s) { + string ret; + if (lower) { + ret.append("PK >= "); + ret.append(s.DebugEncodedRowKey(lower->encoded_key(), Schema::START_KEY)); + } + if (lower && upper) { + ret.append(" AND "); + } + if (upper) { + ret.append("PK < "); + ret.append(s.DebugEncodedRowKey(upper->encoded_key(), Schema::END_KEY)); + } + return ret; +} + +} // namespace kudu diff --git a/src/kudu/common/encoded_key.h b/src/kudu/common/encoded_key.h new file mode 100644 index 000000000000..2a60654dca2c --- /dev/null +++ b/src/kudu/common/encoded_key.h @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_ENCODED_KEY_H +#define KUDU_COMMON_ENCODED_KEY_H + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/util/faststring.h" + +namespace kudu { + +class ConstContiguousRow; + +class EncodedKey { + public: + // Constructs a new EncodedKey. + // This class takes over the value of 'data' and contents of + // raw_keys. Note that num_key_cols is the number of key columns for + // the schema, but this may be different from the size of raw_keys + // in which case raw_keys represents the supplied prefix of a + // composite key. + EncodedKey(faststring *data, + vector *raw_keys, + size_t num_key_cols); + + static gscoped_ptr FromContiguousRow(const ConstContiguousRow& row); + + // Decode the encoded key specified in 'encoded', which must correspond to the + // provided schema. + // The returned row data is allocated from 'arena' and returned in '*result'. + // If allocation fails or the encoding is invalid, returns a bad Status. + static Status DecodeEncodedString(const Schema& schema, + Arena* arena, + const Slice& encoded, + gscoped_ptr *result); + + // Given an EncodedKey, increment it to the next lexicographically greater EncodedKey. + static Status IncrementEncodedKey(const Schema& tablet_schema, + gscoped_ptr* key, + Arena* arena); + + const Slice &encoded_key() const { return encoded_key_; } + + const vector &raw_keys() const { return raw_keys_; } + + size_t num_key_columns() const { return num_key_cols_; } + + std::string Stringify(const Schema &schema) const; + + // Tests whether this EncodedKey is within the bounds given by 'start' + // and 'end'. + // + // The empty bound has special significance: it's both the lowest value + // (if in 'start') and the highest (if in 'end'). 
+ bool InRange(const Slice& start, const Slice& end) const { + return (start.compare(encoded_key_) <= 0 && + (end.empty() || encoded_key_.compare(end) < 0)); + } + + static std::string RangeToString(const EncodedKey* lower, + const EncodedKey* upper); + + static std::string RangeToStringWithSchema(const EncodedKey* lower, + const EncodedKey* upper, + const Schema& schema); + + + private: + const int num_key_cols_; + Slice encoded_key_; + gscoped_ptr data_; + vector raw_keys_; +}; + +// A builder for encoded key: creates an encoded key from +// one or more key columns specified as raw pointers. +class EncodedKeyBuilder { + public: + // 'schema' must remain valid for the lifetime of the EncodedKeyBuilder. + explicit EncodedKeyBuilder(const Schema* schema); + + void Reset(); + + void AddColumnKey(const void *raw_key); + + EncodedKey *BuildEncodedKey(); + + void AssignCopy(const EncodedKeyBuilder &other); + + private: + DISALLOW_COPY_AND_ASSIGN(EncodedKeyBuilder); + + const Schema* schema_; + faststring encoded_key_; + const size_t num_key_cols_; + size_t idx_; + vector raw_keys_; +}; + +} // namespace kudu +#endif diff --git a/src/kudu/common/generic_iterators-test.cc b/src/kudu/common/generic_iterators-test.cc new file mode 100644 index 000000000000..cb870bebdaac --- /dev/null +++ b/src/kudu/common/generic_iterators-test.cc @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/generic_iterators.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/scan_spec.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(num_lists, 3, "Number of lists to merge"); +DEFINE_int32(num_rows, 1000, "Number of entries per list"); +DEFINE_int32(num_iters, 1, "Number of times to run merge"); + +namespace kudu { + +using std::shared_ptr; + +static const Schema kIntSchema({ ColumnSchema("val", UINT32) }, 1); + +// Test iterator which just yields integer rows from a provided +// vector. 
+class VectorIterator : public ColumnwiseIterator { + public: + explicit VectorIterator(vector ints) + : ints_(std::move(ints)), + cur_idx_(0) { + } + + Status Init(ScanSpec *spec) OVERRIDE { + return Status::OK(); + } + + virtual Status PrepareBatch(size_t *nrows) OVERRIDE { + int rem = ints_.size() - cur_idx_; + if (rem < *nrows) { + *nrows = rem; + } + prepared_ = rem; + return Status::OK(); + } + + virtual Status InitializeSelectionVector(SelectionVector *sel_vec) OVERRIDE { + sel_vec->SetAllTrue(); + return Status::OK(); + } + + virtual Status MaterializeColumn(size_t col, ColumnBlock *dst) OVERRIDE { + CHECK_EQ(UINT32, dst->type_info()->physical_type()); + DCHECK_LE(prepared_, dst->nrows()); + + for (size_t i = 0; i < prepared_; i++) { + dst->SetCellValue(i, &(ints_[cur_idx_++])); + } + + return Status::OK(); + } + + virtual Status FinishBatch() OVERRIDE { + prepared_ = 0; + return Status::OK(); + } + + virtual bool HasNext() const OVERRIDE { + return cur_idx_ < ints_.size(); + } + + virtual string ToString() const OVERRIDE { + return string("VectorIterator"); + } + + virtual const Schema &schema() const OVERRIDE { + return kIntSchema; + } + + virtual void GetIteratorStats(vector* stats) const OVERRIDE { + stats->resize(schema().num_columns()); + } + + private: + vector ints_; + int cur_idx_; + size_t prepared_; +}; + +// Test that empty input to a merger behaves correctly. 
+TEST(TestMergeIterator, TestMergeEmpty) { + vector empty_vec; + shared_ptr iter( + new MaterializingIterator( + shared_ptr(new VectorIterator(empty_vec)))); + + vector > to_merge; + to_merge.push_back(iter); + + MergeIterator merger(kIntSchema, to_merge); + ASSERT_OK(merger.Init(nullptr)); + ASSERT_FALSE(merger.HasNext()); +} + + +class TestIntRangePredicate { + public: + TestIntRangePredicate(uint32_t lower, uint32_t upper) : + lower_(lower), + upper_(upper), + pred_(kIntSchema.column(0), &lower_, &upper_) {} + + uint32_t lower_, upper_; + ColumnRangePredicate pred_; +}; + +void TestMerge(const TestIntRangePredicate &predicate) { + vector > to_merge; + vector ints; + vector all_ints; + all_ints.reserve(FLAGS_num_rows * FLAGS_num_lists); + + // Setup predicate exclusion + ScanSpec spec; + spec.AddPredicate(predicate.pred_); + LOG(INFO) << "Predicate: " << predicate.pred_.ToString(); + + for (int i = 0; i < FLAGS_num_lists; i++) { + ints.clear(); + ints.reserve(FLAGS_num_rows); + + uint32_t entry = 0; + for (int j = 0; j < FLAGS_num_rows; j++) { + entry += rand() % 5; + ints.push_back(entry); + // Evaluate the predicate before pushing to all_ints + if (entry >= predicate.lower_ && entry <= predicate.upper_) { + all_ints.push_back(entry); + } + } + + shared_ptr iter( + new MaterializingIterator( + shared_ptr(new VectorIterator(ints)))); + vector > to_union; + to_union.push_back(iter); + to_merge.push_back(shared_ptr(new UnionIterator(to_union))); + } + + VLOG(1) << "Predicate expects " << all_ints.size() << " results"; + + LOG_TIMING(INFO, "std::sort the expected results") { + std::sort(all_ints.begin(), all_ints.end()); + } + + for (int trial = 0; trial < FLAGS_num_iters; trial++) { + LOG_TIMING(INFO, "Iterate merged lists") { + MergeIterator merger(kIntSchema, to_merge); + ASSERT_OK(merger.Init(&spec)); + + RowBlock dst(kIntSchema, 100, nullptr); + size_t total_idx = 0; + while (merger.HasNext()) { + ASSERT_OK(merger.NextBlock(&dst)); + ASSERT_GT(dst.nrows(), 0) 
<< + "if HasNext() returns true, must return some rows"; + + for (int i = 0; i < dst.nrows(); i++) { + uint32_t this_row = *kIntSchema.ExtractColumnFromRow(dst.row(i), 0); + ASSERT_GE(this_row, predicate.lower_) << "Yielded integer excluded by predicate"; + ASSERT_LE(this_row, predicate.upper_) << "Yielded integer excluded by predicate"; + if (all_ints[total_idx] != this_row) { + ASSERT_EQ(all_ints[total_idx], this_row) << + "Yielded out of order at idx " << total_idx; + } + total_idx++; + } + } + } + } +} + +TEST(TestMergeIterator, TestMerge) { + TestIntRangePredicate predicate(0, MathLimits::kMax); + TestMerge(predicate); +} + + +TEST(TestMergeIterator, TestPredicate) { + TestIntRangePredicate predicate(0, FLAGS_num_rows / 5); + TestMerge(predicate); +} + +// Test that the MaterializingIterator properly evaluates predicates when they apply +// to single columns. +TEST(TestMaterializingIterator, TestMaterializingPredicatePushdown) { + ScanSpec spec; + TestIntRangePredicate pred1(20, 29); + spec.AddPredicate(pred1.pred_); + LOG(INFO) << "Predicate: " << pred1.pred_.ToString(); + + vector ints; + for (int i = 0; i < 100; i++) { + ints.push_back(i); + } + + shared_ptr colwise(new VectorIterator(ints)); + MaterializingIterator materializing(colwise); + ASSERT_OK(materializing.Init(&spec)); + ASSERT_EQ(0, spec.predicates().size()) + << "Iterator should have pushed down predicate"; + + Arena arena(1024, 1024); + RowBlock dst(kIntSchema, 100, &arena); + ASSERT_OK(materializing.NextBlock(&dst)); + ASSERT_EQ(dst.nrows(), 100); + + // Check that the resulting selection vector is correct (rows 20-29 selected) + ASSERT_EQ(10, dst.selection_vector()->CountSelected()); + ASSERT_FALSE(dst.selection_vector()->IsRowSelected(0)); + ASSERT_TRUE(dst.selection_vector()->IsRowSelected(20)); + ASSERT_TRUE(dst.selection_vector()->IsRowSelected(29)); + ASSERT_FALSE(dst.selection_vector()->IsRowSelected(30)); +} + +// Test that PredicateEvaluatingIterator will properly evaluate predicates 
on its +// input. +TEST(TestPredicateEvaluatingIterator, TestPredicateEvaluation) { + ScanSpec spec; + TestIntRangePredicate pred1(20, 29); + spec.AddPredicate(pred1.pred_); + LOG(INFO) << "Predicate: " << pred1.pred_.ToString(); + + vector ints; + for (int i = 0; i < 100; i++) { + ints.push_back(i); + } + + // Set up a MaterializingIterator with pushdown disabled, so that the + // PredicateEvaluatingIterator will wrap it and do evaluation. + shared_ptr colwise(new VectorIterator(ints)); + MaterializingIterator *materializing = new MaterializingIterator(colwise); + materializing->disallow_pushdown_for_tests_ = true; + + // Wrap it in another iterator to do the evaluation + shared_ptr outer_iter(materializing); + ASSERT_OK(PredicateEvaluatingIterator::InitAndMaybeWrap(&outer_iter, &spec)); + + ASSERT_NE(reinterpret_cast(outer_iter.get()), + reinterpret_cast(materializing)) + << "Iterator pointer should differ after wrapping"; + + PredicateEvaluatingIterator *pred_eval = down_cast( + outer_iter.get()); + + ASSERT_EQ(0, spec.predicates().size()) + << "Iterator tree should have accepted predicate"; + ASSERT_EQ(1, pred_eval->predicates_.size()) + << "Predicate should be evaluated by the outer iterator"; + + Arena arena(1024, 1024); + RowBlock dst(kIntSchema, 100, &arena); + ASSERT_OK(outer_iter->NextBlock(&dst)); + ASSERT_EQ(dst.nrows(), 100); + + // Check that the resulting selection vector is correct (rows 20-29 selected) + ASSERT_EQ(10, dst.selection_vector()->CountSelected()); + ASSERT_FALSE(dst.selection_vector()->IsRowSelected(0)); + ASSERT_TRUE(dst.selection_vector()->IsRowSelected(20)); + ASSERT_TRUE(dst.selection_vector()->IsRowSelected(29)); + ASSERT_FALSE(dst.selection_vector()->IsRowSelected(30)); +} + +// Test that PredicateEvaluatingIterator::InitAndMaybeWrap doesn't wrap an underlying +// iterator when there are no predicates left. 
+TEST(TestPredicateEvaluatingIterator, TestDontWrapWhenNoPredicates) { + ScanSpec spec; + + vector ints; + shared_ptr colwise(new VectorIterator(ints)); + shared_ptr materializing(new MaterializingIterator(colwise)); + shared_ptr outer_iter(materializing); + ASSERT_OK(PredicateEvaluatingIterator::InitAndMaybeWrap(&outer_iter, &spec)); + ASSERT_EQ(outer_iter, materializing) << "InitAndMaybeWrap should not have wrapped iter"; +} + +} // namespace kudu diff --git a/src/kudu/common/generic_iterators.cc b/src/kudu/common/generic_iterators.cc new file mode 100644 index 000000000000..6a0cbc08d08a --- /dev/null +++ b/src/kudu/common/generic_iterators.cc @@ -0,0 +1,598 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +#include +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/common/row.h" +#include "kudu/common/rowblock.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/memory/arena.h" + +using std::shared_ptr; +using std::string; + +DEFINE_bool(materializing_iterator_do_pushdown, true, + "Should MaterializingIterator do predicate pushdown"); +TAG_FLAG(materializing_iterator_do_pushdown, hidden); + +namespace kudu { + +//////////////////////////////////////////////////////////// +// Merge iterator +//////////////////////////////////////////////////////////// + +// TODO: size by bytes, not # rows +static const int kMergeRowBuffer = 1000; + +// MergeIterState wraps a RowwiseIterator for use by the MergeIterator. +// Importantly, it also filters out unselected rows from the wrapped RowwiseIterator, +// such that all returned rows are valid. +class MergeIterState { + public: + explicit MergeIterState(const shared_ptr &iter) : + iter_(iter), + arena_(1024, 256*1024), + read_block_(iter->schema(), kMergeRowBuffer, &arena_), + next_row_idx_(0), + num_advanced_(0), + num_valid_(0) + {} + + const RowBlockRow& next_row() { + DCHECK_LT(num_advanced_, num_valid_); + return next_row_; + } + + Status Advance() { + num_advanced_++; + if (IsBlockExhausted()) { + arena_.Reset(); + return PullNextBlock(); + } else { + // Seek to the next selected row. 
+ SelectionVector *selection = read_block_.selection_vector(); + for (++next_row_idx_; next_row_idx_ < read_block_.nrows(); next_row_idx_++) { + if (selection->IsRowSelected(next_row_idx_)) { + next_row_.Reset(&read_block_, next_row_idx_); + break; + } + } + DCHECK_NE(next_row_idx_, read_block_.nrows()+1) << "No selected rows found!"; + return Status::OK(); + } + } + + bool IsBlockExhausted() const { + return num_advanced_ == num_valid_; + } + + bool IsFullyExhausted() const { + return num_valid_ == 0; + } + + Status PullNextBlock() { + CHECK_EQ(num_advanced_, num_valid_) + << "should not pull next block until current block is exhausted"; + + if (!iter_->HasNext()) { + // Fully exhausted + num_advanced_ = 0; + num_valid_ = 0; + return Status::OK(); + } + + RETURN_NOT_OK(iter_->NextBlock(&read_block_)); + num_advanced_ = 0; + // Honor the selection vector of the read_block_, since not all rows are necessarily selected. + SelectionVector *selection = read_block_.selection_vector(); + DCHECK_EQ(selection->nrows(), read_block_.nrows()); + DCHECK_LE(selection->CountSelected(), read_block_.nrows()); + num_valid_ = selection->CountSelected(); + VLOG(2) << selection->CountSelected() << "/" << read_block_.nrows() << " rows selected"; + // Seek next_row_ to the first selected row. + for (next_row_idx_ = 0; next_row_idx_ < read_block_.nrows(); next_row_idx_++) { + if (selection->IsRowSelected(next_row_idx_)) { + next_row_.Reset(&read_block_, next_row_idx_); + break; + } + } + DCHECK_NE(next_row_idx_, read_block_.nrows()+1) << "No selected rows found!"; + return Status::OK(); + } + + size_t remaining_in_block() const { + return num_valid_ - num_advanced_; + } + + const shared_ptr& iter() const { + return iter_; + } + + shared_ptr iter_; + Arena arena_; + RowBlock read_block_; + // The row currently pointed to by the iterator. + RowBlockRow next_row_; + // Row index of next_row_ in read_block_. 
+ size_t next_row_idx_; + // Number of rows we've advanced past in the current RowBlock. + size_t num_advanced_; + // Number of valid (selected) rows in the current RowBlock. + size_t num_valid_; +}; + + +MergeIterator::MergeIterator( + const Schema &schema, + const vector > &iters) + : schema_(schema), + initted_(false) { + CHECK_GT(iters.size(), 0); + CHECK_GT(schema.num_key_columns(), 0); + orig_iters_.assign(iters.begin(), iters.end()); +} + +Status MergeIterator::Init(ScanSpec *spec) { + CHECK(!initted_); + // TODO: check that schemas match up! + + RETURN_NOT_OK(InitSubIterators(spec)); + + for (shared_ptr &state : iters_) { + RETURN_NOT_OK(state->PullNextBlock()); + } + + // Before we copy any rows, clean up any iterators which were empty + // to start with. Otherwise, HasNext() won't properly return false + // if we were passed only empty iterators. + for (size_t i = 0; i < iters_.size(); i++) { + if (PREDICT_FALSE(iters_[i]->IsFullyExhausted())) { + iters_.erase(iters_.begin() + i); + i--; + continue; + } + } + + initted_ = true; + return Status::OK(); +} + +bool MergeIterator::HasNext() const { + CHECK(initted_); + return !iters_.empty(); +} + +Status MergeIterator::InitSubIterators(ScanSpec *spec) { + // Initialize all the sub iterators. + for (shared_ptr &iter : orig_iters_) { + ScanSpec *spec_copy = spec != nullptr ? scan_spec_copies_.Construct(*spec) : nullptr; + RETURN_NOT_OK(PredicateEvaluatingIterator::InitAndMaybeWrap(&iter, spec_copy)); + iters_.push_back(shared_ptr(new MergeIterState(iter))); + } + + // Since we handle predicates in all the wrapped iterators, we can clear + // them here. 
+ if (spec != nullptr) { + spec->mutable_predicates()->clear(); + } + return Status::OK(); +} + +Status MergeIterator::NextBlock(RowBlock* dst) { + CHECK(initted_); + DCHECK_SCHEMA_EQ(dst->schema(), schema()); + + PrepareBatch(dst); + RETURN_NOT_OK(MaterializeBlock(dst)); + + return Status::OK(); +} + +void MergeIterator::PrepareBatch(RowBlock* dst) { + if (dst->arena()) { + dst->arena()->Reset(); + } + + // We can always provide at least as many rows as are remaining + // in the currently queued up blocks. + size_t available = 0; + for (shared_ptr &iter : iters_) { + available += iter->remaining_in_block(); + } + + dst->Resize(std::min(dst->row_capacity(), available)); +} + +// TODO: this is an obvious spot to add codegen - there's a ton of branching +// and such around the comparisons. A simple experiment indicated there's some +// 2x to be gained. +Status MergeIterator::MaterializeBlock(RowBlock *dst) { + // Initialize the selection vector. + // MergeIterState only returns selected rows. + dst->selection_vector()->SetAllTrue(); + for (size_t dst_row_idx = 0; dst_row_idx < dst->nrows(); dst_row_idx++) { + RowBlockRow dst_row = dst->row(dst_row_idx); + + // Find the sub-iterator which is currently smallest + MergeIterState *smallest = nullptr; + ssize_t smallest_idx = -1; + + // Typically the number of iters_ is not that large, so using a priority + // queue is not worth it + for (size_t i = 0; i < iters_.size(); i++) { + shared_ptr &state = iters_[i]; + + if (smallest == nullptr || + schema_.Compare(state->next_row(), smallest->next_row()) < 0) { + smallest = state.get(); + smallest_idx = i; + } + } + + // If no iterators had any row left, then we're done iterating. 
+ if (PREDICT_FALSE(smallest == nullptr)) break; + + // Otherwise, copy the row from the smallest one, and advance it + RETURN_NOT_OK(CopyRow(smallest->next_row(), &dst_row, dst->arena())); + RETURN_NOT_OK(smallest->Advance()); + + if (smallest->IsFullyExhausted()) { + iters_.erase(iters_.begin() + smallest_idx); + } + } + + return Status::OK(); +} + +string MergeIterator::ToString() const { + string s; + s.append("Merge("); + bool first = true; + for (const shared_ptr &iter : orig_iters_) { + s.append(iter->ToString()); + if (!first) { + s.append(", "); + } + first = false; + } + s.append(")"); + return s; +} + +const Schema& MergeIterator::schema() const { + CHECK(initted_); + return schema_; +} + +void MergeIterator::GetIteratorStats(vector* stats) const { + CHECK(initted_); + vector > stats_by_iter; + for (const shared_ptr& iter : orig_iters_) { + vector stats_for_iter; + iter->GetIteratorStats(&stats_for_iter); + stats_by_iter.push_back(stats_for_iter); + } + for (size_t idx = 0; idx < schema_.num_columns(); ++idx) { + IteratorStats stats_for_col; + for (const vector& stats_for_iter : stats_by_iter) { + stats_for_col.AddStats(stats_for_iter[idx]); + } + stats->push_back(stats_for_col); + } +} + + +//////////////////////////////////////////////////////////// +// Union iterator +//////////////////////////////////////////////////////////// + +UnionIterator::UnionIterator(const vector > &iters) + : initted_(false), + iters_(iters.size()) { + CHECK_GT(iters.size(), 0); + iters_.assign(iters.begin(), iters.end()); + all_iters_.assign(iters.begin(), iters.end()); +} + +Status UnionIterator::Init(ScanSpec *spec) { + CHECK(!initted_); + + // Initialize the underlying iterators + RETURN_NOT_OK(InitSubIterators(spec)); + + // Verify schemas match. 
+ // Important to do the verification after initializing the + // sub-iterators, since they may not know their own schemas + // until they've been initialized (in the case of a union of unions) + schema_.reset(new Schema(iters_.front()->schema())); + for (const shared_ptr &iter : iters_) { + if (!iter->schema().Equals(*schema_)) { + return Status::InvalidArgument( + string("Schemas do not match: ") + schema_->ToString() + + " vs " + iter->schema().ToString()); + } + } + + initted_ = true; + return Status::OK(); +} + + +Status UnionIterator::InitSubIterators(ScanSpec *spec) { + for (shared_ptr &iter : iters_) { + ScanSpec *spec_copy = spec != nullptr ? scan_spec_copies_.Construct(*spec) : nullptr; + RETURN_NOT_OK(PredicateEvaluatingIterator::InitAndMaybeWrap(&iter, spec_copy)); + } + // Since we handle predicates in all the wrapped iterators, we can clear + // them here. + if (spec != nullptr) { + spec->mutable_predicates()->clear(); + } + return Status::OK(); +} + +bool UnionIterator::HasNext() const { + CHECK(initted_); + for (const shared_ptr &iter : iters_) { + if (iter->HasNext()) return true; + } + + return false; +} + +Status UnionIterator::NextBlock(RowBlock* dst) { + CHECK(initted_); + PrepareBatch(); + RETURN_NOT_OK(MaterializeBlock(dst)); + FinishBatch(); + return Status::OK(); +} + +void UnionIterator::PrepareBatch() { + CHECK(initted_); + + while (!iters_.empty() && + !iters_.front()->HasNext()) { + iters_.pop_front(); + } +} + +Status UnionIterator::MaterializeBlock(RowBlock *dst) { + return iters_.front()->NextBlock(dst); +} + +void UnionIterator::FinishBatch() { + if (!iters_.front()->HasNext()) { + // Iterator exhausted, remove it. 
+ iters_.pop_front(); + } +} + + +string UnionIterator::ToString() const { + string s; + s.append("Union("); + bool first = true; + for (const shared_ptr &iter : iters_) { + if (!first) { + s.append(", "); + } + first = false; + s.append(iter->ToString()); + } + s.append(")"); + return s; +} + +void UnionIterator::GetIteratorStats(std::vector* stats) const { + CHECK(initted_); + vector > stats_by_iter; + for (const shared_ptr& iter : all_iters_) { + vector stats_for_iter; + iter->GetIteratorStats(&stats_for_iter); + stats_by_iter.push_back(stats_for_iter); + } + for (size_t idx = 0; idx < schema_->num_columns(); ++idx) { + IteratorStats stats_for_col; + for (const vector& stats_for_iter : stats_by_iter) { + stats_for_col.AddStats(stats_for_iter[idx]); + } + stats->push_back(stats_for_col); + } +} + +//////////////////////////////////////////////////////////// +// Materializing iterator +//////////////////////////////////////////////////////////// + +MaterializingIterator::MaterializingIterator(shared_ptr iter) + : iter_(std::move(iter)), + disallow_pushdown_for_tests_(!FLAGS_materializing_iterator_do_pushdown) { +} + +Status MaterializingIterator::Init(ScanSpec *spec) { + RETURN_NOT_OK(iter_->Init(spec)); + + if (spec != nullptr && !disallow_pushdown_for_tests_) { + // Gather any single-column predicates. + ScanSpec::PredicateList *preds = spec->mutable_predicates(); + for (auto iter = preds->begin(); iter != preds->end();) { + const ColumnRangePredicate &pred = *iter; + const string &col_name = pred.column().name(); + int idx = schema().find_column(col_name); + if (idx == -1) { + return Status::InvalidArgument("No such column", col_name); + } + + VLOG(1) << "Pushing down predicate " << pred.ToString(); + preds_by_column_.insert(std::make_pair(idx, pred)); + + // Since we'll evaluate this predicate ourselves, remove it from the scan spec + // so higher layers don't repeat our work. 
+ iter = preds->erase(iter); + } + } + + // Determine a materialization order such that columns with predicates + // are materialized first. + // + // TODO: we can be a little smarter about this, by trying to estimate + // predicate selectivity, involve the materialization cost of types, etc. + vector with_preds, without_preds; + + for (size_t i = 0; i < schema().num_columns(); i++) { + int num_preds = preds_by_column_.count(i); + if (num_preds > 0) { + with_preds.push_back(i); + } else { + without_preds.push_back(i); + } + } + + materialization_order_.swap(with_preds); + materialization_order_.insert(materialization_order_.end(), + without_preds.begin(), without_preds.end()); + DCHECK_EQ(materialization_order_.size(), schema().num_columns()); + + return Status::OK(); +} + +bool MaterializingIterator::HasNext() const { + return iter_->HasNext(); +} + +Status MaterializingIterator::NextBlock(RowBlock* dst) { + size_t n = dst->row_capacity(); + if (dst->arena()) { + dst->arena()->Reset(); + } + + RETURN_NOT_OK(iter_->PrepareBatch(&n)); + dst->Resize(n); + RETURN_NOT_OK(MaterializeBlock(dst)); + RETURN_NOT_OK(iter_->FinishBatch()); + + return Status::OK(); +} + +Status MaterializingIterator::MaterializeBlock(RowBlock *dst) { + // Initialize the selection vector indicating which rows have been + // been deleted. + RETURN_NOT_OK(iter_->InitializeSelectionVector(dst->selection_vector())); + + bool short_circuit = false; + + for (size_t col_idx : materialization_order_) { + // Materialize the column itself into the row block. + ColumnBlock dst_col(dst->column_block(col_idx)); + RETURN_NOT_OK(iter_->MaterializeColumn(col_idx, &dst_col)); + + // Evaluate any predicates that apply to this column. 
+ auto range = preds_by_column_.equal_range(col_idx); + for (auto it = range.first; it != range.second; ++it) { + const ColumnRangePredicate &pred = it->second; + + pred.Evaluate(dst, dst->selection_vector()); + + // If after evaluating this predicate, the entire row block has now been + // filtered out, we don't need to materialize other columns at all. + if (!dst->selection_vector()->AnySelected()) { + short_circuit = true; + break; + } + } + if (short_circuit) { + break; + } + } + DVLOG(1) << dst->selection_vector()->CountSelected() << "/" + << dst->nrows() << " passed predicate"; + return Status::OK(); +} + +string MaterializingIterator::ToString() const { + string s; + s.append("Materializing(").append(iter_->ToString()).append(")"); + return s; +} + +//////////////////////////////////////////////////////////// +// PredicateEvaluatingIterator +//////////////////////////////////////////////////////////// + +PredicateEvaluatingIterator::PredicateEvaluatingIterator(shared_ptr base_iter) + : base_iter_(std::move(base_iter)) { +} + +Status PredicateEvaluatingIterator::InitAndMaybeWrap( + shared_ptr *base_iter, ScanSpec *spec) { + RETURN_NOT_OK((*base_iter)->Init(spec)); + if (spec != nullptr && + !spec->predicates().empty()) { + // Underlying iterator did not accept all predicates. Wrap it. + shared_ptr wrapper( + new PredicateEvaluatingIterator(*base_iter)); + CHECK_OK(wrapper->Init(spec)); + base_iter->swap(wrapper); + } + return Status::OK(); +} + +Status PredicateEvaluatingIterator::Init(ScanSpec *spec) { + // base_iter_ already Init()ed before this is constructed. + + CHECK_NOTNULL(spec); + // Gather any predicates that the base iterator did not pushdown. + // This also clears the predicates from the spec. 
+ predicates_.swap(*(spec->mutable_predicates())); + return Status::OK(); +} + +bool PredicateEvaluatingIterator::HasNext() const { + return base_iter_->HasNext(); +} + +Status PredicateEvaluatingIterator::NextBlock(RowBlock *dst) { + RETURN_NOT_OK(base_iter_->NextBlock(dst)); + + for (ColumnRangePredicate &pred : predicates_) { + pred.Evaluate(dst, dst->selection_vector()); + + // If after evaluating this predicate, the entire row block has now been + // filtered out, we don't need to evaluate any further predicates. + if (!dst->selection_vector()->AnySelected()) { + break; + } + } + + return Status::OK(); +} + +string PredicateEvaluatingIterator::ToString() const { + string s; + s.append("PredicateEvaluating(").append(base_iter_->ToString()).append(")"); + return s; +} + + +} // namespace kudu diff --git a/src/kudu/common/generic_iterators.h b/src/kudu/common/generic_iterators.h new file mode 100644 index 000000000000..da5e706d5aec --- /dev/null +++ b/src/kudu/common/generic_iterators.h @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_MERGE_ITERATOR_H +#define KUDU_COMMON_MERGE_ITERATOR_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/scan_spec.h" +#include "kudu/util/object_pool.h" + +namespace kudu { + +class Arena; +class MergeIterState; + +// An iterator which merges the results of other iterators, comparing +// based on keys. +class MergeIterator : public RowwiseIterator { + public: + // TODO: clarify whether schema is just the projection, or must include the merge + // key columns. It should probably just be the required projection, which must be + // a subset of the columns in 'iters'. + MergeIterator(const Schema &schema, + const std::vector > &iters); + + // The passed-in iterators should be already initialized. + Status Init(ScanSpec *spec) OVERRIDE; + + virtual bool HasNext() const OVERRIDE; + + virtual string ToString() const OVERRIDE; + + virtual const Schema& schema() const OVERRIDE; + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE; + + virtual Status NextBlock(RowBlock* dst) OVERRIDE; + + private: + void PrepareBatch(RowBlock* dst); + Status MaterializeBlock(RowBlock* dst); + Status InitSubIterators(ScanSpec *spec); + + const Schema schema_; + + bool initted_; + + // Holds the subiterators until Init is called. + // This is required because we can't create a MergeIterState of an uninitialized iterator. + std::deque > orig_iters_; + std::vector > iters_; + + // When the underlying iterators are initialized, each needs its own + // copy of the scan spec in order to do its own pushdown calculations, etc. + // The copies are allocated from this pool so they can be automatically freed + // when the UnionIterator goes out of scope. + ObjectPool scan_spec_copies_; +}; + + +// An iterator which unions the results of other iterators. +// This is different from MergeIterator in that it lays the results out end-to-end +// rather than merging them based on keys. 
Hence it is more efficient since there is +// no comparison needed, and the key column does not need to be read if it is not +// part of the projection. +class UnionIterator : public RowwiseIterator { + public: + // Construct a union iterator of the given iterators. + // The iterators must have matching schemas. + // The passed-in iterators should not yet be initialized. + // + // All passed-in iterators must be fully able to evaluate all predicates - i.e. + // calling iter->Init(spec) should remove all predicates from the spec. + explicit UnionIterator(const std::vector > &iters); + + Status Init(ScanSpec *spec) OVERRIDE; + + bool HasNext() const OVERRIDE; + + string ToString() const OVERRIDE; + + const Schema &schema() const OVERRIDE { + CHECK(initted_); + CHECK(schema_.get() != NULL) << "Bad schema in " << ToString(); + return *CHECK_NOTNULL(schema_.get()); + } + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE; + + virtual Status NextBlock(RowBlock* dst) OVERRIDE; + + private: + void PrepareBatch(); + Status MaterializeBlock(RowBlock* dst); + void FinishBatch(); + Status InitSubIterators(ScanSpec *spec); + + // Schema: initialized during Init() + gscoped_ptr schema_; + bool initted_; + std::deque > iters_; + + // Since we pop from 'iters_' this field is needed in order to keep + // the underlying iterators available for GetIteratorStats. + std::vector > all_iters_; + + // When the underlying iterators are initialized, each needs its own + // copy of the scan spec in order to do its own pushdown calculations, etc. + // The copies are allocated from this pool so they can be automatically freed + // when the UnionIterator goes out of scope. + ObjectPool scan_spec_copies_; +}; + +// An iterator which wraps a ColumnwiseIterator, materializing it into full rows. +// +// Predicates which only apply to a single column are pushed down into this iterator. 
+// While materializing a block, columns with associated predicates are materialized +// first, and the predicates evaluated. If the predicates succeed in filtering out +// an entire batch, then other columns may avoid doing any IO. +class MaterializingIterator : public RowwiseIterator { + public: + explicit MaterializingIterator(std::shared_ptr iter); + + // Initialize the iterator, performing predicate pushdown as described above. + Status Init(ScanSpec *spec) OVERRIDE; + + bool HasNext() const OVERRIDE; + + string ToString() const OVERRIDE; + + const Schema &schema() const OVERRIDE { + return iter_->schema(); + } + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE { + iter_->GetIteratorStats(stats); + } + + virtual Status NextBlock(RowBlock* dst) OVERRIDE; + + private: + FRIEND_TEST(TestMaterializingIterator, TestPredicatePushdown); + FRIEND_TEST(TestPredicateEvaluatingIterator, TestPredicateEvaluation); + + Status MaterializeBlock(RowBlock *dst); + + std::shared_ptr iter_; + + std::unordered_multimap preds_by_column_; + + // The order in which the columns will be materialized. + std::vector materialization_order_; + + // Set only by test code to disallow pushdown. + bool disallow_pushdown_for_tests_; +}; + + +// An iterator which wraps another iterator and evaluates any predicates that the +// wrapped iterator did not itself handle during push down. +class PredicateEvaluatingIterator : public RowwiseIterator { + public: + // Initialize the given '*base_iter' with the given 'spec'. + // + // If the base_iter accepts all predicates, then simply returns. + // Otherwise, swaps out *base_iter for a PredicateEvaluatingIterator which wraps + // the original iterator and accepts all predicates on its behalf. + // + // POSTCONDITION: spec->predicates().empty() + // POSTCONDITION: base_iter and its wrapper are initialized + static Status InitAndMaybeWrap(std::shared_ptr *base_iter, + ScanSpec *spec); + + // Initialize the iterator. 
+ // POSTCONDITION: spec->predicates().empty() + Status Init(ScanSpec *spec) OVERRIDE; + + virtual Status NextBlock(RowBlock *dst) OVERRIDE; + + bool HasNext() const OVERRIDE; + + string ToString() const OVERRIDE; + + const Schema &schema() const OVERRIDE { + return base_iter_->schema(); + } + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE { + base_iter_->GetIteratorStats(stats); + } + + private: + // Construct the evaluating iterator. + // This is only called from ::InitAndMaybeWrap() + // REQUIRES: base_iter is already Init()ed. + explicit PredicateEvaluatingIterator( + std::shared_ptr base_iter); + + FRIEND_TEST(TestPredicateEvaluatingIterator, TestPredicateEvaluation); + + std::shared_ptr base_iter_; + std::vector predicates_; +}; + +} // namespace kudu +#endif diff --git a/src/kudu/common/id_mapping-test.cc b/src/kudu/common/id_mapping-test.cc new file mode 100644 index 000000000000..5cb0c529f382 --- /dev/null +++ b/src/kudu/common/id_mapping-test.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/common/id_mapping.h" +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +namespace kudu { +// Basic unit test for IdMapping. 
+TEST(TestIdMapping, TestSimple) { + IdMapping m; + ASSERT_EQ(-1, m.get(1)); + m.set(1, 10); + m.set(2, 20); + m.set(3, 30); + ASSERT_EQ(10, m.get(1)); + ASSERT_EQ(20, m.get(2)); + ASSERT_EQ(30, m.get(3)); +} + +// Insert enough entries in the mapping so that it is forced to rehash +// itself. +TEST(TestIdMapping, TestRehash) { + IdMapping m; + + for (int i = 0; i < 1000; i++) { + m.set(i, i * 10); + } + for (int i = 0; i < 1000; i++) { + ASSERT_EQ(i * 10, m.get(i)); + } +} + +// Generate a random sequence of keys. +TEST(TestIdMapping, TestRandom) { + Random r(SeedRandom()); + IdMapping m; + std::vector picked; + for (int i = 0; i < 1000; i++) { + int32_t k = r.Next32(); + m.set(k, i); + picked.push_back(k); + } + + for (int i = 0; i < picked.size(); i++) { + ASSERT_EQ(i, m.get(picked[i])); + } +} + +// Regression test for a particular bad sequence of inserts +// that caused a crash on a previous implementation. +// +// The particular issue here is that we have many inserts +// which have the same key modulo the initial capacity. +TEST(TestIdMapping, TestBadSequence) { + IdMapping m; + m.set(0, 0); + m.set(4, 0); + m.set(128, 0); // 0 modulo 64 and 128 + m.set(129, 0); // 1 + m.set(130, 0); // 2 + m.set(131, 0); // 3 +} + +TEST(TestIdMapping, TestReinsert) { + IdMapping m; + m.set(0, 0); + ASSERT_DEATH({ + m.set(0, 1); + }, + "Cannot insert duplicate keys"); +} + +} // namespace kudu diff --git a/src/kudu/common/id_mapping.cc b/src/kudu/common/id_mapping.cc new file mode 100644 index 000000000000..3a0252023334 --- /dev/null +++ b/src/kudu/common/id_mapping.cc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/id_mapping.h" + +#include "kudu/util/malloc.h" + +namespace kudu { + +const int IdMapping::kNoEntry = -1; + +size_t IdMapping::memory_footprint_excluding_this() const { + if (entries_.capacity() > 0) { + return kudu_malloc_usable_size(entries_.data()); + } else { + return 0; + } +} + +size_t IdMapping::memory_footprint_including_this() const { + return kudu_malloc_usable_size(this) + memory_footprint_excluding_this(); +} + +} // namespace kudu diff --git a/src/kudu/common/id_mapping.h b/src/kudu/common/id_mapping.h new file mode 100644 index 000000000000..d7ef9d5cc5af --- /dev/null +++ b/src/kudu/common/id_mapping.h @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_ID_MAPPING_H +#define KUDU_COMMON_ID_MAPPING_H + +#include +#include + +#include + +#include "kudu/gutil/macros.h" + +namespace kudu { + +// Light-weight hashtable implementation for mapping a small number of +// integers to other integers. +// This is used by Schema to map from Column ID to column index. +// +// The implementation is an open-addressed hash table with linear probing. +// The probing is limited to look only in the initial position and a single +// position following. If neither position is free, the hashtable is doubled. +// Therefore, the fill rate of the hashtable could be fairly bad, in the worst +// case. However, in practice, we expect that most tables will have nearly +// sequential column IDs (with only the occasional gap if a column has been removed). +// Therefore, we expect to have very few collisions. +// +// The implementation takes care to only use power-of-2 sized bucket arrays so that +// modulo can be calculated using bit masking. This improves performance substantially +// since the 'div' instruction can take many cycles. +// +// NOTE: this map restricts that keys and values are positive. '-1' is used +// as a special identifier indicating that the slot is unused or that the key +// was not found. +class IdMapping { + private: + enum { + kInitialCapacity = 64, + kNumProbes = 2 + }; + typedef std::pair value_type; + + public: + static const int kNoEntry; + + IdMapping() : + mask_(kInitialCapacity - 1), + entries_(kInitialCapacity) { + clear(); + } + + explicit IdMapping(const IdMapping& other) + : mask_(other.mask_), + entries_(other.entries_) { + } + + ~IdMapping() {} + + void clear() { + ClearMap(&entries_); + } + + // NOLINT on this function definition because it thinks we're calling + // std::swap instead of defining it. 
+ void swap(IdMapping& other) { // NOLINT(*) + uint64_t tmp = other.mask_; + other.mask_ = mask_; + mask_ = tmp; + other.entries_.swap(entries_); + } + + IdMapping& operator=(const IdMapping& other) { + mask_ = other.mask_; + entries_ = other.entries_; + return *this; + } + + int operator[](int key) const { + return get(key); + } + + int get(int key) const { + DCHECK_GE(key, 0); + for (int i = 0; i < kNumProbes; i++) { + int s = slot(key + i); + if (entries_[s].first == key || entries_[s].first == kNoEntry) { + return entries_[s].second; + } + } + return kNoEntry; + } + + void set(int key, int val) { + DCHECK_GE(key, 0); + DCHECK_GE(val, 0); + while (true) { + for (int i = 0; i < kNumProbes; i++) { + int s = slot(key + i); + CHECK_NE(entries_[s].first, key) << "Cannot insert duplicate keys"; + if (entries_[s].first == kNoEntry) { + entries_[s].first = key; + entries_[s].second = val; + return; + } + } + // Didn't find a spot. + DoubleCapacity(); + } + } + + int capacity() const { + return mask_ + 1; + } + + // Returns the memory usage of this object without the object itself. Should + // be used when embedded inside another object. + size_t memory_footprint_excluding_this() const; + + // Returns the memory usage of this object including the object itself. + // Should be used when allocated on the heap. 
+ size_t memory_footprint_including_this() const; + + private: + int slot(int key) const { + return key & mask_; + } + + void DoubleCapacity() { + int new_capacity = capacity() * 2; + std::vector entries(new_capacity); + ClearMap(&entries); + mask_ = new_capacity - 1; + entries.swap(entries_); + + for (const auto& entry : entries) { + if (entry.first != kNoEntry) { + set(entry.first, entry.second); + } + } + } + + static void ClearMap(std::vector* v) { + for (auto& entry : *v) { + entry = std::make_pair(kNoEntry, kNoEntry); + } + } + + uint64_t mask_; + std::vector entries_; +}; + +} // namespace kudu +#endif /* KUDU_COMMON_ID_MAPPING_H */ diff --git a/src/kudu/common/iterator.h b/src/kudu/common/iterator.h new file mode 100644 index 000000000000..ac7df92647f5 --- /dev/null +++ b/src/kudu/common/iterator.h @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_ITERATOR_H +#define KUDU_COMMON_ITERATOR_H + +#include +#include + +#include "kudu/common/columnblock.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/common/iterator_stats.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Arena; +class RowBlock; +class ScanSpec; + +class IteratorBase { + public: + // Initialize the iterator with the given scan spec. + // + // The scan spec may be transformed by this call to remove predicates + // which will be fully pushed down into the iterator. + // + // The scan spec pointer must remain valid for the lifetime of the + // iterator -- the iterator does not take ownership of the object. + // + // This may be NULL if there are no predicates, etc. + // TODO: passing NULL is just convenience for unit tests, etc. + // Should probably simplify the API by not allowing NULL. + virtual Status Init(ScanSpec *spec) = 0; + + // Return true if the next call to PrepareBatch is expected to return at least + // one row. + virtual bool HasNext() const = 0; + + // Return a string representation of this iterator, suitable for debug output. + virtual string ToString() const = 0; + + // Return the schema for the rows which this iterator produces. + virtual const Schema &schema() const = 0; + + virtual ~IteratorBase() {} +}; + +class RowwiseIterator : public virtual IteratorBase { + public: + // Materialize all columns in the destination block. + // + // Any indirect data (eg strings) are copied into the destination block's + // arena, if non-null. + // + // The destination row block's selection vector is set to indicate whether + // each row in the result has passed scan predicates and is still live in + // the current MVCC snapshot. The iterator implementation should not assume + // that the selection vector has been initialized prior to this call. 
+ // + // The iterator will resize RowBlock to a sufficiently large number of rows, + // at most its row_capacity. The iterator will attempt to have the maximum + // number of rows in the batch, but may have less if it is near the end of data. + virtual Status NextBlock(RowBlock *dst) = 0; + + // Get IteratorStats for each column in the row, including + // (potentially) columns that are iterated over but not projected; + virtual void GetIteratorStats(std::vector* stats) const = 0; +}; + +class ColumnwiseIterator : public virtual IteratorBase { + public: + + // Prepare to read the next nrows from the underlying base data. + // Sets *nrows back to the number of rows available to be read, + // which may be less than the requested number in the case that the iterator + // is at the end of the available data. + virtual Status PrepareBatch(size_t *nrows) = 0; + + // Materialize the given column into the given column block. + // col_idx is within the projection schema, not the underlying schema. + // + // Any indirect data (eg strings) are copied into the destination block's + // arena, if non-null. + virtual Status MaterializeColumn(size_t col_idx, ColumnBlock *dst) = 0; + + // Finish the current batch. + virtual Status FinishBatch() = 0; + + // Initialize the given SelectionVector to indicate which rows in the currently + // prepared batch are live vs deleted. + // + // The SelectionVector passed in is uninitialized -- i.e its bits are in + // an undefined state and need to be explicitly set to 1 if the row is live. + virtual Status InitializeSelectionVector(SelectionVector *sel_vec) = 0; + + // Get IteratorStats for each column. 
+ virtual void GetIteratorStats(std::vector* stats) const = 0; +}; + +} // namespace kudu +#endif diff --git a/src/kudu/common/iterator_stats.cc b/src/kudu/common/iterator_stats.cc new file mode 100644 index 000000000000..f69188845014 --- /dev/null +++ b/src/kudu/common/iterator_stats.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/common/iterator_stats.h" + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" + +namespace kudu { + +using std::string; +using strings::Substitute; + +IteratorStats::IteratorStats() + : data_blocks_read_from_disk(0), + bytes_read_from_disk(0), + cells_read_from_disk(0) { +} + +string IteratorStats::ToString() const { + return Substitute("data_blocks_read_from_disk=$0 " + "bytes_read_from_disk=$1 " + "cells_read_from_disk=$2", + data_blocks_read_from_disk, + bytes_read_from_disk, + cells_read_from_disk); +} + +void IteratorStats::AddStats(const IteratorStats& other) { + data_blocks_read_from_disk += other.data_blocks_read_from_disk; + bytes_read_from_disk += other.bytes_read_from_disk; + cells_read_from_disk += other.cells_read_from_disk; + DCheckNonNegative(); +} + +void IteratorStats::SubtractStats(const IteratorStats& other) { + data_blocks_read_from_disk -= other.data_blocks_read_from_disk; + bytes_read_from_disk -= other.bytes_read_from_disk; + cells_read_from_disk -= other.cells_read_from_disk; + DCheckNonNegative(); +} + +void IteratorStats::DCheckNonNegative() const { + DCHECK_GE(data_blocks_read_from_disk, 0); + DCHECK_GE(bytes_read_from_disk, 0); + DCHECK_GE(cells_read_from_disk, 0); +} + + +} // namespace kudu diff --git a/src/kudu/common/iterator_stats.h b/src/kudu/common/iterator_stats.h new file mode 100644 index 000000000000..ed2bcb9b7e18 --- /dev/null +++ b/src/kudu/common/iterator_stats.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_ITERATOR_STATS_H +#define KUDU_COMMON_ITERATOR_STATS_H + +#include +#include + +namespace kudu { + +struct IteratorStats { + IteratorStats(); + + std::string ToString() const; + + // The number of data blocks read from disk (or cache) by the iterator. + int64_t data_blocks_read_from_disk; + + // The number of bytes read from disk (or cache) by the iterator. + int64_t bytes_read_from_disk; + + // The number of cells which were read from disk -- regardless of whether + // they were decoded/materialized. + int64_t cells_read_from_disk; + + // Add statistics contained 'other' to this object (for each field + // in this object, increment it by the value of the equivalent field + // in 'other'). + void AddStats(const IteratorStats& other); + + // Same, except subtract. + void SubtractStats(const IteratorStats& other); + + private: + // DCHECK that all of the stats are non-negative. This is a no-op in + // release builds. + void DCheckNonNegative() const; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/common/key_encoder.cc b/src/kudu/common/key_encoder.cc new file mode 100644 index 000000000000..3d4e4e81b2dd --- /dev/null +++ b/src/kudu/common/key_encoder.cc @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "kudu/common/common.pb.h" +#include "kudu/common/key_encoder.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/singleton.h" +#include "kudu/util/faststring.h" + +using std::shared_ptr; +using std::unordered_map; + +namespace kudu { + + +// A resolver for Encoders +template +class EncoderResolver { + public: + const KeyEncoder& GetKeyEncoder(DataType t) { + return *FindOrDie(encoders_, t); + } + + const bool HasKeyEncoderForType(DataType t) { + return ContainsKey(encoders_, t); + } + + private: + EncoderResolver() { + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + } + + template void AddMapping() { + KeyEncoderTraits traits; + InsertOrDie(&encoders_, Type, shared_ptr >(new KeyEncoder(traits))); + } + + friend class Singleton >; + unordered_map >, std::hash > encoders_; +}; + +template +const KeyEncoder& GetKeyEncoder(const TypeInfo* typeinfo) { + return Singleton >::get()->GetKeyEncoder(typeinfo->physical_type()); +} + +// Returns true if the type is allowed in keys. 
+const bool IsTypeAllowableInKey(const TypeInfo* typeinfo) { + return Singleton >::get()->HasKeyEncoderForType( + typeinfo->physical_type()); +} + +//------------------------------------------------------------ +//// Template instantiations: We instantiate all possible templates to avoid linker issues. +//// see: https://isocpp.org/wiki/faq/templates#separate-template-fn-defn-from-decl +////------------------------------------------------------------ + +template +const KeyEncoder& GetKeyEncoder(const TypeInfo* typeinfo); + +template +const KeyEncoder& GetKeyEncoder(const TypeInfo* typeinfo); + +} // namespace kudu diff --git a/src/kudu/common/key_encoder.h b/src/kudu/common/key_encoder.h new file mode 100644 index 000000000000..ac8886c9cd8b --- /dev/null +++ b/src/kudu/common/key_encoder.h @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_COMMON_KEYENCODER_H +#define KUDU_COMMON_KEYENCODER_H + +#include +#include +#include +#include + +#include "kudu/common/types.h" +#include "kudu/gutil/endian.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/strings/memutil.h" +#include "kudu/gutil/type_traits.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/status.h" + +// The SSE-based encoding is not yet working. Don't define this! +#undef KEY_ENCODER_USE_SSE + +namespace kudu { + +template +struct KeyEncoderTraits { +}; + +// This complicated-looking template magic defines a specialization of the +// KeyEncoderTraits struct for any integral type. This avoids a bunch of +// code duplication for all of our different size/signed-ness variants. +template +struct KeyEncoderTraits::cpp_type + >::value + >::type + > { + static const DataType key_type = Type; + + private: + typedef typename DataTypeTraits::cpp_type cpp_type; + typedef typename MathLimits::UnsignedType unsigned_cpp_type; + + static unsigned_cpp_type SwapEndian(unsigned_cpp_type x) { + switch (sizeof(x)) { + case 1: return x; + case 2: return BigEndian::FromHost16(x); + case 4: return BigEndian::FromHost32(x); + case 8: return BigEndian::FromHost64(x); + default: LOG(FATAL) << "bad type: " << x; + } + return 0; + } + + public: + static void Encode(cpp_type key, Buffer* dst) { + Encode(&key, dst); + } + + static void Encode(const void* key_ptr, Buffer* dst) { + unsigned_cpp_type key_unsigned; + memcpy(&key_unsigned, key_ptr, sizeof(key_unsigned)); + + // To encode signed integers, swap the MSB. 
+ if (MathLimits::kIsSigned) { + key_unsigned ^= 1UL << (sizeof(key_unsigned) * CHAR_BIT - 1); + } + key_unsigned = SwapEndian(key_unsigned); + dst->append(reinterpret_cast(&key_unsigned), sizeof(key_unsigned)); + } + + static void EncodeWithSeparators(const void* key, bool is_last, Buffer* dst) { + Encode(key, dst); + } + + static Status DecodeKeyPortion(Slice* encoded_key, + bool is_last, + Arena* arena, + uint8_t* cell_ptr) { + if (PREDICT_FALSE(encoded_key->size() < sizeof(cpp_type))) { + return Status::InvalidArgument("key too short", encoded_key->ToDebugString()); + } + + unsigned_cpp_type val; + memcpy(&val, encoded_key->data(), sizeof(cpp_type)); + val = SwapEndian(val); + if (MathLimits::kIsSigned) { + val ^= 1UL << (sizeof(val) * CHAR_BIT - 1); + } + memcpy(cell_ptr, &val, sizeof(val)); + encoded_key->remove_prefix(sizeof(cpp_type)); + return Status::OK(); + } +}; + +template +struct KeyEncoderTraits { + + static const DataType key_type = BINARY; + + static void Encode(const void* key, Buffer* dst) { + Encode(*reinterpret_cast(key), dst); + } + + // simple slice encoding that just adds to the buffer + inline static void Encode(const Slice& s, Buffer* dst) { + dst->append(reinterpret_cast(s.data()),s.size()); + } + static void EncodeWithSeparators(const void* key, bool is_last, Buffer* dst) { + EncodeWithSeparators(*reinterpret_cast(key), is_last, dst); + } + + // slice encoding that uses a separator to retain lexicographic + // comparability. + // + // This implementation is heavily optimized for the case where the input + // slice has no '\0' bytes. We assume this is common in most user-generated + // compound keys. + inline static void EncodeWithSeparators(const Slice& s, bool is_last, Buffer* dst) { + if (is_last) { + dst->append(reinterpret_cast(s.data()), s.size()); + } else { + // If we're a middle component of a composite key, we need to add a \x00 + // at the end in order to separate this component from the next one. 
However, + // if we just did that, we'd have issues where a key that actually has + // \x00 in it would compare wrong, so we have to instead add \x00\x00, and + // encode \x00 as \x00\x01. + int old_size = dst->size(); + dst->resize(old_size + s.size() * 2 + 2); + + const uint8_t* srcp = s.data(); + uint8_t* dstp = reinterpret_cast(&(*dst)[old_size]); + int len = s.size(); + int rem = len; + + while (rem >= 16) { + if (!SSEEncodeChunk<16>(&srcp, &dstp)) { + goto slow_path; + } + rem -= 16; + } + while (rem >= 8) { + if (!SSEEncodeChunk<8>(&srcp, &dstp)) { + goto slow_path; + } + rem -= 8; + } + // Roll back to operate in 8 bytes at a time. + if (len > 8 && rem > 0) { + dstp -= 8 - rem; + srcp -= 8 - rem; + if (!SSEEncodeChunk<8>(&srcp, &dstp)) { + // TODO: optimize for the case where the input slice has '\0' + // bytes. (e.g. move the pointer to the first zero byte.) + dstp += 8 - rem; + srcp += 8 - rem; + goto slow_path; + } + rem = 0; + goto done; + } + + slow_path: + EncodeChunkLoop(&srcp, &dstp, rem); + + done: + *dstp++ = 0; + *dstp++ = 0; + dst->resize(dstp - reinterpret_cast(&(*dst)[0])); + } + } + + static Status DecodeKeyPortion(Slice* encoded_key, + bool is_last, + Arena* arena, + uint8_t* cell_ptr) { + if (is_last) { + Slice* dst_slice = reinterpret_cast(cell_ptr); + if (PREDICT_FALSE(!arena->RelocateSlice(*encoded_key, dst_slice))) { + return Status::RuntimeError("OOM"); + } + encoded_key->remove_prefix(encoded_key->size()); + return Status::OK(); + } + + uint8_t* separator = static_cast(memmem(encoded_key->data(), encoded_key->size(), + "\0\0", 2)); + if (PREDICT_FALSE(separator == NULL)) { + return Status::InvalidArgument("Missing separator after composite key string component", + encoded_key->ToDebugString()); + } + + uint8_t* src = encoded_key->mutable_data(); + int max_len = separator - src; + uint8_t* dst_start = static_cast(arena->AllocateBytes(max_len)); + uint8_t* dst = dst_start; + + for (int i = 0; i < max_len; i++) { + if (i >= 1 && src[i - 
1] == '\0' && src[i] == '\1') { + continue; + } + *dst++ = src[i]; + } + + int real_len = dst - dst_start; + Slice slice(dst_start, real_len); + memcpy(cell_ptr, &slice, sizeof(Slice)); + encoded_key->remove_prefix(max_len + 2); + return Status::OK(); + } + + private: + // Encode a chunk of 'len' bytes from '*srcp' into '*dstp', incrementing + // the pointers upon return. + // + // This uses SSE2 operations to operate in 8 or 16 bytes at a time, fast-pathing + // the case where there are no '\x00' bytes in the source. + // + // Returns true if the chunk was successfully processed, false if there was one + // or more '\0' bytes requiring the slow path. + // + // REQUIRES: len == 16 or 8 + template + static bool SSEEncodeChunk(const uint8_t** srcp, uint8_t** dstp) { + COMPILE_ASSERT(LEN == 16 || LEN == 8, invalid_length); + __m128i data; + if (LEN == 16) { + // Load 16 bytes (unaligned) into the XMM register. + data = _mm_loadu_si128(reinterpret_cast(*srcp)); + } else if (LEN == 8) { + // Load 8 bytes (unaligned) into the XMM register + data = reinterpret_cast<__m128i>(_mm_load_sd(reinterpret_cast(*srcp))); + } + // Compare each byte of the input with '\0'. This results in a vector + // where each byte is either \x00 or \xFF, depending on whether the + // input had a '\x00' in the corresponding position. + __m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd()); + __m128i zero_bytes = _mm_cmpeq_epi8(data, zeros); + + // Check whether the resulting vector is all-zero. + bool all_zeros; + if (LEN == 16) { + all_zeros = _mm_testz_si128(zero_bytes, zero_bytes); + } else { // LEN == 8 + all_zeros = _mm_cvtsi128_si64(zero_bytes) == 0; + } + + // If it's all zero, we can just store the entire chunk. 
+ if (PREDICT_FALSE(!all_zeros)) { + return false; + } + + if (LEN == 16) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(*dstp), data); + } else { + _mm_storel_epi64(reinterpret_cast<__m128i*>(*dstp), data); // movq m64, xmm + } + *dstp += LEN; + *srcp += LEN; + return true; + } + + // Non-SSE loop which encodes 'len' bytes from 'srcp' into 'dst'. + static void EncodeChunkLoop(const uint8_t** srcp, uint8_t** dstp, int len) { + while (len--) { + if (PREDICT_FALSE(**srcp == '\0')) { + *(*dstp)++ = 0; + *(*dstp)++ = 1; + } else { + *(*dstp)++ = **srcp; + } + (*srcp)++; + } + } +}; + +// Forward declaration is necessary for friend declaration in KeyEncoder. +template +class EncoderResolver; + +// The runtime version of the key encoder +template +class KeyEncoder { + public: + + // Encodes the provided key to the provided buffer + void Encode(const void* key, Buffer* dst) const { + encode_func_(key, dst); + } + + // Special encoding for composite keys. + void Encode(const void* key, bool is_last, Buffer* dst) const { + encode_with_separators_func_(key, is_last, dst); + } + + void ResetAndEncode(const void* key, Buffer* dst) const { + dst->clear(); + Encode(key, dst); + } + + // Decode the next component out of the composite key pointed to by '*encoded_key' + // into *cell_ptr. + // After decoding encoded_key is advanced forward such that it contains the remainder + // of the composite key. + // 'is_last' should be true when we expect that this component is the last (or only) component + // of the composite key. + // Any indirect data (eg strings) are allocated out of 'arena'. 
+ Status Decode(Slice* encoded_key, + bool is_last, + Arena* arena, + uint8_t* cell_ptr) const { + return decode_key_portion_func_(encoded_key, is_last, arena, cell_ptr); + } + + private: + friend class EncoderResolver; + template + explicit KeyEncoder(EncoderTraitsClass t) + : encode_func_(EncoderTraitsClass::Encode), + encode_with_separators_func_(EncoderTraitsClass::EncodeWithSeparators), + decode_key_portion_func_(EncoderTraitsClass::DecodeKeyPortion) { + } + + typedef void (*EncodeFunc)(const void* key, Buffer* dst); + const EncodeFunc encode_func_; + typedef void (*EncodeWithSeparatorsFunc)(const void* key, bool is_last, Buffer* dst); + const EncodeWithSeparatorsFunc encode_with_separators_func_; + + typedef Status (*DecodeKeyPortionFunc)(Slice* enc_key, bool is_last, + Arena* arena, uint8_t* cell_ptr); + const DecodeKeyPortionFunc decode_key_portion_func_; + + private: + DISALLOW_COPY_AND_ASSIGN(KeyEncoder); +}; + +template +extern const KeyEncoder& GetKeyEncoder(const TypeInfo* typeinfo); + +extern const bool IsTypeAllowableInKey(const TypeInfo* typeinfo); + +} // namespace kudu + +#endif diff --git a/src/kudu/common/partial_row-test.cc b/src/kudu/common/partial_row-test.cc new file mode 100644 index 000000000000..966114076a7a --- /dev/null +++ b/src/kudu/common/partial_row-test.cc @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class PartialRowTest : public KuduTest { + public: + PartialRowTest() + : schema_({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING, true), + ColumnSchema("binary_val", BINARY, true) }, + 1) { + SeedRandom(); + } + protected: + Schema schema_; +}; + +TEST_F(PartialRowTest, UnitTest) { + KuduPartialRow row(&schema_); + string enc_key; + + // Initially all columns are unset. + EXPECT_FALSE(row.IsColumnSet(0)); + EXPECT_FALSE(row.IsColumnSet(1)); + EXPECT_FALSE(row.IsColumnSet(2)); + EXPECT_FALSE(row.IsKeySet()); + EXPECT_EQ("", row.ToString()); + + // Encoding the key when it is not set should give an error. + EXPECT_EQ("Invalid argument: All key columns must be set: key", + row.EncodeRowKey(&enc_key).ToString()); + + // Set just the key. + EXPECT_OK(row.SetInt32("key", 12345)); + EXPECT_TRUE(row.IsKeySet()); + EXPECT_FALSE(row.IsColumnSet(1)); + EXPECT_FALSE(row.IsColumnSet(2)); + EXPECT_EQ("int32 key=12345", row.ToString()); + int32_t x; + EXPECT_OK(row.GetInt32("key", &x)); + EXPECT_EQ(12345, x); + EXPECT_FALSE(row.IsNull("key")); + + // Test key encoding. + EXPECT_EQ("OK", row.EncodeRowKey(&enc_key).ToString()); + EXPECT_EQ("\\x80\\x0009", Slice(enc_key).ToDebugString()); + + // Fill in the other columns. 
+ EXPECT_OK(row.SetInt32("int_val", 54321)); + EXPECT_OK(row.SetStringCopy("string_val", "hello world")); + EXPECT_TRUE(row.IsColumnSet(1)); + EXPECT_TRUE(row.IsColumnSet(2)); + EXPECT_EQ("int32 key=12345, int32 int_val=54321, string string_val=hello world", + row.ToString()); + Slice slice; + EXPECT_OK(row.GetString("string_val", &slice)); + EXPECT_EQ("hello world", slice.ToString()); + EXPECT_FALSE(row.IsNull("key")); + + // Set a nullable entry to NULL + EXPECT_OK(row.SetNull("string_val")); + EXPECT_EQ("int32 key=12345, int32 int_val=54321, string string_val=NULL", + row.ToString()); + EXPECT_TRUE(row.IsNull("string_val")); + + // Try to set an entry with the wrong type + Status s = row.SetStringCopy("int_val", "foo"); + EXPECT_EQ("Invalid argument: invalid type string provided for column 'int_val' (expected int32)", + s.ToString()); + + // Try to get an entry with the wrong type + s = row.GetString("int_val", &slice); + EXPECT_EQ("Invalid argument: invalid type string provided for column 'int_val' (expected int32)", + s.ToString()); + + // Try to set a non-nullable entry to NULL + s = row.SetNull("key"); + EXPECT_EQ("Invalid argument: column not nullable: key[int32 NOT NULL]", s.ToString()); + + // Set the NULL string back to non-NULL + EXPECT_OK(row.SetStringCopy("string_val", "goodbye world")); + EXPECT_EQ("int32 key=12345, int32 int_val=54321, string string_val=goodbye world", + row.ToString()); + + // Unset some columns. + EXPECT_OK(row.Unset("string_val")); + EXPECT_EQ("int32 key=12345, int32 int_val=54321", row.ToString()); + + EXPECT_OK(row.Unset("key")); + EXPECT_EQ("int32 int_val=54321", row.ToString()); + + // Set the column by index + EXPECT_OK(row.SetInt32(1, 99999)); + EXPECT_EQ("int32 int_val=99999", row.ToString()); + + // Set the binary column as a copy. + EXPECT_OK(row.SetBinaryCopy("binary_val", "hello_world")); + EXPECT_EQ("int32 int_val=99999, binary binary_val=hello_world", + row.ToString()); + // Unset the binary column. 
+ EXPECT_OK(row.Unset("binary_val")); + EXPECT_EQ("int32 int_val=99999", row.ToString()); + + // Even though the storage is actually the same at the moment, we shouldn't be + // able to set string columns with SetBinary and vice versa. + EXPECT_FALSE(row.SetBinaryCopy("string_val", "oops").ok()); + EXPECT_FALSE(row.SetStringCopy("binary_val", "oops").ok()); +} + +TEST_F(PartialRowTest, TestCopy) { + KuduPartialRow row(&schema_); + + // The assignment operator is used in this test because it internally calls + // the copy constructor. + + // Check an empty copy. + KuduPartialRow copy = row; + EXPECT_FALSE(copy.IsColumnSet(0)); + EXPECT_FALSE(copy.IsColumnSet(1)); + EXPECT_FALSE(copy.IsColumnSet(2)); + + ASSERT_OK(row.SetInt32(0, 42)); + ASSERT_OK(row.SetInt32(1, 99)); + ASSERT_OK(row.SetStringCopy(2, "copied-string")); + + int32_t int_val; + Slice string_val; + Slice binary_val; + + // Check a copy with values. + copy = row; + ASSERT_OK(copy.GetInt32(0, &int_val)); + EXPECT_EQ(42, int_val); + ASSERT_OK(copy.GetInt32(1, &int_val)); + EXPECT_EQ(99, int_val); + ASSERT_OK(copy.GetString(2, &string_val)); + EXPECT_EQ("copied-string", string_val.ToString()); + + // Check a copy with a null value. + ASSERT_OK(row.SetNull(2)); + copy = row; + EXPECT_TRUE(copy.IsNull(2)); + + // Check a copy with a borrowed value. 
+ string borrowed_string = "borrowed-string"; + string borrowed_binary = "borrowed-binary"; + ASSERT_OK(row.SetString(2, borrowed_string)); + ASSERT_OK(row.SetBinary(3, borrowed_binary)); + + copy = row; + ASSERT_OK(copy.GetString(2, &string_val)); + EXPECT_EQ("borrowed-string", string_val.ToString()); + ASSERT_OK(copy.GetBinary(3, &binary_val)); + EXPECT_EQ("borrowed-binary", binary_val.ToString()); + + borrowed_string.replace(0, 8, "mutated-"); + borrowed_binary.replace(0, 8, "mutated-"); + ASSERT_OK(copy.GetString(2, &string_val)); + EXPECT_EQ("mutated--string", string_val.ToString()); + ASSERT_OK(copy.GetBinary(3, &string_val)); + EXPECT_EQ("mutated--binary", string_val.ToString()); +} + +} // namespace kudu diff --git a/src/kudu/common/partial_row.cc b/src/kudu/common/partial_row.cc new file mode 100644 index 000000000000..7f6bff911cff --- /dev/null +++ b/src/kudu/common/partial_row.cc @@ -0,0 +1,672 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/common/partial_row.h" + +#include +#include +#include + +#include "kudu/common/common.pb.h" +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/status.h" + +using strings::Substitute; + +namespace kudu { + +namespace { +inline Status FindColumn(const Schema& schema, const Slice& col_name, int* idx) { + StringPiece sp(reinterpret_cast(col_name.data()), col_name.size()); + *idx = schema.find_column(sp); + if (PREDICT_FALSE(*idx == -1)) { + return Status::NotFound("No such column", col_name); + } + return Status::OK(); +} +} // anonymous namespace + +KuduPartialRow::KuduPartialRow(const Schema* schema) + : schema_(schema) { + DCHECK(schema_->initialized()); + size_t column_bitmap_size = BitmapSize(schema_->num_columns()); + size_t row_size = ContiguousRowHelper::row_size(*schema); + + auto dst = new uint8_t[2 * column_bitmap_size + row_size]; + isset_bitmap_ = dst; + owned_strings_bitmap_ = isset_bitmap_ + column_bitmap_size; + + memset(isset_bitmap_, 0, 2 * column_bitmap_size); + + row_data_ = owned_strings_bitmap_ + column_bitmap_size; +#ifndef NDEBUG + OverwriteWithPattern(reinterpret_cast(row_data_), + row_size, "NEWNEWNEWNEWNEW"); +#endif + ContiguousRowHelper::InitNullsBitmap( + *schema_, row_data_, ContiguousRowHelper::null_bitmap_size(*schema_)); +} + +KuduPartialRow::~KuduPartialRow() { + DeallocateOwnedStrings(); + // Both the row data and bitmap came from the same allocation. + // The bitmap is at the start of it. 
+ delete [] isset_bitmap_; +} + +KuduPartialRow::KuduPartialRow(const KuduPartialRow& other) + : schema_(other.schema_) { + size_t column_bitmap_size = BitmapSize(schema_->num_columns()); + size_t row_size = ContiguousRowHelper::row_size(*schema_); + + size_t len = 2 * column_bitmap_size + row_size; + isset_bitmap_ = new uint8_t[len]; + owned_strings_bitmap_ = isset_bitmap_ + column_bitmap_size; + row_data_ = owned_strings_bitmap_ + column_bitmap_size; + + // Copy all bitmaps and row data. + memcpy(isset_bitmap_, other.isset_bitmap_, len); + + // Copy owned strings. + for (int col_idx = 0; col_idx < schema_->num_columns(); col_idx++) { + if (BitmapTest(owned_strings_bitmap_, col_idx)) { + ContiguousRow row(schema_, row_data_); + Slice* slice = reinterpret_cast(row.mutable_cell_ptr(col_idx)); + auto data = new uint8_t[slice->size()]; + slice->relocate(data); + } + } +} + +KuduPartialRow& KuduPartialRow::operator=(KuduPartialRow other) { + std::swap(schema_, other.schema_); + std::swap(isset_bitmap_, other.isset_bitmap_); + std::swap(owned_strings_bitmap_, other.owned_strings_bitmap_); + std::swap(row_data_, other.row_data_); + return *this; +} + +template +Status KuduPartialRow::Set(const Slice& col_name, + const typename T::cpp_type& val, + bool owned) { + int col_idx; + RETURN_NOT_OK(FindColumn(*schema_, col_name, &col_idx)); + return Set(col_idx, val, owned); +} + +template +Status KuduPartialRow::Set(int col_idx, + const typename T::cpp_type& val, + bool owned) { + const ColumnSchema& col = schema_->column(col_idx); + if (PREDICT_FALSE(col.type_info()->type() != T::type)) { + // TODO: at some point we could allow type coercion here. + return Status::InvalidArgument( + Substitute("invalid type $0 provided for column '$1' (expected $2)", + T::name(), + col.name(), col.type_info()->name())); + } + + ContiguousRow row(schema_, row_data_); + + // If we're replacing an existing STRING/BINARY value, deallocate the old value. 
+ if (T::physical_type == BINARY) DeallocateStringIfSet(col_idx, col); + + // Mark the column as set. + BitmapSet(isset_bitmap_, col_idx); + + if (col.is_nullable()) { + row.set_null(col_idx, false); + } + + ContiguousRowCell dst(&row, col_idx); + memcpy(dst.mutable_ptr(), &val, sizeof(val)); + if (owned) { + BitmapSet(owned_strings_bitmap_, col_idx); + } + return Status::OK(); +} + +Status KuduPartialRow::Set(int32_t column_idx, const uint8_t* val) { + const ColumnSchema& column_schema = schema()->column(column_idx); + + switch (column_schema.type_info()->type()) { + case BOOL: { + RETURN_NOT_OK(SetBool(column_idx, *reinterpret_cast(val))); + break; + }; + case INT8: { + RETURN_NOT_OK(SetInt8(column_idx, *reinterpret_cast(val))); + break; + }; + case INT16: { + RETURN_NOT_OK(SetInt16(column_idx, *reinterpret_cast(val))); + break; + }; + case INT32: { + RETURN_NOT_OK(SetInt32(column_idx, *reinterpret_cast(val))); + break; + }; + case INT64: { + RETURN_NOT_OK(SetInt64(column_idx, *reinterpret_cast(val))); + break; + }; + case FLOAT: { + RETURN_NOT_OK(SetFloat(column_idx, *reinterpret_cast(val))); + break; + }; + case DOUBLE: { + RETURN_NOT_OK(SetDouble(column_idx, *reinterpret_cast(val))); + break; + }; + case STRING: { + RETURN_NOT_OK(SetStringCopy(column_idx, *reinterpret_cast(val))); + break; + }; + case BINARY: { + RETURN_NOT_OK(SetBinaryCopy(column_idx, *reinterpret_cast(val))); + break; + }; + case TIMESTAMP: { + RETURN_NOT_OK(SetTimestamp(column_idx, *reinterpret_cast(val))); + break; + }; + default: { + return Status::InvalidArgument("Unknown column type in schema", + column_schema.ToString()); + }; + } + return Status::OK(); +} + +void KuduPartialRow::DeallocateStringIfSet(int col_idx, const ColumnSchema& col) { + if (BitmapTest(owned_strings_bitmap_, col_idx)) { + ContiguousRow row(schema_, row_data_); + const Slice* dst; + if (col.type_info()->type() == BINARY) { + dst = schema_->ExtractColumnFromRow(row, col_idx); + } else { + 
CHECK(col.type_info()->type() == STRING); + dst = schema_->ExtractColumnFromRow(row, col_idx); + } + delete [] dst->data(); + BitmapClear(owned_strings_bitmap_, col_idx); + } +} + +void KuduPartialRow::DeallocateOwnedStrings() { + for (int i = 0; i < schema_->num_columns(); i++) { + DeallocateStringIfSet(i, schema_->column(i)); + } +} + +//------------------------------------------------------------ +// Setters +//------------------------------------------------------------ + +Status KuduPartialRow::SetBool(const Slice& col_name, bool val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetInt8(const Slice& col_name, int8_t val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetInt16(const Slice& col_name, int16_t val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetInt32(const Slice& col_name, int32_t val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetInt64(const Slice& col_name, int64_t val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetTimestamp(const Slice& col_name, int64_t val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetFloat(const Slice& col_name, float val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetDouble(const Slice& col_name, double val) { + return Set >(col_name, val); +} +Status KuduPartialRow::SetString(const Slice& col_name, const Slice& val) { + return Set >(col_name, val, false); +} +Status KuduPartialRow::SetBinary(const Slice& col_name, const Slice& val) { + return Set >(col_name, val, false); +} +Status KuduPartialRow::SetBool(int col_idx, bool val) { + return Set >(col_idx, val); +} +Status KuduPartialRow::SetInt8(int col_idx, int8_t val) { + return Set >(col_idx, val); +} +Status KuduPartialRow::SetInt16(int col_idx, int16_t val) { + return Set >(col_idx, val); +} +Status KuduPartialRow::SetInt32(int col_idx, int32_t val) { + return Set >(col_idx, val); +} +Status KuduPartialRow::SetInt64(int col_idx, int64_t val) { + return Set 
>(col_idx, val); +} +Status KuduPartialRow::SetTimestamp(int col_idx, int64_t val) { + return Set >(col_idx, val); +} +Status KuduPartialRow::SetString(int col_idx, const Slice& val) { + return Set >(col_idx, val, false); +} +Status KuduPartialRow::SetBinary(int col_idx, const Slice& val) { + return Set >(col_idx, val, false); +} +Status KuduPartialRow::SetFloat(int col_idx, float val) { + return Set >(col_idx, val); +} +Status KuduPartialRow::SetDouble(int col_idx, double val) { + return Set >(col_idx, val); +} + +Status KuduPartialRow::SetBinaryCopy(const Slice& col_name, const Slice& val) { + return SetSliceCopy >(col_name, val); +} +Status KuduPartialRow::SetBinaryCopy(int col_idx, const Slice& val) { + return SetSliceCopy >(col_idx, val); +} +Status KuduPartialRow::SetStringCopy(const Slice& col_name, const Slice& val) { + return SetSliceCopy >(col_name, val); +} +Status KuduPartialRow::SetStringCopy(int col_idx, const Slice& val) { + return SetSliceCopy >(col_idx, val); +} + +template +Status KuduPartialRow::SetSliceCopy(const Slice& col_name, const Slice& val) { + auto relocated = new uint8_t[val.size()]; + memcpy(relocated, val.data(), val.size()); + Slice relocated_val(relocated, val.size()); + Status s = Set(col_name, relocated_val, true); + if (!s.ok()) { + delete [] relocated; + } + return s; +} + +template +Status KuduPartialRow::SetSliceCopy(int col_idx, const Slice& val) { + auto relocated = new uint8_t[val.size()]; + memcpy(relocated, val.data(), val.size()); + Slice relocated_val(relocated, val.size()); + Status s = Set(col_idx, relocated_val, true); + if (!s.ok()) { + delete [] relocated; + } + return s; +} + +Status KuduPartialRow::SetNull(const Slice& col_name) { + int col_idx; + RETURN_NOT_OK(FindColumn(*schema_, col_name, &col_idx)); + return SetNull(col_idx); +} + +Status KuduPartialRow::SetNull(int col_idx) { + const ColumnSchema& col = schema_->column(col_idx); + if (PREDICT_FALSE(!col.is_nullable())) { + return 
Status::InvalidArgument("column not nullable", col.ToString()); + } + + if (col.type_info()->physical_type() == BINARY) DeallocateStringIfSet(col_idx, col); + + ContiguousRow row(schema_, row_data_); + row.set_null(col_idx, true); + + // Mark the column as set. + BitmapSet(isset_bitmap_, col_idx); + return Status::OK(); +} + +Status KuduPartialRow::Unset(const Slice& col_name) { + int col_idx; + RETURN_NOT_OK(FindColumn(*schema_, col_name, &col_idx)); + return Unset(col_idx); +} + +Status KuduPartialRow::Unset(int col_idx) { + const ColumnSchema& col = schema_->column(col_idx); + if (col.type_info()->physical_type() == BINARY) DeallocateStringIfSet(col_idx, col); + BitmapClear(isset_bitmap_, col_idx); + return Status::OK(); +} + +//------------------------------------------------------------ +// Template instantiations: We instantiate all possible templates to avoid linker issues. +// see: https://isocpp.org/wiki/faq/templates#separate-template-fn-defn-from-decl +// TODO We can probably remove this when we move to c++11 and can use "extern template" +//------------------------------------------------------------ + +template +Status KuduPartialRow::SetSliceCopy >(int col_idx, const Slice& val); + +template +Status KuduPartialRow::SetSliceCopy >(int col_idx, const Slice& val); + +template +Status KuduPartialRow::SetSliceCopy >(const Slice& col_name, const Slice& val); + +template +Status KuduPartialRow::SetSliceCopy >(const Slice& col_name, const Slice& val); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >( + int col_idx, + const TypeTraits::cpp_type& val, + 
bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(int col_idx, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >( + const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +template +Status KuduPartialRow::Set >(const Slice& col_name, + const TypeTraits::cpp_type& val, + bool owned); + +//------------------------------------------------------------ +// Getters +//------------------------------------------------------------ +bool KuduPartialRow::IsColumnSet(int col_idx) const { + DCHECK_GE(col_idx, 0); + DCHECK_LT(col_idx, schema_->num_columns()); + return BitmapTest(isset_bitmap_, 
col_idx); +} + +bool KuduPartialRow::IsColumnSet(const Slice& col_name) const { + int col_idx; + CHECK_OK(FindColumn(*schema_, col_name, &col_idx)); + return IsColumnSet(col_idx); +} + +bool KuduPartialRow::IsNull(int col_idx) const { + const ColumnSchema& col = schema_->column(col_idx); + if (!col.is_nullable()) { + return false; + } + + if (!IsColumnSet(col_idx)) return false; + + ContiguousRow row(schema_, row_data_); + return row.is_null(col_idx); +} + +bool KuduPartialRow::IsNull(const Slice& col_name) const { + int col_idx; + CHECK_OK(FindColumn(*schema_, col_name, &col_idx)); + return IsNull(col_idx); +} + +Status KuduPartialRow::GetBool(const Slice& col_name, bool* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetInt8(const Slice& col_name, int8_t* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetInt16(const Slice& col_name, int16_t* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetInt32(const Slice& col_name, int32_t* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetInt64(const Slice& col_name, int64_t* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetTimestamp(const Slice& col_name, int64_t* micros_since_utc_epoch) const { + return Get >(col_name, micros_since_utc_epoch); +} +Status KuduPartialRow::GetFloat(const Slice& col_name, float* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetDouble(const Slice& col_name, double* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetString(const Slice& col_name, Slice* val) const { + return Get >(col_name, val); +} +Status KuduPartialRow::GetBinary(const Slice& col_name, Slice* val) const { + return Get >(col_name, val); +} + +Status KuduPartialRow::GetBool(int col_idx, bool* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetInt8(int col_idx, int8_t* val) const { + return Get >(col_idx, val); +} +Status 
KuduPartialRow::GetInt16(int col_idx, int16_t* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetInt32(int col_idx, int32_t* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetInt64(int col_idx, int64_t* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetTimestamp(int col_idx, int64_t* micros_since_utc_epoch) const { + return Get >(col_idx, micros_since_utc_epoch); +} +Status KuduPartialRow::GetFloat(int col_idx, float* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetDouble(int col_idx, double* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetString(int col_idx, Slice* val) const { + return Get >(col_idx, val); +} +Status KuduPartialRow::GetBinary(int col_idx, Slice* val) const { + return Get >(col_idx, val); +} + +template +Status KuduPartialRow::Get(const Slice& col_name, + typename T::cpp_type* val) const { + int col_idx; + RETURN_NOT_OK(FindColumn(*schema_, col_name, &col_idx)); + return Get(col_idx, val); +} + +template +Status KuduPartialRow::Get(int col_idx, typename T::cpp_type* val) const { + const ColumnSchema& col = schema_->column(col_idx); + if (PREDICT_FALSE(col.type_info()->type() != T::type)) { + // TODO: at some point we could allow type coercion here. 
+ return Status::InvalidArgument( + Substitute("invalid type $0 provided for column '$1' (expected $2)", + T::name(), + col.name(), col.type_info()->name())); + } + + if (PREDICT_FALSE(!IsColumnSet(col_idx))) { + return Status::NotFound("column not set"); + } + if (col.is_nullable() && IsNull(col_idx)) { + return Status::NotFound("column is NULL"); + } + + ContiguousRow row(schema_, row_data_); + memcpy(val, row.cell_ptr(col_idx), sizeof(*val)); + return Status::OK(); +} + + +//------------------------------------------------------------ +// Key-encoding related functions +//------------------------------------------------------------ +Status KuduPartialRow::EncodeRowKey(string* encoded_key) const { + // Currently, a row key must be fully specified. + // TODO: allow specifying a prefix of the key, and automatically + // fill the rest with minimum values. + for (int i = 0; i < schema_->num_key_columns(); i++) { + if (PREDICT_FALSE(!IsColumnSet(i))) { + return Status::InvalidArgument("All key columns must be set", + schema_->column(i).name()); + } + } + + encoded_key->clear(); + ContiguousRow row(schema_, row_data_); + + for (int i = 0; i < schema_->num_key_columns(); i++) { + bool is_last = i == schema_->num_key_columns() - 1; + const TypeInfo* ti = schema_->column(i).type_info(); + GetKeyEncoder(ti).Encode(row.cell_ptr(i), is_last, encoded_key); + } + + return Status::OK(); +} + +string KuduPartialRow::ToEncodedRowKeyOrDie() const { + string ret; + CHECK_OK(EncodeRowKey(&ret)); + return ret; +} + +//------------------------------------------------------------ +// Utility code +//------------------------------------------------------------ + +bool KuduPartialRow::AllColumnsSet() const { + return BitMapIsAllSet(isset_bitmap_, 0, schema_->num_columns()); +} + +bool KuduPartialRow::IsKeySet() const { + return BitMapIsAllSet(isset_bitmap_, 0, schema_->num_key_columns()); +} + + +std::string KuduPartialRow::ToString() const { + ContiguousRow row(schema_, row_data_); + 
std::string ret; + bool first = true; + for (int i = 0; i < schema_->num_columns(); i++) { + if (IsColumnSet(i)) { + if (!first) { + ret.append(", "); + } + schema_->column(i).DebugCellAppend(row.cell(i), &ret); + first = false; + } + } + return ret; +} + +//------------------------------------------------------------ +// Serialization/deserialization +//------------------------------------------------------------ + + +} // namespace kudu diff --git a/src/kudu/common/partial_row.h b/src/kudu/common/partial_row.h new file mode 100644 index 000000000000..724fc69af725 --- /dev/null +++ b/src/kudu/common/partial_row.h @@ -0,0 +1,250 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_PARTIAL_ROW_H +#define KUDU_COMMON_PARTIAL_ROW_H + +#include +#include +#include + +#ifdef KUDU_HEADERS_NO_STUBS +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#else +// This is a poor module interdependency, but the stubs are header-only and +// it's only for exported header builds, so we'll make an exception. 
+#include "kudu/client/stubs.h" +#endif + +#include "kudu/util/kudu_export.h" +#include "kudu/util/slice.h" + +namespace kudu { +class ColumnSchema; +namespace client { +class KuduWriteOperation; +template struct SliceKeysTestSetup; +template struct IntKeysTestSetup; +} // namespace client + +class Schema; +class PartialRowPB; + +// A row which may only contain values for a subset of the columns. +// This type contains a normal contiguous row, plus a bitfield indicating +// which columns have been set. Additionally, this type may optionally own +// copies of indirect data for variable length columns. +class KUDU_EXPORT KuduPartialRow { + public: + // The given Schema object must remain valid for the lifetime of this + // row. + explicit KuduPartialRow(const Schema* schema); + virtual ~KuduPartialRow(); + + KuduPartialRow(const KuduPartialRow& other); + + KuduPartialRow& operator=(KuduPartialRow other); + + //------------------------------------------------------------ + // Setters + //------------------------------------------------------------ + + Status SetBool(const Slice& col_name, bool val) WARN_UNUSED_RESULT; + + Status SetInt8(const Slice& col_name, int8_t val) WARN_UNUSED_RESULT; + Status SetInt16(const Slice& col_name, int16_t val) WARN_UNUSED_RESULT; + Status SetInt32(const Slice& col_name, int32_t val) WARN_UNUSED_RESULT; + Status SetInt64(const Slice& col_name, int64_t val) WARN_UNUSED_RESULT; + Status SetTimestamp(const Slice& col_name, int64_t micros_since_utc_epoch) WARN_UNUSED_RESULT; + + Status SetFloat(const Slice& col_name, float val) WARN_UNUSED_RESULT; + Status SetDouble(const Slice& col_name, double val) WARN_UNUSED_RESULT; + + // Same as above setters, but with numeric column indexes. + // These are faster since they avoid a hashmap lookup, so should + // be preferred in performance-sensitive code (eg bulk loaders). 
+ Status SetBool(int col_idx, bool val) WARN_UNUSED_RESULT; + + Status SetInt8(int col_idx, int8_t val) WARN_UNUSED_RESULT; + Status SetInt16(int col_idx, int16_t val) WARN_UNUSED_RESULT; + Status SetInt32(int col_idx, int32_t val) WARN_UNUSED_RESULT; + Status SetInt64(int col_idx, int64_t val) WARN_UNUSED_RESULT; + Status SetTimestamp(int col_idx, int64_t micros_since_utc_epoch) WARN_UNUSED_RESULT; + + Status SetFloat(int col_idx, float val) WARN_UNUSED_RESULT; + Status SetDouble(int col_idx, double val) WARN_UNUSED_RESULT; + + // Sets the string/binary value but does not copy the value. The slice + // must remain valid until the call to AppendToPB(). + Status SetString(const Slice& col_name, const Slice& val) WARN_UNUSED_RESULT; + Status SetString(int col_idx, const Slice& val) WARN_UNUSED_RESULT; + Status SetBinary(const Slice& col_name, const Slice& val) WARN_UNUSED_RESULT; + Status SetBinary(int col_idx, const Slice& val) WARN_UNUSED_RESULT; + + // Copies 'val' immediately. + Status SetStringCopy(const Slice& col_name, const Slice& val) WARN_UNUSED_RESULT; + Status SetStringCopy(int col_idx, const Slice& val) WARN_UNUSED_RESULT; + Status SetBinaryCopy(const Slice& col_name, const Slice& val) WARN_UNUSED_RESULT; + Status SetBinaryCopy(int col_idx, const Slice& val) WARN_UNUSED_RESULT; + + // Set the given column to NULL. This will only succeed on nullable + // columns. Use Unset(...) to restore a column to its default. + Status SetNull(const Slice& col_name) WARN_UNUSED_RESULT; + Status SetNull(int col_idx) WARN_UNUSED_RESULT; + + // Unsets the given column. Note that this is different from setting + // it to NULL. 
+ Status Unset(const Slice& col_name) WARN_UNUSED_RESULT; + Status Unset(int col_idx) WARN_UNUSED_RESULT; + + //------------------------------------------------------------ + // Getters + //------------------------------------------------------------ + // These getters return a bad Status if the type does not match, + // the value is unset, or the value is NULL. Otherwise they return + // the current set value in *val. + + // Return true if the given column has been specified. + bool IsColumnSet(const Slice& col_name) const; + bool IsColumnSet(int col_idx) const; + + bool IsNull(const Slice& col_name) const; + bool IsNull(int col_idx) const; + + Status GetBool(const Slice& col_name, bool* val) const WARN_UNUSED_RESULT; + + Status GetInt8(const Slice& col_name, int8_t* val) const WARN_UNUSED_RESULT; + Status GetInt16(const Slice& col_name, int16_t* val) const WARN_UNUSED_RESULT; + Status GetInt32(const Slice& col_name, int32_t* val) const WARN_UNUSED_RESULT; + Status GetInt64(const Slice& col_name, int64_t* val) const WARN_UNUSED_RESULT; + Status GetTimestamp(const Slice& col_name, + int64_t* micros_since_utc_epoch) const WARN_UNUSED_RESULT; + + Status GetFloat(const Slice& col_name, float* val) const WARN_UNUSED_RESULT; + Status GetDouble(const Slice& col_name, double* val) const WARN_UNUSED_RESULT; + + // Same as above getters, but with numeric column indexes. + // These are faster since they avoid a hashmap lookup, so should + // be preferred in performance-sensitive code. 
+ Status GetBool(int col_idx, bool* val) const WARN_UNUSED_RESULT; + + Status GetInt8(int col_idx, int8_t* val) const WARN_UNUSED_RESULT; + Status GetInt16(int col_idx, int16_t* val) const WARN_UNUSED_RESULT; + Status GetInt32(int col_idx, int32_t* val) const WARN_UNUSED_RESULT; + Status GetInt64(int col_idx, int64_t* val) const WARN_UNUSED_RESULT; + Status GetTimestamp(int col_idx, int64_t* micros_since_utc_epoch) const WARN_UNUSED_RESULT; + + Status GetFloat(int col_idx, float* val) const WARN_UNUSED_RESULT; + Status GetDouble(int col_idx, double* val) const WARN_UNUSED_RESULT; + + // Gets the string/binary value but does not copy the value. Callers should + // copy the resulting Slice if necessary. + Status GetString(const Slice& col_name, Slice* val) const WARN_UNUSED_RESULT; + Status GetString(int col_idx, Slice* val) const WARN_UNUSED_RESULT; + Status GetBinary(const Slice& col_name, Slice* val) const WARN_UNUSED_RESULT; + Status GetBinary(int col_idx, Slice* val) const WARN_UNUSED_RESULT; + + //------------------------------------------------------------ + // Key-encoding related functions + //------------------------------------------------------------ + + // Encode a row key suitable for use as a tablet split key, an encoded + // key range, etc. + // + // Requires that all of the key columns must be set; otherwise, returns + // InvalidArgument. + Status EncodeRowKey(std::string* encoded_key) const; + + // Convenience method which is equivalent to the above, but triggers a + // FATAL error on failure. + std::string ToEncodedRowKeyOrDie() const; + + //------------------------------------------------------------ + // Utility code + //------------------------------------------------------------ + + // Return true if all of the key columns have been specified + // for this mutation. + bool IsKeySet() const; + + // Return true if all columns have been specified. 
+ bool AllColumnsSet() const; + + std::string ToString() const; + + const Schema* schema() const { return schema_; } + + private: + friend class RowKeyUtilTest; + friend class RowOperationsPBDecoder; + friend class RowOperationsPBEncoder; + friend class client::KuduWriteOperation; // for row_data_. + friend class PartitionSchema; + template friend struct client::SliceKeysTestSetup; + template friend struct client::IntKeysTestSetup; + + template + Status Set(const Slice& col_name, const typename T::cpp_type& val, + bool owned = false); + + template + Status Set(int col_idx, const typename T::cpp_type& val, + bool owned = false); + + // Runtime version of the generic setter. + Status Set(int32_t column_idx, const uint8_t* val); + + template + Status Get(const Slice& col_name, typename T::cpp_type* val) const; + + template + Status Get(int col_idx, typename T::cpp_type* val) const; + + template + Status SetSliceCopy(const Slice& col_name, const Slice& val); + + template + Status SetSliceCopy(int col_idx, const Slice& val); + + // If the given column is a variable length column whose memory is owned by this instance, + // deallocates the value. + // NOTE: Does not mutate the isset bitmap. + // REQUIRES: col_idx must be a variable length column. + void DeallocateStringIfSet(int col_idx, const ColumnSchema& col); + + // Deallocate any string/binary values whose memory is managed by this object. + void DeallocateOwnedStrings(); + + const Schema* schema_; + + // 1-bit set for any field which has been explicitly set. This is distinct + // from NULL -- an "unset" field will take the server-side default on insert, + // whereas a field explicitly set to NULL will override the default. + uint8_t* isset_bitmap_; + + // 1-bit set for any variable length columns whose memory is managed by this instance. + // These strings need to be deallocated whenever the value is reset, + // or when the instance is destructed. 
+ uint8_t* owned_strings_bitmap_; + + // The normal "contiguous row" format row data. Any column whose data is unset + // or NULL can have undefined bytes. + uint8_t* row_data_; +}; + +} // namespace kudu +#endif /* KUDU_COMMON_PARTIAL_ROW_H */ diff --git a/src/kudu/common/partition-test.cc b/src/kudu/common/partition-test.cc new file mode 100644 index 000000000000..4af83c489424 --- /dev/null +++ b/src/kudu/common/partition-test.cc @@ -0,0 +1,451 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/common/common.pb.h" +#include "kudu/common/partial_row.h" +#include "kudu/common/partition.h" +#include "kudu/common/row.h" +#include "kudu/common/scan_predicate.h" +#include "kudu/common/schema.h" +#include "kudu/util/hash_util.h" +#include "kudu/util/test_util.h" + +using std::vector; +using std::string; + +namespace kudu { + +namespace { +void AddHashBucketComponent(PartitionSchemaPB* partition_schema_pb, + const vector& columns, + uint32_t num_buckets, int32_t seed) { + PartitionSchemaPB::HashBucketSchemaPB* hash_bucket_schema = + partition_schema_pb->add_hash_bucket_schemas(); + for (const string& column : columns) { + hash_bucket_schema->add_columns()->set_name(column); + } + hash_bucket_schema->set_num_buckets(num_buckets); + hash_bucket_schema->set_seed(seed); +} + +void SetRangePartitionComponent(PartitionSchemaPB* partition_schema_pb, + const vector& columns) { + PartitionSchemaPB::RangeSchemaPB* range_schema = partition_schema_pb->mutable_range_schema(); + range_schema->Clear(); + for (const string& column : columns) { + range_schema->add_columns()->set_name(column); + } +} +} // namespace + +TEST(PartitionTest, TestPartitionKeyEncoding) { + // CREATE TABLE t (a INT32, b VARCHAR, c VARCHAR, PRIMARY KEY (a, b, c)) + // PARITITION BY [HASH BUCKET (a, b), HASH BUCKET (c), RANGE (a, b, c)]; + Schema schema({ ColumnSchema("a", INT32), + ColumnSchema("b", STRING), + ColumnSchema("c", STRING) }, + { ColumnId(0), ColumnId(1), ColumnId(2) }, 3); + + PartitionSchemaPB schema_builder; + AddHashBucketComponent(&schema_builder, { "a", "b" }, 32, 0); + AddHashBucketComponent(&schema_builder, { "c" }, 32, 42); + PartitionSchema partition_schema; + ASSERT_OK(PartitionSchema::FromPB(schema_builder, schema, &partition_schema)); + + ASSERT_EQ("hash bucket components: [(bucket count: 32, columns: [a, b]), " + "(bucket count: 32, seed: 42, columns: [c])], range columns: [a, b, c]", + partition_schema.DebugString(schema)); + + 
{ + string key; + KuduPartialRow row(&schema); + ASSERT_OK(row.SetInt32("a", 0)); + ASSERT_OK(partition_schema.EncodeKey(row, &key)); + + EXPECT_EQ(string("\0\0\0\0" // hash(0, "") + "\0\0\0\x14" // hash("") + "\x80\0\0\0" // a = 0 + "\0\0", // b = ""; c is elided + 14), key); + string debug = "bucket=0, bucket=20, int32 a=0, string b=, string c="; + EXPECT_EQ(debug, partition_schema.RowDebugString(row)); + EXPECT_EQ(debug, partition_schema.PartitionKeyDebugString(key, schema)); + } + + { + string key; + KuduPartialRow row(&schema); + ASSERT_OK(row.SetInt32("a", 1)); + ASSERT_OK(partition_schema.EncodeKey(row, &key)); + + EXPECT_EQ(string("\0\0\0\x5" // hash(1, "") + "\0\0\0\x14" // hash("") + "\x80\0\0\x01" // a = 1 + "\0\0", // b = ""; c is elided + 14), key); + + string debug_b = "bucket=5, bucket=20, int32 a=1, string b=, string c="; + EXPECT_EQ(debug_b, partition_schema.RowDebugString(row)); + EXPECT_EQ(debug_b, partition_schema.PartitionKeyDebugString(key, schema)); + } + + { + string key; + KuduPartialRow row(&schema); + ASSERT_OK(row.SetInt32("a", 0)); + ASSERT_OK(row.SetStringCopy("b", "b")); + ASSERT_OK(row.SetStringCopy("c", "c")); + ASSERT_OK(partition_schema.EncodeKey(row, &key)); + + EXPECT_EQ(string("\0\0\0\x1A" // hash(0, "b") + "\0\0\0\x1D" // hash("c") + "\x80\0\0\0" // a = 0 + "b\0\0" // b = "b" + "c", // c = "c" + 16), key); + + string debug = "bucket=26, bucket=29, int32 a=0, string b=b, string c=c"; + EXPECT_EQ(debug, partition_schema.RowDebugString(row)); + EXPECT_EQ(debug, partition_schema.PartitionKeyDebugString(key, schema)); + } + + { + string key; + KuduPartialRow row(&schema); + ASSERT_OK(row.SetInt32("a", 1)); + ASSERT_OK(row.SetStringCopy("b", "b")); + ASSERT_OK(row.SetStringCopy("c", "c")); + ASSERT_OK(partition_schema.EncodeKey(row, &key)); + + EXPECT_EQ(string("\0\0\0\x0" // hash(1, "b") + "\0\0\0\x1D" // hash("c") + "\x80\0\0\x1" // a = 1 + "b\0\0" // b = "b" + "c", // c = "c" + 16), key); + + string debug = "bucket=0, bucket=29, 
int32 a=1, string b=b, string c=c"; + EXPECT_EQ(debug, partition_schema.RowDebugString(row)); + EXPECT_EQ(debug, partition_schema.PartitionKeyDebugString(key, schema)); + } +} + +TEST(PartitionTest, TestCreateRangePartitions) { + // CREATE TABLE t (a VARCHAR PRIMARY KEY), + // PARTITION BY [RANGE (a)]; + Schema schema({ ColumnSchema("a", STRING) }, { ColumnId(0) }, 1); + + PartitionSchema partition_schema; + ASSERT_OK(PartitionSchema::FromPB(PartitionSchemaPB(), schema, &partition_schema)); + + ASSERT_EQ("range columns: [a]", partition_schema.DebugString(schema)); + + // Split Rows: + // + // { a: "1" } + // { a: "2" } + // + // Encoded Partition Keys: + // + // [ ( ""), ("1") ) + // [ ("1"), ("2") ) + // [ ("2"), ( "") ) + + KuduPartialRow split1(&schema); + ASSERT_OK(split1.SetStringCopy("a", "1")); + string pk1; + ASSERT_OK(partition_schema.EncodeKey(split1, &pk1)); + + KuduPartialRow split2(&schema); + ASSERT_OK(split2.SetStringCopy("a", "2")); + string pk2; + ASSERT_OK(partition_schema.EncodeKey(split2, &pk2)); + + // Split keys need not be passed in sorted order. 
+ vector split_rows = { split2, split1 }; + vector partitions; + ASSERT_OK(partition_schema.CreatePartitions(split_rows, schema, &partitions)); + ASSERT_EQ(3, partitions.size()); + + EXPECT_TRUE(partitions[0].hash_buckets().empty()); + EXPECT_EQ("", partitions[0].range_key_start()); + EXPECT_EQ("1", partitions[0].range_key_end()); + EXPECT_EQ("", partitions[0].partition_key_start()); + EXPECT_EQ("1", partitions[0].partition_key_end()); + EXPECT_EQ("range: [(), (string a=1))", + partition_schema.PartitionDebugString(partitions[0], schema)); + + EXPECT_TRUE(partitions[1].hash_buckets().empty()); + EXPECT_EQ("1", partitions[1].range_key_start()); + EXPECT_EQ("2", partitions[1].range_key_end()); + EXPECT_EQ("1", partitions[1].partition_key_start()); + EXPECT_EQ("2", partitions[1].partition_key_end()); + EXPECT_EQ("range: [(string a=1), (string a=2))", + partition_schema.PartitionDebugString(partitions[1], schema)); + + EXPECT_TRUE(partitions[2].hash_buckets().empty()); + EXPECT_EQ("2", partitions[2].range_key_start()); + EXPECT_EQ("", partitions[2].range_key_end()); + EXPECT_EQ("2", partitions[2].partition_key_start()); + EXPECT_EQ("", partitions[2].partition_key_end()); + EXPECT_EQ("range: [(string a=2), ())", + partition_schema.PartitionDebugString(partitions[2], schema)); +} + +TEST(PartitionTest, TestCreateHashBucketPartitions) { + // CREATE TABLE t (a VARCHAR PRIMARY KEY), + // PARTITION BY [HASH BUCKET (a)]; + Schema schema({ ColumnSchema("a", STRING) }, { ColumnId(0) }, 1); + + PartitionSchemaPB schema_builder; + SetRangePartitionComponent(&schema_builder, vector()); + AddHashBucketComponent(&schema_builder, { "a" }, 3, 42); + PartitionSchema partition_schema; + ASSERT_OK(PartitionSchema::FromPB(schema_builder, schema, &partition_schema)); + + ASSERT_EQ("hash bucket components: [(bucket count: 3, seed: 42, columns: [a])]", + partition_schema.DebugString(schema)); + + // Encoded Partition Keys: + // + // [ (_), (1) ) + // [ (1), (2) ) + // [ (3), (_) ) + + 
vector partitions; + ASSERT_OK(partition_schema.CreatePartitions(vector(), schema, &partitions)); + ASSERT_EQ(3, partitions.size()); + + EXPECT_EQ(0, partitions[0].hash_buckets()[0]); + EXPECT_EQ("", partitions[0].range_key_start()); + EXPECT_EQ("", partitions[0].range_key_end()); + EXPECT_EQ(string("", 0), partitions[0].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1", 4), partitions[0].partition_key_end()); + EXPECT_EQ("hash buckets: (0)", + partition_schema.PartitionDebugString(partitions[0], schema)); + + EXPECT_EQ(1, partitions[1].hash_buckets()[0]); + EXPECT_EQ("", partitions[1].range_key_start()); + EXPECT_EQ("", partitions[1].range_key_end()); + EXPECT_EQ(string("\0\0\0\1", 4), partitions[1].partition_key_start()); + EXPECT_EQ(string("\0\0\0\2", 4), partitions[1].partition_key_end()); + EXPECT_EQ("hash buckets: (1)", + partition_schema.PartitionDebugString(partitions[1], schema)); + + EXPECT_EQ(2, partitions[2].hash_buckets()[0]); + EXPECT_EQ("", partitions[2].range_key_start()); + EXPECT_EQ("", partitions[2].range_key_end()); + EXPECT_EQ(string("\0\0\0\2", 4), partitions[2].partition_key_start()); + EXPECT_EQ(string("", 0), partitions[2].partition_key_end()); + EXPECT_EQ("hash buckets: (2)", + partition_schema.PartitionDebugString(partitions[2], schema)); +} + +TEST(PartitionTest, TestCreatePartitions) { + // CREATE TABLE t (a VARCHAR, b VARCHAR, c VARCHAR, PRIMARY KEY (a, b, c)) + // PARTITION BY [HASH BUCKET (a), HASH BUCKET (b), RANGE (a, b, c)]; + Schema schema({ ColumnSchema("a", STRING), + ColumnSchema("b", STRING), + ColumnSchema("c", STRING) }, + { ColumnId(0), ColumnId(1), ColumnId(2) }, 3); + + PartitionSchemaPB schema_builder; + AddHashBucketComponent(&schema_builder, { "a" }, 2, 0); + AddHashBucketComponent(&schema_builder, { "b" }, 2, 0); + PartitionSchema partition_schema; + ASSERT_OK(PartitionSchema::FromPB(schema_builder, schema, &partition_schema)); + + ASSERT_EQ("hash bucket components: [(bucket count: 2, columns: [a]), " + "(bucket 
count: 2, columns: [b])], range columns: [a, b, c]", + partition_schema.DebugString(schema)); + + // Split Rows: + // + // { a: "a1", b: "b1", c: "c1" } + // { b: "a2", b: "b2" } + // + // non-specified column values default to the logical minimum value (""). + // + // Encoded Partition Keys: + // + // [ (_, _, _), (0, 0, "a1b1c1") ) + // [ (0, 0, "a1b1c1"), (0, 0, "a2b2") ) + // [ (0, 0, "a2b2"), (0, 1, _) ) + // + // [ (0, 1, _), (0, 1, "a1b1c1") ) + // [ (0, 1, "a1b1c1"), (0, 1, "a2b2") ) + // [ (0, 1, "a2b2"), (1, _, _) ) + // + // [ (1, _, _), (1, 0, "a1b1c1") ) + // [ (1, 0, "a1b1c1"), (1, 0, "a2b2") ) + // [ (1, 0, "a2b2"), (1, 1, _) ) + // + // [ (1, 1, _), (1, 1, "a1b1c1") ) + // [ (1, 1, "a1b1c1"), (1, 1, "a2b2") ) + // [ (1, 1, "a2b2"), (_, _, _) ) + // + // _ signifies that the value is omitted from the encoded partition key. + + KuduPartialRow split_a(&schema); + ASSERT_OK(split_a.SetStringCopy("a", "a1")); + ASSERT_OK(split_a.SetStringCopy("b", "b1")); + ASSERT_OK(split_a.SetStringCopy("c", "c1")); + string partition_key_a; + ASSERT_OK(partition_schema.EncodeKey(split_a, &partition_key_a)); + + KuduPartialRow split_b(&schema); + ASSERT_OK(split_b.SetStringCopy("a", "a2")); + ASSERT_OK(split_b.SetStringCopy("b", "b2")); + string partition_key_b; + ASSERT_OK(partition_schema.EncodeKey(split_b, &partition_key_b)); + + // Split keys need not be passed in sorted order. 
+ vector split_rows = { split_b, split_a }; + vector partitions; + ASSERT_OK(partition_schema.CreatePartitions(split_rows, schema, &partitions)); + ASSERT_EQ(12, partitions.size()); + + EXPECT_EQ(0, partitions[0].hash_buckets()[0]); + EXPECT_EQ(0, partitions[0].hash_buckets()[1]); + EXPECT_EQ(string("", 0), partitions[0].range_key_start()); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[0].range_key_end()); + EXPECT_EQ(string("", 0), partitions[0].partition_key_start()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), partitions[0].partition_key_end()); + EXPECT_EQ("hash buckets: (0, 0), " + "range: [(), (string a=a1, string b=b1, string c=c1))", + partition_schema.PartitionDebugString(partitions[0], schema)); + + EXPECT_EQ(0, partitions[1].hash_buckets()[0]); + EXPECT_EQ(0, partitions[1].hash_buckets()[1]); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[1].range_key_start()); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[1].range_key_end()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), + partitions[1].partition_key_start()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0" "a2\0\0b2\0\0", 16), partitions[1].partition_key_end()); + EXPECT_EQ("hash buckets: (0, 0), " + "range: [(string a=a1, string b=b1, string c=c1), (string a=a2, string b=b2, ))", + partition_schema.PartitionDebugString(partitions[1], schema)); + + EXPECT_EQ(0, partitions[2].hash_buckets()[0]); + EXPECT_EQ(0, partitions[2].hash_buckets()[1]); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[2].range_key_start()); + EXPECT_EQ(string("", 0), partitions[2].range_key_end()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\0" "a2\0\0b2\0\0", 16), partitions[2].partition_key_start()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1", 8), partitions[2].partition_key_end()); + EXPECT_EQ("hash buckets: (0, 0), " + "range: [(string a=a2, string b=b2, ), ())", + partition_schema.PartitionDebugString(partitions[2], schema)); + + EXPECT_EQ(0, partitions[3].hash_buckets()[0]); + 
EXPECT_EQ(1, partitions[3].hash_buckets()[1]); + EXPECT_EQ(string("", 0), partitions[3].range_key_start()); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[3].range_key_end()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1", 8), partitions[3].partition_key_start()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1" "a1\0\0b1\0\0c1", 18), partitions[3].partition_key_end()); + EXPECT_EQ("hash buckets: (0, 1), " + "range: [(), (string a=a1, string b=b1, string c=c1))", + partition_schema.PartitionDebugString(partitions[3], schema)); + + EXPECT_EQ(0, partitions[4].hash_buckets()[0]); + EXPECT_EQ(1, partitions[4].hash_buckets()[1]); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[4].range_key_start()); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[4].range_key_end()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1" "a1\0\0b1\0\0c1", 18), + partitions[4].partition_key_start()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1" "a2\0\0b2\0\0", 16), partitions[4].partition_key_end()); + EXPECT_EQ("hash buckets: (0, 1), " + "range: [(string a=a1, string b=b1, string c=c1), (string a=a2, string b=b2, ))", + partition_schema.PartitionDebugString(partitions[4], schema)); + + EXPECT_EQ(0, partitions[5].hash_buckets()[0]); + EXPECT_EQ(1, partitions[5].hash_buckets()[1]); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[5].range_key_start()); + EXPECT_EQ(string("", 0), partitions[5].range_key_end()); + EXPECT_EQ(string("\0\0\0\0" "\0\0\0\1" "a2\0\0b2\0\0", 16), partitions[5].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1", 4), partitions[5].partition_key_end()); + EXPECT_EQ("hash buckets: (0, 1), " + "range: [(string a=a2, string b=b2, ), ())", + partition_schema.PartitionDebugString(partitions[5], schema)); + + EXPECT_EQ(1, partitions[6].hash_buckets()[0]); + EXPECT_EQ(0, partitions[6].hash_buckets()[1]); + EXPECT_EQ(string("", 0), partitions[6].range_key_start()); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[6].range_key_end()); + EXPECT_EQ(string("\0\0\0\1", 4), 
partitions[6].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), partitions[6].partition_key_end()); + EXPECT_EQ("hash buckets: (1, 0), " + "range: [(), (string a=a1, string b=b1, string c=c1))", + partition_schema.PartitionDebugString(partitions[6], schema)); + + EXPECT_EQ(1, partitions[7].hash_buckets()[0]); + EXPECT_EQ(0, partitions[7].hash_buckets()[1]); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[7].range_key_start()); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[7].range_key_end()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0" "a1\0\0b1\0\0c1", 18), + partitions[7].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0" "a2\0\0b2\0\0", 16), partitions[7].partition_key_end()); + EXPECT_EQ("hash buckets: (1, 0), " + "range: [(string a=a1, string b=b1, string c=c1), (string a=a2, string b=b2, ))", + partition_schema.PartitionDebugString(partitions[7], schema)); + + EXPECT_EQ(1, partitions[8].hash_buckets()[0]); + EXPECT_EQ(0, partitions[8].hash_buckets()[1]); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[8].range_key_start()); + EXPECT_EQ(string("", 0), partitions[8].range_key_end()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\0" "a2\0\0b2\0\0", 16), partitions[8].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1", 8), partitions[8].partition_key_end()); + EXPECT_EQ("hash buckets: (1, 0), " + "range: [(string a=a2, string b=b2, ), ())", + partition_schema.PartitionDebugString(partitions[8], schema)); + + EXPECT_EQ(1, partitions[9].hash_buckets()[0]); + EXPECT_EQ(1, partitions[9].hash_buckets()[1]); + EXPECT_EQ(string("", 0), partitions[9].range_key_start()); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[9].range_key_end()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1", 8), partitions[9].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1" "a1\0\0b1\0\0c1", 18), partitions[9].partition_key_end()); + EXPECT_EQ("hash buckets: (1, 1), " + "range: [(), (string a=a1, string b=b1, 
string c=c1))", + partition_schema.PartitionDebugString(partitions[9], schema)); + + EXPECT_EQ(1, partitions[10].hash_buckets()[0]); + EXPECT_EQ(1, partitions[10].hash_buckets()[1]); + EXPECT_EQ(string("a1\0\0b1\0\0c1", 10), partitions[10].range_key_start()); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[10].range_key_end()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1" "a1\0\0b1\0\0c1", 18), + partitions[10].partition_key_start()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1" "a2\0\0b2\0\0", 16), partitions[10].partition_key_end()); + EXPECT_EQ("hash buckets: (1, 1), " + "range: [(string a=a1, string b=b1, string c=c1), (string a=a2, string b=b2, ))", + partition_schema.PartitionDebugString(partitions[10], schema)); + + EXPECT_EQ(1, partitions[11].hash_buckets()[0]); + EXPECT_EQ(1, partitions[11].hash_buckets()[1]); + EXPECT_EQ(string("a2\0\0b2\0\0", 8), partitions[11].range_key_start()); + EXPECT_EQ(string("", 0), partitions[11].range_key_end()); + EXPECT_EQ(string("\0\0\0\1" "\0\0\0\1" "a2\0\0b2\0\0", 16), partitions[11].partition_key_start()); + EXPECT_EQ(string("", 0), partitions[11].partition_key_end()); + EXPECT_EQ("hash buckets: (1, 1), " + "range: [(string a=a2, string b=b2, ), ())", + partition_schema.PartitionDebugString(partitions[11], schema)); +} + +} // namespace kudu diff --git a/src/kudu/common/partition.cc b/src/kudu/common/partition.cc new file mode 100644 index 000000000000..2e0efc9dd151 --- /dev/null +++ b/src/kudu/common/partition.cc @@ -0,0 +1,784 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/partition.h" + +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_key-util.h" +#include "kudu/common/scan_predicate.h" +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/hash_util.h" + +namespace kudu { + +using std::set; +using std::string; +using std::vector; + +using google::protobuf::RepeatedPtrField; +using strings::Substitute; + +// The encoded size of a hash bucket in a partition key. 
+static const size_t kEncodedBucketSize = sizeof(uint32_t); + +Slice Partition::range_key_start() const { + return range_key(partition_key_start()); +} + +Slice Partition::range_key_end() const { + return range_key(partition_key_end()); +} + +Slice Partition::range_key(const string& partition_key) const { + size_t hash_size = kEncodedBucketSize * hash_buckets().size(); + if (partition_key.size() > hash_size) { + Slice s = Slice(partition_key); + s.remove_prefix(hash_size); + return s; + } else { + return Slice(); + } +} + +void Partition::ToPB(PartitionPB* pb) const { + pb->Clear(); + pb->mutable_hash_buckets()->Reserve(hash_buckets_.size()); + for (int32_t bucket : hash_buckets()) { + pb->add_hash_buckets(bucket); + } + pb->set_partition_key_start(partition_key_start()); + pb->set_partition_key_end(partition_key_end()); +} + +void Partition::FromPB(const PartitionPB& pb, Partition* partition) { + partition->hash_buckets_.clear(); + partition->hash_buckets_.reserve(pb.hash_buckets_size()); + for (int32_t hash_bucket : pb.hash_buckets()) { + partition->hash_buckets_.push_back(hash_bucket); + } + + partition->partition_key_start_ = pb.partition_key_start(); + partition->partition_key_end_ = pb.partition_key_end(); +} + +namespace { +// Extracts the column IDs from a protobuf repeated field of column identifiers. 
+Status ExtractColumnIds(const RepeatedPtrField& identifiers, + const Schema& schema, + vector* column_ids) { + column_ids->reserve(identifiers.size()); + for (PartitionSchemaPB_ColumnIdentifierPB identifier : identifiers) { + switch (identifier.identifier_case()) { + case PartitionSchemaPB_ColumnIdentifierPB::kId: { + ColumnId column_id(identifier.id()); + if (schema.find_column_by_id(column_id) == Schema::kColumnNotFound) { + return Status::InvalidArgument("unknown column id", identifier.DebugString()); + } + column_ids->push_back(column_id); + continue; + } + case PartitionSchemaPB_ColumnIdentifierPB::kName: { + int32_t column_idx = schema.find_column(identifier.name()); + if (column_idx == Schema::kColumnNotFound) { + return Status::InvalidArgument("unknown column", identifier.DebugString()); + } + column_ids->push_back(schema.column_id(column_idx)); + continue; + } + default: return Status::InvalidArgument("unknown column", identifier.DebugString()); + } + } + return Status::OK(); +} +// Sets a repeated field of column identifiers to the provided column IDs. +void SetColumnIdentifiers(const vector& column_ids, + RepeatedPtrField* identifiers) { + identifiers->Reserve(column_ids.size()); + for (ColumnId column_id : column_ids) { + identifiers->Add()->set_id(column_id); + } +} +} // namespace + +Status PartitionSchema::FromPB(const PartitionSchemaPB& pb, + const Schema& schema, + PartitionSchema* partition_schema) { + partition_schema->Clear(); + + for (const PartitionSchemaPB_HashBucketSchemaPB& hash_bucket_pb : pb.hash_bucket_schemas()) { + HashBucketSchema hash_bucket; + RETURN_NOT_OK(ExtractColumnIds(hash_bucket_pb.columns(), schema, &hash_bucket.column_ids)); + + // Hashing is column-order dependent, so sort the column_ids to ensure that + // hash components with the same columns hash consistently. This is + // important when deserializing a user-supplied partition schema during + // table creation; after that the columns should remain in sorted order. 
+ std::sort(hash_bucket.column_ids.begin(), hash_bucket.column_ids.end()); + + hash_bucket.seed = hash_bucket_pb.seed(); + hash_bucket.num_buckets = hash_bucket_pb.num_buckets(); + partition_schema->hash_bucket_schemas_.push_back(hash_bucket); + } + + if (pb.has_range_schema()) { + const PartitionSchemaPB_RangeSchemaPB& range_pb = pb.range_schema(); + RETURN_NOT_OK(ExtractColumnIds(range_pb.columns(), schema, + &partition_schema->range_schema_.column_ids)); + } else { + // Fill in the default range partition (PK columns). + // like the sorting above, this should only happen during table creation + // while deserializing the user-provided partition schema. + for (int32_t column_idx = 0; column_idx < schema.num_key_columns(); column_idx++) { + partition_schema->range_schema_.column_ids.push_back(schema.column_id(column_idx)); + } + } + + return partition_schema->Validate(schema); +} + +void PartitionSchema::ToPB(PartitionSchemaPB* pb) const { + pb->Clear(); + pb->mutable_hash_bucket_schemas()->Reserve(hash_bucket_schemas_.size()); + for (const HashBucketSchema& hash_bucket : hash_bucket_schemas_) { + PartitionSchemaPB_HashBucketSchemaPB* hash_bucket_pb = pb->add_hash_bucket_schemas(); + SetColumnIdentifiers(hash_bucket.column_ids, hash_bucket_pb->mutable_columns()); + hash_bucket_pb->set_num_buckets(hash_bucket.num_buckets); + hash_bucket_pb->set_seed(hash_bucket.seed); + } + + SetColumnIdentifiers(range_schema_.column_ids, pb->mutable_range_schema()->mutable_columns()); +} + +Status PartitionSchema::EncodeKey(const KuduPartialRow& row, string* buf) const { + const KeyEncoder& hash_encoder = GetKeyEncoder(GetTypeInfo(UINT32)); + + for (const HashBucketSchema& hash_bucket_schema : hash_bucket_schemas_) { + int32_t bucket; + RETURN_NOT_OK(BucketForRow(row, hash_bucket_schema, &bucket)); + hash_encoder.Encode(&bucket, buf); + } + + return EncodeColumns(row, range_schema_.column_ids, buf); +} + +Status PartitionSchema::EncodeKey(const ConstContiguousRow& row, string* 
buf) const { + const KeyEncoder& hash_encoder = GetKeyEncoder(GetTypeInfo(UINT32)); + + for (const HashBucketSchema& hash_bucket_schema : hash_bucket_schemas_) { + int32_t bucket; + RETURN_NOT_OK(BucketForRow(row, hash_bucket_schema, &bucket)); + hash_encoder.Encode(&bucket, buf); + } + + return EncodeColumns(row, range_schema_.column_ids, buf); +} + +Status PartitionSchema::CreatePartitions(const vector& split_rows, + const Schema& schema, + vector* partitions) const { + const KeyEncoder& hash_encoder = GetKeyEncoder(GetTypeInfo(UINT32)); + + // Create a partition per hash bucket combination. + *partitions = vector(1); + for (const HashBucketSchema& bucket_schema : hash_bucket_schemas_) { + vector new_partitions; + // For each of the partitions created so far, replicate it + // by the number of buckets in the next hash bucketing component + for (const Partition& base_partition : *partitions) { + for (int32_t bucket = 0; bucket < bucket_schema.num_buckets; bucket++) { + Partition partition = base_partition; + partition.hash_buckets_.push_back(bucket); + hash_encoder.Encode(&bucket, &partition.partition_key_start_); + hash_encoder.Encode(&bucket, &partition.partition_key_end_); + new_partitions.push_back(partition); + } + } + partitions->swap(new_partitions); + } + + unordered_set range_column_idxs; + for (ColumnId column_id : range_schema_.column_ids) { + int column_idx = schema.find_column_by_id(column_id); + if (column_idx == Schema::kColumnNotFound) { + return Status::InvalidArgument(Substitute("Range partition column ID $0 " + "not found in table schema.", column_id)); + } + if (!InsertIfNotPresent(&range_column_idxs, column_idx)) { + return Status::InvalidArgument("Duplicate column in range partition", + schema.column(column_idx).name()); + } + } + + // Create the start range keys. 
+ set start_keys; + string start_key; + for (const KuduPartialRow& row : split_rows) { + int column_count = 0; + for (int column_idx = 0; column_idx < schema.num_columns(); column_idx++) { + const ColumnSchema& column = schema.column(column_idx); + if (row.IsColumnSet(column_idx)) { + if (ContainsKey(range_column_idxs, column_idx)) { + column_count++; + } else { + return Status::InvalidArgument("Split rows may only contain values for " + "range partitioned columns", column.name()); + } + } + } + + // Check for an empty split row. + if (column_count == 0) { + return Status::InvalidArgument("Split rows must contain a value for at " + "least one range partition column"); + } + + start_key.clear(); + RETURN_NOT_OK(EncodeColumns(row, range_schema_.column_ids, &start_key)); + + // Check for a duplicate split row. + if (!InsertIfNotPresent(&start_keys, start_key)) { + return Status::InvalidArgument("Duplicate split row", row.ToString()); + } + } + + // Create a partition per range and hash bucket combination. + vector new_partitions; + for (const Partition& base_partition : *partitions) { + start_key.clear(); + + for (const string& end_key : start_keys) { + Partition partition = base_partition; + partition.partition_key_start_.append(start_key); + partition.partition_key_end_.append(end_key); + new_partitions.push_back(partition); + start_key = end_key; + } + + // Add the final range. + Partition partition = base_partition; + partition.partition_key_start_.append(start_key); + new_partitions.push_back(partition); + } + partitions->swap(new_partitions); + + // Note: the following discussion and logic only takes effect when the table's + // partition schema includes at least one hash bucket component. + // + // At this point, we have the full set of partitions built up, but each + // partition only covers a finite slice of the partition key-space. 
Some + // operations involving partitions are easier (pruning, client meta cache) if + // it can be assumed that the partition keyspace does not have holes. + // + // In order to 'fill in' the partition key space, the absolute first and last + // partitions are extended to cover the rest of the lower and upper partition + // range by clearing the start and end partition key, respectively. + // + // When the table has two or more hash components, there will be gaps in + // between partitions at the boundaries of the component ranges. Similar to + // the absolute start and end case, these holes are filled by clearing the + // partition key beginning at the hash component. For a concrete example, + // see PartitionTest::TestCreatePartitions. + for (Partition& partition : *partitions) { + if (partition.range_key_start().empty()) { + for (int i = partition.hash_buckets().size() - 1; i >= 0; i--) { + if (partition.hash_buckets()[i] != 0) { + break; + } + partition.partition_key_start_.erase(kEncodedBucketSize * i); + } + } + if (partition.range_key_end().empty()) { + for (int i = partition.hash_buckets().size() - 1; i >= 0; i--) { + partition.partition_key_end_.erase(kEncodedBucketSize * i); + int32_t hash_bucket = partition.hash_buckets()[i] + 1; + if (hash_bucket != hash_bucket_schemas_[i].num_buckets) { + hash_encoder.Encode(&hash_bucket, &partition.partition_key_end_); + break; + } + } + } + } + + return Status::OK(); +} + +template +Status PartitionSchema::PartitionContainsRowImpl(const Partition& partition, + const Row& row, + bool* contains) const { + CHECK_EQ(partition.hash_buckets().size(), hash_bucket_schemas_.size()); + for (int i = 0; i < hash_bucket_schemas_.size(); i++) { + const HashBucketSchema& hash_bucket_schema = hash_bucket_schemas_[i]; + int32_t bucket; + RETURN_NOT_OK(BucketForRow(row, hash_bucket_schema, &bucket)); + + if (bucket != partition.hash_buckets()[i]) { + *contains = false; + return Status::OK(); + } + } + + string range_partition_key; + 
RETURN_NOT_OK(EncodeColumns(row, range_schema_.column_ids, &range_partition_key)); + + // If all of the hash buckets match, then the row is contained in the + // partition if the row is gte the lower bound; and if there is no upper + // bound, or the row is lt the upper bound. + *contains = (Slice(range_partition_key).compare(partition.range_key_start()) >= 0) + && (partition.range_key_end().empty() + || Slice(range_partition_key).compare(partition.range_key_end()) < 0); + + return Status::OK(); +} + +Status PartitionSchema::PartitionContainsRow(const Partition& partition, + const KuduPartialRow& row, + bool* contains) const { + return PartitionContainsRowImpl(partition, row, contains); +} + +Status PartitionSchema::PartitionContainsRow(const Partition& partition, + const ConstContiguousRow& row, + bool* contains) const { + return PartitionContainsRowImpl(partition, row, contains); +} + + +Status PartitionSchema::DecodeRangeKey(Slice* encoded_key, + KuduPartialRow* row, + Arena* arena) const { + ContiguousRow cont_row(row->schema(), row->row_data_); + for (int i = 0; i < range_schema_.column_ids.size(); i++) { + + if (encoded_key->empty()) { + // This can happen when decoding partition start and end keys, since they + // are truncated to simulate absolute upper and lower bounds. + continue; + } + + int32_t column_idx = row->schema()->find_column_by_id(range_schema_.column_ids[i]); + const ColumnSchema& column = row->schema()->column(column_idx); + const KeyEncoder& key_encoder = GetKeyEncoder(column.type_info()); + bool is_last = i == (range_schema_.column_ids.size() - 1); + + // Decode the column. + RETURN_NOT_OK_PREPEND(key_encoder.Decode(encoded_key, + is_last, + arena, + cont_row.mutable_cell_ptr(column_idx)), + Substitute("Error decoding partition key range component '$0'", + column.name())); + // Mark the column as set. 
+ BitmapSet(row->isset_bitmap_, column_idx); + } + if (!encoded_key->empty()) { + return Status::InvalidArgument("unable to fully decode partition key range components"); + } + return Status::OK(); +} + +// Decodes a slice of a partition key into the buckets. The slice is modified to +// remove the hash components. +Status PartitionSchema::DecodeHashBuckets(Slice* encoded_key, + vector* buckets) const { + size_t hash_components_size = kEncodedBucketSize * hash_bucket_schemas_.size(); + if (encoded_key->size() < hash_components_size) { + return Status::InvalidArgument( + Substitute("expected encoded hash key to be at least $0 bytes (only found $1)", + hash_components_size, encoded_key->size())); + } + for (const auto& schema : hash_bucket_schemas_) { + (void) schema; // quiet unused variable warning + uint32_t big_endian; + memcpy(&big_endian, encoded_key->data(), sizeof(uint32_t)); + buckets->push_back(BigEndian::ToHost32(big_endian)); + encoded_key->remove_prefix(sizeof(uint32_t)); + } + + return Status::OK(); +} + +string PartitionSchema::PartitionDebugString(const Partition& partition, + const Schema& schema) const { + string s; + + if (!partition.hash_buckets().empty()) { + vector components; + for (int32_t bucket : partition.hash_buckets()) { + components.push_back(Substitute("$0", bucket)); + } + s.append("hash buckets: ("); + s.append(JoinStrings(components, ", ")); + if (!range_schema_.column_ids.empty()) { + s.append("), "); + } else { + s.append(")"); + } + } + + if (!range_schema_.column_ids.empty()) { + Arena arena(1024, 128 * 1024); + KuduPartialRow start_row(&schema); + KuduPartialRow end_row(&schema); + + s.append("range: [("); + + vector start_components; + Slice encoded_range_key_start = partition.range_key_start(); + Status status; + status = DecodeRangeKey(&encoded_range_key_start, &start_row, &arena); + if (status.ok()) { + AppendRangeDebugStringComponentsOrString(start_row, "", &start_components); + s.append(JoinStrings(start_components, ", 
")); + } else { + s.append(Substitute("", status.ToString())); + } + s.append("), ("); + + vector end_components; + Slice encoded_range_key_end = partition.range_key_end(); + status = DecodeRangeKey(&encoded_range_key_end, &end_row, &arena); + if (status.ok()) { + AppendRangeDebugStringComponentsOrString(end_row, "", &end_components); + s.append(JoinStrings(end_components, ", ")); + } else { + s.append(Substitute("", status.ToString())); + } + s.append("))"); + } + + return s; +} + +void PartitionSchema::AppendRangeDebugStringComponentsOrString(const KuduPartialRow& row, + const StringPiece default_string, + vector* components) const { + ConstContiguousRow const_row(row.schema(), row.row_data_); + + for (ColumnId column_id : range_schema_.column_ids) { + string column; + int32_t column_idx = row.schema()->find_column_by_id(column_id); + if (column_idx == Schema::kColumnNotFound) { + components->push_back(""); + continue; + } + const ColumnSchema& column_schema = row.schema()->column(column_idx); + + if (!row.IsColumnSet(column_idx)) { + components->push_back(default_string.as_string()); + break; + } else { + column_schema.DebugCellAppend(const_row.cell(column_idx), &column); + } + + components->push_back(column); + } +} + +void PartitionSchema::AppendRangeDebugStringComponentsOrMin(const KuduPartialRow& row, + vector* components) const { + ConstContiguousRow const_row(row.schema(), row.row_data_); + + for (ColumnId column_id : range_schema_.column_ids) { + string column; + int32_t column_idx = row.schema()->find_column_by_id(column_id); + if (column_idx == Schema::kColumnNotFound) { + components->push_back(""); + continue; + } + const ColumnSchema& column_schema = row.schema()->column(column_idx); + + if (!row.IsColumnSet(column_idx)) { + uint8_t min_value[kLargestTypeSize]; + column_schema.type_info()->CopyMinValue(&min_value); + SimpleConstCell cell(&column_schema, &min_value); + column_schema.DebugCellAppend(cell, &column); + } else { + 
column_schema.DebugCellAppend(const_row.cell(column_idx), &column); + } + + components->push_back(column); + } +} + +string PartitionSchema::RowDebugString(const ConstContiguousRow& row) const { + vector components; + + for (const HashBucketSchema& hash_bucket_schema : hash_bucket_schemas_) { + int32_t bucket; + Status s = BucketForRow(row, hash_bucket_schema, &bucket); + if (s.ok()) { + components.push_back(Substitute("bucket=$0", bucket)); + } else { + components.push_back(Substitute("", s.ToString())); + } + } + + for (ColumnId column_id : range_schema_.column_ids) { + string column; + int32_t column_idx = row.schema()->find_column_by_id(column_id); + if (column_idx == Schema::kColumnNotFound) { + components.push_back(""); + break; + } + row.schema()->column(column_idx).DebugCellAppend(row.cell(column_idx), &column); + components.push_back(column); + } + + return JoinStrings(components, ", "); +} + +string PartitionSchema::RowDebugString(const KuduPartialRow& row) const { + vector components; + + for (const HashBucketSchema& hash_bucket_schema : hash_bucket_schemas_) { + int32_t bucket; + Status s = BucketForRow(row, hash_bucket_schema, &bucket); + if (s.ok()) { + components.push_back(Substitute("bucket=$0", bucket)); + } else { + components.push_back(Substitute("", s.ToString())); + } + } + + AppendRangeDebugStringComponentsOrMin(row, &components); + + return JoinStrings(components, ", "); +} + +string PartitionSchema::PartitionKeyDebugString(const string& key, const Schema& schema) const { + Slice encoded_key = key; + + vector components; + + if (!hash_bucket_schemas_.empty()) { + vector buckets; + Status s = DecodeHashBuckets(&encoded_key, &buckets); + if (!s.ok()) { + return Substitute("", s.ToString()); + } + for (int32_t bucket : buckets) { + components.push_back(Substitute("bucket=$0", bucket)); + } + } + + if (!range_schema_.column_ids.empty()) { + Arena arena(1024, 128 * 1024); + KuduPartialRow row(&schema); + + Status s = DecodeRangeKey(&encoded_key, 
&row, &arena); + if (!s.ok()) { + return Substitute("", s.ToString()); + } + + AppendRangeDebugStringComponentsOrMin(row, &components); + } + + return JoinStrings(components, ", "); +} + +namespace { +// Converts a list of column IDs to a string with the column names seperated by +// a comma character. +string ColumnIdsToColumnNames(const Schema& schema, + const vector column_ids) { + vector names; + for (ColumnId column_id : column_ids) { + names.push_back(schema.column(schema.find_column_by_id(column_id)).name()); + } + + return JoinStrings(names, ", "); +} +} // namespace + +string PartitionSchema::DebugString(const Schema& schema) const { + vector component_types; + + if (!hash_bucket_schemas_.empty()) { + vector hash_components; + for (const HashBucketSchema& hash_bucket_schema : hash_bucket_schemas_) { + string component; + component.append(Substitute("(bucket count: $0", hash_bucket_schema.num_buckets)); + if (hash_bucket_schema.seed != 0) { + component.append(Substitute(", seed: $0", hash_bucket_schema.seed)); + } + component.append(Substitute(", columns: [$0])", + ColumnIdsToColumnNames(schema, hash_bucket_schema.column_ids))); + hash_components.push_back(component); + } + component_types.push_back(Substitute("hash bucket components: [$0]", + JoinStrings(hash_components, ", "))); + } + + if (!range_schema_.column_ids.empty()) { + component_types.push_back(Substitute("range columns: [$0]", + ColumnIdsToColumnNames(schema, range_schema_.column_ids))); + } + return JoinStrings(component_types, ", "); +} + +bool PartitionSchema::Equals(const PartitionSchema& other) const { + if (this == &other) return true; + + // Compare range component. + if (range_schema_.column_ids != other.range_schema_.column_ids) return false; + + // Compare hash bucket components. 
+ if (hash_bucket_schemas_.size() != other.hash_bucket_schemas_.size()) return false; + for (int i = 0; i < hash_bucket_schemas_.size(); i++) { + if (hash_bucket_schemas_[i].seed != other.hash_bucket_schemas_[i].seed) return false; + if (hash_bucket_schemas_[i].num_buckets + != other.hash_bucket_schemas_[i].num_buckets) return false; + if (hash_bucket_schemas_[i].column_ids + != other.hash_bucket_schemas_[i].column_ids) return false; + } + + return true; +} + +bool PartitionSchema::IsSimplePKRangePartitioning(const Schema& schema) const { + if (!hash_bucket_schemas_.empty()) return false; + if (range_schema_.column_ids.size() != schema.num_key_columns()) return false; + + for (int i = 0; i < schema.num_key_columns(); i++) { + if (range_schema_.column_ids[i] != schema.column_id(i)) return false; + } + return true; +} + +// Encodes the specified primary key columns of the supplied row into the buffer. +Status PartitionSchema::EncodeColumns(const ConstContiguousRow& row, + const vector& column_ids, + string* buf) { + for (int i = 0; i < column_ids.size(); i++) { + ColumnId column_id = column_ids[i]; + int32_t column_idx = row.schema()->find_column_by_id(column_id); + const TypeInfo* type = row.schema()->column(column_idx).type_info(); + GetKeyEncoder(type).Encode(row.cell_ptr(column_idx), i + 1 == column_ids.size(), buf); + } + return Status::OK(); +} + +// Encodes the specified primary key columns of the supplied row into the buffer. 
+Status PartitionSchema::EncodeColumns(const KuduPartialRow& row, + const vector& column_ids, + string* buf) { + for (int i = 0; i < column_ids.size(); i++) { + int32_t column_idx = row.schema()->find_column_by_id(column_ids[i]); + CHECK(column_idx != Schema::kColumnNotFound); + const TypeInfo* type_info = row.schema()->column(column_idx).type_info(); + const KeyEncoder& encoder = GetKeyEncoder(type_info); + + if (PREDICT_FALSE(!row.IsColumnSet(column_idx))) { + uint8_t min_value[kLargestTypeSize]; + type_info->CopyMinValue(min_value); + encoder.Encode(min_value, i + 1 == column_ids.size(), buf); + } else { + ContiguousRow cont_row(row.schema(), row.row_data_); + encoder.Encode(cont_row.cell_ptr(column_idx), i + 1 == column_ids.size(), buf); + } + } + return Status::OK(); +} + +int32_t PartitionSchema::BucketForEncodedColumns(const string& encoded_key, + const HashBucketSchema& hash_bucket_schema) { + uint64_t hash = HashUtil::MurmurHash2_64(encoded_key.data(), + encoded_key.length(), + hash_bucket_schema.seed); + return hash % static_cast(hash_bucket_schema.num_buckets); +} + +template +Status PartitionSchema::BucketForRow(const Row& row, + const HashBucketSchema& hash_bucket_schema, + int32_t* bucket) { + string buf; + RETURN_NOT_OK(EncodeColumns(row, hash_bucket_schema.column_ids, &buf)); + uint64_t hash = HashUtil::MurmurHash2_64(buf.data(), buf.length(), hash_bucket_schema.seed); + *bucket = hash % static_cast(hash_bucket_schema.num_buckets); + return Status::OK(); +} + +//------------------------------------------------------------ +// Template instantiations: We instantiate all possible templates to avoid linker issues. 
+// see: https://isocpp.org/wiki/faq/templates#separate-template-fn-defn-from-decl +//------------------------------------------------------------ + +template +Status PartitionSchema::BucketForRow(const KuduPartialRow& row, + const HashBucketSchema& hash_bucket_schema, + int32_t* bucket); + +template +Status PartitionSchema::BucketForRow(const ConstContiguousRow& row, + const HashBucketSchema& hash_bucket_schema, + int32_t* bucket); + +void PartitionSchema::Clear() { + hash_bucket_schemas_.clear(); + range_schema_.column_ids.clear(); +} + +Status PartitionSchema::Validate(const Schema& schema) const { + set hash_columns; + for (const PartitionSchema::HashBucketSchema& hash_schema : hash_bucket_schemas_) { + if (hash_schema.num_buckets < 2) { + return Status::InvalidArgument("must have at least two hash buckets"); + } + + if (hash_schema.column_ids.size() < 1) { + return Status::InvalidArgument("must have at least one hash column"); + } + + for (ColumnId hash_column : hash_schema.column_ids) { + if (!hash_columns.insert(hash_column).second) { + return Status::InvalidArgument("hash bucket schema components must not " + "contain columns in common"); + } + int32_t column_idx = schema.find_column_by_id(hash_column); + if (column_idx == Schema::kColumnNotFound) { + return Status::InvalidArgument("must specify existing columns for hash " + "bucket partition components"); + } else if (column_idx >= schema.num_key_columns()) { + return Status::InvalidArgument("must specify only primary key columns for " + "hash bucket partition components"); + } + } + } + + for (ColumnId column_id : range_schema_.column_ids) { + int32_t column_idx = schema.find_column_by_id(column_id); + if (column_idx == Schema::kColumnNotFound) { + return Status::InvalidArgument("must specify existing columns for range " + "partition component"); + } else if (column_idx >= schema.num_key_columns()) { + return Status::InvalidArgument("must specify only primary key columns for " + "range partition 
component"); + } + } + + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/common/partition.h b/src/kudu/common/partition.h new file mode 100644 index 000000000000..b8988c42e4eb --- /dev/null +++ b/src/kudu/common/partition.h @@ -0,0 +1,273 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_PARTITION_H +#define KUDU_COMMON_PARTITION_H + +#include +#include +#include + +#include "kudu/common/common.pb.h" +#include "kudu/common/key_encoder.h" +#include "kudu/common/partial_row.h" +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" + +namespace kudu { + +class ColumnRangePredicate; +class ConstContiguousRow; +class KuduPartialRow; +class PartitionSchemaPB; +class TypeInfo; + +// A Partition describes the set of rows that a Tablet is responsible for +// serving. Each tablet is assigned a single Partition. +// +// Partitions consist primarily of a start and end partition key. Every row with +// a partition key that falls in a Tablet's Partition will be served by that +// tablet. 
+// +// In addition to the start and end partition keys, a Partition holds metadata +// to determine if a scan can prune, or skip, a partition based on the scan's +// start and end primary keys, and predicates. +class Partition { + public: + + const std::vector& hash_buckets() const { + return hash_buckets_; + } + + Slice range_key_start() const; + + Slice range_key_end() const; + + const std::string& partition_key_start() const { + return partition_key_start_; + } + + const std::string& partition_key_end() const { + return partition_key_end_; + } + + // Serializes a partition into a protobuf message. + void ToPB(PartitionPB* pb) const; + + // Deserializes a protobuf message into a partition. + // + // The protobuf message is not validated, since partitions are only expected + // to be created by the master process. + static void FromPB(const PartitionPB& pb, Partition* partition); + + private: + friend class PartitionSchema; + + // Helper function for accessing the range key portion of a partition key. + Slice range_key(const std::string& partition_key) const; + + std::vector hash_buckets_; + + std::string partition_key_start_; + std::string partition_key_end_; +}; + +// A partition schema describes how the rows of a table are distributed among +// tablets. +// +// Primarily, a table's partition schema is responsible for translating the +// primary key column values of a row into a partition key that can be used to +// determine the tablet containing the key. +// +// The partition schema is made up of zero or more hash bucket components, +// followed by a single range component. +// +// Each hash bucket component includes one or more columns from the primary key +// column set, with the restriction that an individual primary key column may +// only be included in a single hash component. 
+// +// To determine the hash bucket of an individual row, the values of the columns +// of the hash component are encoded into bytes (in PK or lexicographic +// preserving encoding), then hashed into a u64, then modded into an i32. When +// constructing a partition key from a row, the buckets of the row are simply +// encoded into the partition key in order (again in PK or lexicographic +// preserving encoding). +// +// The range component contains a (possibly full or empty) subset of the primary +// key columns. When encoding the partition key, the columns of the partition +// component are encoded in order. +// +// The above is true of the relationship between rows and partition keys. It +// gets trickier with partitions (tablet partition key boundaries), because the +// boundaries of tablets do not necessarily align to rows. For instance, +// currently the absolute-start and absolute-end primary keys of a table +// represented as an empty key, but do not have a corresponding row. Partitions +// are similar, but instead of having just one absolute-start and absolute-end, +// each component of a partition schema has an absolute-start and absolute-end. +// When creating the initial set of partitions during table creation, we deal +// with this by "carrying through" absolute-start or absolute-ends into lower +// significance components. +class PartitionSchema { + public: + + // Deserializes a protobuf message into a partition schema. + static Status FromPB(const PartitionSchemaPB& pb, + const Schema& schema, + PartitionSchema* partition_schema) WARN_UNUSED_RESULT; + + // Serializes a partition schema into a protobuf message. + void ToPB(PartitionSchemaPB* pb) const; + + // Appends the row's encoded partition key into the provided buffer. + // On failure, the buffer may have data partially appended. + Status EncodeKey(const KuduPartialRow& row, std::string* buf) const WARN_UNUSED_RESULT; + + // Appends the row's encoded partition key into the provided buffer. 
+ // On failure, the buffer may have data partially appended. + Status EncodeKey(const ConstContiguousRow& row, std::string* buf) const WARN_UNUSED_RESULT; + + // Creates the set of table partitions for a partition schema and collection + // of split rows. + // + // The number of resulting partitions is the product of the number of hash + // buckets for each hash bucket component, multiplied by + // (split_rows.size() + 1). + Status CreatePartitions(const std::vector& split_rows, + const Schema& schema, + std::vector* partitions) const WARN_UNUSED_RESULT; + + // Tests if the partition contains the row. + Status PartitionContainsRow(const Partition& partition, + const KuduPartialRow& row, + bool* contains) const WARN_UNUSED_RESULT; + + // Tests if the partition contains the row. + Status PartitionContainsRow(const Partition& partition, + const ConstContiguousRow& row, + bool* contains) const WARN_UNUSED_RESULT; + + // Returns a text description of the partition suitable for debug printing. + std::string PartitionDebugString(const Partition& partition, const Schema& schema) const; + + // Returns a text description of the partial row's partition key suitable for debug printing. + std::string RowDebugString(const KuduPartialRow& row) const; + + // Returns a text description of the row's partition key suitable for debug printing. + std::string RowDebugString(const ConstContiguousRow& row) const; + + // Returns a text description of the encoded partition key suitable for debug printing. + std::string PartitionKeyDebugString(const std::string& key, const Schema& schema) const; + + // Returns a text description of this partition schema suitable for debug printing. + std::string DebugString(const Schema& schema) const; + + // Returns true if the other partition schema is equivalent to this one. 
+ bool Equals(const PartitionSchema& other) const; + + // Return true if the partitioning scheme simply range-partitions on the full primary key, + // with no bucketing components, etc. + bool IsSimplePKRangePartitioning(const Schema& schema) const; + + private: + + struct RangeSchema { + std::vector column_ids; + }; + + struct HashBucketSchema { + std::vector column_ids; + int32_t num_buckets; + uint32_t seed; + }; + + // Encodes the specified columns of a row into lexicographic sort-order + // preserving format. + static Status EncodeColumns(const KuduPartialRow& row, + const std::vector& column_ids, + std::string* buf); + + // Encodes the specified columns of a row into lexicographic sort-order + // preserving format. + static Status EncodeColumns(const ConstContiguousRow& row, + const std::vector& column_ids, + std::string* buf); + + // Returns the hash bucket of the encoded hash column. The encoded columns must match the + // columns of the hash bucket schema. + static int32_t BucketForEncodedColumns(const std::string& encoded_hash_columns, + const HashBucketSchema& hash_bucket_schema); + + // Assigns the row to a hash bucket according to the hash schema. + template + static Status BucketForRow(const Row& row, + const HashBucketSchema& hash_bucket_schema, + int32_t* bucket); + + // Private templated helper for PartitionContainsRow. + template + Status PartitionContainsRowImpl(const Partition& partition, + const Row& row, + bool* contains) const; + + // Private templated helper for EncodeKey. + template + Status EncodeKeyImpl(const Row& row, string* buf) const; + + // Appends the stringified range partition components of a partial row to a + // vector. + // + // If any columns of the range partition do not exist in the partial row, + // processing stops and the provided default string piece is appended to the vector. 
+ void AppendRangeDebugStringComponentsOrString(const KuduPartialRow& row, + StringPiece default_string, + std::vector* components) const; + + // Appends the stringified range partition components of a partial row to a + // vector. + // + // If any columns of the range partition do not exist in the partial row, the + // logical minimum value for that column will be used instead. + void AppendRangeDebugStringComponentsOrMin(const KuduPartialRow& row, + std::vector* components) const; + + // Decodes a range partition key into a partial row, with variable-length + // fields stored in the arena. + Status DecodeRangeKey(Slice* encode_key, + KuduPartialRow* partial_row, + Arena* arena) const; + + // Decodes the hash bucket component of a partition key into its buckets. + // + // This should only be called with partition keys created from a row, not with + // partition keys from a partition. + Status DecodeHashBuckets(Slice* partition_key, std::vector* buckets) const; + + // Clears the state of this partition schema. + void Clear(); + + // Validates that this partition schema is valid. Returns OK, or an + // appropriate error code for an invalid partition schema. + Status Validate(const Schema& schema) const; + + std::vector hash_bucket_schemas_; + RangeSchema range_schema_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/common/predicate-test.cc b/src/kudu/common/predicate-test.cc new file mode 100644 index 000000000000..a7c83f81fac2 --- /dev/null +++ b/src/kudu/common/predicate-test.cc @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/common/scan_predicate.h" +#include "kudu/common/rowblock.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class TestPredicate : public KuduTest { + public: + TestPredicate() : + arena_(1024, 4096), + n_rows_(100), + schema_({ ColumnSchema("col0", UINT32), + ColumnSchema("col1", UINT32), + ColumnSchema("col2", STRING) }, + 1), + row_block_(schema_, n_rows_, &arena_) + {} + + // Set up a block of data with two columns: + // col0 col1 + // ---- ------ + // 0 0 + // 1 10 + // ... ... + // N N * 10 + // + // The third STRING column is left unset. 
+ void SetUp() OVERRIDE { + KuduTest::SetUp(); + + ColumnBlock col0 = row_block_.column_block(0, n_rows_); + ColumnBlock col1 = row_block_.column_block(1, n_rows_); + + for (uint32_t i = 0; i < n_rows_; i++) { + uint32_t i1 = i * 10; + col0.SetCellValue(i, &i); + col1.SetCellValue(i, &i1); + } + } + + protected: + Arena arena_; + const size_t n_rows_; + Schema schema_; + RowBlock row_block_; +}; + +TEST_F(TestPredicate, TestSelectionVector) { + SelectionVector selvec(10); + selvec.SetAllTrue(); + ASSERT_TRUE(selvec.IsRowSelected(0)); + ASSERT_TRUE(selvec.IsRowSelected(9)); + ASSERT_EQ(10, selvec.CountSelected()); + ASSERT_TRUE(selvec.AnySelected()); + + for (int i = 0; i < 10; i++) { + BitmapClear(selvec.mutable_bitmap(), i); + } + + ASSERT_FALSE(selvec.AnySelected()); + + // Test Resize() + selvec.SetAllTrue(); + for (int i = 10; i > 0; --i) { + selvec.Resize(i); + ASSERT_EQ(selvec.CountSelected(), i); + ASSERT_TRUE(selvec.AnySelected()); + } + selvec.Resize(0); + ASSERT_EQ(selvec.CountSelected(), 0); + ASSERT_FALSE(selvec.AnySelected()); +} + +TEST_F(TestPredicate, TestColumnRange) { + SelectionVector selvec(n_rows_); + selvec.SetAllTrue(); + ASSERT_EQ(100, selvec.CountSelected()); + + // Apply predicate 20 <= col0 <= 29 + uint32_t col0_lower = 20; + uint32_t col0_upper = 29; + ColumnRangePredicate pred1(schema_.column(0), &col0_lower, &col0_upper); + ASSERT_EQ("(`col0` BETWEEN 20 AND 29)", pred1.ToString()); + pred1.Evaluate(&row_block_, &selvec); + ASSERT_EQ(10, selvec.CountSelected()) << "Only 10 rows should be left (20-29)"; + + // Apply predicate col1 >= 250 + uint32_t col1_lower = 250; + ColumnRangePredicate pred2(schema_.column(1), &col1_lower, nullptr); + ASSERT_EQ("(`col1` >= 250)", pred2.ToString()); + pred2.Evaluate(&row_block_, &selvec); + ASSERT_EQ(5, selvec.CountSelected()) << "Only 5 rows should be left (25-29)"; +} + +// Regression test for KUDU-54: should not try to access rows for which the +// selection vector is 0. 
+TEST_F(TestPredicate, TestDontEvalauteOnUnselectedRows) { + SelectionVector selvec(n_rows_); + selvec.SetAllFalse(); + + // Fill the STRING column with garbage data. + OverwriteWithPattern(reinterpret_cast(row_block_.column_block(2).data()), + row_block_.column_block(2).stride() * row_block_.nrows(), + "JUNKDATA"); + + Slice lower("lower"); + ColumnRangePredicate p(schema_.column(2), &lower, nullptr); + p.Evaluate(&row_block_, &selvec); + ASSERT_EQ(0, selvec.CountSelected()); +} + +} // namespace kudu diff --git a/src/kudu/common/predicate_encoder-test.cc b/src/kudu/common/predicate_encoder-test.cc new file mode 100644 index 000000000000..b2486d1fe5b3 --- /dev/null +++ b/src/kudu/common/predicate_encoder-test.cc @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/common/predicate_encoder.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class TestRangePredicateEncoder : public KuduTest { + public: + explicit TestRangePredicateEncoder(const Schema& s) + : arena_(1024, 256 * 1024), + schema_(s), + enc_(&schema_, &arena_) {} + + enum ComparisonOp { + GE, + EQ, + LE + }; + + template + void AddPredicate(ScanSpec* spec, StringPiece col, + ComparisonOp op, T val) { + int idx = schema_.find_column(col); + CHECK_GE(idx, 0); + + void* upper = nullptr; + void* lower = nullptr; + void* val_void = arena_.AllocateBytes(sizeof(val)); + memcpy(val_void, &val, sizeof(val)); + + switch (op) { + case GE: + lower = val_void; + break; + case EQ: + lower = upper = val_void; + break; + case LE: + upper = val_void; + break; + } + + ColumnRangePredicate pred(schema_.column(idx), lower, upper); + spec->AddPredicate(pred); + } + + + protected: + Arena arena_; + Schema schema_; + RangePredicateEncoder enc_; +}; + +class CompositeIntKeysTest : public TestRangePredicateEncoder { + public: + CompositeIntKeysTest() : + TestRangePredicateEncoder( + Schema({ ColumnSchema("a", UINT8), + ColumnSchema("b", UINT8), + ColumnSchema("c", UINT8) }, + 3)) { + } +}; + +// Test that multiple predicates on a column are collapsed by +// RangePredicateEncoder::Simplify() +TEST_F(CompositeIntKeysTest, TestSimplify) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 255); + AddPredicate(&spec, "b", GE, 3); + AddPredicate(&spec, "b", LE, 255); + AddPredicate(&spec, "b", LE, 200); + AddPredicate(&spec, "c", LE, 128); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + vector bounds; + enc_.SimplifyBounds(spec, &bounds); + ASSERT_EQ(3, bounds.size()); + ASSERT_EQ("(`a` BETWEEN 255 AND 255)", + ColumnRangePredicate(schema_.column(0), bounds[0].lower, bounds[0].upper).ToString()); + + ASSERT_EQ("(`b` BETWEEN 3 AND 200)", + 
ColumnRangePredicate(schema_.column(1), bounds[1].lower, bounds[1].upper).ToString()); + ASSERT_EQ("(`c` <= 128)", + ColumnRangePredicate(schema_.column(2), bounds[2].lower, bounds[2].upper).ToString()); +} + +// Predicate: a == 128 +TEST_F(CompositeIntKeysTest, TestPrefixEquality) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 128); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + // Expect: key >= (128, 0, 0) AND key < (129, 0, 0) + EXPECT_EQ("PK >= (uint8 a=128, uint8 b=0, uint8 c=0) AND " + "PK < (uint8 a=129, uint8 b=0, uint8 c=0)", + spec.ToStringWithSchema(schema_)); +} + +// Predicate: a <= 254 +TEST_F(CompositeIntKeysTest, TestPrefixUpperBound) { + ScanSpec spec; + AddPredicate(&spec, "a", LE, 254); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK < (uint8 a=255, uint8 b=0, uint8 c=0)", + spec.ToStringWithSchema(schema_)); +} + +// Predicate: a >= 254 +TEST_F(CompositeIntKeysTest, TestPrefixLowerBound) { + // Predicate: a >= 254 + ScanSpec spec; + AddPredicate(&spec, "a", GE, 254); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=254, uint8 b=0, uint8 c=0)", spec.ToStringWithSchema(schema_)); +} + +// Test a predicate on a non-prefix part of the key. Can't be pushed. +// +// Predicate: b == 128 +TEST_F(CompositeIntKeysTest, TestNonPrefix) { + ScanSpec spec; + AddPredicate(&spec, "b", EQ, 128); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + // Expect: nothing pushed (predicate is still on `b`, not PK) + EXPECT_EQ("(`b` BETWEEN 128 AND 128)", + spec.ToStringWithSchema(schema_)); +} + +// Test what happens when an upper bound on a cell is equal to the maximum +// value for the cell. 
In this case, the preceding cell is also at the maximum +// value as well, so we eliminate the upper bound entirely. +// +// Predicate: a == 255 AND b BETWEEN 3 AND 255 +TEST_F(CompositeIntKeysTest, TestRedundantUpperBound) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 255); + AddPredicate(&spec, "b", GE, 3); + AddPredicate(&spec, "b", LE, 255); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=255, uint8 b=3, uint8 c=0)", spec.ToStringWithSchema(schema_)); +} + +// A similar test, but in this case we still have an equality prefix +// that needs to be accounted for, so we can't eliminate the upper bound +// entirely. +// +// Predicate: a == 1 AND b BETWEEN 3 AND 255 +TEST_F(CompositeIntKeysTest, TestRedundantUpperBound2) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 1); + AddPredicate(&spec, "b", GE, 3); + AddPredicate(&spec, "b", LE, 255); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=1, uint8 b=3, uint8 c=0) AND " + "PK < (uint8 a=2, uint8 b=0, uint8 c=0)", + spec.ToStringWithSchema(schema_)); +} + +// Test that, if so desired, pushed predicates are not erased. +// +// Predicate: a == 254 +TEST_F(CompositeIntKeysTest, TestNoErasePredicates) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 254); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, false)); + EXPECT_EQ("PK >= (uint8 a=254, uint8 b=0, uint8 c=0) AND " + "PK < (uint8 a=255, uint8 b=0, uint8 c=0)\n" + "(`a` BETWEEN 254 AND 254)", spec.ToStringWithSchema(schema_)); +} + +// Test that, if pushed predicates are erased, that we don't +// erase non-pushed predicates. +// Because we have no predicate on column 'b', we can't push a +// a range predicate that includes 'c'. 
+// +// Predicate: a == 254 AND c == 254 +TEST_F(CompositeIntKeysTest, TestNoErasePredicates2) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 254); + AddPredicate(&spec, "c", EQ, 254); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + // The predicate on column A should be pushed while "c" remains. + EXPECT_EQ("PK >= (uint8 a=254, uint8 b=0, uint8 c=0) AND " + "PK < (uint8 a=255, uint8 b=0, uint8 c=0)\n" + "(`c` BETWEEN 254 AND 254)", spec.ToStringWithSchema(schema_)); +} + +// Test that predicates added out of key order are OK. +// +// Predicate: b == 254 AND a == 254 +TEST_F(CompositeIntKeysTest, TestPredicateOrderDoesntMatter) { + ScanSpec spec; + AddPredicate(&spec, "b", EQ, 254); + AddPredicate(&spec, "a", EQ, 254); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=254, uint8 b=254, uint8 c=0) AND " + "PK < (uint8 a=254, uint8 b=255, uint8 c=0)", + spec.ToStringWithSchema(schema_)); +} + +// Tests for String parts in composite keys +//------------------------------------------------------------ +class CompositeIntStringKeysTest : public TestRangePredicateEncoder { + public: + CompositeIntStringKeysTest() : + TestRangePredicateEncoder( + Schema({ ColumnSchema("a", UINT8), + ColumnSchema("b", STRING), + ColumnSchema("c", STRING) }, + 3)) { + } +}; + + +// Predicate: a == 128 +TEST_F(CompositeIntStringKeysTest, TestPrefixEquality) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 128); + SCOPED_TRACE(spec.ToStringWithSchema(schema_)); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + // Expect: key >= (128, "", "") AND key < (129, "", "") + EXPECT_EQ("PK >= (uint8 a=128, string b=, string c=) AND " + "PK < (uint8 a=129, string b=, string c=)", + spec.ToStringWithSchema(schema_)); +} + +// Predicate: a == 128 AND b = "abc" +TEST_F(CompositeIntStringKeysTest, 
TestPrefixEqualityWithString) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 128); + AddPredicate(&spec, "b", EQ, Slice("abc")); + SCOPED_TRACE(spec.ToString()); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=128, string b=abc, string c=) AND " + "PK < (uint8 a=128, string b=abc\\000, string c=)", + spec.ToStringWithSchema(schema_)); +} + +// Tests for non-composite int key +//------------------------------------------------------------ +class SingleIntKeyTest : public TestRangePredicateEncoder { + public: + SingleIntKeyTest() : + TestRangePredicateEncoder( + Schema({ ColumnSchema("a", UINT8) }, 1)) { + } +}; + +TEST_F(SingleIntKeyTest, TestEquality) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 128); + SCOPED_TRACE(spec.ToString()); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=128) AND " + "PK < (uint8 a=129)", + spec.ToStringWithSchema(schema_)); +} + +TEST_F(SingleIntKeyTest, TestRedundantUpperBound) { + ScanSpec spec; + AddPredicate(&spec, "a", EQ, 255); + SCOPED_TRACE(spec.ToString()); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("PK >= (uint8 a=255)", + spec.ToStringWithSchema(schema_)); +} + +TEST_F(SingleIntKeyTest, TestNoPredicates) { + ScanSpec spec; + SCOPED_TRACE(spec.ToString()); + ASSERT_NO_FATAL_FAILURE(enc_.EncodeRangePredicates(&spec, true)); + EXPECT_EQ("", spec.ToStringWithSchema(schema_)); +} + +} // namespace kudu diff --git a/src/kudu/common/predicate_encoder.cc b/src/kudu/common/predicate_encoder.cc new file mode 100644 index 000000000000..946a067461d9 --- /dev/null +++ b/src/kudu/common/predicate_encoder.cc @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/predicate_encoder.h" + +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row.h" +#include "kudu/common/row_key-util.h" +#include "kudu/common/types.h" + +namespace kudu { + +RangePredicateEncoder::RangePredicateEncoder(const Schema* key_schema, + Arena* arena) + : key_schema_(key_schema), + arena_(arena) { +} + +void RangePredicateEncoder::EncodeRangePredicates(ScanSpec *spec, bool erase_pushed) { + // Step 1) Simplify all predicates which apply to keys. + // + // First, we loop over all predicates, find those that apply to key columns, + // and group them by the column they apply to. Bounds are simplified (i.e. + // the tightest bounds are retained). In this step, we retain the original indexes + // of the predicates that we've analyzed, so we can later remove them if necessary. + vector key_bounds; + SimplifyBounds(*spec, &key_bounds); + + // Step 2) Determine the length of the "equality" part of the key. + // + // The following pattern of predicates can be converted into range scans: + // + // k1 = a AND k2 = b AND ... AND kN BETWEEN c AND d + // + // In other words, we can have a sequence of equality conditions, followed by + // a range predicate on one further column past that. + // + // In this step, we count how many key components have equality predicates applied. 
+ int equality_prefix = CountKeyPrefixEqualities(key_bounds); + + // We're only allowed to push the equality conditions and optionally one more. + int max_push_len = std::min(equality_prefix + 1, key_bounds.size()); + + // Step 3) Prepare upper and lower bound key tuples + // + // Here we allocate tuples from the arena which will store the upper and lower + // bound. We initialize all elements of these tuples to their minimum value. + uint8_t* lower_buf = static_cast( + CHECK_NOTNULL(arena_->AllocateBytes(key_schema_->key_byte_size()))); + uint8_t* upper_buf = static_cast( + CHECK_NOTNULL(arena_->AllocateBytes(key_schema_->key_byte_size()))); + ContiguousRow lower_key(key_schema_, lower_buf); + ContiguousRow upper_key(key_schema_, upper_buf); + + row_key_util::SetKeyToMinValues(&lower_key); + row_key_util::SetKeyToMinValues(&upper_key); + + // Step 4) Construct upper/lower bound tuples + // + // We iterate through the predicates and copy the predicate bounds into + // the tuples, while also keeping track of how many elements have been + // set in each. + // + // For example, with a (year, month, day) primary key: + // + // Predicate: year = 2015 AND month = 7 AND day <= 15 + // upper_key: (2015, 7, 15) (len=3) + // lower_key: (2015, 7, ) (len=2) + // + // Note that the 'day' component of the lower bound remains as '' + // here because there is no lower bound range predicate on the 'day' column. + // + // While iterating, we also keep track of which original predicates were + // pushed down, so we can remove them later. + int lower_len = 0; + int upper_len = 0; + vector was_pushed(spec->predicates().size()); + int n_pushed; + for (n_pushed = 0; n_pushed < max_push_len; n_pushed++) { + const ColumnSchema& col = key_schema_->column(n_pushed); + int size = col.type_info()->size(); + const SimplifiedBounds& b = key_bounds[n_pushed]; + + // If we're still in the "equality" part of the key, we expect both + // the upper and lower bounds to be set. 
+ if (n_pushed < equality_prefix) { + DCHECK(b.lower && b.upper); + } + + if (b.lower) { + memcpy(lower_key.mutable_cell_ptr(n_pushed), key_bounds[n_pushed].lower, size); + lower_len++; + } + if (b.upper) { + memcpy(upper_key.mutable_cell_ptr(n_pushed), key_bounds[n_pushed].upper, size); + upper_len++; + } + for (int pred_idx : key_bounds[n_pushed].orig_predicate_indexes) { + was_pushed[pred_idx] = true; + } + } + + // Step 4) Convert upper bound to exclusive + // + // Column range predicates are inclusive, but primary key predicates are exclusive. + // Here, we increment the upper bound key to convert between the two. + // + // Handling prefix conditions on the upper bound is slightly subtle. + // Consider, for example: + // + // Predicate: year = 2015 AND month <= 7 + // upper_key: (2015, 7, ) (len=2) + // + // Conceptually, what we'd like to do is set upper_key <= (2015, 7, ), + // and then increment it to: upper_key < (2015, 8, ). However, there is + // no such concept of a "" value for a column (strings can always be + // incremented further). So, instead, we leave the remaining components + // as "", and increment the prefix, which yields the same result. + if (upper_len) { + if (!row_key_util::IncrementKeyPrefix(&upper_key, upper_len, arena_)) { + // If the upper bound is already the very maximum key, we can't increment + // it any more. In that case, it's equivalent to setting no bound at all, + // so we reset the length back to 0. + // + // For example, consider: + // Predicate: year <= MAX_INT + // upper_key; (MAX_INT, , ) (len=1) + // + // IncrementKeyPrefix(1) here will return false since MAX_INT cannot be + // further incremented. However, the predicate is itself tautological, so + // we can just remove it. + upper_len = 0; + } + } + + VLOG(4) << "Lower: " << key_schema_->DebugRowKey(lower_key) << "(" << lower_len << ")"; + VLOG(4) << "Upper: " << key_schema_->DebugRowKey(upper_key) << "(" << upper_len << ")"; + + // Step 5. 
Erase the pushed predicates from the ScanSpec. + if (erase_pushed) { + ErasePushedPredicates(spec, was_pushed); + } + + // Step 6. Add the new range predicates to the spec. + if (lower_len) { + EncodedKey* lower = EncodedKey::FromContiguousRow(ConstContiguousRow(lower_key)).release(); + pool_.Add(lower); + spec->SetLowerBoundKey(lower); + } + if (upper_len) { + EncodedKey* upper = EncodedKey::FromContiguousRow(ConstContiguousRow(upper_key)).release(); + pool_.Add(upper); + spec->SetExclusiveUpperBoundKey(upper); + } +} + +void RangePredicateEncoder::SimplifyBounds(const ScanSpec& spec, + vector* key_bounds) const { + key_bounds->clear(); + key_bounds->resize(key_schema_->num_key_columns()); + + for (int i = 0; i < spec.predicates().size(); i++) { + const ColumnRangePredicate& pred = spec.predicates()[i]; + int idx = key_schema_->find_column(pred.column().name()); + if (idx == -1 || idx >= key_bounds->size()) { + continue; + } + const ColumnSchema& col = key_schema_->column(idx); + + // Add to the list of pushable predicates for this column. + CHECK(pred.range().has_lower_bound() || pred.range().has_upper_bound()); + (*key_bounds)[idx].orig_predicate_indexes.push_back(i); + + if (pred.range().has_upper_bound()) { + // If we haven't seen any upper bound, or this upper bound is tighter than + // (less than) the one we've seen already, replace it. + if ((*key_bounds)[idx].upper == nullptr || + col.type_info()->Compare(pred.range().upper_bound(), + (*key_bounds)[idx].upper) < 0) { + (*key_bounds)[idx].upper = pred.range().upper_bound(); + } + } + + if (pred.range().has_lower_bound()) { + // If we haven't seen any lower bound, or this lower bound is tighter than + // (greater than) the one we've seen already, replace it. 
+ if ((*key_bounds)[idx].lower == nullptr || + col.type_info()->Compare(pred.range().lower_bound(), + (*key_bounds)[idx].lower) > 0) { + (*key_bounds)[idx].lower = pred.range().lower_bound(); + } + } + } +} + +int RangePredicateEncoder::CountKeyPrefixEqualities( + const vector& key_bounds) const { + + int i = 0; + for (; i < key_schema_->num_key_columns(); i++) { + if (!key_bounds[i].lower || !key_bounds[i].upper) { + break; + } + ColumnRangePredicate pred(key_schema_->column(i), + key_bounds[i].lower, + key_bounds[i].upper); + if (!pred.range().IsEquality()) break; + } + return i; +} + +void RangePredicateEncoder::ErasePushedPredicates( + ScanSpec *spec, const vector& should_erase) const { + int num_preds = spec->predicates().size(); + CHECK_EQ(should_erase.size(), num_preds); + + vector new_preds; + new_preds.reserve(num_preds); + + for (int i = 0; i < num_preds; i++) { + if (!should_erase[i]) { + new_preds.push_back(spec->predicates()[i]); + } + } + spec->mutable_predicates()->swap(new_preds); +} + +} // namespace kudu diff --git a/src/kudu/common/predicate_encoder.h b/src/kudu/common/predicate_encoder.h new file mode 100644 index 000000000000..70df3819ca1e --- /dev/null +++ b/src/kudu/common/predicate_encoder.h @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_PREDICATE_ENCODER_H +#define KUDU_COMMON_PREDICATE_ENCODER_H + +#include +#include + + +#include "kudu/common/encoded_key.h" +#include "kudu/common/row.h" +#include "kudu/common/scan_spec.h" +#include "kudu/common/schema.h" +#include "kudu/util/auto_release_pool.h" + +namespace kudu { + +using std::vector; + +// Encodes a list of column predicates into key-range predicates. +// Uses an AutoReleasePool to allocate EncodedKey instances, +// which means the lifetime of RangePredicateEncoder must be >= the +// lifetime of any classes that access the ScanSpec. +class RangePredicateEncoder { + public: + // 'key_schema' is not copied and must remain valid for the lifetime + // of this object. + // + // Some parts of the resulting predicates may be allocated out of 'arena' + // and thus 'arena' must not be reset or destructed until after any ScanSpecs + // modified by this encoder have been destroyed. + RangePredicateEncoder(const Schema* key_schema, Arena* arena); + + // Encodes the predicates found in 'spec' into a key range which is + // then emitted back into 'spec'. + // + // If 'erase_pushed' is true, pushed predicates are removed from 'spec'. + void EncodeRangePredicates(ScanSpec *spec, bool erase_pushed); + + private: + friend class TestRangePredicateEncoder; + FRIEND_TEST(CompositeIntKeysTest, TestSimplify); + + struct SimplifiedBounds { + SimplifiedBounds() : upper(NULL), lower(NULL) {} + const void* upper; + const void* lower; + vector orig_predicate_indexes; + }; + + void SimplifyBounds(const ScanSpec& spec, + std::vector* key_bounds) const; + + // Returns the number of contiguous equalities in the key prefix. + int CountKeyPrefixEqualities(const std::vector& bounds) const; + + // Erases any predicates we've encoded from the predicate list within the + // ScanSpec. 
+ void ErasePushedPredicates( + ScanSpec *spec, const std::vector& should_erase) const; + + const Schema* key_schema_; + Arena* arena_; + AutoReleasePool pool_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/common/row.h b/src/kudu/common/row.h new file mode 100644 index 000000000000..05c570aaa5a3 --- /dev/null +++ b/src/kudu/common/row.h @@ -0,0 +1,760 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_ROW_H +#define KUDU_COMMON_ROW_H + +#include +#include +#include +#include + +#include "kudu/common/types.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +// A simple cell of data which directly corresponds to a pointer value. +// stack. +struct SimpleConstCell { + public: + // Both parameters must remain valid for the lifetime of the cell object. 
+ SimpleConstCell(const ColumnSchema* col_schema, + const void* value) + : col_schema_(col_schema), + value_(value) { + } + + const TypeInfo* typeinfo() const { return col_schema_->type_info(); } + size_t size() const { return col_schema_->type_info()->size(); } + bool is_nullable() const { return col_schema_->is_nullable(); } + const void* ptr() const { return value_; } + bool is_null() const { return value_ == NULL; } + + private: + const ColumnSchema* col_schema_; + const void* value_; +}; + +// Copy the cell data from 'src' to 'dst'. This only copies the data, and not +// the null state. Use CopyCell() if you need to copy the null-ness. +// +// If dst_arena is non-NULL, relocates the data into the given arena. +template +Status CopyCellData(const SrcCellType &src, DstCellType* dst, ArenaType *dst_arena) { + DCHECK_EQ(src.typeinfo()->type(), dst->typeinfo()->type()); + + if (src.typeinfo()->physical_type() == BINARY) { + // If it's a Slice column, need to relocate the referred-to data + // as well as the slice itself. + // TODO: potential optimization here: if the new value is smaller than + // the old value, we could potentially just overwrite in some cases. + const Slice *src_slice = reinterpret_cast(src.ptr()); + Slice *dst_slice = reinterpret_cast(dst->mutable_ptr()); + if (dst_arena != NULL) { + if (PREDICT_FALSE(!dst_arena->RelocateSlice(*src_slice, dst_slice))) { + return Status::IOError("out of memory copying slice", src_slice->ToString()); + } + } else { + // Just copy the slice without relocating. + // This is used by callers who know that the source row's data is going + // to stick around for the scope of the destination. + *dst_slice = *src_slice; + } + } else { + memcpy(dst->mutable_ptr(), src.ptr(), src.size()); // TODO: inline? + } + return Status::OK(); +} + +// Copy the cell from 'src' to 'dst'. +// +// This copies the data, and relocates indirect data into the given arena, +// if it is not NULL. 
+template +Status CopyCell(const SrcCellType &src, DstCellType* dst, ArenaType *dst_arena) { + if (src.is_nullable()) { + // Copy the null state. + dst->set_null(src.is_null()); + if (src.is_null()) { + // no need to copy any data contents once we marked the destination + // cell as null. + return Status::OK(); + } + } + + return CopyCellData(src, dst, dst_arena); +} + +// Copy all of the cells from one row to another. The two rows must share +// the same Schema. If they do not, use ProjectRow() below. +// This can be used to translate between columnar and row-wise layout, for example. +// +// If 'dst_arena' is set, then will relocate any indirect data to that arena +// during the copy. +template +inline Status CopyRow(const RowType1 &src_row, RowType2 *dst_row, ArenaType *dst_arena) { + DCHECK_SCHEMA_EQ(*src_row.schema(), *dst_row->schema()); + + for (int i = 0; i < src_row.schema()->num_columns(); i++) { + typename RowType1::Cell src = src_row.cell(i); + typename RowType2::Cell dst = dst_row->cell(i); + RETURN_NOT_OK(CopyCell(src, &dst, dst_arena)); + } + + return Status::OK(); +} + +// Projection mapping for the specified schemas. +// A projection may contain: +// - columns that are present in the "base schema" +// - columns that are present in the "base schema" but with different types. +// In this case an adapter should be used (e.g. INT8 to INT64, INT8 to STRING, ...) +// - columns that are not present in the "base schema". +// In this case the default value of the projection column will be used. +// +// Example: +// RowProjector projector. +// projector.Init(base_schema, projection); +// projector.ProjectRow(row_a, &row_b, &row_b_arena); +class RowProjector { + public: + typedef std::pair ProjectionIdxMapping; + + // Construct a projector. + // The two Schema pointers must remain valid for the lifetime of this object. 
+ RowProjector(const Schema* base_schema, const Schema* projection) + : base_schema_(base_schema), projection_(projection), + is_identity_(base_schema->Equals(*projection)) { + } + + // Initialize the projection mapping with the specified base_schema and projection + Status Init() { + return projection_->GetProjectionMapping(*base_schema_, this); + } + + Status Reset(const Schema* base_schema, const Schema* projection) { + base_schema_ = base_schema; + projection_ = projection; + base_cols_mapping_.clear(); + adapter_cols_mapping_.clear(); + projection_defaults_.clear(); + is_identity_ = base_schema->Equals(*projection); + return Init(); + } + + // Project a row from one schema into another, using the projection mapping. + // Indirected data is copied into the provided dst arena. + // + // Use this method only on the read-path. + // The col_schema.read_default_value() will be used. + template + Status ProjectRowForRead(const RowType1& src_row, RowType2 *dst_row, ArenaType *dst_arena) const { + return ProjectRow(src_row, dst_row, dst_arena); + } + + // Project a row from one schema into another, using the projection mapping. + // Indirected data is copied into the provided dst arena. + // + // Use this method only on the write-path. + // The col_schema.write_default_value() will be used. + template + Status ProjectRowForWrite(const RowType1& src_row, RowType2 *dst_row, + ArenaType *dst_arena) const { + return ProjectRow(src_row, dst_row, dst_arena); + } + + bool is_identity() const { return is_identity_; } + const Schema* projection() const { return projection_; } + const Schema* base_schema() const { return base_schema_; } + + // Returns the mapping between base schema and projection schema columns + // first: is the projection column index, second: is the base_schema index + const vector& base_cols_mapping() const { return base_cols_mapping_; } + + // Returns the mapping between base schema and projection schema columns + // that requires a type adapter. 
+ // first: is the projection column index, second: is the base_schema index + const vector& adapter_cols_mapping() const { return adapter_cols_mapping_; } + + // Returns the projection indexes of the columns to add with a default value. + // + // These are columns which are present in 'projection_' but not in 'base_schema', + // and for which 'projection' has a default. + const vector& projection_defaults() const { return projection_defaults_; } + + private: + friend class Schema; + + Status ProjectBaseColumn(size_t proj_col_idx, size_t base_col_idx) { + base_cols_mapping_.push_back(ProjectionIdxMapping(proj_col_idx, base_col_idx)); + return Status::OK(); + } + + Status ProjectAdaptedColumn(size_t proj_col_idx, size_t base_col_idx) { + adapter_cols_mapping_.push_back(ProjectionIdxMapping(proj_col_idx, base_col_idx)); + return Status::OK(); + } + + Status ProjectDefaultColumn(size_t proj_col_idx) { + projection_defaults_.push_back(proj_col_idx); + return Status::OK(); + } + + Status ProjectExtraColumn(size_t proj_col_idx) { + return Status::InvalidArgument( + "The column '" + projection_->column(proj_col_idx).name() + + "' does not exist in the projection, and it does not have a " + "default value or a nullable type"); + } + + private: + // Project a row from one schema into another, using the projection mapping. + // Indirected data is copied into the provided dst arena. 
+ template + Status ProjectRow(const RowType1& src_row, RowType2 *dst_row, ArenaType *dst_arena) const { + DCHECK_SCHEMA_EQ(*base_schema_, *src_row.schema()); + DCHECK_SCHEMA_EQ(*projection_, *dst_row->schema()); + + // Copy directly from base Data + for (const auto& base_mapping : base_cols_mapping_) { + typename RowType1::Cell src_cell = src_row.cell(base_mapping.second); + typename RowType2::Cell dst_cell = dst_row->cell(base_mapping.first); + RETURN_NOT_OK(CopyCell(src_cell, &dst_cell, dst_arena)); + } + + // TODO: Copy Adapted base Data + DCHECK(adapter_cols_mapping_.size() == 0) << "Value Adapter not supported yet"; + + // Fill with Defaults + for (auto proj_idx : projection_defaults_) { + const ColumnSchema& col_proj = projection_->column(proj_idx); + const void *vdefault = FOR_READ ? col_proj.read_default_value() : + col_proj.write_default_value(); + SimpleConstCell src_cell(&col_proj, vdefault); + typename RowType2::Cell dst_cell = dst_row->cell(proj_idx); + RETURN_NOT_OK(CopyCell(src_cell, &dst_cell, dst_arena)); + } + + return Status::OK(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(RowProjector); + + vector base_cols_mapping_; + vector adapter_cols_mapping_; + vector projection_defaults_; + + const Schema* base_schema_; + const Schema* projection_; + bool is_identity_; +}; + +// Projection mapping from the schema used to encode a RowChangeList +// to the new specified schema. Used on the read/compaction path to +// project the deltas to the user/latest specified projection. +// +// A projection may contain: +// - columns that are present in the "base schema" +// - columns that are present in the "base schema" but with different types. +// In this case an adapter should be used (e.g. INT8 to INT64, INT8 to STRING, ...) +// - columns that are not present in the "base schema". +// These columns are not considered since they cannot be in the delta. 
+class DeltaProjector { + public: + // The delta_schema and projection must remain valid for the lifetime + // of the object. + DeltaProjector(const Schema* delta_schema, const Schema* projection) + : delta_schema_(delta_schema), projection_(projection), + is_identity_(delta_schema->Equals(*projection)) { + } + + Status Init() { + // TODO: doesn't look like this uses the is_identity performance + // shortcut + return projection_->GetProjectionMapping(*delta_schema_, this); + } + + bool is_identity() const { return is_identity_; } + + const Schema* projection() const { return projection_; } + const Schema* delta_schema() const { return delta_schema_; } + + bool get_base_col_from_proj_idx(size_t proj_col_idx, size_t *base_col_idx) const { + return FindCopy(base_cols_mapping_, proj_col_idx, base_col_idx); + } + + bool get_adapter_col_from_proj_idx(size_t proj_col_idx, size_t *base_col_idx) const { + return FindCopy(adapter_cols_mapping_, proj_col_idx, base_col_idx); + } + + // TODO: Discourage the use of this. 
At the moment is only in RowChangeList::Project + bool get_proj_col_from_base_id(size_t col_id, size_t *proj_col_idx) const { + return FindCopy(rbase_cols_mapping_, col_id, proj_col_idx); + } + + bool get_proj_col_from_adapter_id(size_t col_id, size_t *proj_col_idx) const { + return FindCopy(radapter_cols_mapping_, col_id, proj_col_idx); + } + + private: + friend class ::kudu::Schema; + + Status ProjectBaseColumn(size_t proj_col_idx, size_t base_col_idx) { + base_cols_mapping_[proj_col_idx] = base_col_idx; + if (delta_schema_->has_column_ids()) { + rbase_cols_mapping_[delta_schema_->column_id(base_col_idx)] = proj_col_idx; + } else { + rbase_cols_mapping_[proj_col_idx] = proj_col_idx; + } + return Status::OK(); + } + + Status ProjectAdaptedColumn(size_t proj_col_idx, size_t base_col_idx) { + adapter_cols_mapping_[proj_col_idx] = base_col_idx; + if (delta_schema_->has_column_ids()) { + radapter_cols_mapping_[delta_schema_->column_id(base_col_idx)] = proj_col_idx; + } else { + radapter_cols_mapping_[proj_col_idx] = proj_col_idx; + } + return Status::OK(); + } + + Status ProjectDefaultColumn(size_t proj_col_idx) { + // Not used, since deltas are update... 
+ // we don't have this column, so we don't have updates + return Status::OK(); + } + + Status ProjectExtraColumn(size_t proj_col_idx) { + return Status::InvalidArgument( + "The column '" + delta_schema_->column(proj_col_idx).name() + + "' does not exist in the projection, and it does not have a " + "default value or a nullable type"); + } + + private: + DISALLOW_COPY_AND_ASSIGN(DeltaProjector); + + std::unordered_map base_cols_mapping_; // [proj_idx] = base_idx + std::unordered_map rbase_cols_mapping_; // [id] = proj_idx + + std::unordered_map adapter_cols_mapping_; // [proj_idx] = base_idx + std::unordered_map radapter_cols_mapping_; // [id] = proj_idx + + const Schema* delta_schema_; + const Schema* projection_; + bool is_identity_; +}; + +// Copy any indirect (eg STRING) data referenced by the given row into the +// provided arena. +// +// The row itself is mutated so that the indirect data points to the relocated +// storage. +template +inline Status RelocateIndirectDataToArena(RowType *row, ArenaType *dst_arena) { + const Schema* schema = row->schema(); + // For any Slice columns, copy the sliced data into the arena + // and update the pointers + for (int i = 0; i < schema->num_columns(); i++) { + typename RowType::Cell cell = row->cell(i); + if (cell.typeinfo()->physical_type() == BINARY) { + if (cell.is_nullable() && cell.is_null()) { + continue; + } + + Slice *slice = reinterpret_cast(cell.mutable_ptr()); + if (!dst_arena->RelocateSlice(*slice, slice)) { + return Status::IOError("Unable to relocate slice"); + } + } + } + return Status::OK(); +} + + +class ContiguousRowHelper { + public: + static size_t null_bitmap_size(const Schema& schema) { + return schema.has_nullables() ? 
BitmapSize(schema.num_columns()) : 0; + } + + static uint8_t* null_bitmap_ptr(const Schema& schema, uint8_t* row_data) { + return row_data + schema.byte_size(); + } + + static size_t row_size(const Schema& schema) { + return schema.byte_size() + null_bitmap_size(schema); + } + + static void InitNullsBitmap(const Schema& schema, Slice& row_data) { + InitNullsBitmap(schema, row_data.mutable_data(), row_data.size() - schema.byte_size()); + } + + static void InitNullsBitmap(const Schema& schema, uint8_t *row_data, size_t bitmap_size) { + uint8_t *null_bitmap = row_data + schema.byte_size(); + for (size_t i = 0; i < bitmap_size; ++i) { + null_bitmap[i] = 0x00; + } + } + + static bool is_null(const Schema& schema, const uint8_t *row_data, size_t col_idx) { + DCHECK(schema.column(col_idx).is_nullable()); + return BitmapTest(row_data + schema.byte_size(), col_idx); + } + + static void SetCellIsNull(const Schema& schema, uint8_t *row_data, size_t col_idx, bool is_null) { + uint8_t *null_bitmap = row_data + schema.byte_size(); + BitmapChange(null_bitmap, col_idx, is_null); + } + + static const uint8_t *cell_ptr(const Schema& schema, const uint8_t *row_data, size_t col_idx) { + return row_data + schema.column_offset(col_idx); + } + + static const uint8_t *nullable_cell_ptr(const Schema& schema, + const uint8_t *row_data, + size_t col_idx) { + return is_null(schema, row_data, col_idx) ? 
NULL : cell_ptr(schema, row_data, col_idx); + } +}; + +template +class ContiguousRowCell { + public: + ContiguousRowCell(const ContiguousRowType* row, int idx) + : row_(row), col_idx_(idx) { + } + + const TypeInfo* typeinfo() const { return type_info(); } + size_t size() const { return type_info()->size(); } + const void* ptr() const { return row_->cell_ptr(col_idx_); } + void* mutable_ptr() const { return row_->mutable_cell_ptr(col_idx_); } + bool is_nullable() const { return row_->schema()->column(col_idx_).is_nullable(); } + bool is_null() const { return row_->is_null(col_idx_); } + void set_null(bool is_null) const { row_->set_null(col_idx_, is_null); } + + private: + const TypeInfo* type_info() const { + return row_->schema()->column(col_idx_).type_info(); + } + + const ContiguousRowType* row_; + int col_idx_; +}; + +// The row has all columns layed out in memory based on the schema.column_offset() +class ContiguousRow { + public: + typedef ContiguousRowCell Cell; + + explicit ContiguousRow(const Schema* schema, uint8_t *row_data = NULL) + : schema_(schema), row_data_(row_data) { + } + + const Schema* schema() const { + return schema_; + } + + void Reset(uint8_t *row_data) { + row_data_ = row_data; + } + + bool is_null(size_t col_idx) const { + return ContiguousRowHelper::is_null(*schema_, row_data_, col_idx); + } + + void set_null(size_t col_idx, bool is_null) const { + ContiguousRowHelper::SetCellIsNull(*schema_, row_data_, col_idx, is_null); + } + + const uint8_t *cell_ptr(size_t col_idx) const { + return ContiguousRowHelper::cell_ptr(*schema_, row_data_, col_idx); + } + + uint8_t *mutable_cell_ptr(size_t col_idx) const { + return const_cast(cell_ptr(col_idx)); + } + + const uint8_t *nullable_cell_ptr(size_t col_idx) const { + return ContiguousRowHelper::nullable_cell_ptr(*schema_, row_data_, col_idx); + } + + Cell cell(size_t col_idx) const { + return Cell(this, col_idx); + } + + private: + friend class ConstContiguousRow; + + const Schema* schema_; + 
uint8_t *row_data_; +}; + +// This is the same as ContiguousRow except it refers to a const area of memory that +// should not be mutated. +class ConstContiguousRow { + public: + typedef ContiguousRowCell Cell; + + explicit ConstContiguousRow(const ContiguousRow &row) + : schema_(row.schema_), + row_data_(row.row_data_) { + } + + ConstContiguousRow(const Schema* schema, const void *row_data) + : schema_(schema), row_data_(reinterpret_cast(row_data)) { + } + + ConstContiguousRow(const Schema* schema, const Slice& row_slice) + : schema_(schema), row_data_(row_slice.data()) { + } + + const Schema* schema() const { + return schema_; + } + + const uint8_t *row_data() const { + return row_data_; + } + + size_t row_size() const { + return ContiguousRowHelper::row_size(*schema_); + } + + bool is_null(size_t col_idx) const { + return ContiguousRowHelper::is_null(*schema_, row_data_, col_idx); + } + + const uint8_t *cell_ptr(size_t col_idx) const { + return ContiguousRowHelper::cell_ptr(*schema_, row_data_, col_idx); + } + + const uint8_t *nullable_cell_ptr(size_t col_idx) const { + return ContiguousRowHelper::nullable_cell_ptr(*schema_, row_data_, col_idx); + } + + Cell cell(size_t col_idx) const { + return Cell(this, col_idx); + } + + private: + const Schema* schema_; + const uint8_t *row_data_; +}; + +// Delete functions from ContiguousRowCell that can mutate the cell by +// specializing for ConstContiguousRow. +template<> +void* ContiguousRowCell::mutable_ptr() const; +template<> +void ContiguousRowCell::set_null(bool null) const; + + +// Utility class for building rows corresponding to a given schema. +// This is used only by tests. +// TODO: move it into a test utility. +class RowBuilder { + public: + explicit RowBuilder(const Schema& schema) + : schema_(schema), + arena_(1024, 1024*1024), + bitmap_size_(ContiguousRowHelper::null_bitmap_size(schema)) { + Reset(); + } + + // Reset the RowBuilder so that it is ready to build + // the next row. 
+ // NOTE: The previous row's data is invalidated. Even + // if the previous row's data has been copied, indirected + // entries such as strings may end up shared or deallocated + // after Reset. So, the previous row must be fully copied + // (eg using CopyRowToArena()). + void Reset() { + arena_.Reset(); + size_t row_size = schema_.byte_size() + bitmap_size_; + buf_ = reinterpret_cast(arena_.AllocateBytes(row_size)); + CHECK(buf_) << "could not allocate " << row_size << " bytes for row builder"; + col_idx_ = 0; + byte_idx_ = 0; + ContiguousRowHelper::InitNullsBitmap(schema_, buf_, bitmap_size_); + } + + void AddString(const Slice &slice) { + CheckNextType(STRING); + AddSlice(slice); + } + + void AddString(const string &str) { + CheckNextType(STRING); + AddSlice(str); + } + + void AddBinary(const Slice &slice) { + CheckNextType(BINARY); + AddSlice(slice); + } + + void AddBinary(const string &str) { + CheckNextType(BINARY); + AddSlice(str); + } + + void AddInt8(int8_t val) { + CheckNextType(INT8); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddUint8(uint8_t val) { + CheckNextType(UINT8); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddInt16(int16_t val) { + CheckNextType(INT16); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddUint16(uint16_t val) { + CheckNextType(UINT16); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddInt32(int32_t val) { + CheckNextType(INT32); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddUint32(uint32_t val) { + CheckNextType(UINT32); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddInt64(int64_t val) { + CheckNextType(INT64); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddTimestamp(int64_t micros_utc_since_epoch) { + CheckNextType(TIMESTAMP); + *reinterpret_cast(&buf_[byte_idx_]) = micros_utc_since_epoch; + Advance(); + } + + void AddUint64(uint64_t val) { + 
CheckNextType(UINT64); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddFloat(float val) { + CheckNextType(FLOAT); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddDouble(double val) { + CheckNextType(DOUBLE); + *reinterpret_cast(&buf_[byte_idx_]) = val; + Advance(); + } + + void AddNull() { + CHECK(schema_.column(col_idx_).is_nullable()); + BitmapSet(buf_ + schema_.byte_size(), col_idx_); + Advance(); + } + + // Retrieve the data slice from the current row. + // The Add*() functions must have been called an appropriate + // number of times such that all columns are filled in, or else + // a crash will occur. + // + // The data slice returned by this is only valid until the next + // call to Reset(). + // Note that the Slice may also contain pointers which refer to + // other parts of the internal Arena, so even if the returned + // data is copied, it is not safe to Reset() before also calling + // CopyRowIndirectDataToArena. + const Slice data() const { + CHECK_EQ(byte_idx_, schema_.byte_size()); + return Slice(buf_, byte_idx_ + bitmap_size_); + } + + const Schema& schema() const { + return schema_; + } + + ConstContiguousRow row() const { + return ConstContiguousRow(&schema_, data()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(RowBuilder); + + void AddSlice(const Slice &slice) { + Slice *ptr = reinterpret_cast(buf_ + byte_idx_); + CHECK(arena_.RelocateSlice(slice, ptr)) << "could not allocate space in arena"; + + Advance(); + } + + void AddSlice(const string &str) { + uint8_t *in_arena = arena_.AddSlice(str); + CHECK(in_arena) << "could not allocate space in arena"; + + Slice *ptr = reinterpret_cast(buf_ + byte_idx_); + *ptr = Slice(in_arena, str.size()); + + Advance(); + } + + void CheckNextType(DataType type) { + CHECK_EQ(schema_.column(col_idx_).type_info()->type(), + type); + } + + void Advance() { + int size = schema_.column(col_idx_).type_info()->size(); + byte_idx_ += size; + col_idx_++; + } + + const 
Schema schema_; + Arena arena_; + uint8_t *buf_; + + size_t col_idx_; + size_t byte_idx_; + size_t bitmap_size_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/common/row_changelist-test.cc b/src/kudu/common/row_changelist-test.cc new file mode 100644 index 000000000000..c5cfa7349ce6 --- /dev/null +++ b/src/kudu/common/row_changelist-test.cc @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/common/row_changelist.h" +#include "kudu/common/row.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/faststring.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +using strings::Substitute; + +class TestRowChangeList : public KuduTest { + public: + TestRowChangeList() : + schema_(CreateSchema()) + {} + + static Schema CreateSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddKeyColumn("col1", STRING)); + CHECK_OK(builder.AddColumn("col2", STRING)); + CHECK_OK(builder.AddColumn("col3", UINT32)); + CHECK_OK(builder.AddNullableColumn("col4", UINT32)); + return(builder.Build()); + } + + protected: + Schema schema_; +}; + +TEST_F(TestRowChangeList, TestEncodeDecodeUpdates) { + faststring buf; + RowChangeListEncoder rcl(&buf); + + // Construct an update with several columns changed + Slice update1("update1"); + Slice update2("update2"); + uint32 update3 = 12345; + + int c0_id = schema_.column_id(0); + int c1_id = schema_.column_id(1); + int c2_id = schema_.column_id(2); + int c3_id = schema_.column_id(3); + + rcl.AddColumnUpdate(schema_.column(0), c0_id, &update1); + rcl.AddColumnUpdate(schema_.column(1), c1_id, &update2); + rcl.AddColumnUpdate(schema_.column(2), c2_id, &update3); + rcl.AddColumnUpdate(schema_.column(3), c3_id, nullptr); + + LOG(INFO) << "Encoded: " << HexDump(buf); + + // Read it back. 
+ EXPECT_EQ(string("SET col1=update1, col2=update2, col3=12345, col4=NULL"), + RowChangeList(Slice(buf)).ToString(schema_)); + + RowChangeListDecoder decoder((RowChangeList(buf))); + ASSERT_OK(decoder.Init()); + RowChangeListDecoder::DecodedUpdate dec; + ASSERT_TRUE(decoder.HasNext()); + ASSERT_OK(decoder.DecodeNext(&dec)); + ASSERT_EQ(c0_id, dec.col_id); + ASSERT_EQ(update1, dec.raw_value); + + ASSERT_TRUE(decoder.HasNext()); + ASSERT_OK(decoder.DecodeNext(&dec)); + ASSERT_EQ(c1_id, dec.col_id); + ASSERT_EQ(update2, dec.raw_value); + + ASSERT_TRUE(decoder.HasNext()); + ASSERT_OK(decoder.DecodeNext(&dec)); + ASSERT_EQ(c2_id, dec.col_id); + ASSERT_EQ("90\\x00\\x00", dec.raw_value.ToDebugString()); + + ASSERT_TRUE(decoder.HasNext()); + ASSERT_OK(decoder.DecodeNext(&dec)); + ASSERT_EQ(c3_id, dec.col_id); + ASSERT_TRUE(dec.null); + + ASSERT_FALSE(decoder.HasNext()); + + // ToString() with unknown columns should still be able to parse + // the whole changelist. + EXPECT_EQ(Substitute("SET [unknown column id $0]=update1, " + "[unknown column id $1]=update2, " + "[unknown column id $2]=90\\x00\\x00, " + "[unknown column id $3]=NULL", + c0_id, c1_id, c2_id, c3_id), + RowChangeList(Slice(buf)).ToString(Schema())); +} + +TEST_F(TestRowChangeList, TestDeletes) { + faststring buf; + RowChangeListEncoder rcl(&buf); + + // Construct a deletion. + rcl.SetToDelete(); + + LOG(INFO) << "Encoded: " << HexDump(buf); + + // Read it back. + EXPECT_EQ(string("DELETE"), RowChangeList(Slice(buf)).ToString(schema_)); + + RowChangeListDecoder decoder((RowChangeList(buf))); + ASSERT_OK(decoder.Init()); + ASSERT_TRUE(decoder.is_delete()); +} + +TEST_F(TestRowChangeList, TestReinserts) { + RowBuilder rb(schema_); + rb.AddString(Slice("hello")); + rb.AddString(Slice("world")); + rb.AddUint32(12345); + rb.AddNull(); + + // Construct a REINSERT. + faststring buf; + RowChangeListEncoder rcl(&buf); + rcl.SetToReinsert(rb.data()); + + LOG(INFO) << "Encoded: " << HexDump(buf); + + // Read it back. 
+ EXPECT_EQ(string("REINSERT (string col1=hello, string col2=world, " + "uint32 col3=12345, uint32 col4=NULL)"), + RowChangeList(Slice(buf)).ToString(schema_)); + + RowChangeListDecoder decoder((RowChangeList(buf))); + ASSERT_OK(decoder.Init()); + ASSERT_TRUE(decoder.is_reinsert()); + + Slice s; + ASSERT_OK(decoder.GetReinsertedRowSlice(schema_, &s)); + ASSERT_EQ(s, rb.data()); +} + +TEST_F(TestRowChangeList, TestInvalid_EmptySlice) { + RowChangeListDecoder decoder((RowChangeList(Slice()))); + ASSERT_STR_CONTAINS(decoder.Init().ToString(), + "empty changelist"); +} + +TEST_F(TestRowChangeList, TestInvalid_BadTypeEnum) { + RowChangeListDecoder decoder(RowChangeList(Slice("\xff", 1))); + ASSERT_STR_CONTAINS(decoder.Init().ToString(), + "Corruption: bad type enum value: 255 in \\xff"); +} + +TEST_F(TestRowChangeList, TestInvalid_TooLongDelete) { + RowChangeListDecoder decoder(RowChangeList(Slice("\x02""blahblah"))); + ASSERT_STR_CONTAINS(decoder.Init().ToString(), + "Corruption: DELETE changelist too long"); +} + +TEST_F(TestRowChangeList, TestInvalid_TooShortReinsert) { + RowChangeListDecoder decoder(RowChangeList(Slice("\x03"))); + ASSERT_OK(decoder.Init()); + Slice s; + ASSERT_STR_CONTAINS(decoder.GetReinsertedRowSlice(schema_, &s).ToString(), + "Corruption: REINSERT changelist wrong length"); +} + +TEST_F(TestRowChangeList, TestInvalid_SetNullForNonNullableColumn) { + faststring buf; + RowChangeListEncoder rcl(&buf); + // Set column 0 = NULL + rcl.AddRawColumnUpdate(schema_.column_id(0), true, Slice()); + + ASSERT_EQ("[invalid update: Corruption: decoded set-to-NULL " + "for non-nullable column: col1[string NOT NULL], " + "before corruption: SET ]", + RowChangeList(Slice(buf)).ToString(schema_)); +} + +TEST_F(TestRowChangeList, TestInvalid_SetWrongSizeForIntColumn) { + faststring buf; + RowChangeListEncoder rcl(&buf); + // Set column id 2 = \xff + // (column id 2 is UINT32, so should be 4 bytes) + rcl.AddRawColumnUpdate(schema_.column_id(2), false, Slice("\xff")); 
+ + ASSERT_EQ("[invalid update: Corruption: invalid value \\xff " + "for column col3[uint32 NOT NULL], " + "before corruption: SET ]", + RowChangeList(Slice(buf)).ToString(schema_)); +} + +} // namespace kudu diff --git a/src/kudu/common/row_changelist.cc b/src/kudu/common/row_changelist.cc new file mode 100644 index 000000000000..5429c9c825c5 --- /dev/null +++ b/src/kudu/common/row_changelist.cc @@ -0,0 +1,359 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/common/columnblock.h" +#include "kudu/common/row.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/row_changelist.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/faststring.h" + +using strings::Substitute; +using strings::SubstituteAndAppend; + +namespace kudu { + +string RowChangeList::ToString(const Schema &schema) const { + DCHECK_GT(encoded_data_.size(), 0); + RowChangeListDecoder decoder(*this); + + Status s = decoder.Init(); + if (!s.ok()) { + return "[invalid: " + s.ToString() + "]"; + } + + if (decoder.is_delete()) { + return string("DELETE"); + } else if (decoder.is_reinsert()) { + ConstContiguousRow row(&schema, decoder.remaining_); + return string("REINSERT ") + schema.DebugRow(row); + } else { + CHECK(decoder.is_update()) << "Unknown changelist type!"; + } + + string ret = "SET "; + + bool first = true; + while (decoder.HasNext()) { + if (!first) { + ret.append(", "); + } + first = false; + + RowChangeListDecoder::DecodedUpdate dec; + int col_idx; + const void* value; + s = decoder.DecodeNext(&dec); + if (s.ok()) { + s = dec.Validate(schema, &col_idx, &value); + } + + if (!s.ok()) { + return "[invalid update: " + s.ToString() + ", before corruption: " + ret + "]"; + } + + if (col_idx == Schema::kColumnNotFound) { + // Unknown column. + SubstituteAndAppend(&ret, "[unknown column id $0]=", dec.col_id); + if (dec.null) { + ret.append("NULL"); + } else { + ret.append(dec.raw_value.ToDebugString()); + } + } else { + // Known column. 
+ const ColumnSchema& col_schema = schema.column(col_idx); + ret.append(col_schema.name()); + ret.append("="); + if (value == nullptr) { + ret.append("NULL"); + } else { + ret.append(col_schema.Stringify(value)); + } + } + } + + return ret; +} + +void RowChangeListEncoder::AddRawColumnUpdate( + int col_id, bool is_null, Slice new_val) { + if (type_ == RowChangeList::kUninitialized) { + SetType(RowChangeList::kUpdate); + } else { + DCHECK_EQ(RowChangeList::kUpdate, type_); + } + + InlinePutVarint32(dst_, col_id); + if (is_null) { + dst_->push_back(0); + } else { + InlinePutVarint32(dst_, new_val.size() + 1); + dst_->append(new_val.data(), new_val.size()); + } +} + +void RowChangeListEncoder::AddColumnUpdate(const ColumnSchema& col_schema, + int col_id, + const void* cell_ptr) { + Slice val_slice; + if (cell_ptr != nullptr) { + if (col_schema.type_info()->physical_type() == BINARY) { + memcpy(&val_slice, cell_ptr, sizeof(val_slice)); + } else { + val_slice = Slice(reinterpret_cast(cell_ptr), + col_schema.type_info()->size()); + } + } else { + // NULL value. 
+ DCHECK(col_schema.is_nullable()); + } + + AddRawColumnUpdate(col_id, cell_ptr == nullptr, val_slice); +} + +Status RowChangeListDecoder::Init() { + if (PREDICT_FALSE(remaining_.empty())) { + return Status::Corruption("empty changelist - expected type"); + } + + bool was_valid = tight_enum_test_cast(remaining_[0], &type_); + if (PREDICT_FALSE(!was_valid || type_ == RowChangeList::kUninitialized)) { + return Status::Corruption(Substitute("bad type enum value: $0 in $1", + static_cast(remaining_[0]), + remaining_.ToDebugString())); + } + if (PREDICT_FALSE(is_delete() && remaining_.size() != 1)) { + return Status::Corruption("DELETE changelist too long", + remaining_.ToDebugString()); + } + + remaining_.remove_prefix(1); + return Status::OK(); +} + +Status RowChangeListDecoder::GetReinsertedRowSlice(const Schema& schema, Slice* s) const { + DCHECK(is_reinsert()); + + int expected_size = ContiguousRowHelper::row_size(schema); + if (remaining_.size() != expected_size) { + return Status::Corruption(Substitute("REINSERT changelist wrong length (expected $0)", + expected_size, + remaining_.ToDebugString())); + } + *s = remaining_; + return Status::OK(); +} + +Status RowChangeListDecoder::ProjectUpdate(const DeltaProjector& projector, + const RowChangeList& src, + faststring *buf) { + RowChangeListDecoder decoder(src); + RETURN_NOT_OK(decoder.Init()); + + buf->clear(); + RowChangeListEncoder encoder(buf); + if (decoder.is_delete()) { + encoder.SetToDelete(); + } else if (decoder.is_reinsert()) { + Slice reinsert; + RETURN_NOT_OK(decoder.GetReinsertedRowSlice(*projector.delta_schema(), &reinsert)); + + // ReInsert = MemStore Insert -> Delete -> (Re)Insert + ConstContiguousRow src_row = ConstContiguousRow(projector.delta_schema(), + reinsert); + RowProjector row_projector(projector.delta_schema(), projector.projection()); + size_t row_size = ContiguousRowHelper::row_size(*projector.projection()); + uint8_t buffer[row_size]; + ContiguousRow row(projector.projection(), 
buffer); + RETURN_NOT_OK(row_projector.Init()); + RETURN_NOT_OK(row_projector.ProjectRowForRead(src_row, &row, static_cast(NULL))); + encoder.SetToReinsert(Slice(buffer, row_size)); + } else if (decoder.is_update()) { + while (decoder.HasNext()) { + DecodedUpdate dec; + RETURN_NOT_OK(decoder.DecodeNext(&dec)); + int col_idx; + const void* new_val; + RETURN_NOT_OK(dec.Validate(*projector.projection(), &col_idx, &new_val)); + // If the new schema doesn't have this column, throw away the update. + if (col_idx == Schema::kColumnNotFound) { + continue; + } + + encoder.AddRawColumnUpdate(dec.col_id, dec.null, dec.raw_value); + } + } + return Status::OK(); +} + +Status RowChangeListDecoder::ApplyRowUpdate(RowBlockRow *dst_row, + Arena *arena, RowChangeListEncoder* undo_encoder) { + const Schema* dst_schema = dst_row->schema(); + + while (HasNext()) { + DecodedUpdate dec; + RETURN_NOT_OK(DecodeNext(&dec)); + int col_idx; + const void* value; + RETURN_NOT_OK(dec.Validate(*dst_schema, &col_idx, &value)); + + // If the delta is for a column ID not part of the projection + // we're scanning, just skip over it. 
+ if (col_idx == Schema::kColumnNotFound) { + continue; + } + + const ColumnSchema& col_schema = dst_schema->column(col_idx); + SimpleConstCell src(&col_schema, value); + + // save the old cell on the undo encoder + RowBlockRow::Cell dst_cell = dst_row->cell(col_idx); + undo_encoder->AddColumnUpdate(col_schema, dec.col_id, dst_cell.ptr()); + + // copy the new cell to the row + RETURN_NOT_OK(CopyCell(src, &dst_cell, arena)); + } + return Status::OK(); +} + + + +Status RowChangeListDecoder::ApplyToOneColumn(size_t row_idx, ColumnBlock* dst_col, + const Schema& dst_schema, + int col_idx, Arena *arena) { + DCHECK_EQ(RowChangeList::kUpdate, type_); + + const ColumnSchema& col_schema = dst_schema.column(col_idx); + ColumnId col_id = dst_schema.column_id(col_idx); + + while (HasNext()) { + DecodedUpdate dec; + RETURN_NOT_OK(DecodeNext(&dec)); + if (dec.col_id != col_id) { + continue; + } + + int junk_col_idx; + const void* new_val; + RETURN_NOT_OK(dec.Validate(dst_schema, &junk_col_idx, &new_val)); + DCHECK_EQ(junk_col_idx, col_idx); + + SimpleConstCell src(&col_schema, new_val); + ColumnBlock::Cell dst_cell = dst_col->cell(row_idx); + RETURN_NOT_OK(CopyCell(src, &dst_cell, arena)); + // TODO: could potentially break; here if we're guaranteed to only have one update + // per column in a RowChangeList (which would make sense!) 
+ } + return Status::OK(); +} + +Status RowChangeListDecoder::RemoveColumnIdsFromChangeList(const RowChangeList& src, + const std::vector& col_ids, + RowChangeListEncoder* out) { + RowChangeListDecoder decoder(src); + RETURN_NOT_OK(decoder.Init()); + if (decoder.is_delete()) { + out->SetToDelete(); + } else if (decoder.is_reinsert()) { + out->SetToReinsert(decoder.remaining_); + } else if (decoder.is_update()) { + while (decoder.HasNext()) { + DecodedUpdate dec; + RETURN_NOT_OK(decoder.DecodeNext(&dec)); + if (!std::binary_search(col_ids.begin(), col_ids.end(), dec.col_id)) { + out->AddRawColumnUpdate(dec.col_id, dec.null, dec.raw_value); + } + } + } + return Status::OK(); +} + +Status RowChangeListDecoder::DecodeNext(DecodedUpdate* dec) { + DCHECK_NE(type_, RowChangeList::kUninitialized) << "Must call Init()"; + // Decode the column id. + uint32_t id; + if (PREDICT_FALSE(!GetVarint32(&remaining_, &id))) { + return Status::Corruption("Invalid column ID varint in delta"); + } + dec->col_id = id; + + uint32_t size; + if (PREDICT_FALSE(!GetVarint32(&remaining_, &size))) { + return Status::Corruption("Invalid size varint in delta"); + } + + dec->null = size == 0; + if (dec->null) { + return Status::OK(); + } + + size--; + + if (PREDICT_FALSE(remaining_.size() < size)) { + return Status::Corruption( + Substitute("truncated value for column id $0, expected $1 bytes, only $2 remaining", + id, size, remaining_.size())); + } + + dec->raw_value = Slice(remaining_.data(), size); + remaining_.remove_prefix(size); + return Status::OK(); +} + +Status RowChangeListDecoder::DecodedUpdate::Validate(const Schema& schema, + int* col_idx, + const void** value) const { + *col_idx = schema.find_column_by_id(this->col_id); + if (*col_idx == Schema::kColumnNotFound) { + return Status::OK(); + } + + // It's a valid column - validate it. 
+ const ColumnSchema& col = schema.column(*col_idx); + + if (null) { + if (!col.is_nullable()) { + return Status::Corruption("decoded set-to-NULL for non-nullable column", + col.ToString()); + } + *value = nullptr; + return Status::OK(); + } + + if (col.type_info()->physical_type() == BINARY) { + *value = &this->raw_value; + return Status::OK(); + } + + if (PREDICT_FALSE(col.type_info()->size() != this->raw_value.size())) { + return Status::Corruption(Substitute( + "invalid value $0 for column $1", + this->raw_value.ToDebugString(), col.ToString())); + } + + *value = reinterpret_cast(this->raw_value.data()); + + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/common/row_changelist.h b/src/kudu/common/row_changelist.h new file mode 100644 index 000000000000..c48af7573e05 --- /dev/null +++ b/src/kudu/common/row_changelist.h @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Row changelists are simply an encoded form of a list of updates to columns +// within a row. These are stored within the delta memstore and delta files. 
+#ifndef KUDU_COMMON_ROW_CHANGELIST_H +#define KUDU_COMMON_ROW_CHANGELIST_H + +#include +#include +#include + +#include "kudu/common/row.h" +#include "kudu/gutil/casts.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +class faststring; + +class Arena; +class ColumnBlock; +class RowBlockRow; +class Schema; + +// A RowChangeList is a wrapper around a Slice which contains a "changelist". +// +// A changelist is a single mutation to a row -- it may be one of three types: +// - UPDATE (set a new value for one or more columns) +// - DELETE (remove the row) +// - REINSERT (re-insert a "ghost" row, used only in the MemRowSet) +// +// RowChangeLists should be constructed using RowChangeListEncoder, and read +// using RowChangeListDecoder. NOTE that the schema passed to the Decoder must +// be the same one used by the Encoder. +// +// The actual serialization format is as follows: +// +// The first byte indicates the RCL type. The values are specified by +// ChangeType below. +// +// If type == kDelete, then no further data follows. The row is deleted. +// +// If type == kReinsert, then a "tuple-format" row follows. TODO: this will +// be changed by http://gerrit.sjc.cloudera.com:8080/#/c/6318/ in the near future. +// +// If type == kUpdate, then a sequence of column updates follow. Each update +// has the format: +// +// -- varint32 +// The ID of the column to be updated. +// +// -- varint32 +// If 0, then indicates that the column is to be set to NULL. +// Otherwise, encodes the actual data length + 1. +// +// -- length determined by previous field +// The value to which to set the column. In the case of a STRING field, +// this is the actual string value. Otherwise, this is a fixed-length +// field whose length is determined by the type. 
+// +// A few examples follow: +// +// 1) UPDATE SET [col_id 2] = "hello" +// 0x01 0x02 0x06 h e l l o +// UPDATE col_id=2 len(hello)+1 +// +// 2) UPDATE SET [col_id 3] = 33 (assuming INT32 column) +// 0x01 0x03 0x05 0x21 0x00 0x00 0x00 +// UPDATE col_id=3 sizeof(int32)+1 +// +// 3) UPDATE SET [col id 3] = NULL +// 0x01 0x03 0x00 +// UPDATE col_id=3 NULL +// +class RowChangeList { + public: + RowChangeList() {} + + explicit RowChangeList(const faststring &fs) + : encoded_data_(fs) { + } + + explicit RowChangeList(Slice s) : encoded_data_(std::move(s)) {} + + // Create a RowChangeList which represents a delete. + // This points to static (const) memory and should not be + // mutated or freed. + static RowChangeList CreateDelete() { + return RowChangeList(Slice("\x02")); + } + + const Slice &slice() const { return encoded_data_; } + + // Return a string form of this changelist. + string ToString(const Schema &schema) const; + + bool is_reinsert() const { + DCHECK_GT(encoded_data_.size(), 0); + return encoded_data_[0] == kReinsert; + } + + bool is_delete() const { + DCHECK_GT(encoded_data_.size(), 0); + return encoded_data_[0] == kDelete; + } + + bool is_null() const { + return encoded_data_.size() == 0; + } + + enum ChangeType { + ChangeType_min = 0, + kUninitialized = 0, + kUpdate = 1, + kDelete = 2, + kReinsert = 3, + ChangeType_max = 3 + }; + + Slice encoded_data_; +}; + +class RowChangeListEncoder { + public: + // Construct a new encoder. + explicit RowChangeListEncoder(faststring *dst) : + type_(RowChangeList::kUninitialized), + dst_(dst) + {} + + void Reset() { + dst_->clear(); + type_ = RowChangeList::kUninitialized; + } + + void SetToDelete() { + SetType(RowChangeList::kDelete); + } + + // TODO: This doesn't currently copy the indirected data, so + // REINSERT deltas can't possibly work anywhere but in memory. + // For now, there is an assertion in the DeltaFile flush code + // that prevents us from accidentally depending on this anywhere + // but in-memory. 
+ void SetToReinsert(const Slice &row_data) { + SetType(RowChangeList::kReinsert); + dst_->append(row_data.data(), row_data.size()); + } + + // Add a column update, given knowledge of the schema. + // + // If 'cell_ptr' is NULL, then 'col_schema' must refer to a nullable + // column, and we encode SET [col]=NULL. + // + // Otherwise, 'cell_ptr' should point to the in-memory format for the + // appropriate type. For example, for a STRING column, 'cell_ptr' + // should be a Slice*. + void AddColumnUpdate(const ColumnSchema& col_schema, + int col_id, + const void* cell_ptr); + + + RowChangeList as_changelist() { + DCHECK_GT(dst_->size(), 0); + return RowChangeList(*dst_); + } + + bool is_initialized() const { + return type_ != RowChangeList::kUninitialized; + } + + bool is_empty() { + return dst_->size() == 0; + } + + private: + FRIEND_TEST(TestRowChangeList, TestInvalid_SetNullForNonNullableColumn); + FRIEND_TEST(TestRowChangeList, TestInvalid_SetWrongSizeForIntColumn); + friend class RowChangeListDecoder; + + void SetType(RowChangeList::ChangeType type) { + DCHECK_EQ(type_, RowChangeList::kUninitialized); + type_ = type; + dst_->push_back(type); + } + + // Add a column update by a raw value. This allows copying RCLs + // from one file to another without having any awareness of schema. + // + // If 'is_null' is set, then encodes a SET [col_id]=NULL update. + // Otherwise, SET [col_id] = 'new_val'. + // + // 'new_val' is the encoded form of the new value. In the case of + // a STRING, this is the actual user-provided STRING. Otherwise, + // it is the fixed-length representation of the type. + void AddRawColumnUpdate(int col_id, bool is_null, Slice new_val); + + + RowChangeList::ChangeType type_; + faststring *dst_; +}; + + +class RowChangeListDecoder { + public: + + // Construct a new decoder. + explicit RowChangeListDecoder(const RowChangeList &src) + : remaining_(src.slice()), + type_(RowChangeList::kUninitialized) { + } + + // Initialize the decoder. 
This will return an invalid Status if the RowChangeList + // appears to be corrupt/malformed. + Status Init(); + + // Like Init() above, but does not perform any safety checks in a release build. + // This can be used when it's known that the source of the RowChangeList is + // guaranteed to be non-corrupt (e.g. we created it and have kept it in memory). + void InitNoSafetyChecks() { +#ifndef NDEBUG + Status s = Init(); + DCHECK(s.ok()) << s.ToString(); +#else + type_ = static_cast(remaining_[0]); + remaining_.remove_prefix(1); +#endif + } + + bool HasNext() const { + DCHECK(!is_delete()); + return !remaining_.empty(); + } + + bool is_update() const { + return type_ == RowChangeList::kUpdate; + } + + bool is_delete() const { + return type_ == RowChangeList::kDelete; + } + + bool is_reinsert() const { + return type_ == RowChangeList::kReinsert; + } + + // If this RCL is a REINSERT, then returns the reinserted row in + // the contiguous in-memory row format. + Status GetReinsertedRowSlice(const Schema& schema, Slice* s) const; + + // Append an entry to *column_ids for each column that is updated + // in this RCL. + // This 'consumes' the remainder of the encoded RowChangeList. + Status GetIncludedColumnIds(std::vector* column_ids) { + column_ids->clear(); + DCHECK(is_update()); + while (HasNext()) { + DecodedUpdate dec; + RETURN_NOT_OK(DecodeNext(&dec)); + column_ids->push_back(dec.col_id); + } + return Status::OK(); + } + + // Applies changes in this decoder to the specified row and saves the old + // state of the row into the undo_encoder. + Status ApplyRowUpdate(RowBlockRow *dst_row, + Arena *arena, RowChangeListEncoder* undo_encoder); + + // Apply this UPDATE RowChangeList to row number 'row_idx' in 'dst_col', but only + // any updates that correspond to column 'col_idx' of 'dst_schema'. + // Any indirect data is copied into 'arena' if non-NULL. 
+ // + // REQUIRES: is_update() + Status ApplyToOneColumn(size_t row_idx, ColumnBlock* dst_col, + const Schema& dst_schema, int col_idx, Arena *arena); + + // If this changelist is a DELETE or REINSERT, twiddle '*deleted' to reference + // the new state of the row. If it is an UPDATE, this call has no effect. + // + // This is used during mutation traversal, to keep track of whether a row is + // deleted or not. + void TwiddleDeleteStatus(bool *deleted) { + if (is_delete()) { + DCHECK(!*deleted); + *deleted = true; + } else if (is_reinsert()) { + DCHECK(*deleted); + *deleted = false; + } + } + + // Project the 'src' RowChangeList using the delta 'projector' + // The projected RowChangeList will be encoded to specified 'buf'. + // The buffer will be cleared before adding the result. + static Status ProjectUpdate(const DeltaProjector& projector, + const RowChangeList& src, + faststring *buf); + + // If 'src' is an update, then only add changes for columns NOT + // specified by 'column_indexes' to 'out'. Delete and Re-insert + // changes are added to 'out' as-is. If an update only contained + // changes for 'column_indexes', then out->is_initialized() will + // return false. + // 'column_ids' must be sorted; 'out' must be + // valid for the duration of this method, but not have been + // previously initialized. + static Status RemoveColumnIdsFromChangeList(const RowChangeList& src, + const std::vector& column_ids, + RowChangeListEncoder* out); + + struct DecodedUpdate { + // The updated column ID. + ColumnId col_id; + + // If true, this update sets the given column to NULL. + bool null; + + // The "raw" value of the updated column. + // - in the case of a fixed length type such as an integer, + // the slice will point directly to the new little-endian value. + // - in the case of a variable length type such as a string, + // the slice will point to the new string value (i.e not to a + // "wrapper" slice. 
+ // 'raw_value' is only relevant in the case that 'null' is not true. + Slice raw_value; + + // Resolve the decoded update against the given Schema. + // + // If the updated column is present in the schema, and the decoded + // update has the correct length/type, then sets + // *col_idx, and sets *valid_value to point to the validated + // value. + // + // If the updated column is present, but the data provided is invalid, + // returns Status::Corruption. + // + // If the updated column is not present, sets *col_idx to -1 and returns + // Status::OK. + Status Validate(const Schema& s, + int* col_idx, + const void** valid_value) const; + }; + + // Decode the next updated column into '*update'. + // See the docs on DecodedUpdate above for field information. + // + // The update->raw_value slice points to memory within the buffer + // being decoded by this object. No copies are made. + // + // REQUIRES: is_update() + Status DecodeNext(DecodedUpdate* update); + + private: + FRIEND_TEST(TestRowChangeList, TestEncodeDecodeUpdates); + friend class RowChangeList; + + // Data remaining in the source buffer. + // This slice is advanced forward as entries are decoded. + Slice remaining_; + + RowChangeList::ChangeType type_; +}; + + +} // namespace kudu + +// Defined for tight_enum_test_cast<> -- has to be defined outside of any namespace. +MAKE_ENUM_LIMITS(kudu::RowChangeList::ChangeType, + kudu::RowChangeList::ChangeType_min, + kudu::RowChangeList::ChangeType_max); + +#endif diff --git a/src/kudu/common/row_key-util-test.cc b/src/kudu/common/row_key-util-test.cc new file mode 100644 index 000000000000..661dde4db5cc --- /dev/null +++ b/src/kudu/common/row_key-util-test.cc @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row.h" +#include "kudu/common/row_key-util.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class RowKeyUtilTest : public KuduTest { + public: + RowKeyUtilTest() + : arena_(1024, 4096) {} + + protected: + uint8_t* row_data(KuduPartialRow* row) { + return row->row_data_; + } + + Arena arena_; +}; + +TEST_F(RowKeyUtilTest, TestIncrementNonCompositeKey) { + Schema schema({ ColumnSchema("key", INT32), + ColumnSchema("other_col", INT32), + ColumnSchema("other_col2", STRING, true) }, + 1); + KuduPartialRow p_row(&schema); + ContiguousRow row(&schema, row_data(&p_row)); + + // Normal increment. + EXPECT_OK(p_row.SetInt32(0, 1000)); + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 key=1001", p_row.ToString()); + + // Overflow increment. + EXPECT_OK(p_row.SetInt32(0, MathLimits::kMax)); + EXPECT_FALSE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 key=-2147483648", p_row.ToString()); +} + +TEST_F(RowKeyUtilTest, TestIncrementCompositeKey) { + Schema schema({ ColumnSchema("k1", INT32), + ColumnSchema("k2", INT32), + ColumnSchema("other_col", STRING, true) }, + 2); + + KuduPartialRow p_row(&schema); + ContiguousRow row(&schema, row_data(&p_row)); + + // Normal increment. 
+ EXPECT_OK(p_row.SetInt32(0, 1000)); + EXPECT_OK(p_row.SetInt32(1, 1000)); + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 k1=1000, int32 k2=1001", p_row.ToString()); + + // Overflow a later part of the key, carrying into the earlier + // part.. + EXPECT_OK(p_row.SetInt32(1, MathLimits::kMax)); + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 k1=1001, int32 k2=-2147483648", p_row.ToString()); + + // Overflow the whole key. + EXPECT_OK(p_row.SetInt32(0, MathLimits::kMax)); + EXPECT_OK(p_row.SetInt32(1, MathLimits::kMax)); + EXPECT_FALSE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 k1=-2147483648, int32 k2=-2147483648", p_row.ToString()); +} + +TEST_F(RowKeyUtilTest, TestIncrementCompositeIntStringKey) { + Schema schema({ ColumnSchema("k1", INT32), + ColumnSchema("k2", STRING), + ColumnSchema("other_col", STRING, true) }, + 2); + + KuduPartialRow p_row(&schema); + ContiguousRow row(&schema, row_data(&p_row)); + + // Normal increment. + EXPECT_OK(p_row.SetInt32(0, 1000)); + EXPECT_OK(p_row.SetString(1, "hello")); + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 k1=1000, string k2=hello\\000", p_row.ToString()); + + // There's no way to overflow a string key - you can always make it higher + // by tacking on more \x00. + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("int32 k1=1000, string k2=hello\\000\\000", p_row.ToString()); +} + +TEST_F(RowKeyUtilTest, TestIncrementCompositeStringIntKey) { + Schema schema({ ColumnSchema("k1", STRING), + ColumnSchema("k2", INT32), + ColumnSchema("other_col", STRING, true) }, + 2); + + KuduPartialRow p_row(&schema); + ContiguousRow row(&schema, row_data(&p_row)); + + // Normal increment. 
+ EXPECT_OK(p_row.SetString(0, "hello")); + EXPECT_OK(p_row.SetInt32(1, 1000)); + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("string k1=hello, int32 k2=1001", p_row.ToString()); + + // Overflowing the int32 portion should tack \x00 onto the + // string portion. + EXPECT_OK(p_row.SetInt32(1, MathLimits::kMax)); + EXPECT_TRUE(row_key_util::IncrementKey(&row, &arena_)); + EXPECT_EQ("string k1=hello\\000, int32 k2=-2147483648", p_row.ToString()); +} + + + + +} // namespace kudu diff --git a/src/kudu/common/row_key-util.cc b/src/kudu/common/row_key-util.cc new file mode 100644 index 000000000000..26299521c6af --- /dev/null +++ b/src/kudu/common/row_key-util.cc @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/row_key-util.h" + +#include + +#include "kudu/common/row.h" + +namespace kudu { +namespace row_key_util { + +namespace { + +template +bool IncrementIntCell(void* cell_ptr) { + typedef DataTypeTraits traits; + typedef typename traits::cpp_type cpp_type; + + cpp_type orig; + memcpy(&orig, cell_ptr, sizeof(cpp_type)); + + cpp_type inc; + if (boost::is_unsigned::value) { + inc = orig + 1; + } else { + // Signed overflow is undefined in C. 
So, we'll use a branch here + // instead of counting on undefined behavior. + if (orig == MathLimits::kMax) { + inc = MathLimits::kMin; + } else { + inc = orig + 1; + } + } + memcpy(cell_ptr, &inc, sizeof(cpp_type)); + return inc > orig; +} + +bool IncrementStringCell(void* cell_ptr, Arena* arena) { + Slice orig; + memcpy(&orig, cell_ptr, sizeof(orig)); + uint8_t* new_buf = CHECK_NOTNULL( + static_cast(arena->AllocateBytes(orig.size() + 1))); + memcpy(new_buf, orig.data(), orig.size()); + new_buf[orig.size()] = '\0'; + + Slice inc(new_buf, orig.size() + 1); + memcpy(cell_ptr, &inc, sizeof(inc)); + return true; +} + +bool IncrementCell(const ColumnSchema& col, void* cell_ptr, Arena* arena) { + DataType type = col.type_info()->physical_type(); + switch (type) { +#define HANDLE_TYPE(t) case t: return IncrementIntCell(cell_ptr); + HANDLE_TYPE(UINT8); + HANDLE_TYPE(UINT16); + HANDLE_TYPE(UINT32); + HANDLE_TYPE(UINT64); + HANDLE_TYPE(INT8); + HANDLE_TYPE(INT16); + HANDLE_TYPE(INT32); + HANDLE_TYPE(TIMESTAMP); + HANDLE_TYPE(INT64); + case UNKNOWN_DATA: + case BOOL: + case FLOAT: + case DOUBLE: + LOG(FATAL) << "Unable to handle type " << type << " in row keys"; + case STRING: + case BINARY: + return IncrementStringCell(cell_ptr, arena); + default: CHECK(false) << "Unknown data type: " << type; + } + return false; // unreachable +#undef HANDLE_TYPE +} + +} // anonymous namespace + +void SetKeyToMinValues(ContiguousRow* row) { + for (int i = 0; i < row->schema()->num_key_columns(); i++) { + const ColumnSchema& col = row->schema()->column(i); + col.type_info()->CopyMinValue(row->mutable_cell_ptr(i)); + } +} + +bool IncrementKey(ContiguousRow* row, Arena* arena) { + return IncrementKeyPrefix(row, row->schema()->num_key_columns(), arena); +} + +bool IncrementKeyPrefix(ContiguousRow* row, int prefix_len, Arena* arena) { + for (int i = prefix_len - 1; i >= 0; --i) { + if (IncrementCell(row->schema()->column(i), + row->mutable_cell_ptr(i), + arena)) { + return true; + } + } + 
return false; +} + +} // namespace row_key_util +} // namespace kudu diff --git a/src/kudu/common/row_key-util.h b/src/kudu/common/row_key-util.h new file mode 100644 index 000000000000..558ca42ba456 --- /dev/null +++ b/src/kudu/common/row_key-util.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Utility functions for working with the primary key portion of a row. +#ifndef KUDU_COMMON_ROW_KEY_UTIL_H +#define KUDU_COMMON_ROW_KEY_UTIL_H + +#include "kudu/gutil/port.h" + +namespace kudu { + +class Arena; +class ContiguousRow; + +namespace row_key_util { + +// Set all of the parts of the key in 'row' to the minimum legal values +// for their types. +// +// For example: +// - signed ints become very large negative values +// - unsigned ints become 0 +// - strings become "" +void SetKeyToMinValues(ContiguousRow* row); + +// Increment the primary key of this row to the smallest key which is greater +// than the current key. +// +// For example, for a composite key with types (int8, int8), incrementing +// the row (1, 1) will result in (1, 2). Incrementing (1, 127) will result +// in (2, -128). +// +// Note that not all keys may be incremented without overflow. 
For example, +// if the primary key is an int8, and the key is already set to '127', +// incrementing would overflow. In this case, the value is incremented and +// overflowed, but the function returns 'false' to indicate the overflow +// condition. Otherwise, returns 'true'. +// +// String types are increment by appending a '\0' byte to the end. Since our +// strings have unbounded length, this implies that if a key has a string +// component, it will always be incremented. +// +// For the case of incrementing string types, we allocate a new copy of the +// string from 'arena', which must be non-NULL. +// +// REQUIRES: all key columns must be valid. +bool IncrementKey(ContiguousRow* row, Arena* arena) WARN_UNUSED_RESULT; + +// The same as the above function, but only acts on a prefix of the primary +// key. +// +// For example, for a composite primary key (int8, int8, int8) with value +// (1,2,3), IncrementKeyPrefix(2) will return (1,3,3). +bool IncrementKeyPrefix(ContiguousRow* row, int prefix_len, + Arena* arena) WARN_UNUSED_RESULT; + +} // namespace row_key_util +} // namespace kudu +#endif /* KUDU_COMMON_ROW_KEY_UTIL_H */ diff --git a/src/kudu/common/row_operations-test.cc b/src/kudu/common/row_operations-test.cc new file mode 100644 index 000000000000..4ae3418a7853 --- /dev/null +++ b/src/kudu/common/row_operations-test.cc @@ -0,0 +1,671 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/test_util.h" + +using std::shared_ptr; +using strings::Substitute; +using strings::SubstituteAndAppend; + +namespace kudu { + +class RowOperationsTest : public KuduTest { + public: + RowOperationsTest() + : arena_(1024, 128 * 1024) { + SeedRandom(); + + SchemaBuilder builder; + CHECK_OK(builder.AddKeyColumn("key", INT32)); + CHECK_OK(builder.AddColumn("int_val", INT32)); + CHECK_OK(builder.AddNullableColumn("string_val", STRING)); + schema_ = builder.Build(); + schema_without_ids_ = builder.BuildWithoutIds(); + } + protected: + void CheckDecodeDoesntCrash(const Schema& client_schema, + const Schema& server_schema, + const RowOperationsPB& pb); + void DoFuzzTest(const Schema& server_schema, + const KuduPartialRow& row, + int n_random_changes); + + Schema schema_; + Schema schema_without_ids_; + Arena arena_; +}; + +// Perform some random mutation to a random byte in the provided string. 
+static void DoRandomMutation(string* s) { + int target_idx = random() % s->size(); + char* target_byte = &(*s)[target_idx]; + switch (random() % 3) { + case 0: + // increment a random byte by 1 + (*target_byte)++; + break; + case 1: + // decrement a random byte by 1 + (*target_byte)--; + break; + case 2: + // replace byte with random value + (*target_byte) = random(); + break; + } +} + +void RowOperationsTest::CheckDecodeDoesntCrash(const Schema& client_schema, + const Schema& server_schema, + const RowOperationsPB& pb) { + arena_.Reset(); + RowOperationsPBDecoder decoder(&pb, &client_schema, &server_schema, &arena_); + vector ops; + Status s = decoder.DecodeOperations(&ops); + if (s.ok() && !ops.empty()) { + // If we got an OK result, then we should be able to stringify without + // crashing. This ensures that any indirect data (eg strings) gets + // set correctly. + ignore_result(ops[0].ToString(server_schema)); + } + // Bad Status is OK -- we expect corruptions here. +} + +void RowOperationsTest::DoFuzzTest(const Schema& server_schema, + const KuduPartialRow& row, + int n_random_changes) { + for (int operation = 0; operation < 3; operation++) { + RowOperationsPB pb; + RowOperationsPBEncoder enc(&pb); + + switch (operation) { + case 0: + enc.Add(RowOperationsPB::INSERT, row); + break; + case 1: + enc.Add(RowOperationsPB::UPDATE, row); + break; + case 2: + enc.Add(RowOperationsPB::DELETE, row); + break; + case 3: + enc.Add(RowOperationsPB::SPLIT_ROW, row); + break; + } + + const Schema* client_schema = row.schema(); + + // Check that the un-mutated row doesn't crash. + CheckDecodeDoesntCrash(*client_schema, server_schema, pb); + + RowOperationsPB mutated; + + // Check all possible truncations of the protobuf 'rows' field. 
+ for (int i = 0; i < pb.rows().size(); i++) { + mutated.CopyFrom(pb); + mutated.mutable_rows()->resize(i); + CheckDecodeDoesntCrash(*client_schema, server_schema, mutated); + } + + // Check bit flips of every bit in the first three bytes, which are + // particularly interesting, since they contain the null/isset + // bitmaps. + for (int bit = 0; bit < 8 * 3; bit++) { + int byte_idx = bit / 8; + int bit_idx = bit % 8; + int mask = 1 << bit_idx; + + (*mutated.mutable_rows())[byte_idx] ^= mask; + CheckDecodeDoesntCrash(*client_schema, server_schema, mutated); + (*mutated.mutable_rows())[byte_idx] ^= mask; + } + + // Check random byte changes in the 'rows' field. + for (int i = 0; i < n_random_changes; i++) { + mutated.CopyFrom(pb); + DoRandomMutation(mutated.mutable_rows()); + CheckDecodeDoesntCrash(*client_schema, server_schema, mutated); + } + } +} + +// Test that, even if the protobuf is corrupt in some way, we do not +// crash. These protobufs are provided by clients, so we want to make sure +// a malicious client can't crash the server. +TEST_F(RowOperationsTest, FuzzTest) { + const int n_iters = AllowSlowTests() ? 10000 : 1000; + + KuduPartialRow row(&schema_without_ids_); + EXPECT_OK(row.SetInt32("int_val", 54321)); + EXPECT_OK(row.SetStringCopy("string_val", "hello world")); + DoFuzzTest(schema_, row, n_iters); + EXPECT_OK(row.SetNull("string_val")); + DoFuzzTest(schema_, row, n_iters); +} + +// Add the given column, but with some probability change the type +// and nullability. +void AddFuzzedColumn(SchemaBuilder* builder, + const string& name, + DataType default_type) { + DataType rand_types[] = {INT32, INT64, DOUBLE, STRING}; + DataType t = default_type; + if (random() % 3 == 0) { + t = rand_types[random() % arraysize(rand_types)]; + } + bool nullable = random() & 1; + CHECK_OK(builder->AddColumn(name, t, nullable, NULL, NULL)); +} + +// Generate a randomized schema, where some columns might be missing, +// and types/nullability are randomized. 
We weight towards not making +// too many changes so that it's likely we generate compatible client +// and server schemas. +Schema GenRandomSchema(bool with_ids) { + SchemaBuilder builder; + if (random() % 5 != 0) { + AddFuzzedColumn(&builder, "c1", INT32); + } + if (random() % 5 != 0) { + AddFuzzedColumn(&builder, "c2", INT32); + } + if (random() % 5 != 0 || !builder.is_valid()) { + AddFuzzedColumn(&builder, "c3", STRING); + } + + return with_ids ? builder.Build() : builder.BuildWithoutIds(); +} + +namespace { + +struct FailingCase { + Schema* client_schema; + Schema* server_schema; + KuduPartialRow* row; +}; +FailingCase g_failing_case; + +// ASAN callback which will dump the case which caused a failure. +void DumpFailingCase() { + LOG(INFO) << "Failed on the following case:"; + LOG(INFO) << "Client schema:\n" << g_failing_case.client_schema->ToString(); + LOG(INFO) << "Server schema:\n" << g_failing_case.server_schema->ToString(); + LOG(INFO) << "Row: " << g_failing_case.row->ToString(); +} + +void GlogFailure() { + DumpFailingCase(); + abort(); +} + +} // anonymous namespace + +// Fuzz test which generates random pairs of client/server schemas, with +// random mutations like adding an extra column, removing a column, changing +// types, and changing nullability. +TEST_F(RowOperationsTest, SchemaFuzz) { + const int n_iters = AllowSlowTests() ? 10000 : 10; + for (int i = 0; i < n_iters; i++) { + // Generate a random client and server schema pair. + Schema client_schema = GenRandomSchema(false); + Schema server_schema = GenRandomSchema(true); + KuduPartialRow row(&client_schema); + + // On a crash or ASAN failure, dump the case information to the log so we + // can write a more specific repro. 
+ g_failing_case.client_schema = &client_schema; + g_failing_case.server_schema = &server_schema; + g_failing_case.row = &row; + ASAN_SET_DEATH_CALLBACK(&DumpFailingCase); + google::InstallFailureFunction(&GlogFailure); + + for (int i = 0; i < client_schema.num_columns(); i++) { + if (client_schema.column(i).is_nullable() && + random() % 3 == 0) { + CHECK_OK(row.SetNull(i)); + continue; + } + switch (client_schema.column(i).type_info()->type()) { + case INT32: + CHECK_OK(row.SetInt32(i, 12345)); + break; + case INT64: + CHECK_OK(row.SetInt64(i, 12345678)); + break; + case DOUBLE: + CHECK_OK(row.SetDouble(i, 1234.5678)); + break; + case STRING: + CHECK_OK(row.SetStringCopy(i, "hello")); + break; + default: + LOG(FATAL); + } + } + + DoFuzzTest(server_schema, row, 100); + ASAN_SET_DEATH_CALLBACK(NULL); + google::InstallFailureFunction(&abort); + } +} + +// One case from SchemaFuzz which failed previously. +TEST_F(RowOperationsTest, TestFuzz1) { + SchemaBuilder client_schema_builder; + client_schema_builder.AddColumn("c1", INT32, false, nullptr, nullptr); + client_schema_builder.AddColumn("c2", STRING, false, nullptr, nullptr); + Schema client_schema = client_schema_builder.BuildWithoutIds(); + SchemaBuilder server_schema_builder; + server_schema_builder.AddColumn("c1", INT32, false, nullptr, nullptr); + server_schema_builder.AddColumn("c2", STRING, false, nullptr, nullptr); + Schema server_schema = server_schema_builder.Build(); + KuduPartialRow row(&client_schema); + CHECK_OK(row.SetInt32(0, 12345)); + CHECK_OK(row.SetStringCopy(1, "hello")); + DoFuzzTest(server_schema, row, 100); +} + +// Another case from SchemaFuzz which failed previously. 
+TEST_F(RowOperationsTest, TestFuzz2) { + SchemaBuilder client_schema_builder; + client_schema_builder.AddColumn("c1", STRING, true, nullptr, nullptr); + client_schema_builder.AddColumn("c2", STRING, false, nullptr, nullptr); + Schema client_schema = client_schema_builder.BuildWithoutIds(); + SchemaBuilder server_schema_builder; + server_schema_builder.AddColumn("c1", STRING, true, nullptr, nullptr); + server_schema_builder.AddColumn("c2", STRING, false, nullptr, nullptr); + Schema server_schema = server_schema_builder.Build(); + KuduPartialRow row(&client_schema); + CHECK_OK(row.SetNull(0)); + CHECK_OK(row.SetStringCopy(1, "hello")); + DoFuzzTest(server_schema, row, 100); +} + +namespace { + +// Project client_row into server_schema, and stringify the result. +// If an error occurs, the result string is "error: " +string TestProjection(RowOperationsPB::Type type, + const KuduPartialRow& client_row, + const Schema& server_schema) { + RowOperationsPB pb; + RowOperationsPBEncoder enc(&pb); + enc.Add(type, client_row); + + // Decode it + Arena arena(1024, 1024*1024); + vector ops; + RowOperationsPBDecoder dec(&pb, client_row.schema(), &server_schema, &arena); + Status s = dec.DecodeOperations(&ops); + + if (!s.ok()) { + return "error: " + s.ToString(); + } + CHECK_EQ(1, ops.size()); + return ops[0].ToString(server_schema); +} + +} // anonymous namespace + +// Test decoding partial rows from a client who has a schema which matches +// the table schema. +TEST_F(RowOperationsTest, ProjectionTestWholeSchemaSpecified) { + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING, true) }, + 1); + + // Test a row missing 'int_val', which is required. 
+ { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + EXPECT_EQ("error: Invalid argument: No value provided for required column: " + "int_val[int32 NOT NULL]", + TestProjection(RowOperationsPB::INSERT, client_row, schema_)); + } + + // Test a row missing 'string_val', which is nullable + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + CHECK_OK(client_row.SetInt32("int_val", 54321)); + // The NULL should get filled in. + EXPECT_EQ("INSERT (int32 key=12345, int32 int_val=54321, string string_val=NULL)", + TestProjection(RowOperationsPB::INSERT, client_row, schema_)); + } + + // Test a row with all of the fields specified, both with the nullable field + // specified to be NULL and non-NULL. + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + CHECK_OK(client_row.SetInt32("int_val", 54321)); + CHECK_OK(client_row.SetStringCopy("string_val", "hello world")); + EXPECT_EQ("INSERT (int32 key=12345, int32 int_val=54321, string string_val=hello world)", + TestProjection(RowOperationsPB::INSERT, client_row, schema_)); + + // The first result should have the field specified. + // The second result should have the field NULL, since it was explicitly set. + CHECK_OK(client_row.SetNull("string_val")); + EXPECT_EQ("INSERT (int32 key=12345, int32 int_val=54321, string string_val=NULL)", + TestProjection(RowOperationsPB::INSERT, client_row, schema_)); + + } +} + +TEST_F(RowOperationsTest, ProjectionTestWithDefaults) { + int32_t nullable_default = 123; + int32_t non_null_default = 456; + SchemaBuilder b; + CHECK_OK(b.AddKeyColumn("key", INT32)); + CHECK_OK(b.AddColumn("nullable_with_default", INT32, true, + &nullable_default, &nullable_default)); + CHECK_OK(b.AddColumn("non_null_with_default", INT32, false, + &non_null_default, &non_null_default)); + Schema server_schema = b.Build(); + + // Clients may not have the defaults specified. 
+ // TODO: evaluate whether this should be true - how "dumb" should clients be? + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("nullable_with_default", INT32, true), + ColumnSchema("non_null_with_default", INT32, false) }, + 1); + + // Specify just the key. The other two columns have defaults, so they'll get filled in. + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + EXPECT_EQ("INSERT (int32 key=12345, int32 nullable_with_default=123," + " int32 non_null_with_default=456)", + TestProjection(RowOperationsPB::INSERT, client_row, server_schema)); + } + + // Specify the key and override both defaults + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + CHECK_OK(client_row.SetInt32("nullable_with_default", 12345)); + CHECK_OK(client_row.SetInt32("non_null_with_default", 54321)); + EXPECT_EQ("INSERT (int32 key=12345, int32 nullable_with_default=12345," + " int32 non_null_with_default=54321)", + TestProjection(RowOperationsPB::INSERT, client_row, server_schema)); + } + + // Specify the key and override both defaults, overriding the nullable + // one to NULL. + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + CHECK_OK(client_row.SetNull("nullable_with_default")); + CHECK_OK(client_row.SetInt32("non_null_with_default", 54321)); + EXPECT_EQ("INSERT (int32 key=12345, int32 nullable_with_default=NULL," + " int32 non_null_with_default=54321)", + TestProjection(RowOperationsPB::INSERT, client_row, server_schema)); + } +} + +// Test cases where the client only has a subset of the fields +// of the table, but where the missing columns have defaults +// or are NULLable. 
+TEST_F(RowOperationsTest, ProjectionTestWithClientHavingValidSubset) { + int32_t nullable_default = 123; + SchemaBuilder b; + CHECK_OK(b.AddKeyColumn("key", INT32)); + CHECK_OK(b.AddColumn("int_val", INT32)); + CHECK_OK(b.AddColumn("new_int_with_default", INT32, false, + &nullable_default, &nullable_default)); + CHECK_OK(b.AddNullableColumn("new_nullable_int", INT32)); + Schema server_schema = b.Build(); + + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32) }, + 1); + + // Specify just the key. This is an error because we're missing int_val. + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + EXPECT_EQ("error: Invalid argument: No value provided for required column:" + " int_val[int32 NOT NULL]", + TestProjection(RowOperationsPB::INSERT, client_row, server_schema)); + } + + // Specify both of the columns that the client is aware of. + // Defaults should be filled for the other two. + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + CHECK_OK(client_row.SetInt32("int_val", 12345)); + EXPECT_EQ("INSERT (int32 key=12345, int32 int_val=12345," + " int32 new_int_with_default=123, int32 new_nullable_int=NULL)", + TestProjection(RowOperationsPB::INSERT, client_row, server_schema)); + } +} + +// Test cases where the client is missing a column which is non-null +// and has no default. This is an incompatible client. 
+TEST_F(RowOperationsTest, ProjectionTestWithClientHavingInvalidSubset) { + SchemaBuilder b; + CHECK_OK(b.AddKeyColumn("key", INT32)); + CHECK_OK(b.AddColumn("int_val", INT32)); + Schema server_schema = b.Build(); + + CHECK_OK(b.RemoveColumn("int_val")); + Schema client_schema = b.BuildWithoutIds(); + + { + KuduPartialRow client_row(&client_schema); + CHECK_OK(client_row.SetInt32("key", 12345)); + EXPECT_EQ("error: Invalid argument: Client missing required column:" + " int_val[int32 NOT NULL]", + TestProjection(RowOperationsPB::INSERT, client_row, server_schema)); + } +} + +// Simple Update case where the client and server schemas match. +TEST_F(RowOperationsTest, TestProjectUpdates) { + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING, true) }, + 1); + Schema server_schema = SchemaBuilder(client_schema).Build(); + + // Check without specifying any columns + KuduPartialRow client_row(&client_schema); + EXPECT_EQ("error: Invalid argument: No value provided for key column: key[int32 NOT NULL]", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); + + // Specify the key and no columns to update + ASSERT_OK(client_row.SetInt32("key", 12345)); + EXPECT_EQ("error: Invalid argument: No fields updated, key is: (int32 key=12345)", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); + + + // Specify the key and update one column. + ASSERT_OK(client_row.SetInt32("int_val", 12345)); + EXPECT_EQ("MUTATE (int32 key=12345) SET int_val=12345", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); + + // Specify the key and update both columns + ASSERT_OK(client_row.SetString("string_val", "foo")); + EXPECT_EQ("MUTATE (int32 key=12345) SET int_val=12345, string_val=foo", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); + + // Update the nullable column to null. 
+ ASSERT_OK(client_row.SetNull("string_val")); + EXPECT_EQ("MUTATE (int32 key=12345) SET int_val=12345, string_val=NULL", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); +} + +// Client schema has the columns in a different order. Makes +// sure the name-based projection is functioning. +TEST_F(RowOperationsTest, TestProjectUpdatesReorderedColumns) { + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("string_val", STRING, true), + ColumnSchema("int_val", INT32) }, + 1); + Schema server_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING, true) }, + 1); + server_schema = SchemaBuilder(server_schema).Build(); + + KuduPartialRow client_row(&client_schema); + ASSERT_OK(client_row.SetInt32("key", 12345)); + ASSERT_OK(client_row.SetInt32("int_val", 54321)); + EXPECT_EQ("MUTATE (int32 key=12345) SET int_val=54321", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); +} + +// Client schema is missing one of the columns in the server schema. +// This is OK on an update. 
+TEST_F(RowOperationsTest, DISABLED_TestProjectUpdatesSubsetOfColumns) { + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("string_val", STRING, true) }, + 1); + Schema server_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING, true) }, + 1); + server_schema = SchemaBuilder(server_schema).Build(); + + KuduPartialRow client_row(&client_schema); + ASSERT_OK(client_row.SetInt32("key", 12345)); + ASSERT_OK(client_row.SetString("string_val", "foo")); + EXPECT_EQ("MUTATE (int32 key=12345) SET string_val=foo", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); +} + +TEST_F(RowOperationsTest, TestClientMismatchedType) { + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT8) }, + 1); + Schema server_schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32) }, + 1); + server_schema = SchemaBuilder(server_schema).Build(); + + KuduPartialRow client_row(&client_schema); + ASSERT_OK(client_row.SetInt32("key", 12345)); + ASSERT_OK(client_row.SetInt8("int_val", 1)); + EXPECT_EQ("error: Invalid argument: The column 'int_val' must have type " + "int32 NOT NULL found int8 NOT NULL", + TestProjection(RowOperationsPB::UPDATE, client_row, server_schema)); +} + +TEST_F(RowOperationsTest, TestProjectDeletes) { + Schema client_schema({ ColumnSchema("key", INT32), + ColumnSchema("key_2", INT32), + ColumnSchema("string_val", STRING, true) }, + 2); + Schema server_schema = SchemaBuilder(client_schema).Build(); + + KuduPartialRow client_row(&client_schema); + // No columns set + EXPECT_EQ("error: Invalid argument: No value provided for key column: key[int32 NOT NULL]", + TestProjection(RowOperationsPB::DELETE, client_row, server_schema)); + + // Only half the key set + ASSERT_OK(client_row.SetInt32("key", 12345)); + EXPECT_EQ("error: Invalid argument: No value provided for key column: key_2[int32 NOT NULL]", + TestProjection(RowOperationsPB::DELETE, 
client_row, server_schema)); + + // Whole key set (correct) + ASSERT_OK(client_row.SetInt32("key_2", 54321)); + EXPECT_EQ("MUTATE (int32 key=12345, int32 key_2=54321) DELETE", + TestProjection(RowOperationsPB::DELETE, client_row, server_schema)); + + // Extra column set (incorrect) + ASSERT_OK(client_row.SetString("string_val", "hello")); + EXPECT_EQ("error: Invalid argument: DELETE should not have a value for column: " + "string_val[string NULLABLE]", + TestProjection(RowOperationsPB::DELETE, client_row, server_schema)); +} + +TEST_F(RowOperationsTest, SplitKeyRoundTrip) { + Schema client_schema = Schema({ ColumnSchema("int8", INT8), + ColumnSchema("int16", INT16), + ColumnSchema("int32", INT32), + ColumnSchema("int64", INT64), + ColumnSchema("string", STRING), + ColumnSchema("binary", BINARY), + ColumnSchema("timestamp", TIMESTAMP), + ColumnSchema("missing", STRING) }, + 8); + + // Use values at the upper end of the range. + int8_t int8_expected = 0xFE; + int16_t int16_expected = 0xFFFE; + int32_t int32_expected = 0xFFFFFE; + int64_t int64_expected = 0xFFFFFFFE; + + KuduPartialRow row(&client_schema); + ASSERT_OK(row.SetInt8("int8", int8_expected)); + ASSERT_OK(row.SetInt16("int16", int16_expected)); + ASSERT_OK(row.SetInt32("int32", int32_expected)); + ASSERT_OK(row.SetInt64("int64", int64_expected)); + ASSERT_OK(row.SetString("string", "string-value")); + ASSERT_OK(row.SetBinary("binary", "binary-value")); + ASSERT_OK(row.SetTimestamp("timestamp", 9)); + + RowOperationsPB pb; + RowOperationsPBEncoder(&pb).Add(RowOperationsPB::SPLIT_ROW, row); + + Schema schema = client_schema.CopyWithColumnIds(); + RowOperationsPBDecoder decoder(&pb, &client_schema, &schema, nullptr); + vector ops; + ASSERT_OK(decoder.DecodeOperations(&ops)); + ASSERT_EQ(1, ops.size()); + + const shared_ptr& row2 = ops[0].split_row; + + int8_t int8_val; + ASSERT_OK(row2->GetInt8("int8", &int8_val)); + CHECK_EQ(int8_expected, int8_val); + + int16_t int16_val; + ASSERT_OK(row2->GetInt16("int16", 
&int16_val)); + CHECK_EQ(int16_expected, int16_val); + + int32_t int32_val; + ASSERT_OK(row2->GetInt32("int32", &int32_val)); + CHECK_EQ(int32_expected, int32_val); + + int64_t int64_val; + ASSERT_OK(row2->GetInt64("int64", &int64_val)); + CHECK_EQ(int64_expected, int64_val); + + Slice string_val; + ASSERT_OK(row2->GetString("string", &string_val)); + CHECK_EQ("string-value", string_val); + + Slice binary_val; + ASSERT_OK(row2->GetBinary("binary", &binary_val)); + CHECK_EQ(Slice("binary-value"), binary_val); + + CHECK(!row2->IsColumnSet("missing")); +} + +} // namespace kudu diff --git a/src/kudu/common/row_operations.cc b/src/kudu/common/row_operations.cc new file mode 100644 index 000000000000..896b4c29bb0d --- /dev/null +++ b/src/kudu/common/row_operations.cc @@ -0,0 +1,576 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/common/row_operations.h" + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_changelist.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/faststring.h" +#include "kudu/util/safe_math.h" +#include "kudu/util/slice.h" + +using std::string; +using strings::Substitute; + +namespace kudu { + +string DecodedRowOperation::ToString(const Schema& schema) const { + switch (type) { + case RowOperationsPB::INSERT: + return "INSERT " + schema.DebugRow(ConstContiguousRow(&schema, row_data)); + case RowOperationsPB::UPDATE: + case RowOperationsPB::DELETE: + return Substitute("MUTATE $0 $1", + schema.DebugRowKey(ConstContiguousRow(&schema, row_data)), + changelist.ToString(schema)); + case RowOperationsPB::SPLIT_ROW: + return Substitute("SPLIT_ROW $0", split_row->ToString()); + default: + LOG(DFATAL) << "Bad type: " << type; + return ""; + } +} + +RowOperationsPBEncoder::RowOperationsPBEncoder(RowOperationsPB* pb) + : pb_(pb) { +} + +RowOperationsPBEncoder::~RowOperationsPBEncoder() { +} + +void RowOperationsPBEncoder::Add(RowOperationsPB::Type op_type, const KuduPartialRow& partial_row) { + const Schema* schema = partial_row.schema(); + + // See wire_protocol.pb for a description of the format. + string* dst = pb_->mutable_rows(); + + // Compute a bound on much space we may need in the 'rows' field. + // Then, resize it to this much space. This allows us to use simple + // memcpy() calls to copy the data, rather than string->append(), which + // reduces branches significantly in this fairly hot code path. + // (std::string::append doesn't get inlined). + // At the end of the function, we'll resize() the string back down to the + // right size. 
+ int isset_bitmap_size = BitmapSize(schema->num_columns()); + int null_bitmap_size = ContiguousRowHelper::null_bitmap_size(*schema); + int type_size = 1; // type uses one byte + int max_size = type_size + schema->byte_size() + isset_bitmap_size + null_bitmap_size; + int old_size = dst->size(); + dst->resize(dst->size() + max_size); + + uint8_t* dst_ptr = reinterpret_cast(&(*dst)[old_size]); + + *dst_ptr++ = static_cast(op_type); + memcpy(dst_ptr, partial_row.isset_bitmap_, isset_bitmap_size); + dst_ptr += isset_bitmap_size; + + memcpy(dst_ptr, + ContiguousRowHelper::null_bitmap_ptr(*schema, partial_row.row_data_), + null_bitmap_size); + dst_ptr += null_bitmap_size; + + ContiguousRow row(schema, partial_row.row_data_); + for (int i = 0; i < schema->num_columns(); i++) { + if (!partial_row.IsColumnSet(i)) continue; + const ColumnSchema& col = schema->column(i); + + if (col.is_nullable() && row.is_null(i)) continue; + + if (col.type_info()->physical_type() == BINARY) { + const Slice* val = reinterpret_cast(row.cell_ptr(i)); + size_t indirect_offset = pb_->mutable_indirect_data()->size(); + pb_->mutable_indirect_data()->append(reinterpret_cast(val->data()), + val->size()); + Slice to_append(reinterpret_cast(indirect_offset), + val->size()); + memcpy(dst_ptr, &to_append, sizeof(Slice)); + dst_ptr += sizeof(Slice); + } else { + memcpy(dst_ptr, row.cell_ptr(i), col.type_info()->size()); + dst_ptr += col.type_info()->size(); + } + } + + dst->resize(reinterpret_cast(dst_ptr) - &(*dst)[0]); +} + +// ------------------------------------------------------------ +// Decoder +// ------------------------------------------------------------ + +RowOperationsPBDecoder::RowOperationsPBDecoder(const RowOperationsPB* pb, + const Schema* client_schema, + const Schema* tablet_schema, + Arena* dst_arena) + : pb_(pb), + client_schema_(client_schema), + tablet_schema_(tablet_schema), + dst_arena_(dst_arena), + bm_size_(BitmapSize(client_schema_->num_columns())), + 
tablet_row_size_(ContiguousRowHelper::row_size(*tablet_schema_)), + src_(pb->rows().data(), pb->rows().size()) { +} + +RowOperationsPBDecoder::~RowOperationsPBDecoder() { +} + +Status RowOperationsPBDecoder::ReadOpType(RowOperationsPB::Type* type) { + if (PREDICT_FALSE(src_.empty())) { + return Status::Corruption("Cannot find operation type"); + } + if (PREDICT_FALSE(!RowOperationsPB_Type_IsValid(src_[0]))) { + return Status::Corruption(Substitute("Unknown operation type: $0", src_[0])); + } + *type = static_cast(src_[0]); + src_.remove_prefix(1); + return Status::OK(); +} + +Status RowOperationsPBDecoder::ReadIssetBitmap(const uint8_t** bitmap) { + if (PREDICT_FALSE(src_.size() < bm_size_)) { + *bitmap = nullptr; + return Status::Corruption("Cannot find isset bitmap"); + } + *bitmap = src_.data(); + src_.remove_prefix(bm_size_); + return Status::OK(); +} + +Status RowOperationsPBDecoder::ReadNullBitmap(const uint8_t** null_bm) { + if (PREDICT_FALSE(src_.size() < bm_size_)) { + *null_bm = nullptr; + return Status::Corruption("Cannot find null bitmap"); + } + *null_bm = src_.data(); + src_.remove_prefix(bm_size_); + return Status::OK(); +} + +Status RowOperationsPBDecoder::GetColumnSlice(const ColumnSchema& col, Slice* slice) { + int size = col.type_info()->size(); + if (PREDICT_FALSE(src_.size() < size)) { + return Status::Corruption("Not enough data for column", col.ToString()); + } + // Find the data + if (col.type_info()->physical_type() == BINARY) { + // The Slice in the protobuf has a pointer relative to the indirect data, + // not a real pointer. Need to fix that. 
+ const Slice* ptr_slice = reinterpret_cast(src_.data()); + size_t offset_in_indirect = reinterpret_cast(ptr_slice->data()); + bool overflowed = false; + size_t max_offset = AddWithOverflowCheck(offset_in_indirect, ptr_slice->size(), &overflowed); + if (PREDICT_FALSE(overflowed || max_offset > pb_->indirect_data().size())) { + return Status::Corruption("Bad indirect slice"); + } + + *slice = Slice(&pb_->indirect_data()[offset_in_indirect], ptr_slice->size()); + } else { + *slice = Slice(src_.data(), size); + } + src_.remove_prefix(size); + return Status::OK(); +} + +Status RowOperationsPBDecoder::ReadColumn(const ColumnSchema& col, uint8_t* dst) { + Slice slice; + RETURN_NOT_OK(GetColumnSlice(col, &slice)); + if (col.type_info()->physical_type() == BINARY) { + memcpy(dst, &slice, col.type_info()->size()); + } else { + slice.relocate(dst); + } + return Status::OK(); +} + +bool RowOperationsPBDecoder::HasNext() const { + return !src_.empty(); +} + +namespace { + +void SetupPrototypeRow(const Schema& schema, + ContiguousRow* row) { + for (int i = 0; i < schema.num_columns(); i++) { + const ColumnSchema& col = schema.column(i); + if (col.has_write_default()) { + if (col.is_nullable()) { + row->set_null(i, false); + } + memcpy(row->mutable_cell_ptr(i), col.write_default_value(), col.type_info()->size()); + } else if (col.is_nullable()) { + row->set_null(i, true); + } else { + // No default and not nullable. Therefore this column is required, + // and we'll ensure that it gets during the projection step. + } + } +} +} // anonymous namespace + +// Projector implementation which handles mapping the client column indexes +// to server-side column indexes, ensuring that all of the columns exist, +// and that every required (non-null, non-default) column in the server +// schema is also present in the client. 
+class ClientServerMapping { + public: + ClientServerMapping(const Schema* client_schema, + const Schema* tablet_schema) + : client_schema_(client_schema), + tablet_schema_(tablet_schema), + saw_tablet_col_(tablet_schema->num_columns()) { + } + + Status ProjectBaseColumn(size_t client_col_idx, size_t tablet_col_idx) { + // We should get this called exactly once for every input column, + // since the input columns must be a strict subset of the tablet columns. + DCHECK_EQ(client_to_tablet_.size(), client_col_idx); + DCHECK_LT(tablet_col_idx, saw_tablet_col_.size()); + client_to_tablet_.push_back(tablet_col_idx); + saw_tablet_col_[tablet_col_idx] = 1; + return Status::OK(); + } + + Status ProjectDefaultColumn(size_t client_col_idx) { + // Even if the client provides a default (which it shouldn't), we don't + // want to accept writes with an extra column. + return ProjectExtraColumn(client_col_idx); + } + + Status ProjectExtraColumn(size_t client_col_idx) { + return Status::InvalidArgument( + Substitute("Client provided column $0 not present in tablet", + client_schema_->column(client_col_idx).ToString())); + } + + // Translate from a client schema index to the tablet schema index + int client_to_tablet_idx(int client_idx) const { + DCHECK_LT(client_idx, client_to_tablet_.size()); + return client_to_tablet_[client_idx]; + } + + int num_mapped() const { + return client_to_tablet_.size(); + } + + // Ensure that any required (non-null, non-defaulted) columns from the + // server side schema are found in the client-side schema. If not, + // returns an InvalidArgument. + Status CheckAllRequiredColumnsPresent() { + for (int tablet_col_idx = 0; + tablet_col_idx < tablet_schema_->num_columns(); + tablet_col_idx++) { + const ColumnSchema& col = tablet_schema_->column(tablet_col_idx); + if (!col.has_write_default() && + !col.is_nullable()) { + // All clients must pass this column. 
+ if (!saw_tablet_col_[tablet_col_idx]) { + return Status::InvalidArgument( + "Client missing required column", col.ToString()); + } + } + } + return Status::OK(); + } + + private: + const Schema* const client_schema_; + const Schema* const tablet_schema_; + vector client_to_tablet_; + vector saw_tablet_col_; + DISALLOW_COPY_AND_ASSIGN(ClientServerMapping); +}; + + +Status RowOperationsPBDecoder::DecodeInsert(const uint8_t* prototype_row_storage, + const ClientServerMapping& mapping, + DecodedRowOperation* op) { + const uint8_t* client_isset_map; + const uint8_t* client_null_map; + + // Read the null and isset bitmaps for the client-provided row. + RETURN_NOT_OK(ReadIssetBitmap(&client_isset_map)); + if (client_schema_->has_nullables()) { + RETURN_NOT_OK(ReadNullBitmap(&client_null_map)); + } + + // Allocate a row with the tablet's layout. + uint8_t* tablet_row_storage = reinterpret_cast( + dst_arena_->AllocateBytesAligned(tablet_row_size_, 8)); + if (PREDICT_FALSE(!tablet_row_storage)) { + return Status::RuntimeError("Out of memory"); + } + + // Initialize the new row from the 'prototype' row which has been set + // with all of the server-side default values. This copy may be entirely + // overwritten in the case that all columns are specified, but this is + // still likely faster (and simpler) than looping through all the server-side + // columns to initialize defaults where non-set on every row. + memcpy(tablet_row_storage, prototype_row_storage, tablet_row_size_); + ContiguousRow tablet_row(tablet_schema_, tablet_row_storage); + + // Now handle each of the columns passed by the user, replacing the defaults + // from the prototype. + for (int client_col_idx = 0; client_col_idx < client_schema_->num_columns(); client_col_idx++) { + // Look up the corresponding column from the tablet. We use the server-side + // ColumnSchema object since it has the most up-to-date default, nullability, + // etc. 
+ int tablet_col_idx = mapping.client_to_tablet_idx(client_col_idx); + DCHECK_GE(tablet_col_idx, 0); + const ColumnSchema& col = tablet_schema_->column(tablet_col_idx); + + if (BitmapTest(client_isset_map, client_col_idx)) { + // If the client provided a value for this column, copy it. + + // Copy null-ness, if the server side column is nullable. + bool client_set_to_null = col.is_nullable() && + BitmapTest(client_null_map, client_col_idx); + if (col.is_nullable()) { + tablet_row.set_null(tablet_col_idx, client_set_to_null); + } + // Copy the value if it's not null + if (!client_set_to_null) { + RETURN_NOT_OK(ReadColumn(col, tablet_row.mutable_cell_ptr(tablet_col_idx))); + } + } else { + // If the client didn't provide a value, then the column must either be nullable or + // have a default (which was already set in the prototype row. + + if (PREDICT_FALSE(!(col.is_nullable() || col.has_write_default()))) { + // TODO: change this to return per-row errors. Otherwise if one row in a batch + // is missing a field for some reason, the whole batch will fail. + return Status::InvalidArgument("No value provided for required column", + col.ToString()); + } + } + } + + op->row_data = tablet_row_storage; + return Status::OK(); +} + +Status RowOperationsPBDecoder::DecodeUpdateOrDelete(const ClientServerMapping& mapping, + DecodedRowOperation* op) { + int rowkey_size = tablet_schema_->key_byte_size(); + + const uint8_t* client_isset_map; + const uint8_t* client_null_map; + + // Read the null and isset bitmaps for the client-provided row. + RETURN_NOT_OK(ReadIssetBitmap(&client_isset_map)); + if (client_schema_->has_nullables()) { + RETURN_NOT_OK(ReadNullBitmap(&client_null_map)); + } + + // Allocate space for the row key. + uint8_t* rowkey_storage = reinterpret_cast( + dst_arena_->AllocateBytesAligned(rowkey_size, 8)); + if (PREDICT_FALSE(!rowkey_storage)) { + return Status::RuntimeError("Out of memory"); + } + + // We're passing the full schema instead of the key schema here. 
+ // That's OK because the keys come at the bottom. We lose some bounds + // checking in debug builds, but it avoids an extra copy of the key schema. + ContiguousRow rowkey(tablet_schema_, rowkey_storage); + + // First process the key columns. + int client_col_idx = 0; + for (; client_col_idx < client_schema_->num_key_columns(); client_col_idx++) { + // Look up the corresponding column from the tablet. We use the server-side + // ColumnSchema object since it has the most up-to-date default, nullability, + // etc. + DCHECK_EQ(mapping.client_to_tablet_idx(client_col_idx), + client_col_idx) << "key columns should match"; + int tablet_col_idx = client_col_idx; + + const ColumnSchema& col = tablet_schema_->column(tablet_col_idx); + if (PREDICT_FALSE(!BitmapTest(client_isset_map, client_col_idx))) { + return Status::InvalidArgument("No value provided for key column", + col.ToString()); + } + + bool client_set_to_null = client_schema_->has_nullables() && + BitmapTest(client_null_map, client_col_idx); + if (PREDICT_FALSE(client_set_to_null)) { + return Status::InvalidArgument("NULL values not allowed for key column", + col.ToString()); + } + + RETURN_NOT_OK(ReadColumn(col, rowkey.mutable_cell_ptr(tablet_col_idx))); + } + op->row_data = rowkey_storage; + + // Now we process the rest of the columns: + // For UPDATE, we expect at least one other column to be set, indicating the + // update to perform. + // For DELETE, we expect no other columns to be set (and we verify that). + if (op->type == RowOperationsPB::UPDATE) { + faststring buf; + RowChangeListEncoder rcl_encoder(&buf); + + // Now process the rest of columns as updates. 
+ for (; client_col_idx < client_schema_->num_columns(); client_col_idx++) { + int tablet_col_idx = mapping.client_to_tablet_idx(client_col_idx); + DCHECK_GE(tablet_col_idx, 0); + const ColumnSchema& col = tablet_schema_->column(tablet_col_idx); + + if (BitmapTest(client_isset_map, client_col_idx)) { + bool client_set_to_null = client_schema_->has_nullables() && + BitmapTest(client_null_map, client_col_idx); + uint8_t scratch[kLargestTypeSize]; + uint8_t* val_to_add; + if (!client_set_to_null) { + RETURN_NOT_OK(ReadColumn(col, scratch)); + val_to_add = scratch; + } else { + + if (PREDICT_FALSE(!col.is_nullable())) { + return Status::InvalidArgument("NULL value not allowed for non-nullable column", + col.ToString()); + } + val_to_add = nullptr; + } + rcl_encoder.AddColumnUpdate(col, tablet_schema_->column_id(tablet_col_idx), val_to_add); + } + } + + if (PREDICT_FALSE(buf.size() == 0)) { + // No actual column updates specified! + return Status::InvalidArgument("No fields updated, key is", + tablet_schema_->DebugRowKey(rowkey)); + } + + // Copy the row-changelist to the arena. + uint8_t* rcl_in_arena = reinterpret_cast( + dst_arena_->AllocateBytesAligned(buf.size(), 8)); + if (PREDICT_FALSE(rcl_in_arena == nullptr)) { + return Status::RuntimeError("Out of memory allocating RCL"); + } + memcpy(rcl_in_arena, buf.data(), buf.size()); + op->changelist = RowChangeList(Slice(rcl_in_arena, buf.size())); + } else if (op->type == RowOperationsPB::DELETE) { + + // Ensure that no other columns are set. 
+ for (; client_col_idx < client_schema_->num_columns(); client_col_idx++) { + if (BitmapTest(client_isset_map, client_col_idx)) { + int tablet_col_idx = mapping.client_to_tablet_idx(client_col_idx); + DCHECK_GE(tablet_col_idx, 0); + const ColumnSchema& col = tablet_schema_->column(tablet_col_idx); + + return Status::InvalidArgument("DELETE should not have a value for column", + col.ToString()); + } + } + op->changelist = RowChangeList::CreateDelete(); + } else { + LOG(FATAL) << "Should only call this method with UPDATE or DELETE"; + } + + return Status::OK(); +} + +Status RowOperationsPBDecoder::DecodeSplitRow(const ClientServerMapping& mapping, + DecodedRowOperation* op) { + op->split_row.reset(new KuduPartialRow(tablet_schema_)); + + const uint8_t* client_isset_map; + const uint8_t* client_null_map; + + // Read the null and isset bitmaps for the client-provided row. + RETURN_NOT_OK(ReadIssetBitmap(&client_isset_map)); + if (client_schema_->has_nullables()) { + RETURN_NOT_OK(ReadNullBitmap(&client_null_map)); + } + + // Now handle each of the columns passed by the user. + for (int client_col_idx = 0; client_col_idx < client_schema_->num_columns(); client_col_idx++) { + // Look up the corresponding column from the tablet. We use the server-side + // ColumnSchema object since it has the most up-to-date default, nullability, + // etc. + int tablet_col_idx = mapping.client_to_tablet_idx(client_col_idx); + DCHECK_GE(tablet_col_idx, 0); + const ColumnSchema& col = tablet_schema_->column(tablet_col_idx); + + if (BitmapTest(client_isset_map, client_col_idx)) { + // If the client provided a value for this column, copy it. 
+ Slice column_slice; + RETURN_NOT_OK(GetColumnSlice(col, &column_slice)); + const uint8_t* data; + if (col.type_info()->physical_type() == BINARY) { + data = reinterpret_cast(&column_slice); + } else { + data = column_slice.data(); + } + RETURN_NOT_OK(op->split_row->Set(tablet_col_idx, data)); + } + } + return Status::OK(); +} + +Status RowOperationsPBDecoder::DecodeOperations(vector* ops) { + // TODO: there's a bug here, in that if a client passes some column + // in its schema that has been deleted on the server, it will fail + // even if the client never actually specified any values for it. + // For example, a DBA might do a thorough audit that no one is using + // some column anymore, and then drop the column, expecting it to be + // compatible, but all writes would start failing until clients + // refreshed their schema. + // See DISABLED_TestProjectUpdatesSubsetOfColumns + CHECK(!client_schema_->has_column_ids()); + DCHECK(tablet_schema_->has_column_ids()); + ClientServerMapping mapping(client_schema_, tablet_schema_); + RETURN_NOT_OK(client_schema_->GetProjectionMapping(*tablet_schema_, &mapping)); + DCHECK_EQ(mapping.num_mapped(), client_schema_->num_columns()); + RETURN_NOT_OK(mapping.CheckAllRequiredColumnsPresent()); + + // Make a "prototype row" which has all the defaults filled in. We can copy + // this to create a starting point for each row as we decode it, with + // all the defaults in place without having to loop. 
+ uint8_t prototype_row_storage[tablet_row_size_]; + ContiguousRow prototype_row(tablet_schema_, prototype_row_storage); + SetupPrototypeRow(*tablet_schema_, &prototype_row); + + while (HasNext()) { + RowOperationsPB::Type type; + RETURN_NOT_OK(ReadOpType(&type)); + DecodedRowOperation op; + op.type = type; + + switch (type) { + case RowOperationsPB::UNKNOWN: + return Status::NotSupported("Unknown row operation type"); + case RowOperationsPB::INSERT: + RETURN_NOT_OK(DecodeInsert(prototype_row_storage, mapping, &op)); + break; + case RowOperationsPB::UPDATE: + case RowOperationsPB::DELETE: + RETURN_NOT_OK(DecodeUpdateOrDelete(mapping, &op)); + break; + case RowOperationsPB::SPLIT_ROW: + RETURN_NOT_OK(DecodeSplitRow(mapping, &op)); + break; + } + + ops->push_back(op); + } + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/common/row_operations.h b/src/kudu/common/row_operations.h new file mode 100644 index 000000000000..e7ee722867f2 --- /dev/null +++ b/src/kudu/common/row_operations.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_ROW_OPERATIONS_H +#define KUDU_COMMON_ROW_OPERATIONS_H + +#include +#include +#include + +#include "kudu/common/row_changelist.h" +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Arena; +class KuduPartialRow; +class Schema; + +class ClientServerMapping; + +class RowOperationsPBEncoder { + public: + explicit RowOperationsPBEncoder(RowOperationsPB* pb); + ~RowOperationsPBEncoder(); + + // Append this partial row to the protobuf. + void Add(RowOperationsPB::Type type, const KuduPartialRow& row); + + private: + RowOperationsPB* pb_; + + DISALLOW_COPY_AND_ASSIGN(RowOperationsPBEncoder); +}; + +struct DecodedRowOperation { + RowOperationsPB::Type type; + + // For INSERT, the whole projected row. + // For UPDATE or DELETE, the row key. + const uint8_t* row_data; + + // For UPDATE and DELETE types, the changelist + RowChangeList changelist; + + // For SPLIT_ROW, the partial row to split on. 
+ std::shared_ptr split_row; + + std::string ToString(const Schema& schema) const; +}; + +class RowOperationsPBDecoder { + public: + RowOperationsPBDecoder(const RowOperationsPB* pb, + const Schema* client_schema, + const Schema* tablet_schema, + Arena* dst_arena); + ~RowOperationsPBDecoder(); + + Status DecodeOperations(std::vector* ops); + + private: + Status ReadOpType(RowOperationsPB::Type* type); + Status ReadIssetBitmap(const uint8_t** bitmap); + Status ReadNullBitmap(const uint8_t** null_bm); + Status GetColumnSlice(const ColumnSchema& col, Slice* slice); + Status ReadColumn(const ColumnSchema& col, uint8_t* dst); + bool HasNext() const; + + Status DecodeInsert(const uint8_t* prototype_row_storage, + const ClientServerMapping& mapping, + DecodedRowOperation* op); + //------------------------------------------------------------ + // Serialization/deserialization support + //------------------------------------------------------------ + + // Decode the next encoded operation, which must be UPDATE or DELETE. + Status DecodeUpdateOrDelete(const ClientServerMapping& mapping, + DecodedRowOperation* op); + + // Decode the next encoded operation, which must be SPLIT_KEY. + Status DecodeSplitRow(const ClientServerMapping& mapping, + DecodedRowOperation* op); + + const RowOperationsPB* const pb_; + const Schema* const client_schema_; + const Schema* const tablet_schema_; + Arena* const dst_arena_; + + const int bm_size_; + const int tablet_row_size_; + Slice src_; + + + DISALLOW_COPY_AND_ASSIGN(RowOperationsPBDecoder); +}; +} // namespace kudu +#endif /* KUDU_COMMON_ROW_OPERATIONS_H */ diff --git a/src/kudu/common/rowblock.cc b/src/kudu/common/rowblock.cc new file mode 100644 index 000000000000..13ec37bcef6b --- /dev/null +++ b/src/kudu/common/rowblock.cc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "kudu/common/rowblock.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +SelectionVector::SelectionVector(size_t row_capacity) + : bytes_capacity_(BitmapSize(row_capacity)), + n_rows_(row_capacity), + n_bytes_(bytes_capacity_), + bitmap_(new uint8_t[n_bytes_]) { + CHECK_GT(n_bytes_, 0); +} + +void SelectionVector::Resize(size_t n_rows) { + size_t new_bytes = BitmapSize(n_rows); + CHECK_LE(new_bytes, bytes_capacity_); + n_rows_ = n_rows; + n_bytes_ = new_bytes; + // Pad with zeroes up to the next byte in order to give CountSelected() + // and AnySelected() the assumption that the size is an even byte + size_t bits_in_last_byte = n_rows & 7; + if (bits_in_last_byte > 0) { + BitmapChangeBits(&bitmap_[0], n_rows_, 8 - bits_in_last_byte, 0); + } +} + +size_t SelectionVector::CountSelected() const { + return Bits::Count(&bitmap_[0], n_bytes_); +} + +bool SelectionVector::AnySelected() const { + size_t rem = n_bytes_; + const uint32_t *p32 = reinterpret_cast( + &bitmap_[0]); + while (rem >= 4) { + if (*p32 != 0) { + return true; + } + p32++; + rem -= 4; + } + + const uint8_t *p8 = reinterpret_cast(p32); + while (rem > 0) { + if (*p8 != 0) { + return true; + } + p8++; + rem--; + } + + return false; +} + +////////////////////////////// +// RowBlock 
+////////////////////////////// +RowBlock::RowBlock(const Schema &schema, + size_t nrows, + Arena *arena) + : schema_(schema), + columns_data_(schema.num_columns()), + column_null_bitmaps_(schema.num_columns()), + row_capacity_(nrows), + nrows_(nrows), + arena_(arena), + sel_vec_(nrows) { + CHECK_GT(row_capacity_, 0); + + size_t bitmap_size = BitmapSize(row_capacity_); + for (size_t i = 0; i < schema.num_columns(); ++i) { + const ColumnSchema& col_schema = schema.column(i); + size_t col_size = row_capacity_ * col_schema.type_info()->size(); + columns_data_[i] = new uint8_t[col_size]; + + if (col_schema.is_nullable()) { + column_null_bitmaps_[i] = new uint8_t[bitmap_size]; + } + } +} + +RowBlock::~RowBlock() { + for (uint8_t *column_data : columns_data_) { + delete[] column_data; + } + for (uint8_t *bitmap_data : column_null_bitmaps_) { + delete[] bitmap_data; + } +} + +void RowBlock::Resize(size_t new_size) { + CHECK_LE(new_size, row_capacity_); + nrows_ = new_size; + sel_vec_.Resize(new_size); +} + +} // namespace kudu diff --git a/src/kudu/common/rowblock.h b/src/kudu/common/rowblock.h new file mode 100644 index 000000000000..672849cc3a4f --- /dev/null +++ b/src/kudu/common/rowblock.h @@ -0,0 +1,333 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_ROWBLOCK_H +#define KUDU_COMMON_ROWBLOCK_H + +#include +#include "kudu/common/columnblock.h" +#include "kudu/common/schema.h" +#include "kudu/common/row.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +class RowBlockRow; + +// Bit-vector representing the selection status of each row in a row block. +// +// When scanning through data, a 1 bit in the selection vector indicates that +// the given row is live and has passed all predicates. +class SelectionVector { + public: + // Construct a new vector. The bits are initially in an indeterminate state. + // Call SetAllTrue() if you require all rows to be initially selected. + explicit SelectionVector(size_t row_capacity); + + // Construct a vector which shares the underlying memory of another vector, + // but only exposes up to a given prefix of the number of rows. + // + // Note that mutating the resulting bitmap may arbitrarily mutate the "extra" + // bits of the 'other' bitmap. + // + // The underlying bytes must not be deallocated or else this object will become + // invalid. + SelectionVector(SelectionVector *other, size_t prefix_rows); + + // Resize the selection vector to the given number of rows. + // This size must be <= the allocated capacity. + // + // Ensures that all rows for indices < n_rows are unmodified. + void Resize(size_t n_rows); + + // Return the number of selected rows. + size_t CountSelected() const; + + // Return true if any rows are selected, false if size 0. + // This is equivalent to (CountSelected() > 0), but faster. 
+  bool AnySelected() const;
+
+  bool IsRowSelected(size_t row) const {
+    DCHECK_LT(row, n_rows_);
+    return BitmapTest(&bitmap_[0], row);
+  }
+
+  void SetRowSelected(size_t row) {
+    DCHECK_LT(row, n_rows_);
+    BitmapSet(&bitmap_[0], row);
+  }
+
+  void SetRowUnselected(size_t row) {
+    DCHECK_LT(row, n_rows_);
+    BitmapClear(&bitmap_[0], row);
+  }
+
+  uint8_t *mutable_bitmap() {
+    return &bitmap_[0];
+  }
+
+  const uint8_t *bitmap() const {
+    return &bitmap_[0];
+  }
+
+  // Set all bits in the bitmap to 1
+  void SetAllTrue() {
+    // Initially all rows should be selected.
+    memset(&bitmap_[0], 0xff, n_bytes_);
+    // the last byte in the bitmap may have a few extra bits - need to
+    // clear those
+
+    int trailer_bits = 8 - (n_rows_ % 8);
+    if (trailer_bits != 8) {
+      bitmap_[n_bytes_ - 1] >>= trailer_bits;
+    }
+  }
+
+  // Set all bits in the bitmap to 0
+  void SetAllFalse() {
+    memset(&bitmap_[0], 0, n_bytes_);
+  }
+
+  size_t nrows() const { return n_rows_; }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(SelectionVector);
+
+  // The number of allocated bytes in bitmap_
+  size_t bytes_capacity_;
+
+  size_t n_rows_;
+  size_t n_bytes_;
+
+  gscoped_array<uint8_t> bitmap_;
+};
+
+// A block of decoded rows.
+// Wrapper around a buffer, which keeps the buffer's size, associated arena,
+// and schema. Provides convenience accessors for indexing by row, column, etc.
+//
+// NOTE: TODO: We don't have any separate class for ConstRowBlock and ConstColumnBlock
+// vs RowBlock and ColumnBlock. So, we use "const" in various places to either
+// mean const-ness of the wrapper structure vs const-ness of the referred-to data.
+// Typically in C++ this is done with separate wrapper classes for const vs non-const
+// referred-to data, but that would require a lot of duplication elsewhere in the code,
+// so for now, we just use convention: if you have a RowBlock or ColumnBlock parameter
+// that you expect to be modifying, use a "RowBlock *param". Otherwise, use a
+// "const RowBlock& param".
Just because you _could_ modify the referred-to contents
+// of the latter doesn't mean you _should_.
+class RowBlock {
+ public:
+  RowBlock(const Schema &schema,
+           size_t nrows,
+           Arena *arena);
+  ~RowBlock();
+
+  // Resize the block to the given number of rows.
+  // This size must be <= the allocated capacity row_capacity().
+  //
+  // Ensures that all rows for indices < n_rows are unmodified.
+  void Resize(size_t n_rows);
+
+  size_t row_capacity() const {
+    return row_capacity_;
+  }
+
+  RowBlockRow row(size_t idx) const;
+
+  const Schema &schema() const { return schema_; }
+  Arena *arena() const { return arena_; }
+
+  ColumnBlock column_block(size_t col_idx) const {
+    return column_block(col_idx, nrows_);
+  }
+
+  ColumnBlock column_block(size_t col_idx, size_t nrows) const {
+    DCHECK_LE(nrows, nrows_);
+
+    const ColumnSchema& col_schema = schema_.column(col_idx);
+    uint8_t *col_data = columns_data_[col_idx];
+    uint8_t *nulls_bitmap = column_null_bitmaps_[col_idx];
+
+    return ColumnBlock(col_schema.type_info(), nulls_bitmap, col_data, nrows, arena_);
+  }
+
+  // Return the base pointer for the given column's data.
+  //
+  // This is used by the codegen code in preference to the "nicer" column_block APIs,
+  // because the codegenned code knows the column's type sizes statically, and thus
+  // computing the pointers by itself ends up avoiding a multiply instruction.
+  uint8_t* column_data_base_ptr(size_t col_idx) const {
+    DCHECK_LT(col_idx, columns_data_.size());
+    return columns_data_[col_idx];
+  }
+
+  // Return the number of rows in the row block. Note that this includes
+  // rows which were filtered out by the selection vector.
+  size_t nrows() const { return nrows_; }
+
+  // Zero the memory pointed to by this row block.
+  // This physically zeros the memory, so is not efficient - mostly useful
+  // from unit tests.
+ void ZeroMemory() { + size_t bitmap_size = BitmapSize(row_capacity_); + for (size_t i = 0; i < schema_.num_columns(); ++i) { + const ColumnSchema& col_schema = schema_.column(i); + size_t col_size = col_schema.type_info()->size() * row_capacity_; + memset(columns_data_[i], '\0', col_size); + + if (column_null_bitmaps_[i] != NULL) { + memset(column_null_bitmaps_[i], '\0', bitmap_size); + } + } + } + + // Return the selection vector which indicates which rows have passed + // predicates so far during evaluation of this block of rows. + // + // At the beginning of each batch, the vector is set to all 1s, and + // as predicates or deletions make rows invalid, they are set to 0s. + // After a batch has completed, only those rows with associated true + // bits in the selection vector are valid results for the scan. + SelectionVector *selection_vector() { + return &sel_vec_; + } + + const SelectionVector *selection_vector() const { + return &sel_vec_; + } + + private: + DISALLOW_COPY_AND_ASSIGN(RowBlock); + + static size_t RowBlockSize(const Schema& schema, size_t nrows) { + size_t block_size = schema.num_columns() * sizeof(size_t); + size_t bitmap_size = BitmapSize(nrows); + for (size_t col = 0; col < schema.num_columns(); col++) { + const ColumnSchema& col_schema = schema.column(col); + block_size += nrows * col_schema.type_info()->size(); + if (col_schema.is_nullable()) + block_size += bitmap_size; + } + return block_size; + } + + Schema schema_; + std::vector columns_data_; + std::vector column_null_bitmaps_; + + // The maximum number of rows that can be stored in our allocated buffer. + size_t row_capacity_; + + // The number of rows currently being processed in this block. + // nrows_ <= row_capacity_ + size_t nrows_; + + Arena *arena_; + + // The bitmap indicating which rows are valid in this block. + // Deleted rows or rows which have failed to pass predicates will be zeroed + // in the bitmap, and thus not returned to the end user. 
+ SelectionVector sel_vec_; +}; + +// Provides an abstraction to interact with a RowBlock row. +// Example usage: +// RowBlock row_block(schema, 10, NULL); +// RowBlockRow row = row_block.row(5); // Get row 5 +// void *cell_data = row.cell_ptr(3); // Get column 3 of row 5 +// ... +class RowBlockRow { + public: + typedef ColumnBlock::Cell Cell; + + explicit RowBlockRow(const RowBlock *row_block = NULL, size_t row_index = 0) + : row_block_(row_block), row_index_(row_index) { + } + + RowBlockRow *Reset(const RowBlock *row_block, size_t row_index) { + row_block_ = row_block; + row_index_ = row_index; + return this; + } + + const RowBlock* row_block() const { + return row_block_; + } + + size_t row_index() const { + return row_index_; + } + + const Schema* schema() const { + return &row_block_->schema(); + } + + bool is_null(size_t col_idx) const { + return column_block(col_idx).is_null(row_index_); + } + + uint8_t *mutable_cell_ptr(size_t col_idx) const { + return const_cast(cell_ptr(col_idx)); + } + + const uint8_t *cell_ptr(size_t col_idx) const { + return column_block(col_idx).cell_ptr(row_index_); + } + + const uint8_t *nullable_cell_ptr(size_t col_idx) const { + return column_block(col_idx).nullable_cell_ptr(row_index_); + } + + Cell cell(size_t col_idx) const { + return row_block_->column_block(col_idx).cell(row_index_); + } + + ColumnBlock column_block(size_t col_idx) const { + return row_block_->column_block(col_idx); + } + + // Mark this row as unselected in the selection vector. + void SetRowUnselected() { + // TODO: const-ness issues since this class holds a const RowBlock *. 
+ // hack around this for now + SelectionVector *vec = const_cast(row_block_->selection_vector()); + vec->SetRowUnselected(row_index_); + } + +#ifndef NDEBUG + void OverwriteWithPattern(StringPiece pattern) { + const Schema& schema = row_block_->schema(); + for (size_t col = 0; col < schema.num_columns(); col++) { + row_block_->column_block(col).OverwriteWithPattern(row_index_, pattern); + } + } +#endif + + private: + const RowBlock *row_block_; + size_t row_index_; +}; + +inline RowBlockRow RowBlock::row(size_t idx) const { + return RowBlockRow(this, idx); +} + +} // namespace kudu + +#endif diff --git a/src/kudu/common/rowid.h b/src/kudu/common/rowid.h new file mode 100644 index 000000000000..43be5db67b3c --- /dev/null +++ b/src/kudu/common/rowid.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_ROWID_H +#define KUDU_COMMON_ROWID_H + +#include + +#include "kudu/util/memcmpable_varint.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" + +namespace kudu { + +// Type to represent the ordinal ID of a row within a RowSet. +// This type should be used instead of uint32_t when referring to row indexes +// for better clarity. 
+// +// TODO: Currently we only support up to 4B rows per RowSet - some work +// is necessary to support larger RowSets without overflow. +typedef uint32_t rowid_t; + +// Substitution to use in printf() format arguments. +#define ROWID_PRINT_FORMAT PRIu32 + +// Serialize a rowid into the 'dst' buffer. +// The serialized form of row IDs is comparable using memcmp(). +inline void EncodeRowId(faststring *dst, rowid_t rowid) { + PutMemcmpableVarint64(dst, rowid); +} + + +// Decode a varint-encoded rowid from the given Slice, mutating the +// Slice to advance past the decoded data upon return. +// +// Returns false if the Slice is too short. +inline bool DecodeRowId(Slice *s, rowid_t *rowid) { + uint64_t tmp; + bool ret = GetMemcmpableVarint64(s, &tmp); + DCHECK_LT(tmp, 1ULL << 32); + *rowid = tmp; + return ret; +} + +} // namespace kudu + +#endif diff --git a/src/kudu/common/scan_predicate.cc b/src/kudu/common/scan_predicate.cc new file mode 100644 index 000000000000..e919cae67ff9 --- /dev/null +++ b/src/kudu/common/scan_predicate.cc @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/common/scan_predicate.h" + +#include + +#include "kudu/common/rowblock.h" +#include "kudu/common/types.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +using std::string; + +ValueRange::ValueRange(const TypeInfo* type, + const void* lower_bound, + const void* upper_bound) + : type_info_(type), + lower_bound_(lower_bound), + upper_bound_(upper_bound) { + CHECK(has_lower_bound() || has_upper_bound()) + << "range predicate has no bounds"; +} + +bool ValueRange::IsEquality() const { + if (has_lower_bound() && has_upper_bound()) { + return type_info_->Compare(upper_bound(), lower_bound()) == 0; + } + return false; +} + +bool ValueRange::ContainsCell(const void* cell) const { + if (has_lower_bound() && type_info_->Compare(cell, lower_bound()) < 0) { + return false; + } + if (has_upper_bound() && type_info_->Compare(cell, upper_bound()) > 0) { + return false; + } + return true; +} + +//////////////////////////////////////////////////////////// + +ColumnRangePredicate::ColumnRangePredicate(ColumnSchema col, + const void* lower_bound, + const void* upper_bound) + : col_(std::move(col)), + range_(col_.type_info(), lower_bound, upper_bound) {} + +void ColumnRangePredicate::Evaluate(RowBlock* block, SelectionVector* vec) const { + int col_idx = block->schema().find_column(col_.name()); + CHECK_GE(col_idx, 0) << "bad col: " << col_.ToString(); + + ColumnBlock cblock(block->column_block(col_idx, block->nrows())); + + // TODO: this is all rather slow, could probably push down all the way + // to the TypeInfo so we only make one virtual call, or use codegen. + // Not concerned for now -- plan of record is to eventually embed Impala + // expression evaluation somewhere here, so this is just a stub. 
+ if (cblock.is_nullable()) { + for (size_t i = 0; i < block->nrows(); i++) { + if (!vec->IsRowSelected(i)) continue; + const void *cell = cblock.nullable_cell_ptr(i); + if (cell == nullptr || !range_.ContainsCell(cell)) { + BitmapClear(vec->mutable_bitmap(), i); + } + } + } else { + for (size_t i = 0; i < block->nrows(); i++) { + if (!vec->IsRowSelected(i)) continue; + const void *cell = cblock.cell_ptr(i); + if (!range_.ContainsCell(cell)) { + BitmapClear(vec->mutable_bitmap(), i); + } + } + } +} + +string ColumnRangePredicate::ToString() const { + if (range_.has_lower_bound() && range_.has_upper_bound()) { + return StringPrintf("(`%s` BETWEEN %s AND %s)", col_.name().c_str(), + col_.Stringify(range_.lower_bound()).c_str(), + col_.Stringify(range_.upper_bound()).c_str()); + } else if (range_.has_lower_bound()) { + return StringPrintf("(`%s` >= %s)", col_.name().c_str(), + col_.Stringify(range_.lower_bound()).c_str()); + } else if (range_.has_upper_bound()) { + return StringPrintf("(`%s` <= %s)", col_.name().c_str(), + col_.Stringify(range_.upper_bound()).c_str()); + } else { + LOG(FATAL) << "Cannot reach here"; + return string("Does not reach here"); + } +} + +} // namespace kudu diff --git a/src/kudu/common/scan_predicate.h b/src/kudu/common/scan_predicate.h new file mode 100644 index 000000000000..6d52b7c6cfcf --- /dev/null +++ b/src/kudu/common/scan_predicate.h @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_COMMON_SCAN_PREDICATE_H +#define KUDU_COMMON_SCAN_PREDICATE_H + +#include + +#include + +#include "kudu/common/schema.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/faststring.h" + +namespace kudu { + +using std::string; + +class RowBlock; +class SelectionVector; + +class ValueRange { + public: + // Construct a new column range predicate. + // + // The min_value and upper_bound pointers should point to storage + // which represents a constant cell value to be used as a range. + // The range is inclusive on both ends. + // The cells are not copied by this object, so should remain unchanged + // for the lifetime of this object. + // + // If either optional is unspecified (i.e. NULL), then the range is + // open on that end. + // + // A range must be bounded on at least one end. + ValueRange(const TypeInfo* type, + const void* lower_bound, + const void* upper_bound); + + bool has_lower_bound() const { + return lower_bound_; + } + + bool has_upper_bound() const { + return upper_bound_; + } + + const void* lower_bound() const { + return lower_bound_; + } + + const void* upper_bound() const { + return upper_bound_; + } + + bool IsEquality() const; + + bool ContainsCell(const void* cell) const; + + private: + const TypeInfo* type_info_; + const void* lower_bound_; + const void* upper_bound_; +}; + +// Predicate which evaluates to true when the value for a given column +// is within a specified range. +// +// TODO: extract an interface for this once it's clearer what the interface should +// look like. 
Column range is not the only predicate in the world.
+class ColumnRangePredicate {
+ public:
+
+  // Construct a new column range predicate.
+  // The lower_bound and upper_bound pointers should point to storage
+  // which represents a constant cell value to be used as a range.
+  // The range is inclusive on both ends.
+  // If either optional is unspecified (i.e. NULL), then the range is
+  // open on that end.
+  ColumnRangePredicate(ColumnSchema col, const void* lower_bound,
+                       const void* upper_bound);
+
+  const ColumnSchema &column() const {
+    return col_;
+  }
+
+  string ToString() const;
+
+  // Return the value range for which this predicate passes.
+  const ValueRange &range() const { return range_; }
+
+ private:
+  // For Evaluate.
+  friend class MaterializingIterator;
+  friend class PredicateEvaluatingIterator;
+  FRIEND_TEST(TestPredicate, TestColumnRange);
+  FRIEND_TEST(TestPredicate, TestDontEvalauteOnUnselectedRows);
+
+  // Evaluate the predicate on every row in the rowblock.
+  //
+  // This is evaluated as an 'AND' with the current contents of *sel:
+  // - wherever the predicate evaluates false, set the appropriate bit in the selection
+  //   vector to 0.
+  // - If the predicate evaluates true, does not make any change to the
+  //   selection vector.
+  //
+  // On any rows where the current value of *sel is false, the predicate evaluation
+  // may be skipped.
+  //
+  // NOTE: the evaluation result is stored into '*sel' which may or may not be the
+  // same vector as block->selection_vector().
+  void Evaluate(RowBlock *block, SelectionVector *sel) const;
+
+  ColumnSchema col_;
+  ValueRange range_;
+};
+
+} // namespace kudu
+#endif
diff --git a/src/kudu/common/scan_spec.cc b/src/kudu/common/scan_spec.cc
new file mode 100644
index 000000000000..9419508edaf4
--- /dev/null
+++ b/src/kudu/common/scan_spec.cc
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/scan_spec.h" + +#include +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/escaping.h" + +using std::vector; +using std::string; + +namespace kudu { + +void ScanSpec::AddPredicate(const ColumnRangePredicate &pred) { + predicates_.push_back(pred); +} + +void ScanSpec::SetLowerBoundKey(const EncodedKey* key) { + if (lower_bound_key_ == nullptr || + key->encoded_key().compare(lower_bound_key_->encoded_key()) > 0) { + lower_bound_key_ = key; + } +} +void ScanSpec::SetExclusiveUpperBoundKey(const EncodedKey* key) { + if (exclusive_upper_bound_key_ == nullptr || + key->encoded_key().compare(exclusive_upper_bound_key_->encoded_key()) < 0) { + exclusive_upper_bound_key_ = key; + } +} + +void ScanSpec::SetLowerBoundPartitionKey(const Slice& partitionKey) { + if (partitionKey.compare(lower_bound_partition_key_) > 0) { + lower_bound_partition_key_ = partitionKey.ToString(); + } +} + +void ScanSpec::SetExclusiveUpperBoundPartitionKey(const Slice& partitionKey) { + if (exclusive_upper_bound_partition_key_.empty() || + (!partitionKey.empty() && partitionKey.compare(exclusive_upper_bound_partition_key_) < 0)) { + exclusive_upper_bound_partition_key_ = partitionKey.ToString(); + } +} + +string ScanSpec::ToString() const { + return 
ToStringWithOptionalSchema(nullptr); +} + +string ScanSpec::ToStringWithSchema(const Schema& s) const { + return ToStringWithOptionalSchema(&s); +} + +string ScanSpec::ToStringWithOptionalSchema(const Schema* s) const { + vector preds; + + if (lower_bound_key_ || exclusive_upper_bound_key_) { + if (s) { + preds.push_back(EncodedKey::RangeToStringWithSchema( + lower_bound_key_, + exclusive_upper_bound_key_, + *s)); + } else { + preds.push_back(EncodedKey::RangeToString( + lower_bound_key_, + exclusive_upper_bound_key_)); + } + } + + for (const ColumnRangePredicate& pred : predicates_) { + preds.push_back(pred.ToString()); + } + return JoinStrings(preds, "\n"); +} + +} // namespace kudu diff --git a/src/kudu/common/scan_spec.h b/src/kudu/common/scan_spec.h new file mode 100644 index 000000000000..415cc28701d1 --- /dev/null +++ b/src/kudu/common/scan_spec.h @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_SCAN_SPEC_H
+#define KUDU_COMMON_SCAN_SPEC_H
+
+#include <string>
+#include <vector>
+
+#include "kudu/common/scan_predicate.h"
+#include "kudu/common/encoded_key.h"
+
+namespace kudu {
+
+using std::vector;
+
+class ScanSpec {
+ public:
+  ScanSpec()
+    : lower_bound_key_(NULL),
+      exclusive_upper_bound_key_(NULL),
+      lower_bound_partition_key_(),
+      exclusive_upper_bound_partition_key_(),
+      cache_blocks_(true) {
+  }
+
+  typedef vector<ColumnRangePredicate> PredicateList;
+
+  void AddPredicate(const ColumnRangePredicate &pred);
+
+  // Set the lower bound (inclusive) primary key for the scan.
+  // Does not take ownership of 'key', which must remain valid.
+  // If called multiple times, the most restrictive key will be used.
+  void SetLowerBoundKey(const EncodedKey* key);
+
+  // Set the upper bound (exclusive) primary key for the scan.
+  // Does not take ownership of 'key', which must remain valid.
+  // If called multiple times, the most restrictive key will be used.
+  void SetExclusiveUpperBoundKey(const EncodedKey* key);
+
+  // Sets the lower bound (inclusive) partition key for the scan.
+  //
+  // The scan spec makes a copy of 'slice'; the caller may free it afterward.
+  //
+  // Only used in the client.
+  void SetLowerBoundPartitionKey(const Slice& slice);
+
+  // Sets the upper bound (exclusive) partition key for the scan.
+  //
+  // The scan spec makes a copy of 'slice'; the caller may free it afterward.
+  //
+  // Only used in the client.
+  void SetExclusiveUpperBoundPartitionKey(const Slice& slice);
+
+  const vector<ColumnRangePredicate> &predicates() const {
+    return predicates_;
+  }
+
+  // Return a pointer to the list of predicates in this scan spec.
+  //
+  // Callers may use this during predicate pushdown to remove predicates
+  // from their caller if they're able to apply them lower down the
+  // iterator tree.
+ vector *mutable_predicates() { + return &predicates_; + } + + const EncodedKey* lower_bound_key() const { + return lower_bound_key_; + } + const EncodedKey* exclusive_upper_bound_key() const { + return exclusive_upper_bound_key_; + } + + const string& lower_bound_partition_key() const { + return lower_bound_partition_key_; + } + const string& exclusive_upper_bound_partition_key() const { + return exclusive_upper_bound_partition_key_; + } + + bool cache_blocks() const { + return cache_blocks_; + } + + void set_cache_blocks(bool cache_blocks) { + cache_blocks_ = cache_blocks; + } + + std::string ToString() const; + std::string ToStringWithSchema(const Schema& s) const; + + private: + // Helper for the ToString*() methods. 's' may be NULL. + std::string ToStringWithOptionalSchema(const Schema* s) const; + + vector predicates_; + const EncodedKey* lower_bound_key_; + const EncodedKey* exclusive_upper_bound_key_; + std::string lower_bound_partition_key_; + std::string exclusive_upper_bound_partition_key_; + bool cache_blocks_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/common/schema-test.cc b/src/kudu/common/schema-test.cc new file mode 100644 index 000000000000..b29f04dac39d --- /dev/null +++ b/src/kudu/common/schema-test.cc @@ -0,0 +1,421 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/schema.h" + +#include +#include +#include +#include + +#include "kudu/common/key_encoder.h" +#include "kudu/common/row.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" + +namespace kudu { +namespace tablet { + +using std::unordered_map; +using std::vector; +using strings::Substitute; + +// Copy a row and its referenced data into the given Arena. +static Status CopyRowToArena(const Slice &row, + const Schema &schema, + Arena *dst_arena, + ContiguousRow *copied) { + Slice row_data; + + // Copy the direct row data to arena + if (!dst_arena->RelocateSlice(row, &row_data)) { + return Status::IOError("no space for row data in arena"); + } + + copied->Reset(row_data.mutable_data()); + RETURN_NOT_OK(RelocateIndirectDataToArena(copied, dst_arena)); + return Status::OK(); +} + + + +// Test basic functionality of Schema definition +TEST(TestSchema, TestSchema) { + Schema empty_schema; + ASSERT_GT(empty_schema.memory_footprint_excluding_this(), 0); + + ColumnSchema col1("key", STRING); + ColumnSchema col2("uint32val", UINT32, true); + ColumnSchema col3("int32val", INT32); + + vector cols = { col1, col2, col3 }; + Schema schema(cols, 1); + + ASSERT_EQ(sizeof(Slice) + sizeof(uint32_t) + sizeof(int32_t), + schema.byte_size()); + ASSERT_EQ(3, schema.num_columns()); + ASSERT_EQ(0, schema.column_offset(0)); + ASSERT_EQ(sizeof(Slice), schema.column_offset(1)); + ASSERT_GT(schema.memory_footprint_excluding_this(), + empty_schema.memory_footprint_excluding_this()); + + EXPECT_EQ("Schema [\n" + "\tkey[string NOT NULL],\n" + "\tuint32val[uint32 NULLABLE],\n" + "\tint32val[int32 NOT NULL]\n" + "]", + schema.ToString()); + EXPECT_EQ("key[string NOT NULL]", schema.column(0).ToString()); + EXPECT_EQ("uint32 NULLABLE", schema.column(1).TypeToString()); +} + 
+TEST(TestSchema, TestSwap) { + Schema schema1({ ColumnSchema("col1", STRING), + ColumnSchema("col2", STRING), + ColumnSchema("col3", UINT32) }, + 2); + Schema schema2({ ColumnSchema("col3", UINT32), + ColumnSchema("col2", STRING) }, + 1); + schema1.swap(schema2); + ASSERT_EQ(2, schema1.num_columns()); + ASSERT_EQ(1, schema1.num_key_columns()); + ASSERT_EQ(3, schema2.num_columns()); + ASSERT_EQ(2, schema2.num_key_columns()); +} + +TEST(TestSchema, TestReset) { + Schema schema; + ASSERT_FALSE(schema.initialized()); + + ASSERT_OK(schema.Reset({ ColumnSchema("col3", UINT32), + ColumnSchema("col2", STRING) }, + 1)); + ASSERT_TRUE(schema.initialized()); + + // Swap the initialized schema with an uninitialized one. + Schema schema2; + schema2.swap(schema); + ASSERT_FALSE(schema.initialized()); + ASSERT_TRUE(schema2.initialized()); +} + +// Test for KUDU-943, a bug where we suspected that Variant didn't behave +// correctly with empty strings. +TEST(TestSchema, TestEmptyVariant) { + Slice empty_val(""); + Slice nonempty_val("test"); + + Variant v(STRING, &nonempty_val); + ASSERT_EQ("test", (static_cast(v.value()))->ToString()); + v.Reset(STRING, &empty_val); + ASSERT_EQ("", (static_cast(v.value()))->ToString()); + v.Reset(STRING, &nonempty_val); + ASSERT_EQ("test", (static_cast(v.value()))->ToString()); +} + +TEST(TestSchema, TestProjectSubset) { + Schema schema1({ ColumnSchema("col1", STRING), + ColumnSchema("col2", STRING), + ColumnSchema("col3", UINT32) }, + 1); + + Schema schema2({ ColumnSchema("col3", UINT32), + ColumnSchema("col2", STRING) }, + 0); + + RowProjector row_projector(&schema1, &schema2); + ASSERT_OK(row_projector.Init()); + + // Verify the mapping + ASSERT_EQ(2, row_projector.base_cols_mapping().size()); + ASSERT_EQ(0, row_projector.adapter_cols_mapping().size()); + ASSERT_EQ(0, row_projector.projection_defaults().size()); + + const vector& mapping = row_projector.base_cols_mapping(); + ASSERT_EQ(mapping[0].first, 0); // col3 schema2 + 
ASSERT_EQ(mapping[0].second, 2); // col3 schema1 + ASSERT_EQ(mapping[1].first, 1); // col2 schema2 + ASSERT_EQ(mapping[1].second, 1); // col2 schema1 +} + +// Test projection when the type of the projected column +// doesn't match the original type. +TEST(TestSchema, TestProjectTypeMismatch) { + Schema schema1({ ColumnSchema("key", STRING), + ColumnSchema("val", UINT32) }, + 1); + Schema schema2({ ColumnSchema("val", STRING) }, 0); + + RowProjector row_projector(&schema1, &schema2); + Status s = row_projector.Init(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.message().ToString(), "must have type"); +} + +// Test projection when the some columns in the projection +// are not present in the base schema +TEST(TestSchema, TestProjectMissingColumn) { + Schema schema1({ ColumnSchema("key", STRING), ColumnSchema("val", UINT32) }, 1); + Schema schema2({ ColumnSchema("val", UINT32), ColumnSchema("non_present", STRING) }, 0); + Schema schema3({ ColumnSchema("val", UINT32), ColumnSchema("non_present", UINT32, true) }, 0); + uint32_t default_value = 15; + Schema schema4({ ColumnSchema("val", UINT32), + ColumnSchema("non_present", UINT32, false, &default_value) }, + 0); + + RowProjector row_projector(&schema1, &schema2); + Status s = row_projector.Init(); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.message().ToString(), + "does not exist in the projection, and it does not have a default value or a nullable type"); + + // Verify Default nullable column with no default value + ASSERT_OK(row_projector.Reset(&schema1, &schema3)); + + ASSERT_EQ(1, row_projector.base_cols_mapping().size()); + ASSERT_EQ(0, row_projector.adapter_cols_mapping().size()); + ASSERT_EQ(1, row_projector.projection_defaults().size()); + + ASSERT_EQ(row_projector.base_cols_mapping()[0].first, 0); // val schema2 + ASSERT_EQ(row_projector.base_cols_mapping()[0].second, 1); // val schema1 + ASSERT_EQ(row_projector.projection_defaults()[0], 1); // non_present schema3 + + // 
Verify Default non nullable column with default value + ASSERT_OK(row_projector.Reset(&schema1, &schema4)); + + ASSERT_EQ(1, row_projector.base_cols_mapping().size()); + ASSERT_EQ(0, row_projector.adapter_cols_mapping().size()); + ASSERT_EQ(1, row_projector.projection_defaults().size()); + + ASSERT_EQ(row_projector.base_cols_mapping()[0].first, 0); // val schema4 + ASSERT_EQ(row_projector.base_cols_mapping()[0].second, 1); // val schema1 + ASSERT_EQ(row_projector.projection_defaults()[0], 1); // non_present schema4 +} + +// Test projection mapping using IDs. +// This simulate a column rename ('val' -> 'val_renamed') +// and a new column added ('non_present') +TEST(TestSchema, TestProjectRename) { + SchemaBuilder builder; + ASSERT_OK(builder.AddKeyColumn("key", STRING)); + ASSERT_OK(builder.AddColumn("val", UINT32)); + Schema schema1 = builder.Build(); + + builder.Reset(schema1); + ASSERT_OK(builder.AddNullableColumn("non_present", UINT32)); + ASSERT_OK(builder.RenameColumn("val", "val_renamed")); + Schema schema2 = builder.Build(); + + RowProjector row_projector(&schema1, &schema2); + ASSERT_OK(row_projector.Init()); + + ASSERT_EQ(2, row_projector.base_cols_mapping().size()); + ASSERT_EQ(0, row_projector.adapter_cols_mapping().size()); + ASSERT_EQ(1, row_projector.projection_defaults().size()); + + ASSERT_EQ(row_projector.base_cols_mapping()[0].first, 0); // key schema2 + ASSERT_EQ(row_projector.base_cols_mapping()[0].second, 0); // key schema1 + + ASSERT_EQ(row_projector.base_cols_mapping()[1].first, 1); // val_renamed schema2 + ASSERT_EQ(row_projector.base_cols_mapping()[1].second, 1); // val schema1 + + ASSERT_EQ(row_projector.projection_defaults()[0], 2); // non_present schema2 +} + + +// Test that the schema can be used to compare and stringify rows. 
+TEST(TestSchema, TestRowOperations) { + Schema schema({ ColumnSchema("col1", STRING), + ColumnSchema("col2", STRING), + ColumnSchema("col3", UINT32), + ColumnSchema("col4", INT32) }, + 1); + + Arena arena(1024, 256*1024); + + RowBuilder rb(schema); + rb.AddString(string("row_a_1")); + rb.AddString(string("row_a_2")); + rb.AddUint32(3); + rb.AddInt32(-3); + ContiguousRow row_a(&schema); + ASSERT_OK(CopyRowToArena(rb.data(), schema, &arena, &row_a)); + + rb.Reset(); + rb.AddString(string("row_b_1")); + rb.AddString(string("row_b_2")); + rb.AddUint32(3); + rb.AddInt32(-3); + ContiguousRow row_b(&schema); + ASSERT_OK(CopyRowToArena(rb.data(), schema, &arena, &row_b)); + + ASSERT_GT(schema.Compare(row_b, row_a), 0); + ASSERT_LT(schema.Compare(row_a, row_b), 0); + + ASSERT_EQ(string("(string col1=row_a_1, string col2=row_a_2, uint32 col3=3, int32 col4=-3)"), + schema.DebugRow(row_a)); +} + +TEST(TestKeyEncoder, TestKeyEncoder) { + faststring fs; + const KeyEncoder& encoder = GetKeyEncoder(GetTypeInfo(STRING)); + + typedef std::tuple, Slice> test_pair; + + vector pairs; + + // Simple key + pairs.push_back(test_pair({ Slice("foo", 3) }, Slice("foo", 3))); + + // Simple compound key + pairs.push_back(test_pair({ Slice("foo", 3), Slice("bar", 3) }, + Slice("foo" "\x00\x00" "bar", 8))); + + // Compound key with a \x00 in it + pairs.push_back(test_pair({ Slice("xxx\x00yyy", 7), Slice("bar", 3) }, + Slice("xxx" "\x00\x01" "yyy" "\x00\x00" "bar", 13))); + + int i = 0; + for (const test_pair &t : pairs) { + const vector &in = std::get<0>(t); + Slice expected = std::get<1>(t); + + fs.clear(); + for (int col = 0; col < in.size(); col++) { + encoder.Encode(&in[col], col == in.size() - 1, &fs); + } + + ASSERT_EQ(0, expected.compare(Slice(fs))) + << "Failed encoding example " << i << ".\n" + << "Expected: " << HexDump(expected) << "\n" + << "Got: " << HexDump(Slice(fs)); + i++; + } +} + +TEST(TestSchema, TestDecodeKeys_CompoundStringKey) { + Schema schema({ ColumnSchema("col1", 
STRING), + ColumnSchema("col2", STRING), + ColumnSchema("col3", STRING) }, + 2); + + EXPECT_EQ("(string col1=foo, string col2=bar)", + schema.DebugEncodedRowKey(Slice("foo\0\0bar", 8), Schema::START_KEY)); + EXPECT_EQ("(string col1=fo\\000o, string col2=bar)", + schema.DebugEncodedRowKey(Slice("fo\x00\x01o\0\0""bar", 10), Schema::START_KEY)); + EXPECT_EQ("(string col1=fo\\000o, string col2=bar\\000xy)", + schema.DebugEncodedRowKey(Slice("fo\x00\x01o\0\0""bar\0xy", 13), Schema::START_KEY)); + + EXPECT_EQ("", + schema.DebugEncodedRowKey("", Schema::START_KEY)); + EXPECT_EQ("", + schema.DebugEncodedRowKey("", Schema::END_KEY)); +} + +// Test that appropriate statuses are returned when trying to decode an invalid +// encoded key. +TEST(TestSchema, TestDecodeKeys_InvalidKeys) { + Schema schema({ ColumnSchema("col1", STRING), + ColumnSchema("col2", UINT32), + ColumnSchema("col3", STRING) }, + 2); + + EXPECT_EQ("", + schema.DebugEncodedRowKey(Slice("foo"), Schema::START_KEY)); + EXPECT_EQ("", + schema.DebugEncodedRowKey(Slice("foo\x00\x00", 5), Schema::START_KEY)); + EXPECT_EQ("", + schema.DebugEncodedRowKey(Slice("foo\x00\x00\xff\xff", 7), Schema::START_KEY)); +} + +TEST(TestSchema, TestCreateProjection) { + Schema schema({ ColumnSchema("col1", STRING), + ColumnSchema("col2", STRING), + ColumnSchema("col3", STRING), + ColumnSchema("col4", STRING), + ColumnSchema("col5", STRING) }, + 2); + Schema schema_with_ids = SchemaBuilder(schema).Build(); + Schema partial_schema; + + // By names, without IDs + ASSERT_OK(schema.CreateProjectionByNames({ "col1", "col2", "col4" }, &partial_schema)); + EXPECT_EQ("Schema [\n" + "\tcol1[string NOT NULL],\n" + "\tcol2[string NOT NULL],\n" + "\tcol4[string NOT NULL]\n" + "]", + partial_schema.ToString()); + + // By names, with IDS + ASSERT_OK(schema_with_ids.CreateProjectionByNames({ "col1", "col2", "col4" }, &partial_schema)); + EXPECT_EQ(Substitute("Schema [\n" + "\t$0:col1[string NOT NULL],\n" + "\t$1:col2[string NOT NULL],\n" + 
"\t$2:col4[string NOT NULL]\n" + "]", + schema_with_ids.column_id(0), + schema_with_ids.column_id(1), + schema_with_ids.column_id(3)), + partial_schema.ToString()); + + // By names, with missing names. + Status s = schema.CreateProjectionByNames({ "foobar" }, &partial_schema); + EXPECT_EQ("Not found: column not found: foobar", s.ToString()); + + // By IDs + ASSERT_OK(schema_with_ids.CreateProjectionByIdsIgnoreMissing({ schema_with_ids.column_id(0), + schema_with_ids.column_id(1), + ColumnId(1000), // missing column + schema_with_ids.column_id(3) }, + &partial_schema)); + EXPECT_EQ(Substitute("Schema [\n" + "\t$0:col1[string NOT NULL],\n" + "\t$1:col2[string NOT NULL],\n" + "\t$2:col4[string NOT NULL]\n" + "]", + schema_with_ids.column_id(0), + schema_with_ids.column_id(1), + schema_with_ids.column_id(3)), + partial_schema.ToString()); +} + +#ifdef NDEBUG +TEST(TestKeyEncoder, BenchmarkSimpleKey) { + faststring fs; + Schema schema({ ColumnSchema("col1", STRING) }, 1); + + RowBuilder rb(schema); + rb.AddString(Slice("hello world")); + ConstContiguousRow row(&rb.schema(), rb.data()); + + LOG_TIMING(INFO, "Encoding") { + for (int i = 0; i < 10000000; i++) { + schema.EncodeComparableKey(row, &fs); + } + } +} +#endif + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/common/schema.cc b/src/kudu/common/schema.cc new file mode 100644 index 000000000000..bd830886c3ce --- /dev/null +++ b/src/kudu/common/schema.cc @@ -0,0 +1,487 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/schema.h" + +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/util/malloc.h" +#include "kudu/util/status.h" +#include "kudu/common/row.h" + +namespace kudu { + +using std::set; +using std::unordered_map; +using std::unordered_set; + +// In a new schema, we typically would start assigning column IDs at 0. However, this +// makes it likely that in many test cases, the column IDs and the column indexes are +// equal to each other, and it's easy to accidentally pass an index where we meant to pass +// an ID, without having any issues. So, in DEBUG builds, we start assigning columns at ID +// 10, ensuring that if we accidentally mix up IDs and indexes, we're likely to fire an +// assertion or bad memory access. +#ifdef NDEBUG +static const ColumnId kFirstColumnId(0); +#else +static const ColumnId kFirstColumnId(10); +#endif + +string ColumnStorageAttributes::ToString() const { + return strings::Substitute("encoding=$0, compression=$1, cfile_block_size=$2", + EncodingType_Name(encoding), + CompressionType_Name(compression), + cfile_block_size); +} + +// TODO: include attributes_.ToString() -- need to fix unit tests +// first +string ColumnSchema::ToString() const { + return strings::Substitute("$0[$1]", + name_, + TypeToString()); +} + +string ColumnSchema::TypeToString() const { + return strings::Substitute("$0 $1", + type_info_->name(), + is_nullable_ ? 
"NULLABLE" : "NOT NULL"); +} + +size_t ColumnSchema::memory_footprint_excluding_this() const { + // Rough approximation. + return name_.capacity(); +} + +size_t ColumnSchema::memory_footprint_including_this() const { + return kudu_malloc_usable_size(this) + memory_footprint_excluding_this(); +} + +Schema::Schema(const Schema& other) + : name_to_index_bytes_(0), + // TODO: C++11 provides a single-arg constructor + name_to_index_(10, + NameToIndexMap::hasher(), + NameToIndexMap::key_equal(), + NameToIndexMapAllocator(&name_to_index_bytes_)) { + CopyFrom(other); +} + +Schema& Schema::operator=(const Schema& other) { + if (&other != this) { + CopyFrom(other); + } + return *this; +} + +void Schema::CopyFrom(const Schema& other) { + num_key_columns_ = other.num_key_columns_; + cols_ = other.cols_; + col_ids_ = other.col_ids_; + col_offsets_ = other.col_offsets_; + id_to_index_ = other.id_to_index_; + + // We can't simply copy name_to_index_ since the StringPiece keys + // reference the other Schema's ColumnSchema objects. + name_to_index_.clear(); + int i = 0; + for (const ColumnSchema &col : cols_) { + // The map uses the 'name' string from within the ColumnSchema object. 
+ name_to_index_[col.name()] = i++; + } + + has_nullables_ = other.has_nullables_; +} + +void Schema::swap(Schema& other) { + std::swap(num_key_columns_, other.num_key_columns_); + cols_.swap(other.cols_); + col_ids_.swap(other.col_ids_); + col_offsets_.swap(other.col_offsets_); + name_to_index_.swap(other.name_to_index_); + id_to_index_.swap(other.id_to_index_); + std::swap(has_nullables_, other.has_nullables_); +} + +Status Schema::Reset(const vector& cols, + const vector& ids, + int key_columns) { + cols_ = cols; + num_key_columns_ = key_columns; + + if (PREDICT_FALSE(key_columns > cols_.size())) { + return Status::InvalidArgument( + "Bad schema", "More key columns than columns"); + } + + if (PREDICT_FALSE(key_columns < 0)) { + return Status::InvalidArgument( + "Bad schema", "Cannot specify a negative number of key columns"); + } + + if (PREDICT_FALSE(!ids.empty() && ids.size() != cols_.size())) { + return Status::InvalidArgument("Bad schema", + "The number of ids does not match with the number of columns"); + } + + // Verify that the key columns are not nullable + for (int i = 0; i < key_columns; ++i) { + if (PREDICT_FALSE(cols_[i].is_nullable())) { + return Status::InvalidArgument( + "Bad schema", strings::Substitute("Nullable key columns are not " + "supported: $0", cols_[i].name())); + } + } + + // Calculate the offset of each column in the row format. + col_offsets_.reserve(cols_.size() + 1); // Include space for total byte size at the end. + size_t off = 0; + size_t i = 0; + name_to_index_.clear(); + for (const ColumnSchema &col : cols_) { + // The map uses the 'name' string from within the ColumnSchema object. 
+ if (!InsertIfNotPresent(&name_to_index_, col.name(), i++)) { + return Status::InvalidArgument("Duplicate column name", col.name()); + } + + col_offsets_.push_back(off); + off += col.type_info()->size(); + } + + // Add an extra element on the end for the total + // byte size + col_offsets_.push_back(off); + + // Initialize IDs mapping + col_ids_ = ids; + id_to_index_.clear(); + max_col_id_ = 0; + for (int i = 0; i < ids.size(); ++i) { + if (ids[i] > max_col_id_) { + max_col_id_ = ids[i]; + } + id_to_index_.set(ids[i], i); + } + + // Determine whether any column is nullable + has_nullables_ = false; + for (const ColumnSchema& col : cols_) { + if (col.is_nullable()) { + has_nullables_ = true; + break; + } + } + + return Status::OK(); +} + +Status Schema::CreateProjectionByNames(const std::vector& col_names, + Schema* out) const { + vector ids; + vector cols; + for (const StringPiece& name : col_names) { + int idx = find_column(name); + if (idx == -1) { + return Status::NotFound("column not found", name); + } + if (has_column_ids()) { + ids.push_back(column_id(idx)); + } + cols.push_back(column(idx)); + } + return out->Reset(cols, ids, 0); +} + +Status Schema::CreateProjectionByIdsIgnoreMissing(const std::vector& col_ids, + Schema* out) const { + vector cols; + vector filtered_col_ids; + for (ColumnId id : col_ids) { + int idx = find_column_by_id(id); + if (idx == -1) { + continue; + } + cols.push_back(column(idx)); + filtered_col_ids.push_back(id); + } + return out->Reset(cols, filtered_col_ids, 0); +} + +Schema Schema::CopyWithColumnIds() const { + CHECK(!has_column_ids()); + vector ids; + for (int32_t i = 0; i < num_columns(); i++) { + ids.push_back(ColumnId(kFirstColumnId + i)); + } + return Schema(cols_, ids, num_key_columns_); +} + +Schema Schema::CopyWithoutColumnIds() const { + CHECK(has_column_ids()); + return Schema(cols_, num_key_columns_); +} + +Status Schema::VerifyProjectionCompatibility(const Schema& projection) const { + DCHECK(has_column_ids()) "The 
server schema must have IDs"; + + if (projection.has_column_ids()) { + return Status::InvalidArgument("User requests should not have Column IDs"); + } + + vector missing_columns; + for (const ColumnSchema& pcol : projection.columns()) { + int index = find_column(pcol.name()); + if (index < 0) { + missing_columns.push_back(pcol.name()); + } else if (!pcol.EqualsType(cols_[index])) { + // TODO: We don't support query with type adaptors yet + return Status::InvalidArgument("The column '" + pcol.name() + "' must have type " + + cols_[index].TypeToString() + " found " + pcol.TypeToString()); + } + } + + if (!missing_columns.empty()) { + return Status::InvalidArgument("Some columns are not present in the current schema", + JoinStrings(missing_columns, ", ")); + } + return Status::OK(); +} + + +Status Schema::GetMappedReadProjection(const Schema& projection, + Schema *mapped_projection) const { + // - The user projection may have different columns from the ones on the tablet + // - User columns non present in the tablet are considered errors + // - The user projection is not supposed to have the defaults or the nullable + // information on each field. The current tablet schema is supposed to. + // - Each CFileSet may have a different schema and each CFileSet::Iterator + // must use projection from the CFileSet schema to the mapped user schema. 
+ RETURN_NOT_OK(VerifyProjectionCompatibility(projection)); + + // Get the Projection Mapping + vector mapped_cols; + vector mapped_ids; + + mapped_cols.reserve(projection.num_columns()); + mapped_ids.reserve(projection.num_columns()); + + for (const ColumnSchema& col : projection.columns()) { + int index = find_column(col.name()); + DCHECK_GE(index, 0) << col.name(); + mapped_cols.push_back(cols_[index]); + mapped_ids.push_back(col_ids_[index]); + } + + CHECK_OK(mapped_projection->Reset(mapped_cols, mapped_ids, projection.num_key_columns())); + return Status::OK(); +} + +string Schema::ToString() const { + vector col_strs; + if (has_column_ids()) { + for (int i = 0; i < cols_.size(); ++i) { + col_strs.push_back(strings::Substitute("$0:$1", col_ids_[i], cols_[i].ToString())); + } + } else { + for (const ColumnSchema &col : cols_) { + col_strs.push_back(col.ToString()); + } + } + + return StrCat("Schema [\n\t", + JoinStrings(col_strs, ",\n\t"), + "\n]"); +} + +Status Schema::DecodeRowKey(Slice encoded_key, + uint8_t* buffer, + Arena* arena) const { + ContiguousRow row(this, buffer); + + for (size_t col_idx = 0; col_idx < num_key_columns(); ++col_idx) { + const ColumnSchema& col = column(col_idx); + const KeyEncoder& key_encoder = GetKeyEncoder(col.type_info()); + bool is_last = col_idx == (num_key_columns() - 1); + RETURN_NOT_OK_PREPEND(key_encoder.Decode(&encoded_key, + is_last, + arena, + row.mutable_cell_ptr(col_idx)), + strings::Substitute("Error decoding composite key component '$0'", + col.name())); + } + return Status::OK(); +} + +string Schema::DebugEncodedRowKey(Slice encoded_key, StartOrEnd start_or_end) const { + if (encoded_key.empty()) { + switch (start_or_end) { + case START_KEY: return ""; + case END_KEY: return ""; + } + } + + Arena arena(1024, 128 * 1024); + uint8_t* buf = reinterpret_cast(arena.AllocateBytes(key_byte_size())); + Status s = DecodeRowKey(encoded_key, buf, &arena); + if (!s.ok()) { + return ""; + } + ConstContiguousRow row(this, buf); 
+ return DebugRowKey(row); +} + +size_t Schema::memory_footprint_excluding_this() const { + size_t size = 0; + for (const ColumnSchema& col : cols_) { + size += col.memory_footprint_excluding_this(); + } + + if (cols_.capacity() > 0) { + size += kudu_malloc_usable_size(cols_.data()); + } + if (col_ids_.capacity() > 0) { + size += kudu_malloc_usable_size(col_ids_.data()); + } + if (col_offsets_.capacity() > 0) { + size += kudu_malloc_usable_size(col_offsets_.data()); + } + size += name_to_index_bytes_; + size += id_to_index_.memory_footprint_excluding_this(); + + return size; +} + +size_t Schema::memory_footprint_including_this() const { + return kudu_malloc_usable_size(this) + memory_footprint_excluding_this(); +} + +// ============================================================================ +// Schema Builder +// ============================================================================ +void SchemaBuilder::Reset() { + cols_.clear(); + col_ids_.clear(); + col_names_.clear(); + num_key_columns_ = 0; + next_id_ = kFirstColumnId; +} + +void SchemaBuilder::Reset(const Schema& schema) { + cols_ = schema.cols_; + col_ids_ = schema.col_ids_; + num_key_columns_ = schema.num_key_columns_; + for (const auto& column : cols_) { + col_names_.insert(column.name()); + } + + if (col_ids_.empty()) { + for (int32_t i = 0; i < cols_.size(); ++i) { + col_ids_.push_back(ColumnId(kFirstColumnId + i)); + } + } + if (col_ids_.empty()) { + next_id_ = kFirstColumnId; + } else { + next_id_ = *std::max_element(col_ids_.begin(), col_ids_.end()) + 1; + } +} + +Status SchemaBuilder::AddKeyColumn(const string& name, DataType type) { + return AddColumn(ColumnSchema(name, type), true); +} + +Status SchemaBuilder::AddColumn(const string& name, + DataType type, + bool is_nullable, + const void *read_default, + const void *write_default) { + return AddColumn(ColumnSchema(name, type, is_nullable, read_default, write_default), false); +} + +Status SchemaBuilder::RemoveColumn(const string& name) { 
+ unordered_set::const_iterator it_names; + if ((it_names = col_names_.find(name)) == col_names_.end()) { + return Status::NotFound("The specified column does not exist", name); + } + + col_names_.erase(it_names); + for (int i = 0; i < cols_.size(); ++i) { + if (name == cols_[i].name()) { + cols_.erase(cols_.begin() + i); + col_ids_.erase(col_ids_.begin() + i); + if (i < num_key_columns_) { + num_key_columns_--; + } + return Status::OK(); + } + } + + LOG(FATAL) << "Should not reach here"; + return Status::Corruption("Unable to remove existing column"); +} + +Status SchemaBuilder::RenameColumn(const string& old_name, const string& new_name) { + unordered_set::const_iterator it_names; + + // check if 'new_name' is already in use + if ((it_names = col_names_.find(new_name)) != col_names_.end()) { + return Status::AlreadyPresent("The column already exists", new_name); + } + + // check if the 'old_name' column exists + if ((it_names = col_names_.find(old_name)) == col_names_.end()) { + return Status::NotFound("The specified column does not exist", old_name); + } + + col_names_.erase(it_names); // TODO: Should this one stay and marked as alias? 
+ col_names_.insert(new_name); + + for (ColumnSchema& col_schema : cols_) { + if (old_name == col_schema.name()) { + col_schema.set_name(new_name); + return Status::OK(); + } + } + + LOG(FATAL) << "Should not reach here"; + return Status::IllegalState("Unable to rename existing column"); +} + +Status SchemaBuilder::AddColumn(const ColumnSchema& column, bool is_key) { + if (ContainsKey(col_names_, column.name())) { + return Status::AlreadyPresent("The column already exists", column.name()); + } + + col_names_.insert(column.name()); + if (is_key) { + cols_.insert(cols_.begin() + num_key_columns_, column); + col_ids_.insert(col_ids_.begin() + num_key_columns_, next_id_); + num_key_columns_++; + } else { + cols_.push_back(column); + col_ids_.push_back(next_id_); + } + + next_id_ = ColumnId(next_id_ + 1); + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/common/schema.h b/src/kudu/common/schema.h new file mode 100644 index 000000000000..0c16c94ebc1f --- /dev/null +++ b/src/kudu/common/schema.h @@ -0,0 +1,864 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_COMMON_SCHEMA_H +#define KUDU_COMMON_SCHEMA_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/id_mapping.h" +#include "kudu/common/key_encoder.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" + +// Check that two schemas are equal, yielding a useful error message in the case that +// they are not. +#define DCHECK_SCHEMA_EQ(s1, s2) \ + do { \ + DCHECK((s1).Equals((s2))) << "Schema " << (s1).ToString() \ + << " does not match " << (s2).ToString(); \ + } while (0); + +#define DCHECK_KEY_PROJECTION_SCHEMA_EQ(s1, s2) \ + do { \ + DCHECK((s1).KeyEquals((s2))) << "Key-Projection Schema " \ + << (s1).ToString() << " does not match " \ + << (s2).ToString(); \ + } while (0); + +namespace kudu { + +using std::vector; +using std::unordered_map; +using std::unordered_set; + +// The ID of a column. Each column in a table has a unique ID. +struct ColumnId { + explicit ColumnId(int32_t t_) : t(t_) {} + ColumnId() : t() {} + ColumnId(const ColumnId& t_) : t(t_.t) {} + ColumnId& operator=(const ColumnId& rhs) { t = rhs.t; return *this; } + ColumnId& operator=(const int32_t& rhs) { t = rhs; return *this; } + operator const int32_t() const { return t; } + operator const strings::internal::SubstituteArg() const { return t; } + operator const AlphaNum() const { return t; } + bool operator==(const ColumnId & rhs) const { return t == rhs.t; } + bool operator<(const ColumnId & rhs) const { return t < rhs.t; } + friend std::ostream& operator<<(std::ostream& os, ColumnId column_id) { + return os << column_id.t; + } + private: + int32_t t; +}; + +// Class for storing column attributes such as compression and +// encoding. Column attributes describe the physical storage and +// representation of bytes, as opposed to a purely logical description +// normally associated with the term Schema. 
+// +// Column attributes are presently specified in the ColumnSchema +// protobuf message, but this should ideally be separate. +struct ColumnStorageAttributes { + public: + ColumnStorageAttributes() + : encoding(AUTO_ENCODING), + compression(DEFAULT_COMPRESSION), + cfile_block_size(0) { + } + + string ToString() const; + + EncodingType encoding; + CompressionType compression; + + // The preferred block size for cfile blocks. If 0, uses the + // server-wide default. + int32_t cfile_block_size; +}; + +// The schema for a given column. +// +// Holds the data type as well as information about nullability & column name. +// In the future, it may hold information about annotations, etc. +class ColumnSchema { + public: + // name: column name + // type: column type (e.g. UINT8, INT32, STRING, ...) + // is_nullable: true if a row value can be null + // read_default: default value used on read if the column was not present before alter. + // The value will be copied and released on ColumnSchema destruction. + // write_default: default value added to the row if the column value was + // not specified on insert. + // The value will be copied and released on ColumnSchema destruction. + // + // Example: + // ColumnSchema col_a("a", UINT32) + // ColumnSchema col_b("b", STRING, true); + // uint32_t default_i32 = -15; + // ColumnSchema col_c("c", INT32, false, &default_i32); + // Slice default_str("Hello"); + // ColumnSchema col_d("d", STRING, false, &default_str); + ColumnSchema(string name, DataType type, bool is_nullable = false, + const void* read_default = NULL, + const void* write_default = NULL, + ColumnStorageAttributes attributes = ColumnStorageAttributes()) + : name_(std::move(name)), + type_info_(GetTypeInfo(type)), + is_nullable_(is_nullable), + read_default_(read_default ? 
new Variant(type, read_default) : NULL), + attributes_(std::move(attributes)) { + if (write_default == read_default) { + write_default_ = read_default_; + } else if (write_default != NULL) { + DCHECK(read_default != NULL) << "Must have a read default"; + write_default_.reset(new Variant(type, write_default)); + } + } + + const TypeInfo* type_info() const { + return type_info_; + } + + bool is_nullable() const { + return is_nullable_; + } + + const string &name() const { + return name_; + } + + // Return a string identifying this column, including its + // name. + string ToString() const; + + // Same as above, but only including the type information. + // For example, "STRING NOT NULL". + string TypeToString() const; + + // Returns true if the column has a read default value + bool has_read_default() const { + return read_default_ != NULL; + } + + // Returns a pointer the default value associated with the column + // or NULL if there is no default value. You may check has_read_default() first. + // The returned value will be valid until the ColumnSchema will be destroyed. + // + // Example: + // const uint32_t *vu32 = static_cast(col_schema.read_default_value()); + // const Slice *vstr = static_cast(col_schema.read_default_value()); + const void *read_default_value() const { + if (read_default_ != NULL) { + return read_default_->value(); + } + return NULL; + } + + // Returns true if the column has a write default value + bool has_write_default() const { + return write_default_ != NULL; + } + + // Returns a pointer the default value associated with the column + // or NULL if there is no default value. You may check has_write_default() first. + // The returned value will be valid until the ColumnSchema will be destroyed. 
+ // + // Example: + // const uint32_t *vu32 = static_cast(col_schema.write_default_value()); + // const Slice *vstr = static_cast(col_schema.write_default_value()); + const void *write_default_value() const { + if (write_default_ != NULL) { + return write_default_->value(); + } + return NULL; + } + + bool EqualsType(const ColumnSchema &other) const { + return is_nullable_ == other.is_nullable_ && + type_info()->type() == other.type_info()->type(); + } + + bool Equals(const ColumnSchema &other, bool check_defaults) const { + if (!EqualsType(other) || this->name_ != other.name_) + return false; + + // For Key comparison checking the defauls doesn't make sense, + // since we don't support them, for server vs user schema this comparison + // will always fail, since the user does not specify the defaults. + if (check_defaults) { + if (read_default_ == NULL && other.read_default_ != NULL) + return false; + + if (write_default_ == NULL && other.write_default_ != NULL) + return false; + + if (read_default_ != NULL && !read_default_->Equals(other.read_default_.get())) + return false; + + if (write_default_ != NULL && !write_default_->Equals(other.write_default_.get())) + return false; + } + return true; + } + + // Returns extended attributes (such as encoding, compression, etc...) + // associated with the column schema. The reason they are kept in a separate + // struct is so that in the future, they may be moved out to a more + // appropriate location as opposed to parts of ColumnSchema. + const ColumnStorageAttributes& attributes() const { + return attributes_; + } + + int Compare(const void *lhs, const void *rhs) const { + return type_info_->Compare(lhs, rhs); + } + + // Stringify the given cell. This just stringifies the cell contents, + // and doesn't include the column name or type. + string Stringify(const void *cell) const { + string ret; + type_info_->AppendDebugStringForValue(cell, &ret); + return ret; + } + + // Append a debug string for this cell. 
This differs from Stringify above + // in that it also includes the column info, for example 'STRING foo=bar'. + template + void DebugCellAppend(const CellType& cell, std::string* ret) const { + ret->append(type_info_->name()); + ret->append(" "); + ret->append(name_); + ret->append("="); + if (is_nullable_ && cell.is_null()) { + ret->append("NULL"); + } else { + type_info_->AppendDebugStringForValue(cell.ptr(), ret); + } + } + + // Returns the memory usage of this object without the object itself. Should + // be used when embedded inside another object. + size_t memory_footprint_excluding_this() const; + + // Returns the memory usage of this object including the object itself. + // Should be used when allocated on the heap. + size_t memory_footprint_including_this() const; + + private: + friend class SchemaBuilder; + + void set_name(const string& name) { + name_ = name; + } + + string name_; + const TypeInfo *type_info_; + bool is_nullable_; + // use shared_ptr since the ColumnSchema is always copied around. + std::shared_ptr read_default_; + std::shared_ptr write_default_; + ColumnStorageAttributes attributes_; +}; + +class ContiguousRow; + +// The schema for a set of rows. +// +// A Schema is simply a set of columns, along with information about +// which prefix of columns makes up the primary key. +// +// Note that, while Schema is copyable and assignable, it is a complex +// object that is not inexpensive to copy. You should generally prefer +// passing by pointer or reference, and functions that create new +// Schemas should generally prefer taking a Schema pointer and using +// Schema::swap() or Schema::Reset() rather than returning by value. 
+class Schema { + public: + + static const int kColumnNotFound = -1; + + Schema() + : num_key_columns_(0), + name_to_index_bytes_(0), + // TODO: C++11 provides a single-arg constructor + name_to_index_(10, + NameToIndexMap::hasher(), + NameToIndexMap::key_equal(), + NameToIndexMapAllocator(&name_to_index_bytes_)), + has_nullables_(false) { + } + + Schema(const Schema& other); + Schema& operator=(const Schema& other); + + void swap(Schema& other); // NOLINT(build/include_what_you_use) + + void CopyFrom(const Schema& other); + + // Construct a schema with the given information. + // + // NOTE: if the schema is user-provided, it's better to construct an + // empty schema and then use Reset(...) so that errors can be + // caught. If an invalid schema is passed to this constructor, an + // assertion will be fired! + Schema(const vector& cols, + int key_columns) + : name_to_index_bytes_(0), + // TODO: C++11 provides a single-arg constructor + name_to_index_(10, + NameToIndexMap::hasher(), + NameToIndexMap::key_equal(), + NameToIndexMapAllocator(&name_to_index_bytes_)) { + CHECK_OK(Reset(cols, key_columns)); + } + + // Construct a schema with the given information. + // + // NOTE: if the schema is user-provided, it's better to construct an + // empty schema and then use Reset(...) so that errors can be + // caught. If an invalid schema is passed to this constructor, an + // assertion will be fired! + Schema(const vector& cols, + const vector& ids, + int key_columns) + : name_to_index_bytes_(0), + // TODO: C++11 provides a single-arg constructor + name_to_index_(10, + NameToIndexMap::hasher(), + NameToIndexMap::key_equal(), + NameToIndexMapAllocator(&name_to_index_bytes_)) { + CHECK_OK(Reset(cols, ids, key_columns)); + } + + // Reset this Schema object to the given schema. + // If this fails, the Schema object is left in an inconsistent + // state and may not be used. 
+ Status Reset(const vector& cols, int key_columns) { + std::vector ids; + return Reset(cols, ids, key_columns); + } + + // Reset this Schema object to the given schema. + // If this fails, the Schema object is left in an inconsistent + // state and may not be used. + Status Reset(const vector& cols, + const vector& ids, + int key_columns); + + // Return the number of bytes needed to represent a single row of this schema. + // + // This size does not include any indirected (variable length) data (eg strings) + size_t byte_size() const { + DCHECK(initialized()); + return col_offsets_.back(); + } + + // Return the number of bytes needed to represent + // only the key portion of this schema. + size_t key_byte_size() const { + return col_offsets_[num_key_columns_]; + } + + // Return the number of columns in this schema + size_t num_columns() const { + return cols_.size(); + } + + // Return the length of the key prefix in this schema. + size_t num_key_columns() const { + return num_key_columns_; + } + + // Return the byte offset within the row for the given column index. + size_t column_offset(size_t col_idx) const { + DCHECK_LT(col_idx, cols_.size()); + return col_offsets_[col_idx]; + } + + // Return the ColumnSchema corresponding to the given column index. + inline const ColumnSchema &column(size_t idx) const { + DCHECK_LT(idx, cols_.size()); + return cols_[idx]; + } + + // Return the ColumnSchema corresponding to the given column ID. + inline const ColumnSchema& column_by_id(ColumnId id) const { + int idx = find_column_by_id(id); + DCHECK_GE(idx, 0); + return cols_[idx]; + } + + // Return the column ID corresponding to the given column index + ColumnId column_id(size_t idx) const { + DCHECK(has_column_ids()); + DCHECK_LT(idx, cols_.size()); + return col_ids_[idx]; + } + + // Return true if the schema contains an ID mapping for its columns. + // In the case of an empty schema, this is false. 
+ bool has_column_ids() const { + return !col_ids_.empty(); + } + + const std::vector& columns() const { + return cols_; + } + + // Return the column index corresponding to the given column, + // or kColumnNotFound if the column is not in this schema. + int find_column(const StringPiece col_name) const { + auto iter = name_to_index_.find(col_name); + if (PREDICT_FALSE(iter == name_to_index_.end())) { + return kColumnNotFound; + } else { + return (*iter).second; + } + } + + // Returns true if the schema contains nullable columns + bool has_nullables() const { + return has_nullables_; + } + + // Returns true if the specified column (by name) is a key + bool is_key_column(const StringPiece col_name) const { + return is_key_column(find_column(col_name)); + } + + // Returns true if the specified column (by index) is a key + bool is_key_column(size_t idx) const { + return idx < num_key_columns_; + } + + // Return true if this Schema is initialized and valid. + bool initialized() const { + return !col_offsets_.empty(); + } + + // Returns the highest column id in this Schema. + ColumnId max_col_id() const { + return max_col_id_; + } + + // Extract a given column from a row where the type is + // known at compile-time. The type is checked with a debug + // assertion -- but if the wrong type is used and these assertions + // are off, incorrect data may result. + // + // This is mostly useful for tests at this point. + // TODO: consider removing it. 
+ template + const typename DataTypeTraits::cpp_type * + ExtractColumnFromRow(const RowType& row, size_t idx) const { + DCHECK_SCHEMA_EQ(*this, *row.schema()); + const ColumnSchema& col_schema = cols_[idx]; + DCHECK_LT(idx, cols_.size()); + DCHECK_EQ(col_schema.type_info()->type(), Type); + + const void *val; + if (col_schema.is_nullable()) { + val = row.nullable_cell_ptr(idx); + } else { + val = row.cell_ptr(idx); + } + + return reinterpret_cast::cpp_type *>(val); + } + + // Stringify the given row, which conforms to this schema, + // in a way suitable for debugging. This isn't currently optimized + // so should be avoided in hot paths. + template + string DebugRow(const RowType& row) const { + DCHECK_SCHEMA_EQ(*this, *row.schema()); + return DebugRowColumns(row, num_columns()); + } + + // Stringify the given row, which must have a schema which is + // key-compatible with this one. Per above, this is not for use in + // hot paths. + template + string DebugRowKey(const RowType& row) const { + DCHECK_KEY_PROJECTION_SCHEMA_EQ(*this, *row.schema()); + return DebugRowColumns(row, num_key_columns()); + } + + // Decode the specified encoded key into the given 'buffer', which + // must be at least as large as this->key_byte_size(). + // + // 'arena' is used for allocating indirect strings, but is unused + // for other datatypes. + Status DecodeRowKey(Slice encoded_key, uint8_t* buffer, + Arena* arena) const WARN_UNUSED_RESULT; + + // Decode and stringify the given contiguous encoded row key in + // order to, e.g., provide print human-readable information about a + // tablet's start and end keys. + // + // If the encoded key is empty then '' or '' + // will be returned based on the value of 'start_or_end'. + // + // See also: DebugRowKey, DecodeRowKey. + enum StartOrEnd { + START_KEY, + END_KEY + }; + string DebugEncodedRowKey(Slice encoded_key, StartOrEnd start_or_end) const; + + // Compare two rows of this schema. 
+ template + int Compare(const RowTypeA& lhs, const RowTypeB& rhs) const { + DCHECK(KeyEquals(*lhs.schema()) && KeyEquals(*rhs.schema())); + + for (size_t col = 0; col < num_key_columns_; col++) { + int col_compare = column(col).Compare(lhs.cell_ptr(col), rhs.cell_ptr(col)); + if (col_compare != 0) { + return col_compare; + } + } + return 0; + } + + // Return the projection of this schema which contains only + // the key columns. + // TODO: this should take a Schema* out-parameter to avoid an + // extra copy of the ColumnSchemas. + // TODO this should probably be cached since the key projection + // is not supposed to change, for a single schema. + Schema CreateKeyProjection() const { + vector key_cols(cols_.begin(), + cols_.begin() + num_key_columns_); + vector col_ids; + if (!col_ids_.empty()) { + col_ids.assign(col_ids_.begin(), col_ids_.begin() + num_key_columns_); + } + + return Schema(key_cols, col_ids, num_key_columns_); + } + + // Return a new Schema which is the same as this one, but with IDs assigned. + // Requires that this schema has no column IDs. + Schema CopyWithColumnIds() const; + + // Return a new Schema which is the same as this one, but without any column + // IDs assigned. + // + // Requires that this schema has column IDs. + Schema CopyWithoutColumnIds() const; + + // Create a new schema containing only the selected columns. + // The resulting schema will have no key columns defined. + // If this schema has IDs, the resulting schema will as well. + Status CreateProjectionByNames(const std::vector& col_names, + Schema* out) const; + + // Create a new schema containing only the selected column IDs. + // + // If any column IDs are invalid, then they will be ignored and the + // result will have fewer columns than requested. + // + // The resulting schema will have no key columns defined. 
+ Status CreateProjectionByIdsIgnoreMissing(const std::vector& col_ids, + Schema* out) const; + + // Encode the key portion of the given row into a buffer + // such that the buffer's lexicographic comparison represents + // the proper comparison order of the underlying types. + // + // The key is encoded into the given buffer, replacing its current + // contents. + // Returns the encoded key. + template + Slice EncodeComparableKey(const RowType& row, faststring *dst) const { + DCHECK_KEY_PROJECTION_SCHEMA_EQ(*this, *row.schema()); + + dst->clear(); + for (size_t i = 0; i < num_key_columns_; i++) { + DCHECK(!cols_[i].is_nullable()); + const TypeInfo* ti = cols_[i].type_info(); + bool is_last = i == num_key_columns_ - 1; + GetKeyEncoder(ti).Encode(row.cell_ptr(i), is_last, dst); + } + return Slice(*dst); + } + + // Stringify this Schema. This is not particularly efficient, + // so should only be used when necessary for output. + string ToString() const; + + // Return true if the schemas have exactly the same set of columns + // and respective types. + bool Equals(const Schema &other) const { + if (this == &other) return true; + if (this->num_key_columns_ != other.num_key_columns_) return false; + if (this->cols_.size() != other.cols_.size()) return false; + + const bool have_column_ids = other.has_column_ids() && has_column_ids(); + for (size_t i = 0; i < other.cols_.size(); i++) { + if (!this->cols_[i].Equals(other.cols_[i], have_column_ids)) return false; + } + + return true; + } + + // Return true if the key projection schemas have exactly the same set of + // columns and respective types. 
+ bool KeyEquals(const Schema& other) const { + if (this->num_key_columns_ != other.num_key_columns_) return false; + for (size_t i = 0; i < this->num_key_columns_; i++) { + if (!this->cols_[i].Equals(other.cols_[i], false)) return false; + } + return true; + } + + // Return a non-OK status if the project is not compatible with the current schema + // - User columns non present in the tablet are considered errors + // - Matching columns with different types, at the moment, are considered errors + Status VerifyProjectionCompatibility(const Schema& projection) const; + + // Returns the projection schema mapped on the current one + // If the project is invalid, return a non-OK status. + Status GetMappedReadProjection(const Schema& projection, + Schema *mapped_projection) const; + + // Loops through this schema (the projection) and calls the projector methods once for + // each column. + // + // - Status ProjectBaseColumn(size_t proj_col_idx, size_t base_col_idx) + // + // Called if the column in this schema matches one of the columns in 'base_schema'. + // The column type must match exactly. + // + // - Status ProjectDefaultColumn(size_t proj_idx) + // + // Called if the column in this schema does not match any column in 'base_schema', + // but has a default or is nullable. + // + // - Status ProjectExtraColumn(size_t proj_idx, const ColumnSchema& col) + // + // Called if the column in this schema does not match any column in 'base_schema', + // and does not have a default, and is not nullable. + // + // If both schemas have column IDs, then the matching is done by ID. Otherwise, it is + // done by name. + // + // TODO(MAYBE): Pass the ColumnSchema and not only the column index? 
+ template + Status GetProjectionMapping(const Schema& base_schema, Projector *projector) const { + const bool use_column_ids = base_schema.has_column_ids() && has_column_ids(); + + int proj_idx = 0; + for (int i = 0; i < cols_.size(); ++i) { + const ColumnSchema& col_schema = cols_[i]; + + // try to lookup the column by ID if present or just by name. + // Unit tests and Iter-Projections are probably always using the + // lookup by name. The IDs are generally set by the server on AlterTable(). + int base_idx; + if (use_column_ids) { + base_idx = base_schema.find_column_by_id(col_ids_[i]); + } else { + base_idx = base_schema.find_column(col_schema.name()); + } + + if (base_idx >= 0) { + const ColumnSchema& base_col_schema = base_schema.column(base_idx); + // Column present in the Base Schema... + if (!col_schema.EqualsType(base_col_schema)) { + // ...but with a different type, (TODO: try with an adaptor) + return Status::InvalidArgument("The column '" + col_schema.name() + + "' must have type " + + base_col_schema.TypeToString() + + " found " + col_schema.TypeToString()); + } else { + RETURN_NOT_OK(projector->ProjectBaseColumn(proj_idx, base_idx)); + } + } else { + bool has_default = col_schema.has_read_default() || col_schema.has_write_default(); + if (!has_default && !col_schema.is_nullable()) { + RETURN_NOT_OK(projector->ProjectExtraColumn(proj_idx)); + } + + // Column missing from the Base Schema, use the default value of the projection + RETURN_NOT_OK(projector->ProjectDefaultColumn(proj_idx)); + } + proj_idx++; + } + return Status::OK(); + } + + // Returns the column index given the column ID. + // If no such column exists, returns kColumnNotFound. + int find_column_by_id(ColumnId id) const { + DCHECK(cols_.empty() || has_column_ids()); + int ret = id_to_index_[id]; + if (ret == -1) { + return kColumnNotFound; + } + return ret; + } + + // Returns the memory usage of this object without the object itself. Should + // be used when embedded inside another object. 
+ size_t memory_footprint_excluding_this() const; + + // Returns the memory usage of this object including the object itself. + // Should be used when allocated on the heap. + size_t memory_footprint_including_this() const; + + private: + + // Return a stringified version of the first 'num_columns' columns of the + // row. + template + std::string DebugRowColumns(const RowType& row, int num_columns) const { + string ret; + ret.append("("); + + for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { + if (col_idx > 0) { + ret.append(", "); + } + const ColumnSchema& col = cols_[col_idx]; + col.DebugCellAppend(row.cell(col_idx), &ret); + } + ret.append(")"); + return ret; + } + + friend class SchemaBuilder; + + vector cols_; + size_t num_key_columns_; + ColumnId max_col_id_; + vector col_ids_; + vector col_offsets_; + + // The keys of this map are StringPiece references to the actual name members of the + // ColumnSchema objects inside cols_. This avoids an extra copy of those strings, + // and also allows us to do lookups on the map using StringPiece keys, sometimes + // avoiding copies. + // + // The map is instrumented with a counting allocator so that we can accurately + // measure its memory footprint. + int64_t name_to_index_bytes_; + typedef STLCountingAllocator > NameToIndexMapAllocator; + typedef unordered_map< + StringPiece, + size_t, + std::hash, + std::equal_to, + NameToIndexMapAllocator> NameToIndexMap; + NameToIndexMap name_to_index_; + + IdMapping id_to_index_; + + // Cached indicator whether any columns are nullable. + bool has_nullables_; + + // NOTE: if you add more members, make sure to add the appropriate + // code to swap() and CopyFrom() as well to prevent subtle bugs. +}; + +// Helper used for schema creation/editing. +// +// Example: +// Status s; +// SchemaBuilder builder(base_schema); +// s = builder.RemoveColumn("value"); +// s = builder.AddKeyColumn("key2", STRING); +// s = builder.AddColumn("new_c1", UINT32); +// ... 
+// Schema new_schema = builder.Build(); +class SchemaBuilder { + public: + SchemaBuilder() { Reset(); } + explicit SchemaBuilder(const Schema& schema) { Reset(schema); } + + void Reset(); + void Reset(const Schema& schema); + + bool is_valid() const { return cols_.size() > 0; } + + // Set the next column ID to be assigned to columns added with + // AddColumn. + void set_next_column_id(ColumnId next_id) { + DCHECK_GE(next_id, ColumnId(0)); + next_id_ = next_id; + } + + // Return the next column ID that would be assigned with AddColumn. + ColumnId next_column_id() const { + return next_id_; + } + + Schema Build() const { return Schema(cols_, col_ids_, num_key_columns_); } + Schema BuildWithoutIds() const { return Schema(cols_, num_key_columns_); } + + Status AddKeyColumn(const string& name, DataType type); + + Status AddColumn(const ColumnSchema& column, bool is_key); + + Status AddColumn(const string& name, DataType type) { + return AddColumn(name, type, false, NULL, NULL); + } + + Status AddNullableColumn(const string& name, DataType type) { + return AddColumn(name, type, true, NULL, NULL); + } + + Status AddColumn(const string& name, + DataType type, + bool is_nullable, + const void *read_default, + const void *write_default); + + Status RemoveColumn(const string& name); + Status RenameColumn(const string& old_name, const string& new_name); + + private: + DISALLOW_COPY_AND_ASSIGN(SchemaBuilder); + + ColumnId next_id_; + vector col_ids_; + vector cols_; + unordered_set col_names_; + size_t num_key_columns_; +}; + +} // namespace kudu + +// Specialize std::hash for ColumnId +namespace std { +template<> +struct hash { + int operator()(const kudu::ColumnId& col_id) const { + return col_id; + } +}; +} // namespace std + +#endif diff --git a/src/kudu/common/timestamp.cc b/src/kudu/common/timestamp.cc new file mode 100644 index 000000000000..25d1325e42dd --- /dev/null +++ b/src/kudu/common/timestamp.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/timestamp.h" + +#include "kudu/util/faststring.h" +#include "kudu/util/memcmpable_varint.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/mathlimits.h" + +namespace kudu { + +const Timestamp Timestamp::kMin(MathLimits::kMin); +const Timestamp Timestamp::kMax(MathLimits::kMax); +const Timestamp Timestamp::kInitialTimestamp(MathLimits::kMin + 1); +const Timestamp Timestamp::kInvalidTimestamp(MathLimits::kMax - 1); + +bool Timestamp::DecodeFrom(Slice *input) { + return GetMemcmpableVarint64(input, &v); +} + +void Timestamp::EncodeTo(faststring *dst) const { + PutMemcmpableVarint64(dst, v); +} + +string Timestamp::ToString() const { + return strings::Substitute("$0", v); +} + +uint64_t Timestamp::ToUint64() const { + return v; +} + +Status Timestamp::FromUint64(uint64_t value) { + v = value; + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/common/timestamp.h b/src/kudu/common/timestamp.h new file mode 100644 index 000000000000..e2c843c4aa83 --- /dev/null +++ b/src/kudu/common/timestamp.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_COMMON_TIMESTAMP_H_ +#define KUDU_COMMON_TIMESTAMP_H_ + +#include +#include + +namespace kudu { +class faststring; +class Slice; +class Status; + +// A transaction timestamp generated by a Clock. +class Timestamp { + public: + typedef uint64_t val_type; + + Timestamp() : v(kInvalidTimestamp.v) {} + + explicit Timestamp(uint64_t val) : v(val) {} + + bool operator ==(const Timestamp &other) const { + return v == other.v; + } + bool operator !=(const Timestamp &other) const { + return v != other.v; + } + + // Decode a timestamp from the given input slice. + // Mutates the slice to point after the decoded timestamp. + // Returns true upon success. + bool DecodeFrom(Slice *input); + + // Encode the timestamp to the given buffer. + void EncodeTo(faststring *dst) const; + + int CompareTo(const Timestamp &other) const; + + std::string ToString() const; + + // Returns this Timestamp as an uint64_t + uint64_t ToUint64() const; + + // Sets this Timestamp from 'value' + Status FromUint64(uint64_t value); + + val_type value() const { return v; } + + // An initial transaction timestamp, higher than min so that we can have + // a Timestamp guaranteed to be lower than all generated timestamps. 
+ static const Timestamp kInitialTimestamp; + + // An invalid transaction timestamp -- Timestamp types initialize to this variable. + static const Timestamp kInvalidTimestamp; + + // The maximum timestamp. + static const Timestamp kMax; + + // The minimum timestamp. + static const Timestamp kMin; + + private: + val_type v; +}; + +inline std::ostream &operator <<(std::ostream &o, const Timestamp ×tamp) { + return o << timestamp.ToString(); +} + +inline int Timestamp::CompareTo(const Timestamp &other) const { + if (v < other.v) { + return -1; + } else if (v > other.v) { + return 1; + } + return 0; +} + +} // namespace kudu + +#endif /* KUDU_COMMON_TIMESTAMP_H_ */ diff --git a/src/kudu/common/types-test.cc b/src/kudu/common/types-test.cc new file mode 100644 index 000000000000..ed5660611668 --- /dev/null +++ b/src/kudu/common/types-test.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/common/types.h" + +using std::string; + +namespace kudu { + +TEST(TestTypes, TestTimestampPrinting) { + const TypeInfo* info = GetTypeInfo(TIMESTAMP); + + // Test the minimum value + int64 time; + info->CopyMinValue(&time); + string result; + info->AppendDebugStringForValue(&time, &result); + ASSERT_EQ("-290308-12-21 19:59:05.224192 GMT", result); + result = ""; + + // Test a regular negative timestamp. + time = -1454368523123456; + info->AppendDebugStringForValue(&time, &result); + ASSERT_EQ("1923-12-01 00:44:36.876544 GMT", result); + result = ""; + + // Test that passing 0 microseconds returns the correct time (0 msecs after the epoch). + // This is a test for a bug where printing a timestamp with the value 0 would return the + // current time instead. + time = 0; + info->AppendDebugStringForValue(&time, &result); + ASSERT_EQ("1970-01-01 00:00:00.000000 GMT", result); + result = ""; + + // Test a regular positive timestamp. + time = 1454368523123456; + info->AppendDebugStringForValue(&time, &result); + ASSERT_EQ("2016-02-01 23:15:23.123456 GMT", result); + result = ""; + + // Test the maximum value. + time = MathLimits::kMax; + info->AppendDebugStringForValue(&time, &result); + ASSERT_EQ("294247-01-10 04:00:54.775807 GMT", result); +} + + +} // namespace kudu diff --git a/src/kudu/common/types.cc b/src/kudu/common/types.cc new file mode 100644 index 000000000000..ca29e41337c4 --- /dev/null +++ b/src/kudu/common/types.cc @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/types.h" + +#include +#include + +#include "kudu/gutil/singleton.h" + +using std::shared_ptr; +using std::unordered_map; + +namespace kudu { + +template +TypeInfo::TypeInfo(TypeTraitsClass t) + : type_(TypeTraitsClass::type), + physical_type_(TypeTraitsClass::physical_type), + name_(TypeTraitsClass::name()), + size_(TypeTraitsClass::size), + min_value_(TypeTraitsClass::min_value()), + append_func_(TypeTraitsClass::AppendDebugStringForValue), + compare_func_(TypeTraitsClass::Compare) { +} + +void TypeInfo::AppendDebugStringForValue(const void *ptr, string *str) const { + append_func_(ptr, str); +} + +int TypeInfo::Compare(const void *lhs, const void *rhs) const { + return compare_func_(lhs, rhs); +} + +class TypeInfoResolver { + public: + const TypeInfo* GetTypeInfo(DataType t) { + const TypeInfo *type_info = mapping_[t].get(); + CHECK(type_info != nullptr) << + "Bad type: " << t; + return type_info; + } + + private: + TypeInfoResolver() { + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + AddMapping(); + } + + template void AddMapping() { + TypeTraits traits; + mapping_.insert(make_pair(type, shared_ptr(new TypeInfo(traits)))); + } + + unordered_map, + std::hash > mapping_; + + friend class Singleton; + DISALLOW_COPY_AND_ASSIGN(TypeInfoResolver); +}; + +const TypeInfo* GetTypeInfo(DataType type) { + return Singleton::get()->GetTypeInfo(type); +} + +} // namespace kudu 
diff --git a/src/kudu/common/types.h b/src/kudu/common/types.h new file mode 100644 index 000000000000..470a12b9796d --- /dev/null +++ b/src/kudu/common/types.h @@ -0,0 +1,562 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_COMMON_TYPES_H +#define KUDU_COMMON_TYPES_H + +#include + +#include +#include + +#include "kudu/common/common.pb.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/util/slice.h" + +namespace kudu { + +// The size of the in-memory format of the largest +// type we support. +const int kLargestTypeSize = sizeof(Slice); + +using std::string; +class TypeInfo; + +// This is the important bit of this header: +// given a type enum, get the TypeInfo about it. +extern const TypeInfo* GetTypeInfo(DataType type); + +// Information about a given type. +// This is a runtime equivalent of the TypeTraits template below. +class TypeInfo { + public: + // Returns the type mentioned in the schema. + DataType type() const { return type_; } + // Returns the type used to actually store the data. 
+ DataType physical_type() const { return physical_type_; } + const string& name() const { return name_; } + const size_t size() const { return size_; } + void AppendDebugStringForValue(const void *ptr, string *str) const; + int Compare(const void *lhs, const void *rhs) const; + void CopyMinValue(void* dst) const { + memcpy(dst, min_value_, size_); + } + + private: + friend class TypeInfoResolver; + template TypeInfo(Type t); + + const DataType type_; + const DataType physical_type_; + const string name_; + const size_t size_; + const void* const min_value_; + + typedef void (*AppendDebugFunc)(const void *, string *); + const AppendDebugFunc append_func_; + + typedef int (*CompareFunc)(const void *, const void *); + const CompareFunc compare_func_; +}; + +template struct DataTypeTraits {}; + +template +static int GenericCompare(const void *lhs, const void *rhs) { + typedef typename DataTypeTraits::cpp_type CppType; + CppType lhs_int = *reinterpret_cast(lhs); + CppType rhs_int = *reinterpret_cast(rhs); + if (lhs_int < rhs_int) { + return -1; + } else if (lhs_int > rhs_int) { + return 1; + } else { + return 0; + } +} + +template<> +struct DataTypeTraits { + static const DataType physical_type = UINT8; + typedef uint8_t cpp_type; + static const char *name() { + return "uint8"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = INT8; + typedef int8_t cpp_type; + static const char *name() { + return "int8"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const 
cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = UINT16; + typedef uint16_t cpp_type; + static const char *name() { + return "uint16"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = INT16; + typedef int16_t cpp_type; + static const char *name() { + return "int16"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = UINT32; + typedef uint32_t cpp_type; + static const char *name() { + return "uint32"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = INT32; + typedef int32_t cpp_type; + static const char *name() { + return "int32"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type 
= UINT64; + typedef uint64_t cpp_type; + static const char *name() { + return "uint64"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = INT64; + typedef int64_t cpp_type; + static const char *name() { + return "int64"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleItoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = FLOAT; + typedef float cpp_type; + static const char *name() { + return "float"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleFtoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = DOUBLE; + typedef double cpp_type; + static const char *name() { + return "double"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + str->append(SimpleDtoa(*reinterpret_cast(val))); + } + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + return &MathLimits::kMin; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = BINARY; + typedef Slice cpp_type; + static const char *name() { + return "binary"; + } + static void AppendDebugStringForValue(const void 
*val, string *str) { + const Slice *s = reinterpret_cast(val); + str->append(strings::CHexEscape(s->ToString())); + } + + static int Compare(const void *lhs, const void *rhs) { + const Slice *lhs_slice = reinterpret_cast(lhs); + const Slice *rhs_slice = reinterpret_cast(rhs); + return lhs_slice->compare(*rhs_slice); + } + static const cpp_type* min_value() { + static Slice s(""); + return &s; + } +}; + +template<> +struct DataTypeTraits { + static const DataType physical_type = BOOL; + typedef bool cpp_type; + static const char* name() { + return "bool"; + } + static void AppendDebugStringForValue(const void* val, string* str) { + str->append(*reinterpret_cast(val) ? "true" : "false"); + } + + static int Compare(const void *lhs, const void *rhs) { + return GenericCompare(lhs, rhs); + } + static const cpp_type* min_value() { + static bool b = false; + return &b; + } +}; + +// Base class for types that are derived, that is that have some other type as the +// physical representation. +template +struct DerivedTypeTraits { + typedef typename DataTypeTraits::cpp_type cpp_type; + static const DataType physical_type = PhysicalType; + + static void AppendDebugStringForValue(const void *val, string *str) { + DataTypeTraits::AppendDebugStringForValue(val, str); + } + + static int Compare(const void *lhs, const void *rhs) { + return DataTypeTraits::Compare(lhs, rhs); + } + + static const cpp_type* min_value() { + return DataTypeTraits::min_value(); + } +}; + +template<> +struct DataTypeTraits : public DerivedTypeTraits{ + static const char* name() { + return "string"; + } + static void AppendDebugStringForValue(const void *val, string *str) { + const Slice *s = reinterpret_cast(val); + str->append(strings::Utf8SafeCEscape(s->ToString())); + } +}; + +static const char* kDateFormat = "%Y-%m-%d %H:%M:%S"; +static const char* kDateMicrosAndTzFormat = "%s.%06d GMT"; + +template<> +struct DataTypeTraits : public DerivedTypeTraits{ + static const int US_TO_S = 1000L * 1000L; + + 
static const char* name() { + return "timestamp"; + } + + static void AppendDebugStringForValue(const void* val, string* str) { + int64_t timestamp_micros = *reinterpret_cast(val); + time_t secs_since_epoch = timestamp_micros / US_TO_S; + // If the time is negative we need to take into account that any microseconds + // will actually decrease the time in seconds by one. + int remaining_micros = timestamp_micros % US_TO_S; + if (remaining_micros < 0) { + secs_since_epoch--; + remaining_micros = US_TO_S - std::abs(remaining_micros); + } + struct tm tm_info; + gmtime_r(&secs_since_epoch, &tm_info); + char time_up_to_secs[24]; + strftime(time_up_to_secs, sizeof(time_up_to_secs), kDateFormat, &tm_info); + char time[34]; + snprintf(time, sizeof(time), kDateMicrosAndTzFormat, time_up_to_secs, remaining_micros); + str->append(time); + } +}; + +// Instantiate this template to get static access to the type traits. +template +struct TypeTraits : public DataTypeTraits { + typedef typename DataTypeTraits::cpp_type cpp_type; + + static const DataType type = datatype; + static const size_t size = sizeof(cpp_type); +}; + +class Variant { + public: + Variant(DataType type, const void *value) { + Reset(type, value); + } + + ~Variant() { + Clear(); + } + + template + void Reset(const typename DataTypeTraits::cpp_type& value) { + Reset(Type, &value); + } + + // Set the variant to the specified type/value. + // The value must be of the relative type. + // In case of strings, the value must be a pointer to a Slice, and the data block + // will be copied, and released by the variant on the next set/clear call. 
+ // + // Examples: + // uint16_t u16 = 512; + // Slice slice("Hello World"); + // variant.set(UINT16, &u16); + // variant.set(STRING, &slice); + void Reset(DataType type, const void *value) { + CHECK(value != NULL) << "Variant value must be not NULL"; + Clear(); + type_ = type; + switch (type_) { + case UNKNOWN_DATA: + LOG(FATAL) << "Unreachable"; + case BOOL: + numeric_.b1 = *static_cast(value); + break; + case INT8: + numeric_.i8 = *static_cast(value); + break; + case UINT8: + numeric_.u8 = *static_cast(value); + break; + case INT16: + numeric_.i16 = *static_cast(value); + break; + case UINT16: + numeric_.u16 = *static_cast(value); + break; + case INT32: + numeric_.i32 = *static_cast(value); + break; + case UINT32: + numeric_.u32 = *static_cast(value); + break; + case TIMESTAMP: + case INT64: + numeric_.i64 = *static_cast(value); + break; + case UINT64: + numeric_.u64 = *static_cast(value); + break; + case FLOAT: + numeric_.float_val = *static_cast(value); + break; + case DOUBLE: + numeric_.double_val = *static_cast(value); + break; + case STRING: // Fallthrough intended. + case BINARY: + { + const Slice *str = static_cast(value); + // In the case that str->size() == 0, then the 'Clear()' above has already + // set vstr_ to Slice(""). Otherwise, we need to allocate and copy the + // user's data. + if (str->size() > 0) { + auto blob = new uint8_t[str->size()]; + memcpy(blob, str->data(), str->size()); + vstr_ = Slice(blob, str->size()); + } + } + break; + default: LOG(FATAL) << "Unknown data type: " << type_; + } + } + + // Set the variant to a STRING type. + // The specified data block will be copied, and released by the variant + // on the next set/clear call. + void Reset(const string& data) { + Slice slice(data); + Reset(STRING, &slice); + } + + // Set the variant to a STRING type. + // The specified data block will be copied, and released by the variant + // on the next set/clear call. 
+ void Reset(const char *data, size_t size) { + Slice slice(data, size); + Reset(STRING, &slice); + } + + // Returns the type of the Variant + DataType type() const { + return type_; + } + + // Returns a pointer to the internal variant value + // The return value can be casted to the relative type() + // The return value will be valid until the next set() is called. + // + // Examples: + // static_cast(variant.value()) + // static_cast(variant.value()) + const void *value() const { + switch (type_) { + case UNKNOWN_DATA: LOG(FATAL) << "Attempted to access value of unknown data type"; + case BOOL: return &(numeric_.b1); + case INT8: return &(numeric_.i8); + case UINT8: return &(numeric_.u8); + case INT16: return &(numeric_.i16); + case UINT16: return &(numeric_.u16); + case INT32: return &(numeric_.i32); + case UINT32: return &(numeric_.u32); + case INT64: return &(numeric_.i64); + case UINT64: return &(numeric_.u64); + case FLOAT: return (&numeric_.float_val); + case DOUBLE: return (&numeric_.double_val); + case STRING: + case BINARY: return &vstr_; + default: LOG(FATAL) << "Unknown data type: " << type_; + } + CHECK(false) << "not reached!"; + return NULL; + } + + bool Equals(const Variant *other) const { + if (other == NULL || type_ != other->type_) + return false; + return GetTypeInfo(type_)->Compare(value(), other->value()) == 0; + } + + private: + DISALLOW_COPY_AND_ASSIGN(Variant); + + void Clear() { + // No need to delete[] zero-length vstr_, because we always ensure that + // such a string would point to a constant "" rather than an allocated piece + // of memory. 
+ if (vstr_.size() > 0) { + delete[] vstr_.mutable_data(); + vstr_.clear(); + } + } + + union NumericValue { + bool b1; + int8_t i8; + uint8_t u8; + int16_t i16; + uint16_t u16; + int32_t i32; + uint32_t u32; + int64_t i64; + uint64_t u64; + float float_val; + double double_val; + }; + + DataType type_; + NumericValue numeric_; + Slice vstr_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/common/wire_protocol-test-util.h b/src/kudu/common/wire_protocol-test-util.h new file mode 100644 index 000000000000..8e47c96cfef5 --- /dev/null +++ b/src/kudu/common/wire_protocol-test-util.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_COMMON_WIRE_PROTOCOL_TEST_UTIL_H_ +#define KUDU_COMMON_WIRE_PROTOCOL_TEST_UTIL_H_ + +#include "kudu/common/wire_protocol.h" + +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row.h" +#include "kudu/common/row_operations.h" + +namespace kudu { + +inline Schema GetSimpleTestSchema() { + return Schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING, true) }, + 1); +} + +inline void AddTestRowWithNullableStringToPB(RowOperationsPB::Type op_type, + const Schema& schema, + int32_t key, + int32_t int_val, + const char* string_val, + RowOperationsPB* ops) { + DCHECK(schema.initialized()); + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32("key", key)); + CHECK_OK(row.SetInt32("int_val", int_val)); + if (string_val) { + CHECK_OK(row.SetStringCopy("string_val", string_val)); + } + RowOperationsPBEncoder enc(ops); + enc.Add(op_type, row); +} + +inline void AddTestRowToPB(RowOperationsPB::Type op_type, + const Schema& schema, + int32_t key, + int32_t int_val, + const string& string_val, + RowOperationsPB* ops) { + AddTestRowWithNullableStringToPB(op_type, schema, key, int_val, string_val.c_str(), ops); +} + +inline void AddTestKeyToPB(RowOperationsPB::Type op_type, + const Schema& schema, + int32_t key, + RowOperationsPB* ops) { + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32(0, key)); + RowOperationsPBEncoder enc(ops); + enc.Add(op_type, row); +} + +} // namespace kudu + +#endif /* KUDU_COMMON_WIRE_PROTOCOL_TEST_UTIL_H_ */ diff --git a/src/kudu/common/wire_protocol-test.cc b/src/kudu/common/wire_protocol-test.cc new file mode 100644 index 000000000000..80919429d8e2 --- /dev/null +++ b/src/kudu/common/wire_protocol-test.cc @@ -0,0 +1,322 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "kudu/common/row.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/util/status.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class WireProtocolTest : public KuduTest { + public: + WireProtocolTest() + : schema_({ ColumnSchema("col1", STRING), + ColumnSchema("col2", STRING), + ColumnSchema("col3", UINT32, true /* nullable */) }, + 1) { + } + + void FillRowBlockWithTestRows(RowBlock* block) { + block->selection_vector()->SetAllTrue(); + + for (int i = 0; i < block->nrows(); i++) { + RowBlockRow row = block->row(i); + *reinterpret_cast(row.mutable_cell_ptr(0)) = Slice("hello world col1"); + *reinterpret_cast(row.mutable_cell_ptr(1)) = Slice("hello world col2"); + *reinterpret_cast(row.mutable_cell_ptr(2)) = i; + row.cell(2).set_null(false); + } + } + protected: + Schema schema_; +}; + +TEST_F(WireProtocolTest, TestOKStatus) { + Status s = Status::OK(); + AppStatusPB pb; + StatusToPB(s, &pb); + EXPECT_EQ(AppStatusPB::OK, pb.code()); + EXPECT_FALSE(pb.has_message()); + EXPECT_FALSE(pb.has_posix_code()); + + Status s2 = StatusFromPB(pb); + ASSERT_OK(s2); +} + +TEST_F(WireProtocolTest, TestBadStatus) { + Status s = Status::NotFound("foo", "bar"); + AppStatusPB pb; + StatusToPB(s, &pb); + 
EXPECT_EQ(AppStatusPB::NOT_FOUND, pb.code()); + EXPECT_TRUE(pb.has_message()); + EXPECT_EQ("foo: bar", pb.message()); + EXPECT_FALSE(pb.has_posix_code()); + + Status s2 = StatusFromPB(pb); + EXPECT_TRUE(s2.IsNotFound()); + EXPECT_EQ(s.ToString(), s2.ToString()); +} + +TEST_F(WireProtocolTest, TestBadStatusWithPosixCode) { + Status s = Status::NotFound("foo", "bar", 1234); + AppStatusPB pb; + StatusToPB(s, &pb); + EXPECT_EQ(AppStatusPB::NOT_FOUND, pb.code()); + EXPECT_TRUE(pb.has_message()); + EXPECT_EQ("foo: bar", pb.message()); + EXPECT_TRUE(pb.has_posix_code()); + EXPECT_EQ(1234, pb.posix_code()); + + Status s2 = StatusFromPB(pb); + EXPECT_TRUE(s2.IsNotFound()); + EXPECT_EQ(1234, s2.posix_code()); + EXPECT_EQ(s.ToString(), s2.ToString()); +} + +TEST_F(WireProtocolTest, TestSchemaRoundTrip) { + google::protobuf::RepeatedPtrField pbs; + + ASSERT_OK(SchemaToColumnPBs(schema_, &pbs)); + ASSERT_EQ(3, pbs.size()); + + // Column 0. + EXPECT_TRUE(pbs.Get(0).is_key()); + EXPECT_EQ("col1", pbs.Get(0).name()); + EXPECT_EQ(STRING, pbs.Get(0).type()); + EXPECT_FALSE(pbs.Get(0).is_nullable()); + + // Column 1. + EXPECT_FALSE(pbs.Get(1).is_key()); + EXPECT_EQ("col2", pbs.Get(1).name()); + EXPECT_EQ(STRING, pbs.Get(1).type()); + EXPECT_FALSE(pbs.Get(1).is_nullable()); + + // Column 2. + EXPECT_FALSE(pbs.Get(2).is_key()); + EXPECT_EQ("col3", pbs.Get(2).name()); + EXPECT_EQ(UINT32, pbs.Get(2).type()); + EXPECT_TRUE(pbs.Get(2).is_nullable()); + + // Convert back to a Schema object and verify they're identical. + Schema schema2; + ASSERT_OK(ColumnPBsToSchema(pbs, &schema2)); + EXPECT_EQ(schema_.ToString(), schema2.ToString()); + EXPECT_EQ(schema_.num_key_columns(), schema2.num_key_columns()); +} + +// Test that, when non-contiguous key columns are passed, an error Status +// is returned. 
+TEST_F(WireProtocolTest, TestBadSchema_NonContiguousKey) { + google::protobuf::RepeatedPtrField pbs; + + // Column 0: key + ColumnSchemaPB* col_pb = pbs.Add(); + col_pb->set_name("c0"); + col_pb->set_type(STRING); + col_pb->set_is_key(true); + + // Column 1: not a key + col_pb = pbs.Add(); + col_pb->set_name("c1"); + col_pb->set_type(STRING); + col_pb->set_is_key(false); + + // Column 2: marked as key. This is an error. + col_pb = pbs.Add(); + col_pb->set_name("c2"); + col_pb->set_type(STRING); + col_pb->set_is_key(true); + + Schema schema; + Status s = ColumnPBsToSchema(pbs, &schema); + ASSERT_STR_CONTAINS(s.ToString(), "Got out-of-order key column"); +} + +// Test that, when multiple columns with the same name are passed, an +// error Status is returned. +TEST_F(WireProtocolTest, TestBadSchema_DuplicateColumnName) { + google::protobuf::RepeatedPtrField pbs; + + // Column 0: + ColumnSchemaPB* col_pb = pbs.Add(); + col_pb->set_name("c0"); + col_pb->set_type(STRING); + col_pb->set_is_key(true); + + // Column 1: + col_pb = pbs.Add(); + col_pb->set_name("c1"); + col_pb->set_type(STRING); + col_pb->set_is_key(false); + + // Column 2: same name as column 0 + col_pb = pbs.Add(); + col_pb->set_name("c0"); + col_pb->set_type(STRING); + col_pb->set_is_key(false); + + Schema schema; + Status s = ColumnPBsToSchema(pbs, &schema); + ASSERT_EQ("Invalid argument: Duplicate column name: c0", s.ToString()); +} + +// Create a block of rows in columnar layout and ensure that it can be +// converted to and from protobuf. +TEST_F(WireProtocolTest, TestColumnarRowBlockToPB) { + Arena arena(1024, 1024 * 1024); + RowBlock block(schema_, 10, &arena); + FillRowBlockWithTestRows(&block); + + // Convert to PB. 
+ RowwiseRowBlockPB pb; + faststring direct, indirect; + SerializeRowBlock(block, &pb, nullptr, &direct, &indirect); + SCOPED_TRACE(pb.DebugString()); + SCOPED_TRACE("Row data: " + direct.ToString()); + SCOPED_TRACE("Indirect data: " + indirect.ToString()); + + // Convert back to a row, ensure that the resulting row is the same + // as the one we put in. + vector row_ptrs; + Slice direct_sidecar = direct; + ASSERT_OK(ExtractRowsFromRowBlockPB(schema_, pb, indirect, + &direct_sidecar, &row_ptrs)); + ASSERT_EQ(block.nrows(), row_ptrs.size()); + for (int i = 0; i < block.nrows(); ++i) { + ConstContiguousRow row_roundtripped(&schema_, row_ptrs[i]); + EXPECT_EQ(schema_.DebugRow(block.row(i)), + schema_.DebugRow(row_roundtripped)); + } +} + +#ifdef NDEBUG +TEST_F(WireProtocolTest, TestColumnarRowBlockToPBBenchmark) { + Arena arena(1024, 1024 * 1024); + const int kNumTrials = AllowSlowTests() ? 100 : 10; + RowBlock block(schema_, 10000 * kNumTrials, &arena); + FillRowBlockWithTestRows(&block); + + RowwiseRowBlockPB pb; + + LOG_TIMING(INFO, "Converting to PB") { + for (int i = 0; i < kNumTrials; i++) { + pb.Clear(); + faststring direct, indirect; + SerializeRowBlock(block, &pb, NULL, &direct, &indirect); + } + } +} +#endif + +// Test that trying to extract rows from an invalid block correctly returns +// Corruption statuses. +TEST_F(WireProtocolTest, TestInvalidRowBlock) { + Schema schema({ ColumnSchema("col1", STRING) }, 1); + RowwiseRowBlockPB pb; + vector row_ptrs; + + // Too short to be valid data. + const char* shortstr = "x"; + pb.set_num_rows(1); + Slice direct = shortstr; + Status s = ExtractRowsFromRowBlockPB(schema, pb, Slice(), &direct, &row_ptrs); + ASSERT_STR_CONTAINS(s.ToString(), "Corruption: Row block has 1 bytes of data"); + + // Bad pointer into indirect data. 
+ shortstr = "xxxxxxxxxxxxxxxx"; + pb.set_num_rows(1); + direct = Slice(shortstr); + s = ExtractRowsFromRowBlockPB(schema, pb, Slice(), &direct, &row_ptrs); + ASSERT_STR_CONTAINS(s.ToString(), + "Corruption: Row #0 contained bad indirect slice"); +} + +// Test serializing a block which has a selection vector but no columns. +// This is the sort of result that is returned from a scan with an empty +// projection (a COUNT(*) query). +TEST_F(WireProtocolTest, TestBlockWithNoColumns) { + Schema empty(std::vector(), 0); + Arena arena(1024, 1024 * 1024); + RowBlock block(empty, 1000, &arena); + block.selection_vector()->SetAllTrue(); + // Unselect 100 rows + for (int i = 0; i < 100; i++) { + block.selection_vector()->SetRowUnselected(i * 2); + } + ASSERT_EQ(900, block.selection_vector()->CountSelected()); + + // Convert it to protobuf, ensure that the results look right. + RowwiseRowBlockPB pb; + faststring direct, indirect; + SerializeRowBlock(block, &pb, nullptr, &direct, &indirect); + ASSERT_EQ(900, pb.num_rows()); +} + +TEST_F(WireProtocolTest, TestColumnDefaultValue) { + Slice write_default_str("Hello Write"); + Slice read_default_str("Hello Read"); + uint32_t write_default_u32 = 512; + uint32_t read_default_u32 = 256; + ColumnSchemaPB pb; + + ColumnSchema col1("col1", STRING); + ColumnSchemaToPB(col1, &pb); + ColumnSchema col1fpb = ColumnSchemaFromPB(pb); + ASSERT_FALSE(col1fpb.has_read_default()); + ASSERT_FALSE(col1fpb.has_write_default()); + ASSERT_TRUE(col1fpb.read_default_value() == nullptr); + + ColumnSchema col2("col2", STRING, false, &read_default_str); + ColumnSchemaToPB(col2, &pb); + ColumnSchema col2fpb = ColumnSchemaFromPB(pb); + ASSERT_TRUE(col2fpb.has_read_default()); + ASSERT_FALSE(col2fpb.has_write_default()); + ASSERT_EQ(read_default_str, *static_cast(col2fpb.read_default_value())); + ASSERT_EQ(nullptr, static_cast(col2fpb.write_default_value())); + + ColumnSchema col3("col3", STRING, false, &read_default_str, &write_default_str); + 
ColumnSchemaToPB(col3, &pb); + ColumnSchema col3fpb = ColumnSchemaFromPB(pb); + ASSERT_TRUE(col3fpb.has_read_default()); + ASSERT_TRUE(col3fpb.has_write_default()); + ASSERT_EQ(read_default_str, *static_cast(col3fpb.read_default_value())); + ASSERT_EQ(write_default_str, *static_cast(col3fpb.write_default_value())); + + ColumnSchema col4("col4", UINT32, false, &read_default_u32); + ColumnSchemaToPB(col4, &pb); + ColumnSchema col4fpb = ColumnSchemaFromPB(pb); + ASSERT_TRUE(col4fpb.has_read_default()); + ASSERT_FALSE(col4fpb.has_write_default()); + ASSERT_EQ(read_default_u32, *static_cast(col4fpb.read_default_value())); + ASSERT_EQ(nullptr, static_cast(col4fpb.write_default_value())); + + ColumnSchema col5("col5", UINT32, false, &read_default_u32, &write_default_u32); + ColumnSchemaToPB(col5, &pb); + ColumnSchema col5fpb = ColumnSchemaFromPB(pb); + ASSERT_TRUE(col5fpb.has_read_default()); + ASSERT_TRUE(col5fpb.has_write_default()); + ASSERT_EQ(read_default_u32, *static_cast(col5fpb.read_default_value())); + ASSERT_EQ(write_default_u32, *static_cast(col5fpb.write_default_value())); +} + +} // namespace kudu diff --git a/src/kudu/common/wire_protocol.cc b/src/kudu/common/wire_protocol.cc new file mode 100644 index 000000000000..289f9a8933e1 --- /dev/null +++ b/src/kudu/common/wire_protocol.cc @@ -0,0 +1,560 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/wire_protocol.h" + +#include +#include + +#include "kudu/common/row.h" +#include "kudu/common/rowblock.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/faststring.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/safe_math.h" +#include "kudu/util/slice.h" + +using google::protobuf::RepeatedPtrField; +using std::vector; + +namespace kudu { + +void StatusToPB(const Status& status, AppStatusPB* pb) { + pb->Clear(); + bool is_unknown = false; + if (status.ok()) { + pb->set_code(AppStatusPB::OK); + // OK statuses don't have any message or posix code. 
+ return; + } else if (status.IsNotFound()) { + pb->set_code(AppStatusPB::NOT_FOUND); + } else if (status.IsCorruption()) { + pb->set_code(AppStatusPB::CORRUPTION); + } else if (status.IsNotSupported()) { + pb->set_code(AppStatusPB::NOT_SUPPORTED); + } else if (status.IsInvalidArgument()) { + pb->set_code(AppStatusPB::INVALID_ARGUMENT); + } else if (status.IsIOError()) { + pb->set_code(AppStatusPB::IO_ERROR); + } else if (status.IsAlreadyPresent()) { + pb->set_code(AppStatusPB::ALREADY_PRESENT); + } else if (status.IsRuntimeError()) { + pb->set_code(AppStatusPB::RUNTIME_ERROR); + } else if (status.IsNetworkError()) { + pb->set_code(AppStatusPB::NETWORK_ERROR); + } else if (status.IsIllegalState()) { + pb->set_code(AppStatusPB::ILLEGAL_STATE); + } else if (status.IsNotAuthorized()) { + pb->set_code(AppStatusPB::NOT_AUTHORIZED); + } else if (status.IsAborted()) { + pb->set_code(AppStatusPB::ABORTED); + } else if (status.IsRemoteError()) { + pb->set_code(AppStatusPB::REMOTE_ERROR); + } else if (status.IsServiceUnavailable()) { + pb->set_code(AppStatusPB::SERVICE_UNAVAILABLE); + } else if (status.IsTimedOut()) { + pb->set_code(AppStatusPB::TIMED_OUT); + } else if (status.IsUninitialized()) { + pb->set_code(AppStatusPB::UNINITIALIZED); + } else if (status.IsConfigurationError()) { + pb->set_code(AppStatusPB::CONFIGURATION_ERROR); + } else if (status.IsIncomplete()) { + pb->set_code(AppStatusPB::INCOMPLETE); + } else if (status.IsEndOfFile()) { + pb->set_code(AppStatusPB::END_OF_FILE); + } else { + LOG(WARNING) << "Unknown error code translation from internal error " + << status.ToString() << ": sending UNKNOWN_ERROR"; + pb->set_code(AppStatusPB::UNKNOWN_ERROR); + is_unknown = true; + } + if (is_unknown) { + // For unknown status codes, include the original stringified error + // code. 
+ pb->set_message(status.CodeAsString() + ": " + + status.message().ToString()); + } else { + // Otherwise, just encode the message itself, since the other end + // will reconstruct the other parts of the ToString() response. + pb->set_message(status.message().ToString()); + } + if (status.posix_code() != -1) { + pb->set_posix_code(status.posix_code()); + } +} + +Status StatusFromPB(const AppStatusPB& pb) { + int posix_code = pb.has_posix_code() ? pb.posix_code() : -1; + + switch (pb.code()) { + case AppStatusPB::OK: + return Status::OK(); + case AppStatusPB::NOT_FOUND: + return Status::NotFound(pb.message(), "", posix_code); + case AppStatusPB::CORRUPTION: + return Status::Corruption(pb.message(), "", posix_code); + case AppStatusPB::NOT_SUPPORTED: + return Status::NotSupported(pb.message(), "", posix_code); + case AppStatusPB::INVALID_ARGUMENT: + return Status::InvalidArgument(pb.message(), "", posix_code); + case AppStatusPB::IO_ERROR: + return Status::IOError(pb.message(), "", posix_code); + case AppStatusPB::ALREADY_PRESENT: + return Status::AlreadyPresent(pb.message(), "", posix_code); + case AppStatusPB::RUNTIME_ERROR: + return Status::RuntimeError(pb.message(), "", posix_code); + case AppStatusPB::NETWORK_ERROR: + return Status::NetworkError(pb.message(), "", posix_code); + case AppStatusPB::ILLEGAL_STATE: + return Status::IllegalState(pb.message(), "", posix_code); + case AppStatusPB::NOT_AUTHORIZED: + return Status::NotAuthorized(pb.message(), "", posix_code); + case AppStatusPB::ABORTED: + return Status::Aborted(pb.message(), "", posix_code); + case AppStatusPB::REMOTE_ERROR: + return Status::RemoteError(pb.message(), "", posix_code); + case AppStatusPB::SERVICE_UNAVAILABLE: + return Status::ServiceUnavailable(pb.message(), "", posix_code); + case AppStatusPB::TIMED_OUT: + return Status::TimedOut(pb.message(), "", posix_code); + case AppStatusPB::UNINITIALIZED: + return Status::Uninitialized(pb.message(), "", posix_code); + case 
AppStatusPB::CONFIGURATION_ERROR: + return Status::ConfigurationError(pb.message(), "", posix_code); + case AppStatusPB::INCOMPLETE: + return Status::Incomplete(pb.message(), "", posix_code); + case AppStatusPB::END_OF_FILE: + return Status::EndOfFile(pb.message(), "", posix_code); + case AppStatusPB::UNKNOWN_ERROR: + default: + LOG(WARNING) << "Unknown error code in status: " << pb.ShortDebugString(); + return Status::RuntimeError("(unknown error code)", pb.message(), posix_code); + } +} + +Status HostPortToPB(const HostPort& host_port, HostPortPB* host_port_pb) { + host_port_pb->set_host(host_port.host()); + host_port_pb->set_port(host_port.port()); + return Status::OK(); +} + +Status HostPortFromPB(const HostPortPB& host_port_pb, HostPort* host_port) { + host_port->set_host(host_port_pb.host()); + host_port->set_port(host_port_pb.port()); + return Status::OK(); +} + +Status AddHostPortPBs(const vector& addrs, + RepeatedPtrField* pbs) { + for (const Sockaddr& addr : addrs) { + HostPortPB* pb = pbs->Add(); + if (addr.IsWildcard()) { + RETURN_NOT_OK(GetFQDN(pb->mutable_host())); + } else { + pb->set_host(addr.host()); + } + pb->set_port(addr.port()); + } + return Status::OK(); +} + +Status SchemaToPB(const Schema& schema, SchemaPB *pb, int flags) { + pb->Clear(); + return SchemaToColumnPBs(schema, pb->mutable_columns(), flags); +} + +Status SchemaToPBWithoutIds(const Schema& schema, SchemaPB *pb) { + pb->Clear(); + return SchemaToColumnPBs(schema, pb->mutable_columns(), SCHEMA_PB_WITHOUT_IDS); +} + +Status SchemaFromPB(const SchemaPB& pb, Schema *schema) { + return ColumnPBsToSchema(pb.columns(), schema); +} + +void ColumnSchemaToPB(const ColumnSchema& col_schema, ColumnSchemaPB *pb, int flags) { + pb->Clear(); + pb->set_name(col_schema.name()); + pb->set_type(col_schema.type_info()->type()); + pb->set_is_nullable(col_schema.is_nullable()); + if (!(flags & SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES)) { + pb->set_encoding(col_schema.attributes().encoding); + 
pb->set_compression(col_schema.attributes().compression); + pb->set_cfile_block_size(col_schema.attributes().cfile_block_size); + } + if (col_schema.has_read_default()) { + if (col_schema.type_info()->physical_type() == BINARY) { + const Slice *read_slice = static_cast(col_schema.read_default_value()); + pb->set_read_default_value(read_slice->data(), read_slice->size()); + } else { + const void *read_value = col_schema.read_default_value(); + pb->set_read_default_value(read_value, col_schema.type_info()->size()); + } + } + if (col_schema.has_write_default()) { + if (col_schema.type_info()->physical_type() == BINARY) { + const Slice *write_slice = static_cast(col_schema.write_default_value()); + pb->set_write_default_value(write_slice->data(), write_slice->size()); + } else { + const void *write_value = col_schema.write_default_value(); + pb->set_write_default_value(write_value, col_schema.type_info()->size()); + } + } +} + +ColumnSchema ColumnSchemaFromPB(const ColumnSchemaPB& pb) { + const void *write_default_ptr = nullptr; + const void *read_default_ptr = nullptr; + Slice write_default; + Slice read_default; + const TypeInfo* typeinfo = GetTypeInfo(pb.type()); + if (pb.has_read_default_value()) { + read_default = Slice(pb.read_default_value()); + if (typeinfo->physical_type() == BINARY) { + read_default_ptr = &read_default; + } else { + read_default_ptr = read_default.data(); + } + } + if (pb.has_write_default_value()) { + write_default = Slice(pb.write_default_value()); + if (typeinfo->physical_type() == BINARY) { + write_default_ptr = &write_default; + } else { + write_default_ptr = write_default.data(); + } + } + + ColumnStorageAttributes attributes; + if (pb.has_encoding()) { + attributes.encoding = pb.encoding(); + } + if (pb.has_compression()) { + attributes.compression = pb.compression(); + } + if (pb.has_cfile_block_size()) { + attributes.cfile_block_size = pb.cfile_block_size(); + } + return ColumnSchema(pb.name(), pb.type(), pb.is_nullable(), + 
read_default_ptr, write_default_ptr, + attributes); +} + +Status ColumnPBsToSchema(const RepeatedPtrField& column_pbs, + Schema* schema) { + + vector columns; + vector column_ids; + columns.reserve(column_pbs.size()); + int num_key_columns = 0; + bool is_handling_key = true; + for (const ColumnSchemaPB& pb : column_pbs) { + columns.push_back(ColumnSchemaFromPB(pb)); + if (pb.is_key()) { + if (!is_handling_key) { + return Status::InvalidArgument( + "Got out-of-order key column", pb.ShortDebugString()); + } + num_key_columns++; + } else { + is_handling_key = false; + } + if (pb.has_id()) { + column_ids.push_back(ColumnId(pb.id())); + } + } + + DCHECK_LE(num_key_columns, columns.size()); + + // TODO(perf): could make the following faster by adding a + // Reset() variant which actually takes ownership of the column + // vector. + return schema->Reset(columns, column_ids, num_key_columns); +} + +Status SchemaToColumnPBs(const Schema& schema, + RepeatedPtrField* cols, + int flags) { + cols->Clear(); + int idx = 0; + for (const ColumnSchema& col : schema.columns()) { + ColumnSchemaPB* col_pb = cols->Add(); + ColumnSchemaToPB(col, col_pb); + col_pb->set_is_key(idx < schema.num_key_columns()); + + if (schema.has_column_ids() && !(flags & SCHEMA_PB_WITHOUT_IDS)) { + col_pb->set_id(schema.column_id(idx)); + } + + idx++; + } + return Status::OK(); +} + +// Because we use a faststring here, ASAN tests become unbearably slow +// with the extra verifications. +ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS +Status RewriteRowBlockPointers(const Schema& schema, const RowwiseRowBlockPB& rowblock_pb, + const Slice& indirect_data_slice, Slice* row_data_slice) { + // TODO: cheating here so we can rewrite the request as it arrived and + // change any indirect data pointers back to "real" pointers instead of + // on-the-wire pointers. Maybe the RPC layer should give us a non-const + // request? Maybe we should suck it up and copy the data when we mutate? 
+ + // We don't need a const-cast because we can just use Slice's lack of + // const-safety. + uint8_t* row_data = row_data_slice->mutable_data(); + const uint8_t* indir_data = indirect_data_slice.data(); + size_t row_size = ContiguousRowHelper::row_size(schema); + size_t expected_data_size = rowblock_pb.num_rows() * row_size; + + if (PREDICT_FALSE(row_data_slice->size() != expected_data_size)) { + return Status::Corruption( + StringPrintf("Row block has %zd bytes of data but expected %zd for %" PRIu32 " rows", + row_data_slice->size(), expected_data_size, rowblock_pb.num_rows())); + } + + for (int i = 0; i < schema.num_columns(); i++) { + const ColumnSchema& col = schema.column(i); + if (col.type_info()->physical_type() != BINARY) { + continue; + } + + int row_idx = 0; + size_t offset = 0; + while (offset < row_data_slice->size()) { + ContiguousRow row(&schema, &row_data[offset]); + uint8_t* dst_cell = row.mutable_cell_ptr(i); + + if (!col.is_nullable() || !row.is_null(i)) { + // The pointer is currently an offset into indir_data. Need to replace it + // with the actual pointer into indir_data + Slice *slice = reinterpret_cast(dst_cell); + size_t offset_in_indirect = reinterpret_cast(slice->data()); + + // Ensure the updated pointer is within the bounds of the indirect data. 
+ bool overflowed = false; + size_t max_offset = AddWithOverflowCheck(offset_in_indirect, slice->size(), &overflowed); + if (PREDICT_FALSE(overflowed || max_offset > indirect_data_slice.size())) { + return Status::Corruption( + StringPrintf("Row #%d contained bad indirect slice for column %s: (%zd, %zd)", + row_idx, col.ToString().c_str(), + reinterpret_cast(slice->data()), + slice->size())); + } + *slice = Slice(&indir_data[offset_in_indirect], slice->size()); + } + + // Advance to next row + offset += row_size; + row_idx++; + } + } + + return Status::OK(); +} + +Status ExtractRowsFromRowBlockPB(const Schema& schema, + const RowwiseRowBlockPB& rowblock_pb, + const Slice& indirect_data, + Slice* rows_data, + vector* rows) { + RETURN_NOT_OK(RewriteRowBlockPointers(schema, rowblock_pb, indirect_data, rows_data)); + + int n_rows = rowblock_pb.num_rows(); + if (PREDICT_FALSE(n_rows == 0)) { + // Early-out here to avoid a UBSAN failure. + return Status::OK(); + } + + // Doing this resize and array indexing turns out to be noticeably faster + // than using reserve and push_back. 
+ size_t row_size = ContiguousRowHelper::row_size(schema); + const uint8_t* src = rows_data->data(); + int dst_index = rows->size(); + rows->resize(rows->size() + n_rows); + const uint8_t** dst = &(*rows)[dst_index]; + while (n_rows > 0) { + *dst++ = src; + src += row_size; + n_rows--; + } + + return Status::OK(); +} + +Status FindLeaderHostPort(const RepeatedPtrField& entries, + HostPort* leader_hostport) { + for (const ServerEntryPB& entry : entries) { + if (entry.has_error()) { + LOG(WARNING) << "Error encountered for server entry " << entry.ShortDebugString() + << ": " << StatusFromPB(entry.error()).ToString(); + continue; + } + if (!entry.has_role()) { + return Status::IllegalState( + strings::Substitute("Every server in must have a role, but entry ($0) has no role.", + entry.ShortDebugString())); + } + if (entry.role() == consensus::RaftPeerPB::LEADER) { + return HostPortFromPB(entry.registration().rpc_addresses(0), leader_hostport); + } + } + return Status::NotFound("No leader found."); +} + +template +void AppendRowToString(const RowType& row, string* buf); + +template<> +void AppendRowToString(const ConstContiguousRow& row, string* buf) { + buf->append(reinterpret_cast(row.row_data()), row.row_size()); +} + +template<> +void AppendRowToString(const RowBlockRow& row, string* buf) { + size_t row_size = ContiguousRowHelper::row_size(*row.schema()); + size_t appended_offset = buf->size(); + buf->resize(buf->size() + row_size); + uint8_t* copied_rowdata = reinterpret_cast(&(*buf)[appended_offset]); + ContiguousRow copied_row(row.schema(), copied_rowdata); + CHECK_OK(CopyRow(row, &copied_row, reinterpret_cast(NULL))); +} + +// Copy a column worth of data from the given RowBlock into the output +// protobuf. +// +// IS_NULLABLE: true if the column is nullable +// IS_VARLEN: true if the column is of variable length +// +// These are template parameters rather than normal function arguments +// so that there are fewer branches inside the loop. 
+// +// NOTE: 'dst_schema' must either be NULL or a subset of the specified's +// RowBlock's schema. If not NULL, then column at 'col_idx' in 'block' will +// be copied to column 'dst_col_idx' in the output protobuf; otherwise, +// dst_col_idx must be equal to col_idx. +template +static void CopyColumn(const RowBlock& block, int col_idx, + int dst_col_idx, uint8_t* dst_base, + faststring* indirect_data, const Schema* dst_schema) { + DCHECK_NOTNULL(dst_schema); + ColumnBlock cblock = block.column_block(col_idx); + size_t row_stride = ContiguousRowHelper::row_size(*dst_schema); + uint8_t* dst = dst_base + dst_schema->column_offset(dst_col_idx); + size_t offset_to_null_bitmap = dst_schema->byte_size() - dst_schema->column_offset(dst_col_idx); + + size_t cell_size = cblock.stride(); + const uint8_t* src = cblock.cell_ptr(0); + + BitmapIterator selected_row_iter(block.selection_vector()->bitmap(), + block.nrows()); + int run_size; + bool selected; + int row_idx = 0; + while ((run_size = selected_row_iter.Next(&selected))) { + if (!selected) { + src += run_size * cell_size; + row_idx += run_size; + continue; + } + for (int i = 0; i < run_size; i++) { + if (IS_NULLABLE && cblock.is_null(row_idx)) { + memset(dst, 0, cell_size); + BitmapChange(dst + offset_to_null_bitmap, dst_col_idx, true); + } else if (IS_VARLEN) { + const Slice *slice = reinterpret_cast(src); + size_t offset_in_indirect = indirect_data->size(); + indirect_data->append(reinterpret_cast(slice->data()), + slice->size()); + + Slice *dst_slice = reinterpret_cast(dst); + *dst_slice = Slice(reinterpret_cast(offset_in_indirect), + slice->size()); + if (IS_NULLABLE) { + BitmapChange(dst + offset_to_null_bitmap, dst_col_idx, false); + } + } else { // non-string, non-null + strings::memcpy_inlined(dst, src, cell_size); + if (IS_NULLABLE) { + BitmapChange(dst + offset_to_null_bitmap, dst_col_idx, false); + } + } + dst += row_stride; + src += cell_size; + row_idx++; + } + } +} + +// Because we use a faststring here, 
ASAN tests become unbearably slow +// with the extra verifications. +ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS +void SerializeRowBlock(const RowBlock& block, RowwiseRowBlockPB* rowblock_pb, + const Schema* projection_schema, + faststring* data_buf, faststring* indirect_data) { + DCHECK_GT(block.nrows(), 0); + const Schema& tablet_schema = block.schema(); + + if (projection_schema == nullptr) { + projection_schema = &tablet_schema; + } + + size_t old_size = data_buf->size(); + size_t row_stride = ContiguousRowHelper::row_size(*projection_schema); + int num_rows = block.selection_vector()->CountSelected(); + data_buf->resize(old_size + row_stride * num_rows); + uint8_t* base = reinterpret_cast(&(*data_buf)[old_size]); + + size_t proj_schema_idx = 0; + for (int t_schema_idx = 0; t_schema_idx < tablet_schema.num_columns(); t_schema_idx++) { + const ColumnSchema& col = tablet_schema.column(t_schema_idx); + proj_schema_idx = projection_schema->find_column(col.name()); + if (proj_schema_idx == -1) { + continue; + } + + // Generating different functions for each of these cases makes them much less + // branch-heavy -- we do the branch once outside the loop, and then have a + // compiled version for each combination below. + // TODO: Using LLVM to build a specialized CopyColumn on the fly should have + // even bigger gains, since we could inline the constant cell sizes and column + // offsets. 
+ if (col.is_nullable() && col.type_info()->physical_type() == BINARY) { + CopyColumn(block, t_schema_idx, proj_schema_idx, base, indirect_data, + projection_schema); + } else if (col.is_nullable() && col.type_info()->physical_type() != BINARY) { + CopyColumn(block, t_schema_idx, proj_schema_idx, base, indirect_data, + projection_schema); + } else if (!col.is_nullable() && col.type_info()->physical_type() == BINARY) { + CopyColumn(block, t_schema_idx, proj_schema_idx, base, indirect_data, + projection_schema); + } else if (!col.is_nullable() && col.type_info()->physical_type() != BINARY) { + CopyColumn(block, t_schema_idx, proj_schema_idx, base, indirect_data, + projection_schema); + } else { + LOG(FATAL) << "cannot reach here"; + } + } + rowblock_pb->set_num_rows(rowblock_pb->num_rows() + num_rows); +} + +} // namespace kudu diff --git a/src/kudu/common/wire_protocol.h b/src/kudu/common/wire_protocol.h new file mode 100644 index 000000000000..859bf4a89a97 --- /dev/null +++ b/src/kudu/common/wire_protocol.h @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Helpers for dealing with the protobufs defined in wire_protocol.proto. 
+#ifndef KUDU_COMMON_WIRE_PROTOCOL_H +#define KUDU_COMMON_WIRE_PROTOCOL_H + +#include + +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/util/status.h" + +namespace kudu { + +class ConstContiguousRow; +class ColumnSchema; +class faststring; +class HostPort; +class RowBlock; +class RowBlockRow; +class RowChangeList; +class Schema; +class Slice; +class Sockaddr; + +// Convert the given C++ Status object into the equivalent Protobuf. +void StatusToPB(const Status& status, AppStatusPB* pb); + +// Convert the given protobuf into the equivalent C++ Status object. +Status StatusFromPB(const AppStatusPB& pb); + +// Convert the specified HostPort to protobuf. +Status HostPortToPB(const HostPort& host_port, HostPortPB* host_port_pb); + +// Returns the HostPort created from the specified protobuf. +Status HostPortFromPB(const HostPortPB& host_port_pb, HostPort* host_port); + +// Adds addresses in 'addrs' to 'pbs'. If an address is a wildcard +// (e.g., "0.0.0.0"), then the local machine's hostname is used in +// its place. +Status AddHostPortPBs(const std::vector& addrs, + google::protobuf::RepeatedPtrField* pbs); + +enum SchemaPBConversionFlags { + SCHEMA_PB_WITHOUT_IDS = 1 << 0, + SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES = 1 << 1, +}; + +// Convert the specified schema to protobuf. +// 'flags' is a bitfield of SchemaPBConversionFlags values. +Status SchemaToPB(const Schema& schema, SchemaPB* pb, int flags = 0); + +// Convert the specified schema to protobuf without column IDs. +Status SchemaToPBWithoutIds(const Schema& schema, SchemaPB *pb); + +// Returns the Schema created from the specified protobuf. +// If the schema is invalid, return a non-OK status. +Status SchemaFromPB(const SchemaPB& pb, Schema *schema); + +// Convert the specified column schema to protobuf. +// 'flags' is a bitfield of SchemaPBConversionFlags values. 
+void ColumnSchemaToPB(const ColumnSchema& schema, ColumnSchemaPB *pb, int flags = 0); + +// Return the ColumnSchema created from the specified protobuf. +ColumnSchema ColumnSchemaFromPB(const ColumnSchemaPB& pb); + +// Convert the given list of ColumnSchemaPB objects into a Schema object. +// +// Returns InvalidArgument if the provided columns don't make a valid Schema +// (eg if the keys are non-contiguous or nullable). +Status ColumnPBsToSchema( + const google::protobuf::RepeatedPtrField& column_pbs, + Schema* schema); + +// Extract the columns of the given Schema into protobuf objects. +// +// The 'cols' list is replaced by this method. +// 'flags' is a bitfield of SchemaPBConversionFlags values. +Status SchemaToColumnPBs( + const Schema& schema, + google::protobuf::RepeatedPtrField* cols, + int flags = 0); + +// Encode the given row block into the provided protobuf and data buffers. +// +// All data (both direct and indirect) for each selected row in the RowBlock is +// copied into the protobuf and faststrings. +// The original data may be destroyed safely after this returns. +// +// This only converts those rows whose selection vector entry is true. +// If 'client_projection_schema' is not NULL, then only columns specified in +// 'client_projection_schema' will be projected to 'data_buf'. +// +// Requires that block.nrows() > 0 +void SerializeRowBlock(const RowBlock& block, RowwiseRowBlockPB* rowblock_pb, + const Schema* client_projection_schema, + faststring* data_buf, faststring* indirect_data); + +// Rewrites the data pointed-to by row data slice 'row_data_slice' by replacing +// relative indirect data pointers with absolute ones in 'indirect_data_slice'. +// At the time of this writing, this rewriting is only done for STRING types. +// +// Returns a bad Status if the provided data is invalid or corrupt. 
+Status RewriteRowBlockPointers(const Schema& schema, const RowwiseRowBlockPB& rowblock_pb, + const Slice& indirect_data_slice, Slice* row_data_slice); + +// Extract the rows stored in this protobuf, which must have exactly the +// given Schema. This Schema may be obtained using ColumnPBsToSchema. +// +// Pointers are added to 'rows' for each of the extracted rows. These +// pointers are suitable for constructing ConstContiguousRow objects. +// TODO: would be nice to just return a vector, but +// they're not currently copyable, so this can't be done. +// +// Note that the returned rows refer to memory managed by 'rows_data' and +// 'indirect_data'. This is also the reason that 'rows_data' is a non-const pointer +// argument: the internal data is mutated in-place to restore the validity of +// indirect data pointers, which are relative on the wire but must be absolute +// while in-memory. +// +// Returns a bad Status if the provided data is invalid or corrupt. +Status ExtractRowsFromRowBlockPB(const Schema& schema, + const RowwiseRowBlockPB& rowblock_pb, + const Slice& indirect_data, + Slice* rows_data, + std::vector* rows); + +// Set 'leader_hostport' to the host/port of the leader server if one +// can be found in 'entries'. +// +// Returns Status::NotFound if no leader is found. +Status FindLeaderHostPort(const google::protobuf::RepeatedPtrField& entries, + HostPort* leader_hostport); + +} // namespace kudu +#endif diff --git a/src/kudu/common/wire_protocol.proto b/src/kudu/common/wire_protocol.proto new file mode 100644 index 000000000000..7f79475e96b6 --- /dev/null +++ b/src/kudu/common/wire_protocol.proto @@ -0,0 +1,171 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Protobufs used by both client-server and server-server traffic +// for user data transfer. This file should only contain protobufs +// which are exclusively used on the wire. If a protobuf is persisted on +// disk and not used as part of the wire protocol, it belongs in another +// place such as common/common.proto or within cfile/, server/, etc. +package kudu; + +option java_package = "org.kududb"; + +import "kudu/common/common.proto"; +import "kudu/consensus/metadata.proto"; + +// Error status returned by any RPC method. +// Every RPC method which could generate an application-level error +// should have this (or a more complex error result) as an optional field +// in its response. +// +// This maps to kudu::Status in C++. +message AppStatusPB { + enum ErrorCode { + UNKNOWN_ERROR = 999; + OK = 0; + NOT_FOUND = 1; + CORRUPTION = 2; + NOT_SUPPORTED = 3; + INVALID_ARGUMENT = 4; + IO_ERROR = 5; + ALREADY_PRESENT = 6; + RUNTIME_ERROR = 7; + NETWORK_ERROR = 8; + ILLEGAL_STATE = 9; + NOT_AUTHORIZED = 10; + ABORTED = 11; + REMOTE_ERROR = 12; + SERVICE_UNAVAILABLE = 13; + TIMED_OUT = 14; + UNINITIALIZED = 15; + CONFIGURATION_ERROR = 16; + INCOMPLETE = 17; + END_OF_FILE = 18; + } + + required ErrorCode code = 1; + optional string message = 2; + optional int32 posix_code = 4; +} + +// Uniquely identify a particular instance of a particular server in the +// cluster. 
+message NodeInstancePB { + // Unique ID which is created when the server is first started + // up. This is stored persistently on disk. + required bytes permanent_uuid = 1; + + // Sequence number incremented on every start-up of the server. + // This makes it easy to detect when an instance has restarted (and + // thus can be assumed to have forgotten any soft state it had in + // memory). + // + // On a freshly initialized server, the first sequence number + // should be 0. + required int64 instance_seqno = 2; +} + +// RPC and HTTP addresses for each server. +message ServerRegistrationPB { + repeated HostPortPB rpc_addresses = 1; + repeated HostPortPB http_addresses = 2; +} + +message ServerEntryPB { + // If there is an error communicating with the server (or retrieving + // the server registration on the server itself), this field will be + // set to contain the error. + // + // All subsequent fields are optional, as they may not be set if + // an error is encountered communicating with the individual server. + optional AppStatusPB error = 1; + + optional NodeInstancePB instance_id = 2; + optional ServerRegistrationPB registration = 3; + + // If an error has occured earlier in the RPC call, the role + // may be not be set. + optional consensus.RaftPeerPB.Role role = 4; +} + +// A row block in which each row is stored contiguously. +message RowwiseRowBlockPB { + // The number of rows in the block. This can typically be calculated + // by dividing rows.size() by the width of the row, but in the case that + // the client is scanning an empty projection (i.e a COUNT(*)), this + // field is the only way to determine how many rows were returned. + optional int32 num_rows = 1 [ default = 0 ]; + + // Sidecar index for the row data. + // + // In the sidecar, each row is stored in the same in-memory format + // as kudu::ContiguousRow (i.e the raw unencoded data followed by + // a null bitmap). 
+ // + // The data for NULL cells will be present with undefined contents -- + // typically it will be filled with \x00s but this is not guaranteed, + // and clients may choose to initialize NULL cells with whatever they + // like. Setting to some constant improves RPC compression, though. + // + // Any pointers are made relative to the beginning of the indirect + // data sidecar. + // + // See rpc/rpc_sidecar.h for more information on where the data is + // actually stored. + optional int32 rows_sidecar = 2; + + // Sidecar index for the indirect data. + // + // In the sidecar, "indirect" data types in the block are stored + // contiguously. For example, STRING values in the block will be + // stored using the normal Slice in-memory format, except that + // instead of being pointers in RAM, the pointer portion will be an + // offset into this protobuf field. + optional int32 indirect_data_sidecar = 3; +} + +// A set of operations (INSERT, UPDATE, or DELETE) to apply to a table. +message RowOperationsPB { + enum Type { + UNKNOWN = 0; + INSERT = 1; + UPDATE = 2; + DELETE = 3; + // Used when specifying split rows on table creation. + SPLIT_ROW = 4; + } + + // The row data for each operation is stored in the following format: + // + // [operation type] (one byte): + // A single-byte field which determines the type of operation. The values are + // based on the 'Type' enum above. + // [column isset bitmap] (one bit for each column in the Schema, rounded to nearest byte) + // A set bit in this bitmap indicates that the user has specified the given column + // in the row. This indicates that the column will be present in the data to follow. + // [null bitmap] (one bit for each Schema column, rounded to nearest byte) + // A set bit in this bitmap indicates that the given column is NULL. + // This is only present if there are any nullable columns. + // [column data] + // For each column which is set and not NULL, the column's data follows. 
The data + // format of each cell is the canonical in-memory format (eg little endian). + // For string data, the pointers are relative to 'indirect_data'. + // + // The rows are concatenated end-to-end with no padding/alignment. + optional bytes rows = 2; + optional bytes indirect_data = 3; +} diff --git a/src/kudu/consensus/CMakeLists.txt b/src/kudu/consensus/CMakeLists.txt new file mode 100644 index 000000000000..2faaf628988e --- /dev/null +++ b/src/kudu/consensus/CMakeLists.txt @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +######################################### +# consensus_metadata_proto +######################################### + +PROTOBUF_GENERATE_CPP( + METADATA_PROTO_SRCS METADATA_PROTO_HDRS METADATA_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES + metadata.proto + opid.proto) +set(METADATA_PROTO_LIBS + kudu_common_proto + fs_proto + protobuf) +ADD_EXPORTABLE_LIBRARY(consensus_metadata_proto + SRCS ${METADATA_PROTO_SRCS} + DEPS ${METADATA_PROTO_LIBS} + NONLINK_DEPS ${METADATA_PROTO_TGTS}) + +######################################### +# consensus_proto +######################################### + +KRPC_GENERATE( + CONSENSUS_KRPC_SRCS CONSENSUS_KRPC_HDRS CONSENSUS_KRPC_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES consensus.proto) +list(APPEND CONSENSUS_KRPC_SRCS opid_util.cc) +set(CONSENSUS_KRPC_LIBS + cfile_proto + consensus_metadata_proto + krpc + kudu_common_proto + rpc_header_proto + protobuf + tablet_proto + tserver_admin_proto + wire_protocol_proto) + +ADD_EXPORTABLE_LIBRARY(consensus_proto + SRCS ${CONSENSUS_KRPC_SRCS} + DEPS ${CONSENSUS_KRPC_LIBS} + NONLINK_DEPS ${CONSENSUS_KRPC_TGTS}) + +######################################### +# log_proto +######################################### + +PROTOBUF_GENERATE_CPP( + LOG_PROTO_SRCS LOG_PROTO_HDRS LOG_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES log.proto) + +add_library(log_proto ${LOG_PROTO_SRCS} ${LOG_PROTO_HDRS}) +target_link_libraries(log_proto + consensus_proto) + +set(LOG_SRCS + log_util.cc + log.cc + log_anchor_registry.cc + log_index.cc + log_reader.cc + log_metrics.cc +) + +add_library(log ${LOG_SRCS}) +target_link_libraries(log + server_common + gutil + kudu_common + kudu_fs + consensus_proto + log_proto + consensus_metadata_proto) + +set(CONSENSUS_SRCS + consensus.cc + consensus_meta.cc + consensus_peers.cc + consensus_queue.cc + leader_election.cc + local_consensus.cc + log_cache.cc + peer_manager.cc + quorum_util.cc + raft_consensus.cc + raft_consensus_state.cc +) + +add_library(consensus ${CONSENSUS_SRCS}) +target_link_libraries(consensus + consensus_proto + kudu_common + log + protobuf) + +set(KUDU_TEST_LINK_LIBS + log + consensus + tserver + cfile + tablet + kudu_util + ${KUDU_MIN_TEST_LIBS} +) + +ADD_KUDU_TEST(consensus_meta-test) +ADD_KUDU_TEST(consensus_peers-test) +ADD_KUDU_TEST(consensus_queue-test) +ADD_KUDU_TEST(leader_election-test) +ADD_KUDU_TEST(log-test) +ADD_KUDU_TEST(log_anchor_registry-test) +ADD_KUDU_TEST(log_cache-test) +ADD_KUDU_TEST(log_index-test) +ADD_KUDU_TEST(mt-log-test) +ADD_KUDU_TEST(quorum_util-test) +ADD_KUDU_TEST(raft_consensus_quorum-test) +ADD_KUDU_TEST(raft_consensus_state-test) +ADD_KUDU_TEST(raft_consensus-test) + +#Tools +add_executable(log-dump log-dump.cc) +target_link_libraries(log-dump + log + ${KUDU_BASE_LIBS}) diff --git a/src/kudu/consensus/README b/src/kudu/consensus/README new file mode 100644 index 000000000000..b301a72f6cf3 --- /dev/null +++ b/src/kudu/consensus/README @@ -0,0 +1,280 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This document introduces how Kudu will handle log replication and consistency +using an algorithm known as Viewstamped Replication (VS) and a series of +practical algorithms/techniques for recovery, reconfiguration, compactions etc. +This document introduces all the concepts directly related to Kudu, for any +missing information please refer to the original papers [1,3,4]. + +Quorums, in Kudu, are a set of collaborating processes that serve the purpose +of keeping a consistent, replicated log of operations on a given data set, e.g. +a tablet. This replicated consistent log, also plays the role of the Write +Ahead Log (WAL) for the tablet. Throughout this document we use config +participant and process interchangeably, these do not represent machines or OS +processes, as machines and or application daemons will participate in multiple +configs. + +============================================================ +The write ahead log (WAL) +============================================================ + +The WAL provides strict ordering and durability guarantees: + +1) If calls to Reserve() are externally synchronized, the order in +which entries had been reserved will be the order in which they will +be committed to disk. + +2) If fsync is enabled (via the 'log_force_fsync_all' flag -- see +log_util.cc; note: this is _DISABLED_ by default), then every single +transaction is guaranteed to be synchronized to disk before its +execution is deemed successful. 
+ +Log uses group commit to increase performance primarily by allowing +throughput to scale with the number of writer threads while +maintaining close to constant latency. + +============================================================ +Basic WAL usage +============================================================ + +To add operations to the log, the caller must obtain the lock, and +call Reserve() with a collection of operations and pointer to the +reserved entry (the latter being an out parameter). Then, the caller +may release the lock and call the AsyncAppend() method with the +reserved entry and a callback that will be invoked upon completion of +the append. AsyncAppend method performs serialization and copying +outside of the lock. + +For sample usage see local_consensus.cc and mt-log-test.cc. + +============================================================= +Group commit implementation details +============================================================= + +Currently, the group implementation uses a blocking queue (see +Log::entry_queue_ in log.h) and a separate long-running thread (see +Log::AppendThread in log.cc). Since access to the queue is +synchronized via a lock and only a single thread removes the queue, +the order in which the elements are added to the queue will be the +same as the order in which the elements are removed from the queue. + +The size of the queue is currently based on the number of entries, but +this will eventually be changed to be based on size of all queued +entries in bytes. + +============================================================= +Reserving a slot for the entry +============================================================= + +Currently Reserve() allocates memory for a new entry on the heap each +time, marks the entry internally as "reserved" via a state enum, and +adds it to the above-mentioned queue. 
In the future, a ring-buffer or +another similar data structure could be used that would take the place +of the queue and make allocation unnecessary. + +============================================================ +Copying the entry contents to the reserved slot +============================================================ + +AsyncAppend() serializes the contents of the entry to a buffer field +in the entry object (currently the buffer is allocated at the same +time as the entry itself); this avoids contention that would occur if +a shared buffer was to be used. + +============================================================ +Synchronizing the entry contents to disk +============================================================ + +A separate appender thread waits until entries are added to the +queue. Once the queue is no longer empty, the thread grabs all +elements on the queue. Then for each dequeued entry, the appender +waits until the entry is marked ready (see "Copying the entry contents +to the reserved slot" above) and then appends the entry to the current +log segment without synchronizing the underlying file with filesystem +(env::WritableFile::Append()) + +Note: this could be further optimized by calling AppendVector() with a +vector of buffers from all of the consumed entries. + +Once all entries are successfully appended, the appender thread syncs +the file to disk (env::WritableFile::Sync()) and (again) waits until +more entries are added to the queue, or until the queue or the +appender thread are shut down. + +============================================================ +Log segment files and asynchronous preallocation +============================================================ + +Log uses PosixWritableFile() for underlying storage. 
If preallocation +is enabled ('--log_preallocate_segments' flag, defined in log_util.cc, +true by default), then whenever a new segment is created, the +underlying file is preallocated to a certain size in megabytes +('--log_segment_size_mb', defined in log_util.cc, default 64). While +the offset in the segment file is below the preallocated length, +the cheaper fdatasync() operation is used instead of fsync(). + +When the size of the current segment exceeds the preallocated size, a +task is launched in a separate thread that begins preallocating the +underlying file for the new log segment; meanwhile, until the task +finishes, appends still go to the existing file. Once the new file is +preallocated, it is renamed to the correct name for the next segment +and is swapped in place of the current segment. + +When the current segment is closed without reaching the preallocated +size, the underlying file is truncated to the last written offset +(i.e., the actual size). + +============================================================ +Quorums and roles within configs +============================================================ + +A config in Kudu is a fault-tolerant, consistent unit that serves requests for +a single tablet. As long as there are 2f+1 participants available in a config, +where f is the number of possibly faulty participants, the config will keep +serving requests for its tablet and it is guaranteed that clients perceive a +fully consistent, linearizable view of both data and operations on that data. +The f parameter, defined table wide through configuration implicitly +defines the size of the config, f=0 indicates a single node config, f=1 +indicates a 3 node config, f=2 indicates a 5 node config, etc.. Quorums may +overlap in the sense that each physical machine may be participating in +multiple configs, usually one per each tablet that it serves. + +Within a single config, in steady state, i.e. 
when no peer is faulty, there +are two main types of peers. The leader peer and the follower peers. +The leader peer dictates the serialization of the operations throughout the +config, its version of the sequence of data altering requests is the "truth" +and any data altering request is only considered final (i.e. can be +acknowledged to the client as successful) when a majority of the config +acknowledges that they "agree" with the leader's view of the event order. +In practice this means that all write requests are sent directly to the +leader, which then replicates them to a majority of the followers before +sending an ACK to the client. Follower peers are completely passive in +steady state, only receiving data from the leader and acknowledging back. +Follower peers only become active when the leader process stops and one +of the followers (if there are any) must be elected leader. + +Participants in a config may be assigned the following roles: + +LEADER - The current leader of the config, receives requests from clients +and serializes them to other nodes. + +FOLLOWER - Active participants in the config, whose votes count towards +majority, replication count etc. + +LEARNER - Passive participants in the config, whose votes do not count +towards majority or replication count. New nodes joining the config +will have this role until they catch up and can be promoted to FOLLOWER. + +NON_PARTICIPANT - A peer that does not participate in a particular +config. Mostly used to mark prior participants that stopped being so +on a configuration change. + +The following diagram illustrates the possible state changes: + + +------------+ + | NON_PART +---+ + +-----+------+ | + Exist. RaftConfig? | | + +-----v------+ | + | LEARNER + | New RaftConfig? + +-----+------+ | + | | + +-----v------+ | + +-->+ FOLLOW. 
+<--+ + | +-----+------+ + | | + | +-----v------+ + Step Down +<--+ CANDIDATE | + ^ +-----+------+ + | | + | +-----v------+ + +<--+ LEADER | + +------------+ + +Additionally all states can transition to NON_PARTICIPANT, on configuration +changes and/or peer timeout/death. + +============================================================ +Assembling/Rebooting a RaftConfig and RaftConfig States +============================================================ + +Prior to starting/rebooting a peer, the state in WAL must have been replayed +in a bootstrap phase. This process will yield an up-to-date Log and Tablet. +The new/rebooting peer is then Init()'ed with this Log. The Log is queried +for the last committed configuration entry (A Raft configuration consists of +a set of peers (uuid and last known address) and hinted* roles). If there is +none, it means this is a new config. + +After the peer has been Init()'ed, Start(Configuration) is called. The provided +configuration is a hint which is only taken into account if there was no previous +configuration*. + +Independently of whether the configuration is a new one (new config) +or an old one (rebooting config), the config cannot start until a +leader has been elected and replicates the configuration through +consensus. This ensures that a majority of nodes agree that this is +the most recent configuration. + +The provided configuration will always specify a leader -- in the case +of a new config, it is chosen by the master, and in the case of a +rebooted one, it is the configuration that was active before the node +crashed. In either case, replicating this initial configuration +entry happens in the exact same way as any other config entry, +i.e. the LEADER will try and replicate it to FOLLOWERS. As usual if +the LEADER fails, leader election is triggered and the new LEADER will +try to replicate a new configuration. 
+ +Only after the config has successfully replicated the initial configuration +entry is the config ready to accept writes. + + +Peers in the config can therefore be in the following states: + +BOOTSTRAPPING - The phase prior to initialization where the Log is being +replayed. If a majority of peers are still BOOTSTRAPPING, the config doesn't +exist yet. + +CONFIGURING: Until the current configuration is pushed through consensus. This +is true for both new configs and rebooting configs. The peers do not accept +client requests in this state. In this state, the Leader tries to replicate +the configuration. Followers run failure detection and trigger leader election +if the hinted leader doesn't successfully replicate within the configured +timeout period. + +RUNNING: The LEADER peer accepts writes and replicates them through consensus. +FOLLOWER replicas accept writes from the leader and ACK. + +* The configuration provided on Start() can only be taken into account if there +is an appropriate leader election algorithm. This can be added later but is not +present in the initial implementation. Roles are hinted in the sense that the +config initiator (usually the master) might hint what the roles for the peers +in the config should be, but the config is the ultimate decider on whether that +is possible or not. + +============================================================ +References +============================================================ +[1] http://ramcloud.stanford.edu/raft.pdf + +[2] http://www.cs.berkeley.edu/~brewer/cs262/Aries.pdf + +[3] Viewstamped Replication: A New Primary Copy Method to Support +Highly-Available Distributed Systems. B. Oki, B. Liskov +http://www.pmg.csail.mit.edu/papers/vr.pdf + +[4] Viewstamped Replication Revisited. B. Liskov and J. 
Cowling +http://pmg.csail.mit.edu/papers/vr-revisited.pdf + +[5] Aether: A Scalable Approach to logging +http://infoscience.epfl.ch/record/149436/files/vldb10aether.pdf diff --git a/src/kudu/consensus/consensus-test-util.h b/src/kudu/consensus/consensus-test-util.h new file mode 100644 index 000000000000..252246b5f597 --- /dev/null +++ b/src/kudu/consensus/consensus-test-util.h @@ -0,0 +1,870 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/timestamp.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/consensus_queue.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/raft_consensus.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/clock.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/locks.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/threadpool.h" + +#define TOKENPASTE(x, y) x ## y +#define TOKENPASTE2(x, y) TOKENPASTE(x, y) + +#define ASSERT_OPID_EQ(left, right) \ + OpId TOKENPASTE2(_left, __LINE__) = (left); \ + OpId TOKENPASTE2(_right, __LINE__) = (right); \ + if (!consensus::OpIdEquals(TOKENPASTE2(_left, __LINE__), TOKENPASTE2(_right,__LINE__))) \ + FAIL() << "Expected: " << TOKENPASTE2(_right,__LINE__).ShortDebugString() << "\n" \ + << "Value: " << TOKENPASTE2(_left,__LINE__).ShortDebugString() << "\n" + +namespace kudu { +namespace consensus { + +using log::Log; +using strings::Substitute; + +static gscoped_ptr CreateDummyReplicate(int term, + int index, + const Timestamp& timestamp, + int payload_size) { + gscoped_ptr msg(new ReplicateMsg); + OpId* id = msg->mutable_id(); + id->set_term(term); + id->set_index(index); + + msg->set_op_type(NO_OP); + msg->mutable_noop_request()->mutable_payload_for_tests()->resize(payload_size); + msg->set_timestamp(timestamp.ToUint64()); + return msg.Pass(); +} + +// Returns RaftPeerPB with given UUID and obviously-fake hostname / port combo. 
+RaftPeerPB FakeRaftPeerPB(const std::string& uuid) { + RaftPeerPB peer_pb; + peer_pb.set_permanent_uuid(uuid); + peer_pb.mutable_last_known_addr()->set_host(Substitute("$0-fake-hostname", CURRENT_TEST_NAME())); + peer_pb.mutable_last_known_addr()->set_port(0); + return peer_pb; +} + +// Appends 'count' messages to 'queue' with different terms and indexes. +// +// An operation will only be considered done (TestOperationStatus::IsDone() +// will become true) once at least 'n_majority' peers have called +// TestOperationStatus::AckPeer(). +static inline void AppendReplicateMessagesToQueue( + PeerMessageQueue* queue, + const scoped_refptr& clock, + int first, + int count, + int payload_size = 0) { + + for (int i = first; i < first + count; i++) { + int term = i / 7; + int index = i; + CHECK_OK(queue->AppendOperation(make_scoped_refptr_replicate( + CreateDummyReplicate(term, index, clock->Now(), payload_size).release()))); + } +} + +// Builds a configuration of 'num' voters. +RaftConfigPB BuildRaftConfigPBForTests(int num) { + RaftConfigPB raft_config; + raft_config.set_local(false); + for (int i = 0; i < num; i++) { + RaftPeerPB* peer_pb = raft_config.add_peers(); + peer_pb->set_member_type(RaftPeerPB::VOTER); + peer_pb->set_permanent_uuid(Substitute("peer-$0", i)); + HostPortPB* hp = peer_pb->mutable_last_known_addr(); + hp->set_host(Substitute("peer-$0.fake-domain-for-tests", i)); + hp->set_port(0); + } + return raft_config; +} + +// Abstract base class to build PeerProxy implementations on top of for testing. +// Provides a single-threaded pool to run callbacks in and callback +// registration/running, along with an enum to identify the supported methods. +class TestPeerProxy : public PeerProxy { + public: + // Which PeerProxy method to invoke. + enum Method { + kUpdate, + kRequestVote, + }; + + explicit TestPeerProxy(ThreadPool* pool) : pool_(pool) {} + + protected: + // Register the RPC callback in order to call later. 
+ // We currently only support one request of each method being in flight at a time. + virtual void RegisterCallback(Method method, const rpc::ResponseCallback& callback) { + boost::lock_guard lock(lock_); + InsertOrDie(&callbacks_, method, callback); + } + + // Answer the peer. + virtual void Respond(Method method) { + rpc::ResponseCallback callback; + { + boost::lock_guard lock(lock_); + callback = FindOrDie(callbacks_, method); + CHECK_EQ(1, callbacks_.erase(method)); + // Drop the lock before submitting to the pool, since the callback itself may + // destroy this instance. + } + CHECK_OK(pool_->SubmitFunc(callback)); + } + + virtual void RegisterCallbackAndRespond(Method method, const rpc::ResponseCallback& callback) { + RegisterCallback(method, callback); + Respond(method); + } + + mutable simple_spinlock lock_; + ThreadPool* pool_; + std::map callbacks_; // Protected by lock_. +}; + +template +class DelayablePeerProxy : public TestPeerProxy { + public: + // Add delayability of RPC responses to the delegated impl. + // This class takes ownership of 'proxy'. + explicit DelayablePeerProxy(ThreadPool* pool, ProxyType* proxy) + : TestPeerProxy(pool), + proxy_(CHECK_NOTNULL(proxy)), + delay_response_(false), + latch_(1) { + } + + // Delay the answer to the next response to this remote + // peer. The response callback will only be called on Respond(). + virtual void DelayResponse() { + lock_guard l(&lock_); + delay_response_ = true; + latch_.Reset(1); // Reset for the next time. + } + + virtual void RespondUnlessDelayed(Method method) { + { + lock_guard l(&lock_); + if (delay_response_) { + latch_.CountDown(); + delay_response_ = false; + return; + } + } + TestPeerProxy::Respond(method); + } + + virtual void Respond(Method method) OVERRIDE { + latch_.Wait(); // Wait until strictly after peer would have responded. 
+ return TestPeerProxy::Respond(method); + } + + virtual void UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + RegisterCallback(kUpdate, callback); + return proxy_->UpdateAsync(request, response, controller, + boost::bind(&DelayablePeerProxy::RespondUnlessDelayed, + this, kUpdate)); + } + + virtual void RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + RegisterCallback(kRequestVote, callback); + return proxy_->RequestConsensusVoteAsync(request, response, controller, + boost::bind(&DelayablePeerProxy::RespondUnlessDelayed, + this, kRequestVote)); + } + + ProxyType* proxy() const { + return proxy_.get(); + } + + protected: + gscoped_ptr const proxy_; + bool delay_response_; // Protected by lock_. + CountDownLatch latch_; +}; + +// Allows complete mocking of a peer's responses. +// You set the response, it will respond with that. 
+class MockedPeerProxy : public TestPeerProxy { + public: + explicit MockedPeerProxy(ThreadPool* pool) + : TestPeerProxy(pool), + update_count_(0) { + } + + virtual void set_update_response(const ConsensusResponsePB& update_response) { + CHECK(update_response.IsInitialized()) << update_response.ShortDebugString(); + { + lock_guard l(&lock_); + update_response_ = update_response; + } + } + + virtual void set_vote_response(const VoteResponsePB& vote_response) { + { + lock_guard l(&lock_); + vote_response_ = vote_response; + } + } + + virtual void UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + { + lock_guard l(&lock_); + update_count_++; + *response = update_response_; + } + return RegisterCallbackAndRespond(kUpdate, callback); + } + + virtual void RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + *response = vote_response_; + return RegisterCallbackAndRespond(kRequestVote, callback); + } + + // Return the number of times that UpdateAsync() has been called. + int update_count() const { + lock_guard l(&lock_); + return update_count_; + } + + protected: + int update_count_; + + ConsensusResponsePB update_response_; + VoteResponsePB vote_response_; +}; + +// Allows to test peers by emulating a noop remote endpoint that just replies +// that the messages were received/replicated/committed. 
+class NoOpTestPeerProxy : public TestPeerProxy { + public: + + explicit NoOpTestPeerProxy(ThreadPool* pool, const consensus::RaftPeerPB& peer_pb) + : TestPeerProxy(pool), peer_pb_(peer_pb) { + last_received_.CopyFrom(MinimumOpId()); + } + + virtual void UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + + response->Clear(); + { + boost::lock_guard lock(lock_); + if (OpIdLessThan(last_received_, request->preceding_id())) { + ConsensusErrorPB* error = response->mutable_status()->mutable_error(); + error->set_code(ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH); + StatusToPB(Status::IllegalState(""), error->mutable_status()); + } else if (request->ops_size() > 0) { + last_received_.CopyFrom(request->ops(request->ops_size() - 1).id()); + } + + response->set_responder_uuid(peer_pb_.permanent_uuid()); + response->set_responder_term(request->caller_term()); + response->mutable_status()->mutable_last_received()->CopyFrom(last_received_); + response->mutable_status()->mutable_last_received_current_leader()->CopyFrom(last_received_); + // We set the last committed index to be the same index as the last received. While + // this is unlikely to happen in a real situation, its not technically incorrect and + // avoids having to come up with some other index that it still correct. 
+ response->mutable_status()->set_last_committed_idx(last_received_.index()); + } + return RegisterCallbackAndRespond(kUpdate, callback); + } + + virtual void RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + { + boost::lock_guard lock(lock_); + response->set_responder_uuid(peer_pb_.permanent_uuid()); + response->set_responder_term(request->candidate_term()); + response->set_vote_granted(true); + } + return RegisterCallbackAndRespond(kRequestVote, callback); + } + + const OpId& last_received() { + boost::lock_guard lock(lock_); + return last_received_; + } + + private: + const consensus::RaftPeerPB peer_pb_; + ConsensusStatusPB last_status_; // Protected by lock_. + OpId last_received_; // Protected by lock_. +}; + +class NoOpTestPeerProxyFactory : public PeerProxyFactory { + public: + NoOpTestPeerProxyFactory() { + CHECK_OK(ThreadPoolBuilder("test-peer-pool").set_max_threads(3).Build(&pool_)); + } + + virtual Status NewProxy(const consensus::RaftPeerPB& peer_pb, + gscoped_ptr* proxy) OVERRIDE { + proxy->reset(new NoOpTestPeerProxy(pool_.get(), peer_pb)); + return Status::OK(); + } + + gscoped_ptr pool_; +}; + +typedef std::unordered_map > TestPeerMap; + +// Thread-safe manager for list of peers being used in tests. 
+class TestPeerMapManager { + public: + explicit TestPeerMapManager(const RaftConfigPB& config) : config_(config) {} + + void AddPeer(const std::string& peer_uuid, const scoped_refptr& peer) { + boost::lock_guard lock(lock_); + InsertOrDie(&peers_, peer_uuid, peer); + } + + Status GetPeerByIdx(int idx, scoped_refptr* peer_out) const { + CHECK_LT(idx, config_.peers_size()); + return GetPeerByUuid(config_.peers(idx).permanent_uuid(), peer_out); + } + + Status GetPeerByUuid(const std::string& peer_uuid, + scoped_refptr* peer_out) const { + boost::lock_guard lock(lock_); + if (!FindCopy(peers_, peer_uuid, peer_out)) { + return Status::NotFound("Other consensus instance was destroyed"); + } + return Status::OK(); + } + + void RemovePeer(const std::string& peer_uuid) { + boost::lock_guard lock(lock_); + peers_.erase(peer_uuid); + } + + TestPeerMap GetPeerMapCopy() const { + boost::lock_guard lock(lock_); + return peers_; + } + + void Clear() { + // We create a copy of the peers before we clear 'peers_' so that there's + // still a reference to each peer. If we reduce the reference count to 0 under + // the lock we might get a deadlock as on shutdown consensus indirectly + // destroys the test proxies which in turn reach into this class. + TestPeerMap copy = peers_; + { + boost::lock_guard lock(lock_); + peers_.clear(); + } + + } + + private: + const RaftConfigPB config_; + TestPeerMap peers_; + mutable simple_spinlock lock_; +}; + + +// Allows to test remote peers by emulating an RPC. +// Both the "remote" peer's RPC call and the caller peer's response are executed +// asynchronously in a ThreadPool. 
+class LocalTestPeerProxy : public TestPeerProxy { + public: + LocalTestPeerProxy(std::string peer_uuid, ThreadPool* pool, + TestPeerMapManager* peers) + : TestPeerProxy(pool), + peer_uuid_(std::move(peer_uuid)), + peers_(peers), + miss_comm_(false) {} + + virtual void UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + RegisterCallback(kUpdate, callback); + CHECK_OK(pool_->SubmitFunc(boost::bind(&LocalTestPeerProxy::SendUpdateRequest, + this, request, response))); + } + + virtual void RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE { + RegisterCallback(kRequestVote, callback); + CHECK_OK(pool_->SubmitFunc(boost::bind(&LocalTestPeerProxy::SendVoteRequest, + this, request, response))); + } + + template + void SetResponseError(const Status& status, Response* response) { + tserver::TabletServerErrorPB* error = response->mutable_error(); + error->set_code(tserver::TabletServerErrorPB::UNKNOWN_ERROR); + StatusToPB(status, error->mutable_status()); + } + + template + void RespondOrMissResponse(Request* request, + const Response& response_temp, + Response* final_response, + Method method) { + + bool miss_comm_copy; + { + boost::lock_guard lock(lock_); + miss_comm_copy = miss_comm_; + miss_comm_ = false; + } + if (PREDICT_FALSE(miss_comm_copy)) { + VLOG(2) << this << ": injecting fault on " << request->ShortDebugString(); + SetResponseError(Status::IOError("Artificial error caused by communication " + "failure injection."), final_response); + } else { + final_response->CopyFrom(response_temp); + } + Respond(method); + } + + void SendUpdateRequest(const ConsensusRequestPB* request, + ConsensusResponsePB* response) { + // Copy the request and the response for the other peer so that ownership + // remains as close to the dist. impl. 
as possible. + ConsensusRequestPB other_peer_req; + other_peer_req.CopyFrom(*request); + + // Give the other peer a clean response object to write to. + ConsensusResponsePB other_peer_resp; + scoped_refptr peer; + Status s = peers_->GetPeerByUuid(peer_uuid_, &peer); + + if (s.ok()) { + s = peer->Update(&other_peer_req, &other_peer_resp); + if (s.ok() && !other_peer_resp.has_error()) { + CHECK(other_peer_resp.has_status()); + CHECK(other_peer_resp.status().IsInitialized()); + } + } + if (!s.ok()) { + LOG(WARNING) << "Could not Update replica with request: " + << other_peer_req.ShortDebugString() + << " Status: " << s.ToString(); + SetResponseError(s, &other_peer_resp); + } + + response->CopyFrom(other_peer_resp); + RespondOrMissResponse(request, other_peer_resp, response, kUpdate); + } + + + + void SendVoteRequest(const VoteRequestPB* request, + VoteResponsePB* response) { + + // Copy the request and the response for the other peer so that ownership + // remains as close to the dist. impl. as possible. 
+ VoteRequestPB other_peer_req; + other_peer_req.CopyFrom(*request); + VoteResponsePB other_peer_resp; + other_peer_resp.CopyFrom(*response); + + scoped_refptr peer; + Status s = peers_->GetPeerByUuid(peer_uuid_, &peer); + + if (s.ok()) { + s = peer->RequestVote(&other_peer_req, &other_peer_resp); + } + if (!s.ok()) { + LOG(WARNING) << "Could not RequestVote from replica with request: " + << other_peer_req.ShortDebugString() + << " Status: " << s.ToString(); + SetResponseError(s, &other_peer_resp); + } + + response->CopyFrom(other_peer_resp); + RespondOrMissResponse(request, other_peer_resp, response, kRequestVote); + } + + void InjectCommFaultLeaderSide() { + VLOG(2) << this << ": injecting fault next time"; + boost::lock_guard lock(lock_); + miss_comm_ = true; + } + + const std::string& GetTarget() const { + return peer_uuid_; + } + + private: + const std::string peer_uuid_; + TestPeerMapManager* const peers_; + bool miss_comm_; +}; + +class LocalTestPeerProxyFactory : public PeerProxyFactory { + public: + explicit LocalTestPeerProxyFactory(TestPeerMapManager* peers) + : peers_(peers) { + CHECK_OK(ThreadPoolBuilder("test-peer-pool").set_max_threads(3).Build(&pool_)); + } + + virtual Status NewProxy(const consensus::RaftPeerPB& peer_pb, + gscoped_ptr* proxy) OVERRIDE { + LocalTestPeerProxy* new_proxy = new LocalTestPeerProxy(peer_pb.permanent_uuid(), + pool_.get(), + peers_); + proxy->reset(new_proxy); + proxies_.push_back(new_proxy); + return Status::OK(); + } + + virtual const vector& GetProxies() { + return proxies_; + } + + private: + gscoped_ptr pool_; + TestPeerMapManager* const peers_; + // NOTE: There is no need to delete this on the dctor because proxies are externally managed + vector proxies_; +}; + +// A simple implementation of the transaction driver. +// This is usually implemented by TransactionDriver but here we +// keep the implementation to the minimally required to have consensus +// work. 
+class TestDriver { + public: + TestDriver(ThreadPool* pool, Log* log, const scoped_refptr& round) + : round_(round), + pool_(pool), + log_(log) { + } + + void SetRound(const scoped_refptr& round) { + round_ = round; + } + + // Does nothing but enqueue the Apply + void ReplicationFinished(const Status& status) { + if (status.IsAborted()) { + Cleanup(); + return; + } + CHECK_OK(status); + CHECK_OK(pool_->SubmitFunc(boost::bind(&TestDriver::Apply, this))); + } + + // Called in all modes to delete the transaction and, transitively, the consensus + // round. + void Cleanup() { + delete this; + } + + scoped_refptr round_; + + private: + // The commit message has the exact same type of the replicate message, but + // no content. + void Apply() { + gscoped_ptr msg(new CommitMsg); + msg->set_op_type(round_->replicate_msg()->op_type()); + msg->mutable_commited_op_id()->CopyFrom(round_->id()); + CHECK_OK(log_->AsyncAppendCommit(msg.Pass(), + Bind(&TestDriver::CommitCallback, Unretained(this)))); + } + + void CommitCallback(const Status& s) { + CHECK_OK(s); + Cleanup(); + } + + ThreadPool* pool_; + Log* log_; +}; + +// Fake ReplicaTransactionFactory that allows for instantiating and unit +// testing RaftConsensusState. Does not actually support running transactions. +class MockTransactionFactory : public ReplicaTransactionFactory { + public: + virtual Status StartReplicaTransaction(const scoped_refptr& round) OVERRIDE { + return StartReplicaTransactionMock(round.get()); + } + MOCK_METHOD1(StartReplicaTransactionMock, Status(ConsensusRound* round)); +}; + +// A transaction factory for tests, usually this is implemented by TabletPeer. 
+class TestTransactionFactory : public ReplicaTransactionFactory { + public: + explicit TestTransactionFactory(Log* log) : consensus_(NULL), + log_(log) { + + CHECK_OK(ThreadPoolBuilder("test-txn-factory").set_max_threads(1).Build(&pool_)); + } + + void SetConsensus(Consensus* consensus) { + consensus_ = consensus; + } + + Status StartReplicaTransaction(const scoped_refptr& round) OVERRIDE { + auto txn = new TestDriver(pool_.get(), log_, round); + txn->round_->SetConsensusReplicatedCallback(Bind(&TestDriver::ReplicationFinished, + Unretained(txn))); + return Status::OK(); + } + + void ReplicateAsync(ConsensusRound* round) { + CHECK_OK(consensus_->Replicate(round)); + } + + void WaitDone() { + pool_->Wait(); + } + + void ShutDown() { + WaitDone(); + pool_->Shutdown(); + } + + ~TestTransactionFactory() { + ShutDown(); + } + + private: + gscoped_ptr pool_; + Consensus* consensus_; + Log* log_; +}; + +// Consensus fault hooks impl. that simply counts the number of calls to +// each method. +// Allows passing another hook instance so that we can use both. +// If non-null, the passed hook instance will be called first for all methods. 
+class CounterHooks : public Consensus::ConsensusFaultHooks { + public: + explicit CounterHooks( + std::shared_ptr current_hook) + : current_hook_(std::move(current_hook)), + pre_start_calls_(0), + post_start_calls_(0), + pre_config_change_calls_(0), + post_config_change_calls_(0), + pre_replicate_calls_(0), + post_replicate_calls_(0), + pre_update_calls_(0), + post_update_calls_(0), + pre_shutdown_calls_(0), + post_shutdown_calls_(0) {} + + virtual Status PreStart() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PreStart()); + boost::lock_guard lock(lock_); + pre_start_calls_++; + return Status::OK(); + } + + virtual Status PostStart() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PostStart()); + boost::lock_guard lock(lock_); + post_start_calls_++; + return Status::OK(); + } + + virtual Status PreConfigChange() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PreConfigChange()); + boost::lock_guard lock(lock_); + pre_config_change_calls_++; + return Status::OK(); + } + + virtual Status PostConfigChange() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PostConfigChange()); + boost::lock_guard lock(lock_); + post_config_change_calls_++; + return Status::OK(); + } + + virtual Status PreReplicate() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PreReplicate()); + boost::lock_guard lock(lock_); + pre_replicate_calls_++; + return Status::OK(); + } + + virtual Status PostReplicate() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PostReplicate()); + boost::lock_guard lock(lock_); + post_replicate_calls_++; + return Status::OK(); + } + + virtual Status PreUpdate() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PreUpdate()); + boost::lock_guard lock(lock_); + pre_update_calls_++; + return Status::OK(); + } + + virtual Status PostUpdate() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PostUpdate()); + boost::lock_guard 
lock(lock_); + post_update_calls_++; + return Status::OK(); + } + + virtual Status PreShutdown() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PreShutdown()); + boost::lock_guard lock(lock_); + pre_shutdown_calls_++; + return Status::OK(); + } + + virtual Status PostShutdown() OVERRIDE { + if (current_hook_.get()) RETURN_NOT_OK(current_hook_->PostShutdown()); + boost::lock_guard lock(lock_); + post_shutdown_calls_++; + return Status::OK(); + } + + int num_pre_start_calls() { + boost::lock_guard lock(lock_); + return pre_start_calls_; + } + + int num_post_start_calls() { + boost::lock_guard lock(lock_); + return post_start_calls_; + } + + int num_pre_config_change_calls() { + boost::lock_guard lock(lock_); + return pre_config_change_calls_; + } + + int num_post_config_change_calls() { + boost::lock_guard lock(lock_); + return post_config_change_calls_; + } + + int num_pre_replicate_calls() { + boost::lock_guard lock(lock_); + return pre_replicate_calls_; + } + + int num_post_replicate_calls() { + boost::lock_guard lock(lock_); + return post_replicate_calls_; + } + + int num_pre_update_calls() { + boost::lock_guard lock(lock_); + return pre_update_calls_; + } + + int num_post_update_calls() { + boost::lock_guard lock(lock_); + return post_update_calls_; + } + + int num_pre_shutdown_calls() { + boost::lock_guard lock(lock_); + return pre_shutdown_calls_; + } + + int num_post_shutdown_calls() { + boost::lock_guard lock(lock_); + return post_shutdown_calls_; + } + + private: + std::shared_ptr current_hook_; + int pre_start_calls_; + int post_start_calls_; + int pre_config_change_calls_; + int post_config_change_calls_; + int pre_replicate_calls_; + int post_replicate_calls_; + int pre_update_calls_; + int post_update_calls_; + int pre_shutdown_calls_; + int post_shutdown_calls_; + + // Lock that protects updates to the counters. 
+ mutable simple_spinlock lock_; +}; + +class TestRaftConsensusQueueIface : public PeerMessageQueueObserver { + public: + bool IsMajorityReplicated(int64_t index) { + boost::lock_guard lock(lock_); + return index <= majority_replicated_index_; + } + + protected: + virtual void UpdateMajorityReplicated(const OpId& majority_replicated, + OpId* committed_index) OVERRIDE { + boost::lock_guard lock(lock_); + majority_replicated_index_ = majority_replicated.index(); + committed_index->CopyFrom(majority_replicated); + } + virtual void NotifyTermChange(int64_t term) OVERRIDE {} + virtual void NotifyFailedFollower(const std::string& uuid, + int64_t term, + const std::string& reason) OVERRIDE {} + + private: + mutable simple_spinlock lock_; + int64_t majority_replicated_index_; +}; + +} // namespace consensus +} // namespace kudu + diff --git a/src/kudu/consensus/consensus.cc b/src/kudu/consensus/consensus.cc new file mode 100644 index 000000000000..886367eea792 --- /dev/null +++ b/src/kudu/consensus/consensus.cc @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/consensus.h" + +#include + +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" + +namespace kudu { +namespace consensus { + +using std::shared_ptr; +using strings::Substitute; + +ConsensusBootstrapInfo::ConsensusBootstrapInfo() + : last_id(MinimumOpId()), + last_committed_id(MinimumOpId()) { +} + +ConsensusBootstrapInfo::~ConsensusBootstrapInfo() { + STLDeleteElements(&orphaned_replicates); +} + +ConsensusRound::ConsensusRound(Consensus* consensus, + gscoped_ptr replicate_msg, + ConsensusReplicatedCallback replicated_cb) + : consensus_(consensus), + replicate_msg_(new RefCountedReplicate(replicate_msg.release())), + replicated_cb_(std::move(replicated_cb)), + bound_term_(-1) {} + +ConsensusRound::ConsensusRound(Consensus* consensus, + const ReplicateRefPtr& replicate_msg) + : consensus_(consensus), + replicate_msg_(replicate_msg), + bound_term_(-1) { + DCHECK_NOTNULL(replicate_msg_.get()); +} + +void ConsensusRound::NotifyReplicationFinished(const Status& status) { + if (PREDICT_FALSE(replicated_cb_.is_null())) return; + replicated_cb_.Run(status); +} + +Status ConsensusRound::CheckBoundTerm(int64_t current_term) const { + if (PREDICT_FALSE(bound_term_ != -1 && + bound_term_ != current_term)) { + return Status::Aborted( + strings::Substitute( + "Transaction submitted in term $0 cannot be replicated in term $1", + bound_term_, current_term)); + } + return Status::OK(); +} + +scoped_refptr Consensus::NewRound( + gscoped_ptr replicate_msg, + const ConsensusReplicatedCallback& replicated_cb) { + return make_scoped_refptr(new ConsensusRound(this, replicate_msg.Pass(), replicated_cb)); +} + +void Consensus::SetFaultHooks(const shared_ptr& hooks) { + fault_hooks_ = hooks; +} + +const shared_ptr& Consensus::GetFaultHooks() const { + return fault_hooks_; +} + +Status Consensus::ExecuteHook(HookPoint point) { + if 
(PREDICT_FALSE(fault_hooks_.get() != nullptr)) { + switch (point) { + case Consensus::PRE_START: return fault_hooks_->PreStart(); + case Consensus::POST_START: return fault_hooks_->PostStart(); + case Consensus::PRE_CONFIG_CHANGE: return fault_hooks_->PreConfigChange(); + case Consensus::POST_CONFIG_CHANGE: return fault_hooks_->PostConfigChange(); + case Consensus::PRE_REPLICATE: return fault_hooks_->PreReplicate(); + case Consensus::POST_REPLICATE: return fault_hooks_->PostReplicate(); + case Consensus::PRE_UPDATE: return fault_hooks_->PreUpdate(); + case Consensus::POST_UPDATE: return fault_hooks_->PostUpdate(); + case Consensus::PRE_SHUTDOWN: return fault_hooks_->PreShutdown(); + case Consensus::POST_SHUTDOWN: return fault_hooks_->PostShutdown(); + default: LOG(FATAL) << "Unknown fault hook."; + } + } + return Status::OK(); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/consensus.h b/src/kudu/consensus/consensus.h new file mode 100644 index 000000000000..4b966f4b93ef --- /dev/null +++ b/src/kudu/consensus/consensus.h @@ -0,0 +1,428 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDO_QUORUM_CONSENSUS_H_ +#define KUDO_QUORUM_CONSENSUS_H_ + +#include +#include +#include +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" +#include "kudu/util/status_callback.h" + +namespace kudu { + +namespace log { +class Log; +} + +namespace master { +class SysCatalogTable; +} + +namespace server { +class Clock; +} + +namespace tablet { +class TabletPeer; +} + +namespace tserver { +class TabletServerErrorPB; +} + +namespace consensus { + +class ConsensusCommitContinuation; +class ConsensusRound; +class ReplicaTransactionFactory; + +typedef int64_t ConsensusTerm; + +typedef StatusCallback ConsensusReplicatedCallback; + +struct ConsensusOptions { + std::string tablet_id; +}; + +// After completing bootstrap, some of the results need to be plumbed through +// into the consensus implementation. +struct ConsensusBootstrapInfo { + ConsensusBootstrapInfo(); + ~ConsensusBootstrapInfo(); + + // The id of the last operation in the log + OpId last_id; + + // The id of the last committed operation in the log. + OpId last_committed_id; + + // REPLICATE messages which were in the log with no accompanying + // COMMIT. These need to be passed along to consensus init in order + // to potentially commit them. + // + // These are owned by the ConsensusBootstrapInfo instance. + std::vector orphaned_replicates; + + private: + DISALLOW_COPY_AND_ASSIGN(ConsensusBootstrapInfo); +}; + +// The external interface for a consensus peer. +// +// Note: Even though Consensus points to Log, it needs to be destroyed +// after it. See Log class header comment for the reason why. On the other +// hand Consensus must be quiesced before closing the log, otherwise it +// will try to write to a destroyed/closed log. 
+// +// The order of these operations on shutdown must therefore be: +// 1 - quiesce Consensus +// 2 - close/destroy Log +// 3 - destroy Consensus +class Consensus : public RefCountedThreadSafe { + public: + class ConsensusFaultHooks; + + Consensus() {} + + // Starts running the consensus algorithm. + virtual Status Start(const ConsensusBootstrapInfo& info) = 0; + + // Returns true if consensus is running. + virtual bool IsRunning() const = 0; + + // Emulates a leader election by simply making this peer leader. + virtual Status EmulateElection() = 0; + + // Triggers a leader election. + enum ElectionMode { + // A normal leader election. Peers will not vote for this node + // if they believe that a leader is alive. + NORMAL_ELECTION, + + // In this mode, peers will vote for this candidate even if they + // think a leader is alive. This can be used for a faster hand-off + // between a leader and one of its replicas. + ELECT_EVEN_IF_LEADER_IS_ALIVE + }; + virtual Status StartElection(ElectionMode mode) = 0; + + // Implement a LeaderStepDown() request. + virtual Status StepDown(LeaderStepDownResponsePB* resp) { + return Status::NotSupported("Not implemented."); + } + + // Creates a new ConsensusRound, the entity that owns all the data + // structures required for a consensus round, such as the ReplicateMsg + // (and later on the CommitMsg). ConsensusRound will also point to and + // increase the reference count for the provided callbacks. + scoped_refptr NewRound( + gscoped_ptr replicate_msg, + const ConsensusReplicatedCallback& replicated_cb); + + // Called by a Leader to replicate an entry to the state machine. 
+ // + // From the leader instance perspective execution proceeds as follows: + // + // Leader RaftConfig + // + + + // 1) Req->| Replicate() | + // | | + // 2) +-------------replicate-------------->| + // |<---------------ACK------------------+ + // | | + // 3) +--+ | + // <----+ round.NotifyReplicationFinished()| + // | | + // 3a) | +------ update commitIndex ------->| + // | | + // + // 1) Caller calls Replicate(), method returns immediately to the caller and + // runs asynchronously. + // + // 2) Leader replicates the entry to the peers using the consensus + // algorithm, proceeds as soon as a majority of voters acknowledges the + // entry. + // + // 3) Leader defers to the caller by calling ConsensusRound::NotifyReplicationFinished, + // which calls the ConsensusReplicatedCallback. + // + // 3a) The leader asynchronously notifies other peers of the new + // commit index, which tells them to apply the operation. + // + // This method can only be called on the leader, i.e. role() == LEADER + virtual Status Replicate(const scoped_refptr& round) = 0; + + // Ensures that the consensus implementation is currently acting as LEADER, + // and thus is allowed to submit operations to be prepared before they are + // replicated. To avoid a time-of-check-to-time-of-use (TOCTOU) race, the + // implementation also stores the current term inside the round's "bound_term" + // member. When we eventually are about to replicate the transaction, we verify + // that the term has not changed in the meantime. + virtual Status CheckLeadershipAndBindTerm(const scoped_refptr& round) { + return Status::OK(); + } + + // Messages sent from LEADER to FOLLOWERS and LEARNERS to update their + // state machines. This is equivalent to "AppendEntries()" in Raft + // terminology. + // + // ConsensusRequestPB contains a sequence of 0 or more operations to apply + // on the replica. If there are 0 operations the request is considered + // 'status-only' i.e. 
the leader is communicating with the follower only + // in order to pass back and forth information on watermarks (eg committed + // operation ID, replicated op id, etc). + // + // If the sequence contains 1 or more operations they will be replicated + // in the same order as the leader, and submitted for asynchronous Prepare + // in the same order. + // + // The leader also provides information on the index of the latest + // operation considered committed by consensus. The replica uses this + // information to update the state of any pending (previously replicated/prepared) + // transactions. + // + // Returns Status::OK if the response has been filled (regardless of accepting + // or rejecting the specific request). Returns non-OK Status if a specific + // error response could not be formed, which will result in the service + // returning an UNKNOWN_ERROR RPC error code to the caller and including the + // stringified Status message. + virtual Status Update(const ConsensusRequestPB* request, + ConsensusResponsePB* response) = 0; + + // Messages sent from CANDIDATEs to voting peers to request their vote + // in leader election. + virtual Status RequestVote(const VoteRequestPB* request, + VoteResponsePB* response) = 0; + + // Implement a ChangeConfig() request. + virtual Status ChangeConfig(const ChangeConfigRequestPB& req, + const StatusCallback& client_cb, + boost::optional* error) { + return Status::NotSupported("Not implemented."); + } + + // Returns the current Raft role of this instance. + virtual RaftPeerPB::Role role() const = 0; + + // Returns the uuid of this peer. + virtual std::string peer_uuid() const = 0; + + // Returns the id of the tablet whose updates this consensus instance helps coordinate. + virtual std::string tablet_id() const = 0; + + // Returns a copy of the committed state of the Consensus system. 
+ virtual ConsensusStatePB ConsensusState(ConsensusConfigType type) const = 0; + + // Returns a copy of the current committed Raft configuration. + virtual RaftConfigPB CommittedConfig() const = 0; + + virtual void DumpStatusHtml(std::ostream& out) const = 0; + + void SetFaultHooks(const std::shared_ptr& hooks); + + const std::shared_ptr& GetFaultHooks() const; + + // Stops running the consensus algorithm. + virtual void Shutdown() = 0; + + // Returns the last OpId (either received or committed, depending on the + // 'type' argument) that the Consensus implementation knows about. + // Primarily used for testing purposes. + virtual Status GetLastOpId(OpIdType type, OpId* id) { + return Status::NotFound("Not implemented."); + } + + protected: + friend class RefCountedThreadSafe; + friend class tablet::TabletPeer; + friend class master::SysCatalogTable; + + // This class is refcounted. + virtual ~Consensus() {} + + // Fault hooks for tests. In production code this will always be null. + std::shared_ptr fault_hooks_; + + enum HookPoint { + PRE_START, + POST_START, + PRE_CONFIG_CHANGE, + POST_CONFIG_CHANGE, + PRE_REPLICATE, + POST_REPLICATE, + PRE_COMMIT, + POST_COMMIT, + PRE_UPDATE, + POST_UPDATE, + PRE_SHUTDOWN, + POST_SHUTDOWN + }; + + Status ExecuteHook(HookPoint point); + + enum State { + kNotInitialized, + kInitializing, + kConfiguring, + kRunning, + }; + private: + DISALLOW_COPY_AND_ASSIGN(Consensus); +}; + +// Factory for replica transactions. +// An implementation of this factory must be registered prior to consensus +// start, and is used to create transactions when the consensus implementation receives +// messages from the leader. +// +// Replica transactions execute the following way: +// +// - When a ReplicateMsg is first received from the leader, the Consensus +// instance creates the ConsensusRound and calls StartReplicaTransaction(). +// This will trigger the Prepare(). 
At the same time replica consensus +// instance immediately stores the ReplicateMsg in the Log. Once the replicate +// message is stored in stable storage an ACK is sent to the leader (i.e. the +// replica Consensus instance does not wait for Prepare() to finish). +// +// - When the CommitMsg for a replicate is first received from the leader +// the replica waits for the corresponding Prepare() to finish (if it has +// not completed yet) and then proceeds to trigger the Apply(). +// +// - Once Apply() completes the ReplicaTransactionFactory is responsible for logging +// a CommitMsg to the log to ensure that the operation can be properly restored +// on a restart. +class ReplicaTransactionFactory { + public: + virtual Status StartReplicaTransaction(const scoped_refptr& context) = 0; + + virtual ~ReplicaTransactionFactory() {} +}; + +// Context for a consensus round on the LEADER side, typically created as an +// out-parameter of Consensus::Append. +// This class is ref-counted because we want to ensure it stays alive for the +// duration of the Transaction when it is associated with a Transaction, while +// we also want to ensure it has a proper lifecycle when a ConsensusRound is +// pushed that is not associated with a Tablet transaction. +class ConsensusRound : public RefCountedThreadSafe { + + public: + // Ctor used for leader transactions. Leader transactions can and must specify the + // callbacks prior to initiating the consensus round. + ConsensusRound(Consensus* consensus, gscoped_ptr replicate_msg, + ConsensusReplicatedCallback replicated_cb); + + // Ctor used for follower/learner transactions. These transactions do not use the + // replicate callback and the commit callback is set later, after the transaction + // is actually started. 
+ ConsensusRound(Consensus* consensus, + const ReplicateRefPtr& replicate_msg); + + ReplicateMsg* replicate_msg() { + return replicate_msg_->get(); + } + + const ReplicateRefPtr& replicate_scoped_refptr() { + return replicate_msg_; + } + + // Returns the id of the (replicate) operation this context + // refers to. This is only set _after_ Consensus::Replicate(context). + OpId id() const { + return replicate_msg_->get()->id(); + } + + // Register a callback that is called by Consensus to notify that the round + // is considered either replicated, if 'status' is OK(), or that it has + // permanently failed to replicate if 'status' is anything else. If 'status' + // is OK() then the operation can be applied to the state machine, otherwise + // the operation should be aborted. + void SetConsensusReplicatedCallback(const ConsensusReplicatedCallback& replicated_cb) { + replicated_cb_ = replicated_cb; + } + + // If a continuation was set, notifies it that the round has been replicated. + void NotifyReplicationFinished(const Status& status); + + // Binds this round such that it may not be eventually executed in any term + // other than 'term'. + // See CheckBoundTerm(). + void BindToTerm(int64_t term) { + DCHECK_EQ(bound_term_, -1); + bound_term_ = term; + } + + // Check for a rare race in which an operation is submitted to the LEADER in some term, + // then before the operation is prepared, the replica loses its leadership, receives + // more operations as a FOLLOWER, and then regains its leadership. We detect this case + // by setting the ConsensusRound's "bound term" when it is first submitted to the + // PREPARE queue, and validate that the term is still the same when we have finished + // preparing it. See KUDU-597 for details. + // + // If this round has not been bound to any term, this is a no-op. 
+ Status CheckBoundTerm(int64_t current_term) const; + + private: + friend class RaftConsensusQuorumTest; + friend class RefCountedThreadSafe; + + ~ConsensusRound() {} + + Consensus* consensus_; + // This round's replicate message. + ReplicateRefPtr replicate_msg_; + + // The continuation that will be called once the transaction is + // deemed committed/aborted by consensus. + ConsensusReplicatedCallback replicated_cb_; + + // The leader term that this round was submitted in. CheckBoundTerm() + // ensures that, when it is eventually replicated, the term has not + // changed in the meantime. + // + // Set to -1 if no term has been bound. + int64_t bound_term_; +}; + +class Consensus::ConsensusFaultHooks { + public: + virtual Status PreStart() { return Status::OK(); } + virtual Status PostStart() { return Status::OK(); } + virtual Status PreConfigChange() { return Status::OK(); } + virtual Status PostConfigChange() { return Status::OK(); } + virtual Status PreReplicate() { return Status::OK(); } + virtual Status PostReplicate() { return Status::OK(); } + virtual Status PreUpdate() { return Status::OK(); } + virtual Status PostUpdate() { return Status::OK(); } + virtual Status PreShutdown() { return Status::OK(); } + virtual Status PostShutdown() { return Status::OK(); } + virtual ~ConsensusFaultHooks() {} +}; + +} // namespace consensus +} // namespace kudu + +#endif /* CONSENSUS_H_ */ diff --git a/src/kudu/consensus/consensus.proto b/src/kudu/consensus/consensus.proto new file mode 100644 index 000000000000..517d06b971bb --- /dev/null +++ b/src/kudu/consensus/consensus.proto @@ -0,0 +1,493 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.consensus; + +option java_package = "org.kududb.consensus"; + +import "kudu/common/common.proto"; +import "kudu/common/wire_protocol.proto"; +import "kudu/consensus/metadata.proto"; +import "kudu/consensus/opid.proto"; +import "kudu/tablet/metadata.proto"; +import "kudu/tablet/tablet.proto"; +import "kudu/tserver/tserver_admin.proto"; +import "kudu/tserver/tserver.proto"; + +// Consensus-specific errors use this protobuf +message ConsensusErrorPB { + // The codes for consensus responses. These are set in the status when + // some consensus internal error occurs and require special handling + // by the caller. A generic error code is purposefully absent since + // generic errors should use tserver.TabletServerErrorPB. + enum Code { + UNKNOWN = 0; + + // Invalid term. + // Sent by peers in response to leader RPCs whenever the term + // of one of the messages sent in a batch is lower than the + // the term the peer is expecting. + INVALID_TERM = 2; + + // For leader election. + // The last OpId logged by the candidate is older than the last OpId logged + // by the local peer. + LAST_OPID_TOO_OLD = 3; + + // For leader election. + // The local replica has already voted for another candidate in this term. + ALREADY_VOTED = 4; + + // The replica does not recognize the caller's request as coming from a + // member of the configuration. 
+ NOT_IN_QUORUM = 5; + + // The responder's last entry didn't match the caller's preceding entry. + PRECEDING_ENTRY_DIDNT_MATCH = 6; + + // The local replica is either a leader, or has heard from a valid leader + // more recently than the election timeout, so believes the leader to be alive. + LEADER_IS_ALIVE = 7; + + // The local replica is in the middle of servicing either another vote + // or an update from a valid leader. + CONSENSUS_BUSY = 8; + + // The local replica was unable to prepare a single transaction. + CANNOT_PREPARE = 9; + } + + // The error code. + required Code code = 1; + + // The Status object for the error. This will include a textual + // message that may be more useful to present in log messages, etc, + // though its error code is less specific. + required AppStatusPB status = 2; +} + +// =========================================================================== +// External Consensus Messages +// =========================================================================== + +// The types of operations that need a commit message, i.e. those that require +// at least one round of the consensus algorithm. +enum OperationType { + UNKNOWN_OP = 0; + NO_OP = 1; + WRITE_OP = 3; + ALTER_SCHEMA_OP = 4; + CHANGE_CONFIG_OP = 5; +} + +// The transaction driver type: indicates whether a transaction is +// being executed on a leader or a replica. +enum DriverType { + UNKNOWN_DRIVER = 0; + LEADER = 1; + REPLICA = 2; +} + +// A configuration change request for the tablet with 'tablet_id'. +// This message is dynamically generated by the leader when AddServer() or +// RemoveServer() is called, and is what gets replicated to the log. +message ChangeConfigRecordPB { + required bytes tablet_id = 1; + + // The old committed configuration, for verification purposes. + required RaftConfigPB old_config = 2; + + // The new configuration to apply. 
+ required RaftConfigPB new_config = 3; +} + +enum ChangeConfigType { + UNKNOWN_CHANGE = 0; + ADD_SERVER = 1; + REMOVE_SERVER = 2; + CHANGE_ROLE = 3; +} + +// A configuration change request for the tablet with 'tablet_id'. +// These requests are restricted to one-by-one operations, as specified in +// Diego Ongaro's Raft PhD thesis. +// This is the RPC request, but it does not end up in the log. +// See also ChangeConfigRecordPB. +message ChangeConfigRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 4; + + required bytes tablet_id = 1; + + // The type of config change requested. + // This field must be specified, but is left as optional due to being an enum. + optional ChangeConfigType type = 2; + + // The peer to add or remove. + // When 'type' == ADD_SERVER, both the permanent_uuid and last_known_addr + // fields must be set. Otherwise, only the permanent_uuid field is required. + optional RaftPeerPB server = 3; + + // The OpId index of the committed config to replace. + // This optional parameter is here to provide an atomic (compare-and-swap) + // ChangeConfig operation. The ChangeConfig() operation will fail if this + // parameter is specified and the committed config does not have a matching + // opid_index. See also the definition of RaftConfigPB. + optional int64 cas_config_opid_index = 5; +} + +// The configuration change response. If any immediate error occurred +// the 'error' field is set with it, otherwise 'new_configuration' is set. +message ChangeConfigResponsePB { + optional tserver.TabletServerErrorPB error = 1; + + // Updated configuration after changing the config. + optional RaftPeerPB new_config = 2; + + // The timestamp chosen by the server for this change config operation. + // TODO: At the time of writing, this field is never set in the response. + // TODO: Propagate signed timestamps. See KUDU-611. 
+ optional fixed64 timestamp = 3; +} + +// A Replicate message, sent to replicas by leader to indicate this operation must +// be stored in the WAL/SM log, as part of the first phase of the two phase +// commit. +message ReplicateMsg { + // The Raft operation ID (term and index) being replicated. + required OpId id = 1; + // The (hybrid or logical) timestamp assigned to this message. + required fixed64 timestamp = 2; + // optional ExternalConsistencyMode external_consistency_mode = 3 [default = NO_CONSISTENCY]; + required OperationType op_type = 4; + optional tserver.WriteRequestPB write_request = 5; + optional tserver.AlterSchemaRequestPB alter_schema_request = 6; + optional ChangeConfigRecordPB change_config_record = 7; + + optional NoOpRequestPB noop_request = 999; +} + +// A commit message for a previous operation. +// This is a commit in the consensus sense and may abort/commit any operation +// that required a consensus round. +message CommitMsg { + required OperationType op_type = 1; + // the id of the message this commit pertains to + optional OpId commited_op_id = 2; + // The operations that were applied and/or failed in this transaction. + optional tablet.TxResultPB result = 3; +} + +// =========================================================================== +// Internal Consensus Messages and State +// =========================================================================== + +// NoOp requests, mostly used in tests. +message NoOpRequestPB { + // Allows to set a dummy payload, for tests. + optional bytes payload_for_tests = 1; +} + +// NoOp responses, mostly used in tests. +message NoOpResponsePB { + // Allows to set a dummy payload, for tests. + optional bytes payload_for_tests = 1; +} + +message PerOpErrorPB { + // The id of the operation that failed in the other peer. + required OpId id = 1; + // The Status explaining why the operation failed. + required AppStatusPB status = 2; +} + +// Status message received in the peer responses. 
+message ConsensusStatusPB { + + // The last message received (and replicated) by the peer. + required OpId last_received = 1; + + // The id of the last op that was replicated by the current leader. + // This doesn't necessarily mean that the term of this op equals the current + // term, since the current leader may be replicating ops from a prior term. + // Unset if none currently received. + // + // In the case where there is a log matching property error + // (PRECEDING_ENTRY_DIDNT_MATCH), this field is important and may still be + // set, since the leader queue uses this field in conjunction with + // last_received to decide on the next id to send to the follower. + optional OpId last_received_current_leader = 4; + + // The last committed index that is known to the peer. + optional int64 last_committed_idx = 2; + + // When the last request failed for some consensus related (internal) reason. + // In some cases the error will have a specific code that the caller will + // have to handle in certain ways. + optional ConsensusErrorPB error = 3; +} + +// A request from a candidate peer that wishes to become leader of +// the configuration serving tablet with 'tablet_id'. +// See RAFT sec. 5.2. +message VoteRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 6; + + // Identifies the tablet configuration a vote is being requested for. + required string tablet_id = 1; + + // The uuid of the sending peer. + required bytes candidate_uuid = 2; + + // The term we are requesting a vote for. + // If this term is higher than the callee's term, the callee will update its + // own term to match, and if it is the current leader it will step down. + required int64 candidate_term = 3; + + // The candidate node status so that the voter node can + // decide whether to vote for it as LEADER. 
+ // + // In particular, this includes the last OpId persisted in the candidate's + // log, which corresponds to the lastLogIndex and lastLogTerm fields in Raft. + // A replica must vote no for a candidate that has an OpId lower than them. + required ConsensusStatusPB candidate_status = 4; + + // Normally, replicas will deny a vote with a LEADER_IS_ALIVE error if + // they are a leader or recently heard from a leader. This is to prevent + // partitioned nodes from disturbing liveness. If this flag is true, + // peers will vote even if they think a leader is alive. This can be used + // for example to force a faster leader hand-off rather than waiting for + // the election timer to expire. + optional bool ignore_live_leader = 5 [ default = false ]; +} + +// A response from a replica to a leader election request. +message VoteResponsePB { + // The uuid of the node sending the reply. + optional bytes responder_uuid = 1; + + // The term of the node sending the reply. + // Allows the candidate to update itself if it is behind. + optional int64 responder_term = 2; + + // True if this peer voted for the caller, false otherwise. + optional bool vote_granted = 3; + + // TODO: Migrate ConsensusService to the AppStatusPB RPC style and merge these errors. + // Error message from the consensus implementation. + optional ConsensusErrorPB consensus_error = 998; + + // A generic error message (such as tablet not found). + optional tserver.TabletServerErrorPB error = 999; +} + +// A consensus request message, the basic unit of a consensus round. +message ConsensusRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 7; + + required string tablet_id = 1; + + // The uuid of the peer making the call. + required bytes caller_uuid = 2; + + // The caller's term. As only leaders can send messages, + // replicas will accept all messages as long as the term + // is equal to or higher than the last term they know about. 
+ // If a leader receives a request with a term higher than its own, + // it will step down and enter FOLLOWER state (see Raft sec. 5.1). + required int64 caller_term = 3; + + // The id of the operation immediately preceding the first + // operation in 'ops'. If the replica is receiving 'ops' for + // the first time 'preceding_id' must match the replica's + // last operation. + // + // This must be set if 'ops' is non-empty. + optional OpId preceding_id = 4; + + // The id of the last committed operation in the configuration. This is the + // id of the last operation the leader deemed committed from a consensus + // standpoint (not the last operation the leader applied). + // + // Raft calls this field 'leaderCommit'. + required OpId committed_index = 5; + + // Sequence of operations to be replicated by this peer. + // These will be committed when committed_index advances above their + // respective OpIds. In some cases committed_index can indicate that + // these operations are already committed, in which case they will be + // committed during the same request. + repeated ReplicateMsg ops = 6; +} + +message ConsensusResponsePB { + // The uuid of the peer making the response. + optional bytes responder_uuid = 1; + + // The current term of the peer making the response. + // This is used to update the caller (and make it step down if it is + // out of date). + optional int64 responder_term = 2; + + // The current consensus status of the receiver peer. + optional ConsensusStatusPB status = 3; + + // A generic error message (such as tablet not found), per operation + // error messages are sent along with the consensus status. + optional tserver.TabletServerErrorPB error = 999; +} + +// A message reflecting the status of an in-flight transaction. +message TransactionStatusPB { + required OpId op_id = 1; + required OperationType tx_type = 2; + // Time the transaction has been in flight. 
+ required int64 running_for_micros = 3; + // Quick human-readable description (e.g., ToString() output). + required string description = 4; + + // If tracing is enabled when viewing the transaction, the trace + // buffer is copied here. + optional string trace_buffer = 6; +} + +message GetNodeInstanceRequestPB { +} + +message GetNodeInstanceResponsePB { + required NodeInstancePB node_instance = 1; +} + +// Message that makes the local peer run leader election to be elected leader. +// Assumes that a tablet with 'tablet_id' exists. +message RunLeaderElectionRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 2; + + // the id of the tablet + required bytes tablet_id = 1; +} + +message RunLeaderElectionResponsePB { + // A generic error message (such as tablet not found). + optional tserver.TabletServerErrorPB error = 1; +} + +message LeaderStepDownRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 2; + + // The id of the tablet. + required bytes tablet_id = 1; +} + +message LeaderStepDownResponsePB { + // A generic error message (such as tablet not found). + optional tserver.TabletServerErrorPB error = 1; +} + +enum OpIdType { + UNKNOWN_OPID_TYPE = 0; + RECEIVED_OPID = 1; + COMMITTED_OPID = 2; +} + +message GetLastOpIdRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 2; + + // the id of the tablet + required bytes tablet_id = 1; + + // Whether to return the last-received or last-committed OpId. + optional OpIdType opid_type = 3 [ default = RECEIVED_OPID ]; +} + +message GetLastOpIdResponsePB { + optional OpId opid = 1; + // A generic error message (such as tablet not found). + optional tserver.TabletServerErrorPB error = 2; +} + +message GetConsensusStateRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 2; + + // The id of the tablet. 
+ required bytes tablet_id = 1; + + // Whether to fetch the committed or active consensus state. + optional ConsensusConfigType type = 3 [ default = CONSENSUS_CONFIG_COMMITTED ]; +} + +message GetConsensusStateResponsePB { + optional ConsensusStatePB cstate = 1; + // A generic error message (such as tablet not found). + optional tserver.TabletServerErrorPB error = 2; +} + +message StartRemoteBootstrapRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 5; + + required bytes tablet_id = 1; + + // Identification for the host we are bootstrapping from. + // TODO: Consider renaming these to bootstrap_source_*. + required bytes bootstrap_peer_uuid = 2; + required HostPortPB bootstrap_peer_addr = 3; + + // The caller's term. In the case that the target of this request has a + // TOMBSTONED replica with a term higher than this one, the request will fail. + optional int64 caller_term = 4 [ default = -1 ]; +} + +message StartRemoteBootstrapResponsePB { + optional tserver.TabletServerErrorPB error = 1; +} + +// A Raft implementation. +service ConsensusService { + // Analogous to AppendEntries in Raft, but only used for followers. + rpc UpdateConsensus(ConsensusRequestPB) returns (ConsensusResponsePB); + + // RequestVote() from Raft. + rpc RequestConsensusVote(VoteRequestPB) returns (VoteResponsePB); + + // Implements all of the one-by-one config change operations, including + // AddServer() and RemoveServer() from the Raft specification, as well as + // an operation to change the role of a server between VOTER and NON_VOTER. + // An OK response means the operation was successful. + rpc ChangeConfig(ChangeConfigRequestPB) returns (ChangeConfigResponsePB); + + rpc GetNodeInstance(GetNodeInstanceRequestPB) returns (GetNodeInstanceResponsePB); + + // Force this node to run a leader election. + rpc RunLeaderElection(RunLeaderElectionRequestPB) returns (RunLeaderElectionResponsePB); + + // Force this node to step down as leader. 
+ rpc LeaderStepDown(LeaderStepDownRequestPB) returns (LeaderStepDownResponsePB); + + rpc GetLastOpId(GetLastOpIdRequestPB) returns (GetLastOpIdResponsePB); + + // Returns the committed Consensus state. + rpc GetConsensusState(GetConsensusStateRequestPB) returns (GetConsensusStateResponsePB); + + // Instruct this server to remotely bootstrap a tablet from another host. + rpc StartRemoteBootstrap(StartRemoteBootstrapRequestPB) returns (StartRemoteBootstrapResponsePB); +} diff --git a/src/kudu/consensus/consensus.txt b/src/kudu/consensus/consensus.txt new file mode 100644 index 000000000000..8d08eb5cbd99 --- /dev/null +++ b/src/kudu/consensus/consensus.txt @@ -0,0 +1,148 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +Kudu Consensus Implementation +============================= + +Kudu implements the RAFT consensus protocol, with minor modifications as +described in this document. For the details of the write-ahead-log (WAL) message +format, see the README file in this directory. Kudu maintains one RAFT config +per tablet. + +RAFT describes how to ensure that additions to the leader's WAL get replicated +to the rest of the config, and when it's safe to apply (commit) those changes to +the "state machine" (the database). + +Overview of the RAFT protocol +----------------------------- + +While the RAFT paper [1] remains the primary specification of the protocol, a +brief outline of important points follows here, for quick reference: + +- RAFT has a strong leader for each config. 
+- A majority of nodes in the config must agree for a write to occur or a leader + to be elected. Every time a leader election takes place, the monotonic term + number is incremented (see currentTerm below). +- There are three roles a RAFT node can assume: leader, follower, or candidate. + Kudu explicitly specifies an additional learner role (not yet implemented), + which is a non-voting role. +- The WAL index is an absolute index into the log. If two logs contain an entry + with the same index and term, then the logs are identical in all entries + up through the given index. This is called the Log Matching property + (abbreviated "LMP" in some parts of the code). + +The following pieces of state are durable (updated before responding to RPCs): + +- The write-ahead log (WAL). +- currentTerm: latest term the server has seen, initialized to 0. +- votedFor: candidate that received this replica's vote for the current term, + or null if none. + +The WAL is implemented by log.cc. currentTerm and votedFor are part of the +ConsensusMetadataPB which is implemented in consensus_meta.cc. + + +Appending to the log: +------------------------------ + +Replicas implement an RPC method called UpdateConsensus which allows a +leader to replicate a batch of log entries to the follower. Only a +leader may call this RPC method, and a follower will only accept an +UpdateConsensus call with a term equal to or higher than its +currentTerm. +[Raft calls this RPC AppendEntries(). See Raft figure 2 for details.] + + +Leader election: +------------------------------ + +Replicas also implement an RPC method called RequestConsensusVote, which is +invoked by candidates to gather votes (RAFT sec 5.2). See RAFT figure 2 for +details. + +RaftConfig membership changes: +------------------------------ +A two-phase joint consensus protocol is used for making cluster membership +changes, including changing members of the config or the size of the majority. +See RAFT section 6 and figure 11 for details. 
+ +NOTE: membership changes are not yet implemented in Kudu. + + +Leader election modifications (future work) +--------------------------------------------- + +The Kudu leader election protocol proceeds exactly as specified in the extended +version of the RAFT paper [1], sections 5.2 - 5.3, with the following +exceptions: + +- Timestamps issued by config leaders must be disjoint, as specified in the + Google Spanner paper [2] in section 4.1.1 and Appendix A. Therefore, when a + new leader is elected, the leader must do a "commit-wait" style pause before + replicating new writes. + + NOTE: this is not yet implemented. See KUDU-430. + +- To force acceptance of a particular node as leader (for load-balancing + purposes by the Master), the tablet may be quiesced until the specified node + has an up to date WAL, at which time that node becomes a candidate and starts + an election. Since other nodes are prohibited from becoming candidates during + this time, either this target node will win the election or the attempt to + force this configuration will time out and fail. + + NOTE: this is not implemented! + +- To prevent "flapping" in the case of a flaky connection, an additional step is + added to the leader election protocol. If a node detects that the config + leader failed, instead of immediately starting an election, it will + periodically query the existing nodes in the config to determine whether they + see that the leader has failed as well. If a majority of the config (including + the node itself) responds that the leader appears to have failed, the node + becomes a candidate and starts an election. If not, the node backs off and + retries again later, without updating its currentTerm field. 
+ + NOTE: not yet implemented (KUDU-562) + + +Corner cases and other notes +---------------------------- + +Various parts of RAFT are subtle and worth mentioning explicitly: + +- Terms are monotonic and the only way a term may appear in a log is if a leader + won an election and then wrote it there. + +- The current term is passed with every RPC request and response, and if any + replica sees a request or response higher than its currentTerm, it steps down + (if the leader) and updates its own currentTerm to match. This is the reason + for our planned addition of the anti-flapping modification to the leader election + protocol noted above. + +- As mentioned in the RAFT paper (sec 5.4.2), as well as the CSAIL DSRG blog [3] + there are restrictions on when one can consider a log entry from a previous + term committed. Because of the Log Matching property, the last entry written + by the previous leader may be overwritten even after it reaches a majority. To + work around this issue, replicas may only commit uncommitted log entries from + a previous leader's term once the current leader has successfully replicated a + round of log entries for its own current term. + +References +---------- + +[1] RAFT: In Search of an Understandable Consensus Algorithm (Extended Version). + Ongaro & Ousterhout. http://ramcloud.stanford.edu/raft.pdf + +[2] Spanner: Google’s globally distributed database. Corbett, et al. OSDI '12. + http://research.google.com/archive/spanner-osdi2012.pdf + +[3] http://pdos.csail.mit.edu/dsrg/blog/2013/05/23/raft/ diff --git a/src/kudu/consensus/consensus_meta-test.cc b/src/kudu/consensus/consensus_meta-test.cc new file mode 100644 index 000000000000..7eb9c2dbd42a --- /dev/null +++ b/src/kudu/consensus/consensus_meta-test.cc @@ -0,0 +1,286 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/consensus/consensus_meta.h" + +#include + +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +#define ASSERT_VALUES_EQUAL(cmeta, opid_index, uuid, term) \ + ASSERT_NO_FATAL_FAILURE(AssertValuesEqual(cmeta, opid_index, uuid, term)) + +namespace kudu { +namespace consensus { + +using std::string; +using std::vector; + +const char* kTabletId = "test-consensus-metadata"; +const int64_t kInitialTerm = 3; + +class ConsensusMetadataTest : public KuduTest { + public: + ConsensusMetadataTest() + : fs_manager_(env_.get(), GetTestPath("fs_root")) { + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + ASSERT_OK(fs_manager_.CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_.Open()); + + // Initialize test configuration. 
+ config_.set_local(true); + config_.add_peers()->set_permanent_uuid(fs_manager_.uuid()); + config_.set_opid_index(kInvalidOpIdIndex); + } + + protected: + // Assert that the given cmeta has a single configuration with the given metadata values. + void AssertValuesEqual(const ConsensusMetadata& cmeta, + int64_t opid_index, const string& permanant_uuid, int64_t term); + + FsManager fs_manager_; + RaftConfigPB config_; +}; + +void ConsensusMetadataTest::AssertValuesEqual(const ConsensusMetadata& cmeta, + int64_t opid_index, + const string& permanant_uuid, + int64_t term) { + // Sanity checks. + ASSERT_TRUE(cmeta.committed_config().local()); + ASSERT_EQ(1, cmeta.committed_config().peers_size()); + + // Value checks. + ASSERT_EQ(opid_index, cmeta.committed_config().opid_index()); + ASSERT_EQ(permanant_uuid, cmeta.committed_config().peers().begin()->permanent_uuid()); + ASSERT_EQ(term, cmeta.current_term()); +} + +// Test the basic "happy case" of creating and then loading a file. +TEST_F(ConsensusMetadataTest, TestCreateLoad) { + // Create the file. + { + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(&fs_manager_, kTabletId, fs_manager_.uuid(), + config_, kInitialTerm, &cmeta)); + } + + // Load the file. + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Load(&fs_manager_, kTabletId, fs_manager_.uuid(), &cmeta)); + ASSERT_VALUES_EQUAL(*cmeta, kInvalidOpIdIndex, fs_manager_.uuid(), kInitialTerm); +} + +// Ensure that we get an error when loading a file that doesn't exist. +TEST_F(ConsensusMetadataTest, TestFailedLoad) { + gscoped_ptr cmeta; + Status s = ConsensusMetadata::Load(&fs_manager_, kTabletId, fs_manager_.uuid(), &cmeta); + ASSERT_TRUE(s.IsNotFound()) << "Unexpected status: " << s.ToString(); + LOG(INFO) << "Expected failure: " << s.ToString(); +} + +// Check that changes are not written to disk until Flush() is called. 
+TEST_F(ConsensusMetadataTest, TestFlush) { + const int64_t kNewTerm = 4; + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(&fs_manager_, kTabletId, fs_manager_.uuid(), + config_, kInitialTerm, &cmeta)); + cmeta->set_current_term(kNewTerm); + + // We are sort of "breaking the rules" by having multiple ConsensusMetadata + // objects in flight that point to the same file, but for a test this is fine + // since it's read-only. + { + gscoped_ptr cmeta_read; + ASSERT_OK(ConsensusMetadata::Load(&fs_manager_, kTabletId, fs_manager_.uuid(), &cmeta_read)); + ASSERT_VALUES_EQUAL(*cmeta_read, kInvalidOpIdIndex, fs_manager_.uuid(), kInitialTerm); + } + + ASSERT_OK(cmeta->Flush()); + + { + gscoped_ptr cmeta_read; + ASSERT_OK(ConsensusMetadata::Load(&fs_manager_, kTabletId, fs_manager_.uuid(), &cmeta_read)); + ASSERT_VALUES_EQUAL(*cmeta_read, kInvalidOpIdIndex, fs_manager_.uuid(), kNewTerm); + } +} + +// Builds a distributed configuration of voters with the given uuids. +RaftConfigPB BuildConfig(const vector& uuids) { + RaftConfigPB config; + config.set_local(false); + for (const string& uuid : uuids) { + RaftPeerPB* peer = config.add_peers(); + peer->set_permanent_uuid(uuid); + peer->set_member_type(RaftPeerPB::VOTER); + CHECK_OK(HostPortToPB(HostPort("255.255.255.255", 0), peer->mutable_last_known_addr())); + } + return config; +} + +// Test ConsensusMetadata active role calculation. +TEST_F(ConsensusMetadataTest, TestActiveRole) { + vector uuids = { "a", "b", "c", "d" }; + string peer_uuid = "e"; + RaftConfigPB config1 = BuildConfig(uuids); // We aren't a member of this config... + config1.set_opid_index(1); + + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(&fs_manager_, kTabletId, peer_uuid, + config1, kInitialTerm, &cmeta)); + + // Not a participant. + ASSERT_EQ(RaftPeerPB::NON_PARTICIPANT, cmeta->active_role()); + + // Follower. + uuids.push_back(peer_uuid); + RaftConfigPB config2 = BuildConfig(uuids); // But we are a member of this one. 
+ config2.set_opid_index(1); + cmeta->set_committed_config(config2); + ASSERT_EQ(RaftPeerPB::FOLLOWER, cmeta->active_role()); + + // Pending should mask committed. + cmeta->set_pending_config(config1); + ASSERT_EQ(RaftPeerPB::NON_PARTICIPANT, cmeta->active_role()); + cmeta->clear_pending_config(); + ASSERT_EQ(RaftPeerPB::FOLLOWER, cmeta->active_role()); + + // Leader. + cmeta->set_leader_uuid(peer_uuid); + ASSERT_EQ(RaftPeerPB::LEADER, cmeta->active_role()); + + // Again, pending should mask committed. + cmeta->set_pending_config(config1); + ASSERT_EQ(RaftPeerPB::NON_PARTICIPANT, cmeta->active_role()); + cmeta->set_pending_config(config2); // pending == committed. + ASSERT_EQ(RaftPeerPB::LEADER, cmeta->active_role()); + cmeta->set_committed_config(config1); // committed now excludes this node, but is masked... + ASSERT_EQ(RaftPeerPB::LEADER, cmeta->active_role()); + + // ... until we clear pending, then we find committed now excludes us. + cmeta->clear_pending_config(); + ASSERT_EQ(RaftPeerPB::NON_PARTICIPANT, cmeta->active_role()); +} + +// Ensure that invocations of ToConsensusStatePB() return the expected state +// in the returned object. +TEST_F(ConsensusMetadataTest, TestToConsensusStatePB) { + vector uuids = { "a", "b", "c", "d" }; + string peer_uuid = "e"; + + RaftConfigPB committed_config = BuildConfig(uuids); // We aren't a member of this config... + committed_config.set_opid_index(1); + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(&fs_manager_, kTabletId, peer_uuid, + committed_config, kInitialTerm, &cmeta)); + + uuids.push_back(peer_uuid); + RaftConfigPB pending_config = BuildConfig(uuids); + + // Set the pending configuration to be one containing the current leader (who is not + // in the committed configuration). Ensure that the leader shows up when we ask for + // the active consensus state. 
+ cmeta->set_pending_config(pending_config); + cmeta->set_leader_uuid(peer_uuid); + ConsensusStatePB active_cstate = cmeta->ToConsensusStatePB(CONSENSUS_CONFIG_ACTIVE); + ASSERT_TRUE(active_cstate.has_leader_uuid()); + ASSERT_OK(VerifyConsensusState(active_cstate, UNCOMMITTED_QUORUM)); + + // Without changing anything, ask for the committed consensus state. + // Since the current leader is not a voter in the committed configuration, the + // returned consensus state should not list a leader. + ConsensusStatePB committed_cstate = cmeta->ToConsensusStatePB(CONSENSUS_CONFIG_COMMITTED); + ASSERT_FALSE(committed_cstate.has_leader_uuid()); + ASSERT_OK(VerifyConsensusState(committed_cstate, COMMITTED_QUORUM)); + + // Set a new leader to be a member of the committed configuration. Now the committed + // consensus state should list a leader. + cmeta->set_leader_uuid("a"); + ConsensusStatePB new_committed_cstate = cmeta->ToConsensusStatePB(CONSENSUS_CONFIG_COMMITTED); + ASSERT_TRUE(new_committed_cstate.has_leader_uuid()); + ASSERT_OK(VerifyConsensusState(new_committed_cstate, COMMITTED_QUORUM)); +} + +// Helper for TestMergeCommittedConsensusStatePB. +static void AssertConsensusMergeExpected(const gscoped_ptr& cmeta, + const ConsensusStatePB& cstate, + int64_t expected_term, + const string& expected_voted_for) { + // See header docs for ConsensusMetadata::MergeCommittedConsensusStatePB() for + // a "spec" of these assertions. + ASSERT_TRUE(!cmeta->has_pending_config()); + ASSERT_EQ(cmeta->committed_config().ShortDebugString(), cstate.config().ShortDebugString()); + ASSERT_EQ("", cmeta->leader_uuid()); + ASSERT_EQ(expected_term, cmeta->current_term()); + if (expected_voted_for.empty()) { + ASSERT_FALSE(cmeta->has_voted_for()); + } else { + ASSERT_EQ(expected_voted_for, cmeta->voted_for()); + } +} + +// Ensure that MergeCommittedConsensusStatePB() works as advertised. 
+TEST_F(ConsensusMetadataTest, TestMergeCommittedConsensusStatePB) { + vector uuids = { "a", "b", "c", "d" }; + + RaftConfigPB committed_config = BuildConfig(uuids); // We aren't a member of this config... + committed_config.set_opid_index(1); + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(&fs_manager_, kTabletId, "e", + committed_config, 1, &cmeta)); + + uuids.push_back("e"); + RaftConfigPB pending_config = BuildConfig(uuids); + cmeta->set_pending_config(pending_config); + cmeta->set_leader_uuid("e"); + cmeta->set_voted_for("e"); + + // Keep the term and votes because the merged term is lower. + ConsensusStatePB remote_state; + remote_state.set_current_term(0); + *remote_state.mutable_config() = BuildConfig({ "x", "y", "z" }); + cmeta->MergeCommittedConsensusStatePB(remote_state); + NO_FATALS(AssertConsensusMergeExpected(cmeta, remote_state, 1, "e")); + + // Same as above because the merged term is the same as the cmeta term. + remote_state.set_current_term(1); + *remote_state.mutable_config() = BuildConfig({ "f", "g", "h" }); + cmeta->MergeCommittedConsensusStatePB(remote_state); + NO_FATALS(AssertConsensusMergeExpected(cmeta, remote_state, 1, "e")); + + // Higher term, so wipe out the prior state. + remote_state.set_current_term(2); + *remote_state.mutable_config() = BuildConfig({ "i", "j", "k" }); + cmeta->set_pending_config(pending_config); + cmeta->MergeCommittedConsensusStatePB(remote_state); + NO_FATALS(AssertConsensusMergeExpected(cmeta, remote_state, 2, "")); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/consensus_meta.cc b/src/kudu/consensus/consensus_meta.cc new file mode 100644 index 000000000000..d6cf74aaa4e6 --- /dev/null +++ b/src/kudu/consensus/consensus_meta.cc @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/consensus/consensus_meta.h" + +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/logging.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/stopwatch.h" + +namespace kudu { +namespace consensus { + +using std::string; +using strings::Substitute; + +Status ConsensusMetadata::Create(FsManager* fs_manager, + const string& tablet_id, + const std::string& peer_uuid, + const RaftConfigPB& config, + int64_t current_term, + gscoped_ptr* cmeta_out) { + gscoped_ptr cmeta(new ConsensusMetadata(fs_manager, tablet_id, peer_uuid)); + cmeta->set_committed_config(config); + cmeta->set_current_term(current_term); + RETURN_NOT_OK(cmeta->Flush()); + cmeta_out->swap(cmeta); + return Status::OK(); +} + +Status ConsensusMetadata::Load(FsManager* fs_manager, + const std::string& tablet_id, + const std::string& peer_uuid, + gscoped_ptr* cmeta_out) { + gscoped_ptr cmeta(new ConsensusMetadata(fs_manager, tablet_id, peer_uuid)); + RETURN_NOT_OK(pb_util::ReadPBContainerFromPath(fs_manager->env(), + fs_manager->GetConsensusMetadataPath(tablet_id), + &cmeta->pb_)); + cmeta->UpdateActiveRole(); // Needs to happen here as we sidestep the accessor APIs. 
+ cmeta_out->swap(cmeta); + return Status::OK(); +} + +Status ConsensusMetadata::DeleteOnDiskData(FsManager* fs_manager, const string& tablet_id) { + string cmeta_path = fs_manager->GetConsensusMetadataPath(tablet_id); + Env* env = fs_manager->env(); + if (!env->FileExists(cmeta_path)) { + return Status::OK(); + } + RETURN_NOT_OK_PREPEND(env->DeleteFile(cmeta_path), + "Unable to delete consensus metadata file for tablet " + tablet_id); + return Status::OK(); +} + +const int64_t ConsensusMetadata::current_term() const { + DCHECK(pb_.has_current_term()); + return pb_.current_term(); +} + +void ConsensusMetadata::set_current_term(int64_t term) { + DCHECK_GE(term, kMinimumTerm); + pb_.set_current_term(term); +} + +bool ConsensusMetadata::has_voted_for() const { + return pb_.has_voted_for(); +} + +const string& ConsensusMetadata::voted_for() const { + DCHECK(pb_.has_voted_for()); + return pb_.voted_for(); +} + +void ConsensusMetadata::clear_voted_for() { + pb_.clear_voted_for(); +} + +void ConsensusMetadata::set_voted_for(const string& uuid) { + DCHECK(!uuid.empty()); + pb_.set_voted_for(uuid); +} + +const RaftConfigPB& ConsensusMetadata::committed_config() const { + DCHECK(pb_.has_committed_config()); + return pb_.committed_config(); +} + +void ConsensusMetadata::set_committed_config(const RaftConfigPB& config) { + *pb_.mutable_committed_config() = config; + if (!has_pending_config_) { + UpdateActiveRole(); + } +} + +bool ConsensusMetadata::has_pending_config() const { + return has_pending_config_; +} + +const RaftConfigPB& ConsensusMetadata::pending_config() const { + DCHECK(has_pending_config_); + return pending_config_; +} + +void ConsensusMetadata::clear_pending_config() { + has_pending_config_ = false; + pending_config_.Clear(); + UpdateActiveRole(); +} + +void ConsensusMetadata::set_pending_config(const RaftConfigPB& config) { + has_pending_config_ = true; + pending_config_ = config; + UpdateActiveRole(); +} + +const RaftConfigPB& 
ConsensusMetadata::active_config() const { + if (has_pending_config_) { + return pending_config(); + } + return committed_config(); +} + +const string& ConsensusMetadata::leader_uuid() const { + return leader_uuid_; +} + +void ConsensusMetadata::set_leader_uuid(const string& uuid) { + leader_uuid_ = uuid; + UpdateActiveRole(); +} + +RaftPeerPB::Role ConsensusMetadata::active_role() const { + return active_role_; +} + +ConsensusStatePB ConsensusMetadata::ToConsensusStatePB(ConsensusConfigType type) const { + CHECK(type == CONSENSUS_CONFIG_ACTIVE || type == CONSENSUS_CONFIG_COMMITTED) + << "Unsupported ConsensusConfigType: " << ConsensusConfigType_Name(type) << ": " << type; + ConsensusStatePB cstate; + cstate.set_current_term(pb_.current_term()); + if (type == CONSENSUS_CONFIG_ACTIVE) { + *cstate.mutable_config() = active_config(); + cstate.set_leader_uuid(leader_uuid_); + } else { + *cstate.mutable_config() = committed_config(); + // It's possible, though unlikely, that a new node from a pending configuration + // could be elected leader. Do not indicate a leader in this case. + if (PREDICT_TRUE(IsRaftConfigVoter(leader_uuid_, cstate.config()))) { + cstate.set_leader_uuid(leader_uuid_); + } + } + return cstate; +} + +void ConsensusMetadata::MergeCommittedConsensusStatePB(const ConsensusStatePB& committed_cstate) { + if (committed_cstate.current_term() > current_term()) { + set_current_term(committed_cstate.current_term()); + clear_voted_for(); + } + + set_leader_uuid(""); + set_committed_config(committed_cstate.config()); + clear_pending_config(); +} + +Status ConsensusMetadata::Flush() { + SCOPED_LOG_SLOW_EXECUTION_PREFIX(WARNING, 500, LogPrefix(), "flushing consensus metadata"); + // Sanity test to ensure we never write out a bad configuration. + RETURN_NOT_OK_PREPEND(VerifyRaftConfig(pb_.committed_config(), COMMITTED_QUORUM), + "Invalid config in ConsensusMetadata, cannot flush to disk"); + + // Create directories if needed. 
+ string dir = fs_manager_->GetConsensusMetadataDir(); + bool created_dir = false; + RETURN_NOT_OK_PREPEND(fs_manager_->CreateDirIfMissing(dir, &created_dir), + "Unable to create consensus metadata root dir"); + // fsync() parent dir if we had to create the dir. + if (PREDICT_FALSE(created_dir)) { + string parent_dir = DirName(dir); + RETURN_NOT_OK_PREPEND(Env::Default()->SyncDir(parent_dir), + "Unable to fsync consensus parent dir " + parent_dir); + } + + string meta_file_path = fs_manager_->GetConsensusMetadataPath(tablet_id_); + RETURN_NOT_OK_PREPEND(pb_util::WritePBContainerToPath( + fs_manager_->env(), meta_file_path, pb_, + pb_util::OVERWRITE, + // We use FLAGS_log_force_fsync_all here because the consensus metadata is + // essentially an extension of the primary durability mechanism of the + // consensus subsystem: the WAL. Using the same flag ensures that the WAL + // and the consensus metadata get the same durability guarantees. + FLAGS_log_force_fsync_all ? pb_util::SYNC : pb_util::NO_SYNC), + Substitute("Unable to write consensus meta file for tablet $0 to path $1", + tablet_id_, meta_file_path)); + return Status::OK(); +} + +ConsensusMetadata::ConsensusMetadata(FsManager* fs_manager, + std::string tablet_id, + std::string peer_uuid) + : fs_manager_(CHECK_NOTNULL(fs_manager)), + tablet_id_(std::move(tablet_id)), + peer_uuid_(std::move(peer_uuid)), + has_pending_config_(false) {} + +std::string ConsensusMetadata::LogPrefix() const { + return Substitute("T $0 P $1: ", tablet_id_, peer_uuid_); +} + +void ConsensusMetadata::UpdateActiveRole() { + ConsensusStatePB cstate = ToConsensusStatePB(CONSENSUS_CONFIG_ACTIVE); + active_role_ = GetConsensusRole(peer_uuid_, cstate); + VLOG_WITH_PREFIX(1) << "Updating active role to " << RaftPeerPB::Role_Name(active_role_) + << ". 
Consensus state: " << cstate.ShortDebugString(); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/consensus_meta.h b/src/kudu/consensus/consensus_meta.h new file mode 100644 index 000000000000..af1424a6c269 --- /dev/null +++ b/src/kudu/consensus/consensus_meta.h @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_CONSENSUS_META_H_ +#define KUDU_CONSENSUS_CONSENSUS_META_H_ + +#include +#include + +#include "kudu/consensus/metadata.pb.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { + +class FsManager; + +namespace consensus { + +// Provides methods to read, write, and persist consensus-related metadata. +// This partly corresponds to Raft Figure 2's "Persistent state on all servers". +// +// In addition to the persistent state, this class also provides access to some +// transient state. This includes the peer that this node considers to be the +// leader of the configuration, as well as the "pending" configuration, if any. 
+// +// Conceptually, a pending configuration is one that has been proposed via a config +// change operation (AddServer or RemoveServer from Chapter 4 of Diego Ongaro's +// Raft thesis) but has not yet been committed. According to the above spec, +// as soon as a server hears of a new cluster membership configuration, it must +// be adopted (even prior to be committed). +// +// The data structure difference between a committed configuration and a pending one +// is that opid_index (the index in the log of the committed config change +// operation) is always set in a committed configuration, while it is always unset in +// a pending configuration. +// +// Finally, this class exposes the concept of an "active" configuration, which means +// the pending configuration if a pending configuration is set, otherwise the committed +// configuration. +// +// This class is not thread-safe and requires external synchronization. +class ConsensusMetadata { + public: + // Create a ConsensusMetadata object with provided initial state. + // Encoded PB is flushed to disk before returning. + static Status Create(FsManager* fs_manager, + const std::string& tablet_id, + const std::string& peer_uuid, + const RaftConfigPB& config, + int64_t current_term, + gscoped_ptr* cmeta); + + // Load a ConsensusMetadata object from disk. + // Returns Status::NotFound if the file could not be found. May return other + // Status codes if unable to read the file. + static Status Load(FsManager* fs_manager, + const std::string& tablet_id, + const std::string& peer_uuid, + gscoped_ptr* cmeta); + + // Delete the ConsensusMetadata file associated with the given tablet from + // disk. + static Status DeleteOnDiskData(FsManager* fs_manager, const std::string& tablet_id); + + // Accessors for current term. + const int64_t current_term() const; + void set_current_term(int64_t term); + + // Accessors for voted_for. 
+ bool has_voted_for() const; + const std::string& voted_for() const; + void clear_voted_for(); + void set_voted_for(const std::string& uuid); + + // Accessors for committed configuration. + const RaftConfigPB& committed_config() const; + void set_committed_config(const RaftConfigPB& config); + + // Returns whether a pending configuration is set. + bool has_pending_config() const; + + // Returns the pending configuration if one is set. Otherwise, fires a DCHECK. + const RaftConfigPB& pending_config() const; + + // Set & clear the pending configuration. + void clear_pending_config(); + void set_pending_config(const RaftConfigPB& config); + + // If a pending configuration is set, return it. + // Otherwise, return the committed configuration. + const RaftConfigPB& active_config() const; + + // Accessors for setting the active leader. + const std::string& leader_uuid() const; + void set_leader_uuid(const std::string& uuid); + + // Returns the currently active role of the current node. + RaftPeerPB::Role active_role() const; + + // Copy the stored state into a ConsensusStatePB object. + // To get the active configuration, specify 'type' = ACTIVE. + // Otherwise, 'type' = COMMITTED will return a version of the + // ConsensusStatePB using only the committed configuration. In this case, if the + // current leader is not a member of the committed configuration, then the + // leader_uuid field of the returned ConsensusStatePB will be cleared. + ConsensusStatePB ToConsensusStatePB(ConsensusConfigType type) const; + + // Merge the committed consensus state from the source node during remote + // bootstrap. + // + // This method will clear any pending config change, replace the committed + // consensus config with the one in 'committed_cstate', and clear the + // currently tracked leader. + // + // It will also check whether the current term passed in 'committed_cstate' + // is greater than the currently recorded one. 
If so, it will update the + // local current term to match the passed one and it will clear the voting + // record for this node. If the current term in 'committed_cstate' is less + // than the locally recorded term, the locally recorded term and voting + // record are not changed. + void MergeCommittedConsensusStatePB(const ConsensusStatePB& committed_cstate); + + // Persist current state of the protobuf to disk. + Status Flush(); + + private: + ConsensusMetadata(FsManager* fs_manager, std::string tablet_id, + std::string peer_uuid); + + std::string LogPrefix() const; + + // Updates the cached active role. + void UpdateActiveRole(); + + // Transient fields. + // Constants: + FsManager* const fs_manager_; + const std::string tablet_id_; + const std::string peer_uuid_; + // Mutable: + std::string leader_uuid_; // Leader of the current term (term == pb_.current_term). + bool has_pending_config_; // Indicates whether there is an as-yet uncommitted + // configuration change pending. + // RaftConfig used by the peers when there is a pending config change operation. + RaftConfigPB pending_config_; + + // Cached role of the peer_uuid_ within the active configuration. + RaftPeerPB::Role active_role_; + + // Durable fields. + ConsensusMetadataPB pb_; + + DISALLOW_COPY_AND_ASSIGN(ConsensusMetadata); +}; + +} // namespace consensus +} // namespace kudu + +#endif // KUDU_CONSENSUS_CONSENSUS_META_H_ diff --git a/src/kudu/consensus/consensus_peers-test.cc b/src/kudu/consensus/consensus_peers-test.cc new file mode 100644 index 000000000000..7e5d5363f3c3 --- /dev/null +++ b/src/kudu/consensus/consensus_peers-test.cc @@ -0,0 +1,329 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +METRIC_DECLARE_entity(tablet); + +namespace kudu { +namespace consensus { + +using log::Log; +using log::LogOptions; +using log::LogAnchorRegistry; + +const char* kTabletId = "test-peers-tablet"; +const char* kLeaderUuid = "peer-0"; +const char* kFollowerUuid = "peer-1"; + +class ConsensusPeersTest : public KuduTest { + public: + ConsensusPeersTest() + : metric_entity_(METRIC_ENTITY_tablet.Instantiate(&metric_registry_, "peer-test")), + schema_(GetSimpleTestSchema()) { + CHECK_OK(ThreadPoolBuilder("test-peer-pool").set_max_threads(1).Build(&pool_)); + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + CHECK_OK(fs_manager_->CreateInitialFileSystemLayout()); + CHECK_OK(Log::Open(options_, + fs_manager_.get(), + kTabletId, + schema_, + 0, // schema_version + NULL, + &log_)); + 
clock_.reset(new server::HybridClock()); + ASSERT_OK(clock_->Init()); + + consensus_.reset(new TestRaftConsensusQueueIface()); + message_queue_.reset(new PeerMessageQueue(metric_entity_, + log_.get(), + FakeRaftPeerPB(kLeaderUuid), + kTabletId)); + message_queue_->RegisterObserver(consensus_.get()); + } + + virtual void TearDown() OVERRIDE { + CHECK_OK(log_->WaitUntilAllFlushed()); + } + + DelayablePeerProxy* NewRemotePeer( + const string& peer_name, + gscoped_ptr* peer) { + RaftPeerPB peer_pb; + peer_pb.set_permanent_uuid(peer_name); + auto proxy_ptr = new DelayablePeerProxy( + pool_.get(), new NoOpTestPeerProxy(pool_.get(), peer_pb)); + gscoped_ptr proxy(proxy_ptr); + CHECK_OK(Peer::NewRemotePeer(peer_pb, + kTabletId, + kLeaderUuid, + message_queue_.get(), + pool_.get(), + proxy.Pass(), + peer)); + return proxy_ptr; + } + + void CheckLastLogEntry(int term, int index) { + OpId id; + log_->GetLatestEntryOpId(&id); + ASSERT_EQ(id.term(), term); + ASSERT_EQ(id.index(), index); + } + + void CheckLastRemoteEntry(DelayablePeerProxy* proxy, int term, int index) { + OpId id; + id.CopyFrom(proxy->proxy()->last_received()); + ASSERT_EQ(id.term(), term); + ASSERT_EQ(id.index(), index); + } + + // Registers a callback triggered when the op with the provided term and index + // is committed in the test consensus impl. + // This must be called _before_ the operation is committed. 
+ void WaitForMajorityReplicatedIndex(int index) { + for (int i = 0; i < 100; i++) { + if (consensus_->IsMajorityReplicated(index)) { + return; + } + SleepFor(MonoDelta::FromMilliseconds(i)); + } + FAIL() << "Never replicated index " << index << " on a majority"; + } + + protected: + gscoped_ptr consensus_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + gscoped_ptr fs_manager_; + scoped_refptr log_; + gscoped_ptr message_queue_; + const Schema schema_; + LogOptions options_; + gscoped_ptr pool_; + scoped_refptr clock_; +}; + + +// Tests that a remote peer is correctly built and tracked +// by the message queue. +// After the operations are considered done the proxy (which +// simulates the other endpoint) should reflect the replicated +// messages. +TEST_F(ConsensusPeersTest, TestRemotePeer) { + // We use a majority size of 2 since we make one fake remote peer + // in addition to our real local log. + message_queue_->Init(MinimumOpId()); + message_queue_->SetLeaderMode(MinimumOpId(), + MinimumOpId().term(), + BuildRaftConfigPBForTests(3)); + + gscoped_ptr remote_peer; + DelayablePeerProxy* proxy = + NewRemotePeer(kFollowerUuid, &remote_peer); + + // Append a bunch of messages to the queue + AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 1, 20); + + // The above append ends up appending messages in term 2, so we + // update the peer's term to match. + remote_peer->SetTermForTest(2); + + // signal the peer there are requests pending. + remote_peer->SignalRequest(); + // now wait on the status of the last operation + // this will complete once the peer has logged all + // requests. + WaitForMajorityReplicatedIndex(20); + // verify that the replicated watermark corresponds to the last replicated + // message. 
+ CheckLastRemoteEntry(proxy, 2, 20); +} + +TEST_F(ConsensusPeersTest, TestRemotePeers) { + message_queue_->Init(MinimumOpId()); + message_queue_->SetLeaderMode(MinimumOpId(), + MinimumOpId().term(), + BuildRaftConfigPBForTests(3)); + + // Create a set of remote peers + gscoped_ptr remote_peer1; + DelayablePeerProxy* remote_peer1_proxy = + NewRemotePeer("peer-1", &remote_peer1); + + gscoped_ptr remote_peer2; + DelayablePeerProxy* remote_peer2_proxy = + NewRemotePeer("peer-2", &remote_peer2); + + // Delay the response from the second remote peer. + remote_peer2_proxy->DelayResponse(); + + // Append one message to the queue. + AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 1, 1); + + OpId first = MakeOpId(0, 1); + + remote_peer1->SignalRequest(); + remote_peer2->SignalRequest(); + + // Now wait for the message to be replicated, this should succeed since + // majority = 2 and only one peer was delayed. The majority is made up + // of remote-peer1 and the local log. + WaitForMajorityReplicatedIndex(first.index()); + + CheckLastLogEntry(first.term(), first.index()); + CheckLastRemoteEntry(remote_peer1_proxy, first.term(), first.index()); + + remote_peer2_proxy->Respond(TestPeerProxy::kUpdate); + // Wait until all peers have replicated the message, otherwise + // when we add the next one remote_peer2 might find the next message + // in the queue and will replicate it, which is not what we want. + while (!OpIdEquals(message_queue_->GetAllReplicatedIndexForTests(), first)) { + SleepFor(MonoDelta::FromMilliseconds(1)); + } + + // Now append another message to the queue + AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 2, 1); + + // We should not see it replicated, even after 10ms, + // since only the local peer replicates the message. + SleepFor(MonoDelta::FromMilliseconds(10)); + ASSERT_FALSE(consensus_->IsMajorityReplicated(2)); + + // Signal one of the two remote peers. 
+ remote_peer1->SignalRequest(); + // We should now be able to wait for it to replicate, since two peers (a majority) + // have replicated the message. + WaitForMajorityReplicatedIndex(2); +} + +// Regression test for KUDU-699: even if a peer isn't making progress, +// and thus always has data pending, we should be able to close the peer. +TEST_F(ConsensusPeersTest, TestCloseWhenRemotePeerDoesntMakeProgress) { + message_queue_->Init(MinimumOpId()); + message_queue_->SetLeaderMode(MinimumOpId(), + MinimumOpId().term(), + BuildRaftConfigPBForTests(3)); + + auto mock_proxy = new MockedPeerProxy(pool_.get()); + gscoped_ptr peer; + ASSERT_OK(Peer::NewRemotePeer(FakeRaftPeerPB(kFollowerUuid), + kTabletId, + kLeaderUuid, + message_queue_.get(), + pool_.get(), + gscoped_ptr(mock_proxy), + &peer)); + + // Make the peer respond without making any progress -- it always returns + // that it has only replicated op 0.0. When we see the response, we always + // decide that more data is pending, and we want to send another request. + ConsensusResponsePB peer_resp; + peer_resp.set_responder_uuid(kFollowerUuid); + peer_resp.set_responder_term(0); + peer_resp.mutable_status()->mutable_last_received()->CopyFrom( + MakeOpId(0, 0)); + peer_resp.mutable_status()->mutable_last_received_current_leader()->CopyFrom( + MakeOpId(0, 0)); + peer_resp.mutable_status()->set_last_committed_idx(0); + + mock_proxy->set_update_response(peer_resp); + + // Add an op to the queue and start sending requests to the peer. + AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 1, 1); + peer->SignalRequest(true); + + // We should be able to close the peer even though it has more data pending. 
+ peer->Close(); +} + +TEST_F(ConsensusPeersTest, TestDontSendOneRpcPerWriteWhenPeerIsDown) { + message_queue_->Init(MinimumOpId()); + message_queue_->SetLeaderMode(MinimumOpId(), + MinimumOpId().term(), + BuildRaftConfigPBForTests(3)); + + auto mock_proxy = new MockedPeerProxy(pool_.get()); + gscoped_ptr peer; + ASSERT_OK(Peer::NewRemotePeer(FakeRaftPeerPB(kFollowerUuid), + kTabletId, + kLeaderUuid, + message_queue_.get(), + pool_.get(), + gscoped_ptr(mock_proxy), + &peer)); + + // Initial response has to be successful -- otherwise we'll consider the peer + // "new" and only send heartbeat RPCs. + ConsensusResponsePB initial_resp; + initial_resp.set_responder_uuid(kFollowerUuid); + initial_resp.set_responder_term(0); + initial_resp.mutable_status()->mutable_last_received()->CopyFrom( + MakeOpId(1, 1)); + initial_resp.mutable_status()->mutable_last_received_current_leader()->CopyFrom( + MakeOpId(1, 1)); + initial_resp.mutable_status()->set_last_committed_idx(0); + mock_proxy->set_update_response(initial_resp); + + AppendReplicateMessagesToQueue(message_queue_.get(), clock_, 1, 1); + peer->SignalRequest(true); + + // Now wait for the message to be replicated, this should succeed since + // the local (leader) peer always acks and the follower also acked this time. + WaitForMajorityReplicatedIndex(1); + + // Set up the peer to respond with an error. + ConsensusResponsePB error_resp; + error_resp.mutable_error()->set_code(tserver::TabletServerErrorPB::UNKNOWN_ERROR); + StatusToPB(Status::NotFound("fake error"), error_resp.mutable_error()->mutable_status()); + mock_proxy->set_update_response(error_resp); + + // Add a bunch of messages to the queue. + for (int i = 2; i <= 100; i++) { + AppendReplicateMessagesToQueue(message_queue_.get(), clock_, i, 1); + peer->SignalRequest(false); + SleepFor(MonoDelta::FromMilliseconds(2)); + } + + // Check that we didn't attempt to send one UpdateConsensus call per + // Write. 
100 writes might have taken a second or two, though, so it's + // OK to have called UpdateConsensus() a few times due to regularly + // scheduled heartbeats. + ASSERT_LT(mock_proxy->update_count(), 5); +} + +} // namespace consensus +} // namespace kudu + diff --git a/src/kudu/consensus/consensus_peers.cc b/src/kudu/consensus/consensus_peers.cc new file mode 100644 index 000000000000..c27ae830bcc6 --- /dev/null +++ b/src/kudu/consensus/consensus_peers.cc @@ -0,0 +1,470 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/consensus.proxy.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/consensus_queue.h" +#include "kudu/consensus/log.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/fault_injection.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/threadpool.h" + +DEFINE_int32(consensus_rpc_timeout_ms, 1000, + "Timeout used for all consensus internal RPC communications."); +TAG_FLAG(consensus_rpc_timeout_ms, advanced); + +DEFINE_int32(raft_get_node_instance_timeout_ms, 30000, + "Timeout for retrieving node instance data over RPC."); +TAG_FLAG(consensus_rpc_timeout_ms, hidden); + +DECLARE_int32(raft_heartbeat_interval_ms); + +DEFINE_double(fault_crash_on_leader_request_fraction, 0.0, + "Fraction of the time when the leader will crash just before sending an " + "UpdateConsensus RPC. (For testing only!)"); +TAG_FLAG(fault_crash_on_leader_request_fraction, unsafe); + + +// Allow for disabling remote bootstrap in unit tests where we want to test +// certain scenarios without triggering bootstrap of a remote peer. +DEFINE_bool(enable_remote_bootstrap, true, + "Whether remote bootstrap will be initiated by the leader when it " + "detects that a follower is out of date or does not have a tablet " + "replica. 
For testing purposes only."); +TAG_FLAG(enable_remote_bootstrap, unsafe); + +namespace kudu { +namespace consensus { + +using log::Log; +using log::LogEntryBatch; +using std::shared_ptr; +using rpc::Messenger; +using rpc::RpcController; +using strings::Substitute; + +Status Peer::NewRemotePeer(const RaftPeerPB& peer_pb, + const string& tablet_id, + const string& leader_uuid, + PeerMessageQueue* queue, + ThreadPool* thread_pool, + gscoped_ptr proxy, + gscoped_ptr* peer) { + + gscoped_ptr new_peer(new Peer(peer_pb, + tablet_id, + leader_uuid, + proxy.Pass(), + queue, + thread_pool)); + RETURN_NOT_OK(new_peer->Init()); + peer->reset(new_peer.release()); + return Status::OK(); +} + +Peer::Peer(const RaftPeerPB& peer_pb, string tablet_id, string leader_uuid, + gscoped_ptr proxy, PeerMessageQueue* queue, + ThreadPool* thread_pool) + : tablet_id_(std::move(tablet_id)), + leader_uuid_(std::move(leader_uuid)), + peer_pb_(peer_pb), + proxy_(proxy.Pass()), + queue_(queue), + failed_attempts_(0), + sem_(1), + heartbeater_( + peer_pb.permanent_uuid(), + MonoDelta::FromMilliseconds(FLAGS_raft_heartbeat_interval_ms), + boost::bind(&Peer::SignalRequest, this, true)), + thread_pool_(thread_pool), + state_(kPeerCreated) {} + +void Peer::SetTermForTest(int term) { + response_.set_responder_term(term); +} + +Status Peer::Init() { + boost::lock_guard lock(peer_lock_); + queue_->TrackPeer(peer_pb_.permanent_uuid()); + RETURN_NOT_OK(heartbeater_.Start()); + state_ = kPeerStarted; + return Status::OK(); +} + +Status Peer::SignalRequest(bool even_if_queue_empty) { + // If the peer is currently sending, return Status::OK(). + // If there are new requests in the queue we'll get them on ProcessResponse(). 
+ if (!sem_.TryAcquire()) { + return Status::OK(); + } + { + boost::lock_guard l(peer_lock_); + + if (PREDICT_FALSE(state_ == kPeerClosed)) { + sem_.Release(); + return Status::IllegalState("Peer was closed."); + } + + // For the first request sent by the peer, we send it even if the queue is empty, + // which it will always appear to be for the first request, since this is the + // negotiation round. + if (PREDICT_FALSE(state_ == kPeerStarted)) { + even_if_queue_empty = true; + state_ = kPeerRunning; + } + DCHECK_EQ(state_, kPeerRunning); + + // If our last request generated an error, and this is not a normal + // heartbeat request, then don't send the "per-RPC" request. Instead, + // we'll wait for the heartbeat. + // + // TODO: we could consider looking at the number of consecutive failed + // attempts, and instead of ignoring the signal, ask the heartbeater + // to "expedite" the next heartbeat in order to achieve something like + // exponential backoff after an error. As it is implemented today, any + // transient error will result in a latency blip as long as the heartbeat + // period. + if (failed_attempts_ > 0 && !even_if_queue_empty) { + sem_.Release(); + return Status::OK(); + } + } + + + RETURN_NOT_OK(thread_pool_->SubmitClosure( + Bind(&Peer::SendNextRequest, Unretained(this), even_if_queue_empty))); + return Status::OK(); +} + +void Peer::SendNextRequest(bool even_if_queue_empty) { + // The peer has no pending request nor is sending: send the request. + bool needs_remote_bootstrap = false; + int64_t commit_index_before = request_.has_committed_index() ? + request_.committed_index().index() : kMinimumOpIdIndex; + Status s = queue_->RequestForPeer(peer_pb_.permanent_uuid(), &request_, + &replicate_msg_refs_, &needs_remote_bootstrap); + int64_t commit_index_after = request_.has_committed_index() ? 
+ request_.committed_index().index() : kMinimumOpIdIndex; + + if (PREDICT_FALSE(!s.ok())) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Could not obtain request from queue for peer: " + << peer_pb_.permanent_uuid() << ". Status: " << s.ToString(); + sem_.Release(); + return; + } + + if (PREDICT_FALSE(needs_remote_bootstrap)) { + Status s = SendRemoteBootstrapRequest(); + if (!s.ok()) { + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Unable to generate remote bootstrap request for peer: " + << s.ToString(); + sem_.Release(); + } + return; + } + + request_.set_tablet_id(tablet_id_); + request_.set_caller_uuid(leader_uuid_); + request_.set_dest_uuid(peer_pb_.permanent_uuid()); + + bool req_has_ops = request_.ops_size() > 0 || (commit_index_after > commit_index_before); + // If the queue is empty, check if we were told to send a status-only + // message, if not just return. + if (PREDICT_FALSE(!req_has_ops && !even_if_queue_empty)) { + sem_.Release(); + return; + } + + // If we're actually sending ops there's no need to heartbeat for a while, + // reset the heartbeater + if (req_has_ops) { + heartbeater_.Reset(); + } + + MAYBE_FAULT(FLAGS_fault_crash_on_leader_request_fraction); + + + VLOG_WITH_PREFIX_UNLOCKED(2) << "Sending to peer " << peer_pb().permanent_uuid() << ": " + << request_.ShortDebugString(); + controller_.Reset(); + + proxy_->UpdateAsync(&request_, &response_, &controller_, + boost::bind(&Peer::ProcessResponse, this)); +} + +void Peer::ProcessResponse() { + // Note: This method runs on the reactor thread. + + DCHECK_EQ(0, sem_.GetValue()) + << "Got a response when nothing was pending"; + + if (!controller_.status().ok()) { + if (controller_.status().IsRemoteError()) { + // Most controller errors are caused by network issues or corner cases + // like shutdown and failure to serialize a protobuf. Therefore, we + // generally consider these errors to indicate an unreachable peer. 
+ // However, a RemoteError wraps some other error propagated from the + // remote peer, so we know the remote is alive. Therefore, we will let + // the queue know that the remote is responsive. + queue_->NotifyPeerIsResponsiveDespiteError(peer_pb_.permanent_uuid()); + } + ProcessResponseError(controller_.status()); + return; + } + + // Pass through errors we can respond to, like not found, since in that case + // we will need to remotely bootstrap. TODO: Handle DELETED response once implemented. + if ((response_.has_error() && + response_.error().code() != tserver::TabletServerErrorPB::TABLET_NOT_FOUND) || + (response_.status().has_error() && + response_.status().error().code() == consensus::ConsensusErrorPB::CANNOT_PREPARE)) { + // Again, let the queue know that the remote is still responsive, since we + // will not be sending this error response through to the queue. + queue_->NotifyPeerIsResponsiveDespiteError(peer_pb_.permanent_uuid()); + ProcessResponseError(StatusFromPB(response_.error().status())); + return; + } + + // The queue's handling of the peer response may generate IO (reads against + // the WAL) and SendNextRequest() may do the same thing. So we run the rest + // of the response handling logic on our thread pool and not on the reactor + // thread. 
+ Status s = thread_pool_->SubmitClosure(Bind(&Peer::DoProcessResponse, Unretained(this))); + if (PREDICT_FALSE(!s.ok())) { + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Unable to process peer response: " << s.ToString() + << ": " << response_.ShortDebugString(); + sem_.Release(); + } +} + +void Peer::DoProcessResponse() { + failed_attempts_ = 0; + + VLOG_WITH_PREFIX_UNLOCKED(2) << "Response from peer " << peer_pb().permanent_uuid() << ": " + << response_.ShortDebugString(); + + bool more_pending; + queue_->ResponseFromPeer(peer_pb_.permanent_uuid(), response_, &more_pending); + + // We're OK to read the state_ without a lock here -- if we get a race, + // the worst thing that could happen is that we'll make one more request before + // noticing a close. + if (more_pending && ANNOTATE_UNPROTECTED_READ(state_) != kPeerClosed) { + SendNextRequest(true); + } else { + sem_.Release(); + } +} + +Status Peer::SendRemoteBootstrapRequest() { + if (!FLAGS_enable_remote_bootstrap) { + failed_attempts_++; + return Status::NotSupported("remote bootstrap is disabled"); + } + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Sending request to remotely bootstrap"; + RETURN_NOT_OK(queue_->GetRemoteBootstrapRequestForPeer(peer_pb_.permanent_uuid(), &rb_request_)); + controller_.Reset(); + proxy_->StartRemoteBootstrap(&rb_request_, &rb_response_, &controller_, + boost::bind(&Peer::ProcessRemoteBootstrapResponse, this)); + return Status::OK(); +} + +void Peer::ProcessRemoteBootstrapResponse() { + // We treat remote bootstrap as fire-and-forget. + if (rb_response_.has_error()) { + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Unable to begin remote bootstrap on peer: " + << rb_response_.ShortDebugString(); + } + sem_.Release(); +} + +void Peer::ProcessResponseError(const Status& status) { + failed_attempts_++; + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Couldn't send request to peer " << peer_pb_.permanent_uuid() + << " for tablet " << tablet_id_ + << " Status: " << status.ToString() << ". 
Retrying in the next heartbeat period." + << " Already tried " << failed_attempts_ << " times."; + sem_.Release(); +} + +string Peer::LogPrefixUnlocked() const { + return Substitute("T $0 P $1 -> Peer $2 ($3:$4): ", + tablet_id_, leader_uuid_, peer_pb_.permanent_uuid(), + peer_pb_.last_known_addr().host(), peer_pb_.last_known_addr().port()); +} + +void Peer::Close() { + WARN_NOT_OK(heartbeater_.Stop(), "Could not stop heartbeater"); + + // If the peer is already closed return. + { + boost::lock_guard lock(peer_lock_); + if (state_ == kPeerClosed) return; + DCHECK(state_ == kPeerRunning || state_ == kPeerStarted) << "Unexpected state: " << state_; + state_ = kPeerClosed; + } + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Closing peer: " << peer_pb_.permanent_uuid(); + + // Acquire the semaphore to wait for any concurrent request to finish. + // They will see the state_ == kPeerClosed and not start any new requests, + // but we can't currently cancel the already-sent ones. (see KUDU-699) + boost::lock_guard l(sem_); + queue_->UntrackPeer(peer_pb_.permanent_uuid()); + // We don't own the ops (the queue does). 
+ request_.mutable_ops()->ExtractSubrange(0, request_.ops_size(), nullptr); +} + +Peer::~Peer() { + Close(); +} + + +RpcPeerProxy::RpcPeerProxy(gscoped_ptr hostport, + gscoped_ptr consensus_proxy) + : hostport_(hostport.Pass()), + consensus_proxy_(consensus_proxy.Pass()) { +} + +void RpcPeerProxy::UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) { + controller->set_timeout(MonoDelta::FromMilliseconds(FLAGS_consensus_rpc_timeout_ms)); + consensus_proxy_->UpdateConsensusAsync(*request, response, controller, callback); +} + +void RpcPeerProxy::RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) { + consensus_proxy_->RequestConsensusVoteAsync(*request, response, controller, callback); +} + +void RpcPeerProxy::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB* request, + StartRemoteBootstrapResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) { + consensus_proxy_->StartRemoteBootstrapAsync(*request, response, controller, callback); +} + +RpcPeerProxy::~RpcPeerProxy() {} + +namespace { + +Status CreateConsensusServiceProxyForHost(const shared_ptr& messenger, + const HostPort& hostport, + gscoped_ptr* new_proxy) { + vector addrs; + RETURN_NOT_OK(hostport.ResolveAddresses(&addrs)); + if (addrs.size() > 1) { + LOG(WARNING)<< "Peer address '" << hostport.ToString() << "' " + << "resolves to " << addrs.size() << " different addresses. 
Using " + << addrs[0].ToString(); + } + new_proxy->reset(new ConsensusServiceProxy(messenger, addrs[0])); + return Status::OK(); +} + +} // anonymous namespace + +RpcPeerProxyFactory::RpcPeerProxyFactory(shared_ptr messenger) + : messenger_(std::move(messenger)) {} + +Status RpcPeerProxyFactory::NewProxy(const RaftPeerPB& peer_pb, + gscoped_ptr* proxy) { + gscoped_ptr hostport(new HostPort); + RETURN_NOT_OK(HostPortFromPB(peer_pb.last_known_addr(), hostport.get())); + gscoped_ptr new_proxy; + RETURN_NOT_OK(CreateConsensusServiceProxyForHost(messenger_, *hostport, &new_proxy)); + proxy->reset(new RpcPeerProxy(hostport.Pass(), new_proxy.Pass())); + return Status::OK(); +} + +RpcPeerProxyFactory::~RpcPeerProxyFactory() {} + +Status SetPermanentUuidForRemotePeer(const shared_ptr& messenger, + RaftPeerPB* remote_peer) { + DCHECK(!remote_peer->has_permanent_uuid()); + HostPort hostport; + RETURN_NOT_OK(HostPortFromPB(remote_peer->last_known_addr(), &hostport)); + gscoped_ptr proxy; + RETURN_NOT_OK(CreateConsensusServiceProxyForHost(messenger, hostport, &proxy)); + GetNodeInstanceRequestPB req; + GetNodeInstanceResponsePB resp; + rpc::RpcController controller; + + // TODO generalize this exponential backoff algorithm, as we do the + // same thing in catalog_manager.cc + // (AsyncTabletRequestTask::RpcCallBack). + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(FLAGS_raft_get_node_instance_timeout_ms)); + int attempt = 1; + while (true) { + VLOG(2) << "Getting uuid from remote peer. 
Request: " << req.ShortDebugString(); + + controller.Reset(); + Status s = proxy->GetNodeInstance(req, &resp, &controller); + if (s.ok()) { + if (controller.status().ok()) { + break; + } + s = controller.status(); + } + + LOG(WARNING) << "Error getting permanent uuid from config peer " << hostport.ToString() << ": " + << s.ToString(); + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (now.ComesBefore(deadline)) { + int64_t remaining_ms = deadline.GetDeltaSince(now).ToMilliseconds(); + int64_t base_delay_ms = 1 << (attempt + 3); // 1st retry delayed 2^4 ms, 2nd 2^5, etc.. + int64_t jitter_ms = rand() % 50; // Add up to 50ms of additional random delay. + int64_t delay_ms = std::min(base_delay_ms + jitter_ms, remaining_ms); + VLOG(1) << "Sleeping " << delay_ms << " ms. before retrying to get uuid from remote peer..."; + SleepFor(MonoDelta::FromMilliseconds(delay_ms)); + LOG(INFO) << "Retrying to get permanent uuid for remote peer: " + << remote_peer->ShortDebugString() << " attempt: " << attempt++; + } else { + s = Status::TimedOut(Substitute("Getting permanent uuid from $0 timed out after $1 ms.", + hostport.ToString(), + FLAGS_raft_get_node_instance_timeout_ms), + s.ToString()); + return s; + } + } + remote_peer->set_permanent_uuid(resp.node_instance().permanent_uuid()); + return Status::OK(); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/consensus_peers.h b/src/kudu/consensus/consensus_peers.h new file mode 100644 index 000000000000..e10856fde059 --- /dev/null +++ b/src/kudu/consensus/consensus_peers.h @@ -0,0 +1,324 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CONSENSUS_CONSENSUS_PEERS_H_ +#define KUDU_CONSENSUS_CONSENSUS_PEERS_H_ + +#include +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/rpc/response_callback.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/locks.h" +#include "kudu/util/resettable_heartbeater.h" +#include "kudu/util/semaphore.h" +#include "kudu/util/status.h" + +namespace kudu { +class HostPort; +class ThreadPool; + +namespace log { +class Log; +} + +namespace rpc { +class Messenger; +class RpcController; +} + +namespace consensus { +class ConsensusServiceProxy; +class OpId; +class PeerProxy; +class PeerProxyFactory; +class PeerMessageQueue; +class VoteRequestPB; +class VoteResponsePB; + +// A peer in consensus (local or remote). +// +// Leaders use peers to update the local Log and remote replicas. +// +// Peers are owned by the consensus implementation and do not keep +// state aside from whether there are requests pending or if requests +// are being processed. +// +// There are two external actions that trigger a state change: +// +// SignalRequest(): Called by the consensus implementation, notifies +// that the queue contains messages to be processed. +// +// ProcessResponse() Called a response from a peer is received. +// +// The following state diagrams describe what happens when a state +// changing method is called. 
+// +// + +// | +// SignalRequest() | +// | +// | +// v +// +------------------+ +// +------+ processing ? +-----+ +// | +------------------+ | +// | | +// | Yes | No +// | | +// v v +// return ProcessNextRequest() +// processing = true +// - get reqs. from queue +// - update peer async +// return +// +// + +// | +// ProcessResponse() | +// processing = false | +// v +// +------------------+ +// +------+ more pending? +-----+ +// | +------------------+ | +// | | +// | Yes | No +// | | +// v v +// SignalRequest() return +// +class Peer { + public: + // Initializes a peer and get its status. + Status Init(); + + // Signals that this peer has a new request to replicate/store. + // 'force_if_queue_empty' indicates whether the peer should force + // send the request even if the queue is empty. This is used for + // status-only requests. + Status SignalRequest(bool force_if_queue_empty = false); + + const RaftPeerPB& peer_pb() const { return peer_pb_; } + + // Returns the PeerProxy if this is a remote peer or NULL if it + // isn't. Used for tests to fiddle with the proxy and emulate remote + // behavior. + PeerProxy* GetPeerProxyForTests(); + + void Close(); + + void SetTermForTest(int term); + + ~Peer(); + + // Creates a new remote peer and makes the queue track it.' + // + // Requests to this peer (which may end up doing IO to read non-cached + // log entries) are assembled on 'thread_pool'. + // Response handling may also involve IO related to log-entry lookups and is + // also done on 'thread_pool'. 
+ static Status NewRemotePeer(const RaftPeerPB& peer_pb, + const std::string& tablet_id, + const std::string& leader_uuid, + PeerMessageQueue* queue, + ThreadPool* thread_pool, + gscoped_ptr proxy, + gscoped_ptr* peer); + + private: + Peer(const RaftPeerPB& peer, std::string tablet_id, std::string leader_uuid, + gscoped_ptr proxy, PeerMessageQueue* queue, + ThreadPool* thread_pool); + + void SendNextRequest(bool even_if_queue_empty); + + // Signals that a response was received from the peer. + // This method is called from the reactor thread and calls + // DoProcessResponse() on thread_pool_ to do any work that requires IO or + // lock-taking. + void ProcessResponse(); + + // Run on 'thread_pool'. Does response handling that requires IO or may block. + void DoProcessResponse(); + + // Fetch the desired remote bootstrap request from the queue and send it + // to the peer. The callback goes to ProcessRemoteBootstrapResponse(). + // + // Returns a bad Status if remote bootstrap is disabled, or if the + // request cannot be generated for some reason. + Status SendRemoteBootstrapRequest(); + + // Handle RPC callback from initiating remote bootstrap. + void ProcessRemoteBootstrapResponse(); + + // Signals there was an error sending the request to the peer. + void ProcessResponseError(const Status& status); + + std::string LogPrefixUnlocked() const; + + const std::string& tablet_id() const { return tablet_id_; } + + const std::string tablet_id_; + const std::string leader_uuid_; + + RaftPeerPB peer_pb_; + + gscoped_ptr proxy_; + + PeerMessageQueue* queue_; + uint64_t failed_attempts_; + + // The latest consensus update request and response. + ConsensusRequestPB request_; + ConsensusResponsePB response_; + + // The latest remote bootstrap request and response. + StartRemoteBootstrapRequestPB rb_request_; + StartRemoteBootstrapResponsePB rb_response_; + + // Reference-counted pointers to any ReplicateMsgs which are in-flight to the peer. 
We + // may have loaded these messages from the LogCache, in which case we are potentially + // sharing the same object as other peers. Since the PB request_ itself can't hold + // reference counts, this holds them. + std::vector replicate_msg_refs_; + + rpc::RpcController controller_; + + // Held if there is an outstanding request. + // This is used in order to ensure that we only have a single request + // oustanding at a time, and to wait for the outstanding requests + // at Close(). + Semaphore sem_; + + + // Heartbeater for remote peer implementations. + // This will send status only requests to the remote peers + // whenever we go more than 'FLAGS_raft_heartbeat_interval_ms' + // without sending actual data. + ResettableHeartbeater heartbeater_; + + // Thread pool used to construct requests to this peer. + ThreadPool* thread_pool_; + + enum State { + kPeerCreated, + kPeerStarted, + kPeerRunning, + kPeerClosed + }; + + // lock that protects Peer state changes, initialization, etc. + // Must not try to acquire sem_ while holding peer_lock_. + mutable simple_spinlock peer_lock_; + State state_; +}; + +// A proxy to another peer. Usually a thin wrapper around an rpc proxy but can +// be replaced for tests. +class PeerProxy { + public: + + // Sends a request, asynchronously, to a remote peer. + virtual void UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) = 0; + + // Sends a RequestConsensusVote to a remote peer. + virtual void RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) = 0; + + // Instructs a peer to begin a remote bootstrap session. 
+ virtual void StartRemoteBootstrap(const StartRemoteBootstrapRequestPB* request, + StartRemoteBootstrapResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) { + LOG(DFATAL) << "Not implemented"; + } + + virtual ~PeerProxy() {} +}; + +// A peer proxy factory. Usually just obtains peers through the rpc implementation +// but can be replaced for tests. +class PeerProxyFactory { + public: + + virtual Status NewProxy(const RaftPeerPB& peer_pb, + gscoped_ptr* proxy) = 0; + + virtual ~PeerProxyFactory() {} +}; + +// PeerProxy implementation that does RPC calls +class RpcPeerProxy : public PeerProxy { + public: + RpcPeerProxy(gscoped_ptr hostport, + gscoped_ptr consensus_proxy); + + virtual void UpdateAsync(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE; + + virtual void RequestConsensusVoteAsync(const VoteRequestPB* request, + VoteResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE; + + virtual void StartRemoteBootstrap(const StartRemoteBootstrapRequestPB* request, + StartRemoteBootstrapResponsePB* response, + rpc::RpcController* controller, + const rpc::ResponseCallback& callback) OVERRIDE; + + virtual ~RpcPeerProxy(); + + private: + gscoped_ptr hostport_; + gscoped_ptr consensus_proxy_; +}; + +// PeerProxyFactory implementation that generates RPCPeerProxies +class RpcPeerProxyFactory : public PeerProxyFactory { + public: + explicit RpcPeerProxyFactory(std::shared_ptr messenger); + + virtual Status NewProxy(const RaftPeerPB& peer_pb, + gscoped_ptr* proxy) OVERRIDE; + + virtual ~RpcPeerProxyFactory(); + private: + std::shared_ptr messenger_; +}; + +// Query the consensus service at last known host/port that is +// specified in 'remote_peer' and set the 'permanent_uuid' field based +// on the response. 
+Status SetPermanentUuidForRemotePeer(const std::shared_ptr& messenger, + RaftPeerPB* remote_peer); + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_CONSENSUS_PEERS_H_ */ diff --git a/src/kudu/consensus/consensus_queue-test.cc b/src/kudu/consensus/consensus_queue-test.cc new file mode 100644 index 000000000000..fe101353fdc5 --- /dev/null +++ b/src/kudu/consensus/consensus_queue-test.cc @@ -0,0 +1,819 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus_queue.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/consensus/log-test-base.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +DECLARE_bool(enable_data_block_fsync); +DECLARE_int32(consensus_max_batch_size_bytes); + +METRIC_DECLARE_entity(tablet); + +namespace kudu { +namespace consensus { + +static const char* kLeaderUuid = "peer-0"; +static const char* kPeerUuid = "peer-1"; +static const char* kTestTablet = "test-tablet"; + +class ConsensusQueueTest : public KuduTest { + public: + ConsensusQueueTest() + : schema_(GetSimpleTestSchema()), + metric_entity_(METRIC_ENTITY_tablet.Instantiate(&metric_registry_, "queue-test")), + registry_(new log::LogAnchorRegistry) { + FLAGS_enable_data_block_fsync = false; // Keep unit tests fast. + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + CHECK_OK(log::Log::Open(log::LogOptions(), + fs_manager_.get(), + kTestTablet, + schema_, + 0, // schema_version + NULL, + &log_)); + clock_.reset(new server::HybridClock()); + ASSERT_OK(clock_->Init()); + + consensus_.reset(new TestRaftConsensusQueueIface()); + CloseAndReopenQueue(); + queue_->RegisterObserver(consensus_.get()); + } + + void CloseAndReopenQueue() { + // Blow away the memtrackers before creating the new queue. 
+ queue_.reset(); + queue_.reset(new PeerMessageQueue(metric_entity_, + log_.get(), + FakeRaftPeerPB(kLeaderUuid), + kTestTablet)); + } + + virtual void TearDown() OVERRIDE { + log_->WaitUntilAllFlushed(); + queue_->Close(); + } + + Status AppendReplicateMsg(int term, int index, int payload_size) { + return queue_->AppendOperation( + make_scoped_refptr_replicate(CreateDummyReplicate(term, + index, + clock_->Now(), + payload_size).release())); + } + + // Updates the peer's watermark in the queue so that it matches + // the operation we want, since the queue always assumes that + // when a peer gets tracked it's always tracked starting at the + // last operation in the queue + void UpdatePeerWatermarkToOp(ConsensusRequestPB* request, + ConsensusResponsePB* response, + const OpId& last_received, + const OpId& last_received_current_leader, + int last_committed_idx, + bool* more_pending) { + + queue_->TrackPeer(kPeerUuid); + response->set_responder_uuid(kPeerUuid); + + // Ask for a request. The queue assumes the peer is up-to-date so + // this should contain no operations. + vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(request->ops_size(), 0); + + // Refuse saying that the log matching property check failed and + // that our last operation is actually 'last_received'. + RefuseWithLogPropertyMismatch(response, last_received, last_received_current_leader); + response->mutable_status()->set_last_committed_idx(last_committed_idx); + queue_->ResponseFromPeer(response->responder_uuid(), *response, more_pending); + request->Clear(); + response->mutable_status()->Clear(); + } + + // Like the above but uses the last received index as the commtited index. 
+ void UpdatePeerWatermarkToOp(ConsensusRequestPB* request, + ConsensusResponsePB* response, + const OpId& last_received, + const OpId& last_received_current_leader, + bool* more_pending) { + return UpdatePeerWatermarkToOp(request, response, last_received, + last_received_current_leader, + last_received.index(), more_pending); + } + + void RefuseWithLogPropertyMismatch(ConsensusResponsePB* response, + const OpId& last_received, + const OpId& last_received_current_leader) { + ConsensusStatusPB* status = response->mutable_status(); + status->mutable_last_received()->CopyFrom(last_received); + status->mutable_last_received_current_leader()->CopyFrom(last_received_current_leader); + ConsensusErrorPB* error = status->mutable_error(); + error->set_code(ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH); + StatusToPB(Status::IllegalState("LMP failed."), error->mutable_status()); + } + + void WaitForLocalPeerToAckIndex(int index) { + while (true) { + PeerMessageQueue::TrackedPeer leader = queue_->GetTrackedPeerForTests(kLeaderUuid); + if (leader.last_received.index() >= index) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + } + + // Sets the last received op on the response, as well as the last committed index. + void SetLastReceivedAndLastCommitted(ConsensusResponsePB* response, + const OpId& last_received, + const OpId& last_received_current_leader, + int last_committed_idx) { + *response->mutable_status()->mutable_last_received() = last_received; + *response->mutable_status()->mutable_last_received_current_leader() = + last_received_current_leader; + response->mutable_status()->set_last_committed_idx(last_committed_idx); + } + + // Like the above but uses the same last_received for current term. 
+ void SetLastReceivedAndLastCommitted(ConsensusResponsePB* response, + const OpId& last_received, + int last_committed_idx) { + SetLastReceivedAndLastCommitted(response, last_received, last_received, last_committed_idx); + } + + // Like the above but just sets the last committed index to have the same index + // as the last received op. + void SetLastReceivedAndLastCommitted(ConsensusResponsePB* response, + const OpId& last_received) { + SetLastReceivedAndLastCommitted(response, last_received, last_received.index()); + } + + protected: + gscoped_ptr consensus_; + const Schema schema_; + gscoped_ptr fs_manager_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + scoped_refptr log_; + gscoped_ptr queue_; + scoped_refptr registry_; + scoped_refptr clock_; +}; + +// Tests that the queue is able to track a peer when it starts tracking a peer +// after the initial message in the queue. In particular this creates a queue +// with several messages and then starts to track a peer whose watermark +// falls in the middle of the current messages in the queue. 
+TEST_F(ConsensusQueueTest, TestStartTrackingAfterStart) { + queue_->Init(MinimumOpId()); + queue_->SetLeaderMode(MinimumOpId(), MinimumOpId().term(), BuildRaftConfigPBForTests(2)); + AppendReplicateMessagesToQueue(queue_.get(), clock_, 1, 100); + + ConsensusRequestPB request; + ConsensusResponsePB response; + response.set_responder_uuid(kPeerUuid); + bool more_pending = false; + + // Peer already has some messages, last one being 7.50 + OpId last_received = MakeOpId(7, 50); + OpId last_received_current_leader = MinimumOpId(); + + UpdatePeerWatermarkToOp(&request, &response, last_received, + last_received_current_leader, &more_pending); + ASSERT_TRUE(more_pending); + + // Getting a new request should get all operations after 7.50 + vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(50, request.ops_size()); + + SetLastReceivedAndLastCommitted(&response, request.ops(49).id()); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_FALSE(more_pending) << "Queue still had requests pending"; + + // if we ask for a new request, it should come back empty + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(0, request.ops_size()); + + // extract the ops from the request to avoid double free + request.mutable_ops()->ExtractSubrange(0, request.ops_size(), nullptr); +} + +// Tests that the peers gets the messages pages, with the size of a page +// being 'consensus_max_batch_size_bytes' +TEST_F(ConsensusQueueTest, TestGetPagedMessages) { + queue_->Init(MinimumOpId()); + queue_->SetLeaderMode(MinimumOpId(), MinimumOpId().term(), BuildRaftConfigPBForTests(2)); + + // helper to estimate request size so that we can set the max batch size appropriately + ConsensusRequestPB page_size_estimator; + 
page_size_estimator.set_caller_term(14); + OpId* committed_index = page_size_estimator.mutable_committed_index(); + OpId* preceding_id = page_size_estimator.mutable_preceding_id(); + committed_index->CopyFrom(MinimumOpId()); + preceding_id->CopyFrom(MinimumOpId()); + + // We're going to add 100 messages to the queue so we make each page fetch 9 of those, + // for a total of 12 pages. The last page should have a single op. + const int kOpsPerRequest = 9; + for (int i = 0; i < kOpsPerRequest; i++) { + page_size_estimator.mutable_ops()->AddAllocated( + CreateDummyReplicate(0, 0, clock_->Now(), 0).release()); + } + + // Save the current flag state. + google::FlagSaver saver; + FLAGS_consensus_max_batch_size_bytes = page_size_estimator.ByteSize(); + + ConsensusRequestPB request; + ConsensusResponsePB response; + response.set_responder_uuid(kPeerUuid); + bool more_pending = false; + + UpdatePeerWatermarkToOp(&request, &response, MinimumOpId(), MinimumOpId(), &more_pending); + ASSERT_TRUE(more_pending); + + // Append the messages after the queue is tracked. Otherwise the ops might + // get evicted from the cache immediately and the requests below would + // result in async log reads instead of cache hits. 
+ AppendReplicateMessagesToQueue(queue_.get(), clock_, 1, 100); + + OpId last; + for (int i = 0; i < 11; i++) { + VLOG(1) << "Making request " << i; + vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(kOpsPerRequest, request.ops_size()); + last = request.ops(request.ops_size() -1).id(); + SetLastReceivedAndLastCommitted(&response, last); + VLOG(1) << "Faking received up through " << last; + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_TRUE(more_pending); + } + vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(1, request.ops_size()); + last = request.ops(request.ops_size() -1).id(); + SetLastReceivedAndLastCommitted(&response, last); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_FALSE(more_pending); + + // extract the ops from the request to avoid double free + request.mutable_ops()->ExtractSubrange(0, request.ops_size(), nullptr); +} + +TEST_F(ConsensusQueueTest, TestPeersDontAckBeyondWatermarks) { + queue_->Init(MinimumOpId()); + queue_->SetLeaderMode(MinimumOpId(), MinimumOpId().term(), BuildRaftConfigPBForTests(3)); + AppendReplicateMessagesToQueue(queue_.get(), clock_, 1, 100); + + // Wait for the local peer to append all messages + WaitForLocalPeerToAckIndex(100); + + OpId all_replicated = MakeOpId(14, 100); + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), MinimumOpId()); + // Since we're tracking a single peer still this should have moved the all + // replicated watermark to the last op appended to the local log. 
+ ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MakeOpId(14, 100)); + + // Start to track the peer after the queue has some messages in it + // at a point that is halfway through the current messages in the queue. + OpId first_msg = MakeOpId(7, 50); + + ConsensusRequestPB request; + ConsensusResponsePB response; + response.set_responder_uuid(kPeerUuid); + bool more_pending = false; + + UpdatePeerWatermarkToOp(&request, &response, first_msg, MinimumOpId(), &more_pending); + ASSERT_TRUE(more_pending); + + // Tracking a peer a new peer should have moved the all replicated watermark back. + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MinimumOpId()); + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), MinimumOpId()); + + vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(50, request.ops_size()); + + AppendReplicateMessagesToQueue(queue_.get(), clock_, 101, 100); + + SetLastReceivedAndLastCommitted(&response, request.ops(49).id()); + response.set_responder_term(28); + + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_TRUE(more_pending) << "Queue didn't have anymore requests pending"; + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), MakeOpId(14, 100)); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MakeOpId(14, 100)); + + // if we ask for a new request, it should come back with the rest of the messages + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(100, request.ops_size()); + + OpId expected = request.ops(99).id(); + + SetLastReceivedAndLastCommitted(&response, expected); + response.set_responder_term(expected.term()); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_FALSE(more_pending) << "Queue didn't have 
anymore requests pending"; + + WaitForLocalPeerToAckIndex(expected.index()); + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), expected); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), expected); + + // extract the ops from the request to avoid double free + request.mutable_ops()->ExtractSubrange(0, request.ops_size(), nullptr); +} + +TEST_F(ConsensusQueueTest, TestQueueAdvancesCommittedIndex) { + queue_->Init(MinimumOpId()); + queue_->SetLeaderMode(MinimumOpId(), MinimumOpId().term(), BuildRaftConfigPBForTests(5)); + // Track 4 additional peers (in addition to the local peer) + queue_->TrackPeer("peer-1"); + queue_->TrackPeer("peer-2"); + queue_->TrackPeer("peer-3"); + queue_->TrackPeer("peer-4"); + + // Append 10 messages to the queue with a majority of 2 for a total of 3 peers. + // This should add messages 0.1 -> 0.7, 1.8 -> 1.10 to the queue. + AppendReplicateMessagesToQueue(queue_.get(), clock_, 1, 10); + WaitForLocalPeerToAckIndex(10); + + // Since only the local log might have ACKed at this point, + // the committed_index should be MinimumOpId(). + queue_->observers_pool_->Wait(); + ASSERT_OPID_EQ(queue_->GetCommittedIndexForTests(), MinimumOpId()); + + // NOTE: We don't need to get operations from the queue. The queue + // only cares about what the peer reported as received, not what was sent. 
+ ConsensusResponsePB response; + response.set_responder_term(1); + + bool more_pending; + OpId last_sent = MakeOpId(0, 5); + + // Ack the first five operations for peer-1 + response.set_responder_uuid("peer-1"); + SetLastReceivedAndLastCommitted(&response, last_sent, MinimumOpId().index()); + + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_TRUE(more_pending); + + // Committed index should be the same + queue_->observers_pool_->Wait(); + ASSERT_OPID_EQ(queue_->GetCommittedIndexForTests(), MinimumOpId()); + + // Ack the first five operations for peer-2 + response.set_responder_uuid("peer-2"); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_TRUE(more_pending); + + // A majority has now replicated up to 0.5. + queue_->observers_pool_->Wait(); + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), MakeOpId(0, 5)); + + // Ack all operations for peer-3 + response.set_responder_uuid("peer-3"); + last_sent = MakeOpId(1, 10); + SetLastReceivedAndLastCommitted(&response, last_sent, MinimumOpId().index()); + + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + // The committed index moved so 'more_pending' should be true so that the peer is + // notified. + ASSERT_TRUE(more_pending); + + // Majority replicated watermark should be the same + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), MakeOpId(0, 5)); + + // Ack the remaining operations for peer-4 + response.set_responder_uuid("peer-4"); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_TRUE(more_pending); + + // Now that a majority of peers have replicated an operation in the queue's + // term the committed index should advance. 
+ queue_->observers_pool_->Wait(); + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), MakeOpId(1, 10)); +} + +// In this test we append a sequence of operations to a log +// and then start tracking a peer whose first required operation +// is before the first operation in the queue. +TEST_F(ConsensusQueueTest, TestQueueLoadsOperationsForPeer) { + + OpId opid = MakeOpId(1, 1); + + for (int i = 1; i <= 100; i++) { + ASSERT_OK(log::AppendNoOpToLogSync(clock_, log_.get(), &opid)); + // Roll the log every 10 ops + if (i % 10 == 0) { + ASSERT_OK(log_->AllocateSegmentAndRollOver()); + } + } + ASSERT_OK(log_->WaitUntilAllFlushed()); + + OpId queues_last_op = opid; + queues_last_op.set_index(queues_last_op.index() - 1); + + // Now reset the queue so that we can pass a new committed index, + // the last operation in the log. + CloseAndReopenQueue(); + + OpId committed_index; + committed_index.set_term(1); + committed_index.set_index(100); + queue_->Init(committed_index); + queue_->SetLeaderMode(committed_index, committed_index.term(), BuildRaftConfigPBForTests(3)); + + ConsensusRequestPB request; + ConsensusResponsePB response; + response.set_responder_uuid(kPeerUuid); + bool more_pending = false; + + // The peer will actually be behind the first operation in the queue + // in this case about 50 operations before. + OpId peers_last_op; + peers_last_op.set_term(1); + peers_last_op.set_index(50); + + // Now we start tracking the peer, this negotiation round should let + // the queue know how far along the peer is. + ASSERT_NO_FATAL_FAILURE(UpdatePeerWatermarkToOp(&request, + &response, + peers_last_op, + MinimumOpId(), + &more_pending)); + + // The queue should reply that there are more messages for the peer. + ASSERT_TRUE(more_pending); + + // When we get another request for the peer the queue should load + // the missing operations. 
+ vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(request.ops_size(), 50); + + // The messages still belong to the queue so we have to release them. + request.mutable_ops()->ExtractSubrange(0, request.ops().size(), nullptr); +} + +// This tests that the queue is able to handle operation overwriting, i.e. when a +// newly tracked peer reports the last received operations as some operation that +// doesn't exist in the leader's log. In particular it tests the case where a +// new leader starts at term 2 with only a part of the operations of the previous +// leader having been committed. +TEST_F(ConsensusQueueTest, TestQueueHandlesOperationOverwriting) { + + OpId opid = MakeOpId(1, 1); + // Append 10 messages in term 1 to the log. + for (int i = 1; i <= 10; i++) { + ASSERT_OK(log::AppendNoOpToLogSync(clock_, log_.get(), &opid)); + // Roll the log every 3 ops + if (i % 3 == 0) { + ASSERT_OK(log_->AllocateSegmentAndRollOver()); + } + } + + opid = MakeOpId(2, 11); + // Now append 10 more messages in term 2. + for (int i = 11; i <= 20; i++) { + ASSERT_OK(log::AppendNoOpToLogSync(clock_, log_.get(), &opid)); + // Roll the log every 3 ops + if (i % 3 == 0) { + ASSERT_OK(log_->AllocateSegmentAndRollOver()); + } + } + + + // Now reset the queue so that we can pass a new committed index, + // op, 2.15. + CloseAndReopenQueue(); + + OpId committed_index = MakeOpId(2, 15); + queue_->Init(MakeOpId(2, 20)); + queue_->SetLeaderMode(committed_index, committed_index.term(), BuildRaftConfigPBForTests(3)); + + // Now get a request for a simulated old leader, which contains more operations + // in term 1 than the new leader has. + // The queue should realize that the old leader's last received doesn't exist + // and send it operations starting at the old leader's committed index. 
+ ConsensusRequestPB request; + ConsensusResponsePB response; + vector refs; + response.set_responder_uuid(kPeerUuid); + bool more_pending = false; + + queue_->TrackPeer(kPeerUuid); + + // Ask for a request. The queue assumes the peer is up-to-date so + // this should contain no operations. + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(request.ops_size(), 0); + ASSERT_OPID_EQ(request.preceding_id(), MakeOpId(2, 20)); + ASSERT_OPID_EQ(request.committed_index(), committed_index); + + // The old leader was still in term 1 but it increased its term with our request. + response.set_responder_term(2); + + // We emulate that the old leader had 25 total operations in Term 1 (15 more than we knew about) + // which were never committed, and that its last known committed index was 5. + ConsensusStatusPB* status = response.mutable_status(); + status->mutable_last_received()->CopyFrom(MakeOpId(1, 25)); + status->mutable_last_received_current_leader()->CopyFrom(MinimumOpId()); + status->set_last_committed_idx(5); + ConsensusErrorPB* error = status->mutable_error(); + error->set_code(ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH); + StatusToPB(Status::IllegalState("LMP failed."), error->mutable_status()); + + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + request.Clear(); + + // The queue should reply that there are more operations pending. + ASSERT_TRUE(more_pending); + + // We're waiting for a two nodes. The all committed watermark should be + // 0.0 since we haven't had a successful exchange with the 'remote' peer. + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MinimumOpId()); + + // Test even when a correct peer responds (meaning we actually get to execute + // watermark advancement) we sill have the same all-replicated watermark. 
+ ReplicateMsg* replicate = CreateDummyReplicate(2, 21, clock_->Now(), 0).release(); + ASSERT_OK(queue_->AppendOperation(make_scoped_refptr(new RefCountedReplicate(replicate)))); + WaitForLocalPeerToAckIndex(21); + + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MinimumOpId()); + + // Generate another request for the remote peer, which should include + // all of the ops since the peer's last-known committed index. + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_OPID_EQ(MakeOpId(1, 5), request.preceding_id()); + ASSERT_EQ(16, request.ops_size()); + + // Now when we respond the watermarks should advance. + response.mutable_status()->clear_error(); + SetLastReceivedAndLastCommitted(&response, MakeOpId(2, 21), 5); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + + // Now the watermark should have advanced. + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MakeOpId(2, 21)); + + // The messages still belong to the queue so we have to release them. + request.mutable_ops()->ExtractSubrange(0, request.ops().size(), nullptr); +} + +// Test for a bug where we wouldn't move any watermark back, when overwriting +// operations, which would cause a check failure on the write immediately +// following the overwriting write. +TEST_F(ConsensusQueueTest, TestQueueMovesWatermarksBackward) { + queue_->Init(MinimumOpId()); + queue_->SetNonLeaderMode(); + // Append a bunch of messages. + AppendReplicateMessagesToQueue(queue_.get(), clock_, 1, 10); + log_->WaitUntilAllFlushed(); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MakeOpId(1, 10)); + // Now rewrite some of the operations and wait for the log to append. 
+ Synchronizer synch; + CHECK_OK(queue_->AppendOperations( + { make_scoped_refptr(new RefCountedReplicate( + CreateDummyReplicate(2, 5, clock_->Now(), 0).release())) }, + synch.AsStatusCallback())); + + // Wait for the operation to be in the log. + ASSERT_OK(synch.Wait()); + + // Without the fix the following append would trigger a check failure + // in log cache. + synch.Reset(); + CHECK_OK(queue_->AppendOperations( + { make_scoped_refptr(new RefCountedReplicate( + CreateDummyReplicate(2, 6, clock_->Now(), 0).release())) }, + synch.AsStatusCallback())); + + // Wait for the operation to be in the log. + ASSERT_OK(synch.Wait()); + + // Now the all replicated watermark should have moved backward. + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), MakeOpId(2, 6)); +} + +// Tests that we're advancing the watermarks properly and only when the peer +// has a prefix of our log. This also tests for a specific bug that we had. Here's +// the scenario: +// Peer would report: +// - last received 75.49 +// - last committed 72.31 +// +// Queue has messages: +// 72.31-72.45 +// 73.46-73.51 +// 76.52-76.53 +// +// The queue has more messages than the peer, but the peer has messages +// that the queue doesn't and which will be overwritten. +// +// In the first round of negotiation the peer would report LMP mismatch. +// In the second round the queue would try to send it messages starting at 75.49 +// but since that message didn't exist in the queue's log it would instead send +// messages starting at 72.31. However, because the batches were big it was only +// able to send a few messages (e.g. up to 72.40). +// +// Since in this last exchange everything went ok (the peer still doesn't know +// that messages will be overwritten later), the queue would mark the exchange +// as successful and the peer's last received would be taken into account when +// calculating watermarks, which was incorrect. 
+TEST_F(ConsensusQueueTest, TestOnlyAdvancesWatermarkWhenPeerHasAPrefixOfOurLog) { + FLAGS_consensus_max_batch_size_bytes = 1024 * 10; + + queue_->Init(MakeOpId(72, 30)); + queue_->SetLeaderMode(MakeOpId(72, 31), 76, BuildRaftConfigPBForTests(3)); + + ConsensusRequestPB request; + ConsensusResponsePB response; + vector refs; + + bool more_pending; + // We expect the majority replicated watermark to star at the committed index. + OpId expected_majority_replicated = MakeOpId(72, 31); + // We expect the all replicated watermark to be reset when we track a new peer. + OpId expected_all_replicated = MinimumOpId(); + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), expected_majority_replicated); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), expected_all_replicated); + + UpdatePeerWatermarkToOp(&request, &response, MakeOpId(75, 49), MinimumOpId(), 31, &more_pending); + ASSERT_TRUE(more_pending); + + for (int i = 31; i <= 53; i++) { + if (i <= 45) { + AppendReplicateMsg(72, i, 1024); + continue; + } + if (i <= 51) { + AppendReplicateMsg(73, i, 1024); + continue; + } + AppendReplicateMsg(76, i, 1024); + } + + WaitForLocalPeerToAckIndex(53); + + // When we get operations for this peer we should get them starting immediately after + // the committed index, for a total of 9 operations. + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(request.ops_size(), 9); + ASSERT_OPID_EQ(request.ops(0).id(), MakeOpId(72, 32)); + const OpId* last_op = &request.ops(request.ops_size() - 1).id(); + + // When the peer acks that it received an operation that is not in our current + // term, it gets ignored in terms of watermark advancement. 
+ SetLastReceivedAndLastCommitted(&response, MakeOpId(75, 49), *last_op, 31); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + ASSERT_TRUE(more_pending); + + // We've sent (and received and ack) up to 72.40 from the remote peer + expected_majority_replicated = MakeOpId(72, 40); + expected_all_replicated = MakeOpId(72, 40); + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), expected_majority_replicated); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), expected_all_replicated); + + // Another request for this peer should get another page of messages. Still not + // on the queue's term (and thus without advancing watermarks). + request.mutable_ops()->ExtractSubrange(0, request.ops().size(), nullptr); + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(request.ops_size(), 9); + ASSERT_OPID_EQ(request.ops(0).id(), MakeOpId(72, 41)); + last_op = &request.ops(request.ops_size() - 1).id(); + + SetLastReceivedAndLastCommitted(&response, MakeOpId(75, 49), *last_op, 31); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + + // We've now sent (and received an ack) up to 73.39 + expected_majority_replicated = MakeOpId(73, 49); + expected_all_replicated = MakeOpId(73, 49); + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), expected_majority_replicated); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), expected_all_replicated); + + // The last page of request should overwrite the peer's operations and the + // response should finally advance the watermarks. 
+ request.mutable_ops()->ExtractSubrange(0, request.ops().size(), nullptr); + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + ASSERT_EQ(request.ops_size(), 4); + ASSERT_OPID_EQ(request.ops(0).id(), MakeOpId(73, 50)); + + // We're done, both watermarks should be at the end. + expected_majority_replicated = MakeOpId(76, 53); + expected_all_replicated = MakeOpId(76, 53); + + SetLastReceivedAndLastCommitted(&response, expected_majority_replicated, + expected_majority_replicated, 31); + queue_->ResponseFromPeer(response.responder_uuid(), response, &more_pending); + + ASSERT_OPID_EQ(queue_->GetMajorityReplicatedOpIdForTests(), expected_majority_replicated); + ASSERT_OPID_EQ(queue_->GetAllReplicatedIndexForTests(), expected_all_replicated); + + request.mutable_ops()->ExtractSubrange(0, request.ops().size(), nullptr); +} + +// Test that remote bootstrap is triggered when a "tablet not found" error occurs. +TEST_F(ConsensusQueueTest, TestTriggerRemoteBootstrapIfTabletNotFound) { + queue_->Init(MinimumOpId()); + queue_->SetLeaderMode(MinimumOpId(), MinimumOpId().term(), BuildRaftConfigPBForTests(3)); + AppendReplicateMessagesToQueue(queue_.get(), clock_, 1, 100); + + ConsensusRequestPB request; + ConsensusResponsePB response; + response.set_responder_uuid(kPeerUuid); + queue_->TrackPeer(kPeerUuid); + + // Create request for new peer. + vector refs; + bool needs_remote_bootstrap; + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_FALSE(needs_remote_bootstrap); + + // Peer responds with tablet not found. + response.mutable_error()->set_code(tserver::TabletServerErrorPB::TABLET_NOT_FOUND); + StatusToPB(Status::NotFound("No such tablet"), response.mutable_error()->mutable_status()); + bool more_pending = false; + queue_->ResponseFromPeer(kPeerUuid, response, &more_pending); + + // If the peer needs remote bootstrap, more_pending should be set to true. 
+ ASSERT_TRUE(more_pending); + + // On the next request, we should find out that the queue wants us to remotely bootstrap. + request.Clear(); + ASSERT_OK(queue_->RequestForPeer(kPeerUuid, &request, &refs, &needs_remote_bootstrap)); + ASSERT_TRUE(needs_remote_bootstrap); + + StartRemoteBootstrapRequestPB rb_req; + ASSERT_OK(queue_->GetRemoteBootstrapRequestForPeer(kPeerUuid, &rb_req)); + + ASSERT_TRUE(rb_req.IsInitialized()) << rb_req.ShortDebugString(); + ASSERT_EQ(kTestTablet, rb_req.tablet_id()); + ASSERT_EQ(kLeaderUuid, rb_req.bootstrap_peer_uuid()); + ASSERT_EQ(FakeRaftPeerPB(kLeaderUuid).last_known_addr().ShortDebugString(), + rb_req.bootstrap_peer_addr().ShortDebugString()); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc new file mode 100644 index 000000000000..a36d7fe6f795 --- /dev/null +++ b/src/kudu/consensus/consensus_queue.cc @@ -0,0 +1,876 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/consensus/consensus_queue.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/consensus/raft_consensus.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/util/fault_injection.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" +#include "kudu/util/logging.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/url-coding.h" + +DEFINE_int32(consensus_max_batch_size_bytes, 1024 * 1024, + "The maximum per-tablet RPC batch size when updating peers."); +TAG_FLAG(consensus_max_batch_size_bytes, advanced); + +DEFINE_int32(follower_unavailable_considered_failed_sec, 300, + "Seconds that a leader is unable to successfully heartbeat to a " + "follower after which the follower is considered to be failed and " + "evicted from the config."); +TAG_FLAG(follower_unavailable_considered_failed_sec, advanced); + +DEFINE_int32(consensus_inject_latency_ms_in_notifications, 0, + "Injects a random sleep between 0 and this many milliseconds into " + "asynchronous notifications from the consensus queue back to the " + "consensus implementation."); +TAG_FLAG(consensus_inject_latency_ms_in_notifications, hidden); +TAG_FLAG(consensus_inject_latency_ms_in_notifications, unsafe); + +namespace kudu { +namespace consensus { + +using log::AsyncLogReader; +using log::Log; +using rpc::Messenger; +using strings::Substitute; + +METRIC_DEFINE_gauge_int64(tablet, majority_done_ops, "Leader 
Operations Acked by Majority", + MetricUnit::kOperations, + "Number of operations in the leader queue ack'd by a majority but " + "not all peers."); +METRIC_DEFINE_gauge_int64(tablet, in_progress_ops, "Leader Operations in Progress", + MetricUnit::kOperations, + "Number of operations in the leader queue ack'd by a minority of " + "peers."); + +std::string PeerMessageQueue::TrackedPeer::ToString() const { + return Substitute("Peer: $0, Is new: $1, Last received: $2, Next index: $3, " + "Last known committed idx: $4, Last exchange result: $5, " + "Needs remote bootstrap: $6", + uuid, is_new, OpIdToString(last_received), next_index, + last_known_committed_idx, + is_last_exchange_successful ? "SUCCESS" : "ERROR", + needs_remote_bootstrap); +} + +#define INSTANTIATE_METRIC(x) \ + x.Instantiate(metric_entity, 0) +PeerMessageQueue::Metrics::Metrics(const scoped_refptr& metric_entity) + : num_majority_done_ops(INSTANTIATE_METRIC(METRIC_majority_done_ops)), + num_in_progress_ops(INSTANTIATE_METRIC(METRIC_in_progress_ops)) { +} +#undef INSTANTIATE_METRIC + +PeerMessageQueue::PeerMessageQueue(const scoped_refptr& metric_entity, + const scoped_refptr& log, + const RaftPeerPB& local_peer_pb, + const string& tablet_id) + : local_peer_pb_(local_peer_pb), + tablet_id_(tablet_id), + log_cache_(metric_entity, log, local_peer_pb.permanent_uuid(), tablet_id), + metrics_(metric_entity) { + DCHECK(local_peer_pb_.has_permanent_uuid()); + DCHECK(local_peer_pb_.has_last_known_addr()); + queue_state_.current_term = MinimumOpId().term(); + queue_state_.committed_index = MinimumOpId(); + queue_state_.all_replicated_opid = MinimumOpId(); + queue_state_.majority_replicated_opid = MinimumOpId(); + queue_state_.state = kQueueConstructed; + queue_state_.mode = NON_LEADER; + queue_state_.majority_size_ = -1; + CHECK_OK(ThreadPoolBuilder("queue-observers-pool").set_max_threads(1).Build(&observers_pool_)); +} + +void PeerMessageQueue::Init(const OpId& last_locally_replicated) { + boost::lock_guard 
lock(queue_lock_); + CHECK_EQ(queue_state_.state, kQueueConstructed); + log_cache_.Init(last_locally_replicated); + queue_state_.last_appended = last_locally_replicated; + queue_state_.state = kQueueOpen; + TrackPeerUnlocked(local_peer_pb_.permanent_uuid()); +} + +void PeerMessageQueue::SetLeaderMode(const OpId& committed_index, + int64_t current_term, + const RaftConfigPB& active_config) { + boost::lock_guard lock(queue_lock_); + CHECK(committed_index.IsInitialized()); + queue_state_.current_term = current_term; + queue_state_.committed_index = committed_index; + queue_state_.majority_replicated_opid = committed_index; + queue_state_.active_config.reset(new RaftConfigPB(active_config)); + CHECK(IsRaftConfigVoter(local_peer_pb_.permanent_uuid(), *queue_state_.active_config)) + << local_peer_pb_.ShortDebugString() << " not a voter in config: " + << queue_state_.active_config->ShortDebugString(); + queue_state_.majority_size_ = MajoritySize(CountVoters(*queue_state_.active_config)); + queue_state_.mode = LEADER; + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Queue going to LEADER mode. State: " + << queue_state_.ToString(); + CheckPeersInActiveConfigIfLeaderUnlocked(); + + // Reset last communication time with all peers to reset the clock on the + // failure timeout. + MonoTime now(MonoTime::Now(MonoTime::FINE)); + for (const PeersMap::value_type& entry : peers_map_) { + entry.second->last_successful_communication_time = now; + } +} + +void PeerMessageQueue::SetNonLeaderMode() { + boost::lock_guard lock(queue_lock_); + queue_state_.active_config.reset(); + queue_state_.mode = NON_LEADER; + queue_state_.majority_size_ = -1; + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Queue going to NON_LEADER mode. 
State: " + << queue_state_.ToString(); +} + +void PeerMessageQueue::TrackPeer(const string& uuid) { + boost::lock_guard lock(queue_lock_); + TrackPeerUnlocked(uuid); +} + +void PeerMessageQueue::TrackPeerUnlocked(const string& uuid) { + CHECK(!uuid.empty()) << "Got request to track peer with empty UUID"; + DCHECK_EQ(queue_state_.state, kQueueOpen); + + TrackedPeer* tracked_peer = new TrackedPeer(uuid); + // We don't know the last operation received by the peer so, following the + // Raft protocol, we set next_index to one past the end of our own log. This + // way, if calling this method is the result of a successful leader election + // and the logs between the new leader and remote peer match, the + // peer->next_index will point to the index of the soon-to-be-written NO_OP + // entry that is used to assert leadership. If we guessed wrong, and the peer + // does not have a log that matches ours, the normal queue negotiation + // process will eventually find the right point to resume from. + tracked_peer->next_index = queue_state_.last_appended.index() + 1; + InsertOrDie(&peers_map_, uuid, tracked_peer); + + CheckPeersInActiveConfigIfLeaderUnlocked(); + + // We don't know how far back this peer is, so set the all replicated watermark to + // MinimumOpId. We'll advance it when we know how far along the peer is. 
+ queue_state_.all_replicated_opid = MinimumOpId(); +} + +void PeerMessageQueue::UntrackPeer(const string& uuid) { + boost::lock_guard lock(queue_lock_); + TrackedPeer* peer = EraseKeyReturnValuePtr(&peers_map_, uuid); + if (peer != nullptr) { + delete peer; + } +} + +void PeerMessageQueue::CheckPeersInActiveConfigIfLeaderUnlocked() const { + if (queue_state_.mode != LEADER) return; + unordered_set config_peer_uuids; + for (const RaftPeerPB& peer_pb : queue_state_.active_config->peers()) { + InsertOrDie(&config_peer_uuids, peer_pb.permanent_uuid()); + } + for (const PeersMap::value_type& entry : peers_map_) { + if (!ContainsKey(config_peer_uuids, entry.first)) { + LOG_WITH_PREFIX_UNLOCKED(FATAL) << Substitute("Peer $0 is not in the active config. " + "Queue state: $1", + entry.first, + queue_state_.ToString()); + } + } +} + +void PeerMessageQueue::LocalPeerAppendFinished(const OpId& id, + const StatusCallback& callback, + const Status& status) { + CHECK_OK(status); + + // Fake an RPC response from the local peer. + // TODO: we should probably refactor the ResponseFromPeer function + // so that we don't need to construct this fake response, but this + // seems to work for now. 
+ ConsensusResponsePB fake_response; + fake_response.set_responder_uuid(local_peer_pb_.permanent_uuid()); + *fake_response.mutable_status()->mutable_last_received() = id; + *fake_response.mutable_status()->mutable_last_received_current_leader() = id; + { + boost::unique_lock lock(queue_lock_); + fake_response.mutable_status()->set_last_committed_idx(queue_state_.committed_index.index()); + } + bool junk; + ResponseFromPeer(local_peer_pb_.permanent_uuid(), fake_response, &junk); + + callback.Run(status); +} + +Status PeerMessageQueue::AppendOperation(const ReplicateRefPtr& msg) { + return AppendOperations({ msg }, Bind(DoNothingStatusCB)); +} + +Status PeerMessageQueue::AppendOperations(const vector& msgs, + const StatusCallback& log_append_callback) { + + DFAKE_SCOPED_LOCK(append_fake_lock_); + boost::unique_lock lock(queue_lock_); + + OpId last_id = msgs.back()->get()->id(); + + if (last_id.term() > queue_state_.current_term) { + queue_state_.current_term = last_id.term(); + } + + // Unlock ourselves during Append to prevent a deadlock: it's possible that + // the log buffer is full, in which case AppendOperations would block. However, + // for the log buffer to empty, it may need to call LocalPeerAppendFinished() + // which also needs queue_lock_. 
+ lock.unlock(); + RETURN_NOT_OK(log_cache_.AppendOperations(msgs, + Bind(&PeerMessageQueue::LocalPeerAppendFinished, + Unretained(this), + last_id, + log_append_callback))); + lock.lock(); + queue_state_.last_appended = last_id; + UpdateMetrics(); + + return Status::OK(); +} + +Status PeerMessageQueue::RequestForPeer(const string& uuid, + ConsensusRequestPB* request, + vector* msg_refs, + bool* needs_remote_bootstrap) { + TrackedPeer* peer = nullptr; + OpId preceding_id; + { + lock_guard lock(&queue_lock_); + DCHECK_EQ(queue_state_.state, kQueueOpen); + DCHECK_NE(uuid, local_peer_pb_.permanent_uuid()); + + peer = FindPtrOrNull(peers_map_, uuid); + if (PREDICT_FALSE(peer == nullptr || queue_state_.mode == NON_LEADER)) { + return Status::NotFound("Peer not tracked or queue not in leader mode."); + } + + // Clear the requests without deleting the entries, as they may be in use by other peers. + request->mutable_ops()->ExtractSubrange(0, request->ops_size(), nullptr); + + // This is initialized to the queue's last appended op but gets set to the id of the + // log entry preceding the first one in 'messages' if messages are found for the peer. + preceding_id = queue_state_.last_appended; + request->mutable_committed_index()->CopyFrom(queue_state_.committed_index); + request->set_caller_term(queue_state_.current_term); + } + + MonoDelta unreachable_time = + MonoTime::Now(MonoTime::FINE).GetDeltaSince(peer->last_successful_communication_time); + if (unreachable_time.ToSeconds() > FLAGS_follower_unavailable_considered_failed_sec) { + if (CountVoters(*queue_state_.active_config) > 2) { + // We never drop from 2 to 1 automatically, at least for now. We may want + // to revisit this later, we're just being cautious with this. 
+ string msg = Substitute("Leader has been unable to successfully communicate " + "with Peer $0 for more than $1 seconds ($2)", + uuid, + FLAGS_follower_unavailable_considered_failed_sec, + unreachable_time.ToString()); + NotifyObserversOfFailedFollower(uuid, queue_state_.current_term, msg); + } + } + + if (PREDICT_FALSE(peer->needs_remote_bootstrap)) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Peer needs remote bootstrap: " << peer->ToString(); + *needs_remote_bootstrap = true; + return Status::OK(); + } + *needs_remote_bootstrap = false; + + // If we've never communicated with the peer, we don't know what messages to + // send, so we'll send a status-only request. Otherwise, we grab requests + // from the log starting at the last_received point. + if (!peer->is_new) { + + // The batch of messages to send to the peer. + vector messages; + int max_batch_size = FLAGS_consensus_max_batch_size_bytes - request->ByteSize(); + + // We try to get the follower's next_index from our log. + Status s = log_cache_.ReadOps(peer->next_index - 1, + max_batch_size, + &messages, + &preceding_id); + if (PREDICT_FALSE(!s.ok())) { + // It's normal to have a NotFound() here if a follower falls behind where + // the leader has GCed its logs. + if (PREDICT_TRUE(s.IsNotFound())) { + string msg = Substitute("The logs necessary to catch up peer $0 have been " + "garbage collected. The follower will never be able " + "to catch up ($1)", uuid, s.ToString()); + NotifyObserversOfFailedFollower(uuid, queue_state_.current_term, msg); + return s; + // IsIncomplete() means that we tried to read beyond the head of the log + // (in the future). See KUDU-1078. + } else if (s.IsIncomplete()) { + LOG_WITH_PREFIX_UNLOCKED(ERROR) << "Error trying to read ahead of the log " + << "while preparing peer request: " + << s.ToString() << ". 
Destination peer: " + << peer->ToString(); + return s; + } else { + LOG_WITH_PREFIX_UNLOCKED(FATAL) << "Error reading the log while preparing peer request: " + << s.ToString() << ". Destination peer: " + << peer->ToString(); + } + } + + // We use AddAllocated rather than copy, because we pin the log cache at the + // "all replicated" point. At some point we may want to allow partially loading + // (and not pinning) earlier messages. At that point we'll need to do something + // smarter here, like copy or ref-count. + for (const ReplicateRefPtr& msg : messages) { + request->mutable_ops()->AddAllocated(msg->get()); + } + msg_refs->swap(messages); + DCHECK_LE(request->ByteSize(), FLAGS_consensus_max_batch_size_bytes); + } + + DCHECK(preceding_id.IsInitialized()); + request->mutable_preceding_id()->CopyFrom(preceding_id); + + if (PREDICT_FALSE(VLOG_IS_ON(2))) { + if (request->ops_size() > 0) { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Sending request with operations to Peer: " << uuid + << ". Size: " << request->ops_size() + << ". From: " << request->ops(0).id().ShortDebugString() << ". 
To: " + << request->ops(request->ops_size() - 1).id().ShortDebugString(); + } else { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Sending status only request to Peer: " << uuid + << ": " << request->DebugString(); + } + } + + return Status::OK(); +} + +Status PeerMessageQueue::GetRemoteBootstrapRequestForPeer(const string& uuid, + StartRemoteBootstrapRequestPB* req) { + TrackedPeer* peer = nullptr; + { + lock_guard lock(&queue_lock_); + DCHECK_EQ(queue_state_.state, kQueueOpen); + DCHECK_NE(uuid, local_peer_pb_.permanent_uuid()); + peer = FindPtrOrNull(peers_map_, uuid); + if (PREDICT_FALSE(peer == nullptr || queue_state_.mode == NON_LEADER)) { + return Status::NotFound("Peer not tracked or queue not in leader mode."); + } + } + + if (PREDICT_FALSE(!peer->needs_remote_bootstrap)) { + return Status::IllegalState("Peer does not need to remotely bootstrap", uuid); + } + req->Clear(); + req->set_dest_uuid(uuid); + req->set_tablet_id(tablet_id_); + req->set_bootstrap_peer_uuid(local_peer_pb_.permanent_uuid()); + *req->mutable_bootstrap_peer_addr() = local_peer_pb_.last_known_addr(); + req->set_caller_term(queue_state_.current_term); + peer->needs_remote_bootstrap = false; // Now reset the flag. + return Status::OK(); +} + +void PeerMessageQueue::AdvanceQueueWatermark(const char* type, + OpId* watermark, + const OpId& replicated_before, + const OpId& replicated_after, + int num_peers_required, + const TrackedPeer* peer) { + + if (VLOG_IS_ON(2)) { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Updating " << type << " watermark: " + << "Peer (" << peer->ToString() << ") changed from " + << replicated_before << " to " << replicated_after << ". " + << "Current value: " << watermark->ShortDebugString(); + } + + // Go through the peer's watermarks, we want the highest watermark that + // 'num_peers_required' of peers has replicated. 
To find this we do the + // following: + // - Store all the peer's 'last_received' in a vector + // - Sort the vector + // - Find the vector.size() - 'num_peers_required' position, this + // will be the new 'watermark'. + vector watermarks; + for (const PeersMap::value_type& peer : peers_map_) { + if (peer.second->is_last_exchange_successful) { + watermarks.push_back(&peer.second->last_received); + } + } + + // If we haven't enough peers to calculate the watermark return. + if (watermarks.size() < num_peers_required) { + return; + } + + std::sort(watermarks.begin(), watermarks.end(), OpIdIndexLessThanPtrFunctor()); + + OpId new_watermark = *watermarks[watermarks.size() - num_peers_required]; + OpId old_watermark = *watermark; + watermark->CopyFrom(new_watermark); + + VLOG_WITH_PREFIX_UNLOCKED(1) << "Updated " << type << " watermark " + << "from " << old_watermark << " to " << new_watermark; + if (VLOG_IS_ON(3)) { + VLOG_WITH_PREFIX_UNLOCKED(3) << "Peers: "; + for (const PeersMap::value_type& peer : peers_map_) { + VLOG_WITH_PREFIX_UNLOCKED(3) << "Peer: " << peer.second->ToString(); + } + VLOG_WITH_PREFIX_UNLOCKED(3) << "Sorted watermarks:"; + for (const OpId* watermark : watermarks) { + VLOG_WITH_PREFIX_UNLOCKED(3) << "Watermark: " << watermark->ShortDebugString(); + } + } +} + +void PeerMessageQueue::NotifyPeerIsResponsiveDespiteError(const std::string& peer_uuid) { + lock_guard l(&queue_lock_); + TrackedPeer* peer = FindPtrOrNull(peers_map_, peer_uuid); + if (!peer) return; + peer->last_successful_communication_time = MonoTime::Now(MonoTime::FINE); +} + +void PeerMessageQueue::ResponseFromPeer(const std::string& peer_uuid, + const ConsensusResponsePB& response, + bool* more_pending) { + DCHECK(response.IsInitialized()) << "Error: Uninitialized: " + << response.InitializationErrorString() << ". 
Response: " << response.ShortDebugString(); + + OpId updated_majority_replicated_opid; + Mode mode_copy; + { + unique_lock scoped_lock(&queue_lock_); + DCHECK_NE(kQueueConstructed, queue_state_.state); + + TrackedPeer* peer = FindPtrOrNull(peers_map_, peer_uuid); + if (PREDICT_FALSE(queue_state_.state != kQueueOpen || peer == nullptr)) { + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Queue is closed or peer was untracked, disregarding " + "peer response. Response: " << response.ShortDebugString(); + *more_pending = false; + return; + } + + // Remotely bootstrap the peer if the tablet is not found or deleted. + if (response.has_error()) { + // We only let special types of errors through to this point from the peer. + CHECK_EQ(tserver::TabletServerErrorPB::TABLET_NOT_FOUND, response.error().code()) + << response.ShortDebugString(); + + peer->needs_remote_bootstrap = true; + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Marked peer as needing remote bootstrap: " + << peer->ToString(); + *more_pending = true; + return; + } + + // Sanity checks. + // Some of these can be eventually removed, but they are handy for now. + DCHECK(response.status().IsInitialized()) << "Error: Uninitialized: " + << response.InitializationErrorString() << ". Response: " << response.ShortDebugString(); + // TODO: Include uuid in error messages as well. + DCHECK(response.has_responder_uuid() && !response.responder_uuid().empty()) + << "Got response from peer with empty UUID"; + + // Application level errors should be handled elsewhere + DCHECK(!response.has_error()); + // Responses should always have a status. + DCHECK(response.has_status()); + // The status must always have a last received op id and a last committed index. + DCHECK(response.status().has_last_received()); + DCHECK(response.status().has_last_received_current_leader()); + DCHECK(response.status().has_last_committed_idx()); + + const ConsensusStatusPB& status = response.status(); + + // Take a snapshot of the current peer status. 
+ TrackedPeer previous = *peer; + + // Update the peer status based on the response. + peer->is_new = false; + peer->last_known_committed_idx = status.last_committed_idx(); + peer->last_successful_communication_time = MonoTime::Now(MonoTime::FINE); + + // If the reported last-received op for the replica is in our local log, + // then resume sending entries from that point onward. Otherwise, resume + // after the last op they received from us. If we've never successfully + // sent them anything, start after the last-committed op in their log, which + // is guaranteed by the Raft protocol to be a valid op. + + bool peer_has_prefix_of_log = IsOpInLog(status.last_received()); + if (peer_has_prefix_of_log) { + // If the latest thing in their log is in our log, we are in sync. + peer->last_received = status.last_received(); + peer->next_index = peer->last_received.index() + 1; + + } else if (!OpIdEquals(status.last_received_current_leader(), MinimumOpId())) { + // Their log may have diverged from ours, however we are in the process + // of replicating our ops to them, so continue doing so. Eventually, we + // will cause the divergent entry in their log to be overwritten. + peer->last_received = status.last_received_current_leader(); + peer->next_index = peer->last_received.index() + 1; + + } else { + // The peer is divergent and they have not (successfully) received + // anything from us yet. Start sending from their last committed index. + // This logic differs from the Raft spec slightly because instead of + // stepping back one-by-one from the end until we no longer have an LMP + // error, we jump back to the last committed op indicated by the peer with + // the hope that doing so will result in a faster catch-up process. 
+ DCHECK_GE(peer->last_known_committed_idx, 0); + peer->next_index = peer->last_known_committed_idx + 1; + } + + if (PREDICT_FALSE(status.has_error())) { + peer->is_last_exchange_successful = false; + switch (status.error().code()) { + case ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH: { + DCHECK(status.has_last_received()); + if (previous.is_new) { + // That's currently how we can detect that we able to connect to a peer. + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Connected to new peer: " << peer->ToString(); + } else { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Got LMP mismatch error from peer: " + << peer->ToString(); + } + *more_pending = true; + return; + } + case ConsensusErrorPB::INVALID_TERM: { + CHECK(response.has_responder_term()); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Peer responded invalid term: " << peer->ToString(); + NotifyObserversOfTermChange(response.responder_term()); + *more_pending = false; + return; + } + default: { + LOG_WITH_PREFIX_UNLOCKED(FATAL) << "Unexpected consensus error. Code: " + << ConsensusErrorPB::Code_Name(status.error().code()) << ". Response: " + << response.ShortDebugString(); + } + } + } + + peer->is_last_exchange_successful = true; + + if (response.has_responder_term()) { + // The peer must have responded with a term that is greater than or equal to + // the last known term for that peer. + peer->CheckMonotonicTerms(response.responder_term()); + + // If the responder didn't send an error back that must mean that it has + // a term that is the same or lower than ours. + CHECK_LE(response.responder_term(), queue_state_.current_term); + } + + if (PREDICT_FALSE(VLOG_IS_ON(2))) { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Received Response from Peer (" << peer->ToString() << "). " + << "Response: " << response.ShortDebugString(); + } + + // If our log has the next request for the peer or if the peer's committed index is + // lower than our own, set 'more_pending' to true. 
+ *more_pending = log_cache_.HasOpBeenWritten(peer->next_index) || + (peer->last_known_committed_idx < queue_state_.committed_index.index()); + + mode_copy = queue_state_.mode; + if (mode_copy == LEADER) { + // Advance the majority replicated index. + AdvanceQueueWatermark("majority_replicated", + &queue_state_.majority_replicated_opid, + previous.last_received, + peer->last_received, + queue_state_.majority_size_, + peer); + + updated_majority_replicated_opid = queue_state_.majority_replicated_opid; + } + + // Advance the all replicated index. + AdvanceQueueWatermark("all_replicated", + &queue_state_.all_replicated_opid, + previous.last_received, + peer->last_received, + peers_map_.size(), + peer); + + log_cache_.EvictThroughOp(queue_state_.all_replicated_opid.index()); + + UpdateMetrics(); + } + + if (mode_copy == LEADER) { + NotifyObserversOfMajorityReplOpChange(updated_majority_replicated_opid); + } +} + +PeerMessageQueue::TrackedPeer PeerMessageQueue::GetTrackedPeerForTests(string uuid) { + unique_lock scoped_lock(&queue_lock_); + TrackedPeer* tracked = FindOrDie(peers_map_, uuid); + return *tracked; +} + +OpId PeerMessageQueue::GetAllReplicatedIndexForTests() const { + boost::lock_guard lock(queue_lock_); + return queue_state_.all_replicated_opid; +} + +OpId PeerMessageQueue::GetCommittedIndexForTests() const { + boost::lock_guard lock(queue_lock_); + return queue_state_.committed_index; +} + +OpId PeerMessageQueue::GetMajorityReplicatedOpIdForTests() const { + boost::lock_guard lock(queue_lock_); + return queue_state_.majority_replicated_opid; +} + + +void PeerMessageQueue::UpdateMetrics() { + // Since operations have consecutive indices we can update the metrics based + // on simple index math. 
+ metrics_.num_majority_done_ops->set_value( + queue_state_.committed_index.index() - + queue_state_.all_replicated_opid.index()); + metrics_.num_in_progress_ops->set_value( + queue_state_.last_appended.index() - + queue_state_.committed_index.index()); +} + +void PeerMessageQueue::DumpToStrings(vector* lines) const { + boost::lock_guard lock(queue_lock_); + DumpToStringsUnlocked(lines); +} + +void PeerMessageQueue::DumpToStringsUnlocked(vector* lines) const { + lines->push_back("Watermarks:"); + for (const PeersMap::value_type& entry : peers_map_) { + lines->push_back( + Substitute("Peer: $0 Watermark: $1", entry.first, entry.second->ToString())); + } + + log_cache_.DumpToStrings(lines); +} + +void PeerMessageQueue::DumpToHtml(std::ostream& out) const { + using std::endl; + + boost::lock_guard lock(queue_lock_); + out << "

Watermarks

" << endl; + out << "

" << endl;; + out << " " << endl; + for (const PeersMap::value_type& entry : peers_map_) { + out << Substitute(" ", + EscapeForHtmlToString(entry.first), + EscapeForHtmlToString(entry.second->ToString())) << endl; + } + out << "
PeerWatermark
$0$1
" << endl; + + log_cache_.DumpToHtml(out); +} + +void PeerMessageQueue::ClearUnlocked() { + STLDeleteValues(&peers_map_); + queue_state_.state = kQueueClosed; +} + +void PeerMessageQueue::Close() { + observers_pool_->Shutdown(); + boost::lock_guard lock(queue_lock_); + ClearUnlocked(); +} + +int64_t PeerMessageQueue::GetQueuedOperationsSizeBytesForTests() const { + return log_cache_.BytesUsed(); +} + +string PeerMessageQueue::ToString() const { + // Even though metrics are thread-safe obtain the lock so that we get + // a "consistent" snapshot of the metrics. + boost::lock_guard lock(queue_lock_); + return ToStringUnlocked(); +} + +string PeerMessageQueue::ToStringUnlocked() const { + return Substitute("Consensus queue metrics:" + "Only Majority Done Ops: $0, In Progress Ops: $1, Cache: $2", + metrics_.num_majority_done_ops->value(), metrics_.num_in_progress_ops->value(), + log_cache_.StatsString()); +} + +void PeerMessageQueue::RegisterObserver(PeerMessageQueueObserver* observer) { + boost::lock_guard lock(queue_lock_); + auto iter = std::find(observers_.begin(), observers_.end(), observer); + if (iter == observers_.end()) { + observers_.push_back(observer); + } +} + +Status PeerMessageQueue::UnRegisterObserver(PeerMessageQueueObserver* observer) { + boost::lock_guard lock(queue_lock_); + auto iter = std::find(observers_.begin(), observers_.end(), observer); + if (iter == observers_.end()) { + return Status::NotFound("Can't find observer."); + } + observers_.erase(iter); + return Status::OK(); +} + +bool PeerMessageQueue::IsOpInLog(const OpId& desired_op) const { + OpId log_op; + Status s = log_cache_.LookupOpId(desired_op.index(), &log_op); + if (PREDICT_TRUE(s.ok())) { + return OpIdEquals(desired_op, log_op); + } + if (PREDICT_TRUE(s.IsNotFound() || s.IsIncomplete())) { + return false; + } + LOG_WITH_PREFIX_UNLOCKED(FATAL) << "Error while reading the log: " << s.ToString(); + return false; // Unreachable; here to squelch GCC warning. 
+} + +void PeerMessageQueue::NotifyObserversOfMajorityReplOpChange( + const OpId new_majority_replicated_op) { + WARN_NOT_OK(observers_pool_->SubmitClosure( + Bind(&PeerMessageQueue::NotifyObserversOfMajorityReplOpChangeTask, + Unretained(this), new_majority_replicated_op)), + LogPrefixUnlocked() + "Unable to notify RaftConsensus of " + "majority replicated op change."); +} + +void PeerMessageQueue::NotifyObserversOfTermChange(int64_t term) { + WARN_NOT_OK(observers_pool_->SubmitClosure( + Bind(&PeerMessageQueue::NotifyObserversOfTermChangeTask, + Unretained(this), term)), + LogPrefixUnlocked() + "Unable to notify RaftConsensus of term change."); +} + +void PeerMessageQueue::NotifyObserversOfMajorityReplOpChangeTask( + const OpId new_majority_replicated_op) { + std::vector copy; + { + boost::lock_guard lock(queue_lock_); + copy = observers_; + } + + // TODO move commit index advancement here so that the queue is not dependent on + // consensus at all, but that requires a bit more work. + OpId new_committed_index; + for (PeerMessageQueueObserver* observer : copy) { + observer->UpdateMajorityReplicated(new_majority_replicated_op, &new_committed_index); + } + + { + boost::lock_guard lock(queue_lock_); + if (new_committed_index.IsInitialized() && + new_committed_index.index() > queue_state_.committed_index.index()) { + queue_state_.committed_index.CopyFrom(new_committed_index); + } + } +} + +void PeerMessageQueue::NotifyObserversOfTermChangeTask(int64_t term) { + MAYBE_INJECT_RANDOM_LATENCY(FLAGS_consensus_inject_latency_ms_in_notifications); + std::vector copy; + { + boost::lock_guard lock(queue_lock_); + copy = observers_; + } + OpId new_committed_index; + for (PeerMessageQueueObserver* observer : copy) { + observer->NotifyTermChange(term); + } +} + +void PeerMessageQueue::NotifyObserversOfFailedFollower(const string& uuid, + int64_t term, + const string& reason) { + WARN_NOT_OK(observers_pool_->SubmitClosure( + 
Bind(&PeerMessageQueue::NotifyObserversOfFailedFollowerTask, + Unretained(this), uuid, term, reason)), + LogPrefixUnlocked() + "Unable to notify RaftConsensus of abandoned follower."); +} + +void PeerMessageQueue::NotifyObserversOfFailedFollowerTask(const string& uuid, + int64_t term, + const string& reason) { + MAYBE_INJECT_RANDOM_LATENCY(FLAGS_consensus_inject_latency_ms_in_notifications); + std::vector observers_copy; + { + boost::lock_guard lock(queue_lock_); + observers_copy = observers_; + } + OpId new_committed_index; + for (PeerMessageQueueObserver* observer : observers_copy) { + observer->NotifyFailedFollower(uuid, term, reason); + } +} + +PeerMessageQueue::~PeerMessageQueue() { + Close(); +} + +string PeerMessageQueue::LogPrefixUnlocked() const { + // TODO: we should probably use an atomic here. We'll just annotate + // away the TSAN error for now, since the worst case is a slightly out-of-date + // log message, and not very likely. + Mode mode = ANNOTATE_UNPROTECTED_READ(queue_state_.mode); + return Substitute("T $0 P $1 [$2]: ", + tablet_id_, + local_peer_pb_.permanent_uuid(), + mode == LEADER ? "LEADER" : "NON_LEADER"); +} + +string PeerMessageQueue::QueueState::ToString() const { + return Substitute("All replicated op: $0, Majority replicated op: $1, " + "Committed index: $2, Last appended: $3, Current term: $4, Majority size: $5, " + "State: $6, Mode: $7$8", + OpIdToString(all_replicated_opid), OpIdToString(majority_replicated_opid), + OpIdToString(committed_index), OpIdToString(last_appended), current_term, + majority_size_, state, (mode == LEADER ? "LEADER" : "NON_LEADER"), + active_config ? 
", active raft config: " + active_config->ShortDebugString() : ""); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/consensus_queue.h b/src/kudu/consensus/consensus_queue.h new file mode 100644 index 000000000000..6c22f68136a7 --- /dev/null +++ b/src/kudu/consensus/consensus_queue.h @@ -0,0 +1,431 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CONSENSUS_CONSENSUS_QUEUE_H_ +#define KUDU_CONSENSUS_CONSENSUS_QUEUE_H_ + +#include +#include +#include +#include +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/log_cache.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { +template +class AtomicGauge; +class MemTracker; +class MetricEntity; +class ThreadPool; + +namespace log { +class Log; +class AsyncLogReader; +} + +namespace consensus { +class PeerMessageQueueObserver; + +// The id for the server-wide consensus queue MemTracker. 
+extern const char kConsensusQueueParentTrackerId[]; + +// Tracks the state of the peers and which transactions they have replicated. +// Owns the LogCache which actually holds the replicate messages which are +// en route to the various peers. +// +// This also takes care of pushing requests to peers as new operations are +// added, and notifying RaftConsensus when the commit index advances. +// +// This class is used only on the LEADER side. +// +// TODO Right now this class is able to track one outstanding operation +// per peer. If we want to have more than one outstanding RPC we need to +// modify it. +class PeerMessageQueue { + public: + struct TrackedPeer { + explicit TrackedPeer(std::string uuid) + : uuid(std::move(uuid)), + is_new(true), + next_index(kInvalidOpIdIndex), + last_received(MinimumOpId()), + last_known_committed_idx(MinimumOpId().index()), + is_last_exchange_successful(false), + last_successful_communication_time(MonoTime::Now(MonoTime::FINE)), + needs_remote_bootstrap(false), + last_seen_term_(0) {} + + // Check that the terms seen from a given peer only increase + // monotonically. + void CheckMonotonicTerms(int64_t term) { + DCHECK_GE(term, last_seen_term_); + last_seen_term_ = term; + } + + std::string ToString() const; + + // UUID of the peer. + const std::string uuid; + + // Whether this is a newly tracked peer. + bool is_new; + + // Next index to send to the peer. + // This corresponds to "nextIndex" as specified in Raft. + int64_t next_index; + + // The last operation that we've sent to this peer and that + // it acked. Used for watermark movement. + OpId last_received; + + // The last committed index this peer knows about. + int64_t last_known_committed_idx; + + // Whether the last exchange with this peer was successful. + bool is_last_exchange_successful; + + // The time of the last communication with the peer. + // Defaults to the time of construction, so does not necessarily mean that + // successful communication ever took place. 
+ MonoTime last_successful_communication_time; + + // Whether the follower was detected to need remote bootstrap. + bool needs_remote_bootstrap; + + private: + // The last term we saw from a given peer. + // This is only used for sanity checking that a peer doesn't + // go backwards in time. + int64_t last_seen_term_; + }; + + PeerMessageQueue(const scoped_refptr& metric_entity, + const scoped_refptr& log, + const RaftPeerPB& local_peer_pb, + const std::string& tablet_id); + + // Initialize the queue. + virtual void Init(const OpId& last_locally_replicated); + + // Changes the queue to leader mode, meaning it tracks majority replicated + // operations and notifies observers when those change. + // 'committed_index' corresponds to the id of the last committed operation, + // i.e. operations with ids <= 'committed_index' should be considered committed. + // 'current_term' corresponds to the leader's current term, this is different + // from 'committed_index.term()' if the leader has not yet committed an + // operation in the current term. + // 'active_config' is the currently-active Raft config. This must always be + // a superset of the tracked peers, and that is enforced with runtime CHECKs. + virtual void SetLeaderMode(const OpId& committed_index, + int64_t current_term, + const RaftConfigPB& active_config); + + // Changes the queue to non-leader mode. Currently tracked peers will still + // be tracked so that the cache is only evicted when the peers no longer need + // the operations but the queue will no longer advance the majority replicated + // index or notify observers of its advancement. + virtual void SetNonLeaderMode(); + + // Makes the queue track this peer. + virtual void TrackPeer(const std::string& peer_uuid); + + // Makes the queue untrack this peer. + virtual void UntrackPeer(const std::string& peer_uuid); + + // Appends a single message to be replicated to the peers. 
+ // Returns OK unless the message could not be added to the queue for some + // reason (e.g. the queue reached max size). + // If it returns OK the queue takes ownership of 'msg'. + // + // This is thread-safe against all of the read methods, but not thread-safe + // with concurrent Append calls. + virtual Status AppendOperation(const ReplicateRefPtr& msg); + + // Appends a vector of messages to be replicated to the peers. + // Returns OK unless the message could not be added to the queue for some + // reason (e.g. the queue reached max size), calls 'log_append_callback' when + // the messages are durable in the local Log. + // If it returns OK the queue takes ownership of 'msgs'. + // + // This is thread-safe against all of the read methods, but not thread-safe + // with concurrent Append calls. + virtual Status AppendOperations(const std::vector& msgs, + const StatusCallback& log_append_callback); + + // Assembles a request for a peer, adding entries past 'op_id' up to + // 'consensus_max_batch_size_bytes'. + // Returns OK if the request was assembled, or Status::NotFound() if the + // peer with 'uuid' was not tracked, of if the queue is not in leader mode. + // Returns Status::Incomplete if we try to read an operation index from the + // log that has not been written. + // + // WARNING: In order to avoid copying the same messages to every peer, + // entries are added to 'request' via AddAllocated() methods. + // The owner of 'request' is expected not to delete the request prior + // to removing the entries through ExtractSubRange() or any other method + // that does not delete the entries. The simplest way is to pass the same + // instance of ConsensusRequestPB to RequestForPeer(): the buffer will + // replace the old entries with new ones without de-allocating the old + // ones if they are still required. 
+ virtual Status RequestForPeer(const std::string& uuid, + ConsensusRequestPB* request, + std::vector* msg_refs, + bool* needs_remote_bootstrap); + + // Fill in a StartRemoteBootstrapRequest for the specified peer. + // If that peer should not remotely bootstrap, returns a non-OK status. + // On success, also internally resets peer->needs_remote_bootstrap to false. + virtual Status GetRemoteBootstrapRequestForPeer(const std::string& uuid, + StartRemoteBootstrapRequestPB* req); + + // Update the last successful communication timestamp for the given peer + // to the current time. This should be called when a non-network related + // error is received from the peer, indicating that it is alive, even if it + // may not be fully up and running or able to accept updates. + void NotifyPeerIsResponsiveDespiteError(const std::string& peer_uuid); + + // Updates the request queue with the latest response of a peer, returns + // whether this peer has more requests pending. + virtual void ResponseFromPeer(const std::string& peer_uuid, + const ConsensusResponsePB& response, + bool* more_pending); + + // Closes the queue, peers are still allowed to call UntrackPeer() and + // ResponseFromPeer() but no additional peers can be tracked or messages + // queued. + virtual void Close(); + + virtual int64_t GetQueuedOperationsSizeBytesForTests() const; + + // Returns the last message replicated by all peers, for tests. + virtual OpId GetAllReplicatedIndexForTests() const; + + + virtual OpId GetCommittedIndexForTests() const; + + // Returns the current majority replicated OpId, for tests. + virtual OpId GetMajorityReplicatedOpIdForTests() const; + + // Returns a copy of the TrackedPeer with 'uuid' or crashes if the peer is + // not being tracked. + virtual TrackedPeer GetTrackedPeerForTests(std::string uuid); + + virtual std::string ToString() const; + + // Dumps the contents of the queue to the provided string vector. 
+ virtual void DumpToStrings(std::vector* lines) const; + + virtual void DumpToHtml(std::ostream& out) const; + + virtual void RegisterObserver(PeerMessageQueueObserver* observer); + + virtual Status UnRegisterObserver(PeerMessageQueueObserver* observer); + + struct Metrics { + // Keeps track of the number of ops. that are completed by a majority but still need + // to be replicated to a minority (IsDone() is true, IsAllDone() is false). + scoped_refptr > num_majority_done_ops; + // Keeps track of the number of ops. that are still in progress (IsDone() returns false). + scoped_refptr > num_in_progress_ops; + + explicit Metrics(const scoped_refptr& metric_entity); + }; + + virtual ~PeerMessageQueue(); + + private: + FRIEND_TEST(ConsensusQueueTest, TestQueueAdvancesCommittedIndex); + + // Mode specifies how the queue currently behaves: + // LEADER - Means the queue tracks remote peers and replicates whatever messages + // are appended. Observers are notified of changes. + // NON_LEADER - Means the queue only tracks the local peer (remote peers are ignored). + // Observers are not notified of changes. + enum Mode { + LEADER, + NON_LEADER + }; + + enum State { + kQueueConstructed, + kQueueOpen, + kQueueClosed + }; + + struct QueueState { + + // The first operation that has been replicated to all currently + // tracked peers. + OpId all_replicated_opid; + + // The index of the last operation replicated to a majority. + // This is usually the same as 'committed_index' but might not + // be if the terms changed. + OpId majority_replicated_opid; + + // The index of the last operation to be considered committed. + OpId committed_index; + + // The opid of the last operation appended to the queue. + OpId last_appended; + + // The queue's owner current_term. + // Set by the last appended operation. + // If the queue owner's term is less than the term observed + // from another peer the queue owner must step down. 
+ // TODO: it is likely to be cleaner to get this from the ConsensusMetadata + // rather than by snooping on what operations are appended to the queue. + int64_t current_term; + + // The size of the majority for the queue. + int majority_size_; + + State state; + + // The current mode of the queue. + Mode mode; + + // The currently-active raft config. Only set if in LEADER mode. + gscoped_ptr active_config; + + std::string ToString() const; + }; + + // Returns true iff given 'desired_op' is found in the local WAL. + // If the op is not found, returns false. + // If the log cache returns some error other than NotFound, crashes with a + // fatal error. + bool IsOpInLog(const OpId& desired_op) const; + + void NotifyObserversOfMajorityReplOpChange(const OpId new_majority_replicated_op); + void NotifyObserversOfMajorityReplOpChangeTask(const OpId new_majority_replicated_op); + + void NotifyObserversOfTermChange(int64_t term); + void NotifyObserversOfTermChangeTask(int64_t term); + + void NotifyObserversOfFailedFollower(const std::string& uuid, + int64_t term, + const std::string& reason); + void NotifyObserversOfFailedFollowerTask(const std::string& uuid, + int64_t term, + const std::string& reason); + + typedef std::unordered_map PeersMap; + + std::string ToStringUnlocked() const; + + std::string LogPrefixUnlocked() const; + + void DumpToStringsUnlocked(std::vector* lines) const; + + // Updates the metrics based on index math. + void UpdateMetrics(); + + void ClearUnlocked(); + + // Returns the last operation in the message queue, or + // 'preceding_first_op_in_queue_' if the queue is empty. + const OpId& GetLastOp() const; + + void TrackPeerUnlocked(const std::string& uuid); + + // Checks that if the queue is in LEADER mode then all registered peers are + // in the active config. Crashes with a FATAL log message if this invariant + // does not hold. If the queue is in NON_LEADER mode, does nothing. 
+ void CheckPeersInActiveConfigIfLeaderUnlocked() const; + + // Callback when a REPLICATE message has finished appending to the local log. + void LocalPeerAppendFinished(const OpId& id, + const StatusCallback& callback, + const Status& status); + + // Advances 'watermark' to the smallest op that 'num_peers_required' have. + void AdvanceQueueWatermark(const char* type, + OpId* watermark, + const OpId& replicated_before, + const OpId& replicated_after, + int num_peers_required, + const TrackedPeer* who_caused); + + std::vector observers_; + + // The pool which executes observer notifications. + // TODO consider reusing a another pool. + gscoped_ptr observers_pool_; + + // PB containing identifying information about the local peer. + const RaftPeerPB local_peer_pb_; + + // The id of the tablet. + const std::string tablet_id_; + + QueueState queue_state_; + + // The currently tracked peers. + PeersMap peers_map_; + mutable simple_spinlock queue_lock_; // TODO: rename + + // We assume that we never have multiple threads racing to append to the queue. + // This fake mutex adds some extra assurance that this implementation property + // doesn't change. + DFAKE_MUTEX(append_fake_lock_); + + LogCache log_cache_; + + Metrics metrics_; +}; + +// The interface between RaftConsensus and the PeerMessageQueue. +class PeerMessageQueueObserver { + public: + // Called by the queue each time the response for a peer is handled with + // the resulting majority replicated index. + // The consensus implementation decides the commit index based on that + // and triggers the apply for pending transactions. + // 'committed_index' is set to the id of the last operation considered + // committed by consensus. + // The implementation is idempotent, i.e. independently of the ordering of + // calls to this method only non-triggered applys will be started. 
+ virtual void UpdateMajorityReplicated(const OpId& majority_replicated, + OpId* committed_index) = 0; + + // Notify the Consensus implementation that a follower replied with a term + // higher than that established in the queue. + virtual void NotifyTermChange(int64_t term) = 0; + + // Notify Consensus that a peer is unable to catch up due to falling behind + // the leader's log GC threshold. + virtual void NotifyFailedFollower(const std::string& peer_uuid, + int64_t term, + const std::string& reason) = 0; + + virtual ~PeerMessageQueueObserver() {} +}; + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_CONSENSUS_QUEUE_H_ */ diff --git a/src/kudu/consensus/leader_election-test.cc b/src/kudu/consensus/leader_election-test.cc new file mode 100644 index 000000000000..cc2f56822457 --- /dev/null +++ b/src/kudu/consensus/leader_election-test.cc @@ -0,0 +1,636 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/leader_election.h" + +#include +#include + +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace consensus { + +using std::string; +using std::unordered_map; +using std::vector; +using strings::Substitute; + +namespace { + +const int kLeaderElectionTimeoutSecs = 10; + +// Generate list of voter uuids. +static vector GenVoterUUIDs(int num_voters) { + vector voter_uuids; + for (int i = 0; i < num_voters; i++) { + voter_uuids.push_back(Substitute("peer-$0", i)); + } + return voter_uuids; +} + +} // namespace + +//////////////////////////////////////// +// LeaderElectionTest +//////////////////////////////////////// + +typedef unordered_map ProxyMap; + +// A proxy factory that serves proxies from a map. +class FromMapPeerProxyFactory : public PeerProxyFactory { + public: + explicit FromMapPeerProxyFactory(const ProxyMap* proxy_map) + : proxy_map_(proxy_map) { + } + + Status NewProxy(const RaftPeerPB& peer_pb, + gscoped_ptr* proxy) override { + PeerProxy* proxy_ptr = FindPtrOrNull(*proxy_map_, peer_pb.permanent_uuid()); + if (!proxy_ptr) return Status::NotFound("No proxy for peer."); + proxy->reset(proxy_ptr); + return Status::OK(); + } + + private: + // FYI, the tests may add and remove nodes from this map while we hold a + // reference to it. 
+ const ProxyMap* const proxy_map_; +}; + +class LeaderElectionTest : public KuduTest { + public: + LeaderElectionTest() + : tablet_id_("test-tablet"), + proxy_factory_(new FromMapPeerProxyFactory(&proxies_)), + latch_(1) { + CHECK_OK(ThreadPoolBuilder("test-peer-pool").set_max_threads(5).Build(&pool_)); + } + + void ElectionCallback(const ElectionResult& result); + + protected: + void InitUUIDs(int num_voters); + void InitNoOpPeerProxies(); + void InitDelayableMockedProxies(bool enable_delay); + gscoped_ptr InitVoteCounter(int num_voters, int majority_size); + + // Voter 0 is the high-term voter. + scoped_refptr SetUpElectionWithHighTermVoter(ConsensusTerm election_term); + + // Predetermine the election results using the specified number of + // grant / deny / error responses. + // num_grant must be at least 1, for the candidate to vote for itself. + // num_grant + num_deny + num_error must add up to an odd number. + scoped_refptr SetUpElectionWithGrantDenyErrorVotes(ConsensusTerm election_term, + int num_grant, + int num_deny, + int num_error); + + const string tablet_id_; + string candidate_uuid_; + vector voter_uuids_; + + RaftConfigPB config_; + ProxyMap proxies_; + gscoped_ptr proxy_factory_; + gscoped_ptr pool_; + + CountDownLatch latch_; + gscoped_ptr result_; +}; + +void LeaderElectionTest::ElectionCallback(const ElectionResult& result) { + result_.reset(new ElectionResult(result)); + latch_.CountDown(); +} + +void LeaderElectionTest::InitUUIDs(int num_voters) { + voter_uuids_ = GenVoterUUIDs(num_voters); + candidate_uuid_ = voter_uuids_[num_voters - 1]; + voter_uuids_.pop_back(); +} + +void LeaderElectionTest::InitNoOpPeerProxies() { + config_.Clear(); + for (const string& uuid : voter_uuids_) { + RaftPeerPB* peer_pb = config_.add_peers(); + peer_pb->set_permanent_uuid(uuid); + PeerProxy* proxy = new NoOpTestPeerProxy(pool_.get(), *peer_pb); + InsertOrDie(&proxies_, uuid, proxy); + } +} + +void LeaderElectionTest::InitDelayableMockedProxies(bool 
enable_delay) {
  config_.Clear();
  for (const string& uuid : voter_uuids_) {
    RaftPeerPB* peer_pb = config_.add_peers();
    peer_pb->set_permanent_uuid(uuid);
    // Each voter gets a mocked proxy with a canned response; the
    // DelayablePeerProxy wrapper lets a test hold the response back until
    // Respond() is called, so response ordering can be controlled.
    auto proxy = new DelayablePeerProxy<MockedPeerProxy>(pool_.get(),
                                                         new MockedPeerProxy(pool_.get()));
    if (enable_delay) {
      proxy->DelayResponse();
    }
    InsertOrDie(&proxies_, uuid, proxy);
  }
}

// Build a VoteCounter with the candidate's own "yes" vote pre-registered,
// matching the invariant LeaderElection's constructor CHECKs for (the
// candidate always votes for itself first).
gscoped_ptr<VoteCounter> LeaderElectionTest::InitVoteCounter(int num_voters, int majority_size) {
  gscoped_ptr<VoteCounter> counter(new VoteCounter(num_voters, majority_size));
  bool duplicate;
  CHECK_OK(counter->RegisterVote(candidate_uuid_, VOTE_GRANTED, &duplicate));
  CHECK(!duplicate);
  return counter.Pass();
}

// Set up a 3-voter election in which voter 0 replies with a higher term
// (INVALID_TERM consensus error) and voter 1 grants its vote. Responses are
// delayed so the caller decides the order in which they arrive.
scoped_refptr<LeaderElection> LeaderElectionTest::SetUpElectionWithHighTermVoter(
    ConsensusTerm election_term) {
  const int kNumVoters = 3;
  const int kMajoritySize = 2;

  InitUUIDs(kNumVoters);
  InitDelayableMockedProxies(true);  // Delay responses; tests trigger them manually.
  gscoped_ptr<VoteCounter> counter = InitVoteCounter(kNumVoters, kMajoritySize);

  VoteResponsePB response;
  response.set_responder_uuid(voter_uuids_[0]);
  response.set_responder_term(election_term + 1);  // One term ahead of the candidate.
  response.set_vote_granted(false);
  response.mutable_consensus_error()->set_code(ConsensusErrorPB::INVALID_TERM);
  StatusToPB(Status::InvalidArgument("Bad term"),
             response.mutable_consensus_error()->mutable_status());
  down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[0]])
      ->proxy()->set_vote_response(response);

  response.Clear();
  response.set_responder_uuid(voter_uuids_[1]);
  response.set_responder_term(election_term);
  response.set_vote_granted(true);
  down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[1]])
      ->proxy()->set_vote_response(response);

  VoteRequestPB request;
  request.set_candidate_uuid(candidate_uuid_);
  request.set_candidate_term(election_term);
  request.set_tablet_id(tablet_id_);

  scoped_refptr<LeaderElection> election(
      new LeaderElection(config_, proxy_factory_.get(), request, counter.Pass(),
                         MonoDelta::FromSeconds(kLeaderElectionTimeoutSecs),
                         Bind(&LeaderElectionTest::ElectionCallback,
                              Unretained(this))));
  return election;
}

// Set up an election whose voters respond with the given mix of granted,
// denied, and tablet-error responses. 'num_grant' includes the candidate's
// own vote, so num_grant - 1 followers are configured to grant.
scoped_refptr<LeaderElection> LeaderElectionTest::SetUpElectionWithGrantDenyErrorVotes(
    ConsensusTerm election_term, int num_grant, int num_deny, int num_error) {
  const int kNumVoters = num_grant + num_deny + num_error;
  CHECK_GE(num_grant, 1);       // Gotta vote for yourself.
  CHECK_EQ(1, kNumVoters % 2);  // RaftConfig size must be odd.
  const int kMajoritySize = (kNumVoters / 2) + 1;

  InitUUIDs(kNumVoters);
  InitDelayableMockedProxies(false);  // Don't delay the vote responses.
  gscoped_ptr<VoteCounter> counter = InitVoteCounter(kNumVoters, kMajoritySize);
  int num_grant_followers = num_grant - 1;

  // Set up mocked responses based on the params specified in the method arguments.
  // Grant responses are assigned to the first followers, then denials, then errors.
  int voter_index = 0;
  while (voter_index < voter_uuids_.size()) {
    VoteResponsePB response;
    if (num_grant_followers > 0) {
      response.set_responder_uuid(voter_uuids_[voter_index]);
      response.set_responder_term(election_term);
      response.set_vote_granted(true);
      --num_grant_followers;
    } else if (num_deny > 0) {
      response.set_responder_uuid(voter_uuids_[voter_index]);
      response.set_responder_term(election_term);
      response.set_vote_granted(false);
      response.mutable_consensus_error()->set_code(ConsensusErrorPB::LAST_OPID_TOO_OLD);
      StatusToPB(Status::InvalidArgument("Last OpId"),
                 response.mutable_consensus_error()->mutable_status());
      --num_deny;
    } else if (num_error > 0) {
      // A tablet-level error, not a consensus "no" vote; LeaderElection is
      // expected to count it as a denied vote anyway.
      response.mutable_error()->set_code(tserver::TabletServerErrorPB::TABLET_NOT_FOUND);
      StatusToPB(Status::NotFound("Unknown Tablet"),
                 response.mutable_error()->mutable_status());
      --num_error;
    } else {
      LOG(FATAL) << "Unexpected fallthrough";
    }

    down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[voter_index]])
        ->proxy()->set_vote_response(response);
    ++voter_index;
  }

  VoteRequestPB request;
  request.set_candidate_uuid(candidate_uuid_);
  request.set_candidate_term(election_term);
  request.set_tablet_id(tablet_id_);

  scoped_refptr<LeaderElection> election(
      new LeaderElection(config_, proxy_factory_.get(), request, counter.Pass(),
                         MonoDelta::FromSeconds(kLeaderElectionTimeoutSecs),
                         Bind(&LeaderElectionTest::ElectionCallback,
                              Unretained(this))));
  return election;
}

// All peers respond "yes", no failures.
TEST_F(LeaderElectionTest, TestPerfectElection) {
  // Try configuration sizes of 1, 3, 5.
  vector<int> config_sizes = { 1, 3, 5 };
  for (int num_voters : config_sizes) {
    LOG(INFO) << "Testing election with config size of " << num_voters;
    int majority_size = (num_voters / 2) + 1;
    ConsensusTerm election_term = 10 + num_voters; // Just to be able to differentiate.

    InitUUIDs(num_voters);
    InitNoOpPeerProxies();
    gscoped_ptr<VoteCounter> counter = InitVoteCounter(num_voters, majority_size);

    VoteRequestPB request;
    request.set_candidate_uuid(candidate_uuid_);
    request.set_candidate_term(election_term);
    request.set_tablet_id(tablet_id_);

    scoped_refptr<LeaderElection> election(
        new LeaderElection(config_, proxy_factory_.get(), request, counter.Pass(),
                           MonoDelta::FromSeconds(kLeaderElectionTimeoutSecs),
                           Bind(&LeaderElectionTest::ElectionCallback,
                                Unretained(this))));
    election->Run();
    latch_.Wait();

    ASSERT_EQ(election_term, result_->election_term);
    ASSERT_EQ(VOTE_GRANTED, result_->decision);

    pool_->Wait();
    proxies_.clear(); // We don't delete them; The election VoterState object
                      // ends up owning them.
    latch_.Reset(1);  // Re-arm the latch for the next configuration size.
  }
}

// Test leader election when we encounter a peer with a higher term before we
// have arrived at a majority decision.
TEST_F(LeaderElectionTest, TestHigherTermBeforeDecision) {
  const ConsensusTerm kElectionTerm = 2;
  scoped_refptr<LeaderElection> election = SetUpElectionWithHighTermVoter(kElectionTerm);
  election->Run();

  // This guy has a higher term.
  down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[0]])
      ->Respond(TestPeerProxy::kRequestVote);
  latch_.Wait();

  ASSERT_EQ(kElectionTerm, result_->election_term);
  ASSERT_EQ(VOTE_DENIED, result_->decision);
  ASSERT_TRUE(result_->has_higher_term);
  ASSERT_EQ(kElectionTerm + 1, result_->higher_term);
  LOG(INFO) << "Election lost. Reason: " << result_->message;

  // This guy will vote "yes".
  down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[1]])
      ->Respond(TestPeerProxy::kRequestVote);

  pool_->Wait(); // Wait for the election callbacks to finish before we destroy proxies.
}

// Test leader election when we encounter a peer with a higher term after we
// have arrived at a majority decision of "yes".
TEST_F(LeaderElectionTest, TestHigherTermAfterDecision) {
  const ConsensusTerm kElectionTerm = 2;
  scoped_refptr<LeaderElection> election = SetUpElectionWithHighTermVoter(kElectionTerm);
  election->Run();

  // This guy will vote "yes".
  down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[1]])
      ->Respond(TestPeerProxy::kRequestVote);
  latch_.Wait();

  ASSERT_EQ(kElectionTerm, result_->election_term);
  ASSERT_EQ(VOTE_GRANTED, result_->decision);
  ASSERT_FALSE(result_->has_higher_term);
  ASSERT_TRUE(result_->message.empty());
  LOG(INFO) << "Election won.";

  // This guy has a higher term. Arrives after the decision; must not change it.
  down_cast<DelayablePeerProxy<MockedPeerProxy>*>(proxies_[voter_uuids_[0]])
      ->Respond(TestPeerProxy::kRequestVote);

  pool_->Wait(); // Wait for the election callbacks to finish before we destroy proxies.
}

// Out-of-date OpId "vote denied" case.
TEST_F(LeaderElectionTest, TestWithDenyVotes) {
  const ConsensusTerm kElectionTerm = 2;
  const int kNumGrant = 2;
  const int kNumDeny = 3;
  const int kNumError = 0;
  scoped_refptr<LeaderElection> election =
      SetUpElectionWithGrantDenyErrorVotes(kElectionTerm, kNumGrant, kNumDeny, kNumError);
  LOG(INFO) << "Running";
  election->Run();

  latch_.Wait();
  ASSERT_EQ(kElectionTerm, result_->election_term);
  ASSERT_EQ(VOTE_DENIED, result_->decision);
  ASSERT_FALSE(result_->has_higher_term);
  ASSERT_TRUE(result_->message.empty());
  LOG(INFO) << "Election denied.";

  pool_->Wait(); // Wait for the election callbacks to finish before we destroy proxies.
}

// Count errors as denied votes.
TEST_F(LeaderElectionTest, TestWithErrorVotes) {
  const ConsensusTerm kElectionTerm = 2;
  const int kNumGrant = 1;
  const int kNumDeny = 0;
  const int kNumError = 4;
  scoped_refptr<LeaderElection> election =
      SetUpElectionWithGrantDenyErrorVotes(kElectionTerm, kNumGrant, kNumDeny, kNumError);
  election->Run();

  latch_.Wait();
  ASSERT_EQ(kElectionTerm, result_->election_term);
  ASSERT_EQ(VOTE_DENIED, result_->decision);
  ASSERT_FALSE(result_->has_higher_term);
  ASSERT_TRUE(result_->message.empty());
  LOG(INFO) << "Election denied.";

  pool_->Wait(); // Wait for the election callbacks to finish before we destroy proxies.
}

// Peers for which we fail to create an RPC proxy are counted as denied votes.
TEST_F(LeaderElectionTest, TestFailToCreateProxy) {
  const ConsensusTerm kElectionTerm = 2;
  const int kNumVoters = 3;
  const int kMajoritySize = 2;

  // Initialize the UUIDs and the proxies (which also sets up the config PB).
  InitUUIDs(kNumVoters);
  InitNoOpPeerProxies();

  // Remove all the proxies. This will make our peer factory return a bad Status.
  STLDeleteValues(&proxies_);

  // Our election should now fail as if the votes were denied.
  VoteRequestPB request;
  request.set_candidate_uuid(candidate_uuid_);
  request.set_candidate_term(kElectionTerm);
  request.set_tablet_id(tablet_id_);

  gscoped_ptr<VoteCounter> counter = InitVoteCounter(kNumVoters, kMajoritySize);
  scoped_refptr<LeaderElection> election(
      new LeaderElection(config_, proxy_factory_.get(), request, counter.Pass(),
                         MonoDelta::FromSeconds(kLeaderElectionTimeoutSecs),
                         Bind(&LeaderElectionTest::ElectionCallback,
                              Unretained(this))));
  election->Run();
  latch_.Wait();
  ASSERT_EQ(kElectionTerm, result_->election_term);
  ASSERT_EQ(VOTE_DENIED, result_->decision);
  ASSERT_FALSE(result_->has_higher_term);
  ASSERT_TRUE(result_->message.empty());
}

////////////////////////////////////////
// VoteCounterTest
////////////////////////////////////////

class VoteCounterTest : public KuduTest {
 protected:
  static void AssertUndecided(const VoteCounter& counter);
  static void AssertVoteCount(const VoteCounter& counter, int yes_votes, int no_votes);
};

// Assert that 'counter' has not reached a decision and that asking for one
// fails with IllegalState.
void VoteCounterTest::AssertUndecided(const VoteCounter& counter) {
  ASSERT_FALSE(counter.IsDecided());
  ElectionVote decision;
  Status s = counter.GetDecision(&decision);
  ASSERT_TRUE(s.IsIllegalState());
  ASSERT_STR_CONTAINS(s.ToString(), "Vote not yet decided");
}

// Assert the exact yes/no tallies, reading VoteCounter's private counters
// directly (VoteCounterTest is a friend of VoteCounter).
void VoteCounterTest::AssertVoteCount(const VoteCounter& counter, int yes_votes, int no_votes) {
  ASSERT_EQ(yes_votes, counter.yes_votes_);
  ASSERT_EQ(no_votes, counter.no_votes_);
  ASSERT_EQ(yes_votes + no_votes, counter.GetTotalVotesCounted());
}

// Test basic vote counting functionality with an early majority.
TEST_F(VoteCounterTest, TestVoteCounter_EarlyDecision) {
  const int kNumVoters = 3;
  const int kMajoritySize = 2;
  vector<string> voter_uuids = GenVoterUUIDs(kNumVoters);

  // "Yes" decision.
  {
    // Start off undecided.
    VoteCounter counter(kNumVoters, kMajoritySize);
    ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
    ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 0, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // First yes vote.
    bool duplicate;
    ASSERT_OK(counter.RegisterVote(voter_uuids[0], VOTE_GRANTED, &duplicate));
    ASSERT_FALSE(duplicate);
    ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
    ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 1, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // Second yes vote wins it in a configuration of 3.
    ASSERT_OK(counter.RegisterVote(voter_uuids[1], VOTE_GRANTED, &duplicate));
    ASSERT_FALSE(duplicate);
    ASSERT_TRUE(counter.IsDecided());
    ElectionVote decision;
    ASSERT_OK(counter.GetDecision(&decision));
    ASSERT_TRUE(decision == VOTE_GRANTED);
    ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 2, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());
  }

  // "No" decision.
  {
    // Start off undecided.
    VoteCounter counter(kNumVoters, kMajoritySize);
    ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
    ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 0, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // First no vote.
    bool duplicate;
    ASSERT_OK(counter.RegisterVote(voter_uuids[0], VOTE_DENIED, &duplicate));
    ASSERT_FALSE(duplicate);
    ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
    ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 0, 1));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // Second no vote loses it in a configuration of 3.
    ASSERT_OK(counter.RegisterVote(voter_uuids[1], VOTE_DENIED, &duplicate));
    ASSERT_FALSE(duplicate);
    ASSERT_TRUE(counter.IsDecided());
    ElectionVote decision;
    ASSERT_OK(counter.GetDecision(&decision));
    ASSERT_TRUE(decision == VOTE_DENIED);
    ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 0, 2));
    ASSERT_FALSE(counter.AreAllVotesIn());
  }
}

// Test basic vote counting functionality with the last vote being the deciding vote.
TEST_F(VoteCounterTest, TestVoteCounter_LateDecision) {
  const int kNumVoters = 5;
  const int kMajoritySize = 3;
  vector<string> voter_uuids = GenVoterUUIDs(kNumVoters);

  // Start off undecided.
  VoteCounter counter(kNumVoters, kMajoritySize);
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 0, 0));
  ASSERT_FALSE(counter.AreAllVotesIn());

  // Add single yes vote, still undecided.
  bool duplicate;
  ASSERT_OK(counter.RegisterVote(voter_uuids[0], VOTE_GRANTED, &duplicate));
  ASSERT_FALSE(duplicate);
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 1, 0));
  ASSERT_FALSE(counter.AreAllVotesIn());

  // Attempt duplicate vote: OK status, flagged as duplicate, not re-counted.
  ASSERT_OK(counter.RegisterVote(voter_uuids[0], VOTE_GRANTED, &duplicate));
  ASSERT_TRUE(duplicate);
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 1, 0));
  ASSERT_FALSE(counter.AreAllVotesIn());

  // Attempt to change vote: rejected outright.
  Status s = counter.RegisterVote(voter_uuids[0], VOTE_DENIED, &duplicate);
  ASSERT_TRUE(s.IsInvalidArgument());
  ASSERT_STR_CONTAINS(s.ToString(), "voted a different way twice");
  LOG(INFO) << "Expected vote-changed error: " << s.ToString();
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 1, 0));
  ASSERT_FALSE(counter.AreAllVotesIn());

  // Add more votes...
  ASSERT_OK(counter.RegisterVote(voter_uuids[1], VOTE_DENIED, &duplicate));
  ASSERT_FALSE(duplicate);
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 1, 1));
  ASSERT_FALSE(counter.AreAllVotesIn());

  ASSERT_OK(counter.RegisterVote(voter_uuids[2], VOTE_GRANTED, &duplicate));
  ASSERT_FALSE(duplicate);
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 2, 1));
  ASSERT_FALSE(counter.AreAllVotesIn());

  ASSERT_OK(counter.RegisterVote(voter_uuids[3], VOTE_DENIED, &duplicate));
  ASSERT_FALSE(duplicate);
  ASSERT_NO_FATAL_FAILURE(AssertUndecided(counter));
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 2, 2));
  ASSERT_FALSE(counter.AreAllVotesIn());

  // Win the election with the fifth and final vote.
  ASSERT_OK(counter.RegisterVote(voter_uuids[4], VOTE_GRANTED, &duplicate));
  ASSERT_FALSE(duplicate);
  ASSERT_TRUE(counter.IsDecided());
  ElectionVote decision;
  ASSERT_OK(counter.GetDecision(&decision));
  ASSERT_TRUE(decision == VOTE_GRANTED);
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 3, 2));
  ASSERT_TRUE(counter.AreAllVotesIn());

  // Attempt to vote with > the whole configuration.
  s = counter.RegisterVote("some-random-node", VOTE_GRANTED, &duplicate);
  ASSERT_TRUE(s.IsInvalidArgument());
  ASSERT_STR_CONTAINS(s.ToString(), "cause the number of votes to exceed the expected number");
  LOG(INFO) << "Expected voters-exceeded error: " << s.ToString();
  ASSERT_TRUE(counter.IsDecided());
  ASSERT_NO_FATAL_FAILURE(AssertVoteCount(counter, 3, 2));
  ASSERT_TRUE(counter.AreAllVotesIn());
}

// Test vote counting with an even number of voters.
TEST_F(VoteCounterTest, TestVoteCounter_EvenVoters) {
  const int kNumVoters = 2;
  const int kMajoritySize = 2;
  vector<string> voter_uuids = GenVoterUUIDs(kNumVoters);

  // "Yes" decision.
  {
    VoteCounter counter(kNumVoters, kMajoritySize);
    NO_FATALS(AssertUndecided(counter));
    NO_FATALS(AssertVoteCount(counter, 0, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // Initial yes vote.
    bool duplicate;
    ASSERT_OK(counter.RegisterVote(voter_uuids[0], VOTE_GRANTED, &duplicate));
    ASSERT_FALSE(duplicate);
    NO_FATALS(AssertUndecided(counter));
    NO_FATALS(AssertVoteCount(counter, 1, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // Second yes vote wins it.
    ASSERT_OK(counter.RegisterVote(voter_uuids[1], VOTE_GRANTED, &duplicate));
    ASSERT_FALSE(duplicate);
    ASSERT_TRUE(counter.IsDecided());
    ElectionVote decision;
    ASSERT_OK(counter.GetDecision(&decision));
    ASSERT_TRUE(decision == VOTE_GRANTED);
    NO_FATALS(AssertVoteCount(counter, 2, 0));
    ASSERT_TRUE(counter.AreAllVotesIn());
  }

  // "No" decision.
  {
    VoteCounter counter(kNumVoters, kMajoritySize);
    NO_FATALS(AssertUndecided(counter));
    NO_FATALS(AssertVoteCount(counter, 0, 0));
    ASSERT_FALSE(counter.AreAllVotesIn());

    // The first "no" vote guarantees a failed election when num voters == 2.
    bool duplicate;
    ASSERT_OK(counter.RegisterVote(voter_uuids[0], VOTE_DENIED, &duplicate));
    ASSERT_FALSE(duplicate);
    ASSERT_TRUE(counter.IsDecided());
    ElectionVote decision;
    ASSERT_OK(counter.GetDecision(&decision));
    ASSERT_TRUE(decision == VOTE_DENIED);
    NO_FATALS(AssertVoteCount(counter, 0, 1));
    ASSERT_FALSE(counter.AreAllVotesIn());
  }
}


} // namespace consensus
} // namespace kudu
diff --git a/src/kudu/consensus/leader_election.cc b/src/kudu/consensus/leader_election.cc
new file mode 100644
index 000000000000..2c4066dd14cc
--- /dev/null
+++ b/src/kudu/consensus/leader_election.cc
@@ -0,0 +1,373 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.
The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "kudu/consensus/leader_election.h"

#include <boost/bind.hpp>

#include "kudu/consensus/consensus_peers.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/consensus/opid_util.h"
#include "kudu/gutil/bind.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/stl_util.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/common/wire_protocol.h"
#include "kudu/rpc/rpc_controller.h"
#include "kudu/util/logging.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/status.h"

namespace kudu {
namespace consensus {

using std::string;
using strings::Substitute;

///////////////////////////////////////////////////
// VoteCounter
///////////////////////////////////////////////////

VoteCounter::VoteCounter(int num_voters, int majority_size)
  : num_voters_(num_voters),
    majority_size_(majority_size),
    yes_votes_(0),
    no_votes_(0) {
  CHECK_LE(majority_size, num_voters);
  CHECK_GT(num_voters_, 0);
  CHECK_GT(majority_size_, 0);
}

Status VoteCounter::RegisterVote(const std::string& voter_uuid, ElectionVote vote,
                                 bool* is_duplicate) {
  // Handle repeated votes.
  if (PREDICT_FALSE(ContainsKey(votes_, voter_uuid))) {
    // Detect changed votes: the same peer voting a different way is an error.
    ElectionVote prior_vote = votes_[voter_uuid];
    if (PREDICT_FALSE(prior_vote != vote)) {
      string msg = Substitute("Peer $0 voted a different way twice in the same election. "
                              "First vote: $1, second vote: $2.",
                              voter_uuid, prior_vote, vote);
      return Status::InvalidArgument(msg);
    }

    // This was just a duplicate. Allow the caller to log it but don't change
    // the voting record.
    *is_duplicate = true;
    return Status::OK();
  }

  // Sanity check to ensure we did not exceed the allowed number of voters.
  if (PREDICT_FALSE(yes_votes_ + no_votes_ == num_voters_)) {
    // More unique voters than allowed!
    return Status::InvalidArgument(Substitute(
        "Vote from peer $0 would cause the number of votes to exceed the expected number of "
        "voters, which is $1. Votes already received from the following peers: {$2}",
        voter_uuid,
        num_voters_,
        JoinKeysIterator(votes_.begin(), votes_.end(), ", ")));
  }

  // This is a valid vote, so store it.
  InsertOrDie(&votes_, voter_uuid, vote);
  switch (vote) {
    case VOTE_GRANTED:
      ++yes_votes_;
      break;
    case VOTE_DENIED:
      ++no_votes_;
      break;
  }
  *is_duplicate = false;
  return Status::OK();
}

bool VoteCounter::IsDecided() const {
  // Decided once a "yes" majority exists, or once enough "no" votes have
  // arrived that a "yes" majority is no longer achievable.
  return yes_votes_ >= majority_size_ ||
         no_votes_ > num_voters_ - majority_size_;
}

Status VoteCounter::GetDecision(ElectionVote* decision) const {
  if (yes_votes_ >= majority_size_) {
    *decision = VOTE_GRANTED;
    return Status::OK();
  }
  if (no_votes_ > num_voters_ - majority_size_) {
    *decision = VOTE_DENIED;
    return Status::OK();
  }
  return Status::IllegalState("Vote not yet decided");
}

int VoteCounter::GetTotalVotesCounted() const {
  return yes_votes_ + no_votes_;
}

bool VoteCounter::AreAllVotesIn() const {
  return GetTotalVotesCounted() == num_voters_;
}

///////////////////////////////////////////////////
// ElectionResult
///////////////////////////////////////////////////

ElectionResult::ElectionResult(ConsensusTerm election_term, ElectionVote decision)
  : election_term(election_term),
    decision(decision),
    has_higher_term(false),
    higher_term(kMinimumTerm) {
}

ElectionResult::ElectionResult(ConsensusTerm election_term, ElectionVote decision,
                               ConsensusTerm higher_term, const std::string& message)
  : election_term(election_term),
    decision(decision),
    has_higher_term(true),
    higher_term(higher_term),
    message(message) {
  // Only a lost election may carry a higher observed term.
  CHECK_EQ(VOTE_DENIED, decision);
  CHECK_GT(higher_term, election_term);
  DCHECK(!message.empty());
}

///////////////////////////////////////////////////
// LeaderElection
///////////////////////////////////////////////////

LeaderElection::LeaderElection(const RaftConfigPB& config,
                               PeerProxyFactory* proxy_factory,
                               const VoteRequestPB& request,
                               gscoped_ptr<VoteCounter> vote_counter,
                               MonoDelta timeout,
                               ElectionDecisionCallback decision_callback)
    : has_responded_(false),
      request_(request),
      vote_counter_(vote_counter.Pass()),
      timeout_(std::move(timeout)),
      decision_callback_(std::move(decision_callback)) {
  for (const RaftPeerPB& peer : config.peers()) {
    if (request.candidate_uuid() == peer.permanent_uuid()) continue;
    follower_uuids_.push_back(peer.permanent_uuid());

    gscoped_ptr<VoterState> state(new VoterState());
    // A failed proxy construction is recorded in proxy_status and handled
    // as a 'NO' vote later, in Run().
    state->proxy_status = proxy_factory->NewProxy(peer, &state->proxy);
    InsertOrDie(&voter_state_, peer.permanent_uuid(), state.release());
  }

  // Ensure that the candidate has already voted for itself.
  CHECK_EQ(1, vote_counter_->GetTotalVotesCounted()) << "Candidate must vote for itself first";

  // Ensure that existing votes + future votes add up to the expected total.
  CHECK_EQ(vote_counter_->GetTotalVotesCounted() + follower_uuids_.size(),
           vote_counter_->GetTotalExpectedVotes())
      << "Expected different number of followers. Follower UUIDs: ["
      << JoinStringsIterator(follower_uuids_.begin(), follower_uuids_.end(), ", ")
      << "]; RaftConfig: {" << config.ShortDebugString() << "}";
}

LeaderElection::~LeaderElection() {
  lock_guard<Lock> guard(&lock_);
  DCHECK(has_responded_); // We must always call the callback exactly once.
  STLDeleteValues(&voter_state_);
}

void LeaderElection::Run() {
  VLOG_WITH_PREFIX(1) << "Running leader election.";

  // Check if we have already won the election (relevant if this is a
  // single-node configuration, since we always pre-vote for ourselves).
  CheckForDecision();

  // The rest of the code below is for a typical multi-node configuration.
  for (const std::string& voter_uuid : follower_uuids_) {
    VoterState* state = nullptr;
    {
      lock_guard<Lock> guard(&lock_);
      state = FindOrDie(voter_state_, voter_uuid);
      // Safe to drop the lock because voter_state_ is not mutated outside of
      // the constructor / destructor. We do this to avoid deadlocks below.
    }

    // If we failed to construct the proxy, just record a 'NO' vote with the status
    // that indicates why it failed.
    if (!state->proxy_status.ok()) {
      LOG_WITH_PREFIX(WARNING) << "Was unable to construct an RPC proxy to peer "
                               << voter_uuid << ": " << state->proxy_status.ToString()
                               << ". Counting it as a 'NO' vote.";
      {
        lock_guard<Lock> guard(&lock_);
        RecordVoteUnlocked(voter_uuid, VOTE_DENIED);
      }
      CheckForDecision();
      continue;
    }

    // Send the RPC request.
    LOG_WITH_PREFIX(INFO) << "Requesting vote from peer " << voter_uuid;
    state->rpc.set_timeout(timeout_);

    state->request = request_;
    state->request.set_dest_uuid(voter_uuid);

    state->proxy->RequestConsensusVoteAsync(
        &state->request,
        &state->response,
        &state->rpc,
        // We use gutil Bind() for the refcounting and boost::bind to adapt the
        // gutil Callback to a thunk.
        boost::bind(&Closure::Run,
                    Bind(&LeaderElection::VoteResponseRpcCallback, this, voter_uuid)));
  }
}

void LeaderElection::CheckForDecision() {
  bool to_respond = false;
  {
    lock_guard<Lock> guard(&lock_);
    // Check if the vote has been newly decided.
    if (!result_ && vote_counter_->IsDecided()) {
      ElectionVote decision;
      CHECK_OK(vote_counter_->GetDecision(&decision));
      LOG_WITH_PREFIX(INFO) << "Election decided. Result: candidate "
                            << ((decision == VOTE_GRANTED) ? "won." : "lost.");
      result_.reset(new ElectionResult(election_term(), decision));
    }
    // Check whether to respond. This can happen as a result of either getting
    // a majority vote or of something invalidating the election, like
    // observing a higher term.
    if (result_ && !has_responded_) {
      has_responded_ = true;
      to_respond = true;
    }
  }

  // Respond outside of the lock.
  if (to_respond) {
    // This is thread-safe since result_ is write-once.
    decision_callback_.Run(*result_);
  }
}

void LeaderElection::VoteResponseRpcCallback(const std::string& voter_uuid) {
  {
    lock_guard<Lock> guard(&lock_);
    VoterState* state = FindOrDie(voter_state_, voter_uuid);

    // Check for RPC errors.
    if (!state->rpc.status().ok()) {
      LOG_WITH_PREFIX(WARNING) << "RPC error from VoteRequest() call to peer " << voter_uuid
                               << ": " << state->rpc.status().ToString();
      RecordVoteUnlocked(voter_uuid, VOTE_DENIED);

    // Check for tablet errors.
    } else if (state->response.has_error()) {
      LOG_WITH_PREFIX(WARNING) << "Tablet error from VoteRequest() call to peer "
                               << voter_uuid << ": "
                               << StatusFromPB(state->response.error().status()).ToString();
      RecordVoteUnlocked(voter_uuid, VOTE_DENIED);

    // If the peer changed their IP address, we shouldn't count this vote since
    // our knowledge of the configuration is in an inconsistent state.
    } else if (PREDICT_FALSE(voter_uuid != state->response.responder_uuid())) {
      LOG_WITH_PREFIX(DFATAL) << "Received vote response from peer we thought had UUID "
                              << voter_uuid << ", but its actual UUID is "
                              << state->response.responder_uuid();
      RecordVoteUnlocked(voter_uuid, VOTE_DENIED);

    // Count the granted votes.
    } else if (state->response.vote_granted()) {
      HandleVoteGrantedUnlocked(voter_uuid, *state);

    // Anything else is a denied vote.
    } else {
      HandleVoteDeniedUnlocked(voter_uuid, *state);
    }
  }

  // Check for a decision outside the lock.
  CheckForDecision();
}

void LeaderElection::RecordVoteUnlocked(const std::string& voter_uuid, ElectionVote vote) {
  DCHECK(lock_.is_locked());

  // Record the vote.
  bool duplicate;
  Status s = vote_counter_->RegisterVote(voter_uuid, vote, &duplicate);
  if (!s.ok()) {
    LOG_WITH_PREFIX(WARNING) << "Error registering vote for peer " << voter_uuid
                             << ": " << s.ToString();
    return;
  }
  if (duplicate) {
    // Note: This is DFATAL because at the time of writing we do not support
    // retrying vote requests, so this should be impossible. It may be valid to
    // receive duplicate votes in the future if we implement retry.
    LOG_WITH_PREFIX(DFATAL) << "Duplicate vote received from peer " << voter_uuid;
  }
}

void LeaderElection::HandleHigherTermUnlocked(const string& voter_uuid, const VoterState& state) {
  DCHECK(lock_.is_locked());
  DCHECK_GT(state.response.responder_term(), election_term());

  string msg = Substitute("Vote denied by peer $0 with higher term. Message: $1",
                          state.response.responder_uuid(),
                          StatusFromPB(state.response.consensus_error().status()).ToString());
  LOG_WITH_PREFIX(WARNING) << msg;

  // Only cancel the election if no decision has been reached yet; a decision
  // that has already been made (and possibly reported) is left untouched.
  if (!result_) {
    LOG_WITH_PREFIX(INFO) << "Cancelling election due to peer responding with higher term";
    result_.reset(new ElectionResult(election_term(), VOTE_DENIED,
                                     state.response.responder_term(), msg));
  }
}

void LeaderElection::HandleVoteGrantedUnlocked(const string& voter_uuid, const VoterState& state) {
  DCHECK(lock_.is_locked());
  DCHECK_EQ(state.response.responder_term(), election_term());
  DCHECK(state.response.vote_granted());

  LOG_WITH_PREFIX(INFO) << "Vote granted by peer " << voter_uuid;
  RecordVoteUnlocked(voter_uuid, VOTE_GRANTED);
}

void LeaderElection::HandleVoteDeniedUnlocked(const string& voter_uuid, const VoterState& state) {
  DCHECK(lock_.is_locked());
  DCHECK(!state.response.vote_granted());

  // If one of the voters responds with a greater term than our own, and we
  // have not yet triggered the decision callback, it cancels the election.
  if (state.response.responder_term() > election_term()) {
    return HandleHigherTermUnlocked(voter_uuid, state);
  }

  LOG_WITH_PREFIX(INFO) << "Vote denied by peer " << voter_uuid << ". Message: "
                        << StatusFromPB(state.response.consensus_error().status()).ToString();
  RecordVoteUnlocked(voter_uuid, VOTE_DENIED);
}

std::string LeaderElection::LogPrefix() const {
  return Substitute("T $0 P $1 [CANDIDATE]: Term $2 election: ",
                    request_.tablet_id(),
                    request_.candidate_uuid(),
                    request_.candidate_term());
}

} // namespace consensus
} // namespace kudu
diff --git a/src/kudu/consensus/leader_election.h b/src/kudu/consensus/leader_election.h
new file mode 100644
index 000000000000..394bca4ae0fd
--- /dev/null
+++ b/src/kudu/consensus/leader_election.h
@@ -0,0 +1,244 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.
See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef KUDU_CONSENSUS_LEADER_ELECTION_H
#define KUDU_CONSENSUS_LEADER_ELECTION_H

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "kudu/consensus/consensus.h"
#include "kudu/consensus/consensus.pb.h"
#include "kudu/gutil/callback.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/rpc/rpc_controller.h"
#include "kudu/util/locks.h"

namespace kudu {
class Status;

namespace metadata {
class RaftPeerPB;
}

namespace rpc {
class Messenger;
class RpcController;
}

namespace consensus {
class PeerProxy;
class PeerProxyFactory;

// The vote a peer has given.
enum ElectionVote {
  VOTE_DENIED = 0,
  VOTE_GRANTED = 1,
};

// Simple class to count votes (in-memory, not persisted to disk).
// This class is not thread safe and requires external synchronization.
class VoteCounter {
 public:
  // Create new VoteCounter with the given majority size.
  VoteCounter(int num_voters, int majority_size);

  // Register a peer's vote.
  //
  // If the voter already has a vote recorded, but it has a different value than
  // the vote specified, returns Status::InvalidArgument.
  //
  // If the same vote is duplicated, 'is_duplicate' is set to true.
  // Otherwise, it is set to false.
  // If an OK status is not returned, the value in 'is_duplicate' is undefined.
  Status RegisterVote(const std::string& voter_uuid, ElectionVote vote, bool* is_duplicate);

  // Return whether the vote is decided yet.
  bool IsDecided() const;

  // Return decision iff IsDecided() returns true.
  // If vote is not yet decided, returns Status::IllegalState().
  Status GetDecision(ElectionVote* decision) const;

  // Return the total of "Yes" and "No" votes.
  int GetTotalVotesCounted() const;

  // Return total number of expected votes.
  int GetTotalExpectedVotes() const { return num_voters_; }

  // Return true iff GetTotalVotesCounted() == num_voters_.
  bool AreAllVotesIn() const;

 private:
  friend class VoteCounterTest;

  typedef std::map<std::string, ElectionVote> VoteMap;

  const int num_voters_;
  const int majority_size_;
  VoteMap votes_; // Voting record.
  int yes_votes_; // Accumulated yes votes, for quick counting.
  int no_votes_;  // Accumulated no votes.

  DISALLOW_COPY_AND_ASSIGN(VoteCounter);
};

// The result of a leader election.
struct ElectionResult {
 public:
  ElectionResult(ConsensusTerm election_term, ElectionVote decision);
  ElectionResult(ConsensusTerm election_term, ElectionVote decision,
                 ConsensusTerm higher_term, const std::string& message);

  // Term the election was run for.
  const ConsensusTerm election_term;

  // The overall election GRANTED/DENIED decision of the configuration.
  const ElectionVote decision;

  // At least one voter had a higher term than the candidate.
  const bool has_higher_term;
  const ConsensusTerm higher_term;

  // Human-readable explanation of the vote result, if any.
  const std::string message;
};

// Driver class to run a leader election.
//
// The caller must pass a callback to the driver, which will be called exactly
// once when a Yes/No decision has been made, except in case of Shutdown()
// on the Messenger or test ThreadPool, in which case no guarantee of a
// callback is provided. In that case, we should not care about the election
// result, because the server is ostensibly shutting down.
//
// For a "Yes" decision, a majority of voters must grant their vote.
//
// A "No" decision may be caused by either one of the following:
// - One of the peers replies with a higher term before a decision is made.
// - A majority of the peers votes "No".
//
// Any votes that come in after a decision has been made and the callback has
// been invoked are logged but ignored. Note that this somewhat strays from the
// letter of the Raft paper, in that replies that come after a "Yes" decision
// do not immediately cause the candidate/leader to step down, but this keeps
// our implementation and API simple, and the newly-minted leader will soon
// discover that it must step down when it attempts to replicate its first
// message to the peers.
//
// This class is thread-safe.
class LeaderElection : public RefCountedThreadSafe<LeaderElection> {
 public:
  typedef Callback<void(const ElectionResult&)> ElectionDecisionCallback;
  typedef std::unordered_map<std::string, PeerProxy*> ProxyMap;

  // Set up a new leader election driver.
  //
  // The 'vote_counter' must be initialized with the candidate's own yes vote.
  LeaderElection(const RaftConfigPB& config, PeerProxyFactory* proxy_factory,
                 const VoteRequestPB& request,
                 gscoped_ptr<VoteCounter> vote_counter, MonoDelta timeout,
                 ElectionDecisionCallback decision_callback);

  // Run the election: send the vote request to followers.
  void Run();

 private:
  friend class RefCountedThreadSafe<LeaderElection>;

  struct VoterState {
    gscoped_ptr<PeerProxy> proxy;

    // If constructing the proxy failed (e.g. due to a DNS resolution issue)
    // then 'proxy' will be NULL, and 'proxy_status' will contain the error.
    Status proxy_status;

    rpc::RpcController rpc;
    VoteRequestPB request;
    VoteResponsePB response;
  };

  typedef std::unordered_map<std::string, VoterState*> VoterStateMap;
  typedef simple_spinlock Lock;

  // This class is refcounted.
  ~LeaderElection();

  // Check to see if a decision has been made. If so, invoke decision callback.
  // Calls the callback outside of holding a lock.
  void CheckForDecision();

  // Callback called when the RPC responds.
  void VoteResponseRpcCallback(const std::string& voter_uuid);

  // Record vote from specified peer.
  void RecordVoteUnlocked(const std::string& voter_uuid, ElectionVote vote);

  // Handle a peer that responded with a term greater than the election term.
  void HandleHigherTermUnlocked(const std::string& voter_uuid, const VoterState& state);

  // Log and record a granted vote.
  void HandleVoteGrantedUnlocked(const std::string& voter_uuid, const VoterState& state);

  // Log the reason for a denied vote and record it.
  void HandleVoteDeniedUnlocked(const std::string& voter_uuid, const VoterState& state);

  // Returns a string to be prefixed to all log entries.
  // This method accesses const members and is thread safe.
  std::string LogPrefix() const;

  // Helper to reference the term we are running the election for.
  ConsensusTerm election_term() const { return request_.candidate_term(); }

  // All non-const fields are protected by 'lock_'.
  Lock lock_;

  // The result returned by the ElectionDecisionCallback.
  // NULL if not yet known.
  gscoped_ptr<ElectionResult> result_;

  // Whether we have responded via the callback yet.
  bool has_responded_;

  // Election request to send to voters.
  const VoteRequestPB request_;

  // Object to count the votes.
  const gscoped_ptr<VoteCounter> vote_counter_;

  // Timeout for sending RPCs.
  const MonoDelta timeout_;

  // Callback invoked to notify the caller of an election decision.
  const ElectionDecisionCallback decision_callback_;

  // List of all potential followers to request votes from.
  // The candidate's own UUID must not be included.
  std::vector<std::string> follower_uuids_;

  // Map of UUID -> VoterState.
+ VoterStateMap voter_state_; +}; + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_LEADER_ELECTION_H */ diff --git a/src/kudu/consensus/local_consensus.cc b/src/kudu/consensus/local_consensus.cc new file mode 100644 index 000000000000..91540320d849 --- /dev/null +++ b/src/kudu/consensus/local_consensus.cc @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/local_consensus.h" + +#include +#include + +#include "kudu/consensus/log.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/metadata.h" +#include "kudu/server/clock.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/logging.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace consensus { + +using base::subtle::Barrier_AtomicIncrement; +using log::Log; +using log::LogEntryBatch; +using strings::Substitute; + +LocalConsensus::LocalConsensus(ConsensusOptions options, + gscoped_ptr cmeta, + string peer_uuid, + const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, Log* log) + : peer_uuid_(std::move(peer_uuid)), + options_(std::move(options)), + cmeta_(cmeta.Pass()), + txn_factory_(DCHECK_NOTNULL(txn_factory)), + log_(DCHECK_NOTNULL(log)), + clock_(clock), + state_(kInitializing), + next_op_id_index_(-1) { + CHECK(cmeta_) << "Passed ConsensusMetadata object is NULL"; +} + +Status LocalConsensus::Start(const ConsensusBootstrapInfo& info) { + TRACE_EVENT0("consensus", "LocalConsensus::Start"); + + CHECK_EQ(state_, kInitializing); + LOG_WITH_PREFIX(INFO) << "Starting LocalConsensus..."; + + { + boost::lock_guard lock(lock_); + + const RaftConfigPB& config = cmeta_->committed_config(); + CHECK(config.local()) << "Local consensus must be passed a local config"; + RETURN_NOT_OK_PREPEND(VerifyRaftConfig(config, COMMITTED_QUORUM), + "Invalid config found in LocalConsensus::Start()"); + + next_op_id_index_ = info.last_id.index() + 1; + + CHECK(config.peers(0).has_permanent_uuid()) << config.ShortDebugString(); + cmeta_->set_leader_uuid(config.peers(0).permanent_uuid()); + + RETURN_NOT_OK_PREPEND(ResubmitOrphanedReplicates(info.orphaned_replicates), + "Could not restart replicated operations"); + + state_ = kRunning; + } + TRACE("Consensus started"); + return Status::OK(); +} + +Status LocalConsensus::ResubmitOrphanedReplicates(const std::vector 
replicates) { + for (ReplicateMsg* msg : replicates) { + DCHECK_LT(msg->id().index(), next_op_id_index_) + << "Orphaned replicate " << OpIdToString(msg->id()) + << " is newer than next op index " << next_op_id_index_; + + LOG_WITH_PREFIX(INFO) << "Resubmitting operation " + << OpIdToString(msg->id()) << " after restart"; + ReplicateRefPtr replicate_ptr = make_scoped_refptr_replicate(new ReplicateMsg(*msg)); + scoped_refptr round(new ConsensusRound(this, replicate_ptr)); + RETURN_NOT_OK(txn_factory_->StartReplicaTransaction(round)); + round->NotifyReplicationFinished(Status::OK()); + } + return Status::OK(); +} + +bool LocalConsensus::IsRunning() const { + boost::lock_guard lock(lock_); + return state_ == kRunning; +} + +Status LocalConsensus::Replicate(const scoped_refptr& round) { + TRACE_EVENT0("consensus", "LocalConsensus::Replicate"); + DCHECK_GE(state_, kConfiguring); + + ReplicateMsg* msg = round->replicate_msg(); + + OpId* cur_op_id = DCHECK_NOTNULL(msg)->mutable_id(); + cur_op_id->set_term(0); + + // Pre-cache the ByteSize outside of the lock, since this is somewhat + // expensive. + ignore_result(msg->ByteSize()); + + LogEntryBatch* reserved_entry_batch; + { + boost::lock_guard lock(lock_); + + // create the new op id for the entry. + cur_op_id->set_index(next_op_id_index_++); + // Reserve the correct slot in the log for the replication operation. + // It's important that we do this under the same lock as we generate + // the op id, so that we log things in-order. + gscoped_ptr entry_batch; + log::CreateBatchFromAllocatedOperations({ round->replicate_scoped_refptr() }, &entry_batch); + + RETURN_NOT_OK(log_->Reserve(log::REPLICATE, entry_batch.Pass(), + &reserved_entry_batch)); + + // Local consensus transactions are always committed so we + // can just persist the configuration, if this is a change config. 
+ if (round->replicate_msg()->op_type() == CHANGE_CONFIG_OP) { + RaftConfigPB new_config = round->replicate_msg()->change_config_record().new_config(); + DCHECK(!new_config.has_opid_index()); + new_config.set_opid_index(round->replicate_msg()->id().index()); + cmeta_->set_committed_config(new_config); + CHECK_OK(cmeta_->Flush()); + } + } + // Serialize and mark the message as ready to be appended. + // When the Log actually fsync()s this message to disk, 'repl_callback' + // is triggered. + RETURN_NOT_OK(log_->AsyncAppend( + reserved_entry_batch, + Bind(&ConsensusRound::NotifyReplicationFinished, round))); + return Status::OK(); +} + +RaftPeerPB::Role LocalConsensus::role() const { + return RaftPeerPB::LEADER; +} + +Status LocalConsensus::Update(const ConsensusRequestPB* request, + ConsensusResponsePB* response) { + return Status::NotSupported("LocalConsensus does not support Update() calls."); +} + +Status LocalConsensus::RequestVote(const VoteRequestPB* request, + VoteResponsePB* response) { + return Status::NotSupported("LocalConsensus does not support RequestVote() calls."); +} + +ConsensusStatePB LocalConsensus::ConsensusState(ConsensusConfigType type) const { + boost::lock_guard lock(lock_); + return cmeta_->ToConsensusStatePB(type); +} + +RaftConfigPB LocalConsensus::CommittedConfig() const { + boost::lock_guard lock(lock_); + return cmeta_->committed_config(); +} + +void LocalConsensus::Shutdown() { + VLOG_WITH_PREFIX(1) << "LocalConsensus Shutdown!"; +} + +void LocalConsensus::DumpStatusHtml(std::ostream& out) const { + out << "

Local Consensus Status

\n"; + + boost::lock_guard lock(lock_); + out << "next op: " << next_op_id_index_; +} + +std::string LocalConsensus::LogPrefix() const { + return Substitute("T $0 P $1: ", options_.tablet_id, peer_uuid_); +} + +} // end namespace consensus +} // end namespace kudu diff --git a/src/kudu/consensus/local_consensus.h b/src/kudu/consensus/local_consensus.h new file mode 100644 index 000000000000..9e5f799360f6 --- /dev/null +++ b/src/kudu/consensus/local_consensus.h @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_LOCAL_CONSENSUS_H_ +#define KUDU_CONSENSUS_LOCAL_CONSENSUS_H_ + +#include +#include +#include + +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/util/locks.h" + +namespace kudu { + +class FsManager; + +namespace metadata { +class TabletServerPB; +} + +namespace consensus { + +// Local implementation of Consensus. This is mostly for testing purposes/ +// using in single node configurations if/when applicable. 
+// +// NOTE: While this implementation has a lot less overhead running on a single +// node than a true consensus implementation in the same situation, this +// implementation will not be able to be reconfigured to accept more nodes +// while a true consensus implementation will. +// +// This class is not thread safe. +class LocalConsensus : public Consensus { + public: + explicit LocalConsensus(ConsensusOptions options, + gscoped_ptr cmeta, + std::string peer_uuid, + const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, + log::Log* log); + + virtual Status Start(const ConsensusBootstrapInfo& info) OVERRIDE; + + virtual bool IsRunning() const OVERRIDE; + + virtual Status EmulateElection() OVERRIDE { return Status::OK(); } + + virtual Status StartElection(ElectionMode mode) OVERRIDE { return Status::OK(); } + + virtual Status Replicate(const scoped_refptr& context) OVERRIDE; + + virtual RaftPeerPB::Role role() const OVERRIDE; + + virtual std::string peer_uuid() const OVERRIDE { + return peer_uuid_; + } + + virtual std::string tablet_id() const OVERRIDE { + return options_.tablet_id; + } + + virtual ConsensusStatePB ConsensusState(ConsensusConfigType type) const OVERRIDE; + + virtual RaftConfigPB CommittedConfig() const OVERRIDE; + + virtual void Shutdown() OVERRIDE; + + virtual void DumpStatusHtml(std::ostream& out) const OVERRIDE; + + // + // NOT IMPLEMENTED IN LOCAL CONSENSUS + // + virtual Status Update(const ConsensusRequestPB* request, + ConsensusResponsePB* response) OVERRIDE; + + virtual Status RequestVote(const VoteRequestPB* request, + VoteResponsePB* response) OVERRIDE; + + private: + // Log prefix. Doesn't access any variables that require locking. + std::string LogPrefix() const; + + // Resubmit the operations in 'replicates' to be applied immediately to + // the tablet. + // + // This is used to re-apply operations which were found in the WAL at startup, + // but did not have associated COMMIT records. 
+ Status ResubmitOrphanedReplicates(const std::vector replicates); + + const std::string peer_uuid_; + const ConsensusOptions options_; + const gscoped_ptr cmeta_; + ReplicaTransactionFactory* const txn_factory_; + log::Log* const log_; + const scoped_refptr clock_; + + // Protects 'state_' and 'next_op_id_index_'. + mutable simple_spinlock lock_; + + State state_; + int64 next_op_id_index_; + + DISALLOW_COPY_AND_ASSIGN(LocalConsensus); +}; + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_LOCAL_CONSENSUS_H_ */ diff --git a/src/kudu/consensus/log-dump.cc b/src/kudu/consensus/log-dump.cc new file mode 100644 index 000000000000..8448ba8e36b9 --- /dev/null +++ b/src/kudu/consensus/log-dump.cc @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/common/row_operations.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/log_index.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/util/env.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/metrics.h" +#include "kudu/util/pb_util.h" + +DEFINE_bool(print_headers, true, "print the log segment headers/footers"); +DEFINE_string(print_entries, "decoded", + "How to print entries:\n" + " false|0|no = don't print\n" + " true|1|yes|decoded = print them decoded\n" + " pb = print the raw protobuf\n" + " id = print only their ids"); +DEFINE_int32(truncate_data, 100, + "Truncate the data fields to the given number of bytes " + "before printing. Set to 0 to disable"); +namespace kudu { +namespace log { + +using consensus::CommitMsg; +using consensus::OperationType; +using consensus::ReplicateMsg; +using tserver::WriteRequestPB; +using std::string; +using std::vector; +using std::cout; +using std::endl; + +enum PrintEntryType { + DONT_PRINT, + PRINT_PB, + PRINT_DECODED, + PRINT_ID +}; + +static PrintEntryType ParsePrintType() { + if (ParseLeadingBoolValue(FLAGS_print_entries.c_str(), true) == false) { + return DONT_PRINT; + } else if (ParseLeadingBoolValue(FLAGS_print_entries.c_str(), false) == true || + FLAGS_print_entries == "decoded") { + return PRINT_DECODED; + } else if (FLAGS_print_entries == "pb") { + return PRINT_PB; + } else if (FLAGS_print_entries == "id") { + return PRINT_ID; + } else { + LOG(FATAL) << "Unknown value for --print_entries: " << FLAGS_print_entries; + } +} + +void PrintIdOnly(const LogEntryPB& entry) { + switch (entry.type()) { + case log::REPLICATE: + { + cout << entry.replicate().id().term() << "." 
<< entry.replicate().id().index() + << "@" << entry.replicate().timestamp() << "\t"; + cout << "REPLICATE " + << OperationType_Name(entry.replicate().op_type()); + break; + } + case log::COMMIT: + { + cout << "COMMIT " << entry.commit().commited_op_id().term() + << "." << entry.commit().commited_op_id().index(); + break; + } + default: + cout << "UNKNOWN: " << entry.ShortDebugString(); + } + + cout << endl; +} + +Status PrintDecodedWriteRequestPB(const string& indent, + const Schema& tablet_schema, + const WriteRequestPB& write) { + Schema request_schema; + RETURN_NOT_OK(SchemaFromPB(write.schema(), &request_schema)); + + Arena arena(32 * 1024, 1024 * 1024); + RowOperationsPBDecoder dec(&write.row_operations(), &request_schema, &tablet_schema, &arena); + vector ops; + RETURN_NOT_OK(dec.DecodeOperations(&ops)); + + cout << indent << "Tablet: " << write.tablet_id() << endl; + cout << indent << "Consistency: " + << ExternalConsistencyMode_Name(write.external_consistency_mode()) << endl; + if (write.has_propagated_timestamp()) { + cout << indent << "Propagated TS: " << write.propagated_timestamp() << endl; + } + + int i = 0; + for (const DecodedRowOperation& op : ops) { + // TODO (KUDU-515): Handle the case when a tablet's schema changes + // mid-segment. + cout << indent << "op " << (i++) << ": " << op.ToString(tablet_schema) << endl; + } + + return Status::OK(); +} + +Status PrintDecoded(const LogEntryPB& entry, const Schema& tablet_schema) { + PrintIdOnly(entry); + + const string indent = "\t"; + if (entry.has_replicate()) { + // We can actually decode REPLICATE messages. 
+ + const ReplicateMsg& replicate = entry.replicate(); + if (replicate.op_type() == consensus::WRITE_OP) { + RETURN_NOT_OK(PrintDecodedWriteRequestPB(indent, tablet_schema, replicate.write_request())); + } else { + cout << indent << replicate.ShortDebugString() << endl; + } + } else if (entry.has_commit()) { + // For COMMIT we'll just dump the PB + cout << indent << entry.commit().ShortDebugString() << endl; + } + + return Status::OK(); +} + +Status PrintSegment(const scoped_refptr& segment) { + PrintEntryType print_type = ParsePrintType(); + if (FLAGS_print_headers) { + cout << "Header:\n" << segment->header().DebugString(); + } + vector entries; + RETURN_NOT_OK(segment->ReadEntries(&entries)); + + if (print_type == DONT_PRINT) return Status::OK(); + + Schema tablet_schema; + RETURN_NOT_OK(SchemaFromPB(segment->header().schema(), &tablet_schema)); + + for (LogEntryPB* entry : entries) { + + if (print_type == PRINT_PB) { + if (FLAGS_truncate_data > 0) { + pb_util::TruncateFields(entry, FLAGS_truncate_data); + } + + cout << "Entry:\n" << entry->DebugString(); + } else if (print_type == PRINT_DECODED) { + RETURN_NOT_OK(PrintDecoded(*entry, tablet_schema)); + } else if (print_type == PRINT_ID) { + PrintIdOnly(*entry); + } + } + if (FLAGS_print_headers && segment->HasFooter()) { + cout << "Footer:\n" << segment->footer().DebugString(); + } + + return Status::OK(); +} + +Status DumpLog(const string& tablet_id) { + Env *env = Env::Default(); + gscoped_ptr reader; + FsManagerOpts fs_opts; + fs_opts.read_only = true; + FsManager fs_manager(env, fs_opts); + RETURN_NOT_OK(fs_manager.Open()); + RETURN_NOT_OK(LogReader::Open(&fs_manager, + scoped_refptr(), + tablet_id, + scoped_refptr(), + &reader)); + + SegmentSequence segments; + RETURN_NOT_OK(reader->GetSegmentsSnapshot(&segments)); + + for (const scoped_refptr& segment : segments) { + RETURN_NOT_OK(PrintSegment(segment)); + } + + return Status::OK(); +} + +Status DumpSegment(const string &segment_path) { + Env *env = 
Env::Default(); + scoped_refptr segment; + RETURN_NOT_OK(ReadableLogSegment::Open(env, segment_path, &segment)); + RETURN_NOT_OK(PrintSegment(segment)); + + return Status::OK(); +} + +} // namespace log +} // namespace kudu + +int main(int argc, char **argv) { + kudu::ParseCommandLineFlags(&argc, &argv, true); + if (argc != 2) { + std::cerr << "usage: " << argv[0] + << " -fs_wal_dir -fs_data_dirs " + << " | " + << std::endl; + return 1; + } + kudu::InitGoogleLoggingSafe(argv[0]); + kudu::Status s = kudu::log::DumpSegment(argv[1]); + if (s.ok()) { + return 0; + } else if (s.IsNotFound()) { + s = kudu::log::DumpLog(argv[1]); + } + if (!s.ok()) { + std::cerr << "Error: " << s.ToString() << std::endl; + return 1; + } + return 0; +} diff --git a/src/kudu/consensus/log-test-base.h b/src/kudu/consensus/log-test-base.h new file mode 100644 index 000000000000..080811d896ba --- /dev/null +++ b/src/kudu/consensus/log-test-base.h @@ -0,0 +1,378 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CONSENSUS_LOG_TEST_BASE_H +#define KUDU_CONSENSUS_LOG_TEST_BASE_H + +#include "kudu/consensus/log.h" + +#include +#include + +#include +#include +#include + +#include "kudu/common/timestamp.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/server/clock.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/server/metadata.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/env_util.h" +#include "kudu/util/metrics.h" +#include "kudu/util/path_util.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/util/stopwatch.h" + +METRIC_DECLARE_entity(tablet); + +DECLARE_int32(log_min_seconds_to_retain); + +namespace kudu { +namespace log { + +using consensus::OpId; +using consensus::CommitMsg; +using consensus::ReplicateMsg; +using consensus::WRITE_OP; +using consensus::NO_OP; + +using server::Clock; + +using tserver::WriteRequestPB; + +using tablet::TxResultPB; +using tablet::OperationResultPB; +using tablet::MemStoreTargetPB; + +const char* kTestTable = "test-log-table"; +const char* kTestTablet = "test-log-tablet"; +const bool APPEND_SYNC = true; +const bool APPEND_ASYNC = false; + +// Append a single batch of 'count' NoOps to the log. +// If 'size' is not NULL, increments it by the expected increase in log size. +// Increments 'op_id''s index once for each operation logged. 
+static Status AppendNoOpsToLogSync(const scoped_refptr& clock, + Log* log, + OpId* op_id, + int count, + int* size = NULL) { + + vector replicates; + for (int i = 0; i < count; i++) { + consensus::ReplicateRefPtr replicate = make_scoped_refptr_replicate(new ReplicateMsg()); + ReplicateMsg* repl = replicate->get(); + + repl->mutable_id()->CopyFrom(*op_id); + repl->set_op_type(NO_OP); + repl->set_timestamp(clock->Now().ToUint64()); + + // Increment op_id. + op_id->set_index(op_id->index() + 1); + + if (size) { + // If we're tracking the sizes we need to account for the fact that the Log wraps the + // log entry in an LogEntryBatchPB, and each actual entry will have a one-byte tag. + *size += repl->ByteSize() + 1; + } + replicates.push_back(replicate); + } + + // Account for the entry batch header and wrapper PB. + if (size) { + *size += log::kEntryHeaderSize + 5; + } + + Synchronizer s; + RETURN_NOT_OK(log->AsyncAppendReplicates(replicates, + s.AsStatusCallback())); + s.Wait(); + return Status::OK(); +} + +static Status AppendNoOpToLogSync(const scoped_refptr& clock, + Log* log, + OpId* op_id, + int* size = NULL) { + return AppendNoOpsToLogSync(clock, log, op_id, 1, size); +} + +class LogTestBase : public KuduTest { + public: + + typedef pair DeltaId; + + LogTestBase() + : schema_(GetSimpleTestSchema()), + log_anchor_registry_(new LogAnchorRegistry()) { + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + current_index_ = 1; + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + metric_registry_.reset(new MetricRegistry()); + metric_entity_ = METRIC_ENTITY_tablet.Instantiate(metric_registry_.get(), "log-test-base"); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + + clock_.reset(new server::HybridClock()); + ASSERT_OK(clock_->Init()); + + FLAGS_log_min_seconds_to_retain = 0; + } + + virtual void TearDown() OVERRIDE { + KuduTest::TearDown(); + STLDeleteElements(&entries_); + } + + void 
BuildLog() { + Schema schema_with_ids = SchemaBuilder(schema_).Build(); + CHECK_OK(Log::Open(options_, + fs_manager_.get(), + kTestTablet, + schema_with_ids, + 0, // schema_version + metric_entity_.get(), + &log_)); + } + + void CheckRightNumberOfSegmentFiles(int expected) { + // Test that we actually have the expected number of files in the fs. + // We should have n segments plus '.' and '..' + vector files; + ASSERT_OK(env_->GetChildren( + JoinPathSegments(fs_manager_->GetWalsRootDir(), + kTestTablet), + &files)); + int count = 0; + for (const string& s : files) { + if (HasPrefixString(s, FsManager::kWalFileNamePrefix)) { + count++; + } + } + ASSERT_EQ(expected, count); + } + + void EntriesToIdList(vector* ids) { + for (const LogEntryPB* entry : entries_) { + VLOG(2) << "Entry contents: " << entry->DebugString(); + if (entry->type() == REPLICATE) { + ids->push_back(entry->replicate().id().index()); + } + } + } + + static void CheckReplicateResult(const consensus::ReplicateRefPtr& msg, const Status& s) { + CHECK_OK(s); + } + + // Appends a batch with size 2 (1 insert, 1 mutate) to the log. + void AppendReplicateBatch(const OpId& opid, bool sync = APPEND_SYNC) { + consensus::ReplicateRefPtr replicate = make_scoped_refptr_replicate(new ReplicateMsg()); + replicate->get()->set_op_type(WRITE_OP); + replicate->get()->mutable_id()->CopyFrom(opid); + replicate->get()->set_timestamp(clock_->Now().ToUint64()); + WriteRequestPB* batch_request = replicate->get()->mutable_write_request(); + ASSERT_OK(SchemaToPB(schema_, batch_request->mutable_schema())); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, + opid.index(), + 0, + "this is a test insert", + batch_request->mutable_row_operations()); + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, + opid.index() + 1, + 0, + "this is a test mutate", + batch_request->mutable_row_operations()); + batch_request->set_tablet_id(kTestTablet); + AppendReplicateBatch(replicate, sync); + } + + // Appends the provided batch to the log. 
+ void AppendReplicateBatch(const consensus::ReplicateRefPtr& replicate, + bool sync = APPEND_SYNC) { + if (sync) { + Synchronizer s; + ASSERT_OK(log_->AsyncAppendReplicates({ replicate }, s.AsStatusCallback())); + ASSERT_OK(s.Wait()); + } else { + // AsyncAppendReplicates does not free the ReplicateMsg on completion, so we + // need to pass it through to our callback. + ASSERT_OK(log_->AsyncAppendReplicates({ replicate }, + Bind(&LogTestBase::CheckReplicateResult, replicate))); + } + } + + static void CheckCommitResult(const Status& s) { + CHECK_OK(s); + } + + // Append a commit log entry containing one entry for the insert and one + // for the mutate. + void AppendCommit(const OpId& original_opid, + bool sync = APPEND_SYNC) { + // The mrs id for the insert. + const int kTargetMrsId = 1; + + // The rs and delta ids for the mutate. + const int kTargetRsId = 0; + const int kTargetDeltaId = 0; + + AppendCommit(original_opid, kTargetMrsId, kTargetRsId, kTargetDeltaId, sync); + } + + void AppendCommit(const OpId& original_opid, + int mrs_id, int rs_id, int dms_id, + bool sync = APPEND_SYNC) { + gscoped_ptr commit(new CommitMsg); + commit->set_op_type(WRITE_OP); + + commit->mutable_commited_op_id()->CopyFrom(original_opid); + + TxResultPB* result = commit->mutable_result(); + + OperationResultPB* insert = result->add_ops(); + insert->add_mutated_stores()->set_mrs_id(mrs_id); + + OperationResultPB* mutate = result->add_ops(); + MemStoreTargetPB* target = mutate->add_mutated_stores(); + target->set_dms_id(dms_id); + target->set_rs_id(rs_id); + AppendCommit(commit.Pass(), sync); + } + + void AppendCommit(gscoped_ptr commit, bool sync = APPEND_SYNC) { + if (sync) { + Synchronizer s; + ASSERT_OK(log_->AsyncAppendCommit(commit.Pass(), s.AsStatusCallback())); + ASSERT_OK(s.Wait()); + } else { + ASSERT_OK(log_->AsyncAppendCommit(commit.Pass(), + Bind(&LogTestBase::CheckCommitResult))); + } + } + + // Appends 'count' ReplicateMsgs and the corresponding CommitMsgs to the log + 
void AppendReplicateBatchAndCommitEntryPairsToLog(int count, bool sync = true) { + for (int i = 0; i < count; i++) { + OpId opid = consensus::MakeOpId(1, current_index_); + AppendReplicateBatch(opid); + AppendCommit(opid, sync); + current_index_ += 1; + } + } + + // Append a single NO_OP entry. Increments op_id by one. + // If non-NULL, and if the write is successful, 'size' is incremented + // by the size of the written operation. + Status AppendNoOp(OpId* op_id, int* size = NULL) { + return AppendNoOpToLogSync(clock_, log_.get(), op_id, size); + } + + // Append a number of no-op entries to the log. + // Increments op_id's index by the number of records written. + // If non-NULL, 'size' keeps track of the size of the operations + // successfully written. + Status AppendNoOps(OpId* op_id, int num, int* size = NULL) { + for (int i = 0; i < num; i++) { + RETURN_NOT_OK(AppendNoOp(op_id, size)); + } + return Status::OK(); + } + + Status RollLog() { + RETURN_NOT_OK(log_->AsyncAllocateSegment()); + return log_->RollOver(); + } + + string DumpSegmentsToString(const SegmentSequence& segments) { + string dump; + for (const scoped_refptr& segment : segments) { + dump.append("------------\n"); + strings::SubstituteAndAppend(&dump, "Segment: $0, Path: $1\n", + segment->header().sequence_number(), segment->path()); + strings::SubstituteAndAppend(&dump, "Header: $0\n", + segment->header().ShortDebugString()); + if (segment->HasFooter()) { + strings::SubstituteAndAppend(&dump, "Footer: $0\n", segment->footer().ShortDebugString()); + } else { + dump.append("Footer: None or corrupt."); + } + } + return dump; + } + + protected: + const Schema schema_; + gscoped_ptr fs_manager_; + gscoped_ptr metric_registry_; + scoped_refptr metric_entity_; + scoped_refptr log_; + int32_t current_index_; + LogOptions options_; + // Reusable entries vector that deletes the entries on destruction. 
+ vector entries_; + scoped_refptr log_anchor_registry_; + scoped_refptr clock_; +}; + +// Corrupts the last segment of the provided log by either truncating it +// or modifying a byte at the given offset. +enum CorruptionType { + TRUNCATE_FILE, + FLIP_BYTE +}; + +Status CorruptLogFile(Env* env, const string& log_path, + CorruptionType type, int corruption_offset) { + faststring buf; + RETURN_NOT_OK_PREPEND(ReadFileToString(env, log_path, &buf), + "Couldn't read log"); + + switch (type) { + case TRUNCATE_FILE: + buf.resize(corruption_offset); + break; + case FLIP_BYTE: + CHECK_LT(corruption_offset, buf.size()); + buf[corruption_offset] ^= 0xff; + break; + } + + // Rewrite the file with the corrupt log. + RETURN_NOT_OK_PREPEND(WriteStringToFile(env, Slice(buf), log_path), + "Couldn't rewrite corrupt log file"); + + return Status::OK(); +} + +} // namespace log +} // namespace kudu + +#endif diff --git a/src/kudu/consensus/log-test.cc b/src/kudu/consensus/log-test.cc new file mode 100644 index 000000000000..1d4313d2f195 --- /dev/null +++ b/src/kudu/consensus/log-test.cc @@ -0,0 +1,1051 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/consensus/log-test-base.h" +#include "kudu/consensus/log_index.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/random.h" + +DEFINE_int32(num_batches, 10000, + "Number of batches to write to/read from the Log in TestWriteManyBatches"); + +DECLARE_int32(log_min_segments_to_retain); + +namespace kudu { +namespace log { + +using std::shared_ptr; +using consensus::MakeOpId; +using strings::Substitute; + +extern const char* kTestTable; +extern const char* kTestTablet; + +struct TestLogSequenceElem { + enum ElemType { + REPLICATE, + COMMIT, + ROLL + }; + ElemType type; + OpId id; +}; + +class LogTest : public LogTestBase { + public: + void CreateAndRegisterNewAnchor(int64_t log_index, vector* anchors) { + anchors->push_back(new LogAnchor()); + log_anchor_registry_->Register(log_index, CURRENT_TEST_NAME(), anchors->back()); + } + + // Create a series of NO_OP entries in the log. + // Anchor each segment on the first OpId of each log segment, + // and update op_id to point to the next valid OpId. 
+ Status AppendMultiSegmentSequence(int num_total_segments, int num_ops_per_segment, + OpId* op_id, vector* anchors) { + CHECK(op_id->IsInitialized()); + for (int i = 0; i < num_total_segments - 1; i++) { + if (anchors) { + CreateAndRegisterNewAnchor(op_id->index(), anchors); + } + RETURN_NOT_OK(AppendNoOps(op_id, num_ops_per_segment)); + RETURN_NOT_OK(RollLog()); + } + + if (anchors) { + CreateAndRegisterNewAnchor(op_id->index(), anchors); + } + RETURN_NOT_OK(AppendNoOps(op_id, num_ops_per_segment)); + return Status::OK(); + } + + Status AppendNewEmptySegmentToReader(int sequence_number, + int first_repl_index, + LogReader* reader) { + string fqp = GetTestPath(strings::Substitute("wal-00000000$0", sequence_number)); + gscoped_ptr w_log_seg; + RETURN_NOT_OK(fs_manager_->env()->NewWritableFile(fqp, &w_log_seg)); + gscoped_ptr r_log_seg; + RETURN_NOT_OK(fs_manager_->env()->NewRandomAccessFile(fqp, &r_log_seg)); + + scoped_refptr readable_segment( + new ReadableLogSegment(fqp, shared_ptr(r_log_seg.release()))); + + LogSegmentHeaderPB header; + header.set_sequence_number(sequence_number); + header.set_major_version(0); + header.set_minor_version(0); + header.set_tablet_id(kTestTablet); + SchemaToPB(GetSimpleTestSchema(), header.mutable_schema()); + + LogSegmentFooterPB footer; + footer.set_num_entries(10); + footer.set_min_replicate_index(first_repl_index); + footer.set_max_replicate_index(first_repl_index + 9); + + RETURN_NOT_OK(readable_segment->Init(header, footer, 0)); + RETURN_NOT_OK(reader->AppendSegment(readable_segment)); + return Status::OK(); + } + + void GenerateTestSequence(Random* rng, int seq_len, + vector* ops, + vector* terms_by_index); + void AppendTestSequence(const vector& seq); + + // Where to corrupt the log entry. + enum CorruptionPosition { + // Corrupt/truncate within the header. + IN_HEADER, + // Corrupt/truncate within the entry data itself. 
+ IN_ENTRY + }; + + void DoCorruptionTest(CorruptionType type, CorruptionPosition place, + Status expected_status, int expected_entries); + +}; + +// If we write more than one entry in a batch, we should be able to +// read all of those entries back. +TEST_F(LogTest, TestMultipleEntriesInABatch) { + BuildLog(); + + OpId opid; + opid.set_term(1); + opid.set_index(1); + + AppendNoOpsToLogSync(clock_, log_.get(), &opid, 2); + + // RollOver() the batch so that we have a properly formed footer. + ASSERT_OK(log_->AllocateSegmentAndRollOver()); + + vector entries; + ElementDeleter deleter(&entries); + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_OK(segments[0]->ReadEntries(&entries)); + + ASSERT_EQ(2, entries.size()); + + // Verify the index. + { + LogIndexEntry entry; + ASSERT_OK(log_->log_index_->GetEntry(1, &entry)); + ASSERT_EQ(1, entry.op_id.term()); + ASSERT_EQ(1, entry.segment_sequence_number); + int64_t offset = entry.offset_in_segment; + + ASSERT_OK(log_->log_index_->GetEntry(2, &entry)); + ASSERT_EQ(1, entry.op_id.term()); + ASSERT_EQ(1, entry.segment_sequence_number); + int64_t second_offset = entry.offset_in_segment; + + // The second entry should be at the same offset as the first entry + // since they were written in the same batch. + ASSERT_EQ(second_offset, offset); + } + + // Test LookupOpId + { + OpId loaded_op; + ASSERT_OK(log_->GetLogReader()->LookupOpId(1, &loaded_op)); + ASSERT_EQ("1.1", OpIdToString(loaded_op)); + ASSERT_OK(log_->GetLogReader()->LookupOpId(2, &loaded_op)); + ASSERT_EQ("1.2", OpIdToString(loaded_op)); + Status s = log_->GetLogReader()->LookupOpId(3, &loaded_op); + ASSERT_TRUE(s.IsNotFound()) << "unexpected status: " << s.ToString(); + } + + ASSERT_OK(log_->Close()); +} + +// Tests that everything works properly with fsync enabled: +// This also tests SyncDir() (see KUDU-261), which is called whenever +// a new log segment is initialized. 
+TEST_F(LogTest, TestFsync) { + options_.force_fsync_all = true; + BuildLog(); + + OpId opid; + opid.set_term(0); + opid.set_index(1); + + AppendNoOp(&opid); + + ASSERT_OK(log_->Close()); +} + +// Regression test for part of KUDU-735: +// if a log is not preallocated, we should properly track its on-disk size as we append to +// it. +TEST_F(LogTest, TestSizeIsMaintained) { + options_.preallocate_segments = false; + BuildLog(); + + OpId opid = MakeOpId(0, 1); + AppendNoOp(&opid); + + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + int64_t orig_size = segments[0]->file_size(); + ASSERT_GT(orig_size, 0); + + AppendNoOp(&opid); + + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + int64_t new_size = segments[0]->file_size(); + ASSERT_GT(new_size, orig_size); + + ASSERT_OK(log_->Close()); +} + +// Test that the reader can read from the log even if it hasn't been +// properly closed. +TEST_F(LogTest, TestLogNotTrimmed) { + BuildLog(); + + OpId opid; + opid.set_term(0); + opid.set_index(1); + + AppendNoOp(&opid); + + vector entries; + ElementDeleter deleter(&entries); + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_OK(segments[0]->ReadEntries(&entries)); + // Close after testing to ensure correct shutdown + // TODO : put this in TearDown() with a test on log state? + ASSERT_OK(log_->Close()); +} + +// Test that the reader will not fail if a log file is completely blank. +// This happens when it's opened but nothing has been written. +// The reader should gracefully handle this situation, but somehow expose that +// the segment is uninitialized. See KUDU-140. +TEST_F(LogTest, TestBlankLogFile) { + BuildLog(); + + // The log's reader will have a segment... + ASSERT_EQ(log_->GetLogReader()->num_segments(), 1); + + // ...and we're able to read from it. 
+ vector entries; + ElementDeleter deleter(&entries); + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_OK(segments[0]->ReadEntries(&entries)); + + // ...It's just that it's empty. + ASSERT_EQ(entries.size(), 0); +} + +void LogTest::DoCorruptionTest(CorruptionType type, CorruptionPosition place, + Status expected_status, int expected_entries) { + const int kNumEntries = 4; + BuildLog(); + OpId op_id = MakeOpId(1, 1); + ASSERT_OK(AppendNoOps(&op_id, kNumEntries)); + + // Find the entry that we want to corrupt before closing the log. + LogIndexEntry entry; + ASSERT_OK(log_->log_index_->GetEntry(4, &entry)); + + ASSERT_OK(log_->Close()); + + // Corrupt the log as specified. + int offset; + switch (place) { + case IN_HEADER: + offset = entry.offset_in_segment + 1; + break; + case IN_ENTRY: + offset = entry.offset_in_segment + kEntryHeaderSize + 1; + break; + } + ASSERT_OK(CorruptLogFile( + env_.get(), log_->ActiveSegmentPathForTests(), type, offset)); + + // Open a new reader -- we don't reuse the existing LogReader from log_ + // because it has a cached header. + gscoped_ptr reader; + ASSERT_OK(LogReader::Open(fs_manager_.get(), + make_scoped_refptr(new LogIndex(log_->log_dir_)), + kTestTablet, nullptr, &reader)); + ASSERT_EQ(1, reader->num_segments()); + + SegmentSequence segments; + ASSERT_OK(reader->GetSegmentsSnapshot(&segments)); + Status s = segments[0]->ReadEntries(&entries_); + ASSERT_EQ(s.CodeAsString(), expected_status.CodeAsString()) + << "Got unexpected status: " << s.ToString(); + + // Last entry is ignored, but we should still see the previous ones. + ASSERT_EQ(expected_entries, entries_.size()); +} +// Tests that the log reader reads up until some truncated entry is found. +// It should still return OK, since on a crash, it's acceptable to have +// a partial entry at EOF. 
+TEST_F(LogTest, TestTruncateLogInEntry) { + DoCorruptionTest(TRUNCATE_FILE, IN_ENTRY, Status::OK(), 3); +} + +// Same, but truncate in the middle of the header of that entry. +TEST_F(LogTest, TestTruncateLogInHeader) { + DoCorruptionTest(TRUNCATE_FILE, IN_HEADER, Status::OK(), 3); +} + +// Similar to the above, except flips a byte. In this case, it should return +// a Corruption instead of an OK, because we still have a valid footer in +// the file (indicating that all of the entries should be valid as well). +TEST_F(LogTest, TestCorruptLogInEntry) { + DoCorruptionTest(FLIP_BYTE, IN_ENTRY, Status::Corruption(""), 3); +} + +// Same, but corrupt in the middle of the header of that entry. +TEST_F(LogTest, TestCorruptLogInHeader) { + DoCorruptionTest(FLIP_BYTE, IN_HEADER, Status::Corruption(""), 3); +} + +// Tests that segments roll over when max segment size is reached +// and that the player plays all entries in the correct order. +TEST_F(LogTest, TestSegmentRollover) { + BuildLog(); + // Set a small segment size so that we have roll overs. + log_->SetMaxSegmentSizeForTests(990); + const int kNumEntriesPerBatch = 100; + + OpId op_id = MakeOpId(1, 1); + int num_entries = 0; + + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + + while (segments.size() < 3) { + ASSERT_OK(AppendNoOps(&op_id, kNumEntriesPerBatch)); + num_entries += kNumEntriesPerBatch; + // Update the segments + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + } + + ASSERT_FALSE(segments.back()->HasFooter()); + ASSERT_OK(log_->Close()); + + gscoped_ptr reader; + ASSERT_OK(LogReader::Open(fs_manager_.get(), NULL, kTestTablet, NULL, &reader)); + ASSERT_OK(reader->GetSegmentsSnapshot(&segments)); + + ASSERT_TRUE(segments.back()->HasFooter()); + + for (const scoped_refptr& entry : segments) { + Status s = entry->ReadEntries(&entries_); + if (!s.ok()) { + FAIL() << "Failed to read entries in segment: " << entry->path() + << ". 
Status: " << s.ToString() + << ".\nSegments: " << DumpSegmentsToString(segments); + } + } + + ASSERT_EQ(num_entries, entries_.size()); +} + +TEST_F(LogTest, TestWriteAndReadToAndFromInProgressSegment) { + const int kNumEntries = 4; + BuildLog(); + + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(segments.size(), 1); + scoped_refptr readable_segment = segments[0]; + + int header_size = log_->active_segment_->written_offset(); + ASSERT_GT(header_size, 0); + readable_segment->UpdateReadableToOffset(header_size); + + vector entries; + + // Reading the readable segment now should return OK but yield no + // entries. + ASSERT_OK(readable_segment->ReadEntries(&entries)); + ASSERT_EQ(entries.size(), 0); + + // Dummy add_entry to help us estimate the size of what + // gets written to disk. + LogEntryBatchPB batch; + OpId op_id = MakeOpId(1, 1); + LogEntryPB* log_entry = batch.add_entry(); + log_entry->set_type(REPLICATE); + ReplicateMsg* repl = log_entry->mutable_replicate(); + repl->mutable_id()->CopyFrom(op_id); + repl->set_op_type(NO_OP); + repl->set_timestamp(0L); + + // Entries are prefixed with a header. + int single_entry_size = batch.ByteSize() + kEntryHeaderSize; + + int written_entries_size = header_size; + ASSERT_OK(AppendNoOps(&op_id, kNumEntries, &written_entries_size)); + ASSERT_EQ(single_entry_size * kNumEntries + header_size, written_entries_size); + ASSERT_EQ(written_entries_size, log_->active_segment_->written_offset()); + + // Updating the readable segment with the offset of the first entry should + // make it read a single entry even though there are several in the log. + readable_segment->UpdateReadableToOffset(header_size + single_entry_size); + ASSERT_OK(readable_segment->ReadEntries(&entries)); + ASSERT_EQ(entries.size(), 1); + STLDeleteElements(&entries); + + // Now append another entry so that the Log sets the correct readable offset + // on the reader. 
+ ASSERT_OK(AppendNoOps(&op_id, 1, &written_entries_size)); + + // Now the reader should be able to read all 5 entries. + ASSERT_OK(readable_segment->ReadEntries(&entries)); + ASSERT_EQ(entries.size(), 5); + STLDeleteElements(&entries); + + // Offset should get updated for an additional entry. + ASSERT_EQ(single_entry_size * (kNumEntries + 1) + header_size, + written_entries_size); + ASSERT_EQ(written_entries_size, log_->active_segment_->written_offset()); + + // When we roll it should go back to the header size. + ASSERT_OK(log_->AllocateSegmentAndRollOver()); + ASSERT_EQ(header_size, log_->active_segment_->written_offset()); + written_entries_size = header_size; + + // Now that we closed the original segment. If we get a segment from the reader + // again, we should get one with a footer and we should be able to read all entries. + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(segments.size(), 2); + readable_segment = segments[0]; + ASSERT_OK(readable_segment->ReadEntries(&entries)); + ASSERT_EQ(entries.size(), 5); + STLDeleteElements(&entries); + + // Offset should get updated for an additional entry, again. + ASSERT_OK(AppendNoOp(&op_id, &written_entries_size)); + ASSERT_EQ(single_entry_size + header_size, written_entries_size); + ASSERT_EQ(written_entries_size, log_->active_segment_->written_offset()); +} + +// Tests that segments can be GC'd while the log is running. +TEST_F(LogTest, TestGCWithLogRunning) { + BuildLog(); + + vector anchors; + ElementDeleter deleter(&anchors); + + SegmentSequence segments; + + const int kNumTotalSegments = 4; + const int kNumOpsPerSegment = 5; + int num_gced_segments; + OpId op_id = MakeOpId(1, 1); + int64_t anchored_index = -1; + + ASSERT_OK(AppendMultiSegmentSequence(kNumTotalSegments, kNumOpsPerSegment, + &op_id, &anchors)); + + // We should get 4 anchors, each pointing at the beginning of a new segment + ASSERT_EQ(anchors.size(), 4); + + // Anchors should prevent GC. 
+ ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(4, segments.size()) << DumpSegmentsToString(segments); + ASSERT_OK(log_anchor_registry_->GetEarliestRegisteredLogIndex(&anchored_index)); + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(4, segments.size()) << DumpSegmentsToString(segments); + + // Freeing the first 2 anchors should allow GC of them. + ASSERT_OK(log_anchor_registry_->Unregister(anchors[0])); + ASSERT_OK(log_anchor_registry_->Unregister(anchors[1])); + ASSERT_OK(log_anchor_registry_->GetEarliestRegisteredLogIndex(&anchored_index)); + // We should now be anchored on op 0.11, i.e. on the 3rd segment + ASSERT_EQ(anchors[2]->log_index, anchored_index); + + // However, first, we'll try bumping the min retention threshold and + // verify that we don't GC any. + { + google::FlagSaver saver; + FLAGS_log_min_segments_to_retain = 10; + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_EQ(0, num_gced_segments); + } + + // Try again without the modified flag. + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_EQ(2, num_gced_segments) << DumpSegmentsToString(segments); + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(2, segments.size()) << DumpSegmentsToString(segments); + + // Release the remaining "rolled segment" anchor. GC will not delete the + // last rolled segment. + ASSERT_OK(log_anchor_registry_->Unregister(anchors[2])); + ASSERT_OK(log_anchor_registry_->GetEarliestRegisteredLogIndex(&anchored_index)); + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_EQ(0, num_gced_segments) << DumpSegmentsToString(segments); + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(2, segments.size()) << DumpSegmentsToString(segments); + + // Check that we get a NotFound if we try to read before the GCed point. 
+ { + vector repls; + ElementDeleter d(&repls); + Status s = log_->GetLogReader()->ReadReplicatesInRange( + 1, 2, LogReader::kNoSizeLimit, &repls); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); + } + + ASSERT_OK(log_->Close()); + CheckRightNumberOfSegmentFiles(2); + + // We skip the first three, since we unregistered them above. + for (int i = 3; i < kNumTotalSegments; i++) { + ASSERT_OK(log_anchor_registry_->Unregister(anchors[i])); + } +} + +// Test that, when we are set to retain a given number of log segments, +// we also retain any relevant log index chunks, even if those operations +// are not necessary for recovery. +TEST_F(LogTest, TestGCOfIndexChunks) { + FLAGS_log_min_segments_to_retain = 4; + BuildLog(); + + // Append some segments which cross from one index chunk into another. + // 999990-999994 \___ the first index + // 999995-999999 / chunk points to these + // 1000000-100004 \_ + // 1000005-100009 _|- the second index chunk points to these + // 1000010- / + const int kNumTotalSegments = 5; + const int kNumOpsPerSegment = 5; + OpId op_id = MakeOpId(1, 999990); + ASSERT_OK(AppendMultiSegmentSequence(kNumTotalSegments, kNumOpsPerSegment, + &op_id, nullptr)); + + // Run a GC on an op in the second index chunk. We should remove only the + // earliest segment, because we are set to retain 4. + int num_gced_segments = 0; + ASSERT_OK(log_->GC(1000006, &num_gced_segments)); + ASSERT_EQ(1, num_gced_segments); + + // And we should still be able to read ops in the retained segment, even though + // the GC index was higher. + OpId loaded_op; + ASSERT_OK(log_->GetLogReader()->LookupOpId(999995, &loaded_op)); + ASSERT_EQ("1.999995", OpIdToString(loaded_op)); + + // If we drop the retention count down to 1, we can now GC, and the log index + // chunk should also be GCed. 
+ FLAGS_log_min_segments_to_retain = 1; + ASSERT_OK(log_->GC(1000003, &num_gced_segments)); + ASSERT_EQ(1, num_gced_segments); + + Status s = log_->GetLogReader()->LookupOpId(999995, &loaded_op); + ASSERT_TRUE(s.IsNotFound()) << "unexpected status: " << s.ToString(); +} + +// Tests that we can append FLUSH_MARKER messages to the log queue to make sure +// all messages up to a certain point were fsync()ed without actually +// writing them to the log. +TEST_F(LogTest, TestWaitUntilAllFlushed) { + BuildLog(); + // Append 2 replicate/commit pairs asynchronously + AppendReplicateBatchAndCommitEntryPairsToLog(2, APPEND_ASYNC); + + ASSERT_OK(log_->WaitUntilAllFlushed()); + + // Make sure we only get 4 entries back and that no FLUSH_MARKER commit is found. + vector > segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_OK(segments[0]->ReadEntries(&entries_)); + ASSERT_EQ(entries_.size(), 4); + for (int i = 0; i < 4 ; i++) { + if (i % 2 == 0) { + ASSERT_TRUE(entries_[i]->has_replicate()); + } else { + ASSERT_TRUE(entries_[i]->has_commit()); + ASSERT_EQ(WRITE_OP, entries_[i]->commit().op_type()); + } + } +} + +// Tests log reopening and that GC'ing the old log's segments works. +TEST_F(LogTest, TestLogReopenAndGC) { + BuildLog(); + + SegmentSequence segments; + + vector anchors; + ElementDeleter deleter(&anchors); + + const int kNumTotalSegments = 3; + const int kNumOpsPerSegment = 5; + int num_gced_segments; + OpId op_id = MakeOpId(1, 1); + int64_t anchored_index = -1; + + ASSERT_OK(AppendMultiSegmentSequence(kNumTotalSegments, kNumOpsPerSegment, + &op_id, &anchors)); + // Anchors should prevent GC. 
+ ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(3, segments.size()); + ASSERT_OK(log_anchor_registry_->GetEarliestRegisteredLogIndex(&anchored_index)); + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(3, segments.size()); + + ASSERT_OK(log_->Close()); + + // Now reopen the log as if we had replayed the state into the stores. + // that were in memory and do GC. + BuildLog(); + + // The "old" data consists of 3 segments. We still hold anchors. + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)) + ASSERT_EQ(4, segments.size()); + + // Write to a new log segment, as if we had taken new requests and the + // mem stores are holding anchors, but don't roll it. + CreateAndRegisterNewAnchor(op_id.index(), &anchors); + ASSERT_OK(AppendNoOps(&op_id, kNumOpsPerSegment)); + + // Now release the "old" anchors and GC them. + for (int i = 0; i < 3; i++) { + ASSERT_OK(log_anchor_registry_->Unregister(anchors[i])); + } + ASSERT_OK(log_anchor_registry_->GetEarliestRegisteredLogIndex(&anchored_index)); + + // If we set the min_seconds_to_retain high, then we'll retain the logs even + // though we could GC them based on our anchoring. + FLAGS_log_min_seconds_to_retain = 500; + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_EQ(0, num_gced_segments); + + // Turn off the time-based retention and try GCing again. This time + // we should succeed. + FLAGS_log_min_seconds_to_retain = 0; + ASSERT_OK(log_->GC(anchored_index, &num_gced_segments)); + ASSERT_EQ(2, num_gced_segments); + + // After GC there should be only one left, besides the one currently being + // written to. That is because min_segments_to_retain defaults to 2. 
+ ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(2, segments.size()) << DumpSegmentsToString(segments); + ASSERT_OK(log_->Close()); + + CheckRightNumberOfSegmentFiles(2); + + // Unregister the final anchor. + ASSERT_OK(log_anchor_registry_->Unregister(anchors[3])); +} + +// Helper to measure the performance of the log. +TEST_F(LogTest, TestWriteManyBatches) { + uint64_t num_batches = 10; + if (AllowSlowTests()) { + num_batches = FLAGS_num_batches; + } + BuildLog(); + + LOG(INFO)<< "Starting to write " << num_batches << " to log"; + LOG_TIMING(INFO, "Wrote all batches to log") { + AppendReplicateBatchAndCommitEntryPairsToLog(num_batches); + } + ASSERT_OK(log_->Close()); + LOG(INFO) << "Done writing"; + + LOG_TIMING(INFO, "Read all entries from Log") { + LOG(INFO) << "Starting to read log"; + uint32_t num_entries = 0; + + vector > segments; + + gscoped_ptr reader; + ASSERT_OK(LogReader::Open(fs_manager_.get(), NULL, kTestTablet, NULL, &reader)); + ASSERT_OK(reader->GetSegmentsSnapshot(&segments)); + + for (const scoped_refptr entry : segments) { + STLDeleteElements(&entries_); + ASSERT_OK(entry->ReadEntries(&entries_)); + num_entries += entries_.size(); + } + ASSERT_EQ(num_entries, num_batches * 2); + LOG(INFO) << "End readfile"; + } +} + +// This tests that querying LogReader works. 
+// This sets up a reader with some segments to query which amount to the +// following: +// seg002: 0.10 through 0.19 +// seg003: 0.20 through 0.29 +// seg004: 0.30 through 0.39 +TEST_F(LogTest, TestLogReader) { + LogReader reader(fs_manager_.get(), + scoped_refptr(), + kTestTablet, + nullptr); + reader.InitEmptyReaderForTests(); + ASSERT_OK(AppendNewEmptySegmentToReader(2, 10, &reader)); + ASSERT_OK(AppendNewEmptySegmentToReader(3, 20, &reader)); + ASSERT_OK(AppendNewEmptySegmentToReader(4, 30, &reader)); + + OpId op; + op.set_term(0); + SegmentSequence segments; + + // Queries for segment prefixes (used for GC) + + // Asking the reader the prefix of segments that does not include op 1 + // should return the empty set. + ASSERT_OK(reader.GetSegmentPrefixNotIncluding(1, &segments)); + ASSERT_TRUE(segments.empty()); + + // .. same for op 10 + ASSERT_OK(reader.GetSegmentPrefixNotIncluding(10, &segments)); + ASSERT_TRUE(segments.empty()); + + // Asking for the prefix of segments not including op 20 should return + // the first segment, since 20 is the first operation in segment 3. + ASSERT_OK(reader.GetSegmentPrefixNotIncluding(20, &segments)); + ASSERT_EQ(segments.size(), 1); + ASSERT_EQ(segments[0]->header().sequence_number(), 2); + + // Asking for 30 should include the first two. + ASSERT_OK(reader.GetSegmentPrefixNotIncluding(30, &segments)); + ASSERT_EQ(segments.size(), 2); + ASSERT_EQ(segments[0]->header().sequence_number(), 2); + ASSERT_EQ(segments[1]->header().sequence_number(), 3); + + // Asking for anything higher should return all segments. + ASSERT_OK(reader.GetSegmentPrefixNotIncluding(1000, &segments)); + ASSERT_EQ(segments.size(), 3); + ASSERT_EQ(segments[0]->header().sequence_number(), 2); + ASSERT_EQ(segments[1]->header().sequence_number(), 3); + + // Queries for specific segment sequence numbers. 
+ scoped_refptr segment = reader.GetSegmentBySequenceNumber(2); + ASSERT_EQ(2, segment->header().sequence_number()); + segment = reader.GetSegmentBySequenceNumber(3); + ASSERT_EQ(3, segment->header().sequence_number()); + + segment = reader.GetSegmentBySequenceNumber(4); + ASSERT_EQ(4, segment->header().sequence_number()); + + segment = reader.GetSegmentBySequenceNumber(5); + ASSERT_TRUE(segment.get() == nullptr); +} + +// Test that, even if the LogReader's index is empty because no segments +// have been properly closed, we can still read the entries as the reader +// returns the current segment. +TEST_F(LogTest, TestLogReaderReturnsLatestSegmentIfIndexEmpty) { + BuildLog(); + + OpId opid = MakeOpId(1, 1); + AppendCommit(opid, APPEND_ASYNC); + AppendReplicateBatch(opid, APPEND_SYNC); + + SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(segments.size(), 1); + + vector entries; + ElementDeleter deleter(&entries); + ASSERT_OK(segments[0]->ReadEntries(&entries)); + ASSERT_EQ(2, entries.size()); +} + +TEST_F(LogTest, TestOpIdUtils) { + OpId id = MakeOpId(1, 2); + ASSERT_EQ("1.2", consensus::OpIdToString(id)); + ASSERT_EQ(1, id.term()); + ASSERT_EQ(2, id.index()); +} + +std::ostream& operator<<(std::ostream& os, const TestLogSequenceElem& elem) { + switch (elem.type) { + case TestLogSequenceElem::ROLL: + os << "ROLL"; + break; + case TestLogSequenceElem::REPLICATE: + os << "R" << elem.id; + break; + case TestLogSequenceElem::COMMIT: + os << "C" << elem.id; + break; + } + return os; +} + +// Generates a plausible sequence of items in the log, including term changes, moving the +// index backwards, log rolls, etc. +// +// NOTE: this log sequence may contain some aberrations which would not occur in a real +// consensus log, but our API supports them. In the future we may want to add assertions +// to the Log implementation that prevent such aberrations, in which case we'd need to +// modify this. 
+void LogTest::GenerateTestSequence(Random* rng, int seq_len, + vector* ops, + vector* terms_by_index) { + terms_by_index->assign(seq_len + 1, -1); + int64_t committed_index = 0; + int64_t max_repl_index = 0; + + OpId id = MakeOpId(1, 0); + for (int i = 0; i < seq_len; i++) { + if (rng->OneIn(5)) { + // Reset term - it may stay the same, or go up/down + id.set_term(std::max(static_cast(1), id.term() + rng->Uniform(5) - 2)); + } + + // Advance index by exactly one + id.set_index(id.index() + 1); + + if (rng->OneIn(5)) { + // Move index backward a bit, but not past the committed index + id.set_index(std::max(committed_index + 1, id.index() - rng->Uniform(5))); + } + + // Roll the log sometimes + if (i != 0 && rng->OneIn(15)) { + TestLogSequenceElem op; + op.type = TestLogSequenceElem::ROLL; + ops->push_back(op); + } + + TestLogSequenceElem op; + op.type = TestLogSequenceElem::REPLICATE; + op.id = id; + ops->push_back(op); + (*terms_by_index)[id.index()] = id.term(); + max_repl_index = std::max(max_repl_index, id.index()); + + // Advance the commit index sometimes + if (rng->OneIn(5)) { + while (committed_index < id.index()) { + committed_index++; + TestLogSequenceElem op; + op.type = TestLogSequenceElem::COMMIT; + op.id = MakeOpId((*terms_by_index)[committed_index], committed_index); + ops->push_back(op); + } + } + } + terms_by_index->resize(max_repl_index + 1); +} + +void LogTest::AppendTestSequence(const vector& seq) { + for (const TestLogSequenceElem& e : seq) { + VLOG(1) << "Appending: " << e; + switch (e.type) { + case TestLogSequenceElem::REPLICATE: + { + OpId id(e.id); + ASSERT_OK(AppendNoOp(&id)); + break; + } + case TestLogSequenceElem::COMMIT: + { + gscoped_ptr commit(new CommitMsg); + commit->set_op_type(NO_OP); + commit->mutable_commited_op_id()->CopyFrom(e.id); + Synchronizer s; + ASSERT_OK(log_->AsyncAppendCommit(commit.Pass(), s.AsStatusCallback())); + ASSERT_OK(s.Wait()); + } + case TestLogSequenceElem::ROLL: + { + ASSERT_OK(RollLog()); + } + } + } +} 
+
+// Returns a uniformly random int in the closed interval
+// [min_inclusive, max_inclusive].
+static int RandInRange(Random* r, int min_inclusive, int max_inclusive) {
+  int width = max_inclusive - min_inclusive + 1;
+  return min_inclusive + r->Uniform(width);
+}
+
+// Test that if multiple REPLICATE entries are written for the same index,
+// that we read the latest one.
+//
+// This is a randomized test: we generate a plausible sequence of log messages,
+// write it out, and then read random ranges of log indexes, making sure we
+// always see the correct term for each REPLICATE message (i.e. whichever term
+// was the last to append it).
+TEST_F(LogTest, TestReadLogWithReplacedReplicates) {
+  const int kSequenceLength = AllowSlowTests() ? 1000 : 50;
+
+  Random rng(SeedRandom());
+  vector<int64_t> terms_by_index;
+  vector<TestLogSequenceElem> seq;
+  GenerateTestSequence(&rng, kSequenceLength, &seq, &terms_by_index);
+  LOG(INFO) << "test sequence: " << seq;
+  const int64_t max_repl_index = terms_by_index.size() - 1;
+  LOG(INFO) << "max_repl_index: " << max_repl_index;
+
+  // Write the test sequence to the log.
+  // TODO: should consider adding batching here of multiple replicates
+  BuildLog();
+  AppendTestSequence(seq);
+
+  const int kNumRandomReads = 100;
+
+  // We'll advance 'gc_index' randomly through the log until we've gotten to
+  // the end. This ensures that, when we GC, we don't ever remove the latest
+  // version of a replicate message unintentionally.
+  LogReader* reader = log_->GetLogReader();
+  for (int gc_index = 1; gc_index < max_repl_index;) {
+    SCOPED_TRACE(Substitute("after GCing $0", gc_index));
+
+    // Test reading random ranges of indexes and verifying that we get back the
+    // REPLICATE messages with the correct terms.
+    for (int random_read = 0; random_read < kNumRandomReads; random_read++) {
+      int start_index = RandInRange(&rng, gc_index, max_repl_index - 1);
+      int end_index = RandInRange(&rng, start_index, max_repl_index);
+      {
+        SCOPED_TRACE(Substitute("Reading $0-$1", start_index, end_index));
+        vector<ReplicateMsg*> repls;
+        ElementDeleter d(&repls);
+        ASSERT_OK(reader->ReadReplicatesInRange(
+            start_index, end_index, LogReader::kNoSizeLimit, &repls));
+        ASSERT_EQ(end_index - start_index + 1, repls.size());
+        int expected_index = start_index;
+        for (const ReplicateMsg* repl : repls) {
+          ASSERT_EQ(expected_index, repl->id().index());
+          ASSERT_EQ(terms_by_index[expected_index], repl->id().term());
+          expected_index++;
+        }
+      }
+
+      // Snapshot the read metrics so that, after the size-limited read below,
+      // we can verify the counters advanced (i.e. it did real I/O work).
+      int64_t bytes_read = log_->reader_->bytes_read_->value();
+      int64_t entries_read = log_->reader_->entries_read_->value();
+      int64_t read_batch_count = log_->reader_->read_batch_latency_->TotalCount();
+      EXPECT_GT(log_->reader_->bytes_read_->value(), 0);
+      EXPECT_GT(log_->reader_->entries_read_->value(), 0);
+      EXPECT_GT(log_->reader_->read_batch_latency_->TotalCount(), 0);
+
+      // Test a size-limited read.
+      int size_limit = RandInRange(&rng, 1, 1000);
+      {
+        SCOPED_TRACE(Substitute("Reading $0-$1 with size limit $2",
+                                start_index, end_index, size_limit));
+        vector<ReplicateMsg*> repls;
+        ElementDeleter d(&repls);
+        ASSERT_OK(reader->ReadReplicatesInRange(start_index, end_index, size_limit, &repls));
+        ASSERT_LE(repls.size(), end_index - start_index + 1);
+        int total_size = 0;
+        int expected_index = start_index;
+        for (const ReplicateMsg* repl : repls) {
+          ASSERT_EQ(expected_index, repl->id().index());
+          ASSERT_EQ(terms_by_index[expected_index], repl->id().term());
+          expected_index++;
+          total_size += repl->SpaceUsed();
+        }
+        if (total_size > size_limit) {
+          // A single message may exceed the limit; it is still returned alone.
+          ASSERT_EQ(1, repls.size());
+        } else {
+          ASSERT_LE(total_size, size_limit);
+        }
+      }
+
+      EXPECT_GT(log_->reader_->bytes_read_->value(), bytes_read);
+      EXPECT_GT(log_->reader_->entries_read_->value(), entries_read);
+      EXPECT_GT(log_->reader_->read_batch_latency_->TotalCount(), read_batch_count);
+    }
+
+    int num_gced = 0;
+    ASSERT_OK(log_->GC(gc_index, &num_gced));
+    gc_index += rng.Uniform(10);
+  }
+}
+
+// Test various situations where we expect different segments depending on what the
+// min log index is.
+TEST_F(LogTest, TestGetMaxIndexesToSegmentSizeMap) {
+  FLAGS_log_min_segments_to_retain = 2;
+  BuildLog();
+
+  const int kNumTotalSegments = 5;
+  const int kNumOpsPerSegment = 5;
+  OpId op_id = MakeOpId(1, 10);
+  // Create 5 segments, starting from log index 10, with 5 ops per segment.
+  ASSERT_OK(AppendMultiSegmentSequence(kNumTotalSegments, kNumOpsPerSegment,
+                                       &op_id, nullptr));
+
+  std::map<int64_t, int64_t> max_idx_to_segment_size;
+
+  // Check getting all the segments we can get rid of (5 - 2).
+  log_->GetMaxIndexesToSegmentSizeMap(10, &max_idx_to_segment_size);
+  ASSERT_EQ(3, max_idx_to_segment_size.size());
+  max_idx_to_segment_size.clear();
+
+  // Check that even when the min index is the last index from the oldest segment,
+  // we still return 3.
+ log_->GetMaxIndexesToSegmentSizeMap(14, &max_idx_to_segment_size); + ASSERT_EQ(3, max_idx_to_segment_size.size()); + max_idx_to_segment_size.clear(); + + // Check that if the first segment is GCable, we get 2 back. + log_->GetMaxIndexesToSegmentSizeMap(15, &max_idx_to_segment_size); + ASSERT_EQ(2, max_idx_to_segment_size.size()); + max_idx_to_segment_size.clear(); + + // Check that if the min index is at the very end of the only segment we can get rid of that we + // get 1 back. + log_->GetMaxIndexesToSegmentSizeMap(24, &max_idx_to_segment_size); + ASSERT_EQ(1, max_idx_to_segment_size.size()); + max_idx_to_segment_size.clear(); + + // Check that we don't get anything back when there's nothing we want to get rid of. + log_->GetMaxIndexesToSegmentSizeMap(25, &max_idx_to_segment_size); + ASSERT_EQ(0, max_idx_to_segment_size.size()); + + // Sanity check that even if the min log index is the newest op that nothing breaks and that + // we get 0 segments back. + log_->GetMaxIndexesToSegmentSizeMap(35, &max_idx_to_segment_size); + ASSERT_EQ(0, max_idx_to_segment_size.size()); + + // Check that logs that would normally count for log retention won't be returned since they are + // too young. + FLAGS_log_min_seconds_to_retain = 500; + log_->GetMaxIndexesToSegmentSizeMap(10, &max_idx_to_segment_size); + ASSERT_EQ(0, max_idx_to_segment_size.size()); +} +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log.cc b/src/kudu/consensus/log.cc new file mode 100644 index 000000000000..05abfd870b65 --- /dev/null +++ b/src/kudu/consensus/log.cc @@ -0,0 +1,1021 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/consensus/log.h" + +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/log_index.h" +#include "kudu/consensus/log_metrics.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/consensus/log_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/coding.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/env_util.h" +#include "kudu/util/fault_injection.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/kernel_stack_watchdog.h" +#include "kudu/util/logging.h" +#include "kudu/util/metrics.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/random.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/trace.h" + +// Log retention configuration. +// ----------------------------- +DEFINE_int32(log_min_segments_to_retain, 2, + "The minimum number of past log segments to keep at all times," + " regardless of what is required for durability. " + "Must be at least 1."); +TAG_FLAG(log_min_segments_to_retain, runtime); +TAG_FLAG(log_min_segments_to_retain, advanced); + +DEFINE_int32(log_min_seconds_to_retain, 300, + "The minimum number of seconds for which to keep log segments to keep at all times, " + "regardless of what is required for durability. 
Logs may be still retained for " + "a longer amount of time if they are necessary for correct restart. This should be " + "set long enough such that a tablet server which has temporarily failed can be " + "restarted within the given time period. If a server is down for longer than this " + "amount of time, it is possible that its tablets will be re-replicated on other " + "machines."); +TAG_FLAG(log_min_seconds_to_retain, runtime); +TAG_FLAG(log_min_seconds_to_retain, advanced); + +// Group commit configuration. +// ----------------------------- +DEFINE_int32(group_commit_queue_size_bytes, 4 * 1024 * 1024, + "Maximum size of the group commit queue in bytes"); +TAG_FLAG(group_commit_queue_size_bytes, advanced); + +// Fault/latency injection flags. +// ----------------------------- +DEFINE_bool(log_inject_latency, false, + "If true, injects artificial latency in log sync operations. " + "Advanced option. Use at your own risk -- has a negative effect " + "on performance for obvious reasons!"); +DEFINE_int32(log_inject_latency_ms_mean, 100, + "The number of milliseconds of latency to inject, on average. " + "Only takes effect if --log_inject_latency is true"); +DEFINE_int32(log_inject_latency_ms_stddev, 100, + "The standard deviation of latency to inject in the log. " + "Only takes effect if --log_inject_latency is true"); +DEFINE_double(fault_crash_before_append_commit, 0.0, + "Fraction of the time when the server will crash just before appending a " + "COMMIT message to the log. 
(For testing only!)"); +TAG_FLAG(log_inject_latency, unsafe); +TAG_FLAG(log_inject_latency_ms_mean, unsafe); +TAG_FLAG(log_inject_latency_ms_stddev, unsafe); +TAG_FLAG(fault_crash_before_append_commit, unsafe); + +// Validate that log_min_segments_to_retain >= 1 +static bool ValidateLogsToRetain(const char* flagname, int value) { + if (value >= 1) { + return true; + } + LOG(ERROR) << strings::Substitute("$0 must be at least 1, value $1 is invalid", + flagname, value); + return false; +} +static bool dummy = google::RegisterFlagValidator( + &FLAGS_log_min_segments_to_retain, &ValidateLogsToRetain); + +static const char kSegmentPlaceholderFileTemplate[] = ".tmp.newsegmentXXXXXX"; + +namespace kudu { +namespace log { + +using consensus::CommitMsg; +using consensus::OpId; +using consensus::ReplicateRefPtr; +using env_util::OpenFileForRandom; +using std::shared_ptr; +using strings::Substitute; + +// This class is responsible for managing the thread that appends to +// the log file. +class Log::AppendThread { + public: + explicit AppendThread(Log* log); + + // Initializes the objects and starts the thread. + Status Init(); + + // Waits until the last enqueued elements are processed, sets the + // Appender thread to closing state. If any entries are added to the + // queue during the process, invoke their callbacks' 'OnFailure()' + // method. + void Shutdown(); + + private: + void RunThread(); + + Log* const log_; + + // Lock to protect access to thread_ during shutdown. 
+ mutable boost::mutex lock_; + scoped_refptr thread_; +}; + + +Log::AppendThread::AppendThread(Log *log) + : log_(log) { +} + +Status Log::AppendThread::Init() { + DCHECK(!thread_) << "Already initialized"; + VLOG(1) << "Starting log append thread for tablet " << log_->tablet_id(); + RETURN_NOT_OK(kudu::Thread::Create("log", "appender", + &AppendThread::RunThread, this, &thread_)); + return Status::OK(); +} + +void Log::AppendThread::RunThread() { + bool shutting_down = false; + while (PREDICT_TRUE(!shutting_down)) { + std::vector entry_batches; + ElementDeleter d(&entry_batches); + + // We shut down the entry_queue when it's time to shut down the append + // thread, which causes this call to return false, while still populating + // the entry_batches vector with the final set of log entry batches that + // were enqueued. We finish processing this last bunch of log entry batches + // before exiting the main RunThread() loop. + if (PREDICT_FALSE(!log_->entry_queue()->BlockingDrainTo(&entry_batches))) { + shutting_down = true; + } + + if (log_->metrics_) { + log_->metrics_->entry_batches_per_group->Increment(entry_batches.size()); + } + TRACE_EVENT1("log", "batch", "batch_size", entry_batches.size()); + + SCOPED_LATENCY_METRIC(log_->metrics_, group_commit_latency); + + bool is_all_commits = true; + for (LogEntryBatch* entry_batch : entry_batches) { + entry_batch->WaitForReady(); + TRACE_EVENT_FLOW_END0("log", "Batch", entry_batch); + Status s = log_->DoAppend(entry_batch); + if (PREDICT_FALSE(!s.ok())) { + LOG(ERROR) << "Error appending to the log: " << s.ToString(); + DLOG(FATAL) << "Aborting: " << s.ToString(); + entry_batch->set_failed_to_append(); + // TODO If a single transaction fails to append, should we + // abort all subsequent transactions in this batch or allow + // them to be appended? What about transactions in future + // batches? 
+ if (!entry_batch->callback().is_null()) { + entry_batch->callback().Run(s); + } + } + if (is_all_commits && entry_batch->type_ != COMMIT) { + is_all_commits = false; + } + } + + Status s; + if (!is_all_commits) { + s = log_->Sync(); + } + if (PREDICT_FALSE(!s.ok())) { + LOG(ERROR) << "Error syncing log" << s.ToString(); + DLOG(FATAL) << "Aborting: " << s.ToString(); + for (LogEntryBatch* entry_batch : entry_batches) { + if (!entry_batch->callback().is_null()) { + entry_batch->callback().Run(s); + } + } + } else { + TRACE_EVENT0("log", "Callbacks"); + VLOG(2) << "Synchronized " << entry_batches.size() << " entry batches"; + SCOPED_WATCH_STACK(100); + for (LogEntryBatch* entry_batch : entry_batches) { + if (PREDICT_TRUE(!entry_batch->failed_to_append() + && !entry_batch->callback().is_null())) { + entry_batch->callback().Run(Status::OK()); + } + // It's important to delete each batch as we see it, because + // deleting it may free up memory from memory trackers, and the + // callback of a later batch may want to use that memory. + delete entry_batch; + } + entry_batches.clear(); + } + } + VLOG(1) << "Exiting AppendThread for tablet " << log_->tablet_id(); +} + +void Log::AppendThread::Shutdown() { + log_->entry_queue()->Shutdown(); + boost::lock_guard lock_guard(lock_); + if (thread_) { + VLOG(1) << "Shutting down log append thread for tablet " << log_->tablet_id(); + CHECK_OK(ThreadJoiner(thread_.get()).Join()); + VLOG(1) << "Log append thread for tablet " << log_->tablet_id() << " is shut down"; + thread_.reset(); + } +} + +// This task is submitted to allocation_pool_ in order to +// asynchronously pre-allocate new log segments. 
+void Log::SegmentAllocationTask() { + allocation_status_.Set(PreAllocateNewSegment()); +} + +const Status Log::kLogShutdownStatus( + Status::ServiceUnavailable("WAL is shutting down", "", ESHUTDOWN)); + +const uint64_t Log::kInitialLogSegmentSequenceNumber = 0L; + +Status Log::Open(const LogOptions &options, + FsManager *fs_manager, + const std::string& tablet_id, + const Schema& schema, + uint32_t schema_version, + const scoped_refptr& metric_entity, + scoped_refptr* log) { + + string tablet_wal_path = fs_manager->GetTabletWalDir(tablet_id); + RETURN_NOT_OK(fs_manager->CreateDirIfMissing(tablet_wal_path)); + + scoped_refptr new_log(new Log(options, + fs_manager, + tablet_wal_path, + tablet_id, + schema, + schema_version, + metric_entity)); + RETURN_NOT_OK(new_log->Init()); + log->swap(new_log); + return Status::OK(); +} + +Log::Log(LogOptions options, FsManager* fs_manager, string log_path, + string tablet_id, const Schema& schema, uint32_t schema_version, + const scoped_refptr& metric_entity) + : options_(std::move(options)), + fs_manager_(fs_manager), + log_dir_(std::move(log_path)), + tablet_id_(std::move(tablet_id)), + schema_(schema), + schema_version_(schema_version), + active_segment_sequence_number_(0), + log_state_(kLogInitialized), + max_segment_size_(options_.segment_size_mb * 1024 * 1024), + entry_batch_queue_(FLAGS_group_commit_queue_size_bytes), + append_thread_(new AppendThread(this)), + force_sync_all_(options_.force_fsync_all), + sync_disabled_(false), + allocation_state_(kAllocationNotStarted), + metric_entity_(metric_entity) { + CHECK_OK(ThreadPoolBuilder("log-alloc").set_max_threads(1).Build(&allocation_pool_)); + if (metric_entity_) { + metrics_.reset(new LogMetrics(metric_entity_)); + } +} + +Status Log::Init() { + boost::lock_guard write_lock(state_lock_); + CHECK_EQ(kLogInitialized, log_state_); + + // Init the index + log_index_.reset(new LogIndex(log_dir_)); + + // Reader for previous segments. 
+ RETURN_NOT_OK(LogReader::Open(fs_manager_, + log_index_, + tablet_id_, + metric_entity_.get(), + &reader_)); + + // The case where we are continuing an existing log. + // We must pick up where the previous WAL left off in terms of + // sequence numbers. + if (reader_->num_segments() != 0) { + VLOG(1) << "Using existing " << reader_->num_segments() + << " segments from path: " << fs_manager_->GetWalsRootDir(); + + vector > segments; + RETURN_NOT_OK(reader_->GetSegmentsSnapshot(&segments)); + active_segment_sequence_number_ = segments.back()->header().sequence_number(); + } + + if (force_sync_all_) { + KLOG_FIRST_N(INFO, 1) << "Log is configured to fsync() on all Append() calls"; + } else { + KLOG_FIRST_N(INFO, 1) << "Log is configured to *not* fsync() on all Append() calls"; + } + + // We always create a new segment when the log starts. + RETURN_NOT_OK(AsyncAllocateSegment()); + RETURN_NOT_OK(allocation_status_.Get()); + RETURN_NOT_OK(SwitchToAllocatedSegment()); + + RETURN_NOT_OK(append_thread_->Init()); + log_state_ = kLogWriting; + return Status::OK(); +} + +Status Log::AsyncAllocateSegment() { + boost::lock_guard lock_guard(allocation_lock_); + CHECK_EQ(allocation_state_, kAllocationNotStarted); + allocation_status_.Reset(); + allocation_state_ = kAllocationInProgress; + RETURN_NOT_OK(allocation_pool_->SubmitClosure( + Bind(&Log::SegmentAllocationTask, Unretained(this)))); + return Status::OK(); +} + +Status Log::CloseCurrentSegment() { + if (!footer_builder_.has_min_replicate_index()) { + VLOG(1) << "Writing a segment without any REPLICATE message. 
" + "Segment: " << active_segment_->path(); + } + VLOG(2) << "Segment footer for " << active_segment_->path() + << ": " << footer_builder_.ShortDebugString(); + + footer_builder_.set_close_timestamp_micros(GetCurrentTimeMicros()); + RETURN_NOT_OK(active_segment_->WriteFooterAndClose(footer_builder_)); + + return Status::OK(); +} + +Status Log::RollOver() { + SCOPED_LATENCY_METRIC(metrics_, roll_latency); + + // Check if any errors have occurred during allocation + RETURN_NOT_OK(allocation_status_.Get()); + + DCHECK_EQ(allocation_state(), kAllocationFinished); + + RETURN_NOT_OK(Sync()); + RETURN_NOT_OK(CloseCurrentSegment()); + + RETURN_NOT_OK(SwitchToAllocatedSegment()); + + LOG(INFO) << "Rolled over to a new segment: " << active_segment_->path(); + return Status::OK(); +} + +Status Log::Reserve(LogEntryTypePB type, + gscoped_ptr entry_batch, + LogEntryBatch** reserved_entry) { + TRACE_EVENT0("log", "Log::Reserve"); + DCHECK(reserved_entry != nullptr); + { + boost::shared_lock read_lock(state_lock_.get_lock()); + CHECK_EQ(kLogWriting, log_state_); + } + + // In DEBUG builds, verify that all of the entries in the batch match the specified type. + // In non-debug builds the foreach loop gets optimized out. + #ifndef NDEBUG + for (const LogEntryPB& entry : entry_batch->entry()) { + DCHECK_EQ(entry.type(), type) << "Bad batch: " << entry_batch->DebugString(); + } + #endif + + int num_ops = entry_batch->entry_size(); + gscoped_ptr new_entry_batch(new LogEntryBatch(type, entry_batch.Pass(), num_ops)); + new_entry_batch->MarkReserved(); + + if (PREDICT_FALSE(!entry_batch_queue_.BlockingPut(new_entry_batch.get()))) { + return kLogShutdownStatus; + } + + // Release the memory back to the caller: this will be freed when + // the entry is removed from the queue. + // + // TODO (perf) Use a ring buffer instead of a blocking queue and set + // 'reserved_entry' to a pre-allocated slot in the buffer. 
+ *reserved_entry = new_entry_batch.release(); + return Status::OK(); +} + +Status Log::AsyncAppend(LogEntryBatch* entry_batch, const StatusCallback& callback) { + TRACE_EVENT0("log", "Log::AsyncAppend"); + { + boost::shared_lock read_lock(state_lock_.get_lock()); + CHECK_EQ(kLogWriting, log_state_); + } + + RETURN_NOT_OK(entry_batch->Serialize()); + entry_batch->set_callback(callback); + TRACE("Serialized $0 byte log entry", entry_batch->total_size_bytes()); + TRACE_EVENT_FLOW_BEGIN0("log", "Batch", entry_batch); + entry_batch->MarkReady(); + + return Status::OK(); +} + +Status Log::AsyncAppendReplicates(const vector& msgs, + const StatusCallback& callback) { + gscoped_ptr batch; + CreateBatchFromAllocatedOperations(msgs, &batch); + + LogEntryBatch* reserved_entry_batch; + RETURN_NOT_OK(Reserve(REPLICATE, batch.Pass(), &reserved_entry_batch)); + // If we're able to reserve set the vector of replicate scoped ptrs in + // the LogEntryBatch. This will make sure there's a reference for each + // replicate while we're appending. 
+ reserved_entry_batch->SetReplicates(msgs); + + RETURN_NOT_OK(AsyncAppend(reserved_entry_batch, callback)); + return Status::OK(); +} + +Status Log::AsyncAppendCommit(gscoped_ptr commit_msg, + const StatusCallback& callback) { + MAYBE_FAULT(FLAGS_fault_crash_before_append_commit); + + gscoped_ptr batch(new LogEntryBatchPB); + LogEntryPB* entry = batch->add_entry(); + entry->set_type(COMMIT); + entry->set_allocated_commit(commit_msg.release()); + + LogEntryBatch* reserved_entry_batch; + RETURN_NOT_OK(Reserve(COMMIT, batch.Pass(), &reserved_entry_batch)); + + RETURN_NOT_OK(AsyncAppend(reserved_entry_batch, callback)); + return Status::OK(); +} + +Status Log::DoAppend(LogEntryBatch* entry_batch, bool caller_owns_operation) { + size_t num_entries = entry_batch->count(); + DCHECK_GT(num_entries, 0) << "Cannot call DoAppend() with zero entries reserved"; + + Slice entry_batch_data = entry_batch->data(); + uint32_t entry_batch_bytes = entry_batch->total_size_bytes(); + // If there is no data to write return OK. + if (PREDICT_FALSE(entry_batch_bytes == 0)) { + return Status::OK(); + } + + // We keep track of the last-written OpId here. + // This is needed to initialize Consensus on startup. + if (entry_batch->type_ == REPLICATE) { + // TODO Probably remove the code below as it looks suspicious: Tablet peer uses this + // as 'safe' anchor as it believes it in the log, when it actually isn't, i.e. this + // is not the last durable operation. Either move this to tablet peer (since we're + // using in flights anyway no need to scan for ids here) or actually delay doing this + // until fsync() has been done. See KUDU-527. 
+ boost::lock_guard write_lock(last_entry_op_id_lock_); + last_entry_op_id_.CopyFrom(entry_batch->MaxReplicateOpId()); + } + + // if the size of this entry overflows the current segment, get a new one + if (allocation_state() == kAllocationNotStarted) { + if ((active_segment_->Size() + entry_batch_bytes + 4) > max_segment_size_) { + LOG(INFO) << "Max segment size reached. Starting new segment allocation. "; + RETURN_NOT_OK(AsyncAllocateSegment()); + if (!options_.async_preallocate_segments) { + LOG_SLOW_EXECUTION(WARNING, 50, "Log roll took a long time") { + RETURN_NOT_OK(RollOver()); + } + } + } + } else if (allocation_state() == kAllocationFinished) { + LOG_SLOW_EXECUTION(WARNING, 50, "Log roll took a long time") { + RETURN_NOT_OK(RollOver()); + } + } else { + VLOG(1) << "Segment allocation already in progress..."; + } + + int64_t start_offset = active_segment_->written_offset(); + + LOG_SLOW_EXECUTION(WARNING, 50, "Append to log took a long time") { + SCOPED_LATENCY_METRIC(metrics_, append_latency); + SCOPED_WATCH_STACK(500); + + RETURN_NOT_OK(active_segment_->WriteEntryBatch(entry_batch_data)); + + // Update the reader on how far it can read the active segment. + reader_->UpdateLastSegmentOffset(active_segment_->written_offset()); + + if (log_hooks_) { + RETURN_NOT_OK_PREPEND(log_hooks_->PostAppend(), "PostAppend hook failed"); + } + } + + if (metrics_) { + metrics_->bytes_logged->IncrementBy(entry_batch_bytes); + } + + CHECK_OK(UpdateIndexForBatch(*entry_batch, start_offset)); + UpdateFooterForBatch(entry_batch); + + // For REPLICATE batches, we expect the caller to free the actual entries if + // caller_owns_operation is set. 
+ if (entry_batch->type_ == REPLICATE && caller_owns_operation) { + for (int i = 0; i < entry_batch->entry_batch_pb_->entry_size(); i++) { + LogEntryPB* entry_pb = entry_batch->entry_batch_pb_->mutable_entry(i); + entry_pb->release_replicate(); + } + } + + return Status::OK(); +} + +Status Log::UpdateIndexForBatch(const LogEntryBatch& batch, + int64_t start_offset) { + if (batch.type_ != REPLICATE) { + return Status::OK(); + } + + for (const LogEntryPB& entry_pb : batch.entry_batch_pb_->entry()) { + LogIndexEntry index_entry; + + index_entry.op_id = entry_pb.replicate().id(); + index_entry.segment_sequence_number = active_segment_sequence_number_; + index_entry.offset_in_segment = start_offset; + RETURN_NOT_OK(log_index_->AddEntry(index_entry)); + } + return Status::OK(); +} + +void Log::UpdateFooterForBatch(LogEntryBatch* batch) { + footer_builder_.set_num_entries(footer_builder_.num_entries() + batch->count()); + + // We keep track of the last-written OpId here. + // This is needed to initialize Consensus on startup. + // We also retrieve the opid of the first operation in the batch so that, if + // we roll over to a new segment, we set the first operation in the footer + // immediately. + if (batch->type_ == REPLICATE) { + // Update the index bounds for the current segment. 
+ for (const LogEntryPB& entry_pb : batch->entry_batch_pb_->entry()) { + int64_t index = entry_pb.replicate().id().index(); + if (!footer_builder_.has_min_replicate_index() || + index < footer_builder_.min_replicate_index()) { + footer_builder_.set_min_replicate_index(index); + } + if (!footer_builder_.has_max_replicate_index() || + index > footer_builder_.max_replicate_index()) { + footer_builder_.set_max_replicate_index(index); + } + } + } +} + +Status Log::AllocateSegmentAndRollOver() { + RETURN_NOT_OK(AsyncAllocateSegment()); + return RollOver(); +} + +FsManager* Log::GetFsManager() { + return fs_manager_; +} + +Status Log::Sync() { + TRACE_EVENT0("log", "Sync"); + SCOPED_LATENCY_METRIC(metrics_, sync_latency); + + if (PREDICT_FALSE(FLAGS_log_inject_latency && !sync_disabled_)) { + Random r(GetCurrentTimeMicros()); + int sleep_ms = r.Normal(FLAGS_log_inject_latency_ms_mean, + FLAGS_log_inject_latency_ms_stddev); + if (sleep_ms > 0) { + LOG(INFO) << "T " << tablet_id_ << ": Injecting " + << sleep_ms << "ms of latency in Log::Sync()"; + SleepFor(MonoDelta::FromMilliseconds(sleep_ms)); + } + } + + if (force_sync_all_ && !sync_disabled_) { + LOG_SLOW_EXECUTION(WARNING, 50, "Fsync log took a long time") { + RETURN_NOT_OK(active_segment_->Sync()); + + if (log_hooks_) { + RETURN_NOT_OK_PREPEND(log_hooks_->PostSyncIfFsyncEnabled(), + "PostSyncIfFsyncEnabled hook failed"); + } + } + } + + if (log_hooks_) { + RETURN_NOT_OK_PREPEND(log_hooks_->PostSync(), "PostSync hook failed"); + } + return Status::OK(); +} + +Status Log::GetSegmentsToGCUnlocked(int64_t min_op_idx, SegmentSequence* segments_to_gc) const { + // Find the prefix of segments in the segment sequence that is guaranteed not to include + // 'min_op_idx'. 
+ RETURN_NOT_OK(reader_->GetSegmentPrefixNotIncluding(min_op_idx, segments_to_gc)); + + int max_to_delete = std::max(reader_->num_segments() - FLAGS_log_min_segments_to_retain, 0); + if (segments_to_gc->size() > max_to_delete) { + VLOG(2) << "GCing " << segments_to_gc->size() << " in " << log_dir_ + << " would not leave enough remaining segments to satisfy minimum " + << "retention requirement. Only considering " + << max_to_delete << "/" << reader_->num_segments(); + segments_to_gc->resize(max_to_delete); + } else if (segments_to_gc->size() < max_to_delete) { + int extra_segments = max_to_delete - segments_to_gc->size(); + VLOG(2) << tablet_id_ << " has too many log segments, need to GC " + << extra_segments << " more. "; + } + + // Don't GC segments that are newer than the configured time-based retention. + int64_t now = GetCurrentTimeMicros(); + for (int i = 0; i < segments_to_gc->size(); i++) { + const scoped_refptr& segment = (*segments_to_gc)[i]; + + // Segments here will always have a footer, since we don't return the in-progress segment + // up above. However, segments written by older Kudu builds may not have the timestamp + // info. In that case, we're allowed to GC them. + if (!segment->footer().has_close_timestamp_micros()) continue; + + int64_t age_seconds = (now - segment->footer().close_timestamp_micros()) / 1000000; + if (age_seconds < FLAGS_log_min_seconds_to_retain) { + VLOG(2) << "Segment " << segment->path() << " is only " << age_seconds << "s old: " + << "cannot GC it yet due to configured time-based retention policy."; + // Truncate the list of segments to GC here -- if this one is too new, then + // all later ones are also too new. 
+ segments_to_gc->resize(i); + break; + } + } + + return Status::OK(); +} + +Status Log::Append(LogEntryPB* phys_entry) { + gscoped_ptr entry_batch_pb(new LogEntryBatchPB); + entry_batch_pb->mutable_entry()->AddAllocated(phys_entry); + LogEntryBatch entry_batch(phys_entry->type(), entry_batch_pb.Pass(), 1); + entry_batch.state_ = LogEntryBatch::kEntryReserved; + Status s = entry_batch.Serialize(); + if (s.ok()) { + entry_batch.state_ = LogEntryBatch::kEntryReady; + s = DoAppend(&entry_batch, false); + if (s.ok()) { + s = Sync(); + } + } + entry_batch.entry_batch_pb_->mutable_entry()->ExtractSubrange(0, 1, nullptr); + return s; +} + +Status Log::WaitUntilAllFlushed() { + // In order to make sure we empty the queue we need to use + // the async api. + gscoped_ptr entry_batch(new LogEntryBatchPB); + entry_batch->add_entry()->set_type(log::FLUSH_MARKER); + LogEntryBatch* reserved_entry_batch; + RETURN_NOT_OK(Reserve(FLUSH_MARKER, entry_batch.Pass(), &reserved_entry_batch)); + Synchronizer s; + RETURN_NOT_OK(AsyncAppend(reserved_entry_batch, s.AsStatusCallback())); + return s.Wait(); +} + +void Log::GetLatestEntryOpId(consensus::OpId* op_id) const { + boost::shared_lock read_lock(last_entry_op_id_lock_); + if (last_entry_op_id_.IsInitialized()) { + DCHECK_NOTNULL(op_id)->CopyFrom(last_entry_op_id_); + } else { + *op_id = consensus::MinimumOpId(); + } +} + +Status Log::GC(int64_t min_op_idx, int32_t* num_gced) { + CHECK_GE(min_op_idx, 0); + + VLOG(1) << "Running Log GC on " << log_dir_ << ": retaining ops >= " << min_op_idx; + VLOG_TIMING(1, "Log GC") { + SegmentSequence segments_to_delete; + + { + boost::lock_guard l(state_lock_); + CHECK_EQ(kLogWriting, log_state_); + + GetSegmentsToGCUnlocked(min_op_idx, &segments_to_delete); + + if (segments_to_delete.size() == 0) { + VLOG(1) << "No segments to delete."; + *num_gced = 0; + return Status::OK(); + } + // Trim the prefix of segments from the reader so that they are no longer + // referenced by the log. 
+ RETURN_NOT_OK(reader_->TrimSegmentsUpToAndIncluding( + segments_to_delete[segments_to_delete.size() - 1]->header().sequence_number())); + } + + // Now that they are no longer referenced by the Log, delete the files. + *num_gced = 0; + for (const scoped_refptr& segment : segments_to_delete) { + LOG(INFO) << "Deleting log segment in path: " << segment->path() + << " (GCed ops < " << min_op_idx << ")"; + RETURN_NOT_OK(fs_manager_->env()->DeleteFile(segment->path())); + (*num_gced)++; + } + + // Determine the minimum remaining replicate index in order to properly GC + // the index chunks. + int64_t min_remaining_op_idx = reader_->GetMinReplicateIndex(); + if (min_remaining_op_idx > 0) { + log_index_->GC(min_remaining_op_idx); + } + } + return Status::OK(); +} + +void Log::GetGCableDataSize(int64_t min_op_idx, int64_t* total_size) const { + CHECK_GE(min_op_idx, 0); + SegmentSequence segments_to_delete; + *total_size = 0; + { + boost::shared_lock read_lock(state_lock_.get_lock()); + CHECK_EQ(kLogWriting, log_state_); + Status s = GetSegmentsToGCUnlocked(min_op_idx, &segments_to_delete); + + if (!s.ok() || segments_to_delete.size() == 0) { + return; + } + } + for (const scoped_refptr& segment : segments_to_delete) { + *total_size += segment->file_size(); + } +} + +void Log::GetMaxIndexesToSegmentSizeMap(int64_t min_op_idx, + std::map* max_idx_to_segment_size) + const { + boost::shared_lock read_lock(state_lock_.get_lock()); + CHECK_EQ(kLogWriting, log_state_); + // We want to retain segments so we're only asking the extra ones. 
+ int segments_count = std::max(reader_->num_segments() - FLAGS_log_min_segments_to_retain, 0); + if (segments_count == 0) { + return; + } + + int64_t now = GetCurrentTimeMicros(); + int64_t max_close_time_us = now - (FLAGS_log_min_seconds_to_retain * 1000000); + reader_->GetMaxIndexesToSegmentSizeMap(min_op_idx, segments_count, max_close_time_us, + max_idx_to_segment_size); +} + +LogReader* Log::GetLogReader() const { + return reader_.get(); +} + +void Log::SetSchemaForNextLogSegment(const Schema& schema, + uint32_t version) { + boost::lock_guard l(schema_lock_); + schema_ = schema; + schema_version_ = version; +} + +Status Log::Close() { + allocation_pool_->Shutdown(); + append_thread_->Shutdown(); + + boost::lock_guard l(state_lock_); + switch (log_state_) { + case kLogWriting: + if (log_hooks_) { + RETURN_NOT_OK_PREPEND(log_hooks_->PreClose(), + "PreClose hook failed"); + } + RETURN_NOT_OK(Sync()); + RETURN_NOT_OK(CloseCurrentSegment()); + RETURN_NOT_OK(ReplaceSegmentInReaderUnlocked()); + log_state_ = kLogClosed; + VLOG(1) << "Log closed"; + + // Release FDs held by these objects. 
+ log_index_.reset(); + reader_.reset(); + + if (log_hooks_) { + RETURN_NOT_OK_PREPEND(log_hooks_->PostClose(), + "PostClose hook failed"); + } + return Status::OK(); + + case kLogClosed: + VLOG(1) << "Log already closed"; + return Status::OK(); + + default: + return Status::IllegalState(Substitute("Bad state for Close() $0", log_state_)); + } +} + +Status Log::DeleteOnDiskData(FsManager* fs_manager, const string& tablet_id) { + string wal_dir = fs_manager->GetTabletWalDir(tablet_id); + Env* env = fs_manager->env(); + if (!env->FileExists(wal_dir)) { + return Status::OK(); + } + RETURN_NOT_OK_PREPEND(env->DeleteRecursively(wal_dir), + "Unable to recursively delete WAL dir for tablet " + tablet_id); + return Status::OK(); +} + +Status Log::PreAllocateNewSegment() { + TRACE_EVENT1("log", "PreAllocateNewSegment", "file", next_segment_path_); + CHECK_EQ(allocation_state(), kAllocationInProgress); + + WritableFileOptions opts; + opts.sync_on_close = force_sync_all_; + RETURN_NOT_OK(CreatePlaceholderSegment(opts, &next_segment_path_, &next_segment_file_)); + + if (options_.preallocate_segments) { + TRACE("Preallocating $0 byte segment in $1", max_segment_size_, next_segment_path_); + // TODO (perf) zero the new segments -- this could result in + // additional performance improvements. + RETURN_NOT_OK(next_segment_file_->PreAllocate(max_segment_size_)); + } + + { + boost::lock_guard lock_guard(allocation_lock_); + allocation_state_ = kAllocationFinished; + } + return Status::OK(); +} + +Status Log::SwitchToAllocatedSegment() { + CHECK_EQ(allocation_state(), kAllocationFinished); + + // Increment "next" log segment seqno. + active_segment_sequence_number_++; + + string new_segment_path = fs_manager_->GetWalSegmentFileName(tablet_id_, + active_segment_sequence_number_); + + RETURN_NOT_OK(fs_manager_->env()->RenameFile(next_segment_path_, new_segment_path)); + if (force_sync_all_) { + RETURN_NOT_OK(fs_manager_->env()->SyncDir(log_dir_)); + } + + // Create a new segment. 
+ gscoped_ptr new_segment( + new WritableLogSegment(new_segment_path, next_segment_file_)); + + // Set up the new header and footer. + LogSegmentHeaderPB header; + header.set_major_version(kLogMajorVersion); + header.set_minor_version(kLogMinorVersion); + header.set_sequence_number(active_segment_sequence_number_); + header.set_tablet_id(tablet_id_); + + // Set up the new footer. This will be maintained as the segment is written. + footer_builder_.Clear(); + footer_builder_.set_num_entries(0); + + + // Set the new segment's schema. + { + boost::shared_lock l(schema_lock_); + RETURN_NOT_OK(SchemaToPB(schema_, header.mutable_schema())); + header.set_schema_version(schema_version_); + } + + RETURN_NOT_OK(new_segment->WriteHeaderAndOpen(header)); + + // Transform the currently-active segment into a readable one, since we + // need to be able to replay the segments for other peers. + { + if (active_segment_.get() != nullptr) { + boost::lock_guard l(state_lock_); + CHECK_OK(ReplaceSegmentInReaderUnlocked()); + } + } + + // Open the segment we just created in readable form and add it to the reader. + gscoped_ptr readable_file; + + RandomAccessFileOptions opts; + RETURN_NOT_OK(fs_manager_->env()->NewRandomAccessFile(opts, new_segment_path, &readable_file)); + scoped_refptr readable_segment( + new ReadableLogSegment(new_segment_path, + shared_ptr(readable_file.release()))); + RETURN_NOT_OK(readable_segment->Init(header, new_segment->first_entry_offset())); + RETURN_NOT_OK(reader_->AppendEmptySegment(readable_segment)); + + // Now set 'active_segment_' to the new segment. + active_segment_.reset(new_segment.release()); + + allocation_state_ = kAllocationNotStarted; + + return Status::OK(); +} + +Status Log::ReplaceSegmentInReaderUnlocked() { + // We should never switch to a new segment if we wrote nothing to the old one. 
+ CHECK(active_segment_->IsClosed()); + shared_ptr readable_file; + RETURN_NOT_OK(OpenFileForRandom(fs_manager_->env(), active_segment_->path(), &readable_file)); + scoped_refptr readable_segment( + new ReadableLogSegment(active_segment_->path(), + readable_file)); + // Note: active_segment_->header() will only contain an initialized PB if we + // wrote the header out. + RETURN_NOT_OK(readable_segment->Init(active_segment_->header(), + active_segment_->footer(), + active_segment_->first_entry_offset())); + + return reader_->ReplaceLastSegment(readable_segment); +} + +Status Log::CreatePlaceholderSegment(const WritableFileOptions& opts, + string* result_path, + shared_ptr* out) { + string path_tmpl = JoinPathSegments(log_dir_, kSegmentPlaceholderFileTemplate); + VLOG(2) << "Creating temp. file for place holder segment, template: " << path_tmpl; + gscoped_ptr segment_file; + RETURN_NOT_OK(fs_manager_->env()->NewTempWritableFile(opts, + path_tmpl, + result_path, + &segment_file)); + VLOG(1) << "Created next WAL segment, placeholder path: " << *result_path; + out->reset(segment_file.release()); + return Status::OK(); +} + +Log::~Log() { + WARN_NOT_OK(Close(), "Error closing log"); +} + +LogEntryBatch::LogEntryBatch(LogEntryTypePB type, + gscoped_ptr entry_batch_pb, size_t count) + : type_(type), + entry_batch_pb_(entry_batch_pb.Pass()), + total_size_bytes_( + PREDICT_FALSE(count == 1 && entry_batch_pb_->entry(0).type() == FLUSH_MARKER) ? + 0 : entry_batch_pb_->ByteSize()), + count_(count), + state_(kEntryInitialized) { +} + +LogEntryBatch::~LogEntryBatch() { +} + +void LogEntryBatch::MarkReserved() { + DCHECK_EQ(state_, kEntryInitialized); + ready_lock_.Lock(); + state_ = kEntryReserved; +} + +Status LogEntryBatch::Serialize() { + DCHECK_EQ(state_, kEntryReserved); + buffer_.clear(); + // FLUSH_MARKER LogEntries are markers and are not serialized. 
+ if (PREDICT_FALSE(count() == 1 && entry_batch_pb_->entry(0).type() == FLUSH_MARKER)) { + state_ = kEntrySerialized; + return Status::OK(); + } + buffer_.reserve(total_size_bytes_); + + if (!pb_util::AppendToString(*entry_batch_pb_, &buffer_)) { + return Status::IOError(Substitute("unable to serialize the entry batch, contents: $1", + entry_batch_pb_->DebugString())); + } + + state_ = kEntrySerialized; + return Status::OK(); +} + +void LogEntryBatch::MarkReady() { + DCHECK_EQ(state_, kEntrySerialized); + state_ = kEntryReady; + ready_lock_.Unlock(); +} + +void LogEntryBatch::WaitForReady() { + ready_lock_.Lock(); + DCHECK_EQ(state_, kEntryReady); + ready_lock_.Unlock(); +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log.h b/src/kudu/consensus/log.h new file mode 100644 index 000000000000..642319b7218b --- /dev/null +++ b/src/kudu/consensus/log.h @@ -0,0 +1,552 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_CONSENSUS_LOG_H_ +#define KUDU_CONSENSUS_LOG_H_ + +#include +#include +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/spinlock.h" +#include "kudu/util/async_util.h" +#include "kudu/util/blocking_queue.h" +#include "kudu/util/locks.h" +#include "kudu/util/promise.h" +#include "kudu/util/status.h" + +namespace kudu { + +class FsManager; +class MetricEntity; +class ThreadPool; + +namespace log { + +struct LogEntryBatchLogicalSize; +struct LogMetrics; +class LogEntryBatch; +class LogIndex; +class LogReader; + +typedef BlockingQueue LogEntryBatchQueue; + +// Log interface, inspired by Raft's (logcabin) Log. Provides durability to +// Kudu as a normal Write Ahead Log and also plays the role of persistent +// storage for the consensus state machine. +// +// Note: This class is not thread safe, the caller is expected to synchronize +// Log::Reserve() and Log::Append() calls. +// +// Log uses group commit to improve write throughput and latency +// without compromising ordering and durability guarantees. +// +// To add operations to the log, the caller must obtain the lock and +// call Reserve() with the collection of operations to be added. Then, +// the caller may release the lock and call AsyncAppend(). Reserve() +// reserves a slot on a queue for the log entry; AsyncAppend() +// indicates that the entry in the slot is safe to write to disk and +// adds a callback that will be invoked once the entry is written and +// synchronized to disk. +// +// For sample usage see local_consensus.cc and mt-log-test.cc +// +// Methods on this class are _not_ thread-safe and must be externally +// synchronized unless otherwise noted. 
+// +// Note: The Log needs to be Close()d before any log-writing class is +// destroyed, otherwise the Log might hold references to these classes +// to execute the callbacks after each write. +class Log : public RefCountedThreadSafe { + public: + class LogFaultHooks; + + static const Status kLogShutdownStatus; + static const uint64_t kInitialLogSegmentSequenceNumber; + + // Opens or continues a log and sets 'log' to the newly built Log. + // After a successful Open() the Log is ready to receive entries. + static Status Open(const LogOptions &options, + FsManager *fs_manager, + const std::string& tablet_id, + const Schema& schema, + uint32_t schema_version, + const scoped_refptr& metric_entity, + scoped_refptr *log); + + ~Log(); + + // Reserves a spot in the log's queue for 'entry_batch'. + // + // 'reserved_entry' is initialized by this method and any resources + // associated with it will be released in AsyncAppend(). In order + // to ensure correct ordering of operations across multiple threads, + // calls to this method must be externally synchronized. + // + // WARNING: the caller _must_ call AsyncAppend() or else the log + // will "stall" and will never be able to make forward progress. + Status Reserve(LogEntryTypePB type, + gscoped_ptr entry_batch, + LogEntryBatch** reserved_entry); + + // Asynchronously appends 'entry' to the log. Once the append + // completes and is synced, 'callback' will be invoked. + Status AsyncAppend(LogEntryBatch* entry, + const StatusCallback& callback); + + // Synchronously append a new entry to the log. + // Log does not take ownership of the passed 'entry'. + // TODO get rid of this method, transition to the asynchronous API + Status Append(LogEntryPB* entry); + + // Append the given set of replicate messages, asynchronously. + // This requires that the replicates have already been assigned OpIds. 
+  Status AsyncAppendReplicates(const vector& replicates,
+                               const StatusCallback& callback);
+
+  // Append the given commit message, asynchronously.
+  //
+  // Returns a bad status if the log is already shut down.
+  Status AsyncAppendCommit(gscoped_ptr commit_msg,
+                           const StatusCallback& callback);
+
+
+  // Blocks the current thread until all the entries in the log queue
+  // are flushed and fsynced (if fsync of log entries is enabled).
+  Status WaitUntilAllFlushed();
+
+  // Kick off an asynchronous task that pre-allocates a new
+  // log-segment, setting 'allocation_status_'. To wait for the
+  // result of the task, use allocation_status_.Get().
+  Status AsyncAllocateSegment();
+
+  // The closure submitted to allocation_pool_ to allocate a new segment.
+  void SegmentAllocationTask();
+
+  // Syncs all state and closes the log.
+  Status Close();
+
+  // Delete all WAL data from the log associated with this tablet.
+  // REQUIRES: The Log must be closed.
+  static Status DeleteOnDiskData(FsManager* fs_manager, const std::string& tablet_id);
+
+  // Returns a reader that is able to read through the previous
+  // segments. The reader pointer is guaranteed to be live as long
+  // as the log itself is initialized and live.
+  LogReader* GetLogReader() const;
+
+  // Test-only: override the maximum segment size, e.g. to force frequent
+  // segment rollover in tests.
+  void SetMaxSegmentSizeForTests(uint64_t max_segment_size) {
+    max_segment_size_ = max_segment_size;
+  }
+
+  // Test-only: make segment pre-allocation happen synchronously instead of
+  // on the background allocation task.
+  void DisableAsyncAllocationForTests() {
+    options_.async_preallocate_segments = false;
+  }
+
+  // Suppress syncing on append until re-enabled (used to disable fsync
+  // during bootstrap; see 'sync_disabled_'). Pair with
+  // ReEnableSyncIfRequired().
+  void DisableSync() {
+    sync_disabled_ = true;
+  }
+
+  // If we previously called DisableSync(), restore the default behavior and
+  // then call Sync(), which will perform the actual syncing if required.
+  Status ReEnableSyncIfRequired() {
+    sync_disabled_ = false;
+    return Sync();
+  }
+
+  // Get ID of tablet.
+  const std::string& tablet_id() const {
+    return tablet_id_;
+  }
+
+  // Gets the last-used OpId written to the log.
+ // If no entry has ever been written to the log, returns (0, 0) + void GetLatestEntryOpId(consensus::OpId* op_id) const; + + // Runs the garbage collector on the set of previous segments. Segments that + // only refer to in-mem state that has been flushed are candidates for + // garbage collection. + // + // 'min_op_idx' is the minimum operation index required to be retained. + // If successful, num_gced is set to the number of deleted log segments. + // + // This method is thread-safe. + Status GC(int64_t min_op_idx, int* num_gced); + + // Computes the amount of bytes that would have been GC'd if Log::GC had been called. + void GetGCableDataSize(int64_t min_op_idx, int64_t* total_size) const; + + // Returns a map of log index -> segment size, of all the segments that currently cannot be GCed + // because in-memory structures have anchors in them. + // + // 'min_op_idx' is the minimum operation index to start looking from, meaning that we skip the + // segment that contains it and then start recording segments. + void GetMaxIndexesToSegmentSizeMap(int64_t min_op_idx, + std::map* max_idx_to_segment_size) const; + + // Returns the file system location of the currently active WAL segment. + const std::string& ActiveSegmentPathForTests() const { + return active_segment_->path(); + } + + // Forces the Log to allocate a new segment and roll over. + // This can be used to make sure all entries appended up to this point are + // available in closed, readable segments. + Status AllocateSegmentAndRollOver(); + + // Returns this Log's FsManager. + FsManager* GetFsManager(); + + void SetLogFaultHooksForTests(const std::shared_ptr &hooks) { + log_hooks_ = hooks; + } + + // Set the schema for the _next_ log segment. + // + // This method is thread-safe. 
+ void SetSchemaForNextLogSegment(const Schema& schema, uint32_t version); + + private: + friend class LogTest; + friend class LogTestBase; + FRIEND_TEST(LogTest, TestMultipleEntriesInABatch); + FRIEND_TEST(LogTest, TestReadLogWithReplacedReplicates); + FRIEND_TEST(LogTest, TestWriteAndReadToAndFromInProgressSegment); + + class AppendThread; + + // Log state. + enum LogState { + kLogInitialized, + kLogWriting, + kLogClosed + }; + + // State of segment (pre-) allocation. + enum SegmentAllocationState { + kAllocationNotStarted, // No segment allocation requested + kAllocationInProgress, // Next segment allocation started + kAllocationFinished // Next segment ready + }; + + Log(LogOptions options, FsManager* fs_manager, std::string log_path, + std::string tablet_id, const Schema& schema, uint32_t schema_version, + const scoped_refptr& metric_entity); + + // Initializes a new one or continues an existing log. + Status Init(); + + // Make segments roll over. + Status RollOver(); + + // Writes the footer and closes the current segment. + Status CloseCurrentSegment(); + + // Sets 'out' to a newly created temporary file (see + // Env::NewTempWritableFile()) for a placeholder segment. Sets + // 'result_path' to the fully qualified path to the unique filename + // created for the segment. + Status CreatePlaceholderSegment(const WritableFileOptions& opts, + std::string* result_path, + std::shared_ptr* out); + + // Creates a new WAL segment on disk, writes the next_segment_header_ to + // disk as the header, and sets active_segment_ to point to this new segment. + Status SwitchToAllocatedSegment(); + + // Preallocates the space for a new segment. + Status PreAllocateNewSegment(); + + // Writes serialized contents of 'entry' to the log. Called inside + // AppenderThread. If 'caller_owns_operation' is true, then the + // 'operation' field of the entry will be released after the entry + // is appended. 
+ // TODO once Append() is removed, 'caller_owns_operation' and + // associated logic will no longer be needed. + Status DoAppend(LogEntryBatch* entry, bool caller_owns_operation = true); + + // Update footer_builder_ to reflect the log indexes seen in 'batch'. + void UpdateFooterForBatch(LogEntryBatch* batch); + + // Update the LogIndex to include entries for the replicate messages found in + // 'batch'. The index entry points to the offset 'start_offset' in the current + // log segment. + Status UpdateIndexForBatch(const LogEntryBatch& batch, + int64_t start_offset); + + // Replaces the last "empty" segment in 'log_reader_', i.e. the one currently + // being written to, by the same segment once properly closed. + Status ReplaceSegmentInReaderUnlocked(); + + Status Sync(); + + // Helper method to get the segment sequence to GC based on the provided min_op_idx. + Status GetSegmentsToGCUnlocked(int64_t min_op_idx, SegmentSequence* segments_to_gc) const; + + LogEntryBatchQueue* entry_queue() { + return &entry_batch_queue_; + } + + const SegmentAllocationState allocation_state() { + boost::shared_lock shared_lock(allocation_lock_); + return allocation_state_; + } + + LogOptions options_; + FsManager *fs_manager_; + std::string log_dir_; + + // The ID of the tablet this log is dedicated to. + std::string tablet_id_; + + // Lock to protect modifications to schema_ and schema_version_. + mutable rw_spinlock schema_lock_; + + // The current schema of the tablet this log is dedicated to. + Schema schema_; + // The schema version + uint32_t schema_version_; + + // The currently active segment being written. + gscoped_ptr active_segment_; + + // The current (active) segment sequence number. + uint64_t active_segment_sequence_number_; + + // The writable file for the next allocated segment + std::shared_ptr next_segment_file_; + + // The path for the next allocated segment. 
+ std::string next_segment_path_; + + // Lock to protect mutations to log_state_ and other shared state variables. + mutable percpu_rwlock state_lock_; + + LogState log_state_; + + // A reader for the previous segments that were not yet GC'd. + gscoped_ptr reader_; + + // Index which translates between operation indexes and the position + // of the operation in the log. + scoped_refptr log_index_; + + // Lock to protect last_entry_op_id_, which is constantly written but + // read occasionally by things like consensus and log GC. + mutable rw_spinlock last_entry_op_id_lock_; + + // The last known OpId for a REPLICATE message appended to this log + // (any segment). NOTE: this op is not necessarily durable. + consensus::OpId last_entry_op_id_; + + // A footer being prepared for the current segment. + // When the segment is closed, it will be written. + LogSegmentFooterPB footer_builder_; + + // The maximum segment size, in bytes. + uint64_t max_segment_size_; + + // The queue used to communicate between the thread calling + // Reserve() and the Log Appender thread + LogEntryBatchQueue entry_batch_queue_; + + // Thread writing to the log + gscoped_ptr append_thread_; + + gscoped_ptr allocation_pool_; + + // If true, sync on all appends. + bool force_sync_all_; + + // If true, ignore the 'force_sync_all_' flag above. + // This is used to disable fsync during bootstrap. + bool sync_disabled_; + + // The status of the most recent log-allocation action. + Promise allocation_status_; + + // Read-write lock to protect 'allocation_state_'. + mutable boost::shared_mutex allocation_lock_; + SegmentAllocationState allocation_state_; + + scoped_refptr metric_entity_; + gscoped_ptr metrics_; + + std::shared_ptr log_hooks_; + + DISALLOW_COPY_AND_ASSIGN(Log); +}; + +// This class represents a batch of operations to be written and +// synced to the log. It is opaque to the user and is managed by the +// Log class. 
+// A single batch must have only one type of entries in it (eg only +// REPLICATEs or only COMMITs). +class LogEntryBatch { + public: + ~LogEntryBatch(); + + private: + friend class Log; + friend struct LogEntryBatchLogicalSize; + friend class MultiThreadedLogTest; + + LogEntryBatch(LogEntryTypePB type, + gscoped_ptr entry_batch_pb, size_t count); + + // Serializes contents of the entry to an internal buffer. + Status Serialize(); + + // Sets the callback that will be invoked after the entry is + // appended and synced to disk + void set_callback(const StatusCallback& cb) { + callback_ = cb; + } + + // Returns the callback that will be invoked after the entry is + // appended and synced to disk. + const StatusCallback& callback() { + return callback_; + } + + bool failed_to_append() const { + return state_ == kEntryFailedToAppend; + } + + void set_failed_to_append() { + state_ = kEntryFailedToAppend; + } + + // Mark the entry as reserved, but not yet ready to write to the log. + void MarkReserved(); + + // Mark the entry as ready to write to log. + void MarkReady(); + + // Wait (currently, by spinning on ready_lock_) until ready. + void WaitForReady(); + + // Returns a Slice representing the serialized contents of the + // entry. + Slice data() const { + DCHECK_EQ(state_, kEntryReady); + return Slice(buffer_); + } + + size_t count() const { return count_; } + + // Returns the total size in bytes of the object. + size_t total_size_bytes() const { + return total_size_bytes_; + } + + // The highest OpId of a REPLICATE message in this batch. + // Requires that this be a REPLICATE batch. + consensus::OpId MaxReplicateOpId() const { + DCHECK_EQ(REPLICATE, type_); + int idx = entry_batch_pb_->entry_size() - 1; + DCHECK(entry_batch_pb_->entry(idx).replicate().IsInitialized()); + return entry_batch_pb_->entry(idx).replicate().id(); + } + + void SetReplicates(const vector& replicates) { + replicates_ = replicates; + } + + // The type of entries in this batch. 
+ const LogEntryTypePB type_; + + // Contents of the log entries that will be written to disk. + gscoped_ptr entry_batch_pb_; + + // Total size in bytes of all entries + const uint32_t total_size_bytes_; + + // Number of entries in 'entry_batch_pb_' + const size_t count_; + + // The vector of refcounted replicates. + // Used only when type is REPLICATE, this makes sure there's at + // least a reference to each replicate message until we're finished + // appending. + vector replicates_; + + // Callback to be invoked upon the entries being written and + // synced to disk. + StatusCallback callback_; + + // Used to coordinate the synchronizer thread and the caller + // thread: this lock starts out locked, and is unlocked by the + // caller thread (i.e., inside AppendThread()) once the entry is + // fully initialized (once the callback is set and data is + // serialized) + base::SpinLock ready_lock_; + + // Buffer to which 'phys_entries_' are serialized by call to + // 'Serialize()' + faststring buffer_; + + enum LogEntryState { + kEntryInitialized, + kEntryReserved, + kEntrySerialized, + kEntryReady, + kEntryFailedToAppend + }; + LogEntryState state_; + + DISALLOW_COPY_AND_ASSIGN(LogEntryBatch); +}; + +// Used by 'Log::queue_' to determine logical size of a LogEntryBatch. +struct LogEntryBatchLogicalSize { + static size_t logical_size(const LogEntryBatch* batch) { + return batch->total_size_bytes(); + } +}; + +class Log::LogFaultHooks { + public: + + // Executed immediately before returning from Log::Sync() at *ALL* + // times. + virtual Status PostSync() { return Status::OK(); } + + // Iff fsync is enabled, executed immediately after call to fsync. + virtual Status PostSyncIfFsyncEnabled() { return Status::OK(); } + + // Emulate a slow disk where the filesystem has decided to synchronously + // flush a full buffer. 
+ virtual Status PostAppend() { return Status::OK(); } + + virtual Status PreClose() { return Status::OK(); } + virtual Status PostClose() { return Status::OK(); } + + virtual ~LogFaultHooks() {} +}; + +} // namespace log +} // namespace kudu +#endif /* KUDU_CONSENSUS_LOG_H_ */ diff --git a/src/kudu/consensus/log.proto b/src/kudu/consensus/log.proto new file mode 100644 index 000000000000..20c38d6a9b9d --- /dev/null +++ b/src/kudu/consensus/log.proto @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.log; + +option java_package = "org.kududb.log"; + +import "kudu/common/common.proto"; +import "kudu/consensus/consensus.proto"; +import "kudu/consensus/metadata.proto"; + +// =========================================================================== +// Log Entries - Log specific messages + single node messages +// =========================================================================== + +// Types of log entries. +enum LogEntryTypePB { + UNKNOWN = 0; + REPLICATE = 1; + COMMIT = 2; + // Marker entry for dummy log messages. These will never end up in the log, + // just serve the purpose of making sure that all entries up to the FLUSH_MARKER + // entry are flushed. 
+ FLUSH_MARKER = 999; +}; + +// An entry in the WAL/state machine log. +message LogEntryPB { + required LogEntryTypePB type = 1; + optional consensus.ReplicateMsg replicate = 2; + optional consensus.CommitMsg commit = 3; +} + +// A batch of entries in the WAL. +message LogEntryBatchPB { + repeated LogEntryPB entry = 1; +} + +// A header for a log segment. +message LogSegmentHeaderPB { + // Log format major version. + required uint32 major_version = 1; + + // Log format minor version. + required uint32 minor_version = 2; + + // The ID of the tablet this WAL segment stores entries for. + required bytes tablet_id = 5; + + // The tablet-specific sequence number of this WAL segment. + required uint64 sequence_number = 6; + + // Schema used when appending entries to this log, and its version. + required SchemaPB schema = 7; + optional uint32 schema_version = 8; +} + +// A footer for a log segment. +// +// Log segment footers might not be present (e.g. if the server +// crashed) so they should contain no information that cannot +// be obtained by actually reading the entries in the log. +// +// We use the footer to keep sparse index entries mapping +// op_id->offset (right now we just keep the first entry with an +// id in the log) +message LogSegmentFooterPB { + // the total number of operations inside this segment + required int64 num_entries = 1; + + // The minimum and maximum index of a REPLICATE message in this segment. + // NOTE: because of log truncation, the min/max are not necessarily the first/last! + // For example, a log segment may contain entries "1.5, 1.6, 3.3, 2.3" due to multiple + // term changes. + // + // Because it's possible for a segment to have no replicate messages in it, + // these are optional. We set the default to -1 to avoid accidentally reading + // 0, which might look like a real log index. 
+ optional int64 min_replicate_index = 2 [ default = -1 ]; + optional int64 max_replicate_index = 3 [ default = -1 ]; + + // The time (microseconds since epoch) when this segment was closed. + // NOTE: since log segments are rewritten during bootstrap, these will all + // be reset to the time of the bootstrap on a newly-restarted server, rather + // than copied over from the old log segments. + optional int64 close_timestamp_micros = 4; +} diff --git a/src/kudu/consensus/log_anchor_registry-test.cc b/src/kudu/consensus/log_anchor_registry-test.cc new file mode 100644 index 000000000000..817fe0ba750e --- /dev/null +++ b/src/kudu/consensus/log_anchor_registry-test.cc @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/log_anchor_registry.h" + +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/test_util.h" + +using strings::Substitute; + +namespace kudu { +namespace log { + +class LogAnchorRegistryTest : public KuduTest { +}; + +TEST_F(LogAnchorRegistryTest, TestUpdateRegistration) { + const string test_name = CURRENT_TEST_NAME(); + scoped_refptr reg(new LogAnchorRegistry()); + + LogAnchor anchor; + const int64_t kInitialIndex = 12345; + + ASSERT_FALSE(anchor.is_registered); + ASSERT_FALSE(anchor.when_registered.Initialized()); + reg->Register(kInitialIndex, test_name, &anchor); + ASSERT_TRUE(anchor.is_registered); + ASSERT_TRUE(anchor.when_registered.Initialized()); + ASSERT_OK(reg->UpdateRegistration(kInitialIndex + 1, test_name, &anchor)); + ASSERT_OK(reg->Unregister(&anchor)); +} + +TEST_F(LogAnchorRegistryTest, TestDuplicateInserts) { + const string test_name = CURRENT_TEST_NAME(); + scoped_refptr reg(new LogAnchorRegistry()); + + // Register a bunch of anchors at log index 1. + const int num_anchors = 10; + LogAnchor anchors[num_anchors]; + for (auto& anchor : anchors) { + reg->Register(1, test_name, &anchor); + } + + // We should see index 1 as the earliest registered. + int64_t first_index = -1; + ASSERT_OK(reg->GetEarliestRegisteredLogIndex(&first_index)); + ASSERT_EQ(1, first_index); + + // Unregister them all. + for (auto& anchor : anchors) { + ASSERT_OK(reg->Unregister(&anchor)); + } + + // We should see none registered. + Status s = reg->GetEarliestRegisteredLogIndex(&first_index); + ASSERT_TRUE(s.IsNotFound()) + << Substitute("Should have empty OpId registry. Status: $0, anchor: $1, Num anchors: $2", + s.ToString(), first_index, reg->GetAnchorCountForTests()); + + ASSERT_EQ(0, reg->GetAnchorCountForTests()); +} + +// Ensure that the correct results are returned when anchors are added/removed +// out of order. 
+TEST_F(LogAnchorRegistryTest, TestOrderedEarliestOpId) { + scoped_refptr reg(new LogAnchorRegistry()); + const int kNumAnchors = 4; + const string test_name = CURRENT_TEST_NAME(); + + LogAnchor anchors[kNumAnchors]; + + reg->Register(2, test_name, &anchors[0]); + reg->Register(3, test_name, &anchors[1]); + reg->Register(1, test_name, &anchors[2]); + reg->Register(4, test_name, &anchors[3]); + + ASSERT_STR_CONTAINS(reg->DumpAnchorInfo(), "LogAnchor[index=1"); + + int64_t anchor_idx = -1; + ASSERT_OK(reg->GetEarliestRegisteredLogIndex(&anchor_idx)); + ASSERT_EQ(1, anchor_idx); + + ASSERT_OK(reg->Unregister(&anchors[2])); + ASSERT_OK(reg->GetEarliestRegisteredLogIndex(&anchor_idx)); + ASSERT_EQ(2, anchor_idx); + + ASSERT_OK(reg->Unregister(&anchors[3])); + ASSERT_OK(reg->GetEarliestRegisteredLogIndex(&anchor_idx)); + ASSERT_EQ(2, anchor_idx); + + ASSERT_OK(reg->Unregister(&anchors[0])); + ASSERT_OK(reg->GetEarliestRegisteredLogIndex(&anchor_idx)); + ASSERT_EQ(3, anchor_idx); + + ASSERT_OK(reg->Unregister(&anchors[1])); + Status s = reg->GetEarliestRegisteredLogIndex(&anchor_idx); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_anchor_registry.cc b/src/kudu/consensus/log_anchor_registry.cc new file mode 100644 index 000000000000..0ad9a7d204ef --- /dev/null +++ b/src/kudu/consensus/log_anchor_registry.cc @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/opid_util.h" + +#include +#include + +#include "kudu/gutil/strings/substitute.h" + +namespace kudu { +namespace log { + +using consensus::kInvalidOpIdIndex; +using std::pair; +using std::string; +using strings::Substitute; +using strings::SubstituteAndAppend; + +LogAnchorRegistry::LogAnchorRegistry() { +} + +LogAnchorRegistry::~LogAnchorRegistry() { + CHECK(anchors_.empty()); +} + +void LogAnchorRegistry::Register(int64_t log_index, + const string& owner, + LogAnchor* anchor) { + boost::lock_guard l(lock_); + RegisterUnlocked(log_index, owner, anchor); +} + +Status LogAnchorRegistry::UpdateRegistration(int64_t log_index, + const std::string& owner, + LogAnchor* anchor) { + boost::lock_guard l(lock_); + RETURN_NOT_OK_PREPEND(UnregisterUnlocked(anchor), + "Unable to swap registration, anchor not registered") + RegisterUnlocked(log_index, owner, anchor); + return Status::OK(); +} + +Status LogAnchorRegistry::Unregister(LogAnchor* anchor) { + boost::lock_guard l(lock_); + return UnregisterUnlocked(anchor); +} + +Status LogAnchorRegistry::UnregisterIfAnchored(LogAnchor* anchor) { + boost::lock_guard l(lock_); + if (!anchor->is_registered) return Status::OK(); + return UnregisterUnlocked(anchor); +} + +Status LogAnchorRegistry::GetEarliestRegisteredLogIndex(int64_t* log_index) { + boost::lock_guard l(lock_); + auto iter = anchors_.begin(); + if (iter == anchors_.end()) { + return Status::NotFound("No anchors in registry"); + } + + // Since this is a sorted map, the first element 
is the one we want. + *log_index = iter->first; + return Status::OK(); +} + +size_t LogAnchorRegistry::GetAnchorCountForTests() const { + boost::lock_guard l(lock_); + return anchors_.size(); +} + +std::string LogAnchorRegistry::DumpAnchorInfo() const { + string buf; + boost::lock_guard l(lock_); + MonoTime now = MonoTime::Now(MonoTime::FINE); + for (const AnchorMultiMap::value_type& entry : anchors_) { + const LogAnchor* anchor = entry.second; + DCHECK(anchor->is_registered); + if (!buf.empty()) buf += ", "; + SubstituteAndAppend(&buf, "LogAnchor[index=$0, age=$1s, owner=$2]", + anchor->log_index, + now.GetDeltaSince(anchor->when_registered).ToSeconds(), + anchor->owner); + } + return buf; +} + +void LogAnchorRegistry::RegisterUnlocked(int64_t log_index, + const std::string& owner, + LogAnchor* anchor) { + DCHECK(anchor != nullptr); + DCHECK(!anchor->is_registered); + + anchor->log_index = log_index; + anchor->owner.assign(owner); + anchor->is_registered = true; + anchor->when_registered = MonoTime::Now(MonoTime::FINE); + AnchorMultiMap::value_type value(log_index, anchor); + anchors_.insert(value); +} + +Status LogAnchorRegistry::UnregisterUnlocked(LogAnchor* anchor) { + DCHECK(anchor != nullptr); + DCHECK(anchor->is_registered); + + auto iter = anchors_.find(anchor->log_index); + while (iter != anchors_.end()) { + if (iter->second == anchor) { + anchor->is_registered = false; + anchors_.erase(iter); + // No need for the iterator to remain valid since we return here. 
+ return Status::OK(); + } else { + ++iter; + } + } + return Status::NotFound(Substitute("Anchor with index $0 and owner $1 not found", + anchor->log_index, anchor->owner)); +} + +LogAnchor::LogAnchor() + : is_registered(false), + log_index(kInvalidOpIdIndex) { +} + +LogAnchor::~LogAnchor() { + CHECK(!is_registered) << "Attempted to destruct a registered LogAnchor"; +} + +MinLogIndexAnchorer::MinLogIndexAnchorer(LogAnchorRegistry* registry, + string owner) + : registry_(DCHECK_NOTNULL(registry)), + owner_(std::move(owner)), + minimum_log_index_(kInvalidOpIdIndex) {} + +MinLogIndexAnchorer::~MinLogIndexAnchorer() { + CHECK_OK(ReleaseAnchor()); +} + +void MinLogIndexAnchorer::AnchorIfMinimum(int64_t log_index) { + boost::lock_guard l(lock_); + if (PREDICT_FALSE(minimum_log_index_ == kInvalidOpIdIndex)) { + minimum_log_index_ = log_index; + registry_->Register(minimum_log_index_, owner_, &anchor_); + } else if (log_index < minimum_log_index_) { + minimum_log_index_ = log_index; + CHECK_OK(registry_->UpdateRegistration(minimum_log_index_, owner_, &anchor_)); + } +} + +Status MinLogIndexAnchorer::ReleaseAnchor() { + boost::lock_guard l(lock_); + if (PREDICT_TRUE(minimum_log_index_ != kInvalidOpIdIndex)) { + return registry_->Unregister(&anchor_); + } + return Status::OK(); // If there were no inserts, return OK. +} + +int64_t MinLogIndexAnchorer::minimum_log_index() const { + boost::lock_guard l(lock_); + return minimum_log_index_; +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_anchor_registry.h b/src/kudu/consensus/log_anchor_registry.h new file mode 100644 index 000000000000..aaf5c8d0aafb --- /dev/null +++ b/src/kudu/consensus/log_anchor_registry.h @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_LOG_ANCHOR_REGISTRY_ +#define KUDU_CONSENSUS_LOG_ANCHOR_REGISTRY_ + +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace log { + +struct LogAnchor; + +// This class allows callers to register their interest in (anchor) a particular +// log index. The primary use case for this is to prevent the deletion of segments of +// the WAL that reference as-yet unflushed in-memory operations. +// +// This class is thread-safe. +class LogAnchorRegistry : public RefCountedThreadSafe { + public: + LogAnchorRegistry(); + + // Register interest for a particular log index. + // log_index: The log index the caller wishes to anchor. + // owner: String to describe who is registering the anchor. Used in assert + // messages for debugging purposes. + // anchor: Pointer to LogAnchor structure that will be populated on registration. + void Register(int64_t log_index, const std::string& owner, LogAnchor* anchor); + + // Atomically update the registration of an anchor to a new log index. + // Before: anchor must be registered with some log index. + // After: anchor is now registered using index 'log_index'. + // See Register(). 
+ Status UpdateRegistration(int64_t log_index, + const std::string& owner, + LogAnchor* anchor); + + // Release the anchor on a log index. + // Note: anchor must be the original pointer passed to Register(). + Status Unregister(LogAnchor* anchor); + + // Release the anchor on a log index if it is registered. + // Otherwise, do nothing. + Status UnregisterIfAnchored(LogAnchor* anchor); + + // Query the registry to find the earliest anchored log index in the registry. + // Returns Status::NotFound if no anchors are currently active. + Status GetEarliestRegisteredLogIndex(int64_t* op_id); + + // Simply returns the number of active anchors for use in debugging / tests. + // This is _not_ a constant-time operation. + size_t GetAnchorCountForTests() const; + + // Dumps information about registered anchors to a string. + std::string DumpAnchorInfo() const; + + private: + friend class RefCountedThreadSafe; + ~LogAnchorRegistry(); + + typedef std::multimap AnchorMultiMap; + + // Register a new anchor after taking the lock. See Register(). + void RegisterUnlocked(int64_t log_index, const std::string& owner, LogAnchor* anchor); + + // Unregister an anchor after taking the lock. See Unregister(). + Status UnregisterUnlocked(LogAnchor* anchor); + + AnchorMultiMap anchors_; + mutable simple_spinlock lock_; + + DISALLOW_COPY_AND_ASSIGN(LogAnchorRegistry); +}; + +// A simple struct that allows us to keep track of which log segments we want +// to anchor (prevent log GC on). +struct LogAnchor { + public: + LogAnchor(); + ~LogAnchor(); + + private: + FRIEND_TEST(LogTest, TestGCWithLogRunning); + FRIEND_TEST(LogAnchorRegistryTest, TestUpdateRegistration); + friend class LogAnchorRegistry; + + // Whether any log index is currently registered with this anchor. + bool is_registered; + + // When this anchor was last registered or updated. + MonoTime when_registered; + + // The index of the log entry we are anchoring on. 
+ int64_t log_index; + + // An arbitrary string containing details of the subsystem holding the + // anchor, and any relevant information about it that should be displayed in + // the log or the web UI. + std::string owner; + + DISALLOW_COPY_AND_ASSIGN(LogAnchor); +}; + +// Helper class that will anchor the minimum log index recorded. +class MinLogIndexAnchorer { + public: + // Construct anchorer for specified registry that will register anchors with + // the specified owner name. + MinLogIndexAnchorer(LogAnchorRegistry* registry, std::string owner); + + // The destructor will unregister the anchor if it is registered. + ~MinLogIndexAnchorer(); + + // If op_id is less than the minimum index registered so far, or if no indexes + // are currently registered, anchor on 'log_index'. + void AnchorIfMinimum(int64_t log_index); + + // Un-anchors the earliest index (which is the only one tracked). + // If no minimum is known (no anchor registered), returns OK. + Status ReleaseAnchor(); + + // Returns the first recorded log index, kInvalidOpIdIndex if there's none. + int64_t minimum_log_index() const; + + private: + const scoped_refptr registry_; + const std::string owner_; + LogAnchor anchor_; + + // The index currently anchored, or kInvalidOpIdIndex if no anchor has yet been registered. + int64_t minimum_log_index_; + mutable simple_spinlock lock_; + + DISALLOW_COPY_AND_ASSIGN(MinLogIndexAnchorer); +}; + +} // namespace log +} // namespace kudu + +#endif // KUDU_CONSENSUS_LOG_ANCHOR_REGISTRY_ diff --git a/src/kudu/consensus/log_cache-test.cc b/src/kudu/consensus/log_cache-test.cc new file mode 100644 index 000000000000..b9a71f1aaff6 --- /dev/null +++ b/src/kudu/consensus/log_cache-test.cc @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_cache.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/bind_helpers.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" + +using std::shared_ptr; + +DECLARE_int32(log_cache_size_limit_mb); +DECLARE_int32(global_log_cache_size_limit_mb); + +METRIC_DECLARE_entity(tablet); + +namespace kudu { +namespace consensus { + +static const char* kPeerUuid = "leader"; +static const char* kTestTablet = "test-tablet"; + +class LogCacheTest : public KuduTest { + public: + LogCacheTest() + : schema_(GetSimpleTestSchema()), + metric_entity_(METRIC_ENTITY_tablet.Instantiate(&metric_registry_, "LogCacheTest")) { + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + CHECK_OK(log::Log::Open(log::LogOptions(), + fs_manager_.get(), + kTestTablet, + schema_, + 0, // schema_version + NULL, + &log_)); + + CloseAndReopenCache(MinimumOpId()); + clock_.reset(new server::HybridClock()); + ASSERT_OK(clock_->Init()); + } + 
+ virtual void TearDown() OVERRIDE { + log_->WaitUntilAllFlushed(); + } + + void CloseAndReopenCache(const OpId& preceding_id) { + // Blow away the memtrackers before creating the new cache. + cache_.reset(); + + cache_.reset(new LogCache(metric_entity_, + log_.get(), + kPeerUuid, + kTestTablet)); + cache_->Init(preceding_id); + } + + protected: + static void FatalOnError(const Status& s) { + CHECK_OK(s); + } + + Status AppendReplicateMessagesToCache( + int first, + int count, + int payload_size = 0) { + + for (int i = first; i < first + count; i++) { + int term = i / 7; + int index = i; + vector msgs; + msgs.push_back(make_scoped_refptr_replicate( + CreateDummyReplicate(term, index, clock_->Now(), payload_size).release())); + RETURN_NOT_OK(cache_->AppendOperations(msgs, Bind(&FatalOnError))); + } + return Status::OK(); + } + + const Schema schema_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + gscoped_ptr fs_manager_; + gscoped_ptr cache_; + scoped_refptr log_; + scoped_refptr clock_; +}; + + +TEST_F(LogCacheTest, TestAppendAndGetMessages) { + ASSERT_EQ(0, cache_->metrics_.log_cache_num_ops->value()); + ASSERT_EQ(0, cache_->metrics_.log_cache_size->value()); + ASSERT_OK(AppendReplicateMessagesToCache(1, 100)); + ASSERT_EQ(100, cache_->metrics_.log_cache_num_ops->value()); + ASSERT_GE(cache_->metrics_.log_cache_size->value(), 500); + log_->WaitUntilAllFlushed(); + + vector messages; + OpId preceding; + ASSERT_OK(cache_->ReadOps(0, 8 * 1024 * 1024, &messages, &preceding)); + EXPECT_EQ(100, messages.size()); + EXPECT_EQ("0.0", OpIdToString(preceding)); + + // Get starting in the middle of the cache. 
+ messages.clear(); + ASSERT_OK(cache_->ReadOps(70, 8 * 1024 * 1024, &messages, &preceding)); + EXPECT_EQ(30, messages.size()); + EXPECT_EQ("10.70", OpIdToString(preceding)); + EXPECT_EQ("10.71", OpIdToString(messages[0]->get()->id())); + + // Get at the end of the cache + messages.clear(); + ASSERT_OK(cache_->ReadOps(100, 8 * 1024 * 1024, &messages, &preceding)); + EXPECT_EQ(0, messages.size()); + EXPECT_EQ("14.100", OpIdToString(preceding)); + + // Evict some and verify that the eviction took effect. + cache_->EvictThroughOp(50); + ASSERT_EQ(50, cache_->metrics_.log_cache_num_ops->value()); + + // Can still read data that was evicted, since it got written through. + messages.clear(); + ASSERT_OK(cache_->ReadOps(20, 8 * 1024 * 1024, &messages, &preceding)); + EXPECT_EQ(80, messages.size()); + EXPECT_EQ("2.20", OpIdToString(preceding)); + EXPECT_EQ("3.21", OpIdToString(messages[0]->get()->id())); +} + + +// Ensure that the cache always yields at least one message, +// even if that message is larger than the batch size. This ensures +// that we don't get "stuck" in the case that a large message enters +// the cache. +TEST_F(LogCacheTest, TestAlwaysYieldsAtLeastOneMessage) { + // generate a 2MB dummy payload + const int kPayloadSize = 2 * 1024 * 1024; + + // Append several large ops to the cache + ASSERT_OK(AppendReplicateMessagesToCache(1, 4, kPayloadSize)); + log_->WaitUntilAllFlushed(); + + // We should get one of them, even though we only ask for 100 bytes + vector messages; + OpId preceding; + ASSERT_OK(cache_->ReadOps(0, 100, &messages, &preceding)); + ASSERT_EQ(1, messages.size()); +} + +// Tests that the cache returns Status::NotFound() if queried for messages after an +// index that is higher than it's latest, returns an empty set of messages when queried for +// the the last index and returns all messages when queried for MinimumOpId(). 
+TEST_F(LogCacheTest, TestCacheEdgeCases) { + // Append 1 message to the cache + ASSERT_OK(AppendReplicateMessagesToCache(1, 1)); + log_->WaitUntilAllFlushed(); + + std::vector messages; + OpId preceding; + + // Test when the searched index is MinimumOpId().index(). + ASSERT_OK(cache_->ReadOps(0, 100, &messages, &preceding)); + ASSERT_EQ(1, messages.size()); + ASSERT_OPID_EQ(MakeOpId(0, 0), preceding); + + messages.clear(); + preceding.Clear(); + // Test when 'after_op_index' is the last index in the cache. + ASSERT_OK(cache_->ReadOps(1, 100, &messages, &preceding)); + ASSERT_EQ(0, messages.size()); + ASSERT_OPID_EQ(MakeOpId(0, 1), preceding); + + messages.clear(); + preceding.Clear(); + // Now test the case when 'after_op_index' is after the last index + // in the cache. + Status s = cache_->ReadOps(2, 100, &messages, &preceding); + ASSERT_TRUE(s.IsIncomplete()) << "unexpected status: " << s.ToString(); + ASSERT_EQ(0, messages.size()); + ASSERT_FALSE(preceding.IsInitialized()); + + messages.clear(); + preceding.Clear(); + + // Evict entries from the cache, and ensure that we can still read + // entries at the beginning of the log. + cache_->EvictThroughOp(50); + ASSERT_OK(cache_->ReadOps(0, 100, &messages, &preceding)); + ASSERT_EQ(1, messages.size()); + ASSERT_OPID_EQ(MakeOpId(0, 0), preceding); +} + + +TEST_F(LogCacheTest, TestMemoryLimit) { + FLAGS_log_cache_size_limit_mb = 1; + CloseAndReopenCache(MinimumOpId()); + + const int kPayloadSize = 400 * 1024; + // Limit should not be violated. + ASSERT_OK(AppendReplicateMessagesToCache(1, 1, kPayloadSize)); + log_->WaitUntilAllFlushed(); + ASSERT_EQ(1, cache_->num_cached_ops()); + + // Verify the size is right. It's not exactly kPayloadSize because of in-memory + // overhead, etc. + int size_with_one_msg = cache_->BytesUsed(); + ASSERT_GT(size_with_one_msg, 300 * 1024); + ASSERT_LT(size_with_one_msg, 500 * 1024); + + // Add another operation which fits under the 1MB limit. 
+ ASSERT_OK(AppendReplicateMessagesToCache(2, 1, kPayloadSize)); + log_->WaitUntilAllFlushed(); + ASSERT_EQ(2, cache_->num_cached_ops()); + + int size_with_two_msgs = cache_->BytesUsed(); + ASSERT_GT(size_with_two_msgs, 2 * 300 * 1024); + ASSERT_LT(size_with_two_msgs, 2 * 500 * 1024); + + // Append a third operation, which will push the cache size above the 1MB limit + // and cause eviction of the first operation. + LOG(INFO) << "appending op 3"; + // Verify that we have trimmed by appending a message that would + // otherwise be rejected, since the cache max size limit is 2MB. + ASSERT_OK(AppendReplicateMessagesToCache(3, 1, kPayloadSize)); + log_->WaitUntilAllFlushed(); + ASSERT_EQ(2, cache_->num_cached_ops()); + ASSERT_EQ(size_with_two_msgs, cache_->BytesUsed()); + + // Test explicitly evicting one of the ops. + cache_->EvictThroughOp(2); + ASSERT_EQ(1, cache_->num_cached_ops()); + ASSERT_EQ(size_with_one_msg, cache_->BytesUsed()); + + // Explicitly evict the last op. + cache_->EvictThroughOp(3); + ASSERT_EQ(0, cache_->num_cached_ops()); + ASSERT_EQ(cache_->BytesUsed(), 0); +} + +TEST_F(LogCacheTest, TestGlobalMemoryLimit) { + FLAGS_global_log_cache_size_limit_mb = 4; + CloseAndReopenCache(MinimumOpId()); + + // Exceed the global hard limit. + ScopedTrackedConsumption consumption(cache_->parent_tracker_, 3*1024*1024); + + const int kPayloadSize = 768 * 1024; + + // Should succeed, but only end up caching one of the two ops because of the global limit. + ASSERT_OK(AppendReplicateMessagesToCache(1, 2, kPayloadSize)); + log_->WaitUntilAllFlushed(); + + ASSERT_EQ(1, cache_->num_cached_ops()); + ASSERT_LE(cache_->BytesUsed(), 1024 * 1024); +} + +// Test that the log cache properly replaces messages when an index +// is reused. This is a regression test for a bug where the memtracker's +// consumption wasn't properly managed when messages were replaced. 
+TEST_F(LogCacheTest, TestReplaceMessages) { + const int kPayloadSize = 128 * 1024; + shared_ptr tracker = cache_->tracker_;; + ASSERT_EQ(0, tracker->consumption()); + + ASSERT_OK(AppendReplicateMessagesToCache(1, 1, kPayloadSize)); + int size_with_one_msg = tracker->consumption(); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(AppendReplicateMessagesToCache(1, 1, kPayloadSize)); + } + + log_->WaitUntilAllFlushed(); + + EXPECT_EQ(size_with_one_msg, tracker->consumption()); + EXPECT_EQ(Substitute("Pinned index: 2, LogCacheStats(num_ops=1, bytes=$0)", + size_with_one_msg), + cache_->ToString()); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/log_cache.cc b/src/kudu/consensus/log_cache.cc new file mode 100644 index 000000000000..3700357148ac --- /dev/null +++ b/src/kudu/consensus/log_cache.cc @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/log_cache.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/gutil/bind.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/locks.h" +#include "kudu/util/logging.h" + +DEFINE_int32(log_cache_size_limit_mb, 128, + "The total per-tablet size of consensus entries which may be kept in memory. " + "The log cache attempts to keep all entries which have not yet been replicated " + "to all followers in memory, but if the total size of those entries exceeds " + "this limit within an individual tablet, the oldest will be evicted."); +TAG_FLAG(log_cache_size_limit_mb, advanced); + +DEFINE_int32(global_log_cache_size_limit_mb, 1024, + "Server-wide version of 'log_cache_size_limit_mb'. 
The total memory used for " + "caching log entries across all tablets is kept under this threshold."); +TAG_FLAG(global_log_cache_size_limit_mb, advanced); + +using strings::Substitute; + +namespace kudu { +namespace consensus { + +METRIC_DEFINE_gauge_int64(tablet, log_cache_num_ops, "Log Cache Operation Count", + MetricUnit::kOperations, + "Number of operations in the log cache."); +METRIC_DEFINE_gauge_int64(tablet, log_cache_size, "Log Cache Memory Usage", + MetricUnit::kBytes, + "Amount of memory in use for caching the local log."); + +static const char kParentMemTrackerId[] = "log_cache"; + +typedef vector::const_iterator MsgIter; + +LogCache::LogCache(const scoped_refptr& metric_entity, + const scoped_refptr& log, + const string& local_uuid, + const string& tablet_id) + : log_(log), + local_uuid_(local_uuid), + tablet_id_(tablet_id), + next_sequential_op_index_(0), + min_pinned_op_index_(0), + metrics_(metric_entity) { + + + const int64_t max_ops_size_bytes = FLAGS_log_cache_size_limit_mb * 1024 * 1024; + const int64_t global_max_ops_size_bytes = FLAGS_global_log_cache_size_limit_mb * 1024 * 1024; + + // Set up (or reuse) a tracker with the global limit. It is parented directly + // to the root tracker so that it's always global. + parent_tracker_ = MemTracker::FindOrCreateTracker(global_max_ops_size_bytes, + kParentMemTrackerId); + + // And create a child tracker with the per-tablet limit. + tracker_ = MemTracker::CreateTracker( + max_ops_size_bytes, Substitute("$0:$1:$2", kParentMemTrackerId, + local_uuid, tablet_id), + parent_tracker_); + + // Put a fake message at index 0, since this simplifies a lot of our + // code paths elsewhere. 
+ auto zero_op = new ReplicateMsg(); + *zero_op->mutable_id() = MinimumOpId(); + InsertOrDie(&cache_, 0, make_scoped_refptr_replicate(zero_op)); +} + +LogCache::~LogCache() { + tracker_->Release(tracker_->consumption()); + cache_.clear(); + + // Don't need to unregister parent_tracker_ because it is reused in each + // LogCache, not duplicated. + tracker_->UnregisterFromParent(); +} + +void LogCache::Init(const OpId& preceding_op) { + lock_guard l(&lock_); + CHECK_EQ(cache_.size(), 1) + << "Cache should have only our special '0' op"; + next_sequential_op_index_ = preceding_op.index() + 1; + min_pinned_op_index_ = next_sequential_op_index_; +} + +Status LogCache::AppendOperations(const vector& msgs, + const StatusCallback& callback) { + unique_lock l(&lock_); + + int size = msgs.size(); + CHECK_GT(size, 0); + + // If we're not appending a consecutive op we're likely overwriting and + // need to replace operations in the cache. + int64_t first_idx_in_batch = msgs.front()->get()->id().index(); + int64_t last_idx_in_batch = msgs.back()->get()->id().index(); + + if (first_idx_in_batch != next_sequential_op_index_) { + // If the index is not consecutive then it must be lower than or equal + // to the last index, i.e. we're overwriting. + CHECK_LE(first_idx_in_batch, next_sequential_op_index_); + + // Now remove the overwritten operations. + for (int64_t i = first_idx_in_batch; i < next_sequential_op_index_; ++i) { + ReplicateRefPtr msg = EraseKeyReturnValuePtr(&cache_, i); + if (msg != nullptr) { + AccountForMessageRemovalUnlocked(msg); + } + } + } + + + int64_t mem_required = 0; + for (const auto& msg : msgs) { + mem_required += msg->get()->SpaceUsed(); + } + + // Try to consume the memory. If it can't be consumed, we may need to evict. 
+ bool borrowed_memory = false; + if (!tracker_->TryConsume(mem_required)) { + int spare = tracker_->SpareCapacity(); + int need_to_free = mem_required - spare; + VLOG_WITH_PREFIX_UNLOCKED(1) << "Memory limit would be exceeded trying to append " + << HumanReadableNumBytes::ToString(mem_required) + << " to log cache (available=" + << HumanReadableNumBytes::ToString(spare) + << "): attempting to evict some operations..."; + + // TODO: we should also try to evict from other tablets - probably better to + // evict really old ops from another tablet than evict recent ops from this one. + EvictSomeUnlocked(min_pinned_op_index_, need_to_free); + + // Force consuming, so that we don't refuse appending data. We might + // blow past our limit a little bit (as much as the number of tablets times + // the amount of in-flight data in the log), but until implementing the above TODO, + // it's difficult to solve this issue. + tracker_->Consume(mem_required); + + borrowed_memory = parent_tracker_->LimitExceeded(); + } + + for (const auto& msg : msgs) { + InsertOrDie(&cache_, msg->get()->id().index(), msg); + } + + // We drop the lock during the AsyncAppendReplicates call, since it may block + // if the queue is full, and the queue might not drain if it's trying to call + // our callback and blocked on this lock. 
+ l.unlock(); + + Status log_status = log_->AsyncAppendReplicates( + msgs, Bind(&LogCache::LogCallback, + Unretained(this), + last_idx_in_batch, + borrowed_memory, + callback)); + l.lock(); + if (!log_status.ok()) { + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Couldn't append to log: " << log_status.ToString(); + tracker_->Release(mem_required); + return log_status; + } + + metrics_.log_cache_size->IncrementBy(mem_required); + metrics_.log_cache_num_ops->IncrementBy(msgs.size()); + + next_sequential_op_index_ = msgs.back()->get()->id().index() + 1; + + return Status::OK(); +} + +void LogCache::LogCallback(int64_t last_idx_in_batch, + bool borrowed_memory, + const StatusCallback& user_callback, + const Status& log_status) { + if (log_status.ok()) { + lock_guard l(&lock_); + if (min_pinned_op_index_ <= last_idx_in_batch) { + VLOG_WITH_PREFIX_UNLOCKED(1) << "Updating pinned index to " << (last_idx_in_batch + 1); + min_pinned_op_index_ = last_idx_in_batch + 1; + } + + // If we went over the global limit in order to log this batch, evict some to + // get back down under the limit. + if (borrowed_memory) { + int64_t spare_capacity = parent_tracker_->SpareCapacity(); + if (spare_capacity < 0) { + EvictSomeUnlocked(min_pinned_op_index_, -spare_capacity); + } + } + } + user_callback.Run(log_status); +} + +bool LogCache::HasOpBeenWritten(int64_t index) const { + lock_guard l(&lock_); + return index < next_sequential_op_index_; +} + +Status LogCache::LookupOpId(int64_t op_index, OpId* op_id) const { + // First check the log cache itself. + { + unique_lock l(&lock_); + + // We sometimes try to look up OpIds that have never been written + // on the local node. In that case, don't try to read the op from + // the log reader, since it might actually race against the writing + // of the op. 
+ if (op_index >= next_sequential_op_index_) { + return Status::Incomplete(Substitute("Op with index $0 is ahead of the local log " + "(next sequential op: $1)", + op_index, next_sequential_op_index_)); + } + auto iter = cache_.find(op_index); + if (iter != cache_.end()) { + *op_id = iter->second->get()->id(); + return Status::OK(); + } + } + + // If it misses, read from the log. + return log_->GetLogReader()->LookupOpId(op_index, op_id); +} + +namespace { +// Calculate the total byte size that will be used on the wire to replicate +// this message as part of a consensus update request. This accounts for the +// length delimiting and tagging of the message. +int64_t TotalByteSizeForMessage(const ReplicateMsg& msg) { + int msg_size = google::protobuf::internal::WireFormatLite::LengthDelimitedSize( + msg.ByteSize()); + msg_size += 1; // for the type tag + return msg_size; +} +} // anonymous namespace + +Status LogCache::ReadOps(int64_t after_op_index, + int max_size_bytes, + std::vector* messages, + OpId* preceding_op) { + DCHECK_GE(after_op_index, 0); + RETURN_NOT_OK(LookupOpId(after_op_index, preceding_op)); + + unique_lock l(&lock_); + int64_t next_index = after_op_index + 1; + + // Return as many operations as we can, up to the limit + int64_t remaining_space = max_size_bytes; + while (remaining_space > 0 && next_index < next_sequential_op_index_) { + + // If the messages the peer needs haven't been loaded into the queue yet, + // load them. 
+ MessageCache::const_iterator iter = cache_.lower_bound(next_index); + if (iter == cache_.end() || iter->first != next_index) { + int64_t up_to; + if (iter == cache_.end()) { + // Read all the way to the current op + up_to = next_sequential_op_index_ - 1; + } else { + // Read up to the next entry that's in the cache + up_to = iter->first - 1; + } + + l.unlock(); + + vector raw_replicate_ptrs; + RETURN_NOT_OK_PREPEND( + log_->GetLogReader()->ReadReplicatesInRange( + next_index, up_to, remaining_space, &raw_replicate_ptrs), + Substitute("Failed to read ops $0..$1", next_index, up_to)); + l.lock(); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Successfully read " << raw_replicate_ptrs.size() << " ops " + << "from disk."; + + for (ReplicateMsg* msg : raw_replicate_ptrs) { + CHECK_EQ(next_index, msg->id().index()); + + remaining_space -= TotalByteSizeForMessage(*msg); + if (remaining_space > 0) { + messages->push_back(make_scoped_refptr_replicate(msg)); + next_index++; + } else { + delete msg; + } + } + + } else { + // Pull contiguous messages from the cache until the size limit is achieved. 
+ for (; iter != cache_.end(); ++iter) { + const ReplicateRefPtr& msg = iter->second; + int64_t index = msg->get()->id().index(); + if (index != next_index) { + continue; + } + + remaining_space -= TotalByteSizeForMessage(*msg->get()); + if (remaining_space < 0 && !messages->empty()) { + break; + } + + messages->push_back(msg); + next_index++; + } + } + } + return Status::OK(); +} + + +void LogCache::EvictThroughOp(int64_t index) { + lock_guard lock(&lock_); + + EvictSomeUnlocked(index, MathLimits::kMax); +} + +void LogCache::EvictSomeUnlocked(int64_t stop_after_index, int64_t bytes_to_evict) { + DCHECK(lock_.is_locked()); + VLOG_WITH_PREFIX_UNLOCKED(2) << "Evicting log cache index <= " + << stop_after_index + << " or " << HumanReadableNumBytes::ToString(bytes_to_evict) + << ": before state: " << ToStringUnlocked(); + + int64_t bytes_evicted = 0; + for (auto iter = cache_.begin(); iter != cache_.end();) { + const ReplicateRefPtr& msg = (*iter).second; + VLOG_WITH_PREFIX_UNLOCKED(2) << "considering for eviction: " << msg->get()->id(); + int64_t msg_index = msg->get()->id().index(); + if (msg_index == 0) { + // Always keep our special '0' op. + ++iter; + continue; + } + + if (msg_index > stop_after_index || msg_index >= min_pinned_op_index_) { + break; + } + + if (!msg->HasOneRef()) { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Evicting cache: cannot remove " << msg->get()->id() + << " because it is in-use by a peer."; + ++iter; + continue; + } + + VLOG_WITH_PREFIX_UNLOCKED(2) << "Evicting cache. 
Removing: " << msg->get()->id(); + AccountForMessageRemovalUnlocked(msg); + bytes_evicted += msg->get()->SpaceUsed(); + cache_.erase(iter++); + + if (bytes_evicted >= bytes_to_evict) { + break; + } + } + VLOG_WITH_PREFIX_UNLOCKED(1) << "Evicting log cache: after state: " << ToStringUnlocked(); +} + +void LogCache::AccountForMessageRemovalUnlocked(const ReplicateRefPtr& msg) { + tracker_->Release(msg->get()->SpaceUsed()); + metrics_.log_cache_size->DecrementBy(msg->get()->SpaceUsed()); + metrics_.log_cache_num_ops->Decrement(); +} + +int64_t LogCache::BytesUsed() const { + return tracker_->consumption(); +} + +string LogCache::StatsString() const { + lock_guard lock(&lock_); + return StatsStringUnlocked(); +} + +string LogCache::StatsStringUnlocked() const { + return Substitute("LogCacheStats(num_ops=$0, bytes=$1)", + metrics_.log_cache_num_ops->value(), + metrics_.log_cache_size->value()); +} + +std::string LogCache::ToString() const { + lock_guard lock(&lock_); + return ToStringUnlocked(); +} + +std::string LogCache::ToStringUnlocked() const { + return Substitute("Pinned index: $0, $1", + min_pinned_op_index_, + StatsStringUnlocked()); +} + +std::string LogCache::LogPrefixUnlocked() const { + return Substitute("T $0 P $1: ", + tablet_id_, + local_uuid_); +} + +void LogCache::DumpToLog() const { + vector strings; + DumpToStrings(&strings); + for (const string& s : strings) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << s; + } +} + +void LogCache::DumpToStrings(vector* lines) const { + lock_guard lock(&lock_); + int counter = 0; + lines->push_back(ToStringUnlocked()); + lines->push_back("Messages:"); + for (const MessageCache::value_type& entry : cache_) { + const ReplicateMsg* msg = entry.second->get(); + lines->push_back( + Substitute("Message[$0] $1.$2 : REPLICATE. 
Type: $3, Size: $4", + counter++, msg->id().term(), msg->id().index(), + OperationType_Name(msg->op_type()), + msg->ByteSize())); + } +} + +void LogCache::DumpToHtml(std::ostream& out) const { + using std::endl; + + lock_guard lock(&lock_); + out << "

Messages:

" << endl; + out << "" << endl; + out << "" << endl; + + int counter = 0; + for (const MessageCache::value_type& entry : cache_) { + const ReplicateMsg* msg = entry.second->get(); + out << Substitute("" + "", + counter++, msg->id().term(), msg->id().index(), + OperationType_Name(msg->op_type()), + msg->ByteSize(), msg->id().ShortDebugString()) << endl; + } + out << "
EntryOpIdTypeSizeStatus
$0$1.$2REPLICATE $3$4$5
"; +} + +#define INSTANTIATE_METRIC(x) \ + x.Instantiate(metric_entity, 0) +LogCache::Metrics::Metrics(const scoped_refptr& metric_entity) + : log_cache_num_ops(INSTANTIATE_METRIC(METRIC_log_cache_num_ops)), + log_cache_size(INSTANTIATE_METRIC(METRIC_log_cache_size)) { +} +#undef INSTANTIATE_METRIC + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/log_cache.h b/src/kudu/consensus/log_cache.h new file mode 100644 index 000000000000..ba6739c0b3e1 --- /dev/null +++ b/src/kudu/consensus/log_cache.h @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_CONSENSUS_LOG_CACHE_H +#define KUDU_CONSENSUS_LOG_CACHE_H + +#include +#include +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/async_util.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MetricEntity; +class MemTracker; + +namespace log { +class Log; +class LogReader; +} // namespace log + +namespace consensus { + +class ReplicateMsg; + +// Write-through cache for the log. +// +// This stores a set of log messages by their index. New operations +// can be appended to the end as they are written to the log. Readers +// fetch entries that were explicitly appended, or they can fetch older +// entries which are asynchronously fetched from the disk. +class LogCache { + public: + LogCache(const scoped_refptr& metric_entity, + const scoped_refptr& log, + const std::string& local_uuid, + const std::string& tablet_id); + ~LogCache(); + + // Initialize the cache. + // + // 'preceding_op' is the current latest op. The next AppendOperation() call + // must follow this op. + // + // Requires that the cache is empty. + void Init(const OpId& preceding_op); + + // Read operations from the log, following 'after_op_index'. + // If such an op exists in the log, an OK result will always include at least one + // operation. + // + // The result will be limited such that the total ByteSize() of the returned ops + // is less than max_size_bytes, unless that would result in an empty result, in + // which case exactly one op is returned. + // + // The OpId which precedes the returned ops is returned in *preceding_op. + // The index of this OpId will match 'after_op_index'. + // + // If the ops being requested are not available in the log, this will synchronously + // read these ops from disk. 
Therefore, this function may take a substantial amount + // of time and should not be called with important locks held, etc. + Status ReadOps(int64_t after_op_index, + int max_size_bytes, + std::vector* messages, + OpId* preceding_op); + + // Append the operations into the log and the cache. + // When the messages have completed writing into the on-disk log, fires 'callback'. + // + // If the cache memory limit is exceeded, the entries may no longer be in the cache + // when the callback fires. + // + // Returns non-OK if the Log append itself fails. + Status AppendOperations(const std::vector& msgs, + const StatusCallback& callback); + + // Return true if an operation with the given index has been written through + // the cache. The operation may not necessarily be durable yet -- it could still be + // en route to the log. + bool HasOpBeenWritten(int64_t log_index) const; + + // Evict any operations with op index <= 'index'. + void EvictThroughOp(int64_t index); + + // Return the number of bytes of memory currently in use by the cache. + int64_t BytesUsed() const; + + int64_t num_cached_ops() const { + return metrics_.log_cache_num_ops->value(); + } + + // Dump the current contents of the cache to the log. + void DumpToLog() const; + + // Dumps the contents of the cache to the provided string vector. + void DumpToStrings(std::vector* lines) const; + + void DumpToHtml(std::ostream& out) const; + + std::string StatsString() const; + + std::string ToString() const; + + // Look up the OpId for the given operation index. + // If it is not in the cache, this consults the on-disk log index and thus + // may take a non-trivial amount of time due to IO. + // + // Returns "Incomplete" if the op has not yet been written. + // Returns "NotFound" if the op has been GCed. + // Returns another bad Status if the log index fails to load (eg. due to an IO error). 
+ Status LookupOpId(int64_t op_index, OpId* op_id) const; + + private: + FRIEND_TEST(LogCacheTest, TestAppendAndGetMessages); + FRIEND_TEST(LogCacheTest, TestGlobalMemoryLimit); + FRIEND_TEST(LogCacheTest, TestReplaceMessages); + friend class LogCacheTest; + + // Try to evict the oldest operations from the queue, stopping either when + // 'bytes_to_evict' bytes have been evicted, or the op with index + // 'stop_after_index' has been evicted, whichever comes first. + void EvictSomeUnlocked(int64_t stop_after_index, int64_t bytes_to_evict); + + // Update metrics and MemTracker to account for the removal of the + // given message. + void AccountForMessageRemovalUnlocked(const ReplicateRefPtr& msg); + + // Return a string with stats + std::string StatsStringUnlocked() const; + + std::string ToStringUnlocked() const; + + std::string LogPrefixUnlocked() const; + + void LogCallback(int64_t last_idx_in_batch, + bool borrowed_memory, + const StatusCallback& user_callback, + const Status& log_status); + + scoped_refptr const log_; + + // The UUID of the local peer. + const std::string local_uuid_; + + // The id of the tablet. + const std::string tablet_id_; + + mutable simple_spinlock lock_; + + // An ordered map that serves as the buffer for the cached messages. + // Maps from log index -> ReplicateMsg + typedef std::map MessageCache; + MessageCache cache_; + + // The next log index to append. Each append operation must either + // start with this log index, or go backward (but never skip forward). + int64_t next_sequential_op_index_; + + // Any operation with an index >= min_pinned_op_ may not be + // evicted from the cache. This is used to prevent ops from being evicted + // until they successfully have been appended to the underlying log. + // Protected by lock_. + int64_t min_pinned_op_index_; + + // Pointer to a parent memtracker for all log caches. This + // exists to compute server-wide cache size and enforce a + // server-wide memory limit. 
When the first instance of a log + // cache is created, a new entry is added to MemTracker's static + // map; subsequent entries merely increment the refcount, so that + // the parent tracker can be deleted if all log caches are + // deleted (e.g., if all tablets are deleted from a server, or if + // the server is shutdown). + std::shared_ptr parent_tracker_; + + // A MemTracker for this instance. + std::shared_ptr tracker_; + + struct Metrics { + explicit Metrics(const scoped_refptr& metric_entity); + + // Keeps track of the total number of operations in the cache. + scoped_refptr > log_cache_num_ops; + + // Keeps track of the memory consumed by the cache, in bytes. + scoped_refptr > log_cache_size; + }; + Metrics metrics_; + + DISALLOW_COPY_AND_ASSIGN(LogCache); +}; + +} // namespace consensus +} // namespace kudu +#endif /* KUDU_CONSENSUS_LOG_CACHE_H */ diff --git a/src/kudu/consensus/log_index-test.cc b/src/kudu/consensus/log_index-test.cc new file mode 100644 index 000000000000..022ddceb436a --- /dev/null +++ b/src/kudu/consensus/log_index-test.cc @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/log_index.h" + +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace log { + +using consensus::MakeOpId; +using consensus::OpId; + +class LogIndexTest : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + index_ = new LogIndex(GetTestDataDirectory()); + } + + protected: + Status AddEntry(const OpId& op_id, int64_t segment, int64_t offset) { + LogIndexEntry entry; + entry.op_id = op_id; + entry.segment_sequence_number = segment; + entry.offset_in_segment = offset; + return index_->AddEntry(entry); + } + + void VerifyEntry(const OpId& op_id, int64_t segment, int64_t offset) { + SCOPED_TRACE(op_id); + LogIndexEntry result; + EXPECT_OK(index_->GetEntry(op_id.index(), &result)); + EXPECT_EQ(op_id.term(), result.op_id.term()); + EXPECT_EQ(op_id.index(), result.op_id.index()); + EXPECT_EQ(segment, result.segment_sequence_number); + EXPECT_EQ(offset, result.offset_in_segment); + } + + void VerifyNotFound(int64_t index) { + SCOPED_TRACE(index); + LogIndexEntry result; + Status s = index_->GetEntry(index, &result); + EXPECT_TRUE(s.IsNotFound()) << s.ToString(); + } + + scoped_refptr index_; +}; + + +TEST_F(LogIndexTest, TestBasic) { + // Insert three entries. + ASSERT_OK(AddEntry(MakeOpId(1, 1), 1, 12345)); + ASSERT_OK(AddEntry(MakeOpId(1, 999999), 1, 999)); + ASSERT_OK(AddEntry(MakeOpId(1, 1500000), 1, 54321)); + VerifyEntry(MakeOpId(1, 1), 1, 12345); + VerifyEntry(MakeOpId(1, 999999), 1, 999); + VerifyEntry(MakeOpId(1, 1500000), 1, 54321); + + // Overwrite one. 
+ ASSERT_OK(AddEntry(MakeOpId(5, 1), 1, 50000)); + VerifyEntry(MakeOpId(5, 1), 1, 50000); +} + +TEST_F(LogIndexTest, TestMultiSegmentWithGC) { + ASSERT_OK(AddEntry(MakeOpId(1, 1), 1, 12345)); + ASSERT_OK(AddEntry(MakeOpId(1, 1000000), 1, 54321)); + ASSERT_OK(AddEntry(MakeOpId(1, 1500000), 1, 54321)); + ASSERT_OK(AddEntry(MakeOpId(1, 2500000), 1, 12345)); + + // GCing indexes < 1,000,000 shouldn't have any effect, because we can't remove any whole segment. + for (int gc = 0; gc < 1000000; gc += 100000) { + SCOPED_TRACE(gc); + index_->GC(gc); + VerifyEntry(MakeOpId(1, 1), 1, 12345); + VerifyEntry(MakeOpId(1, 1000000), 1, 54321); + VerifyEntry(MakeOpId(1, 1500000), 1, 54321); + VerifyEntry(MakeOpId(1, 2500000), 1, 12345); + } + + // If we GC index 1000000, we should lose the first op. + index_->GC(1000000); + VerifyNotFound(1); + VerifyEntry(MakeOpId(1, 1000000), 1, 54321); + VerifyEntry(MakeOpId(1, 1500000), 1, 54321); + VerifyEntry(MakeOpId(1, 2500000), 1, 12345); + + // GC everything + index_->GC(9000000); + VerifyNotFound(1); + VerifyNotFound(1000000); + VerifyNotFound(1500000); + VerifyNotFound(2500000); +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_index.cc b/src/kudu/consensus/log_index.cc new file mode 100644 index 000000000000..51ac6bd4fa5d --- /dev/null +++ b/src/kudu/consensus/log_index.cc @@ -0,0 +1,275 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// The implementation of the Log Index. +// +// The log index is implemented by a set of on-disk files, each containing a fixed number +// (kEntriesPerIndexChunk) of fixed size entries. Each index chunk is numbered such that, +// for a given log index, we can determine which chunk contains its index entry by a +// simple division operation. Because the entries are fixed size, we can compute the +// index offset by a modulo. +// +// When the log is GCed, we remove any index chunks which are no longer needed, and +// unmap them. + +#include "kudu/consensus/log_index.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/consensus/opid_util.h" +#include "kudu/util/locks.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" + +using std::string; +using strings::Substitute; + +#define RETRY_ON_EINTR(ret, expr) do { \ + ret = expr; \ +} while ((ret == -1) && (errno == EINTR)); + +namespace kudu { +namespace log { + +// The actual physical entry in the file. +// This mirrors LogIndexEntry but uses simple primitives only so we can +// read/write it via mmap. +// See LogIndexEntry for docs. 
+struct PhysicalEntry { + int64_t term; + uint64_t segment_sequence_number; + uint64_t offset_in_segment; +} PACKED; + +static const int64_t kEntriesPerIndexChunk = 1000000; +static const int64_t kChunkFileSize = kEntriesPerIndexChunk * sizeof(PhysicalEntry); + +//////////////////////////////////////////////////////////// +// LogIndex::IndexChunk implementation +//////////////////////////////////////////////////////////// + +// A single chunk of the index, representing a fixed number of entries. +// This class maintains the open file descriptor and mapped memory. +class LogIndex::IndexChunk : public RefCountedThreadSafe { + public: + explicit IndexChunk(string path); + ~IndexChunk(); + + // Open and map the memory. + Status Open(); + void GetEntry(int entry_index, PhysicalEntry* ret); + void SetEntry(int entry_index, const PhysicalEntry& entry); + + private: + const string path_; + int fd_; + uint8_t* mapping_; +}; + +namespace { +Status CheckError(int rc, const char* operation) { + if (PREDICT_FALSE(rc < 0)) { + int err = errno; + return Status::IOError(operation, ErrnoToString(err), err); + } + return Status::OK(); +} +} // anonymous namespace + +LogIndex::IndexChunk::IndexChunk(std::string path) + : path_(std::move(path)), fd_(-1), mapping_(nullptr) {} + +LogIndex::IndexChunk::~IndexChunk() { + if (mapping_ != nullptr) { + munmap(mapping_, kChunkFileSize); + } + + if (fd_ >= 0) { + close(fd_); + } +} + +Status LogIndex::IndexChunk::Open() { + RETRY_ON_EINTR(fd_, open(path_.c_str(), O_CLOEXEC | O_CREAT | O_RDWR, 0666)); + RETURN_NOT_OK(CheckError(fd_, "open")); + + int err; + RETRY_ON_EINTR(err, ftruncate(fd_, kChunkFileSize)); + RETURN_NOT_OK(CheckError(fd_, "truncate")); + + mapping_ = static_cast(mmap(nullptr, kChunkFileSize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd_, 0)); + if (mapping_ == nullptr) { + int err = errno; + return Status::IOError("Unable to mmap()", ErrnoToString(err), err); + } + + return Status::OK(); +} + +void 
LogIndex::IndexChunk::GetEntry(int entry_index, PhysicalEntry* ret) { + DCHECK_GE(fd_, 0) << "Must Open() first"; + DCHECK_LT(entry_index, kEntriesPerIndexChunk); + + memcpy(ret, mapping_ + sizeof(PhysicalEntry) * entry_index, sizeof(PhysicalEntry)); +} + +void LogIndex::IndexChunk::SetEntry(int entry_index, const PhysicalEntry& phys) { + DCHECK_GE(fd_, 0) << "Must Open() first"; + DCHECK_LT(entry_index, kEntriesPerIndexChunk); + + memcpy(mapping_ + sizeof(PhysicalEntry) * entry_index, &phys, sizeof(PhysicalEntry)); +} + +//////////////////////////////////////////////////////////// +// LogIndex +//////////////////////////////////////////////////////////// + +LogIndex::LogIndex(std::string base_dir) : base_dir_(std::move(base_dir)) {} + +LogIndex::~LogIndex() { +} + +string LogIndex::GetChunkPath(int64_t chunk_idx) { + return StringPrintf("%s/index.%09" PRId64, base_dir_.c_str(), chunk_idx); +} + +Status LogIndex::OpenChunk(int64_t chunk_idx, scoped_refptr* chunk) { + string path = GetChunkPath(chunk_idx); + + scoped_refptr new_chunk(new IndexChunk(path)); + RETURN_NOT_OK(new_chunk->Open()); + chunk->swap(new_chunk); + return Status::OK(); +} + +Status LogIndex::GetChunkForIndex(int64_t log_index, bool create, + scoped_refptr* chunk) { + CHECK_GT(log_index, 0); + int64_t chunk_idx = log_index / kEntriesPerIndexChunk; + + { + lock_guard l(&open_chunks_lock_); + if (FindCopy(open_chunks_, chunk_idx, chunk)) { + return Status::OK(); + } + } + + if (!create) { + return Status::NotFound("chunk not found"); + } + + RETURN_NOT_OK_PREPEND(OpenChunk(chunk_idx, chunk), + "Couldn't open index chunk"); + { + lock_guard l(&open_chunks_lock_); + if (PREDICT_FALSE(ContainsKey(open_chunks_, chunk_idx))) { + // Someone else opened the chunk in the meantime. + // We'll just return that one. 
+ *chunk = FindOrDie(open_chunks_, chunk_idx); + return Status::OK(); + } + + InsertOrDie(&open_chunks_, chunk_idx, *chunk); + } + + return Status::OK(); +} + +Status LogIndex::AddEntry(const LogIndexEntry& entry) { + scoped_refptr chunk; + RETURN_NOT_OK(GetChunkForIndex(entry.op_id.index(), + true /* create if not found */, + &chunk)); + int index_in_chunk = entry.op_id.index() % kEntriesPerIndexChunk; + + PhysicalEntry phys; + phys.term = entry.op_id.term(); + phys.segment_sequence_number = entry.segment_sequence_number; + phys.offset_in_segment = entry.offset_in_segment; + + chunk->SetEntry(index_in_chunk, phys); + VLOG(3) << "Added log index entry " << entry.ToString(); + + return Status::OK(); +} + +Status LogIndex::GetEntry(int64_t index, LogIndexEntry* entry) { + scoped_refptr chunk; + RETURN_NOT_OK(GetChunkForIndex(index, false /* do not create */, &chunk)); + int index_in_chunk = index % kEntriesPerIndexChunk; + PhysicalEntry phys; + chunk->GetEntry(index_in_chunk, &phys); + + // We never write any real entries to offset 0, because there's a header + // in each log segment. So, this indicates an entry that was never written. + if (phys.offset_in_segment == 0) { + return Status::NotFound("entry not found"); + } + + entry->op_id = consensus::MakeOpId(phys.term, index); + entry->segment_sequence_number = phys.segment_sequence_number; + entry->offset_in_segment = phys.offset_in_segment; + + return Status::OK(); +} + +void LogIndex::GC(int64_t min_index_to_retain) { + int min_chunk_to_retain = min_index_to_retain / kEntriesPerIndexChunk; + + // Enumerate which chunks to delete. + vector chunks_to_delete; + { + lock_guard l(&open_chunks_lock_); + for (auto it = open_chunks_.begin(); + it != open_chunks_.lower_bound(min_chunk_to_retain); ++it) { + chunks_to_delete.push_back(it->first); + } + } + + // Outside of the lock, try to delete them (avoid holding the lock during IO). 
+ for (int64_t chunk_idx : chunks_to_delete) { + string path = GetChunkPath(chunk_idx); + int rc = unlink(path.c_str()); + if (rc != 0) { + PLOG(WARNING) << "Unable to delete index chunk " << path; + continue; + } + LOG(INFO) << "Deleted log index segment " << path; + { + lock_guard l(&open_chunks_lock_); + open_chunks_.erase(chunk_idx); + } + } +} + +string LogIndexEntry::ToString() const { + return Substitute("op_id=$0.$1 segment_sequence_number=$2 offset=$3", + op_id.term(), op_id.index(), + segment_sequence_number, + offset_in_segment); +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_index.h b/src/kudu/consensus/log_index.h new file mode 100644 index 000000000000..7d12ecdebdb3 --- /dev/null +++ b/src/kudu/consensus/log_index.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_LOG_INDEX_H +#define KUDU_CONSENSUS_LOG_INDEX_H + +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace log { + +// An entry in the index. 
+struct LogIndexEntry { + consensus::OpId op_id; + + // The sequence number of the log segment which contains this entry. + int64_t segment_sequence_number; + + // The offset within that log segment for the batch which contains this + // entry. Note that the offset points to an entire batch which may contain + // more than one replicate. + int64_t offset_in_segment; + + std::string ToString() const; +}; + +// An on-disk structure which indexes from OpId index to the specific position in the WAL +// which contains the latest ReplicateMsg for that index. +// +// This structure is on-disk but *not durable*. We use mmap()ed IO to write it out, and +// never sync it to disk. Its only purpose is to allow random-reading earlier entries from +// the log to serve to Raft followers. +// +// This class is thread-safe, but doesn't provide a memory barrier between writers and +// readers. In other words, if a reader is expected to see an index entry written by a +// writer, there should be some other synchronization between them to ensure visibility. +// +// See .cc file for implementation notes. +class LogIndex : public RefCountedThreadSafe { + public: + explicit LogIndex(std::string base_dir); + + // Record an index entry in the index. + Status AddEntry(const LogIndexEntry& entry); + + // Retrieve an existing entry from the index. + // Returns NotFound() if the given log entry was never written. + Status GetEntry(int64_t index, LogIndexEntry* entry); + + // Indicate that we no longer need to retain information about indexes lower than the + // given index. Note that the implementation is conservative and _may_ choose to retain + // earlier entries. + void GC(int64_t min_index_to_retain); + + private: + friend class RefCountedThreadSafe; + ~LogIndex(); + + class IndexChunk; + + // Open the on-disk chunk with the given index. + // Note: 'chunk_idx' is the index of the index chunk, not the index of a log _entry_. 
+ Status OpenChunk(int64_t chunk_idx, scoped_refptr* chunk); + + // Return the index chunk which contains the given log index. + // If 'create' is true, creates it on-demand. If 'create' is false, and + // the index chunk does not exist, returns NotFound. + Status GetChunkForIndex(int64_t log_index, bool create, + scoped_refptr* chunk); + + // Return the path of the given index chunk. + std::string GetChunkPath(int64_t chunk_idx); + + // The base directory where index files are located. + const std::string base_dir_; + + simple_spinlock open_chunks_lock_; + + // Map from chunk index to IndexChunk. The chunk index is the log index modulo + // the number of entries per chunk (see docs in log_index.cc). + // Protected by open_chunks_lock_ + typedef std::map > ChunkMap; + ChunkMap open_chunks_; + + DISALLOW_COPY_AND_ASSIGN(LogIndex); +}; + +} // namespace log +} // namespace kudu +#endif /* KUDU_CONSENSUS_LOG_INDEX_H */ diff --git a/src/kudu/consensus/log_metrics.cc b/src/kudu/consensus/log_metrics.cc new file mode 100644 index 000000000000..3e56c356c70f --- /dev/null +++ b/src/kudu/consensus/log_metrics.cc @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/log_metrics.h" + +#include "kudu/util/metrics.h" + +METRIC_DEFINE_counter(tablet, log_bytes_logged, "Bytes Written to WAL", + kudu::MetricUnit::kBytes, + "Number of bytes logged since service start"); + +METRIC_DEFINE_histogram(tablet, log_sync_latency, "Log Sync Latency", + kudu::MetricUnit::kMicroseconds, + "Microseconds spent on synchronizing the log segment file", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, log_append_latency, "Log Append Latency", + kudu::MetricUnit::kMicroseconds, + "Microseconds spent on appending to the log segment file", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, log_group_commit_latency, "Log Group Commit Latency", + kudu::MetricUnit::kMicroseconds, + "Microseconds spent on committing an entire group", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, log_roll_latency, "Log Roll Latency", + kudu::MetricUnit::kMicroseconds, + "Microseconds spent on rolling over to a new log segment file", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, log_entry_batches_per_group, "Log Group Commit Batch Size", + kudu::MetricUnit::kRequests, + "Number of log entry batches in a group commit group", + 1024, 2); + +namespace kudu { +namespace log { + +#define MINIT(x) x(METRIC_log_##x.Instantiate(metric_entity)) +LogMetrics::LogMetrics(const scoped_refptr& metric_entity) + : MINIT(bytes_logged), + MINIT(sync_latency), + MINIT(append_latency), + MINIT(group_commit_latency), + MINIT(roll_latency), + MINIT(entry_batches_per_group) { +} +#undef MINIT + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_metrics.h b/src/kudu/consensus/log_metrics.h new file mode 100644 index 000000000000..22e712b71a12 --- /dev/null +++ b/src/kudu/consensus/log_metrics.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_LOG_METRICS_H +#define KUDU_CONSENSUS_LOG_METRICS_H + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/monotime.h" + +namespace kudu { + +class Counter; +class Histogram; +class MetricEntity; + +namespace log { + +struct LogMetrics { + explicit LogMetrics(const scoped_refptr& metric_entity); + + // Global stats + scoped_refptr bytes_logged; + + // Per-group group commit stats + scoped_refptr sync_latency; + scoped_refptr append_latency; + scoped_refptr group_commit_latency; + scoped_refptr roll_latency; + scoped_refptr entry_batches_per_group; +}; + +// TODO extract and generalize this for all histogram metrics +#define SCOPED_LATENCY_METRIC(_mtx, _h) \ + ScopedLatencyMetric _h##_metric(_mtx ? _mtx->_h.get() : NULL) + +} // namespace log +} // namespace kudu + +#endif // KUDU_CONSENSUS_LOG_METRICS_H diff --git a/src/kudu/consensus/log_reader.cc b/src/kudu/consensus/log_reader.cc new file mode 100644 index 000000000000..3567f5451ab4 --- /dev/null +++ b/src/kudu/consensus/log_reader.cc @@ -0,0 +1,497 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/consensus/log_reader.h" + +#include +#include + +#include "kudu/consensus/log_index.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding.h" +#include "kudu/util/env_util.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/metrics.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" + +METRIC_DEFINE_counter(tablet, log_reader_bytes_read, "Bytes Read From Log", + kudu::MetricUnit::kBytes, + "Data read from the WAL since tablet start"); + +METRIC_DEFINE_counter(tablet, log_reader_entries_read, "Entries Read From Log", + kudu::MetricUnit::kEntries, + "Number of entries read from the WAL since tablet start"); + +METRIC_DEFINE_histogram(tablet, log_reader_read_batch_latency, "Log Read Latency", + kudu::MetricUnit::kBytes, + "Microseconds spent reading log entry batches", + 60000000LU, 2); + +namespace kudu { +namespace log { + +namespace { +struct LogSegmentSeqnoComparator { + bool operator() (const scoped_refptr& a, + const scoped_refptr& b) { + return a->header().sequence_number() < b->header().sequence_number(); + } +}; +} + +using consensus::OpId; +using consensus::ReplicateMsg; +using env_util::ReadFully; +using strings::Substitute; + +const int LogReader::kNoSizeLimit = -1; + +Status 
LogReader::Open(FsManager *fs_manager, + const scoped_refptr& index, + const string& tablet_id, + const scoped_refptr& metric_entity, + gscoped_ptr *reader) { + gscoped_ptr log_reader(new LogReader(fs_manager, index, tablet_id, + metric_entity)); + + string tablet_wal_path = fs_manager->GetTabletWalDir(tablet_id); + + RETURN_NOT_OK(log_reader->Init(tablet_wal_path)) + reader->reset(log_reader.release()); + return Status::OK(); +} + +Status LogReader::OpenFromRecoveryDir(FsManager *fs_manager, + const string& tablet_id, + const scoped_refptr& metric_entity, + gscoped_ptr* reader) { + string recovery_path = fs_manager->GetTabletWalRecoveryDir(tablet_id); + + // When recovering, we don't want to have any log index -- since it isn't fsynced() + // during writing, its contents are useless to us. + scoped_refptr index(nullptr); + gscoped_ptr log_reader(new LogReader(fs_manager, index, tablet_id, + metric_entity)); + RETURN_NOT_OK_PREPEND(log_reader->Init(recovery_path), + "Unable to initialize log reader"); + reader->reset(log_reader.release()); + return Status::OK(); +} + +LogReader::LogReader(FsManager* fs_manager, + const scoped_refptr& index, + string tablet_id, + const scoped_refptr& metric_entity) + : fs_manager_(fs_manager), + log_index_(index), + tablet_id_(std::move(tablet_id)), + state_(kLogReaderInitialized) { + if (metric_entity) { + bytes_read_ = METRIC_log_reader_bytes_read.Instantiate(metric_entity); + entries_read_ = METRIC_log_reader_entries_read.Instantiate(metric_entity); + read_batch_latency_ = METRIC_log_reader_read_batch_latency.Instantiate(metric_entity); + } +} + +LogReader::~LogReader() { +} + +Status LogReader::Init(const string& tablet_wal_path) { + { + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderInitialized) << "bad state for Init(): " << state_; + } + VLOG(1) << "Reading wal from path:" << tablet_wal_path; + + Env* env = fs_manager_->env(); + + if (!fs_manager_->Exists(tablet_wal_path)) { + return Status::IllegalState("Cannot 
find wal location at", tablet_wal_path); + } + + VLOG(1) << "Parsing segments from path: " << tablet_wal_path; + // list existing segment files + vector log_files; + + RETURN_NOT_OK_PREPEND(env->GetChildren(tablet_wal_path, &log_files), + "Unable to read children from path"); + + SegmentSequence read_segments; + + // build a log segment from each file + for (const string &log_file : log_files) { + if (HasPrefixString(log_file, FsManager::kWalFileNamePrefix)) { + string fqp = JoinPathSegments(tablet_wal_path, log_file); + scoped_refptr segment; + RETURN_NOT_OK_PREPEND(ReadableLogSegment::Open(env, fqp, &segment), + "Unable to open readable log segment"); + DCHECK(segment); + CHECK(segment->IsInitialized()) << "Uninitialized segment at: " << segment->path(); + + if (!segment->HasFooter()) { + LOG(WARNING) << "Log segment " << fqp << " was likely left in-progress " + "after a previous crash. Will try to rebuild footer by scanning data."; + RETURN_NOT_OK(segment->RebuildFooterByScanning()); + } + + read_segments.push_back(segment); + } + } + + // Sort the segments by sequence number. + std::sort(read_segments.begin(), read_segments.end(), LogSegmentSeqnoComparator()); + + + { + boost::lock_guard lock(lock_); + + string previous_seg_path; + int64_t previous_seg_seqno = -1; + for (const SegmentSequence::value_type& entry : read_segments) { + VLOG(1) << " Log Reader Indexed: " << entry->footer().ShortDebugString(); + // Check that the log segments are in sequence. + if (previous_seg_seqno != -1 && entry->header().sequence_number() != previous_seg_seqno + 1) { + return Status::Corruption(Substitute("Segment sequence numbers are not consecutive. 
" + "Previous segment: seqno $0, path $1; Current segment: seqno $2, path $3", + previous_seg_seqno, previous_seg_path, + entry->header().sequence_number(), entry->path())); + previous_seg_seqno++; + } else { + previous_seg_seqno = entry->header().sequence_number(); + } + previous_seg_path = entry->path(); + RETURN_NOT_OK(AppendSegmentUnlocked(entry)); + } + + state_ = kLogReaderReading; + } + return Status::OK(); +} + +Status LogReader::InitEmptyReaderForTests() { + boost::lock_guard lock(lock_); + state_ = kLogReaderReading; + return Status::OK(); +} + +Status LogReader::GetSegmentPrefixNotIncluding(int64_t index, + SegmentSequence* segments) const { + DCHECK_GE(index, 0); + DCHECK(segments); + segments->clear(); + + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderReading); + + for (const scoped_refptr& segment : segments_) { + // The last segment doesn't have a footer. Never include that one. + if (!segment->HasFooter()) { + break; + } + if (segment->footer().max_replicate_index() >= index) { + break; + } + // TODO: tests for edge cases here with backwards ordered replicates. 
+ segments->push_back(segment); + } + + return Status::OK(); +} + +int64_t LogReader::GetMinReplicateIndex() const { + boost::lock_guard lock(lock_); + int64_t min_remaining_op_idx = -1; + + for (const scoped_refptr& segment : segments_) { + if (!segment->HasFooter()) continue; + if (!segment->footer().has_min_replicate_index()) continue; + if (min_remaining_op_idx == -1 || + segment->footer().min_replicate_index() < min_remaining_op_idx) { + min_remaining_op_idx = segment->footer().min_replicate_index(); + } + } + return min_remaining_op_idx; +} + +void LogReader::GetMaxIndexesToSegmentSizeMap(int64_t min_op_idx, int32_t segments_count, + int64_t max_close_time_us, + std::map* + max_idx_to_segment_size) const { + boost::lock_guard lock(lock_); + DCHECK_GE(segments_count, 0); + for (const scoped_refptr& segment : segments_) { + if (max_idx_to_segment_size->size() == segments_count) { + break; + } + DCHECK(segment->HasFooter()); + if (segment->footer().max_replicate_index() < min_op_idx) { + // This means we found a log we can GC. Adjust the expected number of logs. + segments_count--; + continue; + } + + if (max_close_time_us < segment->footer().close_timestamp_micros()) { + int64_t age_seconds = segment->footer().close_timestamp_micros() / 1000000; + VLOG(2) << "Segment " << segment->path() << " is only " << age_seconds << "s old: " + << "won't be counted towards log retention"; + break; + } + (*max_idx_to_segment_size)[segment->footer().max_replicate_index()] = segment->file_size(); + } +} + +scoped_refptr LogReader::GetSegmentBySequenceNumber(int64_t seq) const { + boost::lock_guard lock(lock_); + if (segments_.empty()) { + return nullptr; + } + + // We always have a contiguous set of log segments, so we can find the requested + // segment in our vector by calculating its offset vs the first element. 
+ int64_t first_seqno = segments_[0]->header().sequence_number(); + int64_t relative = seq - first_seqno; + if (relative < 0 || relative >= segments_.size()) { + return nullptr; + } + + DCHECK_EQ(segments_[relative]->header().sequence_number(), seq); + return segments_[relative]; +} + +Status LogReader::ReadBatchUsingIndexEntry(const LogIndexEntry& index_entry, + faststring* tmp_buf, + gscoped_ptr* batch) const { + const int index = index_entry.op_id.index(); + + scoped_refptr segment = GetSegmentBySequenceNumber( + index_entry.segment_sequence_number); + if (PREDICT_FALSE(!segment)) { + return Status::NotFound(Substitute("Segment $0 which contained index $1 has been GCed", + index_entry.segment_sequence_number, + index)); + } + + CHECK_GT(index_entry.offset_in_segment, 0); + int64_t offset = index_entry.offset_in_segment; + ScopedLatencyMetric scoped(read_batch_latency_.get()); + RETURN_NOT_OK_PREPEND(segment->ReadEntryHeaderAndBatch(&offset, tmp_buf, batch), + Substitute("Failed to read LogEntry for index $0 from log segment " + "$1 offset $2", + index, + index_entry.segment_sequence_number, + index_entry.offset_in_segment)); + + if (bytes_read_) { + bytes_read_->IncrementBy(kEntryHeaderSize + tmp_buf->length()); + entries_read_->IncrementBy((**batch).entry_size()); + } + + return Status::OK(); +} + +Status LogReader::ReadReplicatesInRange(const int64_t starting_at, + const int64_t up_to, + int64_t max_bytes_to_read, + vector* replicates) const { + DCHECK_GT(starting_at, 0); + DCHECK_GE(up_to, starting_at); + DCHECK(log_index_) << "Require an index to random-read logs"; + + vector replicates_tmp; + ElementDeleter d(&replicates_tmp); + LogIndexEntry prev_index_entry; + + int64_t total_size = 0; + bool limit_exceeded = false; + faststring tmp_buf; + gscoped_ptr batch; + for (int index = starting_at; index <= up_to && !limit_exceeded; index++) { + LogIndexEntry index_entry; + RETURN_NOT_OK_PREPEND(log_index_->GetEntry(index, &index_entry), + Substitute("Failed to 
read log index for op $0", index)); + + // Since a given LogEntryBatch may contain multiple REPLICATE messages, + // it's likely that this index entry points to the same batch as the previous + // one. If that's the case, we've already read this REPLICATE and we can + // skip reading the batch again. + if (index == starting_at || + index_entry.segment_sequence_number != prev_index_entry.segment_sequence_number || + index_entry.offset_in_segment != prev_index_entry.offset_in_segment) { + RETURN_NOT_OK(ReadBatchUsingIndexEntry(index_entry, &tmp_buf, &batch)); + + // Sanity-check the property that a batch should only have increasing indexes. + int64_t prev_index = 0; + for (int i = 0; i < batch->entry_size(); ++i) { + LogEntryPB* entry = batch->mutable_entry(i); + if (!entry->has_replicate()) continue; + int64_t this_index = entry->replicate().id().index(); + CHECK_GT(this_index, prev_index) + << "Expected that an entry batch should only include increasing log indexes: " + << index_entry.ToString() + << "\nBatch: " << batch->DebugString(); + prev_index = this_index; + } + } + + bool found = false; + for (int i = 0; i < batch->entry_size(); ++i) { + LogEntryPB* entry = batch->mutable_entry(i); + if (!entry->has_replicate()) { + continue; + } + + if (entry->replicate().id().index() != index) { + continue; + } + + int64_t space_required = entry->replicate().SpaceUsed(); + if (replicates_tmp.empty() || + max_bytes_to_read <= 0 || + total_size + space_required < max_bytes_to_read) { + total_size += space_required; + replicates_tmp.push_back(entry->release_replicate()); + } else { + limit_exceeded = true; + } + found = true; + break; + } + CHECK(found) << "Incorrect index entry didn't yield expected log entry: " + << index_entry.ToString(); + + prev_index_entry = index_entry; + } + + replicates->swap(replicates_tmp); + return Status::OK(); +} + +Status LogReader::LookupOpId(int64_t op_index, OpId* op_id) const { + LogIndexEntry index_entry; + 
RETURN_NOT_OK_PREPEND(log_index_->GetEntry(op_index, &index_entry), + strings::Substitute("Failed to read log index for op $0", op_index)); + *op_id = index_entry.op_id; + return Status::OK(); +} + +Status LogReader::GetSegmentsSnapshot(SegmentSequence* segments) const { + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderReading); + segments->assign(segments_.begin(), segments_.end()); + return Status::OK(); +} + +Status LogReader::TrimSegmentsUpToAndIncluding(int64_t segment_sequence_number) { + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderReading); + auto iter = segments_.begin(); + int num_deleted_segments = 0; + + while (iter != segments_.end()) { + if ((*iter)->header().sequence_number() <= segment_sequence_number) { + iter = segments_.erase(iter); + num_deleted_segments++; + continue; + } + break; + } + LOG(INFO) << "T " << tablet_id_ << ": removed " << num_deleted_segments + << " log segments from log reader"; + return Status::OK(); +} + +void LogReader::UpdateLastSegmentOffset(int64_t readable_to_offset) { + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderReading); + DCHECK(!segments_.empty()); + // Get the last segment + ReadableLogSegment* segment = segments_.back().get(); + DCHECK(!segment->HasFooter()); + segment->UpdateReadableToOffset(readable_to_offset); +} + +Status LogReader::ReplaceLastSegment(const scoped_refptr& segment) { + // This is used to replace the last segment once we close it properly so it must + // have a footer. 
+ DCHECK(segment->HasFooter()); + + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderReading); + // Make sure the segment we're replacing has the same sequence number + CHECK(!segments_.empty()); + CHECK_EQ(segment->header().sequence_number(), segments_.back()->header().sequence_number()); + segments_[segments_.size() - 1] = segment; + + return Status::OK(); +} + +Status LogReader::AppendSegment(const scoped_refptr& segment) { + DCHECK(segment->IsInitialized()); + if (PREDICT_FALSE(!segment->HasFooter())) { + RETURN_NOT_OK(segment->RebuildFooterByScanning()); + } + boost::lock_guard lock(lock_); + return AppendSegmentUnlocked(segment); +} + +Status LogReader::AppendSegmentUnlocked(const scoped_refptr& segment) { + DCHECK(segment->IsInitialized()); + DCHECK(segment->HasFooter()); + + if (!segments_.empty()) { + CHECK_EQ(segments_.back()->header().sequence_number() + 1, + segment->header().sequence_number()); + } + segments_.push_back(segment); + return Status::OK(); +} + +Status LogReader::AppendEmptySegment(const scoped_refptr& segment) { + DCHECK(segment->IsInitialized()); + boost::lock_guard lock(lock_); + CHECK_EQ(state_, kLogReaderReading); + if (!segments_.empty()) { + CHECK_EQ(segments_.back()->header().sequence_number() + 1, + segment->header().sequence_number()); + } + segments_.push_back(segment); + return Status::OK(); +} + +const int LogReader::num_segments() const { + boost::lock_guard lock(lock_); + return segments_.size(); +} + +string LogReader::ToString() const { + boost::lock_guard lock(lock_); + string ret = "Reader's SegmentSequence: \n"; + for (const SegmentSequence::value_type& entry : segments_) { + ret.append(Substitute("Segment: $0 Footer: $1\n", + entry->header().sequence_number(), + !entry->HasFooter() ? 
"NONE" : entry->footer().ShortDebugString())); + } + return ret; +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_reader.h b/src/kudu/consensus/log_reader.h new file mode 100644 index 000000000000..3712af4360fe --- /dev/null +++ b/src/kudu/consensus/log_reader.h @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_LOG_LOG_READER_H_ +#define KUDU_LOG_LOG_READER_H_ + +#include +#include +#include +#include +#include + +#include "kudu/consensus/log_metrics.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/spinlock.h" +#include "kudu/util/locks.h" + +namespace kudu { +namespace log { +class Log; +class LogIndex; +struct LogIndexEntry; + +// Reads a set of segments from a given path. Segment headers and footers +// are read and parsed, but entries are not. +// This class is thread safe. +class LogReader { + public: + ~LogReader(); + + // Opens a LogReader on the default tablet log directory, and sets + // 'reader' to the newly created LogReader. + // + // 'index' may be NULL, but if it is, ReadReplicatesInRange() may not + // be used. 
+ static Status Open(FsManager *fs_manager, + const scoped_refptr& index, + const std::string& tablet_id, + const scoped_refptr& metric_entity, + gscoped_ptr *reader); + + // Opens a LogReader on a specific tablet log recovery directory, and sets + // 'reader' to the newly created LogReader. + static Status OpenFromRecoveryDir(FsManager *fs_manager, + const std::string& tablet_id, + const scoped_refptr& metric_entity, + gscoped_ptr *reader); + + // Returns the biggest prefix of segments, from the current sequence, guaranteed + // not to include any replicate messages with indexes >= 'index'. + Status GetSegmentPrefixNotIncluding(int64_t index, + SegmentSequence* segments) const; + + // Return the minimum replicate index that is retained in the currently available + // logs. May return -1 if no replicates have been logged. + int64_t GetMinReplicateIndex() const; + + // Returns a map of maximum log index in segment -> segment size representing all the segments + // that start after 'min_op_idx', up to 'segments_count'. + // + // 'min_op_idx' is the minimum operation index to start looking from, we don't record + // the segments before the one that contain that id. + // + // 'segments_count' is the number of segments we'll add to the map. It _must_ be sized so that + // we don't add the last segment. If we find logs that can be GCed, we'll decrease the number of + // elements we'll add to the map by 1 since they. + // + // 'max_close_time_us' is the timestamp in microseconds from which we don't want to evict, + // meaning that log segments that we closed after that time must not be added to the map. + void GetMaxIndexesToSegmentSizeMap(int64_t min_op_idx, int32_t segments_count, + int64_t max_close_time_us, + std::map* max_idx_to_segment_size) const; + + // Return a readable segment with the given sequence number, or NULL if it + // cannot be found (e.g. if it has already been GCed). 
+ scoped_refptr GetSegmentBySequenceNumber(int64_t seq) const; + + // Copies a snapshot of the current sequence of segments into 'segments'. + // 'segments' will be cleared first. + Status GetSegmentsSnapshot(SegmentSequence* segments) const; + + // Reads all ReplicateMsgs from 'starting_at' to 'up_to' both inclusive. + // The caller takes ownership of the returned ReplicateMsg objects. + // + // Will attempt to read no more than 'max_bytes_to_read', unless it is set to + // LogReader::kNoSizeLimit. If the size limit would prevent reading any operations at + // all, then will read exactly one operation. + // + // Requires that a LogIndex was passed into LogReader::Open(). + Status ReadReplicatesInRange( + const int64_t starting_at, + const int64_t up_to, + int64_t max_bytes_to_read, + std::vector* replicates) const; + static const int kNoSizeLimit; + + // Look up the OpId for the given operation index. + // Returns a bad Status if the log index fails to load (eg. due to an IO error). + Status LookupOpId(int64_t op_index, consensus::OpId* op_id) const; + + // Returns the number of segments. + const int num_segments() const; + + std::string ToString() const; + + private: + FRIEND_TEST(LogTest, TestLogReader); + FRIEND_TEST(LogTest, TestReadLogWithReplacedReplicates); + friend class Log; + friend class LogTest; + + enum State { + kLogReaderInitialized, + kLogReaderReading, + kLogReaderClosed + }; + + // Appends 'segment' to the segments available for read by this reader. + // Index entries in 'segment's footer will be added to the index. + // If the segment has no footer it will be scanned so this should not be used + // for new segments. + Status AppendSegment(const scoped_refptr& segment); + + // Same as above but for segments without any entries. + // Used by the Log to add "empty" segments. + Status AppendEmptySegment(const scoped_refptr& segment); + + // Removes segments with sequence numbers less than or equal to 'seg_seqno' from this reader. 
+ Status TrimSegmentsUpToAndIncluding(int64_t seg_seqno); + + // Replaces the last segment in the reader with 'segment'. + // Used to replace a segment that was still in the process of being written + // with its complete version which has a footer and index entries. + // Requires that the last segment in 'segments_' has the same sequence + // number as 'segment'. + // Expects 'segment' to be properly closed and to have footer. + Status ReplaceLastSegment(const scoped_refptr& segment); + + // Appends 'segment' to the segment sequence. + // Assumes that the segment was scanned, if no footer was found. + // To be used only internally, clients of this class with private access (i.e. friends) + // should use the thread safe version, AppendSegment(), which will also scan the segment + // if no footer is present. + Status AppendSegmentUnlocked(const scoped_refptr& segment); + + // Used by Log to update its LogReader on how far it is possible to read + // the current segment. Requires that the reader has at least one segment + // and that the last segment has no footer, meaning it is currently being + // written to. + void UpdateLastSegmentOffset(int64_t readable_to_offset); + + // Read the LogEntryBatch pointed to by the provided index entry. + // 'tmp_buf' is used as scratch space to avoid extra allocation. + Status ReadBatchUsingIndexEntry(const LogIndexEntry& index_entry, + faststring* tmp_buf, + gscoped_ptr* batch) const; + + LogReader(FsManager* fs_manager, const scoped_refptr& index, + std::string tablet_name, + const scoped_refptr& metric_entity); + + // Reads the headers of all segments in 'path_'. + Status Init(const std::string& path_); + + // Initializes an 'empty' reader for tests, i.e. does not scan a path looking for segments. 
+ Status InitEmptyReaderForTests(); + + FsManager *fs_manager_; + const scoped_refptr log_index_; + const std::string tablet_id_; + + // Metrics + scoped_refptr bytes_read_; + scoped_refptr entries_read_; + scoped_refptr read_batch_latency_; + + // The sequence of all current log segments in increasing sequence number + // order. + SegmentSequence segments_; + + mutable simple_spinlock lock_; + + State state_; + + DISALLOW_COPY_AND_ASSIGN(LogReader); +}; + +} // namespace log +} // namespace kudu + +#endif /* KUDU_LOG_LOG_READER_H_ */ diff --git a/src/kudu/consensus/log_util.cc b/src/kudu/consensus/log_util.cc new file mode 100644 index 000000000000..f699be05efec --- /dev/null +++ b/src/kudu/consensus/log_util.cc @@ -0,0 +1,810 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/log_util.h" + +#include +#include +#include +#include + +#include +#include + +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/coding.h" +#include "kudu/util/crc.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/env_util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/pb_util.h" + +DEFINE_int32(log_segment_size_mb, 64, + "The default segment size for log roll-overs, in MB"); +TAG_FLAG(log_segment_size_mb, advanced); + +DEFINE_bool(log_force_fsync_all, false, + "Whether the Log/WAL should explicitly call fsync() after each write."); +TAG_FLAG(log_force_fsync_all, stable); + +DEFINE_bool(log_preallocate_segments, true, + "Whether the WAL should preallocate the entire segment before writing to it"); +TAG_FLAG(log_preallocate_segments, advanced); + +DEFINE_bool(log_async_preallocate_segments, true, + "Whether the WAL segments preallocation should happen asynchronously"); +TAG_FLAG(log_async_preallocate_segments, advanced); + +namespace kudu { +namespace log { + +using consensus::OpId; +using env_util::ReadFully; +using std::vector; +using std::shared_ptr; +using strings::Substitute; +using strings::SubstituteAndAppend; + +const char kTmpSuffix[] = ".tmp"; + +const char kLogSegmentHeaderMagicString[] = "kudulogf"; + +// A magic that is written as the very last thing when a segment is closed. +// Segments that were not closed (usually the last one being written) will not +// have this magic. +const char kLogSegmentFooterMagicString[] = "closedls"; + +// Header is prefixed with the header magic (8 bytes) and the header length (4 bytes). 
+const size_t kLogSegmentHeaderMagicAndHeaderLength = 12; + +// Footer is suffixed with the footer magic (8 bytes) and the footer length (4 bytes). +const size_t kLogSegmentFooterMagicAndFooterLength = 12; + +const size_t kEntryHeaderSize = 12; + +const int kLogMajorVersion = 1; +const int kLogMinorVersion = 0; + +// Maximum log segment header/footer size, in bytes (8 MB). +const uint32_t kLogSegmentMaxHeaderOrFooterSize = 8 * 1024 * 1024; + +LogOptions::LogOptions() +: segment_size_mb(FLAGS_log_segment_size_mb), + force_fsync_all(FLAGS_log_force_fsync_all), + preallocate_segments(FLAGS_log_preallocate_segments), + async_preallocate_segments(FLAGS_log_async_preallocate_segments) { +} + +Status ReadableLogSegment::Open(Env* env, + const string& path, + scoped_refptr* segment) { + VLOG(1) << "Parsing wal segment: " << path; + shared_ptr readable_file; + RETURN_NOT_OK_PREPEND(env_util::OpenFileForRandom(env, path, &readable_file), + "Unable to open file for reading"); + + segment->reset(new ReadableLogSegment(path, readable_file)); + RETURN_NOT_OK_PREPEND((*segment)->Init(), "Unable to initialize segment"); + return Status::OK(); +} + +ReadableLogSegment::ReadableLogSegment( + std::string path, shared_ptr readable_file) + : path_(std::move(path)), + file_size_(0), + readable_to_offset_(0), + readable_file_(std::move(readable_file)), + is_initialized_(false), + footer_was_rebuilt_(false) {} + +Status ReadableLogSegment::Init(const LogSegmentHeaderPB& header, + const LogSegmentFooterPB& footer, + int64_t first_entry_offset) { + DCHECK(!IsInitialized()) << "Can only call Init() once"; + DCHECK(header.IsInitialized()) << "Log segment header must be initialized"; + DCHECK(footer.IsInitialized()) << "Log segment footer must be initialized"; + + RETURN_NOT_OK(ReadFileSize()); + + header_.CopyFrom(header); + footer_.CopyFrom(footer); + first_entry_offset_ = first_entry_offset; + is_initialized_ = true; + readable_to_offset_.Store(file_size()); + + return Status::OK(); +} + 
+Status ReadableLogSegment::Init(const LogSegmentHeaderPB& header, + int64_t first_entry_offset) { + DCHECK(!IsInitialized()) << "Can only call Init() once"; + DCHECK(header.IsInitialized()) << "Log segment header must be initialized"; + + RETURN_NOT_OK(ReadFileSize()); + + header_.CopyFrom(header); + first_entry_offset_ = first_entry_offset; + is_initialized_ = true; + + // On a new segment, we don't expect any readable entries yet. + readable_to_offset_.Store(first_entry_offset); + + return Status::OK(); +} + +Status ReadableLogSegment::Init() { + DCHECK(!IsInitialized()) << "Can only call Init() once"; + + RETURN_NOT_OK(ReadFileSize()); + + RETURN_NOT_OK(ReadHeader()); + + Status s = ReadFooter(); + if (!s.ok()) { + LOG(WARNING) << "Could not read footer for segment: " << path_ + << ": " << s.ToString(); + } + + is_initialized_ = true; + + readable_to_offset_.Store(file_size()); + + return Status::OK(); +} + +const int64_t ReadableLogSegment::readable_up_to() const { + return readable_to_offset_.Load(); +} + +void ReadableLogSegment::UpdateReadableToOffset(int64_t readable_to_offset) { + readable_to_offset_.Store(readable_to_offset); + file_size_.StoreMax(readable_to_offset); +} + +Status ReadableLogSegment::RebuildFooterByScanning() { + TRACE_EVENT1("log", "ReadableLogSegment::RebuildFooterByScanning", + "path", path_); + + DCHECK(!footer_.IsInitialized()); + vector entries; + ElementDeleter deleter(&entries); + int64_t end_offset = 0; + RETURN_NOT_OK(ReadEntries(&entries, &end_offset)); + + footer_.set_num_entries(entries.size()); + + // Rebuild the min/max replicate index (by scanning) + for (const LogEntryPB* entry : entries) { + if (entry->has_replicate()) { + int64_t index = entry->replicate().id().index(); + // TODO: common code with Log::UpdateFooterForBatch + if (!footer_.has_min_replicate_index() || + index < footer_.min_replicate_index()) { + footer_.set_min_replicate_index(index); + } + if (!footer_.has_max_replicate_index() || + index > 
footer_.max_replicate_index()) { + footer_.set_max_replicate_index(index); + } + } + } + + DCHECK(footer_.IsInitialized()); + DCHECK_EQ(entries.size(), footer_.num_entries()); + footer_was_rebuilt_ = true; + + readable_to_offset_.Store(end_offset); + + LOG(INFO) << "Successfully rebuilt footer for segment: " << path_ + << " (valid entries through byte offset " << end_offset << ")"; + return Status::OK(); +} + +Status ReadableLogSegment::ReadFileSize() { + // Check the size of the file. + // Env uses uint here, even though we generally prefer signed ints to avoid + // underflow bugs. Use a local to convert. + uint64_t size; + RETURN_NOT_OK_PREPEND(readable_file_->Size(&size), "Unable to read file size"); + file_size_.Store(size); + if (size == 0) { + VLOG(1) << "Log segment file $0 is zero-length: " << path(); + return Status::OK(); + } + return Status::OK(); +} + +Status ReadableLogSegment::ReadHeader() { + uint32_t header_size; + RETURN_NOT_OK(ReadHeaderMagicAndHeaderLength(&header_size)); + if (header_size == 0) { + // If a log file has been pre-allocated but not initialized, then + // 'header_size' will be 0 even the file size is > 0; in this + // case, 'is_initialized_' remains set to false and return + // Status::OK() early. LogReader ignores segments where + // IsInitialized() returns false. + return Status::OK(); + } + + if (header_size > kLogSegmentMaxHeaderOrFooterSize) { + return Status::Corruption( + Substitute("File is corrupted. " + "Parsed header size: $0 is zero or bigger than max header size: $1", + header_size, kLogSegmentMaxHeaderOrFooterSize)); + } + + uint8_t header_space[header_size]; + Slice header_slice; + LogSegmentHeaderPB header; + + // Read and parse the log segment header. 
+ RETURN_NOT_OK_PREPEND(ReadFully(readable_file_.get(), kLogSegmentHeaderMagicAndHeaderLength, + header_size, &header_slice, header_space), + "Unable to read fully"); + + RETURN_NOT_OK_PREPEND(pb_util::ParseFromArray(&header, + header_slice.data(), + header_size), + "Unable to parse protobuf"); + + header_.CopyFrom(header); + first_entry_offset_ = header_size + kLogSegmentHeaderMagicAndHeaderLength; + + return Status::OK(); +} + + +Status ReadableLogSegment::ReadHeaderMagicAndHeaderLength(uint32_t *len) { + uint8_t scratch[kLogSegmentHeaderMagicAndHeaderLength]; + Slice slice; + RETURN_NOT_OK(ReadFully(readable_file_.get(), 0, kLogSegmentHeaderMagicAndHeaderLength, + &slice, scratch)); + RETURN_NOT_OK(ParseHeaderMagicAndHeaderLength(slice, len)); + return Status::OK(); +} + +namespace { + +// We don't run TSAN on this function because it makes it really slow and causes some +// test timeouts. This is only used on local buffers anyway, so we don't lose much +// by not checking it. +ATTRIBUTE_NO_SANITIZE_THREAD +bool IsAllZeros(const Slice& s) { + // Walk a pointer through the slice instead of using s[i] + // since this is way faster in debug mode builds. We also do some + // manual unrolling for the same purpose. + const uint8_t* p = &s[0]; + int rem = s.size(); + + while (rem >= 8) { + if (UNALIGNED_LOAD64(p) != 0) return false; + rem -= 8; + p += 8; + } + + while (rem > 0) { + if (*p++ != '\0') return false; + rem--; + } + return true; +} +} // anonymous namespace + +Status ReadableLogSegment::ParseHeaderMagicAndHeaderLength(const Slice &data, + uint32_t *parsed_len) { + RETURN_NOT_OK_PREPEND(data.check_size(kLogSegmentHeaderMagicAndHeaderLength), + "Log segment file is too small to contain initial magic number"); + + if (memcmp(kLogSegmentHeaderMagicString, data.data(), + strlen(kLogSegmentHeaderMagicString)) != 0) { + // As a special case, we check whether the file was allocated but no header + // was written. 
We treat that case as an uninitialized file, much in the + // same way we treat zero-length files. + // Note: While the above comparison checks 8 bytes, this one checks the full 12 + // to ensure we have a full 12 bytes of NULL data. + if (IsAllZeros(data)) { + // 12 bytes of NULLs, good enough for us to consider this a file that + // was never written to (but apparently preallocated). + LOG(WARNING) << "Log segment file " << path() << " has 12 initial NULL bytes instead of " + << "magic and header length: " << data.ToDebugString() + << " and will be treated as a blank segment."; + *parsed_len = 0; + return Status::OK(); + } + // If no magic and not uninitialized, the file is considered corrupt. + return Status::Corruption(Substitute("Invalid log segment file $0: Bad magic. $1", + path(), data.ToDebugString())); + } + + *parsed_len = DecodeFixed32(data.data() + strlen(kLogSegmentHeaderMagicString)); + return Status::OK(); +} + +Status ReadableLogSegment::ReadFooter() { + uint32_t footer_size; + RETURN_NOT_OK(ReadFooterMagicAndFooterLength(&footer_size)); + + if (footer_size == 0 || footer_size > kLogSegmentMaxHeaderOrFooterSize) { + return Status::NotFound( + Substitute("File is corrupted. " + "Parsed header size: $0 is zero or bigger than max header size: $1", + footer_size, kLogSegmentMaxHeaderOrFooterSize)); + } + + if (footer_size > (file_size() - first_entry_offset_)) { + return Status::NotFound("Footer not found. File corrupted. " + "Decoded footer length pointed at a footer before the first entry."); + } + + uint8_t footer_space[footer_size]; + Slice footer_slice; + + int64_t footer_offset = file_size() - kLogSegmentFooterMagicAndFooterLength - footer_size; + + LogSegmentFooterPB footer; + + // Read and parse the log segment footer. + RETURN_NOT_OK_PREPEND(ReadFully(readable_file_.get(), footer_offset, + footer_size, &footer_slice, footer_space), + "Footer not found. 
Could not read fully."); + + RETURN_NOT_OK_PREPEND(pb_util::ParseFromArray(&footer, + footer_slice.data(), + footer_size), + "Unable to parse protobuf"); + + footer_.Swap(&footer); + return Status::OK(); +} + +Status ReadableLogSegment::ReadFooterMagicAndFooterLength(uint32_t *len) { + uint8_t scratch[kLogSegmentFooterMagicAndFooterLength]; + Slice slice; + + CHECK_GT(file_size(), kLogSegmentFooterMagicAndFooterLength); + RETURN_NOT_OK(ReadFully(readable_file_.get(), + file_size() - kLogSegmentFooterMagicAndFooterLength, + kLogSegmentFooterMagicAndFooterLength, + &slice, + scratch)); + + RETURN_NOT_OK(ParseFooterMagicAndFooterLength(slice, len)); + return Status::OK(); +} + +Status ReadableLogSegment::ParseFooterMagicAndFooterLength(const Slice &data, + uint32_t *parsed_len) { + RETURN_NOT_OK_PREPEND(data.check_size(kLogSegmentFooterMagicAndFooterLength), + "Slice is too small to contain final magic number"); + + if (memcmp(kLogSegmentFooterMagicString, data.data(), + strlen(kLogSegmentFooterMagicString)) != 0) { + return Status::NotFound("Footer not found. Footer magic doesn't match"); + } + + *parsed_len = DecodeFixed32(data.data() + strlen(kLogSegmentFooterMagicString)); + return Status::OK(); +} + +Status ReadableLogSegment::ReadEntries(vector* entries, + int64_t* end_offset) { + TRACE_EVENT1("log", "ReadableLogSegment::ReadEntries", + "path", path_); + + vector recent_offsets(4, -1); + int batches_read = 0; + + int64_t offset = first_entry_offset(); + int64_t readable_to_offset = readable_to_offset_.Load(); + VLOG(1) << "Reading segment entries from " + << path_ << ": offset=" << offset << " file_size=" + << file_size() << " readable_to_offset=" << readable_to_offset; + faststring tmp_buf; + + // If we have a footer we only read up to it. If we don't we likely crashed + // and always read to the end. + int64_t read_up_to = (footer_.IsInitialized() && !footer_was_rebuilt_) ? 
+ file_size() - footer_.ByteSize() - kLogSegmentFooterMagicAndFooterLength : + readable_to_offset; + + if (end_offset != nullptr) { + *end_offset = offset; + } + + int num_entries_read = 0; + while (offset < read_up_to) { + const int64_t this_batch_offset = offset; + recent_offsets[batches_read++ % recent_offsets.size()] = offset; + + gscoped_ptr current_batch; + + // Read and validate the entry header first. + Status s; + if (offset + kEntryHeaderSize < read_up_to) { + s = ReadEntryHeaderAndBatch(&offset, &tmp_buf, ¤t_batch); + } else { + s = Status::Corruption(Substitute("Truncated log entry at offset $0", offset)); + } + + if (PREDICT_FALSE(!s.ok())) { + if (!s.IsCorruption()) { + // IO errors should always propagate back + return s.CloneAndPrepend(Substitute("Error reading from log $0", path_)); + } + + Status corruption_status = MakeCorruptionStatus( + batches_read, this_batch_offset, &recent_offsets, + *entries, s); + + // If we have a valid footer in the segment, then the segment was correctly + // closed, and we shouldn't see any corruption anywhere (including the last + // batch). + if (HasFooter() && !footer_was_rebuilt_) { + LOG(WARNING) << "Found a corruption in a closed log segment: " + << corruption_status.ToString(); + return corruption_status; + } + + // If we read a corrupt entry, but we don't have a footer, then it's + // possible that we crashed in the middle of writing an entry. + // In this case, we scan forward to see if there are any more valid looking + // entries after this one in the file. If there are, it's really a corruption. + // if not, we just WARN it, since it's OK for the last entry to be partially + // written. 
+ bool has_valid_entries; + RETURN_NOT_OK_PREPEND(ScanForValidEntryHeaders(offset, &has_valid_entries), + "Scanning forward for valid entries"); + if (has_valid_entries) { + return corruption_status; + } + + LOG(INFO) << "Ignoring log segment corruption in " << path_ << " because " + << "there are no log entries following the corrupted one. " + << "The server probably crashed in the middle of writing an entry " + << "to the write-ahead log or downloaded an active log via remote bootstrap. " + << "Error detail: " << corruption_status.ToString(); + break; + } + + if (VLOG_IS_ON(3)) { + VLOG(3) << "Read Log entry batch: " << current_batch->DebugString(); + } + for (size_t i = 0; i < current_batch->entry_size(); ++i) { + entries->push_back(current_batch->mutable_entry(i)); + num_entries_read++; + } + current_batch->mutable_entry()->ExtractSubrange(0, + current_batch->entry_size(), + nullptr); + if (end_offset != nullptr) { + *end_offset = offset; + } + } + + if (footer_.IsInitialized() && footer_.num_entries() != num_entries_read) { + return Status::Corruption( + Substitute("Read $0 log entries from $1, but expected $2 based on the footer", + num_entries_read, path_, footer_.num_entries())); + } + + return Status::OK(); +} + +Status ReadableLogSegment::ScanForValidEntryHeaders(int64_t offset, bool* has_valid_entries) { + TRACE_EVENT1("log", "ReadableLogSegment::ScanForValidEntryHeaders", + "path", path_); + LOG(INFO) << "Scanning " << path_ << " for valid entry headers " + << "following offset " << offset << "..."; + *has_valid_entries = false; + + const int kChunkSize = 1024 * 1024; + gscoped_ptr buf(new uint8_t[kChunkSize]); + + // We overlap the reads by the size of the header, so that if a header + // spans chunks, we don't miss it. 
+ for (; + offset < file_size() - kEntryHeaderSize; + offset += kChunkSize - kEntryHeaderSize) { + int rem = std::min(file_size() - offset, kChunkSize); + Slice chunk; + RETURN_NOT_OK(ReadFully(readable_file().get(), offset, rem, &chunk, &buf[0])); + + // Optimization for the case where a chunk is all zeros -- this is common in the + // case of pre-allocated files. This avoids a lot of redundant CRC calculation. + if (IsAllZeros(chunk)) { + continue; + } + + // Check if this chunk has a valid entry header. + for (int off_in_chunk = 0; + off_in_chunk < chunk.size() - kEntryHeaderSize; + off_in_chunk++) { + Slice potential_header = Slice(&chunk[off_in_chunk], kEntryHeaderSize); + + EntryHeader header; + if (DecodeEntryHeader(potential_header, &header)) { + LOG(INFO) << "Found a valid entry header at offset " << (offset + off_in_chunk); + *has_valid_entries = true; + return Status::OK(); + } + } + } + + LOG(INFO) << "Found no log entry headers"; + return Status::OK(); +} + +Status ReadableLogSegment::MakeCorruptionStatus(int batch_number, int64_t batch_offset, + vector* recent_offsets, + const std::vector& entries, + const Status& status) const { + + string err = "Log file corruption detected. "; + SubstituteAndAppend(&err, "Failed trying to read batch #$0 at offset $1 for log segment $2: ", + batch_number, batch_offset, path_); + err.append("Prior batch offsets:"); + std::sort(recent_offsets->begin(), recent_offsets->end()); + for (int64_t offset : *recent_offsets) { + if (offset >= 0) { + SubstituteAndAppend(&err, " $0", offset); + } + } + if (!entries.empty()) { + err.append("; Last log entries read:"); + const int kNumEntries = 4; // Include up to the last 4 entries in the segment. 
+ for (int i = std::max(0, static_cast(entries.size()) - kNumEntries); + i < entries.size(); i++) { + LogEntryPB* entry = entries[i]; + LogEntryTypePB type = entry->type(); + string opid_str; + if (type == log::REPLICATE && entry->has_replicate()) { + opid_str = OpIdToString(entry->replicate().id()); + } else if (entry->has_commit() && entry->commit().has_commited_op_id()) { + opid_str = OpIdToString(entry->commit().commited_op_id()); + } else { + opid_str = ""; + } + SubstituteAndAppend(&err, " [$0 ($1)]", LogEntryTypePB_Name(type), opid_str); + } + } + + return status.CloneAndAppend(err); +} + +Status ReadableLogSegment::ReadEntryHeaderAndBatch(int64_t* offset, faststring* tmp_buf, + gscoped_ptr* batch) { + EntryHeader header; + RETURN_NOT_OK(ReadEntryHeader(offset, &header)); + RETURN_NOT_OK(ReadEntryBatch(offset, header, tmp_buf, batch)); + return Status::OK(); +} + + +Status ReadableLogSegment::ReadEntryHeader(int64_t *offset, EntryHeader* header) { + uint8_t scratch[kEntryHeaderSize]; + Slice slice; + RETURN_NOT_OK_PREPEND(ReadFully(readable_file().get(), *offset, kEntryHeaderSize, + &slice, scratch), + "Could not read log entry header"); + + if (PREDICT_FALSE(!DecodeEntryHeader(slice, header))) { + return Status::Corruption("CRC mismatch in log entry header"); + } + *offset += slice.size(); + return Status::OK(); +} + +bool ReadableLogSegment::DecodeEntryHeader(const Slice& data, EntryHeader* header) { + DCHECK_EQ(kEntryHeaderSize, data.size()); + header->msg_length = DecodeFixed32(&data[0]); + header->msg_crc = DecodeFixed32(&data[4]); + header->header_crc = DecodeFixed32(&data[8]); + + // Verify the header. 
+ uint32_t computed_crc = crc::Crc32c(&data[0], 8); + return computed_crc == header->header_crc; +} + + +Status ReadableLogSegment::ReadEntryBatch(int64_t *offset, + const EntryHeader& header, + faststring *tmp_buf, + gscoped_ptr *entry_batch) { + TRACE_EVENT2("log", "ReadableLogSegment::ReadEntryBatch", + "path", path_, + "range", Substitute("offset=$0 entry_len=$1", + *offset, header.msg_length)); + + if (header.msg_length == 0) { + return Status::Corruption("Invalid 0 entry length"); + } + int64_t limit = readable_up_to(); + if (PREDICT_FALSE(header.msg_length + *offset > limit)) { + // The log was likely truncated during writing. + return Status::Corruption( + Substitute("Could not read $0-byte log entry from offset $1 in $2: " + "log only readable up to offset $3", + header.msg_length, *offset, path_, limit)); + } + + tmp_buf->clear(); + tmp_buf->resize(header.msg_length); + Slice entry_batch_slice; + + Status s = readable_file()->Read(*offset, + header.msg_length, + &entry_batch_slice, + tmp_buf->data()); + + if (!s.ok()) return Status::IOError(Substitute("Could not read entry. Cause: $0", + s.ToString())); + + // Verify the CRC. + uint32_t read_crc = crc::Crc32c(entry_batch_slice.data(), entry_batch_slice.size()); + if (PREDICT_FALSE(read_crc != header.msg_crc)) { + return Status::Corruption(Substitute("Entry CRC mismatch in byte range $0-$1: " + "expected CRC=$2, computed=$3", + *offset, *offset + header.msg_length, + header.msg_crc, read_crc)); + } + + + gscoped_ptr read_entry_batch(new LogEntryBatchPB()); + s = pb_util::ParseFromArray(read_entry_batch.get(), + entry_batch_slice.data(), + header.msg_length); + + if (!s.ok()) return Status::Corruption(Substitute("Could parse PB. 
Cause: $0", + s.ToString())); + + *offset += entry_batch_slice.size(); + entry_batch->reset(read_entry_batch.release()); + return Status::OK(); +} + +WritableLogSegment::WritableLogSegment(string path, + shared_ptr writable_file) + : path_(std::move(path)), + writable_file_(std::move(writable_file)), + is_header_written_(false), + is_footer_written_(false), + written_offset_(0) {} + +Status WritableLogSegment::WriteHeaderAndOpen(const LogSegmentHeaderPB& new_header) { + DCHECK(!IsHeaderWritten()) << "Can only call WriteHeader() once"; + DCHECK(new_header.IsInitialized()) + << "Log segment header must be initialized" << new_header.InitializationErrorString(); + faststring buf; + + // First the magic. + buf.append(kLogSegmentHeaderMagicString); + // Then Length-prefixed header. + PutFixed32(&buf, new_header.ByteSize()); + // Then Serialize the PB. + if (!pb_util::AppendToString(new_header, &buf)) { + return Status::Corruption("unable to encode header"); + } + RETURN_NOT_OK(writable_file()->Append(Slice(buf))); + + header_.CopyFrom(new_header); + first_entry_offset_ = buf.size(); + written_offset_ = first_entry_offset_; + is_header_written_ = true; + + return Status::OK(); +} + +Status WritableLogSegment::WriteFooterAndClose(const LogSegmentFooterPB& footer) { + TRACE_EVENT1("log", "WritableLogSegment::WriteFooterAndClose", + "path", path_); + DCHECK(IsHeaderWritten()); + DCHECK(!IsFooterWritten()); + DCHECK(footer.IsInitialized()) << footer.InitializationErrorString(); + + faststring buf; + + if (!pb_util::AppendToString(footer, &buf)) { + return Status::Corruption("unable to encode header"); + } + + buf.append(kLogSegmentFooterMagicString); + PutFixed32(&buf, footer.ByteSize()); + + RETURN_NOT_OK_PREPEND(writable_file()->Append(Slice(buf)), "Could not write the footer"); + + footer_.CopyFrom(footer); + is_footer_written_ = true; + + RETURN_NOT_OK(writable_file_->Close()); + + written_offset_ += buf.size(); + + return Status::OK(); +} + + +Status 
WritableLogSegment::WriteEntryBatch(const Slice& data) { + DCHECK(is_header_written_); + DCHECK(!is_footer_written_); + uint8_t header_buf[kEntryHeaderSize]; + + // First encode the length of the message. + uint32_t len = data.size(); + InlineEncodeFixed32(&header_buf[0], len); + + // Then the CRC of the message. + uint32_t msg_crc = crc::Crc32c(&data[0], data.size()); + InlineEncodeFixed32(&header_buf[4], msg_crc); + + // Then the CRC of the header + uint32_t header_crc = crc::Crc32c(&header_buf, 8); + InlineEncodeFixed32(&header_buf[8], header_crc); + + // Write the header to the file, followed by the batch data itself. + RETURN_NOT_OK(writable_file_->Append(Slice(header_buf, sizeof(header_buf)))); + written_offset_ += sizeof(header_buf); + + RETURN_NOT_OK(writable_file_->Append(data)); + written_offset_ += data.size(); + + return Status::OK(); +} + + +void CreateBatchFromAllocatedOperations(const vector& msgs, + gscoped_ptr* batch) { + gscoped_ptr entry_batch(new LogEntryBatchPB); + entry_batch->mutable_entry()->Reserve(msgs.size()); + for (const auto& msg : msgs) { + LogEntryPB* entry_pb = entry_batch->add_entry(); + entry_pb->set_type(log::REPLICATE); + entry_pb->set_allocated_replicate(msg->get()); + } + batch->reset(entry_batch.release()); +} + +bool IsLogFileName(const string& fname) { + if (HasPrefixString(fname, ".")) { + // Hidden file or ./.. 
+ VLOG(1) << "Ignoring hidden file: " << fname; + return false; + } + + if (HasSuffixString(fname, kTmpSuffix)) { + LOG(WARNING) << "Ignoring tmp file: " << fname; + return false; + } + + vector v = strings::Split(fname, "-"); + if (v.size() != 2 || v[0] != FsManager::kWalFileNamePrefix) { + VLOG(1) << "Not a log file: " << fname; + return false; + } + + return true; +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/log_util.h b/src/kudu/consensus/log_util.h new file mode 100644 index 000000000000..387977af52b4 --- /dev/null +++ b/src/kudu/consensus/log_util.h @@ -0,0 +1,397 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CONSENSUS_LOG_UTIL_H_ +#define KUDU_CONSENSUS_LOG_UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/consensus/log.pb.h" +#include "kudu/consensus/ref_counted_replicate.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/atomic.h" +#include "kudu/util/env.h" + +// Used by other classes, now part of the API. 
+DECLARE_bool(log_force_fsync_all); + +namespace kudu { + +namespace consensus { +struct OpIdBiggerThanFunctor; +} // namespace consensus + +namespace log { + +// Suffix for temprorary files +extern const char kTmpSuffix[]; + +// Each log entry is prefixed by its length (4 bytes), CRC (4 bytes), +// and checksum of the other two fields (see EntryHeader struct below). +extern const size_t kEntryHeaderSize; + +extern const int kLogMajorVersion; +extern const int kLogMinorVersion; + +class ReadableLogSegment; + +// Options for the State Machine/Write Ahead Log +struct LogOptions { + + // The size of a Log segment + // Logs will rollover upon reaching this size (default 64 MB) + size_t segment_size_mb; + + // Whether to call fsync on every call to Append(). + bool force_fsync_all; + + // Whether to fallocate segments before writing to them. + bool preallocate_segments; + + // Whether the allocation should happen asynchronously. + bool async_preallocate_segments; + + LogOptions(); +}; + + +// A sequence of segments, ordered by increasing sequence number. +typedef std::vector > SegmentSequence; + +// A segment of the log can either be a ReadableLogSegment (for replay and +// consensus catch-up) or a WritableLogSegment (where the Log actually stores +// state). LogSegments have a maximum size defined in LogOptions (set from the +// log_segment_size_mb flag, which defaults to 64). Upon reaching this size +// segments are rolled over and the Log continues in a new segment. + +// A readable log segment for recovery and follower catch-up. +class ReadableLogSegment : public RefCountedThreadSafe { + public: + // Factory method to construct a ReadableLogSegment from a file on the FS. + static Status Open(Env* env, + const std::string& path, + scoped_refptr* segment); + + // Build a readable segment to read entries from the provided path. + ReadableLogSegment(std::string path, + std::shared_ptr readable_file); + + // Initialize the ReadableLogSegment. 
+ // This initializer provides methods for avoiding disk IO when creating a + // ReadableLogSegment for the current WritableLogSegment, i.e. for reading + // the log entries in the same segment that is currently being written to. + Status Init(const LogSegmentHeaderPB& header, + int64_t first_entry_offset); + + // Initialize the ReadableLogSegment. + // This initializer provides methods for avoiding disk IO when creating a + // ReadableLogSegment from a WritableLogSegment (i.e. for log rolling). + Status Init(const LogSegmentHeaderPB& header, + const LogSegmentFooterPB& footer, + int64_t first_entry_offset); + + // Initialize the ReadableLogSegment. + // This initializer will parse the log segment header and footer. + // Note: This returns Status and may fail. + Status Init(); + + // Reads all entries of the provided segment & adds them the 'entries' vector. + // The 'entries' vector owns the read entries. + // + // If the log is corrupted (i.e. the returned 'Status' is 'Corruption') all + // the log entries read up to the corrupted one are returned in the 'entries' + // vector. + // + // If 'end_offset' is not NULL, then returns the file offset following the last + // successfully read entry. + Status ReadEntries(std::vector* entries, + int64_t* end_offset = NULL); + + // Rebuilds this segment's footer by scanning its entries. + // This is an expensive operation as it reads and parses the whole segment + // so it should be only used in the case of a crash, where the footer is + // missing because we didn't have the time to write it out. + Status RebuildFooterByScanning(); + + bool IsInitialized() const { + return is_initialized_; + } + + // Returns the parent directory where log segments are stored. + const std::string &path() const { + return path_; + } + + const LogSegmentHeaderPB& header() const { + DCHECK(header_.IsInitialized()); + return header_; + } + + // Indicates whether this segment has a footer. + // + // Segments that were properly closed, e.g. 
because they were rolled over, + // will have properly written footers. On the other hand if there was a + // crash and the segment was not closed properly the footer will be missing. + // In this case calling ReadEntries() will rebuild the footer. + bool HasFooter() const { + return footer_.IsInitialized(); + } + + // Returns this log segment's footer. + // + // If HasFooter() returns false this cannot be called. + const LogSegmentFooterPB& footer() const { + DCHECK(IsInitialized()); + CHECK(HasFooter()); + return footer_; + } + + const std::shared_ptr readable_file() const { + return readable_file_; + } + + const int64_t file_size() const { + return file_size_.Load(); + } + + const int64_t first_entry_offset() const { + return first_entry_offset_; + } + + // Returns the full size of the file, if the segment is closed and has + // a footer, or the offset where the last written, non corrupt entry + // ends. + const int64_t readable_up_to() const; + + private: + friend class RefCountedThreadSafe; + friend class LogReader; + FRIEND_TEST(LogTest, TestWriteAndReadToAndFromInProgressSegment); + + struct EntryHeader { + // The length of the batch data. + uint32_t msg_length; + + // The CRC32C of the batch data. + uint32_t msg_crc; + + // The CRC32C of this EntryHeader. + uint32_t header_crc; + }; + + ~ReadableLogSegment() {} + + // Helper functions called by Init(). + + Status ReadFileSize(); + + Status ReadHeader(); + + Status ReadHeaderMagicAndHeaderLength(uint32_t *len); + + Status ParseHeaderMagicAndHeaderLength(const Slice &data, uint32_t *parsed_len); + + Status ReadFooter(); + + Status ReadFooterMagicAndFooterLength(uint32_t *len); + + Status ParseFooterMagicAndFooterLength(const Slice &data, uint32_t *parsed_len); + + // Starting at 'offset', read the rest of the log file, looking for any + // valid log entry headers. If any are found, sets *has_valid_entries to true. 
+ // + // Returns a bad Status only in the case that some IO error occurred reading the + // file. + Status ScanForValidEntryHeaders(int64_t offset, bool* has_valid_entries); + + // Format a nice error message to report on a corruption in a log file. + Status MakeCorruptionStatus(int batch_number, int64_t batch_offset, + std::vector* recent_offsets, + const std::vector& entries, + const Status& status) const; + + Status ReadEntryHeaderAndBatch(int64_t* offset, faststring* tmp_buf, + gscoped_ptr* batch); + + // Reads a log entry header from the segment. + // Also increments the passed offset* by the length of the entry. + Status ReadEntryHeader(int64_t *offset, EntryHeader* header); + + // Decode a log entry header from the given slice, which must be kEntryHeaderSize + // bytes long. Returns true if successful, false if corrupt. + // + // NOTE: this is performance-critical since it is used by ScanForValidEntryHeaders + // and thus returns bool instead of Status. + bool DecodeEntryHeader(const Slice& data, EntryHeader* header); + + + // Reads a log entry batch from the provided readable segment, which gets decoded + // into 'entry_batch' and increments 'offset' by the batch's length. + Status ReadEntryBatch(int64_t *offset, + const EntryHeader& header, + faststring* tmp_buf, + gscoped_ptr* entry_batch); + + void UpdateReadableToOffset(int64_t readable_to_offset); + + const std::string path_; + + // The size of the readable file. + // This is set by Init(). In the case of a log being written to, + // this may be increased by UpdateReadableToOffset() + AtomicInt file_size_; + + // The offset up to which we can read the file. + // For already written segments this is fixed and equal to the file size + // but for the segments currently written to this is the offset up to which + // we can read without the fear of reading garbage/zeros. 
// This is atomic because the Log thread might be updating the segment's readable + // offset while an async reader is concurrently reading the segment's + // entries.
+ bool IsHeaderWritten() const { + return is_header_written_; + } + + const LogSegmentHeaderPB& header() const { + DCHECK(IsHeaderWritten()); + return header_; + } + + bool IsFooterWritten() const { + return is_footer_written_; + } + + const LogSegmentFooterPB& footer() const { + DCHECK(IsFooterWritten()); + return footer_; + } + + // Returns the parent directory where log segments are stored. + const std::string &path() const { + return path_; + } + + const int64_t first_entry_offset() const { + return first_entry_offset_; + } + + const int64_t written_offset() const { + return written_offset_; + } + + private: + + const std::shared_ptr& writable_file() const { + return writable_file_; + } + + // The path to the log file. + const std::string path_; + + // The writable file to which this LogSegment will be written. + const std::shared_ptr writable_file_; + + bool is_header_written_; + + bool is_footer_written_; + + LogSegmentHeaderPB header_; + + LogSegmentFooterPB footer_; + + // the offset of the first entry in the log + int64_t first_entry_offset_; + + // The offset where the last written entry ends. + int64_t written_offset_; + + DISALLOW_COPY_AND_ASSIGN(WritableLogSegment); +}; + +// Sets 'batch' to a newly created batch that contains the pre-allocated +// ReplicateMsgs in 'msgs'. +// We use C-style passing here to avoid having to allocate a vector +// in some hot paths. +void CreateBatchFromAllocatedOperations(const std::vector& msgs, + gscoped_ptr* batch); + +// Checks if 'fname' is a correctly formatted name of log segment file. 
+bool IsLogFileName(const std::string& fname); + +} // namespace log +} // namespace kudu + +#endif /* KUDU_CONSENSUS_LOG_UTIL_H_ */ diff --git a/src/kudu/consensus/metadata.proto b/src/kudu/consensus/metadata.proto new file mode 100644 index 000000000000..f9cf84c961c9 --- /dev/null +++ b/src/kudu/consensus/metadata.proto @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.consensus; + +option java_package = "org.kududb.consensus"; + +import "kudu/common/common.proto"; + +// =========================================================================== +// Consensus Metadata +// =========================================================================== + +// A peer in a configuration. +message RaftPeerPB { + // The possible roles for peers. + enum Role { + UNKNOWN_ROLE = 999; + + // Indicates this node is a follower in the configuration, i.e. that it participates + // in majorities and accepts Consensus::Update() calls. + FOLLOWER = 0; + + // Indicates this node is the current leader of the configuration, i.e. that it + // participates in majorities and accepts Consensus::Append() calls. 
+    LEADER = 1;
+
+    // Indicates that this node participates in the configuration in a passive role,
+    // i.e. that it accepts Consensus::Update() calls but does not participate
+    // in elections or majorities.
+    LEARNER = 2;
+
+    // Indicates that this node is not a participant of the configuration, i.e. does
+    // not accept Consensus::Update() or Consensus::Append() and cannot
+    // participate in elections or majorities. This is usually the role of a node
+    // that leaves the configuration.
+    NON_PARTICIPANT = 3;
+  };
+
+  enum MemberType {
+    UNKNOWN_MEMBER_TYPE = 999;
+    NON_VOTER = 0;
+    VOTER = 1;
+  };
+  // Permanent uuid is optional: RaftPeerPB/RaftConfigPB instances may
+  // be created before the permanent uuid is known (e.g., when
+  // manually specifying a configuration for Master/CatalogManager);
+  // permanent uuid can be retrieved at a later time through RPC.
+  optional bytes permanent_uuid = 1;
+  optional MemberType member_type = 2;
+  optional HostPortPB last_known_addr = 3;
+}
+
+enum ConsensusConfigType {
+  CONSENSUS_CONFIG_UNKNOWN = 999;
+
+  // Committed consensus config. This includes the consensus configuration that
+  // has been serialized through consensus and committed, thus having a valid
+  // opid_index field set.
+  CONSENSUS_CONFIG_COMMITTED = 1;
+
+  // Active consensus config. This could be a pending consensus config that
+  // has not yet been committed. If the config is not committed, its opid_index
+  // field will not be set.
+  CONSENSUS_CONFIG_ACTIVE = 2;
+}
+
+// A set of peers, serving a single tablet.
+message RaftConfigPB {
+  // The index of the operation which serialized this RaftConfigPB through
+  // consensus. It is set when the operation is consensus-committed (replicated
+  // to a majority of voters) and before the consensus metadata is updated.
+  // It is left undefined if the operation isn't committed.
+  optional int64 opid_index = 1;
+
+  // Whether this is a local or distributed configuration (i.e.
whether to use a local or dist
+  // implementation of consensus).
+  optional bool local = 2 [default = true];
+
+  // The set of peers in the configuration.
+  repeated RaftPeerPB peers = 3;
+}
+
+// Represents a snapshot of a configuration at a given moment in time.
+message ConsensusStatePB {
+  // A configuration is always guaranteed to have a known term.
+  required int64 current_term = 1;
+
+  // There may not always be a leader of a configuration at any given time.
+  //
+  // The node that the local peer considers to be leader changes based on rules
+  // defined in the Raft specification. Roughly, this corresponds either to
+  // being elected leader (in the case that the local peer is the leader), or
+  // when an update is accepted from another node, which basically just amounts
+  // to a term check on the UpdateConsensus() RPC request.
+  //
+  // Whenever the local peer sees a new term, the leader flag is cleared until
+  // a new leader is acknowledged based on the above criteria. Simply casting a
+  // vote for a peer is not sufficient to assume that that peer has won the
+  // election, so we do not update this field based on our vote.
+  //
+  // The leader listed here, if any, should always be a member of 'configuration', and
+  // the term that the node is leader of _must_ equal the term listed above in
+  // the 'current_term' field. The Master will use the combination of current
+  // term and leader uuid to determine when to update its cache of the current
+  // leader for client lookup purposes.
+  //
+  // There is a corner case in Raft where a node may be elected leader of a
+  // pending (uncommitted) configuration. In such a case, if the leader of the pending
+  // configuration is not a member of the committed configuration, and it is the committed
+  // configuration that is being reported, then the leader_uuid field should be
+  // cleared by the process filling in the ConsensusStatePB object.
+  optional string leader_uuid = 2;
+
+  // The peers.
In some contexts, this will be the committed configuration,
+  // which will always have configuration.opid_index set. In other contexts, this may
+  // be a "pending" configuration, which is active but in the process of being committed.
+  // In any case, initial peership is set on tablet start, so this
+  // field should always be present.
+  required RaftConfigPB config = 3;
+}
+
+// This PB is used to serialize all of the persistent state needed for
+// Consensus that is not in the WAL, such as leader election and
+// communication on startup.
+message ConsensusMetadataPB {
+  // Last-committed peership.
+  required RaftConfigPB committed_config = 1;
+
+  // Latest term this server has seen.
+  // When a configuration is first created, initialized to 0.
+  //
+  // Whenever a new election is started, the candidate increments this by one
+  // and requests votes from peers.
+  //
+  // If any RPC or RPC response is received from another node containing a term higher
+  // than this one, the server should step down to FOLLOWER and set its current_term to
+  // match the caller's term.
+  //
+  // If a follower receives an UpdateConsensus RPC with a term lower than this
+  // term, then that implies that the RPC is coming from a former LEADER who has
+  // not realized yet that its term is over. In that case, we will reject the
+  // UpdateConsensus() call with ConsensusErrorPB::INVALID_TERM.
+  //
+  // If a follower receives a RequestConsensusVote() RPC with an earlier term,
+  // the vote is denied.
+  required int64 current_term = 2;
+
+  // Permanent UUID of the candidate voted for in 'current_term', or not present
+  // if no vote was made in the current term.
+ optional string voted_for = 3; +} diff --git a/src/kudu/consensus/mt-log-test.cc b/src/kudu/consensus/mt-log-test.cc new file mode 100644 index 000000000000..1b08c4de1c3a --- /dev/null +++ b/src/kudu/consensus/mt-log-test.cc @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/log-test-base.h" + +#include +#include +#include + +#include +#include + +#include "kudu/consensus/log_index.h" +#include "kudu/gutil/algorithm.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/locks.h" +#include "kudu/util/random.h" +#include "kudu/util/thread.h" + +DEFINE_int32(num_writer_threads, 4, "Number of threads writing to the log"); +DEFINE_int32(num_batches_per_thread, 2000, "Number of batches per thread"); +DEFINE_int32(num_ops_per_batch_avg, 5, "Target average number of ops per batch"); + +namespace kudu { +namespace log { + +using std::vector; +using consensus::ReplicateRefPtr; +using consensus::make_scoped_refptr_replicate; + +namespace { + +class CustomLatchCallback : public RefCountedThreadSafe { + public: + CustomLatchCallback(CountDownLatch* latch, vector* errors) + : latch_(latch), + errors_(errors) { + } + + void StatusCB(const Status& s) { + if (!s.ok()) { + errors_->push_back(s); + } + latch_->CountDown(); + } + + StatusCallback AsStatusCallback() { + return Bind(&CustomLatchCallback::StatusCB, this); + } + + private: + CountDownLatch* latch_; + vector* errors_; +}; + +} // anonymous namespace + +extern const char *kTestTablet; + +class MultiThreadedLogTest : public LogTestBase { + public: + MultiThreadedLogTest() + : random_(SeedRandom()) { + } + + virtual void SetUp() OVERRIDE { + LogTestBase::SetUp(); + } + + void LogWriterThread(int thread_id) { + CountDownLatch latch(FLAGS_num_batches_per_thread); + vector errors; + for (int i = 0; i < FLAGS_num_batches_per_thread; i++) { + LogEntryBatch* entry_batch; + vector batch_replicates; + int num_ops = static_cast(random_.Normal( + static_cast(FLAGS_num_ops_per_batch_avg), 1.0)); + DVLOG(1) << num_ops << " ops in this batch"; + num_ops = std::max(num_ops, 1); + { + boost::lock_guard lock_guard(lock_); + for (int j = 0; j < num_ops; j++) { + ReplicateRefPtr replicate = make_scoped_refptr_replicate(new ReplicateMsg); 
+ int32_t index = current_index_++; + OpId* op_id = replicate->get()->mutable_id(); + op_id->set_term(0); + op_id->set_index(index); + + replicate->get()->set_op_type(WRITE_OP); + replicate->get()->set_timestamp(clock_->Now().ToUint64()); + + tserver::WriteRequestPB* request = replicate->get()->mutable_write_request(); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, index, 0, + "this is a test insert", + request->mutable_row_operations()); + request->set_tablet_id(kTestTablet); + batch_replicates.push_back(replicate); + } + + gscoped_ptr entry_batch_pb; + CreateBatchFromAllocatedOperations(batch_replicates, + &entry_batch_pb); + + ASSERT_OK(log_->Reserve(REPLICATE, entry_batch_pb.Pass(), &entry_batch)); + } // lock_guard scope + auto cb = new CustomLatchCallback(&latch, &errors); + entry_batch->SetReplicates(batch_replicates); + ASSERT_OK(log_->AsyncAppend(entry_batch, cb->AsStatusCallback())); + } + LOG_TIMING(INFO, strings::Substitute("thread $0 waiting to append and sync $1 batches", + thread_id, FLAGS_num_batches_per_thread)) { + latch.Wait(); + } + for (const Status& status : errors) { + WARN_NOT_OK(status, "Unexpected failure during AsyncAppend"); + } + ASSERT_EQ(0, errors.size()); + } + + void Run() { + for (int i = 0; i < FLAGS_num_writer_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", "inserter", + &MultiThreadedLogTest::LogWriterThread, this, i, &new_thread)); + threads_.push_back(new_thread); + } + for (scoped_refptr& thread : threads_) { + ASSERT_OK(ThreadJoiner(thread.get()).Join()); + } + } + private: + ThreadSafeRandom random_; + simple_spinlock lock_; + vector > threads_; +}; + +TEST_F(MultiThreadedLogTest, TestAppends) { + BuildLog(); + int start_current_id = current_index_; + LOG_TIMING(INFO, strings::Substitute("inserting $0 batches($1 threads, $2 per-thread)", + FLAGS_num_writer_threads * FLAGS_num_batches_per_thread, + FLAGS_num_batches_per_thread, FLAGS_num_writer_threads)) { + 
ASSERT_NO_FATAL_FAILURE(Run()); + } + ASSERT_OK(log_->Close()); + + gscoped_ptr reader; + ASSERT_OK(LogReader::Open(fs_manager_.get(), NULL, kTestTablet, NULL, &reader)); + SegmentSequence segments; + ASSERT_OK(reader->GetSegmentsSnapshot(&segments)); + + for (const SegmentSequence::value_type& entry : segments) { + ASSERT_OK(entry->ReadEntries(&entries_)); + } + vector ids; + EntriesToIdList(&ids); + DVLOG(1) << "Wrote total of " << current_index_ - start_current_id << " ops"; + ASSERT_EQ(current_index_ - start_current_id, ids.size()); + ASSERT_TRUE(std::is_sorted(ids.begin(), ids.end())); +} + +} // namespace log +} // namespace kudu diff --git a/src/kudu/consensus/opid.proto b/src/kudu/consensus/opid.proto new file mode 100644 index 000000000000..443353302ca4 --- /dev/null +++ b/src/kudu/consensus/opid.proto @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.consensus; + +option java_package = "org.kududb.consensus"; + +// An id for a generic state machine operation. Composed of the leaders' term +// plus the index of the operation in that term, e.g., the th operation +// of the th leader. +message OpId { + // The term of an operation or the leader's sequence id. 
+ required int64 term = 1; + required int64 index = 2; +} diff --git a/src/kudu/consensus/opid_util.cc b/src/kudu/consensus/opid_util.cc new file mode 100644 index 000000000000..dac5169d10a9 --- /dev/null +++ b/src/kudu/consensus/opid_util.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/consensus/opid_util.h" + +#include +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/strings/substitute.h" + +namespace kudu { +namespace consensus { + +const int64_t kMinimumTerm = 0; +const int64_t kMinimumOpIdIndex = 0; +const int64_t kInvalidOpIdIndex = -1; + +int OpIdCompare(const OpId& first, const OpId& second) { + DCHECK(first.IsInitialized()); + DCHECK(second.IsInitialized()); + if (PREDICT_TRUE(first.term() == second.term())) { + return first.index() < second.index() ? -1 : first.index() == second.index() ? 0 : 1; + } + return first.term() < second.term() ? 
-1 : 1; +} + +bool OpIdEquals(const OpId& left, const OpId& right) { + DCHECK(left.IsInitialized()); + DCHECK(right.IsInitialized()); + return left.term() == right.term() && left.index() == right.index(); +} + +bool OpIdLessThan(const OpId& left, const OpId& right) { + DCHECK(left.IsInitialized()); + DCHECK(right.IsInitialized()); + if (left.term() < right.term()) return true; + if (left.term() > right.term()) return false; + return left.index() < right.index(); +} + +bool OpIdBiggerThan(const OpId& left, const OpId& right) { + DCHECK(left.IsInitialized()); + DCHECK(right.IsInitialized()); + if (left.term() > right.term()) return true; + if (left.term() < right.term()) return false; + return left.index() > right.index(); +} + +bool CopyIfOpIdLessThan(const consensus::OpId& to_compare, consensus::OpId* target) { + if (to_compare.IsInitialized() && + (!target->IsInitialized() || OpIdLessThan(to_compare, *target))) { + target->CopyFrom(to_compare); + return true; + } + return false; +} + +size_t OpIdHashFunctor::operator() (const OpId& id) const { + return (id.term() + 31) ^ id.index(); +} + +bool OpIdEqualsFunctor::operator() (const OpId& left, const OpId& right) const { + return OpIdEquals(left, right); +} + +bool OpIdLessThanPtrFunctor::operator() (const OpId* left, const OpId* right) const { + return OpIdLessThan(*left, *right); +} + +bool OpIdIndexLessThanPtrFunctor::operator() (const OpId* left, const OpId* right) const { + return left->index() < right->index(); +} + +bool OpIdCompareFunctor::operator() (const OpId& left, const OpId& right) const { + return OpIdLessThan(left, right); +} + +bool OpIdBiggerThanFunctor::operator() (const OpId& left, const OpId& right) const { + return OpIdBiggerThan(left, right); +} + +OpId MinimumOpId() { + OpId op_id; + op_id.set_term(0); + op_id.set_index(0); + return op_id; +} + +OpId MaximumOpId() { + OpId op_id; + op_id.set_term(std::numeric_limits::max()); + op_id.set_index(std::numeric_limits::max()); + return op_id; +} + 
+// helper hash functor for delta store ids +struct DeltaIdHashFunction { + size_t operator()(const std::pair& id) const { + return (id.first + 31) ^ id.second; + } +}; + +// helper equals functor for delta store ids +struct DeltaIdEqualsTo { + bool operator()(const std::pair& left, + const std::pair& right) const { + return left.first == right.first && left.second == right.second; + } +}; + +std::ostream& operator<<(std::ostream& os, const consensus::OpId& op_id) { + os << OpIdToString(op_id); + return os; +} + +std::string OpIdToString(const OpId& op_id) { + if (!op_id.IsInitialized()) { + return ""; + } + return strings::Substitute("$0.$1", op_id.term(), op_id.index()); +} + +std::string OpsRangeString(const ConsensusRequestPB& req) { + std::string ret; + ret.reserve(100); + ret.push_back('['); + if (req.ops_size() > 0) { + const OpId& first_op = req.ops(0).id(); + const OpId& last_op = req.ops(req.ops_size() - 1).id(); + strings::SubstituteAndAppend(&ret, "$0.$1-$2.$3", + first_op.term(), first_op.index(), + last_op.term(), last_op.index()); + } + ret.push_back(']'); + return ret; +} + +OpId MakeOpId(int term, int index) { + OpId ret; + ret.set_index(index); + ret.set_term(term); + return ret; +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/opid_util.h b/src/kudu/consensus/opid_util.h new file mode 100644 index 000000000000..76591fcb62ba --- /dev/null +++ b/src/kudu/consensus/opid_util.h @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CONSENSUS_OPID_UTIL_H_ +#define KUDU_CONSENSUS_OPID_UTIL_H_ + +#include + +#include +#include +#include + +namespace kudu { +namespace consensus { + +class ConsensusRequestPB; +class OpId; + +// Minimum possible term. +extern const int64_t kMinimumTerm; + +// Minimum possible log index. +extern const int64_t kMinimumOpIdIndex; + +// Log index that is lower than the minimum index (and so will never occur). +extern const int64_t kInvalidOpIdIndex; + +// Returns true iff left == right. +bool OpIdEquals(const OpId& left, const OpId& right); + +// Returns true iff left < right. +bool OpIdLessThan(const OpId& left, const OpId& right); + +// Returns true iff left > right. +bool OpIdBiggerThan(const OpId& left, const OpId& right); + +// Copies to_compare into target under the following conditions: +// - If to_compare is initialized and target is not. +// - If they are both initialized and to_compare is less than target. +// Otherwise, does nothing. +// If to_compare is copied into target, returns true, else false. +bool CopyIfOpIdLessThan(const OpId& to_compare, OpId* target); + +// Return -1 if left < right, +// 0 if equal, +// 1 if left > right. +int OpIdCompare(const OpId& left, const OpId& right); + +// OpId hash functor. Suitable for use with std::unordered_map. +struct OpIdHashFunctor { + size_t operator() (const OpId& id) const; +}; + +// OpId equals functor. Suitable for use with std::unordered_map. 
+struct OpIdEqualsFunctor {
+  bool operator() (const OpId& left, const OpId& right) const;
+};
+
+// OpId less than functor for pointers. Suitable for use with std::sort and std::map.
+struct OpIdLessThanPtrFunctor {
+  // Returns true iff left < right.
+  bool operator() (const OpId* left, const OpId* right) const;
+};
+
+// Sorts op id's by index only, disregarding the term.
+struct OpIdIndexLessThanPtrFunctor {
+  // Returns true iff left.index() < right.index().
+  bool operator() (const OpId* left, const OpId* right) const;
+};
+
+// OpId compare() functor. Suitable for use with std::sort and std::map.
+struct OpIdCompareFunctor {
+  // Returns true iff left < right.
+  bool operator() (const OpId& left, const OpId& right) const;
+};
+
+// OpId comparison functor that returns true iff left > right. Suitable for use
+// with std::sort and std::map to sort keys in decreasing order.
+struct OpIdBiggerThanFunctor {
+  bool operator() (const OpId& left, const OpId& right) const;
+};
+
+std::ostream& operator<<(std::ostream& os, const consensus::OpId& op_id);
+
+// Return the minimum possible OpId.
+OpId MinimumOpId();
+
+// Return the maximum possible OpId.
+OpId MaximumOpId();
+
+std::string OpIdToString(const OpId& id);
+
+std::string OpsRangeString(const ConsensusRequestPB& req);
+
+OpId MakeOpId(int term, int index);
+
+} // namespace consensus
+} // namespace kudu
+
+#endif /* KUDU_CONSENSUS_OPID_UTIL_H_ */
diff --git a/src/kudu/consensus/peer_manager.cc b/src/kudu/consensus/peer_manager.cc
new file mode 100644
index 000000000000..8c9174d9032c
--- /dev/null
+++ b/src/kudu/consensus/peer_manager.cc
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/consensus/peer_manager.h" + +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/log.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/threadpool.h" + +namespace kudu { +namespace consensus { + +using log::Log; +using strings::Substitute; + +PeerManager::PeerManager(const std::string tablet_id, + const std::string local_uuid, + PeerProxyFactory* peer_proxy_factory, + PeerMessageQueue* queue, + ThreadPool* request_thread_pool, + const scoped_refptr& log) + : tablet_id_(tablet_id), + local_uuid_(local_uuid), + peer_proxy_factory_(peer_proxy_factory), + queue_(queue), + thread_pool_(request_thread_pool), + log_(log) { +} + +PeerManager::~PeerManager() { + Close(); +} + +Status PeerManager::UpdateRaftConfig(const RaftConfigPB& config) { + unordered_set new_peers; + + VLOG(1) << "Updating peers from new config: " << config.ShortDebugString(); + + boost::lock_guard lock(lock_); + // Create new peers + for (const RaftPeerPB& peer_pb : config.peers()) { + new_peers.insert(peer_pb.permanent_uuid()); + Peer* peer = FindPtrOrNull(peers_, peer_pb.permanent_uuid()); + if (peer != nullptr) { + continue; + } + if (peer_pb.permanent_uuid() == local_uuid_) { + continue; + } + + VLOG(1) << GetLogPrefix() << "Adding remote peer. 
Peer: " << peer_pb.ShortDebugString(); + gscoped_ptr peer_proxy; + RETURN_NOT_OK_PREPEND(peer_proxy_factory_->NewProxy(peer_pb, &peer_proxy), + "Could not obtain a remote proxy to the peer."); + + gscoped_ptr remote_peer; + RETURN_NOT_OK(Peer::NewRemotePeer(peer_pb, + tablet_id_, + local_uuid_, + queue_, + thread_pool_, + peer_proxy.Pass(), + &remote_peer)); + InsertOrDie(&peers_, peer_pb.permanent_uuid(), remote_peer.release()); + } + + return Status::OK(); +} + +void PeerManager::SignalRequest(bool force_if_queue_empty) { + boost::lock_guard lock(lock_); + auto iter = peers_.begin(); + for (; iter != peers_.end(); iter++) { + Status s = (*iter).second->SignalRequest(force_if_queue_empty); + if (PREDICT_FALSE(!s.ok())) { + LOG(WARNING) << GetLogPrefix() + << "Peer was closed, removing from peers. Peer: " + << (*iter).second->peer_pb().ShortDebugString(); + peers_.erase(iter); + } + } +} + +void PeerManager::Close() { + { + boost::lock_guard lock(lock_); + for (const PeersMap::value_type& entry : peers_) { + entry.second->Close(); + } + STLDeleteValues(&peers_); + } +} + +std::string PeerManager::GetLogPrefix() const { + return Substitute("T $0 P $1: ", tablet_id_, local_uuid_); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/peer_manager.h b/src/kudu/consensus/peer_manager.h new file mode 100644 index 000000000000..c1dc86f75588 --- /dev/null +++ b/src/kudu/consensus/peer_manager.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_PEER_MANAGER_H +#define KUDU_CONSENSUS_PEER_MANAGER_H + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +#include +#include + +namespace kudu { + +class ThreadPool; + +namespace log { +class Log; +} // namespace log + +namespace consensus { + +class Peer; +class PeerMessageQueue; +class PeerProxyFactory; +class RaftConfigPB; + +// Manages the set of local and remote peers that pull data from the +// queue into the local log/remote machines. +// Methods are virtual to ease mocking. +class PeerManager { + public: + // All of the raw pointer arguments are not owned by the PeerManager + // and must live at least as long as the PeerManager. + // + // 'request_thread_pool' is the pool used to construct requests to send + // to the peers. + PeerManager(const std::string tablet_id, + const std::string local_uuid, + PeerProxyFactory* peer_proxy_factory, + PeerMessageQueue* queue, + ThreadPool* request_thread_pool, + const scoped_refptr& log); + + virtual ~PeerManager(); + + // Updates 'peers_' according to the new configuration config. + virtual Status UpdateRaftConfig(const RaftConfigPB& config); + + // Signals all peers of the current configuration that there is a new request pending. + virtual void SignalRequest(bool force_if_queue_empty = false); + + // Closes all peers. 
+ virtual void Close(); + + private: + std::string GetLogPrefix() const; + + typedef std::unordered_map PeersMap; + const std::string tablet_id_; + const std::string local_uuid_; + PeerProxyFactory* peer_proxy_factory_; + PeerMessageQueue* queue_; + ThreadPool* thread_pool_; + scoped_refptr log_; + PeersMap peers_; + mutable simple_spinlock lock_; + + DISALLOW_COPY_AND_ASSIGN(PeerManager); +}; + + + +} // namespace consensus +} // namespace kudu +#endif /* KUDU_CONSENSUS_PEER_MANAGER_H */ diff --git a/src/kudu/consensus/quorum_util-test.cc b/src/kudu/consensus/quorum_util-test.cc new file mode 100644 index 000000000000..8d99cf0ab384 --- /dev/null +++ b/src/kudu/consensus/quorum_util-test.cc @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include <gtest/gtest.h>
+#include "kudu/consensus/quorum_util.h"
+
+#include "kudu/consensus/opid_util.h"
+#include "kudu/util/test_util.h"
+
+namespace kudu {
+namespace consensus {
+
+using std::string;
+
+static void SetPeerInfo(const string& uuid,
+                        RaftPeerPB::MemberType type,
+                        RaftPeerPB* peer) {
+  peer->set_permanent_uuid(uuid);
+  peer->set_member_type(type);
+}
+
+TEST(QuorumUtilTest, TestMemberExtraction) {
+  RaftConfigPB config;
+  SetPeerInfo("A", RaftPeerPB::VOTER, config.add_peers());
+  SetPeerInfo("B", RaftPeerPB::VOTER, config.add_peers());
+  SetPeerInfo("C", RaftPeerPB::VOTER, config.add_peers());
+
+  // Basic test for GetRaftConfigMember().
+  RaftPeerPB peer_pb;
+  Status s = GetRaftConfigMember(config, "invalid", &peer_pb);
+  ASSERT_TRUE(s.IsNotFound()) << s.ToString();
+  ASSERT_OK(GetRaftConfigMember(config, "A", &peer_pb));
+  ASSERT_EQ("A", peer_pb.permanent_uuid());
+
+  // Basic test for GetRaftConfigLeader().
+  ConsensusStatePB cstate;
+  *cstate.mutable_config() = config;
+  s = GetRaftConfigLeader(cstate, &peer_pb);
+  ASSERT_TRUE(s.IsNotFound()) << s.ToString();
+  cstate.set_leader_uuid("B");
+  ASSERT_OK(GetRaftConfigLeader(cstate, &peer_pb));
+  ASSERT_EQ("B", peer_pb.permanent_uuid());
+}
+
+}  // namespace consensus
+}  // namespace kudu
diff --git a/src/kudu/consensus/quorum_util.cc b/src/kudu/consensus/quorum_util.cc
new file mode 100644
index 000000000000..ef9393462f09
--- /dev/null
+++ b/src/kudu/consensus/quorum_util.cc
@@ -0,0 +1,223 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "kudu/consensus/quorum_util.h"
+
+#include <set>
+#include <string>
+
+#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/strings/substitute.h"
+#include "kudu/util/status.h"
+
+namespace kudu {
+namespace consensus {
+
+using google::protobuf::RepeatedPtrField;
+using std::string;
+using strings::Substitute;
+
+bool IsRaftConfigMember(const std::string& uuid, const RaftConfigPB& config) {
+  for (const RaftPeerPB& peer : config.peers()) {
+    if (peer.permanent_uuid() == uuid) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsRaftConfigVoter(const std::string& uuid, const RaftConfigPB& config) {
+  for (const RaftPeerPB& peer : config.peers()) {
+    if (peer.permanent_uuid() == uuid) {
+      return peer.member_type() == RaftPeerPB::VOTER;
+    }
+  }
+  return false;
+}
+
+Status GetRaftConfigMember(const RaftConfigPB& config,
+                           const std::string& uuid,
+                           RaftPeerPB* peer_pb) {
+  for (const RaftPeerPB& peer : config.peers()) {
+    if (peer.permanent_uuid() == uuid) {
+      *peer_pb = peer;
+      return Status::OK();
+    }
+  }
+  return Status::NotFound(Substitute("Peer with uuid $0 not found in consensus config", uuid));
+}
+
+Status GetRaftConfigLeader(const ConsensusStatePB& cstate, RaftPeerPB* peer_pb) {
+  if (!cstate.has_leader_uuid() || cstate.leader_uuid().empty()) {
+    return Status::NotFound("Consensus config has no leader");
+  }
+  return GetRaftConfigMember(cstate.config(), cstate.leader_uuid(), peer_pb);
+}
+
+bool RemoveFromRaftConfig(RaftConfigPB* config, const string& uuid) {
+  RepeatedPtrField<RaftPeerPB> modified_peers;
+  bool removed = false;
+  for (const RaftPeerPB& peer : config->peers()) {
+    if (peer.permanent_uuid() == uuid) {
+      removed = true;
+      continue;
+    }
+    *modified_peers.Add() = peer;
+  }
+  if (!removed) return false;
+  config->mutable_peers()->Swap(&modified_peers);
+  return true;
+}
+
+int CountVoters(const RaftConfigPB& config) {
+  int voters = 0;
+  for (const RaftPeerPB& peer : config.peers()) {
+    if (peer.member_type() == RaftPeerPB::VOTER) {
+      voters++;
+    }
+  }
+  return voters;
+}
+
+int MajoritySize(int num_voters) {
+  DCHECK_GE(num_voters, 1);
+  return (num_voters / 2) + 1;
+}
+
+RaftPeerPB::Role GetConsensusRole(const std::string& permanent_uuid,
+                                  const ConsensusStatePB& cstate) {
+  if (cstate.leader_uuid() == permanent_uuid) {
+    if (IsRaftConfigVoter(permanent_uuid, cstate.config())) {
+      return RaftPeerPB::LEADER;
+    }
+    return RaftPeerPB::NON_PARTICIPANT;
+  }
+
+  for (const RaftPeerPB& peer : cstate.config().peers()) {
+    if (peer.permanent_uuid() == permanent_uuid) {
+      switch (peer.member_type()) {
+        case RaftPeerPB::VOTER:
+          return RaftPeerPB::FOLLOWER;
+        default:
+          return RaftPeerPB::LEARNER;
+      }
+    }
+  }
+  return RaftPeerPB::NON_PARTICIPANT;
+}
+
+Status VerifyRaftConfig(const RaftConfigPB& config, RaftConfigState type) {
+  std::set<std::string> uuids;
+  if (config.peers_size() == 0) {
+    return Status::IllegalState(
+        Substitute("RaftConfig must have at least one peer. RaftConfig: $0",
+                   config.ShortDebugString()));
+  }
+
+  if (!config.has_local()) {
+    return Status::IllegalState(
+        Substitute("RaftConfig must specify whether it is local. RaftConfig: $0",
+                   config.ShortDebugString()));
+  }
+
+  if (type == COMMITTED_QUORUM) {
+    // Committed configurations must have 'opid_index' populated.
+    if (!config.has_opid_index()) {
+      return Status::IllegalState(
+          Substitute("Committed configs must have opid_index set. RaftConfig: $0",
+                     config.ShortDebugString()));
+    }
+  } else if (type == UNCOMMITTED_QUORUM) {
+    // Uncommitted configurations must *not* have 'opid_index' populated.
+    if (config.has_opid_index()) {
+      return Status::IllegalState(
+          Substitute("Uncommitted configs must not have opid_index set. RaftConfig: $0",
+                     config.ShortDebugString()));
+    }
+  }
+
+  // Local configurations must have only one peer and it may or may not
+  // have an address.
+  if (config.local()) {
+    if (config.peers_size() != 1) {
+      return Status::IllegalState(
+          Substitute("Local configs must have 1 and only one peer. RaftConfig: $0",
+                     config.ShortDebugString()));
+    }
+    if (!config.peers(0).has_permanent_uuid() ||
+        config.peers(0).permanent_uuid() == "") {
+      return Status::IllegalState(
+          Substitute("Local peer must have an UUID. RaftConfig: $0",
+                     config.ShortDebugString()));
+    }
+    return Status::OK();
+  }
+
+  for (const RaftPeerPB& peer : config.peers()) {
+    if (!peer.has_permanent_uuid() || peer.permanent_uuid() == "") {
+      return Status::IllegalState(Substitute("One peer didn't have an uuid or had the empty"
+          " string. RaftConfig: $0", config.ShortDebugString()));
+    }
+    if (ContainsKey(uuids, peer.permanent_uuid())) {
+      return Status::IllegalState(
+          Substitute("Found multiple peers with uuid: $0. RaftConfig: $1",
+                     peer.permanent_uuid(), config.ShortDebugString()));
+    }
+    uuids.insert(peer.permanent_uuid());
+
+    if (!peer.has_last_known_addr()) {
+      return Status::IllegalState(
+          Substitute("Peer: $0 has no address. RaftConfig: $1",
+                     peer.permanent_uuid(), config.ShortDebugString()));
+    }
+    if (!peer.has_member_type()) {
+      return Status::IllegalState(
+          Substitute("Peer: $0 has no member type set. RaftConfig: $1", peer.permanent_uuid(),
+                     config.ShortDebugString()));
+    }
+    if (peer.member_type() == RaftPeerPB::NON_VOTER) {
+      return Status::IllegalState(
+          Substitute(
+              "Peer: $0 is a NON_VOTER, but this isn't supported yet. RaftConfig: $1",
+              peer.permanent_uuid(), config.ShortDebugString()));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status VerifyConsensusState(const ConsensusStatePB& cstate, RaftConfigState type) {
+  if (!cstate.has_current_term()) {
+    return Status::IllegalState("ConsensusStatePB missing current_term", cstate.ShortDebugString());
+  }
+  if (!cstate.has_config()) {
+    return Status::IllegalState("ConsensusStatePB missing config", cstate.ShortDebugString());
+  }
+  RETURN_NOT_OK(VerifyRaftConfig(cstate.config(), type));
+
+  if (cstate.has_leader_uuid() && !cstate.leader_uuid().empty()) {
+    if (!IsRaftConfigVoter(cstate.leader_uuid(), cstate.config())) {
+      return Status::IllegalState(
+          Substitute("Leader with UUID $0 is not a VOTER in the config! Consensus state: $1",
+                     cstate.leader_uuid(), cstate.ShortDebugString()));
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace consensus
+}  // namespace kudu
diff --git a/src/kudu/consensus/quorum_util.h b/src/kudu/consensus/quorum_util.h
new file mode 100644
index 000000000000..3d1ccb065943
--- /dev/null
+++ b/src/kudu/consensus/quorum_util.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef KUDU_CONSENSUS_QUORUM_UTIL_H_
+#define KUDU_CONSENSUS_QUORUM_UTIL_H_
+
+#include <string>
+
+#include "kudu/consensus/metadata.pb.h"
+
+namespace kudu {
+class Status;
+
+namespace consensus {
+
+enum RaftConfigState {
+  UNCOMMITTED_QUORUM,
+  COMMITTED_QUORUM,
+};
+
+bool IsRaftConfigMember(const std::string& uuid, const RaftConfigPB& config);
+bool IsRaftConfigVoter(const std::string& uuid, const RaftConfigPB& config);
+
+// Get the specified member of the config.
+// Returns Status::NotFound if a member with the specified uuid could not be
+// found in the config.
+Status GetRaftConfigMember(const RaftConfigPB& config,
+                           const std::string& uuid,
+                           RaftPeerPB* peer_pb);
+
+// Get the leader of the consensus configuration.
+// Returns Status::NotFound() if the leader RaftPeerPB could not be found in
+// the config, or if there is no leader defined.
+Status GetRaftConfigLeader(const ConsensusStatePB& cstate, RaftPeerPB* peer_pb);
+
+// Modifies 'configuration' to remove the peer with the specified 'uuid'.
+// Returns false if the server with 'uuid' is not found in the configuration.
+// Returns true on success.
+bool RemoveFromRaftConfig(RaftConfigPB* config, const std::string& uuid);
+
+// Counts the number of voters in the configuration.
+int CountVoters(const RaftConfigPB& config);
+
+// Calculates size of a configuration majority based on # of voters.
+int MajoritySize(int num_voters);
+
+// Determines the role that the peer with uuid 'uuid' plays in the cluster.
+// If the peer uuid is not a voter in the configuration, this function will return
+// NON_PARTICIPANT, regardless of whether it is listed as leader in cstate.
+RaftPeerPB::Role GetConsensusRole(const std::string& uuid,
+                                  const ConsensusStatePB& cstate);
+
+// Verifies that the provided configuration is well formed.
+// If type == COMMITTED_QUORUM, we enforce that opid_index is set.
+// If type == UNCOMMITTED_QUORUM, we enforce that opid_index is NOT set.
+Status VerifyRaftConfig(const RaftConfigPB& config, RaftConfigState type);
+
+// Superset of checks performed by VerifyRaftConfig. Also ensures that the
+// leader is a configuration voter, if it is set, and that a valid term is set.
+Status VerifyConsensusState(const ConsensusStatePB& cstate, RaftConfigState type);
+
+}  // namespace consensus
+}  // namespace kudu
+
+#endif /* KUDU_CONSENSUS_QUORUM_UTIL_H_ */
diff --git a/src/kudu/consensus/raft_consensus-test.cc b/src/kudu/consensus/raft_consensus-test.cc
new file mode 100644
index 000000000000..76e19ad86463
--- /dev/null
+++ b/src/kudu/consensus/raft_consensus-test.cc
@@ -0,0 +1,752 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "kudu/common/schema.h"
+#include "kudu/common/wire_protocol-test-util.h"
+#include "kudu/consensus/consensus_peers.h"
+#include "kudu/consensus/consensus-test-util.h"
+#include "kudu/consensus/log.h"
+#include "kudu/consensus/peer_manager.h"
+#include "kudu/fs/fs_manager.h"
+#include "kudu/gutil/stl_util.h"
+#include "kudu/server/logical_clock.h"
+#include "kudu/util/async_util.h"
+#include "kudu/util/mem_tracker.h"
+#include "kudu/util/metrics.h"
+#include "kudu/util/test_macros.h"
+#include "kudu/util/test_util.h"
+
+DECLARE_bool(enable_leader_failure_detection);
+
+METRIC_DECLARE_entity(tablet);
+
+using std::shared_ptr;
+using std::string;
+
+namespace kudu {
+namespace consensus {
+
+using log::Log;
+using log::LogOptions;
+using ::testing::_;
+using ::testing::AnyNumber;
+using ::testing::AtLeast;
+using ::testing::Eq;
+using ::testing::InSequence;
+using ::testing::Invoke;
+using ::testing::Mock;
+using ::testing::Property;
+using ::testing::Return;
+
+const char* kTestTablet = "TestTablet";
+const char* kLocalPeerUuid = "peer-0";
+
+// A simple map to collect the results of a sequence of transactions.
+typedef std::map StatusesMap; + +class MockQueue : public PeerMessageQueue { + public: + explicit MockQueue(const scoped_refptr& metric_entity, log::Log* log) + : PeerMessageQueue(metric_entity, log, FakeRaftPeerPB(kLocalPeerUuid), kTestTablet) {} + MOCK_METHOD1(Init, void(const OpId& locally_replicated_index)); + MOCK_METHOD3(SetLeaderMode, void(const OpId& committed_opid, + int64_t current_term, + const RaftConfigPB& active_config)); + MOCK_METHOD0(SetNonLeaderMode, void()); + virtual Status AppendOperations(const vector& msgs, + const StatusCallback& callback) OVERRIDE { + return AppendOperationsMock(msgs, callback); + } + MOCK_METHOD2(AppendOperationsMock, Status(const vector& msgs, + const StatusCallback& callback)); + MOCK_METHOD1(TrackPeer, void(const string&)); + MOCK_METHOD1(UntrackPeer, void(const string&)); + MOCK_METHOD4(RequestForPeer, Status(const std::string& uuid, + ConsensusRequestPB* request, + std::vector* msg_refs, + bool* needs_remote_bootstrap)); + MOCK_METHOD3(ResponseFromPeer, void(const std::string& peer_uuid, + const ConsensusResponsePB& response, + bool* more_pending)); + MOCK_METHOD0(Close, void()); +}; + +class MockPeerManager : public PeerManager { + public: + MockPeerManager() : PeerManager("", "", nullptr, nullptr, nullptr, nullptr) {} + MOCK_METHOD1(UpdateRaftConfig, Status(const consensus::RaftConfigPB& config)); + MOCK_METHOD1(SignalRequest, void(bool force_if_queue_empty)); + MOCK_METHOD0(Close, void()); +}; + +class RaftConsensusSpy : public RaftConsensus { + public: + typedef Callback& round)> AppendCallback; + + RaftConsensusSpy(const ConsensusOptions& options, + gscoped_ptr cmeta, + gscoped_ptr proxy_factory, + gscoped_ptr queue, + gscoped_ptr peer_manager, + gscoped_ptr thread_pool, + const scoped_refptr& metric_entity, + const std::string& peer_uuid, + const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, + const scoped_refptr& log, + const shared_ptr& parent_mem_tracker, + const Callback& mark_dirty_clbk) 
+ : RaftConsensus(options, + cmeta.Pass(), + proxy_factory.Pass(), + queue.Pass(), + peer_manager.Pass(), + thread_pool.Pass(), + metric_entity, + peer_uuid, + clock, + txn_factory, + log, + parent_mem_tracker, + mark_dirty_clbk) { + // These "aliases" allow us to count invocations and assert on them. + ON_CALL(*this, StartConsensusOnlyRoundUnlocked(_)) + .WillByDefault(Invoke(this, + &RaftConsensusSpy::StartNonLeaderConsensusRoundUnlockedConcrete)); + ON_CALL(*this, NonTxRoundReplicationFinished(_, _, _)) + .WillByDefault(Invoke(this, &RaftConsensusSpy::NonTxRoundReplicationFinishedConcrete)); + } + + MOCK_METHOD1(AppendNewRoundToQueueUnlocked, Status(const scoped_refptr& round)); + Status AppendNewRoundToQueueUnlockedConcrete(const scoped_refptr& round) { + return RaftConsensus::AppendNewRoundToQueueUnlocked(round); + } + + MOCK_METHOD1(StartConsensusOnlyRoundUnlocked, Status(const ReplicateRefPtr& msg)); + Status StartNonLeaderConsensusRoundUnlockedConcrete(const ReplicateRefPtr& msg) { + return RaftConsensus::StartConsensusOnlyRoundUnlocked(msg); + } + + MOCK_METHOD3(NonTxRoundReplicationFinished, void(ConsensusRound* round, + const StatusCallback& client_cb, + const Status& status)); + void NonTxRoundReplicationFinishedConcrete(ConsensusRound* round, + const StatusCallback& client_cb, + const Status& status) { + LOG(INFO) << "Committing round with opid " << round->id() + << " given Status " << status.ToString(); + RaftConsensus::NonTxRoundReplicationFinished(round, client_cb, status); + } + + private: + DISALLOW_COPY_AND_ASSIGN(RaftConsensusSpy); +}; + +void DoNothing(const string& s) { +} + +class RaftConsensusTest : public KuduTest { + public: + RaftConsensusTest() + : clock_(server::LogicalClock::CreateStartingAt(Timestamp(0))), + metric_entity_(METRIC_ENTITY_tablet.Instantiate(&metric_registry_, "raft-consensus-test")), + schema_(GetSimpleTestSchema()) { + FLAGS_enable_leader_failure_detection = false; + options_.tablet_id = kTestTablet; + } + + virtual 
void SetUp() OVERRIDE { + LogOptions options; + string test_path = GetTestPath("test-peer-root"); + + // TODO mock the Log too, since we're gonna mock the queue + // monitors and pretty much everything else. + fs_manager_.reset(new FsManager(env_.get(), test_path)); + CHECK_OK(fs_manager_->CreateInitialFileSystemLayout()); + CHECK_OK(fs_manager_->Open()); + CHECK_OK(Log::Open(LogOptions(), + fs_manager_.get(), + kTestTablet, + schema_, + 0, // schema_version + NULL, + &log_)); + + queue_ = new MockQueue(metric_entity_, log_.get()); + peer_manager_ = new MockPeerManager; + txn_factory_.reset(new MockTransactionFactory); + + ON_CALL(*queue_, AppendOperationsMock(_, _)) + .WillByDefault(Invoke(this, &RaftConsensusTest::AppendToLog)); + } + + void SetUpConsensus(int64_t initial_term = consensus::kMinimumTerm, int num_peers = 1) { + config_ = BuildRaftConfigPBForTests(num_peers); + config_.set_opid_index(kInvalidOpIdIndex); + + gscoped_ptr proxy_factory(new LocalTestPeerProxyFactory(nullptr)); + + string peer_uuid = config_.peers(num_peers - 1).permanent_uuid(); + + gscoped_ptr cmeta; + CHECK_OK(ConsensusMetadata::Create(fs_manager_.get(), kTestTablet, peer_uuid, + config_, initial_term, &cmeta)); + + gscoped_ptr thread_pool; + CHECK_OK(ThreadPoolBuilder("raft-pool") .Build(&thread_pool)); + + consensus_.reset(new RaftConsensusSpy(options_, + cmeta.Pass(), + proxy_factory.Pass(), + gscoped_ptr(queue_), + gscoped_ptr(peer_manager_), + thread_pool.Pass(), + metric_entity_, + peer_uuid, + clock_, + txn_factory_.get(), + log_.get(), + MemTracker::GetRootTracker(), + Bind(&DoNothing))); + + ON_CALL(*consensus_.get(), AppendNewRoundToQueueUnlocked(_)) + .WillByDefault(Invoke(this, &RaftConsensusTest::MockAppendNewRound)); + } + + Status AppendToLog(const vector& msgs, + const StatusCallback& callback) { + return log_->AsyncAppendReplicates(msgs, + Bind(LogAppendCallback, callback)); + } + + static void LogAppendCallback(const StatusCallback& callback, + const Status& s) { + 
CHECK_OK(s); + callback.Run(s); + } + + Status MockAppendNewRound(const scoped_refptr& round) { + rounds_.push_back(round); + RETURN_NOT_OK(consensus_->AppendNewRoundToQueueUnlockedConcrete(round)); + LOG(INFO) << "Round append: " << round->id() << ", ReplicateMsg: " + << round->replicate_msg()->ShortDebugString(); + return Status::OK(); + } + + void SetUpGeneralExpectations() { + EXPECT_CALL(*peer_manager_, SignalRequest(_)) + .Times(AnyNumber()); + EXPECT_CALL(*peer_manager_, Close()) + .Times(AtLeast(1)); + EXPECT_CALL(*queue_, Close()) + .Times(1); + EXPECT_CALL(*consensus_.get(), AppendNewRoundToQueueUnlocked(_)) + .Times(AnyNumber()); + } + + // Create a ConsensusRequestPB suitable to send to a peer. + ConsensusRequestPB MakeConsensusRequest(int64_t caller_term, + const string& caller_uuid, + const OpId& preceding_opid); + + // Add a single no-op with the given OpId to a ConsensusRequestPB. + void AddNoOpToConsensusRequest(ConsensusRequestPB* request, const OpId& noop_opid); + + scoped_refptr AppendNoOpRound() { + ReplicateRefPtr replicate_ptr(make_scoped_refptr_replicate(new ReplicateMsg)); + replicate_ptr->get()->set_op_type(NO_OP); + replicate_ptr->get()->set_timestamp(clock_->Now().ToUint64()); + scoped_refptr round(new ConsensusRound(consensus_.get(), replicate_ptr)); + round->SetConsensusReplicatedCallback( + Bind(&RaftConsensusSpy::NonTxRoundReplicationFinished, + Unretained(consensus_.get()), Unretained(round.get()), Bind(&DoNothingStatusCB))); + + CHECK_OK(consensus_->Replicate(round)); + LOG(INFO) << "Appended NO_OP round with opid " << round->id(); + return round; + } + + void DumpRounds() { + LOG(INFO) << "Dumping rounds..."; + for (const scoped_refptr& round : rounds_) { + LOG(INFO) << "Round: OpId " << round->id() << ", ReplicateMsg: " + << round->replicate_msg()->ShortDebugString(); + } + } + + protected: + ConsensusOptions options_; + RaftConfigPB config_; + OpId initial_id_; + gscoped_ptr fs_manager_; + scoped_refptr log_; + gscoped_ptr 
proxy_factory_; + scoped_refptr clock_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + const Schema schema_; + scoped_refptr consensus_; + + vector > rounds_; + + // Mocks. + // NOTE: both 'queue_' and 'peer_manager_' belong to 'consensus_' and may be deleted before + // the test is. + MockQueue* queue_; + MockPeerManager* peer_manager_; + gscoped_ptr txn_factory_; +}; + +ConsensusRequestPB RaftConsensusTest::MakeConsensusRequest(int64_t caller_term, + const string& caller_uuid, + const OpId& preceding_opid) { + ConsensusRequestPB request; + request.set_caller_term(caller_term); + request.set_caller_uuid(caller_uuid); + request.set_tablet_id(kTestTablet); + *request.mutable_preceding_id() = preceding_opid; + return request; +} + +void RaftConsensusTest::AddNoOpToConsensusRequest(ConsensusRequestPB* request, + const OpId& noop_opid) { + ReplicateMsg* noop_msg = request->add_ops(); + *noop_msg->mutable_id() = noop_opid; + noop_msg->set_op_type(NO_OP); + noop_msg->set_timestamp(clock_->Now().ToUint64()); + noop_msg->mutable_noop_request(); +} + +// Tests that the committed index moves along with the majority replicated +// index when the terms are the same. 
+TEST_F(RaftConsensusTest, TestCommittedIndexWhenInSameTerm) { + SetUpConsensus(); + SetUpGeneralExpectations(); + EXPECT_CALL(*peer_manager_, UpdateRaftConfig(_)) + .Times(1) + .WillOnce(Return(Status::OK())); + EXPECT_CALL(*queue_, Init(_)) + .Times(1); + EXPECT_CALL(*queue_, SetLeaderMode(_, _, _)) + .Times(1); + EXPECT_CALL(*consensus_.get(), AppendNewRoundToQueueUnlocked(_)) + .Times(11); + EXPECT_CALL(*queue_, AppendOperationsMock(_, _)) + .Times(11).WillRepeatedly(Return(Status::OK())); + + ConsensusBootstrapInfo info; + ASSERT_OK(consensus_->Start(info)); + ASSERT_OK(consensus_->EmulateElection()); + + // Commit the first noop round, created on EmulateElection(); + OpId committed_index; + ASSERT_FALSE(rounds_.empty()) << "rounds_ is empty!"; + consensus_->UpdateMajorityReplicated(rounds_[0]->id(), &committed_index); + + ASSERT_OPID_EQ(rounds_[0]->id(), committed_index); + + // Append 10 rounds + for (int i = 0; i < 10; i++) { + scoped_refptr round = AppendNoOpRound(); + // queue reports majority replicated index in the leader's term + // committed index should move accordingly. + consensus_->UpdateMajorityReplicated(round->id(), &committed_index); + ASSERT_OPID_EQ(round->id(), committed_index); + } +} + +// Tests that, when terms change, the commit index only advances when the majority +// replicated index is in the current term. 
+TEST_F(RaftConsensusTest, TestCommittedIndexWhenTermsChange) { + SetUpConsensus(); + SetUpGeneralExpectations(); + EXPECT_CALL(*peer_manager_, UpdateRaftConfig(_)) + .Times(2) + .WillRepeatedly(Return(Status::OK())); + EXPECT_CALL(*queue_, Init(_)) + .Times(1); + EXPECT_CALL(*consensus_.get(), AppendNewRoundToQueueUnlocked(_)) + .Times(3); + EXPECT_CALL(*queue_, AppendOperationsMock(_, _)) + .Times(3).WillRepeatedly(Return(Status::OK()));; + + ConsensusBootstrapInfo info; + ASSERT_OK(consensus_->Start(info)); + ASSERT_OK(consensus_->EmulateElection()); + + OpId committed_index; + consensus_->UpdateMajorityReplicated(rounds_[0]->id(), &committed_index); + ASSERT_OPID_EQ(rounds_[0]->id(), committed_index); + + // Append another round in the current term (besides the original config round). + scoped_refptr round = AppendNoOpRound(); + + // Now emulate an election, the same guy will be leader but the term + // will change. + ASSERT_OK(consensus_->EmulateElection()); + + // Now tell consensus that 'round' has been majority replicated, this _shouldn't_ + // advance the committed index, since that belongs to a previous term. + OpId new_committed_index; + consensus_->UpdateMajorityReplicated(round->id(), &new_committed_index); + ASSERT_OPID_EQ(committed_index, new_committed_index); + + const scoped_refptr& last_config_round = rounds_[2]; + + // Now notify that the last change config was committed, this should advance the + // commit index to the id of the last change config. + consensus_->UpdateMajorityReplicated(last_config_round->id(), &committed_index); + + DumpRounds(); + ASSERT_OPID_EQ(last_config_round->id(), committed_index); +} + +// Asserts that a ConsensusRound has an OpId set in its ReplicateMsg. +MATCHER(HasOpId, "") { return arg->id().IsInitialized(); } + +// These matchers assert that a Status object is of a certain type. 
+MATCHER(IsOk, "") { return arg.ok(); } +MATCHER(IsAborted, "") { return arg.IsAborted(); } + +// Tests that consensus is able to handle pending operations. It tests this in two ways: +// - It tests that consensus does the right thing with pending transactions from the the WAL. +// - It tests that when a follower gets promoted to leader it does the right thing +// with the pending operations. +TEST_F(RaftConsensusTest, TestPendingTransactions) { + SetUpConsensus(10); + + // Emulate a stateful system by having a bunch of operations in flight when consensus starts. + // Specifically we emulate we're on term 10, with 5 operations before the last known + // committed operation, 10.104, which should be committed immediately, and 5 operations after the + // last known committed operation, which should be pending but not yet committed. + ConsensusBootstrapInfo info; + info.last_id.set_term(10); + for (int i = 0; i < 10; i++) { + auto replicate = new ReplicateMsg(); + replicate->set_op_type(NO_OP); + info.last_id.set_index(100 + i); + replicate->mutable_id()->CopyFrom(info.last_id); + info.orphaned_replicates.push_back(replicate); + } + + info.last_committed_id.set_term(10); + info.last_committed_id.set_index(104); + + { + InSequence dummy; + // On start we expect 10 NO_OPs to be enqueues, with 5 of those having + // their commit continuation called immediately. + EXPECT_CALL(*consensus_.get(), StartConsensusOnlyRoundUnlocked(_)) + .Times(10); + + // Queue gets initted when the peer starts. + EXPECT_CALL(*queue_, Init(_)) + .Times(1); + } + + ASSERT_OK(consensus_->Start(info)); + + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(queue_)); + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(txn_factory_.get())); + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(peer_manager_)); + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(consensus_.get())); + + // Now we test what this peer does with the pending operations once it's elected leader. 
+ { + InSequence dummy; + // Peer manager gets updated with the new set of peers to send stuff to. + EXPECT_CALL(*peer_manager_, UpdateRaftConfig(_)) + .Times(1).WillOnce(Return(Status::OK())); + // The no-op should be appended to the queue. + // One more op will be appended for the election. + EXPECT_CALL(*consensus_.get(), AppendNewRoundToQueueUnlocked(_)) + .Times(1); + EXPECT_CALL(*queue_, AppendOperationsMock(_, _)) + .Times(1).WillRepeatedly(Return(Status::OK()));; + } + + // Emulate an election, this will make this peer become leader and trigger the + // above set expectations. + ASSERT_OK(consensus_->EmulateElection()); + + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(queue_)); + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(txn_factory_.get())); + ASSERT_TRUE(testing::Mock::VerifyAndClearExpectations(peer_manager_)); + + // Commit the 5 no-ops from the previous term, along with the one pushed to + // assert leadership. + EXPECT_CALL(*consensus_.get(), NonTxRoundReplicationFinished(HasOpId(), _, IsOk())) + .Times(6); + EXPECT_CALL(*peer_manager_, SignalRequest(_)) + .Times(AnyNumber()); + // In the end peer manager and the queue get closed. + EXPECT_CALL(*peer_manager_, Close()) + .Times(AtLeast(1)); + EXPECT_CALL(*queue_, Close()) + .Times(1); + + // Now tell consensus all original orphaned replicates were majority replicated. + // This should not advance the committed index because we haven't replicated + // anything in the current term. + OpId committed_index; + consensus_->UpdateMajorityReplicated(info.orphaned_replicates.back()->id(), + &committed_index); + // Should still be the last committed in the the wal. + ASSERT_OPID_EQ(committed_index, info.last_committed_id); + + // Now mark the last operation (the no-op round) as committed. + // This should advance the committed index, since that round in on our current term, + // and we should be able to commit all previous rounds. 
+ OpId cc_round_id = info.orphaned_replicates.back()->id(); + cc_round_id.set_term(11); + cc_round_id.set_index(cc_round_id.index() + 1); + consensus_->UpdateMajorityReplicated(cc_round_id, + &committed_index); + + ASSERT_OPID_EQ(committed_index, cc_round_id); +} + +MATCHER_P2(RoundHasOpId, term, index, "") { + LOG(INFO) << "expected: " << MakeOpId(term, index) << ", actual: " << arg->id(); + return arg->id().term() == term && arg->id().index() == index; +} + +// Tests the case where a a leader is elected and pushed a sequence of +// operations of which some never get committed. Eventually a new leader in a higher +// term pushes operations that overwrite some of the original indexes. +TEST_F(RaftConsensusTest, TestAbortOperations) { + SetUpConsensus(1, 2); + + EXPECT_CALL(*consensus_.get(), AppendNewRoundToQueueUnlocked(_)) + .Times(AnyNumber()); + + EXPECT_CALL(*peer_manager_, SignalRequest(_)) + .Times(AnyNumber()); + EXPECT_CALL(*peer_manager_, Close()) + .Times(AtLeast(1)); + EXPECT_CALL(*queue_, Close()) + .Times(1); + EXPECT_CALL(*queue_, Init(_)) + .Times(1); + EXPECT_CALL(*peer_manager_, UpdateRaftConfig(_)) + .Times(1) + .WillRepeatedly(Return(Status::OK())); + + // We'll append to the queue 12 times, the initial noop txn + 10 initial ops while leader + // and the new leader's update, when we're overwriting operations. + EXPECT_CALL(*queue_, AppendOperationsMock(_, _)) + .Times(12); + + // .. but those will be overwritten later by another + // leader, which will push and commit 5 ops. + // Only these five should start as replica rounds. 
+ EXPECT_CALL(*consensus_.get(), StartConsensusOnlyRoundUnlocked(_)) + .Times(4); + + ConsensusBootstrapInfo info; + ASSERT_OK(consensus_->Start(info)); + ASSERT_OK(consensus_->EmulateElection()); + + // Append 10 rounds: 2.2 - 2.11 + for (int i = 0; i < 10; i++) { + AppendNoOpRound(); + } + + // Expectations for what gets committed and what gets aborted: + // (note: the aborts may be triggered before the commits) + // 5 OK's for the 2.1-2.5 ops. + // 6 Aborts for the 2.6-2.11 ops. + // 1 OK for the 3.6 op. + for (int index = 1; index < 6; index++) { + EXPECT_CALL(*consensus_.get(), + NonTxRoundReplicationFinished(RoundHasOpId(2, index), _, IsOk())).Times(1); + } + for (int index = 6; index < 12; index++) { + EXPECT_CALL(*consensus_.get(), + NonTxRoundReplicationFinished(RoundHasOpId(2, index), _, IsAborted())).Times(1); + } + EXPECT_CALL(*consensus_.get(), + NonTxRoundReplicationFinished(RoundHasOpId(3, 6), _, IsOk())).Times(1); + + // Nothing's committed so far, so now just send an Update() message + // emulating another guy got elected leader and is overwriting a suffix + // of the previous messages. + // In particular this request has: + // - Op 2.5 from the previous leader's term + // - Ops 3.6-3.9 from the new leader's term + // - A new committed index of 3.6 + ConsensusRequestPB request; + request.set_caller_term(3); + const string PEER_0_UUID = "peer-0"; + request.set_caller_uuid(PEER_0_UUID); + request.set_tablet_id(kTestTablet); + request.mutable_preceding_id()->CopyFrom(MakeOpId(2, 4)); + + ReplicateMsg* replicate = request.add_ops(); + replicate->mutable_id()->CopyFrom(MakeOpId(2, 5)); + replicate->set_op_type(NO_OP); + + ReplicateMsg* noop_msg = request.add_ops(); + noop_msg->mutable_id()->CopyFrom(MakeOpId(3, 6)); + noop_msg->set_op_type(NO_OP); + noop_msg->set_timestamp(clock_->Now().ToUint64()); + noop_msg->mutable_noop_request(); + + // Overwrite another 3 of the original rounds for a total of 4 overwrites. 
+ for (int i = 7; i < 10; i++) { + ReplicateMsg* replicate = request.add_ops(); + replicate->mutable_id()->CopyFrom(MakeOpId(3, i)); + replicate->set_op_type(NO_OP); + replicate->set_timestamp(clock_->Now().ToUint64()); + } + + request.mutable_committed_index()->CopyFrom(MakeOpId(3, 6)); + + ConsensusResponsePB response; + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.has_error()); + + ASSERT_TRUE(Mock::VerifyAndClearExpectations(consensus_.get())); + + // Now we expect to commit ops 3.7 - 3.9. + for (int index = 7; index < 10; index++) { + EXPECT_CALL(*consensus_.get(), + NonTxRoundReplicationFinished(RoundHasOpId(3, index), _, IsOk())).Times(1); + } + + request.mutable_ops()->Clear(); + request.mutable_preceding_id()->CopyFrom(MakeOpId(3, 9)); + request.mutable_committed_index()->CopyFrom(MakeOpId(3, 9)); + + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.has_error()); +} + +TEST_F(RaftConsensusTest, TestReceivedIdIsInittedBeforeStart) { + SetUpConsensus(); + OpId opid; + ASSERT_OK(consensus_->GetLastOpId(RECEIVED_OPID, &opid)); + ASSERT_TRUE(opid.IsInitialized()); + ASSERT_OPID_EQ(opid, MinimumOpId()); +} + +// Ensure that followers reset their "last_received_current_leader" +// ConsensusStatusPB field when a new term is encountered. This is a +// correctness test for the logic on the follower side that allows the +// leader-side queue to determine which op to send next in various scenarios. +TEST_F(RaftConsensusTest, TestResetRcvdFromCurrentLeaderOnNewTerm) { + SetUpConsensus(kMinimumTerm, 3); + SetUpGeneralExpectations(); + ConsensusBootstrapInfo info; + ASSERT_OK(consensus_->Start(info)); + + ConsensusRequestPB request; + ConsensusResponsePB response; + int64_t caller_term = 0; + int64_t log_index = 0; + + caller_term = 1; + string caller_uuid = config_.peers(0).permanent_uuid(); + OpId preceding_opid = MinimumOpId(); + + // Heartbeat. This will cause the term to increment on the follower. 
+ request = MakeConsensusRequest(caller_term, caller_uuid, preceding_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_EQ(caller_term, response.responder_term()); + ASSERT_OPID_EQ(response.status().last_received(), MinimumOpId()); + ASSERT_OPID_EQ(response.status().last_received_current_leader(), MinimumOpId()); + + // Replicate a no-op. + OpId noop_opid = MakeOpId(caller_term, ++log_index); + AddNoOpToConsensusRequest(&request, noop_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_OPID_EQ(response.status().last_received(), noop_opid); + ASSERT_OPID_EQ(response.status().last_received_current_leader(), noop_opid); + + // New leader heartbeat. Term increase to 2. + // Expect current term replicated to be nothing (MinimumOpId) but log + // replicated to be everything sent so far. + caller_term = 2; + caller_uuid = config_.peers(1).permanent_uuid(); + preceding_opid = noop_opid; + request = MakeConsensusRequest(caller_term, caller_uuid, preceding_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_EQ(caller_term, response.responder_term()); + ASSERT_OPID_EQ(response.status().last_received(), preceding_opid); + ASSERT_OPID_EQ(response.status().last_received_current_leader(), MinimumOpId()); + + // Append a no-op. 
+ noop_opid = MakeOpId(caller_term, ++log_index); + AddNoOpToConsensusRequest(&request, noop_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_OPID_EQ(response.status().last_received(), noop_opid); + ASSERT_OPID_EQ(response.status().last_received_current_leader(), noop_opid); + + // New leader heartbeat. The term should rev but we should get an LMP mismatch. + caller_term = 3; + caller_uuid = config_.peers(0).permanent_uuid(); + preceding_opid = MakeOpId(caller_term, log_index + 1); // Not replicated yet. + request = MakeConsensusRequest(caller_term, caller_uuid, preceding_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_EQ(caller_term, response.responder_term()); + ASSERT_OPID_EQ(response.status().last_received(), noop_opid); // Not preceding this time. + ASSERT_OPID_EQ(response.status().last_received_current_leader(), MinimumOpId()); + ASSERT_TRUE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_EQ(ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH, response.status().error().code()); + + // Decrement preceding and append a no-op. + preceding_opid = MakeOpId(2, log_index); + noop_opid = MakeOpId(caller_term, ++log_index); + request = MakeConsensusRequest(caller_term, caller_uuid, preceding_opid); + AddNoOpToConsensusRequest(&request, noop_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_OPID_EQ(response.status().last_received(), noop_opid) << response.ShortDebugString(); + ASSERT_OPID_EQ(response.status().last_received_current_leader(), noop_opid) + << response.ShortDebugString(); + + // Happy case. New leader with new no-op to append right off the bat. + // Response should be OK with all last_received* fields equal to the new no-op. 
+ caller_term = 4; + caller_uuid = config_.peers(1).permanent_uuid(); + preceding_opid = noop_opid; + noop_opid = MakeOpId(caller_term, ++log_index); + request = MakeConsensusRequest(caller_term, caller_uuid, preceding_opid); + AddNoOpToConsensusRequest(&request, noop_opid); + response.Clear(); + ASSERT_OK(consensus_->Update(&request, &response)); + ASSERT_FALSE(response.status().has_error()) << response.ShortDebugString(); + ASSERT_EQ(caller_term, response.responder_term()); + ASSERT_OPID_EQ(response.status().last_received(), noop_opid); + ASSERT_OPID_EQ(response.status().last_received_current_leader(), noop_opid); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc new file mode 100644 index 000000000000..6e15c840ae81 --- /dev/null +++ b/src/kudu/consensus/raft_consensus.cc @@ -0,0 +1,1995 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/consensus/raft_consensus.h" + +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/leader_election.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/peer_manager.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/consensus/raft_consensus_state.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/server/clock.h" +#include "kudu/server/metadata.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/trace.h" +#include "kudu/util/url-coding.h" + +DEFINE_int32(raft_heartbeat_interval_ms, 500, + "The heartbeat interval for Raft replication. The leader produces heartbeats " + "to followers at this interval. The followers expect a heartbeat at this interval " + "and consider a leader to have failed if it misses several in a row."); +TAG_FLAG(raft_heartbeat_interval_ms, advanced); + +// Defaults to be the same value as the leader heartbeat interval. +DEFINE_int32(leader_failure_monitor_check_mean_ms, -1, + "The mean failure-checking interval of the randomized failure monitor. If this " + "is configured to -1 (the default), uses the value of 'raft_heartbeat_interval_ms'."); +TAG_FLAG(leader_failure_monitor_check_mean_ms, experimental); + +// Defaults to half of the mean (above). +DEFINE_int32(leader_failure_monitor_check_stddev_ms, -1, + "The standard deviation of the failure-checking interval of the randomized " + "failure monitor. 
If this is configured to -1 (the default), this is set to " + "half of the mean check interval."); +TAG_FLAG(leader_failure_monitor_check_stddev_ms, experimental); + +DEFINE_double(leader_failure_max_missed_heartbeat_periods, 3.0, + "Maximum heartbeat periods that the leader can fail to heartbeat in before we " + "consider the leader to be failed. The total failure timeout in milliseconds is " + "raft_heartbeat_interval_ms times leader_failure_max_missed_heartbeat_periods. " + "The value passed to this flag may be fractional."); +TAG_FLAG(leader_failure_max_missed_heartbeat_periods, advanced); + +DEFINE_int32(leader_failure_exp_backoff_max_delta_ms, 20 * 1000, + "Maximum time to sleep in between leader election retries, in addition to the " + "regular timeout. When leader election fails the interval in between retries " + "increases exponentially, up to this value."); +TAG_FLAG(leader_failure_exp_backoff_max_delta_ms, experimental); + +DEFINE_bool(enable_leader_failure_detection, true, + "Whether to enable failure detection of tablet leaders. If enabled, attempts will be " + "made to elect a follower as a new leader when the leader is detected to have failed."); +TAG_FLAG(enable_leader_failure_detection, unsafe); + +DEFINE_bool(evict_failed_followers, true, + "Whether to evict followers from the Raft config that have fallen " + "too far behind the leader's log to catch up normally or have been " + "unreachable by the leader for longer than " + "follower_unavailable_considered_failed_sec"); +TAG_FLAG(evict_failed_followers, advanced); + +DEFINE_bool(follower_reject_update_consensus_requests, false, + "Whether a follower will return an error for all UpdateConsensus() requests. " + "Warning! This is only intended for testing."); +TAG_FLAG(follower_reject_update_consensus_requests, unsafe); + +DEFINE_bool(follower_fail_all_prepare, false, + "Whether a follower will fail preparing all transactions. " + "Warning! 
+// The ID under which the leader-election timer is registered with the
+// failure detector/monitor.
+static const char* const kTimerId = "election-timer"; + +scoped_refptr RaftConsensus::Create( + const ConsensusOptions& options, + gscoped_ptr cmeta, + const RaftPeerPB& local_peer_pb, + const scoped_refptr& metric_entity, + const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, + const shared_ptr& messenger, + const scoped_refptr& log, + const shared_ptr& parent_mem_tracker, + const Callback& mark_dirty_clbk) { + gscoped_ptr rpc_factory(new RpcPeerProxyFactory(messenger)); + + // The message queue that keeps track of which operations need to be replicated + // where. + gscoped_ptr queue(new PeerMessageQueue(metric_entity, + log, + local_peer_pb, + options.tablet_id)); + + gscoped_ptr thread_pool; + CHECK_OK(ThreadPoolBuilder(Substitute("$0-raft", options.tablet_id.substr(0, 6))) + .Build(&thread_pool)); + + DCHECK(local_peer_pb.has_permanent_uuid()); + const string& peer_uuid = local_peer_pb.permanent_uuid(); + + // A manager for the set of peers that actually send the operations both remotely + // and to the local wal. 
+ gscoped_ptr peer_manager( + new PeerManager(options.tablet_id, + peer_uuid, + rpc_factory.get(), + queue.get(), + thread_pool.get(), + log)); + + return make_scoped_refptr(new RaftConsensus( + options, + cmeta.Pass(), + rpc_factory.Pass(), + queue.Pass(), + peer_manager.Pass(), + thread_pool.Pass(), + metric_entity, + peer_uuid, + clock, + txn_factory, + log, + parent_mem_tracker, + mark_dirty_clbk)); +} + +RaftConsensus::RaftConsensus( + const ConsensusOptions& options, gscoped_ptr cmeta, + gscoped_ptr proxy_factory, + gscoped_ptr queue, gscoped_ptr peer_manager, + gscoped_ptr thread_pool, + const scoped_refptr& metric_entity, + const std::string& peer_uuid, const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, const scoped_refptr& log, + shared_ptr parent_mem_tracker, + Callback mark_dirty_clbk) + : thread_pool_(thread_pool.Pass()), + log_(log), + clock_(clock), + peer_proxy_factory_(proxy_factory.Pass()), + peer_manager_(peer_manager.Pass()), + queue_(queue.Pass()), + rng_(GetRandomSeed32()), + failure_monitor_(GetRandomSeed32(), GetFailureMonitorCheckMeanMs(), + GetFailureMonitorCheckStddevMs()), + failure_detector_(new TimedFailureDetector(MonoDelta::FromMilliseconds( + FLAGS_raft_heartbeat_interval_ms * + FLAGS_leader_failure_max_missed_heartbeat_periods))), + withhold_votes_until_(MonoTime::Min()), + mark_dirty_clbk_(std::move(mark_dirty_clbk)), + shutdown_(false), + follower_memory_pressure_rejections_(metric_entity->FindOrCreateCounter( + &METRIC_follower_memory_pressure_rejections)), + term_metric_(metric_entity->FindOrCreateGauge(&METRIC_raft_term, + cmeta->current_term())), + parent_mem_tracker_(std::move(parent_mem_tracker)) { + DCHECK_NOTNULL(log_.get()); + state_.reset(new ReplicaState(options, + peer_uuid, + cmeta.Pass(), + DCHECK_NOTNULL(txn_factory))); +} + +RaftConsensus::~RaftConsensus() { + Shutdown(); +} + +Status RaftConsensus::Start(const ConsensusBootstrapInfo& info) { + RETURN_NOT_OK(ExecuteHook(PRE_START)); + + // This 
just starts the monitor thread -- no failure detector is registered yet. + if (FLAGS_enable_leader_failure_detection) { + RETURN_NOT_OK(failure_monitor_.Start()); + } + + // Register the failure detector instance with the monitor. + // We still have not enabled failure detection for the leader election timer. + // That happens separately via the helper functions + // EnsureFailureDetector(Enabled/Disabled)Unlocked(); + RETURN_NOT_OK(failure_monitor_.MonitorFailureDetector(state_->GetOptions().tablet_id, + failure_detector_)); + + { + ReplicaState::UniqueLock lock; + RETURN_NOT_OK(state_->LockForStart(&lock)); + state_->ClearLeaderUnlocked(); + + RETURN_NOT_OK_PREPEND(state_->StartUnlocked(info.last_id), + "Unable to start RAFT ReplicaState"); + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Replica starting. Triggering " + << info.orphaned_replicates.size() + << " pending transactions. Active config: " + << state_->GetActiveConfigUnlocked().ShortDebugString(); + for (ReplicateMsg* replicate : info.orphaned_replicates) { + ReplicateRefPtr replicate_ptr = make_scoped_refptr_replicate(new ReplicateMsg(*replicate)); + RETURN_NOT_OK(StartReplicaTransactionUnlocked(replicate_ptr)); + } + + bool committed_index_changed = false; + state_->AdvanceCommittedIndexUnlocked(info.last_committed_id, &committed_index_changed); + + queue_->Init(state_->GetLastReceivedOpIdUnlocked()); + } + + { + ReplicaState::UniqueLock lock; + RETURN_NOT_OK(state_->LockForConfigChange(&lock)); + + RETURN_NOT_OK(EnsureFailureDetectorEnabledUnlocked()); + + // If this is the first term expire the FD immediately so that we have a fast first + // election, otherwise we just let the timer expire normally. + if (state_->GetCurrentTermUnlocked() == 0) { + // Initialize the failure detector timeout to some time in the past so that + // the next time the failure detector monitor runs it triggers an election + // (unless someone else requested a vote from us first, which resets the + // election timer). 
+  // Report becoming visible to the Master.
+ return Status::IllegalState("Not starting election: Node is currently " + "a non-participant in the raft config", + state_->GetActiveConfigUnlocked().ShortDebugString()); + } + + if (state_->HasLeaderUnlocked()) { + LOG_WITH_PREFIX_UNLOCKED(INFO) + << "Failure of leader " << state_->GetLeaderUuidUnlocked() + << " detected. Triggering leader election"; + } else { + LOG_WITH_PREFIX_UNLOCKED(INFO) + << "No leader contacted us within the election timeout. " + << "Triggering leader election"; + } + + // Increment the term. + RETURN_NOT_OK(IncrementTermUnlocked()); + + // Snooze to avoid the election timer firing again as much as possible. + // We do not disable the election timer while running an election. + RETURN_NOT_OK(EnsureFailureDetectorEnabledUnlocked()); + + MonoDelta timeout = LeaderElectionExpBackoffDeltaUnlocked(); + RETURN_NOT_OK(SnoozeFailureDetectorUnlocked(timeout, ALLOW_LOGGING)); + + const RaftConfigPB& active_config = state_->GetActiveConfigUnlocked(); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Starting election with config: " + << active_config.ShortDebugString(); + + // Initialize the VoteCounter. + int num_voters = CountVoters(active_config); + int majority_size = MajoritySize(num_voters); + gscoped_ptr counter(new VoteCounter(num_voters, majority_size)); + // Vote for ourselves. + // TODO: Consider using a separate Mutex for voting, which must sync to disk. 
+ RETURN_NOT_OK(state_->SetVotedForCurrentTermUnlocked(state_->GetPeerUuid())); + bool duplicate; + RETURN_NOT_OK(counter->RegisterVote(state_->GetPeerUuid(), VOTE_GRANTED, &duplicate)); + CHECK(!duplicate) << state_->LogPrefixUnlocked() + << "Inexplicable duplicate self-vote for term " + << state_->GetCurrentTermUnlocked(); + + VoteRequestPB request; + request.set_ignore_live_leader(mode == ELECT_EVEN_IF_LEADER_IS_ALIVE); + request.set_candidate_uuid(state_->GetPeerUuid()); + request.set_candidate_term(state_->GetCurrentTermUnlocked()); + request.set_tablet_id(state_->GetOptions().tablet_id); + *request.mutable_candidate_status()->mutable_last_received() = + state_->GetLastReceivedOpIdUnlocked(); + + election.reset(new LeaderElection(active_config, + peer_proxy_factory_.get(), + request, counter.Pass(), timeout, + Bind(&RaftConsensus::ElectionCallback, this))); + } + + // Start the election outside the lock. + election->Run(); + + return Status::OK(); +} + +Status RaftConsensus::StepDown(LeaderStepDownResponsePB* resp) { + TRACE_EVENT0("consensus", "RaftConsensus::StepDown"); + ReplicaState::UniqueLock lock; + RETURN_NOT_OK(state_->LockForConfigChange(&lock)); + if (state_->GetActiveRoleUnlocked() != RaftPeerPB::LEADER) { + resp->mutable_error()->set_code(TabletServerErrorPB::NOT_THE_LEADER); + StatusToPB(Status::IllegalState("Not currently leader"), + resp->mutable_error()->mutable_status()); + // We return OK so that the tablet service won't overwrite the error code. + return Status::OK(); + } + RETURN_NOT_OK(BecomeReplicaUnlocked()); + return Status::OK(); +} + +void RaftConsensus::ReportFailureDetected(const std::string& name, const Status& msg) { + DCHECK_EQ(name, kTimerId); + // Start an election. 
+ Status s = StartElection(NORMAL_ELECTION); + if (PREDICT_FALSE(!s.ok())) { + LOG_WITH_PREFIX(WARNING) << "Failed to trigger leader election: " << s.ToString(); + } +} + +Status RaftConsensus::BecomeLeaderUnlocked() { + TRACE_EVENT2("consensus", "RaftConsensus::BecomeLeaderUnlocked", + "peer", peer_uuid(), + "tablet", tablet_id()); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Becoming Leader. State: " << state_->ToStringUnlocked(); + + // Disable FD while we are leader. + RETURN_NOT_OK(EnsureFailureDetectorDisabledUnlocked()); + + // Don't vote for anyone if we're a leader. + withhold_votes_until_ = MonoTime::Max(); + + queue_->RegisterObserver(this); + RETURN_NOT_OK(RefreshConsensusQueueAndPeersUnlocked()); + + // Initiate a NO_OP transaction that is sent at the beginning of every term + // change in raft. + auto replicate = new ReplicateMsg; + replicate->set_op_type(NO_OP); + replicate->mutable_noop_request(); // Define the no-op request field. + + // TODO: We should have no-ops (?) and config changes be COMMIT_WAIT + // transactions. See KUDU-798. + // Note: This timestamp has no meaning from a serialization perspective + // because this method is not executed on the TabletPeer's prepare thread. + replicate->set_timestamp(clock_->Now().ToUint64()); + + scoped_refptr round( + new ConsensusRound(this, make_scoped_refptr(new RefCountedReplicate(replicate)))); + round->SetConsensusReplicatedCallback(Bind(&RaftConsensus::NonTxRoundReplicationFinished, + Unretained(this), + Unretained(round.get()), + Bind(&DoNothingStatusCB))); + RETURN_NOT_OK(AppendNewRoundToQueueUnlocked(round)); + + return Status::OK(); +} + +Status RaftConsensus::BecomeReplicaUnlocked() { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Becoming Follower/Learner. State: " + << state_->ToStringUnlocked(); + + state_->ClearLeaderUnlocked(); + + // FD should be running while we are a follower. + RETURN_NOT_OK(EnsureFailureDetectorEnabledUnlocked()); + + // Now that we're a replica, we can allow voting for other nodes. 
+  // Deregister ourselves from the queue. We don't care what gets replicated, since
+  // we're stepping down.
" + << "Queue metrics: " << queue_->ToString(); + + // TODO Possibly evict a dangling peer from the configuration here. + // TODO count of number of ops failed due to consensus queue overflow. + } + RETURN_NOT_OK_PREPEND(s, "Unable to append operation to consensus queue"); + state_->UpdateLastReceivedOpIdUnlocked(round->id()); + return Status::OK(); +} + +void RaftConsensus::UpdateMajorityReplicated(const OpId& majority_replicated, + OpId* committed_index) { + ReplicaState::UniqueLock lock; + Status s = state_->LockForMajorityReplicatedIndexUpdate(&lock); + if (PREDICT_FALSE(!s.ok())) { + LOG_WITH_PREFIX(WARNING) + << "Unable to take state lock to update committed index: " + << s.ToString(); + return; + } + UpdateMajorityReplicatedUnlocked(majority_replicated, committed_index); +} + +void RaftConsensus::UpdateMajorityReplicatedUnlocked(const OpId& majority_replicated, + OpId* committed_index) { + VLOG_WITH_PREFIX_UNLOCKED(1) << "Marking majority replicated up to " + << majority_replicated.ShortDebugString(); + TRACE("Marking majority replicated up to $0", majority_replicated.ShortDebugString()); + bool committed_index_changed = false; + Status s = state_->UpdateMajorityReplicatedUnlocked(majority_replicated, committed_index, + &committed_index_changed); + if (PREDICT_FALSE(!s.ok())) { + string msg = Substitute("Unable to mark committed up to $0: $1", + majority_replicated.ShortDebugString(), + s.ToString()); + TRACE(msg); + LOG_WITH_PREFIX_UNLOCKED(WARNING) << msg; + return; + } + + if (committed_index_changed && + state_->GetActiveRoleUnlocked() == RaftPeerPB::LEADER) { + peer_manager_->SignalRequest(false); + } +} + +void RaftConsensus::NotifyTermChange(int64_t term) { + ReplicaState::UniqueLock lock; + Status s = state_->LockForConfigChange(&lock); + if (PREDICT_FALSE(!s.ok())) { + LOG(WARNING) << state_->LogPrefixThreadSafe() << "Unable to lock ReplicaState for config change" + << " when notified of new term " << term << ": " << s.ToString(); + return; + } + 
WARN_NOT_OK(HandleTermAdvanceUnlocked(term), "Couldn't advance consensus term."); +} + +void RaftConsensus::NotifyFailedFollower(const string& uuid, + int64_t term, + const std::string& reason) { + // Common info used in all of the log messages within this method. + string fail_msg = Substitute("Processing failure of peer $0 in term $1 ($2): ", + uuid, term, reason); + + if (!FLAGS_evict_failed_followers) { + LOG(INFO) << state_->LogPrefixThreadSafe() << fail_msg + << "Eviction of failed followers is disabled. Doing nothing."; + return; + } + + RaftConfigPB committed_config; + { + ReplicaState::UniqueLock lock; + Status s = state_->LockForRead(&lock); + if (PREDICT_FALSE(!s.ok())) { + LOG(WARNING) << state_->LogPrefixThreadSafe() << fail_msg + << "Unable to lock ReplicaState for read: " << s.ToString(); + return; + } + + int64_t current_term = state_->GetCurrentTermUnlocked(); + if (current_term != term) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << fail_msg << "Notified about a follower failure in " + << "previous term " << term << ", but a leader election " + << "likely occurred since the failure was detected. " + << "Doing nothing."; + return; + } + + if (state_->IsConfigChangePendingUnlocked()) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << fail_msg << "There is already a config change operation " + << "in progress. Unable to evict follower until it completes. " + << "Doing nothing."; + return; + } + committed_config = state_->GetCommittedConfigUnlocked(); + } + + // Run config change on thread pool after dropping ReplicaState lock. 
+ WARN_NOT_OK(thread_pool_->SubmitClosure(Bind(&RaftConsensus::TryRemoveFollowerTask, + this, uuid, committed_config, reason)), + state_->LogPrefixThreadSafe() + "Unable to start RemoteFollowerTask"); +} + +void RaftConsensus::TryRemoveFollowerTask(const string& uuid, + const RaftConfigPB& committed_config, + const std::string& reason) { + ChangeConfigRequestPB req; + req.set_tablet_id(tablet_id()); + req.mutable_server()->set_permanent_uuid(uuid); + req.set_type(REMOVE_SERVER); + req.set_cas_config_opid_index(committed_config.opid_index()); + LOG(INFO) << state_->LogPrefixThreadSafe() << "Attempting to remove follower " + << uuid << " from the Raft config. Reason: " << reason; + boost::optional error_code; + WARN_NOT_OK(ChangeConfig(req, Bind(&DoNothingStatusCB), &error_code), + state_->LogPrefixThreadSafe() + "Unable to remove follower " + uuid); +} + +Status RaftConsensus::Update(const ConsensusRequestPB* request, + ConsensusResponsePB* response) { + + if (PREDICT_FALSE(FLAGS_follower_reject_update_consensus_requests)) { + return Status::IllegalState("Rejected: --follower_reject_update_consensus_requests " + "is set to true."); + } + + RETURN_NOT_OK(ExecuteHook(PRE_UPDATE)); + response->set_responder_uuid(state_->GetPeerUuid()); + + VLOG_WITH_PREFIX(2) << "Replica received request: " << request->ShortDebugString(); + + // see var declaration + boost::lock_guard lock(update_lock_); + Status s = UpdateReplica(request, response); + if (PREDICT_FALSE(VLOG_IS_ON(1))) { + if (request->ops_size() == 0) { + VLOG_WITH_PREFIX(1) << "Replica replied to status only request. Replica: " + << state_->ToString() << ". Response: " << response->ShortDebugString(); + } + } + RETURN_NOT_OK(s); + + RETURN_NOT_OK(ExecuteHook(POST_UPDATE)); + return Status::OK(); +} + +// Helper function to check if the op is a non-Transaction op. 
+static bool IsConsensusOnlyOperation(OperationType op_type) { + if (op_type == NO_OP || op_type == CHANGE_CONFIG_OP) { + return true; + } + return false; +} + +Status RaftConsensus::StartReplicaTransactionUnlocked(const ReplicateRefPtr& msg) { + if (IsConsensusOnlyOperation(msg->get()->op_type())) { + return StartConsensusOnlyRoundUnlocked(msg); + } + + if (PREDICT_FALSE(FLAGS_follower_fail_all_prepare)) { + return Status::IllegalState("Rejected: --follower_fail_all_prepare " + "is set to true."); + } + + VLOG_WITH_PREFIX_UNLOCKED(1) << "Starting transaction: " << msg->get()->id().ShortDebugString(); + scoped_refptr round(new ConsensusRound(this, msg)); + ConsensusRound* round_ptr = round.get(); + RETURN_NOT_OK(state_->GetReplicaTransactionFactoryUnlocked()-> + StartReplicaTransaction(round)); + return state_->AddPendingOperation(round_ptr); +} + +std::string RaftConsensus::LeaderRequest::OpsRangeString() const { + std::string ret; + ret.reserve(100); + ret.push_back('['); + if (!messages.empty()) { + const OpId& first_op = (*messages.begin())->get()->id(); + const OpId& last_op = (*messages.rbegin())->get()->id(); + strings::SubstituteAndAppend(&ret, "$0.$1-$2.$3", + first_op.term(), first_op.index(), + last_op.term(), last_op.index()); + } + ret.push_back(']'); + return ret; +} + +void RaftConsensus::DeduplicateLeaderRequestUnlocked(ConsensusRequestPB* rpc_req, + LeaderRequest* deduplicated_req) { + const OpId& last_committed = state_->GetCommittedOpIdUnlocked(); + + // The leader's preceding id. + deduplicated_req->preceding_opid = &rpc_req->preceding_id(); + + int64_t dedup_up_to_index = state_->GetLastReceivedOpIdUnlocked().index(); + + deduplicated_req->first_message_idx = -1; + + // In this loop we discard duplicates and advance the leader's preceding id + // accordingly. 
+ for (int i = 0; i < rpc_req->ops_size(); i++) { + ReplicateMsg* leader_msg = rpc_req->mutable_ops(i); + + if (leader_msg->id().index() <= last_committed.index()) { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Skipping op id " << leader_msg->id() + << " (already committed)"; + deduplicated_req->preceding_opid = &leader_msg->id(); + continue; + } + + if (leader_msg->id().index() <= dedup_up_to_index) { + // If the index is uncommitted and below our match index, then it must be in the + // pendings set. + scoped_refptr round = + state_->GetPendingOpByIndexOrNullUnlocked(leader_msg->id().index()); + DCHECK(round); + + // If the OpIds match, i.e. if they have the same term and id, then this is just + // duplicate, we skip... + if (OpIdEquals(round->replicate_msg()->id(), leader_msg->id())) { + VLOG_WITH_PREFIX_UNLOCKED(2) << "Skipping op id " << leader_msg->id() + << " (already replicated)"; + deduplicated_req->preceding_opid = &leader_msg->id(); + continue; + } + + // ... otherwise we must adjust our match index, i.e. all messages from now on + // are "new" + dedup_up_to_index = leader_msg->id().index(); + } + + if (deduplicated_req->first_message_idx == - 1) { + deduplicated_req->first_message_idx = i; + } + deduplicated_req->messages.push_back(make_scoped_refptr_replicate(leader_msg)); + } + + if (deduplicated_req->messages.size() != rpc_req->ops_size()) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Deduplicated request from leader. Original: " + << rpc_req->preceding_id() << "->" << OpsRangeString(*rpc_req) + << " Dedup: " << *deduplicated_req->preceding_opid << "->" + << deduplicated_req->OpsRangeString(); + } + +} + +Status RaftConsensus::HandleLeaderRequestTermUnlocked(const ConsensusRequestPB* request, + ConsensusResponsePB* response) { + // Do term checks first: + if (PREDICT_FALSE(request->caller_term() != state_->GetCurrentTermUnlocked())) { + + // If less, reject. 
+ if (request->caller_term() < state_->GetCurrentTermUnlocked()) { + string msg = Substitute("Rejecting Update request from peer $0 for earlier term $1. " + "Current term is $2. Ops: $3", + + request->caller_uuid(), + request->caller_term(), + state_->GetCurrentTermUnlocked(), + OpsRangeString(*request)); + LOG_WITH_PREFIX_UNLOCKED(INFO) << msg; + FillConsensusResponseError(response, + ConsensusErrorPB::INVALID_TERM, + Status::IllegalState(msg)); + return Status::OK(); + } else { + RETURN_NOT_OK(HandleTermAdvanceUnlocked(request->caller_term())); + } + } + return Status::OK(); +} + +Status RaftConsensus::EnforceLogMatchingPropertyMatchesUnlocked(const LeaderRequest& req, + ConsensusResponsePB* response) { + + bool term_mismatch; + if (state_->IsOpCommittedOrPending(*req.preceding_opid, &term_mismatch)) { + return Status::OK(); + } + + string error_msg = Substitute( + "Log matching property violated." + " Preceding OpId in replica: $0. Preceding OpId from leader: $1. ($2 mismatch)", + state_->GetLastReceivedOpIdUnlocked().ShortDebugString(), + req.preceding_opid->ShortDebugString(), + term_mismatch ? "term" : "index"); + + + FillConsensusResponseError(response, + ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH, + Status::IllegalState(error_msg)); + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Refusing update from remote peer " + << req.leader_uuid << ": " << error_msg; + + // If the terms mismatch we abort down to the index before the leader's preceding, + // since we know that is the last opid that has a chance of not being overwritten. + // Aborting preemptively here avoids us reporting a last received index that is + // possibly higher than the leader's causing an avoidable cache miss on the leader's + // queue. + // + // TODO: this isn't just an optimization! if we comment this out, we get + // failures on raft_consensus-itest a couple percent of the time! 
Should investigate + // why this is actually critical to do here, as opposed to just on requests that + // append some ops. + if (term_mismatch) { + return state_->AbortOpsAfterUnlocked(req.preceding_opid->index() - 1); + } + + return Status::OK(); +} + +Status RaftConsensus::CheckLeaderRequestUnlocked(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + LeaderRequest* deduped_req) { + + ConsensusRequestPB* mutable_req = const_cast(request); + DeduplicateLeaderRequestUnlocked(mutable_req, deduped_req); + + // This is an additional check for KUDU-639 that makes sure the message's index + // and term are in the right sequence in the request, after we've deduplicated + // them. We do this before we change any of the internal state. + // + // TODO move this to raft_consensus-state or whatever we transform that into. + // We should be able to do this check for each append, but right now the way + // we initialize raft_consensus-state is preventing us from doing so. + Status s; + const OpId* prev = deduped_req->preceding_opid; + for (const ReplicateRefPtr& message : deduped_req->messages) { + s = ReplicaState::CheckOpInSequence(*prev, message->get()->id()); + if (PREDICT_FALSE(!s.ok())) { + LOG(ERROR) << "Leader request contained out-of-sequence messages. Status: " + << s.ToString() << ". Leader Request: " << request->ShortDebugString(); + break; + } + prev = &message->get()->id(); + } + + // We only release the messages from the request after the above check so that + // that we can print the original request, if it fails. + if (!deduped_req->messages.empty()) { + // We take ownership of the deduped ops. 
+ DCHECK_GE(deduped_req->first_message_idx, 0); + mutable_req->mutable_ops()->ExtractSubrange( + deduped_req->first_message_idx, + deduped_req->messages.size(), + nullptr); + } + + RETURN_NOT_OK(s); + + RETURN_NOT_OK(HandleLeaderRequestTermUnlocked(request, response)); + + if (response->status().has_error()) { + return Status::OK(); + } + + RETURN_NOT_OK(EnforceLogMatchingPropertyMatchesUnlocked(*deduped_req, response)); + + if (response->status().has_error()) { + return Status::OK(); + } + + // If the first of the messages to apply is not in our log, either it follows the last + // received message or it replaces some in-flight. + if (!deduped_req->messages.empty()) { + + bool term_mismatch; + CHECK(!state_->IsOpCommittedOrPending(deduped_req->messages[0]->get()->id(), &term_mismatch)); + + // If the index is in our log but the terms are not the same abort down to the leader's + // preceding id. + if (term_mismatch) { + RETURN_NOT_OK(state_->AbortOpsAfterUnlocked(deduped_req->preceding_opid->index())); + } + } + + // If all of the above logic was successful then we can consider this to be + // the effective leader of the configuration. If they are not currently marked as + // the leader locally, mark them as leader now. + const string& caller_uuid = request->caller_uuid(); + if (PREDICT_FALSE(state_->HasLeaderUnlocked() && + state_->GetLeaderUuidUnlocked() != caller_uuid)) { + LOG_WITH_PREFIX_UNLOCKED(FATAL) << "Unexpected new leader in same term! 
" + << "Existing leader UUID: " << state_->GetLeaderUuidUnlocked() << ", " + << "new leader UUID: " << caller_uuid; + } + if (PREDICT_FALSE(!state_->HasLeaderUnlocked())) { + SetLeaderUuidUnlocked(request->caller_uuid()); + } + + return Status::OK(); +} + +Status RaftConsensus::UpdateReplica(const ConsensusRequestPB* request, + ConsensusResponsePB* response) { + TRACE_EVENT2("consensus", "RaftConsensus::UpdateReplica", + "peer", peer_uuid(), + "tablet", tablet_id()); + Synchronizer log_synchronizer; + StatusCallback sync_status_cb = log_synchronizer.AsStatusCallback(); + + + // The ordering of the following operations is crucial, read on for details. + // + // The main requirements explained in more detail below are: + // + // 1) We must enqueue the prepares before we write to our local log. + // 2) If we were able to enqueue a prepare then we must be able to log it. + // 3) If we fail to enqueue a prepare, we must not attempt to enqueue any + // later-indexed prepare or apply. + // + // See below for detailed rationale. + // + // The steps are: + // + // 0 - Split/Dedup + // + // We split the operations into replicates and commits and make sure that we don't + // don't do anything on operations we've already received in a previous call. + // This essentially makes this method idempotent. + // + // 1 - We mark as many pending transactions as committed as we can. + // + // We may have some pending transactions that, according to the leader, are now + // committed. We Apply them early, because: + // - Soon (step 2) we may reject the call due to excessive memory pressure. One + // way to relieve the pressure is by flushing the MRS, and applying these + // transactions may unblock an in-flight Flush(). + // - The Apply and subsequent Prepares (step 2) can take place concurrently. + // + // 2 - We enqueue the Prepare of the transactions. 
+ // + // The actual prepares are enqueued in order but happen asynchronously so we don't + // have decoding/acquiring locks on the critical path. + // + // We need to do this now for a number of reasons: + // - Prepares, by themselves, are inconsequential, i.e. they do not mutate the + // state machine so, were we to crash afterwards, having the prepares in-flight + // won't hurt. + // - Prepares depend on factors external to consensus (the transaction drivers and + // the tablet peer) so if for some reason they cannot be enqueued we must know + // before we try write them to the WAL. Once enqueued, we assume that prepare will + // always succeed on a replica transaction (because the leader already prepared them + // successfully, and thus we know they are valid). + // - The prepares corresponding to every operation that was logged must be in-flight + // first. This because should we need to abort certain transactions (say a new leader + // says they are not committed) we need to have those prepares in-flight so that + // the transactions can be continued (in the abort path). + // - Failure to enqueue prepares is OK, we can continue and let the leader know that + // we only went so far. The leader will re-send the remaining messages. + // - Prepares represent new transactions, and transactions consume memory. Thus, if the + // overall memory pressure on the server is too high, we will reject the prepares. + // + // 3 - We enqueue the writes to the WAL. + // + // We enqueue writes to the WAL, but only the operations that were successfully + // enqueued for prepare (for the reasons introduced above). This means that even + // if a prepare fails to enqueue, if any of the previous prepares were successfully + // submitted they must be written to the WAL. + // If writing to the WAL fails, we're in an inconsistent state and we crash. In this + // case, no one will ever know of the transactions we previously prepared so those are + // inconsequential. 
+ // + // 4 - We mark the transactions as committed. + // + // For each transaction which has been committed by the leader, we update the + // transaction state to reflect that. If the logging has already succeeded for that + // transaction, this will trigger the Apply phase. Otherwise, Apply will be triggered + // when the logging completes. In both cases the Apply phase executes asynchronously. + // This must, of course, happen after the prepares have been triggered as the same batch + // can both replicate/prepare and commit/apply an operation. + // + // Currently, if a prepare failed to enqueue we still trigger all applies for operations + // with an id lower than it (if we have them). This is important now as the leader will + // not re-send those commit messages. This will be moot when we move to the commit + // commitIndex way of doing things as we can simply ignore the applies as we know + // they will be triggered with the next successful batch. + // + // 5 - We wait for the writes to be durable. + // + // Before replying to the leader we wait for the writes to be durable. We then + // just update the last replicated watermark and respond. + // + // TODO - These failure scenarios need to be exercised in an unit + // test. Moreover we need to add more fault injection spots (well that + // and actually use the) for each of these steps. + // This will be done in a follow up patch. + TRACE("Updating replica for $0 ops", request->ops_size()); + + // The deduplicated request. + LeaderRequest deduped_req; + + { + ReplicaState::UniqueLock lock; + RETURN_NOT_OK(state_->LockForUpdate(&lock)); + + deduped_req.leader_uuid = request->caller_uuid(); + + RETURN_NOT_OK(CheckLeaderRequestUnlocked(request, response, &deduped_req)); + + if (response->status().has_error()) { + // We had an error, like an invalid term, we still fill the response. 
+ FillConsensusResponseOKUnlocked(response); + return Status::OK(); + } + + // Snooze the failure detector as soon as we decide to accept the message. + // We are guaranteed to be acting as a FOLLOWER at this point by the above + // sanity check. + RETURN_NOT_OK(SnoozeFailureDetectorUnlocked()); + + // Also prohibit voting for anyone for the minimum election timeout. + withhold_votes_until_ = MonoTime::Now(MonoTime::FINE); + withhold_votes_until_.AddDelta(MinimumElectionTimeout()); + + + // 1 - Early commit pending (and committed) transactions + + // What should we commit? + // 1. As many pending transactions as we can, except... + // 2. ...if we commit beyond the preceding index, we'd regress KUDU-639, and... + // 3. ...the leader's committed index is always our upper bound. + OpId early_apply_up_to = state_->GetLastPendingTransactionOpIdUnlocked(); + CopyIfOpIdLessThan(*deduped_req.preceding_opid, &early_apply_up_to); + CopyIfOpIdLessThan(request->committed_index(), &early_apply_up_to); + + VLOG_WITH_PREFIX_UNLOCKED(1) << "Early marking committed up to " << + early_apply_up_to.ShortDebugString(); + TRACE("Early marking committed up to $0", + early_apply_up_to.ShortDebugString()); + bool committed_index_changed = false; + CHECK_OK(state_->AdvanceCommittedIndexUnlocked(early_apply_up_to, &committed_index_changed)); + + // 2 - Enqueue the prepares + + TRACE("Triggering prepare for $0 ops", deduped_req.messages.size()); + + Status prepare_status; + auto iter = deduped_req.messages.begin(); + + if (PREDICT_TRUE(deduped_req.messages.size() > 0)) { + // TODO Temporary until the leader explicitly propagates the safe timestamp. + clock_->Update(Timestamp(deduped_req.messages.back()->get()->timestamp())); + + // This request contains at least one message, and is likely to increase + // our memory pressure. 
+ double capacity_pct; + if (parent_mem_tracker_->AnySoftLimitExceeded(&capacity_pct)) { + follower_memory_pressure_rejections_->Increment(); + string msg = StringPrintf( + "Soft memory limit exceeded (at %.2f%% of capacity)", + capacity_pct); + if (capacity_pct >= FLAGS_memory_limit_warn_threshold_percentage) { + KLOG_EVERY_N_SECS(WARNING, 1) << "Rejecting consensus request: " << msg + << THROTTLE_MSG; + } else { + KLOG_EVERY_N_SECS(INFO, 1) << "Rejecting consensus request: " << msg + << THROTTLE_MSG; + } + return Status::ServiceUnavailable(msg); + } + } + + while (iter != deduped_req.messages.end()) { + prepare_status = StartReplicaTransactionUnlocked(*iter); + if (PREDICT_FALSE(!prepare_status.ok())) { + break; + } + ++iter; + } + + // If we stopped before reaching the end we failed to prepare some message(s) and need + // to perform cleanup, namely trimming deduped_req.messages to only contain the messages + // that were actually prepared, and deleting the other ones since we've taken ownership + // when we first deduped. + if (iter != deduped_req.messages.end()) { + bool need_to_warn = true; + while (iter != deduped_req.messages.end()) { + ReplicateRefPtr msg = (*iter); + iter = deduped_req.messages.erase(iter); + if (need_to_warn) { + need_to_warn = false; + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Could not prepare transaction for op: " + << msg->get()->id() << ". Suppressed " << deduped_req.messages.size() << + " other warnings. Status for this op: " << prepare_status.ToString(); + } + } + + // If this is empty, it means we couldn't prepare a single de-duped message. There is nothing + // else we can do. The leader will detect this and retry later. + if (deduped_req.messages.empty()) { + string msg = Substitute("Rejecting Update request from peer $0 for term $1. 
" + "Could not prepare a single transaction due to: $2", + request->caller_uuid(), + request->caller_term(), + prepare_status.ToString()); + LOG_WITH_PREFIX_UNLOCKED(INFO) << msg; + FillConsensusResponseError(response, ConsensusErrorPB::CANNOT_PREPARE, + Status::IllegalState(msg)); + FillConsensusResponseOKUnlocked(response); + return Status::OK(); + } + } + + OpId last_from_leader; + // 3 - Enqueue the writes. + // Now that we've triggered the prepares enqueue the operations to be written + // to the WAL. + if (PREDICT_TRUE(deduped_req.messages.size() > 0)) { + last_from_leader = deduped_req.messages.back()->get()->id(); + // Trigger the log append asap, if fsync() is on this might take a while + // and we can't reply until this is done. + // + // Since we've prepared, we need to be able to append (or we risk trying to apply + // later something that wasn't logged). We crash if we can't. + CHECK_OK(queue_->AppendOperations(deduped_req.messages, sync_status_cb)); + } else { + last_from_leader = *deduped_req.preceding_opid; + } + + // 4 - Mark transactions as committed + + // Choose the last operation to be applied. This will either be 'committed_index', if + // no prepare enqueuing failed, or the minimum between 'committed_index' and the id of + // the last successfully enqueued prepare, if some prepare failed to enqueue. 
+ OpId apply_up_to; + if (last_from_leader.index() < request->committed_index().index()) { + // we should never apply anything later than what we received in this request + apply_up_to = last_from_leader; + + VLOG_WITH_PREFIX_UNLOCKED(2) << "Received commit index " + << request->committed_index() << " from the leader but only" + << " marked up to " << apply_up_to << " as committed."; + } else { + apply_up_to = request->committed_index(); + } + + VLOG_WITH_PREFIX_UNLOCKED(1) << "Marking committed up to " << apply_up_to.ShortDebugString(); + TRACE(Substitute("Marking committed up to $0", apply_up_to.ShortDebugString())); + CHECK_OK(state_->AdvanceCommittedIndexUnlocked(apply_up_to, &committed_index_changed)); + + // We can now update the last received watermark. + // + // We do it here (and before we actually hear back from the wal whether things + // are durable) so that, if we receive another, possible duplicate, message + // that exercises this path we don't handle these messages twice. + // + // If any messages failed to be started locally, then we already have removed them + // from 'deduped_req' at this point. So, we can simply update our last-received + // watermark to the last message that remains in 'deduped_req'. + // + // It's possible that the leader didn't send us any new data -- it might be a completely + // duplicate request. In that case, we don't need to update LastReceived at all. + if (!deduped_req.messages.empty()) { + OpId last_appended = deduped_req.messages.back()->get()->id(); + TRACE(Substitute("Updating last received op as $0", last_appended.ShortDebugString())); + state_->UpdateLastReceivedOpIdUnlocked(last_appended); + } else { + DCHECK_GE(state_->GetLastReceivedOpIdUnlocked().index(), + deduped_req.preceding_opid->index()); + } + + // Fill the response with the current state. We will not mutate anymore state until + // we actually reply to the leader, we'll just wait for the messages to be durable. 
+ FillConsensusResponseOKUnlocked(response); + } + // Release the lock while we wait for the log append to finish so that commits can go through. + // We'll re-acquire it before we update the state again. + + // Update the last replicated op id + if (deduped_req.messages.size() > 0) { + + // 5 - We wait for the writes to be durable. + + // Note that this is safe because dist consensus now only supports a single outstanding + // request at a time and this way we can allow commits to proceed while we wait. + TRACE("Waiting on the replicates to finish logging"); + TRACE_EVENT0("consensus", "Wait for log"); + Status s; + do { + s = log_synchronizer.WaitFor( + MonoDelta::FromMilliseconds(FLAGS_raft_heartbeat_interval_ms)); + // If just waiting for our log append to finish lets snooze the timer. + // We don't want to fire leader election because we're waiting on our own log. + if (s.IsTimedOut()) { + SnoozeFailureDetectorUnlocked(); + } + } while (s.IsTimedOut()); + RETURN_NOT_OK(s); + TRACE("finished"); + } + + if (PREDICT_FALSE(VLOG_IS_ON(2))) { + VLOG_WITH_PREFIX(2) << "Replica updated." 
+ << state_->ToString() << " Request: " << request->ShortDebugString(); + } + + TRACE("UpdateReplicas() finished"); + return Status::OK(); +} + +void RaftConsensus::FillConsensusResponseOKUnlocked(ConsensusResponsePB* response) { + TRACE("Filling consensus response to leader."); + response->set_responder_term(state_->GetCurrentTermUnlocked()); + response->mutable_status()->mutable_last_received()->CopyFrom( + state_->GetLastReceivedOpIdUnlocked()); + response->mutable_status()->mutable_last_received_current_leader()->CopyFrom( + state_->GetLastReceivedOpIdCurLeaderUnlocked()); + response->mutable_status()->set_last_committed_idx( + state_->GetCommittedOpIdUnlocked().index()); +} + +void RaftConsensus::FillConsensusResponseError(ConsensusResponsePB* response, + ConsensusErrorPB::Code error_code, + const Status& status) { + ConsensusErrorPB* error = response->mutable_status()->mutable_error(); + error->set_code(error_code); + StatusToPB(status, error->mutable_status()); +} + +Status RaftConsensus::RequestVote(const VoteRequestPB* request, VoteResponsePB* response) { + TRACE_EVENT2("consensus", "RaftConsensus::RequestVote", + "peer", peer_uuid(), + "tablet", tablet_id()); + response->set_responder_uuid(state_->GetPeerUuid()); + + // We must acquire the update lock in order to ensure that this vote action + // takes place between requests. + // Lock ordering: The update lock must be acquired before the ReplicaState lock. + boost::unique_lock update_guard(update_lock_, boost::defer_lock); + if (FLAGS_enable_leader_failure_detection) { + update_guard.try_lock(); + } else { + // If failure detection is not enabled, then we can't just reject the vote, + // because there will be no automatic retry later. So, block for the lock. + update_guard.lock(); + } + if (!update_guard.owns_lock()) { + // There is another vote or update concurrent with the vote. In that case, that + // other request is likely to reset the timer, and we'll end up just voting + // "NO" after waiting. 
To avoid starving RPC handlers and causing cascading + // timeouts, just vote a quick NO. + // + // We still need to take the state lock in order to respond with term info, etc. + ReplicaState::UniqueLock state_guard; + RETURN_NOT_OK(state_->LockForConfigChange(&state_guard)); + return RequestVoteRespondIsBusy(request, response); + } + + // Acquire the replica state lock so we can read / modify the consensus state. + ReplicaState::UniqueLock state_guard; + RETURN_NOT_OK(state_->LockForConfigChange(&state_guard)); + + // If the node is not in the configuration, allow the vote (this is required by Raft) + // but log an informational message anyway. + if (!IsRaftConfigMember(request->candidate_uuid(), state_->GetActiveConfigUnlocked())) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Handling vote request from an unknown peer " + << request->candidate_uuid(); + } + + // If we've heard recently from the leader, then we should ignore the request. + // It might be from a "disruptive" server. This could happen in a few cases: + // + // 1) Network partitions + // If the leader can talk to a majority of the nodes, but is partitioned from a + // bad node, the bad node's failure detector will trigger. If the bad node is + // able to reach other nodes in the cluster, it will continuously trigger elections. + // + // 2) An abandoned node + // It's possible that a node has fallen behind the log GC mark of the leader. In that + // case, the leader will stop sending it requests. Eventually, the the configuration + // will change to eject the abandoned node, but until that point, we don't want the + // abandoned follower to disturb the other nodes. + // + // See also https://ramcloud.stanford.edu/~ongaro/thesis.pdf + // section 4.2.3. + MonoTime now = MonoTime::Now(MonoTime::COARSE); + if (!request->ignore_live_leader() && + now.ComesBefore(withhold_votes_until_)) { + return RequestVoteRespondLeaderIsAlive(request, response); + } + + // Candidate is running behind. 
+ if (request->candidate_term() < state_->GetCurrentTermUnlocked()) { + return RequestVoteRespondInvalidTerm(request, response); + } + + // We already voted this term. + if (request->candidate_term() == state_->GetCurrentTermUnlocked() && + state_->HasVotedCurrentTermUnlocked()) { + + // Already voted for the same candidate in the current term. + if (state_->GetVotedForCurrentTermUnlocked() == request->candidate_uuid()) { + return RequestVoteRespondVoteAlreadyGranted(request, response); + } + + // Voted for someone else in current term. + return RequestVoteRespondAlreadyVotedForOther(request, response); + } + + // The term advanced. + if (request->candidate_term() > state_->GetCurrentTermUnlocked()) { + RETURN_NOT_OK_PREPEND(HandleTermAdvanceUnlocked(request->candidate_term()), + Substitute("Could not step down in RequestVote. Current term: $0, candidate term: $1", + state_->GetCurrentTermUnlocked(), request->candidate_term())); + } + + // Candidate must have last-logged OpId at least as large as our own to get + // our vote. + OpId local_last_logged_opid = GetLatestOpIdFromLog(); + if (OpIdLessThan(request->candidate_status().last_received(), local_last_logged_opid)) { + return RequestVoteRespondLastOpIdTooOld(local_last_logged_opid, request, response); + } + + // Passed all our checks. Vote granted. 
+ return RequestVoteRespondVoteGranted(request, response); +} + +Status RaftConsensus::ChangeConfig(const ChangeConfigRequestPB& req, + const StatusCallback& client_cb, + boost::optional* error_code) { + if (PREDICT_FALSE(!req.has_type())) { + return Status::InvalidArgument("Must specify 'type' argument to ChangeConfig()", + req.ShortDebugString()); + } + if (PREDICT_FALSE(!req.has_server())) { + *error_code = TabletServerErrorPB::INVALID_CONFIG; + return Status::InvalidArgument("Must specify 'server' argument to ChangeConfig()", + req.ShortDebugString()); + } + ChangeConfigType type = req.type(); + const RaftPeerPB& server = req.server(); + { + ReplicaState::UniqueLock lock; + RETURN_NOT_OK(state_->LockForConfigChange(&lock)); + RETURN_NOT_OK(state_->CheckActiveLeaderUnlocked()); + RETURN_NOT_OK(state_->CheckNoConfigChangePendingUnlocked()); + // We are required by Raft to reject config change operations until we have + // committed at least one operation in our current term as leader. + // See https://groups.google.com/forum/#!topic/raft-dev/t4xj6dJTP6E + RETURN_NOT_OK(state_->CheckHasCommittedOpInCurrentTermUnlocked()); + if (!server.has_permanent_uuid()) { + return Status::InvalidArgument("server must have permanent_uuid specified", + req.ShortDebugString()); + } + const RaftConfigPB& committed_config = state_->GetCommittedConfigUnlocked(); + + // Support atomic ChangeConfig requests. 
+ if (req.has_cas_config_opid_index()) { + if (committed_config.opid_index() != req.cas_config_opid_index()) { + *error_code = TabletServerErrorPB::CAS_FAILED; + return Status::IllegalState(Substitute("Request specified cas_config_opid_index " + "of $0 but the committed config has opid_index " + "of $1", + req.cas_config_opid_index(), + committed_config.opid_index())); + } + } + + RaftConfigPB new_config = committed_config; + new_config.clear_opid_index(); + const string& server_uuid = server.permanent_uuid(); + switch (type) { + case ADD_SERVER: + // Ensure the server we are adding is not already a member of the configuration. + if (IsRaftConfigMember(server_uuid, committed_config)) { + return Status::InvalidArgument( + Substitute("Server with UUID $0 is already a member of the config. RaftConfig: $1", + server_uuid, committed_config.ShortDebugString())); + } + if (!server.has_member_type()) { + return Status::InvalidArgument("server must have member_type specified", + req.ShortDebugString()); + } + if (!server.has_last_known_addr()) { + return Status::InvalidArgument("server must have last_known_addr specified", + req.ShortDebugString()); + } + *new_config.add_peers() = server; + break; + + case REMOVE_SERVER: + if (server_uuid == peer_uuid()) { + return Status::InvalidArgument( + Substitute("Cannot remove peer $0 from the config because it is the leader. " + "Force another leader to be elected to remove this server. " + "Active consensus state: $1", + server_uuid, + state_->ConsensusStateUnlocked(CONSENSUS_CONFIG_ACTIVE) + .ShortDebugString())); + } + if (!RemoveFromRaftConfig(&new_config, server_uuid)) { + return Status::NotFound( + Substitute("Server with UUID $0 not a member of the config. RaftConfig: $1", + server_uuid, committed_config.ShortDebugString())); + } + break; + + // TODO: Support role change. 
+ case CHANGE_ROLE: + default: + return Status::NotSupported("Role change is not yet implemented."); + } + + RETURN_NOT_OK(ReplicateConfigChangeUnlocked(committed_config, new_config, + Bind(&RaftConsensus::MarkDirtyOnSuccess, + Unretained(this), + string("Config change replication complete"), + client_cb))); + } + peer_manager_->SignalRequest(); + return Status::OK(); +} + +void RaftConsensus::Shutdown() { + // Avoid taking locks if already shut down so we don't violate + // ThreadRestrictions assertions in the case where the RaftConsensus + // destructor runs on the reactor thread due to an election callback being + // the last outstanding reference. + if (shutdown_.Load(kMemOrderAcquire)) return; + + CHECK_OK(ExecuteHook(PRE_SHUTDOWN)); + + { + ReplicaState::UniqueLock lock; + // Transition to kShuttingDown state. + CHECK_OK(state_->LockForShutdown(&lock)); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Raft consensus shutting down."; + } + + // Close the peer manager. + peer_manager_->Close(); + + // We must close the queue after we close the peers. + queue_->Close(); + + CHECK_OK(state_->CancelPendingTransactions()); + + { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForShutdown(&lock)); + CHECK_EQ(ReplicaState::kShuttingDown, state_->state()); + CHECK_OK(state_->ShutdownUnlocked()); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Raft consensus is shut down!"; + } + + // Shut down things that might acquire locks during destruction. 
+ thread_pool_->Shutdown(); + failure_monitor_.Shutdown(); + + CHECK_OK(ExecuteHook(POST_SHUTDOWN)); + + shutdown_.Store(true, kMemOrderRelease); +} + +RaftPeerPB::Role RaftConsensus::GetActiveRole() const { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForRead(&lock)); + return state_->GetActiveRoleUnlocked(); +} + +OpId RaftConsensus::GetLatestOpIdFromLog() { + OpId id; + log_->GetLatestEntryOpId(&id); + return id; +} + +Status RaftConsensus::StartConsensusOnlyRoundUnlocked(const ReplicateRefPtr& msg) { + OperationType op_type = msg->get()->op_type(); + CHECK(IsConsensusOnlyOperation(op_type)) + << "Expected a consensus-only op type, got " << OperationType_Name(op_type) + << ": " << msg->get()->ShortDebugString(); + VLOG_WITH_PREFIX_UNLOCKED(1) << "Starting consensus round: " + << msg->get()->id().ShortDebugString(); + scoped_refptr round(new ConsensusRound(this, msg)); + round->SetConsensusReplicatedCallback(Bind(&RaftConsensus::NonTxRoundReplicationFinished, + Unretained(this), + Unretained(round.get()), + Bind(&RaftConsensus::MarkDirtyOnSuccess, + Unretained(this), + string("Replicated consensus-only round"), + Bind(&DoNothingStatusCB)))); + return state_->AddPendingOperation(round); +} + +Status RaftConsensus::AdvanceTermForTests(int64_t new_term) { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForConfigChange(&lock)); + return HandleTermAdvanceUnlocked(new_term); +} + +std::string RaftConsensus::GetRequestVoteLogPrefixUnlocked() const { + return state_->LogPrefixUnlocked() + "Leader election vote request"; +} + +void RaftConsensus::FillVoteResponseVoteGranted(VoteResponsePB* response) { + response->set_responder_term(state_->GetCurrentTermUnlocked()); + response->set_vote_granted(true); +} + +void RaftConsensus::FillVoteResponseVoteDenied(ConsensusErrorPB::Code error_code, + VoteResponsePB* response) { + response->set_responder_term(state_->GetCurrentTermUnlocked()); + response->set_vote_granted(false); + 
response->mutable_consensus_error()->set_code(error_code); +} + +Status RaftConsensus::RequestVoteRespondInvalidTerm(const VoteRequestPB* request, + VoteResponsePB* response) { + FillVoteResponseVoteDenied(ConsensusErrorPB::INVALID_TERM, response); + string msg = Substitute("$0: Denying vote to candidate $1 for earlier term $2. " + "Current term is $3.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + request->candidate_term(), + state_->GetCurrentTermUnlocked()); + LOG(INFO) << msg; + StatusToPB(Status::InvalidArgument(msg), response->mutable_consensus_error()->mutable_status()); + return Status::OK(); +} + +Status RaftConsensus::RequestVoteRespondVoteAlreadyGranted(const VoteRequestPB* request, + VoteResponsePB* response) { + FillVoteResponseVoteGranted(response); + LOG(INFO) << Substitute("$0: Already granted yes vote for candidate $1 in term $2. " + "Re-sending same reply.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + request->candidate_term()); + return Status::OK(); +} + +Status RaftConsensus::RequestVoteRespondAlreadyVotedForOther(const VoteRequestPB* request, + VoteResponsePB* response) { + FillVoteResponseVoteDenied(ConsensusErrorPB::ALREADY_VOTED, response); + string msg = Substitute("$0: Denying vote to candidate $1 in current term $2: " + "Already voted for candidate $3 in this term.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + state_->GetCurrentTermUnlocked(), + state_->GetVotedForCurrentTermUnlocked()); + LOG(INFO) << msg; + StatusToPB(Status::InvalidArgument(msg), response->mutable_consensus_error()->mutable_status()); + return Status::OK(); +} + +Status RaftConsensus::RequestVoteRespondLastOpIdTooOld(const OpId& local_last_logged_opid, + const VoteRequestPB* request, + VoteResponsePB* response) { + FillVoteResponseVoteDenied(ConsensusErrorPB::LAST_OPID_TOO_OLD, response); + string msg = Substitute("$0: Denying vote to candidate $1 for term $2 because " + "replica has last-logged 
OpId of $3, which is greater than that of the " + "candidate, which has last-logged OpId of $4.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + request->candidate_term(), + local_last_logged_opid.ShortDebugString(), + request->candidate_status().last_received().ShortDebugString()); + LOG(INFO) << msg; + StatusToPB(Status::InvalidArgument(msg), response->mutable_consensus_error()->mutable_status()); + return Status::OK(); +} + +Status RaftConsensus::RequestVoteRespondLeaderIsAlive(const VoteRequestPB* request, + VoteResponsePB* response) { + FillVoteResponseVoteDenied(ConsensusErrorPB::LEADER_IS_ALIVE, response); + string msg = Substitute("$0: Denying vote to candidate $1 for term $2 because " + "replica is either leader or believes a valid leader to " + "be alive.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + request->candidate_term()); + LOG(INFO) << msg; + StatusToPB(Status::InvalidArgument(msg), response->mutable_consensus_error()->mutable_status()); + return Status::OK(); +} + +Status RaftConsensus::RequestVoteRespondIsBusy(const VoteRequestPB* request, + VoteResponsePB* response) { + FillVoteResponseVoteDenied(ConsensusErrorPB::CONSENSUS_BUSY, response); + string msg = Substitute("$0: Denying vote to candidate $1 for term $2 because " + "replica is already servicing an update from a current leader " + "or another vote.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + request->candidate_term()); + LOG(INFO) << msg; + StatusToPB(Status::ServiceUnavailable(msg), + response->mutable_consensus_error()->mutable_status()); + return Status::OK(); +} + +Status RaftConsensus::RequestVoteRespondVoteGranted(const VoteRequestPB* request, + VoteResponsePB* response) { + // We know our vote will be "yes", so avoid triggering an election while we + // persist our vote to disk. We use an exponential backoff to avoid too much + // split-vote contention when nodes display high latencies. 
+ MonoDelta additional_backoff = LeaderElectionExpBackoffDeltaUnlocked(); + RETURN_NOT_OK(SnoozeFailureDetectorUnlocked(additional_backoff, ALLOW_LOGGING)); + + // Persist our vote to disk. + RETURN_NOT_OK(state_->SetVotedForCurrentTermUnlocked(request->candidate_uuid())); + + FillVoteResponseVoteGranted(response); + + // Give peer time to become leader. Snooze one more time after persisting our + // vote. When disk latency is high, this should help reduce churn. + RETURN_NOT_OK(SnoozeFailureDetectorUnlocked(additional_backoff, DO_NOT_LOG)); + + LOG(INFO) << Substitute("$0: Granting yes vote for candidate $1 in term $2.", + GetRequestVoteLogPrefixUnlocked(), + request->candidate_uuid(), + state_->GetCurrentTermUnlocked()); + return Status::OK(); +} + +RaftPeerPB::Role RaftConsensus::role() const { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForRead(&lock)); + return GetConsensusRole(state_->GetPeerUuid(), + state_->ConsensusStateUnlocked(CONSENSUS_CONFIG_ACTIVE)); +} + +std::string RaftConsensus::LogPrefixUnlocked() { + return state_->LogPrefixUnlocked(); +} + +std::string RaftConsensus::LogPrefix() { + return state_->LogPrefix(); +} + +void RaftConsensus::SetLeaderUuidUnlocked(const string& uuid) { + state_->SetLeaderUuidUnlocked(uuid); + MarkDirty("New leader " + uuid); +} + +Status RaftConsensus::ReplicateConfigChangeUnlocked(const RaftConfigPB& old_config, + const RaftConfigPB& new_config, + const StatusCallback& client_cb) { + auto cc_replicate = new ReplicateMsg(); + cc_replicate->set_op_type(CHANGE_CONFIG_OP); + ChangeConfigRecordPB* cc_req = cc_replicate->mutable_change_config_record(); + cc_req->set_tablet_id(tablet_id()); + *cc_req->mutable_old_config() = old_config; + *cc_req->mutable_new_config() = new_config; + + // TODO: We should have no-ops (?) and config changes be COMMIT_WAIT + // transactions. See KUDU-798. 
+ // Note: This timestamp has no meaning from a serialization perspective + // because this method is not executed on the TabletPeer's prepare thread. + cc_replicate->set_timestamp(clock_->Now().ToUint64()); + + scoped_refptr round( + new ConsensusRound(this, make_scoped_refptr(new RefCountedReplicate(cc_replicate)))); + round->SetConsensusReplicatedCallback(Bind(&RaftConsensus::NonTxRoundReplicationFinished, + Unretained(this), + Unretained(round.get()), + client_cb)); + + // Set as pending. + RETURN_NOT_OK(state_->SetPendingConfigUnlocked(new_config)); + RETURN_NOT_OK(RefreshConsensusQueueAndPeersUnlocked()); + CHECK_OK(AppendNewRoundToQueueUnlocked(round)); + return Status::OK(); +} + +Status RaftConsensus::RefreshConsensusQueueAndPeersUnlocked() { + DCHECK_EQ(RaftPeerPB::LEADER, state_->GetActiveRoleUnlocked()); + const RaftConfigPB& active_config = state_->GetActiveConfigUnlocked(); + + // Change the peers so that we're able to replicate messages remotely and + // locally. The peer manager must be closed before updating the active config + // in the queue -- when the queue is in LEADER mode, it checks that all + // registered peers are a part of the active config. 
+ peer_manager_->Close(); + queue_->SetLeaderMode(state_->GetCommittedOpIdUnlocked(), + state_->GetCurrentTermUnlocked(), + active_config); + RETURN_NOT_OK(peer_manager_->UpdateRaftConfig(active_config)); + return Status::OK(); +} + +string RaftConsensus::peer_uuid() const { + return state_->GetPeerUuid(); +} + +string RaftConsensus::tablet_id() const { + return state_->GetOptions().tablet_id; +} + +ConsensusStatePB RaftConsensus::ConsensusState(ConsensusConfigType type) const { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForRead(&lock)); + return state_->ConsensusStateUnlocked(type); +} + +RaftConfigPB RaftConsensus::CommittedConfig() const { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForRead(&lock)); + return state_->GetCommittedConfigUnlocked(); +} + +void RaftConsensus::DumpStatusHtml(std::ostream& out) const { + out << "

Raft Consensus State

" << std::endl; + + out << "

State

" << std::endl; + out << "
" << EscapeForHtmlToString(queue_->ToString()) << "
" << std::endl; + + // Dump the queues on a leader. + RaftPeerPB::Role role; + { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForRead(&lock)); + role = state_->GetActiveRoleUnlocked(); + } + if (role == RaftPeerPB::LEADER) { + out << "

Queue overview

" << std::endl; + out << "
" << EscapeForHtmlToString(queue_->ToString()) << "
" << std::endl; + out << "
" << std::endl; + out << "

Queue details

" << std::endl; + queue_->DumpToHtml(out); + } +} + +ReplicaState* RaftConsensus::GetReplicaStateForTests() { + return state_.get(); +} + +void RaftConsensus::ElectionCallback(const ElectionResult& result) { + // The election callback runs on a reactor thread, so we need to defer to our + // threadpool. If the threadpool is already shut down for some reason, it's OK -- + // we're OK with the callback never running. + WARN_NOT_OK(thread_pool_->SubmitClosure(Bind(&RaftConsensus::DoElectionCallback, this, result)), + state_->LogPrefixThreadSafe() + "Unable to run election callback"); +} + +void RaftConsensus::DoElectionCallback(const ElectionResult& result) { + // Snooze to avoid the election timer firing again as much as possible. + { + ReplicaState::UniqueLock lock; + CHECK_OK(state_->LockForRead(&lock)); + // We need to snooze when we win and when we lose: + // - When we win because we're about to disable the timer and become leader. + // - When we loose or otherwise we can fall into a cycle, where everyone keeps + // triggering elections but no election ever completes because by the time they + // finish another one is triggered already. + // We ignore the status as we don't want to fail if we the timer is + // disabled. + ignore_result(SnoozeFailureDetectorUnlocked(LeaderElectionExpBackoffDeltaUnlocked(), + ALLOW_LOGGING)); + } + + if (result.decision == VOTE_DENIED) { + LOG_WITH_PREFIX(INFO) << "Leader election lost for term " << result.election_term + << ". Reason: " + << (!result.message.empty() ? 
result.message : "None given"); + return; + } + + ReplicaState::UniqueLock lock; + Status s = state_->LockForConfigChange(&lock); + if (PREDICT_FALSE(!s.ok())) { + LOG_WITH_PREFIX(INFO) << "Received election callback for term " + << result.election_term << " while not running: " + << s.ToString(); + return; + } + + if (result.election_term != state_->GetCurrentTermUnlocked()) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Leader election decision for defunct term " + << result.election_term << ": " + << (result.decision == VOTE_GRANTED ? "won" : "lost"); + return; + } + + const RaftConfigPB& active_config = state_->GetActiveConfigUnlocked(); + if (!IsRaftConfigVoter(state_->GetPeerUuid(), active_config)) { + LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Leader election decision while not in active config. " + << "Result: Term " << result.election_term << ": " + << (result.decision == VOTE_GRANTED ? "won" : "lost") + << ". RaftConfig: " << active_config.ShortDebugString(); + return; + } + + if (state_->GetActiveRoleUnlocked() == RaftPeerPB::LEADER) { + LOG_WITH_PREFIX_UNLOCKED(DFATAL) << "Leader election callback while already leader! " + "Result: Term " << result.election_term << ": " + << (result.decision == VOTE_GRANTED ? "won" : "lost"); + return; + } + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Leader election won for term " << result.election_term; + + // Convert role to LEADER. + SetLeaderUuidUnlocked(state_->GetPeerUuid()); + + // TODO: BecomeLeaderUnlocked() can fail due to state checks during shutdown. + // It races with the above state check. + // This could be a problem during tablet deletion. 
+ CHECK_OK(BecomeLeaderUnlocked()); +} + +Status RaftConsensus::GetLastOpId(OpIdType type, OpId* id) { + ReplicaState::UniqueLock lock; + RETURN_NOT_OK(state_->LockForRead(&lock)); + if (type == RECEIVED_OPID) { + *DCHECK_NOTNULL(id) = state_->GetLastReceivedOpIdUnlocked(); + } else if (type == COMMITTED_OPID) { + *DCHECK_NOTNULL(id) = state_->GetCommittedOpIdUnlocked(); + } else { + return Status::InvalidArgument("Unsupported OpIdType", OpIdType_Name(type)); + } + return Status::OK(); +} + +void RaftConsensus::MarkDirty(const std::string& reason) { + WARN_NOT_OK(thread_pool_->SubmitClosure(Bind(mark_dirty_clbk_, reason)), + state_->LogPrefixThreadSafe() + "Unable to run MarkDirty callback"); +} + +void RaftConsensus::MarkDirtyOnSuccess(const string& reason, + const StatusCallback& client_cb, + const Status& status) { + if (PREDICT_TRUE(status.ok())) { + MarkDirty(reason); + } + client_cb.Run(status); +} + +void RaftConsensus::NonTxRoundReplicationFinished(ConsensusRound* round, + const StatusCallback& client_cb, + const Status& status) { + OperationType op_type = round->replicate_msg()->op_type(); + string op_type_str = OperationType_Name(op_type); + CHECK(IsConsensusOnlyOperation(op_type)) << "Unexpected op type: " << op_type_str; + if (!status.ok()) { + // TODO: Do something with the status on failure? 
+ LOG(INFO) << state_->LogPrefixThreadSafe() << op_type_str << " replication failed: " + << status.ToString(); + client_cb.Run(status); + return; + } + VLOG(1) << state_->LogPrefixThreadSafe() << "Committing " << op_type_str << " with op id " + << round->id(); + gscoped_ptr commit_msg(new CommitMsg); + commit_msg->set_op_type(round->replicate_msg()->op_type()); + *commit_msg->mutable_commited_op_id() = round->id(); + + WARN_NOT_OK(log_->AsyncAppendCommit(commit_msg.Pass(), Bind(&DoNothingStatusCB)), + "Unable to append commit message"); + client_cb.Run(status); +} + +Status RaftConsensus::EnsureFailureDetectorEnabledUnlocked() { + if (PREDICT_FALSE(!FLAGS_enable_leader_failure_detection)) { + return Status::OK(); + } + if (failure_detector_->IsTracking(kTimerId)) { + return Status::OK(); + } + return failure_detector_->Track(kTimerId, + MonoTime::Now(MonoTime::FINE), + // Unretained to avoid a circular ref. + Bind(&RaftConsensus::ReportFailureDetected, Unretained(this))); +} + +Status RaftConsensus::EnsureFailureDetectorDisabledUnlocked() { + if (PREDICT_FALSE(!FLAGS_enable_leader_failure_detection)) { + return Status::OK(); + } + + if (!failure_detector_->IsTracking(kTimerId)) { + return Status::OK(); + } + return failure_detector_->UnTrack(kTimerId); +} + +Status RaftConsensus::ExpireFailureDetectorUnlocked() { + if (PREDICT_FALSE(!FLAGS_enable_leader_failure_detection)) { + return Status::OK(); + } + + return failure_detector_->MessageFrom(kTimerId, MonoTime::Min()); +} + +Status RaftConsensus::SnoozeFailureDetectorUnlocked() { + return SnoozeFailureDetectorUnlocked(MonoDelta::FromMicroseconds(0), DO_NOT_LOG); +} + +Status RaftConsensus::SnoozeFailureDetectorUnlocked(const MonoDelta& additional_delta, + AllowLogging allow_logging) { + if (PREDICT_FALSE(!FLAGS_enable_leader_failure_detection)) { + return Status::OK(); + } + + MonoTime time = MonoTime::Now(MonoTime::FINE); + time.AddDelta(additional_delta); + + if (allow_logging == ALLOW_LOGGING) { + 
LOG_WITH_PREFIX_UNLOCKED(INFO) << "Snoozing failure detection for election timeout " + << "plus an additional " + additional_delta.ToString(); + } + + return failure_detector_->MessageFrom(kTimerId, time); +} + +MonoDelta RaftConsensus::MinimumElectionTimeout() const { + int32_t failure_timeout = FLAGS_leader_failure_max_missed_heartbeat_periods * + FLAGS_raft_heartbeat_interval_ms; + return MonoDelta::FromMilliseconds(failure_timeout); +} + +MonoDelta RaftConsensus::LeaderElectionExpBackoffDeltaUnlocked() { + // Compute a backoff factor based on how many leader elections have + // taken place since a leader was successfully elected. + int term_difference = state_->GetCurrentTermUnlocked() - + state_->GetCommittedOpIdUnlocked().term(); + double backoff_factor = pow(1.1, term_difference); + double min_timeout = MinimumElectionTimeout().ToMilliseconds(); + double max_timeout = std::min( + min_timeout * backoff_factor, + FLAGS_leader_failure_exp_backoff_max_delta_ms); + + // Randomize the timeout between the minimum and the calculated value. + // We do this after the above capping to the max. Otherwise, after a + // churny period, we'd end up highly likely to backoff exactly the max + // amount. 
+ double timeout = min_timeout + (max_timeout - min_timeout) * rng_.NextDoubleFraction(); + DCHECK_GE(timeout, min_timeout); + + return MonoDelta::FromMilliseconds(timeout); +} + +Status RaftConsensus::IncrementTermUnlocked() { + return HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1); +} + +Status RaftConsensus::HandleTermAdvanceUnlocked(ConsensusTerm new_term) { + if (new_term <= state_->GetCurrentTermUnlocked()) { + return Status::IllegalState(Substitute("Can't advance term to: $0 current term: $1 is higher.", + new_term, state_->GetCurrentTermUnlocked())); + } + if (state_->GetActiveRoleUnlocked() == RaftPeerPB::LEADER) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Stepping down as leader of term " + << state_->GetCurrentTermUnlocked(); + RETURN_NOT_OK(BecomeReplicaUnlocked()); + } + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Advancing to term " << new_term; + RETURN_NOT_OK(state_->SetCurrentTermUnlocked(new_term)); + term_metric_->set_value(new_term); + return Status::OK(); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h new file mode 100644 index 000000000000..6f5e377532b1 --- /dev/null +++ b/src/kudu/consensus/raft_consensus.h @@ -0,0 +1,467 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CONSENSUS_RAFT_CONSENSUS_H_ +#define KUDU_CONSENSUS_RAFT_CONSENSUS_H_ + +#include +#include +#include +#include +#include + +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/consensus_queue.h" +#include "kudu/util/atomic.h" +#include "kudu/util/failure_detector.h" + +namespace kudu { + +typedef boost::lock_guard Lock; +typedef gscoped_ptr ScopedLock; + +class Counter; +class FailureDetector; +class HostPort; +class ThreadPool; + +namespace server { +class Clock; +} + +namespace rpc { +class Messenger; +} + +namespace consensus { +class ConsensusMetadata; +class Peer; +class PeerProxyFactory; +class PeerManager; +class ReplicaState; +struct ElectionResult; + +class RaftConsensus : public Consensus, + public PeerMessageQueueObserver { + public: + class ConsensusFaultHooks; + + static scoped_refptr Create( + const ConsensusOptions& options, + gscoped_ptr cmeta, + const RaftPeerPB& local_peer_pb, + const scoped_refptr& metric_entity, + const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, + const std::shared_ptr& messenger, + const scoped_refptr& log, + const std::shared_ptr& parent_mem_tracker, + const Callback& mark_dirty_clbk); + + RaftConsensus(const ConsensusOptions& options, + gscoped_ptr cmeta, + gscoped_ptr peer_proxy_factory, + gscoped_ptr queue, + gscoped_ptr peer_manager, + gscoped_ptr thread_pool, + const scoped_refptr& metric_entity, + const std::string& peer_uuid, + const scoped_refptr& clock, + ReplicaTransactionFactory* txn_factory, + const scoped_refptr& log, + std::shared_ptr parent_mem_tracker, + Callback mark_dirty_clbk); + + virtual ~RaftConsensus(); + + virtual Status Start(const ConsensusBootstrapInfo& info) OVERRIDE; + + virtual bool IsRunning() const OVERRIDE; + + // Emulates an election by increasing the 
term number and asserting leadership + // in the configuration by sending a NO_OP to other peers. + // This is NOT safe to use in a distributed configuration with failure detection + // enabled, as it could result in a split-brain scenario. + virtual Status EmulateElection() OVERRIDE; + + virtual Status StartElection(ElectionMode mode) OVERRIDE; + + virtual Status StepDown(LeaderStepDownResponsePB* resp) OVERRIDE; + + // Call StartElection(), log a warning if the call fails (usually due to + // being shut down). + void ReportFailureDetected(const std::string& name, const Status& msg); + + virtual Status Replicate(const scoped_refptr& round) OVERRIDE; + + virtual Status CheckLeadershipAndBindTerm(const scoped_refptr& round) OVERRIDE; + + virtual Status Update(const ConsensusRequestPB* request, + ConsensusResponsePB* response) OVERRIDE; + + virtual Status RequestVote(const VoteRequestPB* request, + VoteResponsePB* response) OVERRIDE; + + virtual Status ChangeConfig(const ChangeConfigRequestPB& req, + const StatusCallback& client_cb, + boost::optional* error_code) + OVERRIDE; + + virtual RaftPeerPB::Role role() const OVERRIDE; + + virtual std::string peer_uuid() const OVERRIDE; + + virtual std::string tablet_id() const OVERRIDE; + + virtual ConsensusStatePB ConsensusState(ConsensusConfigType type) const OVERRIDE; + + virtual RaftConfigPB CommittedConfig() const OVERRIDE; + + virtual void DumpStatusHtml(std::ostream& out) const OVERRIDE; + + virtual void Shutdown() OVERRIDE; + + // Makes this peer advance it's term (and step down if leader), for tests. + virtual Status AdvanceTermForTests(int64_t new_term); + + // Return the active (as opposed to committed) role. + RaftPeerPB::Role GetActiveRole() const; + + // Returns the replica state for tests. This should never be used outside of + // tests, in particular calling the LockFor* methods on the returned object + // can cause consensus to deadlock. 
+ ReplicaState* GetReplicaStateForTests(); + + // Updates the committed_index and triggers the Apply()s for whatever + // transactions were pending. + // This is idempotent. + void UpdateMajorityReplicated(const OpId& majority_replicated, + OpId* committed_index) OVERRIDE; + + virtual void NotifyTermChange(int64_t term) OVERRIDE; + + virtual void NotifyFailedFollower(const std::string& uuid, + int64_t term, + const std::string& reason) OVERRIDE; + + virtual Status GetLastOpId(OpIdType type, OpId* id) OVERRIDE; + + protected: + // Trigger that a non-Transaction ConsensusRound has finished replication. + // If the replication was successful, an status will be OK. Otherwise, it + // may be Aborted or some other error status. + // If 'status' is OK, write a Commit message to the local WAL based on the + // type of message it is. + // The 'client_cb' will be invoked at the end of this execution. + virtual void NonTxRoundReplicationFinished(ConsensusRound* round, + const StatusCallback& client_cb, + const Status& status); + + // As a leader, append a new ConsensusRond to the queue. + // Only virtual and protected for mocking purposes. + virtual Status AppendNewRoundToQueueUnlocked(const scoped_refptr& round); + + // As a follower, start a consensus round not associated with a Transaction. + // Only virtual and protected for mocking purposes. + virtual Status StartConsensusOnlyRoundUnlocked(const ReplicateRefPtr& msg); + + private: + friend class ReplicaState; + friend class RaftConsensusQuorumTest; + + // Control whether printing of log messages should be done for a particular + // function call. + enum AllowLogging { + DO_NOT_LOG = 0, + ALLOW_LOGGING = 1, + }; + + // Helper struct that contains the messages from the leader that we need to + // append to our log, after they've been deduplicated. 
+ struct LeaderRequest { + std::string leader_uuid; + const OpId* preceding_opid; + std::vector messages; + // The positional index of the first message selected to be appended, in the + // original leader's request message sequence. + int64_t first_message_idx; + + std::string OpsRangeString() const; + }; + + std::string LogPrefixUnlocked(); + + std::string LogPrefix(); + + // Set the leader UUID of the configuration and mark the tablet config dirty for + // reporting to the master. + void SetLeaderUuidUnlocked(const std::string& uuid); + + // Replicate (as leader) a pre-validated config change. This includes + // updating the peers and setting the new_configuration as pending. + // The old_configuration must be the currently-committed configuration. + Status ReplicateConfigChangeUnlocked(const RaftConfigPB& old_config, + const RaftConfigPB& new_config, + const StatusCallback& client_cb); + + // Update the peers and queue to be consistent with a new active configuration. + // Should only be called by the leader. + Status RefreshConsensusQueueAndPeersUnlocked(); + + // Makes the peer become leader. + // Returns OK once the change config transaction that has this peer as leader + // has been enqueued, the transaction will complete asynchronously. + // + // The ReplicaState must be locked for configuration change before calling. + Status BecomeLeaderUnlocked(); + + // Makes the peer become a replica, i.e. a FOLLOWER or a LEARNER. + // + // The ReplicaState must be locked for configuration change before calling. + Status BecomeReplicaUnlocked(); + + // Updates the state in a replica by storing the received operations in the log + // and triggering the required transactions. This method won't return until all + // operations have been stored in the log and all Prepares() have been completed, + // and a replica cannot accept any more Update() requests until this is done. 
+ Status UpdateReplica(const ConsensusRequestPB* request, + ConsensusResponsePB* response); + + // Deduplicates an RPC request making sure that we get only messages that we + // haven't appended to our log yet. + // On return 'deduplicated_req' is instantiated with only the new messages + // and the correct preceding id. + void DeduplicateLeaderRequestUnlocked(ConsensusRequestPB* rpc_req, + LeaderRequest* deduplicated_req); + + // Handles a request from a leader, refusing the request if the term is lower than + // ours or stepping down if it's higher. + Status HandleLeaderRequestTermUnlocked(const ConsensusRequestPB* request, + ConsensusResponsePB* response); + + // Checks that the preceding op in 'req' is locally committed or pending and sets an + // appropriate error message in 'response' if not. + // If there is term mismatch between the preceding op id in 'req' and the local log's + // pending operations, we proactively abort those pending operations after and including + // the preceding op in 'req' to avoid a pointless cache miss in the leader's log cache. + Status EnforceLogMatchingPropertyMatchesUnlocked(const LeaderRequest& req, + ConsensusResponsePB* response); + + // Check a request received from a leader, making sure: + // - The request is in the right term + // - The log matching property holds + // - Messages are de-duplicated so that we only process previously unprocessed requests. + // - We abort transactions if the leader sends transactions that have the same index as + // transactions currently on the pendings set, but different terms. + // If this returns ok and the response has no errors, 'deduped_req' is set with only + // the messages to add to our state machine. + Status CheckLeaderRequestUnlocked(const ConsensusRequestPB* request, + ConsensusResponsePB* response, + LeaderRequest* deduped_req); + + // Pushes a new Raft configuration to a majority of peers. 
Contrary to write operations, + // this actually waits for the commit round to reach a majority of peers, so that we know + // we can proceed. If this returns Status::OK(), a majority of peers have accepted the new + // configuration. The peer cannot perform any additional operations until this succeeds. + Status PushConfigurationToPeersUnlocked(const RaftConfigPB& new_config); + + // Returns the most recent OpId written to the Log. + OpId GetLatestOpIdFromLog(); + + // Begin a replica transaction. If the type of message in 'msg' is not a type + // that uses transactions, delegates to StartConsensusOnlyRoundUnlocked(). + Status StartReplicaTransactionUnlocked(const ReplicateRefPtr& msg); + + // Return header string for RequestVote log messages. The ReplicaState lock must be held. + std::string GetRequestVoteLogPrefixUnlocked() const; + + // Fills the response with the current status, if an update was successful. + void FillConsensusResponseOKUnlocked(ConsensusResponsePB* response); + + // Fills the response with an error code and error message. + void FillConsensusResponseError(ConsensusResponsePB* response, + ConsensusErrorPB::Code error_code, + const Status& status); + + // Fill VoteResponsePB with the following information: + // - Update responder_term to current local term. + // - Set vote_granted to true. + void FillVoteResponseVoteGranted(VoteResponsePB* response); + + // Fill VoteResponsePB with the following information: + // - Update responder_term to current local term. + // - Set vote_granted to false. + // - Set consensus_error.code to the given code. + void FillVoteResponseVoteDenied(ConsensusErrorPB::Code error_code, VoteResponsePB* response); + + // Respond to VoteRequest that the candidate has an old term. + Status RequestVoteRespondInvalidTerm(const VoteRequestPB* request, VoteResponsePB* response); + + // Respond to VoteRequest that we already granted our vote to the candidate. 
+ Status RequestVoteRespondVoteAlreadyGranted(const VoteRequestPB* request, + VoteResponsePB* response); + + // Respond to VoteRequest that we already granted our vote to someone else. + Status RequestVoteRespondAlreadyVotedForOther(const VoteRequestPB* request, + VoteResponsePB* response); + + // Respond to VoteRequest that the candidate's last-logged OpId is too old. + Status RequestVoteRespondLastOpIdTooOld(const OpId& local_last_opid, + const VoteRequestPB* request, + VoteResponsePB* response); + + // Respond to VoteRequest that the vote was not granted because we believe + // the leader to be alive. + Status RequestVoteRespondLeaderIsAlive(const VoteRequestPB* request, + VoteResponsePB* response); + + // Respond to VoteRequest that the replica is already in the middle of servicing + // another vote request or an update from a valid leader. + Status RequestVoteRespondIsBusy(const VoteRequestPB* request, + VoteResponsePB* response); + + // Respond to VoteRequest that the vote is granted for candidate. + Status RequestVoteRespondVoteGranted(const VoteRequestPB* request, + VoteResponsePB* response); + + void UpdateMajorityReplicatedUnlocked(const OpId& majority_replicated, + OpId* committed_index); + + // Callback for leader election driver. ElectionCallback is run on the + // reactor thread, so it simply defers its work to DoElectionCallback. + void ElectionCallback(const ElectionResult& result); + void DoElectionCallback(const ElectionResult& result); + + // Start tracking the leader for failures. This typically occurs at startup + // and when the local peer steps down as leader. + // If the failure detector is already registered, has no effect. + Status EnsureFailureDetectorEnabledUnlocked(); + + // Untrack the current leader from failure detector. + // This typically happens when the local peer becomes leader. + // If the failure detector is already unregistered, has no effect. 
+ Status EnsureFailureDetectorDisabledUnlocked(); + + // Set the failure detector to an "expired" state, so that the next time + // the failure monitor runs it triggers an election. + // This is primarily intended to be used at startup time. + Status ExpireFailureDetectorUnlocked(); + + // "Reset" the failure detector to indicate leader activity. + // The failure detector must currently be enabled. + // When this is called a failure is guaranteed not to be detected + // before 'FLAGS_leader_failure_max_missed_heartbeat_periods' * + // 'FLAGS_raft_heartbeat_interval_ms' has elapsed. + Status SnoozeFailureDetectorUnlocked(); + + // Like the above but adds 'additional_delta' to the default timeout + // period. If allow_logging is set to ALLOW_LOGGING, then this method + // will print a log message when called. + Status SnoozeFailureDetectorUnlocked(const MonoDelta& additional_delta, + AllowLogging allow_logging); + + // Return the minimum election timeout. Due to backoff and random + // jitter, election timeouts may be longer than this. + MonoDelta MinimumElectionTimeout() const; + + // Calculates an additional snooze delta for leader election. + // The additional delta increases exponentially with the difference + // between the current term and the term of the last committed + // operation. + // The maximum delta is capped by 'FLAGS_leader_failure_exp_backoff_max_delta_ms'. + MonoDelta LeaderElectionExpBackoffDeltaUnlocked(); + + // Increment the term to the next term, resetting the current leader, etc. + Status IncrementTermUnlocked(); + + // Handle when the term has advanced beyond the current term. + Status HandleTermAdvanceUnlocked(ConsensusTerm new_term); + + // Asynchronously (on thread_pool_) notify the tablet peer that the consensus configuration + // has changed, thus reporting it back to the master. + void MarkDirty(const std::string& reason); + + // Calls MarkDirty() if 'status' == OK. Then, always calls 'client_cb' with + // 'status' as its argument. 
+ void MarkDirtyOnSuccess(const std::string& reason, + const StatusCallback& client_cb, + const Status& status); + + // Attempt to remove the follower with the specified 'uuid' from the config, + // if the 'committed_config' is still the committed config and if the current + // node is the leader. + // + // Since this is inherently an asynchronous operation run on a thread pool, + // it may fail due to the configuration changing, the local node losing + // leadership, or the tablet shutting down. + // Logs a warning on failure. + void TryRemoveFollowerTask(const std::string& uuid, + const RaftConfigPB& committed_config, + const std::string& reason); + + // Threadpool for constructing requests to peers, handling RPC callbacks, + // etc. + gscoped_ptr thread_pool_; + + scoped_refptr log_; + scoped_refptr clock_; + gscoped_ptr peer_proxy_factory_; + + gscoped_ptr peer_manager_; + + // The queue of messages that must be sent to peers. + gscoped_ptr queue_; + + gscoped_ptr state_; + + Random rng_; + + // TODO: Plumb this from ServerBase. + RandomizedFailureMonitor failure_monitor_; + + scoped_refptr failure_detector_; + + // If any RequestVote() RPC arrives before this timestamp, + // the request will be ignored. This prevents abandoned or partitioned + // nodes from disturbing the healthy leader. + MonoTime withhold_votes_until_; + + const Callback mark_dirty_clbk_; + + // TODO hack to serialize updates due to repeated/out-of-order messages + // should probably be refactored out. + // + // Lock ordering note: If both this lock and the ReplicaState lock are to be + // taken, this lock must be taken first. 
+ mutable simple_spinlock update_lock_; + + AtomicBool shutdown_; + + scoped_refptr follower_memory_pressure_rejections_; + scoped_refptr > term_metric_; + + std::shared_ptr parent_mem_tracker_; + + DISALLOW_COPY_AND_ASSIGN(RaftConsensus); +}; + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_RAFT_CONSENSUS_H_ */ diff --git a/src/kudu/consensus/raft_consensus_quorum-test.cc b/src/kudu/consensus/raft_consensus_quorum-test.cc new file mode 100644 index 000000000000..1e6386188caf --- /dev/null +++ b/src/kudu/consensus/raft_consensus_quorum-test.cc @@ -0,0 +1,1124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_index.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/peer_manager.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/consensus/raft_consensus.h" +#include "kudu/consensus/raft_consensus_state.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/server/metadata.h" +#include "kudu/server/logical_clock.h" +#include "kudu/util/auto_release_pool.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +DECLARE_int32(raft_heartbeat_interval_ms); +DECLARE_bool(enable_leader_failure_detection); + +METRIC_DECLARE_entity(tablet); + +#define REPLICATE_SEQUENCE_OF_MESSAGES(a, b, c, d, e, f, g) \ + ASSERT_NO_FATAL_FAILURE(ReplicateSequenceOfMessages(a, b, c, d, e, f, g)) + +using std::shared_ptr; + +namespace kudu { + +namespace rpc { +class RpcContext; +} +namespace consensus { + +using log::Log; +using log::LogEntryPB; +using log::LogOptions; +using log::LogReader; +using rpc::RpcContext; +using strings::Substitute; +using strings::SubstituteAndAppend; + +const char* kTestTablet = "TestTablet"; + +void DoNothing(const string& s) { +} + +Status WaitUntilLeaderForTests(RaftConsensus* raft) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(15)); + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + if (raft->GetActiveRole() == RaftPeerPB::LEADER) { + return Status::OK(); + } + 
SleepFor(MonoDelta::FromMilliseconds(10)); + } + + return Status::TimedOut("Timed out waiting to become leader"); +} + +// Test suite for tests that focus on multiple peer interaction, but +// without integrating with other components, such as transactions. +class RaftConsensusQuorumTest : public KuduTest { + public: + RaftConsensusQuorumTest() + : clock_(server::LogicalClock::CreateStartingAt(Timestamp(0))), + metric_entity_(METRIC_ENTITY_tablet.Instantiate(&metric_registry_, "raft-test")), + schema_(GetSimpleTestSchema()) { + options_.tablet_id = kTestTablet; + FLAGS_enable_leader_failure_detection = false; + } + + + // Builds an initial configuration of 'num' elements. + // All of the peers start as followers. + void BuildInitialRaftConfigPB(int num) { + config_ = BuildRaftConfigPBForTests(num); + config_.set_opid_index(kInvalidOpIdIndex); + peers_.reset(new TestPeerMapManager(config_)); + } + + Status BuildFsManagersAndLogs() { + // Build the fsmanagers and logs + for (int i = 0; i < config_.peers_size(); i++) { + shared_ptr parent_mem_tracker = + MemTracker::CreateTracker(-1, Substitute("peer-$0", i)); + parent_mem_trackers_.push_back(parent_mem_tracker); + string test_path = GetTestPath(Substitute("peer-$0-root", i)); + FsManagerOpts opts; + opts.parent_mem_tracker = parent_mem_tracker; + opts.wal_path = test_path; + opts.data_paths = { test_path }; + gscoped_ptr fs_manager(new FsManager(env_.get(), opts)); + RETURN_NOT_OK(fs_manager->CreateInitialFileSystemLayout()); + RETURN_NOT_OK(fs_manager->Open()); + + scoped_refptr log; + RETURN_NOT_OK(Log::Open(LogOptions(), + fs_manager.get(), + kTestTablet, + schema_, + 0, // schema_version + NULL, + &log)); + logs_.push_back(log.get()); + fs_managers_.push_back(fs_manager.release()); + } + return Status::OK(); + } + + void BuildPeers() { + vector proxy_factories; + for (int i = 0; i < config_.peers_size(); i++) { + auto proxy_factory = new LocalTestPeerProxyFactory(peers_.get()); + 
proxy_factories.push_back(proxy_factory); + + auto txn_factory = new TestTransactionFactory(logs_[i].get()); + + string peer_uuid = Substitute("peer-$0", i); + + gscoped_ptr cmeta; + CHECK_OK(ConsensusMetadata::Create(fs_managers_[i], kTestTablet, peer_uuid, config_, + kMinimumTerm, &cmeta)); + + RaftPeerPB local_peer_pb; + CHECK_OK(GetRaftConfigMember(config_, peer_uuid, &local_peer_pb)); + gscoped_ptr queue(new PeerMessageQueue(metric_entity_, + logs_[i], + local_peer_pb, + kTestTablet)); + + gscoped_ptr thread_pool; + CHECK_OK(ThreadPoolBuilder(Substitute("$0-raft", options_.tablet_id.substr(0, 6))) + .Build(&thread_pool)); + + gscoped_ptr peer_manager( + new PeerManager(options_.tablet_id, + config_.peers(i).permanent_uuid(), + proxy_factory, + queue.get(), + thread_pool.get(), + logs_[i])); + + scoped_refptr peer( + new RaftConsensus(options_, + cmeta.Pass(), + gscoped_ptr(proxy_factory).Pass(), + queue.Pass(), + peer_manager.Pass(), + thread_pool.Pass(), + metric_entity_, + config_.peers(i).permanent_uuid(), + clock_, + txn_factory, + logs_[i], + parent_mem_trackers_[i], + Bind(&DoNothing))); + + txn_factory->SetConsensus(peer.get()); + txn_factories_.push_back(txn_factory); + peers_->AddPeer(config_.peers(i).permanent_uuid(), peer); + } + } + + Status StartPeers() { + ConsensusBootstrapInfo boot_info; + + TestPeerMap all_peers = peers_->GetPeerMapCopy(); + for (const TestPeerMap::value_type& entry : all_peers) { + RETURN_NOT_OK(entry.second->Start(boot_info)); + } + return Status::OK(); + } + + Status BuildConfig(int num) { + BuildInitialRaftConfigPB(num); + RETURN_NOT_OK(BuildFsManagersAndLogs()); + BuildPeers(); + return Status::OK(); + } + + Status BuildAndStartConfig(int num) { + RETURN_NOT_OK(BuildConfig(num)); + RETURN_NOT_OK(StartPeers()); + + // Automatically elect the last node in the list. 
+ const int kLeaderIdx = num - 1; + scoped_refptr leader; + RETURN_NOT_OK(peers_->GetPeerByIdx(kLeaderIdx, &leader)); + RETURN_NOT_OK(leader->EmulateElection()); + return Status::OK(); + } + + LocalTestPeerProxy* GetLeaderProxyToPeer(int peer_idx, int leader_idx) { + scoped_refptr follower; + CHECK_OK(peers_->GetPeerByIdx(peer_idx, &follower)); + scoped_refptr leader; + CHECK_OK(peers_->GetPeerByIdx(leader_idx, &leader)); + for (LocalTestPeerProxy* proxy : down_cast( + leader->peer_proxy_factory_.get())->GetProxies()) { + if (proxy->GetTarget() == follower->peer_uuid()) { + return proxy; + } + } + CHECK(false) << "Proxy not found"; + return nullptr; + } + + Status AppendDummyMessage(int peer_idx, + scoped_refptr* round) { + gscoped_ptr msg(new ReplicateMsg()); + msg->set_op_type(NO_OP); + msg->mutable_noop_request(); + msg->set_timestamp(clock_->Now().ToUint64()); + + scoped_refptr peer; + CHECK_OK(peers_->GetPeerByIdx(peer_idx, &peer)); + + // Use a latch in place of a Transaction callback. 
+ gscoped_ptr sync(new Synchronizer()); + *round = peer->NewRound(msg.Pass(), sync->AsStatusCallback()); + InsertOrDie(&syncs_, round->get(), sync.release()); + RETURN_NOT_OK_PREPEND(peer->Replicate(round->get()), + Substitute("Unable to replicate to peer $0", peer_idx)); + return Status::OK(); + } + + static void FireSharedSynchronizer(const shared_ptr& sync, const Status& s) { + sync->StatusCB(s); + } + + Status CommitDummyMessage(int peer_idx, + ConsensusRound* round, + shared_ptr* commit_sync = nullptr) { + StatusCallback commit_callback; + if (commit_sync != nullptr) { + commit_sync->reset(new Synchronizer()); + commit_callback = Bind(&FireSharedSynchronizer, *commit_sync); + } else { + commit_callback = Bind(&DoNothingStatusCB); + } + + gscoped_ptr msg(new CommitMsg()); + msg->set_op_type(NO_OP); + msg->mutable_commited_op_id()->CopyFrom(round->id()); + CHECK_OK(logs_[peer_idx]->AsyncAppendCommit(msg.Pass(), commit_callback)); + return Status::OK(); + } + + Status WaitForReplicate(ConsensusRound* round) { + return FindOrDie(syncs_, round)->Wait(); + } + + Status TimedWaitForReplicate(ConsensusRound* round, const MonoDelta& delta) { + return FindOrDie(syncs_, round)->WaitFor(delta); + } + + void WaitForReplicateIfNotAlreadyPresent(const OpId& to_wait_for, int peer_idx) { + scoped_refptr peer; + CHECK_OK(peers_->GetPeerByIdx(peer_idx, &peer)); + ReplicaState* state = peer->GetReplicaStateForTests(); + while (true) { + { + ReplicaState::UniqueLock lock; + CHECK_OK(state->LockForRead(&lock)); + if (OpIdCompare(state->GetLastReceivedOpIdUnlocked(), to_wait_for) >= 0) { + return; + } + } + SleepFor(MonoDelta::FromMilliseconds(1)); + } + } + + // Waits for an operation to be (database) committed in the replica at index + // 'peer_idx'. If the operation was already committed this returns immediately. 
+ void WaitForCommitIfNotAlreadyPresent(const OpId& to_wait_for, + int peer_idx, + int leader_idx) { + MonoDelta timeout(MonoDelta::FromSeconds(10)); + MonoTime start(MonoTime::Now(MonoTime::FINE)); + + scoped_refptr peer; + CHECK_OK(peers_->GetPeerByIdx(peer_idx, &peer)); + ReplicaState* state = peer->GetReplicaStateForTests(); + + int backoff_exp = 0; + const int kMaxBackoffExp = 8; + OpId committed_op_id; + while (true) { + { + ReplicaState::UniqueLock lock; + CHECK_OK(state->LockForRead(&lock)); + committed_op_id = state->GetCommittedOpIdUnlocked(); + if (OpIdCompare(committed_op_id, to_wait_for) >= 0) { + return; + } + } + MonoDelta elapsed = MonoTime::Now(MonoTime::FINE).GetDeltaSince(start); + if (elapsed.MoreThan(timeout)) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(1 << backoff_exp)); + backoff_exp = std::min(backoff_exp + 1, kMaxBackoffExp); + } + + LOG(ERROR) << "Max timeout reached (" << timeout.ToString() << ") while waiting for commit of " + << "op " << to_wait_for << " on replica. Last committed op on replica: " + << committed_op_id << ". Dumping state and quitting."; + vector lines; + scoped_refptr leader; + CHECK_OK(peers_->GetPeerByIdx(leader_idx, &leader)); + for (const string& line : lines) { + LOG(ERROR) << line; + } + + // Gather the replica and leader operations for printing + vector replica_ops; + ElementDeleter repl0_deleter(&replica_ops); + GatherLogEntries(peer_idx, logs_[peer_idx], &replica_ops); + vector leader_ops; + ElementDeleter leader_deleter(&leader_ops); + GatherLogEntries(leader_idx, logs_[leader_idx], &leader_ops); + SCOPED_TRACE(PrintOnError(replica_ops, Substitute("local peer ($0)", peer->peer_uuid()))); + SCOPED_TRACE(PrintOnError(leader_ops, Substitute("leader (peer-$0)", leader_idx))); + FAIL() << "Replica did not commit."; + } + + // Used in ReplicateSequenceOfMessages() to specify whether + // we should wait for all replicas to have replicated the + // sequence or just a majority. 
+ enum ReplicateWaitMode { + WAIT_FOR_ALL_REPLICAS, + WAIT_FOR_MAJORITY + }; + + // Used in ReplicateSequenceOfMessages() to specify whether + // we should also commit the messages in the sequence + enum CommitMode { + DONT_COMMIT, + COMMIT_ONE_BY_ONE + }; + + // Replicates a sequence of messages to the peer passed as leader. + // Optionally waits for the messages to be replicated to followers. + // 'last_op_id' is set to the id of the last replicated operation. + // The operations are only committed if 'commit_one_by_one' is true. + void ReplicateSequenceOfMessages(int seq_size, + int leader_idx, + ReplicateWaitMode wait_mode, + CommitMode commit_mode, + OpId* last_op_id, + vector >* rounds, + shared_ptr* commit_sync = nullptr) { + for (int i = 0; i < seq_size; i++) { + scoped_refptr round; + ASSERT_OK(AppendDummyMessage(leader_idx, &round)); + ASSERT_OK(WaitForReplicate(round.get())); + last_op_id->CopyFrom(round->id()); + if (commit_mode == COMMIT_ONE_BY_ONE) { + CommitDummyMessage(leader_idx, round.get(), commit_sync); + } + rounds->push_back(round); + } + + if (wait_mode == WAIT_FOR_ALL_REPLICAS) { + scoped_refptr leader; + CHECK_OK(peers_->GetPeerByIdx(leader_idx, &leader)); + + TestPeerMap all_peers = peers_->GetPeerMapCopy(); + int i = 0; + for (const TestPeerMap::value_type& entry : all_peers) { + if (entry.second->peer_uuid() != leader->peer_uuid()) { + WaitForReplicateIfNotAlreadyPresent(*last_op_id, i); + } + i++; + } + } + } + + void GatherLogEntries(int idx, const scoped_refptr& log, vector* entries) { + ASSERT_OK(log->WaitUntilAllFlushed()); + log->Close(); + gscoped_ptr log_reader; + ASSERT_OK(log::LogReader::Open(fs_managers_[idx], + scoped_refptr(), + kTestTablet, + metric_entity_.get(), + &log_reader)); + vector ret; + ElementDeleter deleter(&ret); + log::SegmentSequence segments; + ASSERT_OK(log_reader->GetSegmentsSnapshot(&segments)); + + for (const log::SegmentSequence::value_type& entry : segments) { + ASSERT_OK(entry->ReadEntries(&ret)); + } 
+ + entries->swap(ret); + } + + // Verifies that the replica's log match the leader's. This deletes the + // peers (so we're sure that no further writes occur) and closes the logs + // so it must be the very last thing to run, in a test. + void VerifyLogs(int leader_idx, int first_replica_idx, int last_replica_idx) { + // Wait for in-flight transactions to be done. We're destroying the + // peers next and leader transactions won't be able to commit anymore. + for (TestTransactionFactory* factory : txn_factories_) { + factory->WaitDone(); + } + + // Shut down all the peers. + TestPeerMap all_peers = peers_->GetPeerMapCopy(); + for (const TestPeerMap::value_type& entry : all_peers) { + entry.second->Shutdown(); + } + + vector leader_entries; + ElementDeleter leader_entry_deleter(&leader_entries); + GatherLogEntries(leader_idx, logs_[leader_idx], &leader_entries); + scoped_refptr leader; + CHECK_OK(peers_->GetPeerByIdx(leader_idx, &leader)); + + for (int replica_idx = first_replica_idx; replica_idx < last_replica_idx; replica_idx++) { + vector replica_entries; + ElementDeleter replica_entry_deleter(&replica_entries); + GatherLogEntries(replica_idx, logs_[replica_idx], &replica_entries); + + scoped_refptr replica; + CHECK_OK(peers_->GetPeerByIdx(replica_idx, &replica)); + VerifyReplica(leader_entries, + replica_entries, + leader->peer_uuid(), + replica->peer_uuid()); + } + } + + void ExtractReplicateIds(const vector& entries, + vector* ids) { + ids->reserve(entries.size() / 2); + for (const LogEntryPB* entry : entries) { + if (entry->has_replicate()) { + ids->push_back(entry->replicate().id()); + } + } + } + + void VerifyReplicateOrderMatches(const vector& leader_entries, + const vector& replica_entries) { + vector leader_ids, replica_ids; + ExtractReplicateIds(leader_entries, &leader_ids); + ExtractReplicateIds(replica_entries, &replica_ids); + ASSERT_EQ(leader_ids.size(), replica_ids.size()); + for (int i = 0; i < leader_ids.size(); i++) { + 
ASSERT_EQ(leader_ids[i].ShortDebugString(), + replica_ids[i].ShortDebugString()); + } + } + + void VerifyNoCommitsBeforeReplicates(const vector& entries) { + unordered_set replication_ops; + + for (const LogEntryPB* entry : entries) { + if (entry->has_replicate()) { + ASSERT_TRUE(InsertIfNotPresent(&replication_ops, entry->replicate().id())) + << "REPLICATE op id showed up twice: " << entry->ShortDebugString(); + } else if (entry->has_commit()) { + ASSERT_EQ(1, replication_ops.erase(entry->commit().commited_op_id())) + << "COMMIT came before associated REPLICATE: " << entry->ShortDebugString(); + } + } + } + + void VerifyReplica(const vector& leader_entries, + const vector& replica_entries, + const string& leader_name, + const string& replica_name) { + SCOPED_TRACE(PrintOnError(leader_entries, Substitute("Leader: $0", leader_name))); + SCOPED_TRACE(PrintOnError(replica_entries, Substitute("Replica: $0", replica_name))); + + // Check that the REPLICATE messages come in the same order on both nodes. + VerifyReplicateOrderMatches(leader_entries, replica_entries); + + // Check that no COMMIT precedes its related REPLICATE on both the replica + // and leader. + VerifyNoCommitsBeforeReplicates(replica_entries); + VerifyNoCommitsBeforeReplicates(leader_entries); + } + + string PrintOnError(const vector& replica_entries, + const string& replica_id) { + string ret = ""; + SubstituteAndAppend(&ret, "$1 log entries for replica $0:\n", + replica_id, replica_entries.size()); + for (LogEntryPB* replica_entry : replica_entries) { + StrAppend(&ret, "Replica log entry: ", replica_entry->ShortDebugString(), "\n"); + } + return ret; + } + + // Read the ConsensusMetadata for the given peer from disk. 
+ gscoped_ptr ReadConsensusMetadataFromDisk(int peer_index) { + string peer_uuid = Substitute("peer-$0", peer_index); + gscoped_ptr cmeta; + CHECK_OK(ConsensusMetadata::Load(fs_managers_[peer_index], kTestTablet, peer_uuid, &cmeta)); + return cmeta.Pass(); + } + + // Assert that the durable term == term and that the peer that got the vote == voted_for. + void AssertDurableTermAndVote(int peer_index, int64_t term, const std::string& voted_for) { + gscoped_ptr cmeta = ReadConsensusMetadataFromDisk(peer_index); + ASSERT_EQ(term, cmeta->current_term()); + ASSERT_EQ(voted_for, cmeta->voted_for()); + } + + // Assert that the durable term == term and that the peer has not yet voted. + void AssertDurableTermWithoutVote(int peer_index, int64_t term) { + gscoped_ptr cmeta = ReadConsensusMetadataFromDisk(peer_index); + ASSERT_EQ(term, cmeta->current_term()); + ASSERT_FALSE(cmeta->has_voted_for()); + } + + ~RaftConsensusQuorumTest() { + peers_->Clear(); + STLDeleteElements(&txn_factories_); + // We need to clear the logs before deleting the fs_managers_ or we'll + // get a SIGSEGV when closing the logs. + logs_.clear(); + STLDeleteElements(&fs_managers_); + STLDeleteValues(&syncs_); + } + + protected: + ConsensusOptions options_; + RaftConfigPB config_; + OpId initial_id_; + vector > parent_mem_trackers_; + vector fs_managers_; + vector > logs_; + gscoped_ptr peers_; + vector txn_factories_; + scoped_refptr clock_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + const Schema schema_; + unordered_map syncs_; +}; + +// Tests Replicate/Commit a single message through the leader. +TEST_F(RaftConsensusQuorumTest, TestFollowersReplicateAndCommitMessage) { + // Constants with the indexes of peers with certain roles, + // since peers don't change roles in this test. 
+ const int kFollower0Idx = 0; + const int kFollower1Idx = 1; + const int kLeaderIdx = 2; + + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_op_id; + vector > rounds; + shared_ptr commit_sync; + REPLICATE_SEQUENCE_OF_MESSAGES(1, + kLeaderIdx, + WAIT_FOR_ALL_REPLICAS, + DONT_COMMIT, + &last_op_id, + &rounds, + &commit_sync); + + // Commit the operation + ASSERT_OK(CommitDummyMessage(kLeaderIdx, rounds[0].get(), &commit_sync)); + + // Wait for everyone to commit the operations. + + // We need to make sure the CommitMsg lands on the leaders log or the + // verification will fail. Since CommitMsgs are appended to the replication + // queue there is a scenario where they land in the followers log before + // landing on the leader's log. However we know that they are durable + // on the leader when the commit callback is triggered. + // We thus wait for the commit callback to trigger, ensuring durability + // on the leader and then for the commits to be present on the replicas. + ASSERT_OK(commit_sync->Wait()); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower0Idx, kLeaderIdx); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower1Idx, kLeaderIdx); + VerifyLogs(2, 0, 1); +} + +// Tests Replicate/Commit a sequence of messages through the leader. +// First a sequence of replicates and then a sequence of commits. +TEST_F(RaftConsensusQuorumTest, TestFollowersReplicateAndCommitSequence) { + // Constants with the indexes of peers with certain roles, + // since peers don't change roles in this test. + const int kFollower0Idx = 0; + const int kFollower1Idx = 1; + const int kLeaderIdx = 2; + + int seq_size = AllowSlowTests() ? 
1000 : 100; + + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_op_id; + vector > rounds; + shared_ptr commit_sync; + + REPLICATE_SEQUENCE_OF_MESSAGES(seq_size, + kLeaderIdx, + WAIT_FOR_ALL_REPLICAS, + DONT_COMMIT, + &last_op_id, + &rounds, + &commit_sync); + + // Commit the operations, but wait for the replicates to finish first + for (const scoped_refptr& round : rounds) { + ASSERT_OK(CommitDummyMessage(kLeaderIdx, round.get(), &commit_sync)); + } + + // See comment at the end of TestFollowersReplicateAndCommitMessage + // for an explanation on this waiting sequence. + ASSERT_OK(commit_sync->Wait()); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower0Idx, kLeaderIdx); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower1Idx, kLeaderIdx); + VerifyLogs(2, 0, 1); +} + +TEST_F(RaftConsensusQuorumTest, TestConsensusContinuesIfAMinorityFallsBehind) { + // Constants with the indexes of peers with certain roles, + // since peers don't change roles in this test. + const int kFollower0Idx = 0; + const int kFollower1Idx = 1; + const int kLeaderIdx = 2; + + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_replicate; + vector > rounds; + { + // lock one of the replicas down by obtaining the state lock + // and never letting it go. + scoped_refptr follower0; + CHECK_OK(peers_->GetPeerByIdx(kFollower0Idx, &follower0)); + + ReplicaState* follower0_rs = follower0->GetReplicaStateForTests(); + ReplicaState::UniqueLock lock; + ASSERT_OK(follower0_rs->LockForRead(&lock)); + + // If the locked replica would stop consensus we would hang here + // as we wait for operations to be replicated to a majority. + ASSERT_NO_FATAL_FAILURE(ReplicateSequenceOfMessages( + 10, + kLeaderIdx, + WAIT_FOR_MAJORITY, + COMMIT_ONE_BY_ONE, + &last_replicate, + &rounds)); + + // Follower 1 should be fine (Were we to wait for follower0's replicate + // this would hang here). We know he must have replicated but make sure + // by calling Wait(). 
+ WaitForReplicateIfNotAlreadyPresent(last_replicate, kFollower1Idx); + WaitForCommitIfNotAlreadyPresent(last_replicate, kFollower1Idx, kLeaderIdx); + } + + // After we let the lock go the remaining follower should get up-to-date + WaitForReplicateIfNotAlreadyPresent(last_replicate, kFollower0Idx); + WaitForCommitIfNotAlreadyPresent(last_replicate, kFollower0Idx, kLeaderIdx); + VerifyLogs(2, 0, 1); +} + +TEST_F(RaftConsensusQuorumTest, TestConsensusStopsIfAMajorityFallsBehind) { + // Constants with the indexes of peers with certain roles, + // since peers don't change roles in this test. + const int kFollower0Idx = 0; + const int kFollower1Idx = 1; + const int kLeaderIdx = 2; + + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_op_id; + + scoped_refptr round; + { + // lock two of the replicas down by obtaining the state locks + // and never letting them go. + scoped_refptr follower0; + CHECK_OK(peers_->GetPeerByIdx(kFollower0Idx, &follower0)); + ReplicaState* follower0_rs = follower0->GetReplicaStateForTests(); + ReplicaState::UniqueLock lock0; + ASSERT_OK(follower0_rs->LockForRead(&lock0)); + + scoped_refptr follower1; + CHECK_OK(peers_->GetPeerByIdx(kFollower1Idx, &follower1)); + ReplicaState* follower1_rs = follower1->GetReplicaStateForTests(); + ReplicaState::UniqueLock lock1; + ASSERT_OK(follower1_rs->LockForRead(&lock1)); + + // Append a single message to the queue + ASSERT_OK(AppendDummyMessage(kLeaderIdx, &round)); + last_op_id.CopyFrom(round->id()); + // This should timeout. + Status status = TimedWaitForReplicate(round.get(), MonoDelta::FromMilliseconds(500)); + ASSERT_TRUE(status.IsTimedOut()); + } + + // After we release the locks the operation should replicate to all replicas + // and we commit. 
+ ASSERT_OK(WaitForReplicate(round.get())); + CommitDummyMessage(kLeaderIdx, round.get()); + + // Assert that everything was ok + WaitForReplicateIfNotAlreadyPresent(last_op_id, kFollower0Idx); + WaitForReplicateIfNotAlreadyPresent(last_op_id, kFollower1Idx); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower0Idx, kLeaderIdx); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower1Idx, kLeaderIdx); + VerifyLogs(2, 0, 1); +} + +// If some communication error happens the leader will resend the request to the +// peers. This tests that the peers handle repeated requests. +TEST_F(RaftConsensusQuorumTest, TestReplicasHandleCommunicationErrors) { + // Constants with the indexes of peers with certain roles, + // since peers don't change roles in this test. + const int kFollower0Idx = 0; + const int kFollower1Idx = 1; + const int kLeaderIdx = 2; + + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_op_id; + + // Append a dummy message, with faults injected on the first attempt + // to send the message. + scoped_refptr round; + GetLeaderProxyToPeer(kFollower0Idx, kLeaderIdx)->InjectCommFaultLeaderSide(); + GetLeaderProxyToPeer(kFollower1Idx, kLeaderIdx)->InjectCommFaultLeaderSide(); + ASSERT_OK(AppendDummyMessage(kLeaderIdx, &round)); + + // We should successfully replicate it due to retries. + ASSERT_OK(WaitForReplicate(round.get())); + + GetLeaderProxyToPeer(kFollower0Idx, kLeaderIdx)->InjectCommFaultLeaderSide(); + GetLeaderProxyToPeer(kFollower1Idx, kLeaderIdx)->InjectCommFaultLeaderSide(); + ASSERT_OK(CommitDummyMessage(kLeaderIdx, round.get())); + + // The commit should eventually reach both followers as well. + last_op_id = round->id(); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower0Idx, kLeaderIdx); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower1Idx, kLeaderIdx); + + // Append a sequence of messages, and keep injecting errors into the + // replica proxies. 
+ vector > rounds; + shared_ptr commit_sync; + for (int i = 0; i < 100; i++) { + scoped_refptr round; + ASSERT_OK(AppendDummyMessage(kLeaderIdx, &round)); + ConsensusRound* round_ptr = round.get(); + last_op_id.CopyFrom(round->id()); + rounds.push_back(round); + + // inject comm faults + if (i % 2 == 0) { + GetLeaderProxyToPeer(kFollower0Idx, kLeaderIdx)->InjectCommFaultLeaderSide(); + } else { + GetLeaderProxyToPeer(kFollower1Idx, kLeaderIdx)->InjectCommFaultLeaderSide(); + } + + ASSERT_OK(WaitForReplicate(round_ptr)); + ASSERT_OK(CommitDummyMessage(kLeaderIdx, round_ptr, &commit_sync)); + } + + // Assert last operation was correctly replicated and committed. + WaitForReplicateIfNotAlreadyPresent(last_op_id, kFollower0Idx); + WaitForReplicateIfNotAlreadyPresent(last_op_id, kFollower1Idx); + + // See comment at the end of TestFollowersReplicateAndCommitMessage + // for an explanation on this waiting sequence. + ASSERT_OK(commit_sync->Wait()); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower0Idx, kLeaderIdx); + WaitForCommitIfNotAlreadyPresent(last_op_id, kFollower1Idx, kLeaderIdx); + VerifyLogs(2, 0, 1); +} + +// In this test we test the ability of the leader to send heartbeats +// to replicas by simply pushing nothing after the configuration round +// and still expecting for the replicas Update() hooks to be called. +TEST_F(RaftConsensusQuorumTest, TestLeaderHeartbeats) { + // Constants with the indexes of peers with certain roles, + // since peers don't change roles in this test. 
+ const int kFollower0Idx = 0; + const int kFollower1Idx = 1; + const int kLeaderIdx = 2; + + ASSERT_OK(BuildConfig(3)); + + scoped_refptr follower0; + CHECK_OK(peers_->GetPeerByIdx(kFollower0Idx, &follower0)); + scoped_refptr follower1; + CHECK_OK(peers_->GetPeerByIdx(kFollower1Idx, &follower1)); + + shared_ptr counter_hook_rpl0( + new CounterHooks(follower0->GetFaultHooks())); + shared_ptr counter_hook_rpl1( + new CounterHooks(follower1->GetFaultHooks())); + + // Replace the default fault hooks on the replicas with counter hooks + // before we start the configuration. + follower0->SetFaultHooks(counter_hook_rpl0); + follower1->SetFaultHooks(counter_hook_rpl1); + + ASSERT_OK(StartPeers()); + + scoped_refptr leader; + CHECK_OK(peers_->GetPeerByIdx(kLeaderIdx, &leader)); + ASSERT_OK(leader->EmulateElection()); + + // Wait for the config round to get committed and count the number + // of update calls, calls after that will be heartbeats. + OpId config_round; + config_round.set_term(1); + config_round.set_index(1); + WaitForCommitIfNotAlreadyPresent(config_round, kFollower0Idx, kLeaderIdx); + WaitForCommitIfNotAlreadyPresent(config_round, kFollower1Idx, kLeaderIdx); + + int repl0_init_count = counter_hook_rpl0->num_pre_update_calls(); + int repl1_init_count = counter_hook_rpl1->num_pre_update_calls(); + + // Now wait for about 4 times the hearbeat period the counters + // should have increased 3/4 times. 
+ SleepFor(MonoDelta::FromMilliseconds(FLAGS_raft_heartbeat_interval_ms * 4)); + + int repl0_final_count = counter_hook_rpl0->num_pre_update_calls(); + int repl1_final_count = counter_hook_rpl1->num_pre_update_calls(); + + ASSERT_GE(repl0_final_count - repl0_init_count, 3); + ASSERT_LE(repl0_final_count - repl0_init_count, 4); + ASSERT_GE(repl1_final_count - repl1_init_count, 3); + ASSERT_LE(repl1_final_count - repl1_init_count, 4); + + VerifyLogs(2, 0, 1); +} + +// After creating the initial configuration, this test writes a small sequence +// of messages to the initial leader. It then shuts down the current +// leader, makes another peer become leader and writes a sequence of +// messages to it. The new leader and the follower should agree on the +// sequence of messages. +TEST_F(RaftConsensusQuorumTest, TestLeaderElectionWithQuiescedQuorum) { + const int kInitialNumPeers = 5; + ASSERT_OK(BuildAndStartConfig(kInitialNumPeers)); + + OpId last_op_id; + shared_ptr last_commit_sync; + vector > rounds; + + // Loop twice, successively shutting down the previous leader. + for (int current_config_size = kInitialNumPeers; + current_config_size >= kInitialNumPeers - 1; + current_config_size--) { + REPLICATE_SEQUENCE_OF_MESSAGES(10, + current_config_size - 1, // The index of the leader. + WAIT_FOR_ALL_REPLICAS, + COMMIT_ONE_BY_ONE, + &last_op_id, + &rounds, + &last_commit_sync); + + // Make sure the last operation is committed everywhere + ASSERT_OK(last_commit_sync->Wait()); + for (int i = 0; i < current_config_size - 1; i++) { + WaitForCommitIfNotAlreadyPresent(last_op_id, i, current_config_size - 1); + } + + // Now shutdown the current leader. + LOG(INFO) << "Shutting down current leader with index " << (current_config_size - 1); + scoped_refptr current_leader; + CHECK_OK(peers_->GetPeerByIdx(current_config_size - 1, ¤t_leader)); + current_leader->Shutdown(); + peers_->RemovePeer(current_leader->peer_uuid()); + + // ... and make the peer before it become leader. 
+ scoped_refptr new_leader; + CHECK_OK(peers_->GetPeerByIdx(current_config_size - 2, &new_leader)); + + // This will force an election in which we expect to make the last + // non-shutdown peer in the list become leader. + LOG(INFO) << "Running election for future leader with index " << (current_config_size - 1); + ASSERT_OK(new_leader->StartElection(Consensus::ELECT_EVEN_IF_LEADER_IS_ALIVE)); + WaitUntilLeaderForTests(new_leader.get()); + LOG(INFO) << "Election won"; + + // ... replicating a set of messages to the new leader should now be possible. + REPLICATE_SEQUENCE_OF_MESSAGES(10, + current_config_size - 2, // The index of the new leader. + WAIT_FOR_MAJORITY, + COMMIT_ONE_BY_ONE, + &last_op_id, + &rounds, + &last_commit_sync); + + // Make sure the last operation is committed everywhere + ASSERT_OK(last_commit_sync->Wait()); + for (int i = 0; i < current_config_size - 2; i++) { + WaitForCommitIfNotAlreadyPresent(last_op_id, i, current_config_size - 2); + } + } + // We can only verify the logs of the peers that were not killed, due to the + // old leaders being out-of-date now. + VerifyLogs(2, 0, 1); +} + +TEST_F(RaftConsensusQuorumTest, TestReplicasEnforceTheLogMatchingProperty) { + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_op_id; + shared_ptr last_commit_sync; + vector > rounds; + REPLICATE_SEQUENCE_OF_MESSAGES(10, + 2, // The index of the initial leader. + WAIT_FOR_ALL_REPLICAS, + COMMIT_ONE_BY_ONE, + &last_op_id, + &rounds, + &last_commit_sync); + + // Make sure the last operation is committed everywhere + ASSERT_OK(last_commit_sync->Wait()); + WaitForCommitIfNotAlreadyPresent(last_op_id, 0, 2); + WaitForCommitIfNotAlreadyPresent(last_op_id, 1, 2); + + // Now replicas should only accept operations with + // 'last_id' as the preceding id. 
+ ConsensusRequestPB req; + ConsensusResponsePB resp; + + scoped_refptr leader; + CHECK_OK(peers_->GetPeerByIdx(2, &leader)); + + scoped_refptr follower; + CHECK_OK(peers_->GetPeerByIdx(0, &follower)); + + + req.set_caller_uuid(leader->peer_uuid()); + req.set_caller_term(last_op_id.term()); + req.mutable_preceding_id()->CopyFrom(last_op_id); + req.mutable_committed_index()->CopyFrom(last_op_id); + + ReplicateMsg* replicate = req.add_ops(); + replicate->set_timestamp(clock_->Now().ToUint64()); + OpId* id = replicate->mutable_id(); + id->set_term(last_op_id.term()); + id->set_index(last_op_id.index() + 1); + replicate->set_op_type(NO_OP); + + // Appending this message to peer0 should work and update + // its 'last_received' to 'id'. + ASSERT_OK(follower->Update(&req, &resp)); + ASSERT_TRUE(OpIdEquals(resp.status().last_received(), *id)); + + // Now skip one message in the same term. The replica should + // complain with the right error message. + req.mutable_preceding_id()->set_index(id->index() + 1); + id->set_index(id->index() + 2); + // Appending this message to peer0 should return a Status::OK + // but should contain an error referring to the log matching property. + ASSERT_OK(follower->Update(&req, &resp)); + ASSERT_TRUE(resp.has_status()); + ASSERT_TRUE(resp.status().has_error()); + ASSERT_EQ(resp.status().error().code(), ConsensusErrorPB::PRECEDING_ENTRY_DIDNT_MATCH); + ASSERT_STR_CONTAINS(resp.status().error().status().message(), + "Log matching property violated"); +} + +// Test that RequestVote performs according to "spec". +TEST_F(RaftConsensusQuorumTest, TestRequestVote) { + ASSERT_OK(BuildAndStartConfig(3)); + + OpId last_op_id; + shared_ptr last_commit_sync; + vector > rounds; + REPLICATE_SEQUENCE_OF_MESSAGES(10, + 2, // The index of the initial leader. 
+ WAIT_FOR_ALL_REPLICAS, + COMMIT_ONE_BY_ONE, + &last_op_id, + &rounds, + &last_commit_sync); + + // Make sure the last operation is committed everywhere + ASSERT_OK(last_commit_sync->Wait()); + WaitForCommitIfNotAlreadyPresent(last_op_id, 0, 2); + WaitForCommitIfNotAlreadyPresent(last_op_id, 1, 2); + + // Ensure last-logged OpId is > (0,0). + ASSERT_TRUE(OpIdLessThan(MinimumOpId(), last_op_id)); + + const int kPeerIndex = 1; + scoped_refptr peer; + CHECK_OK(peers_->GetPeerByIdx(kPeerIndex, &peer)); + + VoteRequestPB request; + request.set_tablet_id(kTestTablet); + request.mutable_candidate_status()->mutable_last_received()->CopyFrom(last_op_id); + + // Test that the replica won't vote since it has recently heard from + // a valid leader. + VoteResponsePB response; + request.set_candidate_uuid("peer-0"); + request.set_candidate_term(last_op_id.term() + 1); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_FALSE(response.vote_granted()); + ASSERT_EQ(ConsensusErrorPB::LEADER_IS_ALIVE, response.consensus_error().code()); + + // Test that replicas only vote yes for a single peer per term. + + // Indicate that replicas should vote even if they think another leader is alive. + // This will allow the rest of the requests in the test to go through. + request.set_ignore_live_leader(true); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_TRUE(response.vote_granted()); + ASSERT_EQ(last_op_id.term() + 1, response.responder_term()); + ASSERT_NO_FATAL_FAILURE(AssertDurableTermAndVote(kPeerIndex, last_op_id.term() + 1, "peer-0")); + + // Ensure we get same response for same term and same UUID. + response.Clear(); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_TRUE(response.vote_granted()); + + // Ensure we get a "no" for a different candidate UUID for that term. 
+ response.Clear(); + request.set_candidate_uuid("peer-2"); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_FALSE(response.vote_granted()); + ASSERT_TRUE(response.has_consensus_error()); + ASSERT_EQ(ConsensusErrorPB::ALREADY_VOTED, response.consensus_error().code()); + ASSERT_EQ(last_op_id.term() + 1, response.responder_term()); + ASSERT_NO_FATAL_FAILURE(AssertDurableTermAndVote(kPeerIndex, last_op_id.term() + 1, "peer-0")); + + // + // Test that replicas refuse votes for an old term. + // + + // Increase the term of our candidate, which will cause the voter replica to + // increase its own term to match. + request.set_candidate_uuid("peer-0"); + request.set_candidate_term(last_op_id.term() + 2); + response.Clear(); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_TRUE(response.vote_granted()); + ASSERT_EQ(last_op_id.term() + 2, response.responder_term()); + ASSERT_NO_FATAL_FAILURE(AssertDurableTermAndVote(kPeerIndex, last_op_id.term() + 2, "peer-0")); + + // Now try the old term. + // Note: Use the peer who "won" the election on the previous term (peer-0), + // although in practice the impl does not store historical vote data. + request.set_candidate_term(last_op_id.term() + 1); + response.Clear(); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_FALSE(response.vote_granted()); + ASSERT_TRUE(response.has_consensus_error()); + ASSERT_EQ(ConsensusErrorPB::INVALID_TERM, response.consensus_error().code()); + ASSERT_EQ(last_op_id.term() + 2, response.responder_term()); + ASSERT_NO_FATAL_FAILURE(AssertDurableTermAndVote(kPeerIndex, last_op_id.term() + 2, "peer-0")); + + // + // Ensure replicas vote no for an old op index. 
+ // + + request.set_candidate_uuid("peer-0"); + request.set_candidate_term(last_op_id.term() + 3); + request.mutable_candidate_status()->mutable_last_received()->CopyFrom(MinimumOpId()); + response.Clear(); + ASSERT_OK(peer->RequestVote(&request, &response)); + ASSERT_FALSE(response.vote_granted()); + ASSERT_TRUE(response.has_consensus_error()); + ASSERT_EQ(ConsensusErrorPB::LAST_OPID_TOO_OLD, response.consensus_error().code()); + ASSERT_EQ(last_op_id.term() + 3, response.responder_term()); + ASSERT_NO_FATAL_FAILURE(AssertDurableTermWithoutVote(kPeerIndex, last_op_id.term() + 3)); + + // Send a "heartbeat" to the peer. It should be rejected. + ConsensusRequestPB req; + req.set_caller_term(last_op_id.term()); + req.set_caller_uuid("peer-0"); + req.mutable_committed_index()->CopyFrom(last_op_id); + ConsensusResponsePB res; + Status s = peer->Update(&req, &res); + ASSERT_EQ(last_op_id.term() + 3, res.responder_term()); + ASSERT_TRUE(res.status().has_error()); + ASSERT_EQ(ConsensusErrorPB::INVALID_TERM, res.status().error().code()); + LOG(INFO) << "Follower rejected old heartbeat, as expected: " << res.ShortDebugString(); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/raft_consensus_state-test.cc b/src/kudu/consensus/raft_consensus_state-test.cc new file mode 100644 index 000000000000..31a32151b66f --- /dev/null +++ b/src/kudu/consensus/raft_consensus_state-test.cc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/consensus/raft_consensus_state.h" + +#include +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace consensus { + +using std::vector; + +// TODO: Share a test harness with ConsensusMetadataTest? +const char* kTabletId = "TestTablet"; + +class RaftConsensusStateTest : public KuduTest { + public: + RaftConsensusStateTest() + : fs_manager_(env_.get(), GetTestPath("fs_root")), + txn_factory_(new MockTransactionFactory()) { + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + ASSERT_OK(fs_manager_.CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_.Open()); + + // Initialize test configuration. + config_.set_local(true); + config_.add_peers()->set_permanent_uuid(fs_manager_.uuid()); + config_.set_opid_index(kInvalidOpIdIndex); + + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(&fs_manager_, kTabletId, fs_manager_.uuid(), + config_, kMinimumTerm, &cmeta)); + state_.reset(new ReplicaState(ConsensusOptions(), fs_manager_.uuid(), cmeta.Pass(), + txn_factory_.get())); + + // Start up the ReplicaState. 
+ ReplicaState::UniqueLock lock; + ASSERT_OK(state_->LockForStart(&lock)); + ASSERT_OK(state_->StartUnlocked(MinimumOpId())); + } + + protected: + FsManager fs_manager_; + RaftConfigPB config_; + gscoped_ptr txn_factory_; + gscoped_ptr state_; +}; + +// Test that we can transition a new configuration from a pending state into a +// persistent state. +TEST_F(RaftConsensusStateTest, TestPendingPersistent) { + ReplicaState::UniqueLock lock; + ASSERT_OK(state_->LockForConfigChange(&lock)); + + config_.clear_opid_index(); + ASSERT_OK(state_->SetPendingConfigUnlocked(config_)); + ASSERT_TRUE(state_->IsConfigChangePendingUnlocked()); + ASSERT_FALSE(state_->GetPendingConfigUnlocked().has_opid_index()); + ASSERT_TRUE(state_->GetCommittedConfigUnlocked().has_opid_index()); + + ASSERT_FALSE(state_->SetCommittedConfigUnlocked(config_).ok()); + config_.set_opid_index(1); + ASSERT_TRUE(state_->SetCommittedConfigUnlocked(config_).ok()); + + ASSERT_FALSE(state_->IsConfigChangePendingUnlocked()); + ASSERT_EQ(1, state_->GetCommittedConfigUnlocked().opid_index()); +} + +// Ensure that we can set persistent configurations directly. 
+TEST_F(RaftConsensusStateTest, TestPersistentWrites) { + ReplicaState::UniqueLock lock; + ASSERT_OK(state_->LockForConfigChange(&lock)); + + ASSERT_FALSE(state_->IsConfigChangePendingUnlocked()); + ASSERT_EQ(kInvalidOpIdIndex, state_->GetCommittedConfigUnlocked().opid_index()); + + config_.clear_opid_index(); + ASSERT_OK(state_->SetPendingConfigUnlocked(config_)); + config_.set_opid_index(1); + ASSERT_OK(state_->SetCommittedConfigUnlocked(config_)); + ASSERT_EQ(1, state_->GetCommittedConfigUnlocked().opid_index()); + + config_.clear_opid_index(); + ASSERT_OK(state_->SetPendingConfigUnlocked(config_)); + config_.set_opid_index(2); + ASSERT_OK(state_->SetCommittedConfigUnlocked(config_)); + ASSERT_EQ(2, state_->GetCommittedConfigUnlocked().opid_index()); +} + +} // namespace consensus +} // namespace kudu diff --git a/src/kudu/consensus/raft_consensus_state.cc b/src/kudu/consensus/raft_consensus_state.cc new file mode 100644 index 000000000000..6a2c34e6f53b --- /dev/null +++ b/src/kudu/consensus/raft_consensus_state.cc @@ -0,0 +1,740 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/consensus/raft_consensus_state.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/logging.h" +#include "kudu/util/status.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace consensus { + +using std::string; +using strings::Substitute; +using strings::SubstituteAndAppend; + +////////////////////////////////////////////////// +// ReplicaState +////////////////////////////////////////////////// + +ReplicaState::ReplicaState(ConsensusOptions options, string peer_uuid, + gscoped_ptr cmeta, + ReplicaTransactionFactory* txn_factory) + : options_(std::move(options)), + peer_uuid_(std::move(peer_uuid)), + cmeta_(cmeta.Pass()), + next_index_(0), + txn_factory_(txn_factory), + last_received_op_id_(MinimumOpId()), + last_received_op_id_current_leader_(MinimumOpId()), + last_committed_index_(MinimumOpId()), + state_(kInitialized) { + CHECK(cmeta_) << "ConsensusMeta passed as NULL"; +} + +Status ReplicaState::StartUnlocked(const OpId& last_id_in_wal) { + DCHECK(update_lock_.is_locked()); + + // Our last persisted term can be higher than the last persisted operation + // (i.e. if we called an election) but reverse should never happen. 
+ CHECK_LE(last_id_in_wal.term(), GetCurrentTermUnlocked()) << LogPrefixUnlocked() + << "The last op in the WAL with id " << OpIdToString(last_id_in_wal) + << " has a term (" << last_id_in_wal.term() << ") that is greater " + << "than the latest recorded term, which is " << GetCurrentTermUnlocked(); + + next_index_ = last_id_in_wal.index() + 1; + last_received_op_id_.CopyFrom(last_id_in_wal); + + state_ = kRunning; + return Status::OK(); +} + +Status ReplicaState::LockForStart(UniqueLock* lock) const { + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + CHECK_EQ(state_, kInitialized) << "Illegal state for Start()." + << " Replica is not in kInitialized state"; + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::LockForRead(UniqueLock* lock) const { + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::LockForReplicate(UniqueLock* lock, const ReplicateMsg& msg) const { + ThreadRestrictions::AssertWaitAllowed(); + DCHECK(!msg.has_id()) << "Should not have an ID yet: " << msg.ShortDebugString(); + UniqueLock l(&update_lock_); + if (PREDICT_FALSE(state_ != kRunning)) { + return Status::IllegalState("Replica not in running state"); + } + + RETURN_NOT_OK(CheckActiveLeaderUnlocked()); + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::LockForCommit(UniqueLock* lock) const { + TRACE_EVENT0("consensus", "ReplicaState::LockForCommit"); + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + if (PREDICT_FALSE(state_ != kRunning && state_ != kShuttingDown)) { + return Status::IllegalState("Replica not in running state"); + } + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::LockForMajorityReplicatedIndexUpdate(UniqueLock* lock) const { + TRACE_EVENT0("consensus", "ReplicaState::LockForMajorityReplicatedIndexUpdate"); + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + + if 
(PREDICT_FALSE(state_ != kRunning)) { + return Status::IllegalState("Replica not in running state"); + } + + if (PREDICT_FALSE(GetActiveRoleUnlocked() != RaftPeerPB::LEADER)) { + return Status::IllegalState("Replica not LEADER"); + } + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::CheckActiveLeaderUnlocked() const { + RaftPeerPB::Role role = GetActiveRoleUnlocked(); + switch (role) { + case RaftPeerPB::LEADER: + return Status::OK(); + default: + ConsensusStatePB cstate = ConsensusStateUnlocked(CONSENSUS_CONFIG_ACTIVE); + return Status::IllegalState(Substitute("Replica $0 is not leader of this config. Role: $1. " + "Consensus state: $2", + peer_uuid_, + RaftPeerPB::Role_Name(role), + cstate.ShortDebugString())); + } +} + +Status ReplicaState::LockForConfigChange(UniqueLock* lock) const { + TRACE_EVENT0("consensus", "ReplicaState::LockForConfigChange"); + + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + // Can only change the config on running replicas. 
+ if (PREDICT_FALSE(state_ != kRunning)) { + return Status::IllegalState("Unable to lock ReplicaState for config change", + Substitute("State = $0", state_)); + } + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::LockForUpdate(UniqueLock* lock) const { + TRACE_EVENT0("consensus", "ReplicaState::LockForUpdate"); + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + if (PREDICT_FALSE(state_ != kRunning)) { + return Status::IllegalState("Replica not in running state"); + } + if (!IsRaftConfigVoter(peer_uuid_, ConsensusStateUnlocked(CONSENSUS_CONFIG_ACTIVE).config())) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Allowing update even though not a member of the config"; + } + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::LockForShutdown(UniqueLock* lock) { + TRACE_EVENT0("consensus", "ReplicaState::LockForShutdown"); + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock l(&update_lock_); + if (state_ != kShuttingDown && state_ != kShutDown) { + state_ = kShuttingDown; + } + lock->swap(&l); + return Status::OK(); +} + +Status ReplicaState::ShutdownUnlocked() { + DCHECK(update_lock_.is_locked()); + CHECK_EQ(state_, kShuttingDown); + state_ = kShutDown; + return Status::OK(); +} + +RaftPeerPB::Role ReplicaState::GetActiveRoleUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->active_role(); +} + +bool ReplicaState::IsConfigChangePendingUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->has_pending_config(); +} + +Status ReplicaState::CheckNoConfigChangePendingUnlocked() const { + DCHECK(update_lock_.is_locked()); + if (IsConfigChangePendingUnlocked()) { + return Status::IllegalState( + Substitute("RaftConfig change currently pending. 
Only one is allowed at a time.\n" + " Committed config: $0.\n Pending config: $1", + GetCommittedConfigUnlocked().ShortDebugString(), + GetPendingConfigUnlocked().ShortDebugString())); + } + return Status::OK(); +} + +Status ReplicaState::SetPendingConfigUnlocked(const RaftConfigPB& new_config) { + DCHECK(update_lock_.is_locked()); + RETURN_NOT_OK_PREPEND(VerifyRaftConfig(new_config, UNCOMMITTED_QUORUM), + "Invalid config to set as pending"); + CHECK(!cmeta_->has_pending_config()) + << "Attempt to set pending config while another is already pending! " + << "Existing pending config: " << cmeta_->pending_config().ShortDebugString() << "; " + << "Attempted new pending config: " << new_config.ShortDebugString(); + cmeta_->set_pending_config(new_config); + return Status::OK(); +} + +void ReplicaState::ClearPendingConfigUnlocked() { + cmeta_->clear_pending_config(); +} + +const RaftConfigPB& ReplicaState::GetPendingConfigUnlocked() const { + DCHECK(update_lock_.is_locked()); + CHECK(IsConfigChangePendingUnlocked()) << "No pending config"; + return cmeta_->pending_config(); +} + +Status ReplicaState::SetCommittedConfigUnlocked(const RaftConfigPB& committed_config) { + TRACE_EVENT0("consensus", "ReplicaState::SetCommittedConfigUnlocked"); + DCHECK(update_lock_.is_locked()); + DCHECK(committed_config.IsInitialized()); + RETURN_NOT_OK_PREPEND(VerifyRaftConfig(committed_config, COMMITTED_QUORUM), + "Invalid config to set as committed"); + + // Compare committed with pending configuration, ensure they are the same. + // Pending will not have an opid_index, so ignore that field. + DCHECK(cmeta_->has_pending_config()); + RaftConfigPB config_no_opid = committed_config; + config_no_opid.clear_opid_index(); + const RaftConfigPB& pending_config = GetPendingConfigUnlocked(); + // Quorums must be exactly equal, even w.r.t. peer ordering. 
+ CHECK_EQ(GetPendingConfigUnlocked().SerializeAsString(), config_no_opid.SerializeAsString()) + << Substitute("New committed config must equal pending config, but does not. " + "Pending config: $0, committed config: $1", + pending_config.ShortDebugString(), committed_config.ShortDebugString()); + + cmeta_->set_committed_config(committed_config); + cmeta_->clear_pending_config(); + CHECK_OK(cmeta_->Flush()); + return Status::OK(); +} + +const RaftConfigPB& ReplicaState::GetCommittedConfigUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->committed_config(); +} + +const RaftConfigPB& ReplicaState::GetActiveConfigUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->active_config(); +} + +bool ReplicaState::IsOpCommittedOrPending(const OpId& op_id, bool* term_mismatch) { + + *term_mismatch = false; + + if (op_id.index() <= GetCommittedOpIdUnlocked().index()) { + return true; + } + + if (op_id.index() > GetLastReceivedOpIdUnlocked().index()) { + return false; + } + + scoped_refptr round = GetPendingOpByIndexOrNullUnlocked(op_id.index()); + DCHECK(round); + + if (round->id().term() != op_id.term()) { + *term_mismatch = true; + return false; + } + return true; +} + +Status ReplicaState::SetCurrentTermUnlocked(int64_t new_term) { + TRACE_EVENT1("consensus", "ReplicaState::SetCurrentTermUnlocked", + "term", new_term); + DCHECK(update_lock_.is_locked()); + if (PREDICT_FALSE(new_term <= GetCurrentTermUnlocked())) { + return Status::IllegalState( + Substitute("Cannot change term to a term that is lower than or equal to the current one. 
" + "Current: $0, Proposed: $1", GetCurrentTermUnlocked(), new_term)); + } + cmeta_->set_current_term(new_term); + cmeta_->clear_voted_for(); + CHECK_OK(cmeta_->Flush()); + ClearLeaderUnlocked(); + last_received_op_id_current_leader_ = MinimumOpId(); + return Status::OK(); +} + +const int64_t ReplicaState::GetCurrentTermUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->current_term(); +} + +void ReplicaState::SetLeaderUuidUnlocked(const std::string& uuid) { + DCHECK(update_lock_.is_locked()); + cmeta_->set_leader_uuid(uuid); +} + +const string& ReplicaState::GetLeaderUuidUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->leader_uuid(); +} + +const bool ReplicaState::HasVotedCurrentTermUnlocked() const { + DCHECK(update_lock_.is_locked()); + return cmeta_->has_voted_for(); +} + +Status ReplicaState::SetVotedForCurrentTermUnlocked(const std::string& uuid) { + TRACE_EVENT1("consensus", "ReplicaState::SetVotedForCurrentTermUnlocked", + "uuid", uuid); + DCHECK(update_lock_.is_locked()); + cmeta_->set_voted_for(uuid); + CHECK_OK(cmeta_->Flush()); + return Status::OK(); +} + +const std::string& ReplicaState::GetVotedForCurrentTermUnlocked() const { + DCHECK(update_lock_.is_locked()); + DCHECK(cmeta_->has_voted_for()); + return cmeta_->voted_for(); +} + +ReplicaTransactionFactory* ReplicaState::GetReplicaTransactionFactoryUnlocked() const { + return txn_factory_; +} + +const string& ReplicaState::GetPeerUuid() const { + return peer_uuid_; +} + +const ConsensusOptions& ReplicaState::GetOptions() const { + return options_; +} + +int ReplicaState::GetNumPendingTxnsUnlocked() const { + DCHECK(update_lock_.is_locked()); + return pending_txns_.size(); +} + +Status ReplicaState::CancelPendingTransactions() { + { + ThreadRestrictions::AssertWaitAllowed(); + UniqueLock lock(&update_lock_); + if (state_ != kShuttingDown) { + return Status::IllegalState("Can only wait for pending commits on kShuttingDown state."); + } + if 
(pending_txns_.empty()) { + return Status::OK(); + } + + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Trying to abort " << pending_txns_.size() + << " pending transactions."; + for (const auto& txn : pending_txns_) { + const scoped_refptr& round = txn.second; + // We cancel only transactions whose applies have not yet been triggered. + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Aborting transaction as it isn't in flight: " + << txn.second->replicate_msg()->ShortDebugString(); + round->NotifyReplicationFinished(Status::Aborted("Transaction aborted")); + } + } + return Status::OK(); +} + +void ReplicaState::GetUncommittedPendingOperationsUnlocked( + vector >* ops) { + for (const IndexToRoundMap::value_type& entry : pending_txns_) { + if (entry.first > last_committed_index_.index()) { + ops->push_back(entry.second); + } + } +} + +Status ReplicaState::AbortOpsAfterUnlocked(int64_t new_preceding_idx) { + DCHECK(update_lock_.is_locked()); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Aborting all transactions after (but not including): " + << new_preceding_idx << ". Current State: " << ToStringUnlocked(); + + DCHECK_GE(new_preceding_idx, 0); + OpId new_preceding; + + auto iter = pending_txns_.lower_bound(new_preceding_idx); + + // Either the new preceding id is in the pendings set or it must be equal to the + // committed index since we can't truncate already committed operations. + if (iter != pending_txns_.end() && (*iter).first == new_preceding_idx) { + new_preceding = (*iter).second->replicate_msg()->id(); + ++iter; + } else { + CHECK_EQ(new_preceding_idx, last_committed_index_.index()); + new_preceding = last_committed_index_; + } + + // This is the same as UpdateLastReceivedOpIdUnlocked() but we do it + // here to avoid the bounds check, since we're breaking monotonicity. 
+ last_received_op_id_ = new_preceding; + last_received_op_id_current_leader_ = last_received_op_id_; + next_index_ = new_preceding.index() + 1; + + for (; iter != pending_txns_.end();) { + const scoped_refptr& round = (*iter).second; + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Aborting uncommitted operation due to leader change: " + << round->replicate_msg()->id(); + round->NotifyReplicationFinished(Status::Aborted("Transaction aborted by new leader")); + // Erase the entry from pendings. + pending_txns_.erase(iter++); + } + + return Status::OK(); +} + +Status ReplicaState::AddPendingOperation(const scoped_refptr& round) { + DCHECK(update_lock_.is_locked()); + if (PREDICT_FALSE(state_ != kRunning)) { + // Special case when we're configuring and this is a config change, refuse + // everything else. + // TODO: Don't require a NO_OP to get to kRunning state + if (round->replicate_msg()->op_type() != NO_OP) { + return Status::IllegalState("Cannot trigger prepare. Replica is not in kRunning state."); + } + } + + // Mark pending configuration. + if (PREDICT_FALSE(round->replicate_msg()->op_type() == CHANGE_CONFIG_OP)) { + DCHECK(round->replicate_msg()->change_config_record().has_old_config()); + DCHECK(round->replicate_msg()->change_config_record().old_config().has_opid_index()); + DCHECK(round->replicate_msg()->change_config_record().has_new_config()); + DCHECK(!round->replicate_msg()->change_config_record().new_config().has_opid_index()); + const RaftConfigPB& old_config = round->replicate_msg()->change_config_record().old_config(); + const RaftConfigPB& new_config = round->replicate_msg()->change_config_record().new_config(); + if (GetActiveRoleUnlocked() != RaftPeerPB::LEADER) { + // The leader has to mark the configuration as pending before it gets here + // because the active configuration affects the replication queue. + // Do one last sanity check. 
+ Status s = CheckNoConfigChangePendingUnlocked(); + if (PREDICT_FALSE(!s.ok())) { + s = s.CloneAndAppend(Substitute("\n New config: $0", new_config.ShortDebugString())); + LOG_WITH_PREFIX_UNLOCKED(INFO) << s.ToString(); + return s; + } + // Check if the pending Raft config has an OpId less than the committed + // config. If so, this is a replay at startup in which the COMMIT + // messages were delayed. + const RaftConfigPB& committed_config = GetCommittedConfigUnlocked(); + if (round->replicate_msg()->id().index() > committed_config.opid_index()) { + CHECK_OK(SetPendingConfigUnlocked(new_config)); + } else { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Ignoring setting pending config change with OpId " + << round->replicate_msg()->id() << " because the committed config has OpId index " + << committed_config.opid_index() << ". The config change we are ignoring is: " + << "Old config: { " << old_config.ShortDebugString() << " }. " + << "New config: { " << new_config.ShortDebugString() << " }"; + } + } + } + + InsertOrDie(&pending_txns_, round->replicate_msg()->id().index(), round); + return Status::OK(); +} + +scoped_refptr ReplicaState::GetPendingOpByIndexOrNullUnlocked(int64_t index) { + DCHECK(update_lock_.is_locked()); + return FindPtrOrNull(pending_txns_, index); +} + +Status ReplicaState::UpdateMajorityReplicatedUnlocked(const OpId& majority_replicated, + OpId* committed_index, + bool* committed_index_changed) { + DCHECK(update_lock_.is_locked()); + DCHECK(majority_replicated.IsInitialized()); + DCHECK(last_committed_index_.IsInitialized()); + if (PREDICT_FALSE(state_ == kShuttingDown || state_ == kShutDown)) { + return Status::ServiceUnavailable("Cannot trigger apply. Replica is shutting down."); + } + if (PREDICT_FALSE(state_ != kRunning)) { + return Status::IllegalState("Cannot trigger apply. 
Replica is not in kRunning state."); + } + + // If the last committed operation was in the current term (the normal case) + // then 'committed_index' is simply equal to majority replicated. + if (last_committed_index_.term() == GetCurrentTermUnlocked()) { + RETURN_NOT_OK(AdvanceCommittedIndexUnlocked(majority_replicated, + committed_index_changed)); + committed_index->CopyFrom(last_committed_index_); + return Status::OK(); + } + + // If the last committed operation is not in the current term (such as when + // we change leaders) but 'majority_replicated' is then we can advance the + // 'committed_index' too. + if (majority_replicated.term() == GetCurrentTermUnlocked()) { + OpId previous = last_committed_index_; + RETURN_NOT_OK(AdvanceCommittedIndexUnlocked(majority_replicated, + committed_index_changed)); + committed_index->CopyFrom(last_committed_index_); + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Advanced the committed_index across terms." + << " Last committed operation was: " << previous.ShortDebugString() + << " New committed index is: " << last_committed_index_.ShortDebugString(); + return Status::OK(); + } + + committed_index->CopyFrom(last_committed_index_); + KLOG_EVERY_N_SECS(WARNING, 1) << LogPrefixUnlocked() + << "Can't advance the committed index across term boundaries" + << " until operations from the current term are replicated." + << " Last committed operation was: " << last_committed_index_.ShortDebugString() << "," + << " New majority replicated is: " << majority_replicated.ShortDebugString() << "," + << " Current term is: " << GetCurrentTermUnlocked(); + + return Status::OK(); +} + +Status ReplicaState::AdvanceCommittedIndexUnlocked(const OpId& committed_index, + bool *committed_index_changed) { + *committed_index_changed = false; + // If we already committed up to (or past) 'id' return. + // This can happen in the case that multiple UpdateConsensus() calls end + // up in the RPC queue at the same time, and then might get interleaved out + // of order. 
+ if (last_committed_index_.index() >= committed_index.index()) { + VLOG_WITH_PREFIX_UNLOCKED(1) + << "Already marked ops through " << last_committed_index_ << " as committed. " + << "Now trying to mark " << committed_index << " which would be a no-op."; + return Status::OK(); + } + + if (pending_txns_.empty()) { + last_committed_index_.CopyFrom(committed_index); + VLOG_WITH_PREFIX_UNLOCKED(1) << "No transactions to mark as committed up to: " + << committed_index.ShortDebugString(); + return Status::OK(); + } + + // Start at the operation after the last committed one. + auto iter = pending_txns_.upper_bound(last_committed_index_.index()); + // Stop at the operation after the last one we must commit. + auto end_iter = pending_txns_.upper_bound(committed_index.index()); + CHECK(iter != pending_txns_.end()); + + VLOG_WITH_PREFIX_UNLOCKED(1) << "Last triggered apply was: " + << last_committed_index_.ShortDebugString() + << " Starting to apply from log index: " << (*iter).first; + + OpId prev_id = last_committed_index_; + + while (iter != end_iter) { + scoped_refptr round = (*iter).second; // Make a copy. + DCHECK(round); + const OpId& current_id = round->id(); + + if (PREDICT_TRUE(!OpIdEquals(prev_id, MinimumOpId()))) { + CHECK_OK(CheckOpInSequence(prev_id, current_id)); + } + + pending_txns_.erase(iter++); + + // Set committed configuration. + if (PREDICT_FALSE(round->replicate_msg()->op_type() == CHANGE_CONFIG_OP)) { + DCHECK(round->replicate_msg()->change_config_record().has_old_config()); + DCHECK(round->replicate_msg()->change_config_record().has_new_config()); + RaftConfigPB old_config = round->replicate_msg()->change_config_record().old_config(); + RaftConfigPB new_config = round->replicate_msg()->change_config_record().new_config(); + DCHECK(old_config.has_opid_index()); + DCHECK(!new_config.has_opid_index()); + new_config.set_opid_index(current_id.index()); + // Check if the pending Raft config has an OpId less than the committed + // config. 
If so, this is a replay at startup in which the COMMIT + // messages were delayed. + const RaftConfigPB& committed_config = GetCommittedConfigUnlocked(); + if (new_config.opid_index() > committed_config.opid_index()) { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Committing config change with OpId " + << current_id << ". " + << "Old config: { " << old_config.ShortDebugString() << " }. " + << "New config: { " << new_config.ShortDebugString() << " }"; + CHECK_OK(SetCommittedConfigUnlocked(new_config)); + } else { + LOG_WITH_PREFIX_UNLOCKED(INFO) << "Ignoring commit of config change with OpId " + << current_id << " because the committed config has OpId index " + << committed_config.opid_index() << ". The config change we are ignoring is: " + << "Old config: { " << old_config.ShortDebugString() << " }. " + << "New config: { " << new_config.ShortDebugString() << " }"; + } + } + + prev_id.CopyFrom(round->id()); + round->NotifyReplicationFinished(Status::OK()); + } + + last_committed_index_.CopyFrom(committed_index); + *committed_index_changed = true; + return Status::OK(); +} + +const OpId& ReplicaState::GetCommittedOpIdUnlocked() const { + DCHECK(update_lock_.is_locked()); + return last_committed_index_; +} + +Status ReplicaState::CheckHasCommittedOpInCurrentTermUnlocked() const { + int64_t term = GetCurrentTermUnlocked(); + const OpId& opid = GetCommittedOpIdUnlocked(); + if (opid.term() != term) { + return Status::IllegalState("Latest committed op is not from this term", OpIdToString(opid)); + } + return Status::OK(); +} + +void ReplicaState::UpdateLastReceivedOpIdUnlocked(const OpId& op_id) { + DCHECK(update_lock_.is_locked()); + DCHECK_LE(OpIdCompare(last_received_op_id_, op_id), 0) + << "Previously received OpId: " << last_received_op_id_.ShortDebugString() + << ", updated OpId: " << op_id.ShortDebugString() + << ", Trace:" << std::endl << Trace::CurrentTrace()->DumpToString(true); + last_received_op_id_ = op_id; + last_received_op_id_current_leader_ = 
last_received_op_id_; + next_index_ = op_id.index() + 1; +} + +const OpId& ReplicaState::GetLastReceivedOpIdUnlocked() const { + DCHECK(update_lock_.is_locked()); + return last_received_op_id_; +} + +const OpId& ReplicaState::GetLastReceivedOpIdCurLeaderUnlocked() const { + DCHECK(update_lock_.is_locked()); + return last_received_op_id_current_leader_; +} + +OpId ReplicaState::GetLastPendingTransactionOpIdUnlocked() const { + DCHECK(update_lock_.is_locked()); + return pending_txns_.empty() + ? MinimumOpId() : (--pending_txns_.end())->second->id(); +} + +void ReplicaState::NewIdUnlocked(OpId* id) { + DCHECK(update_lock_.is_locked()); + id->set_term(GetCurrentTermUnlocked()); + id->set_index(next_index_++); +} + +void ReplicaState::CancelPendingOperation(const OpId& id) { + OpId previous = id; + previous.set_index(previous.index() - 1); + DCHECK(update_lock_.is_locked()); + CHECK_EQ(GetCurrentTermUnlocked(), id.term()); + CHECK_EQ(next_index_, id.index() + 1); + next_index_ = id.index(); + + // We don't use UpdateLastReceivedOpIdUnlocked because we're actually + // updating it back to a lower value and we need to avoid the checks + // that method has. + + // This is only ok if we do _not_ release the lock after calling + // NewIdUnlocked() (which we don't in RaftConsensus::Replicate()). 
+ last_received_op_id_ = previous; + scoped_refptr round = EraseKeyReturnValuePtr(&pending_txns_, id.index()); + DCHECK(round); +} + +string ReplicaState::LogPrefix() { + ReplicaState::UniqueLock lock; + CHECK_OK(LockForRead(&lock)); + return LogPrefixUnlocked(); +} + +string ReplicaState::LogPrefixUnlocked() const { + DCHECK(update_lock_.is_locked()); + return Substitute("T $0 P $1 [term $2 $3]: ", + options_.tablet_id, + peer_uuid_, + GetCurrentTermUnlocked(), + RaftPeerPB::Role_Name(GetActiveRoleUnlocked())); +} + +string ReplicaState::LogPrefixThreadSafe() const { + return Substitute("T $0 P $1: ", + options_.tablet_id, + peer_uuid_); +} + +ReplicaState::State ReplicaState::state() const { + DCHECK(update_lock_.is_locked()); + return state_; +} + +string ReplicaState::ToString() const { + ThreadRestrictions::AssertWaitAllowed(); + ReplicaState::UniqueLock lock(&update_lock_); + return ToStringUnlocked(); +} + +string ReplicaState::ToStringUnlocked() const { + DCHECK(update_lock_.is_locked()); + string ret; + SubstituteAndAppend(&ret, "Replica: $0, State: $1, Role: $2\n", + peer_uuid_, state_, + RaftPeerPB::Role_Name(GetActiveRoleUnlocked())); + + SubstituteAndAppend(&ret, "Watermarks: {Received: $0 Committed: $1}\n", + last_received_op_id_.ShortDebugString(), + last_committed_index_.ShortDebugString()); + return ret; +} + +Status ReplicaState::CheckOpInSequence(const OpId& previous, const OpId& current) { + if (current.term() < previous.term()) { + return Status::Corruption(Substitute("New operation's term is not >= than the previous " + "op's term. Current: $0. Previous: $1", OpIdToString(current), OpIdToString(previous))); + } + if (current.index() != previous.index() + 1) { + return Status::Corruption(Substitute("New operation's index does not follow the previous" + " op's index. Current: $0. 
Previous: $1", OpIdToString(current), OpIdToString(previous))); + } + return Status::OK(); +} + +} // namespace consensus +} // namespace kudu + diff --git a/src/kudu/consensus/raft_consensus_state.h b/src/kudu/consensus/raft_consensus_state.h new file mode 100644 index 000000000000..a849cfc4fca1 --- /dev/null +++ b/src/kudu/consensus/raft_consensus_state.h @@ -0,0 +1,386 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_CONSENSUS_RAFT_CONSENSUS_UTIL_H_ +#define KUDU_CONSENSUS_RAFT_CONSENSUS_UTIL_H_ + +#include +#include +#include +#include +#include + +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/consensus_queue.h" +#include "kudu/consensus/log_util.h" +#include "kudu/gutil/port.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { + +class HostPort; +class ReplicaState; +class ThreadPool; + +namespace rpc { +class Messenger; +} + +namespace consensus { + +// Class that coordinates access to the replica state (independently of Role). 
+// This has a 1-1 relationship with RaftConsensus and is essentially responsible for +// keeping state and checking if state changes are viable. +// +// Note that, in the case of a LEADER role, there are two configuration states that +// that are tracked: a pending and a committed configuration. The "active" state is +// considered to be the pending configuration if it is non-null, otherwise the +// committed configuration is the active configuration. +// +// When a replica becomes a leader of a configuration, it sets the pending configuration to +// a new configuration declaring itself as leader and sets its "active" role to LEADER. +// It then starts up ConsensusPeers for each member of the pending configuration and +// tries to push a new configuration to the peers. Once that configuration is +// pushed to a majority of the cluster, it is considered committed and the +// replica flushes that configuration to disk as the committed configuration. +// +// Each time an operation is to be performed on the replica the appropriate LockFor*() +// method should be called. The LockFor*() methods check that the replica is in the +// appropriate state to perform the requested operation and returns the lock or return +// Status::IllegalState if that is not the case. +// +// All state reading/writing methods acquire the lock, unless suffixed by "Unlocked", in +// which case a lock should be obtained prior to calling them. +class ReplicaState { + public: + enum State { + // State after the replica is built. + kInitialized, + + // State signaling the replica accepts requests (from clients + // if leader, from leader if follower) + kRunning, + + // State signaling that the replica is shutting down and no longer accepting + // new transactions or commits. + kShuttingDown, + + // State signaling the replica is shut down and does not accept + // any more requests. 
+ kShutDown + }; + + typedef unique_lock UniqueLock; + + typedef std::map > IndexToRoundMap; + + typedef std::set OutstandingCommits; + + typedef IndexToRoundMap::value_type IndexToRoundEntry; + + ReplicaState(ConsensusOptions options, std::string peer_uuid, + gscoped_ptr cmeta, + ReplicaTransactionFactory* txn_factory); + + Status StartUnlocked(const OpId& last_in_wal); + + // Locks a replica in preparation for StartUnlocked(). Makes + // sure the replica is in kInitialized state. + Status LockForStart(UniqueLock* lock) const WARN_UNUSED_RESULT; + + // Locks a replica down until the critical section of an append completes, + // i.e. until the replicate message has been assigned an id and placed in + // the log queue. + // This also checks that the replica is in the appropriate + // state (role) to replicate the provided operation, that the operation + // contains a replicate message and is of the appropriate type, and returns + // Status::IllegalState if that is not the case. + Status LockForReplicate(UniqueLock* lock, const ReplicateMsg& msg) const WARN_UNUSED_RESULT; + + // Locks a replica down until the critical section of a commit completes. + // This succeeds for all states since a replica which has initiated + // a Prepare()/Replicate() must eventually commit even if its state + // has changed after the initial Append()/Update(). + Status LockForCommit(UniqueLock* lock) const WARN_UNUSED_RESULT; + + // Locks a replica down until the critical section of an update completes. + // Further updates from the same or some other leader will be blocked until + // this completes. This also checks that the replica is in the appropriate + // state (role) to be updated and returns Status::IllegalState if that + // is not the case. + Status LockForUpdate(UniqueLock* lock) const WARN_UNUSED_RESULT; + + // Changes the role to non-participant and returns a lock that can be + // used to make sure no state updates come in until Shutdown() is + // completed. 
+ Status LockForShutdown(UniqueLock* lock) WARN_UNUSED_RESULT; + + Status LockForConfigChange(UniqueLock* lock) const WARN_UNUSED_RESULT; + + // Obtains the lock for a state read, does not check state. + Status LockForRead(UniqueLock* lock) const WARN_UNUSED_RESULT; + + // Obtains the lock so that we can advance the majority replicated + // index and possibly the committed index. + // Requires that this peer is leader. + Status LockForMajorityReplicatedIndexUpdate( + UniqueLock* lock) const WARN_UNUSED_RESULT; + + // Ensure the local peer is the active leader. + // Returns OK if leader, IllegalState otherwise. + Status CheckActiveLeaderUnlocked() const; + + // Completes the Shutdown() of this replica. No more operations, local + // or otherwise can happen after this point. + // Called after the quiescing phase (started with LockForShutdown()) + // finishes. + Status ShutdownUnlocked() WARN_UNUSED_RESULT; + + // Return current consensus state summary. + ConsensusStatePB ConsensusStateUnlocked(ConsensusConfigType type) const { + return cmeta_->ToConsensusStatePB(type); + } + + // Returns the currently active Raft role. + RaftPeerPB::Role GetActiveRoleUnlocked() const; + + // Returns true if there is a configuration change currently in-flight but not yet + // committed. + bool IsConfigChangePendingUnlocked() const; + + // Inverse of IsConfigChangePendingUnlocked(): returns OK if there is + // currently *no* configuration change pending, and IllegalState if there *is* a + // configuration change pending. + Status CheckNoConfigChangePendingUnlocked() const; + + // Returns true if an operation is in this replica's log, namely: + // - If the op's index is lower than or equal to our committed index + // - If the op id matches an inflight op. + // If an operation with the same index is in our log but the terms + // are different 'term_mismatch' is set to true, it is false otherwise. 
+ bool IsOpCommittedOrPending(const OpId& op_id, bool* term_mismatch); + + // Sets the given configuration as pending commit. Does not persist into the peers + // metadata. In order to be persisted, SetCommittedConfigUnlocked() must be called. + Status SetPendingConfigUnlocked(const RaftConfigPB& new_config) WARN_UNUSED_RESULT; + + // Clear (cancel) the pending configuration. + void ClearPendingConfigUnlocked(); + + // Return the pending configuration, or crash if one is not set. + const RaftConfigPB& GetPendingConfigUnlocked() const; + + // Changes the committed config for this replica. Checks that there is a + // pending configuration and that it is equal to this one. Persists changes to disk. + // Resets the pending configuration to null. + Status SetCommittedConfigUnlocked(const RaftConfigPB& new_config); + + // Return the persisted configuration. + const RaftConfigPB& GetCommittedConfigUnlocked() const; + + // Return the "active" configuration - if there is a pending configuration return it; + // otherwise return the committed configuration. + const RaftConfigPB& GetActiveConfigUnlocked() const; + + // Checks if the term change is legal. If so, sets 'current_term' + // to 'new_term' and sets 'has voted' to no for the current term. + Status SetCurrentTermUnlocked(int64_t new_term) WARN_UNUSED_RESULT; + + // Returns the term set in the last config change round. + const int64_t GetCurrentTermUnlocked() const; + + // Accessors for the leader of the current term. + void SetLeaderUuidUnlocked(const std::string& uuid); + const std::string& GetLeaderUuidUnlocked() const; + bool HasLeaderUnlocked() const { return !GetLeaderUuidUnlocked().empty(); } + void ClearLeaderUnlocked() { SetLeaderUuidUnlocked(""); } + + // Return whether this peer has voted in the current term. + const bool HasVotedCurrentTermUnlocked() const; + + // Record replica's vote for the current term, then flush the consensus + // metadata to disk. 
+ Status SetVotedForCurrentTermUnlocked(const std::string& uuid) WARN_UNUSED_RESULT; + + // Return replica's vote for the current term. + // The vote must be set; use HasVotedCurrentTermUnlocked() to check. + const std::string& GetVotedForCurrentTermUnlocked() const; + + ReplicaTransactionFactory* GetReplicaTransactionFactoryUnlocked() const; + + // Returns the uuid of the peer to which this replica state belongs. + // Safe to call with or without locks held. + const std::string& GetPeerUuid() const; + + const ConsensusOptions& GetOptions() const; + + // Returns the operations that are not consensus committed. + void GetUncommittedPendingOperationsUnlocked(std::vector >* ops); + + // Aborts pending operations after, but not including 'index'. The OpId with 'index' + // will become our new last received id. If there are pending operations with indexes + // higher than 'index' those operations are aborted. + Status AbortOpsAfterUnlocked(int64_t index); + + // Returns the ConsensusRound with the provided index, if there is any, or NULL + // if there isn't. + scoped_refptr GetPendingOpByIndexOrNullUnlocked(int64_t index); + + // Add 'round' to the set of rounds waiting to be committed. + Status AddPendingOperation(const scoped_refptr& round); + + // Marks ReplicaTransactions up to 'id' as majority replicated, meaning the + // transaction may Apply() (immediately if Prepare() has completed or when Prepare() + // completes, if not). + // + // If this advanced the committed index, sets *committed_index_changed to true. + Status UpdateMajorityReplicatedUnlocked(const OpId& majority_replicated, + OpId* committed_index, + bool* committed_index_changed); + + // Advances the committed index. + // This is a no-op if the committed index has not changed. + // Returns in '*committed_index_changed' whether the operation actually advanced + // the index. 
+ Status AdvanceCommittedIndexUnlocked(const OpId& committed_index, + bool* committed_index_changed); + + // Returns the watermark below which all operations are known to + // be committed according to consensus. + // + // This must be called under a lock. + const OpId& GetCommittedOpIdUnlocked() const; + + // Returns OK iff an op from the current term has been committed. + Status CheckHasCommittedOpInCurrentTermUnlocked() const; + + // Updates the last received operation. + // This must be called under a lock. + void UpdateLastReceivedOpIdUnlocked(const OpId& op_id); + + // Returns the last received op id. This must be called under the lock. + const OpId& GetLastReceivedOpIdUnlocked() const; + + // Returns the id of the last op received from the current leader. + const OpId& GetLastReceivedOpIdCurLeaderUnlocked() const; + + // Returns the id of the latest pending transaction (i.e. the one with the + // latest index). This must be called under the lock. + OpId GetLastPendingTransactionOpIdUnlocked() const; + + // Updates the last committed operation including removing it from the pending commits. + // + // 'commit_op_id' refers to the OpId of the actual commit operation, whereas + // 'committed_op_id' refers to the OpId of the original REPLICATE message which was + // committed. + // + // This must be called under a lock. + void UpdateReplicaCommittedOpIdUnlocked(const OpId& committed_op_id); + + // Waits for already triggered Apply()s to commit. + Status WaitForOustandingApplies(); + + // Used by replicas to cancel pending transactions. Pending transaction are those + // that have completed prepare/replicate but are waiting on the LEADER's commit + // to complete. This does not cancel transactions being applied. + Status CancelPendingTransactions(); + + void NewIdUnlocked(OpId* id); + + // Used when, for some reason, an operation that failed before it could be considered + // a part of the state machine. 
Basically restores the id gen to the state it was before + // generating 'id'. + void CancelPendingOperation(const OpId& id); + + // Returns the number of transactions that are currently in the pending state + // i.e. transactions for which Prepare() is done or under way. + int GetNumPendingTxnsUnlocked() const; + + std::string ToString() const; + std::string ToStringUnlocked() const; + + // A common prefix that should be in any log messages emitted, + // identifying the tablet and peer. + std::string LogPrefix(); + std::string LogPrefixUnlocked() const; + + // A variant of LogPrefix which does not take the lock. This is a slightly + // less thorough prefix which only includes immutable (and thus thread-safe) + // information, but does not require the lock. + std::string LogPrefixThreadSafe() const; + + // Checks that 'current' correctly follows 'previous'. Specifically it checks + // that the term is the same or higher and that the index is sequential. + static Status CheckOpInSequence(const OpId& previous, const OpId& current); + + // Return the current state of this object. + // The update_lock_ must be held. + ReplicaState::State state() const; + + private: + const ConsensusOptions options_; + + // The UUID of the local peer. + const std::string peer_uuid_; + + mutable simple_spinlock update_lock_; + + // Consensus metadata persistence object. + gscoped_ptr cmeta_; + + // Used by the LEADER. This is the index of the next operation generated + // by this LEADER. + int64_t next_index_; + + // Index=>Round map that manages pending ops, i.e. operations for which we've + // received a replicate message from the leader but have yet to be committed. + // The key is the index of the replicate operation. + IndexToRoundMap pending_txns_; + + // When we receive a message from a remote peer telling us to start a transaction, we use + // this factory to start it. 
+ ReplicaTransactionFactory* txn_factory_; + + // The id of the last received operation, which corresponds to the last entry + // written to the local log. Operations whose id is lower than or equal to + // this id do not need to be resent by the leader. This is not guaranteed to + // be monotonically increasing due to the possibility for log truncation and + // aborted operations when a leader change occurs. + OpId last_received_op_id_; + + // Same as last_received_op_id_ but only includes operations sent by the + // current leader. The "term" in this op may not actually match the current + // term, since leaders may replicate ops from prior terms. + // + // As an implementation detail, this field is reset to MinumumOpId() every + // time there is a term advancement on the local node, to simplify the logic + // involved in resetting this every time a new node becomes leader. + OpId last_received_op_id_current_leader_; + + // The id of the Apply that was last triggered when the last message from the leader + // was received. Initialized to MinimumOpId(). + OpId last_committed_index_; + + State state_; +}; + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_RAFT_CONSENSUS_UTIL_H_ */ diff --git a/src/kudu/consensus/ref_counted_replicate.h b/src/kudu/consensus/ref_counted_replicate.h new file mode 100644 index 000000000000..712ae14d1e1a --- /dev/null +++ b/src/kudu/consensus/ref_counted_replicate.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_CONSENSUS_REF_COUNTED_REPLICATE_H_ +#define KUDU_CONSENSUS_REF_COUNTED_REPLICATE_H_ + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/gscoped_ptr.h" + +namespace kudu { +namespace consensus { + +// A simple ref-counted wrapper around ReplicateMsg. +class RefCountedReplicate : public RefCountedThreadSafe { + public: + explicit RefCountedReplicate(ReplicateMsg* msg) : msg_(msg) {} + + ReplicateMsg* get() { + return msg_.get(); + } + + private: + gscoped_ptr msg_; +}; + +typedef scoped_refptr ReplicateRefPtr; + +inline ReplicateRefPtr make_scoped_refptr_replicate(ReplicateMsg* replicate) { + return ReplicateRefPtr(new RefCountedReplicate(replicate)); +} + +} // namespace consensus +} // namespace kudu + +#endif /* KUDU_CONSENSUS_REF_COUNTED_REPLICATE_H_ */ diff --git a/src/kudu/experiments/CMakeLists.txt b/src/kudu/experiments/CMakeLists.txt new file mode 100644 index 000000000000..f7fffb80ae86 --- /dev/null +++ b/src/kudu/experiments/CMakeLists.txt @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# rwlock-perf +# Disabled on OS X because it relies on sched_getcpu. +if(NOT APPLE) + add_executable(rwlock-perf rwlock-perf.cc) + target_link_libraries(rwlock-perf + kudu_util + ${KUDU_MIN_TEST_LIBS}) +endif() + +add_executable(merge-test merge-test.cc) +target_link_libraries(merge-test + kudu_util + ${KUDU_MIN_TEST_LIBS}) diff --git a/src/kudu/experiments/merge-test.cc b/src/kudu/experiments/merge-test.cc new file mode 100644 index 000000000000..b9760a2c5c06 --- /dev/null +++ b/src/kudu/experiments/merge-test.cc @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/util/stopwatch.h" + +DEFINE_int32(num_lists, 3, "Number of lists to merge"); +DEFINE_int32(num_rows, 100, "Number of entries per list"); +DEFINE_int32(num_iters, 5, "Number of times to run merge"); + +using std::vector; +using std::string; + +typedef string MergeType; + +struct CompareIters { + explicit CompareIters(vector::const_iterator> *iters) : + iters_(iters) + {} + + bool operator()(int left, int right) { + return *((*iters_)[left]) >= *((*iters_)[right]); + } + + vector::const_iterator> *iters_; +}; + +void HeapMerge( + const vector > &in_lists, + vector *out) { + typedef vector::const_iterator MergeTypeIter; + + vector iters; + vector indexes; + size_t i = 0; + for (const vector &list : in_lists) { + iters.push_back(list.begin()); + indexes.push_back(i++); + } + + CompareIters comp(&iters); + std::make_heap(indexes.begin(), indexes.end(), comp); + + while (!indexes.empty()) { + size_t min_idx = indexes.front(); + MergeTypeIter &min_iter = iters[min_idx]; + + out->push_back(*min_iter); + + min_iter++; + std::pop_heap(indexes.begin(), indexes.end(), comp); + if (min_iter == in_lists[min_idx].end()) { + indexes.pop_back(); + } else { + std::push_heap(indexes.begin(), indexes.end(), comp); + } + } +} + +void SimpleMerge(const vector > &in_lists, + vector *out) { + typedef vector::const_iterator MergeTypeIter; + vector iters; + for (const vector &list : in_lists) { + iters.push_back(list.begin()); + } + + while (true) { + MergeTypeIter *smallest = nullptr; + for (int i = 0; i < in_lists.size(); i++) { + if (iters[i] == in_lists[i].end()) continue; + if (smallest == nullptr || + *iters[i] < **smallest) { + smallest = &iters[i]; + } + } + + if (smallest == nullptr) break; + + out->push_back(**smallest); + (*smallest)++; + } +} + +int main(int argc, char **argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + + vector > in_lists; + in_lists.resize(FLAGS_num_lists); + + for 
(int i = 0; i < FLAGS_num_lists; i++) { + vector &list = in_lists[i]; + + int entry = 0; + for (int j = 0; j < FLAGS_num_rows; j++) { + entry += rand() % 5; + list.push_back(boost::lexical_cast(entry)); + } + } + + for (int i = 0; i < FLAGS_num_iters; i++) { + vector out; + out.reserve(FLAGS_num_lists * FLAGS_num_rows); + + LOG_TIMING(INFO, "HeapMerge") { + HeapMerge(in_lists, &out); + } + } + + for (int i = 0; i < FLAGS_num_iters; i++) { + vector out; + out.reserve(FLAGS_num_lists * FLAGS_num_rows); + + LOG_TIMING(INFO, "SimpleMerge") { + SimpleMerge(in_lists, &out); + } + } + + return 0; +} diff --git a/src/kudu/experiments/rwlock-perf.cc b/src/kudu/experiments/rwlock-perf.cc new file mode 100644 index 000000000000..31118cc9325a --- /dev/null +++ b/src/kudu/experiments/rwlock-perf.cc @@ -0,0 +1,247 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/errno.h" +#include "kudu/util/locks.h" + +DEFINE_int32(num_threads, 8, "Number of threads to test"); + +class my_spinlock : public boost::detail::spinlock { + public: + my_spinlock() { + v_ = 0; + } + + private: + DISALLOW_COPY_AND_ASSIGN(my_spinlock); +}; + +struct per_cpu_lock { + struct padded_lock { + my_spinlock lock; + char padding[CACHELINE_SIZE - sizeof(my_spinlock)]; + }; + + per_cpu_lock() { + errno = 0; + n_cpus_ = base::NumCPUs(); + CHECK_EQ(errno, 0) << kudu::ErrnoToString(errno); + CHECK_GT(n_cpus_, 0); + locks_ = new padded_lock[n_cpus_]; + } + + ~per_cpu_lock() { + delete [] locks_; + } + + my_spinlock *get_lock() { + int cpu = sched_getcpu(); + CHECK_LT(cpu, n_cpus_); + return &locks_[cpu].lock; + } + + int n_cpus_; + padded_lock *locks_; + +}; + +struct shared_data { + shared_data() { + errno = 0; + } + + kudu::rw_spinlock rw_spinlock; + boost::shared_mutex rwlock; + boost::mutex lock; + kudu::percpu_rwlock per_cpu; +}; + + +class noop_lock { + public: + void lock() {} + void unlock() {} +}; + +// Some trivial workload to be done while +// holding the lock. 
+static float workload(float result) { + for (int i = 0; i < 1; i++) { + result += 1; + result *= 2.1; + } + return result; +} + +// Add a dependency on result - this will never +// be true, but prevents compiler optimizations +// from killing off the workload call +static void depend_on(float val) { + if (val == 12345.0) { + printf("hello world"); + } +} + +void shared_rwlock_entry(shared_data *shared) { + float result = 1; + for (int i = 0; i < 1000000; i++) { + shared->rwlock.lock_shared(); + result += workload(result); + shared->rwlock.unlock_shared(); + } + depend_on(result); +} + +void shared_rw_spinlock_entry(shared_data *shared) { + float result = 1; + for (int i = 0; i < 1000000; i++) { + shared->rw_spinlock.lock_shared(); + result += workload(result); + shared->rw_spinlock.unlock_shared(); + } + depend_on(result); +} + +void shared_mutex_entry(shared_data *shared) { + float result = 1; + for (int i = 0; i < 1000000; i++) { + shared->lock.lock(); + result += workload(result); + shared->lock.unlock(); + } + depend_on(result); +} + +template +void own_mutex_entry() { + LockType mylock; + float result = 1; + for (int i = 0; i < 1000000; i++) { + mylock.lock(); + result += workload(result); + mylock.unlock(); + } + + depend_on(result); +} + +void percpu_rwlock_entry(shared_data *shared) { + float result = 1; + for (int i = 0; i < 1000000; i++) { + kudu::rw_spinlock &l = shared->per_cpu.get_lock(); + l.lock_shared(); + result += workload(result); + l.unlock_shared(); + } + + depend_on(result); +} + + +enum TestMethod { + SHARED_RWLOCK, + SHARED_MUTEX, + OWN_MUTEX, + OWN_SPINLOCK, + PERCPU_RWLOCK, + NO_LOCK, + RW_SPINLOCK +}; + +void test_shared_lock(int num_threads, + TestMethod method, + const char *name) { + boost::ptr_vector threads; + shared_data shared; + + for (int i = 0; i < num_threads; i++) { + switch (method) { + case SHARED_RWLOCK: + threads.push_back(new boost::thread( + shared_rwlock_entry, &shared)); + break; + case SHARED_MUTEX: + 
threads.push_back(new boost::thread( + shared_mutex_entry, &shared)); + break; + case OWN_MUTEX: + threads.push_back(new boost::thread( + own_mutex_entry)); + break; + case OWN_SPINLOCK: + threads.push_back(new boost::thread( + own_mutex_entry)); + break; + case NO_LOCK: + threads.push_back(new boost::thread( + own_mutex_entry)); + break; + case PERCPU_RWLOCK: + threads.push_back(new boost::thread( + percpu_rwlock_entry, &shared)); + break; + case RW_SPINLOCK: + threads.push_back(new boost::thread( + shared_rw_spinlock_entry, &shared)); + break; + default: + CHECK(0) << "bad method: " << method; + } + } + + int64_t start = CycleClock::Now(); + for (boost::thread &thr : threads) { + thr.join(); + } + int64_t end = CycleClock::Now(); + + printf("%13s % 7d %ldM\n", + name, num_threads, (end-start)/1000000); +} + +int main(int argc, char **argv) { + printf(" Test Threads Cycles\n"); + printf("------------------------------\n"); + + for (int num_threads = 1; + num_threads < FLAGS_num_threads; + num_threads++) { + test_shared_lock(num_threads, SHARED_RWLOCK, "shared_rwlock"); + test_shared_lock(num_threads, SHARED_MUTEX, "shared_mutex"); + test_shared_lock(num_threads, OWN_MUTEX, "own_mutex"); + test_shared_lock(num_threads, OWN_SPINLOCK, "own_spinlock"); + test_shared_lock(num_threads, NO_LOCK, "no_lock"); + test_shared_lock(num_threads, PERCPU_RWLOCK, "percpu_rwlock"); + test_shared_lock(num_threads, RW_SPINLOCK, "rw_spinlock"); + } + +} diff --git a/src/kudu/fs/CMakeLists.txt b/src/kudu/fs/CMakeLists.txt new file mode 100644 index 000000000000..23a713e8036d --- /dev/null +++ b/src/kudu/fs/CMakeLists.txt @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +PROTOBUF_GENERATE_CPP( + FS_PROTO_SRCS FS_PROTO_HDRS FS_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES fs.proto) +ADD_EXPORTABLE_LIBRARY(fs_proto + SRCS ${FS_PROTO_SRCS} + DEPS protobuf + NONLINK_DEPS ${FS_PROTO_TGTS}) + +add_library(kudu_fs + block_id.cc + block_manager.cc + block_manager_metrics.cc + block_manager_util.cc + file_block_manager.cc + fs_manager.cc + log_block_manager.cc) + +target_link_libraries(kudu_fs + fs_proto + kudu_util + gutil) + +# Tests +set(KUDU_TEST_LINK_LIBS kudu_fs ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(block_manager-test) +ADD_KUDU_TEST(block_manager_util-test) +ADD_KUDU_TEST(block_manager-stress-test RUN_SERIAL true) +ADD_KUDU_TEST(fs_manager-test) diff --git a/src/kudu/fs/block_id.cc b/src/kudu/fs/block_id.cc new file mode 100644 index 000000000000..8b821172dc9c --- /dev/null +++ b/src/kudu/fs/block_id.cc @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/fs/block_id.h" + +#include +#include +#include + +#include "kudu/fs/fs.pb.h" +#include "kudu/gutil/strings/join.h" + +using std::string; +using std::vector; + +namespace kudu { + +const uint64_t BlockId::kInvalidId = 0; + +string BlockId::JoinStrings(const vector& blocks) { + vector strings; + strings.reserve(blocks.size()); + for (const BlockId& block : blocks) { + strings.push_back(block.ToString()); + } + return ::JoinStrings(strings, ","); +} + +void BlockId::CopyToPB(BlockIdPB *pb) const { + pb->set_id(id_); +} + +BlockId BlockId::FromPB(const BlockIdPB& pb) { + DCHECK(pb.has_id()); + return BlockId(pb.id()); +} + +} // namespace kudu diff --git a/src/kudu/fs/block_id.h b/src/kudu/fs/block_id.h new file mode 100644 index 000000000000..c0d360124a47 --- /dev/null +++ b/src/kudu/fs/block_id.h @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_FS_BLOCK_ID_H +#define KUDU_FS_BLOCK_ID_H + +#include +#include +#include + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/stringprintf.h" + +namespace kudu { + +class BlockIdPB; + +namespace fs { +namespace internal { +class FileBlockLocation; +} // namespace internal +} // namespace fs + +class BlockId { + public: + BlockId() + : id_(kInvalidId) { + } + + explicit BlockId(uint64_t id) { + SetId(id); + } + + void SetId(uint64_t id) { + id_ = id; + } + + bool IsNull() const { return id_ == kInvalidId; } + + std::string ToString() const { + return StringPrintf("%016" PRIu64, id_); + } + + bool operator==(const BlockId& other) const { + return id_ == other.id_; + } + + bool operator!=(const BlockId& other) const { + return id_ != other.id_; + } + + // Returns the raw ID. Use with care; in most cases the BlockId should be + // treated as a completely opaque value. + uint64_t id() const { return id_; } + + // Join the given block IDs with ','. Useful for debug printouts. 
+ static std::string JoinStrings(const std::vector& blocks); + + void CopyToPB(BlockIdPB* pb) const; + + static BlockId FromPB(const BlockIdPB& pb); + + private: + static const uint64_t kInvalidId; + + uint64_t id_; +}; + +std::ostream& operator<<(std::ostream& o, const BlockId& block_id); + +struct BlockIdHash { + size_t operator()(const BlockId& block_id) const { + return block_id.id(); + } +}; + +struct BlockIdCompare { + bool operator()(const BlockId& first, const BlockId& second) const { + return first.id() < second.id(); + } +}; + +struct BlockIdEqual { + bool operator()(const BlockId& first, const BlockId& second) const { + return first.id() == second.id(); + } +}; + +} // namespace kudu +#endif /* KUDU_FS_BLOCK_ID_H */ diff --git a/src/kudu/fs/block_manager-stress-test.cc b/src/kudu/fs/block_manager-stress-test.cc new file mode 100644 index 000000000000..e1bc9dd772d7 --- /dev/null +++ b/src/kudu/fs/block_manager-stress-test.cc @@ -0,0 +1,412 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/fs/file_block_manager.h" +#include "kudu/fs/log_block_manager.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/atomic.h" +#include "kudu/util/metrics.h" +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DEFINE_int32(test_duration_secs, 2, "Number of seconds to run the test"); +DEFINE_int32(num_writer_threads, 4, "Number of writer threads to run"); +DEFINE_int32(num_reader_threads, 8, "Number of reader threads to run"); +DEFINE_int32(num_deleter_threads, 1, "Number of deleter threads to run"); +DEFINE_int32(block_group_size, 8, "Number of blocks to write per block " + "group. Must be power of 2"); +DEFINE_int32(block_group_bytes, 64 * 1024, + "Total amount of data (in bytes) to write per block group"); +DEFINE_int32(num_bytes_per_write, 64, + "Number of bytes to write at a time"); +DEFINE_string(block_manager_paths, "", "Comma-separated list of paths to " + "use for block storage. 
If empty, will use the default unit " + "test path"); + +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace fs { + +// This test attempts to simulate how a TS might use the block manager: +// +// writing threads (default 2) that do the following in a tight loop: +// - create a new group of blocks (default 10) +// - write a PRNG seed into each block +// - write a big chunk of data (default 64m) into the block group: +// - pick the next block to write a piece to at random +// - write one piece at a time (default 64k) of data generated using +// that block's PRNG seed +// - close the blocks +// - add the blocks to the block_id vector (write locked) +// reading threads (default 8) that do the following in a tight loop: +// - read one block id at random from block_id vector (read locked) +// - read the block fully into memory, parsing its seed +// - verify that the contents of the block match the PRNG output +// deleting threads (default 1) that do the following every second: +// - drain the block_id vector(write locked) +// - delete all the blocks drained from the vector +// +// TODO: Don't delete all blocks ala "permgen". +template +class BlockManagerStressTest : public KuduTest { + public: + BlockManagerStressTest() : + rand_seed_(SeedRandom()), + stop_latch_(1), + bm_(CreateBlockManager()), + total_blocks_written_(0), + total_bytes_written_(0), + total_blocks_read_(0), + total_bytes_read_(0), + total_blocks_deleted_(0) { + } + + virtual void SetUp() OVERRIDE { + CHECK_OK(bm_->Create()); + CHECK_OK(bm_->Open()); + } + + virtual void TearDown() OVERRIDE { + // If non-standard paths were provided we need to delete them in + // between test runs. 
+ if (!FLAGS_block_manager_paths.empty()) { + vector paths = strings::Split(FLAGS_block_manager_paths, ",", + strings::SkipEmpty()); + for (const string& path : paths) { + WARN_NOT_OK(env_->DeleteRecursively(path), + Substitute("Couldn't recursively delete $0", path)); + } + } + } + + BlockManager* CreateBlockManager() { + BlockManagerOptions opts; + if (FLAGS_block_manager_paths.empty()) { + opts.root_paths.push_back(GetTestDataDirectory()); + } else { + opts.root_paths = strings::Split(FLAGS_block_manager_paths, ",", + strings::SkipEmpty()); + } + return new T(env_.get(), opts); + } + + void RunTest(int secs) { + LOG(INFO) << "Starting all threads"; + this->StartThreads(); + SleepFor(MonoDelta::FromSeconds(secs)); + LOG(INFO) << "Stopping all threads"; + this->StopThreads(); + this->JoinThreads(); + this->stop_latch_.Reset(1); + } + + void StartThreads() { + scoped_refptr new_thread; + for (int i = 0; i < FLAGS_num_writer_threads; i++) { + CHECK_OK(Thread::Create("BlockManagerStressTest", Substitute("writer-$0", i), + &BlockManagerStressTest::WriterThread, this, &new_thread)); + threads_.push_back(new_thread); + } + for (int i = 0; i < FLAGS_num_reader_threads; i++) { + CHECK_OK(Thread::Create("BlockManagerStressTest", Substitute("reader-$0", i), + &BlockManagerStressTest::ReaderThread, this, &new_thread)); + threads_.push_back(new_thread); + } + for (int i = 0; i < FLAGS_num_deleter_threads; i++) { + CHECK_OK(Thread::Create("BlockManagerStressTest", Substitute("deleter-$0", i), + &BlockManagerStressTest::DeleterThread, this, &new_thread)); + threads_.push_back(new_thread); + } + } + + void StopThreads() { + stop_latch_.CountDown(); + } + + bool ShouldStop(const MonoDelta& wait_time) { + return stop_latch_.WaitFor(wait_time); + } + + void JoinThreads() { + for (const scoped_refptr& thr : threads_) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } + } + + void WriterThread(); + void ReaderThread(); + void DeleterThread(); + + protected: + // Used to generate 
random data. All PRNG instances are seeded with this + // value to ensure that the test is reproducible. + int rand_seed_; + + // Tells the threads to stop running. + CountDownLatch stop_latch_; + + // Tracks blocks that have been synced and are ready to be read/deleted. + vector written_blocks_; + + // Protects written_blocks_. + rw_spinlock lock_; + + // The block manager. + gscoped_ptr bm_; + + // The running threads. + vector > threads_; + + // Some performance counters. + + AtomicInt total_blocks_written_; + AtomicInt total_bytes_written_; + + AtomicInt total_blocks_read_; + AtomicInt total_bytes_read_; + + AtomicInt total_blocks_deleted_; +}; + +template +void BlockManagerStressTest::WriterThread() { + string thread_name = Thread::current_thread()->name(); + LOG(INFO) << "Thread " << thread_name << " starting"; + + Random rand(rand_seed_); + size_t num_blocks_written = 0; + size_t num_bytes_written = 0; + MonoDelta tight_loop(MonoDelta::FromSeconds(0)); + while (!ShouldStop(tight_loop)) { + vector dirty_blocks; + ElementDeleter deleter(&dirty_blocks); + vector dirty_block_rands; + + // Create the blocks and write out the PRNG seeds. + for (int i = 0; i < FLAGS_block_group_size; i++) { + gscoped_ptr block; + CHECK_OK(bm_->CreateBlock(&block)); + + const uint32_t seed = rand.Next() + 1; + Slice seed_slice(reinterpret_cast(&seed), sizeof(seed)); + LOG(INFO) << "Creating block " << block->id().ToString() << " with seed " << seed; + CHECK_OK(block->Append(seed_slice)); + + dirty_blocks.push_back(block.release()); + dirty_block_rands.push_back(Random(seed)); + } + + // Write a large amount of data to the group of blocks. + // + // To emulate a real life workload, we pick the next block to write at + // random, and write a smaller chunk of data to it. + LOG(INFO) << "Writing " << FLAGS_block_group_bytes << " bytes into new blocks"; + size_t total_dirty_bytes = 0; + while (total_dirty_bytes < FLAGS_block_group_bytes) { + // Pick the next block. 
+ int next_block_idx = rand.Skewed(log2(dirty_blocks.size())); + WritableBlock* block = dirty_blocks[next_block_idx]; + Random& rand = dirty_block_rands[next_block_idx]; + + // Write a small chunk of data. + faststring data; + while (data.length() < FLAGS_num_bytes_per_write) { + const uint32_t next_int = rand.Next(); + data.append(&next_int, sizeof(next_int)); + } + CHECK_OK(block->Append(data)); + total_dirty_bytes += data.length(); + } + + // Close all dirty blocks. + // + // We could close them implicitly when the blocks are destructed but + // this way we can check for errors. + LOG(INFO) << "Closing new blocks"; + CHECK_OK(bm_->CloseBlocks(dirty_blocks)); + + // Publish the now sync'ed blocks to readers and deleters. + { + lock_guard l(&lock_); + for (WritableBlock* block : dirty_blocks) { + written_blocks_.push_back(block->id()); + } + } + num_blocks_written += dirty_blocks.size(); + num_bytes_written += total_dirty_bytes; + } + + LOG(INFO) << Substitute("Thread $0 stopping. Wrote $1 blocks ($2 bytes)", + thread_name, num_blocks_written, num_bytes_written); + total_blocks_written_.IncrementBy(num_blocks_written); + total_bytes_written_.IncrementBy(num_bytes_written); +} + +template +void BlockManagerStressTest::ReaderThread() { + string thread_name = Thread::current_thread()->name(); + LOG(INFO) << "Thread " << thread_name << " starting"; + + Random rand(rand_seed_); + size_t num_blocks_read = 0; + size_t num_bytes_read = 0; + MonoDelta tight_loop(MonoDelta::FromSeconds(0)); + while (!ShouldStop(tight_loop)) { + gscoped_ptr block; + { + // Grab a block at random. + shared_lock l(&lock_); + size_t num_blocks = written_blocks_.size(); + if (num_blocks > 0) { + uint32_t next_id = rand.Uniform(num_blocks); + const BlockId& block_id = written_blocks_[next_id]; + CHECK_OK(bm_->OpenBlock(block_id, &block)); + } + } + if (!block) { + continue; + } + + // Read it fully into memory. 
+ string block_id = block->id().ToString(); + uint64_t block_size; + CHECK_OK(block->Size(&block_size)); + Slice data; + gscoped_ptr scratch(new uint8_t[block_size]); + CHECK_OK(block->Read(0, block_size, &data, scratch.get())); + LOG(INFO) << "Read " << block_size << " bytes from block " << block_id; + + // The first 4 bytes correspond to the PRNG seed. + CHECK(data.size() >= 4); + uint32_t seed; + memcpy(&seed, data.data(), sizeof(uint32_t)); + LOG(INFO) << "Read seed " << seed << " from block " << block_id; + Random rand(seed); + + // Verify every subsequent number using the PRNG. + size_t bytes_processed; + for (bytes_processed = 4; // start after the PRNG seed + bytes_processed < data.size(); + bytes_processed += sizeof(uint32_t)) { + uint32_t expected_num = rand.Next(); + uint32_t actual_num; + memcpy(&actual_num, data.data() + bytes_processed, sizeof(uint32_t)); + if (expected_num != actual_num) { + LOG(FATAL) << "Read " << actual_num << " and not " << expected_num + << " from position " << bytes_processed << " in block " + << block_id; + } + } + CHECK_EQ(bytes_processed, data.size()); + LOG(INFO) << "Finished reading block " << block->id().ToString(); + num_blocks_read++; + num_bytes_read += block_size; + } + + LOG(INFO) << Substitute("Thread $0 stopping. Read $1 blocks ($2 bytes)", + thread_name, num_blocks_read, num_bytes_read); + total_blocks_read_.IncrementBy(num_blocks_read); + total_bytes_read_.IncrementBy(num_bytes_read); +} + +template +void BlockManagerStressTest::DeleterThread() { + string thread_name = Thread::current_thread()->name(); + LOG(INFO) << "Thread " << thread_name << " starting"; + + size_t num_blocks_deleted = 0; + MonoDelta sleep_time(MonoDelta::FromSeconds(1)); + while (!ShouldStop(sleep_time)) { + // Grab all the blocks we can. + vector to_delete; + { + lock_guard l(&lock_); + to_delete.swap(written_blocks_); + } + + // And delete them. 
+ for (const BlockId& block_id : to_delete) { + LOG(INFO) << "Deleting block " << block_id.ToString(); + CHECK_OK(bm_->DeleteBlock(block_id)); + } + num_blocks_deleted += to_delete.size(); + } + + LOG(INFO) << Substitute("Thread $0 stopping. Deleted $1 blocks", + thread_name, num_blocks_deleted); + total_blocks_deleted_.IncrementBy(num_blocks_deleted); +} + +// What kinds of BlockManagers are supported? +#if defined(__linux__) +typedef ::testing::Types BlockManagers; +#else +typedef ::testing::Types BlockManagers; +#endif +TYPED_TEST_CASE(BlockManagerStressTest, BlockManagers); + +TYPED_TEST(BlockManagerStressTest, StressTest) { + OverrideFlagForSlowTests("test_duration_secs", "30"); + OverrideFlagForSlowTests("block_group_size", "16"); + OverrideFlagForSlowTests("block_group_bytes", + Substitute("$0", 64 * 1024 * 1024)); + OverrideFlagForSlowTests("num_bytes_per_write", + Substitute("$0", 64 * 1024)); + + if ((FLAGS_block_group_size & (FLAGS_block_group_size - 1)) != 0) { + LOG(FATAL) << "block_group_size " << FLAGS_block_group_size + << " is not a power of 2"; + } + + LOG(INFO) << "Running on fresh block manager"; + this->RunTest(FLAGS_test_duration_secs / 2); + LOG(INFO) << "Running on populated block manager"; + // Blow away old memtrackers before creating new block manager. 
+ this->bm_.reset(); + this->bm_.reset(this->CreateBlockManager()); + ASSERT_OK(this->bm_->Open()); + this->RunTest(FLAGS_test_duration_secs / 2); + + LOG(INFO) << "Printing test totals"; + LOG(INFO) << "--------------------"; + LOG(INFO) << Substitute("Wrote $0 blocks ($1 bytes) via $2 threads", + this->total_blocks_written_.Load(), + this->total_bytes_written_.Load(), + FLAGS_num_writer_threads); + LOG(INFO) << Substitute("Read $0 blocks ($1 bytes) via $2 threads", + this->total_blocks_read_.Load(), + this->total_bytes_read_.Load(), + FLAGS_num_reader_threads); + LOG(INFO) << Substitute("Deleted $0 blocks via $1 threads", + this->total_blocks_deleted_.Load(), + FLAGS_num_deleter_threads); +} + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/block_manager-test.cc b/src/kudu/fs/block_manager-test.cc new file mode 100644 index 000000000000..6f3d0a4c2558 --- /dev/null +++ b/src/kudu/fs/block_manager-test.cc @@ -0,0 +1,758 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + + +#include "kudu/fs/file_block_manager.h" +#include "kudu/fs/log_block_manager.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/random.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; + +// LogBlockManager opens two files per container, and CloseManyBlocksTest +// uses one container for each block. To simplify testing (i.e. no need to +// raise the ulimit on open files), the default is kept low. +DEFINE_int32(num_blocks_close, 500, + "Number of blocks to simultaneously close in CloseManyBlocksTest"); + +DECLARE_uint64(log_container_preallocate_bytes); +DECLARE_uint64(log_container_max_size); + +// Generic block manager metrics. +METRIC_DECLARE_gauge_uint64(block_manager_blocks_open_reading); +METRIC_DECLARE_gauge_uint64(block_manager_blocks_open_writing); +METRIC_DECLARE_counter(block_manager_total_writable_blocks); +METRIC_DECLARE_counter(block_manager_total_readable_blocks); +METRIC_DECLARE_counter(block_manager_total_bytes_written); +METRIC_DECLARE_counter(block_manager_total_bytes_read); + +// Log block manager metrics. 
+METRIC_DECLARE_gauge_uint64(log_block_manager_bytes_under_management); +METRIC_DECLARE_gauge_uint64(log_block_manager_blocks_under_management); +METRIC_DECLARE_counter(log_block_manager_containers); +METRIC_DECLARE_counter(log_block_manager_full_containers); + +namespace kudu { +namespace fs { + +template +class BlockManagerTest : public KuduTest { + public: + BlockManagerTest() : + bm_(CreateBlockManager(scoped_refptr(), + shared_ptr(), + { GetTestDataDirectory() })) { + } + + virtual void SetUp() OVERRIDE { + CHECK_OK(bm_->Create()); + CHECK_OK(bm_->Open()); + } + + protected: + T* CreateBlockManager(const scoped_refptr& metric_entity, + const shared_ptr& parent_mem_tracker, + const vector& paths) { + BlockManagerOptions opts; + opts.metric_entity = metric_entity; + opts.parent_mem_tracker = parent_mem_tracker; + opts.root_paths = paths; + return new T(env_.get(), opts); + } + + void ReopenBlockManager(const scoped_refptr& metric_entity, + const shared_ptr& parent_mem_tracker, + const vector& paths, + bool create) { + // Blow away old memtrackers first. + bm_.reset(); + bm_.reset(CreateBlockManager(metric_entity, parent_mem_tracker, paths)); + if (create) { + ASSERT_OK(bm_->Create()); + } + ASSERT_OK(bm_->Open()); + } + + void RunMultipathTest(const vector& paths); + + void RunLogMetricsTest(); + + void RunLogContainerPreallocationTest(); + + void RunMemTrackerTest(); + + gscoped_ptr bm_; +}; + +template <> +void BlockManagerTest::RunMultipathTest(const vector& paths) { + // Ensure that each path has an instance file and that it's well-formed. + for (const string& path : paths) { + vector children; + ASSERT_OK(env_->GetChildren(path, &children)); + ASSERT_EQ(3, children.size()); + for (const string& child : children) { + if (child == "." || child == "..") { + continue; + } + PathInstanceMetadataPB instance; + ASSERT_OK(pb_util::ReadPBContainerFromPath(env_.get(), + JoinPathSegments(path, child), + &instance)); + } + } + + // Write ten blocks. 
+ const char* kTestData = "test data"; + for (int i = 0; i < 10; i++) { + gscoped_ptr written_block; + ASSERT_OK(bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Append(kTestData)); + ASSERT_OK(written_block->Close()); + } + + // Each path should now have some additional block subdirectories. We + // can't know for sure exactly how many (depends on the block IDs + // generated), but this ensures that at least some change were made. + for (const string& path : paths) { + vector children; + ASSERT_OK(env_->GetChildren(path, &children)); + ASSERT_GT(children.size(), 3); + } +} + +template <> +void BlockManagerTest::RunMultipathTest(const vector& paths) { + // Write (3 * numPaths * 2) blocks, in groups of (numPaths * 2). That should + // yield two containers per path. + const char* kTestData = "test data"; + for (int i = 0; i < 3; i++) { + ScopedWritableBlockCloser closer; + for (int j = 0; j < paths.size() * 2; j++) { + gscoped_ptr block; + ASSERT_OK(bm_->CreateBlock(&block)); + ASSERT_OK(block->Append(kTestData)); + closer.AddBlock(block.Pass()); + } + ASSERT_OK(closer.CloseBlocks()); + } + + // Verify the results: 7 children = dot, dotdot, instance file, and two + // containers (two files per container). 
+ for (const string& path : paths) { + vector children; + ASSERT_OK(env_->GetChildren(path, &children)); + ASSERT_EQ(children.size(), 7); + } +} + +template <> +void BlockManagerTest::RunLogMetricsTest() { + LOG(INFO) << "Test skipped; wrong block manager"; +} + +static void CheckLogMetrics(const scoped_refptr& entity, + int bytes_under_management, int blocks_under_management, + int containers, int full_containers) { + ASSERT_EQ(bytes_under_management, down_cast*>( + entity->FindOrNull(METRIC_log_block_manager_bytes_under_management) + .get())->value()); + ASSERT_EQ(blocks_under_management, down_cast*>( + entity->FindOrNull(METRIC_log_block_manager_blocks_under_management) + .get())->value()); + ASSERT_EQ(containers, down_cast( + entity->FindOrNull(METRIC_log_block_manager_containers) + .get())->value()); + ASSERT_EQ(full_containers, down_cast( + entity->FindOrNull(METRIC_log_block_manager_full_containers) + .get())->value()); +} + +template <> +void BlockManagerTest::RunLogMetricsTest() { + MetricRegistry registry; + scoped_refptr entity = METRIC_ENTITY_server.Instantiate(®istry, "test"); + this->ReopenBlockManager(entity, + shared_ptr(), + { GetTestDataDirectory() }, + false); + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(entity, 0, 0, 0, 0)); + + // Lower the max container size so that we can more easily test full + // container metrics. + FLAGS_log_container_max_size = 1024; + + // One block --> one container. + gscoped_ptr writer; + ASSERT_OK(this->bm_->CreateBlock(&writer)); + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(entity, 0, 0, 1, 0)); + + // And when the block is closed, it becomes "under management". + ASSERT_OK(writer->Close()); + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(entity, 0, 1, 1, 0)); + + // Create 10 blocks concurrently. We reuse the existing container and + // create 9 new ones. All of them get filled. 
+ BlockId saved_id; + { + Random rand(SeedRandom()); + ScopedWritableBlockCloser closer; + for (int i = 0; i < 10; i++) { + gscoped_ptr b; + ASSERT_OK(this->bm_->CreateBlock(&b)); + if (saved_id.IsNull()) { + saved_id = b->id(); + } + uint8_t data[1024]; + for (int i = 0; i < sizeof(data); i += sizeof(uint32_t)) { + data[i] = rand.Next(); + } + b->Append(Slice(data, sizeof(data))); + closer.AddBlock(b.Pass()); + } + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(entity, 0, 1, 10, 0)); + + // Only when the blocks are closed are the containers considered full. + ASSERT_OK(closer.CloseBlocks()); + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(entity, 10 * 1024, 11, 10, 10)); + } + + // Reopen the block manager and test the metrics. They're all based on + // persistent information so they should be the same. + MetricRegistry new_registry; + scoped_refptr new_entity = METRIC_ENTITY_server.Instantiate(&new_registry, "test"); + ASSERT_NO_FATAL_FAILURE(this->ReopenBlockManager(new_entity, + shared_ptr(), + { GetTestDataDirectory() }, + false)); + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(new_entity, 10 * 1024, 11, 10, 10)); + + // Delete a block. Its contents should no longer be under management. + ASSERT_OK(this->bm_->DeleteBlock(saved_id)); + ASSERT_NO_FATAL_FAILURE(CheckLogMetrics(new_entity, 9 * 1024, 10, 10, 10)); +} + +template <> +void BlockManagerTest::RunLogContainerPreallocationTest() { + LOG(INFO) << "Test skipped; wrong block manager"; +} + +template <> +void BlockManagerTest::RunLogContainerPreallocationTest() { + // Create a block with some test data. This should also trigger + // preallocation of the container, provided it's supported by the kernel. + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Close()); + + // Now reopen the block manager and create another block. More + // preallocation, but it should be from the end of the previous block, + // not from the end of the file. 
+ ASSERT_NO_FATAL_FAILURE(this->ReopenBlockManager(scoped_refptr(), + shared_ptr(), + { GetTestDataDirectory() }, + false)); + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Close()); + + // dot, dotdot, test metadata, instance file, and one container file pair. + vector children; + ASSERT_OK(this->env_->GetChildren(GetTestDataDirectory(), &children)); + ASSERT_EQ(6, children.size()); + + // If preallocation was done from the end of the file (rather than the + // end of the previous block), the file's size would be twice the + // preallocation amount. That would be wrong. + // + // Instead, we expect the size to either be 0 (preallocation isn't + // supported) or equal to the preallocation amount. + bool found = false; + for (const string& child : children) { + if (HasSuffixString(child, ".data")) { + found = true; + uint64_t size; + ASSERT_OK(this->env_->GetFileSizeOnDisk( + JoinPathSegments(GetTestDataDirectory(), child), &size)); + ASSERT_TRUE(size == 0 || size == FLAGS_log_container_preallocate_bytes); + } + } + ASSERT_TRUE(found); +} + +template <> +void BlockManagerTest::RunMemTrackerTest() { + shared_ptr tracker = MemTracker::CreateTracker(-1, "test tracker"); + ASSERT_NO_FATAL_FAILURE(this->ReopenBlockManager(scoped_refptr(), + tracker, + { GetTestDataDirectory() }, + false)); + + // The file block manager does not allocate memory for persistent data. + int64_t initial_mem = tracker->consumption(); + ASSERT_EQ(initial_mem, 0); + gscoped_ptr writer; + ASSERT_OK(this->bm_->CreateBlock(&writer)); + ASSERT_OK(writer->Close()); + ASSERT_EQ(tracker->consumption(), initial_mem); +} + +template <> +void BlockManagerTest::RunMemTrackerTest() { + shared_ptr tracker = MemTracker::CreateTracker(-1, "test tracker"); + ASSERT_NO_FATAL_FAILURE(this->ReopenBlockManager(scoped_refptr(), + tracker, + { GetTestDataDirectory() }, + false)); + + // The initial consumption should be non-zero due to the block map. 
+ int64_t initial_mem = tracker->consumption(); + ASSERT_GT(initial_mem, 0); + + // Allocating a persistent block should increase the consumption. + gscoped_ptr writer; + ASSERT_OK(this->bm_->CreateBlock(&writer)); + ASSERT_OK(writer->Close()); + ASSERT_GT(tracker->consumption(), initial_mem); +} + +// What kinds of BlockManagers are supported? +#if defined(__linux__) +typedef ::testing::Types BlockManagers; +#else +typedef ::testing::Types BlockManagers; +#endif +TYPED_TEST_CASE(BlockManagerTest, BlockManagers); + +// Test the entire lifecycle of a block. +TYPED_TEST(BlockManagerTest, EndToEndTest) { + // Create a block. + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + + // Write some data to it. + string test_data = "test data"; + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->Close()); + + // Read the data back. + gscoped_ptr read_block; + ASSERT_OK(this->bm_->OpenBlock(written_block->id(), &read_block)); + uint64_t sz; + ASSERT_OK(read_block->Size(&sz)); + ASSERT_EQ(test_data.length(), sz); + Slice data; + gscoped_ptr scratch(new uint8_t[test_data.length()]); + ASSERT_OK(read_block->Read(0, test_data.length(), &data, scratch.get())); + ASSERT_EQ(test_data, data); + + // Delete the block. + ASSERT_OK(this->bm_->DeleteBlock(written_block->id())); + ASSERT_TRUE(this->bm_->OpenBlock(written_block->id(), nullptr) + .IsNotFound()); +} + +// Test that we can still read from an opened block after deleting it +// (even if we can't open it again). +TYPED_TEST(BlockManagerTest, ReadAfterDeleteTest) { + // Write a new block. + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + string test_data = "test data"; + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->Close()); + + // Open it for reading, then delete it. Subsequent opens should fail. 
+ gscoped_ptr read_block; + ASSERT_OK(this->bm_->OpenBlock(written_block->id(), &read_block)); + ASSERT_OK(this->bm_->DeleteBlock(written_block->id())); + ASSERT_TRUE(this->bm_->OpenBlock(written_block->id(), nullptr) + .IsNotFound()); + + // But we should still be able to read from the opened block. + Slice data; + gscoped_ptr scratch(new uint8_t[test_data.length()]); + ASSERT_OK(read_block->Read(0, test_data.length(), &data, scratch.get())); + ASSERT_EQ(test_data, data); +} + +TYPED_TEST(BlockManagerTest, CloseTwiceTest) { + // Create a new block and close it repeatedly. + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Close()); + ASSERT_OK(written_block->Close()); + + // Open it for reading and close it repeatedly. + gscoped_ptr read_block; + ASSERT_OK(this->bm_->OpenBlock(written_block->id(), &read_block)); + ASSERT_OK(read_block->Close()); + ASSERT_OK(read_block->Close()); +} + +TYPED_TEST(BlockManagerTest, CloseManyBlocksTest) { + if (!AllowSlowTests()) { + LOG(INFO) << "Not running in slow-tests mode"; + return; + } + + // Disable preallocation for this test as it creates many containers. + FLAGS_log_container_preallocate_bytes = 0; + + Random rand(SeedRandom()); + vector dirty_blocks; + ElementDeleter deleter(&dirty_blocks); + LOG(INFO) << "Creating " << FLAGS_num_blocks_close << " blocks"; + for (int i = 0; i < FLAGS_num_blocks_close; i++) { + // Create a block. + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + + // Write 64k bytes of random data into it. 
+ uint8_t data[65536]; + for (int i = 0; i < sizeof(data); i += sizeof(uint32_t)) { + data[i] = rand.Next(); + } + written_block->Append(Slice(data, sizeof(data))); + + dirty_blocks.push_back(written_block.release()); + } + + LOG_TIMING(INFO, Substitute("closing $0 blocks", FLAGS_num_blocks_close)) { + ASSERT_OK(this->bm_->CloseBlocks(dirty_blocks)); + } +} + +// We can't really test that FlushDataAsync() "works", but we can test that +// it doesn't break anything. +TYPED_TEST(BlockManagerTest, FlushDataAsyncTest) { + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + string test_data = "test data"; + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->FlushDataAsync()); +} + +TYPED_TEST(BlockManagerTest, WritableBlockStateTest) { + gscoped_ptr written_block; + + // Common flow: CLEAN->DIRTY->CLOSED. + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_EQ(WritableBlock::CLEAN, written_block->state()); + string test_data = "test data"; + ASSERT_OK(written_block->Append(test_data)); + ASSERT_EQ(WritableBlock::DIRTY, written_block->state()); + ASSERT_OK(written_block->Append(test_data)); + ASSERT_EQ(WritableBlock::DIRTY, written_block->state()); + ASSERT_OK(written_block->Close()); + ASSERT_EQ(WritableBlock::CLOSED, written_block->state()); + + // Test FLUSHING->CLOSED transition. + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->FlushDataAsync()); + ASSERT_EQ(WritableBlock::FLUSHING, written_block->state()); + ASSERT_OK(written_block->Close()); + ASSERT_EQ(WritableBlock::CLOSED, written_block->state()); + + // Test CLEAN->CLOSED transition. + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Close()); + ASSERT_EQ(WritableBlock::CLOSED, written_block->state()); + + // Test FlushDataAsync() no-op. 
+ ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->FlushDataAsync()); + ASSERT_EQ(WritableBlock::FLUSHING, written_block->state()); + + // Test DIRTY->CLOSED transition. + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->Close()); + ASSERT_EQ(WritableBlock::CLOSED, written_block->state()); +} + +TYPED_TEST(BlockManagerTest, AbortTest) { + gscoped_ptr written_block; + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + string test_data = "test data"; + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->Abort()); + ASSERT_EQ(WritableBlock::CLOSED, written_block->state()); + ASSERT_TRUE(this->bm_->OpenBlock(written_block->id(), nullptr) + .IsNotFound()); + + ASSERT_OK(this->bm_->CreateBlock(&written_block)); + ASSERT_OK(written_block->Append(test_data)); + ASSERT_OK(written_block->FlushDataAsync()); + ASSERT_OK(written_block->Abort()); + ASSERT_EQ(WritableBlock::CLOSED, written_block->state()); + ASSERT_TRUE(this->bm_->OpenBlock(written_block->id(), nullptr) + .IsNotFound()); +} + +TYPED_TEST(BlockManagerTest, PersistenceTest) { + // Create three blocks: + // 1. Empty. + // 2. Non-empty. + // 3. Deleted. + gscoped_ptr written_block1; + gscoped_ptr written_block2; + gscoped_ptr written_block3; + ASSERT_OK(this->bm_->CreateBlock(&written_block1)); + ASSERT_OK(written_block1->Close()); + ASSERT_OK(this->bm_->CreateBlock(&written_block2)); + string test_data = "test data"; + ASSERT_OK(written_block2->Append(test_data)); + ASSERT_OK(written_block2->Close()); + ASSERT_OK(this->bm_->CreateBlock(&written_block3)); + ASSERT_OK(written_block3->Append(test_data)); + ASSERT_OK(written_block3->Close()); + ASSERT_OK(this->bm_->DeleteBlock(written_block3->id())); + + // Reopen the block manager. This may read block metadata from disk. 
+ // + // The existing block manager is left open, which proxies for the process + // having crashed without cleanly shutting down the block manager. The + // on-disk metadata should still be clean. + gscoped_ptr new_bm(this->CreateBlockManager( + scoped_refptr(), + MemTracker::CreateTracker(-1, "other tracker"), + { GetTestDataDirectory() })); + ASSERT_OK(new_bm->Open()); + + // Test that the state of all three blocks is properly reflected. + gscoped_ptr read_block; + ASSERT_OK(new_bm->OpenBlock(written_block1->id(), &read_block)); + uint64_t sz; + ASSERT_OK(read_block->Size(&sz)); + ASSERT_EQ(0, sz); + ASSERT_OK(read_block->Close()); + ASSERT_OK(new_bm->OpenBlock(written_block2->id(), &read_block)); + ASSERT_OK(read_block->Size(&sz)); + ASSERT_EQ(test_data.length(), sz); + Slice data; + gscoped_ptr scratch(new uint8_t[test_data.length()]); + ASSERT_OK(read_block->Read(0, test_data.length(), &data, scratch.get())); + ASSERT_EQ(test_data, data); + ASSERT_OK(read_block->Close()); + ASSERT_TRUE(new_bm->OpenBlock(written_block3->id(), nullptr) + .IsNotFound()); +} + +TYPED_TEST(BlockManagerTest, MultiPathTest) { + // Recreate the block manager with three paths. + vector paths; + for (int i = 0; i < 3; i++) { + paths.push_back(this->GetTestPath(Substitute("path$0", i))); + } + ASSERT_NO_FATAL_FAILURE(this->ReopenBlockManager( + scoped_refptr(), + shared_ptr(), + paths, + true)); + + ASSERT_NO_FATAL_FAILURE(this->RunMultipathTest(paths)); +} + +static void CloseHelper(ReadableBlock* block) { + CHECK_OK(block->Close()); +} + +// Tests that ReadableBlock::Close() is thread-safe and idempotent. 
+TYPED_TEST(BlockManagerTest, ConcurrentCloseReadableBlockTest) { + gscoped_ptr writer; + ASSERT_OK(this->bm_->CreateBlock(&writer)); + ASSERT_OK(writer->Close()); + + gscoped_ptr reader; + ASSERT_OK(this->bm_->OpenBlock(writer->id(), &reader)); + + vector > threads; + for (int i = 0; i < 100; i++) { + scoped_refptr t; + ASSERT_OK(Thread::Create("test", Substitute("t$0", i), + &CloseHelper, reader.get(), &t)); + threads.push_back(t); + } + for (const scoped_refptr& t : threads) { + t->Join(); + } +} + +static void CheckMetrics(const scoped_refptr& metrics, + int blocks_open_reading, int blocks_open_writing, + int total_readable_blocks, int total_writable_blocks, + int total_bytes_read, int total_bytes_written) { + ASSERT_EQ(blocks_open_reading, down_cast*>( + metrics->FindOrNull(METRIC_block_manager_blocks_open_reading).get())->value()); + ASSERT_EQ(blocks_open_writing, down_cast*>( + metrics->FindOrNull(METRIC_block_manager_blocks_open_writing).get())->value()); + ASSERT_EQ(total_readable_blocks, down_cast( + metrics->FindOrNull(METRIC_block_manager_total_readable_blocks).get())->value()); + ASSERT_EQ(total_writable_blocks, down_cast( + metrics->FindOrNull(METRIC_block_manager_total_writable_blocks).get())->value()); + ASSERT_EQ(total_bytes_read, down_cast( + metrics->FindOrNull(METRIC_block_manager_total_bytes_read).get())->value()); + ASSERT_EQ(total_bytes_written, down_cast( + metrics->FindOrNull(METRIC_block_manager_total_bytes_written).get())->value()); +} + +TYPED_TEST(BlockManagerTest, MetricsTest) { + const string kTestData = "test data"; + MetricRegistry registry; + scoped_refptr entity = METRIC_ENTITY_server.Instantiate(®istry, "test"); + ASSERT_NO_FATAL_FAILURE(this->ReopenBlockManager(entity, + shared_ptr(), + { GetTestDataDirectory() }, + false)); + ASSERT_NO_FATAL_FAILURE(CheckMetrics(entity, 0, 0, 0, 0, 0, 0)); + + for (int i = 0; i < 3; i++) { + gscoped_ptr writer; + gscoped_ptr reader; + + // An open writer. 
Also reflected in total_writable_blocks. + ASSERT_OK(this->bm_->CreateBlock(&writer)); + ASSERT_NO_FATAL_FAILURE(CheckMetrics( + entity, 0, 1, i, i + 1, + i * kTestData.length(), i * kTestData.length())); + + // Block is no longer opened for writing, but its data + // is now reflected in total_bytes_written. + ASSERT_OK(writer->Append(kTestData)); + ASSERT_OK(writer->Close()); + ASSERT_NO_FATAL_FAILURE(CheckMetrics( + entity, 0, 0, i, i + 1, + i * kTestData.length(), (i + 1) * kTestData.length())); + + // An open reader. + ASSERT_OK(this->bm_->OpenBlock(writer->id(), &reader)); + ASSERT_NO_FATAL_FAILURE(CheckMetrics( + entity, 1, 0, i + 1, i + 1, + i * kTestData.length(), (i + 1) * kTestData.length())); + + // The read is reflected in total_bytes_read. + Slice data; + gscoped_ptr scratch(new uint8_t[kTestData.length()]); + ASSERT_OK(reader->Read(0, kTestData.length(), &data, scratch.get())); + ASSERT_NO_FATAL_FAILURE(CheckMetrics( + entity, 1, 0, i + 1, i + 1, + (i + 1) * kTestData.length(), (i + 1) * kTestData.length())); + + // The reader is now gone. + ASSERT_OK(reader->Close()); + ASSERT_NO_FATAL_FAILURE(CheckMetrics( + entity, 0, 0, i + 1, i + 1, + (i + 1) * kTestData.length(), (i + 1) * kTestData.length())); + } +} + +TYPED_TEST(BlockManagerTest, LogMetricsTest) { + ASSERT_NO_FATAL_FAILURE(this->RunLogMetricsTest()); +} + +TYPED_TEST(BlockManagerTest, LogContainerPreallocationTest) { + ASSERT_NO_FATAL_FAILURE(this->RunLogContainerPreallocationTest()); +} + +TYPED_TEST(BlockManagerTest, MemTrackerTest) { + ASSERT_NO_FATAL_FAILURE(this->RunMemTrackerTest()); +} + +// The LogBlockManager is only supported on Linux, since it requires hole punching. +#if defined(__linux__) +// LogBlockManager-specific tests +class LogBlockManagerTest : public BlockManagerTest { +}; + +// Regression test for KUDU-1190, a crash at startup when a block ID has been +// reused. 
+TEST_F(LogBlockManagerTest, TestReuseBlockIds) { + // Set a deterministic random seed, so that we can reproduce the sequence + // of random numbers. + bm_->rand_.Reset(1); + vector block_ids; + + // Create 4 containers, with the first four block IDs in the random sequence. + { + ScopedWritableBlockCloser closer; + for (int i = 0; i < 4; i++) { + gscoped_ptr writer; + ASSERT_OK(bm_->CreateBlock(&writer)); + block_ids.push_back(writer->id()); + closer.AddBlock(writer.Pass()); + } + ASSERT_OK(closer.CloseBlocks()); + } + + // Create one more block, which should reuse the first container. + { + gscoped_ptr writer; + ASSERT_OK(bm_->CreateBlock(&writer)); + ASSERT_OK(writer->Close()); + } + + ASSERT_EQ(4, bm_->available_containers_.size()); + + // Delete the original blocks. + for (const BlockId& b : block_ids) { + ASSERT_OK(bm_->DeleteBlock(b)); + } + + // Reset the random seed and re-create new blocks which should reuse the same + // block IDs (allowed since the blocks were deleted). + bm_->rand_.Reset(1); + for (int i = 0; i < 4; i++) { + gscoped_ptr writer; + ASSERT_OK(bm_->CreateBlock(&writer)); + ASSERT_EQ(writer->id(), block_ids[i]); + ASSERT_OK(writer->Close()); + } + + // Now we have 4 containers with the following metadata: + // 1: CREATE(1) CREATE (5) DELETE(1) CREATE(4) + // 2: CREATE(2) DELETE(2) CREATE(1) + // 3: CREATE(3) DELETE(3) CREATE(2) + // 4: CREATE(4) DELETE(4) CREATE(3) + + // Re-open the block manager and make sure it can deal with this case where + // block IDs have been reused. 
+ NO_FATALS(ReopenBlockManager( + scoped_refptr(), + shared_ptr(), + { GetTestDataDirectory() }, + false)); +} +#endif // defined(__linux__) + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/block_manager.cc b/src/kudu/fs/block_manager.cc new file mode 100644 index 000000000000..1add6d14b990 --- /dev/null +++ b/src/kudu/fs/block_manager.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/fs/block_manager.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" + +// The default value is optimized for the case where: +// 1. the cfile blocks are colocated with the WALs. +// 2. The underlying hardware is a spinning disk. +// 3. The underlying filesystem is either XFS or EXT4. +// 4. cfile_do_on_finish is 'close' (see cfile/cfile_writer.cc). +// +// When all conditions hold, this value ensures low latency for WAL writes. +DEFINE_bool(block_coalesce_close, false, + "Coalesce synchronization of data during CloseBlocks()"); +TAG_FLAG(block_coalesce_close, experimental); + +DEFINE_bool(block_manager_lock_dirs, true, + "Lock the data block directories to prevent concurrent usage. 
" + "Note that read-only concurrent usage is still allowed."); +TAG_FLAG(block_manager_lock_dirs, unsafe); + +namespace kudu { +namespace fs { + +const char* BlockManager::kInstanceMetadataFileName = "block_manager_instance"; + +BlockManagerOptions::BlockManagerOptions() + : read_only(false) { +} + +BlockManagerOptions::~BlockManagerOptions() { +} + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/block_manager.h b/src/kudu/fs/block_manager.h new file mode 100644 index 000000000000..cc5c5a152d45 --- /dev/null +++ b/src/kudu/fs/block_manager.h @@ -0,0 +1,280 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_FS_BLOCK_MANAGER_H +#define KUDU_FS_BLOCK_MANAGER_H + +#include +#include +#include +#include +#include + +#include "kudu/fs/block_id.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" + +DECLARE_bool(block_coalesce_close); + +namespace kudu { + +class MemTracker; +class MetricEntity; +class Slice; + +namespace fs { + +class BlockManager; + +// The smallest unit of Kudu data that is backed by the local filesystem. 
+// +// The block interface reflects Kudu on-disk storage design principles: +// - Blocks are append only. +// - Blocks are immutable once written. +// - Blocks opened for reading are thread-safe and may be used by multiple +// concurrent readers. +// - Blocks opened for writing are not thread-safe. +class Block { + public: + virtual ~Block() {} + + // Returns the identifier for this block. + virtual const BlockId& id() const = 0; +}; + +// A block that has been opened for writing. There may only be a single +// writing thread, and data may only be appended to the block. +// +// Close() is an expensive operation, as it must flush both dirty block data +// and metadata to disk. The block manager API provides two ways to improve +// Close() performance: +// 1. FlushDataAsync() before Close(). If there's enough work to be done +// between the two calls, there will be less outstanding I/O to wait for +// during Close(). +// 2. CloseBlocks() on a group of blocks. This at least ensures that, when +// waiting on outstanding I/O, the waiting is done in parallel. +// +// NOTE: if a WritableBlock is not explicitly Close()ed, it will be aborted +// (i.e. deleted). +class WritableBlock : public Block { + public: + enum State { + // There is no dirty data in the block. + CLEAN, + + // There is some dirty data in the block. + DIRTY, + + // There is an outstanding flush operation asynchronously flushing + // dirty block data to disk. + FLUSHING, + + // The block is closed. No more operations can be performed on it. + CLOSED + }; + + // Destroy the WritableBlock. If it was not explicitly closed using Close(), + // this will Abort() the block. + virtual ~WritableBlock() {} + + // Destroys the in-memory representation of the block and synchronizes + // dirty block data and metadata with the disk. On success, guarantees + // that the entire block is durable. + virtual Status Close() = 0; + + // Like Close() but does not synchronize dirty data or metadata to disk. 
+ // Meaning, after a successful Abort(), the block no longer exists. + virtual Status Abort() = 0; + + // Get a pointer back to this block's manager. + virtual BlockManager* block_manager() const = 0; + + // Appends the chunk of data referenced by 'data' to the block. + // + // Does not guarantee durability of 'data'; Close() must be called for all + // outstanding data to reach the disk. + virtual Status Append(const Slice& data) = 0; + + // Begins an asynchronous flush of dirty block data to disk. + // + // This is purely a performance optimization for Close(); if there is + // other work to be done between the final Append() and the future + // Close(), FlushDataAsync() will reduce the amount of time spent waiting + // for outstanding I/O to complete in Close(). This is analogous to + // readahead or prefetching. + // + // Data may not be written to the block after FlushDataAsync() is called. + virtual Status FlushDataAsync() = 0; + + // Returns the number of bytes successfully appended via Append(). + virtual size_t BytesAppended() const = 0; + + virtual State state() const = 0; +}; + +// A block that has been opened for reading. Multiple in-memory blocks may +// be constructed for the same logical block, and the same in-memory block +// may be shared amongst threads for concurrent reading. +class ReadableBlock : public Block { + public: + virtual ~ReadableBlock() {} + + // Destroys the in-memory representation of the block. + virtual Status Close() = 0; + + // Returns the on-disk size of a written block. + virtual Status Size(uint64_t* sz) const = 0; + + // Reads exactly 'length' bytes beginning from 'offset' in the block, + // returning an error if fewer bytes exist. A slice referencing the + // results is written to 'result' and may be backed by memory in + // 'scratch'. As such, 'scratch' must be at least 'length' in size and + // must remain alive while 'result' is used. + // + // Does not modify 'result' on error (but may modify 'scratch'). 
+ virtual Status Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const = 0; + + // Returns the memory usage of this object including the object itself. + virtual size_t memory_footprint() const = 0; +}; + +// Provides options and hints for block placement. +struct CreateBlockOptions { +}; + +// Block manager creation options. +struct BlockManagerOptions { + BlockManagerOptions(); + ~BlockManagerOptions(); + + // The entity under which all metrics should be grouped. If NULL, metrics + // will not be produced. + // + // Defaults to NULL. + scoped_refptr metric_entity; + + // The memory tracker under which all new memory trackers will be parented. + // If NULL, new memory trackers will be parented to the root tracker. + std::shared_ptr parent_mem_tracker; + + // The paths where data blocks will be stored. Cannot be empty. + std::vector root_paths; + + // Whether the block manager should only allow reading. Defaults to false. + bool read_only; +}; + +// Utilities for Kudu block lifecycle management. All methods are +// thread-safe. +class BlockManager { + public: + virtual ~BlockManager() {} + + // Creates a new on-disk representation for this block manager. Must be + // followed up with a call to Open() to use the block manager. + // + // Returns an error if one already exists or cannot be created. + virtual Status Create() = 0; + + // Opens an existing on-disk representation of this block manager. + // + // Returns an error if one does not exist or cannot be opened. + virtual Status Open() = 0; + + // Creates a new block using the provided options and opens it for + // writing. The block's ID will be generated. + // + // Does not guarantee the durability of the block; it must be closed to + // ensure that it reaches disk. + // + // Does not modify 'block' on error. + virtual Status CreateBlock(const CreateBlockOptions& opts, + gscoped_ptr* block) = 0; + + // Like the above but uses default options. 
+ virtual Status CreateBlock(gscoped_ptr* block) = 0; + + // Opens an existing block for reading. + // + // Does not modify 'block' on error. + virtual Status OpenBlock(const BlockId& block_id, + gscoped_ptr* block) = 0; + + // Deletes an existing block, allowing its space to be reclaimed by the + // filesystem. The change is immediately made durable. + // + // Blocks may be deleted while they are open for reading or writing; + // the actual deletion will take place after the last open reader or + // writer is closed. + virtual Status DeleteBlock(const BlockId& block_id) = 0; + + // Closes (and fully synchronizes) the given blocks. Effectively like + // Close() for each block but may be optimized for groups of blocks. + // + // On success, guarantees that outstanding data is durable. + virtual Status CloseBlocks(const std::vector& blocks) = 0; + + protected: + static const char* kInstanceMetadataFileName; +}; + +// Closes a group of blocks. +// +// Blocks must be closed explicitly via CloseBlocks(), otherwise they will +// be deleted in the in the destructor. +class ScopedWritableBlockCloser { + public: + ScopedWritableBlockCloser() {} + + ~ScopedWritableBlockCloser() { + for (WritableBlock* block : blocks_) { + WARN_NOT_OK(block->Abort(), strings::Substitute( + "Failed to abort block with id $0", block->id().ToString())); + } + STLDeleteElements(&blocks_); + } + + void AddBlock(gscoped_ptr block) { + blocks_.push_back(block.release()); + } + + Status CloseBlocks() { + if (blocks_.empty()) { + return Status::OK(); + } + ElementDeleter deleter(&blocks_); + + // We assume every block is using the same block manager, so any + // block's manager will do. 
+ BlockManager* bm = blocks_[0]->block_manager(); + return bm->CloseBlocks(blocks_); + } + + const std::vector& blocks() const { return blocks_; } + + private: + std::vector blocks_; +}; + +} // namespace fs +} // namespace kudu + +#endif diff --git a/src/kudu/fs/block_manager_metrics.cc b/src/kudu/fs/block_manager_metrics.cc new file mode 100644 index 000000000000..c53f26f00a6b --- /dev/null +++ b/src/kudu/fs/block_manager_metrics.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/fs/block_manager_metrics.h" + +#include "kudu/util/metrics.h" + +METRIC_DEFINE_gauge_uint64(server, block_manager_blocks_open_reading, + "Data Blocks Open For Read", + kudu::MetricUnit::kBlocks, + "Number of data blocks currently open for reading"); + +METRIC_DEFINE_gauge_uint64(server, block_manager_blocks_open_writing, + "Data Blocks Open For Write", + kudu::MetricUnit::kBlocks, + "Number of data blocks currently open for writing"); + +METRIC_DEFINE_counter(server, block_manager_total_writable_blocks, + "Data Blocks Opened For Write", + kudu::MetricUnit::kBlocks, + "Number of data blocks opened for writing since service start"); + +METRIC_DEFINE_counter(server, block_manager_total_readable_blocks, + "Data Blocks Opened For Read", + kudu::MetricUnit::kBlocks, + "Number of data blocks opened for reading since service start"); + +METRIC_DEFINE_counter(server, block_manager_total_bytes_written, + "Block Data Bytes Written", + kudu::MetricUnit::kBytes, + "Number of bytes of block data written since service start"); + +METRIC_DEFINE_counter(server, block_manager_total_bytes_read, + "Block Data Bytes Read", + kudu::MetricUnit::kBytes, + "Number of bytes of block data read since service start"); + +namespace kudu { +namespace fs { +namespace internal { + +#define MINIT(x) x(METRIC_block_manager_##x.Instantiate(entity)) +#define GINIT(x) x(METRIC_block_manager_##x.Instantiate(entity, 0)) +BlockManagerMetrics::BlockManagerMetrics(const scoped_refptr& entity) + : GINIT(blocks_open_reading), + GINIT(blocks_open_writing), + MINIT(total_readable_blocks), + MINIT(total_writable_blocks), + MINIT(total_bytes_read), + MINIT(total_bytes_written) { +} +#undef GINIT +#undef MINIT + +} // namespace internal +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/block_manager_metrics.h b/src/kudu/fs/block_manager_metrics.h new file mode 100644 index 000000000000..926b9c72fd18 --- /dev/null +++ b/src/kudu/fs/block_manager_metrics.h @@ -0,0 +1,50 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_FS_BLOCK_MANAGER_METRICS_H +#define KUDU_FS_BLOCK_MANAGER_METRICS_H + +#include + +#include "kudu/gutil/ref_counted.h" + +namespace kudu { + +class Counter; +template +class AtomicGauge; +class MetricEntity; + +namespace fs { +namespace internal { + +struct BlockManagerMetrics { + explicit BlockManagerMetrics(const scoped_refptr& metric_entity); + + scoped_refptr > blocks_open_reading; + scoped_refptr > blocks_open_writing; + + scoped_refptr total_readable_blocks; + scoped_refptr total_writable_blocks; + scoped_refptr total_bytes_read; + scoped_refptr total_bytes_written; +}; + +} // namespace internal +} // namespace fs +} // namespace kudu + +#endif // KUDU_FS_BLOCK_MANAGER_METRICS_H diff --git a/src/kudu/fs/block_manager_util-test.cc b/src/kudu/fs/block_manager_util-test.cc new file mode 100644 index 000000000000..cc68380135de --- /dev/null +++ b/src/kudu/fs/block_manager_util-test.cc @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/fs/block_manager_util.h" + +#include +#include + +#include +#include + +#include "kudu/fs/fs.pb.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/path_util.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace fs { + +using google::protobuf::RepeatedPtrField; +using std::string; +using std::vector; +using strings::Substitute; + +TEST_F(KuduTest, Lifecycle) { + string kType = "asdf"; + string kFileName = JoinPathSegments(GetTestDataDirectory(), "foo"); + string kUuid = "a_uuid"; + + // Test that the metadata file was created. + { + PathInstanceMetadataFile file(env_.get(), kType, kFileName); + ASSERT_OK(file.Create(kUuid, { kUuid })); + } + ASSERT_TRUE(env_->FileExists(kFileName)); + + // Test that we could open and parse it. + { + PathInstanceMetadataFile file(env_.get(), kType, kFileName); + ASSERT_OK(file.LoadFromDisk()); + PathInstanceMetadataPB* md = file.metadata(); + ASSERT_EQ(kType, md->block_manager_type()); + const PathSetPB& path_set = md->path_set(); + ASSERT_EQ(kUuid, path_set.uuid()); + ASSERT_EQ(1, path_set.all_uuids_size()); + ASSERT_EQ(kUuid, path_set.all_uuids(0)); + } + + // Test that expecting a different type of block manager fails. 
+ { + PathInstanceMetadataFile file(env_.get(), "other type", kFileName); + PathInstanceMetadataPB pb; + ASSERT_TRUE(file.LoadFromDisk().IsIOError()); + } +} + +TEST_F(KuduTest, Locking) { + string kType = "asdf"; + string kFileName = JoinPathSegments(GetTestDataDirectory(), "foo"); + string kUuid = "a_uuid"; + + PathInstanceMetadataFile file(env_.get(), kType, kFileName); + ASSERT_OK(file.Create(kUuid, { kUuid })); + + PathInstanceMetadataFile first(env_.get(), kType, kFileName); + ASSERT_OK(first.LoadFromDisk()); + ASSERT_OK(first.Lock()); + + ASSERT_DEATH({ + PathInstanceMetadataFile second(env_.get(), kType, kFileName); + CHECK_OK(second.LoadFromDisk()); + CHECK_OK(second.Lock()); + }, "Could not lock"); + + ASSERT_OK(first.Unlock()); + ASSERT_DEATH({ + PathInstanceMetadataFile second(env_.get(), kType, kFileName); + CHECK_OK(second.LoadFromDisk()); + Status s = second.Lock(); + if (s.ok()) { + LOG(FATAL) << "Lock successfully acquired"; + } else { + LOG(FATAL) << "Could not lock: " << s.ToString(); + } + }, "Lock successfully acquired"); +} + +static void RunCheckIntegrityTest(Env* env, + const vector& path_sets, + const string& expected_status_string) { + vector instances; + ElementDeleter deleter(&instances); + + int i = 0; + for (const PathSetPB& ps : path_sets) { + gscoped_ptr instance( + new PathInstanceMetadataFile(env, "asdf", Substitute("$0", i))); + gscoped_ptr metadata(new PathInstanceMetadataPB()); + metadata->set_block_manager_type("asdf"); + metadata->set_filesystem_block_size_bytes(1); + metadata->mutable_path_set()->CopyFrom(ps); + instance->SetMetadataForTests(metadata.Pass()); + instances.push_back(instance.release()); + i++; + } + + EXPECT_EQ(expected_status_string, + PathInstanceMetadataFile::CheckIntegrity(instances).ToString()); +} + +TEST_F(KuduTest, CheckIntegrity) { + vector uuids = { "fee", "fi", "fo", "fum" }; + RepeatedPtrField kAllUuids(uuids.begin(), uuids.end()); + + // Initialize path_sets to be fully consistent. 
+ vector path_sets(kAllUuids.size()); + for (int i = 0; i < path_sets.size(); i++) { + PathSetPB& ps = path_sets[i]; + ps.set_uuid(kAllUuids.Get(i)); + ps.mutable_all_uuids()->CopyFrom(kAllUuids); + } + + { + // Test consistent path sets. + EXPECT_NO_FATAL_FAILURE(RunCheckIntegrityTest(env_.get(), path_sets, "OK")); + } + { + // Test where two path sets claim the same UUID. + vector path_sets_copy(path_sets); + path_sets_copy[1].set_uuid(path_sets_copy[0].uuid()); + EXPECT_NO_FATAL_FAILURE(RunCheckIntegrityTest( + env_.get(), path_sets_copy, + "IO error: File 1 claimed uuid fee already claimed by file 0")); + } + { + // Test where the path sets have duplicate UUIDs. + vector path_sets_copy(path_sets); + for (PathSetPB& ps : path_sets_copy) { + ps.add_all_uuids("fee"); + } + EXPECT_NO_FATAL_FAILURE(RunCheckIntegrityTest( + env_.get(), path_sets_copy, + "IO error: File 0 has duplicate uuids: fee,fi,fo,fum,fee")); + } + { + // Test where a path set claims a UUID that isn't in all_uuids. + vector path_sets_copy(path_sets); + path_sets_copy[1].set_uuid("something_else"); + EXPECT_NO_FATAL_FAILURE(RunCheckIntegrityTest( + env_.get(), path_sets_copy, + "IO error: File 1 claimed uuid something_else which is not in " + "all_uuids (fee,fi,fo,fum)")); + } + { + // Test where a path set claims a different all_uuids. + vector path_sets_copy(path_sets); + path_sets_copy[1].add_all_uuids("another_uuid"); + EXPECT_NO_FATAL_FAILURE(RunCheckIntegrityTest( + env_.get(), path_sets_copy, + "IO error: File 1 claimed all_uuids fee,fi,fo,fum,another_uuid but " + "file 0 claimed all_uuids fee,fi,fo,fum")); + } +} + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/block_manager_util.cc b/src/kudu/fs/block_manager_util.cc new file mode 100644 index 000000000000..79c13b5655b3 --- /dev/null +++ b/src/kudu/fs/block_manager_util.cc @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/fs/block_manager_util.h" + +#include +#include +#include + +#include + +#include "kudu/fs/fs.pb.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/env.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" + +DECLARE_bool(enable_data_block_fsync); + +namespace kudu { +namespace fs { + +using std::set; +using std::string; +using std::unordered_map; +using std::vector; +using strings::Substitute; + +PathInstanceMetadataFile::PathInstanceMetadataFile(Env* env, + string block_manager_type, + string filename) + : env_(env), + block_manager_type_(std::move(block_manager_type)), + filename_(std::move(filename)) {} + +PathInstanceMetadataFile::~PathInstanceMetadataFile() { + if (lock_) { + WARN_NOT_OK(Unlock(), Substitute("Failed to unlock file $0", filename_)); + } +} + +Status PathInstanceMetadataFile::Create(const string& uuid, const vector& all_uuids) { + DCHECK(!lock_) << + "Creating a metadata file that's already locked would release the lock"; + DCHECK(ContainsKey(set(all_uuids.begin(), all_uuids.end()), uuid)); + + uint64_t block_size; + RETURN_NOT_OK(env_->GetBlockSize(DirName(filename_), &block_size)); + + PathInstanceMetadataPB new_instance; + + // Set up 
the path set. + PathSetPB* new_path_set = new_instance.mutable_path_set(); + new_path_set->set_uuid(uuid); + new_path_set->mutable_all_uuids()->Reserve(all_uuids.size()); + for (const string& u : all_uuids) { + new_path_set->add_all_uuids(u); + } + + // And the rest of the metadata. + new_instance.set_block_manager_type(block_manager_type_); + new_instance.set_filesystem_block_size_bytes(block_size); + + return pb_util::WritePBContainerToPath( + env_, filename_, new_instance, + pb_util::NO_OVERWRITE, + FLAGS_enable_data_block_fsync ? pb_util::SYNC : pb_util::NO_SYNC); +} + +Status PathInstanceMetadataFile::LoadFromDisk() { + DCHECK(!lock_) << + "Opening a metadata file that's already locked would release the lock"; + + gscoped_ptr pb(new PathInstanceMetadataPB()); + RETURN_NOT_OK(pb_util::ReadPBContainerFromPath(env_, filename_, pb.get())); + + if (pb->block_manager_type() != block_manager_type_) { + return Status::IOError("Wrong block manager type", pb->block_manager_type()); + } + + uint64_t block_size; + RETURN_NOT_OK(env_->GetBlockSize(filename_, &block_size)); + if (pb->filesystem_block_size_bytes() != block_size) { + return Status::IOError("Wrong filesystem block size", Substitute( + "Expected $0 but was $1", pb->filesystem_block_size_bytes(), block_size)); + } + + metadata_.swap(pb); + return Status::OK(); +} + +Status PathInstanceMetadataFile::Lock() { + DCHECK(!lock_); + + FileLock* lock; + RETURN_NOT_OK_PREPEND(env_->LockFile(filename_, &lock), + Substitute("Could not lock $0", filename_)); + lock_.reset(lock); + return Status::OK(); +} + +Status PathInstanceMetadataFile::Unlock() { + DCHECK(lock_); + + RETURN_NOT_OK_PREPEND(env_->UnlockFile(lock_.release()), + Substitute("Could not unlock $0", filename_)); + return Status::OK(); +} + +Status PathInstanceMetadataFile::CheckIntegrity( + const vector& instances) { + unordered_map uuids; + pair first_all_uuids; + + for (PathInstanceMetadataFile* instance : instances) { + const PathSetPB& path_set = 
instance->metadata()->path_set(); + + // Check that this instance's UUID wasn't already claimed. + PathInstanceMetadataFile** other = + InsertOrReturnExisting(&uuids, path_set.uuid(), instance); + if (other) { + return Status::IOError(Substitute( + "File $0 claimed uuid $1 already claimed by file $2", + instance->filename_, path_set.uuid(), (*other)->filename_)); + } + + // Check that there are no duplicate UUIDs in all_uuids. + set deduplicated_uuids(path_set.all_uuids().begin(), + path_set.all_uuids().end()); + string all_uuids_str = JoinStrings(path_set.all_uuids(), ","); + if (deduplicated_uuids.size() != path_set.all_uuids_size()) { + return Status::IOError(Substitute( + "File $0 has duplicate uuids: $1", + instance->filename_, all_uuids_str)); + } + + // Check that this instance's UUID is a member of all_uuids. + if (!ContainsKey(deduplicated_uuids, path_set.uuid())) { + return Status::IOError(Substitute( + "File $0 claimed uuid $1 which is not in all_uuids ($2)", + instance->filename_, path_set.uuid(), all_uuids_str)); + } + + // Check that every all_uuids is the same. + if (first_all_uuids.first.empty()) { + first_all_uuids.first = all_uuids_str; + first_all_uuids.second = instance; + } else if (first_all_uuids.first != all_uuids_str) { + return Status::IOError(Substitute( + "File $0 claimed all_uuids $1 but file $2 claimed all_uuids $3", + instance->filename_, all_uuids_str, + first_all_uuids.second->filename_, first_all_uuids.first)); + } + } + + return Status::OK(); +} + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/block_manager_util.h b/src/kudu/fs/block_manager_util.h new file mode 100644 index 000000000000..e1136b74dd36 --- /dev/null +++ b/src/kudu/fs/block_manager_util.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_FS_BLOCK_MANAGER_UTIL_H +#define KUDU_FS_BLOCK_MANAGER_UTIL_H + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/path_util.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Env; +class FileLock; +class PathInstanceMetadataPB; + +namespace fs { + +// Reads and writes block manager instance metadata files. +// +// Thread-unsafe; access to this object must be externally synchronized. +class PathInstanceMetadataFile { + public: + // 'env' must remain valid for the lifetime of this class. + PathInstanceMetadataFile(Env* env, std::string block_manager_type, + std::string filename); + + ~PathInstanceMetadataFile(); + + // Creates, writes, synchronizes, and closes a new instance metadata file. + // + // 'uuid' is this instance's UUID, and 'all_uuids' is all of the UUIDs in + // this instance's path set. + Status Create(const std::string& uuid, + const std::vector& all_uuids); + + // Opens, reads, verifies, and closes an existing instance metadata file. + // + // On success, 'metadata_' is overwritten with the contents of the file. + Status LoadFromDisk(); + + // Locks the instance metadata file, which must exist on-disk. Returns an + // error if it's already locked. The lock is released when Unlock() is + // called, when this object is destroyed, or when the process exits. 
+ // + // Note: the lock is also released if any fd of the instance metadata file + // in this process is closed. Thus, it is an error to call Create() or + // LoadFromDisk() on a locked file. + Status Lock(); + + // Unlocks the instance metadata file. Must have been locked to begin with. + Status Unlock(); + + void SetMetadataForTests(gscoped_ptr metadata) { + metadata_ = metadata.Pass(); + } + + std::string path() const { return DirName(filename_); } + PathInstanceMetadataPB* const metadata() const { return metadata_.get(); } + + // Check the integrity of the provided instances' path sets. + static Status CheckIntegrity(const std::vector& instances); + + private: + Env* env_; + const std::string block_manager_type_; + const std::string filename_; + gscoped_ptr metadata_; + gscoped_ptr lock_; +}; + +} // namespace fs +} // namespace kudu +#endif diff --git a/src/kudu/fs/file_block_manager.cc b/src/kudu/fs/file_block_manager.cc new file mode 100644 index 000000000000..677a9af66174 --- /dev/null +++ b/src/kudu/fs/file_block_manager.cc @@ -0,0 +1,764 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/fs/file_block_manager.h" + +#include +#include +#include +#include + +#include "kudu/fs/block_manager_metrics.h" +#include "kudu/fs/block_manager_util.h" +#include "kudu/fs/fs.pb.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/atomic.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/malloc.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/oid_generator.h" +#include "kudu/util/path_util.h" +#include "kudu/util/random_util.h" +#include "kudu/util/status.h" + +using kudu::env_util::ScopedFileDeleter; +using std::shared_ptr; +using std::string; +using std::unordered_set; +using std::vector; +using strings::Substitute; + +DECLARE_bool(enable_data_block_fsync); +DECLARE_bool(block_manager_lock_dirs); + +namespace kudu { +namespace fs { + +namespace internal { + +//////////////////////////////////////////////////////////// +// FileBlockLocation +//////////////////////////////////////////////////////////// + +// Logical location of a block in the file block manager. +// +// A block ID uniquely locates a block. Every ID is a uint64_t, broken down +// into multiple logical components: +// 1. Bytes 0 (MSB) and 1 identify the block's root path by path set index. See +// fs.proto for more details on path sets. +// 2. Bytes 2-7 (LSB) uniquely identify the block within the root path. As more +// and more blocks are created in a root path, the likelihood of a collision +// becomes greater. In the event of a collision, the block manager will +// retry(see CreateBlock()). +// +// A FileBlockLocation abstracts away these details so that clients need not +// worry about them. It is constructed via FromParts() or FromBlockId() and is +// copyable and assignable. 
+class FileBlockLocation { + public: + // Empty constructor + FileBlockLocation() { + } + + // Construct a location from its constituent parts. + static FileBlockLocation FromParts(const string& root_path, + uint16_t root_path_idx, + const BlockId& block_id); + + // Construct a location from a full block ID. + static FileBlockLocation FromBlockId(const string& root_path, + const BlockId& block_id); + + // Get the root path index of a given block ID. + static uint16_t GetRootPathIdx(const BlockId& block_id) { + return block_id.id() >> 48; + } + + // Returns the full filesystem path for this location. + string GetFullPath() const; + + // Create all subdirectories needed for this location. + // + // On success, 'created_dirs' contains the directories that were actually + // created (as opposed to those that were reused). + Status CreateBlockDir(Env* env, vector* created_dirs); + + // Writes all parent directories that are part of this location to + // 'parent_dirs'. + // + // The directories are written in "fsync order"; that is, the order in + // which they should be fsynced to make them durable. + void GetAllParentDirs(vector* parent_dirs) const; + + // Simple accessors. + const string& root_path() const { return root_path_; } + const BlockId& block_id() const { return block_id_; } + + private: + FileBlockLocation(string root_path, BlockId block_id) + : root_path_(std::move(root_path)), block_id_(std::move(block_id)) {} + + // These per-byte accessors yield subdirectories in which blocks are grouped. 
+ string byte2() const { + return StringPrintf("%02llx", + (block_id_.id() & 0x0000FF0000000000ULL) >> 40); + } + string byte3() const { + return StringPrintf("%02llx", + (block_id_.id() & 0x000000FF00000000ULL) >> 32); + } + string byte4() const { + return StringPrintf("%02llx", + (block_id_.id() & 0x00000000FF000000ULL) >> 24); + } + + string root_path_; + BlockId block_id_; +}; + +FileBlockLocation FileBlockLocation::FromParts(const string& root_path, + uint16_t root_path_idx, + const BlockId& block_id) { + // The combined ID consists of 'root_path_idx' (top 2 bytes) and 'block_id' + // (bottom 6 bytes). The top 2 bytes of 'block_id' are dropped. + uint64_t combined_id = static_cast(root_path_idx) << 48; + combined_id |= block_id.id() & ((1ULL << 48) - 1); + return FileBlockLocation(root_path, BlockId(combined_id)); +} + +FileBlockLocation FileBlockLocation::FromBlockId(const string& root_path, + const BlockId& block_id) { + return FileBlockLocation(root_path, block_id); +} + +string FileBlockLocation::GetFullPath() const { + string p = root_path_; + p = JoinPathSegments(p, byte2()); + p = JoinPathSegments(p, byte3()); + p = JoinPathSegments(p, byte4()); + p = JoinPathSegments(p, block_id_.ToString()); + return p; +} + +Status FileBlockLocation::CreateBlockDir(Env* env, + vector* created_dirs) { + DCHECK(env->FileExists(root_path_)); + + bool path0_created; + string path0 = JoinPathSegments(root_path_, byte2()); + RETURN_NOT_OK(env_util::CreateDirIfMissing(env, path0, &path0_created)); + + bool path1_created; + string path1 = JoinPathSegments(path0, byte3()); + RETURN_NOT_OK(env_util::CreateDirIfMissing(env, path1, &path1_created)); + + bool path2_created; + string path2 = JoinPathSegments(path1, byte4()); + RETURN_NOT_OK(env_util::CreateDirIfMissing(env, path2, &path2_created)); + + if (path2_created) { + created_dirs->push_back(path1); + } + if (path1_created) { + created_dirs->push_back(path0); + } + if (path0_created) { + created_dirs->push_back(root_path_); 
+ } + return Status::OK(); +} + +void FileBlockLocation::GetAllParentDirs(vector* parent_dirs) const { + string path0 = JoinPathSegments(root_path_, byte2()); + string path1 = JoinPathSegments(path0, byte3()); + string path2 = JoinPathSegments(path1, byte4()); + + // This is the order in which the parent directories should be + // synchronized to disk. + parent_dirs->push_back(path2); + parent_dirs->push_back(path1); + parent_dirs->push_back(path0); + parent_dirs->push_back(root_path_); +} + +//////////////////////////////////////////////////////////// +// FileWritableBlock +//////////////////////////////////////////////////////////// + +// A file-backed block that has been opened for writing. +// +// Contains a pointer to the block manager as well as a FileBlockLocation +// so that dirty metadata can be synced via BlockManager::SyncMetadata() +// at Close() time. Embedding a FileBlockLocation (and not a simpler +// BlockId) consumes more memory, but the number of outstanding +// FileWritableBlock instances is expected to be low. +class FileWritableBlock : public WritableBlock { + public: + FileWritableBlock(FileBlockManager* block_manager, FileBlockLocation location, + shared_ptr writer); + + virtual ~FileWritableBlock(); + + virtual Status Close() OVERRIDE; + + virtual Status Abort() OVERRIDE; + + virtual BlockManager* block_manager() const OVERRIDE; + + virtual const BlockId& id() const OVERRIDE; + + virtual Status Append(const Slice& data) OVERRIDE; + + virtual Status FlushDataAsync() OVERRIDE; + + virtual size_t BytesAppended() const OVERRIDE; + + virtual State state() const OVERRIDE; + + private: + enum SyncMode { + SYNC, + NO_SYNC + }; + + // Close the block, optionally synchronizing dirty data and metadata. + Status Close(SyncMode mode); + + // Back pointer to the block manager. + // + // Should remain alive for the lifetime of this block. + FileBlockManager* block_manager_; + + // The block's location. 
+ const FileBlockLocation location_; + + // The underlying opened file backing this block. + shared_ptr writer_; + + State state_; + + // The number of bytes successfully appended to the block. + size_t bytes_appended_; + + DISALLOW_COPY_AND_ASSIGN(FileWritableBlock); +}; + +FileWritableBlock::FileWritableBlock(FileBlockManager* block_manager, + FileBlockLocation location, + shared_ptr writer) + : block_manager_(block_manager), + location_(std::move(location)), + writer_(std::move(writer)), + state_(CLEAN), + bytes_appended_(0) { + if (block_manager_->metrics_) { + block_manager_->metrics_->blocks_open_writing->Increment(); + block_manager_->metrics_->total_writable_blocks->Increment(); + } +} + +FileWritableBlock::~FileWritableBlock() { + if (state_ != CLOSED) { + WARN_NOT_OK(Abort(), Substitute("Failed to close block $0", + id().ToString())); + } +} + +Status FileWritableBlock::Close() { + return Close(SYNC); +} + +Status FileWritableBlock::Abort() { + RETURN_NOT_OK(Close(NO_SYNC)); + return block_manager()->DeleteBlock(id()); +} + +BlockManager* FileWritableBlock::block_manager() const { + return block_manager_; +} + +const BlockId& FileWritableBlock::id() const { + return location_.block_id(); +} + +Status FileWritableBlock::Append(const Slice& data) { + DCHECK(state_ == CLEAN || state_ == DIRTY) + << "Invalid state: " << state_; + + RETURN_NOT_OK(writer_->Append(data)); + state_ = DIRTY; + bytes_appended_ += data.size(); + return Status::OK(); +} + +Status FileWritableBlock::FlushDataAsync() { + DCHECK(state_ == CLEAN || state_ == DIRTY || state_ == FLUSHING) + << "Invalid state: " << state_; + if (state_ == DIRTY) { + VLOG(3) << "Flushing block " << id(); + RETURN_NOT_OK(writer_->Flush(WritableFile::FLUSH_ASYNC)); + } + + state_ = FLUSHING; + return Status::OK(); +} + +size_t FileWritableBlock::BytesAppended() const { + return bytes_appended_; +} + +WritableBlock::State FileWritableBlock::state() const { + return state_; +} + +Status 
FileWritableBlock::Close(SyncMode mode) { + if (state_ == CLOSED) { + return Status::OK(); + } + + Status sync; + if (mode == SYNC && + (state_ == CLEAN || state_ == DIRTY || state_ == FLUSHING)) { + // Safer to synchronize data first, then metadata. + VLOG(3) << "Syncing block " << id(); + if (FLAGS_enable_data_block_fsync) { + sync = writer_->Sync(); + } + if (sync.ok()) { + sync = block_manager_->SyncMetadata(location_); + } + WARN_NOT_OK(sync, Substitute("Failed to sync when closing block $0", + id().ToString())); + } + Status close = writer_->Close(); + + state_ = CLOSED; + writer_.reset(); + if (block_manager_->metrics_) { + block_manager_->metrics_->blocks_open_writing->Decrement(); + block_manager_->metrics_->total_bytes_written->IncrementBy(BytesAppended()); + } + + // Prefer the result of Close() to that of Sync(). + return !close.ok() ? close : sync; +} + +//////////////////////////////////////////////////////////// +// FileReadableBlock +//////////////////////////////////////////////////////////// + +// A file-backed block that has been opened for reading. +// +// There may be millions of instances of FileReadableBlock outstanding, so +// great care must be taken to reduce its size. To that end, it does _not_ +// embed a FileBlockLocation, using the simpler BlockId instead. +class FileReadableBlock : public ReadableBlock { + public: + FileReadableBlock(const FileBlockManager* block_manager, BlockId block_id, + shared_ptr reader); + + virtual ~FileReadableBlock(); + + virtual Status Close() OVERRIDE; + + virtual const BlockId& id() const OVERRIDE; + + virtual Status Size(uint64_t* sz) const OVERRIDE; + + virtual Status Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const OVERRIDE; + + virtual size_t memory_footprint() const OVERRIDE; + + private: + // Back pointer to the owning block manager. + const FileBlockManager* block_manager_; + + // The block's identifier. 
+ const BlockId block_id_; + + // The underlying opened file backing this block. + shared_ptr reader_; + + // Whether or not this block has been closed. Close() is thread-safe, so + // this must be an atomic primitive. + AtomicBool closed_; + + DISALLOW_COPY_AND_ASSIGN(FileReadableBlock); +}; + +FileReadableBlock::FileReadableBlock(const FileBlockManager* block_manager, + BlockId block_id, + shared_ptr reader) + : block_manager_(block_manager), + block_id_(std::move(block_id)), + reader_(std::move(reader)), + closed_(false) { + if (block_manager_->metrics_) { + block_manager_->metrics_->blocks_open_reading->Increment(); + block_manager_->metrics_->total_readable_blocks->Increment(); + } +} + +FileReadableBlock::~FileReadableBlock() { + WARN_NOT_OK(Close(), Substitute("Failed to close block $0", + id().ToString())); +} + +Status FileReadableBlock::Close() { + if (closed_.CompareAndSet(false, true)) { + reader_.reset(); + if (block_manager_->metrics_) { + block_manager_->metrics_->blocks_open_reading->Decrement(); + } + } + + return Status::OK(); +} + +const BlockId& FileReadableBlock::id() const { + return block_id_; +} + +Status FileReadableBlock::Size(uint64_t* sz) const { + DCHECK(!closed_.Load()); + + return reader_->Size(sz); +} + +Status FileReadableBlock::Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const { + DCHECK(!closed_.Load()); + + RETURN_NOT_OK(env_util::ReadFully(reader_.get(), offset, length, result, scratch)); + if (block_manager_->metrics_) { + block_manager_->metrics_->total_bytes_read->IncrementBy(length); + } + + return Status::OK(); +} + +size_t FileReadableBlock::memory_footprint() const { + DCHECK(reader_); + return kudu_malloc_usable_size(this) + reader_->memory_footprint(); +} + +} // namespace internal + +//////////////////////////////////////////////////////////// +// FileBlockManager +//////////////////////////////////////////////////////////// + +static const char* kBlockManagerType = "file"; +static const int 
kMaxPaths = (1 << 16) - 1; + +Status FileBlockManager::SyncMetadata(const internal::FileBlockLocation& location) { + vector parent_dirs; + location.GetAllParentDirs(&parent_dirs); + + // Figure out what directories to sync. + vector to_sync; + { + lock_guard l(&lock_); + for (const string& parent_dir : parent_dirs) { + if (dirty_dirs_.erase(parent_dir)) { + to_sync.push_back(parent_dir); + } + } + } + + // Sync them. + if (FLAGS_enable_data_block_fsync) { + for (const string& s : to_sync) { + RETURN_NOT_OK(env_->SyncDir(s)); + } + } + return Status::OK(); +} + +bool FileBlockManager::FindBlockPath(const BlockId& block_id, + string* path) const { + PathInstanceMetadataFile* metadata_file = FindPtrOrNull( + root_paths_by_idx_, internal::FileBlockLocation::GetRootPathIdx(block_id)); + if (metadata_file) { + *path = internal::FileBlockLocation::FromBlockId( + metadata_file->path(), block_id).GetFullPath(); + } + return metadata_file != nullptr; +} + +FileBlockManager::FileBlockManager(Env* env, const BlockManagerOptions& opts) + : env_(DCHECK_NOTNULL(env)), + read_only_(opts.read_only), + root_paths_(opts.root_paths), + rand_(GetRandomSeed32()), + mem_tracker_(MemTracker::CreateTracker(-1, + "file_block_manager", + opts.parent_mem_tracker)) { + DCHECK_GT(root_paths_.size(), 0); + if (opts.metric_entity) { + metrics_.reset(new internal::BlockManagerMetrics(opts.metric_entity)); + } +} + +FileBlockManager::~FileBlockManager() { + STLDeleteValues(&root_paths_by_idx_); + mem_tracker_->UnregisterFromParent(); +} + +Status FileBlockManager::Create() { + CHECK(!read_only_); + + deque delete_on_failure; + ElementDeleter d(&delete_on_failure); + + if (root_paths_.size() > kMaxPaths) { + return Status::NotSupported( + Substitute("File block manager supports a maximum of $0 paths", kMaxPaths)); + } + + // The UUIDs and indices will be included in every instance file. 
+ ObjectIdGenerator oid_generator; + vector all_uuids(root_paths_.size()); + for (string& u : all_uuids) { + u = oid_generator.Next(); + } + int idx = 0; + + // Ensure the data paths exist and create the instance files. + unordered_set to_sync; + for (const string& root_path : root_paths_) { + bool created; + RETURN_NOT_OK_PREPEND(env_util::CreateDirIfMissing(env_, root_path, &created), + Substitute("Could not create directory $0", root_path)); + if (created) { + delete_on_failure.push_front(new ScopedFileDeleter(env_, root_path)); + to_sync.insert(DirName(root_path)); + } + + string instance_filename = JoinPathSegments( + root_path, kInstanceMetadataFileName); + PathInstanceMetadataFile metadata(env_, kBlockManagerType, + instance_filename); + RETURN_NOT_OK_PREPEND(metadata.Create(all_uuids[idx], all_uuids), + Substitute("Could not create $0", instance_filename)); + delete_on_failure.push_front(new ScopedFileDeleter(env_, instance_filename)); + idx++; + } + + // Ensure newly created directories are synchronized to disk. + if (FLAGS_enable_data_block_fsync) { + for (const string& dir : to_sync) { + RETURN_NOT_OK_PREPEND(env_->SyncDir(dir), + Substitute("Unable to synchronize directory $0", dir)); + } + } + + // Success: don't delete any files. 
+ for (ScopedFileDeleter* deleter : delete_on_failure) { + deleter->Cancel(); + } + return Status::OK(); +} + +Status FileBlockManager::Open() { + vector instances; + ElementDeleter deleter(&instances); + instances.reserve(root_paths_.size()); + + for (const string& root_path : root_paths_) { + if (!env_->FileExists(root_path)) { + return Status::NotFound(Substitute( + "FileBlockManager at $0 not found", root_path)); + } + string instance_filename = JoinPathSegments( + root_path, kInstanceMetadataFileName); + gscoped_ptr metadata( + new PathInstanceMetadataFile(env_, kBlockManagerType, + instance_filename)); + RETURN_NOT_OK_PREPEND(metadata->LoadFromDisk(), + Substitute("Could not open $0", instance_filename)); + if (FLAGS_block_manager_lock_dirs) { + Status s = metadata->Lock(); + if (!s.ok()) { + Status new_status = s.CloneAndPrepend(Substitute( + "Could not lock $0", instance_filename)); + if (read_only_) { + // Not fatal in read-only mode. + LOG(WARNING) << new_status.ToString(); + LOG(WARNING) << "Proceeding without lock"; + } else { + return new_status; + } + } + } + + instances.push_back(metadata.release()); + } + + RETURN_NOT_OK_PREPEND(PathInstanceMetadataFile::CheckIntegrity(instances), + Substitute("Could not verify integrity of files: $0", + JoinStrings(root_paths_, ","))); + + PathMap instances_by_idx; + for (PathInstanceMetadataFile* instance : instances) { + const PathSetPB& path_set = instance->metadata()->path_set(); + uint32_t idx = -1; + for (int i = 0; i < path_set.all_uuids_size(); i++) { + if (path_set.uuid() == path_set.all_uuids(i)) { + idx = i; + break; + } + } + DCHECK_NE(idx, -1); // Guaranteed by CheckIntegrity(). 
+ if (idx > kMaxPaths) { + return Status::NotSupported( + Substitute("File block manager supports a maximum of $0 paths", kMaxPaths)); + } + InsertOrDie(&instances_by_idx, idx, instance); + } + instances.clear(); + instances_by_idx.swap(root_paths_by_idx_); + next_root_path_ = root_paths_by_idx_.begin(); + return Status::OK(); +} + +Status FileBlockManager::CreateBlock(const CreateBlockOptions& opts, + gscoped_ptr* block) { + CHECK(!read_only_); + + // Pick a root path using a simple round-robin block placement strategy. + uint16_t root_path_idx; + string root_path; + { + lock_guard l(&lock_); + root_path_idx = next_root_path_->first; + root_path = next_root_path_->second->path(); + next_root_path_++; + if (next_root_path_ == root_paths_by_idx_.end()) { + next_root_path_ = root_paths_by_idx_.begin(); + } + } + + string path; + vector created_dirs; + Status s; + internal::FileBlockLocation location; + shared_ptr writer; + + // Repeat in case of block id collisions (unlikely). + do { + created_dirs.clear(); + + // Make sure we don't accidentally create a location using the magic + // invalid ID value. + BlockId id; + do { + id.SetId(rand_.Next64()); + } while (id.IsNull()); + + location = internal::FileBlockLocation::FromParts( + root_path, root_path_idx, id); + path = location.GetFullPath(); + RETURN_NOT_OK_PREPEND(location.CreateBlockDir(env_, &created_dirs), path); + WritableFileOptions wr_opts; + wr_opts.mode = Env::CREATE_NON_EXISTING; + s = env_util::OpenFileForWrite(wr_opts, env_, path, &writer); + } while (PREDICT_FALSE(s.IsAlreadyPresent())); + if (s.ok()) { + VLOG(1) << "Creating new block " << location.block_id().ToString() << " at " << path; + { + // Update dirty_dirs_ with those provided as well as the block's + // directory, which may not have been created but is definitely dirty + // (because we added a file to it). 
+ lock_guard l(&lock_); + for (const string& created : created_dirs) { + dirty_dirs_.insert(created); + } + dirty_dirs_.insert(DirName(path)); + } + block->reset(new internal::FileWritableBlock(this, location, writer)); + } + return s; +} + +Status FileBlockManager::CreateBlock(gscoped_ptr* block) { + return CreateBlock(CreateBlockOptions(), block); +} + +Status FileBlockManager::OpenBlock(const BlockId& block_id, + gscoped_ptr* block) { + string path; + if (!FindBlockPath(block_id, &path)) { + return Status::NotFound( + Substitute("Block $0 not found", block_id.ToString())); + } + + VLOG(1) << "Opening block with id " << block_id.ToString() << " at " << path; + + shared_ptr reader; + RETURN_NOT_OK(env_util::OpenFileForRandom(env_, path, &reader)); + block->reset(new internal::FileReadableBlock(this, block_id, reader)); + return Status::OK(); +} + +Status FileBlockManager::DeleteBlock(const BlockId& block_id) { + CHECK(!read_only_); + + string path; + if (!FindBlockPath(block_id, &path)) { + return Status::NotFound( + Substitute("Block $0 not found", block_id.ToString())); + } + RETURN_NOT_OK(env_->DeleteFile(path)); + + // We don't bother fsyncing the parent directory as there's nothing to be + // gained by ensuring that the deletion is made durable. Even if we did + // fsync it, we'd need to account for garbage at startup time (in the + // event that we crashed just before the fsync), and with such accounting + // fsync-as-you-delete is unnecessary. + // + // The block's directory hierarchy is left behind. We could prune it if + // it's empty, but that's racy and leaving it isn't much overhead. + + return Status::OK(); +} + +Status FileBlockManager::CloseBlocks(const vector& blocks) { + VLOG(3) << "Closing " << blocks.size() << " blocks"; + if (FLAGS_block_coalesce_close) { + // Ask the kernel to begin writing out each block's dirty data. This is + // done up-front to give the kernel opportunities to coalesce contiguous + // dirty pages. 
+ for (WritableBlock* block : blocks) { + RETURN_NOT_OK(block->FlushDataAsync()); + } + } + + // Now close each block, waiting for each to become durable. + for (WritableBlock* block : blocks) { + RETURN_NOT_OK(block->Close()); + } + return Status::OK(); +} + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/file_block_manager.h b/src/kudu/fs/file_block_manager.h new file mode 100644 index 000000000000..4d4a90db6978 --- /dev/null +++ b/src/kudu/fs/file_block_manager.h @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_FS_FILE_BLOCK_MANAGER_H +#define KUDU_FS_FILE_BLOCK_MANAGER_H + +#include +#include +#include +#include +#include + +#include "kudu/fs/block_id.h" +#include "kudu/fs/block_manager.h" +#include "kudu/util/locks.h" +#include "kudu/util/random.h" + +namespace kudu { + +class Env; +class MemTracker; +class MetricEntity; +class WritableFile; + +namespace fs { +class PathInstanceMetadataFile; + +namespace internal { +class FileBlockLocation; +class FileReadableBlock; +class FileWritableBlock; + +struct BlockManagerMetrics; +} // namespace internal + +// A file-backed block storage implementation. 
+// +// This is a naive block implementation which maps each block to its own +// file on disk. To prevent the block directory from becoming too large, +// blocks are aggregated into a 3-level directory hierarchy. +// +// The block manager can take advantage of multiple filesystem paths. A block +// written to a given path will be assigned an ID that includes enough +// information to uniquely identify the path's underlying disk. The ID is +// resolved back into a filesystem path when the block is opened for reading. +// The structure of this ID limits the block manager to at most 65,536 disks. +// +// When creating blocks, the block manager will round robin through the +// available filesystem paths. +// +// TODO: Support path-based block placement hints. + +// The file-backed block manager. +class FileBlockManager : public BlockManager { + public: + + // Creates a new in-memory instance of a FileBlockManager. + // + // 'env' should remain alive for the lifetime of the block manager. + FileBlockManager(Env* env, const BlockManagerOptions& opts); + + virtual ~FileBlockManager(); + + virtual Status Create() OVERRIDE; + + virtual Status Open() OVERRIDE; + + virtual Status CreateBlock(const CreateBlockOptions& opts, + gscoped_ptr* block) OVERRIDE; + + virtual Status CreateBlock(gscoped_ptr* block) OVERRIDE; + + virtual Status OpenBlock(const BlockId& block_id, + gscoped_ptr* block) OVERRIDE; + + virtual Status DeleteBlock(const BlockId& block_id) OVERRIDE; + + virtual Status CloseBlocks(const std::vector& blocks) OVERRIDE; + + private: + friend class internal::FileBlockLocation; + friend class internal::FileReadableBlock; + friend class internal::FileWritableBlock; + + // Synchronizes the metadata for a block with the given id. + Status SyncMetadata(const internal::FileBlockLocation& block_id); + + // Looks up the path of the file backing a particular block ID. + // + // On success, overwrites 'path' with the file's path. 
+ bool FindBlockPath(const BlockId& block_id, + std::string* root_path) const; + + Env* env() const { return env_; } + + // For manipulating files. + Env* env_; + + // If true, only read operations are allowed. + const bool read_only_; + + // Filesystem paths where all block directories are found. + const std::vector root_paths_; + + // Maps path indices to their instance files. + // + // There's no need to synchronize access to the map as it is only written + // to during Create() and Open(); all subsequent accesses are reads. + typedef std::map PathMap; + PathMap root_paths_by_idx_; + + // For generating block IDs. + ThreadSafeRandom rand_; + + // Protects 'dirty_dirs_' and 'next_root_path_'. + mutable simple_spinlock lock_; + + // Tracks the block directories which are dirty from block creation. This + // lets us perform some simple coalescing when synchronizing metadata. + std::unordered_set dirty_dirs_; + + // Points to the filesystem path to be used when creating the next block. + PathMap::iterator next_root_path_; + + // Metric container for the block manager. + // May be null if instantiated without metrics. + gscoped_ptr metrics_; + + // Tracks memory consumption of any allocations numerous enough to be + // interesting. + std::shared_ptr mem_tracker_; + + DISALLOW_COPY_AND_ASSIGN(FileBlockManager); +}; + +} // namespace fs +} // namespace kudu + +#endif diff --git a/src/kudu/fs/fs-test-util.h b/src/kudu/fs/fs-test-util.h new file mode 100644 index 000000000000..15c702c7c3d8 --- /dev/null +++ b/src/kudu/fs/fs-test-util.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_FS_FS_TEST_UTIL_H +#define KUDU_FS_FS_TEST_UTIL_H + +#include "kudu/fs/block_manager.h" +#include "kudu/util/malloc.h" + +namespace kudu { +namespace fs { + +// ReadableBlock that counts the total number of bytes read. +// +// The counter is kept separate from the class itself because +// ReadableBlocks are often wholly owned by other objects, preventing tests +// from easily snooping on the counter's value. +// +// Sample usage: +// +// gscoped_ptr block; +// fs_manager->OpenBlock("some block id", &block); +// size_t bytes_read = 0; +// gscoped_ptr tr_block(new CountingReadableBlock(block.Pass(), &bytes_read)); +// tr_block->Read(0, 100, ...); +// tr_block->Read(0, 200, ...); +// ASSERT_EQ(300, bytes_read); +// +class CountingReadableBlock : public ReadableBlock { + public: + CountingReadableBlock(gscoped_ptr block, size_t* bytes_read) + : block_(block.Pass()), + bytes_read_(bytes_read) { + } + + virtual const BlockId& id() const OVERRIDE { + return block_->id(); + } + + virtual Status Close() OVERRIDE { + return block_->Close(); + } + + virtual Status Size(uint64_t* sz) const OVERRIDE { + return block_->Size(sz); + } + + virtual Status Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const OVERRIDE { + RETURN_NOT_OK(block_->Read(offset, length, result, scratch)); + *bytes_read_ += length; + return Status::OK(); + } + + virtual size_t memory_footprint() const OVERRIDE { + return block_->memory_footprint(); + } + + private: + gscoped_ptr block_; + size_t* bytes_read_; +}; + +} // namespace fs +} // namespace 
kudu + +#endif diff --git a/src/kudu/fs/fs.proto b/src/kudu/fs/fs.proto new file mode 100644 index 000000000000..bf1cfca5e210 --- /dev/null +++ b/src/kudu/fs/fs.proto @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +option java_package = "org.kududb"; + +// ============================================================================ +// Local file system metadata +// ============================================================================ + +// When any server initializes a new filesystem (eg a new node is created in the +// cluster), it creates this structure and stores it persistently. +message InstanceMetadataPB { + + // The UUID which is assigned when the instance is first formatted. This uniquely + // identifies the node in the cluster. + required bytes uuid = 1; + + // Human-readable string indicating when and where the node was first + // initialized. + required string format_stamp = 2; + + // TODO: add a "node type" (TS/Master?) +} + +// Describes a collection of filesystem path instances and the membership of a +// particular instance in the collection. 
+// +// In a healthy filesystem (see below), a path instance can be referred to via +// its UUID's position in all_uuids instead of via the UUID itself. This is +// useful when there are many such references, as the position is much shorter +// than the UUID. +message PathSetPB { + // Globally unique identifier for this path instance. + required bytes uuid = 1; + + // All UUIDs in this path instance set. In a healthy filesystem: + // 1. There exists an on-disk PathInstanceMetadataPB for each listed UUID, and + // 2. Every PathSetPB contains an identical copy of all_uuids. + repeated bytes all_uuids = 2; +} + +// A filesystem instance can contain multiple paths. One of these structures +// is persisted in each path when the filesystem instance is created. +message PathInstanceMetadataPB { + // Describes this path instance as well as all of the other path instances + // that, taken together, describe a complete set. + required PathSetPB path_set = 1; + + // Textual representation of the block manager that formatted this path. + required string block_manager_type = 2; + + // Block size on the filesystem where this instance was created. If the + // instance (and its data) are ever copied to another location, the block + // size in that location must be the same. + required uint64 filesystem_block_size_bytes = 3; +} + +message BlockIdPB { + required fixed64 id = 1; +} + +// The kind of record. +enum BlockRecordType { + UNKNOWN = 0; + CREATE = 1; + DELETE = 2; +} + +// An element found in a container metadata file of the log-backed block +// storage implementation. +// +// Each one tracks the existence (creation) or non-existence (deletion) +// of a particular block. They are written sequentially, with subsequent +// messages taking precedence over earlier ones (e.g. "CREATE foo" followed +// by "DELETE foo" means that block foo does not exist). +message BlockRecordPB { + // The unique identifier of the block. 
+ required BlockIdPB block_id = 1; + + // Whether this is a CREATE or a DELETE. + required BlockRecordType op_type = 2; + + // The time at which this block record was created, expressed in terms of + // microseconds since the epoch. + required uint64 timestamp_us = 3; + + // The offset of the block in the container data file. + // + // Required for CREATE. + optional int64 offset = 4; + + // The length of the block in the container data file. + // + // Required for CREATE. + optional int64 length = 5; +} diff --git a/src/kudu/fs/fs_manager-test.cc b/src/kudu/fs/fs_manager-test.cc new file mode 100644 index 000000000000..922e617a2b24 --- /dev/null +++ b/src/kudu/fs/fs_manager-test.cc @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/fs/block_manager.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +using std::shared_ptr; + +namespace kudu { + +class FsManagerTestBase : public KuduTest { + public: + void SetUp() OVERRIDE { + KuduTest::SetUp(); + + // Initialize File-System Layout + ReinitFsManager(); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + } + + void ReinitFsManager() { + ReinitFsManager(GetTestPath("fs_root"), { GetTestPath("fs_root")} ); + } + + void ReinitFsManager(const string& wal_path, const vector& data_paths) { + // Blow away the old memtrackers first. + fs_manager_.reset(); + + FsManagerOpts opts; + opts.wal_path = wal_path; + opts.data_paths = data_paths; + fs_manager_.reset(new FsManager(env_.get(), opts)); + } + + void TestReadWriteDataFile(const Slice& data) { + uint8_t buffer[64]; + DCHECK_LT(data.size(), sizeof(buffer)); + + // Test Write + gscoped_ptr writer; + ASSERT_OK(fs_manager()->CreateNewBlock(&writer)); + ASSERT_OK(writer->Append(data)); + ASSERT_OK(writer->Close()); + + // Test Read + Slice result; + gscoped_ptr reader; + ASSERT_OK(fs_manager()->OpenBlock(writer->id(), &reader)); + ASSERT_OK(reader->Read(0, data.size(), &result, buffer)); + ASSERT_EQ(data.size(), result.size()); + ASSERT_EQ(0, result.compare(data)); + } + + FsManager *fs_manager() const { return fs_manager_.get(); } + + private: + gscoped_ptr fs_manager_; +}; + +TEST_F(FsManagerTestBase, TestBaseOperations) { + fs_manager()->DumpFileSystemTree(std::cout); + + TestReadWriteDataFile(Slice("test0")); + TestReadWriteDataFile(Slice("test1")); + + fs_manager()->DumpFileSystemTree(std::cout); +} + +TEST_F(FsManagerTestBase, TestIllegalPaths) { + vector illegal = { "", "asdf", "/foo\n\t" }; + for (const string& path : illegal) { + ReinitFsManager(path, { path }); + 
ASSERT_TRUE(fs_manager()->CreateInitialFileSystemLayout().IsIOError()); + } +} + +TEST_F(FsManagerTestBase, TestMultiplePaths) { + string wal_path = GetTestPath("a"); + vector data_paths = { GetTestPath("a"), GetTestPath("b"), GetTestPath("c") }; + ReinitFsManager(wal_path, data_paths); + ASSERT_OK(fs_manager()->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager()->Open()); +} + +TEST_F(FsManagerTestBase, TestMatchingPathsWithMismatchedSlashes) { + string wal_path = GetTestPath("foo"); + vector data_paths = { wal_path + "/" }; + ReinitFsManager(wal_path, data_paths); + ASSERT_OK(fs_manager()->CreateInitialFileSystemLayout()); +} + +TEST_F(FsManagerTestBase, TestDuplicatePaths) { + string path = GetTestPath("foo"); + ReinitFsManager(path, { path, path, path }); + ASSERT_OK(fs_manager()->CreateInitialFileSystemLayout()); + ASSERT_EQ(vector({ JoinPathSegments(path, fs_manager()->kDataDirName) }), + fs_manager()->GetDataRootDirs()); +} + +TEST_F(FsManagerTestBase, TestListTablets) { + vector tablet_ids; + ASSERT_OK(fs_manager()->ListTabletIds(&tablet_ids)); + ASSERT_EQ(0, tablet_ids.size()); + + string path = fs_manager()->GetTabletMetadataDir(); + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile( + JoinPathSegments(path, "foo.tmp"), &writer)); + ASSERT_OK(env_->NewWritableFile( + JoinPathSegments(path, "foo.tmp.abc123"), &writer)); + ASSERT_OK(env_->NewWritableFile( + JoinPathSegments(path, ".hidden"), &writer)); + ASSERT_OK(env_->NewWritableFile( + JoinPathSegments(path, "a_tablet_sort_of"), &writer)); + + ASSERT_OK(fs_manager()->ListTabletIds(&tablet_ids)); + ASSERT_EQ(1, tablet_ids.size()) << tablet_ids; +} + +TEST_F(FsManagerTestBase, TestCannotUseNonEmptyFsRoot) { + string path = GetTestPath("new_fs_root"); + ASSERT_OK(env_->CreateDir(path)); + { + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile( + JoinPathSegments(path, "some_file"), &writer)); + } + + // Try to create the FS layout. It should fail. 
+ ReinitFsManager(path, { path }); + ASSERT_TRUE(fs_manager()->CreateInitialFileSystemLayout().IsAlreadyPresent()); +} + +TEST_F(FsManagerTestBase, TestEmptyWALPath) { + ReinitFsManager("", vector()); + Status s = fs_manager()->CreateInitialFileSystemLayout(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STR_CONTAINS(s.ToString(), "directory (fs_wal_dir) not provided"); +} + +TEST_F(FsManagerTestBase, TestOnlyWALPath) { + string path = GetTestPath("new_fs_root"); + ASSERT_OK(env_->CreateDir(path)); + + ReinitFsManager(path, vector()); + ASSERT_OK(fs_manager()->CreateInitialFileSystemLayout()); + ASSERT_TRUE(HasPrefixString(fs_manager()->GetWalsRootDir(), path)); + ASSERT_TRUE(HasPrefixString(fs_manager()->GetConsensusMetadataDir(), path)); + ASSERT_TRUE(HasPrefixString(fs_manager()->GetTabletMetadataDir(), path)); + vector data_dirs = fs_manager()->GetDataRootDirs(); + ASSERT_EQ(1, data_dirs.size()); + ASSERT_TRUE(HasPrefixString(data_dirs[0], path)); +} + +} // namespace kudu diff --git a/src/kudu/fs/fs_manager.cc b/src/kudu/fs/fs_manager.cc new file mode 100644 index 000000000000..117389f4da22 --- /dev/null +++ b/src/kudu/fs/fs_manager.cc @@ -0,0 +1,511 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/fs/fs_manager.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "kudu/fs/block_id.h" +#include "kudu/fs/file_block_manager.h" +#include "kudu/fs/fs.pb.h" +#include "kudu/fs/log_block_manager.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/strtoint.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/env_util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/oid_generator.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" + +DEFINE_bool(enable_data_block_fsync, true, + "Whether to enable fsync() of data blocks, metadata, and their parent directories. " + "Disabling this flag may cause data loss in the event of a system crash."); +TAG_FLAG(enable_data_block_fsync, unsafe); + +#if defined(__linux__) +DEFINE_string(block_manager, "log", "Which block manager to use for storage. " + "Valid options are 'file' and 'log'."); +#else +DEFINE_string(block_manager, "file", "Which block manager to use for storage. " + "Only the file block manager is supported for non-Linux systems."); +#endif +TAG_FLAG(block_manager, advanced); + +DEFINE_string(fs_wal_dir, "", + "Directory with write-ahead logs. If this is not specified, the " + "program will not start. May be the same as fs_data_dirs"); +TAG_FLAG(fs_wal_dir, stable); +DEFINE_string(fs_data_dirs, "", + "Comma-separated list of directories with data blocks. 
If this " + "is not specified, fs_wal_dir will be used as the sole data " + "block directory."); +TAG_FLAG(fs_data_dirs, stable); + +using google::protobuf::Message; +using kudu::env_util::ScopedFileDeleter; +using kudu::fs::BlockManagerOptions; +using kudu::fs::CreateBlockOptions; +using kudu::fs::FileBlockManager; +using kudu::fs::LogBlockManager; +using kudu::fs::ReadableBlock; +using kudu::fs::WritableBlock; +using std::map; +using std::unordered_set; +using strings::Substitute; + +namespace kudu { + +// ========================================================================== +// FS Paths +// ========================================================================== +const char *FsManager::kWalDirName = "wals"; +const char *FsManager::kWalFileNamePrefix = "wal"; +const char *FsManager::kWalsRecoveryDirSuffix = ".recovery"; +const char *FsManager::kTabletMetadataDirName = "tablet-meta"; +const char *FsManager::kDataDirName = "data"; +const char *FsManager::kCorruptedSuffix = ".corrupted"; +const char *FsManager::kInstanceMetadataFileName = "instance"; +const char *FsManager::kConsensusMetadataDirName = "consensus-meta"; + +static const char* const kTmpInfix = ".tmp"; + +FsManagerOpts::FsManagerOpts() + : wal_path(FLAGS_fs_wal_dir), + read_only(false) { + data_paths = strings::Split(FLAGS_fs_data_dirs, ",", strings::SkipEmpty()); +} + +FsManagerOpts::~FsManagerOpts() { +} + +FsManager::FsManager(Env* env, const string& root_path) + : env_(DCHECK_NOTNULL(env)), + read_only_(false), + wal_fs_root_(root_path), + data_fs_roots_({ root_path }), + metric_entity_(nullptr), + initted_(false) { +} + +FsManager::FsManager(Env* env, + const FsManagerOpts& opts) + : env_(DCHECK_NOTNULL(env)), + read_only_(opts.read_only), + wal_fs_root_(opts.wal_path), + data_fs_roots_(opts.data_paths), + metric_entity_(opts.metric_entity), + parent_mem_tracker_(opts.parent_mem_tracker), + initted_(false) { +} + +FsManager::~FsManager() { +} + +Status FsManager::Init() { + if (initted_) { 
+ return Status::OK(); + } + + // The wal root must be set. + if (wal_fs_root_.empty()) { + return Status::IOError("Write-ahead log directory (fs_wal_dir) not provided"); + } + + // Deduplicate all of the roots. + set all_roots; + all_roots.insert(wal_fs_root_); + for (const string& data_fs_root : data_fs_roots_) { + all_roots.insert(data_fs_root); + } + + // Build a map of original root --> canonicalized root, sanitizing each + // root a bit as we go. + typedef map RootMap; + RootMap canonicalized_roots; + for (const string& root : all_roots) { + if (root.empty()) { + return Status::IOError("Empty string provided for filesystem root"); + } + if (root[0] != '/') { + return Status::IOError( + Substitute("Relative path $0 provided for filesystem root", root)); + } + { + string root_copy = root; + StripWhiteSpace(&root_copy); + if (root != root_copy) { + return Status::IOError( + Substitute("Filesystem root $0 contains illegal whitespace", root)); + } + } + + // Strip the basename when canonicalizing, as it may not exist. The + // dirname, however, must exist. + string canonicalized; + RETURN_NOT_OK(env_->Canonicalize(DirName(root), &canonicalized)); + canonicalized = JoinPathSegments(canonicalized, BaseName(root)); + InsertOrDie(&canonicalized_roots, root, canonicalized); + } + + // All done, use the map to set the canonicalized state. 
+ canonicalized_wal_fs_root_ = FindOrDie(canonicalized_roots, wal_fs_root_); + if (!data_fs_roots_.empty()) { + canonicalized_metadata_fs_root_ = FindOrDie(canonicalized_roots, data_fs_roots_[0]); + for (const string& data_fs_root : data_fs_roots_) { + canonicalized_data_fs_roots_.insert(FindOrDie(canonicalized_roots, data_fs_root)); + } + } else { + LOG(INFO) << "Data directories (fs_data_dirs) not provided"; + LOG(INFO) << "Using write-ahead log directory (fs_wal_dir) as data directory"; + canonicalized_metadata_fs_root_ = canonicalized_wal_fs_root_; + canonicalized_data_fs_roots_.insert(canonicalized_wal_fs_root_); + } + for (const RootMap::value_type& e : canonicalized_roots) { + canonicalized_all_fs_roots_.insert(e.second); + } + + if (VLOG_IS_ON(1)) { + VLOG(1) << "WAL root: " << canonicalized_wal_fs_root_; + VLOG(1) << "Metadata root: " << canonicalized_metadata_fs_root_; + VLOG(1) << "Data roots: " << canonicalized_data_fs_roots_; + VLOG(1) << "All roots: " << canonicalized_all_fs_roots_; + } + + // With the data roots canonicalized, we can initialize the block manager. 
// NOTE(review): this span was whitespace-flattened patch text; the '+' diff
// markers have been dropped and template arguments inside <> (stripped by the
// extraction) reconstructed. Verify against the imported Kudu sources.

// Tail of a function whose beginning precedes this chunk — presumably
// FsManager::Init(), which canonicalizes the filesystem roots before
// reaching this point. TODO confirm against the full file.
  InitBlockManager();

  initted_ = true;
  return Status::OK();
}

// Instantiates the block manager implementation selected by the
// --block_manager flag ("file" or "log"); any other value is a fatal
// configuration error. Does not touch the disk.
void FsManager::InitBlockManager() {
  BlockManagerOptions opts;
  opts.metric_entity = metric_entity_;
  opts.parent_mem_tracker = parent_mem_tracker_;
  opts.root_paths = GetDataRootDirs();
  opts.read_only = read_only_;
  if (FLAGS_block_manager == "file") {
    block_manager_.reset(new FileBlockManager(env_, opts));
  } else if (FLAGS_block_manager == "log") {
    block_manager_.reset(new LogBlockManager(env_, opts));
  } else {
    LOG(FATAL) << "Invalid block manager: " << FLAGS_block_manager;
  }
}

// Opens an existing filesystem layout: loads the instance metadata from every
// root, verifying that all roots carry the same UUID, then opens the block
// manager. Returns Corruption on a UUID mismatch.
Status FsManager::Open() {
  RETURN_NOT_OK(Init());
  for (const string& root : canonicalized_all_fs_roots_) {
    gscoped_ptr<InstanceMetadataPB> pb(new InstanceMetadataPB);
    RETURN_NOT_OK(pb_util::ReadPBContainerFromPath(env_, GetInstanceMetadataPath(root),
                                                   pb.get()));
    if (!metadata_) {
      // First root seen: adopt its metadata as the canonical copy.
      metadata_.reset(pb.release());
    } else if (pb->uuid() != metadata_->uuid()) {
      return Status::Corruption(Substitute(
          "Mismatched UUIDs across filesystem roots: $0 vs. $1",
          metadata_->uuid(), pb->uuid()));
    }
  }

  RETURN_NOT_OK(block_manager_->Open());
  LOG(INFO) << "Opened local filesystem: " << JoinStrings(canonicalized_all_fs_roots_, ",")
            << std::endl << metadata_->DebugString();
  return Status::OK();
}

// Creates the on-disk layout from scratch: instance metadata in every root,
// the ancillary (WAL/metadata/consensus) directories, and the block manager.
// Fails with AlreadyPresent if any existing root is non-empty. On failure,
// everything created here is rolled back via ScopedFileDeleters.
Status FsManager::CreateInitialFileSystemLayout() {
  CHECK(!read_only_);

  RETURN_NOT_OK(Init());

  // It's OK if a root already exists as long as there's nothing in it.
  for (const string& root : canonicalized_all_fs_roots_) {
    if (!env_->FileExists(root)) {
      // We'll create the directory below.
      continue;
    }
    bool is_empty;
    RETURN_NOT_OK_PREPEND(IsDirectoryEmpty(root, &is_empty),
                          "Unable to check if FSManager root is empty");
    if (!is_empty) {
      return Status::AlreadyPresent("FSManager root is not empty", root);
    }
  }

  // All roots are either empty or non-existent. Create missing roots and all
  // subdirectories.
  //
  // In the event of failure, delete everything we created.
  deque<ScopedFileDeleter*> delete_on_failure;
  ElementDeleter d(&delete_on_failure);

  InstanceMetadataPB metadata;
  CreateInstanceMetadata(&metadata);
  unordered_set<string> to_sync;
  for (const string& root : canonicalized_all_fs_roots_) {
    bool created;
    RETURN_NOT_OK_PREPEND(CreateDirIfMissing(root, &created),
                          "Unable to create FSManager root");
    if (created) {
      delete_on_failure.push_front(new ScopedFileDeleter(env_, root));
      to_sync.insert(DirName(root));
    }
    RETURN_NOT_OK_PREPEND(WriteInstanceMetadata(metadata, root),
                          "Unable to write instance metadata");
    delete_on_failure.push_front(new ScopedFileDeleter(
        env_, GetInstanceMetadataPath(root)));
  }

  // Initialize ancillary directories.
  vector<string> ancillary_dirs = { GetWalsRootDir(),
                                    GetTabletMetadataDir(),
                                    GetConsensusMetadataDir() };
  for (const string& dir : ancillary_dirs) {
    bool created;
    RETURN_NOT_OK_PREPEND(CreateDirIfMissing(dir, &created),
                          Substitute("Unable to create directory $0", dir));
    if (created) {
      delete_on_failure.push_front(new ScopedFileDeleter(env_, dir));
      to_sync.insert(DirName(dir));
    }
  }

  // Ensure newly created directories are synchronized to disk.
  if (FLAGS_enable_data_block_fsync) {
    for (const string& dir : to_sync) {
      RETURN_NOT_OK_PREPEND(env_->SyncDir(dir),
                            Substitute("Unable to synchronize directory $0", dir));
    }
  }

  // And lastly, the block manager.
  RETURN_NOT_OK_PREPEND(block_manager_->Create(), "Unable to create block manager");

  // Success: don't delete any files.
  for (ScopedFileDeleter* deleter : delete_on_failure) {
    deleter->Cancel();
  }
  return Status::OK();
}

// Populates 'metadata' with a fresh UUID and a human-readable format stamp
// (timestamp + hostname; hostname is best-effort and may be empty).
void FsManager::CreateInstanceMetadata(InstanceMetadataPB* metadata) {
  ObjectIdGenerator oid_generator;
  metadata->set_uuid(oid_generator.Next());

  string time_str;
  StringAppendStrftime(&time_str, "%Y-%m-%d %H:%M:%S", time(nullptr), false);
  string hostname;
  if (!GetHostname(&hostname).ok()) {
    hostname = "";
  }
  metadata->set_format_stamp(Substitute("Formatted at $0 on $1", time_str, hostname));
}

// Durably writes 'metadata' into 'root'. Fails if the file already exists
// (NO_OVERWRITE) — instance metadata is written once per root.
Status FsManager::WriteInstanceMetadata(const InstanceMetadataPB& metadata,
                                        const string& root) {
  const string path = GetInstanceMetadataPath(root);

  // The instance metadata is written effectively once per TS, so the
  // durability cost is negligible.
  RETURN_NOT_OK(pb_util::WritePBContainerToPath(env_, path,
                                                metadata,
                                                pb_util::NO_OVERWRITE,
                                                pb_util::SYNC));
  LOG(INFO) << "Generated new instance metadata in path " << path << ":\n"
            << metadata.DebugString();
  return Status::OK();
}

// Sets '*is_empty' to whether 'path' contains any entries besides "." and
// "..". Returns an error if 'path' cannot be listed (e.g. not a directory).
Status FsManager::IsDirectoryEmpty(const string& path, bool* is_empty) {
  vector<string> children;
  RETURN_NOT_OK(env_->GetChildren(path, &children));
  for (const string& child : children) {
    if (child == "." || child == "..") {
      continue;
    } else {
      // Found a real entry; no need to look further.
      *is_empty = false;
      return Status::OK();
    }
  }
  *is_empty = true;
  return Status::OK();
}

// Thin wrapper over env_util; '*created' reports whether the dir was made.
Status FsManager::CreateDirIfMissing(const string& path, bool* created) {
  return env_util::CreateDirIfMissing(env_, path, created);
}

// Returns the UUID from the loaded instance metadata; crashes if Open()
// has not yet populated metadata_.
const string& FsManager::uuid() const {
  return CHECK_NOTNULL(metadata_.get())->uuid();
}

vector<string> FsManager::GetDataRootDirs() const {
  // Add the data subdirectory to each data root.
  std::vector<std::string> data_paths;
  for (const string& data_fs_root : canonicalized_data_fs_roots_) {
    data_paths.push_back(JoinPathSegments(data_fs_root, kDataDirName));
  }
  return data_paths;
}

string FsManager::GetTabletMetadataDir() const {
  DCHECK(initted_);
  return JoinPathSegments(canonicalized_metadata_fs_root_, kTabletMetadataDirName);
}

string FsManager::GetTabletMetadataPath(const string& tablet_id) const {
  return JoinPathSegments(GetTabletMetadataDir(), tablet_id);
}

namespace {
// Return true if 'fname' is a valid tablet ID.
// Filters out in-progress temporary files and hidden files.
bool IsValidTabletId(const std::string& fname) {
  if (fname.find(kTmpInfix) != string::npos) {
    LOG(WARNING) << "Ignoring tmp file in tablet metadata dir: " << fname;
    return false;
  }

  if (HasPrefixString(fname, ".")) {
    // Hidden file or ./..
    VLOG(1) << "Ignoring hidden file in tablet metadata dir: " << fname;
    return false;
  }

  return true;
}
} // anonymous namespace

// Appends the IDs of all tablets found in the metadata directory to
// 'tablet_ids', skipping tmp/hidden files.
Status FsManager::ListTabletIds(vector<string>* tablet_ids) {
  string dir = GetTabletMetadataDir();
  vector<string> children;
  RETURN_NOT_OK_PREPEND(ListDir(dir, &children),
                        Substitute("Couldn't list tablets in metadata directory $0", dir));

  // NOTE(review): 'tablets' is never used below — candidate for removal.
  vector<string> tablets;
  for (const string& child : children) {
    if (!IsValidTabletId(child)) {
      continue;
    }
    tablet_ids->push_back(child);
  }
  return Status::OK();
}

string FsManager::GetInstanceMetadataPath(const string& root) const {
  return JoinPathSegments(root, kInstanceMetadataFileName);
}

// The recovery dir is the tablet's WAL dir plus a fixed suffix.
string FsManager::GetTabletWalRecoveryDir(const string& tablet_id) const {
  string path = JoinPathSegments(GetWalsRootDir(), tablet_id);
  StrAppend(&path, kWalsRecoveryDirSuffix);
  return path;
}

// WAL segments are named "<prefix>-<9-digit zero-padded sequence number>".
string FsManager::GetWalSegmentFileName(const string& tablet_id,
                                        uint64_t sequence_number) const {
  return JoinPathSegments(GetTabletWalDir(tablet_id),
                          strings::Substitute("$0-$1",
                                              kWalFileNamePrefix,
                                              StringPrintf("%09" PRIu64, sequence_number)));
}


// ==========================================================================
//  Dump/Debug utils
// ==========================================================================

// Prints a recursive listing of every filesystem root to 'out'.
void FsManager::DumpFileSystemTree(ostream& out) {
  DCHECK(initted_);

  for (const string& root : canonicalized_all_fs_roots_) {
    out << "File-System Root: " << root << std::endl;

    std::vector<std::string> objects;
    Status s = env_->GetChildren(root, &objects);
    if (!s.ok()) {
      LOG(ERROR) << "Unable to list the fs-tree: " << s.ToString();
      return;
    }

    DumpFileSystemTree(out, "|-", root, objects);
  }
}

// Recursive helper for the public overload: prints 'objects' under 'path',
// descending into listable children (directories).
void FsManager::DumpFileSystemTree(ostream& out, const string& prefix,
                                   const string& path, const vector<string>& objects) {
  for (const string& name : objects) {
    if (name == "." || name == "..") continue;

    std::vector<std::string> sub_objects;
    string sub_path = JoinPathSegments(path, name);
    Status s = env_->GetChildren(sub_path, &sub_objects);
    if (s.ok()) {
      out << prefix << name << "/" << std::endl;
      DumpFileSystemTree(out, prefix + "---", sub_path, sub_objects);
    } else {
      // Not listable — presumably a plain file.
      out << prefix << name << std::endl;
    }
  }
}

// ==========================================================================
//  Data read/write interfaces
// ==========================================================================

Status FsManager::CreateNewBlock(gscoped_ptr<fs::WritableBlock>* block) {
  CHECK(!read_only_);

  return block_manager_->CreateBlock(block);
}

Status FsManager::OpenBlock(const BlockId& block_id, gscoped_ptr<fs::ReadableBlock>* block) {
  return block_manager_->OpenBlock(block_id, block);
}

Status FsManager::DeleteBlock(const BlockId& block_id) {
  CHECK(!read_only_);

  return block_manager_->DeleteBlock(block_id);
}

// Existence is probed by attempting to open the block; any open failure is
// treated as "does not exist".
bool FsManager::BlockExists(const BlockId& block_id) const {
  gscoped_ptr<fs::ReadableBlock> block;
  return block_manager_->OpenBlock(block_id, &block).ok();
}

std::ostream& operator<<(std::ostream& o, const BlockId& block_id) {
  return o << block_id.ToString();
}

} // namespace kudu
diff --git
a/src/kudu/fs/fs_manager.h b/src/kudu/fs/fs_manager.h
new file mode 100644
index 000000000000..fab043684563
--- /dev/null
+++ b/src/kudu/fs/fs_manager.h
@@ -0,0 +1,280 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef KUDU_FS_FS_MANAGER_H
#define KUDU_FS_FS_MANAGER_H

// NOTE(review): the six bracketed header names were lost during extraction;
// reconstructed from usage below (iosfwd/memory/set/string/vector and
// gtest_prod.h for FRIEND_TEST) — verify against upstream.
#include <gtest/gtest_prod.h>
#include <iosfwd>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/util/env.h"
#include "kudu/util/path_util.h"

DECLARE_bool(enable_data_block_fsync);

namespace google {
namespace protobuf {
class Message;
} // namespace protobuf
} // namespace google

namespace kudu {

class MemTracker;
class MetricEntity;

namespace fs {
class BlockManager;
class ReadableBlock;
class WritableBlock;
} // namespace fs

namespace itest {
class ExternalMiniClusterFsInspector;
}

class BlockId;
class InstanceMetadataPB;

// Options bundle for constructing an FsManager.
struct FsManagerOpts {
  FsManagerOpts();
  ~FsManagerOpts();

  // The entity under which all metrics should be grouped. If NULL, metrics
  // will not be produced.
  //
  // Defaults to NULL.
  scoped_refptr<MetricEntity> metric_entity;

  // The memory tracker under which all new memory trackers will be parented.
  // If NULL, new memory trackers will be parented to the root tracker.
  std::shared_ptr<MemTracker> parent_mem_tracker;

  // The path where WALs will be stored. Cannot be empty.
  std::string wal_path;

  // The paths where data blocks will be stored. Cannot be empty.
  std::vector<std::string> data_paths;

  // Whether or not read-write operations should be allowed. Defaults to false.
  bool read_only;
};

// FsManager provides helpers to read data and metadata files,
// and it's responsible for abstracting the file-system layout.
//
// The user should not be aware of where files are placed,
// but instead should interact with the storage in terms of "open the block xyz"
// or "write a new schema metadata file for table kwz".
//
// The current layout is:
//    <root>/data/
//    <root>/data/<block subpaths>
// (NOTE(review): the bracketed path components in this comment were lost in
// extraction and reconstructed — verify against upstream.)
class FsManager {
 public:
  static const char *kWalFileNamePrefix;
  static const char *kWalsRecoveryDirSuffix;

  // Only for unit tests.
  FsManager(Env* env, const std::string& root_path);

  FsManager(Env* env, const FsManagerOpts& opts);
  ~FsManager();

  // Initialize and load the basic filesystem metadata.
  // If the file system has not been initialized, returns NotFound.
  // In that case, CreateInitialFileSystemLayout may be used to initialize
  // the on-disk structures.
  Status Open();

  // Create the initial filesystem layout.
  //
  // Returns an error if the file system is already initialized.
  Status CreateInitialFileSystemLayout();

  void DumpFileSystemTree(std::ostream& out);

  // Return the UUID persisted in the local filesystem. If Open()
  // has not been called, this will crash.
  const std::string& uuid() const;

  // ==========================================================================
  //  Data read/write interfaces
  // ==========================================================================

  // Creates a new anonymous block.
  //
  // Block will be synced on close.
  Status CreateNewBlock(gscoped_ptr<fs::WritableBlock>* block);

  Status OpenBlock(const BlockId& block_id,
                   gscoped_ptr<fs::ReadableBlock>* block);

  Status DeleteBlock(const BlockId& block_id);

  bool BlockExists(const BlockId& block_id) const;

  // ==========================================================================
  //  on-disk path
  // ==========================================================================
  std::vector<std::string> GetDataRootDirs() const;

  std::string GetWalsRootDir() const {
    DCHECK(initted_);
    return JoinPathSegments(canonicalized_wal_fs_root_, kWalDirName);
  }

  std::string GetTabletWalDir(const std::string& tablet_id) const {
    return JoinPathSegments(GetWalsRootDir(), tablet_id);
  }

  std::string GetTabletWalRecoveryDir(const std::string& tablet_id) const;

  std::string GetWalSegmentFileName(const std::string& tablet_id,
                                    uint64_t sequence_number) const;

  // Return the directory where tablet superblocks should be stored.
  std::string GetTabletMetadataDir() const;

  // Return the path for a specific tablet's superblock.
  std::string GetTabletMetadataPath(const std::string& tablet_id) const;

  // List the tablet IDs in the metadata directory.
  Status ListTabletIds(std::vector<std::string>* tablet_ids);

  // Return the path where InstanceMetadataPB is stored.
  std::string GetInstanceMetadataPath(const std::string& root) const;

  // Return the directory where the consensus metadata is stored.
  std::string GetConsensusMetadataDir() const {
    DCHECK(initted_);
    return JoinPathSegments(canonicalized_metadata_fs_root_, kConsensusMetadataDirName);
  }

  // Return the path where ConsensusMetadataPB is stored.
  std::string GetConsensusMetadataPath(const std::string& tablet_id) const {
    return JoinPathSegments(GetConsensusMetadataDir(), tablet_id);
  }

  Env *env() { return env_; }

  bool read_only() const {
    return read_only_;
  }

  // ==========================================================================
  //  file-system helpers
  // ==========================================================================
  bool Exists(const std::string& path) const {
    return env_->FileExists(path);
  }

  Status ListDir(const std::string& path, std::vector<std::string> *objects) const {
    return env_->GetChildren(path, objects);
  }

  Status CreateDirIfMissing(const std::string& path, bool* created = NULL);

  fs::BlockManager* block_manager() {
    return block_manager_.get();
  }

 private:
  FRIEND_TEST(FsManagerTestBase, TestDuplicatePaths);
  friend class itest::ExternalMiniClusterFsInspector; // for access to directory names

  // Initializes, sanitizes, and canonicalizes the filesystem roots.
  Status Init();

  // Select and create an instance of the appropriate block manager.
  //
  // Does not actually perform any on-disk operations.
  void InitBlockManager();

  // Create a new InstanceMetadataPB.
  void CreateInstanceMetadata(InstanceMetadataPB* metadata);

  // Save a InstanceMetadataPB to the filesystem.
  // Does not mutate the current state of the fsmanager.
  Status WriteInstanceMetadata(const InstanceMetadataPB& metadata,
                               const std::string& root);

  // Checks if 'path' is an empty directory.
  //
  // Returns an error if it's not a directory. Otherwise, sets 'is_empty'
  // accordingly.
  Status IsDirectoryEmpty(const std::string& path, bool* is_empty);

  // ==========================================================================
  //  file-system helpers
  // ==========================================================================
  void DumpFileSystemTree(std::ostream& out,
                          const std::string& prefix,
                          const std::string& path,
                          const std::vector<std::string>& objects);

  static const char *kDataDirName;
  static const char *kTabletMetadataDirName;
  static const char *kWalDirName;
  static const char *kCorruptedSuffix;
  static const char *kInstanceMetadataFileName;
  static const char *kInstanceMetadataMagicNumber;
  static const char *kTabletSuperBlockMagicNumber;
  static const char *kConsensusMetadataDirName;

  Env *env_;

  // If false, operations that mutate on-disk state are prohibited.
  const bool read_only_;

  // These roots are the constructor input verbatim. None of them are used
  // as-is; they are first canonicalized during Init().
  const std::string wal_fs_root_;
  const std::vector<std::string> data_fs_roots_;

  scoped_refptr<MetricEntity> metric_entity_;

  std::shared_ptr<MemTracker> parent_mem_tracker_;

  // Canonicalized forms of 'wal_fs_root_' and 'data_fs_roots_'. Constructed
  // during Init().
  //
  // - The first data root is used as the metadata root.
  // - Common roots in the collections have been deduplicated.
  std::string canonicalized_wal_fs_root_;
  std::string canonicalized_metadata_fs_root_;
  std::set<std::string> canonicalized_data_fs_roots_;
  std::set<std::string> canonicalized_all_fs_roots_;

  gscoped_ptr<InstanceMetadataPB> metadata_;

  gscoped_ptr<fs::BlockManager> block_manager_;

  bool initted_;

  DISALLOW_COPY_AND_ASSIGN(FsManager);
};

} // namespace kudu

#endif
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
new file mode 100644
index 000000000000..3f48aa261e20
--- /dev/null
+++ b/src/kudu/fs/log_block_manager.cc
@@ -0,0 +1,1601 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/fs/log_block_manager.h" + + +#include "kudu/fs/block_manager_metrics.h" +#include "kudu/fs/block_manager_util.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/alignment.h" +#include "kudu/util/atomic.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/malloc.h" +#include "kudu/util/metrics.h" +#include "kudu/util/mutex.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/random_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/threadpool.h" + +// TODO: How should this be configured? Should provide some guidance. +DEFINE_uint64(log_container_max_size, 10LU * 1024 * 1024 * 1024, + "Maximum size (soft) of a log container"); +TAG_FLAG(log_container_max_size, advanced); + +DEFINE_uint64(log_container_preallocate_bytes, 32LU * 1024 * 1024, + "Number of bytes to preallocate in a log container when " + "creating new blocks. 
Set to 0 to disable preallocation"); +TAG_FLAG(log_container_preallocate_bytes, advanced); + +DEFINE_bool(log_block_manager_test_hole_punching, true, + "Ensure hole punching is supported by the underlying filesystem"); +TAG_FLAG(log_block_manager_test_hole_punching, advanced); +TAG_FLAG(log_block_manager_test_hole_punching, unsafe); + +DECLARE_bool(enable_data_block_fsync); +DECLARE_bool(block_manager_lock_dirs); + +METRIC_DEFINE_gauge_uint64(server, log_block_manager_bytes_under_management, + "Bytes Under Management", + kudu::MetricUnit::kBytes, + "Number of bytes of data blocks currently under management"); + +METRIC_DEFINE_gauge_uint64(server, log_block_manager_blocks_under_management, + "Blocks Under Management", + kudu::MetricUnit::kBlocks, + "Number of data blocks currently under management"); + +METRIC_DEFINE_counter(server, log_block_manager_containers, + "Number of Block Containers", + kudu::MetricUnit::kLogBlockContainers, + "Number of log block containers"); + +METRIC_DEFINE_counter(server, log_block_manager_full_containers, + "Number of Full Block Counters", + kudu::MetricUnit::kLogBlockContainers, + "Number of full log block containers"); + +using std::unordered_map; +using std::unordered_set; +using strings::Substitute; +using kudu::env_util::ScopedFileDeleter; +using kudu::fs::internal::LogBlock; +using kudu::fs::internal::LogBlockContainer; +using kudu::pb_util::ReadablePBContainerFile; +using kudu::pb_util::WritablePBContainerFile; + +namespace kudu { + +namespace fs { + +namespace internal { + +//////////////////////////////////////////////////////////// +// LogBlockManagerMetrics +//////////////////////////////////////////////////////////// + +// Metrics container associated with the log block manager. +// +// Includes implementation-agnostic metrics as well as some that are +// specific to the log block manager. 
// NOTE(review): this span was whitespace-flattened patch text; the '+' diff
// markers have been dropped and template arguments inside <> (stripped by the
// extraction) reconstructed. Verify against the imported Kudu sources.
struct LogBlockManagerMetrics {
  explicit LogBlockManagerMetrics(const scoped_refptr<MetricEntity>& metric_entity);

  // Implementation-agnostic metrics.
  BlockManagerMetrics generic_metrics;

  scoped_refptr<AtomicGauge<uint64_t> > bytes_under_management;
  scoped_refptr<AtomicGauge<uint64_t> > blocks_under_management;

  scoped_refptr<Counter> containers;
  scoped_refptr<Counter> full_containers;
};

// Shorthand initializers: MINIT for counters, GINIT for gauges (start at 0).
#define MINIT(x) x(METRIC_log_block_manager_##x.Instantiate(metric_entity))
#define GINIT(x) x(METRIC_log_block_manager_##x.Instantiate(metric_entity, 0))
LogBlockManagerMetrics::LogBlockManagerMetrics(const scoped_refptr<MetricEntity>& metric_entity)
  : generic_metrics(metric_entity),
    GINIT(bytes_under_management),
    GINIT(blocks_under_management),
    MINIT(containers),
    MINIT(full_containers) {
}
#undef GINIT
#undef MINIT

////////////////////////////////////////////////////////////
// LogBlockContainer
////////////////////////////////////////////////////////////

// A single block container belonging to the log-backed block manager.
//
// A container may only be used to write one WritableBlock at a given time.
// However, existing blocks may be deleted concurrently. As such, almost
// all container functions must be reentrant, even if the container itself
// is logically thread unsafe (i.e. multiple clients calling WriteData()
// concurrently will produce nonsensical container data). Thread unsafe
// functions are marked explicitly.
class LogBlockContainer {
 public:
  static const std::string kMetadataFileSuffix;
  static const std::string kDataFileSuffix;
  static const char* kMagic;

  // Creates a new block container in 'dir'.
  static Status Create(LogBlockManager* block_manager,
                       PathInstanceMetadataPB* instance,
                       const std::string& dir,
                       gscoped_ptr<LogBlockContainer>* container);

  // Opens an existing block container in 'dir'.
  //
  // Every container is comprised of two files: a ".data" file and a
  // ".metadata" file under 'dir', both named by 'id'. Together, 'dir' and
  // 'id' fully describe both files.
  // (NOTE(review): the bracketed path template in this comment was lost in
  // extraction and paraphrased — verify against upstream.)
  static Status Open(LogBlockManager* block_manager,
                     PathInstanceMetadataPB* instance,
                     const std::string& dir,
                     const std::string& id,
                     gscoped_ptr<LogBlockContainer>* container);

  // Indicates that the writing of 'block' is finished. If successful,
  // adds the block to the block manager's in-memory maps.
  //
  // Returns a status that is either the same as 's' (if !s.ok()) or
  // potentially different (if s.ok() and FinishBlock() failed).
  //
  // After returning, this container has been released to the block manager
  // and may no longer be used in the context of writing 'block'.
  Status FinishBlock(const Status& s, WritableBlock* block);

  // Frees the space associated with a block at 'offset' and 'length'. This
  // is a physical operation, not a logical one; a separate AppendMetadata()
  // is required to record the deletion in container metadata.
  //
  // The on-disk effects of this call are made durable only after SyncData().
  Status DeleteBlock(int64_t offset, int64_t length);

  // Writes 'data' to this container's data file at offset 'offset'.
  //
  // The on-disk effects of this call are made durable only after SyncData().
  Status WriteData(int64_t offset, const Slice& data);

  // See RWFile::Read().
  Status ReadData(int64_t offset, size_t length,
                  Slice* result, uint8_t* scratch) const;

  // Appends 'pb' to this container's metadata file.
  //
  // The on-disk effects of this call are made durable only after SyncMetadata().
  Status AppendMetadata(const BlockRecordPB& pb);

  // Asynchronously flush this container's data file from 'offset' through
  // to 'length'.
  //
  // Does not guarantee data durability; use SyncData() for that.
  Status FlushData(int64_t offset, int64_t length);

  // Asynchronously flush this container's metadata file (all dirty bits).
  //
  // Does not guarantee metadata durability; use SyncMetadata() for that.
  //
  // TODO: Add support to just flush a range.
  Status FlushMetadata();

  // Synchronize this container's data file with the disk. On success,
  // guarantees that the data is made durable.
  //
  // TODO: Add support to synchronize just a range.
  Status SyncData();

  // Synchronize this container's metadata file with the disk. On success,
  // guarantees that the metadata is made durable.
  //
  // TODO: Add support to synchronize just a range.
  Status SyncMetadata();

  // Ensure that 'length' bytes are preallocated in this container,
  // beginning from the position where the last written block ended.
  Status Preallocate(size_t length);

  // Manipulates the block manager's memory tracker on behalf of blocks.
  void ConsumeMemory(int64_t bytes);
  void ReleaseMemory(int64_t bytes);

  // Reads the container's metadata from disk, sanity checking and
  // returning the records.
  Status ReadContainerRecords(deque<BlockRecordPB>* records) const;

  // Updates 'total_bytes_written_', marking this container as full if
  // needed. Should only be called when a block is fully written, as it
  // will round up the container data file's position.
  //
  // This function is thread unsafe.
  void UpdateBytesWritten(int64_t more_bytes);

  // Run a task on this container's root path thread pool.
  //
  // Normally the task is performed asynchronously. However, if submission to
  // the pool fails, it runs synchronously on the current thread.
  void ExecClosure(const Closure& task);

  // Simple accessors.
  std::string dir() const { return DirName(path_); }
  const std::string& ToString() const { return path_; }
  LogBlockManager* block_manager() const { return block_manager_; }
  int64_t total_bytes_written() const { return total_bytes_written_; }
  bool full() const {
    return total_bytes_written_ >= FLAGS_log_container_max_size;
  }
  const LogBlockManagerMetrics* metrics() const { return metrics_; }
  const PathInstanceMetadataPB* instance() const { return instance_; }

 private:
  // RAII-style class for finishing containers in FinishBlock().
  class ScopedFinisher {
   public:
    // 'container' must outlive the finisher.
    explicit ScopedFinisher(LogBlockContainer* container) :
      container_(container) {
    }
    ~ScopedFinisher() {
      container_->block_manager()->MakeContainerAvailable(container_);
    }
   private:
    LogBlockContainer* container_;
  };

  LogBlockContainer(LogBlockManager* block_manager,
                    PathInstanceMetadataPB* instance, std::string path,
                    gscoped_ptr<WritablePBContainerFile> metadata_writer,
                    gscoped_ptr<RWFile> data_file);

  // Performs sanity checks on a block record.
  void CheckBlockRecord(const BlockRecordPB& record,
                        uint64_t data_file_size) const;

  // The owning block manager. Must outlive the container itself.
  LogBlockManager* const block_manager_;

  // The path to the container's files: 'dir' joined with 'id' (see the
  // container constructor).
  const std::string path_;

  // Opened file handles to the container's files.
  //
  // WritableFile is not thread safe so access to each writer must be
  // serialized through a (sleeping) mutex. We use different mutexes to
  // avoid contention in cases where only one writer is needed.
  gscoped_ptr<WritablePBContainerFile> metadata_pb_writer_;
  Mutex metadata_pb_writer_lock_;
  Mutex data_writer_lock_;
  gscoped_ptr<RWFile> data_file_;

  // The amount of data written thus far in the container.
  int64_t total_bytes_written_;

  // The metrics. Not owned by the log container; it has the same lifespan
  // as the block manager.
  const LogBlockManagerMetrics* metrics_;

  const PathInstanceMetadataPB* instance_;

  DISALLOW_COPY_AND_ASSIGN(LogBlockContainer);
};

const std::string LogBlockContainer::kMetadataFileSuffix(".metadata");
const std::string LogBlockContainer::kDataFileSuffix(".data");

LogBlockContainer::LogBlockContainer(
    LogBlockManager* block_manager, PathInstanceMetadataPB* instance,
    string path, gscoped_ptr<WritablePBContainerFile> metadata_writer,
    gscoped_ptr<RWFile> data_file)
    : block_manager_(block_manager),
      path_(std::move(path)),
      metadata_pb_writer_(metadata_writer.Pass()),
      data_file_(data_file.Pass()),
      total_bytes_written_(0),
      metrics_(block_manager->metrics()),
      instance_(instance) {}

Status LogBlockContainer::Create(LogBlockManager* block_manager,
                                 PathInstanceMetadataPB* instance,
                                 const string& dir,
                                 gscoped_ptr<LogBlockContainer>* container) {
  string common_path;
  string metadata_path;
  string data_path;
  Status metadata_status;
  Status data_status;
  gscoped_ptr<WritableFile> metadata_writer;
  gscoped_ptr<RWFile> data_file;
  WritableFileOptions wr_opts;
  wr_opts.mode = Env::CREATE_NON_EXISTING;

  // Repeat in the event of a container id collision (unlikely).
  //
  // When looping, we delete any created-and-orphaned files.
  do {
    if (metadata_writer) {
      block_manager->env()->DeleteFile(metadata_path);
    }
    common_path = JoinPathSegments(dir, block_manager->oid_generator()->Next());
    metadata_path = StrCat(common_path, kMetadataFileSuffix);
    metadata_status = block_manager->env()->NewWritableFile(wr_opts,
                                                            metadata_path,
                                                            &metadata_writer);
    if (data_file) {
      block_manager->env()->DeleteFile(data_path);
    }
    data_path = StrCat(common_path, kDataFileSuffix);
    RWFileOptions rw_opts;
    rw_opts.mode = Env::CREATE_NON_EXISTING;
    data_status = block_manager->env()->NewRWFile(rw_opts,
                                                  data_path,
                                                  &data_file);
  } while (PREDICT_FALSE(metadata_status.IsAlreadyPresent() ||
                         data_status.IsAlreadyPresent()));
  if (metadata_status.ok() && data_status.ok()) {
    gscoped_ptr<WritablePBContainerFile> metadata_pb_writer(
        new WritablePBContainerFile(metadata_writer.Pass()));
    RETURN_NOT_OK(metadata_pb_writer->Init(BlockRecordPB()));
    container->reset(new LogBlockContainer(block_manager,
                                           instance,
                                           common_path,
                                           metadata_pb_writer.Pass(),
                                           data_file.Pass()));
    VLOG(1) << "Created log block container " << (*container)->ToString();
  }

  // Prefer metadata status (arbitrarily).
  return !metadata_status.ok() ? metadata_status : data_status;
}

Status LogBlockContainer::Open(LogBlockManager* block_manager,
                               PathInstanceMetadataPB* instance,
                               const string& dir, const string& id,
                               gscoped_ptr<LogBlockContainer>* container) {
  string common_path = JoinPathSegments(dir, id);

  // Open the existing metadata and data files for writing.
  string metadata_path = StrCat(common_path, kMetadataFileSuffix);
  gscoped_ptr<WritableFile> metadata_writer;
  WritableFileOptions wr_opts;
  wr_opts.mode = Env::OPEN_EXISTING;

  RETURN_NOT_OK(block_manager->env()->NewWritableFile(wr_opts,
                                                      metadata_path,
                                                      &metadata_writer));
  gscoped_ptr<WritablePBContainerFile> metadata_pb_writer(
      new WritablePBContainerFile(metadata_writer.Pass()));
  // No call to metadata_pb_writer->Init() because we're reopening an
  // existing pb container (that should already have a valid header).

  string data_path = StrCat(common_path, kDataFileSuffix);
  gscoped_ptr<RWFile> data_file;
  RWFileOptions rw_opts;
  rw_opts.mode = Env::OPEN_EXISTING;
  RETURN_NOT_OK(block_manager->env()->NewRWFile(rw_opts,
                                                data_path,
                                                &data_file));

  // Create the in-memory container and populate it.
  gscoped_ptr<LogBlockContainer> open_container(new LogBlockContainer(block_manager,
                                                                      instance,
                                                                      common_path,
                                                                      metadata_pb_writer.Pass(),
                                                                      data_file.Pass()));
  VLOG(1) << "Opened log block container " << open_container->ToString();
  container->reset(open_container.release());
  return Status::OK();
}

Status LogBlockContainer::ReadContainerRecords(deque<BlockRecordPB>* records) const {
  string metadata_path = StrCat(path_, kMetadataFileSuffix);
  gscoped_ptr<RandomAccessFile> metadata_reader;
  RETURN_NOT_OK(block_manager()->env()->NewRandomAccessFile(metadata_path, &metadata_reader));
  ReadablePBContainerFile pb_reader(metadata_reader.Pass());
  RETURN_NOT_OK(pb_reader.Init());

  uint64_t data_file_size;
  RETURN_NOT_OK(data_file_->Size(&data_file_size));
  deque<BlockRecordPB> local_records;
  Status read_status;
  while (true) {
    local_records.resize(local_records.size() + 1);
    read_status = pb_reader.ReadNextPB(&local_records.back());
    if (!read_status.ok()) {
      // Drop the last element; we didn't use it.
      local_records.pop_back();
      break;
    }
    CheckBlockRecord(local_records.back(), data_file_size);
  }
  // EOF is the expected way to finish reading; anything else wins over the
  // close status.
  Status close_status = pb_reader.Close();
  Status ret = !read_status.IsEndOfFile() ? read_status : close_status;
  if (ret.ok()) {
    records->swap(local_records);
  }
  return ret;
}

void LogBlockContainer::CheckBlockRecord(const BlockRecordPB& record,
                                         uint64_t data_file_size) const {
  // A CREATE record must carry a sane offset/length pair that fits within
  // the data file; a violation indicates on-disk corruption.
  if (record.op_type() == CREATE &&
      (!record.has_offset() ||
       !record.has_length() ||
       record.offset() < 0  ||
       record.length() < 0  ||
       record.offset() + record.length() > data_file_size)) {
    LOG(FATAL) << "Found malformed block record: " << record.DebugString();
  }
}

Status LogBlockContainer::FinishBlock(const Status& s, WritableBlock* block) {
  ScopedFinisher finisher(this);
  if (!s.ok()) {
    // Early return; 'finisher' makes the container available again.
    return s;
  }

  // A failure when syncing the container means the container (and its new
  // blocks) may be missing the next time the on-disk state is reloaded.
  //
  // As such, it's not correct to add the block to in-memory state unless
  // synchronization succeeds. In the worst case, this means the data file
  // will have written some garbage that can be expunged during a GC.
  RETURN_NOT_OK(block_manager()->SyncContainer(*this));

  CHECK(block_manager()->AddLogBlock(this, block->id(),
                                     total_bytes_written(), block->BytesAppended()));
  UpdateBytesWritten(block->BytesAppended());
  if (full() && block_manager()->metrics()) {
    block_manager()->metrics()->full_containers->Increment();
  }
  return Status::OK();
}

Status LogBlockContainer::DeleteBlock(int64_t offset, int64_t length) {
  DCHECK_GE(offset, 0);
  DCHECK_GE(length, 0);

  // Guaranteed by UpdateBytesWritten().
  DCHECK_EQ(0, offset % instance()->filesystem_block_size_bytes());

  // It is invalid to punch a zero-size hole.
  if (length) {
    lock_guard<Mutex> l(&data_writer_lock_);
    // Round up to the nearest filesystem block so that the kernel will
    // actually reclaim disk space.
    //
    // It's OK if we exceed the file's total size; the kernel will truncate
    // our request.
    return data_file_->PunchHole(offset, KUDU_ALIGN_UP(
        length, instance()->filesystem_block_size_bytes()));
  }
  return Status::OK();
}

Status LogBlockContainer::WriteData(int64_t offset, const Slice& data) {
  DCHECK_GE(offset, 0);

  lock_guard<Mutex> l(&data_writer_lock_);
  return data_file_->Write(offset, data);
}

Status LogBlockContainer::ReadData(int64_t offset, size_t length,
                                   Slice* result, uint8_t* scratch) const {
  DCHECK_GE(offset, 0);

  // Reads require no locking; RWFile reads are positional.
  return data_file_->Read(offset, length, result, scratch);
}

Status LogBlockContainer::AppendMetadata(const BlockRecordPB& pb) {
  lock_guard<Mutex> l(&metadata_pb_writer_lock_);
  return metadata_pb_writer_->Append(pb);
}

Status LogBlockContainer::FlushData(int64_t offset, int64_t length) {
  DCHECK_GE(offset, 0);
  DCHECK_GE(length, 0);

  lock_guard<Mutex> l(&data_writer_lock_);
  return data_file_->Flush(RWFile::FLUSH_ASYNC, offset, length);
}

Status LogBlockContainer::FlushMetadata() {
  lock_guard<Mutex> l(&metadata_pb_writer_lock_);
  return metadata_pb_writer_->Flush();
}

Status LogBlockContainer::SyncData() {
  // Syncing is skipped entirely when data block fsync is disabled.
  if (FLAGS_enable_data_block_fsync) {
    lock_guard<Mutex> l(&data_writer_lock_);
    return data_file_->Sync();
  }
  return Status::OK();
}

Status LogBlockContainer::SyncMetadata() {
  if (FLAGS_enable_data_block_fsync) {
    lock_guard<Mutex> l(&metadata_pb_writer_lock_);
    return metadata_pb_writer_->Sync();
  }
  return Status::OK();
}

Status LogBlockContainer::Preallocate(size_t length) {
  return data_file_->PreAllocate(total_bytes_written(), length);
}

void LogBlockContainer::ConsumeMemory(int64_t bytes) {
  block_manager()->mem_tracker_->Consume(bytes);
}

void LogBlockContainer::ReleaseMemory(int64_t bytes) {
  block_manager()->mem_tracker_->Release(bytes);
}

// NOTE(review): the remainder of this function lies past the end of this
// chunk; reproduced only up to the cut.
void LogBlockContainer::UpdateBytesWritten(int64_t more_bytes) {
  DCHECK_GE(more_bytes, 0);

  // The number of bytes is rounded up to the nearest filesystem block so
  // that each Kudu block is guaranteed to be on a filesystem block
+ // boundary. This guarantees that the disk space can be reclaimed when + // the block is deleted. + total_bytes_written_ += KUDU_ALIGN_UP(more_bytes, + instance()->filesystem_block_size_bytes()); + if (full()) { + VLOG(1) << "Container " << ToString() << " with size " + << total_bytes_written_ << " is now full, max size is " + << FLAGS_log_container_max_size; + } +} + +void LogBlockContainer::ExecClosure(const Closure& task) { + ThreadPool* pool = FindOrDie(block_manager()->thread_pools_by_root_path_, + dir()); + Status s = pool->SubmitClosure(task); + if (!s.ok()) { + WARN_NOT_OK( + s, "Could not submit task to thread pool, running it synchronously"); + task.Run(); + } +} + +//////////////////////////////////////////////////////////// +// LogBlock +//////////////////////////////////////////////////////////// + +// The persistent metadata that describes a logical block. +// +// A block grows a LogBlock when its data has been synchronized with +// the disk. That's when it's fully immutable (i.e. none of its metadata +// can change), and when it becomes readable and persistent. +// +// LogBlocks are reference counted to simplify support for deletion with +// outstanding readers. All refcount increments are performed with the +// block manager lock held, as are deletion-based decrements. However, +// no lock is held when ~LogReadableBlock decrements the refcount, thus it +// must be made thread safe (by extending RefCountedThreadSafe instead of +// the simpler RefCounted). +class LogBlock : public RefCountedThreadSafe { + public: + LogBlock(LogBlockContainer* container, BlockId block_id, int64_t offset, + int64_t length); + ~LogBlock(); + + const BlockId& block_id() const { return block_id_; } + LogBlockContainer* container() const { return container_; } + int64_t offset() const { return offset_; } + int64_t length() const { return length_; } + + // Delete the block. Actual deletion takes place when the + // block is destructed. 
+ void Delete(); + + private: + // The owning container. Must outlive the LogBlock. + LogBlockContainer* container_; + + // The block identifier. + const BlockId block_id_; + + // The block's offset in the container. + const int64_t offset_; + + // The block's length. + const int64_t length_; + + // Whether the block has been marked for deletion. + bool deleted_; + + DISALLOW_COPY_AND_ASSIGN(LogBlock); +}; + +LogBlock::LogBlock(LogBlockContainer* container, BlockId block_id, + int64_t offset, int64_t length) + : container_(container), + block_id_(std::move(block_id)), + offset_(offset), + length_(length), + deleted_(false) { + DCHECK_GE(offset, 0); + DCHECK_GE(length, 0); + + container_->ConsumeMemory(kudu_malloc_usable_size(this)); +} + +static void DeleteBlockAsync(LogBlockContainer* container, + BlockId block_id, + int64_t offset, + int64_t length) { + // We don't call SyncData() to synchronize the deletion because it's + // expensive, and in the worst case, we'll just leave orphaned data + // behind to be cleaned up in the next GC. + VLOG(3) << "Freeing space belonging to block " << block_id; + WARN_NOT_OK(container->DeleteBlock(offset, length), + Substitute("Could not delete block $0", block_id.ToString())); +} + +LogBlock::~LogBlock() { + if (deleted_) { + container_->ExecClosure(Bind(&DeleteBlockAsync, container_, block_id_, + offset_, length_)); + } + container_->ReleaseMemory(kudu_malloc_usable_size(this)); +} + +void LogBlock::Delete() { + DCHECK(!deleted_); + deleted_ = true; +} + +//////////////////////////////////////////////////////////// +// LogWritableBlock +//////////////////////////////////////////////////////////// + +// A log-backed block that has been opened for writing. +// +// There's no reference to a LogBlock as this block has yet to be +// persisted. 
+class LogWritableBlock : public WritableBlock { + public: + enum SyncMode { + SYNC, + NO_SYNC + }; + + LogWritableBlock(LogBlockContainer* container, BlockId block_id, + int64_t block_offset); + + virtual ~LogWritableBlock(); + + virtual Status Close() OVERRIDE; + + virtual Status Abort() OVERRIDE; + + virtual const BlockId& id() const OVERRIDE; + + virtual BlockManager* block_manager() const OVERRIDE; + + virtual Status Append(const Slice& data) OVERRIDE; + + virtual Status FlushDataAsync() OVERRIDE; + + virtual size_t BytesAppended() const OVERRIDE; + + virtual State state() const OVERRIDE; + + // Actually close the block, possibly synchronizing its dirty data and + // metadata to disk. + Status DoClose(SyncMode mode); + + // Write this block's metadata to disk. + // + // Does not synchronize the written data; that takes place in Close(). + Status AppendMetadata(); + + private: + + // RAII-style class for finishing writable blocks in DoClose(). + class ScopedFinisher { + public: + // Both 'block' and 's' must outlive the finisher. + ScopedFinisher(LogWritableBlock* block, Status* s) : + block_(block), + status_(s) { + } + ~ScopedFinisher() { + block_->state_ = CLOSED; + *status_ = block_->container_->FinishBlock(*status_, block_); + } + private: + LogWritableBlock* block_; + Status* status_; + }; + + // The owning container. Must outlive the block. + LogBlockContainer* container_; + + // The block's identifier. + const BlockId block_id_; + + // The block's offset within the container. Known from the moment the + // block is created. + const int64_t block_offset_; + + // The block's length. Changes with each Append(). + int64_t block_length_; + + // The state of the block describing where it is in the write lifecycle, + // for example, has it been synchronized to disk? 
+ WritableBlock::State state_; + + DISALLOW_COPY_AND_ASSIGN(LogWritableBlock); +}; + +LogWritableBlock::LogWritableBlock(LogBlockContainer* container, + BlockId block_id, int64_t block_offset) + : container_(container), + block_id_(std::move(block_id)), + block_offset_(block_offset), + block_length_(0), + state_(CLEAN) { + DCHECK_GE(block_offset, 0); + DCHECK_EQ(0, block_offset % container->instance()->filesystem_block_size_bytes()); + if (container->metrics()) { + container->metrics()->generic_metrics.blocks_open_writing->Increment(); + container->metrics()->generic_metrics.total_writable_blocks->Increment(); + } +} + +LogWritableBlock::~LogWritableBlock() { + if (state_ != CLOSED) { + WARN_NOT_OK(Abort(), Substitute("Failed to abort block $0", + id().ToString())); + } +} + +Status LogWritableBlock::Close() { + return DoClose(SYNC); +} + +Status LogWritableBlock::Abort() { + RETURN_NOT_OK(DoClose(NO_SYNC)); + + // DoClose() has unlocked the container; it may be locked by someone else. + // But block_manager_ is immutable, so this is safe. + return container_->block_manager()->DeleteBlock(id()); +} + +const BlockId& LogWritableBlock::id() const { + return block_id_; +} + +BlockManager* LogWritableBlock::block_manager() const { + return container_->block_manager(); +} + +Status LogWritableBlock::Append(const Slice& data) { + DCHECK(state_ == CLEAN || state_ == DIRTY) + << "Invalid state: " << state_; + + // The metadata change is deferred to Close() or FlushDataAsync(), + // whichever comes first. We can't do it now because the block's + // length is still in flux. 
+ RETURN_NOT_OK(container_->WriteData(block_offset_ + block_length_, data)); + + block_length_ += data.size(); + state_ = DIRTY; + return Status::OK(); +} + +Status LogWritableBlock::FlushDataAsync() { + DCHECK(state_ == CLEAN || state_ == DIRTY || state_ == FLUSHING) + << "Invalid state: " << state_; + if (state_ == DIRTY) { + VLOG(3) << "Flushing block " << id(); + RETURN_NOT_OK(container_->FlushData(block_offset_, block_length_)); + + RETURN_NOT_OK(AppendMetadata()); + + // TODO: Flush just the range we care about. + RETURN_NOT_OK(container_->FlushMetadata()); + } + + state_ = FLUSHING; + return Status::OK(); +} + +size_t LogWritableBlock::BytesAppended() const { + return block_length_; +} + + +WritableBlock::State LogWritableBlock::state() const { + return state_; +} + +Status LogWritableBlock::DoClose(SyncMode mode) { + if (state_ == CLOSED) { + return Status::OK(); + } + + // Tracks the first failure (if any). + // + // It's important that any subsequent failures mutate 's' before + // returning. Otherwise 'finisher' won't properly provide the first + // failure to LogBlockContainer::FinishBlock(). + // + // Note also that when 'finisher' goes out of scope it may mutate 's'. + Status s; + { + ScopedFinisher finisher(this, &s); + + // FlushDataAsync() was not called; append the metadata now. + if (state_ == CLEAN || state_ == DIRTY) { + s = AppendMetadata(); + RETURN_NOT_OK(s); + } + + if (mode == SYNC && + (state_ == CLEAN || state_ == DIRTY || state_ == FLUSHING)) { + VLOG(3) << "Syncing block " << id(); + + // TODO: Sync just this block's dirty data. + s = container_->SyncData(); + RETURN_NOT_OK(s); + + // TODO: Sync just this block's dirty metadata. 
+ s = container_->SyncMetadata(); + RETURN_NOT_OK(s); + + if (container_->metrics()) { + container_->metrics()->generic_metrics.blocks_open_writing->Decrement(); + container_->metrics()->generic_metrics.total_bytes_written->IncrementBy( + BytesAppended()); + } + } + } + + return s; +} + +Status LogWritableBlock::AppendMetadata() { + BlockRecordPB record; + id().CopyToPB(record.mutable_block_id()); + record.set_op_type(CREATE); + record.set_timestamp_us(GetCurrentTimeMicros()); + record.set_offset(block_offset_); + record.set_length(block_length_); + return container_->AppendMetadata(record); +} + +//////////////////////////////////////////////////////////// +// LogReadableBlock +//////////////////////////////////////////////////////////// + +// A log-backed block that has been opened for reading. +// +// Refers to a LogBlock representing the block's persisted metadata. +class LogReadableBlock : public ReadableBlock { + public: + LogReadableBlock(LogBlockContainer* container, + const scoped_refptr& log_block); + + virtual ~LogReadableBlock(); + + virtual Status Close() OVERRIDE; + + virtual const BlockId& id() const OVERRIDE; + + virtual Status Size(uint64_t* sz) const OVERRIDE; + + virtual Status Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const OVERRIDE; + + virtual size_t memory_footprint() const OVERRIDE; + + private: + // The owning container. Must outlive this block. + LogBlockContainer* container_; + + // A reference to this block's metadata. + scoped_refptr log_block_; + + // Whether or not this block has been closed. Close() is thread-safe, so + // this must be an atomic primitive. 
+ AtomicBool closed_; + + DISALLOW_COPY_AND_ASSIGN(LogReadableBlock); +}; + +LogReadableBlock::LogReadableBlock(LogBlockContainer* container, + const scoped_refptr& log_block) + : container_(container), + log_block_(log_block), + closed_(false) { + if (container_->metrics()) { + container_->metrics()->generic_metrics.blocks_open_reading->Increment(); + container_->metrics()->generic_metrics.total_readable_blocks->Increment(); + } +} + +LogReadableBlock::~LogReadableBlock() { + WARN_NOT_OK(Close(), Substitute("Failed to close block $0", + id().ToString())); +} + +Status LogReadableBlock::Close() { + if (closed_.CompareAndSet(false, true)) { + log_block_.reset(); + if (container_->metrics()) { + container_->metrics()->generic_metrics.blocks_open_reading->Decrement(); + } + } + + return Status::OK(); +} + +const BlockId& LogReadableBlock::id() const { + return log_block_->block_id(); +} + +Status LogReadableBlock::Size(uint64_t* sz) const { + DCHECK(!closed_.Load()); + + *sz = log_block_->length(); + return Status::OK(); +} + +Status LogReadableBlock::Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const { + DCHECK(!closed_.Load()); + + uint64_t read_offset = log_block_->offset() + offset; + if (log_block_->length() < offset + length) { + return Status::IOError("Out-of-bounds read", + Substitute("read of [$0-$1) in block [$2-$3)", + read_offset, + read_offset + length, + log_block_->offset(), + log_block_->offset() + log_block_->length())); + } + RETURN_NOT_OK(container_->ReadData(read_offset, length, result, scratch)); + + if (container_->metrics()) { + container_->metrics()->generic_metrics.total_bytes_read->IncrementBy(length); + } + return Status::OK(); +} + +size_t LogReadableBlock::memory_footprint() const { + return kudu_malloc_usable_size(this); +} + +} // namespace internal + +//////////////////////////////////////////////////////////// +// LogBlockManager +//////////////////////////////////////////////////////////// + +static const 
char* kBlockManagerType = "log"; + +LogBlockManager::LogBlockManager(Env* env, const BlockManagerOptions& opts) + : mem_tracker_(MemTracker::CreateTracker(-1, + "log_block_manager", + opts.parent_mem_tracker)), + // TODO: C++11 provides a single-arg constructor + blocks_by_block_id_(10, + BlockMap::hasher(), + BlockMap::key_equal(), + BlockAllocator(mem_tracker_)), + env_(DCHECK_NOTNULL(env)), + read_only_(opts.read_only), + root_paths_(opts.root_paths), + root_paths_idx_(0), + rand_(GetRandomSeed32()) { + DCHECK_GT(root_paths_.size(), 0); + if (opts.metric_entity) { + metrics_.reset(new internal::LogBlockManagerMetrics(opts.metric_entity)); + } +} + +LogBlockManager::~LogBlockManager() { + // A LogBlock's destructor depends on its container, so all LogBlocks must be + // destroyed before their containers. + blocks_by_block_id_.clear(); + + // As LogBlock destructors run, some blocks may be deleted, so we might be + // waiting here for a little while. + LOG_SLOW_EXECUTION(INFO, 1000, + Substitute("waiting on $0 log block manager thread pools", + thread_pools_by_root_path_.size())) { + for (const ThreadPoolMap::value_type& e : + thread_pools_by_root_path_) { + ThreadPool* p = e.second; + p->Wait(); + p->Shutdown(); + } + } + + STLDeleteElements(&all_containers_); + STLDeleteValues(&thread_pools_by_root_path_); + STLDeleteValues(&instances_by_root_path_); + mem_tracker_->UnregisterFromParent(); +} + +static const char kHolePunchErrorMsg[] = + "Error during hole punch test. The log block manager requires a " + "filesystem with hole punching support such as ext4 or xfs. On el6, " + "kernel version 2.6.32-358 or newer is required. To run without hole " + "punching (at the cost of some efficiency and scalability), reconfigure " + "Kudu with --block_manager=file. Refer to the Kudu documentation for more " + "details. 
Raw error message follows"; + +Status LogBlockManager::Create() { + CHECK(!read_only_); + + RETURN_NOT_OK(Init()); + + deque delete_on_failure; + ElementDeleter d(&delete_on_failure); + + // The UUIDs and indices will be included in every instance file. + vector all_uuids(root_paths_.size()); + for (string& u : all_uuids) { + u = oid_generator()->Next(); + } + int idx = 0; + + // Ensure the data paths exist and create the instance files. + unordered_set to_sync; + for (const string& root_path : root_paths_) { + bool created; + RETURN_NOT_OK_PREPEND(env_util::CreateDirIfMissing(env_, root_path, &created), + Substitute("Could not create directory $0", root_path)); + if (created) { + delete_on_failure.push_front(new ScopedFileDeleter(env_, root_path)); + to_sync.insert(DirName(root_path)); + } + + if (FLAGS_log_block_manager_test_hole_punching) { + RETURN_NOT_OK_PREPEND(CheckHolePunch(root_path), kHolePunchErrorMsg); + } + + string instance_filename = JoinPathSegments( + root_path, kInstanceMetadataFileName); + PathInstanceMetadataFile metadata(env_, kBlockManagerType, + instance_filename); + RETURN_NOT_OK_PREPEND(metadata.Create(all_uuids[idx], all_uuids), instance_filename); + delete_on_failure.push_front(new ScopedFileDeleter(env_, instance_filename)); + idx++; + } + + // Ensure newly created directories are synchronized to disk. + if (FLAGS_enable_data_block_fsync) { + for (const string& dir : to_sync) { + RETURN_NOT_OK_PREPEND(env_->SyncDir(dir), + Substitute("Unable to synchronize directory $0", dir)); + } + } + + // Success: don't delete any files. 
+ for (ScopedFileDeleter* deleter : delete_on_failure) { + deleter->Cancel(); + } + return Status::OK(); +} + +Status LogBlockManager::Open() { + RETURN_NOT_OK(Init()); + + vector statuses(root_paths_.size()); + unordered_map metadata_files; + ValueDeleter deleter(&metadata_files); + for (const string& root_path : root_paths_) { + InsertOrDie(&metadata_files, root_path, nullptr); + } + + // Submit each open to its own thread pool and wait for them to complete. + int i = 0; + for (const string& root_path : root_paths_) { + ThreadPool* pool = FindOrDie(thread_pools_by_root_path_, root_path); + RETURN_NOT_OK_PREPEND(pool->SubmitClosure( + Bind(&LogBlockManager::OpenRootPath, + Unretained(this), + root_path, + &statuses[i], + &FindOrDie(metadata_files, root_path))), + Substitute("Could not open root path $0", root_path)); + i++; + } + for (const ThreadPoolMap::value_type& e : + thread_pools_by_root_path_) { + e.second->Wait(); + } + + // Ensure that no tasks failed. + for (const Status& s : statuses) { + if (!s.ok()) { + return s; + } + } + + instances_by_root_path_.swap(metadata_files); + return Status::OK(); +} + + +Status LogBlockManager::CreateBlock(const CreateBlockOptions& opts, + gscoped_ptr* block) { + CHECK(!read_only_); + + // Find a free container. If one cannot be found, create a new one. + // + // TODO: should we cap the number of outstanding containers and force + // callers to block if we've reached it? + LogBlockContainer* container = GetAvailableContainer(); + if (!container) { + // Round robin through the root paths to select where the next + // container should live. + int32 old_idx; + int32 new_idx; + do { + old_idx = root_paths_idx_.Load(); + new_idx = (old_idx + 1) % root_paths_.size(); + } while (!root_paths_idx_.CompareAndSet(old_idx, new_idx)); + string root_path = root_paths_[old_idx]; + + // Guaranteed by LogBlockManager::Open(). 
+ PathInstanceMetadataFile* instance = FindOrDie(instances_by_root_path_, root_path); + + gscoped_ptr new_container; + RETURN_NOT_OK(LogBlockContainer::Create(this, + instance->metadata(), + root_path, + &new_container)); + container = new_container.release(); + { + lock_guard l(&lock_); + dirty_dirs_.insert(root_path); + AddNewContainerUnlocked(container); + } + } + + // By preallocating with each CreateBlock(), we're effectively + // maintaining a rolling buffer of preallocated data just ahead of where + // the next write will fall. + if (FLAGS_log_container_preallocate_bytes) { + RETURN_NOT_OK(container->Preallocate(FLAGS_log_container_preallocate_bytes)); + } + + // Generate a free block ID. + BlockId new_block_id; + do { + new_block_id.SetId(rand_.Next64()); + } while (!TryUseBlockId(new_block_id)); + + block->reset(new internal::LogWritableBlock(container, + new_block_id, + container->total_bytes_written())); + VLOG(3) << "Created block " << (*block)->id() << " in container " + << container->ToString(); + return Status::OK(); +} + +Status LogBlockManager::CreateBlock(gscoped_ptr* block) { + return CreateBlock(CreateBlockOptions(), block); +} + +Status LogBlockManager::OpenBlock(const BlockId& block_id, + gscoped_ptr* block) { + scoped_refptr lb; + { + lock_guard l(&lock_); + lb = FindPtrOrNull(blocks_by_block_id_, block_id); + } + if (!lb) { + return Status::NotFound("Can't find block", block_id.ToString()); + } + + block->reset(new internal::LogReadableBlock(lb->container(), + lb.get())); + VLOG(3) << "Opened block " << (*block)->id() + << " from container " << lb->container()->ToString(); + return Status::OK(); +} + +Status LogBlockManager::DeleteBlock(const BlockId& block_id) { + CHECK(!read_only_); + + scoped_refptr lb(RemoveLogBlock(block_id)); + if (!lb) { + return Status::NotFound("Can't find block", block_id.ToString()); + } + VLOG(3) << "Deleting block " << block_id; + lb->Delete(); + + // Record the on-disk deletion. 
+ // + // TODO: what if this fails? Should we restore the in-memory block? + BlockRecordPB record; + block_id.CopyToPB(record.mutable_block_id()); + record.set_op_type(DELETE); + record.set_timestamp_us(GetCurrentTimeMicros()); + RETURN_NOT_OK(lb->container()->AppendMetadata(record)); + + // We don't bother fsyncing the metadata append for deletes in order to avoid + // the disk overhead. Even if we did fsync it, we'd still need to account for + // garbage at startup time (in the event that we crashed just before the + // fsync). TODO: Implement GC of orphaned blocks. See KUDU-829. + + return Status::OK(); +} + +Status LogBlockManager::CloseBlocks(const std::vector& blocks) { + VLOG(3) << "Closing " << blocks.size() << " blocks"; + if (FLAGS_block_coalesce_close) { + // Ask the kernel to begin writing out each block's dirty data. This is + // done up-front to give the kernel opportunities to coalesce contiguous + // dirty pages. + for (WritableBlock* block : blocks) { + RETURN_NOT_OK(block->FlushDataAsync()); + } + } + + // Now close each block, waiting for each to become durable. 
+ for (WritableBlock* block : blocks) { + RETURN_NOT_OK(block->Close()); + } + return Status::OK(); +} + +int64_t LogBlockManager::CountBlocksForTests() const { + lock_guard l(&lock_); + return blocks_by_block_id_.size(); +} + +void LogBlockManager::AddNewContainerUnlocked(LogBlockContainer* container) { + DCHECK(lock_.is_locked()); + all_containers_.push_back(container); + if (metrics()) { + metrics()->containers->Increment(); + if (container->full()) { + metrics()->full_containers->Increment(); + } + } +} + +LogBlockContainer* LogBlockManager::GetAvailableContainer() { + LogBlockContainer* container = nullptr; + lock_guard l(&lock_); + if (!available_containers_.empty()) { + container = available_containers_.front(); + available_containers_.pop_front(); + } + return container; +} + +void LogBlockManager::MakeContainerAvailable(LogBlockContainer* container) { + lock_guard l(&lock_); + MakeContainerAvailableUnlocked(container); +} + +void LogBlockManager::MakeContainerAvailableUnlocked(LogBlockContainer* container) { + DCHECK(lock_.is_locked()); + if (container->full()) { + return; + } + available_containers_.push_back(container); +} + +Status LogBlockManager::SyncContainer(const LogBlockContainer& container) { + Status s; + bool to_sync = false; + { + lock_guard l(&lock_); + to_sync = dirty_dirs_.erase(container.dir()); + } + + if (to_sync && FLAGS_enable_data_block_fsync) { + s = env_->SyncDir(container.dir()); + + // If SyncDir fails, the container directory must be restored to + // dirty_dirs_. Otherwise a future successful LogWritableBlock::Close() + // on this container won't call SyncDir again, and the container might + // be lost on crash. + // + // In the worst case (another block synced this container as we did), + // we'll sync it again needlessly. 
+ if (!s.ok()) { + lock_guard l(&lock_); + dirty_dirs_.insert(container.dir()); + } + } + return s; +} + +bool LogBlockManager::TryUseBlockId(const BlockId& block_id) { + if (block_id.IsNull()) { + return false; + } + + lock_guard l(&lock_); + if (ContainsKey(blocks_by_block_id_, block_id)) { + return false; + } + return InsertIfNotPresent(&open_block_ids_, block_id); +} + +bool LogBlockManager::AddLogBlock(LogBlockContainer* container, + const BlockId& block_id, + int64_t offset, + int64_t length) { + lock_guard l(&lock_); + scoped_refptr lb(new LogBlock(container, block_id, offset, length)); + return AddLogBlockUnlocked(lb); +} + +bool LogBlockManager::AddLogBlockUnlocked(const scoped_refptr& lb) { + DCHECK(lock_.is_locked()); + + if (!InsertIfNotPresent(&blocks_by_block_id_, lb->block_id(), lb)) { + return false; + } + + // There may already be an entry in open_block_ids_ (e.g. we just finished + // writing out a block). + open_block_ids_.erase(lb->block_id()); + if (metrics()) { + metrics()->blocks_under_management->Increment(); + metrics()->bytes_under_management->IncrementBy(lb->length()); + } + return true; +} + +scoped_refptr LogBlockManager::RemoveLogBlock(const BlockId& block_id) { + lock_guard l(&lock_); + return RemoveLogBlockUnlocked(block_id); +} + +scoped_refptr LogBlockManager::RemoveLogBlockUnlocked(const BlockId& block_id) { + DCHECK(lock_.is_locked()); + + scoped_refptr result = + EraseKeyReturnValuePtr(&blocks_by_block_id_, block_id); + if (result && metrics()) { + metrics()->blocks_under_management->Decrement(); + metrics()->bytes_under_management->DecrementBy(result->length()); + } + return result; +} + +void LogBlockManager::OpenRootPath(const string& root_path, + Status* result_status, + PathInstanceMetadataFile** result_metadata) { + if (!env_->FileExists(root_path)) { + *result_status = Status::NotFound(Substitute( + "LogBlockManager at $0 not found", root_path)); + return; + } + + // Open and lock the metadata instance file. 
+ string instance_filename = JoinPathSegments( + root_path, kInstanceMetadataFileName); + gscoped_ptr metadata( + new PathInstanceMetadataFile(env_, kBlockManagerType, + instance_filename)); + Status s = metadata->LoadFromDisk(); + if (!s.ok()) { + *result_status = s.CloneAndPrepend(Substitute( + "Could not open $0", instance_filename)); + return; + } + if (FLAGS_block_manager_lock_dirs) { + s = metadata->Lock(); + if (!s.ok()) { + Status new_status = s.CloneAndPrepend(Substitute( + "Could not lock $0", instance_filename)); + if (read_only_) { + // Not fatal in read-only mode. + LOG(WARNING) << new_status.ToString(); + LOG(WARNING) << "Proceeding without lock"; + } else { + *result_status = new_status; + return; + } + } + } + + // Find all containers and open them. + vector children; + s = env_->GetChildren(root_path, &children); + if (!s.ok()) { + *result_status = s.CloneAndPrepend(Substitute( + "Could not list children of $0", root_path)); + return; + } + for (const string& child : children) { + string id; + if (!TryStripSuffixString(child, LogBlockContainer::kMetadataFileSuffix, &id)) { + continue; + } + gscoped_ptr container; + s = LogBlockContainer::Open(this, metadata->metadata(), + root_path, id, &container); + if (!s.ok()) { + *result_status = s.CloneAndPrepend(Substitute( + "Could not open container $0", id)); + return; + } + + // Populate the in-memory block maps using each container's records. + deque records; + s = container->ReadContainerRecords(&records); + if (!s.ok()) { + *result_status = s.CloneAndPrepend(Substitute( + "Could not read records from container $0", container->ToString())); + return; + } + + // Process the records, building a container-local map. + // + // It's important that we don't try to add these blocks to the global map + // incrementally as we see each record, since it's possible that one container + // has a "CREATE " while another has a "CREATE ; DELETE " pair. 
+ // If we processed those two containers in this order, then upon processing + // the second container, we'd think there was a duplicate block. Building + // the container-local map first ensures that we discount deleted blocks + // before checking for duplicate IDs. + UntrackedBlockMap blocks_in_container; + for (const BlockRecordPB& r : records) { + ProcessBlockRecord(r, container.get(), &blocks_in_container); + } + + // Under the lock, merge this map into the main block map and add + // the container. + { + lock_guard l(&lock_); + for (const UntrackedBlockMap::value_type& e : blocks_in_container) { + if (!AddLogBlockUnlocked(e.second)) { + LOG(FATAL) << "Found duplicate CREATE record for block " << e.first + << " which already is alive from another container when " + << " processing container " << container->ToString(); + } + } + + AddNewContainerUnlocked(container.get()); + MakeContainerAvailableUnlocked(container.release()); + } + } + + *result_status = Status::OK(); + *result_metadata = metadata.release(); +} + +void LogBlockManager::ProcessBlockRecord(const BlockRecordPB& record, + LogBlockContainer* container, + UntrackedBlockMap* block_map) { + BlockId block_id(BlockId::FromPB(record.block_id())); + switch (record.op_type()) { + case CREATE: { + scoped_refptr lb(new LogBlock(container, block_id, + record.offset(), record.length())); + if (!InsertIfNotPresent(block_map, block_id, lb)) { + LOG(FATAL) << "Found duplicate CREATE record for block " + << block_id.ToString() << " in container " + << container->ToString() << ": " + << record.DebugString(); + } + + VLOG(2) << Substitute("Found CREATE block $0 at offset $1 with length $2", + block_id.ToString(), + record.offset(), record.length()); + + // This block must be included in the container's logical size, even if + // it has since been deleted. This helps satisfy one of our invariants: + // once a container byte range has been used, it may never be reused in + // the future. 
+ // + // If we ignored deleted blocks, we would end up reusing the space + // belonging to the last deleted block in the container. + container->UpdateBytesWritten(record.length()); + break; + } + case DELETE: + if (block_map->erase(block_id) != 1) { + LOG(FATAL) << "Found DELETE record for invalid block " + << block_id.ToString() << " in container " + << container->ToString() << ": " + << record.DebugString(); + } + VLOG(2) << Substitute("Found DELETE block $0", block_id.ToString()); + break; + default: + LOG(FATAL) << "Found unknown op type in block record: " + << record.DebugString(); + } +} + +Status LogBlockManager::CheckHolePunch(const string& path) { + // Arbitrary constants. + static uint64_t kFileSize = 4096 * 4; + static uint64_t kHoleOffset = 4096; + static uint64_t kHoleSize = 8192; + static uint64_t kPunchedFileSize = kFileSize - kHoleSize; + + // Open the test file. + string filename = JoinPathSegments(path, "hole_punch_test_file"); + gscoped_ptr file; + RWFileOptions opts; + RETURN_NOT_OK(env_->NewRWFile(opts, filename, &file)); + + // The file has been created; delete it on exit no matter what happens. + ScopedFileDeleter file_deleter(env_, filename); + + // Preallocate it, making sure the file's size is what we'd expect. + uint64_t sz; + RETURN_NOT_OK(file->PreAllocate(0, kFileSize)); + RETURN_NOT_OK(env_->GetFileSizeOnDisk(filename, &sz)); + if (sz != kFileSize) { + return Status::IOError(Substitute( + "Unexpected pre-punch file size for $0: expected $1 but got $2", + filename, kFileSize, sz)); + } + + // Punch the hole, testing the file's size again. + RETURN_NOT_OK(file->PunchHole(kHoleOffset, kHoleSize)); + RETURN_NOT_OK(env_->GetFileSizeOnDisk(filename, &sz)); + if (sz != kPunchedFileSize) { + return Status::IOError(Substitute( + "Unexpected post-punch file size for $0: expected $1 but got $2", + filename, kPunchedFileSize, sz)); + } + + return Status::OK(); +} + +Status LogBlockManager::Init() { + // Initialize thread pools. 
+ ThreadPoolMap pools; + ValueDeleter d(&pools); + int i = 0; + for (const string& root : root_paths_) { + gscoped_ptr p; + RETURN_NOT_OK_PREPEND(ThreadPoolBuilder(Substitute("lbm root $0", i++)) + .set_max_threads(1) + .Build(&p), + "Could not build thread pool"); + InsertOrDie(&pools, root, p.release()); + } + thread_pools_by_root_path_.swap(pools); + + return Status::OK(); +} + +} // namespace fs +} // namespace kudu diff --git a/src/kudu/fs/log_block_manager.h b/src/kudu/fs/log_block_manager.h new file mode 100644 index 000000000000..8e58843c6e1e --- /dev/null +++ b/src/kudu/fs/log_block_manager.h @@ -0,0 +1,343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_FS_LOG_BLOCK_MANAGER_H +#define KUDU_FS_LOG_BLOCK_MANAGER_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/fs/block_id.h" +#include "kudu/fs/block_manager.h" +#include "kudu/fs/fs.pb.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/atomic.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/oid_generator.h" +#include "kudu/util/random.h" + +namespace kudu { +class Env; +class MetricEntity; +class ThreadPool; + +namespace fs { +class PathInstanceMetadataFile; + +namespace internal { +class LogBlock; +class LogBlockContainer; + +struct LogBlockManagerMetrics; +} // namespace internal + +// A log-backed (i.e. sequentially allocated file) block storage +// implementation. +// +// This is a block storage implementation that attempts to reduce the +// number of files used by clustering blocks into large files known +// henceforth as containers. A container begins empty and is written to +// sequentially, block by block. When a container becomes full, it is set +// aside and a new container is created. +// +// Implementation details +// ---------------------- +// +// A container is comprised of two files, one for metadata and one for +// data. Both are written to sequentially. During a write, the block's data +// is written as-is to the data file. After the block has been +// synchronized, a small record is written to the metadata file containing +// the block's ID and its location within the data file. +// +// Block deletions are handled similarly. When a block is deleted, a record +// is written describing the deletion, orphaning the old block data. The +// orphaned data can be reclaimed instantaneously via hole punching, or +// later via garbage collection. The latter is used when hole punching is +// not supported on the filesystem, or on next boot if there's a crash +// after deletion but before hole punching. 
The metadata file itself is not +// compacted, as it is expected to remain quite small even after a great +// many create/delete cycles. +// +// Data and metadata operations are carefully ordered to ensure the +// correctness of the persistent representation at all times. During the +// writable block lifecycle (i.e. when a block is being created), data +// operations come before metadata operations. In the event that a metadata +// operation fails, the result is an orphaned block that is detected and +// pruned in the next garbage collection cycle. Conversely, metadata +// operations precede the data operations when deleting a block. In the +// worst case, a failure in the latter yields more garbage data that can be +// deleted in a GC. +// +// Care is taken to keep the in-memory representation of the block manager +// in synch with its persistent representation. To wit, a block is only +// made available in memory if _all_ on-disk operations (including any +// necessary synchronization calls) are successful. +// +// When a new block is created, a container is selected using a round-robin +// policy (i.e. the least recently used container). If no containers are +// available, a new one is created. Only when the block is fully written is +// the container returned to the pool of available containers. +// +// All log block manager metadata requests are served from memory. When an +// existing block manager is opened, all on-disk container metadata is +// parsed to build a single in-memory map describing the existence and +// locations of various blocks. Each entry in the map consumes ~64 bytes, +// putting the memory overhead at ~610 MB for 10 million blocks. +// +// New blocks are placed on a filesystem block boundary, and the size of +// hole punch requests is rounded up to the nearest filesystem block size. +// Taken together, this guarantees that hole punching can actually reclaim +// disk space (instead of just zeroing the block's bytes on disk). 
+// +// Design trade-offs +// ----------------- +// In general, log-backed block storage is optimized for sustained reads +// and writes. The idea is that all blocks in a given container contain +// related data and are generally read at once, reducing seeks for +// sustained scans. This comes at a cost: the containers need to be garbage +// collected every now and then, though newer systems can take advantage of +// filesystem hole punching (as described above) to reclaim space. +// +// The on-disk container metadata design favors simplicity and contiguous +// access over space consumption and scalability to a very large number of +// blocks. To be more specific, the separation of metadata from data allows +// for high performance sustained reads at block manager open time at a +// manageability cost: a container is not a single file, and needs multiple +// open fds to be of use. Moreover, the log-structured nature of the +// metadata is simple and performant at open time. +// +// Likewise, the default container placement policy favors simplicity over +// performance. In the future, locality hints will ensure that blocks +// pertaining to similar data are colocated, improving scan performance. +// +// The choice to serve all metadata requests from memory favors simplicity +// over memory consumption. With a very large number of blocks, the +// in-memory map may balloon in size and some sort of "spilling" behavior +// may be beneficial. + +// TODO +// ---- +// - Implement garbage collection fallback for hole punching. +// - Implement locality hints so that specific containers can be used for +// groups of blocks (i.e. an entire column). +// - Unlock containers on FlushDataAsync() so that the workflow in +// BlockManagerTest::CloseManyBlocksTest can use just one container. +// - Implement failure recovery (i.e. metadata truncation and other +// similarly recoverable errors). +// - Evaluate and implement a solution for data integrity (e.g. per-block +// checksum). 
+ +// The log-backed block manager. +class LogBlockManager : public BlockManager { + public: + LogBlockManager(Env* env, const BlockManagerOptions& opts); + + virtual ~LogBlockManager(); + + virtual Status Create() OVERRIDE; + + virtual Status Open() OVERRIDE; + + virtual Status CreateBlock(const CreateBlockOptions& opts, + gscoped_ptr* block) OVERRIDE; + + virtual Status CreateBlock(gscoped_ptr* block) OVERRIDE; + + virtual Status OpenBlock(const BlockId& block_id, + gscoped_ptr* block) OVERRIDE; + + virtual Status DeleteBlock(const BlockId& block_id) OVERRIDE; + + virtual Status CloseBlocks(const std::vector& blocks) OVERRIDE; + + // Return the number of blocks stored in the block manager. + int64_t CountBlocksForTests() const; + + private: + FRIEND_TEST(LogBlockManagerTest, TestReuseBlockIds); + friend class internal::LogBlockContainer; + + // Simpler typedef for a block map which isn't tracked in the memory tracker. + // Used during startup. + typedef std::unordered_map< + const BlockId, + scoped_refptr, + BlockIdHash, + BlockIdEqual> UntrackedBlockMap; + + typedef MemTrackerAllocator< + std::pair > > BlockAllocator; + typedef std::unordered_map< + const BlockId, + scoped_refptr, + BlockIdHash, + BlockIdEqual, + BlockAllocator> BlockMap; + + // Adds an as of yet unseen container to this block manager. + void AddNewContainerUnlocked(internal::LogBlockContainer* container); + + // Returns the next container available for writing using a round-robin + // selection policy, or null if no suitable container was found. + // + // After returning, the container is considered to be in use. When + // writing is finished, call MakeContainerAvailable() to make it + // available to other writers. + internal::LogBlockContainer* GetAvailableContainer(); + + // Indicate that this container is no longer in use and can be handed out + // to other writers. 
+ void MakeContainerAvailable(internal::LogBlockContainer* container); + void MakeContainerAvailableUnlocked(internal::LogBlockContainer* container); + + // Synchronizes a container's dirty metadata to disk, taking care not to + // sync more than is necessary (using 'dirty_dirs_'). + Status SyncContainer(const internal::LogBlockContainer& container); + + // Attempts to claim 'block_id' for use in a new WritableBlock. + // + // Returns true if the given block ID was not in use (and marks it as in + // use), false otherwise. + bool TryUseBlockId(const BlockId& block_id); + + // Adds a LogBlock to in-memory data structures. + // + // Returns success if the LogBlock was successfully added, failure if it + // was already present. + bool AddLogBlock(internal::LogBlockContainer* container, + const BlockId& block_id, + int64_t offset, + int64_t length); + + // Unlocked variant of AddLogBlock() for an already-constructed LogBlock object. + // Must hold 'lock_'. + bool AddLogBlockUnlocked(const scoped_refptr& lb); + + // Removes a LogBlock from in-memory data structures. + // + // Returns the LogBlock if it was successfully removed, NULL if it was + // already gone. + scoped_refptr RemoveLogBlock(const BlockId& block_id); + + // Unlocked variant of RemoveLogBlock(); must hold 'lock_'. + scoped_refptr RemoveLogBlockUnlocked(const BlockId& block_id); + + // Parse a block record, adding or removing it in 'block_map', and + // accounting for it in the metadata for 'container'. + void ProcessBlockRecord(const BlockRecordPB& record, + internal::LogBlockContainer* container, + UntrackedBlockMap* block_map); + + // Open a particular root path belonging to the block manager. + // + // Success or failure is set in 'result_status'. On success, also sets + // 'result_metadata' with an allocated metadata file. + void OpenRootPath(const std::string& root_path, + Status* result_status, + PathInstanceMetadataFile** result_metadata); + + // Test for hole punching support at 'path'. 
+ Status CheckHolePunch(const std::string& path); + + // Perform basic initialization. + Status Init(); + + ObjectIdGenerator* oid_generator() { return &oid_generator_; } + + Env* env() const { return env_; } + + const internal::LogBlockManagerMetrics* metrics() const { return metrics_.get(); } + + // Tracks memory consumption of any allocations numerous enough to be + // interesting (e.g. LogBlocks). + std::shared_ptr mem_tracker_; + + // Protects the block map, container structures, and 'dirty_dirs'. + mutable simple_spinlock lock_; + + // Maps block IDs to blocks that are now readable, either because they + // already existed on disk when the block manager was opened, or because + // they're WritableBlocks that were closed. + BlockMap blocks_by_block_id_; + + // Contains block IDs for WritableBlocks that are still open for writing. + // When a WritableBlock is closed, its ID is moved to blocks_by_block_id. + // + // Together with blocks_by_block_id's keys, used to prevent collisions + // when creating new anonymous blocks. + std::unordered_set open_block_ids_; + + // Holds (and owns) all containers loaded from disk. + std::vector all_containers_; + + // Holds only those containers that are currently available for writing, + // excluding containers that are either in use or full. + // + // Does not own the containers. + std::deque available_containers_; + + // Tracks dirty container directories. + // + // Synced and cleared by SyncMetadata(). + std::unordered_set dirty_dirs_; + + // For manipulating files. + Env* env_; + + // If true, only read operations are allowed. + const bool read_only_; + + // Filesystem paths where all block directories are found. + const std::vector root_paths_; + + // Index of 'root_paths_' for the next created block. + AtomicInt root_paths_idx_; + + // Maps root paths to instance metadata files found in each root path. 
+ typedef std::unordered_map InstanceMap; + InstanceMap instances_by_root_path_; + + // Maps root paths to thread pools. Each pool runs at most one thread, and + // so serves as a "work queue" for that particular disk. + typedef std::unordered_map ThreadPoolMap; + ThreadPoolMap thread_pools_by_root_path_; + + // For generating container names. + ObjectIdGenerator oid_generator_; + + // For generating block IDs. + ThreadSafeRandom rand_; + + // Metrics for the block manager. + // + // May be null if instantiated without metrics. + gscoped_ptr metrics_; + + DISALLOW_COPY_AND_ASSIGN(LogBlockManager); +}; + +} // namespace fs +} // namespace kudu + +#endif diff --git a/src/kudu/gutil/CMakeLists.txt b/src/kudu/gutil/CMakeLists.txt new file mode 100644 index 000000000000..e0e2eb1c1b68 --- /dev/null +++ b/src/kudu/gutil/CMakeLists.txt @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set(GUTIL_SRCS + atomicops-internals-x86.cc + bits.cc + callback_internal.cc + cpu.cc + dynamic_annotations.c + hash/city.cc + hash/hash.cc + hash/jenkins.cc + int128.cc + mathlimits.cc + once.cc + ref_counted.cc + ref_counted_memory.cc + spinlock.cc + spinlock_internal.cc + stringprintf.cc + strings/ascii_ctype.cc + strings/charset.cc + strings/escaping.cc + strings/human_readable.cc + strings/join.cc + strings/memutil.cc + strings/numbers.cc + strings/serialize.cc + strings/split.cc + strings/strcat.cc + strings/stringpiece.cc + strings/strip.cc + strings/substitute.cc + strings/util.cc + strtoint.cc + sysinfo.cc + threading/thread_collision_warner.cc + utf/rune.c + walltime.cc) + +set(GUTIL_LIBS + glog + protobuf) + +if (NOT APPLE) + set(GUTIL_LIBS + ${GUTIL_LIBS} + rt) # clock_gettime() requires -lrt +endif() + +ADD_EXPORTABLE_LIBRARY(gutil + SRCS ${GUTIL_SRCS} + DEPS ${GUTIL_LIBS} + # Disable warnings which trigger a lot in the Google code: + COMPILE_FLAGS "-funsigned-char -Wno-deprecated -Wno-char-subscripts") + +add_kudu_test(strings/string_util-test) diff --git a/src/kudu/gutil/algorithm.h b/src/kudu/gutil/algorithm.h new file mode 100644 index 000000000000..ff45e98c3429 --- /dev/null +++ b/src/kudu/gutil/algorithm.h @@ -0,0 +1,441 @@ +// Copyright 2006 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// +// +// This file contains some Google extensions to the standard +// C++ header. Many of these algorithms were in the +// original STL before it was proposed for standardization. + +#ifndef UTIL_GTL_ALGORITHM_H_ +#define UTIL_GTL_ALGORITHM_H_ + +#include +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include +using std::binary_function; +using std::less; +#include +using std::back_insert_iterator; +using std::iterator_traits; +#include +using std::make_pair; +using std::pair; + +namespace util { +namespace gtl { + +// Returns true if [first, last) contains an element equal to value. +// Complexity: linear. +template +bool contains(InputIterator first, InputIterator last, + const EqualityComparable& value) { + return std::find(first, last, value) != last; +} + +// There is no contains_if(). Use any() instead. + +template +bool contains_some_of(InputIterator first1, InputIterator last1, + ForwardIterator first2, ForwardIterator last2) { + return std::find_first_of(first1, last1, first2, last2) != last1; +} + +template +bool contains_some_of(InputIterator first1, InputIterator last1, + ForwardIterator first2, ForwardIterator last2, + Predicate pred) { + return std::find_first_of(first1, last1, first2, last2, pred) != last1; +} + +template +typename std::iterator_traits::pointer +find_or_null(InputIterator first, InputIterator last, + const EqualityComparable& value) { + const InputIterator it = std::find(first, last, value); + return it != last ? 
&*it : NULL; +} + +template +typename std::iterator_traits::pointer +find_if_or_null(InputIterator first, InputIterator last, Predicate pred) { + const InputIterator it = std::find_if(first, last, pred); + return it != last ? &*it : NULL; +} + +// Copies all elements that satisfy the predicate pred from [first, +// last) to out. This is the complement of remove_copy_if. Complexity: +// exactly last-first applications of pred. +template +OutputIterator copy_if(InputIterator first, InputIterator last, + OutputIterator out, + Predicate pred) { + for (; first != last; ++first) { + if (pred(*first)) + *out++ = *first; + } + return out; +} + +// Copies n elements to out. Equivalent to copy(first, first + n, out) for +// random access iterators and a longer code block for lesser iterators. +template +OutputIterator copy_n(InputIterator first, Size n, OutputIterator out) { + while (n > 0) { + *out = *first; + ++out; + ++first; + --n; + } + return out; +} + +// Returns true if pred is true for every element in [first, last). Complexity: +// at most last-first applications of pred. +template +bool all(InputIterator first, InputIterator last, Predicate pred) { + for (; first != last; ++first) { + if (!pred(*first)) + return false; + } + return true; +} + +// Returns true if pred is false for every element in [first, +// last). Complexity: at most last-first applications of pred. +template +bool none(InputIterator first, InputIterator last, Predicate pred) { + return find_if(first, last, pred) == last; +} + +// Returns true if pred is true for at least one element in [first, last). +// Complexity: at most last-first applications of pred. +template +bool any(InputIterator first, InputIterator last, Predicate pred) { + return !none(first, last, pred); +} + +// Returns a pair of iterators p such that p.first points to the +// minimum element in the range and p.second points to the maximum, +// ordered by comp. Complexity: at most floor((3/2) (N-1)) +// comparisons. 
Postcondition: If the return value is , min is +// the first iterator i in [first, last) such that comp(*j, *i) is +// false for every other iterator j, and max is the last iterator i +// such that comp(*i, *j) is false for every other iterator j. Or less +// formally, min is the first minimum and max is the last maximum. +template +std::pair minmax_element(ForwardIter first, + const ForwardIter last, + Compare comp) { + // Initialization: for N<2, set min=max=first. For N >= 2, set min + // to be the smaller of the first two and max to be the larger. (Taking + // care that min is the first, and max the second, if the two compare + // equivalent.) + ForwardIter min(first); + ForwardIter max(first); + + if (first != last) { + ++first; + } + + if (first != last) { + max = first; + if (comp(*max, *min)) + swap(min, max); + ++first; + } + + while (first != last) { + ForwardIter next(first); + ++next; + + if (next != last) { + // We have two elements to look at that we haven't seen already, + // *first and *next. Compare the smaller of them with the + // current min, and the larger against the current max. The + // subtle point: write all of the comparisons so that, if the + // two things being compared are equivalent, we take the first + // one as the smaller and the last as the larger. + if (comp(*next, *first)) { + if (comp(*next, *min)) + min = next; + if (!comp(*first, *max)) + max = first; + } else { + if (comp(*first, *min)) + min = first; + if (!comp(*next, *max)) + max = next; + } + } else { + // There is only one element left that we haven't seen already, *first. + // Adjust min or max, as appropriate, and exit the loop. 
+ if (comp(*first, *min)) { + min = first; + } else if (!comp(*first, *max)) { + max = first; + } + break; + } + + first = next; + ++first; + } + + return make_pair(min, max); +} + +// Returns a pair of iterators p such that p.first points to the first +// minimum element in the range and p.second points to the last +// maximum, ordered by operator<. +template +inline std::pair minmax_element(ForwardIter first, + ForwardIter last) { + typedef typename std::iterator_traits::value_type value_type; + return util::gtl::minmax_element(first, last, std::less()); +} + +// Returns true if [first, last) is partitioned by pred, i.e. if all +// elements that satisfy pred appear before those that do +// not. Complexity: linear. +template +inline bool is_partitioned(ForwardIterator first, ForwardIterator last, + Predicate pred) { + for (; first != last; ++first) { + if (!pred(*first)) { + ++first; + return util::gtl::none(first, last, pred); + } + } + return true; +} + +// Precondition: is_partitioned(first, last, pred). Returns: the +// partition point p, i.e. an iterator mid satisfying the conditions +// all(first, mid, pred) and none(mid, last, pred). Complexity: +// O(log(last-first)) applications of pred. +template +ForwardIterator partition_point(ForwardIterator first, ForwardIterator last, + Predicate pred) { + typedef typename std::iterator_traits::difference_type diff; + diff n = distance(first, last); + + // Loop invariant: n == distance(first, last) + while (first != last) { + diff half = n/2; + ForwardIterator mid = first; + advance(mid, half); + if (pred(*mid)) { + first = mid; + ++first; + n -= half+1; + } else { + n = half; + last = mid; + } + } + + return first; +} + +// Copies all elements that satisfy pred to out_true and all elements +// that don't satisfy it to out_false. Returns: a pair p such that +// p.first is the end of the range beginning at out_t and p.second is +// the end of the range beginning at out_f. 
Complexity: exactly +// last-first applications of pred. +template +std::pair +partition_copy(InputIterator first, InputIterator last, + OutputIterator1 out_true, OutputIterator2 out_false, + Predicate pred) { + for (; first != last; ++first) { + if (pred(*first)) + *out_true++ = *first; + else + *out_false++ = *first; + } + return make_pair(out_true, out_false); +} + +// Reorders elements in [first, last), so that for each consecutive group +// of duplicate elements (according to eq predicate) the first one is left and +// others are moved at the end of the range. Returns: iterator middle such that +// [first, middle) contains no two consecutive elements that are duplicates and +// [middle, last) contains elements removed from all groups. It's stable for +// range [first, middle) meaning the order of elements are the same as order of +// their corresponding groups in input, but the order in range [middle, last) +// is not preserved. Function is similar to std::unique, but ensures that +// removed elements are properly copied and accessible at the range's end. +// Complexity: exactly last-first-1 applications of eq; at most middle-first-1 +// swap operations. +template +ForwardIterator unique_partition(ForwardIterator first, ForwardIterator last, + Equals eq) { + first = adjacent_find(first, last, eq); + if (first == last) + return last; + + // Points to right-most element within range of unique elements being built. + ForwardIterator result = first; + + // 'first' iterator goes through the sequence starting from element after + // first equal elements pair (found by adjacent_find above). + ++first; + while (++first != last) { + // If we encounter an element that isn't equal to right-most element in + // result, then extend the range and swap this element into it. + // Otherwise just continue incrementing 'first'. + if (!eq(*result, *first)) { + swap(*++result, *first); + } + } + // Return past-the-end upper-bound of the resulting range. 
+ return ++result; +} + +// Reorders elements in [first, last) range moving duplicates for each +// consecutive group of elements to the end. Equality is checked using ==. +template +inline ForwardIterator unique_partition(ForwardIterator first, + ForwardIterator last) { + typedef typename std::iterator_traits::value_type T; + return unique_partition(first, last, std::equal_to()); +} + +// Samples k elements from the next n. +// Elements have the same order in the output stream as they did on input. +// +// This is Algorithm S from section 3.4.2 of Knuth, TAOCP, 2nd edition. +// My k corresponds to Knuth n-m. +// My n corrsponds to Knuth N-t. +// +// RngFunctor is any functor that can be called as: +// size_t RngFunctor(size_t x) +// The return value is an integral value in the half-open range [0, x) +// such that all values are equally likely. Behavior is unspecified if x==0. +// (This function never calls RngFunctor with x==0). + +template +inline void sample_k_of_n(InputIterator in, size_t k, size_t n, + RngFunctor& rng, OutputIterator out) { + if (k > n) { + k = n; + } + while (k > 0) { + if (rng(n) < k) { + *out++ = *in; + k--; + } + ++in; + --n; + } +} + +// Finds the longest prefix of a range that is a binary max heap with +// respect to a given StrictWeakOrdering. If first == last, returns last. +// Otherwise, return an iterator it such that [first,it) is a heap but +// no longer prefix is -- in other words, first + i for the lowest i +// such that comp(first[(i-1)/2], first[i]) returns true. 
+template +RandomAccessIterator gtl_is_binary_heap_until(RandomAccessIterator first, + RandomAccessIterator last, + StrictWeakOrdering comp) { + if (last - first < 2) return last; + RandomAccessIterator parent = first; + bool is_right_child = false; + for (RandomAccessIterator child = first + 1; child != last; ++child) { + if (comp(*parent, *child)) return child; + if (is_right_child) ++parent; + is_right_child = !is_right_child; + } + return last; +} + +// Special case of gtl_is_binary_heap_until where the order is std::less, +// i.e., where we're working with a simple max heap. +template +RandomAccessIterator gtl_is_binary_heap_until(RandomAccessIterator first, + RandomAccessIterator last) { + typedef typename std::iterator_traits::value_type T; + return gtl_is_binary_heap_until(first, last, std::less()); +} + +// Checks whether a range of values is a binary heap, i.e., checks that +// no node is less (as defined by a StrictWeakOrdering) than a child. +template +bool gtl_is_binary_heap(RandomAccessIterator begin, + RandomAccessIterator end, + StrictWeakOrdering comp) { + return gtl_is_binary_heap_until(begin, end, comp) == end; +} + +// Special case of gtl_is_binary_heap where the order is std::less (i.e., +// where we're working on a simple max heap). +template +bool gtl_is_binary_heap(RandomAccessIterator begin, + RandomAccessIterator end) { + return gtl_is_binary_heap_until(begin, end) == end; +} + +// Unqualified calls to is_heap are ambiguous with some build types, +// namespace that can clash with names that C++11 added to ::std. +// By calling util::gtl::is_heap, clients can avoid those errors, +// and by using the underlying is_heap call we ensure consistency +// with the standard library's heap implementation just in case a +// standard library ever uses anything other than a binary heap. 
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus > 199711L \ + || defined(LIBCXX) || _MSC_VER >= 1600 /* Visual Studio 2010 */ +using std::is_heap; +#elif defined __GNUC__ +/* using __gnu_cxx::is_heap; */ +#elif defined _MSC_VER +// For old versions of MSVC++, we know by inspection that make_heap() +// traffics in binary max heaps, so gtl_is_binary_heap is an acceptable +// implementation for is_heap. +template +bool is_heap(RandomAccessIterator begin, RandomAccessIterator end) { + return gtl_is_binary_heap(begin, end); +} + +template +bool is_heap(RandomAccessIterator begin, + RandomAccessIterator end, + StrictWeakOrdering comp) { + return gtl_is_binary_heap(begin, end, comp); +} +#else +// We need an implementation of is_heap that matches the library's +// implementation of make_heap() and friends. gtl_is_binary_heap will +// *probably* work, but let's be safe and not make that assumption. +#error No implementation of is_heap defined for this toolchain. +#endif + +} // namespace gtl +} // namespace util + +#endif // UTIL_GTL_ALGORITHM_H_ diff --git a/src/kudu/gutil/arm_instruction_set_select.h b/src/kudu/gutil/arm_instruction_set_select.h new file mode 100644 index 000000000000..87bc183358b8 --- /dev/null +++ b/src/kudu/gutil/arm_instruction_set_select.h @@ -0,0 +1,52 @@ +// Copyright 2011 Google Inc. +// All Rights Reserved. 
+// +// +// Generalizes the plethora of ARM flavors available to an easier to manage set +// Defs reference is at https://wiki.edubuntu.org/ARM/Thumb2PortingHowto + +#ifndef ARM_INSTRUCTION_SET_SELECT_H_ +#define ARM_INSTRUCTION_SET_SELECT_H_ + +#if defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7A__) +# define ARMV7 1 +#endif + +#if defined(ARMV7) || \ + defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6ZK__) +# define ARMV6 1 +#endif + +#if defined(ARMV6) || \ + defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__) || \ + defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define ARMV5 1 +#endif + +#if defined(ARMV5) || \ + defined(__ARM_ARCH_4__) || \ + defined(__ARM_ARCH_4T__) +# define ARMV4 1 +#endif + +#if defined(ARMV4) || \ + defined(__ARM_ARCH_3__) || \ + defined(__ARM_ARCH_3M__) +# define ARMV3 1 +#endif + +#if defined(ARMV3) || \ + defined(__ARM_ARCH_2__) +# define ARMV2 1 +#endif + +#endif // ARM_INSTRUCTION_SET_SELECT_H_ diff --git a/src/kudu/gutil/atomic_refcount.h b/src/kudu/gutil/atomic_refcount.h new file mode 100644 index 000000000000..9c8092189bdd --- /dev/null +++ b/src/kudu/gutil/atomic_refcount.h @@ -0,0 +1,195 @@ +#ifndef BASE_ATOMIC_REFCOUNT_H_ +#define BASE_ATOMIC_REFCOUNT_H_ +// Copyright 2008 Google Inc. +// All rights reserved. + +// Atomic increment and decrement for reference counting. +// For atomic operations on statistics counters and sequence numbers, +// see atomic_stats_counter.h and atomic_sequence_num.h respectively. + +// Some clients use atomic operations for reference counting. +// you use one of them: +// util/refcount/reference_counted.h +// util/gtl/refcounted_ptr.h +// util/gtl/shared_ptr.h +// Alternatively, use a Mutex to maintain your reference count. 
+// If you really must build your own reference counts with atomic operations, +// use the following routines in the way suggested by this example: +// AtomicWord ref_count_; // remember to initialize this to 0 +// ... +// void Ref() { +// base::RefCountInc(&this->ref_count_); +// } +// void Unref() { +// if (!base::RefCountDec(&this->ref_count_)) { +// delete this; +// } +// } +// Using these routines (rather than the ones in atomicops.h) will provide the +// correct semantics; in particular, the memory ordering needed to make +// reference counting work will be guaranteed. +// You need not declare the reference count word "volatile". After +// initialization you should use the word only via the routines below; the +// "volatile" in the signatures below is for backwards compatibility. +// +// The implementation includes annotations to avoid some false alarms +// when using Helgrind (the data race detector). +// +// If you need to do something very different from this, use a Mutex. + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/dynamic_annotations.h" + +namespace base { + +// These calls are available for both Atomic32, and AtomicWord types, +// and also for base::subtle::Atomic64 if available on the platform. + +// Normally, clients are expected to use RefCountInc/RefCountDec. +// In rare cases, it may be necessary to adjust the reference count by +// more than 1, in which case they may use RefCountIncN/RefCountDecN. + +// Increment a reference count by "increment", which must exceed 0. +inline void RefCountIncN(volatile Atomic32 *ptr, Atomic32 increment) { + DCHECK_GT(increment, 0); + base::subtle::NoBarrier_AtomicIncrement(ptr, increment); +} + +// Decrement a reference count by "decrement", which must exceed 0, +// and return whether the result is non-zero. 
+// Insert barriers to ensure that state written before the reference count +// became zero will be visible to a thread that has just made the count zero. +inline bool RefCountDecN(volatile Atomic32 *ptr, Atomic32 decrement) { + DCHECK_GT(decrement, 0); + ANNOTATE_HAPPENS_BEFORE(ptr); + bool res = base::subtle::Barrier_AtomicIncrement(ptr, -decrement) != 0; + if (!res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} + +// Increment a reference count by 1. +inline void RefCountInc(volatile Atomic32 *ptr) { + base::RefCountIncN(ptr, 1); +} + +// Decrement a reference count by 1 and return whether the result is non-zero. +// Insert barriers to ensure that state written before the reference count +// became zero will be visible to a thread that has just made the count zero. +inline bool RefCountDec(volatile Atomic32 *ptr) { + return base::RefCountDecN(ptr, 1); +} + +// Return whether the reference count is one. +// If the reference count is used in the conventional way, a +// refrerence count of 1 implies that the current thread owns the +// reference and no other thread shares it. +// This call performs the test for a referenece count of one, and +// performs the memory barrier needed for the owning thread +// to act on the object, knowing that it has exclusive access to the +// object. +inline bool RefCountIsOne(const volatile Atomic32 *ptr) { + bool res = base::subtle::Acquire_Load(ptr) == 1; + if (res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} + +// Return whether the reference count is zero. With conventional object +// referencing counting, the object will be destroyed, so the reference count +// should never be zero. Hence this is generally used for a debug check. +inline bool RefCountIsZero(const volatile Atomic32 *ptr) { + bool res = (subtle::Acquire_Load(ptr) == 0); + if (res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} + +#if BASE_HAS_ATOMIC64 +// Implementations for Atomic64, if available. 
+inline void RefCountIncN(volatile base::subtle::Atomic64 *ptr, + base::subtle::Atomic64 increment) { + DCHECK_GT(increment, 0); + base::subtle::NoBarrier_AtomicIncrement(ptr, increment); +} +inline bool RefCountDecN(volatile base::subtle::Atomic64 *ptr, + base::subtle::Atomic64 decrement) { + DCHECK_GT(decrement, 0); + ANNOTATE_HAPPENS_BEFORE(ptr); + bool res = base::subtle::Barrier_AtomicIncrement(ptr, -decrement) != 0; + if (!res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} +inline void RefCountInc(volatile base::subtle::Atomic64 *ptr) { + base::RefCountIncN(ptr, 1); +} +inline bool RefCountDec(volatile base::subtle::Atomic64 *ptr) { + return base::RefCountDecN(ptr, 1); +} +inline bool RefCountIsOne(const volatile base::subtle::Atomic64 *ptr) { + bool res = base::subtle::Acquire_Load(ptr) == 1; + if (res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} +inline bool RefCountIsZero(const volatile base::subtle::Atomic64 *ptr) { + bool res = (base::subtle::Acquire_Load(ptr) == 0); + if (res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} +#endif + +#ifdef AtomicWordCastType +// Implementations for AtomicWord, if it's a different type from the above. 
+inline void RefCountIncN(volatile AtomicWord *ptr, AtomicWord increment) { + base::RefCountIncN( + reinterpret_cast(ptr), increment); +} +inline bool RefCountDecN(volatile AtomicWord *ptr, AtomicWord decrement) { + ANNOTATE_HAPPENS_BEFORE(ptr); + bool res = base::RefCountDecN( + reinterpret_cast(ptr), decrement); + if (!res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} +inline void RefCountInc(volatile AtomicWord *ptr) { + base::RefCountIncN(ptr, 1); +} +inline bool RefCountDec(volatile AtomicWord *ptr) { + return base::RefCountDecN(ptr, 1); +} +inline bool RefCountIsOne(const volatile AtomicWord *ptr) { + bool res = base::subtle::Acquire_Load( + reinterpret_cast(ptr)) == 1; + if (res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} +inline bool RefCountIsZero(const volatile AtomicWord *ptr) { + bool res = base::subtle::Acquire_Load( + reinterpret_cast(ptr)) == 0; + if (res) { + ANNOTATE_HAPPENS_AFTER(ptr); + } + return res; +} +#endif + +} // namespace base + +#endif // BASE_ATOMIC_REFCOUNT_H_ diff --git a/src/kudu/gutil/atomicops-internals-macosx.h b/src/kudu/gutil/atomicops-internals-macosx.h new file mode 100644 index 000000000000..15efaef34c0a --- /dev/null +++ b/src/kudu/gutil/atomicops-internals-macosx.h @@ -0,0 +1,406 @@ +// Copyright 2006 Google Inc. +// All Rights Reserved. +// +// +// Implementation of atomic operations for Mac OS X. This file should not +// be included directly. Clients should instead include +// "base/atomicops.h". + +#ifndef BASE_AUXILIARY_ATOMICOPS_INTERNALS_MACOSX_H_ +#define BASE_AUXILIARY_ATOMICOPS_INTERNALS_MACOSX_H_ + +typedef int32_t Atomic32; +typedef int64_t Atomic64; + +// MacOS uses long for intptr_t, AtomicWord and Atomic32 are always different +// on the Mac, even when they are the same size. Similarly, on __ppc64__, +// AtomicWord and Atomic64 are always different. Thus, we need explicit +// casting. 
+#ifdef __LP64__ +#define AtomicWordCastType base::subtle::Atomic64 +#else +#define AtomicWordCastType Atomic32 +#endif + +#if defined(__LP64__) || defined(__i386__) +#define BASE_HAS_ATOMIC64 1 // Use only in tests and base/atomic* +#endif + +#include + +#if !defined(__LP64__) && defined(__ppc__) + +// The Mac 64-bit OSAtomic implementations are not available for 32-bit PowerPC, +// while the underlying assembly instructions are available only some +// implementations of PowerPC. + +// The following inline functions will fail with the error message at compile +// time ONLY IF they are called. So it is safe to use this header if user +// code only calls AtomicWord and Atomic32 operations. +// +// NOTE(user): Implementation notes to implement the atomic ops below may +// be found in "PowerPC Virtual Environment Architecture, Book II, +// Version 2.02", January 28, 2005, Appendix B, page 46. Unfortunately, +// extra care must be taken to ensure data are properly 8-byte aligned, and +// that data are returned correctly according to Mac OS X ABI specs. 
+ +inline int64_t OSAtomicCompareAndSwap64( + int64_t oldValue, int64_t newValue, int64_t *theValue) { + __asm__ __volatile__( + "_OSAtomicCompareAndSwap64_not_supported_for_32_bit_ppc\n\t"); + return 0; +} + +inline int64_t OSAtomicAdd64(int64_t theAmount, int64_t *theValue) { + __asm__ __volatile__( + "_OSAtomicAdd64_not_supported_for_32_bit_ppc\n\t"); + return 0; +} + +inline int64_t OSAtomicCompareAndSwap64Barrier( + int64_t oldValue, int64_t newValue, int64_t *theValue) { + int64_t prev = OSAtomicCompareAndSwap64(oldValue, newValue, theValue); + OSMemoryBarrier(); + return prev; +} + +inline int64_t OSAtomicAdd64Barrier( + int64_t theAmount, int64_t *theValue) { + int64_t new_val = OSAtomicAdd64(theAmount, theValue); + OSMemoryBarrier(); + return new_val; +} +#endif + + +namespace base { +namespace subtle { + +typedef int32_t Atomic32; +typedef int64_t Atomic64; + +inline void MemoryBarrier() { + OSMemoryBarrier(); +} + +// 32-bit Versions. + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value; + do { + if (OSAtomicCompareAndSwap32(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + Atomic32 old_value; + do { + old_value = *ptr; + } while (!OSAtomicCompareAndSwap32(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + Atomic32 old_value; + do { + old_value = *ptr; + } while (!OSAtomicCompareAndSwap32Barrier(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return Acquire_AtomicExchange(ptr, new_value); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 
increment) { + return OSAtomicAdd32(increment, const_cast(ptr)); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return OSAtomicAdd32Barrier(increment, const_cast(ptr)); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value; + do { + if (OSAtomicCompareAndSwap32Barrier(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + return Acquire_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { + MemoryBarrier(); + return *ptr; +} + +// 64-bit version + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev_value; + do { + if (OSAtomicCompareAndSwap64(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + Atomic64 old_value; + do { + old_value = *ptr; + } while (!OSAtomicCompareAndSwap64(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64 
*ptr, + Atomic64 new_value) { + Atomic64 old_value; + do { + old_value = *ptr; + } while (!OSAtomicCompareAndSwap64Barrier(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return Acquire_AtomicExchange(ptr, new_value); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return OSAtomicAdd64(increment, const_cast(ptr)); +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return OSAtomicAdd64Barrier(increment, const_cast(ptr)); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev_value; + do { + if (OSAtomicCompareAndSwap64Barrier(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + // The lib kern interface does not distinguish between + // Acquire and Release memory barriers; they are equivalent. + return Acquire_CompareAndSwap(ptr, old_value, new_value); +} + +#ifdef __LP64__ + +// 64-bit implementation on 64-bit platform + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +// Issue the x86 "pause" instruction, which tells the CPU that we +// are in a spinlock wait loop and should allow other hyperthreads +// to run, not speculate memory access, etc. 
+inline void PauseCPU() { + __asm__ __volatile__("pause" : : : "memory"); +} + +inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) { + Atomic64 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64 *ptr) { + MemoryBarrier(); + return *ptr; +} + +#else + +// 64-bit implementation on 32-bit platform + +#if defined(__ppc__) + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + __asm__ __volatile__( + "_NoBarrier_Store_not_supported_for_32_bit_ppc\n\t"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + __asm__ __volatile__( + "_NoBarrier_Load_not_supported_for_32_bit_ppc\n\t"); + return 0; +} + +#elif defined(__i386__) + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Reset FP registers + : "=m" (*ptr) + : "m" (value) + : // mark the FP stack and mmx registers as clobbered + "st", "st(1)", "st(2)", "st(3)", "st(4)", + "st(5)", "st(6)", "st(7)", "mm0", "mm1", + "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"); + +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + Atomic64 value; + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Reset FP registers + : "=m" (value) + : "m" (*ptr) + : // mark the FP stack and mmx registers as clobbered + "st", "st(1)", "st(2)", "st(3)", "st(4)", + "st(5)", "st(6)", "st(7)", "mm0", "mm1", + "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"); + + return value; +} + +#elif 
defined(__arm__) + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + int store_failed; + Atomic64 dummy; + __asm__ __volatile__( + "1:\n" + // Dummy load to lock cache line. + "ldrexd %1, [%3]\n" + "strexd %0, %2, [%3]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (store_failed), "=&r"(dummy) + : "r"(value), "r" (ptr) + : "cc", "memory"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + Atomic64 res; + __asm__ __volatile__( + "ldrexd %0, [%1]\n" + "clrex\n" + : "=r" (res) + : "r"(ptr), "Q"(*ptr)); + return res; +} + +#endif + + +inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) { + NoBarrier_Store(ptr, value); + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) { + MemoryBarrier(); + NoBarrier_Store(ptr, value); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) { + Atomic64 value = NoBarrier_Load(ptr); + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64 *ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} +#endif // __LP64__ + +} // namespace base::subtle +} // namespace base + +// NOTE(user): The following is also deprecated. New callers should use +// the base::subtle namespace. +inline void MemoryBarrier() { + base::subtle::MemoryBarrier(); +} +#endif // BASE_AUXILIARY_ATOMICOPS_INTERNALS_MACOSX_H_ diff --git a/src/kudu/gutil/atomicops-internals-powerpc.h b/src/kudu/gutil/atomicops-internals-powerpc.h new file mode 100644 index 000000000000..0e56475bc15b --- /dev/null +++ b/src/kudu/gutil/atomicops-internals-powerpc.h @@ -0,0 +1,304 @@ +// Copyright 2012 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// All Rights Reserved. +// +// +// Implementation of atomic operations for PowerPC. This file should not +// be included directly. Clients should instead include +// "base/atomicops.h". + +// *** WARNING EXPERIMENTAL CODE *** +// This is not tested and may contain bugs. Until we have bootstrapped +// this. + +#ifndef GUTIL_ATOMICOPS_INTERNALS_POWERPC_H_ +#define GUTIL_ATOMICOPS_INTERNALS_POWERPC_H_ + +typedef int32_t Atomic32; +#define BASE_HAS_ATOMIC64 1 // Use only in tests and base/atomic* + + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +// 32-bit PowerPC is not supported yet. +#ifndef ARCH_POWERPC64 +#error "Only PowerPC64 is supported" +#endif + +namespace base { +namespace subtle { + +typedef int64_t Atomic64; + +// sync vs. lwsync: +// 1. lwsync only works in cache enabled memory (system memory). lwsync is +// unsuitable for memory caching disabled & guarded (device memory). +// sync can handle both system and device memory. +// 2. lwsync does not prevent reordering of a store followed by a load if they +// access different memory. sync orders all 4 kinds of memory access pairs. + +inline void MemoryBarrier() { + __asm__ __volatile__("sync" : : : "memory"); +} + +// 32-bit low-level operations. 
+ +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 value; + __asm__ __volatile__("1:\n" + " lwarx %0, 0, %2\n" + " cmpw 0, %0, %3\n" + " bne- 2f\n" + " stwcx. %4, 0, %2\n" + " bne- 1b\n" + "2:\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(old_value), "r"(new_value) + : "cc"); + return value; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 value; + __asm__ __volatile__("1:\n" + " lwarx %0, 0, %2\n" + " stwcx. %3, 0, %2\n" + " bne- 1b\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(new_value) + : "cc"); + return value; +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 value = NoBarrier_AtomicExchange(ptr, new_value); + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + MemoryBarrier(); + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 value; + __asm__ __volatile__("1:\n" + " lwarx %0, 0, %2\n" + " add %0, %0, %3\n" + " stwcx. %0, 0, %2\n" + " bne- 1b\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(increment) + : "cc"); + return value; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 value; + __asm__ __volatile__(" lwsync\n" + "1:\n" + " lwarx %0, 0, %2\n" + " add %0, %0, %3\n" + " stwcx. 
%0, 0, %2\n" + " bne- 1b\n" + " lwsync\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(increment) + : "cc", "memory"); + return value; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 value = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +// 64-bit low-level operations. + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 value; + __asm__ __volatile__("1:\n" + " ldarx %0, 0, %2\n" + " cmpd 0, %0, %3\n" + " bne- 2f\n" + " stdcx. %4, 0, %2\n" + " bne- 1b\n" + "2:\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(old_value), "r"(new_value) + : "cc"); + return value; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 value; + __asm__ __volatile__("1:\n" + " ldarx %0, 0, %2\n" + " stdcx. 
%3, 0, %2\n" + " bne- 1b\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(new_value) + : "cc"); + return value; +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 value = NoBarrier_AtomicExchange(ptr, new_value); + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + MemoryBarrier(); + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 value; + __asm__ __volatile__("1:\n" + " ldarx %0, 0, %2\n" + " add %0, %0, %3\n" + " stdcx. %0, 0, %2\n" + " bne- 1b\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(increment) + : "cc"); + return value; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 value; + __asm__ __volatile__(" lwsync\n" + "1:\n" + " ldarx %0, 0, %2\n" + " add %0, %0, %3\n" + " stdcx. %0, 0, %2\n" + " bne- 1b\n" + " lwsync\n" + : "=&r" (value), "+m"(*ptr) + : "b"(ptr), "r"(increment) + : "cc", "memory"); + return value; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return *ptr; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 value = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return value; +} + +inline Atomic64 
Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +} // namespace subtle +} // namespace base + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // GUTIL_ATOMICOPS_INTERNALS_POWERPC_H_ diff --git a/src/kudu/gutil/atomicops-internals-tsan.h b/src/kudu/gutil/atomicops-internals-tsan.h new file mode 100644 index 000000000000..aecaefc3b4a9 --- /dev/null +++ b/src/kudu/gutil/atomicops-internals-tsan.h @@ -0,0 +1,217 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This file is an internal atomic implementation for compiler-based +// ThreadSanitizer. Use base/atomicops.h instead. + +#ifndef BASE_ATOMICOPS_INTERNALS_TSAN_H_ +#define BASE_ATOMICOPS_INTERNALS_TSAN_H_ + +// Workaround for Chromium BASE_EXPORT definition +#ifndef BASE_EXPORT +#define BASE_EXPORT +#endif + +// This struct is not part of the public API of this module; clients may not +// use it. (However, it's exported via BASE_EXPORT because clients implicitly +// do use it at link time by inlining these functions.) +// Features of this x86. Values may not be correct before main() is run, +// but are set conservatively. +struct AtomicOps_x86CPUFeatureStruct { + bool has_amd_lock_mb_bug; // Processor has AMD memory-barrier bug; do lfence + // after acquire compare-and-swap. + bool has_sse2; // Processor has SSE2. 
+}; +BASE_EXPORT extern struct AtomicOps_x86CPUFeatureStruct + AtomicOps_Internalx86CPUFeatures; + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +#include + +typedef int32_t Atomic32; +typedef int64_t Atomic64; + +namespace base { +namespace subtle { + +typedef int32_t Atomic32; +typedef int64_t Atomic64; + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 cmp = old_value; + __tsan_atomic32_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_relaxed, __tsan_memory_order_relaxed); + return cmp; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return __tsan_atomic32_exchange(ptr, new_value, + __tsan_memory_order_relaxed); +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return __tsan_atomic32_exchange(ptr, new_value, + __tsan_memory_order_acquire); +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32 *ptr, + Atomic32 new_value) { + return __tsan_atomic32_exchange(ptr, new_value, + __tsan_memory_order_release); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return increment + __tsan_atomic32_fetch_add(ptr, increment, + __tsan_memory_order_relaxed); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32 *ptr, + Atomic32 increment) { + return increment + __tsan_atomic32_fetch_add(ptr, increment, + __tsan_memory_order_acq_rel); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 cmp = old_value; + __tsan_atomic32_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_acquire, __tsan_memory_order_acquire); + return cmp; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32 *ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 cmp = old_value; + 
__tsan_atomic32_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_release, __tsan_memory_order_relaxed); + return cmp; +} + +inline void NoBarrier_Store(volatile Atomic32 *ptr, Atomic32 value) { + __tsan_atomic32_store(ptr, value, __tsan_memory_order_relaxed); +} + +inline void Acquire_Store(volatile Atomic32 *ptr, Atomic32 value) { + __tsan_atomic32_store(ptr, value, __tsan_memory_order_relaxed); + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); +} + +inline void Release_Store(volatile Atomic32 *ptr, Atomic32 value) { + __tsan_atomic32_store(ptr, value, __tsan_memory_order_release); +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32 *ptr) { + return __tsan_atomic32_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic32 Acquire_Load(volatile const Atomic32 *ptr) { + return __tsan_atomic32_load(ptr, __tsan_memory_order_acquire); +} + +inline Atomic32 Release_Load(volatile const Atomic32 *ptr) { + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); + return __tsan_atomic32_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 cmp = old_value; + __tsan_atomic64_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_relaxed, __tsan_memory_order_relaxed); + return cmp; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return __tsan_atomic64_exchange(ptr, new_value, __tsan_memory_order_relaxed); +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return __tsan_atomic64_exchange(ptr, new_value, __tsan_memory_order_acquire); +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64 *ptr, + Atomic64 new_value) { + return __tsan_atomic64_exchange(ptr, new_value, __tsan_memory_order_release); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return increment + 
__tsan_atomic64_fetch_add(ptr, increment, + __tsan_memory_order_relaxed); +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64 *ptr, + Atomic64 increment) { + return increment + __tsan_atomic64_fetch_add(ptr, increment, + __tsan_memory_order_acq_rel); +} + +inline void NoBarrier_Store(volatile Atomic64 *ptr, Atomic64 value) { + __tsan_atomic64_store(ptr, value, __tsan_memory_order_relaxed); +} + +inline void Acquire_Store(volatile Atomic64 *ptr, Atomic64 value) { + __tsan_atomic64_store(ptr, value, __tsan_memory_order_relaxed); + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); +} + +inline void Release_Store(volatile Atomic64 *ptr, Atomic64 value) { + __tsan_atomic64_store(ptr, value, __tsan_memory_order_release); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64 *ptr) { + return __tsan_atomic64_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64 *ptr) { + return __tsan_atomic64_load(ptr, __tsan_memory_order_acquire); +} + +inline Atomic64 Release_Load(volatile const Atomic64 *ptr) { + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); + return __tsan_atomic64_load(ptr, __tsan_memory_order_relaxed); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 cmp = old_value; + __tsan_atomic64_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_acquire, __tsan_memory_order_acquire); + return cmp; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64 *ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 cmp = old_value; + __tsan_atomic64_compare_exchange_strong(ptr, &cmp, new_value, + __tsan_memory_order_release, __tsan_memory_order_relaxed); + return cmp; +} + +inline void MemoryBarrier() { + __tsan_atomic_thread_fence(__tsan_memory_order_seq_cst); +} + +inline void PauseCPU() { +} + +} // namespace base::subtle +} // namespace base + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // 
BASE_ATOMICOPS_INTERNALS_TSAN_H_ diff --git a/src/kudu/gutil/atomicops-internals-x86.cc b/src/kudu/gutil/atomicops-internals-x86.cc new file mode 100644 index 000000000000..5d4529ec7386 --- /dev/null +++ b/src/kudu/gutil/atomicops-internals-x86.cc @@ -0,0 +1,128 @@ +// Copyright 2007 Google, Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// All rights reserved. + + +// This module gets enough CPU information to optimize the +// atomicops module on x86. + +#include "kudu/gutil/atomicops-internals-x86.h" + +#include + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/integral_types.h" + +// This file only makes sense with atomicops-internals-x86.h -- it +// depends on structs that are defined in that file. If atomicops.h +// doesn't sub-include that file, then we aren't needed, and shouldn't +// try to do anything. +#ifdef GUTIL_ATOMICOPS_INTERNALS_X86_H_ + +// This macro was copied from //util/cpuid/cpuid.cc +// Inline cpuid instruction. In PIC compilations, %ebx contains the address +// of the global offset table. To avoid breaking such executables, this code +// must preserve that register's value across cpuid instructions. 
+#if defined(__i386__) +#define cpuid(a, b, c, d, inp) \ + asm("mov %%ebx, %%edi\n" \ + "cpuid\n" \ + "xchg %%edi, %%ebx\n" \ + : "=a" (a), "=D" (b), "=c" (c), "=d" (d) : "a" (inp)) +#elif defined(__x86_64__) +#define cpuid(a, b, c, d, inp) \ + asm("mov %%rbx, %%rdi\n" \ + "cpuid\n" \ + "xchg %%rdi, %%rbx\n" \ + : "=a" (a), "=D" (b), "=c" (c), "=d" (d) : "a" (inp)) +#endif + +#if defined(cpuid) // initialize the struct only on x86 + +// Set the flags so that code will run correctly and conservatively +// until InitGoogle() is called. +struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures = { + false, // bug can't exist before process spawns multiple threads + false, // no SSE2 + false, // no cmpxchg16b +}; + +// Initialize the AtomicOps_Internalx86CPUFeatures struct. +static void AtomicOps_Internalx86CPUFeaturesInit() { + uint32 eax; + uint32 ebx; + uint32 ecx; + uint32 edx; + + // Get vendor string (issue CPUID with eax = 0) + cpuid(eax, ebx, ecx, edx, 0); + char vendor[13]; + memcpy(vendor, &ebx, 4); + memcpy(vendor + 4, &edx, 4); + memcpy(vendor + 8, &ecx, 4); + vendor[12] = 0; + + // get feature flags in ecx/edx, and family/model in eax + cpuid(eax, ebx, ecx, edx, 1); + + int family = (eax >> 8) & 0xf; // family and model fields + int model = (eax >> 4) & 0xf; + if (family == 0xf) { // use extended family and model fields + family += (eax >> 20) & 0xff; + model += ((eax >> 16) & 0xf) << 4; + } + + // Opteron Rev E has a bug in which on very rare occasions a locked + // instruction doesn't act as a read-acquire barrier if followed by a + // non-locked read-modify-write instruction. Rev F has this bug in + // pre-release versions, but not in versions released to customers, + // so we test only for Rev E, which is family 15, model 32..63 inclusive. 
+ if (strcmp(vendor, "AuthenticAMD") == 0 && // AMD + family == 15 && + 32 <= model && model <= 63) { + AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug = true; + } else { + AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug = false; + } + + // edx bit 26 is SSE2 which we use to tell use whether we can use mfence + AtomicOps_Internalx86CPUFeatures.has_sse2 = ((edx >> 26) & 1); + + // ecx bit 13 indicates whether the cmpxchg16b instruction is supported + AtomicOps_Internalx86CPUFeatures.has_cmpxchg16b = ((ecx >> 13) & 1); + + VLOG(1) << "vendor " << vendor << + " family " << family << + " model " << model << + " amd_lock_mb_bug " << + AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug << + " sse2 " << AtomicOps_Internalx86CPUFeatures.has_sse2 << + " cmpxchg16b " << AtomicOps_Internalx86CPUFeatures.has_cmpxchg16b; +} + +// AtomicOps initialisation routine for external use. +void AtomicOps_x86CPUFeaturesInit() { + AtomicOps_Internalx86CPUFeaturesInit(); +} + +#endif + +#endif // GUTIL_ATOMICOPS_INTERNALS_X86_H_ diff --git a/src/kudu/gutil/atomicops-internals-x86.h b/src/kudu/gutil/atomicops-internals-x86.h new file mode 100644 index 000000000000..acbd2e3ffed5 --- /dev/null +++ b/src/kudu/gutil/atomicops-internals-x86.h @@ -0,0 +1,513 @@ +// Copyright 2003 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// All Rights Reserved. +// +// +// Implementation of atomic operations for x86. This file should not +// be included directly. Clients should instead include +// "base/atomicops.h". + +#ifndef GUTIL_ATOMICOPS_INTERNALS_X86_H_ +#define GUTIL_ATOMICOPS_INTERNALS_X86_H_ + +#include + +#include + +#define BASE_HAS_ATOMIC64 1 // Use only in tests and base/atomic* + + +// NOTE(user): x86 does not need to define AtomicWordCastType, because it +// already matches Atomic32 or Atomic64, depending on the platform. + + +// This struct is not part of the public API of this module; clients may not +// use it. +// Features of this x86. Values may not be correct before InitGoogle() is run, +// but are set conservatively. +struct AtomicOps_x86CPUFeatureStruct { + bool has_amd_lock_mb_bug; // Processor has AMD memory-barrier bug; do lfence + // after acquire compare-and-swap. + bool has_sse2; // Processor has SSE2. + bool has_cmpxchg16b; // Processor supports cmpxchg16b instruction. +}; +extern struct AtomicOps_x86CPUFeatureStruct AtomicOps_Internalx86CPUFeatures; + + +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +// AtomicOps initialisation for open source use. +void AtomicOps_x86CPUFeaturesInit(); + +typedef int32_t Atomic32; +typedef int64_t Atomic64; + +namespace base { +namespace subtle { + +typedef int32_t Atomic32; +typedef int64_t Atomic64; + +// These atomic primitives don't work atomically, and can cause really nasty +// hard-to-track-down bugs, if the pointer isn't naturally aligned. 
Check alignment +// in debug mode. +template +inline void CheckNaturalAlignment(const T *ptr) { + DCHECK_EQ(0, reinterpret_cast(ptr) & (sizeof(T) - 1)) + << "unaligned pointer not allowed for atomics"; +} + +// 32-bit low-level operations on any platform. + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + CheckNaturalAlignment(ptr); + Atomic32 prev; + __asm__ __volatile__("lock; cmpxchgl %1,%2" + : "=a" (prev) + : "q" (new_value), "m" (*ptr), "0" (old_value) + : "memory"); + return prev; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + CheckNaturalAlignment(ptr); + __asm__ __volatile__("xchgl %1,%0" // The lock prefix is implicit for xchg. + : "=r" (new_value) + : "m" (*ptr), "0" (new_value) + : "memory"); + return new_value; // Now it's the previous value. +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + CheckNaturalAlignment(ptr); + Atomic32 old_val = NoBarrier_AtomicExchange(ptr, new_value); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return old_val; +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + CheckNaturalAlignment(ptr); + Atomic32 temp = increment; + __asm__ __volatile__("lock; xaddl %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now holds the old value of *ptr + return temp + increment; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + CheckNaturalAlignment(ptr); + Atomic32 temp = increment; + __asm__ __volatile__("lock; xaddl %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now holds the old value of *ptr + if 
(AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return temp + increment; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return x; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + CheckNaturalAlignment(ptr); + *ptr = value; +} + +// Issue the x86 "pause" instruction, which tells the CPU that we +// are in a spinlock wait loop and should allow other hyperthreads +// to run, not speculate memory access, etc. +inline void PauseCPU() { + __asm__ __volatile__("pause" : : : "memory"); +} + +#if defined(__x86_64__) + +// 64-bit implementations of memory barrier can be simpler, because it +// "mfence" is guaranteed to exist. 
+inline void MemoryBarrier() { + __asm__ __volatile__("mfence" : : : "memory"); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + CheckNaturalAlignment(ptr); + *ptr = value; + MemoryBarrier(); +} + +#else + +inline void MemoryBarrier() { + if (AtomicOps_Internalx86CPUFeatures.has_sse2) { + __asm__ __volatile__("mfence" : : : "memory"); + } else { // mfence is faster but not present on PIII + Atomic32 x = 0; + Acquire_AtomicExchange(&x, 0); + } +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + if (AtomicOps_Internalx86CPUFeatures.has_sse2) { + CheckNaturalAlignment(ptr); + *ptr = value; + __asm__ __volatile__("mfence" : : : "memory"); + } else { + Acquire_AtomicExchange(ptr, value); + } +} +#endif + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + CheckNaturalAlignment(ptr); + ATOMICOPS_COMPILER_BARRIER(); + *ptr = value; // An x86 store acts as a release barrier. + // See comments in Atomic64 version of Release_Store(), below. +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + CheckNaturalAlignment(ptr); + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + CheckNaturalAlignment(ptr); + Atomic32 value = *ptr; // An x86 load acts as a acquire barrier. + // See comments in Atomic64 version of Release_Store(), below. + ATOMICOPS_COMPILER_BARRIER(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + CheckNaturalAlignment(ptr); + MemoryBarrier(); + return *ptr; +} + +#if defined(__x86_64__) + +// 64-bit low-level operations on 64-bit platform. 
+ +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev; + CheckNaturalAlignment(ptr); + __asm__ __volatile__("lock; cmpxchgq %1,%2" + : "=a" (prev) + : "q" (new_value), "m" (*ptr), "0" (old_value) + : "memory"); + return prev; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + CheckNaturalAlignment(ptr); + __asm__ __volatile__("xchgq %1,%0" // The lock prefix is implicit for xchg. + : "=r" (new_value) + : "m" (*ptr), "0" (new_value) + : "memory"); + return new_value; // Now it's the previous value. +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 old_val = NoBarrier_AtomicExchange(ptr, new_value); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return old_val; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 temp = increment; + CheckNaturalAlignment(ptr); + __asm__ __volatile__("lock; xaddq %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now contains the previous value of *ptr + return temp + increment; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 temp = increment; + CheckNaturalAlignment(ptr); + __asm__ __volatile__("lock; xaddq %0,%1" + : "+r" (temp), "+m" (*ptr) + : : "memory"); + // temp now contains the previous value of *ptr + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return temp + increment; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + CheckNaturalAlignment(ptr); + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 
value) { + CheckNaturalAlignment(ptr); + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + ATOMICOPS_COMPILER_BARRIER(); + CheckNaturalAlignment(ptr); + *ptr = value; // An x86 store acts as a release barrier + // for current AMD/Intel chips as of Jan 2008. + // See also Acquire_Load(), below. + + // When new chips come out, check: + // IA-32 Intel Architecture Software Developer's Manual, Volume 3: + // System Programming Guide, Chatper 7: Multiple-processor management, + // Section 7.2, Memory Ordering. + // Last seen at: + // http://developer.intel.com/design/pentium4/manuals/index_new.htm + // + // x86 stores/loads fail to act as barriers for a few instructions (clflush + // maskmovdqu maskmovq movntdq movnti movntpd movntps movntq) but these are + // not generated by the compiler, and are rare. Users of these instructions + // need to know about cache behaviour in any case since all of these involve + // either flushing cache lines or non-temporal cache hints. +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + CheckNaturalAlignment(ptr); + return *ptr; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + CheckNaturalAlignment(ptr); + Atomic64 value = *ptr; // An x86 load acts as a acquire barrier, + // for current AMD/Intel chips as of Jan 2008. + // See also Release_Store(), above. + ATOMICOPS_COMPILER_BARRIER(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + CheckNaturalAlignment(ptr); + MemoryBarrier(); + return *ptr; +} + +#else // defined(__x86_64__) + +// 64-bit low-level operations on 32-bit platform. + +#if !((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) +// For compilers older than gcc 4.1, we use inline asm. +// +// Potential pitfalls: +// +// 1. %ebx points to Global offset table (GOT) with -fPIC. +// We need to preserve this register. +// 2. 
When explicit registers are used in inline asm, the +// compiler may not be aware of it and might try to reuse +// the same register for another argument which has constraints +// that allow it ("r" for example). + +inline Atomic64 __sync_val_compare_and_swap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + CheckNaturalAlignment(ptr); + Atomic64 prev; + __asm__ __volatile__("push %%ebx\n\t" + "movl (%3), %%ebx\n\t" // Move 64-bit new_value into + "movl 4(%3), %%ecx\n\t" // ecx:ebx + "lock; cmpxchg8b (%1)\n\t" // If edx:eax (old_value) same + "pop %%ebx\n\t" + : "=A" (prev) // as contents of ptr: + : "D" (ptr), // ecx:ebx => ptr + "0" (old_value) , // else: + "S" (&new_value) // old *ptr => edx:eax + : "memory", "%ecx"); + return prev; +} +#endif // Compiler < gcc-4.1 + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_val, + Atomic64 new_val) { + CheckNaturalAlignment(ptr); + return __sync_val_compare_and_swap(ptr, old_val, new_val); +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_val) { + Atomic64 old_val; + CheckNaturalAlignment(ptr); + + do { + old_val = *ptr; + } while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val); + + return old_val; +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_val) { + CheckNaturalAlignment(ptr); + Atomic64 old_val = NoBarrier_AtomicExchange(ptr, new_val); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return old_val; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_val) { + return NoBarrier_AtomicExchange(ptr, new_val); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + CheckNaturalAlignment(ptr); + Atomic64 old_val, new_val; + + do { + old_val = *ptr; + new_val = old_val + increment; + } while (__sync_val_compare_and_swap(ptr, old_val, new_val) 
!= old_val); + + return old_val + increment; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + CheckNaturalAlignment(ptr); + Atomic64 new_val = NoBarrier_AtomicIncrement(ptr, increment); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return new_val; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + CheckNaturalAlignment(ptr); + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Empty mmx state/Reset FP regs + : "=m" (*ptr) + : "m" (value) + : // mark the FP stack and mmx registers as clobbered + "st", "st(1)", "st(2)", "st(3)", "st(4)", + "st(5)", "st(6)", "st(7)", "mm0", "mm1", + "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"); +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NoBarrier_Store(ptr, value); + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + ATOMICOPS_COMPILER_BARRIER(); + NoBarrier_Store(ptr, value); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + CheckNaturalAlignment(ptr); + Atomic64 value; + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Empty mmx state/Reset FP regs + : "=m" (value) + : "m" (*ptr) + : // mark the FP stack and mmx registers as clobbered + "st", "st(1)", "st(2)", "st(3)", "st(4)", + "st(5)", "st(6)", "st(7)", "mm0", "mm1", + "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"); + return value; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + CheckNaturalAlignment(ptr); + Atomic64 value = NoBarrier_Load(ptr); + ATOMICOPS_COMPILER_BARRIER(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} + +#endif // 
defined(__x86_64__) + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 x = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + if (AtomicOps_Internalx86CPUFeatures.has_amd_lock_mb_bug) { + __asm__ __volatile__("lfence" : : : "memory"); + } + return x; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +} // namespace subtle +} // namespace base + +#undef ATOMICOPS_COMPILER_BARRIER + +#endif // GUTIL_ATOMICOPS_INTERNALS_X86_H_ diff --git a/src/kudu/gutil/atomicops.h b/src/kudu/gutil/atomicops.h new file mode 100644 index 000000000000..fa4c44c90682 --- /dev/null +++ b/src/kudu/gutil/atomicops.h @@ -0,0 +1,373 @@ +// Copyright 2003 Google Inc. +// All Rights Reserved. +// + +// For atomic operations on statistics counters, see atomic_stats_counter.h. +// For atomic operations on sequence numbers, see atomic_sequence_num.h. +// For atomic operations on reference counts, see atomic_refcount.h. + +// Some fast atomic operations -- typically with machine-dependent +// implementations. This file may need editing as Google code is +// ported to different architectures. + +// The routines exported by this module are subtle. If you use them, even if +// you get the code right, it will depend on careful reasoning about atomicity +// and memory ordering; it will be less readable, and harder to maintain. If +// you plan to use these routines, you should have a good reason, such as solid +// evidence that performance would otherwise suffer, or there being no +// alternative. You should assume only properties explicitly guaranteed by the +// specifications in this file. You are almost certainly _not_ writing code +// just for the x86; if you assume x86 semantics, x86 hardware bugs and +// implementations on other archtectures will cause your code to break. 
If you +// do not know what you are doing, avoid these routines, and use a Mutex. +// +// These following lower-level operations are typically useful only to people +// implementing higher-level synchronization operations like spinlocks, +// mutexes, and condition-variables. They combine CompareAndSwap(), +// addition, exchange, a load, or a store with appropriate memory-ordering +// instructions. "Acquire" operations ensure that no later memory access by +// the same thread can be reordered ahead of the operation. "Release" +// operations ensure that no previous memory access by the same thread can be +// reordered after the operation. "Barrier" operations have both "Acquire" and +// "Release" semantics. A MemoryBarrier() has "Barrier" semantics, but does no +// memory access. "NoBarrier" operations have no barrier: the CPU is +// permitted to reorder them freely (as seen by other threads), even in ways +// the appear to violate functional dependence, just as it can for any normal +// variable access. +// +// It is incorrect to make direct assignments to/from an atomic variable. +// You should use one of the Load or Store routines. The NoBarrier +// versions are provided when no barriers are needed: +// NoBarrier_Store() +// NoBarrier_Load() +// Although there are currently no compiler enforcement, you are encouraged +// to use these. Moreover, if you choose to use base::subtle::Atomic64 type, +// you MUST use one of the Load or Store routines to get correct behavior +// on 32-bit platforms. +// +// The intent is eventually to put all of these routines in namespace +// base::subtle + +#ifndef THREAD_ATOMICOPS_H_ +#define THREAD_ATOMICOPS_H_ + +#include + +// ------------------------------------------------------------------------ +// Include the platform specific implementations of the types +// and operations listed below. Implementations are to provide Atomic32 +// and Atomic64 operations. 
If there is a mismatch between intptr_t and +// the Atomic32 or Atomic64 types for a platform, the platform-specific header +// should define the macro, AtomicWordCastType in a clause similar to the +// following: +// #if ...pointers are 64 bits... +// # define AtomicWordCastType base::subtle::Atomic64 +// #else +// # define AtomicWordCastType Atomic32 +// #endif +// ------------------------------------------------------------------------ + +#include "kudu/gutil/arm_instruction_set_select.h" + +// ThreadSanitizer provides own implementation of atomicops. +#if defined(THREAD_SANITIZER) +#include "kudu/gutil/atomicops-internals-tsan.h" +#elif defined(__APPLE__) +#include "kudu/gutil/atomicops-internals-macosx.h" +#elif defined(__GNUC__) && defined(ARMV6) +#include "kudu/gutil/atomicops-internals-arm-v6plus.h" +#elif defined(ARMV3) +#include "kudu/gutil/atomicops-internals-arm-generic.h" +#elif defined(__GNUC__) && (defined(__i386) || defined(__x86_64__)) +#include "kudu/gutil/atomicops-internals-x86.h" +#elif defined(__GNUC__) && defined(ARCH_POWERPC64) +#include "kudu/gutil/atomicops-internals-powerpc.h" +#elif defined(OS_WINDOWS) +#include "kudu/gutil/atomicops-internals-windows.h" +#else +#error You need to implement atomic operations for this architecture +#endif + +// Signed type that can hold a pointer and supports the atomic ops below, as +// well as atomic loads and stores. Instances must be naturally-aligned. +typedef intptr_t AtomicWord; + +#ifdef AtomicWordCastType +// ------------------------------------------------------------------------ +// This section is needed only when explicit type casting is required to +// cast AtomicWord to one of the basic atomic types (Atomic64 or Atomic32). +// It also serves to document the AtomicWord interface. 
+// ------------------------------------------------------------------------ + +namespace base { +namespace subtle { + +// Atomically execute: +// result = *ptr; +// if (*ptr == old_value) +// *ptr = new_value; +// return result; +// +// I.e., replace "*ptr" with "new_value" if "*ptr" used to be "old_value". +// Always return the old value of "*ptr" +// +// This routine implies no memory barriers. +inline AtomicWord NoBarrier_CompareAndSwap(volatile AtomicWord* ptr, + AtomicWord old_value, + AtomicWord new_value) { + return NoBarrier_CompareAndSwap( + reinterpret_cast(ptr), + old_value, new_value); +} + +// Atomically store new_value into *ptr, returning the previous value held in +// *ptr. This routine implies no memory barriers. +inline AtomicWord NoBarrier_AtomicExchange(volatile AtomicWord* ptr, + AtomicWord new_value) { + return NoBarrier_AtomicExchange( + reinterpret_cast(ptr), new_value); +} + +inline AtomicWord Acquire_AtomicExchange(volatile AtomicWord* ptr, + AtomicWord new_value) { + return Acquire_AtomicExchange( + reinterpret_cast(ptr), new_value); +} + +inline AtomicWord Release_AtomicExchange(volatile AtomicWord* ptr, + AtomicWord new_value) { + return Release_AtomicExchange( + reinterpret_cast(ptr), new_value); +} + +// Atomically increment *ptr by "increment". Returns the new value of +// *ptr with the increment applied. This routine implies no memory +// barriers. 
+inline AtomicWord NoBarrier_AtomicIncrement(volatile AtomicWord* ptr, + AtomicWord increment) { + return NoBarrier_AtomicIncrement( + reinterpret_cast(ptr), increment); +} + +inline AtomicWord Barrier_AtomicIncrement(volatile AtomicWord* ptr, + AtomicWord increment) { + return Barrier_AtomicIncrement( + reinterpret_cast(ptr), increment); +} + +inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr, + AtomicWord old_value, + AtomicWord new_value) { + return base::subtle::Acquire_CompareAndSwap( + reinterpret_cast(ptr), + old_value, new_value); +} + +inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr, + AtomicWord old_value, + AtomicWord new_value) { + return base::subtle::Release_CompareAndSwap( + reinterpret_cast(ptr), + old_value, new_value); +} + +inline void NoBarrier_Store(volatile AtomicWord *ptr, AtomicWord value) { + NoBarrier_Store( + reinterpret_cast(ptr), value); +} + +inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) { + return base::subtle::Acquire_Store( + reinterpret_cast(ptr), value); +} + +inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) { + return base::subtle::Release_Store( + reinterpret_cast(ptr), value); +} + +inline AtomicWord NoBarrier_Load(volatile const AtomicWord *ptr) { + return NoBarrier_Load( + reinterpret_cast(ptr)); +} + +inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) { + return base::subtle::Acquire_Load( + reinterpret_cast(ptr)); +} + +inline AtomicWord Release_Load(volatile const AtomicWord* ptr) { + return base::subtle::Release_Load( + reinterpret_cast(ptr)); +} + +} // namespace base::subtle +} // namespace base +#endif // AtomicWordCastType + +// ------------------------------------------------------------------------ +// Commented out type definitions and method declarations for documentation +// of the interface provided by this module. 
+// ------------------------------------------------------------------------ + +#if 0 + +// Signed 32-bit type that supports the atomic ops below, as well as atomic +// loads and stores. Instances must be naturally aligned. This type differs +// from AtomicWord in 64-bit binaries where AtomicWord is 64-bits. +typedef int32_t Atomic32; + +// Corresponding operations on Atomic32 +namespace base { +namespace subtle { + +// Signed 64-bit type that supports the atomic ops below, as well as atomic +// loads and stores. Instances must be naturally aligned. This type differs +// from AtomicWord in 32-bit binaries where AtomicWord is 32-bits. +typedef int64_t Atomic64; + +Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value); +Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value); +Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value); +Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, Atomic32 new_value); +Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, Atomic32 increment); +Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment); +Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value); +Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value); +void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value); +void Acquire_Store(volatile Atomic32* ptr, Atomic32 value); +void Release_Store(volatile Atomic32* ptr, Atomic32 value); +Atomic32 NoBarrier_Load(volatile const Atomic32* ptr); +Atomic32 Acquire_Load(volatile const Atomic32* ptr); +Atomic32 Release_Load(volatile const Atomic32* ptr); + +// Corresponding operations on Atomic64 +Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value); +Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value); +Atomic64 Acquire_AtomicExchange(volatile 
Atomic64* ptr, Atomic64 new_value); +Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, Atomic64 new_value); +Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment); +Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, Atomic64 increment); + +Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value); +Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value); +void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value); +void Acquire_Store(volatile Atomic64* ptr, Atomic64 value); +void Release_Store(volatile Atomic64* ptr, Atomic64 value); +Atomic64 NoBarrier_Load(volatile const Atomic64* ptr); +Atomic64 Acquire_Load(volatile const Atomic64* ptr); +Atomic64 Release_Load(volatile const Atomic64* ptr); +} // namespace base::subtle +} // namespace base + +void MemoryBarrier(); + +void PauseCPU(); + +#endif // 0 + + +// ------------------------------------------------------------------------ +// The following are to be deprecated when all uses have been changed to +// use the base::subtle namespace. 
+// ------------------------------------------------------------------------ + +#ifdef AtomicWordCastType +// AtomicWord versions to be deprecated +inline AtomicWord Acquire_CompareAndSwap(volatile AtomicWord* ptr, + AtomicWord old_value, + AtomicWord new_value) { + return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value); +} + +inline AtomicWord Release_CompareAndSwap(volatile AtomicWord* ptr, + AtomicWord old_value, + AtomicWord new_value) { + return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value); +} + +inline void Acquire_Store(volatile AtomicWord* ptr, AtomicWord value) { + return base::subtle::Acquire_Store(ptr, value); +} + +inline void Release_Store(volatile AtomicWord* ptr, AtomicWord value) { + return base::subtle::Release_Store(ptr, value); +} + +inline AtomicWord Acquire_Load(volatile const AtomicWord* ptr) { + return base::subtle::Acquire_Load(ptr); +} + +inline AtomicWord Release_Load(volatile const AtomicWord* ptr) { + return base::subtle::Release_Load(ptr); +} +#endif // AtomicWordCastType + +// 32-bit Acquire/Release operations to be deprecated. + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value); +} +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value); +} +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + base::subtle::Acquire_Store(ptr, value); +} +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + return base::subtle::Release_Store(ptr, value); +} +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + return base::subtle::Acquire_Load(ptr); +} +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + return base::subtle::Release_Load(ptr); +} + +// 64-bit Acquire/Release operations to be deprecated. 
+ +inline base::subtle::Atomic64 Acquire_CompareAndSwap( + volatile base::subtle::Atomic64* ptr, + base::subtle::Atomic64 old_value, base::subtle::Atomic64 new_value) { + return base::subtle::Acquire_CompareAndSwap(ptr, old_value, new_value); +} +inline base::subtle::Atomic64 Release_CompareAndSwap( + volatile base::subtle::Atomic64* ptr, + base::subtle::Atomic64 old_value, base::subtle::Atomic64 new_value) { + return base::subtle::Release_CompareAndSwap(ptr, old_value, new_value); +} +inline void Acquire_Store( + volatile base::subtle::Atomic64* ptr, base::subtle::Atomic64 value) { + base::subtle::Acquire_Store(ptr, value); +} +inline void Release_Store( + volatile base::subtle::Atomic64* ptr, base::subtle::Atomic64 value) { + return base::subtle::Release_Store(ptr, value); +} +inline base::subtle::Atomic64 Acquire_Load( + volatile const base::subtle::Atomic64* ptr) { + return base::subtle::Acquire_Load(ptr); +} +inline base::subtle::Atomic64 Release_Load( + volatile const base::subtle::Atomic64* ptr) { + return base::subtle::Release_Load(ptr); +} + +#endif // THREAD_ATOMICOPS_H_ diff --git a/src/kudu/gutil/auxiliary/atomicops-internals-arm-generic.h b/src/kudu/gutil/auxiliary/atomicops-internals-arm-generic.h new file mode 100644 index 000000000000..417c6a0ca309 --- /dev/null +++ b/src/kudu/gutil/auxiliary/atomicops-internals-arm-generic.h @@ -0,0 +1,230 @@ +// Copyright 2003 Google Inc. +// All Rights Reserved. +// +// +// This file is an internal atomic implementation, use base/atomicops.h instead. +// +// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. 
+ +#ifndef BASE_AUXILIARY_ATOMICOPS_INTERNALS_ARM_GENERIC_H_ +#define BASE_AUXILIARY_ATOMICOPS_INTERNALS_ARM_GENERIC_H_ + +#include +#include +#include "kudu/gutil/macros.h" // For COMPILE_ASSERT +#include "kudu/gutil/port.h" // ATTRIBUTE_WEAK + +typedef int32_t Atomic32; + +namespace base { +namespace subtle { + +typedef int64_t Atomic64; + +// 0xffff0fc0 is the hard coded address of a function provided by +// the kernel which implements an atomic compare-exchange. On older +// ARM architecture revisions (pre-v6) this may be implemented using +// a syscall. This address is stable, and in active use (hard coded) +// by at least glibc-2.7 and the Android C library. +// pLinuxKernelCmpxchg has both acquire and release barrier sematincs. +typedef Atomic32 (*LinuxKernelCmpxchgFunc)(Atomic32 old_value, + Atomic32 new_value, + volatile Atomic32* ptr); +LinuxKernelCmpxchgFunc pLinuxKernelCmpxchg ATTRIBUTE_WEAK = + (LinuxKernelCmpxchgFunc) 0xffff0fc0; + +typedef void (*LinuxKernelMemoryBarrierFunc)(void); +LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK = + (LinuxKernelMemoryBarrierFunc) 0xffff0fa0; + + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 prev_value = *ptr; + do { + if (!pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr))) { + return old_value; + } + prev_value = *ptr; + } while (prev_value == old_value); + return prev_value; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 old_value; + do { + old_value = *ptr; + } while (pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr))); + return old_value; +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline 
Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + for (;;) { + // Atomic exchange the old value with an incremented one. + Atomic32 old_value = *ptr; + Atomic32 new_value = old_value + increment; + if (pLinuxKernelCmpxchg(old_value, new_value, + const_cast(ptr)) == 0) { + // The exchange took place as expected. + return new_value; + } + // Otherwise, *ptr changed mid-loop and we need to retry. + } +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return Barrier_AtomicIncrement(ptr, increment); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void MemoryBarrier() { + pLinuxKernelMemoryBarrier(); +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + + +// 64-bit versions are not implemented yet. 
+ +inline void NotImplementedFatalError(const char *function_name) { + fprintf(stderr, "64-bit %s() not implemented on this platform\n", + function_name); + abort(); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_CompareAndSwap"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_AtomicExchange"); + return 0; +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("Acquire_AtomicExchange"); + return 0; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("Release_AtomicExchange"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("NoBarrier_AtomicIncrement"); + return 0; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("Barrier_AtomicIncrement"); + return 0; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("NoBarrier_Store"); +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("Acquire_Store64"); +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("Release_Store"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("NoBarrier_Load"); + return 0; +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("Atomic64 Acquire_Load"); + return 0; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("Atomic64 Release_Load"); + return 0; +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 
old_value, + Atomic64 new_value) { + NotImplementedFatalError("Atomic64 Acquire_CompareAndSwap"); + return 0; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + NotImplementedFatalError("Atomic64 Release_CompareAndSwap"); + return 0; +} + +} // namespace base::subtle +} // namespace base + +#endif // BASE_AUXILIARY_ATOMICOPS_INTERNALS_ARM_GENERIC_H_ diff --git a/src/kudu/gutil/auxiliary/atomicops-internals-arm-v6plus.h b/src/kudu/gutil/auxiliary/atomicops-internals-arm-v6plus.h new file mode 100644 index 000000000000..edafc4e3d403 --- /dev/null +++ b/src/kudu/gutil/auxiliary/atomicops-internals-arm-v6plus.h @@ -0,0 +1,378 @@ +// Copyright 2011 Google Inc. +// All Rights Reserved. +// +// based on atomicops-internals by Sanjay Ghemawat +// +// This file is an internal atomic implementation, use base/atomicops.h instead. +// +// This code implements ARM atomics for architectures V6 and newer. + +#ifndef BASE_AUXILIARY_ATOMICOPS_INTERNALS_ARM_V6PLUS_H_ +#define BASE_AUXILIARY_ATOMICOPS_INTERNALS_ARM_V6PLUS_H_ + +#include +#include +#include "kudu/gutil/basictypes.h" // For COMPILE_ASSERT + +// The LDREXD and STREXD instructions in ARM all v7 variants or above. In v6, +// only some variants support it. For simplicity, we only use exclusive +// 64-bit load/store in V7 or above. 
+#if defined(ARMV7) +# define BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD +#endif + +typedef int32_t Atomic32; + +namespace base { +namespace subtle { + +typedef int64_t Atomic64; + +inline void MemoryBarrier() { + __asm__ __volatile__("dmb" : : : "memory"); +} + +// 32-bit low-level ops + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 oldval, res; + do { + __asm__ __volatile__( + "ldrex %1, [%3]\n" + "mov %0, #0\n" + "teq %1, %4\n" + // The following IT (if-then) instruction is needed for the subsequent + // conditional instruction STREXEQ when compiling in THUMB mode. + // In ARM mode, the compiler/assembler will not generate any code for it. + "it eq\n" + "strexeq %0, %5, [%3]\n" + : "=&r" (res), "=&r" (oldval), "+Qo" (*ptr) + : "r" (ptr), "Ir" (old_value), "r" (new_value) + : "cc"); + } while (res); + return oldval; +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 tmp, old; + __asm__ __volatile__( + "1:\n" + "ldrex %1, [%2]\n" + "strex %0, %3, [%2]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (tmp), "=&r" (old) + : "r" (ptr), "r" (new_value) + : "cc", "memory"); + return old; +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + Atomic32 old_value = NoBarrier_AtomicExchange(ptr, new_value); + MemoryBarrier(); + return old_value; +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + MemoryBarrier(); + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + Atomic32 tmp, res; + __asm__ __volatile__( + "1:\n" + "ldrex %1, [%2]\n" + "add %1, %1, %3\n" + "strex %0, %1, [%2]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (tmp), "=&r"(res) + : "r" (ptr), "r"(increment) + : "cc", "memory"); + return res; +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 
increment) { + Atomic32 tmp, res; + __asm__ __volatile__( + "1:\n" + "ldrex %1, [%2]\n" + "add %1, %1, %3\n" + "dmb\n" + "strex %0, %1, [%2]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (tmp), "=&r"(res) + : "r" (ptr), "r"(increment) + : "cc", "memory"); + return res; +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + Atomic32 value = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; + MemoryBarrier(); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + MemoryBarrier(); + *ptr = value; +} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + MemoryBarrier(); + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +// 64-bit versions are only available if LDREXD and STREXD instructions +// are available. +#ifdef BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD + +#define BASE_HAS_ATOMIC64 1 + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 oldval, res; + do { + __asm__ __volatile__( + "ldrexd %1, [%3]\n" + "mov %0, #0\n" + "teq %Q1, %Q4\n" + // The following IT (if-then) instructions are needed for the subsequent + // conditional instructions when compiling in THUMB mode. + // In ARM mode, the compiler/assembler will not generate any code for it. 
+ "it eq\n" + "teqeq %R1, %R4\n" + "it eq\n" + "strexdeq %0, %5, [%3]\n" + : "=&r" (res), "=&r" (oldval), "+Q" (*ptr) + : "r" (ptr), "Ir" (old_value), "r" (new_value) + : "cc"); + } while (res); + return oldval; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + int store_failed; + Atomic64 old; + __asm__ __volatile__( + "1:\n" + "ldrexd %1, [%2]\n" + "strexd %0, %3, [%2]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (store_failed), "=&r" (old) + : "r" (ptr), "r" (new_value) + : "cc", "memory"); + return old; +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 old_value = NoBarrier_AtomicExchange(ptr, new_value); + MemoryBarrier(); + return old_value; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + MemoryBarrier(); + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + int store_failed; + Atomic64 res; + __asm__ __volatile__( + "1:\n" + "ldrexd %1, [%2]\n" + "adds %Q1, %Q1, %Q3\n" + "adc %R1, %R1, %R3\n" + "strexd %0, %1, [%2]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (store_failed), "=&r"(res) + : "r" (ptr), "r"(increment) + : "cc", "memory"); + return res; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + int store_failed; + Atomic64 res; + __asm__ __volatile__( + "1:\n" + "ldrexd %1, [%2]\n" + "adds %Q1, %Q1, %Q3\n" + "adc %R1, %R1, %R3\n" + "dmb\n" + "strexd %0, %1, [%2]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (store_failed), "=&r"(res) + : "r" (ptr), "r"(increment) + : "cc", "memory"); + return res; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + int store_failed; + Atomic64 dummy; + __asm__ __volatile__( + "1:\n" + // Dummy load to lock cache line. 
+ "ldrexd %1, [%3]\n" + "strexd %0, %2, [%3]\n" + "teq %0, #0\n" + "bne 1b" + : "=&r" (store_failed), "=&r"(dummy) + : "r"(value), "r" (ptr) + : "cc", "memory"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + Atomic64 res; + __asm__ __volatile__( + "ldrexd %0, [%1]\n" + "clrex\n" + : "=r" (res) + : "r"(ptr), "Q"(*ptr)); + return res; +} + +#else // BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD + +inline void NotImplementedFatalError(const char *function_name) { + fprintf(stderr, "64-bit %s() not implemented on this platform\n", + function_name); + abort(); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_CompareAndSwap"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_AtomicExchange"); + return 0; +} + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("Acquire_AtomicExchange"); + return 0; +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + NotImplementedFatalError("Release_AtomicExchange"); + return 0; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("NoBarrier_AtomicIncrement"); + return 0; +} + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + NotImplementedFatalError("Barrier_AtomicIncrement"); + return 0; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("NoBarrier_Store"); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("NoBarrier_Load"); + return 0; +} + +#endif // BASE_ATOMICOPS_HAS_LDREXD_AND_STREXD + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NoBarrier_Store(ptr, value); + MemoryBarrier(); +} + 
+inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + MemoryBarrier(); + NoBarrier_Store(ptr, value); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 value = NoBarrier_Load(ptr); + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} + +inline Atomic64 Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 value = NoBarrier_CompareAndSwap(ptr, old_value, new_value); + MemoryBarrier(); + return value; +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + MemoryBarrier(); + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +} // namespace subtle ends +} // namespace base ends + +#endif // BASE_AUXILIARY_ATOMICOPS_INTERNALS_ARM_V6PLUS_H_ diff --git a/src/kudu/gutil/auxiliary/atomicops-internals-windows.h b/src/kudu/gutil/auxiliary/atomicops-internals-windows.h new file mode 100644 index 000000000000..96674dc41459 --- /dev/null +++ b/src/kudu/gutil/auxiliary/atomicops-internals-windows.h @@ -0,0 +1,508 @@ +// Copyright 2003 Google Inc. +// All Rights Reserved. +// +// +// Implementation of atomic operations using Windows API +// functions. This file should not be included directly. Clients +// should instead include "base/atomicops.h". + +#ifndef BASE_AUXILIARY_ATOMICOPS_INTERNALS_WINDOWS_H_ +#define BASE_AUXILIARY_ATOMICOPS_INTERNALS_WINDOWS_H_ + +#include +#include +#include "kudu/gutil/basictypes.h" // For COMPILE_ASSERT + +typedef int32 Atomic32; + +#if defined(_WIN64) +#define BASE_HAS_ATOMIC64 1 // Use only in tests and base/atomic* +#endif + +namespace base { +namespace subtle { + +typedef int64 Atomic64; + +// On windows, we assume intrinsics and asms are compiler barriers +// This is redefined below if using gcc asms. 
+#define ATOMICOPS_COMPILER_BARRIER() + + +// 32-bit low-level operations on any platform + +extern "C" { +// We use windows intrinsics when we can (they seem to be supported +// well on MSVC 8.0 and above). Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation: +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +// Therefore, we simply declare the relevant intrinsics ourself. + +// MinGW has a bug in the header files where it doesn't indicate the +// first argument is volatile -- they're not up to date. See +// http://readlist.com/lists/lists.sourceforge.net/mingw-users/0/3861.html +// We have to const_cast away the volatile to avoid compiler warnings. +// TODO(user): remove this once MinGW has updated MinGW/include/winbase.h +#if defined(__MINGW32__) +inline LONG FastInterlockedCompareExchange(volatile LONG* ptr, + LONG newval, LONG oldval) { + return ::InterlockedCompareExchange(const_cast(ptr), newval, oldval); +} +inline LONG FastInterlockedExchange(volatile LONG* ptr, LONG newval) { + return ::InterlockedExchange(const_cast(ptr), newval); +} +inline LONG FastInterlockedExchangeAdd(volatile LONG* ptr, LONG increment) { + return ::InterlockedExchangeAdd(const_cast(ptr), increment); +} + +#elif _MSC_VER >= 1400 // intrinsics didn't work so well before MSVC 8.0 +// Unfortunately, in some environments, and +// have conflicting declarations of some intrinsics, breaking +// compilation. So we declare the intrinsics we need ourselves. 
See +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +LONG _InterlockedCompareExchange(volatile LONG* ptr, LONG newval, LONG oldval); +#pragma intrinsic(_InterlockedCompareExchange) +inline LONG FastInterlockedCompareExchange(volatile LONG* ptr, + LONG newval, LONG oldval) { + return _InterlockedCompareExchange(ptr, newval, oldval); +} + +LONG _InterlockedExchange(volatile LONG* ptr, LONG newval); +#pragma intrinsic(_InterlockedExchange) +inline LONG FastInterlockedExchange(volatile LONG* ptr, LONG newval) { + return _InterlockedExchange(ptr, newval); +} + +LONG _InterlockedExchangeAdd(volatile LONG* ptr, LONG increment); +#pragma intrinsic(_InterlockedExchangeAdd) +inline LONG FastInterlockedExchangeAdd(volatile LONG* ptr, LONG increment) { + return _InterlockedExchangeAdd(ptr, increment); +} + +#else +inline LONG FastInterlockedCompareExchange(volatile LONG* ptr, + LONG newval, LONG oldval) { + return ::InterlockedCompareExchange(ptr, newval, oldval); +} +inline LONG FastInterlockedExchange(volatile LONG* ptr, LONG newval) { + return ::InterlockedExchange(ptr, newval); +} +inline LONG FastInterlockedExchangeAdd(volatile LONG* ptr, LONG increment) { + return ::InterlockedExchangeAdd(ptr, increment); +} + +#endif // ifdef __MINGW32__ +} // extern "C" + +inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + LONG result = FastInterlockedCompareExchange( + reinterpret_cast(ptr), + static_cast(new_value), + static_cast(old_value)); + return static_cast(result); +} + +inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + LONG result = FastInterlockedExchange( + reinterpret_cast(ptr), + static_cast(new_value)); + return static_cast(result); +} + +inline Atomic32 Acquire_AtomicExchange(volatile Atomic32* ptr, + Atomic32 new_value) { + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic32 Release_AtomicExchange(volatile Atomic32* ptr, + 
Atomic32 new_value) { + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return FastInterlockedExchangeAdd( + reinterpret_cast(ptr), + static_cast(increment)) + increment; +} + +inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, + Atomic32 increment) { + return Barrier_AtomicIncrement(ptr, increment); +} + +} // namespace base::subtle +} // namespace base + + +// In msvc8/vs2005, winnt.h already contains a definition for +// MemoryBarrier in the global namespace. Add it there for earlier +// versions and forward to it from within the namespace. +#if !(_MSC_VER && _MSC_VER >= 1400) +inline void MemoryBarrier() { + Atomic32 value = 0; + base::subtle::NoBarrier_AtomicExchange(&value, 0); + // actually acts as a barrier in thisd implementation +} +#endif + +namespace base { +namespace subtle { + +inline void MemoryBarrier() { + ::MemoryBarrier(); +} + +inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, + Atomic32 old_value, + Atomic32 new_value) { + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { + Acquire_AtomicExchange(ptr, value); +} + +inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { + *ptr = value; // works w/o barrier for current Intel chips as of June 2005 + // See comments in Atomic64 version of Release_Store() below. 
+} + +inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { + return *ptr; +} + +inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { + Atomic32 value = *ptr; + return value; +} + +inline Atomic32 Release_Load(volatile const Atomic32* ptr) { + MemoryBarrier(); + return *ptr; +} + +// 64-bit operations + +#if defined(_WIN64) || defined(__MINGW64__) + +// 64-bit low-level operations on 64-bit platform. + +COMPILE_ASSERT(sizeof(Atomic64) == sizeof(PVOID), atomic_word_is_atomic); + +// These are the intrinsics needed for 64-bit operations. Similar to the +// 32-bit case above. + +extern "C" { +#if defined(__MINGW64__) +inline PVOID FastInterlockedCompareExchangePointer(volatile PVOID* ptr, + PVOID newval, PVOID oldval) { + return ::InterlockedCompareExchangePointer(const_cast(ptr), + newval, oldval); +} +inline PVOID FastInterlockedExchangePointer(volatile PVOID* ptr, PVOID newval) { + return ::InterlockedExchangePointer(const_cast(ptr), newval); +} +inline LONGLONG FastInterlockedExchangeAdd64(volatile LONGLONG* ptr, + LONGLONG increment) { + return ::InterlockedExchangeAdd64(const_cast(ptr), increment); +} + +#elif _MSC_VER >= 1400 // intrinsics didn't work so well before MSVC 8.0 +// Like above, we need to declare the intrinsics ourselves. 
+PVOID _InterlockedCompareExchangePointer(volatile PVOID* ptr, + PVOID newval, PVOID oldval); +#pragma intrinsic(_InterlockedCompareExchangePointer) +inline PVOID FastInterlockedCompareExchangePointer(volatile PVOID* ptr, + PVOID newval, PVOID oldval) { + return _InterlockedCompareExchangePointer(const_cast(ptr), + newval, oldval); +} + +PVOID _InterlockedExchangePointer(volatile PVOID* ptr, PVOID newval); +#pragma intrinsic(_InterlockedExchangePointer) +inline PVOID FastInterlockedExchangePointer(volatile PVOID* ptr, PVOID newval) { + return _InterlockedExchangePointer(const_cast(ptr), newval); +} + +LONGLONG _InterlockedExchangeAdd64(volatile LONGLONG* ptr, LONGLONG increment); +#pragma intrinsic(_InterlockedExchangeAdd64) +inline LONGLONG FastInterlockedExchangeAdd64(volatile LONGLONG* ptr, + LONGLONG increment) { + return _InterlockedExchangeAdd64(const_cast(ptr), increment); +} + +#else +inline PVOID FastInterlockedCompareExchangePointer(volatile PVOID* ptr, + PVOID newval, PVOID oldval) { + return ::InterlockedCompareExchangePointer(ptr, newval, oldval); +} +inline PVOID FastInterlockedExchangePointer(volatile PVOID* ptr, PVOID newval) { + return ::InterlockedExchangePointer(ptr, newval); +} +inline LONGLONG FastInterlockedExchangeAdd64(volatile LONGLONG* ptr, + LONGLONG increment) { + return ::InterlockedExchangeAdd64(ptr, increment); +} + +#endif // ifdef __MINGW64__ +} // extern "C" + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + PVOID result = FastInterlockedCompareExchangePointer( + reinterpret_cast(ptr), + reinterpret_cast(new_value), reinterpret_cast(old_value)); + return reinterpret_cast(result); +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + PVOID result = FastInterlockedExchangePointer( + reinterpret_cast(ptr), + reinterpret_cast(new_value)); + return reinterpret_cast(result); +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile 
Atomic64* ptr, + Atomic64 increment) { + return FastInterlockedExchangeAdd64( + reinterpret_cast(ptr), + static_cast(increment)) + increment; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + *ptr = value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + NoBarrier_AtomicExchange(ptr, value); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + return *ptr; +} + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return *ptr; +} + +#else // defined(_WIN64) || defined(__MINGW64__) + +// 64-bit low-level operations on 32-bit platform + +#if defined(_MSC_VER) +// Windows, 32-bit ABI, with MSVC compiler. + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* p) { + Atomic64 value; + __asm { + mov eax, p + movq mm0, [eax] // Use mmx reg for 64-bit atomic moves + movq value, mm0 + emms // Empty mmx state to enable FP registers + } + return value; +} + +inline void NoBarrier_Store(volatile Atomic64* p, Atomic64 value) { + __asm { + mov eax, p + movq mm0, value // Use mmx reg for 64-bit atomic moves + movq [eax], mm0 + emms // Empty mmx state to enable FP registers + } +} + +#pragma warning(push) +#pragma warning(disable : 4035) // disable the warning about no return statement + // in NoBarrier_CompareAndSwap() + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* p, + Atomic64 old_value, + Atomic64 new_value) { + __asm { + lea edi, old_value + lea esi, new_value + mov eax, [edi] + mov edx, [edi+4] + mov ebx, [esi] + mov ecx, [esi+4] + mov edi, p + lock cmpxchg8b [edi] + } + // There's no explcit return statement, so the warning is disabled above. + // The result is returned in edx,eax +} +#pragma warning(pop) + +#elif defined(__MINGW32__) +// Windows, 32-bit ABI, with GNU compiler. 
+ +#undef ATOMICOPS_COMPILER_BARRIER +#define ATOMICOPS_COMPILER_BARRIER() __asm__ __volatile__("" : : : "memory") + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + Atomic64 value; + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Empty mmx state/Reset FP regs + : "=m" (value) + : "m" (*ptr) + : // mark the FP stack and mmx registers as clobbered + "st", "st(1)", "st(2)", "st(3)", "st(4)", + "st(5)", "st(6)", "st(7)", "mm0", "mm1", + "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"); + return value; +} + +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + __asm__ __volatile__("movq %1, %%mm0\n\t" // Use mmx reg for 64-bit atomic + "movq %%mm0, %0\n\t" // moves (ptr could be read-only) + "emms\n\t" // Empty mmx state/Reset FP regs + : "=m" (*ptr) + : "m" (value) + : // mark the FP stack and mmx registers as clobbered + "st", "st(1)", "st(2)", "st(3)", "st(4)", + "st(5)", "st(6)", "st(7)", "mm0", "mm1", + "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"); +} + +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + Atomic64 prev; + __asm__ __volatile__("push %%ebx\n\t" + "movl (%3), %%ebx\n\t" // Move 64-bit new_value into + "movl 4(%3), %%ecx\n\t" // ecx:ebx + "lock; cmpxchg8b (%1)\n\t"// If edx:eax (old_value) same + "pop %%ebx\n\t" + : "=A" (prev) // as contents of ptr: + : "D" (ptr), // ecx:ebx => ptr + "0" (old_value), // else: + "S" (&new_value) // old *ptr => edx:eax + : "memory", "%ecx"); + return prev; +} + +#else +// Windows, 32-bit ABI, but not Microsoft compiler, or GNU compiler. 
+ +inline void NotImplementedFatalError(const char *function_name) { + fprintf(stderr, "64-bit %s() not implemented on this platform\n", + function_name); + abort(); +} + +inline Atomic64 NoBarrier_Load(volatile const Atomic64* ptr) { + NotImplementedFatalError("NoBarrier_Load(Atomic64 *)"); +} +inline void NoBarrier_Store(volatile Atomic64* ptr, Atomic64 value) { + NotImplementedFatalError("NoBarrier_Store(Atomic64 *, ...)"); +} +inline Atomic64 NoBarrier_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + NotImplementedFatalError("NoBarrier_CompareAndSwap(Atomic64 *, ...)"); +} +#endif + + +inline Atomic64 Release_Load(volatile const Atomic64* ptr) { + MemoryBarrier(); + return NoBarrier_Load(ptr); +} + +inline Atomic64 NoBarrier_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + Atomic64 old_value; + do { + old_value = NoBarrier_Load(ptr); + } while (NoBarrier_CompareAndSwap(ptr, old_value, new_value) != old_value); + return old_value; +} + +inline Atomic64 NoBarrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + Atomic64 old_value; + Atomic64 new_value; + do { + old_value = NoBarrier_Load(ptr); + new_value = old_value + increment; + } while (NoBarrier_CompareAndSwap(ptr, old_value, new_value) != old_value); + return new_value; +} + +inline void Acquire_Store(volatile Atomic64* ptr, Atomic64 value) { + // acquire and release are implicit in x86 AtomicExchange + NoBarrier_AtomicExchange(ptr, value); +} + +#endif // defined(_WIN64) || defined(__MINGW64__) + +inline Atomic64 Acquire_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + // acquire and release are implicit in x86 AtomicExchange + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic64 Release_AtomicExchange(volatile Atomic64* ptr, + Atomic64 new_value) { + // acquire and release are implicit in x86 AtomicExchange + return NoBarrier_AtomicExchange(ptr, new_value); +} + +inline Atomic64 
Acquire_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + // acquire and release are implicit in x86 CompareAndSwap + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + +inline Atomic64 Release_CompareAndSwap(volatile Atomic64* ptr, + Atomic64 old_value, + Atomic64 new_value) { + // acquire and release are implicit in x86 CompareAndSwap + return NoBarrier_CompareAndSwap(ptr, old_value, new_value); +} + + +inline Atomic64 Barrier_AtomicIncrement(volatile Atomic64* ptr, + Atomic64 increment) { + // barriers are implicit in atomic increment on on the x86 + return NoBarrier_AtomicIncrement(ptr, increment); +} + +inline Atomic64 Acquire_Load(volatile const Atomic64* ptr) { + Atomic64 result = NoBarrier_Load(ptr); // acquire is implicit in x86 loads + ATOMICOPS_COMPILER_BARRIER(); + return result; +} + +inline void Release_Store(volatile Atomic64* ptr, Atomic64 value) { + ATOMICOPS_COMPILER_BARRIER(); + NoBarrier_Store(ptr, value); // release is implicit on x86 stores +} + +#undef ATOMICOPS_COMPILER_BARRIER + +} // namespace base::subtle +} // namespace base + +#endif // BASE_AUXILIARY_ATOMICOPS_INTERNALS_WINDOWS_H_ diff --git a/src/kudu/gutil/basictypes.h b/src/kudu/gutil/basictypes.h new file mode 100644 index 000000000000..1c095ca2e1f4 --- /dev/null +++ b/src/kudu/gutil/basictypes.h @@ -0,0 +1,32 @@ +// Copyright 2001 - 2003 Google, Inc. +// +// Google-specific types + +#ifndef BASE_BASICTYPES_H_ +#define BASE_BASICTYPES_H_ + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" + +// Argument type used in interfaces that can optionally take ownership +// of a passed in argument. If TAKE_OWNERSHIP is passed, the called +// object takes ownership of the argument. Otherwise it does not. +enum Ownership { + DO_NOT_TAKE_OWNERSHIP, + TAKE_OWNERSHIP +}; + +// Used to explicitly mark the return value of a function as unused. 
If you are +// really sure you don't want to do anything with the return value of a function +// that has been marked WARN_UNUSED_RESULT, wrap it with this. Example: +// +// scoped_ptr my_var = ...; +// if (TakeOwnership(my_var.get()) == SUCCESS) +// ignore_result(my_var.release()); +// +template +inline void ignore_result(const T&) { +} + + +#endif // BASE_BASICTYPES_H_ diff --git a/src/kudu/gutil/bind.h b/src/kudu/gutil/bind.h new file mode 100644 index 000000000000..8875f7061e48 --- /dev/null +++ b/src/kudu/gutil/bind.h @@ -0,0 +1,539 @@ +// This file was GENERATED by command: +// pump.py bind.h.pump +// DO NOT EDIT BY HAND!!! + + +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_BIND_H_ +#define KUDU_GUTIL_BIND_H_ + +#include "kudu/gutil/bind_internal.h" +#include "kudu/gutil/callback_internal.h" + +// ----------------------------------------------------------------------------- +// Usage documentation +// ----------------------------------------------------------------------------- +// +// See kudu/gutil/callback.h for documentation. +// +// +// ----------------------------------------------------------------------------- +// Implementation notes +// ----------------------------------------------------------------------------- +// +// If you're reading the implementation, before proceeding further, you should +// read the top comment of kudu/gutil/bind_internal.h for a definition of common +// terms and concepts. +// +// RETURN TYPES +// +// Though Bind()'s result is meant to be stored in a Callback<> type, it +// cannot actually return the exact type without requiring a large amount +// of extra template specializations. 
The problem is that in order to +// discern the correct specialization of Callback<>, Bind would need to +// unwrap the function signature to determine the signature's arity, and +// whether or not it is a method. +// +// Each unique combination of (arity, function_type, num_prebound) where +// function_type is one of {function, method, const_method} would require +// one specialization. We eventually have to do a similar number of +// specializations anyways in the implementation (see the Invoker<>, +// classes). However, it is avoidable in Bind if we return the result +// via an indirection like we do below. +// +// TODO(ajwong): We might be able to avoid this now, but need to test. +// +// It is possible to move most of the COMPILE_ASSERT asserts into BindState<>, +// but it feels a little nicer to have the asserts here so people do not +// need to crack open bind_internal.h. On the other hand, it makes Bind() +// harder to read. + +namespace kudu { + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void()> + ::UnboundRunType> +Bind(Functor functor) { + // Typedefs for how to store and run the functor. + typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + typedef internal::BindState BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor))); +} + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1) { + // Typedefs for how to store and run the functor. 
+ typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !(base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. + COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + typedef internal::BindState::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1)); +} + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1, const P2& p2) { + // Typedefs for how to store and run the functor. 
+ typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !(base::is_non_const_reference::value || + base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. 
+ COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p2_is_refcounted_type_and_needs_scoped_refptr); + typedef internal::BindState::StorageType, + typename internal::CallbackParamTraits::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1, p2)); +} + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1, const P2& p2, const P3& p3) { + // Typedefs for how to store and run the functor. + typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. 
+ COMPILE_ASSERT( + !(base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. + COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p2_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p3_is_refcounted_type_and_needs_scoped_refptr); + typedef internal::BindState::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1, p2, p3)); +} + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1, const P2& p2, const P3& p3, const P4& p4) { + // Typedefs for how to store and run the functor. 
+ typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !(base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. 
+ COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p2_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p3_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p4_is_refcounted_type_and_needs_scoped_refptr); + typedef internal::BindState::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1, p2, p3, p4)); +} + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1, const P2& p2, const P3& p3, const P4& p4, + const P5& p5) { + // Typedefs for how to store and run the functor. + typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. 
Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !(base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. + COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p2_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p3_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p4_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p5_is_refcounted_type_and_needs_scoped_refptr); + typedef internal::BindState::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1, p2, p3, p4, p5)); +} + +template +Callback< + typename internal::BindState< + typename 
internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1, const P2& p2, const P3& p3, const P4& p4, + const P5& p5, const P6& p6) { + // Typedefs for how to store and run the functor. + typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !(base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. 
+ COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p2_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p3_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p4_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p5_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p6_is_refcounted_type_and_needs_scoped_refptr); + typedef internal::BindState::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1, p2, p3, p4, p5, p6)); +} + +template +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void(typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> + ::UnboundRunType> +Bind(Functor functor, const P1& p1, const P2& p2, const P3& p3, const P4& p4, + const P5& p5, const P6& p6, const P7& p7) { + // 
Typedefs for how to store and run the functor. + typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !(base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value || + base::is_non_const_reference::value ), + do_not_bind_functions_with_nonconst_ref); + + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. 
+ COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p1_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p2_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p3_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p4_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p5_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p6_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p7_is_refcounted_type_and_needs_scoped_refptr); + typedef internal::BindState::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType, + typename internal::CallbackParamTraits::StorageType)> BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor), p1, p2, p3, p4, p5, p6, + p7)); +} + +} // namespace kudu + +#endif // KUDU_GUTIL_BIND_H_ diff --git a/src/kudu/gutil/bind.h.pump b/src/kudu/gutil/bind.h.pump new file mode 100644 index 000000000000..2bf6a91e46b1 --- /dev/null +++ b/src/kudu/gutil/bind.h.pump @@ -0,0 +1,153 @@ +$$ This is a pump file for generating file templates. Pump is a python +$$ script that is part of the Google Test suite of utilities. 
Description +$$ can be found here: +$$ +$$ http://code.google.com/p/googletest/wiki/PumpManual +$$ + +$$ +$$ MAX_ARITY controls the number of arguments that Bind() supports. +$$ The amount of code, and more importantly, the number of template types +$$ generated by pump grows at O(MAX_ARITY^2). +$$ +$$ We tried going to 11 and found it imposed an extra 10 penalty on windows +$$ cycle times compared to our original baseline of 6. +$$ +$$ Currently 7 is chosen as a compromise between supporting a convenient +$$ number of arguments and keeping compile times low. At 7, we have 115 +$$ templates being generated by pump. +$$ +$$ Be careful when adjusting this number. If people find a need to bind +$$ a larger number of arguments, consider refactoring the function to use +$$ a param struct instead of raising the MAX_ARITY. +$$ +$$ See http://crbug.com/98542 for more context. +$$ +$var MAX_ARITY = 7 + +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_BIND_H_ +#define KUDU_GUTIL_BIND_H_ + +#include "kudu/gutil/bind_internal.h" +#include "kudu/gutil/callback_internal.h" + +// ----------------------------------------------------------------------------- +// Usage documentation +// ----------------------------------------------------------------------------- +// +// See kudu/gutil/callback.h for documentation. +// +// +// ----------------------------------------------------------------------------- +// Implementation notes +// ----------------------------------------------------------------------------- +// +// If you're reading the implementation, before proceeding further, you should +// read the top comment of kudu/gutil/bind_internal.h for a definition of common +// terms and concepts. 
+// +// RETURN TYPES +// +// Though Bind()'s result is meant to be stored in a Callback<> type, it +// cannot actually return the exact type without requiring a large amount +// of extra template specializations. The problem is that in order to +// discern the correct specialization of Callback<>, Bind would need to +// unwrap the function signature to determine the signature's arity, and +// whether or not it is a method. +// +// Each unique combination of (arity, function_type, num_prebound) where +// function_type is one of {function, method, const_method} would require +// one specialization. We eventually have to do a similar number of +// specializations anyways in the implementation (see the Invoker<>, +// classes). However, it is avoidable in Bind if we return the result +// via an indirection like we do below. +// +// TODO(ajwong): We might be able to avoid this now, but need to test. +// +// It is possible to move most of the COMPILE_ASSERT asserts into BindState<>, +// but it feels a little nicer to have the asserts here so people do not +// need to crack open bind_internal.h. On the other hand, it makes Bind() +// harder to read. + +namespace kudu { + +$range ARITY 0..MAX_ARITY +$for ARITY [[ +$range ARG 1..ARITY + +template 0 [[, ]] $for ARG , [[typename P$(ARG)]]> +Callback< + typename internal::BindState< + typename internal::FunctorTraits::RunnableType, + typename internal::FunctorTraits::RunType, + void($for ARG , [[typename internal::CallbackParamTraits::StorageType]])> + ::UnboundRunType> +Bind(Functor functor +$if ARITY > 0 [[, ]] $for ARG , [[const P$(ARG)& p$(ARG)]]) { + // Typedefs for how to store and run the functor. 
+ typedef typename internal::FunctorTraits::RunnableType RunnableType; + typedef typename internal::FunctorTraits::RunType RunType; + +$if ARITY > 0 [[ + + // Use RunnableType::RunType instead of RunType above because our + // checks should below for bound references need to know what the actual + // functor is going to interpret the argument as. + typedef internal::FunctionTraits + BoundFunctorTraits; + + // Do not allow binding a non-const reference parameter. Non-const reference + // parameters are disallowed by the Google style guide. Also, binding a + // non-const reference parameter can make for subtle bugs because the + // invoked function will receive a reference to the stored copy of the + // argument and not the original. + COMPILE_ASSERT( + !($for ARG || [[ +base::is_non_const_reference::value ]]), + do_not_bind_functions_with_nonconst_ref); + +]] + + +$for ARG [[ + + +$if ARG == 1 [[ + // For methods, we need to be careful for parameter 1. We do not require + // a scoped_refptr because BindState<> itself takes care of AddRef() for + // methods. We also disallow binding of an array as the method's target + // object. 
+ COMPILE_ASSERT( + internal::HasIsMethodTag::value || + !internal::NeedsScopedRefptrButGetsRawPtr::value, + p$(ARG)_is_refcounted_type_and_needs_scoped_refptr); + COMPILE_ASSERT(!internal::HasIsMethodTag::value || + !base::is_array::value, + first_bound_argument_to_method_cannot_be_array); +]] $else [[ + COMPILE_ASSERT(!internal::NeedsScopedRefptrButGetsRawPtr::value, + p$(ARG)_is_refcounted_type_and_needs_scoped_refptr); +]] $$ $if ARG + +]] $$ $for ARG + + typedef internal::BindState::StorageType]])> [[]] +BindState; + + + return Callback( + new BindState(internal::MakeRunnable(functor)[[]] +$if ARITY > 0 [[, ]] $for ARG , [[p$(ARG)]])); +} + +]] $$ for ARITY + +} // namespace kudu + +#endif // KUDU_GUTIL_BIND_H_ diff --git a/src/kudu/gutil/bind_helpers.h b/src/kudu/gutil/bind_helpers.h new file mode 100644 index 000000000000..8a106a68ee27 --- /dev/null +++ b/src/kudu/gutil/bind_helpers.h @@ -0,0 +1,551 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This defines a set of argument wrappers and related factory methods that +// can be used specify the refcounting and reference semantics of arguments +// that are bound by the Bind() function in kudu/gutil/bind.h. +// +// It also defines a set of simple functions and utilities that people want +// when using Callback<> and Bind(). +// +// +// ARGUMENT BINDING WRAPPERS +// +// The wrapper functions are kudu::Unretained(), kudu::Owned(), kudu::Passed(), +// kudu::ConstRef(), and kudu::IgnoreResult(). +// +// Unretained() allows Bind() to bind a non-refcounted class, and to disable +// refcounting on arguments that are refcounted objects. +// +// Owned() transfers ownership of an object to the Callback resulting from +// bind; the object will be deleted when the Callback is deleted. +// +// Passed() is for transferring movable-but-not-copyable types (eg. 
scoped_ptr) +// through a Callback. Logically, this signifies a destructive transfer of +// the state of the argument into the target function. Invoking +// Callback::Run() twice on a Callback that was created with a Passed() +// argument will CHECK() because the first invocation would have already +// transferred ownership to the target function. +// +// ConstRef() allows binding a constant reference to an argument rather +// than a copy. +// +// IgnoreResult() is used to adapt a function or Callback with a return type to +// one with a void return. This is most useful if you have a function with, +// say, a pesky ignorable bool return that you want to use with PostTask or +// something else that expect a Callback with a void return. +// +// EXAMPLE OF Unretained(): +// +// class Foo { +// public: +// void func() { cout << "Foo:f" << endl; } +// }; +// +// // In some function somewhere. +// Foo foo; +// Closure foo_callback = +// Bind(&Foo::func, Unretained(&foo)); +// foo_callback.Run(); // Prints "Foo:f". +// +// Without the Unretained() wrapper on |&foo|, the above call would fail +// to compile because Foo does not support the AddRef() and Release() methods. +// +// +// EXAMPLE OF Owned(): +// +// void foo(int* arg) { cout << *arg << endl } +// +// int* pn = new int(1); +// Closure foo_callback = Bind(&foo, Owned(pn)); +// +// foo_callback.Run(); // Prints "1" +// foo_callback.Run(); // Prints "1" +// *n = 2; +// foo_callback.Run(); // Prints "2" +// +// foo_callback.Reset(); // |pn| is deleted. Also will happen when +// // |foo_callback| goes out of scope. +// +// Without Owned(), someone would have to know to delete |pn| when the last +// reference to the Callback is deleted. 
+// +// +// EXAMPLE OF ConstRef(): +// +// void foo(int arg) { cout << arg << endl } +// +// int n = 1; +// Closure no_ref = Bind(&foo, n); +// Closure has_ref = Bind(&foo, ConstRef(n)); +// +// no_ref.Run(); // Prints "1" +// has_ref.Run(); // Prints "1" +// +// n = 2; +// no_ref.Run(); // Prints "1" +// has_ref.Run(); // Prints "2" +// +// Note that because ConstRef() takes a reference on |n|, |n| must outlive all +// its bound callbacks. +// +// +// EXAMPLE OF IgnoreResult(): +// +// int DoSomething(int arg) { cout << arg << endl; } +// +// // Assign to a Callback with a void return type. +// Callback cb = Bind(IgnoreResult(&DoSomething)); +// cb->Run(1); // Prints "1". +// +// // Prints "1" on |ml|. +// ml->PostTask(FROM_HERE, Bind(IgnoreResult(&DoSomething), 1); +// +// +// EXAMPLE OF Passed(): +// +// void TakesOwnership(scoped_ptr arg) { } +// scoped_ptr CreateFoo() { return scoped_ptr(new Foo()); } +// +// scoped_ptr f(new Foo()); +// +// // |cb| is given ownership of Foo(). |f| is now NULL. +// // You can use f.Pass() in place of &f, but it's more verbose. +// Closure cb = Bind(&TakesOwnership, Passed(&f)); +// +// // Run was never called so |cb| still owns Foo() and deletes +// // it on Reset(). +// cb.Reset(); +// +// // |cb| is given a new Foo created by CreateFoo(). +// cb = Bind(&TakesOwnership, Passed(CreateFoo())); +// +// // |arg| in TakesOwnership() is given ownership of Foo(). |cb| +// // no longer owns Foo() and, if reset, would not delete Foo(). +// cb.Run(); // Foo() is now transferred to |arg| and deleted. +// cb.Run(); // This CHECK()s since Foo() already been used once. +// +// Passed() is particularly useful with PostTask() when you are transferring +// ownership of an argument into a task, but don't necessarily know if the +// task will always be executed. This can happen if the task is cancellable +// or if it is posted to a MessageLoopProxy. +// +// +// SIMPLE FUNCTIONS AND UTILITIES. 
+// +// DoNothing() - Useful for creating a Closure that does nothing when called. +// DeletePointer() - Useful for creating a Closure that will delete a +// pointer when invoked. Only use this when necessary. +// In most cases MessageLoop::DeleteSoon() is a better +// fit. + +#ifndef KUDU_GUTIL_BIND_HELPERS_H_ +#define KUDU_GUTIL_BIND_HELPERS_H_ + +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/template_util.h" + +// Unneeded define from Chromium +#define BASE_EXPORT + + +namespace kudu { +namespace internal { + +// Use the Substitution Failure Is Not An Error (SFINAE) trick to inspect T +// for the existence of AddRef() and Release() functions of the correct +// signature. +// +// http://en.wikipedia.org/wiki/Substitution_failure_is_not_an_error +// http://stackoverflow.com/questions/257288/is-it-possible-to-write-a-c-template-to-check-for-a-functions-existence +// http://stackoverflow.com/questions/4358584/sfinae-approach-comparison +// http://stackoverflow.com/questions/1966362/sfinae-to-check-for-inherited-member-functions +// +// The last link in particular show the method used below. +// +// For SFINAE to work with inherited methods, we need to pull some extra tricks +// with multiple inheritance. In the more standard formulation, the overloads +// of Check would be: +// +// template +// Yes NotTheCheckWeWant(Helper<&C::TargetFunc>*); +// +// template +// No NotTheCheckWeWant(...); +// +// static const bool value = sizeof(NotTheCheckWeWant(0)) == sizeof(Yes); +// +// The problem here is that template resolution will not match +// C::TargetFunc if TargetFunc does not exist directly in C. That is, if +// TargetFunc in inherited from an ancestor, &C::TargetFunc will not match, +// |value| will be false. This formulation only checks for whether or +// not TargetFunc exist directly in the class being introspected. +// +// To get around this, we play a dirty trick with multiple inheritance. 
+// First, We create a class BaseMixin that declares each function that we +// want to probe for. Then we create a class Base that inherits from both T +// (the class we wish to probe) and BaseMixin. Note that the function +// signature in BaseMixin does not need to match the signature of the function +// we are probing for; thus it's easiest to just use void(void). +// +// Now, if TargetFunc exists somewhere in T, then &Base::TargetFunc has an +// ambiguous resolution between BaseMixin and T. This lets us write the +// following: +// +// template +// No GoodCheck(Helper<&C::TargetFunc>*); +// +// template +// Yes GoodCheck(...); +// +// static const bool value = sizeof(GoodCheck(0)) == sizeof(Yes); +// +// Notice here that the variadic version of GoodCheck() returns Yes here +// instead of No like the previous one. Also notice that we calculate |value| +// by specializing GoodCheck() on Base instead of T. +// +// We've reversed the roles of the variadic, and Helper overloads. +// GoodCheck(Helper<&C::TargetFunc>*), when C = Base, fails to be a valid +// substitution if T::TargetFunc exists. Thus GoodCheck(0) will resolve +// to the variadic version if T has TargetFunc. If T::TargetFunc does not +// exist, then &C::TargetFunc is not ambiguous, and the overload resolution +// will prefer GoodCheck(Helper<&C::TargetFunc>*). +// +// This method of SFINAE will correctly probe for inherited names, but it cannot +// typecheck those names. It's still a good enough sanity check though. +// +// Works on gcc-4.2, gcc-4.4, and Visual Studio 2008. +// +// TODO(ajwong): Move to ref_counted.h or template_util.h when we've vetted +// this works well. +// +// TODO(ajwong): Make this check for Release() as well. +// See http://crbug.com/82038. 
+template +class SupportsAddRefAndRelease { + typedef char Yes[1]; + typedef char No[2]; + + struct BaseMixin { + void AddRef(); + }; + +// MSVC warns when you try to use Base if T has a private destructor, the +// common pattern for refcounted types. It does this even though no attempt to +// instantiate Base is made. We disable the warning for this definition. +#if defined(OS_WIN) +#pragma warning(push) +#pragma warning(disable:4624) +#endif + struct Base : public T, public BaseMixin { + }; +#if defined(OS_WIN) +#pragma warning(pop) +#endif + + template struct Helper {}; + + template + static No& Check(Helper<&C::AddRef>*); + + template + static Yes& Check(...); + + public: + static const bool value = sizeof(Check(0)) == sizeof(Yes); +}; + +// Helpers to assert that arguments of a recounted type are bound with a +// scoped_refptr. +template +struct UnsafeBindtoRefCountedArgHelper : base::false_type { +}; + +template +struct UnsafeBindtoRefCountedArgHelper + : base::integral_constant::value> { +}; + +template +struct UnsafeBindtoRefCountedArg : base::false_type { +}; + +template +struct UnsafeBindtoRefCountedArg + : UnsafeBindtoRefCountedArgHelper::value, T> { +}; + +template +class HasIsMethodTag { + typedef char Yes[1]; + typedef char No[2]; + + template + static Yes& Check(typename U::IsMethod*); + + template + static No& Check(...); + + public: + static const bool value = sizeof(Check(0)) == sizeof(Yes); +}; + +template +class UnretainedWrapper { + public: + explicit UnretainedWrapper(T* o) : ptr_(o) {} + T* get() const { return ptr_; } + private: + T* ptr_; +}; + +template +class ConstRefWrapper { + public: + explicit ConstRefWrapper(const T& o) : ptr_(&o) {} + const T& get() const { return *ptr_; } + private: + const T* ptr_; +}; + +template +struct IgnoreResultHelper { + explicit IgnoreResultHelper(T functor) : functor_(functor) {} + + T functor_; +}; + +template +struct IgnoreResultHelper > { + explicit IgnoreResultHelper(const Callback& functor) : 
functor_(functor) {} + + const Callback& functor_; +}; + +// An alternate implementation is to avoid the destructive copy, and instead +// specialize ParamTraits<> for OwnedWrapper<> to change the StorageType to +// a class that is essentially a scoped_ptr<>. +// +// The current implementation has the benefit though of leaving ParamTraits<> +// fully in callback_internal.h as well as avoiding type conversions during +// storage. +template +class OwnedWrapper { + public: + explicit OwnedWrapper(T* o) : ptr_(o) {} + ~OwnedWrapper() { delete ptr_; } + T* get() const { return ptr_; } + OwnedWrapper(const OwnedWrapper& other) { + ptr_ = other.ptr_; + other.ptr_ = NULL; + } + + private: + mutable T* ptr_; +}; + +// PassedWrapper is a copyable adapter for a scoper that ignores const. +// +// It is needed to get around the fact that Bind() takes a const reference to +// all its arguments. Because Bind() takes a const reference to avoid +// unnecessary copies, it is incompatible with movable-but-not-copyable +// types; doing a destructive "move" of the type into Bind() would violate +// the const correctness. +// +// This conundrum cannot be solved without either C++11 rvalue references or +// a O(2^n) blowup of Bind() templates to handle each combination of regular +// types and movable-but-not-copyable types. Thus we introduce a wrapper type +// that is copyable to transmit the correct type information down into +// BindState<>. Ignoring const in this type makes sense because it is only +// created when we are explicitly trying to do a destructive move. +// +// Two notes: +// 1) PassedWrapper supports any type that has a "Pass()" function. +// This is intentional. The whitelisting of which specific types we +// support is maintained by CallbackParamTraits<>. +// 2) is_valid_ is distinct from NULL because it is valid to bind a "NULL" +// scoper to a Callback and allow the Callback to execute once. 
+template +class PassedWrapper { + public: + explicit PassedWrapper(T scoper) : is_valid_(true), scoper_(scoper.Pass()) {} + PassedWrapper(const PassedWrapper& other) + : is_valid_(other.is_valid_), scoper_(other.scoper_.Pass()) { + } + T Pass() const { + assert(is_valid_); + is_valid_ = false; + return scoper_.Pass(); + } + + private: + mutable bool is_valid_; + mutable T scoper_; +}; + +// Unwrap the stored parameters for the wrappers above. +template +struct UnwrapTraits { + typedef const T& ForwardType; + static ForwardType Unwrap(const T& o) { return o; } +}; + +template +struct UnwrapTraits > { + typedef T* ForwardType; + static ForwardType Unwrap(UnretainedWrapper unretained) { + return unretained.get(); + } +}; + +template +struct UnwrapTraits > { + typedef const T& ForwardType; + static ForwardType Unwrap(ConstRefWrapper const_ref) { + return const_ref.get(); + } +}; + +template +struct UnwrapTraits > { + typedef T* ForwardType; + static ForwardType Unwrap(const scoped_refptr& o) { return o.get(); } +}; + +// We didn't import WeakPtr from Chromium. +// +//template +//struct UnwrapTraits > { +// typedef const WeakPtr& ForwardType; +// static ForwardType Unwrap(const WeakPtr& o) { return o; } +//}; + +template +struct UnwrapTraits > { + typedef T* ForwardType; + static ForwardType Unwrap(const OwnedWrapper& o) { + return o.get(); + } +}; + +template +struct UnwrapTraits > { + typedef T ForwardType; + static T Unwrap(PassedWrapper& o) { + return o.Pass(); + } +}; + +// Utility for handling different refcounting semantics in the Bind() +// function. 
+template +struct MaybeRefcount; + +template +struct MaybeRefcount { + static void AddRef(const T&) {} + static void Release(const T&) {} +}; + +template +struct MaybeRefcount { + static void AddRef(const T*) {} + static void Release(const T*) {} +}; + +template +struct MaybeRefcount { + static void AddRef(const T&) {} + static void Release(const T&) {} +}; + +template +struct MaybeRefcount { + static void AddRef(T* o) { o->AddRef(); } + static void Release(T* o) { o->Release(); } +}; + +// No need to additionally AddRef() and Release() since we are storing a +// scoped_refptr<> inside the storage object already. +template +struct MaybeRefcount > { + static void AddRef(const scoped_refptr& o) {} + static void Release(const scoped_refptr& o) {} +}; + +template +struct MaybeRefcount { + static void AddRef(const T* o) { o->AddRef(); } + static void Release(const T* o) { o->Release(); } +}; + +// We didn't import WeakPtr from Chromium. +// +//// IsWeakMethod is a helper that determine if we are binding a WeakPtr<> to a +//// method. It is used internally by Bind() to select the correct +//// InvokeHelper that will no-op itself in the event the WeakPtr<> for +//// the target object is invalidated. +//// +//// P1 should be the type of the object that will be received of the method. +//template +//struct IsWeakMethod : public false_type {}; +// +//template +//struct IsWeakMethod > : public true_type {}; +// +//template +//struct IsWeakMethod > > : public true_type {}; + +} // namespace internal + +template +static inline internal::UnretainedWrapper Unretained(T* o) { + return internal::UnretainedWrapper(o); +} + +template +static inline internal::ConstRefWrapper ConstRef(const T& o) { + return internal::ConstRefWrapper(o); +} + +template +static inline internal::OwnedWrapper Owned(T* o) { + return internal::OwnedWrapper(o); +} + +// We offer 2 syntaxes for calling Passed(). The first takes a temporary and +// is best suited for use with the return value of a function. 
The second +// takes a pointer to the scoper and is just syntactic sugar to avoid having +// to write Passed(scoper.Pass()). +template +static inline internal::PassedWrapper Passed(T scoper) { + return internal::PassedWrapper(scoper.Pass()); +} +template +static inline internal::PassedWrapper Passed(T* scoper) { + return internal::PassedWrapper(scoper->Pass()); +} + +template +static inline internal::IgnoreResultHelper IgnoreResult(T data) { + return internal::IgnoreResultHelper(data); +} + +template +static inline internal::IgnoreResultHelper > +IgnoreResult(const Callback& data) { + return internal::IgnoreResultHelper >(data); +} + +template +void DeletePointer(T* obj) { + delete obj; +} + +} // namespace kudu + +#endif // BASE_BIND_HELPERS_H_ diff --git a/src/kudu/gutil/bind_internal.h b/src/kudu/gutil/bind_internal.h new file mode 100644 index 000000000000..6764d7e5fe21 --- /dev/null +++ b/src/kudu/gutil/bind_internal.h @@ -0,0 +1,2695 @@ +// This file was GENERATED by command: +// pump.py bind_internal.h.pump +// DO NOT EDIT BY HAND!!! + + +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_BIND_INTERNAL_H_ +#define KUDU_GUTIL_BIND_INTERNAL_H_ + +#include "kudu/gutil/bind_helpers.h" +#include "kudu/gutil/callback_internal.h" +#include "kudu/gutil/raw_scoped_refptr_mismatch_checker.h" +#include "kudu/gutil/template_util.h" + +#if defined(OS_WIN) +#include "kudu/gutil/bind_internal_win.h" +#endif + +// During Chromium import, WeakPtr-related code was removed. + +namespace kudu { +namespace internal { + +// See kudu/gutil/callback.h for user documentation. +// +// +// CONCEPTS: +// Runnable -- A type (really a type class) that has a single Run() method +// and a RunType typedef that corresponds to the type of Run(). 
+// A Runnable can declare that it should treated like a method +// call by including a typedef named IsMethod. The value of +// this typedef is NOT inspected, only the existence. When a +// Runnable declares itself a method, Bind() will enforce special +// refcounting + WeakPtr handling semantics for the first +// parameter which is expected to be an object. +// Functor -- A copyable type representing something that should be called. +// All function pointers, Callback<>, and Runnables are functors +// even if the invocation syntax differs. +// RunType -- A function type (as opposed to function _pointer_ type) for +// a Run() function. Usually just a convenience typedef. +// (Bound)ArgsType -- A function type that is being (ab)used to store the +// types of set of arguments. The "return" type is always +// void here. We use this hack so that we do not need +// a new type name for each arity of type. (eg., +// BindState1, BindState2). This makes forward +// declarations and friending much much easier. +// +// Types: +// RunnableAdapter<> -- Wraps the various "function" pointer types into an +// object that adheres to the Runnable interface. +// There are |3*ARITY| RunnableAdapter types. +// FunctionTraits<> -- Type traits that unwrap a function signature into a +// a set of easier to use typedefs. Used mainly for +// compile time asserts. +// There are |ARITY| FunctionTraits types. +// ForceVoidReturn<> -- Helper class for translating function signatures to +// equivalent forms with a "void" return type. +// There are |ARITY| ForceVoidReturn types. +// FunctorTraits<> -- Type traits used determine the correct RunType and +// RunnableType for a Functor. This is where function +// signature adapters are applied. +// There are |ARITY| ForceVoidReturn types. +// MakeRunnable<> -- Takes a Functor and returns an object in the Runnable +// type class that represents the underlying Functor. +// There are |O(1)| MakeRunnable types. 
+// InvokeHelper<> -- Take a Runnable + arguments and actully invokes it. +// Handle the differing syntaxes needed for WeakPtr<> support, +// and for ignoring return values. This is separate from +// Invoker to avoid creating multiple version of Invoker<> +// which grows at O(n^2) with the arity. +// There are |k*ARITY| InvokeHelper types. +// Invoker<> -- Unwraps the curried parameters and executes the Runnable. +// There are |(ARITY^2 + ARITY)/2| Invoketypes. +// BindState<> -- Stores the curried parameters, and is the main entry point +// into the Bind() system, doing most of the type resolution. +// There are ARITY BindState types. + +// RunnableAdapter<> +// +// The RunnableAdapter<> templates provide a uniform interface for invoking +// a function pointer, method pointer, or const method pointer. The adapter +// exposes a Run() method with an appropriate signature. Using this wrapper +// allows for writing code that supports all three pointer types without +// undue repetition. Without it, a lot of code would need to be repeated 3 +// times. +// +// For method pointers and const method pointers the first argument to Run() +// is considered to be the received of the method. This is similar to STL's +// mem_fun(). +// +// This class also exposes a RunType typedef that is the function type of the +// Run() function. +// +// If and only if the wrapper contains a method or const method pointer, an +// IsMethod typedef is exposed. The existence of this typedef (NOT the value) +// marks that the wrapper should be considered a method wrapper. + +template +class RunnableAdapter; + +// Function: Arity 0. +template +class RunnableAdapter { + public: + typedef R (RunType)(); + + explicit RunnableAdapter(R(*function)()) + : function_(function) { + } + + R Run() { + return function_(); + } + + private: + R (*function_)(); +}; + +// Method: Arity 0. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(T*); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)()) + : method_(method) { + } + + R Run(T* object) { + return (object->*method_)(); + } + + private: + R (T::*method_)(); +}; + +// Const Method: Arity 0. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)() const) + : method_(method) { + } + + R Run(const T* object) { + return (object->*method_)(); + } + + private: + R (T::*method_)() const; +}; + +// Function: Arity 1. +template +class RunnableAdapter { + public: + typedef R (RunType)(A1); + + explicit RunnableAdapter(R(*function)(A1)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1) { + return function_(CallbackForward(a1)); + } + + private: + R (*function_)(A1); +}; + +// Method: Arity 1. +template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1) { + return (object->*method_)(CallbackForward(a1)); + } + + private: + R (T::*method_)(A1); +}; + +// Const Method: Arity 1. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1) { + return (object->*method_)(CallbackForward(a1)); + } + + private: + R (T::*method_)(A1) const; +}; + +// Function: Arity 2. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(A1, A2); + + explicit RunnableAdapter(R(*function)(A1, A2)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2) { + return function_(CallbackForward(a1), CallbackForward(a2)); + } + + private: + R (*function_)(A1, A2); +}; + +// Method: Arity 2. +template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1, A2); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2)); + } + + private: + R (T::*method_)(A1, A2); +}; + +// Const Method: Arity 2. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1, A2); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2)); + } + + private: + R (T::*method_)(A1, A2) const; +}; + +// Function: Arity 3. +template +class RunnableAdapter { + public: + typedef R (RunType)(A1, A2, A3); + + explicit RunnableAdapter(R(*function)(A1, A2, A3)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3) { + return function_(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3)); + } + + private: + R (*function_)(A1, A2, A3); +}; + +// Method: Arity 3. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1, A2, A3); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3)); + } + + private: + R (T::*method_)(A1, A2, A3); +}; + +// Const Method: Arity 3. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1, A2, A3); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3)); + } + + private: + R (T::*method_)(A1, A2, A3) const; +}; + +// Function: Arity 4. +template +class RunnableAdapter { + public: + typedef R (RunType)(A1, A2, A3, A4); + + explicit RunnableAdapter(R(*function)(A1, A2, A3, A4)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4) { + return function_(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4)); + } + + private: + R (*function_)(A1, A2, A3, A4); +}; + +// Method: Arity 4. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1, A2, A3, A4); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4)); + } + + private: + R (T::*method_)(A1, A2, A3, A4); +}; + +// Const Method: Arity 4. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1, A2, A3, A4); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4)); + } + + private: + R (T::*method_)(A1, A2, A3, A4) const; +}; + +// Function: Arity 5. +template +class RunnableAdapter { + public: + typedef R (RunType)(A1, A2, A3, A4, A5); + + explicit RunnableAdapter(R(*function)(A1, A2, A3, A4, A5)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5) { + return function_(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5)); + } + + private: + R (*function_)(A1, A2, A3, A4, A5); +}; + +// Method: Arity 5. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1, A2, A3, A4, A5); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4, A5)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5)); + } + + private: + R (T::*method_)(A1, A2, A3, A4, A5); +}; + +// Const Method: Arity 5. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1, A2, A3, A4, A5); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4, A5) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5)); + } + + private: + R (T::*method_)(A1, A2, A3, A4, A5) const; +}; + +// Function: Arity 6. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(A1, A2, A3, A4, A5, A6); + + explicit RunnableAdapter(R(*function)(A1, A2, A3, A4, A5, A6)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5, + typename CallbackParamTraits::ForwardType a6) { + return function_(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6)); + } + + private: + R (*function_)(A1, A2, A3, A4, A5, A6); +}; + +// Method: Arity 6. +template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1, A2, A3, A4, A5, A6); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4, A5, A6)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5, + typename CallbackParamTraits::ForwardType a6) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6)); + } + + private: + R (T::*method_)(A1, A2, A3, A4, A5, A6); +}; + +// Const Method: Arity 6. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1, A2, A3, A4, A5, A6); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4, A5, A6) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5, + typename CallbackParamTraits::ForwardType a6) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6)); + } + + private: + R (T::*method_)(A1, A2, A3, A4, A5, A6) const; +}; + +// Function: Arity 7. +template +class RunnableAdapter { + public: + typedef R (RunType)(A1, A2, A3, A4, A5, A6, A7); + + explicit RunnableAdapter(R(*function)(A1, A2, A3, A4, A5, A6, A7)) + : function_(function) { + } + + R Run(typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5, + typename CallbackParamTraits::ForwardType a6, + typename CallbackParamTraits::ForwardType a7) { + return function_(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6), CallbackForward(a7)); + } + + private: + R (*function_)(A1, A2, A3, A4, A5, A6, A7); +}; + +// Method: Arity 7. 
+template +class RunnableAdapter { + public: + typedef R (RunType)(T*, A1, A2, A3, A4, A5, A6, A7); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4, A5, A6, A7)) + : method_(method) { + } + + R Run(T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5, + typename CallbackParamTraits::ForwardType a6, + typename CallbackParamTraits::ForwardType a7) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6), CallbackForward(a7)); + } + + private: + R (T::*method_)(A1, A2, A3, A4, A5, A6, A7); +}; + +// Const Method: Arity 7. +template +class RunnableAdapter { + public: + typedef R (RunType)(const T*, A1, A2, A3, A4, A5, A6, A7); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)(A1, A2, A3, A4, A5, A6, A7) const) + : method_(method) { + } + + R Run(const T* object, typename CallbackParamTraits::ForwardType a1, + typename CallbackParamTraits::ForwardType a2, + typename CallbackParamTraits::ForwardType a3, + typename CallbackParamTraits::ForwardType a4, + typename CallbackParamTraits::ForwardType a5, + typename CallbackParamTraits::ForwardType a6, + typename CallbackParamTraits::ForwardType a7) { + return (object->*method_)(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6), CallbackForward(a7)); + } + + private: + R (T::*method_)(A1, A2, A3, A4, A5, A6, A7) const; +}; + + +// FunctionTraits<> +// +// Breaks a function signature apart into typedefs for easier introspection. 
+template +struct FunctionTraits; + +template +struct FunctionTraits { + typedef R ReturnType; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; + typedef A2 A2Type; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; + typedef A2 A2Type; + typedef A3 A3Type; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; + typedef A2 A2Type; + typedef A3 A3Type; + typedef A4 A4Type; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; + typedef A2 A2Type; + typedef A3 A3Type; + typedef A4 A4Type; + typedef A5 A5Type; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; + typedef A2 A2Type; + typedef A3 A3Type; + typedef A4 A4Type; + typedef A5 A5Type; + typedef A6 A6Type; +}; + +template +struct FunctionTraits { + typedef R ReturnType; + typedef A1 A1Type; + typedef A2 A2Type; + typedef A3 A3Type; + typedef A4 A4Type; + typedef A5 A5Type; + typedef A6 A6Type; + typedef A7 A7Type; +}; + + +// ForceVoidReturn<> +// +// Set of templates that support forcing the function return type to void. +template +struct ForceVoidReturn; + +template +struct ForceVoidReturn { + typedef void(RunType)(); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1, A2); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1, A2, A3); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1, A2, A3, A4); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1, A2, A3, A4, A5); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1, A2, A3, A4, A5, A6); +}; + +template +struct ForceVoidReturn { + typedef void(RunType)(A1, A2, A3, A4, A5, A6, A7); +}; + + +// FunctorTraits<> +// +// See description at top of file. 
+template +struct FunctorTraits { + typedef RunnableAdapter RunnableType; + typedef typename RunnableType::RunType RunType; +}; + +template +struct FunctorTraits > { + typedef typename FunctorTraits::RunnableType RunnableType; + typedef typename ForceVoidReturn< + typename RunnableType::RunType>::RunType RunType; +}; + +template +struct FunctorTraits > { + typedef Callback RunnableType; + typedef typename Callback::RunType RunType; +}; + + +// MakeRunnable<> +// +// Converts a passed in functor to a RunnableType using type inference. + +template +typename FunctorTraits::RunnableType MakeRunnable(const T& t) { + return RunnableAdapter(t); +} + +template +typename FunctorTraits::RunnableType +MakeRunnable(const IgnoreResultHelper& t) { + return MakeRunnable(t.functor_); +} + +template +const typename FunctorTraits >::RunnableType& +MakeRunnable(const Callback& t) { + DCHECK(!t.is_null()); + return t; +} + + +// InvokeHelper<> +// +// There are 3 logical InvokeHelper<> specializations: normal, void-return, +// WeakCalls. +// +// The normal type just calls the underlying runnable. +// +// We need a InvokeHelper to handle void return types in order to support +// IgnoreResult(). Normally, if the Runnable's RunType had a void return, +// the template system would just accept "return functor.Run()" ignoring +// the fact that a void function is being used with return. This piece of +// sugar breaks though when the Runnable's RunType is not void. Thus, we +// need a partial specialization to change the syntax to drop the "return" +// from the invocation call. +// +// WeakCalls similarly need special syntax that is applied to the first +// argument to check if they should no-op themselves. 
+template +struct InvokeHelper; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable) { + return runnable.Run(); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable) { + runnable.Run(); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1) { + return runnable.Run(CallbackForward(a1)); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable, A1 a1) { + runnable.Run(CallbackForward(a1)); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1, A2 a2) { + return runnable.Run(CallbackForward(a1), CallbackForward(a2)); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable, A1 a1, A2 a2) { + runnable.Run(CallbackForward(a1), CallbackForward(a2)); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3) { + return runnable.Run(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3)); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3) { + runnable.Run(CallbackForward(a1), CallbackForward(a2), CallbackForward(a3)); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4) { + return runnable.Run(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4)); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4) { + runnable.Run(CallbackForward(a1), CallbackForward(a2), CallbackForward(a3), + CallbackForward(a4)); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4, + A5 a5) { + return runnable.Run(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5)); + } +}; + +template +struct InvokeHelper { + 
static void MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5) { + runnable.Run(CallbackForward(a1), CallbackForward(a2), CallbackForward(a3), + CallbackForward(a4), CallbackForward(a5)); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4, + A5 a5, A6 a6) { + return runnable.Run(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6)); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, + A6 a6) { + runnable.Run(CallbackForward(a1), CallbackForward(a2), CallbackForward(a3), + CallbackForward(a4), CallbackForward(a5), CallbackForward(a6)); + } +}; + +template +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4, + A5 a5, A6 a6, A7 a7) { + return runnable.Run(CallbackForward(a1), CallbackForward(a2), + CallbackForward(a3), CallbackForward(a4), CallbackForward(a5), + CallbackForward(a6), CallbackForward(a7)); + } +}; + +template +struct InvokeHelper { + static void MakeItSo(Runnable runnable, A1 a1, A2 a2, A3 a3, A4 a4, A5 a5, + A6 a6, A7 a7) { + runnable.Run(CallbackForward(a1), CallbackForward(a2), CallbackForward(a3), + CallbackForward(a4), CallbackForward(a5), CallbackForward(a6), + CallbackForward(a7)); + } +}; + +// Invoker<> +// +// See description at the top of the file. +template +struct Invoker; + +// Arity 0 -> 0. +template +struct Invoker<0, StorageType, R()> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + + return InvokeHelper + ::MakeItSo(storage->runnable_); + } +}; + +// Arity 1 -> 1. 
+template +struct Invoker<0, StorageType, R(X1)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + + return InvokeHelper::ForwardType x1)> + ::MakeItSo(storage->runnable_, CallbackForward(x1)); + } +}; + +// Arity 1 -> 0. +template +struct Invoker<1, StorageType, R(X1)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1)); + } +}; + +// Arity 2 -> 2. +template +struct Invoker<0, StorageType, R(X1, X2)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1, X2); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1, + typename CallbackParamTraits::ForwardType x2) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ + return InvokeHelper::ForwardType x1, + typename CallbackParamTraits::ForwardType x2)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2)); + } +}; + +// Arity 2 -> 1. +template +struct Invoker<1, StorageType, R(X1, X2)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X2); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x2) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper::ForwardType x2)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2)); + } +}; + +// Arity 2 -> 0. +template +struct Invoker<2, StorageType, R(X1, X2)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2)); + } +}; + +// Arity 3 -> 3. 
+template +struct Invoker<0, StorageType, R(X1, X2, X3)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1, X2, X3); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + + return InvokeHelper::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3)); + } +}; + +// Arity 3 -> 2. +template +struct Invoker<1, StorageType, R(X1, X2, X3)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X2, X3); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper::ForwardType x2, + typename CallbackParamTraits::ForwardType x3)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3)); + } +}; + +// Arity 3 -> 1. 
+template +struct Invoker<2, StorageType, R(X1, X2, X3)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X3); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x3) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + return InvokeHelper::ForwardType x3)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3)); + } +}; + +// Arity 3 -> 0. +template +struct Invoker<3, StorageType, R(X1, X2, X3)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3)); + } +}; + +// Arity 4 -> 4. +template +struct Invoker<0, StorageType, R(X1, X2, X3, X4)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1, X2, X3, X4); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + + return InvokeHelper::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4)); + } +}; + +// Arity 4 -> 3. 
+template +struct Invoker<1, StorageType, R(X1, X2, X3, X4)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X2, X3, X4); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4)); + } +}; + +// Arity 4 -> 2. +template +struct Invoker<2, StorageType, R(X1, X2, X3, X4)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X3, X4); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + return InvokeHelper::ForwardType x3, + typename CallbackParamTraits::ForwardType x4)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4)); + } +}; + +// Arity 4 -> 1. +template +struct Invoker<3, StorageType, R(X1, X2, X3, X4)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X4); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x4) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + return InvokeHelper::ForwardType x4)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4)); + } +}; + +// Arity 4 -> 0. 
+template +struct Invoker<4, StorageType, R(X1, X2, X3, X4)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4)); + } +}; + +// Arity 5 -> 5. +template +struct Invoker<0, StorageType, R(X1, X2, X3, X4, X5)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1, X2, X3, X4, X5); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. 
If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + + return InvokeHelper::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5)); + } +}; + +// Arity 5 -> 4. +template +struct Invoker<1, StorageType, R(X1, X2, X3, X4, X5)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X2, X3, X4, X5); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5)); + } +}; + +// Arity 5 -> 3. 
+template +struct Invoker<2, StorageType, R(X1, X2, X3, X4, X5)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X3, X4, X5); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + return InvokeHelper::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5)); + } +}; + +// Arity 5 -> 2. +template +struct Invoker<3, StorageType, R(X1, X2, X3, X4, X5)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X4, X5); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + return InvokeHelper::ForwardType x4, + typename CallbackParamTraits::ForwardType x5)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5)); + } +}; + +// Arity 5 -> 1. +template +struct Invoker<4, StorageType, R(X1, X2, X3, X4, X5)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X5); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x5) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + return InvokeHelper::ForwardType x5)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5)); + } +}; + +// Arity 5 -> 0. +template +struct Invoker<5, StorageType, R(X1, X2, X3, X4, X5)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + typedef typename StorageType::Bound5UnwrapTraits Bound5UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + typename Bound5UnwrapTraits::ForwardType x5 = + Bound5UnwrapTraits::Unwrap(storage->p5_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5)); + } +}; + +// Arity 6 -> 6. +template +struct Invoker<0, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1, X2, X3, X4, X5, X6); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ + return InvokeHelper::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 6 -> 5. +template +struct Invoker<1, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X2, X3, X4, X5, X6); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 6 -> 4. 
+template +struct Invoker<2, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X3, X4, X5, X6); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + return InvokeHelper::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 6 -> 3. 
+template +struct Invoker<3, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X4, X5, X6); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + return InvokeHelper::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 6 -> 2. +template +struct Invoker<4, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X5, X6); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. 
If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + return InvokeHelper::ForwardType x5, + typename CallbackParamTraits::ForwardType x6)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 6 -> 1. +template +struct Invoker<5, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X6); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x6) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + typedef typename StorageType::Bound5UnwrapTraits Bound5UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + typename Bound5UnwrapTraits::ForwardType x5 = + Bound5UnwrapTraits::Unwrap(storage->p5_); + return InvokeHelper::ForwardType x6)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 6 -> 0. +template +struct Invoker<6, StorageType, R(X1, X2, X3, X4, X5, X6)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+ typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + typedef typename StorageType::Bound5UnwrapTraits Bound5UnwrapTraits; + typedef typename StorageType::Bound6UnwrapTraits Bound6UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + typename Bound5UnwrapTraits::ForwardType x5 = + Bound5UnwrapTraits::Unwrap(storage->p5_); + typename Bound6UnwrapTraits::ForwardType x6 = + Bound6UnwrapTraits::Unwrap(storage->p6_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6)); + } +}; + +// Arity 7 -> 7. 
+template +struct Invoker<0, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X1, X2, X3, X4, X5, X6, X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + + return InvokeHelper::ForwardType x1, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 6. 
+template +struct Invoker<1, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X2, X3, X4, X5, X6, X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + return InvokeHelper::ForwardType x2, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 5. 
+template +struct Invoker<2, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X3, X4, X5, X6, X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + return InvokeHelper::ForwardType x3, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 4. 
+template +struct Invoker<3, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X4, X5, X6, X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + return InvokeHelper::ForwardType x4, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 3. 
+template +struct Invoker<4, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X5, X6, X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + return InvokeHelper::ForwardType x5, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 2. 
+template +struct Invoker<5, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X6, X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x6, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + typedef typename StorageType::Bound5UnwrapTraits Bound5UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + typename Bound5UnwrapTraits::ForwardType x5 = + Bound5UnwrapTraits::Unwrap(storage->p5_); + return InvokeHelper::ForwardType x6, + typename CallbackParamTraits::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 1. 
+template +struct Invoker<6, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*, + typename CallbackParamTraits::ForwardType); + + typedef R(UnboundRunType)(X7); + + static R Run(BindStateBase* base, + typename CallbackParamTraits::ForwardType x7) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + typedef typename StorageType::Bound5UnwrapTraits Bound5UnwrapTraits; + typedef typename StorageType::Bound6UnwrapTraits Bound6UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + typename Bound5UnwrapTraits::ForwardType x5 = + Bound5UnwrapTraits::Unwrap(storage->p5_); + typename Bound6UnwrapTraits::ForwardType x6 = + Bound6UnwrapTraits::Unwrap(storage->p6_); + return InvokeHelper::ForwardType x7)> + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + +// Arity 7 -> 0. 
+template +struct Invoker<7, StorageType, R(X1, X2, X3, X4, X5, X6, X7)> { + typedef R(RunType)(BindStateBase*); + + typedef R(UnboundRunType)(); + + static R Run(BindStateBase* base) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. + typedef typename StorageType::Bound1UnwrapTraits Bound1UnwrapTraits; + typedef typename StorageType::Bound2UnwrapTraits Bound2UnwrapTraits; + typedef typename StorageType::Bound3UnwrapTraits Bound3UnwrapTraits; + typedef typename StorageType::Bound4UnwrapTraits Bound4UnwrapTraits; + typedef typename StorageType::Bound5UnwrapTraits Bound5UnwrapTraits; + typedef typename StorageType::Bound6UnwrapTraits Bound6UnwrapTraits; + typedef typename StorageType::Bound7UnwrapTraits Bound7UnwrapTraits; + + typename Bound1UnwrapTraits::ForwardType x1 = + Bound1UnwrapTraits::Unwrap(storage->p1_); + typename Bound2UnwrapTraits::ForwardType x2 = + Bound2UnwrapTraits::Unwrap(storage->p2_); + typename Bound3UnwrapTraits::ForwardType x3 = + Bound3UnwrapTraits::Unwrap(storage->p3_); + typename Bound4UnwrapTraits::ForwardType x4 = + Bound4UnwrapTraits::Unwrap(storage->p4_); + typename Bound5UnwrapTraits::ForwardType x5 = + Bound5UnwrapTraits::Unwrap(storage->p5_); + typename Bound6UnwrapTraits::ForwardType x6 = + Bound6UnwrapTraits::Unwrap(storage->p6_); + typename Bound7UnwrapTraits::ForwardType x7 = + Bound7UnwrapTraits::Unwrap(storage->p7_); + return InvokeHelper + ::MakeItSo(storage->runnable_, CallbackForward(x1), + CallbackForward(x2), CallbackForward(x3), + CallbackForward(x4), CallbackForward(x5), + CallbackForward(x6), CallbackForward(x7)); + } +}; + + +// BindState<> +// +// This stores all the state passed into Bind() and is also where most +// of the template resolution magic occurs. +// +// Runnable is the functor we are binding arguments to. 
+// RunType is type of the Run() function that the Invoker<> should use. +// Normally, this is the same as the RunType of the Runnable, but it can +// be different if an adapter like IgnoreResult() has been used. +// +// BoundArgsType contains the storage type for all the bound arguments by +// (ab)using a function type. +template +struct BindState; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<0, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + explicit BindState(Runnable runnable) : runnable_(std::move(runnable)) {} + + virtual ~BindState() { } + + RunnableType runnable_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<1, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. + typedef UnwrapTraits Bound1UnwrapTraits; + + BindState(Runnable runnable, P1 p1) + : runnable_(std::move(runnable)), p1_(std::move(p1)) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<2, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. 
+ typedef UnwrapTraits Bound1UnwrapTraits; + typedef UnwrapTraits Bound2UnwrapTraits; + + BindState(Runnable runnable, P1 p1, P2 p2) + : runnable_(std::move(runnable)), p1_(std::move(p1)), p2_(std::move(p2)) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; + P2 p2_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<3, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. + typedef UnwrapTraits Bound1UnwrapTraits; + typedef UnwrapTraits Bound2UnwrapTraits; + typedef UnwrapTraits Bound3UnwrapTraits; + + BindState(Runnable runnable, P1 p1, P2 p2, P3 p3) + : runnable_(std::move(runnable)), + p1_(std::move(p1)), + p2_(std::move(p2)), + p3_(std::move(p3)) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; + P2 p2_; + P3 p3_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<4, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. 
+ typedef UnwrapTraits Bound1UnwrapTraits; + typedef UnwrapTraits Bound2UnwrapTraits; + typedef UnwrapTraits Bound3UnwrapTraits; + typedef UnwrapTraits Bound4UnwrapTraits; + + BindState(Runnable runnable, P1 p1, P2 p2, const P3& p3, P4 p4) + : runnable_(std::move(runnable)), + p1_(std::move(p1)), + p2_(std::move(p2)), + p3_(p3), + p4_(std::move(p4)) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; + P2 p2_; + P3 p3_; + P4 p4_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<5, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. + typedef UnwrapTraits Bound1UnwrapTraits; + typedef UnwrapTraits Bound2UnwrapTraits; + typedef UnwrapTraits Bound3UnwrapTraits; + typedef UnwrapTraits Bound4UnwrapTraits; + typedef UnwrapTraits Bound5UnwrapTraits; + + BindState(Runnable runnable, P1 p1, P2 p2, P3 p3, P4 p4, P5 p5) + : runnable_(std::move(runnable)), + p1_(std::move(p1)), + p2_(std::move(p2)), + p3_(std::move(p3)), + p4_(std::move(p4)), + p5_(std::move(p5)) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; + P2 p2_; + P3 p3_; + P4 p4_; + P5 p5_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<6, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. 
+ typedef UnwrapTraits Bound1UnwrapTraits; + typedef UnwrapTraits Bound2UnwrapTraits; + typedef UnwrapTraits Bound3UnwrapTraits; + typedef UnwrapTraits Bound4UnwrapTraits; + typedef UnwrapTraits Bound5UnwrapTraits; + typedef UnwrapTraits Bound6UnwrapTraits; + + BindState(const Runnable& runnable, const P1& p1, const P2& p2, const P3& p3, + const P4& p4, const P5& p5, const P6& p6) + : runnable_(runnable), + p1_(p1), + p2_(p2), + p3_(p3), + p4_(p4), + p5_(p5), + p6_(p6) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; + P2 p2_; + P3 p3_; + P4 p4_; + P5 p5_; + P6 p6_; +}; + +template +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<7, BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + + // Convenience typedefs for bound argument types. + typedef UnwrapTraits Bound1UnwrapTraits; + typedef UnwrapTraits Bound2UnwrapTraits; + typedef UnwrapTraits Bound3UnwrapTraits; + typedef UnwrapTraits Bound4UnwrapTraits; + typedef UnwrapTraits Bound5UnwrapTraits; + typedef UnwrapTraits Bound6UnwrapTraits; + typedef UnwrapTraits Bound7UnwrapTraits; + + BindState(const Runnable& runnable, const P1& p1, const P2& p2, const P3& p3, + const P4& p4, const P5& p5, const P6& p6, const P7& p7) + : runnable_(runnable), + p1_(p1), + p2_(p2), + p3_(p3), + p4_(p4), + p5_(p5), + p6_(p6), + p7_(p7) { + MaybeRefcount::value, P1>::AddRef(p1_); + } + + virtual ~BindState() { MaybeRefcount::value, + P1>::Release(p1_); } + + RunnableType runnable_; + P1 p1_; + P2 p2_; + P3 p3_; + P4 p4_; + P5 p5_; + P6 p6_; + P7 p7_; +}; + +} // namespace internal +} // namespace kudu + +#endif // KUDU_GUTIL_BIND_INTERNAL_H_ diff --git a/src/kudu/gutil/bind_internal.h.pump b/src/kudu/gutil/bind_internal.h.pump new file mode 100644 index 000000000000..8352f9bc894c --- /dev/null +++ 
b/src/kudu/gutil/bind_internal.h.pump @@ -0,0 +1,464 @@ +$$ This is a pump file for generating file templates. Pump is a python +$$ script that is part of the Google Test suite of utilities. Description +$$ can be found here: +$$ +$$ http://code.google.com/p/googletest/wiki/PumpManual +$$ + +$$ See comment for MAX_ARITY in kudu/gutil/bind.h.pump. +$var MAX_ARITY = 7 +$range ARITY 0..MAX_ARITY + +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_BIND_INTERNAL_H_ +#define KUDU_GUTIL_BIND_INTERNAL_H_ + +#include "kudu/gutil/bind_helpers.h" +#include "kudu/gutil/callback_internal.h" +#include "kudu/gutil/raw_scoped_refptr_mismatch_checker.h" +#include "kudu/gutil/template_util.h" + +#if defined(OS_WIN) +#include "kudu/gutil/bind_internal_win.h" +#endif + +// During Chromium import, WeakPtr-related code was removed. + +namespace kudu { +namespace internal { + +// See kudu/gutil/callback.h for user documentation. +// +// +// CONCEPTS: +// Runnable -- A type (really a type class) that has a single Run() method +// and a RunType typedef that corresponds to the type of Run(). +// A Runnable can declare that it should treated like a method +// call by including a typedef named IsMethod. The value of +// this typedef is NOT inspected, only the existence. When a +// Runnable declares itself a method, Bind() will enforce special +// refcounting + WeakPtr handling semantics for the first +// parameter which is expected to be an object. +// Functor -- A copyable type representing something that should be called. +// All function pointers, Callback<>, and Runnables are functors +// even if the invocation syntax differs. +// RunType -- A function type (as opposed to function _pointer_ type) for +// a Run() function. Usually just a convenience typedef. 
+// (Bound)ArgsType -- A function type that is being (ab)used to store the +// types of set of arguments. The "return" type is always +// void here. We use this hack so that we do not need +// a new type name for each arity of type. (eg., +// BindState1, BindState2). This makes forward +// declarations and friending much much easier. +// +// Types: +// RunnableAdapter<> -- Wraps the various "function" pointer types into an +// object that adheres to the Runnable interface. +// There are |3*ARITY| RunnableAdapter types. +// FunctionTraits<> -- Type traits that unwrap a function signature into a +// a set of easier to use typedefs. Used mainly for +// compile time asserts. +// There are |ARITY| FunctionTraits types. +// ForceVoidReturn<> -- Helper class for translating function signatures to +// equivalent forms with a "void" return type. +// There are |ARITY| ForceVoidReturn types. +// FunctorTraits<> -- Type traits used determine the correct RunType and +// RunnableType for a Functor. This is where function +// signature adapters are applied. +// There are |ARITY| ForceVoidReturn types. +// MakeRunnable<> -- Takes a Functor and returns an object in the Runnable +// type class that represents the underlying Functor. +// There are |O(1)| MakeRunnable types. +// InvokeHelper<> -- Take a Runnable + arguments and actully invokes it. +// Handle the differing syntaxes needed for WeakPtr<> support, +// and for ignoring return values. This is separate from +// Invoker to avoid creating multiple version of Invoker<> +// which grows at O(n^2) with the arity. +// There are |k*ARITY| InvokeHelper types. +// Invoker<> -- Unwraps the curried parameters and executes the Runnable. +// There are |(ARITY^2 + ARITY)/2| Invoketypes. +// BindState<> -- Stores the curried parameters, and is the main entry point +// into the Bind() system, doing most of the type resolution. +// There are ARITY BindState types. 
+ +// RunnableAdapter<> +// +// The RunnableAdapter<> templates provide a uniform interface for invoking +// a function pointer, method pointer, or const method pointer. The adapter +// exposes a Run() method with an appropriate signature. Using this wrapper +// allows for writing code that supports all three pointer types without +// undue repetition. Without it, a lot of code would need to be repeated 3 +// times. +// +// For method pointers and const method pointers the first argument to Run() +// is considered to be the received of the method. This is similar to STL's +// mem_fun(). +// +// This class also exposes a RunType typedef that is the function type of the +// Run() function. +// +// If and only if the wrapper contains a method or const method pointer, an +// IsMethod typedef is exposed. The existence of this typedef (NOT the value) +// marks that the wrapper should be considered a method wrapper. + +template +class RunnableAdapter; + +$for ARITY [[ +$range ARG 1..ARITY + +// Function: Arity $(ARITY). +template 0[[, ]] $for ARG , [[typename A$(ARG)]]> +class RunnableAdapter { + public: + typedef R (RunType)($for ARG , [[A$(ARG)]]); + + explicit RunnableAdapter(R(*function)($for ARG , [[A$(ARG)]])) + : function_(function) { + } + + R Run($for ARG , [[typename CallbackParamTraits::ForwardType a$(ARG)]]) { + return function_($for ARG , [[CallbackForward(a$(ARG))]]); + } + + private: + R (*function_)($for ARG , [[A$(ARG)]]); +}; + +// Method: Arity $(ARITY). 
+template 0[[, ]] $for ARG , [[typename A$(ARG)]]> +class RunnableAdapter { + public: + typedef R (RunType)(T*[[]] +$if ARITY > 0[[, ]] $for ARG , [[A$(ARG)]]); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)($for ARG , [[A$(ARG)]])) + : method_(method) { + } + + R Run(T* object[[]] +$if ARITY > 0[[, ]] $for ARG, [[typename CallbackParamTraits::ForwardType a$(ARG)]]) { + return (object->*method_)($for ARG , [[CallbackForward(a$(ARG))]]); + } + + private: + R (T::*method_)($for ARG , [[A$(ARG)]]); +}; + +// Const Method: Arity $(ARITY). +template 0[[, ]] $for ARG , [[typename A$(ARG)]]> +class RunnableAdapter { + public: + typedef R (RunType)(const T*[[]] +$if ARITY > 0[[, ]] $for ARG , [[A$(ARG)]]); + typedef base::true_type IsMethod; + + explicit RunnableAdapter(R(T::*method)($for ARG , [[A$(ARG)]]) const) + : method_(method) { + } + + R Run(const T* object[[]] +$if ARITY > 0[[, ]] $for ARG, [[typename CallbackParamTraits::ForwardType a$(ARG)]]) { + return (object->*method_)($for ARG , [[CallbackForward(a$(ARG))]]); + } + + private: + R (T::*method_)($for ARG , [[A$(ARG)]]) const; +}; + +]] $$ for ARITY + + +// FunctionTraits<> +// +// Breaks a function signature apart into typedefs for easier introspection. +template +struct FunctionTraits; + +$for ARITY [[ +$range ARG 1..ARITY + +template 0[[, ]] $for ARG , [[typename A$(ARG)]]> +struct FunctionTraits { + typedef R ReturnType; +$for ARG [[ + + typedef A$(ARG) A$(ARG)Type; +]] + +}; + +]] + + +// ForceVoidReturn<> +// +// Set of templates that support forcing the function return type to void. +template +struct ForceVoidReturn; + +$for ARITY [[ +$range ARG 1..ARITY + +template 0[[, ]] $for ARG , [[typename A$(ARG)]]> +struct ForceVoidReturn { + typedef void(RunType)($for ARG , [[A$(ARG)]]); +}; + +]] $$ for ARITY + + +// FunctorTraits<> +// +// See description at top of file. 
+template +struct FunctorTraits { + typedef RunnableAdapter RunnableType; + typedef typename RunnableType::RunType RunType; +}; + +template +struct FunctorTraits > { + typedef typename FunctorTraits::RunnableType RunnableType; + typedef typename ForceVoidReturn< + typename RunnableType::RunType>::RunType RunType; +}; + +template +struct FunctorTraits > { + typedef Callback RunnableType; + typedef typename Callback::RunType RunType; +}; + + +// MakeRunnable<> +// +// Converts a passed in functor to a RunnableType using type inference. + +template +typename FunctorTraits::RunnableType MakeRunnable(const T& t) { + return RunnableAdapter(t); +} + +template +typename FunctorTraits::RunnableType +MakeRunnable(const IgnoreResultHelper& t) { + return MakeRunnable(t.functor_); +} + +template +const typename FunctorTraits >::RunnableType& +MakeRunnable(const Callback& t) { + DCHECK(!t.is_null()); + return t; +} + + +// InvokeHelper<> +// +// There are 3 logical InvokeHelper<> specializations: normal, void-return, +// WeakCalls. +// +// The normal type just calls the underlying runnable. +// +// We need a InvokeHelper to handle void return types in order to support +// IgnoreResult(). Normally, if the Runnable's RunType had a void return, +// the template system would just accept "return functor.Run()" ignoring +// the fact that a void function is being used with return. This piece of +// sugar breaks though when the Runnable's RunType is not void. Thus, we +// need a partial specialization to change the syntax to drop the "return" +// from the invocation call. +// +// WeakCalls similarly need special syntax that is applied to the first +// argument to check if they should no-op themselves. 
+template +struct InvokeHelper; + +$for ARITY [[ +$range ARG 1..ARITY +$range WEAKCALL_ARG 2..ARITY + +template 0 [[,]] $for ARG , [[typename A$(ARG)]]> +struct InvokeHelper { + static ReturnType MakeItSo(Runnable runnable[[]] +$if ARITY > 0[[, ]] $for ARG , [[A$(ARG) a$(ARG)]]) { + return runnable.Run($for ARG , [[CallbackForward(a$(ARG))]]); + } +}; + +template 0 [[,]] $for ARG , [[typename A$(ARG)]]> +struct InvokeHelper { + static void MakeItSo(Runnable runnable[[]] +$if ARITY > 0[[, ]] $for ARG , [[A$(ARG) a$(ARG)]]) { + runnable.Run($for ARG , [[CallbackForward(a$(ARG))]]); + } +}; + +]] $$ for ARITY + +// Invoker<> +// +// See description at the top of the file. +template +struct Invoker; + +$for ARITY [[ + +$$ Number of bound arguments. +$range BOUND 0..ARITY +$for BOUND [[ + +$var UNBOUND = ARITY - BOUND +$range ARG 1..ARITY +$range BOUND_ARG 1..BOUND +$range UNBOUND_ARG (ARITY - UNBOUND + 1)..ARITY + +// Arity $(ARITY) -> $(UNBOUND). +template 0 [[,]][[]] +$for ARG , [[typename X$(ARG)]]> +struct Invoker<$(BOUND), StorageType, R($for ARG , [[X$(ARG)]])> { + typedef R(RunType)(BindStateBase*[[]] +$if UNBOUND != 0 [[, ]] +$for UNBOUND_ARG , [[typename CallbackParamTraits::ForwardType]]); + + typedef R(UnboundRunType)($for UNBOUND_ARG , [[X$(UNBOUND_ARG)]]); + + static R Run(BindStateBase* base[[]] +$if UNBOUND != 0 [[, ]][[]] +$for UNBOUND_ARG , [[ +typename CallbackParamTraits::ForwardType x$(UNBOUND_ARG) +]][[]] +) { + StorageType* storage = static_cast(base); + + // Local references to make debugger stepping easier. If in a debugger, + // you really want to warp ahead and step through the + // InvokeHelper<>::MakeItSo() call below. 
+$for BOUND_ARG +[[ + + typedef typename StorageType::Bound$(BOUND_ARG)UnwrapTraits Bound$(BOUND_ARG)UnwrapTraits; +]] + + +$for BOUND_ARG +[[ + + typename Bound$(BOUND_ARG)UnwrapTraits::ForwardType x$(BOUND_ARG) = + Bound$(BOUND_ARG)UnwrapTraits::Unwrap(storage->p$(BOUND_ARG)_); +]] + + return InvokeHelper 0 [[$if BOUND > 0 [[, ]]]][[]] + +$for UNBOUND_ARG , [[ +typename CallbackParamTraits::ForwardType x$(UNBOUND_ARG) +]] +)> + ::MakeItSo(storage->runnable_ +$if ARITY > 0[[, ]] $for ARG , [[CallbackForward(x$(ARG))]]); + } +}; + +]] $$ for BOUND +]] $$ for ARITY + + +// BindState<> +// +// This stores all the state passed into Bind() and is also where most +// of the template resolution magic occurs. +// +// Runnable is the functor we are binding arguments to. +// RunType is type of the Run() function that the Invoker<> should use. +// Normally, this is the same as the RunType of the Runnable, but it can +// be different if an adapter like IgnoreResult() has been used. +// +// BoundArgsType contains the storage type for all the bound arguments by +// (ab)using a function type. +template +struct BindState; + +$for ARITY [[ +$range ARG 1..ARITY + +template 0[[, ]] $for ARG , [[typename P$(ARG)]]> +struct BindState : public BindStateBase { + typedef Runnable RunnableType; + + typedef base::false_type IsWeakCall; + + typedef Invoker<$(ARITY), BindState, RunType> InvokerType; + typedef typename InvokerType::UnboundRunType UnboundRunType; + +$if ARITY > 0 [[ + + // Convenience typedefs for bound argument types. + +$for ARG [[ + typedef UnwrapTraits Bound$(ARG)UnwrapTraits; + +]] $$ for ARG + + +]] $$ if ARITY > 0 + +$$ The extra [[ ]] is needed to massage spacing. Silly pump.py. 
+[[ ]]$if ARITY == 0 [[explicit ]]BindState(const Runnable& runnable +$if ARITY > 0 [[, ]] $for ARG , [[const P$(ARG)& p$(ARG)]]) + : runnable_(runnable)[[]] +$if ARITY == 0 [[ + { + +]] $else [[ +, $for ARG , [[ + + p$(ARG)_(p$(ARG)) +]] { + MaybeRefcount::value, P1>::AddRef(p1_); + +]] + } + + virtual ~BindState() { +$if ARITY > 0 [[ + MaybeRefcount::value, P1>::Release(p1_); +]] + } + + RunnableType runnable_; + +$for ARG [[ + P$(ARG) p$(ARG)_; + +]] +}; + +]] $$ for ARITY + +} // namespace internal +} // namespace kudu + +#endif // KUDU_GUTIL_BIND_INTERNAL_H_ diff --git a/src/kudu/gutil/bits.cc b/src/kudu/gutil/bits.cc new file mode 100644 index 000000000000..333e464a9346 --- /dev/null +++ b/src/kudu/gutil/bits.cc @@ -0,0 +1,101 @@ +// Copyright 2002 and onwards Google Inc. +// +// Derived from code by Moses Charikar + +#include "kudu/gutil/bits.h" + +#include + +// this array gives the number of bits for any number from 0 to 255 +// (We could make these ints. The tradeoff is size (eg does it overwhelm +// the cache?) 
vs efficiency in referencing sub-word-sized array elements) +const char Bits::num_bits[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; + +int Bits::Count(const void *m, int num_bytes) { + int nbits = 0; + const uint8 *s = (const uint8 *) m; + for (int i = 0; i < num_bytes; i++) + nbits += num_bits[*s++]; + return nbits; +} + +int Bits::Difference(const void *m1, const void *m2, int num_bytes) { + int nbits = 0; + const uint8 *s1 = (const uint8 *) m1; + const uint8 *s2 = (const uint8 *) m2; + for (int i = 0; i < num_bytes; i++) + nbits += num_bits[(*s1++) ^ (*s2++)]; + return nbits; +} + +int Bits::CappedDifference(const void *m1, const void *m2, + int num_bytes, int cap) { + int nbits = 0; + const uint8 *s1 = (const uint8 *) m1; + const uint8 *s2 = (const uint8 *) m2; + for (int i = 0; i < num_bytes && nbits <= cap; i++) + nbits += num_bits[(*s1++) ^ (*s2++)]; + return nbits; +} + +int Bits::Log2Floor_Portable(uint32 n) { + if (n == 0) + return -1; + int log = 0; + uint32 value = n; + for (int i = 4; i >= 0; --i) { + int shift = (1 << i); + uint32 x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + assert(value == 1); + return log; +} + +int Bits::Log2Ceiling(uint32 n) { + int floor = 
Log2Floor(n); + if (n == (n &~ (n - 1))) // zero or a power of two + return floor; + else + return floor + 1; +} + +int Bits::Log2Ceiling64(uint64 n) { + int floor = Log2Floor64(n); + if (n == (n &~ (n - 1))) // zero or a power of two + return floor; + else + return floor + 1; +} + +int Bits::FindLSBSetNonZero_Portable(uint32 n) { + int rc = 31; + for (int i = 4, shift = 1 << 4; i >= 0; --i) { + const uint32 x = n << shift; + if (x != 0) { + n = x; + rc -= shift; + } + shift >>= 1; + } + return rc; +} diff --git a/src/kudu/gutil/bits.h b/src/kudu/gutil/bits.h new file mode 100644 index 000000000000..639186ec78ee --- /dev/null +++ b/src/kudu/gutil/bits.h @@ -0,0 +1,267 @@ +// Copyright 2002 and onwards Google Inc. +// +// A collection of useful (static) bit-twiddling functions. + +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" + +#ifndef _BITS_H_ +#define _BITS_H_ + +class Bits { + public: + // Return the number of one bits in the given integer. + static int CountOnesInByte(unsigned char n); + + static int CountOnes(uint32 n) { + n -= ((n >> 1) & 0x55555555); + n = ((n >> 2) & 0x33333333) + (n & 0x33333333); + return (((n + (n >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; + } + + // Count bits using sideways addition [WWG'57]. See Knuth TAOCP v4 7.1.3(59) + static inline int CountOnes64(uint64 n) { +#if defined(__x86_64__) + n -= (n >> 1) & 0x5555555555555555ULL; + n = ((n >> 2) & 0x3333333333333333ULL) + (n & 0x3333333333333333ULL); + return (((n + (n >> 4)) & 0xF0F0F0F0F0F0F0FULL) + * 0x101010101010101ULL) >> 56; +#else + return CountOnes(n >> 32) + CountOnes(n & 0xffffffff); +#endif + } + + // Count bits using popcnt instruction (available on argo machines). + // Doesn't check if the instruction exists. + // Please use TestCPUFeature(POPCNT) from base/cpuid/cpuid.h before using this. 
+ static inline int CountOnes64withPopcount(uint64 n) { +#if defined(__x86_64__) && defined __GNUC__ + int64 count = 0; + asm("popcnt %1,%0" : "=r"(count) : "rm"(n) : "cc"); + return count; +#else + return CountOnes64(n); +#endif + } + + // Reverse the bits in the given integer. + static uint8 ReverseBits8(uint8 n); + static uint32 ReverseBits32(uint32 n); + static uint64 ReverseBits64(uint64 n); + + // Return the number of one bits in the byte sequence. + static int Count(const void *m, int num_bytes); + + // Return the number of different bits in the given byte sequences. + // (i.e., the Hamming distance) + static int Difference(const void *m1, const void *m2, int num_bytes); + + // Return the number of different bits in the given byte sequences, + // up to a maximum. Values larger than the maximum may be returned + // (because multiple bits are checked at a time), but the function + // may exit early if the cap is exceeded. + static int CappedDifference(const void *m1, const void *m2, + int num_bytes, int cap); + + // Return floor(log2(n)) for positive integer n. Returns -1 iff n == 0. + static int Log2Floor(uint32 n); + static int Log2Floor64(uint64 n); + + // Potentially faster version of Log2Floor() that returns an + // undefined value if n == 0 + static int Log2FloorNonZero(uint32 n); + static int Log2FloorNonZero64(uint64 n); + + // Return ceiling(log2(n)) for positive integer n. Returns -1 iff n == 0. + static int Log2Ceiling(uint32 n); + static int Log2Ceiling64(uint64 n); + + // Return the first set least / most significant bit, 0-indexed. Returns an + // undefined value if n == 0. FindLSBSetNonZero() is similar to ffs() except + // that it's 0-indexed, while FindMSBSetNonZero() is the same as + // Log2FloorNonZero(). 
+ static int FindLSBSetNonZero(uint32 n); + static int FindLSBSetNonZero64(uint64 n); + static int FindMSBSetNonZero(uint32 n) { return Log2FloorNonZero(n); } + static int FindMSBSetNonZero64(uint64 n) { return Log2FloorNonZero64(n); } + + // Portable implementations + static int Log2Floor_Portable(uint32 n); + static int Log2FloorNonZero_Portable(uint32 n); + static int FindLSBSetNonZero_Portable(uint32 n); + static int Log2Floor64_Portable(uint64 n); + static int Log2FloorNonZero64_Portable(uint64 n); + static int FindLSBSetNonZero64_Portable(uint64 n); + + // Viewing bytes as a stream of unsigned bytes, does that stream + // contain any byte equal to c? + template static bool BytesContainByte(T bytes, uint8 c); + + // Viewing bytes as a stream of unsigned bytes, does that stream + // contain any byte b < c? + template static bool BytesContainByteLessThan(T bytes, uint8 c); + + // Viewing bytes as a stream of unsigned bytes, are all elements of that + // stream in [lo, hi]? + template static bool BytesAllInRange(T bytes, uint8 lo, uint8 hi); + + private: + static const char num_bits[]; + static const unsigned char bit_reverse_table[]; + DISALLOW_COPY_AND_ASSIGN(Bits); +}; + +// A utility class for some handy bit patterns. The names l and h +// were chosen to match Knuth Volume 4: l is 0x010101... and h is 0x808080...; +// half_ones is ones in the lower half only. We assume sizeof(T) is 1 or even. +template struct BitPattern { + static const T half_ones = (static_cast(1) << (sizeof(T)*4)) - 1; + static const T l = (sizeof(T) == 1) ? 
1 : + (half_ones / 0xff * (half_ones + 2)); + static const T h = ~(l * 0x7f); +}; + +// ------------------------------------------------------------------------ +// Implementation details follow +// ------------------------------------------------------------------------ + +// use GNU builtins where available +#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +inline int Bits::Log2Floor(uint32 n) { + return n == 0 ? -1 : 31 ^ __builtin_clz(n); +} + +inline int Bits::Log2FloorNonZero(uint32 n) { + return 31 ^ __builtin_clz(n); +} + +inline int Bits::FindLSBSetNonZero(uint32 n) { + return __builtin_ctz(n); +} + +inline int Bits::Log2Floor64(uint64 n) { + return n == 0 ? -1 : 63 ^ __builtin_clzll(n); +} + +inline int Bits::Log2FloorNonZero64(uint64 n) { + return 63 ^ __builtin_clzll(n); +} + +inline int Bits::FindLSBSetNonZero64(uint64 n) { + return __builtin_ctzll(n); +} +#elif defined(_MSC_VER) +#include "kudu/gutil/bits-internal-windows.h" +#else +#include "kudu/gutil/bits-internal-unknown.h" +#endif + +inline int Bits::CountOnesInByte(unsigned char n) { + return num_bits[n]; +} + +inline uint8 Bits::ReverseBits8(unsigned char n) { + n = ((n >> 1) & 0x55) | ((n & 0x55) << 1); + n = ((n >> 2) & 0x33) | ((n & 0x33) << 2); + return ((n >> 4) & 0x0f) | ((n & 0x0f) << 4); +} + +inline uint32 Bits::ReverseBits32(uint32 n) { + n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1); + n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2); + n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4); + n = ((n >> 8) & 0x00FF00FF) | ((n & 0x00FF00FF) << 8); + return ( n >> 16 ) | ( n << 16); +} + +inline uint64 Bits::ReverseBits64(uint64 n) { +#if defined(__x86_64__) + n = ((n >> 1) & 0x5555555555555555ULL) | ((n & 0x5555555555555555ULL) << 1); + n = ((n >> 2) & 0x3333333333333333ULL) | ((n & 0x3333333333333333ULL) << 2); + n = ((n >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((n & 0x0F0F0F0F0F0F0F0FULL) << 4); + n = ((n >> 8) & 0x00FF00FF00FF00FFULL) | 
((n & 0x00FF00FF00FF00FFULL) << 8); + n = ((n >> 16) & 0x0000FFFF0000FFFFULL) | ((n & 0x0000FFFF0000FFFFULL) << 16); + return ( n >> 32 ) | ( n << 32); +#else + return ReverseBits32( n >> 32 ) | + (static_cast(ReverseBits32( n & 0xffffffff )) << 32); +#endif +} + +inline int Bits::Log2FloorNonZero_Portable(uint32 n) { + // Just use the common routine + return Log2Floor(n); +} + +// Log2Floor64() is defined in terms of Log2Floor32(), Log2FloorNonZero32() +inline int Bits::Log2Floor64_Portable(uint64 n) { + const uint32 topbits = static_cast(n >> 32); + if (topbits == 0) { + // Top bits are zero, so scan in bottom bits + return Log2Floor(static_cast(n)); + } else { + return 32 + Log2FloorNonZero(topbits); + } +} + +// Log2FloorNonZero64() is defined in terms of Log2FloorNonZero32() +inline int Bits::Log2FloorNonZero64_Portable(uint64 n) { + const uint32 topbits = static_cast(n >> 32); + if (topbits == 0) { + // Top bits are zero, so scan in bottom bits + return Log2FloorNonZero(static_cast(n)); + } else { + return 32 + Log2FloorNonZero(topbits); + } +} + +// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero() +inline int Bits::FindLSBSetNonZero64_Portable(uint64 n) { + const uint32 bottombits = static_cast(n); + if (bottombits == 0) { + // Bottom bits are zero, so scan in top bits + return 32 + FindLSBSetNonZero(static_cast(n >> 32)); + } else { + return FindLSBSetNonZero(bottombits); + } +} + +template +inline bool Bits::BytesContainByteLessThan(T bytes, uint8 c) { + T l = BitPattern::l; + T h = BitPattern::h; + // The c <= 0x80 code is straight out of Knuth Volume 4. + // Usually c will be manifestly constant. + return c <= 0x80 ? + ((h & (bytes - l * c) & ~bytes) != 0) : + ((((bytes - l * c) | (bytes ^ h)) & h) != 0); +} + +template inline bool Bits::BytesContainByte(T bytes, uint8 c) { + // Usually c will be manifestly constant. 
+ return Bits::BytesContainByteLessThan(bytes ^ (c * BitPattern::l), 1); +} + +template +inline bool Bits::BytesAllInRange(T bytes, uint8 lo, uint8 hi) { + T l = BitPattern::l; + T h = BitPattern::h; + // In the common case, lo and hi are manifest constants. + if (lo > hi) { + return false; + } + if (hi - lo < 128) { + T x = bytes - l * lo; + T y = bytes + l * (127 - hi); + return ((x | y) & h) == 0; + } + return !Bits::BytesContainByteLessThan(bytes + (255 - hi) * l, + lo + (255 - hi)); +} + +#endif // _BITS_H_ diff --git a/src/kudu/gutil/callback.h b/src/kudu/gutil/callback.h new file mode 100644 index 000000000000..1a41622edae5 --- /dev/null +++ b/src/kudu/gutil/callback.h @@ -0,0 +1,765 @@ +// This file was GENERATED by command: +// pump.py callback.h.pump +// DO NOT EDIT BY HAND!!! + + +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_CALLBACK_H_ +#define KUDU_GUTIL_CALLBACK_H_ + +#include "kudu/gutil/callback_forward.h" +#include "kudu/gutil/callback_internal.h" +#include "kudu/gutil/template_util.h" + +// NOTE: Header files that do not require the full definition of Callback or +// Closure should #include "kudu/gutil/callback_forward.h" instead of this file. + +// ----------------------------------------------------------------------------- +// Introduction +// ----------------------------------------------------------------------------- +// +// The templated Callback class is a generalized function object. Together +// with the Bind() function in bind.h, they provide a type-safe method for +// performing partial application of functions. +// +// Partial application (or "currying") is the process of binding a subset of +// a function's arguments to produce another function that takes fewer +// arguments. 
This can be used to pass around a unit of delayed execution, +// much like lexical closures are used in other languages. For example, it +// is used in Chromium code to schedule tasks on different MessageLoops. +// +// A callback with no unbound input parameters (kudu::Callback) +// is called a kudu::Closure. Note that this is NOT the same as what other +// languages refer to as a closure -- it does not retain a reference to its +// enclosing environment. +// +// MEMORY MANAGEMENT AND PASSING +// +// The Callback objects themselves should be passed by const-reference, and +// stored by copy. They internally store their state via a refcounted class +// and thus do not need to be deleted. +// +// The reason to pass via a const-reference is to avoid unnecessary +// AddRef/Release pairs to the internal state. +// +// +// ----------------------------------------------------------------------------- +// Quick reference for basic stuff +// ----------------------------------------------------------------------------- +// +// BINDING A BARE FUNCTION +// +// int Return5() { return 5; } +// kudu::Callback func_cb = kudu::Bind(&Return5); +// LOG(INFO) << func_cb.Run(); // Prints 5. +// +// BINDING A CLASS METHOD +// +// The first argument to bind is the member function to call, the second is +// the object on which to call it. +// +// class Ref : public kudu::RefCountedThreadSafe { +// public: +// int Foo() { return 3; } +// void PrintBye() { LOG(INFO) << "bye."; } +// }; +// scoped_refptr ref = new Ref(); +// kudu::Callback ref_cb = kudu::Bind(&Ref::Foo, ref); +// LOG(INFO) << ref_cb.Run(); // Prints out 3. +// +// By default the object must support RefCounted or you will get a compiler +// error. If you're passing between threads, be sure it's +// RefCountedThreadSafe! See "Advanced binding of member functions" below if +// you don't want to use reference counting. 
+// +// RUNNING A CALLBACK +// +// Callbacks can be run with their "Run" method, which has the same +// signature as the template argument to the callback. +// +// void DoSomething(const kudu::Callback& callback) { +// callback.Run(5, "hello"); +// } +// +// Callbacks can be run more than once (they don't get deleted or marked when +// run). However, this precludes using kudu::Passed (see below). +// +// void DoSomething(const kudu::Callback& callback) { +// double myresult = callback.Run(3.14159); +// myresult += callback.Run(2.71828); +// } +// +// PASSING UNBOUND INPUT PARAMETERS +// +// Unbound parameters are specified at the time a callback is Run(). They are +// specified in the Callback template type: +// +// void MyFunc(int i, const std::string& str) {} +// kudu::Callback cb = kudu::Bind(&MyFunc); +// cb.Run(23, "hello, world"); +// +// PASSING BOUND INPUT PARAMETERS +// +// Bound parameters are specified when you create the callback as arguments +// to Bind(). They will be passed to the function and the Run()ner of the +// callback doesn't see those values or even know that the function it's +// calling. +// +// void MyFunc(int i, const std::string& str) {} +// kudu::Callback cb = kudu::Bind(&MyFunc, 23, "hello world"); +// cb.Run(); +// +// A callback with no unbound input parameters (kudu::Callback) +// is called a kudu::Closure. So we could have also written: +// +// kudu::Closure cb = kudu::Bind(&MyFunc, 23, "hello world"); +// +// When calling member functions, bound parameters just go after the object +// pointer. +// +// kudu::Closure cb = kudu::Bind(&MyClass::MyFunc, this, 23, "hello world"); +// +// PARTIAL BINDING OF PARAMETERS +// +// You can specify some parameters when you create the callback, and specify +// the rest when you execute the callback. 
+// +// void MyFunc(int i, const std::string& str) {} +// kudu::Callback cb = kudu::Bind(&MyFunc, 23); +// cb.Run("hello world"); +// +// When calling a function bound parameters are first, followed by unbound +// parameters. +// +// +// ----------------------------------------------------------------------------- +// Quick reference for advanced binding +// ----------------------------------------------------------------------------- +// +// BINDING A CLASS METHOD WITH WEAK POINTERS +// +// kudu::Bind(&MyClass::Foo, GetWeakPtr()); +// +// The callback will not be issued if the object is destroyed at the time +// it's issued. DANGER: weak pointers are not threadsafe, so don't use this +// when passing between threads! +// +// BINDING A CLASS METHOD WITH MANUAL LIFETIME MANAGEMENT +// +// kudu::Bind(&MyClass::Foo, kudu::Unretained(this)); +// +// This disables all lifetime management on the object. You're responsible +// for making sure the object is alive at the time of the call. You break it, +// you own it! +// +// BINDING A CLASS METHOD AND HAVING THE CALLBACK OWN THE CLASS +// +// MyClass* myclass = new MyClass; +// kudu::Bind(&MyClass::Foo, kudu::Owned(myclass)); +// +// The object will be deleted when the callback is destroyed, even if it's +// not run (like if you post a task during shutdown). Potentially useful for +// "fire and forget" cases. +// +// IGNORING RETURN VALUES +// +// Sometimes you want to call a function that returns a value in a callback +// that doesn't expect a return value. +// +// int DoSomething(int arg) { cout << arg << endl; } +// kudu::Callback) cb = +// kudu::Bind(kudu::IgnoreResult(&DoSomething)); +// +// +// ----------------------------------------------------------------------------- +// Quick reference for binding parameters to Bind() +// ----------------------------------------------------------------------------- +// +// Bound parameters are specified as arguments to Bind() and are passed to the +// function. 
A callback with no parameters or no unbound parameters is called a +// Closure (kudu::Callback and kudu::Closure are the same thing). +// +// PASSING PARAMETERS OWNED BY THE CALLBACK +// +// void Foo(int* arg) { cout << *arg << endl; } +// int* pn = new int(1); +// kudu::Closure foo_callback = kudu::Bind(&foo, kudu::Owned(pn)); +// +// The parameter will be deleted when the callback is destroyed, even if it's +// not run (like if you post a task during shutdown). +// +// PASSING PARAMETERS AS A scoped_ptr +// +// void TakesOwnership(scoped_ptr arg) {} +// scoped_ptr f(new Foo); +// // f becomes null during the following call. +// kudu::Closure cb = kudu::Bind(&TakesOwnership, kudu::Passed(&f)); +// +// Ownership of the parameter will be with the callback until the it is run, +// when ownership is passed to the callback function. This means the callback +// can only be run once. If the callback is never run, it will delete the +// object when it's destroyed. +// +// PASSING PARAMETERS AS A scoped_refptr +// +// void TakesOneRef(scoped_refptr arg) {} +// scoped_refptr f(new Foo) +// kudu::Closure cb = kudu::Bind(&TakesOneRef, f); +// +// This should "just work." The closure will take a reference as long as it +// is alive, and another reference will be taken for the called function. +// +// PASSING PARAMETERS BY REFERENCE +// +// void foo(int arg) { cout << arg << endl } +// int n = 1; +// kudu::Closure has_ref = kudu::Bind(&foo, kudu::ConstRef(n)); +// n = 2; +// has_ref.Run(); // Prints "2" +// +// Normally parameters are copied in the closure. DANGER: ConstRef stores a +// const reference instead, referencing the original parameter. This means +// that you must ensure the object outlives the callback! 
+// +// +// ----------------------------------------------------------------------------- +// Implementation notes +// ----------------------------------------------------------------------------- +// +// WHERE IS THIS DESIGN FROM: +// +// The design Callback and Bind is heavily influenced by C++'s +// tr1::function/tr1::bind, and by the "Google Callback" system used inside +// Google. +// +// +// HOW THE IMPLEMENTATION WORKS: +// +// There are three main components to the system: +// 1) The Callback classes. +// 2) The Bind() functions. +// 3) The arguments wrappers (e.g., Unretained() and ConstRef()). +// +// The Callback classes represent a generic function pointer. Internally, +// it stores a refcounted piece of state that represents the target function +// and all its bound parameters. Each Callback specialization has a templated +// constructor that takes an BindState<>*. In the context of the constructor, +// the static type of this BindState<> pointer uniquely identifies the +// function it is representing, all its bound parameters, and a Run() method +// that is capable of invoking the target. +// +// Callback's constructor takes the BindState<>* that has the full static type +// and erases the target function type as well as the types of the bound +// parameters. It does this by storing a pointer to the specific Run() +// function, and upcasting the state of BindState<>* to a +// BindStateBase*. This is safe as long as this BindStateBase pointer +// is only used with the stored Run() pointer. +// +// To BindState<> objects are created inside the Bind() functions. 
+// These functions, along with a set of internal templates, are responsible for +// +// - Unwrapping the function signature into return type, and parameters +// - Determining the number of parameters that are bound +// - Creating the BindState storing the bound parameters +// - Performing compile-time asserts to avoid error-prone behavior +// - Returning an Callback<> with an arity matching the number of unbound +// parameters and that knows the correct refcounting semantics for the +// target object if we are binding a method. +// +// The Bind functions do the above using type-inference, and template +// specializations. +// +// By default Bind() will store copies of all bound parameters, and attempt +// to refcount a target object if the function being bound is a class method. +// These copies are created even if the function takes parameters as const +// references. (Binding to non-const references is forbidden, see bind.h.) +// +// To change this behavior, we introduce a set of argument wrappers +// (e.g., Unretained(), and ConstRef()). These are simple container templates +// that are passed by value, and wrap a pointer to argument. See the +// file-level comment in kudu/gutil/bind_helpers.h for more info. +// +// These types are passed to the Unwrap() functions, and the MaybeRefcount() +// functions respectively to modify the behavior of Bind(). The Unwrap() +// and MaybeRefcount() functions change behavior by doing partial +// specialization based on whether or not a parameter is a wrapper type. +// +// ConstRef() is similar to tr1::cref. Unretained() is specific to Chromium. +// +// +// WHY NOT TR1 FUNCTION/BIND? +// +// Direct use of tr1::function and tr1::bind was considered, but ultimately +// rejected because of the number of copy constructors invocations involved +// in the binding of arguments during construction, and the forwarding of +// arguments during invocation. 
These copies will no longer be an issue in +// C++0x because C++0x will support rvalue reference allowing for the compiler +// to avoid these copies. However, waiting for C++0x is not an option. +// +// Measured with valgrind on gcc version 4.4.3 (Ubuntu 4.4.3-4ubuntu5), the +// tr1::bind call itself will invoke a non-trivial copy constructor three times +// for each bound parameter. Also, each when passing a tr1::function, each +// bound argument will be copied again. +// +// In addition to the copies taken at binding and invocation, copying a +// tr1::function causes a copy to be made of all the bound parameters and +// state. +// +// Furthermore, in Chromium, it is desirable for the Callback to take a +// reference on a target object when representing a class method call. This +// is not supported by tr1. +// +// Lastly, tr1::function and tr1::bind has a more general and flexible API. +// This includes things like argument reordering by use of +// tr1::bind::placeholder, support for non-const reference parameters, and some +// limited amount of subtyping of the tr1::function object (e.g., +// tr1::function is convertible to tr1::function). +// +// These are not features that are required in Chromium. Some of them, such as +// allowing for reference parameters, and subtyping of functions, may actually +// become a source of errors. Removing support for these features actually +// allows for a simpler implementation, and a terser Currying API. +// +// +// WHY NOT GOOGLE CALLBACKS? +// +// The Google callback system also does not support refcounting. Furthermore, +// its implementation has a number of strange edge cases with respect to type +// conversion of its arguments. In particular, the argument's constness must +// at times match exactly the function signature, or the type-inference might +// break. Given the above, writing a custom solution was easier. +// +// +// MISSING FUNCTIONALITY +// - Invoking the return of Bind. 
Bind(&foo).Run() does not work; +// - Binding arrays to functions that take a non-const pointer. +// Example: +// void Foo(const char* ptr); +// void Bar(char* ptr); +// Bind(&Foo, "test"); +// Bind(&Bar, "test"); // This fails because ptr is not const. + +namespace kudu { + +// First, we forward declare the Callback class template. This informs the +// compiler that the template only has 1 type parameter which is the function +// signature that the Callback is representing. +// +// After this, create template specializations for 0-7 parameters. Note that +// even though the template typelist grows, the specialization still +// only has one type: the function signature. +// +// If you are thinking of forward declaring Callback in your own header file, +// please include "base/callback_forward.h" instead. +template +class Callback; + +namespace internal { +template +struct BindState; +} // namespace internal + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. 
+ PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run() const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get()); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. + PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1, A2); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. 
+ template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. + PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1, + typename internal::CallbackParamTraits::ForwardType a2) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1), + internal::CallbackForward(a2)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1, A2, A3); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. 
+ PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1, + typename internal::CallbackParamTraits::ForwardType a2, + typename internal::CallbackParamTraits::ForwardType a3) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1), + internal::CallbackForward(a2), + internal::CallbackForward(a3)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1, A2, A3, A4); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. 
+ PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1, + typename internal::CallbackParamTraits::ForwardType a2, + typename internal::CallbackParamTraits::ForwardType a3, + typename internal::CallbackParamTraits::ForwardType a4) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1), + internal::CallbackForward(a2), + internal::CallbackForward(a3), + internal::CallbackForward(a4)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1, A2, A3, A4, A5); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. 
+ PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1, + typename internal::CallbackParamTraits::ForwardType a2, + typename internal::CallbackParamTraits::ForwardType a3, + typename internal::CallbackParamTraits::ForwardType a4, + typename internal::CallbackParamTraits::ForwardType a5) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1), + internal::CallbackForward(a2), + internal::CallbackForward(a3), + internal::CallbackForward(a4), + internal::CallbackForward(a5)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1, A2, A3, A4, A5, A6); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. 
+ PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1, + typename internal::CallbackParamTraits::ForwardType a2, + typename internal::CallbackParamTraits::ForwardType a3, + typename internal::CallbackParamTraits::ForwardType a4, + typename internal::CallbackParamTraits::ForwardType a5, + typename internal::CallbackParamTraits::ForwardType a6) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1), + internal::CallbackForward(a2), + internal::CallbackForward(a3), + internal::CallbackForward(a4), + internal::CallbackForward(a5), + internal::CallbackForward(a6)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType); + +}; + +template +class Callback : public internal::CallbackBase { + public: + typedef R(RunType)(A1, A2, A3, A4, A5, A6, A7); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. 
+ PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run(typename internal::CallbackParamTraits::ForwardType a1, + typename internal::CallbackParamTraits::ForwardType a2, + typename internal::CallbackParamTraits::ForwardType a3, + typename internal::CallbackParamTraits::ForwardType a4, + typename internal::CallbackParamTraits::ForwardType a5, + typename internal::CallbackParamTraits::ForwardType a6, + typename internal::CallbackParamTraits::ForwardType a7) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get(), internal::CallbackForward(a1), + internal::CallbackForward(a2), + internal::CallbackForward(a3), + internal::CallbackForward(a4), + internal::CallbackForward(a5), + internal::CallbackForward(a6), + internal::CallbackForward(a7)); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType, + typename internal::CallbackParamTraits::ForwardType); + +}; + + +// Syntactic sugar to make Callbacks easier to declare since it +// will be used in a lot of APIs with delayed execution. +typedef Callback Closure; + +} // namespace kudu + +#endif // KUDU_GUTIL_CALLBACK_H diff --git a/src/kudu/gutil/callback.h.pump b/src/kudu/gutil/callback.h.pump new file mode 100644 index 000000000000..a2864d4be27d --- /dev/null +++ b/src/kudu/gutil/callback.h.pump @@ -0,0 +1,436 @@ +$$ This is a pump file for generating file templates. Pump is a python +$$ script that is part of the Google Test suite of utilities. 
Description +$$ can be found here: +$$ +$$ http://code.google.com/p/googletest/wiki/PumpManual +$$ + +$$ See comment for MAX_ARITY in kudu/gutil/bind.h.pump. +$var MAX_ARITY = 7 + +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_CALLBACK_H_ +#define KUDU_GUTIL_CALLBACK_H_ + +#include "kudu/gutil/callback_forward.h" +#include "kudu/gutil/callback_internal.h" +#include "kudu/gutil/template_util.h" + +// NOTE: Header files that do not require the full definition of Callback or +// Closure should #include "kudu/gutil/callback_forward.h" instead of this file. + +// ----------------------------------------------------------------------------- +// Introduction +// ----------------------------------------------------------------------------- +// +// The templated Callback class is a generalized function object. Together +// with the Bind() function in bind.h, they provide a type-safe method for +// performing partial application of functions. +// +// Partial application (or "currying") is the process of binding a subset of +// a function's arguments to produce another function that takes fewer +// arguments. This can be used to pass around a unit of delayed execution, +// much like lexical closures are used in other languages. For example, it +// is used in Chromium code to schedule tasks on different MessageLoops. +// +// A callback with no unbound input parameters (kudu::Callback) +// is called a kudu::Closure. Note that this is NOT the same as what other +// languages refer to as a closure -- it does not retain a reference to its +// enclosing environment. +// +// MEMORY MANAGEMENT AND PASSING +// +// The Callback objects themselves should be passed by const-reference, and +// stored by copy. They internally store their state via a refcounted class +// and thus do not need to be deleted. 
+// +// The reason to pass via a const-reference is to avoid unnecessary +// AddRef/Release pairs to the internal state. +// +// +// ----------------------------------------------------------------------------- +// Quick reference for basic stuff +// ----------------------------------------------------------------------------- +// +// BINDING A BARE FUNCTION +// +// int Return5() { return 5; } +// kudu::Callback func_cb = kudu::Bind(&Return5); +// LOG(INFO) << func_cb.Run(); // Prints 5. +// +// BINDING A CLASS METHOD +// +// The first argument to bind is the member function to call, the second is +// the object on which to call it. +// +// class Ref : public kudu::RefCountedThreadSafe { +// public: +// int Foo() { return 3; } +// void PrintBye() { LOG(INFO) << "bye."; } +// }; +// scoped_refptr ref = new Ref(); +// kudu::Callback ref_cb = kudu::Bind(&Ref::Foo, ref); +// LOG(INFO) << ref_cb.Run(); // Prints out 3. +// +// By default the object must support RefCounted or you will get a compiler +// error. If you're passing between threads, be sure it's +// RefCountedThreadSafe! See "Advanced binding of member functions" below if +// you don't want to use reference counting. +// +// RUNNING A CALLBACK +// +// Callbacks can be run with their "Run" method, which has the same +// signature as the template argument to the callback. +// +// void DoSomething(const kudu::Callback& callback) { +// callback.Run(5, "hello"); +// } +// +// Callbacks can be run more than once (they don't get deleted or marked when +// run). However, this precludes using kudu::Passed (see below). +// +// void DoSomething(const kudu::Callback& callback) { +// double myresult = callback.Run(3.14159); +// myresult += callback.Run(2.71828); +// } +// +// PASSING UNBOUND INPUT PARAMETERS +// +// Unbound parameters are specified at the time a callback is Run(). 
They are +// specified in the Callback template type: +// +// void MyFunc(int i, const std::string& str) {} +// kudu::Callback cb = kudu::Bind(&MyFunc); +// cb.Run(23, "hello, world"); +// +// PASSING BOUND INPUT PARAMETERS +// +// Bound parameters are specified when you create the callback as arguments +// to Bind(). They will be passed to the function and the Run()ner of the +// callback doesn't see those values or even know that the function it's +// calling. +// +// void MyFunc(int i, const std::string& str) {} +// kudu::Callback cb = kudu::Bind(&MyFunc, 23, "hello world"); +// cb.Run(); +// +// A callback with no unbound input parameters (kudu::Callback) +// is called a kudu::Closure. So we could have also written: +// +// kudu::Closure cb = kudu::Bind(&MyFunc, 23, "hello world"); +// +// When calling member functions, bound parameters just go after the object +// pointer. +// +// kudu::Closure cb = kudu::Bind(&MyClass::MyFunc, this, 23, "hello world"); +// +// PARTIAL BINDING OF PARAMETERS +// +// You can specify some parameters when you create the callback, and specify +// the rest when you execute the callback. +// +// void MyFunc(int i, const std::string& str) {} +// kudu::Callback cb = kudu::Bind(&MyFunc, 23); +// cb.Run("hello world"); +// +// When calling a function bound parameters are first, followed by unbound +// parameters. +// +// +// ----------------------------------------------------------------------------- +// Quick reference for advanced binding +// ----------------------------------------------------------------------------- +// +// BINDING A CLASS METHOD WITH WEAK POINTERS +// +// kudu::Bind(&MyClass::Foo, GetWeakPtr()); +// +// The callback will not be issued if the object is destroyed at the time +// it's issued. DANGER: weak pointers are not threadsafe, so don't use this +// when passing between threads! 
+// +// BINDING A CLASS METHOD WITH MANUAL LIFETIME MANAGEMENT +// +// kudu::Bind(&MyClass::Foo, kudu::Unretained(this)); +// +// This disables all lifetime management on the object. You're responsible +// for making sure the object is alive at the time of the call. You break it, +// you own it! +// +// BINDING A CLASS METHOD AND HAVING THE CALLBACK OWN THE CLASS +// +// MyClass* myclass = new MyClass; +// kudu::Bind(&MyClass::Foo, kudu::Owned(myclass)); +// +// The object will be deleted when the callback is destroyed, even if it's +// not run (like if you post a task during shutdown). Potentially useful for +// "fire and forget" cases. +// +// IGNORING RETURN VALUES +// +// Sometimes you want to call a function that returns a value in a callback +// that doesn't expect a return value. +// +// int DoSomething(int arg) { cout << arg << endl; } +// kudu::Callback) cb = +// kudu::Bind(kudu::IgnoreResult(&DoSomething)); +// +// +// ----------------------------------------------------------------------------- +// Quick reference for binding parameters to Bind() +// ----------------------------------------------------------------------------- +// +// Bound parameters are specified as arguments to Bind() and are passed to the +// function. A callback with no parameters or no unbound parameters is called a +// Closure (kudu::Callback and kudu::Closure are the same thing). +// +// PASSING PARAMETERS OWNED BY THE CALLBACK +// +// void Foo(int* arg) { cout << *arg << endl; } +// int* pn = new int(1); +// kudu::Closure foo_callback = kudu::Bind(&foo, kudu::Owned(pn)); +// +// The parameter will be deleted when the callback is destroyed, even if it's +// not run (like if you post a task during shutdown). +// +// PASSING PARAMETERS AS A scoped_ptr +// +// void TakesOwnership(scoped_ptr arg) {} +// scoped_ptr f(new Foo); +// // f becomes null during the following call. 
+// kudu::Closure cb = kudu::Bind(&TakesOwnership, kudu::Passed(&f)); +// +// Ownership of the parameter will be with the callback until the it is run, +// when ownership is passed to the callback function. This means the callback +// can only be run once. If the callback is never run, it will delete the +// object when it's destroyed. +// +// PASSING PARAMETERS AS A scoped_refptr +// +// void TakesOneRef(scoped_refptr arg) {} +// scoped_refptr f(new Foo) +// kudu::Closure cb = kudu::Bind(&TakesOneRef, f); +// +// This should "just work." The closure will take a reference as long as it +// is alive, and another reference will be taken for the called function. +// +// PASSING PARAMETERS BY REFERENCE +// +// void foo(int arg) { cout << arg << endl } +// int n = 1; +// kudu::Closure has_ref = kudu::Bind(&foo, kudu::ConstRef(n)); +// n = 2; +// has_ref.Run(); // Prints "2" +// +// Normally parameters are copied in the closure. DANGER: ConstRef stores a +// const reference instead, referencing the original parameter. This means +// that you must ensure the object outlives the callback! +// +// +// ----------------------------------------------------------------------------- +// Implementation notes +// ----------------------------------------------------------------------------- +// +// WHERE IS THIS DESIGN FROM: +// +// The design Callback and Bind is heavily influenced by C++'s +// tr1::function/tr1::bind, and by the "Google Callback" system used inside +// Google. +// +// +// HOW THE IMPLEMENTATION WORKS: +// +// There are three main components to the system: +// 1) The Callback classes. +// 2) The Bind() functions. +// 3) The arguments wrappers (e.g., Unretained() and ConstRef()). +// +// The Callback classes represent a generic function pointer. Internally, +// it stores a refcounted piece of state that represents the target function +// and all its bound parameters. Each Callback specialization has a templated +// constructor that takes an BindState<>*. 
In the context of the constructor, +// the static type of this BindState<> pointer uniquely identifies the +// function it is representing, all its bound parameters, and a Run() method +// that is capable of invoking the target. +// +// Callback's constructor takes the BindState<>* that has the full static type +// and erases the target function type as well as the types of the bound +// parameters. It does this by storing a pointer to the specific Run() +// function, and upcasting the state of BindState<>* to a +// BindStateBase*. This is safe as long as this BindStateBase pointer +// is only used with the stored Run() pointer. +// +// To BindState<> objects are created inside the Bind() functions. +// These functions, along with a set of internal templates, are responsible for +// +// - Unwrapping the function signature into return type, and parameters +// - Determining the number of parameters that are bound +// - Creating the BindState storing the bound parameters +// - Performing compile-time asserts to avoid error-prone behavior +// - Returning an Callback<> with an arity matching the number of unbound +// parameters and that knows the correct refcounting semantics for the +// target object if we are binding a method. +// +// The Bind functions do the above using type-inference, and template +// specializations. +// +// By default Bind() will store copies of all bound parameters, and attempt +// to refcount a target object if the function being bound is a class method. +// These copies are created even if the function takes parameters as const +// references. (Binding to non-const references is forbidden, see bind.h.) +// +// To change this behavior, we introduce a set of argument wrappers +// (e.g., Unretained(), and ConstRef()). These are simple container templates +// that are passed by value, and wrap a pointer to argument. See the +// file-level comment in kudu/gutil/bind_helpers.h for more info. 
+// +// These types are passed to the Unwrap() functions, and the MaybeRefcount() +// functions respectively to modify the behavior of Bind(). The Unwrap() +// and MaybeRefcount() functions change behavior by doing partial +// specialization based on whether or not a parameter is a wrapper type. +// +// ConstRef() is similar to tr1::cref. Unretained() is specific to Chromium. +// +// +// WHY NOT TR1 FUNCTION/BIND? +// +// Direct use of tr1::function and tr1::bind was considered, but ultimately +// rejected because of the number of copy constructors invocations involved +// in the binding of arguments during construction, and the forwarding of +// arguments during invocation. These copies will no longer be an issue in +// C++0x because C++0x will support rvalue reference allowing for the compiler +// to avoid these copies. However, waiting for C++0x is not an option. +// +// Measured with valgrind on gcc version 4.4.3 (Ubuntu 4.4.3-4ubuntu5), the +// tr1::bind call itself will invoke a non-trivial copy constructor three times +// for each bound parameter. Also, each when passing a tr1::function, each +// bound argument will be copied again. +// +// In addition to the copies taken at binding and invocation, copying a +// tr1::function causes a copy to be made of all the bound parameters and +// state. +// +// Furthermore, in Chromium, it is desirable for the Callback to take a +// reference on a target object when representing a class method call. This +// is not supported by tr1. +// +// Lastly, tr1::function and tr1::bind has a more general and flexible API. +// This includes things like argument reordering by use of +// tr1::bind::placeholder, support for non-const reference parameters, and some +// limited amount of subtyping of the tr1::function object (e.g., +// tr1::function is convertible to tr1::function). +// +// These are not features that are required in Chromium. 
Some of them, such as +// allowing for reference parameters, and subtyping of functions, may actually +// become a source of errors. Removing support for these features actually +// allows for a simpler implementation, and a terser Currying API. +// +// +// WHY NOT GOOGLE CALLBACKS? +// +// The Google callback system also does not support refcounting. Furthermore, +// its implementation has a number of strange edge cases with respect to type +// conversion of its arguments. In particular, the argument's constness must +// at times match exactly the function signature, or the type-inference might +// break. Given the above, writing a custom solution was easier. +// +// +// MISSING FUNCTIONALITY +// - Invoking the return of Bind. Bind(&foo).Run() does not work; +// - Binding arrays to functions that take a non-const pointer. +// Example: +// void Foo(const char* ptr); +// void Bar(char* ptr); +// Bind(&Foo, "test"); +// Bind(&Bar, "test"); // This fails because ptr is not const. + +namespace kudu { + +// First, we forward declare the Callback class template. This informs the +// compiler that the template only has 1 type parameter which is the function +// signature that the Callback is representing. +// +// After this, create template specializations for 0-$(MAX_ARITY) parameters. Note that +// even though the template typelist grows, the specialization still +// only has one type: the function signature. +// +// If you are thinking of forward declaring Callback in your own header file, +// please include "base/callback_forward.h" instead. 
+template +class Callback; + +namespace internal { +template +struct BindState; +} // namespace internal + + +$range ARITY 0..MAX_ARITY +$for ARITY [[ +$range ARG 1..ARITY + +$if ARITY == 0 [[ +template +class Callback : public internal::CallbackBase { +]] $else [[ +template +class Callback : public internal::CallbackBase { +]] + + public: + typedef R(RunType)($for ARG , [[A$(ARG)]]); + + Callback() : CallbackBase(NULL) { } + + // Note that this constructor CANNOT be explicit, and that Bind() CANNOT + // return the exact Callback<> type. See base/bind.h for details. + template + Callback(internal::BindState* bind_state) + : CallbackBase(bind_state) { + + // Force the assignment to a local variable of PolymorphicInvoke + // so the compiler will typecheck that the passed in Run() method has + // the correct type. + PolymorphicInvoke invoke_func = + &internal::BindState + ::InvokerType::Run; + polymorphic_invoke_ = reinterpret_cast(invoke_func); + } + + bool Equals(const Callback& other) const { + return CallbackBase::Equals(other); + } + + R Run($for ARG , + [[typename internal::CallbackParamTraits::ForwardType a$(ARG)]]) const { + PolymorphicInvoke f = + reinterpret_cast(polymorphic_invoke_); + + return f(bind_state_.get()[[]] +$if ARITY != 0 [[, ]] +$for ARG , + [[internal::CallbackForward(a$(ARG))]]); + } + + private: + typedef R(*PolymorphicInvoke)( + internal::BindStateBase*[[]] +$if ARITY != 0 [[, ]] +$for ARG , [[typename internal::CallbackParamTraits::ForwardType]]); + +}; + + +]] $$ for ARITY + +// Syntactic sugar to make Callbacks easier to declare since it +// will be used in a lot of APIs with delayed execution. +typedef Callback Closure; + +} // namespace kudu + +#endif // KUDU_GUTIL_CALLBACK_H diff --git a/src/kudu/gutil/callback_forward.h b/src/kudu/gutil/callback_forward.h new file mode 100644 index 000000000000..956ff737ff6b --- /dev/null +++ b/src/kudu/gutil/callback_forward.h @@ -0,0 +1,17 @@ +// Copyright (c) 2011 The Chromium Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_CALLBACK_FORWARD_H_ +#define KUDU_GUTIL_CALLBACK_FORWARD_H_ + +namespace kudu { + +template +class Callback; + +typedef Callback Closure; + +} // namespace kudu + +#endif // KUDU_GUTIL_CALLBACK_FORWARD_H diff --git a/src/kudu/gutil/callback_internal.cc b/src/kudu/gutil/callback_internal.cc new file mode 100644 index 000000000000..05b9e8f29920 --- /dev/null +++ b/src/kudu/gutil/callback_internal.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/gutil/callback_internal.h" + +namespace kudu { +namespace internal { + +bool CallbackBase::is_null() const { + return bind_state_.get() == nullptr; +} + +void CallbackBase::Reset() { + polymorphic_invoke_ = nullptr; + // NULL the bind_state_ last, since it may be holding the last ref to whatever + // object owns us, and we may be deleted after that. + bind_state_ = nullptr; +} + +bool CallbackBase::Equals(const CallbackBase& other) const { + return bind_state_.get() == other.bind_state_.get() && + polymorphic_invoke_ == other.polymorphic_invoke_; +} + +CallbackBase::CallbackBase(BindStateBase* bind_state) + : bind_state_(bind_state), + polymorphic_invoke_(nullptr) { + DCHECK(!bind_state_.get() || bind_state_->HasOneRef()); +} + +CallbackBase::~CallbackBase() { +} + +} // namespace internal +} // namespace kudu diff --git a/src/kudu/gutil/callback_internal.h b/src/kudu/gutil/callback_internal.h new file mode 100644 index 000000000000..47b56555f60f --- /dev/null +++ b/src/kudu/gutil/callback_internal.h @@ -0,0 +1,177 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +// This file contains utility functions and classes that help the +// implementation, and management of the Callback objects. + +#ifndef KUDU_GUTIL_CALLBACK_INTERNAL_H_ +#define KUDU_GUTIL_CALLBACK_INTERNAL_H_ + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" + +template +class ScopedVector; + +namespace kudu { +namespace internal { + +// BindStateBase is used to provide an opaque handle that the Callback +// class can use to represent a function object with bound arguments. It +// behaves as an existential type that is used by a corresponding +// DoInvoke function to perform the function execution. This allows +// us to shield the Callback class from the types of the bound argument via +// "type erasure." +class BindStateBase : public RefCountedThreadSafe { + protected: + friend class RefCountedThreadSafe; + virtual ~BindStateBase() {} +}; + +// Holds the Callback methods that don't require specialization to reduce +// template bloat. +class CallbackBase { + public: + // Returns true if Callback is null (doesn't refer to anything). + bool is_null() const; + + // Returns the Callback into an uninitialized state. + void Reset(); + + protected: + // In C++, it is safe to cast function pointers to function pointers of + // another type. It is not okay to use void*. We create a InvokeFuncStorage + // that that can store our function pointer, and then cast it back to + // the original type on usage. + typedef void(*InvokeFuncStorage)(void); + + // Returns true if this callback equals |other|. |other| may be null. + bool Equals(const CallbackBase& other) const; + + // Allow initializing of |bind_state_| via the constructor to avoid default + // initialization of the scoped_refptr. We do not also initialize + // |polymorphic_invoke_| here because doing a normal assignment in the + // derived Callback templates makes for much nicer compiler errors. 
+ explicit CallbackBase(BindStateBase* bind_state); + + // Force the destructor to be instantiated inside this translation unit so + // that our subclasses will not get inlined versions. Avoids more template + // bloat. + ~CallbackBase(); + + scoped_refptr bind_state_; + InvokeFuncStorage polymorphic_invoke_; +}; + +// A helper template to determine if given type is non-const move-only-type, +// i.e. if a value of the given type should be passed via .Pass() in a +// destructive way. +template struct IsMoveOnlyType { + template + static base::YesType Test(const typename U::MoveOnlyTypeForCPP03*); + + template + static base::NoType Test(...); + + static const bool value = sizeof(Test(0)) == sizeof(base::YesType) && + !base::is_const::value; +}; + +// This is a typetraits object that's used to take an argument type, and +// extract a suitable type for storing and forwarding arguments. +// +// In particular, it strips off references, and converts arrays to +// pointers for storage; and it avoids accidentally trying to create a +// "reference of a reference" if the argument is a reference type. +// +// This array type becomes an issue for storage because we are passing bound +// parameters by const reference. In this case, we end up passing an actual +// array type in the initializer list which C++ does not allow. This will +// break passing of C-string literals. +template ::value> +struct CallbackParamTraits { + typedef const T& ForwardType; + typedef T StorageType; +}; + +// The Storage should almost be impossible to trigger unless someone manually +// specifies type of the bind parameters. However, in case they do, +// this will guard against us accidentally storing a reference parameter. +// +// The ForwardType should only be used for unbound arguments. +template +struct CallbackParamTraits { + typedef T& ForwardType; + typedef T StorageType; +}; + +// Note that for array types, we implicitly add a const in the conversion. 
This +// means that it is not possible to bind array arguments to functions that take +// a non-const pointer. Trying to specialize the template based on a "const +// T[n]" does not seem to match correctly, so we are stuck with this +// restriction. +template +struct CallbackParamTraits { + typedef const T* ForwardType; + typedef const T* StorageType; +}; + +// See comment for CallbackParamTraits. +template +struct CallbackParamTraits { + typedef const T* ForwardType; + typedef const T* StorageType; +}; + +// Parameter traits for movable-but-not-copyable scopers. +// +// Callback<>/Bind() understands movable-but-not-copyable semantics where +// the type cannot be copied but can still have its state destructively +// transferred (aka. moved) to another instance of the same type by calling a +// helper function. When used with Bind(), this signifies transferal of the +// object's state to the target function. +// +// For these types, the ForwardType must not be a const reference, or a +// reference. A const reference is inappropriate, and would break const +// correctness, because we are implementing a destructive move. A non-const +// reference cannot be used with temporaries which means the result of a +// function or a cast would not be usable with Callback<> or Bind(). +template +struct CallbackParamTraits { + typedef T ForwardType; + typedef T StorageType; +}; + +// CallbackForward() is a very limited simulation of C++11's std::forward() +// used by the Callback/Bind system for a set of movable-but-not-copyable +// types. It is needed because forwarding a movable-but-not-copyable +// argument to another function requires us to invoke the proper move +// operator to create a rvalue version of the type. The supported types are +// whitelisted below as overloads of the CallbackForward() function. The +// default template compiles out to be a no-op. +// +// In C++11, std::forward would replace all uses of this function. 
However, it +// is impossible to implement a general std::forward with C++11 due to a lack +// of rvalue references. +// +// In addition to Callback/Bind, this is used by PostTaskAndReplyWithResult to +// simulate std::forward() and forward the result of one Callback as a +// parameter to another callback. This is to support Callbacks that return +// the movable-but-not-copyable types whitelisted above. +template +typename base::enable_if::value, T>::type& CallbackForward(T& t) { + return t; +} + +template +typename base::enable_if::value, T>::type CallbackForward(T& t) { + return t.Pass(); +} + +} // namespace internal +} // namespace kudu + +#endif // KUDU_GUTIL_CALLBACK_INTERNAL_H_ diff --git a/src/kudu/gutil/casts.h b/src/kudu/gutil/casts.h new file mode 100644 index 000000000000..474e083686af --- /dev/null +++ b/src/kudu/gutil/casts.h @@ -0,0 +1,392 @@ +// Copyright 2009 Google Inc. All Rights Reserved. +// +// Various Google-specific casting templates. +// +// This code is compiled directly on many platforms, including client +// platforms like Windows, Mac, and embedded systems. Before making +// any changes here, make sure that you're not breaking any platforms. +// + +#ifndef BASE_CASTS_H_ +#define BASE_CASTS_H_ + +#include // for use with down_cast<> +#include // for memcpy +#include // for enumeration casts and tests + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/template_util.h" +#include "kudu/gutil/type_traits.h" + +// Use implicit_cast as a safe version of static_cast or const_cast +// for implicit conversions. For example: +// - Upcasting in a type hierarchy. +// - Performing arithmetic conversions (int32 to int64, int to double, etc.). +// - Adding const or volatile qualifiers. 
+// +// In general, implicit_cast can be used to convert this code +// To to = from; +// DoSomething(to); +// to this +// DoSomething(implicit_cast(from)); +// +// base::identity_ is used to make a non-deduced context, which +// forces all callers to explicitly specify the template argument. +template +inline To implicit_cast(typename base::identity_::type to) { + return to; +} + +// This version of implicit_cast is used when two template arguments +// are specified. It's obsolete and should not be used. +template +inline To implicit_cast(typename base::identity_::type const &f) { + return f; +} + + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use implicit_cast<>, since upcasts +// always succeed. When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. +// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. + +template // use like this: down_cast(foo); +inline To down_cast(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. 
+ + // TODO(user): This should use COMPILE_ASSERT. + if (false) { + ::implicit_cast(NULL); + } + + // uses RTTI in dbg and fastbuild. asserts are disabled in opt builds. + assert(f == NULL || dynamic_cast(f) != NULL); + return static_cast(f); +} + +// Overload of down_cast for references. Use like this: down_cast(foo). +// The code is slightly convoluted because we're still using the pointer +// form of dynamic cast. (The reference form throws an exception if it +// fails.) +// +// There's no need for a special const overload either for the pointer +// or the reference form. If you call down_cast with a const T&, the +// compiler will just bind From to const T. +template +inline To down_cast(From& f) { + COMPILE_ASSERT(base::is_reference::value, target_type_not_a_reference); + typedef typename base::remove_reference::type* ToAsPointer; + if (false) { + // Compile-time check that To inherits from From. See above for details. + ::implicit_cast(NULL); + } + + assert(dynamic_cast(&f) != NULL); // RTTI: debug mode only + return static_cast(f); +} + +// bit_cast is a template function that implements the +// equivalent of "*reinterpret_cast(&source)". We need this in +// very low-level functions like the protobuf library and fast math +// support. +// +// float f = 3.14159265358979; +// int i = bit_cast(f); +// // i = 0x40490fdb +// +// The classical address-casting method is: +// +// // WRONG +// float f = 3.14159265358979; // WRONG +// int i = * reinterpret_cast(&f); // WRONG +// +// The address-casting method actually produces undefined behavior +// according to ISO C++ specification section 3.10 -15 -. Roughly, this +// section says: if an object in memory has one type, and a program +// accesses it with a different type, then the result is undefined +// behavior for most values of "different type". +// +// This is true for any cast syntax, either *(int*)&f or +// *reinterpret_cast(&f). 
And it is particularly true for +// conversions betweeen integral lvalues and floating-point lvalues. +// +// The purpose of 3.10 -15- is to allow optimizing compilers to assume +// that expressions with different types refer to different memory. gcc +// 4.0.1 has an optimizer that takes advantage of this. So a +// non-conforming program quietly produces wildly incorrect output. +// +// The problem is not the use of reinterpret_cast. The problem is type +// punning: holding an object in memory of one type and reading its bits +// back using a different type. +// +// The C++ standard is more subtle and complex than this, but that +// is the basic idea. +// +// Anyways ... +// +// bit_cast<> calls memcpy() which is blessed by the standard, +// especially by the example in section 3.9 . Also, of course, +// bit_cast<> wraps up the nasty logic in one place. +// +// Fortunately memcpy() is very fast. In optimized mode, with a +// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline +// code with the minimal amount of data movement. On a 32-bit system, +// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8) +// compiles to two loads and two stores. +// +// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1. +// +// WARNING: if Dest or Source is a non-POD type, the result of the memcpy +// is likely to surprise you. +// +// Props to Bill Gibbons for the compile time assertion technique and +// Art Komninos and Igor Tandetnik for the msvc experiments. +// +// -- mec 2005-10-17 + +template +inline Dest bit_cast(const Source& source) { + // Compile time assertion: sizeof(Dest) == sizeof(Source) + // A compile error here means your Dest and Source have different sizes. 
+ COMPILE_ASSERT(sizeof(Dest) == sizeof(Source), VerifySizesAreEqual); + + Dest dest; + memcpy(&dest, &source, sizeof(dest)); + return dest; +} + + +// **** Enumeration Casts and Tests +// +// C++ requires that the value of an integer that is converted to an +// enumeration be within the value bounds of the enumeration. Modern +// compilers can and do take advantage of this requirement to optimize +// programs. So, using a raw static_cast with enums can be bad. See +// +// The following templates and macros enable casting from an int to an enum +// with checking against the appropriate bounds. First, when defining an +// enumeration, identify the limits of the values of its enumerators. +// +// enum A { A_min = -18, A_max = 33 }; +// MAKE_ENUM_LIMITS(A, A_min, A_max) +// +// Convert an enum to an int in one of two ways. The prefered way is a +// tight conversion, which ensures that A_min <= value <= A_max. +// +// A var = tight_enum_cast(3); +// +// However, the C++ language defines the set of possible values for an +// enumeration to be essentially the range of a bitfield that can represent +// all the enumerators, i.e. those within the nearest containing power +// of two. In the example above, the nearest positive power of two is 64, +// and so the upper bound is 63. The nearest negative power of two is +// -32 and so the lower bound is -32 (two's complement), which is upgraded +// to match the upper bound, becoming -64. The values within this range +// of -64 to 63 are valid, according to the C++ standard. You can cast +// values within this range as follows. +// +// A var = loose_enum_cast(45); +// +// These casts will log a message if the value does not reside within the +// specified range, and will be fatal when in debug mode. +// +// For those times when an assert too strong, there are test functions. 
+// +// bool var = tight_enum_test(3); +// bool var = loose_enum_test(45); +// +// For code that needs to use the enumeration value if and only if +// it is good, there is a function that both tests and casts. +// +// int i = ....; +// A var; +// if (tight_enum_test_cast(i, &var)) +// .... // use valid var with value as indicated by i +// else +// .... // handle invalid enum cast +// +// The enum test/cast facility is currently limited to enumerations that +// fit within an int. It is also limited to two's complement ints. + +// ** Implementation Description +// +// The enum_limits template class captures the minimum and maximum +// enumerator. All uses of this template are intended to be of +// specializations, so the generic has a field to identify itself as +// not specialized. The test/cast templates assert specialization. + +template +class enum_limits { + public: + static const Enum min_enumerator = 0; + static const Enum max_enumerator = 0; + static const bool is_specialized = false; +}; + +// Now we define the macro to define the specialization for enum_limits. +// The specialization checks that the enumerators fit within an int. +// This checking relies on integral promotion. + +#define MAKE_ENUM_LIMITS(ENUM_TYPE, ENUM_MIN, ENUM_MAX) \ +template <> \ +class enum_limits { \ + public: \ + static const ENUM_TYPE min_enumerator = ENUM_MIN; \ + static const ENUM_TYPE max_enumerator = ENUM_MAX; \ + static const bool is_specialized = true; \ + COMPILE_ASSERT(ENUM_MIN >= INT_MIN, enumerator_too_negative_for_int); \ + COMPILE_ASSERT(ENUM_MAX <= INT_MAX, enumerator_too_positive_for_int); \ +}; + +// The loose enum test/cast is actually the more complicated one, +// because of the problem of finding the bounds. +// +// The unary upper bound, ub, on a positive number is its positive +// saturation, i.e. for a value v within pow(2,k-1) <= v < pow(2,k), +// the upper bound is pow(2,k)-1. 
+// +// The unary lower bound, lb, on a negative number is its negative +// saturation, i.e. for a value v within -pow(2,k) <= v < -pow(2,k-1), +// the lower bound is -pow(2,k). +// +// The actual bounds are (1) the binary upper bound over the maximum +// enumerator and the one's complement of a negative minimum enumerator +// and (2) the binary lower bound over the minimum enumerator and the +// one's complement of the positive maximum enumerator, except that if no +// enumerators are negative, the lower bound is zero. +// +// The algorithm relies heavily on the observation that +// +// a,b>0 then ub(a,b) == ub(a) | ub(b) == ub(a|b) +// a,b<0 then lb(a,b) == lb(a) & lb(b) == lb(a&b) +// +// Note that the compiler will boil most of this code away +// because of value propagation on the constant enumerator bounds. + +template +inline bool loose_enum_test(int e_val) { + COMPILE_ASSERT(enum_limits::is_specialized, missing_MAKE_ENUM_LIMITS); + const Enum e_min = enum_limits::min_enumerator; + const Enum e_max = enum_limits::max_enumerator; + COMPILE_ASSERT(sizeof(e_val) == 4 || sizeof(e_val) == 8, unexpected_int_size); + + // Find the unary bounding negative number of e_min and e_max. + + // Find the unary bounding negative number of e_max. + // This would be b_min = e_max < 0 ? e_max : ~e_max, + // but we want to avoid branches to help the compiler. + int e_max_sign = e_max >> (sizeof(e_val)*8 - 1); + int b_min = ~e_max_sign ^ e_max; + + // Find the binary bounding negative of both e_min and e_max. + b_min &= e_min; + + // However, if e_min is postive, the result will be positive. + // Now clear all bits right of the most significant clear bit, + // which is a negative saturation for negative numbers. + // In the case of positive numbers, this is flush to zero. 
+ b_min &= b_min >> 1; + b_min &= b_min >> 2; + b_min &= b_min >> 4; + b_min &= b_min >> 8; + b_min &= b_min >> 16; +#if INT_MAX > 2147483647 + b_min &= b_min >> 32; +#endif + + // Find the unary bounding positive number of e_max. + int b_max = e_max_sign ^ e_max; + + // Find the binary bounding postive number of that + // and the unary bounding positive number of e_min. + int e_min_sign = e_min >> (sizeof(e_val)*8 - 1); + b_max |= e_min_sign ^ e_min; + + // Now set all bits right of the most significant set bit, + // which is a postive saturation for positive numbers. + b_max |= b_max >> 1; + b_max |= b_max >> 2; + b_max |= b_max >> 4; + b_max |= b_max >> 8; + b_max |= b_max >> 16; +#if INT_MAX > 2147483647 + b_max |= b_max >> 32; +#endif + + // Finally test the bounds. + return b_min <= e_val && e_val <= b_max; +} + +template +inline bool tight_enum_test(int e_val) { + COMPILE_ASSERT(enum_limits::is_specialized, missing_MAKE_ENUM_LIMITS); + const Enum e_min = enum_limits::min_enumerator; + const Enum e_max = enum_limits::max_enumerator; + return e_min <= e_val && e_val <= e_max; +} + +template +inline bool loose_enum_test_cast(int e_val, Enum* e_var) { + if (loose_enum_test(e_val)) { + *e_var = static_cast(e_val); + return true; + } else { + return false; + } +} + +template +inline bool tight_enum_test_cast(int e_val, Enum* e_var) { + if (tight_enum_test(e_val)) { + *e_var = static_cast(e_val); + return true; + } else { + return false; + } +} + +namespace base { +namespace internal { + +inline void WarnEnumCastError(int value_of_int) { + LOG(DFATAL) << "Bad enum value " << value_of_int; +} + +} // namespace internal +} // namespace base + +template +inline Enum loose_enum_cast(int e_val) { + if (!loose_enum_test(e_val)) { + base::internal::WarnEnumCastError(e_val); + } + return static_cast(e_val); +} + +template +inline Enum tight_enum_cast(int e_val) { + if (!tight_enum_test(e_val)) { + base::internal::WarnEnumCastError(e_val); + } + return static_cast(e_val); 
+} + +#endif // BASE_CASTS_H_ diff --git a/src/kudu/gutil/charmap.h b/src/kudu/gutil/charmap.h new file mode 100644 index 000000000000..2698bbe48697 --- /dev/null +++ b/src/kudu/gutil/charmap.h @@ -0,0 +1,87 @@ +// Character Map Class +// +// Originally written by Daniel Dulitz +// Yanked out from url.cc on February 2003 by Wei-Hwa Huang +// +// Copyright (C) Google, 2001. +// +// A fast, bit-vector map for 8-bit unsigned characters. +// +// Internally stores 256 bits in an array of 8 uint32s. +// See changelist history for micro-optimization attempts. +// Does quick bit-flicking to lookup needed characters. +// +// This class is useful for non-character purposes as well. + +#ifndef UTIL_GTL_CHARMAP_H_ +#define UTIL_GTL_CHARMAP_H_ + +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/type_traits.h" + +class Charmap { + public: + // Initializes with given uint32 values. For instance, the first + // variable contains bits for values 0x1F (US) down to 0x00 (NUL). + Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3, + uint32 b4, uint32 b5, uint32 b6, uint32 b7) { + m_[0] = b0; + m_[1] = b1; + m_[2] = b2; + m_[3] = b3; + m_[4] = b4; + m_[5] = b5; + m_[6] = b6; + m_[7] = b7; + } + + // Initializes with a given char*. Note that NUL is not treated as + // a terminator, but rather a char to be flicked. + Charmap(const char* str, int len) { + Init(str, len); + } + + // Initializes with a given char*. NUL is treated as a terminator + // and will not be in the charmap. + explicit Charmap(const char* str) { + Init(str, strlen(str)); + } + + bool contains(unsigned char c) const { + return (m_[c >> 5] >> (c & 0x1f)) & 0x1; + } + + // Returns true if and only if a character exists in both maps. 
+ bool IntersectsWith(const Charmap & c) const { + for (int i = 0; i < 8; ++i) { + if ((m_[i] & c.m_[i]) != 0) + return true; + } + return false; + } + + bool IsZero() const { + for (uint32 c : m_) { + if (c != 0) + return false; + } + return true; + } + + protected: + uint32 m_[8]; + + void Init(const char* str, int len) { + memset(&m_, 0, sizeof m_); + for (int i = 0; i < len; ++i) { + unsigned char value = static_cast(str[i]); + m_[value >> 5] |= 1UL << (value & 0x1f); + } + } +}; +DECLARE_POD(Charmap); + +#endif // UTIL_GTL_CHARMAP_H_ diff --git a/src/kudu/gutil/cpu.cc b/src/kudu/gutil/cpu.cc new file mode 100644 index 000000000000..c6bf41f1a952 --- /dev/null +++ b/src/kudu/gutil/cpu.cc @@ -0,0 +1,289 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/gutil/cpu.h" + +#include +#include + +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/strings/stringpiece.h" + +#if defined(__x86_64__) +#if defined(_MSC_VER) +#include +#include // For _xgetbv() +#endif +#endif + +namespace base { + +CPU::CPU() + : signature_(0), + type_(0), + family_(0), + model_(0), + stepping_(0), + ext_model_(0), + ext_family_(0), + has_mmx_(false), + has_sse_(false), + has_sse2_(false), + has_sse3_(false), + has_ssse3_(false), + has_sse41_(false), + has_sse42_(false), + has_avx_(false), + has_avx2_(false), + has_aesni_(false), + has_non_stop_time_stamp_counter_(false), + has_broken_neon_(false), + cpu_vendor_("unknown") { + Initialize(); +} + +namespace { + +#if defined(__x86_64__) +#ifndef _MSC_VER + +#if defined(__pic__) && defined(__i386__) + +void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} + +#else + +void __cpuid(int cpu_info[4], int info_type) 
{ + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} + +#endif + +// _xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +uint64 _xgetbv(uint32 xcr) { + uint32 eax, edx; + + __asm__ volatile ( + "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (static_cast(edx) << 32) | eax; +} + +#endif // !_MSC_VER +#endif // __x86_64__ + +#if defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) +class LazyCpuInfoValue { + public: + LazyCpuInfoValue() : has_broken_neon_(false) { + // This function finds the value from /proc/cpuinfo under the key "model + // name" or "Processor". "model name" is used in Linux 3.8 and later (3.7 + // and later for arm64) and is shown once per CPU. "Processor" is used in + // earler versions and is shown only once at the top of /proc/cpuinfo + // regardless of the number CPUs. 
+ const char kModelNamePrefix[] = "model name\t: "; + const char kProcessorPrefix[] = "Processor\t: "; + + // This function also calculates whether we believe that this CPU has a + // broken NEON unit based on these fields from cpuinfo: + unsigned implementer = 0, architecture = 0, variant = 0, part = 0, + revision = 0; + const struct { + const char key[17]; + unsigned int* result; + } kUnsignedValues[] = { + {"CPU implementer", &implementer}, + {"CPU architecture", &architecture}, + {"CPU variant", &variant}, + {"CPU part", &part}, + {"CPU revision", &revision}, + }; + + std::string contents; + ReadFileToString(FilePath("/proc/cpuinfo"), &contents); + DCHECK(!contents.empty()); + if (contents.empty()) { + return; + } + + std::istringstream iss(contents); + std::string line; + while (std::getline(iss, line)) { + if (brand_.empty() && + (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0 || + line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0)) { + brand_.assign(line.substr(strlen(kModelNamePrefix))); + } + + for (size_t i = 0; i < arraysize(kUnsignedValues); i++) { + const char *key = kUnsignedValues[i].key; + const size_t len = strlen(key); + + if (line.compare(0, len, key) == 0 && + line.size() >= len + 1 && + (line[len] == '\t' || line[len] == ' ' || line[len] == ':')) { + size_t colon_pos = line.find(':', len); + if (colon_pos == std::string::npos) { + continue; + } + + const StringPiece line_sp(line); + StringPiece value_sp = line_sp.substr(colon_pos + 1); + while (!value_sp.empty() && + (value_sp[0] == ' ' || value_sp[0] == '\t')) { + value_sp = value_sp.substr(1); + } + + // The string may have leading "0x" or not, so we use strtoul to + // handle that. 
+ char* endptr; + std::string value(value_sp.as_string()); + unsigned long int result = strtoul(value.c_str(), &endptr, 0); + if (*endptr == 0 && result <= UINT_MAX) { + *kUnsignedValues[i].result = result; + } + } + } + } + + has_broken_neon_ = + implementer == 0x51 && + architecture == 7 && + variant == 1 && + part == 0x4d && + revision == 0; + } + + const std::string& brand() const { return brand_; } + bool has_broken_neon() const { return has_broken_neon_; } + + private: + std::string brand_; + bool has_broken_neon_; + DISALLOW_COPY_AND_ASSIGN(LazyCpuInfoValue); +}; + +base::LazyInstance::Leaky g_lazy_cpuinfo = + LAZY_INSTANCE_INITIALIZER; + +#endif // defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || + // defined(OS_LINUX)) + +} // anonymous namespace + +void CPU::Initialize() { +#if defined(__x86_64__) + int cpu_info[4] = {-1}; + char cpu_string[48]; + + // __cpuid with an InfoType argument of 0 returns the number of + // valid Ids in CPUInfo[0] and the CPU identification string in + // the other three array elements. The CPU identification string is + // not in linear order. The code below arranges the information + // in a human readable form. The human readable order is CPUInfo[1] | + // CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped + // before using memcpy to copy these three array elements to cpu_string. + __cpuid(cpu_info, 0); + int num_ids = cpu_info[0]; + std::swap(cpu_info[2], cpu_info[3]); + memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1])); + cpu_vendor_.assign(cpu_string, 3 * sizeof(cpu_info[1])); + + // Interpret CPU feature information. 
+ if (num_ids > 0) { + int cpu_info7[4] = {0}; + __cpuid(cpu_info, 1); + if (num_ids >= 7) { + __cpuid(cpu_info7, 7); + } + signature_ = cpu_info[0]; + stepping_ = cpu_info[0] & 0xf; + model_ = ((cpu_info[0] >> 4) & 0xf) + ((cpu_info[0] >> 12) & 0xf0); + family_ = (cpu_info[0] >> 8) & 0xf; + type_ = (cpu_info[0] >> 12) & 0x3; + ext_model_ = (cpu_info[0] >> 16) & 0xf; + ext_family_ = (cpu_info[0] >> 20) & 0xff; + has_mmx_ = (cpu_info[3] & 0x00800000) != 0; + has_sse_ = (cpu_info[3] & 0x02000000) != 0; + has_sse2_ = (cpu_info[3] & 0x04000000) != 0; + has_sse3_ = (cpu_info[2] & 0x00000001) != 0; + has_ssse3_ = (cpu_info[2] & 0x00000200) != 0; + has_sse41_ = (cpu_info[2] & 0x00080000) != 0; + has_sse42_ = (cpu_info[2] & 0x00100000) != 0; + // AVX instructions will generate an illegal instruction exception unless + // a) they are supported by the CPU, + // b) XSAVE is supported by the CPU and + // c) XSAVE is enabled by the kernel. + // See http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled + // + // In addition, we have observed some crashes with the xgetbv instruction + // even after following Intel's example code. (See crbug.com/375968.) + // Because of that, we also test the XSAVE bit because its description in + // the CPUID documentation suggests that it signals xgetbv support. + has_avx_ = + (cpu_info[2] & 0x10000000) != 0 && + (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && + (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && + (_xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; + has_aesni_ = (cpu_info[2] & 0x02000000) != 0; + has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0; + } + + // Get the brand string of the cpu. 
+ __cpuid(cpu_info, 0x80000000); + const int parameter_end = 0x80000004; + int max_parameter = cpu_info[0]; + + if (cpu_info[0] >= parameter_end) { + char* cpu_string_ptr = cpu_string; + + for (int parameter = 0x80000002; parameter <= parameter_end && + cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) { + __cpuid(cpu_info, parameter); + memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info)); + cpu_string_ptr += sizeof(cpu_info); + } + cpu_brand_.assign(cpu_string, cpu_string_ptr - cpu_string); + } + + const int parameter_containing_non_stop_time_stamp_counter = 0x80000007; + if (max_parameter >= parameter_containing_non_stop_time_stamp_counter) { + __cpuid(cpu_info, parameter_containing_non_stop_time_stamp_counter); + has_non_stop_time_stamp_counter_ = (cpu_info[3] & (1 << 8)) != 0; + } +#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) + cpu_brand_.assign(g_lazy_cpuinfo.Get().brand()); + has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon(); +#else + #error unknown architecture +#endif +} + +CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { + if (has_avx2()) return AVX2; + if (has_avx()) return AVX; + if (has_sse42()) return SSE42; + if (has_sse41()) return SSE41; + if (has_ssse3()) return SSSE3; + if (has_sse3()) return SSE3; + if (has_sse2()) return SSE2; + if (has_sse()) return SSE; + return PENTIUM; +} + +} // namespace base diff --git a/src/kudu/gutil/cpu.h b/src/kudu/gutil/cpu.h new file mode 100644 index 000000000000..65498140d172 --- /dev/null +++ b/src/kudu/gutil/cpu.h @@ -0,0 +1,90 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_CPU_H_ +#define BASE_CPU_H_ + +#include + +namespace base { + +// Query information about the processor. 
+class CPU { + public: + // Constructor + CPU(); + + enum IntelMicroArchitecture { + PENTIUM, + SSE, + SSE2, + SSE3, + SSSE3, + SSE41, + SSE42, + AVX, + AVX2, + MAX_INTEL_MICRO_ARCHITECTURE + }; + + // Accessors for CPU information. + const std::string& vendor_name() const { return cpu_vendor_; } + int signature() const { return signature_; } + int stepping() const { return stepping_; } + int model() const { return model_; } + int family() const { return family_; } + int type() const { return type_; } + int extended_model() const { return ext_model_; } + int extended_family() const { return ext_family_; } + bool has_mmx() const { return has_mmx_; } + bool has_sse() const { return has_sse_; } + bool has_sse2() const { return has_sse2_; } + bool has_sse3() const { return has_sse3_; } + bool has_ssse3() const { return has_ssse3_; } + bool has_sse41() const { return has_sse41_; } + bool has_sse42() const { return has_sse42_; } + bool has_avx() const { return has_avx_; } + bool has_avx2() const { return has_avx2_; } + bool has_aesni() const { return has_aesni_; } + bool has_non_stop_time_stamp_counter() const { + return has_non_stop_time_stamp_counter_; + } + // has_broken_neon is only valid on ARM chips. If true, it indicates that we + // believe that the NEON unit on the current CPU is flawed and cannot execute + // some code. See https://code.google.com/p/chromium/issues/detail?id=341598 + bool has_broken_neon() const { return has_broken_neon_; } + + IntelMicroArchitecture GetIntelMicroArchitecture() const; + const std::string& cpu_brand() const { return cpu_brand_; } + + private: + // Query the processor for CPUID information. 
+ void Initialize(); + + int signature_; // raw form of type, family, model, and stepping + int type_; // process type + int family_; // family of the processor + int model_; // model of processor + int stepping_; // processor revision number + int ext_model_; + int ext_family_; + bool has_mmx_; + bool has_sse_; + bool has_sse2_; + bool has_sse3_; + bool has_ssse3_; + bool has_sse41_; + bool has_sse42_; + bool has_avx_; + bool has_avx2_; + bool has_aesni_; + bool has_non_stop_time_stamp_counter_; + bool has_broken_neon_; + std::string cpu_vendor_; + std::string cpu_brand_; +}; + +} // namespace base + +#endif // BASE_CPU_H_ diff --git a/src/kudu/gutil/cycleclock-inl.h b/src/kudu/gutil/cycleclock-inl.h new file mode 100644 index 000000000000..77385923a254 --- /dev/null +++ b/src/kudu/gutil/cycleclock-inl.h @@ -0,0 +1,203 @@ +// Copyright (C) 1999-2007 Google, Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// All rights reserved. +// Extracted from base/timer.h by jrvb + +// The implementation of CycleClock::Now() +// See cycleclock.h +// +// IWYU pragma: private, include "base/cycleclock.h" + +// NOTE: only i386 and x86_64 have been well tested. 
+// PPC, sparc, alpha, and ia64 are based on +// http://peter.kuscsik.com/wordpress/?p=14 +// with modifications by m3b. See also +// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h + +#ifndef GUTIL_CYCLECLOCK_INL_H_ +#define GUTIL_CYCLECLOCK_INL_H_ + +#include + +#include "kudu/gutil/port.h" +#include "kudu/gutil/arm_instruction_set_select.h" + +// Please do not nest #if directives. Keep one section, and one #if per +// platform. + +// For historical reasons, the frequency on some platforms is scaled to be +// close to the platform's core clock frequency. This is not guaranteed by the +// interface, and may change in future implementations. + +// ---------------------------------------------------------------- +#if defined(__APPLE__) +#include +inline int64 CycleClock::Now() { + // this goes at the top because we need ALL Macs, regardless of + // architecture, to return the number of "mach time units" that + // have passed since startup. See sysinfo.cc where + // InitializeSystemInfo() sets the supposed cpu clock frequency of + // macs to the number of mach time units per second, not actual + // CPU clock frequency (which can change in the face of CPU + // frequency scaling). Also note that when the Mac sleeps, this + // counter pauses; it does not continue counting, nor does it + // reset to zero. 
+ return mach_absolute_time(); +} + +// ---------------------------------------------------------------- +#elif defined(__i386__) +inline int64 CycleClock::Now() { + int64 ret; + __asm__ volatile("rdtsc" : "=A" (ret)); + return ret; +} + +// ---------------------------------------------------------------- +#elif defined(__x86_64__) || defined(__amd64__) +inline int64 CycleClock::Now() { + uint64 low, high; + __asm__ volatile("rdtsc" : "=a" (low), "=d" (high)); + return (high << 32) | low; +} + +// ---------------------------------------------------------------- +#elif defined(__powerpc__) || defined(__ppc__) +#define SPR_TB 268 +#define SPR_TBU 269 +inline int64 CycleClock::Now() { + uint64 time_base_value; + if (sizeof(void*) == 8) { + // On PowerPC64, time base can be read with one SPR read. + asm volatile("mfspr %0, %1" : "=r" (time_base_value) : "i"(SPR_TB)); + } else { + uint32 tbl, tbu0, tbu1; + asm volatile (" mfspr %0, %3\n" + " mfspr %1, %4\n" + " mfspr %2, %3\n" : + "=r"(tbu0), "=r"(tbl), "=r"(tbu1) : + "i"(SPR_TBU), "i"(SPR_TB)); + // If there is a carry into the upper half, it is okay to return + // (tbu1, 0) since it must be between the 2 TBU reads. 
+ tbl &= -static_cast(tbu0 == tbu1); + // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) + time_base_value = + (static_cast(tbu1) << 32) | static_cast(tbl); + } + return static_cast(time_base_value); +} + +// ---------------------------------------------------------------- +#elif defined(__sparc__) +inline int64 CycleClock::Now() { + int64 tick; + asm(".byte 0x83, 0x41, 0x00, 0x00"); + asm("mov %%g1, %0" : "=r" (tick)); + return tick; +} + +// ---------------------------------------------------------------- +#elif defined(__ia64__) +inline int64 CycleClock::Now() { + int64 itc; + asm("mov %0 = ar.itc" : "=r" (itc)); + return itc; +} + +// ---------------------------------------------------------------- +#elif defined(_MSC_VER) && defined(_M_IX86) +inline int64 CycleClock::Now() { + // Older MSVC compilers (like 7.x) don't seem to support the + // __rdtsc intrinsic properly, so I prefer to use _asm instead + // when I know it will work. Otherwise, I'll use __rdtsc and hope + // the code is being compiled with a non-ancient compiler. + _asm rdtsc +} + +// ---------------------------------------------------------------- +#elif defined(_MSC_VER) +// For MSVC, we want to use '_asm rdtsc' when possible (since it works +// with even ancient MSVC compilers), and when not possible the +// __rdtsc intrinsic, declared in . Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation. +// Therefore, we simply declare __rdtsc ourselves. 
See also +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +extern "C" uint64 __rdtsc(); +#pragma intrinsic(__rdtsc) +inline int64 CycleClock::Now() { + return __rdtsc(); +} + +// ---------------------------------------------------------------- +#elif defined(ARMV6) // V6 is the earliest arm that has a standard cyclecount +#include "kudu/gutil/sysinfo.h" +inline int64 CycleClock::Now() { + uint32 pmccntr; + uint32 pmuseren; + uint32 pmcntenset; + // Read the user mode perf monitor counter access permissions. + asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren)); + if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. + asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset)); + if (pmcntenset & 0x80000000ul) { // Is it counting? + asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr)); + // The counter is set up to count every 64th cycle + return static_cast(pmccntr) * 64; // Should optimize to << 6 + } + } + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast((tv.tv_sec + tv.tv_usec * 0.000001) + * CyclesPerSecond()); +} + +// ---------------------------------------------------------------- +#elif defined(ARMV3) +#include "kudu/gutil/sysinfo.h" // for CyclesPerSecond() +inline int64 CycleClock::Now() { + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast((tv.tv_sec + tv.tv_usec * 0.000001) + * CyclesPerSecond()); +} + +// ---------------------------------------------------------------- +#elif defined(__mips__) +#include "kudu/gutil/sysinfo.h" +inline int64 CycleClock::Now() { + // mips apparently only allows rdtsc for superusers, so we fall + // back to gettimeofday. It's possible clock_gettime would be better. 
+ struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast((tv.tv_sec + tv.tv_usec * 0.000001) + * CyclesPerSecond()); +} + +// ---------------------------------------------------------------- +#else +// The soft failover to a generic implementation is automatic only for some +// platforms. For other platforms the developer is expected to make an attempt +// to create a fast implementation and use generic version if nothing better is +// available. +#error You need to define CycleTimer for your O/S and CPU +#endif + +#endif // GUTIL_CYCLECLOCK_INL_H_ diff --git a/src/kudu/gutil/dynamic_annotations.c b/src/kudu/gutil/dynamic_annotations.c new file mode 100644 index 000000000000..17a94da93ad3 --- /dev/null +++ b/src/kudu/gutil/dynamic_annotations.c @@ -0,0 +1,173 @@ +/* Copyright (c) 2008-2009, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Kostya Serebryany + */ + +#ifdef __cplusplus +# error "This file should be built as pure C to avoid name mangling" +#endif + +#include +#include + +#include "kudu/gutil/dynamic_annotations.h" + +#ifdef __GNUC__ +/* valgrind.h uses gcc extensions so it won't build with other compilers */ +#include "kudu/gutil/valgrind.h" +#endif + +/* Compiler-based ThreadSanitizer defines + DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL = 1 + and provides its own definitions of the functions. */ + +#ifndef DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL +# define DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL 0 +#endif + +/* Each function is empty and called (via a macro) only in debug mode. + The arguments are captured by dynamic tools at runtime. 
*/ + +#if DYNAMIC_ANNOTATIONS_ENABLED == 1 \ + && DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 + +void AnnotateRWLockCreate(const char *file, int line, + const volatile void *lock){} +void AnnotateRWLockDestroy(const char *file, int line, + const volatile void *lock){} +void AnnotateRWLockAcquired(const char *file, int line, + const volatile void *lock, long is_w){} +void AnnotateRWLockReleased(const char *file, int line, + const volatile void *lock, long is_w){} +void AnnotateBarrierInit(const char *file, int line, + const volatile void *barrier, long count, + long reinitialization_allowed) {} +void AnnotateBarrierWaitBefore(const char *file, int line, + const volatile void *barrier) {} +void AnnotateBarrierWaitAfter(const char *file, int line, + const volatile void *barrier) {} +void AnnotateBarrierDestroy(const char *file, int line, + const volatile void *barrier) {} + +void AnnotateCondVarWait(const char *file, int line, + const volatile void *cv, + const volatile void *lock){} +void AnnotateCondVarSignal(const char *file, int line, + const volatile void *cv){} +void AnnotateCondVarSignalAll(const char *file, int line, + const volatile void *cv){} +void AnnotatePublishMemoryRange(const char *file, int line, + const volatile void *address, + long size){} +void AnnotateUnpublishMemoryRange(const char *file, int line, + const volatile void *address, + long size){} +void AnnotatePCQCreate(const char *file, int line, + const volatile void *pcq){} +void AnnotatePCQDestroy(const char *file, int line, + const volatile void *pcq){} +void AnnotatePCQPut(const char *file, int line, + const volatile void *pcq){} +void AnnotatePCQGet(const char *file, int line, + const volatile void *pcq){} +void AnnotateNewMemory(const char *file, int line, + const volatile void *mem, + long size){} +void AnnotateExpectRace(const char *file, int line, + const volatile void *mem, + const char *description){} +void AnnotateBenignRace(const char *file, int line, + const volatile void *mem, + const 
char *description){} +void AnnotateBenignRaceSized(const char *file, int line, + const volatile void *mem, + long size, + const char *description) {} +void AnnotateMutexIsUsedAsCondVar(const char *file, int line, + const volatile void *mu){} +void AnnotateTraceMemory(const char *file, int line, + const volatile void *arg){} +void AnnotateThreadName(const char *file, int line, + const char *name){} +void AnnotateIgnoreReadsBegin(const char *file, int line){} +void AnnotateIgnoreReadsEnd(const char *file, int line){} +void AnnotateIgnoreWritesBegin(const char *file, int line){} +void AnnotateIgnoreWritesEnd(const char *file, int line){} +void AnnotateEnableRaceDetection(const char *file, int line, int enable){} +void AnnotateNoOp(const char *file, int line, + const volatile void *arg){} +void AnnotateFlushState(const char *file, int line){} + +#endif /* DYNAMIC_ANNOTATIONS_ENABLED == 1 + && DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 */ + +#if DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 + +static int GetRunningOnValgrind(void) { +#ifdef RUNNING_ON_VALGRIND + if (RUNNING_ON_VALGRIND) return 1; +#endif + char *running_on_valgrind_str = getenv("RUNNING_ON_VALGRIND"); + if (running_on_valgrind_str) { + return strcmp(running_on_valgrind_str, "0") != 0; + } + return 0; +} + +/* See the comments in dynamic_annotations.h */ +int RunningOnValgrind(void) { + static volatile int running_on_valgrind = -1; + int local_running_on_valgrind = running_on_valgrind; + /* C doesn't have thread-safe initialization of statics, and we + don't want to depend on pthread_once here, so hack it. */ + ANNOTATE_BENIGN_RACE(&running_on_valgrind, "safe hack"); + if (local_running_on_valgrind == -1) + running_on_valgrind = local_running_on_valgrind = GetRunningOnValgrind(); + return local_running_on_valgrind; +} + +/* See the comments in dynamic_annotations.h */ +double ValgrindSlowdown(void) { + /* Same initialization hack as in RunningOnValgrind(). 
*/ + static volatile double slowdown = 0.0; + double local_slowdown = slowdown; + ANNOTATE_BENIGN_RACE(&slowdown, "safe hack"); + if (RunningOnValgrind() == 0) { + return 1.0; + } + if (local_slowdown == 0.0) { + char *env = getenv("VALGRIND_SLOWDOWN"); + slowdown = local_slowdown = env ? atof(env) : 50.0; + } + return local_slowdown; +} + +#endif /* DYNAMIC_ANNOTATIONS_EXTERNAL_IMPL == 0 */ diff --git a/src/kudu/gutil/dynamic_annotations.h b/src/kudu/gutil/dynamic_annotations.h new file mode 100644 index 000000000000..4d69a3f38d4f --- /dev/null +++ b/src/kudu/gutil/dynamic_annotations.h @@ -0,0 +1,770 @@ +/* Copyright (c) 2008-2009, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Kostya Serebryany + */ + +/* This file defines dynamic annotations for use with dynamic analysis + tool such as valgrind, PIN, etc. + + Dynamic annotation is a source code annotation that affects + the generated code (that is, the annotation is not a comment). + Each such annotation is attached to a particular + instruction and/or to a particular object (address) in the program. + + The annotations that should be used by users are macros in all upper-case + (e.g., ANNOTATE_NEW_MEMORY). + + Actual implementation of these macros may differ depending on the + dynamic analysis tool being used. + + See http://code.google.com/p/data-race-test/ for more information. + + This file supports the following dynamic analysis tools: + - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero). + Macros are defined empty. + - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1). + Macros are defined as calls to non-inlinable empty functions + that are intercepted by Valgrind. 
*/ + +#ifndef __DYNAMIC_ANNOTATIONS_H__ +#define __DYNAMIC_ANNOTATIONS_H__ + +#ifndef DYNAMIC_ANNOTATIONS_ENABLED +# define DYNAMIC_ANNOTATIONS_ENABLED 0 +#endif + +#if DYNAMIC_ANNOTATIONS_ENABLED != 0 + + /* ------------------------------------------------------------- + Annotations useful when implementing condition variables such as CondVar, + using conditional critical sections (Await/LockWhen) and when constructing + user-defined synchronization mechanisms. + + The annotations ANNOTATE_HAPPENS_BEFORE() and ANNOTATE_HAPPENS_AFTER() can + be used to define happens-before arcs in user-defined synchronization + mechanisms: the race detector will infer an arc from the former to the + latter when they share the same argument pointer. + + Example 1 (reference counting): + + void Unref() { + ANNOTATE_HAPPENS_BEFORE(&refcount_); + if (AtomicDecrementByOne(&refcount_) == 0) { + ANNOTATE_HAPPENS_AFTER(&refcount_); + delete this; + } + } + + Example 2 (message queue): + + void MyQueue::Put(Type *e) { + MutexLock lock(&mu_); + ANNOTATE_HAPPENS_BEFORE(e); + PutElementIntoMyQueue(e); + } + + Type *MyQueue::Get() { + MutexLock lock(&mu_); + Type *e = GetElementFromMyQueue(); + ANNOTATE_HAPPENS_AFTER(e); + return e; + } + + Note: when possible, please use the existing reference counting and message + queue implementations instead of inventing new ones. */ + + /* Report that wait on the condition variable at address "cv" has succeeded + and the lock at address "lock" is held. */ + #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \ + AnnotateCondVarWait(__FILE__, __LINE__, cv, lock) + + /* Report that wait on the condition variable at "cv" has succeeded. Variant + w/o lock. */ + #define ANNOTATE_CONDVAR_WAIT(cv) \ + AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL) + + /* Report that we are about to signal on the condition variable at address + "cv". 
*/ + #define ANNOTATE_CONDVAR_SIGNAL(cv) \ + AnnotateCondVarSignal(__FILE__, __LINE__, cv) + + /* Report that we are about to signal_all on the condition variable at "cv". */ + #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \ + AnnotateCondVarSignalAll(__FILE__, __LINE__, cv) + + /* Annotations for user-defined synchronization mechanisms. */ + #define ANNOTATE_HAPPENS_BEFORE(obj) ANNOTATE_CONDVAR_SIGNAL(obj) + #define ANNOTATE_HAPPENS_AFTER(obj) ANNOTATE_CONDVAR_WAIT(obj) + + /* Report that the bytes in the range [pointer, pointer+size) are about + to be published safely. The race checker will create a happens-before + arc from the call ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to + subsequent accesses to this memory. + Note: this annotation may not work properly if the race detector uses + sampling, i.e. does not observe all memory accesses. + */ + #define ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \ + AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size) + + /* DEPRECATED. Don't use it. */ + #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size) \ + AnnotateUnpublishMemoryRange(__FILE__, __LINE__, pointer, size) + + /* DEPRECATED. Don't use it. */ + #define ANNOTATE_SWAP_MEMORY_RANGE(pointer, size) \ + do { \ + ANNOTATE_UNPUBLISH_MEMORY_RANGE(pointer, size); \ + ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size); \ + } while (0) + + /* Instruct the tool to create a happens-before arc between mu->Unlock() and + mu->Lock(). This annotation may slow down the race detector and hide real + races. Normally it is used only when it would be difficult to annotate each + of the mutex's critical sections individually using the annotations above. + This annotation makes sense only for hybrid race detectors. For pure + happens-before detectors this is a no-op. For more details see + http://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . 
*/ + #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \ + AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu) + + /* Deprecated. Use ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX. */ + #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) \ + AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu) + + /* ------------------------------------------------------------- + Annotations useful when defining memory allocators, or when memory that + was protected in one way starts to be protected in another. */ + + /* Report that a new memory at "address" of size "size" has been allocated. + This might be used when the memory has been retrieved from a free list and + is about to be reused, or when a the locking discipline for a variable + changes. */ + #define ANNOTATE_NEW_MEMORY(address, size) \ + AnnotateNewMemory(__FILE__, __LINE__, address, size) + + /* ------------------------------------------------------------- + Annotations useful when defining FIFO queues that transfer data between + threads. */ + + /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at + address "pcq" has been created. The ANNOTATE_PCQ_* annotations + should be used only for FIFO queues. For non-FIFO queues use + ANNOTATE_HAPPENS_BEFORE (for put) and ANNOTATE_HAPPENS_AFTER (for get). */ + #define ANNOTATE_PCQ_CREATE(pcq) \ + AnnotatePCQCreate(__FILE__, __LINE__, pcq) + + /* Report that the queue at address "pcq" is about to be destroyed. */ + #define ANNOTATE_PCQ_DESTROY(pcq) \ + AnnotatePCQDestroy(__FILE__, __LINE__, pcq) + + /* Report that we are about to put an element into a FIFO queue at address + "pcq". */ + #define ANNOTATE_PCQ_PUT(pcq) \ + AnnotatePCQPut(__FILE__, __LINE__, pcq) + + /* Report that we've just got an element from a FIFO queue at address "pcq". */ + #define ANNOTATE_PCQ_GET(pcq) \ + AnnotatePCQGet(__FILE__, __LINE__, pcq) + + /* ------------------------------------------------------------- + Annotations that suppress errors. 
It is usually better to express the + program's synchronization using the other annotations, but these can + be used when all else fails. */ + + /* Report that we may have a benign race at "pointer", with size + "sizeof(*(pointer))". "pointer" must be a non-void* pointer. Insert at the + point where "pointer" has been allocated, preferably close to the point + where the race happens. See also ANNOTATE_BENIGN_RACE_STATIC. */ + #define ANNOTATE_BENIGN_RACE(pointer, description) \ + AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \ + sizeof(*(pointer)), description) + + /* Same as ANNOTATE_BENIGN_RACE(address, description), but applies to + the memory range [address, address+size). */ + #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \ + AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description) + + /* Request the analysis tool to ignore all reads in the current thread + until ANNOTATE_IGNORE_READS_END is called. + Useful to ignore intentional racey reads, while still checking + other reads and all writes. + See also ANNOTATE_UNPROTECTED_READ. */ + #define ANNOTATE_IGNORE_READS_BEGIN() \ + AnnotateIgnoreReadsBegin(__FILE__, __LINE__) + + /* Stop ignoring reads. */ + #define ANNOTATE_IGNORE_READS_END() \ + AnnotateIgnoreReadsEnd(__FILE__, __LINE__) + + /* Similar to ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */ + #define ANNOTATE_IGNORE_WRITES_BEGIN() \ + AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + + /* Stop ignoring writes. */ + #define ANNOTATE_IGNORE_WRITES_END() \ + AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + + /* Start ignoring all memory accesses (reads and writes). */ + #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \ + do {\ + ANNOTATE_IGNORE_READS_BEGIN();\ + ANNOTATE_IGNORE_WRITES_BEGIN();\ + }while(0)\ + + /* Stop ignoring all memory accesses. 
*/ + #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \ + do {\ + ANNOTATE_IGNORE_WRITES_END();\ + ANNOTATE_IGNORE_READS_END();\ + }while(0)\ + + /* Start ignoring all synchronization until ANNOTATE_IGNORE_SYNC_END + is called. */ + #define ANNOTATE_IGNORE_SYNC_BEGIN() \ + AnnotateIgnoreSyncBegin(__FILE__, __LINE__) + + /* Stop ignoring all synchronization. */ + #define ANNOTATE_IGNORE_SYNC_END() \ + AnnotateIgnoreSyncEnd(__FILE__, __LINE__) + + /* Enable (enable!=0) or disable (enable==0) race detection for all threads. + This annotation could be useful if you want to skip expensive race analysis + during some period of program execution, e.g. during initialization. */ + #define ANNOTATE_ENABLE_RACE_DETECTION(enable) \ + AnnotateEnableRaceDetection(__FILE__, __LINE__, enable) + + /* ------------------------------------------------------------- + Annotations useful for debugging. */ + + /* Request to trace every access to "address". */ + #define ANNOTATE_TRACE_MEMORY(address) \ + AnnotateTraceMemory(__FILE__, __LINE__, address) + + /* Report the current thread name to a race detector. */ + #define ANNOTATE_THREAD_NAME(name) \ + AnnotateThreadName(__FILE__, __LINE__, name) + + /* ------------------------------------------------------------- + Annotations useful when implementing locks. They are not + normally needed by modules that merely use locks. + The "lock" argument is a pointer to the lock object. */ + + /* Report that a lock has been created at address "lock". */ + #define ANNOTATE_RWLOCK_CREATE(lock) \ + AnnotateRWLockCreate(__FILE__, __LINE__, lock) + + /* Report that a linker initialized lock has been created at address "lock". + */ +#ifdef THREAD_SANITIZER + #define ANNOTATE_RWLOCK_CREATE_STATIC(lock) \ + AnnotateRWLockCreateStatic(__FILE__, __LINE__, lock) +#else + #define ANNOTATE_RWLOCK_CREATE_STATIC(lock) ANNOTATE_RWLOCK_CREATE(lock) +#endif + + /* Report that the lock at address "lock" is about to be destroyed. 
*/ + #define ANNOTATE_RWLOCK_DESTROY(lock) \ + AnnotateRWLockDestroy(__FILE__, __LINE__, lock) + + /* Report that the lock at address "lock" has been acquired. + is_w=1 for writer lock, is_w=0 for reader lock. */ + #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \ + AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w) + + /* Report that the lock at address "lock" is about to be released. */ + #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) \ + AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w) + + /* ------------------------------------------------------------- + Annotations useful when implementing barriers. They are not + normally needed by modules that merely use barriers. + The "barrier" argument is a pointer to the barrier object. */ + + /* Report that the "barrier" has been initialized with initial "count". + If 'reinitialization_allowed' is true, initialization is allowed to happen + multiple times w/o calling barrier_destroy() */ + #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \ + AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \ + reinitialization_allowed) + + /* Report that we are about to enter barrier_wait("barrier"). */ + #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \ + AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier) + + /* Report that we just exited barrier_wait("barrier"). */ + #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) \ + AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier) + + /* Report that the "barrier" has been destroyed. */ + #define ANNOTATE_BARRIER_DESTROY(barrier) \ + AnnotateBarrierDestroy(__FILE__, __LINE__, barrier) + + /* ------------------------------------------------------------- + Annotations useful for testing race detectors. */ + + /* Report that we expect a race on the variable at "address". + Use only in unit tests for a race detector. */ + #define ANNOTATE_EXPECT_RACE(address, description) \ + AnnotateExpectRace(__FILE__, __LINE__, address, description) + + /* A no-op. 
Insert where you like to test the interceptors. */ + #define ANNOTATE_NO_OP(arg) \ + AnnotateNoOp(__FILE__, __LINE__, arg) + + /* Force the race detector to flush its state. The actual effect depends on + * the implementation of the detector. */ + #define ANNOTATE_FLUSH_STATE() \ + AnnotateFlushState(__FILE__, __LINE__) + + +#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */ + + #define ANNOTATE_RWLOCK_CREATE(lock) /* empty */ + #define ANNOTATE_RWLOCK_CREATE_STATIC(lock) /* empty */ + #define ANNOTATE_RWLOCK_DESTROY(lock) /* empty */ + #define ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */ + #define ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */ + #define ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* */ + #define ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */ + #define ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */ + #define ANNOTATE_BARRIER_DESTROY(barrier) /* empty */ + #define ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */ + #define ANNOTATE_CONDVAR_WAIT(cv) /* empty */ + #define ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */ + #define ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */ + #define ANNOTATE_HAPPENS_BEFORE(obj) /* empty */ + #define ANNOTATE_HAPPENS_AFTER(obj) /* empty */ + #define ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */ + #define ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size) /* empty */ + #define ANNOTATE_SWAP_MEMORY_RANGE(address, size) /* empty */ + #define ANNOTATE_PCQ_CREATE(pcq) /* empty */ + #define ANNOTATE_PCQ_DESTROY(pcq) /* empty */ + #define ANNOTATE_PCQ_PUT(pcq) /* empty */ + #define ANNOTATE_PCQ_GET(pcq) /* empty */ + #define ANNOTATE_NEW_MEMORY(address, size) /* empty */ + #define ANNOTATE_EXPECT_RACE(address, description) /* empty */ + #define ANNOTATE_BENIGN_RACE(address, description) /* empty */ + #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */ + #define ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */ + #define ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty */ + #define 
ANNOTATE_TRACE_MEMORY(arg) /* empty */ + #define ANNOTATE_THREAD_NAME(name) /* empty */ + #define ANNOTATE_IGNORE_READS_BEGIN() /* empty */ + #define ANNOTATE_IGNORE_READS_END() /* empty */ + #define ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */ + #define ANNOTATE_IGNORE_WRITES_END() /* empty */ + #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */ + #define ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */ + #define ANNOTATE_IGNORE_SYNC_BEGIN() /* empty */ + #define ANNOTATE_IGNORE_SYNC_END() /* empty */ + #define ANNOTATE_ENABLE_RACE_DETECTION(enable) /* empty */ + #define ANNOTATE_NO_OP(arg) /* empty */ + #define ANNOTATE_FLUSH_STATE() /* empty */ + +#endif /* DYNAMIC_ANNOTATIONS_ENABLED */ + +/* Macro definitions for GCC attributes that allow static thread safety + analysis to recognize and use some of the dynamic annotations as + escape hatches. + TODO(user): remove the check for __SUPPORT_DYN_ANNOTATION__ once the + default crosstool/GCC supports these GCC attributes. */ + +#define ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY ; +#define ANNOTALYSIS_IGNORE_READS_BEGIN +#define ANNOTALYSIS_IGNORE_READS_END +#define ANNOTALYSIS_IGNORE_WRITES_BEGIN +#define ANNOTALYSIS_IGNORE_WRITES_END +#define ANNOTALYSIS_UNPROTECTED_READ + +#if defined(__GNUC__) && (!defined(SWIG)) && (!defined(__clang__)) + +#if DYNAMIC_ANNOTATIONS_ENABLED == 0 +#define ANNOTALYSIS_ONLY 1 +#undef ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_STATIC_INLINE static inline +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY { (void)file; (void)line; } +#endif + +/* Only emit attributes when annotalysis is enabled. 
*/ +#if defined(__SUPPORT_TS_ANNOTATION__) && defined(__SUPPORT_DYN_ANNOTATION__) +#undef ANNOTALYSIS_IGNORE_READS_BEGIN +#define ANNOTALYSIS_IGNORE_READS_BEGIN __attribute__ ((ignore_reads_begin)) +#undef ANNOTALYSIS_IGNORE_READS_END +#define ANNOTALYSIS_IGNORE_READS_END __attribute__ ((ignore_reads_end)) +#undef ANNOTALYSIS_IGNORE_WRITES_BEGIN +#define ANNOTALYSIS_IGNORE_WRITES_BEGIN __attribute__ ((ignore_writes_begin)) +#undef ANNOTALYSIS_IGNORE_WRITES_END +#define ANNOTALYSIS_IGNORE_WRITES_END __attribute__ ((ignore_writes_end)) +#undef ANNOTALYSIS_UNPROTECTED_READ +#define ANNOTALYSIS_UNPROTECTED_READ __attribute__ ((unprotected_read)) +#endif + +#endif // defined(__GNUC__) && (!defined(SWIG)) && (!defined(__clang__)) + + +/* TODO(user) -- Replace __CLANG_SUPPORT_DYN_ANNOTATION__ with the + appropriate feature ID. */ +#if defined(__clang__) && (!defined(SWIG)) \ + && defined(__CLANG_SUPPORT_DYN_ANNOTATION__) + +/* TODO(user) -- The exclusive lock here ignores writes as well, but + allows INGORE_READS_AND_WRITES to work properly. */ +#undef ANNOTALYSIS_IGNORE_READS_BEGIN +#define ANNOTALYSIS_IGNORE_READS_BEGIN \ + __attribute__((exclusive_lock_function("*"))) +#undef ANNOTALYSIS_IGNORE_READS_END +#define ANNOTALYSIS_IGNORE_READS_END \ + __attribute__((unlock_function("*"))) + +#if DYNAMIC_ANNOTATIONS_ENABLED == 0 +/* Turn on certain macros for static analysis, even if dynamic annotations are + not enabled. */ +#define CLANG_ANNOTALYSIS_ONLY 1 + +#undef ANNOTALYSIS_STATIC_INLINE +#define ANNOTALYSIS_STATIC_INLINE static inline +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +#define ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY { (void)file; (void)line; } + +#endif /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */ +#endif /* defined(__clang__) && (!defined(SWIG)) */ + + +/* Use the macros above rather than using these functions directly. 
*/ +#ifdef __cplusplus +extern "C" { +#endif +void AnnotateRWLockCreate(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockCreateStatic(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockDestroy(const char *file, int line, + const volatile void *lock); +void AnnotateRWLockAcquired(const char *file, int line, + const volatile void *lock, long is_w); +void AnnotateRWLockReleased(const char *file, int line, + const volatile void *lock, long is_w); +void AnnotateBarrierInit(const char *file, int line, + const volatile void *barrier, long count, + long reinitialization_allowed); +void AnnotateBarrierWaitBefore(const char *file, int line, + const volatile void *barrier); +void AnnotateBarrierWaitAfter(const char *file, int line, + const volatile void *barrier); +void AnnotateBarrierDestroy(const char *file, int line, + const volatile void *barrier); +void AnnotateCondVarWait(const char *file, int line, + const volatile void *cv, + const volatile void *lock); +void AnnotateCondVarSignal(const char *file, int line, + const volatile void *cv); +void AnnotateCondVarSignalAll(const char *file, int line, + const volatile void *cv); +void AnnotatePublishMemoryRange(const char *file, int line, + const volatile void *address, + long size); +void AnnotateUnpublishMemoryRange(const char *file, int line, + const volatile void *address, + long size); +void AnnotatePCQCreate(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQDestroy(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQPut(const char *file, int line, + const volatile void *pcq); +void AnnotatePCQGet(const char *file, int line, + const volatile void *pcq); +void AnnotateNewMemory(const char *file, int line, + const volatile void *address, + long size); +void AnnotateExpectRace(const char *file, int line, + const volatile void *address, + const char *description); +void AnnotateBenignRace(const char *file, int line, + const 
volatile void *address, + const char *description); +void AnnotateBenignRaceSized(const char *file, int line, + const volatile void *address, + long size, + const char *description); +void AnnotateMutexIsUsedAsCondVar(const char *file, int line, + const volatile void *mu); +void AnnotateTraceMemory(const char *file, int line, + const volatile void *arg); +void AnnotateThreadName(const char *file, int line, + const char *name); +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreReadsBegin(const char *file, int line) + ANNOTALYSIS_IGNORE_READS_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreReadsEnd(const char *file, int line) + ANNOTALYSIS_IGNORE_READS_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreWritesBegin(const char *file, int line) + ANNOTALYSIS_IGNORE_WRITES_BEGIN ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +ANNOTALYSIS_STATIC_INLINE +void AnnotateIgnoreWritesEnd(const char *file, int line) + ANNOTALYSIS_IGNORE_WRITES_END ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY +void AnnotateIgnoreSyncBegin(const char *file, int line); +void AnnotateIgnoreSyncEnd(const char *file, int line); +void AnnotateEnableRaceDetection(const char *file, int line, int enable); +void AnnotateNoOp(const char *file, int line, + const volatile void *arg); +void AnnotateFlushState(const char *file, int line); + +/* Return non-zero value if running under valgrind. + + If "valgrind.h" is included into dynamic_annotations.c, + the regular valgrind mechanism will be used. + See http://valgrind.org/docs/manual/manual-core-adv.html about + RUNNING_ON_VALGRIND and other valgrind "client requests". 
+ The file "valgrind.h" may be obtained by doing + svn co svn://svn.valgrind.org/valgrind/trunk/include + + If for some reason you can't use "valgrind.h" or want to fake valgrind, + there are two ways to make this function return non-zero: + - Use environment variable: export RUNNING_ON_VALGRIND=1 + - Make your tool intercept the function RunningOnValgrind() and + change its return value. + */ +int RunningOnValgrind(void); + +/* ValgrindSlowdown returns: + * 1.0, if (RunningOnValgrind() == 0) + * 50.0, if (RunningOnValgrind() != 0 && getenv("VALGRIND_SLOWDOWN") == NULL) + * atof(getenv("VALGRIND_SLOWDOWN")) otherwise + This function can be used to scale timeout values: + EXAMPLE: + for (;;) { + DoExpensiveBackgroundTask(); + SleepForSeconds(5 * ValgrindSlowdown()); + } + */ +double ValgrindSlowdown(void); + + +/* AddressSanitizer annotations from LLVM asan_interface.h */ + +// Marks memory region [addr, addr+size) as unaddressable. +// This memory must be previously allocated by the user program. Accessing +// addresses in this region from instrumented code is forbidden until +// this region is unpoisoned. This function is not guaranteed to poison +// the whole region - it may poison only subregion of [addr, addr+size) due +// to ASan alignment restrictions. +// Method is NOT thread-safe in the sense that no two threads can +// (un)poison memory in the same memory region simultaneously. +void __asan_poison_memory_region(void const volatile *addr, size_t size); +// Marks memory region [addr, addr+size) as addressable. +// This memory must be previously allocated by the user program. Accessing +// addresses in this region is allowed until this region is poisoned again. +// This function may unpoison a superregion of [addr, addr+size) due to +// ASan alignment restrictions. +// Method is NOT thread-safe in the sense that no two threads can +// (un)poison memory in the same memory region simultaneously. 
+void __asan_unpoison_memory_region(void const volatile *addr, size_t size); + +// User code should use macros instead of functions. +#if defined(__SANITIZE_ADDRESS__) || defined(ADDRESS_SANITIZER) +#define ASAN_POISON_MEMORY_REGION(addr, size) \ + __asan_poison_memory_region((addr), (size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + __asan_unpoison_memory_region((addr), (size)) +#else +#define ASAN_POISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#endif + +// Sets the callback to be called right before death on error. +// Passing 0 will unset the callback. +void __asan_set_death_callback(void (*callback)(void)); + +#if defined(__SANITIZE_ADDRESS__) || defined(ADDRESS_SANITIZER) +#define ASAN_SET_DEATH_CALLBACK(cb) \ + __asan_set_death_callback((cb)) +#else +#define ASAN_SET_DEATH_CALLBACK(cb) \ + ((void)(cb)) +#endif + +#ifdef __cplusplus +} +#endif + +#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus) + + /* ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racey reads. + + Instead of doing + ANNOTATE_IGNORE_READS_BEGIN(); + ... = x; + ANNOTATE_IGNORE_READS_END(); + one can use + ... = ANNOTATE_UNPROTECTED_READ(x); */ + template + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) + ANNOTALYSIS_UNPROTECTED_READ { + ANNOTATE_IGNORE_READS_BEGIN(); + T res = x; + ANNOTATE_IGNORE_READS_END(); + return res; + } + /* Apply ANNOTATE_BENIGN_RACE_SIZED to a static variable. 
*/ + #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description) \ + namespace { \ + class static_var ## _annotator { \ + public: \ + static_var ## _annotator() { \ + ANNOTATE_BENIGN_RACE_SIZED(&static_var, \ + sizeof(static_var), \ + # static_var ": " description); \ + } \ + }; \ + static static_var ## _annotator the ## static_var ## _annotator;\ + } +#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */ + + #define ANNOTATE_UNPROTECTED_READ(x) (x) + #define ANNOTATE_BENIGN_RACE_STATIC(static_var, description) /* empty */ + +#endif /* DYNAMIC_ANNOTATIONS_ENABLED */ + +/* Annotalysis, a GCC based static analyzer, is able to understand and use + some of the dynamic annotations defined in this file. However, dynamic + annotations are usually disabled in the opt mode (to avoid additional + runtime overheads) while Annotalysis only works in the opt mode. + In order for Annotalysis to use these dynamic annotations when they + are disabled, we re-define these annotations here. Note that unlike the + original macro definitions above, these macros are expanded to calls to + static inline functions so that the compiler will be able to remove the + calls after the analysis. */ + +#ifdef ANNOTALYSIS_ONLY + + #undef ANNOTALYSIS_ONLY + + /* Undefine and re-define the macros that the static analyzer understands. 
*/ + #undef ANNOTATE_IGNORE_READS_BEGIN + #define ANNOTATE_IGNORE_READS_BEGIN() \ + AnnotateIgnoreReadsBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_END + #define ANNOTATE_IGNORE_READS_END() \ + AnnotateIgnoreReadsEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_WRITES_BEGIN + #define ANNOTATE_IGNORE_WRITES_BEGIN() \ + AnnotateIgnoreWritesBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_WRITES_END + #define ANNOTATE_IGNORE_WRITES_END() \ + AnnotateIgnoreWritesEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN + #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \ + do { \ + ANNOTATE_IGNORE_READS_BEGIN(); \ + ANNOTATE_IGNORE_WRITES_BEGIN(); \ + }while(0) \ + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_END + #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \ + do { \ + ANNOTATE_IGNORE_WRITES_END(); \ + ANNOTATE_IGNORE_READS_END(); \ + }while(0) \ + + #if defined(__cplusplus) + #undef ANNOTATE_UNPROTECTED_READ + template + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) + ANNOTALYSIS_UNPROTECTED_READ { + ANNOTATE_IGNORE_READS_BEGIN(); + T res = x; + ANNOTATE_IGNORE_READS_END(); + return res; + } + #endif /* __cplusplus */ + +#endif /* ANNOTALYSIS_ONLY */ + + +#ifdef CLANG_ANNOTALYSIS_ONLY + +#undef CLANG_ANNOTALYSIS_ONLY + +/* Turn on macros that the static analyzer understands. These should be on + * even if dynamic annotations are off. 
*/ + + #undef ANNOTATE_IGNORE_READS_BEGIN + #define ANNOTATE_IGNORE_READS_BEGIN() \ + AnnotateIgnoreReadsBegin(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_END + #define ANNOTATE_IGNORE_READS_END() \ + AnnotateIgnoreReadsEnd(__FILE__, __LINE__) + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN + #define ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \ + do { \ + ANNOTATE_IGNORE_READS_BEGIN(); \ + ANNOTATE_IGNORE_WRITES_BEGIN(); \ + } while (0) \ + + #undef ANNOTATE_IGNORE_READS_AND_WRITES_END + #define ANNOTATE_IGNORE_READS_AND_WRITES_END() \ + do { \ + ANNOTATE_IGNORE_WRITES_END(); \ + ANNOTATE_IGNORE_READS_END(); \ + } while (0) \ + + #if defined(__cplusplus) + #undef ANNOTATE_UNPROTECTED_READ + template + inline T ANNOTATE_UNPROTECTED_READ(const volatile T &x) { + ANNOTATE_IGNORE_READS_BEGIN(); + T res = x; + ANNOTATE_IGNORE_READS_END(); + return res; + } + #endif + +#endif /* CLANG_ANNOTALYSIS_ONLY */ + + +/* Undefine the macros intended only in this file. */ +#undef ANNOTALYSIS_STATIC_INLINE +#undef ANNOTALYSIS_SEMICOLON_OR_EMPTY_BODY + +#endif /* __DYNAMIC_ANNOTATIONS_H__ */ diff --git a/src/kudu/gutil/endian.h b/src/kudu/gutil/endian.h new file mode 100644 index 000000000000..5ed8f38f292f --- /dev/null +++ b/src/kudu/gutil/endian.h @@ -0,0 +1,358 @@ +// Copyright 2005 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// +// +// Utility functions that depend on bytesex. We define htonll and ntohll, +// as well as "Google" versions of all the standards: ghtonl, ghtons, and +// so on. These functions do exactly the same as their standard variants, +// but don't require including the dangerous netinet/in.h. +// +// Buffer routines will copy to and from buffers without causing +// a bus error when the architecture requires differnt byte alignments +#ifndef UTIL_ENDIAN_ENDIAN_H_ +#define UTIL_ENDIAN_ENDIAN_H_ + +#include + +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/port.h" + +inline uint64 gbswap_64(uint64 host_int) { +#if defined(__GNUC__) && defined(__x86_64__) && !defined(__APPLE__) + // Adapted from /usr/include/byteswap.h. Not available on Mac. + if (__builtin_constant_p(host_int)) { + return __bswap_constant_64(host_int); + } else { + register uint64 result; + __asm__("bswap %0" : "=r" (result) : "0" (host_int)); + return result; + } +#elif defined(bswap_64) + return bswap_64(host_int); +#else + return static_cast(bswap_32(static_cast(host_int >> 32))) | + (static_cast(bswap_32(static_cast(host_int))) << 32); +#endif // bswap_64 +} + +#ifdef IS_LITTLE_ENDIAN + +// Definitions for ntohl etc. that don't require us to include +// netinet/in.h. We wrap bswap_32 and bswap_16 in functions rather +// than just #defining them because in debug mode, gcc doesn't +// correctly handle the (rather involved) definitions of bswap_32. 
+// gcc guarantees that inline functions are as fast as macros, so +// this isn't a performance hit. +inline uint16 ghtons(uint16 x) { return bswap_16(x); } +inline uint32 ghtonl(uint32 x) { return bswap_32(x); } +inline uint64 ghtonll(uint64 x) { return gbswap_64(x); } + +#elif defined IS_BIG_ENDIAN + +// These definitions are simpler on big-endian machines +// These are functions instead of macros to avoid self-assignment warnings +// on calls such as "i = ghtnol(i);". This also provides type checking. +inline uint16 ghtons(uint16 x) { return x; } +inline uint32 ghtonl(uint32 x) { return x; } +inline uint64 ghtonll(uint64 x) { return x; } + +#else +#error "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT +#endif // bytesex + + +// ntoh* and hton* are the same thing for any size and bytesex, +// since the function is an involution, i.e., its own inverse. +#define gntohl(x) ghtonl(x) +#define gntohs(x) ghtons(x) +#define gntohll(x) ghtonll(x) +#if !defined(__APPLE__) +// This one is safe to take as it's an extension +#define htonll(x) ghtonll(x) +#define ntohll(x) htonll(x) +#endif + +// Utilities to convert numbers between the current hosts's native byte +// order and little-endian byte order +// +// Load/Store methods are alignment safe +class LittleEndian { + public: + // Conversion functions. 
+#ifdef IS_LITTLE_ENDIAN + + static uint16 FromHost16(uint16 x) { return x; } + static uint16 ToHost16(uint16 x) { return x; } + + static uint32 FromHost32(uint32 x) { return x; } + static uint32 ToHost32(uint32 x) { return x; } + + static uint64 FromHost64(uint64 x) { return x; } + static uint64 ToHost64(uint64 x) { return x; } + + static bool IsLittleEndian() { return true; } + +#elif defined IS_BIG_ENDIAN + + static uint16 FromHost16(uint16 x) { return bswap_16(x); } + static uint16 ToHost16(uint16 x) { return bswap_16(x); } + + static uint32 FromHost32(uint32 x) { return bswap_32(x); } + static uint32 ToHost32(uint32 x) { return bswap_32(x); } + + static uint64 FromHost64(uint64 x) { return gbswap_64(x); } + static uint64 ToHost64(uint64 x) { return gbswap_64(x); } + + static bool IsLittleEndian() { return false; } + +#endif /* ENDIAN */ + + // Functions to do unaligned loads and stores in little-endian order. + static uint16 Load16(const void *p) { + return ToHost16(UNALIGNED_LOAD16(p)); + } + + static void Store16(void *p, uint16 v) { + UNALIGNED_STORE16(p, FromHost16(v)); + } + + static uint32 Load32(const void *p) { + return ToHost32(UNALIGNED_LOAD32(p)); + } + + static void Store32(void *p, uint32 v) { + UNALIGNED_STORE32(p, FromHost32(v)); + } + + static uint64 Load64(const void *p) { + return ToHost64(UNALIGNED_LOAD64(p)); + } + + // Build a uint64 from 1-8 bytes. + // 8 * len least significant bits are loaded from the memory with + // LittleEndian order. The 64 - 8 * len most significant bits are + // set all to 0. + // In latex-friendly words, this function returns: + // $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned. + // + // This function is equivalent with: + // uint64 val = 0; + // memcpy(&val, p, len); + // return ToHost64(val); + // TODO(user): write a small benchmark and benchmark the speed + // of a memcpy based approach. + // + // For speed reasons this function does not work for len == 0. 
+ // The caller needs to guarantee that 1 <= len <= 8. + static uint64 Load64VariableLength(const void * const p, int len) { + assert(len >= 1 && len <= 8); + const char * const buf = static_cast(p); + uint64 val = 0; + --len; + do { + val = (val << 8) | buf[len]; + // (--len >= 0) is about 10 % faster than (len--) in some benchmarks. + } while (--len >= 0); + // No ToHost64(...) needed. The bytes are accessed in little-endian manner + // on every architecture. + return val; + } + + static void Store64(void *p, uint64 v) { + UNALIGNED_STORE64(p, FromHost64(v)); + } + + static uint128 Load128(const void *p) { + return uint128( + ToHost64(UNALIGNED_LOAD64(reinterpret_cast(p) + 1)), + ToHost64(UNALIGNED_LOAD64(p))); + } + + static void Store128(void *p, const uint128 v) { + UNALIGNED_STORE64(p, FromHost64(Uint128Low64(v))); + UNALIGNED_STORE64(reinterpret_cast(p) + 1, + FromHost64(Uint128High64(v))); + } + + // Build a uint128 from 1-16 bytes. + // 8 * len least significant bits are loaded from the memory with + // LittleEndian order. The 128 - 8 * len most significant bits are + // set all to 0. + static uint128 Load128VariableLength(const void *p, int len) { + if (len <= 8) { + return uint128(Load64VariableLength(p, len)); + } else { + return uint128( + Load64VariableLength(static_cast(p) + 8, len - 8), + Load64(p)); + } + } + + // Load & Store in machine's word size. 
+ static uword_t LoadUnsignedWord(const void *p) { + if (sizeof(uword_t) == 8) + return Load64(p); + else + return Load32(p); + } + + static void StoreUnsignedWord(void *p, uword_t v) { + if (sizeof(v) == 8) + Store64(p, v); + else + Store32(p, v); + } +}; + +// Utilities to convert numbers between the current hosts's native byte +// order and big-endian byte order (same as network byte order) +// +// Load/Store methods are alignment safe +class BigEndian { + public: +#ifdef IS_LITTLE_ENDIAN + + static uint16 FromHost16(uint16 x) { return bswap_16(x); } + static uint16 ToHost16(uint16 x) { return bswap_16(x); } + + static uint32 FromHost32(uint32 x) { return bswap_32(x); } + static uint32 ToHost32(uint32 x) { return bswap_32(x); } + + static uint64 FromHost64(uint64 x) { return gbswap_64(x); } + static uint64 ToHost64(uint64 x) { return gbswap_64(x); } + + static bool IsLittleEndian() { return true; } + +#elif defined IS_BIG_ENDIAN + + static uint16 FromHost16(uint16 x) { return x; } + static uint16 ToHost16(uint16 x) { return x; } + + static uint32 FromHost32(uint32 x) { return x; } + static uint32 ToHost32(uint32 x) { return x; } + + static uint64 FromHost64(uint64 x) { return x; } + static uint64 ToHost64(uint64 x) { return x; } + + static bool IsLittleEndian() { return false; } + +#endif /* ENDIAN */ + // Functions to do unaligned loads and stores in little-endian order. + static uint16 Load16(const void *p) { + return ToHost16(UNALIGNED_LOAD16(p)); + } + + static void Store16(void *p, uint16 v) { + UNALIGNED_STORE16(p, FromHost16(v)); + } + + static uint32 Load32(const void *p) { + return ToHost32(UNALIGNED_LOAD32(p)); + } + + static void Store32(void *p, uint32 v) { + UNALIGNED_STORE32(p, FromHost32(v)); + } + + static uint64 Load64(const void *p) { + return ToHost64(UNALIGNED_LOAD64(p)); + } + + // Build a uint64 from 1-8 bytes. + // 8 * len least significant bits are loaded from the memory with + // BigEndian order. 
The 64 - 8 * len most significant bits are + // set all to 0. + // In latex-friendly words, this function returns: + // $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned. + // + // This function is equivalent with: + // uint64 val = 0; + // memcpy(&val, p, len); + // return ToHost64(val); + // TODO(user): write a small benchmark and benchmark the speed + // of a memcpy based approach. + // + // For speed reasons this function does not work for len == 0. + // The caller needs to guarantee that 1 <= len <= 8. + static uint64 Load64VariableLength(const void * const p, int len) { + assert(len >= 1 && len <= 8); + uint64 val = Load64(p); + uint64 mask = 0; + --len; + do { + mask = (mask << 8) | 0xff; + // (--len >= 0) is about 10 % faster than (len--) in some benchmarks. + } while (--len >= 0); + return val & mask; + } + + static void Store64(void *p, uint64 v) { + UNALIGNED_STORE64(p, FromHost64(v)); + } + + static uint128 Load128(const void *p) { + return uint128( + ToHost64(UNALIGNED_LOAD64(p)), + ToHost64(UNALIGNED_LOAD64(reinterpret_cast(p) + 1))); + } + + static void Store128(void *p, const uint128 v) { + UNALIGNED_STORE64(p, FromHost64(Uint128High64(v))); + UNALIGNED_STORE64(reinterpret_cast(p) + 1, + FromHost64(Uint128Low64(v))); + } + + // Build a uint128 from 1-16 bytes. + // 8 * len least significant bits are loaded from the memory with + // BigEndian order. The 128 - 8 * len most significant bits are + // set all to 0. + static uint128 Load128VariableLength(const void *p, int len) { + if (len <= 8) { + return uint128(Load64VariableLength(static_cast(p)+8, + len)); + } else { + return uint128( + Load64VariableLength(p, len-8), + Load64(static_cast(p)+8)); + } + } + + // Load & Store in machine's word size. 
+ static uword_t LoadUnsignedWord(const void *p) { + if (sizeof(uword_t) == 8) + return Load64(p); + else + return Load32(p); + } + + static void StoreUnsignedWord(void *p, uword_t v) { + if (sizeof(uword_t) == 8) + Store64(p, v); + else + Store32(p, v); + } +}; // BigEndian + +// Network byte order is big-endian +typedef BigEndian NetworkByteOrder; + +#endif // UTIL_ENDIAN_ENDIAN_H_ diff --git a/src/kudu/gutil/fixedarray.h b/src/kudu/gutil/fixedarray.h new file mode 100644 index 000000000000..3e9e07261cbb --- /dev/null +++ b/src/kudu/gutil/fixedarray.h @@ -0,0 +1,181 @@ +// Copyright 2005 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// + +#ifndef UTIL_GTL_FIXEDARRAY_H__ +#define UTIL_GTL_FIXEDARRAY_H__ + +#include + +#include + +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/manual_constructor.h" + +// A FixedArray represents a non-resizable array of T where the +// length of the array does not need to be a compile time constant. +// +// FixedArray allocates small arrays inline, and large arrays on +// the heap. It is a good replacement for non-standard and deprecated +// uses of alloca() and variable length arrays (a GCC extension). 
+// +// FixedArray keeps performance fast for small arrays, because it +// avoids heap operations. It also helps reduce the chances of +// accidentally overflowing your stack if large input is passed to +// your function. +// +// Also, FixedArray is useful for writing portable code. Not all +// compilers support arrays of dynamic size. + +// Most users should not specify an inline_elements argument and let +// FixedArray<> automatically determine the number of elements +// to store inline based on sizeof(T). +// +// If inline_elements is specified, the FixedArray<> implementation +// will store arrays of length <= inline_elements inline. +// +// Finally note that unlike vector FixedArray will not zero-initialize +// simple types like int, double, bool, etc. +// +// Non-POD types will be default-initialized just like regular vectors or +// arrays. + +template +class FixedArray { + public: + // For playing nicely with stl: + typedef T value_type; + typedef T* iterator; + typedef T const* const_iterator; + typedef T& reference; + typedef T const& const_reference; + typedef T* pointer; + typedef std::ptrdiff_t difference_type; + typedef size_t size_type; + + // REQUIRES: n >= 0 + // Creates an array object that can store "n" elements. + // + // FixedArray will not zero-initialiaze POD (simple) types like int, + // double, bool, etc. + // Non-POD types will be default-initialized just like regular vectors or + // arrays. + explicit FixedArray(size_type n); + + // Releases any resources. + ~FixedArray(); + + // Returns the length of the array. + inline size_type size() const { return size_; } + + // Returns the memory size of the array in bytes. + inline size_t memsize() const { return size_ * sizeof(T); } + + // Returns a pointer to the underlying element array. + inline const T* get() const { return reinterpret_cast(array_); } + inline T* get() { return reinterpret_cast(array_); } + + // REQUIRES: 0 <= i < size() + // Returns a reference to the "i"th element. 
+ inline T& operator[](size_type i) { + DCHECK_GE(i, 0); + DCHECK_LT(i, size_); + return array_[i].element; + } + + // REQUIRES: 0 <= i < size() + // Returns a reference to the "i"th element. + inline const T& operator[](size_type i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, size_); + return array_[i].element; + } + + inline iterator begin() { return get(); } + inline iterator end() { return get() + size_; } + + inline const_iterator begin() const { return get(); } + inline const_iterator end() const { return get() + size_; } + + private: + // Container to hold elements of type T. This is necessary to handle + // the case where T is a a (C-style) array. The size of InnerContainer + // and T must be the same, otherwise callers' assumptions about use + // of this code will be broken. + struct InnerContainer { + T element; + }; + COMPILE_ASSERT(sizeof(InnerContainer) == sizeof(T), + fixedarray_inner_container_size_mismatch); + + // How many elements should we store inline? + // a. If not specified, use a default of 256 bytes (256 bytes + // seems small enough to not cause stack overflow or unnecessary + // stack pollution, while still allowing stack allocation for + // reasonably long character arrays. + // b. Never use 0 length arrays (not ISO C++) + static const size_type S1 = ((inline_elements < 0) + ? (256/sizeof(T)) : inline_elements); + static const size_type S2 = (S1 <= 0) ? 1 : S1; + static const size_type kInlineElements = S2; + + size_type const size_; + InnerContainer* const array_; + + // Allocate some space, not an array of elements of type T, so that we can + // skip calling the T constructors and destructors for space we never use. + base::ManualConstructor + inline_space_[kInlineElements]; + + DISALLOW_EVIL_CONSTRUCTORS(FixedArray); +}; + +// Implementation details follow + +template +inline FixedArray::FixedArray(typename FixedArray::size_type n) + : size_(n), + array_((n <= kInlineElements + ? 
reinterpret_cast(inline_space_) + : new InnerContainer[n])) { + DCHECK_GE(n, 0); + + // Construct only the elements actually used. + if (array_ == reinterpret_cast(inline_space_)) { + for (int i = 0; i != size_; ++i) { + inline_space_[i].Init(); + } + } +} + +template +inline FixedArray::~FixedArray() { + if (array_ != reinterpret_cast(inline_space_)) { + delete[] array_; + } else { + for (int i = 0; i != size_; ++i) { + inline_space_[i].Destroy(); + } + } +} + +#endif // UTIL_GTL_FIXEDARRAY_H__ diff --git a/src/kudu/gutil/gscoped_ptr.h b/src/kudu/gutil/gscoped_ptr.h new file mode 100644 index 000000000000..cda7349b228d --- /dev/null +++ b/src/kudu/gutil/gscoped_ptr.h @@ -0,0 +1,830 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE.txt file. + +// Scopers help you manage ownership of a pointer, helping you easily manage the +// a pointer within a scope, and automatically destroying the pointer at the +// end of a scope. There are two main classes you will use, which correspond +// to the operators new/delete and new[]/delete[]. +// +// Example usage (gscoped_ptr): +// { +// gscoped_ptr foo(new Foo("wee")); +// } // foo goes out of scope, releasing the pointer with it. +// +// { +// gscoped_ptr foo; // No pointer managed. +// foo.reset(new Foo("wee")); // Now a pointer is managed. +// foo.reset(new Foo("wee2")); // Foo("wee") was destroyed. +// foo.reset(new Foo("wee3")); // Foo("wee2") was destroyed. +// foo->Method(); // Foo::Method() called. +// foo.get()->Method(); // Foo::Method() called. +// SomeFunc(foo.release()); // SomeFunc takes ownership, foo no longer +// // manages a pointer. +// foo.reset(new Foo("wee4")); // foo manages a pointer again. +// foo.reset(); // Foo("wee4") destroyed, foo no longer +// // manages a pointer. +// } // foo wasn't managing a pointer, so nothing was destroyed. 
+// +// Example usage (gscoped_array): +// { +// gscoped_array foo(new Foo[100]); +// foo.get()->Method(); // Foo::Method on the 0th element. +// foo[10].Method(); // Foo::Method on the 10th element. +// } +// +// These scopers also implement part of the functionality of C++11 unique_ptr +// in that they are "movable but not copyable." You can use the scopers in +// the parameter and return types of functions to signify ownership transfer +// in to and out of a function. When calling a function that has a scoper +// as the argument type, it must be called with the result of an analogous +// scoper's Pass() function or another function that generates a temporary; +// passing by copy will NOT work. Here is an example using gscoped_ptr: +// +// void TakesOwnership(gscoped_ptr arg) { +// // Do something with arg +// } +// gscoped_ptr CreateFoo() { +// // No need for calling Pass() because we are constructing a temporary +// // for the return value. +// return gscoped_ptr(new Foo("new")); +// } +// gscoped_ptr PassThru(gscoped_ptr arg) { +// return arg.Pass(); +// } +// +// { +// gscoped_ptr ptr(new Foo("yay")); // ptr manages Foo("yay"). +// TakesOwnership(ptr.Pass()); // ptr no longer owns Foo("yay"). +// gscoped_ptr ptr2 = CreateFoo(); // ptr2 owns the return Foo. +// gscoped_ptr ptr3 = // ptr3 now owns what was in ptr2. +// PassThru(ptr2.Pass()); // ptr2 is correspondingly NULL. +// } +// +// Notice that if you do not call Pass() when returning from PassThru(), or +// when invoking TakesOwnership(), the code will not compile because scopers +// are not copyable; they only implement move semantics which require calling +// the Pass() function to signify a destructive transfer of state. CreateFoo() +// is different though because we are constructing a temporary on the return +// line and thus can avoid needing to call Pass(). +// +// Pass() properly handles upcast in assignment, i.e. 
you can assign +// gscoped_ptr to gscoped_ptr: +// +// gscoped_ptr foo(new Foo()); +// gscoped_ptr parent = foo.Pass(); +// +// PassAs<>() should be used to upcast return value in return statement: +// +// gscoped_ptr CreateFoo() { +// gscoped_ptr result(new FooChild()); +// return result.PassAs(); +// } +// +// Note that PassAs<>() is implemented only for gscoped_ptr, but not for +// gscoped_array. This is because casting array pointers may not be safe. +// +// ------------------------------------------------------------------------- +// Cloudera notes: this should be used in preference to boost::scoped_ptr since +// it offers a ::release() method like unique_ptr. We unfortunately cannot +// just use unique_ptr because it has an inconsistent implementation in +// some of the older compilers we have to support. +// ------------------------------------------------------------------------- + +#ifndef KUDU_GUTIL_GSCOPED_PTR_H_ +#define KUDU_GUTIL_GSCOPED_PTR_H_ + +// This is an implementation designed to match the anticipated future TR2 +// implementation of the scoped_ptr class, and its closely-related brethren, +// scoped_array, scoped_ptr_malloc. + +#include +#include +#include + +#include // For std::swap(). + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/template_util.h" +#include "kudu/gutil/type_traits.h" +#include "kudu/gutil/move.h" + +namespace kudu { + +namespace subtle { +class RefCountedBase; +class RefCountedThreadSafeBase; +} // namespace subtle + +// Function object which deletes its parameter, which must be a pointer. +// If C is an array type, invokes 'delete[]' on the parameter; otherwise, +// invokes 'delete'. The default deleter for gscoped_ptr. +template +struct DefaultDeleter { + DefaultDeleter() {} + template DefaultDeleter(const DefaultDeleter& other) { + // IMPLEMENTATION NOTE: C++11 20.7.1.1.2p2 only provides this constructor + // if U* is implicitly convertible to T* and U is not an array type. 
+ // + // Correct implementation should use SFINAE to disable this + // constructor. However, since there are no other 1-argument constructors, + // using a COMPILE_ASSERT() based on is_convertible<> and requiring + // complete types is simpler and will cause compile failures for equivalent + // misuses. + // + // Note, the is_convertible check also ensures that U is not an + // array. T is guaranteed to be a non-array, so any U* where U is an array + // cannot convert to T*. + enum { T_must_be_complete = sizeof(T) }; + enum { U_must_be_complete = sizeof(U) }; + COMPILE_ASSERT((base::is_convertible::value), + U_ptr_must_implicitly_convert_to_T_ptr); + } + inline void operator()(T* ptr) const { + enum { type_must_be_complete = sizeof(T) }; + delete ptr; + } +}; + +// Specialization of DefaultDeleter for array types. +template +struct DefaultDeleter { + inline void operator()(T* ptr) const { + enum { type_must_be_complete = sizeof(T) }; + delete[] ptr; + } + + private: + // Disable this operator for any U != T because it is undefined to execute + // an array delete when the static type of the array mismatches the dynamic + // type. + // + // References: + // C++98 [expr.delete]p3 + // http://cplusplus.github.com/LWG/lwg-defects.html#938 + template void operator()(U* array) const; +}; + +template +struct DefaultDeleter { + // Never allow someone to declare something like gscoped_ptr. + COMPILE_ASSERT(sizeof(T) == -1, do_not_use_array_with_size_as_type); +}; + +// Function object which invokes 'free' on its parameter, which must be +// a pointer. 
Can be used to store malloc-allocated pointers in gscoped_ptr: +// +// gscoped_ptr foo_ptr( +// static_cast(malloc(sizeof(int)))); +struct FreeDeleter { + inline void operator()(void* ptr) const { + free(ptr); + } +}; + +namespace internal { + +template struct IsNotRefCounted { + enum { + value = !base::is_convertible::value && + !base::is_convertible:: + value + }; +}; + +// Minimal implementation of the core logic of gscoped_ptr, suitable for +// reuse in both gscoped_ptr and its specializations. +template +class gscoped_ptr_impl { + public: + explicit gscoped_ptr_impl(T* p) : data_(p) { } + + // Initializer for deleters that have data parameters. + gscoped_ptr_impl(T* p, const D& d) : data_(p, d) {} + + // Templated constructor that destructively takes the value from another + // gscoped_ptr_impl. + template + gscoped_ptr_impl(gscoped_ptr_impl* other) + : data_(other->release(), other->get_deleter()) { + // We do not support move-only deleters. We could modify our move + // emulation to have base::subtle::move() and base::subtle::forward() + // functions that are imperfect emulations of their C++11 equivalents, + // but until there's a requirement, just assume deleters are copyable. + } + + template + void TakeState(gscoped_ptr_impl* other) { + // See comment in templated constructor above regarding lack of support + // for move-only deleters. + reset(other->release()); + get_deleter() = other->get_deleter(); + } + + ~gscoped_ptr_impl() { + if (data_.ptr != NULL) { + // Not using get_deleter() saves one function call in non-optimized + // builds. + static_cast(data_)(data_.ptr); + } + } + + void reset(T* p) { + // This is a self-reset, which is no longer allowed: http://crbug.com/162971 + if (p != NULL && p == data_.ptr) + abort(); + + // Note that running data_.ptr = p can lead to undefined behavior if + // get_deleter()(get()) deletes this. In order to pevent this, reset() + // should update the stored pointer before deleting its old value. 
+ // + // However, changing reset() to use that behavior may cause current code to + // break in unexpected ways. If the destruction of the owned object + // dereferences the gscoped_ptr when it is destroyed by a call to reset(), + // then it will incorrectly dispatch calls to |p| rather than the original + // value of |data_.ptr|. + // + // During the transition period, set the stored pointer to NULL while + // deleting the object. Eventually, this safety check will be removed to + // prevent the scenario initially described from occuring and + // http://crbug.com/176091 can be closed. + T* old = data_.ptr; + data_.ptr = NULL; + if (old != NULL) + static_cast(data_)(old); + data_.ptr = p; + } + + T* get() const { return data_.ptr; } + + D& get_deleter() { return data_; } + const D& get_deleter() const { return data_; } + + void swap(gscoped_ptr_impl& p2) { + // Standard swap idiom: 'using std::swap' ensures that std::swap is + // present in the overload set, but we call swap unqualified so that + // any more-specific overloads can be used, if available. + using std::swap; + swap(static_cast(data_), static_cast(p2.data_)); + swap(data_.ptr, p2.data_.ptr); + } + + T* release() { + T* old_ptr = data_.ptr; + data_.ptr = NULL; + return old_ptr; + } + + private: + // Needed to allow type-converting constructor. + template friend class gscoped_ptr_impl; + + // Use the empty base class optimization to allow us to have a D + // member, while avoiding any space overhead for it when D is an + // empty class. See e.g. http://www.cantrip.org/emptyopt.html for a good + // discussion of this technique. 
+ struct Data : public D { + explicit Data(T* ptr_in) : ptr(ptr_in) {} + Data(T* ptr_in, D other) : D(std::move(other)), ptr(ptr_in) {} + T* ptr; + }; + + Data data_; + + DISALLOW_COPY_AND_ASSIGN(gscoped_ptr_impl); +}; + +} // namespace internal + +} // namespace kudu + +// A gscoped_ptr is like a T*, except that the destructor of gscoped_ptr +// automatically deletes the pointer it holds (if any). +// That is, gscoped_ptr owns the T object that it points to. +// Like a T*, a gscoped_ptr may hold either NULL or a pointer to a T object. +// Also like T*, gscoped_ptr is thread-compatible, and once you +// dereference it, you get the thread safety guarantees of T. +// +// The size of gscoped_ptr is small. On most compilers, when using the +// DefaultDeleter, sizeof(gscoped_ptr) == sizeof(T*). Custom deleters will +// increase the size proportional to whatever state they need to have. See +// comments inside gscoped_ptr_impl<> for details. +// +// Current implementation targets having a strict subset of C++11's +// unique_ptr<> features. Known deficiencies include not supporting move-only +// deleteres, function pointers as deleters, and deleters with reference +// types. +template > +class gscoped_ptr { + MOVE_ONLY_TYPE_FOR_CPP_03(gscoped_ptr, RValue) + + COMPILE_ASSERT(kudu::internal::IsNotRefCounted::value, + T_is_refcounted_type_and_needs_scoped_refptr); + + public: + // The element and deleter types. + typedef T element_type; + typedef D deleter_type; + + // Constructor. Defaults to initializing with NULL. + gscoped_ptr() : impl_(NULL) { } + + // Constructor. Takes ownership of p. + explicit gscoped_ptr(element_type* p) : impl_(p) { } + + // Constructor. Allows initialization of a stateful deleter. + gscoped_ptr(element_type* p, const D& d) : impl_(p, d) { } + + // Constructor. Allows construction from a gscoped_ptr rvalue for a + // convertible type and deleter. 
+ // + // IMPLEMENTATION NOTE: C++11 unique_ptr<> keeps this constructor distinct + // from the normal move constructor. By C++11 20.7.1.2.1.21, this constructor + // has different post-conditions if D is a reference type. Since this + // implementation does not support deleters with reference type, + // we do not need a separate move constructor allowing us to avoid one + // use of SFINAE. You only need to care about this if you modify the + // implementation of gscoped_ptr. + template + gscoped_ptr(gscoped_ptr other) : impl_(&other.impl_) { + COMPILE_ASSERT(!base::is_array::value, U_cannot_be_an_array); + } + + // Constructor. Move constructor for C++03 move emulation of this type. + gscoped_ptr(RValue rvalue) : impl_(&rvalue.object->impl_) { } + + // operator=. Allows assignment from a gscoped_ptr rvalue for a convertible + // type and deleter. + // + // IMPLEMENTATION NOTE: C++11 unique_ptr<> keeps this operator= distinct from + // the normal move assignment operator. By C++11 20.7.1.2.3.4, this templated + // form has different requirements on for move-only Deleters. Since this + // implementation does not support move-only Deleters, we do not need a + // separate move assignment operator allowing us to avoid one use of SFINAE. + // You only need to care about this if you modify the implementation of + // gscoped_ptr. + template + gscoped_ptr& operator=(gscoped_ptr rhs) { + COMPILE_ASSERT(!base::is_array::value, U_cannot_be_an_array); + impl_.TakeState(&rhs.impl_); + return *this; + } + + // Reset. Deletes the currently owned object, if any. + // Then takes ownership of a new object, if given. + void reset(element_type* p = NULL) { impl_.reset(p); } + + // Accessors to get the owned object. + // operator* and operator-> will assert() if there is no current object. 
+ element_type& operator*() const { + assert(impl_.get() != NULL); + return *impl_.get(); + } + element_type* operator->() const { + assert(impl_.get() != NULL); + return impl_.get(); + } + element_type* get() const { return impl_.get(); } + + // Access to the deleter. + deleter_type& get_deleter() { return impl_.get_deleter(); } + const deleter_type& get_deleter() const { return impl_.get_deleter(); } + + // Allow gscoped_ptr to be used in boolean expressions, but not + // implicitly convertible to a real bool (which is dangerous). + private: + typedef kudu::internal::gscoped_ptr_impl + gscoped_ptr::*Testable; + + public: + operator Testable() const { return impl_.get() ? &gscoped_ptr::impl_ : NULL; } + + // Comparison operators. + // These return whether two gscoped_ptr refer to the same object, not just to + // two different but equal objects. + bool operator==(const element_type* p) const { return impl_.get() == p; } + bool operator!=(const element_type* p) const { return impl_.get() != p; } + + // Swap two scoped pointers. + void swap(gscoped_ptr& p2) { + impl_.swap(p2.impl_); + } + + // Release a pointer. + // The return value is the current pointer held by this object. + // If this object holds a NULL pointer, the return value is NULL. + // After this operation, this object will hold a NULL pointer, + // and will not own the object any more. + element_type* release() WARN_UNUSED_RESULT { + return impl_.release(); + } + + // C++98 doesn't support functions templates with default parameters which + // makes it hard to write a PassAs() that understands converting the deleter + // while preserving simple calling semantics. + // + // Until there is a use case for PassAs() with custom deleters, just ignore + // the custom deleter. + template + gscoped_ptr PassAs() { + return gscoped_ptr(Pass()); + } + + private: + // Needed to reach into |impl_| in the constructor. 
+ template friend class gscoped_ptr; + kudu::internal::gscoped_ptr_impl impl_; + + // Forbid comparison of gscoped_ptr types. If U != T, it totally + // doesn't make sense, and if U == T, it still doesn't make sense + // because you should never have the same object owned by two different + // gscoped_ptrs. + template bool operator==(gscoped_ptr const& p2) const; + template bool operator!=(gscoped_ptr const& p2) const; +}; + +template +class gscoped_ptr { + MOVE_ONLY_TYPE_FOR_CPP_03(gscoped_ptr, RValue) + + public: + // The element and deleter types. + typedef T element_type; + typedef D deleter_type; + + // Constructor. Defaults to initializing with NULL. + gscoped_ptr() : impl_(NULL) { } + + // Constructor. Stores the given array. Note that the argument's type + // must exactly match T*. In particular: + // - it cannot be a pointer to a type derived from T, because it is + // inherently unsafe in the general case to access an array through a + // pointer whose dynamic type does not match its static type (eg., if + // T and the derived types had different sizes access would be + // incorrectly calculated). Deletion is also always undefined + // (C++98 [expr.delete]p3). If you're doing this, fix your code. + // - it cannot be NULL, because NULL is an integral expression, not a + // pointer to T. Use the no-argument version instead of explicitly + // passing NULL. + // - it cannot be const-qualified differently from T per unique_ptr spec + // (http://cplusplus.github.com/LWG/lwg-active.html#2118). Users wanting + // to work around this may use implicit_cast(). + // However, because of the first bullet in this comment, users MUST + // NOT use implicit_cast() to upcast the static type of the array. + explicit gscoped_ptr(element_type* array) : impl_(array) { } + + // Constructor. Move constructor for C++03 move emulation of this type. + gscoped_ptr(RValue rvalue) : impl_(&rvalue.object->impl_) { } + + // operator=. Move operator= for C++03 move emulation of this type. 
+ gscoped_ptr& operator=(RValue rhs) { + impl_.TakeState(&rhs.object->impl_); + return *this; + } + + // Reset. Deletes the currently owned array, if any. + // Then takes ownership of a new object, if given. + void reset(element_type* array = NULL) { impl_.reset(array); } + + // Accessors to get the owned array. + element_type& operator[](size_t i) const { + assert(impl_.get() != NULL); + return impl_.get()[i]; + } + element_type* get() const { return impl_.get(); } + + // Access to the deleter. + deleter_type& get_deleter() { return impl_.get_deleter(); } + const deleter_type& get_deleter() const { return impl_.get_deleter(); } + + // Allow gscoped_ptr to be used in boolean expressions, but not + // implicitly convertible to a real bool (which is dangerous). + private: + typedef kudu::internal::gscoped_ptr_impl + gscoped_ptr::*Testable; + + public: + operator Testable() const { return impl_.get() ? &gscoped_ptr::impl_ : NULL; } + + // Comparison operators. + // These return whether two gscoped_ptr refer to the same object, not just to + // two different but equal objects. + bool operator==(element_type* array) const { return impl_.get() == array; } + bool operator!=(element_type* array) const { return impl_.get() != array; } + + // Swap two scoped pointers. + void swap(gscoped_ptr& p2) { + impl_.swap(p2.impl_); + } + + // Release a pointer. + // The return value is the current pointer held by this object. + // If this object holds a NULL pointer, the return value is NULL. + // After this operation, this object will hold a NULL pointer, + // and will not own the object any more. + element_type* release() WARN_UNUSED_RESULT { + return impl_.release(); + } + + private: + // Force element_type to be a complete type. + enum { type_must_be_complete = sizeof(element_type) }; + + // Actually hold the data. 
+ kudu::internal::gscoped_ptr_impl impl_; + + // Disable initialization from any type other than element_type*, by + // providing a constructor that matches such an initialization, but is + // private and has no definition. This is disabled because it is not safe to + // call delete[] on an array whose static type does not match its dynamic + // type. + template explicit gscoped_ptr(U* array); + explicit gscoped_ptr(int disallow_construction_from_null); + + // Disable reset() from any type other than element_type*, for the same + // reasons as the constructor above. + template void reset(U* array); + void reset(int disallow_reset_from_null); + + // Forbid comparison of gscoped_ptr types. If U != T, it totally + // doesn't make sense, and if U == T, it still doesn't make sense + // because you should never have the same object owned by two different + // gscoped_ptrs. + template bool operator==(gscoped_ptr const& p2) const; + template bool operator!=(gscoped_ptr const& p2) const; +}; + +// Free functions +template +void swap(gscoped_ptr& p1, gscoped_ptr& p2) { + p1.swap(p2); +} + +template +bool operator==(T* p1, const gscoped_ptr& p2) { + return p1 == p2.get(); +} + +template +bool operator!=(T* p1, const gscoped_ptr& p2) { + return p1 != p2.get(); +} + +// DEPRECATED: Use gscoped_ptr instead. +// +// gscoped_array is like gscoped_ptr, except that the caller must allocate +// with new [] and the destructor deletes objects with delete []. +// +// As with gscoped_ptr, a gscoped_array either points to an object +// or is NULL. A gscoped_array owns the object that it points to. +// gscoped_array is thread-compatible, and once you index into it, +// the returned objects have only the thread safety guarantees of T. +// +// Size: sizeof(gscoped_array) == sizeof(C*) +template +class gscoped_array { + MOVE_ONLY_TYPE_FOR_CPP_03(gscoped_array, RValue) + + public: + + // The element type + typedef C element_type; + + // Constructor. Defaults to initializing with NULL. 
+ // There is no way to create an uninitialized gscoped_array. + // The input parameter must be allocated with new []. + explicit gscoped_array(C* p = NULL) : array_(p) { } + + // Constructor. Move constructor for C++03 move emulation of this type. + gscoped_array(RValue rvalue) + : array_(rvalue.object->release()) { + } + + // Destructor. If there is a C object, delete it. + // We don't need to test ptr_ == NULL because C++ does that for us. + ~gscoped_array() { + enum { type_must_be_complete = sizeof(C) }; + delete[] array_; + } + + // operator=. Move operator= for C++03 move emulation of this type. + gscoped_array& operator=(RValue rhs) { + reset(rhs.object->release()); + return *this; + } + + // Reset. Deletes the current owned object, if any. + // Then takes ownership of a new object, if given. + // this->reset(this->get()) works. + void reset(C* p = NULL) { + if (p != array_) { + enum { type_must_be_complete = sizeof(C) }; + delete[] array_; + array_ = p; + } + } + + // Get one element of the current object. + // Will assert() if there is no current object, or index i is negative. + C& operator[](ptrdiff_t i) const { + assert(i >= 0); + assert(array_ != NULL); + return array_[i]; + } + + // Get a pointer to the zeroth element of the current object. + // If there is no current object, return NULL. + C* get() const { + return array_; + } + + // Allow gscoped_array to be used in boolean expressions, but not + // implicitly convertible to a real bool (which is dangerous). + typedef C* gscoped_array::*Testable; + operator Testable() const { return array_ ? &gscoped_array::array_ : NULL; } + + // Comparison operators. + // These return whether two gscoped_array refer to the same object, not just to + // two different but equal objects. + bool operator==(C* p) const { return array_ == p; } + bool operator!=(C* p) const { return array_ != p; } + + // Swap two scoped arrays. 
+ void swap(gscoped_array& p2) { + C* tmp = array_; + array_ = p2.array_; + p2.array_ = tmp; + } + + // Release an array. + // The return value is the current pointer held by this object. + // If this object holds a NULL pointer, the return value is NULL. + // After this operation, this object will hold a NULL pointer, + // and will not own the object any more. + C* release() WARN_UNUSED_RESULT { + C* retVal = array_; + array_ = NULL; + return retVal; + } + + private: + C* array_; + + // Forbid comparison of different gscoped_array types. + template bool operator==(gscoped_array const& p2) const; + template bool operator!=(gscoped_array const& p2) const; +}; + +// Free functions +template +void swap(gscoped_array& p1, gscoped_array& p2) { + p1.swap(p2); +} + +template +bool operator==(C* p1, const gscoped_array& p2) { + return p1 == p2.get(); +} + +template +bool operator!=(C* p1, const gscoped_array& p2) { + return p1 != p2.get(); +} + +// DEPRECATED: Use gscoped_ptr instead. +// +// gscoped_ptr_malloc<> is similar to gscoped_ptr<>, but it accepts a +// second template argument, the functor used to free the object. + +template +class gscoped_ptr_malloc { + MOVE_ONLY_TYPE_FOR_CPP_03(gscoped_ptr_malloc, RValue) + + public: + + // The element type + typedef C element_type; + + // Constructor. Defaults to initializing with NULL. + // There is no way to create an uninitialized gscoped_ptr. + // The input parameter must be allocated with an allocator that matches the + // Free functor. For the default Free functor, this is malloc, calloc, or + // realloc. + explicit gscoped_ptr_malloc(C* p = NULL): ptr_(p) {} + + // Constructor. Move constructor for C++03 move emulation of this type. + gscoped_ptr_malloc(RValue rvalue) + : ptr_(rvalue.object->release()) { + } + + // Destructor. If there is a C object, call the Free functor. + ~gscoped_ptr_malloc() { + reset(); + } + + // operator=. Move operator= for C++03 move emulation of this type. 
+ gscoped_ptr_malloc& operator=(RValue rhs) { + reset(rhs.object->release()); + return *this; + } + + // Reset. Calls the Free functor on the current owned object, if any. + // Then takes ownership of a new object, if given. + // this->reset(this->get()) works. + void reset(C* p = NULL) { + if (ptr_ != p) { + if (ptr_ != NULL) { + FreeProc free_proc; + free_proc(ptr_); + } + ptr_ = p; + } + } + + // Get the current object. + // operator* and operator-> will cause an assert() failure if there is + // no current object. + C& operator*() const { + assert(ptr_ != NULL); + return *ptr_; + } + + C* operator->() const { + assert(ptr_ != NULL); + return ptr_; + } + + C* get() const { + return ptr_; + } + + // Allow gscoped_ptr_malloc to be used in boolean expressions, but not + // implicitly convertible to a real bool (which is dangerous). + typedef C* gscoped_ptr_malloc::*Testable; + operator Testable() const { return ptr_ ? &gscoped_ptr_malloc::ptr_ : NULL; } + + // Comparison operators. + // These return whether a gscoped_ptr_malloc and a plain pointer refer + // to the same object, not just to two different but equal objects. + // For compatibility with the boost-derived implementation, these + // take non-const arguments. + bool operator==(C* p) const { + return ptr_ == p; + } + + bool operator!=(C* p) const { + return ptr_ != p; + } + + // Swap two scoped pointers. + void swap(gscoped_ptr_malloc & b) { + C* tmp = b.ptr_; + b.ptr_ = ptr_; + ptr_ = tmp; + } + + // Release a pointer. + // The return value is the current pointer held by this object. + // If this object holds a NULL pointer, the return value is NULL. + // After this operation, this object will hold a NULL pointer, + // and will not own the object any more. 
+ C* release() WARN_UNUSED_RESULT { + C* tmp = ptr_; + ptr_ = NULL; + return tmp; + } + + private: + C* ptr_; + + // no reason to use these: each gscoped_ptr_malloc should have its own object + template + bool operator==(gscoped_ptr_malloc const& p) const; + template + bool operator!=(gscoped_ptr_malloc const& p) const; +}; + +template inline +void swap(gscoped_ptr_malloc& a, gscoped_ptr_malloc& b) { + a.swap(b); +} + +template inline +bool operator==(C* p, const gscoped_ptr_malloc& b) { + return p == b.get(); +} + +template inline +bool operator!=(C* p, const gscoped_ptr_malloc& b) { + return p != b.get(); +} + +// A function to convert T* into gscoped_ptr +// Doing e.g. make_gscoped_ptr(new FooBarBaz(arg)) is a shorter notation +// for gscoped_ptr >(new FooBarBaz(arg)) +template +gscoped_ptr make_gscoped_ptr(T* ptr) { + return gscoped_ptr(ptr); +} + +#endif // KUDU_GUTIL_GSCOPED_PTR_H_ diff --git a/src/kudu/gutil/hash/builtin_type_hash.h b/src/kudu/gutil/hash/builtin_type_hash.h new file mode 100644 index 000000000000..c979eb2ed041 --- /dev/null +++ b/src/kudu/gutil/hash/builtin_type_hash.h @@ -0,0 +1,95 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Hash functions for C++ builtin types. These are all of the fundamental +// integral and floating point types in the language as well as pointers. This +// library provides a minimal set of interfaces for hashing these values. 
+ +#ifndef UTIL_HASH_BUILTIN_TYPE_HASH_H_ +#define UTIL_HASH_BUILTIN_TYPE_HASH_H_ + +#include +#include + +#include "kudu/gutil/casts.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/hash/jenkins_lookup2.h" + +inline uint32 Hash32NumWithSeed(uint32 num, uint32 c) { + uint32 b = 0x9e3779b9UL; // the golden ratio; an arbitrary value + mix(num, b, c); + return c; +} + +inline uint64 Hash64NumWithSeed(uint64 num, uint64 c) { + uint64 b = GG_ULONGLONG(0xe08c1d668b756f82); // more of the golden ratio + mix(num, b, c); + return c; +} + +// This function hashes pointer sized items and returns a 32b hash, +// convenienty hiding the fact that pointers may be 32b or 64b, +// depending on the architecture. +inline uint32 Hash32PointerWithSeed(const void* p, uint32 seed) { + uintptr_t pvalue = reinterpret_cast(p); + uint32 h = seed; + // Hash the pointer 32b at a time. + for (size_t i = 0; i < sizeof(pvalue); i += 4) { + h = Hash32NumWithSeed(static_cast(pvalue >> (i*8)), h); + } + return h; +} + +// ---------------------------------------------------------------------- +// Hash64FloatWithSeed +// Hash64DoubleWithSeed +// Functions for computing a hash value of floating-point numbers. +// On systems where float and double comply with IEEE 754, these hashes +// guarantee that if a == b, Hash64FloatWithSeed(a, c) == +// Hash64FloatWithSeed(b, c). Note that NaN does not compare equal to +// itself, so two NaN inputs will not necessarily hash to the same value. +// +// It is often a mistake to compare floating-point values for equality, +// since floating-point computations do not produce exact values, due to +// rounding. If equality comparison doesn't make sense in your situation, +// hashing almost certainly doesn't make sense either. +// +// Not guaranteed to return the same value in different builds, or to +// avoid any reserved values. 
+// ---------------------------------------------------------------------- +inline uint64 Hash64FloatWithSeed(float num, uint64 seed) { + // +0 and -0 are the only floating point numbers which compare equal but + // have distinct bitwise representations in IEEE 754. To work around this, + // we force 0 to be +0. + if (num == 0) { + num = 0; + } + COMPILE_ASSERT(sizeof(float) == sizeof(uint32), float_has_wrong_size); + + const uint64 kMul = 0xc6a4a7935bd1e995ULL; + + uint64 a = (bit_cast(num) + seed) * kMul; + a ^= (a >> 47); + a *= kMul; + a ^= (a >> 47); + a *= kMul; + return a; +} + +inline uint64 Hash64DoubleWithSeed(double num, uint64 seed) { + if (num == 0) { + num = 0; + } + COMPILE_ASSERT(sizeof(double) == sizeof(uint64), double_has_wrong_size); + + const uint64 kMul = 0xc6a4a7935bd1e995ULL; + + uint64 a = (bit_cast(num) + seed) * kMul; + a ^= (a >> 47); + a *= kMul; + a ^= (a >> 47); + a *= kMul; + return a; +} + +#endif // UTIL_HASH_BUILTIN_TYPE_HASH_H_ diff --git a/src/kudu/gutil/hash/city.cc b/src/kudu/gutil/hash/city.cc new file mode 100644 index 000000000000..cc00ff70bf69 --- /dev/null +++ b/src/kudu/gutil/hash/city.cc @@ -0,0 +1,317 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Authors: gpike@google.com (Geoff Pike), jyrki@google.com (Jyrki Alakuijala) +// +// This file provides CityHash64() and related functions. +// +// The externally visible functions follow the naming conventions of +// hash.h, where the size of the output is part of the name. For +// example, CityHash64 returns a 64-bit hash. The internal helpers do +// not have the return type in their name, but instead have names like +// HashLenXX or HashLenXXtoYY, where XX and YY are input string lengths. +// +// Most of the constants and tricks here were copied from murmur.cc or +// hash.h, or discovered by trial and error. 
It's probably possible to further +// optimize the code here by writing a program that systematically explores +// more of the space of possible hash functions, or by using SIMD instructions. + +#include "kudu/gutil/hash/city.h" + +#include +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include +using std::make_pair; +using std::pair; + +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/hash/hash128to64.h" +#include "kudu/gutil/endian.h" + +namespace util_hash { + +// Some primes between 2^63 and 2^64 for various uses. +static const uint64 k0 = 0xa5b85c5e198ed849ULL; +static const uint64 k1 = 0x8d58ac26afe12e47ULL; +static const uint64 k2 = 0xc47b6e9e3a970ed3ULL; +static const uint64 k3 = 0xc70f6907e782aa0bULL; + +// Bitwise right rotate. Normally this will compile to a single +// instruction, especially if the shift is a manifest constant. +static uint64 Rotate(uint64 val, int shift) { + DCHECK_GE(shift, 0); + DCHECK_LE(shift, 63); + // Avoid shifting by 64: doing so yields an undefined result. + return shift == 0 ? val : ((val >> shift) | (val << (64 - shift))); +} + +// Equivalent to Rotate(), but requires the second arg to be non-zero. +// On x86-64, and probably others, it's possible for this to compile +// to a single instruction if both args are already in registers. 
+static uint64 RotateByAtLeast1(uint64 val, int shift) { + DCHECK_GE(shift, 1); + DCHECK_LE(shift, 63); + return (val >> shift) | (val << (64 - shift)); +} + +static uint64 ShiftMix(uint64 val) { + return val ^ (val >> 47); +} + +static uint64 HashLen16(uint64 u, uint64 v) { + return Hash128to64(uint128(u, v)); +} + +static uint64 HashLen0to16(const char *s, size_t len) { + DCHECK_GE(len, 0); + DCHECK_LE(len, 16); + if (len > 8) { + uint64 a = LittleEndian::Load64(s); + uint64 b = LittleEndian::Load64(s + len - 8); + return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b; + } + if (len >= 4) { + uint64 a = LittleEndian::Load32(s); + return HashLen16(len + (a << 3), LittleEndian::Load32(s + len - 4)); + } + if (len > 0) { + uint8 a = s[0]; + uint8 b = s[len >> 1]; + uint8 c = s[len - 1]; + uint32 y = static_cast(a) + (static_cast(b) << 8); + uint32 z = len + (static_cast(c) << 2); + return ShiftMix(y * k2 ^ z * k3) * k2; + } + return k2; +} + +// This probably works well for 16-byte strings as well, but it may be overkill +// in that case. +static uint64 HashLen17to32(const char *s, size_t len) { + DCHECK_GE(len, 17); + DCHECK_LE(len, 32); + uint64 a = LittleEndian::Load64(s) * k1; + uint64 b = LittleEndian::Load64(s + 8); + uint64 c = LittleEndian::Load64(s + len - 8) * k2; + uint64 d = LittleEndian::Load64(s + len - 16) * k0; + return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d, + a + Rotate(b ^ k3, 20) - c + len); +} + +// Return a 16-byte hash for 48 bytes. Quick and dirty. +// Callers do best to use "random-looking" values for a and b. +// (For more, see the code review discussion of CL 18799087.) +static pair WeakHashLen32WithSeeds( + uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) { + a += w; + b = Rotate(b + a + z, 51); + uint64 c = a; + a += x; + a += y; + b += Rotate(a, 23); + return make_pair(a + z, b + c); +} + +// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. 
+static pair WeakHashLen32WithSeeds( + const char* s, uint64 a, uint64 b) { + return WeakHashLen32WithSeeds(LittleEndian::Load64(s), + LittleEndian::Load64(s + 8), + LittleEndian::Load64(s + 16), + LittleEndian::Load64(s + 24), + a, + b); +} + +// Return an 8-byte hash for 33 to 64 bytes. +static uint64 HashLen33to64(const char *s, size_t len) { + uint64 z = LittleEndian::Load64(s + 24); + uint64 a = LittleEndian::Load64(s) + + (len + LittleEndian::Load64(s + len - 16)) * k0; + uint64 b = Rotate(a + z, 52); + uint64 c = Rotate(a, 37); + a += LittleEndian::Load64(s + 8); + c += Rotate(a, 7); + a += LittleEndian::Load64(s + 16); + uint64 vf = a + z; + uint64 vs = b + Rotate(a, 31) + c; + a = LittleEndian::Load64(s + 16) + LittleEndian::Load64(s + len - 32); + z += LittleEndian::Load64(s + len - 8); + b = Rotate(a + z, 52); + c = Rotate(a, 37); + a += LittleEndian::Load64(s + len - 24); + c += Rotate(a, 7); + a += LittleEndian::Load64(s + len - 16); + uint64 wf = a + z; + uint64 ws = b + Rotate(a, 31) + c; + uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); + return ShiftMix(r * k0 + vs) * k2; +} + +uint64 CityHash64(const char *s, size_t len) { + if (len <= 32) { + if (len <= 16) { + return HashLen0to16(s, len); + } else { + return HashLen17to32(s, len); + } + } else if (len <= 64) { + return HashLen33to64(s, len); + } + + // For strings over 64 bytes we hash the end first, and then as we + // loop we keep 56 bytes of state: v, w, x, y, and z. + uint64 x = LittleEndian::Load64(s + len - 40); + uint64 y = LittleEndian::Load64(s + len - 16) + + LittleEndian::Load64(s + len - 56); + uint64 z = HashLen16(LittleEndian::Load64(s + len - 48) + len, + LittleEndian::Load64(s + len - 24)); + pair v = WeakHashLen32WithSeeds(s + len - 64, len, z); + pair w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); + x = x * k1 + LittleEndian::Load64(s); + + // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
+ len = (len - 1) & ~static_cast(63); + DCHECK_GT(len, 0); + DCHECK_EQ(len, len / 64 * 64); + do { + x = Rotate(x + y + v.first + LittleEndian::Load64(s + 8), 37) * k1; + y = Rotate(y + v.second + LittleEndian::Load64(s + 48), 42) * k1; + x ^= w.second; + y += v.first + LittleEndian::Load64(s + 40); + z = Rotate(z + w.first, 33) * k1; + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, + y + LittleEndian::Load64(s + 16)); + std::swap(z, x); + s += 64; + len -= 64; + } while (len != 0); + return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, + HashLen16(v.second, w.second) + x); +} + +uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) { + return CityHash64WithSeeds(s, len, k2, seed); +} + +uint64 CityHash64WithSeeds(const char *s, size_t len, + uint64 seed0, uint64 seed1) { + return HashLen16(CityHash64(s, len) - seed0, seed1); +} + +// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in ssize_t. Based on City and Murmur128. +static uint128 CityMurmur(const char *s, size_t len, uint128 seed) { + uint64 a = Uint128Low64(seed); + uint64 b = Uint128High64(seed); + uint64 c = 0; + uint64 d = 0; + ssize_t l = len - 16; + if (l <= 0) { // len <= 16 + c = b * k1 + HashLen0to16(s, len); + d = Rotate(a + (len >= 8 ? 
LittleEndian::Load64(s) : c), 32); + } else { // len > 16 + c = HashLen16(LittleEndian::Load64(s + len - 8) + k1, a); + d = HashLen16(b + len, c + LittleEndian::Load64(s + len - 16)); + a += d; + do { + a ^= ShiftMix(LittleEndian::Load64(s) * k1) * k1; + a *= k1; + b ^= a; + c ^= ShiftMix(LittleEndian::Load64(s + 8) * k1) * k1; + c *= k1; + d ^= c; + s += 16; + l -= 16; + } while (l > 0); + } + a = HashLen16(a, c); + b = HashLen16(d, b); + return uint128(a ^ b, HashLen16(b, a)); +} + +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) { + // TODO(user): As of February 2011, there's a beta of Murmur3 that would + // most likely be useful here. E.g., if (len < 900) return Murmur3(...) + if (len < 128) { + return CityMurmur(s, len, seed); + } + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + pair v, w; + uint64 x = Uint128Low64(seed); + uint64 y = Uint128High64(seed); + uint64 z = len * k1; + v.first = Rotate(y ^ k1, 49) * k1 + LittleEndian::Load64(s); + v.second = Rotate(v.first, 42) * k1 + LittleEndian::Load64(s + 8); + w.first = Rotate(y + z, 35) * k1 + x; + w.second = Rotate(x + LittleEndian::Load64(s + 88), 53) * k1; + + // This is similar to the inner loop of CityHash64(), manually unrolled. 
+ do { + x = Rotate(x + y + v.first + LittleEndian::Load64(s + 16), 37) * k1; + y = Rotate(y + v.second + LittleEndian::Load64(s + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); + std::swap(z, x); + s += 64; + x = Rotate(x + y + v.first + LittleEndian::Load64(s + 16), 37) * k1; + y = Rotate(y + v.second + LittleEndian::Load64(s + 48), 42) * k1; + x ^= w.second; + y ^= v.first; + z = Rotate(z ^ w.first, 33); + v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); + w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); + std::swap(z, x); + s += 64; + len -= 128; + } while (PREDICT_TRUE(len >= 128)); + y += Rotate(w.first, 37) * k0 + z; + x += Rotate(v.first + z, 49) * k0; + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for (size_t tail_done = 0; tail_done < len; ) { + tail_done += 32; + y = Rotate(y - x, 42) * k0 + v.second; + w.first += LittleEndian::Load64(s + len - tail_done + 16); + x = Rotate(x, 49) * k0 + w.first; + w.first += v.first; + v = WeakHashLen32WithSeeds(s + len - tail_done, v.first, v.second); + } + // At this point our 48 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 48-byte-to-8-byte hashes to get a 16-byte final result. 
+ x = HashLen16(x, v.first); + y = HashLen16(y, w.first); + return uint128(HashLen16(x + v.second, w.second) + y, + HashLen16(x + w.second, y + v.second)); +} + +uint128 CityHash128(const char *s, size_t len) { + if (len >= 16) { + return CityHash128WithSeed(s + 16, + len - 16, + uint128(LittleEndian::Load64(s) ^ k3, + LittleEndian::Load64(s + 8))); + } else if (len >= 8) { + return CityHash128WithSeed(nullptr, + 0, + uint128(LittleEndian::Load64(s) ^ (len * k0), + LittleEndian::Load64(s + len - 8) ^ k1)); + } else { + return CityHash128WithSeed(s, len, uint128(k0, k1)); + } +} + +} // namespace util_hash diff --git a/src/kudu/gutil/hash/city.h b/src/kudu/gutil/hash/city.h new file mode 100644 index 000000000000..e99202e521fc --- /dev/null +++ b/src/kudu/gutil/hash/city.h @@ -0,0 +1,53 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Authors: gpike@google.com (Geoff Pike), jyrki@google.com (Jyrki Alakuijala) +// +// This file provides a few functions for hashing strings. On x86-64 +// hardware as of early 2010, CityHash64() is much faster than +// MurmurHash64(), and passes the quality-of-hash tests in +// ./hasheval/hasheval_test.cc, among others, with flying colors. The +// difference in speed can be a factor of two for strings of 50 to 64 +// bytes, and sometimes even more for cache-resident longer strings. +// +// CityHash128() is optimized for relatively long strings and returns +// a 128-bit hash. For strings more than about 2000 bytes it can be +// faster than CityHash64(). +// +// Functions in the CityHash family are not suitable for cryptography. +// +// By the way, for some hash functions, given strings a and b, the hash +// of a+b is easily derived from the hashes of a and b. This property +// doesn't hold for any hash functions in this file. + +#ifndef UTIL_HASH_CITY_H_ +#define UTIL_HASH_CITY_H_ + +#include // for size_t. 
+ +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" + +namespace util_hash { + +// Hash function for a byte array. +// The mapping may change from time to time. +uint64 CityHash64(const char *buf, size_t len); + +// Hash function for a byte array. For convenience, a 64-bit seed is also +// hashed into the result. The mapping may change from time to time. +uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed); + +// Hash function for a byte array. For convenience, two seeds are also +// hashed into the result. The mapping may change from time to time. +uint64 CityHash64WithSeeds(const char *buf, size_t len, + uint64 seed0, uint64 seed1); + +// Hash function for a byte array. The mapping will never change. +uint128 CityHash128(const char *s, size_t len); + +// Hash function for a byte array. For convenience, a 128-bit seed is also +// hashed into the result. The mapping will never change. +uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed); + +} // namespace util_hash + +#endif // UTIL_HASH_CITY_H_ diff --git a/src/kudu/gutil/hash/hash.cc b/src/kudu/gutil/hash/hash.cc new file mode 100644 index 000000000000..92a8ca27191b --- /dev/null +++ b/src/kudu/gutil/hash/hash.cc @@ -0,0 +1,197 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// This is the legacy unified hash library implementation. Its components are +// being split up into smaller, dedicated libraries. What remains here are +// things still being migrated. +// +// To find the implementation of the core Bob Jenkins lookup2 hash, look in +// jenkins.cc. 
+ +#include "kudu/gutil/hash/hash.h" + +#include "kudu/gutil/integral_types.h" +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/hash/jenkins.h" +#include "kudu/gutil/hash/jenkins_lookup2.h" + +// For components that ship code externally (notably the Google Search +// Appliance) we want to change the fingerprint function so that +// attackers cannot mount offline attacks to find collisions with +// google.com internal fingerprints (most importantly, for URL +// fingerprints). +#ifdef GOOGLECLIENT +#error Do not compile this into binaries that we deliver to users! +#error Instead, use +#endif +#ifdef EXTERNAL_FP +static const uint32 kFingerprintSeed0 = 0xabc; +static const uint32 kFingerprintSeed1 = 0xdef; +#else +static const uint32 kFingerprintSeed0 = 0; +static const uint32 kFingerprintSeed1 = 102072; +#endif + +static inline uint32 char2unsigned(char c) { + return static_cast(static_cast(c)); +} + +uint64 FingerprintReferenceImplementation(const char *s, uint32 len) { + uint32 hi = Hash32StringWithSeed(s, len, kFingerprintSeed0); + uint32 lo = Hash32StringWithSeed(s, len, kFingerprintSeed1); + return CombineFingerprintHalves(hi, lo); +} + +// This is a faster version of FingerprintReferenceImplementation(), +// making use of the fact that we're hashing the same string twice. +// The code is tedious to read, but it's just two interleaved copies of +// Hash32StringWithSeed(). 
+uint64 FingerprintInterleavedImplementation(const char *s, uint32 len) { + uint32 a, b, c = kFingerprintSeed0, d, e, f = kFingerprintSeed1; + uint32 keylen; + + a = b = d = e = 0x9e3779b9UL; // the golden ratio; an arbitrary value + + keylen = len; + if (keylen >= 4 * sizeof(a)) { + uint32 word32AtOffset0 = Google1At(s); + do { + a += word32AtOffset0; + d += word32AtOffset0; + b += Google1At(s + sizeof(a)); + e += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a) * 2); + f += Google1At(s + sizeof(a) * 2); + s += 3 * sizeof(a); + word32AtOffset0 = Google1At(s); + mix(a, b, c); + mix(d, e, f); + keylen -= 3 * static_cast(sizeof(a)); + } while (keylen >= 4 * sizeof(a)); + if (keylen >= 3 * sizeof(a)) { + a += word32AtOffset0; + d += word32AtOffset0; + b += Google1At(s + sizeof(a)); + e += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a) * 2); + f += Google1At(s + sizeof(a) * 2); + s += 3 * sizeof(a); + mix(a, b, c); + mix(d, e, f); + keylen -= 3 * static_cast(sizeof(a)); + DCHECK_LT(keylen, sizeof(a)); + c += len; + f += len; + switch ( keylen ) { // deal with rest. Cases fall through + case 3 : + a += char2unsigned(s[2]) << 16; + d += char2unsigned(s[2]) << 16; + case 2 : + a += char2unsigned(s[1]) << 8; + d += char2unsigned(s[1]) << 8; + case 1 : + a += char2unsigned(s[0]); + d += char2unsigned(s[0]); + } + } else { + DCHECK(sizeof(a) <= keylen && keylen < 3 * sizeof(a)); + c += len; + f += len; + switch ( keylen ) { // deal with rest. 
Cases fall through + case 11: + c += char2unsigned(s[10]) << 24; + f += char2unsigned(s[10]) << 24; + case 10: + c += char2unsigned(s[9]) << 16; + f += char2unsigned(s[9]) << 16; + case 9 : + c += char2unsigned(s[8]) << 8; + f += char2unsigned(s[8]) << 8; + case 8 : + b += Google1At(s+4); a += word32AtOffset0; + e += Google1At(s+4); d += word32AtOffset0; + break; + case 7 : + b += char2unsigned(s[6]) << 16; + e += char2unsigned(s[6]) << 16; + case 6 : + b += char2unsigned(s[5]) << 8; + e += char2unsigned(s[5]) << 8; + case 5 : + b += char2unsigned(s[4]); + e += char2unsigned(s[4]); + case 4 : + a += word32AtOffset0; + d += word32AtOffset0; + } + } + } else { + if (keylen >= 3 * sizeof(a)) { + a += Google1At(s); + d += Google1At(s); + b += Google1At(s + sizeof(a)); + e += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a) * 2); + f += Google1At(s + sizeof(a) * 2); + s += 3 * sizeof(a); + mix(a, b, c); + mix(d, e, f); + keylen -= 3 * static_cast(sizeof(a)); + } + c += len; + f += len; + switch ( keylen ) { // deal with rest. 
Cases fall through + case 11: + c += char2unsigned(s[10]) << 24; + f += char2unsigned(s[10]) << 24; + case 10: + c += char2unsigned(s[9]) << 16; + f += char2unsigned(s[9]) << 16; + case 9 : + c += char2unsigned(s[8]) << 8; + f += char2unsigned(s[8]) << 8; + case 8 : + b += Google1At(s+4); a += Google1At(s); + e += Google1At(s+4); d += Google1At(s); + break; + case 7 : + b += char2unsigned(s[6]) << 16; + e += char2unsigned(s[6]) << 16; + case 6 : + b += char2unsigned(s[5]) << 8; + e += char2unsigned(s[5]) << 8; + case 5 : + b += char2unsigned(s[4]); + e += char2unsigned(s[4]); + case 4 : + a += Google1At(s); + d += Google1At(s); + break; + case 3 : + a += char2unsigned(s[2]) << 16; + d += char2unsigned(s[2]) << 16; + case 2 : + a += char2unsigned(s[1]) << 8; + d += char2unsigned(s[1]) << 8; + case 1 : + a += char2unsigned(s[0]); + d += char2unsigned(s[0]); + } + } + mix(a, b, c); + mix(d, e, f); + return CombineFingerprintHalves(c, f); +} + +// Extern template definitions. + +#if defined(__GNUC__) +#include +namespace __gnu_cxx { + +template class hash_set; +template class hash_map; + +} // namespace __gnu_cxx + +#endif diff --git a/src/kudu/gutil/hash/hash.h b/src/kudu/gutil/hash/hash.h new file mode 100644 index 000000000000..3c14f807cd0c --- /dev/null +++ b/src/kudu/gutil/hash/hash.h @@ -0,0 +1,419 @@ +// +// Copyright (C) 1999 and onwards Google, Inc. +// +// +// This file contains routines for hashing and fingerprinting. +// +// A hash function takes an arbitrary input bitstring (string, char*, +// number) and turns it into a hash value (a fixed-size number) such +// that unequal input values have a high likelihood of generating +// unequal hash values. A fingerprint is a hash whose design is +// biased towards avoiding hash collisions, possibly at the expense of +// other characteristics such as execution speed. 
+// +// In general, if you are only using the hash values inside a single +// executable -- you're not writing the values to disk, and you don't +// depend on another instance of your program, running on another +// machine, generating the same hash values as you -- you want to use +// a HASH. Otherwise, you want to use a FINGERPRINT. +// +// RECOMMENDED HASH FOR STRINGS: GoodFastHash +// +// It is a functor, so you can use it like this: +// hash_map > +// hash_set > +// +// RECOMMENDED HASH FOR NUMBERS: hash<> +// +// Note that this is likely the identity hash, so if your +// numbers are "non-random" (especially in the low bits), another +// choice is better. You can use it like this: +// hash_map +// hash_set +// +// RECOMMENDED HASH FOR POINTERS: hash<> +// +// This is also likely the identity hash. +// +// RECOMMENDED HASH FOR STRUCTS: hash +// +// Take a fingerprint of the struct, and use that as the key. +// For instance: const uint64 hash_data[] = { s.foo, bit_cast(s.bar) }; +// uint64 fprint = (reinterpret_cast(hash_data), +// sizeof(hash_data)); +// hash_map[fprint] = whatever; +// +// RECOMMENDED FINGERPRINT: Fingerprint2011 +// +// (In util/hash/fingerprint2011.h) +// In particular, do *not* use Fingerprint in new code; it has +// problems with excess collisions. +// +// OTHER HASHES AND FINGERPRINTS: +// +// +// The wiki page also has good advice for when to use a fingerprint vs +// a hash. +// +// +// Note: if your file declares hash_map or +// hash_set, it will use the default hash function, +// hash. This is not a great choice. Always provide an +// explicit functor, such as GoodFastHash, as a template argument. +// (Either way, you will need to #include this file to get the +// necessary definition.) +// +// Some of the hash functions below are documented to be fixed +// forever; the rest (whether they're documented as so or not) may +// change over time. 
If you require a hash function that does not +// change over time, you should have unittests enforcing this +// property. We already have several such functions; see +// hash_unittest.cc for the details and unittests. + +#ifndef UTIL_HASH_HASH_H_ +#define UTIL_HASH_HASH_H_ + +#include +#include // for uintptr_t +#include +#include +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_map; // hacky way to make sure we import standard hash<> fns +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_set; +#include +#include + +#include "kudu/gutil/casts.h" +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/gutil/hash/hash128to64.h" +#include "kudu/gutil/hash/jenkins.h" +#include "kudu/gutil/hash/jenkins_lookup2.h" +#include "kudu/gutil/hash/legacy_hash.h" +#include "kudu/gutil/hash/string_hash.h" + +#include +namespace __gnu_cxx { + + +// STLport and MSVC 10.0 above already define these. +#if !defined(_STLP_LONG_LONG) && !(defined(_MSC_VER) && _MSC_VER >= 1600) + +#if defined(_MSC_VER) +// MSVC's stl implementation with _MSC_VER less than 1600 doesn't have +// this hash struct. STLport already defines this. +template +struct hash { + size_t operator()(const T& t) const; +}; +#endif // defined(_MSC_VER) + +#endif // !defined(_STLP_LONG_LONG) && !(defined(_MSC_VER) && _MSC_VER >= 1600) + +template<> struct hash { + size_t operator()(bool x) const { return static_cast(x); } +}; + + +} // namespace __gnu_cxx + + + +// ---------------------------------------------------------------------- +// Fingerprint() +// Not recommended for new code. Instead, use Fingerprint2011(), +// a higher-quality and faster hash function. See fingerprint2011.h. +// +// Fingerprinting a string (or char*) will never return 0 or 1, +// in case you want a couple of special values. However, +// fingerprinting a numeric type may produce 0 or 1. 
+// +// The hash mapping of Fingerprint() will never change. +// +// Note: AVOID USING FINGERPRINT if at all possible. Use +// Fingerprint2011 (in fingerprint2011.h) instead. +// Fingerprint() is susceptible to collisions for even short +// strings with low edit distance; see +// Example collisions: +// "01056/02" vs. "11057/02" +// "LTA 02" vs. "MTA 12" +// The same study found only one collision each for CityHash64() and +// MurmurHash64(), from more than 2^32 inputs, and on medium-length +// strings with large edit distances.These issues, among others, +// led to the recommendation that new code should avoid Fingerprint(). +// ---------------------------------------------------------------------- +extern uint64 FingerprintReferenceImplementation(const char *s, uint32 len); +extern uint64 FingerprintInterleavedImplementation(const char *s, uint32 len); +inline uint64 Fingerprint(const char *s, uint32 len) { + if (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers. + // The better choice when we have a decent number of registers. + return FingerprintInterleavedImplementation(s, len); + } else { + return FingerprintReferenceImplementation(s, len); + } +} + +// Routine that combines together the hi/lo part of a fingerprint +// and changes the result appropriately to avoid returning 0/1. 
+inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) { + uint64 result = (static_cast(hi) << 32) | static_cast(lo); + if ((hi == 0) && (lo < 2)) { + result ^= GG_ULONGLONG(0x130f9bef94a0a928); + } + return result; +} + +inline uint64 Fingerprint(const std::string& s) { + return Fingerprint(s.data(), static_cast(s.size())); +} +inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) { + return Hash64StringWithSeed(s.data(), static_cast(s.size()), c); +} +inline uint64 Fingerprint(schar c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(char c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(uint16 c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(int16 c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(uint32 c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(int32 c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(uint64 c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} +inline uint64 Fingerprint(int64 c) { + return Hash64NumWithSeed(static_cast(c), MIX64); +} + +// This concatenates two 64-bit fingerprints. It is a convenience function to +// get a fingerprint for a combination of already fingerprinted components. +// It assumes that each input is already a good fingerprint itself. +// Note that this is legacy code and new code should use its replacement +// FingerprintCat2011(). +// +// Note that in general it's impossible to construct Fingerprint(str) +// from the fingerprints of substrings of str. One shouldn't expect +// FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate +// anything about Fingerprint(StrCat(x, y)). +inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) { + return Hash64NumWithSeed(fp1, fp2); +} + +#include +namespace __gnu_cxx { + + +// This intended to be a "good" hash function. 
It may change from time to time. +template<> struct hash { + size_t operator()(const uint128& x) const { + if (sizeof(&x) == 8) { // 64-bit systems have 8-byte pointers. + return Hash128to64(x); + } else { + uint32 a = static_cast(Uint128Low64(x)) + + static_cast(0x9e3779b9UL); + uint32 b = static_cast(Uint128Low64(x) >> 32) + + static_cast(0x9e3779b9UL); + uint32 c = static_cast(Uint128High64(x)) + MIX32; + mix(a, b, c); + a += static_cast(Uint128High64(x) >> 32); + mix(a, b, c); + return c; + } + } + // Less than operator for MSVC use. + bool operator()(const uint128& a, const uint128& b) const { + return a < b; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + +// Avoid collision with definition in port_hash.h (via port.h). +#ifndef HAVE_DEFINED_HASH_FOR_POINTERS +#define HAVE_DEFINED_HASH_FOR_POINTERS +// Hash pointers as if they were int's, but bring more entropy to +// the lower bits. +template struct hash { + size_t operator()(T *x) const { + size_t k = reinterpret_cast(x); + return k + (k >> 6); + } +}; +#endif // HAVE_DEFINED_HASH_FOR_POINTERS + +#if defined(__GNUC__) +// Use our nice hash function for strings +template +struct hash > { + size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { + return HashTo32(k.data(), static_cast(k.length())); + } +}; + +// they don't define a hash for const string at all +template<> struct hash { + size_t operator()(const std::string& k) const { + return HashTo32(k.data(), static_cast(k.length())); + } +}; +#endif // defined(__GNUC__) + +// MSVC's STL requires an ever-so slightly different decl +#if defined(STL_MSVC) +template<> struct hash { + size_t operator()(char const* const k) const { + return HashTo32(k, strlen(k)); + } + // Less than operator: + bool operator()(char const* const a, char const* const b) const { + return strcmp(a, b) < 0; + } + static const size_t bucket_size = 4; // These are 
required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + +// MSVC 10.0 and above have already defined this. +#if !defined(_MSC_VER) || _MSC_VER < 1600 +template<> struct hash { + size_t operator()(const std::string& k) const { + return HashTo32(k.data(), k.length()); + } + // Less than operator: + bool operator()(const std::string& a, const std::string& b) const { + return a < b; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; +#endif // !defined(_MSC_VER) || _MSC_VER < 1600 + +#endif // defined(STL_MSVC) + +// Hasher for STL pairs. Requires hashers for both members to be defined +template +struct hash > { + size_t operator()(const pair& p) const { + size_t h1 = hash()(p.first); + size_t h2 = hash()(p.second); + // The decision below is at compile time + return (sizeof(h1) <= sizeof(uint32)) ? + Hash32NumWithSeed(h1, h2) + : Hash64NumWithSeed(h1, h2); + } + // Less than operator for MSVC. + bool operator()(const pair& a, + const pair& b) const { + return a < b; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + + +} // namespace __gnu_cxx + + +// If you want an excellent string hash function, and you don't mind if it +// might change when you sync and recompile, please use GoodFastHash<>. +// For most applications, GoodFastHash<> is a good choice, better than +// hash or hash or similar. GoodFastHash<> can change +// from time to time and may differ across platforms, and we'll strive +// to keep improving it. +// +// By the way, when deleting the contents of a hash_set of pointers, it is +// unsafe to delete *iterator because the hash function may be called on +// the next iterator advance. Use STLDeleteContainerPointers(). + +template struct GoodFastHash; + +// This intended to be a "good" hash function. It may change from time to time. 
+template<> struct GoodFastHash { + size_t operator()(const char* s) const { + return HashStringThoroughly(s, strlen(s)); + } + // Less than operator for MSVC. + bool operator()(const char* a, const char* b) const { + return strcmp(a, b) < 0; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + +// This intended to be a "good" hash function. It may change from time to time. +template<> struct GoodFastHash { + size_t operator()(const char* s) const { + return HashStringThoroughly(s, strlen(s)); + } + // Less than operator for MSVC. + bool operator()(const char* a, const char* b) const { + return strcmp(a, b) < 0; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + +// This intended to be a "good" hash function. It may change from time to time. +template +struct GoodFastHash > { + size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { + return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); + } + // Less than operator for MSVC. + bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, + const std::basic_string<_CharT, _Traits, _Alloc>& b) const { + return a < b; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + +// This intended to be a "good" hash function. It may change from time to time. +template +struct GoodFastHash > { + size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { + return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); + } + // Less than operator for MSVC. 
+ bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, + const std::basic_string<_CharT, _Traits, _Alloc>& b) const { + return a < b; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; + +// Extern template declarations. +// +// gcc only for now. msvc and others: this technique is likely to work with +// your compiler too. changelists welcome. +// +// This technique is limited to template specializations whose hash key +// functions are declared in this file. + +#if defined(__GNUC__) +#include +namespace __gnu_cxx { + +extern template class hash_set; +extern template class hash_map; + +} // namespace __gnu_cxx + +#endif // defined(__GNUC__) + +#endif // UTIL_HASH_HASH_H_ diff --git a/src/kudu/gutil/hash/hash128to64.h b/src/kudu/gutil/hash/hash128to64.h new file mode 100644 index 000000000000..481a0101ae6f --- /dev/null +++ b/src/kudu/gutil/hash/hash128to64.h @@ -0,0 +1,24 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Authors: jyrki@google.com (Jyrki Alakuijala), gpike@google.com (Geoff Pike) + +#ifndef UTIL_HASH_HASH128TO64_H_ +#define UTIL_HASH_HASH128TO64_H_ + +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" + +// Hash 128 input bits down to 64 bits of output. +// This is intended to be a reasonably good hash function. +// It may change from time to time. +inline uint64 Hash128to64(const uint128& x) { + // Murmur-inspired hashing. + const uint64 kMul = 0xc6a4a7935bd1e995ULL; + uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; + a ^= (a >> 47); + uint64 b = (Uint128High64(x) ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; +} + +#endif // UTIL_HASH_HASH128TO64_H_ diff --git a/src/kudu/gutil/hash/jenkins.cc b/src/kudu/gutil/hash/jenkins.cc new file mode 100644 index 000000000000..70a7e30d43e4 --- /dev/null +++ b/src/kudu/gutil/hash/jenkins.cc @@ -0,0 +1,188 @@ +// Copyright 2011 Google Inc. All Rights Reserved. 
+// +// Contains the legacy Bob Jenkins Lookup2-based hashing routines. These need to +// always return the same results as their values have been recorded in various +// places and cannot easily be updated. +// +// Original Author: Sanjay Ghemawat +// +// This is based on Bob Jenkins newhash function +// see: http://burtleburtle.net/bob/hash/evahash.html +// According to http://burtleburtle.net/bob/c/lookup2.c, +// his implementation is public domain. +// +// The implementation here is backwards compatible with the google1 +// implementation. The google1 implementation used a 'signed char *' +// to load words from memory a byte at a time. See gwshash.cc for an +// implementation that is compatible with Bob Jenkins' lookup2.c. + +#include "kudu/gutil/hash/jenkins.h" + +#include "kudu/gutil/integral_types.h" +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/hash/jenkins_lookup2.h" + +static inline uint32 char2unsigned(char c) { + return static_cast(static_cast(c)); +} + +static inline uint64 char2unsigned64(char c) { + return static_cast(static_cast(c)); +} + +uint32 Hash32StringWithSeedReferenceImplementation(const char *s, uint32 len, + uint32 c) { + uint32 a, b; + uint32 keylen; + + a = b = 0x9e3779b9UL; // the golden ratio; an arbitrary value + + for ( keylen = len; keylen >= 3*sizeof(a); + keylen -= static_cast(3*sizeof(a)), s += 3*sizeof(a) ) { + a += Google1At(s); + b += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a)*2); + mix(a,b,c); + } + + c += len; + switch ( keylen ) { // deal with rest. 
Cases fall through + case 11: c += char2unsigned(s[10]) << 24; + case 10: c += char2unsigned(s[9]) << 16; + case 9 : c += char2unsigned(s[8]) << 8; + // the first byte of c is reserved for the length + case 8 : b += Google1At(s+4); a += Google1At(s); break; + case 7 : b += char2unsigned(s[6]) << 16; + case 6 : b += char2unsigned(s[5]) << 8; + case 5 : b += char2unsigned(s[4]); + case 4 : a += Google1At(s); break; + case 3 : a += char2unsigned(s[2]) << 16; + case 2 : a += char2unsigned(s[1]) << 8; + case 1 : a += char2unsigned(s[0]); + // case 0 : nothing left to add + } + mix(a,b,c); + return c; +} + + +uint32 Hash32StringWithSeed(const char *s, uint32 len, uint32 c) { + uint32 a, b; + uint32 keylen; + + a = b = 0x9e3779b9UL; // the golden ratio; an arbitrary value + + keylen = len; + if (keylen >= 4 * sizeof(a)) { + uint32 word32AtOffset0 = Google1At(s); + do { + a += word32AtOffset0; + b += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a) * 2); + s += 3 * sizeof(a); + word32AtOffset0 = Google1At(s); + mix(a, b, c); + keylen -= 3 * static_cast(sizeof(a)); + } while (keylen >= 4 * sizeof(a)); + if (keylen >= 3 * sizeof(a)) { + a += word32AtOffset0; + b += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a) * 2); + s += 3 * sizeof(a); + mix(a, b, c); + keylen -= 3 * static_cast(sizeof(a)); + DCHECK_LT(keylen, sizeof(a)); + c += len; + switch ( keylen ) { // deal with rest. Cases fall through + case 3 : a += char2unsigned(s[2]) << 16; + case 2 : a += char2unsigned(s[1]) << 8; + case 1 : a += char2unsigned(s[0]); + } + } else { + DCHECK(sizeof(a) <= keylen && keylen < 3 * sizeof(a)); + c += len; + switch ( keylen ) { // deal with rest. 
Cases fall through + case 11: c += char2unsigned(s[10]) << 24; + case 10: c += char2unsigned(s[9]) << 16; + case 9 : c += char2unsigned(s[8]) << 8; + case 8 : b += Google1At(s+4); a += word32AtOffset0; break; + case 7 : b += char2unsigned(s[6]) << 16; + case 6 : b += char2unsigned(s[5]) << 8; + case 5 : b += char2unsigned(s[4]); + case 4 : a += word32AtOffset0; break; + } + } + } else { + if (keylen >= 3 * sizeof(a)) { + a += Google1At(s); + b += Google1At(s + sizeof(a)); + c += Google1At(s + sizeof(a) * 2); + s += 3 * sizeof(a); + mix(a, b, c); + keylen -= 3 * static_cast(sizeof(a)); + } + c += len; + switch ( keylen ) { // deal with rest. Cases fall through + case 11: c += char2unsigned(s[10]) << 24; + case 10: c += char2unsigned(s[9]) << 16; + case 9 : c += char2unsigned(s[8]) << 8; + case 8 : b += Google1At(s+4); a += Google1At(s); break; + case 7 : b += char2unsigned(s[6]) << 16; + case 6 : b += char2unsigned(s[5]) << 8; + case 5 : b += char2unsigned(s[4]); + case 4 : a += Google1At(s); break; + case 3 : a += char2unsigned(s[2]) << 16; + case 2 : a += char2unsigned(s[1]) << 8; + case 1 : a += char2unsigned(s[0]); + } + } + mix(a, b, c); + return c; +} + +uint64 Hash64StringWithSeed(const char *s, uint32 len, uint64 c) { + uint64 a, b; + uint32 keylen; + + a = b = GG_ULONGLONG(0xe08c1d668b756f82); // the golden ratio; an arbitrary value + + for ( keylen = len; keylen >= 3 * sizeof(a); + keylen -= 3 * static_cast(sizeof(a)), s += 3 * sizeof(a) ) { + a += Word64At(s); + b += Word64At(s + sizeof(a)); + c += Word64At(s + sizeof(a) * 2); + mix(a,b,c); + } + + c += len; + switch ( keylen ) { // deal with rest. 
Cases fall through + case 23: c += char2unsigned64(s[22]) << 56; + case 22: c += char2unsigned64(s[21]) << 48; + case 21: c += char2unsigned64(s[20]) << 40; + case 20: c += char2unsigned64(s[19]) << 32; + case 19: c += char2unsigned64(s[18]) << 24; + case 18: c += char2unsigned64(s[17]) << 16; + case 17: c += char2unsigned64(s[16]) << 8; + // the first byte of c is reserved for the length + case 16: b += Word64At(s+8); a += Word64At(s); break; + case 15: b += char2unsigned64(s[14]) << 48; + case 14: b += char2unsigned64(s[13]) << 40; + case 13: b += char2unsigned64(s[12]) << 32; + case 12: b += char2unsigned64(s[11]) << 24; + case 11: b += char2unsigned64(s[10]) << 16; + case 10: b += char2unsigned64(s[ 9]) << 8; + case 9: b += char2unsigned64(s[ 8]) ; + case 8: a += Word64At(s); break; + case 7: a += char2unsigned64(s[ 6]) << 48; + case 6: a += char2unsigned64(s[ 5]) << 40; + case 5: a += char2unsigned64(s[ 4]) << 32; + case 4: a += char2unsigned64(s[ 3]) << 24; + case 3: a += char2unsigned64(s[ 2]) << 16; + case 2: a += char2unsigned64(s[ 1]) << 8; + case 1: a += char2unsigned64(s[ 0]) ; + // case 0: nothing left to add + } + mix(a,b,c); + return c; +} diff --git a/src/kudu/gutil/hash/jenkins.h b/src/kudu/gutil/hash/jenkins.h new file mode 100644 index 000000000000..90a47ed1ef30 --- /dev/null +++ b/src/kudu/gutil/hash/jenkins.h @@ -0,0 +1,40 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// The core Jenkins Lookup2-based hashing routines. These are legacy hashing +// routines and should be avoided in new code. Their implementations are dated +// and cannot be changed due to values being recorded and breaking if not +// preserved. New code which explicitly desires this property should use the +// consistent hashing libraries. New code which does not explicitly desire this +// behavior should use the generic hashing routines in hash.h. 
+ +#ifndef UTIL_HASH_JENKINS_H_ +#define UTIL_HASH_JENKINS_H_ + +#include "kudu/gutil/integral_types.h" + +// ---------------------------------------------------------------------- +// Hash32StringWithSeed() +// Hash64StringWithSeed() +// Hash32NumWithSeed() +// Hash64NumWithSeed() +// These are Bob Jenkins' hash functions, one for 32 bit numbers +// and one for 64 bit numbers. Each takes a string as input and +// a start seed. Hashing the same string with two different seeds +// should give two independent hash values. +// The *Num*() functions just do a single mix, in order to +// convert the given number into something *random*. +// +// Note that these methods may return any value for the given size, while +// the corresponding HashToXX() methods avoids certain reserved values. +// ---------------------------------------------------------------------- + +// These slow down a lot if inlined, so do not inline them --Sanjay +uint32 Hash32StringWithSeed(const char *s, uint32 len, uint32 c); +uint64 Hash64StringWithSeed(const char *s, uint32 len, uint64 c); + +// This is a reference implementation of the same fundamental algorithm as +// Hash32StringWithSeed. It is used primarily as a performance metric. +uint32 Hash32StringWithSeedReferenceImplementation(const char *s, + uint32 len, uint32 c); + +#endif // UTIL_HASH_JENKINS_H_ diff --git a/src/kudu/gutil/hash/jenkins_lookup2.h b/src/kudu/gutil/hash/jenkins_lookup2.h new file mode 100644 index 000000000000..e6ffa843cc9e --- /dev/null +++ b/src/kudu/gutil/hash/jenkins_lookup2.h @@ -0,0 +1,156 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Legacy implementation of the core Jenkins lookup2 algorithm. This is used in +// many older hash functions which we are unable to remove or change due to the +// values being recorded. New code should not use any of these routines and +// should not include this header file. It pollutes the global namespace with +// the 'mix' function. 
+// +// This file contains the basic hash "mix" code which is widely referenced. +// +// This file also contains routines used to load an unaligned little-endian +// word from memory. This relatively generic functionality probably +// shouldn't live in this file. + +#ifndef UTIL_HASH_JENKINS_LOOKUP2_H_ +#define UTIL_HASH_JENKINS_LOOKUP2_H_ + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/port.h" + +// ---------------------------------------------------------------------- +// mix() +// The hash function I use is due to Bob Jenkins (see +// http://burtleburtle.net/bob/hash/index.html). +// Each mix takes 36 instructions, in 18 cycles if you're lucky. +// +// On x86 architectures, this requires 45 instructions in 27 cycles, +// if you're lucky. +// ---------------------------------------------------------------------- + +static inline void mix(uint32& a, uint32& b, uint32& c) { // 32bit version + a -= b; a -= c; a ^= (c>>13); + b -= c; b -= a; b ^= (a<<8); + c -= a; c -= b; c ^= (b>>13); + a -= b; a -= c; a ^= (c>>12); + b -= c; b -= a; b ^= (a<<16); + c -= a; c -= b; c ^= (b>>5); + a -= b; a -= c; a ^= (c>>3); + b -= c; b -= a; b ^= (a<<10); + c -= a; c -= b; c ^= (b>>15); +} + +static inline void mix(uint64& a, uint64& b, uint64& c) { // 64bit version + a -= b; a -= c; a ^= (c>>43); + b -= c; b -= a; b ^= (a<<9); + c -= a; c -= b; c ^= (b>>8); + a -= b; a -= c; a ^= (c>>38); + b -= c; b -= a; b ^= (a<<23); + c -= a; c -= b; c ^= (b>>5); + a -= b; a -= c; a ^= (c>>35); + b -= c; b -= a; b ^= (a<<49); + c -= a; c -= b; c ^= (b>>11); + a -= b; a -= c; a ^= (c>>12); + b -= c; b -= a; b ^= (a<<18); + c -= a; c -= b; c ^= (b>>22); +} + + +// Load an unaligned little endian word from memory. +// +// These routines are named Word32At(), Word64At() and Google1At(). +// Long ago, the 32-bit version of this operation was implemented using +// signed characters. The hash function that used this variant creates +// persistent hash values. 
The hash routine needs to remain backwards +// compatible, so we renamed the word loading function 'Google1At' to +// make it clear this implements special functionality. +// +// If a machine has alignment constraints or is big endian, we must +// load the word a byte at a time. Otherwise we can load the whole word +// from memory. +// +// [Plausibly, Word32At() and Word64At() should really be called +// UNALIGNED_LITTLE_ENDIAN_LOAD32() and UNALIGNED_LITTLE_ENDIAN_LOAD64() +// but that seems overly verbose.] + +#if !defined(NEED_ALIGNED_LOADS) && defined(IS_LITTLE_ENDIAN) +static inline uint64 Word64At(const char *ptr) { + return UNALIGNED_LOAD64(ptr); +} + +static inline uint32 Word32At(const char *ptr) { + return UNALIGNED_LOAD32(ptr); +} + +// This produces the same results as the byte-by-byte version below. +// Here, we mask off the sign bits and subtract off two copies. To +// see why this is the same as adding together the sign extensions, +// start by considering the low-order byte. If we loaded an unsigned +// word and wanted to sign extend it, we isolate the sign bit and subtract +// that from zero which gives us a sequence of bits matching the sign bit +// at and above the sign bit. If we remove (subtract) the sign bit and +// add in the low order byte, we now have a sign-extended byte as desired. +// We can then operate on all four bytes in parallel because addition +// is associative and commutative. +// +// For example, consider sign extending the bytes 0x01 and 0x81. For 0x01, +// the sign bit is zero, and 0x01 - 0 -0 = 1. For 0x81, the sign bit is 1 +// and we are computing 0x81 - 0x80 + (-0x80) == 0x01 + 0xFFFFFF80. +// +// Similarily, if we start with 0x8200 and want to sign extend that, +// we end up calculating 0x8200 - 0x8000 + (-0x8000) == 0xFFFF8000 + 0x0200 +// +// Suppose we have two bytes at the same time. Doesn't the adding of all +// those F's generate something wierd? Ignore the F's and reassociate +// the addition. 
For 0x8281, processing the bytes one at a time (like +// we used to do) calculates +// [0x8200 - 0x8000 + (-0x8000)] + [0x0081 - 0x80 + (-0x80)] +// == 0x8281 - 0x8080 - 0x8000 - 0x80 +// == 0x8281 - 0x8080 - 0x8080 + +static inline uint32 Google1At(const char *ptr) { + uint32 t = UNALIGNED_LOAD32(ptr); + uint32 masked = t & 0x80808080; + return t - masked - masked; +} + +#else + +// NOTE: This code is not normally used or tested. + +static inline uint64 Word64At(const char *ptr) { + return (static_cast(ptr[0]) + + (static_cast(ptr[1]) << 8) + + (static_cast(ptr[2]) << 16) + + (static_cast(ptr[3]) << 24) + + (static_cast(ptr[4]) << 32) + + (static_cast(ptr[5]) << 40) + + (static_cast(ptr[6]) << 48) + + (static_cast(ptr[7]) << 56)); +} + +static inline uint32 Word32At(const char *ptr) { + return (static_cast(ptr[0]) + + (static_cast(ptr[1]) << 8) + + (static_cast(ptr[2]) << 16) + + (static_cast(ptr[3]) << 24)); +} + +static inline uint32 Google1At(const char *ptr2) { + const schar * ptr = reinterpret_cast(ptr2); + return (static_cast(ptr[0]) + + (static_cast(ptr[1]) << 8) + + (static_cast(ptr[2]) << 16) + + (static_cast(ptr[3]) << 24)); +} + +#endif /* !NEED_ALIGNED_LOADS && IS_LITTLE_ENDIAN */ + +// Historically, WORD_HASH has always been defined as we always run on +// machines that don't NEED_ALIGNED_LOADS and which IS_LITTLE_ENDIAN. +// +// TODO(user): find occurences of WORD_HASH and adjust the code to +// use more meaningful concepts. +# define WORD_HASH + +#endif // UTIL_HASH_JENKINS_LOOKUP2_H_ diff --git a/src/kudu/gutil/hash/legacy_hash.h b/src/kudu/gutil/hash/legacy_hash.h new file mode 100644 index 000000000000..3a69336f8cfb --- /dev/null +++ b/src/kudu/gutil/hash/legacy_hash.h @@ -0,0 +1,84 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// This is a library of legacy hashing routines. These routines are still in +// use, but are not encouraged for any new code, and may be removed at some +// point in the future. 
+// +// New code should use one of the targeted libraries that provide hash +// interfaces for the types needed. See //util/hash/README for details. + +#ifndef UTIL_HASH_LEGACY_HASH_H_ +#define UTIL_HASH_LEGACY_HASH_H_ + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/hash/builtin_type_hash.h" +#include "kudu/gutil/hash/string_hash.h" + +// Hash8, Hash16 and Hash32 are for legacy use only. +typedef uint32 Hash32; +typedef uint16 Hash16; +typedef uint8 Hash8; + +const Hash32 kIllegalHash32 = static_cast(0xffffffffUL); +const Hash16 kIllegalHash16 = static_cast(0xffff); + +static const uint32 MIX32 = 0x12b9b0a1UL; // pi; an arbitrary number +static const uint64 MIX64 = GG_ULONGLONG(0x2b992ddfa23249d6); // more of pi + +// ---------------------------------------------------------------------- +// HashTo32() +// HashTo16() +// These functions take various types of input (through operator +// overloading) and return 32 or 16 bit quantities, respectively. +// The basic rule of our hashing is: always mix(). Thus, even for +// char outputs we cast to a uint32 and mix with two arbitrary numbers. +// HashTo32 never returns kIllegalHash32, and similary, +// HashTo16 never returns kIllegalHash16. +// +// Note that these methods avoid returning certain reserved values, while +// the corresponding HashXXStringWithSeed() methods may return any value. +// ---------------------------------------------------------------------- + +// This macro defines the HashTo32 and HashTo16 versions all in one go. +// It takes the argument list and a command that hashes your number. +// (For 16 we just mod retval before returning it.) Example: +// HASH_TO((char c), Hash32NumWithSeed(c, MIX32_1)) +// evaluates to +// uint32 retval; +// retval = Hash32NumWithSeed(c, MIX32_1); +// return retval == kIllegalHash32 ? retval-1 : retval; +// + +#define HASH_TO(arglist, command) \ +inline uint32 HashTo32 arglist { \ + uint32 retval = command; \ + return retval == kIllegalHash32 ? 
retval-1 : retval; \ +} + +// This defines: +// HashToXX(char *s, int slen); +// HashToXX(char c); +// etc + +HASH_TO((const char *s, uint32 slen), Hash32StringWithSeed(s, slen, MIX32)) +HASH_TO((const wchar_t *s, uint32 slen), + Hash32StringWithSeed(reinterpret_cast(s), + static_cast(sizeof(wchar_t) * slen), + MIX32)) +HASH_TO((char c), Hash32NumWithSeed(static_cast(c), MIX32)) +HASH_TO((schar c), Hash32NumWithSeed(static_cast(c), MIX32)) +HASH_TO((uint16 c), Hash32NumWithSeed(static_cast(c), MIX32)) +HASH_TO((int16 c), Hash32NumWithSeed(static_cast(c), MIX32)) +HASH_TO((uint32 c), Hash32NumWithSeed(static_cast(c), MIX32)) +HASH_TO((int32 c), Hash32NumWithSeed(static_cast(c), MIX32)) +HASH_TO((uint64 c), static_cast(Hash64NumWithSeed(c, MIX64) >> 32)) +HASH_TO((int64 c), static_cast(Hash64NumWithSeed(c, MIX64) >> 32)) + +#undef HASH_TO // clean up the macro space + +inline uint16 HashTo16(const char *s, uint32 slen) { + uint16 retval = Hash32StringWithSeed(s, slen, MIX32) >> 16; + return retval == kIllegalHash16 ? static_cast(retval-1) : retval; +} + +#endif // UTIL_HASH_LEGACY_HASH_H_ diff --git a/src/kudu/gutil/hash/string_hash.h b/src/kudu/gutil/hash/string_hash.h new file mode 100644 index 000000000000..d8c20f3a706b --- /dev/null +++ b/src/kudu/gutil/hash/string_hash.h @@ -0,0 +1,85 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// These are the core hashing routines which operate on strings. We define +// strings loosely as a sequence of bytes, and these routines are designed to +// work with the most fundamental representations of a string of bytes. +// +// These routines provide "good" hash functions in terms of both quality and +// speed. Their values can and will change as their implementations change and +// evolve. 
+ +#ifndef UTIL_HASH_STRING_HASH_H_ +#define UTIL_HASH_STRING_HASH_H_ + +#include + +#include "kudu/gutil/port.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/gutil/hash/jenkins.h" +#include "kudu/gutil/hash/jenkins_lookup2.h" + +namespace hash_internal { + +// We have some special cases for 64-bit hardware and x86-64 in particular. +// Instead of sprinkling ifdefs through the file, we have one ugly ifdef here. +// Later code can then use "if" instead of "ifdef". +#if defined(__x86_64__) +enum { x86_64 = true, sixty_four_bit = true }; +#elif defined(_LP64) +enum { x86_64 = false, sixty_four_bit = true }; +#else +enum { x86_64 = false, sixty_four_bit = false }; +#endif + +// Arbitrary mix constants (pi). +static const uint32 kMix32 = 0x12b9b0a1UL; +static const uint64 kMix64 = GG_ULONGLONG(0x2b992ddfa23249d6); + +} // namespace hash_internal + +inline size_t HashStringThoroughlyWithSeed(const char* s, size_t len, + size_t seed) { + if (hash_internal::x86_64) + return static_cast(util_hash::CityHash64WithSeed(s, len, seed)); + + if (hash_internal::sixty_four_bit) + return Hash64StringWithSeed(s, static_cast(len), seed); + + return static_cast(Hash32StringWithSeed(s, static_cast(len), + static_cast(seed))); +} + +inline size_t HashStringThoroughly(const char* s, size_t len) { + if (hash_internal::x86_64) + return static_cast(util_hash::CityHash64(s, len)); + + if (hash_internal::sixty_four_bit) + return Hash64StringWithSeed(s, static_cast(len), + hash_internal::kMix64); + + return static_cast(Hash32StringWithSeed(s, static_cast(len), + hash_internal::kMix32)); +} + +inline size_t HashStringThoroughlyWithSeeds(const char* s, size_t len, + size_t seed0, size_t seed1) { + if (hash_internal::x86_64) + return util_hash::CityHash64WithSeeds(s, len, seed0, seed1); + + if (hash_internal::sixty_four_bit) { + uint64 a = seed0; + uint64 b = seed1; + uint64 c = HashStringThoroughly(s, len); + mix(a, b, c); + return c; + } + + uint32 a 
= static_cast(seed0); + uint32 b = static_cast(seed1); + uint32 c = static_cast(HashStringThoroughly(s, len)); + mix(a, b, c); + return c; +} + +#endif // UTIL_HASH_STRING_HASH_H_ diff --git a/src/kudu/gutil/int128.cc b/src/kudu/gutil/int128.cc new file mode 100644 index 000000000000..eeaee2f35dc2 --- /dev/null +++ b/src/kudu/gutil/int128.cc @@ -0,0 +1,19 @@ +// Copyright 2004 Google Inc. +// All Rights Reserved. +// +// + +#include +using std::cout; +using std::endl; +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" + +const uint128_pod kuint128max = { + static_cast(GG_LONGLONG(0xFFFFFFFFFFFFFFFF)), + static_cast(GG_LONGLONG(0xFFFFFFFFFFFFFFFF)) +}; + +std::ostream& operator<<(std::ostream& o, const uint128& b) { + return (o << b.hi_ << "::" << b.lo_); +} diff --git a/src/kudu/gutil/int128.h b/src/kudu/gutil/int128.h new file mode 100644 index 000000000000..2a19cca24320 --- /dev/null +++ b/src/kudu/gutil/int128.h @@ -0,0 +1,332 @@ +// Copyright 2004 Google Inc. +// All Rights Reserved. +// + +#ifndef BASE_INT128_H_ +#define BASE_INT128_H_ + +#include +using std::ostream; +#include "kudu/gutil/integral_types.h" + +struct uint128_pod; + +// An unsigned 128-bit integer type. Thread-compatible. +class uint128 { + public: + uint128(); // Sets to 0, but don't trust on this behavior. + uint128(uint64 top, uint64 bottom); +#ifndef SWIG + uint128(int bottom); + uint128(uint32 bottom); // Top 96 bits = 0 +#endif + uint128(uint64 bottom); // hi_ = 0 + uint128(const uint128 &val); + uint128(const uint128_pod &val); + + void Initialize(uint64 top, uint64 bottom); + + uint128& operator=(const uint128& b); + + // Arithmetic operators. + // TODO: division, etc. 
+ uint128& operator+=(const uint128& b); + uint128& operator-=(const uint128& b); + uint128& operator*=(const uint128& b); + uint128 operator++(int); + uint128 operator--(int); + uint128& operator<<=(int); + uint128& operator>>=(int); + uint128& operator&=(const uint128& b); + uint128& operator|=(const uint128& b); + uint128& operator^=(const uint128& b); + uint128& operator++(); + uint128& operator--(); + + friend uint64 Uint128Low64(const uint128& v); + friend uint64 Uint128High64(const uint128& v); + + // We add "std::" to avoid including all of port.h. + friend std::ostream& operator<<(std::ostream& o, const uint128& b); + + private: + // Little-endian memory order optimizations can benefit from + // having lo_ first, hi_ last. + // See util/endian/endian.h and Load128/Store128 for storing a uint128. + uint64 lo_; + uint64 hi_; + + // Not implemented, just declared for catching automatic type conversions. + uint128(uint8); + uint128(uint16); + uint128(float v); + uint128(double v); +}; + +// This is a POD form of uint128 which can be used for static variables which +// need to be operated on as uint128. +struct uint128_pod { + // Note: The ordering of fields is different than 'class uint128' but the + // same as its 2-arg constructor. This enables more obvious initialization + // of static instances, which is the primary reason for this struct in the + // first place. This does not seem to defeat any optimizations wrt + // operations involving this struct. + uint64 hi; + uint64 lo; +}; + +extern const uint128_pod kuint128max; + +// allow uint128 to be logged +extern std::ostream& operator<<(std::ostream& o, const uint128& b); + +// Methods to access low and high pieces of 128-bit value. +// Defined externally from uint128 to facilitate conversion +// to native 128-bit types when compilers support them. 
+inline uint64 Uint128Low64(const uint128& v) { return v.lo_; } +inline uint64 Uint128High64(const uint128& v) { return v.hi_; } + +// TODO: perhaps it would be nice to have int128, a signed 128-bit type? + +// -------------------------------------------------------------------------- +// Implementation details follow +// -------------------------------------------------------------------------- +inline bool operator==(const uint128& lhs, const uint128& rhs) { + return (Uint128Low64(lhs) == Uint128Low64(rhs) && + Uint128High64(lhs) == Uint128High64(rhs)); +} +inline bool operator!=(const uint128& lhs, const uint128& rhs) { + return !(lhs == rhs); +} +inline uint128& uint128::operator=(const uint128& b) { + lo_ = b.lo_; + hi_ = b.hi_; + return *this; +} + +inline uint128::uint128(): lo_(0), hi_(0) { } +inline uint128::uint128(uint64 top, uint64 bottom) : lo_(bottom), hi_(top) { } +inline uint128::uint128(const uint128 &v) : lo_(v.lo_), hi_(v.hi_) { } +inline uint128::uint128(const uint128_pod &v) : lo_(v.lo), hi_(v.hi) { } +inline uint128::uint128(uint64 bottom) : lo_(bottom), hi_(0) { } +#ifndef SWIG +inline uint128::uint128(uint32 bottom) : lo_(bottom), hi_(0) { } +inline uint128::uint128(int bottom) : lo_(bottom), hi_(0) { + if (bottom < 0) { + --hi_; + } +} +#endif +inline void uint128::Initialize(uint64 top, uint64 bottom) { + hi_ = top; + lo_ = bottom; +} + +// Comparison operators. + +#define CMP128(op) \ +inline bool operator op(const uint128& lhs, const uint128& rhs) { \ + return (Uint128High64(lhs) == Uint128High64(rhs)) ? 
\ + (Uint128Low64(lhs) op Uint128Low64(rhs)) : \ + (Uint128High64(lhs) op Uint128High64(rhs)); \ +} + +CMP128(<) +CMP128(>) +CMP128(>=) +CMP128(<=) + +#undef CMP128 + +// Unary operators + +inline uint128 operator-(const uint128& val) { + const uint64 hi_flip = ~Uint128High64(val); + const uint64 lo_flip = ~Uint128Low64(val); + const uint64 lo_add = lo_flip + 1; + if (lo_add < lo_flip) { + return uint128(hi_flip + 1, lo_add); + } + return uint128(hi_flip, lo_add); +} + +inline bool operator!(const uint128& val) { + return !Uint128High64(val) && !Uint128Low64(val); +} + +// Logical operators. + +inline uint128 operator~(const uint128& val) { + return uint128(~Uint128High64(val), ~Uint128Low64(val)); +} + +#define LOGIC128(op) \ +inline uint128 operator op(const uint128& lhs, const uint128& rhs) { \ + return uint128(Uint128High64(lhs) op Uint128High64(rhs), \ + Uint128Low64(lhs) op Uint128Low64(rhs)); \ +} + +LOGIC128(|) +LOGIC128(&) +LOGIC128(^) + +#undef LOGIC128 + +#define LOGICASSIGN128(op) \ +inline uint128& uint128::operator op(const uint128& other) { \ + hi_ op other.hi_; \ + lo_ op other.lo_; \ + return *this; \ +} + +LOGICASSIGN128(|=) +LOGICASSIGN128(&=) +LOGICASSIGN128(^=) + +#undef LOGICASSIGN128 + +// Shift operators. + +inline uint128 operator<<(const uint128& val, int amount) { + // uint64 shifts of >= 64 are undefined, so we will need some special-casing. + if (amount < 64) { + if (amount == 0) { + return val; + } + uint64 new_hi = (Uint128High64(val) << amount) | + (Uint128Low64(val) >> (64 - amount)); + uint64 new_lo = Uint128Low64(val) << amount; + return uint128(new_hi, new_lo); + } else if (amount < 128) { + return uint128(Uint128Low64(val) << (amount - 64), 0); + } else { + return uint128(0, 0); + } +} + +inline uint128 operator>>(const uint128& val, int amount) { + // uint64 shifts of >= 64 are undefined, so we will need some special-casing. 
+ if (amount < 64) { + if (amount == 0) { + return val; + } + uint64 new_hi = Uint128High64(val) >> amount; + uint64 new_lo = (Uint128Low64(val) >> amount) | + (Uint128High64(val) << (64 - amount)); + return uint128(new_hi, new_lo); + } else if (amount < 128) { + return uint128(0, Uint128High64(val) >> (amount - 64)); + } else { + return uint128(0, 0); + } +} + +inline uint128& uint128::operator<<=(int amount) { + // uint64 shifts of >= 64 are undefined, so we will need some special-casing. + if (amount < 64) { + if (amount != 0) { + hi_ = (hi_ << amount) | (lo_ >> (64 - amount)); + lo_ = lo_ << amount; + } + } else if (amount < 128) { + hi_ = lo_ << (amount - 64); + lo_ = 0; + } else { + hi_ = 0; + lo_ = 0; + } + return *this; +} + +inline uint128& uint128::operator>>=(int amount) { + // uint64 shifts of >= 64 are undefined, so we will need some special-casing. + if (amount < 64) { + if (amount != 0) { + lo_ = (lo_ >> amount) | (hi_ << (64 - amount)); + hi_ = hi_ >> amount; + } + } else if (amount < 128) { + hi_ = 0; + lo_ = hi_ >> (amount - 64); + } else { + hi_ = 0; + lo_ = 0; + } + return *this; +} + +inline uint128 operator+(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) += rhs; +} + +inline uint128 operator-(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) -= rhs; +} + +inline uint128 operator*(const uint128& lhs, const uint128& rhs) { + return uint128(lhs) *= rhs; +} + +inline uint128& uint128::operator+=(const uint128& b) { + hi_ += b.hi_; + uint64 lolo = lo_ + b.lo_; + if (lolo < lo_) + ++hi_; + lo_ = lolo; + return *this; +} + +inline uint128& uint128::operator-=(const uint128& b) { + hi_ -= b.hi_; + if (b.lo_ > lo_) + --hi_; + lo_ -= b.lo_; + return *this; +} + +inline uint128& uint128::operator*=(const uint128& b) { + uint64 a96 = hi_ >> 32; + uint64 a64 = hi_ & 0xffffffffu; + uint64 a32 = lo_ >> 32; + uint64 a00 = lo_ & 0xffffffffu; + uint64 b96 = b.hi_ >> 32; + uint64 b64 = b.hi_ & 0xffffffffu; + uint64 b32 = b.lo_ >> 
32; + uint64 b00 = b.lo_ & 0xffffffffu; + // multiply [a96 .. a00] x [b96 .. b00] + // terms higher than c96 disappear off the high side + // terms c96 and c64 are safe to ignore carry bit + uint64 c96 = a96 * b00 + a64 * b32 + a32 * b64 + a00 * b96; + uint64 c64 = a64 * b00 + a32 * b32 + a00 * b64; + this->hi_ = (c96 << 32) + c64; + this->lo_ = 0; + // add terms after this one at a time to capture carry + *this += uint128(a32 * b00) << 32; + *this += uint128(a00 * b32) << 32; + *this += a00 * b00; + return *this; +} + +inline uint128 uint128::operator++(int) { + uint128 tmp(*this); + *this += 1; + return tmp; +} + +inline uint128 uint128::operator--(int) { + uint128 tmp(*this); + *this -= 1; + return tmp; +} + +inline uint128& uint128::operator++() { + *this += 1; + return *this; +} + +inline uint128& uint128::operator--() { + *this -= 1; + return *this; +} + +#endif // BASE_INT128_H_ diff --git a/src/kudu/gutil/integral_types.h b/src/kudu/gutil/integral_types.h new file mode 100644 index 000000000000..cbcf917ea5ab --- /dev/null +++ b/src/kudu/gutil/integral_types.h @@ -0,0 +1,104 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// +// Basic integer type definitions for various platforms +// +// This code is compiled directly on many platforms, including client +// platforms like Windows, Mac, and embedded systems. Before making +// any changes here, make sure that you're not breaking any platforms. +// + +#ifndef BASE_INTEGRAL_TYPES_H_ +#define BASE_INTEGRAL_TYPES_H_ + +#include + +// These typedefs are also defined in base/google.swig. In the +// SWIG environment, we use those definitions and avoid duplicate +// definitions here with an ifdef. The definitions should be the +// same in both files, and ideally be only defined in this file. +#ifndef SWIG +// Standard typedefs +// All Google2 code is compiled with -funsigned-char to make "char" +// unsigned. Google2 code therefore doesn't need a "uchar" type. 
+typedef int8_t schar; +typedef int8_t int8; +typedef int16_t int16; +typedef int32_t int32; +#ifdef _MSC_VER +typedef __int64 int64; +#else +typedef int64_t int64; +#endif /* _MSC_VER */ + +// NOTE: unsigned types are DANGEROUS in loops and other arithmetical +// places. Use the signed types unless your variable represents a bit +// pattern (eg a hash value) or you really need the extra bit. Do NOT +// use 'unsigned' to express "this value should always be positive"; +// use assertions for this. + +typedef uint8_t uint8; +typedef uint16_t uint16; +typedef uint32_t uint32; +#ifdef _MSC_VER +typedef unsigned __int64 uint64; +#else +typedef uint64_t uint64; +#endif /* _MSC_VER */ + +// A type to represent a Unicode code-point value. As of Unicode 4.0, +// such values require up to 21 bits. +// (For type-checking on pointers, make this explicitly signed, +// and it should always be the signed version of whatever int32 is.) +typedef signed int char32; + +// A type to represent a natural machine word (for e.g. efficiently +// scanning through memory for checksums or index searching). Don't use +// this for storing normal integers. Ideally this would be just +// unsigned int, but our 64-bit architectures use the LP64 model +// (http://www.opengroup.org/public/tech/aspen/lp64_wp.htm), hence +// their ints are only 32 bits. We want to use the same fundamental +// type on all archs if possible to preserve *printf() compatability. 
+typedef unsigned long uword_t; + +#endif /* SWIG */ + +// long long macros to be used because gcc and vc++ use different suffixes, +// and different size specifiers in format strings +#undef GG_LONGLONG +#undef GG_ULONGLONG +#undef GG_LL_FORMAT + +#ifdef _MSC_VER /* if Visual C++ */ + +// VC++ long long suffixes +#define GG_LONGLONG(x) x##I64 +#define GG_ULONGLONG(x) x##UI64 + +#else /* not Visual C++ */ + +#define GG_LONGLONG(x) x##LL +#define GG_ULONGLONG(x) x##ULL + +#endif // _MSC_VER + + +static const uint8 kuint8max = (( uint8) 0xFF); +static const uint16 kuint16max = ((uint16) 0xFFFF); +static const uint32 kuint32max = ((uint32) 0xFFFFFFFF); +static const uint64 kuint64max = ((uint64) GG_LONGLONG(0xFFFFFFFFFFFFFFFF)); +static const int8 kint8min = (( int8) ~0x7F); +static const int8 kint8max = (( int8) 0x7F); +static const int16 kint16min = (( int16) ~0x7FFF); +static const int16 kint16max = (( int16) 0x7FFF); +static const int32 kint32min = (( int32) ~0x7FFFFFFF); +static const int32 kint32max = (( int32) 0x7FFFFFFF); +static const int64 kint64min = (( int64) GG_LONGLONG(~0x7FFFFFFFFFFFFFFF)); +static const int64 kint64max = (( int64) GG_LONGLONG(0x7FFFFFFFFFFFFFFF)); + +// TODO(user): remove this eventually. +// No object has kIllegalFprint as its Fingerprint. +typedef uint64 Fprint; +static const Fprint kIllegalFprint = 0; +static const Fprint kMaxFprint = GG_ULONGLONG(0xFFFFFFFFFFFFFFFF); + +#endif // BASE_INTEGRAL_TYPES_H_ diff --git a/src/kudu/gutil/linux_syscall_support.h b/src/kudu/gutil/linux_syscall_support.h new file mode 100644 index 000000000000..5476d0bfa664 --- /dev/null +++ b/src/kudu/gutil/linux_syscall_support.h @@ -0,0 +1,3680 @@ +/* Copyright (c) 2005-2008, Google Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Markus Gutschke + */ + +/* This file includes Linux-specific support functions common to the + * coredumper and the thread lister; primarily, this is a collection + * of direct system calls, and a couple of symbols missing from + * standard header files. 
+ * There are a few options that the including file can set to control + * the behavior of this file: + * + * SYS_CPLUSPLUS: + * The entire header file will normally be wrapped in 'extern "C" { }", + * making it suitable for compilation as both C and C++ source. If you + * do not want to do this, you can set the SYS_CPLUSPLUS macro to inhibit + * the wrapping. N.B. doing so will suppress inclusion of all prerequisite + * system header files, too. It is the caller's responsibility to provide + * the necessary definitions. + * + * SYS_ERRNO: + * All system calls will update "errno" unless overriden by setting the + * SYS_ERRNO macro prior to including this file. SYS_ERRNO should be + * an l-value. + * + * SYS_INLINE: + * New symbols will be defined "static inline", unless overridden by + * the SYS_INLINE macro. + * + * SYS_LINUX_SYSCALL_SUPPORT_H + * This macro is used to avoid multiple inclusions of this header file. + * If you need to include this file more than once, make sure to + * unset SYS_LINUX_SYSCALL_SUPPORT_H before each inclusion. + * + * SYS_PREFIX: + * New system calls will have a prefix of "sys_" unless overridden by + * the SYS_PREFIX macro. Valid values for this macro are [0..9] which + * results in prefixes "sys[0..9]_". It is also possible to set this + * macro to -1, which avoids all prefixes. + * + * This file defines a few internal symbols that all start with "LSS_". + * Do not access these symbols from outside this file. They are not part + * of the supported API. + */ +#ifndef SYS_LINUX_SYSCALL_SUPPORT_H +#define SYS_LINUX_SYSCALL_SUPPORT_H + +/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux. + * Porting to other related platforms should not be difficult. 
+ */ +#if (defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ + defined(__mips__) || defined(__PPC__)) && defined(__linux) + +#ifndef SYS_CPLUSPLUS +#ifdef __cplusplus +/* Some system header files in older versions of gcc neglect to properly + * handle being included from C++. As it appears to be harmless to have + * multiple nested 'extern "C"' blocks, just add another one here. + */ +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __mips__ +/* Include definitions of the ABI currently in use. */ +#include +#endif + +#endif + +/* As glibc often provides subtly incompatible data structures (and implicit + * wrapper functions that convert them), we provide our own kernel data + * structures for use by the system calls. + * These structures have been developed by using Linux 2.6.23 headers for + * reference. Note though, we do not care about exact API compatibility + * with the kernel, and in fact the kernel often does not have a single + * API that works across architectures. Instead, we try to mimic the glibc + * API where reasonable, and only guarantee ABI compatibility with the + * kernel headers. + * Most notably, here are a few changes that were made to the structures + * defined by kernel headers: + * + * - we only define structures, but not symbolic names for kernel data + * types. For the latter, we directly use the native C datatype + * (i.e. "unsigned" instead of "mode_t"). + * - in a few cases, it is possible to define identical structures for + * both 32bit (e.g. i386) and 64bit (e.g. x86-64) platforms by + * standardizing on the 64bit version of the data types. In particular, + * this means that we use "unsigned" where the 32bit headers say + * "unsigned long". + * - overall, we try to minimize the number of cases where we need to + * conditionally define different structures. 
+ * - the "struct kernel_sigaction" class of structures have been + * modified to more closely mimic glibc's API by introducing an + * anonymous union for the function pointer. + * - a small number of field names had to have an underscore appended to + * them, because glibc defines a global macro by the same name. + */ + +/* include/linux/dirent.h */ +struct kernel_dirent64 { + unsigned long long d_ino; + long long d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[256]; +}; + +/* include/linux/dirent.h */ +struct kernel_dirent { + long d_ino; + long d_off; + unsigned short d_reclen; + char d_name[256]; +}; + +/* include/linux/uio.h */ +struct kernel_iovec { + void *iov_base; + unsigned long iov_len; +}; + +/* include/linux/socket.h */ +struct kernel_msghdr { + void *msg_name; + int msg_namelen; + struct kernel_iovec*msg_iov; + unsigned long msg_iovlen; + void *msg_control; + unsigned long msg_controllen; + unsigned msg_flags; +}; + +/* include/asm-generic/poll.h */ +struct kernel_pollfd { + int fd; + short events; + short revents; +}; + +/* include/linux/resource.h */ +struct kernel_rlimit { + unsigned long rlim_cur; + unsigned long rlim_max; +}; + +/* include/linux/time.h */ +struct kernel_timespec { + long tv_sec; + long tv_nsec; +}; + +/* include/linux/time.h */ +struct kernel_timeval { + long tv_sec; + long tv_usec; +}; + +/* include/linux/resource.h */ +struct kernel_rusage { + struct kernel_timeval ru_utime; + struct kernel_timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +/* include/linux/capablilty.h */ +struct kernel_cap_user_header { + unsigned int version; + int pid; +}; + +struct kernel_cap_user_data { + unsigned int effective; + unsigned int permitted; + unsigned int inheritable; +}; + +struct siginfo; +#if 
defined(__i386__) || defined(__arm__) || defined(__PPC__) + +/* include/asm-{arm,i386,mips,ppc}/signal.h */ +struct kernel_old_sigaction { + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + unsigned long sa_mask; + unsigned long sa_flags; + void (*sa_restorer)(void); +} __attribute__((packed,aligned(4))); +#elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + #define kernel_old_sigaction kernel_sigaction +#endif + +/* Some kernel functions (e.g. sigaction() in 2.6.23) require that the + * exactly match the size of the signal set, even though the API was + * intended to be extensible. We define our own KERNEL_NSIG to deal with + * this. + * Please note that glibc provides signals [1.._NSIG-1], whereas the + * kernel (and this header) provides the range [1..KERNEL_NSIG]. The + * actual number of signals is obviously the same, but the constants + * differ by one. + */ +#ifdef __mips__ +#define KERNEL_NSIG 128 +#else +#define KERNEL_NSIG 64 +#endif + +/* include/asm-{arm,i386,mips,x86_64}/signal.h */ +struct kernel_sigset_t { + unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/ + (8*sizeof(unsigned long))]; +}; + +/* include/asm-{arm,i386,mips,x86_64,ppc}/signal.h */ +struct kernel_sigaction { +#ifdef __mips__ + unsigned long sa_flags; + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + struct kernel_sigset_t sa_mask; +#else + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + unsigned long sa_flags; + void (*sa_restorer)(void); + struct kernel_sigset_t sa_mask; +#endif +}; + +/* include/linux/socket.h */ +struct kernel_sockaddr { + unsigned short sa_family; + char sa_data[14]; +}; + +/* include/asm-{arm,i386,mips,ppc}/stat.h */ +#ifdef __mips__ +#if _MIPS_SIM == _MIPS_SIM_ABI64 +struct kernel_stat { +#else +struct kernel_stat64 { +#endif + unsigned st_dev; + unsigned __pad0[3]; + unsigned long long st_ino; 
+ unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned st_rdev; + unsigned __pad1[3]; + long long st_size; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned st_blksize; + unsigned __pad2; + unsigned long long st_blocks; +}; +#elif defined __PPC__ && !defined __PPC64__ +struct kernel_stat64 { + unsigned long long st_dev; + unsigned long long st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned long long st_rdev; + unsigned short int __pad2; + long long st_size; + long st_blksize; + long long st_blocks; + long st_atime_; + unsigned long st_atime_nsec_; + long st_mtime_; + unsigned long st_mtime_nsec_; + long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; +}; +#else +struct kernel_stat64 { + unsigned long long st_dev; + unsigned char __pad0[4]; + unsigned __st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned long long st_rdev; + unsigned char __pad3[4]; + long long st_size; + unsigned st_blksize; + unsigned long long st_blocks; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned long long st_ino; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h */ +#if defined(__i386__) || defined(__arm__) +struct kernel_stat { + /* The kernel headers suggest that st_dev and st_rdev should be 32bit + * quantities encoding 12bit major and 20bit minor numbers in an interleaved + * format. In reality, we do not see useful data in the top bits. So, + * we'll leave the padding in here, until we find a better solution. 
+ */ + unsigned short st_dev; + short pad1; + unsigned st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + short pad2; + unsigned st_size; + unsigned st_blksize; + unsigned st_blocks; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned __unused4; + unsigned __unused5; +}; +#elif defined(__x86_64__) +struct kernel_stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned st_mode; + unsigned st_uid; + unsigned st_gid; + unsigned __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + long __unused[3]; +}; +#elif defined(__PPC__) +struct kernel_stat { + unsigned long st_dev; + unsigned long st_ino; // ino_t +#ifdef __PPC64__ + unsigned long st_nlink; // nlink_t + unsigned int st_mode; // mode_t +#else + unsigned int st_mode; // mode_t + unsigned short st_nlink; // nlink_t +#endif + unsigned int st_uid; // uid_t + unsigned int st_gid; // gid_t + unsigned long st_rdev; + long st_size; // off_t + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; +#ifdef __PPC64__ + unsigned long __unused6; +#endif +}; +#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) +struct kernel_stat { + unsigned st_dev; + int st_pad1[3]; + unsigned st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned st_rdev; + int st_pad2[2]; + long st_size; + int st_pad3; + long st_atime_; + long 
st_atime_nsec_; + long st_mtime_; + long st_mtime_nsec_; + long st_ctime_; + long st_ctime_nsec_; + int st_blksize; + int st_blocks; + int st_pad4[14]; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h */ +#ifdef __mips__ +#if _MIPS_SIM != _MIPS_SIM_ABI64 +struct kernel_statfs64 { + unsigned long f_type; + unsigned long f_bsize; + unsigned long f_frsize; + unsigned long __pad; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_files; + unsigned long long f_ffree; + unsigned long long f_bavail; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_spare[6]; +}; +#endif +#elif !defined(__x86_64__) +struct kernel_statfs64 { + unsigned long f_type; + unsigned long f_bsize; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_bavail; + unsigned long long f_files; + unsigned long long f_ffree; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_frsize; + unsigned long f_spare[5]; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h */ +#ifdef __mips__ +struct kernel_statfs { + long f_type; + long f_bsize; + long f_frsize; + long f_blocks; + long f_bfree; + long f_files; + long f_ffree; + long f_bavail; + struct { int val[2]; } f_fsid; + long f_namelen; + long f_spare[6]; +}; +#else +struct kernel_statfs { + /* x86_64 actually defines all these fields as signed, whereas all other */ + /* platforms define them as unsigned. Leaving them at unsigned should not */ + /* cause any problems. 
*/ + unsigned long f_type; + unsigned long f_bsize; + unsigned long f_blocks; + unsigned long f_bfree; + unsigned long f_bavail; + unsigned long f_files; + unsigned long f_ffree; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_frsize; + unsigned long f_spare[5]; +}; +#endif + +#if defined(__i386__) || defined(__x86_64__) || \ + defined(__arm__) +/* include/linux/aio_abi.h */ +/* Layout depends on big/little endian. */ +struct kernel_iocb { + unsigned long long aio_data; + unsigned int aio_key; + unsigned int aio_reserved; + unsigned short aio_lio_opcode; + short aio_reqprio; + unsigned int aio_filedes; + unsigned long long aio_buf; + unsigned long long aio_nbytes; + unsigned long long aio_offset; + unsigned long long aio_reserved2; + unsigned int aio_flags; + unsigned int aio_resfd; +}; +#elif defined(__PPC__) +struct kernel_iocb { + unsigned long long aio_data; + unsigned int aio_reserved; + unsigned int aio_key; + unsigned short aio_lio_opcode; + short aio_reqprio; + unsigned int aio_fildes; + unsigned long long aio_buf; + unsigned long long aio_nbytes; + unsigned long long aio_offset; + unsigned long long aio_reserved2; + unsigned int aio_flags; + unsigned int aio_resfd; +}; +#endif + +/* include/linux/aio_abi.h */ +struct kernel_io_event { + unsigned long long data; + unsigned long long obj; + long long res; + long long res2; +}; + +/* Definitions missing from the standard header files */ +#ifndef O_DIRECTORY +#if defined(__arm__) || defined(__PPC__) +#define O_DIRECTORY 0040000 +#else +#define O_DIRECTORY 0200000 +#endif +#endif +#ifndef NT_PRXFPREG +#define NT_PRXFPREG 0x46e62b7f +#endif +#ifndef PTRACE_GETFPXREGS +#define PTRACE_GETFPXREGS ((enum __ptrace_request)18) +#endif +#ifndef PR_GET_DUMPABLE +#define PR_GET_DUMPABLE 3 +#endif +#ifndef PR_SET_DUMPABLE +#define PR_SET_DUMPABLE 4 +#endif +#ifndef PR_GET_SECCOMP +#define PR_GET_SECCOMP 21 +#endif +#ifndef PR_SET_SECCOMP +#define PR_SET_SECCOMP 22 +#endif
+#ifndef AT_FDCWD +#define AT_FDCWD (-100) +#endif +#ifndef AT_SYMLINK_NOFOLLOW +#define AT_SYMLINK_NOFOLLOW 0x100 +#endif +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif +#ifndef MREMAP_FIXED +#define MREMAP_FIXED 2 +#endif +#ifndef SA_RESTORER +#define SA_RESTORER 0x04000000 +#endif +#ifndef CPUCLOCK_PROF +#define CPUCLOCK_PROF 0 +#endif +#ifndef CPUCLOCK_VIRT +#define CPUCLOCK_VIRT 1 +#endif +#ifndef CPUCLOCK_SCHED +#define CPUCLOCK_SCHED 2 +#endif +#ifndef CPUCLOCK_PERTHREAD_MASK +#define CPUCLOCK_PERTHREAD_MASK 4 +#endif +#ifndef MAKE_PROCESS_CPUCLOCK +#define MAKE_PROCESS_CPUCLOCK(pid, clock) \ + ((~(int)(pid) << 3) | (int)(clock)) +#endif +#ifndef MAKE_THREAD_CPUCLOCK +#define MAKE_THREAD_CPUCLOCK(tid, clock) \ + ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK)) +#endif + +#ifndef FUTEX_WAIT +#define FUTEX_WAIT 0 +#endif +#ifndef FUTEX_WAKE +#define FUTEX_WAKE 1 +#endif +#ifndef FUTEX_FD +#define FUTEX_FD 2 +#endif +#ifndef FUTEX_REQUEUE +#define FUTEX_REQUEUE 3 +#endif +#ifndef FUTEX_CMP_REQUEUE +#define FUTEX_CMP_REQUEUE 4 +#endif +#ifndef FUTEX_WAKE_OP +#define FUTEX_WAKE_OP 5 +#endif +#ifndef FUTEX_LOCK_PI +#define FUTEX_LOCK_PI 6 +#endif +#ifndef FUTEX_UNLOCK_PI +#define FUTEX_UNLOCK_PI 7 +#endif +#ifndef FUTEX_TRYLOCK_PI +#define FUTEX_TRYLOCK_PI 8 +#endif +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif +#ifndef FUTEX_CMD_MASK +#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG +#endif +#ifndef FUTEX_WAIT_PRIVATE +#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_WAKE_PRIVATE +#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_REQUEUE_PRIVATE +#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_CMP_REQUEUE_PRIVATE +#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_WAKE_OP_PRIVATE +#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG) +#endif +#ifndef 
FUTEX_LOCK_PI_PRIVATE +#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_UNLOCK_PI_PRIVATE +#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_TRYLOCK_PI_PRIVATE +#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) +#endif + + +#if defined(__x86_64__) +#ifndef ARCH_SET_GS +#define ARCH_SET_GS 0x1001 +#endif +#ifndef ARCH_GET_GS +#define ARCH_GET_GS 0x1004 +#endif +#endif + +#if defined(__i386__) +#ifndef __NR_mount +#define __NR_mount 21 +#endif +#ifndef __NR_setgroups32 +#define __NR_setgroups32 81 +#endif +#ifndef __NR_quotactl +#define __NR_quotactl 131 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 164 +#define __NR_getresuid 165 +#define __NR_setresgid 170 +#define __NR_getresgid 171 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn 173 +#define __NR_rt_sigaction 174 +#define __NR_rt_sigprocmask 175 +#define __NR_rt_sigpending 176 +#define __NR_rt_sigsuspend 179 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 180 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 181 +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit 191 +#endif +#ifndef __NR_stat64 +#define __NR_stat64 195 +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 197 +#endif +#ifndef __NR_setresuid32 +#define __NR_setresuid32 208 +#define __NR_getresuid32 209 +#define __NR_setresgid32 210 +#define __NR_getresgid32 211 +#endif +#ifndef __NR_setfsuid32 +#define __NR_setfsuid32 215 +#define __NR_setfsgid32 216 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 220 +#endif +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif +#ifndef __NR_readahead +#define __NR_readahead 225 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 226 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 227 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 229 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 230 +#endif +#ifndef __NR_listxattr +#define 
__NR_listxattr 232 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 233 +#endif +#ifndef __NR_tkill +#define __NR_tkill 238 +#endif +#ifndef __NR_futex +#define __NR_futex 240 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 241 +#define __NR_sched_getaffinity 242 +#endif +#ifndef __NR_io_setup +#define __NR_io_setup 245 +#define __NR_io_destroy 246 +#define __NR_io_getevents 247 +#define __NR_io_submit 248 +#define __NR_io_cancel 249 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 258 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 265 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 266 +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 268 +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 269 +#endif +#ifndef __NR_fadvise64_64 +#define __NR_fadvise64_64 272 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 289 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 290 +#endif +#ifndef __NR_openat +#define __NR_openat 295 +#endif +#ifndef __NR_fstatat64 +#define __NR_fstatat64 300 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 301 +#endif +#ifndef __NR_unshare +#define __NR_unshare 310 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 317 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 318 +#endif +#ifndef __NR_fallocate +#define __NR_fallocate 324 +#endif +#ifndef __NR_preadv +#define __NR_preadv 333 +#endif +#ifndef __NR_pwritev +#define __NR_pwritev 334 +#endif +#ifndef __NR_setns +#define __NR_setns 346 +#endif +/* End of i386 definitions */ +#elif defined(__arm__) +#ifndef __syscall +#if defined(__thumb__) || defined(__ARM_EABI__) +#define __SYS_REG(name) register long __sysreg __asm__("r6") = __NR_##name; +#define __SYS_REG_LIST(regs...) 
[sysreg] "r" (__sysreg) , ##regs +#define __syscall(name) "swi\t0" +#define __syscall_safe(name) \ + "push {r7}\n" \ + "mov r7,%[sysreg]\n" \ + __syscall(name)"\n" \ + "pop {r7}" +#else +#define __SYS_REG(name) +#define __SYS_REG_LIST(regs...) regs +#define __syscall(name) "swi\t" __sys1(__NR_##name) "" +#define __syscall_safe(name) __syscall(name) +#endif +#endif +#ifndef __NR_mount +#define __NR_mount (__NR_SYSCALL_BASE + 21) +#endif +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_SYSCALL_BASE + 164) +#define __NR_getresuid (__NR_SYSCALL_BASE + 165) +#define __NR_setresgid (__NR_SYSCALL_BASE + 170) +#define __NR_getresgid (__NR_SYSCALL_BASE + 171) +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn (__NR_SYSCALL_BASE + 173) +#define __NR_rt_sigaction (__NR_SYSCALL_BASE + 174) +#define __NR_rt_sigprocmask (__NR_SYSCALL_BASE + 175) +#define __NR_rt_sigpending (__NR_SYSCALL_BASE + 176) +#define __NR_rt_sigsuspend (__NR_SYSCALL_BASE + 179) +#endif +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_SYSCALL_BASE + 180) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_SYSCALL_BASE + 181) +#endif +#ifndef __NR_capset +#define __NR_capset (__NR_SYSCALL_BASE + 185) +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit (__NR_SYSCALL_BASE + 191) +#endif +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_SYSCALL_BASE + 195) +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 (__NR_SYSCALL_BASE + 197) +#endif +#ifndef __NR_setgroups32 +#define __NR_setgroups32 (__NR_SYSCALL_BASE + 206) +#endif +#ifndef __NR_setresuid32 +#define __NR_setresuid32 (__NR_SYSCALL_BASE + 208) +#define __NR_getresuid32 (__NR_SYSCALL_BASE + 209) +#define __NR_setresgid32 (__NR_SYSCALL_BASE + 210) +#define __NR_getresgid32 (__NR_SYSCALL_BASE + 211) +#endif +#ifndef __NR_setfsuid32 +#define __NR_setfsuid32 (__NR_SYSCALL_BASE + 215) +#define __NR_setfsgid32 (__NR_SYSCALL_BASE + 216) +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 (__NR_SYSCALL_BASE + 217) +#endif 
+#ifndef __NR_gettid +#define __NR_gettid (__NR_SYSCALL_BASE + 224) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_SYSCALL_BASE + 225) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_SYSCALL_BASE + 226) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_SYSCALL_BASE + 227) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_SYSCALL_BASE + 229) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_SYSCALL_BASE + 230) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_SYSCALL_BASE + 232) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_SYSCALL_BASE + 233) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_SYSCALL_BASE + 238) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_SYSCALL_BASE + 240) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_SYSCALL_BASE + 241) +#define __NR_sched_getaffinity (__NR_SYSCALL_BASE + 242) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_SYSCALL_BASE + 256) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_SYSCALL_BASE + 263) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_SYSCALL_BASE + 264) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_SYSCALL_BASE + 266) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_SYSCALL_BASE + 267) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_SYSCALL_BASE + 314) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_SYSCALL_BASE + 315) +#endif +#ifndef __NR_unshare +#define __NR_unshare (__NR_SYSCALL_BASE + 337) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_SYSCALL_BASE + 344) +#endif +#ifndef __NR_setns +#define __NR_setns (__NR_SYSCALL_BASE + 375) +#endif +/* End of ARM definitions */ +#elif defined(__x86_64__) +#ifndef __NR_pread64 +#define __NR_pread64 17 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 18 +#endif +#ifndef __NR_setresuid +#define 
__NR_setresuid 117 +#define __NR_getresuid 118 +#define __NR_setresgid 119 +#define __NR_getresgid 120 +#endif +#ifndef __NR_mount +#define __NR_mount 165 +#endif +#ifndef __NR_quotactl +#define __NR_quotactl 179 +#endif +#ifndef __NR_gettid +#define __NR_gettid 186 +#endif +#ifndef __NR_readahead +#define __NR_readahead 187 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 188 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 189 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 191 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 192 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 194 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 195 +#endif +#ifndef __NR_tkill +#define __NR_tkill 200 +#endif +#ifndef __NR_futex +#define __NR_futex 202 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#define __NR_sched_getaffinity 204 +#endif +#ifndef __NR_io_setup +#define __NR_io_setup 206 +#define __NR_io_destroy 207 +#define __NR_io_getevents 208 +#define __NR_io_submit 209 +#define __NR_io_cancel 210 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 217 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 218 +#endif +#ifndef __NR_fadvise64 +#define __NR_fadvise64 221 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 228 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 229 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 251 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 252 +#endif +#ifndef __NR_openat +#define __NR_openat 257 +#endif +#ifndef __NR_newfstatat +#define __NR_newfstatat 262 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 263 +#endif +#ifndef __NR_unshare +#define __NR_unshare 272 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 279 +#endif +#ifndef __NR_fallocate +#define __NR_fallocate 285 +#endif +#ifndef __NR_preadv +#define __NR_preadv 295 +#endif +#ifndef __NR_pwritev +#define __NR_pwritev 296 +#endif 
+#ifndef __NR_setns +#define __NR_setns 308 +#endif +/* End of x86-64 definitions */ +#elif defined(__mips__) +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#ifndef __NR_mount +#define __NR_mount (__NR_Linux + 21) +#endif +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 185) +#define __NR_getresuid (__NR_Linux + 186) +#define __NR_setresgid (__NR_Linux + 190) +#define __NR_getresgid (__NR_Linux + 191) +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn (__NR_Linux + 193) +#define __NR_rt_sigaction (__NR_Linux + 194) +#define __NR_rt_sigprocmask (__NR_Linux + 195) +#define __NR_rt_sigpending (__NR_Linux + 196) +#define __NR_rt_sigsuspend (__NR_Linux + 199) +#endif +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_Linux + 200) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_Linux + 201) +#endif +#ifndef __NR_capset +#define __NR_capset (__NR_Linux + 205) +#endif +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_Linux + 213) +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 (__NR_Linux + 215) +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 (__NR_Linux + 219) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 222) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 223) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 224) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 225) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 227) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 228) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 230) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 231) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 236) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 238) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 239) +#define __NR_sched_getaffinity (__NR_Linux + 240) +#endif +#ifndef 
__NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 252) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_Linux + 255) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_Linux + 256) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 263) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 264) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 288) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 293) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 294) +#endif +#ifndef __NR_unshare +#define __NR_unshare (__NR_Linux + 303) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 308) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 312) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 314) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 315) +#endif +#ifndef __NR_setns +#define __NR_setns (__NR_Linux + 344) +#endif +/* End of MIPS (old 32bit API) definitions */ +#elif _MIPS_SIM == _MIPS_SIM_ABI64 +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_Linux + 16) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_Linux + 17) +#endif +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 115) +#define __NR_getresuid (__NR_Linux + 116) +#define __NR_setresgid (__NR_Linux + 117) +#define __NR_getresgid (__NR_Linux + 118) +#endif +#ifndef __NR_capset +#define __NR_capset (__NR_Linux + 124) +#endif +#ifndef __NR_mount +#define __NR_mount (__NR_Linux + 160) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 178) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 179) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 180) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 181) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 183) +#endif +#ifndef __NR_lgetxattr 
+#define __NR_lgetxattr (__NR_Linux + 184) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 186) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 187) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 192) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 194) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 195) +#define __NR_sched_getaffinity (__NR_Linux + 196) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 212) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 222) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 223) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 247) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 252) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 253) +#endif +#ifndef __NR_unshare +#define __NR_unshare (__NR_Linux + 262) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 267) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 271) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 273) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 274) +#endif +#ifndef __NR_setns +#define __NR_setns (__NR_Linux + 303) +#endif +/* End of MIPS (64bit API) definitions */ +#else +#ifndef __NR_mount +#define __NR_mount (__NR_Linux + 160) +#endif +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 115) +#define __NR_getresuid (__NR_Linux + 116) +#define __NR_setresgid (__NR_Linux + 117) +#define __NR_getresgid (__NR_Linux + 118) +#endif +#ifndef __NR_capset +#define __NR_capset (__NR_Linux + 124) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 178) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 179) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 180) +#endif +#ifndef 
__NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 181) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 183) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 184) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 186) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 187) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 192) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 194) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 195) +#define __NR_sched_getaffinity (__NR_Linux + 196) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 213) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_Linux + 217) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_Linux + 218) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 226) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 227) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 251) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 256) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 257) +#endif +#ifndef __NR_unshare +#define __NR_unshare (__NR_Linux + 266) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 271) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 275) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 277) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 278) +#endif +#ifndef __NR_setns +#define __NR_setns (__NR_Linux + 308) +#endif +/* End of MIPS (new 32bit API) definitions */ +#endif +/* End of MIPS definitions */ +#elif defined(__PPC__) +#ifndef __NR_mount +#define __NR_mount 21 +#endif +#ifndef __NR_setfsuid +#define __NR_setfsuid 138 +#define __NR_setfsgid 139 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 164 +#define 
__NR_getresuid 165 +#define __NR_setresgid 169 +#define __NR_getresgid 170 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn 172 +#define __NR_rt_sigaction 173 +#define __NR_rt_sigprocmask 174 +#define __NR_rt_sigpending 175 +#define __NR_rt_sigsuspend 178 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 179 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 180 +#endif +#ifndef __NR_capset +#define __NR_capset 184 +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit 190 +#endif +#ifndef __NR_readahead +#define __NR_readahead 191 +#endif +#ifndef __PPC64__ +#ifndef __NR_stat64 +#define __NR_stat64 195 +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 197 +#endif +#endif /* !defined(__PPC64__) */ +#ifndef __NR_getdents64 +#define __NR_getdents64 202 +#endif +#ifndef __NR_gettid +#define __NR_gettid 207 +#endif +#ifndef __NR_tkill +#define __NR_tkill 208 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 209 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 210 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 212 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 213 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 215 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 216 +#endif +#ifndef __NR_futex +#define __NR_futex 221 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 222 +#define __NR_sched_getaffinity 223 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 232 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 246 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 247 +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 252 +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 253 +#endif +#ifndef __PPC64__ +#ifndef __NR_fadvise64_64 +#define __NR_fadvise64_64 254 +#endif +#endif /* !defined(__PPC64__) */ +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 273 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 274 +#endif +#ifndef 
__NR_unshare +#define __NR_unshare 282 +#endif +#ifndef __NR_openat +#define __NR_openat 286 +#endif +#ifndef __PPC64__ +#ifndef __NR_fstatat64 +#define __NR_fstatat64 291 +#endif +#endif /* !defined(__PPC64__) */ +#ifndef __NR_unlinkat +#define __NR_unlinkat 292 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 301 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 302 +#endif +#ifndef __NR_setns +#define __NR_setns 350 +#endif +/* End of powerpc defininitions */ +#endif + + +/* After forking, we must make sure to only call system calls. */ +#if __BOUNDED_POINTERS__ + #error "Need to port invocations of syscalls for bounded ptrs" +#else + /* The core dumper and the thread lister get executed after threads + * have been suspended. As a consequence, we cannot call any functions + * that acquire locks. Unfortunately, libc wraps most system calls + * (e.g. in order to implement pthread_atfork, and to make calls + * cancellable), which means we cannot call these functions. Instead, + * we have to call syscall() directly. + */ + #undef LSS_ERRNO + #ifdef SYS_ERRNO + /* Allow the including file to override the location of errno. This can + * be useful when using clone() with the CLONE_VM option. + */ + #define LSS_ERRNO SYS_ERRNO + #else + #define LSS_ERRNO errno + #endif + + #undef LSS_INLINE + #ifdef SYS_INLINE + #define LSS_INLINE SYS_INLINE + #else + #define LSS_INLINE static inline + #endif + + /* Allow the including file to override the prefix used for all new + * system calls. By default, it will be set to "sys_". 
+ */ + #undef LSS_NAME + #ifndef SYS_PREFIX + #define LSS_NAME(name) sys_##name + #elif SYS_PREFIX < 0 + #define LSS_NAME(name) name + #elif SYS_PREFIX == 0 + #define LSS_NAME(name) sys0_##name + #elif SYS_PREFIX == 1 + #define LSS_NAME(name) sys1_##name + #elif SYS_PREFIX == 2 + #define LSS_NAME(name) sys2_##name + #elif SYS_PREFIX == 3 + #define LSS_NAME(name) sys3_##name + #elif SYS_PREFIX == 4 + #define LSS_NAME(name) sys4_##name + #elif SYS_PREFIX == 5 + #define LSS_NAME(name) sys5_##name + #elif SYS_PREFIX == 6 + #define LSS_NAME(name) sys6_##name + #elif SYS_PREFIX == 7 + #define LSS_NAME(name) sys7_##name + #elif SYS_PREFIX == 8 + #define LSS_NAME(name) sys8_##name + #elif SYS_PREFIX == 9 + #define LSS_NAME(name) sys9_##name + #endif + + #undef LSS_RETURN + #if (defined(__i386__) || defined(__x86_64__) || defined(__arm__)) + /* Failing system calls return a negative result in the range of + * -1..-4095. These are "errno" values with the sign inverted. + */ + #define LSS_RETURN(type, res) \ + do { \ + if ((unsigned long)(res) >= (unsigned long)(-4095)) { \ + LSS_ERRNO = -(res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #elif defined(__mips__) + /* On MIPS, failing system calls return -1, and set errno in a + * separate CPU register. + */ + #define LSS_RETURN(type, res, err) \ + do { \ + if (err) { \ + LSS_ERRNO = (res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #elif defined(__PPC__) + /* On PPC, failing system calls return -1, and set errno in a + * separate CPU register. See linux/unistd.h. + */ + #define LSS_RETURN(type, res, err) \ + do { \ + if (err & 0x10000000 ) { \ + LSS_ERRNO = (res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #endif + #if defined(__i386__) + #if defined(NO_FRAME_POINTER) && (100 * __GNUC__ + __GNUC_MINOR__ >= 404) + /* This only works for GCC-4.4 and above -- the first version to use + .cfi directives for dwarf unwind info. 
*/ + #define CFI_ADJUST_CFA_OFFSET(adjust) \ + ".cfi_adjust_cfa_offset " #adjust "\n" + #else + #define CFI_ADJUST_CFA_OFFSET(adjust) /**/ + #endif + + /* In PIC mode (e.g. when building shared libraries), gcc for i386 + * reserves ebx. Unfortunately, most distribution ship with implementations + * of _syscallX() which clobber ebx. + * Also, most definitions of _syscallX() neglect to mark "memory" as being + * clobbered. This causes problems with compilers, that do a better job + * at optimizing across __asm__ calls. + * So, we just have to redefine all of the _syscallX() macros. + */ + #undef LSS_BODY + #define LSS_BODY(type,args...) \ + long __res; \ + __asm__ __volatile__("push %%ebx\n" \ + CFI_ADJUST_CFA_OFFSET(4) \ + "movl %2,%%ebx\n" \ + "int $0x80\n" \ + "pop %%ebx\n" \ + CFI_ADJUST_CFA_OFFSET(-4) \ + args \ + : "esp", "memory"); \ + LSS_RETURN(type,__res) + #undef _syscall0 + #define _syscall0(type,name) \ + type LSS_NAME(name)(void) { \ + long __res; \ + __asm__ volatile("int $0x80" \ + : "=a" (__res) \ + : "0" (__NR_##name) \ + : "memory"); \ + LSS_RETURN(type,__res); \ + } + #undef _syscall1 + #define _syscall1(type,name,type1,arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1))); \ + } + #undef _syscall2 + #define _syscall2(type,name,type1,arg1,type2,arg2) \ + type LSS_NAME(name)(type1 arg1,type2 arg2) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name),"ri" ((long)(arg1)), "c" ((long)(arg2))); \ + } + #undef _syscall3 + #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ + type LSS_NAME(name)(type1 arg1,type2 arg2,type3 arg3) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \ + "d" ((long)(arg3))); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_BODY(type, \ + : "=a" 
(__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \ + "d" ((long)(arg3)),"S" ((long)(arg4))); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + long __res; \ + __asm__ __volatile__("push %%ebx\n" \ + "movl %2,%%ebx\n" \ + "movl %1,%%eax\n" \ + "int $0x80\n" \ + "pop %%ebx" \ + : "=a" (__res) \ + : "i" (__NR_##name), "ri" ((long)(arg1)), \ + "c" ((long)(arg2)), "d" ((long)(arg3)), \ + "S" ((long)(arg4)), "D" ((long)(arg5)) \ + : "esp", "memory"); \ + LSS_RETURN(type,__res); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + long __res; \ + struct { long __a1; long __a6; } __s = { (long)arg1, (long) arg6 }; \ + __asm__ __volatile__("push %%ebp\n" \ + "push %%ebx\n" \ + "movl 4(%2),%%ebp\n" \ + "movl 0(%2), %%ebx\n" \ + "movl %1,%%eax\n" \ + "int $0x80\n" \ + "pop %%ebx\n" \ + "pop %%ebp" \ + : "=a" (__res) \ + : "i" (__NR_##name), "0" ((long)(&__s)), \ + "c" ((long)(arg2)), "d" ((long)(arg3)), \ + "S" ((long)(arg4)), "D" ((long)(arg5)) \ + : "esp", "memory"); \ + LSS_RETURN(type,__res); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + __asm__ __volatile__(/* if (fn == NULL) + * return -EINVAL; + */ + "movl %3,%%ecx\n" + "jecxz 1f\n" + + /* if (child_stack == NULL) + * return -EINVAL; + */ + "movl %4,%%ecx\n" + "jecxz 1f\n" + + /* Set up alignment of the child stack: + * child_stack = (child_stack & ~0xF) - 20; + */ + "andl $-16,%%ecx\n" + "subl $20,%%ecx\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. 
+ */ + "movl %6,%%eax\n" + "movl %%eax,4(%%ecx)\n" + "movl %3,%%eax\n" + "movl %%eax,(%%ecx)\n" + + /* %eax = syscall(%eax = __NR_clone, + * %ebx = flags, + * %ecx = child_stack, + * %edx = parent_tidptr, + * %esi = newtls, + * %edi = child_tidptr) + * Also, make sure that %ebx gets preserved as it is + * used in PIC mode. + */ + "movl %8,%%esi\n" + "movl %7,%%edx\n" + "movl %5,%%eax\n" + "movl %9,%%edi\n" + "pushl %%ebx\n" + "movl %%eax,%%ebx\n" + "movl %2,%%eax\n" + "int $0x80\n" + + /* In the parent: restore %ebx + * In the child: move "fn" into %ebx + */ + "popl %%ebx\n" + + /* if (%eax != 0) + * return %eax; + */ + "test %%eax,%%eax\n" + "jnz 1f\n" + + /* In the child, now. Terminate frame pointer chain. + */ + "movl $0,%%ebp\n" + + /* Call "fn". "arg" is already on the stack. + */ + "call *%%ebx\n" + + /* Call _exit(%ebx). Unfortunately older versions + * of gcc restrict the number of arguments that can + * be passed to asm(). So, we need to hard-code the + * system call number. + */ + "movl %%eax,%%ebx\n" + "movl $1,%%eax\n" + "int $0x80\n" + + /* Return to parent. 
+ */ + "1:\n" + : "=a" (__res) + : "0"(-EINVAL), "i"(__NR_clone), + "m"(fn), "m"(child_stack), "m"(flags), "m"(arg), + "m"(parent_tidptr), "m"(newtls), "m"(child_tidptr) + : "esp", "memory", "ecx", "edx", "esi", "edi"); + LSS_RETURN(int, __res); + } + + #define __NR__fadvise64_64 __NR_fadvise64_64 + LSS_INLINE _syscall6(int, _fadvise64_64, int, fd, + unsigned, offset_lo, unsigned, offset_hi, + unsigned, len_lo, unsigned, len_hi, + int, advice) + + LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset, + loff_t len, int advice) { + return LSS_NAME(_fadvise64_64)(fd, + (unsigned)offset, (unsigned)(offset >>32), + (unsigned)len, (unsigned)(len >> 32), + advice); + } + + #define __NR__fallocate __NR_fallocate + LSS_INLINE _syscall6(int, _fallocate, int, fd, + int, mode, + unsigned, offset_lo, unsigned, offset_hi, + unsigned, len_lo, unsigned, len_hi) + + LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode, + loff_t offset, loff_t len) { + union { loff_t off; unsigned w[2]; } o = { offset }, l = { len }; + return LSS_NAME(_fallocate)(fd, mode, o.w[0], o.w[1], l.w[0], l.w[1]); + } + + LSS_INLINE _syscall1(int, set_thread_area, void *, u) + LSS_INLINE _syscall1(int, get_thread_area, void *, u) + + LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_rt_sigreturn)); + return res; + } + LSS_INLINE void (*LSS_NAME(restore)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. 
Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:pop %%eax\n" + "movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_sigreturn)); + return res; + } + #elif defined(__x86_64__) + /* There are no known problems with any of the _syscallX() macros + * currently shipping for x86_64, but we still need to be able to define + * our own version so that we can override the location of the errno + * location (e.g. when using the clone() system call with the CLONE_VM + * option). + */ + #undef LSS_BODY + #define LSS_BODY(type,name, ...) \ + long __res; \ + __asm__ __volatile__("syscall" : "=a" (__res) : "0" (__NR_##name), \ + ##__VA_ARGS__ : "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res) + #undef _syscall0 + #define _syscall0(type,name) \ + type LSS_NAME(name)() { \ + LSS_BODY(type, name); \ + } + #undef _syscall1 + #define _syscall1(type,name,type1,arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(type, name, "D" ((long)(arg1))); \ + } + #undef _syscall2 + #define _syscall2(type,name,type1,arg1,type2,arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2))); \ + } + #undef _syscall3 + #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)), \ + "d" ((long)(arg3))); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + long __res; \ + __asm__ __volatile__("movq %5,%%r10; syscall" : \ + "=a" (__res) : "0" (__NR_##name), \ 
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ + "r" ((long)(arg4)) : "r10", "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + long __res; \ + __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; syscall" : \ + "=a" (__res) : "0" (__NR_##name), \ + "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ + "r" ((long)(arg4)), "r" ((long)(arg5)) : \ + "r8", "r10", "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + long __res; \ + __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; movq %7,%%r9;" \ + "syscall" : \ + "=a" (__res) : "0" (__NR_##name), \ + "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ + "r" ((long)(arg4)), "r" ((long)(arg5)), "r" ((long)(arg6)) : \ + "r8", "r9", "r10", "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + { + __asm__ __volatile__(/* if (fn == NULL) + * return -EINVAL; + */ + "testq %4,%4\n" + "jz 1f\n" + + /* if (child_stack == NULL) + * return -EINVAL; + */ + "testq %5,%5\n" + "jz 1f\n" + + /* Set up alignment of the child stack: + * child_stack = (child_stack & ~0xF) - 16; + */ + "andq $-16,%5\n" + "subq $16,%5\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. 
+ */ + "movq %7,8(%5)\n" + "movq %4,0(%5)\n" + + /* %rax = syscall(%rax = __NR_clone, + * %rdi = flags, + * %rsi = child_stack, + * %rdx = parent_tidptr, + * %r8 = new_tls, + * %r10 = child_tidptr) + */ + "movq %2,%%rax\n" + "movq %9,%%r8\n" + "movq %10,%%r10\n" + "syscall\n" + + /* if (%rax != 0) + * return; + */ + "testq %%rax,%%rax\n" + "jnz 1f\n" + + /* In the child. Terminate frame pointer chain. + */ + "xorq %%rbp,%%rbp\n" + + /* Call "fn(arg)". + */ + "popq %%rax\n" + "popq %%rdi\n" + "call *%%rax\n" + + /* Call _exit(%ebx). + */ + "movq %%rax,%%rdi\n" + "movq %3,%%rax\n" + "syscall\n" + + /* Return to parent. + */ + "1:\n" + : "=a" (__res) + : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit), + "r"(fn), "S"(child_stack), "D"(flags), "r"(arg), + "d"(parent_tidptr), "g"(newtls), "g"(child_tidptr) + : "rsp", "memory", "r8", "r10", "r11", "rcx"); + } + LSS_RETURN(int, __res); + } + LSS_INLINE _syscall2(int, arch_prctl, int, c, void *, a) + LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len, + int, advice) + + LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) { + /* On x86-64, the kernel does not know how to return from + * a signal handler. Instead, it relies on user space to provide a + * restorer function that calls the rt_sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:movq %1,%%rax\n" + "syscall\n" + "2:popq %0\n" + "addq $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_rt_sigreturn)); + return res; + } + #elif defined(__arm__) + /* Most definitions of _syscallX() neglect to mark "memory" as being + * clobbered. This causes problems with compilers, that do a better job + * at optimizing across __asm__ calls. + * So, we just have to redefine all fo the _syscallX() macros. 
+ */ + #undef LSS_REG + #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a + + /* r0..r3 are scratch registers and not preserved across function + * calls. We need to first evaluate the first 4 syscall arguments + * and store them on stack. They must be loaded into r0..r3 after + * all function calls to avoid r0..r3 being clobbered. + */ + #undef LSS_SAVE_ARG + #define LSS_SAVE_ARG(r,a) long __tmp##r = (long)a + #undef LSS_LOAD_ARG + #define LSS_LOAD_ARG(r) register long __r##r __asm__("r"#r) = __tmp##r + + #undef LSS_BODY + #define LSS_BODY(type, name, args...) \ + register long __res_r0 __asm__("r0"); \ + long __res; \ + __SYS_REG(name) \ + __asm__ __volatile__ (__syscall_safe(name) \ + : "=r"(__res_r0) \ + : __SYS_REG_LIST(args) \ + : "lr", "memory"); \ + __res = __res_r0; \ + LSS_RETURN(type, __res) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)() { \ + LSS_BODY(type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + /* There is no need for using a volatile temp. 
*/ \ + LSS_REG(0, arg1); \ + LSS_BODY(type, name, "r"(__r0)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_SAVE_ARG(0, arg1); \ + LSS_SAVE_ARG(1, arg2); \ + LSS_LOAD_ARG(0); \ + LSS_LOAD_ARG(1); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_SAVE_ARG(0, arg1); \ + LSS_SAVE_ARG(1, arg2); \ + LSS_SAVE_ARG(2, arg3); \ + LSS_LOAD_ARG(0); \ + LSS_LOAD_ARG(1); \ + LSS_LOAD_ARG(2); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2)); \ + } + #undef _syscall4 + #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_SAVE_ARG(0, arg1); \ + LSS_SAVE_ARG(1, arg2); \ + LSS_SAVE_ARG(2, arg3); \ + LSS_SAVE_ARG(3, arg4); \ + LSS_LOAD_ARG(0); \ + LSS_LOAD_ARG(1); \ + LSS_LOAD_ARG(2); \ + LSS_LOAD_ARG(3); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3)); \ + } + #undef _syscall5 + #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_SAVE_ARG(0, arg1); \ + LSS_SAVE_ARG(1, arg2); \ + LSS_SAVE_ARG(2, arg3); \ + LSS_SAVE_ARG(3, arg4); \ + LSS_REG(4, arg5); \ + LSS_LOAD_ARG(0); \ + LSS_LOAD_ARG(1); \ + LSS_LOAD_ARG(2); \ + LSS_LOAD_ARG(3); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \ + "r"(__r4)); \ + } + #undef _syscall6 + #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_SAVE_ARG(0, arg1); \ + LSS_SAVE_ARG(1, arg2); \ + LSS_SAVE_ARG(2, arg3); \ + 
LSS_SAVE_ARG(3, arg4); \ + LSS_REG(4, arg5); \ + LSS_REG(5, arg6); \ + LSS_LOAD_ARG(0); \ + LSS_LOAD_ARG(1); \ + LSS_LOAD_ARG(2); \ + LSS_LOAD_ARG(3); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \ + "r"(__r4), "r"(__r5)); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + register long __res __asm__("r5"); + { + if (fn == NULL || child_stack == NULL) { + __res = -EINVAL; + goto clone_exit; + } + + /* stash first 4 arguments on stack first because we can only load + * them after all function calls. + */ + int tmp_flags = flags; + int * tmp_stack = (int*) child_stack; + void * tmp_ptid = parent_tidptr; + void * tmp_tls = newtls; + + register int *__ctid __asm__("r4") = child_tidptr; + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + *(--tmp_stack) = (int) arg; + *(--tmp_stack) = (int) fn; + + /* We must load r0..r3 last after all possible function calls. */ + register int __flags __asm__("r0") = tmp_flags; + register void *__stack __asm__("r1") = tmp_stack; + register void *__ptid __asm__("r2") = tmp_ptid; + register void *__tls __asm__("r3") = tmp_tls; + + /* %r0 = syscall(%r0 = flags, + * %r1 = child_stack, + * %r2 = parent_tidptr, + * %r3 = newtls, + * %r4 = child_tidptr) + */ + __SYS_REG(clone) + __asm__ __volatile__(/* %r0 = syscall(%r0 = flags, + * %r1 = child_stack, + * %r2 = parent_tidptr, + * %r3 = newtls, + * %r4 = child_tidptr) + */ + "push {r7}\n" + "mov r7,%1\n" + __syscall(clone)"\n" + + /* if (%r0 != 0) + * return %r0; + */ + "movs %0,r0\n" + "bne 1f\n" + + /* In the child, now. Call "fn(arg)". + */ + "ldr r0,[sp, #4]\n" + "mov lr,pc\n" + "ldr pc,[sp]\n" + + /* Call _exit(%r0), which never returns. We only + * need to set r7 for EABI syscall ABI but we do + * this always to simplify code sharing between + * old and new syscall ABIs. 
+ */ + "mov r7,%2\n" + __syscall(exit)"\n" + + /* Pop r7 from the stack only in the parent. + */ + "1: pop {r7}\n" + : "=r" (__res) + : "r"(__sysreg), + "i"(__NR_exit), "r"(__stack), "r"(__flags), + "r"(__ptid), "r"(__tls), "r"(__ctid) + : "cc", "lr", "memory"); + } + clone_exit: + LSS_RETURN(int, __res); + } + #elif defined(__mips__) + #undef LSS_REG + #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) = \ + (unsigned long)(a) + #undef LSS_BODY + #define LSS_BODY(type,name,r7,...) \ + register unsigned long __v0 __asm__("$2") = __NR_##name; \ + __asm__ __volatile__ ("syscall\n" \ + : "=&r"(__v0), r7 (__r7) \ + : "0"(__v0), ##__VA_ARGS__ \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)() { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_BODY(type, name, "=r"); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_BODY(type, name, "=r", "r"(__r4)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_REG(5, arg2); \ + LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5), "r"(__r6)); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, 
arg4); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6)); \ + } + #undef _syscall5 + #if _MIPS_SIM == _MIPS_SIM_ABI32 + /* The old 32bit MIPS system call API passes the fifth and sixth argument + * on the stack, whereas the new APIs use registers "r8" and "r9". + */ + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + register unsigned long __v0 __asm__("$2"); \ + __asm__ __volatile__ (".set noreorder\n" \ + "lw $2, %6\n" \ + "subu $29, 32\n" \ + "sw $2, 16($29)\n" \ + "li $2, %2\n" \ + "syscall\n" \ + "addiu $29, 32\n" \ + ".set reorder\n" \ + : "=&r"(__v0), "+r" (__r7) \ + : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ + "r"(__r6), "m" ((unsigned long)arg5) \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7); \ + } + #else + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); LSS_REG(8, arg5); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \ + "r"(__r8)); \ + } + #endif + #undef _syscall6 + #if _MIPS_SIM == _MIPS_SIM_ABI32 + /* The old 32bit MIPS system call API passes the fifth and sixth argument + * on the stack, whereas the new APIs use registers "r8" and "r9". 
+ */ + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + register unsigned long __v0 __asm__("$2"); \ + __asm__ __volatile__ (".set noreorder\n" \ + "lw $2, %6\n" \ + "lw $8, %7\n" \ + "subu $29, 32\n" \ + "sw $2, 16($29)\n" \ + "sw $8, 20($29)\n" \ + "li $2, %2\n" \ + "syscall\n" \ + "addiu $29, 32\n" \ + ".set reorder\n" \ + : "=&r"(__v0), "+r" (__r7) \ + : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ + "r"(__r6), "r" ((unsigned long)arg5), \ + "r" ((unsigned long)arg6) \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7); \ + } + #else + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5,type6 arg6) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); LSS_REG(8, arg5); LSS_REG(9, arg6); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \ + "r"(__r8), "r"(__r9)); \ + } + #endif + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + register unsigned long __v0 __asm__("$2"); + register unsigned long __r7 __asm__("$7") = (unsigned long)newtls; + { + register int __flags __asm__("$4") = flags; + register void *__stack __asm__("$5") = child_stack; + register void *__ptid __asm__("$6") = parent_tidptr; + register int *__ctid __asm__("$8") = child_tidptr; + __asm__ __volatile__( + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "subu $29,24\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "sub $29,16\n" + #else + "dsubu $29,16\n" + #endif + + /* if (fn == NULL || child_stack == NULL) + * return -EINVAL; 
+ */ + "li %0,%2\n" + "beqz %5,1f\n" + "beqz %6,1f\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "subu %6,32\n" + "sw %5,0(%6)\n" + "sw %8,4(%6)\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "sub %6,32\n" + "sw %5,0(%6)\n" + "sw %8,8(%6)\n" + #else + "dsubu %6,32\n" + "sd %5,0(%6)\n" + "sd %8,8(%6)\n" + #endif + + /* $7 = syscall($4 = flags, + * $5 = child_stack, + * $6 = parent_tidptr, + * $7 = newtls, + * $8 = child_tidptr) + */ + "li $2,%3\n" + "syscall\n" + + /* if ($7 != 0) + * return $2; + */ + "bnez $7,1f\n" + "bnez $2,1f\n" + + /* In the child, now. Call "fn(arg)". + */ + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "lw $25,0($29)\n" + "lw $4,4($29)\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "lw $25,0($29)\n" + "lw $4,8($29)\n" + #else + "ld $25,0($29)\n" + "ld $4,8($29)\n" + #endif + "jalr $25\n" + + /* Call _exit($2) + */ + "move $4,$2\n" + "li $2,%4\n" + "syscall\n" + + "1:\n" + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "addu $29, 24\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "add $29, 16\n" + #else + "daddu $29,16\n" + #endif + : "=&r" (__v0), "=r" (__r7) + : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit), + "r"(fn), "r"(__stack), "r"(__flags), "r"(arg), + "r"(__ptid), "r"(__r7), "r"(__ctid) + : "$9", "$10", "$11", "$12", "$13", "$14", "$15", + "$24", "memory"); + } + LSS_RETURN(int, __v0, __r7); + } + #elif defined (__PPC__) + #undef LSS_LOADARGS_0 + #define LSS_LOADARGS_0(name, dummy...) 
\ + __sc_0 = __NR_##name + #undef LSS_LOADARGS_1 + #define LSS_LOADARGS_1(name, arg1) \ + LSS_LOADARGS_0(name); \ + __sc_3 = (unsigned long) (arg1) + #undef LSS_LOADARGS_2 + #define LSS_LOADARGS_2(name, arg1, arg2) \ + LSS_LOADARGS_1(name, arg1); \ + __sc_4 = (unsigned long) (arg2) + #undef LSS_LOADARGS_3 + #define LSS_LOADARGS_3(name, arg1, arg2, arg3) \ + LSS_LOADARGS_2(name, arg1, arg2); \ + __sc_5 = (unsigned long) (arg3) + #undef LSS_LOADARGS_4 + #define LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4) \ + LSS_LOADARGS_3(name, arg1, arg2, arg3); \ + __sc_6 = (unsigned long) (arg4) + #undef LSS_LOADARGS_5 + #define LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5) \ + LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4); \ + __sc_7 = (unsigned long) (arg5) + #undef LSS_LOADARGS_6 + #define LSS_LOADARGS_6(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5); \ + __sc_8 = (unsigned long) (arg6) + #undef LSS_ASMINPUT_0 + #define LSS_ASMINPUT_0 "0" (__sc_0) + #undef LSS_ASMINPUT_1 + #define LSS_ASMINPUT_1 LSS_ASMINPUT_0, "1" (__sc_3) + #undef LSS_ASMINPUT_2 + #define LSS_ASMINPUT_2 LSS_ASMINPUT_1, "2" (__sc_4) + #undef LSS_ASMINPUT_3 + #define LSS_ASMINPUT_3 LSS_ASMINPUT_2, "3" (__sc_5) + #undef LSS_ASMINPUT_4 + #define LSS_ASMINPUT_4 LSS_ASMINPUT_3, "4" (__sc_6) + #undef LSS_ASMINPUT_5 + #define LSS_ASMINPUT_5 LSS_ASMINPUT_4, "5" (__sc_7) + #undef LSS_ASMINPUT_6 + #define LSS_ASMINPUT_6 LSS_ASMINPUT_5, "6" (__sc_8) + #undef LSS_BODY + #define LSS_BODY(nr, type, name, args...) 
\ + long __sc_ret, __sc_err; \ + { \ + register unsigned long __sc_0 __asm__ ("r0"); \ + register unsigned long __sc_3 __asm__ ("r3"); \ + register unsigned long __sc_4 __asm__ ("r4"); \ + register unsigned long __sc_5 __asm__ ("r5"); \ + register unsigned long __sc_6 __asm__ ("r6"); \ + register unsigned long __sc_7 __asm__ ("r7"); \ + register unsigned long __sc_8 __asm__ ("r8"); \ + \ + LSS_LOADARGS_##nr(name, args); \ + __asm__ __volatile__ \ + ("sc\n\t" \ + "mfcr %0" \ + : "=&r" (__sc_0), \ + "=&r" (__sc_3), "=&r" (__sc_4), \ + "=&r" (__sc_5), "=&r" (__sc_6), \ + "=&r" (__sc_7), "=&r" (__sc_8) \ + : LSS_ASMINPUT_##nr \ + : "cr0", "ctr", "memory", \ + "r9", "r10", "r11", "r12"); \ + __sc_ret = __sc_3; \ + __sc_err = __sc_0; \ + } \ + LSS_RETURN(type, __sc_ret, __sc_err) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)(void) { \ + LSS_BODY(0, type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(1, type, name, arg1); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_BODY(2, type, name, arg1, arg2); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_BODY(3, type, name, arg1, arg2, arg3); \ + } + #undef _syscall4 + #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_BODY(4, type, name, arg1, arg2, arg3, arg4); \ + } + #undef _syscall5 + #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_BODY(5, type, name, arg1, arg2, arg3, arg4, arg5); \ + } + #undef _syscall6 + #define _syscall6(type, name, type1, 
arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \ + } + + #undef LSS_PPC_MINIMUM_FRAME_SIZE + #undef LSS_SIZE_S + #ifdef __PPC64__ + #define LSS_PPC_MINIMUM_FRAME_SIZE 112 + #define LSS_SIZE_S "d" + #else + #define LSS_PPC_MINIMUM_FRAME_SIZE 16 + #define LSS_SIZE_S "w" + #endif + + /* clone function adapted from glibc 2.3.6 clone.S */ + /* TODO(user): consider wrapping some args up in a struct, like we + * do for i386's _syscall6, so we can compile successfully on gcc 2.95 + */ + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __ret, __err; + { + register int (*__fn)(void *) __asm__ ("r8") = fn; + register void *__cstack __asm__ ("r4") = child_stack; + register int __flags __asm__ ("r3") = flags; + register void * __arg __asm__ ("r9") = arg; + register int * __ptidptr __asm__ ("r5") = parent_tidptr; + register void * __newtls __asm__ ("r6") = newtls; + register int * __ctidptr __asm__ ("r7") = child_tidptr; + __asm__ __volatile__( + /* check for fn == NULL + * and child_stack == NULL + */ + "cmp" LSS_SIZE_S "i cr0, %6, 0\n\t" + "cmp" LSS_SIZE_S "i cr1, %7, 0\n\t" + "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t" + "beq- cr0, 1f\n\t" + + /* set up stack frame for child */ + "clrr" LSS_SIZE_S "i %7, %7, 4\n\t" + "li 0, 0\n\t" + "st" LSS_SIZE_S "u 0, %13(%7)\n\t" + + /* fn, arg, child_stack are saved across the syscall: r27-29 */ + "mr 28, %6\n\t" + "mr 29, %7\n\t" + "mr 27, %9\n\t" + + /* syscall */ + "li 0, %4\n\t" + /* flags already in r3 + * child_stack already in r4 + * ptidptr already in r5 + * newtls already in r6 + * ctidptr already in r7 + */ + "sc\n\t" + + /* Test if syscall was successful */ + "cmp" LSS_SIZE_S "i cr1, 3, 0\n\t" + "crandc cr1*4+eq, cr1*4+eq, 
cr0*4+so\n\t" + "bne- cr1, 1f\n\t" + + /* Do the function call. On PowerPC64, a function pointer points + * a function descriptor instead of the first instruction. We need + * to load the callee's entry point and TOC from the descriptor. + * Since the callee may have a differet TOC, we also need to + * save and restore caller's TOC around the call. + */ + + #ifdef __PPC64__ + "std 2, 40(1)\n\t" /* Save caller's TOC. */ + "ld 4, 0(28)\n\t" /* Get callee's entry address. */ + "ld 2, 8(28)\n\t" /* Load calee's TOC. */ + "mtctr 4\n\t" + "mr 3, 27\n\t" + "bctrl\n\t" + "ld 2, 40(1)\n\t" /* Restore caller's TOC after call. */ + #else + "mtctr 28\n\t" + "mr 3, 27\n\t" + "bctrl\n\t" + #endif + + /* Call _exit(r3) */ + "li 0, %5\n\t" + "sc\n\t" + + /* Return to parent */ + "1:\n" + "mfcr %1\n\t" + "mr %0, 3\n\t" + : "=r" (__ret), "=r" (__err) + : "0" (-1), "1" (EINVAL), + "i" (__NR_clone), "i" (__NR_exit), + "r" (__fn), "r" (__cstack), "r" (__flags), + "r" (__arg), "r" (__ptidptr), "r" (__newtls), + "r" (__ctidptr), "i"(-LSS_PPC_MINIMUM_FRAME_SIZE) + : "cr0", "cr1", "memory", "ctr", + "r0", "r29", "r27", "r28"); + } + LSS_RETURN(int, __ret, __err); + } + #ifdef __PPC64__ + LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len, + int, advice) + #else + /* fadvise64 wrapper not yet implemented for 32-bit PowerPC. 
*/ + #endif + #endif /* defined (__PPC__) */ + #define __NR__exit __NR_exit + #define __NR__gettid __NR_gettid + #define __NR__mremap __NR_mremap + LSS_INLINE _syscall1(int, brk, void *, e) + LSS_INLINE _syscall2(int, capset, + struct kernel_cap_user_header*, h, + struct kernel_cap_user_data*, d) + LSS_INLINE _syscall1(int, chdir, const char *,p) + LSS_INLINE _syscall1(int, chroot, const char *,p) + LSS_INLINE _syscall1(int, close, int, f) + LSS_INLINE _syscall2(int, clock_getres, int, c, + struct kernel_timespec*, t) + LSS_INLINE _syscall2(int, clock_gettime, int, c, + struct kernel_timespec*, t) + LSS_INLINE _syscall1(int, dup, int, f) + LSS_INLINE _syscall2(int, dup2, int, s, + int, d) + LSS_INLINE _syscall3(int, execve, const char*, f, + const char*const*,a,const char*const*, e) + LSS_INLINE _syscall1(int, _exit, int, e) + LSS_INLINE _syscall1(int, exit_group, int, e) + LSS_INLINE _syscall3(int, fcntl, int, f, + int, c, long, a) + LSS_INLINE _syscall0(pid_t, fork) + LSS_INLINE _syscall2(int, fstat, int, f, + struct kernel_stat*, b) + LSS_INLINE _syscall2(int, fstatfs, int, f, + struct kernel_statfs*, b) + LSS_INLINE _syscall2(int, ftruncate, int, f, + off_t, l) + LSS_INLINE _syscall4(int, futex, int*, a, + int, o, int, v, + struct kernel_timespec*, t) + LSS_INLINE _syscall3(int, getdents, int, f, + struct kernel_dirent*, d, int, c) + LSS_INLINE _syscall3(int, getdents64, int, f, + struct kernel_dirent64*, d, int, c) + LSS_INLINE _syscall0(gid_t, getegid) + LSS_INLINE _syscall0(uid_t, geteuid) + LSS_INLINE _syscall0(pid_t, getpgrp) + LSS_INLINE _syscall0(pid_t, getpid) + LSS_INLINE _syscall0(pid_t, getppid) + LSS_INLINE _syscall2(int, getpriority, int, a, + int, b) + LSS_INLINE _syscall3(int, getresgid, gid_t *, r, + gid_t *, e, gid_t *, s) + LSS_INLINE _syscall3(int, getresuid, uid_t *, r, + uid_t *, e, uid_t *, s) + #ifndef __ARM_EABI__ + /* No available on ARM EABI Linux. 
*/ + LSS_INLINE _syscall2(int, getrlimit, int, r, + struct kernel_rlimit*, l) + #endif + LSS_INLINE _syscall1(pid_t, getsid, pid_t, p) + LSS_INLINE _syscall0(pid_t, _gettid) + LSS_INLINE _syscall2(int, gettimeofday, struct timeval *, v, + struct timezone *, z) + LSS_INLINE _syscall5(int, setxattr, const char *,p, + const char *, n, const void *,v, + size_t, s, int, f) + LSS_INLINE _syscall5(int, lsetxattr, const char *,p, + const char *, n, const void *,v, + size_t, s, int, f) + LSS_INLINE _syscall4(ssize_t, getxattr, const char *,p, + const char *, n, void *, v, size_t, s) + LSS_INLINE _syscall4(ssize_t, lgetxattr, const char *,p, + const char *, n, void *, v, size_t, s) + LSS_INLINE _syscall3(ssize_t, listxattr, const char *,p, + char *, l, size_t, s) + LSS_INLINE _syscall3(ssize_t, llistxattr, const char *,p, + char *, l, size_t, s) + LSS_INLINE _syscall3(int, ioctl, int, d, + int, r, void *, a) + LSS_INLINE _syscall2(int, ioprio_get, int, which, + int, who) + LSS_INLINE _syscall3(int, ioprio_set, int, which, + int, who, int, ioprio) + LSS_INLINE _syscall2(int, kill, pid_t, p, + int, s) + LSS_INLINE _syscall3(off_t, lseek, int, f, + off_t, o, int, w) + LSS_INLINE _syscall2(int, munmap, void*, s, + size_t, l) + LSS_INLINE _syscall6(long, move_pages, pid_t, p, + unsigned long, n, void **,g, int *, d, + int *, s, int, f) + LSS_INLINE _syscall3(int, mprotect, const void *,a, + size_t, l, int, p) + LSS_INLINE _syscall5(void*, _mremap, void*, o, + size_t, os, size_t, ns, + unsigned long, f, void *, a) + LSS_INLINE _syscall3(int, open, const char*, p, + int, f, int, m) + LSS_INLINE _syscall3(int, poll, struct kernel_pollfd*, u, + unsigned int, n, int, t) + LSS_INLINE _syscall2(int, prctl, int, o, + long, a) + LSS_INLINE _syscall5(int, mount, const char *, source, const char *, target, + const char *, filesystemtype, unsigned long, mountflags, + const void *, data) + LSS_INLINE _syscall1(int, unshare, int, flags) + LSS_INLINE _syscall2(int, setns, int, fd, int, nstype) 
+ #if defined(__NR_preadv) + // Defined on x86_64 / i386 only + LSS_INLINE _syscall5(ssize_t, preadv, unsigned long, fd, + const struct kernel_iovec*, iovec, + unsigned long, vlen, unsigned long, pos_l, + unsigned long, pos_h) + #endif + LSS_INLINE _syscall4(long, ptrace, int, r, + pid_t, p, void *, a, void *, d) + #if defined(__NR_pwritev) + // Defined on x86_64 / i386 only + LSS_INLINE _syscall5(ssize_t, pwritev, unsigned long, fd, + const struct kernel_iovec*, iovec, + unsigned long, vlen, unsigned long, pos_l, + unsigned long, pos_h) + #endif + #if defined(__NR_quotactl) + // Defined on x86_64 / i386 only + LSS_INLINE _syscall4(int, quotactl, int, cmd, const char *, special, + int, id, caddr_t, addr) + #endif + LSS_INLINE _syscall3(ssize_t, read, int, f, + void *, b, size_t, c) + LSS_INLINE _syscall3(int, readlink, const char*, p, + char*, b, size_t, s) + LSS_INLINE _syscall4(int, rt_sigaction, int, s, + const struct kernel_sigaction*, a, + struct kernel_sigaction*, o, size_t, c) + LSS_INLINE _syscall2(int, rt_sigpending, struct kernel_sigset_t *, s, + size_t, c) + LSS_INLINE _syscall4(int, rt_sigprocmask, int, h, + const struct kernel_sigset_t*, s, + struct kernel_sigset_t*, o, size_t, c); + LSS_INLINE _syscall1(int, rt_sigreturn, unsigned long, u); + LSS_INLINE _syscall2(int, rt_sigsuspend, + const struct kernel_sigset_t*, s, size_t, c); + LSS_INLINE _syscall3(int, sched_getaffinity,pid_t, p, + unsigned int, l, unsigned long *, m) + LSS_INLINE _syscall3(int, sched_setaffinity,pid_t, p, + unsigned int, l, unsigned long *, m) + LSS_INLINE _syscall0(int, sched_yield) + LSS_INLINE _syscall1(long, set_tid_address, int *, t) + LSS_INLINE _syscall1(int, setfsgid, gid_t, g) + LSS_INLINE _syscall1(int, setfsuid, uid_t, u) + LSS_INLINE _syscall1(int, setuid, uid_t, u) + LSS_INLINE _syscall1(int, setgid, gid_t, g) + LSS_INLINE _syscall2(int, setpgid, pid_t, p, + pid_t, g) + LSS_INLINE _syscall3(int, setpriority, int, a, + int, b, int, p) + LSS_INLINE _syscall3(int, 
setresgid, gid_t, r, + gid_t, e, gid_t, s) + LSS_INLINE _syscall3(int, setresuid, uid_t, r, + uid_t, e, uid_t, s) + LSS_INLINE _syscall2(int, setrlimit, int, r, + const struct kernel_rlimit*, l) + LSS_INLINE _syscall0(pid_t, setsid) + LSS_INLINE _syscall2(int, sigaltstack, const stack_t*, s, + const stack_t*, o) + #if defined(__NR_sigreturn) + LSS_INLINE _syscall1(int, sigreturn, unsigned long, u); + #endif + LSS_INLINE _syscall2(int, stat, const char*, f, + struct kernel_stat*, b) + LSS_INLINE _syscall2(int, statfs, const char*, f, + struct kernel_statfs*, b) + LSS_INLINE _syscall3(int, tgkill, pid_t, p, + pid_t, t, int, s) + LSS_INLINE _syscall2(int, tkill, pid_t, p, + int, s) + LSS_INLINE _syscall3(ssize_t, write, int, f, + const void *, b, size_t, c) + LSS_INLINE _syscall3(ssize_t, writev, int, f, + const struct kernel_iovec*, v, size_t, c) + LSS_INLINE _syscall1(int, umask, unsigned, m) + LSS_INLINE _syscall1(int, unlink, const char*, f) + #if defined(__NR_getcpu) + LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu, + unsigned *, node, void *, unused); + #endif + #if defined(__x86_64__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE _syscall3(int, recvmsg, int, s, + struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall3(int, sendmsg, int, s, + const struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall6(int, sendto, int, s, + const void*, m, size_t, l, + int, f, + const struct kernel_sockaddr*, a, int, t) + LSS_INLINE _syscall2(int, shutdown, int, s, + int, h) + LSS_INLINE _syscall3(int, socket, int, d, + int, t, int, p) + LSS_INLINE _syscall4(int, socketpair, int, d, + int, t, int, p, int*, s) + #endif + + #if defined(__x86_64__) || defined(__PPC__) + LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, + gid_t *egid, + gid_t *sgid) { + return LSS_NAME(getresgid)(rgid, egid, sgid); + } + + LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, + uid_t *euid, + uid_t *suid) { + return LSS_NAME(getresuid)(ruid, euid, suid); + } + + 
LSS_INLINE _syscall4(int, newfstatat, int, d, + const char *, p, + struct kernel_stat*, b, int, f) + + LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { + return LSS_NAME(setfsgid)(gid); + } + + LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { + return LSS_NAME(setfsuid)(uid); + } + + LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { + return LSS_NAME(setresgid)(rgid, egid, sgid); + } + + LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { + return LSS_NAME(setresuid)(ruid, euid, suid); + } + #endif // defined(__x86_64__) || defined(__PPC__) + + #if defined(__x86_64__) || defined(__PPC64__) + LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode, + loff_t, offset, loff_t, len) + + LSS_INLINE _syscall6(void*, mmap, void*, s, + size_t, l, int, p, + int, f, int, d, + __off64_t, o) + + LSS_INLINE int LSS_NAME(sigaction)(int signum, + const struct kernel_sigaction *act, + struct kernel_sigaction *oldact) { + #if defined(__x86_64__) + /* On x86_64, the kernel requires us to always set our own + * SA_RESTORER in order to be able to return from a signal handler. + * This function must have a "magic" signature that the "gdb" + * (and maybe the kernel?) can recognize. 
+ */ + if (act != NULL && !(act->sa_flags & SA_RESTORER)) { + struct kernel_sigaction a = *act; + a.sa_flags |= SA_RESTORER; + a.sa_restorer = LSS_NAME(restore_rt)(); + return LSS_NAME(rt_sigaction)(signum, &a, oldact, + (KERNEL_NSIG+7)/8); + } else { + return LSS_NAME(rt_sigaction)(signum, act, oldact, + (KERNEL_NSIG+7)/8); + } + #else + return LSS_NAME(rt_sigaction)(signum, act, oldact, (KERNEL_NSIG+7)/8); + #endif + } + + LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { + return LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); + } + + LSS_INLINE int LSS_NAME(sigprocmask)(int how, + const struct kernel_sigset_t *set, + struct kernel_sigset_t *oldset) { + return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); + } + + LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { + return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); + } + #endif /* defined(__x86_64__) || defined(__PPC64__) */ + + #if defined(__x86_64__) || \ + defined(__arm__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE _syscall4(pid_t, wait4, pid_t, p, + int*, s, int, o, + struct kernel_rusage*, r) + + LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){ + return LSS_NAME(wait4)(pid, status, options, 0); + } + #endif + #if defined(__x86_64__)|| \ + defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_5T__) || \ + defined(__mips__) || defined(__PPC__) + LSS_INLINE _syscall2(int, setgroups, size_t, c, + const gid_t *, g) + #endif + #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ + defined(__PPC__) + LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m) + LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f) + #endif + #if defined(__i386__) || defined(__arm__) + #define __NR__getresgid32 __NR_getresgid32 + #define __NR__getresuid32 __NR_getresuid32 + #define __NR__setfsgid32 __NR_setfsgid32 + #define __NR__setfsuid32 __NR_setfsuid32 + #define 
__NR__setgroups32 __NR_setgroups32 + #define __NR__setgroups __NR_setgroups + #define __NR__setresgid32 __NR_setresgid32 + #define __NR__setresuid32 __NR_setresuid32 + LSS_INLINE _syscall2(int, ugetrlimit, int, r, + struct kernel_rlimit*, l) + LSS_INLINE _syscall3(int, _getresgid32, gid_t *, r, + gid_t *, e, gid_t *, s) + LSS_INLINE _syscall3(int, _getresuid32, uid_t *, r, + uid_t *, e, uid_t *, s) + LSS_INLINE _syscall1(int, _setfsgid32, gid_t, f) + LSS_INLINE _syscall1(int, _setfsuid32, uid_t, f) + LSS_INLINE _syscall2(int, _setgroups32, int, s, + const unsigned int *, l) + LSS_INLINE _syscall2(int, _setgroups, size_t, c, + const unsigned short *, g) + LSS_INLINE _syscall3(int, _setresgid32, gid_t, r, + gid_t, e, gid_t, s) + LSS_INLINE _syscall3(int, _setresuid32, uid_t, r, + uid_t, e, uid_t, s) + + LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, + gid_t *egid, + gid_t *sgid) { + int rc; + if ((rc = LSS_NAME(_getresgid32)(rgid, egid, sgid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((rgid == NULL) || (egid == NULL) || (sgid == NULL)) { + return EFAULT; + } + // Clear the high bits first, since getresgid only sets 16 bits + *rgid = *egid = *sgid = 0; + rc = LSS_NAME(getresgid)(rgid, egid, sgid); + } + return rc; + } + + LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, + uid_t *euid, + uid_t *suid) { + int rc; + if ((rc = LSS_NAME(_getresuid32)(ruid, euid, suid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((ruid == NULL) || (euid == NULL) || (suid == NULL)) { + return EFAULT; + } + // Clear the high bits first, since getresuid only sets 16 bits + *ruid = *euid = *suid = 0; + rc = LSS_NAME(getresuid)(ruid, euid, suid); + } + return rc; + } + + LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { + int rc; + if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)gid & ~0xFFFFu) { + LSS_ERRNO = EINVAL; + } else { + rc = LSS_NAME(setfsgid)(gid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { + int rc; + if ((rc = 
LSS_NAME(_setfsuid32)(uid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)uid & ~0xFFFFu) { + LSS_ERRNO = EINVAL; + } else { + rc = LSS_NAME(setfsuid)(uid); + } + } + return rc; + } + + + // We cannot allocate memory so there is a problem with building the + // list of groups with the proper datatype. Older kernels have limits + // on the number of groups that can be set at one time of up to 32. + // So we have an array on the stack of size 32 where to put the groups. + #define LSS_SET_GROUPS_SIZE 32 + LSS_INLINE int LSS_NAME(setgroups)(size_t size, const unsigned int *list) { + int rc = 0; + if ((rc = LSS_NAME(_setgroups32)(size, list)) < 0 && + LSS_ERRNO == ENOSYS) { + if (size > LSS_SET_GROUPS_SIZE) { + LSS_ERRNO = EINVAL; + } else { + unsigned short gid_list[LSS_SET_GROUPS_SIZE]; + int i; + for (i = 0; i < size; ++i) { + if (list[i] & ~0xFFFFu) { + LSS_ERRNO = EINVAL; + break; + } + gid_list[i] = list[i]; + } + if (LSS_ERRNO != EINVAL) { + rc = LSS_NAME(_setgroups)(size, gid_list); + } + } + } + return rc; + } + #undef LSS_SET_GROUPS_SIZE + + LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { + int rc; + if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)rgid & ~0xFFFFu || + (unsigned int)egid & ~0xFFFFu || + (unsigned int)sgid & ~0xFFFFu) { + LSS_ERRNO = EINVAL; + } else { + rc = LSS_NAME(setresgid)(rgid, egid, sgid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { + int rc; + if ((rc = LSS_NAME(_setresuid32)(ruid, euid, suid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)ruid & ~0xFFFFu || + (unsigned int)euid & ~0xFFFFu || + (unsigned int)suid & ~0xFFFFu) { + LSS_ERRNO = EINVAL; + } else { + rc = LSS_NAME(setresuid)(ruid, euid, suid); + } + } + return rc; + } + #endif + LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) { + memset(&set->sig, 0, sizeof(set->sig)); + return 0; + } + + LSS_INLINE int 
LSS_NAME(sigfillset)(struct kernel_sigset_t *set) { + memset(&set->sig, -1, sizeof(set->sig)); + return 0; + } + + LSS_INLINE int LSS_NAME(sigaddset)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] + |= 1UL << ((signum - 1) % (8*sizeof(set->sig[0]))); + return 0; + } + } + + LSS_INLINE int LSS_NAME(sigdelset)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] + &= ~(1UL << ((signum - 1) % (8*sizeof(set->sig[0])))); + return 0; + } + } + + LSS_INLINE int LSS_NAME(sigismember)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + return !!(set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] & + (1UL << ((signum - 1) % (8*sizeof(set->sig[0]))))); + } + } + #if defined(__i386__) || \ + defined(__arm__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || \ + (defined(__PPC__) && !defined(__PPC64__)) + #define __NR__sigaction __NR_sigaction + #define __NR__sigpending __NR_sigpending + #define __NR__sigprocmask __NR_sigprocmask + #define __NR__sigsuspend __NR_sigsuspend + LSS_INLINE _syscall2(int, fstat64, int, f, + struct kernel_stat64 *, b) + LSS_INLINE _syscall5(int, _llseek, uint, fd, ulong, hi, ulong, lo, + loff_t *, res, uint, wh) + + #ifndef __ARM_EABI__ + /* Not available on ARM EABI Linux. 
*/ + LSS_INLINE _syscall1(void*, mmap, void*, a) + #endif + LSS_INLINE _syscall6(void*, mmap2, void*, s, + size_t, l, int, p, + int, f, int, d, + off_t, o) + LSS_INLINE _syscall3(int, _sigaction, int, s, + const struct kernel_old_sigaction*, a, + struct kernel_old_sigaction*, o) + LSS_INLINE _syscall1(int, _sigpending, unsigned long*, s) + LSS_INLINE _syscall3(int, _sigprocmask, int, h, + const unsigned long*, s, + unsigned long*, o) + #ifdef __PPC__ + LSS_INLINE _syscall1(int, _sigsuspend, unsigned long, s) + #else + LSS_INLINE _syscall3(int, _sigsuspend, const void*, a, + int, b, + unsigned long, s) + #endif + LSS_INLINE _syscall2(int, stat64, const char *, p, + struct kernel_stat64 *, b) + + LSS_INLINE int LSS_NAME(sigaction)(int signum, + const struct kernel_sigaction *act, + struct kernel_sigaction *oldact) { + int old_errno = LSS_ERRNO; + int rc; + struct kernel_sigaction a; + if (act != NULL) { + a = *act; + #ifdef __i386__ + /* On i386, the kernel requires us to always set our own + * SA_RESTORER when using realtime signals. Otherwise, it does not + * know how to return from a signal handler. This function must have + * a "magic" signature that the "gdb" (and maybe the kernel?) can + * recognize. + * Apparently, a SA_RESTORER is implicitly set by the kernel, when + * using non-realtime signals. + * + * TODO: Test whether ARM needs a restorer + */ + if (!(a.sa_flags & SA_RESTORER)) { + a.sa_flags |= SA_RESTORER; + a.sa_restorer = (a.sa_flags & SA_SIGINFO) + ? LSS_NAME(restore_rt)() : LSS_NAME(restore)(); + } + #endif + } + rc = LSS_NAME(rt_sigaction)(signum, act ? 
&a : act, oldact, + (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + struct kernel_old_sigaction oa, ooa, *ptr_a = &oa, *ptr_oa = &ooa; + if (!act) { + ptr_a = NULL; + } else { + oa.sa_handler_ = act->sa_handler_; + memcpy(&oa.sa_mask, &act->sa_mask, sizeof(oa.sa_mask)); + #ifndef __mips__ + oa.sa_restorer = act->sa_restorer; + #endif + oa.sa_flags = act->sa_flags; + } + if (!oldact) { + ptr_oa = NULL; + } + LSS_ERRNO = old_errno; + rc = LSS_NAME(_sigaction)(signum, ptr_a, ptr_oa); + if (rc == 0 && oldact) { + if (act) { + memcpy(oldact, act, sizeof(*act)); + } else { + memset(oldact, 0, sizeof(*oldact)); + } + oldact->sa_handler_ = ptr_oa->sa_handler_; + oldact->sa_flags = ptr_oa->sa_flags; + memcpy(&oldact->sa_mask, &ptr_oa->sa_mask, sizeof(ptr_oa->sa_mask)); + #ifndef __mips__ + oldact->sa_restorer = ptr_oa->sa_restorer; + #endif + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { + int old_errno = LSS_ERRNO; + int rc = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + LSS_ERRNO = old_errno; + LSS_NAME(sigemptyset)(set); + rc = LSS_NAME(_sigpending)(&set->sig[0]); + } + return rc; + } + + LSS_INLINE int LSS_NAME(sigprocmask)(int how, + const struct kernel_sigset_t *set, + struct kernel_sigset_t *oldset) { + int olderrno = LSS_ERRNO; + int rc = LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + LSS_ERRNO = olderrno; + if (oldset) { + LSS_NAME(sigemptyset)(oldset); + } + rc = LSS_NAME(_sigprocmask)(how, + set ? &set->sig[0] : NULL, + oldset ? 
&oldset->sig[0] : NULL); + } + return rc; + } + + LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { + int olderrno = LSS_ERRNO; + int rc = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + LSS_ERRNO = olderrno; + rc = LSS_NAME(_sigsuspend)( + #ifndef __PPC__ + set, 0, + #endif + set->sig[0]); + } + return rc; + } + #endif + #if defined(__PPC__) + #undef LSS_SC_LOADARGS_0 + #define LSS_SC_LOADARGS_0(dummy...) + /* arg1 .. arg6 are passed in an unsigned long array pointed by r4. */ + #undef LSS_SC_LOADARGS_1 + #define LSS_SC_LOADARGS_1(arg1) \ + sc_args[0] = (unsigned long) (arg1) + #undef LSS_SC_LOADARGS_2 + #define LSS_SC_LOADARGS_2(arg1, arg2) \ + LSS_SC_LOADARGS_1(arg1); \ + sc_args[1] = (unsigned long) (arg2) + #undef LSS_SC_LOADARGS_3 + #define LSS_SC_LOADARGS_3(arg1, arg2, arg3) \ + LSS_SC_LOADARGS_2(arg1, arg2); \ + sc_args[2] = (unsigned long) (arg3) + #undef LSS_SC_LOADARGS_4 + #define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4) \ + LSS_SC_LOADARGS_3(arg1, arg2, arg3); \ + sc_args[3] = (unsigned long) (arg4) + #undef LSS_SC_LOADARGS_5 + #define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5) \ + LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4); \ + sc_args[4] = (unsigned long) (arg5) + #undef LSS_SC_LOADARGS_6 + #define LSS_SC_LOADARGS_6(arg1, arg2, arg3, arg4, arg5, arg6) \ + LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5); \ + sc_args[5] = (unsigned long) (arg6) + #undef LSS_SC_BODY + /* + * Do a socket system call using the generic socketcall() interface. + * We pack arguments into an array of unsigned longs and then + * call socketcall() with a function number and the argument array. + * Although some socket calls now have their own syscall numbers, + * we still use socketcall() to make our code work with older kernels. + */ + #define LSS_SC_BODY(nr, type, opt, args...) 
\ + long __sc_ret, __sc_err; \ + { \ + unsigned long sc_args[6]; \ + register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall; \ + register unsigned long __sc_3 __asm__ ("r3") = opt; \ + register unsigned long __sc_4 __asm__ ("r4"); \ + LSS_SC_LOADARGS_##nr(args); \ + __asm__ __volatile__ \ + ("sc\n\t" \ + "mfcr %0" \ + : "+r" (__sc_0), \ + "+r" (__sc_3), "=r" (__sc_4) \ + : "2"(&sc_args) \ + : "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", \ + "cr0", "ctr", "memory"); \ + __sc_ret = __sc_3; \ + __sc_err = __sc_0; \ + } \ + LSS_RETURN(type, __sc_ret, __sc_err) + + LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg, + int flags){ + LSS_SC_BODY(3, ssize_t, 17, s, msg, flags); + } + + LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s, + const struct kernel_msghdr *msg, + int flags) { + LSS_SC_BODY(3, ssize_t, 16, s, msg, flags); + } + + LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len, + int flags, + const struct kernel_sockaddr *to, + unsigned int tolen) { + LSS_SC_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen); + } + + LSS_INLINE int LSS_NAME(shutdown)(int s, int how) { + LSS_SC_BODY(2, int, 13, s, how); + } + + LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) { + LSS_SC_BODY(3, int, 1, domain, type, protocol); + } + + LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol, + int sv[2]) { + LSS_SC_BODY(4, int, 8, d, type, protocol, sv); + } + #endif + #if defined(__i386__) || \ + (defined(__arm__) && !defined(__ARM_EABI__)) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + + /* See sys_socketcall in net/socket.c in kernel source. + * It de-multiplexes on its first arg and unpacks the arglist + * array in its second arg. 
+ */ + LSS_INLINE _syscall2(long, socketcall, int, c, unsigned long*, a) + + LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg, + int flags){ + unsigned long args[3] = { + (unsigned long) s, + (unsigned long) msg, + (unsigned long) flags + }; + return (ssize_t) LSS_NAME(socketcall)(17, args); + } + + LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s, + const struct kernel_msghdr *msg, + int flags) { + unsigned long args[3] = { + (unsigned long) s, + (unsigned long) msg, + (unsigned long) flags + }; + return (ssize_t) LSS_NAME(socketcall)(16, args); + } + + LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len, + int flags, + const struct kernel_sockaddr *to, + unsigned int tolen) { + unsigned long args[6] = { + (unsigned long) s, + (unsigned long) buf, + (unsigned long) len, + (unsigned long) flags, + (unsigned long) to, + (unsigned long) tolen + }; + return (ssize_t) LSS_NAME(socketcall)(11, args); + } + + LSS_INLINE int LSS_NAME(shutdown)(int s, int how) { + unsigned long args[2] = { + (unsigned long) s, + (unsigned long) how + }; + return LSS_NAME(socketcall)(13, args); + } + + LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) { + unsigned long args[3] = { + (unsigned long) domain, + (unsigned long) type, + (unsigned long) protocol + }; + return LSS_NAME(socketcall)(1, args); + } + + LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol, + int sv[2]) { + unsigned long args[4] = { + (unsigned long) d, + (unsigned long) type, + (unsigned long) protocol, + (unsigned long) sv + }; + return LSS_NAME(socketcall)(8, args); + } + #elif defined(__ARM_EABI__) + /* ARM EABI Linix does not have socketcall. 
*/ + LSS_INLINE _syscall3(ssize_t, recvmsg, int, s, + struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall3(ssize_t, sendmsg, int, s, + const struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall6(ssize_t, sendto, int, s, + const void*, b, size_t, l, + int, f, + const struct kernel_sockaddr*, to, + unsigned int, tl) + LSS_INLINE _syscall2(int, shutdown, int, s, + int, h) + LSS_INLINE _syscall3(int, socket, int, d, + int, t, int, p) + LSS_INLINE _syscall4(int, socketpair, int, d, + int, t, int, p, int*, s) + #endif + #if defined(__i386__) || (defined(__PPC__) && !defined(__PPC64__)) || \ + defined(__arm__) + LSS_INLINE _syscall4(int, fstatat64, int, d, + const char *, p, + struct kernel_stat64 *, b, int, f) + #endif + #if defined(__i386__) || defined(__PPC__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + LSS_INLINE _syscall3(pid_t, waitpid, pid_t, p, + int*, s, int, o) + #endif + #if defined(__mips__) + /* sys_pipe() on MIPS has non-standard calling conventions, as it returns + * both file handles through CPU registers. 
+ */ + LSS_INLINE int LSS_NAME(pipe)(int *p) { + register unsigned long __v0 __asm__("$2") = __NR_pipe; + register unsigned long __v1 __asm__("$3"); + register unsigned long __r7 __asm__("$7"); + __asm__ __volatile__ ("syscall\n" + : "=&r"(__v0), "=&r"(__v1), "+r" (__r7) + : "0"(__v0) + : "$8", "$9", "$10", "$11", "$12", + "$13", "$14", "$15", "$24", "memory"); + if (__r7) { + LSS_ERRNO = __v0; + return -1; + } else { + p[0] = __v0; + p[1] = __v1; + return 0; + } + } + #else + LSS_INLINE _syscall1(int, pipe, int *, p) + #endif + /* TODO(user): see if ppc can/should support this as well */ + #if defined(__i386__) || \ + defined(__arm__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) + #define __NR__statfs64 __NR_statfs64 + #define __NR__fstatfs64 __NR_fstatfs64 + LSS_INLINE _syscall3(int, _statfs64, const char*, p, + size_t, s,struct kernel_statfs64*, b) + LSS_INLINE _syscall3(int, _fstatfs64, int, f, + size_t, s,struct kernel_statfs64*, b) + LSS_INLINE int LSS_NAME(statfs64)(const char *p, + struct kernel_statfs64 *b) { + return LSS_NAME(_statfs64)(p, sizeof(*b), b); + } + LSS_INLINE int LSS_NAME(fstatfs64)(int f,struct kernel_statfs64 *b) { + return LSS_NAME(_fstatfs64)(f, sizeof(*b), b); + } + #endif + + LSS_INLINE int LSS_NAME(execv)(const char *path, const char *const argv[]) { + extern char **environ; + return LSS_NAME(execve)(path, argv, (const char *const *)environ); + } + + LSS_INLINE pid_t LSS_NAME(gettid)() { + pid_t tid = LSS_NAME(_gettid)(); + if (tid != -1) { + return tid; + } + return LSS_NAME(getpid)(); + } + + LSS_INLINE void *LSS_NAME(mremap)(void *old_address, size_t old_size, + size_t new_size, int flags, ...) 
{ + va_list ap; + void *new_address, *rc; + va_start(ap, flags); + new_address = va_arg(ap, void *); + rc = LSS_NAME(_mremap)(old_address, old_size, new_size, + flags, new_address); + va_end(ap); + return rc; + } + + LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) { + return LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0); + } + + LSS_INLINE int LSS_NAME(raise)(int sig) { + return LSS_NAME(kill)(LSS_NAME(getpid)(), sig); + } + + LSS_INLINE int LSS_NAME(setpgrp)() { + return LSS_NAME(setpgid)(0, 0); + } + + LSS_INLINE int LSS_NAME(sysconf)(int name) { + extern int __getpagesize(void); + switch (name) { + case _SC_OPEN_MAX: { + struct kernel_rlimit limit; + + /* On some systems getrlimit is obsolete, use ugetrlimit instead. */ + #ifndef __NR_getrlimit + return LSS_NAME(ugetrlimit)(RLIMIT_NOFILE, &limit) < 0 + ? 8192 : limit.rlim_cur; + #else + return LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0 + ? 8192 : limit.rlim_cur; + #endif + } + case _SC_PAGESIZE: + return __getpagesize(); + default: + LSS_ERRNO = ENOSYS; + return -1; + } + } + #if defined(__x86_64__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64) + LSS_INLINE _syscall4(ssize_t, pread64, int, f, + void *, b, size_t, c, + loff_t, o) + LSS_INLINE _syscall4(ssize_t, pwrite64, int, f, + const void *, b, size_t, c, + loff_t, o) + LSS_INLINE _syscall3(int, readahead, int, f, + loff_t, o, unsigned, c) + #else + #define __NR__pread64 __NR_pread64 + #define __NR__pwrite64 __NR_pwrite64 + #define __NR__readahead __NR_readahead + LSS_INLINE _syscall5(ssize_t, _pread64, int, f, + void *, b, size_t, c, unsigned, o1, + unsigned, o2) + LSS_INLINE _syscall5(ssize_t, _pwrite64, int, f, + const void *, b, size_t, c, unsigned, o1, + long, o2) + LSS_INLINE _syscall4(int, _readahead, int, f, + unsigned, o1, unsigned, o2, size_t, c); + /* We force 64bit-wide parameters onto the stack, then access each + * 32-bit component individually. 
This guarantees that we build the + * correct parameters independent of the native byte-order of the + * underlying architecture. + */ + LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count, + loff_t off) { + union { loff_t off; unsigned arg[2]; } o = { off }; + return LSS_NAME(_pread64)(fd, buf, count, o.arg[0], o.arg[1]); + } + LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf, + size_t count, loff_t off) { + union { loff_t off; unsigned arg[2]; } o = { off }; + return LSS_NAME(_pwrite64)(fd, buf, count, o.arg[0], o.arg[1]); + } + LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) { + union { loff_t off; unsigned arg[2]; } o = { off }; + return LSS_NAME(_readahead)(fd, o.arg[0], o.arg[1], len); + } + #endif + #if defined(__NR_io_setup) + LSS_INLINE _syscall2(int, io_setup, + int, maxevents, + unsigned long *, ctxp); + LSS_INLINE _syscall3(int, io_submit, + unsigned long, ctx_id, + long, nr, + struct kernel_iocb **, ios); + LSS_INLINE _syscall5(int, io_getevents, + unsigned long, ctx_id, + long, min_nr, + long, nr, + struct kernel_io_event *, events, + struct kernel_timespec*, timeout); + LSS_INLINE _syscall1(int, io_destroy, + unsigned long, ctx); + LSS_INLINE _syscall3(int, io_cancel, + unsigned long, ctx_id, + struct kernel_iocb*, iocb, + struct kernel_io_event*, result); + #endif +#endif + +#if defined(__cplusplus) && !defined(SYS_CPLUSPLUS) +} +#endif + +#endif +#endif diff --git a/src/kudu/gutil/logging-inl.h b/src/kudu/gutil/logging-inl.h new file mode 100644 index 000000000000..409a99c0ad5d --- /dev/null +++ b/src/kudu/gutil/logging-inl.h @@ -0,0 +1,50 @@ +// Copyright 2012 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// All rights reserved. +// +// Additional constants from logging.h and its dependencies which are +// not exported by glog. + +#ifndef _LOGGING_IN_H_ +#define _LOGGING_IN_H_ + +// DFATAL is FATAL in debug mode, ERROR in normal mode +#ifdef NDEBUG +#define DFATAL_LEVEL ERROR +#else +#define DFATAL_LEVEL FATAL +#endif + +// NDEBUG usage helpers related to (RAW_)DCHECK: +// +// DEBUG_MODE is for small !NDEBUG uses like +// if (DEBUG_MODE) foo.CheckThatFoo(); +// instead of substantially more verbose +// #ifndef NDEBUG +// foo.CheckThatFoo(); +// #endif +// +#ifdef NDEBUG +const bool DEBUG_MODE = false; +#else +const bool DEBUG_MODE = true; +#endif + +#endif // _LOGGING_IN_H_ diff --git a/src/kudu/gutil/macros.h b/src/kudu/gutil/macros.h new file mode 100644 index 000000000000..a57a375fa7d9 --- /dev/null +++ b/src/kudu/gutil/macros.h @@ -0,0 +1,271 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// +// Various Google-specific macros. +// +// This code is compiled directly on many platforms, including client +// platforms like Windows, Mac, and embedded systems. Before making +// any changes here, make sure that you're not breaking any platforms. +// + +#ifndef BASE_MACROS_H_ +#define BASE_MACROS_H_ + +#include // For size_t +#include "kudu/gutil/port.h" + +// The swigged version of an abstract class must be concrete if any methods +// return objects of the abstract type. 
We keep it abstract in C++ and +// concrete for swig. +#ifndef SWIG +#define ABSTRACT = 0 +#endif + +// The COMPILE_ASSERT macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// COMPILE_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// COMPILE_ASSERT(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +template +struct CompileAssert { +}; + +#define COMPILE_ASSERT(expr, msg) \ + typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] ATTRIBUTE_UNUSED + +// Implementation details of COMPILE_ASSERT: +// +// - COMPILE_ASSERT works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define COMPILE_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// COMPILE_ASSERT(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type CompileAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outer parentheses in CompileAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. 
If we had written +// +// CompileAssert +// +// instead, these compilers will refuse to compile +// +// COMPILE_ASSERT(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + + +// A macro to disallow the copy constructor and operator= functions +// This should be used in the private: declarations for a class +// +// For disallowing only assign or copy, write the code directly, but declare +// the intend in a comment, for example: +// void operator=(const TypeName&); // DISALLOW_ASSIGN +// Note, that most uses of DISALLOW_ASSIGN and DISALLOW_COPY are broken +// semantically, one should either use disallow both or neither. Try to +// avoid these in new code. +// +// The LANG_CXX11 branch is a workaround for +// http://gcc.gnu.org/PR51213 in gcc-4.7 / Crosstool v16. +// TODO(user): Remove "&& !defined(__clang_)" when =delete is +// gcc-4.7 before =delete is allowed, go back to the C++98 definition. +#if LANG_CXX11 && !defined(__clang__) +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete +#else +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&) +#endif + +// An older, politically incorrect name for the above. +// Prefer DISALLOW_COPY_AND_ASSIGN for new code. +#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) DISALLOW_COPY_AND_ASSIGN(TypeName) + +// A macro to disallow all the implicit constructors, namely the +// default constructor, copy constructor and operator= functions. +// +// This should be used in the private: declarations for a class +// that wants to prevent anyone from instantiating it. 
This is +// especially useful for classes containing only static methods. +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + TypeName(); \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that, for C++03, arraysize() doesn't accept any array of +// an anonymous type or a type defined inside a function. In these rare +// cases, you have to use the unsafe ARRAYSIZE() macro below. This is +// due to a limitation in C++03's template system. The limitation has +// been removed in C++11. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef _MSC_VER +template +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// ARRAYSIZE performs essentially the same calculation as arraysize, +// but can be used on anonymous types or types defined inside +// functions. It's less safe than arraysize as it accepts some +// (although not all) pointers. Therefore, you should use arraysize +// whenever possible. +// +// The expression ARRAYSIZE(a) is a compile-time constant of type +// size_t. +// +// ARRAYSIZE catches a few type errors. If you see a compiler error +// +// "warning: division by zero in ..." +// +// when using ARRAYSIZE, you are (wrongfully) giving it a pointer. +// You should only use ARRAYSIZE on statically allocated arrays. 
+// +// The following comments are on the implementation details, and can +// be ignored by the users. +// +// ARRAYSIZE(arr) works by inspecting sizeof(arr) (the # of bytes in +// the array) and sizeof(*(arr)) (the # of bytes in one array +// element). If the former is divisible by the latter, perhaps arr is +// indeed an array, in which case the division result is the # of +// elements in the array. Otherwise, arr cannot possibly be an array, +// and we generate a compiler error to prevent the code from +// compiling. +// +// Since the size of bool is implementation-defined, we need to cast +// !(sizeof(a) & sizeof(*(a))) to size_t in order to ensure the final +// result has type size_t. +// +// This macro is not perfect as it wrongfully accepts certain +// pointers, namely where the pointer size is divisible by the pointee +// size. For code that goes through a 32-bit compiler, where a pointer +// is 4 bytes, this means all pointers to a type whose size is 3 or +// greater than 4 will be (righteously) rejected. +// +// Kudos to Jorg Brown for this simple and elegant implementation. +// +// - wan 2005-11-16 +// +// Starting with Visual C++ 2005, WinNT.h includes ARRAYSIZE. +#if !defined(_MSC_VER) || (defined(_MSC_VER) && _MSC_VER < 1400) +#define ARRAYSIZE(a) \ + ((sizeof(a) / sizeof(*(a))) / \ + static_cast(!(sizeof(a) % sizeof(*(a))))) +#endif + +// A macro to turn a symbol into a string +#define AS_STRING(x) AS_STRING_INTERNAL(x) +#define AS_STRING_INTERNAL(x) #x + +// Macro that allows definition of a variable appended with the current line +// number in the source file. Typically for use by other macros to allow the +// user to declare multiple variables with the same "base" name inside the same +// lexical block. 
+#define VARNAME_LINENUM(varname) VARNAME_LINENUM_INTERNAL(varname ## _L, __LINE__) +#define VARNAME_LINENUM_INTERNAL(v, line) VARNAME_LINENUM_INTERNAL2(v, line) +#define VARNAME_LINENUM_INTERNAL2(v, line) v ## line + +// The following enum should be used only as a constructor argument to indicate +// that the variable has static storage class, and that the constructor should +// do nothing to its state. It indicates to the reader that it is legal to +// declare a static instance of the class, provided the constructor is given +// the base::LINKER_INITIALIZED argument. Normally, it is unsafe to declare a +// static variable that has a constructor or a destructor because invocation +// order is undefined. However, IF the type can be initialized by filling with +// zeroes (which the loader does for static variables), AND the type's +// destructor does nothing to the storage, then a constructor for static +// initialization can be declared as +// explicit MyClass(base::LinkerInitialized x) {} +// and invoked as +// static MyClass my_variable_name(base::LINKER_INITIALIZED); +namespace base { +enum LinkerInitialized { LINKER_INITIALIZED }; +} + +// The FALLTHROUGH_INTENDED macro can be used to annotate implicit fall-through +// between switch labels: +// switch (x) { +// case 40: +// case 41: +// if (truth_is_out_there) { +// ++x; +// FALLTHROUGH_INTENDED; // Use instead of/along with annotations in +// // comments. +// } else { +// return x; +// } +// case 42: +// ... +// +// As shown in the example above, the FALLTHROUGH_INTENDED macro should be +// followed by a semicolon. It is designed to mimic control-flow statements +// like 'break;', so it can be placed in most places where 'break;' can, but +// only if there are no statements on the execution path between it and the +// next switch label. 
+// +// When compiled with clang in C++11 mode, the FALLTHROUGH_INTENDED macro is +// expanded to [[clang::fallthrough]] attribute, which is analysed when +// performing switch labels fall-through diagnostic ('-Wimplicit-fallthrough'). +// See clang documentation on language extensions for details: +// http://clang.llvm.org/docs/LanguageExtensions.html#clang__fallthrough +// +// When used with unsupported compilers, the FALLTHROUGH_INTENDED macro has no +// effect on diagnostics. +// +// In either case this macro has no effect on runtime behavior and performance +// of code. +#if defined(__clang__) && defined(LANG_CXX11) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#endif + +#ifndef FALLTHROUGH_INTENDED +#define FALLTHROUGH_INTENDED do { } while (0) +#endif + +#endif // BASE_MACROS_H_ diff --git a/src/kudu/gutil/manual_constructor.h b/src/kudu/gutil/manual_constructor.h new file mode 100644 index 000000000000..adcda070005f --- /dev/null +++ b/src/kudu/gutil/manual_constructor.h @@ -0,0 +1,250 @@ +// Copyright (c) 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// --- +// +// +// ManualConstructor statically-allocates space in which to store some +// object, but does not initialize it. You can then call the constructor +// and destructor for the object yourself as you see fit. This is useful +// for memory management optimizations, where you want to initialize and +// destroy an object multiple times but only allocate it once. +// +// (When I say ManualConstructor statically allocates space, I mean that +// the ManualConstructor object itself is forced to be the right size.) +// +// For example usage, check out util/gtl/small_map.h. + +#ifndef UTIL_GTL_MANUAL_CONSTRUCTOR_H_ +#define UTIL_GTL_MANUAL_CONSTRUCTOR_H_ + +#include + +#include "kudu/gutil/port.h" + +namespace base { + +namespace util { +namespace gtl { +namespace internal { + +// +// Provides a char array with the exact same alignment as another type. The +// first parameter must be a complete type, the second parameter is how many +// of that type to provide space for. 
+// +// UTIL_GTL_ALIGNED_CHAR_ARRAY(struct stat, 16) storage_; +// +// Because MSVC and older GCCs require that the argument to their alignment +// construct to be a literal constant integer, we use a template instantiated +// at all the possible powers of two. +#ifndef SWIG +template struct AlignType { }; +template struct AlignType<0, size> { typedef char result[size]; }; +#if defined(_MSC_VER) +#define UTIL_GTL_ALIGN_ATTRIBUTE(X) __declspec(align(X)) +#define UTIL_GTL_ALIGN_OF(T) __alignof(T) +#elif defined(__GNUC__) || defined(__APPLE__) || defined(__INTEL_COMPILER) \ + || defined(__nacl__) +#define UTIL_GTL_ALIGN_ATTRIBUTE(X) __attribute__((aligned(X))) +#define UTIL_GTL_ALIGN_OF(T) __alignof__(T) +#endif + +#if defined(UTIL_GTL_ALIGN_ATTRIBUTE) + +#define UTIL_GTL_ALIGNTYPE_TEMPLATE(X) \ + template struct AlignType { \ + typedef UTIL_GTL_ALIGN_ATTRIBUTE(X) char result[size]; \ + } + +UTIL_GTL_ALIGNTYPE_TEMPLATE(1); +UTIL_GTL_ALIGNTYPE_TEMPLATE(2); +UTIL_GTL_ALIGNTYPE_TEMPLATE(4); +UTIL_GTL_ALIGNTYPE_TEMPLATE(8); +UTIL_GTL_ALIGNTYPE_TEMPLATE(16); +UTIL_GTL_ALIGNTYPE_TEMPLATE(32); +UTIL_GTL_ALIGNTYPE_TEMPLATE(64); +UTIL_GTL_ALIGNTYPE_TEMPLATE(128); +UTIL_GTL_ALIGNTYPE_TEMPLATE(256); +UTIL_GTL_ALIGNTYPE_TEMPLATE(512); +UTIL_GTL_ALIGNTYPE_TEMPLATE(1024); +UTIL_GTL_ALIGNTYPE_TEMPLATE(2048); +UTIL_GTL_ALIGNTYPE_TEMPLATE(4096); +UTIL_GTL_ALIGNTYPE_TEMPLATE(8192); +// Any larger and MSVC++ will complain. + +#define UTIL_GTL_ALIGNED_CHAR_ARRAY(T, Size) \ + typename util::gtl::internal::AlignType::result + +#undef UTIL_GTL_ALIGNTYPE_TEMPLATE +#undef UTIL_GTL_ALIGN_ATTRIBUTE + +#else // defined(UTIL_GTL_ALIGN_ATTRIBUTE) +#error "You must define UTIL_GTL_ALIGNED_CHAR_ARRAY for your compiler." +#endif // defined(UTIL_GTL_ALIGN_ATTRIBUTE) + +#else // !SWIG + +// SWIG can't represent alignment and doesn't care about alignment on data +// members (it works fine without it). 
+template +struct AlignType { typedef char result[Size]; }; +#define UTIL_GTL_ALIGNED_CHAR_ARRAY(T, Size) \ + util::gtl::internal::AlignType::result + +// Enough to parse with SWIG, will never be used by running code. +#define UTIL_GTL_ALIGN_OF(Type) 16 + +#endif // !SWIG + +} // namespace internal +} // namespace gtl +} // namespace util + +template +class ManualConstructor { + public: + // No constructor or destructor because one of the most useful uses of + // this class is as part of a union, and members of a union cannot have + // constructors or destructors. And, anyway, the whole point of this + // class is to bypass these. + + // Support users creating arrays of ManualConstructor<>s. This ensures that + // the array itself has the correct alignment. + static void* operator new[](size_t size) { + return aligned_malloc(size, UTIL_GTL_ALIGN_OF(Type)); + } + static void operator delete[](void* mem) { + aligned_free(mem); + } + + inline Type* get() { + return reinterpret_cast(space_); + } + inline const Type* get() const { + return reinterpret_cast(space_); + } + + inline Type* operator->() { return get(); } + inline const Type* operator->() const { return get(); } + + inline Type& operator*() { return *get(); } + inline const Type& operator*() const { return *get(); } + + // You can pass up to four constructor arguments as arguments of Init(). 
+ inline void Init() { + new(space_) Type; + } + + template + inline void Init(const T1& p1) { + new(space_) Type(p1); + } + + template + inline void Init(const T1& p1, const T2& p2) { + new(space_) Type(p1, p2); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3) { + new(space_) Type(p1, p2, p3); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4) { + new(space_) Type(p1, p2, p3, p4); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5) { + new(space_) Type(p1, p2, p3, p4, p5); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6) { + new(space_) Type(p1, p2, p3, p4, p5, p6); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7) { + new(space_) Type(p1, p2, p3, p4, p5, p6, p7); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8) { + new(space_) Type(p1, p2, p3, p4, p5, p6, p7, p8); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8, + const T9& p9) { + new(space_) Type(p1, p2, p3, p4, p5, p6, p7, p8, p9); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8, + const T9& p9, const T10& p10) { + new(space_) Type(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10); + } + + template + inline void Init(const T1& p1, const T2& p2, const T3& p3, const T4& p4, + const T5& p5, const T6& p6, const T7& p7, const T8& p8, + const T9& p9, const T10& p10, const T11& p11) { + new(space_) Type(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11); + } + + inline void Destroy() { + get()->~Type(); + } + + private: + 
UTIL_GTL_ALIGNED_CHAR_ARRAY(Type, 1) space_; +}; + +#undef UTIL_GTL_ALIGNED_CHAR_ARRAY +#undef UTIL_GTL_ALIGN_OF + +} + +#endif // UTIL_GTL_MANUAL_CONSTRUCTOR_H_ diff --git a/src/kudu/gutil/map-util.h b/src/kudu/gutil/map-util.h new file mode 100644 index 000000000000..0682c2745431 --- /dev/null +++ b/src/kudu/gutil/map-util.h @@ -0,0 +1,770 @@ +// Copyright 2005 Google Inc. +// +// #status: RECOMMENDED +// #category: maps +// #summary: Utility functions for use with map-like containers. +// +// This file provides utility functions for use with STL map-like data +// structures, such as std::map and hash_map. Some functions will also work with +// sets, such as ContainsKey(). +// +// The main functions in this file fall into the following categories: +// +// - Find*() +// - Contains*() +// - Insert*() +// - Lookup*() +// +// These functions often have "...OrDie" or "...OrDieNoPrint" variants. These +// variants will crash the process with a CHECK() failure on error, including +// the offending key/data in the log message. The NoPrint variants will not +// include the key/data in the log output under the assumption that it's not a +// printable type. +// +// Most functions are fairly self explanatory from their names, with the +// exception of Find*() vs Lookup*(). The Find functions typically use the map's +// .find() member function to locate and return the map's value type. The +// Lookup*() functions typically use the map's .insert() (yes, insert) member +// function to insert the given value if necessary and returns (usually a +// reference to) the map's value type for the found item. +// +// See the per-function comments for specifics. +// +// There are also a handful of functions for doing other miscellaneous things. +// +// A note on terminology: +// +// Map-like containers are collections of pairs. Like all STL containers they +// contain a few standard typedefs identifying the types of data they contain. 
+// Given the following map declaration: +// +// map my_map; +// +// the notable typedefs would be as follows: +// +// - key_type -- string +// - value_type -- pair +// - mapped_type -- int +// +// Note that the map above contains two types of "values": the key-value pairs +// themselves (value_type) and the values within the key-value pairs +// (mapped_type). A value_type consists of a key_type and a mapped_type. +// +// The documentation below is written for programmers thinking in terms of keys +// and the (mapped_type) values associated with a given key. For example, the +// statement +// +// my_map["foo"] = 3; +// +// has a key of "foo" (type: string) with a value of 3 (type: int). +// + +#ifndef UTIL_GTL_MAP_UTIL_H_ +#define UTIL_GTL_MAP_UTIL_H_ + +#include +#include +using std::string; +#include +using std::make_pair; +using std::pair; +#include +using std::vector; + +#include + +#include "kudu/gutil/logging-inl.h" + +// +// Find*() +// + +// Returns a const reference to the value associated with the given key if it +// exists. Crashes otherwise. +// +// This is intended as a replacement for operator[] as an rvalue (for reading) +// when the key is guaranteed to exist. +// +// operator[] for lookup is discouraged for several reasons: +// * It has a side-effect of inserting missing keys +// * It is not thread-safe (even when it is not inserting, it can still +// choose to resize the underlying storage) +// * It invalidates iterators (when it chooses to resize) +// * It default constructs a value object even if it doesn't need to +// +// This version assumes the key is printable, and includes it in the fatal log +// message. 
// Returns a const reference to the value associated with the given key.
// Crashes (CHECK-fails) otherwise.  Intended as a replacement for
// operator[] as an rvalue when the key is known to exist: unlike
// operator[], it never inserts, never resizes (so it cannot invalidate
// iterators), and never default-constructs a value.  This version
// assumes the key is printable and includes it in the fatal log message.
template <class Collection>
const typename Collection::value_type::second_type&
FindOrDie(const Collection& collection,
          const typename Collection::value_type::first_type& key) {
  auto it = collection.find(key);
  CHECK(it != collection.end()) << "Map key not found: " << key;
  return it->second;
}

// Same as above, but returns a non-const reference.
template <class Collection>
typename Collection::value_type::second_type&
FindOrDie(Collection& collection,  // NOLINT
          const typename Collection::value_type::first_type& key) {
  auto it = collection.find(key);
  CHECK(it != collection.end()) << "Map key not found: " << key;
  return it->second;
}

// Same as FindOrDie above, but doesn't log the key on failure (for keys
// that are not printable).
template <class Collection>
const typename Collection::value_type::second_type&
FindOrDieNoPrint(const Collection& collection,
                 const typename Collection::value_type::first_type& key) {
  typename Collection::const_iterator it = collection.find(key);
  CHECK(it != collection.end()) << "Map key not found";
  return it->second;
}

// Same as above, but returns a non-const reference.
template <class Collection>
typename Collection::value_type::second_type&
FindOrDieNoPrint(Collection& collection,  // NOLINT
                 const typename Collection::value_type::first_type& key) {
  typename Collection::iterator it = collection.find(key);
  CHECK(it != collection.end()) << "Map key not found";
  return it->second;
}

// Returns a const reference to the value associated with the given key if
// it exists, otherwise a const reference to the provided default value.
//
// WARNING: if a temporary is passed as the default "value", the returned
// reference dangles at the end of the full statement.  In particular,
// when a char* default is passed into a map with string values, either
// use the result immediately or copy it into a string (not a string&).
template <class Collection>
const typename Collection::value_type::second_type&
FindWithDefault(const Collection& collection,
                const typename Collection::value_type::first_type& key,
                const typename Collection::value_type::second_type& value) {
  auto it = collection.find(key);
  if (it == collection.end()) {
    return value;
  }
  return it->second;
}

// Returns a pointer to the const value associated with the given key, or
// nullptr if the key is absent.
template <class Collection>
const typename Collection::value_type::second_type*
FindOrNull(const Collection& collection,
           const typename Collection::value_type::first_type& key) {
  auto it = collection.find(key);
  if (it == collection.end()) {
    return nullptr;  // was the literal 0; nullptr is the C++11 idiom
  }
  return &it->second;
}

// Same as above but returns a pointer to the non-const value.
template <class Collection>
typename Collection::value_type::second_type*
FindOrNull(Collection& collection,  // NOLINT
           const typename Collection::value_type::first_type& key) {
  auto it = collection.find(key);
  if (it == collection.end()) {
    return nullptr;
  }
  return &it->second;
}

// Returns a pointer to the const value associated with the greatest key
// that is less than or equal to the given key, or nullptr if no such key
// exists.  Requires an ordered container exposing upper_bound().
template <class Collection>
const typename Collection::value_type::second_type*
FindFloorOrNull(const Collection& collection,
                const typename Collection::value_type::first_type& key) {
  auto it = collection.upper_bound(key);
  if (it == collection.begin()) {
    return nullptr;
  }
  return &(--it)->second;
}

// Same as above but returns a pointer to the non-const value.
template <class Collection>
typename Collection::value_type::second_type*
FindFloorOrNull(Collection& collection,  // NOLINT
                const typename Collection::value_type::first_type& key) {
  auto it = collection.upper_bound(key);
  if (it == collection.begin()) {
    return nullptr;
  }
  return &(--it)->second;
}

// Returns the pointer value associated with the given key, or a null
// pointer if the key is absent.  Designed for maps whose mapped_type is a
// pointer.  Note: this cannot distinguish a missing key from a key that
// is mapped to a null pointer.
template <class Collection>
typename Collection::value_type::second_type
FindPtrOrNull(const Collection& collection,
              const typename Collection::value_type::first_type& key) {
  auto it = collection.find(key);
  if (it == collection.end()) {
    return typename Collection::value_type::second_type(0);
  }
  return it->second;
}

// Same as above, except takes a non-const reference to the collection.
// Needed for containers that propagate constness to the pointee, such as
// boost::ptr_map.
template <class Collection>
typename Collection::value_type::second_type
FindPtrOrNull(Collection& collection,  // NOLINT
              const typename Collection::value_type::first_type& key) {
  auto it = collection.find(key);
  if (it == collection.end()) {
    return typename Collection::value_type::second_type(0);
  }
  return it->second;
}

// Finds the value associated with the given key and copies it to *value
// (if value is non-null).  Returns false iff the key was not found.
template <class Collection, class Key, class Value>
bool FindCopy(const Collection& collection,
              const Key& key,
              Value* const value) {
  auto it = collection.find(key);
  if (it == collection.end()) {
    return false;
  }
  if (value) {
    *value = it->second;
  }
  return true;
}

//
// Contains*()
//

// Returns true iff the given collection contains the given key.
template <class Collection, class Key>
bool ContainsKey(const Collection& collection, const Key& key) {
  return collection.find(key) != collection.end();
}
// Returns true iff the given collection contains the given key-value
// pair.  Uses equal_range(), so it also works with multimaps: every
// entry with an equal key is examined.
template <class Collection, class Key, class Value>
bool ContainsKeyValuePair(const Collection& collection,
                          const Key& key,
                          const Value& value) {
  typedef typename Collection::const_iterator const_iterator;
  std::pair<const_iterator, const_iterator> range =
      collection.equal_range(key);
  for (const_iterator it = range.first; it != range.second; ++it) {
    if (it->second == value) {
      return true;
    }
  }
  return false;
}

//
// Insert*()
//

// Inserts the given key-value pair into the collection.  Returns true if
// the key was newly inserted; if the key already existed, its value is
// overwritten with vt.second and false is returned.
template <class Collection>
bool InsertOrUpdate(Collection* const collection,
                    const typename Collection::value_type& vt) {
  std::pair<typename Collection::iterator, bool> ret = collection->insert(vt);
  if (!ret.second) {
    ret.first->second = vt.second;  // key existed: update in place
    return false;
  }
  return true;
}

// Same as above, except that the key and value are passed separately.
template <class Collection>
bool InsertOrUpdate(Collection* const collection,
                    const typename Collection::value_type::first_type& key,
                    const typename Collection::value_type::second_type& value) {
  return InsertOrUpdate(
      collection, typename Collection::value_type(key, value));
}

// Inserts/updates all the key-value pairs from the range defined by the
// iterators "first" and "last" into the given collection.
template <class Collection, class InputIterator>
void InsertOrUpdateMany(Collection* const collection,
                        InputIterator first, InputIterator last) {
  for (; first != last; ++first) {
    InsertOrUpdate(collection, *first);
  }
}

// For a map of the form map<Key, Value*> that owns the objects its values
// point to: associates "value" with "key", deleting any previously stored
// pointer for that key.  Returns true on a plain insert, false on an
// update (in which case the old value was deleted).
template <class Collection>
bool InsertAndDeleteExisting(
    Collection* const collection,
    const typename Collection::value_type::first_type& key,
    const typename Collection::value_type::second_type& value) {
  std::pair<typename Collection::iterator, bool> ret =
      collection->insert(typename Collection::value_type(key, value));
  if (!ret.second) {
    delete ret.first->second;  // reclaim the pointer being replaced
    ret.first->second = value;
    return false;
  }
  return true;
}

// Inserts the given key-value pair iff the key did NOT already exist; an
// existing key's value is left unchanged.  Returns true iff the pair was
// inserted.
template <class Collection>
bool InsertIfNotPresent(Collection* const collection,
                        const typename Collection::value_type& vt) {
  return collection->insert(vt).second;
}

// Same as above except the key and value are passed separately.
template <class Collection>
bool InsertIfNotPresent(
    Collection* const collection,
    const typename Collection::value_type::first_type& key,
    const typename Collection::value_type::second_type& value) {
  return InsertIfNotPresent(
      collection, typename Collection::value_type(key, value));
}

// Same as above except dies (CHECK-fails) if the value already exists in
// the collection; the offending value is included in the log message.
template <class Collection>
void InsertOrDie(Collection* const collection,
                 const typename Collection::value_type& value) {
  CHECK(InsertIfNotPresent(collection, value)) << "duplicate value: " << value;
}

// Same as above except doesn't log the value on error (for values that
// are not printable).
template <class Collection>
void InsertOrDieNoPrint(Collection* const collection,
                        const typename Collection::value_type& value) {
  CHECK(InsertIfNotPresent(collection, value)) << "duplicate value.";
}
// Inserts the key-value pair into the collection.  Dies (CHECK-fails),
// logging the key, if the key was already present.
template <class Collection>
void InsertOrDie(Collection* const collection,
                 const typename Collection::value_type::first_type& key,
                 const typename Collection::value_type::second_type& data) {
  CHECK(InsertIfNotPresent(collection, key, data))
      << "duplicate key: " << key;
}

// Same as above except doesn't log the key on error (for keys that are
// not printable).
template <class Collection>
void InsertOrDieNoPrint(
    Collection* const collection,
    const typename Collection::value_type::first_type& key,
    const typename Collection::value_type::second_type& data) {
  CHECK(InsertIfNotPresent(collection, key, data)) << "duplicate key.";
}

// Inserts a new key with a default-initialized value.  Dies if the key
// was already present.  Returns a reference to the new value.  Example:
//
//   map<int, SomeProto> m;
//   SomeProto& proto = InsertKeyOrDie(&m, 3);
//   proto.set_field("foo");
template <class Collection>
typename Collection::value_type::second_type& InsertKeyOrDie(
    Collection* const collection,
    const typename Collection::value_type::first_type& key) {
  typedef typename Collection::value_type value_type;
  std::pair<typename Collection::iterator, bool> res =
      collection->insert(value_type(key, typename value_type::second_type()));
  CHECK(res.second) << "duplicate key: " << key;
  return res.first->second;
}

//
// Lookup*()
//

// Inserts the key-value pair if the key is not already present, then
// returns a reference to the value associated with the key (existing or
// freshly inserted).
template <class Collection>
typename Collection::value_type::second_type&
LookupOrInsert(Collection* const collection,
               const typename Collection::value_type& vt) {
  return collection->insert(vt).first->second;
}

// Same as above except the key and value are passed separately.
template <class Collection>
typename Collection::value_type::second_type&
LookupOrInsert(Collection* const collection,
               const typename Collection::value_type::first_type& key,
               const typename Collection::value_type::second_type& value) {
  return LookupOrInsert(
      collection, typename Collection::value_type(key, value));
}

// Counts the equivalent elements of "sequence" into "count_map", adding
// "increment" per occurrence (element as key, accumulated count as
// value).
//
// Example:
//   vector<string> v = {"a", "b", "c", "a", "b"};
//   map<string, int> m;
//   AddTokenCounts(v, 1, &m);
//   assert(m["a"] == 2);
//   assert(m["b"] == 2);
//   assert(m["c"] == 1);
template <typename Sequence, typename Collection>
void AddTokenCounts(
    const Sequence& sequence,
    const typename Collection::value_type::second_type& increment,
    Collection* const count_map) {
  for (typename Sequence::const_iterator it = sequence.begin();
       it != sequence.end(); ++it) {
    typename Collection::value_type::second_type& value =
        LookupOrInsert(count_map, *it,
                       typename Collection::value_type::second_type());
    value += increment;
  }
}

// Helpers for LookupOrInsertNew(): given a location of pointer type
// (Value**), extract the pointee type Value and heap-allocate a new
// instance into *location.
template <class T>
void MapUtilAssignNewDefaultInstance(T** location) {
  *location = new T();
}

template <class T, class Arg>
void MapUtilAssignNewInstance(T** location, const Arg& arg) {
  *location = new T(arg);
}
// For containers of the form map<Key, Value*>: returns a reference to
// the pointer associated with "key".  If the key was absent, a new Value
// is default-constructed on the heap and its address is stored first.
template <class Collection>
typename Collection::value_type::second_type&
LookupOrInsertNew(Collection* const collection,
                  const typename Collection::value_type::first_type& key) {
  typedef typename Collection::value_type::second_type ValuePtr;
  std::pair<typename Collection::iterator, bool> ret =
      collection->insert(typename Collection::value_type(
          key, static_cast<ValuePtr>(nullptr)));
  if (ret.second) {
    // The helper 'extracts' the pointee type Value from ValuePtr
    // (which is Value*).
    MapUtilAssignNewDefaultInstance(&(ret.first->second));
  }
  return ret.first->second;
}

// Same as above but constructs the value using the single-argument
// constructor and the given "arg".
template <class Collection, class Arg>
typename Collection::value_type::second_type&
LookupOrInsertNew(Collection* const collection,
                  const typename Collection::value_type::first_type& key,
                  const Arg& arg) {
  typedef typename Collection::value_type::second_type ValuePtr;
  std::pair<typename Collection::iterator, bool> ret =
      collection->insert(typename Collection::value_type(
          key, static_cast<ValuePtr>(nullptr)));
  if (ret.second) {
    MapUtilAssignNewInstance(&(ret.first->second), arg);
  }
  return ret.first->second;
}

// Shared-pointer lookup: use LookupOrInsertNewSharedPtr when the
// container does not own its elements for their whole lifetime — e.g.
// when a reader allows parallel updates to the container.  A mutex then
// only needs to guard container operations, while element operations go
// through the shared pointer.  Find such elements with FindPtr*(); do
// not use FindLinkedPtr*() even though it compiles.

// For a map whose mapped_type is a shared_ptr: returns a reference to
// the shared_ptr stored under "key", first resetting it to a new
// default-constructed element if the key was absent.
// Value::element_type must be default-constructible.
template <class Collection>
typename Collection::value_type::second_type&
LookupOrInsertNewSharedPtr(
    Collection* const collection,
    const typename Collection::value_type::first_type& key) {
  typedef typename Collection::value_type::second_type SharedPtr;
  typedef typename Collection::value_type::second_type::element_type Element;
  std::pair<typename Collection::iterator, bool> ret =
      collection->insert(typename Collection::value_type(key, SharedPtr()));
  if (ret.second) {
    ret.first->second.reset(new Element());
  }
  return ret.first->second;
}

// Variant of the above constructing the element with a single-parameter
// constructor.  Note: "arg" is evaluated even when no construction
// occurs, so only pass values that are cheap to compute; the stored
// element itself is only constructed when actually needed.
template <class Collection, class Arg>
typename Collection::value_type::second_type&
LookupOrInsertNewSharedPtr(
    Collection* const collection,
    const typename Collection::value_type::first_type& key,
    const Arg& arg) {
  typedef typename Collection::value_type::second_type SharedPtr;
  typedef typename Collection::value_type::second_type::element_type Element;
  std::pair<typename Collection::iterator, bool> ret =
      collection->insert(typename Collection::value_type(key, SharedPtr()));
  if (ret.second) {
    ret.first->second.reset(new Element(arg));
  }
  return ret.first->second;
}
//
// Misc Utility Functions
//

// Updates the value associated with "key".  If the key was absent, the
// pair is inserted, *previous is left unchanged, and false is returned.
// If the key was present, a copy of the old value is stored in
// *previous (when previous is non-null), the value is overwritten, and
// true is returned.  InsertOrReturnExisting below is the complementary
// operation that returns the address of an existing value instead of
// updating it.
template <class Collection>
bool UpdateReturnCopy(Collection* const collection,
                      const typename Collection::value_type::first_type& key,
                      const typename Collection::value_type::second_type& value,
                      typename Collection::value_type::second_type* previous) {
  std::pair<typename Collection::iterator, bool> ret =
      collection->insert(typename Collection::value_type(key, value));
  if (!ret.second) {
    // Key existed: capture the old value, then update.
    if (previous) {
      *previous = ret.first->second;
    }
    ret.first->second = value;
    return true;
  }
  return false;
}

// Same as above except that the key and value are passed as a pair.
template <class Collection>
bool UpdateReturnCopy(Collection* const collection,
                      const typename Collection::value_type& vt,
                      typename Collection::value_type::second_type* previous) {
  std::pair<typename Collection::iterator, bool> ret = collection->insert(vt);
  if (!ret.second) {
    if (previous) {
      *previous = ret.first->second;
    }
    ret.first->second = vt.second;
    return true;
  }
  return false;
}

// Tries to insert the given key-value pair.  Returns null if the insert
// succeeded; otherwise returns a pointer to the existing value.  This
// lets a caller verify-then-update with a single lookup, and (unlike
// UpdateReturnCopy) has no undefined *previous when new data was
// inserted.
template <class Collection>
typename Collection::value_type::second_type* const
InsertOrReturnExisting(Collection* const collection,
                       const typename Collection::value_type& vt) {
  std::pair<typename Collection::iterator, bool> ret = collection->insert(vt);
  if (ret.second) {
    return nullptr;  // inserted; no previous value existed
  }
  return &ret.first->second;  // address of the already-existing value
}

// Same as above, except for explicit key and data.
template <class Collection>
typename Collection::value_type::second_type* const
InsertOrReturnExisting(
    Collection* const collection,
    const typename Collection::value_type::first_type& key,
    const typename Collection::value_type::second_type& data) {
  return InsertOrReturnExisting(collection,
                                typename Collection::value_type(key, data));
}

// Stores the (value -> key) reverse mapping of "collection" into
// *reverse.  Pairs are inserted in iteration order, so for duplicate
// values the last key wins (InsertOrUpdate semantics).
template <class Collection, class ReverseCollection>
void ReverseMap(const Collection& collection,
                ReverseCollection* const reverse) {
  CHECK(reverse != nullptr);
  for (typename Collection::const_iterator it = collection.begin();
       it != collection.end();
       ++it) {
    InsertOrUpdate(reverse, it->second, it->first);
  }
}

// Erases the entry for "key" and returns the pointer value that was
// stored there, or null if the key was absent.  Assumes the mapped_type
// is a pointer; cannot distinguish a missing key from a stored null.
//
// Examples (map<string, MyType*> my_map):
//   One-line cleanup:
//     delete EraseKeyReturnValuePtr(&my_map, "abc");
//   Use the returned value:
//     gscoped_ptr<MyType> value_ptr(EraseKeyReturnValuePtr(&my_map, "abc"));
//     if (value_ptr.get())
//       value_ptr->DoSomething();
template <class Collection>
typename Collection::value_type::second_type EraseKeyReturnValuePtr(
    Collection* const collection,
    const typename Collection::value_type::first_type& key) {
  auto it = collection->find(key);
  if (it == collection->end()) {
    return nullptr;
  }
  typename Collection::value_type::second_type v = it->second;
  collection->erase(it);
  return v;
}
+template +void InsertKeysFromMap(const MapContainer& map_container, + KeyContainer* key_container) { + CHECK(key_container != NULL); + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + key_container->insert(it->first); + } +} + +// Appends all the keys from map_container into key_container, which must +// support push_back(MapContainer::key_type). +// +// Note: any initial contents of the key_container are not cleared. +template +void AppendKeysFromMap(const MapContainer& map_container, + KeyContainer* key_container) { + CHECK(key_container != NULL); + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + key_container->push_back(it->first); + } +} + +// A more specialized overload of AppendKeysFromMap to optimize reallocations +// for the common case in which we're appending keys to a vector and hence can +// (and sometimes should) call reserve() first. +// +// (It would be possible to play SFINAE games to call reserve() for any +// container that supports it, but this seems to get us 99% of what we need +// without the complexity of a SFINAE-based solution.) +template +void AppendKeysFromMap(const MapContainer& map_container, + vector* key_container) { + CHECK(key_container != NULL); + // We now have the opportunity to call reserve(). Calling reserve() every + // time is a bad idea for some use cases: libstdc++'s implementation of + // vector<>::reserve() resizes the vector's backing store to exactly the + // given size (unless it's already at least that big). Because of this, + // the use case that involves appending a lot of small maps (total size + // N) one by one to a vector would be O(N^2). But never calling reserve() + // loses the opportunity to improve the use case of adding from a large + // map to an empty vector (this improves performance by up to 33%). A + // number of heuristics are possible; see the discussion in + // cl/34081696. 
Here we use the simplest one. + if (key_container->empty()) { + key_container->reserve(map_container.size()); + } + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + key_container->push_back(it->first); + } +} + +// Inserts all the values from map_container into value_container, which must +// support push_back(MapContainer::mapped_type). +// +// Note: any initial contents of the value_container are not cleared. +template +void AppendValuesFromMap(const MapContainer& map_container, + ValueContainer* value_container) { + CHECK(value_container != NULL); + for (typename MapContainer::const_iterator it = map_container.begin(); + it != map_container.end(); ++it) { + value_container->push_back(it->second); + } +} + +// A more specialized overload of AppendValuesFromMap to optimize reallocations +// for the common case in which we're appending values to a vector and hence +// can (and sometimes should) call reserve() first. +// +// (It would be possible to play SFINAE games to call reserve() for any +// container that supports it, but this seems to get us 99% of what we need +// without the complexity of a SFINAE-based solution.) +template +void AppendValuesFromMap(const MapContainer& map_container, + vector* value_container) { + CHECK(value_container != NULL); + // See AppendKeysFromMap for why this is done. + if (value_container->empty()) { + value_container->reserve(map_container.size()); + } + for (const auto& entry : map_container) { + value_container->push_back(entry.second); + } +} + +#endif // UTIL_GTL_MAP_UTIL_H_ diff --git a/src/kudu/gutil/mathlimits.cc b/src/kudu/gutil/mathlimits.cc new file mode 100644 index 000000000000..dcc261d4944a --- /dev/null +++ b/src/kudu/gutil/mathlimits.cc @@ -0,0 +1,123 @@ +// Copyright 2005 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// +// + +#include "kudu/gutil/mathlimits.h" + +#include "kudu/gutil/integral_types.h" + +// MSVC++ 2005 thinks the header declaration was a definition, and +// erroneously flags these as a duplicate definition. +#ifdef _MSC_VER + +#define DEF_COMMON_LIMITS(Type) +#define DEF_UNSIGNED_INT_LIMITS(Type) +#define DEF_SIGNED_INT_LIMITS(Type) +#define DEF_PRECISION_LIMITS(Type) + +#else + +#define DEF_COMMON_LIMITS(Type) \ +const bool MathLimits::kIsSigned; \ +const bool MathLimits::kIsInteger; \ +const int MathLimits::kMin10Exp; \ +const int MathLimits::kMax10Exp; + +#define DEF_UNSIGNED_INT_LIMITS(Type) \ +DEF_COMMON_LIMITS(Type) \ +const Type MathLimits::kPosMin; \ +const Type MathLimits::kPosMax; \ +const Type MathLimits::kMin; \ +const Type MathLimits::kMax; \ +const Type MathLimits::kEpsilon; \ +const Type MathLimits::kStdError; + +#define DEF_SIGNED_INT_LIMITS(Type) \ +DEF_UNSIGNED_INT_LIMITS(Type) \ +const Type MathLimits::kNegMin; \ +const Type MathLimits::kNegMax; + +#define DEF_PRECISION_LIMITS(Type) \ +const int MathLimits::kPrecisionDigits; + +#endif // not _MSC_VER + +// http://en.wikipedia.org/wiki/Quadruple_precision_floating-point_format#Double-double_arithmetic +// With some compilers (gcc 4.6.x) on some platforms (powerpc64), +// "long double" is 
implemented as a pair of double: "double double" format. +// This causes a problem with epsilon (eps). +// eps is the smallest positive number such that 1.0 + eps > 1.0 +// +// Normal format: 1.0 + e = 1.0...01 // N-1 zeros for N fraction bits +// D-D format: 1.0 + e = 1.000...0001 // epsilon can be very small +// +// In the normal format, 1.0 + e has to fit in one stretch of bits. +// The maximum rounding error is half of eps. +// +// In the double-double format, 1.0 + e splits across two doubles: +// 1.0 in the high double, e in the low double, and they do not have to +// be contiguous. The maximum rounding error on a value close to 1.0 is +// much larger than eps. +// +// Some code checks for errors by comparing a computed value to a golden +// value +/- some multiple of the maximum rounding error. The maximum +// rounding error is not available so we use eps as an approximation +// instead. That fails when long double is in the double-double format. +// Therefore, we define kStdError as a multiple of +// max(DBL_EPSILON * DBL_EPSILON, kEpsilon) rather than a multiple of kEpsilon. + +#define DEF_FP_LIMITS(Type, PREFIX) \ +DEF_COMMON_LIMITS(Type) \ +const Type MathLimits::kPosMin = PREFIX##_MIN; \ +const Type MathLimits::kPosMax = PREFIX##_MAX; \ +const Type MathLimits::kMin = -MathLimits::kPosMax; \ +const Type MathLimits::kMax = MathLimits::kPosMax; \ +const Type MathLimits::kNegMin = -MathLimits::kPosMin; \ +const Type MathLimits::kNegMax = -MathLimits::kPosMax; \ +const Type MathLimits::kEpsilon = PREFIX##_EPSILON; \ +/* 32 is 5 bits of mantissa error; should be adequate for common errors */ \ +const Type MathLimits::kStdError = \ + 32 * (DBL_EPSILON * DBL_EPSILON > MathLimits::kEpsilon \ + ? 
DBL_EPSILON * DBL_EPSILON : MathLimits::kEpsilon); \ +DEF_PRECISION_LIMITS(Type) \ +const Type MathLimits::kNaN = HUGE_VAL - HUGE_VAL; \ +const Type MathLimits::kPosInf = HUGE_VAL; \ +const Type MathLimits::kNegInf = -HUGE_VAL; + +DEF_SIGNED_INT_LIMITS(int8) +DEF_SIGNED_INT_LIMITS(int16) +DEF_SIGNED_INT_LIMITS(int32) +DEF_SIGNED_INT_LIMITS(int64) +DEF_UNSIGNED_INT_LIMITS(uint8) +DEF_UNSIGNED_INT_LIMITS(uint16) +DEF_UNSIGNED_INT_LIMITS(uint32) +DEF_UNSIGNED_INT_LIMITS(uint64) + +DEF_FP_LIMITS(float, FLT) +DEF_FP_LIMITS(double, DBL) +DEF_FP_LIMITS(long double, LDBL); + +#undef DEF_COMMON_LIMITS +#undef DEF_SIGNED_INT_LIMITS +#undef DEF_UNSIGNED_INT_LIMITS +#undef DEF_FP_LIMITS +#undef DEF_PRECISION_LIMITS diff --git a/src/kudu/gutil/mathlimits.h b/src/kudu/gutil/mathlimits.h new file mode 100644 index 000000000000..9d69733351c1 --- /dev/null +++ b/src/kudu/gutil/mathlimits.h @@ -0,0 +1,256 @@ +// Copyright 2005 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// +// +// Useful integer and floating point limits and type traits. +// +// This partially replaces/duplictes numeric_limits<> from . 
+// We get a Google-style class that we have a greater control over +// and thus can add new features to it or fix whatever happens to be broken in +// numeric_limits for the compilers we use. +// + +#ifndef UTIL_MATH_MATHLIMITS_H__ +#define UTIL_MATH_MATHLIMITS_H__ + +#include +#include +#include +#include + +// ========================================================================= // + +// Useful integer and floating point limits and type traits. +// This is just for the documentation; +// real members are defined in our specializations below. +template struct MathLimits { + // Type name. + typedef T Type; + // Unsigned version of the Type with the same byte size. + // Same as Type for floating point and unsigned types. + typedef T UnsignedType; + // If the type supports negative values. + static const bool kIsSigned; + // If the type supports only integer values. + static const bool kIsInteger; + // Magnitude-wise smallest representable positive value. + static const Type kPosMin; + // Magnitude-wise largest representable positive value. + static const Type kPosMax; + // Smallest representable value. + static const Type kMin; + // Largest representable value. + static const Type kMax; + // Magnitude-wise smallest representable negative value. + // Present only if kIsSigned. + static const Type kNegMin; + // Magnitude-wise largest representable negative value. + // Present only if kIsSigned. + static const Type kNegMax; + // Smallest integer x such that 10^x is representable. + static const int kMin10Exp; + // Largest integer x such that 10^x is representable. + static const int kMax10Exp; + // Smallest positive value such that Type(1) + kEpsilon != Type(1) + static const Type kEpsilon; + // Typical rounding error that is enough to cover + // a few simple floating-point operations. + // Slightly larger than kEpsilon to account for a few rounding errors. + // Is zero if kIsInteger. 
+ static const Type kStdError; + // Number of decimal digits of mantissa precision. + // Present only if !kIsInteger. + static const int kPrecisionDigits; + // Not a number, i.e. result of 0/0. + // Present only if !kIsInteger. + static const Type kNaN; + // Positive infinity, i.e. result of 1/0. + // Present only if !kIsInteger. + static const Type kPosInf; + // Negative infinity, i.e. result of -1/0. + // Present only if !kIsInteger. + static const Type kNegInf; + + // NOTE: Special floating point values behave + // in a special (but mathematically-logical) way + // in terms of (in)equalty comparison and mathematical operations + // -- see out unittest for examples. + + // Special floating point value testers. + // Present in integer types for convenience. + static bool IsFinite(const Type x); + static bool IsNaN(const Type x); + static bool IsInf(const Type x); + static bool IsPosInf(const Type x); + static bool IsNegInf(const Type x); +}; + +// ========================================================================= // + +// All #define-s below are simply to refactor the declarations of +// MathLimits template specializations. +// They are all #undef-ined below. + +// The hoop-jumping in *_INT_(MAX|MIN) below is so that the compiler does not +// get an overflow while computing the constants. + +#define SIGNED_INT_MAX(Type) \ + (((Type(1) << (sizeof(Type)*8 - 2)) - 1) + (Type(1) << (sizeof(Type)*8 - 2))) + +#define SIGNED_INT_MIN(Type) \ + (-(Type(1) << (sizeof(Type)*8 - 2)) - (Type(1) << (sizeof(Type)*8 - 2))) + +#define UNSIGNED_INT_MAX(Type) \ + (((Type(1) << (sizeof(Type)*8 - 1)) - 1) + (Type(1) << (sizeof(Type)*8 - 1))) + +// Compile-time selected log10-related constants for integer types. +#define SIGNED_MAX_10_EXP(Type) \ + (sizeof(Type) == 1 ? 2 : ( \ + sizeof(Type) == 2 ? 4 : ( \ + sizeof(Type) == 4 ? 9 : ( \ + sizeof(Type) == 8 ? 18 : -1)))) + +#define UNSIGNED_MAX_10_EXP(Type) \ + (sizeof(Type) == 1 ? 2 : ( \ + sizeof(Type) == 2 ? 
4 : ( \ + sizeof(Type) == 4 ? 9 : ( \ + sizeof(Type) == 8 ? 19 : -1)))) + +#define DECL_INT_LIMIT_FUNCS \ + static bool IsFinite(const Type x) { return true; } \ + static bool IsNaN(const Type x) { return false; } \ + static bool IsInf(const Type x) { return false; } \ + static bool IsPosInf(const Type x) { return false; } \ + static bool IsNegInf(const Type x) { return false; } + +#define DECL_SIGNED_INT_LIMITS(IntType, UnsignedIntType) \ +template<> \ +struct MathLimits { \ + typedef IntType Type; \ + typedef UnsignedIntType UnsignedType; \ + static const bool kIsSigned = true; \ + static const bool kIsInteger = true; \ + static const Type kPosMin = 1; \ + static const Type kPosMax = SIGNED_INT_MAX(Type); \ + static const Type kMin = SIGNED_INT_MIN(Type); \ + static const Type kMax = kPosMax; \ + static const Type kNegMin = -1; \ + static const Type kNegMax = kMin; \ + static const int kMin10Exp = 0; \ + static const int kMax10Exp = SIGNED_MAX_10_EXP(Type); \ + static const Type kEpsilon = 1; \ + static const Type kStdError = 0; \ + DECL_INT_LIMIT_FUNCS \ +}; + +#define DECL_UNSIGNED_INT_LIMITS(IntType) \ +template<> \ +struct MathLimits { \ + typedef IntType Type; \ + typedef IntType UnsignedType; \ + static const bool kIsSigned = false; \ + static const bool kIsInteger = true; \ + static const Type kPosMin = 1; \ + static const Type kPosMax = UNSIGNED_INT_MAX(Type); \ + static const Type kMin = 0; \ + static const Type kMax = kPosMax; \ + static const int kMin10Exp = 0; \ + static const int kMax10Exp = UNSIGNED_MAX_10_EXP(Type); \ + static const Type kEpsilon = 1; \ + static const Type kStdError = 0; \ + DECL_INT_LIMIT_FUNCS \ +}; + +DECL_SIGNED_INT_LIMITS(signed char, unsigned char) +DECL_SIGNED_INT_LIMITS(signed short int, unsigned short int) +DECL_SIGNED_INT_LIMITS(signed int, unsigned int) +DECL_SIGNED_INT_LIMITS(signed long int, unsigned long int) +DECL_SIGNED_INT_LIMITS(signed long long int, unsigned long long int) +DECL_UNSIGNED_INT_LIMITS(unsigned char) 
+DECL_UNSIGNED_INT_LIMITS(unsigned short int) +DECL_UNSIGNED_INT_LIMITS(unsigned int) +DECL_UNSIGNED_INT_LIMITS(unsigned long int) +DECL_UNSIGNED_INT_LIMITS(unsigned long long int) + +#undef DECL_SIGNED_INT_LIMITS +#undef DECL_UNSIGNED_INT_LIMITS +#undef SIGNED_INT_MAX +#undef SIGNED_INT_MIN +#undef UNSIGNED_INT_MAX +#undef SIGNED_MAX_10_EXP +#undef UNSIGNED_MAX_10_EXP +#undef DECL_INT_LIMIT_FUNCS + +// ========================================================================= // +#ifdef WIN32 // Lacks built-in isnan() and isinf() +#define DECL_FP_LIMIT_FUNCS \ + static bool IsFinite(const Type x) { return _finite(x); } \ + static bool IsNaN(const Type x) { return _isnan(x); } \ + static bool IsInf(const Type x) { return (_fpclass(x) & (_FPCLASS_NINF | _FPCLASS_PINF)) != 0; } \ + static bool IsPosInf(const Type x) { return _fpclass(x) == _FPCLASS_PINF; } \ + static bool IsNegInf(const Type x) { return _fpclass(x) == _FPCLASS_NINF; } +#else +#define DECL_FP_LIMIT_FUNCS \ + static bool IsFinite(const Type x) { return !std::isinf(x) && !std::isnan(x); } \ + static bool IsNaN(const Type x) { return std::isnan(x); } \ + static bool IsInf(const Type x) { return std::isinf(x); } \ + static bool IsPosInf(const Type x) { return std::isinf(x) && x > 0; } \ + static bool IsNegInf(const Type x) { return std::isinf(x) && x < 0; } +#endif + +// We can't put floating-point constant values in the header here because +// such constants are not considered to be primitive-type constants by gcc. +// CAVEAT: Hence, they are going to be initialized only during +// the global objects construction time. 
+#define DECL_FP_LIMITS(FP_Type, PREFIX) \ +template<> \ +struct MathLimits { \ + typedef FP_Type Type; \ + typedef FP_Type UnsignedType; \ + static const bool kIsSigned = true; \ + static const bool kIsInteger = false; \ + static const Type kPosMin; \ + static const Type kPosMax; \ + static const Type kMin; \ + static const Type kMax; \ + static const Type kNegMin; \ + static const Type kNegMax; \ + static const int kMin10Exp = PREFIX##_MIN_10_EXP; \ + static const int kMax10Exp = PREFIX##_MAX_10_EXP; \ + static const Type kEpsilon; \ + static const Type kStdError; \ + static const int kPrecisionDigits = PREFIX##_DIG; \ + static const Type kNaN; \ + static const Type kPosInf; \ + static const Type kNegInf; \ + DECL_FP_LIMIT_FUNCS \ +}; + +DECL_FP_LIMITS(float, FLT) +DECL_FP_LIMITS(double, DBL) +DECL_FP_LIMITS(long double, LDBL) + +#undef DECL_FP_LIMITS +#undef DECL_FP_LIMIT_FUNCS + +// ========================================================================= // + +#endif // UTIL_MATH_MATHLIMITS_H__ diff --git a/src/kudu/gutil/move.h b/src/kudu/gutil/move.h new file mode 100644 index 000000000000..1c67155be1c7 --- /dev/null +++ b/src/kudu/gutil/move.h @@ -0,0 +1,218 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_MOVE_H_ +#define BASE_MOVE_H_ + +// Macro with the boilerplate that makes a type move-only in C++03. +// +// USAGE +// +// This macro should be used instead of DISALLOW_COPY_AND_ASSIGN to create +// a "move-only" type. Unlike DISALLOW_COPY_AND_ASSIGN, this macro should be +// the first line in a class declaration. 
+// +// A class using this macro must call .Pass() (or somehow be an r-value already) +// before it can be: +// +// * Passed as a function argument +// * Used as the right-hand side of an assignment +// * Returned from a function +// +// Each class will still need to define their own "move constructor" and "move +// operator=" to make this useful. Here's an example of the macro, the move +// constructor, and the move operator= from the scoped_ptr class: +// +// template +// class scoped_ptr { +// MOVE_ONLY_TYPE_FOR_CPP_03(scoped_ptr, RValue) +// public: +// scoped_ptr(RValue& other) : ptr_(other.release()) { } +// scoped_ptr& operator=(RValue& other) { +// swap(other); +// return *this; +// } +// }; +// +// Note that the constructor must NOT be marked explicit. +// +// For consistency, the second parameter to the macro should always be RValue +// unless you have a strong reason to do otherwise. It is only exposed as a +// macro parameter so that the move constructor and move operator= don't look +// like they're using a phantom type. +// +// +// HOW THIS WORKS +// +// For a thorough explanation of this technique, see: +// +// http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Move_Constructor +// +// The summary is that we take advantage of 2 properties: +// +// 1) non-const references will not bind to r-values. +// 2) C++ can apply one user-defined conversion when initializing a +// variable. +// +// The first lets us disable the copy constructor and assignment operator +// by declaring private version of them with a non-const reference parameter. +// +// For l-values, direct initialization still fails like in +// DISALLOW_COPY_AND_ASSIGN because the copy constructor and assignment +// operators are private. +// +// For r-values, the situation is different. The copy constructor and +// assignment operator are not viable due to (1), so we are trying to call +// a non-existent constructor and non-existing operator= rather than a private +// one. 
Since we have not committed an error quite yet, we can provide an +// alternate conversion sequence and a constructor. We add +// +// * a private struct named "RValue" +// * a user-defined conversion "operator RValue()" +// * a "move constructor" and "move operator=" that take the RValue& as +// their sole parameter. +// +// Only r-values will trigger this sequence and execute our "move constructor" +// or "move operator=." L-values will match the private copy constructor and +// operator= first giving a "private in this context" error. This combination +// gives us a move-only type. +// +// For signaling a destructive transfer of data from an l-value, we provide a +// method named Pass() which creates an r-value for the current instance +// triggering the move constructor or move operator=. +// +// Other ways to get r-values is to use the result of an expression like a +// function call. +// +// Here's an example with comments explaining what gets triggered where: +// +// class Foo { +// MOVE_ONLY_TYPE_FOR_CPP_03(Foo, RValue); +// +// public: +// ... API ... +// Foo(RValue other); // Move constructor. +// Foo& operator=(RValue rhs); // Move operator= +// }; +// +// Foo MakeFoo(); // Function that returns a Foo. +// +// Foo f; +// Foo f_copy(f); // ERROR: Foo(Foo&) is private in this context. +// Foo f_assign; +// f_assign = f; // ERROR: operator=(Foo&) is private in this context. +// +// +// Foo f(MakeFoo()); // R-value so alternate conversion executed. +// Foo f_copy(f.Pass()); // R-value so alternate conversion executed. +// f = f_copy.Pass(); // R-value so alternate conversion executed. +// +// +// IMPLEMENTATION SUBTLETIES WITH RValue +// +// The RValue struct is just a container for a pointer back to the original +// object. It should only ever be created as a temporary, and no external +// class should ever declare it or use it in a parameter. 
+// +// It is tempting to want to use the RValue type in function parameters, but +// excluding the limited usage here for the move constructor and move +// operator=, doing so would mean that the function could take both r-values +// and l-values equially which is unexpected. See COMPARED To Boost.Move for +// more details. +// +// An alternate, and incorrect, implementation of the RValue class used by +// Boost.Move makes RValue a fieldless child of the move-only type. RValue& +// is then used in place of RValue in the various operators. The RValue& is +// "created" by doing *reinterpret_cast(this). This has the appeal +// of never creating a temporary RValue struct even with optimizations +// disabled. Also, by virtue of inheritance you can treat the RValue +// reference as if it were the move-only type itself. Unfortunately, +// using the result of this reinterpret_cast<> is actually undefined behavior +// due to C++98 5.2.10.7. In certain compilers (e.g., NaCl) the optimizer +// will generate non-working code. +// +// In optimized builds, both implementations generate the same assembly so we +// choose the one that adheres to the standard. +// +// +// WHY HAVE typedef void MoveOnlyTypeForCPP03 +// +// Callback<>/Bind() needs to understand movable-but-not-copyable semantics +// to call .Pass() appropriately when it is expected to transfer the value. +// The cryptic typedef MoveOnlyTypeForCPP03 is added to make this check +// easy and automatic in helper templates for Callback<>/Bind(). +// See IsMoveOnlyType template and its usage in base/callback_internal.h +// for more details. +// +// +// COMPARED TO C++11 +// +// In C++11, you would implement this functionality using an r-value reference +// and our .Pass() method would be replaced with a call to std::move(). +// +// This emulation also has a deficiency where it uses up the single +// user-defined conversion allowed by C++ during initialization. This can +// cause problems in some API edge cases. 
For instance, in scoped_ptr, it is +// impossible to make a function "void Foo(scoped_ptr p)" accept a +// value of type scoped_ptr even if you add a constructor to +// scoped_ptr<> that would make it look like it should work. C++11 does not +// have this deficiency. +// +// +// COMPARED TO Boost.Move +// +// Our implementation similar to Boost.Move, but we keep the RValue struct +// private to the move-only type, and we don't use the reinterpret_cast<> hack. +// +// In Boost.Move, RValue is the boost::rv<> template. This type can be used +// when writing APIs like: +// +// void MyFunc(boost::rv& f) +// +// that can take advantage of rv<> to avoid extra copies of a type. However you +// would still be able to call this version of MyFunc with an l-value: +// +// Foo f; +// MyFunc(f); // Uh oh, we probably just destroyed |f| w/o calling Pass(). +// +// unless someone is very careful to also declare a parallel override like: +// +// void MyFunc(const Foo& f) +// +// that would catch the l-values first. This was declared unsafe in C++11 and +// a C++11 compiler will explicitly fail MyFunc(f). Unfortunately, we cannot +// ensure this in C++03. +// +// Since we have no need for writing such APIs yet, our implementation keeps +// RValue private and uses a .Pass() method to do the conversion instead of +// trying to write a version of "std::move()." Writing an API like std::move() +// would require the RValue struct to be public. +// +// +// CAVEATS +// +// If you include a move-only type as a field inside a class that does not +// explicitly declare a copy constructor, the containing class's implicit +// copy constructor will change from Containing(const Containing&) to +// Containing(Containing&). This can cause some unexpected errors. +// +// http://llvm.org/bugs/show_bug.cgi?id=11528 +// +// The workaround is to explicitly declare your copy constructor. 
+// +#define MOVE_ONLY_TYPE_FOR_CPP_03(type, rvalue_type) \ + private: \ + struct rvalue_type { \ + explicit rvalue_type(type* object) : object(object) {} \ + type* object; \ + }; \ + type(type&); \ + void operator=(type&); \ + public: \ + operator rvalue_type() { return rvalue_type(this); } \ + type Pass() { return type(rvalue_type(this)); } \ + typedef void MoveOnlyTypeForCPP03; \ + private: + +#endif // BASE_MOVE_H_ diff --git a/src/kudu/gutil/once.cc b/src/kudu/gutil/once.cc new file mode 100644 index 000000000000..1b97f8f295b8 --- /dev/null +++ b/src/kudu/gutil/once.cc @@ -0,0 +1,49 @@ +// Copyright 2008 Google Inc. All Rights Reserved. + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/once.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/spinlock_internal.h" + +// All modifications to a GoogleOnceType occur inside GoogleOnceInternalInit. +// The fast path reads the variable with an acquire-load.. +// This is safe provided we always perform a memory barrier +// immediately before setting the value to GOOGLE_ONCE_INTERNAL_DONE. + +void GoogleOnceInternalInit(Atomic32 *control, void (*func)(), + void (*func_with_arg)(void*), void* arg) { + if (DEBUG_MODE) { + int32 old_control = base::subtle::Acquire_Load(control); + if (old_control != GOOGLE_ONCE_INTERNAL_INIT && + old_control != GOOGLE_ONCE_INTERNAL_RUNNING && + old_control != GOOGLE_ONCE_INTERNAL_WAITER && + old_control != GOOGLE_ONCE_INTERNAL_DONE) { + LOG(FATAL) << "Either GoogleOnceType is used in non-static storage " + "(where GoogleOnceDynamic might be appropriate), " + "or there's a memory corruption."; + } + } + static const base::internal::SpinLockWaitTransition trans[] = { + { GOOGLE_ONCE_INTERNAL_INIT, GOOGLE_ONCE_INTERNAL_RUNNING, true }, + { GOOGLE_ONCE_INTERNAL_RUNNING, GOOGLE_ONCE_INTERNAL_WAITER, false }, + { GOOGLE_ONCE_INTERNAL_DONE, GOOGLE_ONCE_INTERNAL_DONE, true } + }; + // Short circuit the simplest case to avoid procedure call overhead. 
+ if (base::subtle::Acquire_CompareAndSwap(control, GOOGLE_ONCE_INTERNAL_INIT, + GOOGLE_ONCE_INTERNAL_RUNNING) == GOOGLE_ONCE_INTERNAL_INIT || + base::internal::SpinLockWait(control, ARRAYSIZE(trans), trans) == + GOOGLE_ONCE_INTERNAL_INIT) { + if (func != nullptr) { + (*func)(); + } else { + (*func_with_arg)(arg); + } + ANNOTATE_HAPPENS_BEFORE(control); + int32 old_control = base::subtle::NoBarrier_Load(control); + base::subtle::Release_Store(control, GOOGLE_ONCE_INTERNAL_DONE); + if (old_control == GOOGLE_ONCE_INTERNAL_WAITER) { + base::internal::SpinLockWake(control, true); + } + } // else *control is already GOOGLE_ONCE_INTERNAL_DONE +} diff --git a/src/kudu/gutil/once.h b/src/kudu/gutil/once.h new file mode 100644 index 000000000000..ff161c5ab88d --- /dev/null +++ b/src/kudu/gutil/once.h @@ -0,0 +1,119 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// +// The first call to GoogleOnceInit() with a particular GoogleOnceType +// argument will run the specified function. Other calls with the same +// argument will not run the function, but will wait for the provided +// function to finish running (if it is still running). This provides +// a safe, simple, and fast mechanism for one-time initialization in a +// multi-threaded process. +// +// This module is a replacement for pthread_once(). It was added +// since some versions of pthread_once() call the supplied function +// +// Example usage: +// static GoogleOnceType once = GOOGLE_ONCE_INIT; +// static void Initializer() { +// ... do initialization ... +// } +// ... +// void SomeFunction() { +// GoogleOnceInit(&once, &Initializer); +// ... 
+// } + +#ifndef BASE_ONCE_H_ +#define BASE_ONCE_H_ + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/type_traits.h" + +// The following enum values are not for use by clients +enum { + GOOGLE_ONCE_INTERNAL_INIT = 0, + GOOGLE_ONCE_INTERNAL_RUNNING = 0x65C2937B, // an improbable 32-bit value + GOOGLE_ONCE_INTERNAL_WAITER = 0x05A308D2, // a different improbable value + GOOGLE_ONCE_INTERNAL_DONE = 0x3F2D8AB0, // yet another improbable value +}; + +struct GoogleOnceType { + Atomic32 state; +}; + +#define GOOGLE_ONCE_INIT { GOOGLE_ONCE_INTERNAL_INIT } + +// For internal use only. +extern void GoogleOnceInternalInit(Atomic32* state, void (*func)(), + void (*func_with_arg)(void*), void* arg); + +inline void GoogleOnceInit(GoogleOnceType* state, void (*func)()) { + Atomic32 s = Acquire_Load(&state->state); + if (PREDICT_FALSE(s != GOOGLE_ONCE_INTERNAL_DONE)) { + GoogleOnceInternalInit(&state->state, func, 0, 0); + } + ANNOTATE_HAPPENS_AFTER(&state->state); +} + +// A version of GoogleOnceInit where the function argument takes a pointer +// of arbitrary type. +template +inline void GoogleOnceInitArg(GoogleOnceType* state, + void (*func_with_arg)(T*), T* arg) { + Atomic32 s = Acquire_Load(&state->state); + if (PREDICT_FALSE(s != GOOGLE_ONCE_INTERNAL_DONE)) { + // Deal with const T as well as non-const T. + typedef typename base::remove_const::type mutable_T; + GoogleOnceInternalInit(&state->state, 0, + reinterpret_cast(func_with_arg), + const_cast(arg)); + } + ANNOTATE_HAPPENS_AFTER(&state->state); +} + +// GoogleOnceDynamic is like GoogleOnceType, but is dynamically +// initialized instead of statically initialized. This should be used only +// when the variable is not of static storage class. 
+// It might be used to delay expensive initialization of part of a +// dynamically-allocated data structure until it is known to be needed. For +// example: +// class MyType { +// GoogleOnceDynamic once_; +// ComplexStuff* complex_stuff_; +// static void InitComplexStuff(MyType* me) { +// me->complex_stuff_ = ...; +// } +// public: +// ComplexStuff* complex_stuff() { +// this->once_.Init(&InitComplexStuff, this); +// return this->complex_stuff_; +// } +// } +class GoogleOnceDynamic { + public: + GoogleOnceDynamic() : state_(GOOGLE_ONCE_INTERNAL_INIT) { } + + // If this->Init() has not been called before by any thread, + // execute (*func_with_arg)(arg) then return. + // Otherwise, wait until that prior invocation has finished + // executing its function, then return. + template + void Init(void (*func_with_arg)(T*), T* arg) { + Atomic32 s = Acquire_Load(&this->state_); + if (PREDICT_FALSE(s != GOOGLE_ONCE_INTERNAL_DONE)) { + // Deal with const T as well as non-const T. + typedef typename base::remove_const::type mutable_T; + GoogleOnceInternalInit(&this->state_, 0, + reinterpret_cast(func_with_arg), + const_cast(arg)); + } + ANNOTATE_HAPPENS_AFTER(&this->state_); + } + private: + Atomic32 state_; + DISALLOW_COPY_AND_ASSIGN(GoogleOnceDynamic); +}; + +#endif // BASE_ONCE_H_ diff --git a/src/kudu/gutil/paranoid.h b/src/kudu/gutil/paranoid.h new file mode 100644 index 000000000000..01f34b7d0bac --- /dev/null +++ b/src/kudu/gutil/paranoid.h @@ -0,0 +1,92 @@ +// Copyright 2006, Google Inc. All rights reserved. +// +// Paranoid helpers. This is separate from basictypes.h so that it +// can use logging. + +#ifndef BASE_PARANOID_H_ +#define BASE_PARANOID_H_ + +#include + +#include "kudu/gutil/logging-inl.h" + +// Sanitize a bool value which might be sour. +// +// I made up the term "sour bool". It means a bool that is not false (0x0) +// and not true (0x1) but has one of the other 2^N-2 states. A common way +// to create a sour bool is to read an uninitialized bool object. 
+// +// The Standard says: +// [dcl.init] 8.5 -9- "Otherwise, if no initializer is specified for an +// object, the object and its subobjects, if any, have an indeterminate +// initial value." +// [basic.fundamental] 3.9.1 -5- footnote 42: "Using a bool value in ways +// described by this International standard as "undefined", such as by +// examining the value of an uninitialized automatic variable, might cause +// it to behave as if niether true nor false." +// +// Specifically, this program fragment: +// bool b; +// printf("%d\n", b ? 1 : 0); +// can print any value at all, not just 1 or 0! gcc-4.1.0-piii-linux-opt +// generates code tantamount to "static_cast(b)" with no comparison +// operators. This is harmful for invalid values of b, but fast for all +// valid values. +// +// The original bug was a sour bool that confused the protobuf runtime. +// RawOutputToArray wrote a single byte with the sour bool value and +// ReadBool read a ReadVarint32. If the sour bool did not look like a +// single-byte varint32, then the serialized protobuf would be unreadable. +// +// === +// +// If you run into a compiler where the volatile pointer does not work, try +// a bit_cast. Just plain "static_cast(b) ? 1 : 0" does not +// work with gcc-4.1.0-piii-linux-opt, but bit_cast does. +// +// === +// +// If the assert fires, you probably have an uninitialized bool value. The +// original case of this was an auto struct with an uninitialized bool +// field. It might also be memory corruption, or it might be a new C++ +// compiler that has found a new way to hurt us. +// +// === +// +// Props to Apurv Gupta for the report, Ian Lance Taylor for volatile, +// and Sanjay Ghemawat for general guidance. +// +// -- mec 2006-07-06 + +inline bool SanitizeBool(bool b) { + unsigned char c = static_cast(b); + volatile unsigned char* p = &c; + DCHECK_LT(*p, 2); + return (*p != '\0') ? true : false; +} + +// Returns true iff. a given bool is either true (0x1) or false (0x0). 
+// Mainly used for sanity checking for set_field(bool) in Protocol Buffer. +// +// This sanity checking is necessary since a sour bool might confuse the +// Protocol Buffer runtime as mentioned above. +// +// Uses an assembler sequence so as not to be compiler-optimization sensitive. +inline bool IsSaneBool(bool b) { +#if (defined __i386__ || defined __x86_64__) && defined __GNUC__ + bool result; + // Set result to true if b is below or equal to 0x1. + __asm__("cmpb $0x1, %1\n\t" + "setbe %0" + : "=m" (result) // Output spec + : "m" (b) // Input spec + : "cc"); // Clobbers condition-codes + return result; +#else + unsigned char c = static_cast(b); + volatile unsigned char* p = &c; + return *p <= 1; +#endif +} + +#endif // BASE_PARANOID_H_ diff --git a/src/kudu/gutil/port.h b/src/kudu/gutil/port.h new file mode 100644 index 000000000000..d99e4041e705 --- /dev/null +++ b/src/kudu/gutil/port.h @@ -0,0 +1,1218 @@ +// +// Copyright (C) 1999 and onwards Google, Inc. +// +// +// These are weird things we need to do to get this compiling on +// random systems (and on SWIG). + +#ifndef BASE_PORT_H_ +#define BASE_PORT_H_ + +#include // So we can set the bounds of our types +#include // for memcpy() +#include // for free() + +#if defined(__APPLE__) +#include // for getpagesize() on mac +#elif defined(OS_CYGWIN) +#include // for memalign() +#endif + +#include "kudu/gutil/integral_types.h" + +// Must happens before inttypes.h inclusion */ +#if defined(__APPLE__) +/* From MacOSX's inttypes.h: + * "C++ implementations should define these macros only when + * __STDC_FORMAT_MACROS is defined before is included." */ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif /* __STDC_FORMAT_MACROS */ +#endif /* __APPLE__ */ + +/* Default for most OSes */ +/* We use SIGPWR since that seems unlikely to be used for other reasons. 
*/ +#define GOOGLE_OBSCURE_SIGNAL SIGPWR + +#if defined OS_LINUX || defined OS_CYGWIN + +// _BIG_ENDIAN +#include + +// The uint mess: +// mysql.h sets _GNU_SOURCE which sets __USE_MISC in +// sys/types.h typedefs uint if __USE_MISC +// mysql typedefs uint if HAVE_UINT not set +// The following typedef is carefully considered, and should not cause +// any clashes +#if !defined(__USE_MISC) +#if !defined(HAVE_UINT) +#define HAVE_UINT 1 +typedef unsigned int uint; +#endif +#if !defined(HAVE_USHORT) +#define HAVE_USHORT 1 +typedef unsigned short ushort; +#endif +#if !defined(HAVE_ULONG) +#define HAVE_ULONG 1 +typedef unsigned long ulong; +#endif +#endif + +#if defined(__cplusplus) +#include // For _GLIBCXX macros +#endif + +#if !defined(HAVE_TLS) && defined(_GLIBCXX_HAVE_TLS) && defined(__x86_64__) +#define HAVE_TLS 1 +#endif + +#elif defined OS_FREEBSD + +// _BIG_ENDIAN +#include + +#elif defined OS_SOLARIS + +// _BIG_ENDIAN +#include + +// Solaris doesn't define sig_t (function taking an int, returning void) +typedef void (*sig_t)(int); + +// Solaris only defines strtoll, not strtoq +#define strtoq strtoll +#define strtouq strtoull + +// It doesn't define the posix-standard(?) 
u_int_16 +#include // NOLINT(build/include) +typedef uint16_t u_int16_t; + +#elif defined __APPLE__ + +// BIG_ENDIAN +#include // NOLINT(build/include) +/* Let's try and follow the Linux convention */ +#define __BYTE_ORDER BYTE_ORDER +#define __LITTLE_ENDIAN LITTLE_ENDIAN +#define __BIG_ENDIAN BIG_ENDIAN + +#endif + +// The following guarenty declaration of the byte swap functions, and +// define __BYTE_ORDER for MSVC +#ifdef _MSC_VER +#include // NOLINT(build/include) +#define __BYTE_ORDER __LITTLE_ENDIAN +#define bswap_16(x) _byteswap_ushort(x) +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) +// Mac OS X / Darwin features +#include +#define bswap_16(x) OSSwapInt16(x) +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#elif defined(__GLIBC__) +#include // IWYU pragma: export + +#else + +static inline uint16 bswap_16(uint16 x) { + return ((x & 0xFF) << 8) | ((x & 0xFF00) >> 8); +} +#define bswap_16(x) bswap_16(x) +static inline uint32 bswap_32(uint32 x) { + return (((x & 0xFF) << 24) | + ((x & 0xFF00) << 8) | + ((x & 0xFF0000) >> 8) | + ((x & 0xFF000000) >> 24)); +} +#define bswap_32(x) bswap_32(x) +static inline uint64 bswap_64(uint64 x) { + return (((x & GG_ULONGLONG(0xFF)) << 56) | + ((x & GG_ULONGLONG(0xFF00)) << 40) | + ((x & GG_ULONGLONG(0xFF0000)) << 24) | + ((x & GG_ULONGLONG(0xFF000000)) << 8) | + ((x & GG_ULONGLONG(0xFF00000000)) >> 8) | + ((x & GG_ULONGLONG(0xFF0000000000)) >> 24) | + ((x & GG_ULONGLONG(0xFF000000000000)) >> 40) | + ((x & GG_ULONGLONG(0xFF00000000000000)) >> 56)); +} +#define bswap_64(x) bswap_64(x) + +#endif + + +// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN +// using the above endian defintions from endian.h if +// endian.h was included +#ifdef __BYTE_ORDER +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define IS_LITTLE_ENDIAN +#endif + +#if __BYTE_ORDER == __BIG_ENDIAN +#define IS_BIG_ENDIAN +#endif + +#else + +#if defined(__LITTLE_ENDIAN__) +#define 
IS_LITTLE_ENDIAN +#elif defined(__BIG_ENDIAN__) +#define IS_BIG_ENDIAN +#endif + +// there is also PDP endian ... + +#endif // __BYTE_ORDER + +// Define the OS's path separator +#ifdef __cplusplus // C won't merge duplicate const variables at link time +// Some headers provide a macro for this (GCC's system.h), remove it so that we +// can use our own. +#undef PATH_SEPARATOR +#if defined(OS_WINDOWS) +const char PATH_SEPARATOR = '\\'; +#else +const char PATH_SEPARATOR = '/'; +#endif +#endif + +// Windows has O_BINARY as a flag to open() (like "b" for fopen). +// Linux doesn't need make this distinction. +#if defined OS_LINUX && !defined O_BINARY +#define O_BINARY 0 +#endif + +// va_copy portability definitions +#ifdef _MSC_VER +// MSVC doesn't have va_copy yet. +// This is believed to work for 32-bit msvc. This may not work at all for +// other platforms. +// If va_list uses the single-element-array trick, you will probably get +// a compiler error here. +// +#include +inline void va_copy(va_list& a, va_list& b) { + a = b; +} + +// Nor does it have uid_t +typedef int uid_t; + +#endif + +// Mac OS X / Darwin features + +#if defined(__APPLE__) + +// For mmap, Linux defines both MAP_ANONYMOUS and MAP_ANON and says MAP_ANON is +// deprecated. In Darwin, MAP_ANON is all there is. +#if !defined MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +// Linux has this in +#define __ptr_t void * + +// Linux has this in +#define EXFULL ENOMEM // not really that great a translation... + +// Darwin doesn't have strnlen. No comment. +inline size_t strnlen(const char *s, size_t maxlen) { + const char* end = (const char *)memchr(s, '\0', maxlen); + if (end) + return end - s; + return maxlen; +} + +namespace std {} // Avoid error if we didn't see std. +using namespace std; // Just like VC++, we need a using here. + +// Doesn't exist on OSX; used in google.cc for send() to mean "no flags". +#define MSG_NOSIGNAL 0 + +// No SIGPWR on MacOSX. SIGINFO seems suitably obscure. 
+#undef GOOGLE_OBSCURE_SIGNAL +#define GOOGLE_OBSCURE_SIGNAL SIGINFO + +#elif defined(OS_CYGWIN) // Cygwin-specific behavior. + +#if defined(__CYGWIN32__) +#define __WORDSIZE 32 +#else +// It's probably possible to support 64-bit, but the #defines will need checked. +#error "Cygwin is currently only 32-bit." +#endif + +// No signalling on Windows. +#undef GOOGLE_OBSCURE_SIGNAL +#define GOOGLE_OBSCURE_SIGNAL 0 + +struct stack_t { + void* ss_sp; + int ss_flags; + size_t ss_size; +}; +inline int sigaltstack(stack_t* ss, stack_t* oss) { return 0; } + +#define PTHREAD_STACK_MIN 0 // Not provided by cygwin + +// Scans memory for a character. +// memrchr is used in a few places, but it's linux-specific. +inline void* memrchr(const void* bytes, int find_char, size_t len) { + const unsigned char* cursor = + reinterpret_cast(bytes) + len - 1; + unsigned char actual_char = find_char; + for (; cursor >= bytes; --cursor) { + if (*cursor == actual_char) { + return const_cast(reinterpret_cast(cursor)); + } + } + return NULL; +} + +#endif + +// Klocwork static analysis tool's C/C++ complier kwcc +#if defined(__KLOCWORK__) +#define STATIC_ANALYSIS +#endif // __KLOCWORK__ + + +// Annotate a function indicating the caller must examine the return value. +// Use like: +// int foo() WARN_UNUSED_RESULT; +// To explicitly ignore a result, see |ignore_result()| in . +#if defined(__GNUC__) +#define WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#else +#define WARN_UNUSED_RESULT +#endif + +// GCC-specific features + +#if (defined(__GNUC__) || defined(__APPLE__)) && !defined(SWIG) + +// +// Tell the compiler to do printf format string checking if the +// compiler supports it; see the 'format' attribute in +// . +// +// N.B.: As the GCC manual states, "[s]ince non-static C++ methods +// have an implicit 'this' argument, the arguments of such methods +// should be counted from two, not one." 
+// +#define PRINTF_ATTRIBUTE(string_index, first_to_check) \ + __attribute__((__format__ (__printf__, string_index, first_to_check))) +#define SCANF_ATTRIBUTE(string_index, first_to_check) \ + __attribute__((__format__ (__scanf__, string_index, first_to_check))) + +// +// Prevent the compiler from padding a structure to natural alignment +// +#define PACKED __attribute__ ((packed)) + +// Cache line alignment +#if defined(__i386__) || defined(__x86_64__) +#define CACHELINE_SIZE 64 +#elif defined(__powerpc64__) +// TODO(user) This is the L1 D-cache line size of our Power7 machines. +// Need to check if this is appropriate for other PowerPC64 systems. +#define CACHELINE_SIZE 128 +#elif defined(__arm__) +// Cache line sizes for ARM: These values are not strictly correct since +// cache line sizes depend on implementations, not architectures. There +// are even implementations with cache line sizes configurable at boot +// time. +#if defined(__ARM_ARCH_5T__) +#define CACHELINE_SIZE 32 +#elif defined(__ARM_ARCH_7A__) +#define CACHELINE_SIZE 64 +#endif +#endif + +// This is a NOP if CACHELINE_SIZE is not defined. +#ifdef CACHELINE_SIZE +#define CACHELINE_ALIGNED __attribute__((aligned(CACHELINE_SIZE))) +#else +#define CACHELINE_ALIGNED +#endif + +// +// Prevent the compiler from complaining about or optimizing away variables +// that appear unused +// (careful, others e.g. third_party/libxml/xmlversion.h also define this) +#undef ATTRIBUTE_UNUSED +#define ATTRIBUTE_UNUSED __attribute__ ((unused)) + +// Same as above, but for class members. +// As of 10/2013 this appears to only be supported in Clang/LLVM. +// See http://patchwork.ozlabs.org/patch/232594/ which is not yet committed +// in gcc trunk. +#if defined(__llvm__) +#define ATTRIBUTE_MEMBER_UNUSED ATTRIBUTE_UNUSED +#else +#define ATTRIBUTE_MEMBER_UNUSED +#endif + +// +// For functions we want to force inline or not inline. +// Introduced in gcc 3.1. 
+#define ATTRIBUTE_ALWAYS_INLINE __attribute__ ((always_inline)) +#define HAVE_ATTRIBUTE_ALWAYS_INLINE 1 +#define ATTRIBUTE_NOINLINE __attribute__ ((noinline)) +#define HAVE_ATTRIBUTE_NOINLINE 1 + +// For weak functions +#undef ATTRIBUTE_WEAK +#define ATTRIBUTE_WEAK __attribute__ ((weak)) +#define HAVE_ATTRIBUTE_WEAK 1 + +// Tell the compiler to use "initial-exec" mode for a thread-local variable. +// See http://people.redhat.com/drepper/tls.pdf for the gory details. +#define ATTRIBUTE_INITIAL_EXEC __attribute__ ((tls_model ("initial-exec"))) + +// +// Tell the compiler that some function parameters should be non-null pointers. +// Note: As the GCC manual states, "[s]ince non-static C++ methods +// have an implicit 'this' argument, the arguments of such methods +// should be counted from two, not one." +// +#define ATTRIBUTE_NONNULL(arg_index) __attribute__((nonnull(arg_index))) + +// +// Tell the compiler that a given function never returns +// +#define ATTRIBUTE_NORETURN __attribute__((noreturn)) + +// Tell AddressSanitizer (or other memory testing tools) to ignore a given +// function. Useful for cases when a function reads random locations on stack, +// calls _exit from a cloned subprocess, deliberately accesses buffer +// out of bounds or does other scary things with memory. +#ifdef ADDRESS_SANITIZER +#define ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS \ + __attribute__((no_address_safety_analysis)) +#else +#define ATTRIBUTE_NO_ADDRESS_SAFETY_ANALYSIS +#endif + +// Tell ThreadSanitizer to ignore a given function. This can dramatically reduce +// the running time and memory requirements for racy code when TSAN is active. +// GCC does not support this attribute at the time of this writing (GCC 4.8). +#if defined(__llvm__) +#define ATTRIBUTE_NO_SANITIZE_THREAD \ + __attribute__((no_sanitize_thread)) +#else +#define ATTRIBUTE_NO_SANITIZE_THREAD +#endif + +#ifndef HAVE_ATTRIBUTE_SECTION // may have been pre-set to 0, e.g. 
for Darwin +#define HAVE_ATTRIBUTE_SECTION 1 +#endif + +// +// The legacy prod71 libc does not provide the stack alignment required for use +// of SSE intrinsics. In order to properly use the intrinsics you need to use +// a trampoline function which aligns the stack prior to calling your code, +// or as of crosstool v10 with gcc 4.2.0 there is an attribute which asks +// gcc to do this for you. +// +// It has also been discovered that crosstool up to and including v10 does not +// provide proper alignment for pthread_once() functions in x86-64 code either. +// Unfortunately gcc does not provide force_align_arg_pointer as an option in +// x86-64 code, so this requires us to always have a trampoline. +// +// For an example of using this see util/hash/adler32* + +#if defined(__i386__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define ATTRIBUTE_STACK_ALIGN_FOR_OLD_LIBC __attribute__((force_align_arg_pointer)) +#define REQUIRE_STACK_ALIGN_TRAMPOLINE (0) +#elif defined(__i386__) || defined(__x86_64__) +#define REQUIRE_STACK_ALIGN_TRAMPOLINE (1) +#define ATTRIBUTE_STACK_ALIGN_FOR_OLD_LIBC +#else +#define REQUIRE_STACK_ALIGN_TRAMPOLINE (0) +#define ATTRIBUTE_STACK_ALIGN_FOR_OLD_LIBC +#endif + + +// +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() MUST_USE_RESULT; +// +#if defined(SWIG) +#define MUST_USE_RESULT +#elif __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +#define MUST_USE_RESULT __attribute__ ((warn_unused_result)) +#else +#define MUST_USE_RESULT +#endif + +// Annotate a virtual method indicating it must be overriding a virtual +// method in the parent class. 
+// Use like: +// virtual void foo() OVERRIDE; +#if defined(COMPILER_MSVC) +#define OVERRIDE override +#elif defined(__clang__) +#define OVERRIDE override +#elif defined(COMPILER_GCC) && __cplusplus >= 201103 && \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) >= 40700 +// GCC 4.7 supports explicit virtual overrides when C++11 support is enabled. +#define OVERRIDE override +#else +#define OVERRIDE +#endif + +// Annotate a virtual method indicating that subclasses must not override it, +// or annotate a class to indicate that it cannot be subclassed. +// Use like: +// virtual void foo() FINAL; +// class B FINAL : public A {}; +#if defined(COMPILER_MSVC) +// TODO(jered): Change this to "final" when chromium no longer uses MSVC 2010. +#define FINAL sealed +#elif defined(__clang__) +#define FINAL final +#elif defined(COMPILER_GCC) && __cplusplus >= 201103 && \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) >= 40700 +// GCC 4.7 supports explicit virtual overrides when C++11 support is enabled. +#define FINAL final +#else +#define FINAL +#endif + +#if defined(__GNUC__) +// Defined behavior on some of the uarchs: +// PREFETCH_HINT_T0: +// prefetch to all levels of the hierarchy (except on p4: prefetch to L2) +// PREFETCH_HINT_NTA: +// p4: fetch to L2, but limit to 1 way (out of the 8 ways) +// core: skip L2, go directly to L1 +// k8 rev E and later: skip L2, can go to either of the 2-ways in L1 +enum PrefetchHint { + PREFETCH_HINT_T0 = 3, // More temporal locality + PREFETCH_HINT_T1 = 2, + PREFETCH_HINT_T2 = 1, // Less temporal locality + PREFETCH_HINT_NTA = 0 // No temporal locality +}; +#else +// prefetch is a no-op for this target. Feel free to add more sections above. +#endif + +extern inline void prefetch(const char *x, int hint) { +#if defined(__llvm__) + // In the gcc version of prefetch(), hint is only a constant _after_ inlining + // (assumed to have been successful). llvm views things differently, and + // checks constant-ness _before_ inlining. 
This leads to compilation errors + // with using the other version of this code with llvm. + // + // One way round this is to use a switch statement to explicitly match + // prefetch hint enumerations, and invoke __builtin_prefetch for each valid + // value. llvm's optimization removes the switch and unused case statements + // after inlining, so that this boils down in the end to the same as for gcc; + // that is, a single inlined prefetchX instruction. + // + // Note that this version of prefetch() cannot verify constant-ness of hint. + // If client code calls prefetch() with a variable value for hint, it will + // receive the full expansion of the switch below, perhaps also not inlined. + // This should however not be a problem in the general case of well behaved + // caller code that uses the supplied prefetch hint enumerations. + switch (hint) { + case PREFETCH_HINT_T0: + __builtin_prefetch(x, 0, PREFETCH_HINT_T0); + break; + case PREFETCH_HINT_T1: + __builtin_prefetch(x, 0, PREFETCH_HINT_T1); + break; + case PREFETCH_HINT_T2: + __builtin_prefetch(x, 0, PREFETCH_HINT_T2); + break; + case PREFETCH_HINT_NTA: + __builtin_prefetch(x, 0, PREFETCH_HINT_NTA); + break; + default: + __builtin_prefetch(x); + break; + } +#elif defined(__GNUC__) + #if !defined(__i386) || defined(__SSE__) + if (__builtin_constant_p(hint)) { + __builtin_prefetch(x, 0, hint); + } else { + // Defaults to PREFETCH_HINT_T0 + __builtin_prefetch(x); + } +#else + // We want a __builtin_prefetch, but we build with the default -march=i386 + // where __builtin_prefetch quietly turns into nothing. + // Once we crank up to -march=pentium3 or higher the __SSE__ + // clause above will kick in with the builtin. + // -- mec 2006-06-06 + if (hint == PREFETCH_HINT_NTA) + __asm__ __volatile__("prefetchnta (%0)" : : "r"(x)); + #endif +#else + // You get no effect. Feel free to add more sections above. 
+#endif +} + +#ifdef __cplusplus +// prefetch intrinsic (bring data to L1 without polluting L2 cache) +extern inline void prefetch(const char *x) { + return prefetch(x, 0); +} +#endif // ifdef __cplusplus + +// +// GCC can be told that a certain branch is not likely to be taken (for +// instance, a CHECK failure), and use that information in static analysis. +// Giving it this information can help it optimize for the common case in +// the absence of better information (ie. -fprofile-arcs). +// +#if defined(__GNUC__) +#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#else +#define PREDICT_FALSE(x) x +#define PREDICT_TRUE(x) x +#endif + +// +// Tell GCC that a function is hot or cold. GCC can use this information to +// improve static analysis, i.e. a conditional branch to a cold function +// is likely to be not-taken. +// This annotation is used for function declarations, e.g.: +// int foo() ATTRIBUTE_HOT; +// +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) +#define ATTRIBUTE_HOT __attribute__ ((hot)) +#define ATTRIBUTE_COLD __attribute__ ((cold)) +#else +#define ATTRIBUTE_HOT +#define ATTRIBUTE_COLD +#endif + +#define FTELLO ftello +#define FSEEKO fseeko + +#if !defined(__cplusplus) && !defined(__APPLE__) && !defined(OS_CYGWIN) +// stdlib.h only declares this in C++, not in C, so we declare it here. +// Also make sure to avoid declaring it on platforms which don't support it. +extern int posix_memalign(void **memptr, size_t alignment, size_t size); +#endif + +inline void *aligned_malloc(size_t size, int minimum_alignment) { +#if defined(__APPLE__) + // mac lacks memalign(), posix_memalign(), however, according to + // http://stackoverflow.com/questions/196329/osx-lacks-memalign + // mac allocs are already 16-byte aligned. + if (minimum_alignment <= 16) + return malloc(size); + // next, try to return page-aligned memory. 
perhaps overkill + if (minimum_alignment <= getpagesize()) + return valloc(size); + // give up + return NULL; +#elif defined(OS_CYGWIN) + return memalign(minimum_alignment, size); +#else // !__APPLE__ && !OS_CYGWIN + void *ptr = NULL; + if (posix_memalign(&ptr, minimum_alignment, size) != 0) + return NULL; + else + return ptr; +#endif +} + +inline void aligned_free(void *aligned_memory) { + free(aligned_memory); +} + +#else // not GCC + +#define PRINTF_ATTRIBUTE(string_index, first_to_check) +#define SCANF_ATTRIBUTE(string_index, first_to_check) +#define PACKED +#define CACHELINE_ALIGNED +#define ATTRIBUTE_UNUSED +#define ATTRIBUTE_ALWAYS_INLINE +#define ATTRIBUTE_NOINLINE +#define ATTRIBUTE_HOT +#define ATTRIBUTE_COLD +#define ATTRIBUTE_WEAK +#define HAVE_ATTRIBUTE_WEAK 0 +#define ATTRIBUTE_INITIAL_EXEC +#define ATTRIBUTE_NONNULL(arg_index) +#define ATTRIBUTE_NORETURN +#define ATTRIBUTE_STACK_ALIGN_FOR_OLD_LIBC +#define REQUIRE_STACK_ALIGN_TRAMPOLINE (0) +#define MUST_USE_RESULT +extern inline void prefetch(const char *x) {} +#define PREDICT_FALSE(x) x +#define PREDICT_TRUE(x) x + +// These should be redefined appropriately if better alternatives to +// ftell/fseek exist in the compiler +#define FTELLO ftell +#define FSEEKO fseek + +#endif // GCC + +// +// Provides a char array with the exact same alignment as another type. The +// first parameter must be a complete type, the second parameter is how many +// of that type to provide space for. +// +// ALIGNED_CHAR_ARRAY(struct stat, 16) storage_; +// +#if defined(__cplusplus) +#undef ALIGNED_CHAR_ARRAY +// Because MSVC and older GCCs require that the argument to their alignment +// construct to be a literal constant integer, we use a template instantiated +// at all the possible powers of two. 
+#ifndef SWIG +template struct AlignType { }; +template struct AlignType<0, size> { typedef char result[size]; }; +#if defined(_MSC_VER) +#define BASE_PORT_H_ALIGN_ATTRIBUTE(X) __declspec(align(X)) +#define BASE_PORT_H_ALIGN_OF(T) __alignof(T) +#elif defined(__GNUC__) +#define BASE_PORT_H_ALIGN_ATTRIBUTE(X) __attribute__((aligned(X))) +#define BASE_PORT_H_ALIGN_OF(T) __alignof__(T) +#endif + +#if defined(BASE_PORT_H_ALIGN_ATTRIBUTE) + +#define BASE_PORT_H_ALIGNTYPE_TEMPLATE(X) \ + template struct AlignType { \ + typedef BASE_PORT_H_ALIGN_ATTRIBUTE(X) char result[size]; \ + } + +BASE_PORT_H_ALIGNTYPE_TEMPLATE(1); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(2); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(4); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(8); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(16); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(32); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(64); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(128); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(256); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(512); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(1024); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(2048); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(4096); +BASE_PORT_H_ALIGNTYPE_TEMPLATE(8192); +// Any larger and MSVC++ will complain. + +#define ALIGNED_CHAR_ARRAY(T, Size) \ + typename AlignType::result + +#undef BASE_PORT_H_ALIGNTYPE_TEMPLATE +#undef BASE_PORT_H_ALIGN_ATTRIBUTE + +#else // defined(BASE_PORT_H_ALIGN_ATTRIBUTE) +#define ALIGNED_CHAR_ARRAY you_must_define_ALIGNED_CHAR_ARRAY_for_your_compiler_in_base_port_h +#endif // defined(BASE_PORT_H_ALIGN_ATTRIBUTE) + +#else // !SWIG + +// SWIG can't represent alignment and doesn't care about alignment on data +// members (it works fine without it). +template +struct AlignType { typedef char result[Size]; }; +#define ALIGNED_CHAR_ARRAY(T, Size) AlignType::result + +#endif // !SWIG +#else // __cpluscplus +#define ALIGNED_CHAR_ARRAY ALIGNED_CHAR_ARRAY_is_not_available_without_Cplusplus +#endif // __cplusplus + +#ifdef _MSC_VER /* if Visual C++ */ + +// This compiler flag can be easily overlooked on MSVC. 
+// _CHAR_UNSIGNED gets set with the /J flag. +#ifndef _CHAR_UNSIGNED +#error chars must be unsigned! Use the /J flag on the compiler command line. +#endif + +// MSVC is a little hyper-active in its warnings +// Signed vs. unsigned comparison is ok. +#pragma warning(disable : 4018 ) +// We know casting from a long to a char may lose data +#pragma warning(disable : 4244 ) +// Don't need performance warnings about converting ints to bools +#pragma warning(disable : 4800 ) +// Integral constant overflow is apparently ok too +// for example: +// short k; int n; +// k = k + n; +#pragma warning(disable : 4307 ) +// It's ok to use this* in constructor +// Example: +// class C { +// Container cont_; +// C() : cont_(this) { ... +#pragma warning(disable : 4355 ) +// Truncating from double to float is ok +#pragma warning(disable : 4305 ) + +#include +#include +#include +#undef ERROR + +#include // for nextafter functionality on windows +#include // for HUGE_VAL + +#ifndef HUGE_VALF +#define HUGE_VALF (static_cast(HUGE_VAL)) +#endif + +namespace std {} // Avoid error if we didn't see std. +using namespace std; + +// VC++ doesn't understand "uint" +#ifndef HAVE_UINT +#define HAVE_UINT 1 +typedef unsigned int uint; +#endif + +// VC++ doesn't understand "ssize_t" +#ifndef HAVE_SSIZET +#define HAVE_SSIZET 1 +// The following correctly defines ssize_t on most (all?) VC++ versions: +// #include +// typedef SSIZE_T ssize_t; +// However, several projects in googleclient already use plain 'int', e.g., +// googleclient/posix/unistd.h +// googleclient/earth/client/libs/base/types.h +// so to avoid conflicts with those definitions, we do the same here. 
+typedef int ssize_t; +#endif + +#define strtoq _strtoi64 +#define strtouq _strtoui64 +#define strtoll _strtoi64 +#define strtoull _strtoui64 +#define atoll _atoi64 + + +// VC++ 6 and before ship without an ostream << operator for 64-bit ints +#if (_MSC_VER <= 1200) +#include +using std::ostream; +inline ostream& operator<< (ostream& os, const unsigned __int64& num ) { + // Fake operator; doesn't actually do anything. + LOG(FATAL) << "64-bit ostream operator << not supported in VC++ 6"; + return os; +} +#endif + +// You say tomato, I say atotom +#define PATH_MAX MAX_PATH + +// You say tomato, I say _tomato +#define vsnprintf _vsnprintf +#define snprintf _snprintf +#define strcasecmp _stricmp +#define strncasecmp _strnicmp + +#define nextafter _nextafter + +#define hypot _hypot +#define hypotf _hypotf + +#define strdup _strdup +#define tempnam _tempnam +#define chdir _chdir +#define getcwd _getcwd +#define putenv _putenv + + +// You say tomato, I say toma +#define random() rand() +#define srandom(x) srand(x) + +// You say juxtapose, I say transpose +#define bcopy(s, d, n) memcpy(d, s, n) + +inline void *aligned_malloc(size_t size, int minimum_alignment) { + return _aligned_malloc(size, minimum_alignment); +} + +inline void aligned_free(void *aligned_memory) { + _aligned_free(aligned_memory); +} + +// ----- BEGIN VC++ STUBS & FAKE DEFINITIONS --------------------------------- + +// See http://en.wikipedia.org/wiki/IEEE_754 for details of +// floating point format. + +enum { + FP_NAN, // is "Not a Number" + FP_INFINITE, // is either plus or minus infinity. + FP_ZERO, + FP_SUBNORMAL, // is too small to be represented in normalized format. + FP_NORMAL // if nothing of the above is correct that it must be a + // normal floating-point number. 
+}; + +inline int fpclassify_double(double x) { + const int float_point_class =_fpclass(x); + int c99_class; + switch (float_point_class) { + case _FPCLASS_SNAN: // Signaling NaN + case _FPCLASS_QNAN: // Quiet NaN + c99_class = FP_NAN; + break; + case _FPCLASS_NZ: // Negative zero ( -0) + case _FPCLASS_PZ: // Positive 0 (+0) + c99_class = FP_ZERO; + break; + case _FPCLASS_NINF: // Negative infinity ( -INF) + case _FPCLASS_PINF: // Positive infinity (+INF) + c99_class = FP_INFINITE; + break; + case _FPCLASS_ND: // Negative denormalized + case _FPCLASS_PD: // Positive denormalized + c99_class = FP_SUBNORMAL; + break; + case _FPCLASS_NN: // Negative normalized non-zero + case _FPCLASS_PN: // Positive normalized non-zero + c99_class = FP_NORMAL; + break; + default: + c99_class = FP_NAN; // Should never happen + break; + } + return c99_class; +} + +// This function handle the special subnormal case for float; it will +// become a normal number while casting to double. +// bit_cast is avoided to simplify dependency and to create a code that is +// easy to deploy in C code +inline int fpclassify_float(float x) { + uint32 bitwise_representation; + memcpy(&bitwise_representation, &x, 4); + if ((bitwise_representation & 0x7f800000) == 0 && + (bitwise_representation & 0x007fffff) != 0) + return FP_SUBNORMAL; + return fpclassify_double(x); +} +// +// This define takes care of the denormalized float; the casting to +// double make it a normal number +#define fpclassify(x) ((sizeof(x) == sizeof(float)) ? fpclassify_float(x) : fpclassify_double(x)) + +#define isnan _isnan + +inline int isinf(double x) { + const int float_point_class =_fpclass(x); + if (float_point_class == _FPCLASS_PINF) return 1; + if (float_point_class == _FPCLASS_NINF) return -1; + return 0; +} + +// #include "kudu/conflict-signal.h" +typedef void (*sig_t)(int); + +// These actually belong in errno.h but there's a name confilict in errno +// on WinNT. 
They (and a ton more) are also found in Winsock2.h, but +// if'd out under NT. We need this subset at minimum. +#define EXFULL ENOMEM // not really that great a translation... +// The following are already defined in VS2010. +#if (_MSC_VER < 1600) +#define EWOULDBLOCK WSAEWOULDBLOCK +#ifndef PTHREADS_REDHAT_WIN32 +#define ETIMEDOUT WSAETIMEDOUT +#endif +#define ENOTSOCK WSAENOTSOCK +#define EINPROGRESS WSAEINPROGRESS +#define ECONNRESET WSAECONNRESET +#endif + +// +// Really from +// + +inline void bzero(void *s, int n) { + memset(s, 0, n); +} + +// From glob.h +#define __ptr_t void * + +// Defined all over the place. +typedef int pid_t; + +// From stat.h +typedef unsigned int mode_t; + +// u_int16_t, int16_t don't exist in MSVC +typedef unsigned short u_int16_t; +typedef short int16_t; + +// ----- END VC++ STUBS & FAKE DEFINITIONS ---------------------------------- + +#endif // _MSC_VER + +#ifdef STL_MSVC // not always the same as _MSC_VER +#include "kudu/base/port_hash.h" +#else +struct PortableHashBase { }; +#endif + +#if defined(OS_WINDOWS) || defined(__APPLE__) +// gethostbyname() *is* thread-safe for Windows native threads. It is also +// safe on Mac OS X, where it uses thread-local storage, even though the +// manpages claim otherwise. For details, see +// http://lists.apple.com/archives/Darwin-dev/2006/May/msg00008.html +#else +// gethostbyname() is not thread-safe. So disallow its use. People +// should either use the HostLookup::Lookup*() methods, or gethostbyname_r() +#define gethostbyname gethostbyname_is_not_thread_safe_DO_NOT_USE +#endif + +// create macros in which the programmer should enclose all specializations +// for hash_maps and hash_sets. This is necessary since these classes are not +// STL standardized. Depending on the STL implementation they are in different +// namespaces. 
Right now the right namespace is passed by the Makefile +// Examples: gcc3: -DHASH_NAMESPACE=__gnu_cxx +// icc: -DHASH_NAMESPACE=std +// gcc2: empty + +#ifndef HASH_NAMESPACE +# define HASH_NAMESPACE_DECLARATION_START +# define HASH_NAMESPACE_DECLARATION_END +#else +# define HASH_NAMESPACE_DECLARATION_START namespace HASH_NAMESPACE { +# define HASH_NAMESPACE_DECLARATION_END } +#endif + +// Our STL-like classes use __STD. +#if defined(__GNUC__) || defined(__APPLE__) || defined(_MSC_VER) +#define __STD std +#endif + +#if defined __GNUC__ +#define STREAM_SET(s, bit) (s).setstate(ios_base::bit) +#define STREAM_SETF(s, flag) (s).setf(ios_base::flag) +#else +#define STREAM_SET(s, bit) (s).set(ios::bit) +#define STREAM_SETF(s, flag) (s).setf(ios::flag) +#endif + +// Portable handling of unaligned loads, stores, and copies. +// On some platforms, like ARM, the copy functions can be more efficient +// then a load and a store. + +#if defined(__i386) || defined(ARCH_ATHLON) || defined(__x86_64__) || defined(_ARCH_PPC) + +// x86 and x86-64 can perform unaligned loads/stores directly; +// modern PowerPC hardware can also do unaligned integer loads and stores; +// but note: the FPU still sends unaligned loads and stores to a trap handler! 
+ +#define UNALIGNED_LOAD16(_p) (*reinterpret_cast(_p)) +#define UNALIGNED_LOAD32(_p) (*reinterpret_cast(_p)) +#define UNALIGNED_LOAD64(_p) (*reinterpret_cast(_p)) + +#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast(_p) = (_val)) +#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast(_p) = (_val)) +#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast(_p) = (_val)) + +#elif defined(__arm__) && \ + !defined(__ARM_ARCH_5__) && \ + !defined(__ARM_ARCH_5T__) && \ + !defined(__ARM_ARCH_5TE__) && \ + !defined(__ARM_ARCH_5TEJ__) && \ + !defined(__ARM_ARCH_6__) && \ + !defined(__ARM_ARCH_6J__) && \ + !defined(__ARM_ARCH_6K__) && \ + !defined(__ARM_ARCH_6Z__) && \ + !defined(__ARM_ARCH_6ZK__) && \ + !defined(__ARM_ARCH_6T2__) + +// ARMv7 and newer support native unaligned accesses, but only of 16-bit +// and 32-bit values (not 64-bit); older versions either raise a fatal signal, +// do an unaligned read and rotate the words around a bit, or do the reads very +// slowly (trip through kernel mode). There's no simple #define that says just +// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6 +// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_ALIGNED #define, +// so in time, maybe we can move on to that. +// +// This is a mess, but there's not much we can do about it. + +#define UNALIGNED_LOAD16(_p) (*reinterpret_cast(_p)) +#define UNALIGNED_LOAD32(_p) (*reinterpret_cast(_p)) + +#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast(_p) = (_val)) +#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast(_p) = (_val)) + +// TODO(user): NEON supports unaligned 64-bit loads and stores. +// See if that would be more efficient on platforms supporting it, +// at least for copies. 
+ +inline uint64 UNALIGNED_LOAD64(const void *p) { + uint64 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline void UNALIGNED_STORE64(void *p, uint64 v) { + memcpy(p, &v, sizeof v); +} + +#else + +#define NEED_ALIGNED_LOADS + +// These functions are provided for architectures that don't support +// unaligned loads and stores. + +inline uint16 UNALIGNED_LOAD16(const void *p) { + uint16 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline uint32 UNALIGNED_LOAD32(const void *p) { + uint32 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline uint64 UNALIGNED_LOAD64(const void *p) { + uint64 t; + memcpy(&t, p, sizeof t); + return t; +} + +inline void UNALIGNED_STORE16(void *p, uint16 v) { + memcpy(p, &v, sizeof v); +} + +inline void UNALIGNED_STORE32(void *p, uint32 v) { + memcpy(p, &v, sizeof v); +} + +inline void UNALIGNED_STORE64(void *p, uint64 v) { + memcpy(p, &v, sizeof v); +} + +#endif + +#ifdef _LP64 +#define UNALIGNED_LOADW(_p) UNALIGNED_LOAD64(_p) +#define UNALIGNED_STOREW(_p, _val) UNALIGNED_STORE64(_p, _val) +#else +#define UNALIGNED_LOADW(_p) UNALIGNED_LOAD32(_p) +#define UNALIGNED_STOREW(_p, _val) UNALIGNED_STORE32(_p, _val) +#endif + +// NOTE(user): These are only exported to C++ because the macros they depend on +// use C++-only syntax. This #ifdef can be removed if/when the macros are fixed. 
+ +#if defined(__cplusplus) + +inline void UnalignedCopy16(const void *src, void *dst) { + UNALIGNED_STORE16(dst, UNALIGNED_LOAD16(src)); +} + +inline void UnalignedCopy32(const void *src, void *dst) { + UNALIGNED_STORE32(dst, UNALIGNED_LOAD32(src)); +} + +inline void UnalignedCopy64(const void *src, void *dst) { + if (sizeof(void *) == 8) { + UNALIGNED_STORE64(dst, UNALIGNED_LOAD64(src)); + } else { + const char *src_char = reinterpret_cast(src); + char *dst_char = reinterpret_cast(dst); + + UNALIGNED_STORE32(dst_char, UNALIGNED_LOAD32(src_char)); + UNALIGNED_STORE32(dst_char + 4, UNALIGNED_LOAD32(src_char + 4)); + } +} + +#endif // defined(__cpluscplus) + +// printf macros for size_t, in the style of inttypes.h +#ifdef _LP64 +#define __PRIS_PREFIX "z" +#else +#define __PRIS_PREFIX +#endif + +// Use these macros after a % in a printf format string +// to get correct 32/64 bit behavior, like this: +// size_t size = records.size(); +// printf("%" PRIuS "\n", size); + +#define PRIdS __PRIS_PREFIX "d" +#define PRIxS __PRIS_PREFIX "x" +#define PRIuS __PRIS_PREFIX "u" +#define PRIXS __PRIS_PREFIX "X" +#define PRIoS __PRIS_PREFIX "o" + +#define GPRIuPTHREAD "lu" +#define GPRIxPTHREAD "lx" +#ifdef OS_CYGWIN +#define PRINTABLE_PTHREAD(pthreadt) reinterpret_cast(pthreadt) +#else +#define PRINTABLE_PTHREAD(pthreadt) pthreadt +#endif + +#define SIZEOF_MEMBER(t, f) sizeof(((t*) 4096)->f) + +#define OFFSETOF_MEMBER(t, f) \ + (reinterpret_cast( \ + &reinterpret_cast(16)->f) - \ + reinterpret_cast(16)) + +#ifdef PTHREADS_REDHAT_WIN32 +#include +using std::ostream; // NOLINT(build/include) +#include // NOLINT(build/include) +// pthread_t is not a simple integer or pointer on Win32 +std::ostream& operator << (std::ostream& out, const pthread_t& thread_id); +#endif + +// GXX_EXPERIMENTAL_CXX0X is defined by gcc and clang up to at least +// gcc-4.7 and clang-3.1 (2011-12-13). 
__cplusplus was defined to 1 +// in gcc before 4.7 (Crosstool 16) and clang before 3.1, but is +// defined according to the language version in effect thereafter. I +// believe MSVC will also define __cplusplus according to the language +// version, but haven't checked that. +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L +// Define this to 1 if the code is compiled in C++11 mode; leave it +// undefined otherwise. Do NOT define it to 0 -- that causes +// '#ifdef LANG_CXX11' to behave differently from '#if LANG_CXX11'. +#define LANG_CXX11 1 +#endif + +// On some platforms, a "function pointer" points to a function descriptor +// rather than directly to the function itself. Use FUNC_PTR_TO_CHAR_PTR(func) +// to get a char-pointer to the first instruction of the function func. +#if defined(__powerpc__) || defined(__ia64) +// use opd section for function descriptors on these platforms, the function +// address is the first word of the descriptor +enum { kPlatformUsesOPDSections = 1 }; +#define FUNC_PTR_TO_CHAR_PTR(func) (reinterpret_cast(func)[0]) +#else +enum { kPlatformUsesOPDSections = 0 }; +#define FUNC_PTR_TO_CHAR_PTR(func) (reinterpret_cast(func)) +#endif + +#endif // BASE_PORT_H_ diff --git a/src/kudu/gutil/raw_scoped_refptr_mismatch_checker.h b/src/kudu/gutil/raw_scoped_refptr_mismatch_checker.h new file mode 100644 index 000000000000..1dacd1753f67 --- /dev/null +++ b/src/kudu/gutil/raw_scoped_refptr_mismatch_checker.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef KUDU_GUTIL_RAW_SCOPED_REFPTR_MISMATCH_CHECKER_H_ +#define KUDU_GUTIL_RAW_SCOPED_REFPTR_MISMATCH_CHECKER_H_ + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/template_util.h" +#include "kudu/gutil/tuple.h" + +// It is dangerous to post a task with a T* argument where T is a subtype of +// RefCounted(Base|ThreadSafeBase), since by the time the parameter is used, the +// object may already have been deleted since it was not held with a +// scoped_refptr. Example: http://crbug.com/27191 +// The following set of traits are designed to generate a compile error +// whenever this antipattern is attempted. + +namespace kudu { + +// This is a base internal implementation file used by task.h and callback.h. +// Not for public consumption, so we wrap it in namespace internal. +namespace internal { + +template +struct NeedsScopedRefptrButGetsRawPtr { +#if defined(OS_WIN) + enum { + value = base::false_type::value + }; +#else + enum { + // Human readable translation: you needed to be a scoped_refptr if you are a + // raw pointer type and are convertible to a RefCounted(Base|ThreadSafeBase) + // type. 
+ value = (base::is_pointer::value && + (base::is_convertible::value || + base::is_convertible::value)) + }; +#endif +}; + +template +struct ParamsUseScopedRefptrCorrectly { + enum { value = 0 }; +}; + +template <> +struct ParamsUseScopedRefptrCorrectly { + enum { value = 1 }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !NeedsScopedRefptrButGetsRawPtr::value }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + 
NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +template +struct ParamsUseScopedRefptrCorrectly > { + enum { value = !(NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value || + NeedsScopedRefptrButGetsRawPtr::value) }; +}; + +} // namespace internal + +} // namespace kudu + +#endif // KUDU_GUTIL_RAW_SCOPED_REFPTR_MISMATCH_CHECKER_H_ diff --git a/src/kudu/gutil/ref_counted.cc b/src/kudu/gutil/ref_counted.cc new file mode 100644 index 000000000000..a15a1e26bf0a --- /dev/null +++ b/src/kudu/gutil/ref_counted.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/gutil/ref_counted.h" + +#include +#include "kudu/gutil/threading/thread_collision_warner.h" + +namespace kudu { + +namespace subtle { + +RefCountedBase::RefCountedBase() + : ref_count_(0) +#ifndef NDEBUG + , in_dtor_(false) +#endif + { +} + +RefCountedBase::~RefCountedBase() { +#ifndef NDEBUG + DCHECK(in_dtor_) << "RefCounted object deleted without calling Release()"; +#endif +} + +void RefCountedBase::AddRef() const { + // TODO(maruel): Add back once it doesn't assert 500 times/sec. + // Current thread books the critical section "AddRelease" without release it. + // DFAKE_SCOPED_LOCK_THREAD_LOCKED(add_release_); +#ifndef NDEBUG + DCHECK(!in_dtor_); +#endif + ++ref_count_; +} + +bool RefCountedBase::Release() const { + // TODO(maruel): Add back once it doesn't assert 500 times/sec. + // Current thread books the critical section "AddRelease" without release it. 
+ // DFAKE_SCOPED_LOCK_THREAD_LOCKED(add_release_); +#ifndef NDEBUG + DCHECK(!in_dtor_); +#endif + if (--ref_count_ == 0) { +#ifndef NDEBUG + in_dtor_ = true; +#endif + return true; + } + return false; +} + +bool RefCountedThreadSafeBase::HasOneRef() const { + return base::RefCountIsOne( + &const_cast(this)->ref_count_); +} + +RefCountedThreadSafeBase::RefCountedThreadSafeBase() : ref_count_(0) { +#ifndef NDEBUG + in_dtor_ = false; +#endif +} + +RefCountedThreadSafeBase::~RefCountedThreadSafeBase() { +#ifndef NDEBUG + DCHECK(in_dtor_) << "RefCountedThreadSafe object deleted without " + "calling Release()"; +#endif +} + +void RefCountedThreadSafeBase::AddRef() const { +#ifndef NDEBUG + DCHECK(!in_dtor_); +#endif + base::RefCountInc(&ref_count_); +} + +bool RefCountedThreadSafeBase::Release() const { +#ifndef NDEBUG + DCHECK(!in_dtor_); + DCHECK(!base::RefCountIsZero(&ref_count_)); +#endif + if (!base::RefCountDec(&ref_count_)) { +#ifndef NDEBUG + in_dtor_ = true; +#endif + return true; + } + return false; +} + +} // namespace subtle + +} // namespace kudu diff --git a/src/kudu/gutil/ref_counted.h b/src/kudu/gutil/ref_counted.h new file mode 100644 index 000000000000..8b6a5539df4d --- /dev/null +++ b/src/kudu/gutil/ref_counted.h @@ -0,0 +1,354 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_MEMORY_REF_COUNTED_H_ +#define BASE_MEMORY_REF_COUNTED_H_ + +#include + +#include "kudu/gutil/atomic_refcount.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/threading/thread_collision_warner.h" + +namespace kudu { +namespace subtle { + +typedef Atomic32 AtomicRefCount; + +class RefCountedBase { + public: + bool HasOneRef() const { return ref_count_ == 1; } + + protected: + RefCountedBase(); + ~RefCountedBase(); + + void AddRef() const; + + // Returns true if the object should self-delete. 
+ bool Release() const; + + private: + mutable int ref_count_; +#ifndef NDEBUG + mutable bool in_dtor_; +#endif + + DFAKE_MUTEX(add_release_); + + DISALLOW_COPY_AND_ASSIGN(RefCountedBase); +}; + +class RefCountedThreadSafeBase { + public: + bool HasOneRef() const; + + protected: + RefCountedThreadSafeBase(); + ~RefCountedThreadSafeBase(); + + void AddRef() const; + + // Returns true if the object should self-delete. + bool Release() const; + + private: + mutable AtomicRefCount ref_count_; +#ifndef NDEBUG + mutable bool in_dtor_; +#endif + + DISALLOW_COPY_AND_ASSIGN(RefCountedThreadSafeBase); +}; + +} // namespace subtle + +// +// A base class for reference counted classes. Otherwise, known as a cheap +// knock-off of WebKit's RefCounted class. To use this guy just extend your +// class from it like so: +// +// class MyFoo : public RefCounted { +// ... +// private: +// friend class RefCounted; +// ~MyFoo(); +// }; +// +// You should always make your destructor private, to avoid any code deleting +// the object accidently while there are references to it. +template +class RefCounted : public subtle::RefCountedBase { + public: + RefCounted() {} + + void AddRef() const { + subtle::RefCountedBase::AddRef(); + } + + void Release() const { + if (subtle::RefCountedBase::Release()) { + delete static_cast(this); + } + } + + protected: + ~RefCounted() {} + + private: + DISALLOW_COPY_AND_ASSIGN(RefCounted); +}; + +// Forward declaration. +template class RefCountedThreadSafe; + +// Default traits for RefCountedThreadSafe. Deletes the object when its ref +// count reaches 0. Overload to delete it on a different thread etc. +template +struct DefaultRefCountedThreadSafeTraits { + static void Destruct(const T* x) { + // Delete through RefCountedThreadSafe to make child classes only need to be + // friend with RefCountedThreadSafe instead of this struct, which is an + // implementation detail. 
+ RefCountedThreadSafe::DeleteInternal(x); + } +}; + +// +// A thread-safe variant of RefCounted +// +// class MyFoo : public RefCountedThreadSafe { +// ... +// }; +// +// If you're using the default trait, then you should add compile time +// asserts that no one else is deleting your object. i.e. +// private: +// friend class RefCountedThreadSafe; +// ~MyFoo(); +template > +class RefCountedThreadSafe : public subtle::RefCountedThreadSafeBase { + public: + RefCountedThreadSafe() {} + + void AddRef() const { + subtle::RefCountedThreadSafeBase::AddRef(); + } + + void Release() const { + if (subtle::RefCountedThreadSafeBase::Release()) { + Traits::Destruct(static_cast(this)); + } + } + + protected: + ~RefCountedThreadSafe() {} + + private: + friend struct DefaultRefCountedThreadSafeTraits; + static void DeleteInternal(const T* x) { delete x; } + + DISALLOW_COPY_AND_ASSIGN(RefCountedThreadSafe); +}; + +// +// A thread-safe wrapper for some piece of data so we can place other +// things in scoped_refptrs<>. +// +template +class RefCountedData + : public kudu::RefCountedThreadSafe< kudu::RefCountedData > { + public: + RefCountedData() : data() {} + RefCountedData(const T& in_value) : data(in_value) {} + + T data; + + private: + friend class kudu::RefCountedThreadSafe >; + ~RefCountedData() {} +}; + +} // namespace kudu + +// +// A smart pointer class for reference counted objects. Use this class instead +// of calling AddRef and Release manually on a reference counted object to +// avoid common memory leaks caused by forgetting to Release an object +// reference. Sample usage: +// +// class MyFoo : public RefCounted { +// ... +// }; +// +// void some_function() { +// scoped_refptr foo = new MyFoo(); +// foo->Method(param); +// // |foo| is released when this function returns +// } +// +// void some_other_function() { +// scoped_refptr foo = new MyFoo(); +// ... +// foo = NULL; // explicitly releases |foo| +// ... 
+// if (foo) +// foo->Method(param); +// } +// +// The above examples show how scoped_refptr acts like a pointer to T. +// Given two scoped_refptr classes, it is also possible to exchange +// references between the two objects, like so: +// +// { +// scoped_refptr a = new MyFoo(); +// scoped_refptr b; +// +// b.swap(a); +// // now, |b| references the MyFoo object, and |a| references NULL. +// } +// +// To make both |a| and |b| in the above example reference the same MyFoo +// object, simply use the assignment operator: +// +// { +// scoped_refptr a = new MyFoo(); +// scoped_refptr b; +// +// b = a; +// // now, |a| and |b| each own a reference to the same MyFoo object. +// } +// +template +class scoped_refptr { + public: + typedef T element_type; + + scoped_refptr() : ptr_(NULL) { + } + + scoped_refptr(T* p) : ptr_(p) { + if (ptr_) + ptr_->AddRef(); + } + + scoped_refptr(const scoped_refptr& r) : ptr_(r.ptr_) { + if (ptr_) + ptr_->AddRef(); + } + + template + scoped_refptr(const scoped_refptr& r) : ptr_(r.get()) { + if (ptr_) + ptr_->AddRef(); + } + + template + scoped_refptr(scoped_refptr&& r) : ptr_(r.get()) { + r.ptr_ = nullptr; + } + + ~scoped_refptr() { + if (ptr_) + ptr_->Release(); + } + + T* get() const { return ptr_; } + +// The following is disabled in Cloudera's version of this file since it's +// relatively dangerous. Chromium is planning on doing the same in their +// tree, but hasn't done so yet. See http://code.google.com/p/chromium/issues/detail?id=110610 +#if SCOPED_REFPTR_ALLOW_IMPLICIT_CONVERSION_TO_PTR + // Allow scoped_refptr to be used in boolean expression + // and comparison operations. + operator T*() const { return ptr_; } +#else + typedef T* scoped_refptr::*Testable; + operator Testable() const { return ptr_ ? 
&scoped_refptr::ptr_ : NULL; } +#endif + + T* operator->() const { + assert(ptr_ != NULL); + return ptr_; + } + + scoped_refptr& operator=(T* p) { + // AddRef first so that self assignment should work + if (p) + p->AddRef(); + T* old_ptr = ptr_; + ptr_ = p; + if (old_ptr) + old_ptr->Release(); + return *this; + } + + scoped_refptr& operator=(const scoped_refptr& r) { + return *this = r.ptr_; + } + + template + scoped_refptr& operator=(const scoped_refptr& r) { + return *this = r.get(); + } + + scoped_refptr& operator=(scoped_refptr&& r) { + scoped_refptr(r).swap(*this); + return *this; + } + + template + scoped_refptr& operator=(scoped_refptr&& r) { + scoped_refptr(r).swap(*this); + return *this; + } + + void swap(T** pp) { + T* p = ptr_; + ptr_ = *pp; + *pp = p; + } + + void swap(scoped_refptr& r) { + swap(&r.ptr_); + } + + // Like gscoped_ptr::reset(), drops a reference on the currently held object + // (if any), and adds a reference to the passed-in object (if not NULL). + void reset(T* p = NULL) { + *this = p; + } + + protected: + T* ptr_; + + private: + template friend class scoped_refptr; +}; + +// Handy utility for creating a scoped_refptr out of a T* explicitly without +// having to retype all the template arguments +template +scoped_refptr make_scoped_refptr(T* t) { + return scoped_refptr(t); +} + +// equal_to and hash implementations for templated scoped_refptrs suitable for +// use with STL unordered_* containers. 
+template +struct ScopedRefPtrEqualToFunctor { + bool operator()(const scoped_refptr& x, const scoped_refptr& y) const { + return x.get() == y.get(); + } +}; + +template +struct ScopedRefPtrHashFunctor { + size_t operator()(const scoped_refptr& p) const { + return reinterpret_cast(p.get()); + } +}; + +#endif // BASE_MEMORY_REF_COUNTED_H_ diff --git a/src/kudu/gutil/ref_counted_memory.cc b/src/kudu/gutil/ref_counted_memory.cc new file mode 100644 index 000000000000..1d695d93a407 --- /dev/null +++ b/src/kudu/gutil/ref_counted_memory.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/gutil/ref_counted_memory.h" + +#include + +#include + +namespace kudu { + +bool RefCountedMemory::Equals( + const scoped_refptr& other) const { + return other.get() && + size() == other->size() && + (memcmp(front(), other->front(), size()) == 0); +} + +RefCountedMemory::RefCountedMemory() {} + +RefCountedMemory::~RefCountedMemory() {} + +const unsigned char* RefCountedStaticMemory::front() const { + return data_; +} + +size_t RefCountedStaticMemory::size() const { + return length_; +} + +RefCountedStaticMemory::~RefCountedStaticMemory() {} + +RefCountedBytes::RefCountedBytes() {} + +RefCountedBytes::RefCountedBytes(std::vector initializer) + : data_(std::move(initializer)) {} + +RefCountedBytes::RefCountedBytes(const unsigned char* p, size_t size) + : data_(p, p + size) {} + +RefCountedBytes* RefCountedBytes::TakeVector( + std::vector* to_destroy) { + auto bytes = new RefCountedBytes; + bytes->data_.swap(*to_destroy); + return bytes; +} + +const unsigned char* RefCountedBytes::front() const { + // STL will assert if we do front() on an empty vector, but calling code + // expects a NULL. + return size() ? 
&data_.front() : nullptr; +} + +size_t RefCountedBytes::size() const { + return data_.size(); +} + +RefCountedBytes::~RefCountedBytes() {} + +RefCountedString::RefCountedString() {} + +RefCountedString::~RefCountedString() {} + +// static +RefCountedString* RefCountedString::TakeString(std::string* to_destroy) { + auto self = new RefCountedString; + to_destroy->swap(self->data_); + return self; +} + +const unsigned char* RefCountedString::front() const { + return data_.empty() ? nullptr : + reinterpret_cast(data_.data()); +} + +size_t RefCountedString::size() const { + return data_.size(); +} + +RefCountedMallocedMemory::RefCountedMallocedMemory( + void* data, size_t length) + : data_(reinterpret_cast(data)), length_(length) { + DCHECK(data || length == 0); +} + +const unsigned char* RefCountedMallocedMemory::front() const { + return length_ ? data_ : nullptr; +} + +size_t RefCountedMallocedMemory::size() const { + return length_; +} + +RefCountedMallocedMemory::~RefCountedMallocedMemory() { + free(data_); +} + +} // namespace kudu diff --git a/src/kudu/gutil/ref_counted_memory.h b/src/kudu/gutil/ref_counted_memory.h new file mode 100644 index 000000000000..550a142cfa30 --- /dev/null +++ b/src/kudu/gutil/ref_counted_memory.h @@ -0,0 +1,150 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_GUTIL_REF_COUNTED_MEMORY_H_ +#define KUDU_GUTIL_REF_COUNTED_MEMORY_H_ + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/port.h" + +#ifndef BASE_EXPORT +#define BASE_EXPORT +#endif + +namespace kudu { + +// A generic interface to memory. This object is reference counted because one +// of its two subclasses own the data they carry, and we need to have +// heterogeneous containers of these two types of memory. 
+class BASE_EXPORT RefCountedMemory + : public RefCountedThreadSafe { + public: + // Retrieves a pointer to the beginning of the data we point to. If the data + // is empty, this will return NULL. + virtual const unsigned char* front() const = 0; + + // Size of the memory pointed to. + virtual size_t size() const = 0; + + // Returns true if |other| is byte for byte equal. + bool Equals(const scoped_refptr& other) const; + + // Handy method to simplify calling front() with a reinterpret_cast. + template const T* front_as() const { + return reinterpret_cast(front()); + } + + protected: + friend class RefCountedThreadSafe; + RefCountedMemory(); + virtual ~RefCountedMemory(); +}; + +// An implementation of RefCountedMemory, where the ref counting does not +// matter. +class BASE_EXPORT RefCountedStaticMemory : public RefCountedMemory { + public: + RefCountedStaticMemory() + : data_(NULL), length_(0) {} + RefCountedStaticMemory(const void* data, size_t length) + : data_(static_cast(length ? data : NULL)), + length_(length) {} + + // Overridden from RefCountedMemory: + virtual const unsigned char* front() const OVERRIDE; + virtual size_t size() const OVERRIDE; + + private: + virtual ~RefCountedStaticMemory(); + + const unsigned char* data_; + size_t length_; + + DISALLOW_COPY_AND_ASSIGN(RefCountedStaticMemory); +}; + +// An implementation of RefCountedMemory, where we own the data in a vector. +class BASE_EXPORT RefCountedBytes : public RefCountedMemory { + public: + RefCountedBytes(); + + // Constructs a RefCountedBytes object by _copying_ from |initializer|. + explicit RefCountedBytes(std::vector initializer); + + // Constructs a RefCountedBytes object by copying |size| bytes from |p|. + RefCountedBytes(const unsigned char* p, size_t size); + + // Constructs a RefCountedBytes object by performing a swap. (To non + // destructively build a RefCountedBytes, use the constructor that takes a + // vector.) 
+ static RefCountedBytes* TakeVector(std::vector* to_destroy); + + // Overridden from RefCountedMemory: + virtual const unsigned char* front() const OVERRIDE; + virtual size_t size() const OVERRIDE; + + const std::vector& data() const { return data_; } + std::vector& data() { return data_; } + + private: + virtual ~RefCountedBytes(); + + std::vector data_; + + DISALLOW_COPY_AND_ASSIGN(RefCountedBytes); +}; + +// An implementation of RefCountedMemory, where the bytes are stored in an STL +// string. Use this if your data naturally arrives in that format. +class BASE_EXPORT RefCountedString : public RefCountedMemory { + public: + RefCountedString(); + + // Constructs a RefCountedString object by performing a swap. (To non + // destructively build a RefCountedString, use the default constructor and + // copy into object->data()). + static RefCountedString* TakeString(std::string* to_destroy); + + // Overridden from RefCountedMemory: + virtual const unsigned char* front() const OVERRIDE; + virtual size_t size() const OVERRIDE; + + const std::string& data() const { return data_; } + std::string& data() { return data_; } + + private: + virtual ~RefCountedString(); + + std::string data_; + + DISALLOW_COPY_AND_ASSIGN(RefCountedString); +}; + +// An implementation of RefCountedMemory that holds a chunk of memory +// previously allocated with malloc or calloc, and that therefore must be freed +// using free(). 
+class BASE_EXPORT RefCountedMallocedMemory : public RefCountedMemory { + public: + RefCountedMallocedMemory(void* data, size_t length); + + // Overridden from RefCountedMemory: + virtual const unsigned char* front() const OVERRIDE; + virtual size_t size() const OVERRIDE; + + private: + virtual ~RefCountedMallocedMemory(); + + unsigned char* data_; + size_t length_; + + DISALLOW_COPY_AND_ASSIGN(RefCountedMallocedMemory); +}; + +} // namespace kudu + +#endif // KUDU_GUTIL_REF_COUNTED_MEMORY_H_ diff --git a/src/kudu/gutil/singleton.h b/src/kudu/gutil/singleton.h new file mode 100644 index 000000000000..61e5ea558336 --- /dev/null +++ b/src/kudu/gutil/singleton.h @@ -0,0 +1,153 @@ +// Copyright 2003 Google Inc. +// +// The Singleton class manages a single instance of Type which will be +// created on first use and (usually) never destroyed. +// +// MyClass* ptr = Singleton::get() +// ptr->DoSomething(); +// +// Singleton<> has no non-static members and is never actually instantiated. +// +// WARNING: Read go/singletons before using. +// +// This class is thread safe; the constructor will be run at most once, and +// no user will gain access to the object until the constructor is completed. +// The underlying Type must of course be thread-safe if you want to use it +// concurrently. +// +// If you want to ensure that your class can only exist as a singleton, make +// its constructors private, and make Singleton<> a friend: +// +// class MySingletonOnlyClass { +// public: +// void DoSomething() { ... } +// private: +// DISALLOW_COPY_AND_ASSIGN(MySingletonOnlyClass); +// MySingletonOnlyClass() { ... 
} +// friend class Singleton; +// } +// +// If your singleton requires complex initialization, or does not have a +// suitable default constructor, you can provide a specialization of +// Singleton::CreateInstance() to perform the appropriate setup, e.g.: +// +// template <> +// Type* Singleton::CreateInstance() { return new ConcreteImpl; } +// +// If you want to initialize something eagerly at startup, rather than lazily +// upon use, consider using REGISTER_MODULE_INITIALIZER (in base/googleinit.h). +// +// This class also allows users to pick a particular instance as the +// singleton with InjectInstance(). This enables unittesting and +// dependency injection. It must only be used at program startup. +// +// Caveats: +// (a) The instance is normally never destroyed. Destroying a Singleton is +// complex and error-prone; C++ books go on about this at great length, +// and I have seen no perfect general solution to the problem. +// We *do* offer UnsafeReset() which is not thread-safe at all. +// +// (b) Your class must have a default (no-argument) constructor, or you must +// provide a specialization for Singleton::CreateInstance(). +// +// (c) Your class's constructor must never throw an exception. +// +// Singleton::get() is very fast - about 1ns on a 2.4GHz Core 2. + +#ifndef UTIL_GTL_SINGLETON_H__ +#define UTIL_GTL_SINGLETON_H__ + +#include + +#include + +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/once.h" + +namespace util { +namespace gtl { +template class ScopedSingletonOverride; +template class ScopedSingletonOverrideNoDelete; +} // namespace gtl +} // namespace util + +template +class Singleton { + public: + // Return a pointer to the one true instance of the class. + static Type* get() { + GoogleOnceInit(&once_, &Singleton::Init); + return instance_; + } + + // WARNING!!! This function is not thread-safe and may leak memory. 
+ static void UnsafeReset() { + delete instance_; + instance_ = NULL; + once_.state = GOOGLE_ONCE_INTERNAL_INIT; // This is the bad part! + } + + // This function is used to replace the instance used by + // Singleton::get(). It can be used for breaking dependencies. For + // unittesting, you probably want to use ScopedSingletonOverride instead. + // + // This function must be called before Singleton::get() is + // called and before any threads are created. If these assumptions + // are violated, anything could happen, but we try to crash in debug + // mode and do nothing in production. + static void InjectInstance(Type* instance) { + injected_instance_ = instance; + GoogleOnceInit(&once_, &Singleton::Inject); + injected_instance_ = NULL; // Helps detect leaks in the unittest. + if (instance_ != instance) { + LOG(DFATAL) << "(jyasskin) InjectInstance() must be called at most once" + << " at the start of the program, before the Singleton has" + << " been accessed and before any threads have been created." + << " Ignoring the call in production."; + delete instance; + } + } + + private: + friend class util::gtl::ScopedSingletonOverride; + friend class util::gtl::ScopedSingletonOverrideNoDelete; + + // Create the instance. + static void Init() { + instance_ = CreateInstance(); + } + + // Create and return the instance. You can use Singleton for objects which + // require more complex setup by defining a specialization for your type. + static Type* CreateInstance() { + // use ::new to work around a gcc bug when operator new is overloaded + return ::new Type; + } + + // Inject the instance. + static void Inject() { + instance_ = injected_instance_; + } + + // Used by ScopedSingletonOverride. Definitely not threadsafe. No one + // should be calling this other than ScopedSingletonOverride (which has + // friend access to do this and makes sure it calls get() first). 
+ static void OverrideSingleton(Type* override_instance) { + instance_ = override_instance; + } + + static GoogleOnceType once_; + static Type* instance_; + static Type* injected_instance_; +}; + +template +GoogleOnceType Singleton::once_ = GOOGLE_ONCE_INIT; + +template +Type* Singleton::instance_ = NULL; + +template +Type* Singleton::injected_instance_ = NULL; + +#endif // UTIL_GTL_SINGLETON_H__ diff --git a/src/kudu/gutil/spinlock.cc b/src/kudu/gutil/spinlock.cc new file mode 100644 index 000000000000..8a02c9572dc5 --- /dev/null +++ b/src/kudu/gutil/spinlock.cc @@ -0,0 +1,187 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2006, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Sanjay Ghemawat + */ + +#include "kudu/gutil/spinlock.h" +#include "kudu/gutil/synchronization_profiling.h" +#include "kudu/gutil/spinlock_internal.h" +#include "kudu/gutil/walltime.h" +#include "kudu/gutil/sysinfo.h" /* for NumCPUs() */ + +namespace base { + +// NOTE on the Lock-state values: +// +// kSpinLockFree represents the unlocked state +// kSpinLockHeld represents the locked state with no waiters +// +// Values greater than kSpinLockHeld represent the locked state with waiters, +// where the value is the time the current lock holder had to +// wait before obtaining the lock. The kSpinLockSleeper state is a special +// "locked with waiters" state that indicates that a sleeper needs to +// be woken, but the thread that just released the lock didn't wait. + +static int adaptive_spin_count = 0; + +const base::LinkerInitialized SpinLock::LINKER_INITIALIZED = + base::LINKER_INITIALIZED; + +namespace { +struct SpinLock_InitHelper { + SpinLock_InitHelper() { + // On multi-cpu machines, spin for longer before yielding + // the processor or sleeping. Reduces idle time significantly. + if (base::NumCPUs() > 1) { + adaptive_spin_count = 1000; + } + } +}; + +// Hook into global constructor execution: +// We do not do adaptive spinning before that, +// but nothing lock-intensive should be going on at that time. 
+static SpinLock_InitHelper init_helper; + +} // unnamed namespace + +// Monitor the lock to see if its value changes within some time period +// (adaptive_spin_count loop iterations). A timestamp indicating +// when the thread initially started waiting for the lock is passed in via +// the initial_wait_timestamp value. The total wait time in cycles for the +// lock is returned in the wait_cycles parameter. The last value read +// from the lock is returned from the method. +Atomic32 SpinLock::SpinLoop(int64 initial_wait_timestamp, + Atomic32* wait_cycles) { + int c = adaptive_spin_count; + while (base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree && --c > 0) { + base::subtle::PauseCPU(); + } + Atomic32 spin_loop_wait_cycles = CalculateWaitCycles(initial_wait_timestamp); + Atomic32 lock_value = + base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + spin_loop_wait_cycles); + *wait_cycles = spin_loop_wait_cycles; + return lock_value; +} + +void SpinLock::SlowLock() { + // The lock was not obtained initially, so this thread needs to wait for + // it. Record the current timestamp in the local variable wait_start_time + // so the total wait time can be stored in the lockword once this thread + // obtains the lock. + int64 wait_start_time = CycleClock::Now(); + Atomic32 wait_cycles; + Atomic32 lock_value = SpinLoop(wait_start_time, &wait_cycles); + + int lock_wait_call_count = 0; + while (lock_value != kSpinLockFree) { + // If the lock is currently held, but not marked as having a sleeper, mark + // it as having a sleeper. + if (lock_value == kSpinLockHeld) { + // Here, just "mark" that the thread is going to sleep. Don't store the + // lock wait time in the lock as that will cause the current lock + // owner to think it experienced contention. + lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_, + kSpinLockHeld, + kSpinLockSleeper); + if (lock_value == kSpinLockHeld) { + // Successfully transitioned to kSpinLockSleeper. 
Pass + // kSpinLockSleeper to the SpinLockWait routine to properly indicate + // the last lock_value observed. + lock_value = kSpinLockSleeper; + } else if (lock_value == kSpinLockFree) { + // Lock is free again, so try and acquire it before sleeping. The + // new lock state will be the number of cycles this thread waited if + // this thread obtains the lock. + lock_value = base::subtle::Acquire_CompareAndSwap(&lockword_, + kSpinLockFree, + wait_cycles); + continue; // skip the delay at the end of the loop + } + } + + // Wait for an OS specific delay. + base::internal::SpinLockDelay(&lockword_, lock_value, + ++lock_wait_call_count); + // Spin again after returning from the wait routine to give this thread + // some chance of obtaining the lock. + lock_value = SpinLoop(wait_start_time, &wait_cycles); + } +} + +// The wait time for contentionz lock profiling must fit into 32 bits. +// However, the lower 32-bits of the cycle counter wrap around too quickly +// with high frequency processors, so a right-shift by 7 is performed to +// quickly divide the cycles by 128. Using these 32 bits, reduces the +// granularity of time measurement to 128 cycles, and loses track +// of wait time for waits greater than 109 seconds on a 5 GHz machine +// [(2^32 cycles/5 Ghz)*128 = 109.95 seconds]. Waits this long should be +// very rare and the reduced granularity should not be an issue given +// processors in the Google fleet operate at a minimum of one billion +// cycles/sec. +enum { PROFILE_TIMESTAMP_SHIFT = 7 }; + +void SpinLock::SlowUnlock(uint64 wait_cycles) { + base::internal::SpinLockWake(&lockword_, false); // wake waiter if necessary + + // Collect contentionz profile info, expanding the wait_cycles back out to + // the full value. If wait_cycles is <= kSpinLockSleeper, then no wait + // was actually performed, so don't record the wait time. 
Note, that the + // CalculateWaitCycles method adds in kSpinLockSleeper cycles + // unconditionally to guarantee the wait time is not kSpinLockFree or + // kSpinLockHeld. The adding in of these small number of cycles may + // overestimate the contention by a slight amount 50% of the time. However, + // if this code tried to correct for that addition by subtracting out the + // kSpinLockSleeper amount that would underestimate the contention slightly + // 50% of the time. Both ways get the wrong answer, so the code + // overestimates to be more conservative. Overestimating also makes the code + // a little simpler. + // + if (wait_cycles > kSpinLockSleeper) { + gutil::SubmitSpinLockProfileData(this, + wait_cycles << PROFILE_TIMESTAMP_SHIFT); + } +} + +inline int32 SpinLock::CalculateWaitCycles(int64 wait_start_time) { + int32 wait_cycles = ((CycleClock::Now() - wait_start_time) >> + PROFILE_TIMESTAMP_SHIFT); + // The number of cycles waiting for the lock is used as both the + // wait_cycles and lock value, so it can't be kSpinLockFree or + // kSpinLockHeld. Make sure the value returned is at least + // kSpinLockSleeper. + wait_cycles |= kSpinLockSleeper; + return wait_cycles; +} + +} // namespace base diff --git a/src/kudu/gutil/spinlock.h b/src/kudu/gutil/spinlock.h new file mode 100644 index 000000000000..fcd6287594dd --- /dev/null +++ b/src/kudu/gutil/spinlock.h @@ -0,0 +1,151 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2006, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Sanjay Ghemawat + */ + +// SpinLock is async signal safe. +// If used within a signal handler, all lock holders +// should block the signal even outside the signal handler. + +#ifndef BASE_SPINLOCK_H_ +#define BASE_SPINLOCK_H_ + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/thread_annotations.h" + +// This isn't originally in the base:: namespace in tcmalloc, +// but tcmalloc inadvertently exports these symbols. So, if we +// don't namespace it differently, we conflict. +namespace base { + +class LOCKABLE SpinLock { + public: + SpinLock() : lockword_(kSpinLockFree) { } + + // Special constructor for use with static SpinLock objects. 
E.g., + // + // static SpinLock lock(base::LINKER_INITIALIZED); + // + // When intialized using this constructor, we depend on the fact + // that the linker has already initialized the memory appropriately. + // A SpinLock constructed like this can be freely used from global + // initializers without worrying about the order in which global + // initializers run. + explicit SpinLock(base::LinkerInitialized /*x*/) { + // Does nothing; lockword_ is already initialized + } + + // Acquire this SpinLock. + // TODO(csilvers): uncomment the annotation when we figure out how to + // support this macro with 0 args (see thread_annotations.h) + inline void Lock() /*EXCLUSIVE_LOCK_FUNCTION()*/ { + if (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + kSpinLockHeld) != kSpinLockFree) { + SlowLock(); + } + ANNOTATE_RWLOCK_ACQUIRED(this, 1); + } + + // Try to acquire this SpinLock without blocking and return true if the + // acquisition was successful. If the lock was not acquired, false is + // returned. If this SpinLock is free at the time of the call, TryLock + // will return true with high probability. + inline bool TryLock() EXCLUSIVE_TRYLOCK_FUNCTION(true) { + bool res = + (base::subtle::Acquire_CompareAndSwap(&lockword_, kSpinLockFree, + kSpinLockHeld) == kSpinLockFree); + if (res) { + ANNOTATE_RWLOCK_ACQUIRED(this, 1); + } + return res; + } + + // Release this SpinLock, which must be held by the calling thread. + // TODO(csilvers): uncomment the annotation when we figure out how to + // support this macro with 0 args (see thread_annotations.h) + inline void Unlock() /*UNLOCK_FUNCTION()*/ { + ANNOTATE_RWLOCK_RELEASED(this, 1); + uint64 wait_cycles = static_cast( + base::subtle::Release_AtomicExchange(&lockword_, kSpinLockFree)); + if (wait_cycles != kSpinLockHeld) { + // Collect contentionz profile info, and speed the wakeup of any waiter. + // The wait_cycles value indicates how long this thread spent waiting + // for the lock. 
+ SlowUnlock(wait_cycles); + } + } + + // Determine if the lock is held. When the lock is held by the invoking + // thread, true will always be returned. Intended to be used as + // CHECK(lock.IsHeld()). + inline bool IsHeld() const { + return base::subtle::NoBarrier_Load(&lockword_) != kSpinLockFree; + } + + static const base::LinkerInitialized LINKER_INITIALIZED; // backwards compat + private: + enum { kSpinLockFree = 0 }; + enum { kSpinLockHeld = 1 }; + enum { kSpinLockSleeper = 2 }; + + volatile Atomic32 lockword_; + + void SlowLock(); + void SlowUnlock(uint64 wait_cycles); + Atomic32 SpinLoop(int64 initial_wait_timestamp, Atomic32* wait_cycles); + inline int32 CalculateWaitCycles(int64 wait_start_time); + + DISALLOW_COPY_AND_ASSIGN(SpinLock); +}; + +// Corresponding locker object that arranges to acquire a spinlock for +// the duration of a C++ scope. +class SCOPED_LOCKABLE SpinLockHolder { + private: + SpinLock* lock_; + public: + inline explicit SpinLockHolder(SpinLock* l) EXCLUSIVE_LOCK_FUNCTION(l) + : lock_(l) { + l->Lock(); + } + // TODO(csilvers): uncomment the annotation when we figure out how to + // support this macro with 0 args (see thread_annotations.h) + inline ~SpinLockHolder() /*UNLOCK_FUNCTION()*/ { lock_->Unlock(); } +}; +// Catch bug where variable name is omitted, e.g. SpinLockHolder (&lock); +#define SpinLockHolder(x) COMPILE_ASSERT(0, spin_lock_decl_missing_var_name) + +} // namespace base + +#endif // BASE_SPINLOCK_H_ diff --git a/src/kudu/gutil/spinlock_internal.cc b/src/kudu/gutil/spinlock_internal.cc new file mode 100644 index 000000000000..151b0ddab124 --- /dev/null +++ b/src/kudu/gutil/spinlock_internal.cc @@ -0,0 +1,122 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2010, Google Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// The OS-specific header included below must provide two calls: +// base::internal::SpinLockDelay() and base::internal::SpinLockWake(). +// See spinlock_internal.h for the spec of SpinLockWake(). + +// void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) +// SpinLockDelay() generates an apprproate spin delay on iteration "loop" of a +// spin loop on location *w, whose previously observed value was "value". 
+// SpinLockDelay() may do nothing, may yield the CPU, may sleep a clock tick, +// or may wait for a delay that can be truncated by a call to SpinlockWake(w). +// In all cases, it must return in bounded time even if SpinlockWake() is not +// called. + +#include "kudu/gutil/spinlock_internal.h" + +// forward declaration for use by spinlock_*-inl.h +namespace base { namespace internal { static int SuggestedDelayNS(int loop); }} + +#if defined(_WIN32) +#include "kudu/gutil/spinlock_win32-inl.h" +#elif defined(__linux__) +#include "kudu/gutil/spinlock_linux-inl.h" +#else +#include "kudu/gutil/spinlock_posix-inl.h" +#endif + +namespace base { +namespace internal { + +// See spinlock_internal.h for spec. +int32 SpinLockWait(volatile Atomic32 *w, int n, + const SpinLockWaitTransition trans[]) { + int32 v; + bool done = false; + for (int loop = 0; !done; loop++) { + v = base::subtle::Acquire_Load(w); + int i; + for (i = 0; i != n && v != trans[i].from; i++) { + } + if (i == n) { + SpinLockDelay(w, v, loop); // no matching transition + } else if (trans[i].to == v || // null transition + base::subtle::Acquire_CompareAndSwap(w, v, trans[i].to) == v) { + done = trans[i].done; + } + } + return v; +} + +// Return a suggested delay in nanoseconds for iteration number "loop" +static int SuggestedDelayNS(int loop) { + // Weak pseudo-random number generator to get some spread between threads + // when many are spinning. +#ifdef BASE_HAS_ATOMIC64 + static base::subtle::Atomic64 rand; + uint64 r = base::subtle::NoBarrier_Load(&rand); + r = 0x5deece66dLL * r + 0xb; // numbers from nrand48() + base::subtle::NoBarrier_Store(&rand, r); + + r <<= 16; // 48-bit random number now in top 48-bits. + if (loop < 0 || loop > 32) { // limit loop to 0..32 + loop = 32; + } + // loop>>3 cannot exceed 4 because loop cannot exceed 32. + // Select top 20..24 bits of lower 48 bits, + // giving approximately 0ms to 16ms. + // Mean is exponential in loop for first 32 iterations, then 8ms. 
+ // The futex path multiplies this by 16, since we expect explicit wakeups + // almost always on that path. + return r >> (44 - (loop >> 3)); +#else + static Atomic32 rand; + uint32 r = base::subtle::NoBarrier_Load(&rand); + r = 0x343fd * r + 0x269ec3; // numbers from MSVC++ + base::subtle::NoBarrier_Store(&rand, r); + + r <<= 1; // 31-bit random number now in top 31-bits. + if (loop < 0 || loop > 32) { // limit loop to 0..32 + loop = 32; + } + // loop>>3 cannot exceed 4 because loop cannot exceed 32. + // Select top 20..24 bits of lower 31 bits, + // giving approximately 0ms to 16ms. + // Mean is exponential in loop for first 32 iterations, then 8ms. + // The futex path multiplies this by 16, since we expect explicit wakeups + // almost always on that path. + return r >> (12 - (loop >> 3)); +#endif +} + +} // namespace internal +} // namespace base diff --git a/src/kudu/gutil/spinlock_internal.h b/src/kudu/gutil/spinlock_internal.h new file mode 100644 index 000000000000..c2358933bc0b --- /dev/null +++ b/src/kudu/gutil/spinlock_internal.h @@ -0,0 +1,64 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * This file is an internal part spinlock.cc and once.cc + * It may not be used directly by code outside of //base. + */ + +#ifndef BASE_SPINLOCK_INTERNAL_H_ +#define BASE_SPINLOCK_INTERNAL_H_ + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/atomicops.h" + +namespace base { +namespace internal { + +// SpinLockWait() waits until it can perform one of several transitions from +// "from" to "to". It returns when it performs a transition where done==true. +struct SpinLockWaitTransition { + int32 from; + int32 to; + bool done; +}; + +// Wait until *w can transition from trans[i].from to trans[i].to for some i +// satisfying 0<=i +#include +#include +#include +#include "kudu/gutil/linux_syscall_support.h" + +#define FUTEX_WAIT 0 +#define FUTEX_WAKE 1 +#define FUTEX_PRIVATE_FLAG 128 + +static bool have_futex; +static int futex_private_flag = FUTEX_PRIVATE_FLAG; + +namespace { +static struct InitModule { + InitModule() { + int x = 0; + // futexes are ints, so we can use them only when + // that's the same size as the lockword_ in SpinLock. 
+#ifdef __arm__ + // ARM linux doesn't support sys_futex1(void*, int, int, struct timespec*); + have_futex = 0; +#else + have_futex = (sizeof (Atomic32) == sizeof (int) && + sys_futex(&x, FUTEX_WAKE, 1, 0) >= 0); +#endif + if (have_futex && + sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, 0) < 0) { + futex_private_flag = 0; + } + } +} init_module; + +} // anonymous namespace + + +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + if (loop != 0) { + int save_errno = errno; + struct timespec tm; + tm.tv_sec = 0; + if (have_futex) { + tm.tv_nsec = base::internal::SuggestedDelayNS(loop); + } else { + tm.tv_nsec = 2000001; // above 2ms so linux 2.4 doesn't spin + } + if (have_futex) { + tm.tv_nsec *= 16; // increase the delay; we expect explicit wakeups + sys_futex(reinterpret_cast(const_cast(w)), + FUTEX_WAIT | futex_private_flag, + value, reinterpret_cast(&tm)); + } else { + nanosleep(&tm, NULL); + } + errno = save_errno; + } +} + +void SpinLockWake(volatile Atomic32 *w, bool all) { + if (have_futex) { + sys_futex(reinterpret_cast(const_cast(w)), + FUTEX_WAKE | futex_private_flag, all? INT_MAX : 1, 0); + } +} + +} // namespace internal +} // namespace base diff --git a/src/kudu/gutil/spinlock_posix-inl.h b/src/kudu/gutil/spinlock_posix-inl.h new file mode 100644 index 000000000000..b34c9912b393 --- /dev/null +++ b/src/kudu/gutil/spinlock_posix-inl.h @@ -0,0 +1,62 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2009, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --- + * This file is a Posix-specific part of spinlock_internal.cc + */ + +#include +#if defined(HAVE_SCHED_H) || defined(__APPLE__) +#include /* For sched_yield() */ +#endif +#include /* For nanosleep() */ + +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + int save_errno = errno; + if (loop == 0) { + } else if (loop == 1) { + sched_yield(); + } else { + struct timespec tm; + tm.tv_sec = 0; + tm.tv_nsec = base::internal::SuggestedDelayNS(loop); + nanosleep(&tm, NULL); + } + errno = save_errno; +} + +void SpinLockWake(volatile Atomic32 *w, bool all) { +} + +} // namespace internal +} // namespace base diff --git a/src/kudu/gutil/spinlock_win32-inl.h b/src/kudu/gutil/spinlock_win32-inl.h new file mode 100644 index 000000000000..956b9653e6db --- /dev/null +++ b/src/kudu/gutil/spinlock_win32-inl.h @@ -0,0 +1,54 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2009, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * This file is a Win32-specific part of spinlock_internal.cc + */ + + +#include + +namespace base { +namespace internal { + +void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { + if (loop == 0) { + } else if (loop == 1) { + Sleep(0); + } else { + Sleep(base::internal::SuggestedDelayNS(loop) / 1000000); + } +} + +void SpinLockWake(volatile Atomic32 *w, bool all) { +} + +} // namespace internal +} // namespace base diff --git a/src/kudu/gutil/stl_util.h b/src/kudu/gutil/stl_util.h new file mode 100644 index 000000000000..68e31e9cbe1b --- /dev/null +++ b/src/kudu/gutil/stl_util.h @@ -0,0 +1,977 @@ +// Copyright 2002 Google Inc. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// --- +// +// +// STL utility functions. Usually, these replace built-in, but slow(!), +// STL functions with more efficient versions or provide a more convenient +// and Google friendly API. +// + +#ifndef UTIL_GTL_STL_UTIL_H_ +#define UTIL_GTL_STL_UTIL_H_ + +#include +#include // for memcpy +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include +#include +using std::deque; +#include +using std::binary_function; +using std::less; +#include +using std::back_insert_iterator; +using std::iterator_traits; +#include +#include +using std::string; +#include +using std::vector; + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/algorithm.h" + +// Sort and remove duplicates of an STL vector or deque. +template +void STLSortAndRemoveDuplicates(T *v) { + sort(v->begin(), v->end()); + v->erase(unique(v->begin(), v->end()), v->end()); +} + +// Clear internal memory of an STL object. +// STL clear()/reserve(0) does not always free internal memory allocated +// This function uses swap/destructor to ensure the internal memory is freed. +template void STLClearObject(T* obj) { + T tmp; + tmp.swap(*obj); + obj->reserve(0); // this is because sometimes "T tmp" allocates objects with + // memory (arena implementation?). use reserve() + // to clear() even if it doesn't always work +} + +// Specialization for deque. Same as STLClearObject but doesn't call reserve +// since deque doesn't have reserve. +template +void STLClearObject(deque* obj) { + deque tmp; + tmp.swap(*obj); +} + +// Reduce memory usage on behalf of object if its capacity is greater +// than or equal to "limit", which defaults to 2^20. 
+template inline void STLClearIfBig(T* obj, size_t limit = 1<<20) { + if (obj->capacity() >= limit) { + STLClearObject(obj); + } else { + obj->clear(); + } +} + +// Specialization for deque, which doesn't implement capacity(). +template +inline void STLClearIfBig(deque* obj, size_t limit = 1<<20) { + if (obj->size() >= limit) { + STLClearObject(obj); + } else { + obj->clear(); + } +} + +// Reduce the number of buckets in a hash_set or hash_map back to the +// default if the current number of buckets is "limit" or more. +// +// Suppose you repeatedly fill and clear a hash_map or hash_set. If +// you ever insert a lot of items, then your hash table will have lots +// of buckets thereafter. (The number of buckets is not reduced when +// the table is cleared.) Having lots of buckets is good if you +// insert comparably many items in every iteration, because you'll +// reduce collisions and table resizes. But having lots of buckets is +// bad if you insert few items in most subsequent iterations, because +// repeatedly clearing out all those buckets can get expensive. +// +// One solution is to call STLClearHashIfBig() with a "limit" value +// that is a small multiple of the typical number of items in your +// table. In the common case, this is equivalent to an ordinary +// clear. In the rare case where you insert a lot of items, the +// number of buckets is reset to the default to keep subsequent clear +// operations cheap. Note that the default number of buckets is 193 +// in the Gnu library implementation as of Jan '08. +template inline void STLClearHashIfBig(T *obj, size_t limit) { + if (obj->bucket_count() >= limit) { + T tmp; + tmp.swap(*obj); + } else { + obj->clear(); + } +} + +// Reserve space for STL object. +// STL's reserve() will always copy. 
+// This function avoid the copy if we already have capacity +template void STLReserveIfNeeded(T* obj, int new_size) { + if (obj->capacity() < new_size) // increase capacity + obj->reserve(new_size); + else if (obj->size() > new_size) // reduce size + obj->resize(new_size); +} + +// STLDeleteContainerPointers() +// For a range within a container of pointers, calls delete +// (non-array version) on these pointers. +// NOTE: for these three functions, we could just implement a DeleteObject +// functor and then call for_each() on the range and functor, but this +// requires us to pull in all of , which seems expensive. +// For hash_[multi]set, it is important that this deletes behind the iterator +// because the hash_set may call the hash function on the iterator when it is +// advanced, which could result in the hash function trying to deference a +// stale pointer. +// NOTE: If you're calling this on an entire container, you probably want +// to call STLDeleteElements(&container) instead, or use an ElementDeleter. +template +void STLDeleteContainerPointers(ForwardIterator begin, + ForwardIterator end) { + while (begin != end) { + ForwardIterator temp = begin; + ++begin; + delete *temp; + } +} + +// STLDeleteContainerPairPointers() +// For a range within a container of pairs, calls delete +// (non-array version) on BOTH items in the pairs. +// NOTE: Like STLDeleteContainerPointers, it is important that this deletes +// behind the iterator because if both the key and value are deleted, the +// container may call the hash function on the iterator when it is advanced, +// which could result in the hash function trying to dereference a stale +// pointer. 
+template +void STLDeleteContainerPairPointers(ForwardIterator begin, + ForwardIterator end) { + while (begin != end) { + ForwardIterator temp = begin; + ++begin; + delete temp->first; + delete temp->second; + } +} + +// STLDeleteContainerPairFirstPointers() +// For a range within a container of pairs, calls delete (non-array version) +// on the FIRST item in the pairs. +// NOTE: Like STLDeleteContainerPointers, deleting behind the iterator. +template +void STLDeleteContainerPairFirstPointers(ForwardIterator begin, + ForwardIterator end) { + while (begin != end) { + ForwardIterator temp = begin; + ++begin; + delete temp->first; + } +} + +// STLDeleteContainerPairSecondPointers() +// For a range within a container of pairs, calls delete +// (non-array version) on the SECOND item in the pairs. +// NOTE: Like STLDeleteContainerPointers, deleting behind the iterator. +// Deleting the value does not always invalidate the iterator, but it may +// do so if the key is a pointer into the value object. +// NOTE: If you're calling this on an entire container, you probably want +// to call STLDeleteValues(&container) instead, or use ValueDeleter. +template +void STLDeleteContainerPairSecondPointers(ForwardIterator begin, + ForwardIterator end) { + while (begin != end) { + ForwardIterator temp = begin; + ++begin; + delete temp->second; + } +} + +template +inline void STLAssignToVector(vector* vec, + const T* ptr, + size_t n) { + vec->resize(n); + if (n == 0) return; + memcpy(&vec->front(), ptr, n*sizeof(T)); +} + +// Not faster; but we need the specialization so the function works at all +// on the vector specialization. +template<> +inline void STLAssignToVector(vector* vec, + const bool* ptr, + size_t n) { + vec->clear(); + if (n == 0) return; + vec->insert(vec->begin(), ptr, ptr + n); +} + +/***** Hack to allow faster assignment to a vector *****/ + +// This routine speeds up an assignment of 32 bytes to a vector from +// about 250 cycles per assignment to about 140 cycles. 
+// +// Usage: +// STLAssignToVectorChar(&vec, ptr, size); +// STLAssignToString(&str, ptr, size); + +inline void STLAssignToVectorChar(vector* vec, + const char* ptr, + size_t n) { + STLAssignToVector(vec, ptr, n); +} + +// A struct that mirrors the GCC4 implementation of a string. See: +// /usr/crosstool/v8/gcc-4.1.0-glibc-2.2.2/i686-unknown-linux-gnu/include/c++/4.1.0/ext/sso_string_base.h +struct InternalStringRepGCC4 { + char* _M_data; + size_t _M_string_length; + + enum { _S_local_capacity = 15 }; + + union { + char _M_local_data[_S_local_capacity + 1]; + size_t _M_allocated_capacity; + }; +}; + +// Like str->resize(new_size), except any new characters added to +// "*str" as a result of resizing may be left uninitialized, rather +// than being filled with '0' bytes. Typically used when code is then +// going to overwrite the backing store of the string with known data. +inline void STLStringResizeUninitialized(string* s, size_t new_size) { + if (sizeof(*s) == sizeof(InternalStringRepGCC4)) { + if (new_size > s->capacity()) { + s->reserve(new_size); + } + // The line below depends on the layout of 'string'. THIS IS + // NON-PORTABLE CODE. If our STL implementation changes, we will + // need to change this as well. + InternalStringRepGCC4* rep = reinterpret_cast(s); + assert(rep->_M_data == s->data()); + assert(rep->_M_string_length == s->size()); + + // We have to null-terminate the string for c_str() to work properly. + // So we leave the actual contents of the string uninitialized, but + // we set the byte one past the new end of the string to '\0' + const_cast(s->data())[new_size] = '\0'; + rep->_M_string_length = new_size; + } else { + // Slow path: have to reallocate stuff, or an unknown string rep + s->resize(new_size); + } +} + +// Returns true if the string implementation supports a resize where +// the new characters added to the string are left untouched. 
+inline bool STLStringSupportsNontrashingResize(const string& s) { + return (sizeof(s) == sizeof(InternalStringRepGCC4)); +} + +inline void STLAssignToString(string* str, const char* ptr, size_t n) { + STLStringResizeUninitialized(str, n); + if (n == 0) return; + memcpy(&*str->begin(), ptr, n); +} + +inline void STLAppendToString(string* str, const char* ptr, size_t n) { + if (n == 0) return; + size_t old_size = str->size(); + STLStringResizeUninitialized(str, old_size + n); + memcpy(&*str->begin() + old_size, ptr, n); +} + +// To treat a possibly-empty vector as an array, use these functions. +// If you know the array will never be empty, you can use &*v.begin() +// directly, but that is allowed to dump core if v is empty. This +// function is the most efficient code that will work, taking into +// account how our STL is actually implemented. THIS IS NON-PORTABLE +// CODE, so call us instead of repeating the nonportable code +// everywhere. If our STL implementation changes, we will need to +// change this as well. + +template +inline T* vector_as_array(vector* v) { +# ifdef NDEBUG + return &*v->begin(); +# else + return v->empty() ? NULL : &*v->begin(); +# endif +} + +template +inline const T* vector_as_array(const vector* v) { +# ifdef NDEBUG + return &*v->begin(); +# else + return v->empty() ? NULL : &*v->begin(); +# endif +} + +// Return a mutable char* pointing to a string's internal buffer, +// which may not be null-terminated. Writing through this pointer will +// modify the string. +// +// string_as_array(&str)[i] is valid for 0 <= i < str.size() until the +// next call to a string method that invalidates iterators. +// +// Prior to C++11, there was no standard-blessed way of getting a mutable +// reference to a string's internal buffer. The requirement that string be +// contiguous is officially part of the C++11 standard [string.require]/5. +// According to Matt Austern, this should already work on all current C++98 +// implementations. 
+inline char* string_as_array(string* str) { + // DO NOT USE const_cast(str->data())! See the unittest for why. + return str->empty() ? NULL : &*str->begin(); +} + +// These are methods that test two hash maps/sets for equality. These exist +// because the == operator in the STL can return false when the maps/sets +// contain identical elements. This is because it compares the internal hash +// tables which may be different if the order of insertions and deletions +// differed. + +template +inline bool +HashSetEquality(const HashSet& set_a, + const HashSet& set_b) { + if (set_a.size() != set_b.size()) return false; + for (typename HashSet::const_iterator i = set_a.begin(); + i != set_a.end(); + ++i) + if (set_b.find(*i) == set_b.end()) return false; + return true; +} + +template +inline bool +HashMapEquality(const HashMap& map_a, + const HashMap& map_b) { + if (map_a.size() != map_b.size()) return false; + for (typename HashMap::const_iterator i = map_a.begin(); + i != map_a.end(); ++i) { + typename HashMap::const_iterator j = map_b.find(i->first); + if (j == map_b.end()) return false; + if (i->second != j->second) return false; + } + return true; +} + +// The following functions are useful for cleaning up STL containers +// whose elements point to allocated memory. + +// STLDeleteElements() deletes all the elements in an STL container and clears +// the container. This function is suitable for use with a vector, set, +// hash_set, or any other STL container which defines sensible begin(), end(), +// and clear() methods. +// +// If container is NULL, this function is a no-op. +// +// As an alternative to calling STLDeleteElements() directly, consider +// ElementDeleter (defined below), which ensures that your container's elements +// are deleted when the ElementDeleter goes out of scope. 
+template +void STLDeleteElements(T *container) { + if (!container) return; + STLDeleteContainerPointers(container->begin(), container->end()); + container->clear(); +} + +// Given an STL container consisting of (key, value) pairs, STLDeleteValues +// deletes all the "value" components and clears the container. Does nothing +// in the case it's given a NULL pointer. +template +void STLDeleteValues(T *v) { + if (!v) return; + STLDeleteContainerPairSecondPointers(v->begin(), v->end()); + v->clear(); +} + + +// ElementDeleter and ValueDeleter provide a convenient way to delete all +// elements or values from STL containers when they go out of scope. This +// greatly simplifies code that creates temporary objects and has multiple +// return statements. Example: +// +// vector tmp_proto; +// ElementDeleter d(&tmp_proto); +// if (...) return false; +// ... +// return success; + +// A very simple interface that simply provides a virtual destructor. It is +// used as a non-templated base class for the TemplatedElementDeleter and +// TemplatedValueDeleter classes. Clients should not typically use this class +// directly. +class BaseDeleter { + public: + virtual ~BaseDeleter() {} + + protected: + BaseDeleter() {} + + private: + DISALLOW_EVIL_CONSTRUCTORS(BaseDeleter); +}; + +// Given a pointer to an STL container, this class will delete all the element +// pointers when it goes out of scope. Clients should typically use +// ElementDeleter rather than invoking this class directly. +template +class TemplatedElementDeleter : public BaseDeleter { + public: + explicit TemplatedElementDeleter(STLContainer *ptr) + : container_ptr_(ptr) { + } + + virtual ~TemplatedElementDeleter() { + STLDeleteElements(container_ptr_); + } + + private: + STLContainer *container_ptr_; + + DISALLOW_EVIL_CONSTRUCTORS(TemplatedElementDeleter); +}; + +// Like TemplatedElementDeleter, this class will delete element pointers from a +// container when it goes out of scope. 
However, it is much nicer to use, +// since the class itself is not templated. +class ElementDeleter { + public: + template + explicit ElementDeleter(STLContainer *ptr) + : deleter_(new TemplatedElementDeleter(ptr)) { + } + + ~ElementDeleter() { + delete deleter_; + } + + private: + BaseDeleter *deleter_; + + DISALLOW_EVIL_CONSTRUCTORS(ElementDeleter); +}; + +// Given a pointer to an STL container this class will delete all the value +// pointers when it goes out of scope. Clients should typically use +// ValueDeleter rather than invoking this class directly. +template +class TemplatedValueDeleter : public BaseDeleter { + public: + explicit TemplatedValueDeleter(STLContainer *ptr) + : container_ptr_(ptr) { + } + + virtual ~TemplatedValueDeleter() { + STLDeleteValues(container_ptr_); + } + + private: + STLContainer *container_ptr_; + + DISALLOW_EVIL_CONSTRUCTORS(TemplatedValueDeleter); +}; + +// Similar to ElementDeleter, but wraps a TemplatedValueDeleter rather than an +// TemplatedElementDeleter. +class ValueDeleter { + public: + template + explicit ValueDeleter(STLContainer *ptr) + : deleter_(new TemplatedValueDeleter(ptr)) { + } + + ~ValueDeleter() { + delete deleter_; + } + + private: + BaseDeleter *deleter_; + + DISALLOW_EVIL_CONSTRUCTORS(ValueDeleter); +}; + + +// STLElementDeleter and STLValueDeleter are similar to ElementDeleter and +// ValueDeleter, except that: +// - The classes are templated, making them less convenient to use. +// - Their destructors are not virtual, making them potentially more efficient. +// New code should typically use ElementDeleter and ValueDeleter unless +// efficiency is a large concern. 
template<class STLContainer> class STLElementDeleter {
 public:
  STLElementDeleter(STLContainer *ptr) : container_ptr_(ptr) {}
  ~STLElementDeleter() { STLDeleteElements(container_ptr_); }
 private:
  STLContainer *container_ptr_;
};

template<class STLContainer> class STLValueDeleter {
 public:
  STLValueDeleter(STLContainer *ptr) : container_ptr_(ptr) {}
  ~STLValueDeleter() { STLDeleteValues(container_ptr_); }
 private:
  STLContainer *container_ptr_;
};


// STLSet{Difference,SymmetricDifference,Union,Intersection}(A a, B b, C *c)
// *APPEND* the set {difference, symmetric difference, union, intersection} of
// the two sets a and b to c.
// STLSet{Difference,SymmetricDifference,Union,Intersection}(T a, T b) do the
// same but return the result by value rather than by the third pointer
// argument. The result type is the same as both of the inputs in the two
// argument case.
//
// Requires:
//   a and b must be STL like containers that contain sorted data (as defined
//   by the < operator).
//   For the 3 argument version &a == c or &b == c are disallowed. In those
//   cases the 2 argument version is probably what you want anyway:
//   a = STLSetDifference(a, b);
//
// These function are convenience functions. The code they implement is
// trivial (at least for now). The STL incantations they wrap are just too
// verbose for programmers to use then and they are unpleasant to the eye.
// Without these convenience versions people will simply keep writing one-off
// for loops which are harder to read and more error prone.
//
// Note that for initial construction of an object it is just as efficient to
// use the 2 argument version as the 3 version due to RVO (return value
// optimization) of modern C++ compilers:
//   set<int> c = STLSetDifference(a, b);
// is an example of where RVO comes into play.
template<typename SortedSTLContainerA,
         typename SortedSTLContainerB,
         typename SortedSTLContainerC>
void STLSetDifference(const SortedSTLContainerA &a,
                      const SortedSTLContainerB &b,
                      SortedSTLContainerC *c) {
  // The qualified name avoids an ambiguity error, particularly with C++11:
  assert(std::is_sorted(a.begin(), a.end()));
  assert(std::is_sorted(b.begin(), b.end()));
  assert(static_cast<const void *>(&a) !=
         static_cast<const void *>(c));
  assert(static_cast<const void *>(&b) !=
         static_cast<const void *>(c));
  std::set_difference(a.begin(), a.end(), b.begin(), b.end(),
                      std::inserter(*c, c->end()));
}

template<typename SortedSTLContainer>
SortedSTLContainer STLSetDifference(const SortedSTLContainer &a,
                                    const SortedSTLContainer &b) {
  SortedSTLContainer c;
  STLSetDifference(a, b, &c);
  return c;
}

template<typename SortedSTLContainerA,
         typename SortedSTLContainerB,
         typename SortedSTLContainerC>
void STLSetUnion(const SortedSTLContainerA &a,
                 const SortedSTLContainerB &b,
                 SortedSTLContainerC *c) {
  assert(std::is_sorted(a.begin(), a.end()));
  assert(std::is_sorted(b.begin(), b.end()));
  assert(static_cast<const void *>(&a) !=
         static_cast<const void *>(c));
  assert(static_cast<const void *>(&b) !=
         static_cast<const void *>(c));
  std::set_union(a.begin(), a.end(), b.begin(), b.end(),
                 std::inserter(*c, c->end()));
}

template<typename SortedSTLContainerA,
         typename SortedSTLContainerB,
         typename SortedSTLContainerC>
void STLSetSymmetricDifference(const SortedSTLContainerA &a,
                               const SortedSTLContainerB &b,
                               SortedSTLContainerC *c) {
  assert(std::is_sorted(a.begin(), a.end()));
  assert(std::is_sorted(b.begin(), b.end()));
  assert(static_cast<const void *>(&a) !=
         static_cast<const void *>(c));
  assert(static_cast<const void *>(&b) !=
         static_cast<const void *>(c));
  std::set_symmetric_difference(a.begin(), a.end(), b.begin(), b.end(),
                                std::inserter(*c, c->end()));
}

template<typename SortedSTLContainer>
SortedSTLContainer STLSetSymmetricDifference(const SortedSTLContainer &a,
                                             const SortedSTLContainer &b) {
  SortedSTLContainer c;
  STLSetSymmetricDifference(a, b, &c);
  return c;
}

template<typename SortedSTLContainer>
SortedSTLContainer STLSetUnion(const SortedSTLContainer &a,
                               const SortedSTLContainer &b) {
  SortedSTLContainer c;
  STLSetUnion(a, b, &c);
  return c;
}

template<typename SortedSTLContainerA,
         typename SortedSTLContainerB,
         typename SortedSTLContainerC>
void STLSetIntersection(const SortedSTLContainerA &a,
                        const SortedSTLContainerB &b,
                        SortedSTLContainerC *c) {
  assert(std::is_sorted(a.begin(), a.end()));
  assert(std::is_sorted(b.begin(), b.end()));
  assert(static_cast<const void *>(&a) !=
         static_cast<const void *>(c));
  assert(static_cast<const void *>(&b) !=
         static_cast<const void *>(c));
  std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                        std::inserter(*c, c->end()));
}

template<typename SortedSTLContainer>
SortedSTLContainer STLSetIntersection(const SortedSTLContainer &a,
                                      const SortedSTLContainer &b) {
  SortedSTLContainer c;
  STLSetIntersection(a, b, &c);
  return c;
}

// Similar to STLSet{Union,Intesection,etc}, but simpler because the result is
// always bool.
template<typename SortedSTLContainerA,
         typename SortedSTLContainerB>
bool STLIncludes(const SortedSTLContainerA &a,
                 const SortedSTLContainerB &b) {
  assert(std::is_sorted(a.begin(), a.end()));
  assert(std::is_sorted(b.begin(), b.end()));
  return std::includes(a.begin(), a.end(),
                       b.begin(), b.end());
}

// Functors that compose arbitrary unary and binary functions with a
// function that "projects" one of the members of a pair.
// Specifically, if p1 and p2, respectively, are the functions that
// map a pair to its first and second, respectively, members, the
// table below summarizes the functions that can be constructed:
//
// * UnaryOperate1st<pair>(f) returns the function x -> f(p1(x))
// * UnaryOperate2nd<pair>(f) returns the function x -> f(p2(x))
// * BinaryOperate1st<pair>(f) returns the function (x,y) -> f(p1(x),p1(y))
// * BinaryOperate2nd<pair>(f) returns the function (x,y) -> f(p2(x),p2(y))
//
// A typical usage for these functions would be when iterating over
// the contents of an STL map. For other sample usage, see the unittest.

template<typename Pair, typename UnaryOp>
class UnaryOperateOnFirst
    : public std::unary_function<Pair, typename UnaryOp::result_type> {
 public:
  UnaryOperateOnFirst() {
  }

  UnaryOperateOnFirst(const UnaryOp& f) : f_(f) {  // TODO(user): explicit?
  }

  typename UnaryOp::result_type operator()(const Pair& p) const {
    return f_(p.first);
  }

 private:
  UnaryOp f_;
};

template<typename Pair, typename UnaryOp>
UnaryOperateOnFirst<Pair, UnaryOp> UnaryOperate1st(const UnaryOp& f) {
  return UnaryOperateOnFirst<Pair, UnaryOp>(f);
}

template<typename Pair, typename UnaryOp>
class UnaryOperateOnSecond
    : public std::unary_function<Pair, typename UnaryOp::result_type> {
 public:
  UnaryOperateOnSecond() {
  }

  UnaryOperateOnSecond(const UnaryOp& f) : f_(f) {  // TODO(user): explicit?
  }

  typename UnaryOp::result_type operator()(const Pair& p) const {
    return f_(p.second);
  }

 private:
  UnaryOp f_;
};

template<typename Pair, typename UnaryOp>
UnaryOperateOnSecond<Pair, UnaryOp> UnaryOperate2nd(const UnaryOp& f) {
  return UnaryOperateOnSecond<Pair, UnaryOp>(f);
}

template<typename Pair, typename BinaryOp>
class BinaryOperateOnFirst
    : public std::binary_function<Pair, Pair, typename BinaryOp::result_type> {
 public:
  BinaryOperateOnFirst() {
  }

  BinaryOperateOnFirst(const BinaryOp& f) : f_(f) {  // TODO(user): explicit?
  }

  typename BinaryOp::result_type operator()(const Pair& p1,
                                            const Pair& p2) const {
    return f_(p1.first, p2.first);
  }

 private:
  BinaryOp f_;
};

// TODO(user): explicit?
template<typename Pair, typename BinaryOp>
BinaryOperateOnFirst<Pair, BinaryOp> BinaryOperate1st(const BinaryOp& f) {
  return BinaryOperateOnFirst<Pair, BinaryOp>(f);
}

template<typename Pair, typename BinaryOp>
class BinaryOperateOnSecond
    : public std::binary_function<Pair, Pair, typename BinaryOp::result_type> {
 public:
  BinaryOperateOnSecond() {
  }

  BinaryOperateOnSecond(const BinaryOp& f) : f_(f) {
  }

  typename BinaryOp::result_type operator()(const Pair& p1,
                                            const Pair& p2) const {
    return f_(p1.second, p2.second);
  }

 private:
  BinaryOp f_;
};

template<typename Pair, typename BinaryOp>
BinaryOperateOnSecond<Pair, BinaryOp> BinaryOperate2nd(const BinaryOp& f) {
  return BinaryOperateOnSecond<Pair, BinaryOp>(f);
}

// Functor that composes a binary functor h from an arbitrary binary functor
// f and two unary functors g1, g2, so that:
//
// BinaryCompose1(f, g) returns function (x, y) -> f(g(x), g(y))
// BinaryCompose2(f, g1, g2) returns function (x, y) -> f(g1(x), g2(y))
//
// This is a generalization of the BinaryOperate* functors for types other
// than pairs.
+// +// For sample usage, see the unittest. +// +// F has to be a model of AdaptableBinaryFunction. +// G1 and G2 have to be models of AdabtableUnaryFunction. +template +class BinaryComposeBinary : public binary_function { + public: + BinaryComposeBinary(F f, G1 g1, G2 g2) : f_(f), g1_(g1), g2_(g2) { } + + typename F::result_type operator()(typename G1::argument_type x, + typename G2::argument_type y) const { + return f_(g1_(x), g2_(y)); + } + + private: + F f_; + G1 g1_; + G2 g2_; +}; + +template +BinaryComposeBinary BinaryCompose1(F f, G g) { + return BinaryComposeBinary(f, g, g); +} + +template +BinaryComposeBinary BinaryCompose2(F f, G1 g1, G2 g2) { + return BinaryComposeBinary(f, g1, g2); +} + +// This is a wrapper for an STL allocator which keeps a count of the +// active bytes allocated by this class of allocators. This is NOT +// THREAD SAFE. This should only be used in situations where you can +// ensure that only a single thread performs allocation and +// deallocation. +template > +class STLCountingAllocator : public Alloc { + public: + typedef typename Alloc::pointer pointer; + typedef typename Alloc::size_type size_type; + + STLCountingAllocator() : bytes_used_(NULL) { } + STLCountingAllocator(int64* b) : bytes_used_(b) {} // TODO(user): explicit? 
+ + // Constructor used for rebinding + template + STLCountingAllocator(const STLCountingAllocator& x) + : Alloc(x), + bytes_used_(x.bytes_used()) { + } + + pointer allocate(size_type n, std::allocator::const_pointer hint = 0) { + assert(bytes_used_ != NULL); + *bytes_used_ += n * sizeof(T); + return Alloc::allocate(n, hint); + } + + void deallocate(pointer p, size_type n) { + Alloc::deallocate(p, n); + assert(bytes_used_ != NULL); + *bytes_used_ -= n * sizeof(T); + } + + // Rebind allows an allocator to be used for a different type + template struct rebind { + typedef STLCountingAllocator::other> other; + }; + + int64* bytes_used() const { return bytes_used_; } + + private: + int64* bytes_used_; +}; + +// Even though a struct has no data members, it cannot have zero size +// according to the standard. However, "empty base-class +// optimization" allows an empty parent class to add no additional +// size to the object. STLEmptyBaseHandle is a handy way to "stuff" +// objects that are typically empty (e.g., allocators, compare +// objects) into other fields of an object without increasing the size +// of the object. +// +// struct Empty { +// void Method() { } +// }; +// struct OneInt { +// STLEmptyBaseHandle i; +// }; +// +// In the example above, "i.data" refers to the integer field, whereas +// "i" refers to the empty base class. sizeof(OneInt) == sizeof(int) +// despite the fact that sizeof(Empty) > 0. +template +struct STLEmptyBaseHandle : public Base { + template + STLEmptyBaseHandle(const U &b, const Data &d) + : Base(b), + data(d) { + } + Data data; +}; + +// These functions return true if there is some element in the sorted range +// [begin1, end) which is equal to some element in the sorted range [begin2, +// end2). The iterators do not have to be of the same type, but the value types +// must be less-than comparable. (Two elements a,b are considered equal if +// !(a < b) && !(b < a). 
+template +bool SortedRangesHaveIntersection(InputIterator1 begin1, InputIterator1 end1, + InputIterator2 begin2, InputIterator2 end2) { + assert(std::is_sorted(begin1, end1)); + assert(std::is_sorted(begin2, end2)); + while (begin1 != end1 && begin2 != end2) { + if (*begin1 < *begin2) { + ++begin1; + } else if (*begin2 < *begin1) { + ++begin2; + } else { + return true; + } + } + return false; +} + +// This is equivalent to the function above, but using a custom comparison +// function. +template +bool SortedRangesHaveIntersection(InputIterator1 begin1, InputIterator1 end1, + InputIterator2 begin2, InputIterator2 end2, + Comp comparator) { + assert(std::is_sorted(begin1, end1, comparator)); + assert(std::is_sorted(begin2, end2, comparator)); + while (begin1 != end1 && begin2 != end2) { + if (comparator(*begin1, *begin2)) { + ++begin1; + } else if (comparator(*begin2, *begin1)) { + ++begin2; + } else { + return true; + } + } + return false; +} + +// release_ptr is intended to help remove systematic use of gscoped_ptr +// in cases like: +// +// vector v; +// ElementDeleter d(&v); +// ... { +// int remove_idx = f(v); +// gscoped_ptr t(v[remove_idx]); +// v[remove_idx] = NULL; // Save from deleter. +// return t.release(); +// } +// +// This would be replaced by: +// ... { +// int remove_idx = f(v); +// return release_ptr(&v[remove_idx]); +// } +template T* release_ptr(T **ptr) MUST_USE_RESULT; +template T* release_ptr(T **ptr) { + assert(ptr); + T *tmp = *ptr; + *ptr = NULL; + return tmp; +} + + +#endif // UTIL_GTL_STL_UTIL_H_ diff --git a/src/kudu/gutil/stringprintf.cc b/src/kudu/gutil/stringprintf.cc new file mode 100644 index 000000000000..112605cbebc8 --- /dev/null +++ b/src/kudu/gutil/stringprintf.cc @@ -0,0 +1,137 @@ +// Copyright 2002 and onwards Google Inc. 
+ +#include "kudu/gutil/stringprintf.h" + +#include +#include // For va_list and related operations +#include // MSVC requires this for _vsnprintf +#include +using std::vector; +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" + +#ifdef _MSC_VER +enum { IS__MSC_VER = 1 }; +#else +enum { IS__MSC_VER = 0 }; +#endif + +void StringAppendV(string* dst, const char* format, va_list ap) { + // First try with a small fixed size buffer + static const int kSpaceLength = 1024; + char space[kSpaceLength]; + + // It's possible for methods that use a va_list to invalidate + // the data in it upon use. The fix is to make a copy + // of the structure before using it and use that copy instead. + va_list backup_ap; + va_copy(backup_ap, ap); + int result = vsnprintf(space, kSpaceLength, format, backup_ap); + va_end(backup_ap); + + if (result < kSpaceLength) { + if (result >= 0) { + // Normal case -- everything fit. + dst->append(space, result); + return; + } + + if (IS__MSC_VER) { + // Error or MSVC running out of space. MSVC 8.0 and higher + // can be asked about space needed with the special idiom below: + va_copy(backup_ap, ap); + result = vsnprintf(nullptr, 0, format, backup_ap); + va_end(backup_ap); + } + + if (result < 0) { + // Just an error. + return; + } + } + + // Increase the buffer size to the size requested by vsnprintf, + // plus one for the closing \0. + int length = result+1; + auto buf = new char[length]; + + // Restore the va_list before we use it again + va_copy(backup_ap, ap); + result = vsnprintf(buf, length, format, backup_ap); + va_end(backup_ap); + + if (result >= 0 && result < length) { + // It fit + dst->append(buf, result); + } + delete[] buf; +} + + +string StringPrintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +const string& SStringPrintf(string* dst, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); + return *dst; +} + +void StringAppendF(string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); +} + +// Max arguments supported by StringPrintVector +const int kStringPrintfVectorMaxArgs = 32; + +// An empty block of zero for filler arguments. This is const so that if +// printf tries to write to it (via %n) then the program gets a SIGSEGV +// and we can fix the problem or protect against an attack. +static const char string_printf_empty_block[256] = { '\0' }; + +string StringPrintfVector(const char* format, const vector& v) { + CHECK_LE(v.size(), kStringPrintfVectorMaxArgs) + << "StringPrintfVector currently only supports up to " + << kStringPrintfVectorMaxArgs << " arguments. " + << "Feel free to add support for more if you need it."; + + // Add filler arguments so that bogus format+args have a harder time + // crashing the program, corrupting the program (%n), + // or displaying random chunks of memory to users. + + const char* cstr[kStringPrintfVectorMaxArgs]; + for (int i = 0; i < v.size(); ++i) { + cstr[i] = v[i].c_str(); + } + for (int i = v.size(); i < arraysize(cstr); ++i) { + cstr[i] = &string_printf_empty_block[0]; + } + + // I do not know any way to pass kStringPrintfVectorMaxArgs arguments, + // or any way to build a va_list by hand, or any API for printf + // that accepts an array of arguments. The best I can do is stick + // this COMPILE_ASSERT right next to the actual statement. 
+ + COMPILE_ASSERT(kStringPrintfVectorMaxArgs == 32, arg_count_mismatch); + return StringPrintf(format, + cstr[0], cstr[1], cstr[2], cstr[3], cstr[4], + cstr[5], cstr[6], cstr[7], cstr[8], cstr[9], + cstr[10], cstr[11], cstr[12], cstr[13], cstr[14], + cstr[15], cstr[16], cstr[17], cstr[18], cstr[19], + cstr[20], cstr[21], cstr[22], cstr[23], cstr[24], + cstr[25], cstr[26], cstr[27], cstr[28], cstr[29], + cstr[30], cstr[31]); +} diff --git a/src/kudu/gutil/stringprintf.h b/src/kudu/gutil/stringprintf.h new file mode 100644 index 000000000000..20835745c8e9 --- /dev/null +++ b/src/kudu/gutil/stringprintf.h @@ -0,0 +1,48 @@ +// Copyright 2002 and onwards Google Inc. +// +// Printf variants that place their output in a C++ string. +// +// Usage: +// string result = StringPrintf("%d %s\n", 10, "hello"); +// SStringPrintf(&result, "%d %s\n", 10, "hello"); +// StringAppendF(&result, "%d %s\n", 20, "there"); + +#ifndef _BASE_STRINGPRINTF_H +#define _BASE_STRINGPRINTF_H + +#include +#include +using std::string; +#include +using std::vector; + +#include "kudu/gutil/port.h" + +// Return a C++ string +extern string StringPrintf(const char* format, ...) + // Tell the compiler to do printf format string checking. + PRINTF_ATTRIBUTE(1,2); + +// Store result into a supplied string and return it +extern const string& SStringPrintf(string* dst, const char* format, ...) + // Tell the compiler to do printf format string checking. + PRINTF_ATTRIBUTE(2,3); + +// Append result to a supplied string +extern void StringAppendF(string* dst, const char* format, ...) + // Tell the compiler to do printf format string checking. + PRINTF_ATTRIBUTE(2,3); + +// Lower-level routine that takes a va_list and appends to a specified +// string. All other routines are just convenience wrappers around it. 
+extern void StringAppendV(string* dst, const char* format, va_list ap); + +// The max arguments supported by StringPrintfVector +extern const int kStringPrintfVectorMaxArgs; + +// You can use this version when all your arguments are strings, but +// you don't know how many arguments you'll have at compile time. +// StringPrintfVector will LOG(FATAL) if v.size() > kStringPrintfVectorMaxArgs +extern string StringPrintfVector(const char* format, const vector& v); + +#endif /* _BASE_STRINGPRINTF_H */ diff --git a/src/kudu/gutil/strings/ascii_ctype.cc b/src/kudu/gutil/strings/ascii_ctype.cc new file mode 100644 index 000000000000..50ea8c2f1ce4 --- /dev/null +++ b/src/kudu/gutil/strings/ascii_ctype.cc @@ -0,0 +1,110 @@ +// Copyright 2007 Google Inc. All Rights Reserved. +// +// These are a little ugly. +// The C++ style guide requires 80-column lines. +// cpplint.py requires 2-space indentation. + +#include "kudu/gutil/strings/ascii_ctype.h" + +// # Table generated by this Python code (bit 0x02 is currently unused): +// def Hex2(n): +// return '0x' + hex(n/16)[2:] + hex(n%16)[2:] +// def IsPunct(ch): +// return (ord(ch) >= 32 and ord(ch) < 127 and +// not ch.isspace() and not ch.isalnum()) +// def IsBlank(ch): +// return ch in ' \t' +// def IsCntrl(ch): +// return ord(ch) < 32 or ord(ch) == 127 +// def IsXDigit(ch): +// return ch.isdigit() or ch.lower() in 'abcdef' +// for i in range(128): +// ch = chr(i) +// mask = ((ch.isalpha() and 0x01 or 0) | +// (ch.isalnum() and 0x04 or 0) | +// (ch.isspace() and 0x08 or 0) | +// (IsPunct(ch) and 0x10 or 0) | +// (IsBlank(ch) and 0x20 or 0) | +// (IsCntrl(ch) and 0x40 or 0) | +// (IsXDigit(ch) and 0x80 or 0)) +// print Hex2(mask) + ',', +// if i % 16 == 7: +// print ' //', Hex2(i & 0x78) +// elif i % 16 == 15: +// print +const unsigned char kAsciiPropertyBits[256] = { + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00 + 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40, + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 
0x10 + 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, + 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20 + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30 + 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40 + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50 + 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60 + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70 + 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40, +}; + +const unsigned char kAsciiToLower[256] = { + 00, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 'a', 'b', 'c', 'd', 'e', + 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', + 'z', 91, 92, 93, 94, 95, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, + 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, + 250, 251, 252, 253, 254, 
255 +}; + +const unsigned char kAsciiToUpper[256] = { + 00, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, + 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, + 250, 251, 252, 253, 254, 255 +}; diff --git a/src/kudu/gutil/strings/ascii_ctype.h b/src/kudu/gutil/strings/ascii_ctype.h new file mode 100644 index 000000000000..aeaa94371b42 --- /dev/null +++ b/src/kudu/gutil/strings/ascii_ctype.h @@ -0,0 +1,75 @@ +// Copyright 2007 Google Inc. All Rights Reserved. +// +// Character classification functions similar to standard . +// Some C++ implementations provide locale-sensitive implementations +// of some functions. These ascii_* functions are +// hard-wired for ASCII. Hard-wired for ASCII is much faster. 
+// +// ascii_isalnum, ascii_isalpha, ascii_isascii, ascii_isblank, +// ascii_iscntrl, ascii_isdigit, ascii_isgraph, ascii_islower, +// ascii_isprint, ascii_ispunct, ascii_isspace, ascii_isupper, +// ascii_isxdigit +// Similar to the functions with similar names. +// Input parameter is an unsigned char. Return value is a bool. +// If the input has a numerical value greater than 127 +// then the output is "false". +// +// ascii_tolower, ascii_toupper +// Similar to the functions with similar names. +// Input parameter is an unsigned char. Return value is a char. +// If the input is not an ascii {lower,upper}-case letter +// (including numerical values greater than 127) +// then the output is the same as the input. + +#ifndef STRINGS_ASCII_CTYPE_H_ +#define STRINGS_ASCII_CTYPE_H_ + +// Array of character information. This is an implementation detail. +// The individual bits do not have names because the array definition is +// already tightly coupled to these functions. Names would just make it +// harder to read and debug. + +#define kApb kAsciiPropertyBits +extern const unsigned char kAsciiPropertyBits[256]; + +// Public functions. 
+ +static inline bool ascii_isalpha(unsigned char c) { return kApb[c] & 0x01; } +static inline bool ascii_isalnum(unsigned char c) { return kApb[c] & 0x04; } +static inline bool ascii_isspace(unsigned char c) { return kApb[c] & 0x08; } +static inline bool ascii_ispunct(unsigned char c) { return kApb[c] & 0x10; } +static inline bool ascii_isblank(unsigned char c) { return kApb[c] & 0x20; } +static inline bool ascii_iscntrl(unsigned char c) { return kApb[c] & 0x40; } +static inline bool ascii_isxdigit(unsigned char c) { return kApb[c] & 0x80; } + +static inline bool ascii_isdigit(unsigned char c) { + return c >= '0' && c <= '9'; +} + +static inline bool ascii_isprint(unsigned char c) { + return c >= 32 && c < 127; +} + +static inline bool ascii_isgraph(unsigned char c) { + return c > 32 && c < 127; +} + +static inline bool ascii_isupper(unsigned char c) { + return c >= 'A' && c <= 'Z'; +} + +static inline bool ascii_islower(unsigned char c) { + return c >= 'a' && c <= 'z'; +} + +static inline bool ascii_isascii(unsigned char c) { + return c < 128; +} +#undef kApb + +extern const unsigned char kAsciiToLower[256]; +static inline char ascii_tolower(unsigned char c) { return kAsciiToLower[c]; } +extern const unsigned char kAsciiToUpper[256]; +static inline char ascii_toupper(unsigned char c) { return kAsciiToUpper[c]; } + +#endif // STRINGS_ASCII_CTYPE_H_ diff --git a/src/kudu/gutil/strings/charset.cc b/src/kudu/gutil/strings/charset.cc new file mode 100644 index 000000000000..2e9d5b13536b --- /dev/null +++ b/src/kudu/gutil/strings/charset.cc @@ -0,0 +1,24 @@ +// Copyright 2008 Google Inc. All Rights Reserved. 
+ +#include "kudu/gutil/strings/charset.h" + +#include + +namespace strings { + +CharSet::CharSet() { + memset(this, 0, sizeof(*this)); +} + +CharSet::CharSet(const char* characters) { + memset(this, 0, sizeof(*this)); + for (; *characters != '\0'; ++characters) { + Add(*characters); + } +} + +CharSet::CharSet(const CharSet& other) { + memcpy(this, &other, sizeof(*this)); +} + +} // namespace strings diff --git a/src/kudu/gutil/strings/charset.h b/src/kudu/gutil/strings/charset.h new file mode 100644 index 000000000000..a2dbca455004 --- /dev/null +++ b/src/kudu/gutil/strings/charset.h @@ -0,0 +1,71 @@ +// Copyright 2008 Google Inc. All Rights Reserved. + +#ifndef STRINGS_CHARSET_H_ +#define STRINGS_CHARSET_H_ + +#include "kudu/gutil/integral_types.h" + +namespace strings { + +// A CharSet is a simple map from (1-byte) characters to Booleans. It simply +// exposes the mechanism of checking if a given character is in the set, fairly +// efficiently. Useful for string tokenizing routines. +// +// Run on asherah (2 X 2400 MHz CPUs); 2008/11/10-13:18:03 +// CPU: Intel Core2 (2 cores) dL1:32KB dL2:4096KB +// ***WARNING*** CPU scaling is enabled, the benchmark timings may be noisy, +// Benchmark Time(ns) CPU(ns) Iterations +// ------------------------------------------------------- +// BM_CharSetTesting/1K 21 21 32563138 +// BM_CharSetTesting/4K 21 21 31968433 +// BM_CharSetTesting/32K 21 21 32114953 +// BM_CharSetTesting/256K 22 22 31679082 +// BM_CharSetTesting/1M 21 21 32563138 +// +// This class is thread-compatible. +// +// This class has an implicit constructor. +// Style guide exception granted: +// http://goto/style-guide-exception-20978288 + +class CharSet { + public: + // Initialize a CharSet containing no characters or the given set of + // characters, respectively. + CharSet(); + // Deliberately an implicit constructor, so anything that takes a CharSet + // can also take an explicit list of characters. 
+ CharSet(const char* characters); // NOLINT(runtime/explicit) + explicit CharSet(const CharSet& other); + + // Add or remove a character from the set. + void Add(unsigned char c) { bits_[Word(c)] |= BitMask(c); } + void Remove(unsigned char c) { bits_[Word(c)] &= ~BitMask(c); } + + // Return true if this character is in the set + bool Test(unsigned char c) const { return bits_[Word(c)] & BitMask(c); } + + private: + // The numbers below are optimized for 64-bit hardware. TODO(user): In the + // future, we should change this to use uword_t and do various bits of magic + // to calculate the numbers at compile time. + + // In general, + // static const int kNumWords = max(32 / sizeof(uword_t), 1); + uint64 bits_[4]; + + // 4 words => the high 2 bits of c are the word number. In general, + // kShiftValue = 8 - log2(kNumWords) + static int Word(unsigned char c) { return c >> 6; } + + // And the value we AND with c is ((1 << shift value) - 1) + // static const int kLowBitsMask = (256 / kNumWords) - 1; + static uint64 BitMask(unsigned char c) { + uint64 mask = 1; + return mask << (c & 0x3f); + } +}; + +} // namespace strings + +#endif // STRINGS_CHARSET_H_ diff --git a/src/kudu/gutil/strings/escaping.cc b/src/kudu/gutil/strings/escaping.cc new file mode 100644 index 000000000000..cf4935972f3f --- /dev/null +++ b/src/kudu/gutil/strings/escaping.cc @@ -0,0 +1,2023 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// Authors: Numerous. See the .h for contact people. 
+ +#include "kudu/gutil/strings/escaping.h" + +#include +#include +#include + +#include +using std::numeric_limits; +#include +using std::vector; + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/utf/utf.h" // for runetochar +#include "kudu/gutil/charmap.h" +#include "kudu/gutil/stl_util.h" + +namespace strings { + +// These are used for the leave_nulls_escaped argument to CUnescapeInternal(). +static bool kUnescapeNulls = false; +static bool kLeaveNullsEscaped = true; + +// ---------------------------------------------------------------------- +// EscapeStrForCSV() +// Escapes the quotes in 'src' by doubling them. This is necessary +// for generating CSV files (see SplitCSVLine). +// Returns the number of characters written into dest (not counting +// the \0) or -1 if there was insufficient space. Dest could end up +// twice as long as src. +// +// Example: [some "string" to test] --> [some ""string"" to test] +// ---------------------------------------------------------------------- +int EscapeStrForCSV(const char* src, char* dest, int dest_len) { + int used = 0; + + while (true) { + if (*src == '\0' && used < dest_len) { + dest[used] = '\0'; + return used; + } + + if (used + 1 >= dest_len) // +1 because we might require two characters + return -1; + + if (*src == '"') + dest[used++] = '"'; + + dest[used++] = *src++; + } +} + +// ---------------------------------------------------------------------- +// UnescapeCEscapeSequences() +// This does all the unescaping that C does: \ooo, \r, \n, etc +// Returns length of resulting string. +// The implementation of \x parses any positive number of hex digits, +// but it is an error if the value requires more than 8 bits, and the +// result is truncated to 8 bits. The same is true for octals. +// +// The second call stores its errors in a supplied string vector. 
+// If the string vector pointer is NULL, it reports the errors with LOG(). +// +// *** DEPRECATED: Use CUnescape() in new code *** +// +// NOTE: any changes to this function must also be reflected in the newer +// CUnescape(). +// ---------------------------------------------------------------------- + +#define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7')) + +int UnescapeCEscapeSequences(const char* source, char* dest) { + return UnescapeCEscapeSequences(source, dest, nullptr); +} + +int UnescapeCEscapeSequences(const char* source, char* dest, + vector *errors) { + char* d = dest; + const char* p = source; + + // Small optimization for case where source = dest and there's no escaping + while ( p == d && *p != '\0' && *p != '\\' ) + p++, d++; + + while (*p != '\0') { + if (*p != '\\') { + *d++ = *p++; + } else { + switch ( *++p ) { // skip past the '\\' + case '\0': + LOG_STRING(ERROR, errors) << "String cannot end with \\"; + *d = '\0'; + return d - dest; // we're done with p + case 'a': *d++ = '\a'; break; + case 'b': *d++ = '\b'; break; + case 'f': *d++ = '\f'; break; + case 'n': *d++ = '\n'; break; + case 'r': *d++ = '\r'; break; + case 't': *d++ = '\t'; break; + case 'v': *d++ = '\v'; break; + case '\\': *d++ = '\\'; break; + case '?': *d++ = '\?'; break; // \? Who knew? 
+ case '\'': *d++ = '\''; break; + case '"': *d++ = '\"'; break; + case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits + case '4': case '5': case '6': case '7': { + const char *octal_start = p; + unsigned int ch = *p - '0'; + if ( IS_OCTAL_DIGIT(p[1]) ) + ch = ch * 8 + *++p - '0'; + if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice + ch = ch * 8 + *++p - '0'; // now points at last digit + if (ch > 0xFF) + LOG_STRING(ERROR, errors) << "Value of " << + "\\" << string(octal_start, p+1-octal_start) << + " exceeds 8 bits"; + *d++ = ch; + break; + } + case 'x': case 'X': { + if (!ascii_isxdigit(p[1])) { + if (p[1] == '\0') { + LOG_STRING(ERROR, errors) << "String cannot end with \\x"; + } else { + LOG_STRING(ERROR, errors) << + "\\x cannot be followed by a non-hex digit: \\" << *p << p[1]; + } + break; + } + unsigned int ch = 0; + const char *hex_start = p; + while (ascii_isxdigit(p[1])) // arbitrarily many hex digits + ch = (ch << 4) + hex_digit_to_int(*++p); + if (ch > 0xFF) + LOG_STRING(ERROR, errors) << "Value of " << + "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits"; + *d++ = ch; + break; + } + case 'u': { + // \uhhhh => convert 4 hex digits to UTF-8 + char32 rune = 0; + const char *hex_start = p; + for (int i = 0; i < 4; ++i) { + if (ascii_isxdigit(p[1])) { // Look one char ahead. + rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p. + } else { + LOG_STRING(ERROR, errors) + << "\\u must be followed by 4 hex digits: \\" + << string(hex_start, p+1-hex_start); + break; + } + } + d += runetochar(d, &rune); + break; + } + case 'U': { + // \Uhhhhhhhh => convert 8 hex digits to UTF-8 + char32 rune = 0; + const char *hex_start = p; + for (int i = 0; i < 8; ++i) { + if (ascii_isxdigit(p[1])) { // Look one char ahead. + // Don't change rune until we're sure this + // is within the Unicode limit, but do advance p. 
+ char32 newrune = (rune << 4) + hex_digit_to_int(*++p); + if (newrune > 0x10FFFF) { + LOG_STRING(ERROR, errors) + << "Value of \\" + << string(hex_start, p + 1 - hex_start) + << " exceeds Unicode limit (0x10FFFF)"; + break; + } else { + rune = newrune; + } + } else { + LOG_STRING(ERROR, errors) + << "\\U must be followed by 8 hex digits: \\" + << string(hex_start, p+1-hex_start); + break; + } + } + d += runetochar(d, &rune); + break; + } + default: + LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p; + } + p++; // read past letter we escaped + } + } + *d = '\0'; + return d - dest; +} + +// ---------------------------------------------------------------------- +// UnescapeCEscapeString() +// This does the same thing as UnescapeCEscapeSequences, but creates +// a new string. The caller does not need to worry about allocating +// a dest buffer. This should be used for non performance critical +// tasks such as printing debug messages. It is safe for src and dest +// to be the same. +// +// The second call stores its errors in a supplied string vector. +// If the string vector pointer is NULL, it reports the errors with LOG(). +// +// In the first and second calls, the length of dest is returned. In the +// the third call, the new string is returned. 
+// +// *** DEPRECATED: Use CUnescape() in new code *** +// +// ---------------------------------------------------------------------- +int UnescapeCEscapeString(const string& src, string* dest) { + return UnescapeCEscapeString(src, dest, nullptr); +} + +int UnescapeCEscapeString(const string& src, string* dest, + vector *errors) { + CHECK(dest); + dest->resize(src.size() + 1); + int len = UnescapeCEscapeSequences(src.c_str(), + const_cast(dest->data()), errors); + dest->resize(len); + return len; +} + +string UnescapeCEscapeString(const string& src) { + gscoped_array unescaped(new char[src.size() + 1]); + int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr); + return string(unescaped.get(), len); +} + +// ---------------------------------------------------------------------- +// CUnescapeInternal() +// Implements both CUnescape() and CUnescapeForNullTerminatedString(). +// +// Unescapes C escape sequences and is the reverse of CEscape(). +// +// If 'source' is valid, stores the unescaped string and its size in +// 'dest' and 'dest_len' respectively, and returns true. Otherwise +// returns false and optionally stores the error description in +// 'error'. Set 'error' to NULL to disable error reporting. +// +// 'dest' should point to a buffer that is at least as big as 'source'. +// 'source' and 'dest' may be the same. +// +// NOTE: any changes to this function must also be reflected in the older +// UnescapeCEscapeSequences(). 
+// ---------------------------------------------------------------------- +static bool CUnescapeInternal(const StringPiece& source, + bool leave_nulls_escaped, + char* dest, + int* dest_len, + string* error) { + char* d = dest; + const char* p = source.data(); + const char* end = source.end(); + const char* last_byte = end - 1; + + // Small optimization for case where source = dest and there's no escaping + while (p == d && p < end && *p != '\\') + p++, d++; + + while (p < end) { + if (*p != '\\') { + *d++ = *p++; + } else { + if (++p > last_byte) { // skip past the '\\' + if (error) *error = "String cannot end with \\"; + return false; + } + switch (*p) { + case 'a': *d++ = '\a'; break; + case 'b': *d++ = '\b'; break; + case 'f': *d++ = '\f'; break; + case 'n': *d++ = '\n'; break; + case 'r': *d++ = '\r'; break; + case 't': *d++ = '\t'; break; + case 'v': *d++ = '\v'; break; + case '\\': *d++ = '\\'; break; + case '?': *d++ = '\?'; break; // \? Who knew? + case '\'': *d++ = '\''; break; + case '"': *d++ = '\"'; break; + case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits + case '4': case '5': case '6': case '7': { + const char *octal_start = p; + unsigned int ch = *p - '0'; + if (p < last_byte && IS_OCTAL_DIGIT(p[1])) + ch = ch * 8 + *++p - '0'; + if (p < last_byte && IS_OCTAL_DIGIT(p[1])) + ch = ch * 8 + *++p - '0'; // now points at last digit + if (ch > 0xff) { + if (error) { + *error = "Value of \\" + + string(octal_start, p + 1 - octal_start) + + " exceeds 0xff"; + } + return false; + } + if ((ch == 0) && leave_nulls_escaped) { + // Copy the escape sequence for the null character + const int octal_size = p + 1 - octal_start; + *d++ = '\\'; + memcpy(d, octal_start, octal_size); + d += octal_size; + break; + } + *d++ = ch; + break; + } + case 'x': case 'X': { + if (p >= last_byte) { + if (error) *error = "String cannot end with \\x"; + return false; + } else if (!ascii_isxdigit(p[1])) { + if (error) *error = "\\x cannot be followed by a 
non-hex digit"; + return false; + } + unsigned int ch = 0; + const char *hex_start = p; + while (p < last_byte && ascii_isxdigit(p[1])) + // Arbitrarily many hex digits + ch = (ch << 4) + hex_digit_to_int(*++p); + if (ch > 0xFF) { + if (error) { + *error = "Value of \\" + string(hex_start, p + 1 - hex_start) + + " exceeds 0xff"; + } + return false; + } + if ((ch == 0) && leave_nulls_escaped) { + // Copy the escape sequence for the null character + const int hex_size = p + 1 - hex_start; + *d++ = '\\'; + memcpy(d, hex_start, hex_size); + d += hex_size; + break; + } + *d++ = ch; + break; + } + case 'u': { + // \uhhhh => convert 4 hex digits to UTF-8 + char32 rune = 0; + const char *hex_start = p; + if (p + 4 >= end) { + if (error) { + *error = "\\u must be followed by 4 hex digits: \\" + + string(hex_start, p + 1 - hex_start); + } + return false; + } + for (int i = 0; i < 4; ++i) { + // Look one char ahead. + if (ascii_isxdigit(p[1])) { + rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p. + } else { + if (error) { + *error = "\\u must be followed by 4 hex digits: \\" + + string(hex_start, p + 1 - hex_start); + } + return false; + } + } + if ((rune == 0) && leave_nulls_escaped) { + // Copy the escape sequence for the null character + *d++ = '\\'; + memcpy(d, hex_start, 5); // u0000 + d += 5; + break; + } + d += runetochar(d, &rune); + break; + } + case 'U': { + // \Uhhhhhhhh => convert 8 hex digits to UTF-8 + char32 rune = 0; + const char *hex_start = p; + if (p + 8 >= end) { + if (error) { + *error = "\\U must be followed by 8 hex digits: \\" + + string(hex_start, p + 1 - hex_start); + } + return false; + } + for (int i = 0; i < 8; ++i) { + // Look one char ahead. + if (ascii_isxdigit(p[1])) { + // Don't change rune until we're sure this + // is within the Unicode limit, but do advance p. 
+ char32 newrune = (rune << 4) + hex_digit_to_int(*++p); + if (newrune > 0x10FFFF) { + if (error) { + *error = "Value of \\" + + string(hex_start, p + 1 - hex_start) + + " exceeds Unicode limit (0x10FFFF)"; + } + return false; + } else { + rune = newrune; + } + } else { + if (error) { + *error = "\\U must be followed by 8 hex digits: \\" + + string(hex_start, p + 1 - hex_start); + } + return false; + } + } + if ((rune == 0) && leave_nulls_escaped) { + // Copy the escape sequence for the null character + *d++ = '\\'; + memcpy(d, hex_start, 9); // U00000000 + d += 9; + break; + } + d += runetochar(d, &rune); + break; + } + default: { + if (error) *error = string("Unknown escape sequence: \\") + *p; + return false; + } + } + p++; // read past letter we escaped + } + } + *dest_len = d - dest; + return true; +} + +// ---------------------------------------------------------------------- +// CUnescapeInternal() +// +// Same as above but uses a C++ string for output. 'source' and 'dest' +// may be the same. +// ---------------------------------------------------------------------- +bool CUnescapeInternal(const StringPiece& source, + bool leave_nulls_escaped, + string* dest, + string* error) { + dest->resize(source.size()); + int dest_size; + if (!CUnescapeInternal(source, + leave_nulls_escaped, + const_cast(dest->data()), + &dest_size, + error)) { + return false; + } + dest->resize(dest_size); + return true; +} + +// ---------------------------------------------------------------------- +// CUnescape() +// +// See CUnescapeInternal() for implementation details. 
+// ---------------------------------------------------------------------- +bool CUnescape(const StringPiece& source, char* dest, int* dest_len, + string* error) { + return CUnescapeInternal(source, kUnescapeNulls, dest, dest_len, error); +} + +bool CUnescape(const StringPiece& source, string* dest, string* error) { + return CUnescapeInternal(source, kUnescapeNulls, dest, error); +} + +// ---------------------------------------------------------------------- +// CUnescapeForNullTerminatedString() +// +// See CUnescapeInternal() for implementation details. +// ---------------------------------------------------------------------- +bool CUnescapeForNullTerminatedString(const StringPiece& source, + char* dest, + int* dest_len, + string* error) { + return CUnescapeInternal(source, kLeaveNullsEscaped, dest, dest_len, error); +} + +bool CUnescapeForNullTerminatedString(const StringPiece& source, + string* dest, + string* error) { + return CUnescapeInternal(source, kLeaveNullsEscaped, dest, error); +} + +// ---------------------------------------------------------------------- +// CEscapeString() +// CHexEscapeString() +// Utf8SafeCEscapeString() +// Utf8SafeCHexEscapeString() +// Copies 'src' to 'dest', escaping dangerous characters using +// C-style escape sequences. This is very useful for preparing query +// flags. 'src' and 'dest' should not overlap. The 'Hex' version uses +// hexadecimal rather than octal sequences. The 'Utf8Safe' version doesn't +// touch UTF-8 bytes. +// Returns the number of bytes written to 'dest' (not including the \0) +// or -1 if there was insufficient space. +// +// Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped. 
+// ---------------------------------------------------------------------- +int CEscapeInternal(const char* src, int src_len, char* dest, + int dest_len, bool use_hex, bool utf8_safe) { + const char* src_end = src + src_len; + int used = 0; + bool last_hex_escape = false; // true if last output char was \xNN + + for (; src < src_end; src++) { + if (dest_len - used < 2) // Need space for two letter escape + return -1; + + bool is_hex_escape = false; + switch (*src) { + case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break; + case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break; + case '\t': dest[used++] = '\\'; dest[used++] = 't'; break; + case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break; + case '\'': dest[used++] = '\\'; dest[used++] = '\''; break; + case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break; + default: + // Note that if we emit \xNN and the src character after that is a hex + // digit then that digit must be escaped too to prevent it being + // interpreted as part of the character code by C. + if ((!utf8_safe || *src < 0x80) && + (!ascii_isprint(*src) || + (last_hex_escape && ascii_isxdigit(*src)))) { + if (dest_len - used < 4) // need space for 4 letter escape + return -1; + sprintf(dest + used, (use_hex ? 
"\\x%02x" : "\\%03o"), *src); + is_hex_escape = use_hex; + used += 4; + } else { + dest[used++] = *src; + break; + } + } + last_hex_escape = is_hex_escape; + } + + if (dest_len - used < 1) // make sure that there is room for \0 + return -1; + + dest[used] = '\0'; // doesn't count towards return value though + return used; +} + +int CEscapeString(const char* src, int src_len, char* dest, int dest_len) { + return CEscapeInternal(src, src_len, dest, dest_len, false, false); +} + +int CHexEscapeString(const char* src, int src_len, char* dest, int dest_len) { + return CEscapeInternal(src, src_len, dest, dest_len, true, false); +} + +int Utf8SafeCEscapeString(const char* src, int src_len, char* dest, + int dest_len) { + return CEscapeInternal(src, src_len, dest, dest_len, false, true); +} + +int Utf8SafeCHexEscapeString(const char* src, int src_len, char* dest, + int dest_len) { + return CEscapeInternal(src, src_len, dest, dest_len, true, true); +} + +// ---------------------------------------------------------------------- +// CEscape() +// CHexEscape() +// Utf8SafeCEscape() +// Utf8SafeCHexEscape() +// Copies 'src' to result, escaping dangerous characters using +// C-style escape sequences. This is very useful for preparing query +// flags. 'src' and 'dest' should not overlap. The 'Hex' version +// hexadecimal rather than octal sequences. The 'Utf8Safe' version +// doesn't touch UTF-8 bytes. +// +// Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped. 
+// ---------------------------------------------------------------------- +string CEscape(const StringPiece& src) { + const int dest_length = src.size() * 4 + 1; // Maximum possible expansion + gscoped_array dest(new char[dest_length]); + const int len = CEscapeInternal(src.data(), src.size(), + dest.get(), dest_length, false, false); + DCHECK_GE(len, 0); + return string(dest.get(), len); +} + +string CHexEscape(const StringPiece& src) { + const int dest_length = src.size() * 4 + 1; // Maximum possible expansion + gscoped_array dest(new char[dest_length]); + const int len = CEscapeInternal(src.data(), src.size(), + dest.get(), dest_length, true, false); + DCHECK_GE(len, 0); + return string(dest.get(), len); +} + +string Utf8SafeCEscape(const StringPiece& src) { + const int dest_length = src.size() * 4 + 1; // Maximum possible expansion + gscoped_array dest(new char[dest_length]); + const int len = CEscapeInternal(src.data(), src.size(), + dest.get(), dest_length, false, true); + DCHECK_GE(len, 0); + return string(dest.get(), len); +} + +string Utf8SafeCHexEscape(const StringPiece& src) { + const int dest_length = src.size() * 4 + 1; // Maximum possible expansion + gscoped_array dest(new char[dest_length]); + const int len = CEscapeInternal(src.data(), src.size(), + dest.get(), dest_length, true, true); + DCHECK_GE(len, 0); + return string(dest.get(), len); +} + +// ---------------------------------------------------------------------- +// BackslashEscape and BackslashUnescape +// ---------------------------------------------------------------------- +void BackslashEscape(const StringPiece& src, + const strings::CharSet& to_escape, + string* dest) { + dest->reserve(dest->size() + src.size()); + for (const char *p = src.data(), *end = src.data() + src.size(); + p != end; ) { + // Advance to next character we need to escape, or to end of source + const char* next = p; + while (next < end && !to_escape.Test(*next)) { + next++; + } + // Append the whole run of 
non-escaped chars + dest->append(p, next - p); + if (next == end) break; + // Char at *next needs to be escaped. Append backslash followed by *next + char c[2]; + c[0] = '\\'; + c[1] = *next; + dest->append(c, 2); + p = next + 1; + } +} + +void BackslashUnescape(const StringPiece& src, + const strings::CharSet& to_unescape, + string* dest) { + dest->reserve(dest->size() + src.size()); + bool escaped = false; + for (const char* p = src.data(), *end = src.data() + src.size(); + p != end; ++p) { + if (escaped) { + if (!to_unescape.Test(*p)) { + // Keep the backslash + dest->push_back('\\'); + } + dest->push_back(*p); + escaped = false; + } else if (*p == '\\') { + escaped = true; + } else { + dest->push_back(*p); + } + } +} + +// ---------------------------------------------------------------------- +// int QuotedPrintableUnescape() +// +// Check out http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for +// more details, only briefly implemented. But from the web... +// Quoted-printable is an encoding method defined in the MIME +// standard. It is used primarily to encode 8-bit text (such as text +// that includes foreign characters) into 7-bit US ASCII, creating a +// document that is mostly readable by humans, even in its encoded +// form. All MIME compliant applications can decode quoted-printable +// text, though they may not necessarily be able to properly display the +// document as it was originally intended. As quoted-printable encoding +// is implemented most commonly, printable ASCII characters (values 33 +// through 126, excluding 61), tabs and spaces that do not appear at the +// end of lines, and end-of-line characters are not encoded. Other +// characters are represented by an equal sign (=) immediately followed +// by that character's hexadecimal value. Lines that are longer than 76 +// characters are shortened by line breaks, with the equal sign marking +// where the breaks occurred. 
+// +// Note that QuotedPrintableUnescape is different from 'Q'-encoding as +// defined in rfc2047. In particular, This does not treat '_'s as spaces. +// See QEncodingUnescape(). +// ---------------------------------------------------------------------- + +int QuotedPrintableUnescape(const char *source, int slen, + char *dest, int szdest) { + char* d = dest; + const char* p = source; + + while ( p < source+slen && *p != '\0' && d < dest+szdest ) { + switch (*p) { + case '=': + // If it's valid, convert to hex and insert or remove line-wrap. + // In the case of line-wrap removal, we allow LF as well as CRLF. + if ( p < source + slen - 1 ) { + if ( p[1] == '\n' ) { + p++; + } else if ( p < source + slen - 2 ) { + if ( ascii_isxdigit(p[1]) && ascii_isxdigit(p[2]) ) { + *d++ = hex_digit_to_int(p[1])*16 + hex_digit_to_int(p[2]); + p += 2; + } else if ( p[1] == '\r' && p[2] == '\n' ) { + p += 2; + } + } + } + p++; + break; + default: + *d++ = *p++; + break; + } + } + return (d-dest); +} + +// ---------------------------------------------------------------------- +// int QEncodingUnescape() +// +// This is very similar to QuotedPrintableUnescape except that we convert +// '_'s into spaces. (See RFC 2047) +// ---------------------------------------------------------------------- +int QEncodingUnescape(const char *source, int slen, + char *dest, int szdest) { + char* d = dest; + const char* p = source; + + while ( p < source+slen && *p != '\0' && d < dest+szdest ) { + switch (*p) { + case '=': + // If it's valid, convert to hex and insert or remove line-wrap. + // In the case of line-wrap removal, the assumption is that this + // is an RFC-compliant message with lines terminated by CRLF. 
+ if (p < source+slen-2) { + if ( ascii_isxdigit(p[1]) && ascii_isxdigit(p[2]) ) { + *d++ = hex_digit_to_int(p[1])*16 + hex_digit_to_int(p[2]); + p += 2; + } else if ( p[1] == '\r' && p[2] == '\n' ) { + p += 2; + } + } + p++; + break; + case '_': // According to rfc2047, _'s are to be treated as spaces + *d++ = ' '; + p++; + break; + default: + *d++ = *p++; + break; + } + } + return (d-dest); +} + +int CalculateBase64EscapedLen(int input_len, bool do_padding) { + // Base64 encodes three bytes of input at a time. If the input is not + // divisible by three, we pad as appropriate. + // + // (from http://www.ietf.org/rfc/rfc3548.txt) + // Special processing is performed if fewer than 24 bits are available + // at the end of the data being encoded. A full encoding quantum is + // always completed at the end of a quantity. When fewer than 24 input + // bits are available in an input group, zero bits are added (on the + // right) to form an integral number of 6-bit groups. Padding at the + // end of the data is performed using the '=' character. Since all base + // 64 input is an integral number of octets, only the following cases + // can arise: + + + // Base64 encodes each three bytes of input into four bytes of output. 
+ int len = (input_len / 3) * 4; + + if (input_len % 3 == 0) { + // (from http://www.ietf.org/rfc/rfc3548.txt) + // (1) the final quantum of encoding input is an integral multiple of 24 + // bits; here, the final unit of encoded output will be an integral + // multiple of 4 characters with no "=" padding, + } else if (input_len % 3 == 1) { + // (from http://www.ietf.org/rfc/rfc3548.txt) + // (2) the final quantum of encoding input is exactly 8 bits; here, the + // final unit of encoded output will be two characters followed by two + // "=" padding characters, or + len += 2; + if (do_padding) { + len += 2; + } + } else { // (input_len % 3 == 2) + // (from http://www.ietf.org/rfc/rfc3548.txt) + // (3) the final quantum of encoding input is exactly 16 bits; here, the + // final unit of encoded output will be three characters followed by one + // "=" padding character. + len += 3; + if (do_padding) { + len += 1; + } + } + + assert(len >= input_len); // make sure we didn't overflow + return len; +} + +// Base64Escape does padding, so this calculation includes padding. +int CalculateBase64EscapedLen(int input_len) { + return CalculateBase64EscapedLen(input_len, true); +} + +// ---------------------------------------------------------------------- +// int Base64Unescape() - base64 decoder +// int Base64Escape() - base64 encoder +// int WebSafeBase64Unescape() - Google's variation of base64 decoder +// int WebSafeBase64Escape() - Google's variation of base64 encoder +// +// Check out +// http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for formal +// description, but what we care about is that... 
+// Take the encoded stuff in groups of 4 characters and turn each +// character into a code 0 to 63 thus: +// A-Z map to 0 to 25 +// a-z map to 26 to 51 +// 0-9 map to 52 to 61 +// +(- for WebSafe) maps to 62 +// /(_ for WebSafe) maps to 63 +// There will be four numbers, all less than 64 which can be represented +// by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively). +// Arrange the 6 digit binary numbers into three bytes as such: +// aaaaaabb bbbbcccc ccdddddd +// Equals signs (one or two) are used at the end of the encoded block to +// indicate that the text was not an integer multiple of three bytes long. +// In the sorted variation, we instead use the mapping +// . maps to 0 +// 0-9 map to 1-10 +// A-Z map to 11-37 +// _ maps to 38 +// a-z map to 39-63 +// This mapping has the property that the output will be sorted in the same +// order as the input, i.e. a < b iff map(a) < map(b). It is web-safe and +// filename-safe. +// ---------------------------------------------------------------------- + +int Base64UnescapeInternal(const char *src, int szsrc, + char *dest, int szdest, + const signed char* unbase64) { + static const char kPad64 = '='; + + int decode = 0; + int destidx = 0; + int state = 0; + unsigned int ch = 0; + unsigned int temp = 0; + + // The GET_INPUT macro gets the next input character, skipping + // over any whitespace, and stopping when we reach the end of the + // string or when we read any non-data character. The arguments are + // an arbitrary identifier (used as a label for goto) and the number + // of data bytes that must remain in the input to avoid aborting the + // loop. +#define GET_INPUT(label, remain) \ + label: \ + --szsrc; \ + ch = *src++; \ + decode = unbase64[ch]; \ + if (decode < 0) { \ + if (ascii_isspace(ch) && szsrc >= remain) \ + goto label; \ + state = 4 - remain; \ + break; \ + } + + // if dest is null, we're just checking to see if it's legal input + // rather than producing output. 
(I suspect this could just be done + // with a regexp...). We duplicate the loop so this test can be + // outside it instead of in every iteration. + + if (dest) { + // This loop consumes 4 input bytes and produces 3 output bytes + // per iteration. We can't know at the start that there is enough + // data left in the string for a full iteration, so the loop may + // break out in the middle; if so 'state' will be set to the + // number of input bytes read. + + while (szsrc >= 4) { + // We'll start by optimistically assuming that the next four + // bytes of the string (src[0..3]) are four good data bytes + // (that is, no nulls, whitespace, padding chars, or illegal + // chars). We need to test src[0..2] for nulls individually + // before constructing temp to preserve the property that we + // never read past a null in the string (no matter how long + // szsrc claims the string is). + + if (!src[0] || !src[1] || !src[2] || + (temp = ((unbase64[src[0]] << 18) | + (unbase64[src[1]] << 12) | + (unbase64[src[2]] << 6) | + (unbase64[src[3]]))) & 0x80000000) { + // Iff any of those four characters was bad (null, illegal, + // whitespace, padding), then temp's high bit will be set + // (because unbase64[] is -1 for all bad characters). + // + // We'll back up and resort to the slower decoder, which knows + // how to handle those cases. + + GET_INPUT(first, 4); + temp = decode; + GET_INPUT(second, 3); + temp = (temp << 6) | decode; + GET_INPUT(third, 2); + temp = (temp << 6) | decode; + GET_INPUT(fourth, 1); + temp = (temp << 6) | decode; + } else { + // We really did have four good data bytes, so advance four + // characters in the string. + + szsrc -= 4; + src += 4; + decode = -1; + ch = '\0'; + } + + // temp has 24 bits of input, so write that out as three bytes. 
+ + if (destidx+3 > szdest) return -1; + dest[destidx+2] = temp; + temp >>= 8; + dest[destidx+1] = temp; + temp >>= 8; + dest[destidx] = temp; + destidx += 3; + } + } else { + while (szsrc >= 4) { + if (!src[0] || !src[1] || !src[2] || + (temp = ((unbase64[src[0]] << 18) | + (unbase64[src[1]] << 12) | + (unbase64[src[2]] << 6) | + (unbase64[src[3]]))) & 0x80000000) { + GET_INPUT(first_no_dest, 4); + GET_INPUT(second_no_dest, 3); + GET_INPUT(third_no_dest, 2); + GET_INPUT(fourth_no_dest, 1); + } else { + szsrc -= 4; + src += 4; + decode = -1; + ch = '\0'; + } + destidx += 3; + } + } + +#undef GET_INPUT + + // if the loop terminated because we read a bad character, return + // now. + if (decode < 0 && ch != '\0' && ch != kPad64 && !ascii_isspace(ch)) + return -1; + + if (ch == kPad64) { + // if we stopped by hitting an '=', un-read that character -- we'll + // look at it again when we count to check for the proper number of + // equals signs at the end. + ++szsrc; + --src; + } else { + // This loop consumes 1 input byte per iteration. It's used to + // clean up the 0-3 input bytes remaining when the first, faster + // loop finishes. 'temp' contains the data from 'state' input + // characters read by the first loop. + while (szsrc > 0) { + --szsrc; + ch = *src++; + decode = unbase64[ch]; + if (decode < 0) { + if (ascii_isspace(ch)) { + continue; + } else if (ch == '\0') { + break; + } else if (ch == kPad64) { + // back up one character; we'll read it again when we check + // for the correct number of equals signs at the end. + ++szsrc; + --src; + break; + } else { + return -1; + } + } + + // Each input character gives us six bits of output. + temp = (temp << 6) | decode; + ++state; + if (state == 4) { + // If we've accumulated 24 bits of output, write that out as + // three bytes. 
+ if (dest) { + if (destidx+3 > szdest) return -1; + dest[destidx+2] = temp; + temp >>= 8; + dest[destidx+1] = temp; + temp >>= 8; + dest[destidx] = temp; + } + destidx += 3; + state = 0; + temp = 0; + } + } + } + + // Process the leftover data contained in 'temp' at the end of the input. + int expected_equals = 0; + switch (state) { + case 0: + // Nothing left over; output is a multiple of 3 bytes. + break; + + case 1: + // Bad input; we have 6 bits left over. + return -1; + + case 2: + // Produce one more output byte from the 12 input bits we have left. + if (dest) { + if (destidx+1 > szdest) return -1; + temp >>= 4; + dest[destidx] = temp; + } + ++destidx; + expected_equals = 2; + break; + + case 3: + // Produce two more output bytes from the 18 input bits we have left. + if (dest) { + if (destidx+2 > szdest) return -1; + temp >>= 2; + dest[destidx+1] = temp; + temp >>= 8; + dest[destidx] = temp; + } + destidx += 2; + expected_equals = 1; + break; + + default: + // state should have no other values at this point. + LOG(FATAL) << "This can't happen; base64 decoder state = " << state; + } + + // The remainder of the string should be all whitespace, mixed with + // exactly 0 equals signs, or exactly 'expected_equals' equals + // signs. (Always accepting 0 equals signs is a google extension + // not covered in the RFC.) + + int equals = 0; + while (szsrc > 0 && *src) { + if (*src == kPad64) + ++equals; + else if (!ascii_isspace(*src)) + return -1; + --szsrc; + ++src; + } + + return (equals == 0 || equals == expected_equals) ? 
destidx : -1; +} + +// The arrays below were generated by the following code +// #include +// #include +// #include +// main() +// { +// static const char Base64[] = +// "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +// char *pos; +// int idx, i, j; +// printf(" "); +// for (i = 0; i < 255; i += 8) { +// for (j = i; j < i + 8; j++) { +// pos = strchr(Base64, j); +// if ((pos == NULL) || (j == 0)) +// idx = -1; +// else +// idx = pos - Base64; +// if (idx == -1) +// printf(" %2d, ", idx); +// else +// printf(" %2d/*%c*/,", idx, j); +// } +// printf("\n "); +// } +// } +// +// where the value of "Base64[]" was replaced by one of the base-64 conversion +// tables from the functions below. +static const signed char kUnBase64[] = { + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */, + 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/, + 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1, + -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/, + 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/, + 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/, + 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1, + -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/, + 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/, + 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/, + 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 
-1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1 +}; +static const signed char kUnWebSafeBase64[] = { + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, 62/*-*/, -1, -1, + 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/, + 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1, + -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/, + 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/, + 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/, + 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/, + -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/, + 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/, + 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/, + 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1 +}; + +int Base64Unescape(const char *src, int szsrc, char *dest, int szdest) { + return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnBase64); +} + +int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) { + return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64); +} + +static bool Base64UnescapeInternal(const char* src, int slen, string* 
dest, + const signed char* unbase64) { + // Determine the size of the output string. Base64 encodes every 3 bytes into + // 4 characters. any leftover chars are added directly for good measure. + // This is documented in the base64 RFC: http://www.ietf.org/rfc/rfc3548.txt + const int dest_len = 3 * (slen / 4) + (slen % 4); + + dest->clear(); + dest->resize(dest_len); + + // We are getting the destination buffer by getting the beginning of the + // string and converting it into a char *. + const int len = Base64UnescapeInternal(src, slen, string_as_array(dest), + dest->size(), unbase64); + if (len < 0) { + dest->clear(); + return false; + } + + // could be shorter if there was padding + DCHECK_LE(len, dest_len); + dest->resize(len); + + return true; +} + +bool Base64Unescape(const char *src, int slen, string* dest) { + return Base64UnescapeInternal(src, slen, dest, kUnBase64); +} + +bool WebSafeBase64Unescape(const char *src, int slen, string* dest) { + return Base64UnescapeInternal(src, slen, dest, kUnWebSafeBase64); +} + +int Base64EscapeInternal(const unsigned char *src, int szsrc, + char *dest, int szdest, const char *base64, + bool do_padding) { + static const char kPad64 = '='; + + if (szsrc <= 0) return 0; + + char *cur_dest = dest; + const unsigned char *cur_src = src; + + // Three bytes of data encodes to four characters of cyphertext. + // So we can pump through three-byte chunks atomically. + while (szsrc > 2) { /* keep going until we have less than 24 bits */ + if ((szdest -= 4) < 0) return 0; + cur_dest[0] = base64[cur_src[0] >> 2]; + cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)]; + cur_dest[2] = base64[((cur_src[1] & 0x0f) << 2) + (cur_src[2] >> 6)]; + cur_dest[3] = base64[cur_src[2] & 0x3f]; + + cur_dest += 4; + cur_src += 3; + szsrc -= 3; + } + + /* now deal with the tail (<=2 bytes) */ + switch (szsrc) { + case 0: + // Nothing left; nothing more to do. 
+ break; + case 1: + // One byte left: this encodes to two characters, and (optionally) + // two pad characters to round out the four-character cypherblock. + if ((szdest -= 2) < 0) return 0; + cur_dest[0] = base64[cur_src[0] >> 2]; + cur_dest[1] = base64[(cur_src[0] & 0x03) << 4]; + cur_dest += 2; + if (do_padding) { + if ((szdest -= 2) < 0) return 0; + cur_dest[0] = kPad64; + cur_dest[1] = kPad64; + cur_dest += 2; + } + break; + case 2: + // Two bytes left: this encodes to three characters, and (optionally) + // one pad character to round out the four-character cypherblock. + if ((szdest -= 3) < 0) return 0; + cur_dest[0] = base64[cur_src[0] >> 2]; + cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)]; + cur_dest[2] = base64[(cur_src[1] & 0x0f) << 2]; + cur_dest += 3; + if (do_padding) { + if ((szdest -= 1) < 0) return 0; + cur_dest[0] = kPad64; + cur_dest += 1; + } + break; + default: + // Should not be reached: blocks of 3 bytes are handled + // in the while loop before this switch statement. + LOG_ASSERT(false) << "Logic problem? 
szsrc = " << szsrc; + break; + } + return (cur_dest - dest); +} + +static const char kBase64Chars[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static const char kWebSafeBase64Chars[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; + +int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) { + return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true); +} +int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest, + int szdest, bool do_padding) { + return Base64EscapeInternal(src, szsrc, dest, szdest, + kWebSafeBase64Chars, do_padding); +} + +void Base64EscapeInternal(const unsigned char* src, int szsrc, + string* dest, bool do_padding, + const char* base64_chars) { + const int calc_escaped_size = + CalculateBase64EscapedLen(szsrc, do_padding); + dest->clear(); + dest->resize(calc_escaped_size, '\0'); + const int escaped_len = Base64EscapeInternal(src, szsrc, + string_as_array(dest), + dest->size(), + base64_chars, + do_padding); + DCHECK_EQ(calc_escaped_size, escaped_len); +} + +void Base64Escape(const unsigned char *src, int szsrc, + string* dest, bool do_padding) { + Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars); +} + +void WebSafeBase64Escape(const unsigned char *src, int szsrc, + string *dest, bool do_padding) { + Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars); +} + +void Base64Escape(const string& src, string* dest) { + Base64Escape(reinterpret_cast(src.data()), + src.size(), dest, true); +} + +void WebSafeBase64Escape(const string& src, string* dest) { + WebSafeBase64Escape(reinterpret_cast(src.data()), + src.size(), dest, false); +} + +void WebSafeBase64EscapeWithPadding(const string& src, string* dest) { + WebSafeBase64Escape(reinterpret_cast(src.data()), + src.size(), dest, true); +} + +// Returns true iff c is in the Base 32 alphabet. 
+bool ValidBase32Byte(char c) { + return (c >= 'A' && c <= 'Z') || (c >= '2' && c <= '7') || c == '='; +} + +// Mapping from number of Base32 escaped characters (0 through 8) to number of +// unescaped bytes. 8 Base32 escaped characters represent 5 unescaped bytes. +// For N < 8, then number of unescaped bytes is less than 5. Note that in +// valid input, N can only be 0, 2, 4, 5, 7, or 8 (corresponding to 0, 1, 2, +// 3, 4, or 5 unescaped bytes). +// +// We use 5 for invalid values of N to be safe, since this is used to compute +// the length of the buffer to hold unescaped data. +// +// See http://tools.ietf.org/html/rfc4648#section-6 for details. +static const int kBase32NumUnescapedBytes[] = { + 0, 5, 1, 5, 2, 3, 5, 4, 5 +}; + +int Base32Unescape(const char* src, int slen, char* dest, int szdest) { + int destidx = 0; + char escaped_bytes[8]; + unsigned char unescaped_bytes[5]; + while (slen > 0) { + // Collect the next 8 escaped bytes and convert to upper case. If there + // are less than 8 bytes left, pad with '=', but keep track of the number + // of non-padded bytes for later. + int non_padded_len = 8; + for (int i = 0; i < 8; ++i) { + escaped_bytes[i] = (i < slen) ? ascii_toupper(src[i]) : '='; + if (!ValidBase32Byte(escaped_bytes[i])) { + return -1; + } + // Stop counting escaped bytes at first '='. + if (escaped_bytes[i] == '=' && non_padded_len == 8) { + non_padded_len = i; + } + } + + // Convert the 8 escaped bytes to 5 unescaped bytes and copy to dest. + EightBase32DigitsToFiveBytes(escaped_bytes, unescaped_bytes); + const int num_unescaped = kBase32NumUnescapedBytes[non_padded_len]; + for (int i = 0; i < num_unescaped; ++i) { + if (destidx == szdest) { + // No more room in dest, so terminate early. + return -1; + } + dest[destidx] = unescaped_bytes[i]; + ++destidx; + } + src += 8; + slen -= 8; + } + return destidx; +} + +bool Base32Unescape(const char* src, int slen, string* dest) { + // Determine the size of the output string. 
+ const int dest_len = 5 * (slen / 8) + kBase32NumUnescapedBytes[slen % 8]; + + dest->clear(); + dest->resize(dest_len); + + // We are getting the destination buffer by getting the beginning of the + // string and converting it into a char *. + const int len = Base32Unescape(src, slen, + string_as_array(dest), dest->size()); + if (len < 0) { + dest->clear(); + return false; + } + + // Could be shorter if there was padding. + DCHECK_LE(len, dest_len); + dest->resize(len); + + return true; +} + +void GeneralFiveBytesToEightBase32Digits(const unsigned char *in_bytes, + char *out, const char *alphabet) { + // It's easier to just hard code this. + // The conversion isbased on the following picture of the division of a + // 40-bit block into 8 5-byte words: + // + // 5 3 2 5 1 4 4 1 5 2 3 5 + // |:::::::|:::::::|:::::::|:::::::|::::::: + // +----+----+----+----+----+----+----+---- + // + out[0] = alphabet[in_bytes[0] >> 3]; + out[1] = alphabet[(in_bytes[0] & 0x07) << 2 | in_bytes[1] >> 6]; + out[2] = alphabet[(in_bytes[1] & 0x3E) >> 1]; + out[3] = alphabet[(in_bytes[1] & 0x01) << 4 | in_bytes[2] >> 4]; + out[4] = alphabet[(in_bytes[2] & 0x0F) << 1 | in_bytes[3] >> 7]; + out[5] = alphabet[(in_bytes[3] & 0x7C) >> 2]; + out[6] = alphabet[(in_bytes[3] & 0x03) << 3 | in_bytes[4] >> 5]; + out[7] = alphabet[(in_bytes[4] & 0x1F)]; +} + +static int GeneralBase32Escape(const unsigned char *src, size_t szsrc, + char *dest, size_t szdest, + const char *alphabet) { + static const char kPad32 = '='; + + if (szsrc == 0) return 0; + + char *cur_dest = dest; + const unsigned char *cur_src = src; + + // Five bytes of data encodes to eight characters of cyphertext. + // So we can pump through three-byte chunks atomically. 
+ while (szsrc > 4) { // keep going until we have less than 40 bits + if ( szdest < 8) return 0; + szdest -= 8; + + GeneralFiveBytesToEightBase32Digits(cur_src, cur_dest, alphabet); + + cur_dest += 8; + cur_src += 5; + szsrc -= 5; + } + + // Now deal with the tail (<=4 bytes). + if (szsrc > 0) { + if ( szdest < 8) return 0; + szdest -= 8; + unsigned char last_chunk[5]; + memcpy(last_chunk, cur_src, szsrc); + + for (size_t i = szsrc; i < 5; ++i) { + last_chunk[i] = '\0'; + } + + GeneralFiveBytesToEightBase32Digits(last_chunk, cur_dest, alphabet); + int filled = (szsrc * 8) / 5 + 1; + cur_dest += filled; + + // Add on the padding. + for (int i = 0; i < (8 - filled); ++i) { + *(cur_dest++) = kPad32; + } + } + + return cur_dest - dest; +} + +static bool GeneralBase32Escape(const string& src, string* dest, + const char *alphabet) { + const int max_escaped_size = CalculateBase32EscapedLen(src.length()); + dest->clear(); + dest->resize(max_escaped_size + 1, '\0'); + const int escaped_len = + GeneralBase32Escape(reinterpret_cast(src.c_str()), + src.length(), &*dest->begin(), dest->size(), + alphabet); + + DCHECK_LE(max_escaped_size, escaped_len); + + if (escaped_len < 0) { + dest->clear(); + return false; + } + + dest->resize(escaped_len); + return true; +} + +static const char Base32Alphabet[] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', '2', '3', '4', '5', '6', '7' + }; + +int Base32Escape(const unsigned char* src, size_t szsrc, + char* dest, size_t szdest) { + return GeneralBase32Escape(src, szsrc, dest, szdest, Base32Alphabet); +} + +bool Base32Escape(const string& src, string* dest) { + return GeneralBase32Escape(src, dest, Base32Alphabet); +} + +void FiveBytesToEightBase32Digits(const unsigned char *in_bytes, char *out) { + GeneralFiveBytesToEightBase32Digits(in_bytes, out, Base32Alphabet); +} + +static const char Base32HexAlphabet[] = { + '0', '1', '2', '3', '4', '5', 
'6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', + 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', + 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', + }; + +int Base32HexEscape(const unsigned char* src, size_t szsrc, + char* dest, size_t szdest) { + return GeneralBase32Escape(src, szsrc, dest, szdest, Base32HexAlphabet); +} + +bool Base32HexEscape(const string& src, string* dest) { + return GeneralBase32Escape(src, dest, Base32HexAlphabet); +} + +int CalculateBase32EscapedLen(size_t input_len) { + DCHECK_LE(input_len, numeric_limits::max() / 8); + size_t intermediate_result = 8 * input_len + 4; + size_t len = intermediate_result / 5; + len = (len + 7) & ~7; + return len; +} + +// ---------------------------------------------------------------------- +// EightBase32DigitsToTenHexDigits() +// Converts an 8-digit base32 string to a 10-digit hex string. +// +// *in must point to 8 base32 digits. +// *out must point to 10 bytes. +// +// Base32 uses A-Z,2-7 to represent the numbers 0-31. +// See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt +// for details on base32. 
+// ---------------------------------------------------------------------- + + +void EightBase32DigitsToTenHexDigits(const char *in, char *out) { + unsigned char bytes[5]; + EightBase32DigitsToFiveBytes(in, bytes); + b2a_hex(bytes, out, 5); +} + +void EightBase32DigitsToFiveBytes(const char *in, unsigned char *bytes_out) { + static const char Base32InverseAlphabet[] = { + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 26/*2*/, 27/*3*/, 28/*4*/, 29/*5*/, 30/*6*/, 31/*7*/, + 99, 99, 99, 99, 99, 00/*=*/, 99, 99, + 99, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/, + 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/, + 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/, + 23/*X*/, 24/*Y*/, 25/*Z*/, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99 + }; + + // Convert to raw bytes. It's easier to just hard code this. 
+ bytes_out[0] = Base32InverseAlphabet[in[0]] << 3 | + Base32InverseAlphabet[in[1]] >> 2; + + bytes_out[1] = Base32InverseAlphabet[in[1]] << 6 | + Base32InverseAlphabet[in[2]] << 1 | + Base32InverseAlphabet[in[3]] >> 4; + + bytes_out[2] = Base32InverseAlphabet[in[3]] << 4 | + Base32InverseAlphabet[in[4]] >> 1; + + bytes_out[3] = Base32InverseAlphabet[in[4]] << 7 | + Base32InverseAlphabet[in[5]] << 2 | + Base32InverseAlphabet[in[6]] >> 3; + + bytes_out[4] = Base32InverseAlphabet[in[6]] << 5 | + Base32InverseAlphabet[in[7]]; +} + +// ---------------------------------------------------------------------- +// TenHexDigitsToEightBase32Digits() +// Converts a 10-digit hex string to an 8-digit base32 string. +// +// *in must point to 10 hex digits. +// *out must point to 8 bytes. +// +// See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt +// for details on base32. +// ---------------------------------------------------------------------- +void TenHexDigitsToEightBase32Digits(const char *in, char *out) { + unsigned char bytes[5]; + + // Convert hex to raw bytes. + a2b_hex(in, bytes, 5); + FiveBytesToEightBase32Digits(bytes, out); +} + +// ---------------------------------------------------------------------- +// EscapeFileName / UnescapeFileName +// ---------------------------------------------------------------------- +static const Charmap escape_file_name_exceptions( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" // letters + "0123456789" // digits + "-_."); + +void EscapeFileName(const StringPiece& src, string* dst) { + // Reserve at least src.size() chars + dst->reserve(dst->size() + src.size()); + + for (char c : src) { + // We do not use "isalpha" because we want the behavior to be + // independent of the current locale settings. 
+ if (escape_file_name_exceptions.contains(c)) { + dst->push_back(c); + + } else if (c == '/') { + dst->push_back('~'); + + } else { + char tmp[2]; + b2a_hex(reinterpret_cast(&c), tmp, 1); + dst->push_back('%'); + dst->append(tmp, 2); + } + } +} + +void UnescapeFileName(const StringPiece& src_piece, string* dst) { + const char* src = src_piece.data(); + const int len = src_piece.size(); + for (int i = 0; i < len; ++i) { + const char c = src[i]; + if (c == '~') { + dst->push_back('/'); + + } else if ((c == '%') && (i + 2 < len)) { + unsigned char tmp[1]; + a2b_hex(src + i + 1, &tmp[0], 1); + dst->push_back(tmp[0]); + i += 2; + + } else { + dst->push_back(c); + } + } +} + +static char hex_value[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, // '0'..'9' + 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 'A'..'F' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 'a'..'f' + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static char hex_char[] = "0123456789abcdef"; + +// This is a templated function so that T can be either a char* +// or a string. This works because we use the [] operator to access +// individual characters at a time. 
+template +static void a2b_hex_t(const char* a, T b, int num) { + for (int i = 0; i < num; i++) { + b[i] = (hex_value[a[i * 2] & 0xFF] << 4) + + (hex_value[a[i * 2 + 1] & 0xFF]); + } +} + +string a2b_bin(const string& a, bool byte_order_msb) { + string result; + const char *data = a.c_str(); + int num_bytes = (a.size()+7)/8; + for (int byte_offset = 0; byte_offset < num_bytes; ++byte_offset) { + unsigned char c = 0; + for (int bit_offset = 0; bit_offset < 8; ++bit_offset) { + if (*data == '\0') + break; + if (*data++ != '0') { + int bits_to_shift = (byte_order_msb) ? 7-bit_offset : bit_offset; + c |= (1 << bits_to_shift); + } + } + result.append(1, c); + } + return result; +} + +// This is a templated function so that T can be either a char* +// or a string. This works because we use the [] operator to access +// individual characters at a time. +template +static void b2a_hex_t(const unsigned char* b, T a, int num) { + for (int i = 0; i < num; i++) { + a[i * 2 + 0] = hex_char[b[i] >> 4]; + a[i * 2 + 1] = hex_char[b[i] & 0xf]; + } +} + +string b2a_bin(const string& b, bool byte_order_msb) { + string result; + for (char c : b) { + for (int bit_offset = 0; bit_offset < 8; ++bit_offset) { + int x = (byte_order_msb) ? 7-bit_offset : bit_offset; + result.append(1, (c & (1 << x)) ? 
'1' : '0'); + } + } + return result; +} + +void b2a_hex(const unsigned char* b, char* a, int num) { + b2a_hex_t(b, a, num); +} + +void a2b_hex(const char* a, unsigned char* b, int num) { + a2b_hex_t(a, b, num); +} + +void a2b_hex(const char* a, char* b, int num) { + a2b_hex_t(a, b, num); +} + +string b2a_hex(const char* b, int len) { + string result; + result.resize(len << 1); + b2a_hex_t(reinterpret_cast(b), result, len); + return result; +} + +string b2a_hex(const StringPiece& b) { + return b2a_hex(b.data(), b.size()); +} + +string a2b_hex(const string& a) { + string result; + a2b_hex(a.c_str(), &result, a.size()/2); + + return result; +} + +void b2a_hex(const unsigned char* from, string* to, int num) { + to->resize(num << 1); + b2a_hex_t(from, *to, num); +} + +void a2b_hex(const char* from, string* to, int num) { + to->resize(num); + a2b_hex_t(from, *to, num); +} + +const char* kDontNeedShellEscapeChars = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.=/:,@"; + +string ShellEscape(StringPiece src) { + if (!src.empty() && // empty string needs quotes + src.find_first_not_of(kDontNeedShellEscapeChars) == StringPiece::npos) { + // only contains chars that don't need quotes; it's fine + return src.ToString(); + } else if (src.find('\'') == StringPiece::npos) { + // no single quotes; just wrap it in single quotes + return StrCat("'", src, "'"); + } else { + // needs double quote escaping + string result = "\""; + for (char c : src) { + switch (c) { + case '\\': + case '$': + case '"': + case '`': + result.push_back('\\'); + }; + result.push_back(c); + } + result.push_back('"'); + return result; + } +} + +static const char kHexTable[513]= + "000102030405060708090a0b0c0d0e0f" + "101112131415161718191a1b1c1d1e1f" + "202122232425262728292a2b2c2d2e2f" + "303132333435363738393a3b3c3d3e3f" + "404142434445464748494a4b4c4d4e4f" + "505152535455565758595a5b5c5d5e5f" + "606162636465666768696a6b6c6d6e6f" + "707172737475767778797a7b7c7d7e7f" + 
"808182838485868788898a8b8c8d8e8f" + "909192939495969798999a9b9c9d9e9f" + "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf" + "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf" + "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf" + "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf" + "e0e1e2e3e4e5e6e7e8e9eaebecedeeef" + "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff"; + +//------------------------------------------------------------------------ +// ByteStringToAscii +// Reads at most bytes_to_read from binary_string and prints it to +// ascii_string in downcased hex. +//------------------------------------------------------------------------ +void ByteStringToAscii(string const &binary_string, int bytes_to_read, + string * ascii_string ) { + if (binary_string.size() < bytes_to_read) { + bytes_to_read = binary_string.size(); + } + + CHECK_GE(bytes_to_read, 0); + ascii_string->resize(bytes_to_read*2); + + string::const_iterator in = binary_string.begin(); + string::iterator out = ascii_string->begin(); + + for (int i = 0; i < bytes_to_read; i++) { + *out++ = kHexTable[(*in)*2]; + *out++ = kHexTable[(*in)*2 + 1]; + ++in; + } +} + +//------------------------------------------------------------------------ +// ByteStringFromAscii +// Converts the hex from ascii_string into binary data and +// writes the binary data into binary_string. +// Empty input successfully converts to empty output. +// Returns false and may modify output if it is +// unable to parse the hex string. 
+//------------------------------------------------------------------------ +bool ByteStringFromAscii(string const & hex_string, string * binary_string) { + binary_string->clear(); + + if ((hex_string.size()%2) != 0) { + return false; + } + + int value = 0; + for (int i = 0; i < hex_string.size(); i++) { + char c = hex_string[i]; + + if (!ascii_isxdigit(c)) { + return false; + } + + if (ascii_isdigit(c)) { + value += c - '0'; + } else if (ascii_islower(c)) { + value += 10 + c - 'a'; + } else { + value += 10 + c - 'A'; + } + + if (i & 1) { + binary_string->push_back(value); + value = 0; + } else { + value <<= 4; + } + } + + return true; +} + +// ---------------------------------------------------------------------- +// CleanStringLineEndings() +// Clean up a multi-line string to conform to Unix line endings. +// Reads from src and appends to dst, so usually dst should be empty. +// +// If there is no line ending at the end of a non-empty string, it can +// be added automatically. +// +// Four different types of input are correctly handled: +// +// - Unix/Linux files: line ending is LF, pass through unchanged +// +// - DOS/Windows files: line ending is CRLF: convert to LF +// +// - Legacy Mac files: line ending is CR: convert to LF +// +// - Garbled files: random line endings, covert gracefully +// lonely CR, lonely LF, CRLF: convert to LF +// +// @param src The multi-line string to convert +// @param dst The converted string is appended to this string +// @param auto_end_last_line Automatically terminate the last line +// +// Limitations: +// +// This does not do the right thing for CRCRLF files created by +// broken programs that do another Unix->DOS conversion on files +// that are already in CRLF format. 
For this, a two-pass approach +// brute-force would be needed that +// +// (1) determines the presence of LF (first one is ok) +// (2) if yes, removes any CR, else convert every CR to LF + +void CleanStringLineEndings(const string& src, string* dst, + bool auto_end_last_line) { + if (dst->empty()) { + dst->append(src); + CleanStringLineEndings(dst, auto_end_last_line); + } else { + string tmp = src; + CleanStringLineEndings(&tmp, auto_end_last_line); + dst->append(tmp); + } +} + +void CleanStringLineEndings(string* str, bool auto_end_last_line) { + int output_pos = 0; + bool r_seen = false; + int len = str->size(); + + char* p = string_as_array(str); + + for (int input_pos = 0; input_pos < len;) { + if (!r_seen && input_pos + 8 < len) { + uint64 v = UNALIGNED_LOAD64(p + input_pos); + // Loop over groups of 8 bytes at a time until we come across + // a word that has a byte whose value is less than or equal to + // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ). + // + // We use a has_less macro that quickly tests a whole 64-bit + // word to see if any of the bytes has a value < N. 
+ // + // For more details, see: + // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord +#define has_less(x, n) (((x)-~0ULL/255*(n))&~(x)&~0ULL/255*128) + if (!has_less(v, '\r' + 1)) { +#undef has_less + // No byte in this word has a value that could be a \r or a \n + if (output_pos != input_pos) + UNALIGNED_STORE64(p + output_pos, v); + input_pos += 8; + output_pos += 8; + continue; + } + } + string::const_reference in = p[input_pos]; + if (in == '\r') { + if (r_seen) + p[output_pos++] = '\n'; + r_seen = true; + } else if (in == '\n') { + if (input_pos != output_pos) + p[output_pos++] = '\n'; + else + output_pos++; + r_seen = false; + } else { + if (r_seen) + p[output_pos++] = '\n'; + r_seen = false; + if (input_pos != output_pos) + p[output_pos++] = in; + else + output_pos++; + } + input_pos++; + } + if (r_seen || (auto_end_last_line + && output_pos > 0 + && p[output_pos - 1] != '\n')) { + str->resize(output_pos + 1); + str->operator[](output_pos) = '\n'; + } else if (output_pos < len) { + str->resize(output_pos); + } +} + + +} // namespace strings diff --git a/src/kudu/gutil/strings/escaping.h b/src/kudu/gutil/strings/escaping.h new file mode 100644 index 000000000000..19e0860839f4 --- /dev/null +++ b/src/kudu/gutil/strings/escaping.h @@ -0,0 +1,677 @@ +// Copyright 2006 Google Inc. All Rights Reserved. +// Authors: Numerous. Principal maintainers are csilvers and zunger. +// +// This is a grab-bag file for string utilities involved in escaping and +// unescaping strings in various ways. Who knew there were so many? +// +// NOTE: Although the functions declared here have been imported into +// the global namespace, the using statements are slated for removal. +// Do not refer to these symbols without properly namespace-qualifying +// them with "strings::". Of course you may also use "using" statements +// within a .cc file. +// +// There are more escaping functions in: +// webutil/html/tagutils.h (Escaping strings for HTML, PRE, JavaScript, etc.) 
+// webutil/url/url.h (Escaping for URL's, both RFC-2396 and other methods) +// template/template_modifiers.h (All sorts of stuff) +// util/regex/re2/re2.h (Escaping for literals within regular expressions +// - see RE2::QuoteMeta). +// And probably many more places, as well. + +#ifndef STRINGS_ESCAPING_H_ +#define STRINGS_ESCAPING_H_ + +#include +#include +using std::string; +#include +using std::vector; + +#include + +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/charset.h" +#include "kudu/gutil/strings/stringpiece.h" + +namespace strings { + +// ---------------------------------------------------------------------- +// EscapeStrForCSV() +// Escapes the quotes in 'src' by doubling them. This is necessary +// for generating CSV files (see SplitCSVLine). +// Returns the number of characters written into dest (not counting +// the \0) or -1 if there was insufficient space. +// +// Example: [some "string" to test] --> [some ""string"" to test] +// ---------------------------------------------------------------------- +int EscapeStrForCSV(const char* src, char* dest, int dest_len); + +// ---------------------------------------------------------------------- +// UnescapeCEscapeSequences() +// Copies "source" to "dest", rewriting C-style escape sequences +// -- '\n', '\r', '\\', '\ooo', etc -- to their ASCII +// equivalents. "dest" must be sufficiently large to hold all +// the characters in the rewritten string (i.e. at least as large +// as strlen(source) + 1 should be safe, since the replacements +// are always shorter than the original escaped sequences). It's +// safe for source and dest to be the same. RETURNS the length +// of dest. +// +// It allows hex sequences \xhh, or generally \xhhhhh with an +// arbitrary number of hex digits, but all of them together must +// specify a value of a single byte (e.g. \x0045 is equivalent +// to \x45, and \x1234 is erroneous). 
If the value is too large, +// it is truncated to 8 bits and an error is set. This is also +// true of octal values that exceed 0xff. +// +// It also allows escape sequences of the form \uhhhh (exactly four +// hex digits, upper or lower case) or \Uhhhhhhhh (exactly eight +// hex digits, upper or lower case) to specify a Unicode code +// point. The dest array will contain the UTF8-encoded version of +// that code-point (e.g., if source contains \u2019, then dest will +// contain the three bytes 0xE2, 0x80, and 0x99). For the inverse +// transformation, use UniLib::UTF8EscapeString +// (util/utf8/public/unilib.h), not CEscapeString. +// +// Errors: In the first form of the call, errors are reported with +// LOG(ERROR). The same is true for the second form of the call if +// the pointer to the string vector is NULL; otherwise, error +// messages are stored in the vector. In either case, the effect on +// the dest array is not defined, but rest of the source will be +// processed. +// +// *** DEPRECATED: Use CUnescape() in new code *** +// ---------------------------------------------------------------------- +int UnescapeCEscapeSequences(const char* source, char* dest); +int UnescapeCEscapeSequences(const char* source, char* dest, + vector* errors); + +// ---------------------------------------------------------------------- +// UnescapeCEscapeString() +// This does the same thing as UnescapeCEscapeSequences, but creates +// a new string. The caller does not need to worry about allocating +// a dest buffer. This should be used for non performance critical +// tasks such as printing debug messages. It is safe for src and dest +// to be the same. +// +// The second call stores its errors in a supplied string vector. +// If the string vector pointer is NULL, it reports the errors with LOG(). +// +// In the first and second calls, the length of dest is returned. In the +// the third call, the new string is returned. 
+// +// *** DEPRECATED: Use CUnescape() in new code *** +// ---------------------------------------------------------------------- +int UnescapeCEscapeString(const string& src, string* dest); +int UnescapeCEscapeString(const string& src, string* dest, + vector* errors); +string UnescapeCEscapeString(const string& src); + +// ---------------------------------------------------------------------- +// CUnescape() +// Copies "source" to "dest", rewriting C-style escape sequences +// -- '\n', '\r', '\\', '\ooo', etc -- to their ASCII +// equivalents. "dest" must be sufficiently large to hold all +// the characters in the rewritten string (i.e. at least as large +// as source.size() should be safe, since the replacements +// are never longer than the original escaped sequences). It's +// safe for source and dest to be the same. RETURNS true if +// conversion was successful, false otherwise. Stores the size of +// the result in 'dest_len'. +// +// It allows hex sequences \xhh, or generally \xhhhhh with an +// arbitrary number of hex digits, but all of them together must +// specify a value of a single byte (e.g. \x0045 is equivalent +// to \x45, and \x1234 is erroneous). If the value is too large, +// an error is set. This is also true of octal values that exceed 0xff. +// +// It also allows escape sequences of the form \uhhhh (exactly four +// hex digits, upper or lower case) or \Uhhhhhhhh (exactly eight +// hex digits, upper or lower case) to specify a Unicode code +// point. The dest array will contain the UTF8-encoded version of +// that code-point (e.g., if source contains \u2019, then dest will +// contain the three bytes 0xE2, 0x80, and 0x99). For the inverse +// transformation, use UniLib::UTF8EscapeString +// (util/utf8/public/unilib.h), not CEscapeString. +// +// Errors: Sets the description of the first encountered error in +// 'error'. To disable error reporting, set 'error' to NULL. 
+// ---------------------------------------------------------------------- +bool CUnescape(const StringPiece& source, char* dest, int* dest_len, + string* error); + +bool CUnescape(const StringPiece& source, string* dest, string* error); + +// A version with no error reporting. +inline bool CUnescape(const StringPiece& source, string* dest) { + return CUnescape(source, dest, NULL); +} + +// ---------------------------------------------------------------------- +// CUnescapeForNullTerminatedString() +// +// This has the same behavior as CUnescape, except that each octal, hex, +// or Unicode escape sequence that resolves to a null character ('\0') +// is left in its original escaped form. The result is a +// display-formatted string that can be interpreted as a null-terminated +// const char* and will not be cut short if it contains embedded null +// characters. +// +// ---------------------------------------------------------------------- + +bool CUnescapeForNullTerminatedString(const StringPiece& source, + char* dest, + int* dest_len, + string* error); + +bool CUnescapeForNullTerminatedString(const StringPiece& source, + string* dest, + string* error); + +// A version with no error reporting. +inline bool CUnescapeForNullTerminatedString(const StringPiece& source, + string* dest) { + return CUnescapeForNullTerminatedString(source, dest, NULL); +} + +// ---------------------------------------------------------------------- +// CEscapeString() +// CHexEscapeString() +// Utf8SafeCEscapeString() +// Utf8SafeCHexEscapeString() +// Copies 'src' to 'dest', escaping dangerous characters using +// C-style escape sequences. This is very useful for preparing query +// flags. 'src' and 'dest' should not overlap. The 'Hex' version uses +// hexadecimal rather than octal sequences. The 'Utf8Safe' version +// doesn't touch UTF-8 bytes. +// Returns the number of bytes written to 'dest' (not including the \0) +// or -1 if there was insufficient space. 
+// +// Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped. +// ---------------------------------------------------------------------- +int CEscapeString(const char* src, int src_len, char* dest, int dest_len); +int CHexEscapeString(const char* src, int src_len, char* dest, int dest_len); +int Utf8SafeCEscapeString(const char* src, int src_len, char* dest, + int dest_len); +int Utf8SafeCHexEscapeString(const char* src, int src_len, char* dest, + int dest_len); + +// ---------------------------------------------------------------------- +// CEscape() +// CHexEscape() +// Utf8SafeCEscape() +// Utf8SafeCHexEscape() +// More convenient form of CEscapeString: returns result as a "string". +// This version is slower than CEscapeString() because it does more +// allocation. However, it is much more convenient to use in +// non-speed-critical code like logging messages etc. +// ---------------------------------------------------------------------- +string CEscape(const StringPiece& src); +string CHexEscape(const StringPiece& src); +string Utf8SafeCEscape(const StringPiece& src); +string Utf8SafeCHexEscape(const StringPiece& src); + +// ---------------------------------------------------------------------- +// BackslashEscape() +// Given a string and a list of characters to escape, replace any +// instance of one of those characters with \ + that character. For +// example, when exporting maps to /varz, label values need to have +// all dots escaped. Appends the result to dest. +// BackslashUnescape() +// Replace \ + any of the indicated "unescape me" characters with just +// that character. Appends the result to dest. +// +// IMPORTANT: +// This function does not escape \ by default, so if you do not include +// it in the chars to escape you will most certainly get an undesirable +// result. 
That is, it won't be a reversible operation: +// string src = "foo\\:bar"; +// BackslashUnescape(BackslashEscape(src, ":"), ":") == "foo\\\\:bar" +// On the other hand, for all strings "src", the following is true: +// BackslashUnescape(BackslashEscape(src, ":\\"), ":\\") == src +// ---------------------------------------------------------------------- +void BackslashEscape(const StringPiece& src, + const strings::CharSet& to_escape, + string* dest); +void BackslashUnescape(const StringPiece& src, + const strings::CharSet& to_unescape, + string* dest); + +inline string BackslashEscape(const StringPiece& src, + const strings::CharSet& to_escape) { + string s; + BackslashEscape(src, to_escape, &s); + return s; +} + +inline string BackslashUnescape(const StringPiece& src, + const strings::CharSet& to_unescape) { + string s; + BackslashUnescape(src, to_unescape, &s); + return s; +} + +// ---------------------------------------------------------------------- +// QuotedPrintableUnescape() +// Check out http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for +// more details, only briefly implemented. But from the web... +// Quoted-printable is an encoding method defined in the MIME +// standard. It is used primarily to encode 8-bit text (such as text +// that includes foreign characters) into 7-bit US ASCII, creating a +// document that is mostly readable by humans, even in its encoded +// form. All MIME compliant applications can decode quoted-printable +// text, though they may not necessarily be able to properly display the +// document as it was originally intended. As quoted-printable encoding +// is implemented most commonly, printable ASCII characters (values 33 +// through 126, excluding 61), tabs and spaces that do not appear at the +// end of lines, and end-of-line characters are not encoded. Other +// characters are represented by an equal sign (=) immediately followed +// by that character's hexadecimal value. 
Lines that are longer than 76 +// characters are shortened by line breaks, with the equal sign marking +// where the breaks occurred. +// +// Note that QuotedPrintableUnescape is different from 'Q'-encoding as +// defined in rfc2047. In particular, This does not treat '_'s as spaces. +// +// See QEncodingUnescape(). +// +// Copies "src" to "dest", rewriting quoted printable escape sequences +// =XX to their ASCII equivalents. src is not null terminated, instead +// specify len. I recommend that slen= 0) && (i <= 15)); + return ((i < 10) ? (i + '0') : ((i - 10) + 'A')); +} + +inline int int_to_lower_hex_digit(int i) { + DCHECK((i >= 0) && (i <= 15)); + return (i < 10) ? (i + '0') : ((i - 10) + 'a'); +} + +inline int hex_digit_to_int(char c) { + /* Assume ASCII. */ + DCHECK('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61); + DCHECK(ascii_isxdigit(c)); + int x = static_cast(c); + if (x > '9') { + x += 9; + } + return x & 0xf; +} + +// ---------------------------------------------------------------------- +// a2b_hex() +// Description: Ascii-to-Binary hex conversion. This converts +// 2*'num' hexadecimal characters to 'num' binary data. +// Return value: 'num' bytes of binary data (via the 'to' argument) +// ---------------------------------------------------------------------- +void a2b_hex(const char* from, unsigned char* to, int num); +void a2b_hex(const char* from, char* to, int num); +void a2b_hex(const char* from, string* to, int num); +string a2b_hex(const string& a); + +// ---------------------------------------------------------------------- +// a2b_bin() +// Description: Ascii-to-Binary binary conversion. This converts +// a.size() binary characters (ascii '0' or '1') to +// ceil(a.size()/8) bytes of binary data. The first character is +// considered the most significant if byte_order_msb is set. a is +// considered to be padded with trailing 0s if its size is not a +// multiple of 8. 
+// Return value: ceil(a.size()/8) bytes of binary data +// ---------------------------------------------------------------------- +string a2b_bin(const string& a, bool byte_order_msb); + +// ---------------------------------------------------------------------- +// b2a_hex() +// Description: Binary-to-Ascii hex conversion. This converts +// 'num' bytes of binary to a 2*'num'-character hexadecimal representation +// Return value: 2*'num' characters of ascii text (via the 'to' argument) +// ---------------------------------------------------------------------- +void b2a_hex(const unsigned char* from, char* to, int num); +void b2a_hex(const unsigned char* from, string* to, int num); + +// ---------------------------------------------------------------------- +// b2a_hex() +// Description: Binary-to-Ascii hex conversion. This converts +// 'num' bytes of binary to a 2*'num'-character hexadecimal representation +// Return value: 2*'num' characters of ascii string +// ---------------------------------------------------------------------- +string b2a_hex(const char* from, int num); +string b2a_hex(const StringPiece& b); + +// ---------------------------------------------------------------------- +// b2a_bin() +// Description: Binary-to-Ascii binary conversion. This converts +// b.size() bytes of binary to a 8*b.size() character representation +// (ascii '0' or '1'). The highest order bit in each byte is returned +// first in the string if byte_order_msb is set. +// Return value: 8*b.size() characters of ascii text +// ---------------------------------------------------------------------- +string b2a_bin(const string& b, bool byte_order_msb); + +// ---------------------------------------------------------------------- +// ShellEscape +// Make a shell command argument from a string. 
+// Returns a Bourne shell string literal such that, once the shell finishes +// expanding the argument, the argument passed on to the program being +// run will be the same as whatever you passed in. +// NOTE: This is "ported" from python2.2's commands.mkarg(); it should be +// safe for Bourne shell syntax (i.e. sh, bash), but mileage may vary +// with other shells. +// ---------------------------------------------------------------------- +string ShellEscape(StringPiece src); + +// Runs ShellEscape() on the arguments, concatenates them with a space, and +// returns the resulting string. +template +string ShellEscapeCommandLine(InputIterator begin, const InputIterator& end) { + string result; + for (; begin != end; ++begin) { + if (!result.empty()) result.append(" "); + result.append(ShellEscape(*begin)); + } + return result; +} + +// Reads at most bytes_to_read from binary_string and writes it to +// ascii_string in lower case hex. +void ByteStringToAscii(const string& binary_string, int bytes_to_read, + string* ascii_string); + +inline string ByteStringToAscii(const string& binary_string, + int bytes_to_read) { + string result; + ByteStringToAscii(binary_string, bytes_to_read, &result); + return result; +} + +// Converts the hex from ascii_string into binary data and +// writes the binary data into binary_string. +// Empty input successfully converts to empty output. +// Returns false and may modify output if it is +// unable to parse the hex string. +bool ByteStringFromAscii(const string& ascii_string, string* binary_string); + +// Clean up a multi-line string to conform to Unix line endings. +// Reads from src and appends to dst, so usually dst should be empty. +// If there is no line ending at the end of a non-empty string, it can +// be added automatically. 
+// +// Four different types of input are correctly handled: +// +// - Unix/Linux files: line ending is LF, pass through unchanged +// +// - DOS/Windows files: line ending is CRLF: convert to LF +// +// - Legacy Mac files: line ending is CR: convert to LF +// +// - Garbled files: random line endings, covert gracefully +// lonely CR, lonely LF, CRLF: convert to LF +// +// @param src The multi-line string to convert +// @param dst The converted string is appended to this string +// @param auto_end_last_line Automatically terminate the last line +// +// Limitations: +// +// This does not do the right thing for CRCRLF files created by +// broken programs that do another Unix->DOS conversion on files +// that are already in CRLF format. +void CleanStringLineEndings(const string& src, string* dst, + bool auto_end_last_line); + +// Same as above, but transforms the argument in place. +void CleanStringLineEndings(string* str, bool auto_end_last_line); + +} // namespace strings + +// The following functions used to be defined in strutil.h in the top-level +// namespace, so we alias them here. Do not add new functions here. +// +// Talk to him if you want to help. +// +// DEPRECATED(mec): Using these names in the global namespace is deprecated. +// Use the strings:: names. 
+ +using strings::EscapeStrForCSV; +using strings::UnescapeCEscapeSequences; +using strings::UnescapeCEscapeString; +using strings::CEscapeString; +using strings::CHexEscapeString; +using strings::CEscape; +using strings::CHexEscape; +using strings::BackslashEscape; +using strings::BackslashUnescape; +using strings::QuotedPrintableUnescape; +using strings::QEncodingUnescape; +using strings::Base64Unescape; +using strings::WebSafeBase64Unescape; +using strings::CalculateBase64EscapedLen; +using strings::Base64Escape; +using strings::WebSafeBase64Escape; +using strings::WebSafeBase64EscapeWithPadding; +using strings::Base32Escape; +using strings::Base32HexEscape; +using strings::CalculateBase32EscapedLen; +using strings::EightBase32DigitsToTenHexDigits; +using strings::TenHexDigitsToEightBase32Digits; +using strings::EightBase32DigitsToFiveBytes; +using strings::FiveBytesToEightBase32Digits; +using strings::int_to_hex_digit; +using strings::int_to_lower_hex_digit; +using strings::hex_digit_to_int; +using strings::a2b_hex; +using strings::a2b_bin; +using strings::b2a_hex; +using strings::b2a_bin; +using strings::ShellEscape; +using strings::ShellEscapeCommandLine; +using strings::ByteStringFromAscii; +using strings::ByteStringToAscii; +using strings::CleanStringLineEndings; + +#endif // STRINGS_ESCAPING_H_ diff --git a/src/kudu/gutil/strings/fastmem.h b/src/kudu/gutil/strings/fastmem.h new file mode 100644 index 000000000000..3beca6d10b9f --- /dev/null +++ b/src/kudu/gutil/strings/fastmem.h @@ -0,0 +1,130 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// +// Fast memory copying and comparison routines. +// strings::fastmemcmp_inlined() replaces memcmp() +// strings::memcpy_inlined() replaces memcpy() +// strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 +// +// strings::*_inlined() routines are inline versions of the +// routines exported by this module. Sometimes using the inlined +// versions is faster. Measure before using the inlined versions. 
+// +// Performance measurement: +// strings::fastmemcmp_inlined +// Analysis: memcmp, fastmemcmp_inlined, fastmemcmp +// 2012-01-30 + +#ifndef STRINGS_FASTMEM_H_ +#define STRINGS_FASTMEM_H_ + +#include +#include +#include +#include + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/port.h" + +namespace strings { + +// Return true if the n bytes at a equal the n bytes at b. +// The regions are allowed to overlap. +// +// The performance is similar to the performance memcmp(), but faster for +// moderately-sized inputs, or inputs that share a common prefix and differ +// somewhere in their last 8 bytes. Further optimizations can be added later +// if it makes sense to do so. +inline bool memeq(const void* a_v, const void* b_v, size_t n) { + const uint8_t *a = reinterpret_cast(a_v); + const uint8_t *b = reinterpret_cast(b_v); + + size_t n_rounded_down = n & ~static_cast(7); + if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 + return memcmp(a, b, n) == 0; + } + // n >= 8 + uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); + uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); + if ((u | v) != 0) { // The first or last 8 bytes differ. + return false; + } + a += 8; + b += 8; + n = n_rounded_down - 8; + if (n > 128) { + // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 + // instructions, and while we could try to do something faster, it + // doesn't seem worth pursuing. + return memcmp(a, b, n) == 0; + } + for (; n >= 16; n -= 16) { + uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); + uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); + if ((x | y) != 0) { + return false; + } + a += 16; + b += 16; + } + // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. 
+ return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b); +} + +inline int fastmemcmp_inlined(const void *a_void, const void *b_void, size_t n) { + const uint8_t *a = reinterpret_cast(a_void); + const uint8_t *b = reinterpret_cast(b_void); + + if (n >= 64) { + return memcmp(a, b, n); + } + const void* a_limit = a + n; + const size_t sizeof_uint64 = sizeof(uint64); // NOLINT(runtime/sizeof) + while (a + sizeof_uint64 <= a_limit && + UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { + a += sizeof_uint64; + b += sizeof_uint64; + } + const size_t sizeof_uint32 = sizeof(uint32); // NOLINT(runtime/sizeof) + if (a + sizeof_uint32 <= a_limit && + UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { + a += sizeof_uint32; + b += sizeof_uint32; + } + while (a < a_limit) { + int d = static_cast(*a++) - static_cast(*b++); + if (d) return d; + } + return 0; +} + +// The standard memcpy operation is slow for variable small sizes. +// This implementation inlines the optimal realization for sizes 1 to 16. +// To avoid code bloat don't use it in case of not performance-critical spots, +// nor when you don't expect very frequent values of size <= 16. +inline void memcpy_inlined(void *dst, const void *src, size_t size) { + // Compiler inlines code with minimal amount of data movement when third + // parameter of memcpy is a constant. 
+ switch (size) { + case 1: memcpy(dst, src, 1); break; + case 2: memcpy(dst, src, 2); break; + case 3: memcpy(dst, src, 3); break; + case 4: memcpy(dst, src, 4); break; + case 5: memcpy(dst, src, 5); break; + case 6: memcpy(dst, src, 6); break; + case 7: memcpy(dst, src, 7); break; + case 8: memcpy(dst, src, 8); break; + case 9: memcpy(dst, src, 9); break; + case 10: memcpy(dst, src, 10); break; + case 11: memcpy(dst, src, 11); break; + case 12: memcpy(dst, src, 12); break; + case 13: memcpy(dst, src, 13); break; + case 14: memcpy(dst, src, 14); break; + case 15: memcpy(dst, src, 15); break; + case 16: memcpy(dst, src, 16); break; + default: memcpy(dst, src, size); break; + } +} + +} // namespace strings + +#endif // STRINGS_FASTMEM_H_ diff --git a/src/kudu/gutil/strings/human_readable.cc b/src/kudu/gutil/strings/human_readable.cc new file mode 100644 index 000000000000..fb3419a721ef --- /dev/null +++ b/src/kudu/gutil/strings/human_readable.cc @@ -0,0 +1,428 @@ +// Copyright 2007 Google Inc. All Rights Reserved. + +#include "kudu/gutil/strings/human_readable.h" + +#include +#include +#include + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/strip.h" + +namespace { + +template +const char* GetNegStr(T* value) { + if (*value < 0) { + *value = -(*value); + return "-"; + } else { + return ""; + } +} + +} // namespace + +bool HumanReadableNumBytes::LessThan(const string &a, const string &b) { + int64 a_bytes, b_bytes; + if (!HumanReadableNumBytes::ToInt64(a, &a_bytes)) + a_bytes = 0; + if (!HumanReadableNumBytes::ToInt64(b, &b_bytes)) + b_bytes = 0; + return (a_bytes < b_bytes); +} + +bool HumanReadableNumBytes::ToInt64(const string &str, int64 *num_bytes) { + const char *cstr = str.c_str(); + bool neg = (*cstr == '-'); + if (neg) { + cstr++; + } + char *end; + double d = strtod(cstr, &end); + // If this didn't consume the entire string, fail. 
+ if ((end - str.c_str()) + 1 < str.size()) + return false; + int64 scale = 1; + switch (*end) { + // NB: an int64 can only go up to <8 EB. + case 'E': scale <<= 10; // Fall through... + case 'P': scale <<= 10; + case 'T': scale <<= 10; + case 'G': scale <<= 10; + case 'M': scale <<= 10; + case 'K': + case 'k': scale <<= 10; + case 'B': + case '\0': break; // To here. + default: + return false; + } + d *= scale; + if (d > kint64max || d < 0) + return false; + *num_bytes = static_cast(d + 0.5); + if (neg) { + *num_bytes = -*num_bytes; + } + return true; +} + +bool HumanReadableNumBytes::ToDouble(const string &str, double *num_bytes) { + char *end; + double d = strtod(str.c_str(), &end); + // If this didn't consume the entire string, fail. + if ((end - str.c_str()) + 1 < str.size()) + return false; + const char scale = *end; + switch (scale) { + case 'Y': d *= 1024.0; // That's a yotta bytes! + case 'Z': d *= 1024.0; + case 'E': d *= 1024.0; + case 'P': d *= 1024.0; + case 'T': d *= 1024.0; + case 'G': d *= 1024.0; + case 'M': d *= 1024.0; + case 'K': + case 'k': d *= 1024.0; + case 'B': + case '\0': break; // to here. + default: + return false; + } + *num_bytes = d; + return true; +} + +string HumanReadableNumBytes::DoubleToString(double num_bytes) { + const char *neg_str = GetNegStr(&num_bytes); + static const char units[] = "BKMGTPEZY"; + double scaled = num_bytes; + int i = 0; + for (; i < arraysize(units) && scaled >= 1024.0; ++i) { + scaled /= 1024.0; + } + if (i == arraysize(units)) { + return StringPrintf("%s%g", neg_str, num_bytes); + } else { + return StringPrintf("%s%.2f%c", neg_str, scaled, units[i]); + } +} + +string HumanReadableNumBytes::ToString(int64 num_bytes) { + if (num_bytes == kint64min) { + // Special case for number with not representable nagation. + return "-8E"; + } + + const char *neg_str = GetNegStr(&num_bytes); + + // Special case for bytes. + if (num_bytes < GG_LONGLONG(1024)) { + // No fractions for bytes. 
+ return StringPrintf("%s%" PRId64 "B", neg_str, num_bytes); + } + + static const char units[] = "KMGTPE"; // int64 only goes up to E. + const char* unit = units; + while (num_bytes >= GG_LONGLONG(1024) * GG_LONGLONG(1024)) { + num_bytes /= GG_LONGLONG(1024); + ++unit; + CHECK(unit < units + arraysize(units)); + } + + return StringPrintf(((*unit == 'K') + ? "%s%.1f%c" + : "%s%.2f%c"), neg_str, num_bytes / 1024.0, *unit); +} + +string HumanReadableNumBytes::ToStringWithoutRounding(int64 num_bytes) { + if (num_bytes == kint64min) { + // Special case for number with not representable nagation. + return "-8E"; + } + + const char *neg_str = GetNegStr(&num_bytes); + static const char units[] = "BKMGTPE"; // int64 only goes up to E. + + int64 num_units = num_bytes; + int unit_type = 0; + for (; unit_type < arraysize(units); unit_type++) { + if (num_units % 1024 != 0) { + // Not divisible by the next unit. + break; + } + + int64 next_units = num_units >> 10; + if (next_units == 0) { + // Less than the next unit. + break; + } + + num_units = next_units; + } + return StringPrintf("%s%" PRId64 "%c", neg_str, num_units, units[unit_type]); +} + +string HumanReadableInt::ToString(int64 value) { + string s; + if (value < 0) { + s += "-"; + value = -value; + } + if (value < GG_LONGLONG(1000)) { + StringAppendF(&s, "%" PRId64, value); + } else if (value >= GG_LONGLONG(1000000000000000)) { + // Number bigger than 1E15; use that notation. 
+ StringAppendF(&s, "%0.3G", static_cast(value)); + } else { + static const char units[] = "kMBT"; + const char *unit = units; + while (value >= GG_LONGLONG(1000000)) { + value /= GG_LONGLONG(1000); + ++unit; + CHECK(unit < units + arraysize(units)); + } + StringAppendF(&s, "%.2f%c", value / 1000.0, *unit); + } + return s; +} + +string HumanReadableNum::ToString(int64 value) { + return HumanReadableInt::ToString(value); +} + +string HumanReadableNum::DoubleToString(double value) { + string s; + if (value < 0) { + s += "-"; + value = -value; + } + if (value < 1.0) { + StringAppendF(&s, "%.3f", value); + } else if (value < 10) { + StringAppendF(&s, "%.2f", value); + } else if (value < 1e2) { + StringAppendF(&s, "%.1f", value); + } else if (value < 1e3) { + StringAppendF(&s, "%.0f", value); + } else if (value >= 1e15) { + // Number bigger than 1E15; use that notation. + StringAppendF(&s, "%0.3G", value); + } else { + static const char units[] = "kMBT"; + const char *unit = units; + while (value >= 1e6) { + value /= 1e3; + ++unit; + CHECK(unit < units + arraysize(units)); + } + StringAppendF(&s, "%.2f%c", value / 1000.0, *unit); + } + return s; +} + +bool HumanReadableNum::ToDouble(const string &str, double *value) { + char *end; + double d = strtod(str.c_str(), &end); + // Allow the string to contain at most one extra character: + if ((end - str.c_str()) + 1 < str.size()) + return false; + const char scale = *end; + if ((scale == 'k') || (scale == 'K')) { + d *= 1e3; + } else if (scale == 'M') { + d *= 1e6; + } else if (scale == 'B') { + d *= 1e9; + } else if (scale == 'T') { + d *= 1e12; + } else if (scale != '\0') { + return false; + } + *value = d; + return true; +} + +bool HumanReadableInt::ToInt64(const string &str, int64 *value) { + char *end; + double d = strtod(str.c_str(), &end); + if (d > kint64max || d < kint64min) + return false; + if (*end == 'k') { + d *= 1000; + } else if (*end == 'M') { + d *= 1e6; + } else if (*end == 'B') { + d *= 1e9; + } else if 
(*end == 'T') { + d *= 1e12; + } else if (*end != '\0') { + return false; + } + *value = static_cast(d < 0 ? d - 0.5 : d + 0.5); + return true; +} + +// Abbreviations used here are acceptable English abbreviations +// without the ending period (".") for brevity, except for uncommon +// abbreviations, in which case the entire word is spelled out. ("mo" +// and "mos" are not good abbreviations for "months" -- with or +// without the period). If needed, one can add a +// HumanReadableTime::ToStringShort() for shorter abbreviations or one +// for always spelling out the unit, HumanReadableTime::ToStringLong(). +string HumanReadableElapsedTime::ToShortString(double seconds) { + string human_readable; + + if (seconds < 0) { + human_readable = "-"; + seconds = -seconds; + } + + // Start with ns and keep going up to years. + if (seconds < 0.000001) { + StringAppendF(&human_readable, "%0.3g ns", seconds * 1000000000.0); + return human_readable; + } + if (seconds < 0.001) { + StringAppendF(&human_readable, "%0.3g us", seconds * 1000000.0); + return human_readable; + } + if (seconds < 1.0) { + StringAppendF(&human_readable, "%0.3g ms", seconds * 1000.0); + return human_readable; + } + if (seconds < 60.0) { + StringAppendF(&human_readable, "%0.3g s", seconds); + return human_readable; + } + seconds /= 60.0; + if (seconds < 60.0) { + StringAppendF(&human_readable, "%0.3g min", seconds); + return human_readable; + } + seconds /= 60.0; + if (seconds < 24.0) { + StringAppendF(&human_readable, "%0.3g h", seconds); + return human_readable; + } + seconds /= 24.0; + if (seconds < 30.0) { + StringAppendF(&human_readable, "%0.3g days", seconds); + return human_readable; + } + if (seconds < 365.2425) { + StringAppendF(&human_readable, "%0.3g months", seconds / 30.436875); + return human_readable; + } + seconds /= 365.2425; + StringAppendF(&human_readable, "%0.3g years", seconds); + return human_readable; +} + +bool HumanReadableElapsedTime::ToDouble(const string& str, double* value) { + 
struct TimeUnits { + const char* unit; // unit name + double seconds; // number of seconds in that unit (minutes => 60) + }; + + // These must be sorted in decreasing length. In particulary, a + // string must exist before and of its substrings or the substring + // will match; + static const TimeUnits kUnits[] = { + // Long forms + { "nanosecond", 0.000000001 }, + { "microsecond", 0.000001 }, + { "millisecond", 0.001 }, + { "second", 1.0 }, + { "minute", 60.0 }, + { "hour", 3600.0 }, + { "day", 86400.0 }, + { "week", 7 * 86400.0 }, + { "month", 30 * 86400.0 }, + { "year", 365 * 86400.0 }, + + // Abbreviated forms + { "nanosec", 0.000000001 }, + { "microsec", 0.000001 }, + { "millisec", 0.001 }, + { "sec", 1.0 }, + { "min", 60.0 }, + { "hr", 3600.0 }, + { "dy", 86400.0 }, + { "wk", 7 * 86400.0 }, + { "mon", 30 * 86400.0 }, + { "yr", 365 * 86400.0 }, + + // nano -> n + { "nsecond", 0.000000001 }, + { "nsec", 0.000000001 }, + // micro -> u + { "usecond", 0.000001 }, + { "usec", 0.000001 }, + // milli -> m + { "msecond", 0.001 }, + { "msec", 0.001 }, + + // Ultra-short form + { "ns", 0.000000001 }, + { "us", 0.000001 }, + { "ms", 0.001 }, + { "s", 1.0 }, + { "m", 60.0 }, + { "h", 3600.0 }, + { "d", 86400.0 }, + { "w", 7 * 86400.0 }, + { "M", 30 * 86400.0 }, // upper-case M to disambiguate with minute + { "y", 365 * 86400.0 } + }; + + char* unit_start; // Start of unit name. + double work_value = 0; + int sign = 1; + const char* interval_start = SkipLeadingWhiteSpace(str.c_str()); + if (*interval_start == '-') { + sign = -1; + interval_start = SkipLeadingWhiteSpace(interval_start + 1); + } else if (*interval_start == '+') { + interval_start = SkipLeadingWhiteSpace(interval_start + 1); + } + if (!*interval_start) { + // Empty string and strings with just a sign are illegal. + return false; + } + do { + // Leading signs on individual values are not allowed. 
+ if (*interval_start == '-' || *interval_start == '+') { + return false; + } + double factor = strtod(interval_start, &unit_start); + if (interval_start == unit_start) { + // Illegally formatted value, no values consumed by strtod. + return false; + } + unit_start = SkipLeadingWhiteSpace(unit_start); + bool found_unit = false; + for (int i = 0; !found_unit && i < ARRAYSIZE(kUnits); ++i) { + const size_t unit_len = strlen(kUnits[i].unit); + if (strncmp(unit_start, kUnits[i].unit, unit_len) == 0) { + work_value += factor * kUnits[i].seconds; + interval_start = unit_start + unit_len; + // Allowing pluralization of any unit (except empty string) + if (unit_len > 0 && *interval_start == 's') { + interval_start++; + } + found_unit = true; + } + } + if (!found_unit) { + return false; + } + interval_start = SkipLeadingWhiteSpace(interval_start); + } while (*interval_start); + + *value = sign * work_value; + return true; +} diff --git a/src/kudu/gutil/strings/human_readable.h b/src/kudu/gutil/strings/human_readable.h new file mode 100644 index 000000000000..e05b169b45ad --- /dev/null +++ b/src/kudu/gutil/strings/human_readable.h @@ -0,0 +1,162 @@ +// Copyright 2007 Google Inc. All Rights Reserved. +// +// A collection of methods to convert back and forth between a number +// and a human-readable string representing the number. + +#ifndef STRINGS_HUMAN_READABLE_H__ +#define STRINGS_HUMAN_READABLE_H__ + +#include +using std::binary_function; +using std::less; +#include +using std::string; + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" + +// WARNING +// HumanReadable{NumBytes, Int} don't give you the standard set of SI prefixes. +// +// HumanReadableNumBytes uses binary powers -- 1M = 1 << 20 -- but for numbers +// less than 1024, it adds the suffix "B" for "bytes." It is OK when you need +// to print a literal number of bytes, but can be awfully confusing for +// anything else. 
+// +// HumanReadableInt uses decimal powers -- 1M = 10^3 -- but prints +// 'B'-for-billion instead of 'G'-for-giga. It's good for representing +// true numbers, like how many documents are in a repository. +// HumanReadableNum is the same as HumanReadableInt but has additional +// support for DoubleToString(), where smaller numbers will print more +// (up to 3) decimal places. +// +// If you want SI prefixes, use the functions in si_prefix.h instead; for +// example, strings::si_prefix::ToDecimalString(1053.2) == "1.05k". + +class HumanReadableNumBytes { + public: + // Converts between an int64 representing a number of bytes and a + // human readable string representing the same number. + // e.g. 1000000 -> "976.6K". + // Note that calling these two functions in succession isn't a + // noop, since ToString() may round. + static bool ToInt64(const string &str, int64 *num_bytes); + static string ToString(int64 num_bytes); + // Like ToString but without rounding. For example 1025 would return + // "1025B" rather than "1.0K". Uses the largest common denominator. + static string ToStringWithoutRounding(int64 num_bytes); + + static bool ToDouble(const string &str, double *num_bytes); + // Function overloading this with a function that takes an int64 is just + // asking for trouble. + static string DoubleToString(double num_bytes); + + // TODO(user): Maybe change this class to use SIPrefix? + + // ---------------------------------------------------------------------- + // LessThan + // humanreadablebytes_less + // humanreadablebytes_greater + // These numerically compare the values encoded in strings by + // ToString(). Strings which cannot be parsed are treated as + // if they represented the value 0. 
The following byte sizes + // would be sorted as: + // 3B + // .06K + // .03M + // 10000G + // 10T + // 3.01P + // 3.02P + // 0.007E + // ---------------------------------------------------------------------- + static bool LessThan(const string &a, const string &b); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HumanReadableNumBytes); +}; + + +// See documentation at HumanReadableNumBytes::LessThan(). +struct humanreadablebytes_less + : public binary_function { + bool operator()(const string& a, const string &b) const { + return HumanReadableNumBytes::LessThan(a, b); + } +}; + +// See documentation at HumanReadableNumBytes::LessThan(). +struct humanreadablebytes_greater + : public binary_function { + bool operator()(const string& a, const string &b) const { + return HumanReadableNumBytes::LessThan(b, a); + } +}; + +class HumanReadableInt { + public: + // Similar to HumanReadableNumBytes::ToInt64(), but uses decimal + // rather than binary expansions - so M = 1 million, B = 1 billion, + // etc. Numbers beyond 1T are expressed as "3E14" etc. + static string ToString(int64 value); + + // Reverses ToString(). Note that calling these two functions in + // succession isn't a noop, since ToString() may round. + static bool ToInt64(const string &str, int64 *value); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HumanReadableInt); +}; + +class HumanReadableNum { + public: + // Same as HumanReadableInt::ToString(). + static string ToString(int64 value); + + // Similar to HumanReadableInt::ToString(), but prints 2 decimal + // places for numbers with absolute value < 10.0 and 1 decimal place + // for numbers >= 10.0 and < 100.0. + static string DoubleToString(double value); + + // Reverses DoubleToString(). Note that calling these two functions in + // succession isn't a noop, since there may be rounding errors. 
+ static bool ToDouble(const string &str, double *value); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HumanReadableNum); +}; + +class HumanReadableElapsedTime { + public: + // Converts a time interval as double to a human readable + // string. For example: + // 0.001 -> "1 ms" + // 10.0 -> "10 s" + // 933120.0 -> "10.8 days" + // 39420000.0 -> "1.25 years" + // -10 -> "-10 s" + static string ToShortString(double seconds); + + // Reverses ToShortString(). Note that calling these two functions in + // succession isn't a noop, since ToShortString() may round. + // This accepts multiple forms of units, but the abbreviated forms are + // us (microseconds), ms (milliseconds), s, m (minutes), h, d, w, + // M (month = 30 days), y + // This function is not particularly fast. Use at performance peril. + // Only leading negative signs are allowed. + // Examples: + // "1ms" -> 0.001 + // "10 second" -> 10 + // "10.8 days" -> 933120.0 + // "1m 30s" -> 90 + // "-10 sec" -> -10 + // "18.3" -> 18.3 + // "1M" -> 2592000 (1 month = 30 days) + static bool ToDouble(const string& str, double* value); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HumanReadableElapsedTime); +}; + +#endif // STRINGS_HUMAN_READABLE_H__ diff --git a/src/kudu/gutil/strings/join.cc b/src/kudu/gutil/strings/join.cc new file mode 100644 index 000000000000..c0035e12d156 --- /dev/null +++ b/src/kudu/gutil/strings/join.cc @@ -0,0 +1,211 @@ +// Copyright 2008 and onwards Google Inc. All rights reserved. + +#include "kudu/gutil/strings/join.h" + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/escaping.h" + +// ---------------------------------------------------------------------- +// JoinUsing() +// This merges a vector of string components with delim inserted +// as separaters between components. 
+// This is essentially the same as JoinUsingToBuffer except +// the return result is dynamically allocated using "new char[]". +// It is the caller's responsibility to "delete []" the +// +// If result_length_p is not NULL, it will contain the length of the +// result string (not including the trailing '\0'). +// ---------------------------------------------------------------------- +char* JoinUsing(const vector& components, + const char* delim, + int* result_length_p) { + const int num_components = components.size(); + const int delim_length = strlen(delim); + int num_chars = (num_components > 1) + ? delim_length * (num_components - 1) + : 0; + for (int i = 0; i < num_components; ++i) + num_chars += strlen(components[i]); + + auto res_buffer = new char[num_chars + 1]; + return JoinUsingToBuffer(components, delim, num_chars+1, + res_buffer, result_length_p); +} + +// ---------------------------------------------------------------------- +// JoinUsingToBuffer() +// This merges a vector of string components with delim inserted +// as separaters between components. +// User supplies the result buffer with specified buffer size. +// The result is also returned for convenience. +// +// If result_length_p is not NULL, it will contain the length of the +// result string (not including the trailing '\0'). 
+// ---------------------------------------------------------------------- +char* JoinUsingToBuffer(const vector& components, + const char* delim, + int result_buffer_size, + char* result_buffer, + int* result_length_p) { + CHECK(result_buffer != nullptr); + const int num_components = components.size(); + const int max_str_len = result_buffer_size - 1; + char* curr_dest = result_buffer; + int num_chars = 0; + for (int i = 0; (i < num_components) && (num_chars < max_str_len); ++i) { + const char* curr_src = components[i]; + while ((*curr_src != '\0') && (num_chars < max_str_len)) { + *curr_dest = *curr_src; + ++num_chars; + ++curr_dest; + ++curr_src; + } + if (i != (num_components-1)) { // not the last component ==> add separator + curr_src = delim; + while ((*curr_src != '\0') && (num_chars < max_str_len)) { + *curr_dest = *curr_src; + ++num_chars; + ++curr_dest; + ++curr_src; + } + } + } + + if (result_buffer_size > 0) + *curr_dest = '\0'; // add null termination + if (result_length_p != nullptr) // set string length value + *result_length_p = num_chars; + + return result_buffer; +} + +// ---------------------------------------------------------------------- +// JoinStrings() +// This merges a vector of string components with delim inserted +// as separaters between components. +// This is essentially the same as JoinUsingToBuffer except +// it uses strings instead of char *s. 
+// +// ---------------------------------------------------------------------- + +void JoinStringsInArray(string const* const* components, + int num_components, + const char* delim, + string * result) { + CHECK(result != nullptr); + result->clear(); + for (int i = 0; i < num_components; i++) { + if (i>0) { + (*result) += delim; + } + (*result) += *(components[i]); + } +} + +void JoinStringsInArray(string const *components, + int num_components, + const char *delim, + string *result) { + JoinStringsIterator(components, + components + num_components, + delim, + result); +} + +// ---------------------------------------------------------------------- +// JoinMapKeysAndValues() +// JoinVectorKeysAndValues() +// This merges the keys and values of a string -> string map or pair +// of strings vector, with one delim (intra_delim) between each key +// and its associated value and another delim (inter_delim) between +// each key/value pair. The result is returned in a string (passed +// as the last argument). +// ---------------------------------------------------------------------- + +void JoinMapKeysAndValues(const map& components, + const StringPiece& intra_delim, + const StringPiece& inter_delim, + string* result) { + JoinKeysAndValuesIterator(components.begin(), components.end(), + intra_delim, inter_delim, + result); +} + +void JoinVectorKeysAndValues(const vector< pair >& components, + const StringPiece& intra_delim, + const StringPiece& inter_delim, + string* result) { + JoinKeysAndValuesIterator(components.begin(), components.end(), + intra_delim, inter_delim, + result); +} + +// ---------------------------------------------------------------------- +// JoinCSVLine() +// This function is the inverse of SplitCSVLineWithDelimiter() in that the +// string returned by JoinCSVLineWithDelimiter() can be passed to +// SplitCSVLineWithDelimiter() to get the original string vector back. 
+// Quotes and escapes the elements of original_cols according to CSV quoting +// rules, and the joins the escaped quoted strings with commas using +// JoinStrings(). Note that JoinCSVLineWithDelimiter() will not necessarily +// return the same string originally passed in to +// SplitCSVLineWithDelimiter(), since SplitCSVLineWithDelimiter() can handle +// gratuitous spacing and quoting. 'output' must point to an empty string. +// +// Example: +// [Google], [x], [Buchheit, Paul], [string with " quoite in it], [ space ] +// ---> [Google,x,"Buchheit, Paul","string with "" quote in it"," space "] +// ---------------------------------------------------------------------- +void JoinCSVLineWithDelimiter(const vector& cols, char delimiter, + string* output) { + CHECK(output); + CHECK(output->empty()); + vector quoted_cols; + + const string delimiter_str(1, delimiter); + const string escape_chars = delimiter_str + "\""; + + // If the string contains the delimiter or " anywhere, or begins or ends with + // whitespace (ie ascii_isspace() returns true), escape all double-quotes and + // bracket the string in double quotes. string.rbegin() evaluates to the last + // character of the string. + for (const auto& col : cols) { + if ((col.find_first_of(escape_chars) != string::npos) || + (!col.empty() && (ascii_isspace(*col.begin()) || + ascii_isspace(*col.rbegin())))) { + // Double the original size, for escaping, plus two bytes for + // the bracketing double-quotes, and one byte for the closing \0. + int size = 2 * col.size() + 3; + gscoped_array buf(new char[size]); + + // Leave space at beginning and end for bracketing double-quotes. 
+ int escaped_size = strings::EscapeStrForCSV(col.c_str(), + buf.get() + 1, size - 2); + CHECK_GE(escaped_size, 0) << "Buffer somehow wasn't large enough."; + CHECK_GE(size, escaped_size + 3) + << "Buffer should have one space at the beginning for a " + << "double-quote, one at the end for a double-quote, and " + << "one at the end for a closing '\0'"; + *buf.get() = '"'; + *((buf.get() + 1) + escaped_size) = '"'; + *((buf.get() + 1) + escaped_size + 1) = '\0'; + quoted_cols.push_back(string(buf.get(), buf.get() + escaped_size + 2)); + } else { + quoted_cols.push_back(col); + } + } + JoinStrings(quoted_cols, delimiter_str, output); +} + +void JoinCSVLine(const vector& cols, string* output) { + JoinCSVLineWithDelimiter(cols, ',', output); +} + +string JoinCSVLine(const vector& cols) { + string output; + JoinCSVLine(cols, &output); + return output; +} diff --git a/src/kudu/gutil/strings/join.h b/src/kudu/gutil/strings/join.h new file mode 100644 index 000000000000..1e8e626dbe3d --- /dev/null +++ b/src/kudu/gutil/strings/join.h @@ -0,0 +1,371 @@ +// Copyright 2008 and onwards Google, Inc. +// +// #status: RECOMMENDED +// #category: operations on strings +// #summary: Functions for joining strings and numbers using a delimiter. +// +#ifndef STRINGS_JOIN_H_ +#define STRINGS_JOIN_H_ + +#include +#include +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_map; // Not used in this file. +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_set; // Not used in this file. 
+#include +using std::back_insert_iterator; +using std::iterator_traits; +#include +using std::map; +using std::multimap; +#include +using std::multiset; +using std::set; +#include +using std::string; +#include +using std::make_pair; +using std::pair; +#include +using std::vector; + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/template_util.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/strcat.h" // For backward compatibility. +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/hash/hash.h" + +// ---------------------------------------------------------------------- +// JoinUsing() +// This concatenates a vector of strings "components" into a new char[] +// buffer, using the C-string "delim" as a separator between components. +// +// This is essentially the same as JoinUsingToBuffer except +// the return result is dynamically allocated using "new char[]". +// It is the caller's responsibility to "delete []" the char* that is +// returned. +// +// If result_length_p is not NULL, it will contain the length of the +// result string (not including the trailing '\0'). +// ---------------------------------------------------------------------- +char* JoinUsing(const vector& components, + const char* delim, + int* result_length_p); + +// ---------------------------------------------------------------------- +// JoinUsingToBuffer() +// This concatenates a vector of strings "components" into a given char[] +// buffer, using the C-string "delim" as a separator between components. +// User supplies the result buffer with specified buffer size. +// The result is also returned for convenience. +// +// If result_length_p is not NULL, it will contain the length of the +// result string (not including the trailing '\0'). 
+// ---------------------------------------------------------------------- +char* JoinUsingToBuffer(const vector& components, + const char* delim, + int result_buffer_size, + char* result_buffer, + int* result_length_p); + +// ---------------------------------------------------------------------- +// JoinStrings(), JoinStringsIterator(), JoinStringsInArray() +// +// JoinStrings concatenates a container of strings into a C++ string, +// using the string "delim" as a separator between components. +// "components" can be any sequence container whose values are C++ strings +// or StringPieces. More precisely, "components" must support STL container +// iteration; i.e. it must have begin() and end() methods with appropriate +// semantics, which return forward iterators whose value type is +// string or StringPiece. Repeated string fields of protocol messages +// satisfy these requirements. +// +// JoinStringsIterator is the same as JoinStrings, except that the input +// strings are specified with a pair of iterators. The requirements on +// the iterators are the same as the requirements on components.begin() +// and components.end() for JoinStrings. +// +// JoinStringsInArray is the same as JoinStrings, but operates on +// an array of C++ strings or string pointers. +// +// There are two flavors of each function, one flavor returns the +// concatenated string, another takes a pointer to the target string. In +// the latter case the target string is cleared and overwritten. 
+// ---------------------------------------------------------------------- +template +void JoinStrings(const CONTAINER& components, + const StringPiece& delim, + string* result); +template +string JoinStrings(const CONTAINER& components, + const StringPiece& delim); + +template +void JoinStringsIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& delim, + string* result); +template +string JoinStringsIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& delim); + +// Join the keys of a map using the specified delimiter. +template +void JoinKeysIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& delim, + string *result) { + result->clear(); + for (ITERATOR iter = start; iter != end; ++iter) { + if (iter == start) { + StrAppend(result, iter->first); + } else { + StrAppend(result, delim, iter->first); + } + } +} + +template +string JoinKeysIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& delim) { + string result; + JoinKeysIterator(start, end, delim, &result); + return result; +} + +// Join the keys and values of a map using the specified delimiters. 
+template +void JoinKeysAndValuesIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& intra_delim, + const StringPiece& inter_delim, + string *result) { + result->clear(); + for (ITERATOR iter = start; iter != end; ++iter) { + if (iter == start) { + StrAppend(result, iter->first, intra_delim, iter->second); + } else { + StrAppend(result, inter_delim, iter->first, intra_delim, iter->second); + } + } +} + +template +string JoinKeysAndValuesIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& intra_delim, + const StringPiece& inter_delim) { + string result; + JoinKeysAndValuesIterator(start, end, intra_delim, inter_delim, &result); + return result; +} + +void JoinStringsInArray(string const* const* components, + int num_components, + const char* delim, + string* result); +void JoinStringsInArray(string const* components, + int num_components, + const char* delim, + string* result); +string JoinStringsInArray(string const* const* components, + int num_components, + const char* delim); +string JoinStringsInArray(string const* components, + int num_components, + const char* delim); + +// ---------------------------------------------------------------------- +// Definitions of above JoinStrings* methods +// ---------------------------------------------------------------------- +template +inline void JoinStrings(const CONTAINER& components, + const StringPiece& delim, + string* result) { + JoinStringsIterator(components.begin(), components.end(), delim, result); +} + +template +inline string JoinStrings(const CONTAINER& components, + const StringPiece& delim) { + string result; + JoinStrings(components, delim, &result); + return result; +} + +template +void JoinStringsIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& delim, + string* result) { + result->clear(); + + // Precompute resulting length so we can reserve() memory in one shot. 
+ if (start != end) { + int length = delim.size()*(distance(start, end)-1); + for (ITERATOR iter = start; iter != end; ++iter) { + length += iter->size(); + } + result->reserve(length); + } + + // Now combine everything. + for (ITERATOR iter = start; iter != end; ++iter) { + if (iter != start) { + result->append(delim.data(), delim.size()); + } + result->append(iter->data(), iter->size()); + } +} + +template +inline string JoinStringsIterator(const ITERATOR& start, + const ITERATOR& end, + const StringPiece& delim) { + string result; + JoinStringsIterator(start, end, delim, &result); + return result; +} + +inline string JoinStringsInArray(string const* const* components, + int num_components, + const char* delim) { + string result; + JoinStringsInArray(components, num_components, delim, &result); + return result; +} + +inline string JoinStringsInArray(string const* components, + int num_components, + const char* delim) { + string result; + JoinStringsInArray(components, num_components, delim, &result); + return result; +} + +// ---------------------------------------------------------------------- +// JoinMapKeysAndValues() +// JoinHashMapKeysAndValues() +// JoinVectorKeysAndValues() +// This merges the keys and values of a string -> string map or pair +// of strings vector, with one delim (intra_delim) between each key +// and its associated value and another delim (inter_delim) between +// each key/value pair. The result is returned in a string (passed +// as the last argument). +// ---------------------------------------------------------------------- + +void JoinMapKeysAndValues(const map& components, + const StringPiece& intra_delim, + const StringPiece& inter_delim, + string* result); +void JoinVectorKeysAndValues(const vector< pair >& components, + const StringPiece& intra_delim, + const StringPiece& inter_delim, + string* result); + +// DEPRECATED(jyrki): use JoinKeysAndValuesIterator directly. 
+template +void JoinHashMapKeysAndValues(const T& container, + const StringPiece& intra_delim, + const StringPiece& inter_delim, + string* result) { + JoinKeysAndValuesIterator(container.begin(), container.end(), + intra_delim, inter_delim, + result); +} + +// ---------------------------------------------------------------------- +// JoinCSVLineWithDelimiter() +// This function is the inverse of SplitCSVLineWithDelimiter() in that the +// string returned by JoinCSVLineWithDelimiter() can be passed to +// SplitCSVLineWithDelimiter() to get the original string vector back. +// Quotes and escapes the elements of original_cols according to CSV quoting +// rules, and the joins the escaped quoted strings with commas using +// JoinStrings(). Note that JoinCSVLineWithDelimiter() will not necessarily +// return the same string originally passed in to +// SplitCSVLineWithDelimiter(), since SplitCSVLineWithDelimiter() can handle +// gratuitous spacing and quoting. 'output' must point to an empty string. +// +// Example: +// [Google], [x], [Buchheit, Paul], [string with " quoite in it], [ space ] +// ---> [Google,x,"Buchheit, Paul","string with "" quote in it"," space "] +// +// JoinCSVLine() +// A convenience wrapper around JoinCSVLineWithDelimiter which uses +// ',' as the delimiter. +// ---------------------------------------------------------------------- +void JoinCSVLine(const vector& original_cols, string* output); +string JoinCSVLine(const vector& original_cols); +void JoinCSVLineWithDelimiter(const vector& original_cols, + char delimiter, + string* output); + +// ---------------------------------------------------------------------- +// JoinElements() +// This merges a container of any type supported by StrAppend() with delim +// inserted as separators between components. This is essentially a +// templatized version of JoinUsingToBuffer(). 
+// +// JoinElementsIterator() +// Same as JoinElements(), except that the input elements are specified +// with a pair of forward iterators. +// ---------------------------------------------------------------------- + +template +void JoinElementsIterator(ITERATOR first, + ITERATOR last, + StringPiece delim, + string* result) { + result->clear(); + for (ITERATOR it = first; it != last; ++it) { + if (it != first) { + StrAppend(result, delim); + } + StrAppend(result, *it); + } +} + +template +string JoinElementsIterator(ITERATOR first, + ITERATOR last, + StringPiece delim) { + string result; + JoinElementsIterator(first, last, delim, &result); + return result; +} + +template +inline void JoinElements(const CONTAINER& components, + StringPiece delim, + string* result) { + JoinElementsIterator(components.begin(), components.end(), delim, result); +} + +template +inline string JoinElements(const CONTAINER& components, StringPiece delim) { + string result; + JoinElements(components, delim, &result); + return result; +} + +template +void JoinInts(const CONTAINER& components, + const char* delim, + string* result) { + JoinElements(components, delim, result); +} + +template +inline string JoinInts(const CONTAINER& components, + const char* delim) { + return JoinElements(components, delim); +} + +#endif // STRINGS_JOIN_H_ diff --git a/src/kudu/gutil/strings/memutil.cc b/src/kudu/gutil/strings/memutil.cc new file mode 100644 index 000000000000..390d2a0eed10 --- /dev/null +++ b/src/kudu/gutil/strings/memutil.cc @@ -0,0 +1,138 @@ +// +// Copyright (C) 2001 and onwards Google, Inc. 
+// + +#include "kudu/gutil/strings/memutil.h" + +#include // for malloc, NULL + +#include "kudu/gutil/strings/ascii_ctype.h" // for ascii_tolower + +int memcasecmp(const char *s1, const char *s2, size_t len) { + const unsigned char *us1 = reinterpret_cast(s1); + const unsigned char *us2 = reinterpret_cast(s2); + + for ( int i = 0; i < len; i++ ) { + const int diff = + static_cast(static_cast(ascii_tolower(us1[i]))) - + static_cast(static_cast(ascii_tolower(us2[i]))); + if (diff != 0) return diff; + } + return 0; +} + +char *memdup(const char *s, size_t slen) { + void *copy; + if ( (copy=malloc(slen)) == nullptr ) + return nullptr; + memcpy(copy, s, slen); + return reinterpret_cast(copy); +} + +char *memrchr(const char *s, int c, size_t slen) { + for (const char* e = s + slen-1; e >= s; e--) { + if (*e == c) + return const_cast(e); + } + return nullptr; +} + +size_t memspn(const char *s, size_t slen, const char *accept) { + const char *p = s, *spanp; + char c, sc; + + cont: + c = *p++; + if ( slen-- == 0 ) + return p-1 - s; + for (spanp = accept; (sc=*spanp++) != '\0';) + if (sc == c) + goto cont; + return p-1 - s; +} + + +size_t memcspn(const char *s, size_t slen, const char *reject) { + const char *p = s, *spanp; + char c, sc; + + while ( slen-- != 0 ) { + c = *p++; + for (spanp = reject; (sc=*spanp++) != '\0';) + if (sc == c) + return p-1 - s; + } + return p - s; +} + +char *mempbrk(const char *s, size_t slen, const char *accept) { + const char *scanp; + int sc; + + for ( ; slen; ++s, --slen ) { + for (scanp = accept; (sc=*scanp++) != '\0';) + if (sc == *s) + return const_cast(s); + } + return nullptr; +} + +template +const char *int_memmatch(const char *phaystack, size_t haylen, + const char *pneedle, size_t neelen) { + if (0 == neelen) { + return phaystack; // even if haylen is 0 + } + const unsigned char *haystack = (const unsigned char *) phaystack; + const unsigned char *hayend = (const unsigned char *) phaystack + haylen; + const unsigned char *needlestart 
= (const unsigned char *) pneedle; + const unsigned char *needle = (const unsigned char *) pneedle; + const unsigned char *needleend = (const unsigned char *) pneedle + neelen; + + for (; haystack < hayend; ++haystack) { + unsigned char hay = case_sensitive ? *haystack : + static_cast(ascii_tolower(*haystack)); + unsigned char nee = case_sensitive ? *needle : + static_cast(ascii_tolower(*needle)); + if (hay == nee) { + if (++needle == needleend) { + return (const char *) (haystack + 1 - neelen); + } + } else if (needle != needlestart) { + // must back up haystack in case a prefix matched (find "aab" in "aaab") + haystack -= needle - needlestart; // for loop will advance one more + needle = needlestart; + } + } + return nullptr; +} + +// explicit template instantiations +template const char *int_memmatch(const char *phaystack, size_t haylen, + const char *pneedle, size_t neelen); +template const char *int_memmatch(const char *phaystack, size_t haylen, + const char *pneedle, size_t neelen); + +// This is significantly faster for case-sensitive matches with very +// few possible matches. See unit test for benchmarks. +const char *memmatch(const char *phaystack, size_t haylen, + const char *pneedle, size_t neelen) { + if (0 == neelen) { + return phaystack; // even if haylen is 0 + } + if (haylen < neelen) + return nullptr; + + const char* match; + const char* hayend = phaystack + haylen - neelen + 1; + // A C-style cast is used here to work around the fact that memchr returns a + // void* on Posix-compliant systems and const void* on Windows. 
+ while ((match = (const char*)(memchr(phaystack, pneedle[0], + hayend - phaystack)))) { + if (memcmp(match, pneedle, neelen) == 0) + return match; + else + phaystack = match + 1; + } + return nullptr; +} diff --git a/src/kudu/gutil/strings/memutil.h b/src/kudu/gutil/strings/memutil.h new file mode 100644 index 000000000000..93357352198b --- /dev/null +++ b/src/kudu/gutil/strings/memutil.h @@ -0,0 +1,153 @@ +// +// Copyright (C) 2001 and onwards Google, Inc. +// +// (Please see comments in strutil.h near the include of +// if you feel compelled to try to provide more efficient implementations +// of these routines.) +// +// These routines provide mem versions of standard C string routines, +// such a strpbrk. They function exactly the same as the str version, +// so if you wonder what they are, replace the word "mem" by +// "str" and check out the man page. I could return void*, as the +// strutil.h mem*() routines tend to do, but I return char* instead +// since this is by far the most common way these functions are called. +// +// The difference between the mem and str versions is the mem version +// takes a pointer and a length, rather than a NULL-terminated string. +// The memcase* routines defined here assume the locale is "C" +// (they use ascii_tolower instead of tolower). +// +// These routines are based on the BSD library. +// +// Here's a list of routines from string.h, and their mem analogues. 
+// Functions in lowercase are defined in string.h; those in UPPERCASE +// are defined here: +// +// strlen -- +// strcat strncat MEMCAT +// strcpy strncpy memcpy +// -- memccpy (very cool function, btw) +// -- memmove +// -- memset +// strcmp strncmp memcmp +// strcasecmp strncasecmp MEMCASECMP +// strchr memchr +// strcoll -- +// strxfrm -- +// strdup strndup MEMDUP +// strrchr MEMRCHR +// strspn MEMSPN +// strcspn MEMCSPN +// strpbrk MEMPBRK +// strstr MEMSTR MEMMEM +// (g)strcasestr MEMCASESTR MEMCASEMEM +// strtok -- +// strprefix MEMPREFIX (strprefix is from strutil.h) +// strcaseprefix MEMCASEPREFIX (strcaseprefix is from strutil.h) +// strsuffix MEMSUFFIX (strsuffix is from strutil.h) +// strcasesuffix MEMCASESUFFIX (strcasesuffix is from strutil.h) +// -- MEMIS +// -- MEMCASEIS +// strcount MEMCOUNT (strcount is from strutil.h) + +#ifndef STRINGS_MEMUTIL_H_ +#define STRINGS_MEMUTIL_H_ + +#include +#include // to get the POSIX mem*() routines + +#include "kudu/gutil/port.h" // disable some warnings on Windows + +inline char *memcat(char *dest, size_t destlen, + const char *src, size_t srclen) { + return reinterpret_cast(memcpy(dest + destlen, src, srclen)); +} + +int memcasecmp(const char *s1, const char *s2, size_t len); +char *memdup(const char *s, size_t slen); +char *memrchr(const char *s, int c, size_t slen); +size_t memspn(const char *s, size_t slen, const char *accept); +size_t memcspn(const char *s, size_t slen, const char *reject); +char *mempbrk(const char *s, size_t slen, const char *accept); + +// This is for internal use only. 
Don't call this directly +template +const char * int_memmatch(const char * phaystack, size_t haylen, + const char * pneedle, size_t neelen); + +// These are the guys you can call directly +inline const char * memstr(const char *phaystack, size_t haylen, + const char *pneedle) { + return int_memmatch(phaystack, haylen, pneedle, strlen(pneedle)); +} + +inline const char * memcasestr(const char *phaystack, size_t haylen, + const char *pneedle) { + return int_memmatch(phaystack, haylen, pneedle, strlen(pneedle)); +} + +inline const char * memmem(const char *phaystack, size_t haylen, + const char *pneedle, size_t needlelen) { + return int_memmatch(phaystack, haylen, pneedle, needlelen); +} + +inline const char * memcasemem(const char *phaystack, size_t haylen, + const char *pneedle, size_t needlelen) { + return int_memmatch(phaystack, haylen, pneedle, needlelen); +} + +// This is significantly faster for case-sensitive matches with very +// few possible matches. See unit test for benchmarks. +const char *memmatch(const char *phaystack, size_t haylen, + const char *pneedle, size_t neelen); + +// The ""'s catch people who don't pass in a literal for "str" +#define strliterallen(str) (sizeof("" str "")-1) + +// Must use a string literal for prefix. +#define memprefix(str, len, prefix) \ + ( (((len) >= strliterallen(prefix)) \ + && memcmp(str, prefix, strliterallen(prefix)) == 0) \ + ? str + strliterallen(prefix) \ + : NULL ) + +#define memcaseprefix(str, len, prefix) \ + ( (((len) >= strliterallen(prefix)) \ + && memcasecmp(str, prefix, strliterallen(prefix)) == 0) \ + ? str + strliterallen(prefix) \ + : NULL ) + +// Must use a string literal for suffix. +#define memsuffix(str, len, suffix) \ + ( (((len) >= strliterallen(suffix)) \ + && memcmp(str + (len) - strliterallen(suffix), suffix, \ + strliterallen(suffix)) == 0) \ + ? 
str + (len) - strliterallen(suffix) \ + : NULL ) + +#define memcasesuffix(str, len, suffix) \ + ( (((len) >= strliterallen(suffix)) \ + && memcasecmp(str + (len) - strliterallen(suffix), suffix, \ + strliterallen(suffix)) == 0) \ + ? str + (len) - strliterallen(suffix) \ + : NULL ) + +#define memis(str, len, literal) \ + ( (((len) == strliterallen(literal)) \ + && memcmp(str, literal, strliterallen(literal)) == 0) ) + +#define memcaseis(str, len, literal) \ + ( (((len) == strliterallen(literal)) \ + && memcasecmp(str, literal, strliterallen(literal)) == 0) ) + + +inline int memcount(const char* buf, size_t len, char c) { + int num = 0; + for (int i = 0; i < len; i++) { + if (buf[i] == c) + num++; + } + return num; +} + +#endif // STRINGS_MEMUTIL_H_ diff --git a/src/kudu/gutil/strings/numbers.cc b/src/kudu/gutil/strings/numbers.cc new file mode 100644 index 000000000000..7bdb57c8960c --- /dev/null +++ b/src/kudu/gutil/strings/numbers.cc @@ -0,0 +1,1461 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Refactored from contributions of various authors in strings/strutil.cc +// +// This file contains string processing functions related to +// numeric values. + +#include "kudu/gutil/strings/numbers.h" + +#include +#include +#include +#include // for DBL_DIG and FLT_DIG +#include // for HUGE_VAL +#include +#include +#include +#include +using std::numeric_limits; +#include +using std::string; + +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strtoint.h" +#include "kudu/gutil/strings/ascii_ctype.h" + +// Reads a in *text, which may not be whitespace-initiated. +// *len is the length, or -1 if text is '\0'-terminated, which is more +// efficient. Sets *text to the end of the double, and val to the +// converted value, and the length of the double is subtracted from +// *len. 
may also be a '?', in which case val will be +// unchanged. Returns true upon success. If initial_minus is +// non-NULL, then *initial_minus will indicate whether the first +// symbol seen was a '-', which will be ignored. Similarly, if +// final_period is non-NULL, then *final_period will indicate whether +// the last symbol seen was a '.', which will be ignored. This is +// useful in case that an initial '-' or final '.' would have another +// meaning (as a separator, e.g.). +static inline bool EatADouble(const char** text, int* len, bool allow_question, + double* val, bool* initial_minus, + bool* final_period) { + const char* pos = *text; + int rem = *len; // remaining length, or -1 if null-terminated + + if (pos == nullptr || rem == 0) + return false; + + if (allow_question && (*pos == '?')) { + *text = pos + 1; + if (rem != -1) + *len = rem - 1; + return true; + } + + if (initial_minus) { + if ((*initial_minus = (*pos == '-'))) { // Yes, we want assignment. + if (rem == 1) + return false; + ++pos; + if (rem != -1) + --rem; + } + } + + // a double has to begin one of these (we don't allow 'inf' or whitespace) + // this also serves as an optimization. + if (!strchr("-+.0123456789", *pos)) + return false; + + // strtod is evil in that the second param is a non-const char** + char* end_nonconst; + double retval; + if (rem == -1) { + retval = strtod(pos, &end_nonconst); + } else { + // not '\0'-terminated & no obvious terminator found. must copy. 
+ gscoped_array buf(new char[rem + 1]); + memcpy(buf.get(), pos, rem); + buf[rem] = '\0'; + retval = strtod(buf.get(), &end_nonconst); + end_nonconst = const_cast(pos) + (end_nonconst - buf.get()); + } + + if (pos == end_nonconst) + return false; + + if (final_period) { + *final_period = (end_nonconst[-1] == '.'); + if (*final_period) { + --end_nonconst; + } + } + + *text = end_nonconst; + *val = retval; + if (rem != -1) + *len = rem - (end_nonconst - pos); + return true; +} + +// If update, consume one of acceptable_chars from string *text of +// length len and return that char, or '\0' otherwise. If len is -1, +// *text is null-terminated. If update is false, don't alter *text and +// *len. If null_ok, then update must be false, and, if text has no +// more chars, then return '\1' (arbitrary nonzero). +static inline char EatAChar(const char** text, int* len, + const char* acceptable_chars, + bool update, bool null_ok) { + assert(!(update && null_ok)); + if ((*len == 0) || (**text == '\0')) + return (null_ok ? '\1' : '\0'); // if null_ok, we're in predicate mode. + + if (strchr(acceptable_chars, **text)) { + char result = **text; + if (update) { + ++(*text); + if (*len != -1) + --(*len); + } + return result; + } + + return '\0'; // no match; no update +} + +// Parse an expression in 'text' of the form: or +// See full comments in header file. +bool ParseDoubleRange(const char* text, int len, const char** end, + double* from, double* to, bool* is_currency, + const DoubleRangeOptions& opts) { + const double from_default = opts.dont_modify_unbounded ? 
*from : -HUGE_VAL; + + if (!opts.dont_modify_unbounded) { + *from = -HUGE_VAL; + *to = HUGE_VAL; + } + if (opts.allow_currency && (is_currency != nullptr)) + *is_currency = false; + + assert(len >= -1); + assert(opts.separators && (*opts.separators != '\0')); + // these aren't valid separators + assert(strlen(opts.separators) == + strcspn(opts.separators, "+0123456789eE$")); + assert(opts.num_required_bounds <= 2); + + // Handle easier cases of comparators (<, >) first + if (opts.allow_comparators) { + char comparator = EatAChar(&text, &len, "<>", true, false); + if (comparator) { + double* dest = (comparator == '>') ? from : to; + EatAChar(&text, &len, "=", true, false); + if (opts.allow_currency && EatAChar(&text, &len, "$", true, false)) + if (is_currency != nullptr) + *is_currency = true; + if (!EatADouble(&text, &len, opts.allow_unbounded_markers, dest, nullptr, + nullptr)) + return false; + *end = text; + return EatAChar(&text, &len, opts.acceptable_terminators, false, + opts.null_terminator_ok); + } + } + + bool seen_dollar = (opts.allow_currency && + EatAChar(&text, &len, "$", true, false)); + + // If we see a '-', two things could be happening: - or + // ... where is negative. Treat initial minus sign as a + // separator if '-' is a valid separator. + // Similarly, we prepare for the possibility of seeing a '.' at the + // end of the number, in case '.' (which really means '..') is a + // separator. + bool initial_minus_sign = false; + bool final_period = false; + bool* check_initial_minus = (strchr(opts.separators, '-') && !seen_dollar + && (opts.num_required_bounds < 2)) ? + (&initial_minus_sign) : nullptr; + bool* check_final_period = strchr(opts.separators, '.') ? (&final_period) + : nullptr; + bool double_seen = EatADouble(&text, &len, opts.allow_unbounded_markers, + from, check_initial_minus, check_final_period); + + // if 2 bounds required, must see a double (or '?' 
if allowed) + if ((opts.num_required_bounds == 2) && !double_seen) return false; + + if (seen_dollar && !double_seen) { + --text; + if (len != -1) + ++len; + seen_dollar = false; + } + // If we're here, we've read the first double and now expect a + // separator and another . + char separator = EatAChar(&text, &len, opts.separators, true, false); + if (separator == '.') { + // seen one '.' as separator; must check for another; perhaps set seplen=2 + if (EatAChar(&text, &len, ".", true, false)) { + if (final_period) { + // We may have three periods in a row. The first is part of the + // first number, the others are a separator. Policy: 234...567 + // is "234." to "567", not "234" to ".567". + EatAChar(&text, &len, ".", true, false); + } + } else if (!EatAChar(&text, &len, opts.separators, true, false)) { + // just one '.' and no other separator; uneat the first '.' we saw + --text; + if (len != -1) + ++len; + separator = '\0'; + } + } + // By now, we've consumed whatever separator there may have been, + // and separator is true iff there was one. 
+ if (!separator) { + if (final_period) // final period now considered part of first double + EatAChar(&text, &len, ".", true, false); + if (initial_minus_sign && double_seen) { + *to = *from; + *from = from_default; + } else if (opts.require_separator || + (opts.num_required_bounds > 0 && !double_seen) || + (opts.num_required_bounds > 1) ) { + return false; + } + } else { + if (initial_minus_sign && double_seen) + *from = -(*from); + // read second + bool second_dollar_seen = (seen_dollar + || (opts.allow_currency && !double_seen)) + && EatAChar(&text, &len, "$", true, false); + bool second_double_seen = EatADouble( + &text, &len, opts.allow_unbounded_markers, to, nullptr, nullptr); + if (opts.num_required_bounds > double_seen + second_double_seen) + return false; + if (second_dollar_seen && !second_double_seen) { + --text; + if (len != -1) + ++len; + second_dollar_seen = false; + } + seen_dollar = seen_dollar || second_dollar_seen; + } + + if (seen_dollar && (is_currency != nullptr)) + *is_currency = true; + // We're done. But we have to check that the next char is a proper + // terminator. + *end = text; + char terminator = EatAChar(&text, &len, opts.acceptable_terminators, false, + opts.null_terminator_ok); + if (terminator == '.') + --(*end); + return terminator; +} + +// ---------------------------------------------------------------------- +// ConsumeStrayLeadingZeroes +// Eliminates all leading zeroes (unless the string itself is composed +// of nothing but zeroes, in which case one is kept: 0...0 becomes 0). 
+// -------------------------------------------------------------------- + +void ConsumeStrayLeadingZeroes(string *const str) { + const string::size_type len(str->size()); + if (len > 1 && (*str)[0] == '0') { + const char + *const begin(str->c_str()), + *const end(begin + len), + *ptr(begin + 1); + while (ptr != end && *ptr == '0') { + ++ptr; + } + string::size_type remove(ptr - begin); + DCHECK_GT(ptr, begin); + if (remove == len) { + --remove; // if they are all zero, leave one... + } + str->erase(0, remove); + } +} + +// ---------------------------------------------------------------------- +// ParseLeadingInt32Value() +// ParseLeadingUInt32Value() +// A simple parser for [u]int32 values. Returns the parsed value +// if a valid value is found; else returns deflt +// This cannot handle decimal numbers with leading 0s. +// -------------------------------------------------------------------- + +int32 ParseLeadingInt32Value(const char *str, int32 deflt) { + char *error = nullptr; + long value = strtol(str, &error, 0); + // Limit long values to int32 min/max. Needed for lp64; no-op on 32 bits. + if (value > numeric_limits::max()) { + value = numeric_limits::max(); + } else if (value < numeric_limits::min()) { + value = numeric_limits::min(); + } + return (error == str) ? deflt : value; +} + +uint32 ParseLeadingUInt32Value(const char *str, uint32 deflt) { + if (numeric_limits::max() == numeric_limits::max()) { + // When long is 32 bits, we can use strtoul. + char *error = nullptr; + const uint32 value = strtoul(str, &error, 0); + return (error == str) ? deflt : value; + } else { + // When long is 64 bits, we must use strto64 and handle limits + // by hand. The reason we cannot use a 64-bit strtoul is that + // it would be impossible to differentiate "-2" (that should wrap + // around to the value UINT_MAX-1) from a string with ULONG_MAX-1 + // (that should be pegged to UINT_MAX due to overflow). 
+ char *error = nullptr; + int64 value = strto64(str, &error, 0); + if (value > numeric_limits::max() || + value < -static_cast(numeric_limits::max())) { + value = numeric_limits::max(); + } + // Within these limits, truncation to 32 bits handles negatives correctly. + return (error == str) ? deflt : value; + } +} + +// ---------------------------------------------------------------------- +// ParseLeadingDec32Value +// ParseLeadingUDec32Value +// A simple parser for [u]int32 values. Returns the parsed value +// if a valid value is found; else returns deflt +// The string passed in is treated as *10 based*. +// This can handle strings with leading 0s. +// -------------------------------------------------------------------- + +int32 ParseLeadingDec32Value(const char *str, int32 deflt) { + char *error = nullptr; + long value = strtol(str, &error, 10); + // Limit long values to int32 min/max. Needed for lp64; no-op on 32 bits. + if (value > numeric_limits::max()) { + value = numeric_limits::max(); + } else if (value < numeric_limits::min()) { + value = numeric_limits::min(); + } + return (error == str) ? deflt : value; +} + +uint32 ParseLeadingUDec32Value(const char *str, uint32 deflt) { + if (numeric_limits::max() == numeric_limits::max()) { + // When long is 32 bits, we can use strtoul. + char *error = nullptr; + const uint32 value = strtoul(str, &error, 10); + return (error == str) ? deflt : value; + } else { + // When long is 64 bits, we must use strto64 and handle limits + // by hand. The reason we cannot use a 64-bit strtoul is that + // it would be impossible to differentiate "-2" (that should wrap + // around to the value UINT_MAX-1) from a string with ULONG_MAX-1 + // (that should be pegged to UINT_MAX due to overflow). 
+ char *error = nullptr; + int64 value = strto64(str, &error, 10); + if (value > numeric_limits::max() || + value < -static_cast(numeric_limits::max())) { + value = numeric_limits::max(); + } + // Within these limits, truncation to 32 bits handles negatives correctly. + return (error == str) ? deflt : value; + } +} + +// ---------------------------------------------------------------------- +// ParseLeadingUInt64Value +// ParseLeadingInt64Value +// ParseLeadingHex64Value +// A simple parser for 64-bit values. Returns the parsed value if a +// valid integer is found; else returns deflt +// UInt64 and Int64 cannot handle decimal numbers with leading 0s. +// -------------------------------------------------------------------- +uint64 ParseLeadingUInt64Value(const char *str, uint64 deflt) { + char *error = nullptr; + const uint64 value = strtou64(str, &error, 0); + return (error == str) ? deflt : value; +} + +int64 ParseLeadingInt64Value(const char *str, int64 deflt) { + char *error = nullptr; + const int64 value = strto64(str, &error, 0); + return (error == str) ? deflt : value; +} + +uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) { + char *error = nullptr; + const uint64 value = strtou64(str, &error, 16); + return (error == str) ? deflt : value; +} + +// ---------------------------------------------------------------------- +// ParseLeadingDec64Value +// ParseLeadingUDec64Value +// A simple parser for [u]int64 values. Returns the parsed value +// if a valid value is found; else returns deflt +// The string passed in is treated as *10 based*. +// This can handle strings with leading 0s. +// -------------------------------------------------------------------- + +int64 ParseLeadingDec64Value(const char *str, int64 deflt) { + char *error = nullptr; + const int64 value = strto64(str, &error, 10); + return (error == str) ? 
deflt : value; +} + +uint64 ParseLeadingUDec64Value(const char *str, uint64 deflt) { + char *error = nullptr; + const uint64 value = strtou64(str, &error, 10); + return (error == str) ? deflt : value; +} + +// ---------------------------------------------------------------------- +// ParseLeadingDoubleValue() +// A simple parser for double values. Returns the parsed value +// if a valid value is found; else returns deflt +// -------------------------------------------------------------------- + +double ParseLeadingDoubleValue(const char *str, double deflt) { + char *error = nullptr; + errno = 0; + const double value = strtod(str, &error); + if (errno != 0 || // overflow/underflow happened + error == str) { // no valid parse + return deflt; + } else { + return value; + } +} + +// ---------------------------------------------------------------------- +// ParseLeadingBoolValue() +// A recognizer of boolean string values. Returns the parsed value +// if a valid value is found; else returns deflt. 
This skips leading +// whitespace, is case insensitive, and recognizes these forms: +// 0/1, false/true, no/yes, n/y +// -------------------------------------------------------------------- +bool ParseLeadingBoolValue(const char *str, bool deflt) { + static const int kMaxLen = 5; + char value[kMaxLen + 1]; + // Skip whitespace + while (ascii_isspace(*str)) { + ++str; + } + int len = 0; + for (; len <= kMaxLen && ascii_isalnum(*str); ++str) + value[len++] = ascii_tolower(*str); + if (len == 0 || len > kMaxLen) + return deflt; + value[len] = '\0'; + switch (len) { + case 1: + if (value[0] == '0' || value[0] == 'n') + return false; + if (value[0] == '1' || value[0] == 'y') + return true; + break; + case 2: + if (!strcmp(value, "no")) + return false; + break; + case 3: + if (!strcmp(value, "yes")) + return true; + break; + case 4: + if (!strcmp(value, "true")) + return true; + break; + case 5: + if (!strcmp(value, "false")) + return false; + break; + } + return deflt; +} + + +// ---------------------------------------------------------------------- +// FpToString() +// FloatToString() +// IntToString() +// Convert various types to their string representation, possibly padded +// with spaces, using snprintf format specifiers. +// ---------------------------------------------------------------------- + +string FpToString(Fprint fp) { + char buf[17]; + snprintf(buf, sizeof(buf), "%016" PRIx64, fp); + return string(buf); +} + +// Default arguments +string Uint128ToHexString(uint128 ui128) { + char buf[33]; + snprintf(buf, sizeof(buf), "%016" PRIx64, + Uint128High64(ui128)); + snprintf(buf + 16, sizeof(buf) - 16, "%016" PRIx64, + Uint128Low64(ui128)); + return string(buf); +} + +namespace { + +// Represents integer values of digits. +// Uses 36 to indicate an invalid character since we support +// bases up to 36. +static const int8 kAsciiToInt[256] = { + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 16 36s. 
+ 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 36, 36, 36, 36, 36, 36, 36, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 36, 36, 36, 36, 36, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36 }; + +// Input format based on POSIX.1-2008 strtol +// http://pubs.opengroup.org/onlinepubs/9699919799/functions/strtol.html +template +bool safe_int_internal(const char* start, const char* end, int base, + IntType* value_p) { + // Consume whitespace. + while (start < end && ascii_isspace(start[0])) { + ++start; + } + while (start < end && ascii_isspace(end[-1])) { + --end; + } + if (start >= end) { + return false; + } + + // Consume sign. + const bool negative = (start[0] == '-'); + if (negative || start[0] == '+') { + ++start; + if (start >= end) { + return false; + } + } + + // Consume base-dependent prefix. + // base 0: "0x" -> base 16, "0" -> base 8, default -> base 10 + // base 16: "0x" -> base 16 + // Also validate the base. 
+ if (base == 0) { + if (end - start >= 2 && start[0] == '0' && + (start[1] == 'x' || start[1] == 'X')) { + base = 16; + start += 2; + } else if (end - start >= 1 && start[0] == '0') { + base = 8; + start += 1; + } else { + base = 10; + } + } else if (base == 16) { + if (end - start >= 2 && start[0] == '0' && + (start[1] == 'x' || start[1] == 'X')) { + start += 2; + } + } else if (base >= 2 && base <= 36) { + // okay + } else { + return false; + } + + // Consume digits. + // + // The classic loop: + // + // for each digit + // value = value * base + digit + // value *= sign + // + // The classic loop needs overflow checking. It also fails on the most + // negative integer, -2147483648 in 32-bit two's complement representation. + // + // My improved loop: + // + // if (!negative) + // for each digit + // value = value * base + // value = value + digit + // else + // for each digit + // value = value * base + // value = value - digit + // + // Overflow checking becomes simple. + // + // I present the positive code first for easier reading. + IntType value = 0; + if (!negative) { + const IntType vmax = std::numeric_limits::max(); + assert(vmax > 0); + assert(vmax >= base); + const IntType vmax_over_base = vmax / base; + // loop over digits + // loop body is interleaved for perf, not readability + for (; start < end; ++start) { + unsigned char c = static_cast(start[0]); + int digit = kAsciiToInt[c]; + if (value > vmax_over_base) return false; + value *= base; + if (digit >= base) return false; + if (value > vmax - digit) return false; + value += digit; + } + } else { + const IntType vmin = std::numeric_limits::min(); + assert(vmin < 0); + assert(vmin <= 0 - base); + IntType vmin_over_base = vmin / base; + // 2003 c++ standard [expr.mul] + // "... the sign of the remainder is implementation-defined." + // Although (vmin/base)*base + vmin%base is always vmin. + // 2011 c++ standard tightens the spec but we cannot rely on it. 
+ if (vmin % base > 0) { + vmin_over_base += 1; + } + // loop over digits + // loop body is interleaved for perf, not readability + for (; start < end; ++start) { + unsigned char c = static_cast(start[0]); + int digit = kAsciiToInt[c]; + if (value < vmin_over_base) return false; + value *= base; + if (digit >= base) return false; + if (value < vmin + digit) return false; + value -= digit; + } + } + + // Store output. + *value_p = value; + return true; +} + +} // anonymous namespace + +bool safe_strto32_base(const char* startptr, const int buffer_size, + int32* v, int base) { + return safe_int_internal(startptr, startptr + buffer_size, base, v); +} + +bool safe_strto64_base(const char* startptr, const int buffer_size, + int64* v, int base) { + return safe_int_internal(startptr, startptr + buffer_size, base, v); +} + +bool safe_strto32(const char* startptr, const int buffer_size, int32* value) { + return safe_int_internal(startptr, startptr + buffer_size, 10, value); +} + +bool safe_strto64(const char* startptr, const int buffer_size, int64* value) { + return safe_int_internal(startptr, startptr + buffer_size, 10, value); +} + +bool safe_strto32_base(const char* str, int32* value, int base) { + char* endptr; + errno = 0; // errno only gets set on errors + *value = strto32(str, &endptr, base); + if (endptr != str) { + while (ascii_isspace(*endptr)) ++endptr; + } + return *str != '\0' && *endptr == '\0' && errno == 0; +} + +bool safe_strto64_base(const char* str, int64* value, int base) { + char* endptr; + errno = 0; // errno only gets set on errors + *value = strto64(str, &endptr, base); + if (endptr != str) { + while (ascii_isspace(*endptr)) ++endptr; + } + return *str != '\0' && *endptr == '\0' && errno == 0; +} + +bool safe_strtou32_base(const char* str, uint32* value, int base) { + // strtoul does not give any errors on negative numbers, so we have to + // search the string for '-' manually. 
+ while (ascii_isspace(*str)) ++str; + if (*str == '-') return false; + + char* endptr; + errno = 0; // errno only gets set on errors + *value = strtou32(str, &endptr, base); + if (endptr != str) { + while (ascii_isspace(*endptr)) ++endptr; + } + return *str != '\0' && *endptr == '\0' && errno == 0; +} + +bool safe_strtou64_base(const char* str, uint64* value, int base) { + // strtou64 does not give any errors on negative numbers, so we have to + // search the string for '-' manually. + while (ascii_isspace(*str)) ++str; + if (*str == '-') return false; + + char* endptr; + errno = 0; // errno only gets set on errors + *value = strtou64(str, &endptr, base); + if (endptr != str) { + while (ascii_isspace(*endptr)) ++endptr; + } + return *str != '\0' && *endptr == '\0' && errno == 0; +} + +// ---------------------------------------------------------------------- +// u64tostr_base36() +// Converts unsigned number to string representation in base-36. +// -------------------------------------------------------------------- +size_t u64tostr_base36(uint64 number, size_t buf_size, char* buffer) { + CHECK_GT(buf_size, 0); + CHECK(buffer); + static const char kAlphabet[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + + buffer[buf_size - 1] = '\0'; + size_t result_size = 1; + + do { + if (buf_size == result_size) { // Ran out of space. + return 0; + } + int remainder = number % 36; + number /= 36; + buffer[buf_size - result_size - 1] = kAlphabet[remainder]; + result_size++; + } while (number); + + memmove(buffer, buffer + buf_size - result_size, result_size); + + return result_size - 1; +} + +// Generate functions that wrap safe_strtoXXX_base. 
+#define GEN_SAFE_STRTO(name, type) \ +bool name##_base(const string& str, type* value, int base) { \ + return name##_base(str.c_str(), value, base); \ +} \ +bool name(const char* str, type* value) { \ + return name##_base(str, value, 10); \ +} \ +bool name(const string& str, type* value) { \ + return name##_base(str.c_str(), value, 10); \ +} +GEN_SAFE_STRTO(safe_strto32, int32); +GEN_SAFE_STRTO(safe_strtou32, uint32); +GEN_SAFE_STRTO(safe_strto64, int64); +GEN_SAFE_STRTO(safe_strtou64, uint64); +#undef GEN_SAFE_STRTO + +bool safe_strtof(const char* str, float* value) { + char* endptr; +#ifdef _MSC_VER // has no strtof() + *value = strtod(str, &endptr); +#else + *value = strtof(str, &endptr); +#endif + if (endptr != str) { + while (ascii_isspace(*endptr)) ++endptr; + } + // Ignore range errors from strtod/strtof. + // The values it returns on underflow and + // overflow are the right fallback in a + // robust setting. + return *str != '\0' && *endptr == '\0'; +} + +bool safe_strtod(const char* str, double* value) { + char* endptr; + *value = strtod(str, &endptr); + if (endptr != str) { + while (ascii_isspace(*endptr)) ++endptr; + } + // Ignore range errors from strtod. The values it + // returns on underflow and overflow are the right + // fallback in a robust setting. 
+ return *str != '\0' && *endptr == '\0'; +} + +bool safe_strtof(const string& str, float* value) { + return safe_strtof(str.c_str(), value); +} + +bool safe_strtod(const string& str, double* value) { + return safe_strtod(str.c_str(), value); +} + +uint64 atoi_kmgt(const char* s) { + char* endptr; + uint64 n = strtou64(s, &endptr, 10); + uint64 scale = 1; + char c = *endptr; + if (c != '\0') { + c = ascii_toupper(c); + switch (c) { + case 'K': + scale = GG_ULONGLONG(1) << 10; + break; + case 'M': + scale = GG_ULONGLONG(1) << 20; + break; + case 'G': + scale = GG_ULONGLONG(1) << 30; + break; + case 'T': + scale = GG_ULONGLONG(1) << 40; + break; + default: + LOG(FATAL) << "Invalid mnemonic: `" << c << "';" + << " should be one of `K', `M', `G', and `T'."; + } + } + return n * scale; +} + +// ---------------------------------------------------------------------- +// FastIntToBuffer() +// FastInt64ToBuffer() +// FastHexToBuffer() +// FastHex64ToBuffer() +// FastHex32ToBuffer() +// FastTimeToBuffer() +// These are intended for speed. FastHexToBuffer() assumes the +// integer is non-negative. FastHexToBuffer() puts output in +// hex rather than decimal. FastTimeToBuffer() puts the output +// into RFC822 format. If time is 0, uses the current time. +// +// FastHex64ToBuffer() puts a 64-bit unsigned value in hex-format, +// padded to exactly 16 bytes (plus one byte for '\0') +// +// FastHex32ToBuffer() puts a 32-bit unsigned value in hex-format, +// padded to exactly 8 bytes (plus one byte for '\0') +// +// All functions take the output buffer as an arg. FastInt() +// uses at most 22 bytes, FastTime() uses exactly 30 bytes. +// They all return a pointer to the beginning of the output, +// which may not be the beginning of the input buffer. (Though +// for FastTimeToBuffer(), we guarantee that it is.) 
+// ---------------------------------------------------------------------- + +char *FastInt64ToBuffer(int64 i, char* buffer) { + FastInt64ToBufferLeft(i, buffer); + return buffer; +} + +char *FastInt32ToBuffer(int32 i, char* buffer) { + FastInt32ToBufferLeft(i, buffer); + return buffer; +} + +char *FastHexToBuffer(int i, char* buffer) { + CHECK_GE(i, 0) << "FastHexToBuffer() wants non-negative integers, not " << i; + + static const char *hexdigits = "0123456789abcdef"; + char *p = buffer + 21; + *p-- = '\0'; + do { + *p-- = hexdigits[i & 15]; // mod by 16 + i >>= 4; // divide by 16 + } while (i > 0); + return p + 1; +} + +char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) { + static const char *hexdigits = "0123456789abcdef"; + buffer[num_byte] = '\0'; + for (int i = num_byte - 1; i >= 0; i--) { + buffer[i] = hexdigits[value & 0xf]; + value >>= 4; + } + return buffer; +} + +char *FastHex64ToBuffer(uint64 value, char* buffer) { + return InternalFastHexToBuffer(value, buffer, 16); +} + +char *FastHex32ToBuffer(uint32 value, char* buffer) { + return InternalFastHexToBuffer(value, buffer, 8); +} + +// TODO(user): revisit the two_ASCII_digits optimization. +// +// Several converters use this table to reduce +// division and modulo operations. +extern const char two_ASCII_digits[100][2]; // from strutil.cc + +// ---------------------------------------------------------------------- +// FastInt32ToBufferLeft() +// FastUInt32ToBufferLeft() +// FastInt64ToBufferLeft() +// FastUInt64ToBufferLeft() +// +// Like the Fast*ToBuffer() functions above, these are intended for speed. +// Unlike the Fast*ToBuffer() functions, however, these functions write +// their output to the beginning of the buffer (hence the name, as the +// output is left-aligned). The caller is responsible for ensuring that +// the buffer has enough space to hold the output. +// +// Returns a pointer to the end of the string (i.e. the null character +// terminating the string). 
+// ---------------------------------------------------------------------- + +char* FastUInt32ToBufferLeft(uint32 u, char* buffer) { + uint digits; + const char *ASCII_digits = nullptr; + // The idea of this implementation is to trim the number of divides to as few + // as possible by using multiplication and subtraction rather than mod (%), + // and by outputting two digits at a time rather than one. + // The huge-number case is first, in the hopes that the compiler will output + // that case in one branch-free block of code, and only output conditional + // branches into it from below. + if (u >= 1000000000) { // >= 1,000,000,000 + digits = u / 100000000; // 100,000,000 + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + sublt100_000_000: + u -= digits * 100000000; // 100,000,000 + lt100_000_000: + digits = u / 1000000; // 1,000,000 + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + sublt1_000_000: + u -= digits * 1000000; // 1,000,000 + lt1_000_000: + digits = u / 10000; // 10,000 + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + sublt10_000: + u -= digits * 10000; // 10,000 + lt10_000: + digits = u / 100; + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + sublt100: + u -= digits * 100; + lt100: + digits = u; + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + done: + *buffer = 0; + return buffer; + } + + if (u < 100) { + digits = u; + if (u >= 10) goto lt100; + *buffer++ = '0' + digits; + goto done; + } + if (u < 10000) { // 10,000 + if (u >= 1000) goto lt10_000; + digits = u / 100; + *buffer++ = '0' + digits; + goto sublt100; + } + if (u < 1000000) { // 1,000,000 + if (u >= 100000) goto lt1_000_000; + digits = u 
/ 10000; // 10,000 + *buffer++ = '0' + digits; + goto sublt10_000; + } + if (u < 100000000) { // 100,000,000 + if (u >= 10000000) goto lt100_000_000; + digits = u / 1000000; // 1,000,000 + *buffer++ = '0' + digits; + goto sublt1_000_000; + } + // we already know that u < 1,000,000,000 + digits = u / 100000000; // 100,000,000 + *buffer++ = '0' + digits; + goto sublt100_000_000; +} + +char* FastInt32ToBufferLeft(int32 i, char* buffer) { + uint32 u = i; + if (i < 0) { + *buffer++ = '-'; + // We need to do the negation in modular (i.e., "unsigned") + // arithmetic; MSVC++ apprently warns for plain "-u", so + // we write the equivalent expression "0 - u" instead. + u = 0 - u; + } + return FastUInt32ToBufferLeft(u, buffer); +} + +char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) { + uint digits; + const char *ASCII_digits = nullptr; + + uint32 u = static_cast(u64); + if (u == u64) return FastUInt32ToBufferLeft(u, buffer); + + uint64 top_11_digits = u64 / 1000000000; + buffer = FastUInt64ToBufferLeft(top_11_digits, buffer); + u = u64 - (top_11_digits * 1000000000); + + digits = u / 10000000; // 10,000,000 + DCHECK_LT(digits, 100); + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + u -= digits * 10000000; // 10,000,000 + digits = u / 100000; // 100,000 + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + u -= digits * 100000; // 100,000 + digits = u / 1000; // 1,000 + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + u -= digits * 1000; // 1,000 + digits = u / 10; + ASCII_digits = two_ASCII_digits[digits]; + buffer[0] = ASCII_digits[0]; + buffer[1] = ASCII_digits[1]; + buffer += 2; + u -= digits * 10; + digits = u; + *buffer++ = '0' + digits; + *buffer = 0; + return buffer; +} + +char* FastInt64ToBufferLeft(int64 i, char* buffer) { + uint64 u = i; + if (i < 0) { 
+ *buffer++ = '-'; + u = 0 - u; + } + return FastUInt64ToBufferLeft(u, buffer); +} + +int HexDigitsPrefix(const char* buf, int num_digits) { + for (int i = 0; i < num_digits; i++) + if (!ascii_isxdigit(buf[i])) + return 0; // This also detects end of string as '\0' is not xdigit. + return 1; +} + +// ---------------------------------------------------------------------- +// AutoDigitStrCmp +// AutoDigitLessThan +// StrictAutoDigitLessThan +// autodigit_less +// autodigit_greater +// strict_autodigit_less +// strict_autodigit_greater +// These are like less and greater, except when a +// run of digits is encountered at corresponding points in the two +// arguments. Such digit strings are compared numerically instead +// of lexicographically. Therefore if you sort by +// "autodigit_less", some machine names might get sorted as: +// exaf1 +// exaf2 +// exaf10 +// When using "strict" comparison (AutoDigitStrCmp with the strict flag +// set to true, or the strict version of the other functions), +// strings that represent equal numbers will not be considered equal if +// the string representations are not identical. That is, "01" < "1" in +// strict mode, but "01" == "1" otherwise. +// ---------------------------------------------------------------------- + +int AutoDigitStrCmp(const char* a, int alen, + const char* b, int blen, + bool strict) { + int aindex = 0; + int bindex = 0; + while ((aindex < alen) && (bindex < blen)) { + if (isdigit(a[aindex]) && isdigit(b[bindex])) { + // Compare runs of digits. Instead of extracting numbers, we + // just skip leading zeroes, and then get the run-lengths. This + // allows us to handle arbitrary precision numbers. We remember + // how many zeroes we found so that we can differentiate between + // "1" and "01" in strict mode. 
+ + // Skip leading zeroes, but remember how many we found + int azeroes = aindex; + int bzeroes = bindex; + while ((aindex < alen) && (a[aindex] == '0')) aindex++; + while ((bindex < blen) && (b[bindex] == '0')) bindex++; + azeroes = aindex - azeroes; + bzeroes = bindex - bzeroes; + + // Count digit lengths + int astart = aindex; + int bstart = bindex; + while ((aindex < alen) && isdigit(a[aindex])) aindex++; + while ((bindex < blen) && isdigit(b[bindex])) bindex++; + if (aindex - astart < bindex - bstart) { + // a has shorter run of digits: so smaller + return -1; + } else if (aindex - astart > bindex - bstart) { + // a has longer run of digits: so larger + return 1; + } else { + // Same lengths, so compare digit by digit + for (int i = 0; i < aindex-astart; i++) { + if (a[astart+i] < b[bstart+i]) { + return -1; + } else if (a[astart+i] > b[bstart+i]) { + return 1; + } + } + // Equal: did one have more leading zeroes? + if (strict && azeroes != bzeroes) { + if (azeroes > bzeroes) { + // a has more leading zeroes: a < b + return -1; + } else { + // b has more leading zeroes: a > b + return 1; + } + } + // Equal: so continue scanning + } + } else if (a[aindex] < b[bindex]) { + return -1; + } else if (a[aindex] > b[bindex]) { + return 1; + } else { + aindex++; + bindex++; + } + } + + if (aindex < alen) { + // b is prefix of a + return 1; + } else if (bindex < blen) { + // a is prefix of b + return -1; + } else { + // a is equal to b + return 0; + } +} + +bool AutoDigitLessThan(const char* a, int alen, const char* b, int blen) { + return AutoDigitStrCmp(a, alen, b, blen, false) < 0; +} + +bool StrictAutoDigitLessThan(const char* a, int alen, + const char* b, int blen) { + return AutoDigitStrCmp(a, alen, b, blen, true) < 0; +} + +// ---------------------------------------------------------------------- +// SimpleDtoa() +// SimpleFtoa() +// DoubleToBuffer() +// FloatToBuffer() +// We want to print the value without losing precision, but we also do +// not want to print 
more digits than necessary. This turns out to be +// trickier than it sounds. Numbers like 0.2 cannot be represented +// exactly in binary. If we print 0.2 with a very large precision, +// e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". +// On the other hand, if we set the precision too low, we lose +// significant digits when printing numbers that actually need them. +// It turns out there is no precision value that does the right thing +// for all numbers. +// +// Our strategy is to first try printing with a precision that is never +// over-precise, then parse the result with strtod() to see if it +// matches. If not, we print again with a precision that will always +// give a precise result, but may use more digits than necessary. +// +// An arguably better strategy would be to use the algorithm described +// in "How to Print Floating-Point Numbers Accurately" by Steele & +// White, e.g. as implemented by David M. Gay's dtoa(). It turns out, +// however, that the following implementation is about as fast as +// DMG's code. Furthermore, DMG's code locks mutexes, which means it +// will not scale well on multi-core machines. DMG's code is slightly +// more accurate (in that it will never use more digits than +// necessary), but this is probably irrelevant for most users. +// +// Rob Pike and Ken Thompson also have an implementation of dtoa() in +// third_party/fmt/fltfmt.cc. Their implementation is similar to this +// one in that it makes guesses and then uses strtod() to check them. +// Their implementation is faster because they use their own code to +// generate the digits in the first place rather than use snprintf(), +// thus avoiding format string parsing overhead. However, this makes +// it considerably more complicated than the following implementation, +// and it is embedded in a larger library. If speed turns out to be +// an issue, we could re-implement this in terms of their +// implementation. 
+// ---------------------------------------------------------------------- + +string SimpleDtoa(double value) { + char buffer[kDoubleToBufferSize]; + return DoubleToBuffer(value, buffer); +} + +string SimpleFtoa(float value) { + char buffer[kFloatToBufferSize]; + return FloatToBuffer(value, buffer); +} + +char* DoubleToBuffer(double value, char* buffer) { + // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all + // platforms these days. Just in case some system exists where DBL_DIG + // is significantly larger -- and risks overflowing our buffer -- we have + // this assert. + COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big); + + int snprintf_result = + snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value); + + // The snprintf should never overflow because the buffer is significantly + // larger than the precision we asked for. + DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); + + if (strtod(buffer, nullptr) != value) { + snprintf_result = + snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value); + + // Should never overflow; see above. + DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); + } + return buffer; +} + +char* FloatToBuffer(float value, char* buffer) { + // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all + // platforms these days. Just in case some system exists where FLT_DIG + // is significantly larger -- and risks overflowing our buffer -- we have + // this assert. + COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big); + + int snprintf_result = + snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value); + + // The snprintf should never overflow because the buffer is significantly + // larger than the precision we asked for. 
+ DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); + + float parsed_value; + if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) { + snprintf_result = + snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value); + + // Should never overflow; see above. + DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); + } + return buffer; +} + +// ---------------------------------------------------------------------- +// SimpleItoaWithCommas() +// Description: converts an integer to a string. +// Puts commas every 3 spaces. +// Faster than printf("%d")? +// +// Return value: string +// ---------------------------------------------------------------------- +string SimpleItoaWithCommas(int32 i) { + // 10 digits, 3 commas, and sign are good for 32-bit or smaller ints. + // Longest is -2,147,483,648. + char local[14]; + char *p = local + sizeof(local); + // Need to use uint32 instead of int32 to correctly handle + // -2,147,483,648. + uint32 n = i; + if (i < 0) + n = 0 - n; // negate the unsigned value to avoid overflow + *--p = '0' + n % 10; // this case deals with the number "0" + n /= 10; + while (n) { + *--p = '0' + n % 10; + n /= 10; + if (n == 0) break; + + *--p = '0' + n % 10; + n /= 10; + if (n == 0) break; + + *--p = ','; + *--p = '0' + n % 10; + n /= 10; + // For this unrolling, we check if n == 0 in the main while loop + } + if (i < 0) + *--p = '-'; + return string(p, local + sizeof(local)); +} + +// We need this overload because otherwise SimpleItoaWithCommas(5U) wouldn't +// compile. +string SimpleItoaWithCommas(uint32 i) { + // 10 digits and 3 commas are good for 32-bit or smaller ints. + // Longest is 4,294,967,295. 
+ char local[13]; + char *p = local + sizeof(local); + *--p = '0' + i % 10; // this case deals with the number "0" + i /= 10; + while (i) { + *--p = '0' + i % 10; + i /= 10; + if (i == 0) break; + + *--p = '0' + i % 10; + i /= 10; + if (i == 0) break; + + *--p = ','; + *--p = '0' + i % 10; + i /= 10; + // For this unrolling, we check if i == 0 in the main while loop + } + return string(p, local + sizeof(local)); +} + +string SimpleItoaWithCommas(int64 i) { + // 19 digits, 6 commas, and sign are good for 64-bit or smaller ints. + char local[26]; + char *p = local + sizeof(local); + // Need to use uint64 instead of int64 to correctly handle + // -9,223,372,036,854,775,808. + uint64 n = i; + if (i < 0) + n = 0 - n; + *--p = '0' + n % 10; // this case deals with the number "0" + n /= 10; + while (n) { + *--p = '0' + n % 10; + n /= 10; + if (n == 0) break; + + *--p = '0' + n % 10; + n /= 10; + if (n == 0) break; + + *--p = ','; + *--p = '0' + n % 10; + n /= 10; + // For this unrolling, we check if n == 0 in the main while loop + } + if (i < 0) + *--p = '-'; + return string(p, local + sizeof(local)); +} + +// We need this overload because otherwise SimpleItoaWithCommas(5ULL) wouldn't +// compile. +string SimpleItoaWithCommas(uint64 i) { + // 20 digits and 6 commas are good for 64-bit or smaller ints. + // Longest is 18,446,744,073,709,551,615. 
+ char local[26]; + char *p = local + sizeof(local); + *--p = '0' + i % 10; // this case deals with the number "0" + i /= 10; + while (i) { + *--p = '0' + i % 10; + i /= 10; + if (i == 0) break; + + *--p = '0' + i % 10; + i /= 10; + if (i == 0) break; + + *--p = ','; + *--p = '0' + i % 10; + i /= 10; + // For this unrolling, we check if i == 0 in the main while loop + } + return string(p, local + sizeof(local)); +} + +// ---------------------------------------------------------------------- +// ItoaKMGT() +// Description: converts an integer to a string +// Truncates values to a readable unit: K, G, M or T +// Opposite of atoi_kmgt() +// e.g. 100 -> "100" 1500 -> "1500" 4000 -> "3K" 57185920 -> "45M" +// +// Return value: string +// ---------------------------------------------------------------------- +string ItoaKMGT(int64 i) { + const char *sign = "", *suffix = ""; + if (i < 0) { + // We lose some accuracy if the caller passes LONG_LONG_MIN, but + // that's OK as this function is only for human readability + if (i == numeric_limits::min()) i++; + sign = "-"; + i = -i; + } + + int64 val; + + if ((val = (i >> 40)) > 1) { + suffix = "T"; + } else if ((val = (i >> 30)) > 1) { + suffix = "G"; + } else if ((val = (i >> 20)) > 1) { + suffix = "M"; + } else if ((val = (i >> 10)) > 1) { + suffix = "K"; + } else { + val = i; + } + + return StringPrintf("%s%" PRId64 "%s", sign, val, suffix); +} + +// DEPRECATED(wadetregaskis). +// These are non-inline because some BUILD files turn on -Wformat-non-literal. 
+ +string FloatToString(float f, const char* format) { + return StringPrintf(format, f); +} + +string IntToString(int i, const char* format) { + return StringPrintf(format, i); +} + +string Int64ToString(int64 i64, const char* format) { + return StringPrintf(format, i64); +} + +string UInt64ToString(uint64 ui64, const char* format) { + return StringPrintf(format, ui64); +} + diff --git a/src/kudu/gutil/strings/numbers.h b/src/kudu/gutil/strings/numbers.h new file mode 100644 index 000000000000..e9f5c1aaa008 --- /dev/null +++ b/src/kudu/gutil/strings/numbers.h @@ -0,0 +1,575 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Maintainer: mec@google.com (Michael Chastain) +// +// Convert strings to numbers or numbers to strings. + +#ifndef STRINGS_NUMBERS_H_ +#define STRINGS_NUMBERS_H_ + +#include +#include +#include +#include +#include +using std::binary_function; +using std::less; +#include +using std::numeric_limits; +#include +using std::string; +#include +using std::vector; + +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/stringprintf.h" + + +// START DOXYGEN NumbersFunctions grouping +/* @defgroup NumbersFunctions + * @{ */ + +// Convert a fingerprint to 16 hex digits. +string FpToString(Fprint fp); + +// Formats a uint128 as a 32-digit hex string. +string Uint128ToHexString(uint128 ui128); + +// Convert strings to numeric values, with strict error checking. +// Leading and trailing spaces are allowed. +// Negative inputs are not allowed for unsigned ints (unlike strtoul). +// Numbers must be in base 10; see the _base variants below for other bases. +// Returns false on errors (including overflow/underflow). 
+bool safe_strto32(const char* str, int32* value); +bool safe_strto64(const char* str, int64* value); +bool safe_strtou32(const char* str, uint32* value); +bool safe_strtou64(const char* str, uint64* value); +// Convert strings to floating point values. +// Leading and trailing spaces are allowed. +// Values may be rounded on over- and underflow. +bool safe_strtof(const char* str, float* value); +bool safe_strtod(const char* str, double* value); + +bool safe_strto32(const string& str, int32* value); +bool safe_strto64(const string& str, int64* value); +bool safe_strtou32(const string& str, uint32* value); +bool safe_strtou64(const string& str, uint64* value); +bool safe_strtof(const string& str, float* value); +bool safe_strtod(const string& str, double* value); + +// Parses buffer_size many characters from startptr into value. +bool safe_strto32(const char* startptr, int buffer_size, int32* value); +bool safe_strto64(const char* startptr, int buffer_size, int64* value); + +// Parses with a fixed base between 2 and 36. For base 16, leading "0x" is ok. +// If base is set to 0, its value is inferred from the beginning of str: +// "0x" means base 16, "0" means base 8, otherwise base 10 is used. 
+bool safe_strto32_base(const char* str, int32* value, int base); +bool safe_strto64_base(const char* str, int64* value, int base); +bool safe_strtou32_base(const char* str, uint32* value, int base); +bool safe_strtou64_base(const char* str, uint64* value, int base); + +bool safe_strto32_base(const string& str, int32* value, int base); +bool safe_strto64_base(const string& str, int64* value, int base); +bool safe_strtou32_base(const string& str, uint32* value, int base); +bool safe_strtou64_base(const string& str, uint64* value, int base); + +bool safe_strto32_base(const char* startptr, int buffer_size, + int32* value, int base); +bool safe_strto64_base(const char* startptr, int buffer_size, + int64* value, int base); + +// u64tostr_base36() +// The inverse of safe_strtou64_base, converts the number agument to +// a string representation in base-36. +// Conversion fails if buffer is too small to to hold the string and +// terminating NUL. +// Returns number of bytes written, not including terminating NUL. +// Return value 0 indicates error. +size_t u64tostr_base36(uint64 number, size_t buf_size, char* buffer); + +// Similar to atoi(s), except s could be like "16k", "32M", "2G", "4t". +uint64 atoi_kmgt(const char* s); +inline uint64 atoi_kmgt(const string& s) { return atoi_kmgt(s.c_str()); } + +// ---------------------------------------------------------------------- +// FastIntToBuffer() +// FastHexToBuffer() +// FastHex64ToBuffer() +// FastHex32ToBuffer() +// FastTimeToBuffer() +// These are intended for speed. FastIntToBuffer() assumes the +// integer is non-negative. FastHexToBuffer() puts output in +// hex rather than decimal. FastTimeToBuffer() puts the output +// into RFC822 format. 
+// +// FastHex64ToBuffer() puts a 64-bit unsigned value in hex-format, +// padded to exactly 16 bytes (plus one byte for '\0') +// +// FastHex32ToBuffer() puts a 32-bit unsigned value in hex-format, +// padded to exactly 8 bytes (plus one byte for '\0') +// +// All functions take the output buffer as an arg. FastInt() uses +// at most 22 bytes, FastTime() uses exactly 30 bytes. They all +// return a pointer to the beginning of the output, which for +// FastHex() may not be the beginning of the input buffer. (For +// all others, we guarantee that it is.) +// +// NOTE: In 64-bit land, sizeof(time_t) is 8, so it is possible +// to pass to FastTimeToBuffer() a time whose year cannot be +// represented in 4 digits. In this case, the output buffer +// will contain the string "Invalid:" +// ---------------------------------------------------------------------- + +// Previously documented minimums -- the buffers provided must be at least this +// long, though these numbers are subject to change: +// Int32, UInt32: 12 bytes +// Int64, UInt64, Hex: 22 bytes +// Time: 30 bytes +// Hex32: 9 bytes +// Hex64: 17 bytes +// Use kFastToBufferSize rather than hardcoding constants. +static const int kFastToBufferSize = 32; + +char* FastInt32ToBuffer(int32 i, char* buffer); +char* FastInt64ToBuffer(int64 i, char* buffer); +char* FastUInt32ToBuffer(uint32 i, char* buffer); +char* FastUInt64ToBuffer(uint64 i, char* buffer); +char* FastHexToBuffer(int i, char* buffer) MUST_USE_RESULT; +char* FastTimeToBuffer(time_t t, char* buffer); +char* FastHex64ToBuffer(uint64 i, char* buffer); +char* FastHex32ToBuffer(uint32 i, char* buffer); + +// at least 22 bytes long +inline char* FastIntToBuffer(int i, char* buffer) { + return (sizeof(i) == 4 ? + FastInt32ToBuffer(i, buffer) : FastInt64ToBuffer(i, buffer)); +} +inline char* FastUIntToBuffer(unsigned int i, char* buffer) { + return (sizeof(i) == 4 ? 
+ FastUInt32ToBuffer(i, buffer) : FastUInt64ToBuffer(i, buffer)); +} + +// ---------------------------------------------------------------------- +// FastInt32ToBufferLeft() +// FastUInt32ToBufferLeft() +// FastInt64ToBufferLeft() +// FastUInt64ToBufferLeft() +// +// Like the Fast*ToBuffer() functions above, these are intended for speed. +// Unlike the Fast*ToBuffer() functions, however, these functions write +// their output to the beginning of the buffer (hence the name, as the +// output is left-aligned). The caller is responsible for ensuring that +// the buffer has enough space to hold the output. +// +// Returns a pointer to the end of the string (i.e. the null character +// terminating the string). +// ---------------------------------------------------------------------- + +char* FastInt32ToBufferLeft(int32 i, char* buffer); // at least 12 bytes +char* FastUInt32ToBufferLeft(uint32 i, char* buffer); // at least 12 bytes +char* FastInt64ToBufferLeft(int64 i, char* buffer); // at least 22 bytes +char* FastUInt64ToBufferLeft(uint64 i, char* buffer); // at least 22 bytes + +// Just define these in terms of the above. +inline char* FastUInt32ToBuffer(uint32 i, char* buffer) { + FastUInt32ToBufferLeft(i, buffer); + return buffer; +} +inline char* FastUInt64ToBuffer(uint64 i, char* buffer) { + FastUInt64ToBufferLeft(i, buffer); + return buffer; +} + +// ---------------------------------------------------------------------- +// HexDigitsPrefix() +// returns 1 if buf is prefixed by "num_digits" of hex digits +// returns 0 otherwise. +// The function checks for '\0' for string termination. +// ---------------------------------------------------------------------- +int HexDigitsPrefix(const char* buf, int num_digits); + +// ---------------------------------------------------------------------- +// ConsumeStrayLeadingZeroes +// Eliminates all leading zeroes (unless the string itself is composed +// of nothing but zeroes, in which case one is kept: 0...0 becomes 0). 
+void ConsumeStrayLeadingZeroes(string* str); + +// ---------------------------------------------------------------------- +// ParseLeadingInt32Value +// A simple parser for int32 values. Returns the parsed value +// if a valid integer is found; else returns deflt. It does not +// check if str is entirely consumed. +// This cannot handle decimal numbers with leading 0s, since they will be +// treated as octal. If you know it's decimal, use ParseLeadingDec32Value. +// -------------------------------------------------------------------- +int32 ParseLeadingInt32Value(const char* str, int32 deflt); +inline int32 ParseLeadingInt32Value(const string& str, int32 deflt) { + return ParseLeadingInt32Value(str.c_str(), deflt); +} + +// ParseLeadingUInt32Value +// A simple parser for uint32 values. Returns the parsed value +// if a valid integer is found; else returns deflt. It does not +// check if str is entirely consumed. +// This cannot handle decimal numbers with leading 0s, since they will be +// treated as octal. If you know it's decimal, use ParseLeadingUDec32Value. +// -------------------------------------------------------------------- +uint32 ParseLeadingUInt32Value(const char* str, uint32 deflt); +inline uint32 ParseLeadingUInt32Value(const string& str, uint32 deflt) { + return ParseLeadingUInt32Value(str.c_str(), deflt); +} + +// ---------------------------------------------------------------------- +// ParseLeadingDec32Value +// A simple parser for decimal int32 values. Returns the parsed value +// if a valid integer is found; else returns deflt. It does not +// check if str is entirely consumed. +// The string passed in is treated as *10 based*. +// This can handle strings with leading 0s. 
+// See also: ParseLeadingDec64Value +// -------------------------------------------------------------------- +int32 ParseLeadingDec32Value(const char* str, int32 deflt); +inline int32 ParseLeadingDec32Value(const string& str, int32 deflt) { + return ParseLeadingDec32Value(str.c_str(), deflt); +} + +// ParseLeadingUDec32Value +// A simple parser for decimal uint32 values. Returns the parsed value +// if a valid integer is found; else returns deflt. It does not +// check if str is entirely consumed. +// The string passed in is treated as *10 based*. +// This can handle strings with leading 0s. +// See also: ParseLeadingUDec64Value +// -------------------------------------------------------------------- +uint32 ParseLeadingUDec32Value(const char* str, uint32 deflt); +inline uint32 ParseLeadingUDec32Value(const string& str, uint32 deflt) { + return ParseLeadingUDec32Value(str.c_str(), deflt); +} + +// ---------------------------------------------------------------------- +// ParseLeadingUInt64Value +// ParseLeadingInt64Value +// ParseLeadingHex64Value +// ParseLeadingDec64Value +// ParseLeadingUDec64Value +// A simple parser for long long values. 
+// Returns the parsed value if a +// valid integer is found; else returns deflt +// -------------------------------------------------------------------- +uint64 ParseLeadingUInt64Value(const char* str, uint64 deflt); +inline uint64 ParseLeadingUInt64Value(const string& str, uint64 deflt) { + return ParseLeadingUInt64Value(str.c_str(), deflt); +} +int64 ParseLeadingInt64Value(const char* str, int64 deflt); +inline int64 ParseLeadingInt64Value(const string& str, int64 deflt) { + return ParseLeadingInt64Value(str.c_str(), deflt); +} +uint64 ParseLeadingHex64Value(const char* str, uint64 deflt); +inline uint64 ParseLeadingHex64Value(const string& str, uint64 deflt) { + return ParseLeadingHex64Value(str.c_str(), deflt); +} +int64 ParseLeadingDec64Value(const char* str, int64 deflt); +inline int64 ParseLeadingDec64Value(const string& str, int64 deflt) { + return ParseLeadingDec64Value(str.c_str(), deflt); +} +uint64 ParseLeadingUDec64Value(const char* str, uint64 deflt); +inline uint64 ParseLeadingUDec64Value(const string& str, uint64 deflt) { + return ParseLeadingUDec64Value(str.c_str(), deflt); +} + +// ---------------------------------------------------------------------- +// ParseLeadingDoubleValue +// A simple parser for double values. Returns the parsed value +// if a valid double is found; else returns deflt. It does not +// check if str is entirely consumed. +// -------------------------------------------------------------------- +double ParseLeadingDoubleValue(const char* str, double deflt); +inline double ParseLeadingDoubleValue(const string& str, double deflt) { + return ParseLeadingDoubleValue(str.c_str(), deflt); +} + +// ---------------------------------------------------------------------- +// ParseLeadingBoolValue() +// A recognizer of boolean string values. Returns the parsed value +// if a valid value is found; else returns deflt. 
This skips leading +// whitespace, is case insensitive, and recognizes these forms: +// 0/1, false/true, no/yes, n/y +// -------------------------------------------------------------------- +bool ParseLeadingBoolValue(const char* str, bool deflt); +inline bool ParseLeadingBoolValue(const string& str, bool deflt) { + return ParseLeadingBoolValue(str.c_str(), deflt); +} + +// ---------------------------------------------------------------------- +// AutoDigitStrCmp +// AutoDigitLessThan +// StrictAutoDigitLessThan +// autodigit_less +// autodigit_greater +// strict_autodigit_less +// strict_autodigit_greater +// These are like less and greater, except when a +// run of digits is encountered at corresponding points in the two +// arguments. Such digit strings are compared numerically instead +// of lexicographically. Therefore if you sort by +// "autodigit_less", some machine names might get sorted as: +// exaf1 +// exaf2 +// exaf10 +// When using "strict" comparison (AutoDigitStrCmp with the strict flag +// set to true, or the strict version of the other functions), +// strings that represent equal numbers will not be considered equal if +// the string representations are not identical. That is, "01" < "1" in +// strict mode, but "01" == "1" otherwise. 
+// ---------------------------------------------------------------------- + +int AutoDigitStrCmp(const char* a, int alen, + const char* b, int blen, + bool strict); + +bool AutoDigitLessThan(const char* a, int alen, + const char* b, int blen); + +bool StrictAutoDigitLessThan(const char* a, int alen, + const char* b, int blen); + +struct autodigit_less + : public binary_function { + bool operator()(const string& a, const string& b) const { + return AutoDigitLessThan(a.data(), a.size(), b.data(), b.size()); + } +}; + +struct autodigit_greater + : public binary_function { + bool operator()(const string& a, const string& b) const { + return AutoDigitLessThan(b.data(), b.size(), a.data(), a.size()); + } +}; + +struct strict_autodigit_less + : public binary_function { + bool operator()(const string& a, const string& b) const { + return StrictAutoDigitLessThan(a.data(), a.size(), b.data(), b.size()); + } +}; + +struct strict_autodigit_greater + : public binary_function { + bool operator()(const string& a, const string& b) const { + return StrictAutoDigitLessThan(b.data(), b.size(), a.data(), a.size()); + } +}; + +// ---------------------------------------------------------------------- +// SimpleItoa() +// Description: converts an integer to a string. +// Faster than printf("%d"). +// +// Return value: string +// ---------------------------------------------------------------------- +inline string SimpleItoa(int32 i) { + char buf[16]; // Longest is -2147483648 + return string(buf, FastInt32ToBufferLeft(i, buf)); +} + +// We need this overload because otherwise SimpleItoa(5U) wouldn't compile. +inline string SimpleItoa(uint32 i) { + char buf[16]; // Longest is 4294967295 + return string(buf, FastUInt32ToBufferLeft(i, buf)); +} + +inline string SimpleItoa(int64 i) { + char buf[32]; // Longest is -9223372036854775808 + return string(buf, FastInt64ToBufferLeft(i, buf)); +} + +// We need this overload because otherwise SimpleItoa(5ULL) wouldn't compile. 
+inline string SimpleItoa(uint64 i) { + char buf[32]; // Longest is 18446744073709551615 + return string(buf, FastUInt64ToBufferLeft(i, buf)); +} + +// SimpleAtoi converts a string to an integer. +// Uses safe_strto?() for actual parsing, so strict checking is +// applied, which is to say, the string must be a base-10 integer, optionally +// followed or preceded by whitespace, and value has to be in the range of +// the corresponding integer type. +// +// Returns true if parsing was successful. +template +bool MUST_USE_RESULT SimpleAtoi(const char* s, int_type* out) { + // Must be of integer type (not pointer type), with more than 16-bitwidth. + COMPILE_ASSERT(sizeof(*out) == 4 || sizeof(*out) == 8, + SimpleAtoiWorksWith32Or64BitInts); + if (std::numeric_limits::is_signed) { // Signed + if (sizeof(*out) == 64 / 8) { // 64-bit + return safe_strto64(s, reinterpret_cast(out)); + } else { // 32-bit + return safe_strto32(s, reinterpret_cast(out)); + } + } else { // Unsigned + if (sizeof(*out) == 64 / 8) { // 64-bit + return safe_strtou64(s, reinterpret_cast(out)); + } else { // 32-bit + return safe_strtou32(s, reinterpret_cast(out)); + } + } +} + +template +bool MUST_USE_RESULT SimpleAtoi(const string& s, int_type* out) { + return SimpleAtoi(s.c_str(), out); +} + +// ---------------------------------------------------------------------- +// SimpleDtoa() +// SimpleFtoa() +// DoubleToBuffer() +// FloatToBuffer() +// Description: converts a double or float to a string which, if +// passed to strtod(), will produce the exact same original double +// (except in case of NaN; all NaNs are considered the same value). +// We try to keep the string short but it's not guaranteed to be as +// short as possible. +// +// DoubleToBuffer() and FloatToBuffer() write the text to the given +// buffer and return it. The buffer must be at least +// kDoubleToBufferSize bytes for doubles and kFloatToBufferSize +// bytes for floats. 
// kFastToBufferSize is also guaranteed to be large
// enough to hold either.
//
//    Return value: string
// ----------------------------------------------------------------------
string SimpleDtoa(double value);
string SimpleFtoa(float value);

char* DoubleToBuffer(double i, char* buffer);
char* FloatToBuffer(float i, char* buffer);

// In practice, doubles should never need more than 24 bytes and floats
// should never need more than 14 (including null terminators), but we
// overestimate to be safe.
static const int kDoubleToBufferSize = 32;
static const int kFloatToBufferSize = 24;

// ----------------------------------------------------------------------
// SimpleItoaWithCommas()
//    Description: converts an integer to a string.
//    Puts commas every 3 spaces.
//    Faster than printf("%d")?
//
//    Return value: string
// ----------------------------------------------------------------------
string SimpleItoaWithCommas(int32 i);
string SimpleItoaWithCommas(uint32 i);
string SimpleItoaWithCommas(int64 i);
string SimpleItoaWithCommas(uint64 i);

// ----------------------------------------------------------------------
// ItoaKMGT()
//    Description: converts an integer to a string
//    Truncates values to K, G, M or T as appropriate
//    Opposite of atoi_kmgt()
//    e.g. 3000 -> 2K   57185920 -> 45M
//
//    Return value: string
// ----------------------------------------------------------------------
string ItoaKMGT(int64 i);

// ----------------------------------------------------------------------
// ParseDoubleRange()
//    Parse an expression in 'text' of the form: <double><sep><double>
//    where <double> may be a double-precision number and <sep> is a
//    single char or "..", and must be one of the chars in parameter
//    'separators', which may contain '-' or '.' (which means "..") or
//    any chars not allowed in a double. If allow_unbounded_markers,
//    <double> may also be a '?' to indicate unboundedness (if on the
//    left of <sep>, means unbounded below; if on the right, means
//    unbounded above). Depending on num_required_bounds, which may be
//    0, 1, or 2, <double> may also be the empty string, indicating
//    unboundedness. If require_separator is false, then a single
//    <double> is acceptable and is parsed as a range bounded from
//    below. We also check that the character following the range must
//    be in acceptable_terminators. If null_terminator_ok, then it is
//    also OK if the range ends in \0 or after len chars. If
//    allow_currency is true, the first <double> may be optionally
//    preceded by a '$', in which case *is_currency will be true, and
//    the second <double> may similarly be preceded by a '$'. In these
//    cases, the '$' will be ignored (otherwise it's an error). If
//    allow_comparators is true, the expression in 'text' may also be
//    of the form <comparator><double>, where <comparator> is '<' or
//    '>' or '<=' or '>='. separators and require_separator are
//    ignored in this format, but all other parameters function as for
//    the first format. Return true if the expression parsed
//    successfully; false otherwise. If successful, output params are:
//    'end', which points to the char just beyond the expression; 'from'
//    and 'to' are set to the values of the <double>s, and are
//    -inf and inf (or unchanged, depending on dont_modify_unbounded)
//    if unbounded. Output params are undefined if false is
//    returned. len is the input length, or -1 if text is
//    '\0'-terminated, which is more efficient.
// ----------------------------------------------------------------------
struct DoubleRangeOptions {
  const char* separators;
  bool require_separator;
  const char* acceptable_terminators;
  bool null_terminator_ok;
  bool allow_unbounded_markers;
  uint32 num_required_bounds;
  bool dont_modify_unbounded;
  bool allow_currency;
  bool allow_comparators;
};

// NOTE: The instruction below creates a Module titled
// NumbersFunctions within the auto-generated Doxygen documentation.
+// This instruction is needed to expose global functions that are not +// within a namespace. +// +bool ParseDoubleRange(const char* text, int len, const char** end, + double* from, double* to, bool* is_currency, + const DoubleRangeOptions& opts); + +// END DOXYGEN SplitFunctions grouping +/* @} */ + +// These functions are deprecated. +// Do not use in new code. + +// DEPRECATED(wadetregaskis). Just call StringPrintf or SimpleFtoa. +string FloatToString(float f, const char* format); + +// DEPRECATED(wadetregaskis). Just call StringPrintf or SimpleItoa. +string IntToString(int i, const char* format); + +// DEPRECATED(wadetregaskis). Just call StringPrintf or SimpleItoa. +string Int64ToString(int64 i64, const char* format); + +// DEPRECATED(wadetregaskis). Just call StringPrintf or SimpleItoa. +string UInt64ToString(uint64 ui64, const char* format); + +// DEPRECATED(wadetregaskis). Just call StringPrintf. +inline string FloatToString(float f) { + return StringPrintf("%7f", f); +} + +// DEPRECATED(wadetregaskis). Just call StringPrintf. +inline string IntToString(int i) { + return StringPrintf("%7d", i); +} + +// DEPRECATED(wadetregaskis). Just call StringPrintf. +inline string Int64ToString(int64 i64) { + return StringPrintf("%7" PRId64, i64); +} + +// DEPRECATED(wadetregaskis). Just call StringPrintf. +inline string UInt64ToString(uint64 ui64) { + return StringPrintf("%7" PRIu64, ui64); +} + +#endif // STRINGS_NUMBERS_H_ diff --git a/src/kudu/gutil/strings/serialize.cc b/src/kudu/gutil/strings/serialize.cc new file mode 100644 index 000000000000..5449ef7be981 --- /dev/null +++ b/src/kudu/gutil/strings/serialize.cc @@ -0,0 +1,328 @@ +// Copyright 2003, Google Inc. All rights reserved. 
+ +#include "kudu/gutil/strings/serialize.h" + +#include +#include +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_map; +#include +using std::string; +#include +using std::make_pair; +using std::pair; +#include +using std::vector; + +#include "kudu/gutil/casts.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strtoint.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/hash/hash.h" + +// Convert a uint32 to a 4-byte string. +string Uint32ToKey(uint32 u32) { + string key; + KeyFromUint32(u32, &key); + return key; +} + +string Uint64ToKey(uint64 fp) { + string key; + KeyFromUint64(fp, &key); + return key; +} + +// Convert a uint128 to a 16-byte string. +string Uint128ToKey(uint128 u128) { + string key; + KeyFromUint128(u128, &key); + return key; +} + +// Converts int32 to a 4-byte string key +// NOTE: Lexicographic ordering of the resulting strings does not in +// general correspond to any natural ordering of the corresponding +// integers. For non-negative inputs, lexicographic ordering of the +// resulting strings corresponds to increasing ordering of the +// integers. However, negative inputs are sorted *after* the non-negative +// inputs. To obtain keys such that lexicographic ordering corresponds +// to the natural total order on the integers, use OrderedStringFromInt32() +// or ReverseOrderedStringFromInt32() instead. +void KeyFromInt32(int32 i32, string* key) { + // TODO(user): Redefine using bit_cast<> and KeyFromUint32()? 
+ key->resize(sizeof(i32)); + for (int i = sizeof(i32) - 1; i >= 0; --i) { + (*key)[i] = i32 & 0xff; + i32 = (i32 >> 8); + } +} + +// Converts a 4-byte string key (typically generated by KeyFromInt32) +// into an int32 value +int32 KeyToInt32(const StringPiece& key) { + int32 i32 = 0; + CHECK_EQ(key.size(), sizeof(i32)); + for (size_t i = 0; i < sizeof(i32); ++i) { + i32 = (i32 << 8) | static_cast(key[i]); + } + return i32; +} + +// Converts a double value to an 8-byte string key, so that +// the string keys sort in the same order as the original double values. +void KeyFromDouble(double x, string* key) { + uint64 n = bit_cast(x); + // IEEE standard 754 floating point representation + // [sign-bit] [exponent] [mantissa] + // + // Let "a", "b" be two double values, and F(.) be following + // transformation. We have: + // If 0 < a < b: + // 0x80000000ULL < uint64(F(a)) < uint64(F(b)) + // If a == -0.0, b == +0.0: + // uint64(F(-0.0)) == uint64(F(+0.0)) = 0x80000000ULL + // If a < b < 0: + // uint64(F(a)) < uint64(F(b)) < 0x80000000ULL + const uint64 sign_bit = GG_ULONGLONG(1) << 63; + if ((n & sign_bit) == 0) { + n += sign_bit; + } else { + n = -n; + } + KeyFromUint64(n, key); +} + +// Version of KeyFromDouble that returns the key. +string DoubleToKey(double x) { + string key; + KeyFromDouble(x, &key); + return key; +} + +// Converts key generated by KeyFromDouble() back to double. +double KeyToDouble(const StringPiece& key) { + int64 n = KeyToUint64(key); + if (n & (GG_ULONGLONG(1) << 63)) + n -= (GG_ULONGLONG(1) << 63); + else + n = -n; + return bit_cast(n); +} + +// Converts int32 to a 4-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in increasing order by +// integer values. 
This can be useful when constructing secondary +void OrderedStringFromInt32(int32 i32, string* key) { + uint32 ui32 = static_cast(i32) ^ 0x80000000; + key->resize(sizeof ui32); + for ( int i = (sizeof ui32) - 1; i >= 0; --i ) { + (*key)[i] = ui32 & 0xff; + ui32 = (ui32 >> 8); + } +} + +string Int32ToOrderedString(int32 i32) { + string key; + OrderedStringFromInt32(i32, &key); + return key; +} + +// The inverse of the above function. +int32 OrderedStringToInt32(const StringPiece& key) { + uint32 ui32 = 0; + CHECK(key.size() == sizeof ui32); + for ( int i = 0; i < sizeof ui32; ++i ) { + ui32 = (ui32 << 8); + ui32 = ui32 | static_cast(key[i]); + } + return static_cast(ui32 ^ 0x80000000); +} + +// Converts int64 to a 8-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in increasing order by +// integer values. +void OrderedStringFromInt64(int64 i64, string* key) { + uint64 ui64 = static_cast(i64) ^ (GG_ULONGLONG(1) << 63); + key->resize(sizeof ui64); + for ( int i = (sizeof ui64) - 1; i >= 0; --i ) { + (*key)[i] = ui64 & 0xff; + ui64 = (ui64 >> 8); + } +} + +string Int64ToOrderedString(int64 i64) { + string key; + OrderedStringFromInt64(i64, &key); + return key; +} + +// The inverse of the above function. +int64 OrderedStringToInt64(const StringPiece& key) { + uint64 ui64 = 0; + CHECK(key.size() == sizeof ui64); + for ( int i = 0; i < sizeof ui64; ++i ) { + ui64 = (ui64 << 8); + ui64 = ui64 | static_cast(key[i]); + } + return static_cast(ui64 ^ (GG_ULONGLONG(1) << 63)); +} + +// Converts int32 to a 4-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in decreasing order +// by integer values. This can be useful when constructing secondary +void ReverseOrderedStringFromInt32(int32 i32, string* key) { + // ~ is like -, but works even for INT_MIN. (-INT_MIN == INT_MIN, + // but ~x = -x - 1, so ~INT_MIN = -INT_MIN - 1 = INT_MIN - 1 = INT_MAX). 
+ OrderedStringFromInt32(~i32, key); +} + +string Int32ToReverseOrderedString(int32 i32) { + string key; + ReverseOrderedStringFromInt32(i32, &key); + return key; +} + +// The inverse of the above function. +int32 ReverseOrderedStringToInt32(const StringPiece& key) { + return ~OrderedStringToInt32(key); +} + +// Converts int64 to an 8-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in decreasing order +// by integer values. This can be useful when constructing secondary +void ReverseOrderedStringFromInt64(int64 i64, string* key) { + return OrderedStringFromInt64(~i64, key); +} + +string Int64ToReverseOrderedString(int64 i64) { + string key; + ReverseOrderedStringFromInt64(i64, &key); + return key; +} + +// The inverse of the above function. +int64 ReverseOrderedStringToInt64(const StringPiece& key) { + return ~OrderedStringToInt64(key); +} + +// -------------------------------------------------------------------------- +// DictionaryInt32Encode +// DictionaryInt64Encode +// DictionaryDoubleEncode +// DictionaryInt32Decode +// DictionaryInt64Decode +// DictionaryDoubleDecode +// Routines to serialize/unserialize simple dictionaries +// (string->T hashmaps). We use ':' to separate keys and values, +// and commas to separate entries. 
+// -------------------------------------------------------------------------- + +string DictionaryInt32Encode(const hash_map* dictionary) { + vector entries; + for (const auto& entry : *dictionary) { + entries.push_back(StringPrintf("%s:%d", entry.first.c_str(), entry.second)); + } + + string result; + JoinStrings(entries, ",", &result); + return result; +} + +string DictionaryInt64Encode(const hash_map* dictionary) { + vector entries; + for (const auto& entry : *dictionary) { + entries.push_back(StringPrintf("%s:%" PRId64, + entry.first.c_str(), entry.second)); + } + + string result; + JoinStrings(entries, ",", &result); + return result; +} + +string DictionaryDoubleEncode(const hash_map* dictionary) { + vector entries; + for (const auto& entry : *dictionary) { + entries.push_back(StringPrintf("%s:%g", entry.first.c_str(), entry.second)); + } + + string result; + JoinStrings(entries, ",", &result); + return result; +} + +bool DictionaryParse(const string& encoded_str, + vector >* items) { + vector entries; + SplitStringUsing(encoded_str, ",", &entries); + for (const auto& entry : entries) { + vector fields; + SplitStringAllowEmpty(entry, ":", &fields); + if (fields.size() != 2) // parsing error + return false; + items->push_back(make_pair(fields[0], fields[1])); + } + return true; +} + +bool DictionaryInt32Decode(hash_map* dictionary, + const string& encoded_str) { + vector > items; + if (!DictionaryParse(encoded_str, &items)) + return false; + + dictionary->clear(); + for (const auto& item : items) { + char *error = nullptr; + const int32 value = strto32(item.second.c_str(), &error, 0); + if (error == item.second.c_str() || *error != '\0') { + // parsing error + return false; + } + (*dictionary)[item.first] = value; + } + return true; +} + +bool DictionaryInt64Decode(hash_map* dictionary, + const string& encoded_str) { + vector > items; + if (!DictionaryParse(encoded_str, &items)) + return false; + + dictionary->clear(); + for (const auto& item : items) { + char 
*error = nullptr; + const int64 value = strto64(item.second.c_str(), &error, 0); + if (error == item.second.c_str() || *error != '\0') { + // parsing error + return false; + } + (*dictionary)[item.first] = value; + } + return true; +} + + +bool DictionaryDoubleDecode(hash_map* dictionary, + const string& encoded_str) { + vector > items; + if (!DictionaryParse(encoded_str, &items)) + return false; + + dictionary->clear(); + for (const auto& item : items) { + char *error = nullptr; + const double value = strtod(item.second.c_str(), &error); + if (error == item.second.c_str() || *error != '\0') { + // parsing error + return false; + } + (*dictionary)[item.first] = value; + } + return true; +} diff --git a/src/kudu/gutil/strings/serialize.h b/src/kudu/gutil/strings/serialize.h new file mode 100644 index 000000000000..7966cd2a8823 --- /dev/null +++ b/src/kudu/gutil/strings/serialize.h @@ -0,0 +1,343 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Refactored from contributions of various authors in strings/strutil.h +// +// This file contains conversion functions from various data types to +// strings and back. + +#ifndef STRINGS_SERIALIZE_H_ +#define STRINGS_SERIALIZE_H_ + +#include +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_map; +#include +using std::string; +#include +using std::make_pair; +using std::pair; +#include +using std::vector; + +#include + +#include "kudu/gutil/int128.h" +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/type_traits.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/endian.h" +#include "kudu/gutil/stl_util.h" + +// Converts a 4-byte uint32 to a string such that the string keys sort in +// the same order as the original uint32 value. +// TODO(user): Rework all reinterpret_casts<> in this file. 
+inline void KeyFromUint32(uint32 u32, string* key) { + uint32 norder = ghtonl(u32); + key->assign(reinterpret_cast(&norder), sizeof(norder)); +} + +// Converts "fp" to an 8-byte string key +inline void KeyFromUint64(uint64 fp, string* key) { + uint64 norder = htonll(fp); + key->assign(reinterpret_cast(&norder), sizeof(norder)); +} + +// Converts a 16-byte uint128 to a string such that the string keys sort in +// the same order as the original uint128 value. +inline void KeyFromUint128(uint128 fp, string* key) { + uint64 norder[] = { htonll(Uint128High64(fp)), + htonll(Uint128Low64(fp)) + }; + key->assign(reinterpret_cast(norder), 2 * sizeof(norder[0])); +} + +// This version of KeyFromUint32 is less efficient but very convenient +string Uint32ToKey(uint32 u32); + +// This version of KeyFromUint64 is less efficient but very convenient +string Uint64ToKey(uint64 fp); + +// This version of KeyFromUint128 is less efficient but very convenient +string Uint128ToKey(uint128 u128); + +// Converts a 4-byte string key (typically generated by KeyFromUint32 or +// Uint32ToKey) into a uint32 value. 
+inline uint32 KeyToUint32(const StringPiece& key) { + uint32 value; + DCHECK_EQ(key.size(), sizeof(value)); + memcpy(&value, key.data(), sizeof(value)); + return gntohl(value); +} + +// Converts an 8-byte string key (typically generated by Uint64ToKey or +// KeyFromUint64) into a uint64 value +inline uint64 KeyToUint64(const StringPiece& key) { + uint64 value; + DCHECK_EQ(key.size(), sizeof(value)); + memcpy(&value, key.data(), sizeof(value)); + return ntohll(value); +} + +// Converts a 16-byte string key (typically generated by Uint128ToKey or +// KeyFromUint128) into a uint128 value +inline uint128 KeyToUint128(const StringPiece& key) { + uint64 v0, v1; + DCHECK_EQ(key.size(), sizeof(v0) + sizeof(v1)); + memcpy(&v0, key.data(), sizeof(v0)); + memcpy(&v1, key.data() + sizeof(v0), sizeof(v1)); + return uint128(ntohll(v0), ntohll(v1)); +} + +// Converts "i32" to a 4-byte string key +// NOTE: Lexicographic ordering of the resulting strings does not in +// general correspond to any natural ordering of the corresponding +// integers. For non-negative inputs, lexicographic ordering of the +// resulting strings corresponds to increasing ordering of the +// integers. However, negative inputs are sorted *after* the non-negative +// inputs. To obtain keys such that lexicographic ordering corresponds +// to the natural total order on the integers, use OrderedStringFromInt32() +// or ReverseOrderedStringFromInt32() instead. +void KeyFromInt32(int32 i32, string* key); + +// Convenient form of KeyFromInt32. +inline string Int32ToKey(int32 i32) { + string s; + KeyFromInt32(i32, &s); + return s; +} + +// Converts a 4-byte string key (typically generated by KeyFromInt32) +// into an int32 value +int32 KeyToInt32(const StringPiece& key); + +// Converts a double value to an 8-byte string key, so that +// the string keys sort in the same order as the original double values. +void KeyFromDouble(double x, string* key); + +// Converts key generated by KeyFromDouble() back to double. 
+double KeyToDouble(const StringPiece& key); + +// This version of KeyFromDouble is less efficient but very convenient +string DoubleToKey(double x); + +// Converts int32 to a 4-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in increasing order by +// integer values. This can be useful when constructing secondary +void OrderedStringFromInt32(int32 i32, string* key); + +// This version of OrderedStringFromInt32 is less efficient but very convenient +string Int32ToOrderedString(int32 i32); + +// The inverse of the above function. +int32 OrderedStringToInt32(const StringPiece& key); + +// Converts int64 to an 8-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in increasing order by +// integer values. +void OrderedStringFromInt64(int64 i64, string* key); + +// This version of OrderedStringFromInt64 is less efficient but very convenient +string Int64ToOrderedString(int64 i64); + +// The inverse of the above function. +int64 OrderedStringToInt64(const StringPiece& key); + +// Converts int32 to a 4-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in decreasing order +// by integer values. This can be useful when constructing secondary +void ReverseOrderedStringFromInt32(int32 i32, string* key); + +// This version of ReverseOrderedStringFromInt32 is less efficient but very +string Int32ToReverseOrderedString(int32 i32); + +// The inverse of the above function. +int32 ReverseOrderedStringToInt32(const StringPiece& key); + +// Converts int64 to an 8-byte string key such that lexicographic +// ordering of strings is equivalent to sorting in decreasing order +// by integer values. This can be useful when constructing secondary +void ReverseOrderedStringFromInt64(int64 i64, string* key); + +// This version of ReverseOrderedStringFromInt64 is less efficient but very +string Int64ToReverseOrderedString(int64 i64); + +// The inverse of the above function. 
+int64 ReverseOrderedStringToInt64(const StringPiece& key); + +// Stores the bytes of a plain old data type value in a C++ string. +// Verifies the given data type is a POD and copies the bytes of the +// value into a newly created string. +// +// Can replace the use of Encode*, and avoid the use of castings, +// or adding additional functions for each type. +// For example, use: +// int32 i = 100; +// string s = EncodePOD(i); +// in place of: +// string s = EncodeUint32(static_cast(i)); +template inline string EncodePOD(const T& value) { + ENFORCE_POD(T); + string s; + STLStringResizeUninitialized(&s, sizeof(T)); + memcpy(string_as_array(&s), &value, sizeof(T)); + return s; +} + +// Retrieves the bytes of a plain old data type value from a StringPiece. +// Verifies the given data type is a POD and copies the bytes of the +// value from the given string. +// Returns true if the operation succeeded. +// Note that other than the data length, no check is (or can be) +// done on the type of data stored in the string. +// +// Can replace the use of Decode*, and avoid the use of castings, +// or adding additional functions for each type. +// For example, use: +// int32 i = 100; +// int32 j; +// string s = EncodePOD(i); +// CHECK(DecodePOD(s, &j)); +// in place of: +// string s = EncodeUint32(static_cast(i)); +// CHECK(DecodesUint32(s, static_cast(&j))); +template inline bool DecodePOD(const StringPiece& str, T* result) { + ENFORCE_POD(T); + CHECK(result != NULL); + if (sizeof(*result) != str.size()) { + return false; + } + memcpy(result, str.data(), sizeof(T)); + return true; +} + +// Stores the value bytes of a vector of plain old data type in a C++ string. +// Verifies the given data type is a POD and copies the bytes of each value +// in the vector into a newly created string. 
+template inline string EncodeVectorPOD(const vector& vec) { + ENFORCE_POD(T); + string s; + STLStringResizeUninitialized(&s, vec.size() * sizeof(T)); + typename vector::const_iterator iter; + char* ptr; + for (iter = vec.begin(), ptr = string_as_array(&s); + iter != vec.end(); + ++iter, ptr += sizeof(T)) { + memcpy(ptr, &(*iter), sizeof(T)); + } + return s; +} + +// Reconstructs a vector of a plain old data type values from a C++ string. +// Verifies the given data type is a POD and copies the bytes of each value +// from the given string to the given vector. +// Returns true if the operation succeeded. +// Note that other than the data length, no check is (or can be) +// done on the type of data stored in the string. +template inline bool DecodeVectorPOD(const string& str, + vector* result) { + ENFORCE_POD(T); + CHECK(result != NULL); + if (str.size() % sizeof(T) != 0) + return false; + result->clear(); + result->reserve(str.size() / sizeof(T)); + T value; + const char* begin = str.data(); + const char* end = str.data() + str.size(); + for (const char* ptr = begin; ptr != end; ptr += sizeof(T)) { + memcpy(&value, ptr, sizeof(T)); + result->push_back(value); + } + return true; +} + +// ---------------------------------------------------------------------- +// EncodeDouble() +// EncodeFloat() +// EncodeUint32() +// EncodeUint64() +// DecodeDouble() +// DecodeFloat() +// DecodeUint32() +// DecodeUint64() +// The Encode* functions store the bytes of ints, floats or doubles into the +// data bytes of a C++ string. The Decode* functions perform the reverse +// operations, but operate on a StringPiece rather than directly on a C++ +// string. They return true iff s contained the right number of bytes. +// +// These may be preferred to naked calls to EncodePOD/DecodePOD since +// they make the payload type explicit. +// Note that these encodings are NOT endian-neutral. 
+// ---------------------------------------------------------------------- +inline string EncodeDouble(double d) { + return EncodePOD(d); +} + +inline string EncodeFloat(float f) { + return EncodePOD(f); +} + +inline string EncodeUint32(uint32 i) { + return EncodePOD(i); +} + +inline string EncodeUint64(uint64 i) { + return EncodePOD(i); +} + +inline bool DecodeDouble(const StringPiece& s, double* d) { + return DecodePOD(s, d); +} + +inline bool DecodeFloat(const StringPiece& s, float* f) { + return DecodePOD(s, f); +} + +inline bool DecodeUint32(const StringPiece& s, uint32* i) { + return DecodePOD(s, i); +} + +inline bool DecodeUint64(const StringPiece& s, uint64* i) { + return DecodePOD(s, i); +} + +// ------------------------------------------------------------------------- +// DictionaryParse +// This routine parses a common dictionary format (key and value separated +// by ':', entries separated by commas). This format is used for many +// complex commandline flags. It is also used to encode dictionaries for +// exporting them or writing them to a checkpoint. Returns a vector of +// pairs. Returns true if there if no error in parsing, false +// otherwise. +// ------------------------------------------------------------------------- +bool DictionaryParse(const string& encoded_str, + vector >* items); + +// -------------------------------------------------------------------------- +// DictionaryInt32Encode +// DictionaryInt64Encode +// DictionaryDoubleEncode +// DictionaryInt32Decode +// DictionaryInt64Decode +// DictionaryDoubleDecode +// Routines to serialize/unserialize simple dictionaries +// (string->T hashmaps). These are useful for exporting, checkpointing etc +// *Decode routines clear the input dictionary. They return true if there +// was no error in decoding, false otherwise. +// Note: these routines are not meant for use with very large dictionaries. +// They are written for convenience and not efficiency. 
+// -------------------------------------------------------------------------- +string DictionaryInt32Encode(const hash_map* dictionary); +string DictionaryInt64Encode(const hash_map* dictionary); +string DictionaryDoubleEncode(const hash_map* dictionary); + +bool DictionaryInt32Decode(hash_map* dictionary, + const string& encoded_str); +bool DictionaryInt64Decode(hash_map* dictionary, + const string& encoded_str); +bool DictionaryDoubleDecode(hash_map* dictionary, + const string& encoded_str); + + +#endif // STRINGS_SERIALIZE_H_ diff --git a/src/kudu/gutil/strings/split.cc b/src/kudu/gutil/strings/split.cc new file mode 100644 index 000000000000..a42faa7d7a27 --- /dev/null +++ b/src/kudu/gutil/strings/split.cc @@ -0,0 +1,1088 @@ +// Copyright 2008 and onwards Google Inc. All rights reserved. +// +// Maintainer: Greg Miller + +#include "kudu/gutil/strings/split.h" + +#include +#include +#include +#include +using std::back_insert_iterator; +using std::iterator_traits; +#include +using std::numeric_limits; + +#include "kudu/gutil/integral_types.h" +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strtoint.h" +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/hash/hash.h" + +// Implementations for some of the Split2 API. Much of the Split2 API is +// templated so it exists in header files, either strings/split.h or +// strings/split_iternal.h. +namespace strings { +namespace delimiter { + +namespace { + +// This GenericFind() template function encapsulates the finding algorithm +// shared between the Literal and AnyOf delimiters. The FindPolicy template +// parameter allows each delimiter to customize the actual find function to use +// and the length of the found delimiter. For example, the Literal delimiter +// will ultimately use StringPiece::find(), and the AnyOf delimiter will use +// StringPiece::find_first_of(). 
+template +StringPiece GenericFind( + StringPiece text, + StringPiece delimiter, + FindPolicy find_policy) { + if (delimiter.empty() && text.length() > 0) { + // Special case for empty string delimiters: always return a zero-length + // StringPiece referring to the item at position 1. + return StringPiece(text.begin() + 1, 0); + } + int found_pos = StringPiece::npos; + StringPiece found(text.end(), 0); // By default, not found + found_pos = find_policy.Find(text, delimiter); + if (found_pos != StringPiece::npos) { + found.set(text.data() + found_pos, find_policy.Length(delimiter)); + } + return found; +} + +// Finds using StringPiece::find(), therefore the length of the found delimiter +// is delimiter.length(). +struct LiteralPolicy { + int Find(StringPiece text, StringPiece delimiter) { + return text.find(delimiter); + } + int Length(StringPiece delimiter) { + return delimiter.length(); + } +}; + +// Finds using StringPiece::find_first_of(), therefore the length of the found +// delimiter is 1. +struct AnyOfPolicy { + size_t Find(StringPiece text, StringPiece delimiter) { + return text.find_first_of(delimiter); + } + int Length(StringPiece delimiter) { + return 1; + } +}; + +} // namespace + +// +// Literal +// + +Literal::Literal(StringPiece sp) : delimiter_(sp.ToString()) { +} + +StringPiece Literal::Find(StringPiece text) const { + return GenericFind(text, delimiter_, LiteralPolicy()); +} + +// +// AnyOf +// + +AnyOf::AnyOf(StringPiece sp) : delimiters_(sp.ToString()) { +} + +StringPiece AnyOf::Find(StringPiece text) const { + return GenericFind(text, delimiters_, AnyOfPolicy()); +} + +} // namespace delimiter +} // namespace strings + +// +// ==================== LEGACY SPLIT FUNCTIONS ==================== +// + +using ::strings::SkipEmpty; +using ::strings::delimiter::AnyOf; +using ::strings::delimiter::Limit; + +namespace { + +// Appends the results of a split to the specified container. 
This function has +// the following overloads: +// - vector - for better performance +// - map - to change append semantics +// - hash_map - to change append semantics +template +void AppendToImpl(Container* container, Splitter splitter) { + Container c = splitter; // Calls implicit conversion operator. + std::copy(c.begin(), c.end(), std::inserter(*container, container->end())); +} + +// Overload of AppendToImpl() that is optimized for appending to vector. +// This version eliminates a couple string copies by using a vector +// as the intermediate container. +template +void AppendToImpl(vector* container, Splitter splitter) { + vector vsp = splitter; // Calls implicit conversion operator. + size_t container_size = container->size(); + container->resize(container_size + vsp.size()); + for (const auto& sp : vsp) { + sp.CopyToString(&(*container)[container_size++]); + } +} + +// Here we define two AppendToImpl() overloads for map<> and hash_map<>. Both of +// these overloads call through to this AppendToMap() function. This is needed +// because inserting a duplicate key into a map does NOT overwrite the previous +// value, which was not the behavior of the split1 Split*() functions. Consider +// this example: +// +// map m; +// m.insert(std::make_pair("a", "1")); +// m.insert(std::make_pair("a", "2")); // <-- doesn't actually insert. +// ASSERT_EQ(m["a"], "1"); // <-- "a" has value "1" not "2". +// +// Due to this behavior of map::insert, we can't rely on a normal std::inserter +// for a maps. Instead, maps and hash_maps need to be special cased to implement +// the desired append semantic of inserting an existing value overwrites the +// previous value. +// +// This same issue is true with sets as well. However, since sets don't have a +// separate key and value, failing to overwrite an existing value in a set is +// fine because the value already exists in the set. 
+// +template +void AppendToMap(Map* m, Splitter splitter) { + Map tmp = splitter; // Calls implicit conversion operator. + for (typename Map::const_iterator it = tmp.begin(); it != tmp.end(); ++it) { + (*m)[it->first] = it->second; + } +} + +template +void AppendToImpl(map* map_container, Splitter splitter) { + AppendToMap(map_container, splitter); +} + +template +void AppendToImpl(hash_map* map_container, Splitter splitter) { + AppendToMap(map_container, splitter); +} + +// Appends the results of a call to strings::Split() to the specified container. +// This function is used with the new strings::Split() API to implement the +// append semantics of the legacy Split*() functions. +// +// The "Splitter" template parameter is intended to be a +// ::strings::internal::Splitter<>, which is the return value of a call to +// strings::Split(). Sample usage: +// +// vector v; +// ... add stuff to "v" ... +// AppendTo(&v, strings::Split("a,b,c", ",")); +// +template +void AppendTo(Container* container, Splitter splitter) { + if (container->empty()) { + // "Appending" to an empty container is by far the common case. For this we + // assign directly to the output container, which is more efficient than + // explicitly appending. + *container = splitter; // Calls implicit conversion operator. + } else { + AppendToImpl(container, splitter); + } +} + +} // anonymous namespace + +// Constants for ClipString() +static const int kMaxOverCut = 12; +// The ellipsis to add to strings that are too long +static const char kCutStr[] = "..."; +static const int kCutStrSize = sizeof(kCutStr) - 1; + +// ---------------------------------------------------------------------- +// Return the place to clip the string at, or -1 +// if the string doesn't need to be clipped. 
+// ---------------------------------------------------------------------- +static int ClipStringHelper(const char* str, int max_len, bool use_ellipsis) { + if (strlen(str) <= max_len) + return -1; + + int max_substr_len = max_len; + + if (use_ellipsis && max_len > kCutStrSize) { + max_substr_len -= kCutStrSize; + } + + const char* cut_by = + (max_substr_len < kMaxOverCut ? str : str + max_len - kMaxOverCut); + const char* cut_at = str + max_substr_len; + while (!ascii_isspace(*cut_at) && cut_at > cut_by) + cut_at--; + + if (cut_at == cut_by) { + // No space was found + return max_substr_len; + } else { + return cut_at-str; + } +} + +// ---------------------------------------------------------------------- +// ClipString +// Clip a string to a max length. We try to clip on a word boundary +// if this is possible. If the string is clipped, we append an +// ellipsis. +// ---------------------------------------------------------------------- + +void ClipString(char* str, int max_len) { + int cut_at = ClipStringHelper(str, max_len, true); + if (cut_at != -1) { + if (max_len > kCutStrSize) { + strcpy(str+cut_at, kCutStr); + } else { + strcpy(str+cut_at, ""); + } + } +} + +// ---------------------------------------------------------------------- +// ClipString +// Version of ClipString() that uses string instead of char*. +// ---------------------------------------------------------------------- +void ClipString(string* full_str, int max_len) { + int cut_at = ClipStringHelper(full_str->c_str(), max_len, true); + if (cut_at != -1) { + full_str->erase(cut_at); + if (max_len > kCutStrSize) { + full_str->append(kCutStr); + } + } +} + +// ---------------------------------------------------------------------- +// SplitStringToIteratorAllowEmpty() +// Split a string using a character delimiter. Append the components +// to 'result'. If there are consecutive delimiters, this function +// will return corresponding empty strings. 
The string is split into +// at most the specified number of pieces greedily. This means that the +// last piece may possibly be split further. To split into as many pieces +// as possible, specify 0 as the number of pieces. +// +// If "full" is the empty string, yields an empty string as the only value. +// +// If "pieces" is negative for some reason, it returns the whole string +// ---------------------------------------------------------------------- +template +static inline +void SplitStringToIteratorAllowEmpty(const StringType& full, + const char* delim, + int pieces, + ITR& result) { + string::size_type begin_index, end_index; + begin_index = 0; + + for (int i = 0; (i < pieces-1) || (pieces == 0); i++) { + end_index = full.find_first_of(delim, begin_index); + if (end_index == string::npos) { + *result++ = full.substr(begin_index); + return; + } + *result++ = full.substr(begin_index, (end_index - begin_index)); + begin_index = end_index + 1; + } + *result++ = full.substr(begin_index); +} + +void SplitStringIntoNPiecesAllowEmpty(const string& full, + const char* delim, + int pieces, + vector* result) { + if (pieces == 0) { + // No limit when pieces is 0. + AppendTo(result, strings::Split(full, AnyOf(delim))); + } else { + // The input argument "pieces" specifies the max size that *result should + // be. However, the argument to the Limit() delimiter is the max number of + // delimiters, which should be one less than "pieces". Example: "a,b,c" has + // 3 pieces and two comma delimiters. + int limit = std::max(pieces - 1, 0); + AppendTo(result, strings::Split(full, Limit(AnyOf(delim), limit))); + } +} + +// ---------------------------------------------------------------------- +// SplitStringAllowEmpty +// Split a string using a character delimiter. Append the components +// to 'result'. If there are consecutive delimiters, this function +// will return corresponding empty strings. 
+// ---------------------------------------------------------------------- +void SplitStringAllowEmpty(const string& full, const char* delim, + vector* result) { + AppendTo(result, strings::Split(full, AnyOf(delim))); +} + +// If we know how much to allocate for a vector of strings, we can +// allocate the vector only once and directly to the right size. +// This saves in between 33-66 % of memory space needed for the result, +// and runs faster in the microbenchmarks. +// +// The reserve is only implemented for the single character delim. +// +// The implementation for counting is cut-and-pasted from +// SplitStringToIteratorUsing. I could have written my own counting iterator, +// and use the existing template function, but probably this is more clear +// and more sure to get optimized to reasonable code. +static int CalculateReserveForVector(const string& full, const char* delim) { + int count = 0; + if (delim[0] != '\0' && delim[1] == '\0') { + // Optimize the common case where delim is a single character. + char c = delim[0]; + const char* p = full.data(); + const char* end = p + full.size(); + while (p != end) { + if (*p == c) { // This could be optimized with hasless(v,1) trick. + ++p; + } else { + while (++p != end && *p != c) { + // Skip to the next occurence of the delimiter. + } + ++count; + } + } + } + return count; +} + +// ---------------------------------------------------------------------- +// SplitStringUsing() +// SplitStringToHashsetUsing() +// SplitStringToSetUsing() +// SplitStringToMapUsing() +// SplitStringToHashmapUsing() +// Split a string using a character delimiter. Append the components +// to 'result'. +// +// Note: For multi-character delimiters, this routine will split on *ANY* of +// the characters in the string, not the entire string as a single delimiter. 
+// ---------------------------------------------------------------------- +template +static inline +void SplitStringToIteratorUsing(const StringType& full, + const char* delim, + ITR& result) { + // Optimize the common case where delim is a single character. + if (delim[0] != '\0' && delim[1] == '\0') { + char c = delim[0]; + const char* p = full.data(); + const char* end = p + full.size(); + while (p != end) { + if (*p == c) { + ++p; + } else { + const char* start = p; + while (++p != end && *p != c) { + // Skip to the next occurence of the delimiter. + } + *result++ = StringType(start, p - start); + } + } + return; + } + + string::size_type begin_index, end_index; + begin_index = full.find_first_not_of(delim); + while (begin_index != string::npos) { + end_index = full.find_first_of(delim, begin_index); + if (end_index == string::npos) { + *result++ = full.substr(begin_index); + return; + } + *result++ = full.substr(begin_index, (end_index - begin_index)); + begin_index = full.find_first_not_of(delim, end_index); + } +} + +void SplitStringUsing(const string& full, + const char* delim, + vector* result) { + result->reserve(result->size() + CalculateReserveForVector(full, delim)); + std::back_insert_iterator< vector > it(*result); + SplitStringToIteratorUsing(full, delim, it); +} + +void SplitStringToHashsetUsing(const string& full, const char* delim, + hash_set* result) { + AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); +} + +void SplitStringToSetUsing(const string& full, const char* delim, + set* result) { + AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); +} + +void SplitStringToMapUsing(const string& full, const char* delim, + map* result) { + AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); +} + +void SplitStringToHashmapUsing(const string& full, const char* delim, + hash_map* result) { + AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); +} + +// 
---------------------------------------------------------------------- +// SplitStringPieceToVector() +// Split a StringPiece into sub-StringPieces based on delim +// and appends the pieces to 'vec'. +// If omit empty strings is true, empty strings are omitted +// from the resulting vector. +// ---------------------------------------------------------------------- +void SplitStringPieceToVector(const StringPiece& full, + const char* delim, + vector* vec, + bool omit_empty_strings) { + if (omit_empty_strings) { + AppendTo(vec, strings::Split(full, AnyOf(delim), SkipEmpty())); + } else { + AppendTo(vec, strings::Split(full, AnyOf(delim))); + } +} + +// ---------------------------------------------------------------------- +// SplitUsing() +// Split a string using a string of delimiters, returning vector +// of strings. The original string is modified to insert nulls. +// ---------------------------------------------------------------------- + +vector* SplitUsing(char* full, const char* delim) { + auto vec = new vector; + SplitToVector(full, delim, vec, true); // Omit empty strings + return vec; +} + +void SplitToVector(char* full, const char* delim, vector* vec, + bool omit_empty_strings) { + char* next = full; + while ((next = gstrsep(&full, delim)) != nullptr) { + if (omit_empty_strings && next[0] == '\0') continue; + vec->push_back(next); + } + // Add last element (or full string if no delimeter found): + if (full != nullptr) { + vec->push_back(full); + } +} + +void SplitToVector(char* full, const char* delim, vector* vec, + bool omit_empty_strings) { + char* next = full; + while ((next = gstrsep(&full, delim)) != nullptr) { + if (omit_empty_strings && next[0] == '\0') continue; + vec->push_back(next); + } + // Add last element (or full string if no delimeter found): + if (full != nullptr) { + vec->push_back(full); + } +} + +// ---------------------------------------------------------------------- +// SplitOneStringToken() +// Mainly a stringified wrapper around 
strpbrk() +// ---------------------------------------------------------------------- +string SplitOneStringToken(const char ** source, const char * delim) { + assert(source); + assert(delim); + if (!*source) { + return string(); + } + const char * begin = *source; + // Optimize the common case where delim is a single character. + if (delim[0] != '\0' && delim[1] == '\0') { + *source = strchr(*source, delim[0]); + } else { + *source = strpbrk(*source, delim); + } + if (*source) { + return string(begin, (*source)++); + } else { + return string(begin); + } +} + +// ---------------------------------------------------------------------- +// SplitStringWithEscaping() +// SplitStringWithEscapingAllowEmpty() +// SplitStringWithEscapingToSet() +// SplitStringWithWithEscapingToHashset() +// Split the string using the specified delimiters, taking escaping into +// account. '\' is not allowed as a delimiter. +// ---------------------------------------------------------------------- +template +static inline +void SplitStringWithEscapingToIterator(const string& src, + const strings::CharSet& delimiters, + const bool allow_empty, + ITR* result) { + CHECK(!delimiters.Test('\\')) << "\\ is not allowed as a delimiter."; + CHECK(result); + string part; + + for (uint32 i = 0; i < src.size(); ++i) { + char current_char = src[i]; + if (delimiters.Test(current_char)) { + // Push substrings when we encounter delimiters. + if (allow_empty || !part.empty()) { + *(*result)++ = part; + part.clear(); + } + } else if (current_char == '\\' && ++i < src.size()) { + // If we see a backslash, the next delimiter or backslash is literal. + current_char = src[i]; + if (current_char != '\\' && !delimiters.Test(current_char)) { + // Don't honour unknown escape sequences: emit \f for \f. + part.push_back('\\'); + } + part.push_back(current_char); + } else { + // Otherwise, we have a normal character or trailing backslash. + part.push_back(current_char); + } + } + + // Push the trailing part. 
+ if (allow_empty || !part.empty()) { + *(*result)++ = part; + } +} + +void SplitStringWithEscaping(const string &full, + const strings::CharSet& delimiters, + vector *result) { + std::back_insert_iterator< vector > it(*result); + SplitStringWithEscapingToIterator(full, delimiters, false, &it); +} + +void SplitStringWithEscapingAllowEmpty(const string &full, + const strings::CharSet& delimiters, + vector *result) { + std::back_insert_iterator< vector > it(*result); + SplitStringWithEscapingToIterator(full, delimiters, true, &it); +} + +void SplitStringWithEscapingToSet(const string &full, + const strings::CharSet& delimiters, + set *result) { + std::insert_iterator< set > it(*result, result->end()); + SplitStringWithEscapingToIterator(full, delimiters, false, &it); +} + +void SplitStringWithEscapingToHashset(const string &full, + const strings::CharSet& delimiters, + hash_set *result) { + std::insert_iterator< hash_set > it(*result, result->end()); + SplitStringWithEscapingToIterator(full, delimiters, false, &it); +} + + +// ---------------------------------------------------------------------- +// SplitOneIntToken() +// SplitOneInt32Token() +// SplitOneUint32Token() +// SplitOneInt64Token() +// SplitOneUint64Token() +// SplitOneDoubleToken() +// SplitOneFloatToken() +// SplitOneDecimalIntToken() +// SplitOneDecimalInt32Token() +// SplitOneDecimalUint32Token() +// SplitOneDecimalInt64Token() +// SplitOneDecimalUint64Token() +// SplitOneHexUint32Token() +// SplitOneHexUint64Token() +// Mainly a stringified wrapper around strtol/strtoul/strtod +// ---------------------------------------------------------------------- +// Curried functions for the macro below +static inline long strto32_0(const char * source, char ** end) { + return strto32(source, end, 0); } +static inline unsigned long strtou32_0(const char * source, char ** end) { + return strtou32(source, end, 0); } +static inline int64 strto64_0(const char * source, char ** end) { + return strto64(source, end, 
0); } +static inline uint64 strtou64_0(const char * source, char ** end) { + return strtou64(source, end, 0); } +static inline long strto32_10(const char * source, char ** end) { + return strto32(source, end, 10); } +static inline unsigned long strtou32_10(const char * source, char ** end) { + return strtou32(source, end, 10); } +static inline int64 strto64_10(const char * source, char ** end) { + return strto64(source, end, 10); } +static inline uint64 strtou64_10(const char * source, char ** end) { + return strtou64(source, end, 10); } +static inline uint32 strtou32_16(const char * source, char ** end) { + return strtou32(source, end, 16); } +static inline uint64 strtou64_16(const char * source, char ** end) { + return strtou64(source, end, 16); } + +#define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \ +bool SplitOne##name##Token(const char ** source, const char * delim, \ + type * value) { \ + assert(source); \ + assert(delim); \ + assert(value); \ + if (!*source) \ + return false; \ + /* Parse int */ \ + char * end; \ + *value = function(*source, &end); \ + if (end == *source) \ + return false; /* number not present at start of string */ \ + if (end[0] && !strchr(delim, end[0])) \ + return false; /* Garbage characters after int */ \ + /* Advance past token */ \ + if (*end != '\0') \ + *source = const_cast(end+1); \ + else \ + *source = NULL; \ + return true; \ +} + +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod) +#ifdef _MSC_VER // has no strtof() +// Note: does an implicit cast to float. 
+DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod) +#else +DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof) +#endif +DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16) +DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16) + + +// ---------------------------------------------------------------------- +// SplitRange() +// Splits a string of the form "-". Either or both can be +// missing. A raw number () is interpreted as "-". Modifies +// parameters insofar as they're specified by the string. RETURNS +// true iff the input is a well-formed range. If it RETURNS false, +// from and to remain unchanged. The range in rangestr should be +// terminated either by "\0" or by whitespace. 
+// ---------------------------------------------------------------------- + +#define EOS(ch) ( (ch) == '\0' || ascii_isspace(ch) ) +bool SplitRange(const char* rangestr, int* from, int* to) { + // We need to do the const-cast because strol takes a char**, not const char** + char* val = const_cast(rangestr); + if (val == nullptr || EOS(*val)) return true; // we'll say nothingness is ok + + if ( val[0] == '-' && EOS(val[1]) ) // CASE 1: - + return true; // nothing changes + + if ( val[0] == '-' ) { // CASE 2: - + const int int2 = strto32(val+1, &val, 10); + if ( !EOS(*val) ) return false; // not a valid integer + *to = int2; // only "to" changes + return true; + + } else { + const int int1 = strto32(val, &val, 10); + if ( EOS(*val) || (*val == '-' && EOS(*(val+1))) ) { + *from = int1; // CASE 3: , same as - + return true; // only "from" changes + } else if (*val != '-') { // not a valid range + return false; + } + const int int2 = strto32(val+1, &val, 10); + if ( !EOS(*val) ) return false; // not a valid integer + *from = int1; // CASE 4: - + *to = int2; + return true; + } +} + +void SplitCSVLineWithDelimiter(char* line, char delimiter, + vector* cols) { + char* end_of_line = line + strlen(line); + char* end; + char* start; + + for (; line < end_of_line; line++) { + // Skip leading whitespace, unless said whitespace is the delimiter. + while (ascii_isspace(*line) && *line != delimiter) + ++line; + + if (*line == '"' && delimiter == ',') { // Quoted value... + start = ++line; + end = start; + for (; *line; line++) { + if (*line == '"') { + line++; + if (*line != '"') // [""] is an escaped ["] + break; // but just ["] is end of value + } + *end++ = *line; + } + // All characters after the closing quote and before the comma + // are ignored. 
+ line = strchr(line, delimiter); + if (!line) line = end_of_line; + } else { + start = line; + line = strchr(line, delimiter); + if (!line) line = end_of_line; + // Skip all trailing whitespace, unless said whitespace is the delimiter. + for (end = line; end > start; --end) { + if (!ascii_isspace(end[-1]) || end[-1] == delimiter) + break; + } + } + const bool need_another_column = + (*line == delimiter) && (line == end_of_line - 1); + *end = '\0'; + cols->push_back(start); + // If line was something like [paul,] (comma is the last character + // and is not proceeded by whitespace or quote) then we are about + // to eliminate the last column (which is empty). This would be + // incorrect. + if (need_another_column) + cols->push_back(end); + + assert(*line == '\0' || *line == delimiter); + } +} + +void SplitCSVLine(char* line, vector* cols) { + SplitCSVLineWithDelimiter(line, ',', cols); +} + +void SplitCSVLineWithDelimiterForStrings(const string &line, + char delimiter, + vector *cols) { + // Unfortunately, the interface requires char* instead of const char* + // which requires copying the string. + char *cline = strndup_with_new(line.c_str(), line.size()); + vector v; + SplitCSVLineWithDelimiter(cline, delimiter, &v); + for (vector::const_iterator ci = v.begin(); ci != v.end(); ++ci) { + cols->push_back(*ci); + } + delete[] cline; +} + +// ---------------------------------------------------------------------- +namespace { + +// Helper class used by SplitStructuredLineInternal. +class ClosingSymbolLookup { + public: + explicit ClosingSymbolLookup(const char* symbol_pairs) + : closing_(), + valid_closing_() { + // Initialize the opening/closing arrays. + for (const char* symbol = symbol_pairs; *symbol != 0; ++symbol) { + unsigned char opening = *symbol; + ++symbol; + // If the string ends before the closing character has been found, + // use the opening character as the closing character. + unsigned char closing = *symbol != 0 ? 
*symbol : opening; + closing_[opening] = closing; + valid_closing_[closing] = true; + if (*symbol == 0) break; + } + } + + // Returns the closing character corresponding to an opening one, + // or 0 if the argument is not an opening character. + char GetClosingChar(char opening) const { + return closing_[static_cast(opening)]; + } + + // Returns true if the argument is a closing character. + bool IsClosing(char c) const { + return valid_closing_[static_cast(c)]; + } + + private: + // Maps an opening character to its closing. If the entry contains 0, + // the character is not in the opening set. + char closing_[256]; + // Valid closing characters. + bool valid_closing_[256]; + + DISALLOW_COPY_AND_ASSIGN(ClosingSymbolLookup); +}; + +char* SplitStructuredLineInternal(char* line, + char delimiter, + const char* symbol_pairs, + vector* cols, + bool with_escapes) { + ClosingSymbolLookup lookup(symbol_pairs); + + // Stack of symbols expected to close the current opened expressions. + vector expected_to_close; + bool in_escape = false; + + CHECK(cols); + cols->push_back(line); + char* current; + for (current = line; *current; ++current) { + char c = *current; + if (in_escape) { + in_escape = false; + } else if (with_escapes && c == '\\') { + // We are escaping the next character. Note the escape still appears + // in the output. + in_escape = true; + } else if (expected_to_close.empty() && c == delimiter) { + // We don't have any open expression, this is a valid separator. + *current = 0; + cols->push_back(current + 1); + } else if (!expected_to_close.empty() && c == expected_to_close.back()) { + // Can we close the currently open expression? + expected_to_close.pop_back(); + } else if (lookup.GetClosingChar(c)) { + // If this is an opening symbol, we open a new expression and push + // the expected closing symbol on the stack. + expected_to_close.push_back(lookup.GetClosingChar(c)); + } else if (lookup.IsClosing(c)) { + // Error: mismatched closing symbol. 
+ return current; + } + } + if (!expected_to_close.empty()) { + return current; // Missing closing symbol(s) + } + return nullptr; // Success +} + +bool SplitStructuredLineInternal(StringPiece line, + char delimiter, + const char* symbol_pairs, + vector* cols, + bool with_escapes) { + ClosingSymbolLookup lookup(symbol_pairs); + + // Stack of symbols expected to close the current opened expressions. + vector expected_to_close; + bool in_escape = false; + + CHECK_NOTNULL(cols); + cols->push_back(line); + for (int i = 0; i < line.size(); ++i) { + char c = line[i]; + if (in_escape) { + in_escape = false; + } else if (with_escapes && c == '\\') { + // We are escaping the next character. Note the escape still appears + // in the output. + in_escape = true; + } else if (expected_to_close.empty() && c == delimiter) { + // We don't have any open expression, this is a valid separator. + cols->back().remove_suffix(line.size() - i); + cols->push_back(StringPiece(line, i + 1)); + } else if (!expected_to_close.empty() && c == expected_to_close.back()) { + // Can we close the currently open expression? + expected_to_close.pop_back(); + } else if (lookup.GetClosingChar(c)) { + // If this is an opening symbol, we open a new expression and push + // the expected closing symbol on the stack. + expected_to_close.push_back(lookup.GetClosingChar(c)); + } else if (lookup.IsClosing(c)) { + // Error: mismatched closing symbol. 
+ return false; + } + } + if (!expected_to_close.empty()) { + return false; // Missing closing symbol(s) + } + return true; // Success +} + +} // anonymous namespace + +char* SplitStructuredLine(char* line, + char delimiter, + const char *symbol_pairs, + vector* cols) { + return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, + false); +} + +bool SplitStructuredLine(StringPiece line, + char delimiter, + const char* symbol_pairs, + vector* cols) { + return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, + false); +} + +char* SplitStructuredLineWithEscapes(char* line, + char delimiter, + const char *symbol_pairs, + vector* cols) { + return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, + true); +} + +bool SplitStructuredLineWithEscapes(StringPiece line, + char delimiter, + const char* symbol_pairs, + vector* cols) { + return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, + true); +} + + +// ---------------------------------------------------------------------- +// SplitStringIntoKeyValues() +// ---------------------------------------------------------------------- +bool SplitStringIntoKeyValues(const string& line, + const string& key_value_delimiters, + const string& value_value_delimiters, + string *key, vector *values) { + key->clear(); + values->clear(); + + // find the key string + size_t end_key_pos = line.find_first_of(key_value_delimiters); + if (end_key_pos == string::npos) { + VLOG(1) << "cannot parse key from line: " << line; + return false; // no key + } + key->assign(line, 0, end_key_pos); + + // find the values string + string remains(line, end_key_pos, line.size() - end_key_pos); + size_t begin_values_pos = remains.find_first_not_of(key_value_delimiters); + if (begin_values_pos == string::npos) { + VLOG(1) << "cannot parse value from line: " << line; + return false; // no value + } + string values_string(remains, + begin_values_pos, + remains.size() - begin_values_pos); + + // 
construct the values vector + if (value_value_delimiters.empty()) { // one value + values->push_back(values_string); + } else { // multiple values + SplitStringUsing(values_string, value_value_delimiters.c_str(), values); + if (values->size() < 1) { + VLOG(1) << "cannot parse value from line: " << line; + return false; // no value + } + } + return true; +} + +bool SplitStringIntoKeyValuePairs(const string& line, + const string& key_value_delimiters, + const string& key_value_pair_delimiters, + vector >* kv_pairs) { + kv_pairs->clear(); + + vector pairs; + SplitStringUsing(line, key_value_pair_delimiters.c_str(), &pairs); + + bool success = true; + for (const auto& pair : pairs) { + string key; + vector value; + if (!SplitStringIntoKeyValues(pair, + key_value_delimiters, + "", &key, &value)) { + // Don't return here, to allow for keys without associated + // values; just record that our split failed. + success = false; + } + // we expect atmost one value because we passed in an empty vsep to + // SplitStringIntoKeyValues + DCHECK_LE(value.size(), 1); + kv_pairs->push_back(make_pair(key, value.empty()? "" : value[0])); + } + return success; +} + +// ---------------------------------------------------------------------- +// SplitLeadingDec32Values() +// SplitLeadingDec64Values() +// A simple parser for space-separated decimal int32/int64 values. +// Appends parsed integers to the end of the result vector, stopping +// at the first unparsable spot. Skips past leading and repeated +// whitespace (does not consume trailing whitespace), and returns +// a pointer beyond the last character parsed. +// -------------------------------------------------------------------- +const char* SplitLeadingDec32Values(const char *str, vector *result) { + for (;;) { + char *end = nullptr; + long value = strtol(str, &end, 10); + if (end == str) + break; + // Limit long values to int32 min/max. Needed for lp64. 
+ if (value > numeric_limits::max()) { + value = numeric_limits::max(); + } else if (value < numeric_limits::min()) { + value = numeric_limits::min(); + } + result->push_back(value); + str = end; + if (!ascii_isspace(*end)) + break; + } + return str; +} + +const char* SplitLeadingDec64Values(const char *str, vector *result) { + for (;;) { + char *end = nullptr; + const int64 value = strtoll(str, &end, 10); + if (end == str) + break; + result->push_back(value); + str = end; + if (!ascii_isspace(*end)) + break; + } + return str; +} + +void SplitStringToLines(const char* full, + int max_len, + int num_lines, + vector* result) { + if (max_len <= 0) { + return; + } + int pos = 0; + for (int i = 0; (i < num_lines || num_lines <= 0); i++) { + int cut_at = ClipStringHelper(full+pos, max_len, (i == num_lines - 1)); + if (cut_at == -1) { + result->push_back(string(full+pos)); + return; + } + result->push_back(string(full+pos, cut_at)); + if (i == num_lines - 1 && max_len > kCutStrSize) { + result->at(i).append(kCutStr); + } + pos += cut_at; + } +} diff --git a/src/kudu/gutil/strings/split.h b/src/kudu/gutil/strings/split.h new file mode 100644 index 000000000000..c48768902e8b --- /dev/null +++ b/src/kudu/gutil/strings/split.h @@ -0,0 +1,1209 @@ +// Copyright 2008 and onwards Google, Inc. +// +// #status: RECOMMENDED +// #category: operations on strings +// #summary: Functions for splitting strings into substrings. +// +// This file contains functions for splitting strings. The new and recommended +// API for string splitting is the strings::Split() function. The old API is a +// large collection of standalone functions declared at the bottom of this file +// in the global scope. +// +// TODO(user): Rough migration plan from old API to new API +// (1) Add comments to old Split*() functions showing how to do the same things +// with the new API. +// (2) Reimplement some of the old Split*() functions in terms of the new +// Split() API. 
This will allow deletion of code in split.cc. +// (3) (Optional) Replace old Split*() API calls at call sites with calls to new +// Split() API. +// +#ifndef STRINGS_SPLIT_H_ +#define STRINGS_SPLIT_H_ + +#include +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_map; +#include +using __gnu_cxx::hash; +using __gnu_cxx::hash_set; +#include +using std::back_insert_iterator; +using std::iterator_traits; +#include +using std::map; +using std::multimap; +#include +using std::multiset; +using std::set; +#include +using std::string; +#include +using std::make_pair; +using std::pair; +#include +using std::vector; + +#include + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/strings/charset.h" +#include "kudu/gutil/strings/split_internal.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/strings/strip.h" + +namespace strings { + +// The new Split API +// aka Split2 +// aka strings::Split() +// +// This string splitting API consists of a Split() function in the ::strings +// namespace and a handful of delimiter objects in the ::strings::delimiter +// namespace (more on delimiter objects below). The Split() function always +// takes two arguments: the text to be split and the delimiter on which to split +// the text. An optional third argument may also be given, which is a Predicate +// functor that will be used to filter the results, e.g., to skip empty strings +// (more on predicates below). The Split() function adapts the returned +// collection to the type specified by the caller. +// +// Example 1: +// // Splits the given string on commas. Returns the results in a +// // vector of strings. +// vector v = strings::Split("a,b,c", ","); +// assert(v.size() == 3); +// +// Example 2: +// // By default, empty strings are *included* in the output. 
See the +// // strings::SkipEmpty predicate below to omit them. +// vector v = strings::Split("a,b,,c", ","); +// assert(v.size() == 4); // "a", "b", "", "c" +// v = strings::Split("", ","); +// assert(v.size() == 1); // v contains a single "" +// +// Example 3: +// // Splits the string as in the previous example, except that the results +// // are returned as StringPiece objects. Note that because we are storing +// // the results within StringPiece objects, we have to ensure that the input +// // string outlives any results. +// vector v = strings::Split("a,b,c", ","); +// assert(v.size() == 3); +// +// Example 4: +// // Stores results in a set. +// set a = strings::Split("a,b,c,a,b,c", ","); +// assert(a.size() == 3); +// +// Example 5: +// // Stores results in a map. The map implementation assumes that the input +// // is provided as a series of key/value pairs. For example, the 0th element +// // resulting from the split will be stored as a key to the 1st element. If +// // an odd number of elements are resolved, the last element is paired with +// // a default-constructed value (e.g., empty string). +// map m = strings::Split("a,b,c", ","); +// assert(m.size() == 2); +// assert(m["a"] == "b"); +// assert(m["c"] == ""); // last component value equals "" +// +// Example 6: +// // Splits on the empty string, which results in each character of the input +// // string becoming one element in the output collection. +// vector v = strings::Split("abc", ""); +// assert(v.size() == 3); +// +// Example 7: +// // Stores first two split strings as the members in an std::pair. +// std::pair p = strings::Split("a,b,c", ","); +// EXPECT_EQ("a", p.first); +// EXPECT_EQ("b", p.second); +// // "c" is omitted because std::pair can hold only two elements. +// +// As illustrated above, the Split() function adapts the returned collection to +// the type specified by the caller. 
The returned collections may contain +// string, StringPiece, Cord, or any object that has a constructor (explicit or +// not) that takes a single StringPiece argument. This pattern works for all +// standard STL containers including vector, list, deque, set, multiset, map, +// and multimap, non-standard containers including hash_set and hash_map, and +// even std::pair which is not actually a container. +// +// Splitting to std::pair is an interesting case because it can hold only two +// elements and is not a collection type. When splitting to an std::pair the +// first two split strings become the std::pair's .first and .second members +// respectively. The remaining split substrings are discarded. If there are less +// than two split substrings, the empty string is used for the corresponding +// std::pair member. +// +// The strings::Split() function can be used multiple times to perform more +// complicated splitting logic, such as intelligently parsing key-value pairs. +// For example +// +// // The input string "a=b=c,d=e,f=,g" becomes +// // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } +// map m; +// for (StringPiece sp : strings::Split("a=b=c,d=e,f=,g", ",")) { +// m.insert(strings::Split(sp, strings::delimiter::Limit("=", 1))); +// } +// EXPECT_EQ("b=c", m.find("a")->second); +// EXPECT_EQ("e", m.find("d")->second); +// EXPECT_EQ("", m.find("f")->second); +// EXPECT_EQ("", m.find("g")->second); +// +// The above example stores the results in an std::map. But depending on your +// data requirements, you can just as easily store the results in an +// std::multimap or even a vector>. +// +// +// Delimiters +// +// The Split() function also takes a second argument that is a delimiter. This +// delimiter is actually an object that defines the boundaries between elements +// in the provided input. 
If a string (const char*, ::string, or StringPiece) is +// passed in place of an explicit Delimiter object, the argument is implicitly +// converted to a ::strings::delimiter::Literal. +// +// With this split API comes the formal concept of a Delimiter (big D). A +// Delimiter is an object with a Find() function that knows how find the first +// occurrence of itself in a given StringPiece. Models of the Delimiter concept +// represent specific kinds of delimiters, such as single characters, +// substrings, or even regular expressions. +// +// The following Delimiter objects are provided as part of the Split() API: +// +// - Literal (default) +// - AnyOf +// - Limit +// +// The following are examples of using some provided Delimiter objects: +// +// Example 1: +// // Because a string literal is converted to a strings::delimiter::Literal, +// // the following two splits are equivalent. +// vector v1 = strings::Split("a,b,c", ","); // (1) +// using ::strings::delimiter::Literal; +// vector v2 = strings::Split("a,b,c", Literal(",")); // (2) +// +// Example 2: +// // Splits on any of the characters specified in the delimiter string. +// using ::strings::delimiter::AnyOf; +// vector v = strings::Split("a,b;c-d", AnyOf(",;-")); +// assert(v.size() == 4); +// +// Example 3: +// // Uses the Limit meta-delimiter to limit the number of matches a delimiter +// // can have. In this case, the delimiter of a Literal comma is limited to +// // to matching at most one time. The last element in the returned +// // collection will contain all unsplit pieces, which may contain instances +// // of the delimiter. 
+// using ::strings::delimiter::Limit; +// vector v = strings::Split("a,b,c", Limit(",", 1)); +// assert(v.size() == 2); // Limited to 1 delimiter; so two elements found +// assert(v[0] == "a"); +// assert(v[1] == "b,c"); +// +// +// Predicates +// +// Predicates can filter the results of a Split() operation by determining +// whether or not a resultant element is included in the result set. A predicate +// may be passed as an *optional* third argument to the Split() function. +// +// Predicates are unary functions (or functors) that take a single StringPiece +// argument and return bool indicating whether the argument should be included +// (true) or excluded (false). +// +// One example where this is useful is when filtering out empty substrings. By +// default, empty substrings may be returned by strings::Split(), which is +// similar to the way split functions work in other programming languages. For +// example: +// +// // Empty strings *are* included in the returned collection. +// vector v = strings::Split(",a,,b,", ","); +// assert(v.size() == 5); // v[0] == "", v[1] == "a", v[2] == "", ... +// +// These empty strings can be filtered out of the results by simply passing the +// provided SkipEmpty predicate as the third argument to the Split() function. +// SkipEmpty does not consider a string containing all whitespace to be empty. +// For that behavior use the SkipWhitespace predicate. For example: +// +// Example 1: +// // Uses SkipEmpty to omit empty strings. Strings containing whitespace are +// // not empty and are therefore not skipped. +// using strings::SkipEmpty; +// vector v = strings::Split(",a, ,b,", ",", SkipEmpty()); +// assert(v.size() == 3); +// assert(v[0] == "a"); +// assert(v[1] == " "); // <-- The whitespace makes the string not empty. +// assert(v[2] == "b"); +// +// Example 2: +// // Uses SkipWhitespace to skip all strings that are either empty or contain +// // only whitespace. 
+// using strings::SkipWhitespace; +// vector v = strings::Split(",a, ,b,", ",", SkipWhitespace()); +// assert(v.size() == 2); +// assert(v[0] == "a"); +// assert(v[1] == "b"); +// +// +// Differences between Split1 and Split2 +// +// Split2 is the strings::Split() API described above. Split1 is a name for the +// collection of legacy Split*() functions declared later in this file. Most of +// the Split1 functions follow a set of conventions that don't necessarily match +// the conventions used in Split2. The following are some of the important +// differences between Split1 and Split2: +// +// Split1 -> Split2 +// ---------------- +// Append -> Assign: +// The Split1 functions all returned their output collections via a pointer to +// an out parameter as is typical in Google code. In some cases the comments +// explicitly stated that results would be *appended* to the output +// collection. In some cases it was ambiguous whether results were appended. +// This ambiguity is gone in the Split2 API as results are always assigned to +// the output collection, never appended. +// +// AnyOf -> Literal: +// Most Split1 functions treated their delimiter argument as a string of +// individual byte delimiters. For example, a delimiter of ",;" would split on +// "," and ";", not the substring ",;". This behavior is equivalent to the +// Split2 delimiter strings::delimiter::AnyOf, which is *not* the default. By +// default, strings::Split() splits using strings::delimiter::Literal() which +// would treat the whole string ",;" as a single delimiter string. +// +// SkipEmpty -> allow empty: +// Most Split1 functions omitted empty substrings in the results. To keep +// empty substrings one would have to use an explicitly named +// Split*AllowEmpty() function. This behavior is reversed in Split2. By +// default, strings::Split() *allows* empty substrings in the output. To skip +// them, use the strings::SkipEmpty predicate. 
+// +// string -> user's choice: +// Most Split1 functions return collections of string objects. Some return +// char*, but the type returned is dictated by each Split1 function. With +// Split2 the caller can choose which string-like object to return. (Note: +// char* C-strings are not supported in Split2--use StringPiece instead). +// + +// Definitions of the main Split() function. +template +inline internal::Splitter Split(StringPiece text, Delimiter d) { + return internal::Splitter(text, d); +} + +template +inline internal::Splitter Split( + StringPiece text, Delimiter d, Predicate p) { + return internal::Splitter(text, d, p); +} + +namespace delimiter { +// A Delimiter object represents a single separator, such as a character, +// literal string, or regular expression. A Delimiter object must have the +// following member: +// +// StringPiece Find(StringPiece text); +// +// This Find() member function should return a StringPiece referring to the next +// occurrence of the represented delimiter within the given string text. If no +// delimiter is found in the given text, a zero-length StringPiece referring to +// text.end() should be returned (e.g., StringPiece(text.end(), 0)). It is +// important that the returned StringPiece always be within the bounds of the +// StringPiece given as an argument--it must not refer to a string that is +// physically located outside of the given string. The following example is a +// simple Delimiter object that is created with a single char and will look for +// that char in the text given to the Find() function: +// +// struct SimpleDelimiter { +// const char c_; +// explicit SimpleDelimiter(char c) : c_(c) {} +// StringPiece Find(StringPiece text) { +// int pos = text.find(c_); +// if (pos == StringPiece::npos) return StringPiece(text.end(), 0); +// return StringPiece(text, pos, 1); +// } +// }; + +// Represents a literal string delimiter. 
Examples: +// +// using ::strings::delimiter::Literal; +// vector v = strings::Split("a=>b=>c", Literal("=>")); +// assert(v.size() == 3); +// assert(v[0] == "a"); +// assert(v[1] == "b"); +// assert(v[2] == "c"); +// +// The next example uses the empty string as a delimiter. +// +// using ::strings::delimiter::Literal; +// vector v = strings::Split("abc", Literal("")); +// assert(v.size() == 3); +// assert(v[0] == "a"); +// assert(v[1] == "b"); +// assert(v[2] == "c"); +// +class Literal { + public: + explicit Literal(StringPiece sp); + StringPiece Find(StringPiece text) const; + + private: + const string delimiter_; +}; + +// Represents a delimiter that will match any of the given byte-sized +// characters. AnyOf is similar to Literal, except that AnyOf uses +// StringPiece::find_first_of() and Literal uses StringPiece::find(). AnyOf +// examples: +// +// using ::strings::delimiter::AnyOf; +// vector v = strings::Split("a,b=c", AnyOf(",=")); +// +// assert(v.size() == 3); +// assert(v[0] == "a"); +// assert(v[1] == "b"); +// assert(v[2] == "c"); +// +// If AnyOf is given the empty string, it behaves exactly like Literal and +// matches each individual character in the input string. +// +// Note: The string passed to AnyOf is assumed to be a string of single-byte +// ASCII characters. AnyOf does not work with multi-byte characters. +class AnyOf { + public: + explicit AnyOf(StringPiece sp); + StringPiece Find(StringPiece text) const; + + private: + const string delimiters_; +}; + +// Wraps another delimiter and sets a max number of matches for that delimiter. +// Create LimitImpls using the Limit() function. 
Example: +// +// using ::strings::delimiter::Limit; +// vector v = strings::Split("a,b,c,d", Limit(",", 2)); +// +// assert(v.size() == 3); // Split on 2 commas, giving a vector with 3 items +// assert(v[0] == "a"); +// assert(v[1] == "b"); +// assert(v[2] == "c,d"); +// +template +class LimitImpl { + public: + LimitImpl(Delimiter delimiter, int limit) + : delimiter_(std::move(delimiter)), limit_(limit), count_(0) {} + StringPiece Find(StringPiece text) { + if (count_++ == limit_) { + return StringPiece(text.end(), 0); // No more matches. + } + return delimiter_.Find(text); + } + + private: + Delimiter delimiter_; + const int limit_; + int count_; +}; + +// Overloaded Limit() function to create LimitImpl<> objects. Uses the Delimiter +// Literal as the default if string-like objects are passed as the delimiter +// parameter. This is similar to the overloads for Split() below. +template +inline LimitImpl Limit(Delimiter delim, int limit) { + return LimitImpl(delim, limit); +} + +inline LimitImpl Limit(const char* s, int limit) { + return LimitImpl(Literal(s), limit); +} + +inline LimitImpl Limit(const string& s, int limit) { + return LimitImpl(Literal(s), limit); +} + +inline LimitImpl Limit(StringPiece s, int limit) { + return LimitImpl(Literal(s), limit); +} + +} // namespace delimiter + +// +// Predicates are functors that return bool indicating whether the given +// StringPiece should be included in the split output. If the predicate returns +// false then the string will be excluded from the output from strings::Split(). +// + +// Always returns true, indicating that all strings--including empty +// strings--should be included in the split output. This predicate is not +// strictly needed because this is the default behavior of the strings::Split() +// function. But it might be useful at some call sites to make the intent +// explicit. 
+// +// vector v = Split(" a , ,,b,", ",", AllowEmpty()); +// EXPECT_THAT(v, ElementsAre(" a ", " ", "", "b", "")); +struct AllowEmpty { + bool operator()(StringPiece sp) const { + return true; + } +}; + +// Returns false if the given StringPiece is empty, indicating that the +// strings::Split() API should omit the empty string. +// +// vector v = Split(" a , ,,b,", ",", SkipEmpty()); +// EXPECT_THAT(v, ElementsAre(" a ", " ", "b")); +struct SkipEmpty { + bool operator()(StringPiece sp) const { + return !sp.empty(); + } +}; + +// Returns false if the given StringPiece is empty or contains only whitespace, +// indicating that the strings::Split() API should omit the string. +// +// vector v = Split(" a , ,,b,", ",", SkipWhitespace()); +// EXPECT_THAT(v, ElementsAre(" a ", "b")); +struct SkipWhitespace { + bool operator()(StringPiece sp) const { + StripWhiteSpace(&sp); + return !sp.empty(); + } +}; + +// Split() function overloads to effectively give Split() a default Delimiter +// type of Literal. If Split() is called and a string is passed as the delimiter +// instead of an actual Delimiter object, then one of these overloads will be +// invoked and will create a Splitter with the delimiter string. +// +// Since Split() is a function template above, these overload signatures need to +// be explicit about the string type so they match better than the templated +// version. 
These functions are overloaded for: +// +// - const char* +// - const string& +// - StringPiece + +inline internal::Splitter Split( + StringPiece text, const char* delimiter) { + return internal::Splitter( + text, delimiter::Literal(delimiter)); +} + +inline internal::Splitter Split( + StringPiece text, const string& delimiter) { + return internal::Splitter( + text, delimiter::Literal(delimiter)); +} + +inline internal::Splitter Split( + StringPiece text, StringPiece delimiter) { + return internal::Splitter( + text, delimiter::Literal(delimiter)); +} + +// Same overloads as above, but also including a Predicate argument. +template +inline internal::Splitter Split( + StringPiece text, const char* delimiter, Predicate p) { + return internal::Splitter( + text, delimiter::Literal(delimiter), p); +} + +template +inline internal::Splitter Split( + StringPiece text, const string& delimiter, Predicate p) { + return internal::Splitter( + text, delimiter::Literal(delimiter), p); +} + +template +inline internal::Splitter Split( + StringPiece text, StringPiece delimiter, Predicate p) { + return internal::Splitter( + text, delimiter::Literal(delimiter), p); +} + +} // namespace strings + +// +// ==================== LEGACY SPLIT FUNCTIONS ==================== +// + +// NOTE: The instruction below creates a Module titled +// GlobalSplitFunctions within the auto-generated Doxygen documentation. +// This instruction is needed to expose global functions that are not +// within a namespace. +// +// START DOXYGEN SplitFunctions grouping +/* @defgroup SplitFunctions + * @{ */ + +// ---------------------------------------------------------------------- +// ClipString +// Clip a string to a max length. We try to clip on a word boundary +// if this is possible. If the string is clipped, we append an +// ellipsis. +// +// ***NOTE*** +// ClipString counts length with strlen. If you have non-ASCII +// strings like UTF-8, this is wrong. 
If you are displaying the +// clipped strings to users in a frontend, consider using +// ClipStringOnWordBoundary in +// webserver/util/snippets/rewriteboldtags, which considers the width +// of the string, not just the number of bytes. +// +// TODO(user) Move ClipString back to strutil. The problem with this is +// that ClipStringHelper is used behind the scenes by SplitStringToLines, but +// probably shouldn't be exposed in the .h files. +// ---------------------------------------------------------------------- +void ClipString(char* str, int max_len); + +// ---------------------------------------------------------------------- +// ClipString +// Version of ClipString() that uses string instead of char*. +// NOTE: See comment above. +// ---------------------------------------------------------------------- +void ClipString(string* full_str, int max_len); + +// ---------------------------------------------------------------------- +// SplitStringToLines() Split a string into lines of maximum length +// 'max_len'. Append the resulting lines to 'result'. Will attempt +// to split on word boundaries. If 'num_lines' +// is zero it splits up the whole string regardless of length. If +// 'num_lines' is positive, it returns at most num_lines lines, and +// appends a "..." to the end of the last line if the string is too +// long to fit completely into 'num_lines' lines. +// ---------------------------------------------------------------------- +void SplitStringToLines(const char* full, + int max_len, + int num_lines, + vector* result); + +// ---------------------------------------------------------------------- +// SplitOneStringToken() +// Returns the first "delim" delimited string from "*source" and modifies +// *source to point after the delimiter that was found. If no delimiter is +// found, *source is set to NULL. +// +// If the start of *source is a delimiter, an empty string is returned. +// If *source is NULL, an empty string is returned. 
+// +// "delim" is treated as a sequence of 1 or more character delimiters. Any one +// of the characters present in "delim" is considered to be a single +// delimiter; The delimiter is not "delim" as a whole. For example: +// +// const char* s = "abc=;de"; +// string r = SplitOneStringToken(&s, ";="); +// // r = "abc" +// // s points to ";de" +// ---------------------------------------------------------------------- +string SplitOneStringToken(const char** source, const char* delim); + +// ---------------------------------------------------------------------- +// SplitUsing() +// Split a string into substrings based on the nul-terminated list +// of bytes at delimiters (uses strsep) and return a vector of +// those strings. Modifies 'full' We allocate the return vector, +// and you should free it. Note that empty fields are ignored. +// Use SplitToVector with last argument 'false' if you want the +// empty fields. +// ---------------------------------------------------------------------- +vector* SplitUsing(char* full, const char* delimiters); + +// ---------------------------------------------------------------------- +// SplitToVector() +// Split a string into substrings based on the nul-terminated list +// of bytes at delim (uses strsep) and appends the split +// strings to 'vec'. Modifies "full". If omit empty strings is +// true, empty strings are omitted from the resulting vector. +// ---------------------------------------------------------------------- +void SplitToVector(char* full, const char* delimiters, + vector* vec, + bool omit_empty_strings); +void SplitToVector(char* full, const char* delimiters, + vector* vec, + bool omit_empty_strings); + +// ---------------------------------------------------------------------- +// SplitStringPieceToVector +// Split a StringPiece into sub-StringPieces based on the +// nul-terminated list of bytes at delim and appends the +// pieces to 'vec'. 
If omit empty strings is true, empty strings +// are omitted from the resulting vector. +// Expects the original string (from which 'full' is derived) to exist +// for the full lifespan of 'vec'. +// ---------------------------------------------------------------------- +void SplitStringPieceToVector(const StringPiece& full, + const char* delim, + vector* vec, + bool omit_empty_strings); + +// ---------------------------------------------------------------------- +// SplitStringUsing() +// SplitStringToHashsetUsing() +// SplitStringToSetUsing() +// SplitStringToMapUsing() +// SplitStringToHashmapUsing() + +// Splits a string using one or more byte delimiters, presented as a +// nul-terminated c string. Append the components to 'result'. If there are +// consecutive delimiters, this function skips over all of them: in other words, +// empty components are dropped. If you want to keep empty components, try +// SplitStringAllowEmpty(). +// +// NOTE: Do not use this for multi-byte delimiters such as UTF-8 strings. Use +// strings::Split() with strings::delimiter::Literal as the delimiter. +// +// ==> NEW API: Consider using the new Split API defined above. <== +// Example: +// +// using strings::SkipEmpty; +// using strings::Split; +// using strings::delimiter::AnyOf; +// +// vector v = Split(full, AnyOf(delimiter), SkipEmpty()); +// +// For even better performance, store the result in a vector +// to avoid string copies. +// ---------------------------------------------------------------------- +void SplitStringUsing(const string& full, const char* delimiters, + vector* result); +void SplitStringToHashsetUsing(const string& full, const char* delimiters, + hash_set* result); +void SplitStringToSetUsing(const string& full, const char* delimiters, + set* result); +// The even-positioned (0-based) components become the keys for the +// odd-positioned components that follow them. 
When there is an odd +// number of components, the value for the last key will be unchanged +// if the key was already present in the hash table, or will be the +// empty string if the key is a newly inserted key. +void SplitStringToMapUsing(const string& full, const char* delim, + map* result); +void SplitStringToHashmapUsing(const string& full, const char* delim, + hash_map* result); + +// ---------------------------------------------------------------------- +// SplitStringAllowEmpty() +// +// Split a string using one or more byte delimiters, presented as a +// nul-terminated c string. Append the components to 'result'. If there are +// consecutive delimiters, this function will return corresponding empty +// strings. If you want to drop the empty strings, try SplitStringUsing(). +// +// If "full" is the empty string, yields an empty string as the only value. +// +// ==> NEW API: Consider using the new Split API defined above. <== +// +// using strings::Split; +// using strings::delimiter::AnyOf; +// +// vector v = Split(full, AnyOf(delimiter)); +// +// For even better performance, store the result in a vector to +// avoid string copies. +// ---------------------------------------------------------------------- +void SplitStringAllowEmpty(const string& full, const char* delim, + vector* result); + +// ---------------------------------------------------------------------- +// SplitStringWithEscaping() +// SplitStringWithEscapingAllowEmpty() +// SplitStringWithEscapingToSet() +// SplitStringWithEscapingToHashset() + +// Split the string using the specified delimiters, taking escaping into +// account. '\' is not allowed as a delimiter. +// +// Within the string, preserve a delimiter preceded by a backslash as a +// literal delimiter. In addition, preserve two consecutive backslashes as +// a single literal backslash. Do not unescape any other backslash-character +// sequence. +// +// Eg. 
'foo\=bar=baz\\qu\ux' split on '=' becomes ('foo=bar', 'baz\qu\ux') +// +// All versions other than "AllowEmpty" discard any empty substrings. +// ---------------------------------------------------------------------- +void SplitStringWithEscaping(const string& full, + const strings::CharSet& delimiters, + vector* result); +void SplitStringWithEscapingAllowEmpty(const string& full, + const strings::CharSet& delimiters, + vector* result); +void SplitStringWithEscapingToSet(const string& full, + const strings::CharSet& delimiters, + set* result); +void SplitStringWithEscapingToHashset(const string& full, + const strings::CharSet& delimiters, + hash_set* result); + +// ---------------------------------------------------------------------- +// SplitStringIntoNPiecesAllowEmpty() + +// Split a string using a nul-terminated list of byte +// delimiters. Append the components to 'result'. If there are +// consecutive delimiters, this function will return corresponding +// empty strings. The string is split into at most the specified +// number of pieces greedily. This means that the last piece may +// possibly be split further. To split into as many pieces as +// possible, specify 0 as the number of pieces. +// +// If "full" is the empty string, yields an empty string as the only value. +// ---------------------------------------------------------------------- +void SplitStringIntoNPiecesAllowEmpty(const string& full, + const char* delimiters, + int pieces, + vector* result); + +// ---------------------------------------------------------------------- +// SplitStringAndParse() +// SplitStringAndParseToContainer() +// SplitStringAndParseToList() +// Split a string using a nul-terminated list of character +// delimiters. For each component, parse using the provided +// parsing function and if successful, append it to 'result'. +// Return true if and only if all components parse successfully. +// If there are consecutive delimiters, this function skips over +// all of them. 
This function will correctly handle parsing +// strings that have embedded \0s. +// +// SplitStringAndParse fills into a vector. +// SplitStringAndParseToContainer fills into any container that implements +// a single-argument insert function. (i.e. insert(const value_type& x) ). +// SplitStringAndParseToList fills into any container that implements a single- +// argument push_back function (i.e. push_back(const value_type& x) ), plus +// value_type& back() and pop_back(). +// NOTE: This implementation relies on parsing in-place into the "back()" +// reference, so its performance may depend on the efficiency of back(). +// +// Example Usage: +// vector values; +// CHECK(SplitStringAndParse("1.0,2.0,3.0", ",", &safe_strtod, &values)); +// CHECK_EQ(3, values.size()); +// +// vector values; +// CHECK(SplitStringAndParse("1M,2M,3M", ",", +// &HumanReadableNumBytes::ToInt64, &values)); +// CHECK_EQ(3, values.size()); +// +// set values; +// CHECK(SplitStringAndParseToContainer("3,1,1,2", ",", +// &safe_strto64, &values)); +// CHECK_EQ(4, values.size()); +// +// deque values; +// CHECK(SplitStringAndParseToList("3,1,1,2", ",", &safe_strto64, &values)); +// CHECK_EQ(4, values.size()); +// ---------------------------------------------------------------------- +template +bool SplitStringAndParse(StringPiece source, StringPiece delim, + bool (*parse)(const string& str, T* value), + vector* result); +template +bool SplitStringAndParseToContainer( + StringPiece source, StringPiece delim, + bool (*parse)(const string& str, typename Container::value_type* value), + Container* result); + +template +bool SplitStringAndParseToList( + StringPiece source, StringPiece delim, + bool (*parse)(const string& str, typename List::value_type* value), + List* result); +// ---------------------------------------------------------------------- +// SplitRange() +// Splits a string of the form "-". Either or both can be +// missing. A raw number () is interpreted as "-". 
Modifies +// parameters insofar as they're specified by the string. RETURNS +// true iff the input is a well-formed range. If it RETURNS false, +// from and to remain unchanged. The range in rangestr should be +// terminated either by "\0" or by whitespace. +// ---------------------------------------------------------------------- +bool SplitRange(const char* rangestr, int* from, int* to); + +// ---------------------------------------------------------------------- +// SplitCSVLineWithDelimiter() +// CSV lines come in many guises. There's the Comma Separated Values +// variety, in which fields are separated by (surprise!) commas. There's +// also the tab-separated values variant, in which tabs separate the +// fields. This routine handles both, which makes it almost like +// SplitUsing(line, delimiter), but for some special processing. For both +// delimiters, whitespace is trimmed from either side of the field value. +// If the delimiter is ',', we play additional games with quotes. A +// field value surrounded by double quotes is allowed to contain commas, +// which are not treated as field separators. Within a double-quoted +// string, a series of two double quotes signals an escaped single double +// quote. It'll be clearer in the examples. +// Example: +// Google , x , "Buchheit, Paul", "string with "" quote in it" +// --> [Google], [x], [Buchheit, Paul], [string with " quote in it] +// +// SplitCSVLine() +// A convenience wrapper around SplitCSVLineWithDelimiter which uses +// ',' as the delimiter. +// +// The following variants of SplitCSVLine() are not recommended for new code. +// Please consider the CSV parser in //util/csv as an alternative. 
Examples: +// To parse a single line: +// #include "kudu/util/csv/parser.h" +// vector fields = util::csv::ParseLine(line).fields(); +// +// To parse an entire file: +// #include "kudu/util/csv/parser.h" +// for (Record rec : Parser(source)) { +// vector fields = rec.fields(); +// } +// +// See //util/csv/parser.h for more complete documentation. +// +// ---------------------------------------------------------------------- +void SplitCSVLine(char* line, vector* cols); +void SplitCSVLineWithDelimiter(char* line, char delimiter, + vector* cols); +// SplitCSVLine string wrapper that internally makes a copy of string line. +void SplitCSVLineWithDelimiterForStrings(const string& line, char delimiter, + vector* cols); + +// ---------------------------------------------------------------------- +// SplitStructuredLine() +// Splits a line using the given delimiter, and places the columns +// into 'cols'. This is unlike 'SplitUsing(line, ",")' because you can +// define pairs of opening closing symbols inside which the delimiter should +// be ignored. If the symbol_pair string has an odd number of characters, +// the last character (which cannot be paired) will be assumed to be both an +// opening and closing symbol. +// WARNING : The input string 'line' is destroyed in the process. +// The function returns 0 if the line was parsed correctly (i.e all the +// opened braces had their closing braces) otherwise, it returns the position +// of the error. 
+// Example: +// SplitStructuredLine("item1,item2,{subitem1,subitem2},item4,[5,{6,7}]", +// ',', +// "{}[]", &output) +// --> output = { "item1", "item2", "{subitem1,subitem2}", "item4", +// "[5,{6,7}]" } +// Example2: trying to split "item1,[item2,{4,5],5}" will fail and the +// function will return the position of the problem : ] +// +// ---------------------------------------------------------------------- +char* SplitStructuredLine(char* line, + char delimiter, + const char* symbol_pairs, + vector* cols); + +// Similar to the function with the same name above, but splits a StringPiece +// into StringPiece parts. Returns true if successful. +bool SplitStructuredLine(StringPiece line, + char delimiter, + const char* symbol_pairs, + vector* cols); + +// ---------------------------------------------------------------------- +// SplitStructuredLineWithEscapes() +// Like SplitStructuredLine but also allows characters to be escaped. +// +// WARNING: the escape characters will be replicated in the output +// columns rather than being consumed, i.e. if {} were the opening and +// closing symbols, using \{ to quote a curly brace in the middle of +// an option would pass this unchanged. +// +// Example: +// SplitStructuredLineWithEscapes( +// "\{item1\},it\\em2,{\{subitem1\},sub\\item2},item4\,item5,[5,{6,7}]", +// ',', +// "{}[]", +// &output) +// --> output = { "\{item1\}", "it\\em2", "{\{subitem1\},sub\\item2}", +// "item4\,item5", "[5,{6,7}]" } +// +// ---------------------------------------------------------------------- +char* SplitStructuredLineWithEscapes(char* line, + char delimiter, + const char* symbol_pairs, + vector* cols); + +// Similar to the function with the same name above, but splits a StringPiece +// into StringPiece parts. Returns true if successful. 
+bool SplitStructuredLineWithEscapes(StringPiece line, + char delimiter, + const char* symbol_pairs, + vector* cols); + +// ---------------------------------------------------------------------- +// DEPRECATED(jgm): See the "NEW API" comment about this function below for +// example code showing an alternative. +// +// SplitStringIntoKeyValues() +// Split a line into a key string and a vector of value strings. The line has +// the following format: +// +// +*++...* +// +// where key and value are strings; */+ means zero/one or more; is +// a delimiter character to separate key and value; and is a delimiter +// character to separate between values. The user can specify a bunch of +// delimiter characters using a string. For example, if the user specifies +// the separator string as "\t ", then either ' ' or '\t' or any combination +// of them wil be treated as separator. For , the user can specify a +// empty string to indicate there is only one value. +// +// Note: this function assumes the input string begins exactly with a +// key. Therefore, if you use whitespaces to separate key and value, you +// should not let whitespace precedes the key in the input. Otherwise, you +// will get an empty string as the key. +// +// A line with no will return an empty string as the key, even if +// is non-empty! +// +// The syntax makes it impossible for a value to be the empty string. +// It is possible for the number of values to be zero. +// +// Returns false if the line has no or if the number of values is +// zero. +// +// ==> NEW API: Consider using the new Split API defined above. <== +// +// The SplitStringIntoKeyValues() function has some subtle and surprising +// semantics in various corner cases. To avoid this the strings::Split API is +// recommended. The following example shows how to split a string of delimited +// key-value pairs into a vector of pairs using the strings::Split API. 
+// +// using strings::Split; +// using strings::delimiter::AnyOf; +// using strings::delimiter::Limit; +// +// pair key_values = +// Split(line, Limit(AnyOf(kv_delim), 1)); +// string key = key_values.first; +// vector values = Split(key_values.second, AnyOf(vv_delim)); +// +// ---------------------------------------------------------------------- +bool SplitStringIntoKeyValues(const string& line, + const string& key_value_delimiters, + const string& value_value_delimiters, + string* key, vector* values); + +// ---------------------------------------------------------------------- +// SplitStringIntoKeyValuePairs() +// Split a line into a vector of pairs. The line has +// the following format: +// +// *+++...* +// +// Where key and value are strings; */+ means zero/one or more. is +// a delimiter character to separate key and value and is a delimiter +// character to separate key value pairs. The user can specify a bunch of +// delimiter characters using a string. +// +// Note: this function assumes each key-value pair begins exactly with a +// key. Therefore, if you use whitespaces to separate key and value, you +// should not let whitespace precede the key in the pair. Otherwise, you +// will get an empty string as the key. +// +// A pair with no will return empty strings as the key and value, +// even if is non-empty! +// +// Returns false for pairs with no specified and for pairs with +// empty strings as values. +// +// ==> NEW API: Consider using the new Split API defined above. <== +// +// The SplitStringIntoKeyValuePairs() function has some subtle and surprising +// semantics in various corner cases. To avoid this the strings::Split API is +// recommended. The following example shows how to split a string of delimited +// key-value pairs into a vector of pairs using the strings::Split API. 
+// +// using strings::SkipEmpty; +// using strings::Split; +// using strings::delimiter::AnyOf; +// using strings::delimiter::Limit; +// +// vector> pairs; // or even map +// for (StringPiece sp : Split(line, AnyOf(pair_delim), SkipEmpty())) { +// pairs.push_back(Split(sp, Limit(AnyOf(kv_delim), 1), SkipEmpty())); +// } +// +// ---------------------------------------------------------------------- +bool SplitStringIntoKeyValuePairs(const string& line, + const string& key_value_delimiters, + const string& key_value_pair_delimiters, + vector >* kv_pairs); + + +// ---------------------------------------------------------------------- +// SplitLeadingDec32Values() +// SplitLeadingDec64Values() +// A simple parser for space-separated decimal int32/int64 values. +// Appends parsed integers to the end of the result vector, stopping +// at the first unparsable spot. Skips past leading and repeated +// whitespace (does not consume trailing whitespace), and returns +// a pointer beyond the last character parsed. +// -------------------------------------------------------------------- +const char* SplitLeadingDec32Values(const char* next, vector* result); +const char* SplitLeadingDec64Values(const char* next, vector* result); + +// ---------------------------------------------------------------------- +// SplitOneIntToken() +// SplitOneInt32Token() +// SplitOneUint32Token() +// SplitOneInt64Token() +// SplitOneUint64Token() +// SplitOneDoubleToken() +// SplitOneFloatToken() +// Parse a single "delim" delimited number from "*source" into "*value". +// Modify *source to point after the delimiter. +// If no delimiter is present after the number, set *source to NULL. +// +// If the start of *source is not an number, return false. +// If the int is followed by the null character, return true. +// If the int is not followed by a character from delim, return false. +// If *source is NULL, return false. 
+// +// They cannot handle decimal numbers with leading 0s, since they will be +// treated as octal. +// ---------------------------------------------------------------------- +bool SplitOneIntToken(const char** source, const char* delim, + int* value); +bool SplitOneInt32Token(const char** source, const char* delim, + int32* value); +bool SplitOneUint32Token(const char** source, const char* delim, + uint32* value); +bool SplitOneInt64Token(const char** source, const char* delim, + int64* value); +bool SplitOneUint64Token(const char** source, const char* delim, + uint64* value); +bool SplitOneDoubleToken(const char** source, const char* delim, + double* value); +bool SplitOneFloatToken(const char** source, const char* delim, + float* value); + +// Some aliases, so that the function names are standardized against the names +// of the reflection setters/getters in proto2. This makes it easier to use +// certain macros with reflection when creating custom text formats for protos. + +inline bool SplitOneUInt32Token(const char** source, const char* delim, + uint32* value) { + return SplitOneUint32Token(source, delim, value); +} + +inline bool SplitOneUInt64Token(const char** source, const char* delim, + uint64* value) { + return SplitOneUint64Token(source, delim, value); +} + +// ---------------------------------------------------------------------- +// SplitOneDecimalIntToken() +// SplitOneDecimalInt32Token() +// SplitOneDecimalUint32Token() +// SplitOneDecimalInt64Token() +// SplitOneDecimalUint64Token() +// Parse a single "delim"-delimited number from "*source" into "*value". +// Unlike SplitOneIntToken, etc., this function always interprets +// the numbers as decimal. 
+bool SplitOneDecimalIntToken(const char** source, const char* delim, + int* value); +bool SplitOneDecimalInt32Token(const char** source, const char* delim, + int32* value); +bool SplitOneDecimalUint32Token(const char** source, const char* delim, + uint32* value); +bool SplitOneDecimalInt64Token(const char** source, const char* delim, + int64* value); +bool SplitOneDecimalUint64Token(const char** source, const char* delim, + uint64* value); + +// ---------------------------------------------------------------------- +// SplitOneHexUint32Token() +// SplitOneHexUint64Token() +// Once more, for hexadecimal numbers (unsigned only). +bool SplitOneHexUint32Token(const char** source, const char* delim, + uint32* value); +bool SplitOneHexUint64Token(const char** source, const char* delim, + uint64* value); + + +// ###################### TEMPLATE INSTANTIATIONS BELOW ####################### + +// SplitStringAndParse() -- see description above +template +bool SplitStringAndParse(StringPiece source, StringPiece delim, + bool (*parse)(const string& str, T* value), + vector* result) { + return SplitStringAndParseToList(source, delim, parse, result); +} + +namespace strings { +namespace internal { + +template +bool SplitStringAndParseToInserter( + StringPiece source, StringPiece delim, + bool (*parse)(const string& str, typename Container::value_type* value), + Container* result, InsertPolicy insert_policy) { + CHECK(NULL != parse); + CHECK(NULL != result); + CHECK(NULL != delim.data()); + CHECK_GT(delim.size(), 0); + bool retval = true; + vector pieces = strings::Split(source, + strings::delimiter::AnyOf(delim), + strings::SkipEmpty()); + for (const auto& piece : pieces) { + typename Container::value_type t; + if (parse(piece.as_string(), &t)) { + insert_policy(result, t); + } else { + retval = false; + } + } + return retval; +} + +// Cannot use output iterator here (e.g. 
std::inserter, std::back_inserter) +// because some callers use non-standard containers that don't have iterators, +// only an insert() or push_back() method. +struct BasicInsertPolicy { + template + void operator()(C* c, const V& v) const { c->insert(v); } +}; + +struct BackInsertPolicy { + template + void operator()(C* c, const V& v) const { c->push_back(v); } +}; + +} // namespace internal +} // namespace strings + +// SplitStringAndParseToContainer() -- see description above +template +bool SplitStringAndParseToContainer( + StringPiece source, StringPiece delim, + bool (*parse)(const string& str, typename Container::value_type* value), + Container* result) { + return strings::internal::SplitStringAndParseToInserter( + source, delim, parse, result, strings::internal::BasicInsertPolicy()); +} + +// SplitStringAndParseToList() -- see description above +template +bool SplitStringAndParseToList( + StringPiece source, StringPiece delim, + bool (*parse)(const string& str, typename List::value_type* value), + List* result) { + return strings::internal::SplitStringAndParseToInserter( + source, delim, parse, result, strings::internal::BackInsertPolicy()); +} + +// END DOXYGEN SplitFunctions grouping +/* @} */ + +#endif // STRINGS_SPLIT_H_ diff --git a/src/kudu/gutil/strings/split_internal.h b/src/kudu/gutil/strings/split_internal.h new file mode 100644 index 000000000000..01b3facac455 --- /dev/null +++ b/src/kudu/gutil/strings/split_internal.h @@ -0,0 +1,413 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// This file declares INTERNAL parts of the Split API that are inline/templated +// or otherwise need to be available at compile time. The main two abstractions +// defined in here are +// +// - SplitIterator<> +// - Splitter<> +// +// Everything else is plumbing for those two. +// +// DO NOT INCLUDE THIS FILE DIRECTLY. Use this file by including +// strings/split.h. 
+// +// IWYU pragma: private, include "strings/split.h" + +#ifndef STRINGS_SPLIT_INTERNAL_H_ +#define STRINGS_SPLIT_INTERNAL_H_ + +#include +using std::back_insert_iterator; +using std::iterator_traits; +#include +using std::map; +using std::multimap; +#include +using std::vector; + +#include "kudu/gutil/port.h" // for LANG_CXX11 +#include "kudu/gutil/strings/stringpiece.h" + +#ifdef LANG_CXX11 +// This must be included after "base/port.h", which defines LANG_CXX11. +#include +#endif // LANG_CXX11 + +namespace strings { + +namespace internal { + +// The default Predicate object, which doesn't filter out anything. +struct NoFilter { + bool operator()(StringPiece /* ignored */) { + return true; + } +}; + +// This class splits a string using the given delimiter, returning the split +// substrings via an iterator interface. An optional Predicate functor may be +// supplied, which will be used to filter the split strings: strings for which +// the predicate returns false will be skipped. A Predicate object is any +// functor that takes a StringPiece and returns bool. By default, the NoFilter +// Predicate is used, which does not filter out anything. +// +// This class is NOT part of the public splitting API. +// +// Usage: +// +// using strings::delimiter::Literal; +// Literal d(","); +// for (SplitIterator it("a,b,c", d), end(d); it != end; ++it) { +// StringPiece substring = *it; +// DoWork(substring); +// } +// +// The explicit single-argument constructor is used to create an "end" iterator. +// The two-argument constructor is used to split the given text using the given +// delimiter. +template +class SplitIterator + : public std::iterator { + public: + // Two constructors for "end" iterators. + explicit SplitIterator(Delimiter d) + : delimiter_(std::move(d)), predicate_(), is_end_(true) {} + SplitIterator(Delimiter d, Predicate p) + : delimiter_(std::move(d)), predicate_(std::move(p)), is_end_(true) {} + // Two constructors taking the text to iterator. 
+ SplitIterator(StringPiece text, Delimiter d) + : text_(std::move(text)), + delimiter_(std::move(d)), + predicate_(), + is_end_(false) { + ++(*this); + } + SplitIterator(StringPiece text, Delimiter d, Predicate p) + : text_(std::move(text)), + delimiter_(std::move(d)), + predicate_(std::move(p)), + is_end_(false) { + ++(*this); + } + + StringPiece operator*() { return curr_piece_; } + StringPiece* operator->() { return &curr_piece_; } + + SplitIterator& operator++() { + do { + if (text_.end() == curr_piece_.end()) { + // Already consumed all of text_, so we're done. + is_end_ = true; + return *this; + } + StringPiece found_delimiter = delimiter_.Find(text_); + assert(found_delimiter.data() != NULL); + assert(text_.begin() <= found_delimiter.begin()); + assert(found_delimiter.end() <= text_.end()); + // found_delimiter is allowed to be empty. + // Sets curr_piece_ to all text up to but excluding the delimiter itself. + // Sets text_ to remaining data after the delimiter. + curr_piece_.set(text_.begin(), found_delimiter.begin() - text_.begin()); + text_.remove_prefix(found_delimiter.end() - text_.begin()); + } while (!predicate_(curr_piece_)); + return *this; + } + + SplitIterator operator++(int /* postincrement */) { + SplitIterator old(*this); + ++(*this); + return old; + } + + bool operator==(const SplitIterator& other) const { + // Two "end" iterators are always equal. If the two iterators being compared + // aren't both end iterators, then we fallback to comparing their fields. + // Importantly, the text being split must be equal and the current piece + // within the text being split must also be equal. The delimiter_ and + // predicate_ fields need not be checked here because they're template + // parameters that are already part of the SplitIterator's type. 
+ return (is_end_ && other.is_end_) || + (is_end_ == other.is_end_ && + text_ == other.text_ && + text_.data() == other.text_.data() && + curr_piece_ == other.curr_piece_ && + curr_piece_.data() == other.curr_piece_.data()); + } + + bool operator!=(const SplitIterator& other) const { + return !(*this == other); + } + + private: + // The text being split. Modified as delimited pieces are consumed. + StringPiece text_; + Delimiter delimiter_; + Predicate predicate_; + bool is_end_; + // Holds the currently split piece of text. Will always refer to string data + // within text_. This value is returned when the iterator is dereferenced. + StringPiece curr_piece_; +}; + +// Declares a functor that can convert a StringPiece to another type. This works +// for any type that has a constructor (explicit or not) taking a single +// StringPiece argument. A specialization exists for converting to string +// because the underlying data needs to be copied. In theory, these +// specializations could be extended to work with other types (e.g., int32), but +// then a solution for error reporting would need to be devised. +template +struct StringPieceTo { + To operator()(StringPiece from) const { + return To(from); + } +}; + +// Specialization for converting to string. +template <> +struct StringPieceTo { + string operator()(StringPiece from) const { + return from.ToString(); + } +}; + +// Specialization for converting to *const* string. +template <> +struct StringPieceTo { + string operator()(StringPiece from) const { + return from.ToString(); + } +}; + +#ifdef LANG_CXX11 +// IsNotInitializerList::type exists iff T is not an initializer_list. More +// details below in Splitter<> where this is used. 
+template +struct IsNotInitializerList { + typedef void type; +}; +template +struct IsNotInitializerList > {}; +#endif // LANG_CXX11 + +// This class implements the behavior of the split API by giving callers access +// to the underlying split substrings in various convenient ways, such as +// through iterators or implicit conversion functions. Do not construct this +// class directly, rather use the Split() function instead. +// +// Output containers can be collections of either StringPiece or string objects. +// StringPiece is more efficient because the underlying data will not need to be +// copied; the returned StringPieces will all refer to the data within the +// original input string. If a collection of string objects is used, then each +// substring will be copied. +// +// An optional Predicate functor may be supplied. This predicate will be used to +// filter the split strings: only strings for which the predicate returns true +// will be kept. A Predicate object is any unary functor that takes a +// StringPiece and returns bool. By default, the NoFilter predicate is used, +// which does not filter out anything. +template +class Splitter { + public: + typedef internal::SplitIterator Iterator; + + Splitter(StringPiece text, Delimiter d) + : begin_(text, d), end_(d) {} + + Splitter(StringPiece text, Delimiter d, Predicate p) + : begin_(text, d, p), end_(d, p) {} + + // Range functions that iterate the split substrings as StringPiece objects. + // These methods enable a Splitter to be used in a range-based for loop in + // C++11, for example: + // + // for (StringPiece sp : my_splitter) { + // DoWork(sp); + // } + const Iterator& begin() const { return begin_; } + const Iterator& end() const { return end_; } + +#ifdef LANG_CXX11 +// Support for default template arguments for function templates was added in +// C++11, but it is not allowed if compiled in C++98 compatibility mode. 
Since +// this code is under a LANG_CXX11 guard, we can safely ignore the +// -Wc++98-compat flag and use default template arguments on the implicit +// conversion operator below. +// +// This use of default template arguments on a function template was approved +// by tgs and sanjay on behalf of the c-style-arbiters in email thread +// +// All compiler flags are first saved with a diagnostic push and restored with a +// diagnostic pop below. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic ignored "-Wc++98-compat" + + // Uses SFINAE to restrict conversion to container-like types (by testing for + // the presence of a const_iterator member type) and also to disable + // conversion to an initializer_list (which also has a const_iterator). + // Otherwise, code compiled in C++11 will get an error due to ambiguous + // conversion paths (in C++11 vector::operator= is overloaded to take + // either a vector or an initializer_list). + // + // This trick was taken from util/gtl/container_literal.h + template ::type, + typename ContainerChecker = + typename Container::const_iterator> + operator Container() { + return SelectContainer::value>()(this); + } + +// Restores diagnostic settings, i.e., removes the "ignore" on -Wpragmas and +// -Wc++98-compat. +#pragma GCC diagnostic pop + +#else + // Not under LANG_CXX11 + template + operator Container() { + return SelectContainer::value>()(this); + } +#endif // LANG_CXX11 + + template + operator std::pair() { + return ToPair(); + } + + private: + // is_map::value is true iff there exists a type T::mapped_type. This is + // used to dispatch to one of the SelectContainer<> functors (below) from the + // implicit conversion operator (above). 
+ template + struct is_map { + template static base::big_ test(typename U::mapped_type*); + template static base::small_ test(...); + static const bool value = (sizeof(test(0)) == sizeof(base::big_)); + }; + + // Base template handles splitting to non-map containers + template + struct SelectContainer { + Container operator()(Splitter* splitter) const { + return splitter->template ToContainer(); + } + }; + + // Partial template specialization for splitting to map-like containers. + template + struct SelectContainer { + Container operator()(Splitter* splitter) const { + return splitter->template ToMap(); + } + }; + + // Inserts split results into the container. To do this the results are first + // stored in a vector. This is where the input text is actually + // "parsed". This vector is then used to possibly reserve space in the output + // container, and the StringPieces in "v" are converted as necessary to the + // output container's value type. + // + // The reason to use an intermediate vector of StringPiece is so we can learn + // the needed capacity of the output container. This is needed when the output + // container is a vector in which case resizes can be expensive due to + // copying of the ::string objects. + // + // At some point in the future we might add a C++11 move constructor to + // ::string, in which case the vector resizes are much less expensive and the + // use of this intermediate vector "v" can be removed. 
+ template + Container ToContainer() { + vector v; + for (Iterator it = begin(); it != end_; ++it) { + v.push_back(*it); + } + typedef typename Container::value_type ToType; + internal::StringPieceTo converter; + Container c; + ReserveCapacity(&c, v.size()); + std::insert_iterator inserter(c, c.begin()); + for (const auto& sp : v) { + *inserter++ = converter(sp); + } + return c; + } + + // The algorithm is to insert a new pair into the map for each even-numbered + // item, with the even-numbered item as the key with a default-constructed + // value. Each odd-numbered item will then be assigned to the last pair's + // value. + template + Map ToMap() { + typedef typename Map::key_type Key; + typedef typename Map::mapped_type Data; + Map m; + StringPieceTo key_converter; + StringPieceTo val_converter; + typename Map::iterator curr_pair; + bool is_even = true; + for (Iterator it = begin(); it != end_; ++it) { + if (is_even) { + curr_pair = InsertInMap(std::make_pair(key_converter(*it), Data()), &m); + } else { + curr_pair->second = val_converter(*it); + } + is_even = !is_even; + } + return m; + } + + // Returns a pair with its .first and .second members set to the first two + // strings returned by the begin() iterator. Either/both of .first and .second + // will be empty strings if the iterator doesn't have a corresponding value. + template + std::pair ToPair() { + StringPieceTo first_converter; + StringPieceTo second_converter; + StringPiece first, second; + Iterator it = begin(); + if (it != end()) { + first = *it; + if (++it != end()) { + second = *it; + } + } + return std::make_pair(first_converter(first), second_converter(second)); + } + + // Overloaded InsertInMap() function. The first overload is the commonly used + // one for most map-like objects. The second overload is a special case for + // multimap, because multimap's insert() member function directly returns an + // iterator, rather than a pair like map's. 
+ template + typename Map::iterator InsertInMap( + const typename Map::value_type& value, Map* map) { + return map->insert(value).first; + } + + // InsertInMap overload for multimap. + template + typename std::multimap::iterator InsertInMap( + const typename std::multimap::value_type& value, + typename std::multimap* map) { + return map->insert(value); + } + + // Reserves the given amount of capacity in a vector + template + void ReserveCapacity(vector* v, size_t size) { + v->reserve(size); + } + void ReserveCapacity(...) {} + + const Iterator begin_; + const Iterator end_; +}; + +} // namespace internal + +} // namespace strings + +#endif // STRINGS_SPLIT_INTERNAL_H_ diff --git a/src/kudu/gutil/strings/strcat.cc b/src/kudu/gutil/strings/strcat.cc new file mode 100644 index 000000000000..93f8114c6c77 --- /dev/null +++ b/src/kudu/gutil/strings/strcat.cc @@ -0,0 +1,252 @@ +// Copyright 2008 and onwards Google Inc. All rights reserved. + +#include "kudu/gutil/strings/strcat.h" + +#include +#include +#include +#include + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/stl_util.h" + +AlphaNum gEmptyAlphaNum(""); + +// ---------------------------------------------------------------------- +// StrCat() +// This merges the given strings or integers, with no delimiter. This +// is designed to be the fastest possible way to construct a string out +// of a mix of raw C strings, StringPieces, strings, and integer values. +// ---------------------------------------------------------------------- + +// Append is merely a version of memcpy that returns the address of the byte +// after the area just overwritten. It comes in multiple flavors to minimize +// call overhead. 
+static char *Append1(char *out, const AlphaNum &x) { + memcpy(out, x.data(), x.size()); + return out + x.size(); +} + +static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) { + memcpy(out, x1.data(), x1.size()); + out += x1.size(); + + memcpy(out, x2.data(), x2.size()); + return out + x2.size(); +} + +static char *Append4(char *out, + const AlphaNum &x1, const AlphaNum &x2, + const AlphaNum &x3, const AlphaNum &x4) { + memcpy(out, x1.data(), x1.size()); + out += x1.size(); + + memcpy(out, x2.data(), x2.size()); + out += x2.size(); + + memcpy(out, x3.data(), x3.size()); + out += x3.size(); + + memcpy(out, x4.data(), x4.size()); + return out + x4.size(); +} + +string StrCat(const AlphaNum &a) { + return string(a.data(), a.size()); +} + +string StrCat(const AlphaNum &a, const AlphaNum &b) { + string result; + STLStringResizeUninitialized(&result, a.size() + b.size()); + char *const begin = &*result.begin(); + char *out = Append2(begin, a, b); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { + string result; + STLStringResizeUninitialized(&result, a.size() + b.size() + c.size()); + char *const begin = &*result.begin(); + char *out = Append2(begin, a, b); + out = Append1(out, c); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d) { + string result; + STLStringResizeUninitialized(&result, + a.size() + b.size() + c.size() + d.size()); + char *const begin = &*result.begin(); + char *out = Append4(begin, a, b, c, d); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e) { + string result; + STLStringResizeUninitialized(&result, + a.size() + b.size() + c.size() + d.size() + e.size()); + char *const begin = &*result.begin(); + char *out = 
Append4(begin, a, b, c, d); + out = Append1(out, e); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) { + string result; + STLStringResizeUninitialized(&result, + a.size() + b.size() + c.size() + d.size() + e.size() + f.size()); + char *const begin = &*result.begin(); + char *out = Append4(begin, a, b, c, d); + out = Append2(out, e, f); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g) { + string result; + STLStringResizeUninitialized(&result, + a.size() + b.size() + c.size() + d.size() + e.size() + + f.size() + g.size()); + char *const begin = &*result.begin(); + char *out = Append4(begin, a, b, c, d); + out = Append2(out, e, f); + out = Append1(out, g); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h) { + string result; + STLStringResizeUninitialized(&result, + a.size() + b.size() + c.size() + d.size() + e.size() + + f.size() + g.size() + h.size()); + char *const begin = &*result.begin(); + char *out = Append4(begin, a, b, c, d); + out = Append4(out, e, f, g, h); + DCHECK_EQ(out, begin + result.size()); + return result; +} + +namespace strings { +namespace internal { + +// StrCat with this many params is exceedingly rare, but it has been +// requested... therefore we'll rely on default arguments to make calling +// slightly less efficient, to preserve code size. +string StrCatNineOrMore(const AlphaNum *a, ...) 
{ + string result; + + va_list args; + va_start(args, a); + size_t size = a->size(); + while (const AlphaNum *arg = va_arg(args, const AlphaNum *)) { + size += arg->size(); + } + STLStringResizeUninitialized(&result, size); + va_end(args); + va_start(args, a); + char *const begin = &*result.begin(); + char *out = Append1(begin, *a); + while (const AlphaNum *arg = va_arg(args, const AlphaNum *)) { + out = Append1(out, *arg); + } + va_end(args); + DCHECK_EQ(out, begin + size); + return result; +} + +} // namespace internal +} // namespace strings + +// It's possible to call StrAppend with a StringPiece that is itself a fragment +// of the string we're appending to. However the results of this are random. +// Therefore, check for this in debug mode. Use unsigned math so we only have +// to do one comparison. +#define DCHECK_NO_OVERLAP(dest, src) \ + DCHECK_GT(uintptr_t((src).data() - (dest).data()), uintptr_t((dest).size())) + +void StrAppend(string *result, const AlphaNum &a) { + DCHECK_NO_OVERLAP(*result, a); + result->append(a.data(), a.size()); +} + +void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + string::size_type old_size = result->size(); + STLStringResizeUninitialized(result, old_size + a.size() + b.size()); + char *const begin = &*result->begin(); + char *out = Append2(begin + old_size, a, b); + DCHECK_EQ(out, begin + result->size()); +} + +void StrAppend(string *result, + const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + DCHECK_NO_OVERLAP(*result, c); + string::size_type old_size = result->size(); + STLStringResizeUninitialized(result, + old_size + a.size() + b.size() + c.size()); + char *const begin = &*result->begin(); + char *out = Append2(begin + old_size, a, b); + out = Append1(out, c); + DCHECK_EQ(out, begin + result->size()); +} + +void StrAppend(string *result, + const AlphaNum &a, 
const AlphaNum &b, + const AlphaNum &c, const AlphaNum &d) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + DCHECK_NO_OVERLAP(*result, c); + DCHECK_NO_OVERLAP(*result, d); + string::size_type old_size = result->size(); + STLStringResizeUninitialized(result, + old_size + a.size() + b.size() + c.size() + d.size()); + char *const begin = &*result->begin(); + char *out = Append4(begin + old_size, a, b, c, d); + DCHECK_EQ(out, begin + result->size()); +} + +// StrAppend with this many params is even rarer than with StrCat. +// Therefore we'll again rely on default arguments to make calling +// slightly less efficient, to preserve code size. +void StrAppend(string *result, + const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { + DCHECK_NO_OVERLAP(*result, a); + DCHECK_NO_OVERLAP(*result, b); + DCHECK_NO_OVERLAP(*result, c); + DCHECK_NO_OVERLAP(*result, d); + DCHECK_NO_OVERLAP(*result, e); + DCHECK_NO_OVERLAP(*result, f); + DCHECK_NO_OVERLAP(*result, g); + DCHECK_NO_OVERLAP(*result, h); + DCHECK_NO_OVERLAP(*result, i); + string::size_type old_size = result->size(); + STLStringResizeUninitialized(result, + old_size + a.size() + b.size() + c.size() + d.size() + + e.size() + f.size() + g.size() + h.size() + i.size()); + char *const begin = &*result->begin(); + char *out = Append4(begin + old_size, a, b, c, d); + out = Append4(out, e, f, g, h); + out = Append1(out, i); + DCHECK_EQ(out, begin + result->size()); +} diff --git a/src/kudu/gutil/strings/strcat.h b/src/kudu/gutil/strings/strcat.h new file mode 100644 index 000000000000..40b888b6cf5d --- /dev/null +++ b/src/kudu/gutil/strings/strcat.h @@ -0,0 +1,380 @@ +// Copyright 2008 and onwards Google, Inc. +// +// #status: RECOMMENDED +// #category: operations on strings +// #summary: Merges strings or numbers with no delimiter. 
+// +#ifndef STRINGS_STRCAT_H_ +#define STRINGS_STRCAT_H_ + +#include +using std::string; + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/stringpiece.h" + +// The AlphaNum type was designed to be used as the parameter type for StrCat(). +// I suppose that any routine accepting either a string or a number could accept +// it. The basic idea is that by accepting a "const AlphaNum &" as an argument +// to your function, your callers will automagically convert bools, integers, +// and floating point values to strings for you. +// +// Conversion from 8-bit values is not accepted because if it were, then an +// attempt to pass ':' instead of ":" might result in a 58 ending up in your +// result. +// +// Bools convert to "0" or "1". +// +// Floating point values are converted to a string which, if passed to strtod(), +// would produce the exact same original double (except in case of NaN; all NaNs +// are considered the same value). We try to keep the string short but it's not +// guaranteed to be as short as possible. +// +// This class has implicit constructors. +// Style guide exception granted: +// http://goto/style-guide-exception-20978288 +// +struct AlphaNum { + StringPiece piece; + char digits[kFastToBufferSize]; + + // No bool ctor -- bools convert to an integral type. + // A bool ctor would also convert incoming pointers (bletch). 
+ + AlphaNum(int32 i32) // NOLINT(runtime/explicit) + : piece(digits, FastInt32ToBufferLeft(i32, digits) - &digits[0]) {} + AlphaNum(uint32 u32) // NOLINT(runtime/explicit) + : piece(digits, FastUInt32ToBufferLeft(u32, digits) - &digits[0]) {} + AlphaNum(int64 i64) // NOLINT(runtime/explicit) + : piece(digits, FastInt64ToBufferLeft(i64, digits) - &digits[0]) {} + AlphaNum(uint64 u64) // NOLINT(runtime/explicit) + : piece(digits, FastUInt64ToBufferLeft(u64, digits) - &digits[0]) {} + +#if defined(__APPLE__) + AlphaNum(size_t size) // NOLINT(runtime/explicit) + : piece(digits, FastUInt64ToBufferLeft(size, digits) - &digits[0]) {} +#endif + + AlphaNum(float f) // NOLINT(runtime/explicit) + : piece(digits, strlen(FloatToBuffer(f, digits))) {} + AlphaNum(double f) // NOLINT(runtime/explicit) + : piece(digits, strlen(DoubleToBuffer(f, digits))) {} + + AlphaNum(const char *c_str) : piece(c_str) {} // NOLINT(runtime/explicit) + AlphaNum(StringPiece pc) + : piece(std::move(pc)) {} // NOLINT(runtime/explicit) + AlphaNum(const string &s) : piece(s) {} // NOLINT(runtime/explicit) + + StringPiece::size_type size() const { return piece.size(); } + const char *data() const { return piece.data(); } + + private: + // Use ":" not ':' + AlphaNum(char c); // NOLINT(runtime/explicit) +}; + +extern AlphaNum gEmptyAlphaNum; + +// ---------------------------------------------------------------------- +// StrCat() +// This merges the given strings or numbers, with no delimiter. This +// is designed to be the fastest possible way to construct a string out +// of a mix of raw C strings, StringPieces, strings, bool values, +// and numeric values. +// +// Don't use this for user-visible strings. The localization process +// works poorly on strings built up out of fragments. +// +// For clarity and performance, don't use StrCat when appending to a +// string. In particular, avoid using any of these (anti-)patterns: +// str.append(StrCat(...) +// str += StrCat(...) +// str = StrCat(str, ...) 
+// where the last is the worse, with the potential to change a loop +// from a linear time operation with O(1) dynamic allocations into a +// quadratic time operation with O(n) dynamic allocations. StrAppend +// is a better choice than any of the above, subject to the restriction +// of StrAppend(&str, a, b, c, ...) that none of the a, b, c, ... may +// be a reference into str. +// ---------------------------------------------------------------------- + +string StrCat(const AlphaNum &a); +string StrCat(const AlphaNum &a, const AlphaNum &b); +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c); +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d); +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e); +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f); +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g); +string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h); + +namespace strings { +namespace internal { + +// Do not call directly - this is not part of the public API. 
+string StrCatNineOrMore(const AlphaNum *a1, ...); + +} // namespace internal +} // namespace strings + +// Support 9 or more arguments +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m) { + const AlphaNum* null_alphanum = NULL; + return 
strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q) { + const AlphaNum* null_alphanum = NULL; + return 
strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, 
const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t, const AlphaNum &u) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, &u, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t, const AlphaNum &u, + const AlphaNum &v) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, &u, &v, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t, const AlphaNum &u, + const AlphaNum &v, const AlphaNum &w) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, &u, &v, &w, null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, 
const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t, const AlphaNum &u, + const AlphaNum &v, const AlphaNum &w, const AlphaNum &x) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, &u, &v, &w, &x, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t, const AlphaNum &u, + const AlphaNum &v, const AlphaNum &w, const AlphaNum &x, + const AlphaNum &y) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, &u, &v, &w, &x, &y, + null_alphanum); +} + +inline string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c, + const AlphaNum &d, const AlphaNum &e, const AlphaNum &f, + const AlphaNum &g, const AlphaNum &h, const AlphaNum &i, + const AlphaNum &j, const AlphaNum &k, const AlphaNum &l, + const AlphaNum &m, const AlphaNum &n, const AlphaNum &o, + const AlphaNum &p, const AlphaNum &q, const AlphaNum &r, + const AlphaNum &s, const AlphaNum &t, const AlphaNum &u, + const AlphaNum &v, const AlphaNum &w, const AlphaNum &x, + const AlphaNum &y, const AlphaNum &z) { + const AlphaNum* null_alphanum = NULL; + return strings::internal::StrCatNineOrMore(&a, &b, &c, &d, &e, &f, &g, &h, &i, + &j, &k, &l, &m, &n, &o, &p, &q, &r, + &s, &t, &u, &v, &w, &x, &y, &z, + 
null_alphanum); +} + +// ---------------------------------------------------------------------- +// StrAppend() +// Same as above, but adds the output to the given string. +// WARNING: For speed, StrAppend does not try to check each of its input +// arguments to be sure that they are not a subset of the string being +// appended to. That is, while this will work: +// +// string s = "foo"; +// s += s; +// +// This will not (necessarily) work: +// +// string s = "foo"; +// StrAppend(&s, s); +// +// Note: while StrCat supports appending up to 12 arguments, StrAppend +// is currently limited to 9. That's rarely an issue except when +// automatically transforming StrCat to StrAppend, and can easily be +// worked around as consecutive calls to StrAppend are quite efficient. +// ---------------------------------------------------------------------- + +void StrAppend(string *dest, const AlphaNum &a); +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b); +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c); +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c, const AlphaNum &d); + +// Support up to 9 params by using a default empty AlphaNum. +void StrAppend(string *dest, const AlphaNum &a, const AlphaNum &b, + const AlphaNum &c, const AlphaNum &d, const AlphaNum &e, + const AlphaNum &f = gEmptyAlphaNum, + const AlphaNum &g = gEmptyAlphaNum, + const AlphaNum &h = gEmptyAlphaNum, + const AlphaNum &i = gEmptyAlphaNum); + +#endif // STRINGS_STRCAT_H_ diff --git a/src/kudu/gutil/strings/string_util-test.cc b/src/kudu/gutil/strings/string_util-test.cc new file mode 100644 index 000000000000..8849ca294257 --- /dev/null +++ b/src/kudu/gutil/strings/string_util-test.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions Copyright 2013 The Chromium Authors. All rights reserved. +#include "kudu/gutil/strings/util.h" + +#include + +namespace kudu { + +TEST(StringUtilTest, MatchPatternTest) { + EXPECT_TRUE(MatchPattern("www.google.com", "*.com")); + EXPECT_TRUE(MatchPattern("www.google.com", "*")); + EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org")); + EXPECT_TRUE(MatchPattern("Hello", "H?l?o")); + EXPECT_FALSE(MatchPattern("www.google.com", "http://*)")); + EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM")); + EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*")); + EXPECT_FALSE(MatchPattern("", "*.*")); + EXPECT_TRUE(MatchPattern("", "*")); + EXPECT_TRUE(MatchPattern("", "?")); + EXPECT_TRUE(MatchPattern("", "")); + EXPECT_FALSE(MatchPattern("Hello", "")); + EXPECT_TRUE(MatchPattern("Hello*", "Hello*")); + // Stop after a certain recursion depth. + EXPECT_FALSE(MatchPattern("123456789012345678", "?????????????????*")); + + // Test UTF8 matching. + EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0")); + EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?.")); + EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*")); + // Invalid sequences should be handled as a single invalid character. 
+ EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?")); + // If the pattern has invalid characters, it shouldn't match anything. + EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80")); + + // This test verifies that consecutive wild cards are collapsed into 1 + // wildcard (when this doesn't occur, MatchPattern reaches it's maximum + // recursion depth). + EXPECT_TRUE(MatchPattern("Hello" , + "He********************************o")) ; +} + +} // namespace kudu diff --git a/src/kudu/gutil/strings/stringpiece.cc b/src/kudu/gutil/strings/stringpiece.cc new file mode 100644 index 000000000000..d6f23ee9b827 --- /dev/null +++ b/src/kudu/gutil/strings/stringpiece.cc @@ -0,0 +1,224 @@ +// Copyright 2004 and onwards Google Inc. +// +// + +#include "kudu/gutil/strings/stringpiece.h" + +#include +#include +#include +#include +#include + +#include "kudu/gutil/hash/hash.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/memutil.h" + +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +using std::string; + +namespace std { + size_t hash::operator()(StringPiece s) const { + return HashTo32(s.data(), s.size()); + } +} // namespace std + +std::ostream& operator<<(std::ostream& o, StringPiece piece) { + o.write(piece.data(), piece.size()); + return o; +} + +StringPiece::StringPiece(StringPiece x, int pos) + : ptr_(x.ptr_ + pos), length_(x.length_ - pos) { + DCHECK_LE(0, pos); + DCHECK_LE(pos, x.length_); +} + +StringPiece::StringPiece(StringPiece x, int pos, int len) + : ptr_(x.ptr_ + pos), length_(min(len, x.length_ - pos)) { + DCHECK_LE(0, pos); + DCHECK_LE(pos, x.length_); + DCHECK_GE(len, 0); +} + +void StringPiece::CopyToString(string* target) const { + STLAssignToString(target, ptr_, length_); +} + +void StringPiece::AppendToString(string* target) const { + STLAppendToString(target, ptr_, length_); +} + +int StringPiece::copy(char* buf, 
size_type n, size_type pos) const { + int ret = min(length_ - pos, n); + memcpy(buf, ptr_ + pos, ret); + return ret; +} + +bool StringPiece::contains(StringPiece s) const { + return find(s, 0) != npos; +} + +int StringPiece::find(StringPiece s, size_type pos) const { + if (length_ <= 0 || pos > static_cast(length_)) { + if (length_ == 0 && pos == 0 && s.length_ == 0) return 0; + return npos; + } + const char *result = memmatch(ptr_ + pos, length_ - pos, + s.ptr_, s.length_); + return result ? result - ptr_ : npos; +} + +int StringPiece::find(char c, size_type pos) const { + if (length_ <= 0 || pos >= static_cast(length_)) { + return npos; + } + const char* result = static_cast( + memchr(ptr_ + pos, c, length_ - pos)); + return result != nullptr ? result - ptr_ : npos; +} + +int StringPiece::rfind(StringPiece s, size_type pos) const { + if (length_ < s.length_) return npos; + const size_t ulen = length_; + if (s.length_ == 0) return min(ulen, pos); + + const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_; + const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); + return result != last ? result - ptr_ : npos; +} + +// Search range is [0..pos] inclusive. If pos == npos, search everything. +int StringPiece::rfind(char c, size_type pos) const { + // Note: memrchr() is not available on Windows. + if (length_ <= 0) return npos; + for (int i = min(pos, static_cast(length_ - 1)); + i >= 0; --i) { + if (ptr_[i] == c) { + return i; + } + } + return npos; +} + +// For each character in characters_wanted, sets the index corresponding +// to the ASCII code of that character to 1 in table. This is used by +// the find_.*_of methods below to tell whether or not a character is in +// the lookup table in constant time. +// The argument `table' must be an array that is large enough to hold all +// the possible values of an unsigned char. 
Thus it should be be declared +// as follows: +// bool table[UCHAR_MAX + 1] +static inline void BuildLookupTable(StringPiece characters_wanted, + bool* table) { + const int length = characters_wanted.length(); + const char* const data = characters_wanted.data(); + for (int i = 0; i < length; ++i) { + table[static_cast(data[i])] = true; + } +} + +int StringPiece::find_first_of(StringPiece s, size_type pos) const { + if (length_ <= 0 || s.length_ <= 0) { + return npos; + } + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) return find_first_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (int i = pos; i < length_; ++i) { + if (lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +int StringPiece::find_first_not_of(StringPiece s, size_type pos) const { + if (length_ <= 0) return npos; + if (s.length_ <= 0) return 0; + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) return find_first_not_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (int i = pos; i < length_; ++i) { + if (!lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +int StringPiece::find_first_not_of(char c, size_type pos) const { + if (length_ <= 0) return npos; + + for (; pos < static_cast(length_); ++pos) { + if (ptr_[pos] != c) { + return pos; + } + } + return npos; +} + +int StringPiece::find_last_of(StringPiece s, size_type pos) const { + if (length_ <= 0 || s.length_ <= 0) return npos; + // Avoid the cost of BuildLookupTable() for a single-character search. 
+ if (s.length_ == 1) return find_last_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (int i = min(pos, static_cast(length_ - 1)); + i >= 0; --i) { + if (lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +int StringPiece::find_last_not_of(StringPiece s, size_type pos) const { + if (length_ <= 0) return npos; + + int i = min(pos, static_cast(length_ - 1)); + if (s.length_ <= 0) return i; + + // Avoid the cost of BuildLookupTable() for a single-character search. + if (s.length_ == 1) return find_last_not_of(s.ptr_[0], pos); + + bool lookup[UCHAR_MAX + 1] = { false }; + BuildLookupTable(s, lookup); + for (; i >= 0; --i) { + if (!lookup[static_cast(ptr_[i])]) { + return i; + } + } + return npos; +} + +int StringPiece::find_last_not_of(char c, size_type pos) const { + if (length_ <= 0) return npos; + + for (int i = min(pos, static_cast(length_ - 1)); + i >= 0; --i) { + if (ptr_[i] != c) { + return i; + } + } + return npos; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { + if (pos > length_) pos = length_; + if (n > length_ - pos) n = length_ - pos; + return StringPiece(ptr_ + pos, n); +} + +const StringPiece::size_type StringPiece::npos = size_type(-1); diff --git a/src/kudu/gutil/strings/stringpiece.h b/src/kudu/gutil/strings/stringpiece.h new file mode 100644 index 000000000000..0f28f2b12cbd --- /dev/null +++ b/src/kudu/gutil/strings/stringpiece.h @@ -0,0 +1,377 @@ +// Copyright 2001, Google Inc. All rights reserved. +// Maintainer: mec@google.com (Michael Chastain) +// +// A StringPiece points to part or all of a string, Cord, double-quoted string +// literal, or other string-like object. A StringPiece does *not* own the +// string to which it points. A StringPiece is not null-terminated. +// +// You can use StringPiece as a function or method parameter. 
A StringPiece +// parameter can receive a double-quoted string literal argument, a "const +// char*" argument, a string argument, or a StringPiece argument with no data +// copying. Systematic use of StringPiece for arguments reduces data +// copies and strlen() calls. +// +// You may pass a StringPiece argument by value or const reference. +// Passing by value generates slightly smaller code. +// void MyFunction(const StringPiece& arg); +// // Slightly better, but same lifetime requirements as const-ref parameter: +// void MyFunction(StringPiece arg); +// +// StringPiece is also suitable for local variables if you know that +// the lifetime of the underlying object is longer than the lifetime +// of your StringPiece variable. +// +// Beware of binding a StringPiece to a temporary: +// StringPiece sp = obj.MethodReturningString(); // BAD: lifetime problem +// +// This code is okay: +// string str = obj.MethodReturningString(); // str owns its contents +// StringPiece sp(str); // GOOD, although you may not need sp at all +// +// StringPiece is sometimes a poor choice for a return value and usually a poor +// choice for a data member. If you do use a StringPiece this way, it is your +// responsibility to ensure that the object pointed to by the StringPiece +// outlives the StringPiece. +// +// A StringPiece may represent just part of a string; thus the name "Piece". +// For example, when splitting a string, vector is a natural data +// type for the output. For another example, a Cord is a non-contiguous, +// potentially very long string-like object. The Cord class has an interface +// that iteratively provides StringPiece objects that point to the +// successive pieces of a Cord object. +// +// A StringPiece is not null-terminated. If you write code that scans a +// StringPiece, you must check its length before reading any characters. +// Common idioms that work on null-terminated strings do not work on +// StringPiece objects. 
+// +// There are several ways to create a null StringPiece: +// StringPiece() +// StringPiece(NULL) +// StringPiece(NULL, 0) +// For all of the above, sp.data() == NULL, sp.length() == 0, +// and sp.empty() == true. Also, if you create a StringPiece with +// a non-NULL pointer then sp.data() != non-NULL. Once created, +// sp.data() will stay either NULL or not-NULL, except if you call +// sp.clear() or sp.set(). +// +// Thus, you can use StringPiece(NULL) to signal an out-of-band value +// that is different from other StringPiece values. This is similar +// to the way that const char* p1 = NULL; is different from +// const char* p2 = "";. +// +// There are many ways to create an empty StringPiece: +// StringPiece() +// StringPiece(NULL) +// StringPiece(NULL, 0) +// StringPiece("") +// StringPiece("", 0) +// StringPiece("abcdef", 0) +// StringPiece("abcdef"+6, 0) +// For all of the above, sp.length() will be 0 and sp.empty() will be true. +// For some empty StringPiece values, sp.data() will be NULL. +// For some empty StringPiece values, sp.data() will not be NULL. +// +// Be careful not to confuse: null StringPiece and empty StringPiece. +// The set of empty StringPieces properly includes the set of null StringPieces. +// That is, every null StringPiece is an empty StringPiece, +// but some non-null StringPieces are empty Stringpieces too. +// +// All empty StringPiece values compare equal to each other. +// Even a null StringPieces compares equal to a non-null empty StringPiece: +// StringPiece() == StringPiece("", 0) +// StringPiece(NULL) == StringPiece("abc", 0) +// StringPiece(NULL, 0) == StringPiece("abcdef"+6, 0) +// +// Look carefully at this example: +// StringPiece("") == NULL +// True or false? TRUE, because StringPiece::operator== converts +// the right-hand side from NULL to StringPiece(NULL), +// and then compares two zero-length spans of characters. +// However, we are working to make this example produce a compile error. 
+// +// Suppose you want to write: +// bool TestWhat?(StringPiece sp) { return sp == NULL; } // BAD +// Do not do that. Write one of these instead: +// bool TestNull(StringPiece sp) { return sp.data() == NULL; } +// bool TestEmpty(StringPiece sp) { return sp.empty(); } +// The intent of TestWhat? is unclear. Did you mean TestNull or TestEmpty? +// Right now, TestWhat? behaves likes TestEmpty. +// We are working to make TestWhat? produce a compile error. +// TestNull is good to test for an out-of-band signal. +// TestEmpty is good to test for an empty StringPiece. +// +// Caveats (again): +// (1) The lifetime of the pointed-to string (or piece of a string) +// must be longer than the lifetime of the StringPiece. +// (2) There may or may not be a '\0' character after the end of +// StringPiece data. +// (3) A null StringPiece is empty. +// An empty StringPiece may or may not be a null StringPiece. + +#ifndef STRINGS_STRINGPIECE_H_ +#define STRINGS_STRINGPIECE_H_ + + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/type_traits.h" +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/gutil/hash/hash.h" + +class StringPiece { + private: + const char* ptr_; + int length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. 
+ // + // Style guide exception granted: + // http://goto/style-guide-exception-20978288 + StringPiece() : ptr_(NULL), length_(0) {} + StringPiece(const char* str) // NOLINT(runtime/explicit) + : ptr_(str), length_(0) { + if (str != NULL) { + size_t length = strlen(str); + assert(length <= static_cast(std::numeric_limits::max())); + length_ = static_cast(length); + } + } + StringPiece(const std::string& str) // NOLINT(runtime/explicit) + : ptr_(str.data()), length_(0) { + size_t length = str.size(); + assert(length <= static_cast(std::numeric_limits::max())); + length_ = static_cast(length); + } + StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { + assert(len >= 0); + } + + // Substring of another StringPiece. + // pos must be non-negative and <= x.length(). + StringPiece(StringPiece x, int pos); + // Substring of another StringPiece. + // pos must be non-negative and <= x.length(). + // len must be non-negative and will be pinned to at most x.length() - pos. + StringPiece(StringPiece x, int pos, int len); + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. 
+ const char* data() const { return ptr_; } + int size() const { return length_; } + int length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { + ptr_ = NULL; + length_ = 0; + } + + void set(const char* data, int len) { + assert(len >= 0); + ptr_ = data; + length_ = len; + } + + void set(const char* str) { + ptr_ = str; + if (str != NULL) + length_ = static_cast(strlen(str)); + else + length_ = 0; + } + void set(const void* data, int len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](int i) const { + assert(0 <= i); + assert(i < length_); + return ptr_[i]; + } + + void remove_prefix(int n) { + assert(length_ >= n); + ptr_ += n; + length_ -= n; + } + + void remove_suffix(int n) { + assert(length_ >= n); + length_ -= n; + } + + // returns {-1, 0, 1} + int compare(StringPiece x) const { + const int min_size = length_ < x.length_ ? length_ : x.length_; + int r = memcmp(ptr_, x.ptr_, min_size); + if (r < 0) return -1; + if (r > 0) return 1; + if (length_ < x.length_) return -1; + if (length_ > x.length_) return 1; + return 0; + } + + std::string as_string() const { + return ToString(); + } + // We also define ToString() here, since many other string-like + // interfaces name the routine that converts to a C++ string + // "ToString", and it's confusing to have the method that does that + // for a StringPiece be called "as_string()". We also leave the + // "as_string()" method defined here for existing code. 
+ std::string ToString() const { + if (ptr_ == NULL) return std::string(); + return std::string(data(), size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + bool starts_with(StringPiece x) const { + return (length_ >= x.length_) && (memcmp(ptr_, x.ptr_, x.length_) == 0); + } + + bool ends_with(StringPiece x) const { + return ((length_ >= x.length_) && + (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + // STLS says return size_type, but Google says return int + int max_size() const { return length_; } + int capacity() const { return length_; } + + // cpplint.py emits a false positive [build/include_what_you_use] + int copy(char* buf, size_type n, size_type pos = 0) const; // NOLINT + + bool contains(StringPiece s) const; + + int find(StringPiece s, size_type pos = 0) const; + int find(char c, size_type pos = 0) const; + int rfind(StringPiece s, size_type pos = npos) const; + int rfind(char c, size_type pos = npos) const; + + int find_first_of(StringPiece s, size_type pos = 0) const; + int find_first_of(char c, size_type pos = 0) const { return find(c, pos); } + int find_first_not_of(StringPiece s, size_type pos = 0) const; + int find_first_not_of(char c, size_type pos = 0) const; + 
int find_last_of(StringPiece s, size_type pos = npos) const; + int find_last_of(char c, size_type pos = npos) const { return rfind(c, pos); } + int find_last_not_of(StringPiece s, size_type pos = npos) const; + int find_last_not_of(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; +}; + +#ifndef SWIG +DECLARE_POD(StringPiece); // So vector becomes really fast +#endif + +// This large function is defined inline so that in a fairly common case where +// one of the arguments is a literal, the compiler can elide a lot of the +// following comparisons. +inline bool operator==(StringPiece x, StringPiece y) { + int len = x.size(); + if (len != y.size()) { + return false; + } + + return x.data() == y.data() || len <= 0 || + strings::memeq(x.data(), y.data(), len); +} + +inline bool operator!=(StringPiece x, StringPiece y) { + return !(x == y); +} + +inline bool operator<(StringPiece x, StringPiece y) { + const int min_size = x.size() < y.size() ? x.size() : y.size(); + const int r = memcmp(x.data(), y.data(), min_size); + return (r < 0) || (r == 0 && x.size() < y.size()); +} + +inline bool operator>(StringPiece x, StringPiece y) { + return y < x; +} + +inline bool operator<=(StringPiece x, StringPiece y) { + return !(x > y); +} + +inline bool operator>=(StringPiece x, StringPiece y) { + return !(x < y); +} +class StringPiece; +template struct GoodFastHash; + +// ------------------------------------------------------------------ +// Functions used to create STL containers that use StringPiece +// Remember that a StringPiece's lifetime had better be less than +// that of the underlying string or char*. If it is not, then you +// cannot safely store a StringPiece into an STL container +// ------------------------------------------------------------------ + +// SWIG doesn't know how to parse this stuff properly. Omit it. 
+#ifndef SWIG + +namespace std { +template<> struct hash { + size_t operator()(StringPiece s) const; +}; +} // namespace std + + +// An implementation of GoodFastHash for StringPiece. See +// GoodFastHash values. +template<> struct GoodFastHash { + size_t operator()(StringPiece s) const { + return HashStringThoroughly(s.data(), s.size()); + } + // Less than operator, for MSVC. + bool operator()(const StringPiece& s1, const StringPiece& s2) const { + return s1 < s2; + } + static const size_t bucket_size = 4; // These are required by MSVC + static const size_t min_buckets = 8; // 4 and 8 are defaults. +}; +#endif + +// allow StringPiece to be logged +extern ostream& operator<<(ostream& o, StringPiece piece); + + +#endif // STRINGS_STRINGPIECE_H__ diff --git a/src/kudu/gutil/strings/strip.cc b/src/kudu/gutil/strings/strip.cc new file mode 100644 index 000000000000..1a6a547add60 --- /dev/null +++ b/src/kudu/gutil/strings/strip.cc @@ -0,0 +1,384 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// based on contributions of various authors in strings/strutil_unittest.cc +// +// This file contains functions that remove a defined part from the string, +// i.e., strip the string. 
+ +#include "kudu/gutil/strings/strip.h" + +#include +#include +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include +using std::string; + +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/stringpiece.h" + +string StripPrefixString(StringPiece str, const StringPiece& prefix) { + if (str.starts_with(prefix)) + str.remove_prefix(prefix.length()); + return str.as_string(); +} + +bool TryStripPrefixString(StringPiece str, const StringPiece& prefix, + string* result) { + const bool has_prefix = str.starts_with(prefix); + if (has_prefix) + str.remove_prefix(prefix.length()); + str.as_string().swap(*result); + return has_prefix; +} + +string StripSuffixString(StringPiece str, const StringPiece& suffix) { + if (str.ends_with(suffix)) + str.remove_suffix(suffix.length()); + return str.as_string(); +} + +bool TryStripSuffixString(StringPiece str, const StringPiece& suffix, + string* result) { + const bool has_suffix = str.ends_with(suffix); + if (has_suffix) + str.remove_suffix(suffix.length()); + str.as_string().swap(*result); + return has_suffix; +} + +// ---------------------------------------------------------------------- +// StripString +// Replaces any occurrence of the character 'remove' (or the characters +// in 'remove') with the character 'replacewith'. 
+// ---------------------------------------------------------------------- +void StripString(char* str, StringPiece remove, char replacewith) { + for (; *str != '\0'; ++str) { + if (remove.find(*str) != StringPiece::npos) { + *str = replacewith; + } + } +} + +void StripString(char* str, int len, StringPiece remove, char replacewith) { + char* end = str + len; + for (; str < end; ++str) { + if (remove.find(*str) != StringPiece::npos) { + *str = replacewith; + } + } +} + +void StripString(string* s, StringPiece remove, char replacewith) { + for (char& c : *s) { + if (remove.find(c) != StringPiece::npos) { + c = replacewith; + } + } +} + +// ---------------------------------------------------------------------- +// StripWhiteSpace +// ---------------------------------------------------------------------- +void StripWhiteSpace(const char** str, int* len) { + // strip off trailing whitespace + while ((*len) > 0 && ascii_isspace((*str)[(*len)-1])) { + (*len)--; + } + + // strip off leading whitespace + while ((*len) > 0 && ascii_isspace((*str)[0])) { + (*len)--; + (*str)++; + } +} + +bool StripTrailingNewline(string* s) { + if (!s->empty() && (*s)[s->size() - 1] == '\n') { + if (s->size() > 1 && (*s)[s->size() - 2] == '\r') + s->resize(s->size() - 2); + else + s->resize(s->size() - 1); + return true; + } + return false; +} + +void StripWhiteSpace(string* str) { + int str_length = str->length(); + + // Strip off leading whitespace. + int first = 0; + while (first < str_length && ascii_isspace(str->at(first))) { + ++first; + } + // If entire string is white space. + if (first == str_length) { + str->clear(); + return; + } + if (first > 0) { + str->erase(0, first); + str_length -= first; + } + + // Strip off trailing whitespace. 
+ int last = str_length - 1; + while (last >= 0 && ascii_isspace(str->at(last))) { + --last; + } + if (last != (str_length - 1) && last >= 0) { + str->erase(last + 1, string::npos); + } +} + +// ---------------------------------------------------------------------- +// Misc. stripping routines +// ---------------------------------------------------------------------- +void StripCurlyBraces(string* s) { + return StripBrackets('{', '}', s); +} + +void StripBrackets(char left, char right, string* s) { + string::iterator opencurly = find(s->begin(), s->end(), left); + while (opencurly != s->end()) { + string::iterator closecurly = find(opencurly, s->end(), right); + if (closecurly == s->end()) + return; + opencurly = s->erase(opencurly, closecurly + 1); + opencurly = find(opencurly, s->end(), left); + } +} + +void StripMarkupTags(string* s) { + string::iterator openbracket = find(s->begin(), s->end(), '<'); + while (openbracket != s->end()) { + string::iterator closebracket = find(openbracket, s->end(), '>'); + if (closebracket == s->end()) { + s->erase(openbracket, closebracket); + return; + } + + openbracket = s->erase(openbracket, closebracket + 1); + openbracket = find(openbracket, s->end(), '<'); + } +} + +string OutputWithMarkupTagsStripped(const string& s) { + string result(s); + StripMarkupTags(&result); + return result; +} + + +int TrimStringLeft(string* s, const StringPiece& remove) { + int i = 0; + while (i < s->size() && memchr(remove.data(), (*s)[i], remove.size())) { + ++i; + } + if (i > 0) s->erase(0, i); + return i; +} + +int TrimStringRight(string* s, const StringPiece& remove) { + int i = s->size(), trimmed = 0; + while (i > 0 && memchr(remove.data(), (*s)[i-1], remove.size())) { + --i; + } + if (i < s->size()) { + trimmed = s->size() - i; + s->erase(i); + } + return trimmed; +} + +// ---------------------------------------------------------------------- +// Various removal routines +// 
---------------------------------------------------------------------- +int strrm(char* str, char c) { + char *src, *dest; + for (src = dest = str; *src != '\0'; ++src) + if (*src != c) *(dest++) = *src; + *dest = '\0'; + return dest - str; +} + +int memrm(char* str, int strlen, char c) { + char *src, *dest; + for (src = dest = str; strlen-- > 0; ++src) + if (*src != c) *(dest++) = *src; + return dest - str; +} + +int strrmm(char* str, const char* chars) { + char *src, *dest; + for (src = dest = str; *src != '\0'; ++src) { + bool skip = false; + for (const char* c = chars; *c != '\0'; c++) { + if (*src == *c) { + skip = true; + break; + } + } + if (!skip) *(dest++) = *src; + } + *dest = '\0'; + return dest - str; +} + +int strrmm(string* str, const string& chars) { + size_t str_len = str->length(); + size_t in_index = str->find_first_of(chars); + if (in_index == string::npos) + return str_len; + + size_t out_index = in_index++; + + while (in_index < str_len) { + char c = (*str)[in_index++]; + if (chars.find(c) == string::npos) + (*str)[out_index++] = c; + } + + str->resize(out_index); + return out_index; +} + +// ---------------------------------------------------------------------- +// StripDupCharacters +// Replaces any repeated occurrence of the character 'repeat_char' +// with single occurrence. 
e.g., +// StripDupCharacters("a//b/c//d", '/', 0) => "a/b/c/d" +// Return the number of characters removed +// ---------------------------------------------------------------------- +int StripDupCharacters(string* s, char dup_char, int start_pos) { + if (start_pos < 0) + start_pos = 0; + + // remove dups by compaction in-place + int input_pos = start_pos; // current reader position + int output_pos = start_pos; // current writer position + const int input_end = s->size(); + while (input_pos < input_end) { + // keep current character + const char curr_char = (*s)[input_pos]; + if (output_pos != input_pos) // must copy + (*s)[output_pos] = curr_char; + ++input_pos; + ++output_pos; + + if (curr_char == dup_char) { // skip subsequent dups + while ((input_pos < input_end) && ((*s)[input_pos] == dup_char)) + ++input_pos; + } + } + const int num_deleted = input_pos - output_pos; + s->resize(s->size() - num_deleted); + return num_deleted; +} + +// ---------------------------------------------------------------------- +// RemoveExtraWhitespace() +// Remove leading, trailing, and duplicate internal whitespace. +// ---------------------------------------------------------------------- +void RemoveExtraWhitespace(string* s) { + assert(s != nullptr); + // Empty strings clearly have no whitespace, and this code assumes that + // string length is greater than 0 + if (s->empty()) + return; + + int input_pos = 0; // current reader position + int output_pos = 0; // current writer position + const int input_end = s->size(); + // Strip off leading space + while (input_pos < input_end && ascii_isspace((*s)[input_pos])) input_pos++; + + while (input_pos < input_end - 1) { + char c = (*s)[input_pos]; + char next = (*s)[input_pos + 1]; + // Copy each non-whitespace character to the right position. + // For a block of whitespace, print the last one. 
+ if (!ascii_isspace(c) || !ascii_isspace(next)) { + if (output_pos != input_pos) { // only copy if needed + (*s)[output_pos] = c; + } + output_pos++; + } + input_pos++; + } + // Pick up the last character if needed. + char c = (*s)[input_end - 1]; + if (!ascii_isspace(c)) (*s)[output_pos++] = c; + + s->resize(output_pos); +} + +//------------------------------------------------------------------------ +// See comment in header file for a complete description. +//------------------------------------------------------------------------ +void StripLeadingWhiteSpace(string* str) { + char const* const leading = StripLeadingWhiteSpace( + const_cast(str->c_str())); + if (leading != nullptr) { + string const tmp(leading); + str->assign(tmp); + } else { + str->assign(""); + } +} + +void StripTrailingWhitespace(string* const s) { + string::size_type i; + for (i = s->size(); i > 0 && ascii_isspace((*s)[i - 1]); --i) { + } + + s->resize(i); +} + +// ---------------------------------------------------------------------- +// TrimRunsInString +// Removes leading and trailing runs, and collapses middle +// runs of a set of characters into a single character (the +// first one specified in 'remove'). Useful for collapsing +// runs of repeated delimiters, whitespace, etc. E.g., +// TrimRunsInString(&s, " :,()") removes leading and trailing +// delimiter chars and collapses and converts internal runs +// of delimiters to single ' ' characters, so, for example, +// " a:(b):c " -> "a b c" +// "first,last::(area)phone, ::zip" -> "first last area phone zip" +// ---------------------------------------------------------------------- +void TrimRunsInString(string* s, StringPiece remove) { + string::iterator dest = s->begin(); + string::iterator src_end = s->end(); + for (string::iterator src = s->begin(); src != src_end; ) { + if (remove.find(*src) == StringPiece::npos) { + *(dest++) = *(src++); + } else { + // Skip to the end of this run of chars that are in 'remove'. 
+ for (++src; src != src_end; ++src) { + if (remove.find(*src) == StringPiece::npos) { + if (dest != s->begin()) { + // This is an internal run; collapse it. + *(dest++) = remove[0]; + } + *(dest++) = *(src++); + break; + } + } + } + } + s->erase(dest, src_end); +} + +// ---------------------------------------------------------------------- +// RemoveNullsInString +// Removes any internal \0 characters from the string. +// ---------------------------------------------------------------------- +void RemoveNullsInString(string* s) { + s->erase(remove(s->begin(), s->end(), '\0'), s->end()); +} diff --git a/src/kudu/gutil/strings/strip.h b/src/kudu/gutil/strings/strip.h new file mode 100644 index 000000000000..8104b76d0ca6 --- /dev/null +++ b/src/kudu/gutil/strings/strip.h @@ -0,0 +1,272 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Refactored from contributions of various authors in strings/strutil.h +// +// This file contains functions that remove a defined part from the string, +// i.e., strip the string. + +#ifndef STRINGS_STRIP_H_ +#define STRINGS_STRIP_H_ + +#include +#include +using std::string; + +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/stringpiece.h" + +// Given a string and a putative prefix, returns the string minus the +// prefix string if the prefix matches, otherwise the original +// string. +string StripPrefixString(StringPiece str, const StringPiece& prefix); + +// Like StripPrefixString, but return true if the prefix was +// successfully matched. Write the output to *result. +// It is safe for result to point back to the input string. +bool TryStripPrefixString(StringPiece str, const StringPiece& prefix, + string* result); + +// Given a string and a putative suffix, returns the string minus the +// suffix string if the suffix matches, otherwise the original +// string. 
+string StripSuffixString(StringPiece str, const StringPiece& suffix); + + +// Like StripSuffixString, but return true if the suffix was +// successfully matched. Write the output to *result. +// It is safe for result to point back to the input string. +bool TryStripSuffixString(StringPiece str, const StringPiece& suffix, + string* result); + +// ---------------------------------------------------------------------- +// StripString +// Replaces any occurrence of the character 'remove' (or the characters +// in 'remove') with the character 'replacewith'. +// Good for keeping html characters or protocol characters (\t) out +// of places where they might cause a problem. +// ---------------------------------------------------------------------- +inline void StripString(char* str, char remove, char replacewith) { + for (; *str; str++) { + if (*str == remove) + *str = replacewith; + } +} + +void StripString(char* str, StringPiece remove, char replacewith); +void StripString(char* str, int len, StringPiece remove, char replacewith); +void StripString(string* s, StringPiece remove, char replacewith); + +// ---------------------------------------------------------------------- +// StripDupCharacters +// Replaces any repeated occurrence of the character 'dup_char' +// with single occurrence. e.g., +// StripDupCharacters("a//b/c//d", '/', 0) => "a/b/c/d" +// Return the number of characters removed +// ---------------------------------------------------------------------- +int StripDupCharacters(string* s, char dup_char, int start_pos); + +// ---------------------------------------------------------------------- +// StripWhiteSpace +// "Removes" whitespace from both sides of string. Pass in a pointer to an +// array of characters, and its length. The function changes the pointer +// and length to refer to a substring that does not contain leading or +// trailing spaces; it does not modify the string itself. 
If the caller is +// using NUL-terminated strings, it is the caller's responsibility to insert +// the NUL character at the end of the substring." +// +// Note: to be completely type safe, this function should be +// parameterized as a template: template void +// StripWhiteSpace(anyChar** str, int* len), where the expectation +// is that anyChar could be char, const char, w_char, const w_char, +// unicode_char, or any other character type we want. However, we +// just provided a version for char and const char. C++ is +// inconvenient, but correct, here. Ask Amit is you want to know +// the type safety details. +// ---------------------------------------------------------------------- +void StripWhiteSpace(const char** str, int* len); + +//------------------------------------------------------------------------ +// StripTrailingWhitespace() +// Removes whitespace at the end of the string *s. +//------------------------------------------------------------------------ +void StripTrailingWhitespace(string* s); + +//------------------------------------------------------------------------ +// StripTrailingNewline(string*) +// Strips the very last trailing newline or CR+newline from its +// input, if one exists. Useful for dealing with MapReduce's text +// input mode, which appends '\n' to each map input. Returns true +// if a newline was stripped. +//------------------------------------------------------------------------ +bool StripTrailingNewline(string* s); + +inline void StripWhiteSpace(char** str, int* len) { + // The "real" type for StripWhiteSpace is ForAll char types C, take + // (C, int) as input and return (C, int) as output. We're using the + // cast here to assert that we can take a char*, even though the + // function thinks it's assigning to const char*. 
+ StripWhiteSpace(const_cast(str), len); +} + +inline void StripWhiteSpace(StringPiece* str) { + const char* data = str->data(); + int len = str->size(); + StripWhiteSpace(&data, &len); + str->set(data, len); +} + +void StripWhiteSpace(string* str); + +namespace strings { + +template +inline void StripWhiteSpaceInCollection(Collection* collection) { + for (typename Collection::iterator it = collection->begin(); + it != collection->end(); ++it) + StripWhiteSpace(&(*it)); +} + +} // namespace strings + +// ---------------------------------------------------------------------- +// StripLeadingWhiteSpace +// "Removes" whitespace from beginning of string. Returns ptr to first +// non-whitespace character if one is present, NULL otherwise. Assumes +// "line" is null-terminated. +// ---------------------------------------------------------------------- + +inline const char* StripLeadingWhiteSpace(const char* line) { + // skip leading whitespace + while (ascii_isspace(*line)) + ++line; + + if ('\0' == *line) // end of line, no non-whitespace + return NULL; + + return line; +} + +// StripLeadingWhiteSpace for non-const strings. +inline char* StripLeadingWhiteSpace(char* line) { + return const_cast( + StripLeadingWhiteSpace(const_cast(line))); +} + +void StripLeadingWhiteSpace(string* str); + +// Remove leading, trailing, and duplicate internal whitespace. +void RemoveExtraWhitespace(string* s); + + +// ---------------------------------------------------------------------- +// SkipLeadingWhiteSpace +// Returns str advanced past white space characters, if any. +// Never returns NULL. "str" must be terminated by a null character. 
+// ---------------------------------------------------------------------- +inline const char* SkipLeadingWhiteSpace(const char* str) { + while (ascii_isspace(*str)) + ++str; + return str; +} + +inline char* SkipLeadingWhiteSpace(char* str) { + while (ascii_isspace(*str)) + ++str; + return str; +} + +// ---------------------------------------------------------------------- +// StripCurlyBraces +// Strips everything enclosed in pairs of curly braces and the curly +// braces. Doesn't touch open braces. It doesn't handle nested curly +// braces. This is used for removing things like {:stopword} from +// queries. +// StripBrackets does the same, but allows the caller to specify different +// left and right bracket characters, such as '(' and ')'. +// ---------------------------------------------------------------------- + +void StripCurlyBraces(string* s); +void StripBrackets(char left, char right, string* s); + + +// ---------------------------------------------------------------------- +// StripMarkupTags +// Strips everything enclosed in pairs of angle brackets and the angle +// brackets. +// This is used for stripping strings of markup; e.g. going from +// "the quick brown fox" to "the quick brown fox." +// If you want to skip entire sections of markup (e.g. the word "brown" +// too in that example), see webutil/pageutil/pageutil.h . +// This function was designed for stripping the bold tags (inserted by the +// docservers) from the titles of news stories being returned by RSS. +// This implementation DOES NOT cover all cases in html documents +// like tags that contain quoted angle-brackets, or HTML comment. 
+// For example A > B +// or +// See "perldoc -q html" +// ---------------------------------------------------------------------- + +void StripMarkupTags(string* s); +string OutputWithMarkupTagsStripped(const string& s); + +// ---------------------------------------------------------------------- +// TrimStringLeft +// Removes any occurrences of the characters in 'remove' from the start +// of the string. Returns the number of chars trimmed. +// ---------------------------------------------------------------------- +int TrimStringLeft(string* s, const StringPiece& remove); + +// ---------------------------------------------------------------------- +// TrimStringRight +// Removes any occurrences of the characters in 'remove' from the end +// of the string. Returns the number of chars trimmed. +// ---------------------------------------------------------------------- +int TrimStringRight(string* s, const StringPiece& remove); + +// ---------------------------------------------------------------------- +// TrimString +// Removes any occurrences of the characters in 'remove' from either +// end of the string. +// ---------------------------------------------------------------------- +inline int TrimString(string* s, const StringPiece& remove) { + return TrimStringRight(s, remove) + TrimStringLeft(s, remove); +} + +// ---------------------------------------------------------------------- +// TrimRunsInString +// Removes leading and trailing runs, and collapses middle +// runs of a set of characters into a single character (the +// first one specified in 'remove'). Useful for collapsing +// runs of repeated delimiters, whitespace, etc. 
E.g., +// TrimRunsInString(&s, " :,()") removes leading and trailing +// delimiter chars and collapses and converts internal runs +// of delimiters to single ' ' characters, so, for example, +// " a:(b):c " -> "a b c" +// "first,last::(area)phone, ::zip" -> "first last area phone zip" +// ---------------------------------------------------------------------- +void TrimRunsInString(string* s, StringPiece remove); + +// ---------------------------------------------------------------------- +// RemoveNullsInString +// Removes any internal \0 characters from the string. +// ---------------------------------------------------------------------- +void RemoveNullsInString(string* s); + +// ---------------------------------------------------------------------- +// strrm() +// memrm() +// Remove all occurrences of a given character from a string. +// Returns the new length. +// ---------------------------------------------------------------------- + +int strrm(char* str, char c); +int memrm(char* str, int strlen, char c); + +// ---------------------------------------------------------------------- +// strrmm() +// Remove all occurrences of a given set of characters from a string. +// Returns the new length. +// ---------------------------------------------------------------------- +int strrmm(char* str, const char* chars); +int strrmm(string* str, const string& chars); + +#endif // STRINGS_STRIP_H_ diff --git a/src/kudu/gutil/strings/substitute.cc b/src/kudu/gutil/strings/substitute.cc new file mode 100644 index 000000000000..245894bfffe0 --- /dev/null +++ b/src/kudu/gutil/strings/substitute.cc @@ -0,0 +1,133 @@ +// Copyright 2008 Google Inc. All rights reserved. 
+ +#include "kudu/gutil/strings/substitute.h" + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/stl_util.h" + +namespace strings { + +using internal::SubstituteArg; + +const SubstituteArg SubstituteArg::NoArg; + +// Returns the number of args in arg_array which were passed explicitly +// to Substitute(). +static int CountSubstituteArgs(const SubstituteArg* const* args_array) { + int count = 0; + while (args_array[count] != &SubstituteArg::NoArg) { + ++count; + } + return count; +} + +namespace internal { +int SubstitutedSize(StringPiece format, + const SubstituteArg* const* args_array) { + int size = 0; + for (int i = 0; i < format.size(); i++) { + if (format[i] == '$') { + if (i+1 >= format.size()) { + LOG(DFATAL) << "Invalid strings::Substitute() format string: \"" + << CEscape(format) << "\"."; + return 0; + } else if (ascii_isdigit(format[i+1])) { + int index = format[i+1] - '0'; + if (args_array[index]->size() == -1) { + LOG(DFATAL) + << "strings::Substitute format string invalid: asked for \"$" + << index << "\", but only " << CountSubstituteArgs(args_array) + << " args were given. Full format string was: \"" + << CEscape(format) << "\"."; + return 0; + } + size += args_array[index]->size(); + ++i; // Skip next char. + } else if (format[i+1] == '$') { + ++size; + ++i; // Skip next char. + } else { + LOG(DFATAL) << "Invalid strings::Substitute() format string: \"" + << CEscape(format) << "\"."; + return 0; + } + } else { + ++size; + } + } + return size; +} + +char* SubstituteToBuffer(StringPiece format, + const SubstituteArg* const* args_array, + char* target) { + for (int i = 0; i < format.size(); i++) { + if (format[i] == '$') { + if (ascii_isdigit(format[i+1])) { + const SubstituteArg* src = args_array[format[i+1] - '0']; + memcpy(target, src->data(), src->size()); + target += src->size(); + ++i; // Skip next char. 
+ } else if (format[i+1] == '$') { + *target++ = '$'; + ++i; // Skip next char. + } + } else { + *target++ = format[i]; + } + } + return target; +} + +} // namespace internal + +void SubstituteAndAppend( + string* output, StringPiece format, + const SubstituteArg& arg0, const SubstituteArg& arg1, + const SubstituteArg& arg2, const SubstituteArg& arg3, + const SubstituteArg& arg4, const SubstituteArg& arg5, + const SubstituteArg& arg6, const SubstituteArg& arg7, + const SubstituteArg& arg8, const SubstituteArg& arg9) { + const SubstituteArg* const args_array[] = { + &arg0, &arg1, &arg2, &arg3, &arg4, &arg5, &arg6, &arg7, &arg8, &arg9, nullptr + }; + + // Determine total size needed. + int size = SubstitutedSize(format, args_array); + if (size == 0) return; + + // Build the string. + int original_size = output->size(); + STLStringResizeUninitialized(output, original_size + size); + char* target = string_as_array(output) + original_size; + + target = SubstituteToBuffer(format, args_array, target); + DCHECK_EQ(target - output->data(), output->size()); +} + +SubstituteArg::SubstituteArg(const void* value) { + COMPILE_ASSERT(sizeof(scratch_) >= sizeof(value) * 2 + 2, + fix_sizeof_scratch_); + if (value == nullptr) { + text_ = "NULL"; + size_ = strlen(text_); + } else { + char* ptr = scratch_ + sizeof(scratch_); + uintptr_t num = reinterpret_cast(value); + static const char kHexDigits[] = "0123456789abcdef"; + do { + *--ptr = kHexDigits[num & 0xf]; + num >>= 4; + } while (num != 0); + *--ptr = 'x'; + *--ptr = '0'; + text_ = ptr; + size_ = scratch_ + sizeof(scratch_) - ptr; + } +} + +} // namespace strings diff --git a/src/kudu/gutil/strings/substitute.h b/src/kudu/gutil/strings/substitute.h new file mode 100644 index 000000000000..0812c3f4c921 --- /dev/null +++ b/src/kudu/gutil/strings/substitute.h @@ -0,0 +1,192 @@ +// Copyright 2008 Google Inc. All rights reserved. 
+ +#include +#include +using std::string; + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/stringpiece.h" + + +#ifndef STRINGS_SUBSTITUTE_H_ +#define STRINGS_SUBSTITUTE_H_ + +namespace strings { + +// ---------------------------------------------------------------------- +// strings::Substitute() +// strings::SubstituteAndAppend() +// Kind of like StringPrintf, but different. +// +// Example: +// string GetMessage(string first_name, string last_name, int age) { +// return strings::Substitute("My name is $0 $1 and I am $2 years old.", +// first_name, last_name, age); +// } +// +// Differences from StringPrintf: +// * The format string does not identify the types of arguments. +// Instead, the magic of C++ deals with this for us. See below +// for a list of accepted types. +// * Substitutions in the format string are identified by a '$' +// followed by a digit. So, you can use arguments out-of-order and +// use the same argument multiple times. +// * '$$' in the format string means output a literal '$' character. +// * It's much faster than StringPrintf. +// +// Supported types: +// * StringPiece (const char*, const string&) (NULL is equivalent to "") +// * Note that this means you do not have to add .c_str() to all of +// your strings. In fact, you shouldn't; it will be slower. +// * int32, int64, uint32, uint64 +// * float, double +// * bool: Printed as "true" or "false". +// * pointer types other than char*: Printed as "0x", +// except that NULL is printed as "NULL". +// +// If not enough arguments are supplied, a LOG(DFATAL) will be issued and +// the empty string will be returned. If too many arguments are supplied, +// just the first ones will be used (no warning). +// +// SubstituteAndAppend() is like Substitute() but appends the result to +// *output. 
Example: +// +// string str; +// strings::SubstituteAndAppend(&str, +// "My name is $0 $1 and I am $2 years old.", +// first_name, last_name, age); +// +// Substitute() is significantly faster than StringPrintf(). For very +// large strings, it may be orders of magnitude faster. +// ---------------------------------------------------------------------- + +namespace internal { // Implementation details. + +// This class has implicit constructors. +// Style guide exception granted: +// http://goto/style-guide-exception-20978288 + +class SubstituteArg { + public: + // We must explicitly overload char* so that the compiler doesn't try to + // cast it to bool to construct a DynamicSubstituteArg. Might as well + // overload const string& as well, since this allows us to avoid a temporary + // object. + inline SubstituteArg(const char* value) // NOLINT(runtime/explicit) + : text_(value), size_(value == NULL ? 0 : strlen(text_)) {} + inline SubstituteArg(const string& value) // NOLINT(runtime/explicit) + : text_(value.data()), size_(value.size()) {} + inline SubstituteArg(const StringPiece& value) // NOLINT(runtime/explicit) + : text_(value.data()), size_(value.size()) {} + + // Primitives + // We don't overload for signed and unsigned char because if people are + // explicitly declaring their chars as signed or unsigned then they are + // probably actually using them as 8-bit integers and would probably + // prefer an integer representation. But, we don't really know. So, we + // make the caller decide what to do. 
+ inline SubstituteArg(char value) // NOLINT(runtime/explicit) + : text_(scratch_), size_(1) { scratch_[0] = value; } + inline SubstituteArg(short value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_(FastInt32ToBufferLeft(value, scratch_) - scratch_) {} + inline SubstituteArg(unsigned short value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_(FastUInt32ToBufferLeft(value, scratch_) - scratch_) {} + inline SubstituteArg(int value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_(FastInt32ToBufferLeft(value, scratch_) - scratch_) {} + inline SubstituteArg(unsigned int value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_(FastUInt32ToBufferLeft(value, scratch_) - scratch_) {} + inline SubstituteArg(long value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_((sizeof(value) == 4 ? FastInt32ToBufferLeft(value, scratch_) + : FastInt64ToBufferLeft(value, scratch_)) + - scratch_) {} + inline SubstituteArg(unsigned long value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_((sizeof(value) == 4 ? FastUInt32ToBufferLeft(value, scratch_) + : FastUInt64ToBufferLeft(value, scratch_)) + - scratch_) {} + inline SubstituteArg(long long value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_(FastInt64ToBufferLeft(value, scratch_) - scratch_) {} + inline SubstituteArg(unsigned long long value) // NOLINT(runtime/explicit) + : text_(scratch_), + size_(FastUInt64ToBufferLeft(value, scratch_) - scratch_) {} + inline SubstituteArg(float value) // NOLINT(runtime/explicit) + : text_(FloatToBuffer(value, scratch_)), size_(strlen(text_)) {} + inline SubstituteArg(double value) // NOLINT(runtime/explicit) + : text_(DoubleToBuffer(value, scratch_)), size_(strlen(text_)) {} + inline SubstituteArg(bool value) // NOLINT(runtime/explicit) + : text_(value ? 
"true" : "false"), size_(strlen(text_)) {} + // void* values, with the exception of char*, are printed as + // StringPrintf with format "%p" would ("0x"), with the + // exception of NULL, which is printed as "NULL". + SubstituteArg(const void* value); // NOLINT(runtime/explicit) + + inline const char* data() const { return text_; } + inline int size() const { return size_; } + + // Indicates that no argument was given. + static const SubstituteArg NoArg; + + private: + inline SubstituteArg() : text_(NULL), size_(-1) {} + + const char* text_; + int size_; + char scratch_[kFastToBufferSize]; +}; + +// Return the length of the resulting string after performing the given +// substitution. +int SubstitutedSize(StringPiece format, + const SubstituteArg* const* args_array); + +// Perform the given substitution into 'target'. 'target' must have +// space for the result -- use SubstitutedSize() to determine how many +// bytes are required. Returns a pointer to the next byte following +// the result in 'target'. 
+char* SubstituteToBuffer(StringPiece format, + const SubstituteArg* const* args_array, + char* target); + +} // namespace internal + +void SubstituteAndAppend( + string* output, StringPiece format, + const internal::SubstituteArg& arg0 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg1 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg2 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg3 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg4 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg5 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg6 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg7 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg8 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg9 = internal::SubstituteArg::NoArg); + +inline string Substitute( + StringPiece format, + const internal::SubstituteArg& arg0 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg1 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg2 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg3 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg4 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg5 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg6 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg7 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg8 = internal::SubstituteArg::NoArg, + const internal::SubstituteArg& arg9 = internal::SubstituteArg::NoArg) { + string result; + SubstituteAndAppend(&result, format, arg0, arg1, arg2, arg3, arg4, + arg5, arg6, arg7, arg8, arg9); + return result; +} + +} // namespace strings + +#endif // STRINGS_SUBSTITUTE_H_ diff --git a/src/kudu/gutil/strings/util.cc b/src/kudu/gutil/strings/util.cc new file mode 100644 index 
000000000000..c16d9b2871d5 --- /dev/null +++ b/src/kudu/gutil/strings/util.cc @@ -0,0 +1,1218 @@ +// +// Copyright (C) 1999-2005 Google, Inc. +// + +// TODO(user): visit each const_cast. Some of them are no longer necessary +// because last Single Unix Spec and grte v2 are more const-y. + +#include "kudu/gutil/strings/util.h" + +#include +#include +#include +#include +#include // for FastTimeToBuffer() +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include +using std::string; +#include +using std::vector; + +#include +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/strings/ascii_ctype.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/stl_util.h" // for string_as_array, STLAppendToString +#include "kudu/gutil/utf/utf.h" + +#ifdef OS_WINDOWS +#ifdef min // windows.h defines this to something silly +#undef min +#endif +#endif + +// Use this instead of gmtime_r if you want to build for Windows. +// Windows doesn't have a 'gmtime_r', but it has the similar 'gmtime_s'. +// TODO(user): Probably belongs in //base:time_support.{cc|h}. +static struct tm* PortableSafeGmtime(const time_t* timep, struct tm* result) { +#ifdef OS_WINDOWS + return gmtime_s(result, timep) == 0 ? 
result : NULL; +#else + return gmtime_r(timep, result); +#endif // OS_WINDOWS +} + +char* strnstr(const char* haystack, const char* needle, + size_t haystack_len) { + if (*needle == '\0') { + return const_cast(haystack); + } + size_t needle_len = strlen(needle); + char* where; + while ((where = strnchr(haystack, *needle, haystack_len)) != nullptr) { + if (where - haystack + needle_len > haystack_len) { + return nullptr; + } + if (strncmp(where, needle, needle_len) == 0) { + return where; + } + haystack_len -= where + 1 - haystack; + haystack = where + 1; + } + return nullptr; +} + +const char* strnprefix(const char* haystack, int haystack_size, + const char* needle, int needle_size) { + if (needle_size > haystack_size) { + return nullptr; + } else { + if (strncmp(haystack, needle, needle_size) == 0) { + return haystack + needle_size; + } else { + return nullptr; + } + } +} + +const char* strncaseprefix(const char* haystack, int haystack_size, + const char* needle, int needle_size) { + if (needle_size > haystack_size) { + return nullptr; + } else { + if (strncasecmp(haystack, needle, needle_size) == 0) { + return haystack + needle_size; + } else { + return nullptr; + } + } +} + +char* strcasesuffix(char* str, const char* suffix) { + const int lenstr = strlen(str); + const int lensuffix = strlen(suffix); + char* strbeginningoftheend = str + lenstr - lensuffix; + + if (lenstr >= lensuffix && 0 == strcasecmp(strbeginningoftheend, suffix)) { + return (strbeginningoftheend); + } else { + return (nullptr); + } +} + +const char* strnsuffix(const char* haystack, int haystack_size, + const char* needle, int needle_size) { + if (needle_size > haystack_size) { + return nullptr; + } else { + const char* start = haystack + haystack_size - needle_size; + if (strncmp(start, needle, needle_size) == 0) { + return start; + } else { + return nullptr; + } + } +} + +const char* strncasesuffix(const char* haystack, int haystack_size, + const char* needle, int needle_size) { + if 
(needle_size > haystack_size) { + return nullptr; + } else { + const char* start = haystack + haystack_size - needle_size; + if (strncasecmp(start, needle, needle_size) == 0) { + return start; + } else { + return nullptr; + } + } +} + +char* strchrnth(const char* str, const char& c, int n) { + if (str == nullptr) + return nullptr; + if (n <= 0) + return const_cast(str); + const char* sp; + int k = 0; + for (sp = str; *sp != '\0'; sp ++) { + if (*sp == c) { + ++k; + if (k >= n) + break; + } + } + return (k < n) ? nullptr : const_cast(sp); +} + +char* AdjustedLastPos(const char* str, char separator, int n) { + if ( str == nullptr ) + return nullptr; + const char* pos = nullptr; + if ( n > 0 ) + pos = strchrnth(str, separator, n); + + // if n <= 0 or separator appears fewer than n times, get the last occurrence + if ( pos == nullptr) + pos = strrchr(str, separator); + return const_cast(pos); +} + + +// ---------------------------------------------------------------------- +// Misc. routines +// ---------------------------------------------------------------------- + +bool IsAscii(const char* str, int len) { + const char* end = str + len; + while (str < end) { + if (!ascii_isascii(*str++)) { + return false; + } + } + return true; +} + +// ---------------------------------------------------------------------- +// StringReplace() +// Give me a string and two patterns "old" and "new", and I replace +// the first instance of "old" in the string with "new", if it +// exists. If "replace_all" is true then call this repeatedly until it +// fails. RETURN a new string, regardless of whether the replacement +// happened or not. 
+// ---------------------------------------------------------------------- + +string StringReplace(const StringPiece& s, const StringPiece& oldsub, + const StringPiece& newsub, bool replace_all) { + string ret; + StringReplace(s, oldsub, newsub, replace_all, &ret); + return ret; +} + + +// ---------------------------------------------------------------------- +// StringReplace() +// Replace the "old" pattern with the "new" pattern in a string, +// and append the result to "res". If replace_all is false, +// it only replaces the first instance of "old." +// ---------------------------------------------------------------------- + +void StringReplace(const StringPiece& s, const StringPiece& oldsub, + const StringPiece& newsub, bool replace_all, + string* res) { + if (oldsub.empty()) { + res->append(s.data(), s.length()); // If empty, append the given string. + return; + } + + StringPiece::size_type start_pos = 0; + StringPiece::size_type pos; + do { + pos = s.find(oldsub, start_pos); + if (pos == StringPiece::npos) { + break; + } + res->append(s.data() + start_pos, pos - start_pos); + res->append(newsub.data(), newsub.length()); + // Start searching again after the "old". + start_pos = pos + oldsub.length(); + } while (replace_all); + res->append(s.data() + start_pos, s.length() - start_pos); +} + +// ---------------------------------------------------------------------- +// GlobalReplaceSubstring() +// Replaces all instances of a substring in a string. Does nothing +// if 'substring' is empty. Returns the number of replacements. +// +// NOTE: The string pieces must not overlap s. 
+// ---------------------------------------------------------------------- + +int GlobalReplaceSubstring(const StringPiece& substring, + const StringPiece& replacement, + string* s) { + CHECK(s != nullptr); + if (s->empty() || substring.empty()) + return 0; + string tmp; + int num_replacements = 0; + size_t pos = 0; + for (size_t match_pos = s->find(substring.data(), pos, substring.length()); + match_pos != string::npos; + pos = match_pos + substring.length(), + match_pos = s->find(substring.data(), pos, substring.length())) { + ++num_replacements; + // Append the original content before the match. + tmp.append(*s, pos, match_pos - pos); + // Append the replacement for the match. + tmp.append(replacement.begin(), replacement.end()); + } + // Append the content after the last match. If no replacements were made, the + // original string is left untouched. + if (num_replacements > 0) { + tmp.append(*s, pos, s->length() - pos); + s->swap(tmp); + } + return num_replacements; +} + +//--------------------------------------------------------------------------- +// RemoveStrings() +// Remove the strings from v given by the (sorted least -> greatest) +// numbers in indices. +// Order of v is *not* preserved. 
+//--------------------------------------------------------------------------- +void RemoveStrings(vector* v, const vector& indices) { + assert(v); + assert(indices.size() <= v->size()); + // go from largest index to smallest so that smaller indices aren't + // invalidated + for (int lcv = indices.size() - 1; lcv >= 0; --lcv) { +#ifndef NDEBUG + // verify that indices is sorted least->greatest + if (indices.size() >= 2 && lcv > 0) + // use LT and not LE because we should never see repeat indices + CHECK_LT(indices[lcv-1], indices[lcv]); +#endif + assert(indices[lcv] >= 0); + assert(indices[lcv] < v->size()); + swap((*v)[indices[lcv]], v->back()); + v->pop_back(); + } +} + +// ---------------------------------------------------------------------- +// gstrcasestr is a case-insensitive strstr. Eventually we should just +// use the GNU libc version of strcasestr, but it isn't compiled into +// RedHat Linux by default in version 6.1. +// +// This function uses ascii_tolower() instead of tolower(), for speed. +// ---------------------------------------------------------------------- + +char *gstrcasestr(const char* haystack, const char* needle) { + char c, sc; + size_t len; + + if ((c = *needle++) != 0) { + c = ascii_tolower(c); + len = strlen(needle); + do { + do { + if ((sc = *haystack++) == 0) + return nullptr; + } while (ascii_tolower(sc) != c); + } while (strncasecmp(haystack, needle, len) != 0); + haystack--; + } + // This is a const violation but strstr() also returns a char*. + return const_cast(haystack); +} + +// ---------------------------------------------------------------------- +// gstrncasestr is a case-insensitive strnstr. +// Finds the occurence of the (null-terminated) needle in the +// haystack, where no more than len bytes of haystack is searched. +// Characters that appear after a '\0' in the haystack are not searched. +// +// This function uses ascii_tolower() instead of tolower(), for speed. 
+// ---------------------------------------------------------------------- +const char *gstrncasestr(const char* haystack, const char* needle, size_t len) { + char c, sc; + + if ((c = *needle++) != 0) { + c = ascii_tolower(c); + size_t needle_len = strlen(needle); + do { + do { + if (len-- <= needle_len + || 0 == (sc = *haystack++)) + return nullptr; + } while (ascii_tolower(sc) != c); + } while (strncasecmp(haystack, needle, needle_len) != 0); + haystack--; + } + return haystack; +} + +// ---------------------------------------------------------------------- +// gstrncasestr is a case-insensitive strnstr. +// Finds the occurence of the (null-terminated) needle in the +// haystack, where no more than len bytes of haystack is searched. +// Characters that appear after a '\0' in the haystack are not searched. +// +// This function uses ascii_tolower() instead of tolower(), for speed. +// ---------------------------------------------------------------------- +char *gstrncasestr(char* haystack, const char* needle, size_t len) { + return const_cast(gstrncasestr(static_cast(haystack), + needle, len)); +} +// ---------------------------------------------------------------------- +// gstrncasestr_split performs a case insensitive search +// on (prefix, non_alpha, suffix). +// ---------------------------------------------------------------------- +char *gstrncasestr_split(const char* str, + const char* prefix, char non_alpha, + const char* suffix, + size_t n) { + int prelen = prefix == nullptr ? 0 : strlen(prefix); + int suflen = suffix == nullptr ? 0 : strlen(suffix); + + // adjust the string and its length to avoid unnessary searching. + // an added benefit is to avoid unnecessary range checks in the if + // statement in the inner loop. + if (suflen + prelen >= n) return nullptr; + str += prelen; + n -= prelen; + n -= suflen; + + const char* where = nullptr; + + // for every occurance of non_alpha in the string ... 
+ while ((where = static_cast( + memchr(str, non_alpha, n))) != nullptr) { + // ... test whether it is followed by suffix and preceded by prefix + if ((!suflen || strncasecmp(where + 1, suffix, suflen) == 0) && + (!prelen || strncasecmp(where - prelen, prefix, prelen) == 0)) { + return const_cast(where - prelen); + } + // if not, advance the pointer, and adjust the length according + n -= (where + 1) - str; + str = where + 1; + } + + return nullptr; +} + +// ---------------------------------------------------------------------- +// strcasestr_alnum is like a case-insensitive strstr, except that it +// ignores non-alphanumeric characters in both strings for the sake of +// comparison. +// +// This function uses ascii_isalnum() instead of isalnum() and +// ascii_tolower() instead of tolower(), for speed. +// +// E.g. strcasestr_alnum("i use google all the time", " !!Google!! ") +// returns pointer to "google all the time" +// ---------------------------------------------------------------------- +char *strcasestr_alnum(const char *haystack, const char *needle) { + const char *haystack_ptr; + const char *needle_ptr; + + // Skip non-alnums at beginning + while ( !ascii_isalnum(*needle) ) + if ( *needle++ == '\0' ) + return const_cast(haystack); + needle_ptr = needle; + + // Skip non-alnums at beginning + while ( !ascii_isalnum(*haystack) ) + if ( *haystack++ == '\0' ) + return nullptr; + haystack_ptr = haystack; + + while ( *needle_ptr != '\0' ) { + // Non-alnums - advance + while ( !ascii_isalnum(*needle_ptr) ) + if ( *needle_ptr++ == '\0' ) + return const_cast(haystack); + + while ( !ascii_isalnum(*haystack_ptr) ) + if ( *haystack_ptr++ == '\0' ) + return nullptr; + + if ( ascii_tolower(*needle_ptr) == ascii_tolower(*haystack_ptr) ) { + // Case-insensitive match - advance + needle_ptr++; + haystack_ptr++; + } else { + // No match - rollback to next start point in haystack + haystack++; + while ( !ascii_isalnum(*haystack) ) + if ( *haystack++ == '\0' ) + return 
nullptr; + haystack_ptr = haystack; + needle_ptr = needle; + } + } + return const_cast(haystack); +} + + +// ---------------------------------------------------------------------- +// CountSubstring() +// Return the number times a "substring" appears in the "text" +// NOTE: this function's complexity is O(|text| * |substring|) +// It is meant for short "text" (such as to ensure the +// printf format string has the right number of arguments). +// DO NOT pass in long "text". +// ---------------------------------------------------------------------- +int CountSubstring(StringPiece text, StringPiece substring) { + CHECK_GT(substring.length(), 0); + + int count = 0; + StringPiece::size_type curr = 0; + while (StringPiece::npos != (curr = text.find(substring, curr))) { + ++count; + ++curr; + } + return count; +} + +// ---------------------------------------------------------------------- +// strstr_delimited() +// Just like strstr(), except it ensures that the needle appears as +// a complete item (or consecutive series of items) in a delimited +// list. +// +// Like strstr(), returns haystack if needle is empty, or NULL if +// either needle/haystack is NULL. +// ---------------------------------------------------------------------- +const char* strstr_delimited(const char* haystack, + const char* needle, + char delim) { + if (!needle || !haystack) return nullptr; + if (*needle == '\0') return haystack; + + int needle_len = strlen(needle); + + while (true) { + // Skip any leading delimiters. + while (*haystack == delim) ++haystack; + + // Walk down the haystack, matching every character in the needle. + const char* this_match = haystack; + int i = 0; + for (; i < needle_len; i++) { + if (*haystack != needle[i]) { + // We ran out of haystack or found a non-matching character. + break; + } + ++haystack; + } + + // If we matched the whole needle, ensure that it's properly delimited. 
+ if (i == needle_len && (*haystack == '\0' || *haystack == delim)) { + return this_match; + } + + // No match. Consume non-delimiter characters until we run out of them. + while (*haystack != delim) { + if (*haystack == '\0') return nullptr; + ++haystack; + } + } + LOG(FATAL) << "Unreachable statement"; + return nullptr; +} + + +// ---------------------------------------------------------------------- +// Older versions of libc have a buggy strsep. +// ---------------------------------------------------------------------- + +char* gstrsep(char** stringp, const char* delim) { + char *s; + const char *spanp; + int c, sc; + char *tok; + + if ((s = *stringp) == nullptr) + return nullptr; + + tok = s; + while (true) { + c = *s++; + spanp = delim; + do { + if ((sc = *spanp++) == c) { + if (c == 0) + s = nullptr; + else + s[-1] = 0; + *stringp = s; + return tok; + } + } while (sc != 0); + } + + return nullptr; /* should not happen */ +} + +void FastStringAppend(string* s, const char* data, int len) { + STLAppendToString(s, data, len); +} + + +// TODO(user): add a microbenchmark and revisit +// the optimizations done here. +// +// Several converters use this table to reduce +// division and modulo operations. 
// Lookup table mapping i in [0, 100) to its two ASCII digits, so callers can
// emit two digits with two array reads instead of a divide and a modulo.
extern const char two_ASCII_digits[100][2];

const char two_ASCII_digits[100][2] = {
  {'0', '0'}, {'0', '1'}, {'0', '2'}, {'0', '3'}, {'0', '4'},
  {'0', '5'}, {'0', '6'}, {'0', '7'}, {'0', '8'}, {'0', '9'},
  {'1', '0'}, {'1', '1'}, {'1', '2'}, {'1', '3'}, {'1', '4'},
  {'1', '5'}, {'1', '6'}, {'1', '7'}, {'1', '8'}, {'1', '9'},
  {'2', '0'}, {'2', '1'}, {'2', '2'}, {'2', '3'}, {'2', '4'},
  {'2', '5'}, {'2', '6'}, {'2', '7'}, {'2', '8'}, {'2', '9'},
  {'3', '0'}, {'3', '1'}, {'3', '2'}, {'3', '3'}, {'3', '4'},
  {'3', '5'}, {'3', '6'}, {'3', '7'}, {'3', '8'}, {'3', '9'},
  {'4', '0'}, {'4', '1'}, {'4', '2'}, {'4', '3'}, {'4', '4'},
  {'4', '5'}, {'4', '6'}, {'4', '7'}, {'4', '8'}, {'4', '9'},
  {'5', '0'}, {'5', '1'}, {'5', '2'}, {'5', '3'}, {'5', '4'},
  {'5', '5'}, {'5', '6'}, {'5', '7'}, {'5', '8'}, {'5', '9'},
  {'6', '0'}, {'6', '1'}, {'6', '2'}, {'6', '3'}, {'6', '4'},
  {'6', '5'}, {'6', '6'}, {'6', '7'}, {'6', '8'}, {'6', '9'},
  {'7', '0'}, {'7', '1'}, {'7', '2'}, {'7', '3'}, {'7', '4'},
  {'7', '5'}, {'7', '6'}, {'7', '7'}, {'7', '8'}, {'7', '9'},
  {'8', '0'}, {'8', '1'}, {'8', '2'}, {'8', '3'}, {'8', '4'},
  {'8', '5'}, {'8', '6'}, {'8', '7'}, {'8', '8'}, {'8', '9'},
  {'9', '0'}, {'9', '1'}, {'9', '2'}, {'9', '3'}, {'9', '4'},
  {'9', '5'}, {'9', '6'}, {'9', '7'}, {'9', '8'}, {'9', '9'}
};

// Writes the two ASCII digits of i (which must be in [0, 100)) to p[0]
// and p[1]. Does not NUL-terminate.
static inline void PutTwoDigits(int i, char* p) {
  DCHECK_GE(i, 0);
  DCHECK_LT(i, 100);
  p[0] = two_ASCII_digits[i][0];
  p[1] = two_ASCII_digits[i][1];
}

// Formats 's' (or the current time, if s == 0) into 'buffer' in the fixed
// layout "Www, dd Mmm yyyy hh:mm:ss GMT" — exactly 29 characters plus the
// terminating NUL, so 'buffer' must hold at least 30 bytes. Returns 'buffer'.
// On gmtime failure, writes "Invalid:<s>" instead.
char* FastTimeToBuffer(time_t s, char* buffer) {
  if (s == 0) {
    time(&s);  // s == 0 means "now"
  }

  struct tm tm;
  if (PortableSafeGmtime(&s, &tm) == nullptr) {
    // Error message must fit in 30-char buffer.
    memcpy(buffer, "Invalid:", sizeof("Invalid:"));
    FastInt64ToBufferLeft(s, buffer+strlen(buffer));
    return buffer;
  }

  // strftime format: "%a, %d %b %Y %H:%M:%S GMT",
  // but strftime does locale stuff which we do not want
  // plus strftime takes > 10x the time of hard code

  const char* weekday_name = "Xxx";
  switch (tm.tm_wday) {
    default: { DLOG(FATAL) << "tm.tm_wday: " << tm.tm_wday; } break;
    case 0: weekday_name = "Sun"; break;
    case 1: weekday_name = "Mon"; break;
    case 2: weekday_name = "Tue"; break;
    case 3: weekday_name = "Wed"; break;
    case 4: weekday_name = "Thu"; break;
    case 5: weekday_name = "Fri"; break;
    case 6: weekday_name = "Sat"; break;
  }

  const char* month_name = "Xxx";
  switch (tm.tm_mon) {
    default: { DLOG(FATAL) << "tm.tm_mon: " << tm.tm_mon; } break;
    case 0: month_name = "Jan"; break;
    case 1: month_name = "Feb"; break;
    case 2: month_name = "Mar"; break;
    case 3: month_name = "Apr"; break;
    case 4: month_name = "May"; break;
    case 5: month_name = "Jun"; break;
    case 6: month_name = "Jul"; break;
    case 7: month_name = "Aug"; break;
    case 8: month_name = "Sep"; break;
    case 9: month_name = "Oct"; break;
    case 10: month_name = "Nov"; break;
    case 11: month_name = "Dec"; break;
  }

  // Write out the buffer.

  memcpy(buffer+0, weekday_name, 3);
  buffer[3] = ',';
  buffer[4] = ' ';

  PutTwoDigits(tm.tm_mday, buffer+5);
  buffer[7] = ' ';

  memcpy(buffer+8, month_name, 3);
  buffer[11] = ' ';

  // NOTE(review): a 4-digit year is assumed; years >= 10000 or < 0 would
  // produce wrong output here (PutTwoDigits requires [0, 100) per digit pair).
  int32 year = tm.tm_year + 1900;
  PutTwoDigits(year/100, buffer+12);
  PutTwoDigits(year%100, buffer+14);
  buffer[16] = ' ';

  PutTwoDigits(tm.tm_hour, buffer+17);
  buffer[19] = ':';

  PutTwoDigits(tm.tm_min, buffer+20);
  buffer[22] = ':';

  PutTwoDigits(tm.tm_sec, buffer+23);

  // includes ending NUL
  memcpy(buffer+25, " GMT", 5);

  return buffer;
}

// ----------------------------------------------------------------------
// strdup_with_new()
// strndup_with_new()
//
// strdup_with_new() is the same as strdup() except that the memory
// is allocated by new[] and hence an exception will be generated
// if out of memory.
//
// strndup_with_new() is the same as strdup_with_new() except that it will
// copy up to the specified number of characters.  This function
// is useful when we want to copy a substring out of a string
// and didn't want to (or cannot) modify the string
// ----------------------------------------------------------------------
char* strdup_with_new(const char* the_string) {
  if (the_string == nullptr)
    return nullptr;
  else
    return strndup_with_new(the_string, strlen(the_string));
}

char* strndup_with_new(const char* the_string, int max_length) {
  if (the_string == nullptr)
    return nullptr;

  // Allocate max_length + 1 so the result is always NUL-terminated even
  // when the_string is longer than max_length.
  auto result = new char[max_length + 1];
  result[max_length] = '\0';  // terminate the string because strncpy might not
  return strncpy(result, the_string, max_length);
}




// ----------------------------------------------------------------------
// ScanForFirstWord()
//   This function finds the first word in the string "the_string" given.
//   A word is defined by consecutive !ascii_isspace() characters.
//   If no valid words are found,
//     return NULL and *end_ptr will contain junk
//   else
//     return the beginning of the first word and
//     *end_ptr will store the address of the first invalid character
//     (ascii_isspace() or '\0').
//
//   Precondition: (end_ptr != NULL)
// ----------------------------------------------------------------------
const char* ScanForFirstWord(const char* the_string, const char** end_ptr) {
  CHECK(end_ptr != nullptr) << ": precondition violated";

  if (the_string == nullptr)  // empty string
    return nullptr;

  const char* curr = the_string;
  while ((*curr != '\0') && ascii_isspace(*curr))  // skip initial spaces
    ++curr;

  if (*curr == '\0')  // no valid word found
    return nullptr;

  // else has a valid word
  const char* first_word = curr;

  // now locate the end of the word
  while ((*curr != '\0') && !ascii_isspace(*curr))
    ++curr;

  *end_ptr = curr;
  return first_word;
}

// ----------------------------------------------------------------------
// AdvanceIdentifier()
//   This function returns a pointer past the end of the longest C-style
//   identifier that is a prefix of str or NULL if str does not start with
//   one.  A C-style identifier begins with an ASCII letter or underscore
//   and continues with ASCII letters, digits, or underscores.
// ----------------------------------------------------------------------
const char *AdvanceIdentifier(const char *str) {
  // Not using isalpha and isalnum so as not to rely on the locale.
  // We could have used ascii_isalpha and ascii_isalnum.
+ char ch = *str++; + if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_')) + return nullptr; + while (true) { + ch = *str; + if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || + (ch >= '0' && ch <= '9') || ch == '_')) + return str; + str++; + } +} + + +// ---------------------------------------------------------------------- +// IsIdentifier() +// This function returns true if str is a C-style identifier. +// A C-style identifier begins with an ASCII letter or underscore +// and continues with ASCII letters, digits, or underscores. +// ---------------------------------------------------------------------- +bool IsIdentifier(const char *str) { + const char *end = AdvanceIdentifier(str); + return end && *end == '\0'; +} + +static bool IsWildcard(Rune character) { + return character == '*' || character == '?'; +} + +// Move the strings pointers to the point where they start to differ. +template +static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, + const CHAR** string, const CHAR* string_end, + NEXT next) { + const CHAR* escape = nullptr; + while (*pattern != pattern_end && *string != string_end) { + if (!escape && IsWildcard(**pattern)) { + // We don't want to match wildcard here, except if it's escaped. + return; + } + + // Check if the escapement char is found. If so, skip it and move to the + // next character. + if (!escape && **pattern == '\\') { + escape = *pattern; + next(pattern, pattern_end); + continue; + } + + // Check if the chars match, if so, increment the ptrs. + const CHAR* pattern_next = *pattern; + const CHAR* string_next = *string; + Rune pattern_char = next(&pattern_next, pattern_end); + if (pattern_char == next(&string_next, string_end) && + pattern_char != Runeerror && + pattern_char <= Runemax) { + *pattern = pattern_next; + *string = string_next; + } else { + // Uh ho, it did not match, we are done. 
If the last char was an + // escapement, that means that it was an error to advance the ptr here, + // let's put it back where it was. This also mean that the MatchPattern + // function will return false because if we can't match an escape char + // here, then no one will. + if (escape) { + *pattern = escape; + } + return; + } + + escape = nullptr; + } +} + +template +static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { + while (*pattern != end) { + if (!IsWildcard(**pattern)) + return; + next(pattern, end); + } +} + +template +static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, + const CHAR* pattern, const CHAR* pattern_end, + int depth, + NEXT next) { + const int kMaxDepth = 16; + if (depth > kMaxDepth) + return false; + + // Eat all the matching chars. + EatSameChars(&pattern, pattern_end, &eval, eval_end, next); + + // If the string is empty, then the pattern must be empty too, or contains + // only wildcards. + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + return pattern == pattern_end; + } + + // Pattern is empty but not string, this is not a match. + if (pattern == pattern_end) + return false; + + // If this is a question mark, then we need to compare the rest with + // the current string or the string with one character eaten. + const CHAR* next_pattern = pattern; + next(&next_pattern, pattern_end); + if (pattern[0] == '?') { + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + const CHAR* next_eval = eval; + next(&next_eval, eval_end); + if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + } + + // This is a *, try to match all the possible substrings with the remainder + // of the pattern. + if (pattern[0] == '*') { + // Collapse duplicate wild cards (********** into *) so that the + // method does not recurse unnecessarily. 
http://crbug.com/52839 + EatWildcard(&next_pattern, pattern_end, next); + + while (eval != eval_end) { + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + eval++; + } + + // We reached the end of the string, let see if the pattern contains only + // wildcards. + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + if (pattern != pattern_end) + return false; + return true; + } + } + + return false; +} + +struct NextCharUTF8 { + Rune operator()(const char** p, const char* end) { + Rune c; + int offset = charntorune(&c, *p, static_cast(end - *p)); + *p += offset; + return c; + } +}; + +bool MatchPattern(const StringPiece& eval, + const StringPiece& pattern) { + return MatchPatternT(eval.data(), eval.data() + eval.size(), + pattern.data(), pattern.data() + pattern.size(), + 0, NextCharUTF8()); +} + + + +// ---------------------------------------------------------------------- +// FindTagValuePair +// Given a string of the form +// ... +// where the part before the first attr_sep is optional, +// this function extracts the first tag and value, if any. +// The function returns true if successful, in which case "tag" and "value" +// are set to point to the beginning of the tag and the value, respectively, +// and "tag_len" and "value_len" are set to the respective lengths. +// ---------------------------------------------------------------------- + +bool FindTagValuePair(const char* arg_str, char tag_value_separator, + char attribute_separator, char string_terminal, + char **tag, int *tag_len, + char **value, int *value_len) { + char* in_str = const_cast(arg_str); // For msvc8. 
+ if (in_str == nullptr) + return false; + char tv_sep_or_term[3] = {tag_value_separator, string_terminal, '\0'}; + char attr_sep_or_term[3] = {attribute_separator, string_terminal, '\0'}; + + // Look for beginning of tag + *tag = strpbrk(in_str, attr_sep_or_term); + // If string_terminal is '\0', strpbrk won't find it but return null. + if (*tag == nullptr || **tag == string_terminal) + *tag = in_str; + else + (*tag)++; // Move past separator + // Now look for value... + char *tv_sep_pos = strpbrk(*tag, tv_sep_or_term); + if (tv_sep_pos == nullptr || *tv_sep_pos == string_terminal) + return false; + // ...and end of value + char *attr_sep_pos = strpbrk(tv_sep_pos, attr_sep_or_term); + + *tag_len = tv_sep_pos - *tag; + *value = tv_sep_pos + 1; + if (attr_sep_pos != nullptr) + *value_len = attr_sep_pos - *value; + else + *value_len = strlen(*value); + return true; +} + +void UniformInsertString(string* s, int interval, const char* separator) { + const size_t separator_len = strlen(separator); + + if (interval < 1 || // invalid interval + s->empty() || // nothing to do + separator_len == 0) // invalid separator + return; + + int num_inserts = (s->size() - 1) / interval; // -1 to avoid appending at end + if (num_inserts == 0) // nothing to do + return; + + string tmp; + tmp.reserve(s->size() + num_inserts * separator_len + 1); + + for (int i = 0; i < num_inserts ; ++i) { + // append this interval + tmp.append(*s, i * interval, interval); + // append a separator + tmp.append(separator, separator_len); + } + + // append the tail + const size_t tail_pos = num_inserts * interval; + tmp.append(*s, tail_pos, s->size() - tail_pos); + + s->swap(tmp); +} + +void InsertString(string *const s, + const vector &indices, + char const *const separator) { + const unsigned num_indices(indices.size()); + if (num_indices == 0) { + return; // nothing to do... + } + + const unsigned separator_len(strlen(separator)); + if (separator_len == 0) { + return; // still nothing to do... 
+ } + + string tmp; + const unsigned s_len(s->size()); + tmp.reserve(s_len + separator_len * num_indices); + + vector::const_iterator const ind_end(indices.end()); + auto ind_pos(indices.begin()); + + uint32 last_pos(0); + while (ind_pos != ind_end) { + const uint32 pos(*ind_pos); + DCHECK_GE(pos, last_pos); + DCHECK_LE(pos, s_len); + + tmp.append(s->substr(last_pos, pos - last_pos)); + tmp.append(separator); + + last_pos = pos; + ++ind_pos; + } + tmp.append(s->substr(last_pos)); + + s->swap(tmp); +} + +//------------------------------------------------------------------------ +// FindNth() +// return index of nth occurrence of c in the string, +// or string::npos if n > number of occurrences of c. +// (returns string::npos = -1 if n <= 0) +//------------------------------------------------------------------------ +int FindNth(StringPiece s, char c, int n) { + size_t pos = string::npos; + + for ( int i = 0; i < n; ++i ) { + pos = s.find_first_of(c, pos + 1); + if ( pos == StringPiece::npos ) { + break; + } + } + return pos; +} + +//------------------------------------------------------------------------ +// ReverseFindNth() +// return index of nth-to-last occurrence of c in the string, +// or string::npos if n > number of occurrences of c. +// (returns string::npos if n <= 0) +//------------------------------------------------------------------------ +int ReverseFindNth(StringPiece s, char c, int n) { + if ( n <= 0 ) { + return static_cast(StringPiece::npos); + } + + size_t pos = s.size(); + + for ( int i = 0; i < n; ++i ) { + // If pos == 0, we return StringPiece::npos right away. Otherwise, + // the following find_last_of call would take (pos - 1) as string::npos, + // which means it would again search the entire input string. 
+ if (pos == 0) { + return static_cast(StringPiece::npos); + } + pos = s.find_last_of(c, pos - 1); + if ( pos == string::npos ) { + break; + } + } + return pos; +} + +namespace strings { + +// FindEol() +// Returns the location of the next end-of-line sequence. + +StringPiece FindEol(StringPiece s) { + for (size_t i = 0; i < s.length(); ++i) { + if (s[i] == '\n') { + return StringPiece(s.data() + i, 1); + } + if (s[i] == '\r') { + if (i+1 < s.length() && s[i+1] == '\n') { + return StringPiece(s.data() + i, 2); + } else { + return StringPiece(s.data() + i, 1); + } + } + } + return StringPiece(s.data() + s.length(), 0); +} + +} // namespace strings + +//------------------------------------------------------------------------ +// OnlyWhitespace() +// return true if string s contains only whitespace characters +//------------------------------------------------------------------------ +bool OnlyWhitespace(const StringPiece& s) { + for (const auto& c : s) { + if ( !ascii_isspace(c) ) return false; + } + return true; +} + +string PrefixSuccessor(const StringPiece& prefix) { + // We can increment the last character in the string and be done + // unless that character is 255, in which case we have to erase the + // last character and increment the previous character, unless that + // is 255, etc. If the string is empty or consists entirely of + // 255's, we just return the empty string. + bool done = false; + string limit(prefix.data(), prefix.size()); + int index = limit.length() - 1; + while (!done && index >= 0) { + if (limit[index] == 255) { + limit.erase(index); + index--; + } else { + limit[index]++; + done = true; + } + } + if (!done) { + return ""; + } else { + return limit; + } +} + +string ImmediateSuccessor(const StringPiece& s) { + // Return the input string, with an additional NUL byte appended. 
+ string out; + out.reserve(s.size() + 1); + out.append(s.data(), s.size()); + out.push_back('\0'); + return out; +} + +void FindShortestSeparator(const StringPiece& start, + const StringPiece& limit, + string* separator) { + // Find length of common prefix + size_t min_length = min(start.size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + (start[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Handle the case where either string is a prefix of the other + // string, or both strings are identical. + start.CopyToString(separator); + return; + } + + if (diff_index+1 == start.size()) { + // If the first difference is in the last character, do not bother + // incrementing that character since the separator will be no + // shorter than "start". + start.CopyToString(separator); + return; + } + + if (start[diff_index] == 0xff) { + // Avoid overflow when incrementing start[diff_index] + start.CopyToString(separator); + return; + } + + separator->assign(start.data(), diff_index); + separator->push_back(start[diff_index] + 1); + if (*separator >= limit) { + // Never pick a separator that causes confusion with "limit" + start.CopyToString(separator); + } +} + +int SafeSnprintf(char *str, size_t size, const char *format, ...) { + va_list printargs; + va_start(printargs, format); + int ncw = vsnprintf(str, size, format, printargs); + va_end(printargs); + return (ncw < size && ncw >= 0) ? ncw : 0; +} + +bool GetlineFromStdioFile(FILE* file, string* str, char delim) { + str->erase(); + while (true) { + if (feof(file) || ferror(file)) { + return false; + } + int c = getc(file); + if (c == EOF) return false; + if (c == delim) return true; + str->push_back(c); + } +} + +namespace { + +template +size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { + for (size_t i = 0; i < dst_size; ++i) { + if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 
      return i;
  }

  // We were left off at dst_size. We over copied 1 byte. Null terminate.
  if (dst_size != 0)
    dst[dst_size - 1] = 0;

  // Count the rest of the |src|, and return its length in characters.
  while (src[dst_size]) ++dst_size;
  return dst_size;
}

}  // namespace

// BSD-style strlcpy: copies up to dst_size-1 chars, always NUL-terminates
// (when dst_size > 0), and returns strlen(src).
size_t strings::strlcpy(char* dst, const char* src, size_t dst_size) {
  return lcpyT(dst, src, dst_size);
}
diff --git a/src/kudu/gutil/strings/util.h b/src/kudu/gutil/strings/util.h
new file mode 100644
index 000000000000..59db97d4ad42
--- /dev/null
+++ b/src/kudu/gutil/strings/util.h
@@ -0,0 +1,514 @@
//
// Copyright 1999-2006 and onwards Google, Inc.
//
// Useful string functions and so forth.  This is a grab-bag file.
//
// You might also want to look at memutil.h, which holds mem*()
// equivalents of a lot of the str*() functions in string.h,
// eg memstr, mempbrk, etc.
//
// These functions work fine for UTF-8 strings as long as you can
// consider them to be just byte strings.  For example, due to the
// design of UTF-8 you do not need to worry about accidental matches,
// as long as all your inputs are valid UTF-8 (use \uHHHH, not \xHH or \oOOO).
//
// Caveats:
// * all the lengths in these routines refer to byte counts,
//   not character counts.
// * case-insensitivity in these routines assumes that all the letters
//   in question are in the range A-Z or a-z.
//
// If you need Unicode specific processing (for example being aware of
// Unicode character boundaries, or knowledge of Unicode casing rules,
// or various forms of equivalence and normalization), take a look at
// files in i18n/utf8.
+ +#ifndef STRINGS_UTIL_H_ +#define STRINGS_UTIL_H_ + +#include +#include +#include +#include +#ifndef _MSC_VER +#include // for strcasecmp, but msvc does not have this header +#endif + +#include +using std::binary_function; +using std::less; +#include +using std::string; +#include +using std::vector; + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/strings/stringpiece.h" + +// Newer functions. + +namespace strings { + +// Finds the next end-of-line sequence. +// An end-of-line sequence is one of: +// \n common on unix, including mac os x +// \r common on macos 9 and before +// \r\n common on windows +// +// Returns a StringPiece that contains the end-of-line sequence (a pointer into +// the input, 1 or 2 characters long). +// +// If the input does not contain an end-of-line sequence, returns an empty +// StringPiece located at the end of the input: +// StringPiece(sp.data() + sp.length(), 0). + +StringPiece FindEol(StringPiece sp); + +} // namespace strings + +// Older functions. + +// Duplicates a non-null, non-empty char* string. Returns a pointer to the new +// string, or NULL if the input is null or empty. +inline char* strdup_nonempty(const char* src) { + if (src && src[0]) return strdup(src); + return NULL; +} + +// Finds the first occurrence of a character in at most a given number of bytes +// of a char* string. Returns a pointer to the first occurrence, or NULL if no +// occurrence found in the first sz bytes. +// Never searches past the first null character in the string; therefore, only +// suitable for null-terminated strings. +// WARNING: Removes const-ness of string argument! +inline char* strnchr(const char* buf, char c, int sz) { + const char* end = buf + sz; + while (buf != end && *buf) { + if (*buf == c) + return const_cast(buf); + ++buf; + } + return NULL; +} + +// Finds the first occurrence of the null-terminated needle in at most the first +// haystack_len bytes of haystack. 
Returns NULL if needle is not found. Returns +// haystack if needle is empty. +// WARNING: Removes const-ness of string argument! +char* strnstr(const char* haystack, const char* needle, size_t haystack_len); + +// Matches a prefix (which must be a char* literal!) against the beginning of +// str. Returns a pointer past the prefix, or NULL if the prefix wasn't matched. +// (Like the standard strcasecmp(), but for efficiency doesn't call strlen() on +// prefix, and returns a pointer rather than an int.) +// +// The ""'s catch people who don't pass in a literal for "prefix" +#ifndef strprefix +#define strprefix(str, prefix) \ + (strncmp(str, prefix, sizeof("" prefix "")-1) == 0 ? \ + str + sizeof(prefix)-1 : \ + NULL) +#endif + +// Same as strprefix() (immediately above), but matches a case-insensitive +// prefix. +#ifndef strcaseprefix +#define strcaseprefix(str, prefix) \ + (strncasecmp(str, prefix, sizeof("" prefix "")-1) == 0 ? \ + str + sizeof(prefix)-1 : \ + NULL) +#endif + +// Matches a prefix (up to the first needle_size bytes of needle) in the first +// haystack_size byte of haystack. Returns a pointer past the prefix, or NULL if +// the prefix wasn't matched. (Unlike strprefix(), prefix doesn't need to be a +// char* literal. Like the standard strncmp(), but also takes a haystack_size, +// and returns a pointer rather than an int.) +// +// Always returns either NULL or haystack + needle_size. +// +// Some windows header sometimes #defines strnprefix to something we +// don't want. +#ifdef strnprefix +#undef strnprefix +#endif +const char* strnprefix(const char* haystack, int haystack_size, + const char* needle, int needle_size); + +// Matches a case-insensitive prefix (up to the first needle_size bytes of +// needle) in the first haystack_size byte of haystack. Returns a pointer past +// the prefix, or NULL if the prefix wasn't matched. +// +// Always returns either NULL or haystack + needle_size. 
+const char* strncaseprefix(const char* haystack, int haystack_size, + const char* needle, int needle_size); + +// Matches a prefix; returns a pointer past the prefix, or NULL if not found. +// (Like strprefix() and strcaseprefix() but not restricted to searching for +// char* literals). Templated so searching a const char* returns a const char*, +// and searching a non-const char* returns a non-const char*. +template +inline CharStar var_strprefix(CharStar str, const char* prefix) { + const int len = strlen(prefix); + return strncmp(str, prefix, len) == 0 ? str + len : NULL; +} + +// Same as var_strprefix() (immediately above), but matches a case-insensitive +// prefix. +template +inline CharStar var_strcaseprefix(CharStar str, const char* prefix) { + const int len = strlen(prefix); + return strncasecmp(str, prefix, len) == 0 ? str + len : NULL; +} + +// Returns input, or "(null)" if NULL. (Useful for logging.) +inline const char* GetPrintableString(const char* const in) { + return NULL == in ? "(null)" : in; +} + +// Returns whether str begins with prefix. +inline bool HasPrefixString(const StringPiece& str, + const StringPiece& prefix) { + return str.starts_with(prefix); +} + +// Returns whether str ends with suffix. +inline bool HasSuffixString(const StringPiece& str, + const StringPiece& suffix) { + return str.ends_with(suffix); +} + +// Returns true if the string passed in matches the pattern. The pattern +// string can contain wildcards like * and ? +// The backslash character (\) is an escape character for * and ? +// We limit the patterns to having a max of 16 * or ? characters. +// ? matches 0 or 1 character, while * matches 0 or more characters. +bool MatchPattern(const StringPiece& string, + const StringPiece& pattern); + +// Returns where suffix begins in str, or NULL if str doesn't end with suffix. 
+inline char* strsuffix(char* str, const char* suffix) { + const int lenstr = strlen(str); + const int lensuffix = strlen(suffix); + char* strbeginningoftheend = str + lenstr - lensuffix; + + if (lenstr >= lensuffix && 0 == strcmp(strbeginningoftheend, suffix)) { + return (strbeginningoftheend); + } else { + return (NULL); + } +} +inline const char* strsuffix(const char* str, const char* suffix) { + return const_cast(strsuffix(const_cast(str), suffix)); +} + +// Same as strsuffix() (immediately above), but matches a case-insensitive +// suffix. +char* strcasesuffix(char* str, const char* suffix); +inline const char* strcasesuffix(const char* str, const char* suffix) { + return const_cast(strcasesuffix(const_cast(str), suffix)); +} + +const char* strnsuffix(const char* haystack, int haystack_size, + const char* needle, int needle_size); +const char* strncasesuffix(const char* haystack, int haystack_size, + const char* needle, int needle_size); + +// Returns the number of times a character occurs in a string for a null +// terminated string. +inline ptrdiff_t strcount(const char* buf, char c) { + if (buf == NULL) + return 0; + ptrdiff_t num = 0; + for (const char* bp = buf; *bp != '\0'; bp++) { + if (*bp == c) + num++; + } + return num; +} +// Returns the number of times a character occurs in a string for a string +// defined by a pointer to the first character and a pointer just past the last +// character. 
+inline ptrdiff_t strcount(const char* buf_begin, const char* buf_end, char c) { + if (buf_begin == NULL) + return 0; + if (buf_end <= buf_begin) + return 0; + ptrdiff_t num = 0; + for (const char* bp = buf_begin; bp != buf_end; bp++) { + if (*bp == c) + num++; + } + return num; +} +// Returns the number of times a character occurs in a string for a string +// defined by a pointer to the first char and a length: +inline ptrdiff_t strcount(const char* buf, size_t len, char c) { + return strcount(buf, buf + len, c); +} +// Returns the number of times a character occurs in a string for a C++ string: +inline ptrdiff_t strcount(const string& buf, char c) { + return strcount(buf.c_str(), buf.size(), c); +} + +// Returns a pointer to the nth occurrence of a character in a null-terminated +// string. +// WARNING: Removes const-ness of string argument! +char* strchrnth(const char* str, const char& c, int n); + +// Returns a pointer to the nth occurrence of a character in a null-terminated +// string, or the last occurrence if occurs fewer than n times. +// WARNING: Removes const-ness of string argument! +char* AdjustedLastPos(const char* str, char separator, int n); + +// STL-compatible function objects for char* string keys: + +// Compares two char* strings for equality. (Works with NULL, which compares +// equal only to another NULL). Useful in hash tables: +// hash_map, streq> ht; +struct streq : public binary_function { + bool operator()(const char* s1, const char* s2) const { + return ((s1 == 0 && s2 == 0) || + (s1 && s2 && *s1 == *s2 && strcmp(s1, s2) == 0)); + } +}; + +// Compares two char* strings. (Works with NULL, which compares greater than any +// non-NULL). 
Useful in maps: +// map m; +struct strlt : public binary_function { + bool operator()(const char* s1, const char* s2) const { + return (s1 != s2) && (s2 == 0 || (s1 != 0 && strcmp(s1, s2) < 0)); + } +}; + +// Returns whether str has only Ascii characters (as defined by ascii_isascii() +// in strings/ascii_ctype.h). +bool IsAscii(const char* str, int len); +inline bool IsAscii(const StringPiece& str) { + return IsAscii(str.data(), str.size()); +} + +// Returns the smallest lexicographically larger string of equal or smaller +// length. Returns an empty string if there is no such successor (if the input +// is empty or consists entirely of 0xff bytes). +// Useful for calculating the smallest lexicographically larger string +// that will not be prefixed by the input string. +// +// Examples: +// "a" -> "b", "aaa" -> "aab", "aa\xff" -> "ab", "\xff" -> "", "" -> "" +string PrefixSuccessor(const StringPiece& prefix); + +// Returns the immediate lexicographically-following string. This is useful to +// turn an inclusive range into something that can be used with Bigtable's +// SetLimitRow(): +// +// // Inclusive range [min_element, max_element]. +// string min_element = ...; +// string max_element = ...; +// +// // Equivalent range [range_start, range_end). +// string range_start = min_element; +// string range_end = ImmediateSuccessor(max_element); +// +// WARNING: Returns the input string with a '\0' appended; if you call c_str() +// on the result, it will compare equal to s. +// +// WARNING: Transforms "" -> "\0"; this doesn't account for Bigtable's special +// treatment of "" as infinity. +string ImmediateSuccessor(const StringPiece& s); + +// Fills in *separator with a short string less than limit but greater than or +// equal to start. If limit is greater than start, *separator is the common +// prefix of start and limit, followed by the successor to the next character in +// start. 
Examples: +// FindShortestSeparator("foobar", "foxhunt", &sep) => sep == "fop" +// FindShortestSeparator("abracadabra", "bacradabra", &sep) => sep == "b" +// If limit is less than or equal to start, fills in *separator with start. +void FindShortestSeparator(const StringPiece& start, const StringPiece& limit, + string* separator); + +// Copies at most n-1 bytes from src to dest, and returns dest. If n >=1, null +// terminates dest; otherwise, returns dest unchanged. Unlike strncpy(), only +// puts one null character at the end of dest. +inline char* safestrncpy(char* dest, const char* src, size_t n) { + if (n < 1) return dest; + + // Avoid using non-ANSI memccpy(), which is also deprecated in MSVC + for (size_t i = 0; i < n; ++i) { + if ((dest[i] = src[i]) == '\0') + return dest; + } + + dest[n-1] = '\0'; + return dest; +} + +namespace strings { + +// BSD-style safe and consistent string copy functions. +// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. +// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as +// long as |dst_size| is not 0. Returns the length of |src| in characters. +// If the return value is >= dst_size, then the output was truncated. +// NOTE: All sizes are in number of characters, NOT in bytes. +size_t strlcpy(char* dst, const char* src, size_t dst_size); + +} // namespace strings + +// Replaces the first occurrence (if replace_all is false) or all occurrences +// (if replace_all is true) of oldsub in s with newsub. In the second version, +// *res must be distinct from all the other arguments. +string StringReplace(const StringPiece& s, const StringPiece& oldsub, + const StringPiece& newsub, bool replace_all); +void StringReplace(const StringPiece& s, const StringPiece& oldsub, + const StringPiece& newsub, bool replace_all, + string* res); + +// Replaces all occurrences of substring in s with replacement. Returns the +// number of instances replaced. 
s must be distinct from the other arguments. +// +// Less flexible, but faster, than RE::GlobalReplace(). +int GlobalReplaceSubstring(const StringPiece& substring, + const StringPiece& replacement, + string* s); + +// Removes v[i] for every element i in indices. Does *not* preserve the order of +// v. indices must be sorted in strict increasing order (no duplicates). Runs in +// O(indices.size()). +void RemoveStrings(vector* v, const vector& indices); + +// Case-insensitive strstr(); use system strcasestr() instead. +// WARNING: Removes const-ness of string argument! +char* gstrcasestr(const char* haystack, const char* needle); + +// Finds (case insensitively) the first occurrence of (null terminated) needle +// in at most the first len bytes of haystack. Returns a pointer into haystack, +// or NULL if needle wasn't found. +// WARNING: Removes const-ness of haystack! +const char* gstrncasestr(const char* haystack, const char* needle, size_t len); +char* gstrncasestr(char* haystack, const char* needle, size_t len); + +// Finds (case insensitively), in str (which is a list of tokens separated by +// non_alpha), a token prefix and a token suffix. Returns a pointer into str of +// the position of prefix, or NULL if not found. +// WARNING: Removes const-ness of string argument! +char* gstrncasestr_split(const char* str, + const char* prefix, char non_alpha, + const char* suffix, + size_t n); + +// Finds (case insensitively) needle in haystack, paying attention only to +// alphanumerics in either string. Returns a pointer into haystack, or NULL if +// not found. +// Example: strcasestr_alnum("This is a longer test string", "IS-A-LONGER") +// returns a pointer to "is a longer". +// WARNING: Removes const-ness of string argument! +char* strcasestr_alnum(const char* haystack, const char* needle); + +// Returns the number times substring appears in text. +// Note: Runs in O(text.length() * substring.length()). Do *not* use on long +// strings. 
+int CountSubstring(StringPiece text, StringPiece substring); + +// Finds, in haystack (which is a list of tokens separated by delim), an token +// equal to needle. Returns a pointer into haystack, or NULL if not found (or +// either needle or haystack is empty). +const char* strstr_delimited(const char* haystack, + const char* needle, + char delim); + +// Gets the next token from string *stringp, where tokens are strings separated +// by characters from delim. +char* gstrsep(char** stringp, const char* delim); + +// Appends StringPiece(data, len) to *s. +void FastStringAppend(string* s, const char* data, int len); + +// Returns a duplicate of the_string, with memory allocated by new[]. +char* strdup_with_new(const char* the_string); + +// Returns a duplicate of up to the first max_length bytes of the_string, with +// memory allocated by new[]. +char* strndup_with_new(const char* the_string, int max_length); + +// Finds, in the_string, the first "word" (consecutive !ascii_isspace() +// characters). Returns pointer to the beginning of the word, and sets *end_ptr +// to the character after the word (which may be space or '\0'); returns NULL +// (and *end_ptr is undefined) if no next word found. +// end_ptr must not be NULL. +const char* ScanForFirstWord(const char* the_string, const char** end_ptr); +inline char* ScanForFirstWord(char* the_string, char** end_ptr) { + // implicit_cast<> would be more appropriate for casting to const, + // but we save the inclusion of "base/casts.h" here by using const_cast<>. + return const_cast( + ScanForFirstWord(const_cast(the_string), + const_cast(end_ptr))); +} + +// For the following functions, an "identifier" is a letter or underscore, +// followed by letters, underscores, or digits. + +// Returns a pointer past the end of the "identifier" (see above) beginning at +// str, or NULL if str doesn't start with an identifier. 
+const char* AdvanceIdentifier(const char* str); +inline char* AdvanceIdentifier(char* str) { + // implicit_cast<> would be more appropriate for casting to const, + // but we save the inclusion of "base/casts.h" here by using const_cast<>. + return const_cast(AdvanceIdentifier(const_cast(str))); +} + +// Returns whether str is an "identifier" (see above). +bool IsIdentifier(const char* str); + +// Finds the first tag and value in a string of tag/value pairs. +// +// The first pair begins after the first occurrence of attribute_separator (or +// string_terminal, if not '\0'); tag_value_separator separates the tag and +// value; and the value ends before the following occurrence of +// attribute_separator (or string_terminal, if not '\0'). +// +// Returns true (and populates tag, tag_len, value, and value_len) if a +// tag/value pair is founds; returns false otherwise. +bool FindTagValuePair(const char* in_str, char tag_value_separator, + char attribute_separator, char string_terminal, + char** tag, int* tag_len, + char** value, int* value_len); + +// Inserts separator after every interval characters in *s (but never appends to +// the end of the original *s). +void UniformInsertString(string* s, int interval, const char* separator); + +// Inserts separator into s at each specified index. indices must be sorted in +// ascending order. +void InsertString( + string* s, const vector& indices, char const* separator); + +// Finds the nth occurrence of c in n; returns the index in s of that +// occurrence, or string::npos if fewer than n occurrences. +int FindNth(StringPiece s, char c, int n); + +// Finds the nth-to-last occurrence of c in s; returns the index in s of that +// occurrence, or string::npos if fewer than n occurrences. +int ReverseFindNth(StringPiece s, char c, int n); + +// Returns whether s contains only whitespace characters (including the case +// where s is empty). 
+bool OnlyWhitespace(const StringPiece& s); + +// Formats a string in the same fashion as snprintf(), but returns either the +// number of characters written, or zero if not enough space was available. +// (snprintf() returns the number of characters that would have been written if +// enough space had been available.) +// +// A drop-in replacement for the safe_snprintf() macro. +int SafeSnprintf(char* str, size_t size, const char* format, ...) + PRINTF_ATTRIBUTE(3, 4); + +// Reads a line (terminated by delim) from file into *str. Reads delim from +// file, but doesn't copy it into *str. Returns true if read a delim-terminated +// line, or false on end-of-file or error. +bool GetlineFromStdioFile(FILE* file, string* str, char delim); + +#endif // STRINGS_UTIL_H_ diff --git a/src/kudu/gutil/strtoint.cc b/src/kudu/gutil/strtoint.cc new file mode 100644 index 000000000000..bb96a574d3b3 --- /dev/null +++ b/src/kudu/gutil/strtoint.cc @@ -0,0 +1,47 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// +// Architecture-neutral plug compatible replacements for strtol() friends. +// See strtoint.h for details on how to use this component. +// + +#include +#include "kudu/gutil/port.h" +#include "kudu/gutil/strtoint.h" + +// Replacement strto[u]l functions that have identical overflow and underflow +// characteristics for both ILP-32 and LP-64 platforms, including errno +// preservation for error-free calls. 
+int32 strto32_adapter(const char *nptr, char **endptr, int base) { + const int saved_errno = errno; + errno = 0; + const long result = strtol(nptr, endptr, base); + if (errno == ERANGE && result == LONG_MIN) { + return kint32min; + } else if (errno == ERANGE && result == LONG_MAX) { + return kint32max; + } else if (errno == 0 && result < kint32min) { + errno = ERANGE; + return kint32min; + } else if (errno == 0 && result > kint32max) { + errno = ERANGE; + return kint32max; + } + if (errno == 0) + errno = saved_errno; + return static_cast(result); +} + +uint32 strtou32_adapter(const char *nptr, char **endptr, int base) { + const int saved_errno = errno; + errno = 0; + const unsigned long result = strtoul(nptr, endptr, base); + if (errno == ERANGE && result == ULONG_MAX) { + return kuint32max; + } else if (errno == 0 && result > kuint32max) { + errno = ERANGE; + return kuint32max; + } + if (errno == 0) + errno = saved_errno; + return static_cast(result); +} diff --git a/src/kudu/gutil/strtoint.h b/src/kudu/gutil/strtoint.h new file mode 100644 index 000000000000..b58138513762 --- /dev/null +++ b/src/kudu/gutil/strtoint.h @@ -0,0 +1,93 @@ +// Copyright 2008 Google Inc. All Rights Reserved. +// +// Architecture-neutral plug compatible replacements for strtol() friends. +// +// Long's have different lengths on ILP-32 and LP-64 platforms, and so overflow +// behavior across the two varies when strtol() and similar are used to parse +// 32-bit integers. Similar problems exist with atoi(), because although it +// has an all-integer interface, it uses strtol() internally, and so suffers +// from the same narrowing problems on assignments to int. 
+// +// Examples: +// errno = 0; +// i = strtol("3147483647", NULL, 10); +// printf("%d, errno %d\n", i, errno); +// // 32-bit platform: 2147483647, errno 34 +// // 64-bit platform: -1147483649, errno 0 +// +// printf("%d\n", atoi("3147483647")); +// // 32-bit platform: 2147483647 +// // 64-bit platform: -1147483649 +// +// A way round this is to define local replacements for these, and use them +// instead of the standard libc functions. +// +// In most 32-bit cases the replacements can be inlined away to a call to the +// libc function. In a couple of 64-bit cases, however, adapters are required, +// to provide the right overflow and errno behavior. +// + +#ifndef BASE_STRTOINT_H_ +#define BASE_STRTOINT_H_ + +#include // For strtol* functions. +#include +using std::string; +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" + +// Adapter functions for handling overflow and errno. +int32 strto32_adapter(const char *nptr, char **endptr, int base); +uint32 strtou32_adapter(const char *nptr, char **endptr, int base); + +// Conversions to a 32-bit integer can pass the call to strto[u]l on 32-bit +// platforms, but need a little extra work on 64-bit platforms. +inline int32 strto32(const char *nptr, char **endptr, int base) { + if (sizeof(int32) == sizeof(long)) + return static_cast(strtol(nptr, endptr, base)); + else + return strto32_adapter(nptr, endptr, base); +} + +inline uint32 strtou32(const char *nptr, char **endptr, int base) { + if (sizeof(uint32) == sizeof(unsigned long)) + return static_cast(strtoul(nptr, endptr, base)); + else + return strtou32_adapter(nptr, endptr, base); +} + +// For now, long long is 64-bit on all the platforms we care about, so these +// functions can simply pass the call to strto[u]ll. 
+inline int64 strto64(const char *nptr, char **endptr, int base) { + COMPILE_ASSERT(sizeof(int64) == sizeof(long long), + sizeof_int64_is_not_sizeof_long_long); + return strtoll(nptr, endptr, base); +} + +inline uint64 strtou64(const char *nptr, char **endptr, int base) { + COMPILE_ASSERT(sizeof(uint64) == sizeof(unsigned long long), + sizeof_uint64_is_not_sizeof_long_long); + return strtoull(nptr, endptr, base); +} + +// Although it returns an int, atoi() is implemented in terms of strtol, and +// so has differing overflow and underflow behavior. atol is the same. +inline int32 atoi32(const char *nptr) { + return strto32(nptr, NULL, 10); +} + +inline int64 atoi64(const char *nptr) { + return strto64(nptr, NULL, 10); +} + +// Convenience versions of the above that take a string argument. +inline int32 atoi32(const string &s) { + return atoi32(s.c_str()); +} + +inline int64 atoi64(const string &s) { + return atoi64(s.c_str()); +} + +#endif // BASE_STRTOINT_H_ diff --git a/src/kudu/gutil/synchronization_profiling.h b/src/kudu/gutil/synchronization_profiling.h new file mode 100644 index 000000000000..f00887179036 --- /dev/null +++ b/src/kudu/gutil/synchronization_profiling.h @@ -0,0 +1,51 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +/* Copyright (c) 2010, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Chris Ruemmler + */ + +#ifndef BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ +#define BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ + +#include "kudu/gutil/basictypes.h" + +namespace gutil { + +// We can do contention-profiling of SpinLocks, but the code is in +// mutex.cc, which is not always linked in with spinlock. Hence we +// provide a weak definition, which are used if mutex.cc isn't linked in. + +// Submit the number of cycles the spinlock spent contending. +ATTRIBUTE_WEAK extern void SubmitSpinLockProfileData(const void *, int64); +extern void SubmitSpinLockProfileData(const void *contendedlock, + int64 wait_cycles) {} +} +#endif // BASE_AUXILIARY_SYNCHRONIZATION_PROFILING_H_ diff --git a/src/kudu/gutil/sysinfo.cc b/src/kudu/gutil/sysinfo.cc new file mode 100644 index 000000000000..6006b6df1975 --- /dev/null +++ b/src/kudu/gutil/sysinfo.cc @@ -0,0 +1,412 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +// Copyright (c) 2006, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#if (defined(_WIN32) || defined(__MINGW32__)) && !defined(__CYGWIN__) && !defined(__CYGWIN32) +# define PLATFORM_WINDOWS 1 +#endif + +#include // for isspace() +#include // for getenv() +#include // for snprintf(), sscanf() +#include // for memmove(), memchr(), etc. 
// Suspends the calling thread for 'nanoseconds', restarting nanosleep()
// after signal interruptions so the full interval always elapses.
void SleepForNanoseconds(int64_t nanoseconds) {
  const int64_t kNanosPerSecond = 1000 * 1000 * 1000;
  struct timespec remaining;
  remaining.tv_sec = nanoseconds / kNanosPerSecond;
  remaining.tv_nsec = nanoseconds % kNanosPerSecond;
  // On EINTR, nanosleep() writes the unslept time back into 'remaining',
  // so looping on the same struct resumes where the sleep left off.
  while (nanosleep(&remaining, &remaining) != 0 && errno == EINTR) {
  }
}

// Millisecond-granularity wrapper around SleepForNanoseconds().
void SleepForMilliseconds(int64_t milliseconds) {
  SleepForNanoseconds(milliseconds * 1000 * 1000);
}
+static int64 EstimateCyclesPerSecond(const int estimate_time_ms) { + CHECK(estimate_time_ms > 0); + if (estimate_time_ms <= 0) + return 1; + double multiplier = 1000.0 / (double)estimate_time_ms; // scale by this much + + const int64 start_ticks = CycleClock::Now(); + SleepForMilliseconds(estimate_time_ms); + const int64 guess = int64(multiplier * (CycleClock::Now() - start_ticks)); + return guess; +} + +// ReadIntFromFile is only called on linux and cygwin platforms. +#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__) + +// Slurp a file with a single read() call into 'buf'. This is only safe to use on small +// files in places like /proc where we are guaranteed not to get a partial read. +// Any remaining bytes in the buffer are zeroed. +// +// 'buflen' must be more than large enough to hold the whole file, or else this will +// issue a FATAL error. +static bool SlurpSmallTextFile(const char* file, char* buf, int buflen) { + bool ret = false; + int fd = open(file, O_RDONLY); + if (fd == -1) return ret; + + memset(buf, '\0', buflen); + int n = read(fd, buf, buflen - 1); + CHECK_NE(n, buflen - 1) << "buffer of len " << buflen << " not large enough to store " + << "contents of " << file; + if (n > 0) { + ret = true; + } + + close(fd); + return ret; +} + +// Helper function for reading an int from a file. Returns true if successful +// and the memory location pointed to by value is set to the value read. 
+static bool ReadIntFromFile(const char *file, int *value) { + char line[1024]; + if (!SlurpSmallTextFile(file, line, arraysize(line))) { + return false; + } + char* err; + const int temp_value = strtol(line, &err, 10); + if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { + *value = temp_value; + return true; + } + return false; +} + +static int ReadMaxCPUIndex() { + char buf[1024]; + CHECK(SlurpSmallTextFile("/sys/devices/system/cpu/present", buf, arraysize(buf))); + + // On a single-core machine, 'buf' will contain the string '0' with a newline. + if (strcmp(buf, "0\n") == 0) { + return 0; + } + + // On multi-core, it will have a CPU range like '0-7'. + CHECK_EQ(0, memcmp(buf, "0-", 2)) << "bad list of possible CPUs: " << buf; + + char* max_cpu_str = &buf[2]; + char* err; + int val = strtol(max_cpu_str, &err, 10); + CHECK(*err == '\n' || *err == '\0') << "unable to parse max CPU index from: " << buf; + return val; +} + +#endif + +// WARNING: logging calls back to InitializeSystemInfo() so it must +// not invoke any logging code. Also, InitializeSystemInfo() can be +// called before main() -- in fact it *must* be since already_called +// isn't protected -- before malloc hooks are properly set up, so +// we make an effort not to call any routines which might allocate +// memory. + +static void InitializeSystemInfo() { + static bool already_called = false; // safe if we run before threads + if (already_called) return; + already_called = true; + + bool saw_mhz = false; + + if (RunningOnValgrind()) { + // Valgrind may slow the progress of time artificially (--scale-time=N + // option). We thus can't rely on CPU Mhz info stored in /sys or /proc + // files. Thus, actually measure the cps. + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(100); + saw_mhz = true; + } + +#if defined(__linux__) || defined(__CYGWIN__) || defined(__CYGWIN32__) + char line[1024]; + char* err; + int freq; + + // If the kernel is exporting the tsc frequency use that. 
There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporintg an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // If CPU scaling is in effect, we want to use the *maximum* frequency, + // not whatever CPU speed some random processor happens to be using now. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz. For example, on a 2GHz machine, the file + // contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq. + const char* pname = "/proc/cpuinfo"; + int fd = open(pname, O_RDONLY); + if (fd == -1) { + PLOG(FATAL) << "Unable to read CPU info from /proc. 
procfs must be mounted."; + } + + double bogo_clock = 1.0; + bool saw_bogo = false; + int num_cpus = 0; + line[0] = line[1] = '\0'; + int chars_read = 0; + do { // we'll exit when the last read didn't read anything + // Move the next line to the beginning of the buffer + const int oldlinelen = strlen(line); + if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line + line[0] = '\0'; + else // still other lines left to save + memmove(line, line + oldlinelen+1, sizeof(line) - (oldlinelen+1)); + // Terminate the new line, reading more if we can't find the newline + char* newline = strchr(line, '\n'); + if (newline == NULL) { + const int linelen = strlen(line); + const int bytes_to_read = sizeof(line)-1 - linelen; + CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes + chars_read = read(fd, line + linelen, bytes_to_read); + line[linelen + chars_read] = '\0'; + newline = strchr(line, '\n'); + } + if (newline != NULL) + *newline = '\0'; + +#if defined(__powerpc__) || defined(__ppc__) + // PowerPC cpus report the frequency in "clock" line + if (strncasecmp(line, "clock", sizeof("clock")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + // PowerPC frequencies are only reported as MHz (check 'show_cpuinfo' + // function at arch/powerpc/kernel/setup-common.c) + char *endp = strstr(line, "MHz"); + if (endp) { + *endp = 0; + cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) + saw_mhz = true; + } + } +#else + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept postive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. 
+ if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) + saw_mhz = true; + } + } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + bogo_clock = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0) + saw_bogo = true; + } +#endif + } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) { + num_cpus++; // count up every time we see an "processor :" entry + } + } while (chars_read > 0); + close(fd); + + if (!saw_mhz) { + if (saw_bogo) { + // If we didn't find anything better, we'll use bogomips, but + // we're not happy about it. + cpuinfo_cycles_per_second = bogo_clock; + } else { + // If we don't even have bogomips, we'll use the slow estimation. + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } + } + if (cpuinfo_cycles_per_second == 0.0) { + cpuinfo_cycles_per_second = 1.0; // maybe unnecessary, but safe + } + if (num_cpus > 0) { + cpuinfo_num_cpus = num_cpus; + } + cpuinfo_max_cpu_index = ReadMaxCPUIndex(); + +#elif defined __FreeBSD__ + // For this sysctl to work, the machine must be configured without + // SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0 + // and later. Before that, it's a 32-bit quantity (and gives the + // wrong answer on machines faster than 2^32 Hz). 
See + // http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html + // But also compare FreeBSD 7.0: + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223 + // 231 error = sysctl_handle_quad(oidp, &freq, 0, req); + // To FreeBSD 6.3 (it's the same in 6-STABLE): + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131 + // 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); +#if __FreeBSD__ >= 7 + uint64_t hz = 0; +#else + unsigned int hz = 0; +#endif + size_t sz = sizeof(hz); + const char *sysctl_path = "machdep.tsc_freq"; + if ( sysctlbyname(sysctl_path, &hz, &sz, NULL, 0) != 0 ) { + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", + sysctl_path, strerror(errno)); + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } else { + cpuinfo_cycles_per_second = hz; + } + // TODO(csilvers): also figure out cpuinfo_num_cpus + +#elif defined(PLATFORM_WINDOWS) +# pragma comment(lib, "shlwapi.lib") // for SHGetValue() + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. + OSVERSIONINFO os; + os.dwOSVersionInfoSize = sizeof(os); + DWORD data, data_size = sizeof(data); + if (GetVersionEx(&os) && + os.dwPlatformId == VER_PLATFORM_WIN32_NT && + SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", NULL, &data, &data_size))) + cpuinfo_cycles_per_second = (int64)data * (int64)(1000 * 1000); // was mhz + else + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(500); // TODO <500? + + // Get the number of processors. + SYSTEM_INFO info; + GetSystemInfo(&info); + cpuinfo_num_cpus = info.dwNumberOfProcessors; + +#elif defined(__MACH__) && defined(__APPLE__) + // returning "mach time units" per second. 
the current number of elapsed + // mach time units can be found by calling uint64 mach_absolute_time(); + // while not as precise as actual CPU cycles, it is accurate in the face + // of CPU frequency scaling and multi-cpu/core machines. + // Our mac users have these types of machines, and accuracy + // (i.e. correctness) trumps precision. + // See cycleclock.h: CycleClock::Now(), which returns number of mach time + // units on Mac OS X. + mach_timebase_info_data_t timebase_info; + mach_timebase_info(&timebase_info); + double mach_time_units_per_nanosecond = + static_cast(timebase_info.denom) / + static_cast(timebase_info.numer); + cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9; + + int num_cpus = 0; + size_t size = sizeof(num_cpus); + int numcpus_name[] = { CTL_HW, HW_NCPU }; + if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, nullptr, 0) + == 0 + && (size == sizeof(num_cpus))) + cpuinfo_num_cpus = num_cpus; + +#else + // Generic cycles per second counter + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); +#endif + + // On platforms where we can't determine the max CPU index, just use the + // number of CPUs. This might break if CPUs are taken offline, but + // better than a wild guess. + if (cpuinfo_max_cpu_index < 0) { + cpuinfo_max_cpu_index = cpuinfo_num_cpus - 1; + } +} + +double CyclesPerSecond(void) { + InitializeSystemInfo(); + return cpuinfo_cycles_per_second; +} + +int NumCPUs(void) { + InitializeSystemInfo(); + return cpuinfo_num_cpus; +} + +int MaxCPUIndex(void) { + InitializeSystemInfo(); + return cpuinfo_max_cpu_index; +} + +} // namespace base diff --git a/src/kudu/gutil/sysinfo.h b/src/kudu/gutil/sysinfo.h new file mode 100644 index 000000000000..ec3abe7bc814 --- /dev/null +++ b/src/kudu/gutil/sysinfo.h @@ -0,0 +1,55 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- +// Copyright (c) 2006, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef _SYSINFO_H_ +#define _SYSINFO_H_ + +namespace base { + +// Return the number of online CPUs. This is computed and cached the first time this or +// NumCPUs() is called, so does not reflect any CPUs enabled or disabled at a later +// point in time. +// +// Note that, if not all CPUs are online, this may return a value lower than the maximum +// value of sched_getcpu(). +extern int NumCPUs(); + +// Return the maximum CPU index that may be returned by sched_getcpu(). 
For example, on +// an 8-core machine, this will return '7' even if some of the CPUs have been disabled. +extern int MaxCPUIndex(); + +void SleepForNanoseconds(int64_t nanoseconds); +void SleepForMilliseconds(int64_t milliseconds); + +// processor cycles per second of each processor. Thread-safe. +extern double CyclesPerSecond(void); + +} // namespace base +#endif /* #ifndef _SYSINFO_H_ */ diff --git a/src/kudu/gutil/template_util.h b/src/kudu/gutil/template_util.h new file mode 100644 index 000000000000..aebfa163bbad --- /dev/null +++ b/src/kudu/gutil/template_util.h @@ -0,0 +1,164 @@ +// Copyright 2005 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// ---- +// +// Template metaprogramming utility functions. +// +// This code is compiled directly on many platforms, including client +// platforms like Windows, Mac, and embedded systems. Before making +// any changes here, make sure that you're not breaking any platforms. +// +// +// The names choosen here reflect those used in tr1 and the boost::mpl +// library, there are similar operations used in the Loki library as +// well. I prefer the boost names for 2 reasons: +// 1. I think that portions of the Boost libraries are more likely to +// be included in the c++ standard. +// 2. It is not impossible that some of the boost libraries will be +// included in our own build in the future. +// Both of these outcomes means that we may be able to directly replace +// some of these with boost equivalents. +// +#ifndef BASE_TEMPLATE_UTIL_H_ +#define BASE_TEMPLATE_UTIL_H_ + +namespace base { + +// Types small_ and big_ are guaranteed such that sizeof(small_) < +// sizeof(big_) +typedef char small_; + +struct big_ { + char dummy[2]; +}; + +// Types YesType and NoType are guaranteed such that sizeof(YesType) < +// sizeof(NoType). +typedef small_ YesType; +typedef big_ NoType; + +// Identity metafunction. +template +struct identity_ { + typedef T type; +}; + +// integral_constant, defined in tr1, is a wrapper for an integer +// value. We don't really need this generality; we could get away +// with hardcoding the integer type to bool. 
We use the fully +// general integer_constant for compatibility with tr1. + +template +struct integral_constant { + static const T value = v; + typedef T value_type; + typedef integral_constant type; +}; + +template const T integral_constant::value; + + +// Abbreviations: true_type and false_type are structs that represent boolean +// true and false values. Also define the boost::mpl versions of those names, +// true_ and false_. +typedef integral_constant true_type; +typedef integral_constant false_type; +typedef true_type true_; +typedef false_type false_; + +template struct is_non_const_reference : false_type {}; +template struct is_non_const_reference : true_type {}; +template struct is_non_const_reference : false_type {}; + +template struct is_const : false_type {}; +template struct is_const : true_type {}; + +template struct is_void : false_type {}; +template <> struct is_void : true_type {}; + +// if_ is a templatized conditional statement. +// if_ is a compile time evaluation of cond. +// if_<>::type contains A if cond is true, B otherwise. +template +struct if_{ + typedef A type; +}; + +template +struct if_ { + typedef B type; +}; + + +// type_equals_ is a template type comparator, similar to Loki IsSameType. +// type_equals_::value is true iff "A" is the same type as "B". +// +// New code should prefer base::is_same, defined in base/type_traits.h. +// It is functionally identical, but is_same is the standard spelling. +template +struct type_equals_ : public false_ { +}; + +template +struct type_equals_ : public true_ { +}; + +// and_ is a template && operator. +// and_::value evaluates "A::value && B::value". +template +struct and_ : public integral_constant { +}; + +// or_ is a template || operator. +// or_::value evaluates "A::value || B::value". +template +struct or_ : public integral_constant { +}; + +// Used to determine if a type is a struct/union/class. Inspired by Boost's +// is_class type_trait implementation. 
+struct IsClassHelper { + template + static YesType Test(void(C::*)(void)); + + template + static NoType Test(...); +}; + +template +struct is_class + : integral_constant(0)) == + sizeof(YesType)> { +}; + +} + +#endif // BASE_TEMPLATE_UTIL_H_ diff --git a/src/kudu/gutil/thread_annotations.h b/src/kudu/gutil/thread_annotations.h new file mode 100644 index 000000000000..924a090bf01b --- /dev/null +++ b/src/kudu/gutil/thread_annotations.h @@ -0,0 +1,236 @@ +// Copyright (c) 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// --- +// +// +// This header file contains the macro definitions for thread safety +// annotations that allow the developers to document the locking policies +// of their multi-threaded code. The annotations can also help program +// analysis tools to identify potential thread safety issues. +// +// +// The annotations are implemented using GCC's "attributes" extension. +// Using the macros defined here instead of the raw GCC attributes allows +// for portability and future compatibility. +// + +#ifndef BASE_THREAD_ANNOTATIONS_H_ +#define BASE_THREAD_ANNOTATIONS_H_ + + +#if defined(__GNUC__) && defined(__SUPPORT_TS_ANNOTATION__) && !defined(SWIG) +#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#else +#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif + +#if defined(__GNUC__) && !defined(__clang__) + +// Document if a shared variable/field needs to be protected by a lock. +// GUARDED_BY allows the user to specify a particular lock that should be +// held when accessing the annotated variable, while GUARDED_VAR only +// indicates a shared variable should be guarded (by any lock). GUARDED_VAR +// is primarily used when the client cannot express the name of the lock. 
+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) +#define GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(guarded) + +// Document if the memory location pointed to by a pointer should be guarded +// by a lock when dereferencing the pointer. Similar to GUARDED_VAR, +// PT_GUARDED_VAR is primarily used when the client cannot express the name +// of the lock. Note that a pointer variable to a shared memory location +// could itself be a shared variable. For example, if a shared global pointer +// q, which is guarded by mu1, points to a shared memory location that is +// guarded by mu2, q should be annotated as follows: +// int *q GUARDED_BY(mu1) PT_GUARDED_BY(mu2); +#define PT_GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded_by(x)) +#define PT_GUARDED_VAR \ + THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded) + +// Document the acquisition order between locks that can be held +// simultaneously by a thread. For any two locks that need to be annotated +// to establish an acquisition order, only one of them needs the annotation. +// (i.e. You don't have to annotate both locks with both ACQUIRED_AFTER +// and ACQUIRED_BEFORE.) +#define ACQUIRED_AFTER(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) +#define ACQUIRED_BEFORE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) + +// The following three annotations document the lock requirements for +// functions/methods. + +// Document if a function expects certain locks to be held before it is called +#define EXCLUSIVE_LOCKS_REQUIRED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(exclusive_locks_required(__VA_ARGS__)) + +#define SHARED_LOCKS_REQUIRED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(shared_locks_required(__VA_ARGS__)) + +// Document the locks acquired in the body of the function. These locks +// non-reentrant). +#define LOCKS_EXCLUDED(...) 
\ + THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) + +// Document the lock the annotated function returns without acquiring it. +#define LOCK_RETURNED(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) + +// Document if a class/type is a lockable type (such as the Mutex class). +#define LOCKABLE THREAD_ANNOTATION_ATTRIBUTE__(lockable) + +// Document if a class is a scoped lockable type (such as the MutexLock class). +#define SCOPED_LOCKABLE THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) + +// The following annotations specify lock and unlock primitives. +#define EXCLUSIVE_LOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock(__VA_ARGS__)) + +#define SHARED_LOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(shared_lock(__VA_ARGS__)) + +#define EXCLUSIVE_TRYLOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock(__VA_ARGS__)) + +#define SHARED_TRYLOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock(__VA_ARGS__)) + +#define UNLOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(unlock(__VA_ARGS__)) + +// An escape hatch for thread safety analysis to ignore the annotated function. +#define NO_THREAD_SAFETY_ANALYSIS \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + +// Used to mark functions that need to be fixed, because they are producing +// thread safety warnings. This macro is intended primarily for use by the +// compiler team; it allows new thread safety warnings to be rolled out +// without breaking existing code. Code which triggers the new warnings are +// marked with a FIXME, and referred back to the code owners to fix. +#define NO_THREAD_SAFETY_ANALYSIS_FIXME \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + +// NO_THREAD_SAFETY_ANALYSIS_OPT turns off thread-safety checking in the +// annotated function in opt (NDEBUG) mode. It is for use specifically when +// the thread-safety checker is failing in opt mode on an otherwise correct +// piece of code. 
+#ifdef NDEBUG +#define NO_THREAD_SAFETY_ANALYSIS_OPT NO_THREAD_SAFETY_ANALYSIS +#else +#define NO_THREAD_SAFETY_ANALYSIS_OPT +#endif + +// TS_UNCHECKED should be placed around lock expressions that are not valid +// C++ syntax, but which are present for documentation purposes. The +// expressions are passed unchanged to gcc, which will usually treat them +// as the universal lock. +#define TS_UNCHECKED(x) x + +// TS_FIXME is used to mark lock expressions that are not valid C++ syntax. +// This annotation should eventually be either fixed, or changed to +// TS_UNCHECKED. +#define TS_FIXME(x) x + +// This is used to pass different annotations to gcc and clang, in cases where +// gcc would reject a lock expression (e.g. &MyClass::mu_) that is accepted +// by clang. This is seldom needed, since GCC usually ignores invalid lock +// expressions except in certain cases, such as LOCK_RETURNED. +#define TS_CLANG_ONLY(CLANG_EXPR, GCC_EXPR) GCC_EXPR + +// Clang Attributes +// The names of attributes in the clang analysis are slightly different +#else + +#define GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) + +#define GUARDED_VAR \ + THREAD_ANNOTATION_ATTRIBUTE__(guarded) + +#define PT_GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x)) + +#define PT_GUARDED_VAR \ + THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded) + +#define ACQUIRED_AFTER(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) + +#define ACQUIRED_BEFORE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) + +#define EXCLUSIVE_LOCKS_REQUIRED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(exclusive_locks_required(__VA_ARGS__)) + +#define SHARED_LOCKS_REQUIRED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(shared_locks_required(__VA_ARGS__)) + +#define LOCKS_EXCLUDED(...) 
\ + THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) + +#define LOCK_RETURNED(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) + +#define LOCKABLE \ + THREAD_ANNOTATION_ATTRIBUTE__(lockable) + +#define SCOPED_LOCKABLE \ + THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) + +#define EXCLUSIVE_LOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock_function(__VA_ARGS__)) + +#define SHARED_LOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(shared_lock_function(__VA_ARGS__)) + +#define EXCLUSIVE_TRYLOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock_function(__VA_ARGS__)) + +#define SHARED_TRYLOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock_function(__VA_ARGS__)) + +#define UNLOCK_FUNCTION(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(unlock_function(__VA_ARGS__)) + +#define NO_THREAD_SAFETY_ANALYSIS \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + +#define NO_THREAD_SAFETY_ANALYSIS_OPT + +#define NO_THREAD_SAFETY_ANALYSIS_FIXME \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + +#define TS_UNCHECKED(x) "" + +#define TS_FIXME(x) "" + +#define TS_CLANG_ONLY(CLANG_EXPR, GCC_EXPR) CLANG_EXPR + +#endif // defined(__clang__) + +#endif // BASE_THREAD_ANNOTATIONS_H_ diff --git a/src/kudu/gutil/threading/thread_collision_warner.cc b/src/kudu/gutil/threading/thread_collision_warner.cc new file mode 100644 index 000000000000..89270d85bee9 --- /dev/null +++ b/src/kudu/gutil/threading/thread_collision_warner.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "kudu/gutil/threading/thread_collision_warner.h" + +#include + +#include "kudu/gutil/linux_syscall_support.h" + +namespace base { + +void DCheckAsserter::warn() { + LOG(FATAL) << "Thread Collision"; +} + +#if 0 +// Original source from Chromium -- we didn't import their threading library +// into Cloudera source as of yet + +static subtle::Atomic32 CurrentThread() { + const PlatformThreadId current_thread_id = PlatformThread::CurrentId(); + // We need to get the thread id into an atomic data type. This might be a + // truncating conversion, but any loss-of-information just increases the + // chance of a fault negative, not a false positive. + const subtle::Atomic32 atomic_thread_id = + static_cast(current_thread_id); + + return atomic_thread_id; +} +#else + +static subtle::Atomic64 CurrentThread() { +#if defined(__APPLE__) + uint64_t tid; + CHECK_EQ(0, pthread_threadid_np(NULL, &tid)); + return tid; +#elif defined(__linux__) + return syscall(__NR_gettid); +#endif +} + +#endif + +void ThreadCollisionWarner::EnterSelf() { + // If the active thread is 0 then I'll write the current thread ID + // if two or more threads arrive here only one will succeed to + // write on valid_thread_id_ the current thread ID. + subtle::Atomic64 current_thread_id = CurrentThread(); + + int64_t previous_value = subtle::NoBarrier_CompareAndSwap(&valid_thread_id_, + 0, + current_thread_id); + if (previous_value != 0 && previous_value != current_thread_id) { + // gotcha! a thread is trying to use the same class and that is + // not current thread. + asserter_->warn(); + } + + subtle::NoBarrier_AtomicIncrement(&counter_, 1); +} + +void ThreadCollisionWarner::Enter() { + subtle::Atomic64 current_thread_id = CurrentThread(); + + if (subtle::NoBarrier_CompareAndSwap(&valid_thread_id_, + 0, + current_thread_id) != 0) { + // gotcha! another thread is trying to use the same class. 
+ asserter_->warn(); + } + + subtle::NoBarrier_AtomicIncrement(&counter_, 1); +} + +void ThreadCollisionWarner::Leave() { + if (subtle::Barrier_AtomicIncrement(&counter_, -1) == 0) { + subtle::NoBarrier_Store(&valid_thread_id_, 0); + } +} + +} // namespace base diff --git a/src/kudu/gutil/threading/thread_collision_warner.h b/src/kudu/gutil/threading/thread_collision_warner.h new file mode 100644 index 000000000000..d59ea67c5dbc --- /dev/null +++ b/src/kudu/gutil/threading/thread_collision_warner.h @@ -0,0 +1,248 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_THREADING_THREAD_COLLISION_WARNER_H_ +#define BASE_THREADING_THREAD_COLLISION_WARNER_H_ + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/macros.h" + +#ifndef BASE_EXPORT +#define BASE_EXPORT +#endif + +// A helper class alongside macros to be used to verify assumptions about thread +// safety of a class. +// +// Example: Queue implementation non thread-safe but still usable if clients +// are synchronized somehow. +// +// In this case the macro DFAKE_SCOPED_LOCK has to be +// used, it checks that if a thread is inside the push/pop then +// noone else is still inside the pop/push +// +// class NonThreadSafeQueue { +// public: +// ... +// void push(int) { DFAKE_SCOPED_LOCK(push_pop_); ... } +// int pop() { DFAKE_SCOPED_LOCK(push_pop_); ... } +// ... 
+// private: +// DFAKE_MUTEX(push_pop_); +// }; +// +// +// Example: Queue implementation non thread-safe but still usable if clients +// are synchronized somehow, it calls a method to "protect" from +// a "protected" method +// +// In this case the macro DFAKE_SCOPED_RECURSIVE_LOCK +// has to be used, it checks that if a thread is inside the push/pop +// then noone else is still inside the pop/push +// +// class NonThreadSafeQueue { +// public: +// void push(int) { +// DFAKE_SCOPED_LOCK(push_pop_); +// ... +// } +// int pop() { +// DFAKE_SCOPED_RECURSIVE_LOCK(push_pop_); +// bar(); +// ... +// } +// void bar() { DFAKE_SCOPED_RECURSIVE_LOCK(push_pop_); ... } +// ... +// private: +// DFAKE_MUTEX(push_pop_); +// }; +// +// +// Example: Queue implementation not usable even if clients are synchronized, +// so only one thread in the class life cycle can use the two members +// push/pop. +// +// In this case the macro DFAKE_SCOPED_LOCK_THREAD_LOCKED pins the +// specified +// critical section the first time a thread enters push or pop, from +// that time on only that thread is allowed to execute push or pop. +// +// class NonThreadSafeQueue { +// public: +// ... +// void push(int) { DFAKE_SCOPED_LOCK_THREAD_LOCKED(push_pop_); ... } +// int pop() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(push_pop_); ... } +// ... +// private: +// DFAKE_MUTEX(push_pop_); +// }; +// +// +// Example: Class that has to be contructed/destroyed on same thread, it has +// a "shareable" method (with external synchronization) and a not +// shareable method (even with external synchronization). +// +// In this case 3 Critical sections have to be defined +// +// class ExoticClass { +// public: +// ExoticClass() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); ... } +// ~ExoticClass() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); ... } +// +// void Shareable() { DFAKE_SCOPED_LOCK(shareable_section_); ... } +// void NotShareable() { DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); ... } +// ... 
+// private: +// DFAKE_MUTEX(ctor_dtor_); +// DFAKE_MUTEX(shareable_section_); +// }; + + +#if !defined(NDEBUG) + +// Defines a class member that acts like a mutex. It is used only as a +// verification tool. +#define DFAKE_MUTEX(obj) \ + mutable base::ThreadCollisionWarner obj +// Asserts the call is never called simultaneously in two threads. Used at +// member function scope. +#define DFAKE_SCOPED_LOCK(obj) \ + base::ThreadCollisionWarner::ScopedCheck s_check_##obj(&obj) +// Asserts the call is never called simultaneously in two threads. Used at +// member function scope. Same as DFAKE_SCOPED_LOCK but allows recursive locks. +#define DFAKE_SCOPED_RECURSIVE_LOCK(obj) \ + base::ThreadCollisionWarner::ScopedRecursiveCheck sr_check_##obj(&obj) +// Asserts the code is always executed in the same thread. +#define DFAKE_SCOPED_LOCK_THREAD_LOCKED(obj) \ + base::ThreadCollisionWarner::Check check_##obj(&obj) + +#else + +#define DFAKE_MUTEX(obj) typedef void InternalFakeMutexType##obj +#define DFAKE_SCOPED_LOCK(obj) ((void)0) +#define DFAKE_SCOPED_RECURSIVE_LOCK(obj) ((void)0) +#define DFAKE_SCOPED_LOCK_THREAD_LOCKED(obj) ((void)0) + +#endif + +namespace base { + +// The class ThreadCollisionWarner uses an Asserter to notify the collision +// AsserterBase is the interfaces and DCheckAsserter is the default asserter +// used. 
During the unit tests is used another class that doesn't "DCHECK" +// in case of collision (check thread_collision_warner_unittests.cc) +struct BASE_EXPORT AsserterBase { + virtual ~AsserterBase() {} + virtual void warn() = 0; +}; + +struct BASE_EXPORT DCheckAsserter : public AsserterBase { + virtual ~DCheckAsserter() {} + virtual void warn() OVERRIDE; +}; + +class BASE_EXPORT ThreadCollisionWarner { + public: + // The parameter asserter is there only for test purpose + explicit ThreadCollisionWarner(AsserterBase* asserter = new DCheckAsserter()) + : valid_thread_id_(0), + counter_(0), + asserter_(asserter) {} + + ~ThreadCollisionWarner() { + delete asserter_; + } + + // This class is meant to be used through the macro + // DFAKE_SCOPED_LOCK_THREAD_LOCKED + // it doesn't leave the critical section, as opposed to ScopedCheck, + // because the critical section being pinned is allowed to be used only + // from one thread + class BASE_EXPORT Check { + public: + explicit Check(ThreadCollisionWarner* warner) + : warner_(warner) { + warner_->EnterSelf(); + } + + ~Check() {} + + private: + ThreadCollisionWarner* warner_; + + DISALLOW_COPY_AND_ASSIGN(Check); + }; + + // This class is meant to be used through the macro + // DFAKE_SCOPED_LOCK + class BASE_EXPORT ScopedCheck { + public: + explicit ScopedCheck(ThreadCollisionWarner* warner) + : warner_(warner) { + warner_->Enter(); + } + + ~ScopedCheck() { + warner_->Leave(); + } + + private: + ThreadCollisionWarner* warner_; + + DISALLOW_COPY_AND_ASSIGN(ScopedCheck); + }; + + // This class is meant to be used through the macro + // DFAKE_SCOPED_RECURSIVE_LOCK + class BASE_EXPORT ScopedRecursiveCheck { + public: + explicit ScopedRecursiveCheck(ThreadCollisionWarner* warner) + : warner_(warner) { + warner_->EnterSelf(); + } + + ~ScopedRecursiveCheck() { + warner_->Leave(); + } + + private: + ThreadCollisionWarner* warner_; + + DISALLOW_COPY_AND_ASSIGN(ScopedRecursiveCheck); + }; + + private: + // This method stores the current 
thread identifier and does a DCHECK + // if a another thread has already done it, it is safe if same thread + // calls this multiple time (recursion allowed). + void EnterSelf(); + + // Same as EnterSelf but recursion is not allowed. + void Enter(); + + // Removes the thread_id stored in order to allow other threads to + // call EnterSelf or Enter. + void Leave(); + + // This stores the thread id that is inside the critical section, if the + // value is 0 then no thread is inside. + volatile subtle::Atomic64 valid_thread_id_; + + // Counter to trace how many time a critical section was "pinned" + // (when allowed) in order to unpin it when counter_ reaches 0. + volatile subtle::Atomic64 counter_; + + // Here only for class unit tests purpose, during the test I need to not + // DCHECK but notify the collision with something else. + AsserterBase* asserter_; + + DISALLOW_COPY_AND_ASSIGN(ThreadCollisionWarner); +}; + +} // namespace base + +#endif // BASE_THREADING_THREAD_COLLISION_WARNER_H_ diff --git a/src/kudu/gutil/tuple.h b/src/kudu/gutil/tuple.h new file mode 100644 index 000000000000..e496612ed468 --- /dev/null +++ b/src/kudu/gutil/tuple.h @@ -0,0 +1,1291 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// A Tuple is a generic templatized container, similar in concept to std::pair. +// There are classes Tuple0 to Tuple6, cooresponding to the number of elements +// it contains. The convenient MakeTuple() function takes 0 to 6 arguments, +// and will construct and return the appropriate Tuple object. The functions +// DispatchToMethod and DispatchToFunction take a function pointer or instance +// and method pointer, and unpack a tuple into arguments to the call. +// +// Tuple elements are copied by value, and stored in the tuple. See the unit +// tests for more details of how/when the values are copied. 
+// +// Example usage: +// // These two methods of creating a Tuple are identical. +// Tuple2 tuple_a(1, "wee"); +// Tuple2 tuple_b = MakeTuple(1, "wee"); +// +// void SomeFunc(int a, const char* b) { } +// DispatchToFunction(&SomeFunc, tuple_a); // SomeFunc(1, "wee") +// DispatchToFunction( +// &SomeFunc, MakeTuple(10, "foo")); // SomeFunc(10, "foo") +// +// struct { void SomeMeth(int a, int b, int c) { } } foo; +// DispatchToMethod(&foo, &Foo::SomeMeth, MakeTuple(1, 2, 3)); +// // foo->SomeMeth(1, 2, 3); + +#ifndef BASE_TUPLE_H__ +#define BASE_TUPLE_H__ + +#include "kudu/gutil/bind_helpers.h" + +// Traits ---------------------------------------------------------------------- +// +// A simple traits class for tuple arguments. +// +// ValueType: the bare, nonref version of a type (same as the type for nonrefs). +// RefType: the ref version of a type (same as the type for refs). +// ParamType: what type to pass to functions (refs should not be constified). + +template +struct TupleTraits { + typedef P ValueType; + typedef P& RefType; + typedef const P& ParamType; +}; + +template +struct TupleTraits { + typedef P ValueType; + typedef P& RefType; + typedef P& ParamType; +}; + +template +struct TupleTypes { }; + +// Tuple ----------------------------------------------------------------------- +// +// This set of classes is useful for bundling 0 or more heterogeneous data types +// into a single variable. The advantage of this is that it greatly simplifies +// function objects that need to take an arbitrary number of parameters; see +// RunnableMethod and IPC::MessageWithTuple. +// +// Tuple0 is supplied to act as a 'void' type. It can be used, for example, +// when dispatching to a function that accepts no arguments (see the +// Dispatchers below). +// Tuple1 is rarely useful. One such use is when A is non-const ref that you +// want filled by the dispatchee, and the tuple is merely a container for that +// output (a "tier"). See MakeRefTuple and its usages. 
+ +struct Tuple0 { + typedef Tuple0 ValueTuple; + typedef Tuple0 RefTuple; + typedef Tuple0 ParamTuple; +}; + +template +struct Tuple1 { + public: + typedef A TypeA; + + Tuple1() {} + explicit Tuple1(typename TupleTraits::ParamType a) : a(a) {} + + A a; +}; + +template +struct Tuple2 { + public: + typedef A TypeA; + typedef B TypeB; + + Tuple2() {} + Tuple2(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b) + : a(a), b(b) { + } + + A a; + B b; +}; + +template +struct Tuple3 { + public: + typedef A TypeA; + typedef B TypeB; + typedef C TypeC; + + Tuple3() {} + Tuple3(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b, + typename TupleTraits::ParamType c) + : a(a), b(b), c(c){ + } + + A a; + B b; + C c; +}; + +template +struct Tuple4 { + public: + typedef A TypeA; + typedef B TypeB; + typedef C TypeC; + typedef D TypeD; + + Tuple4() {} + Tuple4(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b, + typename TupleTraits::ParamType c, + typename TupleTraits::ParamType d) + : a(a), b(b), c(c), d(d) { + } + + A a; + B b; + C c; + D d; +}; + +template +struct Tuple5 { + public: + typedef A TypeA; + typedef B TypeB; + typedef C TypeC; + typedef D TypeD; + typedef E TypeE; + + Tuple5() {} + Tuple5(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b, + typename TupleTraits::ParamType c, + typename TupleTraits::ParamType d, + typename TupleTraits::ParamType e) + : a(a), b(b), c(c), d(d), e(e) { + } + + A a; + B b; + C c; + D d; + E e; +}; + +template +struct Tuple6 { + public: + typedef A TypeA; + typedef B TypeB; + typedef C TypeC; + typedef D TypeD; + typedef E TypeE; + typedef F TypeF; + + Tuple6() {} + Tuple6(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b, + typename TupleTraits::ParamType c, + typename TupleTraits::ParamType d, + typename TupleTraits::ParamType e, + typename TupleTraits::ParamType f) + : a(a), b(b), c(c), d(d), e(e), f(f) { + } + + A a; + B b; + C c; + 
D d; + E e; + F f; +}; + +template +struct Tuple7 { + public: + typedef A TypeA; + typedef B TypeB; + typedef C TypeC; + typedef D TypeD; + typedef E TypeE; + typedef F TypeF; + typedef G TypeG; + + Tuple7() {} + Tuple7(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b, + typename TupleTraits::ParamType c, + typename TupleTraits::ParamType d, + typename TupleTraits::ParamType e, + typename TupleTraits::ParamType f, + typename TupleTraits::ParamType g) + : a(a), b(b), c(c), d(d), e(e), f(f), g(g) { + } + + A a; + B b; + C c; + D d; + E e; + F f; + G g; +}; + +template +struct Tuple8 { + public: + typedef A TypeA; + typedef B TypeB; + typedef C TypeC; + typedef D TypeD; + typedef E TypeE; + typedef F TypeF; + typedef G TypeG; + typedef H TypeH; + + Tuple8() {} + Tuple8(typename TupleTraits::ParamType a, + typename TupleTraits::ParamType b, + typename TupleTraits::ParamType c, + typename TupleTraits::ParamType d, + typename TupleTraits::ParamType e, + typename TupleTraits::ParamType f, + typename TupleTraits::ParamType g, + typename TupleTraits::ParamType h) + : a(a), b(b), c(c), d(d), e(e), f(f), g(g), h(h) { + } + + A a; + B b; + C c; + D d; + E e; + F f; + G g; + H h; +}; + +// Tuple types ---------------------------------------------------------------- +// +// Allows for selection of ValueTuple/RefTuple/ParamTuple without needing the +// definitions of class types the tuple takes as parameters. 
+ +template <> +struct TupleTypes< Tuple0 > { + typedef Tuple0 ValueTuple; + typedef Tuple0 RefTuple; + typedef Tuple0 ParamTuple; +}; + +template +struct TupleTypes< Tuple1 > { + typedef Tuple1::ValueType> ValueTuple; + typedef Tuple1::RefType> RefTuple; + typedef Tuple1::ParamType> ParamTuple; +}; + +template +struct TupleTypes< Tuple2 > { + typedef Tuple2::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple2::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple2::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + +template +struct TupleTypes< Tuple3 > { + typedef Tuple3::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple3::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple3::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + +template +struct TupleTypes< Tuple4 > { + typedef Tuple4::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple4::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple4::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + +template +struct TupleTypes< Tuple5 > { + typedef Tuple5::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple5::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple5::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + 
+template +struct TupleTypes< Tuple6 > { + typedef Tuple6::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple6::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple6::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + +template +struct TupleTypes< Tuple7 > { + typedef Tuple7::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple7::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple7::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + +template +struct TupleTypes< Tuple8 > { + typedef Tuple8::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType, + typename TupleTraits::ValueType> ValueTuple; +typedef Tuple8::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename TupleTraits::RefType, + typename 
TupleTraits::RefType, + typename TupleTraits::RefType> RefTuple; + typedef Tuple8::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType, + typename TupleTraits::ParamType> ParamTuple; +}; + +// Tuple creators ------------------------------------------------------------- +// +// Helper functions for constructing tuples while inferring the template +// argument types. + +inline Tuple0 MakeTuple() { + return Tuple0(); +} + +template +inline Tuple1 MakeTuple(const A& a) { + return Tuple1(a); +} + +template +inline Tuple2 MakeTuple(const A& a, const B& b) { + return Tuple2(a, b); +} + +template +inline Tuple3 MakeTuple(const A& a, const B& b, const C& c) { + return Tuple3(a, b, c); +} + +template +inline Tuple4 MakeTuple(const A& a, const B& b, const C& c, + const D& d) { + return Tuple4(a, b, c, d); +} + +template +inline Tuple5 MakeTuple(const A& a, const B& b, const C& c, + const D& d, const E& e) { + return Tuple5(a, b, c, d, e); +} + +template +inline Tuple6 MakeTuple(const A& a, const B& b, const C& c, + const D& d, const E& e, const F& f) { + return Tuple6(a, b, c, d, e, f); +} + +template +inline Tuple7 MakeTuple(const A& a, const B& b, const C& c, + const D& d, const E& e, const F& f, + const G& g) { + return Tuple7(a, b, c, d, e, f, g); +} + +template +inline Tuple8 MakeTuple(const A& a, const B& b, + const C& c, const D& d, + const E& e, const F& f, + const G& g, const H& h) { + return Tuple8(a, b, c, d, e, f, g, h); +} + +// The following set of helpers make what Boost refers to as "Tiers" - a tuple +// of references. 
+ +template +inline Tuple1 MakeRefTuple(A& a) { + return Tuple1(a); +} + +template +inline Tuple2 MakeRefTuple(A& a, B& b) { + return Tuple2(a, b); +} + +template +inline Tuple3 MakeRefTuple(A& a, B& b, C& c) { + return Tuple3(a, b, c); +} + +template +inline Tuple4 MakeRefTuple(A& a, B& b, C& c, D& d) { + return Tuple4(a, b, c, d); +} + +template +inline Tuple5 MakeRefTuple(A& a, B& b, C& c, D& d, E& e) { + return Tuple5(a, b, c, d, e); +} + +template +inline Tuple6 MakeRefTuple(A& a, B& b, C& c, D& d, E& e, + F& f) { + return Tuple6(a, b, c, d, e, f); +} + +template +inline Tuple7 MakeRefTuple(A& a, B& b, C& c, D& d, + E& e, F& f, G& g) { + return Tuple7(a, b, c, d, e, f, g); +} + +template +inline Tuple8 MakeRefTuple(A& a, B& b, C& c, + D& d, E& e, F& f, + G& g, H& h) { + return Tuple8(a, b, c, d, e, f, g, h); +} + +// Dispatchers ---------------------------------------------------------------- +// +// Helper functions that call the given method on an object, with the unpacked +// tuple arguments. Notice that they all have the same number of arguments, +// so you need only write: +// DispatchToMethod(object, &Object::method, args); +// This is very useful for templated dispatchers, since they don't need to know +// what type |args| is. + +// Non-Static Dispatchers with no out params. 
+ +template +inline void DispatchToMethod(ObjT* obj, Method method, const Tuple0& arg) { + (obj->*method)(); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, const A& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, const Tuple1& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a)); +} + +template +inline void DispatchToMethod(ObjT* obj, + Method method, + const Tuple2& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple7& arg) { + 
(obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f), + kudu::internal::UnwrapTraits::Unwrap(arg.g)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple8& arg) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f), + kudu::internal::UnwrapTraits::Unwrap(arg.g), + kudu::internal::UnwrapTraits::Unwrap(arg.h)); +} + +// Static Dispatchers with no out params. + +template +inline void DispatchToFunction(Function function, const Tuple0& arg) { + (*function)(); +} + +template +inline void DispatchToFunction(Function function, const A& arg) { + (*function)(arg); +} + +template +inline void DispatchToFunction(Function function, const Tuple1& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a)); +} + +template +inline void DispatchToFunction(Function function, const Tuple2& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b)); +} + +template +inline void DispatchToFunction(Function function, const Tuple3& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c)); +} + +template +inline void DispatchToFunction(Function function, + const Tuple4& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d)); +} + +template +inline void 
DispatchToFunction(Function function, + const Tuple5& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e)); +} + +template +inline void DispatchToFunction(Function function, + const Tuple6& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f)); +} + +template +inline void DispatchToFunction(Function function, + const Tuple7& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f), + kudu::internal::UnwrapTraits::Unwrap(arg.g)); +} + +template +inline void DispatchToFunction(Function function, + const Tuple8& arg) { + (*function)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f), + kudu::internal::UnwrapTraits::Unwrap(arg.g), + kudu::internal::UnwrapTraits::Unwrap(arg.h)); +} + +// Dispatchers with 0 out param (as a Tuple0). 
+ +template +inline void DispatchToMethod(ObjT* obj, + Method method, + const Tuple0& arg, Tuple0*) { + (obj->*method)(); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, const A& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg)); +} + +template +inline void DispatchToMethod(ObjT* obj, + Method method, + const Tuple1& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a)); +} + +template +inline void DispatchToMethod(ObjT* obj, + Method method, + const Tuple2& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e)); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& arg, Tuple0*) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(arg.a), + kudu::internal::UnwrapTraits::Unwrap(arg.b), + kudu::internal::UnwrapTraits::Unwrap(arg.c), + kudu::internal::UnwrapTraits::Unwrap(arg.d), + kudu::internal::UnwrapTraits::Unwrap(arg.e), + kudu::internal::UnwrapTraits::Unwrap(arg.f)); +} + +// 
Dispatchers with 1 out param. + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple0& in, + Tuple1* out) { + (obj->*method)(&out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const InA& in, + Tuple1* out) { + (obj->*method)(in, &out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple1& in, + Tuple1* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), &out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple2& in, + Tuple1* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + &out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& in, + Tuple1* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + &out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& in, + Tuple1* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + &out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& in, + Tuple1* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + &out->a); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& in, + Tuple1* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + 
kudu::internal::UnwrapTraits::Unwrap(in.e), + kudu::internal::UnwrapTraits::Unwrap(in.f), + &out->a); +} + +// Dispatchers with 2 out params. + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple0& in, + Tuple2* out) { + (obj->*method)(&out->a, &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const InA& in, + Tuple2* out) { + (obj->*method)(in, &out->a, &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple1& in, + Tuple2* out) { + (obj->*method)( + kudu::internal::UnwrapTraits::Unwrap(in.a), &out->a, &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple2& in, + Tuple2* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + &out->a, + &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& in, + Tuple2* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + &out->a, + &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& in, + Tuple2* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + &out->a, + &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& in, + Tuple2* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + &out->a, + &out->b); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& in, + Tuple2* out) { + 
(obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + kudu::internal::UnwrapTraits::Unwrap(in.f), + &out->a, + &out->b); +} + +// Dispatchers with 3 out params. + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple0& in, + Tuple3* out) { + (obj->*method)(&out->a, &out->b, &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const InA& in, + Tuple3* out) { + (obj->*method)(in, &out->a, &out->b, &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple1& in, + Tuple3* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + &out->a, + &out->b, + &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple2& in, + Tuple3* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + &out->a, + &out->b, + &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& in, + Tuple3* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + &out->a, + &out->b, + &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& in, + Tuple3* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + &out->a, + &out->b, + &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& in, + Tuple3* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + 
kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + &out->a, + &out->b, + &out->c); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& in, + Tuple3* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + kudu::internal::UnwrapTraits::Unwrap(in.f), + &out->a, + &out->b, + &out->c); +} + +// Dispatchers with 4 out params. + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple0& in, + Tuple4* out) { + (obj->*method)(&out->a, &out->b, &out->c, &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const InA& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in), + &out->a, + &out->b, + &out->c, + &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple1& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + &out->a, + &out->b, + &out->c, + &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple2& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + &out->a, + &out->b, + &out->c, + &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + &out->a, + &out->b, + &out->c, + &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + 
kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + &out->a, + &out->b, + &out->c, + &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + &out->a, + &out->b, + &out->c, + &out->d); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& in, + Tuple4* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + kudu::internal::UnwrapTraits::Unwrap(in.f), + &out->a, + &out->b, + &out->c, + &out->d); +} + +// Dispatchers with 5 out params. 
+ +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple0& in, + Tuple5* out) { + (obj->*method)(&out->a, &out->b, &out->c, &out->d, &out->e); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const InA& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple1& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple2& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple3& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple4& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +template +inline void DispatchToMethod(ObjT* obj, Method method, + const Tuple5& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +template +inline 
void DispatchToMethod(ObjT* obj, Method method, + const Tuple6& in, + Tuple5* out) { + (obj->*method)(kudu::internal::UnwrapTraits::Unwrap(in.a), + kudu::internal::UnwrapTraits::Unwrap(in.b), + kudu::internal::UnwrapTraits::Unwrap(in.c), + kudu::internal::UnwrapTraits::Unwrap(in.d), + kudu::internal::UnwrapTraits::Unwrap(in.e), + kudu::internal::UnwrapTraits::Unwrap(in.f), + &out->a, + &out->b, + &out->c, + &out->d, + &out->e); +} + +#endif // BASE_TUPLE_H__ diff --git a/src/kudu/gutil/type_traits.h b/src/kudu/gutil/type_traits.h new file mode 100644 index 000000000000..a4e874f7dbea --- /dev/null +++ b/src/kudu/gutil/type_traits.h @@ -0,0 +1,363 @@ +// Copyright (c) 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// ---- +// +// This code is compiled directly on many platforms, including client +// platforms like Windows, Mac, and embedded systems. Before making +// any changes here, make sure that you're not breaking any platforms. +// +// Define a small subset of tr1 type traits. The traits we define are: +// enable_if +// is_integral +// is_floating_point +// is_pointer +// is_array +// is_enum +// is_reference +// is_pod +// has_trivial_constructor +// has_trivial_copy +// has_trivial_assign +// has_trivial_destructor +// remove_const +// remove_volatile +// remove_cv +// remove_reference +// add_reference +// remove_pointer +// is_same +// is_convertible +// We can add more type traits as required. + +#ifndef BASE_TYPE_TRAITS_H_ +#define BASE_TYPE_TRAITS_H_ + +#include +using std::make_pair; +using std::pair; // For pair + +#include "kudu/gutil/template_util.h" // For true_type and false_type + +namespace base { + +template struct enable_if; +template struct is_integral; +template struct is_floating_point; +template struct is_pointer; +template struct is_array; +// MSVC can't compile this correctly, and neither can gcc 3.3.5 (at least) +#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) +// is_enum uses is_convertible, which is not available on MSVC. 
+template struct is_enum; +#endif +template struct is_reference; +template struct is_pod; +template struct has_trivial_constructor; +template struct has_trivial_copy; +template struct has_trivial_assign; +template struct has_trivial_destructor; +template struct remove_const; +template struct remove_volatile; +template struct remove_cv; +template struct remove_reference; +template struct add_reference; +template struct remove_pointer; +template struct is_same; +#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) +template struct is_convertible; +#endif + +// enable_if, equivalent semantics to c++11 std::enable_if, specifically: +// "If B is true, the member typedef type shall equal T; otherwise, there +// shall be no member typedef type." +// Specified by 20.9.7.6 [Other transformations] +template struct enable_if { typedef T type; }; +template struct enable_if {}; + +// is_integral is false except for the built-in integer types. A +// cv-qualified type is integral if and only if the underlying type is. +template struct is_integral : false_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +#if defined(_MSC_VER) +// wchar_t is not by default a distinct type from unsigned short in +// Microsoft C. 
+// See http://msdn2.microsoft.com/en-us/library/dh8che7s(VS.80).aspx +template<> struct is_integral<__wchar_t> : true_type { }; +#else +template<> struct is_integral : true_type { }; +#endif +#if defined(__APPLE__) +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +#endif +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +#ifdef HAVE_LONG_LONG +template<> struct is_integral : true_type { }; +template<> struct is_integral : true_type { }; +#endif +template struct is_integral : is_integral { }; +template struct is_integral : is_integral { }; +template struct is_integral : is_integral { }; + +// is_floating_point is false except for the built-in floating-point types. +// A cv-qualified type is integral if and only if the underlying type is. +template struct is_floating_point : false_type { }; +template<> struct is_floating_point : true_type { }; +template<> struct is_floating_point : true_type { }; +template<> struct is_floating_point : true_type { }; +template struct is_floating_point + : is_floating_point { }; +template struct is_floating_point + : is_floating_point { }; +template struct is_floating_point + : is_floating_point { }; + +// is_pointer is false except for pointer types. A cv-qualified type (e.g. +// "int* const", as opposed to "int const*") is cv-qualified if and only if +// the underlying type is. 
+template struct is_pointer : false_type { }; +template struct is_pointer : true_type { }; +template struct is_pointer : is_pointer { }; +template struct is_pointer : is_pointer { }; +template struct is_pointer : is_pointer { }; + + +template struct is_array : public false_type {}; +template struct is_array : public true_type {}; +template struct is_array : public true_type {}; + +#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) + +namespace internal { + +template struct is_class_or_union { + template static small_ tester(void (U::*)()); + template static big_ tester(...); + static const bool value = sizeof(tester(0)) == sizeof(small_); +}; + +// is_convertible chokes if the first argument is an array. That's why +// we use add_reference here. +template struct is_enum_impl + : is_convertible::type, int> { }; + +template struct is_enum_impl : false_type { }; + +} // namespace internal + +// Specified by TR1 [4.5.1] primary type categories. + +// Implementation note: +// +// Each type is either void, integral, floating point, array, pointer, +// reference, member object pointer, member function pointer, enum, +// union or class. Out of these, only integral, floating point, reference, +// class and enum types are potentially convertible to int. Therefore, +// if a type is not a reference, integral, floating point or class and +// is convertible to int, it's a enum. Adding cv-qualification to a type +// does not change whether it's an enum. +// +// Is-convertible-to-int check is done only if all other checks pass, +// because it can't be used with some types (e.g. void or classes with +// inaccessible conversion operators). 
+template struct is_enum + : internal::is_enum_impl< + is_same::value || + is_integral::value || + is_floating_point::value || + is_reference::value || + internal::is_class_or_union::value, + T> { }; + +template struct is_enum : is_enum { }; +template struct is_enum : is_enum { }; +template struct is_enum : is_enum { }; + +#endif + +// is_reference is false except for reference types. +template struct is_reference : false_type {}; +template struct is_reference : true_type {}; + + +// We can't get is_pod right without compiler help, so fail conservatively. +// We will assume it's false except for arithmetic types, enumerations, +// pointers and cv-qualified versions thereof. Note that std::pair +// is not a POD even if T and U are PODs. +template struct is_pod + : integral_constant::value || + is_floating_point::value || +#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) + // is_enum is not available on MSVC. + is_enum::value || +#endif + is_pointer::value)> { }; +template struct is_pod : is_pod { }; +template struct is_pod : is_pod { }; +template struct is_pod : is_pod { }; + + +// We can't get has_trivial_constructor right without compiler help, so +// fail conservatively. We will assume it's false except for: (1) types +// for which is_pod is true. (2) std::pair of types with trivial +// constructors. (3) array of a type with a trivial constructor. +// (4) const versions thereof. +template struct has_trivial_constructor : is_pod { }; +template struct has_trivial_constructor > + : integral_constant::value && + has_trivial_constructor::value)> { }; +template struct has_trivial_constructor + : has_trivial_constructor { }; +template struct has_trivial_constructor + : has_trivial_constructor { }; + +// We can't get has_trivial_copy right without compiler help, so fail +// conservatively. We will assume it's false except for: (1) types +// for which is_pod is true. (2) std::pair of types with trivial copy +// constructors. 
(3) array of a type with a trivial copy constructor. +// (4) const versions thereof. +template struct has_trivial_copy : is_pod { }; +template struct has_trivial_copy > + : integral_constant::value && + has_trivial_copy::value)> { }; +template struct has_trivial_copy + : has_trivial_copy { }; +template struct has_trivial_copy : has_trivial_copy { }; + +// We can't get has_trivial_assign right without compiler help, so fail +// conservatively. We will assume it's false except for: (1) types +// for which is_pod is true. (2) std::pair of types with trivial copy +// constructors. (3) array of a type with a trivial assign constructor. +template struct has_trivial_assign : is_pod { }; +template struct has_trivial_assign > + : integral_constant::value && + has_trivial_assign::value)> { }; +template struct has_trivial_assign + : has_trivial_assign { }; + +// We can't get has_trivial_destructor right without compiler help, so +// fail conservatively. We will assume it's false except for: (1) types +// for which is_pod is true. (2) std::pair of types with trivial +// destructors. (3) array of a type with a trivial destructor. +// (4) const versions thereof. +template struct has_trivial_destructor : is_pod { }; +template struct has_trivial_destructor > + : integral_constant::value && + has_trivial_destructor::value)> { }; +template struct has_trivial_destructor + : has_trivial_destructor { }; +template struct has_trivial_destructor + : has_trivial_destructor { }; + +// Specified by TR1 [4.7.1] +template struct remove_const { typedef T type; }; +template struct remove_const { typedef T type; }; +template struct remove_volatile { typedef T type; }; +template struct remove_volatile { typedef T type; }; +template struct remove_cv { + typedef typename remove_const::type>::type type; +}; + + +// Specified by TR1 [4.7.2] Reference modifications. 
+template struct remove_reference { typedef T type; }; +template struct remove_reference { typedef T type; }; + +template struct add_reference { typedef T& type; }; +template struct add_reference { typedef T& type; }; + +// Specified by TR1 [4.7.4] Pointer modifications. +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { + typedef T type; }; + +// Specified by TR1 [4.6] Relationships between types +template struct is_same : public false_type { }; +template struct is_same : public true_type { }; + +// Specified by TR1 [4.6] Relationships between types +#if !defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__ <= 3) +namespace internal { + +// This class is an implementation detail for is_convertible, and you +// don't need to know how it works to use is_convertible. For those +// who care: we declare two different functions, one whose argument is +// of type To and one with a variadic argument list. We give them +// return types of different size, so we can use sizeof to trick the +// compiler into telling us which function it would have chosen if we +// had called it with an argument of type From. See Alexandrescu's +// _Modern C++ Design_ for more details on this sort of trick. + +template +struct ConvertHelper { + static small_ Test(To); + static big_ Test(...); + static From Create(); +}; +} // namespace internal + +// Inherits from true_type if From is convertible to To, false_type otherwise. +template +struct is_convertible + : integral_constant::Test( + internal::ConvertHelper::Create())) + == sizeof(small_)> { +}; +#endif + +} + +// Right now these macros are no-ops, and mostly just document the fact +// these types are PODs, for human use. They may be made more contentful +// later. The typedef is just to make it legal to put a semicolon after +// these macros. 
+#define DECLARE_POD(TypeName) typedef int Dummy_Type_For_DECLARE_POD ATTRIBUTE_UNUSED +#define DECLARE_NESTED_POD(TypeName) DECLARE_POD(TypeName) +#define PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT(TemplateName) \ + typedef int Dummy_Type_For_PROPAGATE_POD_FROM_TEMPLATE_ARGUMENT ATTRIBUTE_UNUSED +#define ENFORCE_POD(TypeName) typedef int Dummy_Type_For_ENFORCE_POD ATTRIBUTE_UNUSED + +#endif // BASE_TYPE_TRAITS_H_ diff --git a/src/kudu/gutil/utf/LICENSE b/src/kudu/gutil/utf/LICENSE new file mode 100644 index 000000000000..08d705914ee0 --- /dev/null +++ b/src/kudu/gutil/utf/LICENSE @@ -0,0 +1,13 @@ +UTF-8 Library + +The authors of this software are Rob Pike and Ken Thompson. + Copyright (c) 1998-2002 by Lucent Technologies. +Permission to use, copy, modify, and distribute this software for any +purpose without fee is hereby granted, provided that this entire notice +is included in all copies of any software which is or includes a copy +or modification of this software and in all copies of the supporting +documentation for such software. +THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED +WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY +REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY +OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. diff --git a/src/kudu/gutil/utf/rune.c b/src/kudu/gutil/utf/rune.c new file mode 100644 index 000000000000..061535a8ffd8 --- /dev/null +++ b/src/kudu/gutil/utf/rune.c @@ -0,0 +1,350 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. 
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#include +#include +#include "kudu/gutil/utf/utf.h" +#include "kudu/gutil/utf/utfdef.h" + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1< T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + // If we can't read more than one character we must stop + if(length <= 1) { + goto badlen; + } + + /* + * two character sequence (11-bit value) + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + // If we can't read more than two characters we must stop + if(length <= 2) { + goto badlen; + } + + /* + * three character sequence (16-bit value) + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + if (length <= 3) + goto badlen; + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 
& Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just fall through to bad. + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +badlen: + *rune = Bad; + return 0; + +} + + +/* + * This is the older "unsafe" version, which works fine on + * null-terminated strings. + */ +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + long l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { + *consumed = charntorune(rune, str, length); + return *rune != Runeerror || *consumed == 3; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. 
*/ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +runenlen(const Rune *r, int nrune) +{ + int nb, c; + + nb = 0; + while(nrune--) { + c = *r++; + if (c <= Rune1) + nb++; + else if (c <= Rune2) + nb += 2; + else if (c <= Rune3) + nb += 3; + else /* assert(c <= Rune4) */ + nb += 4; + } + return nb; +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(uchar*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} diff --git a/src/kudu/gutil/utf/utf.h b/src/kudu/gutil/utf/utf.h new file mode 100644 index 000000000000..02ba472aef6d --- /dev/null +++ b/src/kudu/gutil/utf/utf.h @@ -0,0 +1,233 @@ +#ifndef _UTFH_ +#define _UTFH_ 1 + +#include + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 
0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * rune routines + */ + +/* + * These routines were written by Rob Pike and Ken Thompson + * and first appeared in Plan 9. + * SEE ALSO + * utf (7) + * tcs (1) +*/ + +// runetochar copies (encodes) one rune, pointed to by r, to at most +// UTFmax bytes starting at s and returns the number of bytes generated. + +int runetochar(char* s, const Rune* r); + + +// chartorune copies (decodes) at most UTFmax bytes starting at s to +// one rune, pointed to by r, and returns the number of bytes consumed. +// If the input is not exactly in UTF format, chartorune will set *r +// to Runeerror and return 1. +// +// Note: There is no special case for a "null-terminated" string. A +// string whose first byte has the value 0 is the UTF8 encoding of the +// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal +// anywhere else in a UTF sequence. + +int chartorune(Rune* r, const char* s); + + +// charntorune is like chartorune, except that it will access at most +// n bytes of s. If the UTF sequence is incomplete within n bytes, +// charntorune will set *r to Runeerror and return 0. If it is complete +// but not in UTF format, it will set *r to Runeerror and return 1. +// +// Added 2004-09-24 by Wei-Hwa Huang + +int charntorune(Rune* r, const char* s, int n); + +// isvalidcharntorune(str, n, r, consumed) +// is a convenience function that calls "*consumed = charntorune(r, str, n)" +// and returns an int (logically boolean) indicating whether the first +// n bytes of str was a valid and complete UTF sequence. + +int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); + +// runelen returns the number of bytes required to convert r into UTF. 
+ +int runelen(Rune r); + + +// runenlen returns the number of bytes required to convert the n +// runes pointed to by r into UTF. + +int runenlen(const Rune* r, int n); + + +// fullrune returns 1 if the string s of length n is long enough to be +// decoded by chartorune, and 0 otherwise. This does not guarantee +// that the string contains a legal UTF encoding. This routine is used +// by programs that obtain input one byte at a time and need to know +// when a full rune has arrived. + +int fullrune(const char* s, int n); + +// The following routines are analogous to the corresponding string +// routines with "utf" substituted for "str", and "rune" substituted +// for "chr". + +// utflen returns the number of runes that are represented by the UTF +// string s. (cf. strlen) + +int utflen(const char* s); + + +// utfnlen returns the number of complete runes that are represented +// by the first n bytes of the UTF string s. If the last few bytes of +// the string contain an incompletely coded rune, utfnlen will not +// count them; in this way, it differs from utflen, which includes +// every byte of the string. (cf. strnlen) + +int utfnlen(const char* s, long n); + + +// utfrune returns a pointer to the first occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strchr) + +const char* utfrune(const char* s, Rune r); + + +// utfrrune returns a pointer to the last occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strrchr) + +const char* utfrrune(const char* s, Rune r); + + +// utfutf returns a pointer to the first occurrence of the UTF string +// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the +// null string, utfutf returns s1. (cf. 
strstr) + +const char* utfutf(const char* s1, const char* s2); + + +// utfecpy copies UTF sequences until a null sequence has been copied, +// but writes no sequences beyond es1. If any sequences are copied, +// s1 is terminated by a null sequence, and a pointer to that sequence +// is returned. Otherwise, the original s1 is returned. (cf. strecpy) + +char* utfecpy(char *s1, char *es1, const char *s2); + + + +// These functions are rune-string analogues of the corresponding +// functions in strcat (3). +// +// These routines first appeared in Plan 9. +// SEE ALSO +// memmove (3) +// rune (3) +// strcat (2) +// +// BUGS: The outcome of overlapping moves varies among implementations. + +Rune* runestrcat(Rune* s1, const Rune* s2); +Rune* runestrncat(Rune* s1, const Rune* s2, long n); + +const Rune* runestrchr(const Rune* s, Rune c); + +int runestrcmp(const Rune* s1, const Rune* s2); +int runestrncmp(const Rune* s1, const Rune* s2, long n); + +Rune* runestrcpy(Rune* s1, const Rune* s2); +Rune* runestrncpy(Rune* s1, const Rune* s2, long n); +Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); + +Rune* runestrdup(const Rune* s); + +const Rune* runestrrchr(const Rune* s, Rune c); +long runestrlen(const Rune* s); +const Rune* runestrstr(const Rune* s1, const Rune* s2); + + + +// The following routines test types and modify cases for Unicode +// characters. Unicode defines some characters as letters and +// specifies three cases: upper, lower, and title. Mappings among the +// cases are also defined, although they are not exhaustive: some +// upper case letters have no lower case mapping, and so on. Unicode +// also defines several character properties, a subset of which are +// checked by these routines. These routines are based on Unicode +// version 3.0.0. +// +// NOTE: The routines are implemented in C, so the boolean functions +// (e.g., isupperrune) return 0 for false and 1 for true. 
+// +// +// toupperrune, tolowerrune, and totitlerune are the Unicode case +// mappings. These routines return the character unchanged if it has +// no defined mapping. + +Rune toupperrune(Rune r); +Rune tolowerrune(Rune r); +Rune totitlerune(Rune r); + + +// isupperrune tests for upper case characters, including Unicode +// upper case letters and targets of the toupper mapping. islowerrune +// and istitlerune are defined analogously. + +int isupperrune(Rune r); +int islowerrune(Rune r); +int istitlerune(Rune r); + + +// isalpharune tests for Unicode letters; this includes ideographs in +// addition to alphabetic characters. + +int isalpharune(Rune r); + + +// isdigitrune tests for digits. Non-digit numbers, such as Roman +// numerals, are not included. + +int isdigitrune(Rune r); + + +// isideographicrune tests for ideographic characters and numbers, as +// defined by the Unicode standard. + +int isideographicrune(Rune r); + + +// isspacerune tests for whitespace characters, including "C" locale +// whitespace, Unicode defined whitespace, and the "zero-width +// non-break space" character. + +int isspacerune(Rune r); + + +// (The comments in this file were copied from the manpage files rune.3, +// isalpharune.3, and runestrcat.3. Some formatting changes were also made +// to conform to Google style. 
/JRM 11/11/05) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/kudu/gutil/utf/utfdef.h b/src/kudu/gutil/utf/utfdef.h new file mode 100644 index 000000000000..4b58ae87e42b --- /dev/null +++ b/src/kudu/gutil/utf/utfdef.h @@ -0,0 +1,14 @@ +#define uchar _utfuchar +#define ushort _utfushort +#define uint _utfuint +#define ulong _utfulong +#define vlong _utfvlong +#define uvlong _utfuvlong + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; + +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define nil ((void*)0) diff --git a/src/kudu/gutil/valgrind.h b/src/kudu/gutil/valgrind.h new file mode 100644 index 000000000000..577c59ab0cd0 --- /dev/null +++ b/src/kudu/gutil/valgrind.h @@ -0,0 +1,3924 @@ +/* -*- c -*- + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (valgrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 2, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2000-2008 Julian Seward. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. 
Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (valgrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + + +/* This file is for inclusion into client (your!) code. + + You can use these macros to manipulate and query Valgrind's + execution inside your own programs. + + The resulting executables will still run without Valgrind, just a + little bit more slowly than they otherwise would, but otherwise + unchanged. When not running on valgrind, each client request + consumes very few (eg. 7) instructions, so the resulting performance + loss is negligible unless you plan to execute client requests + millions of times per second. 
Nevertheless, if that is still a + problem, you can compile with the NVALGRIND symbol defined (gcc + -DNVALGRIND) so that client requests are not even compiled in. */ + +#ifndef __VALGRIND_H +#define __VALGRIND_H + +#include + +/* Nb: this file might be included in a file compiled with -ansi. So + we can't use C++ style "//" comments nor the "asm" keyword (instead + use "__asm__"). */ + +/* Derive some tags indicating what the target platform is. Note + that in this file we're using the compiler's CPP symbols for + identifying architectures, which are different to the ones we use + within the rest of Valgrind. Note, __powerpc__ is active for both + 32 and 64-bit PPC, whereas __powerpc64__ is only active for the + latter (on Linux, that is). */ +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64_linux +#undef PLAT_ppc32_aix5 +#undef PLAT_ppc64_aix5 + +#if !defined(_AIX) && defined(__i386__) +# define PLAT_x86_linux 1 +#elif !defined(_AIX) && defined(__x86_64__) +# define PLAT_amd64_linux 1 +#elif !defined(_AIX) && defined(__powerpc__) && !defined(__powerpc64__) +# define PLAT_ppc32_linux 1 +#elif !defined(_AIX) && defined(__powerpc__) && defined(__powerpc64__) +# define PLAT_ppc64_linux 1 +#elif defined(_AIX) && defined(__64BIT__) +# define PLAT_ppc64_aix5 1 +#elif defined(_AIX) && !defined(__64BIT__) +# define PLAT_ppc32_aix5 1 +#endif + + +/* If we're not compiling for our target platform, don't generate + any inline asms. */ +#if !defined(PLAT_x86_linux) && !defined(PLAT_amd64_linux) \ + && !defined(PLAT_ppc32_linux) && !defined(PLAT_ppc64_linux) \ + && !defined(PLAT_ppc32_aix5) && !defined(PLAT_ppc64_aix5) +# if !defined(NVALGRIND) +# define NVALGRIND 1 +# endif +#endif + + +/* ------------------------------------------------------------------ */ +/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */ +/* in here of use to end-users -- skip to the next section. 
*/ +/* ------------------------------------------------------------------ */ + +#if defined(NVALGRIND) + +/* Define NVALGRIND to completely remove the Valgrind magic sequence + from the compiled code (analogous to NDEBUG's effects on + assert()) */ +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { \ + (_zzq_rlval) = (_zzq_default); \ + } + +#else /* ! NVALGRIND */ + +/* The following defines the magic code sequences which the JITter + spots and handles magically. Don't look too closely at them as + they will rot your brain. + + The assembly code sequences for all architectures is in this one + file. This is because this file must be stand-alone, and we don't + want to have multiple files. + + For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default + value gets put in the return slot, so that everything works when + this is executed not under Valgrind. Args are passed in a memory + block, and so there's no intrinsic limit to the number that could + be passed, but it's currently five. + + The macro args are: + _zzq_rlval result lvalue + _zzq_default default value (result returned when running on real CPU) + _zzq_request request code + _zzq_arg1..5 request params + + The other two macros are used to support function wrapping, and are + a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the + guest's NRADDR pseudo-register and whatever other information is + needed to safely run the call original from the wrapper: on + ppc64-linux, the R2 value at the divert point is also needed. This + information is abstracted into a user-visible type, OrigFn. + + VALGRIND_CALL_NOREDIR_* behaves the same as the following on the + guest, but guarantees that the branch instruction will not be + redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64: + branch-and-link-to-r11. 
VALGRIND_CALL_NOREDIR is just text, not a + complete inline asm, since it needs to be combined with more magic + inline asm stuff to be useful. +*/ + +/* ------------------------- x86-linux ------------------------- */ + +#if defined(PLAT_x86_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "roll $3, %%edi ; roll $13, %%edi\n\t" \ + "roll $29, %%edi ; roll $19, %%edi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EDX = client_request ( %EAX ) */ \ + "xchgl %%ebx,%%ebx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EAX = guest_NRADDR */ \ + "xchgl %%ecx,%%ecx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_EAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%EAX */ \ + "xchgl %%edx,%%edx\n\t" +#endif /* PLAT_x86_linux */ + +/* ------------------------ amd64-linux ------------------------ */ + +#if defined(PLAT_amd64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \ + "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned long long int _zzq_args[6]; \ + volatile unsigned long long int _zzq_result; \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RDX = client_request ( %RAX ) */ \ + "xchgq %%rbx,%%rbx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RAX = guest_NRADDR */ \ + "xchgq %%rcx,%%rcx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_RAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%RAX */ \ + "xchgq %%rdx,%%rdx\n\t" +#endif /* PLAT_amd64_linux */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[6]; \ + unsigned int _zzq_result; \ + unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? */ + unsigned long long int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned long long int _zzq_args[6]; \ + register unsigned long long int _zzq_result __asm__("r3"); \ + register unsigned long long int* _zzq_ptr __asm__("r4"); \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1" \ + : "=r" (_zzq_result) \ + : "0" (_zzq_default), "r" (_zzq_ptr) \ + : "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned long long int __addr __asm__("r3"); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + unsigned int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[7]; \ + register unsigned int _zzq_result; \ + register unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_args[6] = (unsigned int)(_zzq_default); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 4,%1\n\t" \ + "lwz 3, 24(4)\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" \ + : "=b" (_zzq_result) \ + : "b" (_zzq_ptr) \ + : "r3", "r4", "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? 
*/ + unsigned long long int r2; /* what tocptr do we need? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned long long int _zzq_args[7]; \ + register unsigned long long int _zzq_result; \ + register unsigned long long int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int long long)(_zzq_request); \ + _zzq_args[1] = (unsigned int long long)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int long long)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int long long)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int long long)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int long long)(_zzq_arg5); \ + _zzq_args[6] = (unsigned int long long)(_zzq_default); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 4,%1\n\t" \ + "ld 3, 48(4)\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" \ + : "=b" (_zzq_result) \ + : "b" (_zzq_ptr) \ + : "r3", "r4", "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned long long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc64_aix5 */ + +/* Insert assembly code for other platforms here... 
*/ + +#endif /* NVALGRIND */ + + +/* ------------------------------------------------------------------ */ +/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */ +/* ugly. It's the least-worst tradeoff I can think of. */ +/* ------------------------------------------------------------------ */ + +/* This section defines magic (a.k.a appalling-hack) macros for doing + guaranteed-no-redirection macros, so as to get from function + wrappers to the functions they are wrapping. The whole point is to + construct standard call sequences, but to do the call itself with a + special no-redirect call pseudo-instruction that the JIT + understands and handles specially. This section is long and + repetitious, and I can't see a way to make it shorter. + + The naming scheme is as follows: + + CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc} + + 'W' stands for "word" and 'v' for "void". Hence there are + different macros for calling arity 0, 1, 2, 3, 4, etc, functions, + and for each, the possibility of returning a word-typed result, or + no result. +*/ + +/* Use these to write the name of your wrapper. NOTE: duplicates + VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */ + +#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \ + _vgwZU_##soname##_##fnname + +#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \ + _vgwZZ_##soname##_##fnname + +/* Use this macro from within a wrapper function to collect the + context (address and possibly other info) of the original function. + Once you have that you can then use it in one of the CALL_FN_ + macros. The type of the argument _lval is OrigFn. */ +#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval) + +/* Derivatives of the main macros below, for calling functions + returning void. 
*/ + +#define CALL_FN_v_v(fnptr) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_v(_junk,fnptr); } while (0) + +#define CALL_FN_v_W(fnptr, arg1) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_W(_junk,fnptr,arg1); } while (0) + +#define CALL_FN_v_WW(fnptr, arg1,arg2) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0) + +#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0) + +/* ------------------------- x86-linux ------------------------- */ + +#if defined(PLAT_x86_linux) + +/* These regs are trashed by the hidden call. No need to mention eax + as gcc can already see that, plus causes gcc to bomb. */ +#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx" + +/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $4, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = 
(orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $8, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $12, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $16, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = 
(__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $20, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $24, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ 
+ _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $28, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $32, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + 
volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $36, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + 
VALGRIND_CALL_NOREDIR_EAX \ + "addl $40, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "pushl 44(%%eax)\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $44, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = 
(unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "pushl 48(%%eax)\n\t" \ + "pushl 44(%%eax)\n\t" \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $48, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_x86_linux */ + +/* ------------------------ amd64-linux ------------------------ */ + +#if defined(PLAT_amd64_linux) + +/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \ + "rdi", "r8", "r9", "r10", "r11" + +/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned + long) == 8. */ + +/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_ + macros. In order not to trash the stack redzone, we need to drop + %rsp by 128 before the hidden call, and restore afterwards. The + nastyness is that it is only by luck that the stack still appears + to be unwindable during the hidden call - since then the behaviour + of any routine using this macro does not match what the CFI data + says. Sigh. + + Why is this important? Imagine that a wrapper has a stack + allocated local, and passes to the hidden call, a pointer to it. + Because gcc does not know about the hidden call, it may allocate + that local in the redzone. 
Unfortunately the hidden call may then + trash it before it comes to use it. So we must step clear of the + redzone, for the duration of the hidden call, to make it safe. + + Probably the same problem afflicts the other redzone-style ABIs too + (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is + self describing (none of this CFI nonsense) so at least messing + with the stack pointer doesn't give a danger of non-unwindable + stack. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), 
%%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile 
unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" /* bugfix: restore %rsp only AFTER the hidden call, as in every sibling macro; restoring it first re-exposes the red zone the subq stepped clear of */ \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned
long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $8, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $16, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, 
arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $24, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" 
\ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $32, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $40, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig 
= (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 96(%%rax)\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $48, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_amd64_linux */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +/* This is useful for finding out about the on-stack stuff: + + extern int f9 ( int,int,int,int,int,int,int,int,int ); + extern int f10 ( int,int,int,int,int,int,int,int,int,int ); + extern int f11 ( int,int,int,int,int,int,int,int,int,int,int ); + extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int ); + + int g9 ( void ) { + return f9(11,22,33,44,55,66,77,88,99); + } + int g10 ( void ) { + return f10(11,22,33,44,55,66,77,88,99,110); + } + int g11 ( void ) { + return f11(11,22,33,44,55,66,77,88,99,110,121); + } + int g12 ( void ) { + 
return f12(11,22,33,44,55,66,77,88,99,110,121,132); + } +*/ + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc32-linux, + sizeof(unsigned long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + 
VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned 
long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = 
(unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned 
long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* 
target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; 
\ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + _argvec[12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,20(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned + long) == 8. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ 
volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = 
(unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* 
_argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 
*/ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long 
_argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned 
long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = 
(unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ 
+ "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. */ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "lwz 3," #_n_fr "(1)\n\t" \ + "stw 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned + long) == 4. 
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned 
long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile 
unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 
11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + 
_argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 
6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 
2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) 
+ +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,68(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = 
(__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. */ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "ld 3," #_n_fr "(1)\n\t" \ + "std 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned + long) == 8. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned 
long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = 
(unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile 
unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" 
/* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); 
\ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned 
long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's 
tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(144) \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 
3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(144) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(144) \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" 
\ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(144) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64_aix5 */ + + +/* ------------------------------------------------------------------ */ +/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */ +/* */ +/* ------------------------------------------------------------------ */ + +/* Some request codes. There are many more of these, but most are not + exposed to end-user view. These are the public ones, all of the + form 0x1000 + small_number. + + Core ones are in the range 0x00000000--0x0000ffff. The non-public + ones start at 0x2000. +*/ + +/* These macros are used by tools -- they must be public, but don't + embed them into other programs. */ +#define VG_USERREQ_TOOL_BASE(a,b) \ + ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16)) +#define VG_IS_TOOL_USERREQ(a, b, v) \ + (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000)) + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. 
*/ +typedef + enum { VG_USERREQ__RUNNING_ON_VALGRIND = 0x1001, + VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002, + + /* These allow any function to be called from the simulated + CPU but run on the real CPU. Nb: the first arg passed to + the function is always the ThreadId of the running + thread! So CLIENT_CALL0 actually requires a 1 arg + function, etc. */ + VG_USERREQ__CLIENT_CALL0 = 0x1101, + VG_USERREQ__CLIENT_CALL1 = 0x1102, + VG_USERREQ__CLIENT_CALL2 = 0x1103, + VG_USERREQ__CLIENT_CALL3 = 0x1104, + + /* Can be useful in regression testing suites -- eg. can + send Valgrind's output to /dev/null and still count + errors. */ + VG_USERREQ__COUNT_ERRORS = 0x1201, + + /* These are useful and can be interpreted by any tool that + tracks malloc() et al, by using vg_replace_malloc.c. */ + VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301, + VG_USERREQ__FREELIKE_BLOCK = 0x1302, + /* Memory pool support. */ + VG_USERREQ__CREATE_MEMPOOL = 0x1303, + VG_USERREQ__DESTROY_MEMPOOL = 0x1304, + VG_USERREQ__MEMPOOL_ALLOC = 0x1305, + VG_USERREQ__MEMPOOL_FREE = 0x1306, + VG_USERREQ__MEMPOOL_TRIM = 0x1307, + VG_USERREQ__MOVE_MEMPOOL = 0x1308, + VG_USERREQ__MEMPOOL_CHANGE = 0x1309, + VG_USERREQ__MEMPOOL_EXISTS = 0x130a, + + /* Allow printfs to valgrind log. */ + VG_USERREQ__PRINTF = 0x1401, + VG_USERREQ__PRINTF_BACKTRACE = 0x1402, + + /* Stack support. */ + VG_USERREQ__STACK_REGISTER = 0x1501, + VG_USERREQ__STACK_DEREGISTER = 0x1502, + VG_USERREQ__STACK_CHANGE = 0x1503 + } Vg_ClientRequest; + +#if !defined(__GNUC__) +# define __extension__ /* */ +#endif + +/* Returns the number of Valgrinds this code is running under. That + is, 0 if running natively, 1 if running under Valgrind, 2 if + running under Valgrind which is running under another Valgrind, + etc. 
*/ +#define RUNNING_ON_VALGRIND __extension__ \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */, \ + VG_USERREQ__RUNNING_ON_VALGRIND, \ + 0, 0, 0, 0, 0); \ + _qzz_res; \ + }) + + +/* Discard translation of code in the range [_qzz_addr .. _qzz_addr + + _qzz_len - 1]. Useful if you are debugging a JITter or some such, + since it provides a way to make sure valgrind will retranslate the + invalidated area. Returns no value. */ +#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DISCARD_TRANSLATIONS, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + } + + +/* These requests are for getting Valgrind itself to print something. + Possibly with a backtrace. This is a really ugly hack. */ + +#if defined(NVALGRIND) + +# define VALGRIND_PRINTF(...) +# define VALGRIND_PRINTF_BACKTRACE(...) + +#else /* NVALGRIND */ + +/* Modern GCC will optimize the static routine out if unused, + and unused attribute will shut down warnings about it. */ +static int VALGRIND_PRINTF(const char *format, ...) + __attribute__((format(__printf__, 1, 2), __unused__)); +static int +VALGRIND_PRINTF(const char *format, ...) +{ + unsigned long _qzz_res; + va_list vargs; + va_start(vargs, format); + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF, + (unsigned long)format, (unsigned long)vargs, + 0, 0, 0); + va_end(vargs); + return (int)_qzz_res; +} + +static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...) + __attribute__((format(__printf__, 1, 2), __unused__)); +static int +VALGRIND_PRINTF_BACKTRACE(const char *format, ...) 
+{ + unsigned long _qzz_res; + va_list vargs; + va_start(vargs, format); + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF_BACKTRACE, + (unsigned long)format, (unsigned long)vargs, + 0, 0, 0); + va_end(vargs); + return (int)_qzz_res; +} + +#endif /* NVALGRIND */ + + +/* These requests allow control to move from the simulated CPU to the + real CPU, calling an arbitary function. + + Note that the current ThreadId is inserted as the first argument. + So this call: + + VALGRIND_NON_SIMD_CALL2(f, arg1, arg2) + + requires f to have this signature: + + Word f(Word tid, Word arg1, Word arg2) + + where "Word" is a word-sized type. + + Note that these client requests are not entirely reliable. For example, + if you call a function with them that subsequently calls printf(), + there's a high chance Valgrind will crash. Generally, your prospects of + these working are made higher if the called function does not refer to + any global variables, and does not refer to any libc or other functions + (printf et al). Any kind of entanglement with libc or dynamic linking is + likely to have a bad outcome, for tricky reasons which we've grappled + with a lot in the past. 
+*/ +#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL0, \ + _qyy_fn, \ + 0, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL1, \ + _qyy_fn, \ + _qyy_arg1, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL2, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL3, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, \ + _qyy_arg3, 0); \ + _qyy_res; \ + }) + + +/* Counts the number of errors that have been recorded by a tool. Nb: + the tool must record the errors with VG_(maybe_record_error)() or + VG_(unique_error)() for them to be counted. */ +#define VALGRIND_COUNT_ERRORS \ + __extension__ \ + ({unsigned int _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__COUNT_ERRORS, \ + 0, 0, 0, 0, 0); \ + _qyy_res; \ + }) + +/* Mark a block of memory as having been allocated by a malloc()-like + function. `addr' is the start of the usable block (ie. after any + redzone) `rzB' is redzone size if the allocator can apply redzones; + use '0' if not. Adding redzones makes it more likely Valgrind will spot + block overruns. `is_zeroed' indicates if the memory is zeroed, as it is + for calloc(). Put it immediately after the point where a block is + allocated. 
+ + If you're using Memcheck: If you're allocating memory via superblocks, + and then handing out small chunks of each superblock, if you don't have + redzones on your small blocks, it's worth marking the superblock with + VALGRIND_MAKE_MEM_NOACCESS when it's created, so that block overruns are + detected. But if you can put redzones on, it's probably better to not do + this, so that messages for small overruns are described in terms of the + small block rather than the superblock (but if you have a big overrun + that skips over a redzone, you could miss an error this way). See + memcheck/tests/custom_alloc.c for an example. + + WARNING: if your allocator uses malloc() or 'new' to allocate + superblocks, rather than mmap() or brk(), this will not work properly -- + you'll likely get assertion failures during leak detection. This is + because Valgrind doesn't like seeing overlapping heap blocks. Sorry. + + Nb: block must be freed via a free()-like function specified + with VALGRIND_FREELIKE_BLOCK or mismatch errors will occur. */ +#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MALLOCLIKE_BLOCK, \ + addr, sizeB, rzB, is_zeroed, 0); \ + } + +/* Mark a block of memory as having been freed by a free()-like function. + `rzB' is redzone size; it must match that given to + VALGRIND_MALLOCLIKE_BLOCK. Memory not freed will be detected by the leak + checker. Put it immediately after the point where the block is freed. */ +#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__FREELIKE_BLOCK, \ + addr, rzB, 0, 0, 0); \ + } + +/* Create a memory pool. */ +#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CREATE_MEMPOOL, \ + pool, rzB, is_zeroed, 0, 0); \ + } + +/* Destroy a memory pool. 
*/ +#define VALGRIND_DESTROY_MEMPOOL(pool) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DESTROY_MEMPOOL, \ + pool, 0, 0, 0, 0); \ + } + +/* Associate a piece of memory with a memory pool. */ +#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_ALLOC, \ + pool, addr, size, 0, 0); \ + } + +/* Disassociate a piece of memory from a memory pool. */ +#define VALGRIND_MEMPOOL_FREE(pool, addr) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_FREE, \ + pool, addr, 0, 0, 0); \ + } + +/* Disassociate any pieces outside a particular range. */ +#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_TRIM, \ + pool, addr, size, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MOVE_MEMPOOL, \ + poolA, poolB, 0, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_CHANGE, \ + pool, addrA, addrB, size, 0); \ + } + +/* Return 1 if a mempool exists, else 0. */ +#define VALGRIND_MEMPOOL_EXISTS(pool) \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_EXISTS, \ + pool, 0, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Mark a piece of memory as being a stack. Returns a stack id. 
*/ +#define VALGRIND_STACK_REGISTER(start, end) \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_REGISTER, \ + start, end, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Unmark the piece of memory associated with a stack id as being a + stack. */ +#define VALGRIND_STACK_DEREGISTER(id) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_DEREGISTER, \ + id, 0, 0, 0, 0); \ + } + +/* Change the start and end address of the stack id. */ +#define VALGRIND_STACK_CHANGE(id, start, end) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_CHANGE, \ + id, start, end, 0, 0); \ + } + + +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64_linux +#undef PLAT_ppc32_aix5 +#undef PLAT_ppc64_aix5 + +#endif /* __VALGRIND_H */ diff --git a/src/kudu/gutil/walltime.cc b/src/kudu/gutil/walltime.cc new file mode 100644 index 000000000000..89a805e7d6b7 --- /dev/null +++ b/src/kudu/gutil/walltime.cc @@ -0,0 +1,208 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Author: tkaftal@google.com (Tomasz Kaftal) +// +// The implementation of walltime functionalities. +#ifndef _GNU_SOURCE // gcc3 at least defines it on the command line +#define _GNU_SOURCE // Linux wants that for strptime in time.h +#endif + +#include "kudu/gutil/walltime.h" + +#include +#include + +#if defined(__APPLE__) +#include +#include +#endif // defined(__APPLE__) + +#if defined(__APPLE__) +namespace walltime_internal { + +GoogleOnceType timebase_info_once = GOOGLE_ONCE_INIT; +mach_timebase_info_data_t timebase_info; + +void InitializeTimebaseInfo() { + CHECK_EQ(KERN_SUCCESS, mach_timebase_info(&timebase_info)) + << "unable to initialize mach_timebase_info"; +} +} // namespace walltime_internal +#endif + +// This is exactly like mktime() except it is guaranteed to return -1 on +// failure. Some versions of glibc allow mktime() to return negative +// values which the standard says are undefined. See the standard at +// http://www.opengroup.org/onlinepubs/007904875/basedefs/xbd_chap04.html +// under the heading "Seconds Since the Epoch". +static inline time_t gmktime(struct tm *tm) { + time_t rt = mktime(tm); + return rt < 0 ? time_t(-1) : rt; +} + +static void StringAppendStrftime(string* dst, + const char* format, + const struct tm* tm) { + char space[1024]; + + int result = strftime(space, sizeof(space), format, tm); + + if ((result >= 0) && (result < sizeof(space))) { + // It fit + dst->append(space, result); + return; + } + + int length = sizeof(space); + for (int sanity = 0; sanity < 5; ++sanity) { + length *= 2; + auto buf = new char[length]; + + result = strftime(buf, length, format, tm); + if ((result >= 0) && (result < length)) { + // It fit + dst->append(buf, result); + delete[] buf; + return; + } + + delete[] buf; + } + + // sanity failure + return; +} + +// Convert a "struct tm" interpreted as *GMT* into a time_t (technically +// a long since we can't include header files in header files bla bla bla). 
+// This is basically filling a hole in the standard library. +// +// There are several approaches to mkgmtime() implementation on the net, +// many of them wrong. Simply reimplementing the logic seems to be the +// simplest and most efficient, though it does reimplement calendar logic. +// The calculation is mostly straightforward; leap years are the main issue. +// +// Like gmktime() this method returns -1 on failure. Negative results +// are considered undefined by the standard so these cases are +// considered failures and thus return -1. +time_t mkgmtime(const struct tm *tm) { + // Month-to-day offset for non-leap-years. + static const int month_day[12] = + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; + + // Most of the calculation is easy; leap years are the main difficulty. + int month = tm->tm_mon % 12; + int year = tm->tm_year + tm->tm_mon / 12; + if (month < 0) { // Negative values % 12 are still negative. + month += 12; + --year; + } + + // This is the number of Februaries since 1900. + const int year_for_leap = (month > 1) ? year + 1 : year; + + time_t rt = tm->tm_sec // Seconds + + 60 * (tm->tm_min // Minute = 60 seconds + + 60 * (tm->tm_hour // Hour = 60 minutes + + 24 * (month_day[month] + tm->tm_mday - 1 // Day = 24 hours + + 365 * (year - 70) // Year = 365 days + + (year_for_leap - 69) / 4 // Every 4 years is leap... + - (year_for_leap - 1) / 100 // Except centuries... + + (year_for_leap + 299) / 400))); // Except 400s. + return rt < 0 ? 
-1 : rt; +} + +bool WallTime_Parse_Timezone(const char* time_spec, + const char* format, + const struct tm* default_time, + bool local, + WallTime* result) { + struct tm split_time; + if (default_time) { + split_time = *default_time; + } else { + memset(&split_time, 0, sizeof(split_time)); + } + const char* parsed = strptime(time_spec, format, &split_time); + if (parsed == nullptr) return false; + + // If format ends with "%S", match fractional seconds + double fraction = 0.0; + char junk; + if ((*parsed == '.') && + (strcmp(format + strlen(format) - 2, "%S") == 0) && + (sscanf(parsed, "%lf%c", // NOLINT(runtime/printf) + &fraction, &junk) == 1)) { + parsed = format + strlen(format); // Parsed it all! + } + if (*parsed != '\0') return false; + + // Convert into seconds since epoch. Adjust so it is interpreted + // w.r.t. the daylight-saving-state at the specified time. + split_time.tm_isdst = -1; // Ask gmktime() to find dst imfo + time_t ptime; + if (local) { + ptime = gmktime(&split_time); + } else { + ptime = mkgmtime(&split_time); // Returns time in GMT instead of local. + } + + if (ptime == -1) return false; + + *result = ptime; + *result += fraction; + return true; +} + +WallTime WallTime_Now() { +#if defined(__APPLE__) + mach_timespec_t ts; + walltime_internal::GetCurrentTime(&ts); + return ts.tv_sec + ts.tv_nsec / static_cast(1e9); +#else + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_sec + ts.tv_nsec / static_cast(1e9); +#endif // defined(__APPLE__) +} + +void StringAppendStrftime(string* dst, + const char* format, + time_t when, + bool local) { + struct tm tm; + bool conversion_error; + if (local) { + conversion_error = (localtime_r(&when, &tm) == nullptr); + } else { + conversion_error = (gmtime_r(&when, &tm) == nullptr); + } + if (conversion_error) { + // If we couldn't convert the time, don't append anything. 
+ return; + } + StringAppendStrftime(dst, format, &tm); +} + +string LocalTimeAsString() { + string ret; + StringAppendStrftime(&ret, "%Y-%m-%d %H:%M:%S %Z", time(nullptr), true); + return ret; +} diff --git a/src/kudu/gutil/walltime.h b/src/kudu/gutil/walltime.h new file mode 100644 index 000000000000..e6a12941d61b --- /dev/null +++ b/src/kudu/gutil/walltime.h @@ -0,0 +1,179 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef GUTIL_WALLTIME_H_ +#define GUTIL_WALLTIME_H_ + +#include + +#include +#include +using std::string; + +#if defined(__APPLE__) +#include +#include +#include + +#include "kudu/gutil/once.h" +#endif // defined(__APPLE__) + +#include "kudu/gutil/integral_types.h" + +typedef double WallTime; + +// Append result to a supplied string. +// If an error occurs during conversion 'dst' is not modified. +void StringAppendStrftime(std::string* dst, + const char* format, + time_t when, + bool local); + +// Return the local time as a string suitable for user display. +std::string LocalTimeAsString(); + +// Similar to the WallTime_Parse, but it takes a boolean flag local as +// argument specifying if the time_spec is in local time or UTC +// time. 
If local is set to true, the same exact result as +// WallTime_Parse is returned. +bool WallTime_Parse_Timezone(const char* time_spec, + const char* format, + const struct tm* default_time, + bool local, + WallTime* result); + +// Return current time in seconds as a WallTime. +WallTime WallTime_Now(); + +typedef int64 MicrosecondsInt64; + +namespace walltime_internal { + +#if defined(__APPLE__) + +extern GoogleOnceType timebase_info_once; +extern mach_timebase_info_data_t timebase_info; +extern void InitializeTimebaseInfo(); + +inline void GetCurrentTime(mach_timespec_t* ts) { + clock_serv_t cclock; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, ts); + mach_port_deallocate(mach_task_self(), cclock); +} + +inline MicrosecondsInt64 GetCurrentTimeMicros() { + mach_timespec_t ts; + GetCurrentTime(&ts); + return ts.tv_sec * 1e6 + ts.tv_nsec / 1e3; +} + +inline int64_t GetMonoTimeNanos() { + // See Apple Technical Q&A QA1398 for further detail on mono time in OS X. + GoogleOnceInit(&timebase_info_once, &InitializeTimebaseInfo); + + uint64_t time = mach_absolute_time(); + + // mach_absolute_time returns ticks, which need to be scaled by the timebase + // info to get nanoseconds. + return time * timebase_info.numer / timebase_info.denom; +} + +inline MicrosecondsInt64 GetMonoTimeMicros() { + return GetMonoTimeNanos() / 1e3; +} + +inline MicrosecondsInt64 GetThreadCpuTimeMicros() { + // See https://www.gnu.org/software/hurd/gnumach-doc/Thread-Information.html + // and Chromium base/time/time_mac.cc. 
+ task_t thread = mach_thread_self(); + if (thread == MACH_PORT_NULL) { + LOG(WARNING) << "Failed to get mach_thread_self()"; + return 0; + } + + mach_msg_type_number_t thread_info_count = THREAD_BASIC_INFO_COUNT; + thread_basic_info_data_t thread_info_data; + + kern_return_t result = thread_info( + thread, + THREAD_BASIC_INFO, + reinterpret_cast(&thread_info_data), + &thread_info_count); + + if (result != KERN_SUCCESS) { + LOG(WARNING) << "Failed to get thread_info()"; + return 0; + } + + return thread_info_data.user_time.seconds * 1e6 + thread_info_data.user_time.microseconds; +} + +#else + +inline MicrosecondsInt64 GetClockTimeMicros(clockid_t clock) { + timespec ts; + clock_gettime(clock, &ts); + return ts.tv_sec * 1e6 + ts.tv_nsec / 1e3; +} + +#endif // defined(__APPLE__) + +} // namespace walltime_internal + +// Returns the time since the Epoch measured in microseconds. +inline MicrosecondsInt64 GetCurrentTimeMicros() { +#if defined(__APPLE__) + return walltime_internal::GetCurrentTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_REALTIME); +#endif // defined(__APPLE__) +} + +// Returns the time since some arbitrary reference point, measured in microseconds. +// Guaranteed to be monotonic (and therefore useful for measuring intervals) +inline MicrosecondsInt64 GetMonoTimeMicros() { +#if defined(__APPLE__) + return walltime_internal::GetMonoTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_MONOTONIC); +#endif // defined(__APPLE__) +} + +// Returns the time spent in user CPU on the current thread, measured in microseconds. +inline MicrosecondsInt64 GetThreadCpuTimeMicros() { +#if defined(__APPLE__) + return walltime_internal::GetThreadCpuTimeMicros(); +#else + return walltime_internal::GetClockTimeMicros(CLOCK_THREAD_CPUTIME_ID); +#endif // defined(__APPLE__) +} + +// A CycleClock yields the value of a cycle counter that increments at a rate +// that is approximately constant. 
+class CycleClock { + public: + // Return the value of the counter. + static inline int64 Now(); + + private: + CycleClock(); +}; + +#include "kudu/gutil/cycleclock-inl.h" // inline method bodies +#endif // GUTIL_WALLTIME_H_ diff --git a/src/kudu/integration-tests/CMakeLists.txt b/src/kudu/integration-tests/CMakeLists.txt new file mode 100644 index 000000000000..0631da097dfc --- /dev/null +++ b/src/kudu/integration-tests/CMakeLists.txt @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set(INTEGRATION_TESTS_SRCS + cluster_itest_util.cc + cluster_verifier.cc + external_mini_cluster.cc + external_mini_cluster_fs_inspector.cc + mini_cluster.cc + test_workload.cc +) + +add_library(integration-tests ${INTEGRATION_TESTS_SRCS}) +target_link_libraries(integration-tests + tserver + tserver_test_util + master + ksck + kudu_client + kudu_client_test_util + kudu_fs + kudu_test_util) +add_dependencies(integration-tests + kudu-tserver + kudu-master) + +# Tests +set(KUDU_TEST_LINK_LIBS integration-tests ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(alter_table-test) +ADD_KUDU_TEST(alter_table-randomized-test) +ADD_KUDU_TEST(registration-test RESOURCE_LOCK "master-web-port") +ADD_KUDU_TEST(client_failover-itest) +ADD_KUDU_TEST(client-stress-test + RESOURCE_LOCK "master-rpc-ports" + RUN_SERIAL true) +ADD_KUDU_TEST(master_replication-itest RESOURCE_LOCK "master-rpc-ports") +ADD_KUDU_TEST(master_failover-itest RESOURCE_LOCK "master-rpc-ports") +ADD_KUDU_TEST(raft_consensus-itest RUN_SERIAL true) +ADD_KUDU_TEST(ts_tablet_manager-itest) +ADD_KUDU_TEST(ts_recovery-itest) +ADD_KUDU_TEST(create-table-stress-test) +ADD_KUDU_TEST(delete_table-test) +ADD_KUDU_TEST(external_mini_cluster-test RESOURCE_LOCK "master-rpc-ports") +ADD_KUDU_TEST(linked_list-test RESOURCE_LOCK "master-rpc-ports") +ADD_KUDU_TEST(all_types-itest RESOURCE_LOCK "master-rpc-ports") +ADD_KUDU_TEST(remote_bootstrap-itest) +ADD_KUDU_TEST(tablet_replacement-itest) +ADD_KUDU_TEST(create-table-itest) + +# Some tests have additional dependencies +set(KUDU_TEST_LINK_LIBS kudu_client kudu_tools_util ${KUDU_TEST_LINK_LIBS}) +ADD_KUDU_TEST(full_stack-insert-scan-test RUN_SERIAL true) +ADD_KUDU_TEST(update_scan_delta_compact-test RUN_SERIAL true) +ADD_KUDU_TEST(flex_partitioning-itest) diff --git a/src/kudu/integration-tests/all_types-itest.cc b/src/kudu/integration-tests/all_types-itest.cc new file mode 100644 index 000000000000..9dae94f77955 --- /dev/null +++ b/src/kudu/integration-tests/all_types-itest.cc @@ -0,0 
+1,451 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/client/row_result.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/ts_itest-base.h" + +DEFINE_int32(num_rows_per_tablet, 100, "The number of rows to be inserted into each tablet"); + +using std::vector; + +namespace kudu { +namespace client { + +using sp::shared_ptr; + +static const int kNumTabletServers = 3; +static const int kNumTablets = 3; +static const int KMaxBatchSize = 8 * 1024 * 1024; + +template +struct SliceKeysTestSetup { + + SliceKeysTestSetup() + : max_rows_(MathLimits::kMax), + rows_per_tablet_(std::min(max_rows_/ kNumTablets, FLAGS_num_rows_per_tablet)), + increment_(static_cast(MathLimits::kMax / kNumTablets)) { + } + + void AddKeyColumnsToSchema(KuduSchemaBuilder* builder) const { + builder->AddColumn("key")->Type( + client::FromInternalDataType(KeyTypeWrapper::type))->NotNull()->PrimaryKey(); + } + + // Split points are calculated by equally partitioning the int64_t key space and then + // using the stringified hexadecimal representation to create the split keys (with 
+ // zero padding). + vector GenerateSplitRows(const KuduSchema& schema) const { + vector splits; + splits.reserve(kNumTablets - 1); + for (int i = 1; i < kNumTablets; i++) { + int split = i * increment_; + splits.push_back(StringPrintf("%08x", split)); + } + vector rows; + for (string val : splits) { + Slice slice(val); + KuduPartialRow* row = schema.NewRow(); + CHECK_OK(row->SetSliceCopy >(0, slice)); + rows.push_back(row); + } + return rows; + } + + Status GenerateRowKey(KuduInsert* insert, int split_idx, int row_idx) const { + int row_key_num = (split_idx * increment_) + row_idx; + string row_key = StringPrintf("%08x", row_key_num); + Slice row_key_slice(row_key); + return insert->mutable_row()->SetSliceCopy >(0, + row_key_slice); + } + + Status VerifyRowKey(const KuduRowResult& result, int split_idx, int row_idx) const { + int expected_row_key_num = (split_idx * increment_) + row_idx; + string expected_row_key = StringPrintf("%08x", expected_row_key_num); + Slice expected_row_key_slice(expected_row_key); + Slice row_key; + RETURN_NOT_OK(result.Get >(0, &row_key)); + if (expected_row_key_slice.compare(row_key) != 0) { + return Status::Corruption(strings::Substitute("Keys didn't match. Expected: $0 Got: $1", + expected_row_key_slice.ToDebugString(), + row_key.ToDebugString())); + } + + return Status::OK(); + } + + int GetRowsPerTablet() const { + return rows_per_tablet_; + } + + int GetMaxRows() const { + return max_rows_; + } + + vector GetKeyColumns() const { + vector key_col; + key_col.push_back("key"); + return key_col; + } + + int max_rows_; + int rows_per_tablet_; + int increment_; +}; + +template +struct IntKeysTestSetup { + typedef typename TypeTraits::cpp_type CppType; + + IntKeysTestSetup() + // If CppType is actually bigger than int (e.g. int64_t) casting the max to int + // returns -1, so we make sure in that case we get max from int directly. + : max_rows_(static_cast(MathLimits::kMax) != -1 ? 
+ static_cast(MathLimits::kMax) : MathLimits::kMax), + increment_(max_rows_ / kNumTablets), + rows_per_tablet_(std::min(increment_, FLAGS_num_rows_per_tablet)) { + DCHECK(base::is_integral::value); + } + + void AddKeyColumnsToSchema(KuduSchemaBuilder* builder) const { + builder->AddColumn("key")->Type( + client::FromInternalDataType(KeyTypeWrapper::type))->NotNull()->PrimaryKey(); + } + + vector GenerateSplitRows(const KuduSchema& schema) const { + vector splits; + splits.reserve(kNumTablets - 1); + for (int64_t i = 1; i < kNumTablets; i++) { + splits.push_back(i * increment_); + } + vector rows; + for (CppType val : splits) { + KuduPartialRow* row = schema.NewRow(); + CHECK_OK(row->Set >(0, val)); + rows.push_back(row); + } + return rows; + } + + Status GenerateRowKey(KuduInsert* insert, int split_idx, int row_idx) const { + CppType val = (split_idx * increment_) + row_idx; + return insert->mutable_row()->Set >(0, val); + } + + Status VerifyRowKey(const KuduRowResult& result, int split_idx, int row_idx) const { + CppType val; + RETURN_NOT_OK(result.Get >(0, &val)); + int expected = (split_idx * increment_) + row_idx; + if (val != expected) { + return Status::Corruption(strings::Substitute("Keys didn't match. Expected: $0 Got: $1", + expected, val)); + } + return Status::OK(); + } + + int GetRowsPerTablet() const { + return rows_per_tablet_; + } + + int GetMaxRows() const { + return max_rows_; + } + + vector GetKeyColumns() const { + vector key_col; + key_col.push_back("key"); + return key_col; + } + + int max_rows_; + int increment_; + int rows_per_tablet_; +}; + +// Integration that writes, scans and verifies all types. +template +class AllTypesItest : public KuduTest { + public: + AllTypesItest() { + if (AllowSlowTests()) { + FLAGS_num_rows_per_tablet = 10000; + } + setup_ = TestSetup(); + } + + // Builds a schema that includes all (frontend) supported types. + // The key is templated so that we can try different key types. 
+ void CreateAllTypesSchema() { + KuduSchemaBuilder builder; + setup_.AddKeyColumnsToSchema(&builder); + builder.AddColumn("int8_val")->Type(KuduColumnSchema::INT8); + builder.AddColumn("int16_val")->Type(KuduColumnSchema::INT16); + builder.AddColumn("int32_val")->Type(KuduColumnSchema::INT32); + builder.AddColumn("int64_val")->Type(KuduColumnSchema::INT64); + builder.AddColumn("timestamp_val")->Type(KuduColumnSchema::TIMESTAMP); + builder.AddColumn("string_val")->Type(KuduColumnSchema::STRING); + builder.AddColumn("bool_val")->Type(KuduColumnSchema::BOOL); + builder.AddColumn("float_val")->Type(KuduColumnSchema::FLOAT); + builder.AddColumn("double_val")->Type(KuduColumnSchema::DOUBLE); + builder.AddColumn("binary_val")->Type(KuduColumnSchema::BINARY); + CHECK_OK(builder.Build(&schema_)); + } + + Status CreateCluster() { + vector ts_flags; + // Set the flush threshold low so that we have flushes and test the on-disk formats. + ts_flags.push_back("--flush_threshold_mb=1"); + // Set the major delta compaction ratio low enough that we trigger a lot of them. 
+ ts_flags.push_back("--tablet_delta_store_major_compact_min_ratio=0.001"); + + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = kNumTabletServers; + + for (const std::string& flag : ts_flags) { + opts.extra_tserver_flags.push_back(flag); + } + + cluster_.reset(new ExternalMiniCluster(opts)); + RETURN_NOT_OK(cluster_->Start()); + KuduClientBuilder builder; + return cluster_->CreateClient(builder, &client_); + } + + Status CreateTable() { + CreateAllTypesSchema(); + vector split_rows = setup_.GenerateSplitRows(schema_); + gscoped_ptr table_creator(client_->NewTableCreator()); + + for (const KuduPartialRow* row : split_rows) { + split_rows_.push_back(*row); + } + + RETURN_NOT_OK(table_creator->table_name("all-types-table") + .schema(&schema_) + .split_rows(split_rows) + .num_replicas(kNumTabletServers) + .Create()); + return client_->OpenTable("all-types-table", &table_); + } + + Status GenerateRow(KuduSession* session, int split_idx, int row_idx) { + KuduInsert* insert = table_->NewInsert(); + RETURN_NOT_OK(setup_.GenerateRowKey(insert, split_idx, row_idx)); + int int_val = (split_idx * setup_.GetRowsPerTablet()) + row_idx; + KuduPartialRow* row = insert->mutable_row(); + RETURN_NOT_OK(row->SetInt8("int8_val", int_val)); + RETURN_NOT_OK(row->SetInt16("int16_val", int_val)); + RETURN_NOT_OK(row->SetInt32("int32_val", int_val)); + RETURN_NOT_OK(row->SetInt64("int64_val", int_val)); + RETURN_NOT_OK(row->SetTimestamp("timestamp_val", int_val)); + string content = strings::Substitute("hello $0", int_val); + Slice slice_val(content); + RETURN_NOT_OK(row->SetStringCopy("string_val", slice_val)); + RETURN_NOT_OK(row->SetBinaryCopy("binary_val", slice_val)); + double double_val = int_val; + RETURN_NOT_OK(row->SetDouble("double_val", double_val)); + RETURN_NOT_OK(row->SetFloat("float_val", double_val)); + RETURN_NOT_OK(row->SetBool("bool_val", int_val % 2)); + VLOG(1) << "Inserting row[" << split_idx << "," << row_idx << "]" << insert->ToString(); + 
RETURN_NOT_OK(session->Apply(insert)); + return Status::OK(); + } + + // This inserts kNumRowsPerTablet in each of the tablets. In the end we should have + // perfectly partitioned table, if the encoding of the keys was correct and the rows + // ended up in the right place. + Status InsertRows() { + shared_ptr session = client_->NewSession(); + RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + int max_rows_per_tablet = setup_.GetRowsPerTablet(); + for (int i = 0; i < kNumTablets; ++i) { + for (int j = 0; j < max_rows_per_tablet; ++j) { + RETURN_NOT_OK(GenerateRow(session.get(), i, j)); + if (j % 1000 == 0) { + RETURN_NOT_OK(session->Flush()); + } + } + RETURN_NOT_OK(session->Flush()); + } + return Status::OK(); + } + + void SetupProjection(vector* projection) { + vector keys = setup_.GetKeyColumns(); + for (const string& key : keys) { + projection->push_back(key); + } + projection->push_back("int8_val"); + projection->push_back("int16_val"); + projection->push_back("int32_val"); + projection->push_back("int64_val"); + projection->push_back("timestamp_val"); + projection->push_back("string_val"); + projection->push_back("binary_val"); + projection->push_back("double_val"); + projection->push_back("float_val"); + projection->push_back("bool_val"); + } + + void VerifyRow(const KuduRowResult& row, int split_idx, int row_idx) { + ASSERT_OK(setup_.VerifyRowKey(row, split_idx, row_idx)); + + int64_t expected_int_val = (split_idx * setup_.GetRowsPerTablet()) + row_idx; + int8_t int8_val; + ASSERT_OK(row.GetInt8("int8_val", &int8_val)); + ASSERT_EQ(int8_val, static_cast(expected_int_val)); + int16_t int16_val; + ASSERT_OK(row.GetInt16("int16_val", &int16_val)); + ASSERT_EQ(int16_val, static_cast(expected_int_val)); + int32_t int32_val; + ASSERT_OK(row.GetInt32("int32_val", &int32_val)); + ASSERT_EQ(int32_val, static_cast(expected_int_val)); + int64_t int64_val; + ASSERT_OK(row.GetInt64("int64_val", &int64_val)); + ASSERT_EQ(int64_val, expected_int_val); + 
int64_t timestamp_val; + ASSERT_OK(row.GetTimestamp("timestamp_val", ×tamp_val)); + ASSERT_EQ(timestamp_val, expected_int_val); + + string content = strings::Substitute("hello $0", expected_int_val); + Slice expected_slice_val(content); + Slice string_val; + ASSERT_OK(row.GetString("string_val", &string_val)); + ASSERT_EQ(string_val, expected_slice_val); + Slice binary_val; + ASSERT_OK(row.GetBinary("binary_val", &binary_val)); + ASSERT_EQ(binary_val, expected_slice_val); + + bool expected_bool_val = expected_int_val % 2; + bool bool_val; + ASSERT_OK(row.GetBool("bool_val", &bool_val)); + ASSERT_EQ(bool_val, expected_bool_val); + + double expected_double_val = expected_int_val; + double double_val; + ASSERT_OK(row.GetDouble("double_val", &double_val)); + ASSERT_EQ(double_val, expected_double_val); + float float_val; + ASSERT_OK(row.GetFloat("float_val", &float_val)); + ASSERT_EQ(float_val, static_cast(double_val)); + } + + Status VerifyRows() { + vector projection; + SetupProjection(&projection); + + int total_rows = 0; + // Scan a single tablet and make sure it has the rows we expect in the amount we + // expect. 
+ for (int i = 0; i < kNumTablets; ++i) { + KuduScanner scanner(table_.get()); + string low_split; + string high_split; + if (i != 0) { + const KuduPartialRow& split = split_rows_[i - 1]; + RETURN_NOT_OK(scanner.AddLowerBound(split)); + low_split = split.ToString(); + } + if (i != kNumTablets - 1) { + const KuduPartialRow& split = split_rows_[i]; + RETURN_NOT_OK(scanner.AddExclusiveUpperBound(split)); + high_split = split.ToString(); + } + + RETURN_NOT_OK(scanner.SetProjectedColumns(projection)); + RETURN_NOT_OK(scanner.SetBatchSizeBytes(KMaxBatchSize)); + RETURN_NOT_OK(scanner.SetFaultTolerant()); + RETURN_NOT_OK(scanner.SetReadMode(KuduScanner::READ_AT_SNAPSHOT)); + RETURN_NOT_OK(scanner.SetTimeoutMillis(5000)); + RETURN_NOT_OK(scanner.Open()); + LOG(INFO) << "Scanning tablet: [" << low_split << ", " << high_split << ")"; + + int total_rows_in_tablet = 0; + while (scanner.HasMoreRows()) { + vector rows; + RETURN_NOT_OK(scanner.NextBatch(&rows)); + + for (int j = 0; j < rows.size(); ++j) { + VLOG(1) << "Scanned row: " << rows[j].ToString(); + VerifyRow(rows[j], i, total_rows_in_tablet + j); + } + total_rows_in_tablet += rows.size(); + } + CHECK_EQ(total_rows_in_tablet, setup_.GetRowsPerTablet()); + total_rows += total_rows_in_tablet; + } + CHECK_EQ(total_rows, setup_.GetRowsPerTablet() * kNumTablets); + return Status::OK(); + } + + void RunTest() { + ASSERT_OK(CreateCluster()); + ASSERT_OK(CreateTable()); + ASSERT_OK(InsertRows()); + // Check that all of the replicas agree on the inserted data. This retries until + // all replicas are up-to-date, which is important to ensure that the following + // Verify always passes. + NO_FATALS(ClusterVerifier(cluster_.get()).CheckCluster()); + // Check that the inserted data matches what we thought we inserted. 
+ ASSERT_OK(VerifyRows()); + } + + virtual void TearDown() OVERRIDE { + cluster_->AssertNoCrashes(); + cluster_->Shutdown(); + } + + protected: + TestSetup setup_; + KuduSchema schema_; + vector split_rows_; + shared_ptr client_; + gscoped_ptr cluster_; + shared_ptr table_; +}; + +// Wrap the actual DataType so that we can have the setup structs be friends of other classes +// without leaking DataType. +template +struct KeyTypeWrapper { + static const DataType type = KeyType; +}; + +typedef ::testing::Types >, + IntKeysTestSetup >, + IntKeysTestSetup >, + IntKeysTestSetup >, + IntKeysTestSetup >, + SliceKeysTestSetup >, + SliceKeysTestSetup > + > KeyTypes; + +TYPED_TEST_CASE(AllTypesItest, KeyTypes); + +TYPED_TEST(AllTypesItest, TestAllKeyTypes) { + this->RunTest(); +} + +} // namespace client +} // namespace kudu + diff --git a/src/kudu/integration-tests/alter_table-randomized-test.cc b/src/kudu/integration-tests/alter_table-randomized-test.cc new file mode 100644 index 000000000000..cbf1bfdf4c71 --- /dev/null +++ b/src/kudu/integration-tests/alter_table-randomized-test.cc @@ -0,0 +1,446 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduError; +using client::KuduInsert; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduSession; +using client::KuduTable; +using client::KuduTableAlterer; +using client::KuduTableCreator; +using client::KuduValue; +using client::KuduWriteOperation; +using client::sp::shared_ptr; +using std::make_pair; +using std::map; +using std::pair; +using std::vector; +using strings::SubstituteAndAppend; + +const char* kTableName = "test-table"; +const int kMaxColumns = 30; + +class AlterTableRandomized : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = 3; + // Because this test performs a lot of alter tables, we end up flushing + // and rewriting metadata files quite a bit. Globally disabling fsync + // speeds the test runtime up dramatically. + opts.extra_tserver_flags.push_back("--never_fsync"); + // This test produces tables with lots of columns. With container preallocation, + // we end up using quite a bit of disk space. So, we disable it. 
+ opts.extra_tserver_flags.push_back("--log_container_preallocate_bytes=0"); + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + + KuduClientBuilder builder; + ASSERT_OK(cluster_->CreateClient(builder, &client_)); + } + + virtual void TearDown() OVERRIDE { + cluster_->Shutdown(); + KuduTest::TearDown(); + } + + void RestartTabletServer(int idx) { + LOG(INFO) << "Restarting TS " << idx; + cluster_->tablet_server(idx)->Shutdown(); + CHECK_OK(cluster_->tablet_server(idx)->Restart()); + CHECK_OK(cluster_->WaitForTabletsRunning(cluster_->tablet_server(idx), + MonoDelta::FromSeconds(60))); + } + + protected: + gscoped_ptr cluster_; + shared_ptr client_; +}; + +struct RowState { + // We use this special value to denote NULL values. + // We ensure that we never insert or update to this value except in the case of + // NULLable columns. + static const int32_t kNullValue = 0xdeadbeef; + vector > cols; + + string ToString() const { + string ret = "("; + typedef pair entry; + bool first = true; + for (const entry& e : cols) { + if (!first) { + ret.append(", "); + } + first = false; + if (e.second == kNullValue) { + SubstituteAndAppend(&ret, "int32 $0=$1", e.first, "NULL"); + } else { + SubstituteAndAppend(&ret, "int32 $0=$1", e.first, e.second); + } + } + ret.push_back(')'); + return ret; + } +}; + +struct TableState { + TableState() { + col_names_.push_back("key"); + col_nullable_.push_back(false); + } + + ~TableState() { + STLDeleteValues(&rows_); + } + + void GenRandomRow(int32_t key, int32_t seed, + vector >* row) { + if (seed == RowState::kNullValue) { + seed++; + } + row->clear(); + row->push_back(make_pair("key", key)); + for (int i = 1; i < col_names_.size(); i++) { + int32_t val; + if (col_nullable_[i] && seed % 2 == 1) { + val = RowState::kNullValue; + } else { + val = seed; + } + row->push_back(make_pair(col_names_[i], val)); + } + } + + bool Insert(const vector >& data) { + DCHECK_EQ("key", data[0].first); + int32_t key = 
data[0].second; + if (ContainsKey(rows_, key)) return false; + + auto r = new RowState; + r->cols = data; + rows_[key] = r; + return true; + } + + bool Update(const vector >& data) { + DCHECK_EQ("key", data[0].first); + int32_t key = data[0].second; + if (!ContainsKey(rows_, key)) return false; + + RowState* r = rows_[key]; + r->cols = data; + return true; + } + + void Delete(int32_t row_key) { + RowState* r = EraseKeyReturnValuePtr(&rows_, row_key); + CHECK(r) << "row key " << row_key << " not found"; + delete r; + } + + void AddColumnWithDefault(const string& name, int32_t def, bool nullable) { + col_names_.push_back(name); + col_nullable_.push_back(nullable); + for (entry& e : rows_) { + e.second->cols.push_back(make_pair(name, def)); + } + } + + void DropColumn(const string& name) { + auto col_it = std::find(col_names_.begin(), col_names_.end(), name); + int index = col_it - col_names_.begin(); + col_names_.erase(col_it); + col_nullable_.erase(col_nullable_.begin() + index); + for (entry& e : rows_) { + e.second->cols.erase(e.second->cols.begin() + index); + } + } + + int32_t GetRandomRowKey(int32_t rand) { + CHECK(!rows_.empty()); + int idx = rand % rows_.size(); + map::const_iterator it = rows_.begin(); + for (int i = 0; i < idx; i++) { + ++it; + } + return it->first; + } + + void ToStrings(vector* strs) { + strs->clear(); + for (const entry& e : rows_) { + strs->push_back(e.second->ToString()); + } + } + + // The name of each column. + vector col_names_; + + // For each column, whether it is NULLable. + // Has the same length as col_names_. 
+ vector col_nullable_; + + typedef pair entry; + map rows_; +}; + +struct MirrorTable { + explicit MirrorTable(shared_ptr client) + : client_(std::move(client)) {} + + Status Create() { + KuduSchema schema; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + CHECK_OK(b.Build(&schema)); + gscoped_ptr table_creator(client_->NewTableCreator()); + RETURN_NOT_OK(table_creator->table_name(kTableName) + .schema(&schema) + .num_replicas(3) + .Create()); + return Status::OK(); + } + + bool TryInsert(int32_t row_key, int32_t rand) { + vector > row; + ts_.GenRandomRow(row_key, rand, &row); + Status s = DoRealOp(row, INSERT); + if (s.IsAlreadyPresent()) { + CHECK(!ts_.Insert(row)) << "real table said already-present, fake table succeeded"; + return false; + } + CHECK_OK(s); + + CHECK(ts_.Insert(row)); + return true; + } + + void DeleteRandomRow(uint32_t rand) { + if (ts_.rows_.empty()) return; + int32_t row_key = ts_.GetRandomRowKey(rand); + vector > del; + del.push_back(make_pair("key", row_key)); + CHECK_OK(DoRealOp(del, DELETE)); + + ts_.Delete(row_key); + } + + void UpdateRandomRow(uint32_t rand) { + if (ts_.rows_.empty()) return; + int32_t row_key = ts_.GetRandomRowKey(rand); + + vector > update; + update.push_back(make_pair("key", row_key)); + for (int i = 1; i < num_columns(); i++) { + int32_t val = rand * i; + if (val == RowState::kNullValue) val++; + if (ts_.col_nullable_[i] && val % 2 == 1) { + val = RowState::kNullValue; + } + update.push_back(make_pair(ts_.col_names_[i], val)); + } + + if (update.size() == 1) { + // No columns got updated. Just ignore this update. 
+ return; + } + + Status s = DoRealOp(update, UPDATE); + if (s.IsNotFound()) { + CHECK(!ts_.Update(update)) << "real table said not-found, fake table succeeded"; + return; + } + CHECK_OK(s); + + CHECK(ts_.Update(update)); + } + + void AddAColumn(const string& name) { + int32_t default_value = rand(); + bool nullable = rand() % 2 == 1; + + // Add to the real table. + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + + if (nullable) { + default_value = RowState::kNullValue; + table_alterer->AddColumn(name)->Type(KuduColumnSchema::INT32); + } else { + table_alterer->AddColumn(name)->Type(KuduColumnSchema::INT32)->NotNull() + ->Default(KuduValue::FromInt(default_value)); + } + ASSERT_OK(table_alterer->Alter()); + + // Add to the mirror state. + ts_.AddColumnWithDefault(name, default_value, nullable); + } + + void DropAColumn(const string& name) { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + CHECK_OK(table_alterer->DropColumn(name)->Alter()); + ts_.DropColumn(name); + } + + void DropRandomColumn(int seed) { + if (num_columns() == 1) return; + + string name = ts_.col_names_[1 + (seed % (num_columns() - 1))]; + DropAColumn(name); + } + + int num_columns() const { + return ts_.col_names_.size(); + } + + void Verify() { + // First scan the real table + vector rows; + { + shared_ptr table; + CHECK_OK(client_->OpenTable(kTableName, &table)); + client::ScanTableToStrings(table.get(), &rows); + } + std::sort(rows.begin(), rows.end()); + + // Then get our mock table. + vector expected; + ts_.ToStrings(&expected); + + // They should look the same. 
+ ASSERT_EQ(rows, expected); + } + + private: + enum OpType { + INSERT, UPDATE, DELETE + }; + + Status DoRealOp(const vector >& data, + OpType op_type) { + shared_ptr session = client_->NewSession(); + shared_ptr table; + RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(15 * 1000); + RETURN_NOT_OK(client_->OpenTable(kTableName, &table)); + gscoped_ptr op; + switch (op_type) { + case INSERT: op.reset(table->NewInsert()); break; + case UPDATE: op.reset(table->NewUpdate()); break; + case DELETE: op.reset(table->NewDelete()); break; + } + for (const auto& d : data) { + if (d.second == RowState::kNullValue) { + CHECK_OK(op->mutable_row()->SetNull(d.first)); + } else { + CHECK_OK(op->mutable_row()->SetInt32(d.first, d.second)); + } + } + RETURN_NOT_OK(session->Apply(op.release())); + Status s = session->Flush(); + if (s.ok()) { + return s; + } + + std::vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + CHECK_EQ(errors.size(), 1); + return errors[0]->status(); + } + + shared_ptr client_; + TableState ts_; +}; + +// Stress test for various alter table scenarios. This performs a random sequence of: +// - insert a row (using the latest schema) +// - delete a random row +// - update a row (all columns with the latest schema) +// - add a new column +// - drop a column +// - restart the tablet server +// +// During the sequence of operations, a "mirror" of the table in memory is kept up to +// date. We periodically scan the actual table, and ensure that the data in Kudu +// matches our in-memory "mirror". +TEST_F(AlterTableRandomized, TestRandomSequence) { + MirrorTable t(client_); + ASSERT_OK(t.Create()); + + Random rng(SeedRandom()); + + const int n_iters = AllowSlowTests() ? 2000 : 1000; + for (int i = 0; i < n_iters; i++) { + // Perform different operations with varying probability. 
+ // We mostly insert and update, with occasional deletes, + // and more occasional table alterations or restarts. + int r = rng.Uniform(1000); + if (r < 400) { + t.TryInsert(1000000 + rng.Uniform(1000000), rng.Next()); + } else if (r < 600) { + t.UpdateRandomRow(rng.Next()); + } else if (r < 920) { + t.DeleteRandomRow(rng.Next()); + } else if (r < 970) { + if (t.num_columns() < kMaxColumns) { + t.AddAColumn(strings::Substitute("c$0", i)); + } + } else if (r < 995) { + t.DropRandomColumn(rng.Next()); + } else { + RestartTabletServer(rng.Uniform(cluster_->num_tablet_servers())); + } + + if (i % 1000 == 0) { + NO_FATALS(t.Verify()); + } + } + + NO_FATALS(t.Verify()); + + // Not only should the data returned by a scanner match what we expect, + // we also expect all of the replicas to agree with each other. + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/alter_table-test.cc b/src/kudu/integration-tests/alter_table-test.cc new file mode 100644 index 000000000000..33aef927b9ec --- /dev/null +++ b/src/kudu/integration-tests/alter_table-test.cc @@ -0,0 +1,1006 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/client-test-util.h" +#include "kudu/client/row_result.h" +#include "kudu/client/schema.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master-test-util.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/atomic.h" +#include "kudu/util/faststring.h" +#include "kudu/util/random.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +DECLARE_bool(enable_data_block_fsync); +DECLARE_bool(enable_maintenance_manager); +DECLARE_int32(heartbeat_interval_ms); +DECLARE_int32(flush_threshold_mb); +DECLARE_bool(use_hybrid_clock); + +namespace kudu { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduError; +using client::KuduInsert; +using client::KuduRowResult; +using client::KuduScanner; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduSession; +using client::KuduTable; +using client::KuduTableAlterer; +using client::KuduTableCreator; +using client::KuduUpdate; +using client::KuduValue; +using client::sp::shared_ptr; +using master::AlterTableRequestPB; +using master::AlterTableResponsePB; +using master::MiniMaster; +using std::map; +using std::pair; +using std::vector; +using tablet::TabletPeer; +using tserver::MiniTabletServer; + +class AlterTableTest : public KuduTest { + public: + AlterTableTest() + : stop_threads_(false), + inserted_idx_(0) { + + KuduSchemaBuilder b; + 
b.AddColumn("c0")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("c1")->Type(KuduColumnSchema::INT32)->NotNull(); + CHECK_OK(b.Build(&schema_)); + + FLAGS_enable_data_block_fsync = false; // Keep unit tests fast. + FLAGS_use_hybrid_clock = false; + ANNOTATE_BENIGN_RACE(&FLAGS_flush_threshold_mb, + "safe to change at runtime"); + ANNOTATE_BENIGN_RACE(&FLAGS_enable_maintenance_manager, + "safe to change at runtime"); + } + + virtual void SetUp() OVERRIDE { + // Make heartbeats faster to speed test runtime. + FLAGS_heartbeat_interval_ms = 10; + + KuduTest::SetUp(); + + MiniClusterOptions opts; + opts.num_tablet_servers = num_replicas(); + cluster_.reset(new MiniCluster(env_.get(), opts)); + ASSERT_OK(cluster_->Start()); + ASSERT_OK(cluster_->WaitForTabletServerCount(num_replicas())); + + CHECK_OK(KuduClientBuilder() + .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr_str()) + .default_admin_operation_timeout(MonoDelta::FromSeconds(60)) + .Build(&client_)); + + // Add a table, make sure it reports itself. + gscoped_ptr table_creator(client_->NewTableCreator()); + CHECK_OK(table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(num_replicas()) + .Create()); + + if (num_replicas() == 1) { + tablet_peer_ = LookupTabletPeer(); + } + LOG(INFO) << "Tablet successfully located"; + } + + virtual void TearDown() OVERRIDE { + tablet_peer_.reset(); + cluster_->Shutdown(); + } + + scoped_refptr LookupTabletPeer() { + vector > peers; + cluster_->mini_tablet_server(0)->server()->tablet_manager()->GetTabletPeers(&peers); + CHECK_EQ(1, peers.size()); + return peers[0]; + } + + void ShutdownTS() { + // Drop the tablet_peer_ reference since the tablet peer becomes invalid once + // we shut down the server. Additionally, if we hold onto the reference, + // we'll end up calling the destructor from the test code instead of the + // normal location, which can cause crashes, etc. 
+ tablet_peer_.reset(); + if (cluster_->mini_tablet_server(0)->server() != nullptr) { + cluster_->mini_tablet_server(0)->Shutdown(); + } + } + + void RestartTabletServer(int idx = 0) { + tablet_peer_.reset(); + if (cluster_->mini_tablet_server(idx)->server()) { + ASSERT_OK(cluster_->mini_tablet_server(idx)->Restart()); + } else { + ASSERT_OK(cluster_->mini_tablet_server(idx)->Start()); + } + + ASSERT_OK(cluster_->mini_tablet_server(idx)->WaitStarted()); + if (idx == 0) { + tablet_peer_ = LookupTabletPeer(); + } + } + + Status WaitAlterTableCompletion(const std::string& table_name, int attempts) { + int wait_time = 1000; + for (int i = 0; i < attempts; ++i) { + bool in_progress; + RETURN_NOT_OK(client_->IsAlterTableInProgress(table_name, &in_progress)); + if (!in_progress) { + return Status::OK(); + } + + SleepFor(MonoDelta::FromMicroseconds(wait_time)); + wait_time = std::min(wait_time * 5 / 4, 1000000); + } + + return Status::TimedOut("AlterTable not completed within the timeout"); + } + + Status AddNewI32Column(const string& table_name, + const string& column_name, + int32_t default_value) { + return AddNewI32Column(table_name, column_name, default_value, + MonoDelta::FromSeconds(60)); + } + + Status AddNewI32Column(const string& table_name, + const string& column_name, + int32_t default_value, + const MonoDelta& timeout) { + gscoped_ptr table_alterer(client_->NewTableAlterer(table_name)); + table_alterer->AddColumn(column_name)->Type(KuduColumnSchema::INT32)-> + NotNull()->Default(KuduValue::FromInt(default_value)); + return table_alterer->timeout(timeout)->Alter(); + } + + enum VerifyPattern { + C1_MATCHES_INDEX, + C1_IS_DEADBEEF, + C1_DOESNT_EXIST + }; + + void VerifyRows(int start_row, int num_rows, VerifyPattern pattern); + + void InsertRows(int start_row, int num_rows); + + void UpdateRow(int32_t row_key, const map& updates); + + void ScanToStrings(vector* rows); + + void InserterThread(); + void UpdaterThread(); + void ScannerThread(); + + Status 
CreateSplitTable(const string& table_name) { + vector split_rows; + for (int32_t i = 1; i < 10; i++) { + KuduPartialRow* row = schema_.NewRow(); + CHECK_OK(row->SetInt32(0, i * 100)); + split_rows.push_back(row); + } + gscoped_ptr table_creator(client_->NewTableCreator()); + return table_creator->table_name(table_name) + .schema(&schema_) + .num_replicas(num_replicas()) + .split_rows(split_rows) + .Create(); + } + + protected: + virtual int num_replicas() const { return 1; } + + static const char *kTableName; + + gscoped_ptr cluster_; + shared_ptr client_; + + KuduSchema schema_; + + scoped_refptr tablet_peer_; + + AtomicBool stop_threads_; + + // The index of the last row inserted by InserterThread. + // UpdaterThread uses this to figure out which rows can be + // safely updated. + AtomicInt inserted_idx_; +}; + +// Subclass which creates three servers and a replicated cluster. +class ReplicatedAlterTableTest : public AlterTableTest { + protected: + virtual int num_replicas() const OVERRIDE { return 3; } +}; + +const char *AlterTableTest::kTableName = "fake-table"; + +// Simple test to verify that the "alter table" command sent and executed +// on the TS handling the tablet of the altered table. +// TODO: create and verify multiple tablets when the client will support that. 
+TEST_F(AlterTableTest, TestTabletReports) { + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); + ASSERT_OK(AddNewI32Column(kTableName, "new-i32", 0)); + ASSERT_EQ(1, tablet_peer_->tablet()->metadata()->schema_version()); +} + +// Verify that adding an existing column will return an "already present" error +TEST_F(AlterTableTest, TestAddExistingColumn) { + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); + + { + Status s = AddNewI32Column(kTableName, "c1", 0); + ASSERT_TRUE(s.IsAlreadyPresent()); + ASSERT_STR_CONTAINS(s.ToString(), "The column already exists: c1"); + } + + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); +} + +// Verify that adding a NOT NULL column without defaults will return an error. +// +// This doesn't use the KuduClient because it's trying to make an invalid request. +// Our APIs for the client are designed such that it's impossible to send such +// a request. +TEST_F(AlterTableTest, TestAddNotNullableColumnWithoutDefaults) { + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); + + { + AlterTableRequestPB req; + req.mutable_table()->set_table_name(kTableName); + + AlterTableRequestPB::Step *step = req.add_alter_schema_steps(); + step->set_type(AlterTableRequestPB::ADD_COLUMN); + ColumnSchemaToPB(ColumnSchema("c2", INT32), + step->mutable_add_column()->mutable_schema()); + AlterTableResponsePB resp; + Status s = cluster_->mini_master()->master()->catalog_manager()->AlterTable( + &req, &resp, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "column `c2`: NOT NULL columns must have a default"); + } + + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); +} + +// Adding a nullable column with no default value should be equivalent +// to a NULL default. 
+TEST_F(AlterTableTest, TestAddNullableColumnWithoutDefault) { + InsertRows(0, 1); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + table_alterer->AddColumn("new")->Type(KuduColumnSchema::INT32); + ASSERT_OK(table_alterer->Alter()); + } + + InsertRows(1, 1); + + vector rows; + ScanToStrings(&rows); + ASSERT_EQ(2, rows.size()); + EXPECT_EQ("(int32 c0=0, int32 c1=0, int32 new=NULL)", rows[0]); + EXPECT_EQ("(int32 c0=16777216, int32 c1=1, int32 new=NULL)", rows[1]); +} + +// Verify that, if a tablet server is down when an alter command is issued, +// it will eventually receive the command when it restarts. +TEST_F(AlterTableTest, TestAlterOnTSRestart) { + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); + + ShutdownTS(); + + // Send the Alter request + { + Status s = AddNewI32Column(kTableName, "new-32", 10, + MonoDelta::FromMilliseconds(500)); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Verify that the Schema is the old one + KuduSchema schema; + bool alter_in_progress = false; + ASSERT_OK(client_->GetTableSchema(kTableName, &schema)); + ASSERT_TRUE(schema_.Equals(schema)); + ASSERT_OK(client_->IsAlterTableInProgress(kTableName, &alter_in_progress)) + ASSERT_TRUE(alter_in_progress); + + // Restart the TS and wait for the new schema + RestartTabletServer(); + ASSERT_OK(WaitAlterTableCompletion(kTableName, 50)); + ASSERT_EQ(1, tablet_peer_->tablet()->metadata()->schema_version()); +} + +// Verify that nothing is left behind on cluster shutdown with pending async tasks +TEST_F(AlterTableTest, TestShutdownWithPendingTasks) { + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); + + ShutdownTS(); + + // Send the Alter request + { + Status s = AddNewI32Column(kTableName, "new-i32", 10, + MonoDelta::FromMilliseconds(500)); + ASSERT_TRUE(s.IsTimedOut()); + } +} + +// Verify that the new schema is applied/reported even when +// the TS is going down with the alter operation 
in progress. +// On TS restart the master should: +// - get the new schema state, and mark the alter as complete +// - get the old schema state, and ask the TS again to perform the alter. +TEST_F(AlterTableTest, TestRestartTSDuringAlter) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping slow test"; + return; + } + + ASSERT_EQ(0, tablet_peer_->tablet()->metadata()->schema_version()); + + Status s = AddNewI32Column(kTableName, "new-i32", 10, + MonoDelta::FromMilliseconds(1)); + ASSERT_TRUE(s.IsTimedOut()); + + // Restart the TS while alter is running + for (int i = 0; i < 3; i++) { + SleepFor(MonoDelta::FromMicroseconds(500)); + RestartTabletServer(); + } + + // Wait for the new schema + ASSERT_OK(WaitAlterTableCompletion(kTableName, 50)); + ASSERT_EQ(1, tablet_peer_->tablet()->metadata()->schema_version()); +} + +TEST_F(AlterTableTest, TestGetSchemaAfterAlterTable) { + ASSERT_OK(AddNewI32Column(kTableName, "new-i32", 10)); + + KuduSchema s; + ASSERT_OK(client_->GetTableSchema(kTableName, &s)); +} + +void AlterTableTest::InsertRows(int start_row, int num_rows) { + shared_ptr session = client_->NewSession(); + shared_ptr table; + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(15 * 1000); + CHECK_OK(client_->OpenTable(kTableName, &table)); + + // Insert a bunch of rows with the current schema + for (int i = start_row; i < start_row + num_rows; i++) { + gscoped_ptr insert(table->NewInsert()); + // Endian-swap the key so that we spew inserts randomly + // instead of just a sequential write pattern. This way + // compactions may actually be triggered. 
+ int32_t key = bswap_32(i); + CHECK_OK(insert->mutable_row()->SetInt32(0, key)); + + if (table->schema().num_columns() > 1) { + CHECK_OK(insert->mutable_row()->SetInt32(1, i)); + } + + CHECK_OK(session->Apply(insert.release())); + + if (i % 50 == 0) { + FlushSessionOrDie(session); + } + } + + FlushSessionOrDie(session); +} + +void AlterTableTest::UpdateRow(int32_t row_key, + const map& updates) { + shared_ptr session = client_->NewSession(); + shared_ptr table; + CHECK_OK(client_->OpenTable(kTableName, &table)); + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(15 * 1000); + gscoped_ptr update(table->NewUpdate()); + int32_t key = bswap_32(row_key); // endian swap to match 'InsertRows' + CHECK_OK(update->mutable_row()->SetInt32(0, key)); + typedef map::value_type entry; + for (const entry& e : updates) { + CHECK_OK(update->mutable_row()->SetInt32(e.first, e.second)); + } + CHECK_OK(session->Apply(update.release())); + FlushSessionOrDie(session); +} + +void AlterTableTest::ScanToStrings(vector* rows) { + shared_ptr table; + CHECK_OK(client_->OpenTable(kTableName, &table)); + ScanTableToStrings(table.get(), rows); + std::sort(rows->begin(), rows->end()); +} + +// Verify that the 'num_rows' starting with 'start_row' fit the given pattern. 
+// Note that the 'start_row' here is not a row key, but the pre-transformation row +// key (InsertRows swaps endianness so that we random-write instead of sequential-write) +void AlterTableTest::VerifyRows(int start_row, int num_rows, VerifyPattern pattern) { + shared_ptr table; + CHECK_OK(client_->OpenTable(kTableName, &table)); + KuduScanner scanner(table.get()); + CHECK_OK(scanner.SetSelection(KuduClient::LEADER_ONLY)); + CHECK_OK(scanner.Open()); + + int verified = 0; + vector results; + while (scanner.HasMoreRows()) { + CHECK_OK(scanner.NextBatch(&results)); + + for (const KuduRowResult& row : results) { + int32_t key = 0; + CHECK_OK(row.GetInt32(0, &key)); + int32_t row_idx = bswap_32(key); + if (row_idx < start_row || row_idx >= start_row + num_rows) { + // Outside the range we're verifying + continue; + } + verified++; + + if (pattern == C1_DOESNT_EXIST) { + continue; + } + + int32_t c1 = 0; + CHECK_OK(row.GetInt32(1, &c1)); + + switch (pattern) { + case C1_MATCHES_INDEX: + CHECK_EQ(row_idx, c1); + break; + case C1_IS_DEADBEEF: + CHECK_EQ(0xdeadbeef, c1); + break; + default: + LOG(FATAL); + } + } + } + CHECK_EQ(verified, num_rows); +} + +// Test inserting/updating some data, dropping a column, and adding a new one +// with the same name. Data should not "reappear" from the old column. +// +// This is a regression test for KUDU-461. +TEST_F(AlterTableTest, TestDropAndAddNewColumn) { + // Reduce flush threshold so that we get both on-disk data + // for the alter as well as in-MRS data. + // This also increases chances of a race. + FLAGS_flush_threshold_mb = 3; + + const int kNumRows = AllowSlowTests() ? 
100000 : 1000; + InsertRows(0, kNumRows); + + LOG(INFO) << "Verifying initial pattern"; + VerifyRows(0, kNumRows, C1_MATCHES_INDEX); + + LOG(INFO) << "Dropping and adding back c1"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer->DropColumn("c1") + ->Alter()); + + ASSERT_OK(AddNewI32Column(kTableName, "c1", 0xdeadbeef)); + + LOG(INFO) << "Verifying that the new default shows up"; + VerifyRows(0, kNumRows, C1_IS_DEADBEEF); +} + +// Tests that a renamed table can still be altered. This is a regression test, we used to not carry +// over column ids after a table rename. +TEST_F(AlterTableTest, TestRenameTableAndAdd) { + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + string new_name = "someothername"; + ASSERT_OK(table_alterer->RenameTo(new_name) + ->Alter()); + + ASSERT_OK(AddNewI32Column(new_name, "new", 0xdeadbeef)); +} + +// Test restarting a tablet server several times after various +// schema changes. +// This is a regression test for KUDU-462. +TEST_F(AlterTableTest, TestBootstrapAfterAlters) { + vector rows; + + ASSERT_OK(AddNewI32Column(kTableName, "c2", 12345)); + InsertRows(0, 1); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + InsertRows(1, 1); + + UpdateRow(0, { {"c1", 10001} }); + UpdateRow(1, { {"c1", 10002} }); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c1=10001, int32 c2=12345)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c1=10002, int32 c2=12345)", rows[1]); + + LOG(INFO) << "Dropping c1"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer->DropColumn("c1")->Alter()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c2=12345)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c2=12345)", rows[1]); + + // Test that restart doesn't fail when trying to replay updates or inserts + // with the dropped column. 
+ ASSERT_NO_FATAL_FAILURE(RestartTabletServer()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c2=12345)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c2=12345)", rows[1]); + + // Add back a column called 'c2', but should not materialize old data. + ASSERT_OK(AddNewI32Column(kTableName, "c1", 20000)); + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c2=12345, int32 c1=20000)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c2=12345, int32 c1=20000)", rows[1]); + + ASSERT_NO_FATAL_FAILURE(RestartTabletServer()); + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c2=12345, int32 c1=20000)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c2=12345, int32 c1=20000)", rows[1]); +} + +TEST_F(AlterTableTest, TestCompactAfterUpdatingRemovedColumn) { + // Disable maintenance manager, since we manually flush/compact + // in this test. + FLAGS_enable_maintenance_manager = false; + + vector rows; + + ASSERT_OK(AddNewI32Column(kTableName, "c2", 12345)); + InsertRows(0, 1); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + InsertRows(1, 1); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c1=0, int32 c2=12345)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c1=1, int32 c2=12345)", rows[1]); + + // Add a delta for c1. + UpdateRow(0, { {"c1", 54321} }); + + // Drop c1. 
+ LOG(INFO) << "Dropping c1"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer->DropColumn("c1")->Alter()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c2=12345)", rows[0]); + + // Compact + ASSERT_OK(tablet_peer_->tablet()->Compact(tablet::Tablet::FORCE_COMPACT_ALL)); +} + +// Test which major-compacts a column for which there are updates in +// a delta file, but where the column has been removed. +TEST_F(AlterTableTest, TestMajorCompactDeltasAfterUpdatingRemovedColumn) { + // Disable maintenance manager, since we manually flush/compact + // in this test. + FLAGS_enable_maintenance_manager = false; + + vector rows; + + ASSERT_OK(AddNewI32Column(kTableName, "c2", 12345)); + InsertRows(0, 1); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c1=0, int32 c2=12345)", rows[0]); + + // Add a delta for c1. + UpdateRow(0, { {"c1", 54321} }); + + // Make sure the delta is in a delta-file. + ASSERT_OK(tablet_peer_->tablet()->FlushBiggestDMS()); + + // Drop c1. + LOG(INFO) << "Dropping c1"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer->DropColumn("c1") ->Alter()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c2=12345)", rows[0]); + + // Major Compact Deltas + ASSERT_OK(tablet_peer_->tablet()->CompactWorstDeltas( + tablet::RowSet::MAJOR_DELTA_COMPACTION)); + + // Check via debug dump that the data was properly compacted, including deltas. + // We expect to see neither deltas nor base data for the deleted column. 
+ rows.clear(); + tablet_peer_->tablet()->DebugDump(&rows); + ASSERT_EQ("Dumping tablet:\n" + "---------------------------\n" + "MRS memrowset:\n" + "RowSet RowSet(1):\n" + "(int32 c0=0, int32 c2=12345) Undos: [@2(DELETE)] Redos: []", + JoinStrings(rows, "\n")); + +} + +// Test which major-compacts a column for which we have updates +// in a DeltaFile, but for which we didn't originally flush any +// CFile in the base data (because the the RowSet was flushed +// prior to the addition of the column). +TEST_F(AlterTableTest, TestMajorCompactDeltasIntoMissingBaseData) { + // Disable maintenance manager, since we manually flush/compact + // in this test. + FLAGS_enable_maintenance_manager = false; + + vector rows; + + InsertRows(0, 2); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + // Add the new column after the Flush, so it has no base data. + ASSERT_OK(AddNewI32Column(kTableName, "c2", 12345)); + + // Add a delta for c2. + UpdateRow(0, { {"c2", 54321} }); + + // Make sure the delta is in a delta-file. + ASSERT_OK(tablet_peer_->tablet()->FlushBiggestDMS()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(2, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c1=0, int32 c2=54321)", rows[0]); + ASSERT_EQ("(int32 c0=16777216, int32 c1=1, int32 c2=12345)", rows[1]); + + // Major Compact Deltas + ASSERT_OK(tablet_peer_->tablet()->CompactWorstDeltas( + tablet::RowSet::MAJOR_DELTA_COMPACTION)); + + // Check via debug dump that the data was properly compacted, including deltas. + // We expect to see the updated value materialized into the base data for the first + // row, the default value materialized for the second, and a proper UNDO to undo + // the update on the first row. 
+ rows.clear(); + tablet_peer_->tablet()->DebugDump(&rows); + ASSERT_EQ("Dumping tablet:\n" + "---------------------------\n" + "MRS memrowset:\n" + "RowSet RowSet(0):\n" + "(int32 c0=0, int32 c1=0, int32 c2=54321) " + "Undos: [@4(SET c2=12345), @1(DELETE)] Redos: []\n" + "(int32 c0=16777216, int32 c1=1, int32 c2=12345) Undos: [@2(DELETE)] Redos: []", + JoinStrings(rows, "\n")); +} + +// Test which major-compacts a column for which there we have updates +// in a DeltaFile, but for which there is no corresponding CFile +// in the base data. Unlike the above test, in this case, we also drop +// the column again before running the major delta compaction. +TEST_F(AlterTableTest, TestMajorCompactDeltasAfterAddUpdateRemoveColumn) { + // Disable maintenance manager, since we manually flush/compact + // in this test. + FLAGS_enable_maintenance_manager = false; + + vector rows; + + InsertRows(0, 1); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + // Add the new column after the Flush(), so that no CFile for this + // column is present in the base data. + ASSERT_OK(AddNewI32Column(kTableName, "c2", 12345)); + + // Add a delta for c2. + UpdateRow(0, { {"c2", 54321} }); + + // Make sure the delta is in a delta-file. + ASSERT_OK(tablet_peer_->tablet()->FlushBiggestDMS()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c1=0, int32 c2=54321)", rows[0]); + + // Drop c2. + LOG(INFO) << "Dropping c2"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer->DropColumn("c2")->Alter()); + + NO_FATALS(ScanToStrings(&rows)); + ASSERT_EQ(1, rows.size()); + ASSERT_EQ("(int32 c0=0, int32 c1=0)", rows[0]); + + // Major Compact Deltas + ASSERT_OK(tablet_peer_->tablet()->CompactWorstDeltas( + tablet::RowSet::MAJOR_DELTA_COMPACTION)); + + // Check via debug dump that the data was properly compacted, including deltas. + // We expect to see neither deltas nor base data for the deleted column. 
+ rows.clear(); + tablet_peer_->tablet()->DebugDump(&rows); + ASSERT_EQ("Dumping tablet:\n" + "---------------------------\n" + "MRS memrowset:\n" + "RowSet RowSet(0):\n" + "(int32 c0=0, int32 c1=0) Undos: [@1(DELETE)] Redos: []", + JoinStrings(rows, "\n")); +} + +// Thread which inserts rows into the table. +// After each batch of rows is inserted, inserted_idx_ is updated +// to communicate how much data has been written (and should now be +// updateable) +void AlterTableTest::InserterThread() { + shared_ptr session = client_->NewSession(); + shared_ptr table; + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(15 * 1000); + + CHECK_OK(client_->OpenTable(kTableName, &table)); + int32_t i = 0; + while (!stop_threads_.Load()) { + gscoped_ptr insert(table->NewInsert()); + // Endian-swap the key so that we spew inserts randomly + // instead of just a sequential write pattern. This way + // compactions may actually be triggered. + int32_t key = bswap_32(i++); + CHECK_OK(insert->mutable_row()->SetInt32(0, key)); + CHECK_OK(insert->mutable_row()->SetInt32(1, i)); + CHECK_OK(session->Apply(insert.release())); + + if (i % 50 == 0) { + FlushSessionOrDie(session); + inserted_idx_.Store(i); + } + } + + FlushSessionOrDie(session); + inserted_idx_.Store(i); +} + +// Thread which follows behind the InserterThread and generates random +// updates across the previously inserted rows. +void AlterTableTest::UpdaterThread() { + shared_ptr session = client_->NewSession(); + shared_ptr table; + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(15 * 1000); + + CHECK_OK(client_->OpenTable(kTableName, &table)); + + Random rng(1); + int32_t i = 0; + while (!stop_threads_.Load()) { + gscoped_ptr update(table->NewUpdate()); + + int32_t max = inserted_idx_.Load(); + if (max == 0) { + // Inserter hasn't inserted anything yet, so we have nothing to update. 
+ SleepFor(MonoDelta::FromMicroseconds(100)); + continue; + } + // Endian-swap the key to match the way the InserterThread generates + // keys to insert. + int32_t key = bswap_32(rng.Uniform(max)); + CHECK_OK(update->mutable_row()->SetInt32(0, key)); + CHECK_OK(update->mutable_row()->SetInt32(1, i)); + CHECK_OK(session->Apply(update.release())); + + if (i++ % 50 == 0) { + FlushSessionOrDie(session); + } + } + + FlushSessionOrDie(session); +} + +// Thread which loops reading data from the table. +// No verification is performed. +void AlterTableTest::ScannerThread() { + shared_ptr table; + CHECK_OK(client_->OpenTable(kTableName, &table)); + while (!stop_threads_.Load()) { + KuduScanner scanner(table.get()); + int inserted_at_scanner_start = inserted_idx_.Load(); + CHECK_OK(scanner.Open()); + int count = 0; + vector results; + while (scanner.HasMoreRows()) { + CHECK_OK(scanner.NextBatch(&results)); + count += results.size(); + } + + LOG(INFO) << "Scanner saw " << count << " rows"; + // We may have gotten more rows than we expected, because inserts + // kept going while we set up the scan. But, we should never get + // fewer. + CHECK_GE(count, inserted_at_scanner_start) + << "We didn't get as many rows as expected"; + } +} + +// Test altering a table while also sending a lot of writes, +// checking for races between the two. +TEST_F(AlterTableTest, TestAlterUnderWriteLoad) { + // Increase chances of a race between flush and alter. + FLAGS_flush_threshold_mb = 3; + + scoped_refptr writer; + CHECK_OK(Thread::Create("test", "inserter", + boost::bind(&AlterTableTest::InserterThread, this), + &writer)); + + scoped_refptr updater; + CHECK_OK(Thread::Create("test", "updater", + boost::bind(&AlterTableTest::UpdaterThread, this), + &updater)); + + scoped_refptr scanner; + CHECK_OK(Thread::Create("test", "scanner", + boost::bind(&AlterTableTest::ScannerThread, this), + &scanner)); + + // Add columns until we reach 10. 
+ for (int i = 2; i < 10; i++) { + if (AllowSlowTests()) { + // In slow test mode, let more writes accumulate in between + // alters, so that we get enough writes to cause flushes, + // compactions, etc. + SleepFor(MonoDelta::FromSeconds(3)); + } + + ASSERT_OK(AddNewI32Column(kTableName, + strings::Substitute("c$0", i), + i)); + } + + stop_threads_.Store(true); + writer->Join(); + updater->Join(); + scanner->Join(); +} + +TEST_F(AlterTableTest, TestInsertAfterAlterTable) { + const char *kSplitTableName = "split-table"; + + // Create a new table with 10 tablets. + // + // With more tablets, there's a greater chance that the TS will heartbeat + // after some but not all tablets have finished altering. + ASSERT_OK(CreateSplitTable(kSplitTableName)); + + // Add a column, and immediately try to insert a row including that + // new column. + ASSERT_OK(AddNewI32Column(kSplitTableName, "new-i32", 10)); + shared_ptr table; + ASSERT_OK(client_->OpenTable(kSplitTableName, &table)); + gscoped_ptr insert(table->NewInsert()); + ASSERT_OK(insert->mutable_row()->SetInt32("c0", 1)); + ASSERT_OK(insert->mutable_row()->SetInt32("c1", 1)); + ASSERT_OK(insert->mutable_row()->SetInt32("new-i32", 1)); + shared_ptr session = client_->NewSession(); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + session->SetTimeoutMillis(15000); + ASSERT_OK(session->Apply(insert.release())); + Status s = session->Flush(); + if (!s.ok()) { + ASSERT_EQ(1, session->CountPendingErrors()); + vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + ASSERT_FALSE(overflow); + ASSERT_EQ(1, errors.size()); + ASSERT_OK(errors[0]->status()); // will fail + } +} + +// Issue a bunch of alter tables in quick succession. Regression for a bug +// seen in an earlier implementation of "alter table" where these could +// conflict with each other. 
+TEST_F(AlterTableTest, TestMultipleAlters) { + const char *kSplitTableName = "split-table"; + const size_t kNumNewCols = 10; + const int32_t kDefaultValue = 10; + + // Create a new table with 10 tablets. + // + // With more tablets, there's a greater chance that the TS will heartbeat + // after some but not all tablets have finished altering. + ASSERT_OK(CreateSplitTable(kSplitTableName)); + + // Issue a bunch of new alters without waiting for them to finish. + for (int i = 0; i < kNumNewCols; i++) { + gscoped_ptr table_alterer(client_->NewTableAlterer(kSplitTableName)); + table_alterer->AddColumn(strings::Substitute("new_col$0", i)) + ->Type(KuduColumnSchema::INT32)->NotNull() + ->Default(KuduValue::FromInt(kDefaultValue)); + ASSERT_OK(table_alterer->wait(false)->Alter()); + } + + // Now wait. This should block on all of them. + WaitAlterTableCompletion(kSplitTableName, 50); + + // All new columns should be present. + KuduSchema new_schema; + ASSERT_OK(client_->GetTableSchema(kSplitTableName, &new_schema)); + ASSERT_EQ(kNumNewCols + schema_.num_columns(), new_schema.num_columns()); +} + +TEST_F(ReplicatedAlterTableTest, TestReplicatedAlter) { + const int kNumRows = 100; + InsertRows(0, kNumRows); + + LOG(INFO) << "Verifying initial pattern"; + VerifyRows(0, kNumRows, C1_MATCHES_INDEX); + + LOG(INFO) << "Dropping and adding back c1"; + gscoped_ptr table_alterer(client_->NewTableAlterer(kTableName)); + ASSERT_OK(table_alterer->DropColumn("c1")->Alter()); + + ASSERT_OK(AddNewI32Column(kTableName, "c1", 0xdeadbeef)); + + bool alter_in_progress; + ASSERT_OK(client_->IsAlterTableInProgress(kTableName, &alter_in_progress)) + ASSERT_FALSE(alter_in_progress); + + LOG(INFO) << "Verifying that the new default shows up"; + VerifyRows(0, kNumRows, C1_IS_DEADBEEF); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/client-stress-test.cc b/src/kudu/integration-tests/client-stress-test.cc new file mode 100644 index 000000000000..9be5f7168297 --- /dev/null +++ 
b/src/kudu/integration-tests/client-stress-test.cc @@ -0,0 +1,282 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/util/metrics.h" +#include "kudu/util/pstack_watcher.h" +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +METRIC_DECLARE_entity(tablet); +METRIC_DECLARE_counter(leader_memory_pressure_rejections); +METRIC_DECLARE_counter(follower_memory_pressure_rejections); + +using strings::Substitute; +using std::vector; + +namespace kudu { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduScanner; +using client::KuduTable; + +class ClientStressTest : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + ExternalMiniClusterOptions opts = default_opts(); + if (multi_master()) { + opts.num_masters = 3; + opts.master_rpc_ports = { 11010, 11011, 11012 }; + } + opts.num_tablet_servers = 3; + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + 
} + + virtual void TearDown() OVERRIDE { + alarm(0); + cluster_->Shutdown(); + KuduTest::TearDown(); + } + + protected: + void ScannerThread(KuduClient* client, const CountDownLatch* go_latch, int32_t start_key) { + client::sp::shared_ptr table; + CHECK_OK(client->OpenTable(TestWorkload::kDefaultTableName, &table)); + vector rows; + + go_latch->Wait(); + + KuduScanner scanner(table.get()); + CHECK_OK(scanner.AddConjunctPredicate(table->NewComparisonPredicate( + "key", client::KuduPredicate::GREATER_EQUAL, + client::KuduValue::FromInt(start_key)))); + ScanToStrings(&scanner, &rows); + } + + virtual bool multi_master() const { + return false; + } + + virtual ExternalMiniClusterOptions default_opts() const { + return ExternalMiniClusterOptions(); + } + + gscoped_ptr cluster_; +}; + +// Stress test a case where most of the operations are expected to time out. +// This is a regression test for various bugs we've seen in timeout handling, +// especially with concurrent requests. +TEST_F(ClientStressTest, TestLookupTimeouts) { + const int kSleepMillis = AllowSlowTests() ? 5000 : 100; + + TestWorkload work(cluster_.get()); + work.set_num_write_threads(64); + work.set_write_timeout_millis(10); + work.set_timeout_allowed(true); + work.Setup(); + work.Start(); + SleepFor(MonoDelta::FromMilliseconds(kSleepMillis)); +} + +// Regression test for KUDU-1104, a race in which multiple scanners racing to populate a +// cold meta cache on a shared Client would crash. +// +// This test creates a table with a lot of tablets (so that we require many round-trips to +// the master to populate the meta cache) and then starts a bunch of parallel threads which +// scan starting at random points in the key space. 
+TEST_F(ClientStressTest, TestStartScans) { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + cluster_->SetFlag(cluster_->tablet_server(i), "log_preallocate_segments", "0"); + } + TestWorkload work(cluster_.get()); + work.set_num_tablets(40); + work.set_num_replicas(1); + work.Setup(); + + // We run the guts of the test several times -- it takes a while to build + // the 40 tablets above, but the actual scans are very fast since the table + // is empty. + for (int run = 1; run <= (AllowSlowTests() ? 10 : 2); run++) { + LOG(INFO) << "Starting run " << run; + KuduClientBuilder builder; + client::sp::shared_ptr client; + CHECK_OK(cluster_->CreateClient(builder, &client)); + + CountDownLatch go_latch(1); + vector > threads; + const int kNumThreads = 60; + Random rng(run); + for (int i = 0; i < kNumThreads; i++) { + int32_t start_key = rng.Next32(); + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create( + "test", strings::Substitute("test-scanner-$0", i), + &ClientStressTest_TestStartScans_Test::ScannerThread, this, + client.get(), &go_latch, start_key, + &new_thread)); + threads.push_back(new_thread); + } + SleepFor(MonoDelta::FromMilliseconds(50)); + + go_latch.CountDown(); + + for (const scoped_refptr& thr : threads) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } + } +} + +// Override the base test to run in multi-master mode. +class ClientStressTest_MultiMaster : public ClientStressTest { + protected: + virtual bool multi_master() const OVERRIDE { + return true; + } +}; + +// Stress test a case where most of the operations are expected to time out. +// This is a regression test for KUDU-614 - it would cause a deadlock prior +// to fixing that bug. +TEST_F(ClientStressTest_MultiMaster, TestLeaderResolutionTimeout) { + TestWorkload work(cluster_.get()); + work.set_num_write_threads(64); + + // This timeout gets applied to the master requests. It's lower than the + // amount of time that we sleep the masters, to ensure they timeout. 
+ work.set_client_default_rpc_timeout_millis(250); + // This is the time budget for the whole request. It has to be longer than + // the above timeout so that the client actually attempts to resolve + // the leader. + work.set_write_timeout_millis(280); + work.set_timeout_allowed(true); + work.Setup(); + + work.Start(); + + cluster_->tablet_server(0)->Pause(); + cluster_->tablet_server(1)->Pause(); + cluster_->tablet_server(2)->Pause(); + cluster_->master(0)->Pause(); + cluster_->master(1)->Pause(); + cluster_->master(2)->Pause(); + SleepFor(MonoDelta::FromMilliseconds(300)); + cluster_->tablet_server(0)->Resume(); + cluster_->tablet_server(1)->Resume(); + cluster_->tablet_server(2)->Resume(); + cluster_->master(0)->Resume(); + cluster_->master(1)->Resume(); + cluster_->master(2)->Resume(); + SleepFor(MonoDelta::FromMilliseconds(100)); + + // Set an explicit timeout. This test has caused deadlocks in the past. + // Also make sure to dump stacks before the alarm goes off. + PstackWatcher watcher(MonoDelta::FromSeconds(30)); + alarm(60); +} + + +// Override the base test to start a cluster with a low memory limit. +class ClientStressTest_LowMemory : public ClientStressTest { + protected: + virtual ExternalMiniClusterOptions default_opts() const OVERRIDE { + // There's nothing scientific about this number; it must be low enough to + // trigger memory pressure request rejection yet high enough for the + // servers to make forward progress. + const int kMemLimitBytes = 64 * 1024 * 1024; + ExternalMiniClusterOptions opts; + opts.extra_tserver_flags.push_back(Substitute( + "--memory_limit_hard_bytes=$0", kMemLimitBytes)); + opts.extra_tserver_flags.push_back( + "--memory_limit_soft_percentage=0"); + return opts; + } +}; + +// Stress test where, due to absurdly low memory limits, many client requests +// are rejected, forcing the client to retry repeatedly. 
+TEST_F(ClientStressTest_LowMemory, TestMemoryThrottling) { +#ifdef THREAD_SANITIZER + // TSAN tests run much slower, so we don't want to wait for as many + // rejections before declaring the test to be passed. + const int64_t kMinRejections = 20; +#else + const int64_t kMinRejections = 100; +#endif + + const MonoDelta kMaxWaitTime = MonoDelta::FromSeconds(60); + + TestWorkload work(cluster_.get()); + work.Setup(); + work.Start(); + + // Wait until we've rejected some number of requests. + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(kMaxWaitTime); + while (true) { + int64_t total_num_rejections = 0; + + // It can take some time for the tablets (and their metric entities) to + // appear on every server. Rather than explicitly wait for that above, + // we'll just treat the lack of a metric as non-fatal. If the entity + // or metric is truly missing, we'll eventually timeout and fail. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + int64_t value; + Status s = cluster_->tablet_server(i)->GetInt64Metric( + &METRIC_ENTITY_tablet, + nullptr, + &METRIC_leader_memory_pressure_rejections, + "value", + &value); + if (!s.IsNotFound()) { + ASSERT_OK(s); + total_num_rejections += value; + } + s = cluster_->tablet_server(i)->GetInt64Metric( + &METRIC_ENTITY_tablet, + nullptr, + &METRIC_follower_memory_pressure_rejections, + "value", + &value); + if (!s.IsNotFound()) { + ASSERT_OK(s); + total_num_rejections += value; + } + } + if (total_num_rejections >= kMinRejections) { + break; + } else if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + FAIL() << "Ran for " << kMaxWaitTime.ToString() << ", deadline expired and only saw " + << total_num_rejections << " memory rejections"; + } + SleepFor(MonoDelta::FromMilliseconds(200)); + } +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/client_failover-itest.cc b/src/kudu/integration-tests/client_failover-itest.cc new file mode 100644 index 000000000000..535d39476f92 --- 
/dev/null +++ b/src/kudu/integration-tests/client_failover-itest.cc @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/map-util.h" +#include "kudu/integration-tests/external_mini_cluster-itest-base.h" +#include "kudu/integration-tests/test_workload.h" + +using kudu::client::CountTableRows; +using kudu::client::KuduTable; +using kudu::client::sp::shared_ptr; +using kudu::itest::TServerDetails; +using kudu::tablet::TABLET_DATA_TOMBSTONED; +using std::set; +using std::string; +using std::vector; +using std::unordered_map; + +namespace kudu { + +// Integration test for client failover behavior. +class ClientFailoverITest : public ExternalMiniClusterITestBase { +}; + +// Test that we can delete the leader replica while scanning it and still get +// results back. 
+TEST_F(ClientFailoverITest, TestDeleteLeaderWhileScanning) { + const MonoDelta kTimeout = MonoDelta::FromSeconds(30); + + vector ts_flags = { "--enable_leader_failure_detection=false", + "--enable_remote_bootstrap=false" }; + vector master_flags = { "--master_add_server_when_underreplicated=false", + "--catalog_manager_wait_for_new_tablets_to_elect_leader=false" }; + + // Start up with 4 tablet servers. + NO_FATALS(StartCluster(ts_flags, master_flags, 4)); + + // Create the test table. + TestWorkload workload(cluster_.get()); + workload.set_write_timeout_millis(kTimeout.ToMilliseconds()); + workload.Setup(); + + // Figure out the tablet id. + ASSERT_OK(inspect_->WaitForReplicaCount(3)); + vector tablets = inspect_->ListTablets(); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + // Record the locations of the tablet replicas and the one TS that doesn't have a replica. + int missing_replica_index = -1; + set replica_indexes; + unordered_map active_ts_map; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + if (inspect_->ListTabletsOnTS(i).empty()) { + missing_replica_index = i; + } else { + replica_indexes.insert(i); + TServerDetails* ts = ts_map_[cluster_->tablet_server(i)->uuid()]; + active_ts_map[ts->uuid()] = ts; + ASSERT_OK(WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], tablet_id, + kTimeout)); + } + } + int leader_index = *replica_indexes.begin(); + TServerDetails* leader = ts_map_[cluster_->tablet_server(leader_index)->uuid()]; + ASSERT_OK(itest::StartElection(leader, tablet_id, kTimeout)); + + // Write data to a tablet. + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + + // We don't want the leader that takes over after we kill the first leader to + // be unsure whether the writes have been committed, so wait until all + // replicas have all of the writes. 
+ ASSERT_OK(WaitForServersToAgree(kTimeout, active_ts_map, tablet_id, + workload.batches_completed() + 1)); + + // Open the scanner and count the rows. + shared_ptr table; + ASSERT_OK(client_->OpenTable(TestWorkload::kDefaultTableName, &table)); + ASSERT_EQ(workload.rows_inserted(), CountTableRows(table.get())); + LOG(INFO) << "Number of rows: " << workload.rows_inserted(); + + // Delete the leader replica. This will cause the next scan to the same + // leader to get a TABLET_NOT_FOUND error. + ASSERT_OK(itest::DeleteTablet(leader, tablet_id, TABLET_DATA_TOMBSTONED, + boost::none, kTimeout)); + + int old_leader_index = leader_index; + TServerDetails* old_leader = leader; + leader_index = *(++replica_indexes.begin()); // Select the "next" replica as leader. + leader = ts_map_[cluster_->tablet_server(leader_index)->uuid()]; + + ASSERT_EQ(1, replica_indexes.erase(old_leader_index)); + ASSERT_EQ(1, active_ts_map.erase(old_leader->uuid())); + + // We need to elect a new leader to remove the old node. + ASSERT_OK(itest::StartElection(leader, tablet_id, kTimeout)); + ASSERT_OK(WaitUntilCommittedOpIdIndexIs(workload.batches_completed() + 2, leader, tablet_id, + kTimeout)); + + // Do a config change to remove the old replica and add a new one. + // Cause the new replica to become leader, then do the scan again. + ASSERT_OK(RemoveServer(leader, tablet_id, old_leader, boost::none, kTimeout)); + // Wait until the config is committed, otherwise AddServer() will fail. 
+ ASSERT_OK(WaitUntilCommittedConfigOpIdIndexIs(workload.batches_completed() + 3, leader, tablet_id, + kTimeout)); + + TServerDetails* to_add = ts_map_[cluster_->tablet_server(missing_replica_index)->uuid()]; + ASSERT_OK(AddServer(leader, tablet_id, to_add, consensus::RaftPeerPB::VOTER, + boost::none, kTimeout)); + HostPort hp; + ASSERT_OK(HostPortFromPB(leader->registration.rpc_addresses(0), &hp)); + ASSERT_OK(StartRemoteBootstrap(to_add, tablet_id, leader->uuid(), hp, 1, kTimeout)); + + const string& new_ts_uuid = cluster_->tablet_server(missing_replica_index)->uuid(); + InsertOrDie(&replica_indexes, missing_replica_index); + InsertOrDie(&active_ts_map, new_ts_uuid, ts_map_[new_ts_uuid]); + + // Wait for remote bootstrap to complete. Then elect the new node. + ASSERT_OK(WaitForServersToAgree(kTimeout, active_ts_map, tablet_id, + workload.batches_completed() + 4)); + leader_index = missing_replica_index; + leader = ts_map_[cluster_->tablet_server(leader_index)->uuid()]; + ASSERT_OK(itest::StartElection(leader, tablet_id, kTimeout)); + ASSERT_OK(WaitUntilCommittedOpIdIndexIs(workload.batches_completed() + 5, leader, tablet_id, + kTimeout)); + + ASSERT_EQ(workload.rows_inserted(), CountTableRows(table.get())); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/cluster_itest_util.cc b/src/kudu/integration-tests/cluster_itest_util.cc new file mode 100644 index 000000000000..f7fecf1bc472 --- /dev/null +++ b/src/kudu/integration-tests/cluster_itest_util.cc @@ -0,0 +1,803 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/integration-tests/cluster_itest_util.h" + +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.proxy.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tserver/tablet_server_test_util.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/tserver/tserver_service.pb.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/util/net/net_util.h" + +namespace kudu { +namespace itest { + +using client::KuduClient; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduTable; +using consensus::CONSENSUS_CONFIG_ACTIVE; +using consensus::CONSENSUS_CONFIG_COMMITTED; +using consensus::ChangeConfigRequestPB; +using consensus::ChangeConfigResponsePB; +using consensus::ConsensusConfigType; +using consensus::ConsensusStatePB; +using consensus::CountVoters; +using consensus::GetConsensusStateRequestPB; +using consensus::GetConsensusStateResponsePB; +using consensus::GetLastOpIdRequestPB; +using consensus::GetLastOpIdResponsePB; +using consensus::LeaderStepDownRequestPB; +using consensus::LeaderStepDownResponsePB; +using 
consensus::OpId; +using consensus::RaftPeerPB; +using consensus::RunLeaderElectionResponsePB; +using consensus::RunLeaderElectionRequestPB; +using consensus::kInvalidOpIdIndex; +using master::ListTabletServersResponsePB; +using master::MasterServiceProxy; +using master::TabletLocationsPB; +using rpc::Messenger; +using rpc::RpcController; +using std::min; +using std::shared_ptr; +using std::string; +using std::unordered_map; +using std::vector; +using strings::Substitute; +using tserver::CreateTsClientProxies; +using tserver::ListTabletsResponsePB; +using tserver::DeleteTabletRequestPB; +using tserver::DeleteTabletResponsePB; +using tserver::TabletServerAdminServiceProxy; +using tserver::TabletServerErrorPB; +using tserver::TabletServerServiceProxy; +using tserver::WriteRequestPB; +using tserver::WriteResponsePB; + +const string& TServerDetails::uuid() const { + return instance_id.permanent_uuid(); +} + +string TServerDetails::ToString() const { + return Substitute("TabletServer: $0, Rpc address: $1", + instance_id.permanent_uuid(), + registration.rpc_addresses(0).ShortDebugString()); +} + +client::KuduSchema SimpleIntKeyKuduSchema() { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(client::KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + CHECK_OK(b.Build(&s)); + return s; +} + +Status GetLastOpIdForEachReplica(const string& tablet_id, + const vector& replicas, + consensus::OpIdType opid_type, + const MonoDelta& timeout, + vector* op_ids) { + GetLastOpIdRequestPB opid_req; + GetLastOpIdResponsePB opid_resp; + opid_req.set_tablet_id(tablet_id); + RpcController controller; + + op_ids->clear(); + for (TServerDetails* ts : replicas) { + controller.Reset(); + controller.set_timeout(timeout); + opid_resp.Clear(); + opid_req.set_dest_uuid(ts->uuid()); + opid_req.set_tablet_id(tablet_id); + opid_req.set_opid_type(opid_type); + RETURN_NOT_OK_PREPEND( + ts->consensus_proxy->GetLastOpId(opid_req, &opid_resp, &controller), + Substitute("Failed to fetch 
last op id from $0", + ts->instance_id.ShortDebugString())); + op_ids->push_back(opid_resp.opid()); + } + + return Status::OK(); +} + +Status GetLastOpIdForReplica(const std::string& tablet_id, + TServerDetails* replica, + consensus::OpIdType opid_type, + const MonoDelta& timeout, + consensus::OpId* op_id) { + vector replicas; + replicas.push_back(replica); + vector op_ids; + RETURN_NOT_OK(GetLastOpIdForEachReplica(tablet_id, replicas, opid_type, timeout, &op_ids)); + CHECK_EQ(1, op_ids.size()); + *op_id = op_ids[0]; + return Status::OK(); +} + +Status WaitForServersToAgree(const MonoDelta& timeout, + const TabletServerMap& tablet_servers, + const string& tablet_id, + int64_t minimum_index) { + MonoTime now = MonoTime::Now(MonoTime::COARSE); + MonoTime deadline = now; + deadline.AddDelta(timeout); + + for (int i = 1; now.ComesBefore(deadline); i++) { + vector servers; + AppendValuesFromMap(tablet_servers, &servers); + vector ids; + Status s = GetLastOpIdForEachReplica(tablet_id, servers, consensus::RECEIVED_OPID, timeout, + &ids); + if (s.ok()) { + bool any_behind = false; + bool any_disagree = false; + int64_t cur_index = kInvalidOpIdIndex; + for (const OpId& id : ids) { + if (cur_index == kInvalidOpIdIndex) { + cur_index = id.index(); + } + if (id.index() != cur_index) { + any_disagree = true; + break; + } + if (id.index() < minimum_index) { + any_behind = true; + break; + } + } + if (!any_behind && !any_disagree) { + return Status::OK(); + } + } else { + LOG(WARNING) << "Got error getting last opid for each replica: " << s.ToString(); + } + + LOG(INFO) << "Not converged past " << minimum_index << " yet: " << ids; + SleepFor(MonoDelta::FromMilliseconds(min(i * 100, 1000))); + + now = MonoTime::Now(MonoTime::COARSE); + } + return Status::TimedOut(Substitute("Index $0 not available on all replicas after $1. ", + minimum_index, timeout.ToString())); +} + +// Wait until all specified replicas have logged the given index. 
+Status WaitUntilAllReplicasHaveOp(const int64_t log_index, + const string& tablet_id, + const vector& replicas, + const MonoDelta& timeout) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoDelta passed = MonoDelta::FromMilliseconds(0); + while (true) { + vector op_ids; + Status s = GetLastOpIdForEachReplica(tablet_id, replicas, consensus::RECEIVED_OPID, timeout, + &op_ids); + if (s.ok()) { + bool any_behind = false; + for (const OpId& op_id : op_ids) { + if (op_id.index() < log_index) { + any_behind = true; + break; + } + } + if (!any_behind) return Status::OK(); + } else { + LOG(WARNING) << "Got error getting last opid for each replica: " << s.ToString(); + } + passed = MonoTime::Now(MonoTime::FINE).GetDeltaSince(start); + if (passed.MoreThan(timeout)) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(50)); + } + string replicas_str; + for (const TServerDetails* replica : replicas) { + if (!replicas_str.empty()) replicas_str += ", "; + replicas_str += "{ " + replica->ToString() + " }"; + } + return Status::TimedOut(Substitute("Index $0 not available on all replicas after $1. 
" + "Replicas: [ $2 ]", + log_index, passed.ToString())); +} + +Status CreateTabletServerMap(MasterServiceProxy* master_proxy, + const shared_ptr& messenger, + unordered_map* ts_map) { + master::ListTabletServersRequestPB req; + master::ListTabletServersResponsePB resp; + rpc::RpcController controller; + + RETURN_NOT_OK(master_proxy->ListTabletServers(req, &resp, &controller)); + RETURN_NOT_OK(controller.status()); + if (resp.has_error()) { + return Status::RemoteError("Response had an error", resp.error().ShortDebugString()); + } + + ts_map->clear(); + for (const ListTabletServersResponsePB::Entry& entry : resp.servers()) { + HostPort host_port; + RETURN_NOT_OK(HostPortFromPB(entry.registration().rpc_addresses(0), &host_port)); + vector addresses; + host_port.ResolveAddresses(&addresses); + + gscoped_ptr peer(new TServerDetails); + peer->instance_id.CopyFrom(entry.instance_id()); + peer->registration.CopyFrom(entry.registration()); + + CreateTsClientProxies(addresses[0], + messenger, + &peer->tserver_proxy, + &peer->tserver_admin_proxy, + &peer->consensus_proxy, + &peer->generic_proxy); + + InsertOrDie(ts_map, peer->instance_id.permanent_uuid(), peer.get()); + ignore_result(peer.release()); + } + return Status::OK(); +} + +Status GetConsensusState(const TServerDetails* replica, + const string& tablet_id, + consensus::ConsensusConfigType type, + const MonoDelta& timeout, + ConsensusStatePB* consensus_state) { + GetConsensusStateRequestPB req; + GetConsensusStateResponsePB resp; + RpcController controller; + controller.set_timeout(timeout); + req.set_dest_uuid(replica->uuid()); + req.set_tablet_id(tablet_id); + req.set_type(type); + + RETURN_NOT_OK(replica->consensus_proxy->GetConsensusState(req, &resp, &controller)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + *consensus_state = resp.cstate(); + return Status::OK(); +} + +Status WaitUntilCommittedConfigNumVotersIs(int config_size, + const TServerDetails* replica, + const 
std::string& tablet_id, + const MonoDelta& timeout) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + + int backoff_exp = 0; + const int kMaxBackoffExp = 7; + Status s; + ConsensusStatePB cstate; + while (true) { + MonoDelta remaining_timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + s = GetConsensusState(replica, tablet_id, CONSENSUS_CONFIG_COMMITTED, + remaining_timeout, &cstate); + if (s.ok()) { + if (CountVoters(cstate.config()) == config_size) { + return Status::OK(); + } + } + + if (MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).MoreThan(timeout)) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(1 << backoff_exp)); + backoff_exp = min(backoff_exp + 1, kMaxBackoffExp); + } + return Status::TimedOut(Substitute("Number of voters does not equal $0 after waiting for $1." + "Last consensus state: $2. Last status: $3", + config_size, timeout.ToString(), + cstate.ShortDebugString(), s.ToString())); +} + +Status WaitUntilCommittedConfigOpIdIndexIs(int64_t opid_index, + const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + + Status s; + ConsensusStatePB cstate; + while (true) { + MonoDelta remaining_timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + s = GetConsensusState(replica, tablet_id, CONSENSUS_CONFIG_COMMITTED, + remaining_timeout, &cstate); + if (s.ok() && cstate.config().opid_index() == opid_index) { + return Status::OK(); + } + if (MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).MoreThan(timeout)) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut(Substitute("Committed config opid_index does not equal $0 " + "after waiting for $1. " + "Last consensus state: $2. 
Last status: $3", + opid_index, + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(), + cstate.ShortDebugString(), s.ToString())); +} + +Status WaitUntilCommittedOpIdIndexIs(int64_t opid_index, + TServerDetails* replica, + const string& tablet_id, + const MonoDelta& timeout) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + + Status s; + OpId op_id; + while (true) { + MonoDelta remaining_timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + s = GetLastOpIdForReplica(tablet_id, replica, consensus::COMMITTED_OPID, remaining_timeout, + &op_id); + if (s.ok() && op_id.index() == opid_index) { + return Status::OK(); + } + if (MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).MoreThan(timeout)) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut(Substitute("Committed consensus opid_index does not equal $0 " + "after waiting for $1. Last status: $2", + opid_index, + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(), + s.ToString())); +} + +Status GetReplicaStatusAndCheckIfLeader(const TServerDetails* replica, + const string& tablet_id, + const MonoDelta& timeout) { + ConsensusStatePB cstate; + Status s = GetConsensusState(replica, tablet_id, CONSENSUS_CONFIG_ACTIVE, + timeout, &cstate); + if (PREDICT_FALSE(!s.ok())) { + VLOG(1) << "Error getting consensus state from replica: " + << replica->instance_id.permanent_uuid(); + return Status::NotFound("Error connecting to replica", s.ToString()); + } + const string& replica_uuid = replica->instance_id.permanent_uuid(); + if (cstate.has_leader_uuid() && cstate.leader_uuid() == replica_uuid) { + return Status::OK(); + } + VLOG(1) << "Replica not leader of config: " << replica->instance_id.permanent_uuid(); + return Status::IllegalState("Replica found but not leader"); +} + +Status WaitUntilLeader(const TServerDetails* replica, + const string& tablet_id, + const MonoDelta& timeout) { + MonoTime start = 
MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + + int backoff_exp = 0; + const int kMaxBackoffExp = 7; + Status s; + while (true) { + MonoDelta remaining_timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + s = GetReplicaStatusAndCheckIfLeader(replica, tablet_id, remaining_timeout); + if (s.ok()) { + return Status::OK(); + } + + if (MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).MoreThan(timeout)) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(1 << backoff_exp)); + backoff_exp = min(backoff_exp + 1, kMaxBackoffExp); + } + return Status::TimedOut(Substitute("Replica $0 is not leader after waiting for $1: $2", + replica->ToString(), timeout.ToString(), s.ToString())); +} + +Status FindTabletLeader(const TabletServerMap& tablet_servers, + const string& tablet_id, + const MonoDelta& timeout, + TServerDetails** leader) { + vector tservers; + AppendValuesFromMap(tablet_servers, &tservers); + + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + Status s; + int i = 0; + while (true) { + MonoDelta remaining_timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + s = GetReplicaStatusAndCheckIfLeader(tservers[i], tablet_id, remaining_timeout); + if (s.ok()) { + *leader = tservers[i]; + return Status::OK(); + } + + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + i = (i + 1) % tservers.size(); + if (i == 0) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + } + return Status::TimedOut(Substitute("Unable to find leader of tablet $0 after $1. 
" + "Status message: $2", tablet_id, + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(), + s.ToString())); +} + +Status StartElection(const TServerDetails* replica, + const string& tablet_id, + const MonoDelta& timeout) { + RunLeaderElectionRequestPB req; + req.set_dest_uuid(replica->uuid()); + req.set_tablet_id(tablet_id); + RunLeaderElectionResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + RETURN_NOT_OK(replica->consensus_proxy->RunLeaderElection(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()) + .CloneAndPrepend(Substitute("Code $0", TabletServerErrorPB::Code_Name(resp.error().code()))); + } + return Status::OK(); +} + +Status LeaderStepDown(const TServerDetails* replica, + const string& tablet_id, + const MonoDelta& timeout, + TabletServerErrorPB* error) { + LeaderStepDownRequestPB req; + req.set_dest_uuid(replica->uuid()); + req.set_tablet_id(tablet_id); + LeaderStepDownResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + RETURN_NOT_OK(replica->consensus_proxy->LeaderStepDown(req, &resp, &rpc)); + if (resp.has_error()) { + if (error != nullptr) { + *error = resp.error(); + } + return StatusFromPB(resp.error().status()) + .CloneAndPrepend(Substitute("Code $0", TabletServerErrorPB::Code_Name(resp.error().code()))); + } + return Status::OK(); +} + +Status WriteSimpleTestRow(const TServerDetails* replica, + const std::string& tablet_id, + RowOperationsPB::Type write_type, + int32_t key, + int32_t int_val, + const string& string_val, + const MonoDelta& timeout) { + WriteRequestPB req; + WriteResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + + req.set_tablet_id(tablet_id); + Schema schema = GetSimpleTestSchema(); + RETURN_NOT_OK(SchemaToPB(schema, req.mutable_schema())); + AddTestRowToPB(write_type, schema, key, int_val, string_val, req.mutable_row_operations()); + + RETURN_NOT_OK(replica->tserver_proxy->Write(req, &resp, &rpc)); + if (resp.has_error()) { + 
return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status AddServer(const TServerDetails* leader, + const std::string& tablet_id, + const TServerDetails* replica_to_add, + consensus::RaftPeerPB::MemberType member_type, + const boost::optional& cas_config_opid_index, + const MonoDelta& timeout, + TabletServerErrorPB::Code* error_code) { + ChangeConfigRequestPB req; + ChangeConfigResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + + req.set_dest_uuid(leader->uuid()); + req.set_tablet_id(tablet_id); + req.set_type(consensus::ADD_SERVER); + RaftPeerPB* peer = req.mutable_server(); + peer->set_permanent_uuid(replica_to_add->uuid()); + peer->set_member_type(member_type); + *peer->mutable_last_known_addr() = replica_to_add->registration.rpc_addresses(0); + if (cas_config_opid_index) { + req.set_cas_config_opid_index(*cas_config_opid_index); + } + + RETURN_NOT_OK(leader->consensus_proxy->ChangeConfig(req, &resp, &rpc)); + if (resp.has_error()) { + if (error_code) *error_code = resp.error().code(); + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status RemoveServer(const TServerDetails* leader, + const std::string& tablet_id, + const TServerDetails* replica_to_remove, + const boost::optional& cas_config_opid_index, + const MonoDelta& timeout, + TabletServerErrorPB::Code* error_code) { + ChangeConfigRequestPB req; + ChangeConfigResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + + req.set_dest_uuid(leader->uuid()); + req.set_tablet_id(tablet_id); + req.set_type(consensus::REMOVE_SERVER); + if (cas_config_opid_index) { + req.set_cas_config_opid_index(*cas_config_opid_index); + } + RaftPeerPB* peer = req.mutable_server(); + peer->set_permanent_uuid(replica_to_remove->uuid()); + + RETURN_NOT_OK(leader->consensus_proxy->ChangeConfig(req, &resp, &rpc)); + if (resp.has_error()) { + if (error_code) *error_code = resp.error().code(); + return StatusFromPB(resp.error().status()); + } + return 
Status::OK(); +} + +Status ListTablets(const TServerDetails* ts, + const MonoDelta& timeout, + vector* tablets) { + tserver::ListTabletsRequestPB req; + tserver::ListTabletsResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + + RETURN_NOT_OK(ts->tserver_proxy->ListTablets(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + tablets->assign(resp.status_and_schema().begin(), resp.status_and_schema().end()); + return Status::OK(); +} + +Status ListRunningTabletIds(const TServerDetails* ts, + const MonoDelta& timeout, + vector* tablet_ids) { + vector tablets; + RETURN_NOT_OK(ListTablets(ts, timeout, &tablets)); + tablet_ids->clear(); + for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) { + if (t.tablet_status().state() == tablet::RUNNING) { + tablet_ids->push_back(t.tablet_status().tablet_id()); + } + } + return Status::OK(); +} + +Status GetTabletLocations(const shared_ptr& master_proxy, + const string& tablet_id, + const MonoDelta& timeout, + master::TabletLocationsPB* tablet_locations) { + master::GetTabletLocationsResponsePB resp; + master::GetTabletLocationsRequestPB req; + *req.add_tablet_ids() = tablet_id; + rpc::RpcController rpc; + rpc.set_timeout(timeout); + RETURN_NOT_OK(master_proxy->GetTabletLocations(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + if (resp.errors_size() > 0) { + CHECK_EQ(1, resp.errors_size()) << resp.ShortDebugString(); + return StatusFromPB(resp.errors(0).status()); + } + CHECK_EQ(1, resp.tablet_locations_size()) << resp.ShortDebugString(); + *tablet_locations = resp.tablet_locations(0); + return Status::OK(); +} + +Status GetTableLocations(const shared_ptr& master_proxy, + const string& table_name, + const MonoDelta& timeout, + master::GetTableLocationsResponsePB* table_locations) { + master::GetTableLocationsRequestPB req; + req.mutable_table()->set_table_name(table_name); + 
req.set_max_returned_locations(1000); + rpc::RpcController rpc; + rpc.set_timeout(timeout); + RETURN_NOT_OK(master_proxy->GetTableLocations(req, table_locations, &rpc)); + if (table_locations->has_error()) { + return StatusFromPB(table_locations->error().status()); + } + return Status::OK(); +} + +Status WaitForNumVotersInConfigOnMaster(const shared_ptr& master_proxy, + const std::string& tablet_id, + int num_voters, + const MonoDelta& timeout) { + Status s; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + int num_voters_found = 0; + while (true) { + TabletLocationsPB tablet_locations; + MonoDelta time_remaining = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + s = GetTabletLocations(master_proxy, tablet_id, time_remaining, &tablet_locations); + if (s.ok()) { + num_voters_found = 0; + for (const TabletLocationsPB::ReplicaPB& r : tablet_locations.replicas()) { + if (r.role() == RaftPeerPB::LEADER || r.role() == RaftPeerPB::FOLLOWER) num_voters_found++; + } + if (num_voters_found == num_voters) break; + } + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + RETURN_NOT_OK(s); + if (num_voters_found != num_voters) { + return Status::IllegalState( + Substitute("Did not find exactly $0 voters, found $1 voters", + num_voters, num_voters_found)); + } + return Status::OK(); +} + +Status WaitForNumTabletsOnTS(TServerDetails* ts, + int count, + const MonoDelta& timeout, + vector* tablets) { + Status s; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + while (true) { + s = ListTablets(ts, MonoDelta::FromSeconds(10), tablets); + if (s.ok() && tablets->size() == count) break; + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + RETURN_NOT_OK(s); + if (tablets->size() != count) { + return Status::IllegalState( + Substitute("Did not find exactly $0 tablets, found $1 tablets", 
+ count, tablets->size())); + } + return Status::OK(); +} + +Status WaitUntilTabletInState(TServerDetails* ts, + const std::string& tablet_id, + tablet::TabletStatePB state, + const MonoDelta& timeout) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + vector tablets; + Status s; + tablet::TabletStatePB last_state = tablet::UNKNOWN; + while (true) { + s = ListTablets(ts, MonoDelta::FromSeconds(10), &tablets); + if (s.ok()) { + bool seen = false; + for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) { + if (t.tablet_status().tablet_id() == tablet_id) { + seen = true; + last_state = t.tablet_status().state(); + if (last_state == state) { + return Status::OK(); + } + } + } + if (!seen) { + s = Status::NotFound("Tablet " + tablet_id + " not found"); + } + } + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut(Substitute("T $0 P $1: Tablet not in $2 state after $3: " + "Tablet state: $4, Status message: $5", + tablet_id, ts->uuid(), + tablet::TabletStatePB_Name(state), + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(), + tablet::TabletStatePB_Name(last_state), s.ToString())); +} + +// Wait until the specified tablet is in RUNNING state. 
+Status WaitUntilTabletRunning(TServerDetails* ts, + const std::string& tablet_id, + const MonoDelta& timeout) { + return WaitUntilTabletInState(ts, tablet_id, tablet::RUNNING, timeout); +} + +Status DeleteTablet(const TServerDetails* ts, + const std::string& tablet_id, + const tablet::TabletDataState delete_type, + const boost::optional& cas_config_opid_index_less_or_equal, + const MonoDelta& timeout, + tserver::TabletServerErrorPB::Code* error_code) { + DeleteTabletRequestPB req; + DeleteTabletResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + + req.set_dest_uuid(ts->uuid()); + req.set_tablet_id(tablet_id); + req.set_delete_type(delete_type); + if (cas_config_opid_index_less_or_equal) { + req.set_cas_config_opid_index_less_or_equal(*cas_config_opid_index_less_or_equal); + } + + RETURN_NOT_OK(ts->tserver_admin_proxy->DeleteTablet(req, &resp, &rpc)); + if (resp.has_error()) { + if (error_code) { + *error_code = resp.error().code(); + } + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status StartRemoteBootstrap(const TServerDetails* ts, + const string& tablet_id, + const string& bootstrap_source_uuid, + const HostPort& bootstrap_source_addr, + int64_t caller_term, + const MonoDelta& timeout) { + consensus::StartRemoteBootstrapRequestPB req; + consensus::StartRemoteBootstrapResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout); + + req.set_dest_uuid(ts->uuid()); + req.set_tablet_id(tablet_id); + req.set_bootstrap_peer_uuid(bootstrap_source_uuid); + RETURN_NOT_OK(HostPortToPB(bootstrap_source_addr, req.mutable_bootstrap_peer_addr())); + req.set_caller_term(caller_term); + + RETURN_NOT_OK(ts->consensus_proxy->StartRemoteBootstrap(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +} // namespace itest +} // namespace kudu diff --git a/src/kudu/integration-tests/cluster_itest_util.h b/src/kudu/integration-tests/cluster_itest_util.h new file 
mode 100644 index 000000000000..596e4a263d37 --- /dev/null +++ b/src/kudu/integration-tests/cluster_itest_util.h @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This header file contains generic helper utilities for writing tests against +// MiniClusters and ExternalMiniClusters. Ideally, the functions will be +// generic enough to use with either type of cluster, due to operating +// primarily through RPC-based APIs or through KuduClient. +// However, it's also OK to include common operations against a particular +// cluster type if it's general enough to use from multiple tests while not +// belonging in the MiniCluster / ExternalMiniCluster classes themselves. But +// consider just putting stuff like that in those classes. 
+ +#ifndef KUDU_INTEGRATION_TESTS_CLUSTER_ITEST_UTIL_H_ +#define KUDU_INTEGRATION_TESTS_CLUSTER_ITEST_UTIL_H_ + +#include +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/server/server_base.pb.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/tserver/tserver_service.proxy.h" + +namespace kudu { +class HostPort; +class MonoDelta; +class Schema; +class Sockaddr; +class Status; + +namespace client { +class KuduClient; +class KuduSchema; +class KuduTable; +} + +namespace consensus { +class OpId; +} + +namespace rpc { +class Messenger; +} + +namespace tserver { +class ListTabletsResponsePB_StatusAndSchemaPB; +class TabletServerErrorPB; +} + +namespace itest { + +struct TServerDetails { + NodeInstancePB instance_id; + master::TSRegistrationPB registration; + gscoped_ptr tserver_proxy; + gscoped_ptr tserver_admin_proxy; + gscoped_ptr consensus_proxy; + gscoped_ptr generic_proxy; + + // Convenience function to get the UUID from the instance_id struct. + const std::string& uuid() const; + + std::string ToString() const; +}; + +// tablet_id -> replica map. +typedef std::unordered_multimap TabletReplicaMap; + +// uuid -> tablet server map. +typedef std::unordered_map TabletServerMap; + +// Returns possibly the simplest imaginable schema, with a single int key column. +client::KuduSchema SimpleIntKeyKuduSchema(); + +// Create a populated TabletServerMap by interrogating the master. +// Note: The bare-pointer TServerDetails values must be deleted by the caller! +// Consider using ValueDeleter (in gutil/stl_util.h) for that. 
+Status CreateTabletServerMap(master::MasterServiceProxy* master_proxy, + const std::shared_ptr& messenger, + std::unordered_map* ts_map); + +// Gets a vector containing the latest OpId for each of the given replicas. +// Returns a bad Status if any replica cannot be reached. +Status GetLastOpIdForEachReplica(const std::string& tablet_id, + const std::vector& replicas, + consensus::OpIdType opid_type, + const MonoDelta& timeout, + std::vector* op_ids); + +// Like the above, but for a single replica. +Status GetLastOpIdForReplica(const std::string& tablet_id, + TServerDetails* replica, + consensus::OpIdType opid_type, + const MonoDelta& timeout, + consensus::OpId* op_id); + +// Wait until all of the servers have converged on the same log index. +// The converged index must be at least equal to 'minimum_index'. +// +// Requires that all servers are running. Returns Status::TimedOut if the +// indexes do not converge within the given timeout. +Status WaitForServersToAgree(const MonoDelta& timeout, + const TabletServerMap& tablet_servers, + const std::string& tablet_id, + int64_t minimum_index); + +// Wait until all specified replicas have logged at least the given index. +// Unlike WaitForServersToAgree(), the servers do not actually have to converge +// or quiesce. They only need to progress to or past the given index. +Status WaitUntilAllReplicasHaveOp(const int64_t log_index, + const std::string& tablet_id, + const std::vector& replicas, + const MonoDelta& timeout); + +// Get the consensus state from the given replica. +Status GetConsensusState(const TServerDetails* replica, + const std::string& tablet_id, + consensus::ConsensusConfigType type, + const MonoDelta& timeout, + consensus::ConsensusStatePB* consensus_state); + +// Wait until the number of voters in the committed consensus configuration is +// 'quorum_size', according to the specified replica. 
+Status WaitUntilCommittedConfigNumVotersIs(int config_size, + const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Wait until the opid_index of the committed consensus config on the +// specified tablet is 'opid_index'. +Status WaitUntilCommittedConfigOpIdIndexIs(int64_t opid_index, + const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Wait until the last commited OpId has index exactly 'opid_index'. +Status WaitUntilCommittedOpIdIndexIs(int64_t opid_index, + TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Returns: +// Status::OK() if the replica is alive and leader of the consensus configuration. +// Status::NotFound() if the replica is not part of the consensus configuration or is dead. +// Status::IllegalState() if the replica is live but not the leader. +Status GetReplicaStatusAndCheckIfLeader(const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Wait until the specified replica is leader. +Status WaitUntilLeader(const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Loops over the replicas, attempting to determine the leader, until it finds +// the first replica that believes it is the leader. +Status FindTabletLeader(const TabletServerMap& tablet_servers, + const std::string& tablet_id, + const MonoDelta& timeout, + TServerDetails** leader); + +// Start an election on the specified tserver. +// 'timeout' only refers to the RPC asking the peer to start an election. The +// StartElection() RPC does not block waiting for the results of the election, +// and neither does this call. +Status StartElection(const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Cause a leader to step down on the specified server. 
+// 'timeout' refers to the RPC timeout waiting synchronously for stepdown to +// complete on the leader side. Since that does not require communication with +// other nodes at this time, this call is rather quick. +Status LeaderStepDown(const TServerDetails* replica, + const std::string& tablet_id, + const MonoDelta& timeout, + tserver::TabletServerErrorPB* error = NULL); + +// Write a "simple test schema" row to the specified tablet on the given +// replica. This schema is commonly used by tests and is defined in +// wire_protocol-test-util.h +// The caller must specify whether this is an INSERT or UPDATE call via +// write_type. +Status WriteSimpleTestRow(const TServerDetails* replica, + const std::string& tablet_id, + RowOperationsPB::Type write_type, + int32_t key, + int32_t int_val, + const std::string& string_val, + const MonoDelta& timeout); + +// Run a ConfigChange to ADD_SERVER on 'replica_to_add'. +// The RPC request is sent to 'leader'. +Status AddServer(const TServerDetails* leader, + const std::string& tablet_id, + const TServerDetails* replica_to_add, + consensus::RaftPeerPB::MemberType member_type, + const boost::optional& cas_config_opid_index, + const MonoDelta& timeout, + tserver::TabletServerErrorPB::Code* error_code = NULL); + +// Run a ConfigChange to REMOVE_SERVER on 'replica_to_remove'. +// The RPC request is sent to 'leader'. +Status RemoveServer(const TServerDetails* leader, + const std::string& tablet_id, + const TServerDetails* replica_to_remove, + const boost::optional& cas_config_opid_index, + const MonoDelta& timeout, + tserver::TabletServerErrorPB::Code* error_code = NULL); + +// Get the list of tablets from the remote server. +Status ListTablets(const TServerDetails* ts, + const MonoDelta& timeout, + std::vector* tablets); + +// Get the list of RUNNING tablet ids from the remote server. 
+Status ListRunningTabletIds(const TServerDetails* ts, + const MonoDelta& timeout, + std::vector* tablet_ids); + +// Get the list of tablet locations for the specified tablet from the Master. +Status GetTabletLocations(const std::shared_ptr& master_proxy, + const std::string& tablet_id, + const MonoDelta& timeout, + master::TabletLocationsPB* tablet_locations); + +// Get the list of tablet locations for all tablets in the specified table from the Master. +Status GetTableLocations(const std::shared_ptr& master_proxy, + const std::string& table_name, + const MonoDelta& timeout, + master::GetTableLocationsResponsePB* table_locations); + +// Wait for the specified number of voters to be reported to the config on the +// master for the specified tablet. +Status WaitForNumVotersInConfigOnMaster( + const std::shared_ptr& master_proxy, + const std::string& tablet_id, + int num_voters, + const MonoDelta& timeout); + +// Repeatedly invoke ListTablets(), waiting for up to 'timeout' time for the +// specified 'count' number of replicas. +Status WaitForNumTabletsOnTS( + TServerDetails* ts, + int count, + const MonoDelta& timeout, + std::vector* tablets); + +// Wait until the specified replica is in the specified state. +Status WaitUntilTabletInState(TServerDetails* ts, + const std::string& tablet_id, + tablet::TabletStatePB state, + const MonoDelta& timeout); + +// Wait until the specified tablet is in RUNNING state. +Status WaitUntilTabletRunning(TServerDetails* ts, + const std::string& tablet_id, + const MonoDelta& timeout); + +// Send a DeleteTablet() to the server at 'ts' of the specified 'delete_type'. +Status DeleteTablet(const TServerDetails* ts, + const std::string& tablet_id, + const tablet::TabletDataState delete_type, + const boost::optional& cas_config_opid_index_less_or_equal, + const MonoDelta& timeout, + tserver::TabletServerErrorPB::Code* error_code = NULL); + +// Cause the remote to initiate remote bootstrap using the specified host as a +// source. 
+Status StartRemoteBootstrap(const TServerDetails* ts, + const std::string& tablet_id, + const std::string& bootstrap_source_uuid, + const HostPort& bootstrap_source_addr, + int64_t caller_term, + const MonoDelta& timeout); + +} // namespace itest +} // namespace kudu + +#endif // KUDU_INTEGRATION_TESTS_CLUSTER_ITEST_UTIL_H_ diff --git a/src/kudu/integration-tests/cluster_verifier.cc b/src/kudu/integration-tests/cluster_verifier.cc new file mode 100644 index 000000000000..e876875884bf --- /dev/null +++ b/src/kudu/integration-tests/cluster_verifier.cc @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/row_result.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/tools/ksck_remote.h" +#include "kudu/util/monotime.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::vector; + +namespace kudu { + +using strings::Substitute; +using tools::Ksck; +using tools::KsckCluster; +using tools::KsckMaster; +using tools::RemoteKsckMaster; + +ClusterVerifier::ClusterVerifier(ExternalMiniCluster* cluster) + : cluster_(cluster), + checksum_options_(ChecksumOptions()) { + checksum_options_.use_snapshot = false; +} + +ClusterVerifier::~ClusterVerifier() { +} + +void ClusterVerifier::SetVerificationTimeout(const MonoDelta& timeout) { + checksum_options_.timeout = timeout; +} + +void ClusterVerifier::SetScanConcurrency(int concurrency) { + checksum_options_.scan_concurrency = concurrency; +} + +void ClusterVerifier::CheckCluster() { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(checksum_options_.timeout); + + Status s; + double sleep_time = 0.1; + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + s = DoKsck(); + if (s.ok()) { + break; + } + + LOG(INFO) << "Check not successful yet, sleeping and retrying: " + s.ToString(); + sleep_time *= 1.5; + if (sleep_time > 1) { sleep_time = 1; } + SleepFor(MonoDelta::FromSeconds(sleep_time)); + } + ASSERT_OK(s); +} + +Status ClusterVerifier::DoKsck() { + Sockaddr addr = cluster_->leader_master()->bound_rpc_addr(); + + std::shared_ptr master; + RETURN_NOT_OK(RemoteKsckMaster::Build(addr, &master)); + std::shared_ptr cluster(new KsckCluster(master)); + std::shared_ptr ksck(new Ksck(cluster)); + + // This is required for everything below. 
+ RETURN_NOT_OK(ksck->CheckMasterRunning()); + RETURN_NOT_OK(ksck->FetchTableAndTabletInfo()); + RETURN_NOT_OK(ksck->CheckTabletServersRunning()); + RETURN_NOT_OK(ksck->CheckTablesConsistency()); + + vector tables; + vector tablets; + RETURN_NOT_OK(ksck->ChecksumData(tables, tablets, checksum_options_)); + return Status::OK(); +} + +void ClusterVerifier::CheckRowCount(const std::string& table_name, + ComparisonMode mode, + int expected_row_count) { + ASSERT_OK(DoCheckRowCount(table_name, mode, expected_row_count)); +} + +Status ClusterVerifier::DoCheckRowCount(const std::string& table_name, + ComparisonMode mode, + int expected_row_count) { + client::sp::shared_ptr client; + client::KuduClientBuilder builder; + RETURN_NOT_OK_PREPEND(cluster_->CreateClient(builder, + &client), + "Unable to connect to cluster"); + client::sp::shared_ptr table; + RETURN_NOT_OK_PREPEND(client->OpenTable(table_name, &table), + "Unable to open table"); + client::KuduScanner scanner(table.get()); + CHECK_OK(scanner.SetProjectedColumns(vector())); + RETURN_NOT_OK_PREPEND(scanner.Open(), "Unable to open scanner"); + int count = 0; + vector rows; + while (scanner.HasMoreRows()) { + RETURN_NOT_OK_PREPEND(scanner.NextBatch(&rows), "Unable to read from scanner"); + count += rows.size(); + } + + if (mode == AT_LEAST && count < expected_row_count) { + return Status::Corruption(Substitute("row count $0 is not at least expected value $1", + count, expected_row_count)); + } else if (mode == EXACTLY && count != expected_row_count) { + return Status::Corruption(Substitute("row count $0 is not exactly expected value $1", + count, expected_row_count)); + } + return Status::OK(); +} + +void ClusterVerifier::CheckRowCountWithRetries(const std::string& table_name, + ComparisonMode mode, + int expected_row_count, + const MonoDelta& timeout) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + Status s; + while (true) { + s = DoCheckRowCount(table_name, mode, 
expected_row_count); + if (s.ok() || deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + LOG(WARNING) << "CheckRowCount() has not succeeded yet: " << s.ToString() + << "... will retry"; + SleepFor(MonoDelta::FromMilliseconds(100)); + } + + ASSERT_OK(s); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/cluster_verifier.h b/src/kudu/integration-tests/cluster_verifier.h new file mode 100644 index 000000000000..8dc15167f34b --- /dev/null +++ b/src/kudu/integration-tests/cluster_verifier.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_INTEGRATION_TESTS_CLUSTER_VERIFIER_H +#define KUDU_INTEGRATION_TESTS_CLUSTER_VERIFIER_H + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/tools/ksck.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { + +using tools::ChecksumOptions; + +class ExternalMiniCluster; +class MonoDelta; + +// Utility class for integration tests to verify that the cluster is in a good state. +class ClusterVerifier { + public: + explicit ClusterVerifier(ExternalMiniCluster* cluster); + ~ClusterVerifier(); + + // Set the amount of time which we'll retry trying to verify the cluster + // state. 
We retry because it's possible that one of the replicas is behind + // but in the process of catching up. + void SetVerificationTimeout(const MonoDelta& timeout); + + /// Set the number of concurrent scans to execute per tablet server. + void SetScanConcurrency(int concurrency); + + // Verify that the cluster is in good state. Triggers a gtest assertion failure + // on failure. + // + // Currently, this just uses ksck to verify that the different replicas of each tablet + // eventually agree. + void CheckCluster(); + + // Argument for CheckRowCount(...) below. + enum ComparisonMode { + AT_LEAST, + EXACTLY + }; + + // Check that the given table has the given number of rows. Depending on ComparisonMode, + // the comparison could be exact or a lower bound. + // + // Returns a Corruption Status if the row count is not as expected. + // + // NOTE: this does not perform any retries. If it's possible that the replicas are + // still converging, it's best to use CheckCluster() first, which will wait for + // convergence. + void CheckRowCount(const std::string& table_name, + ComparisonMode mode, + int expected_row_count); + + // The same as above, but retries until a timeout elapses. + void CheckRowCountWithRetries(const std::string& table_name, + ComparisonMode mode, + int expected_row_count, + const MonoDelta& timeout); + + private: + Status DoKsck(); + + // Implementation for CheckRowCount -- returns a Status instead of firing + // gtest assertions. 
+ Status DoCheckRowCount(const std::string& table_name, + ComparisonMode mode, + int expected_row_count); + + + ExternalMiniCluster* cluster_; + + ChecksumOptions checksum_options_; + + DISALLOW_COPY_AND_ASSIGN(ClusterVerifier); +}; + +} // namespace kudu +#endif /* KUDU_INTEGRATION_TESTS_CLUSTER_VERIFIER_H */ diff --git a/src/kudu/integration-tests/create-table-itest.cc b/src/kudu/integration-tests/create-table-itest.cc new file mode 100644 index 000000000000..f70ee4230de0 --- /dev/null +++ b/src/kudu/integration-tests/create-table-itest.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/integration-tests/external_mini_cluster-itest-base.h" +#include "kudu/util/metrics.h" + +using std::multimap; +using std::set; +using std::string; +using std::vector; + +METRIC_DECLARE_entity(server); +METRIC_DECLARE_histogram(handler_latency_kudu_tserver_TabletServerAdminService_CreateTablet); +METRIC_DECLARE_histogram(handler_latency_kudu_tserver_TabletServerAdminService_DeleteTablet); + +namespace kudu { + +const char* const kTableName = "test-table"; + +class CreateTableITest : public ExternalMiniClusterITestBase { +}; + +// Regression test for an issue seen when we fail to create a majority of the +// replicas in a tablet. Previously, we'd still consider the tablet "RUNNING" +// on the master and finish the table creation, even though that tablet would +// be stuck forever with its minority never able to elect a leader. +TEST_F(CreateTableITest, TestCreateWhenMajorityOfReplicasFailCreation) { + const int kNumReplicas = 3; + vector ts_flags; + vector master_flags; + master_flags.push_back("--tablet_creation_timeout_ms=1000"); + NO_FATALS(StartCluster(ts_flags, master_flags, kNumReplicas)); + + // Shut down 2/3 of the tablet servers. + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + + // Try to create a single-tablet table. + // This won't succeed because we can't create enough replicas to get + // a quorum. + gscoped_ptr table_creator(client_->NewTableCreator()); + client::KuduSchema client_schema(client::KuduSchemaFromSchema(GetSimpleTestSchema())); + ASSERT_OK(table_creator->table_name(kTableName) + .schema(&client_schema) + .num_replicas(3) + .wait(false) + .Create()); + + // Sleep until we've seen a couple retries on our live server. 
+ int64_t num_create_attempts = 0; + while (num_create_attempts < 3) { + SleepFor(MonoDelta::FromMilliseconds(100)); + ASSERT_OK(cluster_->tablet_server(0)->GetInt64Metric( + &METRIC_ENTITY_server, + "kudu.tabletserver", + &METRIC_handler_latency_kudu_tserver_TabletServerAdminService_CreateTablet, + "total_count", + &num_create_attempts)); + LOG(INFO) << "Waiting for the master to retry creating the tablet 3 times... " + << num_create_attempts << " RPCs seen so far"; + + // The CreateTable operation should still be considered in progress, even though + // we'll be successful at creating a single replica. + bool in_progress = false; + ASSERT_OK(client_->IsCreateTableInProgress(kTableName, &in_progress)); + ASSERT_TRUE(in_progress); + } + + // Once we restart the servers, we should succeed at creating a healthy + // replicated tablet. + ASSERT_OK(cluster_->tablet_server(1)->Restart()); + ASSERT_OK(cluster_->tablet_server(2)->Restart()); + + // We should eventually finish the table creation we started earlier. + bool in_progress = false; + while (in_progress) { + LOG(INFO) << "Waiting for the master to successfully create the table..."; + ASSERT_OK(client_->IsCreateTableInProgress(kTableName, &in_progress)); + SleepFor(MonoDelta::FromMilliseconds(100)); + } + + // The server that was up from the beginning should be left with only + // one tablet, eventually, since the tablets which failed to get created + // properly should get deleted. + vector tablets; + int wait_iter = 0; + while (tablets.size() != 1 && wait_iter++ < 100) { + LOG(INFO) << "Waiting for only one tablet to be left on TS 0. Currently have: " + << tablets; + SleepFor(MonoDelta::FromMilliseconds(100)); + tablets = inspect_->ListTabletsWithDataOnTS(0); + } + ASSERT_EQ(1, tablets.size()) << "Tablets on TS0: " << tablets; +} + +// Regression test for KUDU-1317. 
Ensure that, when a table is created, +// the tablets are well spread out across the machines in the cluster and +// that recovery from failures will be well parallelized. +TEST_F(CreateTableITest, TestSpreadReplicasEvenly) { + const int kNumServers = 10; + const int kNumTablets = 20; + vector ts_flags; + vector master_flags; + ts_flags.push_back("--never_fsync"); // run faster on slow disks + NO_FATALS(StartCluster(ts_flags, master_flags, kNumServers)); + + gscoped_ptr table_creator(client_->NewTableCreator()); + client::KuduSchema client_schema(client::KuduSchemaFromSchema(GetSimpleTestSchema())); + ASSERT_OK(table_creator->table_name(kTableName) + .schema(&client_schema) + .num_replicas(3) + .add_hash_partitions({ "key" }, kNumTablets) + .Create()); + + // Check that the replicas are fairly well spread by computing the standard + // deviation of the number of replicas per server. + const double kMeanPerServer = kNumTablets * 3.0 / kNumServers; + double sum_squared_deviation = 0; + vector tablet_counts; + for (int ts_idx = 0; ts_idx < kNumServers; ts_idx++) { + int num_replicas = inspect_->ListTabletsOnTS(ts_idx).size(); + LOG(INFO) << "TS " << ts_idx << " has " << num_replicas << " tablets"; + double deviation = static_cast(num_replicas) - kMeanPerServer; + sum_squared_deviation += deviation * deviation; + } + double stddev = sqrt(sum_squared_deviation / (kMeanPerServer - 1)); + LOG(INFO) << "stddev = " << stddev; + // In 1000 runs of the test, only one run had stddev above 2.0. So, 3.0 should + // be a safe non-flaky choice. + ASSERT_LE(stddev, 3.0); + + // Construct a map from tablet ID to the set of servers that each tablet is hosted on. 
+ multimap tablet_to_servers; + for (int ts_idx = 0; ts_idx < kNumServers; ts_idx++) { + vector tablets = inspect_->ListTabletsOnTS(ts_idx); + for (const string& tablet_id : tablets) { + tablet_to_servers.insert(std::make_pair(tablet_id, ts_idx)); + } + } + + // For each server, count how many other servers it shares tablets with. + // This is highly correlated to how well parallelized recovery will be + // in the case the server crashes. + int sum_num_peers = 0; + for (int ts_idx = 0; ts_idx < kNumServers; ts_idx++) { + vector tablets = inspect_->ListTabletsOnTS(ts_idx); + set peer_servers; + for (const string& tablet_id : tablets) { + auto peer_indexes = tablet_to_servers.equal_range(tablet_id); + for (auto it = peer_indexes.first; it != peer_indexes.second; ++it) { + peer_servers.insert(it->second); + } + } + + peer_servers.erase(ts_idx); + LOG(INFO) << "Server " << ts_idx << " has " << peer_servers.size() << " peers"; + sum_num_peers += peer_servers.size(); + } + + // On average, servers should have at least half the other servers as peers. + double avg_num_peers = static_cast(sum_num_peers) / kNumServers; + LOG(INFO) << "avg_num_peers = " << avg_num_peers; + ASSERT_GE(avg_num_peers, kNumServers / 2); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/create-table-stress-test.cc b/src/kudu/integration-tests/create-table-stress-test.cc new file mode 100644 index 000000000000..622aa0c221bb --- /dev/null +++ b/src/kudu/integration-tests/create-table-stress-test.cc @@ -0,0 +1,319 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/master-test-util.h" +#include "kudu/rpc/messenger.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +using kudu::client::KuduClient; +using kudu::client::KuduClientBuilder; +using kudu::client::KuduColumnSchema; +using kudu::client::KuduSchema; +using kudu::client::KuduSchemaBuilder; +using kudu::client::KuduTableCreator; +using kudu::itest::CreateTabletServerMap; +using kudu::itest::TabletServerMap; +using kudu::master::MasterServiceProxy; +using kudu::rpc::Messenger; +using kudu::rpc::MessengerBuilder; +using kudu::rpc::RpcController; + +DECLARE_int32(heartbeat_interval_ms); +DECLARE_bool(log_preallocate_segments); +DECLARE_bool(enable_remote_bootstrap); +DEFINE_int32(num_test_tablets, 60, "Number of tablets for stress test"); + +namespace kudu { + +const char* kTableName = "test_table"; + +class CreateTableStressTest : public KuduTest { + public: + CreateTableStressTest() { + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("v1")->Type(KuduColumnSchema::INT64)->NotNull(); + 
b.AddColumn("v2")->Type(KuduColumnSchema::STRING)->NotNull(); + CHECK_OK(b.Build(&schema_)); + } + + virtual void SetUp() OVERRIDE { + // Make heartbeats faster to speed test runtime. + FLAGS_heartbeat_interval_ms = 10; + + // Don't preallocate log segments, since we're creating thousands + // of tablets here. If each preallocates 64M or so, we use + // a ton of disk space in this test, and it fails on normal + // sized /tmp dirs. + // TODO: once we collapse multiple tablets into shared WAL files, + // this won't be necessary. + FLAGS_log_preallocate_segments = false; + + // Workaround KUDU-941: without this, it's likely that while shutting + // down tablets, they'll get resuscitated by their existing leaders. + FLAGS_enable_remote_bootstrap = false; + + KuduTest::SetUp(); + MiniClusterOptions opts; + opts.num_tablet_servers = 3; + cluster_.reset(new MiniCluster(env_.get(), opts)); + ASSERT_OK(cluster_->Start()); + + ASSERT_OK(KuduClientBuilder() + .add_master_server_addr(cluster_->mini_master()->bound_rpc_addr_str()) + .Build(&client_)); + + ASSERT_OK(MessengerBuilder("stress-test-msgr") + .set_num_reactors(1) + .set_negotiation_threads(1) + .Build(&messenger_)); + master_proxy_.reset(new MasterServiceProxy(messenger_, + cluster_->mini_master()->bound_rpc_addr())); + ASSERT_OK(CreateTabletServerMap(master_proxy_.get(), messenger_, &ts_map_)); + } + + virtual void TearDown() OVERRIDE { + cluster_->Shutdown(); + STLDeleteValues(&ts_map_); + } + + void CreateBigTable(const string& table_name, int num_tablets); + + protected: + client::sp::shared_ptr client_; + gscoped_ptr cluster_; + KuduSchema schema_; + std::shared_ptr messenger_; + gscoped_ptr master_proxy_; + TabletServerMap ts_map_; +}; + +void CreateTableStressTest::CreateBigTable(const string& table_name, int num_tablets) { + vector split_rows; + int num_splits = num_tablets - 1; // 4 tablets == 3 splits. + // Let the "\x8\0\0\0" keys end up in the first split; start splitting at 1. 
+ for (int i = 1; i <= num_splits; i++) { + KuduPartialRow* row = schema_.NewRow(); + CHECK_OK(row->SetInt32(0, i)); + split_rows.push_back(row); + } + + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(table_name) + .schema(&schema_) + .split_rows(split_rows) + .num_replicas(3) + .wait(false) + .Create()); +} + +TEST_F(CreateTableStressTest, CreateAndDeleteBigTable) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping slow test"; + return; + } + string table_name = "test_table"; + ASSERT_NO_FATAL_FAILURE(CreateBigTable(table_name, FLAGS_num_test_tablets)); + master::GetTableLocationsResponsePB resp; + ASSERT_OK(WaitForRunningTabletCount(cluster_->mini_master(), table_name, + FLAGS_num_test_tablets, &resp)); + LOG(INFO) << "Created table successfully!"; + // Use std::cout instead of log, since these responses are large and log + // messages have a max size. + std::cout << "Response:\n" << resp.DebugString(); + std::cout << "CatalogManager state:\n"; + cluster_->mini_master()->master()->catalog_manager()->DumpState(&std::cerr); + + LOG(INFO) << "Deleting table..."; + ASSERT_OK(client_->DeleteTable(table_name)); + + // The actual removal of the tablets is asynchronous, so we loop for a bit + // waiting for them to get removed. 
+ LOG(INFO) << "Waiting for tablets to be removed"; + vector tablet_ids; + for (int i = 0; i < 1000; i++) { + ASSERT_OK(itest::ListRunningTabletIds(ts_map_.begin()->second, + MonoDelta::FromSeconds(10), + &tablet_ids)); + if (tablet_ids.empty()) break; + SleepFor(MonoDelta::FromMilliseconds(100)); + } + ASSERT_TRUE(tablet_ids.empty()) << "Tablets remained: " << tablet_ids; +} + +TEST_F(CreateTableStressTest, RestartMasterDuringCreation) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping slow test"; + return; + } + + string table_name = "test_table"; + ASSERT_NO_FATAL_FAILURE(CreateBigTable(table_name, FLAGS_num_test_tablets)); + + for (int i = 0; i < 3; i++) { + SleepFor(MonoDelta::FromMicroseconds(500)); + LOG(INFO) << "Restarting master..."; + ASSERT_OK(cluster_->mini_master()->Restart()); + ASSERT_OK(cluster_->mini_master()->master()-> + WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5))); + LOG(INFO) << "Master restarted."; + } + + master::GetTableLocationsResponsePB resp; + Status s = WaitForRunningTabletCount(cluster_->mini_master(), table_name, + FLAGS_num_test_tablets, &resp); + if (!s.ok()) { + cluster_->mini_master()->master()->catalog_manager()->DumpState(&std::cerr); + CHECK_OK(s); + } +} + +TEST_F(CreateTableStressTest, TestGetTableLocationsOptions) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping slow test"; + return; + } + + string table_name = "test_table"; + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 1. Creating big table " << table_name << " ..."; + LOG_TIMING(INFO, "creating big table") { + ASSERT_NO_FATAL_FAILURE(CreateBigTable(table_name, FLAGS_num_test_tablets)); + } + + master::GetTableLocationsRequestPB req; + master::GetTableLocationsResponsePB resp; + + // Make sure the table is completely created before we start poking. + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 2. 
Waiting for creation of big table " + << table_name << " to complete..."; + LOG_TIMING(INFO, "waiting for creation of big table") { + ASSERT_OK(WaitForRunningTabletCount(cluster_->mini_master(), table_name, + FLAGS_num_test_tablets, &resp)); + } + + // Test asking for 0 tablets, should fail + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 3. Asking for zero tablets..."; + LOG_TIMING(INFO, "asking for zero tablets") { + req.Clear(); + resp.Clear(); + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(0); + Status s = cluster_->mini_master()->master()->catalog_manager()->GetTableLocations(&req, &resp); + ASSERT_STR_CONTAINS(s.ToString(), "must be greater than 0"); + } + + // Ask for one, get one, verify + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 4. Asking for one tablet..."; + LOG_TIMING(INFO, "asking for one tablet") { + req.Clear(); + resp.Clear(); + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(1); + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTableLocations(&req, &resp)); + ASSERT_EQ(resp.tablet_locations_size(), 1); + // empty since it's the first + ASSERT_EQ(resp.tablet_locations(0).partition().partition_key_start(), ""); + ASSERT_EQ(resp.tablet_locations(0).partition().partition_key_end(), string("\x80\0\0\1", 4)); + } + + int half_tablets = FLAGS_num_test_tablets / 2; + // Ask for half of them, get that number back + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 5. Asking for half the tablets..."; + LOG_TIMING(INFO, "asking for half the tablets") { + req.Clear(); + resp.Clear(); + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(half_tablets); + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTableLocations(&req, &resp)); + ASSERT_EQ(half_tablets, resp.tablet_locations_size()); + } + + // Ask for all of them, get that number back + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 6. 
Asking for all the tablets..."; + LOG_TIMING(INFO, "asking for all the tablets") { + req.Clear(); + resp.Clear(); + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(FLAGS_num_test_tablets); + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTableLocations(&req, &resp)); + ASSERT_EQ(FLAGS_num_test_tablets, resp.tablet_locations_size()); + } + + LOG(INFO) << "========================================================"; + LOG(INFO) << "Tables and tablets:"; + LOG(INFO) << "========================================================"; + std::vector > tables; + cluster_->mini_master()->master()->catalog_manager()->GetAllTables(&tables); + for (const scoped_refptr& table_info : tables) { + LOG(INFO) << "Table: " << table_info->ToString(); + std::vector > tablets; + table_info->GetAllTablets(&tablets); + for (const scoped_refptr& tablet_info : tablets) { + master::TabletMetadataLock l_tablet(tablet_info.get(), master::TabletMetadataLock::READ); + const master::SysTabletsEntryPB& metadata = tablet_info->metadata().state().pb; + LOG(INFO) << " Tablet: " << tablet_info->ToString() + << " { start_key: " + << ((metadata.partition().has_partition_key_start()) + ? metadata.partition().partition_key_start() : "<< none >>") + << ", end_key: " + << ((metadata.partition().has_partition_key_end()) + ? metadata.partition().partition_key_end() : "<< none >>") + << ", running = " << tablet_info->metadata().state().is_running() << " }"; + } + ASSERT_EQ(FLAGS_num_test_tablets, tablets.size()); + } + LOG(INFO) << "========================================================"; + + // Get a single tablet in the middle, make sure we get that one back + + gscoped_ptr row(schema_.NewRow()); + ASSERT_OK(row->SetInt32(0, half_tablets - 1)); + string start_key_middle; + ASSERT_OK(row->EncodeRowKey(&start_key_middle)); + + LOG(INFO) << "Start key middle: " << start_key_middle; + LOG(INFO) << CURRENT_TEST_NAME() << ": Step 7. 
Asking for single middle tablet..."; + LOG_TIMING(INFO, "asking for single middle tablet") { + req.Clear(); + resp.Clear(); + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(1); + req.set_partition_key_start(start_key_middle); + ASSERT_OK(cluster_->mini_master()->master()->catalog_manager()->GetTableLocations(&req, &resp)); + ASSERT_EQ(1, resp.tablet_locations_size()) << "Response: [" << resp.DebugString() << "]"; + ASSERT_EQ(start_key_middle, resp.tablet_locations(0).partition().partition_key_start()); + } +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/delete_table-test.cc b/src/kudu/integration-tests/delete_table-test.cc new file mode 100644 index 000000000000..49144d933ae7 --- /dev/null +++ b/src/kudu/integration-tests/delete_table-test.cc @@ -0,0 +1,1076 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster-itest-base.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/subprocess.h" + +using kudu::client::KuduClient; +using kudu::client::KuduClientBuilder; +using kudu::client::KuduSchema; +using kudu::client::KuduSchemaFromSchema; +using kudu::client::KuduTableCreator; +using kudu::consensus::CONSENSUS_CONFIG_COMMITTED; +using kudu::consensus::ConsensusMetadataPB; +using kudu::consensus::ConsensusStatePB; +using kudu::consensus::RaftPeerPB; +using kudu::itest::TServerDetails; +using kudu::tablet::TABLET_DATA_COPYING; +using kudu::tablet::TABLET_DATA_DELETED; +using kudu::tablet::TABLET_DATA_READY; +using kudu::tablet::TABLET_DATA_TOMBSTONED; +using kudu::tablet::TabletDataState; +using kudu::tablet::TabletSuperBlockPB; +using kudu::tserver::ListTabletsResponsePB; +using kudu::tserver::TabletServerErrorPB; +using std::numeric_limits; +using std::string; +using std::unordered_map; +using std::vector; +using strings::Substitute; + +namespace kudu { + +class DeleteTableTest : public ExternalMiniClusterITestBase { + protected: + enum IsCMetaExpected { + CMETA_NOT_EXPECTED = 0, + CMETA_EXPECTED = 1 + }; + + enum IsSuperBlockExpected { + SUPERBLOCK_NOT_EXPECTED = 0, + SUPERBLOCK_EXPECTED = 1 + }; + + // Get the UUID of the leader of the specified tablet, as seen by the TS with + // the given 'ts_uuid'. 
+ string GetLeaderUUID(const string& ts_uuid, const string& tablet_id); + + Status CheckTabletTombstonedOrDeletedOnTS( + int index, + const string& tablet_id, + TabletDataState data_state, + IsCMetaExpected is_cmeta_expected, + IsSuperBlockExpected is_superblock_expected); + + Status CheckTabletTombstonedOnTS(int index, + const string& tablet_id, + IsCMetaExpected is_cmeta_expected); + + Status CheckTabletDeletedOnTS(int index, + const string& tablet_id, + IsSuperBlockExpected is_superblock_expected); + + void WaitForTabletTombstonedOnTS(int index, + const string& tablet_id, + IsCMetaExpected is_cmeta_expected); + + void WaitForTabletDeletedOnTS(int index, + const string& tablet_id, + IsSuperBlockExpected is_superblock_expected); + + void WaitForTSToCrash(int index); + void WaitForAllTSToCrash(); + void WaitUntilTabletRunning(int index, const std::string& tablet_id); + + // Delete the given table. If the operation times out, dumps the master stacks + // to help debug master-side deadlocks. + void DeleteTable(const string& table_name); + + // Repeatedly try to delete the tablet, retrying on failure up to the + // specified timeout. Deletion can fail when other operations, such as + // bootstrap, are running. 
+ void DeleteTabletWithRetries(const TServerDetails* ts, const string& tablet_id, + TabletDataState delete_type, const MonoDelta& timeout); +}; + +string DeleteTableTest::GetLeaderUUID(const string& ts_uuid, const string& tablet_id) { + ConsensusStatePB cstate; + CHECK_OK(itest::GetConsensusState(ts_map_[ts_uuid], tablet_id, CONSENSUS_CONFIG_COMMITTED, + MonoDelta::FromSeconds(10), &cstate)); + return cstate.leader_uuid(); +} + +Status DeleteTableTest::CheckTabletTombstonedOrDeletedOnTS( + int index, + const string& tablet_id, + TabletDataState data_state, + IsCMetaExpected is_cmeta_expected, + IsSuperBlockExpected is_superblock_expected) { + CHECK(data_state == TABLET_DATA_TOMBSTONED || data_state == TABLET_DATA_DELETED) << data_state; + // There should be no WALs and no cmeta. + if (inspect_->CountWALSegmentsForTabletOnTS(index, tablet_id) > 0) { + return Status::IllegalState("WAL segments exist for tablet", tablet_id); + } + if (is_cmeta_expected == CMETA_EXPECTED && + !inspect_->DoesConsensusMetaExistForTabletOnTS(index, tablet_id)) { + return Status::IllegalState("Expected cmeta for tablet " + tablet_id + " but it doesn't exist"); + } + if (is_superblock_expected == SUPERBLOCK_EXPECTED) { + RETURN_NOT_OK(inspect_->CheckTabletDataStateOnTS(index, tablet_id, data_state)); + } else { + TabletSuperBlockPB superblock_pb; + Status s = inspect_->ReadTabletSuperBlockOnTS(index, tablet_id, &superblock_pb); + if (!s.IsNotFound()) { + return Status::IllegalState("Found unexpected superblock for tablet " + tablet_id); + } + } + return Status::OK(); +} + +Status DeleteTableTest::CheckTabletTombstonedOnTS(int index, + const string& tablet_id, + IsCMetaExpected is_cmeta_expected) { + return CheckTabletTombstonedOrDeletedOnTS(index, tablet_id, TABLET_DATA_TOMBSTONED, + is_cmeta_expected, SUPERBLOCK_EXPECTED); +} + +Status DeleteTableTest::CheckTabletDeletedOnTS(int index, + const string& tablet_id, + IsSuperBlockExpected is_superblock_expected) { + return 
CheckTabletTombstonedOrDeletedOnTS(index, tablet_id, TABLET_DATA_DELETED, + CMETA_NOT_EXPECTED, is_superblock_expected); +} + +void DeleteTableTest::WaitForTabletTombstonedOnTS(int index, + const string& tablet_id, + IsCMetaExpected is_cmeta_expected) { + Status s; + for (int i = 0; i < 6000; i++) { + s = CheckTabletTombstonedOnTS(index, tablet_id, is_cmeta_expected); + if (s.ok()) return; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_OK(s); +} + +void DeleteTableTest::WaitForTabletDeletedOnTS(int index, + const string& tablet_id, + IsSuperBlockExpected is_superblock_expected) { + Status s; + for (int i = 0; i < 6000; i++) { + s = CheckTabletDeletedOnTS(index, tablet_id, is_superblock_expected); + if (s.ok()) return; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_OK(s); +} + +void DeleteTableTest::WaitForTSToCrash(int index) { + ExternalTabletServer* ts = cluster_->tablet_server(index); + for (int i = 0; i < 6000; i++) { // wait 60sec + if (!ts->IsProcessAlive()) return; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + FAIL() << "TS " << ts->instance_id().permanent_uuid() << " did not crash!"; +} + +void DeleteTableTest::WaitForAllTSToCrash() { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + NO_FATALS(WaitForTSToCrash(i)); + } +} + +void DeleteTableTest::WaitUntilTabletRunning(int index, const std::string& tablet_id) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(index)->uuid()], + tablet_id, MonoDelta::FromSeconds(30))); +} + +void DeleteTableTest::DeleteTable(const string& table_name) { + Status s = client_->DeleteTable(table_name); + if (s.IsTimedOut()) { + WARN_NOT_OK(PstackWatcher::DumpPidStacks(cluster_->master()->pid()), + "Couldn't dump stacks"); + } + ASSERT_OK(s); +} + +void DeleteTableTest::DeleteTabletWithRetries(const TServerDetails* ts, + const string& tablet_id, + TabletDataState delete_type, + const MonoDelta& timeout) { + MonoTime start(MonoTime::Now(MonoTime::FINE)); + MonoTime 
deadline = start; + deadline.AddDelta(timeout); + Status s; + while (true) { + s = itest::DeleteTablet(ts, tablet_id, delete_type, boost::none, timeout); + if (s.ok()) return; + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_OK(s); +} + +// Test deleting an empty table, and ensure that the tablets get removed, +// and the master no longer shows the table as existing. +TEST_F(DeleteTableTest, TestDeleteEmptyTable) { + NO_FATALS(StartCluster()); + // Create a table on the cluster. We're just using TestWorkload + // as a convenient way to create it. + TestWorkload(cluster_.get()).Setup(); + + // The table should have replicas on all three tservers. + ASSERT_OK(inspect_->WaitForReplicaCount(3)); + + // Grab the tablet ID (used later). + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + // Delete it and wait for the replicas to get deleted. + NO_FATALS(DeleteTable(TestWorkload::kDefaultTableName)); + for (int i = 0; i < 3; i++) { + NO_FATALS(WaitForTabletDeletedOnTS(i, tablet_id, SUPERBLOCK_EXPECTED)); + } + + // Restart the cluster, the superblocks should be deleted on startup. + cluster_->Shutdown(); + ASSERT_OK(cluster_->Restart()); + ASSERT_OK(inspect_->WaitForNoData()); + + // Check that the master no longer exposes the table in any way: + + // 1) Should not list it in ListTables. + vector table_names; + ASSERT_OK(client_->ListTables(&table_names)); + ASSERT_TRUE(table_names.empty()) << "table still exposed in ListTables"; + + // 2) Should respond to GetTableSchema with a NotFound error. + KuduSchema schema; + Status s = client_->GetTableSchema(TestWorkload::kDefaultTableName, &schema); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); + + // 3) Should return an error for GetTabletLocations RPCs. 
+ { + rpc::RpcController rpc; + master::GetTabletLocationsRequestPB req; + master::GetTabletLocationsResponsePB resp; + rpc.set_timeout(MonoDelta::FromSeconds(10)); + req.add_tablet_ids()->assign(tablet_id); + ASSERT_OK(cluster_->master_proxy()->GetTabletLocations(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_EQ(1, resp.errors_size()); + ASSERT_STR_CONTAINS(resp.errors(0).ShortDebugString(), + "code: NOT_FOUND message: \"Tablet deleted: Table deleted"); + } + + // 4) The master 'dump-entities' page should not list the deleted table or tablets. + EasyCurl c; + faststring entities_buf; + ASSERT_OK(c.FetchURL(Substitute("http://$0/dump-entities", + cluster_->master()->bound_http_hostport().ToString()), + &entities_buf)); + ASSERT_EQ("{\"tables\":[],\"tablets\":[]}", entities_buf.ToString()); +} + +// Test that a DeleteTable RPC is rejected without a matching destination UUID. +TEST_F(DeleteTableTest, TestDeleteTableDestUuidValidation) { + NO_FATALS(StartCluster()); + // Create a table on the cluster. We're just using TestWorkload + // as a convenient way to create it. 
+ TestWorkload(cluster_.get()).Setup(); + ASSERT_OK(inspect_->WaitForReplicaCount(3)); + + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()]; + + tserver::DeleteTabletRequestPB req; + tserver::DeleteTabletResponsePB resp; + rpc::RpcController rpc; + rpc.set_timeout(MonoDelta::FromSeconds(20)); + + req.set_dest_uuid("fake-uuid"); + req.set_tablet_id(tablet_id); + req.set_delete_type(TABLET_DATA_TOMBSTONED); + ASSERT_OK(ts->tserver_admin_proxy->DeleteTablet(req, &resp, &rpc)); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(tserver::TabletServerErrorPB::WRONG_SERVER_UUID, resp.error().code()) + << resp.ShortDebugString(); + ASSERT_STR_CONTAINS(StatusFromPB(resp.error().status()).ToString(), + "Wrong destination UUID"); +} + +// Test the atomic CAS argument to DeleteTablet(). +TEST_F(DeleteTableTest, TestAtomicDeleteTablet) { + MonoDelta timeout = MonoDelta::FromSeconds(30); + NO_FATALS(StartCluster()); + // Create a table on the cluster. We're just using TestWorkload + // as a convenient way to create it. + TestWorkload(cluster_.get()).Setup(); + + // The table should have replicas on all three tservers. + ASSERT_OK(inspect_->WaitForReplicaCount(3)); + + // Grab the tablet ID (used later). + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + const int kTsIndex = 0; + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + + // The committed config starts off with an opid_index of -1, so choose something lower. 
+ boost::optional opid_index(-2); + tserver::TabletServerErrorPB::Code error_code; + ASSERT_OK(itest::WaitUntilTabletRunning(ts, tablet_id, timeout)); + + Status s; + for (int i = 0; i < 100; i++) { + s = itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, opid_index, timeout, + &error_code); + if (error_code == TabletServerErrorPB::CAS_FAILED) break; + // If we didn't get the expected CAS_FAILED error, it's OK to get 'TABLET_NOT_RUNNING' + // because the "creating" maintenance state persists just slightly after it starts to + // expose 'RUNNING' state in ListTablets() + ASSERT_EQ(TabletServerErrorPB::TABLET_NOT_RUNNING, error_code) + << "unexpected error: " << s.ToString(); + SleepFor(MonoDelta::FromMilliseconds(100)); + } + + ASSERT_EQ(TabletServerErrorPB::CAS_FAILED, error_code) << "unexpected error: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "of -2 but the committed config has opid_index of -1"); + + // Now use the "latest", which is -1. + opid_index = -1; + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, opid_index, timeout, + &error_code)); + inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED); + + // Now that the tablet is already tombstoned, our opid_index should be + // ignored (because it's impossible to check it). + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, -9999, timeout, + &error_code)); + inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_TOMBSTONED); + + // Same with TOMBSTONED -> DELETED. + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_DELETED, -9999, timeout, + &error_code)); + inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_DELETED); +} + +TEST_F(DeleteTableTest, TestDeleteTableWithConcurrentWrites) { + NO_FATALS(StartCluster()); + int n_iters = AllowSlowTests() ? 
20 : 1; + for (int i = 0; i < n_iters; i++) { + TestWorkload workload(cluster_.get()); + workload.set_table_name(Substitute("table-$0", i)); + + // We'll delete the table underneath the writers, so we expcted + // a NotFound error during the writes. + workload.set_not_found_allowed(true); + workload.Setup(); + + // Start the workload, and wait to see some rows actually inserted + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + // Delete it and wait for the replicas to get deleted. + NO_FATALS(DeleteTable(workload.table_name())); + for (int i = 0; i < 3; i++) { + NO_FATALS(WaitForTabletDeletedOnTS(i, tablet_id, SUPERBLOCK_EXPECTED)); + } + + // Sleep just a little longer to make sure client threads send + // requests to the missing tablets. + SleepFor(MonoDelta::FromMilliseconds(50)); + + workload.StopAndJoin(); + cluster_->AssertNoCrashes(); + + // Restart the cluster, the superblocks should be deleted on startup. + cluster_->Shutdown(); + ASSERT_OK(cluster_->Restart()); + ASSERT_OK(inspect_->WaitForNoData()); + } +} + +// Test that a tablet replica is automatically tombstoned on startup if a local +// crash occurs in the middle of remote bootstrap. +TEST_F(DeleteTableTest, TestAutoTombstoneAfterCrashDuringRemoteBootstrap) { + NO_FATALS(StartCluster()); + const MonoDelta timeout = MonoDelta::FromSeconds(10); + const int kTsIndex = 0; // We'll test with the first TS. + + // We'll do a config change to remote bootstrap a replica here later. For + // now, shut it down. + LOG(INFO) << "Shutting down TS " << cluster_->tablet_server(kTsIndex)->uuid(); + cluster_->tablet_server(kTsIndex)->Shutdown(); + + // Bounce the Master so it gets new tablet reports and doesn't try to assign + // a replica to the dead TS. 
+ cluster_->master()->Shutdown(); + ASSERT_OK(cluster_->master()->Restart()); + cluster_->WaitForTabletServerCount(2, timeout); + + // Start a workload on the cluster, and run it for a little while. + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(2); + workload.Setup(); + ASSERT_OK(inspect_->WaitForReplicaCount(2)); + + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + + // Enable a fault crash when remote bootstrap occurs on TS 0. + ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); + const string& kFaultFlag = "fault_crash_after_rb_files_fetched"; + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(kTsIndex), kFaultFlag, "1.0")); + + // Figure out the tablet id to remote bootstrap. + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + // Add our TS 0 to the config and wait for it to crash. + string leader_uuid = GetLeaderUUID(cluster_->tablet_server(1)->uuid(), tablet_id); + TServerDetails* leader = DCHECK_NOTNULL(ts_map_[leader_uuid]); + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + ASSERT_OK(itest::AddServer(leader, tablet_id, ts, RaftPeerPB::VOTER, boost::none, timeout)); + NO_FATALS(WaitForTSToCrash(kTsIndex)); + + // The superblock should be in TABLET_DATA_COPYING state on disk. + NO_FATALS(inspect_->CheckTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_COPYING)); + + // Kill the other tablet servers so the leader doesn't try to remote + // bootstrap it again during our verification here. + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + + // Now we restart the TS. It will clean up the failed remote bootstrap and + // convert it to TABLET_DATA_TOMBSTONED. It crashed, so we have to call + // Shutdown() then Restart() to bring it back up. 
+ cluster_->tablet_server(kTsIndex)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_NOT_EXPECTED)); +} + +// Test that a tablet replica automatically tombstones itself if the remote +// bootstrap source server fails in the middle of the remote bootstrap process. +// Also test that we can remotely bootstrap a tombstoned tablet. +TEST_F(DeleteTableTest, TestAutoTombstoneAfterRemoteBootstrapRemoteFails) { + vector flags; + flags.push_back("--log_segment_size_mb=1"); // Faster log rolls. + NO_FATALS(StartCluster(flags)); + const MonoDelta timeout = MonoDelta::FromSeconds(20); + const int kTsIndex = 0; // We'll test with the first TS. + + // We'll do a config change to remote bootstrap a replica here later. For + // now, shut it down. + LOG(INFO) << "Shutting down TS " << cluster_->tablet_server(kTsIndex)->uuid(); + cluster_->tablet_server(kTsIndex)->Shutdown(); + + // Bounce the Master so it gets new tablet reports and doesn't try to assign + // a replica to the dead TS. + cluster_->master()->Shutdown(); + ASSERT_OK(cluster_->master()->Restart()); + cluster_->WaitForTabletServerCount(2, timeout); + + // Start a workload on the cluster, and run it for a little while. + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(2); + workload.Setup(); + ASSERT_OK(inspect_->WaitForReplicaCount(2)); + + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + // Remote bootstrap doesn't see the active WAL segment, and we need to + // download a file to trigger the fault in this test. Due to the log index + // chunks, that means 3 files minimum: One in-flight WAL segment, one index + // chunk file (these files grow much more slowly than the WAL segments), and + // one completed WAL segment. 
+ string leader_uuid = GetLeaderUUID(cluster_->tablet_server(1)->uuid(), tablet_id); + int leader_index = cluster_->tablet_server_index_by_uuid(leader_uuid); + ASSERT_NE(-1, leader_index); + ASSERT_OK(inspect_->WaitForMinFilesInTabletWalDirOnTS(leader_index, tablet_id, 3)); + workload.StopAndJoin(); + + // Cause the leader to crash when a follower tries to remotely bootstrap from it. + const string& fault_flag = "fault_crash_on_handle_rb_fetch_data"; + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(leader_index), fault_flag, "1.0")); + + // Add our TS 0 to the config and wait for the leader to crash. + ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); + TServerDetails* leader = ts_map_[leader_uuid]; + TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()]; + ASSERT_OK(itest::AddServer(leader, tablet_id, ts, RaftPeerPB::VOTER, boost::none, timeout)); + NO_FATALS(WaitForTSToCrash(leader_index)); + + // The tablet server will detect that the leader failed, and automatically + // tombstone its replica. Shut down the other non-leader replica to avoid + // interference while we wait for this to happen. + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_NOT_EXPECTED)); + + // Now bring the other replicas back, and wait for the leader to remote + // bootstrap the tombstoned replica. This will have replaced a tablet with no + // consensus metadata. + ASSERT_OK(cluster_->tablet_server(1)->Restart()); + ASSERT_OK(cluster_->tablet_server(2)->Restart()); + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY)); + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST, + workload.rows_inserted())); + + // Now pause the other replicas and tombstone our replica again. 
+ ASSERT_OK(cluster_->tablet_server(1)->Pause()); + ASSERT_OK(cluster_->tablet_server(2)->Pause()); + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_NOT_EXPECTED)); + + // Bring them back again, let them yet again bootstrap our tombstoned replica. + // This time, the leader will have replaced a tablet with consensus metadata. + ASSERT_OK(cluster_->tablet_server(1)->Resume()); + ASSERT_OK(cluster_->tablet_server(2)->Resume()); + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY)); + + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST, + workload.rows_inserted())); +} + +// Test for correct remote bootstrap merge of consensus metadata. +TEST_F(DeleteTableTest, TestMergeConsensusMetadata) { + // Enable manual leader selection. + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + const MonoDelta timeout = MonoDelta::FromSeconds(10); + const int kTsIndex = 0; + + TestWorkload workload(cluster_.get()); + workload.Setup(); + ASSERT_OK(inspect_->WaitForReplicaCount(3)); + + // Figure out the tablet id to remote bootstrap. + vector tablets = inspect_->ListTabletsOnTS(1); + ASSERT_EQ(1, tablets.size()); + const string& tablet_id = tablets[0]; + + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + NO_FATALS(WaitUntilTabletRunning(i, tablet_id)); + } + + // Elect a leader and run some data through the cluster. 
+ int leader_index = 1; + string leader_uuid = cluster_->tablet_server(leader_index)->uuid(); + ASSERT_OK(itest::StartElection(ts_map_[leader_uuid], tablet_id, timeout)); + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed())); + + // Verify that TS 0 voted for the chosen leader. + ConsensusMetadataPB cmeta_pb; + ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb)); + ASSERT_EQ(1, cmeta_pb.current_term()); + ASSERT_EQ(leader_uuid, cmeta_pb.voted_for()); + + // Shut down all but TS 0 and try to elect TS 0. The election will fail but + // the TS will record a vote for itself as well as a new term (term 2). + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + NO_FATALS(WaitUntilTabletRunning(kTsIndex, tablet_id)); + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + ASSERT_OK(itest::StartElection(ts, tablet_id, timeout)); + for (int i = 0; i < 6000; i++) { + Status s = inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb); + if (s.ok() && + cmeta_pb.current_term() == 2 && + cmeta_pb.voted_for() == ts->uuid()) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_EQ(2, cmeta_pb.current_term()); + ASSERT_EQ(ts->uuid(), cmeta_pb.voted_for()); + + // Tombstone our special little guy, then shut him down. + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_EXPECTED)); + cluster_->tablet_server(kTsIndex)->Shutdown(); + + // Restart the other dudes and re-elect the same leader. 
+ ASSERT_OK(cluster_->tablet_server(1)->Restart()); + ASSERT_OK(cluster_->tablet_server(2)->Restart()); + TServerDetails* leader = ts_map_[leader_uuid]; + NO_FATALS(WaitUntilTabletRunning(1, tablet_id)); + NO_FATALS(WaitUntilTabletRunning(2, tablet_id)); + ASSERT_OK(itest::StartElection(leader, tablet_id, timeout)); + ASSERT_OK(itest::WaitUntilLeader(leader, tablet_id, timeout)); + + // Bring our special little guy back up. + // Wait until he gets remote bootstrapped. + LOG(INFO) << "Bringing TS " << cluster_->tablet_server(kTsIndex)->uuid() + << " back up..."; + ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY)); + + // Assert that the election history is retained (voted for self). + ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb)); + ASSERT_EQ(2, cmeta_pb.current_term()); + ASSERT_EQ(ts->uuid(), cmeta_pb.voted_for()); + + // Now do the same thing as above, where we tombstone TS 0 then trigger a new + // term (term 3) on the other machines. TS 0 will get remotely bootstrapped + // again, but this time the vote record on TS 0 for term 2 should not be + // retained after remote bootstrap occurs. + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + + // Delete with retries because the tablet might still be bootstrapping. + NO_FATALS(DeleteTabletWithRetries(ts, tablet_id, TABLET_DATA_TOMBSTONED, timeout)); + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_EXPECTED)); + + ASSERT_OK(cluster_->tablet_server(1)->Restart()); + ASSERT_OK(cluster_->tablet_server(2)->Restart()); + NO_FATALS(WaitUntilTabletRunning(1, tablet_id)); + NO_FATALS(WaitUntilTabletRunning(2, tablet_id)); + ASSERT_OK(itest::StartElection(leader, tablet_id, timeout)); + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kTsIndex, tablet_id, TABLET_DATA_READY)); + + // The election history should have been wiped out. 
+ ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(kTsIndex, tablet_id, &cmeta_pb)); + ASSERT_EQ(3, cmeta_pb.current_term()); + ASSERT_TRUE(!cmeta_pb.has_voted_for()) << cmeta_pb.ShortDebugString(); +} + +// Regression test for KUDU-987, a bug where followers with transactions in +// REPLICATING state, which means they have not yet been committed to a +// majority, cannot shut down during a DeleteTablet() call. +TEST_F(DeleteTableTest, TestDeleteFollowerWithReplicatingTransaction) { + if (!AllowSlowTests()) { + // We will typically wait at least 5 seconds for timeouts to occur. + LOG(INFO) << "Skipping test in fast-test mode."; + return; + } + + const MonoDelta timeout = MonoDelta::FromSeconds(10); + + const int kNumTabletServers = 5; + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + ts_flags.push_back("--flush_threshold_mb=0"); // Always be flushing. + ts_flags.push_back("--maintenance_manager_polling_interval_ms=100"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags, kNumTabletServers)); + + const int kTsIndex = 0; // We'll test with the first TS. + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + + // Create the table. + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(kNumTabletServers); + workload.Setup(); + + // Figure out the tablet ids of the created tablets. + vector tablets; + ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets)); + const string& tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect TS 1 as leader. 
+ const int kLeaderIndex = 1; + const string kLeaderUuid = cluster_->tablet_server(kLeaderIndex)->uuid(); + TServerDetails* leader = ts_map_[kLeaderUuid]; + ASSERT_OK(itest::StartElection(leader, tablet_id, timeout)); + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); + + // Kill a majority, but leave the leader and a single follower. + LOG(INFO) << "Killing majority"; + for (int i = 2; i < kNumTabletServers; i++) { + cluster_->tablet_server(i)->Shutdown(); + } + + // Now write a single row to the leader. + // We give 5 seconds for the timeout to pretty much guarantee that a flush + // will occur due to the low flush threshold we set. + LOG(INFO) << "Writing a row"; + Status s = WriteSimpleTestRow(leader, tablet_id, RowOperationsPB::INSERT, + 1, 1, "hola, world", MonoDelta::FromSeconds(5)); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_STR_CONTAINS(s.ToString(), "timed out"); + + LOG(INFO) << "Killing the leader..."; + cluster_->tablet_server(kLeaderIndex)->Shutdown(); + + // Now tombstone the follower tablet. This should succeed even though there + // are uncommitted operations on the replica. + LOG(INFO) << "Tombstoning tablet " << tablet_id << " on TS " << ts->uuid(); + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); +} + +// Test that orphaned blocks are cleared from the superblock when a tablet is +// tombstoned. +TEST_F(DeleteTableTest, TestOrphanedBlocksClearedOnDelete) { + const MonoDelta timeout = MonoDelta::FromSeconds(30); + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + ts_flags.push_back("--flush_threshold_mb=0"); // Flush quickly since we wait for a flush to occur. 
+ ts_flags.push_back("--maintenance_manager_polling_interval_ms=100"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + + const int kFollowerIndex = 0; + TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; + + // Create the table. + TestWorkload workload(cluster_.get()); + workload.Setup(); + + // Figure out the tablet id of the created tablet. + vector tablets; + ASSERT_OK(WaitForNumTabletsOnTS(follower_ts, 1, timeout, &tablets)); + const string& tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect TS 1 as leader. + const int kLeaderIndex = 1; + const string kLeaderUuid = cluster_->tablet_server(kLeaderIndex)->uuid(); + TServerDetails* leader_ts = ts_map_[kLeaderUuid]; + ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); + + // Run a write workload and wait until we see some rowsets flush on the follower. + workload.Start(); + TabletSuperBlockPB superblock_pb; + for (int i = 0; i < 3000; i++) { + ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(kFollowerIndex, tablet_id, &superblock_pb)); + if (!superblock_pb.rowsets().empty()) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_GT(superblock_pb.rowsets_size(), 0) + << "Timed out waiting for rowset flush on TS " << follower_ts->uuid() << ": " + << "Superblock:\n" << superblock_pb.DebugString(); + + // Shut down the leader so it doesn't try to bootstrap our follower later. 
+ workload.StopAndJoin(); + cluster_->tablet_server(kLeaderIndex)->Shutdown(); + + // Tombstone the follower and check that there are no rowsets or orphaned + // blocks retained in the superblock. + ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, + boost::none, timeout)); + NO_FATALS(WaitForTabletTombstonedOnTS(kFollowerIndex, tablet_id, CMETA_EXPECTED)); + ASSERT_OK(inspect_->ReadTabletSuperBlockOnTS(kFollowerIndex, tablet_id, &superblock_pb)); + ASSERT_EQ(0, superblock_pb.rowsets_size()) << superblock_pb.DebugString(); + ASSERT_EQ(0, superblock_pb.orphaned_blocks_size()) << superblock_pb.DebugString(); +} + +vector Grep(const string& needle, const vector& haystack) { + vector results; + for (const string& s : haystack) { + if (s.find(needle) != string::npos) { + results.push_back(&s); + } + } + return results; +} + +vector ListOpenFiles(pid_t pid) { + string cmd = strings::Substitute("export PATH=$$PATH:/usr/bin:/usr/sbin; lsof -n -p $0", pid); + vector argv = { "bash", "-c", cmd }; + string out; + CHECK_OK(Subprocess::Call(argv, &out)); + vector lines = strings::Split(out, "\n"); + return lines; +} + +int PrintOpenTabletFiles(pid_t pid, const string& tablet_id) { + vector lines = ListOpenFiles(pid); + vector wal_lines = Grep(tablet_id, lines); + LOG(INFO) << "There are " << wal_lines.size() << " open WAL files for pid " << pid << ":"; + for (const string* l : wal_lines) { + LOG(INFO) << *l; + } + return wal_lines.size(); +} + +// Regression test for tablet deletion FD leak. See KUDU-1288. +TEST_F(DeleteTableTest, TestFDsNotLeakedOnTabletTombstone) { + const MonoDelta timeout = MonoDelta::FromSeconds(30); + + vector ts_flags, master_flags; + NO_FATALS(StartCluster(ts_flags, master_flags, 1)); + + // Create the table. 
+ TestWorkload workload(cluster_.get()); + workload.set_num_replicas(1); + workload.Setup(); + workload.Start(); + while (workload.rows_inserted() < 1000) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + + // Figure out the tablet id of the created tablet. + vector tablets; + ASSERT_OK(WaitForNumTabletsOnTS(ts_map_.begin()->second, 1, timeout, &tablets)); + const string& tablet_id = tablets[0].tablet_status().tablet_id(); + + // Tombstone the tablet and then ensure that lsof does not list any + // tablet-related paths. + ExternalTabletServer* ets = cluster_->tablet_server(0); + ASSERT_OK(itest::DeleteTablet(ts_map_[ets->uuid()], + tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + ASSERT_EQ(0, PrintOpenTabletFiles(ets->pid(), tablet_id)); + + // Restart the TS after deletion and then do the same lsof check again. + ets->Shutdown(); + ASSERT_OK(ets->Restart()); + ASSERT_EQ(0, PrintOpenTabletFiles(ets->pid(), tablet_id)); +} + +// Parameterized test case for TABLET_DATA_DELETED deletions. +class DeleteTableDeletedParamTest : public DeleteTableTest, + public ::testing::WithParamInterface { +}; + +// Test that if a server crashes mid-delete that the delete will be rolled +// forward on startup. Parameterized by different fault flags that cause a +// crash at various points. +TEST_P(DeleteTableDeletedParamTest, TestRollForwardDelete) { + NO_FATALS(StartCluster()); + const string fault_flag = GetParam(); + LOG(INFO) << "Running with fault flag: " << fault_flag; + + // Dynamically set the fault flag so they crash when DeleteTablet() is called + // by the Master. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(i), fault_flag, "1.0")); + } + + // Create a table on the cluster. We're just using TestWorkload + // as a convenient way to create it. + TestWorkload(cluster_.get()).Setup(); + + // The table should have replicas on all three tservers. 
+ ASSERT_OK(inspect_->WaitForReplicaCount(3)); + + // Delete it and wait for the tablet servers to crash. + NO_FATALS(DeleteTable(TestWorkload::kDefaultTableName)); + NO_FATALS(WaitForAllTSToCrash()); + + // There should still be data left on disk. + Status s = inspect_->CheckNoData(); + ASSERT_TRUE(s.IsIllegalState()) << s.ToString(); + + // Now restart the tablet servers. They should roll forward their deletes. + // We don't have to reset the fault flag here because it was set dynamically. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + cluster_->tablet_server(i)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(i)->Restart()); + } + ASSERT_OK(inspect_->WaitForNoData()); +} + +// Faults appropriate for the TABLET_DATA_DELETED case. +const char* deleted_faults[] = {"fault_crash_after_blocks_deleted", + "fault_crash_after_wal_deleted", + "fault_crash_after_cmeta_deleted"}; + +INSTANTIATE_TEST_CASE_P(FaultFlags, DeleteTableDeletedParamTest, + ::testing::ValuesIn(deleted_faults)); + +// Parameterized test case for TABLET_DATA_TOMBSTONED deletions. +class DeleteTableTombstonedParamTest : public DeleteTableTest, + public ::testing::WithParamInterface { +}; + +// Regression test for tablet tombstoning. Tests: +// 1. basic creation & tombstoning of a tablet. +// 2. roll-forward (crash recovery) of a partially-completed tombstoning of a tablet. +// 3. permanent deletion of a TOMBSTONED tablet +// (transition from TABLET_DATA_TOMBSTONED to TABLET_DATA_DELETED). +TEST_P(DeleteTableTombstonedParamTest, TestTabletTombstone) { + vector flags; + flags.push_back("--log_segment_size_mb=1"); // Faster log rolls. + NO_FATALS(StartCluster(flags)); + const string fault_flag = GetParam(); + LOG(INFO) << "Running with fault flag: " << fault_flag; + + MonoDelta timeout = MonoDelta::FromSeconds(30); + + // Create a table with 2 tablets. 
We delete the first tablet without + // injecting any faults, then we delete the second tablet while exercising + // several fault injection points. + const int kNumTablets = 2; + vector split_rows; + Schema schema(GetSimpleTestSchema()); + client::KuduSchema client_schema(client::KuduSchemaFromSchema(schema)); + KuduPartialRow* split_row = client_schema.NewRow(); + ASSERT_OK(split_row->SetInt32(0, numeric_limits::max() / kNumTablets)); + split_rows.push_back(split_row); + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(TestWorkload::kDefaultTableName) + .split_rows(split_rows) + .schema(&client_schema) + .num_replicas(3) + .Create()); + + // Start a workload on the cluster, and run it until we find WALs on disk. + TestWorkload workload(cluster_.get()); + workload.Setup(); + + // The table should have 2 tablets (1 split) on all 3 tservers (for a total of 6). + ASSERT_OK(inspect_->WaitForReplicaCount(6)); + + // Set up the proxies so we can easily send DeleteTablet() RPCs. + TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()]; + + // Ensure the tablet server is reporting 2 tablets. + vector tablets; + ASSERT_OK(itest::WaitForNumTabletsOnTS(ts, 2, timeout, &tablets)); + + // Run the workload against whoever the leader is until WALs appear on TS 0 + // for the tablets we created. + const int kTsIndex = 0; // Index of the tablet server we'll use for the test. + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_OK(inspect_->WaitForMinFilesInTabletWalDirOnTS(kTsIndex, + tablets[0].tablet_status().tablet_id(), 3)); + ASSERT_OK(inspect_->WaitForMinFilesInTabletWalDirOnTS(kTsIndex, + tablets[1].tablet_status().tablet_id(), 3)); + workload.StopAndJoin(); + + // Shut down the master and the other tablet servers so they don't interfere + // by attempting to create tablets or remote bootstrap while we delete tablets. 
+ cluster_->master()->Shutdown(); + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + + // Tombstone the first tablet. + string tablet_id = tablets[0].tablet_status().tablet_id(); + LOG(INFO) << "Tombstoning first tablet " << tablet_id << "..."; + ASSERT_TRUE(inspect_->DoesConsensusMetaExistForTabletOnTS(kTsIndex, tablet_id)) << tablet_id; + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + LOG(INFO) << "Waiting for first tablet to be tombstoned..."; + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_EXPECTED)); + + ASSERT_OK(itest::WaitForNumTabletsOnTS(ts, 2, timeout, &tablets)); + for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) { + if (t.tablet_status().tablet_id() == tablet_id) { + ASSERT_EQ(tablet::SHUTDOWN, t.tablet_status().state()); + ASSERT_EQ(TABLET_DATA_TOMBSTONED, t.tablet_status().tablet_data_state()) + << t.tablet_status().tablet_id() << " not tombstoned"; + } + } + + // Now tombstone the 2nd tablet, causing a fault. + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(kTsIndex), fault_flag, "1.0")); + tablet_id = tablets[1].tablet_status().tablet_id(); + LOG(INFO) << "Tombstoning second tablet " << tablet_id << "..."; + ignore_result(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + NO_FATALS(WaitForTSToCrash(kTsIndex)); + + // Restart the tablet server and wait for the WALs to be deleted and for the + // superblock to show that it is tombstoned. + cluster_->tablet_server(kTsIndex)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); + LOG(INFO) << "Waiting for second tablet to be tombstoned..."; + NO_FATALS(WaitForTabletTombstonedOnTS(kTsIndex, tablet_id, CMETA_EXPECTED)); + + // The tombstoned tablets will still show up in ListTablets(), + // just with their data state set as TOMBSTONED. They should also be listed + // as NOT_STARTED because we restarted the server. 
+ ASSERT_OK(itest::WaitForNumTabletsOnTS(ts, 2, timeout, &tablets)); + for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) { + ASSERT_EQ(tablet::NOT_STARTED, t.tablet_status().state()); + ASSERT_EQ(TABLET_DATA_TOMBSTONED, t.tablet_status().tablet_data_state()) + << t.tablet_status().tablet_id() << " not tombstoned"; + } + + // Finally, delete all tablets on the TS, and wait for all data to be gone. + LOG(INFO) << "Deleting all tablets..."; + for (const ListTabletsResponsePB::StatusAndSchemaPB& tablet : tablets) { + string tablet_id = tablet.tablet_status().tablet_id(); + // We need retries here, since some of the tablets may still be + // bootstrapping after being restarted above. + NO_FATALS(DeleteTabletWithRetries(ts, tablet_id, TABLET_DATA_DELETED, timeout)); + NO_FATALS(WaitForTabletDeletedOnTS(kTsIndex, tablet_id, SUPERBLOCK_EXPECTED)); + } + + // Restart the TS, the superblock should be deleted on startup. + cluster_->tablet_server(kTsIndex)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(kTsIndex)->Restart()); + ASSERT_OK(inspect_->WaitForNoDataOnTS(kTsIndex)); +} + +// Faults appropriate for the TABLET_DATA_TOMBSTONED case. +// Tombstoning a tablet does not delete the consensus metadata. +const char* tombstoned_faults[] = {"fault_crash_after_blocks_deleted", + "fault_crash_after_wal_deleted"}; + +INSTANTIATE_TEST_CASE_P(FaultFlags, DeleteTableTombstonedParamTest, + ::testing::ValuesIn(tombstoned_faults)); + +} // namespace kudu diff --git a/src/kudu/integration-tests/external_mini_cluster-itest-base.h b/src/kudu/integration-tests/external_mini_cluster-itest-base.h new file mode 100644 index 000000000000..e2a9cf5f44e2 --- /dev/null +++ b/src/kudu/integration-tests/external_mini_cluster-itest-base.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_ITEST_BASE_H_ +#define KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_ITEST_BASE_H_ + +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/external_mini_cluster_fs_inspector.h" +#include "kudu/util/pstack_watcher.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +// Simple base utility class to provide an external mini cluster with common +// setup routines useful for integration tests. +class ExternalMiniClusterITestBase : public KuduTest { + public: + virtual void TearDown() OVERRIDE { + if (cluster_) { + if (HasFatalFailure()) { + LOG(INFO) << "Found fatal failure"; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + if (!cluster_->tablet_server(i)->IsProcessAlive()) { + LOG(INFO) << "Tablet server " << i << " is not running. 
Cannot dump its stacks."; + continue; + } + LOG(INFO) << "Attempting to dump stacks of TS " << i + << " with UUID " << cluster_->tablet_server(i)->uuid() + << " and pid " << cluster_->tablet_server(i)->pid(); + WARN_NOT_OK(PstackWatcher::DumpPidStacks(cluster_->tablet_server(i)->pid()), + "Couldn't dump stacks"); + } + } + cluster_->Shutdown(); + } + KuduTest::TearDown(); + STLDeleteValues(&ts_map_); + } + + protected: + void StartCluster(const std::vector& extra_ts_flags = std::vector(), + const std::vector& extra_master_flags = std::vector(), + int num_tablet_servers = 3); + + gscoped_ptr cluster_; + gscoped_ptr inspect_; + client::sp::shared_ptr client_; + std::unordered_map ts_map_; +}; + +void ExternalMiniClusterITestBase::StartCluster(const std::vector& extra_ts_flags, + const std::vector& extra_master_flags, + int num_tablet_servers) { + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = num_tablet_servers; + opts.extra_master_flags = extra_master_flags; + opts.extra_tserver_flags = extra_ts_flags; + opts.extra_tserver_flags.push_back("--never_fsync"); // fsync causes flakiness on EC2. + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get())); + ASSERT_OK(itest::CreateTabletServerMap(cluster_->master_proxy().get(), + cluster_->messenger(), + &ts_map_)); + client::KuduClientBuilder builder; + ASSERT_OK(cluster_->CreateClient(builder, &client_)); +} + +} // namespace kudu + +#endif // KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_ITEST_BASE_H_ diff --git a/src/kudu/integration-tests/external_mini_cluster-test.cc b/src/kudu/integration-tests/external_mini_cluster-test.cc new file mode 100644 index 000000000000..cb3d37b06c31 --- /dev/null +++ b/src/kudu/integration-tests/external_mini_cluster-test.cc @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/metrics.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/test_util.h" + +METRIC_DECLARE_entity(server); +METRIC_DECLARE_gauge_uint64(threads_running); + +namespace kudu { + +class EMCTest : public KuduTest { + public: + EMCTest() { + // Hard-coded RPC ports for the masters. This is safe, as this unit test + // runs under a resource lock (see CMakeLists.txt in this directory). + // TODO we should have a generic method to obtain n free ports. + master_peer_ports_ = { 11010, 11011, 11012 }; + } + + protected: + std::vector master_peer_ports_; +}; + +TEST_F(EMCTest, TestBasicOperation) { + ExternalMiniClusterOptions opts; + opts.num_masters = master_peer_ports_.size(); + opts.num_tablet_servers = 3; + opts.master_rpc_ports = master_peer_ports_; + + ExternalMiniCluster cluster(opts); + ASSERT_OK(cluster.Start()); + + // Verify each of the masters. 
+ for (int i = 0; i < opts.num_masters; i++) { + SCOPED_TRACE(i); + ExternalMaster* master = CHECK_NOTNULL(cluster.master(i)); + HostPort master_rpc = master->bound_rpc_hostport(); + EXPECT_TRUE(HasPrefixString(master_rpc.ToString(), "127.0.0.1:")) << master_rpc.ToString(); + + HostPort master_http = master->bound_http_hostport(); + EXPECT_TRUE(HasPrefixString(master_http.ToString(), "127.0.0.1:")) << master_http.ToString(); + + // Retrieve a thread metric, which should always be present on any master. + int64_t value; + ASSERT_OK(master->GetInt64Metric(&METRIC_ENTITY_server, + "kudu.master", + &METRIC_threads_running, + "value", + &value)); + EXPECT_GT(value, 0); + } + + // Verify each of the tablet servers. + for (int i = 0; i < opts.num_tablet_servers; i++) { + SCOPED_TRACE(i); + ExternalTabletServer* ts = CHECK_NOTNULL(cluster.tablet_server(i)); + HostPort ts_rpc = ts->bound_rpc_hostport(); + string expected_prefix = strings::Substitute("$0:", cluster.GetBindIpForTabletServer(i)); + EXPECT_NE(expected_prefix, "127.0.0.1") << "Should bind to unique per-server hosts"; + EXPECT_TRUE(HasPrefixString(ts_rpc.ToString(), expected_prefix)) << ts_rpc.ToString(); + + HostPort ts_http = ts->bound_http_hostport(); + EXPECT_TRUE(HasPrefixString(ts_http.ToString(), expected_prefix)) << ts_http.ToString(); + + // Retrieve a thread metric, which should always be present on any TS. + int64_t value; + ASSERT_OK(ts->GetInt64Metric(&METRIC_ENTITY_server, + "kudu.tabletserver", + &METRIC_threads_running, + "value", + &value)); + EXPECT_GT(value, 0); + } + + // Restart a master and a tablet server. Make sure they come back up with the same ports. 
+ ExternalMaster* master = cluster.master(0); + HostPort master_rpc = master->bound_rpc_hostport(); + HostPort master_http = master->bound_http_hostport(); + + master->Shutdown(); + ASSERT_OK(master->Restart()); + + ASSERT_EQ(master_rpc.ToString(), master->bound_rpc_hostport().ToString()); + ASSERT_EQ(master_http.ToString(), master->bound_http_hostport().ToString()); + + ExternalTabletServer* ts = cluster.tablet_server(0); + + HostPort ts_rpc = ts->bound_rpc_hostport(); + HostPort ts_http = ts->bound_http_hostport(); + + ts->Shutdown(); + ASSERT_OK(ts->Restart()); + + ASSERT_EQ(ts_rpc.ToString(), ts->bound_rpc_hostport().ToString()); + ASSERT_EQ(ts_http.ToString(), ts->bound_http_hostport().ToString()); + + cluster.Shutdown(); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/external_mini_cluster.cc b/src/kudu/integration-tests/external_mini_cluster.cc new file mode 100644 index 000000000000..2f15edb67c98 --- /dev/null +++ b/src/kudu/integration-tests/external_mini_cluster.cc @@ -0,0 +1,870 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/integration-tests/external_mini_cluster.h" + +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/master_rpc.h" +#include "kudu/server/server_base.pb.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/rpc/messenger.h" +#include "kudu/util/async_util.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/env.h" +#include "kudu/util/jsonreader.h" +#include "kudu/util/metrics.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/subprocess.h" +#include "kudu/util/test_util.h" + +using kudu::master::GetLeaderMasterRpc; +using kudu::master::MasterServiceProxy; +using kudu::server::ServerStatusPB; +using kudu::tserver::ListTabletsRequestPB; +using kudu::tserver::ListTabletsResponsePB; +using kudu::tserver::TabletServerServiceProxy; +using rapidjson::Value; +using std::string; +using strings::Substitute; + +typedef ListTabletsResponsePB::StatusAndSchemaPB StatusAndSchemaPB; + +namespace kudu { + +static const char* const kMasterBinaryName = "kudu-master"; +static const char* const kTabletServerBinaryName = "kudu-tserver"; +static double kProcessStartTimeoutSeconds = 30.0; +static double kTabletServerRegistrationTimeoutSeconds = 10.0; + +#if defined(__APPLE__) +static bool kBindToUniqueLoopbackAddress = false; +#else +static bool kBindToUniqueLoopbackAddress = true; +#endif + +ExternalMiniClusterOptions::ExternalMiniClusterOptions() + : num_masters(1), + num_tablet_servers(1), + bind_to_unique_loopback_addresses(kBindToUniqueLoopbackAddress) { +} + +ExternalMiniClusterOptions::~ExternalMiniClusterOptions() { +} + 
+ +ExternalMiniCluster::ExternalMiniCluster(const ExternalMiniClusterOptions& opts) + : opts_(opts) { +} + +ExternalMiniCluster::~ExternalMiniCluster() { + Shutdown(); +} + +Status ExternalMiniCluster::DeduceBinRoot(std::string* ret) { + string exe; + RETURN_NOT_OK(Env::Default()->GetExecutablePath(&exe)); + *ret = DirName(exe); + return Status::OK(); +} + +Status ExternalMiniCluster::HandleOptions() { + daemon_bin_path_ = opts_.daemon_bin_path; + if (daemon_bin_path_.empty()) { + RETURN_NOT_OK(DeduceBinRoot(&daemon_bin_path_)); + } + + data_root_ = opts_.data_root; + if (data_root_.empty()) { + // If they don't specify a data root, use the current gtest directory. + data_root_ = JoinPathSegments(GetTestDataDirectory(), "minicluster-data"); + } + + return Status::OK(); +} + +Status ExternalMiniCluster::Start() { + CHECK(masters_.empty()) << "Masters are not empty (size: " << masters_.size() + << "). Maybe you meant Restart()?"; + CHECK(tablet_servers_.empty()) << "Tablet servers are not empty (size: " + << tablet_servers_.size() << "). 
Maybe you meant Restart()?"; + RETURN_NOT_OK(HandleOptions()); + + RETURN_NOT_OK_PREPEND(rpc::MessengerBuilder("minicluster-messenger") + .set_num_reactors(1) + .set_negotiation_threads(1) + .Build(&messenger_), + "Failed to start Messenger for minicluster"); + + Status s = Env::Default()->CreateDir(data_root_); + if (!s.ok() && !s.IsAlreadyPresent()) { + RETURN_NOT_OK_PREPEND(s, "Could not create root dir " + data_root_); + } + + if (opts_.num_masters != 1) { + RETURN_NOT_OK_PREPEND(StartDistributedMasters(), + "Failed to add distributed masters"); + } else { + RETURN_NOT_OK_PREPEND(StartSingleMaster(), + Substitute("Failed to start a single Master")); + } + + for (int i = 1; i <= opts_.num_tablet_servers; i++) { + RETURN_NOT_OK_PREPEND(AddTabletServer(), + Substitute("Failed starting tablet server $0", i)); + } + RETURN_NOT_OK(WaitForTabletServerCount( + opts_.num_tablet_servers, + MonoDelta::FromSeconds(kTabletServerRegistrationTimeoutSeconds))); + + return Status::OK(); +} + +void ExternalMiniCluster::Shutdown(NodeSelectionMode mode) { + if (mode == ALL) { + for (const scoped_refptr& master : masters_) { + if (master) { + master->Shutdown(); + } + } + } + + for (const scoped_refptr& ts : tablet_servers_) { + ts->Shutdown(); + } +} + +Status ExternalMiniCluster::Restart() { + for (const scoped_refptr& master : masters_) { + if (master && master->IsShutdown()) { + RETURN_NOT_OK_PREPEND(master->Restart(), "Cannot restart master bound at: " + + master->bound_rpc_hostport().ToString()); + } + } + + for (const scoped_refptr& ts : tablet_servers_) { + if (ts->IsShutdown()) { + RETURN_NOT_OK_PREPEND(ts->Restart(), "Cannot restart tablet server bound at: " + + ts->bound_rpc_hostport().ToString()); + } + } + + RETURN_NOT_OK(WaitForTabletServerCount( + tablet_servers_.size(), + MonoDelta::FromSeconds(kTabletServerRegistrationTimeoutSeconds))); + + return Status::OK(); +} + +string ExternalMiniCluster::GetBinaryPath(const string& binary) const { + 
CHECK(!daemon_bin_path_.empty()); + return JoinPathSegments(daemon_bin_path_, binary); +} + +string ExternalMiniCluster::GetDataPath(const string& daemon_id) const { + CHECK(!data_root_.empty()); + return JoinPathSegments(data_root_, daemon_id); +} + +namespace { +vector SubstituteInFlags(const vector& orig_flags, + int index) { + string str_index = strings::Substitute("$0", index); + vector ret; + for (const string& orig : orig_flags) { + ret.push_back(StringReplace(orig, "${index}", str_index, true)); + } + return ret; +} + +} // anonymous namespace + +Status ExternalMiniCluster::StartSingleMaster() { + string exe = GetBinaryPath(kMasterBinaryName); + scoped_refptr master = + new ExternalMaster(messenger_, exe, GetDataPath("master"), + SubstituteInFlags(opts_.extra_master_flags, 0)); + RETURN_NOT_OK(master->Start()); + masters_.push_back(master); + return Status::OK(); +} + +Status ExternalMiniCluster::StartDistributedMasters() { + int num_masters = opts_.num_masters; + + if (opts_.master_rpc_ports.size() != num_masters) { + LOG(FATAL) << num_masters << " masters requested, but only " << + opts_.master_rpc_ports.size() << " ports specified in 'master_rpc_ports'"; + } + + vector peer_addrs; + for (int i = 0; i < num_masters; i++) { + string addr = Substitute("127.0.0.1:$0", opts_.master_rpc_ports[i]); + peer_addrs.push_back(addr); + } + string peer_addrs_str = JoinStrings(peer_addrs, ","); + vector flags = opts_.extra_master_flags; + flags.push_back("--master_addresses=" + peer_addrs_str); + flags.push_back("--enable_leader_failure_detection=true"); + string exe = GetBinaryPath(kMasterBinaryName); + + // Start the masters. 
+ for (int i = 0; i < num_masters; i++) { + scoped_refptr peer = + new ExternalMaster(messenger_, + exe, + GetDataPath(Substitute("master-$0", i)), + peer_addrs[i], + SubstituteInFlags(flags, i)); + RETURN_NOT_OK_PREPEND(peer->Start(), + Substitute("Unable to start Master at index $0", i)); + masters_.push_back(peer); + } + + return Status::OK(); +} + +string ExternalMiniCluster::GetBindIpForTabletServer(int index) const { + if (opts_.bind_to_unique_loopback_addresses) { + pid_t p = getpid(); + CHECK_LE(p, MathLimits::kMax) << "Cannot run on systems with >16-bit pid"; + return Substitute("127.$0.$1.$2", p >> 8, p & 0xff, index); + } else { + return "127.0.0.1"; + } +} + +Status ExternalMiniCluster::AddTabletServer() { + CHECK(leader_master() != nullptr) + << "Must have started at least 1 master before adding tablet servers"; + + int idx = tablet_servers_.size(); + + string exe = GetBinaryPath(kTabletServerBinaryName); + vector master_hostports; + for (int i = 0; i < num_masters(); i++) { + master_hostports.push_back(DCHECK_NOTNULL(master(i))->bound_rpc_hostport()); + } + + scoped_refptr ts = + new ExternalTabletServer(messenger_, exe, GetDataPath(Substitute("ts-$0", idx)), + GetBindIpForTabletServer(idx), + master_hostports, + SubstituteInFlags(opts_.extra_tserver_flags, idx)); + RETURN_NOT_OK(ts->Start()); + tablet_servers_.push_back(ts); + return Status::OK(); +} + +Status ExternalMiniCluster::WaitForTabletServerCount(int count, const MonoDelta& timeout) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + + while (true) { + MonoDelta remaining = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + if (remaining.ToSeconds() < 0) { + return Status::TimedOut(Substitute("$0 TS(s) never registered with master", count)); + } + + for (int i = 0; i < masters_.size(); i++) { + master::ListTabletServersRequestPB req; + master::ListTabletServersResponsePB resp; + rpc::RpcController rpc; + rpc.set_timeout(remaining); + 
RETURN_NOT_OK_PREPEND(master_proxy(i)->ListTabletServers(req, &resp, &rpc), + "ListTabletServers RPC failed"); + // ListTabletServers() may return servers that are no longer online. + // Do a second step of verification to verify that the descs that we got + // are aligned (same uuid/seqno) with the TSs that we have in the cluster. + int match_count = 0; + for (const master::ListTabletServersResponsePB_Entry& e : resp.servers()) { + for (const scoped_refptr& ets : tablet_servers_) { + if (ets->instance_id().permanent_uuid() == e.instance_id().permanent_uuid() && + ets->instance_id().instance_seqno() == e.instance_id().instance_seqno()) { + match_count++; + break; + } + } + } + if (match_count == count) { + LOG(INFO) << count << " TS(s) registered with Master"; + return Status::OK(); + } + } + SleepFor(MonoDelta::FromMilliseconds(1)); + } +} + +void ExternalMiniCluster::AssertNoCrashes() { + vector daemons = this->daemons(); + for (ExternalDaemon* d : daemons) { + if (d->IsShutdown()) continue; + EXPECT_TRUE(d->IsProcessAlive()) << "At least one process crashed"; + } +} + +Status ExternalMiniCluster::WaitForTabletsRunning(ExternalTabletServer* ts, + const MonoDelta& timeout) { + TabletServerServiceProxy proxy(messenger_, ts->bound_rpc_addr()); + ListTabletsRequestPB req; + ListTabletsResponsePB resp; + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + rpc::RpcController rpc; + rpc.set_timeout(MonoDelta::FromSeconds(10)); + RETURN_NOT_OK(proxy.ListTablets(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + int num_not_running = 0; + for (const StatusAndSchemaPB& status : resp.status_and_schema()) { + if (status.tablet_status().state() != tablet::RUNNING) { + num_not_running++; + } + } + + if (num_not_running == 0) { + return Status::OK(); + } + + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + return 
Status::TimedOut(resp.DebugString()); +} + +namespace { +void LeaderMasterCallback(HostPort* dst_hostport, + Synchronizer* sync, + const Status& status, + const HostPort& result) { + if (status.ok()) { + *dst_hostport = result; + } + sync->StatusCB(status); +} +} // anonymous namespace + +Status ExternalMiniCluster::GetLeaderMasterIndex(int* idx) { + scoped_refptr rpc; + Synchronizer sync; + vector addrs; + HostPort leader_master_hp; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(5)); + + for (const scoped_refptr& master : masters_) { + addrs.push_back(master->bound_rpc_addr()); + } + rpc.reset(new GetLeaderMasterRpc(Bind(&LeaderMasterCallback, + &leader_master_hp, + &sync), + addrs, + deadline, + messenger_)); + rpc->SendRpc(); + RETURN_NOT_OK(sync.Wait()); + bool found = false; + for (int i = 0; i < masters_.size(); i++) { + if (masters_[i]->bound_rpc_hostport().port() == leader_master_hp.port()) { + found = true; + *idx = i; + break; + } + } + if (!found) { + // There is never a situation where shis should happen, so it's + // better to exit with a FATAL log message right away vs. return a + // Status::IllegalState(). 
+ LOG(FATAL) << "Leader master is not in masters_"; + } + return Status::OK(); +} + +ExternalTabletServer* ExternalMiniCluster::tablet_server_by_uuid(const std::string& uuid) const { + for (const scoped_refptr& ts : tablet_servers_) { + if (ts->instance_id().permanent_uuid() == uuid) { + return ts.get(); + } + } + return nullptr; +} + +int ExternalMiniCluster::tablet_server_index_by_uuid(const std::string& uuid) const { + for (int i = 0; i < tablet_servers_.size(); i++) { + if (tablet_servers_[i]->uuid() == uuid) { + return i; + } + } + return -1; +} + +vector ExternalMiniCluster::daemons() const { + vector results; + for (const scoped_refptr& ts : tablet_servers_) { + results.push_back(ts.get()); + } + for (const scoped_refptr& master : masters_) { + results.push_back(master.get()); + } + return results; +} + +std::shared_ptr ExternalMiniCluster::messenger() { + return messenger_; +} + +std::shared_ptr ExternalMiniCluster::master_proxy() { + CHECK_EQ(masters_.size(), 1); + return master_proxy(0); +} + +std::shared_ptr ExternalMiniCluster::master_proxy(int idx) { + CHECK_LT(idx, masters_.size()); + return std::shared_ptr( + new MasterServiceProxy(messenger_, CHECK_NOTNULL(master(idx))->bound_rpc_addr())); +} + +Status ExternalMiniCluster::CreateClient(client::KuduClientBuilder& builder, + client::sp::shared_ptr* client) { + CHECK(!masters_.empty()); + builder.clear_master_server_addrs(); + for (const scoped_refptr& master : masters_) { + builder.add_master_server_addr(master->bound_rpc_hostport().ToString()); + } + return builder.Build(client); +} + +Status ExternalMiniCluster::SetFlag(ExternalDaemon* daemon, + const string& flag, + const string& value) { + server::GenericServiceProxy proxy(messenger_, daemon->bound_rpc_addr()); + + rpc::RpcController controller; + controller.set_timeout(MonoDelta::FromSeconds(30)); + server::SetFlagRequestPB req; + server::SetFlagResponsePB resp; + req.set_flag(flag); + req.set_value(value); + req.set_force(true); + 
RETURN_NOT_OK_PREPEND(proxy.SetFlag(req, &resp, &controller), + "rpc failed"); + if (resp.result() != server::SetFlagResponsePB::SUCCESS) { + return Status::RemoteError("failed to set flag", + resp.ShortDebugString()); + } + return Status::OK(); +} + +//------------------------------------------------------------ +// ExternalDaemon +//------------------------------------------------------------ + +ExternalDaemon::ExternalDaemon(std::shared_ptr messenger, + string exe, string data_dir, + vector extra_flags) + : messenger_(std::move(messenger)), + exe_(std::move(exe)), + data_dir_(std::move(data_dir)), + extra_flags_(std::move(extra_flags)) {} + +ExternalDaemon::~ExternalDaemon() { +} + + +Status ExternalDaemon::StartProcess(const vector& user_flags) { + CHECK(!process_); + + vector argv; + // First the exe for argv[0] + argv.push_back(BaseName(exe_)); + + // Then all the flags coming from the minicluster framework. + argv.insert(argv.end(), user_flags.begin(), user_flags.end()); + + // Enable metrics logging. + // Even though we set -logtostderr down below, metrics logs end up being written + // based on -log_dir. So, we have to set that too. + argv.push_back("--metrics_log_interval_ms=1000"); + argv.push_back("--log_dir=" + data_dir_); + + // Then the "extra flags" passed into the ctor (from the ExternalMiniCluster + // options struct). These come at the end so they can override things like + // web port or RPC bind address if necessary. + argv.insert(argv.end(), extra_flags_.begin(), extra_flags_.end()); + + // Tell the server to dump its port information so we can pick it up. + string info_path = JoinPathSegments(data_dir_, "info.pb"); + argv.push_back("--server_dump_info_path=" + info_path); + argv.push_back("--server_dump_info_format=pb"); + + // We use ephemeral ports in many tests. They don't work for production, but are OK + // in unit tests. 
+ argv.push_back("--rpc_server_allow_ephemeral_ports"); + + // A previous instance of the daemon may have run in the same directory. So, remove + // the previous info file if it's there. + ignore_result(Env::Default()->DeleteFile(info_path)); + + // Ensure that logging goes to the test output and doesn't get buffered. + argv.push_back("--logtostderr"); + argv.push_back("--logbuflevel=-1"); + + gscoped_ptr p(new Subprocess(exe_, argv)); + p->ShareParentStdout(false); + LOG(INFO) << "Running " << exe_ << "\n" << JoinStrings(argv, "\n"); + RETURN_NOT_OK_PREPEND(p->Start(), + Substitute("Failed to start subprocess $0", exe_)); + + // The process is now starting -- wait for the bound port info to show up. + Stopwatch sw; + sw.start(); + bool success = false; + while (sw.elapsed().wall_seconds() < kProcessStartTimeoutSeconds) { + if (Env::Default()->FileExists(info_path)) { + success = true; + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + int rc; + Status s = p->WaitNoBlock(&rc); + if (s.IsTimedOut()) { + // The process is still running. 
+ continue; + } + RETURN_NOT_OK_PREPEND(s, Substitute("Failed waiting on $0", exe_)); + return Status::RuntimeError( + Substitute("Process exited with rc=$0", rc), + exe_); + } + + if (!success) { + ignore_result(p->Kill(SIGKILL)); + return Status::TimedOut( + Substitute("Timed out after $0s waiting for process ($1) to write info file ($2)", + kProcessStartTimeoutSeconds, exe_, info_path)); + } + + status_.reset(new ServerStatusPB()); + RETURN_NOT_OK_PREPEND(pb_util::ReadPBFromPath(Env::Default(), info_path, status_.get()), + "Failed to read info file from " + info_path); + LOG(INFO) << "Started " << exe_ << " as pid " << p->pid(); + VLOG(1) << exe_ << " instance information:\n" << status_->DebugString(); + + process_.swap(p); + return Status::OK(); +} + +Status ExternalDaemon::Pause() { + if (!process_) return Status::OK(); + VLOG(1) << "Pausing " << exe_ << " with pid " << process_->pid(); + return process_->Kill(SIGSTOP); +} + +Status ExternalDaemon::Resume() { + if (!process_) return Status::OK(); + VLOG(1) << "Resuming " << exe_ << " with pid " << process_->pid(); + return process_->Kill(SIGCONT); +} + +bool ExternalDaemon::IsShutdown() const { + return process_.get() == nullptr; +} + +bool ExternalDaemon::IsProcessAlive() const { + if (IsShutdown()) { + return false; + } + + int rc = 0; + Status s = process_->WaitNoBlock(&rc); + // If the non-blocking Wait "times out", that means the process + // is running. + return s.IsTimedOut(); +} + +pid_t ExternalDaemon::pid() const { + return process_->pid(); +} + +void ExternalDaemon::Shutdown() { + if (!process_) return; + + // Before we kill the process, store the addresses. If we're told to + // start again we'll reuse these. + bound_rpc_ = bound_rpc_hostport(); + bound_http_ = bound_http_hostport(); + + if (IsProcessAlive()) { + // In coverage builds, ask the process nicely to flush coverage info + // before we kill -9 it. Otherwise, we never get any coverage from + // external clusters. 
+ FlushCoverage(); + + LOG(INFO) << "Killing " << exe_ << " with pid " << process_->pid(); + ignore_result(process_->Kill(SIGKILL)); + } + int ret; + WARN_NOT_OK(process_->Wait(&ret), "Waiting on " + exe_); + process_.reset(); +} + +void ExternalDaemon::FlushCoverage() { +#ifndef COVERAGE_BUILD + return; +#else + LOG(INFO) << "Attempting to flush coverage for " << exe_ << " pid " << process_->pid(); + server::GenericServiceProxy proxy(messenger_, bound_rpc_addr()); + + server::FlushCoverageRequestPB req; + server::FlushCoverageResponsePB resp; + rpc::RpcController rpc; + + // Set a reasonably short timeout, since some of our tests kill servers which + // are kill -STOPed. + rpc.set_timeout(MonoDelta::FromMilliseconds(100)); + Status s = proxy.FlushCoverage(req, &resp, &rpc); + if (s.ok() && !resp.success()) { + s = Status::RemoteError("Server does not appear to be running a coverage build"); + } + WARN_NOT_OK(s, Substitute("Unable to flush coverage on $0 pid $1", exe_, process_->pid())); +#endif +} + +HostPort ExternalDaemon::bound_rpc_hostport() const { + CHECK(status_); + CHECK_GE(status_->bound_rpc_addresses_size(), 1); + HostPort ret; + CHECK_OK(HostPortFromPB(status_->bound_rpc_addresses(0), &ret)); + return ret; +} + +Sockaddr ExternalDaemon::bound_rpc_addr() const { + HostPort hp = bound_rpc_hostport(); + vector addrs; + CHECK_OK(hp.ResolveAddresses(&addrs)); + CHECK(!addrs.empty()); + return addrs[0]; +} + +HostPort ExternalDaemon::bound_http_hostport() const { + CHECK(status_); + CHECK_GE(status_->bound_http_addresses_size(), 1); + HostPort ret; + CHECK_OK(HostPortFromPB(status_->bound_http_addresses(0), &ret)); + return ret; +} + +const NodeInstancePB& ExternalDaemon::instance_id() const { + CHECK(status_); + return status_->node_instance(); +} + +const string& ExternalDaemon::uuid() const { + CHECK(status_); + return status_->node_instance().permanent_uuid(); +} + +Status ExternalDaemon::GetInt64Metric(const MetricEntityPrototype* entity_proto, + const 
char* entity_id, + const MetricPrototype* metric_proto, + const char* value_field, + int64_t* value) const { + // Fetch metrics whose name matches the given prototype. + string url = Substitute( + "http://$0/jsonmetricz?metrics=$1", + bound_http_hostport().ToString(), + metric_proto->name()); + EasyCurl curl; + faststring dst; + RETURN_NOT_OK(curl.FetchURL(url, &dst)); + + // Parse the results, beginning with the top-level entity array. + JsonReader r(dst.ToString()); + RETURN_NOT_OK(r.Init()); + vector entities; + RETURN_NOT_OK(r.ExtractObjectArray(r.root(), NULL, &entities)); + for (const Value* entity : entities) { + // Find the desired entity. + string type; + RETURN_NOT_OK(r.ExtractString(entity, "type", &type)); + if (type != entity_proto->name()) { + continue; + } + if (entity_id) { + string id; + RETURN_NOT_OK(r.ExtractString(entity, "id", &id)); + if (id != entity_id) { + continue; + } + } + + // Find the desired metric within the entity. + vector metrics; + RETURN_NOT_OK(r.ExtractObjectArray(entity, "metrics", &metrics)); + for (const Value* metric : metrics) { + string name; + RETURN_NOT_OK(r.ExtractString(metric, "name", &name)); + if (name != metric_proto->name()) { + continue; + } + RETURN_NOT_OK(r.ExtractInt64(metric, value_field, value)); + return Status::OK(); + } + } + string msg; + if (entity_id) { + msg = Substitute("Could not find metric $0.$1 for entity $2", + entity_proto->name(), metric_proto->name(), + entity_id); + } else { + msg = Substitute("Could not find metric $0.$1", + entity_proto->name(), metric_proto->name()); + } + return Status::NotFound(msg); +} + +//------------------------------------------------------------ +// ScopedResumeExternalDaemon +//------------------------------------------------------------ + +ScopedResumeExternalDaemon::ScopedResumeExternalDaemon(ExternalDaemon* daemon) + : daemon_(CHECK_NOTNULL(daemon)) { +} + +ScopedResumeExternalDaemon::~ScopedResumeExternalDaemon() { + daemon_->Resume(); +} + 
+//------------------------------------------------------------ +// ExternalMaster +//------------------------------------------------------------ + +ExternalMaster::ExternalMaster(const std::shared_ptr& messenger, + const string& exe, + const string& data_dir, + const vector& extra_flags) + : ExternalDaemon(messenger, exe, data_dir, extra_flags), + rpc_bind_address_("127.0.0.1:0") { +} + +ExternalMaster::ExternalMaster(const std::shared_ptr& messenger, + const string& exe, const string& data_dir, + string rpc_bind_address, + const std::vector& extra_flags) + : ExternalDaemon(messenger, exe, data_dir, extra_flags), + rpc_bind_address_(std::move(rpc_bind_address)) {} + +ExternalMaster::~ExternalMaster() { +} + +Status ExternalMaster::Start() { + vector flags; + flags.push_back("--fs_wal_dir=" + data_dir_); + flags.push_back("--fs_data_dirs=" + data_dir_); + flags.push_back("--rpc_bind_addresses=" + rpc_bind_address_); + flags.push_back("--webserver_interface=localhost"); + flags.push_back("--webserver_port=0"); + RETURN_NOT_OK(StartProcess(flags)); + return Status::OK(); +} + +Status ExternalMaster::Restart() { + // We store the addresses on shutdown so make sure we did that first. + if (bound_rpc_.port() == 0) { + return Status::IllegalState("Master cannot be restarted. 
Must call Shutdown() first."); + } + vector flags; + flags.push_back("--fs_wal_dir=" + data_dir_); + flags.push_back("--fs_data_dirs=" + data_dir_); + flags.push_back("--rpc_bind_addresses=" + bound_rpc_.ToString()); + flags.push_back("--webserver_interface=localhost"); + flags.push_back(Substitute("--webserver_port=$0", bound_http_.port())); + RETURN_NOT_OK(StartProcess(flags)); + return Status::OK(); +} + + +//------------------------------------------------------------ +// ExternalTabletServer +//------------------------------------------------------------ + +ExternalTabletServer::ExternalTabletServer( + const std::shared_ptr& messenger, const string& exe, + const string& data_dir, string bind_host, + const vector& master_addrs, const vector& extra_flags) + : ExternalDaemon(messenger, exe, data_dir, extra_flags), + master_addrs_(HostPort::ToCommaSeparatedString(master_addrs)), + bind_host_(std::move(bind_host)) {} + +ExternalTabletServer::~ExternalTabletServer() { +} + +Status ExternalTabletServer::Start() { + vector flags; + flags.push_back("--fs_wal_dir=" + data_dir_); + flags.push_back("--fs_data_dirs=" + data_dir_); + flags.push_back(Substitute("--rpc_bind_addresses=$0:0", + bind_host_)); + flags.push_back(Substitute("--local_ip_for_outbound_sockets=$0", + bind_host_)); + flags.push_back(Substitute("--webserver_interface=$0", + bind_host_)); + flags.push_back("--webserver_port=0"); + flags.push_back("--tserver_master_addrs=" + master_addrs_); + RETURN_NOT_OK(StartProcess(flags)); + return Status::OK(); +} + +Status ExternalTabletServer::Restart() { + // We store the addresses on shutdown so make sure we did that first. + if (bound_rpc_.port() == 0) { + return Status::IllegalState("Tablet server cannot be restarted. 
Must call Shutdown() first."); + } + vector flags; + flags.push_back("--fs_wal_dir=" + data_dir_); + flags.push_back("--fs_data_dirs=" + data_dir_); + flags.push_back("--rpc_bind_addresses=" + bound_rpc_.ToString()); + flags.push_back(Substitute("--local_ip_for_outbound_sockets=$0", + bind_host_)); + flags.push_back(Substitute("--webserver_port=$0", bound_http_.port())); + flags.push_back(Substitute("--webserver_interface=$0", + bind_host_)); + flags.push_back("--tserver_master_addrs=" + master_addrs_); + RETURN_NOT_OK(StartProcess(flags)); + return Status::OK(); +} + + +} // namespace kudu diff --git a/src/kudu/integration-tests/external_mini_cluster.h b/src/kudu/integration-tests/external_mini_cluster.h new file mode 100644 index 000000000000..d19fc9835a02 --- /dev/null +++ b/src/kudu/integration-tests/external_mini_cluster.h @@ -0,0 +1,428 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H +#define KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H + +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/status.h" + +namespace kudu { + +class ExternalDaemon; +class ExternalMaster; +class ExternalTabletServer; +class HostPort; +class MetricPrototype; +class MetricEntityPrototype; +class NodeInstancePB; +class Sockaddr; +class Subprocess; + +namespace master { +class MasterServiceProxy; +} // namespace master + +namespace rpc { +class Messenger; +} // namespace rpc + +namespace server { +class ServerStatusPB; +} // namespace server + +struct ExternalMiniClusterOptions { + ExternalMiniClusterOptions(); + ~ExternalMiniClusterOptions(); + + // Number of masters to start. + // Default: 1 + int num_masters; + + // Number of TS to start. + // Default: 1 + int num_tablet_servers; + + // Directory in which to store data. + // Default: "", which auto-generates a unique path for this cluster. + std::string data_root; + + // If true, binds each tablet server to a different loopback address. + // This affects the server's RPC server, and also forces the server to + // only use this IP address for outgoing socket connections as well. + // This allows the use of iptables on the localhost to simulate network + // partitions. + // + // The addressed used are 127... where: + // - are the high and low bytes of the pid of the process running the + // minicluster (not the daemon itself). + // - is the index of the server within this minicluster. + // + // This requires that the system is set up such that processes may bind + // to any IP address in the localhost netblock (127.0.0.0/8). This seems + // to be the case on common Linux distributions. 
You can verify by running + // 'ip addr | grep 127.0.0.1' and checking that the address is listed as + // '127.0.0.1/8'. + // + // This option is disabled by default on OS X. + // + // NOTE: this does not currently affect the HTTP server. + // + // Default: true + bool bind_to_unique_loopback_addresses; + + // The path where the kudu daemons should be run from. + // Default: "", which uses the same path as the currently running executable. + // This works for unit tests, since they all end up in build/latest/bin. + std::string daemon_bin_path; + + // Extra flags for tablet servers and masters respectively. + // + // In these flags, you may use the special string '${index}' which will + // be substituted with the index of the tablet server or master. + std::vector extra_tserver_flags; + std::vector extra_master_flags; + + // If more than one master is specified, list of ports for the + // masters in a consensus configuration. Port at index 0 is used for the leader + // master. + std::vector master_rpc_ports; +}; + +// A mini-cluster made up of subprocesses running each of the daemons +// separately. This is useful for black-box or grey-box failure testing +// purposes -- it provides the ability to forcibly kill or stop particular +// cluster participants, which isn't feasible in the normal MiniCluster. +// On the other hand, there is little access to inspect the internal state +// of the daemons. +class ExternalMiniCluster { + public: + // Mode to which node types a certain action (like Shutdown()) should apply. + enum NodeSelectionMode { + TS_ONLY, + ALL + }; + + explicit ExternalMiniCluster(const ExternalMiniClusterOptions& opts); + ~ExternalMiniCluster(); + + // Start the cluster. + Status Start(); + + // Restarts the cluster. Requires that it has been Shutdown() first. + Status Restart(); + + // Like the previous method but performs initialization synchronously, i.e. + // this will wait for all TS's to be started and initialized. 
Tests should + // use this if they interact with tablets immediately after Start(); + Status StartSync(); + + // Add a new TS to the cluster. The new TS is started. + // Requires that the master is already running. + Status AddTabletServer(); + + // Shuts down the whole cluster or part of it, depending on the selected + // 'mode'. + // Currently, this uses SIGKILL on each daemon for a non-graceful shutdown. + void Shutdown(NodeSelectionMode mode = ALL); + + // Return the IP address that the tablet server with the given index will bind to. + // If options.bind_to_unique_loopback_addresses is false, this will be 127.0.0.1 + // Otherwise, it is another IP in the local netblock. + std::string GetBindIpForTabletServer(int index) const; + + // Return a pointer to the running leader master. This may be NULL + // if the cluster is not started. + // + // TODO: Use the appropriate RPC here to return the leader master, + // to allow some of the existing tests (e.g., raft_consensus-itest) + // to use multiple masters. + ExternalMaster* leader_master() { return master(0); } + + // Perform an RPC to determine the leader of the external mini + // cluster. Set 'index' to the leader master's index (for calls to + // to master() below). + // + // NOTE: if a leader election occurs after this method is executed, + // the last result may not be valid. + Status GetLeaderMasterIndex(int* idx); + + // If this cluster is configured for a single non-distributed + // master, return the single master or NULL if the master is not + // started. Exits with a CHECK failure if there are multiple + // masters. + ExternalMaster* master() const { + CHECK_EQ(masters_.size(), 1) + << "master() should not be used with multiple masters, use leader_master() instead."; + return master(0); + } + + // Return master at 'idx' or NULL if the master at 'idx' has not + // been started. 
+ ExternalMaster* master(int idx) const { + CHECK_LT(idx, masters_.size()); + return masters_[idx].get(); + } + + ExternalTabletServer* tablet_server(int idx) const { + CHECK_LT(idx, tablet_servers_.size()); + return tablet_servers_[idx].get(); + } + + // Return ExternalTabletServer given its UUID. If not found, returns NULL. + ExternalTabletServer* tablet_server_by_uuid(const std::string& uuid) const; + + // Return the index of the ExternalTabletServer that has the given 'uuid', or + // -1 if no such UUID can be found. + int tablet_server_index_by_uuid(const std::string& uuid) const; + + // Return all tablet servers and masters. + std::vector daemons() const; + + int num_tablet_servers() const { + return tablet_servers_.size(); + } + + int num_masters() const { + return masters_.size(); + } + + // Return the client messenger used by the ExternalMiniCluster. + std::shared_ptr messenger(); + + // If the cluster is configured for a single non-distributed master, + // return a proxy to that master. Requires that the single master is + // running. + std::shared_ptr master_proxy(); + + // Returns an RPC proxy to the master at 'idx'. Requires that the + // master at 'idx' is running. + std::shared_ptr master_proxy(int idx); + + // Wait until the number of registered tablet servers reaches the + // given count on at least one of the running masters. Returns + // Status::TimedOut if the desired count is not achieved with the + // given timeout. + Status WaitForTabletServerCount(int count, const MonoDelta& timeout); + + // Runs gtest assertions that no servers have crashed. + void AssertNoCrashes(); + + // Wait until all tablets on the given tablet server are in 'RUNNING' + // state. + Status WaitForTabletsRunning(ExternalTabletServer* ts, const MonoDelta& timeout); + + // Create a client configured to talk to this cluster. + // Builder may contain override options for the client. The master address will + // be overridden to talk to the running master. 
+ // + // REQUIRES: the cluster must have already been Start()ed. + Status CreateClient(client::KuduClientBuilder& builder, + client::sp::shared_ptr* client); + + // Sets the given flag on the given daemon, which must be running. + // + // This uses the 'force' flag on the RPC so that, even if the flag + // is considered unsafe to change at runtime, it is changed. + Status SetFlag(ExternalDaemon* daemon, + const std::string& flag, + const std::string& value); + + private: + FRIEND_TEST(MasterFailoverTest, TestKillAnyMaster); + + Status StartSingleMaster(); + + Status StartDistributedMasters(); + + std::string GetBinaryPath(const std::string& binary) const; + std::string GetDataPath(const std::string& daemon_id) const; + + Status DeduceBinRoot(std::string* ret); + Status HandleOptions(); + + const ExternalMiniClusterOptions opts_; + + // The root for binaries. + std::string daemon_bin_path_; + + std::string data_root_; + + std::vector > masters_; + std::vector > tablet_servers_; + + std::shared_ptr messenger_; + + DISALLOW_COPY_AND_ASSIGN(ExternalMiniCluster); +}; + +class ExternalDaemon : public RefCountedThreadSafe { + public: + ExternalDaemon(std::shared_ptr messenger, std::string exe, + std::string data_dir, std::vector extra_flags); + + HostPort bound_rpc_hostport() const; + Sockaddr bound_rpc_addr() const; + HostPort bound_http_hostport() const; + const NodeInstancePB& instance_id() const; + const std::string& uuid() const; + + // Return the pid of the running process. + // Causes a CHECK failure if the process is not running. + pid_t pid() const; + + // Sends a SIGSTOP signal to the daemon. + Status Pause(); + + // Sends a SIGCONT signal to the daemon. + Status Resume(); + + // Return true if we have explicitly shut down the process. + bool IsShutdown() const; + + // Return true if the process is still running. + // This may return false if the process crashed, even if we didn't + // explicitly call Shutdown(). 
+ bool IsProcessAlive() const; + + virtual void Shutdown(); + + const std::string& data_dir() const { return data_dir_; } + + // Return a pointer to the flags used for this server on restart. + // Modifying these flags will only take effect on the next restart. + std::vector* mutable_flags() { return &extra_flags_; } + + // Retrieve the value of a given metric from this server. The metric must + // be of int64_t type. + // + // 'value_field' represents the particular field of the metric to be read. + // For example, for a counter or gauge, this should be 'value'. For a + // histogram, it might be 'total_count' or 'mean'. + // + // 'entity_id' may be NULL, in which case the first entity of the same type + // as 'entity_proto' will be matched. + Status GetInt64Metric(const MetricEntityPrototype* entity_proto, + const char* entity_id, + const MetricPrototype* metric_proto, + const char* value_field, + int64_t* value) const; + + protected: + friend class RefCountedThreadSafe; + virtual ~ExternalDaemon(); + + Status StartProcess(const std::vector& flags); + + // In a code-coverage build, try to flush the coverage data to disk. + // In a non-coverage build, this does nothing. + void FlushCoverage(); + + const std::shared_ptr messenger_; + const std::string exe_; + const std::string data_dir_; + std::vector extra_flags_; + + gscoped_ptr process_; + + gscoped_ptr status_; + + // These capture the daemons parameters and running ports and + // are used to Restart() the daemon with the same parameters. + HostPort bound_rpc_; + HostPort bound_http_; + + DISALLOW_COPY_AND_ASSIGN(ExternalDaemon); +}; + +// Resumes a daemon that was stopped with ExteranlDaemon::Pause() upon +// exiting a scope. +class ScopedResumeExternalDaemon { + public: + // 'daemon' must remain valid for the lifetime of a + // ScopedResumeExternalDaemon object. + explicit ScopedResumeExternalDaemon(ExternalDaemon* daemon); + + // Resume 'daemon_'. 
+ ~ScopedResumeExternalDaemon(); + + private: + ExternalDaemon* daemon_; + + DISALLOW_COPY_AND_ASSIGN(ScopedResumeExternalDaemon); +}; + + +class ExternalMaster : public ExternalDaemon { + public: + ExternalMaster(const std::shared_ptr& messenger, + const std::string& exe, const std::string& data_dir, + const std::vector& extra_flags); + + ExternalMaster(const std::shared_ptr& messenger, + const std::string& exe, const std::string& data_dir, + std::string rpc_bind_address, + const std::vector& extra_flags); + + Status Start(); + + // Restarts the daemon. + // Requires that it has previously been shutdown. + Status Restart() WARN_UNUSED_RESULT; + + + private: + friend class RefCountedThreadSafe; + virtual ~ExternalMaster(); + + const std::string rpc_bind_address_; +}; + +class ExternalTabletServer : public ExternalDaemon { + public: + ExternalTabletServer(const std::shared_ptr& messenger, + const std::string& exe, const std::string& data_dir, + std::string bind_host, + const std::vector& master_addrs, + const std::vector& extra_flags); + + Status Start(); + + // Restarts the daemon. + // Requires that it has previously been shutdown. + Status Restart() WARN_UNUSED_RESULT; + + + private: + const std::string master_addrs_; + const std::string bind_host_; + + friend class RefCountedThreadSafe; + virtual ~ExternalTabletServer(); +}; + +} // namespace kudu +#endif /* KUDU_INTEGRATION_TESTS_EXTERNAL_MINI_CLUSTER_H */ diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc new file mode 100644 index 000000000000..e1b5ca6ad433 --- /dev/null +++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.cc @@ -0,0 +1,350 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/integration-tests/external_mini_cluster_fs_inspector.h" + +#include +#include + +#include "kudu/consensus/metadata.pb.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/util/env.h" +#include "kudu/util/monotime.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace itest { + +using std::set; +using std::string; +using std::vector; + +using consensus::ConsensusMetadataPB; +using strings::Substitute; +using tablet::TabletDataState; +using tablet::TabletSuperBlockPB; + +ExternalMiniClusterFsInspector::ExternalMiniClusterFsInspector(ExternalMiniCluster* cluster) + : env_(Env::Default()), + cluster_(CHECK_NOTNULL(cluster)) { +} + +ExternalMiniClusterFsInspector::~ExternalMiniClusterFsInspector() {} + +Status ExternalMiniClusterFsInspector::ListFilesInDir(const string& path, + vector* entries) { + RETURN_NOT_OK(env_->GetChildren(path, entries)); + auto iter = entries->begin(); + while (iter != entries->end()) { + if (*iter == "." || *iter == ".." 
|| iter->find(".tmp.") != string::npos) { + iter = entries->erase(iter); + continue; + } + ++iter; + } + return Status::OK(); +} + +int ExternalMiniClusterFsInspector::CountFilesInDir(const string& path) { + vector entries; + Status s = ListFilesInDir(path, &entries); + if (!s.ok()) return 0; + return entries.size(); +} + +int ExternalMiniClusterFsInspector::CountWALSegmentsOnTS(int index) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + string ts_wal_dir = JoinPathSegments(data_dir, FsManager::kWalDirName); + vector tablets; + CHECK_OK(ListFilesInDir(ts_wal_dir, &tablets)); + int total_segments = 0; + for (const string& tablet : tablets) { + string tablet_wal_dir = JoinPathSegments(ts_wal_dir, tablet); + total_segments += CountFilesInDir(tablet_wal_dir); + } + return total_segments; +} + +vector ExternalMiniClusterFsInspector::ListTablets() { + set tablets; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + auto ts_tablets = ListTabletsOnTS(i); + tablets.insert(ts_tablets.begin(), ts_tablets.end()); + } + return vector(tablets.begin(), tablets.end()); +} + +vector ExternalMiniClusterFsInspector::ListTabletsOnTS(int index) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + string meta_dir = JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName); + vector tablets; + CHECK_OK(ListFilesInDir(meta_dir, &tablets)); + return tablets; +} + +vector ExternalMiniClusterFsInspector::ListTabletsWithDataOnTS(int index) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + string wal_dir = JoinPathSegments(data_dir, FsManager::kWalDirName); + vector tablets; + CHECK_OK(ListFilesInDir(wal_dir, &tablets)); + return tablets; +} + +int ExternalMiniClusterFsInspector::CountWALSegmentsForTabletOnTS(int index, + const string& tablet_id) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + string wal_dir = JoinPathSegments(data_dir, FsManager::kWalDirName); + string tablet_wal_dir = 
JoinPathSegments(wal_dir, tablet_id); + if (!env_->FileExists(tablet_wal_dir)) { + return 0; + } + return CountFilesInDir(tablet_wal_dir); +} + +bool ExternalMiniClusterFsInspector::DoesConsensusMetaExistForTabletOnTS(int index, + const string& tablet_id) { + ConsensusMetadataPB cmeta_pb; + Status s = ReadConsensusMetadataOnTS(index, tablet_id, &cmeta_pb); + return s.ok(); +} + +int ExternalMiniClusterFsInspector::CountReplicasInMetadataDirs() { + // Rather than using FsManager's functionality for listing blocks, we just manually + // list the contents of the metadata directory. This is because we're using an + // external minicluster, and initializing a new FsManager to point at the running + // tablet servers isn't easy. + int count = 0; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + string data_dir = cluster_->tablet_server(i)->data_dir(); + count += CountFilesInDir(JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName)); + } + return count; +} + +Status ExternalMiniClusterFsInspector::CheckNoDataOnTS(int index) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + if (CountFilesInDir(JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName)) > 0) { + return Status::IllegalState("tablet metadata blocks still exist", data_dir); + } + if (CountWALSegmentsOnTS(index) > 0) { + return Status::IllegalState("wals still exist", data_dir); + } + if (CountFilesInDir(JoinPathSegments(data_dir, FsManager::kConsensusMetadataDirName)) > 0) { + return Status::IllegalState("consensus metadata still exists", data_dir); + } + return Status::OK();; +} + +Status ExternalMiniClusterFsInspector::CheckNoData() { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + RETURN_NOT_OK(CheckNoDataOnTS(i)); + } + return Status::OK();; +} + +Status ExternalMiniClusterFsInspector::ReadTabletSuperBlockOnTS(int index, + const string& tablet_id, + TabletSuperBlockPB* sb) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + string 
meta_dir = JoinPathSegments(data_dir, FsManager::kTabletMetadataDirName); + string superblock_path = JoinPathSegments(meta_dir, tablet_id); + return pb_util::ReadPBContainerFromPath(env_, superblock_path, sb); +} + +Status ExternalMiniClusterFsInspector::ReadConsensusMetadataOnTS(int index, + const string& tablet_id, + ConsensusMetadataPB* cmeta_pb) { + string data_dir = cluster_->tablet_server(index)->data_dir(); + string cmeta_dir = JoinPathSegments(data_dir, FsManager::kConsensusMetadataDirName); + string cmeta_file = JoinPathSegments(cmeta_dir, tablet_id); + if (!env_->FileExists(cmeta_file)) { + return Status::NotFound("Consensus metadata file not found", cmeta_file); + } + return pb_util::ReadPBContainerFromPath(env_, cmeta_file, cmeta_pb); +} + +Status ExternalMiniClusterFsInspector::CheckTabletDataStateOnTS(int index, + const string& tablet_id, + TabletDataState state) { + TabletSuperBlockPB sb; + RETURN_NOT_OK(ReadTabletSuperBlockOnTS(index, tablet_id, &sb)); + if (PREDICT_FALSE(sb.tablet_data_state() != state)) { + return Status::IllegalState("Tablet data state != " + TabletDataState_Name(state), + TabletDataState_Name(sb.tablet_data_state())); + } + return Status::OK(); +} + +Status ExternalMiniClusterFsInspector::WaitForNoData(const MonoDelta& timeout) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + Status s; + while (true) { + s = CheckNoData(); + if (s.ok()) return Status::OK(); + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut("Timed out waiting for no data", s.ToString()); +} + +Status ExternalMiniClusterFsInspector::WaitForNoDataOnTS(int index, const MonoDelta& timeout) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + Status s; + while (true) { + s = CheckNoDataOnTS(index); + if (s.ok()) return Status::OK(); + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + 
break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut("Timed out waiting for no data", s.ToString()); +} + +Status ExternalMiniClusterFsInspector::WaitForMinFilesInTabletWalDirOnTS(int index, + const string& tablet_id, + int count, + const MonoDelta& timeout) { + int seen = 0; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + while (true) { + seen = CountWALSegmentsForTabletOnTS(index, tablet_id); + if (seen >= count) return Status::OK(); + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut(Substitute("Timed out waiting for number of WAL segments on tablet $0 " + "on TS $1 to be $2. Found $3", + tablet_id, index, count, seen)); +} + +Status ExternalMiniClusterFsInspector::WaitForReplicaCount(int expected, const MonoDelta& timeout) { + Status s; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + int found; + while (true) { + found = CountReplicasInMetadataDirs(); + if (found == expected) return Status::OK(); + if (CountReplicasInMetadataDirs() == expected) return Status::OK(); + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + return Status::TimedOut(Substitute("Timed out waiting for a total replica count of $0. 
" + "Found $2 replicas", + expected, found)); +} + +Status ExternalMiniClusterFsInspector::WaitForTabletDataStateOnTS(int index, + const string& tablet_id, + TabletDataState expected, + const MonoDelta& timeout) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = start; + deadline.AddDelta(timeout); + Status s; + while (true) { + s = CheckTabletDataStateOnTS(index, tablet_id, expected); + if (s.ok()) return Status::OK(); + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + SleepFor(MonoDelta::FromMilliseconds(5)); + } + return Status::TimedOut(Substitute("Timed out after $0 waiting for tablet data state $1: $2", + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(), + TabletDataState_Name(expected), s.ToString())); +} + +Status ExternalMiniClusterFsInspector::WaitForFilePatternInTabletWalDirOnTs( + int ts_index, const string& tablet_id, + const vector& substrings_required, + const vector& substrings_disallowed, + const MonoDelta& timeout) { + Status s; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + + string data_dir = cluster_->tablet_server(ts_index)->data_dir(); + string ts_wal_dir = JoinPathSegments(data_dir, FsManager::kWalDirName); + string tablet_wal_dir = JoinPathSegments(ts_wal_dir, tablet_id); + + string error_msg; + vector entries; + while (true) { + Status s = ListFilesInDir(tablet_wal_dir, &entries); + std::sort(entries.begin(), entries.end()); + + error_msg = ""; + bool any_missing_required = false; + for (const string& required_filter : substrings_required) { + bool filter_matched = false; + for (const string& entry : entries) { + if (entry.find(required_filter) != string::npos) { + filter_matched = true; + break; + } + } + if (!filter_matched) { + any_missing_required = true; + error_msg += "missing from substrings_required: " + required_filter + "; "; + break; + } + } + + bool any_present_disallowed = false; + for (const string& entry : entries) { + if 
(any_present_disallowed) break; + for (const string& disallowed_filter : substrings_disallowed) { + if (entry.find(disallowed_filter) != string::npos) { + any_present_disallowed = true; + error_msg += "present from substrings_disallowed: " + entry + + " (" + disallowed_filter + "); "; + break; + } + } + } + + if (!any_missing_required && !any_present_disallowed) { + return Status::OK(); + } + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + return Status::TimedOut(Substitute("Timed out waiting for file pattern on " + "tablet $0 on TS $1 in directory $2", + tablet_id, ts_index, tablet_wal_dir), + error_msg + "entries: " + JoinStrings(entries, ", ")); +} + +} // namespace itest +} // namespace kudu + diff --git a/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h new file mode 100644 index 000000000000..ff160da31dbf --- /dev/null +++ b/src/kudu/integration-tests/external_mini_cluster_fs_inspector.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_INTEGRATION_TESTS_CLUSTER_EXTERNAL_MINI_CLUSTER_FS_INSPECTOR_H_ +#define KUDU_INTEGRATION_TESTS_CLUSTER_EXTERNAL_MINI_CLUSTER_FS_INSPECTOR_H_ + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/tablet/metadata.pb.h" +#include "kudu/util/monotime.h" + +namespace kudu { +class Env; +class ExternalMiniCluster; +class Status; + +namespace consensus { +class ConsensusMetadataPB; +} + +namespace tablet { +class TabletSuperBlockPB; +} + +namespace itest { + +// Utility class that digs around in a tablet server's data directory and +// provides methods useful for integration testing. This class must outlive +// the Env and ExternalMiniCluster objects that are passed into it. +class ExternalMiniClusterFsInspector { + public: + // Does not take ownership of the ExternalMiniCluster pointer. + explicit ExternalMiniClusterFsInspector(ExternalMiniCluster* cluster); + ~ExternalMiniClusterFsInspector(); + + Status ListFilesInDir(const std::string& path, std::vector* entries); + int CountFilesInDir(const std::string& path); + int CountWALSegmentsOnTS(int index); + + // List all of the tablets with tablet metadata in the cluster. + std::vector ListTablets(); + + // List all of the tablets with tablet metadata on the given tablet server index. + // This may include tablets that are tombstoned and not running. + std::vector ListTabletsOnTS(int index); + + // List the tablet IDs on the given tablet which actually have data (as + // evidenced by their having a WAL). This excludes those that are tombstoned. 
+ std::vector ListTabletsWithDataOnTS(int index); + + int CountWALSegmentsForTabletOnTS(int index, const std::string& tablet_id); + bool DoesConsensusMetaExistForTabletOnTS(int index, const std::string& tablet_id); + + int CountReplicasInMetadataDirs(); + Status CheckNoDataOnTS(int index); + Status CheckNoData(); + + Status ReadTabletSuperBlockOnTS(int index, const std::string& tablet_id, + tablet::TabletSuperBlockPB* sb); + Status ReadConsensusMetadataOnTS(int index, const std::string& tablet_id, + consensus::ConsensusMetadataPB* cmeta_pb); + Status CheckTabletDataStateOnTS(int index, + const std::string& tablet_id, + tablet::TabletDataState state); + + Status WaitForNoData(const MonoDelta& timeout = MonoDelta::FromSeconds(30)); + Status WaitForNoDataOnTS(int index, const MonoDelta& timeout = MonoDelta::FromSeconds(30)); + Status WaitForMinFilesInTabletWalDirOnTS(int index, + const std::string& tablet_id, + int count, + const MonoDelta& timeout = MonoDelta::FromSeconds(60)); + Status WaitForReplicaCount(int expected, const MonoDelta& timeout = MonoDelta::FromSeconds(30)); + Status WaitForTabletDataStateOnTS(int index, + const std::string& tablet_id, + tablet::TabletDataState data_state, + const MonoDelta& timeout = MonoDelta::FromSeconds(30)); + + // Loop and check for certain filenames in the WAL directory of the specified + // tablet. This function returns OK if we reach a state where: + // * For each string in 'substrings_required', we find *at least one file* + // whose name contains that string, and: + // * For each string in 'substrings_disallowed', we find *no files* whose name + // contains that string, even if the file also matches a string in the + // 'substrings_required'. 
+ Status WaitForFilePatternInTabletWalDirOnTs( + int ts_index, + const std::string& tablet_id, + const std::vector& substrings_required, + const std::vector& substrings_disallowed, + const MonoDelta& timeout = MonoDelta::FromSeconds(30)); + + private: + Env* const env_; + ExternalMiniCluster* const cluster_; + + DISALLOW_COPY_AND_ASSIGN(ExternalMiniClusterFsInspector); +}; + +} // namespace itest +} // namespace kudu + +#endif // KUDU_INTEGRATION_TESTS_CLUSTER_EXTERNAL_MINI_CLUSTER_FS_INSPECTOR_H_ diff --git a/src/kudu/integration-tests/flex_partitioning-itest.cc b/src/kudu/integration-tests/flex_partitioning-itest.cc new file mode 100644 index 000000000000..5c2fea999fe0 --- /dev/null +++ b/src/kudu/integration-tests/flex_partitioning-itest.cc @@ -0,0 +1,571 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Integration test for flexible partitioning (eg buckets, range partitioning +// of PK subsets, etc). 
+ +#include +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/common/partial_row.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tools/data_gen_util.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/test_util.h" +#include "kudu/gutil/strings/escaping.h" + +namespace kudu { +namespace itest { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduInsert; +using client::KuduPredicate; +using client::KuduScanner; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduSession; +using client::KuduTable; +using client::KuduTableCreator; +using client::KuduValue; +using client::sp::shared_ptr; +using std::unordered_map; +using std::vector; +using strings::Substitute; + +static const char* const kTableName = "test-table"; +static const int kNumRows = 1000; + +class FlexPartitioningITest : public KuduTest { + public: + FlexPartitioningITest() + : random_(GetRandomSeed32()) { + } + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = 1; + // This test produces lots of tablets. With container and log preallocation, + // we end up using quite a bit of disk space. So, we disable them. 
+ opts.extra_tserver_flags.push_back("--log_container_preallocate_bytes=0"); + opts.extra_tserver_flags.push_back("--log_preallocate_segments=false"); + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + + KuduClientBuilder builder; + ASSERT_OK(cluster_->CreateClient(builder, &client_)); + + ASSERT_OK(itest::CreateTabletServerMap(cluster_->master_proxy().get(), + cluster_->messenger(), + &ts_map_)); + } + + virtual void TearDown() OVERRIDE { + cluster_->Shutdown(); + KuduTest::TearDown(); + STLDeleteValues(&ts_map_); + STLDeleteElements(&inserted_rows_); + } + + protected: + void CreateTable(int num_columns, + const vector& bucket_a, int num_buckets_a, + const vector& bucket_b, int num_buckets_b, + const vector& range_cols, + int num_splits) { + // Set up the actual PK columns based on num_columns. The PK is made up + // of all the columns. + KuduSchemaBuilder b; + vector pk; + for (int i = 0; i < num_columns; i++) { + string name = Substitute("c$0", i); + b.AddColumn(name)->Type(KuduColumnSchema::INT32)->NotNull(); + pk.push_back(name); + } + b.SetPrimaryKey(pk); + KuduSchema schema; + ASSERT_OK(b.Build(&schema)); + + gscoped_ptr table_creator(client_->NewTableCreator()); + table_creator->table_name(kTableName) + .schema(&schema) + .num_replicas(1); + + // Set up partitioning. + if (!bucket_a.empty()) { + table_creator->add_hash_partitions(bucket_a, num_buckets_a); + } + if (!bucket_b.empty()) { + table_creator->add_hash_partitions(bucket_b, num_buckets_b); + } + table_creator->set_range_partition_columns(range_cols); + + // Compute split points. + vector split_rows; + int increment = kNumRows / num_splits; + for (int i = 1; i < num_splits; i++) { + KuduPartialRow* row = schema.NewRow(); + for (int j = 0; j < range_cols.size(); j++) { + const string& range_col = range_cols[j]; + if (j == 0) { + // Set the first component of the range to a set increment. 
+ ASSERT_OK(row->SetInt32(range_col, increment * i)); + } else { + ASSERT_OK(row->SetInt32(range_col, random_.Next32())); + } + } + split_rows.push_back(row); + } + table_creator->split_rows(split_rows); + + ASSERT_OK(table_creator->Create()); + + ASSERT_OK(client_->OpenTable(kTableName, &table_)); + } + + int CountTablets() { + vector tablets; + CHECK_OK(ListTablets(ts_map_.begin()->second, MonoDelta::FromSeconds(10), &tablets)); + return tablets.size(); + } + + // Insert 'kNumRows' rows into the given table. The first column 'c0' is ascending, + // but the rest are random int32s. + Status InsertRandomRows(); + + // Perform a scan with a predicate on 'col_name' BETWEEN 'lower' AND 'upper'. + // Verifies that the results match up with applying the same scan against our + // in-memory copy 'inserted_rows_'. + void CheckScanWithColumnPredicate(Slice col_name, int lower, int upper); + + // Like the above, but uses the primary key range scan API in the client to + // scan between 'inserted_rows_[lower]' (inclusive) and 'inserted_rows_[upper]' + // (exclusive). + void CheckPKRangeScan(int lower, int upper); + void CheckPartitionKeyRangeScanWithPKRange(int lower, int upper); + + // Performs a series of scans, each over a single tablet in the table, and + // verifies that the aggregated results match up with 'inserted_rows_'. + void CheckPartitionKeyRangeScan(); + + // Inserts data into the table, then performs a number of scans to verify that + // the data can be retrieved. 
+ void InsertAndVerifyScans(); + + Random random_; + + gscoped_ptr cluster_; + unordered_map ts_map_; + + shared_ptr client_; + shared_ptr table_; + vector inserted_rows_; +}; + +Status FlexPartitioningITest::InsertRandomRows() { + CHECK(inserted_rows_.empty()); + + shared_ptr session(client_->NewSession()); + session->SetTimeoutMillis(10000); + RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + for (uint64_t i = 0; i < kNumRows; i++) { + gscoped_ptr insert(table_->NewInsert()); + tools::GenerateDataForRow(table_->schema(), i, &random_, insert->mutable_row()); + inserted_rows_.push_back(new KuduPartialRow(*insert->mutable_row())); + RETURN_NOT_OK(session->Apply(insert.release())); + + if (i > 0 && i % 1000 == 0) { + RETURN_NOT_OK(session->Flush()); + } + } + RETURN_NOT_OK(session->Flush()); + return Status::OK(); +} + +void FlexPartitioningITest::CheckScanWithColumnPredicate(Slice col_name, int lower, int upper) { + KuduScanner scanner(table_.get()); + scanner.SetTimeoutMillis(60000); + CHECK_OK(scanner.AddConjunctPredicate(table_->NewComparisonPredicate( + col_name, KuduPredicate::GREATER_EQUAL, KuduValue::FromInt(lower)))); + CHECK_OK(scanner.AddConjunctPredicate(table_->NewComparisonPredicate( + col_name, KuduPredicate::LESS_EQUAL, KuduValue::FromInt(upper)))); + + vector rows; + ScanToStrings(&scanner, &rows); + std::sort(rows.begin(), rows.end()); + + // Manually evaluate the predicate against the data we think we inserted. 
+ vector expected_rows; + for (const KuduPartialRow* row : inserted_rows_) { + int32_t val; + CHECK_OK(row->GetInt32(col_name, &val)); + if (val >= lower && val <= upper) { + expected_rows.push_back("(" + row->ToString() + ")"); + } + } + std::sort(expected_rows.begin(), expected_rows.end()); + + ASSERT_EQ(expected_rows.size(), rows.size()); + ASSERT_EQ(expected_rows, rows); +} + +void FlexPartitioningITest::CheckPKRangeScan(int lower, int upper) { + KuduScanner scanner(table_.get()); + scanner.SetTimeoutMillis(60000); + ASSERT_OK(scanner.AddLowerBound(*inserted_rows_[lower])); + ASSERT_OK(scanner.AddExclusiveUpperBound(*inserted_rows_[upper])); + vector rows; + ScanToStrings(&scanner, &rows); + std::sort(rows.begin(), rows.end()); + + vector expected_rows; + for (int i = lower; i < upper; i++) { + expected_rows.push_back("(" + inserted_rows_[i]->ToString() + ")"); + } + std::sort(expected_rows.begin(), expected_rows.end()); + + ASSERT_EQ(rows.size(), expected_rows.size()); + ASSERT_EQ(rows, expected_rows); +} + +void FlexPartitioningITest::CheckPartitionKeyRangeScan() { + master::GetTableLocationsResponsePB table_locations; + ASSERT_OK(GetTableLocations(cluster_->master_proxy(), + table_->name(), + MonoDelta::FromSeconds(32), + &table_locations)); + + vector rows; + + for (const master::TabletLocationsPB& tablet_locations : + table_locations.tablet_locations()) { + + string partition_key_start = tablet_locations.partition().partition_key_start(); + string partition_key_end = tablet_locations.partition().partition_key_end(); + + KuduScanner scanner(table_.get()); + scanner.SetTimeoutMillis(60000); + ASSERT_OK(scanner.AddLowerBoundPartitionKeyRaw(partition_key_start)); + ASSERT_OK(scanner.AddExclusiveUpperBoundPartitionKeyRaw(partition_key_end)); + ScanToStrings(&scanner, &rows); + } + std::sort(rows.begin(), rows.end()); + + vector expected_rows; + for (KuduPartialRow* row : inserted_rows_) { + expected_rows.push_back("(" + row->ToString() + ")"); + } + 
std::sort(expected_rows.begin(), expected_rows.end()); + + ASSERT_EQ(rows.size(), expected_rows.size()); + ASSERT_EQ(rows, expected_rows); +} + +void FlexPartitioningITest::CheckPartitionKeyRangeScanWithPKRange(int lower, int upper) { + master::GetTableLocationsResponsePB table_locations; + ASSERT_OK(GetTableLocations(cluster_->master_proxy(), + table_->name(), + MonoDelta::FromSeconds(32), + &table_locations)); + + vector rows; + + for (const master::TabletLocationsPB& tablet_locations : + table_locations.tablet_locations()) { + + string partition_key_start = tablet_locations.partition().partition_key_start(); + string partition_key_end = tablet_locations.partition().partition_key_end(); + + KuduScanner scanner(table_.get()); + scanner.SetTimeoutMillis(60000); + ASSERT_OK(scanner.AddLowerBoundPartitionKeyRaw(partition_key_start)); + ASSERT_OK(scanner.AddExclusiveUpperBoundPartitionKeyRaw(partition_key_end)); + ASSERT_OK(scanner.AddLowerBound(*inserted_rows_[lower])); + ASSERT_OK(scanner.AddExclusiveUpperBound(*inserted_rows_[upper])); + ScanToStrings(&scanner, &rows); + } + std::sort(rows.begin(), rows.end()); + + vector expected_rows; + for (int i = lower; i < upper; i++) { + expected_rows.push_back("(" + inserted_rows_[i]->ToString() + ")"); + } + std::sort(expected_rows.begin(), expected_rows.end()); + + ASSERT_EQ(rows.size(), expected_rows.size()); + ASSERT_EQ(rows, expected_rows); +} + +void FlexPartitioningITest::InsertAndVerifyScans() { + ASSERT_OK(InsertRandomRows()); + + // First, ensure that we get back the same number we put in. + { + vector rows; + ScanTableToStrings(table_.get(), &rows); + std::sort(rows.begin(), rows.end()); + ASSERT_EQ(kNumRows, rows.size()); + } + + // Perform some scans with predicates. + + // 1) Various predicates on 'c0', which has non-random data. + // We concentrate around the value '500' since there is a split point + // there. 
+ NO_FATALS(CheckScanWithColumnPredicate("c0", 100, 120)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 490, 610)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 499, 499)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 500, 500)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 501, 501)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 499, 501)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 499, 500)); + NO_FATALS(CheckScanWithColumnPredicate("c0", 500, 501)); + + // 2) Random range predicates on the other columns, which are random ints. + for (int col_idx = 1; col_idx < table_->schema().num_columns(); col_idx++) { + SCOPED_TRACE(col_idx); + for (int i = 0; i < 10; i++) { + int32_t lower = random_.Next32(); + int32_t upper = random_.Next32(); + if (upper < lower) { + std::swap(lower, upper); + } + + NO_FATALS(CheckScanWithColumnPredicate(table_->schema().Column(col_idx).name(), + lower, upper)); + } + } + + // 3) Use the "primary key range" API. + { + NO_FATALS(CheckPKRangeScan(100, 120)); + NO_FATALS(CheckPKRangeScan(490, 610)); + NO_FATALS(CheckPKRangeScan(499, 499)); + NO_FATALS(CheckPKRangeScan(500, 500)); + NO_FATALS(CheckPKRangeScan(501, 501)); + NO_FATALS(CheckPKRangeScan(499, 501)); + NO_FATALS(CheckPKRangeScan(499, 500)); + NO_FATALS(CheckPKRangeScan(500, 501)); + } + + // 4) Use the Per-tablet "partition key range" API. + { + NO_FATALS(CheckPartitionKeyRangeScan()); + } + + // 5) Use the Per-tablet "partition key range" API with primary key range. 
+ { + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(100, 120)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(200, 400)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(490, 610)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(499, 499)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(500, 500)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(501, 501)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(499, 501)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(499, 500)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(500, 501)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(650, 700)); + NO_FATALS(CheckPartitionKeyRangeScanWithPKRange(700, 800)); + } +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY (c0, c1), +// ); +TEST_F(FlexPartitioningITest, TestSimplePartitioning) { + NO_FATALS(CreateTable(1, // 2 columns + vector(), 0, // No hash buckets + vector(), 0, // No hash buckets + { "c0" }, // no range partitioning + 2)); // 1 split; + ASSERT_EQ(2, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32 PRIMARY KEY, +// BUCKET BY (c0) INTO 3 BUCKETS +// ); +TEST_F(FlexPartitioningITest, TestSinglePKBucketed) { + NO_FATALS(CreateTable(1, // 1 column + { "c0" }, 3, // bucket by "c0" in 3 buckets + vector(), 0, // no other buckets + { "c0" }, // default range + 2)); // one split + ASSERT_EQ(6, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// BUCKET BY (c1) INTO 3 BUCKETS +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_BucketOnSecondColumn) { + NO_FATALS(CreateTable(2, // 2 columns + { "c1" }, 3, // bucket by "c0" in 3 buckets + vector(), 0, // no other buckets + { "c0", "c1" }, // default range + 1)); // no splits; + ASSERT_EQ(3, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY 
(c1, c0) +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_RangePartitionByReversedPK) { + NO_FATALS(CreateTable(2, // 2 columns + vector(), 0, // no buckets + vector(), 0, // no buckets + { "c1", "c0" }, // range partition by reversed PK + 2)); // one split + ASSERT_EQ(2, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY (c0) +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_RangePartitionByPKPrefix) { + NO_FATALS(CreateTable(2, // 2 columns + vector(), 0, // no buckets + vector(), 0, // no buckets + { "c0" }, // range partition by c0 + 2)); // one split + ASSERT_EQ(2, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY (c1) +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_RangePartitionByPKSuffix) { + NO_FATALS(CreateTable(2, // 2 columns + vector(), 0, // no buckets + vector(), 0, // no buckets + { "c1" }, // range partition by c1 + 2)); // one split + ASSERT_EQ(2, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY (c0), +// BUCKET BY (c1) INTO 4 BUCKETS +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_RangeAndBucket) { + NO_FATALS(CreateTable(2, // 2 columns + { "c1" }, 4, // BUCKET BY c1 INTO 4 BUCKETS + vector(), 0, // no buckets + { "c0" }, // range partition by c0 + 2)); // 1 split; + ASSERT_EQ(8, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// BUCKET BY (c1) INTO 4 BUCKETS, +// BUCKET BY (c0) INTO 3 BUCKETS +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_MultipleBucketings) { + NO_FATALS(CreateTable(2, // 2 columns + { "c1" }, 4, // BUCKET BY c1 INTO 4 BUCKETS + { "c0" }, 3, // BUCKET BY c0 INTO 3 BUCKETS + { "c0", "c1" }, // default range partitioning + 2)); // 1 split; 
+ ASSERT_EQ(4 * 3 * 2, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY (), +// BUCKET BY (c0) INTO 4 BUCKETS, +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_SingleBucketNoRange) { + NO_FATALS(CreateTable(2, // 2 columns + { "c0" }, 4, // BUCKET BY c0 INTO 4 BUCKETS + vector(), 0, // no buckets + vector(), // no range partitioning + 1)); // 0 splits; + ASSERT_EQ(4, CountTablets()); + + InsertAndVerifyScans(); +} + +// CREATE TABLE t ( +// c0 INT32, +// c1 INT32, +// PRIMARY KEY (c0, c1) +// RANGE PARTITION BY (), +// BUCKET BY (c0) INTO 4 BUCKETS, +// BUCKET BY (c1) INTO 5 BUCKETS, +// ); +TEST_F(FlexPartitioningITest, TestCompositePK_MultipleBucketingsNoRange) { + NO_FATALS(CreateTable(2, // 2 columns + { "c0" }, 4, // BUCKET BY c0 INTO 4 BUCKETS + { "c1" }, 5, // BUCKET BY c1 INTO 5 BUCKETS + vector(), // no range partitioning + 1)); // 0 splits; + ASSERT_EQ(20, CountTablets()); + + InsertAndVerifyScans(); +} + +} // namespace itest +} // namespace kudu diff --git a/src/kudu/integration-tests/full_stack-insert-scan-test.cc b/src/kudu/integration-tests/full_stack-insert-scan-test.cc new file mode 100644 index 000000000000..3a07af1428c0 --- /dev/null +++ b/src/kudu/integration-tests/full_stack-insert-scan-test.cc @@ -0,0 +1,463 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/client/callbacks.h" +#include "kudu/client/client.h" +#include "kudu/client/client-test-util.h" +#include "kudu/client/row_result.h" +#include "kudu/client/write_op.h" +#include "kudu/codegen/compilation_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/async_util.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/errno.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/util/status.h" +#include "kudu/util/subprocess.h" +#include "kudu/util/thread.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" + +// Test size parameters +DEFINE_int32(concurrent_inserts, -1, "Number of inserting clients to launch"); +DEFINE_int32(inserts_per_client, -1, + "Number of rows inserted by each inserter client"); +DEFINE_int32(rows_per_batch, -1, "Number of rows per client batch"); + +// Perf-related FLAGS_perf_stat 
+DEFINE_bool(perf_record_scan, false, "Call \"perf record --call-graph\" " + "for the duration of the scan, disabled by default"); +DEFINE_bool(perf_stat_scan, false, "Print \"perf stat\" results during" + "scan to stdout, disabled by default"); +DEFINE_bool(perf_fp_flag, false, "Only applicable with --perf_record_scan," + " provides argument \"fp\" to the --call-graph flag"); +DECLARE_bool(enable_maintenance_manager); + +using std::string; +using std::vector; + +namespace kudu { +namespace tablet { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduInsert; +using client::KuduRowResult; +using client::KuduScanner; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduSession; +using client::KuduStatusMemberCallback; +using client::KuduTable; +using client::KuduTableCreator; +using strings::Split; +using strings::Substitute; + +class FullStackInsertScanTest : public KuduTest { + protected: + FullStackInsertScanTest() + : // Set the default value depending on whether slow tests are allowed + kNumInsertClients(DefaultFlag(FLAGS_concurrent_inserts, 3, 10)), + kNumInsertsPerClient(DefaultFlag(FLAGS_inserts_per_client, 500, 50000)), + kNumRows(kNumInsertClients * kNumInsertsPerClient), + kFlushEveryN(DefaultFlag(FLAGS_rows_per_batch, 125, 5000)), + random_(SeedRandom()), + sessions_(kNumInsertClients), + tables_(kNumInsertClients) { + + // schema has kNumIntCols contiguous columns of Int32 and Int64, in order. 
+ KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT64)->NotNull()->PrimaryKey(); + b.AddColumn("string_val")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("int32_val1")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("int32_val2")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("int32_val3")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("int32_val4")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("int64_val1")->Type(KuduColumnSchema::INT64)->NotNull(); + b.AddColumn("int64_val2")->Type(KuduColumnSchema::INT64)->NotNull(); + b.AddColumn("int64_val3")->Type(KuduColumnSchema::INT64)->NotNull(); + b.AddColumn("int64_val4")->Type(KuduColumnSchema::INT64)->NotNull(); + CHECK_OK(b.Build(&schema_)); + } + + const int kNumInsertClients; + const int kNumInsertsPerClient; + const int kNumRows; + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + } + + void CreateTable() { + ASSERT_GE(kNumInsertClients, 0); + ASSERT_GE(kNumInsertsPerClient, 0); + NO_FATALS(InitCluster()); + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(1) + .Create()); + ASSERT_OK(client_->OpenTable(kTableName, &reader_table_)); + } + + virtual void TearDown() OVERRIDE { + if (cluster_) { + cluster_->Shutdown(); + } + KuduTest::TearDown(); + } + + void DoConcurrentClientInserts(); + void DoTestScans(); + void FlushToDisk(); + + private: + int DefaultFlag(int flag, int fast, int slow) { + if (flag != -1) return flag; + if (AllowSlowTests()) return slow; + return fast; + } + + // Generate random row according to schema_. 
+ static void RandomRow(Random* rng, KuduPartialRow* row, + char* buf, int64_t key, int id); + + void InitCluster() { + // Start mini-cluster with 1 tserver, config client options + cluster_.reset(new MiniCluster(env_.get(), MiniClusterOptions())); + ASSERT_OK(cluster_->Start()); + KuduClientBuilder builder; + builder.add_master_server_addr( + cluster_->mini_master()->bound_rpc_addr_str()); + builder.default_rpc_timeout(MonoDelta::FromSeconds(30)); + ASSERT_OK(builder.Build(&client_)); + } + + // Adds newly generated client's session and table pointers to arrays at id + void CreateNewClient(int id) { + ASSERT_OK(client_->OpenTable(kTableName, &tables_[id])); + client::sp::shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(kSessionTimeoutMs); + ASSERT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + sessions_[id] = session; + } + + // Insert the rows that are associated with that ID. + void InsertRows(CountDownLatch* start_latch, int id, uint32_t seed); + + // Run a scan from the reader_client_ with the projection schema schema + // and LOG_TIMING message msg. 
+ void ScanProjection(const vector& cols, const string& msg); + + vector AllColumnNames() const; + vector StringColumnNames() const; + vector Int32ColumnNames() const; + vector Int64ColumnNames() const; + + static const char* const kTableName; + static const int kSessionTimeoutMs = 60000; + static const int kRandomStrMinLength = 16; + static const int kRandomStrMaxLength = 31; + static const int kNumIntCols = 4; + enum { + kKeyCol, + kStrCol, + kInt32ColBase, + kInt64ColBase = kInt32ColBase + kNumIntCols + }; + const int kFlushEveryN; + + Random random_; + + KuduSchema schema_; + std::shared_ptr cluster_; + client::sp::shared_ptr client_; + client::sp::shared_ptr reader_table_; + // Concurrent client insertion test variables + vector > sessions_; + vector > tables_; +}; + +namespace { + +gscoped_ptr MakePerfStat() { + if (!FLAGS_perf_stat_scan) return gscoped_ptr(); + // No output flag for perf-stat 2.x, just print to output + string cmd = Substitute("perf stat --pid=$0", getpid()); + LOG(INFO) << "Calling: \"" << cmd << "\""; + return gscoped_ptr(new Subprocess("perf", Split(cmd, " "))); +} + +gscoped_ptr MakePerfRecord() { + if (!FLAGS_perf_record_scan) return gscoped_ptr(); + string cmd = Substitute("perf record --pid=$0 --call-graph", getpid()); + if (FLAGS_perf_fp_flag) cmd += " fp"; + LOG(INFO) << "Calling: \"" << cmd << "\""; + return gscoped_ptr(new Subprocess("perf", Split(cmd, " "))); +} + +void InterruptNotNull(gscoped_ptr sub) { + if (!sub) return; + ASSERT_OK(sub->Kill(SIGINT)); + int exit_status = 0; + ASSERT_OK(sub->Wait(&exit_status)); + if (!exit_status) { + LOG(WARNING) << "Subprocess returned " << exit_status + << ": " << ErrnoToString(exit_status); + } +} + +// If key is approximately at an even multiple of 1/10 of the way between +// start and end, then a % completion update is printed to LOG(INFO) +// Assumes that end - start + 1 fits into an int +void ReportTenthDone(int64_t key, int64_t start, int64_t end, + int id, int numids) { + int done 
= key - start + 1; + int total = end - start + 1; + if (total < 10) return; + if (done % (total / 10) == 0) { + int percent = done * 100 / total; + LOG(INFO) << "Insertion thread " << id << " of " + << numids << " is "<< percent << "% done."; + } +} + +void ReportAllDone(int id, int numids) { + LOG(INFO) << "Insertion thread " << id << " of " + << numids << " is 100% done."; +} + +} // anonymous namespace + +const char* const FullStackInsertScanTest::kTableName = "full-stack-mrs-test-tbl"; + +TEST_F(FullStackInsertScanTest, MRSOnlyStressTest) { + FLAGS_enable_maintenance_manager = false; + NO_FATALS(CreateTable()); + NO_FATALS(DoConcurrentClientInserts()); + NO_FATALS(DoTestScans()); +} + +TEST_F(FullStackInsertScanTest, WithDiskStressTest) { + NO_FATALS(CreateTable()); + NO_FATALS(DoConcurrentClientInserts()); + NO_FATALS(FlushToDisk()); + NO_FATALS(DoTestScans()); +} + +void FullStackInsertScanTest::DoConcurrentClientInserts() { + vector > threads(kNumInsertClients); + CountDownLatch start_latch(kNumInsertClients + 1); + for (int i = 0; i < kNumInsertClients; ++i) { + NO_FATALS(CreateNewClient(i)); + ASSERT_OK(Thread::Create(CURRENT_TEST_NAME(), + StrCat(CURRENT_TEST_CASE_NAME(), "-id", i), + &FullStackInsertScanTest::InsertRows, this, + &start_latch, i, random_.Next(), &threads[i])); + start_latch.CountDown(); + } + LOG_TIMING(INFO, + strings::Substitute("concurrent inserts ($0 rows, $1 threads)", + kNumRows, kNumInsertClients)) { + start_latch.CountDown(); + for (const scoped_refptr& thread : threads) { + ASSERT_OK(ThreadJoiner(thread.get()) + .warn_every_ms(15000) + .Join()); + } + } +} + +void FullStackInsertScanTest::DoTestScans() { + LOG(INFO) << "Doing test scans on table of " << kNumRows << " rows."; + + gscoped_ptr stat = MakePerfStat(); + gscoped_ptr record = MakePerfRecord(); + if (stat) stat->Start(); + if (record) record->Start(); + + NO_FATALS(ScanProjection(vector(), "empty projection, 0 col")); + NO_FATALS(ScanProjection({ "key" }, "key scan, 1 
col")); + NO_FATALS(ScanProjection(AllColumnNames(), "full schema scan, 10 col")); + NO_FATALS(ScanProjection(StringColumnNames(), "String projection, 1 col")); + NO_FATALS(ScanProjection(Int32ColumnNames(), "Int32 projection, 4 col")); + NO_FATALS(ScanProjection(Int64ColumnNames(), "Int64 projection, 4 col")); + + NO_FATALS(InterruptNotNull(record.Pass())); + NO_FATALS(InterruptNotNull(stat.Pass())); +} + +void FullStackInsertScanTest::FlushToDisk() { + for (int i = 0; i < cluster_->num_tablet_servers(); ++i) { + tserver::TabletServer* ts = cluster_->mini_tablet_server(i)->server(); + ts->maintenance_manager()->Shutdown(); + tserver::TSTabletManager* tm = ts->tablet_manager(); + vector > peers; + tm->GetTabletPeers(&peers); + for (const scoped_refptr& peer : peers) { + Tablet* tablet = peer->tablet(); + if (!tablet->MemRowSetEmpty()) { + ASSERT_OK(tablet->Flush()); + } + ASSERT_OK(tablet->Compact(Tablet::FORCE_COMPACT_ALL)); + } + } +} + +void FullStackInsertScanTest::InsertRows(CountDownLatch* start_latch, int id, + uint32_t seed) { + Random rng(seed + id); + + start_latch->Wait(); + // Retrieve id's session and table + client::sp::shared_ptr session = sessions_[id]; + client::sp::shared_ptr table = tables_[id]; + // Identify start and end of keyrange id is responsible for + int64_t start = kNumInsertsPerClient * id; + int64_t end = start + kNumInsertsPerClient; + // Printed id value is in the range 1..kNumInsertClients inclusive + ++id; + // Use synchronizer to keep 1 asynchronous batch flush maximum + Synchronizer sync; + KuduStatusMemberCallback cb(&sync, &Synchronizer::StatusCB); + // Prime the synchronizer as if it was running a batch (for for-loop code) + cb.Run(Status::OK()); + // Maintain buffer for random string generation + char randstr[kRandomStrMaxLength + 1]; + // Insert in the id's key range + for (int64_t key = start; key < end; ++key) { + gscoped_ptr insert(table->NewInsert()); + RandomRow(&rng, insert->mutable_row(), randstr, key, id); + 
CHECK_OK(session->Apply(insert.release())); + + // Report updates or flush every so often, using the synchronizer to always + // start filling up the next batch while previous one is sent out. + if (key % kFlushEveryN == 0) { + Status s = sync.Wait(); + if (!s.ok()) { + LogSessionErrorsAndDie(session, s); + } + sync.Reset(); + session->FlushAsync(&cb); + } + ReportTenthDone(key, start, end, id, kNumInsertClients); + } + ReportAllDone(id, kNumInsertClients); + Status s = sync.Wait(); + if (!s.ok()) { + LogSessionErrorsAndDie(session, s); + } + FlushSessionOrDie(session); +} + +void FullStackInsertScanTest::ScanProjection(const vector& cols, + const string& msg) { + { + // Warmup codegen cache + KuduScanner scanner(reader_table_.get()); + ASSERT_OK(scanner.SetProjectedColumns(cols)); + ASSERT_OK(scanner.Open()); + codegen::CompilationManager::GetSingleton()->Wait(); + } + KuduScanner scanner(reader_table_.get()); + ASSERT_OK(scanner.SetProjectedColumns(cols)); + uint64_t nrows = 0; + LOG_TIMING(INFO, msg) { + ASSERT_OK(scanner.Open()); + vector rows; + while (scanner.HasMoreRows()) { + ASSERT_OK(scanner.NextBatch(&rows)); + nrows += rows.size(); + } + } + ASSERT_EQ(nrows, kNumRows); +} + +// Fills in the fields for a row as defined by the Schema below +// name: (key, string_val, int32_val$, int64_val$) +// type: (int64_t, string, int32_t x4, int64_t x4) +// The first int32 gets the id and the first int64 gets the thread +// id. The key is assigned to "key," and the other fields are random. 
+void FullStackInsertScanTest::RandomRow(Random* rng, KuduPartialRow* row, char* buf, + int64_t key, int id) { + CHECK_OK(row->SetInt64(kKeyCol, key)); + int len = kRandomStrMinLength + + rng->Uniform(kRandomStrMaxLength - kRandomStrMinLength + 1); + RandomString(buf, len, rng); + buf[len] = '\0'; + CHECK_OK(row->SetStringCopy(kStrCol, buf)); + CHECK_OK(row->SetInt32(kInt32ColBase, id)); + CHECK_OK(row->SetInt64(kInt64ColBase, Thread::current_thread()->tid())); + for (int i = 1; i < kNumIntCols; ++i) { + CHECK_OK(row->SetInt32(kInt32ColBase + i, rng->Next32())); + CHECK_OK(row->SetInt64(kInt64ColBase + i, rng->Next64())); + } +} + +vector FullStackInsertScanTest::AllColumnNames() const { + vector ret; + for (int i = 0; i < schema_.num_columns(); i++) { + ret.push_back(schema_.Column(i).name()); + } + return ret; +} + +vector FullStackInsertScanTest::StringColumnNames() const { + return { "string_val" }; +} + +vector FullStackInsertScanTest::Int32ColumnNames() const { + return { "int32_val1", + "int32_val2", + "int32_val3", + "int32_val4" }; +} + +vector FullStackInsertScanTest::Int64ColumnNames() const { + return { "int64_val1", + "int64_val2", + "int64_val3", + "int64_val4" }; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/integration-tests/linked_list-test-util.h b/src/kudu/integration-tests/linked_list-test-util.h new file mode 100644 index 000000000000..854e103faa39 --- /dev/null +++ b/src/kudu/integration-tests/linked_list-test-util.h @@ -0,0 +1,868 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/client-test-util.h" +#include "kudu/client/row_result.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/walltime.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/tablet/tablet.h" +#include "kudu/util/atomic.h" +#include "kudu/util/blocking_queue.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/monotime.h" +#include "kudu/util/random.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/thread.h" + +namespace kudu { + +static const char* const kKeyColumnName = "rand_key"; +static const char* const kLinkColumnName = "link_to"; +static const char* const kInsertTsColumnName = "insert_ts"; +static const char* const kUpdatedColumnName = "updated"; +static const int64_t kNoSnapshot = -1; +static const int64_t kNoParticularCountExpected = -1; + +// Vector of snapshot timestamp, count pairs. +typedef vector > SnapsAndCounts; + +// Provides methods for writing data and reading it back in such a way that +// facilitates checking for data integrity. 
+class LinkedListTester {
+ public:
+  // Builds the 4-column linked-list schema (key, link, insert timestamp,
+  // updated flag) and stores the test parameters; does not touch the cluster.
+  // NOTE(review): "shared_ptr" and "boost::function" below lost their
+  // template arguments in this patch's mangling — restore from upstream.
+  LinkedListTester(client::sp::shared_ptr client,
+                   std::string table_name, int num_chains, int num_tablets,
+                   int num_replicas, bool enable_mutation)
+      : verify_projection_(
+            {kKeyColumnName, kLinkColumnName, kUpdatedColumnName}),
+        table_name_(std::move(table_name)),
+        num_chains_(num_chains),
+        num_tablets_(num_tablets),
+        num_replicas_(num_replicas),
+        enable_mutation_(enable_mutation),
+        latency_histogram_(1000000, 3),
+        client_(std::move(client)) {
+    client::KuduSchemaBuilder b;
+
+    b.AddColumn(kKeyColumnName)->Type(client::KuduColumnSchema::INT64)->NotNull()->PrimaryKey();
+    b.AddColumn(kLinkColumnName)->Type(client::KuduColumnSchema::INT64)->NotNull();
+    b.AddColumn(kInsertTsColumnName)->Type(client::KuduColumnSchema::INT64)->NotNull();
+    b.AddColumn(kUpdatedColumnName)->Type(client::KuduColumnSchema::BOOL)->NotNull()
+        ->Default(client::KuduValue::FromBool(false));
+    CHECK_OK(b.Build(&schema_));
+  }
+
+  // Create the table.
+  Status CreateLinkedListTable();
+
+  // Load the table with the linked list test pattern.
+  //
+  // Runs for the amount of time designated by 'run_for'.
+  // Sets *written_count to the number of rows inserted.
+  Status LoadLinkedList(
+      const MonoDelta& run_for,
+      int num_samples,
+      int64_t *written_count);
+
+  // Variant of VerifyLinkedListRemote that verifies at the specified snapshot timestamp.
+  Status VerifyLinkedListAtSnapshotRemote(const uint64_t snapshot_timestamp,
+                                          const int64_t expected,
+                                          const bool log_errors,
+                                          const boost::function& cb,
+                                          int64_t* verified_count) {
+    return VerifyLinkedListRemote(snapshot_timestamp,
+                                  expected,
+                                  log_errors,
+                                  cb,
+                                  verified_count);
+  }
+
+  // Variant of VerifyLinkedListRemote that verifies without specifying a snapshot timestamp.
+  Status VerifyLinkedListNoSnapshotRemote(const int64_t expected,
+                                          const bool log_errors,
+                                          int64_t* verified_count) {
+    return VerifyLinkedListRemote(kNoSnapshot,
+                                  expected,
+                                  log_errors,
+                                  boost::bind(&LinkedListTester::ReturnOk, this, _1),
+                                  verified_count);
+  }
+
+  // Run the verify step on a table with RPCs. Calls the provided callback 'cb' once during
+  // verification to test scanner fault tolerance.
+  Status VerifyLinkedListRemote(const uint64_t snapshot_timestamp,
+                                const int64_t expected,
+                                const bool log_errors,
+                                const boost::function& cb,
+                                int64_t* verified_count);
+
+  // Run the verify step on a specific tablet.
+  Status VerifyLinkedListLocal(const tablet::Tablet* tablet,
+                               const int64_t expected,
+                               int64_t* verified_count);
+
+  // A variant of VerifyLinkedListRemote that is more robust towards ongoing
+  // bootstrapping and replication.
+  Status WaitAndVerify(int seconds_to_run,
+                       int64_t expected) {
+    return WaitAndVerify(seconds_to_run,
+                         expected,
+                         boost::bind(&LinkedListTester::ReturnOk, this, _1));
+  }
+
+  // A variant of WaitAndVerify that also takes a callback to be run once during verification.
+  Status WaitAndVerify(int seconds_to_run,
+                       int64_t expected,
+                       const boost::function& cb);
+
+  // Generates a vector of keys for the table such that each tablet is
+  // responsible for an equal fraction of the int64 key space.
+  std::vector GenerateSplitRows(const client::KuduSchema& schema);
+
+  // Generate a vector of ints which form the split keys.
+  std::vector GenerateSplitInts();
+
+  // Dump the insert-latency histogram to stdout; optionally also dump gflags.
+  void DumpInsertHistogram(bool print_flags);
+
+ protected:
+  client::KuduSchema schema_;
+  const std::vector verify_projection_;
+  const std::string table_name_;
+  const int num_chains_;
+  const int num_tablets_;
+  const int num_replicas_;
+  const bool enable_mutation_;
+  HdrHistogram latency_histogram_;
+  client::sp::shared_ptr client_;
+  SnapsAndCounts sampled_timestamps_and_counts_;
+
+ private:
+  // No-op callback used when the caller does not supply one.
+  Status ReturnOk(const std::string& str) { return Status::OK(); }
+};
+
+// Generates the linked list pattern.
+// Since we can insert multiple chain in parallel, this encapsulates the
+// state for each chain.
+class LinkedListChainGenerator {
+ public:
+  // 'chain_idx' is a unique ID for this chain. Chains with different indexes
+  // will always generate distinct sets of keys (thus avoiding the possibility of
+  // a collision even in a longer run).
+  explicit LinkedListChainGenerator(uint8_t chain_idx)
+      : chain_idx_(chain_idx),
+        rand_(chain_idx * 0xDEADBEEF),
+        prev_key_(0) {
+    // NOTE(review): chain_idx is uint8_t, so this check is always true.
+    CHECK_LT(chain_idx, 256);
+  }
+
+  ~LinkedListChainGenerator() {
+  }
+
+  // Generate a random 64-bit unsigned int.
+  uint64_t Rand64() {
+    return (implicit_cast(rand_.Next()) << 32) | rand_.Next();
+  }
+
+  // Apply (but do not flush) one insert continuing this chain: a fresh random
+  // key whose low byte is the chain index, linking back to the previous key.
+  Status GenerateNextInsert(client::KuduTable* table, client::KuduSession* session) {
+    // Encode the chain index in the lowest 8 bits so that different chains never
+    // intersect.
+    int64_t this_key = (Rand64() << 8) | chain_idx_;
+    int64_t ts = GetCurrentTimeMicros();
+
+    gscoped_ptr insert(table->NewInsert());
+    CHECK_OK(insert->mutable_row()->SetInt64(kKeyColumnName, this_key));
+    CHECK_OK(insert->mutable_row()->SetInt64(kInsertTsColumnName, ts));
+    CHECK_OK(insert->mutable_row()->SetInt64(kLinkColumnName, prev_key_));
+    RETURN_NOT_OK_PREPEND(session->Apply(insert.release()),
+                          strings::Substitute("Unable to apply insert with key $0 at ts $1",
+                                              this_key, ts));
+    prev_key_ = this_key;
+    return Status::OK();
+  }
+
+  // Key of the most recently generated insert (0 before the first insert).
+  int64_t prev_key() const {
+    return prev_key_;
+  }
+
+ private:
+  const uint8_t chain_idx_;
+
+  // This is a linear congruential random number generator, so it won't repeat until
+  // it has exhausted its period (which is quite large)
+  Random rand_;
+
+  // The previously output key.
+  int64_t prev_key_;
+
+  DISALLOW_COPY_AND_ASSIGN(LinkedListChainGenerator);
+};
+
+// A thread that updates the timestamps of rows whose keys are put in its BlockingQueue.
+class ScopedRowUpdater {
+ public:
+
+  // Create and start a new ScopedUpdater. 'table' must remain valid for
+  // the lifetime of this object.
+  explicit ScopedRowUpdater(client::KuduTable* table)
+      : table_(table),
+        to_update_(kint64max) { // no limit
+    CHECK_OK(Thread::Create("linked_list-test", "updater",
+                            &ScopedRowUpdater::RowUpdaterThread, this, &updater_));
+  }
+
+  // Shuts down the queue and joins the updater thread before destruction.
+  ~ScopedRowUpdater() {
+    to_update_.Shutdown();
+    if (updater_) {
+      updater_->Join();
+    }
+  }
+
+  BlockingQueue* to_update() { return &to_update_; }
+
+ private:
+  // Thread body: drains the queue, setting 'updated' = true on each keyed row,
+  // flushing every 50 updates and once more on shutdown.
+  void RowUpdaterThread() {
+    client::sp::shared_ptr session(table_->client()->NewSession());
+    session->SetTimeoutMillis(15000);
+    CHECK_OK(session->SetFlushMode(client::KuduSession::MANUAL_FLUSH));
+
+    int64_t next_key;
+    int64_t updated_count = 0;
+    while (to_update_.BlockingGet(&next_key)) {
+      gscoped_ptr update(table_->NewUpdate());
+      CHECK_OK(update->mutable_row()->SetInt64(kKeyColumnName, next_key));
+      CHECK_OK(update->mutable_row()->SetBool(kUpdatedColumnName, true));
+      CHECK_OK(session->Apply(update.release()));
+      if (++updated_count % 50 == 0) {
+        FlushSessionOrDie(session);
+      }
+    }
+
+    FlushSessionOrDie(session);
+  }
+
+  client::KuduTable* table_;
+  BlockingQueue to_update_;
+  scoped_refptr updater_;
+};
+
+// A thread that periodically checks tablet and master web pages during the
+// linked list test.
+class PeriodicWebUIChecker {
+ public:
+  // Builds the list of master/tserver web UI URLs to poll and starts the
+  // polling thread immediately.
+  PeriodicWebUIChecker(const ExternalMiniCluster& cluster,
+                       const std::string& tablet_id, MonoDelta period)
+      : period_(std::move(period)), is_running_(true) {
+    // List of master and ts web pages to fetch
+    vector master_pages, ts_pages;
+
+    master_pages.push_back("/metrics");
+    master_pages.push_back("/masters");
+    master_pages.push_back("/tables");
+    master_pages.push_back("/dump-entities");
+    master_pages.push_back("/tablet-servers");
+
+    ts_pages.push_back("/metrics");
+    ts_pages.push_back("/tablets");
+    ts_pages.push_back(strings::Substitute("/transactions?tablet_id=$0", tablet_id));
+
+    // Generate list of urls for each master and tablet server
+    // NOTE(review): these range-for loops iterate by value, copying each
+    // string; a const reference would avoid the copies.
+    for (int i = 0; i < cluster.num_masters(); i++) {
+      for (std::string page : master_pages) {
+        urls_.push_back(strings::Substitute(
+            "http://$0$1",
+            cluster.master(i)->bound_http_hostport().ToString(),
+            page));
+      }
+    }
+    for (int i = 0; i < cluster.num_tablet_servers(); i++) {
+      for (std::string page : ts_pages) {
+        urls_.push_back(strings::Substitute(
+            "http://$0$1",
+            cluster.tablet_server(i)->bound_http_hostport().ToString(),
+            page));
+      }
+    }
+    CHECK_OK(Thread::Create("linked_list-test", "checker",
+                            &PeriodicWebUIChecker::CheckThread, this, &checker_));
+  }
+
+  // Signals the polling thread to stop and joins it.
+  ~PeriodicWebUIChecker() {
+    LOG(INFO) << "Shutting down curl thread";
+    is_running_.Store(false);
+    if (checker_) {
+      checker_->Join();
+    }
+  }
+
+ private:
+  // Thread body: round-robins over urls_, fetching one URL per period until
+  // is_running_ is cleared. Fetch failures are tolerated; a successful fetch
+  // must return a non-empty body.
+  void CheckThread() {
+    EasyCurl curl;
+    faststring dst;
+    LOG(INFO) << "Curl thread will poll the following URLs every " << period_.ToMilliseconds()
+              << " ms: ";
+    for (std::string url : urls_) {
+      LOG(INFO) << url;
+    }
+    for (int count = 0; is_running_.Load(); count++) {
+      const std::string &url = urls_[count % urls_.size()];
+      LOG(INFO) << "Curling URL " << url;
+      const MonoTime start = MonoTime::Now(MonoTime::FINE);
+      Status status = curl.FetchURL(url, &dst);
+      if (status.ok()) {
+        CHECK_GT(dst.length(), 0);
+      }
+      // Sleep until the next period
+      const MonoTime end = MonoTime::Now(MonoTime::FINE);
+      const MonoDelta elapsed = end.GetDeltaSince(start);
+      const int64_t sleep_ns = period_.ToNanoseconds() - elapsed.ToNanoseconds();
+      if (sleep_ns > 0) {
+        SleepFor(MonoDelta::FromNanoseconds(sleep_ns));
+      }
+    }
+  }
+
+  const MonoDelta period_;
+  AtomicBool is_running_;
+  scoped_refptr checker_;
+  vector urls_;
+};
+
+// Helper class to hold results from a linked list scan and perform the
+// verification step on the data.
+class LinkedListVerifier {
+ public:
+  LinkedListVerifier(int num_chains, bool enable_mutation, int64_t expected,
+                     std::vector split_key_ints);
+
+  // Start the scan timer. The duration between starting the scan and verifying
+  // the data is logged in the VerifyData() step, so this should be called
+  // immediately before starting the table(t) scan.
+  void StartScanTimer();
+
+  // Register a new row result during the verify step.
+  void RegisterResult(int64_t key, int64_t link, bool updated);
+
+  // Run the common verify step once the scanned data is stored.
+  Status VerifyData(int64_t* verified_count, bool log_errors);
+
+ private:
+  // Print a summary of the broken links to the log.
+  void SummarizeBrokenLinks(const std::vector& broken_links);
+
+  const int num_chains_;
+  const int64_t expected_;
+  const bool enable_mutation_;
+  const std::vector split_key_ints_;
+  std::vector seen_key_;
+  std::vector seen_link_to_;
+  int errors_;
+  Stopwatch scan_timer_;
+};
+
+/////////////////////////////////////////////////////////////
+// LinkedListTester
+/////////////////////////////////////////////////////////////
+
+// Build one split row per split int (see GenerateSplitInts). Ownership of the
+// returned KuduPartialRow pointers passes to the caller.
+std::vector LinkedListTester::GenerateSplitRows(
+    const client::KuduSchema& schema) {
+  std::vector split_keys;
+  for (int64_t val : GenerateSplitInts()) {
+    KuduPartialRow* row = schema.NewRow();
+    CHECK_OK(row->SetInt64(kKeyColumnName, val));
+    split_keys.push_back(row);
+  }
+  return split_keys;
+}
+
+// Return num_tablets_ - 1 evenly spaced split points over [0, kint64max).
+std::vector LinkedListTester::GenerateSplitInts() {
+  vector ret;
+  ret.reserve(num_tablets_ - 1);
+  int64_t increment = kint64max / num_tablets_;
+  for (int64_t i = 1; i < num_tablets_; i++) {
+    ret.push_back(i * increment);
+  }
+  return ret;
+}
+
+// Create the test table with the schema built in the constructor, the
+// generated split rows, and the configured replication factor.
+Status LinkedListTester::CreateLinkedListTable() {
+  gscoped_ptr table_creator(client_->NewTableCreator());
+  RETURN_NOT_OK_PREPEND(table_creator->table_name(table_name_)
+                        .schema(&schema_)
+                        .split_rows(GenerateSplitRows(schema_))
+                        .num_replicas(num_replicas_)
+                        .Create(),
+                        "Failed to create table");
+  return Status::OK();
+}
+
+Status LinkedListTester::LoadLinkedList(
+    const MonoDelta& run_for,
+    int num_samples,
+    int64_t *written_count) {
+
+  sampled_timestamps_and_counts_.clear();
+  client::sp::shared_ptr table;
+  RETURN_NOT_OK_PREPEND(client_->OpenTable(table_name_, &table),
+                        "Could not open table " + table_name_);
+
+  // Instantiate a hybrid clock so that we can collect timestamps since we're running the
+  // tablet servers in an external mini cluster.
+  // TODO when they become available (KUDU-420), use client-propagated timestamps
+  // instead of reading from the clock directly. This will allow to run this test
+  // against a "real" cluster and not force the client to be synchronized.
+  scoped_refptr ht_clock(new server::HybridClock());
+  RETURN_NOT_OK(ht_clock->Init());
+
+  MonoTime start = MonoTime::Now(MonoTime::COARSE);
+  MonoTime deadline = start;
+  deadline.AddDelta(run_for);
+
+  client::sp::shared_ptr session = client_->NewSession();
+  session->SetTimeoutMillis(15000);
+  RETURN_NOT_OK_PREPEND(session->SetFlushMode(client::KuduSession::MANUAL_FLUSH),
+                        "Couldn't set flush mode");
+
+  ScopedRowUpdater updater(table.get());
+  std::vector chains;
+  ElementDeleter d(&chains);
+  for (int i = 0; i < num_chains_; i++) {
+    chains.push_back(new LinkedListChainGenerator(i));
+  }
+
+  MonoDelta sample_interval = MonoDelta::FromMicroseconds(run_for.ToMicroseconds() / num_samples);
+  MonoTime next_sample = start;
+  next_sample.AddDelta(sample_interval);
+  LOG(INFO) << "Running for: " << run_for.ToString();
+  LOG(INFO) << "Sampling every " << sample_interval.ToMicroseconds() << " us";
+
+  *written_count = 0;
+  int iter = 0;
+  while (true) {
+    if (iter++ % 10000 == 0) {
+      LOG(INFO) << "Written " << (*written_count) << " rows in chain";
+      DumpInsertHistogram(false);
+    }
+
+    MonoTime now = MonoTime::Now(MonoTime::COARSE);
+    if (next_sample.ComesBefore(now)) {
+      // NOTE(review): this Timestamp 'now' shadows the MonoTime 'now' above;
+      // legal, but worth renaming for clarity.
+      Timestamp now = ht_clock->Now();
+      sampled_timestamps_and_counts_.push_back(
+          pair(now.ToUint64(), *written_count));
+      next_sample.AddDelta(sample_interval);
+      LOG(INFO) << "Sample at HT timestamp: " << now.ToString()
+                << " Inserted count: " << *written_count;
+    }
+    if (deadline.ComesBefore(now)) {
+      LOG(INFO) << "Finished inserting list. Added " << (*written_count) << " in chain";
+      LOG(INFO) << "Last entries inserted had keys:";
+      for (int i = 0; i < num_chains_; i++) {
+        LOG(INFO) << i << ": " << chains[i]->prev_key();
+      }
+      return Status::OK();
+    }
+    // Apply one insert per chain, then flush the whole batch at once.
+    for (LinkedListChainGenerator* chain : chains) {
+      RETURN_NOT_OK_PREPEND(chain->GenerateNextInsert(table.get(), session.get()),
+                            "Unable to generate next insert into linked list chain");
+    }
+
+    MonoTime flush_start(MonoTime::Now(MonoTime::FINE));
+    FlushSessionOrDie(session);
+    MonoDelta elapsed = MonoTime::Now(MonoTime::FINE).GetDeltaSince(flush_start);
+    latency_histogram_.Increment(elapsed.ToMicroseconds());
+
+    (*written_count) += chains.size();
+
+    if (enable_mutation_) {
+      // Rows have been inserted; they're now safe to update.
+      for (LinkedListChainGenerator* chain : chains) {
+        updater.to_update()->Put(chain->prev_key());
+      }
+    }
+  }
+}
+
+void LinkedListTester::DumpInsertHistogram(bool print_flags) {
+  // We dump to cout instead of using glog so the output isn't prefixed with
+  // line numbers. This makes it less ugly to copy-paste into JIRA, etc.
+  using std::cout;
+  using std::endl;
+
+  const HdrHistogram* h = &latency_histogram_;
+
+  cout << "------------------------------------------------------------" << endl;
+  cout << "Histogram for latency of insert operations (microseconds)" << endl;
+  if (print_flags) {
+    cout << "Flags: " << google::CommandlineFlagsIntoString() << endl;
+  }
+  cout << "Note: each insert is a batch of " << num_chains_ << " rows." << endl;
+  cout << "------------------------------------------------------------" << endl;
+  cout << "Count: " << h->TotalCount() << endl;
+  cout << "Mean: " << h->MeanValue() << endl;
+  cout << "Percentiles:" << endl;
+  cout << " 0% (min) = " << h->MinValue() << endl;
+  cout << " 25% = " << h->ValueAtPercentile(25) << endl;
+  cout << " 50% (med) = " << h->ValueAtPercentile(50) << endl;
+  cout << " 75% = " << h->ValueAtPercentile(75) << endl;
+  cout << " 95% = " << h->ValueAtPercentile(95) << endl;
+  cout << " 99% = " << h->ValueAtPercentile(99) << endl;
+  cout << " 99.9% = " << h->ValueAtPercentile(99.9) << endl;
+  cout << " 99.99% = " << h->ValueAtPercentile(99.99) << endl;
+  cout << " 100% (max) = " << h->MaxValue() << endl;
+  if (h->MaxValue() >= h->highest_trackable_value()) {
+    cout << "*NOTE: some values were greater than highest trackable value" << endl;
+  }
+}
+
+// Verify that the given sorted vector does not contain any duplicate entries.
+// If it does, *errors will be incremented once per duplicate and the given message
+// will be logged.
+// NOTE(review): 'int i' vs the unsigned ints.size() triggers a
+// signed/unsigned comparison warning; size_t would be cleaner.
+static void VerifyNoDuplicateEntries(const std::vector& ints, int* errors,
+                                     const string& message) {
+  for (int i = 1; i < ints.size(); i++) {
+    if (ints[i] == ints[i - 1]) {
+      LOG(ERROR) << message << ": " << ints[i];
+      (*errors)++;
+    }
+  }
+}
+
+Status LinkedListTester::VerifyLinkedListRemote(
+    const uint64_t snapshot_timestamp, const int64_t expected, bool log_errors,
+    const boost::function& cb, int64_t* verified_count) {
+
+  client::sp::shared_ptr table;
+  RETURN_NOT_OK(client_->OpenTable(table_name_, &table));
+
+  string snapshot_str;
+  if (snapshot_timestamp == kNoSnapshot) {
+    snapshot_str = "LATEST";
+  } else {
+    snapshot_str = server::HybridClock::StringifyTimestamp(Timestamp(snapshot_timestamp));
+  }
+
+  client::KuduScanner scanner(table.get());
+  RETURN_NOT_OK_PREPEND(scanner.SetProjectedColumns(verify_projection_), "Bad projection");
+  RETURN_NOT_OK(scanner.SetBatchSizeBytes(0)); // Force at least one NextBatch RPC.
+
+  if (snapshot_timestamp != kNoSnapshot) {
+    RETURN_NOT_OK(scanner.SetReadMode(client::KuduScanner::READ_AT_SNAPSHOT));
+    RETURN_NOT_OK(scanner.SetFaultTolerant());
+    RETURN_NOT_OK(scanner.SetSnapshotRaw(snapshot_timestamp));
+  }
+
+  LOG(INFO) << "Verifying Snapshot: " << snapshot_str << " Expected Rows: " << expected;
+
+  RETURN_NOT_OK_PREPEND(scanner.Open(), "Couldn't open scanner");
+
+  RETURN_NOT_OK(scanner.SetBatchSizeBytes(1024)); // More normal batch size.
+
+  LinkedListVerifier verifier(num_chains_, enable_mutation_, expected,
+                              GenerateSplitInts());
+  verifier.StartScanTimer();
+
+  bool cb_called = false;
+  std::vector rows;
+  while (scanner.HasMoreRows()) {
+    // If we're doing a snapshot scan with a big enough cluster, call the callback on the scanner's
+    // tserver. Do this only once.
+    if (snapshot_timestamp != kNoSnapshot && !cb_called) {
+      client::KuduTabletServer* kts_ptr;
+      scanner.GetCurrentServer(&kts_ptr);
+      gscoped_ptr kts(kts_ptr);
+      const std::string down_ts = kts->uuid();
+      LOG(INFO) << "Calling callback on tserver " << down_ts;
+      RETURN_NOT_OK(cb(down_ts));
+      cb_called = true;
+    }
+    RETURN_NOT_OK_PREPEND(scanner.NextBatch(&rows), "Couldn't fetch next row batch");
+    for (const client::KuduRowResult& row : rows) {
+      int64_t key;
+      int64_t link;
+      bool updated;
+      RETURN_NOT_OK(row.GetInt64(0, &key));
+      RETURN_NOT_OK(row.GetInt64(1, &link));
+
+      // For non-snapshot reads we also verify that all rows were updated. We don't
+      // for snapshot reads as updates are performed by their own thread. This means
+      // that there is no guarantee that, for any snapshot timestamp that comes before
+      // all writes are completed, all rows will be updated.
+      if (snapshot_timestamp == kNoSnapshot) {
+        RETURN_NOT_OK(row.GetBool(2, &updated));
+      } else {
+        updated = enable_mutation_;
+      }
+
+      verifier.RegisterResult(key, link, updated);
+    }
+  }
+
+  Status s = verifier.VerifyData(verified_count, log_errors);
+  LOG(INFO) << "Snapshot: " << snapshot_str << " verified. Result: " << s.ToString();
+  return s;
+}
+
+Status LinkedListTester::VerifyLinkedListLocal(const tablet::Tablet* tablet,
+                                               int64_t expected,
+                                               int64_t* verified_count) {
+  DCHECK(tablet != NULL);
+  LinkedListVerifier verifier(num_chains_, enable_mutation_, expected,
+                              GenerateSplitInts());
+  verifier.StartScanTimer();
+
+  const Schema* tablet_schema = tablet->schema();
+  // Cannot use schemas with col indexes in a scan (assertions fire).
+  Schema projection(tablet_schema->columns(), tablet_schema->num_key_columns());
+  gscoped_ptr iter;
+  RETURN_NOT_OK_PREPEND(tablet->NewRowIterator(projection, &iter),
+                        "Cannot create new row iterator");
+  RETURN_NOT_OK_PREPEND(iter->Init(NULL), "Cannot initialize row iterator");
+
+  Arena arena(1024, 1024);
+  RowBlock block(projection, 100, &arena);
+  while (iter->HasNext()) {
+    RETURN_NOT_OK(iter->NextBlock(&block));
+    for (int i = 0; i < block.nrows(); i++) {
+      int64_t key;
+      int64_t link;
+      bool updated;
+
+      // NOTE(review): column 3 is the 'updated' flag; column 2 (insert_ts) is
+      // intentionally skipped here.
+      const RowBlockRow& row = block.row(i);
+      key = *tablet_schema->ExtractColumnFromRow(row, 0);
+      link = *tablet_schema->ExtractColumnFromRow(row, 1);
+      updated = *tablet_schema->ExtractColumnFromRow(row, 3);
+
+      verifier.RegisterResult(key, link, updated);
+    }
+  }
+
+  return verifier.VerifyData(verified_count, true);
+}
+
+Status LinkedListTester::WaitAndVerify(int seconds_to_run,
+                                       int64_t expected,
+                                       const boost::function& cb) {
+
+  std::list > samples_as_list(sampled_timestamps_and_counts_.begin(),
+                              sampled_timestamps_and_counts_.end());
+
+  int64_t seen = 0;
+  bool called = false;
+  Stopwatch sw;
+  sw.start();
+
+  Status s;
+  do {
+    // We'll give the tablets 5 seconds to start up regardless of how long we
+    // inserted for. There's some fixed cost startup time, especially when
+    // replication is enabled.
+    const int kBaseTimeToWaitSecs = 5;
+    bool last_attempt = sw.elapsed().wall_seconds() > kBaseTimeToWaitSecs + seconds_to_run;
+    s = Status::OK();
+    auto iter = samples_as_list.begin();
+
+    while (iter != samples_as_list.end()) {
+      // Only call the callback once, on the first verify pass, since it may be destructive.
+      if (iter == samples_as_list.begin() && !called) {
+        s = VerifyLinkedListAtSnapshotRemote((*iter).first, (*iter).second, last_attempt, cb,
+                                             &seen);
+        called = true;
+      } else {
+        s = VerifyLinkedListAtSnapshotRemote((*iter).first, (*iter).second, last_attempt,
+                                             boost::bind(&LinkedListTester::ReturnOk, this, _1),
+                                             &seen);
+      }
+
+      if (s.ok() && (*iter).second != seen) {
+        // If we've seen less rows than we were expecting we should fail and not retry.
+        //
+        // The reasoning is the following:
+        //
+        // - We know that when we read this snapshot's timestamp the writes had completed, thus
+        //   at timestamp '(*iter).first' any replica should have precisely '(*iter).second' rows.
+        // - We also chose to perform a snapshot scan, which, when passed a timestamp, waits for
+        //   that timestamp to become "clean", i.e. it makes sure that all transactions with lower
+        //   timestamps have completed before it actually performs the scan.
+        //
+        // Together these conditions mean that if we don't get the expected rows back something
+        // is wrong with the read path or with the write path and we should fail immediately.
+        return Status::Corruption(strings::Substitute("Got wrong row count on snapshot. "
+            "Expected: $0, Got:$1", (*iter).second, seen));
+      }
+
+      if (!s.ok()) break;
+      // If the snapshot verification returned OK erase it so that we don't recheck
+      // even if a later snapshot or the final verification failed.
+      iter = samples_as_list.erase(iter);
+    }
+    if (s.ok()) {
+      s = VerifyLinkedListNoSnapshotRemote(expected, last_attempt, &seen);
+    }
+
+    // TODO: when we enable hybridtime consistency for the scans,
+    // then we should not allow !s.ok() here. But, with READ_LATEST
+    // scans, we could have a lagging replica of one tablet, with an
+    // up-to-date replica of another tablet, and end up with broken links
+    // in the chain.
+
+    if (!s.ok()) {
+      LOG(INFO) << "Table not yet ready: " << seen << "/" << expected << " rows"
+                << " (status: " << s.ToString() << ")";
+      if (last_attempt) {
+        // We'll give it an equal amount of time to re-load the data as it took
+        // to write it in. Typically it completes much faster than that.
+        return Status::TimedOut("Timed out waiting for table to be accessible again",
+                                s.ToString());
+      }
+
+      // Sleep and retry until timeout.
+      SleepFor(MonoDelta::FromMilliseconds(20));
+    }
+  } while (!s.ok());
+
+  LOG(INFO) << "Successfully verified " << expected << " rows";
+
+  return Status::OK();
+}
+
+/////////////////////////////////////////////////////////////
+// LinkedListVerifier
+/////////////////////////////////////////////////////////////
+
+LinkedListVerifier::LinkedListVerifier(int num_chains, bool enable_mutation,
+                                       int64_t expected,
+                                       std::vector split_key_ints)
+    : num_chains_(num_chains),
+      expected_(expected),
+      enable_mutation_(enable_mutation),
+      split_key_ints_(std::move(split_key_ints)),
+      errors_(0) {
+  // Pre-size the result vectors when the caller knows the expected row count.
+  if (expected != kNoParticularCountExpected) {
+    DCHECK_GE(expected, 0);
+    seen_key_.reserve(expected);
+    seen_link_to_.reserve(expected);
+  }
+}
+
+void LinkedListVerifier::StartScanTimer() {
+  scan_timer_.start();
+}
+
+void LinkedListVerifier::RegisterResult(int64_t key, int64_t link, bool updated) {
+  seen_key_.push_back(key);
+  if (link != 0) {
+    // Links to entry 0 don't count - the first inserts use this link
+    seen_link_to_.push_back(link);
+  }
+
+  // Every row's 'updated' flag should match whether mutation was enabled.
+  if (updated != enable_mutation_) {
+    LOG(ERROR) << "Entry " << key << " was incorrectly "
+               << (enable_mutation_ ? "not " : "") << "updated";
+    errors_++;
+  }
+}
+
+void LinkedListVerifier::SummarizeBrokenLinks(const std::vector& broken_links) {
+  // One error bucket per tablet (split points partition the key space).
+  std::vector errors_by_tablet(split_key_ints_.size() + 1);
+
+  int n_logged = 0;
+  const int kMaxToLog = 100;
+
+  for (int64_t broken : broken_links) {
+    // Index of the tablet whose key range contains 'broken'.
+    int tablet = std::upper_bound(split_key_ints_.begin(),
+                                  split_key_ints_.end(),
+                                  broken) - split_key_ints_.begin();
+    DCHECK_GE(tablet, 0);
+    DCHECK_LT(tablet, errors_by_tablet.size());
+    errors_by_tablet[tablet]++;
+
+    if (n_logged < kMaxToLog) {
+      LOG(ERROR) << "Entry " << broken << " was linked to but not present";
+      n_logged++;
+      if (n_logged == kMaxToLog) {
+        LOG(ERROR) << "... no more broken links will be logged";
+      }
+    }
+  }
+
+  // Summarize the broken links by which tablet they fell into.
+  if (!broken_links.empty()) {
+    for (int tablet = 0; tablet < errors_by_tablet.size(); tablet++) {
+      LOG(ERROR) << "Error count for tablet #" << tablet << ": " << errors_by_tablet[tablet];
+    }
+  }
+}
+
+Status LinkedListVerifier::VerifyData(int64_t* verified_count, bool log_errors) {
+  *verified_count = seen_key_.size();
+  LOG(INFO) << "Done collecting results (" << (*verified_count) << " rows in "
+            << scan_timer_.elapsed().wall_millis() << "ms)";
+
+  VLOG(1) << "Sorting results before verification of linked list structure...";
+  std::sort(seen_key_.begin(), seen_key_.end());
+  std::sort(seen_link_to_.begin(), seen_link_to_.end());
+  VLOG(1) << "Done sorting";
+
+  // Verify that no key was seen multiple times or linked to multiple times
+  VerifyNoDuplicateEntries(seen_key_, &errors_, "Seen row key multiple times");
+  VerifyNoDuplicateEntries(seen_link_to_, &errors_, "Seen link to row multiple times");
+  // Verify that every key that was linked to was present
+  std::vector broken_links = STLSetDifference(seen_link_to_, seen_key_);
+  errors_ += broken_links.size();
+  if (log_errors) {
+    SummarizeBrokenLinks(broken_links);
+  }
+
+  // Verify that only the expected number of keys were seen but not linked to.
+  // Only the last "batch" should have this characteristic.
+  std::vector not_linked_to = STLSetDifference(seen_key_, seen_link_to_);
+  if (not_linked_to.size() != num_chains_) {
+    LOG_IF(ERROR, log_errors)
+        << "Had " << not_linked_to.size() << " entries which were seen but not"
+        << " linked to. Expected only " << num_chains_;
+    errors_++;
+  }
+
+  if (errors_ > 0) {
+    return Status::Corruption("Had one or more errors during verification (see log)");
+  }
+
+  if (expected_ != *verified_count) {
+    return Status::IllegalState(strings::Substitute(
+        "Missing rows, but with no broken link in the chain. This means that "
+        "a suffix of the inserted rows went missing. Expected=$0, seen=$1.",
+        expected_, *verified_count));
+  }
+
+  return Status::OK();
+}
+
+} // namespace kudu
diff --git a/src/kudu/integration-tests/linked_list-test.cc b/src/kudu/integration-tests/linked_list-test.cc
new file mode 100644
index 000000000000..ed5574a218e7
--- /dev/null
+++ b/src/kudu/integration-tests/linked_list-test.cc
@@ -0,0 +1,311 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+//
+// This is an integration test similar to TestLoadAndVerify in HBase.
+// It creates a table and writes linked lists into it, where each row +// points to the previously written row. For example, a sequence of inserts +// may be: +// +// rand_key | link_to | insert_ts +// 12345 0 1 +// 823 12345 2 +// 9999 823 3 +// (each insert links to the key of the previous insert) +// +// During insertion, a configurable number of parallel chains may be inserted. +// To verify, the table is scanned, and we ensure that every key is linked to +// either zero or one times, and no link_to refers to a missing key. + +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/row_result.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/walltime.h" +#include "kudu/integration-tests/linked_list-test-util.h" +#include "kudu/integration-tests/ts_itest-base.h" +#include "kudu/util/random.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/hdr_histogram.h" + +using kudu::client::KuduClient; +using kudu::client::KuduClientBuilder; +using kudu::client::KuduSchema; +using kudu::client::sp::shared_ptr; +using kudu::itest::TServerDetails; + +DEFINE_int32(seconds_to_run, 5, "Number of seconds for which to run the test"); + +DEFINE_int32(num_chains, 50, "Number of parallel chains to generate"); +DEFINE_int32(num_tablets, 3, "Number of tablets over which to split the data"); +DEFINE_bool(enable_mutation, true, "Enable periodic mutation of inserted rows"); +DEFINE_int32(num_snapshots, 3, "Number of snapshots to verify across replicas and reboots."); + +DEFINE_bool(stress_flush_compact, false, + "Flush and compact way more aggressively to try to find bugs"); +DEFINE_bool(stress_wal_gc, false, + "Set WAL segment size small so that logs will be GCed during the test"); + +namespace kudu { + +class LinkedListTest : public tserver::TabletServerIntegrationTestBase { + 
public: + LinkedListTest() {} + + void SetUp() OVERRIDE { + TabletServerIntegrationTestBase::SetUp(); + + LOG(INFO) << "Linked List Test Configuration:"; + LOG(INFO) << "--------------"; + LOG(INFO) << FLAGS_num_chains << " chains"; + LOG(INFO) << FLAGS_num_tablets << " tablets"; + LOG(INFO) << "Mutations " << (FLAGS_enable_mutation ? "on" : "off"); + LOG(INFO) << "--------------"; + } + + void BuildAndStart() { + vector common_flags; + + common_flags.push_back("--skip_remove_old_recovery_dir"); + + vector ts_flags(common_flags); + if (FLAGS_stress_flush_compact) { + // Set the flush threshold low so that we have a mix of flushed and unflushed + // operations in the WAL, when we bootstrap. + ts_flags.push_back("--flush_threshold_mb=1"); + // Set the compaction budget to be low so that we get multiple passes of compaction + // instead of selecting all of the rowsets in a single compaction of the whole + // tablet. + ts_flags.push_back("--tablet_compaction_budget_mb=4"); + // Set the major delta compaction ratio low enough that we trigger a lot of them. + ts_flags.push_back("--tablet_delta_store_major_compact_min_ratio=0.001"); + } + if (FLAGS_stress_wal_gc) { + // Set the size of the WAL segments low so that some can be GC'd. 
+ ts_flags.push_back("--log_segment_size_mb=1"); + } + + CreateCluster("linked-list-cluster", ts_flags, common_flags); + ResetClientAndTester(); + ASSERT_OK(tester_->CreateLinkedListTable()); + WaitForTSAndReplicas(); + } + + void ResetClientAndTester() { + KuduClientBuilder builder; + ASSERT_OK(cluster_->CreateClient(builder, &client_)); + tester_.reset(new LinkedListTester(client_, kTableId, + FLAGS_num_chains, + FLAGS_num_tablets, + FLAGS_num_replicas, + FLAGS_enable_mutation)); + } + + void RestartCluster() { + CHECK(cluster_); + cluster_->Shutdown(ExternalMiniCluster::TS_ONLY); + cluster_->Restart(); + ResetClientAndTester(); + } + + protected: + void AddExtraFlags(const string& flags_str, vector* flags) { + if (flags_str.empty()) { + return; + } + vector split_flags = strings::Split(flags_str, " "); + for (const string& flag : split_flags) { + flags->push_back(flag); + } + } + + shared_ptr client_; + gscoped_ptr tester_; +}; + +TEST_F(LinkedListTest, TestLoadAndVerify) { + OverrideFlagForSlowTests("seconds_to_run", "30"); + OverrideFlagForSlowTests("stress_flush_compact", "true"); + OverrideFlagForSlowTests("stress_wal_gc", "true"); + ASSERT_NO_FATAL_FAILURE(BuildAndStart()); + + string tablet_id = tablet_replicas_.begin()->first; + + // In TSAN builds, we hit the web UIs more often, so we have a better chance + // of seeing a thread error. We don't do this in normal builds since we + // also use this test as a benchmark and it soaks up a lot of CPU. 
+#ifdef THREAD_SANITIZER + MonoDelta check_freq = MonoDelta::FromMilliseconds(10); +#else + MonoDelta check_freq = MonoDelta::FromSeconds(1); +#endif + + PeriodicWebUIChecker checker(*cluster_.get(), tablet_id, + check_freq); + + bool can_kill_ts = FLAGS_num_tablet_servers > 1 && FLAGS_num_replicas > 2; + + int64_t written = 0; + ASSERT_OK(tester_->LoadLinkedList(MonoDelta::FromSeconds(FLAGS_seconds_to_run), + FLAGS_num_snapshots, + &written)); + + // TODO: currently we don't use hybridtime on the C++ client, so it's possible when we + // scan after writing we may not see all of our writes (we may scan a replica). So, + // we use WaitAndVerify here instead of a plain Verify. + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size())); + + LOG(INFO) << "Successfully verified " << written << " rows before killing any servers."; + + if (can_kill_ts) { + // Restart a tserver during a scan to test scanner fault tolerance. + WaitForTSAndReplicas(); + LOG(INFO) << "Will restart the tablet server during verification scan."; + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written, + boost::bind( + &TabletServerIntegrationTestBase::RestartServerWithUUID, + this, _1))); + LOG(INFO) << "Done with tserver restart test."; + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size())); + + // Kill a tserver during a scan to test scanner fault tolerance. + // Note that the previously restarted node is likely still be bootstrapping, which makes this + // even harder. + LOG(INFO) << "Will kill the tablet server during verification scan."; + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written, + boost::bind( + &TabletServerIntegrationTestBase::ShutdownServerWithUUID, + this, _1))); + LOG(INFO) << "Done with tserver kill test."; + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size()-1)); + ASSERT_NO_FATAL_FAILURE(RestartCluster()); + // Again wait for cluster to finish bootstrapping. 
+ WaitForTSAndReplicas(); + + // Check in-memory state with a downed TS. Scans may try other replicas. + string tablet = (*tablet_replicas_.begin()).first; + TServerDetails* leader; + EXPECT_OK(GetLeaderReplicaWithRetries(tablet, &leader)); + LOG(INFO) << "Killing TS: " << leader->instance_id.permanent_uuid() << ", leader of tablet: " + << tablet << " and verifying that we can still read all results"; + ASSERT_OK(ShutdownServerWithUUID(leader->uuid())); + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size() - 1)); + } + + // Kill and restart the cluster, verify data remains. + ASSERT_NO_FATAL_FAILURE(RestartCluster()); + + LOG(INFO) << "Verifying rows after restarting entire cluster."; + + // We need to loop here because the tablet may spend some time in BOOTSTRAPPING state + // initially after a restart. TODO: Scanner should support its own retries in this circumstance. + // Remove this loop once client is more fleshed out. + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); + + // In slow tests mode, we'll wait for a little bit to allow time for the tablet to + // compact. This is a regression test for bugs where compaction post-bootstrap + // could cause data loss. + if (AllowSlowTests()) { + SleepFor(MonoDelta::FromSeconds(10)); + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); + } + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size())); + + // Check post-replication state with a downed TS. 
+ if (can_kill_ts) { + string tablet = (*tablet_replicas_.begin()).first; + TServerDetails* leader; + EXPECT_OK(GetLeaderReplicaWithRetries(tablet, &leader)); + LOG(INFO) << "Killing TS: " << leader->instance_id.permanent_uuid() << ", leader of tablet: " + << tablet << " and verifying that we can still read all results"; + ASSERT_OK(ShutdownServerWithUUID(leader->uuid())); + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size() - 1)); + } + + ASSERT_NO_FATAL_FAILURE(RestartCluster()); + + // Sleep a little bit, so that the tablet is probably in bootstrapping state. + SleepFor(MonoDelta::FromMilliseconds(100)); + + // Restart while bootstrapping + ASSERT_NO_FATAL_FAILURE(RestartCluster()); + + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); + ASSERT_OK(CheckTabletServersAreAlive(tablet_servers_.size())); + + // Dump the performance info at the very end, so it's easy to read. On a failed + // test, we don't care about this stuff anyway. + tester_->DumpInsertHistogram(true); +} + +// This test loads the linked list while one of the servers is down. +// Once the loading is complete, the server is started back up and +// we wait for it to catch up. Then we shut down the other two servers +// and verify that the data is correct on the server which caught up. +TEST_F(LinkedListTest, TestLoadWhileOneServerDownAndVerify) { + OverrideFlagForSlowTests("seconds_to_run", "30"); + + if (!FLAGS_ts_flags.empty()) { + FLAGS_ts_flags += " "; + } + + FLAGS_ts_flags += "--log_cache_size_limit_mb=2"; + FLAGS_ts_flags += " --global_log_cache_size_limit_mb=4"; + + FLAGS_num_tablet_servers = 3; + FLAGS_num_tablets = 1; + ASSERT_NO_FATAL_FAILURE(BuildAndStart()); + + // Load the data with one of the three servers down. 
+ cluster_->tablet_server(0)->Shutdown(); + + int64_t written = 0; + ASSERT_OK(tester_->LoadLinkedList(MonoDelta::FromSeconds(FLAGS_seconds_to_run), + FLAGS_num_snapshots, + &written)); + + // Start back up the server that missed all of the data being loaded. It should be + // able to stream the data back from the other server which is still up. + ASSERT_OK(cluster_->tablet_server(0)->Restart()); + + // We'll give the tablets 5 seconds to start up regardless of how long we + // inserted for. This prevents flakiness in TSAN builds in particular. + const int kBaseTimeToWaitSecs = 5; + const int kWaitTime = FLAGS_seconds_to_run + kBaseTimeToWaitSecs; + string tablet_id = tablet_replicas_.begin()->first; + ASSERT_NO_FATAL_FAILURE(WaitForServersToAgree( + MonoDelta::FromSeconds(kWaitTime), + tablet_servers_, + tablet_id, + written / FLAGS_num_chains)); + + cluster_->tablet_server(1)->Shutdown(); + cluster_->tablet_server(2)->Shutdown(); + ASSERT_OK(tester_->WaitAndVerify(FLAGS_seconds_to_run, written)); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/master_failover-itest.cc b/src/kudu/integration-tests/master_failover-itest.cc new file mode 100644 index 000000000000..02d96555470d --- /dev/null +++ b/src/kudu/integration-tests/master_failover-itest.cc @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/client/client-internal.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +// Note: this test needs to be in the client namespace in order for +// KuduClient::Data class methods to be visible via FRIEND_TEST macro. +namespace client { + +const int kNumTabletServerReplicas = 3; + +using sp::shared_ptr; +using std::string; +using std::vector; + +class MasterFailoverTest : public KuduTest { + public: + enum CreateTableMode { + kWaitForCreate = 0, + kNoWaitForCreate = 1 + }; + + MasterFailoverTest() { + opts_.master_rpc_ports = { 11010, 11011, 11012 }; + opts_.num_masters = num_masters_ = opts_.master_rpc_ports.size(); + opts_.num_tablet_servers = kNumTabletServerReplicas; + + // Reduce various timeouts below as to make the detection of + // leader master failures (specifically, failures as result of + // long pauses) more rapid. + + // Set max missed heartbeats periods to 1.0 (down from 3.0). + opts_.extra_master_flags.push_back("--leader_failure_max_missed_heartbeat_periods=1.0"); + + // Set the TS->master heartbeat timeout to 1 second (down from 15 seconds). + opts_.extra_tserver_flags.push_back("--heartbeat_rpc_timeout_ms=1000"); + // Allow one TS heartbeat failure before retrying with back-off (down from 3). 
+ opts_.extra_tserver_flags.push_back("--heartbeat_max_failures_before_backoff=1"); + // Wait for 500 ms after 'max_consecutive_failed_heartbeats' + // before trying again (down from 1 second). + opts_.extra_tserver_flags.push_back("--heartbeat_interval_ms=500"); + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + ASSERT_NO_FATAL_FAILURE(RestartCluster()); + } + + virtual void TearDown() OVERRIDE { + if (cluster_) { + cluster_->Shutdown(); + } + KuduTest::TearDown(); + } + + void RestartCluster() { + if (cluster_) { + cluster_->Shutdown(); + cluster_.reset(); + } + cluster_.reset(new ExternalMiniCluster(opts_)); + ASSERT_OK(cluster_->Start()); + KuduClientBuilder builder; + ASSERT_OK(cluster_->CreateClient(builder, &client_)); + } + + Status CreateTable(const std::string& table_name, CreateTableMode mode) { + KuduSchema schema; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("int_val")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("string_val")->Type(KuduColumnSchema::STRING)->NotNull(); + CHECK_OK(b.Build(&schema)); + gscoped_ptr table_creator(client_->NewTableCreator()); + return table_creator->table_name(table_name) + .schema(&schema) + .timeout(MonoDelta::FromSeconds(90)) + .wait(mode == kWaitForCreate) + .Create(); + } + + Status RenameTable(const std::string& table_name_orig, const std::string& table_name_new) { + gscoped_ptr table_alterer(client_->NewTableAlterer(table_name_orig)); + return table_alterer + ->RenameTo(table_name_new) + ->timeout(MonoDelta::FromSeconds(90)) + ->wait(true) + ->Alter(); + } + + // Test that we can get the table location information from the + // master and then open scanners on the tablet server. This involves + // sending RPCs to both the master and the tablet servers and + // requires that the table and tablet exist both on the masters and + // the tablet servers. 
+ Status OpenTableAndScanner(const std::string& table_name) { + shared_ptr table; + RETURN_NOT_OK_PREPEND(client_->OpenTable(table_name, &table), + "Unable to open table " + table_name); + KuduScanner scanner(table.get()); + RETURN_NOT_OK_PREPEND(scanner.SetProjectedColumns(vector()), + "Unable to open an empty projection on " + table_name); + RETURN_NOT_OK_PREPEND(scanner.Open(), + "Unable to open scanner on " + table_name); + return Status::OK(); + } + + protected: + int num_masters_; + ExternalMiniClusterOptions opts_; + gscoped_ptr cluster_; + shared_ptr client_; +}; + +// Test that synchronous CreateTable (issue CreateTable call and then +// wait until the table has been created) works even when the original +// leader master has been paused. +// +// Temporarily disabled since multi-master isn't supported yet. +// This test fails as of KUDU-1138, since the tablet servers haven't +// registered with the follower master, and thus it's likely to deny +// the CreateTable request thinking there are no TS available. +TEST_F(MasterFailoverTest, DISABLED_TestCreateTableSync) { + if (!AllowSlowTests()) { + LOG(INFO) << "This test can only be run in slow mode."; + return; + } + + int leader_idx; + ASSERT_OK(cluster_->GetLeaderMasterIndex(&leader_idx)); + + LOG(INFO) << "Pausing leader master"; + cluster_->master(leader_idx)->Pause(); + ScopedResumeExternalDaemon resume_daemon(cluster_->master(leader_idx)); + + string table_name = "testCreateTableSync"; + ASSERT_OK(CreateTable(table_name, kWaitForCreate)); + ASSERT_OK(OpenTableAndScanner(table_name)); +} + +// Test that we can issue a CreateTable call, pause the leader master +// immediately after, then verify that the table has been created on +// the newly elected leader master. +// +// TODO enable this test once flakiness issues are worked out and +// eliminated on test machines. 
+TEST_F(MasterFailoverTest, DISABLED_TestPauseAfterCreateTableIssued) { + if (!AllowSlowTests()) { + LOG(INFO) << "This test can only be run in slow mode."; + return; + } + + int leader_idx; + ASSERT_OK(cluster_->GetLeaderMasterIndex(&leader_idx)); + + string table_id = "testPauseAfterCreateTableIssued"; + LOG(INFO) << "Issuing CreateTable for " << table_id; + ASSERT_OK(CreateTable(table_id, kNoWaitForCreate)); + + LOG(INFO) << "Pausing leader master"; + cluster_->master(leader_idx)->Pause(); + ScopedResumeExternalDaemon resume_daemon(cluster_->master(leader_idx)); + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(90)); + ASSERT_OK(client_->data_->WaitForCreateTableToFinish(client_.get(), + table_id, deadline)); + + ASSERT_OK(OpenTableAndScanner(table_id)); +} + +// Test the scenario where we create a table, pause the leader master, +// and then issue the DeleteTable call: DeleteTable should go to the newly +// elected leader master and succeed. +TEST_F(MasterFailoverTest, TestDeleteTableSync) { + if (!AllowSlowTests()) { + LOG(INFO) << "This test can only be run in slow mode."; + return; + } + + int leader_idx; + + ASSERT_OK(cluster_->GetLeaderMasterIndex(&leader_idx)); + + string table_name = "testDeleteTableSync"; + ASSERT_OK(CreateTable(table_name, kWaitForCreate)); + + LOG(INFO) << "Pausing leader master"; + cluster_->master(leader_idx)->Pause(); + ScopedResumeExternalDaemon resume_daemon(cluster_->master(leader_idx)); + + ASSERT_OK(client_->DeleteTable(table_name)); + shared_ptr table; + Status s = client_->OpenTable(table_name, &table); + ASSERT_TRUE(s.IsNotFound()); +} + +// Test the scenario where we create a table, pause the leader master, +// and then issue the AlterTable call renaming a table: AlterTable +// should go to the newly elected leader master and succeed, renaming +// the table. +// +// TODO: Add an equivalent async test. Add a test for adding and/or +// renaming a column in a table. 
+TEST_F(MasterFailoverTest, TestRenameTableSync) { + if (!AllowSlowTests()) { + LOG(INFO) << "This test can only be run in slow mode."; + return; + } + + int leader_idx; + + ASSERT_OK(cluster_->GetLeaderMasterIndex(&leader_idx)); + + string table_name_orig = "testAlterTableSync"; + ASSERT_OK(CreateTable(table_name_orig, kWaitForCreate)); + + LOG(INFO) << "Pausing leader master"; + cluster_->master(leader_idx)->Pause(); + ScopedResumeExternalDaemon resume_daemon(cluster_->master(leader_idx)); + + string table_name_new = "testAlterTableSyncRenamed"; + ASSERT_OK(RenameTable(table_name_orig, table_name_new)); + shared_ptr table; + ASSERT_OK(client_->OpenTable(table_name_new, &table)); + + Status s = client_->OpenTable(table_name_orig, &table); + ASSERT_TRUE(s.IsNotFound()); +} + +} // namespace client +} // namespace kudu diff --git a/src/kudu/integration-tests/master_replication-itest.cc b/src/kudu/integration-tests/master_replication-itest.cc new file mode 100644 index 000000000000..f3fb1bc8b863 --- /dev/null +++ b/src/kudu/integration-tests/master_replication-itest.cc @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include + +#include "kudu/client/client.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master.h" +#include "kudu/master/mini_master.h" +#include "kudu/util/test_util.h" + +using std::vector; + +namespace kudu { +namespace master { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduScanner; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduTable; +using client::KuduTableCreator; +using client::sp::shared_ptr; + +const char * const kTableId1 = "testMasterReplication-1"; +const char * const kTableId2 = "testMasterReplication-2"; + +const int kNumTabletServerReplicas = 3; + +class MasterReplicationTest : public KuduTest { + public: + MasterReplicationTest() { + // Hard-coded ports for the masters. This is safe, as this unit test + // runs under a resource lock (see CMakeLists.txt in this directory). + // TODO we should have a generic method to obtain n free ports. + opts_.master_rpc_ports = { 11010, 11011, 11012 }; + + opts_.num_masters = num_masters_ = opts_.master_rpc_ports.size(); + opts_.num_tablet_servers = kNumTabletServerReplicas; + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + cluster_.reset(new MiniCluster(env_.get(), opts_)); + ASSERT_OK(cluster_->Start()); + ASSERT_OK(cluster_->WaitForTabletServerCount(kNumTabletServerReplicas)); + } + + virtual void TearDown() OVERRIDE { + if (cluster_) { + cluster_->Shutdown(); + cluster_.reset(); + } + KuduTest::TearDown(); + } + + Status RestartCluster() { + cluster_->Shutdown(); + RETURN_NOT_OK(cluster_->Start()); + RETURN_NOT_OK(cluster_->WaitForTabletServerCount(kNumTabletServerReplicas)); + return Status::OK(); + } + + // This method is meant to be run in a separate thread. 
+ void StartClusterDelayed(int64_t micros) { + LOG(INFO) << "Sleeping for " << micros << " micro seconds..."; + SleepFor(MonoDelta::FromMicroseconds(micros)); + LOG(INFO) << "Attempting to start the cluster..."; + CHECK_OK(cluster_->Start()); + CHECK_OK(cluster_->WaitForTabletServerCount(kNumTabletServerReplicas)); + } + + void ListMasterServerAddrs(vector* out) { + for (int i = 0; i < num_masters_; i++) { + out->push_back(cluster_->mini_master(i)->bound_rpc_addr_str()); + } + } + + Status CreateClient(shared_ptr* out) { + KuduClientBuilder builder; + for (int i = 0; i < num_masters_; i++) { + if (!cluster_->mini_master(i)->master()->IsShutdown()) { + builder.add_master_server_addr(cluster_->mini_master(i)->bound_rpc_addr_str()); + } + } + return builder.Build(out); + } + + + Status CreateTable(const shared_ptr& client, + const std::string& table_name) { + KuduSchema schema; + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("int_val")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("string_val")->Type(KuduColumnSchema::STRING)->NotNull(); + CHECK_OK(b.Build(&schema)); + gscoped_ptr table_creator(client->NewTableCreator()); + return table_creator->table_name(table_name) + .schema(&schema) + .Create(); + } + + void VerifyTableExists(const std::string& table_id) { + LOG(INFO) << "Verifying that " << table_id << " exists on leader.."; + ASSERT_TRUE(cluster_->leader_mini_master()->master() + ->catalog_manager()->TableNameExists(table_id)); + } + + protected: + int num_masters_; + MiniClusterOptions opts_; + gscoped_ptr cluster_; +}; + +// Basic test. Verify that: +// +// 1) We can start multiple masters in a distributed configuration and +// that the clients and tablet servers can connect to the leader +// master. +// +// 2) We can create a table (using the standard client APIs) on the +// the leader and ensure that the appropriate table/tablet info is +// replicated to the newly elected leader. 
+TEST_F(MasterReplicationTest, TestSysTablesReplication) { + shared_ptr client; + + // Create the first table. + ASSERT_OK(CreateClient(&client)); + ASSERT_OK(CreateTable(client, kTableId1)); + + // TODO: once fault tolerant DDL is in, remove the line below. + ASSERT_OK(CreateClient(&client)); + + ASSERT_OK(cluster_->WaitForTabletServerCount(kNumTabletServerReplicas)); + + // Repeat the same for the second table. + ASSERT_OK(CreateTable(client, kTableId2)); + ASSERT_NO_FATAL_FAILURE(VerifyTableExists(kTableId2)); +} + +// When all masters are down, test that we can timeout the connection +// attempts after a specified deadline. +TEST_F(MasterReplicationTest, TestTimeoutWhenAllMastersAreDown) { + vector master_addrs; + ListMasterServerAddrs(&master_addrs); + + cluster_->Shutdown(); + + shared_ptr client; + KuduClientBuilder builder; + builder.master_server_addrs(master_addrs); + builder.default_rpc_timeout(MonoDelta::FromMilliseconds(100)); + Status s = builder.Build(&client); + EXPECT_TRUE(!s.ok()); + EXPECT_TRUE(s.IsTimedOut()); + + // We need to reset 'cluster_' so that TearDown() can run correctly. + cluster_.reset(); +} + +// Shut the cluster down, start initializing the client, and then +// bring the cluster back up during the initialization (but before the +// timeout can elapse). +TEST_F(MasterReplicationTest, TestCycleThroughAllMasters) { + vector master_addrs; + ListMasterServerAddrs(&master_addrs); + + // Shut the cluster down and ... + cluster_->Shutdown(); + // ... start the cluster after a delay. + scoped_refptr start_thread; + ASSERT_OK(Thread::Create("TestCycleThroughAllMasters", "start_thread", + &MasterReplicationTest::StartClusterDelayed, + this, + 100 * 1000, // start after 100 millis. + &start_thread)); + + // Verify that the client doesn't give up even though the entire + // cluster is down for 100 milliseconds. 
+ shared_ptr client; + KuduClientBuilder builder; + builder.master_server_addrs(master_addrs); + builder.default_admin_operation_timeout(MonoDelta::FromSeconds(15)); + EXPECT_OK(builder.Build(&client)); + + ASSERT_OK(ThreadJoiner(start_thread.get()).Join()); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/integration-tests/mini_cluster.cc b/src/kudu/integration-tests/mini_cluster.cc new file mode 100644 index 000000000000..584822fe809a --- /dev/null +++ b/src/kudu/integration-tests/mini_cluster.cc @@ -0,0 +1,318 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/integration-tests/mini_cluster.h" + + +#include "kudu/client/client.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/master/ts_manager.h" +#include "kudu/rpc/messenger.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/path_util.h" +#include "kudu/util/status.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +using strings::Substitute; + +namespace kudu { + +using client::KuduClient; +using client::KuduClientBuilder; +using master::MiniMaster; +using master::TabletLocationsPB; +using master::TSDescriptor; +using std::shared_ptr; +using tserver::MiniTabletServer; +using tserver::TabletServer; + +MiniClusterOptions::MiniClusterOptions() + : num_masters(1), + num_tablet_servers(1) { +} + +MiniCluster::MiniCluster(Env* env, const MiniClusterOptions& options) + : running_(false), + env_(env), + fs_root_(!options.data_root.empty() ? 
options.data_root : + JoinPathSegments(GetTestDataDirectory(), "minicluster-data")), + num_masters_initial_(options.num_masters), + num_ts_initial_(options.num_tablet_servers), + master_rpc_ports_(options.master_rpc_ports), + tserver_rpc_ports_(options.tserver_rpc_ports) { + mini_masters_.resize(num_masters_initial_); +} + +MiniCluster::~MiniCluster() { + CHECK(!running_); +} + +Status MiniCluster::Start() { + CHECK(!fs_root_.empty()) << "No Fs root was provided"; + CHECK(!running_); + + if (num_masters_initial_ > 1) { + CHECK_GE(master_rpc_ports_.size(), num_masters_initial_); + } + + if (!env_->FileExists(fs_root_)) { + RETURN_NOT_OK(env_->CreateDir(fs_root_)); + } + + // start the masters + if (num_masters_initial_ > 1) { + RETURN_NOT_OK_PREPEND(StartDistributedMasters(), + "Couldn't start distributed masters"); + } else { + RETURN_NOT_OK_PREPEND(StartSingleMaster(), "Couldn't start the single master"); + } + + for (int i = 0; i < num_ts_initial_; i++) { + RETURN_NOT_OK_PREPEND(AddTabletServer(), + Substitute("Error adding TS $0", i)); + } + + RETURN_NOT_OK_PREPEND(WaitForTabletServerCount(num_ts_initial_), + "Waiting for tablet servers to start"); + + running_ = true; + return Status::OK(); +} + +Status MiniCluster::StartDistributedMasters() { + CHECK_GE(master_rpc_ports_.size(), num_masters_initial_); + CHECK_GT(master_rpc_ports_.size(), 1); + + LOG(INFO) << "Creating distributed mini masters. 
Ports: " + << JoinInts(master_rpc_ports_, ", "); + + for (int i = 0; i < num_masters_initial_; i++) { + gscoped_ptr mini_master( + new MiniMaster(env_, GetMasterFsRoot(i), master_rpc_ports_[i])); + RETURN_NOT_OK_PREPEND(mini_master->StartDistributedMaster(master_rpc_ports_), + Substitute("Couldn't start follower $0", i)); + VLOG(1) << "Started MiniMaster with UUID " << mini_master->permanent_uuid() + << " at index " << i; + mini_masters_[i] = shared_ptr(mini_master.release()); + } + int i = 0; + for (const shared_ptr& master : mini_masters_) { + LOG(INFO) << "Waiting to initialize catalog manager on master " << i++; + RETURN_NOT_OK_PREPEND(master->WaitForCatalogManagerInit(), + Substitute("Could not initialize catalog manager on master $0", i)); + } + return Status::OK(); +} + +Status MiniCluster::StartSync() { + RETURN_NOT_OK(Start()); + int count = 0; + for (const shared_ptr& tablet_server : mini_tablet_servers_) { + RETURN_NOT_OK_PREPEND(tablet_server->WaitStarted(), + Substitute("TabletServer $0 failed to start.", count)); + count++; + } + return Status::OK(); +} + +Status MiniCluster::StartSingleMaster() { + // If there's a single master, 'mini_masters_' must be size 1. + CHECK_EQ(mini_masters_.size(), 1); + CHECK_LE(master_rpc_ports_.size(), 1); + uint16_t master_rpc_port = 0; + if (master_rpc_ports_.size() == 1) { + master_rpc_port = master_rpc_ports_[0]; + } + + // start the master (we need the port to set on the servers). 
+ gscoped_ptr mini_master( + new MiniMaster(env_, GetMasterFsRoot(0), master_rpc_port)); + RETURN_NOT_OK_PREPEND(mini_master->Start(), "Couldn't start master"); + RETURN_NOT_OK(mini_master->master()-> + WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5))); + mini_masters_[0] = shared_ptr(mini_master.release()); + return Status::OK(); +} + +Status MiniCluster::AddTabletServer() { + if (mini_masters_.empty()) { + return Status::IllegalState("Master not yet initialized"); + } + int new_idx = mini_tablet_servers_.size(); + + uint16_t ts_rpc_port = 0; + if (tserver_rpc_ports_.size() > new_idx) { + ts_rpc_port = tserver_rpc_ports_[new_idx]; + } + gscoped_ptr tablet_server( + new MiniTabletServer(GetTabletServerFsRoot(new_idx), ts_rpc_port)); + + // set the master addresses + tablet_server->options()->master_addresses.clear(); + for (const shared_ptr& master : mini_masters_) { + tablet_server->options()->master_addresses.push_back(HostPort(master->bound_rpc_addr())); + } + RETURN_NOT_OK(tablet_server->Start()) + mini_tablet_servers_.push_back(shared_ptr(tablet_server.release())); + return Status::OK(); +} + +MiniMaster* MiniCluster::leader_mini_master() { + Stopwatch sw; + sw.start(); + while (sw.elapsed().wall_seconds() < kMasterLeaderElectionWaitTimeSeconds) { + for (int i = 0; i < mini_masters_.size(); i++) { + MiniMaster* master = mini_master(i); + if (master->master()->IsShutdown()) { + continue; + } + if (master->master()->catalog_manager()->IsInitialized() && + master->master()->catalog_manager()->CheckIsLeaderAndReady().ok()) { + return master; + } + } + SleepFor(MonoDelta::FromMilliseconds(1)); + } + LOG(ERROR) << "No leader master elected after " << kMasterLeaderElectionWaitTimeSeconds + << " seconds."; + return nullptr; +} + +void MiniCluster::Shutdown() { + for (const shared_ptr& tablet_server : mini_tablet_servers_) { + tablet_server->Shutdown(); + } + mini_tablet_servers_.clear(); + for (shared_ptr& master_server : mini_masters_) { + 
master_server->Shutdown(); + master_server.reset(); + } + running_ = false; +} + +void MiniCluster::ShutdownMasters() { + for (shared_ptr& master_server : mini_masters_) { + master_server->Shutdown(); + master_server.reset(); + } +} + +MiniMaster* MiniCluster::mini_master(int idx) { + CHECK_GE(idx, 0) << "Master idx must be >= 0"; + CHECK_LT(idx, mini_masters_.size()) << "Master idx must be < num masters started"; + return mini_masters_[idx].get(); +} + +MiniTabletServer* MiniCluster::mini_tablet_server(int idx) { + CHECK_GE(idx, 0) << "TabletServer idx must be >= 0"; + CHECK_LT(idx, mini_tablet_servers_.size()) << "TabletServer idx must be < 'num_ts_started_'"; + return mini_tablet_servers_[idx].get(); +} + +string MiniCluster::GetMasterFsRoot(int idx) { + return JoinPathSegments(fs_root_, Substitute("master-$0-root", idx)); +} + +string MiniCluster::GetTabletServerFsRoot(int idx) { + return JoinPathSegments(fs_root_, Substitute("ts-$0-root", idx)); +} + +Status MiniCluster::WaitForReplicaCount(const string& tablet_id, + int expected_count) { + TabletLocationsPB locations; + return WaitForReplicaCount(tablet_id, expected_count, &locations); +} + +Status MiniCluster::WaitForReplicaCount(const string& tablet_id, + int expected_count, + TabletLocationsPB* locations) { + Stopwatch sw; + sw.start(); + while (sw.elapsed().wall_seconds() < kTabletReportWaitTimeSeconds) { + Status s = + leader_mini_master()->master()->catalog_manager()->GetTabletLocations(tablet_id, locations); + if (s.ok() && ((locations->stale() && expected_count == 0) || + (!locations->stale() && locations->replicas_size() == expected_count))) { + return Status::OK(); + } + + SleepFor(MonoDelta::FromMilliseconds(1)); + } + return Status::TimedOut(Substitute("Tablet $0 never reached expected replica count $1", + tablet_id, expected_count)); +} + +Status MiniCluster::WaitForTabletServerCount(int count) { + vector > descs; + return WaitForTabletServerCount(count, &descs); +} + +Status 
MiniCluster::WaitForTabletServerCount(int count, + vector >* descs) { + Stopwatch sw; + sw.start(); + while (sw.elapsed().wall_seconds() < kRegistrationWaitTimeSeconds) { + leader_mini_master()->master()->ts_manager()->GetAllDescriptors(descs); + if (descs->size() == count) { + // GetAllDescriptors() may return servers that are no longer online. + // Do a second step of verification to verify that the descs that we got + // are aligned (same uuid/seqno) with the TSs that we have in the cluster. + int match_count = 0; + for (const shared_ptr& desc : *descs) { + for (auto mini_tablet_server : mini_tablet_servers_) { + auto ts = mini_tablet_server->server(); + if (ts->instance_pb().permanent_uuid() == desc->permanent_uuid() && + ts->instance_pb().instance_seqno() == desc->latest_seqno()) { + match_count++; + break; + } + } + } + + if (match_count == count) { + LOG(INFO) << count << " TS(s) registered with Master after " + << sw.elapsed().wall_seconds() << "s"; + return Status::OK(); + } + } + SleepFor(MonoDelta::FromMilliseconds(1)); + } + return Status::TimedOut(Substitute("$0 TS(s) never registered with master", count)); +} + +Status MiniCluster::CreateClient(KuduClientBuilder* builder, + client::sp::shared_ptr* client) { + KuduClientBuilder default_builder; + if (builder == nullptr) { + builder = &default_builder; + } + builder->clear_master_server_addrs(); + for (const shared_ptr& master : mini_masters_) { + CHECK(master); + builder->add_master_server_addr(master->bound_rpc_addr_str()); + } + return builder->Build(client); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/mini_cluster.h b/src/kudu/integration-tests/mini_cluster.h new file mode 100644 index 000000000000..43ee3d8139cd --- /dev/null +++ b/src/kudu/integration-tests/mini_cluster.h @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_INTEGRATION_TESTS_MINI_CLUSTER_H +#define KUDU_INTEGRATION_TESTS_MINI_CLUSTER_H + +#include +#include +#include + +#include "kudu/client/shared_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/env.h" + +namespace kudu { + +namespace client { +class KuduClient; +class KuduClientBuilder; +} + +namespace master { +class MiniMaster; +class TSDescriptor; +class TabletLocationsPB; +} + +namespace tserver { +class MiniTabletServer; +} + +struct MiniClusterOptions { + MiniClusterOptions(); + + // Number of master servers. + // Default: 1 + int num_masters; + + // Number of TS to start. + // Default: 1 + int num_tablet_servers; + + // Directory in which to store data. + // Default: "", which auto-generates a unique path for this cluster. + // The default may only be used from a gtest unit test. + std::string data_root; + + // List of RPC ports for the master to run on. + // Defaults to a list 0 (ephemeral ports). + std::vector master_rpc_ports; + + // List of RPC ports for the tservers to run on. + // Defaults to a list of 0 (ephemeral ports). + std::vector tserver_rpc_ports; +}; + +// An in-process cluster with a MiniMaster and a configurable +// number of MiniTabletServers for use in tests. 
+class MiniCluster { + public: + MiniCluster(Env* env, const MiniClusterOptions& options); + ~MiniCluster(); + + // Start a cluster with a Master and 'num_tablet_servers' TabletServers. + // All servers run on the loopback interface with ephemeral ports. + Status Start(); + + // Like the previous method but performs initialization synchronously, i.e. + // this will wait for all TS's to be started and initialized. Tests should + // use this if they interact with tablets immediately after Start(); + Status StartSync(); + + void Shutdown(); + + // Shuts down masters only. + void ShutdownMasters(); + + // Setup a consensus configuration of distributed masters, with count specified in + // 'options'. Requires that a reserve RPC port is specified in + // 'options' for each master. + Status StartDistributedMasters(); + + // Add a new standalone master to the cluster. The new master is started. + Status StartSingleMaster(); + + // Add a new TS to the cluster. The new TS is started. + // Requires that the master is already running. + Status AddTabletServer(); + + // If this cluster is configured for a single non-distributed + // master, return the single master. Exits with a CHECK failure if + // there are multiple masters. + master::MiniMaster* mini_master() { + CHECK_EQ(mini_masters_.size(), 1); + return mini_master(0); + } + + // Returns the leader Master for this MiniCluster or NULL if none can be + // found. May block until a leader Master is ready. + master::MiniMaster* leader_mini_master(); + + // Returns the Master at index 'idx' for this MiniCluster. + master::MiniMaster* mini_master(int idx); + + // Return number of mini masters. + int num_masters() const { return mini_masters_.size(); } + + // Returns the TabletServer at index 'idx' of this MiniCluster. + // 'idx' must be between 0 and 'num_tablet_servers' -1. 
+ tserver::MiniTabletServer* mini_tablet_server(int idx); + + int num_tablet_servers() const { return mini_tablet_servers_.size(); } + + std::string GetMasterFsRoot(int indx); + + std::string GetTabletServerFsRoot(int idx); + + // Wait for the given tablet to have 'expected_count' replicas + // reported on the master. + // Requires that the master has started. + // Returns a bad Status if the tablet does not reach the required count + // within kTabletReportWaitTimeSeconds. + Status WaitForReplicaCount(const std::string& tablet_id, int expected_count); + + // Wait for the given tablet to have 'expected_count' replicas + // reported on the master. Returns the locations in '*locations'. + // Requires that the master has started; + // Returns a bad Status if the tablet does not reach the required count + // within kTabletReportWaitTimeSeconds. + Status WaitForReplicaCount(const std::string& tablet_id, + int expected_count, + master::TabletLocationsPB* locations); + + // Wait until the number of registered tablet servers reaches the given + // count. Returns Status::TimedOut if the desired count is not achieved + // within kRegistrationWaitTimeSeconds. + Status WaitForTabletServerCount(int count); + Status WaitForTabletServerCount(int count, + std::vector >* descs); + + // Create a client configured to talk to this cluster. Builder may contain + // override options for the client. The master address will be overridden to + // talk to the running master. If 'builder' is NULL, default options will be + // used. + // + // REQUIRES: the cluster must have already been Start()ed. 
+ Status CreateClient(client::KuduClientBuilder* builder, + client::sp::shared_ptr* client); + + private: + enum { + kTabletReportWaitTimeSeconds = 5, + kRegistrationWaitTimeSeconds = 5, + kMasterLeaderElectionWaitTimeSeconds = 10 + }; + + bool running_; + + Env* const env_; + const std::string fs_root_; + const int num_masters_initial_; + const int num_ts_initial_; + + const std::vector master_rpc_ports_; + const std::vector tserver_rpc_ports_; + + std::vector > mini_masters_; + std::vector > mini_tablet_servers_; +}; + +} // namespace kudu + +#endif /* KUDU_INTEGRATION_TESTS_MINI_CLUSTER_H */ diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc b/src/kudu/integration-tests/raft_consensus-itest.cc new file mode 100644 index 000000000000..76c1a56f79cb --- /dev/null +++ b/src/kudu/integration-tests/raft_consensus-itest.cc @@ -0,0 +1,2467 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/client/client.h" +#include "kudu/client/write_op.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/integration-tests/ts_itest-base.h" +#include "kudu/server/server_base.pb.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(num_client_threads, 8, + "Number of client threads to launch"); +DEFINE_int64(client_inserts_per_thread, 50, + "Number of rows inserted by each client thread"); +DEFINE_int64(client_num_batches_per_thread, 5, + "In how many batches to group the rows, for each client"); +DECLARE_int32(consensus_rpc_timeout_ms); + +METRIC_DECLARE_entity(tablet); +METRIC_DECLARE_counter(transaction_memory_pressure_rejections); +METRIC_DECLARE_gauge_int64(raft_term); + +namespace kudu { +namespace tserver { + +using client::KuduInsert; +using client::KuduSession; +using client::KuduTable; +using client::sp::shared_ptr; +using consensus::ConsensusRequestPB; +using consensus::ConsensusResponsePB; +using consensus::ConsensusServiceProxy; +using consensus::MajoritySize; +using consensus::MakeOpId; +using consensus::RaftPeerPB; +using consensus::ReplicateMsg; +using itest::AddServer; +using itest::GetReplicaStatusAndCheckIfLeader; +using itest::LeaderStepDown; +using itest::RemoveServer; +using itest::StartElection; +using itest::WaitUntilLeader; +using itest::WriteSimpleTestRow; +using master::GetTabletLocationsRequestPB; +using 
master::GetTabletLocationsResponsePB; +using master::TabletLocationsPB; +using rpc::RpcController; +using server::SetFlagRequestPB; +using server::SetFlagResponsePB; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using strings::Substitute; + +static const int kConsensusRpcTimeoutForTests = 50; + +static const int kTestRowKey = 1234; +static const int kTestRowIntVal = 5678; + +// Integration test for the raft consensus implementation. +// Uses the whole tablet server stack with ExternalMiniCluster. +class RaftConsensusITest : public TabletServerIntegrationTestBase { + public: + RaftConsensusITest() + : inserters_(FLAGS_num_client_threads) { + } + + virtual void SetUp() OVERRIDE { + TabletServerIntegrationTestBase::SetUp(); + FLAGS_consensus_rpc_timeout_ms = kConsensusRpcTimeoutForTests; + } + + void ScanReplica(TabletServerServiceProxy* replica_proxy, + vector* results) { + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + rpc.set_timeout(MonoDelta::FromSeconds(10)); // Squelch warnings. + + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(tablet_id_); + ASSERT_OK(SchemaToColumnPBs(schema_, scan->mutable_projected_columns())); + + // Send the call + { + req.set_batch_size_bytes(0); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(replica_proxy->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + if (resp.has_error()) { + ASSERT_OK(StatusFromPB(resp.error().status())); + } + } + + if (!resp.has_more_results()) + return; + + // Drain all the rows from the scanner. + NO_FATALS(DrainScannerToStrings(resp.scanner_id(), + schema_, + results, + replica_proxy)); + + std::sort(results->begin(), results->end()); + } + + // Scan the given replica in a loop until the number of rows + // is 'expected_count'. If it takes more than 10 seconds, then + // fails the test. 
+ void WaitForRowCount(TabletServerServiceProxy* replica_proxy, + int expected_count, + vector* results) { + LOG(INFO) << "Waiting for row count " << expected_count << "..."; + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(10)); + while (true) { + results->clear(); + NO_FATALS(ScanReplica(replica_proxy, results)); + if (results->size() == expected_count) { + return; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + if (!MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + break; + } + } + MonoTime end = MonoTime::Now(MonoTime::FINE); + LOG(WARNING) << "Didn't reach row count " << expected_count; + FAIL() << "Did not reach expected row count " << expected_count + << " after " << end.GetDeltaSince(start).ToString() + << ": rows: " << *results; + } + + + // Add an Insert operation to the given consensus request. + // The row to be inserted is generated based on the OpId. + void AddOp(const OpId& id, ConsensusRequestPB* req); + + string DumpToString(TServerDetails* leader, + const vector& leader_results, + TServerDetails* replica, + const vector& replica_results) { + string ret = strings::Substitute("Replica results did not match the leaders." + "\nLeader: $0\nReplica: $1. 
Results size " + "L: $2 R: $3", + leader->ToString(), + replica->ToString(), + leader_results.size(), + replica_results.size()); + + StrAppend(&ret, "Leader Results: \n"); + for (const string& result : leader_results) { + StrAppend(&ret, result, "\n"); + } + + StrAppend(&ret, "Replica Results: \n"); + for (const string& result : replica_results) { + StrAppend(&ret, result, "\n"); + } + + return ret; + } + + void InsertTestRowsRemoteThread(uint64_t first_row, + uint64_t count, + uint64_t num_batches, + const vector& latches) { + shared_ptr table; + CHECK_OK(client_->OpenTable(kTableId, &table)); + + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(60000); + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + for (int i = 0; i < num_batches; i++) { + uint64_t first_row_in_batch = first_row + (i * count / num_batches); + uint64_t last_row_in_batch = first_row_in_batch + count / num_batches; + + for (int j = first_row_in_batch; j < last_row_in_batch; j++) { + gscoped_ptr insert(table->NewInsert()); + KuduPartialRow* row = insert->mutable_row(); + CHECK_OK(row->SetInt32(0, j)); + CHECK_OK(row->SetInt32(1, j * 2)); + CHECK_OK(row->SetStringCopy(2, Slice(StringPrintf("hello %d", j)))); + CHECK_OK(session->Apply(insert.release())); + } + + // We don't handle write idempotency yet. (i.e making sure that when a leader fails + // writes to it that were eventually committed by the new leader but un-ackd to the + // client are not retried), so some errors are expected. 
+ // It's OK as long as the errors are Status::AlreadyPresent(); + + int inserted = last_row_in_batch - first_row_in_batch; + + Status s = session->Flush(); + if (PREDICT_FALSE(!s.ok())) { + std::vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + CHECK(!overflow); + for (const client::KuduError* e : errors) { + CHECK(e->status().IsAlreadyPresent()) << "Unexpected error: " << e->status().ToString(); + } + inserted -= errors.size(); + } + + for (CountDownLatch* latch : latches) { + latch->CountDown(inserted); + } + } + + inserters_.CountDown(); + } + + // Brings Chaos to a MiniTabletServer by introducing random delays. Does this by + // pausing the daemon a random amount of time. + void DelayInjectorThread(ExternalTabletServer* tablet_server, int timeout_msec) { + while (inserters_.count() > 0) { + + // Adjust the value obtained from the normalized gauss. dist. so that we steal the lock + // longer than the the timeout a small (~5%) percentage of the times. + // (95% corresponds to 1.64485, in a normalized (0,1) gaussian distribution). + double sleep_time_usec = 1000 * + ((random_.Normal(0, 1) * timeout_msec) / 1.64485); + + if (sleep_time_usec < 0) sleep_time_usec = 0; + + // Additionally only cause timeouts at all 50% of the time, otherwise sleep. + double val = (rand() * 1.0) / RAND_MAX; + if (val < 0.5) { + SleepFor(MonoDelta::FromMicroseconds(sleep_time_usec)); + continue; + } + + ASSERT_OK(tablet_server->Pause()); + LOG_IF(INFO, sleep_time_usec > 0.0) + << "Delay injector thread for TS " << tablet_server->instance_id().permanent_uuid() + << " SIGSTOPped the ts, sleeping for " << sleep_time_usec << " usec..."; + SleepFor(MonoDelta::FromMicroseconds(sleep_time_usec)); + ASSERT_OK(tablet_server->Resume()); + } + } + + // Thread which loops until '*finish' becomes true, trying to insert a row + // on the given tablet server identified by 'replica_idx'. 
+ void StubbornlyWriteSameRowThread(int replica_idx, const AtomicBool* finish); + + // Stops the current leader of the configuration, runs leader election and then brings it back. + // Before stopping the leader this pauses all follower nodes in regular intervals so that + // we get an increased chance of stuff being pending. + void StopOrKillLeaderAndElectNewOne() { + bool kill = rand() % 2 == 0; + + TServerDetails* old_leader; + CHECK_OK(GetLeaderReplicaWithRetries(tablet_id_, &old_leader)); + ExternalTabletServer* old_leader_ets = cluster_->tablet_server_by_uuid(old_leader->uuid()); + + vector followers; + GetOnlyLiveFollowerReplicas(tablet_id_, &followers); + + for (TServerDetails* ts : followers) { + ExternalTabletServer* ets = cluster_->tablet_server_by_uuid(ts->uuid()); + CHECK_OK(ets->Pause()); + SleepFor(MonoDelta::FromMilliseconds(100)); + } + + // When all are paused also pause or kill the current leader. Since we've waited a bit + // the old leader is likely to have operations that must be aborted. + if (kill) { + old_leader_ets->Shutdown(); + } else { + CHECK_OK(old_leader_ets->Pause()); + } + + // Resume the replicas. + for (TServerDetails* ts : followers) { + ExternalTabletServer* ets = cluster_->tablet_server_by_uuid(ts->uuid()); + CHECK_OK(ets->Resume()); + } + + // Get the new leader. + TServerDetails* new_leader; + CHECK_OK(GetLeaderReplicaWithRetries(tablet_id_, &new_leader)); + + // Bring the old leader back. + if (kill) { + CHECK_OK(old_leader_ets->Restart()); + // Wait until we have the same number of followers. + int initial_followers = followers.size(); + do { + GetOnlyLiveFollowerReplicas(tablet_id_, &followers); + } while (followers.size() < initial_followers); + } else { + CHECK_OK(old_leader_ets->Resume()); + } + } + + // Writes 'num_writes' operations to the current leader. Each of the operations + // has a payload of around 128KB. Causes a gtest failure on error. 
+ void Write128KOpsToLeader(int num_writes); + + // Check for and restart any TS that have crashed. + // Returns the number of servers restarted. + int RestartAnyCrashedTabletServers(); + + // Assert that no tablet servers have crashed. + // Tablet servers that have been manually Shutdown() are allowed. + void AssertNoTabletServersCrashed(); + + // Ensure that a majority of servers is required for elections and writes. + // This is done by pausing a majority and asserting that writes and elections fail, + // then unpausing the majority and asserting that elections and writes succeed. + // If fails, throws a gtest assertion. + // Note: This test assumes all tablet servers listed in tablet_servers are voters. + void AssertMajorityRequiredForElectionsAndWrites(const TabletServerMap& tablet_servers, + const string& leader_uuid); + + // Return the replicas of the specified 'tablet_id', as seen by the Master. + Status GetTabletLocations(const string& tablet_id, const MonoDelta& timeout, + master::TabletLocationsPB* tablet_locations); + + enum WaitForLeader { + NO_WAIT_FOR_LEADER = 0, + WAIT_FOR_LEADER = 1 + }; + + // Wait for the specified number of replicas to be reported by the master for + // the given tablet. Fails with an assertion if the timeout expires. + void WaitForReplicasReportedToMaster(int num_replicas, const string& tablet_id, + const MonoDelta& timeout, + WaitForLeader wait_for_leader, + bool* has_leader, + master::TabletLocationsPB* tablet_locations); + + static const bool WITH_NOTIFICATION_LATENCY = true; + static const bool WITHOUT_NOTIFICATION_LATENCY = false; + void DoTestChurnyElections(bool with_latency); + + protected: + // Flags needed for CauseFollowerToFallBehindLogGC() to work well. + void AddFlagsForLogRolls(vector* extra_tserver_flags); + + // Pause one of the followers and write enough data to the remaining replicas + // to cause log GC, then resume the paused follower. 
On success, + // 'leader_uuid' will be set to the UUID of the leader, 'orig_term' will be + // set to the term of the leader before un-pausing the follower, and + // 'fell_behind_uuid' will be set to the UUID of the follower that was paused + // and caused to fall behind. These can be used for verification purposes. + // + // Certain flags should be set. You can add the required flags with + // AddFlagsForLogRolls() before starting the cluster. + void CauseFollowerToFallBehindLogGC(string* leader_uuid, + int64_t* orig_term, + string* fell_behind_uuid); + + shared_ptr table_; + std::vector > threads_; + CountDownLatch inserters_; +}; + +void RaftConsensusITest::AddFlagsForLogRolls(vector* extra_tserver_flags) { + // We configure a small log segment size so that we roll frequently, + // configure a small cache size so that we evict data from the cache, and + // retain as few segments as possible. We also turn off async segment + // allocation -- this ensures that we roll many segments of logs (with async + // allocation, it's possible that the preallocation is slow and we wouldn't + // roll deterministically). + extra_tserver_flags->push_back("--log_cache_size_limit_mb=1"); + extra_tserver_flags->push_back("--log_segment_size_mb=1"); + extra_tserver_flags->push_back("--log_async_preallocate_segments=false"); + extra_tserver_flags->push_back("--log_min_segments_to_retain=1"); + extra_tserver_flags->push_back("--log_min_seconds_to_retain=0"); + extra_tserver_flags->push_back("--maintenance_manager_polling_interval_ms=100"); +} + +// Test that we can retrieve the permanent uuid of a server running +// consensus service via RPC. 
+TEST_F(RaftConsensusITest, TestGetPermanentUuid) { + BuildAndStart(vector()); + + RaftPeerPB peer; + TServerDetails* leader = nullptr; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + peer.mutable_last_known_addr()->CopyFrom(leader->registration.rpc_addresses(0)); + const string expected_uuid = leader->instance_id.permanent_uuid(); + + rpc::MessengerBuilder builder("test builder"); + builder.set_num_reactors(1); + std::shared_ptr messenger; + ASSERT_OK(builder.Build(&messenger)); + + ASSERT_OK(consensus::SetPermanentUuidForRemotePeer(messenger, &peer)); + ASSERT_EQ(expected_uuid, peer.permanent_uuid()); +} + +// TODO allow the scan to define an operation id, fetch the last id +// from the leader and then use that id to make the replica wait +// until it is done. This will avoid the sleeps below. +TEST_F(RaftConsensusITest, TestInsertAndMutateThroughConsensus) { + BuildAndStart(vector()); + + int num_iters = AllowSlowTests() ? 10 : 1; + + for (int i = 0; i < num_iters; i++) { + InsertTestRowsRemoteThread(i * FLAGS_client_inserts_per_thread, + FLAGS_client_inserts_per_thread, + FLAGS_client_num_batches_per_thread, + vector()); + } + ASSERT_ALL_REPLICAS_AGREE(FLAGS_client_inserts_per_thread * num_iters); +} + +TEST_F(RaftConsensusITest, TestFailedTransaction) { + BuildAndStart(vector()); + + // Wait until we have a stable leader. 
+ ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, + tablet_id_, 1)); + + WriteRequestPB req; + req.set_tablet_id(tablet_id_); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + RowOperationsPB* data = req.mutable_row_operations(); + data->set_rows("some gibberish!"); + + WriteResponsePB resp; + RpcController controller; + controller.set_timeout(MonoDelta::FromSeconds(FLAGS_rpc_timeout)); + + TServerDetails* leader = nullptr; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + + ASSERT_OK(DCHECK_NOTNULL(leader->tserver_proxy.get())->Write(req, &resp, &controller)); + ASSERT_TRUE(resp.has_error()); + + // Add a proper row so that we can verify that all of the replicas continue + // to process transactions after a failure. Additionally, this allows us to wait + // for all of the replicas to finish processing transactions before shutting down, + // avoiding a potential stall as we currently can't abort transactions (see KUDU-341). + data->Clear(); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 0, 0, "original0", data); + + controller.Reset(); + controller.set_timeout(MonoDelta::FromSeconds(FLAGS_rpc_timeout)); + + ASSERT_OK(DCHECK_NOTNULL(leader->tserver_proxy.get())->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.ShortDebugString()); + ASSERT_FALSE(resp.has_error()); + + ASSERT_ALL_REPLICAS_AGREE(1); +} + +// Inserts rows through consensus and also starts one delay injecting thread +// that steals consensus peer locks for a while. This is meant to test that +// even with timeouts and repeated requests consensus still works. 
+TEST_F(RaftConsensusITest, MultiThreadedMutateAndInsertThroughConsensus) { + BuildAndStart(vector()); + + if (500 == FLAGS_client_inserts_per_thread) { + if (AllowSlowTests()) { + FLAGS_client_inserts_per_thread = FLAGS_client_inserts_per_thread * 10; + FLAGS_client_num_batches_per_thread = FLAGS_client_num_batches_per_thread * 10; + } + } + + int num_threads = FLAGS_num_client_threads; + for (int i = 0; i < num_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("ts-test$0", i), + &RaftConsensusITest::InsertTestRowsRemoteThread, + this, i * FLAGS_client_inserts_per_thread, + FLAGS_client_inserts_per_thread, + FLAGS_client_num_batches_per_thread, + vector(), + &new_thread)); + threads_.push_back(new_thread); + } + for (int i = 0; i < FLAGS_num_replicas; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("chaos-test$0", i), + &RaftConsensusITest::DelayInjectorThread, + this, cluster_->tablet_server(i), + kConsensusRpcTimeoutForTests, + &new_thread)); + threads_.push_back(new_thread); + } + for (scoped_refptr thr : threads_) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } + + ASSERT_ALL_REPLICAS_AGREE(FLAGS_client_inserts_per_thread * FLAGS_num_client_threads); +} + +TEST_F(RaftConsensusITest, TestInsertOnNonLeader) { + BuildAndStart(vector()); + + // Wait for the initial leader election to complete. + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, + tablet_id_, 1)); + + // Manually construct a write RPC to a replica and make sure it responds + // with the correct error code. + WriteRequestPB req; + WriteResponsePB resp; + RpcController rpc; + req.set_tablet_id(tablet_id_); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, kTestRowKey, kTestRowIntVal, + "hello world via RPC", req.mutable_row_operations()); + + // Get the leader. 
+ vector followers; + GetOnlyLiveFollowerReplicas(tablet_id_, &followers); + + ASSERT_OK(followers[0]->tserver_proxy->Write(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + Status s = StatusFromPB(resp.error().status()); + EXPECT_TRUE(s.IsIllegalState()); + ASSERT_STR_CONTAINS(s.ToString(), "is not leader of this config. Role: FOLLOWER"); + // TODO: need to change the error code to be something like REPLICA_NOT_LEADER + // so that the client can properly handle this case! plumbing this is a little difficult + // so not addressing at the moment. + ASSERT_ALL_REPLICAS_AGREE(0); +} + +TEST_F(RaftConsensusITest, TestRunLeaderElection) { + // Reset consensus rpc timeout to the default value or the election might fail often. + FLAGS_consensus_rpc_timeout_ms = 1000; + + BuildAndStart(vector()); + + int num_iters = AllowSlowTests() ? 10 : 1; + + InsertTestRowsRemoteThread(0, + FLAGS_client_inserts_per_thread * num_iters, + FLAGS_client_num_batches_per_thread, + vector()); + + ASSERT_ALL_REPLICAS_AGREE(FLAGS_client_inserts_per_thread * num_iters); + + // Select the last follower to be new leader. + vector followers; + GetOnlyLiveFollowerReplicas(tablet_id_, &followers); + + // Now shutdown the current leader. + TServerDetails* leader = DCHECK_NOTNULL(GetLeaderReplicaOrNull(tablet_id_)); + ExternalTabletServer* leader_ets = cluster_->tablet_server_by_uuid(leader->uuid()); + leader_ets->Shutdown(); + + TServerDetails* replica = followers.back(); + CHECK_NE(leader->instance_id.permanent_uuid(), replica->instance_id.permanent_uuid()); + + // Make the new replica leader. + ASSERT_OK(StartElection(replica, tablet_id_, MonoDelta::FromSeconds(10))); + + // Insert a bunch more rows. + InsertTestRowsRemoteThread(FLAGS_client_inserts_per_thread * num_iters, + FLAGS_client_inserts_per_thread * num_iters, + FLAGS_client_num_batches_per_thread, + vector()); + + // Restart the original replica and make sure they all agree. 
+ ASSERT_OK(leader_ets->Restart()); + + ASSERT_ALL_REPLICAS_AGREE(FLAGS_client_inserts_per_thread * num_iters * 2); +} + +void RaftConsensusITest::Write128KOpsToLeader(int num_writes) { + TServerDetails* leader = nullptr; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + + WriteRequestPB req; + req.set_tablet_id(tablet_id_); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + RowOperationsPB* data = req.mutable_row_operations(); + WriteResponsePB resp; + RpcController rpc; + rpc.set_timeout(MonoDelta::FromMilliseconds(10000)); + int key = 0; + + // generate a 128Kb dummy payload + string test_payload(128 * 1024, '0'); + for (int i = 0; i < num_writes; i++) { + rpc.Reset(); + data->Clear(); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, key, key, + test_payload, data); + key++; + ASSERT_OK(leader->tserver_proxy->Write(req, &resp, &rpc)); + + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + } +} + +// Test that when a follower is stopped for a long time, the log cache +// properly evicts operations, but still allows the follower to catch +// up when it comes back. +TEST_F(RaftConsensusITest, TestCatchupAfterOpsEvicted) { + vector extra_flags; + extra_flags.push_back("--log_cache_size_limit_mb=1"); + extra_flags.push_back("--consensus_max_batch_size_bytes=500000"); + BuildAndStart(extra_flags); + TServerDetails* replica = (*tablet_replicas_.begin()).second; + ASSERT_TRUE(replica != nullptr); + ExternalTabletServer* replica_ets = cluster_->tablet_server_by_uuid(replica->uuid()); + + // Pause a replica + ASSERT_OK(replica_ets->Pause()); + LOG(INFO)<< "Paused one of the replicas, starting to write."; + + // Insert 3MB worth of data. + const int kNumWrites = 25; + NO_FATALS(Write128KOpsToLeader(kNumWrites)); + + // Now unpause the replica, the lagging replica should eventually catch back up. 
+ ASSERT_OK(replica_ets->Resume()); + + ASSERT_ALL_REPLICAS_AGREE(kNumWrites); +} + +void RaftConsensusITest::CauseFollowerToFallBehindLogGC(string* leader_uuid, + int64_t* orig_term, + string* fell_behind_uuid) { + MonoDelta kTimeout = MonoDelta::FromSeconds(10); + // Wait for all of the replicas to have acknowledged the elected + // leader and logged the first NO_OP. + ASSERT_OK(WaitForServersToAgree(kTimeout, tablet_servers_, tablet_id_, 1)); + + // Pause one server. This might be the leader, but pausing it will cause + // a leader election to happen. + TServerDetails* replica = (*tablet_replicas_.begin()).second; + ExternalTabletServer* replica_ets = cluster_->tablet_server_by_uuid(replica->uuid()); + ASSERT_OK(replica_ets->Pause()); + + // Find a leader. In case we paused the leader above, this will wait until + // we have elected a new one. + TServerDetails* leader = nullptr; + while (true) { + Status s = GetLeaderReplicaWithRetries(tablet_id_, &leader); + if (s.ok() && leader != nullptr && leader != replica) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + *leader_uuid = leader->uuid(); + int leader_index = cluster_->tablet_server_index_by_uuid(*leader_uuid); + + TestWorkload workload(cluster_.get()); + workload.set_table_name(kTableId); + workload.set_timeout_allowed(true); + workload.set_payload_bytes(128 * 1024); // Write ops of size 128KB. + workload.set_write_batch_size(1); + workload.set_num_write_threads(4); + workload.Setup(); + workload.Start(); + + LOG(INFO) << "Waiting until we've written at least 4MB..."; + while (workload.rows_inserted() < 8 * 4) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + + LOG(INFO) << "Waiting for log GC on " << leader->uuid(); + // Some WAL segments must exist, but wal segment 1 must not exist. 
+ ASSERT_OK(inspect_->WaitForFilePatternInTabletWalDirOnTs( + leader_index, tablet_id_, { "wal-" }, { "wal-000000001" })); + + LOG(INFO) << "Log GC complete on " << leader->uuid(); + + // Then wait another couple of seconds to be sure that it has bothered to try + // to write to the paused peer. + // TODO: would be nice to be able to poll the leader with an RPC like + // GetLeaderStatus() which could tell us whether it has made any requests + // since the log GC. + SleepFor(MonoDelta::FromSeconds(2)); + + // Make a note of whatever the current term of the cluster is, + // before we resume the follower. + { + OpId op_id; + ASSERT_OK(GetLastOpIdForReplica(tablet_id_, leader, consensus::RECEIVED_OPID, kTimeout, + &op_id)); + *orig_term = op_id.term(); + LOG(INFO) << "Servers converged with original term " << *orig_term; + } + + // Resume the follower. + LOG(INFO) << "Resuming " << replica->uuid(); + ASSERT_OK(replica_ets->Resume()); + + // Ensure that none of the tablet servers crashed. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + // Make sure it didn't crash. + ASSERT_TRUE(cluster_->tablet_server(i)->IsProcessAlive()) + << "Tablet server " << i << " crashed"; + } + *fell_behind_uuid = replica->uuid(); +} + +// Test that the leader doesn't crash if one of its followers has +// fallen behind so far that the logs necessary to catch it up +// have been GCed. +// +// In a real cluster, this will eventually cause the follower to be +// evicted/replaced. In any case, the leader should not crash. +// +// We also ensure that, when the leader stops writing to the follower, +// the follower won't disturb the other nodes when it attempts to elect +// itself. +// +// This is a regression test for KUDU-775 and KUDU-562. +TEST_F(RaftConsensusITest, TestFollowerFallsBehindLeaderGC) { + // Disable follower eviction to maintain the original intent of this test. 
+ vector extra_flags = { "--evict_failed_followers=false" }; + AddFlagsForLogRolls(&extra_flags); // For CauseFollowerToFallBehindLogGC(). + BuildAndStart(extra_flags); + + string leader_uuid; + int64_t orig_term; + string follower_uuid; + NO_FATALS(CauseFollowerToFallBehindLogGC(&leader_uuid, &orig_term, &follower_uuid)); + + // Wait for remaining majority to agree. + TabletServerMap active_tablet_servers = tablet_servers_; + ASSERT_EQ(3, active_tablet_servers.size()); + ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid)); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(30), active_tablet_servers, tablet_id_, + 1)); + + if (AllowSlowTests()) { + // Sleep long enough that the "abandoned" server's leader election interval + // will trigger several times. Then, verify that the term has not increased. + // This ensures that the other servers properly ignore the election requests + // from the abandoned node. + // TODO: would be nicer to use an RPC to check the current term of the + // abandoned replica, and wait until it has incremented a couple of times. + SleepFor(MonoDelta::FromSeconds(5)); + OpId op_id; + TServerDetails* leader = tablet_servers_[leader_uuid]; + ASSERT_OK(GetLastOpIdForReplica(tablet_id_, leader, consensus::RECEIVED_OPID, + MonoDelta::FromSeconds(10), &op_id)); + ASSERT_EQ(orig_term, op_id.term()) + << "expected the leader to have not advanced terms but has op " << op_id; + } +} + +int RaftConsensusITest::RestartAnyCrashedTabletServers() { + int restarted = 0; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + if (!cluster_->tablet_server(i)->IsProcessAlive()) { + LOG(INFO) << "TS " << i << " appears to have crashed. 
Restarting."; + cluster_->tablet_server(i)->Shutdown(); + CHECK_OK(cluster_->tablet_server(i)->Restart()); + restarted++; + } + } + return restarted; +} + +void RaftConsensusITest::AssertNoTabletServersCrashed() { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + if (cluster_->tablet_server(i)->IsShutdown()) continue; + + ASSERT_TRUE(cluster_->tablet_server(i)->IsProcessAlive()) + << "Tablet server " << i << " crashed"; + } +} + +// This test starts several tablet servers, and configures them with +// fault injection so that the leaders frequently crash just before +// sending RPCs to followers. +// +// This can result in various scenarios where leaders crash right after +// being elected and never succeed in replicating their first operation. +// For example, KUDU-783 reproduces from this test approximately 5% of the +// time on a slow-test debug build. +TEST_F(RaftConsensusITest, InsertWithCrashyNodes) { + int kCrashesToCause = 3; + if (AllowSlowTests()) { + FLAGS_num_tablet_servers = 7; + FLAGS_num_replicas = 7; + kCrashesToCause = 15; + } + + vector ts_flags, master_flags; + + // Crash 5% of the time just before sending an RPC. With 7 servers, + // this means we crash about 30% of the time before we've fully + // replicated the NO_OP at the start of the term. + ts_flags.push_back("--fault_crash_on_leader_request_fraction=0.05"); + + // Inject latency to encourage the replicas to fall out of sync + // with each other. + ts_flags.push_back("--log_inject_latency"); + ts_flags.push_back("--log_inject_latency_ms_mean=30"); + ts_flags.push_back("--log_inject_latency_ms_stddev=60"); + + // Make leader elections faster so we get through more cycles of + // leaders. 
+ ts_flags.push_back("--raft_heartbeat_interval_ms=100"); + ts_flags.push_back("--leader_failure_monitor_check_mean_ms=50"); + ts_flags.push_back("--leader_failure_monitor_check_stddev_ms=25"); + + // Avoid preallocating segments since bootstrap is a little bit + // faster if it doesn't have to scan forward through the preallocated + // log area. + ts_flags.push_back("--log_preallocate_segments=false"); + + CreateCluster("raft_consensus-itest-cluster", ts_flags, master_flags); + + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(FLAGS_num_replicas); + workload.set_timeout_allowed(true); + workload.set_write_timeout_millis(1000); + workload.set_num_write_threads(10); + workload.set_write_batch_size(1); + workload.Setup(); + workload.Start(); + + int num_crashes = 0; + while (num_crashes < kCrashesToCause && + workload.rows_inserted() < 100) { + num_crashes += RestartAnyCrashedTabletServers(); + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + workload.StopAndJoin(); + + // After we stop the writes, we can still get crashes because heartbeats could + // trigger the fault path. So, disable the faults and restart one more time. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ExternalTabletServer* ts = cluster_->tablet_server(i); + vector* flags = ts->mutable_flags(); + bool removed_flag = false; + for (auto it = flags->begin(); it != flags->end(); ++it) { + if (HasPrefixString(*it, "--fault_crash")) { + flags->erase(it); + removed_flag = true; + break; + } + } + ASSERT_TRUE(removed_flag) << "could not remove flag from TS " << i + << "\nFlags:\n" << *flags; + ts->Shutdown(); + CHECK_OK(ts->Restart()); + } + + // Ensure that the replicas converge. + // We don't know exactly how many rows got inserted, since the writer + // probably saw many errors which left inserts in indeterminate state. + // But, we should have at least as many as we got confirmation for. 
+ ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST, + workload.rows_inserted())); +} + +// This test sets all of the election timers to be very short, resulting +// in a lot of churn. We expect to make some progress and not diverge or +// crash, despite the frequent re-elections and races. +TEST_F(RaftConsensusITest, TestChurnyElections) { + DoTestChurnyElections(WITHOUT_NOTIFICATION_LATENCY); +} + +// The same test, except inject artificial latency when propagating notifications +// from the queue back to consensus. This can reproduce bugs like KUDU-1078 which +// normally only appear under high load. TODO: Re-enable once we get to the +// bottom of KUDU-1078. +TEST_F(RaftConsensusITest, DISABLED_TestChurnyElections_WithNotificationLatency) { + DoTestChurnyElections(WITH_NOTIFICATION_LATENCY); +} + +void RaftConsensusITest::DoTestChurnyElections(bool with_latency) { + vector ts_flags, master_flags; + +#ifdef THREAD_SANITIZER + // On TSAN builds, we need to be a little bit less churny in order to make + // any progress at all. + ts_flags.push_back("--raft_heartbeat_interval_ms=5"); +#else + ts_flags.push_back("--raft_heartbeat_interval_ms=1"); +#endif + ts_flags.push_back("--leader_failure_monitor_check_mean_ms=1"); + ts_flags.push_back("--leader_failure_monitor_check_stddev_ms=1"); + ts_flags.push_back("--never_fsync"); + if (with_latency) { + ts_flags.push_back("--consensus_inject_latency_ms_in_notifications=50"); + } + + CreateCluster("raft_consensus-itest-cluster", ts_flags, master_flags); + + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(FLAGS_num_replicas); + workload.set_timeout_allowed(true); + workload.set_write_timeout_millis(100); + workload.set_num_write_threads(2); + workload.set_write_batch_size(1); + workload.Setup(); + workload.Start(); + + // Run for either a prescribed number of writes, or 30 seconds, + // whichever comes first. 
This prevents test timeouts on slower + // build machines, TSAN builds, etc. + Stopwatch sw; + sw.start(); + const int kNumWrites = AllowSlowTests() ? 10000 : 1000; + while (workload.rows_inserted() < kNumWrites && + sw.elapsed().wall_seconds() < 30) { + SleepFor(MonoDelta::FromMilliseconds(10)); + NO_FATALS(AssertNoTabletServersCrashed()); + } + workload.StopAndJoin(); + ASSERT_GT(workload.rows_inserted(), 0) << "No rows inserted"; + + // Ensure that the replicas converge. + // We don't know exactly how many rows got inserted, since the writer + // probably saw many errors which left inserts in indeterminate state. + // But, we should have at least as many as we got confirmation for. + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST, + workload.rows_inserted())); + NO_FATALS(AssertNoTabletServersCrashed()); +} + +TEST_F(RaftConsensusITest, MultiThreadedInsertWithFailovers) { + int kNumElections = FLAGS_num_replicas; + + if (AllowSlowTests()) { + FLAGS_num_tablet_servers = 7; + FLAGS_num_replicas = 7; + kNumElections = 3 * FLAGS_num_replicas; + } + + // Reset consensus rpc timeout to the default value or the election might fail often. + FLAGS_consensus_rpc_timeout_ms = 1000; + + // Start a 7 node configuration cluster (since we can't bring leaders back we start with a + // higher replica count so that we kill more leaders). + + vector flags; + BuildAndStart(flags); + + OverrideFlagForSlowTests( + "client_inserts_per_thread", + strings::Substitute("$0", (FLAGS_client_inserts_per_thread * 100))); + OverrideFlagForSlowTests( + "client_num_batches_per_thread", + strings::Substitute("$0", (FLAGS_client_num_batches_per_thread * 100))); + + int num_threads = FLAGS_num_client_threads; + int64_t total_num_rows = num_threads * FLAGS_client_inserts_per_thread; + + // We create 2 * (kNumReplicas - 1) latches so that we kill the same node at least + // twice. 
+ vector latches; + for (int i = 1; i < kNumElections; i++) { + latches.push_back(new CountDownLatch((i * total_num_rows) / kNumElections)); + } + + for (int i = 0; i < num_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("ts-test$0", i), + &RaftConsensusITest::InsertTestRowsRemoteThread, + this, i * FLAGS_client_inserts_per_thread, + FLAGS_client_inserts_per_thread, + FLAGS_client_num_batches_per_thread, + latches, + &new_thread)); + threads_.push_back(new_thread); + } + + for (CountDownLatch* latch : latches) { + latch->Wait(); + StopOrKillLeaderAndElectNewOne(); + } + + for (scoped_refptr thr : threads_) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } + + ASSERT_ALL_REPLICAS_AGREE(FLAGS_client_inserts_per_thread * FLAGS_num_client_threads); + STLDeleteElements(&latches); +} + +// Test automatic leader election by killing leaders. +TEST_F(RaftConsensusITest, TestAutomaticLeaderElection) { + if (AllowSlowTests()) { + FLAGS_num_tablet_servers = 5; + FLAGS_num_replicas = 5; + } + BuildAndStart(vector()); + + TServerDetails* leader; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + + unordered_set killed_leaders; + + const int kNumLeadersToKill = FLAGS_num_replicas / 2; + const int kFinalNumReplicas = FLAGS_num_replicas / 2 + 1; + + for (int leaders_killed = 0; leaders_killed < kFinalNumReplicas; leaders_killed++) { + LOG(INFO) << Substitute("Writing data to leader of $0-node config ($1 alive)...", + FLAGS_num_replicas, FLAGS_num_replicas - leaders_killed); + + InsertTestRowsRemoteThread(leaders_killed * FLAGS_client_inserts_per_thread, + FLAGS_client_inserts_per_thread, + FLAGS_client_num_batches_per_thread, + vector()); + + // At this point, the writes are flushed but the commit index may not be + // propagated to all replicas. We kill the leader anyway. 
+ if (leaders_killed < kNumLeadersToKill) { + LOG(INFO) << "Killing current leader " << leader->instance_id.permanent_uuid() << "..."; + cluster_->tablet_server_by_uuid(leader->uuid())->Shutdown(); + InsertOrDie(&killed_leaders, leader); + + LOG(INFO) << "Waiting for new guy to be elected leader."; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + } + } + + // Restart every node that was killed, and wait for the nodes to converge + for (TServerDetails* killed_node : killed_leaders) { + CHECK_OK(cluster_->tablet_server_by_uuid(killed_node->uuid())->Restart()); + } + // Verify the data on the remaining replicas. + ASSERT_ALL_REPLICAS_AGREE(FLAGS_client_inserts_per_thread * kFinalNumReplicas); +} + +// Single-replica leader election test. +TEST_F(RaftConsensusITest, TestAutomaticLeaderElectionOneReplica) { + FLAGS_num_tablet_servers = 1; + FLAGS_num_replicas = 1; + vector ts_flags; + vector master_flags = { "--catalog_manager_allow_local_consensus=false" }; + BuildAndStart(ts_flags, master_flags); + + TServerDetails* leader; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); +} + +void RaftConsensusITest::StubbornlyWriteSameRowThread(int replica_idx, const AtomicBool* finish) { + vector servers; + AppendValuesFromMap(tablet_servers_, &servers); + CHECK_LT(replica_idx, servers.size()); + TServerDetails* ts = servers[replica_idx]; + + // Manually construct an RPC to our target replica. We expect most of the calls + // to fail either with an "already present" or an error because we are writing + // to a follower. That's OK, though - what we care about for this test is + // just that the operations Apply() in the same order everywhere (even though + // in this case the result will just be an error). 
+ WriteRequestPB req; + WriteResponsePB resp; + RpcController rpc; + req.set_tablet_id(tablet_id_); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, kTestRowKey, kTestRowIntVal, + "hello world", req.mutable_row_operations()); + + while (!finish->Load()) { + resp.Clear(); + rpc.Reset(); + rpc.set_timeout(MonoDelta::FromSeconds(10)); + ignore_result(ts->tserver_proxy->Write(req, &resp, &rpc)); + VLOG(1) << "Response from server " << replica_idx << ": " + << resp.ShortDebugString(); + } +} + +// Regression test for KUDU-597, an issue where we could mis-order operations on +// a machine if the following sequence occurred: +// 1) Replica is a FOLLOWER +// 2) A client request hits the machine +// 3) It receives some operations from the current leader +// 4) It gets elected LEADER +// In this scenario, it would incorrectly sequence the client request's PREPARE phase +// before the operations received in step (3), even though the correct behavior would be +// to either reject them or sequence them after those operations, because the operation +// index is higher. +// +// The test works by setting up three replicas and manually hammering them with write +// requests targeting a single row. If the bug exists, then TransactionOrderVerifier +// will trigger an assertion because the prepare order and the op indexes will become +// misaligned. +TEST_F(RaftConsensusITest, TestKUDU_597) { + FLAGS_num_replicas = 3; + FLAGS_num_tablet_servers = 3; + BuildAndStart(vector()); + + AtomicBool finish(false); + for (int i = 0; i < FLAGS_num_tablet_servers; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("ts-test$0", i), + &RaftConsensusITest::StubbornlyWriteSameRowThread, + this, i, &finish, &new_thread)); + threads_.push_back(new_thread); + } + + const int num_loops = AllowSlowTests() ? 
10 : 1; + for (int i = 0; i < num_loops; i++) { + StopOrKillLeaderAndElectNewOne(); + SleepFor(MonoDelta::FromSeconds(1)); + ASSERT_OK(CheckTabletServersAreAlive(FLAGS_num_tablet_servers)); + } + + finish.Store(true); + for (scoped_refptr thr : threads_) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } +} + +void RaftConsensusITest::AddOp(const OpId& id, ConsensusRequestPB* req) { + ReplicateMsg* msg = req->add_ops(); + msg->mutable_id()->CopyFrom(id); + msg->set_timestamp(id.index()); + msg->set_op_type(consensus::WRITE_OP); + WriteRequestPB* write_req = msg->mutable_write_request(); + CHECK_OK(SchemaToPB(schema_, write_req->mutable_schema())); + write_req->set_tablet_id(tablet_id_); + int key = id.index() * 10000 + id.term(); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, key, id.term(), + id.ShortDebugString(), write_req->mutable_row_operations()); +} + +// Regression test for KUDU-644: +// Triggers some complicated scenarios on the replica involving aborting and +// replacing transactions. +TEST_F(RaftConsensusITest, TestReplicaBehaviorViaRPC) { + FLAGS_num_replicas = 3; + FLAGS_num_tablet_servers = 3; + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + // Kill all the servers but one. + TServerDetails *replica_ts; + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(3, tservers.size()); + + // Elect server 2 as leader and wait for log index 1 to propagate to all servers. 
+ ASSERT_OK(StartElection(tservers[2], tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 1)); + + replica_ts = tservers[0]; + cluster_->tablet_server_by_uuid(tservers[1]->uuid())->Shutdown(); + cluster_->tablet_server_by_uuid(tservers[2]->uuid())->Shutdown(); + + LOG(INFO) << "================================== Cluster setup complete."; + + // Check that the 'term' metric is correctly exposed. + { + int64_t term_from_metric = -1; + ASSERT_OK(cluster_->tablet_server_by_uuid(replica_ts->uuid())->GetInt64Metric( + &METRIC_ENTITY_tablet, + nullptr, + &METRIC_raft_term, + "value", + &term_from_metric)); + ASSERT_EQ(term_from_metric, 1); + } + + ConsensusServiceProxy* c_proxy = CHECK_NOTNULL(replica_ts->consensus_proxy.get()); + + ConsensusRequestPB req; + ConsensusResponsePB resp; + RpcController rpc; + + // Send a simple request with no ops. + req.set_tablet_id(tablet_id_); + req.set_dest_uuid(replica_ts->uuid()); + req.set_caller_uuid("fake_caller"); + req.set_caller_term(2); + req.mutable_committed_index()->CopyFrom(MakeOpId(1, 1)); + req.mutable_preceding_id()->CopyFrom(MakeOpId(1, 1)); + + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + + // Send some operations, but don't advance the commit index. + // They should not commit. + AddOp(MakeOpId(2, 2), &req); + AddOp(MakeOpId(2, 3), &req); + AddOp(MakeOpId(2, 4), &req); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + + // We shouldn't read anything yet, because the ops should be pending. + { + vector results; + NO_FATALS(ScanReplica(replica_ts->tserver_proxy.get(), &results)); + ASSERT_EQ(0, results.size()) << results; + } + + // Send op 2.6, but set preceding OpId to 2.4. This is an invalid + // request, and the replica should reject it. 
+ req.mutable_preceding_id()->CopyFrom(MakeOpId(2, 4)); + req.clear_ops(); + AddOp(MakeOpId(2, 6), &req); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_TRUE(resp.has_error()) << resp.DebugString(); + ASSERT_EQ(resp.error().status().message(), + "New operation's index does not follow the previous op's index. " + "Current: 2.6. Previous: 2.4"); + + resp.Clear(); + req.clear_ops(); + // Send ops 3.5 and 2.6, then commit up to index 6, the replica + // should fail because of the out-of-order terms. + req.mutable_preceding_id()->CopyFrom(MakeOpId(2, 4)); + AddOp(MakeOpId(3, 5), &req); + AddOp(MakeOpId(2, 6), &req); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_TRUE(resp.has_error()) << resp.DebugString(); + ASSERT_EQ(resp.error().status().message(), + "New operation's term is not >= than the previous op's term." + " Current: 2.6. Previous: 3.5"); + + // Regression test for KUDU-639: if we send a valid request, but the + // current commit index is higher than the data we're sending, we shouldn't + // commit anything higher than the last op sent by the leader. + // + // To test, we re-send operation 2.3, with the correct preceding ID 2.2, + // but we set the committed index to 2.4. This should only commit + // 2.2 and 2.3. + resp.Clear(); + req.clear_ops(); + req.mutable_preceding_id()->CopyFrom(MakeOpId(2, 2)); + AddOp(MakeOpId(2, 3), &req); + req.mutable_committed_index()->CopyFrom(MakeOpId(2, 4)); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + // Verify only 2.2 and 2.3 are committed. + { + vector results; + NO_FATALS(WaitForRowCount(replica_ts->tserver_proxy.get(), 2, &results)); + ASSERT_STR_CONTAINS(results[0], "term: 2 index: 2"); + ASSERT_STR_CONTAINS(results[1], "term: 2 index: 3"); + } + + resp.Clear(); + req.clear_ops(); + // Now send some more ops, and commit the earlier ones. 
+ req.mutable_committed_index()->CopyFrom(MakeOpId(2, 4)); + req.mutable_preceding_id()->CopyFrom(MakeOpId(2, 4)); + AddOp(MakeOpId(2, 5), &req); + AddOp(MakeOpId(2, 6), &req); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + + // Verify they are committed. + { + vector results; + NO_FATALS(WaitForRowCount(replica_ts->tserver_proxy.get(), 3, &results)); + ASSERT_STR_CONTAINS(results[0], "term: 2 index: 2"); + ASSERT_STR_CONTAINS(results[1], "term: 2 index: 3"); + ASSERT_STR_CONTAINS(results[2], "term: 2 index: 4"); + } + + // At this point, we still have two operations which aren't committed. If we + // try to perform a snapshot-consistent scan, we should time out rather than + // hanging the RPC service thread. + { + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + rpc.set_timeout(MonoDelta::FromMilliseconds(100)); + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(tablet_id_); + scan->set_read_mode(READ_AT_SNAPSHOT); + ASSERT_OK(SchemaToColumnPBs(schema_, scan->mutable_projected_columns())); + + // Send the call. We expect to get a timeout passed back from the server side + // (i.e. not an RPC timeout) + req.set_batch_size_bytes(0); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(replica_ts->tserver_proxy->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + string err_str = StatusFromPB(resp.error().status()).ToString(); + ASSERT_STR_CONTAINS(err_str, "Timed out waiting for all transactions"); + ASSERT_STR_CONTAINS(err_str, "to commit"); + } + + resp.Clear(); + req.clear_ops(); + int leader_term = 2; + const int kNumTerms = AllowSlowTests() ? 10000 : 100; + while (leader_term < kNumTerms) { + leader_term++; + // Now pretend to be a new leader (term 3) and replace the earlier ops + // without committing the new replacements. 
+ req.set_caller_term(leader_term); + req.set_caller_uuid("new_leader"); + req.mutable_preceding_id()->CopyFrom(MakeOpId(2, 4)); + req.clear_ops(); + AddOp(MakeOpId(leader_term, 5), &req); + AddOp(MakeOpId(leader_term, 6), &req); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()) << "Req: " << req.ShortDebugString() + << " Resp: " << resp.DebugString(); + } + + // Send an empty request from the newest term which should commit + // the earlier ops. + { + req.mutable_preceding_id()->CopyFrom(MakeOpId(leader_term, 6)); + req.mutable_committed_index()->CopyFrom(MakeOpId(leader_term, 6)); + req.clear_ops(); + rpc.Reset(); + ASSERT_OK(c_proxy->UpdateConsensus(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + } + + // Verify the new rows are committed. + { + vector results; + NO_FATALS(WaitForRowCount(replica_ts->tserver_proxy.get(), 5, &results)); + SCOPED_TRACE(results); + ASSERT_STR_CONTAINS(results[3], Substitute("term: $0 index: 5", leader_term)); + ASSERT_STR_CONTAINS(results[4], Substitute("term: $0 index: 6", leader_term)); + } +} + +TEST_F(RaftConsensusITest, TestLeaderStepDown) { + FLAGS_num_replicas = 3; + FLAGS_num_tablet_servers = 3; + + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + + // Start with no leader. + Status s = GetReplicaStatusAndCheckIfLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)); + ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader yet: " << s.ToString(); + + // Become leader. 
+ ASSERT_OK(StartElection(tservers[0], tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitUntilLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT, + kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 2)); + + // Step down and test that a 2nd stepdown returns the expected result. + ASSERT_OK(LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10))); + TabletServerErrorPB error; + s = LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10), &error); + ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader anymore: " << s.ToString(); + ASSERT_EQ(TabletServerErrorPB::NOT_THE_LEADER, error.code()) << error.ShortDebugString(); + + s = WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT, + kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10)); + ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not accept writes as follower: " + << s.ToString(); +} + +void RaftConsensusITest::AssertMajorityRequiredForElectionsAndWrites( + const TabletServerMap& tablet_servers, const string& leader_uuid) { + + TServerDetails* initial_leader = FindOrDie(tablet_servers, leader_uuid); + + // Calculate number of servers to leave unpaused (minority). + // This math is a little unintuitive but works for cluster sizes including 2 and 1. + // Note: We assume all of these TSes are voters. + int config_size = tablet_servers.size(); + int minority_to_retain = MajoritySize(config_size) - 1; + + // Only perform this part of the test if we have some servers to pause, else + // the failure assertions will throw. + if (config_size > 1) { + // Pause enough replicas to prevent a majority. 
+ int num_to_pause = config_size - minority_to_retain; + LOG(INFO) << "Pausing " << num_to_pause << " tablet servers in config of size " << config_size; + vector paused_uuids; + for (const TabletServerMap::value_type& entry : tablet_servers) { + if (paused_uuids.size() == num_to_pause) { + continue; + } + const string& replica_uuid = entry.first; + if (replica_uuid == leader_uuid) { + // Always leave this one alone. + continue; + } + ExternalTabletServer* replica_ts = cluster_->tablet_server_by_uuid(replica_uuid); + ASSERT_OK(replica_ts->Pause()); + paused_uuids.push_back(replica_uuid); + } + + // Ensure writes timeout while only a minority is alive. + Status s = WriteSimpleTestRow(initial_leader, tablet_id_, RowOperationsPB::UPDATE, + kTestRowKey, kTestRowIntVal, "foo", + MonoDelta::FromMilliseconds(100)); + ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); + + // Step down. + ASSERT_OK(LeaderStepDown(initial_leader, tablet_id_, MonoDelta::FromSeconds(10))); + + // Assert that elections time out without a live majority. + // We specify a very short timeout here to keep the tests fast. + ASSERT_OK(StartElection(initial_leader, tablet_id_, MonoDelta::FromSeconds(10))); + s = WaitUntilLeader(initial_leader, tablet_id_, MonoDelta::FromMilliseconds(100)); + ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); + LOG(INFO) << "Expected timeout encountered on election with weakened config: " << s.ToString(); + + // Resume the paused servers. + LOG(INFO) << "Resuming " << num_to_pause << " tablet servers in config of size " << config_size; + for (const string& replica_uuid : paused_uuids) { + ExternalTabletServer* replica_ts = cluster_->tablet_server_by_uuid(replica_uuid); + ASSERT_OK(replica_ts->Resume()); + } + } + + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(20), tablet_servers, tablet_id_, 1)); + + // Now an election should succeed. 
+ ASSERT_OK(StartElection(initial_leader, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitUntilLeader(initial_leader, tablet_id_, MonoDelta::FromSeconds(10))); + LOG(INFO) << "Successful election with full config of size " << config_size; + + // And a write should also succeed. + ASSERT_OK(WriteSimpleTestRow(initial_leader, tablet_id_, RowOperationsPB::UPDATE, + kTestRowKey, kTestRowIntVal, Substitute("qsz=$0", config_size), + MonoDelta::FromSeconds(10))); +} + +// Return the replicas of the specified 'tablet_id', as seen by the Master. +Status RaftConsensusITest::GetTabletLocations(const string& tablet_id, const MonoDelta& timeout, + master::TabletLocationsPB* tablet_locations) { + RpcController rpc; + rpc.set_timeout(timeout); + GetTabletLocationsRequestPB req; + *req.add_tablet_ids() = tablet_id; + GetTabletLocationsResponsePB resp; + RETURN_NOT_OK(cluster_->master_proxy()->GetTabletLocations(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + if (resp.errors_size() > 0) { + CHECK_EQ(1, resp.errors_size()) << resp.ShortDebugString(); + CHECK_EQ(tablet_id, resp.errors(0).tablet_id()) << resp.ShortDebugString(); + return StatusFromPB(resp.errors(0).status()); + } + CHECK_EQ(1, resp.tablet_locations_size()) << resp.ShortDebugString(); + *tablet_locations = resp.tablet_locations(0); + return Status::OK(); +} + +void RaftConsensusITest::WaitForReplicasReportedToMaster( + int num_replicas, const string& tablet_id, + const MonoDelta& timeout, + WaitForLeader wait_for_leader, + bool* has_leader, + master::TabletLocationsPB* tablet_locations) { + MonoTime deadline(MonoTime::Now(MonoTime::FINE)); + deadline.AddDelta(timeout); + while (true) { + ASSERT_OK(GetTabletLocations(tablet_id, timeout, tablet_locations)); + *has_leader = false; + if (tablet_locations->replicas_size() == num_replicas) { + for (const master::TabletLocationsPB_ReplicaPB& replica : + tablet_locations->replicas()) { + if (replica.role() == 
RaftPeerPB::LEADER) { + *has_leader = true; + } + } + if (wait_for_leader == NO_WAIT_FOR_LEADER || + (wait_for_leader == WAIT_FOR_LEADER && *has_leader)) { + break; + } + } + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + SleepFor(MonoDelta::FromMilliseconds(20)); + } + ASSERT_EQ(num_replicas, tablet_locations->replicas_size()) << tablet_locations->DebugString(); + if (wait_for_leader == WAIT_FOR_LEADER) { + ASSERT_TRUE(*has_leader) << tablet_locations->DebugString(); + } +} + +// Basic test of adding and removing servers from a configuration. +TEST_F(RaftConsensusITest, TestAddRemoveServer) { + MonoDelta kTimeout = MonoDelta::FromSeconds(10); + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 3; + vector ts_flags = { "--enable_leader_failure_detection=false" }; + vector master_flags = { "--master_add_server_when_underreplicated=false" }; + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(BuildAndStart(ts_flags, master_flags)); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + // Elect server 0 as leader and wait for log index 1 to propagate to all servers. + TServerDetails* leader_tserver = tservers[0]; + const string& leader_uuid = tservers[0]->uuid(); + ASSERT_OK(StartElection(leader_tserver, tablet_id_, kTimeout)); + ASSERT_OK(WaitForServersToAgree(kTimeout, tablet_servers_, tablet_id_, 1)); + + // Make sure the server rejects removal of itself from the configuration. + Status s = RemoveServer(leader_tserver, tablet_id_, leader_tserver, boost::none, kTimeout); + ASSERT_TRUE(s.IsInvalidArgument()) << "Should not be able to remove self from config: " + << s.ToString(); + + // Insert the row that we will update throughout the test. 
+ ASSERT_OK(WriteSimpleTestRow(leader_tserver, tablet_id_, RowOperationsPB::INSERT, + kTestRowKey, kTestRowIntVal, "initial insert", kTimeout)); + + // Kill the master, so we can change the config without interference. + cluster_->master()->Shutdown(); + + TabletServerMap active_tablet_servers = tablet_servers_; + + // Do majority correctness check for 3 servers. + NO_FATALS(AssertMajorityRequiredForElectionsAndWrites(active_tablet_servers, leader_uuid)); + OpId opid; + ASSERT_OK(GetLastOpIdForReplica(tablet_id_, leader_tserver, consensus::RECEIVED_OPID, kTimeout, + &opid)); + int64_t cur_log_index = opid.index(); + + // Go from 3 tablet servers down to 1 in the configuration. + vector remove_list = { 2, 1 }; + for (int to_remove_idx : remove_list) { + int num_servers = active_tablet_servers.size(); + LOG(INFO) << "Remove: Going from " << num_servers << " to " << num_servers - 1 << " replicas"; + + TServerDetails* tserver_to_remove = tservers[to_remove_idx]; + LOG(INFO) << "Removing tserver with uuid " << tserver_to_remove->uuid(); + ASSERT_OK(RemoveServer(leader_tserver, tablet_id_, tserver_to_remove, boost::none, kTimeout)); + ASSERT_EQ(1, active_tablet_servers.erase(tserver_to_remove->uuid())); + ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, ++cur_log_index)); + + // Do majority correctness check for each incremental decrease. + NO_FATALS(AssertMajorityRequiredForElectionsAndWrites(active_tablet_servers, leader_uuid)); + ASSERT_OK(GetLastOpIdForReplica(tablet_id_, leader_tserver, consensus::RECEIVED_OPID, kTimeout, + &opid)); + cur_log_index = opid.index(); + } + + // Add the tablet servers back, in reverse order, going from 1 to 3 servers in the configuration. 
+ vector add_list = { 1, 2 }; + for (int to_add_idx : add_list) { + int num_servers = active_tablet_servers.size(); + LOG(INFO) << "Add: Going from " << num_servers << " to " << num_servers + 1 << " replicas"; + + TServerDetails* tserver_to_add = tservers[to_add_idx]; + LOG(INFO) << "Adding tserver with uuid " << tserver_to_add->uuid(); + ASSERT_OK(AddServer(leader_tserver, tablet_id_, tserver_to_add, RaftPeerPB::VOTER, boost::none, + kTimeout)); + InsertOrDie(&active_tablet_servers, tserver_to_add->uuid(), tserver_to_add); + ASSERT_OK(WaitForServersToAgree(kTimeout, active_tablet_servers, tablet_id_, ++cur_log_index)); + + // Do majority correctness check for each incremental increase. + NO_FATALS(AssertMajorityRequiredForElectionsAndWrites(active_tablet_servers, leader_uuid)); + ASSERT_OK(GetLastOpIdForReplica(tablet_id_, leader_tserver, consensus::RECEIVED_OPID, kTimeout, + &opid)); + cur_log_index = opid.index(); + } +} + +// Regression test for KUDU-1169: a crash when a Config Change operation is replaced +// by a later leader. +TEST_F(RaftConsensusITest, TestReplaceChangeConfigOperation) { + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 3; + vector ts_flags = { "--enable_leader_failure_detection=false" }; + vector master_flags = { "--master_add_server_when_underreplicated=false" }; + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(BuildAndStart(ts_flags, master_flags)); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + + // Elect server 0 as leader and wait for log index 1 to propagate to all servers. 
+ TServerDetails* leader_tserver = tservers[0]; + + TabletServerMap original_followers = tablet_servers_; + ASSERT_EQ(1, original_followers.erase(leader_tserver->uuid())); + + + ASSERT_OK(StartElection(leader_tserver, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 1)); + + // Shut down servers 1 and 2, so that server 1 can't replicate anything. + cluster_->tablet_server_by_uuid(tservers[1]->uuid())->Shutdown(); + cluster_->tablet_server_by_uuid(tservers[2]->uuid())->Shutdown(); + + // Now try to replicate a ChangeConfig operation. This should get stuck and time out + // because the server can't replicate any operations. + TabletServerErrorPB::Code error_code; + Status s = RemoveServer(leader_tserver, tablet_id_, tservers[1], + -1, MonoDelta::FromSeconds(1), + &error_code); + ASSERT_TRUE(s.IsTimedOut()); + + // Pause the leader, and restart the other servers. + cluster_->tablet_server_by_uuid(tservers[0]->uuid())->Pause(); + ASSERT_OK(cluster_->tablet_server_by_uuid(tservers[1]->uuid())->Restart()); + ASSERT_OK(cluster_->tablet_server_by_uuid(tservers[2]->uuid())->Restart()); + + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), original_followers, tablet_id_, 1)); + + // Elect one of the other servers. + ASSERT_OK(StartElection(tservers[1], tablet_id_, MonoDelta::FromSeconds(10))); + + // Resume the original leader. Its change-config operation will now be aborted + // since it was never replicated to the majority, and the new leader will have + // replaced the operation. + cluster_->tablet_server_by_uuid(tservers[0]->uuid())->Resume(); + + // Insert some data and verify that it propagates to all servers. + NO_FATALS(InsertTestRowsRemoteThread(0, 10, 1, vector())); + ASSERT_ALL_REPLICAS_AGREE(10); +} + +// Test the atomic CAS arguments to ChangeConfig() add server and remove server. 
+TEST_F(RaftConsensusITest, TestAtomicAddRemoveServer) { + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 3; + vector ts_flags = { "--enable_leader_failure_detection=false" }; + vector master_flags = { "--master_add_server_when_underreplicated=false" }; + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(BuildAndStart(ts_flags, master_flags)); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + // Elect server 0 as leader and wait for log index 1 to propagate to all servers. + TServerDetails* leader_tserver = tservers[0]; + ASSERT_OK(StartElection(leader_tserver, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 1)); + int64_t cur_log_index = 1; + + TabletServerMap active_tablet_servers = tablet_servers_; + + TServerDetails* follower_ts = tservers[2]; + + // Initial committed config should have opid_index == -1. + // Server should reject request to change config from opid other than this. + int64_t invalid_committed_opid_index = 7; + TabletServerErrorPB::Code error_code; + Status s = RemoveServer(leader_tserver, tablet_id_, follower_ts, + invalid_committed_opid_index, MonoDelta::FromSeconds(10), + &error_code); + ASSERT_EQ(TabletServerErrorPB::CAS_FAILED, error_code); + ASSERT_STR_CONTAINS(s.ToString(), "of 7 but the committed config has opid_index of -1"); + + // Specifying the correct committed opid index should work. + int64_t committed_opid_index = -1; + ASSERT_OK(RemoveServer(leader_tserver, tablet_id_, follower_ts, + committed_opid_index, MonoDelta::FromSeconds(10))); + + ASSERT_EQ(1, active_tablet_servers.erase(follower_ts->uuid())); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, ++cur_log_index)); + + // Now, add the server back. 
Again, specifying something other than the + // latest committed_opid_index should fail. + invalid_committed_opid_index = -1; // The old one is no longer valid. + s = AddServer(leader_tserver, tablet_id_, follower_ts, RaftPeerPB::VOTER, + invalid_committed_opid_index, MonoDelta::FromSeconds(10), + &error_code); + ASSERT_EQ(TabletServerErrorPB::CAS_FAILED, error_code); + ASSERT_STR_CONTAINS(s.ToString(), "of -1 but the committed config has opid_index of 2"); + + // Specifying the correct committed opid index should work. + // The previous config change op is the latest entry in the log. + committed_opid_index = cur_log_index; + ASSERT_OK(AddServer(leader_tserver, tablet_id_, follower_ts, RaftPeerPB::VOTER, + committed_opid_index, MonoDelta::FromSeconds(10))); + + InsertOrDie(&active_tablet_servers, follower_ts->uuid(), follower_ts); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, ++cur_log_index)); +} + +// Ensure that we can elect a server that is in the "pending" configuration. +// This is required by the Raft protocol. See Diego Ongaro's PhD thesis, section +// 4.1, where it states that "it is the caller’s configuration that is used in +// reaching consensus, both for voting and for log replication". +// +// This test also tests the case where a node comes back from the dead to a +// leader that was not in its configuration when it died. That should also work, i.e. +// the revived node should accept writes from the new leader. +TEST_F(RaftConsensusITest, TestElectPendingVoter) { + // Test plan: + // 1. Disable failure detection to avoid non-deterministic behavior. + // 2. Start with a configuration size of 5, all servers synced. + // 3. Remove one server from the configuration, wait until committed. + // 4. Pause the 3 remaining non-leaders (SIGSTOP). + // 5. Run a config change to add back the previously-removed server. 
+ // Ensure that, while the op cannot be committed yet due to lack of a + // majority in the new config (only 2 out of 5 servers are alive), the op + // has been replicated to both the local leader and the new member. + // 6. Force the existing leader to step down. + // 7. Resume one of the paused nodes so that a majority (of the 5-node + // configuration, but not the original 4-node configuration) will be available. + // 8. Start a leader election on the new (pending) node. It should win. + // 9. Unpause the two remaining stopped nodes. + // 10. Wait for all nodes to sync to the new leader's log. + FLAGS_num_tablet_servers = 5; + FLAGS_num_replicas = 5; + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + // Elect server 0 as leader and wait for log index 1 to propagate to all servers. + TServerDetails* initial_leader = tservers[0]; + ASSERT_OK(StartElection(initial_leader, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 1)); + + // The server we will remove and then bring back. + TServerDetails* final_leader = tservers[4]; + + // Kill the master, so we can change the config without interference. + cluster_->master()->Shutdown(); + + // Now remove server 4 from the configuration. 
+ TabletServerMap active_tablet_servers = tablet_servers_; + LOG(INFO) << "Removing tserver with uuid " << final_leader->uuid(); + ASSERT_OK(RemoveServer(initial_leader, tablet_id_, final_leader, boost::none, + MonoDelta::FromSeconds(10))); + ASSERT_EQ(1, active_tablet_servers.erase(final_leader->uuid())); + int64_t cur_log_index = 2; + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, cur_log_index)); + + // Pause tablet servers 1 through 3, so they won't see the operation to add + // server 4 back. + LOG(INFO) << "Pausing 3 replicas..."; + for (int i = 1; i <= 3; i++) { + ExternalTabletServer* replica_ts = cluster_->tablet_server_by_uuid(tservers[i]->uuid()); + ASSERT_OK(replica_ts->Pause()); + } + + // Now add server 4 back to the peers. + // This operation will time out on the client side. + LOG(INFO) << "Adding back Peer " << final_leader->uuid() << " and expecting timeout..."; + Status s = AddServer(initial_leader, tablet_id_, final_leader, RaftPeerPB::VOTER, boost::none, + MonoDelta::FromMilliseconds(100)); + ASSERT_TRUE(s.IsTimedOut()) << "Expected AddServer() to time out. Result: " << s.ToString(); + LOG(INFO) << "Timeout achieved."; + active_tablet_servers = tablet_servers_; // Reset to the unpaused servers. + for (int i = 1; i <= 3; i++) { + ASSERT_EQ(1, active_tablet_servers.erase(tservers[i]->uuid())); + } + // Only wait for TS 0 and 4 to agree that the new change config op has been + // replicated. + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, ++cur_log_index)); + + // Now that TS 4 is electable (and pending), have TS 0 step down. + LOG(INFO) << "Forcing Peer " << initial_leader->uuid() << " to step down..."; + ASSERT_OK(LeaderStepDown(initial_leader, tablet_id_, MonoDelta::FromSeconds(10))); + + // Resume TS 1 so we have a majority of 3 to elect a new leader. 
+ LOG(INFO) << "Resuming Peer " << tservers[1]->uuid() << " ..."; + ASSERT_OK(cluster_->tablet_server_by_uuid(tservers[1]->uuid())->Resume()); + InsertOrDie(&active_tablet_servers, tservers[1]->uuid(), tservers[1]); + + // Now try to get TS 4 elected. It should succeed and push a NO_OP. + LOG(INFO) << "Trying to elect Peer " << tservers[4]->uuid() << " ..."; + ASSERT_OK(StartElection(final_leader, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, ++cur_log_index)); + + // Resume the remaining paused nodes. + LOG(INFO) << "Resuming remaining nodes..."; + ASSERT_OK(cluster_->tablet_server_by_uuid(tservers[2]->uuid())->Resume()); + ASSERT_OK(cluster_->tablet_server_by_uuid(tservers[3]->uuid())->Resume()); + active_tablet_servers = tablet_servers_; + + // Do one last operation on the new leader: an insert. + ASSERT_OK(WriteSimpleTestRow(final_leader, tablet_id_, RowOperationsPB::INSERT, + kTestRowKey, kTestRowIntVal, "Ob-La-Di, Ob-La-Da", + MonoDelta::FromSeconds(10))); + + // Wait for all servers to replicate everything up through the last write op. + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, ++cur_log_index)); +} + +// Writes test rows in ascending order to a single tablet server. +// Essentially a poor-man's version of TestWorkload that only operates on a +// single tablet. Does not batch, does not tolerate timeouts, and does not +// interact with the Master. 'rows_inserted' is used to determine row id and is +// incremented prior to each successful insert. Since a write failure results in +// a crash, as long as there is no crash then 'rows_inserted' will have a +// correct count at the end of the run. +// Crashes on any failure, so 'write_timeout' should be high. 
+void DoWriteTestRows(const TServerDetails* leader_tserver, + const string& tablet_id, + const MonoDelta& write_timeout, + AtomicInt* rows_inserted, + const AtomicBool* finish) { + + while (!finish->Load()) { + int row_key = rows_inserted->Increment(); + CHECK_OK(WriteSimpleTestRow(leader_tserver, tablet_id, RowOperationsPB::INSERT, + row_key, row_key, Substitute("key=$0", row_key), + write_timeout)); + } +} + +// Test that config change works while running a workload. +TEST_F(RaftConsensusITest, TestConfigChangeUnderLoad) { + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 3; + vector ts_flags = { "--enable_leader_failure_detection=false" }; + vector master_flags = { "--master_add_server_when_underreplicated=false" }; + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + // Elect server 0 as leader and wait for log index 1 to propagate to all servers. + TServerDetails* leader_tserver = tservers[0]; + ASSERT_OK(StartElection(leader_tserver, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 1)); + + TabletServerMap active_tablet_servers = tablet_servers_; + + // Start a write workload. + LOG(INFO) << "Starting write workload..."; + vector > threads; + AtomicInt rows_inserted(0); + AtomicBool finish(false); + int num_threads = FLAGS_num_client_threads; + for (int i = 0; i < num_threads; i++) { + scoped_refptr thread; + ASSERT_OK(Thread::Create(CURRENT_TEST_NAME(), Substitute("row-writer-$0", i), + &DoWriteTestRows, + leader_tserver, tablet_id_, MonoDelta::FromSeconds(10), + &rows_inserted, &finish, + &thread)); + threads.push_back(thread); + } + + LOG(INFO) << "Removing servers..."; + // Go from 3 tablet servers down to 1 in the configuration. 
+ vector remove_list = { 2, 1 }; + for (int to_remove_idx : remove_list) { + int num_servers = active_tablet_servers.size(); + LOG(INFO) << "Remove: Going from " << num_servers << " to " << num_servers - 1 << " replicas"; + + TServerDetails* tserver_to_remove = tservers[to_remove_idx]; + LOG(INFO) << "Removing tserver with uuid " << tserver_to_remove->uuid(); + ASSERT_OK(RemoveServer(leader_tserver, tablet_id_, tserver_to_remove, boost::none, + MonoDelta::FromSeconds(10))); + ASSERT_EQ(1, active_tablet_servers.erase(tserver_to_remove->uuid())); + ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(active_tablet_servers.size(), + leader_tserver, tablet_id_, + MonoDelta::FromSeconds(10))); + } + + LOG(INFO) << "Adding servers..."; + // Add the tablet servers back, in reverse order, going from 1 to 3 servers in the configuration. + vector add_list = { 1, 2 }; + for (int to_add_idx : add_list) { + int num_servers = active_tablet_servers.size(); + LOG(INFO) << "Add: Going from " << num_servers << " to " << num_servers + 1 << " replicas"; + + TServerDetails* tserver_to_add = tservers[to_add_idx]; + LOG(INFO) << "Adding tserver with uuid " << tserver_to_add->uuid(); + ASSERT_OK(AddServer(leader_tserver, tablet_id_, tserver_to_add, RaftPeerPB::VOTER, boost::none, + MonoDelta::FromSeconds(10))); + InsertOrDie(&active_tablet_servers, tserver_to_add->uuid(), tserver_to_add); + ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(active_tablet_servers.size(), + leader_tserver, tablet_id_, + MonoDelta::FromSeconds(10))); + } + + LOG(INFO) << "Joining writer threads..."; + finish.Store(true); + for (const scoped_refptr& thread : threads) { + ASSERT_OK(ThreadJoiner(thread.get()).Join()); + } + + LOG(INFO) << "Waiting for replicas to agree..."; + // Wait for all servers to replicate everything up through the last write op. 
+ // Since we don't batch, there should be at least # rows inserted log entries, + // plus the initial leader's no-op, plus 2 for the removed servers, plus 2 for + // the added servers for a total of 5. + int min_log_index = rows_inserted.Load() + 5; + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), + active_tablet_servers, tablet_id_, + min_log_index)); + + LOG(INFO) << "Number of rows inserted: " << rows_inserted.Load(); + ASSERT_ALL_REPLICAS_AGREE(rows_inserted.Load()); +} + +TEST_F(RaftConsensusITest, TestMasterNotifiedOnConfigChange) { + MonoDelta timeout = MonoDelta::FromSeconds(30); + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 2; + vector ts_flags; + vector master_flags = { "--master_add_server_when_underreplicated=false" }; + NO_FATALS(BuildAndStart(ts_flags, master_flags)); + + LOG(INFO) << "Finding tablet leader and waiting for things to start..."; + string tablet_id = tablet_replicas_.begin()->first; + + // Determine the list of tablet servers currently in the config. + TabletServerMap active_tablet_servers; + for (itest::TabletReplicaMap::const_iterator iter = tablet_replicas_.find(tablet_id); + iter != tablet_replicas_.end(); ++iter) { + InsertOrDie(&active_tablet_servers, iter->second->uuid(), iter->second); + } + + // Determine the server to add to the config. + string uuid_to_add; + for (const TabletServerMap::value_type& entry : tablet_servers_) { + if (!ContainsKey(active_tablet_servers, entry.second->uuid())) { + uuid_to_add = entry.second->uuid(); + } + } + ASSERT_FALSE(uuid_to_add.empty()); + + // Get a baseline config reported to the master. + LOG(INFO) << "Waiting for Master to see the current replicas..."; + master::TabletLocationsPB tablet_locations; + bool has_leader; + NO_FATALS(WaitForReplicasReportedToMaster(2, tablet_id, timeout, WAIT_FOR_LEADER, + &has_leader, &tablet_locations)); + LOG(INFO) << "Tablet locations:\n" << tablet_locations.DebugString(); + + // Wait for initial NO_OP to be committed by the leader. 
+ TServerDetails* leader_ts; + ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id, timeout, &leader_ts)); + ASSERT_OK(WaitForServersToAgree(timeout, active_tablet_servers, tablet_id, 1)); + + // Change the config. + TServerDetails* tserver_to_add = tablet_servers_[uuid_to_add]; + LOG(INFO) << "Adding tserver with uuid " << tserver_to_add->uuid(); + ASSERT_OK(AddServer(leader_ts, tablet_id_, tserver_to_add, RaftPeerPB::VOTER, boost::none, + timeout)); + ASSERT_OK(WaitForServersToAgree(timeout, tablet_servers_, tablet_id_, 2)); + + // Wait for the master to be notified of the config change. + // It should continue to have the same leader, even without waiting. + LOG(INFO) << "Waiting for Master to see config change..."; + NO_FATALS(WaitForReplicasReportedToMaster(3, tablet_id, timeout, NO_WAIT_FOR_LEADER, + &has_leader, &tablet_locations)); + ASSERT_TRUE(has_leader) << tablet_locations.DebugString(); + LOG(INFO) << "Tablet locations:\n" << tablet_locations.DebugString(); + + // Change the config again. + LOG(INFO) << "Removing tserver with uuid " << tserver_to_add->uuid(); + ASSERT_OK(RemoveServer(leader_ts, tablet_id_, tserver_to_add, boost::none, timeout)); + active_tablet_servers = tablet_servers_; + ASSERT_EQ(1, active_tablet_servers.erase(tserver_to_add->uuid())); + ASSERT_OK(WaitForServersToAgree(timeout, active_tablet_servers, tablet_id_, 3)); + + // Wait for the master to be notified of the removal. + LOG(INFO) << "Waiting for Master to see config change..."; + NO_FATALS(WaitForReplicasReportedToMaster(2, tablet_id, timeout, NO_WAIT_FOR_LEADER, + &has_leader, &tablet_locations)); + ASSERT_TRUE(has_leader) << tablet_locations.DebugString(); + LOG(INFO) << "Tablet locations:\n" << tablet_locations.DebugString(); +} + +// Test that even with memory pressure, a replica will still commit pending +// operations that the leader has committed. 
+TEST_F(RaftConsensusITest, TestEarlyCommitDespiteMemoryPressure) { + // Enough operations to put us over our memory limit (defined below). + const int kNumOps = 10000; + + // Set up a 3-node configuration with only one live follower so that we can + // manipulate it directly via RPC. + vector ts_flags, master_flags; + + // If failure detection were on, a follower could be elected as leader after + // we kill the leader below. + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + + // Very low memory limit to ease testing. + ts_flags.push_back("--memory_limit_hard_bytes=4194304"); + + // Don't let transaction memory tracking get in the way. + ts_flags.push_back("--tablet_transaction_memory_limit_mb=-1"); + + BuildAndStart(ts_flags, master_flags); + + // Elect server 2 as leader, then kill it and server 1, leaving behind + // server 0 as the sole follower. + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(3, tservers.size()); + ASSERT_OK(StartElection(tservers[2], tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 1)); + TServerDetails *replica_ts = tservers[0]; + cluster_->tablet_server_by_uuid(tservers[1]->uuid())->Shutdown(); + cluster_->tablet_server_by_uuid(tservers[2]->uuid())->Shutdown(); + + // Pretend to be the leader and send a request to replicate some operations. 
+ ConsensusRequestPB req; + ConsensusResponsePB resp; + RpcController rpc; + req.set_dest_uuid(replica_ts->uuid()); + req.set_tablet_id(tablet_id_); + req.set_caller_uuid(tservers[2]->instance_id.permanent_uuid()); + req.set_caller_term(1); + req.mutable_committed_index()->CopyFrom(MakeOpId(1, 1)); + req.mutable_preceding_id()->CopyFrom(MakeOpId(1, 1)); + for (int i = 0; i < kNumOps; i++) { + AddOp(MakeOpId(1, 2 + i), &req); + } + OpId last_opid = MakeOpId(1, 2 + kNumOps - 1); + ASSERT_OK(replica_ts->consensus_proxy->UpdateConsensus(req, &resp, &rpc)); + + // At the time that the follower received our request it was still under the + // tiny memory limit defined above, so the request should have succeeded. + ASSERT_FALSE(resp.has_error()) << resp.DebugString(); + ASSERT_TRUE(resp.has_status()); + ASSERT_TRUE(resp.status().has_last_committed_idx()); + ASSERT_EQ(last_opid.index(), resp.status().last_received().index()); + ASSERT_EQ(1, resp.status().last_committed_idx()); + + // But no operations have been applied yet; there should be no data. + vector rows; + WaitForRowCount(replica_ts->tserver_proxy.get(), 0, &rows); + + // Try again, but this time: + // 1. Replicate just one new operation. + // 2. Tell the follower that the previous set of operations were committed. + req.mutable_preceding_id()->CopyFrom(last_opid); + req.mutable_committed_index()->CopyFrom(last_opid); + req.mutable_ops()->Clear(); + AddOp(MakeOpId(1, last_opid.index() + 1), &req); + rpc.Reset(); + Status s = replica_ts->consensus_proxy->UpdateConsensus(req, &resp, &rpc); + + // Our memory limit was truly tiny, so we should be over it by now... + ASSERT_TRUE(s.IsRemoteError()); + ASSERT_STR_CONTAINS(s.ToString(), "Soft memory limit exceeded"); + + // ...but despite rejecting the request, we should have committed the + // previous set of operations. That is, we should be able to see those rows. 
+ WaitForRowCount(replica_ts->tserver_proxy.get(), kNumOps, &rows); +} + +// Test that we can create (vivify) a new tablet via remote bootstrap. +TEST_F(RaftConsensusITest, TestAutoCreateReplica) { + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 2; + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + ts_flags.push_back("--log_cache_size_limit_mb=1"); + ts_flags.push_back("--log_segment_size_mb=1"); + ts_flags.push_back("--log_async_preallocate_segments=false"); + ts_flags.push_back("--flush_threshold_mb=1"); + ts_flags.push_back("--maintenance_manager_polling_interval_ms=300"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + // 50K is enough to cause flushes & log rolls. + int num_rows_to_write = 50000; + if (AllowSlowTests()) { + num_rows_to_write = 150000; + } + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + TabletServerMap active_tablet_servers; + TabletServerMap::const_iterator iter = tablet_replicas_.find(tablet_id_); + TServerDetails* leader = iter->second; + TServerDetails* follower = (++iter)->second; + InsertOrDie(&active_tablet_servers, leader->uuid(), leader); + InsertOrDie(&active_tablet_servers, follower->uuid(), follower); + + TServerDetails* new_node = nullptr; + for (TServerDetails* ts : tservers) { + if (!ContainsKey(active_tablet_servers, ts->uuid())) { + new_node = ts; + break; + } + } + ASSERT_TRUE(new_node != nullptr); + + // Elect the leader (still only a consensus config size of 2). 
+ ASSERT_OK(StartElection(leader, tablet_id_, MonoDelta::FromSeconds(10)));
+ ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(30), active_tablet_servers,
+ tablet_id_, 1));
+
+ TestWorkload workload(cluster_.get());
+ workload.set_table_name(kTableId);
+ workload.set_num_replicas(FLAGS_num_replicas);
+ workload.set_num_write_threads(10);
+ workload.set_write_batch_size(100);
+ workload.Setup();
+
+ LOG(INFO) << "Starting write workload...";
+ workload.Start();
+
+ while (true) {
+ int rows_inserted = workload.rows_inserted();
+ if (rows_inserted >= num_rows_to_write) {
+ break;
+ }
+ LOG(INFO) << "Only inserted " << rows_inserted << " rows so far, sleeping for 100ms";
+ SleepFor(MonoDelta::FromMilliseconds(100));
+ }
+
+ LOG(INFO) << "Adding tserver with uuid " << new_node->uuid() << " as VOTER...";
+ ASSERT_OK(AddServer(leader, tablet_id_, new_node, RaftPeerPB::VOTER, boost::none,
+ MonoDelta::FromSeconds(10)));
+ InsertOrDie(&active_tablet_servers, new_node->uuid(), new_node);
+ ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(active_tablet_servers.size(),
+ leader, tablet_id_,
+ MonoDelta::FromSeconds(10)));
+
+ workload.StopAndJoin();
+ int num_batches = workload.batches_completed();
+
+ LOG(INFO) << "Waiting for replicas to agree...";
+ // Wait for all servers to replicate everything up through the last write op.
+ // Each completed write batch is replicated as one log entry, so there should
+ // be at least num_batches log entries, plus the initial leader's no-op, plus
+ // 1 for the added replica, for a total == num_batches + 2.
+ int min_log_index = num_batches + 2; + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(120), + active_tablet_servers, tablet_id_, + min_log_index)); + + int rows_inserted = workload.rows_inserted(); + LOG(INFO) << "Number of rows inserted: " << rows_inserted; + ASSERT_ALL_REPLICAS_AGREE(rows_inserted); +} + +TEST_F(RaftConsensusITest, TestMemoryRemainsConstantDespiteTwoDeadFollowers) { + const int64_t kMinRejections = 100; + const MonoDelta kMaxWaitTime = MonoDelta::FromSeconds(60); + + // Start the cluster with a low per-tablet transaction memory limit, so that + // the test can complete faster. + vector flags; + flags.push_back("--tablet_transaction_memory_limit_mb=2"); + BuildAndStart(flags); + + // Kill both followers. + TServerDetails* details; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &details)); + int num_shutdown = 0; + int leader_ts_idx = -1; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ExternalTabletServer* ts = cluster_->tablet_server(i); + if (ts->instance_id().permanent_uuid() != details->uuid()) { + ts->Shutdown(); + num_shutdown++; + } else { + leader_ts_idx = i; + } + } + ASSERT_EQ(2, num_shutdown); + ASSERT_NE(-1, leader_ts_idx); + + // Because the majority of the cluster is dead and because of this workload's + // timeout behavior, more and more wedged transactions will accumulate in the + // leader. To prevent memory usage from skyrocketing, the leader will + // eventually reject new transactions. That's what we're testing for here. + TestWorkload workload(cluster_.get()); + workload.set_table_name(kTableId); + workload.set_timeout_allowed(true); + workload.set_write_timeout_millis(50); + workload.Setup(); + workload.Start(); + + // Run until the leader has rejected several transactions. 
+ MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(kMaxWaitTime); + while (true) { + int64_t num_rejections = 0; + ASSERT_OK(cluster_->tablet_server(leader_ts_idx)->GetInt64Metric( + &METRIC_ENTITY_tablet, + nullptr, + &METRIC_transaction_memory_pressure_rejections, + "value", + &num_rejections)); + if (num_rejections >= kMinRejections) { + break; + } else if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + FAIL() << "Ran for " << kMaxWaitTime.ToString() << ", deadline expired"; + } + SleepFor(MonoDelta::FromMilliseconds(200)); + } +} + +static void EnableLogLatency(server::GenericServiceProxy* proxy) { + typedef unordered_map FlagMap; + FlagMap flags; + InsertOrDie(&flags, "log_inject_latency", "true"); + InsertOrDie(&flags, "log_inject_latency_ms_mean", "1000"); + for (const FlagMap::value_type& e : flags) { + SetFlagRequestPB req; + SetFlagResponsePB resp; + RpcController rpc; + req.set_flag(e.first); + req.set_value(e.second); + ASSERT_OK(proxy->SetFlag(req, &resp, &rpc)); + } +} + +// Run a regular workload with a leader that's writing to its WAL slowly. +TEST_F(RaftConsensusITest, TestSlowLeader) { + if (!AllowSlowTests()) return; + BuildAndStart(vector()); + + TServerDetails* leader; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + NO_FATALS(EnableLogLatency(leader->generic_proxy.get())); + + TestWorkload workload(cluster_.get()); + workload.set_table_name(kTableId); + workload.Setup(); + workload.Start(); + SleepFor(MonoDelta::FromSeconds(60)); +} + +// Run a regular workload with one follower that's writing to its WAL slowly. 
+TEST_F(RaftConsensusITest, TestSlowFollower) { + if (!AllowSlowTests()) return; + BuildAndStart(vector()); + + TServerDetails* leader; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader)); + int num_reconfigured = 0; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ExternalTabletServer* ts = cluster_->tablet_server(i); + if (ts->instance_id().permanent_uuid() != leader->uuid()) { + TServerDetails* follower; + follower = GetReplicaWithUuidOrNull(tablet_id_, ts->instance_id().permanent_uuid()); + ASSERT_TRUE(follower); + NO_FATALS(EnableLogLatency(follower->generic_proxy.get())); + num_reconfigured++; + break; + } + } + ASSERT_EQ(1, num_reconfigured); + + TestWorkload workload(cluster_.get()); + workload.set_table_name(kTableId); + workload.Setup(); + workload.Start(); + SleepFor(MonoDelta::FromSeconds(60)); +} + +// Run a special workload that constantly updates a single row on a cluster +// where every replica is writing to its WAL slowly. +TEST_F(RaftConsensusITest, TestHammerOneRow) { + if (!AllowSlowTests()) return; + BuildAndStart(vector()); + + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ExternalTabletServer* ts = cluster_->tablet_server(i); + TServerDetails* follower; + follower = GetReplicaWithUuidOrNull(tablet_id_, ts->instance_id().permanent_uuid()); + ASSERT_TRUE(follower); + NO_FATALS(EnableLogLatency(follower->generic_proxy.get())); + } + + TestWorkload workload(cluster_.get()); + workload.set_table_name(kTableId); + workload.set_pathological_one_row_enabled(true); + workload.set_num_write_threads(20); + workload.Setup(); + workload.Start(); + SleepFor(MonoDelta::FromSeconds(60)); +} + +// Test that followers that fall behind the leader's log GC threshold are +// evicted from the config. +TEST_F(RaftConsensusITest, TestEvictAbandonedFollowers) { + vector ts_flags; + AddFlagsForLogRolls(&ts_flags); // For CauseFollowerToFallBehindLogGC(). 
+ vector master_flags = { "--master_add_server_when_underreplicated=false" }; + NO_FATALS(BuildAndStart(ts_flags, master_flags)); + + MonoDelta timeout = MonoDelta::FromSeconds(30); + TabletServerMap active_tablet_servers = tablet_servers_; + ASSERT_EQ(3, active_tablet_servers.size()); + + string leader_uuid; + int64_t orig_term; + string follower_uuid; + NO_FATALS(CauseFollowerToFallBehindLogGC(&leader_uuid, &orig_term, &follower_uuid)); + + // Wait for the abandoned follower to be evicted. + ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(2, tablet_servers_[leader_uuid], + tablet_id_, timeout)); + ASSERT_EQ(1, active_tablet_servers.erase(follower_uuid)); + ASSERT_OK(WaitForServersToAgree(timeout, active_tablet_servers, tablet_id_, 2)); +} + +// Test that followers that fall behind the leader's log GC threshold are +// evicted from the config. +TEST_F(RaftConsensusITest, TestMasterReplacesEvictedFollowers) { + vector extra_flags; + AddFlagsForLogRolls(&extra_flags); // For CauseFollowerToFallBehindLogGC(). + BuildAndStart(extra_flags); + + MonoDelta timeout = MonoDelta::FromSeconds(30); + + string leader_uuid; + int64_t orig_term; + string follower_uuid; + NO_FATALS(CauseFollowerToFallBehindLogGC(&leader_uuid, &orig_term, &follower_uuid)); + + // The follower will be evicted. Now wait for the master to cause it to be + // remotely bootstrapped. + ASSERT_OK(WaitForServersToAgree(timeout, tablet_servers_, tablet_id_, 2)); + + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(kTableId, ClusterVerifier::AT_LEAST, 1)); +} + +// Test that a ChangeConfig() request is rejected unless the leader has +// replicated one of its own log entries during the current term. +// This is required for correctness of Raft config change. 
For details, +// see https://groups.google.com/forum/#!topic/raft-dev/t4xj6dJTP6E +TEST_F(RaftConsensusITest, TestChangeConfigRejectedUnlessNoopReplicated) { + vector ts_flags = { "--enable_leader_failure_detection=false" }; + vector master_flags = { "--catalog_manager_wait_for_new_tablets_to_elect_leader=false" }; + BuildAndStart(ts_flags, master_flags); + + MonoDelta timeout = MonoDelta::FromSeconds(30); + + int kLeaderIndex = 0; + TServerDetails* leader_ts = tablet_servers_[cluster_->tablet_server(kLeaderIndex)->uuid()]; + + // Prevent followers from accepting UpdateConsensus requests from the leader, + // even though they will vote. This will allow us to get the distributed + // system into a state where there is a valid leader (based on winning an + // election) but that leader will be unable to commit any entries from its + // own term, making it illegal to accept ChangeConfig() requests. + for (int i = 1; i <= 2; i++) { + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(i), + "follower_reject_update_consensus_requests", "true")); + } + + // Elect the leader. + ASSERT_OK(StartElection(leader_ts, tablet_id_, timeout)); + ASSERT_OK(WaitUntilLeader(leader_ts, tablet_id_, timeout)); + + // Now attempt to do a config change. It should be rejected because there + // have not been any ops (notably the initial NO_OP) from the leader's term + // that have been committed yet. + Status s = itest::RemoveServer(leader_ts, tablet_id_, + tablet_servers_[cluster_->tablet_server(1)->uuid()], + boost::none, timeout); + ASSERT_TRUE(!s.ok()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Latest committed op is not from this term"); +} + +// Test that if for some reason none of the transactions can be prepared, that it will come +// back as an error in UpdateConsensus(). 
+TEST_F(RaftConsensusITest, TestUpdateConsensusErrorNonePrepared) { + const int kNumOps = 10; + + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(3, tservers.size()); + + // Shutdown the other servers so they don't get chatty. + cluster_->tablet_server_by_uuid(tservers[1]->uuid())->Shutdown(); + cluster_->tablet_server_by_uuid(tservers[2]->uuid())->Shutdown(); + + // Configure the first server to fail all on prepare. + TServerDetails *replica_ts = tservers[0]; + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server(0), + "follower_fail_all_prepare", "true")); + + // Pretend to be the leader and send a request that should return an error. + ConsensusRequestPB req; + ConsensusResponsePB resp; + RpcController rpc; + req.set_dest_uuid(replica_ts->uuid()); + req.set_tablet_id(tablet_id_); + req.set_caller_uuid(tservers[2]->instance_id.permanent_uuid()); + req.set_caller_term(0); + req.mutable_committed_index()->CopyFrom(MakeOpId(0, 0)); + req.mutable_preceding_id()->CopyFrom(MakeOpId(0, 0)); + for (int i = 0; i < kNumOps; i++) { + AddOp(MakeOpId(0, 1 + i), &req); + } + + ASSERT_OK(replica_ts->consensus_proxy->UpdateConsensus(req, &resp, &rpc)); + LOG(INFO) << resp.ShortDebugString(); + ASSERT_TRUE(resp.status().has_error()); + ASSERT_EQ(consensus::ConsensusErrorPB::CANNOT_PREPARE, resp.status().error().code()); + ASSERT_STR_CONTAINS(resp.ShortDebugString(), "Could not prepare a single transaction"); +} + +} // namespace tserver +} // namespace kudu + diff --git a/src/kudu/integration-tests/registration-test.cc b/src/kudu/integration-tests/registration-test.cc new file mode 100644 index 000000000000..0895990ac33c --- /dev/null +++ b/src/kudu/integration-tests/registration-test.cc @@ -0,0 +1,164 @@ +// Licensed to 
the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master-test-util.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/faststring.h" +#include "kudu/util/test_util.h" +#include "kudu/util/stopwatch.h" + +DECLARE_int32(heartbeat_interval_ms); + +namespace kudu { + +using std::vector; +using std::shared_ptr; +using master::MiniMaster; +using master::TSDescriptor; +using master::TabletLocationsPB; +using tserver::MiniTabletServer; + +// Tests for the Tablet Server registering with the Master, +// and the master maintaining the tablet descriptor. 
+class RegistrationTest : public KuduTest { + public: + RegistrationTest() + : schema_({ ColumnSchema("c1", UINT32) }, 1) { + } + + virtual void SetUp() OVERRIDE { + // Make heartbeats faster to speed test runtime. + FLAGS_heartbeat_interval_ms = 10; + + KuduTest::SetUp(); + + cluster_.reset(new MiniCluster(env_.get(), MiniClusterOptions())); + ASSERT_OK(cluster_->Start()); + } + + virtual void TearDown() OVERRIDE { + cluster_->Shutdown(); + } + + void CheckTabletServersPage() { + EasyCurl c; + faststring buf; + string addr = cluster_->mini_master()->bound_http_addr().ToString(); + ASSERT_OK(c.FetchURL(strings::Substitute("http://$0/tablet-servers", addr), + &buf)); + + // Should include the TS UUID + string expected_uuid = + cluster_->mini_tablet_server(0)->server()->instance_pb().permanent_uuid(); + ASSERT_STR_CONTAINS(buf.ToString(), expected_uuid); + } + + protected: + gscoped_ptr cluster_; + Schema schema_; +}; + +TEST_F(RegistrationTest, TestTSRegisters) { + // Wait for the TS to register. + vector > descs; + ASSERT_OK(cluster_->WaitForTabletServerCount(1, &descs)); + ASSERT_EQ(1, descs.size()); + + // Verify that the registration is sane. + master::TSRegistrationPB reg; + descs[0]->GetRegistration(®); + { + SCOPED_TRACE(reg.ShortDebugString()); + ASSERT_EQ(reg.ShortDebugString().find("0.0.0.0"), string::npos) + << "Should not include wildcards in registration"; + } + + ASSERT_NO_FATAL_FAILURE(CheckTabletServersPage()); + + // Restart the master, so it loses the descriptor, and ensure that the + // hearbeater thread handles re-registering. + ASSERT_OK(cluster_->mini_master()->Restart()); + + ASSERT_OK(cluster_->WaitForTabletServerCount(1)); + + // TODO: when the instance ID / sequence number stuff is implemented, + // restart the TS and ensure that it re-registers with the newer sequence + // number. +} + +// Test starting multiple tablet servers and ensuring they both register with the master. 
+TEST_F(RegistrationTest, TestMultipleTS) { + ASSERT_OK(cluster_->WaitForTabletServerCount(1)); + ASSERT_OK(cluster_->AddTabletServer()); + ASSERT_OK(cluster_->WaitForTabletServerCount(2)); +} + +// TODO: this doesn't belong under "RegistrationTest" - rename this file +// to something more appropriate - doesn't seem worth having separate +// whole test suites for registration, tablet reports, etc. +TEST_F(RegistrationTest, TestTabletReports) { + string tablet_id_1; + string tablet_id_2; + + ASSERT_OK(cluster_->WaitForTabletServerCount(1)); + + MiniTabletServer* ts = cluster_->mini_tablet_server(0); + string ts_root = cluster_->GetTabletServerFsRoot(0); + + // Add a tablet, make sure it reports itself. + CreateTabletForTesting(cluster_->mini_master(), "fake-table", schema_, &tablet_id_1); + + TabletLocationsPB locs; + ASSERT_OK(cluster_->WaitForReplicaCount(tablet_id_1, 1, &locs)); + ASSERT_EQ(1, locs.replicas_size()); + LOG(INFO) << "Tablet successfully reported on " << locs.replicas(0).ts_info().permanent_uuid(); + + // Add another tablet, make sure it is reported via incremental. + CreateTabletForTesting(cluster_->mini_master(), "fake-table2", schema_, &tablet_id_2); + ASSERT_OK(cluster_->WaitForReplicaCount(tablet_id_2, 1, &locs)); + + // Shut down the whole system, bring it back up, and make sure the tablets + // are reported. + ts->Shutdown(); + ASSERT_OK(cluster_->mini_master()->Restart()); + ASSERT_OK(ts->Start()); + + ASSERT_OK(cluster_->WaitForReplicaCount(tablet_id_1, 1, &locs)); + ASSERT_OK(cluster_->WaitForReplicaCount(tablet_id_2, 1, &locs)); + + // TODO: KUDU-870: once the master supports detecting failed/lost replicas, + // we should add a test case here which removes or corrupts metadata, restarts + // the TS, and verifies that the master notices the issue. 
+} + +} // namespace kudu diff --git a/src/kudu/integration-tests/remote_bootstrap-itest.cc b/src/kudu/integration-tests/remote_bootstrap-itest.cc new file mode 100644 index 000000000000..914e140ce558 --- /dev/null +++ b/src/kudu/integration-tests/remote_bootstrap-itest.cc @@ -0,0 +1,686 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/client/client.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/external_mini_cluster_fs_inspector.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet_metadata.h" +#include "kudu/tserver/remote_bootstrap_client.h" +#include "kudu/util/metrics.h" +#include "kudu/util/pstack_watcher.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(test_delete_leader_num_iters, 3, + "Number of iterations to run in TestDeleteLeaderDuringRemoteBootstrapStressTest."); +DEFINE_int32(test_delete_leader_min_rows_per_iter, 20, + "Number of writer threads in TestDeleteLeaderDuringRemoteBootstrapStressTest."); +DEFINE_int32(test_delete_leader_payload_bytes, 16 * 1024, + "Payload byte size in TestDeleteLeaderDuringRemoteBootstrapStressTest."); +DEFINE_int32(test_delete_leader_num_writer_threads, 1, + "Number of writer threads in TestDeleteLeaderDuringRemoteBootstrapStressTest."); + +using kudu::client::KuduClient; +using kudu::client::KuduClientBuilder; +using kudu::client::KuduSchema; +using kudu::client::KuduSchemaFromSchema; +using kudu::client::KuduTableCreator; +using kudu::client::sp::shared_ptr; +using kudu::consensus::CONSENSUS_CONFIG_COMMITTED; +using kudu::itest::TServerDetails; +using kudu::tablet::TABLET_DATA_TOMBSTONED; +using kudu::tserver::ListTabletsResponsePB; +using kudu::tserver::RemoteBootstrapClient; +using std::string; +using std::unordered_map; +using std::vector; +using strings::Substitute; + +METRIC_DECLARE_entity(server); 
+METRIC_DECLARE_histogram(handler_latency_kudu_consensus_ConsensusService_UpdateConsensus); +METRIC_DECLARE_counter(glog_info_messages); +METRIC_DECLARE_counter(glog_warning_messages); +METRIC_DECLARE_counter(glog_error_messages); + +namespace kudu { + +class RemoteBootstrapITest : public KuduTest { + public: + virtual void TearDown() OVERRIDE { + if (HasFatalFailure()) { + LOG(INFO) << "Found fatal failure"; + for (int i = 0; i < 3; i++) { + if (!cluster_->tablet_server(i)->IsProcessAlive()) { + LOG(INFO) << "Tablet server " << i << " is not running. Cannot dump its stacks."; + continue; + } + LOG(INFO) << "Attempting to dump stacks of TS " << i + << " with UUID " << cluster_->tablet_server(i)->uuid() + << " and pid " << cluster_->tablet_server(i)->pid(); + WARN_NOT_OK(PstackWatcher::DumpPidStacks(cluster_->tablet_server(i)->pid()), + "Couldn't dump stacks"); + } + } + if (cluster_) cluster_->Shutdown(); + KuduTest::TearDown(); + STLDeleteValues(&ts_map_); + } + + protected: + void StartCluster(const vector& extra_tserver_flags = vector(), + const vector& extra_master_flags = vector(), + int num_tablet_servers = 3); + + gscoped_ptr cluster_; + gscoped_ptr inspect_; + shared_ptr client_; + unordered_map ts_map_; +}; + +void RemoteBootstrapITest::StartCluster(const vector& extra_tserver_flags, + const vector& extra_master_flags, + int num_tablet_servers) { + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = num_tablet_servers; + opts.extra_tserver_flags = extra_tserver_flags; + opts.extra_tserver_flags.push_back("--never_fsync"); // fsync causes flakiness on EC2. 
+ opts.extra_master_flags = extra_master_flags; + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get())); + ASSERT_OK(itest::CreateTabletServerMap(cluster_->master_proxy().get(), + cluster_->messenger(), + &ts_map_)); + KuduClientBuilder builder; + ASSERT_OK(cluster_->CreateClient(builder, &client_)); +} + +// If a rogue (a.k.a. zombie) leader tries to remote bootstrap a tombstoned +// tablet, make sure its term isn't older than the latest term we observed. +// If it is older, make sure we reject the request, to avoid allowing old +// leaders to create a parallel universe. This is possible because config +// change could cause nodes to move around. The term check is reasonable +// because only one node can be elected leader for a given term. +// +// A leader can "go rogue" due to a VM pause, CTRL-z, partition, etc. +TEST_F(RemoteBootstrapITest, TestRejectRogueLeader) { + // This test pauses for at least 10 seconds. Only run in slow-test mode. + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test in fast-test mode."; + return; + } + + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + + const MonoDelta timeout = MonoDelta::FromSeconds(30); + const int kTsIndex = 0; // We'll test with the first TS. + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + + TestWorkload workload(cluster_.get()); + workload.Setup(); + + // Figure out the tablet id of the created tablet. + vector tablets; + ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. 
+ for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect a leader for term 1, then run some data through the cluster. + int zombie_leader_index = 1; + string zombie_leader_uuid = cluster_->tablet_server(zombie_leader_index)->uuid(); + ASSERT_OK(itest::StartElection(ts_map_[zombie_leader_uuid], tablet_id, timeout)); + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed())); + + // Come out of the blue and try to remotely bootstrap a running server while + // specifying an old term. That running server should reject the request. + // We are essentially masquerading as a rogue leader here. + Status s = itest::StartRemoteBootstrap(ts, tablet_id, zombie_leader_uuid, + HostPort(cluster_->tablet_server(1)->bound_rpc_addr()), + 0, // Say I'm from term 0. + timeout); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), "term 0 lower than last logged term 1"); + + // Now pause the actual leader so we can bring him back as a zombie later. + ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Pause()); + + // Trigger TS 2 to become leader of term 2. + int new_leader_index = 2; + string new_leader_uuid = cluster_->tablet_server(new_leader_index)->uuid(); + ASSERT_OK(itest::StartElection(ts_map_[new_leader_uuid], tablet_id, timeout)); + ASSERT_OK(itest::WaitUntilLeader(ts_map_[new_leader_uuid], tablet_id, timeout)); + + unordered_map active_ts_map = ts_map_; + ASSERT_EQ(1, active_ts_map.erase(zombie_leader_uuid)); + + // Wait for the NO_OP entry from the term 2 election to propagate to the + // remaining nodes' logs so that we are guaranteed to reject the rogue + // leader's remote bootstrap request when we bring it back online. 
+ int log_index = workload.batches_completed() + 2; // 2 terms == 2 additional NO_OP entries. + ASSERT_OK(WaitForServersToAgree(timeout, active_ts_map, tablet_id, log_index)); + // TODO: Write more rows to the new leader once KUDU-1034 is fixed. + + // Now kill the new leader and tombstone the replica on TS 0. + cluster_->tablet_server(new_leader_index)->Shutdown(); + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + + // Zombies!!! Resume the rogue zombie leader. + // He should attempt to remote bootstrap TS 0 but fail. + ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Resume()); + + // Loop for a few seconds to ensure that the tablet doesn't transition to READY. + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(5)); + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + ASSERT_OK(itest::ListTablets(ts, timeout, &tablets)); + ASSERT_EQ(1, tablets.size()); + ASSERT_EQ(TABLET_DATA_TOMBSTONED, tablets[0].tablet_status().tablet_data_state()); + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + // Force the rogue leader to step down. + // Then, send a remote bootstrap start request from a "fake" leader that + // sends an up-to-date term in the RB request but the actual term stored + // in the bootstrap source's consensus metadata would still be old. + LOG(INFO) << "Forcing rogue leader T " << tablet_id << " P " << zombie_leader_uuid + << " to step down..."; + ASSERT_OK(itest::LeaderStepDown(ts_map_[zombie_leader_uuid], tablet_id, timeout)); + ExternalTabletServer* zombie_ets = cluster_->tablet_server(zombie_leader_index); + // It's not necessarily part of the API but this could return faliure due to + // rejecting the remote. We intend to make that part async though, so ignoring + // this return value in this test. 
+ ignore_result(itest::StartRemoteBootstrap(ts, tablet_id, zombie_leader_uuid, + HostPort(zombie_ets->bound_rpc_addr()), + 2, // Say I'm from term 2. + timeout)); + + // Wait another few seconds to be sure the remote bootstrap is rejected. + deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(5)); + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + ASSERT_OK(itest::ListTablets(ts, timeout, &tablets)); + ASSERT_EQ(1, tablets.size()); + ASSERT_EQ(TABLET_DATA_TOMBSTONED, tablets[0].tablet_status().tablet_data_state()); + SleepFor(MonoDelta::FromMilliseconds(10)); + } +} + +// Start remote bootstrap session and delete the tablet in the middle. +// It should actually be possible to complete bootstrap in such a case, because +// when a remote bootstrap session is started on the "source" server, all of +// the relevant files are either read or opened, meaning that an in-progress +// remote bootstrap can complete even after a tablet is officially "deleted" on +// the source server. This is also a regression test for KUDU-1009. +TEST_F(RemoteBootstrapITest, TestDeleteTabletDuringRemoteBootstrap) { + MonoDelta timeout = MonoDelta::FromSeconds(10); + const int kTsIndex = 0; // We'll test with the first TS. + NO_FATALS(StartCluster()); + + // Populate a tablet with some data. + TestWorkload workload(cluster_.get()); + workload.Setup(); + workload.Start(); + while (workload.rows_inserted() < 1000) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + // Figure out the tablet id of the created tablet. + vector tablets; + TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Ensure all the servers agree before we proceed. 
+ workload.StopAndJoin(); + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed())); + + // Set up an FsManager to use with the RemoteBootstrapClient. + FsManagerOpts opts; + string testbase = GetTestPath("fake-ts"); + ASSERT_OK(env_->CreateDir(testbase)); + opts.wal_path = JoinPathSegments(testbase, "wals"); + opts.data_paths.push_back(JoinPathSegments(testbase, "data-0")); + gscoped_ptr fs_manager(new FsManager(env_.get(), opts)); + ASSERT_OK(fs_manager->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager->Open()); + + // Start up a RemoteBootstrapClient and open a remote bootstrap session. + gscoped_ptr rb_client( + new RemoteBootstrapClient(tablet_id, fs_manager.get(), + cluster_->messenger(), fs_manager->uuid())); + scoped_refptr meta; + ASSERT_OK(rb_client->Start(cluster_->tablet_server(kTsIndex)->uuid(), + cluster_->tablet_server(kTsIndex)->bound_rpc_hostport(), + &meta)); + + // Tombstone the tablet on the remote! + ASSERT_OK(itest::DeleteTablet(ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, timeout)); + + // Now finish bootstrapping! + tablet::TabletStatusListener listener(meta); + ASSERT_OK(rb_client->FetchAll(&listener)); + ASSERT_OK(rb_client->Finish()); + + // Run destructor, which closes the remote session. + rb_client.reset(); + SleepFor(MonoDelta::FromMilliseconds(50)); // Give a little time for a crash (KUDU-1009). + ASSERT_TRUE(cluster_->tablet_server(kTsIndex)->IsProcessAlive()); +} + +// This test ensures that a leader can remote-bootstrap a tombstoned replica +// that has a higher term recorded in the replica's consensus metadata if the +// replica's last-logged opid has the same term (or less) as the leader serving +// as the remote bootstrap source. When a tablet is tombstoned, its last-logged +// opid is stored in a field its on-disk superblock. 
+TEST_F(RemoteBootstrapITest, TestRemoteBootstrapFollowerWithHigherTerm) { + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + const int kNumTabletServers = 2; + NO_FATALS(StartCluster(ts_flags, master_flags, kNumTabletServers)); + + const MonoDelta timeout = MonoDelta::FromSeconds(30); + const int kFollowerIndex = 0; + TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; + + TestWorkload workload(cluster_.get()); + workload.set_num_replicas(2); + workload.Setup(); + + // Figure out the tablet id of the created tablet. + vector tablets; + ASSERT_OK(WaitForNumTabletsOnTS(follower_ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect a leader for term 1, then run some data through the cluster. + const int kLeaderIndex = 1; + TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; + ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); + workload.Start(); + while (workload.rows_inserted() < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + workload.StopAndJoin(); + + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed())); + + // Pause the leader and increment the term on the follower by starting an + // election on the follower. The election will fail asynchronously but we + // just wait until we see that its term has incremented. 
+ ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Pause()); + ASSERT_OK(itest::StartElection(follower_ts, tablet_id, timeout)); + int64_t term = 0; + for (int i = 0; i < 1000; i++) { + consensus::ConsensusStatePB cstate; + ASSERT_OK(itest::GetConsensusState(follower_ts, tablet_id, CONSENSUS_CONFIG_COMMITTED, + timeout, &cstate)); + term = cstate.current_term(); + if (term == 2) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_EQ(2, term); + + // Now tombstone the follower. + ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, + timeout)); + + // Restart the follower's TS so that the leader's TS won't get its queued + // vote request messages. This is a hack but seems to work. + cluster_->tablet_server(kFollowerIndex)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart()); + + // Now wake the leader. It should detect that the follower needs to be + // remotely bootstrapped and proceed to bring it back up to date. + ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Resume()); + + // Wait for the follower to come back up. + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, workload.batches_completed())); +} + +// Test that multiple concurrent remote bootstraps do not cause problems. +// This is a regression test for KUDU-951, in which concurrent sessions on +// multiple tablets between the same remote bootstrap client host and remote +// bootstrap source host could corrupt each other. 
+TEST_F(RemoteBootstrapITest, TestConcurrentRemoteBootstraps) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test in fast-test mode."; + return; + } + + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + ts_flags.push_back("--log_cache_size_limit_mb=1"); + ts_flags.push_back("--log_segment_size_mb=1"); + ts_flags.push_back("--log_async_preallocate_segments=false"); + ts_flags.push_back("--log_min_segments_to_retain=100"); + ts_flags.push_back("--flush_threshold_mb=0"); // Constantly flush. + ts_flags.push_back("--maintenance_manager_polling_interval_ms=10"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + + const MonoDelta timeout = MonoDelta::FromSeconds(60); + + // Create a table with several tablets. These will all be simultaneously + // remotely bootstrapped to a single target node from the same leader host. + const int kNumTablets = 10; + KuduSchema client_schema(KuduSchemaFromSchema(GetSimpleTestSchema())); + vector splits; + for (int i = 0; i < kNumTablets - 1; i++) { + KuduPartialRow* row = client_schema.NewRow(); + ASSERT_OK(row->SetInt32(0, numeric_limits::max() / kNumTablets * (i + 1))); + splits.push_back(row); + } + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(TestWorkload::kDefaultTableName) + .split_rows(splits) + .schema(&client_schema) + .num_replicas(3) + .Create()); + + const int kTsIndex = 0; // We'll test with the first TS. + TServerDetails* target_ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()]; + + // Figure out the tablet ids of the created tablets. 
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(target_ts, kNumTablets, timeout, &tablets));
+
+  vector<string> tablet_ids;
+  for (const ListTabletsResponsePB::StatusAndSchemaPB& t : tablets) {
+    tablet_ids.push_back(t.tablet_status().tablet_id());
+  }
+
+  // Wait until all replicas are up and running.
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    for (const string& tablet_id : tablet_ids) {
+      ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()],
+                                              tablet_id, timeout));
+    }
+  }
+
+  // Elect leaders on each tablet for term 1. All leaders will be on TS 1.
+  const int kLeaderIndex = 1;
+  const string kLeaderUuid = cluster_->tablet_server(kLeaderIndex)->uuid();
+  for (const string& tablet_id : tablet_ids) {
+    ASSERT_OK(itest::StartElection(ts_map_[kLeaderUuid], tablet_id, timeout));
+  }
+
+  TestWorkload workload(cluster_.get());
+  workload.set_write_timeout_millis(10000);
+  workload.set_timeout_allowed(true);
+  workload.set_write_batch_size(10);
+  workload.set_num_write_threads(10);
+  workload.Setup();
+  workload.Start();
+  while (workload.rows_inserted() < 20000) {
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+  workload.StopAndJoin();
+
+  for (const string& tablet_id : tablet_ids) {
+    ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1));
+  }
+
+  // Now pause the leader so we can tombstone the tablets.
+  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Pause());
+
+  for (const string& tablet_id : tablet_ids) {
+    LOG(INFO) << "Tombstoning tablet " << tablet_id << " on TS " << target_ts->uuid();
+    ASSERT_OK(itest::DeleteTablet(target_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
+                                  MonoDelta::FromSeconds(10)));
+  }
+
+  // Unpause the leader TS and wait for it to remotely bootstrap the tombstoned
+  // tablets, in parallel.
+  ASSERT_OK(cluster_->tablet_server(kLeaderIndex)->Resume());
+  for (const string& tablet_id : tablet_ids) {
+    ASSERT_OK(itest::WaitUntilTabletRunning(target_ts, tablet_id, timeout));
+  }
+
+  ClusterVerifier v(cluster_.get());
+  NO_FATALS(v.CheckCluster());
+  NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST,
+                            workload.rows_inserted()));
+}
+
+// Test that repeatedly runs a load, tombstones a follower, then tombstones the
+// leader while the follower is remotely bootstrapping. Regression test for
+// KUDU-1047.
+TEST_F(RemoteBootstrapITest, TestDeleteLeaderDuringRemoteBootstrapStressTest) {
+  // This test takes a while due to failure detection.
+  if (!AllowSlowTests()) {
+    LOG(INFO) << "Skipping test in fast-test mode.";
+    return;
+  }
+
+  const MonoDelta timeout = MonoDelta::FromSeconds(60);
+  NO_FATALS(StartCluster(vector<string>(), vector<string>(), 5));
+
+  TestWorkload workload(cluster_.get());
+  workload.set_num_replicas(5);
+  workload.set_payload_bytes(FLAGS_test_delete_leader_payload_bytes);
+  workload.set_num_write_threads(FLAGS_test_delete_leader_num_writer_threads);
+  workload.set_write_batch_size(1);
+  workload.set_write_timeout_millis(10000);
+  workload.set_timeout_allowed(true);
+  workload.set_not_found_allowed(true);
+  workload.Setup();
+
+  // Figure out the tablet id.
+  const int kTsIndex = 0;
+  TServerDetails* ts = ts_map_[cluster_->tablet_server(kTsIndex)->uuid()];
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ASSERT_OK(WaitForNumTabletsOnTS(ts, 1, timeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Wait until all replicas are up and running.
+ for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + int leader_index = -1; + int follower_index = -1; + TServerDetails* leader_ts = nullptr; + TServerDetails* follower_ts = nullptr; + + for (int i = 0; i < FLAGS_test_delete_leader_num_iters; i++) { + LOG(INFO) << "Iteration " << (i + 1); + int rows_previously_inserted = workload.rows_inserted(); + + // Find out who's leader. + ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, timeout, &leader_ts)); + leader_index = cluster_->tablet_server_index_by_uuid(leader_ts->uuid()); + + // Select an arbitrary follower. + follower_index = (leader_index + 1) % cluster_->num_tablet_servers(); + follower_ts = ts_map_[cluster_->tablet_server(follower_index)->uuid()]; + + // Spin up the workload. + workload.Start(); + while (workload.rows_inserted() < rows_previously_inserted + + FLAGS_test_delete_leader_min_rows_per_iter) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + // Tombstone the follower. + LOG(INFO) << "Tombstoning follower tablet " << tablet_id << " on TS " << follower_ts->uuid(); + ASSERT_OK(itest::DeleteTablet(follower_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, + timeout)); + + // Wait for remote bootstrap to start. + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(follower_index, tablet_id, + tablet::TABLET_DATA_COPYING, timeout)); + + // Tombstone the leader. + LOG(INFO) << "Tombstoning leader tablet " << tablet_id << " on TS " << leader_ts->uuid(); + ASSERT_OK(itest::DeleteTablet(leader_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none, + timeout)); + + // Quiesce and rebuild to full strength. This involves electing a new + // leader from the remaining three, which requires a unanimous vote, and + // that leader then remotely bootstrapping the old leader. 
+ workload.StopAndJoin(); + ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); + } + + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(workload.table_name(), ClusterVerifier::AT_LEAST, + workload.rows_inserted())); +} + +namespace { +int64_t CountUpdateConsensusCalls(ExternalTabletServer* ets, const string& tablet_id) { + int64_t ret; + CHECK_OK(ets->GetInt64Metric( + &METRIC_ENTITY_server, + "kudu.tabletserver", + &METRIC_handler_latency_kudu_consensus_ConsensusService_UpdateConsensus, + "total_count", + &ret)); + return ret; +} +int64_t CountLogMessages(ExternalTabletServer* ets) { + int64_t total = 0; + + int64_t count; + CHECK_OK(ets->GetInt64Metric( + &METRIC_ENTITY_server, + "kudu.tabletserver", + &METRIC_glog_info_messages, + "value", + &count)); + total += count; + + CHECK_OK(ets->GetInt64Metric( + &METRIC_ENTITY_server, + "kudu.tabletserver", + &METRIC_glog_warning_messages, + "value", + &count)); + total += count; + + CHECK_OK(ets->GetInt64Metric( + &METRIC_ENTITY_server, + "kudu.tabletserver", + &METRIC_glog_error_messages, + "value", + &count)); + total += count; + + return total; +} +} // anonymous namespace + +// Test that if remote bootstrap is disabled by a flag, we don't get into +// tight loops after a tablet is deleted. This is a regression test for situation +// similar to the bug described in KUDU-821: we were previously handling a missing +// tablet within consensus in such a way that we'd immediately send another RPC. 
+TEST_F(RemoteBootstrapITest, TestDisableRemoteBootstrap_NoTightLoopWhenTabletDeleted) {
+  MonoDelta timeout = MonoDelta::FromSeconds(10);
+  vector<string> ts_flags, master_flags;
+  ts_flags.push_back("--enable_leader_failure_detection=false");
+  ts_flags.push_back("--enable_remote_bootstrap=false");
+  master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false");
+  NO_FATALS(StartCluster(ts_flags, master_flags));
+
+  TestWorkload workload(cluster_.get());
+  // TODO(KUDU-1054): the client should handle retrying on different replicas
+  // if the tablet isn't found, rather than giving us this error.
+  workload.set_not_found_allowed(true);
+  workload.set_write_batch_size(1);
+  workload.Setup();
+
+  // Figure out the tablet id of the created tablet.
+  vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets;
+  ExternalTabletServer* replica_ets = cluster_->tablet_server(1);
+  TServerDetails* replica_ts = ts_map_[replica_ets->uuid()];
+  ASSERT_OK(WaitForNumTabletsOnTS(replica_ts, 1, timeout, &tablets));
+  string tablet_id = tablets[0].tablet_status().tablet_id();
+
+  // Wait until all replicas are up and running.
+  for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
+    ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()],
+                                            tablet_id, timeout));
+  }
+
+  // Elect a leader (TS 0)
+  ExternalTabletServer* leader_ts = cluster_->tablet_server(0);
+  ASSERT_OK(itest::StartElection(ts_map_[leader_ts->uuid()], tablet_id, timeout));
+
+  // Start writing, wait for some rows to be inserted.
+  workload.Start();
+  while (workload.rows_inserted() < 100) {
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+
+  // Tombstone the tablet on one of the servers (TS 1)
+  ASSERT_OK(itest::DeleteTablet(replica_ts, tablet_id, TABLET_DATA_TOMBSTONED, boost::none,
+                                timeout));
+
+  // Ensure that, if we sleep for a second while still doing writes to the leader:
+  // a) we don't spew logs on the leader side
+  // b) we don't get hit with a lot of UpdateConsensus calls on the replica.
+ int64_t num_update_rpcs_initial = CountUpdateConsensusCalls(replica_ets, tablet_id); + int64_t num_logs_initial = CountLogMessages(leader_ts); + + SleepFor(MonoDelta::FromSeconds(1)); + int64_t num_update_rpcs_after_sleep = CountUpdateConsensusCalls(replica_ets, tablet_id); + int64_t num_logs_after_sleep = CountLogMessages(leader_ts); + + // Calculate rate per second of RPCs and log messages + int64_t update_rpcs_per_second = num_update_rpcs_after_sleep - num_update_rpcs_initial; + EXPECT_LT(update_rpcs_per_second, 20); + int64_t num_logs_per_second = num_logs_after_sleep - num_logs_initial; + EXPECT_LT(num_logs_per_second, 20); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/tablet_replacement-itest.cc b/src/kudu/integration-tests/tablet_replacement-itest.cc new file mode 100644 index 000000000000..0a1d350303bb --- /dev/null +++ b/src/kudu/integration-tests/tablet_replacement-itest.cc @@ -0,0 +1,309 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include <boost/optional.hpp>
+#include <gtest/gtest.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "kudu/common/wire_protocol.h"
+#include "kudu/common/wire_protocol-test-util.h"
+#include "kudu/gutil/strings/substitute.h"
+#include "kudu/integration-tests/external_mini_cluster-itest-base.h"
+#include "kudu/integration-tests/cluster_verifier.h"
+#include "kudu/integration-tests/test_workload.h"
+
+using kudu::consensus::RaftPeerPB;
+using kudu::itest::TServerDetails;
+using kudu::tablet::TABLET_DATA_READY;
+using kudu::tablet::TABLET_DATA_TOMBSTONED;
+using kudu::tserver::ListTabletsResponsePB;
+using std::shared_ptr;
+using std::string;
+using std::unordered_map;
+using std::vector;
+using strings::Substitute;
+
+namespace kudu {
+
+class TabletReplacementITest : public ExternalMiniClusterITestBase {
+};
+
+// Test that the Master will tombstone a newly-evicted replica.
+// Then, test that the Master will NOT tombstone a newly-added replica that is
+// not part of the committed config yet (only the pending config).
+TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) {
+  MonoDelta timeout = MonoDelta::FromSeconds(30);
+  vector<string> ts_flags = { "--enable_leader_failure_detection=false" };
+  int num_tservers = 5;
+  vector<string> master_flags = { "--master_add_server_when_underreplicated=false" };
+  master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false");
+  NO_FATALS(StartCluster(ts_flags, master_flags, num_tservers));
+
+  TestWorkload workload(cluster_.get());
+  workload.set_num_replicas(num_tservers);
+  workload.Setup(); // Easy way to create a new tablet.
+
+  const int kLeaderIndex = 0;
+  TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()];
+  const int kFollowerIndex = 4;
+  TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()];
+
+  // Figure out the tablet id of the created tablet.
+ vector tablets; + ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect a leader (TS 0) + ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); + ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. + + // Remove a follower from the config. + ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, follower_ts, boost::none, timeout)); + + // Wait for the Master to tombstone the replica. + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED, + timeout)); + + if (!AllowSlowTests()) { + // The rest of this test has multi-second waits, so we do it in slow test mode. + LOG(INFO) << "Not verifying that a newly-added replica won't be tombstoned in fast-test mode"; + return; + } + + // Shut down a majority of followers (3 servers) and then try to add the + // follower back to the config. This will cause the config change to end up + // in a pending state. + unordered_map active_ts_map = ts_map_; + for (int i = 1; i <= 3; i++) { + cluster_->tablet_server(i)->Shutdown(); + ASSERT_EQ(1, active_ts_map.erase(cluster_->tablet_server(i)->uuid())); + } + // This will time out, but should take effect. + Status s = itest::AddServer(leader_ts, tablet_id, follower_ts, RaftPeerPB::VOTER, + boost::none, MonoDelta::FromSeconds(5)); + ASSERT_TRUE(s.IsTimedOut()); + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY, + timeout)); + ASSERT_OK(itest::WaitForServersToAgree(timeout, active_ts_map, tablet_id, 3)); + + // Sleep for a few more seconds and check again to ensure that the Master + // didn't end up tombstoning the replica. 
+ SleepFor(MonoDelta::FromSeconds(3)); + ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_READY)); +} + +// Ensure that the Master will tombstone a replica if it reports in with an old +// config. This tests a slightly different code path in the catalog manager +// than TestMasterTombstoneEvictedReplica does. +TEST_F(TabletReplacementITest, TestMasterTombstoneOldReplicaOnReport) { + MonoDelta timeout = MonoDelta::FromSeconds(30); + vector ts_flags = { "--enable_leader_failure_detection=false" }; + vector master_flags = { "--master_add_server_when_underreplicated=false" }; + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + + TestWorkload workload(cluster_.get()); + workload.Setup(); // Easy way to create a new tablet. + + const int kLeaderIndex = 0; + TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; + const int kFollowerIndex = 2; + TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; + + // Figure out the tablet id of the created tablet. + vector tablets; + ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect a leader (TS 0) + ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); + ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. + + // Shut down the follower to be removed, then remove it from the config. + // We will wait for the Master to be notified of the config change, then shut + // down the rest of the cluster and bring the follower back up. 
The follower + // will heartbeat to the Master and then be tombstoned. + cluster_->tablet_server(kFollowerIndex)->Shutdown(); + + // Remove the follower from the config and wait for the Master to notice the + // config change. + ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, follower_ts, boost::none, timeout)); + ASSERT_OK(itest::WaitForNumVotersInConfigOnMaster(cluster_->master_proxy(), tablet_id, 2, + timeout)); + + // Shut down the remaining tablet servers and restart the dead one. + cluster_->tablet_server(0)->Shutdown(); + cluster_->tablet_server(1)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart()); + + // Wait for the Master to tombstone the revived follower. + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, TABLET_DATA_TOMBSTONED, + timeout)); +} + +// Test that unreachable followers are evicted and replaced. +TEST_F(TabletReplacementITest, TestEvictAndReplaceDeadFollower) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test in fast-test mode."; + return; + } + + MonoDelta timeout = MonoDelta::FromSeconds(30); + vector ts_flags = { "--enable_leader_failure_detection=false", + "--follower_unavailable_considered_failed_sec=5" }; + vector master_flags = { "--catalog_manager_wait_for_new_tablets_to_elect_leader=false" }; + NO_FATALS(StartCluster(ts_flags, master_flags)); + + TestWorkload workload(cluster_.get()); + workload.Setup(); // Easy way to create a new tablet. + + const int kLeaderIndex = 0; + TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; + const int kFollowerIndex = 2; + + // Figure out the tablet id of the created tablet. + vector tablets; + ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. 
+ for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect a leader (TS 0) + ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); + ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. + + // Shut down the follower to be removed. It should be evicted. + cluster_->tablet_server(kFollowerIndex)->Shutdown(); + + // With a RemoveServer and AddServer, the opid_index of the committed config will be 3. + ASSERT_OK(itest::WaitUntilCommittedConfigOpIdIndexIs(3, leader_ts, tablet_id, timeout)); + ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart()); +} + +// Regression test for KUDU-1233. This test creates a situation in which tablet +// bootstrap will attempt to replay committed (and applied) config change +// operations. This is achieved by delaying application of a write at the +// tablet level that precedes the config change operations in the WAL, then +// initiating a remote bootstrap to a follower. The follower will not have the +// COMMIT for the write operation, so will ignore COMMIT messages for the +// applied config change operations. At startup time, the newly +// remotely-bootstrapped tablet should detect that these config change +// operations have already been applied and skip them. +TEST_F(TabletReplacementITest, TestRemoteBoostrapWithPendingConfigChangeCommits) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test in fast-test mode."; + return; + } + + MonoDelta timeout = MonoDelta::FromSeconds(30); + vector ts_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + vector master_flags; + // We will manage doing the AddServer() manually, in order to make this test + // more deterministic. 
+ master_flags.push_back("--master_add_server_when_underreplicated=false"); + master_flags.push_back("--master_tombstone_evicted_tablet_replicas=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + + TestWorkload workload(cluster_.get()); + workload.Setup(); // Convenient way to create a table. + + const int kLeaderIndex = 0; + TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; + const int kFollowerIndex = 2; + TServerDetails* ts_to_remove = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; + + // Wait for tablet creation and then identify the tablet id. + vector tablets; + ASSERT_OK(itest::WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); + string tablet_id = tablets[0].tablet_status().tablet_id(); + + // Wait until all replicas are up and running. + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + // Elect a leader (TS 0) + ASSERT_OK(itest::StartElection(leader_ts, tablet_id, timeout)); + ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. + + // Write a single row. + ASSERT_OK(WriteSimpleTestRow(leader_ts, tablet_id, RowOperationsPB::INSERT, 0, 0, "", timeout)); + + // Delay tablet applies in order to delay COMMIT messages to trigger KUDU-1233. + // Then insert another row. + ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(leader_ts->uuid()), + "tablet_inject_latency_on_apply_write_txn_ms", "5000")); + + // Kick off an async insert, which will be delayed for 5 seconds. This is + // normally enough time to evict a replica, tombstone it, add it back, and + // remotely bootstrap it when the log is only a few entries. 
+ tserver::WriteRequestPB req; + tserver::WriteResponsePB resp; + CountDownLatch latch(1); + rpc::RpcController rpc; + rpc.set_timeout(timeout); + req.set_tablet_id(tablet_id); + Schema schema = GetSimpleTestSchema(); + ASSERT_OK(SchemaToPB(schema, req.mutable_schema())); + AddTestRowToPB(RowOperationsPB::INSERT, schema, 1, 1, "", req.mutable_row_operations()); + leader_ts->tserver_proxy->WriteAsync(req, &resp, &rpc, + boost::bind(&CountDownLatch::CountDown, &latch)); + + // Wait for the replicate to show up (this doesn't wait for COMMIT messages). + ASSERT_OK(itest::WaitForServersToAgree(timeout, ts_map_, tablet_id, 3)); + + // Manually evict the server from the cluster, tombstone the replica, then + // add the replica back to the cluster. Without the fix for KUDU-1233, this + // will cause the replica to fail to start up. + ASSERT_OK(itest::RemoveServer(leader_ts, tablet_id, ts_to_remove, boost::none, timeout)); + ASSERT_OK(itest::DeleteTablet(ts_to_remove, tablet_id, TABLET_DATA_TOMBSTONED, + boost::none, timeout)); + ASSERT_OK(itest::AddServer(leader_ts, tablet_id, ts_to_remove, RaftPeerPB::VOTER, + boost::none, timeout)); + ASSERT_OK(itest::WaitUntilTabletRunning(ts_to_remove, tablet_id, timeout)); + + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(workload.table_name(), + ClusterVerifier::EXACTLY, 2)); + + latch.Wait(); // Avoid use-after-free on the response from the delayed RPC callback. +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/test_workload.cc b/src/kudu/integration-tests/test_workload.cc new file mode 100644 index 000000000000..aa0588c1336d --- /dev/null +++ b/src/kudu/integration-tests/test_workload.cc @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +#include "kudu/client/client.h" +#include "kudu/client/client-test-util.h" +#include "kudu/client/schema-internal.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/util/env.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/random.h" +#include "kudu/util/thread.h" + +namespace kudu { + +using client::FromInternalCompressionType; +using client::FromInternalDataType; +using client::FromInternalEncodingType; +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema;; +using client::KuduInsert; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduSchemaFromSchema; +using client::KuduSession; +using client::KuduTable; +using client::KuduTableCreator; +using client::KuduUpdate; +using client::sp::shared_ptr; + +const char* const TestWorkload::kDefaultTableName = "test-workload"; + +TestWorkload::TestWorkload(ExternalMiniCluster* cluster) + : cluster_(cluster), + payload_bytes_(11), + num_write_threads_(4), + write_batch_size_(50), + write_timeout_millis_(20000), + timeout_allowed_(false), + not_found_allowed_(false), + pathological_one_row_enabled_(false), + num_replicas_(3), + 
num_tablets_(1), + table_name_(kDefaultTableName), + start_latch_(0), + should_run_(false), + rows_inserted_(0), + batches_completed_(0) { +} + +TestWorkload::~TestWorkload() { + StopAndJoin(); +} + +void TestWorkload::WriteThread() { + Random r(Env::Default()->gettid()); + + shared_ptr table; + // Loop trying to open up the table. In some tests we set up very + // low RPC timeouts to test those behaviors, so this might fail and + // need retrying. + while (should_run_.Load()) { + Status s = client_->OpenTable(table_name_, &table); + if (s.ok()) { + break; + } + if (timeout_allowed_ && s.IsTimedOut()) { + SleepFor(MonoDelta::FromMilliseconds(50)); + continue; + } + CHECK_OK(s); + } + + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(write_timeout_millis_); + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + // Wait for all of the workload threads to be ready to go. This maximizes the chance + // that they all send a flood of requests at exactly the same time. + // + // This also minimizes the chance that we see failures to call OpenTable() if + // a late-starting thread overlaps with the flood of outbound traffic from the + // ones that are already writing data. + start_latch_.CountDown(); + start_latch_.Wait(); + + while (should_run_.Load()) { + for (int i = 0; i < write_batch_size_; i++) { + if (pathological_one_row_enabled_) { + gscoped_ptr update(table->NewUpdate()); + KuduPartialRow* row = update->mutable_row(); + CHECK_OK(row->SetInt32(0, 0)); + CHECK_OK(row->SetInt32(1, r.Next())); + CHECK_OK(session->Apply(update.release())); + } else { + gscoped_ptr insert(table->NewInsert()); + KuduPartialRow* row = insert->mutable_row(); + CHECK_OK(row->SetInt32(0, r.Next())); + CHECK_OK(row->SetInt32(1, r.Next())); + string test_payload("hello world"); + if (payload_bytes_ != 11) { + // We fill with zeros if you change the default. 
+ test_payload.assign(payload_bytes_, '0'); + } + CHECK_OK(row->SetStringCopy(2, test_payload)); + CHECK_OK(session->Apply(insert.release())); + } + } + + int inserted = write_batch_size_; + + Status s = session->Flush(); + + if (PREDICT_FALSE(!s.ok())) { + std::vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + CHECK(!overflow); + for (const client::KuduError* e : errors) { + if (timeout_allowed_ && e->status().IsTimedOut()) { + continue; + } + + if (not_found_allowed_ && e->status().IsNotFound()) { + continue; + } + // We don't handle write idempotency yet. (i.e making sure that when a leader fails + // writes to it that were eventually committed by the new leader but un-ackd to the + // client are not retried), so some errors are expected. + // It's OK as long as the errors are Status::AlreadyPresent(); + CHECK(e->status().IsAlreadyPresent()) << "Unexpected error: " << e->status().ToString(); + } + inserted -= errors.size(); + } + + rows_inserted_.IncrementBy(inserted); + if (inserted > 0) { + batches_completed_.Increment(); + } + } +} + +void TestWorkload::Setup() { + CHECK_OK(cluster_->CreateClient(client_builder_, &client_)); + + bool table_exists; + + // Retry KuduClient::TableExists() until we make that call retry reliably. + // See KUDU-1074. 
+  MonoTime deadline(MonoTime::Now(MonoTime::FINE));
+  deadline.AddDelta(MonoDelta::FromSeconds(10));
+  Status s;
+  while (true) {
+    s = client_->TableExists(table_name_, &table_exists);
+    if (s.ok() || deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break;
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+  CHECK_OK(s);
+
+  if (!table_exists) {
+    KuduSchema client_schema(KuduSchemaFromSchema(GetSimpleTestSchema()));
+
+    vector<const KuduPartialRow*> splits;
+    for (int i = 1; i < num_tablets_; i++) {
+      KuduPartialRow* r = client_schema.NewRow();
+      CHECK_OK(r->SetInt32("key", MathLimits<int32_t>::kMax / num_tablets_ * i));
+      splits.push_back(r);
+    }
+
+    gscoped_ptr<KuduTableCreator> table_creator(client_->NewTableCreator());
+    CHECK_OK(table_creator->table_name(table_name_)
+             .schema(&client_schema)
+             .num_replicas(num_replicas_)
+             .split_rows(splits)
+             // NOTE: this is quite high as a timeout, but the default (5 sec) does not
+             // seem to be high enough in some cases (see KUDU-550). We should remove
+             // this once that ticket is addressed.
+             .timeout(MonoDelta::FromSeconds(20))
+             .Create());
+  } else {
+    LOG(INFO) << "TestWorkload: Skipping table creation because table "
+              << table_name_ << " already exists";
+  }
+
+
+  if (pathological_one_row_enabled_) {
+    shared_ptr<KuduSession> session = client_->NewSession();
+    session->SetTimeoutMillis(20000);
+    CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH));
+    shared_ptr<KuduTable> table;
+    CHECK_OK(client_->OpenTable(table_name_, &table));
+    gscoped_ptr<KuduInsert> insert(table->NewInsert());
+    KuduPartialRow* row = insert->mutable_row();
+    CHECK_OK(row->SetInt32(0, 0));
+    CHECK_OK(row->SetInt32(1, 0));
+    CHECK_OK(row->SetStringCopy(2, "hello world"));
+    CHECK_OK(session->Apply(insert.release()));
+    CHECK_OK(session->Flush());
+  }
+}
+
+void TestWorkload::Start() {
+  CHECK(!should_run_.Load()) << "Already started";
+  should_run_.Store(true);
+  start_latch_.Reset(num_write_threads_);
+  for (int i = 0; i < num_write_threads_; i++) {
+    scoped_refptr<kudu::Thread> new_thread;
+    CHECK_OK(kudu::Thread::Create("test", strings::Substitute("test-writer-$0", i),
+                                  &TestWorkload::WriteThread, this,
+                                  &new_thread));
+    threads_.push_back(new_thread);
+  }
+}
+
+void TestWorkload::StopAndJoin() {
+  should_run_.Store(false);
+  start_latch_.Reset(0);
+  for (scoped_refptr<kudu::Thread> thr : threads_) {
+    CHECK_OK(ThreadJoiner(thr.get()).Join());
+  }
+  threads_.clear();
+}
+
+} // namespace kudu
diff --git a/src/kudu/integration-tests/test_workload.h b/src/kudu/integration-tests/test_workload.h
new file mode 100644
index 000000000000..4d106edd8397
--- /dev/null
+++ b/src/kudu/integration-tests/test_workload.h
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_INTEGRATION_TESTS_TEST_WORKLOAD_H +#define KUDU_INTEGRATION_TESTS_TEST_WORKLOAD_H + +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/atomic.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/monotime.h" + +namespace kudu { + +class ExternalMiniCluster; +class Thread; + +// Utility class for generating a workload against a test cluster. +// +// The actual data inserted is random, and thus can't be verified for +// integrity. However, this is still useful in conjunction with ClusterVerifier +// to verify that replicas do not diverge. +class TestWorkload { + public: + static const char* const kDefaultTableName; + + explicit TestWorkload(ExternalMiniCluster* cluster); + ~TestWorkload(); + + void set_payload_bytes(int n) { + payload_bytes_ = n; + } + + void set_num_write_threads(int n) { + num_write_threads_ = n; + } + + void set_write_batch_size(int s) { + write_batch_size_ = s; + } + + void set_client_default_rpc_timeout_millis(int t) { + client_builder_.default_rpc_timeout(MonoDelta::FromMilliseconds(t)); + } + + void set_write_timeout_millis(int t) { + write_timeout_millis_ = t; + } + + // Set whether to fail if we see a TimedOut() error inserting a row. + // By default, this triggers a CHECK failure. + void set_timeout_allowed(bool allowed) { + timeout_allowed_ = allowed; + } + + // Set whether to fail if we see a NotFound() error inserting a row. 
+ // This sort of error is triggered if the table is deleted while the workload + // is running. + // By default, this triggers a CHECK failure. + void set_not_found_allowed(bool allowed) { + not_found_allowed_ = allowed; + } + + void set_num_replicas(int r) { + num_replicas_ = r; + } + + // Set the number of tablets for the table created by this workload. + // The split points are evenly distributed through positive int32s. + void set_num_tablets(int tablets) { + CHECK_GT(tablets, 1); + num_tablets_ = tablets; + } + + void set_table_name(const std::string& table_name) { + table_name_ = table_name; + } + + const std::string& table_name() const { + return table_name_; + } + + void set_pathological_one_row_enabled(bool enabled) { + pathological_one_row_enabled_ = enabled; + } + + // Sets up the internal client and creates the table which will be used for + // writing, if it doesn't already exist. + void Setup(); + + // Start the write workload. + void Start(); + + // Stop the writers and wait for them to exit. + void StopAndJoin(); + + // Return the number of rows inserted so far. This may be called either + // during or after the write workload. + int64_t rows_inserted() const { + return rows_inserted_.Load(); + } + + // Return the number of batches in which we have successfully inserted at + // least one row. 
+ int64_t batches_completed() const { + return batches_completed_.Load(); + } + + private: + void WriteThread(); + + ExternalMiniCluster* cluster_; + client::KuduClientBuilder client_builder_; + client::sp::shared_ptr client_; + + int payload_bytes_; + int num_write_threads_; + int write_batch_size_; + int write_timeout_millis_; + bool timeout_allowed_; + bool not_found_allowed_; + bool pathological_one_row_enabled_; + + int num_replicas_; + int num_tablets_; + std::string table_name_; + + CountDownLatch start_latch_; + AtomicBool should_run_; + AtomicInt rows_inserted_; + AtomicInt batches_completed_; + + std::vector > threads_; + + DISALLOW_COPY_AND_ASSIGN(TestWorkload); +}; + +} // namespace kudu +#endif /* KUDU_INTEGRATION_TESTS_TEST_WORKLOAD_H */ diff --git a/src/kudu/integration-tests/ts_itest-base.h b/src/kudu/integration-tests/ts_itest-base.h new file mode 100644 index 000000000000..ff99c578cefd --- /dev/null +++ b/src/kudu/integration-tests/ts_itest-base.h @@ -0,0 +1,464 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_INTEGRATION_TESTS_ITEST_UTIL_H_ +#define KUDU_INTEGRATION_TESTS_ITEST_UTIL_H_ + +#include +#include +#include +#include + +#include "kudu/client/client-test-util.h" +#include "kudu/client/schema-internal.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/external_mini_cluster_fs_inspector.h" +#include "kudu/master/master.proxy.h" +#include "kudu/tserver/tablet_server-test-base.h" +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +DECLARE_int32(consensus_rpc_timeout_ms); + +DEFINE_string(ts_flags, "", "Flags to pass through to tablet servers"); +DEFINE_string(master_flags, "", "Flags to pass through to masters"); + +DEFINE_int32(num_tablet_servers, 3, "Number of tablet servers to start"); +DEFINE_int32(num_replicas, 3, "Number of replicas per tablet server"); + +#define ASSERT_ALL_REPLICAS_AGREE(count) \ + NO_FATALS(AssertAllReplicasAgree(count)) + +namespace kudu { +namespace tserver { + +using client::KuduSchemaFromSchema; +using consensus::OpId; +using consensus::RaftPeerPB; +using itest::GetReplicaStatusAndCheckIfLeader; +using itest::TabletReplicaMap; +using itest::TabletServerMap; +using itest::TServerDetails; +using master::GetTableLocationsRequestPB; +using master::GetTableLocationsResponsePB; +using master::TabletLocationsPB; +using rpc::RpcController; + +static const int kMaxRetries = 20; + +// A base for tablet server integration tests. 
+class TabletServerIntegrationTestBase : public TabletServerTestBase { + public: + + TabletServerIntegrationTestBase() : random_(SeedRandom()) {} + + void SetUp() OVERRIDE { + TabletServerTestBase::SetUp(); + } + + void AddExtraFlags(const std::string& flags_str, std::vector* flags) { + if (flags_str.empty()) { + return; + } + std::vector split_flags = strings::Split(flags_str, " "); + for (const std::string& flag : split_flags) { + flags->push_back(flag); + } + } + + void CreateCluster(const std::string& data_root_path, + const std::vector& non_default_ts_flags, + const std::vector& non_default_master_flags) { + + LOG(INFO) << "Starting cluster with:"; + LOG(INFO) << "--------------"; + LOG(INFO) << FLAGS_num_tablet_servers << " tablet servers"; + LOG(INFO) << FLAGS_num_replicas << " replicas per TS"; + LOG(INFO) << "--------------"; + + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = FLAGS_num_tablet_servers; + opts.data_root = GetTestPath(data_root_path); + + // If the caller passed no flags use the default ones, where we stress consensus by setting + // low timeouts and frequent cache misses. 
+ if (non_default_ts_flags.empty()) { + opts.extra_tserver_flags.push_back("--log_cache_size_limit_mb=10"); + opts.extra_tserver_flags.push_back(strings::Substitute("--consensus_rpc_timeout_ms=$0", + FLAGS_consensus_rpc_timeout_ms)); + } else { + for (const std::string& flag : non_default_ts_flags) { + opts.extra_tserver_flags.push_back(flag); + } + } + for (const std::string& flag : non_default_master_flags) { + opts.extra_master_flags.push_back(flag); + } + + AddExtraFlags(FLAGS_ts_flags, &opts.extra_tserver_flags); + AddExtraFlags(FLAGS_master_flags, &opts.extra_master_flags); + + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); + inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get())); + CreateTSProxies(); + } + + // Creates TSServerDetails instance for each TabletServer and stores them + // in 'tablet_servers_'. + void CreateTSProxies() { + CHECK(tablet_servers_.empty()); + CHECK_OK(itest::CreateTabletServerMap(cluster_->master_proxy().get(), + client_messenger_, + &tablet_servers_)); + } + + // Waits that all replicas for a all tablets of 'kTableId' table are online + // and creates the tablet_replicas_ map. 
+ void WaitForReplicasAndUpdateLocations() { + int num_retries = 0; + + bool replicas_missing = true; + do { + std::unordered_multimap tablet_replicas; + GetTableLocationsRequestPB req; + GetTableLocationsResponsePB resp; + RpcController controller; + req.mutable_table()->set_table_name(kTableId); + controller.set_timeout(MonoDelta::FromSeconds(1)); + CHECK_OK(cluster_->master_proxy()->GetTableLocations(req, &resp, &controller)); + CHECK_OK(controller.status()); + CHECK(!resp.has_error()) << "Response had an error: " << resp.error().ShortDebugString(); + + for (const master::TabletLocationsPB& location : resp.tablet_locations()) { + for (const master::TabletLocationsPB_ReplicaPB& replica : location.replicas()) { + TServerDetails* server = FindOrDie(tablet_servers_, replica.ts_info().permanent_uuid()); + tablet_replicas.insert(pair(location.tablet_id(), server)); + } + + if (tablet_replicas.count(location.tablet_id()) < FLAGS_num_replicas) { + LOG(WARNING)<< "Couldn't find the leader and/or replicas. Location: " + << location.ShortDebugString(); + replicas_missing = true; + SleepFor(MonoDelta::FromSeconds(1)); + num_retries++; + break; + } + + replicas_missing = false; + } + if (!replicas_missing) { + tablet_replicas_ = tablet_replicas; + } + } while (replicas_missing && num_retries < kMaxRetries); + } + + // Returns the last committed leader of the consensus configuration. Tries to get it from master + // but then actually tries to the get the committed consensus configuration to make sure. + TServerDetails* GetLeaderReplicaOrNull(const std::string& tablet_id) { + std::string leader_uuid; + Status master_found_leader_result = GetTabletLeaderUUIDFromMaster(tablet_id, &leader_uuid); + + // See if the master is up to date. I.e. if it does report a leader and if the + // replica it reports as leader is still alive and (at least thinks) its still + // the leader. 
+ TServerDetails* leader; + if (master_found_leader_result.ok()) { + leader = GetReplicaWithUuidOrNull(tablet_id, leader_uuid); + if (leader && GetReplicaStatusAndCheckIfLeader(leader, tablet_id, + MonoDelta::FromMilliseconds(100)).ok()) { + return leader; + } + } + + // The replica we got from the master (if any) is either dead or not the leader. + // Find the actual leader. + pair range = + tablet_replicas_.equal_range(tablet_id); + std::vector replicas_copy; + for (;range.first != range.second; ++range.first) { + replicas_copy.push_back((*range.first).second); + } + + std::random_shuffle(replicas_copy.begin(), replicas_copy.end()); + for (TServerDetails* replica : replicas_copy) { + if (GetReplicaStatusAndCheckIfLeader(replica, tablet_id, + MonoDelta::FromMilliseconds(100)).ok()) { + return replica; + } + } + return NULL; + } + + Status GetLeaderReplicaWithRetries(const std::string& tablet_id, + TServerDetails** leader, + int max_attempts = 100) { + int attempts = 0; + while (attempts < max_attempts) { + *leader = GetLeaderReplicaOrNull(tablet_id); + if (*leader) { + return Status::OK(); + } + attempts++; + SleepFor(MonoDelta::FromMilliseconds(100 * attempts)); + } + return Status::NotFound("Leader replica not found"); + } + + Status GetTabletLeaderUUIDFromMaster(const std::string& tablet_id, std::string* leader_uuid) { + GetTableLocationsRequestPB req; + GetTableLocationsResponsePB resp; + RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds(100)); + req.mutable_table()->set_table_name(kTableId); + + RETURN_NOT_OK(cluster_->master_proxy()->GetTableLocations(req, &resp, &controller)); + for (const TabletLocationsPB& loc : resp.tablet_locations()) { + if (loc.tablet_id() == tablet_id) { + for (const TabletLocationsPB::ReplicaPB& replica : loc.replicas()) { + if (replica.role() == RaftPeerPB::LEADER) { + *leader_uuid = replica.ts_info().permanent_uuid(); + return Status::OK(); + } + } + } + } + return Status::NotFound("Unable to find 
leader for tablet", tablet_id); + } + + TServerDetails* GetReplicaWithUuidOrNull(const std::string& tablet_id, + const std::string& uuid) { + pair range = + tablet_replicas_.equal_range(tablet_id); + for (;range.first != range.second; ++range.first) { + if ((*range.first).second->instance_id.permanent_uuid() == uuid) { + return (*range.first).second; + } + } + return NULL; + } + + // Gets the the locations of the consensus configuration and waits until all replicas + // are available for all tablets. + void WaitForTSAndReplicas() { + int num_retries = 0; + // make sure the replicas are up and find the leader + while (true) { + if (num_retries >= kMaxRetries) { + FAIL() << " Reached max. retries while looking up the config."; + } + + Status status = cluster_->WaitForTabletServerCount(FLAGS_num_tablet_servers, + MonoDelta::FromSeconds(5)); + if (status.IsTimedOut()) { + LOG(WARNING)<< "Timeout waiting for all replicas to be online, retrying..."; + num_retries++; + continue; + } + break; + } + WaitForReplicasAndUpdateLocations(); + } + + // Removes a set of servers from the replicas_ list. + // Handy for controlling who to validate against after killing servers. 
+ void PruneFromReplicas(const unordered_set& uuids) { + auto iter = tablet_replicas_.begin(); + while (iter != tablet_replicas_.end()) { + if (uuids.count((*iter).second->instance_id.permanent_uuid()) != 0) { + iter = tablet_replicas_.erase(iter); + continue; + } + ++iter; + } + + for (const std::string& uuid : uuids) { + delete EraseKeyReturnValuePtr(&tablet_servers_, uuid); + } + } + + void GetOnlyLiveFollowerReplicas(const std::string& tablet_id, + std::vector* followers) { + followers->clear(); + TServerDetails* leader; + CHECK_OK(GetLeaderReplicaWithRetries(tablet_id, &leader)); + + std::vector replicas; + pair range = + tablet_replicas_.equal_range(tablet_id); + for (;range.first != range.second; ++range.first) { + replicas.push_back((*range.first).second); + } + + for (TServerDetails* replica : replicas) { + if (leader != NULL && + replica->instance_id.permanent_uuid() == leader->instance_id.permanent_uuid()) { + continue; + } + Status s = GetReplicaStatusAndCheckIfLeader(replica, tablet_id, + MonoDelta::FromMilliseconds(100)); + if (s.IsIllegalState()) { + followers->push_back(replica); + } + } + } + + // Return the index within 'replicas' for the replica which is farthest ahead. 
+ int64_t GetFurthestAheadReplicaIdx(const std::string& tablet_id, + const std::vector& replicas) { + std::vector op_ids; + CHECK_OK(GetLastOpIdForEachReplica(tablet_id, replicas, consensus::RECEIVED_OPID, + MonoDelta::FromSeconds(10), &op_ids)); + + int64 max_index = 0; + int max_replica_index = -1; + for (int i = 0; i < op_ids.size(); i++) { + if (op_ids[i].index() > max_index) { + max_index = op_ids[i].index(); + max_replica_index = i; + } + } + + CHECK_NE(max_replica_index, -1); + + return max_replica_index; + } + + Status ShutdownServerWithUUID(const std::string& uuid) { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ExternalTabletServer* ts = cluster_->tablet_server(i); + if (ts->instance_id().permanent_uuid() == uuid) { + ts->Shutdown(); + return Status::OK(); + } + } + return Status::NotFound("Unable to find server with UUID", uuid); + } + + Status RestartServerWithUUID(const std::string& uuid) { + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ExternalTabletServer* ts = cluster_->tablet_server(i); + if (ts->instance_id().permanent_uuid() == uuid) { + ts->Shutdown(); + RETURN_NOT_OK(CheckTabletServersAreAlive(tablet_servers_.size()-1)); + RETURN_NOT_OK(ts->Restart()); + RETURN_NOT_OK(CheckTabletServersAreAlive(tablet_servers_.size())); + return Status::OK(); + } + } + return Status::NotFound("Unable to find server with UUID", uuid); + } + + // Since we're fault-tolerant we might mask when a tablet server is + // dead. This returns Status::IllegalState() if fewer than 'num_tablet_servers' + // are alive. + Status CheckTabletServersAreAlive(int num_tablet_servers) { + int live_count = 0; + std::string error = strings::Substitute("Fewer than $0 TabletServers were alive. 
Dead TSs: ", + num_tablet_servers); + RpcController controller; + for (const TabletServerMap::value_type& entry : tablet_servers_) { + controller.Reset(); + controller.set_timeout(MonoDelta::FromSeconds(10)); + PingRequestPB req; + PingResponsePB resp; + Status s = entry.second->tserver_proxy->Ping(req, &resp, &controller); + if (!s.ok()) { + error += "\n" + entry.second->ToString() + " (" + s.ToString() + ")"; + continue; + } + live_count++; + } + if (live_count < num_tablet_servers) { + return Status::IllegalState(error); + } + return Status::OK(); + } + + virtual void TearDown() OVERRIDE { + if (cluster_) { + cluster_->Shutdown(); + } + STLDeleteValues(&tablet_servers_); + } + + void CreateClient(client::sp::shared_ptr* client) { + // Connect to the cluster. + ASSERT_OK(client::KuduClientBuilder() + .add_master_server_addr(cluster_->master()->bound_rpc_addr().ToString()) + .Build(client)); + } + + // Create a table with a single tablet, with 'num_replicas'. + void CreateTable() { + // The tests here make extensive use of server schemas, but we need + // a client schema to create the table. + client::KuduSchema client_schema(KuduSchemaFromSchema(schema_)); + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(kTableId) + .schema(&client_schema) + .num_replicas(FLAGS_num_replicas) + // NOTE: this is quite high as a timeout, but the default (5 sec) does not + // seem to be high enough in some cases (see KUDU-550). We should remove + // this once that ticket is addressed. + .timeout(MonoDelta::FromSeconds(20)) + .Create()); + ASSERT_OK(client_->OpenTable(kTableId, &table_)); + } + + // Starts an external cluster with a single tablet and a number of replicas equal + // to 'FLAGS_num_replicas'. The caller can pass 'ts_flags' to specify non-default + // flags to pass to the tablet servers. 
+ void BuildAndStart(const std::vector& ts_flags = std::vector(), + const std::vector& master_flags = std::vector()) { + CreateCluster("raft_consensus-itest-cluster", ts_flags, master_flags); + NO_FATALS(CreateClient(&client_)); + NO_FATALS(CreateTable()); + WaitForTSAndReplicas(); + CHECK_GT(tablet_replicas_.size(), 0); + tablet_id_ = (*tablet_replicas_.begin()).first; + } + + void AssertAllReplicasAgree(int expected_result_count) { + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(kTableId, ClusterVerifier::EXACTLY, expected_result_count)); + } + + protected: + gscoped_ptr cluster_; + gscoped_ptr inspect_; + + // Maps server uuid to TServerDetails + TabletServerMap tablet_servers_; + // Maps tablet to all replicas. + TabletReplicaMap tablet_replicas_; + + client::sp::shared_ptr client_; + client::sp::shared_ptr table_; + std::string tablet_id_; + + ThreadSafeRandom random_; +}; + +} // namespace tserver +} // namespace kudu + +#endif /* SRC_KUDU_INTEGRATION_TESTS_ITEST_UTIL_H_ */ diff --git a/src/kudu/integration-tests/ts_recovery-itest.cc b/src/kudu/integration-tests/ts_recovery-itest.cc new file mode 100644 index 000000000000..6818dc91dacb --- /dev/null +++ b/src/kudu/integration-tests/ts_recovery-itest.cc @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/integration-tests/cluster_verifier.h" +#include "kudu/integration-tests/external_mini_cluster.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/util/test_util.h" + +#include + +using std::string; + +namespace kudu { + +class TsRecoveryITest : public KuduTest { + public: + virtual void TearDown() OVERRIDE { + if (cluster_) cluster_->Shutdown(); + KuduTest::TearDown(); + } + + protected: + void StartCluster(const vector& extra_tserver_flags = vector(), + int num_tablet_servers = 1); + + gscoped_ptr cluster_; +}; + +void TsRecoveryITest::StartCluster(const vector& extra_tserver_flags, + int num_tablet_servers) { + ExternalMiniClusterOptions opts; + opts.num_tablet_servers = num_tablet_servers; + opts.extra_tserver_flags = extra_tserver_flags; + cluster_.reset(new ExternalMiniCluster(opts)); + ASSERT_OK(cluster_->Start()); +} + +// Test crashing a server just before appending a COMMIT message. +// We then restart the server and ensure that all rows successfully +// inserted before the crash are recovered. +TEST_F(TsRecoveryITest, TestRestartWithOrphanedReplicates) { + NO_FATALS(StartCluster()); + cluster_->SetFlag(cluster_->tablet_server(0), + "fault_crash_before_append_commit", "0.05"); + + TestWorkload work(cluster_.get()); + work.set_num_replicas(1); + work.set_num_write_threads(4); + work.set_write_timeout_millis(100); + work.set_timeout_allowed(true); + work.Setup(); + work.Start(); + + // Wait for the process to crash due to the injected fault. + while (cluster_->tablet_server(0)->IsProcessAlive()) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + // Stop the writers. + work.StopAndJoin(); + + // Restart the server, and it should recover. 
+ cluster_->tablet_server(0)->Shutdown(); + ASSERT_OK(cluster_->tablet_server(0)->Restart()); + + + // TODO(KUDU-796): after a restart, we may have to replay some + // orphaned replicates from the log. However, we currently + // allow reading while those are being replayed, which means we + // can "go back in time" briefly. So, we have some retries here. + // When KUDU-796 is fixed, remove the retries. + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckRowCountWithRetries(work.table_name(), + ClusterVerifier::AT_LEAST, + work.rows_inserted(), + MonoDelta::FromSeconds(20))); +} + +// Test that we replay from the recovery directory, if it exists. +TEST_F(TsRecoveryITest, TestCrashDuringLogReplay) { + NO_FATALS(StartCluster({ "--fault_crash_during_log_replay=0.05" })); + + TestWorkload work(cluster_.get()); + work.set_num_replicas(1); + work.set_num_write_threads(4); + work.set_write_batch_size(1); + work.set_write_timeout_millis(100); + work.set_timeout_allowed(true); + work.Setup(); + work.Start(); + while (work.rows_inserted() < 200) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + work.StopAndJoin(); + + // Now restart the server, which will result in log replay, which will crash + // mid-replay with very high probability since we wrote at least 200 log + // entries and we're injecting a fault 5% of the time. + cluster_->tablet_server(0)->Shutdown(); + + // Restart might crash very quickly and actually return a bad status, so we + // ignore the result. + ignore_result(cluster_->tablet_server(0)->Restart()); + + // Wait for the process to crash during log replay. + for (int i = 0; i < 3000 && cluster_->tablet_server(0)->IsProcessAlive(); i++) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_FALSE(cluster_->tablet_server(0)->IsProcessAlive()) << "TS didn't crash!"; + + // Now remove the crash flag, so the next replay will complete, and restart + // the server once more. 
+ cluster_->tablet_server(0)->Shutdown(); + cluster_->tablet_server(0)->mutable_flags()->clear(); + ASSERT_OK(cluster_->tablet_server(0)->Restart()); + + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckRowCountWithRetries(work.table_name(), + ClusterVerifier::AT_LEAST, + work.rows_inserted(), + MonoDelta::FromSeconds(30))); +} + +} // namespace kudu diff --git a/src/kudu/integration-tests/ts_tablet_manager-itest.cc b/src/kudu/integration-tests/ts_tablet_manager-itest.cc new file mode 100644 index 000000000000..d8499eddda28 --- /dev/null +++ b/src/kudu/integration-tests/ts_tablet_manager-itest.cc @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/mini_master.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/test_util.h" + +DECLARE_bool(enable_leader_failure_detection); +DECLARE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader); +DEFINE_int32(num_election_test_loops, 3, + "Number of random EmulateElection() loops to execute in " + "TestReportNewLeaderOnLeaderChange"); + +namespace kudu { +namespace tserver { + +using client::KuduClient; +using client::KuduSchema; +using client::KuduTable; +using client::KuduTableCreator; +using consensus::GetConsensusRole; +using consensus::RaftPeerPB; +using itest::SimpleIntKeyKuduSchema; +using master::MasterServiceProxy; +using master::ReportedTabletPB; +using master::TabletReportPB; +using rpc::Messenger; +using rpc::MessengerBuilder; +using strings::Substitute; +using tablet::TabletPeer; +using tserver::MiniTabletServer; +using tserver::TSTabletManager; + +static const char* const kTableName = "test-table"; +static const int kNumReplicas = 2; + +class TsTabletManagerITest : public KuduTest { + public: + TsTabletManagerITest() + : schema_(SimpleIntKeyKuduSchema()) { + } + virtual void SetUp() OVERRIDE; + virtual void TearDown() OVERRIDE; + + protected: 
+ const KuduSchema schema_; + + gscoped_ptr cluster_; + client::sp::shared_ptr client_; + std::shared_ptr client_messenger_; +}; + +void TsTabletManagerITest::SetUp() { + KuduTest::SetUp(); + + MessengerBuilder bld("client"); + ASSERT_OK(bld.Build(&client_messenger_)); + + MiniClusterOptions opts; + opts.num_tablet_servers = kNumReplicas; + cluster_.reset(new MiniCluster(env_.get(), opts)); + ASSERT_OK(cluster_->Start()); + ASSERT_OK(cluster_->CreateClient(nullptr, &client_)); +} + +void TsTabletManagerITest::TearDown() { + cluster_->Shutdown(); + KuduTest::TearDown(); +} + +// Test that when the leader changes, the tablet manager gets notified and +// includes that information in the next tablet report. +TEST_F(TsTabletManagerITest, TestReportNewLeaderOnLeaderChange) { + // We need to control elections precisely for this test since we're using + // EmulateElection() with a distributed consensus configuration. + FLAGS_enable_leader_failure_detection = false; + FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader = false; + + // Run a few more iters in slow-test mode. + OverrideFlagForSlowTests("num_election_test_loops", "10"); + + // Create the table. + client::sp::shared_ptr table; + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(kNumReplicas) + .Create()); + ASSERT_OK(client_->OpenTable(kTableName, &table)); + + // Build a TServerDetails map so we can check for convergence. + gscoped_ptr master_proxy( + new MasterServiceProxy(client_messenger_, cluster_->mini_master()->bound_rpc_addr())); + + itest::TabletServerMap ts_map; + ASSERT_OK(CreateTabletServerMap(master_proxy.get(), client_messenger_, &ts_map)); + ValueDeleter deleter(&ts_map); + + // Collect the tablet peers so we get direct access to consensus. 
+ vector > tablet_peers; + for (int replica = 0; replica < kNumReplicas; replica++) { + MiniTabletServer* ts = cluster_->mini_tablet_server(replica); + ts->FailHeartbeats(); // Stop heartbeating we don't race against the Master. + vector > cur_ts_tablet_peers; + // The replicas may not have been created yet, so loop until we see them. + while (true) { + ts->server()->tablet_manager()->GetTabletPeers(&cur_ts_tablet_peers); + if (!cur_ts_tablet_peers.empty()) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_EQ(1, cur_ts_tablet_peers.size()); // Each TS should only have 1 tablet. + ASSERT_OK(cur_ts_tablet_peers[0]->WaitUntilConsensusRunning(MonoDelta::FromSeconds(10))); + tablet_peers.push_back(cur_ts_tablet_peers[0]); + } + + // Loop and cause elections and term changes from different servers. + // TSTabletManager should acknowledge the role changes via tablet reports. + for (int i = 0; i < FLAGS_num_election_test_loops; i++) { + SCOPED_TRACE(Substitute("Iter: $0", i)); + int new_leader_idx = rand() % 2; + LOG(INFO) << "Electing peer " << new_leader_idx << "..."; + consensus::Consensus* con = CHECK_NOTNULL(tablet_peers[new_leader_idx]->consensus()); + ASSERT_OK(con->EmulateElection()); + LOG(INFO) << "Waiting for servers to agree..."; + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(5), + ts_map, tablet_peers[0]->tablet_id(), i + 1)); + + // Now check that the tablet report reports the correct role for both servers. + for (int replica = 0; replica < kNumReplicas; replica++) { + // The MarkDirty() callback is on an async thread so it might take the + // follower a few milliseconds to execute it. Wait for that to happen. + TSTabletManager* tablet_manager = + cluster_->mini_tablet_server(replica)->server()->tablet_manager(); + for (int retry = 0; retry <= 12; retry++) { + if (tablet_manager->GetNumDirtyTabletsForTests() > 0) break; + SleepFor(MonoDelta::FromMilliseconds(1 << retry)); + } + + // Ensure that our tablet reports are consistent. 
+ TabletReportPB report; + tablet_manager->GenerateIncrementalTabletReport(&report); + ASSERT_EQ(1, report.updated_tablets_size()) << "Wrong report size:\n" << report.DebugString(); + ReportedTabletPB reported_tablet = report.updated_tablets(0); + ASSERT_TRUE(reported_tablet.has_committed_consensus_state()); + + string uuid = tablet_peers[replica]->permanent_uuid(); + RaftPeerPB::Role role = GetConsensusRole(uuid, reported_tablet.committed_consensus_state()); + if (replica == new_leader_idx) { + ASSERT_EQ(RaftPeerPB::LEADER, role) + << "Tablet report: " << report.ShortDebugString(); + } else { + ASSERT_EQ(RaftPeerPB::FOLLOWER, role) + << "Tablet report: " << report.ShortDebugString(); + } + } + } +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/integration-tests/update_scan_delta_compact-test.cc b/src/kudu/integration-tests/update_scan_delta_compact-test.cc new file mode 100644 index 000000000000..a9aa81af442b --- /dev/null +++ b/src/kudu/integration-tests/update_scan_delta_compact-test.cc @@ -0,0 +1,313 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/client/callbacks.h" +#include "kudu/client/client.h" +#include "kudu/client/row_result.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/monotime.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DECLARE_int32(flush_threshold_mb); +DECLARE_int32(log_segment_size_mb); +DECLARE_int32(maintenance_manager_polling_interval_ms); +DEFINE_int32(mbs_for_flushes_and_rolls, 1, "How many MBs are needed to flush and roll"); +DEFINE_int32(row_count, 2000, "How many rows will be used in this test for the base data"); +DEFINE_int32(seconds_to_run, 4, + "How long this test runs for, after inserting the base data, in seconds"); + +namespace kudu { +namespace tablet { + +using client::KuduInsert; +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduRowResult; +using client::KuduScanner; +using client::KuduSchema; +using client::KuduSchemaBuilder; +using client::KuduSession; +using client::KuduStatusCallback; +using client::KuduStatusMemberCallback; +using client::KuduTable; +using client::KuduTableCreator; +using client::KuduUpdate; +using client::sp::shared_ptr; + +// This integration test tries to trigger all the update-related bits while also serving as a +// foundation for benchmarking. It first inserts 'row_count' rows and then starts two threads, +// one that continuously updates all the rows sequentially and one that scans them all, until +// it's been running for 'seconds_to_run'. It doesn't test for correctness, unless something +// FATALs. 
+class UpdateScanDeltaCompactionTest : public KuduTest { + protected: + UpdateScanDeltaCompactionTest() { + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT64)->NotNull()->PrimaryKey(); + b.AddColumn("string")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("int64")->Type(KuduColumnSchema::INT64)->NotNull(); + CHECK_OK(b.Build(&schema_)); + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + } + + void CreateTable() { + ASSERT_NO_FATAL_FAILURE(InitCluster()); + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(1) + .Create()); + ASSERT_OK(client_->OpenTable(kTableName, &table_)); + } + + virtual void TearDown() OVERRIDE { + if (cluster_) { + cluster_->Shutdown(); + } + KuduTest::TearDown(); + } + + // Inserts row_count rows sequentially. + void InsertBaseData(); + + // Starts the update and scan threads then stops them after seconds_to_run. + void RunThreads(); + + private: + enum { + kKeyCol, + kStrCol, + kInt64Col + }; + static const char* const kTableName; + + void InitCluster() { + // Start mini-cluster with 1 tserver. + cluster_.reset(new MiniCluster(env_.get(), MiniClusterOptions())); + ASSERT_OK(cluster_->Start()); + KuduClientBuilder client_builder; + client_builder.add_master_server_addr( + cluster_->mini_master()->bound_rpc_addr_str()); + ASSERT_OK(client_builder.Build(&client_)); + } + + shared_ptr CreateSession() { + shared_ptr session = client_->NewSession(); + session->SetTimeoutMillis(5000); + CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + return session; + } + + // Continuously updates the existing data until 'stop_latch' drops to 0. + void UpdateRows(CountDownLatch* stop_latch); + + // Continuously scans the data until 'stop_latch' drops to 0. + void ScanRows(CountDownLatch* stop_latch) const; + + // Continuously fetch various web pages on the TS. 
+ void CurlWebPages(CountDownLatch* stop_latch) const; + + // Sets the passed values on the row. + // TODO randomize the string column. + void MakeRow(int64_t key, int64_t val, KuduPartialRow* row) const; + + // If 'key' is a multiple of kSessionBatchSize, it uses 'last_s' to wait for the previous batch + // to finish and then flushes the current one. + Status WaitForLastBatchAndFlush(int64_t key, + Synchronizer* last_s, + KuduStatusCallback* last_s_cb, + shared_ptr session); + + KuduSchema schema_; + std::shared_ptr cluster_; + shared_ptr table_; + shared_ptr client_; +}; + +const char* const UpdateScanDeltaCompactionTest::kTableName = "update-scan-delta-compact-tbl"; +const int kSessionBatchSize = 1000; + +TEST_F(UpdateScanDeltaCompactionTest, TestAll) { + OverrideFlagForSlowTests("seconds_to_run", "100"); + OverrideFlagForSlowTests("row_count", "1000000"); + OverrideFlagForSlowTests("mbs_for_flushes_and_rolls", "8"); + // Setting this high enough that we see the effects of flushes and compactions. + OverrideFlagForSlowTests("maintenance_manager_polling_interval_ms", "2000"); + FLAGS_flush_threshold_mb = FLAGS_mbs_for_flushes_and_rolls; + FLAGS_log_segment_size_mb = FLAGS_mbs_for_flushes_and_rolls; + if (!AllowSlowTests()) { + // Make it run more often since it's not a long test. 
+ FLAGS_maintenance_manager_polling_interval_ms = 50; + } + + ASSERT_NO_FATAL_FAILURE(CreateTable()); + ASSERT_NO_FATAL_FAILURE(InsertBaseData()); + ASSERT_NO_FATAL_FAILURE(RunThreads()); +} + +void UpdateScanDeltaCompactionTest::InsertBaseData() { + shared_ptr session = CreateSession(); + Synchronizer last_s; + KuduStatusMemberCallback last_s_cb(&last_s, + &Synchronizer::StatusCB); + last_s_cb.Run(Status::OK()); + + LOG_TIMING(INFO, "Insert") { + for (int64_t key = 0; key < FLAGS_row_count; key++) { + gscoped_ptr insert(table_->NewInsert()); + MakeRow(key, 0, insert->mutable_row()); + ASSERT_OK(session->Apply(insert.release())); + ASSERT_OK(WaitForLastBatchAndFlush(key, &last_s, &last_s_cb, session)); + } + ASSERT_OK(WaitForLastBatchAndFlush(kSessionBatchSize, &last_s, &last_s_cb, session)); + ASSERT_OK(last_s.Wait()); + } +} + +void UpdateScanDeltaCompactionTest::RunThreads() { + vector > threads; + + CountDownLatch stop_latch(1); + + { + scoped_refptr t; + ASSERT_OK(Thread::Create(CURRENT_TEST_NAME(), + StrCat(CURRENT_TEST_CASE_NAME(), "-update"), + &UpdateScanDeltaCompactionTest::UpdateRows, this, + &stop_latch, &t)); + threads.push_back(t); + } + + { + scoped_refptr t; + ASSERT_OK(Thread::Create(CURRENT_TEST_NAME(), + StrCat(CURRENT_TEST_CASE_NAME(), "-scan"), + &UpdateScanDeltaCompactionTest::ScanRows, this, + &stop_latch, &t)); + threads.push_back(t); + } + + { + scoped_refptr t; + ASSERT_OK(Thread::Create(CURRENT_TEST_NAME(), + StrCat(CURRENT_TEST_CASE_NAME(), "-curl"), + &UpdateScanDeltaCompactionTest::CurlWebPages, this, + &stop_latch, &t)); + threads.push_back(t); + } + + SleepFor(MonoDelta::FromSeconds(FLAGS_seconds_to_run * 1.0)); + stop_latch.CountDown(); + + for (const scoped_refptr& thread : threads) { + ASSERT_OK(ThreadJoiner(thread.get()) + .warn_every_ms(500) + .Join()); + } +} + +void UpdateScanDeltaCompactionTest::UpdateRows(CountDownLatch* stop_latch) { + shared_ptr session = CreateSession(); + Synchronizer last_s; + KuduStatusMemberCallback 
last_s_cb(&last_s, + &Synchronizer::StatusCB); + + for (int64_t iteration = 1; stop_latch->count() > 0; iteration++) { + last_s_cb.Run(Status::OK()); + LOG_TIMING(INFO, "Update") { + for (int64_t key = 0; key < FLAGS_row_count && stop_latch->count() > 0; key++) { + gscoped_ptr update(table_->NewUpdate()); + MakeRow(key, iteration, update->mutable_row()); + CHECK_OK(session->Apply(update.release())); + CHECK_OK(WaitForLastBatchAndFlush(key, &last_s, &last_s_cb, session)); + } + CHECK_OK(WaitForLastBatchAndFlush(kSessionBatchSize, &last_s, &last_s_cb, session)); + CHECK_OK(last_s.Wait()); + } + } +} + +void UpdateScanDeltaCompactionTest::ScanRows(CountDownLatch* stop_latch) const { + while (stop_latch->count() > 0) { + KuduScanner scanner(table_.get()); + LOG_TIMING(INFO, "Scan") { + CHECK_OK(scanner.Open()); + vector rows; + while (scanner.HasMoreRows()) { + CHECK_OK(scanner.NextBatch(&rows)); + } + } + } +} + +void UpdateScanDeltaCompactionTest::CurlWebPages(CountDownLatch* stop_latch) const { + vector urls; + string base_url = cluster_->mini_tablet_server(0)->bound_http_addr().ToString(); + urls.push_back(base_url + "/scans"); + urls.push_back(base_url + "/transactions"); + + EasyCurl curl; + faststring dst; + while (stop_latch->count() > 0) { + for (const string& url : urls) { + VLOG(1) << "Curling URL " << url; + Status status = curl.FetchURL(url, &dst); + if (status.ok()) { + CHECK_GT(dst.length(), 0); + } + } + } +} + +void UpdateScanDeltaCompactionTest::MakeRow(int64_t key, + int64_t val, + KuduPartialRow* row) const { + CHECK_OK(row->SetInt64(kKeyCol, key)); + CHECK_OK(row->SetStringCopy(kStrCol, "TODO random string")); + CHECK_OK(row->SetInt64(kInt64Col, val)); +} + +Status UpdateScanDeltaCompactionTest::WaitForLastBatchAndFlush(int64_t key, + Synchronizer* last_s, + KuduStatusCallback* last_s_cb, + shared_ptr session) { + if (key % kSessionBatchSize == 0) { + RETURN_NOT_OK(last_s->Wait()); + last_s->Reset(); + session->FlushAsync(last_s_cb); + } + return 
Status::OK(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/master/CMakeLists.txt b/src/kudu/master/CMakeLists.txt new file mode 100644 index 000000000000..695188bfd501 --- /dev/null +++ b/src/kudu/master/CMakeLists.txt @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +KRPC_GENERATE( + MASTER_KRPC_SRCS MASTER_KRPC_HDRS MASTER_KRPC_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES master.proto) +set(MASTER_KRPC_LIBS + consensus_metadata_proto + krpc + protobuf + rpc_header_proto + tablet_proto + wire_protocol_proto) +ADD_EXPORTABLE_LIBRARY(master_proto + SRCS ${MASTER_KRPC_SRCS} + DEPS ${MASTER_KRPC_LIBS} + NONLINK_DEPS ${MASTER_KRPC_TGTS}) + +set(MASTER_SRCS + catalog_manager.cc + master.cc + master_options.cc + master_service.cc + master-path-handlers.cc + mini_master.cc + sys_catalog.cc + ts_descriptor.cc + ts_manager.cc +) + +add_library(master ${MASTER_SRCS}) +target_link_libraries(master + kudu_common + tablet + server_common + server_process + krpc + gutil + kudu_util + tserver + tserver_service_proto + master_proto + rpc_header_proto) + +set(MASTER_RPC_SRCS + master_rpc.cc) +set(MASTER_RPC_LIBS + kudu_common + krpc + gutil + kudu_util + master_proto + rpc_header_proto) +ADD_EXPORTABLE_LIBRARY(master_rpc + SRCS ${MASTER_RPC_SRCS} + DEPS ${MASTER_RPC_LIBS}) + +# Tests +set(KUDU_TEST_LINK_LIBS master master_proto kudu_client ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(catalog_manager-test) +ADD_KUDU_TEST(master-test RESOURCE_LOCK "master-web-port") +ADD_KUDU_TEST(sys_catalog-test RESOURCE_LOCK "master-web-port") + +# Actual master executable +add_executable(kudu-master master_main.cc) +target_link_libraries(kudu-master + master + ${KUDU_BASE_LIBS}) diff --git a/src/kudu/master/README b/src/kudu/master/README new file mode 100644 index 000000000000..26023be01350 --- /dev/null +++ b/src/kudu/master/README @@ -0,0 +1,238 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + +============================================================ +The Catalog Manager and System Tables +============================================================ + +The Catalog Manager keeps track of the Kudu tables and tablets defined by the +user in the cluster. + +All the table and tablet information is stored in-memory in copy-on-write +TableInfo / TabletInfo objects, as well as on-disk, in the "sys.catalog" +Kudu system table hosted only on the Masters. This system table is loaded +into memory on Master startup. At the time of this writing, the "sys.catalog" +table consists of only a single tablet in order to provide strong consistency +for the metadata under RAFT replication (as currently, each tablet has its own +log). + +To add or modify a table or tablet, the Master writes, but does not yet commit +the changes to memory, then writes and flushes the system table to disk, and +then makes the changes visible in-memory (commits them) if the disk write (and, +in a distributed master setup, config-based replication) is successful. This +allows readers to access the in-memory state in a consistent +way, even while a write is in-progress. + +This design prevents having to go through the whole scan path to service tablet +location calls, which would be more expensive, and allows for easily keeping +"soft" state in the Master for every Table and Tablet. + +The catalog manager maintains 3 hash-maps for looking up info in the sys table: +- [Table Id] -> TableInfo +- [Table Name] -> TableInfo +- [Tablet Id] -> TabletInfo + +The TableInfo has a map [tablet-start-key] -> TabletInfo used to provide +the tablets locations to the user based on a key-range request. + + +Table Creation +-------------- + +The below corresponds to the code in CatalogManager::CreateTable(). + +1. Client -> Master request: Create "table X" with N tablets and schema S. +2. 
Master: CatalogManager::CreateTable(): + a. Validate user request (e.g. ensure a valid schema). + b. Verify that the table name is not already taken. + TODO: What about old, deleted tables? + c. Add (in-memory) the new TableInfo (in "preparing" state). + d. Add (in-memory) the TabletInfo based on the user-provided pre-split-keys + field (in "preparing" state). + e. Write the tablets info to "sys.catalog" + (The Master process is killed if the write fails). + - Master begins writing to disk. + - Note: If the Master crashes or restarts here or at any time previous to + this point, the table will not exist when the Master comes back online. + f. Write the table info to "sys.catalog" with the "running" state + (The Master process is killed if the write fails). + - Master completes writing to disk. + - After this point, the table will exist and be re-created as necessary + at startup time after a crash or process restart. + g. Commit the "running" state to memory, which allows clients to see the table. +3. Master -> Client response: The table has been created with some ID, i.e. "xyz" + (or, in case something went wrong, an error message). + +After this point in time, the table is reported as created, which means that if +the cluster is shut down, when it starts back up the table will still exist. +However, the tablets are not yet created (see Table Assignment, below). + + +Table Deletion +-------------- + +When the user sends a DeleteTable request for table T, table T is marked as +deleted by writing a "deleted" flag in the state field in T's record in the +"sys.catalog" table, table T is removed from the in-memory "table names" +map on the Master, and the table is marked as being "deleted" in the +in-memory TableInfo / TabletInfo "state" field on the Master. +TODO: Could this race with table deletion / creation?? 
+ +At this point, the table is no longer externally visible to clients via Master +RPC calls, but the tablet configs that make up the table may still be up and +running. New clients trying to open the table will get a NotFound error, while +clients that already have the tablet locations cached may still be able to +read and write to the tablet configs, as long as the corresponding tablet +servers are online and their respective tablets have not yet been deleted. +In some ways, this is similar to the design of FS unlink. + +The Master will asynchronously send a DeleteTablet RPC request to each tablet +(one RPC request per tablet server in the config, for each tablet), and the +tablets will therefore be deleted in parallel in some unspecified order. If the +Master or tablet server goes offline before a particular DeleteTablet operation +successfully completes, the Master will send a new DeleteTablet request at the +time that the next heartbeat is received from the tablet that is to be deleted. + +A "Cleaner" process will be responsible for removing the data from deleted tables +and tablets in the future, both on-disk and cached in memory (TODO). + + +Table Assignment (Tablet Creation) +---------------------------------- + +Once a table is created, the tablets must be created on a set of replicas. In +order to do that, the master has to select the replicas and associate them to +the tablet. + +For each tablet not created we select a set of replicas and a leader and we +send the "create tablet" request. On the next TS-heartbeat from the leader we +can mark the tablet as "running", if reported. If we don't receive a "tablet +created" report after ASSIGNMENT-TIMEOUT-MSEC we replace the tablet with a new +one, following these same steps for the new tablet. + +The Assignment is processed by the "CatalogManagerBgTasks" thread.
This thread +is waiting for an event that can be: + +- Create Table (need to process the new tablet for assignment) +- Assignment Timeout (some tablet request timeout expired, replace it) + +This is the current control flow: + +- CatalogManagerBgTasks thread: + 1. Process Pending Assignments: + - For each tablet pending assignment: + - If tablet creation was already requested: + - If we did not receive a response yet, and the configurable + assignment timeout period has passed, mark the tablet as "replaced": + 1. Delete the tablet if it ever reports in. + 2. Create a new tablet in its place, add that tablet to the + "create table" list. + - Else, if the tablet is new (just created by CreateTable in "preparing" state): + - Add it to the "create tablet" list. + - Now, for each tablet in the "create tablet" list: + - Select a set of tablet servers to host the tablet config. + - Select a tablet server to be the initial config leader. + [BEGIN-WRITE-TO-DISK] + - Flush the "to create" to sys.catalog with state "creating" + [If something fails here, the "Process Pending Assignments" will + reprocess these tablets. As nothing was done, running tables will be replaced] + [END-WRITE-TO-DISK] + - For each tablet server in the config: + - Send an async CreateTablet() RPC request to the TS. + On TS-heartbeat, the Master will receive the notification of "tablet creation". + - Commit any changes in state to memory. + At this point the tablets marked as "running" are visible to the user. + + 2. Cleanup deleted tables & tablets (FIXME: is this implemented?): + - Remove the tables/tablets with "deleted" state from "sys.catalog" + - Remove the tablets with "deleted" state from the in-memory map + - Remove the tables with "deleted" state from the in-memory map + +When the TS receives a CreateTablet() RPC, it will attempt to create the tablet +replica locally. Once it is successful, it will be added to the next tablet +report. 
When the tablet is reported, the master-side ProcessTabletReport() +function is called. + +If we find at this point that the reported tablet is in "creating" state, and +the TS reporting the tablet is the leader selected during the assignment +process (see CatalogManagerBgTasksThread above), the tablet will be marked as +running and committed to disk, completing the assignment process. + + +Alter Table +----------- + +When the user sends an alter request, which may contain changes to the schema, +table name or attributes, the Master will send a set of AlterTable() RPCs to +each TS handling the set of tablets currently running. The Master will keep +retrying in case of error. + +If a TS is down or goes down during an AlterTable request, on restart it will +report the schema version that it is using, and if it is out of date, the Master +will send an AlterTable request to that TS at that time. + +When the Master first comes online after being restarted, a full tablet report +will be requested from each TS, and the tablet schema version sent on the next +heartbeat will be used to determine if a given TS needs an AlterTable() call. + +============================================================ +Heartbeats and TSManager +============================================================ + +Heartbeats are sent by the TS to the master. Per master.proto, a +heartbeat contains: + +1. Node instance information: permanent uuid, node sequence number +(which is incremented each time the node is started). + +2. (Optional) registration. Sent either at TS startup or if the master +responded to a previous heartbeat with "needs register" (see +'Handling heartbeats' below for an explanation of when this response +will be sent). + +3. (Optional) tablet report. Sent either when tablet information has +changed, or if the master responded to a previous heartbeat with +"needs a full tablet report" (see "Handling heartbeats" below for an +explanation of when this response will be sent). 
+ +Handling heartbeats +------------------- + +Upon receiving a heartbeat from a TS, the master will: + +1) Check if the heartbeat has registration info. If so, register +the TS instance with TSManager (see "TSManager" below for more +details). + +2) Retrieve a TSDescriptor from TSManager. If the TSDescriptor +is not found, reply to the TS with "need re-register" field set to +true, and return early. + +3) Update the heartbeat time (see "TSManager" below) in the +registration object. + +4) If the heartbeat contains a tablet report, the Catalog Manager will +process the report and update its cache as well as the system tables +(see "Catalog Manager" above). Otherwise, the master will respond to +the TS requesting a full tablet report. + +5) Send a success response to the TS. + +TSManager +--------- + +TSManager provides in-memory storage for information sent by the +tablet server to the master (tablet servers that have been heard from, +heartbeats, tablet reports, etc...). The information is stored in a +map, where the key is the permanent uuid of a tablet server and the +value is (a pointer to) a TSDescriptor. diff --git a/src/kudu/master/catalog_manager-test.cc b/src/kudu/master/catalog_manager-test.cc new file mode 100644 index 000000000000..bd48309f5aec --- /dev/null +++ b/src/kudu/master/catalog_manager-test.cc @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace master { + +using strings::Substitute; + +// Test of the tablet assignment algo for splits done at table creation time. +// This tests that when we define a split, the tablet lands on the expected +// side of the split, i.e. it's a closed interval on the start key and an open +// interval on the end key (non-inclusive). +TEST(TableInfoTest, TestAssignmentRanges) { + const string table_id = CURRENT_TEST_NAME(); + scoped_refptr table(new TableInfo(table_id)); + vector > tablets; + + // Define & create the splits. + const int kNumSplits = 3; + string split_keys[kNumSplits] = { "a", "b", "c" }; // The keys we split on. + for (int i = 0; i <= kNumSplits; i++) { + const string& start_key = (i == 0) ? "" : split_keys[i - 1]; + const string& end_key = (i == kNumSplits) ? 
"" : split_keys[i]; + string tablet_id = Substitute("tablet-$0-$1", start_key, end_key); + + TabletInfo* tablet = new TabletInfo(table, tablet_id); + TabletMetadataLock meta_lock(tablet, TabletMetadataLock::WRITE); + + PartitionPB* partition = meta_lock.mutable_data()->pb.mutable_partition(); + partition->set_partition_key_start(start_key); + partition->set_partition_key_end(end_key); + meta_lock.mutable_data()->pb.set_state(SysTabletsEntryPB::RUNNING); + + table->AddTablet(tablet); + meta_lock.Commit(); + tablets.push_back(make_scoped_refptr(tablet)); + } + + // Ensure they give us what we are expecting. + for (int i = 0; i <= kNumSplits; i++) { + // Calculate the tablet id and start key. + const string& start_key = (i == 0) ? "" : split_keys[i - 1]; + const string& end_key = (i == kNumSplits) ? "" : split_keys[i]; + string tablet_id = Substitute("tablet-$0-$1", start_key, end_key); + + // Query using the start key. + GetTableLocationsRequestPB req; + req.set_max_returned_locations(1); + req.mutable_table()->mutable_table_name()->assign(table_id); + req.mutable_partition_key_start()->assign(start_key); + vector > tablets_in_range; + table->GetTabletsInRange(&req, &tablets_in_range); + + // Only one tablet should own this key. + ASSERT_EQ(1, tablets_in_range.size()); + // The tablet with range start key matching 'start_key' should be the owner. + ASSERT_EQ(tablet_id, (*tablets_in_range.begin())->tablet_id()); + LOG(INFO) << "Key " << start_key << " found in tablet " << tablet_id; + } + + for (const scoped_refptr& tablet : tablets) { + ASSERT_TRUE(table->RemoveTablet( + tablet->metadata().state().pb.partition().partition_key_start())); + } +} + +TEST(TestTSDescriptor, TestReplicaCreationsDecay) { + TSDescriptor ts("test"); + ASSERT_EQ(0, ts.RecentReplicaCreations()); + ts.IncrementRecentReplicaCreations(); + + // The load should start at close to 1.0. 
+ double val_a = ts.RecentReplicaCreations(); + ASSERT_NEAR(1.0, val_a, 0.05); + + // After 10ms it should have dropped a bit, but still be close to 1.0. + SleepFor(MonoDelta::FromMilliseconds(10)); + double val_b = ts.RecentReplicaCreations(); + ASSERT_LT(val_b, val_a); + ASSERT_NEAR(0.99, val_a, 0.05); + + if (AllowSlowTests()) { + // After 10 seconds, we should have dropped to 0.5^(10/60) = 0.891 + SleepFor(MonoDelta::FromSeconds(10)); + ASSERT_NEAR(0.891, ts.RecentReplicaCreations(), 0.05); + } +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc new file mode 100644 index 000000000000..9a9344ab4434 --- /dev/null +++ b/src/kudu/master/catalog_manager.cc @@ -0,0 +1,3338 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// The catalog manager handles the current list of tables +// and tablets in the cluster, as well as their current locations. 
+// Since most operations in the master go through these data +// structures, locking is carefully managed here to prevent unnecessary +// contention and deadlocks: +// +// - each structure has an internal spinlock used for operations that +// are purely in-memory (eg the current status of replicas) +// - data that is persisted on disk is stored in separate PersistentTable(t)Info +// structs. These are managed using copy-on-write so that writers may block +// writing them back to disk while not impacting concurrent readers. +// +// Usage rules: +// - You may obtain READ locks in any order. READ locks should never block, +// since they only conflict with COMMIT which is a purely in-memory operation. +// Thus they are deadlock-free. +// - If you need a WRITE lock on both a table and one or more of its tablets, +// acquire the lock on the table first. This strict ordering prevents deadlocks. + +#include "kudu/master/catalog_manager.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "kudu/cfile/type_encodings.h" +#include "kudu/common/partial_row.h" +#include "kudu/common/partition.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/gutil/walltime.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/sys_catalog.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/master/ts_manager.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc_context.h" +#include 
"kudu/tserver/tserver_admin.proxy.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/monotime.h" +#include "kudu/util/random_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/thread_restrictions.h" +#include "kudu/util/trace.h" + +DEFINE_int32(master_ts_rpc_timeout_ms, 30 * 1000, // 30 sec + "Timeout used for the Master->TS async rpc calls."); +TAG_FLAG(master_ts_rpc_timeout_ms, advanced); + +DEFINE_int32(tablet_creation_timeout_ms, 30 * 1000, // 30 sec + "Timeout used by the master when attempting to create tablet " + "replicas during table creation."); +TAG_FLAG(tablet_creation_timeout_ms, advanced); + +DEFINE_bool(catalog_manager_wait_for_new_tablets_to_elect_leader, true, + "Whether the catalog manager should wait for a newly created tablet to " + "elect a leader before considering it successfully created. " + "This is disabled in some tests where we explicitly manage leader " + "election."); +TAG_FLAG(catalog_manager_wait_for_new_tablets_to_elect_leader, hidden); + +DEFINE_int32(unresponsive_ts_rpc_timeout_ms, 60 * 60 * 1000, // 1 hour + "After this amount of time, the master will stop attempting to contact " + "a tablet server in order to perform operations such as deleting a tablet."); +TAG_FLAG(unresponsive_ts_rpc_timeout_ms, advanced); + +DEFINE_int32(default_num_replicas, 3, + "Default number of replicas for tables that do not have the num_replicas set."); +TAG_FLAG(default_num_replicas, advanced); + +DEFINE_int32(catalog_manager_bg_task_wait_ms, 1000, + "Amount of time the catalog manager background task thread waits " + "between runs"); +TAG_FLAG(catalog_manager_bg_task_wait_ms, hidden); + +DEFINE_int32(max_create_tablets_per_ts, 20, + "The number of tablets per TS that can be requested for a new table."); +TAG_FLAG(max_create_tablets_per_ts, advanced); + 
+DEFINE_bool(catalog_manager_allow_local_consensus, true, + "Use local consensus when config size == 1"); +TAG_FLAG(catalog_manager_allow_local_consensus, hidden); + +DEFINE_int32(master_failover_catchup_timeout_ms, 30 * 1000, // 30 sec + "Amount of time to give a newly-elected leader master to load" + " the previous master's metadata and become active. If this time" + " is exceeded, the node crashes."); +TAG_FLAG(master_failover_catchup_timeout_ms, advanced); +TAG_FLAG(master_failover_catchup_timeout_ms, experimental); + +DEFINE_bool(master_tombstone_evicted_tablet_replicas, true, + "Whether the Master should tombstone (delete) tablet replicas that " + "are no longer part of the latest reported raft config."); +TAG_FLAG(master_tombstone_evicted_tablet_replicas, hidden); + +DEFINE_bool(master_add_server_when_underreplicated, true, + "Whether the master should attempt to add a new server to a tablet " + "config when it detects that the tablet is under-replicated."); +TAG_FLAG(master_add_server_when_underreplicated, hidden); + +DEFINE_bool(catalog_manager_check_ts_count_for_create_table, true, + "Whether the master should ensure that there are enough live tablet " + "servers to satisfy the provided replication count before allowing " + "a table to be created."); +TAG_FLAG(catalog_manager_check_ts_count_for_create_table, hidden); + +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { +namespace master { + +using base::subtle::NoBarrier_Load; +using base::subtle::NoBarrier_CompareAndSwap; +using cfile::TypeEncodingInfo; +using consensus::kMinimumTerm; +using consensus::CONSENSUS_CONFIG_COMMITTED; +using consensus::Consensus; +using consensus::ConsensusServiceProxy; +using consensus::ConsensusStatePB; +using consensus::GetConsensusRole; +using consensus::OpId; +using consensus::RaftPeerPB; +using consensus::StartRemoteBootstrapRequestPB; +using rpc::RpcContext; +using strings::Substitute; +using tablet::TABLET_DATA_DELETED; +using 
tablet::TABLET_DATA_TOMBSTONED; +using tablet::TabletDataState; +using tablet::TabletPeer; +using tablet::TabletStatePB; +using tserver::TabletServerErrorPB; + +//////////////////////////////////////////////////////////// +// Table Loader +//////////////////////////////////////////////////////////// + +class TableLoader : public TableVisitor { + public: + explicit TableLoader(CatalogManager *catalog_manager) + : catalog_manager_(catalog_manager) { + } + + virtual Status VisitTable(const std::string& table_id, + const SysTablesEntryPB& metadata) OVERRIDE { + CHECK(!ContainsKey(catalog_manager_->table_ids_map_, table_id)) + << "Table already exists: " << table_id; + + // Setup the table info + TableInfo *table = new TableInfo(table_id); + TableMetadataLock l(table, TableMetadataLock::WRITE); + l.mutable_data()->pb.CopyFrom(metadata); + + // Add the tablet to the IDs map and to the name map (if the table is not deleted) + catalog_manager_->table_ids_map_[table->id()] = table; + if (!l.data().is_deleted()) { + catalog_manager_->table_names_map_[l.data().name()] = table; + } + + LOG(INFO) << "Loaded metadata for table " << table->ToString(); + VLOG(1) << "Metadata for table " << table->ToString() << ": " << metadata.ShortDebugString(); + l.Commit(); + return Status::OK(); + } + + private: + CatalogManager *catalog_manager_; + + DISALLOW_COPY_AND_ASSIGN(TableLoader); +}; + +//////////////////////////////////////////////////////////// +// Tablet Loader +//////////////////////////////////////////////////////////// + +class TabletLoader : public TabletVisitor { + public: + explicit TabletLoader(CatalogManager *catalog_manager) + : catalog_manager_(catalog_manager) { + } + + virtual Status VisitTablet(const std::string& table_id, + const std::string& tablet_id, + const SysTabletsEntryPB& metadata) OVERRIDE { + // Lookup the table + scoped_refptr table(FindPtrOrNull( + catalog_manager_->table_ids_map_, table_id)); + + // Setup the tablet info + TabletInfo* tablet = new 
TabletInfo(table, tablet_id); + TabletMetadataLock l(tablet, TabletMetadataLock::WRITE); + l.mutable_data()->pb.CopyFrom(metadata); + + // Add the tablet to the tablet manager + catalog_manager_->tablet_map_[tablet->tablet_id()] = tablet; + + if (table == nullptr) { + // if the table is missing and the tablet is in "preparing" state + // may mean that the table was not created (maybe due to a failed write + // for the sys-tablets). The cleaner will remove + if (l.data().pb.state() == SysTabletsEntryPB::PREPARING) { + LOG(WARNING) << "Missing Table " << table_id << " required by tablet " << tablet_id + << " (probably a failed table creation: the tablet was not assigned)"; + return Status::OK(); + } + + // if the tablet is not in a "preparing" state, something is wrong... + LOG(ERROR) << "Missing Table " << table_id << " required by tablet " << tablet_id; + LOG(ERROR) << "Metadata: " << metadata.DebugString(); + return Status::Corruption("Missing table for tablet: ", tablet_id); + } + + // Add the tablet to the Table + if (!l.mutable_data()->is_deleted()) { + table->AddTablet(tablet); + } + l.Commit(); + + // TODO(KUDU-1070): if we see a running tablet under a deleted table, + // we should "roll forward" the deletion of the tablet here. 
+ + TableMetadataLock table_lock(table.get(), TableMetadataLock::READ); + + LOG(INFO) << "Loaded metadata for tablet " << tablet_id + << " (table " << table->ToString() << ")"; + VLOG(2) << "Metadata for tablet " << tablet_id << ": " << metadata.ShortDebugString(); + + return Status::OK(); + } + + private: + CatalogManager *catalog_manager_; + + DISALLOW_COPY_AND_ASSIGN(TabletLoader); +}; + +//////////////////////////////////////////////////////////// +// Background Tasks +//////////////////////////////////////////////////////////// + +class CatalogManagerBgTasks { + public: + explicit CatalogManagerBgTasks(CatalogManager *catalog_manager) + : closing_(false), pending_updates_(false), + thread_(nullptr), catalog_manager_(catalog_manager) { + } + + ~CatalogManagerBgTasks() {} + + Status Init(); + void Shutdown(); + + void Wake() { + boost::lock_guard lock(lock_); + pending_updates_ = true; + cond_.notify_all(); + } + + void Wait(int msec) { + boost::unique_lock lock(lock_); + if (closing_) return; + if (!pending_updates_) { + boost::system_time wtime = boost::get_system_time() + boost::posix_time::milliseconds(msec); + cond_.timed_wait(lock, wtime); + } + pending_updates_ = false; + } + + void WakeIfHasPendingUpdates() { + boost::lock_guard lock(lock_); + if (pending_updates_) { + cond_.notify_all(); + } + } + + private: + void Run(); + + private: + Atomic32 closing_; + bool pending_updates_; + mutable boost::mutex lock_; + boost::condition_variable cond_; + scoped_refptr thread_; + CatalogManager *catalog_manager_; +}; + +Status CatalogManagerBgTasks::Init() { + RETURN_NOT_OK(kudu::Thread::Create("catalog manager", "bgtasks", + &CatalogManagerBgTasks::Run, this, &thread_)); + return Status::OK(); +} + +void CatalogManagerBgTasks::Shutdown() { + if (Acquire_CompareAndSwap(&closing_, false, true) != false) { + VLOG(2) << "CatalogManagerBgTasks already shut down"; + return; + } + + Wake(); + if (thread_ != nullptr) { + CHECK_OK(ThreadJoiner(thread_.get()).Join()); + } 
+} + +void CatalogManagerBgTasks::Run() { + while (!NoBarrier_Load(&closing_)) { + // Perform assignment processing. + if (!catalog_manager_->IsInitialized()) { + LOG(WARNING) << "Catalog manager is not initialized!"; + } else if (catalog_manager_->CheckIsLeaderAndReady().ok()) { + std::vector > to_delete; + std::vector > to_process; + + // Get list of tablets not yet running or already replaced. + catalog_manager_->ExtractTabletsToProcess(&to_delete, &to_process); + + if (!to_process.empty()) { + // Transition tablet assignment state from preparing to creating, send + // and schedule creation / deletion RPC messages, etc. + Status s = catalog_manager_->ProcessPendingAssignments(to_process); + if (!s.ok()) { + // If there is an error (e.g., we are not the leader) abort this task + // and wait until we're woken up again. + // + // TODO Add tests for this in the revision that makes + // create/alter fault tolerant. + LOG(ERROR) << "Error processing pending assignments, aborting the current task: " + << s.ToString(); + } + } + } else { + VLOG(1) << "We are no longer the leader, aborting the current task..."; + } + + //if (!to_delete.empty()) { + // TODO: Run the cleaner + //} + + // Wait for a notification or a timeout expiration. + // - CreateTable will call Wake() to notify about the tablets to add + // - HandleReportedTablet/ProcessPendingAssignments will call WakeIfHasPendingUpdates() + // to notify about tablets creation. 
+ Wait(FLAGS_catalog_manager_bg_task_wait_ms); + } + VLOG(1) << "Catalog manager background task thread shutting down"; +} + +//////////////////////////////////////////////////////////// +// CatalogManager +//////////////////////////////////////////////////////////// + +namespace { + +string RequestorString(RpcContext* rpc) { + if (rpc) { + return rpc->requestor_string(); + } else { + return "internal request"; + } +} + +// If 's' indicates that the node is no longer the leader, setup +// Service::UnavailableError as the error, set NOT_THE_LEADER as the +// error code and return true. +template +void CheckIfNoLongerLeaderAndSetupError(Status s, RespClass* resp) { + // TODO (KUDU-591): This is a bit of a hack, as right now + // there's no way to propagate why a write to a consensus configuration has + // failed. However, since we use Status::IllegalState()/IsAborted() to + // indicate the situation where a write was issued on a node + // that is no longer the leader, this suffices until we + // distinguish this cause of write failure more explicitly. 
+ if (s.IsIllegalState() || s.IsAborted()) { + Status new_status = Status::ServiceUnavailable( + "operation requested can only be executed on a leader master, but this" + " master is no longer the leader", s.ToString()); + SetupError(resp->mutable_error(), MasterErrorPB::NOT_THE_LEADER, new_status); + } +} + +} // anonymous namespace + +CatalogManager::CatalogManager(Master *master) + : master_(master), + rng_(GetRandomSeed32()), + state_(kConstructed), + leader_ready_term_(-1) { + CHECK_OK(ThreadPoolBuilder("leader-initialization") + .set_max_threads(1) + .Build(&worker_pool_)); +} + +CatalogManager::~CatalogManager() { + Shutdown(); +} + +Status CatalogManager::Init(bool is_first_run) { + { + boost::lock_guard l(state_lock_); + CHECK_EQ(kConstructed, state_); + state_ = kStarting; + } + + RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(is_first_run), + "Failed to initialize sys tables async"); + + // WaitUntilRunning() must run outside of the lock as to prevent + // deadlock. This is safe as WaitUntilRunning waits for another + // thread to finish its work and doesn't itself depend on any state + // within CatalogManager. 
+ + RETURN_NOT_OK_PREPEND(sys_catalog_->WaitUntilRunning(), + "Failed waiting for the catalog tablet to run"); + + boost::lock_guard l(lock_); + background_tasks_.reset(new CatalogManagerBgTasks(this)); + RETURN_NOT_OK_PREPEND(background_tasks_->Init(), + "Failed to initialize catalog manager background tasks"); + + { + boost::lock_guard l(state_lock_); + CHECK_EQ(kStarting, state_); + state_ = kRunning; + } + + return Status::OK(); +} + +Status CatalogManager::ElectedAsLeaderCb() { + boost::lock_guard l(state_lock_); + return worker_pool_->SubmitClosure( + Bind(&CatalogManager::VisitTablesAndTabletsTask, Unretained(this))); +} + +Status CatalogManager::WaitUntilCaughtUpAsLeader(const MonoDelta& timeout) { + string uuid = master_->fs_manager()->uuid(); + Consensus* consensus = sys_catalog_->tablet_peer()->consensus(); + ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); + if (!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid) { + return Status::IllegalState( + Substitute("Node $0 not leader. Consensus state: $1", + uuid, cstate.ShortDebugString())); + } + + // Wait for all transactions to be committed. + RETURN_NOT_OK(sys_catalog_->tablet_peer()->transaction_tracker()->WaitForAllToFinish(timeout)); + return Status::OK(); +} + +void CatalogManager::VisitTablesAndTabletsTask() { + + Consensus* consensus = sys_catalog_->tablet_peer()->consensus(); + int64_t term = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED).current_term(); + Status s = WaitUntilCaughtUpAsLeader( + MonoDelta::FromMilliseconds(FLAGS_master_failover_catchup_timeout_ms)); + if (!s.ok()) { + WARN_NOT_OK(s, "Failed waiting for node to catch up after master election"); + // TODO: Abdicate on timeout instead of crashing. + if (s.IsTimedOut()) { + LOG(FATAL) << "Shutting down due to unavailability of other masters after" + << " election. 
TODO: Abdicate instead."; + } + return; + } + + { + boost::lock_guard lock(lock_); + int64_t term_after_wait = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED).current_term(); + if (term_after_wait != term) { + // If we got elected leader again while waiting to catch up then we will + // get another callback to visit the tables and tablets, so bail. + LOG(INFO) << "Term change from " << term << " to " << term_after_wait + << " while waiting for master leader catchup. Not loading sys catalog metadata"; + return; + } + + LOG(INFO) << "Loading table and tablet metadata into memory..."; + LOG_SLOW_EXECUTION(WARNING, 1000, LogPrefix() + "Loading metadata into memory") { + CHECK_OK(VisitTablesAndTabletsUnlocked()); + } + } + boost::lock_guard l(state_lock_); + leader_ready_term_ = term; +} + +Status CatalogManager::VisitTablesAndTabletsUnlocked() { + DCHECK(lock_.is_locked()); + + // Clear the existing state. + table_names_map_.clear(); + table_ids_map_.clear(); + tablet_map_.clear(); + + // Visit tables and tablets, load them into memory. 
+ TableLoader table_loader(this); + RETURN_NOT_OK_PREPEND(sys_catalog_->VisitTables(&table_loader), + "Failed while visiting tables in sys catalog"); + TabletLoader tablet_loader(this); + RETURN_NOT_OK_PREPEND(sys_catalog_->VisitTablets(&tablet_loader), + "Failed while visiting tablets in sys catalog"); + return Status::OK(); +} + +Status CatalogManager::InitSysCatalogAsync(bool is_first_run) { + boost::lock_guard l(lock_); + sys_catalog_.reset(new SysCatalogTable(master_, + master_->metric_registry(), + Bind(&CatalogManager::ElectedAsLeaderCb, + Unretained(this)))); + if (is_first_run) { + RETURN_NOT_OK(sys_catalog_->CreateNew(master_->fs_manager())); + } else { + RETURN_NOT_OK(sys_catalog_->Load(master_->fs_manager())); + } + return Status::OK(); +} + +bool CatalogManager::IsInitialized() const { + boost::lock_guard l(state_lock_); + return state_ == kRunning; +} + +Status CatalogManager::CheckIsLeaderAndReady() const { + boost::lock_guard l(state_lock_); + if (PREDICT_FALSE(state_ != kRunning)) { + return Status::ServiceUnavailable( + Substitute("Catalog manager is shutting down. State: $0", state_)); + } + Consensus* consensus = sys_catalog_->tablet_peer_->consensus(); + ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); + string uuid = master_->fs_manager()->uuid(); + if (PREDICT_FALSE(!cstate.has_leader_uuid() || cstate.leader_uuid() != uuid)) { + return Status::IllegalState( + Substitute("Not the leader. 
Local UUID: $0, Consensus state: $1", + uuid, cstate.ShortDebugString())); + } + if (PREDICT_FALSE(leader_ready_term_ != cstate.current_term())) { + return Status::ServiceUnavailable("Leader not yet ready to serve requests"); + } + return Status::OK(); +} + +RaftPeerPB::Role CatalogManager::Role() const { + CHECK(IsInitialized()); + return sys_catalog_->tablet_peer_->consensus()->role(); +} + +void CatalogManager::Shutdown() { + { + boost::lock_guard l(state_lock_); + if (state_ == kClosing) { + VLOG(2) << "CatalogManager already shut down"; + return; + } + state_ = kClosing; + } + + // Shutdown the Catalog Manager background thread + if (background_tasks_) { + background_tasks_->Shutdown(); + } + + // Abort and Wait tables task completion + for (const TableInfoMap::value_type& e : table_ids_map_) { + e.second->AbortTasks(); + e.second->WaitTasksCompletion(); + } + + // Shut down the underlying storage for tables and tablets. + if (sys_catalog_) { + sys_catalog_->Shutdown(); + } +} + +static void SetupError(MasterErrorPB* error, + MasterErrorPB::Code code, + const Status& s) { + StatusToPB(s, error->mutable_status()); + error->set_code(code); +} + +Status CatalogManager::CheckOnline() const { + if (PREDICT_FALSE(!IsInitialized())) { + return Status::ServiceUnavailable("CatalogManager is not running"); + } + return Status::OK(); +} + +void CatalogManager::AbortTableCreation(TableInfo* table, + const vector& tablets) { + string table_id = table->id(); + string table_name = table->mutable_metadata()->mutable_dirty()->pb.name(); + vector tablet_ids_to_erase; + for (TabletInfo* tablet : tablets) { + tablet_ids_to_erase.push_back(tablet->tablet_id()); + } + + LOG(INFO) << "Aborting creation of table '" << table_name << "', erasing table and tablets (" << + JoinStrings(tablet_ids_to_erase, ",") << ") from in-memory state."; + + // Since this is a failed creation attempt, it's safe to just abort + // all tasks, as (by definition) no tasks may be pending against a + // 
table that has failed to succesfully create. + table->AbortTasks(); + table->WaitTasksCompletion(); + + boost::lock_guard l(lock_); + + // Call AbortMutation() manually, as otherwise the lock won't be + // released. + for (TabletInfo* tablet : tablets) { + tablet->mutable_metadata()->AbortMutation(); + } + table->mutable_metadata()->AbortMutation(); + for (const string& tablet_id_to_erase : tablet_ids_to_erase) { + CHECK_EQ(tablet_map_.erase(tablet_id_to_erase), 1) + << "Unable to erase tablet " << tablet_id_to_erase << " from tablet map."; + } + CHECK_EQ(table_names_map_.erase(table_name), 1) + << "Unable to erase table named " << table_name << " from table names map."; + CHECK_EQ(table_ids_map_.erase(table_id), 1) + << "Unable to erase tablet with id " << table_id << " from tablet ids map."; +} + +// Create a new table. +// See README file in this directory for a description of the design. +Status CatalogManager::CreateTable(const CreateTableRequestPB* orig_req, + CreateTableResponsePB* resp, + rpc::RpcContext* rpc) { + RETURN_NOT_OK(CheckOnline()); + Status s; + + // Copy the request, so we can fill in some defaults. + CreateTableRequestPB req = *orig_req; + LOG(INFO) << "CreateTable from " << RequestorString(rpc) + << ":\n" << req.DebugString(); + + // a. Validate the user request. 
+ Schema client_schema; + RETURN_NOT_OK(SchemaFromPB(req.schema(), &client_schema)); + if (client_schema.has_column_ids()) { + s = Status::InvalidArgument("User requests should not have Column IDs"); + SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); + return s; + } + if (PREDICT_FALSE(client_schema.num_key_columns() <= 0)) { + s = Status::InvalidArgument("Must specify at least one key column"); + SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); + return s; + } + for (int i = 0; i < client_schema.num_key_columns(); i++) { + if (!IsTypeAllowableInKey(client_schema.column(i).type_info())) { + Status s = Status::InvalidArgument( + "Key column may not have type of BOOL, FLOAT, or DOUBLE"); + SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); + return s; + } + } + Schema schema = client_schema.CopyWithColumnIds(); + + // If the client did not set a partition schema in the create table request, + // the default partition schema (no hash bucket components and a range + // partitioned on the primary key columns) will be used. + PartitionSchema partition_schema; + s = PartitionSchema::FromPB(req.partition_schema(), schema, &partition_schema); + if (!s.ok()) { + SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); + return s; + } + + // Decode split rows. + vector split_rows; + + RowOperationsPBDecoder decoder(&req.split_rows(), &client_schema, &schema, nullptr); + vector ops; + RETURN_NOT_OK(decoder.DecodeOperations(&ops)); + + for (const DecodedRowOperation& op : ops) { + if (op.type != RowOperationsPB::SPLIT_ROW) { + Status s = Status::InvalidArgument( + "Split rows must be specified as RowOperationsPB::SPLIT_ROW"); + SetupError(resp->mutable_error(), MasterErrorPB::UNKNOWN_ERROR, s); + return s; + } + + split_rows.push_back(*op.split_row); + } + + // Create partitions based on specified partition schema and split rows. 
+ vector partitions; + RETURN_NOT_OK(partition_schema.CreatePartitions(split_rows, schema, &partitions)); + + // If they didn't specify a num_replicas, set it based on the default. + if (!req.has_num_replicas()) { + req.set_num_replicas(FLAGS_default_num_replicas); + } + + // Verify that the total number of tablets is reasonable, relative to the number + // of live tablet servers. + TSDescriptorVector ts_descs; + master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); + int num_live_tservers = ts_descs.size(); + int max_tablets = FLAGS_max_create_tablets_per_ts * num_live_tservers; + if (req.num_replicas() > 1 && max_tablets > 0 && partitions.size() > max_tablets) { + s = Status::InvalidArgument(Substitute("The requested number of tablets is over the " + "permitted maximum ($0)", max_tablets)); + SetupError(resp->mutable_error(), MasterErrorPB::TOO_MANY_TABLETS, s); + return s; + } + + // Verify that the number of replicas isn't larger than the number of live tablet + // servers. + if (FLAGS_catalog_manager_check_ts_count_for_create_table && + req.num_replicas() > num_live_tservers) { + s = Status::InvalidArgument(Substitute( + "Not enough live tablet servers to create a table with the requested replication " + "factor $0. $1 tablet servers are alive.", req.num_replicas(), num_live_tservers)); + SetupError(resp->mutable_error(), MasterErrorPB::REPLICATION_FACTOR_TOO_HIGH, s); + return s; + } + + scoped_refptr table; + vector tablets; + { + boost::lock_guard l(lock_); + TRACE("Acquired catalog manager lock"); + + // b. Verify that the table does not exist. + table = FindPtrOrNull(table_names_map_, req.name()); + if (table != nullptr) { + s = Status::AlreadyPresent("Table already exists", table->id()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_ALREADY_PRESENT, s); + return s; + } + + // c. Add the new table in "preparing" state. 
+ table = CreateTableInfo(req, schema, partition_schema); + table_ids_map_[table->id()] = table; + table_names_map_[req.name()] = table; + + // d. Create the TabletInfo objects in state PREPARING. + for (const Partition& partition : partitions) { + PartitionPB partition_pb; + partition.ToPB(&partition_pb); + tablets.push_back(CreateTabletInfo(table.get(), partition_pb)); + } + + // Add the table/tablets to the in-memory map for the assignment. + resp->set_table_id(table->id()); + table->AddTablets(tablets); + for (TabletInfo* tablet : tablets) { + InsertOrDie(&tablet_map_, tablet->tablet_id(), tablet); + } + } + TRACE("Inserted new table and tablet info into CatalogManager maps"); + + // NOTE: the table and tablets are already locked for write at this point, + // since the CreateTableInfo/CreateTabletInfo functions leave them in that state. + // They will get committed at the end of this function. + // Sanity check: the tables and tablets should all be in "preparing" state. + CHECK_EQ(SysTablesEntryPB::PREPARING, table->metadata().dirty().pb.state()); + for (const TabletInfo *tablet : tablets) { + CHECK_EQ(SysTabletsEntryPB::PREPARING, tablet->metadata().dirty().pb.state()); + } + + // e. Write Tablets to sys-tablets (in "preparing" state) + s = sys_catalog_->AddTablets(tablets); + if (!s.ok()) { + s = s.CloneAndPrepend(Substitute("An error occurred while inserting to sys-tablets: $0", + s.ToString())); + LOG(WARNING) << s.ToString(); + AbortTableCreation(table.get(), tablets); + CheckIfNoLongerLeaderAndSetupError(s, resp); + return s; + } + TRACE("Wrote tablets to system table"); + + // f. Update the on-disk table state to "running". 
+ table->mutable_metadata()->mutable_dirty()->pb.set_state(SysTablesEntryPB::RUNNING); + s = sys_catalog_->AddTable(table.get()); + if (!s.ok()) { + s = s.CloneAndPrepend(Substitute("An error occurred while inserting to sys-tablets: $0", + s.ToString())); + LOG(WARNING) << s.ToString(); + AbortTableCreation(table.get(), tablets); + CheckIfNoLongerLeaderAndSetupError(s, resp); + return s; + } + TRACE("Wrote table to system table"); + + // g. Commit the in-memory state. + table->mutable_metadata()->CommitMutation(); + + for (TabletInfo *tablet : tablets) { + tablet->mutable_metadata()->CommitMutation(); + } + + VLOG(1) << "Created table " << table->ToString(); + background_tasks_->Wake(); + return Status::OK(); +} + +Status CatalogManager::IsCreateTableDone(const IsCreateTableDoneRequestPB* req, + IsCreateTableDoneResponsePB* resp) { + RETURN_NOT_OK(CheckOnline()); + + scoped_refptr table; + + // 1. Lookup the table and verify if it exists + TRACE("Looking up table"); + RETURN_NOT_OK(FindTable(req->table(), &table)); + if (table == nullptr) { + Status s = Status::NotFound("The table does not exist", req->table().DebugString()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TRACE("Locking table"); + TableMetadataLock l(table.get(), TableMetadataLock::READ); + if (l.data().is_deleted()) { + Status s = Status::NotFound("The table was deleted", l.data().pb.state_msg()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + // 2. 
Verify if the create is in-progress + TRACE("Verify if the table creation is in progress for $0", table->ToString()); + resp->set_done(!table->IsCreateInProgress()); + + return Status::OK(); +} + +TableInfo *CatalogManager::CreateTableInfo(const CreateTableRequestPB& req, + const Schema& schema, + const PartitionSchema& partition_schema) { + DCHECK(schema.has_column_ids()); + TableInfo* table = new TableInfo(GenerateId()); + table->mutable_metadata()->StartMutation(); + SysTablesEntryPB *metadata = &table->mutable_metadata()->mutable_dirty()->pb; + metadata->set_state(SysTablesEntryPB::PREPARING); + metadata->set_name(req.name()); + metadata->set_version(0); + metadata->set_next_column_id(ColumnId(schema.max_col_id() + 1)); + metadata->set_num_replicas(req.num_replicas()); + // Use the Schema object passed in, since it has the column IDs already assigned, + // whereas the user request PB does not. + CHECK_OK(SchemaToPB(schema, metadata->mutable_schema())); + partition_schema.ToPB(metadata->mutable_partition_schema()); + return table; +} + +TabletInfo* CatalogManager::CreateTabletInfo(TableInfo* table, + const PartitionPB& partition) { + TabletInfo* tablet = new TabletInfo(table, GenerateId()); + tablet->mutable_metadata()->StartMutation(); + SysTabletsEntryPB *metadata = &tablet->mutable_metadata()->mutable_dirty()->pb; + metadata->set_state(SysTabletsEntryPB::PREPARING); + metadata->mutable_partition()->CopyFrom(partition); + metadata->set_table_id(table->id()); + return tablet; +} + +Status CatalogManager::FindTable(const TableIdentifierPB& table_identifier, + scoped_refptr *table_info) { + boost::shared_lock l(lock_); + + if (table_identifier.has_table_id()) { + *table_info = FindPtrOrNull(table_ids_map_, table_identifier.table_id()); + } else if (table_identifier.has_table_name()) { + *table_info = FindPtrOrNull(table_names_map_, table_identifier.table_name()); + } else { + return Status::InvalidArgument("Missing Table ID or Table Name"); + } + return 
Status::OK(); +} + +// Delete a Table +// - Update the table state to "removed" +// - Write the updated table metadata to sys-table +// +// we are lazy about deletions... +// the cleaner will remove tables and tablets marked as "removed" +Status CatalogManager::DeleteTable(const DeleteTableRequestPB* req, + DeleteTableResponsePB* resp, + rpc::RpcContext* rpc) { + LOG(INFO) << "Servicing DeleteTable request from " << RequestorString(rpc) + << ": " << req->ShortDebugString(); + + RETURN_NOT_OK(CheckOnline()); + + scoped_refptr table; + + // 1. Lookup the table and verify if it exists + TRACE("Looking up table"); + RETURN_NOT_OK(FindTable(req->table(), &table)); + if (table == nullptr) { + Status s = Status::NotFound("The table does not exist", req->table().DebugString()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TRACE("Locking table"); + TableMetadataLock l(table.get(), TableMetadataLock::WRITE); + if (l.data().is_deleted()) { + Status s = Status::NotFound("The table was deleted", l.data().pb.state_msg()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TRACE("Updating metadata on disk"); + // 2. Update the metadata for the on-disk state + l.mutable_data()->set_state(SysTablesEntryPB::REMOVED, + Substitute("Deleted at $0", LocalTimeAsString())); + + // 3. Update sys-catalog with the removed table state. + Status s = sys_catalog_->UpdateTable(table.get()); + if (!s.ok()) { + // The mutation will be aborted when 'l' exits the scope on early return. + s = s.CloneAndPrepend(Substitute("An error occurred while updating sys tables: $0", + s.ToString())); + LOG(WARNING) << s.ToString(); + CheckIfNoLongerLeaderAndSetupError(s, resp); + return s; + } + + // 4. 
Remove it from the by-name map + { + TRACE("Removing from by-name map"); + boost::lock_guard l_map(lock_); + if (table_names_map_.erase(l.data().name()) != 1) { + PANIC_RPC(rpc, "Could not remove table from map, name=" + l.data().name()); + } + } + + table->AbortTasks(); + + // 5. Update the in-memory state + TRACE("Committing in-memory state"); + l.Commit(); + + // Send a DeleteTablet() request to each tablet replica in the table. + DeleteTabletsAndSendRequests(table); + + LOG(INFO) << "Successfully deleted table " << table->ToString() + << " per request from " << RequestorString(rpc); + background_tasks_->Wake(); + return Status::OK(); +} + +static Status ApplyAlterSteps(const SysTablesEntryPB& current_pb, + const AlterTableRequestPB* req, + Schema* new_schema, + ColumnId* next_col_id) { + const SchemaPB& current_schema_pb = current_pb.schema(); + Schema cur_schema; + RETURN_NOT_OK(SchemaFromPB(current_schema_pb, &cur_schema)); + + SchemaBuilder builder(cur_schema); + if (current_pb.has_next_column_id()) { + builder.set_next_column_id(ColumnId(current_pb.next_column_id())); + } + + for (const AlterTableRequestPB::Step& step : req->alter_schema_steps()) { + switch (step.type()) { + case AlterTableRequestPB::ADD_COLUMN: { + if (!step.has_add_column()) { + return Status::InvalidArgument("ADD_COLUMN missing column info"); + } + + // Verify that encoding is appropriate for the new column's + // type + ColumnSchemaPB new_col_pb = step.add_column().schema(); + if (new_col_pb.has_id()) { + return Status::InvalidArgument("column $0: client should not specify column ID", + new_col_pb.ShortDebugString()); + } + ColumnSchema new_col = ColumnSchemaFromPB(new_col_pb); + const TypeEncodingInfo *dummy; + RETURN_NOT_OK(TypeEncodingInfo::Get(new_col.type_info(), + new_col.attributes().encoding, + &dummy)); + + // can't accept a NOT NULL column without read default + if (!new_col.is_nullable() && !new_col.has_read_default()) { + return Status::InvalidArgument( + Substitute("column 
`$0`: NOT NULL columns must have a default", new_col.name())); + } + + RETURN_NOT_OK(builder.AddColumn(new_col, false)); + break; + } + + case AlterTableRequestPB::DROP_COLUMN: { + if (!step.has_drop_column()) { + return Status::InvalidArgument("DROP_COLUMN missing column info"); + } + + if (cur_schema.is_key_column(step.drop_column().name())) { + return Status::InvalidArgument("cannot remove a key column"); + } + + RETURN_NOT_OK(builder.RemoveColumn(step.drop_column().name())); + break; + } + + case AlterTableRequestPB::RENAME_COLUMN: { + if (!step.has_rename_column()) { + return Status::InvalidArgument("RENAME_COLUMN missing column info"); + } + + // TODO: In theory we can rename a key + if (cur_schema.is_key_column(step.rename_column().old_name())) { + return Status::InvalidArgument("cannot rename a key column"); + } + + RETURN_NOT_OK(builder.RenameColumn( + step.rename_column().old_name(), + step.rename_column().new_name())); + break; + } + + // TODO: EDIT_COLUMN + + default: { + return Status::InvalidArgument( + Substitute("Invalid alter step type: $0", step.type())); + } + } + } + *new_schema = builder.Build(); + *next_col_id = builder.next_column_id(); + return Status::OK(); +} + +Status CatalogManager::AlterTable(const AlterTableRequestPB* req, + AlterTableResponsePB* resp, + rpc::RpcContext* rpc) { + LOG(INFO) << "Servicing AlterTable request from " << RequestorString(rpc) + << ": " << req->ShortDebugString(); + + RETURN_NOT_OK(CheckOnline()); + + scoped_refptr table; + + // 1. 
Lookup the table and verify if it exists + TRACE("Looking up table"); + RETURN_NOT_OK(FindTable(req->table(), &table)); + if (table == nullptr) { + Status s = Status::NotFound("The table does not exist", req->table().DebugString()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TRACE("Locking table"); + TableMetadataLock l(table.get(), TableMetadataLock::WRITE); + if (l.data().is_deleted()) { + Status s = Status::NotFound("The table was deleted", l.data().pb.state_msg()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + bool has_changes = false; + string table_name = l.data().name(); + + // 2. Calculate new schema for the on-disk state, not persisted yet + Schema new_schema; + ColumnId next_col_id = ColumnId(l.data().pb.next_column_id()); + if (req->alter_schema_steps_size()) { + TRACE("Apply alter schema"); + Status s = ApplyAlterSteps(l.data().pb, req, &new_schema, &next_col_id); + if (!s.ok()) { + SetupError(resp->mutable_error(), MasterErrorPB::INVALID_SCHEMA, s); + return s; + } + DCHECK_NE(next_col_id, 0); + DCHECK_EQ(new_schema.find_column_by_id(next_col_id), + static_cast(Schema::kColumnNotFound)); + has_changes = true; + } + + // 3. Try to acquire the new table name + if (req->has_new_table_name()) { + boost::lock_guard catalog_lock(lock_); + + TRACE("Acquired catalog manager lock"); + + // Verify that the table does not exist + scoped_refptr other_table = FindPtrOrNull(table_names_map_, req->new_table_name()); + if (other_table != nullptr) { + Status s = Status::AlreadyPresent("Table already exists", other_table->id()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_ALREADY_PRESENT, s); + return s; + } + + // Acquire the new table name (now we have 2 name for the same table) + table_names_map_[req->new_table_name()] = table; + l.mutable_data()->pb.set_name(req->new_table_name()); + + has_changes = true; + } + + // Skip empty requests... 
+ if (!has_changes) { + return Status::OK(); + } + + // 4. Serialize the schema Increment the version number + if (new_schema.initialized()) { + if (!l.data().pb.has_fully_applied_schema()) { + l.mutable_data()->pb.mutable_fully_applied_schema()->CopyFrom(l.data().pb.schema()); + } + CHECK_OK(SchemaToPB(new_schema, l.mutable_data()->pb.mutable_schema())); + } + l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1); + l.mutable_data()->pb.set_next_column_id(next_col_id); + l.mutable_data()->set_state(SysTablesEntryPB::ALTERING, + Substitute("Alter Table version=$0 ts=$1", + l.mutable_data()->pb.version(), + LocalTimeAsString())); + + // 5. Update sys-catalog with the new table schema. + TRACE("Updating metadata on disk"); + Status s = sys_catalog_->UpdateTable(table.get()); + if (!s.ok()) { + s = s.CloneAndPrepend( + Substitute("An error occurred while updating sys-catalog tables entry: $0", + s.ToString())); + LOG(WARNING) << s.ToString(); + if (req->has_new_table_name()) { + boost::lock_guard catalog_lock(lock_); + CHECK_EQ(table_names_map_.erase(req->new_table_name()), 1); + } + CheckIfNoLongerLeaderAndSetupError(s, resp); + // TableMetadaLock follows RAII paradigm: when it leaves scope, + // 'l' will be unlocked, and the mutation will be aborted. + return s; + } + + // 6. Remove the old name + if (req->has_new_table_name()) { + TRACE("Removing old-name $0 from by-name map", table_name); + boost::lock_guard l_map(lock_); + if (table_names_map_.erase(table_name) != 1) { + PANIC_RPC(rpc, "Could not remove table from map, name=" + l.data().name()); + } + } + + // 7. Update the in-memory state + TRACE("Committing in-memory state"); + l.Commit(); + + SendAlterTableRequest(table); + return Status::OK(); +} + +Status CatalogManager::IsAlterTableDone(const IsAlterTableDoneRequestPB* req, + IsAlterTableDoneResponsePB* resp, + rpc::RpcContext* rpc) { + RETURN_NOT_OK(CheckOnline()); + + scoped_refptr table; + + // 1. 
Lookup the table and verify if it exists + TRACE("Looking up table"); + RETURN_NOT_OK(FindTable(req->table(), &table)); + if (table == nullptr) { + Status s = Status::NotFound("The table does not exist", req->table().DebugString()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TRACE("Locking table"); + TableMetadataLock l(table.get(), TableMetadataLock::READ); + if (l.data().is_deleted()) { + Status s = Status::NotFound("The table was deleted", l.data().pb.state_msg()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + // 2. Verify if the alter is in-progress + TRACE("Verify if there is an alter operation in progress for $0", table->ToString()); + resp->set_schema_version(l.data().pb.version()); + resp->set_done(l.data().pb.state() != SysTablesEntryPB::ALTERING); + + return Status::OK(); +} + +Status CatalogManager::GetTableSchema(const GetTableSchemaRequestPB* req, + GetTableSchemaResponsePB* resp) { + RETURN_NOT_OK(CheckOnline()); + + scoped_refptr table; + + // 1. Lookup the table and verify if it exists + TRACE("Looking up table"); + RETURN_NOT_OK(FindTable(req->table(), &table)); + if (table == nullptr) { + Status s = Status::NotFound("The table does not exist", req->table().DebugString()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TRACE("Locking table"); + TableMetadataLock l(table.get(), TableMetadataLock::READ); + if (l.data().is_deleted()) { + Status s = Status::NotFound("The table was deleted", l.data().pb.state_msg()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + if (l.data().pb.has_fully_applied_schema()) { + // An AlterTable is in progress; fully_applied_schema is the last + // schema that has reached every TS. 
+ CHECK(l.data().pb.state() == SysTablesEntryPB::ALTERING); + resp->mutable_schema()->CopyFrom(l.data().pb.fully_applied_schema()); + } else { + // There's no AlterTable, the regular schema is "fully applied". + resp->mutable_schema()->CopyFrom(l.data().pb.schema()); + } + resp->set_num_replicas(l.data().pb.num_replicas()); + resp->set_table_id(table->id()); + resp->mutable_partition_schema()->CopyFrom(l.data().pb.partition_schema()); + resp->set_create_table_done(!table->IsCreateInProgress()); + + return Status::OK(); +} + +Status CatalogManager::ListTables(const ListTablesRequestPB* req, + ListTablesResponsePB* resp) { + RETURN_NOT_OK(CheckOnline()); + + boost::shared_lock l(lock_); + + for (const TableInfoMap::value_type& entry : table_names_map_) { + TableMetadataLock ltm(entry.second.get(), TableMetadataLock::READ); + if (!ltm.data().is_running()) continue; + + if (req->has_name_filter()) { + size_t found = ltm.data().name().find(req->name_filter()); + if (found == string::npos) { + continue; + } + } + + ListTablesResponsePB::TableInfo *table = resp->add_tables(); + table->set_id(entry.second->id()); + table->set_name(ltm.data().name()); + } + + return Status::OK(); +} + +bool CatalogManager::GetTableInfo(const string& table_id, scoped_refptr *table) { + boost::shared_lock l(lock_); + *table = FindPtrOrNull(table_ids_map_, table_id); + return *table != nullptr; +} + +void CatalogManager::GetAllTables(std::vector > *tables) { + tables->clear(); + boost::shared_lock l(lock_); + for (const TableInfoMap::value_type& e : table_ids_map_) { + tables->push_back(e.second); + } +} + +bool CatalogManager::TableNameExists(const string& table_name) { + boost::shared_lock l(lock_); + return table_names_map_.find(table_name) != table_names_map_.end(); +} + +void CatalogManager::NotifyTabletDeleteSuccess(const string& permanent_uuid, + const string& tablet_id) { + // TODO: Clean up the stale deleted tablet data once all relevant tablet + // servers have responded that they 
have removed the remnants of the deleted + // tablet. +} + +Status CatalogManager::ProcessTabletReport(TSDescriptor* ts_desc, + const TabletReportPB& report, + TabletReportUpdatesPB *report_update, + RpcContext* rpc) { + TRACE_EVENT2("master", "ProcessTabletReport", + "requestor", rpc->requestor_string(), + "num_tablets", report.updated_tablets_size()); + + if (VLOG_IS_ON(2)) { + VLOG(2) << "Received tablet report from " << + RequestorString(rpc) << ": " << report.DebugString(); + } + if (!ts_desc->has_tablet_report() && report.is_incremental()) { + string msg = "Received an incremental tablet report when a full one was needed"; + LOG(WARNING) << "Invalid tablet report from " << RequestorString(rpc) << ": " + << msg; + return Status::IllegalState(msg); + } + + // TODO: on a full tablet report, we may want to iterate over the tablets we think + // the server should have, compare vs the ones being reported, and somehow mark + // any that have been "lost" (eg somehow the tablet metadata got corrupted or something). + + for (const ReportedTabletPB& reported : report.updated_tablets()) { + ReportedTabletUpdatesPB *tablet_report = report_update->add_tablets(); + tablet_report->set_tablet_id(reported.tablet_id()); + RETURN_NOT_OK_PREPEND(HandleReportedTablet(ts_desc, reported, tablet_report), + Substitute("Error handling $0", reported.ShortDebugString())); + } + + ts_desc->set_has_tablet_report(true); + + if (report.updated_tablets_size() > 0) { + background_tasks_->WakeIfHasPendingUpdates(); + } + + return Status::OK(); +} + +namespace { +// Return true if receiving 'report' for a tablet in CREATING state should +// transition it to the RUNNING state. +bool ShouldTransitionTabletToRunning(const ReportedTabletPB& report) { + if (report.state() != tablet::RUNNING) return false; + + // In many tests, we disable leader election, so newly created tablets + // will never elect a leader on their own. In this case, we transition + // to RUNNING as soon as we get a single report. 
+ if (!FLAGS_catalog_manager_wait_for_new_tablets_to_elect_leader) { + return true; + } + + // Otherwise, we only transition to RUNNING once a leader is elected. + return report.committed_consensus_state().has_leader_uuid(); +} +} // anonymous namespace + +Status CatalogManager::HandleReportedTablet(TSDescriptor* ts_desc, + const ReportedTabletPB& report, + ReportedTabletUpdatesPB *report_updates) { + TRACE_EVENT1("master", "HandleReportedTablet", + "tablet_id", report.tablet_id()); + scoped_refptr tablet; + { + boost::shared_lock l(lock_); + tablet = FindPtrOrNull(tablet_map_, report.tablet_id()); + } + RETURN_NOT_OK_PREPEND(CheckIsLeaderAndReady(), + Substitute("This master is no longer the leader, unable to handle report for tablet $0", + report.tablet_id())); + if (!tablet) { + LOG(INFO) << "Got report from unknown tablet " << report.tablet_id() + << ": Sending delete request for this orphan tablet"; + SendDeleteTabletRequest(report.tablet_id(), TABLET_DATA_DELETED, boost::none, nullptr, ts_desc, + "Report from unknown tablet"); + return Status::OK(); + } + if (!tablet->table()) { + LOG(INFO) << "Got report from an orphaned tablet " << report.tablet_id(); + SendDeleteTabletRequest(report.tablet_id(), TABLET_DATA_DELETED, boost::none, nullptr, ts_desc, + "Report from an orphaned tablet"); + return Status::OK(); + } + VLOG(3) << "tablet report: " << report.ShortDebugString(); + + // TODO: we don't actually need to do the COW here until we see we're going + // to change the state. Can we change CowedObject to lazily do the copy? + TableMetadataLock table_lock(tablet->table().get(), TableMetadataLock::READ); + TabletMetadataLock tablet_lock(tablet.get(), TabletMetadataLock::WRITE); + + // If the TS is reporting a tablet which has been deleted, or a tablet from + // a table which has been deleted, send it an RPC to delete it. + // NOTE: when a table is deleted, we don't currently iterate over all of the + // tablets and mark them as deleted. 
Hence, we have to check the table state, + // not just the tablet state. + if (tablet_lock.data().is_deleted() || + table_lock.data().is_deleted()) { + report_updates->set_state_msg(tablet_lock.data().pb.state_msg()); + const string msg = tablet_lock.data().pb.state_msg(); + LOG(INFO) << "Got report from deleted tablet " << tablet->ToString() + << " (" << msg << "): Sending delete request for this tablet"; + // TODO: Cancel tablet creation, instead of deleting, in cases where + // that might be possible (tablet creation timeout & replacement). + SendDeleteTabletRequest(tablet->tablet_id(), TABLET_DATA_DELETED, boost::none, + tablet->table(), ts_desc, + Substitute("Tablet deleted: $0", msg)); + return Status::OK(); + } + + if (!table_lock.data().is_running()) { + LOG(INFO) << "Got report from tablet " << tablet->tablet_id() + << " for non-running table " << tablet->table()->ToString() << ": " + << tablet_lock.data().pb.state_msg(); + report_updates->set_state_msg(tablet_lock.data().pb.state_msg()); + return Status::OK(); + } + + // Check if the tablet requires an "alter table" call + bool tablet_needs_alter = false; + if (report.has_schema_version() && + table_lock.data().pb.version() != report.schema_version()) { + if (report.schema_version() > table_lock.data().pb.version()) { + LOG(ERROR) << "TS " << ts_desc->permanent_uuid() + << " has reported a schema version greater than the current one " + << " for tablet " << tablet->ToString() + << ". Expected version " << table_lock.data().pb.version() + << " got " << report.schema_version() + << " (corruption)"; + } else { + LOG(INFO) << "TS " << ts_desc->permanent_uuid() + << " does not have the latest schema for tablet " << tablet->ToString() + << ". Expected version " << table_lock.data().pb.version() + << " got " << report.schema_version(); + } + // It's possible that the tablet being reported is a laggy replica, and in fact + // the leader has already received an AlterTable RPC. 
That's OK, though -- + // it'll safely ignore it if we send another. + tablet_needs_alter = true; + } + + + if (report.has_error()) { + Status s = StatusFromPB(report.error()); + DCHECK(!s.ok()); + DCHECK_EQ(report.state(), tablet::FAILED); + LOG(WARNING) << "Tablet " << tablet->ToString() << " has failed on TS " + << ts_desc->permanent_uuid() << ": " << s.ToString(); + return Status::OK(); + } + + // The report will not have a committed_consensus_state if it is in the + // middle of starting up, such as during tablet bootstrap. + if (report.has_committed_consensus_state()) { + const ConsensusStatePB& prev_cstate = tablet_lock.data().pb.committed_consensus_state(); + ConsensusStatePB cstate = report.committed_consensus_state(); + + // Check if we got a report from a tablet that is no longer part of the raft + // config. If so, tombstone it. We only tombstone replicas that include a + // committed raft config in their report that has an opid_index strictly + // less than the latest reported committed config, and (obviously) who are + // not members of the latest config. This prevents us from spuriously + // deleting replicas that have just been added to a pending config and are + // in the process of catching up to the log entry where they were added to + // the config. + if (FLAGS_master_tombstone_evicted_tablet_replicas && + cstate.config().opid_index() < prev_cstate.config().opid_index() && + !IsRaftConfigMember(ts_desc->permanent_uuid(), prev_cstate.config())) { + SendDeleteTabletRequest(report.tablet_id(), TABLET_DATA_TOMBSTONED, + prev_cstate.config().opid_index(), tablet->table(), ts_desc, + Substitute("Replica from old config with index $0 (latest is $1)", + cstate.config().opid_index(), + prev_cstate.config().opid_index())); + return Status::OK(); + } + + // If the tablet was not RUNNING, and we have a leader elected, mark it as RUNNING. 
+ // We need to wait for a leader before marking a tablet as RUNNING, or else we + // could incorrectly consider a tablet created when only a minority of its replicas + // were successful. In that case, the tablet would be stuck in this bad state + // forever. + if (!tablet_lock.data().is_running() && ShouldTransitionTabletToRunning(report)) { + DCHECK_EQ(SysTabletsEntryPB::CREATING, tablet_lock.data().pb.state()) + << "Tablet in unexpected state: " << tablet->ToString() + << ": " << tablet_lock.data().pb.ShortDebugString(); + // Mark the tablet as running + // TODO: we could batch the IO onto a background thread, or at least + // across multiple tablets in the same report. + VLOG(1) << "Tablet " << tablet->ToString() << " is now online"; + tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::RUNNING, + "Tablet reported with an active leader"); + } + + // The Master only accepts committed consensus configurations since it needs the committed index + // to only cache the most up-to-date config. + if (PREDICT_FALSE(!cstate.config().has_opid_index())) { + LOG(DFATAL) << "Missing opid_index in reported config:\n" << report.DebugString(); + return Status::InvalidArgument("Missing opid_index in reported config"); + } + + bool modified_cstate = false; + if (cstate.config().opid_index() > prev_cstate.config().opid_index() || + (cstate.has_leader_uuid() && + (!prev_cstate.has_leader_uuid() || cstate.current_term() > prev_cstate.current_term()))) { + + // When a config change is reported to the master, it may not include the + // leader because the follower doing the reporting may not know who the + // leader is yet (it may have just started up). If the reported config + // has the same term as the previous config, and the leader was + // previously known for the current term, then retain knowledge of that + // leader even if it wasn't reported in the latest config. 
+ if (cstate.current_term() == prev_cstate.current_term()) { + if (!cstate.has_leader_uuid() && prev_cstate.has_leader_uuid()) { + cstate.set_leader_uuid(prev_cstate.leader_uuid()); + modified_cstate = true; + // Sanity check to detect consensus divergence bugs. + } else if (cstate.has_leader_uuid() && prev_cstate.has_leader_uuid() && + cstate.leader_uuid() != prev_cstate.leader_uuid()) { + string msg = Substitute("Previously reported cstate for tablet $0 gave " + "a different leader for term $1 than the current cstate. " + "Previous cstate: $2. Current cstate: $3.", + tablet->ToString(), cstate.current_term(), + prev_cstate.ShortDebugString(), cstate.ShortDebugString()); + LOG(DFATAL) << msg; + return Status::InvalidArgument(msg); + } + } + + // If a replica is reporting a new consensus configuration, reset the tablet's replicas. + // Note that we leave out replicas who live in tablet servers who have not heartbeated to + // master yet. + LOG(INFO) << "Tablet: " << tablet->tablet_id() << " reported consensus state change." + << " New consensus state: " << cstate.ShortDebugString(); + + // If we need to change the report, copy the whole thing on the stack + // rather than const-casting. + const ReportedTabletPB* final_report = &report; + ReportedTabletPB updated_report; + if (modified_cstate) { + updated_report = report; + *updated_report.mutable_committed_consensus_state() = cstate; + final_report = &updated_report; + } + + VLOG(2) << "Resetting replicas for tablet " << final_report->tablet_id() + << " from config reported by " << ts_desc->permanent_uuid() + << " to that committed in log index " + << final_report->committed_consensus_state().config().opid_index() + << " with leader state from term " + << final_report->committed_consensus_state().current_term(); + + RETURN_NOT_OK(ResetTabletReplicasFromReportedConfig(*final_report, tablet, + &tablet_lock, &table_lock)); + + } else { + // Report opid_index is equal to the previous opid_index. 
If some + // replica is reporting the same consensus configuration we already know about and hasn't + // been added as replica, add it. + DVLOG(2) << "Peer " << ts_desc->permanent_uuid() << " sent full tablet report" + << " with data we have already received. Ensuring replica is being tracked." + << " Replica consensus state: " << cstate.ShortDebugString(); + AddReplicaToTabletIfNotFound(ts_desc, report, tablet); + } + } + + table_lock.Unlock(); + // We update the tablets each time that someone reports it. + // This shouldn't be very frequent and should only happen when something in fact changed. + Status s = sys_catalog_->UpdateTablets({ tablet.get() }); + if (!s.ok()) { + LOG(WARNING) << "Error updating tablets: " << s.ToString() << ". Tablet report was: " + << report.ShortDebugString(); + return s; + } + tablet_lock.Commit(); + + // Need to defer the AlterTable command to after we've committed the new tablet data, + // since the tablet report may also be updating the raft config, and the Alter Table + // request needs to know who the most recent leader is. 
+ if (tablet_needs_alter) { + SendAlterTabletRequest(tablet); + } else if (report.has_schema_version()) { + HandleTabletSchemaVersionReport(tablet.get(), report.schema_version()); + } + + return Status::OK(); +} + +Status CatalogManager::ResetTabletReplicasFromReportedConfig( + const ReportedTabletPB& report, + const scoped_refptr& tablet, + TabletMetadataLock* tablet_lock, + TableMetadataLock* table_lock) { + + DCHECK(tablet_lock->is_write_locked()); + ConsensusStatePB prev_cstate = tablet_lock->mutable_data()->pb.committed_consensus_state(); + const ConsensusStatePB& cstate = report.committed_consensus_state(); + *tablet_lock->mutable_data()->pb.mutable_committed_consensus_state() = cstate; + + TabletInfo::ReplicaMap replica_locations; + for (const consensus::RaftPeerPB& peer : cstate.config().peers()) { + shared_ptr ts_desc; + if (!peer.has_permanent_uuid()) { + return Status::InvalidArgument("Missing UUID for peer", peer.ShortDebugString()); + } + if (!master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) { + LOG_WITH_PREFIX(WARNING) << "Tablet server has never reported in. " + << "Not including in replica locations map yet. Peer: " << peer.ShortDebugString() + << "; Tablet: " << tablet->ToString(); + continue; + } + + TabletReplica replica; + NewReplica(ts_desc.get(), report, &replica); + InsertOrDie(&replica_locations, replica.ts_desc->permanent_uuid(), replica); + } + tablet->SetReplicaLocations(replica_locations); + + if (FLAGS_master_tombstone_evicted_tablet_replicas) { + unordered_set current_member_uuids; + for (const consensus::RaftPeerPB& peer : cstate.config().peers()) { + InsertOrDie(¤t_member_uuids, peer.permanent_uuid()); + } + // Send a DeleteTablet() request to peers that are not in the new config. 
+ for (const consensus::RaftPeerPB& prev_peer : prev_cstate.config().peers()) { + const string& peer_uuid = prev_peer.permanent_uuid(); + if (!ContainsKey(current_member_uuids, peer_uuid)) { + shared_ptr ts_desc; + if (!master_->ts_manager()->LookupTSByUUID(peer_uuid, &ts_desc)) continue; + SendDeleteTabletRequest(report.tablet_id(), TABLET_DATA_TOMBSTONED, + prev_cstate.config().opid_index(), tablet->table(), ts_desc.get(), + Substitute("TS $0 not found in new config with opid_index $1", + peer_uuid, cstate.config().opid_index())); + } + } + } + + // If the config is under-replicated, add a server to the config. + if (FLAGS_master_add_server_when_underreplicated && + CountVoters(cstate.config()) < table_lock->data().pb.num_replicas()) { + SendAddServerRequest(tablet, cstate); + } + + return Status::OK(); +} + +void CatalogManager::AddReplicaToTabletIfNotFound(TSDescriptor* ts_desc, + const ReportedTabletPB& report, + const scoped_refptr& tablet) { + TabletReplica replica; + NewReplica(ts_desc, report, &replica); + // Only inserts if a replica with a matching UUID was not already present. + ignore_result(tablet->AddToReplicaLocations(replica)); +} + +void CatalogManager::NewReplica(TSDescriptor* ts_desc, + const ReportedTabletPB& report, + TabletReplica* replica) { + CHECK(report.has_committed_consensus_state()) << "No cstate: " << report.ShortDebugString(); + replica->state = report.state(); + replica->role = GetConsensusRole(ts_desc->permanent_uuid(), report.committed_consensus_state()); + replica->ts_desc = ts_desc; +} + +Status CatalogManager::GetTabletPeer(const string& tablet_id, + scoped_refptr* tablet_peer) const { + // Note: CatalogManager has only one table, 'sys_catalog', with only + // one tablet. 
+ boost::shared_lock l(lock_); + CHECK(sys_catalog_.get() != nullptr) << "sys_catalog_ must be initialized!"; + if (sys_catalog_->tablet_id() == tablet_id) { + *tablet_peer = sys_catalog_->tablet_peer(); + } else { + return Status::NotFound(Substitute("no SysTable exists with tablet_id $0 in CatalogManager", + tablet_id)); + } + return Status::OK(); +} + +const NodeInstancePB& CatalogManager::NodeInstance() const { + return master_->instance_pb(); +} + +Status CatalogManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) { + return Status::NotSupported("Remote bootstrap not yet implemented for the master tablet"); +} + +// Interface used by RetryingTSRpcTask to pick the tablet server to +// send the next RPC to. +class TSPicker { + public: + TSPicker() {} + virtual ~TSPicker() {} + + // Sets *ts_desc to the tablet server to contact for the next RPC. + // + // This assumes that TSDescriptors are never deleted by the master, + // so the caller does not take ownership of the returned pointer. + virtual Status PickReplica(TSDescriptor** ts_desc) = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(TSPicker); +}; + +// Implementation of TSPicker which sends to a specific tablet server, +// identified by its UUID. +class PickSpecificUUID : public TSPicker { + public: + PickSpecificUUID(Master* master, string ts_uuid) + : master_(master), ts_uuid_(std::move(ts_uuid)) {} + + virtual Status PickReplica(TSDescriptor** ts_desc) OVERRIDE { + shared_ptr ts; + if (!master_->ts_manager()->LookupTSByUUID(ts_uuid_, &ts)) { + return Status::NotFound("unknown tablet server ID", ts_uuid_); + } + *ts_desc = ts.get(); + return Status::OK(); + } + + private: + Master* const master_; + const string ts_uuid_; + + DISALLOW_COPY_AND_ASSIGN(PickSpecificUUID); +}; + +// Implementation of TSPicker which locates the current leader replica, +// and sends the RPC to that server. 
+class PickLeaderReplica : public TSPicker { + public: + explicit PickLeaderReplica(const scoped_refptr& tablet) : + tablet_(tablet) { + } + + virtual Status PickReplica(TSDescriptor** ts_desc) OVERRIDE { + TabletInfo::ReplicaMap replica_locations; + tablet_->GetReplicaLocations(&replica_locations); + for (const TabletInfo::ReplicaMap::value_type& r : replica_locations) { + if (r.second.role == consensus::RaftPeerPB::LEADER) { + *ts_desc = r.second.ts_desc; + return Status::OK(); + } + } + return Status::NotFound("no leader"); + } + + private: + const scoped_refptr tablet_; +}; + +// A background task which continuously retries sending an RPC to a tablet server. +// +// The target tablet server is refreshed before each RPC by consulting the provided +// TSPicker implementation. +class RetryingTSRpcTask : public MonitoredTask { + public: + RetryingTSRpcTask(Master *master, + ThreadPool* callback_pool, + gscoped_ptr replica_picker, + const scoped_refptr& table) + : master_(master), + callback_pool_(callback_pool), + replica_picker_(replica_picker.Pass()), + table_(table), + start_ts_(MonoTime::Now(MonoTime::FINE)), + attempt_(0), + state_(kStateRunning) { + deadline_ = start_ts_; + deadline_.AddDelta(MonoDelta::FromMilliseconds(FLAGS_unresponsive_ts_rpc_timeout_ms)); + } + + // Send the subclass RPC request. + Status Run() { + Status s = ResetTSProxy(); + if (!s.ok()) { + LOG(WARNING) << "Unable to reset TS proxy: " << s.ToString(); + MarkFailed(); + UnregisterAsyncTask(); // May delete this. + return s.CloneAndPrepend("Failed to reset TS proxy"); + } + + // Calculate and set the timeout deadline. + MonoTime timeout = MonoTime::Now(MonoTime::FINE); + timeout.AddDelta(MonoDelta::FromMilliseconds(FLAGS_master_ts_rpc_timeout_ms)); + const MonoTime& deadline = MonoTime::Earliest(timeout, deadline_); + rpc_.set_deadline(deadline); + + if (!SendRequest(++attempt_)) { + if (!RescheduleWithBackoffDelay()) { + UnregisterAsyncTask(); // May call 'delete this'. 
+ } + } + return Status::OK(); + } + + // Abort this task. + virtual void Abort() OVERRIDE { + MarkAborted(); + } + + virtual State state() const OVERRIDE { + return static_cast(NoBarrier_Load(&state_)); + } + + virtual MonoTime start_timestamp() const OVERRIDE { return start_ts_; } + virtual MonoTime completion_timestamp() const OVERRIDE { return end_ts_; } + + protected: + // Send an RPC request and register a callback. + // The implementation must return true if the callback was registered, and + // false if an error occurred and no callback will occur. + virtual bool SendRequest(int attempt) = 0; + + // Handle the response from the RPC request. On success, MarkSuccess() must + // be called to mutate the state_ variable. If retry is desired, then + // no state change is made. Retries will automatically be attempted as long + // as the state is kStateRunning and deadline_ has not yet passed. + virtual void HandleResponse(int attempt) = 0; + + // Return the id of the tablet that is the subject of the async request. + virtual string tablet_id() const = 0; + + // Overridable log prefix with reasonable default. + virtual string LogPrefix() const { + return Substitute("$0: ", description()); + } + + // Transition from running -> complete. + void MarkComplete() { + NoBarrier_CompareAndSwap(&state_, kStateRunning, kStateComplete); + } + + // Transition from running -> aborted. + void MarkAborted() { + NoBarrier_CompareAndSwap(&state_, kStateRunning, kStateAborted); + } + + // Transition from running -> failed. + void MarkFailed() { + NoBarrier_CompareAndSwap(&state_, kStateRunning, kStateFailed); + } + + // Callback meant to be invoked from asynchronous RPC service proxy calls. + void RpcCallback() { + // Defer the actual work of the callback off of the reactor thread. + // This is necessary because our callbacks often do synchronous writes to + // the catalog table, and we can't do synchronous IO on the reactor. 
+ CHECK_OK(callback_pool_->SubmitClosure( + Bind(&RetryingTSRpcTask::DoRpcCallback, + Unretained(this)))); + } + + // Handle the actual work of the RPC callback. This is run on the master's worker + // pool, rather than a reactor thread, so it may do blocking IO operations. + void DoRpcCallback() { + if (!rpc_.status().ok()) { + LOG(WARNING) << "TS " << target_ts_desc_->permanent_uuid() << ": " + << type_name() << " RPC failed for tablet " + << tablet_id() << ": " << rpc_.status().ToString(); + } else if (state() != kStateAborted) { + HandleResponse(attempt_); // Modifies state_. + } + + // Schedule a retry if the RPC call was not successful. + if (RescheduleWithBackoffDelay()) { + return; + } + + UnregisterAsyncTask(); // May call 'delete this'. + } + + Master * const master_; + ThreadPool* const callback_pool_; + const gscoped_ptr replica_picker_; + const scoped_refptr table_; + + MonoTime start_ts_; + MonoTime end_ts_; + MonoTime deadline_; + + int attempt_; + rpc::RpcController rpc_; + TSDescriptor* target_ts_desc_; + shared_ptr ts_proxy_; + shared_ptr consensus_proxy_; + + private: + // Reschedules the current task after a backoff delay. + // Returns false if the task was not rescheduled due to reaching the maximum + // timeout or because the task is no longer in a running state. + // Returns true if rescheduling the task was successful. + bool RescheduleWithBackoffDelay() { + if (state() != kStateRunning) return false; + MonoTime now = MonoTime::Now(MonoTime::FINE); + // We assume it might take 10ms to process the request in the best case, + // fail if we have less than that amount of time remaining. + int64_t millis_remaining = deadline_.GetDeltaSince(now).ToMilliseconds() - 10; + // Exponential backoff with jitter. + int64_t base_delay_ms; + if (attempt_ <= 12) { + base_delay_ms = 1 << (attempt_ + 3); // 1st retry delayed 2^4 ms, 2nd 2^5, etc. 
+ } else { + base_delay_ms = 60 * 1000; // cap at 1 minute + } + int64_t jitter_ms = rand() % 50; // Add up to 50ms of additional random delay. + int64_t delay_millis = std::min(base_delay_ms + jitter_ms, millis_remaining); + + if (delay_millis <= 0) { + LOG(WARNING) << "Request timed out: " << description(); + MarkFailed(); + } else { + MonoTime new_start_time = now; + new_start_time.AddDelta(MonoDelta::FromMilliseconds(delay_millis)); + LOG(INFO) << "Scheduling retry of " << description() << " with a delay" + << " of " << delay_millis << "ms (attempt = " << attempt_ << ")..."; + master_->messenger()->ScheduleOnReactor( + boost::bind(&RetryingTSRpcTask::RunDelayedTask, this, _1), + MonoDelta::FromMilliseconds(delay_millis)); + return true; + } + return false; + } + + // Callback for Reactor delayed task mechanism. Called either when it is time + // to execute the delayed task (with status == OK) or when the task + // is cancelled, i.e. when the scheduling timer is shut down (status != OK). + void RunDelayedTask(const Status& status) { + if (!status.ok()) { + LOG(WARNING) << "Async tablet task " << description() << " failed or was cancelled: " + << status.ToString(); + UnregisterAsyncTask(); // May delete this. + return; + } + + string desc = description(); // Save in case we need to log after deletion. + Status s = Run(); // May delete this. + if (!s.ok()) { + LOG(WARNING) << "Async tablet task " << desc << " failed: " << s.ToString(); + } + } + + // Clean up request and release resources. May call 'delete this'. + void UnregisterAsyncTask() { + end_ts_ = MonoTime::Now(MonoTime::FINE); + if (table_ != nullptr) { + table_->RemoveTask(this); + } else { + // This is a floating task (since the table does not exist) + // created as response to a tablet report. + Release(); // May call "delete this"; + } + } + + Status ResetTSProxy() { + // TODO: if there is no replica available, should we still keep the task running? 
+ RETURN_NOT_OK(replica_picker_->PickReplica(&target_ts_desc_)); + + shared_ptr ts_proxy; + RETURN_NOT_OK(target_ts_desc_->GetTSAdminProxy(master_->messenger(), &ts_proxy)); + ts_proxy_.swap(ts_proxy); + + shared_ptr consensus_proxy; + RETURN_NOT_OK(target_ts_desc_->GetConsensusProxy(master_->messenger(), &consensus_proxy)); + consensus_proxy_.swap(consensus_proxy); + + rpc_.Reset(); + return Status::OK(); + } + + // Use state() and MarkX() accessors. + AtomicWord state_; +}; + +// RetryingTSRpcTask subclass which always retries the same tablet server, +// identified by its UUID. +class RetrySpecificTSRpcTask : public RetryingTSRpcTask { + public: + RetrySpecificTSRpcTask(Master* master, + ThreadPool* callback_pool, + const string& permanent_uuid, + const scoped_refptr& table) + : RetryingTSRpcTask(master, + callback_pool, + gscoped_ptr(new PickSpecificUUID(master, permanent_uuid)), + table), + permanent_uuid_(permanent_uuid) { + } + + protected: + const string permanent_uuid_; +}; + +// Fire off the async create tablet. +// This requires that the new tablet info is locked for write, and the +// consensus configuration information has been filled into the 'dirty' data. 
+class AsyncCreateReplica : public RetrySpecificTSRpcTask { + public: + AsyncCreateReplica(Master *master, + ThreadPool *callback_pool, + const string& permanent_uuid, + const scoped_refptr& tablet) + : RetrySpecificTSRpcTask(master, callback_pool, permanent_uuid, tablet->table().get()), + tablet_id_(tablet->tablet_id()) { + deadline_ = start_ts_; + deadline_.AddDelta(MonoDelta::FromMilliseconds(FLAGS_tablet_creation_timeout_ms)); + + TableMetadataLock table_lock(tablet->table().get(), TableMetadataLock::READ); + const SysTabletsEntryPB& tablet_pb = tablet->metadata().dirty().pb; + + req_.set_dest_uuid(permanent_uuid); + req_.set_table_id(tablet->table()->id()); + req_.set_tablet_id(tablet->tablet_id()); + req_.mutable_partition()->CopyFrom(tablet_pb.partition()); + req_.set_table_name(table_lock.data().pb.name()); + req_.mutable_schema()->CopyFrom(table_lock.data().pb.schema()); + req_.mutable_partition_schema()->CopyFrom(table_lock.data().pb.partition_schema()); + req_.mutable_config()->CopyFrom(tablet_pb.committed_consensus_state().config()); + } + + virtual string type_name() const OVERRIDE { return "Create Tablet"; } + + virtual string description() const OVERRIDE { + return "CreateTablet RPC for tablet " + tablet_id_ + " on TS " + permanent_uuid_; + } + + protected: + virtual string tablet_id() const OVERRIDE { return tablet_id_; } + + virtual void HandleResponse(int attempt) OVERRIDE { + if (!resp_.has_error()) { + MarkComplete(); + } else { + Status s = StatusFromPB(resp_.error().status()); + if (s.IsAlreadyPresent()) { + LOG(INFO) << "CreateTablet RPC for tablet " << tablet_id_ + << " on TS " << permanent_uuid_ << " returned already present: " + << s.ToString(); + MarkComplete(); + } else { + LOG(WARNING) << "CreateTablet RPC for tablet " << tablet_id_ + << " on TS " << permanent_uuid_ << " failed: " << s.ToString(); + } + } + } + + virtual bool SendRequest(int attempt) OVERRIDE { + ts_proxy_->CreateTabletAsync(req_, &resp_, &rpc_, + 
boost::bind(&AsyncCreateReplica::RpcCallback, this)); + VLOG(1) << "Send create tablet request to " << permanent_uuid_ << ":\n" + << " (attempt " << attempt << "):\n" + << req_.DebugString(); + return true; + } + + private: + const string tablet_id_; + tserver::CreateTabletRequestPB req_; + tserver::CreateTabletResponsePB resp_; +}; + +// Send a DeleteTablet() RPC request. +class AsyncDeleteReplica : public RetrySpecificTSRpcTask { + public: + AsyncDeleteReplica( + Master* master, ThreadPool* callback_pool, const string& permanent_uuid, + const scoped_refptr& table, std::string tablet_id, + TabletDataState delete_type, + boost::optional cas_config_opid_index_less_or_equal, + string reason) + : RetrySpecificTSRpcTask(master, callback_pool, permanent_uuid, table), + tablet_id_(std::move(tablet_id)), + delete_type_(delete_type), + cas_config_opid_index_less_or_equal_( + std::move(cas_config_opid_index_less_or_equal)), + reason_(std::move(reason)) {} + + virtual string type_name() const OVERRIDE { return "Delete Tablet"; } + + virtual string description() const OVERRIDE { + return tablet_id_ + " Delete Tablet RPC for TS=" + permanent_uuid_; + } + + protected: + virtual string tablet_id() const OVERRIDE { return tablet_id_; } + + virtual void HandleResponse(int attempt) OVERRIDE { + if (resp_.has_error()) { + Status status = StatusFromPB(resp_.error().status()); + + // Do not retry on a fatal error + TabletServerErrorPB::Code code = resp_.error().code(); + switch (code) { + case TabletServerErrorPB::TABLET_NOT_FOUND: + LOG(WARNING) << "TS " << permanent_uuid_ << ": delete failed for tablet " << tablet_id_ + << " because the tablet was not found. No further retry: " + << status.ToString(); + MarkComplete(); + break; + case TabletServerErrorPB::CAS_FAILED: + LOG(WARNING) << "TS " << permanent_uuid_ << ": delete failed for tablet " << tablet_id_ + << " due to a CAS failure. 
No further retry: " << status.ToString(); + MarkComplete(); + break; + default: + LOG(WARNING) << "TS " << permanent_uuid_ << ": delete failed for tablet " << tablet_id_ + << " with error code " << TabletServerErrorPB::Code_Name(code) + << ": " << status.ToString(); + break; + } + } else { + master_->catalog_manager()->NotifyTabletDeleteSuccess(permanent_uuid_, tablet_id_); + if (table_) { + LOG(INFO) << "TS " << permanent_uuid_ << ": tablet " << tablet_id_ + << " (table " << table_->ToString() << ") successfully deleted"; + } else { + LOG(WARNING) << "TS " << permanent_uuid_ << ": tablet " << tablet_id_ + << " did not belong to a known table, but was successfully deleted"; + } + MarkComplete(); + VLOG(1) << "TS " << permanent_uuid_ << ": delete complete on tablet " << tablet_id_; + } + } + + virtual bool SendRequest(int attempt) OVERRIDE { + tserver::DeleteTabletRequestPB req; + req.set_dest_uuid(permanent_uuid_); + req.set_tablet_id(tablet_id_); + req.set_reason(reason_); + req.set_delete_type(delete_type_); + if (cas_config_opid_index_less_or_equal_) { + req.set_cas_config_opid_index_less_or_equal(*cas_config_opid_index_less_or_equal_); + } + + ts_proxy_->DeleteTabletAsync(req, &resp_, &rpc_, + boost::bind(&AsyncDeleteReplica::RpcCallback, this)); + VLOG(1) << "Send delete tablet request to " << permanent_uuid_ + << " (attempt " << attempt << "):\n" + << req.DebugString(); + return true; + } + + const std::string tablet_id_; + const TabletDataState delete_type_; + const boost::optional cas_config_opid_index_less_or_equal_; + const std::string reason_; + tserver::DeleteTabletResponsePB resp_; +}; + +// Send the "Alter Table" with the latest table schema to the leader replica +// for the tablet. +// Keeps retrying until we get an "ok" response. +// - Alter completed +// - Tablet has already a newer version +// (which may happen in case of concurrent alters, or in case a previous attempt timed +// out but was actually applied). 
+class AsyncAlterTable : public RetryingTSRpcTask { + public: + AsyncAlterTable(Master *master, + ThreadPool* callback_pool, + const scoped_refptr& tablet) + : RetryingTSRpcTask(master, + callback_pool, + gscoped_ptr(new PickLeaderReplica(tablet)), + tablet->table().get()), + tablet_(tablet) { + } + + virtual string type_name() const OVERRIDE { return "Alter Table"; } + + virtual string description() const OVERRIDE { + return tablet_->ToString() + " Alter Table RPC"; + } + + private: + virtual string tablet_id() const OVERRIDE { return tablet_->tablet_id(); } + string permanent_uuid() const { + return target_ts_desc_->permanent_uuid(); + } + + virtual void HandleResponse(int attempt) OVERRIDE { + if (resp_.has_error()) { + Status status = StatusFromPB(resp_.error().status()); + + // Do not retry on a fatal error + switch (resp_.error().code()) { + case TabletServerErrorPB::TABLET_NOT_FOUND: + case TabletServerErrorPB::MISMATCHED_SCHEMA: + case TabletServerErrorPB::TABLET_HAS_A_NEWER_SCHEMA: + LOG(WARNING) << "TS " << permanent_uuid() << ": alter failed for tablet " + << tablet_->ToString() << " no further retry: " << status.ToString(); + MarkComplete(); + break; + default: + LOG(WARNING) << "TS " << permanent_uuid() << ": alter failed for tablet " + << tablet_->ToString() << ": " << status.ToString(); + break; + } + } else { + MarkComplete(); + VLOG(1) << "TS " << permanent_uuid() << ": alter complete on tablet " << tablet_->ToString(); + } + + if (state() == kStateComplete) { + master_->catalog_manager()->HandleTabletSchemaVersionReport(tablet_.get(), schema_version_); + } else { + VLOG(1) << "Still waiting for other tablets to finish ALTER"; + } + } + + virtual bool SendRequest(int attempt) OVERRIDE { + TableMetadataLock l(tablet_->table().get(), TableMetadataLock::READ); + + tserver::AlterSchemaRequestPB req; + req.set_dest_uuid(permanent_uuid()); + req.set_tablet_id(tablet_->tablet_id()); + req.set_new_table_name(l.data().pb.name()); + 
req.set_schema_version(l.data().pb.version()); + req.mutable_schema()->CopyFrom(l.data().pb.schema()); + schema_version_ = l.data().pb.version(); + + l.Unlock(); + + ts_proxy_->AlterSchemaAsync(req, &resp_, &rpc_, + boost::bind(&AsyncAlterTable::RpcCallback, this)); + VLOG(1) << "Send alter table request to " << permanent_uuid() + << " (attempt " << attempt << "):\n" + << req.DebugString(); + return true; + } + + uint32_t schema_version_; + scoped_refptr tablet_; + tserver::AlterSchemaResponsePB resp_; +}; + +namespace { + +// Select a random TS not in the 'exclude_uuids' list. +// Will not select tablet servers that have not heartbeated recently. +// Returns true iff it was possible to select a replica. +bool SelectRandomTSForReplica(const TSDescriptorVector& ts_descs, + const unordered_set& exclude_uuids, + shared_ptr* selection) { + TSDescriptorVector tablet_servers; + for (const shared_ptr& ts : ts_descs) { + if (!ContainsKey(exclude_uuids, ts->permanent_uuid())) { + tablet_servers.push_back(ts); + } + } + if (tablet_servers.empty()) { + return false; + } + *selection = tablet_servers[rand() % tablet_servers.size()]; + return true; +} + +} // anonymous namespace + +class AsyncAddServerTask : public RetryingTSRpcTask { + public: + AsyncAddServerTask(Master *master, + ThreadPool* callback_pool, + const scoped_refptr& tablet, + const ConsensusStatePB& cstate) + : RetryingTSRpcTask(master, + callback_pool, + gscoped_ptr(new PickLeaderReplica(tablet)), + tablet->table()), + tablet_(tablet), + cstate_(cstate) { + deadline_ = MonoTime::Max(); // Never time out. 
+ } + + virtual string type_name() const OVERRIDE { return "AddServer ChangeConfig"; } + + virtual string description() const OVERRIDE { + return Substitute("AddServer ChangeConfig RPC for tablet $0 on peer $1 " + "with cas_config_opid_index $2", + tablet_->tablet_id(), permanent_uuid(), cstate_.config().opid_index()); + } + + protected: + virtual bool SendRequest(int attempt) OVERRIDE; + virtual void HandleResponse(int attempt) OVERRIDE; + + private: + virtual string tablet_id() const OVERRIDE { return tablet_->tablet_id(); } + string permanent_uuid() const { + return target_ts_desc_->permanent_uuid(); + } + + const scoped_refptr tablet_; + const ConsensusStatePB cstate_; + + consensus::ChangeConfigRequestPB req_; + consensus::ChangeConfigResponsePB resp_; +}; + +bool AsyncAddServerTask::SendRequest(int attempt) { + // Bail if we're retrying in vain. + int64_t latest_index; + { + TabletMetadataLock tablet_lock(tablet_.get(), TabletMetadataLock::READ); + latest_index = tablet_lock.data().pb.committed_consensus_state().config().opid_index(); + } + if (latest_index > cstate_.config().opid_index()) { + LOG_WITH_PREFIX(INFO) << "Latest config for has opid_index of " << latest_index + << " while this task has opid_index of " + << cstate_.config().opid_index() << ". Aborting task."; + MarkAborted(); + return false; + } + + // Select the replica we wish to add to the config. + // Do not include current members of the config. 
+ unordered_set replica_uuids; + for (const RaftPeerPB& peer : cstate_.config().peers()) { + InsertOrDie(&replica_uuids, peer.permanent_uuid()); + } + TSDescriptorVector ts_descs; + master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); + shared_ptr replacement_replica; + if (PREDICT_FALSE(!SelectRandomTSForReplica(ts_descs, replica_uuids, &replacement_replica))) { + KLOG_EVERY_N(WARNING, 100) << LogPrefix() << "No candidate replacement replica found " + << "for tablet " << tablet_->ToString(); + return false; + } + + req_.set_dest_uuid(permanent_uuid()); + req_.set_tablet_id(tablet_->tablet_id()); + req_.set_type(consensus::ADD_SERVER); + req_.set_cas_config_opid_index(cstate_.config().opid_index()); + RaftPeerPB* peer = req_.mutable_server(); + peer->set_permanent_uuid(replacement_replica->permanent_uuid()); + TSRegistrationPB peer_reg; + replacement_replica->GetRegistration(&peer_reg); + if (peer_reg.rpc_addresses_size() == 0) { + KLOG_EVERY_N(WARNING, 100) << LogPrefix() << "Candidate replacement " + << replacement_replica->permanent_uuid() + << " has no registered rpc address: " + << peer_reg.ShortDebugString(); + return false; + } + *peer->mutable_last_known_addr() = peer_reg.rpc_addresses(0); + peer->set_member_type(RaftPeerPB::VOTER); + consensus_proxy_->ChangeConfigAsync(req_, &resp_, &rpc_, + boost::bind(&AsyncAddServerTask::RpcCallback, this)); + VLOG(1) << "Sent AddServer ChangeConfig request to " << permanent_uuid() << ":\n" + << req_.DebugString(); + return true; +} + +void AsyncAddServerTask::HandleResponse(int attempt) { + if (!resp_.has_error()) { + MarkComplete(); + LOG_WITH_PREFIX(INFO) << "Change config succeeded"; + return; + } + + Status status = StatusFromPB(resp_.error().status()); + + // Do not retry on a CAS error, otherwise retry forever or until cancelled. 
+ switch (resp_.error().code()) { + case TabletServerErrorPB::CAS_FAILED: + LOG_WITH_PREFIX(WARNING) << "ChangeConfig() failed with leader " << permanent_uuid() + << " due to CAS failure. No further retry: " + << status.ToString(); + MarkFailed(); + break; + default: + LOG_WITH_PREFIX(INFO) << "ChangeConfig() failed with leader " << permanent_uuid() + << " due to error " + << TabletServerErrorPB::Code_Name(resp_.error().code()) + << ". This operation will be retried. Error detail: " + << status.ToString(); + break; + } +} + +void CatalogManager::SendAlterTableRequest(const scoped_refptr& table) { + vector > tablets; + table->GetAllTablets(&tablets); + + for (const scoped_refptr& tablet : tablets) { + SendAlterTabletRequest(tablet); + } +} + +void CatalogManager::SendAlterTabletRequest(const scoped_refptr& tablet) { + auto call = new AsyncAlterTable(master_, worker_pool_.get(), tablet); + tablet->table()->AddTask(call); + WARN_NOT_OK(call->Run(), "Failed to send alter table request"); +} + +void CatalogManager::DeleteTabletReplicas( + const TabletInfo* tablet, + const std::string& msg) { + TabletInfo::ReplicaMap locations; + tablet->GetReplicaLocations(&locations); + LOG(INFO) << "Sending DeleteTablet for " << locations.size() + << " replicas of tablet " << tablet->tablet_id(); + for (const TabletInfo::ReplicaMap::value_type& r : locations) { + SendDeleteTabletRequest(tablet->tablet_id(), TABLET_DATA_DELETED, + boost::none, tablet->table(), r.second.ts_desc, msg); + } +} + +void CatalogManager::DeleteTabletsAndSendRequests(const scoped_refptr& table) { + vector > tablets; + table->GetAllTablets(&tablets); + + string deletion_msg = "Table deleted at " + LocalTimeAsString(); + + for (const scoped_refptr& tablet : tablets) { + DeleteTabletReplicas(tablet.get(), deletion_msg); + + TabletMetadataLock tablet_lock(tablet.get(), TabletMetadataLock::WRITE); + tablet_lock.mutable_data()->set_state(SysTabletsEntryPB::DELETED, deletion_msg); + 
CHECK_OK(sys_catalog_->UpdateTablets({ tablet.get() })); + tablet_lock.Commit(); + } +} + +void CatalogManager::SendDeleteTabletRequest( + const std::string& tablet_id, + TabletDataState delete_type, + const boost::optional& cas_config_opid_index_less_or_equal, + const scoped_refptr& table, + TSDescriptor* ts_desc, + const string& reason) { + LOG_WITH_PREFIX(INFO) << Substitute("Deleting tablet $0 on peer $1 " + "with delete type $2 ($3)", + tablet_id, ts_desc->permanent_uuid(), + TabletDataState_Name(delete_type), + reason); + AsyncDeleteReplica* call = + new AsyncDeleteReplica(master_, worker_pool_.get(), ts_desc->permanent_uuid(), table, + tablet_id, delete_type, cas_config_opid_index_less_or_equal, + reason); + if (table != nullptr) { + table->AddTask(call); + } else { + // This is a floating task (since the table does not exist) + // created as response to a tablet report. + call->AddRef(); + } + WARN_NOT_OK(call->Run(), "Failed to send delete tablet request"); +} + +void CatalogManager::SendAddServerRequest(const scoped_refptr& tablet, + const ConsensusStatePB& cstate) { + auto task = new AsyncAddServerTask(master_, worker_pool_.get(), tablet, cstate); + tablet->table()->AddTask(task); + WARN_NOT_OK(task->Run(), "Failed to send new AddServer request"); + + // Need to print this after Run() because that's where it picks the TS which description() + // needs. + LOG(INFO) << "Started AddServer task: " << task->description(); +} + +void CatalogManager::ExtractTabletsToProcess( + std::vector > *tablets_to_delete, + std::vector > *tablets_to_process) { + + boost::shared_lock l(lock_); + + // TODO: At the moment we loop through all the tablets + // we can keep a set of tablets waiting for "assignment" + // or just a counter to avoid to take the lock and loop through the tablets + // if everything is "stable". 
+ + for (const TabletInfoMap::value_type& entry : tablet_map_) { + scoped_refptr tablet = entry.second; + TabletMetadataLock tablet_lock(tablet.get(), TabletMetadataLock::READ); + + if (!tablet->table()) { + // Tablet is orphaned or in preparing state, continue. + continue; + } + + TableMetadataLock table_lock(tablet->table().get(), TableMetadataLock::READ); + + // If the table is deleted or the tablet was replaced at table creation time. + if (tablet_lock.data().is_deleted() || table_lock.data().is_deleted()) { + tablets_to_delete->push_back(tablet); + continue; + } + + // Running tablets. + if (tablet_lock.data().is_running()) { + // TODO: handle last update > not responding timeout? + continue; + } + + // Tablets not yet assigned or with a report just received + tablets_to_process->push_back(tablet); + } +} + +struct DeferredAssignmentActions { + vector tablets_to_add; + vector tablets_to_update; + vector needs_create_rpc; +}; + +void CatalogManager::HandleAssignPreparingTablet(TabletInfo* tablet, + DeferredAssignmentActions* deferred) { + // The tablet was just created (probably by a CreateTable RPC). + // Update the state to "creating" to be ready for the creation request. 
+ tablet->mutable_metadata()->mutable_dirty()->set_state( + SysTabletsEntryPB::CREATING, "Sending initial creation of tablet"); + deferred->tablets_to_update.push_back(tablet); + deferred->needs_create_rpc.push_back(tablet); + VLOG(1) << "Assign new tablet " << tablet->ToString(); +} + +void CatalogManager::HandleAssignCreatingTablet(TabletInfo* tablet, + DeferredAssignmentActions* deferred, + vector >* new_tablets) { + MonoDelta time_since_updated = + MonoTime::Now(MonoTime::FINE).GetDeltaSince(tablet->last_update_time()); + int64_t remaining_timeout_ms = + FLAGS_tablet_creation_timeout_ms - time_since_updated.ToMilliseconds(); + + // Skip the tablet if the assignment timeout is not yet expired + if (remaining_timeout_ms > 0) { + VLOG(2) << "Tablet " << tablet->ToString() << " still being created. " + << remaining_timeout_ms << "ms remain until timeout."; + return; + } + + const PersistentTabletInfo& old_info = tablet->metadata().state(); + + // The "tablet creation" was already sent, but we didn't receive an answer + // within the timeout. So the tablet will be replaced by a new one. + TabletInfo *replacement = CreateTabletInfo(tablet->table().get(), + old_info.pb.partition()); + LOG(WARNING) << "Tablet " << tablet->ToString() << " was not created within " + << "the allowed timeout. Replacing with a new tablet " + << replacement->tablet_id(); + + tablet->table()->AddTablet(replacement); + { + boost::lock_guard l_maps(lock_); + tablet_map_[replacement->tablet_id()] = replacement; + } + + // Mark old tablet as replaced. + tablet->mutable_metadata()->mutable_dirty()->set_state( + SysTabletsEntryPB::REPLACED, + Substitute("Replaced by $0 at $1", + replacement->tablet_id(), LocalTimeAsString())); + + // Mark new tablet as being created. 
+ replacement->mutable_metadata()->mutable_dirty()->set_state( + SysTabletsEntryPB::CREATING, + Substitute("Replacement for $0", tablet->tablet_id())); + + deferred->tablets_to_update.push_back(tablet); + deferred->tablets_to_add.push_back(replacement); + deferred->needs_create_rpc.push_back(replacement); + VLOG(1) << "Replaced tablet " << tablet->tablet_id() + << " with " << replacement->tablet_id() + << " (Table " << tablet->table()->ToString() << ")"; + + new_tablets->push_back(replacement); +} + +// TODO: we could batch the IO onto a background thread. +// but this is following the current HandleReportedTablet() +Status CatalogManager::HandleTabletSchemaVersionReport(TabletInfo *tablet, uint32_t version) { + // Update the schema version if it's the latest + tablet->set_reported_schema_version(version); + + // Verify if it's the last tablet report, and the alter completed. + TableInfo *table = tablet->table().get(); + TableMetadataLock l(table, TableMetadataLock::WRITE); + if (l.data().is_deleted() || l.data().pb.state() != SysTablesEntryPB::ALTERING) { + return Status::OK(); + } + + uint32_t current_version = l.data().pb.version(); + if (table->IsAlterInProgress(current_version)) { + return Status::OK(); + } + + // Update the state from altering to running and remove the last fully + // applied schema (if it exists). + l.mutable_data()->pb.clear_fully_applied_schema(); + l.mutable_data()->set_state(SysTablesEntryPB::RUNNING, + Substitute("Current schema version=$0", current_version)); + + Status s = sys_catalog_->UpdateTable(table); + if (!s.ok()) { + LOG(WARNING) << "An error occurred while updating sys-tables: " << s.ToString(); + return s; + } + + l.Commit(); + LOG(INFO) << table->ToString() << " - Alter table completed version=" << current_version; + return Status::OK(); +} + +// Helper class to commit TabletInfo mutations at the end of a scope. 
+namespace { + +class ScopedTabletInfoCommitter { + public: + explicit ScopedTabletInfoCommitter(const std::vector >* tablets) + : tablets_(DCHECK_NOTNULL(tablets)), + aborted_(false) { + } + + // This method is not thread safe. Must be called by the same thread + // that would destroy this instance. + void Abort() { + for (const scoped_refptr& tablet : *tablets_) { + tablet->mutable_metadata()->AbortMutation(); + } + aborted_ = true; + } + + // Commit the transactions. + ~ScopedTabletInfoCommitter() { + if (PREDICT_TRUE(!aborted_)) { + for (const scoped_refptr& tablet : *tablets_) { + tablet->mutable_metadata()->CommitMutation(); + } + } + } + + private: + const std::vector >* tablets_; + bool aborted_; +}; +} // anonymous namespace + +Status CatalogManager::ProcessPendingAssignments( + const std::vector >& tablets) { + VLOG(1) << "Processing pending assignments"; + + // Take write locks on all tablets to be processed, and ensure that they are + // unlocked at the end of this scope. + for (const scoped_refptr& tablet : tablets) { + tablet->mutable_metadata()->StartMutation(); + } + ScopedTabletInfoCommitter unlocker_in(&tablets); + + // Any tablets created by the helper functions will also be created in a + // locked state, so we must ensure they are unlocked before we return to + // avoid deadlocks. + std::vector > new_tablets; + ScopedTabletInfoCommitter unlocker_out(&new_tablets); + + DeferredAssignmentActions deferred; + + // Iterate over each of the tablets and handle it, whatever state + // it may be in. The actions required for the tablet are collected + // into 'deferred'. 
+ for (const scoped_refptr& tablet : tablets) { + SysTabletsEntryPB::State t_state = tablet->metadata().state().pb.state(); + + switch (t_state) { + case SysTabletsEntryPB::PREPARING: + HandleAssignPreparingTablet(tablet.get(), &deferred); + break; + + case SysTabletsEntryPB::CREATING: + HandleAssignCreatingTablet(tablet.get(), &deferred, &new_tablets); + break; + + default: + VLOG(2) << "Nothing to do for tablet " << tablet->tablet_id() << ": state = " + << SysTabletsEntryPB_State_Name(t_state); + break; + } + } + + // Nothing to do + if (deferred.tablets_to_add.empty() && + deferred.tablets_to_update.empty() && + deferred.needs_create_rpc.empty()) { + return Status::OK(); + } + + // For those tablets which need to be created in this round, assign replicas. + TSDescriptorVector ts_descs; + master_->ts_manager()->GetAllLiveDescriptors(&ts_descs); + + Status s; + for (TabletInfo *tablet : deferred.needs_create_rpc) { + // NOTE: if we fail to select replicas on the first pass (due to + // insufficient Tablet Servers being online), we will still try + // again unless the tablet/table creation is cancelled. + s = SelectReplicasForTablet(ts_descs, tablet); + if (!s.ok()) { + s = s.CloneAndPrepend(Substitute( + "An error occured while selecting replicas for tablet $0: $1", + tablet->tablet_id(), s.ToString())); + break; + } + } + + // Update the sys catalog with the new set of tablets/metadata. + if (s.ok()) { + s = sys_catalog_->AddAndUpdateTablets(deferred.tablets_to_add, + deferred.tablets_to_update); + if (!s.ok()) { + s = s.CloneAndPrepend("An error occurred while persisting the updated tablet metadata"); + } + } + + if (!s.ok()) { + LOG(WARNING) << "Aborting the current task due to error: " << s.ToString(); + // If there was an error, abort any mutations started by the + // current task. 
+ vector tablet_ids_to_remove; + for (scoped_refptr& new_tablet : new_tablets) { + TableInfo* table = new_tablet->table().get(); + TableMetadataLock l_table(table, TableMetadataLock::READ); + if (table->RemoveTablet( + new_tablet->metadata().dirty().pb.partition().partition_key_start())) { + VLOG(1) << "Removed tablet " << new_tablet->tablet_id() << " from " + "table " << l_table.data().name(); + } + tablet_ids_to_remove.push_back(new_tablet->tablet_id()); + } + boost::lock_guard l(lock_); + unlocker_out.Abort(); + unlocker_in.Abort(); + for (const string& tablet_id_to_remove : tablet_ids_to_remove) { + CHECK_EQ(tablet_map_.erase(tablet_id_to_remove), 1) + << "Unable to erase " << tablet_id_to_remove << " from tablet map."; + } + return s; + } + + // Send DeleteTablet requests to tablet servers serving deleted tablets. + // This is asynchronous / non-blocking. + for (const TabletInfo* tablet : deferred.tablets_to_update) { + if (tablet->metadata().dirty().is_deleted()) { + DeleteTabletReplicas(tablet, tablet->metadata().dirty().pb.state_msg()); + } + } + // Send the CreateTablet() requests to the servers. This is asynchronous / non-blocking. + SendCreateTabletRequests(deferred.needs_create_rpc); + return Status::OK(); +} + +Status CatalogManager::SelectReplicasForTablet(const TSDescriptorVector& ts_descs, + TabletInfo* tablet) { + TableMetadataLock table_guard(tablet->table().get(), TableMetadataLock::READ); + + if (!table_guard.data().pb.IsInitialized()) { + return Status::InvalidArgument( + Substitute("TableInfo for tablet $0 is not initialized (aborted CreateTable attempt?)", + tablet->tablet_id())); + } + + int nreplicas = table_guard.data().pb.num_replicas(); + + if (ts_descs.size() < nreplicas) { + return Status::InvalidArgument( + Substitute("Not enough tablet servers are online for table '$0'. 
Need at least $1 " + "replicas, but only $2 tablet servers are available", + table_guard.data().name(), nreplicas, ts_descs.size())); + } + + // Select the set of replicas for the tablet. + ConsensusStatePB* cstate = tablet->mutable_metadata()->mutable_dirty() + ->pb.mutable_committed_consensus_state(); + cstate->set_current_term(kMinimumTerm); + consensus::RaftConfigPB *config = cstate->mutable_config(); + + if (nreplicas == 1 && FLAGS_catalog_manager_allow_local_consensus) { + config->set_local(true); + } else { + config->set_local(false); + } + config->set_opid_index(consensus::kInvalidOpIdIndex); + SelectReplicas(ts_descs, nreplicas, config); + return Status::OK(); +} + +void CatalogManager::SendCreateTabletRequests(const vector& tablets) { + for (TabletInfo *tablet : tablets) { + const consensus::RaftConfigPB& config = + tablet->metadata().dirty().pb.committed_consensus_state().config(); + tablet->set_last_update_time(MonoTime::Now(MonoTime::FINE)); + for (const RaftPeerPB& peer : config.peers()) { + AsyncCreateReplica* task = new AsyncCreateReplica(master_, worker_pool_.get(), + peer.permanent_uuid(), tablet); + tablet->table()->AddTask(task); + WARN_NOT_OK(task->Run(), "Failed to send new tablet request"); + } + } +} + +shared_ptr CatalogManager::PickBetterReplicaLocation( + const TSDescriptorVector& two_choices) { + DCHECK_EQ(two_choices.size(), 2); + + const auto& a = two_choices[0]; + const auto& b = two_choices[1]; + + // When creating replicas, we consider two aspects of load: + // (1) how many tablet replicas are already on the server, and + // (2) how often we've chosen this server recently. + // + // The first factor will attempt to put more replicas on servers that + // are under-loaded (eg because they have newly joined an existing cluster, or have + // been reformatted and re-joined). 
+ // + // The second factor will ensure that we take into account the recent selection + // decisions even if those replicas are still in the process of being created (and thus + // not yet reported by the server). This is important because, while creating a table, + // we batch the selection process before sending any creation commands to the + // servers themselves. + // + // TODO: in the future we may want to factor in other items such as available disk space, + // actual request load, etc. + double load_a = a->RecentReplicaCreations() + a->num_live_replicas(); + double load_b = b->RecentReplicaCreations() + b->num_live_replicas(); + if (load_a < load_b) { + return a; + } else if (load_b < load_a) { + return b; + } else { + // If the load is the same, we can just pick randomly. + return two_choices[rng_.Uniform(2)]; + } +} + +shared_ptr CatalogManager::SelectReplica( + const TSDescriptorVector& ts_descs, + const set>& excluded) { + // The replica selection algorithm follows the idea from + // "Power of Two Choices in Randomized Load Balancing"[1]. For each replica, + // we randomly select two tablet servers, and then assign the replica to the + // less-loaded one of the two. This has some nice properties: + // + // 1) because the initial selection of two servers is random, we get good + // spreading of replicas across the cluster. In contrast if we sorted by + // load and always picked under-loaded servers first, we'd end up causing + // all tablets of a new table to be placed on an empty server. This wouldn't + // give good load balancing of that table. + // + // 2) because we pick the less-loaded of two random choices, we do end up with a + // weighting towards filling up the underloaded one over time, without + // the extreme scenario above. + // + // 3) because we don't follow any sequential pattern, every server is equally + // likely to replicate its tablets to every other server. 
In contrast, a + // round-robin design would enforce that each server only replicates to its + // adjacent nodes in the TS sort order, limiting recovery bandwidth (see + // KUDU-1317). + // + // [1] http://www.eecs.harvard.edu/~michaelm/postscripts/mythesis.pdf + + // Pick two random servers, excluding those we've already picked. + // If we've only got one server left, 'two_choices' will actually + // just contain one element. + vector > two_choices; + rng_.ReservoirSample(ts_descs, 2, excluded, &two_choices); + + if (two_choices.size() == 2) { + // Pick the better of the two. + return PickBetterReplicaLocation(two_choices); + } + + // If we couldn't randomly sample two servers, it's because we only had one + // more non-excluded choice left. + CHECK_EQ(1, ts_descs.size() - excluded.size()) + << "ts_descs: " << ts_descs.size() << " already_sel: " << excluded.size(); + return two_choices[0]; +} + +void CatalogManager::SelectReplicas(const TSDescriptorVector& ts_descs, + int nreplicas, + consensus::RaftConfigPB *config) { + DCHECK_EQ(0, config->peers_size()) << "RaftConfig not empty: " << config->ShortDebugString(); + DCHECK_LE(nreplicas, ts_descs.size()); + + // Keep track of servers we've already selected, so that we don't attempt to + // put two replicas on the same host. + set > already_selected; + for (int i = 0; i < nreplicas; ++i) { + shared_ptr ts = SelectReplica(ts_descs, already_selected); + InsertOrDie(&already_selected, ts); + + // Increment the number of pending replicas so that we take this selection into + // account when assigning replicas for other tablets of the same table. This + // value decays back to 0 over time. 
+ ts->IncrementRecentReplicaCreations(); + + TSRegistrationPB reg; + ts->GetRegistration(®); + + RaftPeerPB *peer = config->add_peers(); + peer->set_member_type(RaftPeerPB::VOTER); + peer->set_permanent_uuid(ts->permanent_uuid()); + + // TODO: This is temporary, we will use only UUIDs + for (const HostPortPB& addr : reg.rpc_addresses()) { + peer->mutable_last_known_addr()->CopyFrom(addr); + } + } +} + +Status CatalogManager::BuildLocationsForTablet(const scoped_refptr& tablet, + TabletLocationsPB* locs_pb) { + TSRegistrationPB reg; + + TabletInfo::ReplicaMap locs; + consensus::ConsensusStatePB cstate; + { + TabletMetadataLock l_tablet(tablet.get(), TabletMetadataLock::READ); + if (PREDICT_FALSE(l_tablet.data().is_deleted())) { + return Status::NotFound("Tablet deleted", l_tablet.data().pb.state_msg()); + } + + if (PREDICT_FALSE(!l_tablet.data().is_running())) { + return Status::ServiceUnavailable("Tablet not running"); + } + + tablet->GetReplicaLocations(&locs); + if (locs.empty() && l_tablet.data().pb.has_committed_consensus_state()) { + cstate = l_tablet.data().pb.committed_consensus_state(); + } + + locs_pb->mutable_partition()->CopyFrom(tablet->metadata().state().pb.partition()); + } + + locs_pb->set_tablet_id(tablet->tablet_id()); + locs_pb->set_stale(locs.empty()); + + // If the locations are cached. + if (!locs.empty()) { + for (const TabletInfo::ReplicaMap::value_type& replica : locs) { + TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas(); + replica_pb->set_role(replica.second.role); + + TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info(); + tsinfo_pb->set_permanent_uuid(replica.second.ts_desc->permanent_uuid()); + + replica.second.ts_desc->GetRegistration(®); + tsinfo_pb->mutable_rpc_addresses()->Swap(reg.mutable_rpc_addresses()); + } + return Status::OK(); + } + + // If the locations were not cached. + // TODO: Why would this ever happen? See KUDU-759. 
+ if (cstate.IsInitialized()) { + for (const consensus::RaftPeerPB& peer : cstate.config().peers()) { + TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas(); + CHECK(peer.has_permanent_uuid()) << "Missing UUID: " << peer.ShortDebugString(); + replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate)); + + TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info(); + tsinfo_pb->set_permanent_uuid(peer.permanent_uuid()); + tsinfo_pb->add_rpc_addresses()->CopyFrom(peer.last_known_addr()); + } + } + + return Status::OK(); +} + +Status CatalogManager::GetTabletLocations(const std::string& tablet_id, + TabletLocationsPB* locs_pb) { + RETURN_NOT_OK(CheckOnline()); + + locs_pb->mutable_replicas()->Clear(); + scoped_refptr tablet_info; + { + boost::shared_lock l(lock_); + if (!FindCopy(tablet_map_, tablet_id, &tablet_info)) { + return Status::NotFound(Substitute("Unknown tablet $0", tablet_id)); + } + } + + return BuildLocationsForTablet(tablet_info, locs_pb); +} + +Status CatalogManager::GetTableLocations(const GetTableLocationsRequestPB* req, + GetTableLocationsResponsePB* resp) { + RETURN_NOT_OK(CheckOnline()); + + // If start-key is > end-key report an error instead of swap the two + // since probably there is something wrong app-side. 
+ if (req->has_partition_key_start() && req->has_partition_key_end() + && req->partition_key_start() > req->partition_key_end()) { + return Status::InvalidArgument("start partition key is greater than the end partition key"); + } + + if (req->max_returned_locations() <= 0) { + return Status::InvalidArgument("max_returned_locations must be greater than 0"); + } + + scoped_refptr table; + RETURN_NOT_OK(FindTable(req->table(), &table)); + + if (table == nullptr) { + Status s = Status::NotFound("The table does not exist"); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + TableMetadataLock l(table.get(), TableMetadataLock::READ); + if (l.data().is_deleted()) { + Status s = Status::NotFound("The table was deleted", + l.data().pb.state_msg()); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + if (!l.data().is_running()) { + Status s = Status::ServiceUnavailable("The table is not running"); + SetupError(resp->mutable_error(), MasterErrorPB::TABLE_NOT_FOUND, s); + return s; + } + + vector > tablets_in_range; + table->GetTabletsInRange(req, &tablets_in_range); + + TSRegistrationPB reg; + vector locs; + for (const scoped_refptr& tablet : tablets_in_range) { + if (!BuildLocationsForTablet(tablet, resp->add_tablet_locations()).ok()) { + // Not running. + resp->mutable_tablet_locations()->RemoveLast(); + } + } + return Status::OK(); +} + +void CatalogManager::DumpState(std::ostream* out) const { + TableInfoMap ids_copy, names_copy; + TabletInfoMap tablets_copy; + + // Copy the internal state so that, if the output stream blocks, + // we don't end up holding the lock for a long time. 
+ { + boost::shared_lock l(lock_); + ids_copy = table_ids_map_; + names_copy = table_names_map_; + tablets_copy = tablet_map_; + } + + *out << "Tables:\n"; + for (const TableInfoMap::value_type& e : ids_copy) { + TableInfo* t = e.second.get(); + TableMetadataLock l(t, TableMetadataLock::READ); + const string& name = l.data().name(); + + *out << t->id() << ":\n"; + *out << " name: \"" << strings::CHexEscape(name) << "\"\n"; + // Erase from the map, so later we can check that we don't have + // any orphaned tables in the by-name map that aren't in the + // by-id map. + if (names_copy.erase(name) != 1) { + *out << " [not present in by-name map]\n"; + } + *out << " metadata: " << l.data().pb.ShortDebugString() << "\n"; + + *out << " tablets:\n"; + + vector > table_tablets; + t->GetAllTablets(&table_tablets); + for (const scoped_refptr& tablet : table_tablets) { + TabletMetadataLock l_tablet(tablet.get(), TabletMetadataLock::READ); + *out << " " << tablet->tablet_id() << ": " + << l_tablet.data().pb.ShortDebugString() << "\n"; + + if (tablets_copy.erase(tablet->tablet_id()) != 1) { + *out << " [ERROR: not present in CM tablet map!]\n"; + } + } + } + + if (!tablets_copy.empty()) { + *out << "Orphaned tablets (not referenced by any table):\n"; + for (const TabletInfoMap::value_type& entry : tablets_copy) { + const scoped_refptr& tablet = entry.second; + TabletMetadataLock l_tablet(tablet.get(), TabletMetadataLock::READ); + *out << " " << tablet->tablet_id() << ": " + << l_tablet.data().pb.ShortDebugString() << "\n"; + } + } + + if (!names_copy.empty()) { + *out << "Orphaned tables (in by-name map, but not id map):\n"; + for (const TableInfoMap::value_type& e : names_copy) { + *out << e.second->id() << ":\n"; + *out << " name: \"" << CHexEscape(e.first) << "\"\n"; + } + } +} + +std::string CatalogManager::LogPrefix() const { + return Substitute("T $0 P $1: ", + sys_catalog_->tablet_peer()->tablet_id(), + sys_catalog_->tablet_peer()->permanent_uuid()); +} + 
+//////////////////////////////////////////////////////////// +// TabletInfo +//////////////////////////////////////////////////////////// + +TabletInfo::TabletInfo(const scoped_refptr& table, + std::string tablet_id) + : tablet_id_(std::move(tablet_id)), + table_(table), + last_update_time_(MonoTime::Now(MonoTime::FINE)), + reported_schema_version_(0) {} + +TabletInfo::~TabletInfo() { +} + +void TabletInfo::SetReplicaLocations(const ReplicaMap& replica_locations) { + boost::lock_guard l(lock_); + last_update_time_ = MonoTime::Now(MonoTime::FINE); + replica_locations_ = replica_locations; +} + +void TabletInfo::GetReplicaLocations(ReplicaMap* replica_locations) const { + boost::lock_guard l(lock_); + *replica_locations = replica_locations_; +} + +bool TabletInfo::AddToReplicaLocations(const TabletReplica& replica) { + boost::lock_guard l(lock_); + return InsertIfNotPresent(&replica_locations_, replica.ts_desc->permanent_uuid(), replica); +} + +void TabletInfo::set_last_update_time(const MonoTime& ts) { + boost::lock_guard l(lock_); + last_update_time_ = ts; +} + +MonoTime TabletInfo::last_update_time() const { + boost::lock_guard l(lock_); + return last_update_time_; +} + +bool TabletInfo::set_reported_schema_version(uint32_t version) { + boost::lock_guard l(lock_); + if (version > reported_schema_version_) { + reported_schema_version_ = version; + return true; + } + return false; +} + +uint32_t TabletInfo::reported_schema_version() const { + boost::lock_guard l(lock_); + return reported_schema_version_; +} + +std::string TabletInfo::ToString() const { + return Substitute("$0 (table $1)", tablet_id_, + (table_ != nullptr ? 
table_->ToString() : "MISSING")); +} + +void PersistentTabletInfo::set_state(SysTabletsEntryPB::State state, const string& msg) { + pb.set_state(state); + pb.set_state_msg(msg); +} + +//////////////////////////////////////////////////////////// +// TableInfo +//////////////////////////////////////////////////////////// + +TableInfo::TableInfo(std::string table_id) : table_id_(std::move(table_id)) {} + +TableInfo::~TableInfo() { +} + +std::string TableInfo::ToString() const { + TableMetadataLock l(this, TableMetadataLock::READ); + return Substitute("$0 [id=$1]", l.data().pb.name(), table_id_); +} + +bool TableInfo::RemoveTablet(const std::string& partition_key_start) { + boost::lock_guard l(lock_); + return EraseKeyReturnValuePtr(&tablet_map_, partition_key_start) != NULL; +} + +void TableInfo::AddTablet(TabletInfo *tablet) { + boost::lock_guard l(lock_); + AddTabletUnlocked(tablet); +} + +void TableInfo::AddTablets(const vector& tablets) { + boost::lock_guard l(lock_); + for (TabletInfo *tablet : tablets) { + AddTabletUnlocked(tablet); + } +} + +void TableInfo::AddTabletUnlocked(TabletInfo* tablet) { + TabletInfo* old = nullptr; + if (UpdateReturnCopy(&tablet_map_, + tablet->metadata().dirty().pb.partition().partition_key_start(), + tablet, &old)) { + VLOG(1) << "Replaced tablet " << old->tablet_id() << " with " << tablet->tablet_id(); + // TODO: can we assert that the replaced tablet is not in Running state? + // May be a little tricky since we don't know whether to look at its committed or + // uncommitted state. 
+ } +} + +void TableInfo::GetTabletsInRange(const GetTableLocationsRequestPB* req, + vector > *ret) const { + boost::lock_guard l(lock_); + int max_returned_locations = req->max_returned_locations(); + + TableInfo::TabletInfoMap::const_iterator it, it_end; + if (req->has_partition_key_start()) { + it = tablet_map_.upper_bound(req->partition_key_start()); + --it; + } else { + it = tablet_map_.begin(); + } + + if (req->has_partition_key_end()) { + it_end = tablet_map_.upper_bound(req->partition_key_end()); + } else { + it_end = tablet_map_.end(); + } + + int count = 0; + for (; it != it_end && count < max_returned_locations; ++it) { + ret->push_back(make_scoped_refptr(it->second)); + count++; + } +} + +bool TableInfo::IsAlterInProgress(uint32_t version) const { + boost::lock_guard l(lock_); + for (const TableInfo::TabletInfoMap::value_type& e : tablet_map_) { + if (e.second->reported_schema_version() < version) { + VLOG(3) << "Table " << table_id_ << " ALTER in progress due to tablet " + << e.second->ToString() << " because reported schema " + << e.second->reported_schema_version() << " < expected " << version; + return true; + } + } + return false; +} + +bool TableInfo::IsCreateInProgress() const { + boost::lock_guard l(lock_); + for (const TableInfo::TabletInfoMap::value_type& e : tablet_map_) { + TabletMetadataLock tablet_lock(e.second, TabletMetadataLock::READ); + if (!tablet_lock.data().is_running()) { + return true; + } + } + return false; +} + +void TableInfo::AddTask(MonitoredTask* task) { + boost::lock_guard l(lock_); + task->AddRef(); + pending_tasks_.insert(task); +} + +void TableInfo::RemoveTask(MonitoredTask* task) { + boost::lock_guard l(lock_); + pending_tasks_.erase(task); + task->Release(); +} + +void TableInfo::AbortTasks() { + boost::lock_guard l(lock_); + for (MonitoredTask* task : pending_tasks_) { + task->Abort(); + } +} + +void TableInfo::WaitTasksCompletion() { + int wait_time = 5; + while (1) { + { + boost::lock_guard l(lock_); + if 
(pending_tasks_.empty()) { + break; + } + } + base::SleepForMilliseconds(wait_time); + wait_time = std::min(wait_time * 5 / 4, 10000); + } +} + +void TableInfo::GetTaskList(std::vector > *ret) { + boost::lock_guard l(lock_); + for (MonitoredTask* task : pending_tasks_) { + ret->push_back(make_scoped_refptr(task)); + } +} + +void TableInfo::GetAllTablets(vector > *ret) const { + ret->clear(); + boost::lock_guard l(lock_); + for (const TableInfo::TabletInfoMap::value_type& e : tablet_map_) { + ret->push_back(make_scoped_refptr(e.second)); + } +} + +void PersistentTableInfo::set_state(SysTablesEntryPB::State state, const string& msg) { + pb.set_state(state); + pb.set_state_msg(msg); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/catalog_manager.h b/src/kudu/master/catalog_manager.h new file mode 100644 index 000000000000..2365361f85cc --- /dev/null +++ b/src/kudu/master/catalog_manager.h @@ -0,0 +1,676 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_MASTER_CATALOG_MANAGER_H +#define KUDU_MASTER_CATALOG_MANAGER_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/partition.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/ts_manager.h" +#include "kudu/server/monitored_task.h" +#include "kudu/tserver/tablet_peer_lookup.h" +#include "kudu/util/cow_object.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/oid_generator.h" +#include "kudu/util/promise.h" +#include "kudu/util/random.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Schema; +class ThreadPool; + +namespace rpc { +class RpcContext; +} // namespace rpc + +namespace master { + +class CatalogManagerBgTasks; +class Master; +class SysCatalogTable; +class TableInfo; +class TSDescriptor; + +struct DeferredAssignmentActions; + +// The data related to a tablet which is persisted on disk. +// This portion of TableInfo is managed via CowObject. +// It wraps the underlying protobuf to add useful accessors. +struct PersistentTabletInfo { + bool is_running() const { + return pb.state() == SysTabletsEntryPB::RUNNING; + } + + bool is_deleted() const { + return pb.state() == SysTabletsEntryPB::REPLACED || + pb.state() == SysTabletsEntryPB::DELETED; + } + + // Helper to set the state of the tablet with a custom message. + // Requires that the caller has prepared this object for write. + // The change will only be visible after Commit(). + void set_state(SysTabletsEntryPB::State state, const std::string& msg); + + SysTabletsEntryPB pb; +}; + +// Information on a current replica of a tablet. +// This is copyable so that no locking is needed. +struct TabletReplica { + TSDescriptor* ts_desc; + tablet::TabletStatePB state; + consensus::RaftPeerPB::Role role; +}; + +// The information about a single tablet which exists in the cluster, +// including its state and locations. 
+// +// This object uses copy-on-write for the portions of data which are persisted +// on disk. This allows the mutated data to be staged and written to disk +// while readers continue to access the previous version. These portions +// of data are in PersistentTableInfo above, and typically accessed using +// TabletMetadataLock. For example: +// +// TabletInfo* table = ...; +// TabletMetadataLock l(tablet, TableMetadataLock::READ); +// if (l.data().is_running()) { ... } +// +// The non-persistent information about the tablet is protected by an internal +// spin-lock. +// +// The object is owned/managed by the CatalogManager, and exposed for testing. +class TabletInfo : public RefCountedThreadSafe { + public: + typedef PersistentTabletInfo cow_state; + typedef std::unordered_map ReplicaMap; + + TabletInfo(const scoped_refptr& table, std::string tablet_id); + + const std::string& tablet_id() const { return tablet_id_; } + const scoped_refptr& table() const { return table_; } + + // Access the persistent metadata. Typically you should use + // TabletMetadataLock to gain access to this data. + const CowObject& metadata() const { return metadata_; } + CowObject* mutable_metadata() { return &metadata_; } + + // Accessors for the latest known tablet replica locations. + // These locations include only the members of the latest-reported Raft + // configuration whose tablet servers have ever heartbeated to this Master. + void SetReplicaLocations(const ReplicaMap& replica_locations); + void GetReplicaLocations(ReplicaMap* replica_locations) const; + + // Adds the given replica to the replica_locations_ map. + // Returns true iff the replica was inserted. + bool AddToReplicaLocations(const TabletReplica& replica); + + // Accessors for the last time the replica locations were updated. 
+ void set_last_update_time(const MonoTime& ts); + MonoTime last_update_time() const; + + // Accessors for the last reported schema version + bool set_reported_schema_version(uint32_t version); + uint32_t reported_schema_version() const; + + // No synchronization needed. + std::string ToString() const; + + private: + friend class RefCountedThreadSafe; + ~TabletInfo(); + + const std::string tablet_id_; + const scoped_refptr table_; + + CowObject metadata_; + + // Lock protecting the below mutable fields. + // This doesn't protect metadata_ (the on-disk portion). + mutable simple_spinlock lock_; + + // The last time the replica locations were updated. + // Also set when the Master first attempts to create the tablet. + MonoTime last_update_time_; + + // The locations in the latest raft config where this tablet has been + // reported. The map is keyed by tablet server UUID. + ReplicaMap replica_locations_; + + // Reported schema version (in-memory only). + uint32_t reported_schema_version_; + + DISALLOW_COPY_AND_ASSIGN(TabletInfo); +}; + +// The data related to a table which is persisted on disk. +// This portion of TableInfo is managed via CowObject. +// It wraps the underlying protobuf to add useful accessors. +struct PersistentTableInfo { + bool is_deleted() const { + return pb.state() == SysTablesEntryPB::REMOVED; + } + + bool is_running() const { + return pb.state() == SysTablesEntryPB::RUNNING || + pb.state() == SysTablesEntryPB::ALTERING; + } + + // Return the table's name. + const std::string& name() const { + return pb.name(); + } + + // Helper to set the state of the tablet with a custom message. + void set_state(SysTablesEntryPB::State state, const std::string& msg); + + SysTablesEntryPB pb; +}; + +// The information about a table, including its state and tablets. +// +// This object uses copy-on-write techniques similarly to TabletInfo. +// Please see the TabletInfo class doc above for more information. 
+// +// The non-persistent information about the table is protected by an internal +// spin-lock. +class TableInfo : public RefCountedThreadSafe { + public: + typedef PersistentTableInfo cow_state; + + explicit TableInfo(std::string table_id); + + std::string ToString() const; + + // Return the table's ID. Does not require synchronization. + const std::string& id() const { return table_id_; } + + // Add a tablet to this table. + void AddTablet(TabletInfo *tablet); + // Add multiple tablets to this table. + void AddTablets(const std::vector& tablets); + + // Return true if tablet with 'partition_key_start' has been + // removed from 'tablet_map_' below. + bool RemoveTablet(const std::string& partition_key_start); + + // This only returns tablets which are in RUNNING state. + void GetTabletsInRange(const GetTableLocationsRequestPB* req, + std::vector > *ret) const; + + void GetAllTablets(std::vector > *ret) const; + + // Access the persistent metadata. Typically you should use + // TableMetadataLock to gain access to this data. + const CowObject& metadata() const { return metadata_; } + CowObject* mutable_metadata() { return &metadata_; } + + // Returns true if the table creation is in-progress + bool IsCreateInProgress() const; + + // Returns true if an "Alter" operation is in-progress + bool IsAlterInProgress(uint32_t version) const; + + void AddTask(MonitoredTask *task); + void RemoveTask(MonitoredTask *task); + void AbortTasks(); + void WaitTasksCompletion(); + + // Allow for showing outstanding tasks in the master UI. + void GetTaskList(std::vector > *tasks); + + private: + friend class RefCountedThreadSafe; + ~TableInfo(); + + void AddTabletUnlocked(TabletInfo* tablet); + + const std::string table_id_; + + // Sorted index of tablet start partition-keys to TabletInfo. + // The TabletInfo objects are owned by the CatalogManager. 
+ typedef std::map TabletInfoMap; + TabletInfoMap tablet_map_; + + // Protects tablet_map_ and pending_tasks_ + mutable simple_spinlock lock_; + + CowObject metadata_; + + // List of pending tasks (e.g. create/alter tablet requests) + std::unordered_set pending_tasks_; + + DISALLOW_COPY_AND_ASSIGN(TableInfo); +}; + +// Helper to manage locking on the persistent metadata of TabletInfo or TableInfo. +template +class MetadataLock : public CowLock { + public: + typedef CowLock super; + MetadataLock(MetadataClass* info, typename super::LockMode mode) + : super(DCHECK_NOTNULL(info)->mutable_metadata(), mode) { + } + MetadataLock(const MetadataClass* info, typename super::LockMode mode) + : super(&(DCHECK_NOTNULL(info))->metadata(), mode) { + } +}; + +typedef MetadataLock TabletMetadataLock; +typedef MetadataLock TableMetadataLock; + +// The component of the master which tracks the state and location +// of tables/tablets in the cluster. +// +// This is the master-side counterpart of TSTabletManager, which tracks +// the state of each tablet on a given tablet-server. +// +// Thread-safe. +class CatalogManager : public tserver::TabletPeerLookupIf { + public: + explicit CatalogManager(Master *master); + virtual ~CatalogManager(); + + Status Init(bool is_first_run); + + void Shutdown(); + Status CheckOnline() const; + + // Create a new Table with the specified attributes + // + // The RPC context is provided for logging/tracing purposes, + // but this function does not itself respond to the RPC. + Status CreateTable(const CreateTableRequestPB* req, + CreateTableResponsePB* resp, + rpc::RpcContext* rpc); + + // Get the information about an in-progress create operation + Status IsCreateTableDone(const IsCreateTableDoneRequestPB* req, + IsCreateTableDoneResponsePB* resp); + + // Delete the specified table + // + // The RPC context is provided for logging/tracing purposes, + // but this function does not itself respond to the RPC. 
+ Status DeleteTable(const DeleteTableRequestPB* req, + DeleteTableResponsePB* resp, + rpc::RpcContext* rpc); + + // Alter the specified table + // + // The RPC context is provided for logging/tracing purposes, + // but this function does not itself respond to the RPC. + Status AlterTable(const AlterTableRequestPB* req, + AlterTableResponsePB* resp, + rpc::RpcContext* rpc); + + // Get the information about an in-progress alter operation + // + // The RPC context is provided for logging/tracing purposes, + // but this function does not itself respond to the RPC. + Status IsAlterTableDone(const IsAlterTableDoneRequestPB* req, + IsAlterTableDoneResponsePB* resp, + rpc::RpcContext* rpc); + + // Get the information about the specified table + Status GetTableSchema(const GetTableSchemaRequestPB* req, + GetTableSchemaResponsePB* resp); + + // List all the running tables + Status ListTables(const ListTablesRequestPB* req, + ListTablesResponsePB* resp); + + Status GetTableLocations(const GetTableLocationsRequestPB* req, + GetTableLocationsResponsePB* resp); + + // Look up the locations of the given tablet. The locations + // vector is overwritten (not appended to). + // If the tablet is not found, returns Status::NotFound. + // If the tablet is not running, returns Status::ServiceUnavailable. + // Otherwise, returns Status::OK and puts the result in 'locs_pb'. + // This only returns tablets which are in RUNNING state. + Status GetTabletLocations(const std::string& tablet_id, + TabletLocationsPB* locs_pb); + + // Handle a tablet report from the given tablet server. + // + // The RPC context is provided for logging/tracing purposes, + // but this function does not itself respond to the RPC. 
+ Status ProcessTabletReport(TSDescriptor* ts_desc, + const TabletReportPB& report, + TabletReportUpdatesPB *report_update, + rpc::RpcContext* rpc); + + SysCatalogTable* sys_catalog() { return sys_catalog_.get(); } + + // Dump all of the current state about tables and tablets to the + // given output stream. This is verbose, meant for debugging. + void DumpState(std::ostream* out) const; + + // Return true if the table with the specified ID exists, + // and set the table pointer to the TableInfo object + // NOTE: This should only be used by tests or web-ui + bool GetTableInfo(const std::string& table_id, scoped_refptr *table); + + // Return all the available TableInfo, which also may include not running tables + // NOTE: This should only be used by tests or web-ui + void GetAllTables(std::vector > *tables); + + // Return true if the specified table name exists + // NOTE: This should only be used by tests + bool TableNameExists(const std::string& table_name); + + // Let the catalog manager know that the the given tablet server successfully + // deleted the specified tablet. + void NotifyTabletDeleteSuccess(const std::string& permanent_uuid, const std::string& tablet_id); + + // Used by ConsensusService to retrieve the TabletPeer for a system + // table specified by 'tablet_id'. + // + // See also: TabletPeerLookupIf, ConsensusServiceImpl. + virtual Status GetTabletPeer(const std::string& tablet_id, + scoped_refptr* tablet_peer) const OVERRIDE; + + virtual const NodeInstancePB& NodeInstance() const OVERRIDE; + + bool IsInitialized() const; + + virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req) OVERRIDE; + + // Return OK if this CatalogManager is a leader in a consensus configuration and if + // the required leader state (metadata for tables and tablets) has + // been successfully loaded into memory. CatalogManager must be + // initialized before calling this method. 
+ Status CheckIsLeaderAndReady() const; + + // Returns this CatalogManager's role in a consensus configuration. CatalogManager + // must be initialized before calling this method. + consensus::RaftPeerPB::Role Role() const; + + private: + friend class TableLoader; + friend class TabletLoader; + + // Called by SysCatalog::SysCatalogStateChanged when this node + // becomes the leader of a consensus configuration. Executes VisitTablesAndTabletsTask + // below. + Status ElectedAsLeaderCb(); + + // Loops and sleeps until one of the following conditions occurs: + // 1. The current node is the leader master in the current term + // and at least one op from the current term is committed. Returns OK. + // 2. The current node is not the leader master. + // Returns IllegalState. + // 3. The provided timeout expires. Returns TimedOut. + // + // This method is intended to ensure that all operations replicated by + // previous masters are committed and visible to the local node before + // reading that data, to ensure consistency across failovers. + Status WaitUntilCaughtUpAsLeader(const MonoDelta& timeout); + + // This method is submitted to 'leader_initialization_pool_' by + // ElectedAsLeaderCb above. It: + // 1) Acquired 'lock_' + // 2) Resets 'tables_tablets_visited_status_' + // 3) Runs VisitTablesAndTabletsUnlocked below + // 4) Sets 'tables_tablets_visited_status_' to return value of + // the call to VisitTablesAndTabletsUnlocked. + // 5) Releases 'lock_' and if successful, updates 'leader_ready_term_' + // to true (under state_lock_). + void VisitTablesAndTabletsTask(); + + // Clears out the existing metadata ('table_names_map_', 'table_ids_map_', + // and 'tablet_map_'), loads tables metadata into memory and if successful + // loads the tablets metadata. + // + // NOTE: Must be called under external synchronization, see + // VisitTablesAndTabletsTask() above. + Status VisitTablesAndTabletsUnlocked(); + + // Helper for initializing 'sys_catalog_'. 
After calling this + // method, the caller should call WaitUntilRunning() on sys_catalog_ + // WITHOUT holding 'lock_' to wait for consensus to start for + // sys_catalog_. + // + // This method is thread-safe. + Status InitSysCatalogAsync(bool is_first_run); + + // Helper for creating the initial TableInfo state + // Leaves the table "write locked" with the new info in the + // "dirty" state field. + TableInfo* CreateTableInfo(const CreateTableRequestPB& req, + const Schema& schema, + const PartitionSchema& partition_schema); + + // Helper for creating the initial TabletInfo state. + // Leaves the tablet "write locked" with the new info in the + // "dirty" state field. + TabletInfo *CreateTabletInfo(TableInfo* table, + const PartitionPB& partition); + + // Builds the TabletLocationsPB for a tablet based on the provided TabletInfo. + // Populates locs_pb and returns true on success. + // Returns Status::ServiceUnavailable if tablet is not running. + Status BuildLocationsForTablet(const scoped_refptr& tablet, + TabletLocationsPB* locs_pb); + + Status FindTable(const TableIdentifierPB& table_identifier, + scoped_refptr* table_info); + + // Handle one of the tablets in a tablet reported. + // Requires that the lock is already held. + Status HandleReportedTablet(TSDescriptor* ts_desc, + const ReportedTabletPB& report, + ReportedTabletUpdatesPB *report_updates); + + Status ResetTabletReplicasFromReportedConfig(const ReportedTabletPB& report, + const scoped_refptr& tablet, + TabletMetadataLock* tablet_lock, + TableMetadataLock* table_lock); + + // Register a tablet server whenever it heartbeats with a consensus configuration. This is + // needed because we have logic in the Master that states that if a tablet + // server that is part of a consensus configuration has not heartbeated to the Master yet, we + // leave it out of the consensus configuration reported to clients. + // TODO: See if we can remove this logic, as it seems confusing. 
+ void AddReplicaToTabletIfNotFound(TSDescriptor* ts_desc, + const ReportedTabletPB& report, + const scoped_refptr& tablet); + + void NewReplica(TSDescriptor* ts_desc, const ReportedTabletPB& report, TabletReplica* replica); + + // Extract the set of tablets that can be deleted and the set of tablets + // that must be processed because not running yet. + void ExtractTabletsToProcess(std::vector > *tablets_to_delete, + std::vector > *tablets_to_process); + + // Task that takes care of the tablet assignments/creations. + // Loops through the "not created" tablets and sends a CreateTablet() request. + Status ProcessPendingAssignments(const std::vector >& tablets); + + // Given 'two_choices', which should be a vector of exactly two elements, select which + // one is the better choice for a new replica. + std::shared_ptr PickBetterReplicaLocation(const TSDescriptorVector& two_choices); + + // Select a tablet server from 'ts_descs' on which to place a new replica. + // Any tablet servers in 'excluded' are not considered. + // REQUIRES: 'ts_descs' must include at least one non-excluded server. + std::shared_ptr SelectReplica( + const TSDescriptorVector& ts_descs, + const std::set>& excluded); + + // Select N Replicas from online tablet servers (as specified by + // 'ts_descs') for the specified tablet and populate the consensus configuration + // object. If 'ts_descs' does not specify enough online tablet + // servers to select the N replicas, return Status::InvalidArgument. + // + // This method is called by "ProcessPendingAssignments()". + Status SelectReplicasForTablet(const TSDescriptorVector& ts_descs, TabletInfo* tablet); + + // Select N Replicas from the online tablet servers + // and populate the consensus configuration object. + // + // This method is called by "SelectReplicasForTablet". 
+ void SelectReplicas(const TSDescriptorVector& ts_descs, + int nreplicas, + consensus::RaftConfigPB *config); + + void HandleAssignPreparingTablet(TabletInfo* tablet, + DeferredAssignmentActions* deferred); + + // Assign tablets and send CreateTablet RPCs to tablet servers. + // The out param 'new_tablets' should have any newly-created TabletInfo + // objects appended to it. + void HandleAssignCreatingTablet(TabletInfo* tablet, + DeferredAssignmentActions* deferred, + std::vector >* new_tablets); + + Status HandleTabletSchemaVersionReport(TabletInfo *tablet, + uint32_t version); + + // Send the create tablet requests to the selected peers of the consensus configurations. + // The creation is async, and at the moment there is no error checking on the + // caller side. We rely on the assignment timeout. If we don't see the tablet + // after the timeout, we regenerate a new one and proceed with a new + // assignment/creation. + // + // This method is part of the "ProcessPendingAssignments()" + // + // This must be called after persisting the tablet state as + // CREATING to ensure coherent state after Master failover. + void SendCreateTabletRequests(const std::vector& tablets); + + // Send the "alter table request" to all tablets of the specified table. + void SendAlterTableRequest(const scoped_refptr& table); + + // Start the background task to send the AlterTable() RPC to the leader for this + // tablet. + void SendAlterTabletRequest(const scoped_refptr& tablet); + + // Request tablet servers to delete all replicas of the tablet. + void DeleteTabletReplicas(const TabletInfo* tablet, const std::string& msg); + + // Marks each of the tablets in the given table as deleted and triggers requests + // to the tablet servers to delete them. + void DeleteTabletsAndSendRequests(const scoped_refptr& table); + + // Send the "delete tablet request" to the specified TS/tablet. + // The specified 'reason' will be logged on the TS. 
+ void SendDeleteTabletRequest(const std::string& tablet_id, + tablet::TabletDataState delete_type, + const boost::optional& cas_config_opid_index_less_or_equal, + const scoped_refptr& table, + TSDescriptor* ts_desc, + const std::string& reason); + + // Start a task to change the config to add an additional voter because the + // specified tablet is under-replicated. + void SendAddServerRequest(const scoped_refptr& tablet, + const consensus::ConsensusStatePB& cstate); + + std::string GenerateId() { return oid_generator_.Next(); } + + // Abort creation of 'table': abort all mutation for TabletInfo and + // TableInfo objects (releasing all COW locks), abort all pending + // tasks associated with the table, and erase any state related to + // the table we failed to create from the in-memory maps + // ('table_names_map_', 'table_ids_map_', 'tablet_map_' below). + void AbortTableCreation(TableInfo* table, const std::vector& tablets); + + // Conventional "T xxx P yyy: " prefix for logging. + std::string LogPrefix() const; + + // TODO: the maps are a little wasteful of RAM, since the TableInfo/TabletInfo + // objects have a copy of the string key. But STL doesn't make it + // easy to make a "gettable set". + + // Lock protecting the various maps below. + typedef rw_spinlock LockType; + mutable LockType lock_; + + // Table maps: table-id -> TableInfo and table-name -> TableInfo + typedef std::unordered_map > TableInfoMap; + TableInfoMap table_ids_map_; + TableInfoMap table_names_map_; + + // Tablet maps: tablet-id -> TabletInfo + typedef std::unordered_map > TabletInfoMap; + TabletInfoMap tablet_map_; + + Master *master_; + Atomic32 closing_; + ObjectIdGenerator oid_generator_; + + // Random number generator used for selecting replica locations. 
+ ThreadSafeRandom rng_; + + gscoped_ptr sys_catalog_; + + // Background thread, used to execute the catalog manager tasks + // like the assignment and cleaner + friend class CatalogManagerBgTasks; + gscoped_ptr background_tasks_; + + enum State { + kConstructed, + kStarting, + kRunning, + kClosing + }; + + // Lock protecting state_, leader_ready_term_ + mutable simple_spinlock state_lock_; + State state_; + + // Used to defer work from reactor threads onto a thread where + // blocking behavior is permissible. + // + // NOTE: Presently, this thread pool must contain only a single + // thread (to correctly serialize invocations of ElectedAsLeaderCb + // upon closely timed consecutive elections). + gscoped_ptr worker_pool_; + + // This field is updated when a node becomes leader master, + // waits for all outstanding uncommitted metadata (table and tablet metadata) + // in the sys catalog to commit, and then reads that metadata into in-memory + // data structures. This is used to "fence" client and tablet server requests + // that depend on the in-memory state until this master can respond + // correctly. + int64_t leader_ready_term_; + + // Async operations are accessing some private methods + // (TODO: this stuff should be deferred and done in the background thread) + friend class AsyncAlterTable; + + DISALLOW_COPY_AND_ASSIGN(CatalogManager); +}; + +} // namespace master +} // namespace kudu +#endif /* KUDU_MASTER_CATALOG_MANAGER_H */ diff --git a/src/kudu/master/master-path-handlers.cc b/src/kudu/master/master-path-handlers.cc new file mode 100644 index 000000000000..e71c5ceb02be --- /dev/null +++ b/src/kudu/master/master-path-handlers.cc @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/master/master-path-handlers.h" + +#include +#include +#include +#include +#include + +#include "kudu/common/partition.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/webui_util.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/sys_catalog.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/master/ts_manager.h" +#include "kudu/util/string_case.h" +#include "kudu/util/url-coding.h" + + +namespace kudu { + +using consensus::RaftPeerPB; +using std::vector; +using std::string; +using std::stringstream; +using strings::Substitute; + +namespace master { + +MasterPathHandlers::~MasterPathHandlers() { +} + +void MasterPathHandlers::HandleTabletServers(const Webserver::WebRequest& req, + stringstream* output) { + vector > descs; + master_->ts_manager()->GetAllDescriptors(&descs); + + *output << "

Tablet Servers

\n"; + + *output << "\n"; + *output << " \n"; + for (const std::shared_ptr& desc : descs) { + const string time_since_hb = StringPrintf("%.1fs", desc->TimeSinceHeartbeat().ToSeconds()); + TSRegistrationPB reg; + desc->GetRegistration(®); + *output << Substitute("\n", + RegistrationToHtml(reg, desc->permanent_uuid()), + time_since_hb, + EscapeForHtmlToString(reg.ShortDebugString())); + } + *output << "
UUIDTime since heartbeatRegistration
$0$1$2
\n"; +} + +void MasterPathHandlers::HandleCatalogManager(const Webserver::WebRequest& req, + stringstream* output) { + *output << "

Tables

\n"; + + std::vector > tables; + master_->catalog_manager()->GetAllTables(&tables); + + *output << "\n"; + *output << " \n"; + typedef std::map StringMap; + StringMap ordered_tables; + for (const scoped_refptr& table : tables) { + TableMetadataLock l(table.get(), TableMetadataLock::READ); + if (!l.data().is_running()) { + continue; + } + string state = SysTablesEntryPB_State_Name(l.data().pb.state()); + Capitalize(&state); + ordered_tables[l.data().name()] = Substitute( + "\n", + EscapeForHtmlToString(l.data().name()), + EscapeForHtmlToString(table->id()), + state, + EscapeForHtmlToString(l.data().pb.state_msg())); + } + for (const StringMap::value_type& table : ordered_tables) { + *output << table.second; + } + *output << "
Table NameTable IdState
$0$1$2 $3
\n"; +} + +namespace { + +bool CompareByRole(const TabletReplica& a, const TabletReplica& b) { + return a.role < b.role; +} + +} // anonymous namespace + + +void MasterPathHandlers::HandleTablePage(const Webserver::WebRequest& req, + stringstream *output) { + // Parse argument. + string table_id; + if (!FindCopy(req.parsed_args, "id", &table_id)) { + // TODO: webserver should give a way to return a non-200 response code + *output << "Missing 'id' argument"; + return; + } + + scoped_refptr table; + if (!master_->catalog_manager()->GetTableInfo(table_id, &table)) { + *output << "Table not found"; + return; + } + + Schema schema; + PartitionSchema partition_schema; + string table_name; + vector > tablets; + { + TableMetadataLock l(table.get(), TableMetadataLock::READ); + table_name = l.data().name(); + *output << "

Table: " << EscapeForHtmlToString(table_name) + << " (" << EscapeForHtmlToString(table_id) << ")

\n"; + + *output << "\n"; + *output << " \n"; + + string state = SysTablesEntryPB_State_Name(l.data().pb.state()); + Capitalize(&state); + *output << " \n"; + *output << "
Version:" << l.data().pb.version() << "
State:" + << state + << EscapeForHtmlToString(l.data().pb.state_msg()) + << "
\n"; + + SchemaFromPB(l.data().pb.schema(), &schema); + Status s = PartitionSchema::FromPB(l.data().pb.partition_schema(), schema, &partition_schema); + if (!s.ok()) { + *output << "Unable to decode partition schema: " << s.ToString(); + return; + } + table->GetAllTablets(&tablets); + } + + HtmlOutputSchemaTable(schema, output); + + *output << "\n"; + *output << " " + "\n"; + for (const scoped_refptr& tablet : tablets) { + TabletInfo::ReplicaMap locations; + tablet->GetReplicaLocations(&locations); + vector sorted_locations; + AppendValuesFromMap(locations, &sorted_locations); + std::sort(sorted_locations.begin(), sorted_locations.end(), &CompareByRole); + + TabletMetadataLock l(tablet.get(), TabletMetadataLock::READ); + + Partition partition; + Partition::FromPB(l.data().pb.partition(), &partition); + + string state = SysTabletsEntryPB_State_Name(l.data().pb.state()); + Capitalize(&state); + *output << Substitute( + "\n", + tablet->tablet_id(), + EscapeForHtmlToString(partition_schema.PartitionDebugString(partition, schema)), + state, + EscapeForHtmlToString(l.data().pb.state_msg()), + RaftConfigToHtml(sorted_locations, tablet->tablet_id())); + } + *output << "
Tablet IDPartitionStateMessageRaftConfig
$0$1$2$3$4
\n"; + + *output << "

Impala CREATE TABLE statement

\n"; + + string master_addresses; + if (master_->opts().IsDistributed()) { + vector all_addresses; + for (const HostPort& hp : master_->opts().master_addresses) { + master_addresses.append(hp.ToString()); + } + master_addresses = JoinElements(all_addresses, ","); + } else { + Sockaddr addr = master_->first_rpc_address(); + HostPort hp; + Status s = HostPortFromSockaddrReplaceWildcard(addr, &hp); + if (s.ok()) { + master_addresses = hp.ToString(); + } else { + LOG(WARNING) << "Unable to determine proper local hostname: " << s.ToString(); + master_addresses = addr.ToString(); + } + } + HtmlOutputImpalaSchema(table_name, schema, master_addresses, output); + + std::vector > task_list; + table->GetTaskList(&task_list); + HtmlOutputTaskList(task_list, output); +} + +void MasterPathHandlers::HandleMasters(const Webserver::WebRequest& req, + stringstream* output) { + vector masters; + Status s = master_->ListMasters(&masters); + if (!s.ok()) { + s = s.CloneAndPrepend("Unable to list Masters"); + LOG(WARNING) << s.ToString(); + *output << "

" << s.ToString() << "

\n"; + return; + } + *output << "

Masters

\n"; + *output << "\n"; + *output << " \n"; + + for (const ServerEntryPB& master : masters) { + if (master.has_error()) { + Status error = StatusFromPB(master.error()); + *output << Substitute(" \n", + EscapeForHtmlToString(error.ToString())); + continue; + } + string reg_text = RegistrationToHtml(master.registration(), + master.instance_id().permanent_uuid()); + if (master.instance_id().permanent_uuid() == master_->instance_pb().permanent_uuid()) { + reg_text = Substitute("$0", reg_text); + } + *output << Substitute(" \n", reg_text, + master.has_role() ? RaftPeerPB_Role_Name(master.role()) : "N/A"); + } + + *output << "
RegistrationRole
$0
$0$1
"; +} + +namespace { + +// Visitor for the catalog table which dumps tables and tablets in a JSON format. This +// dump is interpreted by the CM agent in order to track time series entities in the SMON +// database. +// +// This implementation relies on scanning the catalog table directly instead of using the +// catalog manager APIs. This allows it to work even on a non-leader master, and avoids +// any requirement for locking. For the purposes of metrics entity gathering, it's OK to +// serve a slightly stale snapshot. +// +// It is tempting to directly dump the metadata protobufs using JsonWriter::Protobuf(...), +// but then we would be tying ourselves to textual compatibility of the PB field names in +// our catalog table. Instead, the implementation specifically dumps the fields that we +// care about. +// +// This should be considered a "stable" protocol -- do not rename, remove, or restructure +// without consulting with the CM team. +class JsonDumper : public TableVisitor, public TabletVisitor { + public: + explicit JsonDumper(JsonWriter* jw) : jw_(jw) { + } + + Status VisitTable(const std::string& table_id, + const SysTablesEntryPB& metadata) OVERRIDE { + if (metadata.state() != SysTablesEntryPB::RUNNING) { + return Status::OK(); + } + + jw_->StartObject(); + jw_->String("table_id"); + jw_->String(table_id); + + jw_->String("table_name"); + jw_->String(metadata.name()); + + jw_->String("state"); + jw_->String(SysTablesEntryPB::State_Name(metadata.state())); + + jw_->EndObject(); + return Status::OK(); + } + + Status VisitTablet(const std::string& table_id, + const std::string& tablet_id, + const SysTabletsEntryPB& metadata) OVERRIDE { + if (metadata.state() != SysTabletsEntryPB::RUNNING) { + return Status::OK(); + } + + jw_->StartObject(); + jw_->String("table_id"); + jw_->String(table_id); + + jw_->String("tablet_id"); + jw_->String(tablet_id); + + jw_->String("state"); + jw_->String(SysTabletsEntryPB::State_Name(metadata.state())); + + // Dump replica 
UUIDs + if (metadata.has_committed_consensus_state()) { + const consensus::ConsensusStatePB& cs = metadata.committed_consensus_state(); + jw_->String("replicas"); + jw_->StartArray(); + for (const RaftPeerPB& peer : cs.config().peers()) { + jw_->StartObject(); + jw_->String("type"); + jw_->String(RaftPeerPB::MemberType_Name(peer.member_type())); + + jw_->String("server_uuid"); + jw_->String(peer.permanent_uuid()); + + jw_->String("addr"); + jw_->String(Substitute("$0:$1", peer.last_known_addr().host(), + peer.last_known_addr().port())); + + jw_->EndObject(); + } + jw_->EndArray(); + + if (cs.has_leader_uuid()) { + jw_->String("leader"); + jw_->String(cs.leader_uuid()); + } + } + + jw_->EndObject(); + return Status::OK(); + } + + private: + JsonWriter* jw_; +}; + +void JsonError(const Status& s, stringstream* out) { + out->str(""); + JsonWriter jw(out, JsonWriter::COMPACT); + jw.StartObject(); + jw.String("error"); + jw.String(s.ToString()); + jw.EndObject(); +} +} // anonymous namespace + +void MasterPathHandlers::HandleDumpEntities(const Webserver::WebRequest& req, + stringstream* output) { + JsonWriter jw(output, JsonWriter::COMPACT); + JsonDumper d(&jw); + + jw.StartObject(); + + jw.String("tables"); + jw.StartArray(); + Status s = master_->catalog_manager()->sys_catalog()->VisitTables(&d); + if (!s.ok()) { + JsonError(s, output); + return; + } + jw.EndArray(); + + jw.String("tablets"); + jw.StartArray(); + s = master_->catalog_manager()->sys_catalog()->VisitTablets(&d); + if (!s.ok()) { + JsonError(s, output); + return; + } + jw.EndArray(); + + jw.EndObject(); +} + +Status MasterPathHandlers::Register(Webserver* server) { + bool is_styled = true; + bool is_on_nav_bar = true; + server->RegisterPathHandler("/tablet-servers", "Tablet Servers", + boost::bind(&MasterPathHandlers::HandleTabletServers, this, _1, _2), + is_styled, is_on_nav_bar); + server->RegisterPathHandler("/tables", "Tables", + boost::bind(&MasterPathHandlers::HandleCatalogManager, this, _1, _2), + 
is_styled, is_on_nav_bar); + server->RegisterPathHandler("/table", "", + boost::bind(&MasterPathHandlers::HandleTablePage, this, _1, _2), + is_styled, false); + server->RegisterPathHandler("/masters", "Masters", + boost::bind(&MasterPathHandlers::HandleMasters, this, _1, _2), + is_styled, is_on_nav_bar); + server->RegisterPathHandler("/dump-entities", "Dump Entities", + boost::bind(&MasterPathHandlers::HandleDumpEntities, this, _1, _2), + false, false); + return Status::OK(); +} + +string MasterPathHandlers::RaftConfigToHtml(const std::vector& locations, + const std::string& tablet_id) const { + stringstream html; + + html << "
    \n"; + for (const TabletReplica& location : locations) { + string location_html = TSDescriptorToHtml(*location.ts_desc, tablet_id); + if (location.role == RaftPeerPB::LEADER) { + html << Substitute("
  • LEADER: $0
  • \n", location_html); + } else { + html << Substitute("
  • $0: $1
  • \n", + RaftPeerPB_Role_Name(location.role), location_html); + } + } + html << "
\n"; + return html.str(); +} + +string MasterPathHandlers::TSDescriptorToHtml(const TSDescriptor& desc, + const std::string& tablet_id) const { + TSRegistrationPB reg; + desc.GetRegistration(®); + + if (reg.http_addresses().size() > 0) { + return Substitute("
$3", + reg.http_addresses(0).host(), + reg.http_addresses(0).port(), + EscapeForHtmlToString(tablet_id), + EscapeForHtmlToString(reg.http_addresses(0).host())); + } else { + return EscapeForHtmlToString(desc.permanent_uuid()); + } +} + +template +string MasterPathHandlers::RegistrationToHtml(const RegistrationType& reg, + const std::string& link_text) const { + string link_html = EscapeForHtmlToString(link_text); + if (reg.http_addresses().size() > 0) { + link_html = Substitute("$2", + reg.http_addresses(0).host(), + reg.http_addresses(0).port(), link_html); + } + return link_html; +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/master-path-handlers.h b/src/kudu/master/master-path-handlers.h new file mode 100644 index 000000000000..149eac52e288 --- /dev/null +++ b/src/kudu/master/master-path-handlers.h @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_MASTER_MASTER_PATH_HANDLERS_H +#define KUDU_MASTER_MASTER_PATH_HANDLERS_H + +#include "kudu/gutil/macros.h" +#include "kudu/server/webserver.h" + +#include +#include +#include + +namespace kudu { + +class Schema; + +namespace master { + +class Master; +struct TabletReplica; +class TSDescriptor; +class TSRegistrationPB; + +// Web page support for the master. +class MasterPathHandlers { + public: + explicit MasterPathHandlers(Master* master) + : master_(master) { + } + + ~MasterPathHandlers(); + + Status Register(Webserver* server); + + private: + void HandleTabletServers(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleCatalogManager(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleTablePage(const Webserver::WebRequest& req, + std::stringstream *output); + void HandleMasters(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleDumpEntities(const Webserver::WebRequest& req, + std::stringstream* output); + + // Convert location of peers to HTML, indicating the roles + // of each tablet server in a consensus configuration. + // This method will display 'locations' in the order given. + std::string RaftConfigToHtml(const std::vector& locations, + const std::string& tablet_id) const; + + // Convert the specified TSDescriptor to HTML, adding a link to the + // tablet server's own webserver if specified in 'desc'. + std::string TSDescriptorToHtml(const TSDescriptor& desc, + const std::string& tablet_id) const; + + // Convert the specified server registration to HTML, adding a link + // to the server's own web server (if specified in 'reg') with + // anchor text 'link_text'. 'RegistrationType' must be + // TSRegistrationPB or MasterRegistrationPB. 
+ template + std::string RegistrationToHtml(const RegistrationType& reg, + const std::string& link_text) const; + + Master* master_; + DISALLOW_COPY_AND_ASSIGN(MasterPathHandlers); +}; + +void HandleTabletServersPage(const Webserver::WebRequest& req, std::stringstream* output); + +} // namespace master +} // namespace kudu +#endif /* KUDU_MASTER_MASTER_PATH_HANDLERS_H */ diff --git a/src/kudu/master/master-test-util.h b/src/kudu/master/master-test-util.h new file mode 100644 index 000000000000..1ee264cf0481 --- /dev/null +++ b/src/kudu/master/master-test-util.h @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_MASTER_TEST_UTIL_H_ +#define KUDU_MASTER_TEST_UTIL_H_ + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace master { + +Status WaitForRunningTabletCount(MiniMaster* mini_master, + const string& table_name, + int expected_count, + GetTableLocationsResponsePB* resp) { + int wait_time = 1000; + + SCOPED_LOG_TIMING(INFO, strings::Substitute("waiting for tablet count of $0", expected_count)); + while (true) { + GetTableLocationsRequestPB req; + resp->Clear(); + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(expected_count); + RETURN_NOT_OK(mini_master->master()->catalog_manager()->GetTableLocations(&req, resp)); + if (resp->tablet_locations_size() >= expected_count) { + bool is_stale = false; + for (const TabletLocationsPB& loc : resp->tablet_locations()) { + is_stale |= loc.stale(); + } + + if (!is_stale) { + return Status::OK(); + } + } + + LOG(INFO) << "Waiting for " << expected_count << " tablets for table " + << table_name << ". So far we have " << resp->tablet_locations_size(); + + SleepFor(MonoDelta::FromMicroseconds(wait_time)); + wait_time = std::min(wait_time * 5 / 4, 1000000); + } + + // Unreachable. + LOG(FATAL) << "Reached unreachable section"; + return Status::RuntimeError("Unreachable statement"); // Suppress compiler warnings. 
+} + +void CreateTabletForTesting(MiniMaster* mini_master, + const string& table_name, + const Schema& schema, + string *tablet_id) { + { + CreateTableRequestPB req; + CreateTableResponsePB resp; + + req.set_name(table_name); + req.set_num_replicas(1); + ASSERT_OK(SchemaToPB(schema, req.mutable_schema())); + ASSERT_OK(mini_master->master()->catalog_manager()->CreateTable(&req, &resp, NULL)); + } + + int wait_time = 1000; + bool is_table_created = false; + for (int i = 0; i < 80; ++i) { + IsCreateTableDoneRequestPB req; + IsCreateTableDoneResponsePB resp; + + req.mutable_table()->set_table_name(table_name); + ASSERT_OK(mini_master->master()->catalog_manager()->IsCreateTableDone(&req, &resp)); + if (resp.done()) { + is_table_created = true; + break; + } + + VLOG(1) << "Waiting for table '" << table_name << "'to be created"; + + SleepFor(MonoDelta::FromMicroseconds(wait_time)); + wait_time = std::min(wait_time * 5 / 4, 1000000); + } + ASSERT_TRUE(is_table_created); + + { + GetTableSchemaRequestPB req; + GetTableSchemaResponsePB resp; + req.mutable_table()->set_table_name(table_name); + ASSERT_OK(mini_master->master()->catalog_manager()->GetTableSchema(&req, &resp)); + ASSERT_TRUE(resp.create_table_done()); + } + + GetTableLocationsResponsePB resp; + ASSERT_OK(WaitForRunningTabletCount(mini_master, table_name, 1, &resp)); + *tablet_id = resp.tablet_locations(0).tablet_id(); + LOG(INFO) << "Got tablet " << *tablet_id << " for table " << table_name; +} + +} // namespace master +} // namespace kudu + +#endif /* KUDU_MASTER_TEST_UTIL_H_ */ diff --git a/src/kudu/master/master-test.cc b/src/kudu/master/master-test.cc new file mode 100644 index 000000000000..9fb52db837fe --- /dev/null +++ b/src/kudu/master/master-test.cc @@ -0,0 +1,456 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_operations.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/master/master.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/master-test-util.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/sys_catalog.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/master/ts_manager.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/rpc_server.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +using kudu::rpc::Messenger; +using kudu::rpc::MessengerBuilder; +using kudu::rpc::RpcController; +using std::shared_ptr; +using std::string; + +DECLARE_bool(catalog_manager_check_ts_count_for_create_table); + +namespace kudu { +namespace master { + +class MasterTest : public KuduTest { + protected: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + // In this test, we create tables to test catalog manager behavior, + // but we have no tablet servers. Typically this would be disallowed. 
+ FLAGS_catalog_manager_check_ts_count_for_create_table = false; + + // Start master + mini_master_.reset(new MiniMaster(Env::Default(), GetTestPath("Master"), 0)); + ASSERT_OK(mini_master_->Start()); + master_ = mini_master_->master(); + ASSERT_OK(master_->WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5))); + + // Create a client proxy to it. + MessengerBuilder bld("Client"); + ASSERT_OK(bld.Build(&client_messenger_)); + proxy_.reset(new MasterServiceProxy(client_messenger_, mini_master_->bound_rpc_addr())); + } + + virtual void TearDown() OVERRIDE { + mini_master_->Shutdown(); + KuduTest::TearDown(); + } + + void DoListTables(const ListTablesRequestPB& req, ListTablesResponsePB* resp); + void DoListAllTables(ListTablesResponsePB* resp); + Status CreateTable(const string& table_name, + const Schema& schema); + Status CreateTable(const string& table_name, + const Schema& schema, + const vector& split_rows); + + shared_ptr client_messenger_; + gscoped_ptr mini_master_; + Master* master_; + gscoped_ptr proxy_; +}; + +TEST_F(MasterTest, TestPingServer) { + // Ping the server. + PingRequestPB req; + PingResponsePB resp; + RpcController controller; + ASSERT_OK(proxy_->Ping(req, &resp, &controller)); +} + +static void MakeHostPortPB(const string& host, uint32_t port, HostPortPB* pb) { + pb->set_host(host); + pb->set_port(port); +} + +// Test that shutting down a MiniMaster without starting it does not +// SEGV. +TEST_F(MasterTest, TestShutdownWithoutStart) { + MiniMaster m(Env::Default(), "/xxxx", 0); + m.Shutdown(); +} + +TEST_F(MasterTest, TestRegisterAndHeartbeat) { + const char *kTsUUID = "my-ts-uuid"; + + TSToMasterCommonPB common; + common.mutable_ts_instance()->set_permanent_uuid(kTsUUID); + common.mutable_ts_instance()->set_instance_seqno(1); + + // Try a heartbeat. The server hasn't heard of us, so should ask us + // to re-register. 
+ { + RpcController rpc; + TSHeartbeatRequestPB req; + TSHeartbeatResponsePB resp; + req.mutable_common()->CopyFrom(common); + ASSERT_OK(proxy_->TSHeartbeat(req, &resp, &rpc)); + + ASSERT_TRUE(resp.needs_reregister()); + ASSERT_TRUE(resp.needs_full_tablet_report()); + } + + vector > descs; + master_->ts_manager()->GetAllDescriptors(&descs); + ASSERT_EQ(0, descs.size()) << "Should not have registered anything"; + + shared_ptr ts_desc; + ASSERT_FALSE(master_->ts_manager()->LookupTSByUUID(kTsUUID, &ts_desc)); + + // Register the fake TS, without sending any tablet report. + TSRegistrationPB fake_reg; + MakeHostPortPB("localhost", 1000, fake_reg.add_rpc_addresses()); + MakeHostPortPB("localhost", 2000, fake_reg.add_http_addresses()); + + { + TSHeartbeatRequestPB req; + TSHeartbeatResponsePB resp; + RpcController rpc; + req.mutable_common()->CopyFrom(common); + req.mutable_registration()->CopyFrom(fake_reg); + ASSERT_OK(proxy_->TSHeartbeat(req, &resp, &rpc)); + + ASSERT_FALSE(resp.needs_reregister()); + ASSERT_TRUE(resp.needs_full_tablet_report()); + } + + descs.clear(); + master_->ts_manager()->GetAllDescriptors(&descs); + ASSERT_EQ(1, descs.size()) << "Should have registered the TS"; + TSRegistrationPB reg; + descs[0]->GetRegistration(®); + ASSERT_EQ(fake_reg.DebugString(), reg.DebugString()) << "Master got different registration"; + + ASSERT_TRUE(master_->ts_manager()->LookupTSByUUID(kTsUUID, &ts_desc)); + ASSERT_EQ(ts_desc, descs[0]); + + // If the tablet server somehow lost the response to its registration RPC, it would + // attempt to register again. In that case, we shouldn't reject it -- we should + // just respond the same. 
+ { + TSHeartbeatRequestPB req; + TSHeartbeatResponsePB resp; + RpcController rpc; + req.mutable_common()->CopyFrom(common); + req.mutable_registration()->CopyFrom(fake_reg); + ASSERT_OK(proxy_->TSHeartbeat(req, &resp, &rpc)); + + ASSERT_FALSE(resp.needs_reregister()); + ASSERT_TRUE(resp.needs_full_tablet_report()); + } + + // Now send a tablet report + { + TSHeartbeatRequestPB req; + TSHeartbeatResponsePB resp; + RpcController rpc; + req.mutable_common()->CopyFrom(common); + TabletReportPB* tr = req.mutable_tablet_report(); + tr->set_is_incremental(false); + tr->set_sequence_number(0); + ASSERT_OK(proxy_->TSHeartbeat(req, &resp, &rpc)); + + ASSERT_FALSE(resp.needs_reregister()); + ASSERT_FALSE(resp.needs_full_tablet_report()); + } + + descs.clear(); + master_->ts_manager()->GetAllDescriptors(&descs); + ASSERT_EQ(1, descs.size()) << "Should still only have one TS registered"; + + ASSERT_TRUE(master_->ts_manager()->LookupTSByUUID(kTsUUID, &ts_desc)); + ASSERT_EQ(ts_desc, descs[0]); + + // Ensure that the ListTabletServers shows the faked server. 
+ { + ListTabletServersRequestPB req; + ListTabletServersResponsePB resp; + RpcController rpc; + ASSERT_OK(proxy_->ListTabletServers(req, &resp, &rpc)); + LOG(INFO) << resp.DebugString(); + ASSERT_EQ(1, resp.servers_size()); + ASSERT_EQ("my-ts-uuid", resp.servers(0).instance_id().permanent_uuid()); + ASSERT_EQ(1, resp.servers(0).instance_id().instance_seqno()); + } +} + +Status MasterTest::CreateTable(const string& table_name, + const Schema& schema) { + KuduPartialRow split1(&schema); + RETURN_NOT_OK(split1.SetInt32("key", 10)); + + KuduPartialRow split2(&schema); + RETURN_NOT_OK(split2.SetInt32("key", 20)); + + return CreateTable(table_name, schema, { split1, split2 }); +} + +Status MasterTest::CreateTable(const string& table_name, + const Schema& schema, + const vector& split_rows) { + + CreateTableRequestPB req; + CreateTableResponsePB resp; + RpcController controller; + + req.set_name(table_name); + RETURN_NOT_OK(SchemaToPB(schema, req.mutable_schema())); + RowOperationsPBEncoder encoder(req.mutable_split_rows()); + for (const KuduPartialRow& row : split_rows) { + encoder.Add(RowOperationsPB::SPLIT_ROW, row); + } + + RETURN_NOT_OK(proxy_->CreateTable(req, &resp, &controller)); + if (resp.has_error()) { + RETURN_NOT_OK(StatusFromPB(resp.error().status())); + } + return Status::OK(); +} + +void MasterTest::DoListTables(const ListTablesRequestPB& req, ListTablesResponsePB* resp) { + RpcController controller; + ASSERT_OK(proxy_->ListTables(req, resp, &controller)); + SCOPED_TRACE(resp->DebugString()); + ASSERT_FALSE(resp->has_error()); +} + +void MasterTest::DoListAllTables(ListTablesResponsePB* resp) { + ListTablesRequestPB req; + DoListTables(req, resp); +} + +TEST_F(MasterTest, TestCatalog) { + const char *kTableName = "testtb"; + const char *kOtherTableName = "tbtest"; + const Schema kTableSchema({ ColumnSchema("key", INT32), + ColumnSchema("v1", UINT64), + ColumnSchema("v2", STRING) }, + 1); + + ASSERT_OK(CreateTable(kTableName, kTableSchema)); + + 
ListTablesResponsePB tables; + ASSERT_NO_FATAL_FAILURE(DoListAllTables(&tables)); + ASSERT_EQ(1, tables.tables_size()); + ASSERT_EQ(kTableName, tables.tables(0).name()); + + // Delete the table + { + DeleteTableRequestPB req; + DeleteTableResponsePB resp; + RpcController controller; + req.mutable_table()->set_table_name(kTableName); + ASSERT_OK(proxy_->DeleteTable(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // List tables, should show no table + ASSERT_NO_FATAL_FAILURE(DoListAllTables(&tables)); + ASSERT_EQ(0, tables.tables_size()); + + // Re-create the table + ASSERT_OK(CreateTable(kTableName, kTableSchema)); + + // Restart the master, verify the table still shows up. + ASSERT_OK(mini_master_->Restart()); + ASSERT_OK(mini_master_->master()-> + WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5))); + + ASSERT_NO_FATAL_FAILURE(DoListAllTables(&tables)); + ASSERT_EQ(1, tables.tables_size()); + ASSERT_EQ(kTableName, tables.tables(0).name()); + + // Test listing tables with a filter. 
+ ASSERT_OK(CreateTable(kOtherTableName, kTableSchema)); + + { + ListTablesRequestPB req; + req.set_name_filter("test"); + DoListTables(req, &tables); + ASSERT_EQ(2, tables.tables_size()); + } + + { + ListTablesRequestPB req; + req.set_name_filter("tb"); + DoListTables(req, &tables); + ASSERT_EQ(2, tables.tables_size()); + } + + { + ListTablesRequestPB req; + req.set_name_filter(kTableName); + DoListTables(req, &tables); + ASSERT_EQ(1, tables.tables_size()); + ASSERT_EQ(kTableName, tables.tables(0).name()); + } + + { + ListTablesRequestPB req; + req.set_name_filter("btes"); + DoListTables(req, &tables); + ASSERT_EQ(1, tables.tables_size()); + ASSERT_EQ(kOtherTableName, tables.tables(0).name()); + } + + { + ListTablesRequestPB req; + req.set_name_filter("randomname"); + DoListTables(req, &tables); + ASSERT_EQ(0, tables.tables_size()); + } +} + +TEST_F(MasterTest, TestCreateTableCheckSplitRows) { + const char *kTableName = "testtb"; + const Schema kTableSchema({ ColumnSchema("key", INT32), ColumnSchema("val", INT32) }, 1); + + // No duplicate split rows. + { + KuduPartialRow split1 = KuduPartialRow(&kTableSchema); + ASSERT_OK(split1.SetInt32("key", 1)); + KuduPartialRow split2(&kTableSchema); + ASSERT_OK(split2.SetInt32("key", 2)); + Status s = CreateTable(kTableName, kTableSchema, { split1, split1, split2 }); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Duplicate split row"); + } + + // No empty split rows. 
+ { + KuduPartialRow split1 = KuduPartialRow(&kTableSchema); + ASSERT_OK(split1.SetInt32("key", 1)); + KuduPartialRow split2(&kTableSchema); + Status s = CreateTable(kTableName, kTableSchema, { split1, split2 }); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), + "Invalid argument: Split rows must contain a value for at " + "least one range partition column"); + } + + // No non-range columns + { + KuduPartialRow split = KuduPartialRow(&kTableSchema); + ASSERT_OK(split.SetInt32("key", 1)); + ASSERT_OK(split.SetInt32("val", 1)); + Status s = CreateTable(kTableName, kTableSchema, { split }); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), + "Invalid argument: Split rows may only contain values " + "for range partitioned columns: val") + } +} + +TEST_F(MasterTest, TestCreateTableInvalidKeyType) { + const char *kTableName = "testtb"; + + { + const Schema kTableSchema({ ColumnSchema("key", BOOL) }, 1); + Status s = CreateTable(kTableName, kTableSchema, vector()); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), + "Key column may not have type of BOOL, FLOAT, or DOUBLE"); + } + + { + const Schema kTableSchema({ ColumnSchema("key", FLOAT) }, 1); + Status s = CreateTable(kTableName, kTableSchema, vector()); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), + "Key column may not have type of BOOL, FLOAT, or DOUBLE"); + } + + { + const Schema kTableSchema({ ColumnSchema("key", DOUBLE) }, 1); + Status s = CreateTable(kTableName, kTableSchema, vector()); + ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), + "Key column may not have type of BOOL, FLOAT, or DOUBLE"); + } +} + +// Regression test for KUDU-253/KUDU-592: crash if the schema passed to CreateTable +// is invalid. 
+TEST_F(MasterTest, TestCreateTableInvalidSchema) { + CreateTableRequestPB req; + CreateTableResponsePB resp; + RpcController controller; + + req.set_name("table"); + for (int i = 0; i < 2; i++) { + ColumnSchemaPB* col = req.mutable_schema()->add_columns(); + col->set_name("col"); + col->set_type(INT32); + col->set_is_key(true); + } + + ASSERT_OK(proxy_->CreateTable(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ("code: INVALID_ARGUMENT message: \"Duplicate column name: col\"", + resp.error().status().ShortDebugString()); +} + +// Regression test for KUDU-253/KUDU-592: crash if the GetTableLocations RPC call is +// invalid. +TEST_F(MasterTest, TestInvalidGetTableLocations) { + const string kTableName = "test"; + Schema schema({ ColumnSchema("key", INT32) }, 1); + ASSERT_OK(CreateTable(kTableName, schema)); + { + GetTableLocationsRequestPB req; + GetTableLocationsResponsePB resp; + RpcController controller; + req.mutable_table()->set_table_name(kTableName); + // Set the "start" key greater than the "end" key. + req.set_partition_key_start("zzzz"); + req.set_partition_key_end("aaaa"); + ASSERT_OK(proxy_->GetTableLocations(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ("code: INVALID_ARGUMENT message: " + "\"start partition key is greater than the end partition key\"", + resp.error().status().ShortDebugString()); + } +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/master.cc b/src/kudu/master/master.cc new file mode 100644 index 000000000000..c3c46f3fdaae --- /dev/null +++ b/src/kudu/master/master.cc @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/master/master.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/cfile/block_cache.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master_service.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/master-path-handlers.h" +#include "kudu/master/ts_manager.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/service_if.h" +#include "kudu/rpc/service_pool.h" +#include "kudu/server/rpc_server.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tserver/tablet_service.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" +#include "kudu/util/threadpool.h" + +DEFINE_int32(master_registration_rpc_timeout_ms, 1500, + "Timeout for retrieving master registration over RPC."); +TAG_FLAG(master_registration_rpc_timeout_ms, experimental); + +using std::min; +using std::shared_ptr; +using std::vector; + +using kudu::consensus::RaftPeerPB; +using kudu::rpc::ServiceIf; +using kudu::tserver::ConsensusServiceImpl; +using strings::Substitute; + +namespace kudu { +namespace master { + +Master::Master(const MasterOptions& opts) + : ServerBase("Master", opts, "kudu.master"), + state_(kStopped), + ts_manager_(new TSManager()), + 
catalog_manager_(new CatalogManager(this)), + path_handlers_(new MasterPathHandlers(this)), + opts_(opts), + registration_initialized_(false), + maintenance_manager_(new MaintenanceManager(MaintenanceManager::DEFAULT_OPTIONS)) { +} + +Master::~Master() { + CHECK_NE(kRunning, state_); +} + +string Master::ToString() const { + if (state_ != kRunning) { + return "Master (stopped)"; + } + return strings::Substitute("Master@$0", first_rpc_address().ToString()); +} + +Status Master::Init() { + CHECK_EQ(kStopped, state_); + + cfile::BlockCache::GetSingleton()->StartInstrumentation(metric_entity()); + + RETURN_NOT_OK(ThreadPoolBuilder("init").set_max_threads(1).Build(&init_pool_)); + + RETURN_NOT_OK(ServerBase::Init()); + + RETURN_NOT_OK(path_handlers_->Register(web_server_.get())); + + state_ = kInitialized; + return Status::OK(); +} + +Status Master::Start() { + RETURN_NOT_OK(StartAsync()); + RETURN_NOT_OK(WaitForCatalogManagerInit()); + google::FlushLogFiles(google::INFO); // Flush the startup messages. + return Status::OK(); +} + +Status Master::StartAsync() { + CHECK_EQ(kInitialized, state_); + + RETURN_NOT_OK(maintenance_manager_->Init()); + + gscoped_ptr impl(new MasterServiceImpl(this)); + gscoped_ptr consensus_service(new ConsensusServiceImpl(metric_entity(), + catalog_manager_.get())); + + RETURN_NOT_OK(ServerBase::RegisterService(impl.Pass())); + RETURN_NOT_OK(ServerBase::RegisterService(consensus_service.Pass())); + RETURN_NOT_OK(ServerBase::Start()); + + // Now that we've bound, construct our ServerRegistrationPB. + RETURN_NOT_OK(InitMasterRegistration()); + + // Start initializing the catalog manager. 
+ RETURN_NOT_OK(init_pool_->SubmitClosure(Bind(&Master::InitCatalogManagerTask, + Unretained(this)))); + + state_ = kRunning; + + return Status::OK(); +} + +void Master::InitCatalogManagerTask() { + Status s = InitCatalogManager(); + if (!s.ok()) { + LOG(ERROR) << ToString() << ": Unable to init master catalog manager: " << s.ToString(); + } + init_status_.Set(s); +} + +Status Master::InitCatalogManager() { + if (catalog_manager_->IsInitialized()) { + return Status::IllegalState("Catalog manager is already initialized"); + } + RETURN_NOT_OK_PREPEND(catalog_manager_->Init(is_first_run_), + "Unable to initialize catalog manager"); + return Status::OK(); +} + +Status Master::WaitForCatalogManagerInit() { + CHECK_EQ(state_, kRunning); + + return init_status_.Get(); +} + +Status Master::WaitUntilCatalogManagerIsLeaderAndReadyForTests(const MonoDelta& timeout) { + Status s; + MonoTime start = MonoTime::Now(MonoTime::FINE); + int backoff_ms = 1; + const int kMaxBackoffMs = 256; + do { + s = catalog_manager_->CheckIsLeaderAndReady(); + if (s.ok()) { + return Status::OK(); + } + SleepFor(MonoDelta::FromMilliseconds(backoff_ms)); + backoff_ms = min(backoff_ms << 1, kMaxBackoffMs); + } while (MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).LessThan(timeout)); + return Status::TimedOut("Maximum time exceeded waiting for master leadership", + s.ToString()); +} + +void Master::Shutdown() { + if (state_ == kRunning) { + string name = ToString(); + LOG(INFO) << name << " shutting down..."; + maintenance_manager_->Shutdown(); + ServerBase::Shutdown(); + catalog_manager_->Shutdown(); + LOG(INFO) << name << " shutdown complete."; + } + state_ = kStopped; +} + +Status Master::GetMasterRegistration(ServerRegistrationPB* reg) const { + if (!registration_initialized_.load(std::memory_order_acquire)) { + return Status::ServiceUnavailable("Master startup not complete"); + } + reg->CopyFrom(registration_); + return Status::OK(); +} + +Status Master::InitMasterRegistration() { + 
CHECK(!registration_initialized_.load()); + + ServerRegistrationPB reg; + vector rpc_addrs; + RETURN_NOT_OK_PREPEND(rpc_server()->GetBoundAddresses(&rpc_addrs), + "Couldn't get RPC addresses"); + RETURN_NOT_OK(AddHostPortPBs(rpc_addrs, reg.mutable_rpc_addresses())); + vector http_addrs; + web_server()->GetBoundAddresses(&http_addrs); + RETURN_NOT_OK(AddHostPortPBs(http_addrs, reg.mutable_http_addresses())); + + registration_.Swap(®); + registration_initialized_.store(true); + + return Status::OK(); +} + +namespace { + +// TODO this method should be moved to a separate class (along with +// ListMasters), so that it can also be used in TS and client when +// bootstrapping. +Status GetMasterEntryForHost(const shared_ptr& messenger, + const HostPort& hostport, + ServerEntryPB* e) { + Sockaddr sockaddr; + RETURN_NOT_OK(SockaddrFromHostPort(hostport, &sockaddr)); + MasterServiceProxy proxy(messenger, sockaddr); + GetMasterRegistrationRequestPB req; + GetMasterRegistrationResponsePB resp; + rpc::RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds(FLAGS_master_registration_rpc_timeout_ms)); + RETURN_NOT_OK(proxy.GetMasterRegistration(req, &resp, &controller)); + e->mutable_instance_id()->CopyFrom(resp.instance_id()); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + e->mutable_registration()->CopyFrom(resp.registration()); + e->set_role(resp.role()); + return Status::OK(); +} + +} // anonymous namespace + +Status Master::ListMasters(std::vector* masters) const { + if (!opts_.IsDistributed()) { + ServerEntryPB local_entry; + local_entry.mutable_instance_id()->CopyFrom(catalog_manager_->NodeInstance()); + RETURN_NOT_OK(GetMasterRegistration(local_entry.mutable_registration())); + local_entry.set_role(RaftPeerPB::LEADER); + masters->push_back(local_entry); + return Status::OK(); + } + + for (const HostPort& peer_addr : opts_.master_addresses) { + ServerEntryPB peer_entry; + Status s = GetMasterEntryForHost(messenger_, 
peer_addr, &peer_entry); + if (!s.ok()) { + s = s.CloneAndPrepend( + Substitute("Unable to get registration information for peer ($0)", + peer_addr.ToString())); + LOG(WARNING) << s.ToString(); + StatusToPB(s, peer_entry.mutable_error()); + } + masters->push_back(peer_entry); + } + + return Status::OK(); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/master.h b/src/kudu/master/master.h new file mode 100644 index 000000000000..b3296b51c861 --- /dev/null +++ b/src/kudu/master/master.h @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_MASTER_MASTER_H +#define KUDU_MASTER_MASTER_H + +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/master/master_options.h" +#include "kudu/master/master.pb.h" +#include "kudu/server/server_base.h" +#include "kudu/util/metrics.h" +#include "kudu/util/promise.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MaintenanceManager; +class RpcServer; +struct RpcServerOptions; +class ServerEntryPB; +class ThreadPool; + +namespace rpc { +class Messenger; +class ServicePool; +} + +namespace master { + +class CatalogManager; +class TSManager; +class MasterPathHandlers; + +class Master : public server::ServerBase { + public: + static const uint16_t kDefaultPort = 7051; + static const uint16_t kDefaultWebPort = 8051; + + explicit Master(const MasterOptions& opts); + ~Master(); + + Status Init(); + Status Start(); + + Status StartAsync(); + Status WaitForCatalogManagerInit(); + + // Wait until this Master's catalog manager instance is the leader and is ready. + // This method is intended for use by unit tests. + // If 'timeout' time is exceeded, returns Status::TimedOut. + Status WaitUntilCatalogManagerIsLeaderAndReadyForTests(const MonoDelta& timeout) + WARN_UNUSED_RESULT; + + void Shutdown(); + + std::string ToString() const; + + TSManager* ts_manager() { return ts_manager_.get(); } + + CatalogManager* catalog_manager() { return catalog_manager_.get(); } + + const MasterOptions& opts() { return opts_; } + + // Get the RPC and HTTP addresses for this master instance. + Status GetMasterRegistration(ServerRegistrationPB* registration) const; + + // Get node instance, Raft role, RPC and HTTP addresses for all + // masters. + // + // TODO move this to a separate class to be re-used in TS and + // client; cache this information with a TTL (possibly in another + // SysTable), so that we don't have to perform an RPC call on every + // request. 
+ Status ListMasters(std::vector* masters) const; + + bool IsShutdown() const { + return state_ == kStopped; + } + + MaintenanceManager* maintenance_manager() { + return maintenance_manager_.get(); + } + + private: + friend class MasterTest; + + void InitCatalogManagerTask(); + Status InitCatalogManager(); + + // Initialize registration_. + // Requires that the web server and RPC server have been started. + Status InitMasterRegistration(); + + enum MasterState { + kStopped, + kInitialized, + kRunning + }; + + MasterState state_; + + gscoped_ptr ts_manager_; + gscoped_ptr catalog_manager_; + gscoped_ptr path_handlers_; + + // For initializing the catalog manager. + gscoped_ptr init_pool_; + + // The status of the master initialization. This is set + // by the async initialization task. + Promise init_status_; + + MasterOptions opts_; + + ServerRegistrationPB registration_; + // True once registration_ has been initialized. + std::atomic registration_initialized_; + + // The maintenance manager for this master. + std::shared_ptr maintenance_manager_; + + DISALLOW_COPY_AND_ASSIGN(Master); +}; + +} // namespace master +} // namespace kudu +#endif diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto new file mode 100644 index 000000000000..29f1e4d9e978 --- /dev/null +++ b/src/kudu/master/master.proto @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.master; + +option java_package = "org.kududb.master"; + +import "kudu/common/common.proto"; +import "kudu/common/wire_protocol.proto"; +import "kudu/consensus/metadata.proto"; +import "kudu/tablet/metadata.proto"; + +//////////////////////////////////////////////////////////// +// Common data structures +//////////////////////////////////////////////////////////// + +// Master specific errors use this protobuf. +message MasterErrorPB { + enum Code { + // An error which has no more specific error code. + // The code and message in 'status' may reveal more details. + // + // RPCs should avoid returning this, since callers will not be + // able to easily parse the error. + UNKNOWN_ERROR = 1; + + // The schema provided for a request was not well-formed. + INVALID_SCHEMA = 2; + + // The requested table does not exist + TABLE_NOT_FOUND = 3; + + // The name requested for the table is already in use + TABLE_ALREADY_PRESENT = 4; + + // The number of tablets requested for a new table is over the per TS limit. + TOO_MANY_TABLETS = 5; + + // Catalog manager is not yet initialized. + CATALOG_MANAGER_NOT_INITIALIZED = 6; + + // The operation attempted can only be invoked against either the + // leader or a single non-distributed master, which this node + // isn't. + NOT_THE_LEADER = 7; + + // The number of replicas requested is greater than the number of live servers + // in the cluster. + REPLICATION_FACTOR_TOO_HIGH = 8; + } + + // The error code. + required Code code = 1; + + // The Status object for the error. 
This will include a textual + // message that may be more useful to present in log messages, etc, + // though its error code is less specific. + required AppStatusPB status = 2; +} + +// Common information sent with every request from the tablet server +// to the master. +message TSToMasterCommonPB { + // The instance of the tablet server sending the heartbeat. + required NodeInstancePB ts_instance = 1; +} + +message TableIdentifierPB { + // The table ID to fetch info. + optional bytes table_id = 1; + + // The table name to fetch info. + optional string table_name = 2; +} + +//////////////////////////////////////////////////////////// +// Sys Tables Metadata +//////////////////////////////////////////////////////////// + +// The on-disk entry in the sys.catalog table ("metadata" column) for +// tablets entries. +message SysTabletsEntryPB { + enum State { + UNKNOWN = 999; + PREPARING = 0; + CREATING = 1; + RUNNING = 2; + REPLACED = 3; + DELETED = 4; + } + + // DEPRECATED. Replaced by 'partition'. + optional bytes DEPRECATED_start_key = 1; + optional bytes DEPRECATED_end_key = 2; + + // Tablet partition. + optional PartitionPB partition = 7; + + // The latest committed consensus configuration consensus configuration reported to the Master. + optional consensus.ConsensusStatePB committed_consensus_state = 3; + + // Debug state for the tablet. + optional State state = 4 [ default = UNKNOWN ]; + optional bytes state_msg = 5; + + // The table id for the tablet. + required bytes table_id = 6; +} + +// The on-disk entry in the sys.catalog table ("metadata" column) for +// tables entries. +message SysTablesEntryPB { + enum State { + UNKNOWN = 0; + PREPARING = 1; + RUNNING = 2; + ALTERING = 3; + REMOVED = 4; + } + + // Table name + required bytes name = 1; + + // sequence-id for the table metadata. + // Used on tablet-report to avoid sending "alter-table" notifications. + required uint32 version = 2; + + // Newest table schema (every TS will eventually have it). 
+ required SchemaPB schema = 3; + + // Last table schema that is guaranteed to have reached every TS, though + // not necessarily the newest schema. + // + // This is the schema provided to the user on client->GetSchema(tableName). + optional SchemaPB fully_applied_schema = 4; + + // The table's partitioning schema. + optional PartitionSchemaPB partition_schema = 9; + + // The next column ID to assign to newly added columns in this table. + // This prevents column ID reuse. + optional int32 next_column_id = 8; + + // Number of TS replicas + required int32 num_replicas = 5; + + // Debug state for the table. + optional State state = 6 [ default = UNKNOWN ]; + optional bytes state_msg = 7; +} + +//////////////////////////////////////////////////////////// +// RPCs +//////////////////////////////////////////////////////////// + +message PingRequestPB { +} + +message PingResponsePB { +} + +// Sent by the TS when it first heartbeats with a master. This sends the +// master all of the necessary information about the current instance +// of the TS. +message TSRegistrationPB { + repeated HostPortPB rpc_addresses = 1; + repeated HostPortPB http_addresses = 2; + + // TODO: add stuff like software version, etc. +} + +message ReportedTabletPB { + required bytes tablet_id = 1; + optional tablet.TabletStatePB state = 2 [ default = UNKNOWN ]; + optional tablet.TabletDataState tablet_data_state = 6 [ default = TABLET_DATA_UNKNOWN ]; + + // The latest _committed_ consensus state. + // This will be missing if the tablet is not in a RUNNING state + // (i.e. if it is BOOTSTRAPPING). + optional consensus.ConsensusStatePB committed_consensus_state = 3; + + optional AppStatusPB error = 4; + optional uint32 schema_version = 5; +} + +// Sent by the tablet server to report the set of tablets hosted by that TS. +message TabletReportPB { + // If false, then this is a full report, and any prior information about + // tablets hosted by this server should be dropped. 
+ required bool is_incremental = 1; + + // Tablets for which to update information. If 'is_incremental' is false, + // then this is the full set of tablets on the server, and any tablets + // which the master is aware of but not listed in this protobuf should + // be assumed to have been removed from this server. + repeated ReportedTabletPB updated_tablets = 2; + + // Tablet IDs which the tablet server has removed and should no longer be + // considered hosted here. This will always be empty in a non-incremental + // report. + repeated bytes removed_tablet_ids = 3; + + // Every time the TS generates a tablet report, it creates a sequence + // number. This can be useful in debugging, and also determining which + // changes have not yet been reported to the master. + // The first tablet report (non-incremental) is sequence number 0. + required int32 sequence_number = 4; +} + +message ReportedTabletUpdatesPB { + required bytes tablet_id = 1; + optional string state_msg = 2; +} + +// Sent by the Master in response to the TS tablet report (part of the heartbeats) +message TabletReportUpdatesPB { + repeated ReportedTabletUpdatesPB tablets = 1; +} + +// Heartbeat sent from the tablet-server to the master +// to establish liveness and report back any status changes. +message TSHeartbeatRequestPB { + required TSToMasterCommonPB common = 1; + + // Sent upon start-up of the TS, or in response to 'needs_reregister' on a heartbeat + // response. + optional TSRegistrationPB registration = 2; + + // Sent when the tablet information has changed, or in response to + // 'needs_full_tablet_report'. + optional TabletReportPB tablet_report = 3; + + // TODO; add a heartbeat sequence number? + + // TODO: perhaps add some very basic metrics reporting here, like + // free space, reqs/sec, etc? + + // The number of tablets that are BOOTSTRAPPING or RUNNING. + // Used by the master to determine load when creating new tablet replicas. 
+ optional int32 num_live_tablets = 4; +} + +message TSHeartbeatResponsePB { + optional MasterErrorPB error = 1; + + // As with most other master RPC responses (e.g., + // ListTablesResponsePB), all fields below are optional as they may + // not be set if there is an error. + + optional NodeInstancePB master_instance = 2; + + // Indicates that the server which heartbeated needs to re-register + // with the master -- i.e send a heartbeat with the 'registration' + // filled in. + optional bool needs_reregister = 3 [ default = false ]; + + optional bool needs_full_tablet_report = 4 [ default = false ]; + + // Sent when the master receives a TabletReport + optional TabletReportUpdatesPB tablet_report = 5; + + // Specify whether or not the node is the leader master. + optional bool leader_master = 6; +} + +////////////////////////////// +// GetTabletLocations +////////////////////////////// + +message TabletLocationsPB { + message ReplicaPB { + required TSInfoPB ts_info = 1; + required consensus.RaftPeerPB.Role role = 2; + } + + required bytes tablet_id = 1; + + // DEPRECATED. + optional bytes start_key = 2; + optional bytes end_key = 3; + + optional PartitionPB partition = 6; + + repeated ReplicaPB replicas = 4; + + // true if the tablet was running but no tablet server has reported it yet. + // The set of replicas will be the last one that was hosting the tablet. + // This should happen on Master restart when the request is issued before + // the TS has the time to notify the Master about the tablets that is hosting. + required bool stale = 5; +} + +// Info about a single tablet server, returned to the client as part +// of the GetTabletLocations response. This can be used on the client +// to update the local cache of where each TS UUID is located. In +// the future we may also want to transmit software version info, +// load info, topology, etc. 
+message TSInfoPB { + required bytes permanent_uuid = 1; + + repeated HostPortPB rpc_addresses = 2; +} + +message GetTabletLocationsRequestPB { + // The tablet IDs about which to fetch info. + repeated bytes tablet_ids = 1; +} + +message GetTabletLocationsResponsePB { + optional MasterErrorPB error = 1; + + repeated TabletLocationsPB tablet_locations = 2; + + message Error { + required bytes tablet_id = 1; + required AppStatusPB status = 2; + } + repeated Error errors = 3; +} + +// ============================================================================ +// Catalog +// ============================================================================ +message CreateTableRequestPB { + required string name = 1; + required SchemaPB schema = 2; + // repeated bytes pre_split_keys = 3; + // repeated PartialRowPB split_rows = 5; + optional RowOperationsPB split_rows = 6; + optional PartitionSchemaPB partition_schema = 7; + optional int32 num_replicas = 4; +} + +message CreateTableResponsePB { + // The error, if an error occurred with this request. + optional MasterErrorPB error = 1; + + optional bytes table_id = 2; +} + +message IsCreateTableDoneRequestPB { + required TableIdentifierPB table = 1; +} + +message IsCreateTableDoneResponsePB { + // The error, if an error occurred with this request. + optional MasterErrorPB error = 1; + + // true if the create operation is completed, false otherwise + optional bool done = 3; +} + +message DeleteTableRequestPB { + required TableIdentifierPB table = 1; +} + +message DeleteTableResponsePB { + // The error, if an error occurred with this request. + optional MasterErrorPB error = 1; +} + +message ListTablesRequestPB { + // When used, only returns tables that satisfy a substring match on name_filter. + optional string name_filter = 1; +} + +message ListTablesResponsePB { + // The error, if an error occurred with this request. 
+ optional MasterErrorPB error = 1; + + message TableInfo { + required bytes id = 1; + required string name = 2; + } + + repeated TableInfo tables = 2; +} + +message GetTableLocationsRequestPB { + required TableIdentifierPB table = 1; + + // Partition-key range. + optional bytes partition_key_start = 3; + optional bytes partition_key_end = 4; + + optional uint32 max_returned_locations = 5 [ default = 10 ]; +} + +message GetTableLocationsResponsePB { + // The error, if an error occurred with this request. + optional MasterErrorPB error = 1; + + repeated TabletLocationsPB tablet_locations = 2; +} + +message AlterTableRequestPB { + enum StepType { + UNKNOWN = 0; + ADD_COLUMN = 1; + DROP_COLUMN = 2; + RENAME_COLUMN = 3; + + // TODO(KUDU-861): this will subsume RENAME_COLUMN, but not yet implemented + // on the master side. + ALTER_COLUMN = 4; + } + message AddColumn { + // The schema to add. + // NOTE: the 'id' field of the schema should not be provided here -- + // the server will assign an ID. + required ColumnSchemaPB schema = 1; + } + message DropColumn { + // Name of the column to drop. + required string name = 1; + } + message RenameColumn { + // Name of the column to rename; + required string old_name = 1; + required string new_name = 2; + } + + message Step { + optional StepType type = 1 [ default = UNKNOWN ]; + + // Exactly one of the following must be set, based on 'type' + optional AddColumn add_column = 2; + optional DropColumn drop_column = 3; + optional RenameColumn rename_column = 4; + } + + required TableIdentifierPB table = 1; + repeated Step alter_schema_steps = 2; + optional string new_table_name = 3; +} + +message AlterTableResponsePB { + // The error, if an error occurred with this request. + optional MasterErrorPB error = 1; + + optional uint32 schema_version = 2; +} + +message IsAlterTableDoneRequestPB { + required TableIdentifierPB table = 1; +} + +message IsAlterTableDoneResponsePB { + // The error, if an error occurred with this request. 
+ optional MasterErrorPB error = 1; + + // this is the current schema, or the 'new' schema version if an alter is in progress + optional uint32 schema_version = 2; + + // true if the alter operation is completed, false otherwise + optional bool done = 3; +} + +message GetTableSchemaRequestPB { + required TableIdentifierPB table = 1; +} + +message GetTableSchemaResponsePB { + // The error, if an error occurred with this request. + optional MasterErrorPB error = 1; + + // This is the schema that every TS should be able to understand + // if your alter is keeping the schema compatible. + // In case of an alter table in progress, this is the previous schema; + // otherwise it is the latest schema. + optional SchemaPB schema = 2; + + // The table's partition schema. + optional PartitionSchemaPB partition_schema = 5; + + optional int32 num_replicas = 3; + + // The ID of the table. + optional bytes table_id = 4; + + // True if the create operation is completed, false otherwise. + optional bool create_table_done = 6; +} + +// ============================================================================ +// Administration/monitoring +// ============================================================================ + +message ListTabletServersRequestPB { +} + +message ListTabletServersResponsePB { + optional MasterErrorPB error = 1; + + message Entry { + required NodeInstancePB instance_id = 1; + optional TSRegistrationPB registration = 2; + optional int32 millis_since_heartbeat = 3; + } + repeated Entry servers = 2; +} + +// GetMasterRegistrationRequest/Response: get the instance id and +// HTTP/RPC addresses for this Master server. +message GetMasterRegistrationRequestPB { +} + +// TODO: Just use ServerRegistration here. +message GetMasterRegistrationResponsePB { + // Node instance information is always set. + required NodeInstancePB instance_id = 1; + + // These fields are optional, as they won't be set if there's an + // error retrieving the host/port information. 
+ optional ServerRegistrationPB registration = 2; + + // This server's role in the consensus configuration. + optional consensus.RaftPeerPB.Role role = 3; + + // Set if there an error retrieving the registration information. + optional MasterErrorPB error = 4; +} + +// ListMastersRequest/Response: get information about all of the known +// master servers, including this node. +message ListMastersRequestPB { +} + +message ListMastersResponsePB { + // An entry for each individual master server. + repeated ServerEntryPB masters = 1; + + // Set only if there's an error in retrieving the list of servers or + // in getting this server's own local registration information. + optional AppStatusPB error = 2; +} + +service MasterService { + rpc Ping(PingRequestPB) returns (PingResponsePB); + + // TS->Master RPCs + rpc TSHeartbeat(TSHeartbeatRequestPB) returns (TSHeartbeatResponsePB); + + // Client->Master RPCs + rpc GetTabletLocations(GetTabletLocationsRequestPB) returns (GetTabletLocationsResponsePB); + + rpc CreateTable(CreateTableRequestPB) returns (CreateTableResponsePB); + rpc IsCreateTableDone(IsCreateTableDoneRequestPB) returns (IsCreateTableDoneResponsePB); + + rpc DeleteTable(DeleteTableRequestPB) returns (DeleteTableResponsePB); + + rpc AlterTable(AlterTableRequestPB) returns (AlterTableResponsePB); + rpc IsAlterTableDone(IsAlterTableDoneRequestPB) returns (IsAlterTableDoneResponsePB); + + rpc ListTables(ListTablesRequestPB) returns (ListTablesResponsePB); + rpc GetTableLocations(GetTableLocationsRequestPB) returns (GetTableLocationsResponsePB); + rpc GetTableSchema(GetTableSchemaRequestPB) returns (GetTableSchemaResponsePB); + + // Administrative/monitoring RPCs + rpc ListTabletServers(ListTabletServersRequestPB) returns (ListTabletServersResponsePB); + rpc ListMasters(ListMastersRequestPB) returns (ListMastersResponsePB); + rpc GetMasterRegistration(GetMasterRegistrationRequestPB) returns + (GetMasterRegistrationResponsePB); +} diff --git 
a/src/kudu/master/master_main.cc b/src/kudu/master/master_main.cc new file mode 100644 index 000000000000..2539deca7017 --- /dev/null +++ b/src/kudu/master/master_main.cc @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.h" +#include "kudu/util/flags.h" +#include "kudu/util/init.h" +#include "kudu/util/logging.h" + +using kudu::master::Master; + +DECLARE_string(rpc_bind_addresses); +DECLARE_int32(webserver_port); +DECLARE_bool(evict_failed_followers); + +namespace kudu { +namespace master { + +static int MasterMain(int argc, char** argv) { + InitKuduOrDie(); + + // Reset some default values before parsing gflags. + FLAGS_rpc_bind_addresses = strings::Substitute("0.0.0.0:$0", + Master::kDefaultPort); + FLAGS_webserver_port = Master::kDefaultWebPort; + + // A multi-node Master leader should not evict failed Master followers + // because there is no-one to assign replacement servers in order to maintain + // the desired replication factor. (It's not turtles all the way down!) 
+ FLAGS_evict_failed_followers = false; + + ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::cerr << "usage: " << argv[0] << std::endl; + return 1; + } + InitGoogleLoggingSafe(argv[0]); + + MasterOptions opts; + Master server(opts); + LOG(INFO) << "Initializing master server..."; + CHECK_OK(server.Init()); + + LOG(INFO) << "Starting Master server..."; + CHECK_OK(server.Start()); + + LOG(INFO) << "Master server successfully started."; + while (true) { + SleepFor(MonoDelta::FromSeconds(60)); + } + + return 0; +} + +} // namespace master +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::master::MasterMain(argc, argv); +} diff --git a/src/kudu/master/master_options.cc b/src/kudu/master/master_options.cc new file mode 100644 index 000000000000..53e8c103daa6 --- /dev/null +++ b/src/kudu/master/master_options.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/master/master_options.h" + +#include +#include + +#include "kudu/master/master.h" +#include "kudu/util/flag_tags.h" + +namespace kudu { +namespace master { + +DEFINE_string(master_addresses, "", + "Comma-separated list of all the RPC addresses for Master config. 
" + "This is used to configure the replicated Master process " + "(currently considered experimental). " + "NOTE: if not specified, configures a non-replicated Master."); +TAG_FLAG(master_addresses, experimental); + +MasterOptions::MasterOptions() { + rpc_opts.default_port = Master::kDefaultPort; + + if (!FLAGS_master_addresses.empty()) { + Status s = HostPort::ParseStrings(FLAGS_master_addresses, Master::kDefaultPort, + &master_addresses); + if (!s.ok()) { + LOG(FATAL) << "Couldn't parse the master_addresses flag('" << FLAGS_master_addresses << "'): " + << s.ToString(); + } + if (master_addresses.size() < 2) { + LOG(FATAL) << "At least 2 masters are required for a distributed config, but " + "master_addresses flag ('" << FLAGS_master_addresses << "') only specifies " + << master_addresses.size() << " masters."; + } + if (master_addresses.size() == 2) { + LOG(WARNING) << "Only 2 masters are specified by master_addresses_flag ('" << + FLAGS_master_addresses << "'), but minimum of 3 are required to tolerate failures" + " of any one master. It is recommended to use at least 3 masters."; + } + } +} + +bool MasterOptions::IsDistributed() const { + return !master_addresses.empty(); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/master_options.h b/src/kudu/master/master_options.h new file mode 100644 index 000000000000..e1a36a1e4c2a --- /dev/null +++ b/src/kudu/master/master_options.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_MASTER_MASTER_OPTIONS_H +#define KUDU_MASTER_MASTER_OPTIONS_H + +#include + +#include "kudu/server/server_base_options.h" +#include "kudu/util/net/net_util.h" + +namespace kudu { +namespace master { + +// Options for constructing the master. +// These are filled in by gflags by default -- see the .cc file for +// the list of options and corresponding flags. +struct MasterOptions : public server::ServerBaseOptions { + MasterOptions(); + + std::vector master_addresses; + + bool IsDistributed() const; +}; + +} // namespace master +} // namespace kudu +#endif /* KUDU_MASTER_MASTER_OPTIONS_H */ diff --git a/src/kudu/master/master_rpc.cc b/src/kudu/master/master_rpc.cc new file mode 100644 index 000000000000..60290a8e366c --- /dev/null +++ b/src/kudu/master/master_rpc.cc @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This module is internal to the client and not a public API. + +#include "kudu/master/master_rpc.h" + +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/gutil/bind.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.proxy.h" +#include "kudu/util/net/net_util.h" + + +using std::shared_ptr; +using std::string; +using std::vector; + +using kudu::consensus::RaftPeerPB; +using kudu::rpc::Messenger; +using kudu::rpc::Rpc; + +namespace kudu { +namespace master { + +//////////////////////////////////////////////////////////// +// GetMasterRegistrationRpc +//////////////////////////////////////////////////////////// + +GetMasterRegistrationRpc::GetMasterRegistrationRpc( + StatusCallback user_cb, Sockaddr addr, const MonoTime& deadline, + const shared_ptr& messenger, ServerEntryPB* out) + : Rpc(deadline, messenger), + user_cb_(std::move(user_cb)), + addr_(std::move(addr)), + out_(DCHECK_NOTNULL(out)) {} + +GetMasterRegistrationRpc::~GetMasterRegistrationRpc() { +} + +void GetMasterRegistrationRpc::SendRpc() { + MasterServiceProxy proxy(retrier().messenger(), + addr_); + GetMasterRegistrationRequestPB req; + proxy.GetMasterRegistrationAsync(req, &resp_, + mutable_retrier()->mutable_controller(), + boost::bind(&GetMasterRegistrationRpc::SendRpcCb, + this, + Status::OK())); +} + +string GetMasterRegistrationRpc::ToString() const { + return strings::Substitute("GetMasterRegistrationRpc(address: $0, num_attempts: $1)", + addr_.ToString(), num_attempts()); +} + +void GetMasterRegistrationRpc::SendRpcCb(const Status& status) { + gscoped_ptr deleter(this); + Status new_status = status; + if (new_status.ok() && mutable_retrier()->HandleResponse(this, &new_status)) { + ignore_result(deleter.release()); + return; + } + if (new_status.ok() && 
resp_.has_error()) { + if (resp_.error().code() == MasterErrorPB::CATALOG_MANAGER_NOT_INITIALIZED) { + // If CatalogManager is not initialized, treat the node as a + // FOLLOWER for the time being, as currently this RPC is only + // used for the purposes of finding the leader master. + resp_.set_role(RaftPeerPB::FOLLOWER); + new_status = Status::OK(); + } else { + out_->mutable_error()->CopyFrom(resp_.error().status()); + new_status = StatusFromPB(resp_.error().status()); + } + } + if (new_status.ok()) { + out_->mutable_instance_id()->CopyFrom(resp_.instance_id()); + out_->mutable_registration()->CopyFrom(resp_.registration()); + out_->set_role(resp_.role()); + } + user_cb_.Run(new_status); +} + +//////////////////////////////////////////////////////////// +// GetLeaderMasterRpc +//////////////////////////////////////////////////////////// + +GetLeaderMasterRpc::GetLeaderMasterRpc(LeaderCallback user_cb, + vector addrs, + const MonoTime& deadline, + const shared_ptr& messenger) + : Rpc(deadline, messenger), + user_cb_(std::move(user_cb)), + addrs_(std::move(addrs)), + pending_responses_(0), + completed_(false) { + DCHECK(deadline.Initialized()); + + // Using resize instead of reserve to explicitly initialized the + // values. 
+ responses_.resize(addrs_.size()); +} + +GetLeaderMasterRpc::~GetLeaderMasterRpc() { +} + +string GetLeaderMasterRpc::ToString() const { + vector sockaddr_str; + for (const Sockaddr& addr : addrs_) { + sockaddr_str.push_back(addr.ToString()); + } + return strings::Substitute("GetLeaderMasterRpc(addrs: $0, num_attempts: $1)", + JoinStrings(sockaddr_str, ","), + num_attempts()); +} + +void GetLeaderMasterRpc::SendRpc() { + lock_guard l(&lock_); + for (int i = 0; i < addrs_.size(); i++) { + GetMasterRegistrationRpc* rpc = new GetMasterRegistrationRpc( + Bind(&GetLeaderMasterRpc::GetMasterRegistrationRpcCbForNode, + this, ConstRef(addrs_[i]), ConstRef(responses_[i])), + addrs_[i], + retrier().deadline(), + retrier().messenger(), + &responses_[i]); + rpc->SendRpc(); + ++pending_responses_; + } +} + +void GetLeaderMasterRpc::SendRpcCb(const Status& status) { + // If we've received replies from all of the nodes without finding + // the leader, or if there were network errors talking to all of the + // nodes the error is retriable and we can perform a delayed retry. + if (status.IsNetworkError() || status.IsNotFound()) { + // TODO (KUDU-573): Allow cancelling delayed tasks on reactor so + // that we can safely use DelayedRetry here. + mutable_retrier()->DelayedRetryCb(this, Status::OK()); + return; + } + { + lock_guard l(&lock_); + // 'completed_' prevents 'user_cb_' from being invoked twice. + if (completed_) { + return; + } + completed_ = true; + } + user_cb_.Run(status, leader_master_); +} + +void GetLeaderMasterRpc::GetMasterRegistrationRpcCbForNode(const Sockaddr& node_addr, + const ServerEntryPB& resp, + const Status& status) { + // TODO: handle the situation where one Master is partitioned from + // the rest of the Master consensus configuration, all are reachable by the client, + // and the partitioned node "thinks" it's the leader. 
+ // + // The proper way to do so is to add term/index to the responses + // from the Master, wait for majority of the Masters to respond, and + // pick the one with the highest term/index as the leader. + Status new_status = status; + { + lock_guard lock(&lock_); + if (completed_) { + // If 'user_cb_' has been invoked (see SendRpcCb above), we can + // stop. + return; + } + if (new_status.ok()) { + if (resp.role() != RaftPeerPB::LEADER) { + // Use a Status::NotFound() to indicate that the node is not + // the leader: this way, we can handle the case where we've + // received a reply from all of the nodes in the cluster (no + // network or other errors encountered), but haven't found a + // leader (which means that SendRpcCb() above can perform a + // delayed retry). + new_status = Status::NotFound("no leader found: " + ToString()); + } else { + // We've found a leader. + leader_master_ = HostPort(node_addr); + } + } + --pending_responses_; + if (!new_status.ok()) { + if (pending_responses_ > 0) { + // Don't call SendRpcCb() on error unless we're the last + // outstanding response: calling SendRpcCb() will trigger + // a delayed re-try, which don't need to do unless we've + // been unable to find a leader so far. + return; + } + } + } + // Called if the leader has been determined, or if we've received + // all of the responses. + SendRpcCb(new_status); +} + + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/master_rpc.h b/src/kudu/master/master_rpc.h new file mode 100644 index 000000000000..d7ffc5d87af3 --- /dev/null +++ b/src/kudu/master/master_rpc.h @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This module is internal to the client and not a public API. +#ifndef KUDU_MASTER_MASTER_RPC_H +#define KUDU_MASTER_MASTER_RPC_H + +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/master/master.pb.h" +#include "kudu/rpc/rpc.h" +#include "kudu/util/locks.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" + + +namespace kudu { + +class ServerEntryPB; +class HostPort; + +namespace master { + +// An RPC for getting a Master server's registration. +class GetMasterRegistrationRpc : public rpc::Rpc { + public: + + // Create a wrapper object for a retriable GetMasterRegistration RPC + // to 'addr'. The result is stored in 'out', which must be a valid + // pointer for the lifetime of this object. + // + // Invokes 'user_cb' upon failure or success of the RPC call. 
+ GetMasterRegistrationRpc(StatusCallback user_cb, Sockaddr addr, + const MonoTime& deadline, + const std::shared_ptr& messenger, + ServerEntryPB* out); + + ~GetMasterRegistrationRpc(); + + virtual void SendRpc() OVERRIDE; + + virtual std::string ToString() const OVERRIDE; + + private: + virtual void SendRpcCb(const Status& status) OVERRIDE; + + StatusCallback user_cb_; + Sockaddr addr_; + + ServerEntryPB* out_; + + GetMasterRegistrationResponsePB resp_; +}; + +// In parallel, send requests to the specified Master servers until a +// response comes back from the leader of the Master consensus configuration. +// +// If queries have been made to all of the specified servers, but no +// leader has been found, we re-try again (with an increasing delay, +// see: RpcRetrier in kudu/rpc/rpc.{cc,h}) until a specified deadline +// passes or we find a leader. +// +// The RPCs are sent in parallel in order to avoid prolonged delays on +// the client-side that would happen with a serial approach when one +// of the Master servers is slow or stopped (that is, when we have to +// wait for an RPC request to server N to timeout before we can make +// an RPC request to server N+1). This allows for true fault tolerance +// for the Kudu client. +// +// The class is reference counted to avoid a "use-after-free" +// scenario, when responses to the RPC return to the caller _after_ a +// leader has already been found. +class GetLeaderMasterRpc : public rpc::Rpc, + public RefCountedThreadSafe { + public: + typedef Callback LeaderCallback; + // The host and port of the leader master server is stored in + // 'leader_master', which must remain valid for the lifetime of this + // object. + // + // Calls 'user_cb' when the leader is found, or if no leader can be + // found until 'deadline' passes. 
+ GetLeaderMasterRpc(LeaderCallback user_cb, std::vector addrs, + const MonoTime& deadline, + const std::shared_ptr& messenger); + + virtual void SendRpc() OVERRIDE; + + virtual std::string ToString() const OVERRIDE; + private: + friend class RefCountedThreadSafe; + ~GetLeaderMasterRpc(); + + virtual void SendRpcCb(const Status& status) OVERRIDE; + + // Invoked when a response comes back from a Master with address + // 'node_addr'. + // + // Invokes SendRpcCb if the response indicates that the specified + // master is a leader, or if responses have been received from all + // of the Masters. + void GetMasterRegistrationRpcCbForNode(const Sockaddr& node_addr, + const ServerEntryPB& resp, + const Status& status); + + LeaderCallback user_cb_; + std::vector addrs_; + + HostPort leader_master_; + + // The received responses. + // + // See also: GetMasterRegistrationRpc above. + std::vector responses_; + + // Number of pending responses. + int pending_responses_; + + // If true, then we've already executed the user callback and the + // RPC can be deallocated. + bool completed_; + + // Protects 'pending_responses_' and 'completed_'. + mutable simple_spinlock lock_; +}; + +} // namespace master +} // namespace kudu + +#endif /* KUDU_MASTER_MASTER_RPC_H */ diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc new file mode 100644 index 000000000000..4ea77d543ad3 --- /dev/null +++ b/src/kudu/master/master_service.cc @@ -0,0 +1,383 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/master/master_service.h" + +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/master/ts_manager.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/server/webserver.h" +#include "kudu/util/flag_tags.h" + + +DEFINE_int32(master_inject_latency_on_tablet_lookups_ms, 0, + "Number of milliseconds that the master will sleep before responding to " + "requests for tablet locations."); +TAG_FLAG(master_inject_latency_on_tablet_lookups_ms, unsafe); +TAG_FLAG(master_inject_latency_on_tablet_lookups_ms, hidden); + +namespace kudu { +namespace master { + +using consensus::RaftPeerPB; +using std::string; +using std::vector; +using std::shared_ptr; + +namespace { + +template +bool CheckCatalogManagerInitializedOrRespond(Master* master, + RespClass* resp, + rpc::RpcContext* rpc) { + if (PREDICT_FALSE(!master->catalog_manager()->IsInitialized())) { + SetupErrorAndRespond(resp->mutable_error(), + Status::ServiceUnavailable("catalog manager has not been initialized"), + MasterErrorPB::CATALOG_MANAGER_NOT_INITIALIZED, + rpc); + return false; + } + return true; +} + +template +bool CheckIsLeaderOrRespond(Master* master, + RespClass* resp, + rpc::RpcContext* rpc) { + Status s = master->catalog_manager()->CheckIsLeaderAndReady(); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + MasterErrorPB::NOT_THE_LEADER, + rpc); + return false; + } + return true; +} + 
+template +bool CheckLeaderAndCatalogManagerInitializedOrRespond(Master* master, + RespClass* resp, + rpc::RpcContext* rpc) { + return PREDICT_TRUE(CheckCatalogManagerInitializedOrRespond(master, resp, rpc) && + CheckIsLeaderOrRespond(master, resp, rpc)); +} + +// If 's' is not OK and 'resp' has no application specific error set, +// set the error field of 'resp' to match 's' and set the code to +// UNKNOWN_ERROR. +template +void CheckRespErrorOrSetUnknown(const Status& s, RespClass* resp) { + if (PREDICT_FALSE(!s.ok() && !resp->has_error())) { + StatusToPB(s, resp->mutable_error()->mutable_status()); + resp->mutable_error()->set_code(MasterErrorPB::UNKNOWN_ERROR); + } +} + +} // anonymous namespace + +static void SetupErrorAndRespond(MasterErrorPB* error, + const Status& s, + MasterErrorPB::Code code, + rpc::RpcContext* rpc) { + StatusToPB(s, error->mutable_status()); + error->set_code(code); + // TODO RespondSuccess() is better called 'Respond'. + rpc->RespondSuccess(); +} + + +MasterServiceImpl::MasterServiceImpl(Master* server) + : MasterServiceIf(server->metric_entity()), + server_(server) { +} + +void MasterServiceImpl::Ping(const PingRequestPB* req, + PingResponsePB* resp, + rpc::RpcContext* rpc) { + rpc->RespondSuccess(); +} + +void MasterServiceImpl::TSHeartbeat(const TSHeartbeatRequestPB* req, + TSHeartbeatResponsePB* resp, + rpc::RpcContext* rpc) { + // If CatalogManager is not initialized don't even know whether + // or not we will be a leader (so we can't tell whether or not we can + // accept tablet reports). + if (!CheckCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + resp->mutable_master_instance()->CopyFrom(server_->instance_pb()); + Status s = server_->catalog_manager()->CheckIsLeaderAndReady(); + if (!s.ok()) { + // For the time being, ignore heartbeats sent to non-leader distributed + // masters. 
+ // + // TODO KUDU-493 Allow all master processes to receive heartbeat + // information: by having the TabletServers send heartbeats to all + // masters, or by storing heartbeat information in a replicated + // SysTable. + LOG(WARNING) << "Received a heartbeat, but this Master instance is not a leader or a " + << "single Master: " << s.ToString(); + resp->set_leader_master(false); + rpc->RespondSuccess(); + return; + } + resp->set_leader_master(true); + + shared_ptr ts_desc; + // If the TS is registering, register in the TS manager. + if (req->has_registration()) { + Status s = server_->ts_manager()->RegisterTS(req->common().ts_instance(), + req->registration(), + &ts_desc); + if (!s.ok()) { + LOG(WARNING) << "Unable to register tablet server (" << rpc->requestor_string() << "): " + << s.ToString(); + // TODO: add service-specific errors + rpc->RespondFailure(s); + return; + } + } + + // TODO: KUDU-86 if something fails after this point the TS will not be able + // to register again. + + // Look up the TS -- if it just registered above, it will be found here. + // This allows the TS to register and tablet-report in the same RPC. 
+ s = server_->ts_manager()->LookupTS(req->common().ts_instance(), &ts_desc); + if (s.IsNotFound()) { + LOG(INFO) << "Got heartbeat from unknown tablet server { " + << req->common().ts_instance().ShortDebugString() + << " } as " << rpc->requestor_string() + << "; Asking this server to re-register."; + resp->set_needs_reregister(true); + resp->set_needs_full_tablet_report(true); + rpc->RespondSuccess(); + return; + } else if (!s.ok()) { + LOG(WARNING) << "Unable to look up tablet server for heartbeat request " + << req->DebugString() << " from " << rpc->requestor_string() + << "\nStatus: " << s.ToString(); + rpc->RespondFailure(s.CloneAndPrepend("Unable to lookup TS")); + return; + } + + ts_desc->UpdateHeartbeatTime(); + ts_desc->set_num_live_replicas(req->num_live_tablets()); + + if (req->has_tablet_report()) { + s = server_->catalog_manager()->ProcessTabletReport( + ts_desc.get(), req->tablet_report(), resp->mutable_tablet_report(), rpc); + if (!s.ok()) { + rpc->RespondFailure(s.CloneAndPrepend("Failed to process tablet report")); + return; + } + } + + if (!ts_desc->has_tablet_report()) { + resp->set_needs_full_tablet_report(true); + } + + rpc->RespondSuccess(); +} + +void MasterServiceImpl::GetTabletLocations(const GetTabletLocationsRequestPB* req, + GetTabletLocationsResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + if (PREDICT_FALSE(FLAGS_master_inject_latency_on_tablet_lookups_ms > 0)) { + SleepFor(MonoDelta::FromMilliseconds(FLAGS_master_inject_latency_on_tablet_lookups_ms)); + } + + TSRegistrationPB reg; + vector locs; + for (const string& tablet_id : req->tablet_ids()) { + // TODO: once we have catalog data. ACL checks would also go here, probably. 
+ TabletLocationsPB* locs_pb = resp->add_tablet_locations(); + Status s = server_->catalog_manager()->GetTabletLocations(tablet_id, locs_pb); + if (!s.ok()) { + resp->mutable_tablet_locations()->RemoveLast(); + + GetTabletLocationsResponsePB::Error* err = resp->add_errors(); + err->set_tablet_id(tablet_id); + StatusToPB(s, err->mutable_status()); + } + } + + rpc->RespondSuccess(); +} + +void MasterServiceImpl::CreateTable(const CreateTableRequestPB* req, + CreateTableResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->CreateTable(req, resp, rpc); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::IsCreateTableDone(const IsCreateTableDoneRequestPB* req, + IsCreateTableDoneResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->IsCreateTableDone(req, resp); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::DeleteTable(const DeleteTableRequestPB* req, + DeleteTableResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->DeleteTable(req, resp, rpc); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::AlterTable(const AlterTableRequestPB* req, + AlterTableResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->AlterTable(req, resp, rpc); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::IsAlterTableDone(const IsAlterTableDoneRequestPB* req, + IsAlterTableDoneResponsePB* resp, + rpc::RpcContext* rpc) { + if 
(!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->IsAlterTableDone(req, resp, rpc); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::ListTables(const ListTablesRequestPB* req, + ListTablesResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->ListTables(req, resp); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::GetTableLocations(const GetTableLocationsRequestPB* req, + GetTableLocationsResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + if (PREDICT_FALSE(FLAGS_master_inject_latency_on_tablet_lookups_ms > 0)) { + SleepFor(MonoDelta::FromMilliseconds(FLAGS_master_inject_latency_on_tablet_lookups_ms)); + } + Status s = server_->catalog_manager()->GetTableLocations(req, resp); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::GetTableSchema(const GetTableSchemaRequestPB* req, + GetTableSchemaResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + Status s = server_->catalog_manager()->GetTableSchema(req, resp); + CheckRespErrorOrSetUnknown(s, resp); + rpc->RespondSuccess(); +} + +void MasterServiceImpl::ListTabletServers(const ListTabletServersRequestPB* req, + ListTabletServersResponsePB* resp, + rpc::RpcContext* rpc) { + if (!CheckLeaderAndCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + + vector > descs; + server_->ts_manager()->GetAllDescriptors(&descs); + for (const std::shared_ptr& desc : descs) { + ListTabletServersResponsePB::Entry* entry = resp->add_servers(); + desc->GetNodeInstancePB(entry->mutable_instance_id()); + 
desc->GetRegistration(entry->mutable_registration()); + entry->set_millis_since_heartbeat(desc->TimeSinceHeartbeat().ToMilliseconds()); + } + rpc->RespondSuccess(); +} + +void MasterServiceImpl::ListMasters(const ListMastersRequestPB* req, + ListMastersResponsePB* resp, + rpc::RpcContext* rpc) { + vector masters; + Status s = server_->ListMasters(&masters); + if (!s.ok()) { + StatusToPB(s, resp->mutable_error()); + resp->mutable_error()->set_code(AppStatusPB::UNKNOWN_ERROR); + } else { + for (const ServerEntryPB& master : masters) { + resp->add_masters()->CopyFrom(master); + } + } + rpc->RespondSuccess(); +} + +void MasterServiceImpl::GetMasterRegistration(const GetMasterRegistrationRequestPB* req, + GetMasterRegistrationResponsePB* resp, + rpc::RpcContext* rpc) { + // instance_id must always be set in order for status pages to be useful. + resp->mutable_instance_id()->CopyFrom(server_->instance_pb()); + if (!CheckCatalogManagerInitializedOrRespond(server_, resp, rpc)) { + return; + } + Status s = server_->GetMasterRegistration(resp->mutable_registration()); + CheckRespErrorOrSetUnknown(s, resp); + resp->set_role(server_->catalog_manager()->Role()); + rpc->RespondSuccess(); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/master_service.h b/src/kudu/master/master_service.h new file mode 100644 index 000000000000..eddeb43ddff0 --- /dev/null +++ b/src/kudu/master/master_service.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_MASTER_MASTER_SERVICE_H +#define KUDU_MASTER_MASTER_SERVICE_H + +#include "kudu/gutil/macros.h" +#include "kudu/master/master.service.h" +#include "kudu/util/metrics.h" + +namespace kudu { + +class NodeInstancePB; + +namespace master { + +class Master; +class TSDescriptor; + +// Implementation of the master service. See master.proto for docs +// on each RPC. +class MasterServiceImpl : public MasterServiceIf { + public: + explicit MasterServiceImpl(Master* server); + + virtual void Ping(const PingRequestPB* req, + PingResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void TSHeartbeat(const TSHeartbeatRequestPB* req, + TSHeartbeatResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void GetTabletLocations(const GetTabletLocationsRequestPB* req, + GetTabletLocationsResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void CreateTable(const CreateTableRequestPB* req, + CreateTableResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void IsCreateTableDone(const IsCreateTableDoneRequestPB* req, + IsCreateTableDoneResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void DeleteTable(const DeleteTableRequestPB* req, + DeleteTableResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void AlterTable(const AlterTableRequestPB* req, + AlterTableResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void IsAlterTableDone(const IsAlterTableDoneRequestPB* req, + IsAlterTableDoneResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void ListTables(const ListTablesRequestPB* 
req, + ListTablesResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void GetTableLocations(const GetTableLocationsRequestPB* req, + GetTableLocationsResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void GetTableSchema(const GetTableSchemaRequestPB* req, + GetTableSchemaResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + virtual void ListTabletServers(const ListTabletServersRequestPB* req, + ListTabletServersResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void ListMasters(const ListMastersRequestPB* req, + ListMastersResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void GetMasterRegistration(const GetMasterRegistrationRequestPB* req, + GetMasterRegistrationResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + private: + Master* server_; + + DISALLOW_COPY_AND_ASSIGN(MasterServiceImpl); +}; + +} // namespace master +} // namespace kudu + +#endif diff --git a/src/kudu/master/mini_master.cc b/src/kudu/master/mini_master.cc new file mode 100644 index 000000000000..2739d628cc3e --- /dev/null +++ b/src/kudu/master/mini_master.cc @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/master/mini_master.h" + +#include + +#include + +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/rpc_server.h" +#include "kudu/server/webserver.h" +#include "kudu/master/master.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +using strings::Substitute; + +DECLARE_bool(rpc_server_allow_ephemeral_ports); + +namespace kudu { +namespace master { + +MiniMaster::MiniMaster(Env* env, string fs_root, uint16_t rpc_port) + : running_(false), + env_(env), + fs_root_(std::move(fs_root)), + rpc_port_(rpc_port) {} + +MiniMaster::~MiniMaster() { + CHECK(!running_); +} + +Status MiniMaster::Start() { + CHECK(!running_); + FLAGS_rpc_server_allow_ephemeral_ports = true; + RETURN_NOT_OK(StartOnPorts(rpc_port_, 0)); + return master_->WaitForCatalogManagerInit(); +} + + +Status MiniMaster::StartDistributedMaster(const vector& peer_ports) { + CHECK(!running_); + return StartDistributedMasterOnPorts(rpc_port_, 0, peer_ports); +} + +void MiniMaster::Shutdown() { + if (running_) { + master_->Shutdown(); + } + running_ = false; + master_.reset(); +} + +Status MiniMaster::StartOnPorts(uint16_t rpc_port, uint16_t web_port) { + CHECK(!running_); + CHECK(!master_); + + MasterOptions opts; + return StartOnPorts(rpc_port, web_port, &opts); +} + +Status MiniMaster::StartOnPorts(uint16_t rpc_port, uint16_t web_port, + MasterOptions* opts) { + opts->rpc_opts.rpc_bind_addresses = Substitute("127.0.0.1:$0", rpc_port); + opts->webserver_opts.port = web_port; + opts->fs_opts.wal_path = fs_root_; + opts->fs_opts.data_paths = { fs_root_ }; + + gscoped_ptr server(new Master(*opts)); + RETURN_NOT_OK(server->Init()); + RETURN_NOT_OK(server->StartAsync()); + + master_.swap(server); + running_ = true; + + return Status::OK(); +} + +Status MiniMaster::StartDistributedMasterOnPorts(uint16_t rpc_port, uint16_t web_port, + const vector& peer_ports) { + CHECK(!running_); + CHECK(!master_); + + MasterOptions opts; + + 
vector peer_addresses; + for (uint16_t peer_port : peer_ports) { + HostPort peer_address("127.0.0.1", peer_port); + peer_addresses.push_back(peer_address); + } + opts.master_addresses = peer_addresses; + + return StartOnPorts(rpc_port, web_port, &opts); +} + +Status MiniMaster::Restart() { + CHECK(running_); + + Sockaddr prev_rpc = bound_rpc_addr(); + Sockaddr prev_http = bound_http_addr(); + Shutdown(); + + RETURN_NOT_OK(StartOnPorts(prev_rpc.port(), prev_http.port())); + CHECK(running_); + return WaitForCatalogManagerInit(); +} + +Status MiniMaster::WaitForCatalogManagerInit() { + return master_->WaitForCatalogManagerInit(); +} + +const Sockaddr MiniMaster::bound_rpc_addr() const { + CHECK(running_); + return master_->first_rpc_address(); +} + +const Sockaddr MiniMaster::bound_http_addr() const { + CHECK(running_); + return master_->first_http_address(); +} + +std::string MiniMaster::permanent_uuid() const { + CHECK(master_); + return DCHECK_NOTNULL(master_->fs_manager())->uuid(); +} + +std::string MiniMaster::bound_rpc_addr_str() const { + return bound_rpc_addr().ToString(); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/mini_master.h b/src/kudu/master/mini_master.h new file mode 100644 index 000000000000..3d1f8015edca --- /dev/null +++ b/src/kudu/master/mini_master.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_MASTER_MINI_MASTER_H +#define KUDU_MASTER_MINI_MASTER_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/util/env.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +namespace kudu { + +class HostPort; + +namespace master { + +class Master; +struct MasterOptions; + +// An in-process Master meant for use in test cases. +// +// TODO: Store the distributed cluster configuration in the object, to avoid +// having multiple Start methods. +class MiniMaster { + public: + MiniMaster(Env* env, std::string fs_root, uint16_t rpc_port); + ~MiniMaster(); + + // Start a master running on the loopback interface and + // an ephemeral port. To determine the address that the server + // bound to, call MiniMaster::bound_addr() + Status Start(); + + Status StartDistributedMaster(const std::vector& peer_ports); + + Status WaitForCatalogManagerInit(); + + void Shutdown(); + + // Restart the master on the same ports as it was previously bound. + // Requires that the master is currently started. + Status Restart(); + + const Sockaddr bound_rpc_addr() const; + const Sockaddr bound_http_addr() const; + + const Master* master() const { return master_.get(); } + Master* master() { return master_.get(); } + + // Return UUID of this mini master. 
+ std::string permanent_uuid() const; + + std::string bound_rpc_addr_str() const; + + private: + Status StartDistributedMasterOnPorts(uint16_t rpc_port, uint16_t web_port, + const std::vector& peer_ports); + + Status StartOnPorts(uint16_t rpc_port, uint16_t web_port); + + Status StartOnPorts(uint16_t rpc_port, uint16_t web_port, + MasterOptions* options); + + bool running_; + + ATTRIBUTE_MEMBER_UNUSED Env* const env_; + const std::string fs_root_; + const uint16_t rpc_port_; + + gscoped_ptr master_; +}; + +} // namespace master +} // namespace kudu + +#endif /* KUDU_MASTER_MINI_MASTER_H */ diff --git a/src/kudu/master/sys_catalog-test.cc b/src/kudu/master/sys_catalog-test.cc new file mode 100644 index 000000000000..51de9b015103 --- /dev/null +++ b/src/kudu/master/sys_catalog-test.cc @@ -0,0 +1,370 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master.h" +#include "kudu/master/master.proxy.h" +#include "kudu/master/mini_master.h" +#include "kudu/master/sys_catalog.h" +#include "kudu/server/rpc_server.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" +#include "kudu/rpc/messenger.h" + +using std::string; +using std::shared_ptr; +using kudu::rpc::Messenger; +using kudu::rpc::MessengerBuilder; +using kudu::rpc::RpcController; + +namespace kudu { +namespace master { + +class SysCatalogTest : public KuduTest { + protected: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + // Start master + mini_master_.reset(new MiniMaster(Env::Default(), GetTestPath("Master"), 0)); + ASSERT_OK(mini_master_->Start()); + master_ = mini_master_->master(); + ASSERT_OK(master_->WaitUntilCatalogManagerIsLeaderAndReadyForTests(MonoDelta::FromSeconds(5))); + + // Create a client proxy to it. 
+ MessengerBuilder bld("Client"); + ASSERT_OK(bld.Build(&client_messenger_)); + proxy_.reset(new MasterServiceProxy(client_messenger_, mini_master_->bound_rpc_addr())); + } + + virtual void TearDown() OVERRIDE { + mini_master_->Shutdown(); + KuduTest::TearDown(); + } + + shared_ptr client_messenger_; + gscoped_ptr mini_master_; + Master* master_; + gscoped_ptr proxy_; +}; + +class TableLoader : public TableVisitor { + public: + TableLoader() {} + ~TableLoader() { Reset(); } + + void Reset() { + for (TableInfo* ti : tables) { + ti->Release(); + } + tables.clear(); + } + + virtual Status VisitTable(const std::string& table_id, + const SysTablesEntryPB& metadata) OVERRIDE { + // Setup the table info + TableInfo *table = new TableInfo(table_id); + TableMetadataLock l(table, TableMetadataLock::WRITE); + l.mutable_data()->pb.CopyFrom(metadata); + l.Commit(); + table->AddRef(); + tables.push_back(table); + return Status::OK(); + } + + vector tables; +}; + +static bool PbEquals(const google::protobuf::Message& a, const google::protobuf::Message& b) { + return a.DebugString() == b.DebugString(); +} + +template +static bool MetadatasEqual(C* ti_a, C* ti_b) { + MetadataLock l_a(ti_a, MetadataLock::READ); + MetadataLock l_b(ti_a, MetadataLock::READ); + return PbEquals(l_a.data().pb, l_b.data().pb); +} + +// Test the sys-catalog tables basic operations (add, update, delete, +// visit) +TEST_F(SysCatalogTest, TestSysCatalogTablesOperations) { + TableLoader loader; + ASSERT_OK(master_->catalog_manager()->sys_catalog()->VisitTables(&loader)); + ASSERT_EQ(0, loader.tables.size()); + + // Create new table. 
+ scoped_refptr table(new TableInfo("abc")); + { + TableMetadataLock l(table.get(), TableMetadataLock::WRITE); + l.mutable_data()->pb.set_name("testtb"); + l.mutable_data()->pb.set_version(0); + l.mutable_data()->pb.set_num_replicas(1); + l.mutable_data()->pb.set_state(SysTablesEntryPB::PREPARING); + ASSERT_OK(SchemaToPB(Schema(), l.mutable_data()->pb.mutable_schema())); + // Add the table + ASSERT_OK(master_->catalog_manager()->sys_catalog()->AddTable(table.get())); + l.Commit(); + } + + // Verify it showed up. + loader.Reset(); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->VisitTables(&loader)); + ASSERT_EQ(1, loader.tables.size()); + ASSERT_TRUE(MetadatasEqual(table.get(), loader.tables[0])); + + // Update the table + { + TableMetadataLock l(table.get(), TableMetadataLock::WRITE); + l.mutable_data()->pb.set_version(1); + l.mutable_data()->pb.set_state(SysTablesEntryPB::REMOVED); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->UpdateTable(table.get())); + l.Commit(); + } + + loader.Reset(); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->VisitTables(&loader)); + ASSERT_EQ(1, loader.tables.size()); + ASSERT_TRUE(MetadatasEqual(table.get(), loader.tables[0])); + + // Delete the table + loader.Reset(); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->DeleteTable(table.get())); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->VisitTables(&loader)); + ASSERT_EQ(0, loader.tables.size()); +} + +// Verify that data mutations are not available from metadata() until commit. +TEST_F(SysCatalogTest, TestTableInfoCommit) { + scoped_refptr table(new TableInfo("123")); + + // Mutate the table, under the write lock. + TableMetadataLock writer_lock(table.get(), TableMetadataLock::WRITE); + writer_lock.mutable_data()->pb.set_name("foo"); + + // Changes should not be visible to a reader. + // The reader can still lock for read, since readers don't block + // writers in the RWC lock. 
+ { + TableMetadataLock reader_lock(table.get(), TableMetadataLock::READ); + ASSERT_NE("foo", reader_lock.data().name()); + } + writer_lock.mutable_data()->set_state(SysTablesEntryPB::RUNNING, "running"); + + + { + TableMetadataLock reader_lock(table.get(), TableMetadataLock::READ); + ASSERT_NE("foo", reader_lock.data().pb.name()); + ASSERT_NE("running", reader_lock.data().pb.state_msg()); + ASSERT_NE(SysTablesEntryPB::RUNNING, reader_lock.data().pb.state()); + } + + // Commit the changes + writer_lock.Commit(); + + // Verify that the data is visible + { + TableMetadataLock reader_lock(table.get(), TableMetadataLock::READ); + ASSERT_EQ("foo", reader_lock.data().pb.name()); + ASSERT_EQ("running", reader_lock.data().pb.state_msg()); + ASSERT_EQ(SysTablesEntryPB::RUNNING, reader_lock.data().pb.state()); + } +} + +class TabletLoader : public TabletVisitor { + public: + TabletLoader() {} + ~TabletLoader() { Reset(); } + + void Reset() { + for (TabletInfo* ti : tablets) { + ti->Release(); + } + tablets.clear(); + } + + virtual Status VisitTablet(const std::string& table_id, + const std::string& tablet_id, + const SysTabletsEntryPB& metadata) OVERRIDE { + // Setup the tablet info + TabletInfo *tablet = new TabletInfo(nullptr, tablet_id); + TabletMetadataLock l(tablet, TabletMetadataLock::WRITE); + l.mutable_data()->pb.CopyFrom(metadata); + l.Commit(); + tablet->AddRef(); + tablets.push_back(tablet); + return Status::OK(); + } + + vector tablets; +}; + +// Create a new TabletInfo. The object is in uncommitted +// state. 
+static TabletInfo *CreateTablet(TableInfo *table, + const string& tablet_id, + const string& start_key, + const string& end_key) { + TabletInfo *tablet = new TabletInfo(table, tablet_id); + TabletMetadataLock l(tablet, TabletMetadataLock::WRITE); + l.mutable_data()->pb.set_state(SysTabletsEntryPB::PREPARING); + l.mutable_data()->pb.mutable_partition()->set_partition_key_start(start_key); + l.mutable_data()->pb.mutable_partition()->set_partition_key_end(end_key); + l.mutable_data()->pb.set_table_id(table->id()); + l.Commit(); + return tablet; +} + +// Test the sys-catalog tablets basic operations (add, update, delete, +// visit) +TEST_F(SysCatalogTest, TestSysCatalogTabletsOperations) { + scoped_refptr table(new TableInfo("abc")); + scoped_refptr tablet1(CreateTablet(table.get(), "123", "a", "b")); + scoped_refptr tablet2(CreateTablet(table.get(), "456", "b", "c")); + scoped_refptr tablet3(CreateTablet(table.get(), "789", "c", "d")); + + SysCatalogTable* sys_catalog = master_->catalog_manager()->sys_catalog(); + + TabletLoader loader; + ASSERT_OK(master_->catalog_manager()->sys_catalog()->VisitTablets(&loader)); + ASSERT_EQ(0, loader.tablets.size()); + + // Add tablet1 and tablet2 + { + std::vector tablets; + tablets.push_back(tablet1.get()); + tablets.push_back(tablet2.get()); + + loader.Reset(); + TabletMetadataLock l1(tablet1.get(), TabletMetadataLock::WRITE); + TabletMetadataLock l2(tablet2.get(), TabletMetadataLock::WRITE); + ASSERT_OK(sys_catalog->AddTablets(tablets)); + l1.Commit(); + l2.Commit(); + + ASSERT_OK(sys_catalog->VisitTablets(&loader)); + ASSERT_EQ(2, loader.tablets.size()); + ASSERT_TRUE(MetadatasEqual(tablet1.get(), loader.tablets[0])); + ASSERT_TRUE(MetadatasEqual(tablet2.get(), loader.tablets[1])); + } + + // Update tablet1 + { + std::vector tablets; + tablets.push_back(tablet1.get()); + + TabletMetadataLock l1(tablet1.get(), TabletMetadataLock::WRITE); + l1.mutable_data()->pb.set_state(SysTabletsEntryPB::RUNNING); + 
ASSERT_OK(sys_catalog->UpdateTablets(tablets)); + l1.Commit(); + + loader.Reset(); + ASSERT_OK(sys_catalog->VisitTablets(&loader)); + ASSERT_EQ(2, loader.tablets.size()); + ASSERT_TRUE(MetadatasEqual(tablet1.get(), loader.tablets[0])); + ASSERT_TRUE(MetadatasEqual(tablet2.get(), loader.tablets[1])); + } + + // Add tablet3 and Update tablet1 and tablet2 + { + std::vector to_add; + std::vector to_update; + + TabletMetadataLock l3(tablet3.get(), TabletMetadataLock::WRITE); + to_add.push_back(tablet3.get()); + to_update.push_back(tablet1.get()); + to_update.push_back(tablet2.get()); + + TabletMetadataLock l1(tablet1.get(), TabletMetadataLock::WRITE); + l1.mutable_data()->pb.set_state(SysTabletsEntryPB::REPLACED); + TabletMetadataLock l2(tablet2.get(), TabletMetadataLock::WRITE); + l2.mutable_data()->pb.set_state(SysTabletsEntryPB::RUNNING); + + loader.Reset(); + ASSERT_OK(sys_catalog->AddAndUpdateTablets(to_add, to_update)); + + l1.Commit(); + l2.Commit(); + l3.Commit(); + + ASSERT_OK(sys_catalog->VisitTablets(&loader)); + ASSERT_EQ(3, loader.tablets.size()); + ASSERT_TRUE(MetadatasEqual(tablet1.get(), loader.tablets[0])); + ASSERT_TRUE(MetadatasEqual(tablet2.get(), loader.tablets[1])); + ASSERT_TRUE(MetadatasEqual(tablet3.get(), loader.tablets[2])); + } + + // Delete tablet1 and tablet3 tablets + { + std::vector tablets; + tablets.push_back(tablet1.get()); + tablets.push_back(tablet3.get()); + + loader.Reset(); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->DeleteTablets(tablets)); + ASSERT_OK(master_->catalog_manager()->sys_catalog()->VisitTablets(&loader)); + ASSERT_EQ(1, loader.tablets.size()); + ASSERT_TRUE(MetadatasEqual(tablet2.get(), loader.tablets[0])); + } +} + +// Verify that data mutations are not available from metadata() until commit. 
+TEST_F(SysCatalogTest, TestTabletInfoCommit) { + scoped_refptr tablet(new TabletInfo(nullptr, "123")); + + // Mutate the tablet, the changes should not be visible + TabletMetadataLock l(tablet.get(), TabletMetadataLock::WRITE); + PartitionPB* partition = l.mutable_data()->pb.mutable_partition(); + partition->set_partition_key_start("a"); + partition->set_partition_key_end("b"); + l.mutable_data()->set_state(SysTabletsEntryPB::RUNNING, "running"); + { + // Changes shouldn't be visible, and lock should still be + // acquired even though the mutation is under way. + TabletMetadataLock read_lock(tablet.get(), TabletMetadataLock::READ); + ASSERT_NE("a", read_lock.data().pb.partition().partition_key_start()); + ASSERT_NE("b", read_lock.data().pb.partition().partition_key_end()); + ASSERT_NE("running", read_lock.data().pb.state_msg()); + ASSERT_NE(SysTabletsEntryPB::RUNNING, + read_lock.data().pb.state()); + } + + // Commit the changes + l.Commit(); + + // Verify that the data is visible + { + TabletMetadataLock read_lock(tablet.get(), TabletMetadataLock::READ); + ASSERT_EQ("a", read_lock.data().pb.partition().partition_key_start()); + ASSERT_EQ("b", read_lock.data().pb.partition().partition_key_end()); + ASSERT_EQ("running", read_lock.data().pb.state_msg()); + ASSERT_EQ(SysTabletsEntryPB::RUNNING, + read_lock.data().pb.state()); + } +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/sys_catalog.cc b/src/kudu/master/sys_catalog.cc new file mode 100644 index 000000000000..caf47dd9c54b --- /dev/null +++ b/src/kudu/master/sys_catalog.cc @@ -0,0 +1,617 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/master/sys_catalog.h" + +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/partition.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/consensus_peers.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/catalog_manager.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/logging.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/threadpool.h" + +using kudu::consensus::CONSENSUS_CONFIG_COMMITTED; +using kudu::consensus::ConsensusMetadata; +using kudu::consensus::RaftConfigPB; +using kudu::consensus::RaftPeerPB; +using kudu::log::Log; +using kudu::log::LogAnchorRegistry; +using kudu::tablet::LatchTransactionCompletionCallback; +using kudu::tablet::Tablet; +using kudu::tablet::TabletPeer; +using kudu::tserver::WriteRequestPB; +using kudu::tserver::WriteResponsePB; +using std::shared_ptr; +using strings::Substitute; + +namespace kudu { +namespace master { + +static const char* const kSysCatalogTabletId = "00000000000000000000000000000000"; + +static 
const char* const kSysCatalogTableColType = "entry_type"; +static const char* const kSysCatalogTableColId = "entry_id"; +static const char* const kSysCatalogTableColMetadata = "metadata"; + +SysCatalogTable::SysCatalogTable(Master* master, MetricRegistry* metrics, + ElectedLeaderCallback leader_cb) + : metric_registry_(metrics), + master_(master), + leader_cb_(std::move(leader_cb)), + old_role_(RaftPeerPB::FOLLOWER) { + CHECK_OK(ThreadPoolBuilder("apply").Build(&apply_pool_)); +} + +SysCatalogTable::~SysCatalogTable() { +} + +void SysCatalogTable::Shutdown() { + if (tablet_peer_) { + tablet_peer_->Shutdown(); + } + apply_pool_->Shutdown(); +} + +Status SysCatalogTable::Load(FsManager *fs_manager) { + // Load Metadata Information from disk + scoped_refptr metadata; + RETURN_NOT_OK(tablet::TabletMetadata::Load(fs_manager, kSysCatalogTabletId, &metadata)); + + // Verify that the schema is the current one + if (!metadata->schema().Equals(BuildTableSchema())) { + // TODO: In this case we probably should execute the migration step. + return(Status::Corruption("Unexpected schema", metadata->schema().ToString())); + } + + // Allow for statically and explicitly assigning the consensus configuration and roles through + // the master configuration on startup. + // + // TODO: The following assumptions need revisiting: + // 1. We always believe the local config options for who is in the consensus configuration. + // 2. We always want to look up all node's UUIDs on start (via RPC). + // - TODO: Cache UUIDs. See KUDU-526. 
+ if (master_->opts().IsDistributed()) { + LOG(INFO) << "Configuring consensus for distributed operation..."; + + string tablet_id = metadata->tablet_id(); + gscoped_ptr cmeta; + RETURN_NOT_OK_PREPEND(ConsensusMetadata::Load(fs_manager, tablet_id, + fs_manager->uuid(), &cmeta), + "Unable to load consensus metadata for tablet " + tablet_id); + + RaftConfigPB config; + RETURN_NOT_OK(SetupDistributedConfig(master_->opts(), &config)); + cmeta->set_committed_config(config); + RETURN_NOT_OK_PREPEND(cmeta->Flush(), + "Unable to persist consensus metadata for tablet " + tablet_id); + } + + RETURN_NOT_OK(SetupTablet(metadata)); + return Status::OK(); +} + +Status SysCatalogTable::CreateNew(FsManager *fs_manager) { + // Create the new Metadata + scoped_refptr metadata; + Schema schema = BuildTableSchema(); + PartitionSchema partition_schema; + RETURN_NOT_OK(PartitionSchema::FromPB(PartitionSchemaPB(), schema, &partition_schema)); + + vector split_rows; + vector partitions; + RETURN_NOT_OK(partition_schema.CreatePartitions(split_rows, schema, &partitions)); + DCHECK_EQ(1, partitions.size()); + + RETURN_NOT_OK(tablet::TabletMetadata::CreateNew(fs_manager, + kSysCatalogTabletId, + table_name(), + schema, partition_schema, + partitions[0], + tablet::TABLET_DATA_READY, + &metadata)); + + RaftConfigPB config; + if (master_->opts().IsDistributed()) { + RETURN_NOT_OK_PREPEND(SetupDistributedConfig(master_->opts(), &config), + "Failed to initialize distributed config"); + } else { + config.set_local(true); + config.set_opid_index(consensus::kInvalidOpIdIndex); + RaftPeerPB* peer = config.add_peers(); + peer->set_permanent_uuid(fs_manager->uuid()); + peer->set_member_type(RaftPeerPB::VOTER); + } + + string tablet_id = metadata->tablet_id(); + gscoped_ptr cmeta; + RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(fs_manager, tablet_id, fs_manager->uuid(), + config, consensus::kMinimumTerm, &cmeta), + "Unable to persist consensus metadata for tablet " + tablet_id); + + return 
SetupTablet(metadata); +} + +Status SysCatalogTable::SetupDistributedConfig(const MasterOptions& options, + RaftConfigPB* committed_config) { + DCHECK(options.IsDistributed()); + + RaftConfigPB new_config; + new_config.set_local(false); + new_config.set_opid_index(consensus::kInvalidOpIdIndex); + + // Build the set of followers from our server options. + for (const HostPort& host_port : options.master_addresses) { + RaftPeerPB peer; + HostPortPB peer_host_port_pb; + RETURN_NOT_OK(HostPortToPB(host_port, &peer_host_port_pb)); + peer.mutable_last_known_addr()->CopyFrom(peer_host_port_pb); + peer.set_member_type(RaftPeerPB::VOTER); + new_config.add_peers()->CopyFrom(peer); + } + + // Now resolve UUIDs. + // By the time a SysCatalogTable is created and initted, the masters should be + // starting up, so this should be fine to do. + DCHECK(master_->messenger()); + RaftConfigPB resolved_config = new_config; + resolved_config.clear_peers(); + for (const RaftPeerPB& peer : new_config.peers()) { + if (peer.has_permanent_uuid()) { + resolved_config.add_peers()->CopyFrom(peer); + } else { + LOG(INFO) << peer.ShortDebugString() + << " has no permanent_uuid. Determining permanent_uuid..."; + RaftPeerPB new_peer = peer; + // TODO: Use ConsensusMetadata to cache the results of these lookups so + // we only require RPC access to the full consensus configuration on first startup. + // See KUDU-526. 
+ RETURN_NOT_OK_PREPEND(consensus::SetPermanentUuidForRemotePeer(master_->messenger(), + &new_peer), + Substitute("Unable to resolve UUID for peer $0", + peer.ShortDebugString())); + resolved_config.add_peers()->CopyFrom(new_peer); + } + } + + RETURN_NOT_OK(consensus::VerifyRaftConfig(resolved_config, consensus::COMMITTED_QUORUM)); + VLOG(1) << "Distributed Raft configuration: " << resolved_config.ShortDebugString(); + + *committed_config = resolved_config; + return Status::OK(); +} + +void SysCatalogTable::SysCatalogStateChanged(const string& tablet_id, const string& reason) { + CHECK_EQ(tablet_id, tablet_peer_->tablet_id()); + scoped_refptr consensus = tablet_peer_->shared_consensus(); + if (!consensus) { + LOG_WITH_PREFIX(WARNING) << "Received notification of tablet state change " + << "but tablet no longer running. Tablet ID: " + << tablet_id << ". Reason: " << reason; + return; + } + consensus::ConsensusStatePB cstate = consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED); + LOG_WITH_PREFIX(INFO) << "SysCatalogTable state changed. Reason: " << reason << ". " + << "Latest consensus state: " << cstate.ShortDebugString(); + RaftPeerPB::Role new_role = GetConsensusRole(tablet_peer_->permanent_uuid(), cstate); + LOG_WITH_PREFIX(INFO) << "This master's current role is: " + << RaftPeerPB::Role_Name(new_role) + << ", previous role was: " << RaftPeerPB::Role_Name(old_role_); + if (new_role == RaftPeerPB::LEADER) { + CHECK_OK(leader_cb_.Run()); + } +} + +Status SysCatalogTable::SetupTablet(const scoped_refptr& metadata) { + shared_ptr tablet; + scoped_refptr log; + + InitLocalRaftPeerPB(); + + // TODO: handle crash mid-creation of tablet? do we ever end up with a + // partially created tablet here? 
+ tablet_peer_.reset(new TabletPeer( + metadata, + local_peer_pb_, + apply_pool_.get(), + Bind(&SysCatalogTable::SysCatalogStateChanged, Unretained(this), metadata->tablet_id()))); + + consensus::ConsensusBootstrapInfo consensus_info; + tablet_peer_->SetBootstrapping(); + RETURN_NOT_OK(BootstrapTablet(metadata, + scoped_refptr(master_->clock()), + master_->mem_tracker(), + metric_registry_, + tablet_peer_->status_listener(), + &tablet, + &log, + tablet_peer_->log_anchor_registry(), + &consensus_info)); + + // TODO: Do we have a setSplittable(false) or something from the outside is + // handling split in the TS? + + RETURN_NOT_OK_PREPEND(tablet_peer_->Init(tablet, + scoped_refptr(master_->clock()), + master_->messenger(), + log, + tablet->GetMetricEntity()), + "Failed to Init() TabletPeer"); + + RETURN_NOT_OK_PREPEND(tablet_peer_->Start(consensus_info), + "Failed to Start() TabletPeer"); + + tablet_peer_->RegisterMaintenanceOps(master_->maintenance_manager()); + + const Schema* schema = tablet->schema(); + schema_ = SchemaBuilder(*schema).BuildWithoutIds(); + key_schema_ = schema_.CreateKeyProjection(); + return Status::OK(); +} + +std::string SysCatalogTable::LogPrefix() const { + return Substitute("T $0 P $1 [$2]: ", + tablet_peer_->tablet_id(), + tablet_peer_->permanent_uuid(), + table_name()); +} + +Status SysCatalogTable::WaitUntilRunning() { + TRACE_EVENT0("master", "SysCatalogTable::WaitUntilRunning"); + int seconds_waited = 0; + while (true) { + Status status = tablet_peer_->WaitUntilConsensusRunning(MonoDelta::FromSeconds(1)); + seconds_waited++; + if (status.ok()) { + LOG_WITH_PREFIX(INFO) << "configured and running, proceeding with master startup."; + break; + } + if (status.IsTimedOut()) { + LOG_WITH_PREFIX(INFO) << "not online yet (have been trying for " + << seconds_waited << " seconds)"; + continue; + } + // if the status is not OK or TimedOut return it. 
+ return status; + } + return Status::OK(); +} + +Status SysCatalogTable::SyncWrite(const WriteRequestPB *req, WriteResponsePB *resp) { + CountDownLatch latch(1); + gscoped_ptr txn_callback( + new LatchTransactionCompletionCallback(&latch, resp)); + auto tx_state = new tablet::WriteTransactionState(tablet_peer_.get(), req, resp); + tx_state->set_completion_callback(txn_callback.Pass()); + + RETURN_NOT_OK(tablet_peer_->SubmitWrite(tx_state)); + latch.Wait(); + + if (resp->has_error()) { + return StatusFromPB(resp->error().status()); + } + if (resp->per_row_errors_size() > 0) { + for (const WriteResponsePB::PerRowErrorPB& error : resp->per_row_errors()) { + LOG(WARNING) << "row " << error.row_index() << ": " << StatusFromPB(error.error()).ToString(); + } + return Status::Corruption("One or more rows failed to write"); + } + return Status::OK(); +} + +// Schema for the unified SysCatalogTable: +// +// (entry_type, entry_id) -> metadata +// +// entry_type is a enum defined in sys_tables. It indicates +// whether an entry is a table or a tablet. +// +// entry_type is the first part of a compound key as to allow +// efficient scans of entries of only a single type (e.g., only +// scan all of the tables, or only scan all of the tablets). +// +// entry_id is either a table id or a tablet id. For tablet entries, +// the table id that the tablet is associated with is stored in the +// protobuf itself. 
+Schema SysCatalogTable::BuildTableSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddKeyColumn(kSysCatalogTableColType, INT8)); + CHECK_OK(builder.AddKeyColumn(kSysCatalogTableColId, STRING)); + CHECK_OK(builder.AddColumn(kSysCatalogTableColMetadata, STRING)); + return builder.Build(); +} + +// ================================================================== +// Table related methods +// ================================================================== + +Status SysCatalogTable::AddTable(const TableInfo *table) { + TRACE_EVENT1("master", "SysCatalogTable::AddTable", + "table", table->ToString()); + faststring metadata_buf; + if (!pb_util::SerializeToString(table->metadata().dirty().pb, &metadata_buf)) { + return Status::Corruption("Unable to serialize SysCatalogTablesEntryPB for tablet", + table->metadata().dirty().name()); + } + + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kSysCatalogTabletId); + RETURN_NOT_OK(SchemaToPB(schema_, req.mutable_schema())); + + KuduPartialRow row(&schema_); + CHECK_OK(row.SetInt8(kSysCatalogTableColType, TABLES_ENTRY)); + CHECK_OK(row.SetString(kSysCatalogTableColId, table->id())); + CHECK_OK(row.SetString(kSysCatalogTableColMetadata, metadata_buf)); + RowOperationsPBEncoder enc(req.mutable_row_operations()); + enc.Add(RowOperationsPB::INSERT, row); + + RETURN_NOT_OK(SyncWrite(&req, &resp)); + return Status::OK(); +} + +Status SysCatalogTable::UpdateTable(const TableInfo *table) { + TRACE_EVENT1("master", "SysCatalogTable::UpdateTable", + "table", table->ToString()); + + faststring metadata_buf; + if (!pb_util::SerializeToString(table->metadata().dirty().pb, &metadata_buf)) { + return Status::Corruption("Unable to serialize SysCatalogTablesEntryPB for tablet", + table->id()); + } + + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kSysCatalogTabletId); + RETURN_NOT_OK(SchemaToPB(schema_, req.mutable_schema())); + + KuduPartialRow row(&schema_); + 
CHECK_OK(row.SetInt8(kSysCatalogTableColType, TABLES_ENTRY)); + CHECK_OK(row.SetString(kSysCatalogTableColId, table->id())); + CHECK_OK(row.SetString(kSysCatalogTableColMetadata, metadata_buf)); + RowOperationsPBEncoder enc(req.mutable_row_operations()); + enc.Add(RowOperationsPB::UPDATE, row); + + RETURN_NOT_OK(SyncWrite(&req, &resp)); + return Status::OK(); +} + +Status SysCatalogTable::DeleteTable(const TableInfo *table) { + TRACE_EVENT1("master", "SysCatalogTable::DeleteTable", + "table", table->ToString()); + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kSysCatalogTableColMetadata); + RETURN_NOT_OK(SchemaToPB(schema_, req.mutable_schema())); + + KuduPartialRow row(&schema_); + CHECK_OK(row.SetInt8(kSysCatalogTableColType, TABLES_ENTRY)); + CHECK_OK(row.SetString(kSysCatalogTableColId, table->id())); + + RowOperationsPBEncoder enc(req.mutable_row_operations()); + enc.Add(RowOperationsPB::DELETE, row); + + RETURN_NOT_OK(SyncWrite(&req, &resp)); + return Status::OK(); +} + +Status SysCatalogTable::VisitTables(TableVisitor* visitor) { + TRACE_EVENT0("master", "SysCatalogTable::VisitTables"); + + const int8_t tables_entry = TABLES_ENTRY; + const int type_col_idx = schema_.find_column(kSysCatalogTableColType); + CHECK(type_col_idx != Schema::kColumnNotFound); + + ColumnRangePredicate pred_tables(schema_.column(type_col_idx), + &tables_entry, &tables_entry); + ScanSpec spec; + spec.AddPredicate(pred_tables); + + gscoped_ptr iter; + RETURN_NOT_OK(tablet_peer_->tablet()->NewRowIterator(schema_, &iter)); + RETURN_NOT_OK(iter->Init(&spec)); + + Arena arena(32 * 1024, 256 * 1024); + RowBlock block(iter->schema(), 512, &arena); + while (iter->HasNext()) { + RETURN_NOT_OK(iter->NextBlock(&block)); + for (size_t i = 0; i < block.nrows(); i++) { + if (!block.selection_vector()->IsRowSelected(i)) continue; + + RETURN_NOT_OK(VisitTableFromRow(block.row(i), visitor)); + } + } + return Status::OK(); +} + +Status SysCatalogTable::VisitTableFromRow(const 
RowBlockRow& row, + TableVisitor* visitor) { + const Slice* table_id = + schema_.ExtractColumnFromRow(row, schema_.find_column(kSysCatalogTableColId)); + const Slice* data = + schema_.ExtractColumnFromRow(row, schema_.find_column(kSysCatalogTableColMetadata)); + + SysTablesEntryPB metadata; + RETURN_NOT_OK_PREPEND(pb_util::ParseFromArray(&metadata, data->data(), data->size()), + "Unable to parse metadata field for table " + table_id->ToString()); + + RETURN_NOT_OK(visitor->VisitTable(table_id->ToString(), metadata)); + return Status::OK(); +} + +// ================================================================== +// Tablet related methods +// ================================================================== + +Status SysCatalogTable::AddTabletsToPB(const vector& tablets, + RowOperationsPB::Type op_type, + RowOperationsPB* ops) const { + faststring metadata_buf; + KuduPartialRow row(&schema_); + RowOperationsPBEncoder enc(ops); + for (const TabletInfo *tablet : tablets) { + if (!pb_util::SerializeToString(tablet->metadata().dirty().pb, &metadata_buf)) { + return Status::Corruption("Unable to serialize SysCatalogTabletsEntryPB for tablet", + tablet->tablet_id()); + } + + CHECK_OK(row.SetInt8(kSysCatalogTableColType, TABLETS_ENTRY)); + CHECK_OK(row.SetString(kSysCatalogTableColId, tablet->tablet_id())); + CHECK_OK(row.SetString(kSysCatalogTableColMetadata, metadata_buf)); + enc.Add(op_type, row); + } + return Status::OK(); +} + +Status SysCatalogTable::AddAndUpdateTablets(const vector& tablets_to_add, + const vector& tablets_to_update) { + TRACE_EVENT2("master", "AddAndUpdateTablets", + "num_add", tablets_to_add.size(), + "num_update", tablets_to_update.size()); + + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kSysCatalogTabletId); + RETURN_NOT_OK(SchemaToPB(schema_, req.mutable_schema())); + + // Insert new Tablets + if (!tablets_to_add.empty()) { + RETURN_NOT_OK(AddTabletsToPB(tablets_to_add, RowOperationsPB::INSERT, + 
req.mutable_row_operations())); + } + + // Update already existing Tablets + if (!tablets_to_update.empty()) { + RETURN_NOT_OK(AddTabletsToPB(tablets_to_update, RowOperationsPB::UPDATE, + req.mutable_row_operations())); + } + + RETURN_NOT_OK(SyncWrite(&req, &resp)); + return Status::OK(); +} + +Status SysCatalogTable::AddTablets(const vector& tablets) { + vector empty_tablets; + return AddAndUpdateTablets(tablets, empty_tablets); +} + +Status SysCatalogTable::UpdateTablets(const vector& tablets) { + vector empty_tablets; + return AddAndUpdateTablets(empty_tablets, tablets); +} + +Status SysCatalogTable::DeleteTablets(const vector& tablets) { + TRACE_EVENT1("master", "DeleteTablets", + "num_tablets", tablets.size()); + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kSysCatalogTabletId); + RETURN_NOT_OK(SchemaToPB(schema_, req.mutable_schema())); + + RowOperationsPBEncoder enc(req.mutable_row_operations()); + KuduPartialRow row(&schema_); + for (const TabletInfo* tablet : tablets) { + CHECK_OK(row.SetInt8(kSysCatalogTableColType, TABLETS_ENTRY)); + CHECK_OK(row.SetString(kSysCatalogTableColId, tablet->tablet_id())); + enc.Add(RowOperationsPB::DELETE, row); + } + + RETURN_NOT_OK(SyncWrite(&req, &resp)); + return Status::OK(); +} + +Status SysCatalogTable::VisitTabletFromRow(const RowBlockRow& row, TabletVisitor *visitor) { + const Slice *tablet_id = + schema_.ExtractColumnFromRow(row, schema_.find_column(kSysCatalogTableColId)); + const Slice *data = + schema_.ExtractColumnFromRow(row, schema_.find_column(kSysCatalogTableColMetadata)); + + SysTabletsEntryPB metadata; + RETURN_NOT_OK_PREPEND(pb_util::ParseFromArray(&metadata, data->data(), data->size()), + "Unable to parse metadata field for tablet " + tablet_id->ToString()); + + // Upgrade from the deprecated start/end-key fields to the 'partition' field. 
+ if (!metadata.has_partition()) { + metadata.mutable_partition()->set_partition_key_start( + metadata.deprecated_start_key()); + metadata.mutable_partition()->set_partition_key_end( + metadata.deprecated_end_key()); + metadata.clear_deprecated_start_key(); + metadata.clear_deprecated_end_key(); + } + + RETURN_NOT_OK(visitor->VisitTablet(metadata.table_id(), tablet_id->ToString(), metadata)); + return Status::OK(); +} + +Status SysCatalogTable::VisitTablets(TabletVisitor* visitor) { + TRACE_EVENT0("master", "SysCatalogTable::VisitTablets"); + const int8_t tablets_entry = TABLETS_ENTRY; + const int type_col_idx = schema_.find_column(kSysCatalogTableColType); + CHECK(type_col_idx != Schema::kColumnNotFound); + + ColumnRangePredicate pred_tablets(schema_.column(type_col_idx), + &tablets_entry, &tablets_entry); + ScanSpec spec; + spec.AddPredicate(pred_tablets); + + gscoped_ptr iter; + RETURN_NOT_OK(tablet_peer_->tablet()->NewRowIterator(schema_, &iter)); + RETURN_NOT_OK(iter->Init(&spec)); + + Arena arena(32 * 1024, 256 * 1024); + RowBlock block(iter->schema(), 512, &arena); + while (iter->HasNext()) { + RETURN_NOT_OK(iter->NextBlock(&block)); + for (size_t i = 0; i < block.nrows(); i++) { + if (!block.selection_vector()->IsRowSelected(i)) continue; + + RETURN_NOT_OK(VisitTabletFromRow(block.row(i), visitor)); + } + } + return Status::OK(); +} + +void SysCatalogTable::InitLocalRaftPeerPB() { + local_peer_pb_.set_permanent_uuid(master_->fs_manager()->uuid()); + Sockaddr addr = master_->first_rpc_address(); + HostPort hp; + CHECK_OK(HostPortFromSockaddrReplaceWildcard(addr, &hp)); + CHECK_OK(HostPortToPB(hp, local_peer_pb_.mutable_last_known_addr())); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/sys_catalog.h b/src/kudu/master/sys_catalog.h new file mode 100644 index 000000000000..a588082941c9 --- /dev/null +++ b/src/kudu/master/sys_catalog.h @@ -0,0 +1,204 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_MASTER_SYS_CATALOG_H_ +#define KUDU_MASTER_SYS_CATALOG_H_ + +#include +#include + +#include "kudu/master/master.pb.h" +#include "kudu/server/metadata.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Schema; +class FsManager; + +namespace tserver { +class WriteRequestPB; +class WriteResponsePB; +} + +namespace master { +class Master; +struct MasterOptions; +class TableInfo; +class TabletInfo; + +// The SysCatalogTable has two separate visitors because the tables +// data must be loaded into memory before the tablets data. +class TableVisitor { + public: + virtual Status VisitTable(const std::string& table_id, + const SysTablesEntryPB& metadata) = 0; +}; + +class TabletVisitor { + public: + virtual Status VisitTablet(const std::string& table_id, + const std::string& tablet_id, + const SysTabletsEntryPB& metadata) = 0; +}; + +// SysCatalogTable is a Kudu table that keeps track of table and +// tablet metadata. +// - SysCatalogTable has only one tablet. +// - SysCatalogTable is managed by the master and not exposed to the user +// as a "normal table", instead we have Master APIs to query the table. 
+class SysCatalogTable {
+ public:
+  typedef Callback ElectedLeaderCallback;
+
+  enum CatalogEntryType {
+    TABLES_ENTRY = 1,
+    TABLETS_ENTRY = 2
+  };
+
+  // 'leader_cb_' is invoked whenever this node is elected as a leader
+  // of the consensus configuration for this tablet, including for local standalone
+  // master consensus configurations. It is used to initialize leader state, submit any
+  // leader-specific tasks and so forth.
+  //
+  // NOTE: Since 'leader_cb_' is invoked synchronously and can block
+  // the consensus configuration's progress, any long running tasks (e.g., scanning
+  // tablets) should be performed asynchronously (by, e.g., submitting
+  // them to a separate threadpool).
+  SysCatalogTable(Master* master, MetricRegistry* metrics,
+                  ElectedLeaderCallback leader_cb);
+
+  ~SysCatalogTable();
+
+  // Allow for orderly shutdown of tablet peer, etc.
+  void Shutdown();
+
+  // Load the Metadata from disk, and initialize the TabletPeer for the sys-table
+  Status Load(FsManager *fs_manager);
+
+  // Create the new Metadata and initialize the TabletPeer for the sys-table.
+  Status CreateNew(FsManager *fs_manager);
+
+  // ==================================================================
+  // Tables related methods
+  // ==================================================================
+  Status AddTable(const TableInfo* table);
+  Status UpdateTable(const TableInfo* table);
+  Status DeleteTable(const TableInfo* table);
+
+  // Scan of the table-related entries.
+  Status VisitTables(TableVisitor* visitor);
+
+  // ==================================================================
+  // Tablets related methods
+  // ==================================================================
+  Status AddTablets(const vector& tablets);
+  Status UpdateTablets(const vector& tablets);
+  Status AddAndUpdateTablets(const vector& tablets_to_add,
+                             const vector& tablets_to_update);
+  Status DeleteTablets(const vector& tablets);
+
+  // Scan of the tablet-related entries.
+ Status VisitTablets(TabletVisitor* visitor); + + private: + DISALLOW_COPY_AND_ASSIGN(SysCatalogTable); + + friend class CatalogManager; + + const char *table_name() const { return "sys.catalog"; } + + // Return the schema of the table. + // NOTE: This is the "server-side" schema, so it must have the column IDs. + Schema BuildTableSchema(); + + // Returns 'Status::OK()' if the WriteTranasction completed + Status SyncWrite(const tserver::WriteRequestPB *req, tserver::WriteResponsePB *resp); + + void SysCatalogStateChanged(const std::string& tablet_id, const std::string& reason); + + Status SetupTablet(const scoped_refptr& metadata); + + // Use the master options to generate a new consensus configuration. + // In addition, resolve all UUIDs of this consensus configuration. + // + // Note: The current node adds itself to the peers whether leader or + // follower, depending on whether the Master options leader flag is + // set. Even if the local node should be a follower, it should not be listed + // in the Master options followers list, as it will add itself automatically. + // + // TODO: Revisit this whole thing when integrating leader election. + Status SetupDistributedConfig(const MasterOptions& options, + consensus::RaftConfigPB* committed_config); + + const scoped_refptr& tablet_peer() const { + return tablet_peer_; + } + + std::string tablet_id() const { + return tablet_peer_->tablet_id(); + } + + // Conventional "T xxx P xxxx..." prefix for logging. + std::string LogPrefix() const; + + // Waits for the tablet to reach 'RUNNING' state. + // + // Contrary to tablet servers, in master we actually wait for the master tablet + // to become online synchronously, this allows us to fail fast if something fails + // and shouldn't induce the all-workers-blocked-waiting-for-tablets problem + // that we've seen in tablet servers since the master only has to boot a few + // tablets. + Status WaitUntilRunning(); + + // Table related private methods. 
+ Status VisitTableFromRow(const RowBlockRow& row, TableVisitor* visitor); + + // Tablet related private methods. + + // Add dirty tablet data to the given row operations. + Status AddTabletsToPB(const std::vector& tablets, + RowOperationsPB::Type op_type, + RowOperationsPB* ops) const; + Status VisitTabletFromRow(const RowBlockRow& row, TabletVisitor* visitor); + + // Initializes the RaftPeerPB for the local peer. + // Crashes due to an invariant check if the rpc server is not running. + void InitLocalRaftPeerPB(); + + // Table schema, without IDs, used to send messages to the TabletPeer + Schema schema_; + Schema key_schema_; + + MetricRegistry* metric_registry_; + + gscoped_ptr apply_pool_; + + scoped_refptr tablet_peer_; + + Master* master_; + + ElectedLeaderCallback leader_cb_; + consensus::RaftPeerPB::Role old_role_; + + consensus::RaftPeerPB local_peer_pb_; +}; + +} // namespace master +} // namespace kudu + +#endif diff --git a/src/kudu/master/ts_descriptor.cc b/src/kudu/master/ts_descriptor.cc new file mode 100644 index 000000000000..d92a60027970 --- /dev/null +++ b/src/kudu/master/ts_descriptor.cc @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/master/master.pb.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/util/net/net_util.h" + +#include +#include + +#include +#include + +using std::shared_ptr; + +namespace kudu { +namespace master { + +Status TSDescriptor::RegisterNew(const NodeInstancePB& instance, + const TSRegistrationPB& registration, + gscoped_ptr* desc) { + gscoped_ptr ret(new TSDescriptor(instance.permanent_uuid())); + RETURN_NOT_OK(ret->Register(instance, registration)); + desc->swap(ret); + return Status::OK(); +} + +TSDescriptor::TSDescriptor(std::string perm_id) + : permanent_uuid_(std::move(perm_id)), + latest_seqno_(-1), + last_heartbeat_(MonoTime::Now(MonoTime::FINE)), + has_tablet_report_(false), + recent_replica_creations_(0), + last_replica_creations_decay_(MonoTime::Now(MonoTime::FINE)), + num_live_replicas_(0) { +} + +TSDescriptor::~TSDescriptor() { +} + +Status TSDescriptor::Register(const NodeInstancePB& instance, + const TSRegistrationPB& registration) { + boost::lock_guard l(lock_); + CHECK_EQ(instance.permanent_uuid(), permanent_uuid_); + + if (instance.instance_seqno() < latest_seqno_) { + return Status::AlreadyPresent( + strings::Substitute("Cannot register with sequence number $0:" + " Already have a registration from sequence number $1", + instance.instance_seqno(), + latest_seqno_)); + } else if (instance.instance_seqno() == latest_seqno_) { + // It's possible that the TS registered, but our response back to it + // got lost, so it's trying to register again with the same sequence + // number. That's fine. + LOG(INFO) << "Processing retry of TS registration from " << instance.ShortDebugString(); + } + + latest_seqno_ = instance.instance_seqno(); + // After re-registering, make the TS re-report its tablets. 
+ has_tablet_report_ = false; + + registration_.reset(new TSRegistrationPB(registration)); + ts_admin_proxy_.reset(); + consensus_proxy_.reset(); + + return Status::OK(); +} + +void TSDescriptor::UpdateHeartbeatTime() { + boost::lock_guard l(lock_); + last_heartbeat_ = MonoTime::Now(MonoTime::FINE); +} + +MonoDelta TSDescriptor::TimeSinceHeartbeat() const { + MonoTime now(MonoTime::Now(MonoTime::FINE)); + boost::lock_guard l(lock_); + return now.GetDeltaSince(last_heartbeat_); +} + +int64_t TSDescriptor::latest_seqno() const { + boost::lock_guard l(lock_); + return latest_seqno_; +} + +bool TSDescriptor::has_tablet_report() const { + boost::lock_guard l(lock_); + return has_tablet_report_; +} + +void TSDescriptor::set_has_tablet_report(bool has_report) { + boost::lock_guard l(lock_); + has_tablet_report_ = has_report; +} + +void TSDescriptor::DecayRecentReplicaCreationsUnlocked() { + // In most cases, we won't have any recent replica creations, so + // we don't need to bother calling the clock, etc. + if (recent_replica_creations_ == 0) return; + + const double kHalflifeSecs = 60; + MonoTime now = MonoTime::Now(MonoTime::FINE); + double secs_since_last_decay = now.GetDeltaSince(last_replica_creations_decay_).ToSeconds(); + recent_replica_creations_ *= pow(0.5, secs_since_last_decay / kHalflifeSecs); + + // If sufficiently small, reset down to 0 to take advantage of the fast path above. 
+ if (recent_replica_creations_ < 1e-12) { + recent_replica_creations_ = 0; + } + last_replica_creations_decay_ = now; +} + +void TSDescriptor::IncrementRecentReplicaCreations() { + lock_guard l(&lock_); + DecayRecentReplicaCreationsUnlocked(); + recent_replica_creations_ += 1; +} + +double TSDescriptor::RecentReplicaCreations() { + boost::lock_guard l(lock_); + DecayRecentReplicaCreationsUnlocked(); + return recent_replica_creations_; +} + +void TSDescriptor::GetRegistration(TSRegistrationPB* reg) const { + boost::lock_guard l(lock_); + CHECK(registration_) << "No registration"; + CHECK_NOTNULL(reg)->CopyFrom(*registration_); +} + +void TSDescriptor::GetNodeInstancePB(NodeInstancePB* instance_pb) const { + boost::lock_guard l(lock_); + instance_pb->set_permanent_uuid(permanent_uuid_); + instance_pb->set_instance_seqno(latest_seqno_); +} + +Status TSDescriptor::ResolveSockaddr(Sockaddr* addr) const { + vector hostports; + { + boost::lock_guard l(lock_); + for (const HostPortPB& addr : registration_->rpc_addresses()) { + hostports.push_back(HostPort(addr.host(), addr.port())); + } + } + + // Resolve DNS outside the lock. + HostPort last_hostport; + vector addrs; + for (const HostPort& hostport : hostports) { + RETURN_NOT_OK(hostport.ResolveAddresses(&addrs)); + if (!addrs.empty()) { + last_hostport = hostport; + break; + } + } + + if (addrs.size() == 0) { + return Status::NetworkError("Unable to find the TS address: ", registration_->DebugString()); + } + + if (addrs.size() > 1) { + LOG(WARNING) << "TS address " << last_hostport.ToString() + << " resolves to " << addrs.size() << " different addresses. 
Using " + << addrs[0].ToString(); + } + *addr = addrs[0]; + return Status::OK(); +} + +Status TSDescriptor::GetTSAdminProxy(const shared_ptr& messenger, + shared_ptr* proxy) { + { + boost::lock_guard l(lock_); + if (ts_admin_proxy_) { + *proxy = ts_admin_proxy_; + return Status::OK(); + } + } + + Sockaddr addr; + RETURN_NOT_OK(ResolveSockaddr(&addr)); + + boost::lock_guard l(lock_); + if (!ts_admin_proxy_) { + ts_admin_proxy_.reset(new tserver::TabletServerAdminServiceProxy(messenger, addr)); + } + *proxy = ts_admin_proxy_; + return Status::OK(); +} + +Status TSDescriptor::GetConsensusProxy(const shared_ptr& messenger, + shared_ptr* proxy) { + { + boost::lock_guard l(lock_); + if (consensus_proxy_) { + *proxy = consensus_proxy_; + return Status::OK(); + } + } + + Sockaddr addr; + RETURN_NOT_OK(ResolveSockaddr(&addr)); + + boost::lock_guard l(lock_); + if (!consensus_proxy_) { + consensus_proxy_.reset(new consensus::ConsensusServiceProxy(messenger, addr)); + } + *proxy = consensus_proxy_; + return Status::OK(); +} + +} // namespace master +} // namespace kudu diff --git a/src/kudu/master/ts_descriptor.h b/src/kudu/master/ts_descriptor.h new file mode 100644 index 000000000000..b0327fcc167a --- /dev/null +++ b/src/kudu/master/ts_descriptor.h @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_MASTER_TS_DESCRIPTOR_H +#define KUDU_MASTER_TS_DESCRIPTOR_H + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { + +class NodeInstancePB; +class Sockaddr; + +namespace consensus { +class ConsensusServiceProxy; +} + +namespace rpc { +class Messenger; +} + +namespace tserver { +class TabletServerAdminServiceProxy; +} + +namespace master { + +class TSRegistrationPB; + +// Master-side view of a single tablet server. +// +// Tracks the last heartbeat, status, instance identifier, etc. +// This class is thread-safe. +class TSDescriptor { + public: + static Status RegisterNew(const NodeInstancePB& instance, + const TSRegistrationPB& registration, + gscoped_ptr* desc); + + virtual ~TSDescriptor(); + + // Set the last-heartbeat time to now. + void UpdateHeartbeatTime(); + + // Return the amount of time since the last heartbeat received + // from this TS. + MonoDelta TimeSinceHeartbeat() const; + + // Register this tablet server. + Status Register(const NodeInstancePB& instance, + const TSRegistrationPB& registration); + + const std::string &permanent_uuid() const { return permanent_uuid_; } + int64_t latest_seqno() const; + + bool has_tablet_report() const; + void set_has_tablet_report(bool has_report); + + // Copy the current registration info into the given PB object. + // A safe copy is returned because the internal Registration object + // may be mutated at any point if the tablet server re-registers. + void GetRegistration(TSRegistrationPB* reg) const; + + void GetNodeInstancePB(NodeInstancePB* instance_pb) const; + + // Return an RPC proxy to the tablet server admin service. + Status GetTSAdminProxy(const std::shared_ptr& messenger, + std::shared_ptr* proxy); + + // Return an RPC proxy to the consensus service. 
+ Status GetConsensusProxy(const std::shared_ptr& messenger, + std::shared_ptr* proxy); + + // Increment the accounting of the number of replicas recently created on this + // server. This value will automatically decay over time. + void IncrementRecentReplicaCreations(); + + // Return the number of replicas which have recently been created on this + // TS. This number is incremented when replicas are placed on the TS, and + // then decayed over time. This method is not 'const' because each call + // actually performs the time-based decay. + double RecentReplicaCreations(); + + // Set the number of live replicas (i.e. running or bootstrapping). + void set_num_live_replicas(int n) { + DCHECK_GE(n, 0); + lock_guard l(&lock_); + num_live_replicas_ = n; + } + + // Return the number of live replicas (i.e running or bootstrapping). + int num_live_replicas() const { + lock_guard l(&lock_); + return num_live_replicas_; + } + + private: + FRIEND_TEST(TestTSDescriptor, TestReplicaCreationsDecay); + + explicit TSDescriptor(std::string perm_id); + + // Uses DNS to resolve registered hosts to a single Sockaddr. + Status ResolveSockaddr(Sockaddr* addr) const; + + void DecayRecentReplicaCreationsUnlocked(); + + mutable simple_spinlock lock_; + + const std::string permanent_uuid_; + int64_t latest_seqno_; + + // The last time a heartbeat was received for this node. + MonoTime last_heartbeat_; + + // Set to true once this instance has reported all of its tablets. + bool has_tablet_report_; + + // The number of times this tablet server has recently been selected to create a + // tablet replica. This value decays back to 0 over time. + double recent_replica_creations_; + MonoTime last_replica_creations_decay_; + + // The number of live replicas on this host, from the last heartbeat. 
+ int num_live_replicas_; + + gscoped_ptr registration_; + + std::shared_ptr ts_admin_proxy_; + std::shared_ptr consensus_proxy_; + + DISALLOW_COPY_AND_ASSIGN(TSDescriptor); +}; + +} // namespace master +} // namespace kudu +#endif /* KUDU_MASTER_TS_DESCRIPTOR_H */ diff --git a/src/kudu/master/ts_manager.cc b/src/kudu/master/ts_manager.cc new file mode 100644 index 000000000000..1b5e140a5559 --- /dev/null +++ b/src/kudu/master/ts_manager.cc @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/master/ts_manager.h" + +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/ts_descriptor.h" +#include "kudu/util/flag_tags.h" + +DEFINE_int32(tserver_unresponsive_timeout_ms, 60 * 1000, + "The period of time that a Master can go without receiving a heartbeat from a " + "tablet server before considering it unresponsive. 
Unresponsive servers are not " + "selected when assigning replicas during table creation or re-replication."); +TAG_FLAG(tserver_unresponsive_timeout_ms, advanced); + +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { +namespace master { + +TSManager::TSManager() { +} + +TSManager::~TSManager() { +} + +Status TSManager::LookupTS(const NodeInstancePB& instance, + shared_ptr* ts_desc) { + boost::shared_lock l(lock_); + const shared_ptr* found_ptr = + FindOrNull(servers_by_id_, instance.permanent_uuid()); + if (!found_ptr) { + return Status::NotFound("unknown tablet server ID", instance.ShortDebugString()); + } + const shared_ptr& found = *found_ptr; + + if (instance.instance_seqno() != found->latest_seqno()) { + return Status::NotFound("mismatched instance sequence number", instance.ShortDebugString()); + } + + *ts_desc = found; + return Status::OK(); +} + +bool TSManager::LookupTSByUUID(const string& uuid, + std::shared_ptr* ts_desc) { + boost::shared_lock l(lock_); + return FindCopy(servers_by_id_, uuid, ts_desc); +} + +Status TSManager::RegisterTS(const NodeInstancePB& instance, + const TSRegistrationPB& registration, + std::shared_ptr* desc) { + boost::lock_guard l(lock_); + const string& uuid = instance.permanent_uuid(); + + if (!ContainsKey(servers_by_id_, uuid)) { + gscoped_ptr new_desc; + RETURN_NOT_OK(TSDescriptor::RegisterNew(instance, registration, &new_desc)); + InsertOrDie(&servers_by_id_, uuid, shared_ptr(new_desc.release())); + LOG(INFO) << "Registered new tablet server { " << instance.ShortDebugString() + << " } with Master"; + } else { + const shared_ptr& found = FindOrDie(servers_by_id_, uuid); + RETURN_NOT_OK(found->Register(instance, registration)); + LOG(INFO) << "Re-registered known tablet server { " << instance.ShortDebugString() + << " } with Master"; + } + + return Status::OK(); +} + +void TSManager::GetAllDescriptors(vector > *descs) const { + descs->clear(); + boost::shared_lock l(lock_); + 
AppendValuesFromMap(servers_by_id_, descs); +} + +void TSManager::GetAllLiveDescriptors(vector > *descs) const { + descs->clear(); + + boost::shared_lock l(lock_); + descs->reserve(servers_by_id_.size()); + for (const TSDescriptorMap::value_type& entry : servers_by_id_) { + const shared_ptr& ts = entry.second; + if (ts->TimeSinceHeartbeat().ToMilliseconds() < FLAGS_tserver_unresponsive_timeout_ms) { + descs->push_back(ts); + } + } +} + +int TSManager::GetCount() const { + boost::shared_lock l(lock_); + return servers_by_id_.size(); +} + +} // namespace master +} // namespace kudu + diff --git a/src/kudu/master/ts_manager.h b/src/kudu/master/ts_manager.h new file mode 100644 index 000000000000..b2df8ae2739b --- /dev/null +++ b/src/kudu/master/ts_manager.h @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_MASTER_TS_MANAGER_H +#define KUDU_MASTER_TS_MANAGER_H + +#include +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { + +class NodeInstancePB; + +namespace master { + +class TSDescriptor; +class TSRegistrationPB; + +typedef std::vector > TSDescriptorVector; + +// Tracks the servers that the master has heard from, along with their +// last heartbeat, etc. +// +// Note that TSDescriptors are never deleted, even if the TS crashes +// and has not heartbeated in quite a while. This makes it simpler to +// keep references to TSDescriptors elsewhere in the master without +// fear of lifecycle problems. Dead servers are "dead, but not forgotten" +// (they live on in the heart of the master). +// +// This class is thread-safe. +class TSManager { + public: + TSManager(); + virtual ~TSManager(); + + // Lookup the tablet server descriptor for the given instance identifier. + // If the TS has never registered, or this instance doesn't match the + // current instance ID for the TS, then a NotFound status is returned. + // Otherwise, *desc is set and OK is returned. + Status LookupTS(const NodeInstancePB& instance, + std::shared_ptr* desc); + + // Lookup the tablet server descriptor for the given UUID. + // Returns false if the TS has never registered. + // Otherwise, *desc is set and returns true. + bool LookupTSByUUID(const std::string& uuid, + std::shared_ptr* desc); + + // Register or re-register a tablet server with the manager. + // + // If successful, *desc reset to the registered descriptor. + Status RegisterTS(const NodeInstancePB& instance, + const TSRegistrationPB& registration, + std::shared_ptr* desc); + + // Return all of the currently registered TS descriptors into the provided + // list. 
+ void GetAllDescriptors(std::vector >* descs) const; + + // Return all of the currently registered TS descriptors that have sent a + // heartbeat recently, indicating that they're alive and well. + void GetAllLiveDescriptors(std::vector >* descs) const; + + // Get the TS count. + int GetCount() const; + + private: + mutable rw_spinlock lock_; + + typedef std::unordered_map< + std::string, std::shared_ptr > TSDescriptorMap; + TSDescriptorMap servers_by_id_; + + DISALLOW_COPY_AND_ASSIGN(TSManager); +}; + +} // namespace master +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/CMakeLists.txt b/src/kudu/rpc/CMakeLists.txt new file mode 100644 index 000000000000..0533b3e43434 --- /dev/null +++ b/src/kudu/rpc/CMakeLists.txt @@ -0,0 +1,117 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#### Global header protobufs +PROTOBUF_GENERATE_CPP( + RPC_HEADER_PROTO_SRCS RPC_HEADER_PROTO_HDRS RPC_HEADER_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES rpc_header.proto) +ADD_EXPORTABLE_LIBRARY(rpc_header_proto + SRCS ${RPC_HEADER_PROTO_SRCS} + DEPS protobuf + NONLINK_DEPS ${RPC_HEADER_PROTO_TGTS}) + +PROTOBUF_GENERATE_CPP( + RPC_INTROSPECTION_PROTO_SRCS RPC_INTROSPECTION_PROTO_HDRS RPC_INTROSPECTION_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES rpc_introspection.proto) +set(RPC_INTROSPECTION_PROTO_LIBS + rpc_header_proto + protobuf) +ADD_EXPORTABLE_LIBRARY(rpc_introspection_proto + SRCS ${RPC_INTROSPECTION_PROTO_SRCS} + DEPS ${RPC_INTROSPECTION_PROTO_LIBS} + NONLINK_DEPS ${RPC_INTROSPECTION_PROTO_TGTS}) + +### RPC library +set(KRPC_SRCS + acceptor_pool.cc + auth_store.cc + blocking_ops.cc + outbound_call.cc + connection.cc + constants.cc + inbound_call.cc + messenger.cc + negotiation.cc + proxy.cc + reactor.cc + remote_method.cc + rpc.cc + rpc_context.cc + rpc_controller.cc + sasl_common.cc + sasl_client.cc + sasl_helper.cc + sasl_server.cc + serialization.cc + service_if.cc + service_pool.cc + transfer.cc +) + +set(KRPC_LIBS + rpc_header_proto + rpc_introspection_proto + kudu_util + gutil + libev + cyrus_sasl) + +ADD_EXPORTABLE_LIBRARY(krpc + SRCS ${KRPC_SRCS} + DEPS ${KRPC_LIBS}) + +### RPC generator tool +add_executable(protoc-gen-krpc protoc-gen-krpc.cc) +target_link_libraries(protoc-gen-krpc + ${KUDU_BASE_LIBS} + protoc + protobuf + gutil + kudu_util) + +#### RPC test +PROTOBUF_GENERATE_CPP( + RPC_TEST_DIFF_PACKAGE_SRCS RPC_TEST_DIFF_PACKAGE_HDRS RPC_TEST_DIFF_PACKAGE_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES rtest_diff_package.proto) +add_library(rtest_diff_package_proto ${RPC_TEST_DIFF_PACKAGE_SRCS} ${RPC_TEST_DIFF_PACKAGE_HDRS}) +target_link_libraries(rtest_diff_package_proto rpc_header_proto) + +KRPC_GENERATE( + RTEST_KRPC_SRCS RTEST_KRPC_HDRS RTEST_KRPC_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. 
+ BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES rtest.proto) +add_library(rtest_krpc ${RTEST_KRPC_SRCS} ${RTEST_KRPC_HDRS}) +target_link_libraries(rtest_krpc + krpc + rpc_header_proto + rtest_diff_package_proto) + +# Tests +set(KUDU_TEST_LINK_LIBS rtest_krpc krpc ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(mt-rpc-test RUN_SERIAL true) +ADD_KUDU_TEST(reactor-test) +ADD_KUDU_TEST(rpc-bench RUN_SERIAL true) +ADD_KUDU_TEST(rpc-test) +ADD_KUDU_TEST(rpc_stub-test) +ADD_KUDU_TEST(sasl_rpc-test) diff --git a/src/kudu/rpc/README b/src/kudu/rpc/README new file mode 100644 index 000000000000..c4031dc9450a --- /dev/null +++ b/src/kudu/rpc/README @@ -0,0 +1,361 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=============================================================================== +RPC +=============================================================================== + +------------------------------------------------------------------------------- +Intro +------------------------------------------------------------------------------- +The RPC layer makes communication with remote processes look like local +function calls. You can make either asynchronous calls, in which you provide a +callback which is invoked later, or synchronous calls, where your thread blocks +until the remote system responds. + +The wire format of Kudu RPC is very close to the wire format of Hadoop IPC in +hadoop-3 and beyond. 
It is not identical since there are still some java-isms +left in Hadoop IPC which we did not want to inherit. In addition, Kudu RPC has +developed some extra features such as deadline propagation which are not +available in Hadoop. However, the overall structure of the wire protocol is +very similar. + +We use protocol buffers for serialization, and libev for non-blocking I/O. + +For some code examples, look in rpc-test.cc and rpc_stub-test. + +------------------------------------------------------------------------------- +Overview +------------------------------------------------------------------------------- + + +------------------------------------+ + | AcceptorPool | + | a pool of threads which | + +-------------------------+ | call accept() | + | Proxy | +------------------------------------+ + | | | new socket + | The proxy is the object | V + | which has the remote | +------------------------------------+ + | method definitions. | --------> | Messenger | + | | | | + +-------------------------+ | +-----------+ +-----------+ | + | | reactor 1 | | reactor 2 | ... | + +-------------------------+ | +-----------+ +-----------+ | + | ResponseCallback | <-------- | |<-. + | | +------------------------------------+ | + | The callback which gets | | | + | invoked when the remote | V | + | end replies or the call | +------------------------------------+ | + | otherwise terminates. | | ServicePool | | + +-------------------------+ | a pool of threads which | | Call responses + | pull new inbound calls from a | | sent back via + | work queue. | | messenger. + +------------------------------------+ | + | | + v | + +------------------------------------+ | + | ServiceIf | | + | user-implemented class which | / + | handles new inbound RPCs | + +------------------------------------+ + +Each reactor has a thread which uses epoll to handle many sockets using +non-blocking I/O. 
Blocking calls are implemented by the Proxy using +non-blocking calls-- from the point of view of the Messenger, all calls are +non-blocking. + +The acceptor pool and the service pool are optional components. If you don't +expect anyone to be connecting to you, you do not have to start them. If a server +expects to listen on multiple ports (eg for different protocols), multiple +AcceptorPools may be attached. + +------------------------------------------------------------------------------- +Proxy classes +------------------------------------------------------------------------------- + +Proxy classes are used by the client to send calls to a remote service. +Calls may be made synchronously or asynchronously -- the synchronous calls are simply +a wrapper around the asynchronous version, which makes the call and then waits +on the callback to be triggered. + +In order to make a call, the user must provide a method name, a request protobuf, +a response protobuf, an RpcController, and a callback. + +Each RpcController object corresponds to exactly one in-flight call on the client. +This class is where per-call settings may be adjusted before making an RPC -- +currently this is just timeout functionality, but in the future may include +other call properties such as tracing information, priority classes, deadline +propagation, etc. + +Upon issuing the asynchronous request, the RPC layer enqueues the call to be sent +to the server and immediately returns. During this period, the caller thread +may continue to send other RPCs or perform other processing while waiting for +the callback to be triggered. In the future, we will provide an RPC cancellation +function on the RpcController object in case the user determines that the call +is no longer required. + +When the call completes, the RPC layer will invoke the provided ResponseCallback +function from within the context of the reactor thread. 
Given this, +ResponseCallbacks should be careful to never block, as it would prevent other +threads from concurrent sending or receiving RPCs. + +The callback is invoked exactly one time, regardless of the call's termination state. +The user can determine the call's state by invoking methods on the RpcController object, +for example to determine whether the call succeeded, timed out, or suffered a +transport error. In the case that the call succeeds, the user-provided response protobuf +will have been initialized to contain the result. + +Please see the accompanying documentation in the Proxy and RpcController classes +for more information on the specific API, as well as the test cases in rpc-test.cc +for example usage. + +------------------------------------------------------------------------------- +Generated Code +------------------------------------------------------------------------------- + +In general, clients will use auto-generated subclasses of Proxy and ServiceIf to +get additional type safety and nicer APIs. + +The generated proxy object has the same API as the generic Proxy, except that +methods are generated for each RPC defined within the protobuf service. Each +RPC has a synchronous and async version, corresponding to Proxy::AsyncRequest and +Proxy::SyncRequest. These generated methods have an identical API to the generic +one except that they are type-safe and do not require the method name to be passed. + +The generated ServiceIf class contains pure virtual methods for each of the RPCs +in the service. Each method to be implemented has an API like: + +  void MethodName(const RequestPB *req, +                  ResponsePB *resp, ::kudu::rpc::RpcContext *context); + +The request PB is the user-provided request, and the response PB is a cleared +protobuf ready to store the RPC response. Once the RPC response has been filled in, +the service should call context->RespondSuccess(). 
This method may be called +from any thread in the application at any point either before or after the +actual handler method returns. + +In the case of an unexpected error, the generated code may alternatively call +context->RespondFailure(...). However, for any error responses which should be +parseable by the client code, it is preferable to define an error response inside +the response protobuf itself -- this is a much more flexible way of returning +actionable information with an error, given that Status just holds a string +and not much else. + +See rpc/rpc-test-base.h for an example service implementation, as well as the +documentation comments in rpc/service_if.h. + +------------------------------------------------------------------------------- +ServiceIf classes +------------------------------------------------------------------------------- +ServiceIf classes are abstract interfaces that the server implements to handle +incoming RPCs. In general, each generated service has several virtual methods +which you can override in order to implement the relevant function call. + +There is a ServicePool which you can use to coordinate several worker threads +handling callbacks. + +------------------------------------------------------------------------------- +RPC Sidecars +------------------------------------------------------------------------------- +RPC sidecars are used to avoid excess copies for large volumes of data. +Prior to RPC sidecars, the sequence of steps for creating an RPC response +on the server side would be as follows: + +1. Write the prepared response to a Google protobuf message. +2. Pass the message off to the InboundCall class, which serializes the + protobuf into a process-local buffer. +3. Copy the process-local buffer to the kernel buffer (send() to a socket). + +The client follows these steps in reverse order. 
On top of the extra copy, +this procedure also forces us to use std::string, which is difficult for +compilers to inline code for and requires that reserved bytes are nulled out, +which is an unnecessary call to memset. + +Instead, sidecars provide a mechanism to indicate the need to pass a large +store of data to the InboundCall class, which manages the response to a single +RPC on the server side. When send()-ing the rest of the message (i.e., the +protobuf), the sidecar's data is directly written to the socket. + +The data is appended directly after the main message protobuf. Here's what +a typical message looks like without sidecars: + ++------------------------------------------------+ +| Total message length (4 bytes)                 | ++------------------------------------------------+ +| RPC Header protobuf length (variable encoding) | ++------------------------------------------------+ +| RPC Header protobuf                            | ++------------------------------------------------+ +| Main message length (variable encoding)        | ++------------------------------------------------+ +| Main message protobuf                          | ++------------------------------------------------+ + +In this case, the main message length is equal to the protobuf's byte size. +Since there are no sidecars, the header protobuf's sidecar_offsets list +will be empty. 
+ +Here's what it looks like with the sidecars: + ++------------------------------------------------+ +| Total message length (4 bytes) | ++------------------------------------------------+ +| RPC Header protobuf length (variable encoding) | ++------------------------------------------------+ +| RPC Header protobuf | ++------------------------------------------------+ +| Main message length (variable encoding) | ++------------------------------------------------+ --- 0 +| Main message protobuf | ++------------------------------------------------+ --- sidecar_offsets(0) +| Sidecar 0 | ++------------------------------------------------+ --- sidecar_offsets(1) +| Sidecar 1 | ++------------------------------------------------+ --- sidecar_offsets(2) +| Sidecar 2 | ++------------------------------------------------+ --- ... +| ... | ++------------------------------------------------+ + +When there are sidecars, the sidecar_offsets member in the header will be a +nonempty list, whose values indicate the offset, measured from the beginning +of the main message protobuf, of the start of each sidecar. The number +of offsets will indicate the number of sidecars. + +Then, on the client side, the sidecars locations are decoded and made available +by RpcController::GetSidecars() (which returns the pointer to the array of all +the sidecars). The caller must be sure to check that the sidecar index in the +sidecar array is correct and in-bounds. + +More information is available in rpc/rpc_sidecar.h. + +------------------------------------------------------------------------------- +Wire Protocol +------------------------------------------------------------------------------- + +Connection establishment and connection header +---------------------------------------------- + +After the client connects to a server, the client first sends a connection header. 
+The connection header consists of a magic number "hrpc" and three byte flags, +for a total of 7 bytes: + ++----------------------------------+ +| "hrpc" 4 bytes | ++----------------------------------+ +| Version (1 byte) | ++----------------------------------+ +| ServiceClass (1 byte) | ++----------------------------------+ +| AuthProtocol (1 byte) | ++----------------------------------+ + +Currently, the RPC version is 9. The ServiceClass and AuthProtocol fields are unused. + + +Message framing and request/response headers +-------------------------------------------- +Aside from the initial connection header described above, all other messages are +serialized as follows: + + total_size: (32-bit big-endian integer) + the size of the rest of the message, not including this 4-byte header + + header: varint-prefixed header protobuf + - client->server messages use the RequestHeader protobuf + - server->client messages use the ResponseHeader protobuf + + body: varint-prefixed protobuf + - for typical RPC calls, this is the user-specified request or response + protobuf + - for RPC calls which caused an error, the response is a ErrorResponsePB + - during SASL negotiation, this is a SaslMessagePB + + +Example packet capture +-------------------------- +An example call (captured with strace on rpc-test.cc) follows: + + "\x00\x00\x00\x17" (total_size: 23 bytes to follow) + "\x09" RequestHeader varint: 9 bytes + "\x08\x0a\x1a\x03\x41\x64\x64\x20\x01" (RequestHeader protobuf) + Decoded with protoc --decode=RequestHeader rpc_header.proto: + callId: 10 + methodName: "Add" + requestParam: true + + "\x0c" Request parameter varint: 12 bytes + "\x08\xd4\x90\x80\x91\x01\x10\xf8\xcf\xc4\xed\x04" Request parameter + Decoded with protoc --decode=kudu.rpc_test.AddRequestPB rpc/rtest.proto + x: 304089172 + y: 1303455736 + + + +SASL negotiation +------------------ +After the initial connection header is sent, SASL negotiation begins. 
+Kudu always uses SASL regardless of security settings. In the case that +no strong authentication is required, SASL PLAIN is used with no password. + +This SASL negotiation protocol matches the Hadoop protocol. +The negotiation proceeds as described in this diagram: + + CLIENT | | SERVER + | | +(1) SaslMessagePB } | | +state=NEGOTIATE } --------------------------> | + | | + | | { (2) SaslMessagePB + | | { state=NEGOTIATE + | <------- { auths= + | | +(3) SaslMessagePB } | | +state=INITIATE } | | +auths[0]= } | | +token= } ---------> | + | | + | | { (4) SaslMessagePB + | | { state=CHALLENGE (or SUCCESS) + | <------- { token= + | | +(5) SaslMessagePB } | | +state=RESPONSE } | | +token= } -----------------> | + | | + | | { GOTO (4) above + | | + + +Each of the SaslMessagePBs above is framed as usual using RequestHeader or ResponseHeader +protobufs. For each SASL message, the CallId should be set to '-33'. + + + +Connection Context: +------------------ +Once the SASL negotiation is complete, before the first request, the client +sends the server a special call with call_id -3. The body of this call is a +ConnectionContextPB. The server should not respond to this call. + + +Steady state +------------ +During steady state operation, the client sends call protobufs prefixed by +RequestHeader protobufs. The server sends responses prefixed by ResponseHeader +protobufs. + +The client must send calls in strictly increasing 'call_id' order. The server +may reject repeated calls or calls with lower IDs. The server's responses may +arrive out-of-order, and use the 'call_id' in the response to associate a response +with the correct call. \ No newline at end of file diff --git a/src/kudu/rpc/acceptor_pool.cc b/src/kudu/rpc/acceptor_pool.cc new file mode 100644 index 000000000000..f6a98bef8d82 --- /dev/null +++ b/src/kudu/rpc/acceptor_pool.cc @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/acceptor_pool.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/messenger.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/status.h" +#include "kudu/util/thread.h" + +using google::protobuf::Message; +using std::string; + +METRIC_DEFINE_counter(server, rpc_connections_accepted, + "RPC Connections Accepted", + kudu::MetricUnit::kConnections, + "Number of incoming TCP connections made to the RPC server"); + +DEFINE_int32(rpc_acceptor_listen_backlog, 128, + "Socket backlog parameter used when listening for RPC connections. " + "This defines the maximum length to which the queue of pending " + "TCP connections inbound to the RPC server may grow. If a connection " + "request arrives when the queue is full, the client may receive " + "an error. 
Higher values may help the server ride over bursts of " + "new inbound connection requests."); +TAG_FLAG(rpc_acceptor_listen_backlog, advanced); + +namespace kudu { +namespace rpc { + +AcceptorPool::AcceptorPool(Messenger* messenger, Socket* socket, + Sockaddr bind_address) + : messenger_(messenger), + socket_(socket->Release()), + bind_address_(std::move(bind_address)), + rpc_connections_accepted_(METRIC_rpc_connections_accepted.Instantiate( + messenger->metric_entity())), + closing_(false) {} + +AcceptorPool::~AcceptorPool() { + Shutdown(); +} + +Status AcceptorPool::Start(int num_threads) { + RETURN_NOT_OK(socket_.Listen(FLAGS_rpc_acceptor_listen_backlog)); + + for (int i = 0; i < num_threads; i++) { + scoped_refptr new_thread; + Status s = kudu::Thread::Create("acceptor pool", "acceptor", + &AcceptorPool::RunThread, this, &new_thread); + if (!s.ok()) { + Shutdown(); + return s; + } + threads_.push_back(new_thread); + } + return Status::OK(); +} + +void AcceptorPool::Shutdown() { + if (Acquire_CompareAndSwap(&closing_, false, true) != false) { + VLOG(2) << "Acceptor Pool on " << bind_address_.ToString() + << " already shut down"; + return; + } + +#if defined(__linux__) + // Closing the socket will break us out of accept() if we're in it, and + // prevent future accepts. + WARN_NOT_OK(socket_.Shutdown(true, true), + strings::Substitute("Could not shut down acceptor socket on $0", + bind_address_.ToString())); +#else + // Calling shutdown on an accepting (non-connected) socket is illegal on most + // platforms (but not Linux). Instead, the accepting threads are interrupted + // forcefully. 
+ for (const scoped_refptr& thread : threads_) { + pthread_cancel(thread.get()->pthread_id()); + } +#endif + + for (const scoped_refptr& thread : threads_) { + CHECK_OK(ThreadJoiner(thread.get()).Join()); + } + threads_.clear(); +} + +Sockaddr AcceptorPool::bind_address() const { + return bind_address_; +} + +Status AcceptorPool::GetBoundAddress(Sockaddr* addr) const { + return socket_.GetSocketAddress(addr); +} + +void AcceptorPool::RunThread() { + while (true) { + Socket new_sock; + Sockaddr remote; + VLOG(2) << "calling accept() on socket " << socket_.GetFd() + << " listening on " << bind_address_.ToString(); + Status s = socket_.Accept(&new_sock, &remote, Socket::FLAG_NONBLOCKING); + if (!s.ok()) { + if (Release_Load(&closing_)) { + break; + } + LOG(WARNING) << "AcceptorPool: accept failed: " << s.ToString(); + continue; + } + s = new_sock.SetNoDelay(true); + if (!s.ok()) { + LOG(WARNING) << "Acceptor with remote = " << remote.ToString() + << " failed to set TCP_NODELAY on a newly accepted socket: " + << s.ToString(); + continue; + } + rpc_connections_accepted_->Increment(); + messenger_->RegisterInboundSocket(&new_sock, remote); + } + VLOG(1) << "AcceptorPool shutting down."; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/acceptor_pool.h b/src/kudu/rpc/acceptor_pool.h new file mode 100644 index 000000000000..92b7fc5118ed --- /dev/null +++ b/src/kudu/rpc/acceptor_pool.h @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_ACCEPTOR_POOL_H +#define KUDU_RPC_ACCEPTOR_POOL_H + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/util/thread.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Counter; +class Socket; + +namespace rpc { + +class Messenger; + +// A pool of threads calling accept() to create new connections. +// Acceptor pool threads terminate when they notice that the messenger has been +// shut down, if Shutdown() is called, or if the pool object is destructed. +class AcceptorPool { + public: + // Create a new acceptor pool. Calls socket::Release to take ownership of the + // socket. + // 'socket' must be already bound, but should not yet be listening. + AcceptorPool(Messenger *messenger, Socket *socket, Sockaddr bind_address); + ~AcceptorPool(); + + // Start listening and accepting connections. + Status Start(int num_threads); + void Shutdown(); + + // Return the address that the pool is bound to. If the port is specified as + // 0, then this will always return port 0. + Sockaddr bind_address() const; + + // Return the address that the pool is bound to. This only works while the + // socket is open, and if the specified port is 0 then this will return the + // actual port that was bound. 
+ Status GetBoundAddress(Sockaddr* addr) const; + + private: + void RunThread(); + + Messenger *messenger_; + Socket socket_; + Sockaddr bind_address_; + std::vector > threads_; + + scoped_refptr rpc_connections_accepted_; + + Atomic32 closing_; + + DISALLOW_COPY_AND_ASSIGN(AcceptorPool); +}; + +} // namespace rpc +} // namespace kudu +#endif diff --git a/src/kudu/rpc/auth_store.cc b/src/kudu/rpc/auth_store.cc new file mode 100644 index 000000000000..ec28b54de2ce --- /dev/null +++ b/src/kudu/rpc/auth_store.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/auth_store.h" + +#include +#include + +#include "kudu/util/status.h" + +namespace kudu { +namespace rpc { + +AuthStore::AuthStore() { +} + +AuthStore::~AuthStore() { +} + +Status AuthStore::Add(const string& user, const string& pass) { + user_cred_map_[user] = pass; + return Status::OK(); +} + +Status AuthStore::Authenticate(const string& user, const string& pass) const { + auto it = user_cred_map_.find(user); + if (it == user_cred_map_.end()) { + return Status::NotFound("Unknown user", user); + } + if (it->second != pass) { + return Status::NotAuthorized("Invalid credentials for user", user); + } + return Status::OK(); +} + +DummyAuthStore::DummyAuthStore() { +} + +DummyAuthStore::~DummyAuthStore() { +} + +Status DummyAuthStore::Authenticate(const string& user, const string& password) const { + return Status::OK(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/auth_store.h b/src/kudu/rpc/auth_store.h new file mode 100644 index 000000000000..b6e937fe6af6 --- /dev/null +++ b/src/kudu/rpc/auth_store.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_RPC_AUTH_STORE_H +#define KUDU_RPC_AUTH_STORE_H + +#include +#include + +#include "kudu/gutil/macros.h" + +namespace kudu { + +class Status; + +namespace rpc { + +using std::string; +using std::unordered_map; + +// This class stores username / password pairs in memory for use in PLAIN SASL auth. +// Add() is NOT thread safe. +// Authenticate() is safe to call from multiple threads. +class AuthStore { + public: + AuthStore(); + virtual ~AuthStore(); + + // Add user to the auth store. + virtual Status Add(const string& user, const string& password); + + // Validate whether user/password combination exists in auth store. + // Returns OK if the user has valid credentials. + // Returns NotFound if the user is not found. + // Returns NotAuthorized if the password is incorrect. + virtual Status Authenticate(const string& user, const string& password) const; + + private: + unordered_map user_cred_map_; + + DISALLOW_COPY_AND_ASSIGN(AuthStore); +}; + +// This class simply allows anybody through. +class DummyAuthStore : public AuthStore { + public: + DummyAuthStore(); + virtual ~DummyAuthStore(); + + // Always returns OK + virtual Status Authenticate(const string& user, const string& password) const OVERRIDE; +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_AUTH_STORE_H diff --git a/src/kudu/rpc/blocking_ops.cc b/src/kudu/rpc/blocking_ops.cc new file mode 100644 index 000000000000..923d85554e5a --- /dev/null +++ b/src/kudu/rpc/blocking_ops.cc @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/blocking_ops.h" + +#include + +#include +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/serialization.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/faststring.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace rpc { + +using google::protobuf::MessageLite; + +Status EnsureBlockingMode(const Socket* const sock) { + bool is_nonblocking; + RETURN_NOT_OK(sock->IsNonBlocking(&is_nonblocking)); + if (is_nonblocking) { + return Status::IllegalState("Underlying socket is not set to blocking mode!"); + } + return Status::OK(); +} + +Status SendFramedMessageBlocking(Socket* sock, const MessageLite& header, const MessageLite& msg, + const MonoTime& deadline) { + DCHECK(sock != nullptr); + DCHECK(header.IsInitialized()) << "header protobuf must be initialized"; + DCHECK(msg.IsInitialized()) << "msg protobuf must be initialized"; + + RETURN_NOT_OK(EnsureBlockingMode(sock)); + + // Ensure we are in blocking mode. + // These blocking calls are typically not in the fast path, so doing this for all build types. 
+ bool is_non_blocking = false; + RETURN_NOT_OK(sock->IsNonBlocking(&is_non_blocking)); + DCHECK(!is_non_blocking) << "Socket must be in blocking mode to use SendFramedMessage"; + + // Serialize message + faststring param_buf; + RETURN_NOT_OK(serialization::SerializeMessage(msg, ¶m_buf)); + + // Serialize header and initial length + faststring header_buf; + RETURN_NOT_OK(serialization::SerializeHeader(header, param_buf.size(), &header_buf)); + + // Write header & param to stream + size_t nsent; + RETURN_NOT_OK(sock->BlockingWrite(header_buf.data(), header_buf.size(), &nsent, deadline)); + RETURN_NOT_OK(sock->BlockingWrite(param_buf.data(), param_buf.size(), &nsent, deadline)); + + return Status::OK(); +} + +Status ReceiveFramedMessageBlocking(Socket* sock, faststring* recv_buf, + MessageLite* header, Slice* param_buf, const MonoTime& deadline) { + DCHECK(sock != nullptr); + DCHECK(recv_buf != nullptr); + DCHECK(header != nullptr); + DCHECK(param_buf != nullptr); + + RETURN_NOT_OK(EnsureBlockingMode(sock)); + + // Read the message prefix, which specifies the length of the payload. + recv_buf->clear(); + recv_buf->resize(kMsgLengthPrefixLength); + size_t recvd = 0; + RETURN_NOT_OK(sock->BlockingRecv(recv_buf->data(), kMsgLengthPrefixLength, &recvd, deadline)); + uint32_t payload_len = NetworkByteOrder::Load32(recv_buf->data()); + + // Verify that the payload size isn't out of bounds. + // This can happen because of network corruption, or a naughty client. + if (PREDICT_FALSE(payload_len > FLAGS_rpc_max_message_size)) { + return Status::IOError( + strings::Substitute( + "Received invalid message of size $0 which exceeds" + " the rpc_max_message_size of $1 bytes", + payload_len, FLAGS_rpc_max_message_size)); + } + + // Read the message payload. 
+ recvd = 0; + recv_buf->resize(payload_len + kMsgLengthPrefixLength); + RETURN_NOT_OK(sock->BlockingRecv(recv_buf->data() + kMsgLengthPrefixLength, + payload_len, &recvd, deadline)); + RETURN_NOT_OK(serialization::ParseMessage(Slice(*recv_buf), header, param_buf)); + return Status::OK(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/blocking_ops.h b/src/kudu/rpc/blocking_ops.h new file mode 100644 index 000000000000..9ff7bd34c59e --- /dev/null +++ b/src/kudu/rpc/blocking_ops.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_BLOCKING_OPS_H +#define KUDU_RPC_BLOCKING_OPS_H + +#include +#include + +namespace google { +namespace protobuf { +class MessageLite; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class faststring; +class MonoTime; +class Slice; +class Sockaddr; +class Socket; +class Status; + +namespace rpc { + +class SaslMessagePB; + +// Returns OK if socket is in blocking mode. Otherwise, returns an error. +Status EnsureBlockingMode(const Socket* const sock); + +// Encode and send a message over a socket. +// header: Request or Response header protobuf. +// msg: Protobuf message to send. 
This message must be fully initialized. +// deadline: Latest time allowed for receive to complete before timeout. +Status SendFramedMessageBlocking(Socket* sock, const google::protobuf::MessageLite& header, + const google::protobuf::MessageLite& msg, const MonoTime& deadline); + +// Receive a full message frame from the server. +// recv_buf: buffer to use for reading the data from the socket. +// header: Request or Response header protobuf. +// param_buf: Slice into recv_buf containing unparsed RPC param protobuf data. +// deadline: Latest time allowed for receive to complete before timeout. +Status ReceiveFramedMessageBlocking(Socket* sock, faststring* recv_buf, + google::protobuf::MessageLite* header, Slice* param_buf, const MonoTime& deadline); + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_BLOCKING_OPS_H diff --git a/src/kudu/rpc/connection.cc b/src/kudu/rpc/connection.cc new file mode 100644 index 000000000000..5dad5c374b14 --- /dev/null +++ b/src/kudu/rpc/connection.cc @@ -0,0 +1,619 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/connection.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/auth_store.h" +#include "kudu/rpc/rpc_introspection.pb.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/reactor.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/sasl_client.h" +#include "kudu/rpc/sasl_server.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" +#include "kudu/util/trace.h" + +using std::shared_ptr; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace rpc { + +/// +/// Connection +/// +Connection::Connection(ReactorThread *reactor_thread, Sockaddr remote, + int socket, Direction direction) + : reactor_thread_(reactor_thread), + socket_(socket), + remote_(std::move(remote)), + direction_(direction), + last_activity_time_(MonoTime::Now(MonoTime::FINE)), + is_epoll_registered_(false), + next_call_id_(1), + sasl_client_(kSaslAppName, socket), + sasl_server_(kSaslAppName, socket), + negotiation_complete_(false) {} + +Status Connection::SetNonBlocking(bool enabled) { + return socket_.SetNonBlocking(enabled); +} + +void Connection::EpollRegister(ev::loop_ref& loop) { + DCHECK(reactor_thread_->IsCurrentThread()); + DVLOG(4) << "Registering connection for epoll: " << ToString(); + write_io_.set(loop); + write_io_.set(socket_.GetFd(), ev::WRITE); + write_io_.set(this); + if (direction_ == CLIENT && negotiation_complete_) { + write_io_.start(); + } + read_io_.set(loop); + read_io_.set(socket_.GetFd(), ev::READ); + read_io_.set(this); + read_io_.start(); + is_epoll_registered_ = true; +} + +Connection::~Connection() { + // Must clear the outbound_transfers_ list before deleting. 
+ CHECK(outbound_transfers_.begin() == outbound_transfers_.end()); + + // It's crucial that the connection is Shutdown first -- otherwise + // our destructor will end up calling read_io_.stop() and write_io_.stop() + // from a possibly non-reactor thread context. This can then make all + // hell break loose with libev. + CHECK(!is_epoll_registered_); +} + +bool Connection::Idle() const { + DCHECK(reactor_thread_->IsCurrentThread()); + // check if we're in the middle of receiving something + InboundTransfer *transfer = inbound_.get(); + if (transfer && (transfer->TransferStarted())) { + return false; + } + // check if we still need to send something + if (!outbound_transfers_.empty()) { + return false; + } + // can't kill a connection if calls are waiting response + if (!awaiting_response_.empty()) { + return false; + } + + if (!calls_being_handled_.empty()) { + return false; + } + + // We are not idle if we are in the middle of connection negotiation. + if (!negotiation_complete_) { + return false; + } + + return true; +} + +void Connection::Shutdown(const Status &status) { + DCHECK(reactor_thread_->IsCurrentThread()); + shutdown_status_ = status; + + if (inbound_ && inbound_->TransferStarted()) { + double secs_since_active = reactor_thread_->cur_time() + .GetDeltaSince(last_activity_time_).ToSeconds(); + LOG(WARNING) << "Shutting down connection " << ToString() << " with pending inbound data (" + << inbound_->StatusAsString() << ", last active " + << HumanReadableElapsedTime::ToShortString(secs_since_active) + << " ago, status=" << status.ToString() << ")"; + } + + // Clear any calls which have been sent and were awaiting a response. + for (const car_map_t::value_type &v : awaiting_response_) { + CallAwaitingResponse *c = v.second; + if (c->call) { + c->call->SetFailed(status); + } + // And we must return the CallAwaitingResponse to the pool + car_pool_.Destroy(c); + } + awaiting_response_.clear(); + + // Clear any outbound transfers. 
+ while (!outbound_transfers_.empty()) { + OutboundTransfer *t = &outbound_transfers_.front(); + outbound_transfers_.pop_front(); + delete t; + } + + read_io_.stop(); + write_io_.stop(); + is_epoll_registered_ = false; + WARN_NOT_OK(socket_.Close(), "Error closing socket"); +} + +void Connection::QueueOutbound(gscoped_ptr transfer) { + DCHECK(reactor_thread_->IsCurrentThread()); + + if (!shutdown_status_.ok()) { + // If we've already shut down, then we just need to abort the + // transfer rather than bothering to queue it. + transfer->Abort(shutdown_status_); + return; + } + + DVLOG(3) << "Queueing transfer: " << transfer->HexDump(); + + outbound_transfers_.push_back(*transfer.release()); + + if (negotiation_complete_ && !write_io_.is_active()) { + // If we weren't currently in the middle of sending anything, + // then our write_io_ interest is stopped. Need to re-start it. + // Only do this after connection negotiation is done doing its work. + write_io_.start(); + } +} + +Connection::CallAwaitingResponse::~CallAwaitingResponse() { + DCHECK(conn->reactor_thread_->IsCurrentThread()); +} + +void Connection::CallAwaitingResponse::HandleTimeout(ev::timer &watcher, int revents) { + conn->HandleOutboundCallTimeout(this); +} + +void Connection::HandleOutboundCallTimeout(CallAwaitingResponse *car) { + DCHECK(reactor_thread_->IsCurrentThread()); + DCHECK(car->call); + // The timeout timer is stopped by the car destructor exiting Connection::HandleCallResponse() + DCHECK(!car->call->IsFinished()); + + // Mark the call object as failed. + car->call->SetTimedOut(); + + // Drop the reference to the call. If the original caller has moved on after + // seeing the timeout, we no longer need to hold onto the allocated memory + // from the request. 
+ car->call.reset(); + + // We still leave the CallAwaitingResponse in the map -- this is because we may still + // receive a response from the server, and we don't want a spurious log message + // when we do finally receive the response. The fact that CallAwaitingResponse::call + // is a NULL pointer indicates to the response processing code that the call + // already timed out. +} + + +// Callbacks after sending a call on the wire. +// This notifies the OutboundCall object to change its state to SENT once it +// has been fully transmitted. +struct CallTransferCallbacks : public TransferCallbacks { + public: + explicit CallTransferCallbacks(shared_ptr call) + : call_(std::move(call)) {} + + virtual void NotifyTransferFinished() OVERRIDE { + // TODO: would be better to cancel the transfer while it is still on the queue if we + // timed out before the transfer started, but there is still a race in the case of + // a partial send that we have to handle here + if (call_->IsFinished()) { + DCHECK(call_->IsTimedOut()); + } else { + call_->SetSent(); + } + delete this; + } + + virtual void NotifyTransferAborted(const Status &status) OVERRIDE { + VLOG(1) << "Connection torn down before " << + call_->ToString() << " could send its call: " << status.ToString(); + delete this; + } + + private: + shared_ptr call_; +}; + +void Connection::QueueOutboundCall(const shared_ptr &call) { + DCHECK(call); + DCHECK_EQ(direction_, CLIENT); + DCHECK(reactor_thread_->IsCurrentThread()); + + if (PREDICT_FALSE(!shutdown_status_.ok())) { + // Already shutdown + call->SetFailed(shutdown_status_); + return; + } + + // At this point the call has a serialized request, but no call header, since we haven't + // yet assigned a call ID. + DCHECK(!call->call_id_assigned()); + + // Assign the call ID. + int32_t call_id = GetNextCallId(); + call->set_call_id(call_id); + + // Serialize the actual bytes to be put on the wire. 
+ slices_tmp_.clear(); + Status s = call->SerializeTo(&slices_tmp_); + if (PREDICT_FALSE(!s.ok())) { + call->SetFailed(s); + return; + } + + call->SetQueued(); + + scoped_car car(car_pool_.make_scoped_ptr(car_pool_.Construct())); + car->conn = this; + car->call = call; + + // Set up the timeout timer. + const MonoDelta &timeout = call->controller()->timeout(); + if (timeout.Initialized()) { + reactor_thread_->RegisterTimeout(&car->timeout_timer); + car->timeout_timer.set(car.get()); + car->timeout_timer.set(timeout.ToSeconds(), 0); + car->timeout_timer.start(); + } + + TransferCallbacks *cb = new CallTransferCallbacks(call); + awaiting_response_[call_id] = car.release(); + QueueOutbound(gscoped_ptr( + new OutboundTransfer(slices_tmp_, cb))); +} + +// Callbacks for sending an RPC call response from the server. +// This takes ownership of the InboundCall object so that, once it has +// been responded to, we can free up all of the associated memory. +struct ResponseTransferCallbacks : public TransferCallbacks { + public: + ResponseTransferCallbacks(gscoped_ptr call, + Connection *conn) : + call_(call.Pass()), + conn_(conn) + {} + + ~ResponseTransferCallbacks() { + // Remove the call from the map. + InboundCall *call_from_map = EraseKeyReturnValuePtr( + &conn_->calls_being_handled_, call_->call_id()); + DCHECK_EQ(call_from_map, call_.get()); + } + + virtual void NotifyTransferFinished() OVERRIDE { + delete this; + } + + virtual void NotifyTransferAborted(const Status &status) OVERRIDE { + LOG(WARNING) << "Connection torn down before " << + call_->ToString() << " could send its response"; + delete this; + } + + private: + gscoped_ptr call_; + Connection *conn_; +}; + +// Reactor task which puts a transfer on the outbound transfer queue. 
+class QueueTransferTask : public ReactorTask { + public: + QueueTransferTask(gscoped_ptr transfer, + Connection *conn) + : transfer_(transfer.Pass()), + conn_(conn) + {} + + virtual void Run(ReactorThread *thr) OVERRIDE { + conn_->QueueOutbound(transfer_.Pass()); + delete this; + } + + virtual void Abort(const Status &status) OVERRIDE { + transfer_->Abort(status); + delete this; + } + + private: + gscoped_ptr transfer_; + Connection *conn_; +}; + +void Connection::QueueResponseForCall(gscoped_ptr call) { + // This is usually called by the IPC worker thread when the response + // is set, but in some circumstances may also be called by the + // reactor thread (e.g. if the service has shut down) + + DCHECK_EQ(direction_, SERVER); + + // If the connection is torn down, then the QueueOutbound() call that + // eventually runs in the reactor thread will take care of calling + // ResponseTransferCallbacks::NotifyTransferAborted. + + std::vector slices; + call->SerializeResponseTo(&slices); + + TransferCallbacks *cb = new ResponseTransferCallbacks(call.Pass(), this); + // After the response is sent, can delete the InboundCall object. 
+ gscoped_ptr t(new OutboundTransfer(slices, cb)); + + QueueTransferTask *task = new QueueTransferTask(t.Pass(), this); + reactor_thread_->reactor()->ScheduleReactorTask(task); +} + +void Connection::set_user_credentials(const UserCredentials &user_credentials) { + user_credentials_.CopyFrom(user_credentials); +} + +void Connection::ReadHandler(ev::io &watcher, int revents) { + DCHECK(reactor_thread_->IsCurrentThread()); + + DVLOG(3) << ToString() << " ReadHandler(revents=" << revents << ")"; + if (revents & EV_ERROR) { + reactor_thread_->DestroyConnection(this, Status::NetworkError(ToString() + + ": ReadHandler encountered an error")); + return; + } + last_activity_time_ = reactor_thread_->cur_time(); + + while (true) { + if (!inbound_) { + inbound_.reset(new InboundTransfer()); + } + Status status = inbound_->ReceiveBuffer(socket_); + if (PREDICT_FALSE(!status.ok())) { + if (status.posix_code() == ESHUTDOWN) { + VLOG(1) << ToString() << " shut down by remote end."; + } else { + LOG(WARNING) << ToString() << " recv error: " << status.ToString(); + } + reactor_thread_->DestroyConnection(this, status); + return; + } + if (!inbound_->TransferFinished()) { + DVLOG(3) << ToString() << ": read is not yet finished yet."; + return; + } + DVLOG(3) << ToString() << ": finished reading " << inbound_->data().size() << " bytes"; + + if (direction_ == CLIENT) { + HandleCallResponse(inbound_.Pass()); + } else if (direction_ == SERVER) { + HandleIncomingCall(inbound_.Pass()); + } else { + LOG(FATAL) << "Invalid direction: " << direction_; + } + + // TODO: it would seem that it would be good to loop around and see if + // there is more data on the socket by trying another recv(), but it turns + // out that it really hurts throughput to do so. 
A better approach + // might be for each InboundTransfer to actually try to read an extra byte, + // and if it succeeds, then we'd copy that byte into a new InboundTransfer + // and loop around, since it's likely the next call also arrived at the + // same time. + break; + } +} + +void Connection::HandleIncomingCall(gscoped_ptr transfer) { + DCHECK(reactor_thread_->IsCurrentThread()); + + gscoped_ptr call(new InboundCall(this)); + Status s = call->ParseFrom(transfer.Pass()); + if (!s.ok()) { + LOG(WARNING) << ToString() << ": received bad data: " << s.ToString(); + // TODO: shutdown? probably, since any future stuff on this socket will be + // "unsynchronized" + return; + } + + if (!InsertIfNotPresent(&calls_being_handled_, call->call_id(), call.get())) { + LOG(WARNING) << ToString() << ": received call ID " << call->call_id() << + " but was already processing this ID! Ignoring"; + reactor_thread_->DestroyConnection( + this, Status::RuntimeError("Received duplicate call id", + Substitute("$0", call->call_id()))); + return; + } + + reactor_thread_->reactor()->messenger()->QueueInboundCall(call.Pass()); +} + +void Connection::HandleCallResponse(gscoped_ptr transfer) { + DCHECK(reactor_thread_->IsCurrentThread()); + gscoped_ptr resp(new CallResponse); + CHECK_OK(resp->ParseFrom(transfer.Pass())); + + CallAwaitingResponse *car_ptr = + EraseKeyReturnValuePtr(&awaiting_response_, resp->call_id()); + if (PREDICT_FALSE(car_ptr == nullptr)) { + LOG(WARNING) << ToString() << ": Got a response for call id " << resp->call_id() << " which " + << "was not pending! Ignoring."; + return; + } + + // The car->timeout_timer ev::timer will be stopped automatically by its destructor. + scoped_car car(car_pool_.make_scoped_ptr(car_ptr)); + + if (PREDICT_FALSE(car->call.get() == nullptr)) { + // The call already failed due to a timeout. 
+ VLOG(1) << "Got response to call id " << resp->call_id() << " after client already timed out"; + return; + } + + car->call->SetResponse(resp.Pass()); +} + +void Connection::WriteHandler(ev::io &watcher, int revents) { + DCHECK(reactor_thread_->IsCurrentThread()); + + if (revents & EV_ERROR) { + reactor_thread_->DestroyConnection(this, Status::NetworkError(ToString() + + ": writeHandler encountered an error")); + return; + } + DVLOG(3) << ToString() << ": writeHandler: revents = " << revents; + + OutboundTransfer *transfer; + if (outbound_transfers_.empty()) { + LOG(WARNING) << ToString() << " got a ready-to-write callback, but there is " + "nothing to write."; + write_io_.stop(); + return; + } + + while (!outbound_transfers_.empty()) { + transfer = &(outbound_transfers_.front()); + + last_activity_time_ = reactor_thread_->cur_time(); + Status status = transfer->SendBuffer(socket_); + if (PREDICT_FALSE(!status.ok())) { + LOG(WARNING) << ToString() << " send error: " << status.ToString(); + reactor_thread_->DestroyConnection(this, status); + return; + } + + if (!transfer->TransferFinished()) { + DVLOG(3) << ToString() << ": writeHandler: xfer not finished."; + return; + } + + outbound_transfers_.pop_front(); + delete transfer; + } + + + // If we were able to write all of our outbound transfers, + // we don't have any more to write. + write_io_.stop(); +} + +std::string Connection::ToString() const { + // This may be called from other threads, so we cannot + // include anything in the output about the current state, + // which might concurrently change from another thread. + return strings::Substitute( + "$0 $1", + direction_ == SERVER ? 
"server connection from" : "client connection to", + remote_.ToString()); +} + +Status Connection::InitSaslClient() { + RETURN_NOT_OK(sasl_client().Init(kSaslProtoName)); + RETURN_NOT_OK(sasl_client().EnableAnonymous()); + RETURN_NOT_OK(sasl_client().EnablePlain(user_credentials().real_user(), + user_credentials().password())); + return Status::OK(); +} + +Status Connection::InitSaslServer() { + // TODO: Do necessary configuration plumbing to enable user authentication. + // Right now we just enable PLAIN with a "dummy" auth store, which allows everyone in. + RETURN_NOT_OK(sasl_server().Init(kSaslProtoName)); + gscoped_ptr auth_store(new DummyAuthStore()); + RETURN_NOT_OK(sasl_server().EnablePlain(auth_store.Pass())); + return Status::OK(); +} + +// Reactor task that transitions this Connection from connection negotiation to +// regular RPC handling. Destroys Connection on negotiation error. +class NegotiationCompletedTask : public ReactorTask { + public: + NegotiationCompletedTask(Connection* conn, + const Status& negotiation_status) + : conn_(conn), + negotiation_status_(negotiation_status) { + } + + virtual void Run(ReactorThread *rthread) OVERRIDE { + rthread->CompleteConnectionNegotiation(conn_, negotiation_status_); + delete this; + } + + virtual void Abort(const Status &status) OVERRIDE { + DCHECK(conn_->reactor_thread()->reactor()->closing()); + VLOG(1) << "Failed connection negotiation due to shut down reactor thread: " << + status.ToString(); + delete this; + } + + private: + scoped_refptr conn_; + Status negotiation_status_; +}; + +void Connection::CompleteNegotiation(const Status& negotiation_status) { + auto task = new NegotiationCompletedTask(this, negotiation_status); + reactor_thread_->reactor()->ScheduleReactorTask(task); +} + +void Connection::MarkNegotiationComplete() { + DCHECK(reactor_thread_->IsCurrentThread()); + negotiation_complete_ = true; +} + +Status Connection::DumpPB(const DumpRunningRpcsRequestPB& req, + RpcConnectionPB* resp) { + 
DCHECK(reactor_thread_->IsCurrentThread()); + resp->set_remote_ip(remote_.ToString()); + if (negotiation_complete_) { + resp->set_state(RpcConnectionPB::OPEN); + resp->set_remote_user_credentials(user_credentials_.ToString()); + } else { + // It's racy to dump credentials while negotiating, since the Connection + // object is owned by the negotiation thread at that point. + resp->set_state(RpcConnectionPB::NEGOTIATING); + } + + if (direction_ == CLIENT) { + for (const car_map_t::value_type& entry : awaiting_response_) { + CallAwaitingResponse *c = entry.second; + if (c->call) { + c->call->DumpPB(req, resp->add_calls_in_flight()); + } + } + } else if (direction_ == SERVER) { + for (const inbound_call_map_t::value_type& entry : calls_being_handled_) { + InboundCall* c = entry.second; + c->DumpPB(req, resp->add_calls_in_flight()); + } + } else { + LOG(FATAL); + } + return Status::OK(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/connection.h b/src/kudu/rpc/connection.h new file mode 100644 index 000000000000..9ad7e42f50c1 --- /dev/null +++ b/src/kudu/rpc/connection.h @@ -0,0 +1,288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_RPC_CONNECTION_H +#define KUDU_RPC_CONNECTION_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/outbound_call.h" +#include "kudu/rpc/sasl_client.h" +#include "kudu/rpc/sasl_server.h" +#include "kudu/rpc/inbound_call.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/object_pool.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace rpc { + +class DumpRunningRpcsRequestPB; +class RpcConnectionPB; +class ReactorThread; + +// +// A connection between an endpoint and us. +// +// Inbound connections are created by AcceptorPools, which eventually schedule +// RegisterConnection() to be called from the reactor thread. +// +// Outbound connections are created by the Reactor thread in order to service +// outbound calls. +// +// Once a Connection is created, it can be used both for sending messages and +// receiving them, but any given connection is explicitly a client or server. +// If a pair of servers are making bidirectional RPCs, they will use two separate +// TCP connections (and Connection objects). +// +// This class is not fully thread-safe. It is accessed only from the context of a +// single ReactorThread except where otherwise specified. +// +class Connection : public RefCountedThreadSafe { + public: + enum Direction { + // This host is sending calls via this connection. + CLIENT, + // This host is receiving calls via this connection. + SERVER + }; + + // Create a new Connection. + // reactor_thread: the reactor that owns us. + // remote: the address of the remote end + // socket: the socket to take ownership of. 
+ // direction: whether we are the client or server side + Connection(ReactorThread *reactor_thread, Sockaddr remote, int socket, + Direction direction); + + // Set underlying socket to non-blocking (or blocking) mode. + Status SetNonBlocking(bool enabled); + + // Register our socket with an epoll loop. We will only ever be registered in + // one epoll loop at a time. + void EpollRegister(ev::loop_ref& loop); + + ~Connection(); + + MonoTime last_activity_time() const { + return last_activity_time_; + } + + // Returns true if we are not in the process of receiving or sending a + // message, and we have no outstanding calls. + bool Idle() const; + + // Fail any calls which are currently queued or awaiting response. + // Prohibits any future calls (they will be failed immediately with this + // same Status). + void Shutdown(const Status &status); + + // Queue a new call to be made. If the queueing fails, the call will be + // marked failed. + // Takes ownership of the 'call' object regardless of whether it succeeds or fails. + // This may be called from a non-reactor thread. + void QueueOutboundCall(const std::shared_ptr &call); + + // Queue a call response back to the client on the server side. + // + // This may be called from a non-reactor thread. + void QueueResponseForCall(gscoped_ptr call); + + // The address of the remote end of the connection. + const Sockaddr &remote() const { return remote_; } + + // Set the user credentials which should be used to log in. + void set_user_credentials(const UserCredentials &user_credentials); + + // Modify the user credentials which will be used to log in. + UserCredentials* mutable_user_credentials() { return &user_credentials_; } + + // Get the user credentials which will be used to log in. + const UserCredentials &user_credentials() const { return user_credentials_; } + + // libev callback when data is available to read. 
+ void ReadHandler(ev::io &watcher, int revents); + + // libev callback when we may write to the socket. + void WriteHandler(ev::io &watcher, int revents); + + // Safe to be called from other threads. + std::string ToString() const; + + Direction direction() const { return direction_; } + + Socket *socket() { return &socket_; } + + // Return SASL client instance for this connection. + SaslClient &sasl_client() { return sasl_client_; } + + // Return SASL server instance for this connection. + SaslServer &sasl_server() { return sasl_server_; } + + // Initialize SASL client before negotiation begins. + Status InitSaslClient(); + + // Initialize SASL server before negotiation begins. + Status InitSaslServer(); + + // Go through the process of transferring control of the underlying socket back to the Reactor. + void CompleteNegotiation(const Status &negotiation_status); + + // Indicate that negotiation is complete and that the Reactor is now in control of the socket. + void MarkNegotiationComplete(); + + Status DumpPB(const DumpRunningRpcsRequestPB& req, + RpcConnectionPB* resp); + + ReactorThread *reactor_thread() const { return reactor_thread_; } + + private: + friend struct CallAwaitingResponse; + friend class QueueTransferTask; + friend struct ResponseTransferCallbacks; + + // A call which has been fully sent to the server, which we're waiting for + // the server to process. This is used on the client side only. + struct CallAwaitingResponse { + ~CallAwaitingResponse(); + + // Notification from libev that the call has timed out. + void HandleTimeout(ev::timer &watcher, int revents); + + Connection *conn; + std::shared_ptr call; + ev::timer timeout_timer; + }; + + typedef std::unordered_map car_map_t; + typedef std::unordered_map inbound_call_map_t; + + // Returns the next valid (positive) sequential call ID by incrementing a counter + // and ensuring we roll over from INT32_MAX to 0. + // Negative numbers are reserved for special purposes. 
+ int32_t GetNextCallId() { + int32_t call_id = next_call_id_; + if (PREDICT_FALSE(next_call_id_ == std::numeric_limits::max())) { + next_call_id_ = 0; + } else { + next_call_id_++; + } + return call_id; + } + + // An incoming packet has completed transferring on the server side. + // This parses the call and delivers it into the call queue. + void HandleIncomingCall(gscoped_ptr transfer); + + // An incoming packet has completed on the client side. This parses the + // call response, looks up the CallAwaitingResponse, and calls the + // client callback. + void HandleCallResponse(gscoped_ptr transfer); + + // The given CallAwaitingResponse has elapsed its user-defined timeout. + // Set it to Failed. + void HandleOutboundCallTimeout(CallAwaitingResponse *car); + + // Queue a transfer for sending on this connection. + // We will take ownership of the transfer. + // This must be called from the reactor thread. + void QueueOutbound(gscoped_ptr transfer); + + // The reactor thread that created this connection. + ReactorThread * const reactor_thread_; + + // The socket we're communicating on. + Socket socket_; + + // The remote address we're talking to. + const Sockaddr remote_; + + // The credentials of the user operating on this connection (if a client user). + UserCredentials user_credentials_; + + // whether we are client or server + Direction direction_; + + // The last time we read or wrote from the socket. + MonoTime last_activity_time_; + + // the inbound transfer, if any + gscoped_ptr inbound_; + + // notifies us when our socket is writable. + ev::io write_io_; + + // notifies us when our socket is readable. + ev::io read_io_; + + // Set to true when the connection is registered on a loop. + // This is used for a sanity check in the destructor that we are properly + // un-registered before shutting down. 
+ bool is_epoll_registered_; + + // waiting to be sent + boost::intrusive::list outbound_transfers_; // NOLINT(*) + + // Calls which have been sent and are now waiting for a response. + car_map_t awaiting_response_; + + // Calls which have been received on the server and are currently + // being handled. + inbound_call_map_t calls_being_handled_; + + // the next call ID to use + int32_t next_call_id_; + + // Starts as Status::OK, gets set to a shutdown status upon Shutdown(). + Status shutdown_status_; + + // Temporary vector used when serializing - avoids an allocation + // when serializing calls. + std::vector slices_tmp_; + + // Pool from which CallAwaitingResponse objects are allocated. + // Also a funny name. + ObjectPool car_pool_; + typedef ObjectPool::scoped_ptr scoped_car; + + // SASL client instance used for connection negotiation when Direction == CLIENT. + SaslClient sasl_client_; + + // SASL server instance used for connection negotiation when Direction == SERVER. + SaslServer sasl_server_; + + // Whether we completed connection negotiation. + bool negotiation_complete_; +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/constants.cc b/src/kudu/rpc/constants.cc new file mode 100644 index 000000000000..fcff30bab4e4 --- /dev/null +++ b/src/kudu/rpc/constants.cc @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/constants.h" + +namespace kudu { +namespace rpc { + +const char* const kMagicNumber = "hrpc"; +const char* const kSaslAppName = "Kudu"; +const char* const kSaslProtoName = "kudu"; + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/constants.h b/src/kudu/rpc/constants.h new file mode 100644 index 000000000000..8ffa81f32853 --- /dev/null +++ b/src/kudu/rpc/constants.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_RPC_CONSTANTS_H +#define KUDU_RPC_RPC_CONSTANTS_H + +#include + +namespace kudu { +namespace rpc { + +// Magic number bytes sent at connection setup time. 
+extern const char* const kMagicNumber; + +// App name for SASL library init +extern const char* const kSaslAppName; + +// Network protocol name for SASL library init +extern const char* const kSaslProtoName; + +// Current version of the RPC protocol. +static const uint32_t kCurrentRpcVersion = 9; + +// From Hadoop. +static const int32_t kInvalidCallId = -2; +static const int32_t kConnectionContextCallId = -3; +static const int32_t kSaslCallId = -33; + +static const uint8_t kMagicNumberLength = 4; +static const uint8_t kHeaderFlagsLength = 3; + +// There is a 4-byte length prefix before any packet. +static const uint8_t kMsgLengthPrefixLength = 4; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_RPC_CONSTANTS_H diff --git a/src/kudu/rpc/inbound_call.cc b/src/kudu/rpc/inbound_call.cc new file mode 100644 index 000000000000..bcef5763063c --- /dev/null +++ b/src/kudu/rpc/inbound_call.cc @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/inbound_call.h" + +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/connection.h" +#include "kudu/rpc/rpc_introspection.pb.h" +#include "kudu/rpc/rpc_sidecar.h" +#include "kudu/rpc/serialization.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/trace.h" + +using google::protobuf::FieldDescriptor; +using google::protobuf::Message; +using google::protobuf::MessageLite; +using google::protobuf::io::CodedOutputStream; +using std::shared_ptr; +using std::vector; +using strings::Substitute; + +DEFINE_bool(rpc_dump_all_traces, false, + "If true, dump all RPC traces at INFO level"); +TAG_FLAG(rpc_dump_all_traces, advanced); +TAG_FLAG(rpc_dump_all_traces, runtime); + + +namespace kudu { +namespace rpc { + +InboundCall::InboundCall(Connection* conn) + : conn_(conn), + sidecars_deleter_(&sidecars_), + trace_(new Trace) { + RecordCallReceived(); +} + +InboundCall::~InboundCall() {} + +Status InboundCall::ParseFrom(gscoped_ptr transfer) { + TRACE_EVENT_FLOW_BEGIN0("rpc", "InboundCall", this); + TRACE_EVENT0("rpc", "InboundCall::ParseFrom"); + RETURN_NOT_OK(serialization::ParseMessage(transfer->data(), &header_, &serialized_request_)); + + // Adopt the service/method info from the header as soon as it's available. + if (PREDICT_FALSE(!header_.has_remote_method())) { + return Status::Corruption("Non-connection context request header must specify remote_method"); + } + if (PREDICT_FALSE(!header_.remote_method().IsInitialized())) { + return Status::Corruption("remote_method in request header is not initialized", + header_.remote_method().InitializationErrorString()); + } + remote_method_.FromPB(header_.remote_method()); + + // Retain the buffer that we have a view into. 
+ transfer_.swap(transfer); + return Status::OK(); +} + +void InboundCall::RespondSuccess(const MessageLite& response) { + TRACE_EVENT0("rpc", "InboundCall::RespondSuccess"); + Respond(response, true); +} + +void InboundCall::RespondFailure(ErrorStatusPB::RpcErrorCodePB error_code, + const Status& status) { + TRACE_EVENT0("rpc", "InboundCall::RespondFailure"); + ErrorStatusPB err; + err.set_message(status.ToString()); + err.set_code(error_code); + + Respond(err, false); +} + +void InboundCall::RespondApplicationError(int error_ext_id, const std::string& message, + const MessageLite& app_error_pb) { + ErrorStatusPB err; + ApplicationErrorToPB(error_ext_id, message, app_error_pb, &err); + Respond(err, false); +} + +void InboundCall::ApplicationErrorToPB(int error_ext_id, const std::string& message, + const google::protobuf::MessageLite& app_error_pb, + ErrorStatusPB* err) { + err->set_message(message); + const FieldDescriptor* app_error_field = + err->GetReflection()->FindKnownExtensionByNumber(error_ext_id); + if (app_error_field != nullptr) { + err->GetReflection()->MutableMessage(err, app_error_field)->CheckTypeAndMergeFrom(app_error_pb); + } else { + LOG(DFATAL) << "Unable to find application error extension ID " << error_ext_id + << " (message=" << message << ")"; + } +} + +void InboundCall::Respond(const MessageLite& response, + bool is_success) { + TRACE_EVENT_FLOW_END0("rpc", "InboundCall", this); + Status s = SerializeResponseBuffer(response, is_success); + if (PREDICT_FALSE(!s.ok())) { + // TODO: test error case, serialize error response instead + LOG(DFATAL) << "Unable to serialize response: " << s.ToString(); + } + + TRACE_EVENT_ASYNC_END1("rpc", "InboundCall", this, + "method", remote_method_.method_name()); + TRACE_TO(trace_, "Queueing $0 response", is_success ? 
"success" : "failure"); + + LogTrace(); + conn_->QueueResponseForCall(gscoped_ptr(this).Pass()); +} + +Status InboundCall::SerializeResponseBuffer(const MessageLite& response, + bool is_success) { + uint32_t protobuf_msg_size = response.ByteSize(); + + ResponseHeader resp_hdr; + resp_hdr.set_call_id(header_.call_id()); + resp_hdr.set_is_error(!is_success); + uint32_t absolute_sidecar_offset = protobuf_msg_size; + for (RpcSidecar* car : sidecars_) { + resp_hdr.add_sidecar_offsets(absolute_sidecar_offset); + absolute_sidecar_offset += car->AsSlice().size(); + } + + int additional_size = absolute_sidecar_offset - protobuf_msg_size; + RETURN_NOT_OK(serialization::SerializeMessage(response, &response_msg_buf_, + additional_size, true)); + int main_msg_size = additional_size + response_msg_buf_.size(); + RETURN_NOT_OK(serialization::SerializeHeader(resp_hdr, main_msg_size, + &response_hdr_buf_)); + + return Status::OK(); +} + +void InboundCall::SerializeResponseTo(vector* slices) const { + TRACE_EVENT0("rpc", "InboundCall::SerializeResponseTo"); + CHECK_GT(response_hdr_buf_.size(), 0); + CHECK_GT(response_msg_buf_.size(), 0); + slices->reserve(slices->size() + 2 + sidecars_.size()); + slices->push_back(Slice(response_hdr_buf_)); + slices->push_back(Slice(response_msg_buf_)); + for (RpcSidecar* car : sidecars_) { + slices->push_back(car->AsSlice()); + } +} + +Status InboundCall::AddRpcSidecar(gscoped_ptr car, int* idx) { + // Check that the number of sidecars does not exceed the number of payload + // slices that are free (two are used up by the header and main message + // protobufs). 
+ if (sidecars_.size() + 2 > OutboundTransfer::kMaxPayloadSlices) { + return Status::ServiceUnavailable("All available sidecars already used"); + } + sidecars_.push_back(car.release()); + *idx = sidecars_.size() - 1; + return Status::OK(); +} + +string InboundCall::ToString() const { + return Substitute("Call $0 from $1 (request call id $2)", + remote_method_.ToString(), + conn_->remote().ToString(), + header_.call_id()); +} + +void InboundCall::DumpPB(const DumpRunningRpcsRequestPB& req, + RpcCallInProgressPB* resp) { + resp->mutable_header()->CopyFrom(header_); + if (req.include_traces() && trace_) { + resp->set_trace_buffer(trace_->DumpToString(true)); + } + resp->set_micros_elapsed(MonoTime::Now(MonoTime::FINE).GetDeltaSince(timing_.time_received) + .ToMicroseconds()); +} + +void InboundCall::LogTrace() const { + MonoTime now = MonoTime::Now(MonoTime::FINE); + int total_time = now.GetDeltaSince(timing_.time_received).ToMilliseconds(); + + if (header_.has_timeout_millis() && header_.timeout_millis() > 0) { + double log_threshold = header_.timeout_millis() * 0.75f; + if (total_time > log_threshold) { + // TODO: consider pushing this onto another thread since it may be slow. + // The traces may also be too large to fit in a log message. + LOG(WARNING) << ToString() << " took " << total_time << "ms (client timeout " + << header_.timeout_millis() << ")."; + std::string s = trace_->DumpToString(true); + if (!s.empty()) { + LOG(WARNING) << "Trace:\n" << s; + } + return; + } + } + + if (PREDICT_FALSE(FLAGS_rpc_dump_all_traces)) { + LOG(INFO) << ToString() << " took " << total_time << "ms. 
Trace:"; + trace_->Dump(&LOG(INFO), true); + } +} + +const UserCredentials& InboundCall::user_credentials() const { + return conn_->user_credentials(); +} + +const Sockaddr& InboundCall::remote_address() const { + return conn_->remote(); +} + +const scoped_refptr& InboundCall::connection() const { + return conn_; +} + +Trace* InboundCall::trace() { + return trace_.get(); +} + +void InboundCall::RecordCallReceived() { + TRACE_EVENT_ASYNC_BEGIN0("rpc", "InboundCall", this); + DCHECK(!timing_.time_received.Initialized()); // Protect against multiple calls. + timing_.time_received = MonoTime::Now(MonoTime::FINE); +} + +void InboundCall::RecordHandlingStarted(scoped_refptr incoming_queue_time) { + DCHECK(incoming_queue_time != nullptr); + DCHECK(!timing_.time_handled.Initialized()); // Protect against multiple calls. + timing_.time_handled = MonoTime::Now(MonoTime::FINE); + incoming_queue_time->Increment( + timing_.time_handled.GetDeltaSince(timing_.time_received).ToMicroseconds()); +} + +void InboundCall::RecordHandlingCompleted(scoped_refptr handler_run_time) { + DCHECK(handler_run_time != nullptr); + DCHECK(!timing_.time_completed.Initialized()); // Protect against multiple calls. 
+ timing_.time_completed = MonoTime::Now(MonoTime::FINE); + handler_run_time->Increment( + timing_.time_completed.GetDeltaSince(timing_.time_handled).ToMicroseconds()); +} + +bool InboundCall::ClientTimedOut() const { + if (!header_.has_timeout_millis() || header_.timeout_millis() == 0) { + return false; + } + + MonoTime now = MonoTime::Now(MonoTime::FINE); + int total_time = now.GetDeltaSince(timing_.time_received).ToMilliseconds(); + return total_time > header_.timeout_millis(); +} + +MonoTime InboundCall::GetClientDeadline() const { + if (!header_.has_timeout_millis() || header_.timeout_millis() == 0) { + return MonoTime::Max(); + } + MonoTime deadline = timing_.time_received; + deadline.AddDelta(MonoDelta::FromMilliseconds(header_.timeout_millis())); + return deadline; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/inbound_call.h b/src/kudu/rpc/inbound_call.h new file mode 100644 index 000000000000..88706d390f3e --- /dev/null +++ b/src/kudu/rpc/inbound_call.h @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_RPC_INBOUND_CALL_H +#define KUDU_RPC_INBOUND_CALL_H + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/remote_method.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/faststring.h" +#include "kudu/util/monotime.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace google { +namespace protobuf { +class Message; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class Histogram; +class Trace; + +namespace rpc { + +class Connection; +class DumpRunningRpcsRequestPB; +class RpcCallInProgressPB; +class RpcSidecar; +class UserCredentials; + +struct InboundCallTiming { + MonoTime time_received; // Time the call was first accepted. + MonoTime time_handled; // Time the call handler was kicked off. + MonoTime time_completed; // Time the call handler completed. +}; + +// Inbound call on server +class InboundCall { + public: + explicit InboundCall(Connection* conn); + ~InboundCall(); + + // Parse an inbound call message. + // + // This only deserializes the call header, populating the 'header_' and + // 'serialized_request_' member variables. The actual call parameter is + // not deserialized, as this may be CPU-expensive, and this is called + // from the reactor thread. + Status ParseFrom(gscoped_ptr transfer); + + // Return the serialized request parameter protobuf. + const Slice &serialized_request() const { + return serialized_request_; + } + + const RemoteMethod& remote_method() const { + return remote_method_; + } + + const int32_t call_id() const { + return header_.call_id(); + } + + // Serializes 'response' into the InboundCall's internal buffer, and marks + // the call as a success. Enqueues the response back to the connection + // that made the call. 
+ // + // This method deletes the InboundCall object, so no further calls may be + // made after this one. + void RespondSuccess(const google::protobuf::MessageLite& response); + + // Serializes a failure response into the internal buffer, marking the + // call as a failure. Enqueues the response back to the connection that + // made the call. + // + // This method deletes the InboundCall object, so no further calls may be + // made after this one. + void RespondFailure(ErrorStatusPB::RpcErrorCodePB error_code, + const Status &status); + + void RespondApplicationError(int error_ext_id, const std::string& message, + const google::protobuf::MessageLite& app_error_pb); + + // Convert an application error extension to an ErrorStatusPB. + // These ErrorStatusPB objects are what are returned in application error responses. + static void ApplicationErrorToPB(int error_ext_id, const std::string& message, + const google::protobuf::MessageLite& app_error_pb, + ErrorStatusPB* err); + + // Serialize the response packet for the finished call. + // The resulting slices refer to memory in this object. + void SerializeResponseTo(std::vector* slices) const; + + // See RpcContext::AddRpcSidecar() + Status AddRpcSidecar(gscoped_ptr car, int* idx); + + std::string ToString() const; + + void DumpPB(const DumpRunningRpcsRequestPB& req, RpcCallInProgressPB* resp); + + const UserCredentials& user_credentials() const; + + const Sockaddr& remote_address() const; + + const scoped_refptr& connection() const; + + Trace* trace(); + + // When this InboundCall was received (instantiated). + // Should only be called once on a given instance. + // Not thread-safe. Should only be called by the current "owner" thread. + void RecordCallReceived(); + + // When RPC call Handle() was called on the server side. + // Updates the Histogram with time elapsed since the call was received, + // and should only be called once on a given instance. + // Not thread-safe. 
Should only be called by the current "owner" thread. + void RecordHandlingStarted(scoped_refptr incoming_queue_time); + + // When RPC call Handle() completed execution on the server side. + // Updates the Histogram with time elapsed since the call was started, + // and should only be called once on a given instance. + // Not thread-safe. Should only be called by the current "owner" thread. + void RecordHandlingCompleted(scoped_refptr handler_run_time); + + // Return true if the deadline set by the client has already elapsed. + // In this case, the server may stop processing the call, since the + // call response will be ignored anyway. + bool ClientTimedOut() const; + + // Return an upper bound on the client timeout deadline. This does not + // account for transmission delays between the client and the server. + // If the client did not specify a deadline, returns MonoTime::Max(). + MonoTime GetClientDeadline() const; + + private: + // Serialize and queue the response. + void Respond(const google::protobuf::MessageLite& response, + bool is_success); + + // Serialize a response message for either success or failure. If it is a success, + // 'response' should be the user-defined response type for the call. If it is a + // failure, 'response' should be an ErrorStatusPB instance. + Status SerializeResponseBuffer(const google::protobuf::MessageLite& response, + bool is_success); + + // Log a WARNING message if the RPC response was slow enough that the + // client likely timed out. This is based on the client-provided timeout + // value. + // Also can be configured to log _all_ RPC traces for help debugging. + void LogTrace() const; + + // The connection on which this inbound call arrived. + scoped_refptr conn_; + + // The header of the incoming call. Set by ParseFrom() + RequestHeader header_; + + // The serialized bytes of the request param protobuf. Set by ParseFrom(). + // This references memory held by 'transfer_'. 
+  Slice serialized_request_;
+
+  // The transfer that produced the call.
+  // This is kept around because it retains the memory referred to
+  // by 'serialized_request_' above.
+  gscoped_ptr<InboundTransfer> transfer_;
+
+  // The buffers for serialized response. Set by SerializeResponseBuffer().
+  faststring response_hdr_buf_;
+  faststring response_msg_buf_;
+
+  // Vector of additional sidecars that are tacked on to the call's response
+  // after serialization of the protobuf. See rpc/rpc_sidecar.h for more info.
+  std::vector<RpcSidecar*> sidecars_;
+  ElementDeleter sidecars_deleter_;
+
+  // The trace buffer.
+  scoped_refptr<Trace> trace_;
+
+  // Timing information related to this RPC call.
+  InboundCallTiming timing_;
+
+  // Proto service this call belongs to. Used for routing.
+  // This field is filled in when the inbound request header is parsed.
+  RemoteMethod remote_method_;
+
+  DISALLOW_COPY_AND_ASSIGN(InboundCall);
+};
+
+} // namespace rpc
+} // namespace kudu
+
+#endif
diff --git a/src/kudu/rpc/messenger.cc b/src/kudu/rpc/messenger.cc
new file mode 100644
index 000000000000..dfad3864db22
--- /dev/null
+++ b/src/kudu/rpc/messenger.cc
@@ -0,0 +1,307 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "kudu/rpc/messenger.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/acceptor_pool.h" +#include "kudu/rpc/connection.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/reactor.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/rpc_service.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/errno.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/status.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/trace.h" + +using std::string; +using std::shared_ptr; +using strings::Substitute; + +DEFINE_int32(rpc_default_keepalive_time_ms, 65000, + "If an RPC connection from a client is idle for this amount of time, the server " + "will disconnect the client."); +TAG_FLAG(rpc_default_keepalive_time_ms, advanced); + +namespace kudu { +namespace rpc { + +class Messenger; +class ServerBuilder; + +MessengerBuilder::MessengerBuilder(std::string name) + : name_(std::move(name)), + connection_keepalive_time_( + MonoDelta::FromMilliseconds(FLAGS_rpc_default_keepalive_time_ms)), + num_reactors_(4), + num_negotiation_threads_(4), + coarse_timer_granularity_(MonoDelta::FromMilliseconds(100)) {} + +MessengerBuilder& MessengerBuilder::set_connection_keepalive_time(const MonoDelta &keepalive) { + connection_keepalive_time_ = keepalive; + return *this; +} + +MessengerBuilder& MessengerBuilder::set_num_reactors(int num_reactors) { + num_reactors_ = num_reactors; + return *this; +} + +MessengerBuilder& MessengerBuilder::set_negotiation_threads(int num_negotiation_threads) { + num_negotiation_threads_ = num_negotiation_threads; + return *this; +} + +MessengerBuilder& 
MessengerBuilder::set_coarse_timer_granularity(const MonoDelta &granularity) { + coarse_timer_granularity_ = granularity; + return *this; +} + +MessengerBuilder &MessengerBuilder::set_metric_entity( + const scoped_refptr& metric_entity) { + metric_entity_ = metric_entity; + return *this; +} + +Status MessengerBuilder::Build(Messenger **msgr) { + RETURN_NOT_OK(SaslInit(kSaslAppName)); // Initialize SASL library before we start making requests + gscoped_ptr new_msgr(new Messenger(*this)); + RETURN_NOT_OK(new_msgr.get()->Init()); + *msgr = new_msgr.release(); + return Status::OK(); +} + +Status MessengerBuilder::Build(shared_ptr *msgr) { + Messenger *ptr; + RETURN_NOT_OK(Build(&ptr)); + + // See docs on Messenger::retain_self_ for info about this odd hack. + *msgr = shared_ptr( + ptr, std::mem_fun(&Messenger::AllExternalReferencesDropped)); + return Status::OK(); +} + +// See comment on Messenger::retain_self_ member. +void Messenger::AllExternalReferencesDropped() { + Shutdown(); + CHECK(retain_self_.get()); + // If we have no more external references, then we no longer + // need to retain ourself. We'll destruct as soon as all our + // internal-facing references are dropped (ie those from reactor + // threads). + retain_self_.reset(); +} + +void Messenger::Shutdown() { + // Since we're shutting down, it's OK to block. + ThreadRestrictions::ScopedAllowWait allow_wait; + + lock_guard guard(&lock_); + if (closing_) { + return; + } + VLOG(1) << "shutting down messenger " << name_; + closing_ = true; + + DCHECK(rpc_services_.empty()) << "Unregister RPC services before shutting down Messenger"; + rpc_services_.clear(); + + for (const shared_ptr& acceptor_pool : acceptor_pools_) { + acceptor_pool->Shutdown(); + } + acceptor_pools_.clear(); + + // Need to shut down negotiation pool before the reactors, since the + // reactors close the Connection sockets, and may race against the negotiation + // threads' blocking reads & writes. 
+  negotiation_pool_->Shutdown();
+
+  for (Reactor* reactor : reactors_) {
+    reactor->Shutdown();
+  }
+}
+
+Status Messenger::AddAcceptorPool(const Sockaddr &accept_addr,
+                                  shared_ptr<AcceptorPool>* pool) {
+  Socket sock;
+  RETURN_NOT_OK(sock.Init(0));
+  RETURN_NOT_OK(sock.SetReuseAddr(true));
+  RETURN_NOT_OK(sock.Bind(accept_addr));
+  Sockaddr remote;
+  RETURN_NOT_OK(sock.GetSocketAddress(&remote));
+  shared_ptr<AcceptorPool> acceptor_pool(new AcceptorPool(this, &sock, remote));
+
+  lock_guard<percpu_rwlock> guard(&lock_);
+  acceptor_pools_.push_back(acceptor_pool);
+  *pool = acceptor_pool;
+  return Status::OK();
+}
+
+// Register a new RpcService to handle inbound requests.
+Status Messenger::RegisterService(const string& service_name,
+                                  const scoped_refptr<RpcService>& service) {
+  DCHECK(service);
+  lock_guard<percpu_rwlock> guard(&lock_);
+  if (InsertIfNotPresent(&rpc_services_, service_name, service)) {
+    return Status::OK();
+  } else {
+    return Status::AlreadyPresent("This service is already present");
+  }
+}
+
+Status Messenger::UnregisterAllServices() {
+  lock_guard<percpu_rwlock> guard(&lock_);
+  rpc_services_.clear();
+  return Status::OK();
+}
+
+// Unregister an RpcService.
+Status Messenger::UnregisterService(const string& service_name) {
+  lock_guard<percpu_rwlock> guard(&lock_);
+  if (rpc_services_.erase(service_name)) {
+    return Status::OK();
+  } else {
+    return Status::ServiceUnavailable(Substitute("service $0 not registered on $1",
+                                                 service_name, name_));
+  }
+}
+
+void Messenger::QueueOutboundCall(const shared_ptr<OutboundCall> &call) {
+  Reactor *reactor = RemoteToReactor(call->conn_id().remote());
+  reactor->QueueOutboundCall(call);
+}
+
+void Messenger::QueueInboundCall(gscoped_ptr<InboundCall> call) {
+  shared_lock<rw_spinlock> guard(&lock_.get_lock());
+  scoped_refptr<RpcService>* service = FindOrNull(rpc_services_,
+                                                  call->remote_method().service_name());
+  if (PREDICT_FALSE(!service)) {
+    Status s = Status::ServiceUnavailable(Substitute("service $0 not registered on $1",
+                                                     call->remote_method().service_name(), name_));
+    LOG(INFO) << s.ToString();
+    call.release()->RespondFailure(ErrorStatusPB::ERROR_NO_SUCH_SERVICE, s);
+    return;
+  }
+
+  // The RpcService will respond to the client on success or failure.
+  WARN_NOT_OK((*service)->QueueInboundCall(call.Pass()), "Unable to handle RPC call");
+}
+
+void Messenger::RegisterInboundSocket(Socket *new_socket, const Sockaddr &remote) {
+  Reactor *reactor = RemoteToReactor(remote);
+  reactor->RegisterInboundSocket(new_socket, remote);
+}
+
+Messenger::Messenger(const MessengerBuilder &bld)
+  : name_(bld.name_),
+    closing_(false),
+    metric_entity_(bld.metric_entity_),
+    retain_self_(this) {
+  for (int i = 0; i < bld.num_reactors_; i++) {
+    reactors_.push_back(new Reactor(retain_self_, i, bld));
+  }
+  CHECK_OK(ThreadPoolBuilder("negotiator")
+           .set_max_threads(bld.num_negotiation_threads_)
+           .Build(&negotiation_pool_));
+}
+
+Messenger::~Messenger() {
+  lock_guard<percpu_rwlock> guard(&lock_);
+  CHECK(closing_) << "Should have already shut down";
+  STLDeleteElements(&reactors_);
+}
+
+Reactor* Messenger::RemoteToReactor(const Sockaddr &remote) {
+  uint32_t hashCode = remote.HashCode();
+  int reactor_idx = hashCode % reactors_.size();
+  // This is just a static
partitioning; we could get a lot + // fancier with assigning Sockaddrs to Reactors. + return reactors_[reactor_idx]; +} + + +Status Messenger::Init() { + Status status; + for (Reactor* r : reactors_) { + RETURN_NOT_OK(r->Init()); + } + + return Status::OK(); +} + +Status Messenger::DumpRunningRpcs(const DumpRunningRpcsRequestPB& req, + DumpRunningRpcsResponsePB* resp) { + shared_lock guard(&lock_.get_lock()); + for (Reactor* reactor : reactors_) { + RETURN_NOT_OK(reactor->DumpRunningRpcs(req, resp)); + } + return Status::OK(); +} + +void Messenger::ScheduleOnReactor(const boost::function& func, + MonoDelta when) { + DCHECK(!reactors_.empty()); + + // If we're already running on a reactor thread, reuse it. + Reactor* chosen = nullptr; + for (Reactor* r : reactors_) { + if (r->IsCurrentThread()) { + chosen = r; + } + } + if (chosen == nullptr) { + // Not running on a reactor thread, pick one at random. + chosen = reactors_[rand() % reactors_.size()]; + } + + DelayedTask* task = new DelayedTask(func, when); + chosen->ScheduleReactorTask(task); +} + +const scoped_refptr Messenger::rpc_service(const string& service_name) const { + lock_guard guard(&lock_); + scoped_refptr service; + if (FindCopy(rpc_services_, service_name, &service)) { + return service; + } else { + return scoped_refptr(nullptr); + } +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/messenger.h b/src/kudu/rpc/messenger.h new file mode 100644 index 000000000000..5286780e7582 --- /dev/null +++ b/src/kudu/rpc/messenger.h @@ -0,0 +1,275 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_MESSENGER_H +#define KUDU_RPC_MESSENGER_H + +#include +#include +#include + +#include +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/response_callback.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Socket; +class ThreadPool; + +namespace rpc { + +class AcceptorPool; +class DumpRunningRpcsRequestPB; +class DumpRunningRpcsResponsePB; +class InboundCall; +class Messenger; +class OutboundCall; +class Reactor; +class ReactorThread; +class RpcService; + +struct AcceptorPoolInfo { + public: + explicit AcceptorPoolInfo(Sockaddr bind_address) + : bind_address_(std::move(bind_address)) {} + + Sockaddr bind_address() const { + return bind_address_; + } + + private: + Sockaddr bind_address_; +}; + +// Used to construct a Messenger. +class MessengerBuilder { + public: + friend class Messenger; + friend class ReactorThread; + + explicit MessengerBuilder(std::string name); + + // Set the length of time we will keep a TCP connection will alive with no traffic. + MessengerBuilder &set_connection_keepalive_time(const MonoDelta &keepalive); + + // Set the number of reactor threads that will be used for sending and + // receiving. + MessengerBuilder &set_num_reactors(int num_reactors); + + // Set the number of connection-negotiation threads that will be used to handle the + // blocking connection-negotiation step. 
+  MessengerBuilder &set_negotiation_threads(int num_negotiation_threads);
+
+  // Set the granularity with which connections are checked for keepalive.
+  MessengerBuilder &set_coarse_timer_granularity(const MonoDelta &granularity);
+
+  // Set metric entity for use by RPC systems.
+  MessengerBuilder &set_metric_entity(const scoped_refptr<MetricEntity>& metric_entity);
+
+  Status Build(std::shared_ptr<Messenger> *msgr);
+
+ private:
+  Status Build(Messenger **msgr);
+  const std::string name_;
+  MonoDelta connection_keepalive_time_;
+  int num_reactors_;
+  int num_negotiation_threads_;
+  MonoDelta coarse_timer_granularity_;
+  scoped_refptr<MetricEntity> metric_entity_;
+};
+
+// A Messenger is a container for the reactor threads which run event loops
+// for the RPC services. If the process is a server, a Messenger can also have
+// one or more attached AcceptorPools which accept RPC connections. In this case,
+// calls received over the connection are enqueued into the messenger's service_queue
+// for processing by a ServicePool.
+//
+// Users do not typically interact with the Messenger directly except to create
+// one as a singleton, and then make calls using Proxy objects.
+//
+// See rpc-test.cc and rpc-bench.cc for example usages.
+class Messenger {
+ public:
+  friend class MessengerBuilder;
+  friend class Proxy;
+  friend class Reactor;
+  typedef std::vector<std::shared_ptr<AcceptorPool> > acceptor_vec_t;
+  typedef std::unordered_map<std::string, scoped_refptr<RpcService> > RpcServicesMap;
+
+  static const uint64_t UNKNOWN_CALL_ID = 0;
+
+  ~Messenger();
+
+  // Stop all communication and prevent further use.
+  // It's not required to call this -- dropping the shared_ptr provided
+  // from MessengerBuilder::Build will automatically call this method.
+  void Shutdown();
+
+  // Add a new acceptor pool listening to the given accept address.
+  // You can create any number of acceptor pools you want, including none.
+  //
+  // The created pool is returned in *pool.
The Messenger also retains + // a reference to the pool, so the caller may safely drop this reference + // and the pool will remain live. + // + // NOTE: the returned pool is not initially started. You must call + // pool->Start(...) to begin accepting connections. + Status AddAcceptorPool(const Sockaddr &accept_addr, + std::shared_ptr* pool); + + // Register a new RpcService to handle inbound requests. + Status RegisterService(const std::string& service_name, + const scoped_refptr& service); + + // Unregister currently-registered RpcService. + Status UnregisterService(const std::string& service_name); + + Status UnregisterAllServices(); + + // Queue a call for transmission. This will pick the appropriate reactor, + // and enqueue a task on that reactor to assign and send the call. + void QueueOutboundCall(const std::shared_ptr &call); + + // Enqueue a call for processing on the server. + void QueueInboundCall(gscoped_ptr call); + + // Take ownership of the socket via Socket::Release + void RegisterInboundSocket(Socket *new_socket, const Sockaddr &remote); + + // Dump the current RPCs into the given protobuf. + Status DumpRunningRpcs(const DumpRunningRpcsRequestPB& req, + DumpRunningRpcsResponsePB* resp); + + // Run 'func' on a reactor thread after 'when' time elapses. + // + // The status argument conveys whether 'func' was run correctly (i.e. + // after the elapsed time) or not. 
+ void ScheduleOnReactor(const boost::function& func, + MonoDelta when); + + ThreadPool* negotiation_pool() const { return negotiation_pool_.get(); } + + std::string name() const { + return name_; + } + + bool closing() const { + shared_lock guard(&lock_.get_lock()); + return closing_; + } + + scoped_refptr metric_entity() const { return metric_entity_.get(); } + + const scoped_refptr rpc_service(const std::string& service_name) const; + + private: + FRIEND_TEST(TestRpc, TestConnectionKeepalive); + + explicit Messenger(const MessengerBuilder &bld); + + Reactor* RemoteToReactor(const Sockaddr &remote); + Status Init(); + void RunTimeoutThread(); + void UpdateCurTime(); + + // Called by external-facing shared_ptr when the user no longer holds + // any references. See 'retain_self_' for more info. + void AllExternalReferencesDropped(); + + const std::string name_; + + // Protects closing_, acceptor_pools_, rpc_services_. + mutable percpu_rwlock lock_; + + bool closing_; + + // Pools which are listening on behalf of this messenger. + // Note that the user may have called Shutdown() on one of these + // pools, so even though we retain the reference, it may no longer + // be listening. + acceptor_vec_t acceptor_pools_; + + // RPC services that handle inbound requests. + RpcServicesMap rpc_services_; + + std::vector reactors_; + + gscoped_ptr negotiation_pool_; + + scoped_refptr metric_entity_; + + // The ownership of the Messenger object is somewhat subtle. The pointer graph + // looks like this: + // + // [User Code ] | [ Internal code ] + // | + // shared_ptr[1] | + // | | + // v + // Messenger <------------ shared_ptr[2] --- Reactor + // ^ | ----------- bare pointer --> Reactor + // \__/ + // shared_ptr[2] + // (retain_self_) + // + // shared_ptr[1] instances use Messenger::AllExternalReferencesDropped() + // as a deleter. + // shared_ptr[2] are "traditional" shared_ptrs which call 'delete' on the + // object. 
+ // + // The teardown sequence is as follows: + // Option 1): User calls "Shutdown()" explicitly: + // - Messenger::Shutdown tells Reactors to shut down + // - When each reactor thread finishes, it drops its shared_ptr[2] + // - the Messenger::retain_self instance remains, keeping the Messenger + // alive. + // - The user eventually drops its shared_ptr[1], which calls + // Messenger::AllExternalReferencesDropped. This drops retain_self_ + // and results in object destruction. + // Option 2): User drops all of its shared_ptr[1] references + // - Though the Reactors still reference the Messenger, AllExternalReferencesDropped + // will get called, which triggers Messenger::Shutdown. + // - AllExternalReferencesDropped drops retain_self_, so the only remaining + // references are from Reactor threads. But the reactor threads are shutting down. + // - When the last Reactor thread dies, there will be no more shared_ptr[1] references + // and the Messenger will be destroyed. + // + // The main goal of all of this confusion is that the reactor threads need to be able + // to shut down asynchronously, and we need to keep the Messenger alive until they + // do so. So, handing out a normal shared_ptr to users would force the Messenger + // destructor to Join() the reactor threads, which causes a problem if the user + // tries to destruct the Messenger from within a Reactor thread itself. + std::shared_ptr retain_self_; + + DISALLOW_COPY_AND_ASSIGN(Messenger); +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/mt-rpc-test.cc b/src/kudu/rpc/mt-rpc-test.cc new file mode 100644 index 000000000000..6b05784aa90b --- /dev/null +++ b/src/kudu/rpc/mt-rpc-test.cc @@ -0,0 +1,291 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/rpc-test-base.h" + +#include + +#include +#include + +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" + +METRIC_DECLARE_counter(rpc_connections_accepted); +METRIC_DECLARE_counter(rpcs_queue_overflow); + +using std::string; +using std::shared_ptr; +using strings::Substitute; + +namespace kudu { +namespace rpc { + +class MultiThreadedRpcTest : public RpcTestBase { + public: + // Make a single RPC call. + void SingleCall(Sockaddr server_addr, const char* method_name, + Status* result, CountDownLatch* latch) { + LOG(INFO) << "Connecting to " << server_addr.ToString(); + shared_ptr client_messenger(CreateMessenger("ClientSC")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + *result = DoTestSyncCall(p, method_name); + latch->CountDown(); + } + + // Make RPC calls until we see a failure. 
+ void HammerServer(Sockaddr server_addr, const char* method_name, + Status* last_result) { + shared_ptr client_messenger(CreateMessenger("ClientHS")); + HammerServerWithMessenger(server_addr, method_name, last_result, client_messenger); + } + + void HammerServerWithMessenger( + Sockaddr server_addr, const char* method_name, Status* last_result, + const shared_ptr& messenger) { + LOG(INFO) << "Connecting to " << server_addr.ToString(); + Proxy p(messenger, server_addr, GenericCalculatorService::static_service_name()); + + int i = 0; + while (true) { + i++; + Status s = DoTestSyncCall(p, method_name); + if (!s.ok()) { + // Return on first failure. + LOG(INFO) << "Call failed. Shutting down client thread. Ran " << i << " calls: " + << s.ToString(); + *last_result = s; + return; + } + } + } +}; + +static void AssertShutdown(kudu::Thread* thread, const Status* status) { + ASSERT_OK(ThreadJoiner(thread).warn_every_ms(500).Join()); + string msg = status->ToString(); + ASSERT_TRUE(msg.find("Service unavailable") != string::npos || + msg.find("Network error") != string::npos) + << "Status is actually: " << msg; +} + +// Test making several concurrent RPC calls while shutting down. +// Simply verify that we don't hit any CHECK errors. +TEST_F(MultiThreadedRpcTest, TestShutdownDuringService) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + const int kNumThreads = 4; + scoped_refptr threads[kNumThreads]; + Status statuses[kNumThreads]; + for (int i = 0; i < kNumThreads; i++) { + ASSERT_OK(kudu::Thread::Create("test", strings::Substitute("t$0", i), + &MultiThreadedRpcTest::HammerServer, this, server_addr, + GenericCalculatorService::kAddMethodName, &statuses[i], &threads[i])); + } + + SleepFor(MonoDelta::FromMilliseconds(50)); + + // Shut down server. 
+ ASSERT_OK(server_messenger_->UnregisterService(service_name_)); + service_pool_->Shutdown(); + server_messenger_->Shutdown(); + + for (int i = 0; i < kNumThreads; i++) { + AssertShutdown(threads[i].get(), &statuses[i]); + } +} + +// Test shutting down the client messenger exactly as a thread is about to start +// a new connection. This is a regression test for KUDU-104. +TEST_F(MultiThreadedRpcTest, TestShutdownClientWhileCallsPending) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + shared_ptr client_messenger(CreateMessenger("Client")); + + scoped_refptr thread; + Status status; + ASSERT_OK(kudu::Thread::Create("test", "test", + &MultiThreadedRpcTest::HammerServerWithMessenger, this, server_addr, + GenericCalculatorService::kAddMethodName, &status, client_messenger, &thread)); + + // Shut down the messenger after a very brief sleep. This often will race so that the + // call gets submitted to the messenger before shutdown, but the negotiation won't have + // started yet. In a debug build this fails about half the time without the bug fix. + // See KUDU-104. + SleepFor(MonoDelta::FromMicroseconds(10)); + client_messenger->Shutdown(); + client_messenger.reset(); + + ASSERT_OK(ThreadJoiner(thread.get()).warn_every_ms(500).Join()); + ASSERT_TRUE(status.IsAborted() || + status.IsServiceUnavailable()); + string msg = status.ToString(); + SCOPED_TRACE(msg); + ASSERT_TRUE(msg.find("Client RPC Messenger shutting down") != string::npos || + msg.find("reactor is shutting down") != string::npos || + msg.find("Unable to start connection negotiation thread") != string::npos) + << "Status is actually: " << msg; +} + +// This bogus service pool leaves the service queue full. 
+class BogusServicePool : public ServicePool { + public: + BogusServicePool(gscoped_ptr service, + const scoped_refptr& metric_entity, + size_t service_queue_length) + : ServicePool(service.Pass(), metric_entity, service_queue_length) { + } + virtual Status Init(int num_threads) OVERRIDE { + // Do nothing + return Status::OK(); + } +}; + +void IncrementBackpressureOrShutdown(const Status* status, int* backpressure, int* shutdown) { + string msg = status->ToString(); + if (msg.find("service queue is full") != string::npos) { + ++(*backpressure); + } else if (msg.find("shutting down") != string::npos) { + ++(*shutdown); + } else if (msg.find("got EOF from remote") != string::npos) { + ++(*shutdown); + } else { + FAIL() << "Unexpected status message: " << msg; + } +} + +// Test that we get a Service Unavailable error when we max out the incoming RPC service queue. +TEST_F(MultiThreadedRpcTest, TestBlowOutServiceQueue) { + const size_t kMaxConcurrency = 2; + + MessengerBuilder bld("messenger1"); + bld.set_num_reactors(kMaxConcurrency); + bld.set_metric_entity(metric_entity_); + CHECK_OK(bld.Build(&server_messenger_)); + + shared_ptr pool; + ASSERT_OK(server_messenger_->AddAcceptorPool(Sockaddr(), &pool)); + ASSERT_OK(pool->Start(kMaxConcurrency)); + Sockaddr server_addr = pool->bind_address(); + + gscoped_ptr service(new GenericCalculatorService()); + service_name_ = service->service_name(); + service_pool_ = new BogusServicePool(service.Pass(), + server_messenger_->metric_entity(), + kMaxConcurrency); + ASSERT_OK(service_pool_->Init(n_worker_threads_)); + server_messenger_->RegisterService(service_name_, service_pool_); + + scoped_refptr threads[3]; + Status status[3]; + CountDownLatch latch(1); + for (int i = 0; i < 3; i++) { + ASSERT_OK(kudu::Thread::Create("test", strings::Substitute("t$0", i), + &MultiThreadedRpcTest::SingleCall, this, server_addr, + GenericCalculatorService::kAddMethodName, &status[i], &latch, &threads[i])); + } + + // One should immediately fail 
due to backpressure. The latch is only initialized + // to wait for the first of three threads to finish. + latch.Wait(); + + // The rest would time out after 10 sec, but we help them along. + ASSERT_OK(server_messenger_->UnregisterService(service_name_)); + service_pool_->Shutdown(); + server_messenger_->Shutdown(); + + for (const auto& thread : threads) { + ASSERT_OK(ThreadJoiner(thread.get()).warn_every_ms(500).Join()); + } + + // Verify that one error was due to backpressure. + int errors_backpressure = 0; + int errors_shutdown = 0; + + for (const auto& s : status) { + IncrementBackpressureOrShutdown(&s, &errors_backpressure, &errors_shutdown); + } + + ASSERT_EQ(1, errors_backpressure); + ASSERT_EQ(2, errors_shutdown); + + // Check that RPC queue overflow metric is 1 + Counter *rpcs_queue_overflow = + METRIC_rpcs_queue_overflow.Instantiate(server_messenger_->metric_entity()).get(); + ASSERT_EQ(1, rpcs_queue_overflow->value()); +} + +static void HammerServerWithTCPConns(const Sockaddr& addr) { + while (true) { + Socket socket; + CHECK_OK(socket.Init(0)); + Status s; + LOG_SLOW_EXECUTION(INFO, 100, "Connect took long") { + s = socket.Connect(addr); + } + if (!s.ok()) { + CHECK(s.IsNetworkError()) << "Unexpected error: " << s.ToString(); + return; + } + CHECK_OK(socket.Close()); + } +} + +// Regression test for KUDU-128. +// Test that shuts down the server while new TCP connections are incoming. +TEST_F(MultiThreadedRpcTest, TestShutdownWithIncomingConnections) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Start a number of threads which just hammer the server with TCP connections. 
+ vector > threads; + for (int i = 0; i < 8; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("t$0", i), + &HammerServerWithTCPConns, server_addr, &new_thread)); + threads.push_back(new_thread); + } + + // Sleep until the server has started to actually accept some connections from the + // test threads. + scoped_refptr conns_accepted = + METRIC_rpc_connections_accepted.Instantiate(server_messenger_->metric_entity()); + while (conns_accepted->value() == 0) { + SleepFor(MonoDelta::FromMicroseconds(100)); + } + + // Shutdown while there are still new connections appearing. + ASSERT_OK(server_messenger_->UnregisterService(service_name_)); + service_pool_->Shutdown(); + server_messenger_->Shutdown(); + + for (scoped_refptr& t : threads) { + ASSERT_OK(ThreadJoiner(t.get()).warn_every_ms(500).Join()); + } +} + +} // namespace rpc +} // namespace kudu + diff --git a/src/kudu/rpc/negotiation.cc b/src/kudu/rpc/negotiation.cc new file mode 100644 index 000000000000..e2cbb4a90d30 --- /dev/null +++ b/src/kudu/rpc/negotiation.cc @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/negotiation.h" + +#include +#include + +#include + +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/blocking_ops.h" +#include "kudu/rpc/connection.h" +#include "kudu/rpc/reactor.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/sasl_client.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/sasl_server.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/status.h" +#include "kudu/util/trace.h" + +DEFINE_bool(rpc_trace_negotiation, false, + "If enabled, dump traces of all RPC negotiations to the log"); +TAG_FLAG(rpc_trace_negotiation, runtime); +TAG_FLAG(rpc_trace_negotiation, advanced); +TAG_FLAG(rpc_trace_negotiation, experimental); + +namespace kudu { +namespace rpc { + +using std::shared_ptr; +using strings::Substitute; + +// Client: Send ConnectionContextPB message based on information stored in the Connection object. +static Status SendConnectionContext(Connection* conn, const MonoTime& deadline) { + TRACE("Sending connection context"); + RequestHeader header; + header.set_call_id(kConnectionContextCallId); + + ConnectionContextPB conn_context; + conn_context.mutable_user_info()->set_effective_user(conn->user_credentials().effective_user()); + conn_context.mutable_user_info()->set_real_user(conn->user_credentials().real_user()); + + return SendFramedMessageBlocking(conn->socket(), header, conn_context, deadline); +} + +// Server: Receive ConnectionContextPB message and update the corresponding fields in the +// associated Connection object. Perform validation against SASL-negotiated information +// as needed. +static Status RecvConnectionContext(Connection* conn, const MonoTime& deadline) { + TRACE("Waiting for connection context"); + faststring recv_buf(1024); // Should be plenty for a ConnectionContextPB message. 
+ RequestHeader header; + Slice param_buf; + RETURN_NOT_OK(ReceiveFramedMessageBlocking(conn->socket(), &recv_buf, + &header, ¶m_buf, deadline)); + DCHECK(header.IsInitialized()); + + if (header.call_id() != kConnectionContextCallId) { + return Status::IllegalState("Expected ConnectionContext callid, received", + Substitute("$0", header.call_id())); + } + + ConnectionContextPB conn_context; + if (!conn_context.ParseFromArray(param_buf.data(), param_buf.size())) { + return Status::InvalidArgument("Invalid ConnectionContextPB message, missing fields", + conn_context.InitializationErrorString()); + } + + // Update the fields of our Connection object from the ConnectionContextPB. + if (conn_context.has_user_info()) { + // Validate real user against SASL impl. + if (conn->sasl_server().negotiated_mechanism() == SaslMechanism::PLAIN) { + if (conn->sasl_server().plain_auth_user() != conn_context.user_info().real_user()) { + return Status::NotAuthorized( + "ConnectionContextPB specified different real user than sent in SASL negotiation", + StringPrintf("\"%s\" vs. \"%s\"", + conn_context.user_info().real_user().c_str(), + conn->sasl_server().plain_auth_user().c_str())); + } + } + conn->mutable_user_credentials()->set_real_user(conn_context.user_info().real_user()); + + // TODO: Validate effective user when we implement impersonation. + if (conn_context.user_info().has_effective_user()) { + conn->mutable_user_credentials()->set_effective_user( + conn_context.user_info().effective_user()); + } + } + return Status::OK(); +} + +// Wait for the client connection to be established and become ready for writing. 
+static Status WaitForClientConnect(Connection* conn, const MonoTime& deadline) { + TRACE("Waiting for socket to connect"); + int fd = conn->socket()->GetFd(); + struct pollfd poll_fd; + poll_fd.fd = fd; + poll_fd.events = POLLOUT; + poll_fd.revents = 0; + + MonoTime now; + MonoDelta remaining; + while (true) { + now = MonoTime::Now(MonoTime::FINE); + remaining = deadline.GetDeltaSince(now); + DVLOG(4) << "Client waiting to connect for negotiation, time remaining until timeout deadline: " + << remaining.ToString(); + if (PREDICT_FALSE(remaining.ToNanoseconds() <= 0)) { + return Status::TimedOut("Timeout exceeded waiting to connect"); + } +#if defined(__linux__) + struct timespec ts; + remaining.ToTimeSpec(&ts); + int ready = ppoll(&poll_fd, 1, &ts, NULL); +#else + int ready = poll(&poll_fd, 1, remaining.ToMilliseconds()); +#endif + if (ready == -1) { + int err = errno; + if (err == EINTR) { + // We were interrupted by a signal, let's go again. + continue; + } else { + return Status::NetworkError("Error from ppoll() while waiting to connect", + ErrnoToString(err), err); + } + } else if (ready == 0) { + // Timeout exceeded. Loop back to the top to our impending doom. + continue; + } else { + // Success. + break; + } + } + + // Connect finished, but this doesn't mean that we connected successfully. + // Check the socket for an error. + int so_error = 0; + socklen_t socklen = sizeof(so_error); + int rc = getsockopt(fd, SOL_SOCKET, SO_ERROR, &so_error, &socklen); + if (rc != 0) { + return Status::NetworkError("Unable to check connected socket for errors", + ErrnoToString(errno), + errno); + } + if (so_error != 0) { + return Status::NetworkError("connect", ErrnoToString(so_error), so_error); + } + + return Status::OK(); +} + +// Disable / reset socket timeouts. 
+static Status DisableSocketTimeouts(Connection* conn) { + RETURN_NOT_OK(conn->socket()->SetSendTimeout(MonoDelta::FromNanoseconds(0L))); + RETURN_NOT_OK(conn->socket()->SetRecvTimeout(MonoDelta::FromNanoseconds(0L))); + return Status::OK(); +} + +// Perform client negotiation. We don't LOG() anything, we leave that to our caller. +static Status DoClientNegotiation(Connection* conn, + const MonoTime& deadline) { + RETURN_NOT_OK(WaitForClientConnect(conn, deadline)); + RETURN_NOT_OK(conn->SetNonBlocking(false)); + RETURN_NOT_OK(conn->InitSaslClient()); + conn->sasl_client().set_deadline(deadline); + RETURN_NOT_OK(conn->sasl_client().Negotiate()); + RETURN_NOT_OK(SendConnectionContext(conn, deadline)); + RETURN_NOT_OK(DisableSocketTimeouts(conn)); + + return Status::OK(); +} + +// Perform server negotiation. We don't LOG() anything, we leave that to our caller. +static Status DoServerNegotiation(Connection* conn, + const MonoTime& deadline) { + RETURN_NOT_OK(conn->SetNonBlocking(false)); + RETURN_NOT_OK(conn->InitSaslServer()); + conn->sasl_server().set_deadline(deadline); + RETURN_NOT_OK(conn->sasl_server().Negotiate()); + RETURN_NOT_OK(RecvConnectionContext(conn, deadline)); + RETURN_NOT_OK(DisableSocketTimeouts(conn)); + + return Status::OK(); +} + +// Perform negotiation for a connection (either server or client) +void Negotiation::RunNegotiation(const scoped_refptr& conn, + const MonoTime& deadline) { + Status s; + if (conn->direction() == Connection::SERVER) { + s = DoServerNegotiation(conn.get(), deadline); + } else { + s = DoClientNegotiation(conn.get(), deadline); + } + + if (PREDICT_FALSE(!s.ok())) { + string msg = Substitute("$0 connection negotiation failed: $1", + conn->direction() == Connection::SERVER ? 
"Server" : "Client", + conn->ToString()); + s = s.CloneAndPrepend(msg); + } + TRACE("Negotiation complete: $0", s.ToString()); + + bool is_bad = !s.ok() && !(s.IsNetworkError() && s.posix_code() == ECONNREFUSED); + + if (is_bad || FLAGS_rpc_trace_negotiation) { + string msg = Trace::CurrentTrace()->DumpToString(true); + if (is_bad) { + LOG(WARNING) << "Failed RPC negotiation. Trace:\n" << msg; + } else { + LOG(INFO) << "RPC negotiation tracing enabled. Trace:\n" << msg; + } + } + conn->CompleteNegotiation(s); +} + + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/negotiation.h b/src/kudu/rpc/negotiation.h new file mode 100644 index 000000000000..0562555859d9 --- /dev/null +++ b/src/kudu/rpc/negotiation.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_RPC_NEGOTIATION_H +#define KUDU_RPC_NEGOTIATION_H + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/monotime.h" + +namespace kudu { +namespace rpc { + +class Connection; + +class Negotiation { + public: + static void RunNegotiation(const scoped_refptr& conn, + const MonoTime &deadline); + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Negotiation); +}; + +} // namespace rpc +} // namespace kudu +#endif // KUDU_RPC_NEGOTIATION_H diff --git a/src/kudu/rpc/outbound_call.cc b/src/kudu/rpc/outbound_call.cc new file mode 100644 index 000000000000..e608dadc620a --- /dev/null +++ b/src/kudu/rpc/outbound_call.cc @@ -0,0 +1,488 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/rpc/outbound_call.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/rpc/rpc_introspection.pb.h" +#include "kudu/rpc/serialization.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/kernel_stack_watchdog.h" + +namespace kudu { +namespace rpc { + +using strings::Substitute; +using google::protobuf::Message; +using google::protobuf::io::CodedOutputStream; + +static const double kMicrosPerSecond = 1000000.0; + +// 100M cycles should be about 50ms on a 2Ghz box. This should be high +// enough that involuntary context switches don't trigger it, but low enough +// that any serious blocking behavior on the reactor would. +DEFINE_int64(rpc_callback_max_cycles, 100 * 1000 * 1000, + "The maximum number of cycles for which an RPC callback " + "should be allowed to run without emitting a warning." + " (Advanced debugging option)"); +TAG_FLAG(rpc_callback_max_cycles, advanced); +TAG_FLAG(rpc_callback_max_cycles, runtime); + +/// +/// OutboundCall +/// + +OutboundCall::OutboundCall(const ConnectionId& conn_id, + const RemoteMethod& remote_method, + google::protobuf::Message* response_storage, + RpcController* controller, ResponseCallback callback) + : state_(READY), + remote_method_(remote_method), + conn_id_(conn_id), + callback_(std::move(callback)), + controller_(DCHECK_NOTNULL(controller)), + response_(DCHECK_NOTNULL(response_storage)) { + DVLOG(4) << "OutboundCall " << this << " constructed with state_: " << StateName(state_) + << " and RPC timeout: " + << (controller->timeout().Initialized() ? 
controller->timeout().ToString() : "none"); + header_.set_call_id(kInvalidCallId); + remote_method.ToPB(header_.mutable_remote_method()); + start_time_ = MonoTime::Now(MonoTime::FINE); +} + +OutboundCall::~OutboundCall() { + DCHECK(IsFinished()); + DVLOG(4) << "OutboundCall " << this << " destroyed with state_: " << StateName(state_); +} + +Status OutboundCall::SerializeTo(vector* slices) { + size_t param_len = request_buf_.size(); + if (PREDICT_FALSE(param_len == 0)) { + return Status::InvalidArgument("Must call SetRequestParam() before SerializeTo()"); + } + + const MonoDelta &timeout = controller_->timeout(); + if (timeout.Initialized()) { + header_.set_timeout_millis(timeout.ToMilliseconds()); + } + + CHECK_OK(serialization::SerializeHeader(header_, param_len, &header_buf_)); + + // Return the concatenated packet. + slices->push_back(Slice(header_buf_)); + slices->push_back(Slice(request_buf_)); + return Status::OK(); +} + +Status OutboundCall::SetRequestParam(const Message& message) { + return serialization::SerializeMessage(message, &request_buf_); +} + +Status OutboundCall::status() const { + lock_guard l(&lock_); + return status_; +} + +const ErrorStatusPB* OutboundCall::error_pb() const { + lock_guard l(&lock_); + return error_pb_.get(); +} + + +string OutboundCall::StateName(State state) { + switch (state) { + case READY: + return "READY"; + case ON_OUTBOUND_QUEUE: + return "ON_OUTBOUND_QUEUE"; + case SENT: + return "SENT"; + case TIMED_OUT: + return "TIMED_OUT"; + case FINISHED_ERROR: + return "FINISHED_ERROR"; + case FINISHED_SUCCESS: + return "FINISHED_SUCCESS"; + default: + LOG(DFATAL) << "Unknown state in OutboundCall: " << state; + return StringPrintf("UNKNOWN(%d)", state); + } +} + +void OutboundCall::set_state(State new_state) { + lock_guard l(&lock_); + set_state_unlocked(new_state); +} + +OutboundCall::State OutboundCall::state() const { + lock_guard l(&lock_); + return state_; +} + +void OutboundCall::set_state_unlocked(State new_state) { + // 
Sanity check state transitions. + DVLOG(3) << "OutboundCall " << this << " (" << ToString() << ") switching from " << + StateName(state_) << " to " << StateName(new_state); + switch (new_state) { + case ON_OUTBOUND_QUEUE: + DCHECK_EQ(state_, READY); + break; + case SENT: + DCHECK_EQ(state_, ON_OUTBOUND_QUEUE); + break; + case TIMED_OUT: + DCHECK(state_ == SENT || state_ == ON_OUTBOUND_QUEUE); + break; + case FINISHED_SUCCESS: + DCHECK_EQ(state_, SENT); + break; + default: + // No sanity checks for others. + break; + } + + state_ = new_state; +} + +void OutboundCall::CallCallback() { + int64_t start_cycles = CycleClock::Now(); + { + SCOPED_WATCH_STACK(100); + callback_(); + // Clear the callback, since it may be holding onto reference counts + // via bound parameters. We do this inside the timer because it's possible + // the user has naughty destructors that block, and we want to account for that + // time here if they happen to run on this thread. + callback_ = NULL; + } + int64_t end_cycles = CycleClock::Now(); + int64_t wait_cycles = end_cycles - start_cycles; + if (PREDICT_FALSE(wait_cycles > FLAGS_rpc_callback_max_cycles)) { + double micros = static_cast(wait_cycles) / base::CyclesPerSecond() + * kMicrosPerSecond; + + LOG(WARNING) << "RPC callback for " << ToString() << " blocked reactor thread for " + << micros << "us"; + } +} + +void OutboundCall::SetResponse(gscoped_ptr resp) { + call_response_ = resp.Pass(); + Slice r(call_response_->serialized_response()); + + if (call_response_->is_success()) { + // TODO: here we're deserializing the call response within the reactor thread, + // which isn't great, since it would block processing of other RPCs in parallel. + // Should look into a way to avoid this. 
+ if (!response_->ParseFromArray(r.data(), r.size())) { + SetFailed(Status::IOError("Invalid response, missing fields", + response_->InitializationErrorString())); + return; + } + set_state(FINISHED_SUCCESS); + CallCallback(); + } else { + // Error + gscoped_ptr err(new ErrorStatusPB()); + if (!err->ParseFromArray(r.data(), r.size())) { + SetFailed(Status::IOError("Was an RPC error but could not parse error response", + err->InitializationErrorString())); + return; + } + ErrorStatusPB* err_raw = err.release(); + SetFailed(Status::RemoteError(err_raw->message()), err_raw); + } +} + +void OutboundCall::SetQueued() { + set_state(ON_OUTBOUND_QUEUE); +} + +void OutboundCall::SetSent() { + set_state(SENT); + + // This method is called in the reactor thread, so free the header buf, + // which was also allocated from this thread. tcmalloc's thread caching + // behavior is a lot more efficient if memory is freed from the same thread + // which allocated it -- this lets it keep to thread-local operations instead + // of taking a mutex to put memory back on the global freelist. + delete [] header_buf_.release(); + + // request_buf_ is also done being used here, but since it was allocated by + // the caller thread, we would rather let that thread free it whenever it + // deletes the RpcController. 
+} + +void OutboundCall::SetFailed(const Status &status, + ErrorStatusPB* err_pb) { + { + lock_guard l(&lock_); + status_ = status; + if (status_.IsRemoteError()) { + CHECK(err_pb); + error_pb_.reset(err_pb); + } else { + CHECK(!err_pb); + } + set_state_unlocked(FINISHED_ERROR); + } + CallCallback(); +} + +void OutboundCall::SetTimedOut() { + { + lock_guard l(&lock_); + status_ = Status::TimedOut(Substitute( + "$0 RPC to $1 timed out after $2", + remote_method_.method_name(), + conn_id_.remote().ToString(), + controller_->timeout().ToString())); + set_state_unlocked(TIMED_OUT); + } + CallCallback(); +} + +bool OutboundCall::IsTimedOut() const { + lock_guard l(&lock_); + return state_ == TIMED_OUT; +} + +bool OutboundCall::IsFinished() const { + lock_guard l(&lock_); + switch (state_) { + case READY: + case ON_OUTBOUND_QUEUE: + case SENT: + return false; + case TIMED_OUT: + case FINISHED_ERROR: + case FINISHED_SUCCESS: + return true; + default: + LOG(FATAL) << "Unknown call state: " << state_; + return false; + } +} + +string OutboundCall::ToString() const { + return Substitute("RPC call $0 -> $1", remote_method_.ToString(), conn_id_.ToString()); +} + +void OutboundCall::DumpPB(const DumpRunningRpcsRequestPB& req, + RpcCallInProgressPB* resp) { + lock_guard l(&lock_); + resp->mutable_header()->CopyFrom(header_); + resp->set_micros_elapsed( + MonoTime::Now(MonoTime::FINE) .GetDeltaSince(start_time_).ToMicroseconds()); +} + +/// +/// UserCredentials +/// + +UserCredentials::UserCredentials() {} + +bool UserCredentials::has_effective_user() const { + return !eff_user_.empty(); +} + +void UserCredentials::set_effective_user(const string& eff_user) { + eff_user_ = eff_user; +} + +bool UserCredentials::has_real_user() const { + return !real_user_.empty(); +} + +void UserCredentials::set_real_user(const string& real_user) { + real_user_ = real_user; +} + +bool UserCredentials::has_password() const { + return !password_.empty(); +} + +void 
UserCredentials::set_password(const string& password) { + password_ = password; +} + +void UserCredentials::CopyFrom(const UserCredentials& other) { + eff_user_ = other.eff_user_; + real_user_ = other.real_user_; + password_ = other.password_; +} + +string UserCredentials::ToString() const { + // Does not print the password. + return StringPrintf("{real_user=%s, eff_user=%s}", real_user_.c_str(), eff_user_.c_str()); +} + +size_t UserCredentials::HashCode() const { + size_t seed = 0; + if (has_effective_user()) { + boost::hash_combine(seed, effective_user()); + } + if (has_real_user()) { + boost::hash_combine(seed, real_user()); + } + if (has_password()) { + boost::hash_combine(seed, password()); + } + return seed; +} + +bool UserCredentials::Equals(const UserCredentials& other) const { + return (effective_user() == other.effective_user() + && real_user() == other.real_user() + && password() == other.password()); +} + +/// +/// ConnectionId +/// + +ConnectionId::ConnectionId() {} + +ConnectionId::ConnectionId(const ConnectionId& other) { + DoCopyFrom(other); +} + +ConnectionId::ConnectionId(const Sockaddr& remote, const UserCredentials& user_credentials) { + remote_ = remote; + user_credentials_.CopyFrom(user_credentials); +} + +void ConnectionId::set_remote(const Sockaddr& remote) { + remote_ = remote; +} + +void ConnectionId::set_user_credentials(const UserCredentials& user_credentials) { + user_credentials_.CopyFrom(user_credentials); +} + +void ConnectionId::CopyFrom(const ConnectionId& other) { + DoCopyFrom(other); +} + +string ConnectionId::ToString() const { + // Does not print the password. 
+ return StringPrintf("{remote=%s, user_credentials=%s}", + remote_.ToString().c_str(), + user_credentials_.ToString().c_str()); +} + +void ConnectionId::DoCopyFrom(const ConnectionId& other) { + remote_ = other.remote_; + user_credentials_.CopyFrom(other.user_credentials_); +} + +size_t ConnectionId::HashCode() const { + size_t seed = 0; + boost::hash_combine(seed, remote_.HashCode()); + boost::hash_combine(seed, user_credentials_.HashCode()); + return seed; +} + +bool ConnectionId::Equals(const ConnectionId& other) const { + return (remote() == other.remote() + && user_credentials().Equals(other.user_credentials())); +} + +size_t ConnectionIdHash::operator() (const ConnectionId& conn_id) const { + return conn_id.HashCode(); +} + +bool ConnectionIdEqual::operator() (const ConnectionId& cid1, const ConnectionId& cid2) const { + return cid1.Equals(cid2); +} + +/// +/// CallResponse +/// + +CallResponse::CallResponse() + : parsed_(false) { +} + +Status CallResponse::GetSidecar(int idx, Slice* sidecar) const { + DCHECK(parsed_); + if (idx < 0 || idx >= header_.sidecar_offsets_size()) { + return Status::InvalidArgument(strings::Substitute( + "Index $0 does not reference a valid sidecar", idx)); + } + *sidecar = sidecar_slices_[idx]; + return Status::OK(); +} + +Status CallResponse::ParseFrom(gscoped_ptr transfer) { + CHECK(!parsed_); + Slice entire_message; + RETURN_NOT_OK(serialization::ParseMessage(transfer->data(), &header_, + &entire_message)); + + // Use information from header to extract the payload slices. 
+ int last = header_.sidecar_offsets_size() - 1; + + if (last >= OutboundTransfer::kMaxPayloadSlices) { + return Status::Corruption(strings::Substitute( + "Received $0 additional payload slices, expected at most %d", + last, OutboundTransfer::kMaxPayloadSlices)); + } + + if (last >= 0) { + serialized_response_ = Slice(entire_message.data(), + header_.sidecar_offsets(0)); + for (int i = 0; i < last; ++i) { + uint32_t next_offset = header_.sidecar_offsets(i); + int32_t len = header_.sidecar_offsets(i + 1) - next_offset; + if (next_offset + len > entire_message.size() || len < 0) { + return Status::Corruption(strings::Substitute( + "Invalid sidecar offsets; sidecar $0 apparently starts at $1," + " has length $2, but the entire message has length $3", + i, next_offset, len, entire_message.size())); + } + sidecar_slices_[i] = Slice(entire_message.data() + next_offset, len); + } + uint32_t next_offset = header_.sidecar_offsets(last); + if (next_offset > entire_message.size()) { + return Status::Corruption(strings::Substitute( + "Invalid sidecar offsets; the last sidecar ($0) apparently starts " + "at $1, but the entire message has length $3", + last, next_offset, entire_message.size())); + } + sidecar_slices_[last] = Slice(entire_message.data() + next_offset, + entire_message.size() - next_offset); + } else { + serialized_response_ = entire_message; + } + + transfer_.swap(transfer); + parsed_ = true; + return Status::OK(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/outbound_call.h b/src/kudu/rpc/outbound_call.h new file mode 100644 index 000000000000..21557522e779 --- /dev/null +++ b/src/kudu/rpc/outbound_call.h @@ -0,0 +1,365 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_CLIENT_CALL_H +#define KUDU_RPC_CLIENT_CALL_H + +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/remote_method.h" +#include "kudu/rpc/response_callback.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace google { +namespace protobuf { +class Message; +} // namespace protobuf +} // namespace google + +namespace kudu { +namespace rpc { + +class CallResponse; +class Connection; +class DumpRunningRpcsRequestPB; +class InboundTransfer; +class RpcCallInProgressPB; +class RpcController; + +// Client-side user credentials, such as a user's username & password. +// In the future, we will add Kerberos credentials. +// +// TODO(mpercy): this is actually used server side too -- should +// we instead introduce a RemoteUser class or something? +class UserCredentials { + public: + UserCredentials(); + + // Effective user, in cases where impersonation is supported. + // If impersonation is not supported, this should be left empty. 
+ bool has_effective_user() const; + void set_effective_user(const std::string& eff_user); + const std::string& effective_user() const { return eff_user_; } + + // Real user. + bool has_real_user() const; + void set_real_user(const std::string& real_user); + const std::string& real_user() const { return real_user_; } + + // The real user's password. + bool has_password() const; + void set_password(const std::string& password); + const std::string& password() const { return password_; } + + // Copy state from another object to this one. + void CopyFrom(const UserCredentials& other); + + // Returns a string representation of the object, not including the password field. + std::string ToString() const; + + std::size_t HashCode() const; + bool Equals(const UserCredentials& other) const; + + private: + // Remember to update HashCode() and Equals() when new fields are added. + std::string eff_user_; + std::string real_user_; + std::string password_; + + DISALLOW_COPY_AND_ASSIGN(UserCredentials); +}; + +// Used to key on Connection information. +// For use as a key in an unordered STL collection, use ConnectionIdHash and ConnectionIdEqual. +// This class is copyable for STL compatibility, but not assignable (use CopyFrom() for that). +class ConnectionId { + public: + ConnectionId(); + + // Copy constructor required for use with STL unordered_map. + ConnectionId(const ConnectionId& other); + + // Convenience constructor. + ConnectionId(const Sockaddr& remote, const UserCredentials& user_credentials); + + // The remote address. + void set_remote(const Sockaddr& remote); + const Sockaddr& remote() const { return remote_; } + + // The credentials of the user associated with this connection, if any. + void set_user_credentials(const UserCredentials& user_credentials); + const UserCredentials& user_credentials() const { return user_credentials_; } + UserCredentials* mutable_user_credentials() { return &user_credentials_; } + + // Copy state from another object to this one. 
+ void CopyFrom(const ConnectionId& other); + + // Returns a string representation of the object, not including the password field. + std::string ToString() const; + + size_t HashCode() const; + bool Equals(const ConnectionId& other) const; + + private: + // Remember to update HashCode() and Equals() when new fields are added. + Sockaddr remote_; + UserCredentials user_credentials_; + + // Implementation of CopyFrom that can be shared with copy constructor. + void DoCopyFrom(const ConnectionId& other); + + // Disable assignment operator. + void operator=(const ConnectionId&); +}; + +class ConnectionIdHash { + public: + std::size_t operator() (const ConnectionId& conn_id) const; +}; + +class ConnectionIdEqual { + public: + bool operator() (const ConnectionId& cid1, const ConnectionId& cid2) const; +}; + +// Tracks the status of a call on the client side. +// +// This is an internal-facing class -- clients interact with the +// RpcController class. +// +// This is allocated by the Proxy when a call is first created, +// then passed to the reactor thread to send on the wire. It's typically +// kept using a shared_ptr because a call may terminate in any number +// of different threads, making it tricky to enforce single ownership. +class OutboundCall { + public: + OutboundCall(const ConnectionId& conn_id, const RemoteMethod& remote_method, + google::protobuf::Message* response_storage, + RpcController* controller, ResponseCallback callback); + + ~OutboundCall(); + + // Serialize the given request PB into this call's internal storage. + // + // Because the data is fully serialized by this call, 'req' may be + // subsequently mutated with no ill effects. + Status SetRequestParam(const google::protobuf::Message& req); + + // Assign the call ID for this call. This is called from the reactor + // thread once a connection has been assigned. Must only be called once. 
+ void set_call_id(int32_t call_id) { + DCHECK_EQ(header_.call_id(), kInvalidCallId) << "Already has a call ID"; + header_.set_call_id(call_id); + } + + // Serialize the call for the wire. Requires that SetRequestParam() + // is called first. This is called from the Reactor thread. + Status SerializeTo(std::vector* slices); + + // Callback after the call has been put on the outbound connection queue. + void SetQueued(); + + // Update the call state to show that the request has been sent. + void SetSent(); + + // Mark the call as failed. This also triggers the callback to notify + // the caller. If the call failed due to a remote error, then err_pb + // should be set to the error returned by the remote server. Takes + // ownership of 'err_pb'. + void SetFailed(const Status& status, + ErrorStatusPB* err_pb = NULL); + + // Mark the call as timed out. This also triggers the callback to notify + // the caller. + void SetTimedOut(); + bool IsTimedOut() const; + + // Is the call finished? + bool IsFinished() const; + + // Fill in the call response. + void SetResponse(gscoped_ptr resp); + + std::string ToString() const; + + void DumpPB(const DumpRunningRpcsRequestPB& req, RpcCallInProgressPB* resp); + + //////////////////////////////////////////////////////////// + // Getters + //////////////////////////////////////////////////////////// + + const ConnectionId& conn_id() const { return conn_id_; } + const RemoteMethod& remote_method() const { return remote_method_; } + const ResponseCallback &callback() const { return callback_; } + RpcController* controller() { return controller_; } + const RpcController* controller() const { return controller_; } + + // Return true if a call ID has been assigned to this call. 
+ bool call_id_assigned() const { + return header_.call_id() != kInvalidCallId; + } + + int32_t call_id() const { + DCHECK(call_id_assigned()); + return header_.call_id(); + } + + private: + friend class RpcController; + + // Various states the call propagates through. + // NB: if adding another state, be sure to update OutboundCall::IsFinished() + // and OutboundCall::StateName(State state) as well. + enum State { + READY = 0, + ON_OUTBOUND_QUEUE = 1, + SENT = 2, + TIMED_OUT = 3, + FINISHED_ERROR = 4, + FINISHED_SUCCESS = 5 + }; + + static std::string StateName(State state); + + void set_state(State new_state); + State state() const; + + // Same as set_state, but requires that the caller already holds + // lock_ + void set_state_unlocked(State new_state); + + // return current status + Status status() const; + + // Time when the call was first initiatied. + MonoTime start_time_; + + // Return the error protobuf, if a remote error occurred. + // This will only be non-NULL if status().IsRemoteError(). + const ErrorStatusPB* error_pb() const; + + // Lock for state_ status_, error_pb_ fields, since they + // may be mutated by the reactor thread while the client thread + // reads them. + mutable simple_spinlock lock_; + State state_; + Status status_; + gscoped_ptr error_pb_; + + // Call the user-provided callback. + void CallCallback(); + + // The RPC header. + // Parts of this (eg the call ID) are only assigned once this call has been + // passed to the reactor thread and assigned a connection. + RequestHeader header_; + + // The remote method being called. + RemoteMethod remote_method_; + + ConnectionId conn_id_; + ResponseCallback callback_; + RpcController* controller_; + + // Pointer for the protobuf where the response should be written. + google::protobuf::Message* response_; + + // Buffers for storing segments of the wire-format request. 
+ faststring header_buf_; + faststring request_buf_; + + // Once a response has been received for this call, contains that response. + // Otherwise NULL. + gscoped_ptr call_response_; + + DISALLOW_COPY_AND_ASSIGN(OutboundCall); +}; + +// A response to a call, on the client side. +// Upon receiving a response, this is allocated in the reactor thread and filled +// into the OutboundCall instance via OutboundCall::SetResponse. +// +// This may either be a success or error response. +// +// This class takes care of separating out the distinct payload slices sent +// over. +class CallResponse { + public: + CallResponse(); + + // Parse the response received from a call. This must be called before any + // other methods on this object. + Status ParseFrom(gscoped_ptr transfer); + + // Return true if the call succeeded. + bool is_success() const { + DCHECK(parsed_); + return !header_.is_error(); + } + + // Return the call ID that this response is related to. + int32_t call_id() const { + DCHECK(parsed_); + return header_.call_id(); + } + + // Return the serialized response data. This is just the response "body" -- + // either a serialized ErrorStatusPB, or the serialized user response protobuf. + const Slice &serialized_response() const { + DCHECK(parsed_); + return serialized_response_; + } + + // See RpcController::GetSidecar() + Status GetSidecar(int idx, Slice* sidecar) const; + + private: + // True once ParseFrom() is called. + bool parsed_; + + // The parsed header. + ResponseHeader header_; + + // The slice of data for the encoded protobuf response. + // This slice refers to memory allocated by transfer_ + Slice serialized_response_; + + // Slices of data for rpc sidecars. They point into memory owned by transfer_. + Slice sidecar_slices_[OutboundTransfer::kMaxPayloadSlices]; + + // The incoming transfer data - retained because serialized_response_ + // and sidecar_slices_ refer into its data. 
+ gscoped_ptr transfer_; + + DISALLOW_COPY_AND_ASSIGN(CallResponse); +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/protoc-gen-krpc.cc b/src/kudu/rpc/protoc-gen-krpc.cc new file mode 100644 index 000000000000..4f3d250a18a6 --- /dev/null +++ b/src/kudu/rpc/protoc-gen-krpc.cc @@ -0,0 +1,685 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//////////////////////////////////////////////////////////////////////////////// +// Example usage: +// protoc --plugin=protoc-gen-krpc --krpc_out . --proto_path . 
.proto +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/status.h" +#include "kudu/util/string_case.h" + +using google::protobuf::FileDescriptor; +using google::protobuf::io::Printer; +using google::protobuf::MethodDescriptor; +using google::protobuf::ServiceDescriptor; +using std::map; +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { +namespace rpc { + +class Substituter { + public: + virtual ~Substituter() {} + virtual void InitSubstitutionMap(map *map) const = 0; +}; + +// NameInfo contains information about the output names. 
+class FileSubstitutions : public Substituter { + public: + static const std::string PROTO_EXTENSION; + + Status Init(const FileDescriptor *file) { + string path = file->name(); + map_["path"] = path; + + // Initialize path_ + // If path = /foo/bar/baz_stuff.proto, path_ = /foo/bar/baz_stuff + if (!TryStripSuffixString(path, PROTO_EXTENSION, &path_no_extension_)) { + return Status::InvalidArgument("file name " + path + + " did not end in " + PROTO_EXTENSION); + } + map_["path_no_extension"] = path_no_extension_; + + // If path = /foo/bar/baz_stuff.proto, base_ = baz_stuff + string base; + GetBaseName(path_no_extension_, &base); + map_["base"] = base; + + // If path = /foo/bar/baz_stuff.proto, camel_case_ = BazStuff + string camel_case; + SnakeToCamelCase(base, &camel_case); + map_["camel_case"] = camel_case; + + // If path = /foo/bar/baz_stuff.proto, upper_case_ = BAZ_STUFF + string upper_case; + ToUpperCase(base, &upper_case); + map_["upper_case"] = upper_case; + + map_["open_namespace"] = GenerateOpenNamespace(file->package()); + map_["close_namespace"] = GenerateCloseNamespace(file->package()); + + return Status::OK(); + } + + virtual void InitSubstitutionMap(map *map) const OVERRIDE { + typedef std::map::value_type kv_pair; + for (const kv_pair &pair : map_) { + (*map)[pair.first] = pair.second; + } + } + + std::string service_header() const { + return path_no_extension_ + ".service.h"; + } + + std::string service() const { + return path_no_extension_ + ".service.cc"; + } + + std::string proxy_header() const { + return path_no_extension_ + ".proxy.h"; + } + + std::string proxy() const { + return path_no_extension_ + ".proxy.cc"; + } + + private: + // Extract the last filename component. 
+ static void GetBaseName(const string &path, + string *base) { + size_t last_slash = path.find_last_of("/"); + if (last_slash != string::npos) { + *base = path.substr(last_slash + 1); + } else { + *base = path; + } + } + + static string GenerateOpenNamespace(const string &str) { + vector components = strings::Split(str, "."); + string out; + for (const string &c : components) { + out.append("namespace ").append(c).append(" {\n"); + } + return out; + } + + static string GenerateCloseNamespace(const string &str) { + vector components = strings::Split(str, "."); + string out; + for (auto c = components.crbegin(); c != components.crend(); c++) { + out.append("} // namespace ").append(*c).append("\n"); + } + return out; + } + + std::string path_no_extension_; + map map_; +}; + +const std::string FileSubstitutions::PROTO_EXTENSION(".proto"); + +class MethodSubstitutions : public Substituter { + public: + explicit MethodSubstitutions(const MethodDescriptor *method) + : method_(method) { + } + + virtual void InitSubstitutionMap(map *map) const OVERRIDE { + (*map)["rpc_name"] = method_->name(); + (*map)["rpc_full_name"] = method_->full_name(); + (*map)["rpc_full_name_plainchars"] = + StringReplace(method_->full_name(), ".", "_", true); + (*map)["request"] = + ReplaceNamespaceDelimiters( + StripNamespaceIfPossible(method_->service()->full_name(), + method_->input_type()->full_name())); + (*map)["response"] = + ReplaceNamespaceDelimiters( + StripNamespaceIfPossible(method_->service()->full_name(), + method_->output_type()->full_name())); + (*map)["metric_enum_key"] = strings::Substitute("kMetricIndex$0", method_->name()); + } + + // Strips the package from method arguments if they are in the same package as + // the service, otherwise leaves them so that we can have fully qualified + // namespaces for method arguments. 
+ static std::string StripNamespaceIfPossible(const std::string& service_full_name, + const std::string& arg_full_name) { + StringPiece service_package(service_full_name); + if (!service_package.contains(".")) { + return arg_full_name; + } + // remove the service name so that we are left with only the package, including + // the last '.' so that we account for different packages with the same prefix. + service_package.remove_suffix(service_package.length() - + service_package.find_last_of(".") - 1); + + StringPiece argfqn(arg_full_name); + if (argfqn.starts_with(service_package)) { + argfqn.remove_prefix(argfqn.find_last_of(".") + 1); + } + return argfqn.ToString(); + } + + static std::string ReplaceNamespaceDelimiters(const std::string& arg_full_name) { + return JoinStrings(strings::Split(arg_full_name, "."), "::"); + } + + private: + const MethodDescriptor *method_; +}; + +class ServiceSubstitutions : public Substituter { + public: + explicit ServiceSubstitutions(const ServiceDescriptor *service) + : service_(service) + {} + + virtual void InitSubstitutionMap(map *map) const OVERRIDE { + (*map)["service_name"] = service_->name(); + (*map)["full_service_name"] = service_->full_name(); + (*map)["service_method_count"] = SimpleItoa(service_->method_count()); + + // TODO: upgrade to protobuf 2.5.x and attach service comments + // to the generated service classes using the SourceLocation API. 
+ } + + private: + const ServiceDescriptor *service_; +}; + + +class SubstitutionContext { + public: + // Takes ownership of the substituter + void Push(const Substituter *sub) { + subs_.push_back(shared_ptr(sub)); + } + + void PushMethod(const MethodDescriptor *method) { + Push(new MethodSubstitutions(method)); + } + + void PushService(const ServiceDescriptor *service) { + Push(new ServiceSubstitutions(service)); + } + + void Pop() { + CHECK(!subs_.empty()); + subs_.pop_back(); + } + + void InitSubstitutionMap(map *subs) const { + for (const shared_ptr &sub : subs_) { + sub->InitSubstitutionMap(subs); + } + } + + private: + vector > subs_; +}; + + + +class CodeGenerator : public ::google::protobuf::compiler::CodeGenerator { + public: + CodeGenerator() { } + + ~CodeGenerator() { } + + bool Generate(const google::protobuf::FileDescriptor *file, + const std::string &/* parameter */, + google::protobuf::compiler::GeneratorContext *gen_context, + std::string *error) const OVERRIDE { + auto name_info = new FileSubstitutions(); + Status ret = name_info->Init(file); + if (!ret.ok()) { + *error = "name_info.Init failed: " + ret.ToString(); + return false; + } + + SubstitutionContext subs; + subs.Push(name_info); + + gscoped_ptr ih_output( + gen_context->Open(name_info->service_header())); + Printer ih_printer(ih_output.get(), '$'); + GenerateServiceIfHeader(&ih_printer, &subs, file); + + gscoped_ptr i_output( + gen_context->Open(name_info->service())); + Printer i_printer(i_output.get(), '$'); + GenerateServiceIf(&i_printer, &subs, file); + + gscoped_ptr ph_output( + gen_context->Open(name_info->proxy_header())); + Printer ph_printer(ph_output.get(), '$'); + GenerateProxyHeader(&ph_printer, &subs, file); + + gscoped_ptr p_output( + gen_context->Open(name_info->proxy())); + Printer p_printer(p_output.get(), '$'); + GenerateProxy(&p_printer, &subs, file); + + return true; + } + + private: + void Print(Printer *printer, + const SubstitutionContext &sub, + const char *text) 
const { + map subs; + sub.InitSubstitutionMap(&subs); + printer->Print(subs, text); + } + + void GenerateServiceIfHeader(Printer *printer, + SubstitutionContext *subs, + const FileDescriptor *file) const { + Print(printer, *subs, + "// THIS FILE IS AUTOGENERATED FROM $path$\n" + "\n" + "#ifndef KUDU_RPC_$upper_case$_SERVICE_IF_DOT_H\n" + "#define KUDU_RPC_$upper_case$_SERVICE_IF_DOT_H\n" + "\n" + "#include \"$path_no_extension$.pb.h\"\n" + "\n" + "#include \n" + "\n" + "#include \"kudu/rpc/rpc_header.pb.h\"\n" + "#include \"kudu/rpc/service_if.h\"\n" + "\n" + "namespace kudu {\n" + "class MetricEntity;\n" + "namespace rpc {\n" + "class Messenger;\n" + "class RpcContext;\n" + "} // namespace rpc\n" + "} // namespace kudu\n" + "\n" + "$open_namespace$" + "\n" + ); + + for (int service_idx = 0; service_idx < file->service_count(); + ++service_idx) { + const ServiceDescriptor *service = file->service(service_idx); + subs->PushService(service); + + Print(printer, *subs, + "\n" + "class $service_name$If : public ::kudu::rpc::ServiceIf {\n" + " public:\n" + " explicit $service_name$If(const scoped_refptr& entity);\n" + " virtual ~$service_name$If();\n" + " virtual void Handle(::kudu::rpc::InboundCall *call);\n" + " virtual std::string service_name() const;\n" + " static std::string static_service_name();\n" + "\n" + ); + + for (int method_idx = 0; method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + + Print(printer, *subs, + " virtual void $rpc_name$(const $request$ *req,\n" + " $response$ *resp, ::kudu::rpc::RpcContext *context) = 0;\n" + ); + + subs->Pop(); + } + + Print(printer, *subs, + "\n" + " private:\n" + ); + + + Print(printer, *subs, + " enum RpcMetricIndexes {\n" + ); + for (int method_idx = 0; method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + + Print(printer, *subs, 
+ " $metric_enum_key$,\n" + ); + + subs->Pop(); + } + Print(printer, *subs, + " };\n" // enum + ); + + Print(printer, *subs, + " static const int kMethodCount = $service_method_count$;\n" + "\n" + " // Pre-initialize metrics because calling METRIC_foo.Instantiate() is expensive.\n" + " void InitMetrics(const scoped_refptr& ent);\n" + "\n" + " ::kudu::rpc::RpcMethodMetrics metrics_[kMethodCount];\n" + "\n" + "};\n" + ); + + subs->Pop(); // Service + } + + Print(printer, *subs, + "\n" + "$close_namespace$\n" + "#endif\n"); + } + + void GenerateServiceIf(Printer *printer, + SubstitutionContext *subs, + const FileDescriptor *file) const { + Print(printer, *subs, + "// THIS FILE IS AUTOGENERATED FROM $path$\n" + "\n" + "#include \"$path_no_extension$.pb.h\"\n" + "#include \"$path_no_extension$.service.h\"\n" + "\n" + "#include \n" + "\n" + "#include \"kudu/rpc/inbound_call.h\"\n" + "#include \"kudu/rpc/remote_method.h\"\n" + "#include \"kudu/rpc/rpc_context.h\"\n" + "#include \"kudu/rpc/service_if.h\"\n" + "#include \"kudu/util/metrics.h\"\n" + "\n"); + + // Define metric prototypes for each method in the service. 
+ for (int service_idx = 0; service_idx < file->service_count(); + ++service_idx) { + const ServiceDescriptor *service = file->service(service_idx); + subs->PushService(service); + + for (int method_idx = 0; method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + Print(printer, *subs, + "METRIC_DEFINE_histogram(server, handler_latency_$rpc_full_name_plainchars$,\n" + " \"$rpc_full_name$ RPC Time\",\n" + " kudu::MetricUnit::kMicroseconds,\n" + " \"Microseconds spent handling $rpc_full_name$() RPC requests\",\n" + " 60000000LU, 2);\n" + "\n"); + subs->Pop(); + } + + subs->Pop(); + } + + Print(printer, *subs, + "$open_namespace$" + "\n"); + + for (int service_idx = 0; service_idx < file->service_count(); + ++service_idx) { + const ServiceDescriptor *service = file->service(service_idx); + subs->PushService(service); + + Print(printer, *subs, + "$service_name$If::$service_name$If(const scoped_refptr& entity) {\n" + " InitMetrics(entity);\n" + "}\n" + "\n" + "$service_name$If::~$service_name$If() {\n" + "}\n" + "\n" + "void $service_name$If::Handle(::kudu::rpc::InboundCall *call) {\n" + " {\n"); + + for (int method_idx = 0; method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + + Print(printer, *subs, + " if (call->remote_method().method_name() == \"$rpc_name$\") {\n" + " $request$ *req = new $request$;\n" + " if (PREDICT_FALSE(!ParseParam(call, req))) {\n" + " delete req;\n" + " return;\n" + " }\n" + " $response$ *resp = new $response$;\n" + " $rpc_name$(req, resp,\n" + " new ::kudu::rpc::RpcContext(call, req, resp,\n" + " metrics_[$metric_enum_key$]));\n" + " return;\n" + " }\n" + "\n"); + subs->Pop(); + } + Print(printer, *subs, + " }\n" + " RespondBadMethod(call);\n" + "}\n" + "\n" + "std::string $service_name$If::service_name() const {\n" + " return \"$full_service_name$\";\n" + 
"}\n" + "std::string $service_name$If::static_service_name() {\n" + " return \"$full_service_name$\";\n" + "}\n" + "\n" + ); + + Print(printer, *subs, + "void $service_name$If::InitMetrics(const scoped_refptr& entity) {\n" + ); + // Expose per-RPC metrics. + for (int method_idx = 0; method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + + Print(printer, *subs, + " metrics_[$metric_enum_key$].handler_latency = \n" + " METRIC_handler_latency_$rpc_full_name_plainchars$.Instantiate(entity);\n" + ); + + subs->Pop(); + } + Print(printer, *subs, + "}\n" + "\n" + ); + + subs->Pop(); + } + + Print(printer, *subs, + "$close_namespace$" + ); + } + + void GenerateProxyHeader(Printer *printer, + SubstitutionContext *subs, + const FileDescriptor *file) const { + Print(printer, *subs, + "// THIS FILE IS AUTOGENERATED FROM $path$\n" + "\n" + "#ifndef KUDU_RPC_$upper_case$_PROXY_DOT_H\n" + "#define KUDU_RPC_$upper_case$_PROXY_DOT_H\n" + "\n" + "#include \"$path_no_extension$.pb.h\"\n" + "\n" + "#include \"kudu/rpc/proxy.h\"\n" + "#include \"kudu/util/status.h\"\n" + "\n" + "namespace kudu { class Sockaddr; }\n" + "namespace kudu { namespace rpc { class UserCredentials; } }\n" + "$open_namespace$" + "\n" + "\n" + ); + + for (int service_idx = 0; service_idx < file->service_count(); + ++service_idx) { + const ServiceDescriptor *service = file->service(service_idx); + subs->PushService(service); + + Print(printer, *subs, + "class $service_name$Proxy {\n" + " public:\n" + " $service_name$Proxy(const std::shared_ptr< ::kudu::rpc::Messenger>\n" + " &messenger, const ::kudu::Sockaddr &sockaddr);\n" + " ~$service_name$Proxy();\n" + "\n" + " // Set the user information for the connection.\n" + " void set_user_credentials(const ::kudu::rpc::UserCredentials& user_credentials);\n" + "\n" + " // Get the current user information for the connection.\n" + " const ::kudu::rpc::UserCredentials& 
user_credentials() const;\n" + "\n" + ); + + for (int method_idx = 0; method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + + Print(printer, *subs, + "\n" + " ::kudu::Status $rpc_name$(const $request$ &req, $response$ *resp,\n" + " ::kudu::rpc::RpcController *controller);\n" + " void $rpc_name$Async(const $request$ &req,\n" + " $response$ *response,\n" + " ::kudu::rpc::RpcController *controller,\n" + " const ::kudu::rpc::ResponseCallback &callback);\n" + ); + subs->Pop(); + } + Print(printer, *subs, + " private:\n" + " ::kudu::rpc::Proxy proxy_;\n" + "};\n"); + subs->Pop(); + } + Print(printer, *subs, + "\n" + "$close_namespace$" + "\n" + "#endif\n" + ); + } + + void GenerateProxy(Printer *printer, + SubstitutionContext *subs, + const FileDescriptor *file) const { + Print(printer, *subs, + "// THIS FILE IS AUTOGENERATED FROM $path$\n" + "\n" + "#include \"$path_no_extension$.proxy.h\"\n" + "\n" + "#include \"kudu/rpc/outbound_call.h\"\n" + "#include \"kudu/util/net/sockaddr.h\"\n" + "\n" + "$open_namespace$" + "\n" + ); + + for (int service_idx = 0; service_idx < file->service_count(); + ++service_idx) { + const ServiceDescriptor *service = file->service(service_idx); + subs->PushService(service); + Print(printer, *subs, + "$service_name$Proxy::$service_name$Proxy(\n" + " const std::shared_ptr< ::kudu::rpc::Messenger> &messenger,\n" + " const ::kudu::Sockaddr &remote)\n" + " : proxy_(messenger, remote, \"$full_service_name$\") {\n" + "}\n" + "\n" + "$service_name$Proxy::~$service_name$Proxy() {\n" + "}\n" + "\n" + "void $service_name$Proxy::set_user_credentials(\n" + " const ::kudu::rpc::UserCredentials& user_credentials) {\n" + " proxy_.set_user_credentials(user_credentials);\n" + "}\n" + "\n" + "const ::kudu::rpc::UserCredentials& $service_name$Proxy::user_credentials() const {\n" + " return proxy_.user_credentials();\n" + "}\n" + "\n"); + for (int method_idx = 0; 
method_idx < service->method_count(); + ++method_idx) { + const MethodDescriptor *method = service->method(method_idx); + subs->PushMethod(method); + Print(printer, *subs, + "::kudu::Status $service_name$Proxy::$rpc_name$(const $request$ &req, $response$ *resp,\n" + " ::kudu::rpc::RpcController *controller) {\n" + " return proxy_.SyncRequest(\"$rpc_name$\", req, resp, controller);\n" + "}\n" + "\n" + "void $service_name$Proxy::$rpc_name$Async(const $request$ &req,\n" + " $response$ *resp, ::kudu::rpc::RpcController *controller,\n" + " const ::kudu::rpc::ResponseCallback &callback) {\n" + " proxy_.AsyncRequest(\"$rpc_name$\", req, resp, controller, callback);\n" + "}\n" + "\n"); + subs->Pop(); + } + + subs->Pop(); + } + Print(printer, *subs, + "$close_namespace$"); + } +}; +} // namespace rpc +} // namespace kudu + +int main(int argc, char *argv[]) { + kudu::rpc::CodeGenerator generator; + return google::protobuf::compiler::PluginMain(argc, argv, &generator); +} diff --git a/src/kudu/rpc/proxy.cc b/src/kudu/rpc/proxy.cc new file mode 100644 index 000000000000..d2e75372421e --- /dev/null +++ b/src/kudu/rpc/proxy.cc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/proxy.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "kudu/rpc/outbound_call.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/remote_method.h" +#include "kudu/rpc/response_callback.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/status.h" +#include "kudu/util/user.h" + +using google::protobuf::Message; +using std::string; +using std::shared_ptr; + +namespace kudu { +namespace rpc { + +Proxy::Proxy(const std::shared_ptr& messenger, + const Sockaddr& remote, string service_name) + : service_name_(std::move(service_name)), + messenger_(messenger), + is_started_(false) { + CHECK(messenger != nullptr); + DCHECK(!service_name_.empty()) << "Proxy service name must not be blank"; + + // By default, we set the real user to the currently logged-in user. + // Effective user and password remain blank. 
+ string real_user; + Status s = GetLoggedInUser(&real_user); + if (!s.ok()) { + LOG(WARNING) << "Proxy for " << service_name_ << ": Unable to get logged-in user name: " + << s.ToString() << " before connecting to remote: " << remote.ToString(); + } + + conn_id_.set_remote(remote); + conn_id_.mutable_user_credentials()->set_real_user(real_user); +} + +Proxy::~Proxy() { +} + +void Proxy::AsyncRequest(const string& method, + const google::protobuf::Message& req, + google::protobuf::Message* response, + RpcController* controller, + const ResponseCallback& callback) const { + CHECK(controller->call_.get() == nullptr) << "Controller should be reset"; + base::subtle::NoBarrier_Store(&is_started_, true); + RemoteMethod remote_method(service_name_, method); + OutboundCall* call = new OutboundCall(conn_id_, remote_method, response, controller, callback); + controller->call_.reset(call); + Status s = call->SetRequestParam(req); + if (PREDICT_FALSE(!s.ok())) { + // Failed to serialize request: likely the request is missing a required + // field. + call->SetFailed(s); // calls callback internally + return; + } + + // If this fails to queue, the callback will get called immediately + // and the controller will be in an ERROR state. 
+ messenger_->QueueOutboundCall(controller->call_); +} + + +Status Proxy::SyncRequest(const string& method, + const google::protobuf::Message& req, + google::protobuf::Message* resp, + RpcController* controller) const { + CountDownLatch latch(1); + AsyncRequest(method, req, DCHECK_NOTNULL(resp), controller, + boost::bind(&CountDownLatch::CountDown, boost::ref(latch))); + + latch.Wait(); + return controller->status(); +} + +void Proxy::set_user_credentials(const UserCredentials& user_credentials) { + CHECK(base::subtle::NoBarrier_Load(&is_started_) == false) + << "It is illegal to call set_user_credentials() after request processing has started"; + conn_id_.set_user_credentials(user_credentials); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/proxy.h b/src/kudu/rpc/proxy.h new file mode 100644 index 000000000000..7e27d55c5557 --- /dev/null +++ b/src/kudu/rpc/proxy.h @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_RPC_PROXY_H +#define KUDU_RPC_PROXY_H + +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/rpc/outbound_call.h" +#include "kudu/rpc/response_callback.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +namespace google { +namespace protobuf { +class Message; +} // namespace protobuf +} // namespace google + +namespace kudu { +namespace rpc { + +class Messenger; + +// Interface to send calls to a remote service. +// +// Proxy objects do not map one-to-one with TCP connections. The underlying TCP +// connection is not established until the first call, and may be torn down and +// re-established as necessary by the messenger. Additionally, the messenger is +// likely to multiplex many Proxy objects on the same connection. +// +// Proxy objects are thread-safe after initialization only. +// Setters on the Proxy are not thread-safe, and calling a setter after any RPC +// request has started will cause a fatal error. +// +// After initialization, multiple threads may make calls using the same proxy object. +class Proxy { + public: + Proxy(const std::shared_ptr& messenger, const Sockaddr& remote, + std::string service_name); + ~Proxy(); + + // Call a remote method asynchronously. + // + // Typically, users will not call this directly, but rather through + // a generated Proxy subclass. + // + // method: the method name to invoke on the remote server. + // + // req: the request protobuf. This will be serialized immediately, + // so the caller may free or otherwise mutate 'req' safely. + // + // resp: the response protobuf. This protobuf will be mutated upon + // completion of the call. The RPC system does not take ownership + // of this storage. 
+ // + // NOTE: 'req' and 'resp' should be the appropriate protocol buffer implementation + // class corresponding to the parameter and result types of the service method + // defined in the service's '.proto' file. + // + // controller: the RpcController to associate with this call. Each call + // must use a unique controller object. Does not take ownership. + // + // callback: the callback to invoke upon call completion. This callback may + // be invoked before AsyncRequest() itself returns, or any time + // thereafter. It may be invoked either on the caller's thread + // or by an RPC IO thread, and thus should take care to not + // block or perform any heavy CPU work. + void AsyncRequest(const std::string& method, + const google::protobuf::Message& req, + google::protobuf::Message* resp, + RpcController* controller, + const ResponseCallback& callback) const; + + // The same as AsyncRequest(), except that the call blocks until the call + // finishes. If the call fails, returns a non-OK result. + Status SyncRequest(const std::string& method, + const google::protobuf::Message& req, + google::protobuf::Message* resp, + RpcController* controller) const; + + // Set the user credentials which should be used to log in. + void set_user_credentials(const UserCredentials& user_credentials); + + // Get the user credentials which should be used to log in. + const UserCredentials& user_credentials() const { return conn_id_.user_credentials(); } + + private: + const std::string service_name_; + std::shared_ptr messenger_; + ConnectionId conn_id_; + mutable Atomic32 is_started_; + + DISALLOW_COPY_AND_ASSIGN(Proxy); +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/reactor-test.cc b/src/kudu/rpc/reactor-test.cc new file mode 100644 index 000000000000..c7af51864fd1 --- /dev/null +++ b/src/kudu/rpc/reactor-test.cc @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/reactor.h" + +#include "kudu/rpc/rpc-test-base.h" +#include "kudu/util/countdown_latch.h" + +using std::shared_ptr; + +namespace kudu { +namespace rpc { + +class ReactorTest : public RpcTestBase { + public: + ReactorTest() + : messenger_(CreateMessenger("my_messenger", 4)), + latch_(1) { + } + + void ScheduledTask(const Status& status, const Status& expected_status) { + CHECK_EQ(expected_status.CodeAsString(), status.CodeAsString()); + latch_.CountDown(); + } + + void ScheduledTaskCheckThread(const Status& status, const Thread* thread) { + CHECK_OK(status); + CHECK_EQ(thread, Thread::current_thread()); + latch_.CountDown(); + } + + void ScheduledTaskScheduleAgain(const Status& status) { + messenger_->ScheduleOnReactor( + boost::bind(&ReactorTest::ScheduledTaskCheckThread, this, _1, + Thread::current_thread()), + MonoDelta::FromMilliseconds(0)); + latch_.CountDown(); + } + + protected: + const shared_ptr messenger_; + CountDownLatch latch_; +}; + +TEST_F(ReactorTest, TestFunctionIsCalled) { + messenger_->ScheduleOnReactor( + boost::bind(&ReactorTest::ScheduledTask, this, _1, Status::OK()), + MonoDelta::FromSeconds(0)); + latch_.Wait(); +} + +TEST_F(ReactorTest, TestFunctionIsCalledAtTheRightTime) { + MonoTime before = MonoTime::Now(MonoTime::FINE); + 
messenger_->ScheduleOnReactor( + boost::bind(&ReactorTest::ScheduledTask, this, _1, Status::OK()), + MonoDelta::FromMilliseconds(100)); + latch_.Wait(); + MonoTime after = MonoTime::Now(MonoTime::FINE); + MonoDelta delta = after.GetDeltaSince(before); + CHECK_GE(delta.ToMilliseconds(), 100); +} + +TEST_F(ReactorTest, TestFunctionIsCalledIfReactorShutdown) { + messenger_->ScheduleOnReactor( + boost::bind(&ReactorTest::ScheduledTask, this, _1, + Status::Aborted("doesn't matter")), + MonoDelta::FromSeconds(60)); + messenger_->Shutdown(); + latch_.Wait(); +} + +TEST_F(ReactorTest, TestReschedulesOnSameReactorThread) { + // Our scheduled task will schedule yet another task. + latch_.Reset(2); + + messenger_->ScheduleOnReactor( + boost::bind(&ReactorTest::ScheduledTaskScheduleAgain, this, _1), + MonoDelta::FromSeconds(0)); + latch_.Wait(); + latch_.Wait(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/reactor.cc b/src/kudu/rpc/reactor.cc new file mode 100644 index 000000000000..2e6c4db13427 --- /dev/null +++ b/src/kudu/rpc/reactor.cc @@ -0,0 +1,667 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/reactor.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/rpc/connection.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/negotiation.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/rpc/rpc_introspection.pb.h" +#include "kudu/rpc/sasl_client.h" +#include "kudu/rpc/sasl_server.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/errno.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/monotime.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/thread_restrictions.h" +#include "kudu/util/trace.h" +#include "kudu/util/status.h" +#include "kudu/util/net/socket.h" + +// When compiling on Mac OS X, use 'kqueue' instead of the default, 'select', for the event loop. +// Otherwise we run into problems because 'select' can't handle connections when more than 1024 +// file descriptors are open by the process. +#if defined(__APPLE__) +static const int kDefaultLibEvFlags = ev::KQUEUE; +#else +static const int kDefaultLibEvFlags = ev::AUTO; +#endif + +using std::string; +using std::shared_ptr; + +DEFINE_int64(rpc_negotiation_timeout_ms, 3000, + "Timeout for negotiating an RPC connection."); +TAG_FLAG(rpc_negotiation_timeout_ms, advanced); +TAG_FLAG(rpc_negotiation_timeout_ms, runtime); + +namespace kudu { +namespace rpc { + +namespace { +Status ShutdownError(bool aborted) { + const char* msg = "reactor is shutting down"; + return aborted ? 
+ Status::Aborted(msg, "", ESHUTDOWN) : + Status::ServiceUnavailable(msg, "", ESHUTDOWN); +} +} // anonymous namespace + +ReactorThread::ReactorThread(Reactor *reactor, const MessengerBuilder &bld) + : loop_(kDefaultLibEvFlags), + cur_time_(MonoTime::Now(MonoTime::COARSE)), + last_unused_tcp_scan_(cur_time_), + reactor_(reactor), + connection_keepalive_time_(bld.connection_keepalive_time_), + coarse_timer_granularity_(bld.coarse_timer_granularity_) { +} + +Status ReactorThread::Init() { + DCHECK(thread_.get() == nullptr) << "Already started"; + DVLOG(6) << "Called ReactorThread::Init()"; + // Register to get async notifications in our epoll loop. + async_.set(loop_); + async_.set(this); + async_.start(); + + // Register the timer watcher. + // The timer is used for closing old TCP connections and applying + // backpressure. + timer_.set(loop_); + timer_.set(this); // NOLINT(*) + timer_.start(coarse_timer_granularity_.ToSeconds(), + coarse_timer_granularity_.ToSeconds()); + + // Create Reactor thread. + return kudu::Thread::Create("reactor", "rpc reactor", &ReactorThread::RunThread, this, &thread_); +} + +void ReactorThread::Shutdown() { + CHECK(reactor_->closing()) << "Should be called after setting closing_ flag"; + + VLOG(1) << name() << ": shutting down Reactor thread."; + WakeThread(); +} + +void ReactorThread::ShutdownInternal() { + DCHECK(IsCurrentThread()); + + // Tear down any outbound TCP connections. + Status service_unavailable = ShutdownError(false); + VLOG(1) << name() << ": tearing down outbound TCP connections..."; + for (auto c = client_conns_.begin(); c != client_conns_.end(); + c = client_conns_.begin()) { + const scoped_refptr& conn = (*c).second; + VLOG(1) << name() << ": shutting down " << conn->ToString(); + conn->Shutdown(service_unavailable); + client_conns_.erase(c); + } + + // Tear down any inbound TCP connections. 
+ VLOG(1) << name() << ": tearing down inbound TCP connections..."; + for (const scoped_refptr& conn : server_conns_) { + VLOG(1) << name() << ": shutting down " << conn->ToString(); + conn->Shutdown(service_unavailable); + } + server_conns_.clear(); + + // Abort any scheduled tasks. + // + // These won't be found in the ReactorThread's list of pending tasks + // because they've been "run" (that is, they've been scheduled). + Status aborted = ShutdownError(true); // aborted + for (DelayedTask* task : scheduled_tasks_) { + task->Abort(aborted); // should also free the task. + } + scheduled_tasks_.clear(); +} + +ReactorTask::ReactorTask() { +} +ReactorTask::~ReactorTask() { +} + +Status ReactorThread::GetMetrics(ReactorMetrics *metrics) { + DCHECK(IsCurrentThread()); + metrics->num_client_connections_ = client_conns_.size(); + metrics->num_server_connections_ = server_conns_.size(); + return Status::OK(); +} + +Status ReactorThread::DumpRunningRpcs(const DumpRunningRpcsRequestPB& req, + DumpRunningRpcsResponsePB* resp) { + DCHECK(IsCurrentThread()); + for (const scoped_refptr& conn : server_conns_) { + RETURN_NOT_OK(conn->DumpPB(req, resp->add_inbound_connections())); + } + for (const conn_map_t::value_type& entry : client_conns_) { + Connection* conn = entry.second.get(); + RETURN_NOT_OK(conn->DumpPB(req, resp->add_outbound_connections())); + } + return Status::OK(); +} + +void ReactorThread::WakeThread() { + async_.send(); +} + +// Handle async events. These events are sent to the reactor by other +// threads that want to bring something to our attention, like the fact that +// we're shutting down, or the fact that there is a new outbound Transfer +// ready to send. 
+void ReactorThread::AsyncHandler(ev::async &watcher, int revents) { + DCHECK(IsCurrentThread()); + + if (PREDICT_FALSE(reactor_->closing())) { + ShutdownInternal(); + loop_.break_loop(); // break the epoll loop and terminate the thread + return; + } + + boost::intrusive::list tasks; + reactor_->DrainTaskQueue(&tasks); + + while (!tasks.empty()) { + ReactorTask &task = tasks.front(); + tasks.pop_front(); + task.Run(this); + } +} + +void ReactorThread::RegisterConnection(const scoped_refptr& conn) { + DCHECK(IsCurrentThread()); + + // Set a limit on how long the server will negotiate with a new client. + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(FLAGS_rpc_negotiation_timeout_ms)); + + Status s = StartConnectionNegotiation(conn, deadline); + if (!s.ok()) { + LOG(ERROR) << "Server connection negotiation failed: " << s.ToString(); + DestroyConnection(conn.get(), s); + } + server_conns_.push_back(conn); +} + +void ReactorThread::AssignOutboundCall(const shared_ptr &call) { + DCHECK(IsCurrentThread()); + scoped_refptr conn; + + // TODO: Move call deadline timeout computation into OutboundCall constructor. + const MonoDelta &timeout = call->controller()->timeout(); + MonoTime deadline; + if (!timeout.Initialized()) { + LOG(WARNING) << "Client call " << call->remote_method().ToString() + << " has no timeout set for connection id: " + << call->conn_id().ToString(); + deadline = MonoTime::Max(); + } else { + deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(timeout); + } + + Status s = FindOrStartConnection(call->conn_id(), &conn, deadline); + if (PREDICT_FALSE(!s.ok())) { + call->SetFailed(s); + return; + } + + conn->QueueOutboundCall(call); +} + +// +// Handles timer events. The periodic timer: +// +// 1. updates Reactor::cur_time_ +// 2. every tcp_conn_timeo_ seconds, close down connections older than +// tcp_conn_timeo_ seconds. 
+// +void ReactorThread::TimerHandler(ev::timer &watcher, int revents) { + DCHECK(IsCurrentThread()); + if (EV_ERROR & revents) { + LOG(WARNING) << "Reactor " << name() << " got an error in " + "the timer handler."; + return; + } + MonoTime now(MonoTime::Now(MonoTime::COARSE)); + VLOG(4) << name() << ": timer tick at " << now.ToString(); + cur_time_ = now; + + ScanIdleConnections(); +} + +void ReactorThread::RegisterTimeout(ev::timer *watcher) { + watcher->set(loop_); +} + +void ReactorThread::ScanIdleConnections() { + DCHECK(IsCurrentThread()); + // enforce TCP connection timeouts + auto c = server_conns_.begin(); + auto c_end = server_conns_.end(); + uint64_t timed_out = 0; + for (; c != c_end; ) { + const scoped_refptr& conn = *c; + if (!conn->Idle()) { + VLOG(3) << "Connection " << conn->ToString() << " not idle"; + ++c; // TODO: clean up this loop + continue; + } + + MonoDelta connection_delta(cur_time_.GetDeltaSince(conn->last_activity_time())); + if (connection_delta.MoreThan(connection_keepalive_time_)) { + conn->Shutdown(Status::NetworkError( + StringPrintf("connection timed out after %s seconds", + connection_keepalive_time_.ToString().c_str()))); + VLOG(1) << "Timing out connection " << conn->ToString() << " - it has been idle for " + << connection_delta.ToSeconds() << "s"; + server_conns_.erase(c++); + ++timed_out; + } else { + ++c; + } + } + + // TODO: above only times out on the server side. + // Clients may want to set their keepalive timeout as well. 
+ + VLOG_IF(1, timed_out > 0) << name() << ": timed out " << timed_out << " TCP connections."; +} + +const std::string &ReactorThread::name() const { + return reactor_->name(); +} + +MonoTime ReactorThread::cur_time() const { + return cur_time_; +} + +Reactor *ReactorThread::reactor() { + return reactor_; +} + +bool ReactorThread::IsCurrentThread() const { + return thread_.get() == kudu::Thread::current_thread(); +} + +void ReactorThread::RunThread() { + ThreadRestrictions::SetWaitAllowed(false); + ThreadRestrictions::SetIOAllowed(false); + DVLOG(6) << "Calling ReactorThread::RunThread()..."; + loop_.run(0); + VLOG(1) << name() << " thread exiting."; + + // No longer need the messenger. This causes the messenger to + // get deleted when all the reactors exit. + reactor_->messenger_.reset(); +} + +Status ReactorThread::FindOrStartConnection(const ConnectionId &conn_id, + scoped_refptr* conn, + const MonoTime &deadline) { + DCHECK(IsCurrentThread()); + conn_map_t::const_iterator c = client_conns_.find(conn_id); + if (c != client_conns_.end()) { + *conn = (*c).second; + return Status::OK(); + } + + // No connection to this remote. Need to create one. + VLOG(2) << name() << " FindOrStartConnection: creating " + << "new connection for " << conn_id.remote().ToString(); + + // Create a new socket and start connecting to the remote. + Socket sock; + RETURN_NOT_OK(CreateClientSocket(&sock)); + bool connect_in_progress; + RETURN_NOT_OK(StartConnect(&sock, conn_id.remote(), &connect_in_progress)); + + // Register the new connection in our map. + *conn = new Connection(this, conn_id.remote(), sock.Release(), Connection::CLIENT); + (*conn)->set_user_credentials(conn_id.user_credentials()); + + // Kick off blocking client connection negotiation. + Status s = StartConnectionNegotiation(*conn, deadline); + if (s.IsIllegalState()) { + // Return a nicer error message to the user indicating -- if we just + // forward the status we'd get something generic like "ThreadPool is closing". 
+ return Status::ServiceUnavailable("Client RPC Messenger shutting down"); + } + // Propagate any other errors as-is. + RETURN_NOT_OK_PREPEND(s, "Unable to start connection negotiation thread"); + + // Insert into the client connection map to avoid duplicate connection requests. + client_conns_.insert(conn_map_t::value_type(conn_id, *conn)); + return Status::OK(); +} + +Status ReactorThread::StartConnectionNegotiation(const scoped_refptr& conn, + const MonoTime &deadline) { + DCHECK(IsCurrentThread()); + + scoped_refptr trace(new Trace()); + ADOPT_TRACE(trace.get()); + TRACE("Submitting negotiation task for $0", conn->ToString()); + RETURN_NOT_OK(reactor()->messenger()->negotiation_pool()->SubmitClosure( + Bind(&Negotiation::RunNegotiation, conn, deadline))); + return Status::OK(); +} + +void ReactorThread::CompleteConnectionNegotiation(const scoped_refptr& conn, + const Status &status) { + DCHECK(IsCurrentThread()); + if (PREDICT_FALSE(!status.ok())) { + DestroyConnection(conn.get(), status); + return; + } + + // Switch the socket back to non-blocking mode after negotiation. + Status s = conn->SetNonBlocking(true); + if (PREDICT_FALSE(!s.ok())) { + LOG(DFATAL) << "Unable to set connection to non-blocking mode: " << s.ToString(); + DestroyConnection(conn.get(), s); + return; + } + conn->MarkNegotiationComplete(); + conn->EpollRegister(loop_); +} + +Status ReactorThread::CreateClientSocket(Socket *sock) { + Status ret = sock->Init(Socket::FLAG_NONBLOCKING); + if (ret.ok()) { + ret = sock->SetNoDelay(true); + } + LOG_IF(WARNING, !ret.ok()) << "failed to create an " + "outbound connection because a new socket could not " + "be created: " << ret.ToString(); + return ret; +} + +Status ReactorThread::StartConnect(Socket *sock, const Sockaddr &remote, bool *in_progress) { + Status ret = sock->Connect(remote); + if (ret.ok()) { + VLOG(3) << "StartConnect: connect finished immediately for " << remote.ToString(); + *in_progress = false; // connect() finished immediately. 
+ return ret; + } + + int posix_code = ret.posix_code(); + if (Socket::IsTemporarySocketError(posix_code) || (posix_code == EINPROGRESS)) { + // The connect operation is in progress. + *in_progress = true; + VLOG(3) << "StartConnect: connect in progress for " << remote.ToString(); + return Status::OK(); + } else { + LOG(WARNING) << "failed to create an outbound connection to " << remote.ToString() + << " because connect failed: " << ret.ToString(); + return ret; + } +} + +void ReactorThread::DestroyConnection(Connection *conn, + const Status &conn_status) { + DCHECK(IsCurrentThread()); + + conn->Shutdown(conn_status); + + // Unlink connection from lists. + if (conn->direction() == Connection::CLIENT) { + ConnectionId conn_id(conn->remote(), conn->user_credentials()); + auto it = client_conns_.find(conn_id); + CHECK(it != client_conns_.end()) << "Couldn't find connection " << conn->ToString(); + client_conns_.erase(it); + } else if (conn->direction() == Connection::SERVER) { + auto it = server_conns_.begin(); + while (it != server_conns_.end()) { + if ((*it).get() == conn) { + server_conns_.erase(it); + break; + } + ++it; + } + } +} + +DelayedTask::DelayedTask(boost::function func, + MonoDelta when) + : func_(std::move(func)), when_(std::move(when)), thread_(nullptr) {} + +void DelayedTask::Run(ReactorThread* thread) { + DCHECK(thread_ == nullptr) << "Task has already been scheduled"; + DCHECK(thread->IsCurrentThread()); + + // Schedule the task to run later. + thread_ = thread; + timer_.set(thread->loop_); + timer_.set(this); + timer_.start(when_.ToSeconds(), // after + 0); // repeat + thread_->scheduled_tasks_.insert(this); +} + +void DelayedTask::Abort(const Status& abort_status) { + func_(abort_status); + delete this; +} + +void DelayedTask::TimerHandler(ev::timer& watcher, int revents) { + // We will free this task's memory. 
+ thread_->scheduled_tasks_.erase(this); + + if (EV_ERROR & revents) { + string msg = "Delayed task got an error in its timer handler"; + LOG(WARNING) << msg; + Abort(Status::Aborted(msg)); // Will delete 'this'. + } else { + func_(Status::OK()); + delete this; + } +} + +Reactor::Reactor(const shared_ptr& messenger, + int index, const MessengerBuilder &bld) + : messenger_(messenger), + name_(StringPrintf("%s_R%03d", messenger->name().c_str(), index)), + closing_(false), + thread_(this, bld) { +} + +Status Reactor::Init() { + DVLOG(6) << "Called Reactor::Init()"; + return thread_.Init(); +} + +void Reactor::Shutdown() { + { + lock_guard l(&lock_); + if (closing_) { + return; + } + closing_ = true; + } + + thread_.Shutdown(); + + // Abort all pending tasks. No new tasks can get scheduled after this + // because ScheduleReactorTask() tests the closing_ flag set above. + Status aborted = ShutdownError(true); + while (!pending_tasks_.empty()) { + ReactorTask& task = pending_tasks_.front(); + pending_tasks_.pop_front(); + task.Abort(aborted); + } +} + +Reactor::~Reactor() { + Shutdown(); +} + +const std::string &Reactor::name() const { + return name_; +} + +bool Reactor::closing() const { + lock_guard l(&lock_); + return closing_; +} + +// Task to call an arbitrary function within the reactor thread. +class RunFunctionTask : public ReactorTask { + public: + explicit RunFunctionTask(boost::function f) + : function_(std::move(f)), latch_(1) {} + + virtual void Run(ReactorThread *reactor) OVERRIDE { + status_ = function_(); + latch_.CountDown(); + } + virtual void Abort(const Status &status) OVERRIDE { + status_ = status; + latch_.CountDown(); + } + + // Wait until the function has completed, and return the Status + // returned by the function. 
+ Status Wait() { + latch_.Wait(); + return status_; + } + + private: + boost::function function_; + Status status_; + CountDownLatch latch_; +}; + +Status Reactor::GetMetrics(ReactorMetrics *metrics) { + return RunOnReactorThread(boost::bind(&ReactorThread::GetMetrics, &thread_, metrics)); +} + +Status Reactor::RunOnReactorThread(const boost::function& f) { + RunFunctionTask task(f); + ScheduleReactorTask(&task); + return task.Wait(); +} + +Status Reactor::DumpRunningRpcs(const DumpRunningRpcsRequestPB& req, + DumpRunningRpcsResponsePB* resp) { + return RunOnReactorThread(boost::bind(&ReactorThread::DumpRunningRpcs, &thread_, + boost::ref(req), resp)); +} + +class RegisterConnectionTask : public ReactorTask { + public: + explicit RegisterConnectionTask(const scoped_refptr& conn) : + conn_(conn) + {} + + virtual void Run(ReactorThread *thread) OVERRIDE { + thread->RegisterConnection(conn_); + delete this; + } + + virtual void Abort(const Status &status) OVERRIDE { + // We don't need to Shutdown the connection since it was never registered. + // This is only used for inbound connections, and inbound connections will + // never have any calls added to them until they've been registered. + delete this; + } + + private: + scoped_refptr conn_; +}; + +void Reactor::RegisterInboundSocket(Socket *socket, const Sockaddr &remote) { + VLOG(3) << name_ << ": new inbound connection to " << remote.ToString(); + scoped_refptr conn( + new Connection(&thread_, remote, socket->Release(), Connection::SERVER)); + auto task = new RegisterConnectionTask(conn); + ScheduleReactorTask(task); +} + +// Task which runs in the reactor thread to assign an outbound call +// to a connection. 
+class AssignOutboundCallTask : public ReactorTask { + public: + explicit AssignOutboundCallTask(shared_ptr call) + : call_(std::move(call)) {} + + virtual void Run(ReactorThread *reactor) OVERRIDE { + reactor->AssignOutboundCall(call_); + delete this; + } + + virtual void Abort(const Status &status) OVERRIDE { + call_->SetFailed(status); + delete this; + } + + private: + shared_ptr call_; +}; + +void Reactor::QueueOutboundCall(const shared_ptr &call) { + DVLOG(3) << name_ << ": queueing outbound call " + << call->ToString() << " to remote " << call->conn_id().remote().ToString(); + AssignOutboundCallTask *task = new AssignOutboundCallTask(call); + ScheduleReactorTask(task); +} + +void Reactor::ScheduleReactorTask(ReactorTask *task) { + { + unique_lock l(&lock_); + if (closing_) { + // We guarantee the reactor lock is not taken when calling Abort(). + l.unlock(); + task->Abort(ShutdownError(false)); + return; + } + pending_tasks_.push_back(*task); + } + thread_.WakeThread(); +} + +bool Reactor::DrainTaskQueue(boost::intrusive::list *tasks) { // NOLINT(*) + lock_guard l(&lock_); + if (closing_) { + return false; + } + tasks->swap(pending_tasks_); + return true; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/reactor.h b/src/kudu/rpc/reactor.h new file mode 100644 index 000000000000..008b952331aa --- /dev/null +++ b/src/kudu/rpc/reactor.h @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_REACTOR_H +#define KUDU_RPC_REACTOR_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/connection.h" +#include "kudu/rpc/transfer.h" +#include "kudu/util/thread.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace rpc { + +typedef std::list > conn_list_t; + +class DumpRunningRpcsRequestPB; +class DumpRunningRpcsResponsePB; +class Messenger; +class MessengerBuilder; +class Reactor; + +// Simple metrics information from within a reactor. +struct ReactorMetrics { + // Number of client RPC connections currently connected. + int32_t num_client_connections_; + // Number of server RPC connections currently connected. + int32_t num_server_connections_; +}; + +// A task which can be enqueued to run on the reactor thread. +class ReactorTask : public boost::intrusive::list_base_hook<> { + public: + ReactorTask(); + + // Run the task. 'reactor' is guaranteed to be the current thread. + virtual void Run(ReactorThread *reactor) = 0; + + // Abort the task, in the case that the reactor shut down before the + // task could be processed. This may or may not run on the reactor thread + // itself. + // + // The Reactor guarantees that the Reactor lock is free when this + // method is called. 
+ virtual void Abort(const Status &abort_status) {} + + virtual ~ReactorTask(); + + private: + DISALLOW_COPY_AND_ASSIGN(ReactorTask); +}; + +// A ReactorTask that is scheduled to run at some point in the future. +// +// Semantically it works like RunFunctionTask with a few key differences: +// 1. The user function is called during Abort. Put another way, the +// user function is _always_ invoked, even during reactor shutdown. +// 2. To differentiate between Abort and non-Abort, the user function +// receives a Status as its first argument. +class DelayedTask : public ReactorTask { + public: + DelayedTask(boost::function func, MonoDelta when); + + // Schedules the task for running later but doesn't actually run it yet. + virtual void Run(ReactorThread* reactor) OVERRIDE; + + // Behaves like ReactorTask::Abort. + virtual void Abort(const Status& abort_status) OVERRIDE; + + private: + // libev callback for when the registered timer fires. + void TimerHandler(ev::timer& watcher, int revents); + + // User function to invoke when timer fires or when task is aborted. + const boost::function func_; + + // Delay to apply to this task. + const MonoDelta when_; + + // Link back to registering reactor thread. + ReactorThread* thread_; + + // libev timer. Set when Run() is invoked. + ev::timer timer_; +}; + +// A ReactorThread is a libev event handler thread which manages I/O +// on a list of sockets. +// +// All methods in this class are _only_ called from the reactor thread itself +// except where otherwise specified. New methods should DCHECK(IsCurrentThread()) +// to ensure this. +class ReactorThread { + public: + friend class Connection; + + // Client-side connection map. + typedef std::unordered_map, + ConnectionIdHash, ConnectionIdEqual> conn_map_t; + + ReactorThread(Reactor *reactor, const MessengerBuilder &bld); + + // This may be called from another thread. + Status Init(); + + // Add any connections on this reactor thread into the given status dump. 
+ // May be called from another thread. + Status DumpRunningRpcs(const DumpRunningRpcsRequestPB& req, + DumpRunningRpcsResponsePB* resp); + + // Block until the Reactor thread is shut down + // + // This must be called from another thread. + void Shutdown(); + + // This method is thread-safe. + void WakeThread(); + + // libev callback for handling async notifications in our epoll thread. + void AsyncHandler(ev::async &watcher, int revents); + + // libev callback for handling timer events in our epoll thread. + void TimerHandler(ev::timer &watcher, int revents); + + // Register an epoll timer watcher with our event loop. + // Does not set a timeout or start it. + void RegisterTimeout(ev::timer *watcher); + + // This may be called from another thread. + const std::string &name() const; + + MonoTime cur_time() const; + + // This may be called from another thread. + Reactor *reactor(); + + // Return true if this reactor thread is the thread currently + // running. Should be used in DCHECK assertions. + bool IsCurrentThread() const; + + // Begin the process of connection negotiation. + // Must be called from the reactor thread. + // Deadline specifies latest time negotiation may complete before timeout. + Status StartConnectionNegotiation(const scoped_refptr& conn, + const MonoTime& deadline); + + // Transition back from negotiating to processing requests. + // Must be called from the reactor thread. + void CompleteConnectionNegotiation(const scoped_refptr& conn, + const Status& status); + + // Collect metrics. + // Must be called from the reactor thread. + Status GetMetrics(ReactorMetrics *metrics); + + private: + friend class AssignOutboundCallTask; + friend class RegisterConnectionTask; + friend class DelayedTask; + + // Run the main event loop of the reactor. + void RunThread(); + + // Find or create a new connection to the given remote. + // If such a connection already exists, returns that, otherwise creates a new one. 
+ // May return a bad Status if the connect() call fails. + // The resulting connection object is managed internally by the reactor thread. + // Deadline specifies latest time allowed for initializing the connection. + Status FindOrStartConnection(const ConnectionId& conn_id, + scoped_refptr* conn, + const MonoTime& deadline); + + // Shut down the given connection, removing it from the connection tracking + // structures of this reactor. + // + // The connection is not explicitly deleted -- shared_ptr reference counting + // may hold on to the object after this, but callers should assume that it + // _may_ be deleted by this call. + void DestroyConnection(Connection *conn, const Status &conn_status); + + // Scan any open connections for idle ones that have been idle longer than + // connection_keepalive_time_ + void ScanIdleConnections(); + + // Create a new client socket (non-blocking, NODELAY) + static Status CreateClientSocket(Socket *sock); + + // Initiate a new connection on the given socket, setting *in_progress + // to true if the connection is still pending upon return. + static Status StartConnect(Socket *sock, const Sockaddr &remote, bool *in_progress); + + // Assign a new outbound call to the appropriate connection object. + // If this fails, the call is marked failed and completed. + void AssignOutboundCall(const std::shared_ptr &call); + + // Register a new connection. + void RegisterConnection(const scoped_refptr& conn); + + // Actually perform shutdown of the thread, tearing down any connections, + // etc. This is called from within the thread. + void ShutdownInternal(); + + scoped_refptr thread_; + + // our epoll object (or kqueue, etc). + ev::dynamic_loop loop_; + + // Used by other threads to notify the reactor thread + ev::async async_; + + // Handles the periodic timer. + ev::timer timer_; + + // Scheduled (but not yet run) delayed tasks. 
+ // + // Each task owns its own memory and must be freed by its TaskRun and + // Abort members, provided it was allocated on the heap. + std::set scheduled_tasks_; + + // The current monotonic time. Updated every coarse_timer_granularity_secs_. + MonoTime cur_time_; + + // last time we did TCP timeouts. + MonoTime last_unused_tcp_scan_; + + // Map of sockaddrs to Connection objects for outbound (client) connections. + conn_map_t client_conns_; + + // List of current connections coming into the server. + conn_list_t server_conns_; + + Reactor *reactor_; + + // If a connection has been idle for this much time, it is torn down. + const MonoDelta connection_keepalive_time_; + + // Scan for idle connections on this granularity. + const MonoDelta coarse_timer_granularity_; +}; + +// A Reactor manages a ReactorThread +class Reactor { + public: + Reactor(const std::shared_ptr& messenger, + int index, + const MessengerBuilder &bld); + Status Init(); + + // Block until the Reactor is shut down + void Shutdown(); + + ~Reactor(); + + const std::string &name() const; + + // Collect metrics about the reactor. + Status GetMetrics(ReactorMetrics *metrics); + + // Add any connections on this reactor thread into the given status dump. + Status DumpRunningRpcs(const DumpRunningRpcsRequestPB& req, + DumpRunningRpcsResponsePB* resp); + + // Queue a new incoming connection. Takes ownership of the underlying fd from + // 'socket', but not the Socket object itself. + // If the reactor is already shut down, takes care of closing the socket. + void RegisterInboundSocket(Socket *socket, const Sockaddr &remote); + + // Queue a new call to be sent. If the reactor is already shut down, marks + // the call as failed. + void QueueOutboundCall(const std::shared_ptr &call); + + // Schedule the given task's Run() method to be called on the + // reactor thread. + // If the reactor shuts down before it is run, the Abort method will be + // called. 
+ // Does _not_ take ownership of 'task' -- the task should take care of + // deleting itself after running if it is allocated on the heap. + void ScheduleReactorTask(ReactorTask *task); + + Status RunOnReactorThread(const boost::function& f); + + // If the Reactor is closing, returns false. + // Otherwise, drains the pending_tasks_ queue into the provided list. + bool DrainTaskQueue(boost::intrusive::list *tasks); + + Messenger *messenger() const { + return messenger_.get(); + } + + // Indicates whether the reactor is shutting down. + // + // This method is thread-safe. + bool closing() const; + + // Is this reactor's thread the current thread? + bool IsCurrentThread() const { + return thread_.IsCurrentThread(); + } + + private: + friend class ReactorThread; + typedef simple_spinlock LockType; + mutable LockType lock_; + + // parent messenger + std::shared_ptr messenger_; + + const std::string name_; + + // Whether the reactor is shutting down. + // Guarded by lock_. + bool closing_; + + // Tasks to be run within the reactor thread. + // Guarded by lock_. + boost::intrusive::list pending_tasks_; // NOLINT(build/include_what_you_use) + + ReactorThread thread_; + + DISALLOW_COPY_AND_ASSIGN(Reactor); +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/remote_method.cc b/src/kudu/rpc/remote_method.cc new file mode 100644 index 000000000000..32ec40dae402 --- /dev/null +++ b/src/kudu/rpc/remote_method.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <string> + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/remote_method.h" +#include "kudu/rpc/rpc_header.pb.h" + +namespace kudu { +namespace rpc { + +using strings::Substitute; + +RemoteMethod::RemoteMethod(std::string service_name, + const std::string method_name) + : service_name_(std::move(service_name)), method_name_(method_name) {} + +void RemoteMethod::FromPB(const RemoteMethodPB& pb) { + DCHECK(pb.IsInitialized()) << "PB is uninitialized: " << pb.InitializationErrorString(); + service_name_ = pb.service_name(); + method_name_ = pb.method_name(); +} + +void RemoteMethod::ToPB(RemoteMethodPB* pb) const { + pb->set_service_name(service_name_); + pb->set_method_name(method_name_); +} + +string RemoteMethod::ToString() const { + return Substitute("$0.$1", service_name_, method_name_); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/remote_method.h b/src/kudu/rpc/remote_method.h new file mode 100644 index 000000000000..5b78dad4d7b6 --- /dev/null +++ b/src/kudu/rpc/remote_method.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_REMOTE_METHOD_H_ +#define KUDU_RPC_REMOTE_METHOD_H_ + +#include <string> + +namespace kudu { +namespace rpc { + +class RemoteMethodPB; + +// Simple class that acts as a container for a fully qualified remote RPC name +// and converts to/from RemoteMethodPB. +// This class is also copyable and assignable for convenience reasons. +class RemoteMethod { + public: + RemoteMethod() {} + RemoteMethod(std::string service_name, const std::string method_name); + std::string service_name() const { return service_name_; } + std::string method_name() const { return method_name_; } + + // Encode/decode to/from 'pb'. + void FromPB(const RemoteMethodPB& pb); + void ToPB(RemoteMethodPB* pb) const; + + std::string ToString() const; + + private: + std::string service_name_; + std::string method_name_; +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_REMOTE_METHOD_H_ diff --git a/src/kudu/rpc/response_callback.h b/src/kudu/rpc/response_callback.h new file mode 100644 index 000000000000..8c4fc038e2e3 --- /dev/null +++ b/src/kudu/rpc/response_callback.h @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_RESPONSE_CALLBACK_H +#define KUDU_RPC_RESPONSE_CALLBACK_H + +#include <boost/function.hpp> + +namespace kudu { +namespace rpc { + +typedef boost::function<void()> ResponseCallback; + +} +} + +#endif diff --git a/src/kudu/rpc/rpc-bench.cc b/src/kudu/rpc/rpc-bench.cc new file mode 100644 index 000000000000..34376b2741e8 --- /dev/null +++ b/src/kudu/rpc/rpc-bench.cc @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/rpc/rpc-test-base.h" +#include "kudu/rpc/rtest.proxy.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::shared_ptr; + +namespace kudu { +namespace rpc { + +class RpcBench : public RpcTestBase { + public: + RpcBench() + : should_run_(true) + {} + + protected: + friend class ClientThread; + + Sockaddr server_addr_; + shared_ptr client_messenger_; + Atomic32 should_run_; +}; + +class ClientThread { + public: + explicit ClientThread(RpcBench *bench) + : bench_(bench), + request_count_(0) { + } + + void Start() { + thread_.reset(new boost::thread(&ClientThread::Run, this)); + } + + void Join() { + thread_->join(); + } + + void Run() { + shared_ptr client_messenger = bench_->CreateMessenger("Client"); + + CalculatorServiceProxy p(client_messenger, bench_->server_addr_); + + AddRequestPB req; + AddResponsePB resp; + while (Acquire_Load(&bench_->should_run_)) { + req.set_x(request_count_); + req.set_y(request_count_); + RpcController controller; + controller.set_timeout(MonoDelta::FromSeconds(10)); + CHECK_OK(p.Add(req, &resp, &controller)); + CHECK_EQ(req.x() + req.y(), resp.result()); + request_count_++; + } + } + + gscoped_ptr thread_; + RpcBench *bench_; + int request_count_; +}; + + +// Test making successful RPC calls. +TEST_F(RpcBench, BenchmarkCalls) { + n_worker_threads_ = 1; + + // Set up server. + StartTestServerWithGeneratedCode(&server_addr_); + + // Set up client. + LOG(INFO) << "Connecting to " << server_addr_.ToString(); + client_messenger_ = CreateMessenger("Client", 2); + + Stopwatch sw(Stopwatch::ALL_THREADS); + sw.start(); + + boost::ptr_vector threads; + for (int i = 0; i < 16; i++) { + auto thr = new ClientThread(this); + thr->Start(); + threads.push_back(thr); + } + + SleepFor(MonoDelta::FromSeconds(AllowSlowTests() ? 
10 : 1)); + Release_Store(&should_run_, false); + + int total_reqs = 0; + + for (ClientThread &thr : threads) { + thr.Join(); + total_reqs += thr.request_count_; + } + sw.stop(); + + float reqs_per_second = static_cast(total_reqs / sw.elapsed().wall_seconds()); + float user_cpu_micros_per_req = static_cast(sw.elapsed().user / 1000.0 / total_reqs); + float sys_cpu_micros_per_req = static_cast(sw.elapsed().system / 1000.0 / total_reqs); + + LOG(INFO) << "Reqs/sec: " << reqs_per_second; + LOG(INFO) << "User CPU per req: " << user_cpu_micros_per_req << "us"; + LOG(INFO) << "Sys CPU per req: " << sys_cpu_micros_per_req << "us"; +} + +} // namespace rpc +} // namespace kudu + diff --git a/src/kudu/rpc/rpc-test-base.h b/src/kudu/rpc/rpc-test-base.h new file mode 100644 index 000000000000..cf02fa486184 --- /dev/null +++ b/src/kudu/rpc/rpc-test-base.h @@ -0,0 +1,427 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_RPC_RPC_TEST_BASE_H +#define KUDU_RPC_RPC_TEST_BASE_H + +#include +#include +#include +#include + +#include "kudu/rpc/acceptor_pool.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/proxy.h" +#include "kudu/rpc/reactor.h" +#include "kudu/rpc/remote_method.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/rpc/rpc_sidecar.h" +#include "kudu/rpc/rtest.pb.h" +#include "kudu/rpc/rtest.proxy.h" +#include "kudu/rpc/rtest.service.h" +#include "kudu/rpc/service_if.h" +#include "kudu/rpc/service_pool.h" +#include "kudu/util/faststring.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/trace.h" + +namespace kudu { namespace rpc { + +using kudu::rpc_test::AddRequestPartialPB; +using kudu::rpc_test::AddRequestPB; +using kudu::rpc_test::AddResponsePB; +using kudu::rpc_test::CalculatorError; +using kudu::rpc_test::CalculatorServiceIf; +using kudu::rpc_test::CalculatorServiceProxy; +using kudu::rpc_test::EchoRequestPB; +using kudu::rpc_test::EchoResponsePB; +using kudu::rpc_test::PanicRequestPB; +using kudu::rpc_test::PanicResponsePB; +using kudu::rpc_test::SendTwoStringsRequestPB; +using kudu::rpc_test::SendTwoStringsResponsePB; +using kudu::rpc_test::SleepRequestPB; +using kudu::rpc_test::SleepResponsePB; +using kudu::rpc_test::WhoAmIRequestPB; +using kudu::rpc_test::WhoAmIResponsePB; +using kudu::rpc_test_diff_package::ReqDiffPackagePB; +using kudu::rpc_test_diff_package::RespDiffPackagePB; + +// Implementation of CalculatorService which just implements the generic +// RPC handler (no generated code). 
+class GenericCalculatorService : public ServiceIf { + public: + static const char *kFullServiceName; + static const char *kAddMethodName; + static const char *kSleepMethodName; + static const char *kSendTwoStringsMethodName; + + static const char* kFirstString; + static const char* kSecondString; + + GenericCalculatorService() { + } + + // To match the argument list of the generated CalculatorService. + explicit GenericCalculatorService(const scoped_refptr& entity) { + // this test doesn't generate metrics, so we ignore the argument. + } + + virtual void Handle(InboundCall *incoming) OVERRIDE { + if (incoming->remote_method().method_name() == kAddMethodName) { + DoAdd(incoming); + } else if (incoming->remote_method().method_name() == kSleepMethodName) { + DoSleep(incoming); + } else if (incoming->remote_method().method_name() == kSendTwoStringsMethodName) { + DoSendTwoStrings(incoming); + } else { + incoming->RespondFailure(ErrorStatusPB::ERROR_NO_SUCH_METHOD, + Status::InvalidArgument("bad method")); + } + } + + std::string service_name() const OVERRIDE { return kFullServiceName; } + static std::string static_service_name() { return kFullServiceName; } + + private: + void DoAdd(InboundCall *incoming) { + Slice param(incoming->serialized_request()); + AddRequestPB req; + if (!req.ParseFromArray(param.data(), param.size())) { + LOG(FATAL) << "couldn't parse: " << param.ToDebugString(); + } + + AddResponsePB resp; + resp.set_result(req.x() + req.y()); + incoming->RespondSuccess(resp); + } + + void DoSendTwoStrings(InboundCall* incoming) { + Slice param(incoming->serialized_request()); + SendTwoStringsRequestPB req; + if (!req.ParseFromArray(param.data(), param.size())) { + LOG(FATAL) << "couldn't parse: " << param.ToDebugString(); + } + + gscoped_ptr first(new faststring); + gscoped_ptr second(new faststring); + + Random r(req.random_seed()); + first->resize(req.size1()); + RandomString(first->data(), req.size1(), &r); + + second->resize(req.size2()); + 
RandomString(second->data(), req.size2(), &r); + + SendTwoStringsResponsePB resp; + int idx1, idx2; + CHECK_OK(incoming->AddRpcSidecar( + make_gscoped_ptr(new RpcSidecar(first.Pass())), &idx1)); + CHECK_OK(incoming->AddRpcSidecar( + make_gscoped_ptr(new RpcSidecar(second.Pass())), &idx2)); + resp.set_sidecar1(idx1); + resp.set_sidecar2(idx2); + + incoming->RespondSuccess(resp); + } + + void DoSleep(InboundCall *incoming) { + Slice param(incoming->serialized_request()); + SleepRequestPB req; + if (!req.ParseFromArray(param.data(), param.size())) { + incoming->RespondFailure(ErrorStatusPB::ERROR_INVALID_REQUEST, + Status::InvalidArgument("Couldn't parse pb", + req.InitializationErrorString())); + return; + } + + LOG(INFO) << "got call: " << req.ShortDebugString(); + SleepFor(MonoDelta::FromMicroseconds(req.sleep_micros())); + SleepResponsePB resp; + incoming->RespondSuccess(resp); + } +}; + +class CalculatorService : public CalculatorServiceIf { + public: + explicit CalculatorService(const scoped_refptr& entity) + : CalculatorServiceIf(entity) { + } + + virtual void Add(const AddRequestPB *req, + AddResponsePB *resp, + RpcContext *context) OVERRIDE { + resp->set_result(req->x() + req->y()); + context->RespondSuccess(); + } + + virtual void Sleep(const SleepRequestPB *req, + SleepResponsePB *resp, + RpcContext *context) OVERRIDE { + if (req->return_app_error()) { + CalculatorError my_error; + my_error.set_extra_error_data("some application-specific error data"); + context->RespondApplicationError(CalculatorError::app_error_ext.number(), + "Got some error", my_error); + return; + } + + // Respond w/ error if the RPC specifies that the client deadline is set, + // but it isn't. 
+ if (req->client_timeout_defined()) { + MonoTime deadline = context->GetClientDeadline(); + if (deadline.Equals(MonoTime::Max())) { + CalculatorError my_error; + my_error.set_extra_error_data("Timeout not set"); + context->RespondApplicationError(CalculatorError::app_error_ext.number(), + "Missing required timeout", my_error); + return; + } + } + + if (req->deferred()) { + // Spawn a new thread which does the sleep and responds later. + scoped_refptr thread; + CHECK_OK(Thread::Create("rpc-test", "deferred", + &CalculatorService::DoSleep, this, req, context, + &thread)); + return; + } + DoSleep(req, context); + } + + virtual void Echo(const EchoRequestPB *req, + EchoResponsePB *resp, + RpcContext *context) OVERRIDE { + resp->set_data(req->data()); + context->RespondSuccess(); + } + + virtual void WhoAmI(const WhoAmIRequestPB* req, + WhoAmIResponsePB* resp, + RpcContext* context) OVERRIDE { + const UserCredentials& creds = context->user_credentials(); + if (creds.has_effective_user()) { + resp->mutable_credentials()->set_effective_user(creds.effective_user()); + } + resp->mutable_credentials()->set_real_user(creds.real_user()); + resp->set_address(context->remote_address().ToString()); + context->RespondSuccess(); + } + + virtual void TestArgumentsInDiffPackage(const ReqDiffPackagePB *req, + RespDiffPackagePB *resp, + ::kudu::rpc::RpcContext *context) OVERRIDE { + context->RespondSuccess(); + } + + virtual void Panic(const PanicRequestPB* req, + PanicResponsePB* resp, + RpcContext* context) OVERRIDE { + TRACE("Got panic request"); + PANIC_RPC(context, "Test method panicking!"); + } + + private: + void DoSleep(const SleepRequestPB *req, + RpcContext *context) { + SleepFor(MonoDelta::FromMicroseconds(req->sleep_micros())); + context->RespondSuccess(); + } + +}; + +const char *GenericCalculatorService::kFullServiceName = "kudu.rpc.GenericCalculatorService"; +const char *GenericCalculatorService::kAddMethodName = "Add"; +const char 
*GenericCalculatorService::kSleepMethodName = "Sleep"; +const char *GenericCalculatorService::kSendTwoStringsMethodName = "SendTwoStrings"; + +const char *GenericCalculatorService::kFirstString = + "1111111111111111111111111111111111111111111111111111111111"; +const char *GenericCalculatorService::kSecondString = + "2222222222222222222222222222222222222222222222222222222222222222222222"; + +class RpcTestBase : public KuduTest { + public: + RpcTestBase() + : n_worker_threads_(3), + n_server_reactor_threads_(3), + keepalive_time_ms_(1000), + metric_entity_(METRIC_ENTITY_server.Instantiate(&metric_registry_, "test.rpc_test")) { + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + } + + virtual void TearDown() OVERRIDE { + if (service_pool_) { + server_messenger_->UnregisterService(service_name_); + service_pool_->Shutdown(); + } + if (server_messenger_) { + server_messenger_->Shutdown(); + } + KuduTest::TearDown(); + } + + protected: + std::shared_ptr CreateMessenger(const string &name, + int n_reactors = 1) { + MessengerBuilder bld(name); + bld.set_num_reactors(n_reactors); + bld.set_connection_keepalive_time( + MonoDelta::FromMilliseconds(keepalive_time_ms_)); + bld.set_coarse_timer_granularity(MonoDelta::FromMilliseconds( + std::min(keepalive_time_ms_, 100))); + bld.set_metric_entity(metric_entity_); + std::shared_ptr messenger; + CHECK_OK(bld.Build(&messenger)); + return messenger; + } + + Status DoTestSyncCall(const Proxy &p, const char *method) { + AddRequestPB req; + req.set_x(rand()); + req.set_y(rand()); + AddResponsePB resp; + RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds(10000)); + RETURN_NOT_OK(p.SyncRequest(method, req, &resp, &controller)); + + LOG(INFO) << "Result: " << resp.ShortDebugString(); + CHECK_EQ(req.x() + req.y(), resp.result()); + return Status::OK(); + } + + void DoTestSidecar(const Proxy &p, int size1, int size2) { + const uint32_t kSeed = 12345; + + SendTwoStringsRequestPB req; + 
req.set_size1(size1); + req.set_size2(size2); + req.set_random_seed(kSeed); + + SendTwoStringsResponsePB resp; + RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds(10000)); + CHECK_OK(p.SyncRequest(GenericCalculatorService::kSendTwoStringsMethodName, + req, &resp, &controller)); + + Slice first = GetSidecarPointer(controller, resp.sidecar1(), size1); + Slice second = GetSidecarPointer(controller, resp.sidecar2(), size2); + + Random rng(kSeed); + faststring expected; + + expected.resize(size1); + RandomString(expected.data(), size1, &rng); + CHECK_EQ(0, first.compare(Slice(expected))); + + expected.resize(size2); + RandomString(expected.data(), size2, &rng); + CHECK_EQ(0, second.compare(Slice(expected))); + } + + void DoTestExpectTimeout(const Proxy &p, const MonoDelta &timeout) { + SleepRequestPB req; + SleepResponsePB resp; + req.set_sleep_micros(500000); // 0.5sec + + RpcController c; + c.set_timeout(timeout); + Stopwatch sw; + sw.start(); + Status s = p.SyncRequest(GenericCalculatorService::kSleepMethodName, req, &resp, &c); + ASSERT_FALSE(s.ok()); + sw.stop(); + + int expected_millis = timeout.ToMilliseconds(); + int elapsed_millis = sw.elapsed().wall_millis(); + + // We shouldn't timeout significantly faster than our configured timeout. + EXPECT_GE(elapsed_millis, expected_millis - 10); + // And we also shouldn't take the full 0.5sec that we asked for + EXPECT_LT(elapsed_millis, 500); + EXPECT_TRUE(s.IsTimedOut()); + LOG(INFO) << "status: " << s.ToString() << ", seconds elapsed: " << sw.elapsed().wall_seconds(); + } + + void StartTestServer(Sockaddr *server_addr) { + DoStartTestServer(server_addr); + } + + void StartTestServerWithGeneratedCode(Sockaddr *server_addr) { + DoStartTestServer(server_addr); + } + + // Start a simple socket listening on a local port, returning the address. + // This isn't an RPC server -- just a plain socket which can be helpful for testing. 
+ Status StartFakeServer(Socket *listen_sock, Sockaddr *listen_addr) { + Sockaddr bind_addr; + bind_addr.set_port(0); + RETURN_NOT_OK(listen_sock->Init(0)); + RETURN_NOT_OK(listen_sock->BindAndListen(bind_addr, 1)); + RETURN_NOT_OK(listen_sock->GetSocketAddress(listen_addr)); + LOG(INFO) << "Bound to: " << listen_addr->ToString(); + return Status::OK(); + } + + private: + + static Slice GetSidecarPointer(const RpcController& controller, int idx, + int expected_size) { + Slice sidecar; + CHECK_OK(controller.GetSidecar(idx, &sidecar)); + CHECK_EQ(expected_size, sidecar.size()); + return Slice(sidecar.data(), expected_size); + } + + template + void DoStartTestServer(Sockaddr *server_addr) { + server_messenger_ = CreateMessenger("TestServer", n_server_reactor_threads_); + std::shared_ptr pool; + ASSERT_OK(server_messenger_->AddAcceptorPool(Sockaddr(), &pool)); + ASSERT_OK(pool->Start(2)); + *server_addr = pool->bind_address(); + + gscoped_ptr service(new ServiceClass(metric_entity_)); + service_name_ = service->service_name(); + scoped_refptr metric_entity = server_messenger_->metric_entity(); + service_pool_ = new ServicePool(service.Pass(), metric_entity, 50); + server_messenger_->RegisterService(service_name_, service_pool_); + ASSERT_OK(service_pool_->Init(n_worker_threads_)); + } + + protected: + string service_name_; + std::shared_ptr server_messenger_; + scoped_refptr service_pool_; + int n_worker_threads_; + int n_server_reactor_threads_; + int keepalive_time_ms_; + + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; +}; + + +} // namespace rpc +} // namespace kudu +#endif diff --git a/src/kudu/rpc/rpc-test.cc b/src/kudu/rpc/rpc-test.cc new file mode 100644 index 000000000000..03928af55931 --- /dev/null +++ b/src/kudu/rpc/rpc-test.cc @@ -0,0 +1,515 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/rpc-test-base.h" + +#include +#include +#include + +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/rpc/serialization.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/env.h" +#include "kudu/util/test_util.h" + +METRIC_DECLARE_histogram(handler_latency_kudu_rpc_test_CalculatorService_Sleep); +METRIC_DECLARE_histogram(rpc_incoming_queue_time); + +using std::string; +using std::shared_ptr; +using std::unordered_map; + +namespace kudu { +namespace rpc { + +class TestRpc : public RpcTestBase { +}; + +TEST_F(TestRpc, TestSockaddr) { + Sockaddr addr1, addr2; + addr1.set_port(1000); + addr2.set_port(2000); + // port is ignored when comparing Sockaddr objects + ASSERT_FALSE(addr1 < addr2); + ASSERT_FALSE(addr2 < addr1); + ASSERT_EQ(1000, addr1.port()); + ASSERT_EQ(2000, addr2.port()); + ASSERT_EQ(string("0.0.0.0:1000"), addr1.ToString()); + ASSERT_EQ(string("0.0.0.0:2000"), addr2.ToString()); + Sockaddr addr3(addr1); + ASSERT_EQ(string("0.0.0.0:1000"), addr3.ToString()); +} + +TEST_F(TestRpc, TestMessengerCreateDestroy) { + shared_ptr messenger(CreateMessenger("TestCreateDestroy")); + LOG(INFO) << "started messenger " << messenger->name(); + messenger->Shutdown(); +} + +// Test starting 
and stopping a messenger. This is a regression +// test for a segfault seen in early versions of the RPC code, +// in which shutting down the acceptor would trigger an assert, +// making our tests flaky. +TEST_F(TestRpc, TestAcceptorPoolStartStop) { + int n_iters = AllowSlowTests() ? 100 : 5; + for (int i = 0; i < n_iters; i++) { + shared_ptr messenger(CreateMessenger("TestAcceptorPoolStartStop")); + shared_ptr pool; + ASSERT_OK(messenger->AddAcceptorPool(Sockaddr(), &pool)); + Sockaddr bound_addr; + ASSERT_OK(pool->GetBoundAddress(&bound_addr)); + ASSERT_NE(0, bound_addr.port()); + ASSERT_OK(pool->Start(2)); + messenger->Shutdown(); + } +} + +TEST_F(TestRpc, TestConnHeaderValidation) { + MessengerBuilder mb("TestRpc.TestConnHeaderValidation"); + const int conn_hdr_len = kMagicNumberLength + kHeaderFlagsLength; + uint8_t buf[conn_hdr_len]; + serialization::SerializeConnHeader(buf); + ASSERT_OK(serialization::ValidateConnHeader(Slice(buf, conn_hdr_len))); +} + +// Test making successful RPC calls. +TEST_F(TestRpc, TestCall) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Set up client. + LOG(INFO) << "Connecting to " << server_addr.ToString(); + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(DoTestSyncCall(p, GenericCalculatorService::kAddMethodName)); + } +} + +// Test that connecting to an invalid server properly throws an error. +TEST_F(TestRpc, TestCallToBadServer) { + shared_ptr client_messenger(CreateMessenger("Client")); + Sockaddr addr; + addr.set_port(0); + Proxy p(client_messenger, addr, GenericCalculatorService::static_service_name()); + + // Loop a few calls to make sure that we properly set up and tear down + // the connections. 
+ for (int i = 0; i < 5; i++) { + Status s = DoTestSyncCall(p, GenericCalculatorService::kAddMethodName); + LOG(INFO) << "Status: " << s.ToString(); + ASSERT_TRUE(s.IsNetworkError()) << "unexpected status: " << s.ToString(); + } +} + +// Test that RPC calls can be failed with an error status on the server. +TEST_F(TestRpc, TestInvalidMethodCall) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Set up client. + LOG(INFO) << "Connecting to " << server_addr.ToString(); + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + // Call the method which fails. + Status s = DoTestSyncCall(p, "ThisMethodDoesNotExist"); + ASSERT_TRUE(s.IsRemoteError()) << "unexpected status: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "bad method"); +} + +// Test that the error message returned when connecting to the wrong service +// is reasonable. +TEST_F(TestRpc, TestWrongService) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Set up client with the wrong service name. + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, "WrongServiceName"); + + // Call the method which fails. + Status s = DoTestSyncCall(p, "ThisMethodDoesNotExist"); + ASSERT_TRUE(s.IsRemoteError()) << "unexpected status: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), + "Service unavailable: service WrongServiceName " + "not registered on TestServer"); +} + +namespace { +int GetOpenFileLimit() { + struct rlimit limit; + PCHECK(getrlimit(RLIMIT_NOFILE, &limit) == 0); + return limit.rlim_cur; +} +} // anonymous namespace + +// Test that we can still make RPC connections even if many fds are in use. +// This is a regression test for KUDU-650. +TEST_F(TestRpc, TestHighFDs) { + // This test can only run if ulimit is set high. 
+ const int kNumFakeFiles = 3500; + const int kMinUlimit = kNumFakeFiles + 100; + if (GetOpenFileLimit() < kMinUlimit) { + LOG(INFO) << "Test skipped: must increase ulimit -n to at least " << kMinUlimit; + return; + } + + // Open a bunch of fds just to increase our fd count. + vector fake_files; + ElementDeleter d(&fake_files); + for (int i = 0; i < kNumFakeFiles; i++) { + gscoped_ptr f; + CHECK_OK(Env::Default()->NewRandomAccessFile("/dev/zero", &f)); + fake_files.push_back(f.release()); + } + + // Set up server and client, and verify we can make a successful call. + Sockaddr server_addr; + StartTestServer(&server_addr); + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + ASSERT_OK(DoTestSyncCall(p, GenericCalculatorService::kAddMethodName)); +} + +// Test that connections are kept alive between calls. +TEST_F(TestRpc, TestConnectionKeepalive) { + // Only run one reactor per messenger, so we can grab the metrics from that + // one without having to check all. + n_server_reactor_threads_ = 1; + keepalive_time_ms_ = 50; + + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Set up client. 
+ LOG(INFO) << "Connecting to " << server_addr.ToString(); + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + ASSERT_OK(DoTestSyncCall(p, GenericCalculatorService::kAddMethodName)); + + SleepFor(MonoDelta::FromMilliseconds(5)); + + ReactorMetrics metrics; + ASSERT_OK(server_messenger_->reactors_[0]->GetMetrics(&metrics)); + ASSERT_EQ(1, metrics.num_server_connections_) << "Server should have 1 server connection"; + ASSERT_EQ(0, metrics.num_client_connections_) << "Server should have 0 client connections"; + + ASSERT_OK(client_messenger->reactors_[0]->GetMetrics(&metrics)); + ASSERT_EQ(0, metrics.num_server_connections_) << "Client should have 0 server connections"; + ASSERT_EQ(1, metrics.num_client_connections_) << "Client should have 1 client connections"; + + SleepFor(MonoDelta::FromMilliseconds(100)); + + // After sleeping, the keepalive timer should have closed both sides of + // the connection. + ASSERT_OK(server_messenger_->reactors_[0]->GetMetrics(&metrics)); + ASSERT_EQ(0, metrics.num_server_connections_) << "Server should have 0 server connections"; + ASSERT_EQ(0, metrics.num_client_connections_) << "Server should have 0 client connections"; + + ASSERT_OK(client_messenger->reactors_[0]->GetMetrics(&metrics)); + ASSERT_EQ(0, metrics.num_server_connections_) << "Client should have 0 server connections"; + ASSERT_EQ(0, metrics.num_client_connections_) << "Client should have 0 client connections"; +} + +// Test that a call which takes longer than the keepalive time +// succeeds -- i.e that we don't consider a connection to be "idle" on the +// server if there is a call outstanding on it. +TEST_F(TestRpc, TestCallLongerThanKeepalive) { + // set very short keepalive + keepalive_time_ms_ = 50; + + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Set up client. 
+ shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + // Make a call which sleeps longer than the keepalive. + RpcController controller; + SleepRequestPB req; + req.set_sleep_micros(100 * 1000); + req.set_deferred(true); + SleepResponsePB resp; + ASSERT_OK(p.SyncRequest(GenericCalculatorService::kSleepMethodName, + req, &resp, &controller)); +} + +// Test that the RpcSidecar transfers the expected messages. +TEST_F(TestRpc, TestRpcSidecar) { + // Set up server. + Sockaddr server_addr; + StartTestServer(&server_addr); + + // Set up client. + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + // Test some small sidecars + DoTestSidecar(p, 123, 456); + + // Test some larger sidecars to verify that we properly handle the case where + // we can't write the whole response to the socket in a single call. + DoTestSidecar(p, 3000 * 1024, 2000 * 1024); +} + +// Test that timeouts are properly handled. +TEST_F(TestRpc, TestCallTimeout) { + Sockaddr server_addr; + StartTestServer(&server_addr); + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + // Test a very short timeout - we expect this will time out while the + // call is still trying to connect, or in the send queue. This was triggering ASAN failures + // before. + ASSERT_NO_FATAL_FAILURE(DoTestExpectTimeout(p, MonoDelta::FromNanoseconds(1))); + + // Test a longer timeout - expect this will time out after we send the request. + ASSERT_NO_FATAL_FAILURE(DoTestExpectTimeout(p, MonoDelta::FromMilliseconds(10))); +} + +static void AcceptAndReadForever(Socket* listen_sock) { + // Accept the TCP connection. 
+ Socket server_sock; + Sockaddr remote; + CHECK_OK(listen_sock->Accept(&server_sock, &remote, 0)); + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(10)); + + size_t nread; + uint8_t buf[1024]; + while (server_sock.BlockingRecv(buf, sizeof(buf), &nread, deadline).ok()) { + } +} + +// Starts a fake listening socket which never actually negotiates. +// Ensures that the client gets a reasonable status code in this case. +TEST_F(TestRpc, TestNegotiationTimeout) { + // Set up a simple socket server which accepts a connection. + Sockaddr server_addr; + Socket listen_sock; + ASSERT_OK(StartFakeServer(&listen_sock, &server_addr)); + + // Create another thread to accept the connection on the fake server. + scoped_refptr acceptor_thread; + ASSERT_OK(Thread::Create("test", "acceptor", + AcceptAndReadForever, &listen_sock, + &acceptor_thread)); + + // Set up client. + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + ASSERT_NO_FATAL_FAILURE(DoTestExpectTimeout(p, MonoDelta::FromMilliseconds(100))); + + acceptor_thread->Join(); +} + +// Test that client calls get failed properly when the server they're connected to +// shuts down. +TEST_F(TestRpc, TestServerShutsDown) { + // Set up a simple socket server which accepts a connection. + Sockaddr server_addr; + Socket listen_sock; + ASSERT_OK(StartFakeServer(&listen_sock, &server_addr)); + + // Set up client. + LOG(INFO) << "Connecting to " << server_addr.ToString(); + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, GenericCalculatorService::static_service_name()); + + // Send a call. + AddRequestPB req; + req.set_x(rand()); + req.set_y(rand()); + AddResponsePB resp; + + boost::ptr_vector controllers; + + // We'll send several calls async, and ensure that they all + // get the error status when the connection drops. 
+ int n_calls = 5; + + CountDownLatch latch(n_calls); + for (int i = 0; i < n_calls; i++) { + auto controller = new RpcController(); + controllers.push_back(controller); + p.AsyncRequest(GenericCalculatorService::kAddMethodName, req, &resp, controller, + boost::bind(&CountDownLatch::CountDown, boost::ref(latch))); + } + + // Accept the TCP connection. + Socket server_sock; + Sockaddr remote; + ASSERT_OK(listen_sock.Accept(&server_sock, &remote, 0)); + + // The call is still in progress at this point. + for (const RpcController &controller : controllers) { + ASSERT_FALSE(controller.finished()); + } + + // Shut down the socket. + ASSERT_OK(listen_sock.Close()); + ASSERT_OK(server_sock.Close()); + + // Wait for the call to be marked finished. + latch.Wait(); + + // Should get the appropriate error on the client for all calls. + for (const RpcController &controller : controllers) { + ASSERT_TRUE(controller.finished()); + Status s = controller.status(); + ASSERT_TRUE(s.IsNetworkError()) << + "Unexpected status: " << s.ToString(); + + // Any of these errors could happen, depending on whether we were + // in the middle of sending a call while the connection died, or + // if we were already waiting for responses. + // + // ECONNREFUSED is possible because the sending of the calls is async. + // For example, the following interleaving: + // - Enqueue 3 calls + // - Reactor wakes up, creates connection, starts writing calls + // - Enqueue 2 more calls + // - Shut down socket + // - Reactor wakes up, tries to write more of the first 3 calls, gets error + // - Reactor shuts down connection + // - Reactor sees the 2 remaining calls, makes a new connection + // - Because the socket is shut down, gets ECONNREFUSED. + // + // EINVAL is possible if the controller socket had already disconnected by + // the time it tries to set the SO_SNDTIMEO socket option as part of the + // normal blocking SASL handshake.
+ ASSERT_TRUE(s.posix_code() == EPIPE || + s.posix_code() == ECONNRESET || + s.posix_code() == ESHUTDOWN || + s.posix_code() == ECONNREFUSED || + s.posix_code() == EINVAL) + << "Unexpected status: " << s.ToString(); + } +} + +// Test handler latency metric. +TEST_F(TestRpc, TestRpcHandlerLatencyMetric) { + + const uint64_t sleep_micros = 20 * 1000; + + // Set up server. + Sockaddr server_addr; + StartTestServerWithGeneratedCode(&server_addr); + + // Set up client. + shared_ptr client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, CalculatorService::static_service_name()); + + RpcController controller; + SleepRequestPB req; + req.set_sleep_micros(sleep_micros); + req.set_deferred(true); + SleepResponsePB resp; + ASSERT_OK(p.SyncRequest("Sleep", req, &resp, &controller)); + + const unordered_map > metric_map = + server_messenger_->metric_entity()->UnsafeMetricsMapForTests(); + + scoped_refptr latency_histogram = down_cast( + FindOrDie(metric_map, + &METRIC_handler_latency_kudu_rpc_test_CalculatorService_Sleep).get()); + + LOG(INFO) << "Sleep() min lat: " << latency_histogram->MinValueForTests(); + LOG(INFO) << "Sleep() mean lat: " << latency_histogram->MeanValueForTests(); + LOG(INFO) << "Sleep() max lat: " << latency_histogram->MaxValueForTests(); + LOG(INFO) << "Sleep() #calls: " << latency_histogram->TotalCount(); + + ASSERT_EQ(1, latency_histogram->TotalCount()); + ASSERT_GE(latency_histogram->MaxValueForTests(), sleep_micros); + ASSERT_TRUE(latency_histogram->MinValueForTests() == latency_histogram->MaxValueForTests()); + + // TODO: Implement an incoming queue latency test. + // For now we just assert that the metric exists. 
+ ASSERT_TRUE(FindOrDie(metric_map, &METRIC_rpc_incoming_queue_time)); +} + +static void DestroyMessengerCallback(shared_ptr<Messenger>* messenger, + CountDownLatch* latch) { + messenger->reset(); + latch->CountDown(); +} + +TEST_F(TestRpc, TestRpcCallbackDestroysMessenger) { + shared_ptr<Messenger> client_messenger(CreateMessenger("Client")); + Sockaddr bad_addr; + CountDownLatch latch(1); + + AddRequestPB req; + req.set_x(rand()); + req.set_y(rand()); + AddResponsePB resp; + RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds(1)); + { + Proxy p(client_messenger, bad_addr, "xxx"); + p.AsyncRequest("my-fake-method", req, &resp, &controller, + boost::bind(&DestroyMessengerCallback, &client_messenger, &latch)); + } + latch.Wait(); +} + +// Test that setting the client timeout / deadline gets propagated to RPC +// services. +TEST_F(TestRpc, TestRpcContextClientDeadline) { + const uint64_t sleep_micros = 20 * 1000; + + // Set up server. + Sockaddr server_addr; + StartTestServerWithGeneratedCode(&server_addr); + + // Set up client. + shared_ptr<Messenger> client_messenger(CreateMessenger("Client")); + Proxy p(client_messenger, server_addr, CalculatorService::static_service_name()); + + SleepRequestPB req; + req.set_sleep_micros(sleep_micros); + req.set_client_timeout_defined(true); + SleepResponsePB resp; + RpcController controller; + Status s = p.SyncRequest("Sleep", req, &resp, &controller); + ASSERT_TRUE(s.IsRemoteError()); + ASSERT_STR_CONTAINS(s.ToString(), "Missing required timeout"); + + controller.Reset(); + controller.set_timeout(MonoDelta::FromMilliseconds(1000)); + ASSERT_OK(p.SyncRequest("Sleep", req, &resp, &controller)); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/rpc.cc b/src/kudu/rpc/rpc.cc new file mode 100644 index 000000000000..3f19cc7f7317 --- /dev/null +++ b/src/kudu/rpc/rpc.cc @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/rpc.h" + +#include +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc_header.pb.h" + +using std::shared_ptr; +using strings::Substitute; +using strings::SubstituteAndAppend; + +namespace kudu { + +namespace rpc { + +bool RpcRetrier::HandleResponse(Rpc* rpc, Status* out_status) { + ignore_result(DCHECK_NOTNULL(rpc)); + ignore_result(DCHECK_NOTNULL(out_status)); + + // Always retry a TOO_BUSY error. + Status controller_status = controller_.status(); + if (controller_status.IsRemoteError()) { + const ErrorStatusPB* err = controller_.error_response(); + if (err && + err->has_code() && + err->code() == ErrorStatusPB::ERROR_SERVER_TOO_BUSY) { + DelayedRetry(rpc, controller_status); + return true; + } + } + + *out_status = controller_status; + return false; +} + +void RpcRetrier::DelayedRetry(Rpc* rpc, const Status& why_status) { + if (!why_status.ok() && (last_error_.ok() || last_error_.IsTimedOut())) { + last_error_ = why_status; + } + // Add some jitter to the retry delay. + // + // If the delay causes us to miss our deadline, RetryCb will fail the + // RPC on our behalf. 
+ int num_ms = ++attempt_num_ + ((rand() % 5)); + messenger_->ScheduleOnReactor(boost::bind(&RpcRetrier::DelayedRetryCb, + this, + rpc, _1), + MonoDelta::FromMilliseconds(num_ms)); +} + +void RpcRetrier::DelayedRetryCb(Rpc* rpc, const Status& status) { + Status new_status = status; + if (new_status.ok()) { + // Has this RPC timed out? + if (deadline_.Initialized()) { + MonoTime now = MonoTime::Now(MonoTime::FINE); + if (deadline_.ComesBefore(now)) { + string err_str = Substitute("$0 passed its deadline", rpc->ToString()); + if (!last_error_.ok()) { + SubstituteAndAppend(&err_str, ": $0", last_error_.ToString()); + } + new_status = Status::TimedOut(err_str); + } + } + } + if (new_status.ok()) { + controller_.Reset(); + rpc->SendRpc(); + } else { + rpc->SendRpcCb(new_status); + } +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/rpc.h b/src/kudu/rpc/rpc.h new file mode 100644 index 000000000000..8eb4456681c0 --- /dev/null +++ b/src/kudu/rpc/rpc.h @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_RPC_RPC_H +#define KUDU_RPC_RPC_H + +#include +#include + +#include "kudu/gutil/callback.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status_callback.h" + +namespace kudu { + +namespace rpc { + +class Messenger; +class Rpc; + +// Provides utilities for retrying failed RPCs. +// +// All RPCs should use HandleResponse() to retry certain generic errors. +class RpcRetrier { + public: + RpcRetrier(MonoTime deadline, std::shared_ptr messenger) + : attempt_num_(1), + deadline_(std::move(deadline)), + messenger_(std::move(messenger)) { + if (deadline_.Initialized()) { + controller_.set_deadline(deadline_); + } + controller_.Reset(); + } + + // Tries to handle a failed RPC. + // + // If it was handled (e.g. scheduled for retry in the future), returns + // true. In this case, callers should ensure that 'rpc' remains alive. + // + // Otherwise, returns false and writes the controller status to + // 'out_status'. + bool HandleResponse(Rpc* rpc, Status* out_status); + + // Retries an RPC at some point in the near future. If 'why_status' is not OK, + // records it as the most recent error causing the RPC to retry. This is + // reported to the caller eventually if the RPC never succeeds. + // + // If the RPC's deadline expires, the callback will fire with a timeout + // error when the RPC comes up for retrying. This is true even if the + // deadline has already expired at the time that Retry() was called. + // + // Callers should ensure that 'rpc' remains alive. + void DelayedRetry(Rpc* rpc, const Status& why_status); + + RpcController* mutable_controller() { return &controller_; } + const RpcController& controller() const { return controller_; } + + const MonoTime& deadline() const { return deadline_; } + + const std::shared_ptr& messenger() const { + return messenger_; + } + + int attempt_num() const { return attempt_num_; } + + // Called when an RPC comes up for retrying. Actually sends the RPC. 
+ void DelayedRetryCb(Rpc* rpc, const Status& status); + + private: + // The next sent rpc will be the nth attempt (indexed from 1). + int attempt_num_; + + // If the remote end is busy, the RPC will be retried (with a small + // delay) until this deadline is reached. + // + // May be uninitialized. + MonoTime deadline_; + + // Messenger to use when sending the RPC. + std::shared_ptr messenger_; + + // RPC controller to use when sending the RPC. + RpcController controller_; + + // In case any retries have already happened, remembers the last error. + // Errors from the server take precedence over timeout errors. + Status last_error_; + + DISALLOW_COPY_AND_ASSIGN(RpcRetrier); +}; + +// An in-flight remote procedure call to some server. +class Rpc { + public: + Rpc(const MonoTime& deadline, + const std::shared_ptr& messenger) + : retrier_(deadline, messenger) { + } + + virtual ~Rpc() {} + + // Asynchronously sends the RPC to the remote end. + // + // Subclasses should use SendRpcCb() below as the callback function. + virtual void SendRpc() = 0; + + // Returns a string representation of the RPC. + virtual std::string ToString() const = 0; + + // Returns the number of times this RPC has been sent. Will always be at + // least one. + int num_attempts() const { return retrier().attempt_num(); } + + protected: + const RpcRetrier& retrier() const { return retrier_; } + RpcRetrier* mutable_retrier() { return &retrier_; } + + private: + friend class RpcRetrier; + + // Callback for SendRpc(). If 'status' is not OK, something failed + // before the RPC was sent. + virtual void SendRpcCb(const Status& status) = 0; + + // Used to retry some failed RPCs. 
+ RpcRetrier retrier_; + + DISALLOW_COPY_AND_ASSIGN(Rpc); +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_RPC_H diff --git a/src/kudu/rpc/rpc_context.cc b/src/kudu/rpc/rpc_context.cc new file mode 100644 index 000000000000..51110000420c --- /dev/null +++ b/src/kudu/rpc/rpc_context.cc @@ -0,0 +1,184 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/rpc_context.h" + +#include +#include + +#include "kudu/rpc/outbound_call.h" +#include "kudu/rpc/inbound_call.h" +#include "kudu/rpc/rpc_sidecar.h" +#include "kudu/rpc/service_if.h" +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/metrics.h" +#include "kudu/util/trace.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/pb_util.h" + +using google::protobuf::Message; + +namespace kudu { +namespace rpc { + +namespace { + +// Wrapper for a protobuf message which lazily converts to JSON when +// the trace buffer is dumped. This pushes the work of stringification +// to the trace dumping process. 
+class PbTracer : public debug::ConvertableToTraceFormat { + public: + enum { + kMaxFieldLengthToTrace = 100 + }; + + explicit PbTracer(const Message& msg) : msg_(msg.New()) { + msg_->CopyFrom(msg); + } + + virtual void AppendAsTraceFormat(std::string* out) const OVERRIDE { + pb_util::TruncateFields(msg_.get(), kMaxFieldLengthToTrace); + std::stringstream ss; + JsonWriter jw(&ss, JsonWriter::COMPACT); + jw.Protobuf(*msg_); + out->append(ss.str()); + } + private: + const gscoped_ptr msg_; +}; + +scoped_refptr TracePb(const Message& msg) { + return make_scoped_refptr(new PbTracer(msg)); +} +} // anonymous namespace + +RpcContext::RpcContext(InboundCall *call, + const google::protobuf::Message *request_pb, + google::protobuf::Message *response_pb, + RpcMethodMetrics metrics) + : call_(CHECK_NOTNULL(call)), + request_pb_(request_pb), + response_pb_(response_pb), + metrics_(metrics) { + VLOG(4) << call_->remote_method().service_name() << ": Received RPC request for " + << call_->ToString() << ":" << std::endl << request_pb_->DebugString(); + TRACE_EVENT_ASYNC_BEGIN2("rpc_call", "RPC", this, + "call", call_->ToString(), + "request", TracePb(*request_pb_)); +} + +RpcContext::~RpcContext() { +} + +void RpcContext::RespondSuccess() { + call_->RecordHandlingCompleted(metrics_.handler_latency); + VLOG(4) << call_->remote_method().service_name() << ": Sending RPC success response for " + << call_->ToString() << ":" << std::endl << response_pb_->DebugString(); + TRACE_EVENT_ASYNC_END2("rpc_call", "RPC", this, + "response", TracePb(*response_pb_), + "trace", trace()->DumpToString(true)); + call_->RespondSuccess(*response_pb_); + delete this; +} + +void RpcContext::RespondFailure(const Status &status) { + call_->RecordHandlingCompleted(metrics_.handler_latency); + VLOG(4) << call_->remote_method().service_name() << ": Sending RPC failure response for " + << call_->ToString() << ": " << status.ToString(); + TRACE_EVENT_ASYNC_END2("rpc_call", "RPC", this, + "status", 
status.ToString(), + "trace", trace()->DumpToString(true)); + call_->RespondFailure(ErrorStatusPB::ERROR_APPLICATION, + status); + delete this; +} + +void RpcContext::RespondRpcFailure(ErrorStatusPB_RpcErrorCodePB err, const Status& status) { + call_->RecordHandlingCompleted(metrics_.handler_latency); + VLOG(4) << call_->remote_method().service_name() << ": Sending RPC failure response for " + << call_->ToString() << ": " << status.ToString(); + TRACE_EVENT_ASYNC_END2("rpc_call", "RPC", this, + "status", status.ToString(), + "trace", trace()->DumpToString(true)); + call_->RespondFailure(err, status); + delete this; +} + +void RpcContext::RespondApplicationError(int error_ext_id, const std::string& message, + const Message& app_error_pb) { + call_->RecordHandlingCompleted(metrics_.handler_latency); + if (VLOG_IS_ON(4)) { + ErrorStatusPB err; + InboundCall::ApplicationErrorToPB(error_ext_id, message, app_error_pb, &err); + VLOG(4) << call_->remote_method().service_name() << ": Sending application error response for " + << call_->ToString() << ":" << std::endl << err.DebugString(); + TRACE_EVENT_ASYNC_END2("rpc_call", "RPC", this, + "response", TracePb(app_error_pb), + "trace", trace()->DumpToString(true)); + } + call_->RespondApplicationError(error_ext_id, message, app_error_pb); + delete this; +} + +Status RpcContext::AddRpcSidecar(gscoped_ptr car, int* idx) { + return call_->AddRpcSidecar(car.Pass(), idx); +} + +const UserCredentials& RpcContext::user_credentials() const { + return call_->user_credentials(); +} + +const Sockaddr& RpcContext::remote_address() const { + return call_->remote_address(); +} + +std::string RpcContext::requestor_string() const { + return call_->user_credentials().ToString() + " at " + + call_->remote_address().ToString(); +} + +MonoTime RpcContext::GetClientDeadline() const { + return call_->GetClientDeadline(); +} + +Trace* RpcContext::trace() { + return call_->trace(); +} + +void RpcContext::Panic(const char* filepath, int line_number, 
const string& message) { + // Use the LogMessage class directly so that the log messages appear to come from + // the line of code which caused the panic, not this code. +#define MY_ERROR google::LogMessage(filepath, line_number, google::GLOG_ERROR).stream() +#define MY_FATAL google::LogMessageFatal(filepath, line_number).stream() + + MY_ERROR << "Panic handling " << call_->ToString() << ": " << message; + MY_ERROR << "Request:\n" << request_pb_->DebugString(); + Trace* t = trace(); + if (t) { + MY_ERROR << "RPC trace:"; + t->Dump(&MY_ERROR, true); + } + MY_FATAL << "Exiting due to panic."; + +#undef MY_ERROR +#undef MY_FATAL +} + + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/rpc_context.h b/src/kudu/rpc/rpc_context.h new file mode 100644 index 000000000000..f7508ddfea8f --- /dev/null +++ b/src/kudu/rpc/rpc_context.h @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_RPC_RPC_CONTEXT_H +#define KUDU_RPC_RPC_CONTEXT_H + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/service_if.h" +#include "kudu/util/status.h" + +namespace google { +namespace protobuf { +class Message; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class Sockaddr; +class Trace; + +namespace rpc { + +class InboundCall; +class RpcSidecar; +class UserCredentials; + + +#define PANIC_RPC(rpc_context, message) \ + do { \ + if (rpc_context) { \ + rpc_context->Panic(__FILE__, __LINE__, (message)); \ + } else { \ + LOG(FATAL) << message; \ + } \ + } while (0) + +// The context provided to a generated ServiceIf. This provides +// methods to respond to the RPC. In the future, this will also +// include methods to access information about the caller: e.g +// authentication info, tracing info, and cancellation status. +// +// This is the server-side analogue to the RpcController class. +class RpcContext { + public: + // Create an RpcContext. This is called only from generated code + // and is not a public API. + RpcContext(InboundCall *call, + const google::protobuf::Message *request_pb, + google::protobuf::Message *response_pb, + RpcMethodMetrics metrics); + + ~RpcContext(); + + // Return the trace buffer for this call. + Trace* trace(); + + // Send a response to the call. The service may call this method + // before or after returning from the original handler method, + // and it may call this method from a different thread. + // + // The response should be prepared already in the response PB pointer + // which was passed to the handler method. + // + // After this method returns, this RpcContext object is destroyed. The request + // and response protobufs are also destroyed. + void RespondSuccess(); + + // Respond with an error to the client. This sends back an error with the code + // ERROR_APPLICATION. 
Because there is no more specific error code passed back + // to the client, most applications should create a custom error PB extension + // and use RespondApplicationError(...) below. This method should only be used + // for unexpected errors where the server doesn't expect the client to do any + // more advanced handling. + // + // After this method returns, this RpcContext object is destroyed. The request + // and response protobufs are also destroyed. + void RespondFailure(const Status &status); + + // Respond with an RPC-level error. This typically manifests to the client as + // a remote error, one whose handling is agnostic to the particulars of the + // sent RPC. For example, ERROR_SERVER_TOO_BUSY usually causes the client to + // retry the RPC at a later time. + // + // After this method returns, this RpcContext object is destroyed. The request + // and response protobufs are also destroyed. + void RespondRpcFailure(ErrorStatusPB_RpcErrorCodePB err, const Status& status); + + // Respond with an application-level error. This causes the caller to get a + // RemoteError status with the provided string message. Additionally, a + // service-specific error extension is passed back to the client. The + // extension must be registered with the ErrorStatusPB protobuf. For + // example: + // + // message MyServiceError { + // extend kudu.rpc.ErrorStatusPB { + // optional MyServiceError my_service_error_ext = 101; + // } + // // Add any extra fields or status codes you want to pass back to + // // the client here. + // required string extra_error_data = 1; + // } + // + // NOTE: the numeric '101' above must be an integer greater than 101 + // and must be unique across your code base. 
+ // + // Given the above definition in your service protobuf file, you would + // use this method like: + // + // MyServiceError err; + // err.set_extra_error_data("foo bar"); + // ctx->RespondApplicationError(MyServiceError::my_service_error_ext.number(), + // "Some error occurred", err); + // + // The client side may then retrieve the error by calling: + // const MyServiceError& err_details = + // controller->error_response()->GetExtension(MyServiceError::my_service_error_ext); + // + // After this method returns, this RpcContext object is destroyed. The request + // and response protobufs are also destroyed. + void RespondApplicationError(int error_ext_id, const std::string& message, + const google::protobuf::Message& app_error_pb); + + + // Adds an RpcSidecar to the response. This is the preferred method for + // transferring large amounts of binary data, because this avoids additional + // copies made by serializing the protobuf. + // + // Assumes no changes to the sidecar's data are made after insertion. + // + // Upon success, writes the index of the sidecar (necessary to be retrieved + // later) to 'idx'. Call may fail if all sidecars have already been used + // by the RPC response. + Status AddRpcSidecar(gscoped_ptr<RpcSidecar> car, int* idx); + + // Return the credentials of the remote user who made this call. + const UserCredentials& user_credentials() const; + + // Return the remote IP address and port which sent the current RPC call. + const Sockaddr& remote_address() const; + + // A string identifying the requestor -- both the user info and the IP address. + // Suitable for use in log messages. + std::string requestor_string() const; + + const google::protobuf::Message *request_pb() const { return request_pb_.get(); } + google::protobuf::Message *response_pb() const { return response_pb_.get(); } + + // Return an upper bound on the client timeout deadline. This does not + // account for transmission delays between the client and the server.
+ // If the client did not specify a deadline, returns MonoTime::Max(). + MonoTime GetClientDeadline() const; + + // Panic the server. This logs a fatal error with the given message, and + // also includes the current RPC request, requestor, trace information, etc, + // to make it easier to debug. + // + // Call this via the PANIC_RPC() macro. + void Panic(const char* filepath, int line_number, const std::string& message) + __attribute__((noreturn)); + + private: + InboundCall* const call_; + const gscoped_ptr request_pb_; + const gscoped_ptr response_pb_; + RpcMethodMetrics metrics_; +}; + +} // namespace rpc +} // namespace kudu +#endif diff --git a/src/kudu/rpc/rpc_controller.cc b/src/kudu/rpc/rpc_controller.cc new file mode 100644 index 000000000000..53f76d9b5243 --- /dev/null +++ b/src/kudu/rpc/rpc_controller.cc @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/rpc/rpc_controller.h" +#include "kudu/rpc/outbound_call.h" + +namespace kudu { namespace rpc { + +RpcController::RpcController() { + DVLOG(4) << "RpcController " << this << " constructed"; +} + +RpcController::~RpcController() { + DVLOG(4) << "RpcController " << this << " destroyed"; +} + +void RpcController::Swap(RpcController* other) { + // Cannot swap RPC controllers while they are in-flight. + if (call_) { + CHECK(finished()); + } + if (other->call_) { + CHECK(other->finished()); + } + + std::swap(timeout_, other->timeout_); + std::swap(call_, other->call_); +} + +void RpcController::Reset() { + lock_guard l(&lock_); + if (call_) { + CHECK(finished()); + } + call_.reset(); +} + +bool RpcController::finished() const { + if (call_) { + return call_->IsFinished(); + } + return false; +} + +Status RpcController::status() const { + if (call_) { + return call_->status(); + } + return Status::OK(); +} + +const ErrorStatusPB* RpcController::error_response() const { + if (call_) { + return call_->error_pb(); + } + return nullptr; +} + +Status RpcController::GetSidecar(int idx, Slice* sidecar) const { + return call_->call_response_->GetSidecar(idx, sidecar); +} + +void RpcController::set_timeout(const MonoDelta& timeout) { + lock_guard l(&lock_); + DCHECK(!call_ || call_->state() == OutboundCall::READY); + timeout_ = timeout; +} + +void RpcController::set_deadline(const MonoTime& deadline) { + set_timeout(deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE))); +} + +MonoDelta RpcController::timeout() const { + lock_guard l(&lock_); + return timeout_; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/rpc_controller.h b/src/kudu/rpc/rpc_controller.h new file mode 100644 index 000000000000..bea703b8f07e --- /dev/null +++ b/src/kudu/rpc/rpc_controller.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_RPC_CONTROLLER_H +#define KUDU_RPC_RPC_CONTROLLER_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { + +namespace rpc { + +class ErrorStatusPB; +class OutboundCall; + +// Controller for managing properties of a single RPC call, on the client side. +// +// An RpcController maps to exactly one call and is not thread-safe. The client +// may use this class prior to sending an RPC in order to set properties such +// as the call's timeout. +// +// After the call has been sent (e.g using Proxy::AsyncRequest()) the user +// may invoke methods on the RpcController object in order to probe the status +// of the call. +class RpcController { + public: + RpcController(); + ~RpcController(); + + // Swap the state of the controller (including ownership of sidecars, buffers, + // etc) with another one. + void Swap(RpcController* other); + + // Reset this controller so it may be used with another call. + void Reset(); + + // Return true if the call has finished. + // A call is finished if the server has responded, or if the call + // has timed out. + bool finished() const; + + // Return the current status of a call. 
+ // + // A call is "OK" status until it finishes, at which point it may + // either remain in "OK" status (if the call was successful), or + // change to an error status. Error status indicates that there was + // some RPC-layer issue with making the call, for example, one of: + // + // * failed to establish a connection to the server + // * the server was too busy to handle the request + // * the server was unable to interpret the request (eg due to a version + // mismatch) + // * a network error occurred which caused the connection to be torn + // down + // * the call timed out + Status status() const; + + // If status() returns a RemoteError object, then this function returns + // the error response provided by the server. Service implementors may + // use protobuf Extensions to add application-specific data to this PB. + // + // If Status was not a RemoteError, this returns NULL. + // The returned pointer is only valid as long as the controller object. + const ErrorStatusPB* error_response() const; + + // Set the timeout for the call to be made with this RPC controller. + // + // The configured timeout applies to the entire time period between + // the AsyncRequest() method call and getting a response. For example, + // if it takes too long to establish a connection to the remote host, + // or to DNS-resolve the remote host, those will be accounted as part + // of the timeout period. + // + // Timeouts must be set prior to making the request -- the timeout may + // not currently be adjusted for an already-sent call. + // + // Using an uninitialized timeout will result in a call which never + // times out (not recommended!) + void set_timeout(const MonoDelta& timeout); + + // Like a timeout, but based on a fixed point in time instead of a delta. + // + // Using an uninitialized deadline means the call won't time out. + void set_deadline(const MonoTime& deadline); + + // Return the configured timeout. 
+ MonoDelta timeout() const; + + // Fills the 'sidecar' parameter with the slice pointing to the i-th + // sidecar upon success. + // + // Should only be called if the call's finished, but the controller has not + // been Reset(). + // + // May fail if index is invalid. + Status GetSidecar(int idx, Slice* sidecar) const; + + private: + friend class OutboundCall; + friend class Proxy; + + MonoDelta timeout_; + + mutable simple_spinlock lock_; + + // Once the call is sent, it is tracked here. + std::shared_ptr call_; + + DISALLOW_COPY_AND_ASSIGN(RpcController); +}; + +} // namespace rpc +} // namespace kudu +#endif diff --git a/src/kudu/rpc/rpc_header.proto b/src/kudu/rpc/rpc_header.proto new file mode 100644 index 000000000000..a6e219983684 --- /dev/null +++ b/src/kudu/rpc/rpc_header.proto @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +option optimize_for = SPEED; + +package kudu.rpc; + +option java_package = "org.kududb.rpc"; + + +// The Kudu RPC protocol is similar to the RPC protocol of Hadoop and HBase. 
+// See the following for reference on those other protocols: +// - https://issues.apache.org/jira/browse/HBASE-7898 +// - https://issues.apache.org/jira/browse/HADOOP-8990 +// +// For a description of the Kudu protocol, see 'README' in this directory. + +// User Information proto. Included in ConnectionContextPB on connection setup. +message UserInformationPB { + optional string effective_user = 1; + required string real_user = 2; +} + +/** + * The connection context is sent as part of the connection establishment. + * It establishes the context for ALL RPC calls within the connection. + * This is sent on connection setup after the connection preamble is sent + * and SASL has been negotiated. + * No response is sent from the server to the client. + */ +message ConnectionContextPB { + // UserInfo beyond what is determined as part of security handshake + // at connection time (kerberos, tokens etc). + optional UserInformationPB user_info = 2; +} + +// Message type passed back & forth for the SASL negotiation. +message SaslMessagePB { + enum SaslState { + UNKNOWN = 999; + SUCCESS = 0; + NEGOTIATE = 1; + INITIATE = 2; + CHALLENGE = 3; + RESPONSE = 4; + } + + message SaslAuth { + optional string method = 1; // Deprecated, but was 'required' in Kudu 0.5.0 and 0.6.0. + required string mechanism = 2; // Standard SASL mechanism, i.e. ANONYMOUS, PLAIN, GSSAPI. + + // SASL challenge token from server, if the client chooses to use this method. + // Only used when the server is piggy-backing a challenge on a NEGOTIATE response. + // Otherwise, SaslMessagePB::token is used as the challenge token. + optional bytes challenge = 5; + } + + optional uint32 version = 1; + required SaslState state = 2; // RPC system SASL state. + optional bytes token = 3; + repeated SaslAuth auths = 4; +} + +message RemoteMethodPB { + // Service name for the RPC layer. + // The client created a proxy with this service name. 
+ // Example: kudu.rpc_test.CalculatorService + required string service_name = 1; + + // Name of the RPC method. + required string method_name = 2; +}; + +// The header for the RPC request frame. +message RequestHeader { + // A sequence number that is sent back in the Response. Hadoop specifies a uint32 and + // casts it to a signed int. That is counterintuitive, so we use an int32 instead. + // Allowed values (inherited from Hadoop): + // 0 through INT32_MAX: Regular RPC call IDs. + // -2: Invalid call ID. + // -3: Connection context call ID. + // -33: SASL negotiation call ID. + required int32 call_id = 3; + + // RPC method being invoked. + // Not used for "connection setup" calls. + optional RemoteMethodPB remote_method = 6; + + // Propagate the timeout as specified by the user. Note that, since there is some + // transit time between the client and server, if you wait exactly this amount of + // time and then respond, you are likely to cause a timeout on the client. + optional uint32 timeout_millis = 10; +} + +message ResponseHeader { + required int32 call_id = 1; + + // If this is set, then this is an error response and the + // response message will be of type ErrorStatusPB instead of + // the expected response type. + optional bool is_error = 2 [ default = false ]; + + // Byte offsets for side cars in the main body of the response message. + // These offsets are counted AFTER the message header, i.e., offset 0 + // is the first byte after the bytes for this protobuf. + repeated uint32 sidecar_offsets = 3; + +} + +// Sent as response when is_error == true. +message ErrorStatusPB { + + // These codes have all been inherited from Hadoop's RPC mechanism. + enum RpcErrorCodePB { + FATAL_UNKNOWN = 10; + + // Non-fatal RPC errors. Connection should be left open for future RPC calls. + //------------------------------------------------------------ + // The application generated an error status. See the message field for + // more details. 
+ ERROR_APPLICATION = 1; + + // The specified method was not valid. + ERROR_NO_SUCH_METHOD = 2; + + // The specified service was not valid. + ERROR_NO_SUCH_SERVICE = 3; + + // The server is overloaded - the client should try again shortly. + ERROR_SERVER_TOO_BUSY = 4; + + // The request parameter was not parseable or was missing required fields. + ERROR_INVALID_REQUEST = 5; + + // FATAL_* errors indicate that the client should shut down the connection. + //------------------------------------------------------------ + // The RPC server is already shutting down. + FATAL_SERVER_SHUTTING_DOWN = 11; + // Fields of RpcHeader are invalid. + FATAL_INVALID_RPC_HEADER = 12; + // Could not deserialize RPC request. + FATAL_DESERIALIZING_REQUEST = 13; + // IPC Layer version mismatch. + FATAL_VERSION_MISMATCH = 14; + // Auth failed. + FATAL_UNAUTHORIZED = 15; + } + + required string message = 1; + + // TODO: Make code required? + optional RpcErrorCodePB code = 2; // Specific error identifier. + + // Allow extensions. When the RPC returns ERROR_APPLICATION, the server + // should also fill in exactly one of these extension fields, which contains + // more details on the service-specific error. + extensions 100 to max; +} diff --git a/src/kudu/rpc/rpc_introspection.proto b/src/kudu/rpc/rpc_introspection.proto new file mode 100644 index 000000000000..be72450acc77 --- /dev/null +++ b/src/kudu/rpc/rpc_introspection.proto @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Protobuf used for introspection of RPC services (eg listing in-flight RPCs, +// reflection, etc) + +package kudu.rpc; + +option java_package = "org.kududb"; + +import "kudu/rpc/rpc_header.proto"; + +message RpcCallInProgressPB { + required RequestHeader header = 1; + optional string trace_buffer = 2; + optional uint64 micros_elapsed = 3; +} + +message RpcConnectionPB { + enum StateType { + UNKNOWN = 999; + NEGOTIATING = 0; // Connection is still being negotiated. + OPEN = 1; // Connection is active. + }; + + required string remote_ip = 1; + required StateType state = 2; + // TODO: swap out for separate fields + optional string remote_user_credentials = 3; + repeated RpcCallInProgressPB calls_in_flight = 4; +} + +message DumpRunningRpcsRequestPB { + optional bool include_traces = 1 [ default = false ]; +} + +message DumpRunningRpcsResponsePB { + repeated RpcConnectionPB inbound_connections = 1; + repeated RpcConnectionPB outbound_connections = 2; +} diff --git a/src/kudu/rpc/rpc_service.h b/src/kudu/rpc/rpc_service.h new file mode 100644 index 000000000000..2ef45d449f1e --- /dev/null +++ b/src/kudu/rpc/rpc_service.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_SERVICE_H_ +#define KUDU_RPC_SERVICE_H_ + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace rpc { + +class InboundCall; + +class RpcService : public RefCountedThreadSafe { + public: + virtual ~RpcService() {} + + // Enqueue a call for processing. + // On failure, the RpcService::QueueInboundCall() implementation is + // responsible for responding to the client with a failure message. + virtual Status QueueInboundCall(gscoped_ptr call) = 0; +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_SERVICE_H_ diff --git a/src/kudu/rpc/rpc_sidecar.h b/src/kudu/rpc/rpc_sidecar.h new file mode 100644 index 000000000000..5f3ea0b7168e --- /dev/null +++ b/src/kudu/rpc/rpc_sidecar.h @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_RPC_SIDECAR_H +#define KUDU_RPC_RPC_SIDECAR_H + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" + +namespace kudu { +namespace rpc { + +// An RpcSidecar is a mechanism which allows replies to RPCs +// to reference blocks of data without extra copies. In other words, +// whenever a protobuf would have a large field where additional copies +// become expensive, one may opt instead to use an RpcSidecar. +// +// The RpcSidecar saves on an additional copy to/from the protobuf on both the +// server and client side. The InboundCall class accepts RpcSidecars, ignorant +// of the form that the sidecar's data is kept in, requiring only that it can +// be represented as a Slice. Data is then immediately copied from the +// Slice returned from AsSlice() to the socket that is responding to the original +// RPC. +// +// In order to distinguish between separate sidecars, whenever a sidecar is +// added to the RPC response on the server side, an index for that sidecar is +// returned. This index must then in some way (i.e., via protobuf) be +// communicated to the client side. +// +// After receiving the RPC response on the client side, OutboundCall decodes +// the original message along with the separate sidecars by using a list +// of sidecar byte offsets that was sent in the message header. +// +// After reconstructing the array of sidecars, the OutboundCall (through +// RpcController's interface) is able to offer retrieval of the sidecar data +// through the same indices that were returned by InboundCall (or indirectly +// through the RpcContext wrapper) on the client side. +class RpcSidecar { + public: + // Generates a sidecar with the parameter faststring as its data. + explicit RpcSidecar(gscoped_ptr data) : data_(data.Pass()) {} + + // Returns a Slice representation of the sidecar's data. 
+ Slice AsSlice() const { return *data_; } + + private: + const gscoped_ptr data_; + + DISALLOW_COPY_AND_ASSIGN(RpcSidecar); +}; + +} // namespace rpc +} // namespace kudu + + +#endif /* KUDU_RPC_RPC_SIDECAR_H */ diff --git a/src/kudu/rpc/rpc_stub-test.cc b/src/kudu/rpc/rpc_stub-test.cc new file mode 100644 index 000000000000..48292be25867 --- /dev/null +++ b/src/kudu/rpc/rpc_stub-test.cc @@ -0,0 +1,433 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include + +#include "kudu/gutil/stl_util.h" +#include "kudu/rpc/rpc_introspection.pb.h" +#include "kudu/rpc/rtest.proxy.h" +#include "kudu/rpc/rtest.service.h" +#include "kudu/rpc/rpc-test-base.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/metrics.h" +#include "kudu/util/subprocess.h" +#include "kudu/util/test_util.h" +#include "kudu/util/user.h" + +DEFINE_bool(is_panic_test_child, false, "Used by TestRpcPanic"); +DECLARE_bool(socket_inject_short_recvs); + +using boost::ptr_vector; +using std::shared_ptr; +using std::vector; + +namespace kudu { +namespace rpc { + +class RpcStubTest : public RpcTestBase { + public: + virtual void SetUp() OVERRIDE { + RpcTestBase::SetUp(); + StartTestServerWithGeneratedCode(&server_addr_); + client_messenger_ = CreateMessenger("Client"); + } + protected: + void SendSimpleCall() { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + RpcController controller; + AddRequestPB req; + req.set_x(10); + req.set_y(20); + AddResponsePB resp; + ASSERT_OK(p.Add(req, &resp, &controller)); + ASSERT_EQ(30, resp.result()); + } + + Sockaddr server_addr_; + shared_ptr client_messenger_; +}; + +TEST_F(RpcStubTest, TestSimpleCall) { + SendSimpleCall(); +} + +// Regression test for a bug in which we would not properly parse a call +// response when recv() returned a 'short read'. This injects such short +// reads and then makes a number of calls. +TEST_F(RpcStubTest, TestShortRecvs) { + FLAGS_socket_inject_short_recvs = true; + CalculatorServiceProxy p(client_messenger_, server_addr_); + + for (int i = 0; i < 100; i++) { + NO_FATALS(SendSimpleCall()); + } +} + +// Test calls which are rather large. +// This test sends many of them at once using the async API and then +// waits for them all to return. This is meant to ensure that the +// IO threads can deal with read/write calls that don't succeed +// in sending the entire data in one go. 
+TEST_F(RpcStubTest, TestBigCallData) { + const int kNumSentAtOnce = 20; + const size_t kMessageSize = 5 * 1024 * 1024; + string data; + data.resize(kMessageSize); + + CalculatorServiceProxy p(client_messenger_, server_addr_); + + EchoRequestPB req; + req.set_data(data); + + ptr_vector resps; + ptr_vector controllers; + + CountDownLatch latch(kNumSentAtOnce); + for (int i = 0; i < kNumSentAtOnce; i++) { + auto resp = new EchoResponsePB; + resps.push_back(resp); + auto controller = new RpcController; + controllers.push_back(controller); + + p.EchoAsync(req, resp, controller, + boost::bind(&CountDownLatch::CountDown, boost::ref(latch))); + } + + latch.Wait(); + + for (RpcController &c : controllers) { + ASSERT_OK(c.status()); + } +} + +TEST_F(RpcStubTest, TestRespondDeferred) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + RpcController controller; + SleepRequestPB req; + req.set_sleep_micros(1000); + req.set_deferred(true); + SleepResponsePB resp; + ASSERT_OK(p.Sleep(req, &resp, &controller)); +} + +// Test that the default user credentials are propagated to the server. +TEST_F(RpcStubTest, TestDefaultCredentialsPropagated) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + string expected; + ASSERT_OK(GetLoggedInUser(&expected)); + + RpcController controller; + WhoAmIRequestPB req; + WhoAmIResponsePB resp; + ASSERT_OK(p.WhoAmI(req, &resp, &controller)); + ASSERT_EQ(expected, resp.credentials().real_user()); + ASSERT_FALSE(resp.credentials().has_effective_user()); +} + +// Test that the user can specify other credentials. 
+TEST_F(RpcStubTest, TestCustomCredentialsPropagated) { + const char* const kFakeUserName = "some fake user"; + CalculatorServiceProxy p(client_messenger_, server_addr_); + + UserCredentials creds; + creds.set_real_user(kFakeUserName); + p.set_user_credentials(creds); + + RpcController controller; + WhoAmIRequestPB req; + WhoAmIResponsePB resp; + ASSERT_OK(p.WhoAmI(req, &resp, &controller)); + ASSERT_EQ(kFakeUserName, resp.credentials().real_user()); + ASSERT_FALSE(resp.credentials().has_effective_user()); +} + +// Test that the user's remote address is accessible to the server. +TEST_F(RpcStubTest, TestRemoteAddress) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + RpcController controller; + WhoAmIRequestPB req; + WhoAmIResponsePB resp; + ASSERT_OK(p.WhoAmI(req, &resp, &controller)); + ASSERT_STR_CONTAINS(resp.address(), "127.0.0.1:"); +} + +//////////////////////////////////////////////////////////// +// Tests for error cases +//////////////////////////////////////////////////////////// + +// Test sending a PB parameter with a missing field, where the client +// thinks it has sent a full PB. (eg due to version mismatch) +TEST_F(RpcStubTest, TestCallWithInvalidParam) { + Proxy p(client_messenger_, server_addr_, CalculatorService::static_service_name()); + + AddRequestPartialPB req; + req.set_x(rand()); + // AddRequestPartialPB is missing the 'y' field. + AddResponsePB resp; + RpcController controller; + Status s = p.SyncRequest("Add", req, &resp, &controller); + ASSERT_TRUE(s.IsRemoteError()) << "Bad status: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), + "Invalid argument: Invalid parameter for call " + "kudu.rpc_test.CalculatorService.Add: y"); +} + +// Wrapper around AtomicIncrement, since AtomicIncrement returns the 'old' +// value, and our callback needs to be a void function. 
+static void DoIncrement(Atomic32* count) { + base::subtle::Barrier_AtomicIncrement(count, 1); +} + +// Test sending a PB parameter with a missing field on the client side. +// This also ensures that the async callback is only called once +// (regression test for a previously-encountered bug). +TEST_F(RpcStubTest, TestCallWithMissingPBFieldClientSide) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + RpcController controller; + AddRequestPB req; + req.set_x(10); + // Request is missing the 'y' field. + AddResponsePB resp; + Atomic32 callback_count = 0; + p.AddAsync(req, &resp, &controller, boost::bind(&DoIncrement, &callback_count)); + while (NoBarrier_Load(&callback_count) == 0) { + SleepFor(MonoDelta::FromMicroseconds(10)); + } + SleepFor(MonoDelta::FromMicroseconds(100)); + ASSERT_EQ(1, NoBarrier_Load(&callback_count)); + ASSERT_STR_CONTAINS(controller.status().ToString(), + "Invalid argument: RPC argument missing required fields: y"); +} + +// Test sending a call which isn't implemented by the server. 
+TEST_F(RpcStubTest, TestCallMissingMethod) { + Proxy p(client_messenger_, server_addr_, CalculatorService::static_service_name()); + + Status s = DoTestSyncCall(p, "DoesNotExist"); + ASSERT_TRUE(s.IsRemoteError()) << "Bad status: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "with an invalid method name: DoesNotExist"); +} + +TEST_F(RpcStubTest, TestApplicationError) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + RpcController controller; + SleepRequestPB req; + SleepResponsePB resp; + req.set_sleep_micros(1); + req.set_return_app_error(true); + Status s = p.Sleep(req, &resp, &controller); + ASSERT_TRUE(s.IsRemoteError()); + EXPECT_EQ("Remote error: Got some error", s.ToString()); + EXPECT_EQ("message: \"Got some error\"\n" + "[kudu.rpc_test.CalculatorError.app_error_ext] {\n" + " extra_error_data: \"some application-specific error data\"\n" + "}\n", controller.error_response()->DebugString()); +} + +TEST_F(RpcStubTest, TestRpcPanic) { + if (!FLAGS_is_panic_test_child) { + // This is a poor man's death test. We call this same + // test case, but set the above flag, and verify that + // it aborted. gtest death tests don't work here because + // there are already threads started up. + vector argv; + string executable_path; + CHECK_OK(env_->GetExecutablePath(&executable_path)); + argv.push_back(executable_path); + argv.push_back("--is_panic_test_child"); + argv.push_back("--gtest_filter=RpcStubTest.TestRpcPanic"); + + Subprocess subp(argv[0], argv); + subp.ShareParentStderr(false); + CHECK_OK(subp.Start()); + FILE* in = fdopen(subp.from_child_stderr_fd(), "r"); + PCHECK(in); + + // Search for string "Test method panicking!" 
somewhere in stderr + char buf[1024]; + bool found_string = false; + while (fgets(buf, sizeof(buf), in)) { + if (strstr(buf, "Test method panicking!")) { + found_string = true; + break; + } + } + CHECK(found_string); + + // Check return status + int wait_status = 0; + CHECK_OK(subp.Wait(&wait_status)); + CHECK(!WIFEXITED(wait_status)); // should not have been successful + if (WIFSIGNALED(wait_status)) { + CHECK_EQ(WTERMSIG(wait_status), SIGABRT); + } else { + // On some systems, we get exit status 134 from SIGABRT rather than + // WIFSIGNALED getting flagged. + CHECK_EQ(WEXITSTATUS(wait_status), 134); + } + return; + } else { + // Before forcing the panic, explicitly remove the test directory. This + // should be safe; this test doesn't generate any data. + CHECK_OK(env_->DeleteRecursively(GetTestDataDirectory())); + + // Make an RPC which causes the server to abort. + CalculatorServiceProxy p(client_messenger_, server_addr_); + RpcController controller; + PanicRequestPB req; + PanicResponsePB resp; + p.Panic(req, &resp, &controller); + } +} + +struct AsyncSleep { + AsyncSleep() : latch(1) {} + + RpcController rpc; + SleepRequestPB req; + SleepResponsePB resp; + CountDownLatch latch; +}; + +TEST_F(RpcStubTest, TestDontHandleTimedOutCalls) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + vector sleeps; + ElementDeleter d(&sleeps); + + // Send enough sleep calls to occupy the worker threads. + for (int i = 0; i < n_worker_threads_; i++) { + gscoped_ptr sleep(new AsyncSleep); + sleep->rpc.set_timeout(MonoDelta::FromSeconds(1)); + sleep->req.set_sleep_micros(100*1000); // 100ms + p.SleepAsync(sleep->req, &sleep->resp, &sleep->rpc, + boost::bind(&CountDownLatch::CountDown, &sleep->latch)); + sleeps.push_back(sleep.release()); + } + + // Send another call with a short timeout. This shouldn't get processed, because + // it'll get stuck in the queue for longer than its timeout. 
+ RpcController rpc; + SleepRequestPB req; + SleepResponsePB resp; + req.set_sleep_micros(1000); + rpc.set_timeout(MonoDelta::FromMilliseconds(1)); + Status s = p.Sleep(req, &resp, &rpc); + ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); + + for (AsyncSleep* s : sleeps) { + s->latch.Wait(); + } + + // Verify that the timedout call got short circuited before being processed. + const Counter* timed_out_in_queue = service_pool_->RpcsTimedOutInQueueMetricForTests(); + ASSERT_EQ(1, timed_out_in_queue->value()); +} + +TEST_F(RpcStubTest, TestDumpCallsInFlight) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + AsyncSleep sleep; + sleep.req.set_sleep_micros(100 * 1000); // 100ms + p.SleepAsync(sleep.req, &sleep.resp, &sleep.rpc, + boost::bind(&CountDownLatch::CountDown, &sleep.latch)); + + // Check the running RPC status on the client messenger. + DumpRunningRpcsRequestPB dump_req; + DumpRunningRpcsResponsePB dump_resp; + dump_req.set_include_traces(true); + + ASSERT_OK(client_messenger_->DumpRunningRpcs(dump_req, &dump_resp)); + LOG(INFO) << "client messenger: " << dump_resp.DebugString(); + ASSERT_EQ(1, dump_resp.outbound_connections_size()); + ASSERT_EQ(1, dump_resp.outbound_connections(0).calls_in_flight_size()); + ASSERT_EQ("Sleep", dump_resp.outbound_connections(0).calls_in_flight(0). + header().remote_method().method_name()); + ASSERT_GT(dump_resp.outbound_connections(0).calls_in_flight(0).micros_elapsed(), 0); + + // And the server messenger. 
+ // We have to loop this until we find a result since the actual call is sent + // asynchronously off of the main thread (ie the server may not be handling it yet) + for (int i = 0; i < 100; i++) { + dump_resp.Clear(); + ASSERT_OK(server_messenger_->DumpRunningRpcs(dump_req, &dump_resp)); + if (dump_resp.inbound_connections_size() > 0 && + dump_resp.inbound_connections(0).calls_in_flight_size() > 0) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(1)); + } + + LOG(INFO) << "server messenger: " << dump_resp.DebugString(); + ASSERT_EQ(1, dump_resp.inbound_connections_size()); + ASSERT_EQ(1, dump_resp.inbound_connections(0).calls_in_flight_size()); + ASSERT_EQ("Sleep", dump_resp.inbound_connections(0).calls_in_flight(0). + header().remote_method().method_name()); + ASSERT_GT(dump_resp.inbound_connections(0).calls_in_flight(0).micros_elapsed(), 0); + ASSERT_STR_CONTAINS(dump_resp.inbound_connections(0).calls_in_flight(0).trace_buffer(), + "Inserting onto call queue"); + sleep.latch.Wait(); +} + +namespace { +struct RefCountedTest : public RefCountedThreadSafe { +}; + +// Test callback which takes a refcounted pointer. +// We don't use this parameter, but it's used to validate that the bound callback +// is cleared in TestCallbackClearedAfterRunning. +void MyTestCallback(CountDownLatch* latch, scoped_refptr my_refptr) { + latch->CountDown(); +} +} // anonymous namespace + +// Verify that, after a call has returned, no copy of the call's callback +// is held. This is important when the callback holds a refcounted ptr, +// since we expect to be able to release that pointer when the call is done. 
+TEST_F(RpcStubTest, TestCallbackClearedAfterRunning) { + CalculatorServiceProxy p(client_messenger_, server_addr_); + + CountDownLatch latch(1); + scoped_refptr my_refptr(new RefCountedTest); + RpcController controller; + AddRequestPB req; + req.set_x(10); + req.set_y(20); + AddResponsePB resp; + p.AddAsync(req, &resp, &controller, + boost::bind(MyTestCallback, &latch, my_refptr)); + latch.Wait(); + + // The ref count should go back down to 1. However, we need to loop a little + // bit, since the deref is happening on another thread. If the other thread gets + // descheduled directly after calling our callback, we'd fail without these sleeps. + for (int i = 0; i < 100 && !my_refptr->HasOneRef(); i++) { + SleepFor(MonoDelta::FromMilliseconds(1)); + } + ASSERT_TRUE(my_refptr->HasOneRef()); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/rtest.proto b/src/kudu/rpc/rtest.proto new file mode 100644 index 000000000000..f1e3e93b0819 --- /dev/null +++ b/src/kudu/rpc/rtest.proto @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Test protocol for kudu RPC. 
+package kudu.rpc_test; + +import "kudu/rpc/rpc_header.proto"; +import "kudu/rpc/rtest_diff_package.proto"; + +message AddRequestPB { + required uint32 x = 1; + required uint32 y = 2; +} + +// Used by tests to simulate an old client which is missing +// a newly added required field. +message AddRequestPartialPB { + required uint32 x = 1; +} + +message AddResponsePB { + required uint32 result = 1; +} + +message SleepRequestPB { + required uint32 sleep_micros = 1; + + // Used in rpc_stub-test: if this is true, it will respond from a different + // thread than the one that receives the request. + optional bool deferred = 2 [ default = false ]; + + // If set, returns a CalculatorError response. + optional bool return_app_error = 3 [ default = false ]; + + // Used in rpc-test: if this is set to true and no client timeout is set, + // the service will respond to the client with an error. + optional bool client_timeout_defined = 4 [ default = false ]; +} + +message SleepResponsePB { +} + +message SendTwoStringsRequestPB { + required uint32 random_seed = 1; + required uint64 size1 = 2; + required uint64 size2 = 3; +} + +message SendTwoStringsResponsePB { + required uint32 sidecar1 = 1; + required uint32 sidecar2 = 2; +} + +message EchoRequestPB { + required string data = 1; +} +message EchoResponsePB { + required string data = 1; +} + +message WhoAmIRequestPB { +} +message WhoAmIResponsePB { + required kudu.rpc.UserInformationPB credentials = 1; + required string address = 2; +} + +message CalculatorError { + extend kudu.rpc.ErrorStatusPB { + optional CalculatorError app_error_ext = 101; + } + + required string extra_error_data = 1; +} + +message PanicRequestPB {} +message PanicResponsePB {} + +service CalculatorService { + rpc Add(AddRequestPB) returns(AddResponsePB); + rpc Sleep(SleepRequestPB) returns(SleepResponsePB); + rpc Echo(EchoRequestPB) returns(EchoResponsePB); + rpc WhoAmI(WhoAmIRequestPB) returns (WhoAmIResponsePB); + rpc 
TestArgumentsInDiffPackage(kudu.rpc_test_diff_package.ReqDiffPackagePB) + returns(kudu.rpc_test_diff_package.RespDiffPackagePB); + rpc Panic(PanicRequestPB) returns (PanicResponsePB); +} diff --git a/src/kudu/rpc/rtest_diff_package.proto b/src/kudu/rpc/rtest_diff_package.proto new file mode 100644 index 000000000000..6ecba2e221e6 --- /dev/null +++ b/src/kudu/rpc/rtest_diff_package.proto @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Request/Response in different package to test that RPC methods +// handle arguments with packages different from the service itself. +package kudu.rpc_test_diff_package; + +message ReqDiffPackagePB { +} +message RespDiffPackagePB { +} diff --git a/src/kudu/rpc/sasl_client.cc b/src/kudu/rpc/sasl_client.cc new file mode 100644 index 000000000000..675e1b01cb2a --- /dev/null +++ b/src/kudu/rpc/sasl_client.cc @@ -0,0 +1,492 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/sasl_client.h" + +#include + +#include +#include +#include + +#include +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/rpc/blocking_ops.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/sasl_helper.h" +#include "kudu/rpc/serialization.h" +#include "kudu/util/faststring.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace rpc { + +using std::map; +using std::set; +using std::string; + +static int SaslClientGetoptCb(void* sasl_client, const char* plugin_name, const char* option, + const char** result, unsigned* len) { + return static_cast(sasl_client) + ->GetOptionCb(plugin_name, option, result, len); +} + +static int SaslClientSimpleCb(void *sasl_client, int id, + const char **result, unsigned *len) { + return static_cast(sasl_client)->SimpleCb(id, result, len); +} + +static int SaslClientSecretCb(sasl_conn_t* conn, void *sasl_client, int id, + sasl_secret_t** psecret) { + return static_cast(sasl_client)->SecretCb(conn, id, psecret); +} + +// Return an appropriately-typed Status object based on an ErrorStatusPB returned +// from an Error RPC. +// In case there is no relevant Status type, return a RuntimeError. 
+static Status StatusFromRpcError(const ErrorStatusPB& error) { + DCHECK(error.IsInitialized()) << "Error status PB must be initialized"; + if (PREDICT_FALSE(!error.has_code())) { + return Status::RuntimeError(error.message()); + } + string code_name = ErrorStatusPB::RpcErrorCodePB_Name(error.code()); + switch (error.code()) { + case ErrorStatusPB_RpcErrorCodePB_FATAL_UNAUTHORIZED: + return Status::NotAuthorized(code_name, error.message()); + default: + return Status::RuntimeError(code_name, error.message()); + } +} + +SaslClient::SaslClient(string app_name, int fd) + : app_name_(std::move(app_name)), + sock_(fd), + helper_(SaslHelper::CLIENT), + client_state_(SaslNegotiationState::NEW), + negotiated_mech_(SaslMechanism::INVALID), + deadline_(MonoTime::Max()) { + callbacks_.push_back(SaslBuildCallback(SASL_CB_GETOPT, + reinterpret_cast(&SaslClientGetoptCb), this)); + callbacks_.push_back(SaslBuildCallback(SASL_CB_AUTHNAME, + reinterpret_cast(&SaslClientSimpleCb), this)); + callbacks_.push_back(SaslBuildCallback(SASL_CB_PASS, + reinterpret_cast(&SaslClientSecretCb), this)); + callbacks_.push_back(SaslBuildCallback(SASL_CB_LIST_END, nullptr, nullptr)); +} + +SaslClient::~SaslClient() { + sock_.Release(); // Do not close the underlying socket when this object is destroyed. 
+} + +Status SaslClient::EnableAnonymous() { + DCHECK_EQ(client_state_, SaslNegotiationState::INITIALIZED); + return helper_.EnableAnonymous(); +} + +Status SaslClient::EnablePlain(const string& user, const string& pass) { + DCHECK_EQ(client_state_, SaslNegotiationState::INITIALIZED); + RETURN_NOT_OK(helper_.EnablePlain()); + plain_auth_user_ = user; + plain_pass_ = pass; + return Status::OK(); +} + +SaslMechanism::Type SaslClient::negotiated_mechanism() const { + DCHECK_EQ(client_state_, SaslNegotiationState::NEGOTIATED); + return negotiated_mech_; +} + +void SaslClient::set_local_addr(const Sockaddr& addr) { + DCHECK_EQ(client_state_, SaslNegotiationState::NEW); + helper_.set_local_addr(addr); +} + +void SaslClient::set_remote_addr(const Sockaddr& addr) { + DCHECK_EQ(client_state_, SaslNegotiationState::NEW); + helper_.set_remote_addr(addr); +} + +void SaslClient::set_server_fqdn(const string& domain_name) { + DCHECK_EQ(client_state_, SaslNegotiationState::NEW); + helper_.set_server_fqdn(domain_name); +} + +void SaslClient::set_deadline(const MonoTime& deadline) { + DCHECK_NE(client_state_, SaslNegotiationState::NEGOTIATED); + deadline_ = deadline; +} + +// calls sasl_client_init() and sasl_client_new() +Status SaslClient::Init(const string& service_type) { + RETURN_NOT_OK(SaslInit(app_name_.c_str())); + + // Ensure we are not called more than once. + if (client_state_ != SaslNegotiationState::NEW) { + return Status::IllegalState("Init() may only be called once per SaslClient object."); + } + + // TODO: Support security flags. + unsigned secflags = 0; + + sasl_conn_t* sasl_conn = nullptr; + int result = sasl_client_new( + service_type.c_str(), // Registered name of the service using SASL. Required. + helper_.server_fqdn(), // The fully qualified domain name of the remote server. + helper_.local_addr_string(), // Local and remote IP address strings. (NULL disables + helper_.remote_addr_string(), // mechanisms which require this info.) 
+ &callbacks_[0], // Connection-specific callbacks. + secflags, // Security flags. + &sasl_conn); + + if (PREDICT_FALSE(result != SASL_OK)) { + return Status::RuntimeError("Unable to create new SASL client", + SaslErrDesc(result, sasl_conn)); + } + sasl_conn_.reset(sasl_conn); + + client_state_ = SaslNegotiationState::INITIALIZED; + return Status::OK(); +} + +Status SaslClient::Negotiate() { + TRACE("Called SaslClient::Negotiate()"); + + // Ensure we called exactly once, and in the right order. + if (client_state_ == SaslNegotiationState::NEW) { + return Status::IllegalState("SaslClient: Init() must be called before calling Negotiate()"); + } else if (client_state_ == SaslNegotiationState::NEGOTIATED) { + return Status::IllegalState("SaslClient: Negotiate() may only be called once per object."); + } + + // Ensure we can use blocking calls on the socket during negotiation. + RETURN_NOT_OK(EnsureBlockingMode(&sock_)); + + // Start by asking the server for a list of available auth mechanisms. + RETURN_NOT_OK(SendNegotiateMessage()); + + faststring recv_buf; + nego_ok_ = false; + + // We set nego_ok_ = true when the SASL library returns SASL_OK to us. + // We set nego_response_expected_ = true each time we send a request to the server. + // When using ANONYMOUS, we get SASL_OK back immediately but still send INITIATE to the server. + while (!nego_ok_ || nego_response_expected_) { + ResponseHeader header; + Slice param_buf; + RETURN_NOT_OK(ReceiveFramedMessageBlocking(&sock_, &recv_buf, &header, ¶m_buf, deadline_)); + nego_response_expected_ = false; + + SaslMessagePB response; + RETURN_NOT_OK(ParseSaslMsgResponse(header, param_buf, &response)); + + switch (response.state()) { + // NEGOTIATE: Server has sent us its list of supported SASL mechanisms. + case SaslMessagePB::NEGOTIATE: + RETURN_NOT_OK(HandleNegotiateResponse(response)); + break; + + // CHALLENGE: Server sent us a follow-up to an INITIATE or RESPONSE request. 
+ case SaslMessagePB::CHALLENGE: + RETURN_NOT_OK(HandleChallengeResponse(response)); + break; + + // SUCCESS: Server has accepted our authentication request. Negotiation successful. + case SaslMessagePB::SUCCESS: + RETURN_NOT_OK(HandleSuccessResponse(response)); + break; + + // Client sent us some unsupported SASL response. + default: + LOG(ERROR) << "SASL Client: Received unsupported response from server"; + return Status::InvalidArgument("RPC client doesn't support SASL state in response", + SaslMessagePB::SaslState_Name(response.state())); + } + } + + TRACE("SASL Client: Successful negotiation"); + client_state_ = SaslNegotiationState::NEGOTIATED; + return Status::OK(); +} + +Status SaslClient::SendSaslMessage(const SaslMessagePB& msg) { + DCHECK_NE(client_state_, SaslNegotiationState::NEW) + << "Must not send SASL messages before calling Init()"; + DCHECK_NE(client_state_, SaslNegotiationState::NEGOTIATED) + << "Must not send SASL messages after Negotiate() succeeds"; + + // Create header with SASL-specific callId + RequestHeader header; + header.set_call_id(kSaslCallId); + return helper_.SendSaslMessage(&sock_, header, msg, deadline_); +} + +Status SaslClient::ParseSaslMsgResponse(const ResponseHeader& header, const Slice& param_buf, + SaslMessagePB* response) { + RETURN_NOT_OK(helper_.SanityCheckSaslCallId(header.call_id())); + + if (header.is_error()) { + return ParseError(param_buf); + } + + return helper_.ParseSaslMessage(param_buf, response); +} + +Status SaslClient::SendNegotiateMessage() { + SaslMessagePB msg; + msg.set_state(SaslMessagePB::NEGOTIATE); + TRACE("SASL Client: Sending NEGOTIATE request to server."); + RETURN_NOT_OK(SendSaslMessage(msg)); + nego_response_expected_ = true; + return Status::OK(); +} + +Status SaslClient::SendInitiateMessage(const SaslMessagePB_SaslAuth& auth, + const char* init_msg, unsigned init_msg_len) { + SaslMessagePB msg; + msg.set_state(SaslMessagePB::INITIATE); + msg.mutable_token()->assign(init_msg, init_msg_len); + 
msg.add_auths()->CopyFrom(auth); + TRACE("SASL Client: Sending INITIATE request to server."); + RETURN_NOT_OK(SendSaslMessage(msg)); + nego_response_expected_ = true; + return Status::OK(); +} + +Status SaslClient::SendResponseMessage(const char* resp_msg, unsigned resp_msg_len) { + SaslMessagePB reply; + reply.set_state(SaslMessagePB::RESPONSE); + reply.mutable_token()->assign(resp_msg, resp_msg_len); + TRACE("SASL Client: Sending RESPONSE request to server."); + RETURN_NOT_OK(SendSaslMessage(reply)); + nego_response_expected_ = true; + return Status::OK(); +} + +Status SaslClient::DoSaslStep(const string& in, const char** out, unsigned* out_len, int* result) { + TRACE("SASL Client: Calling sasl_client_step()"); + int res = sasl_client_step(sasl_conn_.get(), in.c_str(), in.length(), nullptr, out, out_len); + *result = res; + if (res == SASL_OK) { + nego_ok_ = true; + } + if (PREDICT_FALSE(res != SASL_OK && res != SASL_CONTINUE)) { + return Status::NotAuthorized("Unable to negotiate SASL connection", + SaslErrDesc(res, sasl_conn_.get())); + } + return Status::OK(); +} + +Status SaslClient::HandleNegotiateResponse(const SaslMessagePB& response) { + TRACE("SASL Client: Received NEGOTIATE response from server"); + map mech_auth_map; + + string mech_list; + mech_list.reserve(64); // Avoid resizing the buffer later. 
+ for (const SaslMessagePB::SaslAuth& auth : response.auths()) { + if (mech_list.length() > 0) mech_list.append(" "); + string mech = auth.mechanism(); + mech_list.append(mech); + mech_auth_map[mech] = auth; + } + TRACE("SASL Client: Server mech list: $0", mech_list); + + const char* init_msg = nullptr; + unsigned init_msg_len = 0; + const char* negotiated_mech = nullptr; + + /* select a mechanism for a connection + * mechlist -- mechanisms server has available (punctuation ignored) + * output: + * prompt_need -- on SASL_INTERACT, list of prompts needed to continue + * clientout -- the initial client response to send to the server + * mech -- set to mechanism name + * + * Returns: + * SASL_OK -- success + * SASL_CONTINUE -- negotiation required + * SASL_NOMEM -- not enough memory + * SASL_NOMECH -- no mechanism meets requested properties + * SASL_INTERACT -- user interaction needed to fill in prompt_need list + */ + TRACE("SASL Client: Calling sasl_client_start()"); + int result = sasl_client_start( + sasl_conn_.get(), // The SASL connection context created by init() + mech_list.c_str(), // The list of mechanisms from the server. + nullptr, // Disables INTERACT return if NULL. + &init_msg, // Filled in on success. + &init_msg_len, // Filled in on success. + &negotiated_mech); // Filled in on success. + + if (PREDICT_FALSE(result == SASL_OK)) { + nego_ok_ = true; + } else if (PREDICT_FALSE(result != SASL_CONTINUE)) { + return Status::NotAuthorized("Unable to negotiate SASL connection", + SaslErrDesc(result, sasl_conn_.get())); + } + + // The server matched one of our mechanisms. + SaslMessagePB::SaslAuth* auth = FindOrNull(mech_auth_map, negotiated_mech); + if (PREDICT_FALSE(auth == nullptr)) { + return Status::IllegalState("Unable to find auth in map, unexpected error", negotiated_mech); + } + negotiated_mech_ = SaslMechanism::value_of(negotiated_mech); + + // Handle the case where the server sent a challenge with the NEGOTIATE response. 
+ if (auth->has_challenge()) { + if (PREDICT_FALSE(nego_ok_)) { + LOG(DFATAL) << "Server sent challenge after sasl_client_start() returned SASL_OK"; + } + RETURN_NOT_OK(DoSaslStep(auth->challenge(), &init_msg, &init_msg_len, &result)); + } + + RETURN_NOT_OK(SendInitiateMessage(*auth, init_msg, init_msg_len)); + return Status::OK(); +} + +Status SaslClient::HandleChallengeResponse(const SaslMessagePB& response) { + TRACE("SASL Client: Received CHALLENGE response from server"); + if (PREDICT_FALSE(nego_ok_)) { + LOG(DFATAL) << "Server sent CHALLENGE response after client library returned SASL_OK"; + } + + if (PREDICT_FALSE(!response.has_token())) { + return Status::InvalidArgument("No token in CHALLENGE response from server"); + } + + const char* out = nullptr; + unsigned out_len = 0; + int result = 0; + RETURN_NOT_OK(DoSaslStep(response.token(), &out, &out_len, &result)); + + RETURN_NOT_OK(SendResponseMessage(out, out_len)); + return Status::OK(); +} + +Status SaslClient::HandleSuccessResponse(const SaslMessagePB& response) { + TRACE("SASL Client: Received SUCCESS response from server"); + if (!nego_ok_) { + const char* out = nullptr; + unsigned out_len = 0; + int result = 0; + RETURN_NOT_OK(DoSaslStep(response.token(), &out, &out_len, &result)); + if (out_len > 0) { + return Status::IllegalState("SASL client library generated spurious token after SUCCESS", + string(out, out_len)); + } + if (PREDICT_FALSE(result != SASL_OK)) { + return Status::NotAuthorized("Unable to negotiate SASL connection", + SaslErrDesc(result, sasl_conn_.get())); + } + } + nego_ok_ = true; + return Status::OK(); +} + +// Parse error status message from raw bytes of an ErrorStatusPB. 
+Status SaslClient::ParseError(const Slice& err_data) { + ErrorStatusPB error; + if (!error.ParseFromArray(err_data.data(), err_data.size())) { + return Status::IOError("Invalid error response, missing fields", + error.InitializationErrorString()); + } + Status s = StatusFromRpcError(error); + TRACE("SASL Client: Received error response from server: $0", s.ToString()); + return s; +} + +int SaslClient::GetOptionCb(const char* plugin_name, const char* option, + const char** result, unsigned* len) { + return helper_.GetOptionCb(plugin_name, option, result, len); +} + +// Used for PLAIN and ANONYMOUS. +// SASL callback for SASL_CB_USER, SASL_CB_AUTHNAME, SASL_CB_LANGUAGE +int SaslClient::SimpleCb(int id, const char** result, unsigned* len) { + if (PREDICT_FALSE(result == nullptr)) { + LOG(DFATAL) << "SASL Client: result outparam is NULL"; + return SASL_BADPARAM; + } + switch (id) { + // TODO: Support impersonation? + // For impersonation, USER is the impersonated user, AUTHNAME is the "sudoer". + case SASL_CB_USER: + TRACE("SASL Client: callback for SASL_CB_USER"); + if (helper_.IsPlainEnabled()) { + *result = plain_auth_user_.c_str(); + if (len != nullptr) *len = plain_auth_user_.length(); + } else if (helper_.IsAnonymousEnabled()) { + *result = nullptr; + } + break; + case SASL_CB_AUTHNAME: + TRACE("SASL Client: callback for SASL_CB_AUTHNAME"); + if (helper_.IsPlainEnabled()) { + *result = plain_auth_user_.c_str(); + if (len != nullptr) *len = plain_auth_user_.length(); + } + break; + case SASL_CB_LANGUAGE: + LOG(DFATAL) << "SASL Client: Unable to handle SASL callback type SASL_CB_LANGUAGE" + << "(" << id << ")"; + return SASL_BADPARAM; + default: + LOG(DFATAL) << "SASL Client: Unexpected SASL callback type: " << id; + return SASL_BADPARAM; + } + + return SASL_OK; +} + +// Used for PLAIN. +// SASL callback for SASL_CB_PASS: User password. 
+int SaslClient::SecretCb(sasl_conn_t* conn, int id, sasl_secret_t** psecret) { + if (PREDICT_FALSE(!helper_.IsPlainEnabled())) { + LOG(DFATAL) << "SASL Client: Plain secret callback called, but PLAIN auth is not enabled"; + return SASL_FAIL; + } + switch (id) { + case SASL_CB_PASS: { + if (!conn || !psecret) return SASL_BADPARAM; + + int len = plain_pass_.length(); + *psecret = reinterpret_cast(malloc(sizeof(sasl_secret_t) + len)); + if (!*psecret) { + return SASL_NOMEM; + } + psecret_.reset(*psecret); // Ensure that we free() this structure later. + (*psecret)->len = len; + memcpy(reinterpret_cast((*psecret)->data), plain_pass_.c_str(), len + 1); + break; + } + default: + LOG(DFATAL) << "SASL Client: Unexpected SASL callback type: " << id; + return SASL_BADPARAM; + } + + return SASL_OK; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/sasl_client.h b/src/kudu/rpc/sasl_client.h new file mode 100644 index 000000000000..b9f460b115a5 --- /dev/null +++ b/src/kudu/rpc/sasl_client.h @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_RPC_SASL_CLIENT_H +#define KUDU_RPC_SASL_CLIENT_H + +#include +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/sasl_helper.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" +#include "kudu/util/net/socket.h" + +namespace kudu { +namespace rpc { + +using std::string; + +class ResponseHeader; +class SaslMessagePB; +class SaslMessagePB_SaslAuth; + +// Class for doing SASL negotiation with a SaslServer over a bidirectional socket. +// Operations on this class are NOT thread-safe. +class SaslClient { + public: + // Does not take ownership of the socket indicated by the fd. + SaslClient(string app_name, int fd); + ~SaslClient(); + + // Enable ANONYMOUS authentication. + // Call after Init(). + Status EnableAnonymous(); + + // Enable PLAIN authentication. + // Call after Init(). + Status EnablePlain(const string& user, const string& pass); + + // Returns mechanism negotiated by this connection. + // Call after Negotiate(). + SaslMechanism::Type negotiated_mechanism() const; + + // Specify IP:port of local side of connection. + // Call before Init(). Required for some mechanisms. + void set_local_addr(const Sockaddr& addr); + + // Specify IP:port of remote side of connection. + // Call before Init(). Required for some mechanisms. + void set_remote_addr(const Sockaddr& addr); + + // Specify the fully-qualified domain name of the remote server. + // Call before Init(). Required for some mechanisms. + void set_server_fqdn(const string& domain_name); + + // Set deadline for connection negotiation. + void set_deadline(const MonoTime& deadline); + + // Get deadline for connection negotiation. + const MonoTime& deadline() const { return deadline_; } + + // Initialize a new SASL client. Must be called before Negotiate(). + // Returns OK on success, otherwise RuntimeError. 
+ Status Init(const string& service_type); + + // Begin negotiation with the SASL server on the other side of the fd socket + // that this client was constructed with. + // Returns OK on success. + // Otherwise, it may return NotAuthorized, NotSupported, or another non-OK status. + Status Negotiate(); + + // SASL callback for plugin options, supported mechanisms, etc. + // Returns SASL_FAIL if the option is not handled, which does not fail the handshake. + int GetOptionCb(const char* plugin_name, const char* option, + const char** result, unsigned* len); + + // SASL callback for SASL_CB_USER, SASL_CB_AUTHNAME, SASL_CB_LANGUAGE + int SimpleCb(int id, const char** result, unsigned* len); + + // SASL callback for SASL_CB_PASS + int SecretCb(sasl_conn_t* conn, int id, sasl_secret_t** psecret); + + private: + // Encode and send the specified SASL message to the server. + Status SendSaslMessage(const SaslMessagePB& msg); + + // Validate that header does not indicate an error, parse param_buf into response. + Status ParseSaslMsgResponse(const ResponseHeader& header, const Slice& param_buf, + SaslMessagePB* response); + + // Send an NEGOTIATE message to the server. + Status SendNegotiateMessage(); + + // Send an INITIATE message to the server. + Status SendInitiateMessage(const SaslMessagePB_SaslAuth& auth, + const char* init_msg, unsigned init_msg_len); + + // Send a RESPONSE message to the server. + Status SendResponseMessage(const char* resp_msg, unsigned resp_msg_len); + + // Perform a client-side step of the SASL negotiation. + // Input is what came from the server. Output is what we will send back to the server. + // Return code from sasl_client_step is stored in result. + // Returns Status::OK if sasl_client_step returns SASL_OK or SASL_CONTINUE; otherwise, + // returns Status::NotAuthorized. + Status DoSaslStep(const string& in, const char** out, unsigned* out_len, int* result); + + // Handle case when server sends NEGOTIATE response. 
+ Status HandleNegotiateResponse(const SaslMessagePB& response); + + // Handle case when server sends CHALLENGE response. + Status HandleChallengeResponse(const SaslMessagePB& response); + + // Handle case when server sends SUCCESS response. + Status HandleSuccessResponse(const SaslMessagePB& response); + + // Parse error status message from raw bytes of an ErrorStatusPB. + Status ParseError(const Slice& err_data); + + string app_name_; + Socket sock_; + std::vector callbacks_; + gscoped_ptr sasl_conn_; + SaslHelper helper_; + + string plain_auth_user_; + string plain_pass_; + gscoped_ptr psecret_; + + SaslNegotiationState::Type client_state_; + + // The mechanism we negotiated with the server. + SaslMechanism::Type negotiated_mech_; + + // Intra-negotiation state. + bool nego_ok_; // During negotiation: did we get a SASL_OK response from the SASL library? + bool nego_response_expected_; // During negotiation: Are we waiting for a server response? + + // Negotiation timeout deadline. + MonoTime deadline_; + + DISALLOW_COPY_AND_ASSIGN(SaslClient); +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_SASL_CLIENT_H diff --git a/src/kudu/rpc/sasl_common.cc b/src/kudu/rpc/sasl_common.cc new file mode 100644 index 000000000000..b2c7849acb09 --- /dev/null +++ b/src/kudu/rpc/sasl_common.cc @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/sasl_common.h" + +#include +#include + +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/once.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/net/sockaddr.h" + +using std::set; + +namespace kudu { +namespace rpc { + +const char* const kSaslMechAnonymous = "ANONYMOUS"; +const char* const kSaslMechPlain = "PLAIN"; + +// Output Sasl messages. +// context: not used. +// level: logging level. +// message: message to output; +static int SaslLogCallback(void* context, int level, const char* message) { + + if (message == nullptr) return SASL_BADPARAM; + + switch (level) { + case SASL_LOG_NONE: + break; + + case SASL_LOG_ERR: + case SASL_LOG_FAIL: + LOG(ERROR) << "SASL: " << message; + break; + + case SASL_LOG_WARN: + LOG(WARNING) << "SASL: " << message; + break; + + case SASL_LOG_NOTE: + LOG(INFO) << "SASL: " << message; + break; + + case SASL_LOG_DEBUG: + VLOG(1) << "SASL: " << message; + break; + + case SASL_LOG_TRACE: + case SASL_LOG_PASS: + VLOG(3) << "SASL: " << message; + break; + } + + return SASL_OK; +} + +// Get Sasl option. +// context: not used +// plugin_name: name of plugin for which an option is being requested. +// option: option requested +// result: set to result which persists until next getopt in same thread, +// unchanged if option not found +// len: length of the result +// Return SASL_FAIL if the option is not handled, this does not fail the handshake. 
+static int SaslGetOption(void* context, const char* plugin_name, const char* option, + const char** result, unsigned* len) { + // Handle Sasl Library options + if (plugin_name == nullptr) { + // Return the logging level that we want the sasl library to use. + if (strcmp("log_level", option) == 0) { + int level = SASL_LOG_NOTE; + if (VLOG_IS_ON(1)) { + level = SASL_LOG_DEBUG; + } else if (VLOG_IS_ON(3)) { + level = SASL_LOG_TRACE; + } + // The library's contract for this method is that the caller gets to keep + // the returned buffer until the next call by the same thread, so we use a + // threadlocal for the buffer. + static __thread char buf[4]; + snprintf(buf, arraysize(buf), "%d", level); + *result = buf; + if (len != nullptr) *len = strlen(buf); + return SASL_OK; + } + // Options can default so don't complain. + VLOG(4) << "SaslGetOption: Unknown library option: " << option; + return SASL_FAIL; + } + VLOG(4) << "SaslGetOption: Unknown plugin: " << plugin_name; + return SASL_FAIL; +} + +// Array of callbacks for the sasl library. +static sasl_callback_t callbacks[] = { + { SASL_CB_LOG, reinterpret_cast(&SaslLogCallback), nullptr }, + { SASL_CB_GETOPT, reinterpret_cast(&SaslGetOption), nullptr }, + { SASL_CB_LIST_END, nullptr, nullptr } +}; + +// Determine whether initialization was ever called +struct InitializationData { + Status status; + string app_name; +}; +static struct InitializationData* sasl_init_data; + +// Actually perform the initialization for the SASL subsystem. +// Meant to be called via GoogleOnceInitArg(). +static void DoSaslInit(void* app_name_char_array) { + // Explicitly cast from void* here so GoogleOnce doesn't have to deal with it. + // We were getting Clang 3.4 UBSAN errors when letting GoogleOnce cast. 
+ const char* const app_name = reinterpret_cast(app_name_char_array); + VLOG(3) << "Initializing SASL library"; + + sasl_init_data = new InitializationData(); + sasl_init_data->app_name = app_name; + + int result = sasl_client_init(&callbacks[0]); + if (result != SASL_OK) { + sasl_init_data->status = Status::RuntimeError("Could not initialize SASL client", + sasl_errstring(result, nullptr, nullptr)); + return; + } + + result = sasl_server_init(&callbacks[0], sasl_init_data->app_name.c_str()); + if (result != SASL_OK) { + sasl_init_data->status = Status::RuntimeError("Could not initialize SASL server", + sasl_errstring(result, nullptr, nullptr)); + return; + } + + sasl_init_data->status = Status::OK(); +} + +// Only execute SASL initialization once +static GoogleOnceType once = GOOGLE_ONCE_INIT; + +Status SaslInit(const char* const app_name) { + GoogleOnceInitArg(&once, + &DoSaslInit, + // This is a bit ugly, but Clang 3.4 UBSAN complains otherwise. + reinterpret_cast(const_cast(app_name))); + if (PREDICT_FALSE(sasl_init_data->app_name != app_name)) { + return Status::InvalidArgument("SaslInit called successively with different arguments", + StringPrintf("Previous: %s, current: %s", sasl_init_data->app_name.c_str(), app_name)); + } + return sasl_init_data->status; +} + +string SaslErrDesc(int status, sasl_conn_t* conn) { + if (conn != nullptr) { + return StringPrintf("SASL result code: %s, error: %s", + sasl_errstring(status, nullptr, nullptr), + sasl_errdetail(conn)); + } + return StringPrintf("SASL result code: %s", sasl_errstring(status, nullptr, nullptr)); +} + +string SaslIpPortString(const Sockaddr& addr) { + string addr_str = addr.ToString(); + size_t colon_pos = addr_str.find(':'); + if (colon_pos != string::npos) { + addr_str[colon_pos] = ';'; + } + return addr_str; +} + +set SaslListAvailableMechs() { + set mechs; + + // Array of NULL-terminated strings. Array terminated with NULL. 
+ const char** mech_strings = sasl_global_listmech(); + while (mech_strings != nullptr && *mech_strings != nullptr) { + mechs.insert(*mech_strings); + mech_strings++; + } + return mechs; +} + +sasl_callback_t SaslBuildCallback(int id, int (*proc)(void), void* context) { + sasl_callback_t callback; + callback.id = id; + callback.proc = proc; + callback.context = context; + return callback; +} + +SaslMechanism::Type SaslMechanism::value_of(const string& mech) { + if (boost::iequals(mech, "ANONYMOUS")) { + return ANONYMOUS; + } else if (boost::iequals(mech, "PLAIN")) { + return PLAIN; + } + return INVALID; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/sasl_common.h b/src/kudu/rpc/sasl_common.h new file mode 100644 index 000000000000..ebd879a64819 --- /dev/null +++ b/src/kudu/rpc/sasl_common.h @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_RPC_SASL_COMMON_H +#define KUDU_RPC_SASL_COMMON_H + +#include // Required for sasl/sasl.h + +#include +#include + +#include + +#include "kudu/util/status.h" + +namespace kudu { + +class Sockaddr; + +namespace rpc { + +using std::string; + +// Constants +extern const char* const kSaslMechAnonymous; +extern const char* const kSaslMechPlain; + +// Initialize the SASL library. +// appname: Name of the application for logging messages & sasl plugin configuration. +// Note that this string must remain allocated for the lifetime of the program. +// This function must be called before using SASL. +// If the library initializes without error, calling more than once has no effect. +// +// Some SASL plugins take time to initialize random number generators and other things, +// so the first time this function is invoked it may execute for several seconds. +// After that, it should be very fast. This function should be invoked as early as possible +// in the application lifetime to avoid SASL initialization taking place in a +// performance-critical section. +// +// This function is thread safe and uses a static lock. +// This function should NOT be called during static initialization. +Status SaslInit(const char* const app_name); + +// Return a string describing the SASL error response. +string SaslErrDesc(int status, sasl_conn_t* conn); + +// Return ; string formatted for SASL library use. +string SaslIpPortString(const Sockaddr& addr); + +// Return available plugin mechanisms for the given connection. +std::set SaslListAvailableMechs(); + +// Initialize and return a libsasl2 callback data structure based on the passed args. +// id: A SASL callback identifier (e.g., SASL_CB_GETOPT). +// proc: A C-style callback with appropriate signature based on the callback id, or NULL. +// context: An object to pass to the callback as the context pointer, or NULL. 
+sasl_callback_t SaslBuildCallback(int id, int (*proc)(void), void* context); + +// Deleter for sasl_conn_t instances, for use with gscoped_ptr after calling sasl_*_new() +struct SaslDeleter { + inline void operator()(sasl_conn_t* conn) { + sasl_dispose(&conn); + } +}; + +struct SaslNegotiationState { + enum Type { + NEW, + INITIALIZED, + NEGOTIATED + }; +}; + +struct SaslMechanism { + enum Type { + INVALID, + ANONYMOUS, + PLAIN + }; + static Type value_of(const std::string& mech); +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/sasl_helper.cc b/src/kudu/rpc/sasl_helper.cc new file mode 100644 index 000000000000..0309f0c368aa --- /dev/null +++ b/src/kudu/rpc/sasl_helper.cc @@ -0,0 +1,198 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/sasl_helper.h" + +#include +#include + +#include +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/rpc/blocking_ops.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/serialization.h" +#include "kudu/util/faststring.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace rpc { + +using google::protobuf::MessageLite; + +SaslHelper::SaslHelper(PeerType peer_type) + : peer_type_(peer_type), + conn_header_exchanged_(false), + anonymous_enabled_(false), + plain_enabled_(false) { + tag_ = (peer_type_ == SERVER) ? "Sasl Server" : "Sasl Client"; +} + +SaslHelper::~SaslHelper() { +} + +void SaslHelper::set_local_addr(const Sockaddr& addr) { + local_addr_ = SaslIpPortString(addr); +} +const char* SaslHelper::local_addr_string() const { + return local_addr_.empty() ? nullptr : local_addr_.c_str(); +} + +void SaslHelper::set_remote_addr(const Sockaddr& addr) { + remote_addr_ = SaslIpPortString(addr); +} +const char* SaslHelper::remote_addr_string() const { + return remote_addr_.empty() ? nullptr : remote_addr_.c_str(); +} + +void SaslHelper::set_server_fqdn(const string& domain_name) { + server_fqdn_ = domain_name; +} +const char* SaslHelper::server_fqdn() const { + return server_fqdn_.empty() ? 
nullptr : server_fqdn_.c_str(); +} + +const std::set& SaslHelper::GlobalMechs() const { + if (!global_mechs_) { + global_mechs_.reset(new set(SaslListAvailableMechs())); + } + return *global_mechs_; +} + +void SaslHelper::AddToLocalMechList(const string& mech) { + mechs_.insert(mech); +} + +const std::set& SaslHelper::LocalMechs() const { + return mechs_; +} + +const char* SaslHelper::LocalMechListString() const { + JoinStrings(mechs_, " ", &mech_list_); + return mech_list_.empty() ? nullptr : mech_list_.c_str(); +} + + +int SaslHelper::GetOptionCb(const char* plugin_name, const char* option, + const char** result, unsigned* len) { + string cb_name("client_mech_list"); + if (peer_type_ == SERVER) { + cb_name = "mech_list"; + } + + DVLOG(4) << tag_ << ": GetOption Callback called. "; + DVLOG(4) << tag_ << ": GetOption Plugin name: " + << (plugin_name == nullptr ? "NULL" : plugin_name); + DVLOG(4) << tag_ << ": GetOption Option name: " << option; + + if (PREDICT_FALSE(result == nullptr)) { + LOG(DFATAL) << tag_ << ": SASL Library passed NULL result out-param to GetOption callback!"; + return SASL_BADPARAM; + } + + if (plugin_name == nullptr) { + // SASL library option, not a plugin option + if (cb_name == option) { + *result = LocalMechListString(); + if (len != nullptr) *len = strlen(*result); + DVLOG(3) << tag_ << ": Enabled mech list: " << (*result == nullptr ? 
"NULL" : *result); + return SASL_OK; + } + VLOG(4) << tag_ << ": GetOptionCb: Unknown library option: " << option; + } else { + VLOG(4) << tag_ << ": GetOptionCb: Unknown plugin: " << plugin_name; + } + return SASL_FAIL; +} + +Status SaslHelper::EnableAnonymous() { + if (PREDICT_FALSE(!ContainsKey(GlobalMechs(), kSaslMechAnonymous))) { + LOG(DFATAL) << tag_ << ": Unable to find ANONYMOUS SASL plugin"; + return Status::InvalidArgument("Client unable to find ANONYMOUS SASL plugin"); + } + AddToLocalMechList(kSaslMechAnonymous); + anonymous_enabled_ = true; + return Status::OK(); +} + +bool SaslHelper::IsAnonymousEnabled() const { + return anonymous_enabled_; +} + +Status SaslHelper::EnablePlain() { + if (PREDICT_FALSE(!ContainsKey(GlobalMechs(), kSaslMechPlain))) { + LOG(DFATAL) << tag_ << ": Unable to find PLAIN SASL plugin"; + return Status::InvalidArgument("Unable to find PLAIN SASL plugin"); + } + AddToLocalMechList(kSaslMechPlain); + plain_enabled_ = true; + return Status::OK(); +} + +bool SaslHelper::IsPlainEnabled() const { + return plain_enabled_; +} + +Status SaslHelper::SanityCheckSaslCallId(int32_t call_id) const { + if (call_id != kSaslCallId) { + Status s = Status::IllegalState(StringPrintf("Non-SASL request during negotiation. 
" + "Expected callId: %d, received callId: %d", kSaslCallId, call_id)); + LOG(DFATAL) << tag_ << ": " << s.ToString(); + return s; + } + return Status::OK(); +} + +Status SaslHelper::ParseSaslMessage(const Slice& param_buf, SaslMessagePB* msg) { + if (!msg->ParseFromArray(param_buf.data(), param_buf.size())) { + return Status::IOError(tag_ + ": Invalid SASL message, missing fields", + msg->InitializationErrorString()); + } + return Status::OK(); +} + +Status SaslHelper::SendSaslMessage(Socket* sock, const MessageLite& header, const MessageLite& msg, + const MonoTime& deadline) { + DCHECK(sock != nullptr); + DCHECK(header.IsInitialized()) << tag_ << ": Header must be initialized"; + DCHECK(msg.IsInitialized()) << tag_ << ": Message must be initialized"; + + // Write connection header, if needed + if (PREDICT_FALSE(peer_type_ == CLIENT && !conn_header_exchanged_)) { + const uint8_t buflen = kMagicNumberLength + kHeaderFlagsLength; + uint8_t buf[buflen]; + serialization::SerializeConnHeader(buf); + size_t nsent; + RETURN_NOT_OK(sock->BlockingWrite(buf, buflen, &nsent, deadline)); + conn_header_exchanged_ = true; + } + + RETURN_NOT_OK(SendFramedMessageBlocking(sock, header, msg, deadline)); + return Status::OK(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/sasl_helper.h b/src/kudu/rpc/sasl_helper.h new file mode 100644 index 000000000000..aeb162b06fd4 --- /dev/null +++ b/src/kudu/rpc/sasl_helper.h @@ -0,0 +1,132 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_SASL_HELPER_H +#define KUDU_RPC_SASL_HELPER_H + +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/net/socket.h" + +namespace google { +namespace protobuf { +class MessageLite; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class MonoTime; +class Sockaddr; +class Status; + +namespace rpc { + +using std::string; + +class SaslMessagePB; + +// Helper class which contains functionality that is common to SaslClient & SaslServer. +// Most of these methods are convenience methods for interacting with the libsasl2 library. +class SaslHelper { + public: + enum PeerType { + CLIENT, + SERVER + }; + + explicit SaslHelper(PeerType peer_type); + ~SaslHelper(); + + // Specify IP:port of local side of connection. + void set_local_addr(const Sockaddr& addr); + const char* local_addr_string() const; + + // Specify IP:port of remote side of connection. + void set_remote_addr(const Sockaddr& addr); + const char* remote_addr_string() const; + + // Specify the fully-qualified domain name of the remote server. + void set_server_fqdn(const string& domain_name); + const char* server_fqdn() const; + + // Globally-registered available SASL plugins. + const std::set& GlobalMechs() const; + + // Helper functions for managing the list of active SASL mechanisms. 
+ void AddToLocalMechList(const string& mech); + const std::set& LocalMechs() const; + + // Returns space-delimited local mechanism list string suitable for passing + // to libsasl2, such as via "mech_list" callbacks. + // The returned pointer is valid only until the next call to LocalMechListString(). + const char* LocalMechListString() const; + + // Implements the client_mech_list / mech_list callbacks. + int GetOptionCb(const char* plugin_name, const char* option, const char** result, unsigned* len); + + // Enable the ANONYMOUS SASL mechanism. + Status EnableAnonymous(); + + // Check for the ANONYMOUS SASL mechanism. + bool IsAnonymousEnabled() const; + + // Enable the PLAIN SASL mechanism. + Status EnablePlain(); + + // Check for the PLAIN SASL mechanism. + bool IsPlainEnabled() const; + + // Sanity check that the call ID is the SASL call ID. + // Logs DFATAL if call_id does not match. + Status SanityCheckSaslCallId(int32_t call_id) const; + + // Parse msg from the given Slice. + Status ParseSaslMessage(const Slice& param_buf, SaslMessagePB* msg); + + // Encode and send a message over a socket, sending the connection header if necessary. + Status SendSaslMessage(Socket* sock, const google::protobuf::MessageLite& header, + const google::protobuf::MessageLite& msg, const MonoTime& deadline); + + private: + string local_addr_; + string remote_addr_; + string server_fqdn_; + + // Authentication types and data. + const PeerType peer_type_; + bool conn_header_exchanged_; + string tag_; + mutable gscoped_ptr< std::set > global_mechs_; // Cache of global mechanisms. + std::set mechs_; // Active mechanisms. + mutable string mech_list_; // Mechanism list string returned by callbacks. 
+ + bool anonymous_enabled_; + bool plain_enabled_; + + DISALLOW_COPY_AND_ASSIGN(SaslHelper); +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_SASL_HELPER_H diff --git a/src/kudu/rpc/sasl_rpc-test.cc b/src/kudu/rpc/sasl_rpc-test.cc new file mode 100644 index 000000000000..1e46380da226 --- /dev/null +++ b/src/kudu/rpc/sasl_rpc-test.cc @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/rpc-test-base.h" + +#include + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/auth_store.h" +#include "kudu/rpc/sasl_client.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/sasl_server.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" + +using std::string; + +namespace kudu { +namespace rpc { + +class TestSaslRpc : public RpcTestBase { + public: + virtual void SetUp() OVERRIDE { + RpcTestBase::SetUp(); + ASSERT_OK(SaslInit(kSaslAppName)); + } +}; + +// Test basic initialization of the objects. 
+TEST_F(TestSaslRpc, TestBasicInit) { + SaslServer server(kSaslAppName, -1); + ASSERT_OK(server.Init(kSaslAppName)); + SaslClient client(kSaslAppName, -1); + ASSERT_OK(client.Init(kSaslAppName)); +} + +// A "Callable" that takes a Socket* param, for use with starting a thread. +// Can be used for SaslServer or SaslClient threads. +typedef void (*socket_callable_t)(Socket*); + +// Call Accept() on the socket, then pass the connection to the server runner +static void RunAcceptingDelegator(Socket* acceptor, socket_callable_t server_runner) { + Socket conn; + Sockaddr remote; + CHECK_OK(acceptor->Accept(&conn, &remote, 0)); + server_runner(&conn); +} + +// Set up a socket and run a SASL negotiation. +static void RunNegotiationTest(socket_callable_t server_runner, socket_callable_t client_runner) { + Socket server_sock; + CHECK_OK(server_sock.Init(0)); + ASSERT_OK(server_sock.BindAndListen(Sockaddr(), 1)); + Sockaddr server_bind_addr; + ASSERT_OK(server_sock.GetSocketAddress(&server_bind_addr)); + boost::thread server(RunAcceptingDelegator, &server_sock, server_runner); + + Socket client_sock; + CHECK_OK(client_sock.Init(0)); + ASSERT_OK(client_sock.Connect(server_bind_addr)); + boost::thread client(client_runner, &client_sock); + + LOG(INFO) << "Waiting for test threads to terminate..."; + client.join(); + LOG(INFO) << "Client thread terminated."; + server.join(); + LOG(INFO) << "Server thread terminated."; +} + +//////////////////////////////////////////////////////////////////////////////// + +static void RunAnonNegotiationServer(Socket* conn) { + SaslServer sasl_server(kSaslAppName, conn->GetFd()); + CHECK_OK(sasl_server.Init(kSaslAppName)); + CHECK_OK(sasl_server.EnableAnonymous()); + CHECK_OK(sasl_server.Negotiate()); +} + +static void RunAnonNegotiationClient(Socket* conn) { + SaslClient sasl_client(kSaslAppName, conn->GetFd()); + CHECK_OK(sasl_client.Init(kSaslAppName)); + CHECK_OK(sasl_client.EnableAnonymous()); + CHECK_OK(sasl_client.Negotiate()); +} + +// 
Test SASL negotiation using the ANONYMOUS mechanism over a socket. +TEST_F(TestSaslRpc, TestAnonNegotiation) { + RunNegotiationTest(RunAnonNegotiationServer, RunAnonNegotiationClient); +} + +//////////////////////////////////////////////////////////////////////////////// + +static void RunPlainNegotiationServer(Socket* conn) { + SaslServer sasl_server(kSaslAppName, conn->GetFd()); + gscoped_ptr authstore(new AuthStore()); + CHECK_OK(authstore->Add("danger", "burrito")); + CHECK_OK(sasl_server.Init(kSaslAppName)); + CHECK_OK(sasl_server.EnablePlain(authstore.Pass())); + CHECK_OK(sasl_server.Negotiate()); +} + +static void RunPlainNegotiationClient(Socket* conn) { + SaslClient sasl_client(kSaslAppName, conn->GetFd()); + CHECK_OK(sasl_client.Init(kSaslAppName)); + CHECK_OK(sasl_client.EnablePlain("danger", "burrito")); + CHECK_OK(sasl_client.Negotiate()); +} + +// Test SASL negotiation using the PLAIN mechanism over a socket. +TEST_F(TestSaslRpc, TestPlainNegotiation) { + RunNegotiationTest(RunPlainNegotiationServer, RunPlainNegotiationClient); +} + +//////////////////////////////////////////////////////////////////////////////// + +static void RunPlainFailingNegotiationServer(Socket* conn) { + SaslServer sasl_server(kSaslAppName, conn->GetFd()); + gscoped_ptr authstore(new AuthStore()); + CHECK_OK(authstore->Add("danger", "burrito")); + CHECK_OK(sasl_server.Init(kSaslAppName)); + CHECK_OK(sasl_server.EnablePlain(authstore.Pass())); + Status s = sasl_server.Negotiate(); + ASSERT_TRUE(s.IsNotAuthorized()) << "Expected auth failure! Got: " << s.ToString(); +} + +static void RunPlainFailingNegotiationClient(Socket* conn) { + SaslClient sasl_client(kSaslAppName, conn->GetFd()); + CHECK_OK(sasl_client.Init(kSaslAppName)); + CHECK_OK(sasl_client.EnablePlain("unknown", "burrito")); + Status s = sasl_client.Negotiate(); + ASSERT_TRUE(s.IsNotAuthorized()) << "Expected auth failure! Got: " << s.ToString(); +} + +// Test SASL negotiation using the PLAIN mechanism over a socket. 
+TEST_F(TestSaslRpc, TestPlainFailingNegotiation) { + RunNegotiationTest(RunPlainFailingNegotiationServer, RunPlainFailingNegotiationClient); +} + +//////////////////////////////////////////////////////////////////////////////// + +static void RunTimeoutExpectingServer(Socket* conn) { + SaslServer sasl_server(kSaslAppName, conn->GetFd()); + CHECK_OK(sasl_server.Init(kSaslAppName)); + CHECK_OK(sasl_server.EnableAnonymous()); + Status s = sasl_server.Negotiate(); + ASSERT_TRUE(s.IsNetworkError()) << "Expected client to time out and close the connection. Got: " + << s.ToString(); +} + +static void RunTimeoutNegotiationClient(Socket* sock) { + SaslClient sasl_client(kSaslAppName, sock->GetFd()); + CHECK_OK(sasl_client.Init(kSaslAppName)); + CHECK_OK(sasl_client.EnableAnonymous()); + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(-100L)); + sasl_client.set_deadline(deadline); + Status s = sasl_client.Negotiate(); + ASSERT_TRUE(s.IsTimedOut()) << "Expected timeout! Got: " << s.ToString(); + CHECK_OK(sock->Shutdown(true, true)); +} + +// Ensure that the client times out. +TEST_F(TestSaslRpc, TestClientTimeout) { + RunNegotiationTest(RunTimeoutExpectingServer, RunTimeoutNegotiationClient); +} + +//////////////////////////////////////////////////////////////////////////////// + +static void RunTimeoutNegotiationServer(Socket* sock) { + SaslServer sasl_server(kSaslAppName, sock->GetFd()); + CHECK_OK(sasl_server.Init(kSaslAppName)); + CHECK_OK(sasl_server.EnableAnonymous()); + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(-100L)); + sasl_server.set_deadline(deadline); + Status s = sasl_server.Negotiate(); + ASSERT_TRUE(s.IsTimedOut()) << "Expected timeout! 
Got: " << s.ToString(); + CHECK_OK(sock->Close()); +} + +static void RunTimeoutExpectingClient(Socket* conn) { + SaslClient sasl_client(kSaslAppName, conn->GetFd()); + CHECK_OK(sasl_client.Init(kSaslAppName)); + CHECK_OK(sasl_client.EnableAnonymous()); + Status s = sasl_client.Negotiate(); + ASSERT_TRUE(s.IsNetworkError()) << "Expected server to time out and close the connection. Got: " + << s.ToString(); +} + +// Ensure that the server times out. +TEST_F(TestSaslRpc, TestServerTimeout) { + RunNegotiationTest(RunTimeoutNegotiationServer, RunTimeoutExpectingClient); +} + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/sasl_server.cc b/src/kudu/rpc/sasl_server.cc new file mode 100644 index 000000000000..308d66b537fc --- /dev/null +++ b/src/kudu/rpc/sasl_server.cc @@ -0,0 +1,447 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/sasl_server.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/rpc/blocking_ops.h" +#include "kudu/rpc/auth_store.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/serialization.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace rpc { + +static int SaslServerGetoptCb(void* sasl_server, const char* plugin_name, const char* option, + const char** result, unsigned* len) { + return static_cast(sasl_server) + ->GetOptionCb(plugin_name, option, result, len); +} + +static int SaslServerPlainAuthCb(sasl_conn_t *conn, void *sasl_server, const char *user, + const char *pass, unsigned passlen, struct propctx *propctx) { + return static_cast(sasl_server) + ->PlainAuthCb(conn, user, pass, passlen, propctx); +} + +SaslServer::SaslServer(string app_name, int fd) + : app_name_(std::move(app_name)), + sock_(fd), + helper_(SaslHelper::SERVER), + server_state_(SaslNegotiationState::NEW), + negotiated_mech_(SaslMechanism::INVALID), + deadline_(MonoTime::Max()) { + callbacks_.push_back(SaslBuildCallback(SASL_CB_GETOPT, + reinterpret_cast(&SaslServerGetoptCb), this)); + callbacks_.push_back(SaslBuildCallback(SASL_CB_SERVER_USERDB_CHECKPASS, + reinterpret_cast(&SaslServerPlainAuthCb), this)); + callbacks_.push_back(SaslBuildCallback(SASL_CB_LIST_END, nullptr, nullptr)); +} + +SaslServer::~SaslServer() { + sock_.Release(); // Do not close the underlying socket when this object is destroyed. 
+} + +Status SaslServer::EnableAnonymous() { + DCHECK_EQ(server_state_, SaslNegotiationState::INITIALIZED); + return helper_.EnableAnonymous(); +} + +Status SaslServer::EnablePlain(gscoped_ptr authstore) { + DCHECK_EQ(server_state_, SaslNegotiationState::INITIALIZED); + RETURN_NOT_OK(helper_.EnablePlain()); + authstore_.swap(authstore); + return Status::OK(); +} + +SaslMechanism::Type SaslServer::negotiated_mechanism() const { + DCHECK_EQ(server_state_, SaslNegotiationState::NEGOTIATED); + return negotiated_mech_; +} + +const std::string& SaslServer::plain_auth_user() const { + DCHECK_EQ(server_state_, SaslNegotiationState::NEGOTIATED); + DCHECK_EQ(negotiated_mech_, SaslMechanism::PLAIN); + return plain_auth_user_; +} + +void SaslServer::set_local_addr(const Sockaddr& addr) { + DCHECK_EQ(server_state_, SaslNegotiationState::NEW); + helper_.set_local_addr(addr); +} + +void SaslServer::set_remote_addr(const Sockaddr& addr) { + DCHECK_EQ(server_state_, SaslNegotiationState::NEW); + helper_.set_remote_addr(addr); +} + +void SaslServer::set_server_fqdn(const string& domain_name) { + DCHECK_EQ(server_state_, SaslNegotiationState::NEW); + helper_.set_server_fqdn(domain_name); +} + +void SaslServer::set_deadline(const MonoTime& deadline) { + DCHECK_NE(server_state_, SaslNegotiationState::NEGOTIATED); + deadline_ = deadline; +} + +// calls sasl_server_init() and sasl_server_new() +Status SaslServer::Init(const string& service_type) { + RETURN_NOT_OK(SaslInit(app_name_.c_str())); + + // Ensure we are not called more than once. + if (server_state_ != SaslNegotiationState::NEW) { + return Status::IllegalState("Init() may only be called once per SaslServer object."); + } + + // TODO: Support security flags. + unsigned secflags = 0; + + sasl_conn_t* sasl_conn = nullptr; + int result = sasl_server_new( + service_type.c_str(), // Registered name of the service using SASL. Required. + helper_.server_fqdn(), // The fully qualified domain name of this server. 
+ nullptr, // Permits multiple user realms on server. NULL == use default. + helper_.local_addr_string(), // Local and remote IP address strings. (NULL disables + helper_.remote_addr_string(), // mechanisms which require this info.) + &callbacks_[0], // Connection-specific callbacks. + secflags, // Security flags. + &sasl_conn); + + if (PREDICT_FALSE(result != SASL_OK)) { + return Status::RuntimeError("Unable to create new SASL server", + SaslErrDesc(result, sasl_conn_.get())); + } + sasl_conn_.reset(sasl_conn); + + server_state_ = SaslNegotiationState::INITIALIZED; + return Status::OK(); +} + +Status SaslServer::Negotiate() { + DVLOG(4) << "Called SaslServer::Negotiate()"; + + // Ensure we are called exactly once, and in the right order. + if (server_state_ == SaslNegotiationState::NEW) { + return Status::IllegalState("SaslServer: Init() must be called before calling Negotiate()"); + } else if (server_state_ == SaslNegotiationState::NEGOTIATED) { + return Status::IllegalState("SaslServer: Negotiate() may only be called once per object."); + } + + // Ensure we can use blocking calls on the socket during negotiation. + RETURN_NOT_OK(EnsureBlockingMode(&sock_)); + + faststring recv_buf; + + // Read connection header + RETURN_NOT_OK(ValidateConnectionHeader(&recv_buf)); + + nego_ok_ = false; + while (!nego_ok_) { + TRACE("Waiting for next SASL message..."); + RequestHeader header; + Slice param_buf; + RETURN_NOT_OK(ReceiveFramedMessageBlocking(&sock_, &recv_buf, &header, ¶m_buf, deadline_)); + + SaslMessagePB request; + RETURN_NOT_OK(ParseSaslMsgRequest(header, param_buf, &request)); + + switch (request.state()) { + // NEGOTIATE: They want a list of available mechanisms. + case SaslMessagePB::NEGOTIATE: + RETURN_NOT_OK(HandleNegotiateRequest(request)); + break; + + // INITIATE: They want to initiate negotiation based on their specified mechanism. 
+ case SaslMessagePB::INITIATE: + RETURN_NOT_OK(HandleInitiateRequest(request)); + break; + + // RESPONSE: Client sent a new request as a follow-up to a CHALLENGE response. + case SaslMessagePB::RESPONSE: + RETURN_NOT_OK(HandleResponseRequest(request)); + break; + + // Client sent us some unsupported SASL request. + default: { + TRACE("SASL Server: Received unsupported request from client"); + Status s = Status::InvalidArgument("RPC server doesn't support SASL state in request", + SaslMessagePB::SaslState_Name(request.state())); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + } + } + + TRACE("SASL Server: Successful negotiation"); + server_state_ = SaslNegotiationState::NEGOTIATED; + return Status::OK(); +} + +Status SaslServer::ValidateConnectionHeader(faststring* recv_buf) { + TRACE("Waiting for connection header"); + size_t num_read; + const size_t conn_header_len = kMagicNumberLength + kHeaderFlagsLength; + recv_buf->resize(conn_header_len); + RETURN_NOT_OK(sock_.BlockingRecv(recv_buf->data(), conn_header_len, &num_read, deadline_)); + DCHECK_EQ(conn_header_len, num_read); + + RETURN_NOT_OK(serialization::ValidateConnHeader(*recv_buf)); + TRACE("Connection header received"); + return Status::OK(); +} + +Status SaslServer::ParseSaslMsgRequest(const RequestHeader& header, const Slice& param_buf, + SaslMessagePB* request) { + Status s = helper_.SanityCheckSaslCallId(header.call_id()); + if (!s.ok()) { + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_INVALID_RPC_HEADER, s)); + } + + s = helper_.ParseSaslMessage(param_buf, request); + if (!s.ok()) { + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_DESERIALIZING_REQUEST, s)); + return s; + } + + return Status::OK(); +} + +Status SaslServer::SendSaslMessage(const SaslMessagePB& msg) { + DCHECK_NE(server_state_, SaslNegotiationState::NEW) + << "Must not send SASL messages before calling Init()"; + DCHECK_NE(server_state_, SaslNegotiationState::NEGOTIATED) + << "Must not send 
SASL messages after Negotiate() succeeds"; + + // Create header with SASL-specific callId + ResponseHeader header; + header.set_call_id(kSaslCallId); + return helper_.SendSaslMessage(&sock_, header, msg, deadline_); +} + +Status SaslServer::SendSaslError(ErrorStatusPB::RpcErrorCodePB code, const Status& err) { + DCHECK_NE(server_state_, SaslNegotiationState::NEW) + << "Must not send SASL messages before calling Init()"; + DCHECK_NE(server_state_, SaslNegotiationState::NEGOTIATED) + << "Must not send SASL messages after Negotiate() succeeds"; + if (err.ok()) { + return Status::InvalidArgument("Cannot send error message using OK status"); + } + + // Create header with SASL-specific callId + ResponseHeader header; + header.set_call_id(kSaslCallId); + header.set_is_error(true); + + // Get RPC error code from Status object + ErrorStatusPB msg; + msg.set_code(code); + msg.set_message(err.ToString()); + + RETURN_NOT_OK(helper_.SendSaslMessage(&sock_, header, msg, deadline_)); + TRACE("Sent SASL error: $0", ErrorStatusPB::RpcErrorCodePB_Name(code)); + return Status::OK(); +} + +Status SaslServer::HandleNegotiateRequest(const SaslMessagePB& request) { + TRACE("SASL Server: Received NEGOTIATE request from client"); + + // Authentication mechanisms this server supports (i.e. plugins). 
+ set server_mechs = helper_.LocalMechs(); + if (PREDICT_FALSE(server_mechs.empty())) { + // This will happen if no mechanisms are enabled before calling Init() + Status s = Status::IllegalState("SASL server mechanism list is empty!"); + LOG(ERROR) << s.ToString(); + TRACE("SASL Server: Sending FATAL_UNAUTHORIZED response to client"); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + + RETURN_NOT_OK(SendNegotiateResponse(server_mechs)); + return Status::OK(); +} + +Status SaslServer::SendNegotiateResponse(const set& server_mechs) { + SaslMessagePB response; + response.set_state(SaslMessagePB::NEGOTIATE); + + for (const string& mech : server_mechs) { + SaslMessagePB::SaslAuth* auth = response.add_auths(); + + // The 'method' field is deprecated, but older versions of Kudu marked it 'required'. + // So, we have to set it to something to keep compatibility. At some point, we can + // consider removing it and breaking compatibility with Kudu <=0.6. + auth->set_method(""); + auth->set_mechanism(mech); + } + + RETURN_NOT_OK(SendSaslMessage(response)); + TRACE("Sent NEGOTIATE response"); + return Status::OK(); +} + + +Status SaslServer::HandleInitiateRequest(const SaslMessagePB& request) { + TRACE("SASL Server: Received INITIATE request from client"); + + if (request.auths_size() != 1) { + Status s = Status::NotAuthorized(StringPrintf( + "SASL INITIATE request must include exactly one SaslAuth section, found: %d", + request.auths_size())); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + + const SaslMessagePB::SaslAuth& auth = request.auths(0); + TRACE("SASL Server: Client requested to use mechanism: $0", auth.mechanism()); + + // Security issue to display this. Commented out but left for debugging purposes. 
+ //DVLOG(3) << "SASL server: Client token: " << request.token(); + + const char* server_out = nullptr; + uint32_t server_out_len = 0; + TRACE("SASL Server: Calling sasl_server_start()"); + int result = sasl_server_start( + sasl_conn_.get(), // The SASL connection context created by init() + auth.mechanism().c_str(), // The mechanism requested by the client. + request.token().c_str(), // Optional string the client gave us. + request.token().length(), // Client string len. + &server_out, // The output of the SASL library, might not be NULL terminated + &server_out_len); // Output len. + + if (PREDICT_FALSE(result != SASL_OK && result != SASL_CONTINUE)) { + Status s = Status::NotAuthorized("Unable to negotiate SASL connection", + SaslErrDesc(result, sasl_conn_.get())); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + negotiated_mech_ = SaslMechanism::value_of(auth.mechanism()); + + // We have a valid mechanism match + if (result == SASL_OK) { + nego_ok_ = true; + RETURN_NOT_OK(SendSuccessResponse(server_out, server_out_len)); + } else { // result == SASL_CONTINUE + RETURN_NOT_OK(SendChallengeResponse(server_out, server_out_len)); + } + return Status::OK(); +} + +Status SaslServer::SendChallengeResponse(const char* challenge, unsigned clen) { + if (clen < 1) { + Status s = Status::NotAuthorized("SASL library did not provide challenge token!"); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + + SaslMessagePB response; + response.set_state(SaslMessagePB::CHALLENGE); + response.mutable_token()->assign(challenge, clen); + TRACE("SASL Server: Sending CHALLENGE response to client"); + RETURN_NOT_OK(SendSaslMessage(response)); + return Status::OK(); +} + +Status SaslServer::SendSuccessResponse(const char* token, unsigned tlen) { + SaslMessagePB response; + response.set_state(SaslMessagePB::SUCCESS); + if (PREDICT_FALSE(tlen > 0)) { + response.mutable_token()->assign(token, tlen); + } + TRACE("SASL 
Server: Sending SUCCESS response to client"); + RETURN_NOT_OK(SendSaslMessage(response)); + return Status::OK(); +} + + +Status SaslServer::HandleResponseRequest(const SaslMessagePB& request) { + TRACE("SASL Server: Received RESPONSE request from client"); + + if (!request.has_token()) { + Status s = Status::InvalidArgument("No token in CHALLENGE RESPONSE from client"); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + + const char* server_out = nullptr; + uint32_t server_out_len = 0; + TRACE("SASL Server: Calling sasl_server_step()"); + int result = sasl_server_step( + sasl_conn_.get(), // The SASL connection context created by init() + request.token().c_str(), // Optional string the client gave us + request.token().length(), // Client string len + &server_out, // The output of the SASL library, might not be NULL terminated + &server_out_len); // Output len + + if (result != SASL_OK && result != SASL_CONTINUE) { + Status s = Status::NotAuthorized("Unable to negotiate SASL connection", + SaslErrDesc(result, sasl_conn_.get())); + RETURN_NOT_OK(SendSaslError(ErrorStatusPB::FATAL_UNAUTHORIZED, s)); + return s; + } + + SaslMessagePB msg; + if (result == SASL_OK) { + nego_ok_ = true; + RETURN_NOT_OK(SendSuccessResponse(server_out, server_out_len)); + } else { // result == SASL_CONTINUE + RETURN_NOT_OK(SendChallengeResponse(server_out, server_out_len)); + } + return Status::OK(); +} + +int SaslServer::GetOptionCb(const char* plugin_name, const char* option, + const char** result, unsigned* len) { + return helper_.GetOptionCb(plugin_name, option, result, len); +} + +int SaslServer::PlainAuthCb(sasl_conn_t *conn, const char *user, const char *pass, + unsigned passlen, struct propctx *propctx) { + TRACE("SASL Server: Checking PLAIN auth credentials"); + if (PREDICT_FALSE(!helper_.IsPlainEnabled())) { + LOG(DFATAL) << "Password authentication callback called while PLAIN auth disabled"; + return SASL_BADPARAM; + } + if 
(PREDICT_FALSE(!authstore_)) { + LOG(DFATAL) << "AuthStore not initialized"; + return SASL_FAIL; + } + Status s = authstore_->Authenticate(user, string(pass, passlen)); + TRACE("SASL Server: PLAIN user authentication status: $0", s.ToString()); + if (!s.ok()) { + LOG(INFO) << "Failed login for user: " << user; + return SASL_FAIL; + } + plain_auth_user_ = user; // Store username of authenticated user. + return SASL_OK; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/sasl_server.h b/src/kudu/rpc/sasl_server.h new file mode 100644 index 000000000000..e71e958e74c6 --- /dev/null +++ b/src/kudu/rpc/sasl_server.h @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_SASL_SERVER_H +#define KUDU_RPC_SASL_SERVER_H + +#include +#include +#include + +#include + +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/sasl_common.h" +#include "kudu/rpc/sasl_helper.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Slice; + +namespace rpc { + +using std::string; + +class AuthStore; + +// Class for doing SASL negotiation with a SaslClient over a bidirectional socket. 
+// Operations on this class are NOT thread-safe. +class SaslServer { + public: + // Does not take ownership of the socket indicated by the fd. + SaslServer(string app_name, int fd); + ~SaslServer(); + + // Enable ANONYMOUS authentication. + // Call after Init(). + Status EnableAnonymous(); + + // Enable PLAIN authentication. TODO: Support impersonation. + // Call after Init(). + Status EnablePlain(gscoped_ptr authstore); + + // Returns mechanism negotiated by this connection. + // Call after Negotiate(). + SaslMechanism::Type negotiated_mechanism() const; + + // Name of the user that authenticated using plain auth. + // Call after Negotiate() and only if the negotiated mechanism was PLAIN. + const std::string& plain_auth_user() const; + + // Specify IP:port of local side of connection. + // Call before Init(). Required for some mechanisms. + void set_local_addr(const Sockaddr& addr); + + // Specify IP:port of remote side of connection. + // Call before Init(). Required for some mechanisms. + void set_remote_addr(const Sockaddr& addr); + + // Specify the fully-qualified domain name of the remote server. + // Call before Init(). Required for some mechanisms. + void set_server_fqdn(const string& domain_name); + + // Set deadline for connection negotiation. + void set_deadline(const MonoTime& deadline); + + // Get deadline for connection negotiation. + const MonoTime& deadline() const { return deadline_; } + + // Initialize a new SASL server. Must be called before Negotiate(). + // Returns OK on success, otherwise RuntimeError. + Status Init(const string& service_type); + + // Begin negotiation with the SASL client on the other side of the fd socket + // that this server was constructed with. + // Returns OK on success. + // Otherwise, it may return NotAuthorized, NotSupported, or another non-OK status. + Status Negotiate(); + + // SASL callback for plugin options, supported mechanisms, etc. 
+ // Returns SASL_FAIL if the option is not handled, which does not fail the handshake. + int GetOptionCb(const char* plugin_name, const char* option, + const char** result, unsigned* len); + + // SASL callback for PLAIN authentication via SASL_CB_SERVER_USERDB_CHECKPASS. + int PlainAuthCb(sasl_conn_t* conn, const char* user, const char* pass, + unsigned passlen, struct propctx* propctx); + + private: + // Parse and validate connection header. + Status ValidateConnectionHeader(faststring* recv_buf); + + // Parse request body. If malformed, sends an error message to the client. + Status ParseSaslMsgRequest(const RequestHeader& header, const Slice& param_buf, + SaslMessagePB* request); + + // Encode and send the specified SASL message to the client. + Status SendSaslMessage(const SaslMessagePB& msg); + + // Encode and send the specified RPC error message to the client. + // Calls Status.ToString() for the embedded error message. + Status SendSaslError(ErrorStatusPB::RpcErrorCodePB code, const Status& err); + + // Handle case when client sends NEGOTIATE request. + Status HandleNegotiateRequest(const SaslMessagePB& request); + + // Send a NEGOTIATE response to the client with the list of available mechanisms. + Status SendNegotiateResponse(const std::set& server_mechs); + + // Handle case when client sends INITIATE request. + Status HandleInitiateRequest(const SaslMessagePB& request); + + // Send a CHALLENGE response to the client with a challenge token. + Status SendChallengeResponse(const char* challenge, unsigned clen); + + // Send a SUCCESS response to the client with an token (typically empty). + Status SendSuccessResponse(const char* token, unsigned tlen); + + // Handle case when client sends RESPONSE request. + Status HandleResponseRequest(const SaslMessagePB& request); + + string app_name_; + Socket sock_; + std::vector callbacks_; + gscoped_ptr sasl_conn_; + SaslHelper helper_; + + // Authentication store used for PLAIN authentication. 
+ gscoped_ptr authstore_; + + // The successfully-authenticated user, if applicable. + string plain_auth_user_; + + SaslNegotiationState::Type server_state_; + + // The mechanism we negotiated with the client. + SaslMechanism::Type negotiated_mech_; + + // Intra-negotiation state. + bool nego_ok_; // During negotiation: did we get a SASL_OK response from the SASL library? + + // Negotiation timeout deadline. + MonoTime deadline_; + + DISALLOW_COPY_AND_ASSIGN(SaslServer); +}; + +} // namespace rpc +} // namespace kudu + +#endif // KUDU_RPC_SASL_SERVER_H diff --git a/src/kudu/rpc/serialization.cc b/src/kudu/rpc/serialization.cc new file mode 100644 index 000000000000..3160c53719ec --- /dev/null +++ b/src/kudu/rpc/serialization.cc @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/serialization.h" + +#include +#include +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/rpc/constants.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +DECLARE_int32(rpc_max_message_size); + +using google::protobuf::MessageLite; +using google::protobuf::io::CodedInputStream; +using google::protobuf::io::CodedOutputStream; + +namespace kudu { +namespace rpc { +namespace serialization { + +enum { + kHeaderPosVersion = 0, + kHeaderPosServiceClass = 1, + kHeaderPosAuthProto = 2 +}; + +Status SerializeMessage(const MessageLite& message, faststring* param_buf, + int additional_size, bool use_cached_size) { + + if (PREDICT_FALSE(!message.IsInitialized())) { + return Status::InvalidArgument("RPC argument missing required fields", + message.InitializationErrorString()); + } + int pb_size = use_cached_size ? message.GetCachedSize() : message.ByteSize(); + DCHECK_EQ(message.ByteSize(), pb_size); + int recorded_size = pb_size + additional_size; + int size_with_delim = pb_size + CodedOutputStream::VarintSize32(recorded_size); + int total_size = size_with_delim + additional_size; + + if (total_size > FLAGS_rpc_max_message_size) { + LOG(DFATAL) << "Sending too long of an RPC message (" << total_size + << " bytes)"; + } + + param_buf->resize(size_with_delim); + uint8_t* dst = param_buf->data(); + dst = CodedOutputStream::WriteVarint32ToArray(recorded_size, dst); + dst = message.SerializeWithCachedSizesToArray(dst); + CHECK_EQ(dst, param_buf->data() + size_with_delim); + + return Status::OK(); +} + +Status SerializeHeader(const MessageLite& header, + size_t param_len, + faststring* header_buf) { + + if (PREDICT_FALSE(!header.IsInitialized())) { + LOG(DFATAL) << "Uninitialized RPC header"; + return Status::InvalidArgument("RPC header missing required fields", + header.InitializationErrorString()); + } + + // Compute all the lengths for the packet. 
+ size_t header_pb_len = header.ByteSize(); + size_t header_tot_len = kMsgLengthPrefixLength // Int prefix for the total length. + + CodedOutputStream::VarintSize32(header_pb_len) // Varint delimiter for header PB. + + header_pb_len; // Length for the header PB itself. + size_t total_size = header_tot_len + param_len; + + header_buf->resize(header_tot_len); + uint8_t* dst = header_buf->data(); + + // 1. The length for the whole request, not including the 4-byte + // length prefix. + NetworkByteOrder::Store32(dst, total_size - kMsgLengthPrefixLength); + dst += sizeof(uint32_t); + + // 2. The varint-prefixed RequestHeader PB + dst = CodedOutputStream::WriteVarint32ToArray(header_pb_len, dst); + dst = header.SerializeWithCachedSizesToArray(dst); + + // We should have used the whole buffer we allocated. + CHECK_EQ(dst, header_buf->data() + header_tot_len); + + return Status::OK(); +} + +Status ParseMessage(const Slice& buf, + MessageLite* parsed_header, + Slice* parsed_main_message) { + + // First grab the total length + if (PREDICT_FALSE(buf.size() < kMsgLengthPrefixLength)) { + return Status::Corruption("Invalid packet: not enough bytes for length header", + buf.ToDebugString()); + } + + int total_len = NetworkByteOrder::Load32(buf.data()); + DCHECK_EQ(total_len + kMsgLengthPrefixLength, buf.size()) + << "Got mis-sized buffer: " << buf.ToDebugString(); + + CodedInputStream in(buf.data(), buf.size()); + in.Skip(kMsgLengthPrefixLength); + + uint32_t header_len; + if (PREDICT_FALSE(!in.ReadVarint32(&header_len))) { + return Status::Corruption("Invalid packet: missing header delimiter", + buf.ToDebugString()); + } + + CodedInputStream::Limit l; + l = in.PushLimit(header_len); + if (PREDICT_FALSE(!parsed_header->ParseFromCodedStream(&in))) { + return Status::Corruption("Invalid packet: header too short", + buf.ToDebugString()); + } + in.PopLimit(l); + + uint32_t main_msg_len; + if (PREDICT_FALSE(!in.ReadVarint32(&main_msg_len))) { + return Status::Corruption("Invalid 
packet: missing main msg length", + buf.ToDebugString()); + } + + if (PREDICT_FALSE(!in.Skip(main_msg_len))) { + return Status::Corruption( + StringPrintf("Invalid packet: data too short, expected %d byte main_msg", main_msg_len), + buf.ToDebugString()); + } + + if (PREDICT_FALSE(in.BytesUntilLimit() > 0)) { + return Status::Corruption( + StringPrintf("Invalid packet: %d extra bytes at end of packet", in.BytesUntilLimit()), + buf.ToDebugString()); + } + + *parsed_main_message = Slice(buf.data() + buf.size() - main_msg_len, + main_msg_len); + return Status::OK(); +} + +void SerializeConnHeader(uint8_t* buf) { + memcpy(reinterpret_cast(buf), kMagicNumber, kMagicNumberLength); + buf += kMagicNumberLength; + buf[kHeaderPosVersion] = kCurrentRpcVersion; + buf[kHeaderPosServiceClass] = 0; // TODO: implement + buf[kHeaderPosAuthProto] = 0; // TODO: implement +} + +// validate the entire rpc header (magic number + flags) +Status ValidateConnHeader(const Slice& slice) { + DCHECK_EQ(kMagicNumberLength + kHeaderFlagsLength, slice.size()) + << "Invalid RPC header length"; + + // validate actual magic + if (!slice.starts_with(kMagicNumber)) { + return Status::InvalidArgument("Connection must begin with magic number", kMagicNumber); + } + + const uint8_t *data = slice.data(); + data += kMagicNumberLength; + + // validate version + if (data[kHeaderPosVersion] != kCurrentRpcVersion) { + return Status::InvalidArgument("Unsupported RPC version", + StringPrintf("Received: %d, Supported: %d", + data[kHeaderPosVersion], kCurrentRpcVersion)); + } + + // TODO: validate additional header flags: + // RPC_SERVICE_CLASS + // RPC_AUTH_PROTOCOL + + return Status::OK(); +} + +} // namespace serialization +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/serialization.h b/src/kudu/rpc/serialization.h new file mode 100644 index 000000000000..ed1275bf3464 --- /dev/null +++ b/src/kudu/rpc/serialization.h @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under 
one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_SERIALIZATION_H +#define KUDU_RPC_SERIALIZATION_H + +#include +#include + +namespace google { +namespace protobuf { +class MessageLite; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class Status; +class faststring; +class Slice; + +namespace rpc { +namespace serialization { + +// Serialize the request param into a buffer which is allocated by this function. +// Uses the message's cached size by calling MessageLite::GetCachedSize(). +// In : 'message' Protobuf Message to serialize +// 'additional_size' Optional argument which increases the recorded size +// within param_buf. This argument is necessary if there will be +// additional sidecars appended onto the message (that aren't part of +// the protobuf itself). +// 'use_cached_size' Additional optional argument whether to use the cached +// or explicit byte size by calling MessageLite::GetCachedSize() or +// MessageLite::ByteSize(), respectively. +// Out: The faststring 'param_buf' to be populated with the serialized bytes. 
+// The faststring's length is only determined by the amount that +// needs to be serialized for the protobuf (i.e., no additional space +// is reserved for 'additional_size', which only affects the +// size indicator prefix in 'param_buf'). +Status SerializeMessage(const google::protobuf::MessageLite& message, + faststring* param_buf, int additional_size = 0, + bool use_cached_size = false); + +// Serialize the request or response header into a buffer which is allocated +// by this function. +// Includes leading 32-bit length of the buffer. +// In: Protobuf Header to serialize, +// Length of the message param following this header in the frame. +// Out: faststring to be populated with the serialized bytes. +Status SerializeHeader(const google::protobuf::MessageLite& header, + size_t param_len, + faststring* header_buf); + +// Deserialize the request. +// In: data buffer Slice. +// Out: parsed_header PB initialized, +// parsed_main_message pointing to offset in original buffer containing +// the main payload. +Status ParseMessage(const Slice& buf, + google::protobuf::MessageLite* parsed_header, + Slice* parsed_main_message); + +// Serialize the RPC connection header (magic number + flags). +// buf must have 7 bytes available (kMagicNumberLength + kHeaderFlagsLength). +void SerializeConnHeader(uint8_t* buf); + +// Validate the entire rpc header (magic number + flags). +Status ValidateConnHeader(const Slice& slice); + + +} // namespace serialization +} // namespace rpc +} // namespace kudu +#endif // KUDU_RPC_SERIALIZATION_H diff --git a/src/kudu/rpc/service_if.cc b/src/kudu/rpc/service_if.cc new file mode 100644 index 000000000000..7377cea65d53 --- /dev/null +++ b/src/kudu/rpc/service_if.cc @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/service_if.h" + +#include + +#include "kudu/gutil/strings/substitute.h" + +#include "kudu/rpc/connection.h" +#include "kudu/rpc/inbound_call.h" +#include "kudu/rpc/rpc_header.pb.h" + +using std::string; +using strings::Substitute; + +namespace kudu { +namespace rpc { + +RpcMethodMetrics::RpcMethodMetrics() + : handler_latency(nullptr) { +} + +RpcMethodMetrics::~RpcMethodMetrics() { +} + +ServiceIf::~ServiceIf() { +} + +void ServiceIf::Shutdown() { +} + +bool ServiceIf::ParseParam(InboundCall *call, google::protobuf::Message *message) { + Slice param(call->serialized_request()); + if (PREDICT_FALSE(!message->ParseFromArray(param.data(), param.size()))) { + string err = Substitute("Invalid parameter for call $0: $1", + call->remote_method().ToString(), + message->InitializationErrorString().c_str()); + LOG(WARNING) << err; + call->RespondFailure(ErrorStatusPB::ERROR_INVALID_REQUEST, + Status::InvalidArgument(err)); + return false; + } + return true; +} + +void ServiceIf::RespondBadMethod(InboundCall *call) { + Sockaddr local_addr, remote_addr; + + CHECK_OK(call->connection()->socket()->GetSocketAddress(&local_addr)); + CHECK_OK(call->connection()->socket()->GetPeerAddress(&remote_addr)); + string err = Substitute("Call on service $0 received at $1 from $2 with an " + "invalid method name: $3", + call->remote_method().service_name(), + local_addr.ToString(), + 
remote_addr.ToString(), + call->remote_method().method_name()); + LOG(WARNING) << err; + call->RespondFailure(ErrorStatusPB::ERROR_NO_SUCH_METHOD, + Status::InvalidArgument(err)); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/service_if.h b/src/kudu/rpc/service_if.h new file mode 100644 index 000000000000..02244207c52c --- /dev/null +++ b/src/kudu/rpc/service_if.h @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_RPC_SERVICE_IF_H +#define KUDU_RPC_SERVICE_IF_H + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/metrics.h" +#include "kudu/util/net/sockaddr.h" + +namespace google { +namespace protobuf { +class Message; +} +} + +namespace kudu { + +class Histogram; + +namespace rpc { + +class InboundCall; + +struct RpcMethodMetrics { + RpcMethodMetrics(); + ~RpcMethodMetrics(); + + scoped_refptr handler_latency; +}; + +// Handles incoming messages that initiate an RPC. 
+class ServiceIf { + public: + virtual ~ServiceIf(); + virtual void Handle(InboundCall* incoming) = 0; + virtual void Shutdown(); + virtual std::string service_name() const = 0; + + protected: + bool ParseParam(InboundCall* call, google::protobuf::Message* message); + void RespondBadMethod(InboundCall* call); + +}; + +} // namespace rpc +} // namespace kudu +#endif diff --git a/src/kudu/rpc/service_pool.cc b/src/kudu/rpc/service_pool.cc new file mode 100644 index 000000000000..8a10b6fa6a16 --- /dev/null +++ b/src/kudu/rpc/service_pool.cc @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/rpc/service_pool.h" + +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/inbound_call.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/service_if.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" +#include "kudu/util/thread.h" +#include "kudu/util/trace.h" + +using std::shared_ptr; +using strings::Substitute; + +METRIC_DEFINE_histogram(server, rpc_incoming_queue_time, + "RPC Queue Time", + kudu::MetricUnit::kMicroseconds, + "Number of microseconds incoming RPC requests spend in the worker queue", + 60000000LU, 3); + +METRIC_DEFINE_counter(server, rpcs_timed_out_in_queue, + "RPC Queue Timeouts", + kudu::MetricUnit::kRequests, + "Number of RPCs whose timeout elapsed while waiting " + "in the service queue, and thus were not processed."); + +METRIC_DEFINE_counter(server, rpcs_queue_overflow, + "RPC Queue Overflows", + kudu::MetricUnit::kRequests, + "Number of RPCs dropped because the service queue " + "was full."); + +namespace kudu { +namespace rpc { + +ServicePool::ServicePool(gscoped_ptr service, + const scoped_refptr& entity, + size_t service_queue_length) + : service_(service.Pass()), + service_queue_(service_queue_length), + incoming_queue_time_(METRIC_rpc_incoming_queue_time.Instantiate(entity)), + rpcs_timed_out_in_queue_(METRIC_rpcs_timed_out_in_queue.Instantiate(entity)), + rpcs_queue_overflow_(METRIC_rpcs_queue_overflow.Instantiate(entity)), + closing_(false) { +} + +ServicePool::~ServicePool() { + Shutdown(); +} + +Status ServicePool::Init(int num_threads) { + for (int i = 0; i < num_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("service pool", "rpc worker", + &ServicePool::RunThread, this, &new_thread)); + threads_.push_back(new_thread); + } + return Status::OK(); +} + +void ServicePool::Shutdown() { + service_queue_.Shutdown(); + + MutexLock 
lock(shutdown_lock_); + if (closing_) return; + closing_ = true; + // TODO: Use a proper thread pool implementation. + for (scoped_refptr& thread : threads_) { + CHECK_OK(ThreadJoiner(thread.get()).Join()); + } + + // Now we must drain the service queue. + Status status = Status::ServiceUnavailable("Service is shutting down"); + gscoped_ptr incoming; + while (service_queue_.BlockingGet(&incoming)) { + incoming.release()->RespondFailure(ErrorStatusPB::FATAL_SERVER_SHUTTING_DOWN, status); + } + + service_->Shutdown(); +} + +Status ServicePool::QueueInboundCall(gscoped_ptr call) { + InboundCall* c = call.release(); + + TRACE_TO(c->trace(), "Inserting onto call queue"); + // Queue message on service queue + QueueStatus queue_status = service_queue_.Put(c); + if (PREDICT_TRUE(queue_status == QUEUE_SUCCESS)) { + // NB: do not do anything with 'c' after it is successfully queued -- + // a service thread may have already dequeued it, processed it, and + // responded by this point, in which case the pointer would be invalid. + return Status::OK(); + } + + Status status = Status::OK(); + if (queue_status == QUEUE_FULL) { + string err_msg = + Substitute("$0 request on $1 from $2 dropped due to backpressure. 
" + "The service queue is full; it has $3 items.", + c->remote_method().method_name(), + service_->service_name(), + c->remote_address().ToString(), + service_queue_.max_size()); + status = Status::ServiceUnavailable(err_msg); + rpcs_queue_overflow_->Increment(); + c->RespondFailure(ErrorStatusPB::ERROR_SERVER_TOO_BUSY, status); + DLOG(INFO) << err_msg << " Contents of service queue:\n" + << service_queue_.ToString(); + } else if (queue_status == QUEUE_SHUTDOWN) { + status = Status::ServiceUnavailable("Service is shutting down"); + c->RespondFailure(ErrorStatusPB::FATAL_SERVER_SHUTTING_DOWN, status); + } else { + status = Status::RuntimeError(Substitute("Unknown error from BlockingQueue: $0", queue_status)); + c->RespondFailure(ErrorStatusPB::FATAL_UNKNOWN, status); + } + return status; +} + +void ServicePool::RunThread() { + while (true) { + gscoped_ptr incoming; + if (!service_queue_.BlockingGet(&incoming)) { + VLOG(1) << "ServicePool: messenger shutting down."; + return; + } + + incoming->RecordHandlingStarted(incoming_queue_time_); + ADOPT_TRACE(incoming->trace()); + + if (PREDICT_FALSE(incoming->ClientTimedOut())) { + TRACE_TO(incoming->trace(), "Skipping call since client already timed out"); + rpcs_timed_out_in_queue_->Increment(); + + // Respond as a failure, even though the client will probably ignore + // the response anyway. + incoming->RespondFailure( + ErrorStatusPB::ERROR_SERVER_TOO_BUSY, + Status::TimedOut("Call waited in the queue past client deadline")); + + // Must release since RespondFailure above ends up taking ownership + // of the object. + ignore_result(incoming.release()); + continue; + } + + TRACE_TO(incoming->trace(), "Handling call"); + + // Release the InboundCall pointer -- when the call is responded to, + // it will get deleted at that point. 
+ service_->Handle(incoming.release()); + } +} + +const string ServicePool::service_name() const { + return service_->service_name(); +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/service_pool.h b/src/kudu/rpc/service_pool.h new file mode 100644 index 000000000000..54e76e058e0f --- /dev/null +++ b/src/kudu/rpc/service_pool.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_SERVICE_POOL_H +#define KUDU_SERVICE_POOL_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/rpc_service.h" +#include "kudu/util/blocking_queue.h" +#include "kudu/util/mutex.h" +#include "kudu/util/thread.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Counter; +class Histogram; +class MetricEntity; +class Socket; + +namespace rpc { + +class Messenger; +class ServiceIf; + +// A pool of threads that handle new incoming RPC calls. +// Also includes a queue that calls get pushed onto for handling by the pool. 
+class ServicePool : public RpcService { + public: + ServicePool(gscoped_ptr service, + const scoped_refptr& metric_entity, + size_t service_queue_length); + virtual ~ServicePool(); + + // Start up the thread pool. + virtual Status Init(int num_threads); + + // Shut down the queue and the thread pool. + virtual void Shutdown(); + + virtual Status QueueInboundCall(gscoped_ptr call) OVERRIDE; + + const Counter* RpcsTimedOutInQueueMetricForTests() const { + return rpcs_timed_out_in_queue_.get(); + } + + const Counter* RpcsQueueOverflowMetric() const { + return rpcs_queue_overflow_.get(); + } + + const std::string service_name() const; + + private: + void RunThread(); + gscoped_ptr service_; + std::vector > threads_; + BlockingQueue service_queue_; + scoped_refptr incoming_queue_time_; + scoped_refptr rpcs_timed_out_in_queue_; + scoped_refptr rpcs_queue_overflow_; + + mutable Mutex shutdown_lock_; + bool closing_; + + DISALLOW_COPY_AND_ASSIGN(ServicePool); +}; + +} // namespace rpc +} // namespace kudu + +#endif diff --git a/src/kudu/rpc/transfer.cc b/src/kudu/rpc/transfer.cc new file mode 100644 index 000000000000..b1807aef96d1 --- /dev/null +++ b/src/kudu/rpc/transfer.cc @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/rpc/transfer.h" + +#include + +#include +#include + +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/constants.h" +#include "kudu/rpc/messenger.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/net/socket.h" + +DEFINE_int32(rpc_max_message_size, (8 * 1024 * 1024), + "The maximum size of a message that any RPC that the server will accept."); +TAG_FLAG(rpc_max_message_size, advanced); +TAG_FLAG(rpc_max_message_size, runtime); + +namespace kudu { +namespace rpc { + +using std::ostringstream; +using std::string; + +#define RETURN_ON_ERROR_OR_SOCKET_NOT_READY(status) \ + if (PREDICT_FALSE(!status.ok())) { \ + if (Socket::IsTemporarySocketError(status.posix_code())) { \ + return Status::OK(); /* EAGAIN, etc. */ \ + } \ + return status; \ + } + +TransferCallbacks::~TransferCallbacks() +{} + +InboundTransfer::InboundTransfer() + : total_length_(kMsgLengthPrefixLength), + cur_offset_(0) { + buf_.resize(kMsgLengthPrefixLength); +} + +Status InboundTransfer::ReceiveBuffer(Socket &socket) { + if (cur_offset_ < kMsgLengthPrefixLength) { + // receive int32 length prefix + int32_t rem = kMsgLengthPrefixLength - cur_offset_; + int32_t nread; + Status status = socket.Recv(&buf_[cur_offset_], rem, &nread); + RETURN_ON_ERROR_OR_SOCKET_NOT_READY(status); + if (nread == 0) { + return Status::OK(); + } + DCHECK_GE(nread, 0); + cur_offset_ += nread; + if (cur_offset_ < kMsgLengthPrefixLength) { + // If we still don't have the full length prefix, we can't continue + // reading yet. + return Status::OK(); + } + // Since we only read 'rem' bytes above, we should now have exactly + // the length prefix in our buffer and no more. 
+ DCHECK_EQ(cur_offset_, kMsgLengthPrefixLength); + + // The length prefix doesn't include its own 4 bytes, so we have to + // add that back in. + total_length_ = NetworkByteOrder::Load32(&buf_[0]) + kMsgLengthPrefixLength; + if (total_length_ > FLAGS_rpc_max_message_size) { + return Status::NetworkError(StringPrintf("the frame had a " + "length of %d, but we only support messages up to %d bytes " + "long.", total_length_, FLAGS_rpc_max_message_size)); + } + if (total_length_ <= kMsgLengthPrefixLength) { + return Status::NetworkError(StringPrintf("the frame had a " + "length of %d, which is invalid", total_length_)); + } + buf_.resize(total_length_); + + // Fall through to receive the message body, which is likely to be already + // available on the socket. + } + + // receive message body + int32_t nread; + int32_t rem = total_length_ - cur_offset_; + Status status = socket.Recv(&buf_[cur_offset_], rem, &nread); + RETURN_ON_ERROR_OR_SOCKET_NOT_READY(status); + cur_offset_ += nread; + + return Status::OK(); +} + +bool InboundTransfer::TransferStarted() const { + return cur_offset_ != 0; +} + +bool InboundTransfer::TransferFinished() const { + return cur_offset_ == total_length_; +} + +string InboundTransfer::StatusAsString() const { + return strings::Substitute("$0/$1 bytes received", cur_offset_, total_length_); +} + +OutboundTransfer::OutboundTransfer(const std::vector &payload, + TransferCallbacks *callbacks) + : cur_slice_idx_(0), + cur_offset_in_slice_(0), + callbacks_(callbacks), + aborted_(false) { + CHECK(!payload.empty()); + + n_payload_slices_ = payload.size(); + CHECK_LE(n_payload_slices_, arraysize(payload_slices_)); + for (int i = 0; i < payload.size(); i++) { + payload_slices_[i] = payload[i]; + } +} + +OutboundTransfer::~OutboundTransfer() { + if (!TransferFinished() && !aborted_) { + callbacks_->NotifyTransferAborted( + Status::RuntimeError("RPC transfer destroyed before it finished sending")); + } +} + +void OutboundTransfer::Abort(const Status 
&status) { + CHECK(!aborted_) << "Already aborted"; + CHECK(!TransferFinished()) << "Cannot abort a finished transfer"; + callbacks_->NotifyTransferAborted(status); + aborted_ = true; +} + +Status OutboundTransfer::SendBuffer(Socket &socket) { + CHECK_LT(cur_slice_idx_, n_payload_slices_); + + int n_iovecs = n_payload_slices_ - cur_slice_idx_; + struct iovec iovec[n_iovecs]; + { + int offset_in_slice = cur_offset_in_slice_; + for (int i = 0; i < n_iovecs; i++) { + Slice &slice = payload_slices_[cur_slice_idx_ + i]; + iovec[i].iov_base = slice.mutable_data() + offset_in_slice; + iovec[i].iov_len = slice.size() - offset_in_slice; + + offset_in_slice = 0; + } + } + + int32_t written; + Status status = socket.Writev(iovec, n_iovecs, &written); + RETURN_ON_ERROR_OR_SOCKET_NOT_READY(status); + + // Adjust our accounting of current writer position. + for (int i = cur_slice_idx_; i < n_payload_slices_; i++) { + Slice &slice = payload_slices_[i]; + int rem_in_slice = slice.size() - cur_offset_in_slice_; + DCHECK_GE(rem_in_slice, 0); + + if (written >= rem_in_slice) { + // Used up this entire slice, advance to the next slice. + cur_slice_idx_++; + cur_offset_in_slice_ = 0; + written -= rem_in_slice; + } else { + // Partially used up this slice, just advance the offset within it. 
+ cur_offset_in_slice_ += written; + break; + } + } + + if (cur_slice_idx_ == n_payload_slices_) { + callbacks_->NotifyTransferFinished(); + DCHECK_EQ(0, cur_offset_in_slice_); + } else { + DCHECK_LT(cur_slice_idx_, n_payload_slices_); + DCHECK_LT(cur_offset_in_slice_, payload_slices_[cur_slice_idx_].size()); + } + + return Status::OK(); +} + +bool OutboundTransfer::TransferStarted() const { + return cur_offset_in_slice_ != 0 || cur_slice_idx_ != 0; +} + +bool OutboundTransfer::TransferFinished() const { + if (cur_slice_idx_ == n_payload_slices_) { + DCHECK_EQ(0, cur_offset_in_slice_); // sanity check + return true; + } + return false; +} + +string OutboundTransfer::HexDump() const { + string ret; + for (int i = 0; i < n_payload_slices_; i++) { + ret.append(payload_slices_[i].ToDebugString()); + } + return ret; +} + +int32_t OutboundTransfer::TotalLength() const { + int32_t ret = 0; + for (int i = 0; i < n_payload_slices_; i++) { + ret += payload_slices_[i].size(); + } + return ret; +} + +} // namespace rpc +} // namespace kudu diff --git a/src/kudu/rpc/transfer.h b/src/kudu/rpc/transfer.h new file mode 100644 index 000000000000..cc66b4e9502c --- /dev/null +++ b/src/kudu/rpc/transfer.h @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_RPC_TRANSFER_H +#define KUDU_RPC_TRANSFER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +DECLARE_int32(rpc_max_message_size); + +namespace google { +namespace protobuf { +class Message; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class Socket; + +namespace rpc { + +class Messenger; +struct TransferCallbacks; + +// This class is used internally by the RPC layer to represent an inbound +// transfer in progress. +// +// Inbound Transfer objects are created by a Connection receiving data. When the +// message is fully received, it is either parsed as a call, or a call response, +// and the InboundTransfer object itself is handed off. +class InboundTransfer { + public: + + InboundTransfer(); + + // read from the socket into our buffer + Status ReceiveBuffer(Socket &socket); + + // Return true if any bytes have yet been sent. + bool TransferStarted() const; + + // Return true if the entire transfer has been sent. + bool TransferFinished() const; + + Slice data() const { + return Slice(buf_); + } + + // Return a string indicating the status of this transfer (number of bytes received, etc) + // suitable for logging. + std::string StatusAsString() const; + + private: + + Status ProcessInboundHeader(); + + faststring buf_; + + int32_t total_length_; + int32_t cur_offset_; + + DISALLOW_COPY_AND_ASSIGN(InboundTransfer); +}; + + +// When the connection wants to send data, it creates an OutboundTransfer object +// to encompass it. This sits on a queue within the Connection, so that each time +// the Connection wakes up with a writable socket, it consumes more bytes off +// the next pending transfer in the queue. +// +// Upon completion of the transfer, a callback is triggered. 
+class OutboundTransfer : public boost::intrusive::list_base_hook<> { + public: + enum { kMaxPayloadSlices = 10 }; + + // Create a new transfer. The 'payload' slices will be concatenated and + // written to the socket. When the transfer completes or errors, the + // appropriate method of 'callbacks' is invoked. + // + // Does not take ownership of the callbacks object or the underlying + // memory of the slices. The slices must remain valid until the callback + // is triggered. + // + // NOTE: 'payload' is currently restricted to a maximum of kMaxPayloadSlices + // slices. + OutboundTransfer(const std::vector &payload, + TransferCallbacks *callbacks); + + // Destruct the transfer. A transfer object should never be deallocated + // before it has either (a) finished transferring, or (b) been Abort()ed. + ~OutboundTransfer(); + + // Abort the current transfer, with the given status. + // This triggers TransferCallbacks::NotifyTransferAborted. + void Abort(const Status &status); + + // send from our buffers into the sock + Status SendBuffer(Socket &socket); + + // Return true if any bytes have yet been sent. + bool TransferStarted() const; + + // Return true if the entire transfer has been sent. + bool TransferFinished() const; + + // Return the total number of bytes to be sent (including those already sent) + int32_t TotalLength() const; + + std::string HexDump() const; + + private: + // Slices to send. Uses an array here instead of a vector to avoid an expensive + // vector construction (improved performance a couple percent). + Slice payload_slices_[kMaxPayloadSlices]; + size_t n_payload_slices_; + + // The current slice that is being sent. + int32_t cur_slice_idx_; + // The number of bytes in the above slice which has already been sent. + int32_t cur_offset_in_slice_; + + TransferCallbacks *callbacks_; + + bool aborted_; + + DISALLOW_COPY_AND_ASSIGN(OutboundTransfer); +}; + +// Callbacks made after a transfer completes. 
+struct TransferCallbacks { + public: + virtual ~TransferCallbacks(); + + // The transfer finished successfully. + virtual void NotifyTransferFinished() = 0; + + // The transfer was aborted (e.g because the connection died or an error occurred). + virtual void NotifyTransferAborted(const Status &status) = 0; +}; + +} // namespace rpc +} // namespace kudu +#endif diff --git a/src/kudu/scripts/benchmarks.sh b/src/kudu/scripts/benchmarks.sh new file mode 100755 index 000000000000..98dcc2172467 --- /dev/null +++ b/src/kudu/scripts/benchmarks.sh @@ -0,0 +1,606 @@ +#!/bin/bash -xe +######################################################################## +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Run and compare benchmarks. +# +# Allows for running comparisons either locally or as part of a +# Jenkins job which integrates with a historical stats DB. +# Run this script with -help for usage information. 
+# +# Jenkins job: http://sandbox.jenkins.cloudera.com/job/kudu-benchmarks +######################################################################## + +################################################################ +# Constants +################################################################ + +MODE_JENKINS="jenkins" +MODE_LOCAL="local" + +LOCAL_STATS_BASE="local-stats" + +NUM_MT_TABLET_TESTS=5 +MT_TABLET_TEST=mt-tablet-test +RPC_BENCH_TEST=RpcBenchBenchmark +CBTREE_TEST=cbtree-test +BLOOM_TEST=BloomfileBenchmark +MT_BLOOM_TEST=MultithreadedBloomfileBenchmark +WIRE_PROTOCOL_TEST=WireProtocolBenchmark +COMPACT_MERGE_BENCH=CompactBenchMerge +WITH_OVERLAP=Overlap +NO_OVERLAP=NoOverlap + +MEMROWSET_BENCH=MemRowSetBenchmark +TS_INSERT_LATENCY=TabletServerInsertLatency +TS_8THREAD_BENCH=TabletServer8Threads +INSERT=Insert +SCAN_NONE_COMMITTED=ScanNoneCommitted +SCAN_ALL_COMMITTED=ScanAllCommitted + +FS_SCANINSERT_MRS=FullStackScanInsertMRSOnly +FS_SCANINSERT_DISK=FullStackScanInsertWithDisk + +LOG_DIR_NAME=build/latest/bench-logs +OUT_DIR_NAME=build/latest/bench-out +HTML_FILE="benchmarks.html" + +# Most tests will run this many times. +NUM_SAMPLES=${NUM_SAMPLES:-10} + +################################################################ +# Global variables +################################################################ + +BENCHMARK_MODE=$MODE_JENKINS # we default to "jenkins mode" +BASE_DIR="" +LOGDIR="" +OUTDIR="" + +################################################################ +# Functions +################################################################ + +usage_and_die() { + set +x + echo "Usage: $0 [-local [git-hash-1 [git-hash-2 ...]]]" + echo " When -local is specified, perf of 1 or more git hashes are plotted." + echo " Otherwise, the script is run in 'Jenkins' mode and expects the" + echo " usual Jenkins environment variables to be defined, such as" + echo " BUILD_NUMBER and JOB_NAME." 
+ exit 1 +} + +ensure_cpu_scaling() { + $(dirname $BASH_SOURCE)/ensure_cpu_scaling.sh "$@" +} + +record_result() { + local BUILD_IDENTIFIER=$1 + local TEST_NAME=$2 + local ITER=$3 + local VALUE=$4 + if [ $BENCHMARK_MODE = $MODE_JENKINS ]; then + python write-jobs-stats-to-mysql.py $JOB_NAME $BUILD_IDENTIFIER $TEST_NAME $ITER $VALUE + else + local STATS_FILE="$OUTDIR/$LOCAL_STATS_BASE-$TEST_NAME.tsv" + # Note: literal tabs in below string. + echo "${TEST_NAME} ${VALUE} ${BUILD_IDENTIFIER}" >> "$STATS_FILE" + fi +} + +load_stats() { + local TEST_NAME="$1" + if [ "$BENCHMARK_MODE" = "$MODE_JENKINS" ]; then + # Get last 4 weeks of stats + python get-job-stats-from-mysql.py $TEST_NAME 28 + else + # Convert MySQL wildcards to shell wildcards. + local TEST_NAME=$(echo $TEST_NAME | perl -pe 's/%/*/g') + local STATS_FILES=$(ls $OUTDIR/$LOCAL_STATS_BASE-$TEST_NAME.tsv) + # Note: literal tabs in below string. + echo "workload runtime build_number" + for f in $STATS_FILES; do + cat $f + done + fi +} + +write_img_plot() { + local INPUT_FILE=$1 + local TEST_NAME=$2 + # Rscript fails when there's only a header, so just skip + if [ `wc -l $INPUT_FILE | cut -d ' ' -f1` -gt 1 ]; then + Rscript jobs_runtime.R $INPUT_FILE $TEST_NAME + fi +} + +write_mttablet_img_plots() { + local INPUT_FILE=$1 + local TEST_NAME=$2 + xvfb-run Rscript mt-tablet-test-graph.R $INPUT_FILE $TEST_NAME +} + +build_kudu() { + # PATH=:$PATH + export TOOLCHAIN=/mnt/toolchain/toolchain.sh + if [ -f "$TOOLCHAIN" ]; then + source $TOOLCHAIN + fi + + # Build thirdparty + $BASE_DIR/build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh + + # PATH=::$PATH + THIRDPARTY_BIN=$BASE_DIR/thirdparty/installed/bin + export PPROF_PATH=$THIRDPARTY_BIN/pprof + + BUILD_TYPE=release + + # Build Kudu + mkdir -p build/$BUILD_TYPE + pushd build/$BUILD_TYPE + rm -rf CMakeCache.txt CMakeFiles/ + + $BASE_DIR/build-support/enable_devtoolset.sh $THIRDPARTY_BIN/cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ../.. 
+ + # clean up before we run + rm -Rf /tmp/kudutpch1-$UID + mkdir -p /tmp/kudutpch1-$UID + + NUM_PROCS=$(cat /proc/cpuinfo | grep processor | wc -l) + make -j${NUM_PROCS} 2>&1 | tee build.log + popd + +} + +run_benchmarks() { + # Create output directories if needed. + mkdir -p "$LOGDIR" + mkdir -p "$OUTDIR" + + # run all of the variations of mt-tablet-test + ./build/latest/bin/mt-tablet-test \ + --gtest_filter=\*DoTestAllAtOnce\* \ + --num_counter_threads=0 \ + --tablet_test_flush_threshold_mb=32 \ + --num_slowreader_threads=0 \ + --flusher_backoff=1.0 \ + --flusher_initial_frequency_ms=1000 \ + --inserts_per_thread=1000000 \ + &> $LOGDIR/${MT_TABLET_TEST}.log + + # run rpc-bench test 5 times. 10 seconds per run + for i in $(seq 1 $NUM_SAMPLES); do + KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/rpc-bench &> $LOGDIR/$RPC_BENCH_TEST$i.log + done + + # run cbtree-test 5 times. 20 seconds per run + for i in $(seq 1 $NUM_SAMPLES); do + KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/cbtree-test \ + --gtest_filter=TestCBTree.TestScanPerformance &> $LOGDIR/${CBTREE_TEST}$i.log + done + + # run bloomfile-test 5 times. ~3.3 seconds per run + for i in $(seq 1 $NUM_SAMPLES); do + ./build/latest/bin/bloomfile-test --benchmark_queries=10000000 --bloom_size_bytes=32768 \ + --n_keys=100000 --gtest_filter=*Benchmark &> $LOGDIR/$BLOOM_TEST$i.log + done + + # run mt-bloomfile-test 5 times. 20-30 seconds per run. + # The block cache is set to 1MB to generate churn. + for i in $(seq 1 $NUM_SAMPLES); do + ./build/latest/bin/mt-bloomfile-test --benchmark_queries=2000000 --bloom_size_bytes=32768 \ + --n_keys=5000000 --block_cache_capacity_mb=1 &> $LOGDIR/$MT_BLOOM_TEST$i.log + done + + # run wire_protocol-test 5 times. 
6 seconds per run + for i in $(seq 1 $NUM_SAMPLES); do + KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/wire_protocol-test --gtest_filter=*Benchmark \ + &> $LOGDIR/$WIRE_PROTOCOL_TEST$i.log + done + + # run compaction-test 5 times, 6 seconds each + for i in $(seq 1 $NUM_SAMPLES); do + KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/compaction-test \ + --gtest_filter=TestCompaction.BenchmarkMerge* &> $LOGDIR/${COMPACT_MERGE_BENCH}$i.log + done + + # run memrowset benchmark 5 times, ~10 seconds per run + for i in $(seq 1 $NUM_SAMPLES) ; do + ./build/latest/bin/memrowset-test --roundtrip_num_rows=10000000 \ + --gtest_filter=\*InsertCount\* &> $LOGDIR/${MEMROWSET_BENCH}$i.log + done + + # Run single-threaded TS insert latency benchmark, 5-6 seconds per run + for i in $(seq 1 $NUM_SAMPLES) ; do + KUDU_ALLOW_SLOW_TESTS=true ./build/latest/bin/tablet_server-test \ + --gtest_filter=*MicroBench* \ + --single_threaded_insert_latency_bench_warmup_rows=1000 \ + --single_threaded_insert_latency_bench_insert_rows=10000 &> $LOGDIR/${TS_INSERT_LATENCY}$i.log + done + + # Run multi-threaded TS insert benchmark + for i in $(seq 1 $NUM_SAMPLES) ; do + KUDU_ALLOW_SLOW_TESTS=1 build/latest/bin/tablet_server-stress-test \ + --num_inserts_per_thread=30000 &> $LOGDIR/${TS_8THREAD_BENCH}$i.log + done + + # Run full stack scan/insert test using MRS only, ~26s each + for i in $(seq 1 $NUM_SAMPLES) ; do + ./build/latest/bin/full_stack-insert-scan-test \ + --gtest_filter=FullStackInsertScanTest.MRSOnlyStressTest \ + --concurrent_inserts=50 \ + --inserts_per_client=200000 \ + --rows_per_batch=10000 \ + &> $LOGDIR/${FS_SCANINSERT_MRS}$i.log + done + + # Run full stack scan/insert test with disk, ~50s each + for i in $(seq 1 $NUM_SAMPLES) ; do + ./build/latest/bin/full_stack-insert-scan-test \ + --gtest_filter=FullStackInsertScanTest.WithDiskStressTest \ + --concurrent_inserts=50 \ + --inserts_per_client=200000 \ + --rows_per_batch=10000 \ + &> $LOGDIR/${FS_SCANINSERT_DISK}$i.log + done +} + 
+parse_and_record_all_results() { + local BUILD_IDENTIFIER="$1" + + if [ -z "$BUILD_IDENTIFIER" ]; then + echo "ERROR: BUILD_IDENTIFIER not defined" + exit 1 + fi + + pushd src + pushd kudu + pushd scripts + + # parse the number of ms out of "[ OK ] MultiThreadedTabletTest/5.DoTestAllAtOnce (14966 ms)" + local MT_TABLET_TEST_TIMINGS="${MT_TABLET_TEST}-timings" + grep OK $LOGDIR/${MT_TABLET_TEST}.log | cut -d "(" -f2 | cut -d ")" -f1 | cut -d " " -f1 \ + > $LOGDIR/${MT_TABLET_TEST_TIMINGS}.txt + + # The tests go from 0 to NUM_MT_TABLET_TEST, but files start at line one so we add +1 to the line number. + # Then using the timing we found, we multiply it by 1000 to gets seconds in float, then send it to MySQL + for i in $(seq 0 $NUM_MT_TABLET_TESTS); do + linenumber=$[ $i + 1 ] + timing=`sed -n "${linenumber}p" $LOGDIR/${MT_TABLET_TEST_TIMINGS}.txt` + record_result $BUILD_IDENTIFIER MultiThreadedTabletTest_$i 1 `echo $timing / 1000 | bc -l` + done + + # parse out the real time from: "Time spent Insert 10000000 keys: real 16.438s user 16.164s sys 0.229s" + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "Time spent Insert" $LOGDIR/${CBTREE_TEST}$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ConcurrentBTreeScanInsert $i $real + done + + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "not frozen" $LOGDIR/${CBTREE_TEST}$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ConcurrentBTreeScanNotFrozen $i $real + done + + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "(frozen" $LOGDIR/${CBTREE_TEST}$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ConcurrentBTreeScanFrozen $i $real + done + + # parse out the real time from "Time spent with overlap: real 0.557s user 0.546s sys 0.010s" + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "with overlap" $LOGDIR/${COMPACT_MERGE_BENCH}$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${COMPACT_MERGE_BENCH}${WITH_OVERLAP} $i $real + done + + for i in $(seq 1 $NUM_SAMPLES); do 
+ real=`grep "without overlap" $LOGDIR/${COMPACT_MERGE_BENCH}$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${COMPACT_MERGE_BENCH}${NO_OVERLAP} $i $real + done + + # parse out time from MRS benchmarks + for i in $(seq 1 $NUM_SAMPLES); do + log=$LOGDIR/${MEMROWSET_BENCH}$i.log + real=`grep "Time spent Inserting" $log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${MEMROWSET_BENCH}${INSERT} $i $real + real=`grep "Time spent Scanning rows where none" $log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${MEMROWSET_BENCH}${SCAN_NONE_COMMITTED} $i $real + real=`grep "Time spent Scanning rows where all" $log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${MEMROWSET_BENCH}${SCAN_ALL_COMMITTED} $i $real + done + + # Parse out the real time from: "Time spent Running 10000000 queries: real 3.281s user 3.273s sys 0.000s" + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "Time spent Running" $LOGDIR/$BLOOM_TEST$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER $BLOOM_TEST $i $real + done + + # Parse out the real time from: "Time spent Running 2000000 queries: real 28.193s user 26.903s sys 1.032s" + # Many threads output their value, we keep the last; + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "Time spent Running" $LOGDIR/$MT_BLOOM_TEST$i.log | tail -n1 | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER $MT_BLOOM_TEST $i $real + done + + # Parse out the real time from: "Time spent Converting to PB: real 5.962s user 5.918s sys 0.025s" + for i in $(seq 1 $NUM_SAMPLES); do + real=`grep "Time spent Converting" $LOGDIR/$WIRE_PROTOCOL_TEST$i.log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER $WIRE_PROTOCOL_TEST $i $real + done + + # parse the rate out of: "I1009 15:00:30.023576 27043 rpc-bench.cc:108] Reqs/sec: 84404.4" + for i in $(seq 1 $NUM_SAMPLES); do + rate=`grep Reqs $LOGDIR/$RPC_BENCH_TEST$i.log | cut -d ":" -f 5 | tr -d ' '` + record_result $BUILD_IDENTIFIER $RPC_BENCH_TEST $i $rate + done + 
+ # parse latency numbers from single-threaded tserver benchmark + for i in $(seq 1 $NUM_SAMPLES); do + for metric in min mean percentile_95 percentile_99 percentile_99_9 ; do + val=$(grep "\"$metric\": " $LOGDIR/${TS_INSERT_LATENCY}$i.log | awk '{print $2}' | sed -e 's/,//') + record_result $BUILD_IDENTIFIER ${TS_INSERT_LATENCY}_$metric $i $val + done + done + + # parse latency and throughput numbers from multi-threaded tserver benchmark + for i in $(seq 1 $NUM_SAMPLES); do + local log=$LOGDIR/${TS_8THREAD_BENCH}$i.log + for metric in min mean percentile_95 percentile_99 percentile_99_9 ; do + val=$(grep "\"$metric\": " $log | awk '{print $2}' | sed -e 's/,//') + record_result $BUILD_IDENTIFIER ${TS_8THREAD_BENCH}_${metric}_latency $i $val + done + rate=$(grep -o 'Throughput.*' $log | awk '{print $2}') + record_result $BUILD_IDENTIFIER ${TS_8THREAD_BENCH}_throughput_wall $i $rate + rate=$(grep -o 'CPU efficiency.*' $log | awk '{print $3}') + record_result $BUILD_IDENTIFIER ${TS_8THREAD_BENCH}_throughput_cpu $i $rate + done + + # parse scan timings for scans and inserts with MRS only + for i in $(seq 1 $NUM_SAMPLES); do + local log=$LOGDIR/${FS_SCANINSERT_MRS}$i.log + insert=`grep "Time spent concurrent inserts" $log | ./parse_real_out.sh` + scan_full=`grep "Time spent full schema scan" $log | ./parse_real_out.sh` + scan_str=`grep "Time spent String projection" $log | ./parse_real_out.sh` + scan_int32=`grep "Time spent Int32 projection" $log | ./parse_real_out.sh` + scan_int64=`grep "Time spent Int64 projection" $log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_insert $i $insert + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_full $i $scan_full + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_str $i $scan_str + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_int32 $i $scan_int32 + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_MRS}_scan_int64 $i $scan_int64 + done + + # parse scan timings for scans 
and inserts with disk + for i in $(seq 1 $NUM_SAMPLES); do + local log=$LOGDIR/${FS_SCANINSERT_DISK}$i.log + insert=`grep "Time spent concurrent inserts" $log | ./parse_real_out.sh` + scan_full=`grep "Time spent full schema scan" $log | ./parse_real_out.sh` + scan_str=`grep "Time spent String projection" $log | ./parse_real_out.sh` + scan_int32=`grep "Time spent Int32 projection" $log | ./parse_real_out.sh` + scan_int64=`grep "Time spent Int64 projection" $log | ./parse_real_out.sh` + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_insert $i $insert + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_full $i $scan_full + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_str $i $scan_str + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_int32 $i $scan_int32 + record_result $BUILD_IDENTIFIER ${FS_SCANINSERT_DISK}_scan_int64 $i $scan_int64 + done + + popd + popd + popd +} + +generate_ycsb_plots() { + local WORKLOAD=$1 + local PHASE=$2 + METRIC_NAME=ycsb-$PHASE-$WORKLOAD + + # first plot the overall stats for that phase + OVERALL_FILENAME=$METRIC_NAME-OVERALL + load_stats $OVERALL_FILENAME-runtime_ms > $OUTDIR/$OVERALL_FILENAME-runtime_ms.tsv + write_img_plot $OUTDIR/$OVERALL_FILENAME-runtime_ms.tsv $OVERALL_FILENAME-runtime_ms + load_stats $OVERALL_FILENAME-throughput_ops_sec > $OUTDIR/$OVERALL_FILENAME-throughput_ops_sec.tsv + write_img_plot $OUTDIR/$OVERALL_FILENAME-throughput_ops_sec.tsv $OVERALL_FILENAME-throughput_ops_sec + + # now plot the individual operations + OPS="INSERT UPDATE READ" + + for op in $OPS; do + OP_FILENAME=$METRIC_NAME-$op + load_stats $OP_FILENAME-average_latency_us > $OUTDIR/$OP_FILENAME-average_latency_us.tsv + write_img_plot $OUTDIR/$OP_FILENAME-average_latency_us.tsv $OP_FILENAME-average_latency_us + + load_stats $OP_FILENAME-95th_latency_ms > $OUTDIR/$OP_FILENAME-95th_latency_ms.tsv + write_img_plot $OUTDIR/$OP_FILENAME-95th_latency_ms.tsv $OP_FILENAME-95th_latency_ms + + load_stats 
$OP_FILENAME-99th_latency_ms > $OUTDIR/$OP_FILENAME-99th_latency_ms.tsv + write_img_plot $OUTDIR/$OP_FILENAME-99th_latency_ms.tsv $OP_FILENAME-99th_latency_ms + done +} + +load_and_generate_plot() { + local TEST_NAME=$1 + local PLOT_NAME=$2 + load_stats "$TEST_NAME" > $OUTDIR/$PLOT_NAME.tsv + write_img_plot $OUTDIR/$PLOT_NAME.tsv $PLOT_NAME +} + +load_stats_and_generate_plots() { + pushd src + pushd kudu + pushd scripts + + load_and_generate_plot "%MultiThreadedTabletTest%" mt-tablet-test-runtime + + load_and_generate_plot ConcurrentBTreeScanInsert cb-tree-insert + load_and_generate_plot ConcurrentBTreeScanNotFrozen cb-ctree-not-frozen + load_and_generate_plot ConcurrentBTreeScanFrozen cb-ctree-frozen + + load_and_generate_plot "${COMPACT_MERGE_BENCH}%" compact-merge-bench + + load_and_generate_plot "${MEMROWSET_BENCH}${INSERT}" memrowset-bench-insert + load_and_generate_plot "${MEMROWSET_BENCH}Scan%" memrowset-bench-scan + + load_and_generate_plot $BLOOM_TEST bloom-test + load_and_generate_plot $MT_BLOOM_TEST mt-bloom-test + + load_and_generate_plot $WIRE_PROTOCOL_TEST wire-protocol-test + + load_and_generate_plot $RPC_BENCH_TEST rpc-bench-test + + load_and_generate_plot "${TS_INSERT_LATENCY}%" ts-insert-latency + + load_and_generate_plot "${TS_8THREAD_BENCH}%_latency" ts-8thread-insert-latency + load_and_generate_plot "${TS_8THREAD_BENCH}%_throughput_%" ts-8thread-insert-throughput + + load_and_generate_plot "${FS_SCANINSERT_MRS}%_insert" fs-mrsonly-insert + load_and_generate_plot "${FS_SCANINSERT_MRS}%_scan%" fs-mrsonly-scan + load_and_generate_plot "${FS_SCANINSERT_DISK}%_insert" fs-withdisk-insert + load_and_generate_plot "${FS_SCANINSERT_DISK}%_scan%" fs-withdisk-scan + + # Generate all the pngs for all the mt-tablet tests + for i in $(seq 0 $NUM_MT_TABLET_TESTS); do + cat $LOGDIR/${MT_TABLET_TEST}.log | ./graph-metrics.py MultiThreadedTabletTest/$i > $OUTDIR/test$i.tsv + # Don't bail on failure (why not?) 
+ write_mttablet_img_plots $OUTDIR/test$i.tsv test$i || true + done + + if [ "${BENCHMARK_MODE}" = "${MODE_JENKINS}" ]; then + ################################################################ + # Plot the separately-recorded TPCH and YCSB graphs as well + # (only for Jenkins) + ################################################################ + + # TPC-H 1 runs separately, let's just get those graphs + load_and_generate_plot query_1_1gb tpch1-query + load_and_generate_plot insert_1gb tpch1-insert + + # YCSB which runs the 5nodes_workload on a cluster + # First we process the loading phase + generate_ycsb_plots 5nodes_workload load + + # Then the running phase + generate_ycsb_plots 5nodes_workload run + fi + + # Move all the pngs to OUT_DIR. + mv *.png $OUTDIR/ + + # Generate an HTML file aggregating the PNGs. + # Mostly for local usage, but somewhat useful to check the Jenkins runs too. + pushd $OUTDIR/ + PNGS=$(ls *.png) + echo -n > "$OUTDIR/$HTML_FILE" + echo "Kudu Benchmarks" >> "$OUTDIR/$HTML_FILE" + echo "

Kudu Benchmarks

" >> "$OUTDIR/$HTML_FILE" + for png in $PNGS; do + echo "
" >> "$OUTDIR/$HTML_FILE" + done + popd + + popd + popd + popd +} + +build_run_record() { + local BUILD_IDENTIFIER=$1 + build_kudu + run_benchmarks + parse_and_record_all_results "$BUILD_IDENTIFIER" +} + +git_checkout() { + local GIT_HASH=$1 + git checkout $GIT_HASH +} + +run() { + + # Parse command-line options. + if [ -n "$1" ]; then + [ "$1" = "-local" ] || usage_and_die + shift + + BENCHMARK_MODE=$MODE_LOCAL + + # If no hashes are provided, run against the current HEAD. + if [ -z "$1" ]; then + build_run_record "working_tree" + else + # Convert the passed-in git refs into their hashes. + # This allows you to use "HEAD~3 HEAD" as arguments + # and end up with those being evaluated with regard to + # the _current_ branch, instead of evaluating the second + # "HEAD" after checking out the first. + local ref + local hashes + for ref in "$@" ; do + hashes="$hashes $(git rev-parse "$ref")" + done + set $hashes + while [ -n "$1" ]; do + local GIT_HASH="$1" + shift + git_checkout "$GIT_HASH" + build_run_record "$GIT_HASH" + done + fi + + else + [ -n "$BUILD_NUMBER" ] || usage_and_die + build_run_record "$BUILD_NUMBER" + fi + + # The last step is the same for both modes. + load_stats_and_generate_plots +} + +################################################################ +# main +################################################################ + +# Figure out where we are, store in global variables. +BASE_DIR=$(pwd) +LOGDIR="$BASE_DIR/$LOG_DIR_NAME" +OUTDIR="$BASE_DIR/$OUT_DIR_NAME" + +# Ensure we are in KUDU_HOME +if [ ! -f "$BASE_DIR/LICENSE.txt" ]; then + set +x + echo "Error: must run from top of Kudu source tree" + usage_and_die +fi + +# Set up environment. +ulimit -m $[3000*1000] +ulimit -c unlimited # gather core dumps + +# Set CPU governor, and restore it on exit. +old_governor=$(ensure_cpu_scaling performance) +restore_governor() { + ensure_cpu_scaling $old_governor >/dev/null +} +trap restore_governor EXIT + +# Kick off the benchmark script. 
# Kick off the benchmark script.
run "$@"

exit 0
+ +library(ggplot2) +library(reshape) + +source("multiplot.R") +source("si_vec.R") + +d.kudu <- read.table(file="/tmp/kudu.tsv", header=T) +d.kudu$system <- as.factor("kudu") +d.kudu <- subset(d.kudu, select = -c(num_layers)) + + +d.hbase <- read.table(file="/tmp/hbase.tsv", header=T) +d.hbase$system <- as.factor("hbase") +d.hbase <- subset(d.hbase, select = -c(num_storefiles)) + +d <- rbind(d.kudu, d.hbase) + + +d$insert_rate = c(0, diff(d$inserted)/diff(d$time)) +d$scan_rate = c(0, diff(d$scanned)/diff(d$time)) +d <- subset(d, select = -c(scanned)) + +d.melted <- melt(d, id=c("time", "system")) + +vlines <- c( + geom_vline(xintercept=d.kudu[d.kudu$inserted >= 200*1000*1000,][1,]$time, colour="blue"), + geom_vline(xintercept=d.hbase[d.hbase$inserted >= 200*1000*1000,][1,]$time, colour="red")) + +smooth.span <- 1.0/max(d$time) + +p.scan_rate <- ggplot(subset(d.melted, variable=="scan_rate")) + + aes(x=time, y=value, colour=system) + + geom_line() + + scale_y_log10(labels=si_vec) + + labs(title="Scan rate during insert workload\n(log scale)", + x=NULL, y="Rows/sec") + + vlines + +p.insert_rate <- ggplot(subset(d.melted, variable=="insert_rate")) + + aes(x=time, y=value, colour=system) + + stat_smooth(span=smooth.span) + + geom_line(alpha=0.4) + + scale_y_continuous(labels=si_vec) + + labs(title="Insert rate during insert workload", + x="Time (s)", y="Rows/sec") + + vlines + +scan_rate_histo <- ggplot(d, aes(scan_rate, fill=system)) + + geom_density(alpha=0.5) + + scale_x_log10(labels=si_vec) + + labs(x="Scan rate (rows/sec)") + +insert_rate_histo <- ggplot(d, aes(insert_rate, fill=system)) + + geom_density(alpha=0.5) + + scale_x_continuous(labels=si_vec) + + labs(x="Insert rate (rows/sec)") + +tryCatch({dev.off()}, error=function(e){}) +multiplot(p.scan_rate, p.insert_rate); +dev.new() +multiplot(scan_rate_histo, insert_rate_histo) + + diff --git a/src/kudu/scripts/ensure_cpu_scaling.sh b/src/kudu/scripts/ensure_cpu_scaling.sh new file mode 100755 index 
#!/bin/bash -e
# Ensure that the CPU governor is set to a particular governor, outputting
# the prior governor on stdout.
#
# Without this, some of our tests end up having higher variance due to
# changing CPU speed during the test.
#
# Assumes that all CPUs are set to the same governor.
target_governor=$1
old_governor=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
for cpu_dir in /sys/devices/system/cpu/cpu[0-9]*/ ; do
  governor_file=$cpu_dir/cpufreq/scaling_governor
  governor=$(cat $governor_file)
  if [ "$governor" != "$target_governor" ]; then
    >&2 echo "CPU $cpu_dir not in '$target_governor' mode. Attempting to change"
    # The failure must be tested inside the 'if' condition: with 'bash -e',
    # a bare failing pipeline would abort the script immediately, so the
    # old '[ $? -ne 0 ]' check (and its diagnostic) could never run.
    if ! echo $target_governor | sudo tee $governor_file > /dev/null; then
      >&2 echo Could not set $target_governor governor!
      >&2 echo Perhaps you need passwordless sudo for this user
      exit 1
    fi
  fi
done
echo $old_governor
import MySQLdb as mdb
import sys
import os

# Query the kudu_perf_tpch table for recent benchmark runs and emit them
# as a TSV (workload / runtime / build_number) on stdout. Connection
# parameters come from the MYSQLHOST/MYSQLUSER/MYSQLPWD/MYSQLDB env vars.
if len(sys.argv) < 3:
    # The original usage string had lost its placeholders; restore them.
    sys.exit("usage: %s <job_name_pattern> <days_back>" % sys.argv[0])

host = os.environ["MYSQLHOST"]
user = os.environ["MYSQLUSER"]
pwd = os.environ["MYSQLPWD"]
db = os.environ["MYSQLDB"]

con = mdb.connect(host, user, pwd, db)
# 'with con' commits on success and rolls back on error.
with con:
    cur = con.cursor()
    job_name = sys.argv[1]
    days = sys.argv[2]
    # Parameterized query: job_name is used as a LIKE pattern, days bounds
    # the date range. Parameters are bound by the driver, not interpolated.
    cur.execute("select workload, runtime, build_number from kudu_perf_tpch "
                "where workload like %s AND curr_date >= DATE_SUB(NOW(), INTERVAL %s DAY) "
                "and runtime != 0 ORDER BY workload, build_number, curr_date",
                (job_name, days))
    rows = cur.fetchall()
    # Emit clean tab-separated output (the old python2 'print a, '\t', b'
    # form inserted stray spaces around each tab); write() also keeps the
    # script runnable under both python 2 and 3.
    sys.stdout.write("workload\truntime\tbuild_number\n")
    for row in rows:
        sys.stdout.write("%s\t%s\t%s\n" % (row[0], row[1], row[2]))
# Script which parses a test log for 'metrics: ' lines emitted by
# TimeSeriesCollector, and constructs a TSV of data points from them.
#
# usage: graph-metrics.py <scope>   (reads the log on stdin)

import os
import re
import sys
# Prefer simplejson when available, but fall back to the stdlib json module
# (same pattern as parse_metrics_log.py) so the script has no hard
# third-party dependency.
try:
    import simplejson as json
except ImportError:
    import json

# Matches the JSON payload TimeSeriesCollector appends after 'metrics: '.
METRICS_LINE = re.compile(r'metrics: (.+)$')
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))


def parse_data_from(stream, scope):
    """Parse 'metrics:' lines from 'stream' that belong to 'scope'.

    Returns a list of dicts, one per matching metrics line, with the
    'scope' key removed. If a line carries an instantaneous 'scan_rate',
    it is integrated over time into a cumulative 'scanned' column.
    """
    data = []
    scanned = 0
    prev_time = 0
    for line in stream:
        if 'metrics: {' not in line:
            continue
        match = METRICS_LINE.search(line)
        if not match:
            continue
        payload = match.group(1)
        try:
            data_points = json.loads(payload)
        except ValueError:
            # Narrow except: only a JSON parse failure should be reported
            # here; anything else should propagate untouched.
            sys.stderr.write("bad json: %s\n" % payload)
            raise
        if data_points['scope'] != scope:
            continue
        del data_points['scope']
        if 'scan_rate' in data_points:
            # Integrate rate * elapsed-time into a running total.
            scanned += data_points['scan_rate'] * (data_points['time'] - prev_time)
            data_points['scanned'] = scanned
            del data_points['scan_rate']
        prev_time = data_points['time']

        data.append(data_points)
    return data


def get_keys(raw_data):
    """Return the union of column names across all parsed rows."""
    keys = set()
    for row in raw_data:
        keys.update(row.keys())
    return keys


def main():
    if len(sys.argv) != 2:
        sys.exit("usage: %s <scope>" % sys.argv[0])
    scope = sys.argv[1]
    data = parse_data_from(sys.stdin, scope)
    # Sort the columns so the TSV layout is deterministic across runs
    # (raw set iteration order is arbitrary).
    keys = sorted(get_keys(data))

    # Write directly to stdout; the old 'with sys.stdout as f:' closed
    # stdout on exit, and the py2 print statements broke under python 3.
    out = sys.stdout
    out.write("\t".join(keys) + "\n")
    for row in data:
        out.write("\t".join(str(row.get(k, 0)) for k in keys) + "\n")


if __name__ == "__main__":
    main()
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# How to invoke: +# jobs_runtime.R +# This script takes in input a TSV file with the following header: +# workload runtime build_number +# It generates a png where x is the build number, y is the runtime +# and each workload is a different line. The test name is used to generate +# the output file's name. +# R needs to be installed with the graphic libraries + +library(Cairo) +library(ggplot2) + +newpng <- function(filename = "img.png", width = 1500, height = 500) { + CairoPNG(filename, width, height) +} + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 2) { + stop("usage: jobs_runtime.R ") +} +filename = args[1] +testname = args[2] + +newpng(paste(testname, "-jobs-runtime.png", sep = "")) + +d <- read.table(file=filename, header=T) + +print(ggplot(d, aes(x = build_number, y = runtime, color = workload)) + + stat_summary(aes(group = workload), fun.y=median, geom = "line") + + geom_boxplot(aes(group = interaction(workload, build_number)), position = "identity", outlier.size = 1.7, outlier.colour = "gray32") + + ggtitle(testname)) diff --git a/src/kudu/scripts/mt-tablet-test-graph.R b/src/kudu/scripts/mt-tablet-test-graph.R new file mode 100644 index 000000000000..a2ac8eeaed5a --- /dev/null +++ b/src/kudu/scripts/mt-tablet-test-graph.R @@ -0,0 +1,101 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# How to invoke: +# mt-tablet-test-graph.R +# This script takes in input a TSV file that contains the timing results +# from running mt-table-test, and parsed out by graph-metrics.py +# The file needs to have the following header: +# memrowset_kb updated scanned time num_rowsets inserted +# Three png are generated: +# - Insert rate as data is inserted +# - Scan rate as data is inserted +# - Multiple plots, where x is time, and y shows a variety of different +# progressions like the number of rowsets over time. 
+ +library(ggplot2) +library(reshape) +library(Cairo) + +newpng<- function(filename = "img.png", width = 400, height = 400) { + CairoPNG(filename, width, height) +} + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 2) { + stop("usage: jobs_runtime.R ") +} +filename = args[1] +testname = args[2] + +source("si_vec.R") +newpng(paste(testname, "-1.png", sep = "")) + +print(c("Using file ", filename)) + +d <- read.table(file=filename, header=T) + +d$insert_rate = c(0, diff(d$inserted)/diff(d$time)) + +if (exists("scanned", where=d)) { + d$scan_rate = c(0, diff(d$scanned)/diff(d$time)) + d <- subset(d, select = -c(scanned)) +} + +if (!is.null(d$updated)) { + d$update_rate = c(0, diff(d$updated)/diff(d$time)) + d <- subset(d, select = -c(updated)) +} + +# Put memrowset usage in bytes +d$memrowset_bytes <- d$memrowset * 1024 +d <- subset(d, select = -c(memrowset_kb)) + +print(ggplot(d, aes(inserted, insert_rate)) + + geom_point(alpha=0.5) + + scale_x_continuous(labels=si_vec) + + scale_y_log10(labels=si_vec)) + +if (exists("scan_rate", where=d)) { + newpng(paste(testname, "-2.png", sep = "")) + print(ggplot(d, aes(inserted, scan_rate)) + + geom_point(alpha=0.5) + + scale_x_continuous(labels=si_vec) + + scale_y_log10(labels=si_vec)) +} + +newpng(paste(testname, "-3.png", sep = "")) + +d <- rename(d, c( + insert_rate="Insert rate (rows/sec)", + memrowset="Memstore Memory Usage")) + + +if (exists("scan_rate", where=d)) { + d <- rename(d, c( + scan_rate="Scan int col (rows/sec)")) +} + +# set span to 5 seconds worth of data +span = 5.0/max(d$time) + +d.melted = melt(d, id="time") +print(qplot(time, value, data=d.melted, geom="line", group = variable) + + scale_y_continuous(labels=si_vec) + + facet_grid(variable~., scale = "free_y") + + stat_smooth()) + diff --git a/src/kudu/scripts/multiplot.R b/src/kudu/scripts/multiplot.R new file mode 100644 index 000000000000..de70d71950b3 --- /dev/null +++ b/src/kudu/scripts/multiplot.R @@ -0,0 +1,65 @@ +# Licensed to the 
Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# From http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# +multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { + require(grid) + + # Make a list from the ... 
arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +} + diff --git a/src/kudu/scripts/parse_metrics_log.py b/src/kudu/scripts/parse_metrics_log.py new file mode 100644 index 000000000000..30c11576b5c9 --- /dev/null +++ b/src/kudu/scripts/parse_metrics_log.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +This script parses a set of metrics logs output from a tablet server, +and outputs a TSV file including some metrics. + +This isn't meant to be used standalone as written, but rather as a template +which is edited based on whatever metrics you'd like to extract. The set +of metrics described below are just a starting point to work from. +""" + +import gzip +try: + import simplejson as json +except: + import json +import sys + +# These metrics will be extracted "as-is" into the TSV. +# The first element of each tuple is the metric name. +# The second is the name that will be used in the TSV header line. +SIMPLE_METRICS = [ + ("server.generic_current_allocated_bytes", "heap_allocated"), + ("server.log_block_manager_bytes_under_management", "bytes_on_disk"), + ("tablet.memrowset_size", "mrs_size"), + ("server.block_cache_usage", "bc_usage"), +] + +# These metrics will be extracted as per-second rates into the TSV. +RATE_METRICS = [ + ("server.block_manager_total_bytes_read", "bytes_r_per_sec"), + ("server.block_manager_total_bytes_written", "bytes_w_per_sec"), + ("server.block_cache_lookups", "bc_lookups_per_sec"), + ("tablet.rows_inserted", "inserts_per_sec"), +] + +# These metrics will be extracted as percentile metrics into the TSV. +# Each metric will generate several columns in the output TSV, with +# percentile numbers suffixed to the column name provided here (foo_p95, +# foo_p99, etc) +HISTOGRAM_METRICS = [ + ("server.handler_latency_kudu_tserver_TabletServerService_Write", "write"), + ("tablet.log_append_latency", "log") +] + +NaN = float('nan') +UNKNOWN_PERCENTILES = dict(p50=NaN, p95=NaN, p99=NaN, p999=NaN) + +def json_to_map(j): + """ + Parse the JSON structure in the log into a python dictionary + keyed by .. + + The entity ID is currently ignored. If there is more than one + entity of a given type (eg tables), it is undefined which one + will be reflected in the output metrics. + + TODO: add some way to specify a particular tablet to parse out. 
+ """ + ret = {} + for entity in j: + for m in entity['metrics']: + ret[entity['type'] + "." + m['name']] = m + return ret + +def delta(prev, cur, m): + """ Compute the delta in metric 'm' between two metric snapshots. """ + if m not in prev or m not in cur: + return 0 + return cur[m]['value'] - prev[m]['value'] + +def histogram_stats(prev, cur, m): + """ + Compute percentile stats for the metric 'm' in the window between two + metric snapshots. + """ + if m not in prev or m not in cur or 'values' not in cur[m]: + return UNKNOWN_PERCENTILES + prev = prev[m] + cur = cur[m] + + p_dict = dict(zip(prev.get('values', []), + prev.get('counts', []))) + c_zip = zip(cur.get('values', []), + cur.get('counts', [])) + delta_total = cur['total_count'] - prev['total_count'] + if delta_total == 0: + return UNKNOWN_PERCENTILES + res = dict() + cum_count = 0 + for cur_val, cur_count in c_zip: + prev_count = p_dict.get(cur_val, 0) + delta_count = cur_count - prev_count + cum_count += delta_count + percentile = float(cum_count) / delta_total + if 'p50' not in res and percentile > 0.50: + res['p50'] = cur_val + if 'p95' not in res and percentile > 0.95: + res['p95'] = cur_val + if 'p99' not in res and percentile > 0.99: + res['p99'] = cur_val + if 'p999' not in res and percentile > 0.999: + res['p999'] = cur_val + return res + +def cache_hit_ratio(prev, cur): + """ + Calculate the cache hit ratio between the two samples. + If there were no cache hits or misses, this returns NaN. + """ + delta_hits = delta(prev, cur, 'server.block_cache_hits_caching') + delta_misses = delta(prev, cur, 'server.block_cache_misses_caching') + if delta_hits + delta_misses > 0: + cache_ratio = float(delta_hits) / (delta_hits + delta_misses) + else: + cache_ratio = NaN + return cache_ratio + +def process(prev, cur): + """ Process a pair of metric snapshots, outputting a line of TSV. 
""" + delta_ts = cur['ts'] - prev['ts'] + cache_ratio = cache_hit_ratio(prev, cur) + calc_vals = [] + for metric, _ in SIMPLE_METRICS: + if metric in cur: + calc_vals.append(cur[metric]['value']) + else: + calc_vals.append(NaN) + calc_vals.extend(delta(prev, cur, metric)/delta_ts for (metric, _) in RATE_METRICS) + for metric, _ in HISTOGRAM_METRICS: + stats = histogram_stats(prev, cur, metric) + calc_vals.extend([stats['p50'], stats['p95'], stats['p99'], stats['p999']]) + + print (cur['ts'] + prev['ts'])/2, \ + cache_ratio, \ + " ".join(str(x) for x in calc_vals) + +def main(argv): + prev_data = None + + simple_headers = [header for _, header in SIMPLE_METRICS + RATE_METRICS] + for _, header in HISTOGRAM_METRICS: + simple_headers.append(header + "_p50") + simple_headers.append(header + "_p95") + simple_headers.append(header + "_p99") + simple_headers.append(header + "_p999") + + print "time cache_hit_ratio", " ".join(simple_headers) + + for path in sorted(argv[1:]): + if path.endswith(".gz"): + f = gzip.GzipFile(path) + else: + f = file(path) + for line in f: + (_, ts, metrics_json) = line.split(" ", 2) + ts = float(ts) / 1000000.0 + if prev_data and ts < prev_data['ts'] + 30: + continue + data = json_to_map(json.loads(metrics_json)) + data['ts'] = ts + if prev_data: + process(prev_data, data) + prev_data = data + +if __name__ == "__main__": + main(sys.argv) diff --git a/src/kudu/scripts/parse_real_out.sh b/src/kudu/scripts/parse_real_out.sh new file mode 100755 index 000000000000..0accafb410eb --- /dev/null +++ b/src/kudu/scripts/parse_real_out.sh @@ -0,0 +1,26 @@ +#! /usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# This script takes a line in stdin and parses the real time out of it. +# Input example: +# "Times for Insert 10000000 keys: real 16.438s user 16.164s sys 0.229s" +# Output: +# 16.438 + +awk -F 'real ' '{ print $2 }' | awk -F s '{ print $1 }' diff --git a/src/kudu/scripts/si_vec.R b/src/kudu/scripts/si_vec.R new file mode 100644 index 000000000000..f390992bf535 --- /dev/null +++ b/src/kudu/scripts/si_vec.R @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +si_num <- function (x) { + + if (!is.na(x)) { + if (x >= 1e9) { + rem <- format(x/1e9, digits=3) + rem <- append(rem, "B"); + } else if (x >= 1e6) { + rem <- format(x/1e6, digits=3) + rem <- append(rem, "M"); + } else if (x > 1e3) { + rem <- format(x/1e3, digits=3) + rem <- append(rem, "K"); + } + else { + return(x); + } + + return(paste(rem, sep="", collapse="")); + } + else return(NA); +} + +si_vec <- function(x) { + sapply(x, FUN=si_num); +} + diff --git a/src/kudu/scripts/tpch.sh b/src/kudu/scripts/tpch.sh new file mode 100755 index 000000000000..db9fb7fd5c28 --- /dev/null +++ b/src/kudu/scripts/tpch.sh @@ -0,0 +1,146 @@ +#!/bin/bash -xe +######################################################################## +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Run tpch benchmark and write results to the DB. +# +# Expects to find the following Jenkins environment variables set: +# - JOB_NAME +# - BUILD_NUMBER +# If these are not set, the script will still run but will not record +# the results in the MySQL database. Instead, it will output results +# into .tsv files in the kudu source root directory. This is useful for +# running this benchmark locally for testing / dev purposes. 
+# +# Optional environment variables to override (defaults set for Jenkins): +# - LINEITEM_TBL_PATH: Path to lineitem.tbl from the TPC-H suite. +# - KUDU_DATA_DIR: Directory to use for data storage. +# - TPCH_NUM_QUERY_ITERS: Number of TPC-H query iterations to run. +# +# Jenkins job: http://sandbox.jenkins.cloudera.com/job/kudu-tpch1 +######################################################################## + +########################################################## +# Constants +########################################################## +ROOT=$(readlink -f $(dirname $0)/../../..) + +########################################################## +# Overridable params +########################################################## +LINEITEM_TBL_PATH=${LINEITEM_TBL_PATH:-/home/jdcryans/lineitem.tbl} +KUDU_DATA_DIR=${KUDU_DATA_DIR:-/data/2/tmp/kudutpch1-jenkins} +TPCH_NUM_QUERY_ITERS=${TPCH_NUM_QUERY_ITERS:-5} + +########################################################## +# Functions +########################################################## +record_result() { + local RECORD_STATS_SCRIPT=$ROOT/src/kudu/scripts/write-jobs-stats-to-mysql.py + local TEST_NAME=$1 + local ITER=$2 + local VALUE=$3 + if [ -n "$JOB_NAME" ]; then + # Jenkins. + python $RECORD_STATS_SCRIPT $JOB_NAME $BUILD_NUMBER $TEST_NAME $ITER $VALUE + else + # Running locally. + local STATS_FILE="$OUTDIR/tpch-$TEST_NAME.tsv" + echo -e "${TEST_NAME}\t${ITER}\t${VALUE}" >> "$STATS_FILE" + fi +} + +ensure_cpu_scaling() { + $(dirname $BASH_SOURCE)/ensure_cpu_scaling.sh "$@" +} + +########################################################## +# Main +########################################################## +if [ $TPCH_NUM_QUERY_ITERS -lt 2 ]; then + echo "Error: TPCH_NUM_QUERY_ITERS must be 2 or greater" + exit 1 +fi + +cd $ROOT + +# Set up environment +set -o pipefail +ulimit -m $[3000*1000] +ulimit -c unlimited # gather core dumps + +# Set CPU governor, and restore it on exit. 
+old_governor=$(ensure_cpu_scaling performance) +restore_governor() { + ensure_cpu_scaling $old_governor >/dev/null +} +trap restore_governor EXIT + +# PATH=:$PATH +export TOOLCHAIN=/mnt/toolchain/toolchain.sh +if [ -f "$TOOLCHAIN" ]; then + source $TOOLCHAIN +fi + +# Build thirdparty +$ROOT/build-support/enable_devtoolset.sh $ROOT/thirdparty/build-if-necessary.sh + +# PATH=::$PATH +THIRDPARTY_BIN=$(pwd)/thirdparty/installed/bin +export PPROF_PATH=$THIRDPARTY_BIN/pprof + +BUILD_TYPE=release + +# Build Kudu +mkdir -p build/$BUILD_TYPE +pushd build/$BUILD_TYPE +rm -rf CMakeCache CMakeFiles/ + +$ROOT/build-support/enable_devtoolset.sh $THIRDPARTY_BIN/cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ../.. + +NUM_PROCS=$(cat /proc/cpuinfo | grep processor | wc -l) +make -j${NUM_PROCS} tpch1 2>&1 | tee build.log +popd + +# Warming up the OS buffer. +cat $LINEITEM_TBL_PATH > /dev/null +cat $LINEITEM_TBL_PATH > /dev/null + +OUTDIR=$ROOT/build/$BUILD_TYPE/tpch +rm -Rf $KUDU_DATA_DIR # Clean up data dir. +mkdir -p $OUTDIR # Create log file output dir. + +./build/$BUILD_TYPE/bin/tpch1 -logtostderr=1 \ + -tpch_path_to_data=$LINEITEM_TBL_PATH \ + -mini_cluster_base_dir=$KUDU_DATA_DIR \ + -tpch_num_query_iterations=$TPCH_NUM_QUERY_ITERS \ + >$OUTDIR/benchmark.log 2>&1 + +cat $OUTDIR/benchmark.log +INSERT_TIME=$(grep "Time spent loading" $OUTDIR/benchmark.log | \ + perl -pe 's/.*Time spent loading: real ([0-9\.]+)s.*/\1/') +record_result insert_1gb 1 $INSERT_TIME + +# We do not record the first iteration (#0) because we want to record the +# in-cache performance. 
+for iter in $(seq 1 $(expr $TPCH_NUM_QUERY_ITERS - 1)); do + QUERY_TIME=$(grep "iteration # $iter" $OUTDIR/benchmark.log | \ + perl -pe "s/.*iteration # $iter: real ([0-9\.]+)s.*/\1/") + record_result query_1_1gb $iter $QUERY_TIME +done diff --git a/src/kudu/scripts/write-jobs-stats-to-mysql.py b/src/kudu/scripts/write-jobs-stats-to-mysql.py new file mode 100644 index 000000000000..a6b092a7a002 --- /dev/null +++ b/src/kudu/scripts/write-jobs-stats-to-mysql.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import MySQLdb as mdb +import sys +import os + +if len(sys.argv) < 6: + sys.exit("usage: %s " % sys.argv[0]) + +host = os.environ["MYSQLHOST"] +user = os.environ["MYSQLUSER"] +pwd = os.environ["MYSQLPWD"] +db = os.environ["MYSQLDB"] + +con = mdb.connect(host, user, pwd, db) +print "Connected to mysql" +with con: + cur = con.cursor() + job_name = sys.argv[1] + build_number = sys.argv[2] + workload = sys.argv[3] + iteration = sys.argv[4] + runtime = sys.argv[5] + cur.execute("INSERT INTO kudu_perf_tpch VALUES(%s, %s, %s, %s, %s, DEFAULT)", + (job_name, build_number, workload, iteration, runtime)) + rows = cur.fetchall() + diff --git a/src/kudu/server/CMakeLists.txt b/src/kudu/server/CMakeLists.txt new file mode 100644 index 000000000000..5f678920dedd --- /dev/null +++ b/src/kudu/server/CMakeLists.txt @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +######################################### +# server_common +######################################### + +set(SERVER_COMMON_SRCS + hybrid_clock.cc + logical_clock.cc +) + +add_library(server_common ${SERVER_COMMON_SRCS}) +target_link_libraries(server_common + kudu_common + codegen + gutil + kudu_fs + kudu_util + consensus_metadata_proto) + +######################################### +# server_common tests +######################################### + +set(KUDU_TEST_LINK_LIBS server_common ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(hybrid_clock-test) +ADD_KUDU_TEST(logical_clock-test) + +######################################### +# server_base_proto +######################################### + +KRPC_GENERATE( + SERVER_BASE_PROTO_SRCS SERVER_BASE_PROTO_HDRS SERVER_BASE_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES server_base.proto) + +add_library(server_base_proto ${SERVER_BASE_PROTO_SRCS} ${SERVER_BASE_PROTO_HDRS}) +target_link_libraries(server_base_proto + krpc + kudu_common_proto + protobuf + rpc_header_proto + version_info_proto + wire_protocol_proto) + +######################################### +# server_process +######################################### + +set(SERVER_PROCESS_SRCS + default-path-handlers.cc + generic_service.cc + glog_metrics.cc + pprof-path-handlers.cc + rpcz-path-handler.cc + rpc_server.cc + server_base.cc + server_base_options.cc + tcmalloc_metrics.cc + tracing-path-handlers.cc + webserver.cc + webserver_options.cc + webui_util.cc +) + +add_library(server_process ${SERVER_PROCESS_SRCS}) +target_link_libraries(server_process + server_base_proto + server_common + kudu_common + kudu_fs + gutil + krpc + kudu_util + squeasel) + +# This module depends on tcmalloc and profiler directly, so need to make +# sure that they get linked in the right order. 
+if(${KUDU_TCMALLOC_AVAILABLE}) + target_link_libraries(server_process tcmalloc profiler) +endif() + +######################################### +# server_process tests +######################################### + +set(KUDU_TEST_LINK_LIBS server_process ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(webserver-test) diff --git a/src/kudu/server/clock.h b/src/kudu/server/clock.h new file mode 100644 index 000000000000..e55fa5b8929d --- /dev/null +++ b/src/kudu/server/clock.h @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_SERVER_CLOCK_H_ +#define KUDU_SERVER_CLOCK_H_ + +#include + +#include "kudu/common/common.pb.h" +#include "kudu/common/timestamp.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { +class faststring; +class MetricEntity; +class MonoDelta; +class Slice; +class Status; +namespace server { + +// An interface for a clock that can be used to assign timestamps to +// operations. +// Implementations must respect the following assumptions: +// 1 - Now() must return monotonically increasing numbers +// i.e. for any two calls, i.e. Now returns timestamp1 and timestamp2, it must +// hold that timestamp1 < timestamp2. 
+// 2 - Update() must never set the clock backwards (corollary of 1) +class Clock : public RefCountedThreadSafe { + public: + + // Initializes the clock. + virtual Status Init() = 0; + + // Obtains a new transaction timestamp corresponding to the current instant. + virtual Timestamp Now() = 0; + + // Obtains a new transaction timestamp corresponding to the current instant + // plus the max_error. + virtual Timestamp NowLatest() = 0; + + // Obtain a timestamp which is guaranteed to be later than the current time + // on any machine in the cluster. + // + // NOTE: this is not a very tight bound. + virtual Status GetGlobalLatest(Timestamp* t) { + return Status::NotSupported("clock does not support global properties"); + } + + // Indicates whether this clock supports the required external consistency mode. + virtual bool SupportsExternalConsistencyMode(ExternalConsistencyMode mode) = 0; + + // Update the clock with a transaction timestamp originating from + // another server. For instance replicas can call this so that, + // if elected leader, they are guaranteed to generate timestamps + // higher than the timestamp of the last transaction accepted from the + // leader. + virtual Status Update(const Timestamp& to_update) = 0; + + // Waits until the clock on all machines has advanced past 'then'. + // Can also be used to implement 'external consistency' in the same sense as + // Google's Spanner. + virtual Status WaitUntilAfter(const Timestamp& then, + const MonoTime& deadline) = 0; + + // Waits until the clock on this machine advances past 'then'. Unlike + // WaitUntilAfter(), this does not make any global guarantees. + virtual Status WaitUntilAfterLocally(const Timestamp& then, + const MonoTime& deadline) = 0; + + // Return true if the given time has definitely passed (i.e any future call + // to Now() would return a higher value than t). + virtual bool IsAfter(Timestamp t) = 0; + + // Register the clock metrics in the given entity. 
+ virtual void RegisterMetrics(const scoped_refptr& metric_entity) = 0; + + // Strigifies the provided timestamp according to this clock's internal format. + virtual std::string Stringify(Timestamp timestamp) = 0; + + virtual ~Clock() {} +}; + +} // namespace server +} // namespace kudu + +#endif /* KUDU_SERVER_CLOCK_H_ */ diff --git a/src/kudu/server/default-path-handlers.cc b/src/kudu/server/default-path-handlers.cc new file mode 100644 index 000000000000..6053f4d4490f --- /dev/null +++ b/src/kudu/server/default-path-handlers.cc @@ -0,0 +1,229 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "kudu/server/default-path-handlers.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/pprof-path-handlers.h" +#include "kudu/server/webserver.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/histogram.pb.h" +#include "kudu/util/logging.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/jsonwriter.h" + +using boost::replace_all; +using google::CommandlineFlagsIntoString; +using std::ifstream; +using std::string; +using std::endl; +using strings::Substitute; + +DEFINE_int64(web_log_bytes, 1024 * 1024, + "The maximum number of bytes to display on the debug webserver's log page"); +TAG_FLAG(web_log_bytes, advanced); +TAG_FLAG(web_log_bytes, runtime); + +namespace kudu { + +using std::shared_ptr; + +namespace { +// Html/Text formatting tags +struct Tags { + string pre_tag, end_pre_tag, line_break, header, end_header; + + // If as_text is true, set the html tags to a corresponding raw text representation. + explicit Tags(bool as_text) { + if (as_text) { + pre_tag = ""; + end_pre_tag = "\n"; + line_break = "\n"; + header = ""; + end_header = ""; + } else { + pre_tag = "
<pre>";
+      end_pre_tag = "</pre>";
+      line_break = "<br/>";
+      header = "<h2>";
+      end_header = "</h2>
"; + } + } +}; +} // anonymous namespace + +// Writes the last FLAGS_web_log_bytes of the INFO logfile to a webpage +// Note to get best performance, set GLOG_logbuflevel=-1 to prevent log buffering +static void LogsHandler(const Webserver::WebRequest& req, std::stringstream* output) { + bool as_text = (req.parsed_args.find("raw") != req.parsed_args.end()); + Tags tags(as_text); + string logfile; + GetFullLogFilename(google::INFO, &logfile); + (*output) << tags.header <<"INFO logs" << tags.end_header << endl; + (*output) << "Log path is: " << logfile << endl; + + struct stat file_stat; + if (stat(logfile.c_str(), &file_stat) == 0) { + size_t size = file_stat.st_size; + size_t seekpos = size < FLAGS_web_log_bytes ? 0L : size - FLAGS_web_log_bytes; + ifstream log(logfile.c_str(), std::ios::in); + // Note if the file rolls between stat and seek, this could fail + // (and we could wind up reading the whole file). But because the + // file is likely to be small, this is unlikely to be an issue in + // practice. + log.seekg(seekpos); + (*output) << tags.line_break <<"Showing last " << FLAGS_web_log_bytes + << " bytes of log" << endl; + (*output) << tags.line_break << tags.pre_tag << log.rdbuf() << tags.end_pre_tag; + + } else { + (*output) << tags.line_break << "Couldn't open INFO log file: " << logfile; + } +} + +// Registered to handle "/flags", and prints out all command-line flags and their values +static void FlagsHandler(const Webserver::WebRequest& req, std::stringstream* output) { + bool as_text = (req.parsed_args.find("raw") != req.parsed_args.end()); + Tags tags(as_text); + (*output) << tags.header << "Command-line Flags" << tags.end_header; + (*output) << tags.pre_tag << CommandlineFlagsIntoString() << tags.end_pre_tag; +} + +// Registered to handle "/memz", and prints out memory allocation statistics. 
+static void MemUsageHandler(const Webserver::WebRequest& req, std::stringstream* output) { + bool as_text = (req.parsed_args.find("raw") != req.parsed_args.end()); + Tags tags(as_text); + + (*output) << tags.pre_tag; +#ifndef TCMALLOC_ENABLED + (*output) << "Memory tracking is not available unless tcmalloc is enabled."; +#else + char buf[2048]; + MallocExtension::instance()->GetStats(buf, 2048); + // Replace new lines with
for html + string tmp(buf); + replace_all(tmp, "\n", tags.line_break); + (*output) << tmp << tags.end_pre_tag; +#endif +} + +// Registered to handle "/mem-trackers", and prints out to handle memory tracker information. +static void MemTrackersHandler(const Webserver::WebRequest& req, std::stringstream* output) { + *output << "
<h1>Memory usage by subsystem</h1>\n";
+  *output << "<table class='table table-striped'>\n";
+  *output << "  <tr><th>Id</th><th>Parent</th><th>Limit</th><th>Current Consumption</th>"
+      "<th>Peak consumption</th></tr>\n";
+
+  vector<shared_ptr<MemTracker> > trackers;
+  MemTracker::ListTrackers(&trackers);
+  for (const shared_ptr<MemTracker>& tracker : trackers) {
+    string parent = tracker->parent() == nullptr ? "none" : tracker->parent()->id();
+    string limit_str = tracker->limit() == -1 ? "none" :
+        HumanReadableNumBytes::ToString(tracker->limit());
+    string current_consumption_str = HumanReadableNumBytes::ToString(tracker->consumption());
+    string peak_consumption_str = HumanReadableNumBytes::ToString(tracker->peak_consumption());
+    (*output) << Substitute("  <tr><td>$0</td><td>$1</td><td>$2</td>" // id, parent, limit
+                            "<td>$3</td><td>$4</td></tr>\n", // current, peak
+                            tracker->id(), parent, limit_str, current_consumption_str,
+                            peak_consumption_str);
+  }
+  *output << "</table>
\n"; +} + +void AddDefaultPathHandlers(Webserver* webserver) { + webserver->RegisterPathHandler("/logs", "Logs", LogsHandler); + webserver->RegisterPathHandler("/varz", "Flags", FlagsHandler); + webserver->RegisterPathHandler("/memz", "Memory (total)", MemUsageHandler); + webserver->RegisterPathHandler("/mem-trackers", "Memory (detail)", MemTrackersHandler); + + AddPprofPathHandlers(webserver); +} + + +static void WriteMetricsAsJson(const MetricRegistry* const metrics, + const Webserver::WebRequest& req, std::stringstream* output) { + const string* requested_metrics_param = FindOrNull(req.parsed_args, "metrics"); + vector requested_metrics; + MetricJsonOptions opts; + + { + string arg = FindWithDefault(req.parsed_args, "include_raw_histograms", "false"); + opts.include_raw_histograms = ParseLeadingBoolValue(arg.c_str(), false); + } + { + string arg = FindWithDefault(req.parsed_args, "include_schema", "false"); + opts.include_schema_info = ParseLeadingBoolValue(arg.c_str(), false); + } + JsonWriter::Mode json_mode; + { + string arg = FindWithDefault(req.parsed_args, "compact", "false"); + json_mode = ParseLeadingBoolValue(arg.c_str(), false) ? + JsonWriter::COMPACT : JsonWriter::PRETTY; + } + + JsonWriter writer(output, json_mode); + + if (requested_metrics_param != nullptr) { + SplitStringUsing(*requested_metrics_param, ",", &requested_metrics); + } else { + // Default to including all metrics. 
+ requested_metrics.push_back("*"); + } + + WARN_NOT_OK(metrics->WriteAsJson(&writer, requested_metrics, opts), + "Couldn't write JSON metrics over HTTP"); +} + +void RegisterMetricsJsonHandler(Webserver* webserver, const MetricRegistry* const metrics) { + Webserver::PathHandlerCallback callback = boost::bind(WriteMetricsAsJson, metrics, _1, _2); + bool not_styled = false; + bool not_on_nav_bar = false; + bool is_on_nav_bar = true; + webserver->RegisterPathHandler("/metrics", "Metrics", callback, not_styled, is_on_nav_bar); + + // The old name -- this is preserved for compatibility with older releases of + // monitoring software which expects the old name. + webserver->RegisterPathHandler("/jsonmetricz", "Metrics", callback, not_styled, not_on_nav_bar); +} + +} // namespace kudu diff --git a/src/kudu/server/default-path-handlers.h b/src/kudu/server/default-path-handlers.h new file mode 100644 index 000000000000..0a2647b79ee4 --- /dev/null +++ b/src/kudu/server/default-path-handlers.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KUDU_SERVER_DEFAULT_PATH_HANDLERS_H +#define KUDU_SERVER_DEFAULT_PATH_HANDLERS_H + +#include + +namespace kudu { + +class MetricRegistry; +class Webserver; + +// Adds a set of default path handlers to the webserver to display +// logs and configuration flags. +void AddDefaultPathHandlers(Webserver* webserver); + +// Adds an endpoint to get metrics in JSON format. +void RegisterMetricsJsonHandler(Webserver* webserver, const MetricRegistry* const metrics); + +} // namespace kudu + +#endif // KUDU_SERVER_DEFAULT_PATH_HANDLERS_H diff --git a/src/kudu/server/generic_service.cc b/src/kudu/server/generic_service.cc new file mode 100644 index 000000000000..4d1d9f323755 --- /dev/null +++ b/src/kudu/server/generic_service.cc @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/server/generic_service.h" + +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/server/clock.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/server/server_base.h" +#include "kudu/util/flag_tags.h" + +DECLARE_bool(use_mock_wall_clock); +DECLARE_bool(use_hybrid_clock); + +using std::string; +using std::unordered_set; + +#ifdef COVERAGE_BUILD +extern "C" void __gcov_flush(void); +#endif + + +namespace kudu { +namespace server { + +GenericServiceImpl::GenericServiceImpl(ServerBase* server) + : GenericServiceIf(server->metric_entity()), + server_(server) { +} + +GenericServiceImpl::~GenericServiceImpl() { +} + +void GenericServiceImpl::SetFlag(const SetFlagRequestPB* req, + SetFlagResponsePB* resp, + rpc::RpcContext* rpc) { + + // Validate that the flag exists and get the current value. + string old_val; + if (!google::GetCommandLineOption(req->flag().c_str(), + &old_val)) { + resp->set_result(SetFlagResponsePB::NO_SUCH_FLAG); + rpc->RespondSuccess(); + return; + } + + // Validate that the flag is runtime-changeable. + unordered_set tags; + GetFlagTags(req->flag(), &tags); + if (!ContainsKey(tags, "runtime")) { + if (req->force()) { + LOG(WARNING) << rpc->requestor_string() << " forcing change of " + << "non-runtime-safe flag " << req->flag(); + } else { + resp->set_result(SetFlagResponsePB::NOT_SAFE); + resp->set_msg("Flag is not safe to change at runtime"); + rpc->RespondSuccess(); + return; + } + } + + resp->set_old_value(old_val); + + // Try to set the new value. 
+ string ret = google::SetCommandLineOption( + req->flag().c_str(), + req->value().c_str()); + if (ret.empty()) { + resp->set_result(SetFlagResponsePB::BAD_VALUE); + resp->set_msg("Unable to set flag: bad value"); + } else { + LOG(INFO) << rpc->requestor_string() << " changed flags via RPC: " + << req->flag() << " from '" << old_val << "' to '" + << req->value() << "'"; + resp->set_result(SetFlagResponsePB::SUCCESS); + resp->set_msg(ret); + } + + rpc->RespondSuccess(); +} + +void GenericServiceImpl::FlushCoverage(const FlushCoverageRequestPB* req, + FlushCoverageResponsePB* resp, + rpc::RpcContext* rpc) { +#ifdef COVERAGE_BUILD + __gcov_flush(); + LOG(INFO) << "Flushed coverage info. (request from " << rpc->requestor_string() << ")"; + resp->set_success(true); +#else + LOG(WARNING) << "Non-coverage build cannot flush coverage (request from " + << rpc->requestor_string() << ")"; + resp->set_success(false); +#endif + rpc->RespondSuccess(); +} + +void GenericServiceImpl::ServerClock(const ServerClockRequestPB* req, + ServerClockResponsePB* resp, + rpc::RpcContext* rpc) { + resp->set_timestamp(server_->clock()->Now().ToUint64()); + rpc->RespondSuccess(); +} + +void GenericServiceImpl::SetServerWallClockForTests(const SetServerWallClockForTestsRequestPB *req, + SetServerWallClockForTestsResponsePB *resp, + rpc::RpcContext *context) { + if (!FLAGS_use_hybrid_clock || !FLAGS_use_mock_wall_clock) { + LOG(WARNING) << "Error setting wall clock for tests. 
Server is not using HybridClock" + "or was not started with '--use_mock_wall_clock= true'"; + resp->set_success(false); + } + + server::HybridClock* clock = down_cast(server_->clock()); + if (req->has_now_usec()) { + clock->SetMockClockWallTimeForTests(req->now_usec()); + } + if (req->has_max_error_usec()) { + clock->SetMockMaxClockErrorForTests(req->max_error_usec()); + } + resp->set_success(true); + context->RespondSuccess(); +} + +void GenericServiceImpl::GetStatus(const GetStatusRequestPB* req, + GetStatusResponsePB* resp, + rpc::RpcContext* rpc) { + server_->GetStatusPB(resp->mutable_status()); + rpc->RespondSuccess(); +} + +} // namespace server +} // namespace kudu diff --git a/src/kudu/server/generic_service.h b/src/kudu/server/generic_service.h new file mode 100644 index 000000000000..74c49944ad11 --- /dev/null +++ b/src/kudu/server/generic_service.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_SERVER_GENERIC_SERVICE_H +#define KUDU_SERVER_GENERIC_SERVICE_H + +#include "kudu/gutil/macros.h" +#include "kudu/server/server_base.service.h" + +namespace kudu { +namespace server { + +class ServerBase; + +class GenericServiceImpl : public GenericServiceIf { + public: + explicit GenericServiceImpl(ServerBase* server); + virtual ~GenericServiceImpl(); + + virtual void SetFlag(const SetFlagRequestPB* req, + SetFlagResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void FlushCoverage(const FlushCoverageRequestPB* req, + FlushCoverageResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void ServerClock(const ServerClockRequestPB* req, + ServerClockResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + + virtual void SetServerWallClockForTests(const SetServerWallClockForTestsRequestPB *req, + SetServerWallClockForTestsResponsePB *resp, + rpc::RpcContext *context) OVERRIDE; + + virtual void GetStatus(const GetStatusRequestPB* req, + GetStatusResponsePB* resp, + rpc::RpcContext* rpc) OVERRIDE; + private: + ServerBase* server_; + + DISALLOW_COPY_AND_ASSIGN(GenericServiceImpl); +}; + +} // namespace server +} // namespace kudu +#endif /* KUDU_SERVER_GENERIC_SERVICE_H */ diff --git a/src/kudu/server/glog_metrics.cc b/src/kudu/server/glog_metrics.cc new file mode 100644 index 000000000000..6294227245cb --- /dev/null +++ b/src/kudu/server/glog_metrics.cc @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/server/glog_metrics.h" + +#include + +#include "kudu/gutil/once.h" +#include "kudu/util/metrics.h" + +METRIC_DEFINE_counter(server, glog_info_messages, + "INFO-level Log Messages", kudu::MetricUnit::kMessages, + "Number of INFO-level log messages emitted by the application."); + +METRIC_DEFINE_counter(server, glog_warning_messages, + "WARNING-level Log Messages", kudu::MetricUnit::kMessages, + "Number of WARNING-level log messages emitted by the application."); + +METRIC_DEFINE_counter(server, glog_error_messages, + "ERROR-level Log Messages", kudu::MetricUnit::kMessages, + "Number of ERROR-level log messages emitted by the application."); + +namespace kudu { + +class MetricsSink : public google::LogSink { + public: + explicit MetricsSink(const scoped_refptr& entity) : + info_counter_(METRIC_glog_info_messages.Instantiate(entity)), + warning_counter_(METRIC_glog_warning_messages.Instantiate(entity)), + error_counter_(METRIC_glog_error_messages.Instantiate(entity)) { + } + + virtual void send(google::LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct ::tm* tm_time, + const char* message, size_t message_len) OVERRIDE { + + Counter* c; + switch (severity) { + case google::INFO: + c = info_counter_.get(); + break; + case google::WARNING: + c = warning_counter_.get(); + break; + case google::ERROR: + c = error_counter_.get(); + break; + default: + return; + } + + c->Increment(); + } + + private: + scoped_refptr info_counter_; + scoped_refptr warning_counter_; + scoped_refptr 
error_counter_; +}; + +ScopedGLogMetrics::ScopedGLogMetrics(const scoped_refptr& entity) + : sink_(new MetricsSink(entity)) { + google::AddLogSink(sink_.get()); +} + +ScopedGLogMetrics::~ScopedGLogMetrics() { + google::RemoveLogSink(sink_.get()); +} + + + +} // namespace kudu diff --git a/src/kudu/server/glog_metrics.h b/src/kudu/server/glog_metrics.h new file mode 100644 index 000000000000..e9e81a38cc74 --- /dev/null +++ b/src/kudu/server/glog_metrics.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_GLOG_METRICS_H +#define KUDU_SERVER_GLOG_METRICS_H + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" + +namespace google { +class LogSink; +} // namespace google + +namespace kudu { +class MetricEntity; + +// Attaches GLog metrics to the given entity, for the duration of this +// scoped object's lifetime. +// +// NOTE: the metrics are collected process-wide, not confined to any set of +// threads, etc. +class ScopedGLogMetrics { + public: + explicit ScopedGLogMetrics(const scoped_refptr& entity); + ~ScopedGLogMetrics(); + + private: + gscoped_ptr sink_; +}; + + +// Registers glog-related metrics. 
+// This can be called multiple times on different entities, though the resulting +// metrics will be identical, since the GLog tracking is process-wide. +void RegisterGLogMetrics(const scoped_refptr& entity); + +} // namespace kudu +#endif /* KUDU_SERVER_GLOG_METRICS_H */ diff --git a/src/kudu/server/hybrid_clock-test.cc b/src/kudu/server/hybrid_clock-test.cc new file mode 100644 index 000000000000..e8212dfdc115 --- /dev/null +++ b/src/kudu/server/hybrid_clock-test.cc @@ -0,0 +1,235 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/server/hybrid_clock.h" +#include "kudu/util/monotime.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/test_util.h" + +DECLARE_bool(use_mock_wall_clock); + +namespace kudu { +namespace server { + +class HybridClockTest : public KuduTest { + public: + HybridClockTest() + : clock_(new HybridClock) { + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + ASSERT_OK(clock_->Init()); + } + + protected: + scoped_refptr clock_; +}; + +TEST(MockHybridClockTest, TestMockedSystemClock) { + google::FlagSaver saver; + FLAGS_use_mock_wall_clock = true; + scoped_refptr clock(new HybridClock()); + clock->Init(); + Timestamp timestamp; + uint64_t max_error_usec; + clock->NowWithError(×tamp, &max_error_usec); + ASSERT_EQ(timestamp.ToUint64(), 0); + ASSERT_EQ(max_error_usec, 0); + // If we read the clock again we should see the logical component be incremented. + clock->NowWithError(×tamp, &max_error_usec); + ASSERT_EQ(timestamp.ToUint64(), 1); + // Now set an arbitrary time and check that is the time returned by the clock. + uint64_t time = 1234; + uint64_t error = 100 * 1000; + clock->SetMockClockWallTimeForTests(time); + clock->SetMockMaxClockErrorForTests(error); + clock->NowWithError(×tamp, &max_error_usec); + ASSERT_EQ(timestamp.ToUint64(), + HybridClock::TimestampFromMicrosecondsAndLogicalValue(time, 0).ToUint64()); + ASSERT_EQ(max_error_usec, error); + // Perform another read, we should observe the logical component increment, again. + clock->NowWithError(×tamp, &max_error_usec); + ASSERT_EQ(timestamp.ToUint64(), + HybridClock::TimestampFromMicrosecondsAndLogicalValue(time, 1).ToUint64()); +} + +// Test that two subsequent time reads are monotonically increasing. 
+TEST_F(HybridClockTest, TestNow_ValuesIncreaseMonotonically) { + const Timestamp now1 = clock_->Now(); + const Timestamp now2 = clock_->Now(); + ASSERT_LT(now1.value(), now2.value()); +} + +// Tests the clock updates with the incoming value if it is higher. +TEST_F(HybridClockTest, TestUpdate_LogicalValueIncreasesByAmount) { + Timestamp now = clock_->Now(); + uint64_t now_micros = HybridClock::GetPhysicalValueMicros(now); + + // increase the logical value + uint64_t logical = HybridClock::GetLogicalValue(now); + logical += 10; + + // increase the physical value so that we're sure the clock will take this + // one, 200 msecs should be more than enough. + now_micros += 200000; + + Timestamp now_increased = HybridClock::TimestampFromMicrosecondsAndLogicalValue(now_micros, + logical); + + ASSERT_OK(clock_->Update(now_increased)); + + Timestamp now2 = clock_->Now(); + ASSERT_EQ(logical + 1, HybridClock::GetLogicalValue(now2)); + ASSERT_EQ(HybridClock::GetPhysicalValueMicros(now) + 200000, + HybridClock::GetPhysicalValueMicros(now2)); +} + +// Test that the incoming event is in the past, i.e. less than now - max_error +TEST_F(HybridClockTest, TestWaitUntilAfter_TestCase1) { + MonoTime no_deadline; + MonoTime before = MonoTime::Now(MonoTime::FINE); + + Timestamp past_ts; + uint64_t max_error; + clock_->NowWithError(&past_ts, &max_error); + + // make the event 3 * the max. possible error in the past + Timestamp past_ts_changed = HybridClock::AddPhysicalTimeToTimestamp( + past_ts, + MonoDelta::FromMicroseconds(-3 * max_error)); + + Status s = clock_->WaitUntilAfter(past_ts_changed, no_deadline); + + ASSERT_OK(s); + + MonoTime after = MonoTime::Now(MonoTime::FINE); + MonoDelta delta = after.GetDeltaSince(before); + // The delta should be close to 0, but it takes some time for the hybrid + // logical clock to decide that it doesn't need to wait. + ASSERT_LT(delta.ToMicroseconds(), 25000); +} + +// The normal case for transactions. 
Obtain a timestamp and then wait until +// we're sure that tx_latest < now_earliest. +TEST_F(HybridClockTest, TestWaitUntilAfter_TestCase2) { + MonoTime before = MonoTime::Now(MonoTime::FINE); + + // we do no time adjustment, this event should fall right within the possible + // error interval + Timestamp past_ts; + uint64_t past_max_error; + clock_->NowWithError(&past_ts, &past_max_error); + // Make sure the error is at least a small number of microseconds, to ensure + // that we always have to wait. + past_max_error = std::max(past_max_error, static_cast(20)); + Timestamp wait_until = HybridClock::AddPhysicalTimeToTimestamp( + past_ts, + MonoDelta::FromMicroseconds(past_max_error)); + + Timestamp current_ts; + uint64_t current_max_error; + clock_->NowWithError(¤t_ts, ¤t_max_error); + + // Check waiting with a deadline which already expired. + { + MonoTime deadline = before; + Status s = clock_->WaitUntilAfter(wait_until, deadline); + ASSERT_TRUE(s.IsTimedOut()); + } + + // Wait with a deadline well in the future. This should succeed. + { + MonoTime deadline = before; + deadline.AddDelta(MonoDelta::FromSeconds(60)); + ASSERT_OK(clock_->WaitUntilAfter(wait_until, deadline)); + } + + MonoTime after = MonoTime::Now(MonoTime::FINE); + MonoDelta delta = after.GetDeltaSince(before); + + // In the common case current_max_error >= past_max_error and we should have waited + // 2 * past_max_error, but if the clock's error is reset between the two reads we might + // have waited less time, but always more than 'past_max_error'. + if (current_max_error >= past_max_error) { + ASSERT_GE(delta.ToMicroseconds(), 2 * past_max_error); + } else { + ASSERT_GE(delta.ToMicroseconds(), past_max_error); + } +} + +TEST_F(HybridClockTest, TestIsAfter) { + Timestamp ts1 = clock_->Now(); + ASSERT_TRUE(clock_->IsAfter(ts1)); + + // Update the clock in the future, make sure it still + // handles "IsAfter" properly even when it's running in + // "logical" mode. 
+ Timestamp now_increased = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(ts1) + 1 * 1000 * 1000); + ASSERT_OK(clock_->Update(now_increased)); + Timestamp ts2 = clock_->Now(); + + ASSERT_TRUE(clock_->IsAfter(ts1)); + ASSERT_TRUE(clock_->IsAfter(ts2)); +} + +// Thread which loops polling the clock and updating it slightly +// into the future. +void StresserThread(HybridClock* clock, AtomicBool* stop) { + Random rng(GetRandomSeed32()); + Timestamp prev(0);; + while (!stop->Load()) { + Timestamp t = clock->Now(); + CHECK_GT(t.value(), prev.value()); + prev = t; + + // Add a random bit of offset to the clock, and perform an update. + Timestamp new_ts = HybridClock::AddPhysicalTimeToTimestamp( + t, MonoDelta::FromMicroseconds(rng.Uniform(10000))); + clock->Update(new_ts); + } +} + +// Regression test for KUDU-953: if threads are updating and polling the +// clock concurrently, the clock should still never run backwards. +TEST_F(HybridClockTest, TestClockDoesntGoBackwardsWithUpdates) { + vector > threads; + + AtomicBool stop(false); + for (int i = 0; i < 4; i++) { + scoped_refptr thread; + ASSERT_OK(Thread::Create("test", "stresser", + &StresserThread, clock_.get(), &stop, + &thread)); + threads.push_back(thread); + } + + SleepFor(MonoDelta::FromSeconds(1)); + stop.Store(true); + for (const scoped_refptr t : threads) { + t->Join(); + } +} + +} // namespace server +} // namespace kudu diff --git a/src/kudu/server/hybrid_clock.cc b/src/kudu/server/hybrid_clock.cc new file mode 100644 index 000000000000..929bdcbac668 --- /dev/null +++ b/src/kudu/server/hybrid_clock.cc @@ -0,0 +1,493 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/server/hybrid_clock.h" + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/errno.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" +#include "kudu/util/logging.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" + +#if !defined(__APPLE__) +#include +#endif // !defined(__APPLE__) + +DEFINE_int32(max_clock_sync_error_usec, 10 * 1000 * 1000, // 10 secs + "Maximum allowed clock synchronization error as reported by NTP " + "before the server will abort."); +TAG_FLAG(max_clock_sync_error_usec, advanced); +TAG_FLAG(max_clock_sync_error_usec, runtime); + +DEFINE_bool(use_hybrid_clock, true, + "Whether HybridClock should be used as the default clock" + " implementation. 
This should be disabled for testing purposes only."); +TAG_FLAG(use_hybrid_clock, hidden); + +DEFINE_bool(use_mock_wall_clock, false, + "Whether HybridClock should use a mock wall clock which is updated manually" + "instead of reading time from the system clock, for tests."); +TAG_FLAG(use_mock_wall_clock, hidden); + +METRIC_DEFINE_gauge_uint64(server, hybrid_clock_timestamp, + "Hybrid Clock Timestamp", + kudu::MetricUnit::kMicroseconds, + "Hybrid clock timestamp."); +METRIC_DEFINE_gauge_uint64(server, hybrid_clock_error, + "Hybrid Clock Error", + kudu::MetricUnit::kMicroseconds, + "Server clock maximum error."); + +using kudu::Status; +using strings::Substitute; + +namespace kudu { +namespace server { + +namespace { + +#if !defined(__APPLE__) +// Returns the clock modes and checks if the clock is synchronized. +Status GetClockModes(timex* timex) { + // this makes ntp_adjtime a read-only call + timex->modes = 0; + int rc = ntp_adjtime(timex); + if (PREDICT_FALSE(rc == TIME_ERROR)) { + return Status::ServiceUnavailable( + Substitute("Error reading clock. Clock considered unsynchronized. Return code: $0", rc)); + } + // TODO what to do about leap seconds? see KUDU-146 + if (PREDICT_FALSE(rc != TIME_OK)) { + LOG(ERROR) << Substitute("TODO Server undergoing leap second. Return code: $0", rc); + } + return Status::OK(); +} + +// Returns the current time/max error and checks if the clock is synchronized. +kudu::Status GetClockTime(ntptimeval* timeval) { + int rc = ntp_gettime(timeval); + switch (rc) { + case TIME_OK: + return Status::OK(); + case -1: // generic error + return Status::ServiceUnavailable("Error reading clock. ntp_gettime() failed", + ErrnoToString(errno)); + case TIME_ERROR: + return Status::ServiceUnavailable("Error reading clock. Clock considered unsynchronized"); + default: + // TODO what to do about leap seconds? see KUDU-146 + KLOG_FIRST_N(ERROR, 1) << "Server undergoing leap second. 
This may cause consistency issues " + << "(rc=" << rc << ")"; + return Status::OK(); + } +} +#endif // !defined(__APPLE__) + +Status CheckDeadlineNotWithinMicros(const MonoTime& deadline, int64_t wait_for_usec) { + if (!deadline.Initialized()) { + // No deadline. + return Status::OK(); + } + int64_t us_until_deadline = deadline.GetDeltaSince( + MonoTime::Now(MonoTime::FINE)).ToMicroseconds(); + if (us_until_deadline <= wait_for_usec) { + return Status::TimedOut(Substitute( + "specified time is $0us in the future, but deadline expires in $1us", + wait_for_usec, us_until_deadline)); + } + return Status::OK(); +} + +} // anonymous namespace + +// Left shifting 12 bits gives us 12 bits for the logical value +// and should still keep accurate microseconds time until 2100+ +const int HybridClock::kBitsToShift = 12; +// This mask gives us back the logical bits. +const uint64_t HybridClock::kLogicalBitMask = (1 << kBitsToShift) - 1; + +const uint64_t HybridClock::kNanosPerSec = 1000000; + +const double HybridClock::kAdjtimexScalingFactor = 65536; + +HybridClock::HybridClock() + : mock_clock_time_usec_(0), + mock_clock_max_error_usec_(0), +#if !defined(__APPLE__) + divisor_(1), +#endif + tolerance_adjustment_(1), + last_usec_(0), + next_logical_(0), + state_(kNotInitialized) { +} + +Status HybridClock::Init() { + if (PREDICT_FALSE(FLAGS_use_mock_wall_clock)) { + LOG(WARNING) << "HybridClock set to mock the wall clock."; + state_ = kInitialized; + return Status::OK(); + } +#if defined(__APPLE__) + LOG(WARNING) << "HybridClock initialized in local mode (OS X only). " + << "Not suitable for distributed clusters."; +#else + // Read the current time. This will return an error if the clock is not synchronized. 
+ uint64_t now_usec; + uint64_t error_usec; + RETURN_NOT_OK(WalltimeWithError(&now_usec, &error_usec)); + + timex timex; + RETURN_NOT_OK(GetClockModes(&timex)); + // read whether the STA_NANO bit is set to know whether we'll get back nanos + // or micros in timeval.time.tv_usec. See: + // http://stackoverflow.com/questions/16063408/does-ntp-gettime-actually-return-nanosecond-precision + // set the timeval.time.tv_usec divisor so that we always get micros + if (timex.status & STA_NANO) { + divisor_ = 1000; + } else { + divisor_ = 1; + } + + // Calculate the sleep skew adjustment according to the max tolerance of the clock. + // Tolerance comes in parts per million but needs to be applied a scaling factor. + tolerance_adjustment_ = (1 + ((timex.tolerance / kAdjtimexScalingFactor) / 1000000.0)); + + LOG(INFO) << "HybridClock initialized. Resolution in nanos?: " << (divisor_ == 1000) + << " Wait times tolerance adjustment: " << tolerance_adjustment_ + << " Current error: " << error_usec; +#endif // defined(__APPLE__) + + state_ = kInitialized; + + return Status::OK(); +} + +Timestamp HybridClock::Now() { + Timestamp now; + uint64_t error; + + boost::lock_guard lock(lock_); + NowWithError(&now, &error); + return now; +} + +Timestamp HybridClock::NowLatest() { + Timestamp now; + uint64_t error; + + { + boost::lock_guard lock(lock_); + NowWithError(&now, &error); + } + + uint64_t now_latest = GetPhysicalValueMicros(now) + error; + uint64_t now_logical = GetLogicalValue(now); + + return TimestampFromMicrosecondsAndLogicalValue(now_latest, now_logical); +} + +Status HybridClock::GetGlobalLatest(Timestamp* t) { + Timestamp now = Now(); + uint64_t now_latest = GetPhysicalValueMicros(now) + FLAGS_max_clock_sync_error_usec; + uint64_t now_logical = GetLogicalValue(now); + *t = TimestampFromMicrosecondsAndLogicalValue(now_latest, now_logical); + return Status::OK(); +} + +void HybridClock::NowWithError(Timestamp* timestamp, uint64_t* max_error_usec) { + + DCHECK_EQ(state_, 
kInitialized) << "Clock not initialized. Must call Init() first."; + + uint64_t now_usec; + uint64_t error_usec; + Status s = WalltimeWithError(&now_usec, &error_usec); + if (PREDICT_FALSE(!s.ok())) { + LOG(FATAL) << Substitute("Couldn't get the current time: Clock unsynchronized. " + "Status: $0", s.ToString()); + } + + // If the current time surpasses the last update just return it + if (PREDICT_TRUE(now_usec > last_usec_)) { + last_usec_ = now_usec; + next_logical_ = 1; + *timestamp = TimestampFromMicroseconds(last_usec_); + *max_error_usec = error_usec; + if (PREDICT_FALSE(VLOG_IS_ON(2))) { + VLOG(2) << "Current clock is higher than the last one. Resetting logical values." + << " Physical Value: " << now_usec << " usec Logical Value: 0 Error: " + << error_usec; + } + return; + } + + // We don't have the last time read max error since it might have originated + // in another machine, but we can put a bound on the maximum error of the + // timestamp we are providing. + // In particular we know that the "true" time falls within the interval + // now_usec +- now.maxerror so we get the following situations: + // + // 1) + // --------|----------|----|---------|--------------------------> time + // now - e now last now + e + // 2) + // --------|----------|--------------|------|-------------------> time + // now - e now now + e last + // + // Assuming, in the worst case, that the "true" time is now - error we need to + // always return: last - (now - e) as the new maximum error. + // This broadens the error interval for both cases but always returns + // a correct error interval. + + *max_error_usec = last_usec_ - (now_usec - error_usec); + *timestamp = TimestampFromMicrosecondsAndLogicalValue(last_usec_, next_logical_); + if (PREDICT_FALSE(VLOG_IS_ON(2))) { + VLOG(2) << "Current clock is lower than the last one. Returning last read and incrementing" + " logical values. 
Physical Value: " << now_usec << " usec Logical Value: " + << next_logical_ << " Error: " << *max_error_usec; + } + next_logical_++; +} + +Status HybridClock::Update(const Timestamp& to_update) { + boost::lock_guard lock(lock_); + Timestamp now; + uint64_t error_ignored; + NowWithError(&now, &error_ignored); + + if (PREDICT_TRUE(now.CompareTo(to_update) > 0)) return Status::OK(); + + uint64_t to_update_physical = GetPhysicalValueMicros(to_update); + uint64_t to_update_logical = GetLogicalValue(to_update); + uint64_t now_physical = GetPhysicalValueMicros(now); + + // we won't update our clock if to_update is more than 'max_clock_sync_error_usec' + // into the future as it might have been corrupted or originated from an out-of-sync + // server. + if ((to_update_physical - now_physical) > FLAGS_max_clock_sync_error_usec) { + return Status::InvalidArgument("Tried to update clock beyond the max. error."); + } + + last_usec_ = to_update_physical; + next_logical_ = to_update_logical + 1; + return Status::OK(); +} + +bool HybridClock::SupportsExternalConsistencyMode(ExternalConsistencyMode mode) { + return true; +} + +Status HybridClock::WaitUntilAfter(const Timestamp& then_latest, + const MonoTime& deadline) { + TRACE_EVENT0("clock", "HybridClock::WaitUntilAfter"); + Timestamp now; + uint64_t error; + { + boost::lock_guard lock(lock_); + NowWithError(&now, &error); + } + + // "unshift" the timestamps so that we can measure actual time + uint64_t now_usec = GetPhysicalValueMicros(now); + uint64_t then_latest_usec = GetPhysicalValueMicros(then_latest); + + uint64_t now_earliest_usec = now_usec - error; + + // Case 1, event happened definitely in the past, return + if (PREDICT_TRUE(then_latest_usec < now_earliest_usec)) { + return Status::OK(); + } + + // Case 2 wait out until we are sure that then_latest has passed + + // We'll sleep then_latest_usec - now_earliest_usec so that the new + // nw.earliest is higher than then.latest. 
+ uint64_t wait_for_usec = (then_latest_usec - now_earliest_usec); + + // Additionally adjust the sleep time with the max tolerance adjustment + // to account for the worst case clock skew while we're sleeping. + wait_for_usec *= tolerance_adjustment_; + + // Check that sleeping wouldn't sleep longer than our deadline. + RETURN_NOT_OK(CheckDeadlineNotWithinMicros(deadline, wait_for_usec)); + + SleepFor(MonoDelta::FromMicroseconds(wait_for_usec)); + + + VLOG(1) << "WaitUntilAfter(): Incoming time(latest): " << then_latest_usec + << " Now(earliest): " << now_earliest_usec << " error: " << error + << " Waiting for: " << wait_for_usec; + + return Status::OK(); +} + + Status HybridClock::WaitUntilAfterLocally(const Timestamp& then, + const MonoTime& deadline) { + while (true) { + Timestamp now; + uint64_t error; + { + boost::lock_guard lock(lock_); + NowWithError(&now, &error); + } + if (now.CompareTo(then) > 0) { + return Status::OK(); + } + uint64_t wait_for_usec = GetPhysicalValueMicros(then) - GetPhysicalValueMicros(now); + + // Check that sleeping wouldn't sleep longer than our deadline. + RETURN_NOT_OK(CheckDeadlineNotWithinMicros(deadline, wait_for_usec)); + } +} + +bool HybridClock::IsAfter(Timestamp t) { + // Manually get the time, rather than using Now(), so we don't end up causing + // a time update. + uint64_t now_usec; + uint64_t error_usec; + CHECK_OK(WalltimeWithError(&now_usec, &error_usec)); + + boost::lock_guard lock(lock_); + now_usec = std::max(now_usec, last_usec_); + + Timestamp now; + if (now_usec > last_usec_) { + now = TimestampFromMicroseconds(now_usec); + } else { + // last_usec_ may be in the future if we were updated from a remote + // node. 
+ now = TimestampFromMicrosecondsAndLogicalValue(last_usec_, next_logical_); + } + + return t.value() < now.value(); +} + +kudu::Status HybridClock::WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec) { + if (PREDICT_FALSE(FLAGS_use_mock_wall_clock)) { + VLOG(1) << "Current clock time: " << mock_clock_time_usec_ << " error: " + << mock_clock_max_error_usec_ << ". Updating to time: " << now_usec + << " and error: " << error_usec; + *now_usec = mock_clock_time_usec_; + *error_usec = mock_clock_max_error_usec_; + } else { +#if defined(__APPLE__) + *now_usec = GetCurrentTimeMicros(); + *error_usec = 0; + } +#else + // Read the time. This will return an error if the clock is not synchronized. + ntptimeval timeval; + RETURN_NOT_OK(GetClockTime(&timeval)); + *now_usec = timeval.time.tv_sec * kNanosPerSec + timeval.time.tv_usec / divisor_; + *error_usec = timeval.maxerror; + } + + // If the clock is synchronized but has max_error beyond max_clock_sync_error_usec + // we also return a non-ok status. + if (*error_usec > FLAGS_max_clock_sync_error_usec) { + return Status::ServiceUnavailable(Substitute("Error: Clock synchronized but error was" + "too high ($0 us).", *error_usec)); + } +#endif // defined(__APPLE__) + return kudu::Status::OK(); +} + +void HybridClock::SetMockClockWallTimeForTests(uint64_t now_usec) { + CHECK(FLAGS_use_mock_wall_clock); + boost::lock_guard lock(lock_); + CHECK_GE(now_usec, mock_clock_time_usec_); + mock_clock_time_usec_ = now_usec; +} + +void HybridClock::SetMockMaxClockErrorForTests(uint64_t max_error_usec) { + CHECK(FLAGS_use_mock_wall_clock); + boost::lock_guard lock(lock_); + mock_clock_max_error_usec_ = max_error_usec; +} + +// Used to get the timestamp for metrics. +uint64_t HybridClock::NowForMetrics() { + return Now().ToUint64(); +} + +// Used to get the current error, for metrics. 
+uint64_t HybridClock::ErrorForMetrics() { + Timestamp now; + uint64_t error; + + boost::lock_guard lock(lock_); + NowWithError(&now, &error); + return error; +} + +void HybridClock::RegisterMetrics(const scoped_refptr& metric_entity) { + METRIC_hybrid_clock_timestamp.InstantiateFunctionGauge( + metric_entity, + Bind(&HybridClock::NowForMetrics, Unretained(this))) + ->AutoDetachToLastValue(&metric_detacher_); + METRIC_hybrid_clock_error.InstantiateFunctionGauge( + metric_entity, + Bind(&HybridClock::ErrorForMetrics, Unretained(this))) + ->AutoDetachToLastValue(&metric_detacher_); +} + +string HybridClock::Stringify(Timestamp timestamp) { + return StringifyTimestamp(timestamp); +} + +uint64_t HybridClock::GetLogicalValue(const Timestamp& timestamp) { + return timestamp.value() & kLogicalBitMask; +} + +uint64_t HybridClock::GetPhysicalValueMicros(const Timestamp& timestamp) { + return timestamp.value() >> kBitsToShift; +} + +Timestamp HybridClock::TimestampFromMicroseconds(uint64_t micros) { + return Timestamp(micros << kBitsToShift); +} + +Timestamp HybridClock::TimestampFromMicrosecondsAndLogicalValue( + uint64_t micros, + uint64_t logical_value) { + return Timestamp((micros << kBitsToShift) + logical_value); +} + +Timestamp HybridClock::AddPhysicalTimeToTimestamp(const Timestamp& original, + const MonoDelta& to_add) { + uint64_t new_physical = GetPhysicalValueMicros(original) + to_add.ToMicroseconds(); + uint64_t old_logical = GetLogicalValue(original); + return TimestampFromMicrosecondsAndLogicalValue(new_physical, old_logical); +} + +string HybridClock::StringifyTimestamp(const Timestamp& timestamp) { + return Substitute("P: $0 usec, L: $1", + GetPhysicalValueMicros(timestamp), + GetLogicalValue(timestamp)); +} + +} // namespace server +} // namespace kudu diff --git a/src/kudu/server/hybrid_clock.h b/src/kudu/server/hybrid_clock.h new file mode 100644 index 000000000000..933c00c4b262 --- /dev/null +++ b/src/kudu/server/hybrid_clock.h @@ -0,0 +1,225 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_SERVER_HYBRID_CLOCK_H_ +#define KUDU_SERVER_HYBRID_CLOCK_H_ + +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/server/clock.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace server { + +// The HybridTime clock. +// +// HybridTime should not be used on a distributed cluster running on OS X hosts, +// since NTP clock error is not available. +class HybridClock : public Clock { + public: + HybridClock(); + + virtual Status Init() OVERRIDE; + + // Obtains the timestamp corresponding to the current time. + virtual Timestamp Now() OVERRIDE; + + // Obtains the timestamp corresponding to latest possible current + // time. + virtual Timestamp NowLatest() OVERRIDE; + + // Obtain a timestamp which is guaranteed to be later than the current time + // on any machine in the cluster. + // + // NOTE: this is not a very tight bound. + virtual Status GetGlobalLatest(Timestamp* t) OVERRIDE; + + // Updates the clock with a timestamp originating on another machine. 
+ virtual Status Update(const Timestamp& to_update) OVERRIDE; + + virtual void RegisterMetrics(const scoped_refptr& metric_entity) OVERRIDE; + + // HybridClock supports all external consistency modes. + virtual bool SupportsExternalConsistencyMode(ExternalConsistencyMode mode) OVERRIDE; + + // Blocks the caller thread until the true time is after 'then'. + // In other words, waits until the HybridClock::Now() on _all_ nodes + // will return a value greater than 'then'. + // + // The incoming time 'then' is assumed to be the latest time possible + // at the time the read was performed, i.e. 'then' = now + max_error. + // + // This method can be used to make Kudu behave like Spanner/TrueTime. + // This is implemented by possibly making the caller thread wait for a + // a certain period of time. + // + // As an example, the following cases might happen: + // + // 1 - 'then' is lower than now.earliest() -> Definitely in + // the past, no wait necessary. + // + // 2 - 'then' is greater than > now.earliest(): need to wait until + // 'then' <= now.earliest() + // + // Returns OK if it waited long enough or if no wait was necessary. + // + // Returns Status::ServiceUnavailable if the system clock was not + // synchronized and therefore it couldn't wait out the error. + // + // Returns Status::TimedOut() if 'deadline' will pass before the specified + // timestamp. NOTE: unlike most "wait" methods, this may return _immediately_ + // with a timeout, rather than actually waiting for the timeout to expire. + // This is because, by looking at the current clock, we can know how long + // we'll have to wait, in contrast to most Wait() methods which are waiting + // on some external condition to become true. + virtual Status WaitUntilAfter(const Timestamp& then, + const MonoTime& deadline) OVERRIDE; + + // Blocks the caller thread until the local time is after 'then'. 
+ // This is in contrast to the above method, which waits until the time + // on _all_ machines is past the given time. + // + // Returns Status::TimedOut() if 'deadline' will pass before the specified + // timestamp. NOTE: unlike most "wait" methods, this may return _immediately_ + // with a timeout. See WaitUntilAfter() for details. + virtual Status WaitUntilAfterLocally(const Timestamp& then, + const MonoTime& deadline) OVERRIDE; + + // Return true if the given time has passed (i.e any future call + // to Now() would return a higher value than t). + // + // NOTE: this only refers to the _local_ clock, and is not a guarantee + // that other nodes' clocks have definitely passed this timestamp. + // This is in contrast to WaitUntilAfter() above. + virtual bool IsAfter(Timestamp t) OVERRIDE; + + // Obtains the timestamp corresponding to the current time and the associated + // error in micros. This may fail if the clock is unsynchronized or synchronized + // but the error is too high and, since we can't do anything about it, + // LOG(FATAL)'s in that case. + void NowWithError(Timestamp* timestamp, uint64_t* max_error_usec); + + virtual std::string Stringify(Timestamp timestamp) OVERRIDE; + + // Static encoding/decoding methods for timestamps. Public mostly + // for testing/debugging purposes. + + // Returns the logical value embedded in 'timestamp' + static uint64_t GetLogicalValue(const Timestamp& timestamp); + + // Returns the physical value embedded in 'timestamp', in microseconds. + static uint64_t GetPhysicalValueMicros(const Timestamp& timestamp); + + // Obtains a new Timestamp with the logical value zeroed out. + static Timestamp TimestampFromMicroseconds(uint64_t micros); + + // Obtains a new Timestamp that embeds both the physical and logical values. 
+ static Timestamp TimestampFromMicrosecondsAndLogicalValue(uint64_t micros, + uint64_t logical_value); + + // Creates a new timestamp whose physical time is GetPhysicalValue(original) + + // 'micros_to_add' and which retains the same logical value. + static Timestamp AddPhysicalTimeToTimestamp(const Timestamp& original, + const MonoDelta& to_add); + + // Outputs a string containing the physical and logical values of the timestamp, + // separated. + static std::string StringifyTimestamp(const Timestamp& timestamp); + + // Sets the time to be returned by a mock call to the system clock, for tests. + // Requires that 'FLAGS_use_mock_wall_clock' is set to true and that 'now_usec' is less + // than the previously set time. + // NOTE: This refers to the time returned by the system clock, not the time returned + // by HybridClock, i.e. 'now_usec' is not a HybridTime timestmap and shouldn't have + // a logical component. + void SetMockClockWallTimeForTests(uint64_t now_usec); + + // Sets the max. error to be returned by a mock call to the system clock, for tests. + // Requires that 'FLAGS_use_mock_wall_clock' is set to true. + // This can be used to make HybridClock report the wall clock as unsynchronized, by + // setting error to be more than the configured tolerance. + void SetMockMaxClockErrorForTests(uint64_t max_error_usec); + + private: + + // Obtains the current wallclock time and maximum error in microseconds, + // and checks if the clock is synchronized. + // + // On OS X, the error will always be 0. + kudu::Status WalltimeWithError(uint64_t* now_usec, uint64_t* error_usec); + + // Used to get the timestamp for metrics. + uint64_t NowForMetrics(); + + // Used to get the current error, for metrics. + uint64_t ErrorForMetrics(); + + // Set by calls to SetMockClockWallTimeForTests(). + // For testing purposes only. + uint64_t mock_clock_time_usec_; + + // Set by calls to SetMockClockErrorForTests(). + // For testing purposes only. 
+ uint64_t mock_clock_max_error_usec_; + +#if !defined(__APPLE__) + uint64_t divisor_; +#endif + + double tolerance_adjustment_; + + mutable simple_spinlock lock_; + + // the last clock read/update, in microseconds. + uint64_t last_usec_; + // the next logical value to be assigned to a timestamp + uint64_t next_logical_; + + // How many bits to left shift a microseconds clock read. The remainder + // of the timestamp will be reserved for logical values. + static const int kBitsToShift; + + // Mask to extract the pure logical bits. + static const uint64_t kLogicalBitMask; + + static const uint64_t kNanosPerSec; + + // The scaling factor used to obtain ppms. From the adjtimex source: + // "scale factor used by adjtimex freq param. 1 ppm = 65536" + static const double kAdjtimexScalingFactor; + + enum State { + kNotInitialized, + kInitialized + }; + + State state_; + + // Clock metrics are set to detach to their last value. This means + // that, during our destructor, we'll need to access other class members + // declared above this. Hence, this member must be declared last. + FunctionGaugeDetacher metric_detacher_; +}; + +} // namespace server +} // namespace kudu + +#endif /* KUDU_SERVER_HYBRID_CLOCK_H_ */ diff --git a/src/kudu/server/logical_clock-test.cc b/src/kudu/server/logical_clock-test.cc new file mode 100644 index 000000000000..0a4b52f49d18 --- /dev/null +++ b/src/kudu/server/logical_clock-test.cc @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/server/logical_clock.h" +#include "kudu/util/monotime.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace server { + +class LogicalClockTest : public KuduTest { + public: + LogicalClockTest() + : clock_(LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp)) { + } + + protected: + scoped_refptr clock_; +}; + +// Test that two subsequent time reads are monotonically increasing. +TEST_F(LogicalClockTest, TestNow_ValuesIncreaseMonotonically) { + const Timestamp now1 = clock_->Now(); + const Timestamp now2 = clock_->Now(); + ASSERT_EQ(now1.value() + 1, now2.value()); +} + +// Tests that the clock gets updated if the incoming value is higher. +TEST_F(LogicalClockTest, TestUpdate_LogicalValueIncreasesByAmount) { + Timestamp initial = clock_->Now(); + Timestamp future(initial.value() + 10); + clock_->Update(future); + Timestamp now = clock_->Now(); + // now should be 1 after future + ASSERT_EQ(initial.value() + 11, now.value()); +} + +// Tests that the clock doesn't get updated if the incoming value is lower. 
+TEST_F(LogicalClockTest, TestUpdate_LogicalValueDoesNotIncrease) { + Timestamp ts(1); + // update the clock to 1, the initial value, should do nothing + clock_->Update(ts); + Timestamp now = clock_->Now(); + ASSERT_EQ(now.value(), 2); +} + +TEST_F(LogicalClockTest, TestWaitUntilAfterIsUnavailable) { + Status status = clock_->WaitUntilAfter( + Timestamp(10), MonoTime::Now(MonoTime::FINE)); + ASSERT_TRUE(status.IsServiceUnavailable()); +} + +TEST_F(LogicalClockTest, TestIsAfter) { + Timestamp ts1 = clock_->Now(); + ASSERT_TRUE(clock_->IsAfter(ts1)); + + // Update the clock in the future, make sure it still + // handles "IsAfter" properly even when it's running in + // "logical" mode. + Timestamp now_increased = Timestamp(1000); + ASSERT_OK(clock_->Update(now_increased)); + Timestamp ts2 = clock_->Now(); + + ASSERT_TRUE(clock_->IsAfter(ts1)); + ASSERT_TRUE(clock_->IsAfter(ts2)); +} + +} // namespace server +} // namespace kudu + diff --git a/src/kudu/server/logical_clock.cc b/src/kudu/server/logical_clock.cc new file mode 100644 index 000000000000..7f79d565fa1f --- /dev/null +++ b/src/kudu/server/logical_clock.cc @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/server/logical_clock.h" + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/bind.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace server { + +METRIC_DEFINE_gauge_uint64(server, logical_clock_timestamp, + "Logical Clock Timestamp", + kudu::MetricUnit::kUnits, + "Logical clock timestamp."); + +using base::subtle::Atomic64; +using base::subtle::Barrier_AtomicIncrement; +using base::subtle::NoBarrier_CompareAndSwap; + +Timestamp LogicalClock::Now() { + return Timestamp(Barrier_AtomicIncrement(&now_, 1)); +} + +Timestamp LogicalClock::NowLatest() { + return Now(); +} + +Status LogicalClock::Update(const Timestamp& to_update) { + DCHECK_NE(to_update.value(), Timestamp::kInvalidTimestamp.value()) + << "Updating the clock with an invalid timestamp"; + Atomic64 new_value = to_update.value(); + + while (true) { + Atomic64 current_value = NoBarrier_Load(&now_); + // if the incoming value is less than the current one, or we've failed the + // CAS because the current clock increased to higher than the incoming value, + // we can stop the loop now. 
+ if (new_value <= current_value) return Status::OK(); + // otherwise try a CAS + if (PREDICT_TRUE(NoBarrier_CompareAndSwap(&now_, current_value, new_value) + == current_value)) + break; + } + return Status::OK(); +} + +Status LogicalClock::WaitUntilAfter(const Timestamp& then, + const MonoTime& deadline) { + return Status::ServiceUnavailable( + "Logical clock does not support WaitUntilAfter()"); +} + +Status LogicalClock::WaitUntilAfterLocally(const Timestamp& then, + const MonoTime& deadline) { + if (IsAfter(then)) return Status::OK(); + return Status::ServiceUnavailable( + "Logical clock does not support WaitUntilAfterLocally()"); +} + +bool LogicalClock::IsAfter(Timestamp t) { + return base::subtle::Acquire_Load(&now_) >= t.value(); +} + +LogicalClock* LogicalClock::CreateStartingAt(const Timestamp& timestamp) { + // initialize at 'timestamp' - 1 so that the first output value is 'timestamp'. + return new LogicalClock(timestamp.value() - 1); +} + +uint64_t LogicalClock::NowForMetrics() { + // We don't want reading metrics to change the clock. + return NoBarrier_Load(&now_); +} + + +void LogicalClock::RegisterMetrics(const scoped_refptr& metric_entity) { + METRIC_logical_clock_timestamp.InstantiateFunctionGauge( + metric_entity, + Bind(&LogicalClock::NowForMetrics, Unretained(this))) + ->AutoDetachToLastValue(&metric_detacher_); +} + +string LogicalClock::Stringify(Timestamp timestamp) { + return strings::Substitute("L: $0", timestamp.ToUint64()); +} + +} // namespace server +} // namespace kudu + diff --git a/src/kudu/server/logical_clock.h b/src/kudu/server/logical_clock.h new file mode 100644 index 000000000000..4de22b556dfd --- /dev/null +++ b/src/kudu/server/logical_clock.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_SERVER_LOGICAL_CLOCK_H_ +#define KUDU_SERVER_LOGICAL_CLOCK_H_ + +#include + +#include "kudu/server/clock.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" + +namespace kudu { +class MonoDelta; +class MonoTime; +namespace server { + +// An implementation of Clock that behaves as a plain Lamport Clock. +// In a single node, single tablet, setting this generates exactly the +// same Timestamp sequence as the original MvccManager did, but it can be +// updated to make sure replicas generate new timestamps on becoming leader. +// This can be used as a deterministic timestamp generator that has the same +// consistency properties as a HybridTime clock. +// +// The Wait* methods are unavailable in this implementation and will +// return Status::ServiceUnavailable(). +// +// NOTE: this class is thread safe. +class LogicalClock : public Clock { + public: + + virtual Status Init() OVERRIDE { return Status::OK(); } + + virtual Timestamp Now() OVERRIDE; + + // In the logical clock this call is equivalent to Now(); + virtual Timestamp NowLatest() OVERRIDE; + + virtual Status Update(const Timestamp& to_update) OVERRIDE; + + // The Wait*() functions are not available for this clock. 
+ virtual Status WaitUntilAfter(const Timestamp& then, + const MonoTime& deadline) OVERRIDE; + virtual Status WaitUntilAfterLocally(const Timestamp& then, + const MonoTime& deadline) OVERRIDE; + + virtual bool IsAfter(Timestamp t) OVERRIDE; + + virtual void RegisterMetrics(const scoped_refptr& metric_entity) OVERRIDE; + + virtual std::string Stringify(Timestamp timestamp) OVERRIDE; + + // Logical clock doesn't support COMMIT_WAIT. + virtual bool SupportsExternalConsistencyMode(ExternalConsistencyMode mode) OVERRIDE { + return mode != COMMIT_WAIT; + } + + // Creates a logical clock whose first output value on a Now() call is 'timestamp'. + static LogicalClock* CreateStartingAt(const Timestamp& timestamp); + + private: + // Should use LogicalClock::CreateStartingAt() + explicit LogicalClock(Timestamp::val_type initial_time) : now_(initial_time) {} + + // Used to get the timestamp for metrics. + uint64_t NowForMetrics(); + + base::subtle::Atomic64 now_; + + FunctionGaugeDetacher metric_detacher_; +}; + +} // namespace server +} // namespace kudu + +#endif /* KUDU_SERVER_LOGICAL_CLOCK_H_ */ + diff --git a/src/kudu/server/metadata.h b/src/kudu/server/metadata.h new file mode 100644 index 000000000000..73ff98ce6184 --- /dev/null +++ b/src/kudu/server/metadata.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_METADATA_H +#define KUDU_TABLET_METADATA_H + +// This header left around temporarily to make people's lives +// easier after a refactor. We should clean this up and change +// include sites. + +#include "kudu/consensus/metadata.pb.h" + +#endif diff --git a/src/kudu/server/monitored_task.h b/src/kudu/server/monitored_task.h new file mode 100644 index 000000000000..06e78130ddfd --- /dev/null +++ b/src/kudu/server/monitored_task.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_MONITORED_TASK_H +#define KUDU_MONITORED_TASK_H + +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/monotime.h" + +namespace kudu { + +class MonitoredTask : public RefCountedThreadSafe { + public: + virtual ~MonitoredTask() {} + + enum State { + kStatePreparing, + kStateRunning, + kStateComplete, + kStateFailed, + kStateAborted, + }; + + // Abort the ongoing task. 
+ virtual void Abort() = 0; + + // Task State + virtual State state() const = 0; + + // Task Type Identifier + virtual std::string type_name() const = 0; + + // Task description + virtual std::string description() const = 0; + + // Task start time, may be !Initialized() + virtual MonoTime start_timestamp() const = 0; + + // Task completion time, may be !Initialized() + virtual MonoTime completion_timestamp() const = 0; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/server/pprof-path-handlers.cc b/src/kudu/server/pprof-path-handlers.cc new file mode 100644 index 000000000000..52a045acd5cc --- /dev/null +++ b/src/kudu/server/pprof-path-handlers.cc @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kudu/server/pprof-path-handlers.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/server/webserver.h" +#include "kudu/util/env.h" +#include "kudu/util/logging.h" +#include "kudu/util/monotime.h" +#include "kudu/util/spinlock_profiling.h" +#include "kudu/util/status.h" + +DECLARE_bool(enable_process_lifetime_heap_profiling); +DECLARE_string(heap_profile_path); + + +using std::endl; +using std::ifstream; +using std::ostringstream; +using std::string; +using std::stringstream; + +// GLog already implements symbolization. Just import their hidden symbol. +namespace google { +// Symbolizes a program counter. On success, returns true and write the +// symbol name to "out". The symbol name is demangled if possible +// (supports symbols generated by GCC 3.x or newer). Otherwise, +// returns false. +bool Symbolize(void *pc, char *out, int out_size); +} + +namespace kudu { + +const int PPROF_DEFAULT_SAMPLE_SECS = 30; // pprof default sample time in seconds. + +// pprof asks for the url /pprof/cmdline to figure out what application it's profiling. +// The server should respond by sending the executable path. +static void PprofCmdLineHandler(const Webserver::WebRequest& req, stringstream* output) { + string executable_path; + Env* env = Env::Default(); + WARN_NOT_OK(env->GetExecutablePath(&executable_path), "Failed to get executable path"); + *output << executable_path; +} + +// pprof asks for the url /pprof/heap to get heap information. 
This should be implemented +// by calling HeapProfileStart(filename), continue to do work, and then, some number of +// seconds later, call GetHeapProfile() followed by HeapProfilerStop(). +static void PprofHeapHandler(const Webserver::WebRequest& req, stringstream* output) { +#ifndef TCMALLOC_ENABLED + (*output) << "Heap profiling is not available without tcmalloc."; +#else + // Remote (on-demand) profiling is disabled if the process is already being profiled. + if (FLAGS_enable_process_lifetime_heap_profiling) { + (*output) << "Heap profiling is running for the process lifetime."; + return; + } + + auto it = req.parsed_args.find("seconds"); + int seconds = PPROF_DEFAULT_SAMPLE_SECS; + if (it != req.parsed_args.end()) { + seconds = atoi(it->second.c_str()); + } + + HeapProfilerStart(FLAGS_heap_profile_path.c_str()); + // Sleep to allow for some samples to be collected. + SleepFor(MonoDelta::FromSeconds(seconds)); + const char* profile = GetHeapProfile(); + HeapProfilerStop(); + (*output) << profile; + delete profile; +#endif +} + +// pprof asks for the url /pprof/profile?seconds=XX to get cpu-profiling information. +// The server should respond by calling ProfilerStart(), continuing to do its work, +// and then, XX seconds later, calling ProfilerStop(). +static void PprofCpuProfileHandler(const Webserver::WebRequest& req, stringstream* output) { +#ifndef TCMALLOC_ENABLED + (*output) << "CPU profiling is not available without tcmalloc."; +#else + auto it = req.parsed_args.find("seconds"); + int seconds = PPROF_DEFAULT_SAMPLE_SECS; + if (it != req.parsed_args.end()) { + seconds = atoi(it->second.c_str()); + } + // Build a temporary file name that is hopefully unique. 
+ string tmp_prof_file_name = strings::Substitute("/tmp/kudu_cpu_profile/$0.$1", getpid(), rand()); + ProfilerStart(tmp_prof_file_name.c_str()); + SleepFor(MonoDelta::FromSeconds(seconds)); + ProfilerStop(); + ifstream prof_file(tmp_prof_file_name.c_str(), std::ios::in); + if (!prof_file.is_open()) { + (*output) << "Unable to open cpu profile: " << tmp_prof_file_name; + return; + } + (*output) << prof_file.rdbuf(); + prof_file.close(); +#endif +} + +// pprof asks for the url /pprof/growth to get heap-profiling delta (growth) information. +// The server should respond by calling: +// MallocExtension::instance()->GetHeapGrowthStacks(&output); +static void PprofGrowthHandler(const Webserver::WebRequest& req, stringstream* output) { +#ifndef TCMALLOC_ENABLED + (*output) << "Growth profiling is not available without tcmalloc."; +#else + string heap_growth_stack; + MallocExtension::instance()->GetHeapGrowthStacks(&heap_growth_stack); + (*output) << heap_growth_stack; +#endif +} + +// Lock contention profiling +static void PprofContentionHandler(const Webserver::WebRequest& req, stringstream* output) { + string secs_str = FindWithDefault(req.parsed_args, "seconds", ""); + int32_t seconds = ParseLeadingInt32Value(secs_str.c_str(), PPROF_DEFAULT_SAMPLE_SECS); + int64_t discarded_samples = 0; + + *output << "--- contention" << endl; + *output << "sampling period = 1" << endl; + *output << "cycles/second = " << base::CyclesPerSecond() << endl; + + MonoTime end = MonoTime::Now(MonoTime::FINE); + end.AddDelta(MonoDelta::FromSeconds(seconds)); + StartSynchronizationProfiling(); + while (MonoTime::Now(MonoTime::FINE).ComesBefore(end)) { + SleepFor(MonoDelta::FromMilliseconds(500)); + FlushSynchronizationProfile(output, &discarded_samples); + } + StopSynchronizationProfiling(); + FlushSynchronizationProfile(output, &discarded_samples); + + // pprof itself ignores this value, but we can at least look at it in the textual + // output. 
+ *output << "discarded samples = " << discarded_samples << std::endl; + +#if defined(__linux__) + // procfs only exists on Linux. + faststring maps; + ReadFileToString(Env::Default(), "/proc/self/maps", &maps); + *output << maps.ToString(); +#endif // defined(__linux__) +} + + +// pprof asks for the url /pprof/symbol to map from hex addresses to variable names. +// When the server receives a GET request for /pprof/symbol, it should return a line +// formatted like: num_symbols: ### +// where ### is the number of symbols found in the binary. For now, the only important +// distinction is whether the value is 0, which it is for executables that lack debug +// information, or not-0). +// +// In addition to the GET request for this url, the server must accept POST requests. +// This means that after the HTTP headers, pprof will pass in a list of hex addresses +// connected by +, like: +// curl -d '0x0824d061+0x0824d1cf' http://remote_host:80/pprof/symbol +// The server should read the POST data, which will be in one line, and for each hex value +// should write one line of output to the output stream, like so: +// +// For instance: +// 0x08b2dabd _Update +static void PprofSymbolHandler(const Webserver::WebRequest& req, stringstream* output) { + if (req.request_method == "GET") { + // Per the above comment, pprof doesn't expect to know the actual number of symbols. + // Any non-zero value indicates that we support symbol lookup. + (*output) << "num_symbols: 1"; + return; + } + + int missing_symbols = 0; + int invalid_addrs = 0; + + // Symbolization request. 
+ vector pieces = strings::Split(req.post_data, "+"); + for (StringPiece p : pieces) { + string hex_addr; + if (!TryStripPrefixString(p, "0x", &hex_addr)) { + invalid_addrs++; + continue; + } + uint64_t addr; + if (!safe_strtou64_base(hex_addr.c_str(), &addr, 16)) { + invalid_addrs++; + continue; + } + char symbol_buf[1024]; + if (google::Symbolize(reinterpret_cast(addr), symbol_buf, sizeof(symbol_buf))) { + *output << p << "\t" << symbol_buf << std::endl; + } else { + missing_symbols++; + } + } + + LOG(INFO) << strings::Substitute( + "Handled request for /pprof/symbol: requested=$0 invalid_addrs=$1 missing=$2", + pieces.size(), invalid_addrs, missing_symbols); +} + +void AddPprofPathHandlers(Webserver* webserver) { + // Path handlers for remote pprof profiling. For information see: + // https://gperftools.googlecode.com/svn/trunk/doc/pprof_remote_servers.html + webserver->RegisterPathHandler("/pprof/cmdline", "", PprofCmdLineHandler, false, false); + webserver->RegisterPathHandler("/pprof/heap", "", PprofHeapHandler, false, false); + webserver->RegisterPathHandler("/pprof/growth", "", PprofGrowthHandler, false, false); + webserver->RegisterPathHandler("/pprof/profile", "", PprofCpuProfileHandler, false, false); + webserver->RegisterPathHandler("/pprof/symbol", "", PprofSymbolHandler, false, false); + webserver->RegisterPathHandler("/pprof/contention", "", PprofContentionHandler, false, false); +} + +} // namespace kudu diff --git a/src/kudu/server/pprof-path-handlers.h b/src/kudu/server/pprof-path-handlers.h new file mode 100644 index 000000000000..94f914b3e453 --- /dev/null +++ b/src/kudu/server/pprof-path-handlers.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef KUDU_SERVER_PPROF_DEFAULT_PATH_HANDLERS_H +#define KUDU_SERVER_PPROF_DEFAULT_PATH_HANDLERS_H + +namespace kudu { +class Webserver; + +// Adds set of path handlers to support pprof profiling of a remote server. +void AddPprofPathHandlers(Webserver* webserver); +} + +#endif // KUDU_SERVER_PPROF_DEFAULT_PATH_HANDLERS_H diff --git a/src/kudu/server/rpc_server.cc b/src/kudu/server/rpc_server.cc new file mode 100644 index 000000000000..aab4220ea203 --- /dev/null +++ b/src/kudu/server/rpc_server.cc @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include + +#include "kudu/gutil/casts.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/acceptor_pool.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/service_if.h" +#include "kudu/rpc/service_pool.h" +#include "kudu/server/rpc_server.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +using kudu::rpc::AcceptorPool; +using kudu::rpc::Messenger; +using kudu::rpc::ServiceIf; +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; + +DEFINE_string(rpc_bind_addresses, "0.0.0.0", + "Comma-separated list of addresses to bind to for RPC connections. " + "Currently, ephemeral ports (i.e. 
port 0) are not allowed."); +TAG_FLAG(rpc_bind_addresses, stable); + +DEFINE_int32(rpc_num_acceptors_per_address, 1, + "Number of RPC acceptor threads for each bound address"); +TAG_FLAG(rpc_num_acceptors_per_address, advanced); + +DEFINE_int32(rpc_num_service_threads, 10, + "Number of RPC worker threads to run"); +TAG_FLAG(rpc_num_service_threads, advanced); + +DEFINE_int32(rpc_service_queue_length, 50, + "Default length of queue for incoming RPC requests"); +TAG_FLAG(rpc_service_queue_length, advanced); + +DEFINE_bool(rpc_server_allow_ephemeral_ports, false, + "Allow binding to ephemeral ports. This can cause problems, so currently " + "only allowed in tests."); +TAG_FLAG(rpc_server_allow_ephemeral_ports, unsafe); + +namespace kudu { + +RpcServerOptions::RpcServerOptions() + : rpc_bind_addresses(FLAGS_rpc_bind_addresses), + num_acceptors_per_address(FLAGS_rpc_num_acceptors_per_address), + num_service_threads(FLAGS_rpc_num_service_threads), + default_port(0), + service_queue_length(FLAGS_rpc_service_queue_length) { +} + +RpcServer::RpcServer(RpcServerOptions opts) + : server_state_(UNINITIALIZED), options_(std::move(opts)) {} + +RpcServer::~RpcServer() { + Shutdown(); +} + +string RpcServer::ToString() const { + // TODO: include port numbers, etc. + return "RpcServer"; +} + +Status RpcServer::Init(const shared_ptr& messenger) { + CHECK_EQ(server_state_, UNINITIALIZED); + messenger_ = messenger; + + RETURN_NOT_OK(ParseAddressList(options_.rpc_bind_addresses, + options_.default_port, + &rpc_bind_addresses_)); + for (const Sockaddr& addr : rpc_bind_addresses_) { + if (IsPrivilegedPort(addr.port())) { + LOG(WARNING) << "May be unable to bind to privileged port for address " + << addr.ToString(); + } + + // Currently, we can't support binding to ephemeral ports outside of + // unit tests, because consensus caches RPC ports of other servers + // across restarts. See KUDU-334. 
+ if (addr.port() == 0 && !FLAGS_rpc_server_allow_ephemeral_ports) { + LOG(FATAL) << "Binding to ephemeral ports not supported (RPC address " + << "configured to " << addr.ToString() << ")"; + } + } + + server_state_ = INITIALIZED; + return Status::OK(); +} + +Status RpcServer::RegisterService(gscoped_ptr service) { + CHECK(server_state_ == INITIALIZED || + server_state_ == BOUND) << "bad state: " << server_state_; + const scoped_refptr& metric_entity = messenger_->metric_entity(); + string service_name = service->service_name(); + scoped_refptr service_pool = + new rpc::ServicePool(service.Pass(), metric_entity, options_.service_queue_length); + RETURN_NOT_OK(service_pool->Init(options_.num_service_threads)); + RETURN_NOT_OK(messenger_->RegisterService(service_name, service_pool)); + return Status::OK(); +} + +Status RpcServer::Bind() { + CHECK_EQ(server_state_, INITIALIZED); + + // Create the Acceptor pools (one per bind address) + vector > new_acceptor_pools; + // Create the AcceptorPool for each bind address. + for (const Sockaddr& bind_addr : rpc_bind_addresses_) { + shared_ptr pool; + RETURN_NOT_OK(messenger_->AddAcceptorPool( + bind_addr, + &pool)); + new_acceptor_pools.push_back(pool); + } + acceptor_pools_.swap(new_acceptor_pools); + + server_state_ = BOUND; + return Status::OK(); +} + +Status RpcServer::Start() { + if (server_state_ == INITIALIZED) { + RETURN_NOT_OK(Bind()); + } + CHECK_EQ(server_state_, BOUND); + server_state_ = STARTED; + + for (const shared_ptr& pool : acceptor_pools_) { + RETURN_NOT_OK(pool->Start(options_.num_acceptors_per_address)); + } + + vector bound_addrs; + RETURN_NOT_OK(GetBoundAddresses(&bound_addrs)); + string bound_addrs_str; + for (const Sockaddr& bind_addr : bound_addrs) { + if (!bound_addrs_str.empty()) bound_addrs_str += ", "; + bound_addrs_str += bind_addr.ToString(); + } + LOG(INFO) << "RPC server started. 
Bound to: " << bound_addrs_str; + + return Status::OK(); +} + +void RpcServer::Shutdown() { + for (const shared_ptr& pool : acceptor_pools_) { + pool->Shutdown(); + } + acceptor_pools_.clear(); + + if (messenger_) { + WARN_NOT_OK(messenger_->UnregisterAllServices(), "Unable to unregister our services"); + } +} + +Status RpcServer::GetBoundAddresses(vector* addresses) const { + CHECK(server_state_ == BOUND || + server_state_ == STARTED) << "bad state: " << server_state_; + for (const shared_ptr& pool : acceptor_pools_) { + Sockaddr bound_addr; + RETURN_NOT_OK_PREPEND(pool->GetBoundAddress(&bound_addr), + "Unable to get bound address from AcceptorPool"); + addresses->push_back(bound_addr); + } + return Status::OK(); +} + +const rpc::ServicePool* RpcServer::service_pool(const string& service_name) const { + return down_cast(messenger_->rpc_service(service_name).get()); +} + +} // namespace kudu diff --git a/src/kudu/server/rpc_server.h b/src/kudu/server/rpc_server.h new file mode 100644 index 000000000000..1d201b1018ee --- /dev/null +++ b/src/kudu/server/rpc_server.h @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_RPC_SERVER_H +#define KUDU_RPC_SERVER_H + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/service_pool.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +namespace kudu { + +namespace rpc { +class AcceptorPool; +class Messenger; +class ServiceIf; +} // namespace rpc + +struct RpcServerOptions { + RpcServerOptions(); + + std::string rpc_bind_addresses; + uint32_t num_acceptors_per_address; + uint32_t num_service_threads; + uint16_t default_port; + size_t service_queue_length; +}; + +class RpcServer { + public: + explicit RpcServer(RpcServerOptions opts); + ~RpcServer(); + + Status Init(const std::shared_ptr& messenger); + // Services need to be registered after Init'ing, but before Start'ing. + // The service's ownership will be given to a ServicePool. + Status RegisterService(gscoped_ptr service); + Status Bind(); + Status Start(); + void Shutdown(); + + std::string ToString() const; + + // Return the addresses that this server has successfully + // bound to. Requires that the server has been Start()ed. + Status GetBoundAddresses(std::vector* addresses) const WARN_UNUSED_RESULT; + + const rpc::ServicePool* service_pool(const std::string& service_name) const; + + private: + enum ServerState { + // Default state when the rpc server is constructed. + UNINITIALIZED, + // State after Init() was called. + INITIALIZED, + // State after Bind(). + BOUND, + // State after Start() was called. + STARTED + }; + ServerState server_state_; + + const RpcServerOptions options_; + std::shared_ptr messenger_; + + // Parsed addresses to bind RPC to. 
Set by Init() + std::vector rpc_bind_addresses_; + + std::vector > acceptor_pools_; + + DISALLOW_COPY_AND_ASSIGN(RpcServer); +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/server/rpcz-path-handler.cc b/src/kudu/server/rpcz-path-handler.cc new file mode 100644 index 000000000000..9df2132f909e --- /dev/null +++ b/src/kudu/server/rpcz-path-handler.cc @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/server/rpcz-path-handler.h" + +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc_introspection.pb.h" +#include "kudu/server/webserver.h" + +using kudu::rpc::DumpRunningRpcsRequestPB; +using kudu::rpc::DumpRunningRpcsResponsePB; +using kudu::rpc::Messenger; +using std::shared_ptr; +using std::stringstream; + +namespace kudu { + +namespace { + +void RpczPathHandler(const shared_ptr& messenger, + const Webserver::WebRequest& req, stringstream* output) { + DumpRunningRpcsRequestPB dump_req; + DumpRunningRpcsResponsePB dump_resp; + + string arg = FindWithDefault(req.parsed_args, "include_traces", "false"); + dump_req.set_include_traces(ParseLeadingBoolValue(arg.c_str(), false)); + + messenger->DumpRunningRpcs(dump_req, &dump_resp); + + JsonWriter writer(output, JsonWriter::PRETTY); + writer.Protobuf(dump_resp); +} + +} // anonymous namespace + +void AddRpczPathHandlers(const shared_ptr& messenger, Webserver* webserver) { + webserver->RegisterPathHandler("/rpcz", "RPCs", + boost::bind(RpczPathHandler, messenger, _1, _2), + false, true); +} + +} // namespace kudu diff --git a/src/kudu/server/rpcz-path-handler.h b/src/kudu/server/rpcz-path-handler.h new file mode 100644 index 000000000000..388c78cfcc21 --- /dev/null +++ b/src/kudu/server/rpcz-path-handler.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_RPCZ_PATH_HANDLER_H +#define KUDU_SERVER_RPCZ_PATH_HANDLER_H + +#include + +namespace kudu { + +namespace rpc { +class Messenger; +} // namespace rpc + +class Webserver; + +void AddRpczPathHandlers(const std::shared_ptr& messenger, + Webserver* webserver); + +} // namespace kudu +#endif /* KUDU_SERVER_RPCZ_PATH_HANDLER_H */ diff --git a/src/kudu/server/server_base.cc b/src/kudu/server/server_base.cc new file mode 100644 index 000000000000..edcf56f70cf6 --- /dev/null +++ b/src/kudu/server/server_base.cc @@ -0,0 +1,333 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/server/server_base.h" + +#include +#include +#include +#include + +#include "kudu/codegen/compilation_manager.h" +#include "kudu/common/wire_protocol.pb.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/default-path-handlers.h" +#include "kudu/server/generic_service.h" +#include "kudu/server/glog_metrics.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/server/logical_clock.h" +#include "kudu/server/rpc_server.h" +#include "kudu/server/tcmalloc_metrics.h" +#include "kudu/server/webserver.h" +#include "kudu/server/rpcz-path-handler.h" +#include "kudu/server/server_base_options.h" +#include "kudu/server/server_base.pb.h" +#include "kudu/server/tracing-path-handlers.h" +#include "kudu/util/atomic.h" +#include "kudu/util/env.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/rolling_log.h" +#include "kudu/util/spinlock_profiling.h" +#include "kudu/util/thread.h" +#include "kudu/util/version_info.h" + +DEFINE_int32(num_reactor_threads, 4, "Number of libev reactor threads to start."); +TAG_FLAG(num_reactor_threads, advanced); + +DECLARE_bool(use_hybrid_clock); + +using std::shared_ptr; +using std::string; +using std::stringstream; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace server { + +namespace { + +// Disambiguates between servers when in a minicluster. 
+AtomicInt mem_tracker_id_counter(-1); + +shared_ptr CreateMemTrackerForServer() { + int32_t id = mem_tracker_id_counter.Increment(); + string id_str = "server"; + if (id != 0) { + StrAppend(&id_str, " ", id); + } + return shared_ptr(MemTracker::CreateTracker(-1, id_str)); +} + +} // anonymous namespace + +ServerBase::ServerBase(string name, const ServerBaseOptions& options, + const string& metric_namespace) + : name_(std::move(name)), + mem_tracker_(CreateMemTrackerForServer()), + metric_registry_(new MetricRegistry()), + metric_entity_(METRIC_ENTITY_server.Instantiate(metric_registry_.get(), + metric_namespace)), + rpc_server_(new RpcServer(options.rpc_opts)), + web_server_(new Webserver(options.webserver_opts)), + is_first_run_(false), + options_(options), + stop_metrics_logging_latch_(1) { + FsManagerOpts fs_opts; + fs_opts.metric_entity = metric_entity_; + fs_opts.parent_mem_tracker = mem_tracker_; + fs_opts.wal_path = options.fs_opts.wal_path; + fs_opts.data_paths = options.fs_opts.data_paths; + fs_manager_.reset(new FsManager(options.env, fs_opts)); + + if (FLAGS_use_hybrid_clock) { + clock_ = new HybridClock(); + } else { + clock_ = LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp); + } + + CHECK_OK(StartThreadInstrumentation(metric_entity_, web_server_.get())); + CHECK_OK(codegen::CompilationManager::GetSingleton()->StartInstrumentation( + metric_entity_)); +} + +ServerBase::~ServerBase() { + Shutdown(); + mem_tracker_->UnregisterFromParent(); +} + +Sockaddr ServerBase::first_rpc_address() const { + vector addrs; + WARN_NOT_OK(rpc_server_->GetBoundAddresses(&addrs), + "Couldn't get bound RPC address"); + CHECK(!addrs.empty()) << "Not bound"; + return addrs[0]; +} + +Sockaddr ServerBase::first_http_address() const { + vector addrs; + WARN_NOT_OK(web_server_->GetBoundAddresses(&addrs), + "Couldn't get bound webserver addresses"); + CHECK(!addrs.empty()) << "Not bound"; + return addrs[0]; +} + +const NodeInstancePB& ServerBase::instance_pb() const 
{ + return *DCHECK_NOTNULL(instance_pb_.get()); +} + +void ServerBase::GenerateInstanceID() { + instance_pb_.reset(new NodeInstancePB); + instance_pb_->set_permanent_uuid(fs_manager_->uuid()); + // TODO: maybe actually bump a sequence number on local disk instead of + // using time. + instance_pb_->set_instance_seqno(Env::Default()->NowMicros()); +} + +Status ServerBase::Init() { + glog_metrics_.reset(new ScopedGLogMetrics(metric_entity_)); + tcmalloc::RegisterMetrics(metric_entity_); + RegisterSpinLockContentionMetrics(metric_entity_); + + InitSpinLockContentionProfiling(); + + // Initialize the clock immediately. This checks that the clock is synchronized + // so we're less likely to get into a partially initialized state on disk during startup + // if we're having clock problems. + RETURN_NOT_OK_PREPEND(clock_->Init(), "Cannot initialize clock"); + + Status s = fs_manager_->Open(); + if (s.IsNotFound()) { + LOG(INFO) << "Could not load existing FS layout: " << s.ToString(); + LOG(INFO) << "Creating new FS layout"; + is_first_run_ = true; + RETURN_NOT_OK_PREPEND(fs_manager_->CreateInitialFileSystemLayout(), + "Could not create new FS layout"); + s = fs_manager_->Open(); + } + RETURN_NOT_OK_PREPEND(s, "Failed to load FS layout"); + + // Create the Messenger. 
+ rpc::MessengerBuilder builder(name_); + + builder.set_num_reactors(FLAGS_num_reactor_threads); + builder.set_metric_entity(metric_entity()); + RETURN_NOT_OK(builder.Build(&messenger_)); + + RETURN_NOT_OK(rpc_server_->Init(messenger_)); + RETURN_NOT_OK(rpc_server_->Bind()); + clock_->RegisterMetrics(metric_entity_); + + RETURN_NOT_OK_PREPEND(StartMetricsLogging(), "Could not enable metrics logging"); + + return Status::OK(); +} + +void ServerBase::GetStatusPB(ServerStatusPB* status) const { + // Node instance + status->mutable_node_instance()->CopyFrom(*instance_pb_); + + // RPC ports + { + vector addrs; + CHECK_OK(rpc_server_->GetBoundAddresses(&addrs)); + for (const Sockaddr& addr : addrs) { + HostPortPB* pb = status->add_bound_rpc_addresses(); + pb->set_host(addr.host()); + pb->set_port(addr.port()); + } + } + + // HTTP ports + { + vector addrs; + CHECK_OK(web_server_->GetBoundAddresses(&addrs)); + for (const Sockaddr& addr : addrs) { + HostPortPB* pb = status->add_bound_http_addresses(); + pb->set_host(addr.host()); + pb->set_port(addr.port()); + } + } + + VersionInfo::GetVersionInfoPB(status->mutable_version_info()); +} + +Status ServerBase::DumpServerInfo(const string& path, + const string& format) const { + ServerStatusPB status; + GetStatusPB(&status); + + if (boost::iequals(format, "json")) { + string json = JsonWriter::ToJson(status, JsonWriter::PRETTY); + RETURN_NOT_OK(WriteStringToFile(options_.env, Slice(json), path)); + } else if (boost::iequals(format, "pb")) { + // TODO: Use PB container format? 
+ RETURN_NOT_OK(pb_util::WritePBToPath(options_.env, path, status, + pb_util::NO_SYNC)); // durability doesn't matter + } else { + return Status::InvalidArgument("bad format", format); + } + + LOG(INFO) << "Dumped server information to " << path; + return Status::OK(); +} + +Status ServerBase::RegisterService(gscoped_ptr rpc_impl) { + return rpc_server_->RegisterService(rpc_impl.Pass()); +} + +Status ServerBase::StartMetricsLogging() { + if (options_.metrics_log_interval_ms <= 0) { + return Status::OK(); + } + + return Thread::Create("server", "metrics-logger", &ServerBase::MetricsLoggingThread, + this, &metrics_logging_thread_); +} + +void ServerBase::MetricsLoggingThread() { + RollingLog log(Env::Default(), FLAGS_log_dir, "metrics"); + + // How long to wait before trying again if we experience a failure + // logging metrics. + const MonoDelta kWaitBetweenFailures = MonoDelta::FromSeconds(60); + + + MonoTime next_log = MonoTime::Now(MonoTime::FINE); + while (!stop_metrics_logging_latch_.WaitUntil(next_log)) { + next_log = MonoTime::Now(MonoTime::FINE); + next_log.AddDelta(MonoDelta::FromMilliseconds(options_.metrics_log_interval_ms)); + + std::stringstream buf; + buf << "metrics " << GetCurrentTimeMicros() << " "; + + // Collect the metrics JSON string. + vector metrics; + metrics.push_back("*"); + MetricJsonOptions opts; + opts.include_raw_histograms = true; + + JsonWriter writer(&buf, JsonWriter::COMPACT); + Status s = metric_registry_->WriteAsJson(&writer, metrics, opts); + if (!s.ok()) { + WARN_NOT_OK(s, "Unable to collect metrics to log"); + next_log.AddDelta(kWaitBetweenFailures); + continue; + } + + buf << "\n"; + + s = log.Append(buf.str()); + if (!s.ok()) { + WARN_NOT_OK(s, "Unable to write metrics to log"); + next_log.AddDelta(kWaitBetweenFailures); + continue; + } + } + + WARN_NOT_OK(log.Close(), "Unable to close metric log"); +} + +std::string ServerBase::FooterHtml() const { + return Substitute("
$0\nserver uuid $1
", + VersionInfo::GetShortVersionString(), + instance_pb_->permanent_uuid()); +} + +Status ServerBase::Start() { + GenerateInstanceID(); + + RETURN_NOT_OK(RegisterService(make_gscoped_ptr( + new GenericServiceImpl(this)))); + + RETURN_NOT_OK(rpc_server_->Start()); + + AddDefaultPathHandlers(web_server_.get()); + AddRpczPathHandlers(messenger_, web_server_.get()); + RegisterMetricsJsonHandler(web_server_.get(), metric_registry_.get()); + TracingPathHandlers::RegisterHandlers(web_server_.get()); + web_server_->set_footer_html(FooterHtml()); + RETURN_NOT_OK(web_server_->Start()); + + if (!options_.dump_info_path.empty()) { + RETURN_NOT_OK_PREPEND(DumpServerInfo(options_.dump_info_path, options_.dump_info_format), + "Failed to dump server info to " + options_.dump_info_path); + } + + return Status::OK(); +} + +void ServerBase::Shutdown() { + if (metrics_logging_thread_) { + stop_metrics_logging_latch_.CountDown(); + metrics_logging_thread_->Join(); + } + web_server_->Stop(); + rpc_server_->Shutdown(); +} + +} // namespace server +} // namespace kudu diff --git a/src/kudu/server/server_base.h b/src/kudu/server/server_base.h new file mode 100644 index 000000000000..ec26ce9acf08 --- /dev/null +++ b/src/kudu/server/server_base.h @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_SERVER_BASE_H +#define KUDU_SERVER_SERVER_BASE_H + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/rpc/service_if.h" +#include "kudu/server/server_base_options.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Env; +class FsManager; +class MemTracker; +class MetricEntity; +class MetricRegistry; +class NodeInstancePB; +class RpcServer; +class ScopedGLogMetrics; +class Sockaddr; +class Thread; +class Webserver; + +namespace rpc { +class Messenger; +class ServiceIf; +} // namespace rpc + +namespace server { +class Clock; + +struct ServerBaseOptions; +class ServerStatusPB; + +// Base class for tablet server and master. +// Handles starting and stopping the RPC server and web server, +// and provides a common interface for server-type-agnostic functions. +class ServerBase { + public: + const RpcServer *rpc_server() const { return rpc_server_.get(); } + const Webserver *web_server() const { return web_server_.get(); } + const std::shared_ptr& messenger() const { return messenger_; } + + // Return the first RPC address that this server has bound to. + // FATALs if the server is not started. + Sockaddr first_rpc_address() const; + + // Return the first HTTP address that this server has bound to. + // FATALs if the server is not started. + Sockaddr first_http_address() const; + + FsManager* fs_manager() { return fs_manager_.get(); } + + // Return the instance identifier of this server. + // This may not be called until after the server is Initted. + const NodeInstancePB& instance_pb() const; + + const std::shared_ptr& mem_tracker() const { return mem_tracker_; } + + const scoped_refptr& metric_entity() const { return metric_entity_; } + + MetricRegistry* metric_registry() { return metric_registry_.get(); } + + // Returns this server's clock. 
+ Clock* clock() { return clock_.get(); } + + // Return a PB describing the status of the server (version info, bound ports, etc) + void GetStatusPB(ServerStatusPB* status) const; + + protected: + ServerBase(std::string name, const ServerBaseOptions& options, + const std::string& metrics_namespace); + virtual ~ServerBase(); + + Status Init(); + Status RegisterService(gscoped_ptr rpc_impl); + Status Start(); + void Shutdown(); + + const std::string name_; + + std::shared_ptr mem_tracker_; + gscoped_ptr metric_registry_; + scoped_refptr metric_entity_; + gscoped_ptr fs_manager_; + gscoped_ptr rpc_server_; + gscoped_ptr web_server_; + std::shared_ptr messenger_; + bool is_first_run_; + + scoped_refptr clock_; + + // The instance identifier of this server. + gscoped_ptr instance_pb_; + + private: + void GenerateInstanceID(); + Status DumpServerInfo(const std::string& path, + const std::string& format) const; + Status StartMetricsLogging(); + void MetricsLoggingThread(); + std::string FooterHtml() const; + + ServerBaseOptions options_; + + scoped_refptr metrics_logging_thread_; + CountDownLatch stop_metrics_logging_latch_; + + gscoped_ptr glog_metrics_; + + DISALLOW_COPY_AND_ASSIGN(ServerBase); +}; + +} // namespace server +} // namespace kudu +#endif /* KUDU_SERVER_SERVER_BASE_H */ diff --git a/src/kudu/server/server_base.proto b/src/kudu/server/server_base.proto new file mode 100644 index 000000000000..cf69a2d9cb14 --- /dev/null +++ b/src/kudu/server/server_base.proto @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.server; + +option java_package = "org.kududb.server"; + +import "kudu/common/common.proto"; +import "kudu/common/wire_protocol.proto"; +import "kudu/util/version_info.proto"; + +// The status information dumped by a server after it starts. +// +// This is optionally exposed on the local host in a text file which the server writes out +// at startup. +// +// Additionally, it is exposed via RPC through the GenericService interface. +message ServerStatusPB { + required NodeInstancePB node_instance = 1; + repeated HostPortPB bound_rpc_addresses = 2; + repeated HostPortPB bound_http_addresses = 3; + optional VersionInfoPB version_info = 4; +} + +// Attempt to set a command line flag. +// Note that many command line flags do not take effect if changed +// at runtime. +// +// TODO: We plan to add tags indicating which flags can be modified at +// runtime. For now, this is an advanced option. +message SetFlagRequestPB { + required string flag = 1; + required string value = 2; + + // Force the change, even if the flag is not marked as safe to change + // at runtime. This can cause crashes or other bad behavior, so should + // only be used as a last resort. + optional bool force = 3 [default = false]; +} + +message SetFlagResponsePB { + enum Code { + UNKNOWN = 0; + SUCCESS = 1; + NO_SUCH_FLAG = 2; + BAD_VALUE = 3; + + // The flag is not safe to change at runtime without the 'force' flag. + NOT_SAFE = 4; + } + + required Code result = 1; + + // A string describing the new value that the option has been set to. 
+ // This passes through the return value of SetCommandLineOption() from + // gflags, which doesn't specify anything about the format of this message. + // + // Using 'result' above is more reliable. + optional string msg = 2; + + // If the flag exists, the prior value of the flag. This is set even in the + // case of BAD_VALUE. + optional string old_value = 3; +} + +// Attempt to flush coverage information to disk, if running a coverage build. +message FlushCoverageRequestPB { +} +message FlushCoverageResponsePB { + // If the current build is not a coverage build, returns false. + optional bool success = 1; +} + +// Requests the server's current timestamp. +message ServerClockRequestPB { +} +message ServerClockResponsePB { + // The current timestamp of the server. + optional fixed64 timestamp = 1; +} + +// Requests the server's status and version info +message GetStatusRequestPB { +} +message GetStatusResponsePB { + required ServerStatusPB status = 1; +} + +// Makes the HybridClock of the server use these values for wall clock time and error, +// for testing purposes. +// Requires that the server was started with '--use_mock_wall_clock=true'. +message SetServerWallClockForTestsRequestPB { + optional uint64 now_usec = 1; + optional uint64 max_error_usec = 2; +} + +// Response corresponding to the request above. +message SetServerWallClockForTestsResponsePB { + // Set to 'true' if the clock was updated successfully. 
+ required bool success = 1; +} + +service GenericService { + rpc SetFlag(SetFlagRequestPB) + returns (SetFlagResponsePB); + + rpc FlushCoverage(FlushCoverageRequestPB) + returns (FlushCoverageResponsePB); + + rpc ServerClock(ServerClockRequestPB) + returns (ServerClockResponsePB); + + rpc SetServerWallClockForTests(SetServerWallClockForTestsRequestPB) + returns (SetServerWallClockForTestsResponsePB); + + rpc GetStatus(GetStatusRequestPB) + returns (GetStatusResponsePB); +} diff --git a/src/kudu/server/server_base_options.cc b/src/kudu/server/server_base_options.cc new file mode 100644 index 000000000000..6f7c2b477614 --- /dev/null +++ b/src/kudu/server/server_base_options.cc @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/server/server_base_options.h" + +#include +#include "kudu/util/flag_tags.h" + +namespace kudu { +namespace server { + +DEFINE_string(server_dump_info_path, "", + "Path into which the server information will be " + "dumped after startup. The dumped data is described by " + "ServerStatusPB in server_base.proto. The dump format is " + "determined by --server_dump_info_format"); +DEFINE_string(server_dump_info_format, "json", + "Format for --server_dump_info_path. 
This may be either " + "'pb' or 'json'."); +TAG_FLAG(server_dump_info_path, hidden); +TAG_FLAG(server_dump_info_format, hidden); + +DEFINE_int32(metrics_log_interval_ms, 0, + "Interval (in milliseconds) at which the server will dump its " + "metrics to a local log file. The log files are located in the same " + "directory as specified by the -log_dir flag. If this is not a positive " + "value, then metrics logging will be disabled."); +TAG_FLAG(metrics_log_interval_ms, advanced); + +ServerBaseOptions::ServerBaseOptions() + : env(Env::Default()), + dump_info_path(FLAGS_server_dump_info_path), + dump_info_format(FLAGS_server_dump_info_format), + metrics_log_interval_ms(FLAGS_metrics_log_interval_ms) { +} + +} // namespace server +} // namespace kudu diff --git a/src/kudu/server/server_base_options.h b/src/kudu/server/server_base_options.h new file mode 100644 index 000000000000..7c665d73a661 --- /dev/null +++ b/src/kudu/server/server_base_options.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_SERVER_SERVER_BASE_OPTIONS_H +#define KUDU_SERVER_SERVER_BASE_OPTIONS_H + +#include +#include + +#include "kudu/fs/fs_manager.h" +#include "kudu/server/webserver_options.h" +#include "kudu/server/rpc_server.h" + +namespace kudu { + +class Env; + +namespace server { + +// Options common to both types of servers. +// The subclass constructor should fill these in with defaults from +// server-specific flags. +struct ServerBaseOptions { + Env* env; + + FsManagerOpts fs_opts; + RpcServerOptions rpc_opts; + WebserverOptions webserver_opts; + + std::string dump_info_path; + std::string dump_info_format; + + int32_t metrics_log_interval_ms; + + protected: + ServerBaseOptions(); +}; + +} // namespace server +} // namespace kudu +#endif /* KUDU_SERVER_SERVER_BASE_OPTIONS_H */ diff --git a/src/kudu/server/tcmalloc_metrics.cc b/src/kudu/server/tcmalloc_metrics.cc new file mode 100644 index 000000000000..e700d2f718d3 --- /dev/null +++ b/src/kudu/server/tcmalloc_metrics.cc @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/server/tcmalloc_metrics.h" + +#include +#include +#include + +#include "kudu/util/metrics.h" + +#ifndef TCMALLOC_ENABLED +#define TCM_ASAN_MSG " (Disabled - no tcmalloc in this build)" +#else +#define TCM_ASAN_MSG +#endif + +// As of this writing, we expose all of the un-deprecated tcmalloc status metrics listed at: +// http://gperftools.googlecode.com/svn/trunk/doc/tcmalloc.html + +METRIC_DEFINE_gauge_uint64(server, generic_current_allocated_bytes, + "Heap Memory Usage", kudu::MetricUnit::kBytes, + "Number of bytes used by the application. This will not typically match the memory " + "use reported by the OS, because it does not include TCMalloc overhead or memory " + "fragmentation." TCM_ASAN_MSG); + +METRIC_DEFINE_gauge_uint64(server, generic_heap_size, + "Reserved Heap Memory", kudu::MetricUnit::kBytes, + "Bytes of system memory reserved by TCMalloc." TCM_ASAN_MSG); + +METRIC_DEFINE_gauge_uint64(server, tcmalloc_pageheap_free_bytes, + "Free Heap Memory", kudu::MetricUnit::kBytes, + "Number of bytes in free, mapped pages in page heap. These bytes can be used to " + "fulfill allocation requests. They always count towards virtual memory usage, and " + "unless the underlying memory is swapped out by the OS, they also count towards " + "physical memory usage." TCM_ASAN_MSG); + +METRIC_DEFINE_gauge_uint64(server, tcmalloc_pageheap_unmapped_bytes, + "Unmapped Heap Memory", kudu::MetricUnit::kBytes, + "Number of bytes in free, unmapped pages in page heap. These are bytes that have " + "been released back to the OS, possibly by one of the MallocExtension \"Release\" " + "calls. They can be used to fulfill allocation requests, but typically incur a page " + "fault. They always count towards virtual memory usage, and depending on the OS, " + "typically do not count towards physical memory usage." 
TCM_ASAN_MSG); + +METRIC_DEFINE_gauge_uint64(server, tcmalloc_max_total_thread_cache_bytes, + "Thread Cache Memory Limit", kudu::MetricUnit::kBytes, + "A limit to how much memory TCMalloc dedicates for small objects. Higher numbers " + "trade off more memory use for -- in some situations -- improved efficiency." TCM_ASAN_MSG); + +METRIC_DEFINE_gauge_uint64(server, tcmalloc_current_total_thread_cache_bytes, + "Thread Cache Memory Usage", kudu::MetricUnit::kBytes, + "A measure of some of the memory TCMalloc is using (for small objects)." TCM_ASAN_MSG); + +#undef TCM_ASAN_MSG + +namespace kudu { +namespace tcmalloc { + +static uint64_t GetTCMallocPropValue(const char* prop) { + size_t value = 0; +#ifdef TCMALLOC_ENABLED + if (!MallocExtension::instance()->GetNumericProperty(prop, &value)) { + LOG(DFATAL) << "Failed to get value of numeric tcmalloc property: " << prop; + } +#endif + return value; +} + +void RegisterMetrics(const scoped_refptr& entity) { + entity->NeverRetire( + METRIC_generic_current_allocated_bytes.InstantiateFunctionGauge( + entity, Bind(GetTCMallocPropValue, Unretained("generic.current_allocated_bytes")))); + entity->NeverRetire( + METRIC_generic_heap_size.InstantiateFunctionGauge( + entity, Bind(GetTCMallocPropValue, Unretained("generic.heap_size")))); + entity->NeverRetire( + METRIC_tcmalloc_pageheap_free_bytes.InstantiateFunctionGauge( + entity, Bind(GetTCMallocPropValue, Unretained("tcmalloc.pageheap_free_bytes")))); + entity->NeverRetire( + METRIC_tcmalloc_pageheap_unmapped_bytes.InstantiateFunctionGauge( + entity, Bind(GetTCMallocPropValue, Unretained("tcmalloc.pageheap_unmapped_bytes")))); + entity->NeverRetire( + METRIC_tcmalloc_max_total_thread_cache_bytes.InstantiateFunctionGauge( + entity, Bind(GetTCMallocPropValue, Unretained("tcmalloc.max_total_thread_cache_bytes")))); + entity->NeverRetire( + METRIC_tcmalloc_current_total_thread_cache_bytes.InstantiateFunctionGauge( + entity, Bind(GetTCMallocPropValue, + 
Unretained("tcmalloc.current_total_thread_cache_bytes")))); +} + +} // namespace tcmalloc +} // namespace kudu diff --git a/src/kudu/server/tcmalloc_metrics.h b/src/kudu/server/tcmalloc_metrics.h new file mode 100644 index 000000000000..721d4fc34e14 --- /dev/null +++ b/src/kudu/server/tcmalloc_metrics.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_TCMALLOC_METRICS_H_ +#define KUDU_SERVER_TCMALLOC_METRICS_H_ + +#include "kudu/gutil/ref_counted.h" + +namespace kudu { +class MetricEntity; +namespace tcmalloc { + +// Registers tcmalloc-related status etrics. +// This can be called multiple times on different entities, though the resulting +// metrics will be identical, since the tcmalloc tracking is process-wide. 
+void RegisterMetrics(const scoped_refptr& entity); + +} // namespace tcmalloc +} // namespace kudu + +#endif // KUDU_SERVER_TCMALLOC_METRICS_H_ diff --git a/src/kudu/server/tracing-path-handlers.cc b/src/kudu/server/tracing-path-handlers.cc new file mode 100644 index 000000000000..6b0205b4e2a1 --- /dev/null +++ b/src/kudu/server/tracing-path-handlers.cc @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/server/tracing-path-handlers.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "kudu/gutil/strings/escaping.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/debug/trace_event_impl.h" + +using std::map; +using std::string; +using std::stringstream; +using std::vector; + +using kudu::debug::CategoryFilter; +using kudu::debug::TraceLog; +using kudu::debug::TraceResultBuffer; + +namespace kudu { +namespace server { + +enum Handler { + kBeginMonitoring, + kEndMonitoring, + kCaptureMonitoring, + kGetMonitoringStatus, + kCategories, + kBeginRecording, + kGetBufferPercentFull, + kEndRecording, + kSimpleDump +}; + +namespace { + +Status ParseBase64JsonRequest(const string& json_base64, + rapidjson::Document* doc) { + string json_str; + if (!Base64Unescape(json_base64, &json_str)) { + return Status::InvalidArgument("Invalid base64-encoded JSON"); + } + + doc->Parse<0>(json_str.c_str()); + if (!doc->IsObject()) { + return Status::InvalidArgument("Invalid JSON", json_str); + } + return Status::OK(); +} + +Status GetTracingOptions(const std::string& json_base64, + std::string* category_filter_string, + int* tracing_options) { + rapidjson::Document doc; + RETURN_NOT_OK(ParseBase64JsonRequest(json_base64, &doc)); + + bool use_continuous_tracing = false; + bool use_sampling = false; + + if (!doc.HasMember("categoryFilter") || + !doc["categoryFilter"].IsString()) { + return Status::InvalidArgument("missing categoryFilter"); + } + *category_filter_string = doc["categoryFilter"].GetString(); + + if (doc.HasMember("useContinuousTracing") && + doc["useContinuousTracing"].IsBool()) { + use_continuous_tracing = doc["useContinuousTracing"].GetBool(); + } + + if (doc.HasMember("useSampling") && + doc["useSampling"].IsBool()) { + use_sampling = doc["useSampling"].GetBool(); + } + + *tracing_options = 0; + if (use_sampling) + *tracing_options |= TraceLog::ENABLE_SAMPLING; + if (use_continuous_tracing) + 
*tracing_options |= TraceLog::RECORD_CONTINUOUSLY; + return Status::OK(); +} + +Status BeginRecording(const Webserver::WebRequest& req, + TraceLog::Mode mode) { + string filter_str; + int options; + RETURN_NOT_OK(GetTracingOptions(req.query_string, &filter_str, &options)); + + kudu::debug::TraceLog::GetInstance()->SetEnabled( + CategoryFilter(filter_str), + mode, + static_cast(options)); + return Status::OK(); +} + + +Status EndRecording(const Webserver::WebRequest& req, + stringstream* out) { + TraceLog* tl = TraceLog::GetInstance(); + tl->SetDisabled(); + *out << TraceResultBuffer::FlushTraceLogToString(); + return Status::OK(); +} + +Status CaptureMonitoring(stringstream* out) { + TraceLog* tl = TraceLog::GetInstance(); + if (!tl->IsEnabled()) { + return Status::IllegalState("monitoring not enabled"); + } + *out << TraceResultBuffer::FlushTraceLogToStringButLeaveBufferIntact(); + return Status::OK(); +} + +void GetCategories(stringstream* out) { + vector groups; + kudu::debug::TraceLog::GetInstance()->GetKnownCategoryGroups(&groups); + JsonWriter j(out, JsonWriter::COMPACT); + j.StartArray(); + for (const string& g : groups) { + j.String(g); + } + j.EndArray(); +} + +void GetMonitoringStatus(stringstream* out) { + TraceLog* tl = TraceLog::GetInstance(); + bool is_monitoring = tl->IsEnabled(); + std::string category_filter = tl->GetCurrentCategoryFilter().ToString(); + int options = static_cast(tl->trace_options()); + + stringstream json_out; + JsonWriter j(&json_out, JsonWriter::COMPACT); + j.StartObject(); + + j.String("isMonitoring"); + j.Bool(is_monitoring); + + j.String("categoryFilter"); + j.String(category_filter); + + j.String("useContinuousTracing"); + j.Bool((options & TraceLog::RECORD_CONTINUOUSLY) != 0); + + j.String("useSampling"); + j.Bool((options & TraceLog::ENABLE_SAMPLING) != 0); + + j.EndObject(); + + string encoded; + strings::Base64Escape(json_out.str(), &encoded); + *out << encoded; +} + +void HandleTraceJsonPage(const Webserver::ArgumentMap 
&args, + std::stringstream* output) { + TraceLog* tl = TraceLog::GetInstance(); + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_CONTINUOUSLY); + SleepFor(MonoDelta::FromSeconds(10)); + tl->SetDisabled(); + + *output << TraceResultBuffer::FlushTraceLogToString(); +} + +Status DoHandleRequest(Handler handler, + const Webserver::WebRequest& req, + std::stringstream* output) { + VLOG(2) << "Tracing request type=" << handler << ": " << req.query_string; + + switch (handler) { + case kBeginMonitoring: + RETURN_NOT_OK(BeginRecording(req, TraceLog::MONITORING_MODE)); + break; + case kCaptureMonitoring: + RETURN_NOT_OK(CaptureMonitoring(output)); + break; + case kGetMonitoringStatus: + GetMonitoringStatus(output); + break; + case kCategories: + GetCategories(output); + break; + case kBeginRecording: + RETURN_NOT_OK(BeginRecording(req, TraceLog::RECORDING_MODE)); + break; + case kGetBufferPercentFull: + *output << TraceLog::GetInstance()->GetBufferPercentFull(); + break; + case kEndMonitoring: + case kEndRecording: + RETURN_NOT_OK(EndRecording(req, output)); + break; + case kSimpleDump: + HandleTraceJsonPage(req.parsed_args, output); + break; + } + + return Status::OK(); +} + + +void HandleRequest(Handler handler, + const Webserver::WebRequest& req, + std::stringstream* output) { + Status s = DoHandleRequest(handler, req, output); + if (!s.ok()) { + LOG(WARNING) << "Tracing error for handler " << handler << ": " + << s.ToString(); + // The trace-viewer JS expects '##ERROR##' to indicate that an error + // occurred. TODO: change the JS to bubble up the actual error message + // to the user. 
+ *output << "##ERROR##"; + } +} +} // anonymous namespace + + +void TracingPathHandlers::RegisterHandlers(Webserver* server) { + // All of the tracing-related hand + std::map handlers = { + { "/tracing/json/begin_monitoring", kBeginMonitoring }, + { "/tracing/json/end_monitoring", kEndMonitoring }, + { "/tracing/json/capture_monitoring", kCaptureMonitoring }, + { "/tracing/json/get_monitoring_status", kGetMonitoringStatus }, + { "/tracing/json/categories", kCategories }, + { "/tracing/json/begin_recording", kBeginRecording }, + { "/tracing/json/get_buffer_percent_full", kGetBufferPercentFull }, + { "/tracing/json/end_recording", kEndRecording }, + { "/tracing/json/simple_dump", kSimpleDump } }; + + typedef pair HandlerPair; + for (const HandlerPair& e : handlers) { + server->RegisterPathHandler( + e.first, "", + boost::bind(&HandleRequest, e.second, _1, _2), + false /* styled */, false /* is_on_nav_bar */); + } +} + +} // namespace server +} // namespace kudu diff --git a/src/kudu/server/tracing-path-handlers.h b/src/kudu/server/tracing-path-handlers.h new file mode 100644 index 000000000000..7936c600af0c --- /dev/null +++ b/src/kudu/server/tracing-path-handlers.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_TRACING_PATH_HANDLERS_H +#define KUDU_SERVER_TRACING_PATH_HANDLERS_H + +#include "kudu/gutil/macros.h" +#include "kudu/server/webserver.h" +#include "kudu/util/status.h" + +#include + +namespace kudu { +namespace server { + +// Web handlers for Chromium tracing. +// These handlers provide AJAX endpoints for /tracing.html provided by +// the trace-viewer package. +class TracingPathHandlers { + public: + static void RegisterHandlers(Webserver* server); + + DISALLOW_IMPLICIT_CONSTRUCTORS(TracingPathHandlers); +}; + +} // namespace server +} // namespace kudu +#endif /* KUDU_SERVER_TRACING_PATH_HANDLERS_H */ diff --git a/src/kudu/server/webserver-test.cc b/src/kudu/server/webserver-test.cc new file mode 100644 index 000000000000..6fc9e940aeb3 --- /dev/null +++ b/src/kudu/server/webserver-test.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/server/default-path-handlers.h" +#include "kudu/server/webserver.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/test_util.h" + +using std::string; + +DECLARE_int32(webserver_max_post_length_bytes); + +namespace kudu { + +class WebserverTest : public KuduTest { + public: + WebserverTest() { + static_dir_ = GetTestPath("webserver-docroot"); + CHECK_OK(env_->CreateDir(static_dir_)); + + WebserverOptions opts; + opts.port = 0; + opts.doc_root = static_dir_; + server_.reset(new Webserver(opts)); + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + AddDefaultPathHandlers(server_.get()); + ASSERT_OK(server_->Start()); + + vector addrs; + ASSERT_OK(server_->GetBoundAddresses(&addrs)); + ASSERT_EQ(addrs.size(), 1); + addr_ = addrs[0]; + } + + protected: + EasyCurl curl_; + faststring buf_; + gscoped_ptr server_; + Sockaddr addr_; + + string static_dir_; +}; + +TEST_F(WebserverTest, TestIndexPage) { + ASSERT_OK(curl_.FetchURL(strings::Substitute("http://$0/", addr_.ToString()), + &buf_)); + // Should have expected title. + ASSERT_STR_CONTAINS(buf_.ToString(), "Kudu"); + + // Should have link to default path handlers (e.g memz) + ASSERT_STR_CONTAINS(buf_.ToString(), "memz"); +} + +TEST_F(WebserverTest, TestDefaultPaths) { + // Test memz + ASSERT_OK(curl_.FetchURL(strings::Substitute("http://$0/memz?raw=1", addr_.ToString()), + &buf_)); +#ifdef TCMALLOC_ENABLED + ASSERT_STR_CONTAINS(buf_.ToString(), "Bytes in use by application"); +#else + ASSERT_STR_CONTAINS(buf_.ToString(), "not available unless tcmalloc is enabled"); +#endif + + // Test varz -- check for one of the built-in gflags flags. 
+ ASSERT_OK(curl_.FetchURL(strings::Substitute("http://$0/varz?raw=1", addr_.ToString()), + &buf_)); + ASSERT_STR_CONTAINS(buf_.ToString(), "--v="); +} + +// Used in symbolization test below. +void SomeMethodForSymbolTest1() {} +// Used in symbolization test below. +void SomeMethodForSymbolTest2() {} + +TEST_F(WebserverTest, TestPprofPaths) { + // Test /pprof/cmdline GET + ASSERT_OK(curl_.FetchURL(strings::Substitute("http://$0/pprof/cmdline", addr_.ToString()), + &buf_)); + ASSERT_STR_CONTAINS(buf_.ToString(), "webserver-test"); + ASSERT_TRUE(!HasSuffixString(buf_.ToString(), string("\x00", 1))) + << "should not have trailing NULL: " << Slice(buf_).ToDebugString(); + + // Test /pprof/symbol GET + ASSERT_OK(curl_.FetchURL(strings::Substitute("http://$0/pprof/symbol", addr_.ToString()), + &buf_)); + ASSERT_EQ(buf_.ToString(), "num_symbols: 1"); + + // Test /pprof/symbol POST + { + // Formulate a request with some valid symbol addresses. + string req = StringPrintf("%p+%p", + &SomeMethodForSymbolTest1, + &SomeMethodForSymbolTest2); + SCOPED_TRACE(req); + ASSERT_OK(curl_.PostToURL(strings::Substitute("http://$0/pprof/symbol", addr_.ToString()), + req, &buf_)); + ASSERT_EQ(buf_.ToString(), + StringPrintf("%p\tkudu::SomeMethodForSymbolTest1()\n" + "%p\tkudu::SomeMethodForSymbolTest2()\n", + &SomeMethodForSymbolTest1, + &SomeMethodForSymbolTest2)); + } +} + +// Send a POST request with too much data. It should reject +// the request with the correct HTTP error code. +TEST_F(WebserverTest, TestPostTooBig) { + FLAGS_webserver_max_post_length_bytes = 10; + string req(10000, 'c'); + Status s = curl_.PostToURL(strings::Substitute("http://$0/pprof/symbol", addr_.ToString()), + req, &buf_); + ASSERT_EQ("Remote error: HTTP 413", s.ToString()); +} + +// Test that static files are served and that directory listings are +// disabled. +TEST_F(WebserverTest, TestStaticFiles) { + // Fetch a non-existent static file. 
+ Status s = curl_.FetchURL(strings::Substitute("http://$0/foo.txt", addr_.ToString()), + &buf_); + ASSERT_EQ("Remote error: HTTP 404", s.ToString()); + + // Create the file and fetch again. This time it should succeed. + ASSERT_OK(WriteStringToFile(env_.get(), "hello world", + strings::Substitute("$0/foo.txt", static_dir_))); + ASSERT_OK(curl_.FetchURL(strings::Substitute("http://$0/foo.txt", addr_.ToString()), + &buf_)); + ASSERT_EQ("hello world", buf_.ToString()); + + // Create a directory and ensure that subdirectory listing is disabled. + ASSERT_OK(env_->CreateDir(strings::Substitute("$0/dir", static_dir_))); + s = curl_.FetchURL(strings::Substitute("http://$0/dir/", addr_.ToString()), + &buf_); + ASSERT_EQ("Remote error: HTTP 403", s.ToString()); +} + +} // namespace kudu diff --git a/src/kudu/server/webserver.cc b/src/kudu/server/webserver.cc new file mode 100644 index 000000000000..913f3b2e44d9 --- /dev/null +++ b/src/kudu/server/webserver.cc @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kudu/server/webserver.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/util/env.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/url-coding.h" +#include "kudu/util/version_info.h" + +#if defined(__APPLE__) +typedef sig_t sighandler_t; +#endif + +using std::string; +using std::stringstream; +using std::vector; +using std::make_pair; + +DEFINE_int32(webserver_max_post_length_bytes, 1024 * 1024, + "The maximum length of a POST request that will be accepted by " + "the embedded web server."); +TAG_FLAG(webserver_max_post_length_bytes, advanced); +TAG_FLAG(webserver_max_post_length_bytes, runtime); + +namespace kudu { + +Webserver::Webserver(const WebserverOptions& opts) + : opts_(opts), + context_(nullptr) { + string host = opts.bind_interface.empty() ? "0.0.0.0" : opts.bind_interface; + http_address_ = host + ":" + boost::lexical_cast(opts.port); +} + +Webserver::~Webserver() { + Stop(); + STLDeleteValues(&path_handlers_); +} + +void Webserver::RootHandler(const Webserver::WebRequest& args, stringstream* output) { + (*output) << "

Status Pages

"; + for (const PathHandlerMap::value_type& handler : path_handlers_) { + if (handler.second->is_on_nav_bar()) { + (*output) << "" << handler.second->alias() << "
"; + } + } + (*output) << "
\n"; + (*output) << "

Version Info

\n"; + (*output) << "
" << EscapeForHtmlToString(VersionInfo::GetAllVersionInfo()) << "
"; +} + +void Webserver::BuildArgumentMap(const string& args, ArgumentMap* output) { + vector arg_pairs = strings::Split(args, "&"); + + for (const StringPiece& arg_pair : arg_pairs) { + vector key_value = strings::Split(arg_pair, "="); + if (key_value.empty()) continue; + + string key; + if (!UrlDecode(key_value[0].ToString(), &key)) continue; + string value; + if (!UrlDecode((key_value.size() >= 2 ? key_value[1].ToString() : ""), &value)) continue; + boost::to_lower(key); + (*output)[key] = value; + } +} + +bool Webserver::IsSecure() const { + return !opts_.certificate_file.empty(); +} + +Status Webserver::BuildListenSpec(string* spec) const { + vector addrs; + RETURN_NOT_OK(ParseAddressList(http_address_, 80, &addrs)); + + vector parts; + for (const Sockaddr& addr : addrs) { + // Mongoose makes sockets with 's' suffixes accept SSL traffic only + parts.push_back(addr.ToString() + (IsSecure() ? "s" : "")); + } + + JoinStrings(parts, ",", spec); + return Status::OK(); +} + +Status Webserver::Start() { + LOG(INFO) << "Starting webserver on " << http_address_; + + vector options; + + if (static_pages_available()) { + LOG(INFO) << "Document root: " << opts_.doc_root; + options.push_back("document_root"); + options.push_back(opts_.doc_root.c_str()); + options.push_back("enable_directory_listing"); + options.push_back("no"); + } else { + LOG(INFO)<< "Document root disabled"; + } + + if (IsSecure()) { + LOG(INFO) << "Webserver: Enabling HTTPS support"; + options.push_back("ssl_certificate"); + options.push_back(opts_.certificate_file.c_str()); + } + + if (!opts_.authentication_domain.empty()) { + options.push_back("authentication_domain"); + options.push_back(opts_.authentication_domain.c_str()); + } + + if (!opts_.password_file.empty()) { + // Mongoose doesn't log anything if it can't stat the password file (but will if it + // can't open it, which it tries to do during a request) + if (!Env::Default()->FileExists(opts_.password_file)) { + stringstream ss; + ss << 
"Webserver: Password file does not exist: " << opts_.password_file; + return Status::InvalidArgument(ss.str()); + } + LOG(INFO) << "Webserver: Password file is " << opts_.password_file; + options.push_back("global_passwords_file"); + options.push_back(opts_.password_file.c_str()); + } + + options.push_back("listening_ports"); + string listening_str; + RETURN_NOT_OK(BuildListenSpec(&listening_str)); + options.push_back(listening_str.c_str()); + + // Num threads + options.push_back("num_threads"); + string num_threads_str = SimpleItoa(opts_.num_worker_threads); + options.push_back(num_threads_str.c_str()); + + // Options must be a NULL-terminated list + options.push_back(nullptr); + + // mongoose ignores SIGCHLD and we need it to run kinit. This means that since + // mongoose does not reap its own children CGI programs must be avoided. + // Save the signal handler so we can restore it after mongoose sets it to be ignored. + sighandler_t sig_chld = signal(SIGCHLD, SIG_DFL); + + sq_callbacks callbacks; + memset(&callbacks, 0, sizeof(callbacks)); + callbacks.begin_request = &Webserver::BeginRequestCallbackStatic; + callbacks.log_message = &Webserver::LogMessageCallbackStatic; + + // To work around not being able to pass member functions as C callbacks, we store a + // pointer to this server in the per-server state, and register a static method as the + // default callback. That method unpacks the pointer to this and calls the real + // callback. + context_ = sq_start(&callbacks, reinterpret_cast(this), &options[0]); + + // Restore the child signal handler so wait() works properly. 
+ signal(SIGCHLD, sig_chld); + + if (context_ == nullptr) { + stringstream error_msg; + error_msg << "Webserver: Could not start on address " << http_address_; + Sockaddr addr; + addr.set_port(opts_.port); + TryRunLsof(addr); + return Status::NetworkError(error_msg.str()); + } + + PathHandlerCallback default_callback = + boost::bind(boost::mem_fn(&Webserver::RootHandler), this, _1, _2); + + RegisterPathHandler("/", "Home", default_callback); + + vector addrs; + RETURN_NOT_OK(GetBoundAddresses(&addrs)); + string bound_addresses_str; + for (const Sockaddr& addr : addrs) { + if (!bound_addresses_str.empty()) { + bound_addresses_str += ", "; + } + bound_addresses_str += "http://" + addr.ToString() + "/"; + } + + LOG(INFO) << "Webserver started. Bound to: " << bound_addresses_str; + return Status::OK(); +} + +void Webserver::Stop() { + if (context_ != nullptr) { + sq_stop(context_); + context_ = nullptr; + } +} + +Status Webserver::GetBoundAddresses(std::vector* addrs) const { + if (!context_) { + return Status::IllegalState("Not started"); + } + + struct sockaddr_in** sockaddrs; + int num_addrs; + + if (sq_get_bound_addresses(context_, &sockaddrs, &num_addrs)) { + return Status::NetworkError("Unable to get bound addresses from Mongoose"); + } + + addrs->reserve(num_addrs); + + for (int i = 0; i < num_addrs; i++) { + addrs->push_back(Sockaddr(*sockaddrs[i])); + free(sockaddrs[i]); + } + free(sockaddrs); + + return Status::OK(); +} + +int Webserver::LogMessageCallbackStatic(const struct sq_connection* connection, + const char* message) { + if (message != nullptr) { + LOG(INFO) << "Webserver: " << message; + return 1; + } + return 0; +} + +int Webserver::BeginRequestCallbackStatic(struct sq_connection* connection) { + struct sq_request_info* request_info = sq_get_request_info(connection); + Webserver* instance = reinterpret_cast(request_info->user_data); + return instance->BeginRequestCallback(connection, request_info); +} + +int Webserver::BeginRequestCallback(struct 
sq_connection* connection, + struct sq_request_info* request_info) { + PathHandler* handler; + { + boost::shared_lock lock(lock_); + PathHandlerMap::const_iterator it = path_handlers_.find(request_info->uri); + if (it == path_handlers_.end()) { + // Let Mongoose deal with this request; returning NULL will fall through + // to the default handler which will serve files. + if (!opts_.doc_root.empty() && opts_.enable_doc_root) { + VLOG(2) << "HTTP File access: " << request_info->uri; + return 0; + } else { + sq_printf(connection, "HTTP/1.1 404 Not Found\r\n" + "Content-Type: text/plain\r\n\r\n"); + sq_printf(connection, "No handler for URI %s\r\n\r\n", request_info->uri); + return 1; + } + } + handler = it->second; + } + + return RunPathHandler(*handler, connection, request_info); +} + + +int Webserver::RunPathHandler(const PathHandler& handler, + struct sq_connection* connection, + struct sq_request_info* request_info) { + // Should we render with css styles? + bool use_style = true; + + WebRequest req; + if (request_info->query_string != nullptr) { + req.query_string = request_info->query_string; + BuildArgumentMap(request_info->query_string, &req.parsed_args); + } + req.request_method = request_info->request_method; + if (req.request_method == "POST") { + const char* content_len_str = sq_get_header(connection, "Content-Length"); + int32_t content_len = 0; + if (content_len_str == nullptr || + !safe_strto32(content_len_str, &content_len)) { + sq_printf(connection, "HTTP/1.1 411 Length Required\r\n"); + return 1; + } + if (content_len > FLAGS_webserver_max_post_length_bytes) { + // TODO: for this and other HTTP requests, we should log the + // remote IP, etc. 
+ LOG(WARNING) << "Rejected POST with content length " << content_len; + sq_printf(connection, "HTTP/1.1 413 Request Entity Too Large\r\n"); + return 1; + } + + char buf[8192]; + int rem = content_len; + while (rem > 0) { + int n = sq_read(connection, buf, std::min(sizeof(buf), rem)); + if (n <= 0) { + LOG(WARNING) << "error reading POST data: expected " + << content_len << " bytes but only read " + << req.post_data.size(); + sq_printf(connection, "HTTP/1.1 500 Internal Server Error\r\n"); + return 1; + } + + req.post_data.append(buf, n); + rem -= n; + } + } + + if (!handler.is_styled() || ContainsKey(req.parsed_args, "raw")) { + use_style = false; + } + + stringstream output; + if (use_style) BootstrapPageHeader(&output); + for (const PathHandlerCallback& callback_ : handler.callbacks()) { + callback_(req, &output); + } + if (use_style) BootstrapPageFooter(&output); + + string str = output.str(); + // Without styling, render the page as plain text + if (!use_style) { + sq_printf(connection, "HTTP/1.1 200 OK\r\n" + "Content-Type: text/plain\r\n" + "Content-Length: %zd\r\n" + "\r\n", str.length()); + } else { + sq_printf(connection, "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n" + "Content-Length: %zd\r\n" + "\r\n", str.length()); + } + + // Make sure to use sq_write for printing the body; sq_printf truncates at 8kb + sq_write(connection, str.c_str(), str.length()); + return 1; +} + +void Webserver::RegisterPathHandler(const string& path, const string& alias, + const PathHandlerCallback& callback, bool is_styled, bool is_on_nav_bar) { + boost::lock_guard lock(lock_); + auto it = path_handlers_.find(path); + if (it == path_handlers_.end()) { + it = path_handlers_.insert( + make_pair(path, new PathHandler(is_styled, is_on_nav_bar, alias))).first; + } + it->second->AddCallback(callback); +} + +const char* const PAGE_HEADER = "" +" " +" Kudu" +" " +" " +" " +" "; + +static const char* const NAVIGATION_BAR_PREFIX = +"" +"
"; + +void Webserver::BootstrapPageHeader(stringstream* output) { + (*output) << PAGE_HEADER; + (*output) << NAVIGATION_BAR_PREFIX; + for (const PathHandlerMap::value_type& handler : path_handlers_) { + if (handler.second->is_on_nav_bar()) { + (*output) << "
  • " << handler.second->alias() + << "
  • "; + } + } + (*output) << NAVIGATION_BAR_SUFFIX; + + if (!static_pages_available()) { + (*output) << "
    " + << "Static pages not available. Configure KUDU_HOME or use the --webserver_doc_root " + << "flag to fix page styling.
    \n"; + } +} + +bool Webserver::static_pages_available() const { + return !opts_.doc_root.empty() && opts_.enable_doc_root; +} + +void Webserver::set_footer_html(const std::string& html) { + boost::lock_guard l(lock_); + footer_html_ = html; +} + +void Webserver::BootstrapPageFooter(stringstream* output) { + boost::shared_lock l(lock_); + *output << "
    \n"; // end bootstrap 'container' div + if (!footer_html_.empty()) { + *output << "
    "; + *output << footer_html_; + *output << "
    "; + } + *output << ""; +} + +} // namespace kudu diff --git a/src/kudu/server/webserver.h b/src/kudu/server/webserver.h new file mode 100644 index 000000000000..a45261c9a7aa --- /dev/null +++ b/src/kudu/server/webserver.h @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifndef KUDU_UTIL_WEBSERVER_H +#define KUDU_UTIL_WEBSERVER_H + +#include +#include +#include +#include +#include + +#include "kudu/server/webserver_options.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" +#include "kudu/util/web_callback_registry.h" + +struct sq_connection; +struct sq_request_info; +struct sq_context; + +namespace kudu { + +// Wrapper class for the Mongoose web server library. Clients may register callback +// methods which produce output for a given URL path +class Webserver : public WebCallbackRegistry { + public: + // Using this constructor, the webserver will bind to all available + // interfaces. + explicit Webserver(const WebserverOptions& opts); + + ~Webserver(); + + // Starts a webserver on the port passed to the constructor. The webserver runs in a + // separate thread, so this call is non-blocking. + Status Start(); + + // Stops the webserver synchronously. + void Stop(); + + // Return the addresses that this server has successfully + // bound to. Requires that the server has been Start()ed. + Status GetBoundAddresses(std::vector* addrs) const; + + virtual void RegisterPathHandler(const std::string& path, const std::string& alias, + const PathHandlerCallback& callback, + bool is_styled = true, bool is_on_nav_bar = true) OVERRIDE; + + // Change the footer HTML to be displayed at the bottom of all styled web pages. + void set_footer_html(const std::string& html); + + // True if serving all traffic over SSL, false otherwise + bool IsSecure() const; + private: + // Container class for a list of path handler callbacks for a single URL. 
+ class PathHandler { + public: + PathHandler(bool is_styled, bool is_on_nav_bar, std::string alias) + : is_styled_(is_styled), + is_on_nav_bar_(is_on_nav_bar), + alias_(std::move(alias)) {} + + void AddCallback(const PathHandlerCallback& callback) { + callbacks_.push_back(callback); + } + + bool is_styled() const { return is_styled_; } + bool is_on_nav_bar() const { return is_on_nav_bar_; } + const std::string& alias() const { return alias_; } + const std::vector& callbacks() const { return callbacks_; } + + private: + // If true, the page appears is rendered styled. + bool is_styled_; + + // If true, the page appears in the navigation bar. + bool is_on_nav_bar_; + + // Alias used when displaying this link on the nav bar. + std::string alias_; + + // List of callbacks to render output for this page, called in order. + std::vector callbacks_; + }; + + bool static_pages_available() const; + + // Build the string to pass to mongoose specifying where to bind. + Status BuildListenSpec(std::string* spec) const; + + // Renders a common Bootstrap-styled header + void BootstrapPageHeader(std::stringstream* output); + + // Renders a common Bootstrap-styled footer. Must be used in conjunction with + // BootstrapPageHeader. + void BootstrapPageFooter(std::stringstream* output); + + // Dispatch point for all incoming requests. + // Static so that it can act as a function pointer, and then call the next method + static int BeginRequestCallbackStatic(struct sq_connection* connection); + int BeginRequestCallback(struct sq_connection* connection, + struct sq_request_info* request_info); + + int RunPathHandler(const PathHandler& handler, + struct sq_connection* connection, + struct sq_request_info* request_info); + + // Callback to funnel mongoose logs through glog. 
+ static int LogMessageCallbackStatic(const struct sq_connection* connection, + const char* message); + + // Registered to handle "/", and prints a list of available URIs + void RootHandler(const WebRequest& args, std::stringstream* output); + + // Builds a map of argument name to argument value from a typical URL argument + // string (that is, "key1=value1&key2=value2.."). If no value is given for a + // key, it is entered into the map as (key, ""). + void BuildArgumentMap(const std::string& args, ArgumentMap* output); + + const WebserverOptions opts_; + + // Lock guarding the path_handlers_ map and footer_html. + boost::shared_mutex lock_; + + // Map of path to a PathHandler containing a list of handlers for that + // path. More than one handler may register itself with a path so that many + // components may contribute to a single page. + typedef std::map PathHandlerMap; + PathHandlerMap path_handlers_; + + // Snippet of HTML which will be displayed in the footer of all pages + // rendered by this server. Protected by 'lock_'. + std::string footer_html_; + + // The address of the interface on which to run this webserver. + std::string http_address_; + + // Handle to Mongoose context; owned and freed by Mongoose internally + struct sq_context* context_; +}; + +} // namespace kudu + +#endif // KUDU_UTIL_WEBSERVER_H diff --git a/src/kudu/server/webserver_options.cc b/src/kudu/server/webserver_options.cc new file mode 100644 index 000000000000..0730f0603334 --- /dev/null +++ b/src/kudu/server/webserver_options.cc @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/server/webserver_options.h" + +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/flag_tags.h" + +using std::string; + +namespace kudu { + +static std::string GetDefaultDocumentRoot(); + +} // namespace kudu + +// Flags defining web server behavior. The class implementation should +// not use these directly, but rather access them via WebserverOptions. +// This makes it easier to instantiate web servers with different options +// within a single unit test. +DEFINE_string(webserver_interface, "", + "Interface to start debug webserver on. If blank, webserver binds to 0.0.0.0"); +TAG_FLAG(webserver_interface, advanced); + +DEFINE_string(webserver_doc_root, kudu::GetDefaultDocumentRoot(), + "Files under are accessible via the debug webserver. " + "Defaults to $KUDU_HOME/www, or if $KUDU_HOME is not set, disables the document " + "root"); +TAG_FLAG(webserver_doc_root, advanced); + +DEFINE_bool(webserver_enable_doc_root, true, + "If true, webserver may serve static files from the webserver_doc_root"); +TAG_FLAG(webserver_enable_doc_root, advanced); + +DEFINE_string(webserver_certificate_file, "", + "The location of the debug webserver's SSL certificate file, in .pem format. 
If " + "empty, webserver SSL support is not enabled"); +DEFINE_string(webserver_authentication_domain, "", + "Domain used for debug webserver authentication"); +DEFINE_string(webserver_password_file, "", + "(Optional) Location of .htpasswd file containing user names and hashed passwords for" + " debug webserver authentication"); + + +DEFINE_int32(webserver_num_worker_threads, 50, + "Maximum number of threads to start for handling web server requests"); +TAG_FLAG(webserver_num_worker_threads, advanced); + +DEFINE_int32(webserver_port, 0, + "Port to bind to for the web server"); +TAG_FLAG(webserver_port, stable); + +namespace kudu { + +// Returns KUDU_HOME if set, otherwise we won't serve any static files. +static string GetDefaultDocumentRoot() { + char* kudu_home = getenv("KUDU_HOME"); + // Empty document root means don't serve static files + return kudu_home ? strings::Substitute("$0/www", kudu_home) : ""; +} + +WebserverOptions::WebserverOptions() + : bind_interface(FLAGS_webserver_interface), + port(FLAGS_webserver_port), + doc_root(FLAGS_webserver_doc_root), + enable_doc_root(FLAGS_webserver_enable_doc_root), + certificate_file(FLAGS_webserver_certificate_file), + authentication_domain(FLAGS_webserver_authentication_domain), + password_file(FLAGS_webserver_password_file), + num_worker_threads(FLAGS_webserver_num_worker_threads) { +} + +} // namespace kudu diff --git a/src/kudu/server/webserver_options.h b/src/kudu/server/webserver_options.h new file mode 100644 index 000000000000..ddc507991116 --- /dev/null +++ b/src/kudu/server/webserver_options.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_WEBSERVER_OPTIONS_H +#define KUDU_SERVER_WEBSERVER_OPTIONS_H + +#include +#include + +namespace kudu { + +// Options controlling the web server. +// The default constructor sets these from the gflags defined in webserver_options.cc. +// See those flags for documentation. +struct WebserverOptions { + WebserverOptions(); + + std::string bind_interface; + uint16_t port; + std::string doc_root; + bool enable_doc_root; + std::string certificate_file; + std::string authentication_domain; + std::string password_file; + uint32_t num_worker_threads; +}; + +} // namespace kudu +#endif /* KUDU_SERVER_WEBSERVER_OPTIONS_H */ diff --git a/src/kudu/server/webui_util.cc b/src/kudu/server/webui_util.cc new file mode 100644 index 000000000000..c7a8135714b2 --- /dev/null +++ b/src/kudu/server/webui_util.cc @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/server/webui_util.h" + +#include + +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/monitored_task.h" +#include "kudu/util/url-coding.h" + +using strings::Substitute; + +namespace kudu { + +void HtmlOutputSchemaTable(const Schema& schema, + std::stringstream* output) { + *output << "\n"; + *output << " " + << "" + << "" + << "\n"; + + for (int i = 0; i < schema.num_columns(); i++) { + const ColumnSchema& col = schema.column(i); + string read_default = "-"; + if (col.has_read_default()) { + read_default = col.Stringify(col.read_default_value()); + } + string write_default = "-"; + if (col.has_write_default()) { + write_default = col.Stringify(col.write_default_value()); + } + *output << Substitute("\n", + EscapeForHtmlToString(col.name()), + schema.column_id(i), + col.TypeToString(), + EscapeForHtmlToString(read_default), + EscapeForHtmlToString(write_default)); + } + *output << "
    ColumnIDTypeRead defaultWrite default
    $0$1$2$3$4
    \n"; +} + +void HtmlOutputImpalaSchema(const std::string& table_name, + const Schema& schema, + const string& master_addresses, + std::stringstream* output) { + *output << "
    \n";
    +
    +  // Escape table and column names with ` to avoid conflicts with Impala reserved words.
    +  *output << "CREATE EXTERNAL TABLE " << EscapeForHtmlToString("`" + table_name + "`")
    +          << " (\n";
    +
    +  vector key_columns;
    +
    +  for (int i = 0; i < schema.num_columns(); i++) {
    +    const ColumnSchema& col = schema.column(i);
    +
    +    *output << EscapeForHtmlToString("`" + col.name() + "`") << " ";
    +    switch (col.type_info()->type()) {
    +      case STRING:
    +        *output << "STRING";
    +        break;
    +      case BINARY:
    +        *output << "BINARY";
    +        break;
    +      case UINT8:
    +      case INT8:
    +        *output << "TINYINT";
    +        break;
    +      case UINT16:
    +      case INT16:
    +        *output << "SMALLINT";
    +        break;
    +      case UINT32:
    +      case INT32:
    +        *output << "INT";
    +        break;
    +      case UINT64:
    +      case INT64:
    +        *output << "BIGINT";
    +        break;
    +      case TIMESTAMP:
    +        *output << "TIMESTAMP";
    +        break;
    +      case FLOAT:
    +        *output << "FLOAT";
    +        break;
    +      case DOUBLE:
    +        *output << "DOUBLE";
    +        break;
    +      default:
    +        *output << "[unsupported type " << col.type_info()->name() << "!]";
    +        break;
    +    }
    +    if (i < schema.num_columns() - 1) {
    +      *output << ",";
    +    }
    +    *output << "\n";
    +
    +    if (schema.is_key_column(i)) {
    +      key_columns.push_back(col.name());
    +    }
    +  }
    +  *output << ")\n";
    +
    +  *output << "TBLPROPERTIES(\n";
    +  *output << "  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',\n";
    +  *output << "  'kudu.table_name' = '" << table_name << "',\n";
    +  *output << "  'kudu.master_addresses' = '" << master_addresses << "',\n";
    +  *output << "  'kudu.key_columns' = '" << JoinElements(key_columns, ", ") << "'\n";
    +  *output << ");\n";
    +  *output << "
    \n"; +} + +void HtmlOutputTaskList(const std::vector >& tasks, + std::stringstream* output) { + *output << "\n"; + *output << " \n"; + for (const scoped_refptr& task : tasks) { + string state; + switch (task->state()) { + case MonitoredTask::kStatePreparing: + state = "Preparing"; + break; + case MonitoredTask::kStateRunning: + state = "Running"; + break; + case MonitoredTask::kStateComplete: + state = "Complete"; + break; + case MonitoredTask::kStateFailed: + state = "Failed"; + break; + case MonitoredTask::kStateAborted: + state = "Aborted"; + break; + } + + double running_secs = 0; + if (task->completion_timestamp().Initialized()) { + running_secs = task->completion_timestamp().GetDeltaSince( + task->start_timestamp()).ToSeconds(); + } else if (task->start_timestamp().Initialized()) { + running_secs = MonoTime::Now(MonoTime::FINE).GetDeltaSince( + task->start_timestamp()).ToSeconds(); + } + + *output << Substitute( + "\n", + EscapeForHtmlToString(task->type_name()), + EscapeForHtmlToString(state), + EscapeForHtmlToString(HumanReadableElapsedTime::ToShortString(running_secs)), + EscapeForHtmlToString(task->description())); + } + *output << "
    Task NameStateTimeDescription
    $0$1$2$3
    \n"; +} + +} // namespace kudu diff --git a/src/kudu/server/webui_util.h b/src/kudu/server/webui_util.h new file mode 100644 index 000000000000..43ac6ea7357a --- /dev/null +++ b/src/kudu/server/webui_util.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_SERVER_WEBUI_UTIL_H +#define KUDU_SERVER_WEBUI_UTIL_H + +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" + +namespace kudu { + +class Schema; +class MonitoredTask; + +void HtmlOutputSchemaTable(const Schema& schema, + std::stringstream* output); +void HtmlOutputImpalaSchema(const std::string& table_name, + const Schema& schema, + const std::string& master_address, + std::stringstream* output); +void HtmlOutputTaskList(const std::vector >& tasks, + std::stringstream* output); +} // namespace kudu + +#endif // KUDU_SERVER_WEBUI_UTIL_H diff --git a/src/kudu/tablet/CMakeLists.txt b/src/kudu/tablet/CMakeLists.txt new file mode 100644 index 000000000000..4b6c6e7f24c2 --- /dev/null +++ b/src/kudu/tablet/CMakeLists.txt @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(TABLET_SRCS + tablet.cc + tablet_bootstrap.cc + tablet_metrics.cc + tablet_peer_mm_ops.cc + tablet_peer.cc + transactions/transaction.cc + transactions/alter_schema_transaction.cc + transactions/transaction_driver.cc + transactions/transaction_tracker.cc + transactions/write_transaction.cc + transaction_order_verifier.cc + cfile_set.cc + compaction.cc + compaction_policy.cc + delta_key.cc + diskrowset.cc + lock_manager.cc + maintenance_manager.cc + memrowset.cc + multi_column_writer.cc + mutation.cc + mvcc.cc + row_op.cc + rowset.cc + rowset_info.cc + rowset_tree.cc + svg_dump.cc + tablet_metadata.cc + rowset_metadata.cc + deltafile.cc + deltamemstore.cc + delta_applier.cc + delta_compaction.cc + delta_iterator_merger.cc + delta_stats.cc + delta_store.cc + delta_tracker.cc +) + +PROTOBUF_GENERATE_CPP( + TABLET_PROTO_SRCS TABLET_PROTO_HDRS TABLET_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES + tablet.proto + metadata.proto) +set(TABLET_PROTO_LIBS + protobuf + fs_proto + consensus_metadata_proto + kudu_common) +ADD_EXPORTABLE_LIBRARY(tablet_proto + SRCS ${TABLET_PROTO_SRCS} + DEPS ${TABLET_PROTO_LIBS} + NONLINK_DEPS ${TABLET_PROTO_TGTS}) + +add_library(tablet ${TABLET_SRCS}) +target_link_libraries(tablet + tablet_proto + codegen + kudu_common + cfile + gutil + server_common + kudu_fs + kudu_util + consensus) + +set(KUDU_TEST_LINK_LIBS tablet ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(tablet-test) +ADD_KUDU_TEST(tablet_metadata-test) +ADD_KUDU_TEST(mt-tablet-test RUN_SERIAL true) +ADD_KUDU_TEST(compaction_policy-test) +ADD_KUDU_TEST(diskrowset-test) +ADD_KUDU_TEST(mt-diskrowset-test RUN_SERIAL true) +ADD_KUDU_TEST(memrowset-test) +ADD_KUDU_TEST(deltamemstore-test) +ADD_KUDU_TEST(deltafile-test) +ADD_KUDU_TEST(cfile_set-test) +ADD_KUDU_TEST(tablet-pushdown-test) +ADD_KUDU_TEST(tablet-schema-test) +ADD_KUDU_TEST(tablet_bootstrap-test) +ADD_KUDU_TEST(maintenance_manager-test) +ADD_KUDU_TEST(metadata-test) +ADD_KUDU_TEST(mvcc-test) +ADD_KUDU_TEST(compaction-test) +ADD_KUDU_TEST(lock_manager-test) +ADD_KUDU_TEST(rowset_tree-test) +ADD_KUDU_TEST(composite-pushdown-test) +ADD_KUDU_TEST(delta_compaction-test) +ADD_KUDU_TEST(mt-rowset_delta_compaction-test) +ADD_KUDU_TEST(major_delta_compaction-test) +ADD_KUDU_TEST(transactions/transaction_tracker-test) +ADD_KUDU_TEST(tablet_peer-test) +ADD_KUDU_TEST(tablet_random_access-test) +ADD_KUDU_TEST(tablet_mm_ops-test) + +# Some tests don't have dependencies on other tablet stuff +set(KUDU_TEST_LINK_LIBS kudu_util gutil ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(cbtree-test RUN_SERIAL true) diff --git a/src/kudu/tablet/README b/src/kudu/tablet/README new file mode 100644 index 000000000000..b0ae0b75b5b5 --- /dev/null +++ b/src/kudu/tablet/README @@ -0,0 +1,759 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +A Tablet is a horizontal partition of a Kudu table, similar to tablets +in BigTable or regions in HBase. Each tablet hosts a contiguous range +of rows which does not overlap with any other tablet's range. Together, +all the tablets in a table comprise the table's entire key space. + +Each tablet is further subdivided into a number of sets of rows called +RowSets. Each RowSet consists of the data for a set of rows. RowSets +are disjoint, ie the set of rows for different RowSets do not +intersect, so any given key is present in at most one RowSet. While +RowSets are disjoint, their key spaces may overlap. + +============================================================ +Handling Insertions +============================================================ + +One RowSet is held in memory and is referred to as the MemRowSet. All +inserts go directly into the MemRowSet, which is an in-memory B-Tree sorted +by the table's primary key. As data is inserted, it is accumulated in the MemRowSet, +where it is made immediately visible to future readers, subject to MVCC +(see below). + +NOTE: Unlike BigTable, only inserts and updates of recently-inserted data go into the MemRowSet +-- mutations such as updates and deletions of on-disk rows are discussed in a later section of +this document. + +Each row exists in exactly one entry in the MemRowSet. The value of this entry consists +of a special header, followed by the packed format of the row data (more detail below). 
+Since the MemRowSet is fully in-memory, it will eventually fill up and "Flush" to disk -- +this process is described in detail later in this document. + +============================================================ +MVCC Overview +============================================================ + +Kudu uses multi-version concurrency control in order to provide a number of useful +features: + +- Snapshot scanners: when a scanner is created, it operates as of a point-in-time + snapshot of the tablet. Any further updates to the tablet which occur during + the course of the scan are ignored. In addition, this point-in-time can be + stored and re-used for additional scans on the same tablet, for example if an application + would like to perform analytics requiring multiple passes on a consistent view of the data. + +- Time-travel scanners: similar to the above, a user may create a scanner which + operates as of some point in time from the past, providing a consistent "time travel read". + This can be used to take point-in-time consistent backups. + +- Change-history queries: given two MVCC snapshots, the user may be able to query + the set of deltas between those two snapshots for any given row. This can be leveraged + to take incremental backups, perform cross-cluster synchronization, or for offline audit + analysis. + +- Multi-row atomic updates within a tablet: a single mutation may apply to multiple + rows within a tablet, and it will be made visible in a single atomic action. + +In order to provide MVCC, each mutation is tagged with a timestamp. Timestamps are generated by a +TS-wide Clock instance, and ensured to be unique within a tablet by the tablet's MvccManager. The +state of the MvccManager determines the set of timestamps which are considered "committed" and thus +visible to newly generated scanners. 
Upon creation, a scanner takes a snapshot of the MvccManager +state, and any data which seen by that scanner is then compared against the MvccSnapshot to +determine which insertions, updates, and deletes should be considered visible. + +Timestamps are monotonically increasing per tablet. We use a technique called HybridTime (see +OSDI'14 submission for details) to create timestamps which correspond to true wall clock +time but also reflect causality between nodes. + +In order to support these snapshot and time-travel reads, multiple versions of any given +row must be stored in the database. To prevent unbounded space usage, the user may configure +a retention period beyond which old transaction records may be GCed (thus preventing any snapshot +reads from earlier than that point in history). +(NOTE: history GC not currently implemented) + +============================================================ +MVCC Mutations in MemRowSet +============================================================ + +In order to support MVCC in the MemRowSet, each row is tagged with the timestamp which +inserted the row. Additionally, the row contains a singly linked list containing any further +mutations that were made to the row after its insertion, each tagged with the mutation's +timestamp: + + + MemRowSet Row ++----------------------------------------------------+ +| insertion timestamp | mutation head | row data... | ++-------------------------|--------------------------+ + | + v First mutation + +-----------------------------------------------+ + | mutation timestamp | next_mut | change record | + +--------------------|--------------------------+ + __________/ + / + | Second mutation + +--------v--------------------------------------+ + | mutation timestamp | next_mut | change record | + +--------------------|--------------------------+ + __________/ + / + ... 
+ + +In traditional database terms, one can think of the mutation list forming a sort of +"REDO log" containing all changes which affect this row. + +Any reader traversing the MemRowSet needs to apply these mutations to read the correct +snapshot of the row, via the following logic: + +- If row.insertion_timestamp is not committed in scanner's MVCC snapshot, skip the row + (it was not yet inserted when the scanner's snapshot was made). +- Otherwise, copy the row data into the output buffer. +- For each mutation in the list: + - if mutation.timestamp is committed in the scanner's MVCC snapshot, apply the change + to the in-memory copy of the row. Otherwise, skip this mutation (it was not yet + mutated at the time of the snapshot). + - if the mutation indicates a DELETE, mark the row as deleted in the output buffer + of the scanner by zeroing its bit in the scanner's selection vector. + +Note that "mutation" in this case can be one of three types: +- UPDATE: changes the value of one or more columns +- DELETE: removes the row from the database +- REINSERT: reinsert the row with a new set of data (only occurs on a MemRowSet row + with a prior DELETE mutation) + +As a concrete example, consider the following sequence on a table with schema +(key STRING, val UINT32): + + INSERT INTO t VALUES ("row", 1); [timestamp 1] + UPDATE t SET val = 2 WHERE key = "row"; [timestamp 2] + DELETE FROM t WHERE key = "row"; [timestamp 3] + INSERT INTO t VALUES ("row", 3); [timestamp 4] + +This would result in the following structure in the MemRowSet: + + +-----------------------------------+ + | tx 1 | mutation head | ("row", 1) | + +----------|------------------------+ + | + | + +---v--------------------------+ + | tx 2 | next ptr | SET val=2 | + +-----------|------------------+ + ______/ + | + +---v-------v----------------+ + | tx 3 | next ptr | DELETE | + +-----------|----------------+ + ______/ + | + +---v------------------------------------+ + | tx 4 | next ptr | REINSERT ("row", 3) 
| + +----------------------------------------+ + + +Note that this has a couple of undesirable properties when update frequency is high: +- readers must chase pointers through a singly linked list, likely causing many CPU cache + misses. +- updates must append to the end of a singly linked list, which is O(n) where 'n' is the + number of times this row has been updated. + +However, we consider the above inefficiencies tolerable given the following assumptions: +- Kudu's target uses cases have a relatively low update rate: we assume that a single row + won't have a high frequency of updates +- Only a very small fraction of the total database will be in the MemRowSet -- once the MemRowSet + reaches some target size threshold, it will flush. So, even if scanning MemRowSet is slow + due to update handling, it will make up only a small percentage of overall query time. + +If it turns out that the above inefficiencies impact real applications, various optimizations +can be applied in the future to reduce the overhead. + +============================================================ +MemRowSet Flushes +============================================================ + +When the MemRowSet fills up, a Flush occurs, which persists the data to disk. + ++------------+ +| MemRowSet | ++------------+ + | + | Flush process writes entries in memory to a new DiskRowSet on disk + v ++--------------+ +--------------+ +--------------+ +| DiskRowSet 0 | | DiskRowSet 1 | .. | DiskRowSet N | ++-------------+- +--------------+ +--------------+ + +When the data is flushed, it is stored as a set of CFiles (see src/kudu/cfile/README). +Each of the rows in the data is addressable by a sequential "rowid", which is +dense, immutable, and unique within this DiskRowSet. For example, if a given +DiskRowSet contains 5 rows, then they will be assigned rowid 0 through 4, in +order of ascending key. Within a different DiskRowSet, there will be different +rows with the same rowids. 
+ +Reads may map between primary keys (user-visible) and rowids (internal) using an index +structure. In the case that the primary key is a simple key, the key structure is +embedded within the primary key column's cfile. Otherwise, a separate index cfile +stores the encoded compound key and provides a similar function. + +NOTE: rowids are not explicitly stored with each row, but rather an implicit +identifier based on the row's ordinal index in the file. Some parts of the source +code refer to rowids as "row indexes" or "ordinal indexes". + +NOTE: other systems such as C-Store call the MemRowSet the +"write optimized store" (WOS), and the on-disk files the "read-optimized store" +(ROS). + +============================================================ +Historical MVCC in DiskRowSets +============================================================ + +In order to continue to provide MVCC for on-disk data, each on-disk RowSet +consists not only of the current columnar data, but also "UNDO" records which +provide the ability to rollback a row's data to an earlier version. + ++--------------+ +-----------+ +| UNDO records | <--- | base data | ++--------------+ +-----------+ +- time of data progresses to the right ---> + +When a user wants to read the most recent version of the data immediately after +a flush, only the base data is required. Because the base data is stored in a +columnar format, this common case is very efficient. If instead, the user wants +to run a time-travel query, the read path consults the UNDO records in order to +roll back the visible data to the earlier point in time. + +When a scanner encounters a row, it processes the MVCC information as follows: + - Read base image of row + - For each UNDO record: + -- If the associated timestamp is NOT committed, execute rollback change. 
    + +For example, recall the series of mutations used in "MVCC Mutations in MemRowSet" above: + + INSERT INTO t VALUES ("row", 1); [timestamp 1] + UPDATE t SET val = 2 WHERE key = "row"; [timestamp 2] + DELETE FROM t WHERE key = "row"; [timestamp 3] + INSERT INTO t VALUES ("row", 3); [timestamp 4] + +When this row is flushed to disk, we store it on disk in the following way: + + Base data: + ("row", 3) + UNDO records (roll-back): + Before Tx 4: DELETE + Before Tx 3: INSERT ("row", 2) + Before Tx 2: SET row=1 + Before Tx 1: DELETE + +Each UNDO record is the inverse of the transaction which triggered it -- for example +the INSERT at transaction 1 turns into a "DELETE" when it is saved as an UNDO record. + +The use of the UNDO record here acts to preserve the insertion timestamp: +queries whose MVCC snapshot indicates Tx 1 is not yet committed will execute +the DELETE "UNDO" record, such that the row is made invisible. + +For example, consider two different example scanners: + + Current time scanner (all txns committed) + ----------------------------------------- + - Read base data + - Since tx 1-4 are committed, ignore all UNDO records + - No REDO records + Result: current row ("row", 3) + + + Scanner as of timestamp 1 + --------------------- + - Read base data. Buffer = ("row", 3) + - Rollback Tx 4: Buffer = <deleted> + - Rollback Tx 3: Buffer = ("row", 2) + - Rollback Tx 2: Buffer = ("row", 1) + Result: ("row", 1) + +Each case processes the correct set of UNDO records to yield the state of the row as of +the desired point of time. + + +The most common case of queries will be running against "current" data. In +that case, we would like to optimize query execution by avoiding the processing of any +UNDO records. To do so, we include file-level metadata indicating +the range of transactions for which UNDO records are present. 
If the scanner's MVCC +snapshot indicates that all of these transactions are already committed, then the set +of deltas may be short circuited, and the query can proceed with no MVCC overhead. + +============================================================ +Handling mutations against on-disk files +============================================================ + +Updates or deletes of already-flushed rows do not go into the MemRowSet. +Instead, the updated key is searched for among all RowSets in order to locate +the unique RowSet which holds this key. This processes first uses an interval +tree to locate a set of candidate rowsets which may contain the key in question. +Following this, we consult a bloom filter for each of those candidates. For +rowsets which pass both checks, we seek the primary key index to determine +the row's rowid within that rowset. + +Once the appropriate RowSet has been determined, the mutation will also +be aware of the key's rowid within the RowSet (as a result of the same +key search which verified that the key is present in the RowSet). The +mutation can then enter an in-memory structure called the DeltaMemStore. + +The DeltaMemStore is an in-memory concurrent BTree keyed by a composite key of the +rowid and the mutating timestamp. At read time, these mutations +are processed in the same manner as the mutations for newly inserted data. + +When the Delta MemStore grows too large, it performs a flush to an +on-disk DeltaFile, and resets itself to become empty: + ++------------+ +---------+ +---------+ +----------------+ +| base data | <--- | delta 0 | <-- | delta N | <-- | delta memstore | ++------------+ +---------+ +---------+ +----------------+ + +The DeltaFiles contain the same type of information as the Delta MemStore, +but compacted to a dense on-disk serialized format. 
Because these delta files +contain records of transactions that need to be re-applied to the base data +in order to bring rows up-to-date, they are called "REDO" files, and the +mutations contained are called "REDO" records. Similar to data resident in the +MemRowSet, REDO mutations need to be applied to read newer versions of the data. + +A given row may have delta information in multiple delta structures. In that +case, the deltas are applied sequentially, with later modifications winning +over earlier modifications. + +Note that the mutation tracking structure for a given row does not +necessarily include the entirety of the row. If only a single column of a row +is updated, then the mutation structure will only include the updated column. +This allows for fast updates of small columns without the overhead of reading +or re-writing larger columns (an advantage compared to the MVCC techniques used +by systems such as C-Store and PostgreSQL). + +============================================================ +Summary of delta file processing +============================================================ + +In summary, each DiskRowSet consists of three logical components: + ++--------------+ +-----------+ +--------------+ +| UNDO records | <--- | base data | ---> | REDO records | ++--------------+ +-----------+ +--------------+ + +Base data: the columnar data for the RowSet, at the time the RowSet was flushed + +UNDO records: historical data which needs to be processed to rollback rows to + points in time prior to the RowSet flush. + +REDO records: data which needs to be processed in order to bring rows up to date + with respect to modifications made after the RowSet was flushed. + +UNDO records and REDO records are stored in the same file format, called a DeltaFile. 
+ +============================================================ +Delta Compactions +============================================================ + +Within a RowSet, reads become less efficient as more mutations accumulate +in the delta tracking structures; in particular, each flushed delta file +will have to be seeked and merged as the base data is read. Additionally, +if a record has been updated many times, many REDO records have to be +applied in order to expose the most current version to a scanner. + +In order to mitigate this and improve read performance, Kudu performs background +processing which transforms a RowSet from inefficient physical layouts to more +efficient ones, while maintaining the same logical contents. These types +of transformations are called "delta compactions". Delta compactions serve +several main goals: + +1) Reduce the number of delta files + + The more delta files that have been flushed for a RowSet, the more separate + files must be read in order to produce the current version of a row. In + workloads that do not fit in RAM, each random read will result in a disk seek + for each of the delta files, causing performance to suffer. + +2) Migrate REDO records to UNDO records + + As described above, a RowSet consists of base data (stored per-column), + a set of "undo" records (to move back in time), and a set of "redo" records + (to move forward in time from the base data). Given that most queries will be + made against the present version of the database, we would like to minimize + the number of REDO records stored. + + At any point, a row's REDO records may be merged into the base data, and + replaced by an equivalent set of UNDO records containing the old versions + of the cells. + +3) Garbage collect old UNDO records. + + UNDO records need to be retained only as far back as a user-configured + historical retention period. Beyond this period, we can remove old "undo" + records to save disk space. 
+ +NOTE: In the BigTable design, timestamps are associated with data, not with changes. +In the Kudu design, timestamps are associated with changes, not with data. After historical +UNDO logs have been removed, there is no remaining record of when any row or +cell was inserted or updated. If users need this functionality, they should +keep their own "inserted_on" timestamp column, as they would in a traditional RDBMS. + +============================================================ +Types of Delta Compaction +============================================================ + +A delta compaction may be classified as either 'minor' or 'major': + +Minor delta compaction: +------------------------ + +A 'minor' compaction is one that does not include the base data. In this +type of compaction, the resulting file is itself a delta file. + ++------------+ +---------+ +---------+ +---------+ +---------+ +| base data | <--- | delta 0 + <-- | delta 1 + <-- | delta 2 + <-- | delta 3 + ++------------+ +---------+ +---------+ +---------+ +---------+ + \_________________________________________/ + files selected for compaction + + =====> + ++------------+ +---------+ +-----------------------+ +| base data | <--- | delta 0 + <-- | delta 1 (old delta 3) + ++------------+ +---------+ +-----------------------+ + \_________/ + compaction result + + +Minor delta compactions serve only goals 1 and 3: because they do not read or re-write +base data, they cannot transform REDO records into UNDO. + +Major delta compaction: +------------------------ + +A 'major' compaction is one that includes the base data along with any number +of delta files. 
+ ++------------+ +---------+ +---------+ +---------+ +---------+ +| base data | <--- | delta 0 + <-- | delta 1 + <-- | delta 2 + <-- | delta 3 + ++------------+ +---------+ +---------+ +---------+ +---------+ +\_____________________________________________/ + files selected for compaction + + =====> + ++------------+ +----------------+ +-----------------------+ +-----------------------+ +| new UNDOs | --> | new base data | <--- | delta 0 (old delta 2) + <-- | delta 1 (old delta 3) + ++------------+ +----------------+ +-----------------------+ +-----------------------+ +\____________________________________/ + compaction result + +Major delta compactions can satisfy all three goals of delta compactions, but cost +more than than minor delta compactions since they must read and re-write the base data, +which is typically larger than the delta data. + +A major delta compaction may be performed against any subset of the columns +in a DiskRowSet -- if only a single column has received a significant number of updates, +then a compaction can be performed which only reads and rewrites that column. It is +assumed that this is a common workload in many EDW-like applications (e.g updating +an `order_status` column in an order table, or a `visit_count` column in a user table). + +Note that both types of delta compactions maintain the row ids within the RowSet: +hence, they can be done entirely in the background with no locking. The resulting +compaction file can be introduced into the RowSet by atomically swapping it with +the compaction inputs. After the swap is complete, the pre-compaction files may +be removed. + +============================================================ +Merging compactions +============================================================ + +As more data is inserted into a tablet, more and more DiskRowSets will accumulate. 
+This can hurt performance for the following cases: + +a) Random access (get or update a single row by primary key) + +In this case, each RowSet whose key range includes the probe key must be individually consulted to +locate the specified key. Bloom filters can mitigate the number of physical seeks, but extra bloom +filter accesses can impact CPU and also increase memory usage. + +b) Scan with specified range (eg scan where primary key between 'A' and 'B') + +In this case, each RowSet with an overlapping key range must be individually seeked, regardless of +bloom filters. Specialized index structures might be able to assist, here, but again at the cost of +memory, etc. + +c) Sorted scans + +If the user query requires that the scan result be yielded in primary-key-sorted +order, then the results must be passed through a merge process. Merging is typically +logarithmic in the number of inputs: as the number of inputs grows higher, the merge +becomes more expensive. + +Given the above, it is desirable to merge RowSets together to reduce the number of +RowSets: + ++------------+ +| RowSet 0 | ++------------+ + ++------------+ \ +| RowSet 1 | | ++------------+ | + | ++------------+ | +--------------+ +| RowSet 2 | |===> RowSet compaction ===> | new RowSet 1 | ++------------+ | +--------------+ + | ++------------+ | +| RowSet 3 | | ++------------+ / + + +Unlike Delta Compactions described above, note that row ids are _not_ maintained +in a Merging Compaction. This makes the handling of concurrent mutations a somewhat +intricate dance. This process is described in more detail in 'compaction.txt' in this +directory. + +============================================================ +Overall picture +============================================================ + +Go go gadget ASCII art! 
+ ++-----------+ +| MemRowSet | ++-----------+ + | + | flush: creates a new DiskRowSet 0 + v ++---------------+ +| DiskRowSet 0 | ++---------------+ + +DiskRowSet 1: ++---------+ +------------+ +---------+ +---------+ +---------+ +---------+ +| UNDOs 0 | --> | base data | <--- | REDOs 0 | <-- | REDOS 1 | <-- | REDOs 2 | <-- | REDOs 3 | ++---------+ +------------+ +---------+ +---------+ +---------+ +---------+ +\____________________________________________________________/ + | major compaction + v + ++---------+ +------------+ +---------+ +---------+ +| UNDOs 0'| --> | base data' | <--- | REDOs 2 | <-- | REDOs 3 | ++---------+ +------------+ +---------+ +---------+ +\____________________________/ + compaction result + + +DiskRowSet 2: ++---------+ +------------+ +---------+ +---------+ +---------+ +---------+ +| UNDOs 0 | --> | base data | <--- | REDOs 0 | <-- | REDOS 1 | <-- | REDOs 2 | <-- | REDOs 3 | ++---------+ +------------+ +---------+ +---------+ +---------+ +---------+ + \_________________________/ + | minor compaction + v ++---------+ +------------+ +---------+ +---------+ +---------+ +| UNDOs 0 | --> | base data | <--- | REDOS 0'| <-- | REDOs 2 | <-- | REDOs 3 | ++---------+ +------------+ +---------+ +---------+ +---------+ + \_________/ + compaction result + ++-----------------+ \ +| DiskRowSet 3 | | ++-----------------+ | + | ++-----------------+ | +----------------+ +| DiskRowSet 4 | |===> Merging compaction ===> | new DiskRowSet | ++-----------------+ | +----------------+ + | ++-----------------+ | +| DiskRowSet 5 | | ++-----------------+ / + + +============================================================ +Comparison to BigTable approach +============================================================ + +This design differs from the approach used in BigTable in a few key ways: + +1) A given key is only present in at most one RowSet in the tablet. + +In BigTable, a key may be present in several different SSTables. 
An entire +Tablet in BigTable looks more like the RowSet in Kudu -- any read of a key +must merge together data found in all of the SSTables, just like a single +row lookup in Kudu must merge together the base data with all of the DeltaFiles. + +The advantage of the Kudu approach is that, when reading a row, or servicing a query +for which sort-order is not important, no merge is required. For example, +an aggregate over a range of keys can individually scan each RowSet (even +in parallel) and then sum the results, since the order in which keys are +presented is not important. Similarly, selects without an explicit +'ORDER BY primary_key' specification do not need to conduct a merge. +It's obvious why this can result in more efficient scanning. + +The disadvantage here is that, unlike BigTable, inserts and mutations +are distinct operations: inserts must go into the MemRowSet, whereas +mutations (delete/update) must go into the DeltaMemStore in the specific RowSet +containing that key. This has performance impacts as follows: + + a) Inserts must determine that they are in fact new keys. + + This results in a bloom filter query against all present RowSets. If + any RowSet indicates a possible match, then a seek must be performed + against the key column(s) to determine whether it is in fact an + insert or update. + + It is assumed that, so long as the number of RowSets is small, and the + bloom filters accurate enough, the vast majority of inserts will not + require any physical disk seeks. Additionally, if the key pattern + for inserts is locally sequential (eg '_' in a time-series + application), then the blocks corresponding to those keys are likely to + be kept in the data block cache due to their frequent usage. + + b) Updates must determine which RowSet they correspond to. + + Similar to above, this results in a bloom filter query against + all RowSets, as well as a primary key lookup against any matching RowSets. 
+ +One advantage to this difference is that the semantics are more familiar to +users who are accustomed to RDBMS systems where an INSERT of a duplicate +primary key gives a Primary Key Violation error rather than replacing the +existing row. Similarly, an UPDATE of a row which does not exist can give +a key violation error, indicating that no rows were updated. These semantics +are not generally provided by BigTable-like systems. + +2) Mutation applications of data on disk are performed on numeric rowids rather than + arbitrary keys. + +In order to reconcile a key on disk with its potentially-mutated form, +BigTable performs a merge based on the row's key. These keys may be arbitrarily +long strings, so comparison can be expensive. Additionally, even if the +key column is not needed to service a query (e.g an aggregate computation), +the key column must be read off disk and processed, which causes extra IO. +Given that composite keys are often used in BigTable applications, the key size +may dwarf the size of the column of interest by an order of magnitude, especially +if the queried column is stored in a dense encoding. + +In contrast, mutations in Kudu are stored by rowid. So, merges can proceed +much more efficiently by maintaining counters: given the next mutation to apply, +we can simply subtract to find how many rows of unmutated base data may be passed +through unmodified. Alternatively, direct addressing can be used to efficiently +"patch" entire blocks of base data given a set of mutations. + +Additionally, if the key is not needed in the query results, the query plan +need not consult the key except perhaps to determine scan boundaries. + +As an example, consider the query: + > SELECT SUM(cpu_usage) FROM timeseries WHERE machine = 'foo.cloudera.com' + AND unix_time BETWEEN 1349658729 AND 1352250720; + ... 
given a composite primary key (host, unix_time) + +This may be evaluated in Kudu with the following pseudo-code: + sum = 0 + foreach RowSet: + start_rowid = rowset.lookup_key(1349658729) + end_rowid = rowset.lookup_key(1352250720) + iter = rowset.new_iterator("cpu_usage") + iter.seek(start_rowid) + remaining = end_rowid - start_rowid + while remaining > 0: + block = iter.fetch_upto(remaining) + sum += sum(block) + +The fetching of blocks can be done very efficiently since the application +of any potential mutations can simply index into the block and replace +any mutated values with their new data. + +3) timestamps are not part of the data model + +In BigTable-like systems, the timestamp of each cell is exposed to the user, and +essentially forms the last element of a composite row key. This means that it is +efficient to directly access some particular version of a cell, and store entire +time series as many different versions of a single cell. This is not efficient +in Kudu -- timestamps should be considered an implementation detail used for MVCC, +not another dimension in the row key. Instead, Kudu provides native composite row keys +which can be useful for time series. + + +============================================================ +Comparing the MVCC implementation to other databases +============================================================ + +C-Store/Vertica +---------- +C-Store provides MVCC by adding two extra columns to each table: an insertion epoch +and a deletion epoch. Epochs in Vertica are essentially equivalent to timestamps in +Kudu. When a row is inserted, the transaction's epoch is written in the row's epoch +column. The deletion epoch column is initially NULL. When a row is deleted, the epoch +of the deletion transaction is written into that column. As a scanner iterates over +the table, it only includes rows where the insertion epoch is committed and the +deletion epoch is either NULL or uncommitted. 
+ +Updates in Vertica are always implemented as a transactional DELETE followed by a +re-INSERT. So, the old version of the row has the update's epoch as its deletion epoch, +and the new version of the row has the update's epoch as its insertion epoch. + +This has the downside that even updates of one small column must read all of the columns +for that row, incurring many seeks and additional IO overhead for logging the re-insertion. +Additionally, while both versions of the row need to be retained, the space usage of the +row has been doubled. If a row is being frequently updated, then the space usage will +increase significantly, even if only a single column of the row has been changed. + +In contrast, Kudu does not need to read the other columns, and only needs to re-store +the columns which have changed, which should yield much improved UPDATE throughput +for online applications. + +References: + - http://vertica-forums.com/viewtopic.php?f=48&t=345&start=10 + - http://vldb.org/pvldb/vol5/p1790_andrewlamb_vldb2012.pdf + + +PostgreSQL +---------- +PostgreSQL's MVCC implementation is very similar to Vertica's. Each tuple has an associated +"xmin" and "xmax" column. "xmin" contains the timestamp when the row was inserted, and "xmax" +contains the timestamp when the row was deleted or updated. + +PostgreSQL has the same downsides as C-Store in that a frequently updated row will end up +replicated many times in the tablespace, taking up extra storage and IO. The overhead is not +as bad, though, since Postgres is a row-store, and thus re-reading all of the N columns for an +update does not incur N separate seeks. + +References: + - postgres source code + - http://www.packtpub.com/article/transaction-model-of-postgresql + +Oracle Database +--------------- +Oracle's MVCC and time-travel implementations are somewhat similar to +Kudu's. Its MVCC operates on physical blocks rather than records. 
Whenever a +block is modified, it is modified in place and a compensating UNDO record is +written to a Rollback Segment (RBS) in the transaction log. The block header is +then modified to point to the Rollback Segment which contains the UNDO record. + +When readers read a block, the read path looks at the data block header to +determine if rollback is required. If so, it reads the associated rollback +segment to apply UNDO logs. + +This has the downside that the rollback segments are allocated based on the +order of transaction commit, and thus are not likely to be sequentially laid out +with regard to the order of rows being read. So, scanning through a table in a +time travel query may require a random access to retrieve associated UNDO logs +for each block, whereas in Kudu, the undo logs have been sorted and organized by +row-id. + +NOTE: the above is very simplified, but the overall idea is correct. + +References: + - http://asktom.oracle.com/pls/asktom/f?p=100:11:0::::P11_QUESTION_ID:275215756923 diff --git a/src/kudu/tablet/cbtree-test.cc b/src/kudu/tablet/cbtree-test.cc new file mode 100644 index 000000000000..0f42e1bab249 --- /dev/null +++ b/src/kudu/tablet/cbtree-test.cc @@ -0,0 +1,782 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/tablet/concurrent_btree.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/memory/memory.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tablet { +namespace btree { + +using boost::unordered_set; + +class TestCBTree : public KuduTest { + protected: + template + InsertStatus InsertInLeaf(LeafNode *l, ThreadSafeArena *arena, + const Slice &k, const Slice &v) { + PreparedMutation pm(k); + pm.arena_ = arena; + + // Must lock the node even in the single threaded test + // to avoid firing the debug assertions. + l->Lock(); + l->SetInserting(); + l->PrepareMutation(&pm); + InsertStatus ret = l->Insert(&pm, v); + l->Unlock(); + return ret; + } + + void DoBigKVTest(size_t key_size, size_t val_size) { + ThreadSafeArena arena(1024, 1024); + + char kbuf[key_size]; + char vbuf[val_size]; + OverwriteWithPattern(kbuf, key_size, "KEY"); + OverwriteWithPattern(vbuf, key_size, "VAL"); + Slice key(kbuf, key_size); + Slice val(vbuf, val_size); + + LeafNode lnode(false); + ASSERT_EQ(INSERT_SUCCESS, + InsertInLeaf(&lnode, &arena, key, val)); + } + + template + void DoTestConcurrentInsert(); + +}; + +// Ensure that the template magic to make the nodes sized +// as we expect is working. +// The nodes may come in slightly smaller than the requested size, +// but should not be any larger. 
+TEST_F(TestCBTree, TestNodeSizes) { + ThreadSafeArena arena(1024, 1024); + + LeafNode lnode(false); + ASSERT_LE(sizeof(lnode), BTreeTraits::leaf_node_size); + + InternalNode inode(Slice("split"), &lnode, &lnode, &arena); + ASSERT_LE(sizeof(inode), BTreeTraits::internal_node_size); + +} + +TEST_F(TestCBTree, TestLeafNode) { + LeafNode lnode(false); + ThreadSafeArena arena(1024, 1024); + + Slice k1("key1"); + Slice v1("val1"); + ASSERT_EQ(INSERT_SUCCESS, + InsertInLeaf(&lnode, &arena, k1, v1)); + ASSERT_EQ(INSERT_DUPLICATE, + InsertInLeaf(&lnode, &arena, k1, v1)); + + // Insert another entry after first + Slice k2("key2"); + Slice v2("val2"); + ASSERT_EQ(INSERT_SUCCESS, InsertInLeaf(&lnode, &arena, k2, v2)); + ASSERT_EQ(INSERT_DUPLICATE, InsertInLeaf(&lnode, &arena, k2, v2)); + + // Another entry before first + Slice k0("key0"); + Slice v0("val0"); + ASSERT_EQ(INSERT_SUCCESS, InsertInLeaf(&lnode, &arena, k0, v0)); + ASSERT_EQ(INSERT_DUPLICATE, InsertInLeaf(&lnode, &arena, k0, v0)); + + // Another entry in the middle + Slice k15("key1.5"); + Slice v15("val1.5"); + ASSERT_EQ(INSERT_SUCCESS, InsertInLeaf(&lnode, &arena, k15, v15)); + ASSERT_EQ(INSERT_DUPLICATE, InsertInLeaf(&lnode, &arena, k15, v15)); + ASSERT_EQ("[key0=val0], [key1=val1], [key1.5=val1.5], [key2=val2]", + lnode.ToString()); + + // Add entries until it is full + int i; + bool full = false; + for (i = 0; i < 1000 && !full; i++) { + char buf[64]; + snprintf(buf, sizeof(buf), "filler_key_%d", i); + switch (InsertInLeaf(&lnode, &arena, Slice(buf), Slice("data"))) { + case INSERT_SUCCESS: + continue; + case INSERT_DUPLICATE: + FAIL() << "Unexpected INSERT_DUPLICATE for " << buf; + break; + case INSERT_FULL: + full = true; + break; + default: + FAIL() << "unexpected result"; + } + } + ASSERT_LT(i, 1000) << "should have filled up node before 1000 entries"; +} + +// Directly test leaf node with keys and values which are large (such that +// only zero or one would fit in the actual allocated space) 
+TEST_F(TestCBTree, TestLeafNodeBigKVs) { + LeafNode lnode(false); + + DoBigKVTest(1000, 1000); +} + +// Setup the tree to fanout quicker, so we test internal node +// splitting, etc. +struct SmallFanoutTraits : public BTreeTraits { + + static const size_t internal_node_size = 84; + static const size_t leaf_node_size = 92; +}; + +// Enables yield() calls at interesting points of the btree +// implementation to ensure that we are still correct even +// with adversarial scheduling. +struct RacyTraits : public SmallFanoutTraits { + static const size_t debug_raciness = 100; +}; + +void MakeKey(char *kbuf, size_t len, int i) { + snprintf(kbuf, len, "key_%d%d", i % 10, i / 10); +} + +template +void VerifyEntry(CBTree *tree, int i) { + char kbuf[64]; + char vbuf[64]; + char vbuf_out[64]; + + MakeKey(kbuf, sizeof(kbuf), i); + snprintf(vbuf, sizeof(vbuf), "val_%d", i); + + size_t len = sizeof(vbuf_out); + ASSERT_EQ(CBTree::GET_SUCCESS, + tree->GetCopy(Slice(kbuf), vbuf_out, &len)) + << "Failed to verify entry " << kbuf; + ASSERT_EQ(string(vbuf, len), string(vbuf_out, len)); +} + + +template +void InsertRange(CBTree *tree, + int start_idx, + int end_idx) { + char kbuf[64]; + char vbuf[64]; + for (int i = start_idx; i < end_idx; i++) { + MakeKey(kbuf, sizeof(kbuf), i); + snprintf(vbuf, sizeof(vbuf), "val_%d", i); + if (!tree->Insert(Slice(kbuf), Slice(vbuf))) { + FAIL() << "Failed insert at iteration " << i; + } + + /* + int to_verify = start_idx + (rand() % (i - start_idx + 1)); + CHECK_LE(to_verify, i); + VerifyEntry(tree, to_verify); + */ + } +} + +template +void VerifyGet(const CBTree &tree, + Slice key, + Slice expected_val) { + char vbuf[64]; + size_t len = sizeof(vbuf); + ASSERT_EQ(CBTree::GET_SUCCESS, + tree.GetCopy(key, vbuf, &len)) + << "Failed on key " << HexDump(key); + + Slice got_val(vbuf, len); + ASSERT_EQ(0, expected_val.compare(got_val)) + << "Failure!\n" + << "Expected: " << HexDump(expected_val) + << "Got: " << HexDump(got_val); +} + +template +void 
VerifyRange(const CBTree &tree, + int start_idx, + int end_idx) { + char kbuf[64]; + char vbuf[64]; + for (int i = start_idx; i < end_idx; i++) { + MakeKey(kbuf, sizeof(kbuf), i); + snprintf(vbuf, sizeof(vbuf), "val_%d", i); + + VerifyGet(tree, Slice(kbuf), Slice(vbuf)); + } +} + + +// Function which inserts a range of keys formatted key_ +// into the given tree, then verifies that they are all +// inserted properly +template +void InsertAndVerify(boost::barrier *go_barrier, + boost::barrier *done_barrier, + gscoped_ptr > *tree, + int start_idx, + int end_idx) { + while (true) { + go_barrier->wait(); + + if (tree->get() == nullptr) return; + + InsertRange(tree->get(), start_idx, end_idx); + VerifyRange(*tree->get(), start_idx, end_idx); + + done_barrier->wait(); + } +} + + +TEST_F(TestCBTree, TestInsertAndVerify) { + CBTree t; + char kbuf[64]; + char vbuf[64]; + + int n_keys = 10000; + + for (int i = 0; i < n_keys; i++) { + snprintf(kbuf, sizeof(kbuf), "key_%d", i); + snprintf(vbuf, sizeof(vbuf), "val_%d", i); + if (!t.Insert(Slice(kbuf), Slice(vbuf))) { + FAIL() << "Failed insert at iteration " << i; + } + } + + + for (int i = 0; i < n_keys; i++) { + snprintf(kbuf, sizeof(kbuf), "key_%d", i); + + // Try to insert with a different value, to ensure that on failure + // it doesn't accidentally replace the old value anyway. + snprintf(vbuf, sizeof(vbuf), "xxx_%d", i); + if (t.Insert(Slice(kbuf), Slice(vbuf))) { + FAIL() << "Allowed duplicate insert at iteration " << i; + } + + // Do a Get() and check that the real value is still accessible. 
+ snprintf(vbuf, sizeof(vbuf), "val_%d", i); + VerifyGet(t, Slice(kbuf), Slice(vbuf)); + } +} + +template +static void InsertRandomKeys(TREE *t, int n_keys, + COLLECTION *inserted) { + char kbuf[64]; + char vbuf[64]; + int i = 0; + while (inserted->size() < n_keys) { + int key = rand(); + memcpy(kbuf, &key, sizeof(key)); + snprintf(vbuf, sizeof(vbuf), "val_%d", i); + t->Insert(Slice(kbuf, sizeof(key)), Slice(vbuf)); + inserted->insert(key); + i++; + } +} + +// Similar to above, but inserts in random order +TEST_F(TestCBTree, TestInsertAndVerifyRandom) { + CBTree t; + char kbuf[64]; + char vbuf_out[64]; + + int n_keys = 1000; + if (AllowSlowTests()) { + n_keys = 100000; + } + + unordered_set inserted(n_keys); + + InsertRandomKeys(&t, n_keys, &inserted); + + + for (int key : inserted) { + memcpy(kbuf, &key, sizeof(key)); + + // Do a Get() and check that the real value is still accessible. + size_t len = sizeof(vbuf_out); + ASSERT_EQ(CBTree::GET_SUCCESS, + t.GetCopy(Slice(kbuf, sizeof(key)), vbuf_out, &len)); + } +} + +// Thread which cycles through doing the following: +// - lock the node +// - either mark it splitting or inserting (alternatingly) +// - unlock it +void LockCycleThread(AtomicVersion *v, int count_split, int count_insert) { + int i = 0; + while (count_split > 0 || count_insert > 0) { + i++; + VersionField::Lock(v); + if (i % 2 && count_split > 0) { + VersionField::SetSplitting(v); + count_split--; + } else { + VersionField::SetInserting(v); + count_insert--; + } + VersionField::Unlock(v); + } +} + +// Single-threaded test case which verifies the correct behavior of +// VersionField. 
+TEST_F(TestCBTree, TestVersionLockSimple) { + AtomicVersion v = 0; + VersionField::Lock(&v); + ASSERT_EQ(1L << 63, v); + VersionField::Unlock(&v); + ASSERT_EQ(0, v); + + VersionField::Lock(&v); + VersionField::SetSplitting(&v); + VersionField::Unlock(&v); + + ASSERT_EQ(0, VersionField::GetVInsert(v)); + ASSERT_EQ(1, VersionField::GetVSplit(v)); + + VersionField::Lock(&v); + VersionField::SetInserting(&v); + VersionField::Unlock(&v); + ASSERT_EQ(1, VersionField::GetVInsert(v)); + ASSERT_EQ(1, VersionField::GetVSplit(v)); + +} + +// Multi-threaded test case which spawns several threads, each of which +// locks and unlocks a version field a predetermined number of times. +// Verifies that the counters are correct at the end. +TEST_F(TestCBTree, TestVersionLockConcurrent) { + boost::ptr_vector threads; + int num_threads = 4; + int split_per_thread = 2348; + int insert_per_thread = 8327; + + AtomicVersion v = 0; + + for (int i = 0; i < num_threads; i++) { + threads.push_back(new boost::thread( + LockCycleThread, &v, split_per_thread, insert_per_thread)); + } + + for (boost::thread &thr : threads) { + thr.join(); + } + + + ASSERT_EQ(split_per_thread * num_threads, + VersionField::GetVSplit(v)); + ASSERT_EQ(insert_per_thread * num_threads, + VersionField::GetVInsert(v)); +} + +// Test that the tree holds up properly under a concurrent insert workload. +// Each thread inserts a number of elements and then verifies that it can +// read them back. +TEST_F(TestCBTree, TestConcurrentInsert) { + DoTestConcurrentInsert(); +} + +// Same, but with a tree that tries to provoke race conditions. 
+TEST_F(TestCBTree, TestRacyConcurrentInsert) { + DoTestConcurrentInsert(); +} + +template +void TestCBTree::DoTestConcurrentInsert() { + gscoped_ptr > tree; + + int num_threads = 16; + int ins_per_thread = 30; +#ifdef NDEBUG + int n_trials = 600; +#else + int n_trials = 30; +#endif + + boost::ptr_vector threads; + boost::barrier go_barrier(num_threads + 1); + boost::barrier done_barrier(num_threads + 1); + + + for (int i = 0; i < num_threads; i++) { + threads.push_back(new boost::thread( + InsertAndVerify, + &go_barrier, + &done_barrier, + &tree, + ins_per_thread * i, + ins_per_thread * (i + 1))); + } + + + // Rather than running one long trial, better to run + // a bunch of short trials, so that the threads contend a lot + // more on a smaller tree. As the tree gets larger, contention + // on areas of the key space diminishes. + + for (int trial = 0; trial < n_trials; trial++) { + tree.reset(new CBTree()); + go_barrier.wait(); + + done_barrier.wait(); + + if (::testing::Test::HasFatalFailure()) { + tree->DebugPrint(); + return; + } + } + + tree.reset(nullptr); + go_barrier.wait(); + + for (boost::thread &thr : threads) { + thr.join(); + } +} + +TEST_F(TestCBTree, TestIterator) { + CBTree t; + + int n_keys = 100000; + unordered_set inserted(n_keys); + InsertRandomKeys(&t, n_keys, &inserted); + + // now iterate through, making sure we saw all + // the keys that were inserted + LOG_TIMING(INFO, "Iterating") { + gscoped_ptr > iter( + t.NewIterator()); + bool exact; + ASSERT_TRUE(iter->SeekAtOrAfter(Slice(""), &exact)); + int count = 0; + while (iter->IsValid()) { + Slice k, v; + iter->GetCurrentEntry(&k, &v); + + int k_int; + CHECK_EQ(sizeof(k_int), k.size()); + memcpy(&k_int, k.data(), k.size()); + + bool removed = inserted.erase(k_int); + if (!removed) { + FAIL() << "Iterator saw entry " << k_int << " but not inserted"; + } + count++; + iter->Next(); + } + + ASSERT_EQ(n_keys, count); + ASSERT_EQ(0, inserted.size()) << "Some entries were not seen by iterator"; + } 
+} + +// Test the limited "Rewind" functionality within a given leaf node. +TEST_F(TestCBTree, TestIteratorRewind) { + CBTree t; + + ASSERT_TRUE(t.Insert(Slice("key1"), Slice("val"))); + ASSERT_TRUE(t.Insert(Slice("key2"), Slice("val"))); + ASSERT_TRUE(t.Insert(Slice("key3"), Slice("val"))); + + gscoped_ptr > iter(t.NewIterator()); + bool exact; + ASSERT_TRUE(iter->SeekAtOrAfter(Slice(""), &exact)); + + Slice k, v; + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key1", k.ToString()); + ASSERT_EQ(0, iter->index_in_leaf()); + ASSERT_EQ(3, iter->remaining_in_leaf()); + ASSERT_TRUE(iter->Next()); + + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key2", k.ToString()); + ASSERT_EQ(1, iter->index_in_leaf()); + ASSERT_EQ(2, iter->remaining_in_leaf()); + ASSERT_TRUE(iter->Next()); + + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key3", k.ToString()); + ASSERT_EQ(2, iter->index_in_leaf()); + ASSERT_EQ(1, iter->remaining_in_leaf()); + + // Rewind to beginning of leaf. + iter->RewindToIndexInLeaf(0); + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key1", k.ToString()); + ASSERT_EQ(0, iter->index_in_leaf()); + ASSERT_EQ(3, iter->remaining_in_leaf()); + ASSERT_TRUE(iter->Next()); + + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key2", k.ToString()); + ASSERT_EQ(1, iter->index_in_leaf()); + ASSERT_EQ(2, iter->remaining_in_leaf()); + ASSERT_TRUE(iter->Next()); +} + +TEST_F(TestCBTree, TestIteratorSeekOnEmptyTree) { + CBTree t; + + gscoped_ptr > iter( + t.NewIterator()); + bool exact = true; + ASSERT_FALSE(iter->SeekAtOrAfter(Slice(""), &exact)); + ASSERT_FALSE(exact); + ASSERT_FALSE(iter->IsValid()); +} + +// Test seeking to exactly the first and last key, as well +// as the boundary conditions (before first and after last) +TEST_F(TestCBTree, TestIteratorSeekConditions) { + CBTree t; + + ASSERT_TRUE(t.Insert(Slice("key1"), Slice("val"))); + ASSERT_TRUE(t.Insert(Slice("key2"), Slice("val"))); + ASSERT_TRUE(t.Insert(Slice("key3"), Slice("val"))); + + // Seek to before first key should 
successfully reach first key + { + gscoped_ptr > iter( + t.NewIterator()); + + bool exact; + ASSERT_TRUE(iter->SeekAtOrAfter(Slice("key0"), &exact)); + ASSERT_FALSE(exact); + + ASSERT_TRUE(iter->IsValid()); + Slice k, v; + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key1", k.ToString()); + } + + // Seek to exactly first key should successfully reach first key + // and set exact = true + { + gscoped_ptr > iter( + t.NewIterator()); + + bool exact; + ASSERT_TRUE(iter->SeekAtOrAfter(Slice("key1"), &exact)); + ASSERT_TRUE(exact); + + ASSERT_TRUE(iter->IsValid()); + Slice k, v; + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key1", k.ToString()); + } + + // Seek to exactly last key should successfully reach last key + // and set exact = true + { + gscoped_ptr > iter( + t.NewIterator()); + + bool exact; + ASSERT_TRUE(iter->SeekAtOrAfter(Slice("key3"), &exact)); + ASSERT_TRUE(exact); + + ASSERT_TRUE(iter->IsValid()); + Slice k, v; + iter->GetCurrentEntry(&k, &v); + ASSERT_EQ("key3", k.ToString()); + ASSERT_FALSE(iter->Next()); + } + + // Seek to after last key should fail. + { + gscoped_ptr > iter( + t.NewIterator()); + + bool exact; + ASSERT_FALSE(iter->SeekAtOrAfter(Slice("key4"), &exact)); + ASSERT_FALSE(exact); + ASSERT_FALSE(iter->IsValid()); + } +} + +// Thread which scans through the entirety of the tree verifying +// that results are returned in-order. The scan is performed in a loop +// until tree->get() == NULL. +// go_barrier: waits on this barrier to start running +// done_barrier: waits on this barrier once finished. 
+template +static void ScanThread(boost::barrier *go_barrier, + boost::barrier *done_barrier, + gscoped_ptr > *tree) { + while (true) { + go_barrier->wait(); + if (tree->get() == nullptr) return; + + int prev_count = 0; + int count = 0; + do { + prev_count = count; + count = 0; + + faststring prev_key; + + gscoped_ptr > iter((*tree)->NewIterator()); + bool exact; + iter->SeekAtOrAfter(Slice(""), &exact); + while (iter->IsValid()) { + count++; + Slice k, v; + iter->GetCurrentEntry(&k, &v); + + if (k.compare(Slice(prev_key)) <= 0) { + FAIL() << "prev key " << Slice(prev_key).ToString() << + " wasn't less than cur key " << k.ToString(); + } + prev_key.assign_copy(k.data(), k.size()); + + iter->Next(); + } + ASSERT_GE(count, prev_count); + } while (count != prev_count || count == 0); + + done_barrier->wait(); + } +} + +// Thread which starts a number of threads to insert data while +// other threads repeatedly scan and verify that the results come back +// in order. +TEST_F(TestCBTree, TestConcurrentIterateAndInsert) { + gscoped_ptr > tree; + + int num_ins_threads = 4; + int num_scan_threads = 4; + int num_threads = num_ins_threads + num_scan_threads; + int ins_per_thread = 1000; + int trials = 2; + + if (AllowSlowTests()) { + ins_per_thread = 30000; + } + + boost::ptr_vector threads; + boost::barrier go_barrier(num_threads + 1); + boost::barrier done_barrier(num_threads + 1); + + for (int i = 0; i < num_ins_threads; i++) { + threads.push_back(new boost::thread( + InsertAndVerify, + &go_barrier, + &done_barrier, + &tree, + ins_per_thread * i, + ins_per_thread * (i + 1))); + } + for (int i = 0; i < num_scan_threads; i++) { + threads.push_back(new boost::thread( + ScanThread, + &go_barrier, + &done_barrier, + &tree)); + } + + + // Rather than running one long trial, better to run + // a bunch of short trials, so that the threads contend a lot + // more on a smaller tree. As the tree gets larger, contention + // on areas of the key space diminishes. 
+ for (int trial = 0; trial < trials; trial++) { + tree.reset(new CBTree()); + go_barrier.wait(); + + done_barrier.wait(); + + if (::testing::Test::HasFatalFailure()) { + tree->DebugPrint(); + return; + } + } + + tree.reset(nullptr); + go_barrier.wait(); + + for (boost::thread &thr : threads) { + thr.join(); + } +} + +// Check the performance of scanning through a large tree. +TEST_F(TestCBTree, TestScanPerformance) { + CBTree tree; +#ifndef NDEBUG + int n_keys = 10000; +#else + int n_keys = 1000000; +#endif + if (AllowSlowTests()) { + n_keys = 4000000; + } + LOG_TIMING(INFO, StringPrintf("Insert %d keys", n_keys)) { + InsertRange(&tree, 0, n_keys); + } + + for (int freeze = 0; freeze <= 1; freeze++) { + if (freeze) { + tree.Freeze(); + } + int scan_trials = 10; + LOG_TIMING(INFO, StringPrintf("Scan %d keys %d times (%s)", + n_keys, scan_trials, + freeze ? "frozen" : "not frozen")) { + for (int i = 0; i < 10; i++) { + gscoped_ptr > iter( + tree.NewIterator()); + bool exact; + iter->SeekAtOrAfter(Slice(""), &exact); + int count = 0; + while (iter->IsValid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, n_keys); + } + } + } +} + +} // namespace btree +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/cfile_set-test.cc b/src/kudu/tablet/cfile_set-test.cc new file mode 100644 index 000000000000..fc62d30530b8 --- /dev/null +++ b/src/kudu/tablet/cfile_set-test.cc @@ -0,0 +1,316 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/tablet/diskrowset-test-base.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/test_util.h" + +DECLARE_int32(cfile_default_block_size); + +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +class TestCFileSet : public KuduRowSetTest { + public: + TestCFileSet() : + KuduRowSetTest(Schema({ ColumnSchema("c0", UINT32), + ColumnSchema("c1", UINT32, false, nullptr, nullptr, GetRLEStorage()), + ColumnSchema("c2", UINT32) }, 1)) + {} + + virtual void SetUp() OVERRIDE { + KuduRowSetTest::SetUp(); + + // Use a small cfile block size, so that when we skip materializing a given + // column for 10,000 rows, it can actually skip over a number of blocks. + FLAGS_cfile_default_block_size = 512; + } + + // Write out a test rowset with two int columns. + // The first column contains the row index * 2. + // The second contains the row index * 10. + // The third column contains index * 100, but is never read. 
+ void WriteTestRowSet(int nrows) { + DiskRowSetWriter rsw(rowset_meta_.get(), &schema_, + BloomFilterSizing::BySizeAndFPRate(32*1024, 0.01f)); + + ASSERT_OK(rsw.Open()); + + RowBuilder rb(schema_); + for (int i = 0; i < nrows; i++) { + rb.Reset(); + rb.AddUint32(i * 2); + rb.AddUint32(i * 10); + rb.AddUint32(i * 100); + ASSERT_OK_FAST(WriteRow(rb.data(), &rsw)); + } + ASSERT_OK(rsw.Finish()); + } + + // Issue a range scan between 'lower' and 'upper', and verify that all result + // rows indeed fall inside that predicate. + void DoTestRangeScan(const shared_ptr &fileset, + uint32_t lower, + uint32_t upper) { + // Create iterator. + shared_ptr cfile_iter(fileset->NewIterator(&schema_)); + gscoped_ptr iter(new MaterializingIterator(cfile_iter)); + + // Create a scan with a range predicate on the key column. + ScanSpec spec; + ColumnRangePredicate pred1( + schema_.column(0), + lower != kNoBound ? &lower : nullptr, + upper != kNoBound ? &upper : nullptr); + spec.AddPredicate(pred1); + ASSERT_OK(iter->Init(&spec)); + + // Check that the range was respected on all the results. 
+ Arena arena(1024, 1024); + RowBlock block(schema_, 100, &arena); + while (iter->HasNext()) { + ASSERT_OK_FAST(iter->NextBlock(&block)); + for (size_t i = 0; i < block.nrows(); i++) { + if (block.selection_vector()->IsRowSelected(i)) { + RowBlockRow row = block.row(i); + if ((lower != kNoBound && *schema_.ExtractColumnFromRow(row, 0) < lower) || + (upper != kNoBound && *schema_.ExtractColumnFromRow(row, 0) > upper)) { + FAIL() << "Row " << schema_.DebugRow(row) << " should not have " + << "passed predicate " << pred1.ToString(); + } + } + } + } + } + + private: + ColumnStorageAttributes GetRLEStorage() const { + ColumnStorageAttributes attr; + attr.encoding = RLE; + return attr; + } + + protected: + static const uint32_t kNoBound; + google::FlagSaver saver; +}; + +const uint32_t TestCFileSet::kNoBound = kuint32max; + +TEST_F(TestCFileSet, TestPartiallyMaterialize) { + const int kCycleInterval = 10000; + const int kNumRows = 100000; + WriteTestRowSet(kNumRows); + + shared_ptr fileset(new CFileSet(rowset_meta_)); + ASSERT_OK(fileset->Open()); + + gscoped_ptr iter(fileset->NewIterator(&schema_)); + ASSERT_OK(iter->Init(nullptr)); + + Arena arena(4096, 1024*1024); + RowBlock block(schema_, 100, &arena); + rowid_t row_idx = 0; + while (iter->HasNext()) { + arena.Reset(); + + size_t n = block.nrows(); + ASSERT_OK_FAST(iter->PrepareBatch(&n)); + block.Resize(n); + + // Cycle between: + // 0: materializing just column 0 + // 1: materializing just column 1 + // 2: materializing both column 0 and 1 + // NOTE: column 2 ("c2") is never materialized, even though it was part of + // the projection. It should thus do no IO. 
+ int cycle = (row_idx / kCycleInterval) % 3; + if (cycle == 0 || cycle == 2) { + ColumnBlock col(block.column_block(0)); + ASSERT_OK_FAST(iter->MaterializeColumn(0, &col)); + + // Verify + for (int i = 0; i < n; i++) { + uint32_t got = *reinterpret_cast(col.cell_ptr(i)); + uint32_t expected = (row_idx + i) * 2; + if (got != expected) { + FAIL() << "Failed at row index " << (row_idx + i) << ": expected " + << expected << " got " << got; + } + } + } + if (cycle == 1 || cycle == 2) { + ColumnBlock col(block.column_block(1)); + ASSERT_OK_FAST(iter->MaterializeColumn(1, &col)); + + // Verify + for (int i = 0; i < n; i++) { + uint32_t got = *reinterpret_cast(col.cell_ptr(i)); + if (got != 10 * (row_idx + i)) { + FAIL() << "Failed at row index " << (row_idx + i) << ": expected " + << 10 * (row_idx + i) << " got " << got; + } + } + } + + ASSERT_OK_FAST(iter->FinishBatch()); + row_idx += n; + } + + // Verify through the iterator statistics that IO was saved by not materializing + // all of the columns. + vector stats; + iter->GetIteratorStats(&stats); + ASSERT_EQ(3, stats.size()); + for (int i = 0; i < 3; i++) { + LOG(INFO) << "Col " << i << " stats: " << stats[i].ToString(); + } + + // Since we pushed down the block size, we expect to have read 100+ blocks of column 0 + ASSERT_GT(stats[0].data_blocks_read_from_disk, 100); + + // Since we didn't ever materialize column 2, we shouldn't have read any data blocks. + ASSERT_EQ(0, stats[2].data_blocks_read_from_disk); + + // Column 0 and 1 skipped a lot of blocks, so should not have read all of the cells + // from either column. 
+ ASSERT_LT(stats[0].cells_read_from_disk, kNumRows * 3 / 4); + ASSERT_LT(stats[1].cells_read_from_disk, kNumRows * 3 / 4); +} + +TEST_F(TestCFileSet, TestIteratePartialSchema) { + const int kNumRows = 100; + WriteTestRowSet(kNumRows); + + shared_ptr fileset(new CFileSet(rowset_meta_)); + ASSERT_OK(fileset->Open()); + + Schema new_schema; + ASSERT_OK(schema_.CreateProjectionByNames({ "c0", "c2" }, &new_schema)); + shared_ptr cfile_iter(fileset->NewIterator(&new_schema)); + gscoped_ptr iter(new MaterializingIterator(cfile_iter)); + + ASSERT_OK(iter->Init(nullptr)); + + // Read all the results. + vector results; + ASSERT_OK(IterateToStringList(iter.get(), &results)); + + VLOG(1) << "Results of iterating over sparse partial schema: "; + for (const string &str : results) { + VLOG(1) << str; + } + + // Ensure that we got the expected rows. + ASSERT_EQ(results.size(), kNumRows); + for (int i = 0; i < kNumRows; i++) { + ASSERT_EQ(StringPrintf("(uint32 c0=%d, uint32 c2=%d)", i * 2, i * 100), + results[i]); + } +} + +// Add a range predicate on the key column and ensure that only the relevant small number of rows +// are read off disk. +TEST_F(TestCFileSet, TestRangeScan) { + const int kNumRows = 10000; + WriteTestRowSet(kNumRows); + + shared_ptr fileset(new CFileSet(rowset_meta_)); + ASSERT_OK(fileset->Open()); + + // Create iterator. + shared_ptr cfile_iter(fileset->NewIterator(&schema_)); + gscoped_ptr iter(new MaterializingIterator(cfile_iter)); + Schema key_schema = schema_.CreateKeyProjection(); + Arena arena(1024, 256 * 1024); + RangePredicateEncoder encoder(&key_schema, &arena); + + // Create a scan with a range predicate on the key column. + ScanSpec spec; + uint32_t lower = 2000; + uint32_t upper = 2009; + ColumnRangePredicate pred1(schema_.column(0), &lower, &upper); + spec.AddPredicate(pred1); + encoder.EncodeRangePredicates(&spec, true); + ASSERT_OK(iter->Init(&spec)); + + // Check that the bounds got pushed as index bounds. 
+ // Since the key column is the rowidx * 2, we need to divide the integer bounds + // back down. + EXPECT_EQ(lower / 2, cfile_iter->lower_bound_idx_); + // + 1 because the upper bound is exclusive + EXPECT_EQ(upper / 2 + 1, cfile_iter->upper_bound_idx_); + + // Read all the results. + vector results; + ASSERT_OK(IterateToStringList(iter.get(), &results)); + + // Ensure that we got the expected rows. + for (const string &str : results) { + LOG(INFO) << str; + } + ASSERT_EQ(5, results.size()); + EXPECT_EQ("(uint32 c0=2000, uint32 c1=10000, uint32 c2=100000)", results[0]); + EXPECT_EQ("(uint32 c0=2008, uint32 c1=10040, uint32 c2=100400)", results[4]); + + // Ensure that we only read the relevant range from all of the columns. + // Since it's a small range, it should be all in one data block in each column. + vector stats; + iter->GetIteratorStats(&stats); + EXPECT_EQ(stats[0].data_blocks_read_from_disk, 1); + EXPECT_EQ(stats[1].data_blocks_read_from_disk, 1); + EXPECT_EQ(stats[2].data_blocks_read_from_disk, 1); +} + +// Several other black-box tests for range scans. These are similar to +// TestRangeScan above, except don't inspect internal state. 
+TEST_F(TestCFileSet, TestRangePredicates2) { + const int kNumRows = 10000; + WriteTestRowSet(kNumRows); + + shared_ptr fileset(new CFileSet(rowset_meta_)); + ASSERT_OK(fileset->Open()); + + // Range scan where rows match on both ends + DoTestRangeScan(fileset, 2000, 2010); + // Range scan which falls between rows on both ends + DoTestRangeScan(fileset, 2001, 2009); + // Range scan with open lower bound + DoTestRangeScan(fileset, kNoBound, 2009); + // Range scan with open upper bound + DoTestRangeScan(fileset, 2001, kNoBound); + // Range scan with upper bound coming at end of data + DoTestRangeScan(fileset, 2001, kNumRows * 2); + // Range scan with upper bound coming after end of data + DoTestRangeScan(fileset, 2001, kNumRows * 10); + // Range scan with lower bound coming at end of data + DoTestRangeScan(fileset, kNumRows * 2, kNoBound); + // Range scan with lower bound coming after end of data + DoTestRangeScan(fileset, kNumRows * 10, kNoBound); +} + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/cfile_set.cc b/src/kudu/tablet/cfile_set.cc new file mode 100644 index 000000000000..0f64e08c8703 --- /dev/null +++ b/src/kudu/tablet/cfile_set.cc @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/cfile/bloomfile.h" +#include "kudu/cfile/cfile_util.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/common/scan_spec.h" +#include "kudu/gutil/algorithm.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/util/flag_tags.h" + +DEFINE_bool(consult_bloom_filters, true, "Whether to consult bloom filters on row presence checks"); +TAG_FLAG(consult_bloom_filters, hidden); + +namespace kudu { +namespace tablet { + +using cfile::ReaderOptions; +using cfile::DefaultColumnValueIterator; +using fs::ReadableBlock; +using std::shared_ptr; +using strings::Substitute; + +//////////////////////////////////////////////////////////// +// Utilities +//////////////////////////////////////////////////////////// + +static Status OpenReader(const shared_ptr& rowset_metadata, + ColumnId col_id, + gscoped_ptr *new_reader) { + FsManager* fs = rowset_metadata->fs_manager(); + gscoped_ptr block; + BlockId block_id = rowset_metadata->column_data_block_for_col_id(col_id); + RETURN_NOT_OK(fs->OpenBlock(block_id, &block)); + + // TODO: somehow pass reader options in schema + ReaderOptions opts; + return CFileReader::OpenNoInit(block.Pass(), opts, new_reader); +} + +//////////////////////////////////////////////////////////// +// CFile Base +//////////////////////////////////////////////////////////// + +CFileSet::CFileSet(shared_ptr rowset_metadata) + : rowset_metadata_(std::move(rowset_metadata)) {} + +CFileSet::~CFileSet() { +} + + +Status CFileSet::Open() { + RETURN_NOT_OK(OpenBloomReader()); + + // Lazily open the column data cfiles. Each one will be fully opened + // later, when the first iterator seeks for the first time. 
+ RowSetMetadata::ColumnIdToBlockIdMap block_map = rowset_metadata_->GetColumnBlocksById(); + for (const RowSetMetadata::ColumnIdToBlockIdMap::value_type& e : block_map) { + ColumnId col_id = e.first; + DCHECK(!ContainsKey(readers_by_col_id_, col_id)) << "already open"; + + gscoped_ptr reader; + RETURN_NOT_OK(OpenReader(rowset_metadata_, col_id, &reader)); + readers_by_col_id_[col_id] = shared_ptr(reader.release()); + VLOG(1) << "Successfully opened cfile for column id " << col_id + << " in " << rowset_metadata_->ToString(); + } + + // However, the key reader should always be fully opened, so that we + // can figure out where in the rowset tree we belong. + if (rowset_metadata_->has_adhoc_index_block()) { + RETURN_NOT_OK(OpenAdHocIndexReader()); + } else { + RETURN_NOT_OK(key_index_reader()->Init()); + } + + // Determine the upper and lower key bounds for this CFileSet. + RETURN_NOT_OK(LoadMinMaxKeys()); + + return Status::OK(); +} + +Status CFileSet::OpenAdHocIndexReader() { + if (ad_hoc_idx_reader_ != nullptr) { + return Status::OK(); + } + + FsManager* fs = rowset_metadata_->fs_manager(); + gscoped_ptr block; + RETURN_NOT_OK(fs->OpenBlock(rowset_metadata_->adhoc_index_block(), &block)); + + ReaderOptions opts; + return CFileReader::Open(block.Pass(), opts, &ad_hoc_idx_reader_); +} + + +Status CFileSet::OpenBloomReader() { + if (bloom_reader_ != nullptr) { + return Status::OK(); + } + + FsManager* fs = rowset_metadata_->fs_manager(); + gscoped_ptr block; + RETURN_NOT_OK(fs->OpenBlock(rowset_metadata_->bloom_block(), &block)); + + ReaderOptions opts; + Status s = BloomFileReader::OpenNoInit(block.Pass(), opts, &bloom_reader_); + if (!s.ok()) { + LOG(WARNING) << "Unable to open bloom file in " << rowset_metadata_->ToString() << ": " + << s.ToString(); + // Continue without bloom. 
+ } + + return Status::OK(); +} + +Status CFileSet::LoadMinMaxKeys() { + CFileReader *key_reader = key_index_reader(); + if (!key_reader->GetMetadataEntry(DiskRowSet::kMinKeyMetaEntryName, &min_encoded_key_)) { + return Status::Corruption("No min key found", ToString()); + } + if (!key_reader->GetMetadataEntry(DiskRowSet::kMaxKeyMetaEntryName, &max_encoded_key_)) { + return Status::Corruption("No max key found", ToString()); + } + if (Slice(min_encoded_key_).compare(max_encoded_key_) > 0) { + return Status::Corruption(StringPrintf("Min key %s > max key %s", + Slice(min_encoded_key_).ToDebugString().c_str(), + Slice(max_encoded_key_).ToDebugString().c_str()), + ToString()); + } + + return Status::OK(); +} + +CFileReader* CFileSet::key_index_reader() const { + if (ad_hoc_idx_reader_) { + return ad_hoc_idx_reader_.get(); + } + // If there is no special index cfile, then we have a non-compound key + // and we can just use the key column. + // This is always the first column listed in the tablet schema. 
+ int key_col_id = tablet_schema().column_id(0); + return FindOrDie(readers_by_col_id_, key_col_id).get(); +} + +Status CFileSet::NewColumnIterator(ColumnId col_id, CFileReader::CacheControl cache_blocks, + CFileIterator **iter) const { + return FindOrDie(readers_by_col_id_, col_id)->NewIterator(iter, cache_blocks); +} + +CFileSet::Iterator *CFileSet::NewIterator(const Schema *projection) const { + return new CFileSet::Iterator(shared_from_this(), projection); +} + +Status CFileSet::CountRows(rowid_t *count) const { + return key_index_reader()->CountRows(count); +} + +Status CFileSet::GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const { + *min_encoded_key = Slice(min_encoded_key_); + *max_encoded_key = Slice(max_encoded_key_); + return Status::OK(); +} + +uint64_t CFileSet::EstimateOnDiskSize() const { + uint64_t ret = 0; + for (const ReaderMap::value_type& e : readers_by_col_id_) { + const shared_ptr &reader = e.second; + ret += reader->file_size(); + } + return ret; +} + +Status CFileSet::FindRow(const RowSetKeyProbe &probe, rowid_t *idx, + ProbeStats* stats) const { + if (bloom_reader_ != nullptr && FLAGS_consult_bloom_filters) { + // Fully open the BloomFileReader if it was lazily opened earlier. + // + // If it's already initialized, this is a no-op. 
+ RETURN_NOT_OK(bloom_reader_->Init()); + + stats->blooms_consulted++; + bool present; + Status s = bloom_reader_->CheckKeyPresent(probe.bloom_probe(), &present); + if (s.ok() && !present) { + return Status::NotFound("not present in bloom filter"); + } else if (!s.ok()) { + LOG(WARNING) << "Unable to query bloom: " << s.ToString() + << " (disabling bloom for this rowset from this point forward)"; + const_cast(this)->bloom_reader_.reset(nullptr); + // Continue with the slow path + } + } + + stats->keys_consulted++; + CFileIterator *key_iter = nullptr; + RETURN_NOT_OK(NewKeyIterator(&key_iter)); + + gscoped_ptr key_iter_scoped(key_iter); // free on return + + bool exact; + RETURN_NOT_OK(key_iter->SeekAtOrAfter(probe.encoded_key(), &exact)); + if (!exact) { + return Status::NotFound("not present in storefile (failed seek)"); + } + + *idx = key_iter->GetCurrentOrdinal(); + return Status::OK(); +} + +Status CFileSet::CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + rowid_t *rowid, ProbeStats* stats) const { + + Status s = FindRow(probe, rowid, stats); + if (s.IsNotFound()) { + // In the case that the key comes past the end of the file, Seek + // will return NotFound. In that case, it is OK from this function's + // point of view - just a non-present key. 
+ *present = false; + return Status::OK(); + } + *present = true; + return s; +} + +Status CFileSet::NewKeyIterator(CFileIterator **key_iter) const { + return key_index_reader()->NewIterator(key_iter, CFileReader::CACHE_BLOCK); +} + +//////////////////////////////////////////////////////////// +// Iterator +//////////////////////////////////////////////////////////// +CFileSet::Iterator::~Iterator() { + STLDeleteElements(&col_iters_); +} + +Status CFileSet::Iterator::CreateColumnIterators(const ScanSpec* spec) { + DCHECK_EQ(0, col_iters_.size()); + vector ret_iters; + ElementDeleter del(&ret_iters); + ret_iters.reserve(projection_->num_columns()); + + CFileReader::CacheControl cache_blocks = CFileReader::CACHE_BLOCK; + if (spec && !spec->cache_blocks()) { + cache_blocks = CFileReader::DONT_CACHE_BLOCK; + } + + for (int proj_col_idx = 0; + proj_col_idx < projection_->num_columns(); + proj_col_idx++) { + ColumnId col_id = projection_->column_id(proj_col_idx); + + if (!base_data_->has_data_for_column_id(col_id)) { + // If we have no data for a column, most likely it was added via an ALTER + // operation after this CFileSet was flushed. In that case, we're guaranteed + // that it is either NULLable, or has a "read-default". Otherwise, consider it a corruption. 
+ const ColumnSchema& col_schema = projection_->column(proj_col_idx); + if (PREDICT_FALSE(!col_schema.is_nullable() && !col_schema.has_read_default())) { + return Status::Corruption(Substitute("column $0 has no data in rowset $1", + col_schema.ToString(), base_data_->ToString())); + } + ret_iters.push_back(new DefaultColumnValueIterator(col_schema.type_info(), + col_schema.read_default_value())); + continue; + } + CFileIterator *iter; + RETURN_NOT_OK_PREPEND(base_data_->NewColumnIterator(col_id, cache_blocks, &iter), + Substitute("could not create iterator for column $0", + projection_->column(proj_col_idx).ToString())); + ret_iters.push_back(iter); + } + + col_iters_.swap(ret_iters); + return Status::OK(); +} + +Status CFileSet::Iterator::Init(ScanSpec *spec) { + CHECK(!initted_); + + // Setup Key Iterator + CFileIterator *tmp; + RETURN_NOT_OK(base_data_->NewKeyIterator(&tmp)); + key_iter_.reset(tmp); + + // Setup column iterators. + RETURN_NOT_OK(CreateColumnIterators(spec)); + + // If there is a range predicate on the key column, push that down into an + // ordinal range. + RETURN_NOT_OK(PushdownRangeScanPredicate(spec)); + + initted_ = true; + + // Don't actually seek -- we'll seek when we first actually read the + // data. + cur_idx_ = lower_bound_idx_; + Unprepare(); // Reset state. + return Status::OK(); +} + +Status CFileSet::Iterator::PushdownRangeScanPredicate(ScanSpec *spec) { + CHECK_GT(row_count_, 0); + + lower_bound_idx_ = 0; + upper_bound_idx_ = row_count_; + + if (spec == nullptr) { + // No predicate. + return Status::OK(); + } + + Schema key_schema_for_vlog; + if (VLOG_IS_ON(1)) { + key_schema_for_vlog = base_data_->tablet_schema().CreateKeyProjection(); + } + + if (spec->lower_bound_key() && + spec->lower_bound_key()->encoded_key().compare(base_data_->min_encoded_key_) > 0) { + bool exact; + Status s = key_iter_->SeekAtOrAfter(*spec->lower_bound_key(), &exact); + if (s.IsNotFound()) { + // The lower bound is after the end of the key range. 
+ // Thus, no rows will pass the predicate, so we set the lower bound + // to the end of the file. + lower_bound_idx_ = row_count_; + return Status::OK(); + } + RETURN_NOT_OK(s); + + lower_bound_idx_ = std::max(lower_bound_idx_, key_iter_->GetCurrentOrdinal()); + VLOG(1) << "Pushed lower bound value " + << spec->lower_bound_key()->Stringify(key_schema_for_vlog) + << " as row_idx >= " << lower_bound_idx_; + } + if (spec->exclusive_upper_bound_key() && + spec->exclusive_upper_bound_key()->encoded_key().compare( + base_data_->max_encoded_key_) <= 0) { + bool exact; + Status s = key_iter_->SeekAtOrAfter(*spec->exclusive_upper_bound_key(), &exact); + if (PREDICT_FALSE(s.IsNotFound())) { + LOG(DFATAL) << "CFileSet indicated upper bound was within range, but " + << "key iterator could not seek. " + << "CFileSet upper_bound = " + << Slice(base_data_->max_encoded_key_).ToDebugString() + << ", enc_key = " + << spec->exclusive_upper_bound_key()->encoded_key().ToDebugString(); + } else { + RETURN_NOT_OK(s); + + rowid_t cur = key_iter_->GetCurrentOrdinal(); + upper_bound_idx_ = std::min(upper_bound_idx_, cur); + + VLOG(1) << "Pushed upper bound value " + << spec->exclusive_upper_bound_key()->Stringify(key_schema_for_vlog) + << " as row_idx < " << upper_bound_idx_; + } + } + return Status::OK(); +} + +void CFileSet::Iterator::Unprepare() { + prepared_count_ = 0; + cols_prepared_.assign(col_iters_.size(), false); +} + +Status CFileSet::Iterator::PrepareBatch(size_t *n) { + DCHECK_EQ(prepared_count_, 0) << "Already prepared"; + + size_t remaining = upper_bound_idx_ - cur_idx_; + if (*n > remaining) { + *n = remaining; + } + + prepared_count_ = *n; + + // Lazily prepare the first column when it is materialized. + return Status::OK(); +} + + +Status CFileSet::Iterator::PrepareColumn(size_t idx) { + if (cols_prepared_[idx]) { + // Already prepared in this batch. 
+ return Status::OK(); + } + + ColumnIterator* col_iter = col_iters_[idx]; + size_t n = prepared_count_; + + if (!col_iter->seeked() || col_iter->GetCurrentOrdinal() != cur_idx_) { + // Either this column has not yet been accessed, or it was accessed + // but then skipped in a prior block (e.g because predicates on other + // columns completely eliminated the block). + // + // Either way, we need to seek it to the correct offset. + RETURN_NOT_OK(col_iter->SeekToOrdinal(cur_idx_)); + } + + Status s = col_iter->PrepareBatch(&n); + if (!s.ok()) { + LOG(WARNING) << "Unable to prepare column " << idx << ": " << s.ToString(); + return s; + } + + if (n != prepared_count_) { + return Status::Corruption( + StringPrintf("Column %zd (%s) didn't yield enough rows at offset %zd: expected " + "%zd but only got %zd", idx, projection_->column(idx).ToString().c_str(), + cur_idx_, prepared_count_, n)); + } + + cols_prepared_[idx] = true; + + return Status::OK(); +} + +Status CFileSet::Iterator::InitializeSelectionVector(SelectionVector *sel_vec) { + sel_vec->SetAllTrue(); + return Status::OK(); +} + +Status CFileSet::Iterator::MaterializeColumn(size_t col_idx, ColumnBlock *dst) { + CHECK_EQ(prepared_count_, dst->nrows()); + DCHECK_LT(col_idx, col_iters_.size()); + + RETURN_NOT_OK(PrepareColumn(col_idx)); + ColumnIterator* iter = col_iters_[col_idx]; + return iter->Scan(dst); +} + +Status CFileSet::Iterator::FinishBatch() { + CHECK_GT(prepared_count_, 0); + + for (size_t i = 0; i < col_iters_.size(); i++) { + if (cols_prepared_[i]) { + Status s = col_iters_[i]->FinishBatch(); + if (!s.ok()) { + LOG(WARNING) << "Unable to FinishBatch() on column " << i; + return s; + } + } + } + + cur_idx_ += prepared_count_; + Unprepare(); + + return Status::OK(); +} + + +void CFileSet::Iterator::GetIteratorStats(vector* stats) const { + stats->clear(); + stats->reserve(col_iters_.size()); + for (const ColumnIterator* iter : col_iters_) { + ANNOTATE_IGNORE_READS_BEGIN(); + 
stats->push_back(iter->io_statistics()); + ANNOTATE_IGNORE_READS_END(); + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/cfile_set.h b/src/kudu/tablet/cfile_set.h new file mode 100644 index 000000000000..5e0090528bfb --- /dev/null +++ b/src/kudu/tablet/cfile_set.h @@ -0,0 +1,233 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_LAYER_BASEDATA_H +#define KUDU_TABLET_LAYER_BASEDATA_H + +#include +#include +#include +#include +#include + +#include "kudu/cfile/bloomfile.h" +#include "kudu/cfile/cfile_reader.h" + +#include "kudu/common/iterator.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/tablet/memrowset.h" +#include "kudu/tablet/rowset_metadata.h" +#include "kudu/util/env.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/slice.h" + +namespace kudu { + +namespace metadata { +class RowSetMetadata; +} + +namespace tablet { + +using kudu::cfile::BloomFileReader; +using kudu::cfile::CFileIterator; +using kudu::cfile::CFileReader; +using kudu::cfile::ColumnIterator; + +// Set of CFiles which make up the base data for a single rowset +// +// All of these files have the same number of rows, and thus the positional +// indexes can be used to seek to corresponding entries in each. +class CFileSet : public std::enable_shared_from_this { + public: + class Iterator; + + explicit CFileSet(std::shared_ptr rowset_metadata); + + Status Open(); + + // Create an iterator with the given projection. 'projection' must remain valid + // for the lifetime of the returned iterator. + virtual Iterator *NewIterator(const Schema *projection) const; + + Status CountRows(rowid_t *count) const; + + // See RowSet::GetBounds + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const; + + uint64_t EstimateOnDiskSize() const; + + // Determine the index of the given row key. + Status FindRow(const RowSetKeyProbe &probe, rowid_t *idx, ProbeStats* stats) const; + + string ToString() const { + return string("CFile base data in ") + rowset_metadata_->ToString(); + } + + // Check if the given row is present. If it is, sets *rowid to the + // row's index. 
+ Status CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + rowid_t *rowid, ProbeStats* stats) const; + + // Return true if there exists a CFile for the given column ID. + bool has_data_for_column_id(ColumnId col_id) const { + return ContainsKey(readers_by_col_id_, col_id); + } + + virtual ~CFileSet(); + + private: + friend class Iterator; + friend class CFileSetIteratorProjector; + + DISALLOW_COPY_AND_ASSIGN(CFileSet); + + Status OpenBloomReader(); + Status OpenAdHocIndexReader(); + Status LoadMinMaxKeys(); + + Status NewColumnIterator(ColumnId col_id, CFileReader::CacheControl cache_blocks, + CFileIterator **iter) const; + Status NewKeyIterator(CFileIterator **iter) const; + + // Return the CFileReader responsible for reading the key index. + // (the ad-hoc reader for composite keys, otherwise the key column reader) + CFileReader* key_index_reader() const; + + const Schema &tablet_schema() const { return rowset_metadata_->tablet_schema(); } + + std::shared_ptr rowset_metadata_; + + std::string min_encoded_key_; + std::string max_encoded_key_; + + // Map of column ID to reader. These are lazily initialized as needed. + typedef std::unordered_map > ReaderMap; + ReaderMap readers_by_col_id_; + + // A file reader for an ad-hoc index, i.e. an index that sits in its own file + // and is not embedded with the column's data blocks. This is used when the + // index pertains to more than one column, as in the case of composite keys. + gscoped_ptr ad_hoc_idx_reader_; + gscoped_ptr bloom_reader_; +}; + + +//////////////////////////////////////////////////////////// + +// Column-wise iterator implementation over a set of column files. +// +// This simply ties together underlying files so that they can be batched +// together, and iterated in parallel. 
+class CFileSet::Iterator : public ColumnwiseIterator { + public: + + virtual Status Init(ScanSpec *spec) OVERRIDE; + + virtual Status PrepareBatch(size_t *nrows) OVERRIDE; + + virtual Status InitializeSelectionVector(SelectionVector *sel_vec) OVERRIDE; + + virtual Status MaterializeColumn(size_t col_idx, ColumnBlock *dst) OVERRIDE; + + virtual Status FinishBatch() OVERRIDE; + + virtual bool HasNext() const OVERRIDE { + DCHECK(initted_); + return cur_idx_ < upper_bound_idx_; + } + + virtual string ToString() const OVERRIDE { + return string("rowset iterator for ") + base_data_->ToString(); + } + + const Schema &schema() const OVERRIDE { + return *projection_; + } + + // Return the ordinal index of the next row to be returned from + // the iterator. + rowid_t cur_ordinal_idx() const { + return cur_idx_; + } + + // Collect the IO statistics for each of the underlying columns. + virtual void GetIteratorStats(vector *stats) const OVERRIDE; + + virtual ~Iterator(); + private: + DISALLOW_COPY_AND_ASSIGN(Iterator); + FRIEND_TEST(TestCFileSet, TestRangeScan); + friend class CFileSet; + + // 'projection' must remain valid for the lifetime of this object. + Iterator(std::shared_ptr base_data, const Schema *projection) + : base_data_(std::move(base_data)), + projection_(projection), + initted_(false), + cur_idx_(0), + prepared_count_(0) { + CHECK_OK(base_data_->CountRows(&row_count_)); + } + + // Fill in col_iters_ for each of the requested columns. + Status CreateColumnIterators(const ScanSpec* spec); + + // Look for a predicate which can be converted into a range scan using the key + // column's index. If such a predicate exists, remove it from the scan spec and + // store it in member fields. + Status PushdownRangeScanPredicate(ScanSpec *spec); + + void Unprepare(); + + // Prepare the given column if not already prepared. 
+ Status PrepareColumn(size_t col_idx); + + const std::shared_ptr base_data_; + const Schema* projection_; + + // Iterator for the key column in the underlying data. + gscoped_ptr key_iter_; + std::vector col_iters_; + + bool initted_; + + size_t cur_idx_; + size_t prepared_count_; + + // The total number of rows in the file + rowid_t row_count_; + + // Lower bound (inclusive) and upper bound (exclusive) for this iterator, in terms of + // ordinal row indexes. + // Both of these bounds are always set (even if there is no predicate). + // If there is no predicate, then the bounds will be [0, row_count_] + rowid_t lower_bound_idx_; + rowid_t upper_bound_idx_; + + + // The underlying columns are prepared lazily, so that if a column is never + // materialized, it doesn't need to be read off disk. + vector cols_prepared_; + +}; + +} // namespace tablet +} // namespace kudu +#endif diff --git a/src/kudu/tablet/compaction-policy.txt b/src/kudu/tablet/compaction-policy.txt new file mode 100644 index 000000000000..ee9a28300dc5 --- /dev/null +++ b/src/kudu/tablet/compaction-policy.txt @@ -0,0 +1,397 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This document explains the policy of performing a compaction. +For details explaining how compactions are implemented, see compaction.txt. + +Compaction Policy +============================================================ + +The compaction policy is responsible for selecting a set of rowsets to compact +together. 
Compactions are necessary in order to reduce the number of DiskRowSets +which must be consulted for various operations, thus improving the overall +performance of the tablet. + +Coming up with a good compaction policy is a balancing act between several goals: + +1) Re-arrange the physical layout to be more efficient for subsequent operations. + +2) Do so without using too many resources in the compaction itself. + +3) Do so "smoothly" - spread work out over time so that operation performance is + predictable and reasonably constant. + + +The following sections provide some analysis of the above goals: + +Benefit of compaction for subsequent operations +============================================================ + +In order to determine a good compaction policy, we want to define a cost measure +for a given set of RowSets within a tablet. Consider the following set of +RowSets: + + + 1 2 3 4 5 +|--A--||-B--||--C--||---D----| +|--------------E-------------| + |-F--| + +In this diagram, the key space spans from left to right, and each RowSet is drawn +as an interval based on its first and last contained key. We'll define a few terms +for later use in this document: + + "Width" + ------- + Let the Width of a RowSet be proportional to the percentage of key + space that it spans. For example, rowset E has a width of 1, since + it spans the whole tablet. Rowset B has width 0.2, since it spans + about 20% of the tablet. + + Note that the Width is also the probability that any read in a + uniform random read workload will have to consult that RowSet. + + "Height" + -------- + The "Height" of a tablet at a given key is the number of rowsets + whose key ranges contain that key. For example, the height of the + above tablet at key 1 is 2, since rowsets A and E span that key. + The height at key 4 is 3, since D, E, and F span that key. + + The Height at any key is the number of RowSets that will be have to + be consulted for a random read of that key. 
+ +Let us consider the cost of various operations on the tablet: + +Insert +------- +In order to Insert, each of the rowsets must be checked for a duplicate key. By +storing the rowset ranges in an interval tree, we can efficiently determine the +set of rowsets whose intervals may contain the key to be inserted, and thus the +cost is linear in that number of rowsets: + + Let n = the Height of the tablet at the given key + Let B = the bloom filter false positive rate + Let C_bf = cost of bloom filter check + Let C_pk = cost of a primary key lookup + Cost = n*C_bf + n*B*C_pk + Cost = n(C_bf + B*C_pk) + +Typically, B is approximately 1% or lower, so the bloom filter checks dominate this +equation. However, in some cases where the primary key column is very large, every +primary key check will incur a disk seek, meaning that C_pk is orders of magnitude +higher than C_bf (which we expect to be in RAM or SSD). So, we cannot fully ignore +the term resulting from the bloom filter misses. + +Random read +------------ +The costs for random read are similar to the cost for inserts: given the known key, +each potentially overlapping rowset must be queried. + + +Short Scan +----------- +Scans cannot make use of bloom filters, so the cost is similar to the above, except +that all overlapping rowsets must be seeked by PK: + +Cost = n*C_pk + +We assume a "short" scan is one in which the sequential IO cost after finding the start +key is small compared to the seek cost. (eg assuming a 10ms seek time, 1MB or less of +sequential IO). + + +Long scan (e.g full table scan): +--------------------------------- +A long scan is likely to retrieve data from many rowsets. In this case, the size +of the rowsets comes into play. + +Let S = the number of MB in the scan +Let B = the disk bandwidth (MB/sec) +Let n = the number of rowsets accessed, as before + +Assume that accessing each rowset costs 1 seek (same as C_pk). 
+ +Cost = n*C_pk + S/B + + +To summarize the above, all of the costs of operations are heavily dependent on the +number of rowsets which must be accessed. Therefore, to minimize cost, we should +follow the following strategies: + +1) In the case of point queries (inserts and random read/short scan), merge + rowsets which overlap in keyspace, thus reducing the average height of the + Tablet. + +2) In the case of longer scans, merge together rowsets to improve the ratio of + sequential IO to seeks. + +We can assume that, so long as the rowsets are reasonably large, goal #2 above has +diminishing returns after rowsets achieve ~10MB or so of sequential IO for every +seek (1 seek ~= 10ms, 10MB IO ~= 100ms). However, goal #1 has linear returns, so we +focus on goal #1. + + +Cost of doing a compaction +============================================================ +According to the above analysis, the optimal configuration for a tablet is a +single giant rowset which spans the entirety of the key space. This is +intuitively true: a fully-compacted tablet is going to perform the best because +every access will require at most one bloom filter check and one seek. + +However, it is obviously not optimal to simply compact all RowSets together in every +compaction. This would be inefficient, since every compaction would rewrite the +entire rowset, causing huge write amplification and wasted IO for only a small +amount of efficiency gain. + +So, we need to consider not just how efficient the resulting tablet would be, but also +how expensive it is to perform the candidate compaction. Only by weighing those two +against each other can we decide on the best compaction to perform at any given point +in time. + +For the purposes of this analysis, we consider the cost of a compaction to simply be +the sum of the IO performed by the compaction. We'll assume that deletions are rare, +in which case the output data size of a compaction is approximately equal to the +input data size. 
We also assume that the compaction inputs are large enough that +sequential IO outweighs any seeks required. + +Thus the cost of performing a compaction is O(input size). + + +Incremental work +============================================================ +The third goal for compaction is to be able to perform work incrementally. Doing +frequent incremental compactions rather than occasional large ones results in a +more consistent performance profile for end-user applications. Incremental work +also allows the system to react more quickly to changes in workload: for example, +if one area of the keyspace becomes hot, we would like to be able to quickly +react and compact that area of the keyspace within a short time window. + +One way to achieve this goal is to put a bound on the amount of data that any +given compaction will read and write. Bounding this data on the range of several +hundred MB means that a compaction can occur in 10 seconds or less, allowing +quick reaction time to shifts in workload. + + +Proposed strategy: +============================================================ + +Limiting RowSet Sizes +------------------------------ +The first key piece of the proposed compaction strategy is to limit the maximum size of +any RowSet to a relatively small footprint - e.g 64MB or even less. This can be done +by modifying the DiskRowSet writer code to "roll over" to a new rowset after the size +threshold has been reached. Thus, even if flushing a larger dataset from memory, the +on-disk rowset sizes can be limited. + + +Flushes with limited RowSet size +--------------------------------- +For example, imagine that the max rowset size is set to 64MB, and 150MB of data has +accumulated in the MemRowSet before a flush. The resulting output of the flush, then +looks like: + + A B C +|------||------||--| + 64MB 64MB 22MB + +Note that even though the maximum DiskRowSet size is 64MB, the third flushed rowset +will be smaller. 
In the future, we could esimate the on-disk data size and try to make +the three RowSets approximately equal-sized, but it is not necessary for correctness. + +Compactions with limited RowSet size +------------------------------------- +Now imagine another scenario, where a Tablet flushes several times, each resulting in +small files which span the entirety of the key space -- commonly seen in a uniform +random insert load. After 3 flushes, the Tablet looks like: + + + A (50MB) +|-------------------| + B (50MB) +|-------------------| + C (50MB) +|-------------------| + + +Because the three rowset ranges overlap, every access to the tablet must query each of the +rowsets (i.e the average rowset "depth" is 3). If the compaction policy selects these +three RowSets for compaction, the compaction result will look like: + + D E F +|------||------||--| + 64MB 64MB 22MB + + +Essentially, the compaction reorganizes the data from overlapping rowsets into non-overlapping +rowsets of a similar size. This reduces the average depth from 3 to 1, improving the +Tablet performance. + + +Dealing with large numbers of RowSets +-------------------------------------- +With these limited sizes, a modestly sized Tablet (eg 20GB) will have on the order of hundreds +of RowSets. In order to efficiently determine the set of RowSets which may contain a given +query key or range, we have to change the Tablet code to store the RowSets in an interval +tree instead of a simple list. The Interval Tree is a data structure which provides efficient +query for the set of intervals overlapping a given query point or query interval. + + +Intuition behind compaction selection policy +--------------------------------------------- +As a simplification, assume for now that all RowSets are exactly the same size (rather +than bounded under a maximum). Then, we can classify a RowSet as "good" or "bad" based on +one simple factor: the smaller the range of key space that it spans, the better. 
+Assuming a uniform insert workload, every flushed RowSet will span the entirety of the +Tablet's key space -- and hence must be queried by every subsequent operation. Once there +are multiple such flushed RowSets (A, B, and C in the diagram), compacting them results in +skinnier rowsets D, E, and F. + +Intuitively, then, a good compaction policy finds rowsets which are wide and overlapping, and +compacts them together, resulting in rowsets which are skinny and non-overlapping. + +Taking the cost factors developed above, we can look at compaction selection as an optimization +problem: reduce the cost of the Tablet configuration as much as possible under a given IO budget. + +Per the analysis above, the cost of a single read or insert is linear in the "height" of the +RowSets at the key being accessed. So, the average cost of operations can be calculated by +integrating the tablet height across the key space, or equivalently adding up the widths +of all of the RowSets. For example: + + |---A----| (width 10) + |-----B-------| (width 15) +|-C-||-----D-------| (width 5, width 15) +|--------E---------| (width 20) + +So, the summed width = 20+5+15+15+10 = 65. + +Imagine that we choose to compact rowsets A, B, and D above, resulting in the following +output: + +|-C-||-F-||-G-||-H-| (width 5, width 5, width 5, width 5) +|--------E---------| (width 20) + +Note that the total number of bytes have not changed: we've just reorganized the bytes +into a more compact form, reducing the average height of the tablet. + +Now the summed cost is 40. So, the compaction had benefit 25, using a budget of 3 units of IO +(remember that rowsets are assumed to be constant size for this analysis). + +Another choice for the compaction might have been to compact B, D, and E, resulting in: + |---A----| (width 10) +|-C-| (width 5) +|---F--||--G--||-H-| (width 8, width 7, width 5) + +This compaction reduced the tablet cost from 65 to 35 -- so its benefit was 30, using the same +IO budget of 3. 
+ +Given that the second compaction choice reduced the tablet height more using the same budget, +it is a more optimal solution. + +Mathematical analysis +----------------------- +The reduction of cost due to a compaction is simple to calculate: + +Cost change = sum(original rowset widths) - sum(output rowset widths) + +We know that the output rowsets will not overlap at all, and that their total width will +span the union of the input rowset ranges. Therefore: + +Cost change = sum(original rowset widths) - (union width of original rowsets) + +Note that, for this analysis, the key ranges are treated as integers. This can be extended +to string keys in a straightforward manner by treating the string data as unsigned integers. + +Algorithm +---------- + +Given budget N rowsets: + +For each pair of rowsets (A, B): + Evaluate BestForPair(A, B): + +BestForPair(A, B): + Let union width = max(A.max_key, B.max_key) - min(A.min_key, B.min_key) + Determine the subset R of rowsets that are fully contained within the range A, B + Evaluate PickRowsetsWithBudget(R, N): + Set objective = sum(rowset width) - union width + If objective > best objective: + best solution = this set + +PickRowsetsWithBudget(R, N): + Choose the N rowsets in R which which maximize sum(rowset width) + + +PickRowsetsWithBudget can be solved by simply sorting the rowsets by their width and +choosing the top N. + + +Extending algorithm to non-constant sizes +------------------------------------------ + +Even though we limit the maximum rowset size to a constant, some rowsets may be smaller +due to more frequent flushes, etc. Thus, we would like to change the budget to be a number +of MB of IO, rather than a simple count N of input files. The subproblem PickNRowSets then becomes: + + Choose a set of RowSets such that their total file size falls within a budget, and + maximizes their total widths. 
+ +This is an instance of the 0-1 knapsack problem, so we replace PickRowsetsWithBudget(R, N) +with a knapsack problem solver. + +Computational complexity +---------------------------- + +The algorithm contains O(n^2) calls to BestForPair, each of which contains one instance of the +0-1 knapsack problem, which has complexity O(n * max_budget). Thus, the total complexity is cubic +in the number of rowsets, which can become quite expensive when a given tablet may include on the +order of a thousand rowsets. + +We can optimize the approach by changing the order in which we consider pairs (A, B) in the +above-described algorithm: + +For each rowset A: + candidates = all rowsets B such that B.min_key >= A.min_key + sort candidates B by increasing B.max + For each pair (A, B): + Evaluate BestForPair(A, B) + +Considering the pairs in this order simplifies BestForPair as follows: + +BestForPair(A, B): + Let union width = max(A.max_key, b.max_key) - min(A.min_key, B.min_key) + Determine the subset R of rowsets that are fully contained within the range A, B + ** Because B.max_key is non_decreasing, this subset R is identical to R in the + previous call, except that B is now added to the end. No extra loop + is required. + Evaluate PickRowsetsWithBudget(R, N): + ** This instantiation of the knapsack problem now is identical to the previous + instantiation, except with one additional item. Thus, it can be computed + incrementally from the previous solution. + Set objective = sum(rowset width) - union width + If objective > best objective: + best solution = this set + + +Additionally, upper bounds can be calculated by solving the simpler fractional knapsack +problem and used to short-circuit the more complex calculations. + + +Extending algorithm to non-uniform workloads +-------------------------------------------- + +The above analysis is done in terms of constant workloads. However, in practice, workloads +may be skewed. 
Given that, it is more important to compact the areas of the key space which +are seeing frequent access. The algorithms can be extended in a straightforward way by changing +all references to the "width" of a rowset to instead be CDF(max key) - CDF(min key) where CDF +is the cumulative distribution function for accesses over a lagging time window. diff --git a/src/kudu/tablet/compaction-test.cc b/src/kudu/tablet/compaction-test.cc new file mode 100644 index 000000000000..338386602989 --- /dev/null +++ b/src/kudu/tablet/compaction-test.cc @@ -0,0 +1,818 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/fs/log_block_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/compaction.h" +#include "kudu/tablet/local_tablet_writer.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +DEFINE_string(merge_benchmark_input_dir, "", + "Directory to benchmark merge. The benchmark will merge " + "all rowsets from this directory, pointed by the super-block " + "with id 00000 or 1111 and tablet id 'KuduCompactionBenchTablet', " + "if this is specified. Otherwise, inputs will " + "be generated as part of the test itself."); +DEFINE_int32(merge_benchmark_num_rowsets, 3, + "Number of rowsets as input to the merge"); +DEFINE_int32(merge_benchmark_num_rows_per_rowset, 500000, + "Number of rowsets as input to the merge"); + +DECLARE_string(block_manager); +DECLARE_bool(enable_data_block_fsync); + +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +using consensus::OpId; +using log::LogAnchorRegistry; +using strings::Substitute; + +static const char *kRowKeyFormat = "hello %08d"; +static const size_t kLargeRollThreshold = 1024 * 1024 * 1024; // 1GB +static const size_t kSmallRollThreshold = 1024; // 1KB + +class TestCompaction : public KuduRowSetTest { + public: + TestCompaction() + : KuduRowSetTest(CreateSchema()), + op_id_(consensus::MaximumOpId()), + row_builder_(schema_), + mvcc_(scoped_refptr( + server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp))), + log_anchor_registry_(new log::LogAnchorRegistry()) { + } + + static Schema CreateSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddKeyColumn("key", STRING)); + CHECK_OK(builder.AddColumn("val", INT32)); + 
CHECK_OK(builder.AddNullableColumn("nullable_val", INT32)); + return builder.BuildWithoutIds(); + } + + // Insert n_rows rows of data. + // Each row is the tuple: (string key=hello , val=) + void InsertRows(MemRowSet *mrs, int n_rows, int delta) { + for (int32_t i = 0; i < n_rows; i++) { + InsertRow(mrs, i * 10 + delta, i); + } + } + + // Inserts a row. + // The 'nullable_val' column is set to either NULL (when val is odd) + // or 'val' (when val is even). + void InsertRow(MemRowSet *mrs, int row_key, int32_t val) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + row_builder_.Reset(); + snprintf(key_buf_, sizeof(key_buf_), kRowKeyFormat, row_key); + row_builder_.AddString(Slice(key_buf_)); + row_builder_.AddInt32(val); + if (val % 2 == 0) { + row_builder_.AddInt32(val); + } else { + row_builder_.AddNull(); + } + if (!mrs->schema().Equals(row_builder_.schema())) { + // The MemRowSet is not projecting the row, so must be done by the caller + RowProjector projector(&row_builder_.schema(), &mrs->schema()); + uint8_t rowbuf[ContiguousRowHelper::row_size(mrs->schema())]; + ContiguousRow dst_row(&mrs->schema(), rowbuf); + ASSERT_OK_FAST(projector.Init()); + ASSERT_OK_FAST(projector.ProjectRowForWrite(row_builder_.row(), + &dst_row, static_cast(nullptr))); + ASSERT_OK_FAST(mrs->Insert(tx.timestamp(), ConstContiguousRow(dst_row), op_id_)); + } else { + ASSERT_OK_FAST(mrs->Insert(tx.timestamp(), row_builder_.row(), op_id_)); + } + tx.Commit(); + } + + // Update n_rows rows of data. + // Each row has the key (string key=hello ) and its 'val' column + // is set to new_val. + // If 'val' is even, 'nullable_val' is set to NULL. Otherwise, set to 'val'. + // Note that this is the opposite of InsertRow() above, so that the updates + // flop NULL to non-NULL and vice versa. 
+ void UpdateRows(RowSet *rowset, int n_rows, int delta, int32_t new_val) { + char keybuf[256]; + faststring update_buf; + ColumnId col_id = schema_.column_id(schema_.find_column("val")); + ColumnId nullable_col_id = schema_.column_id(schema_.find_column("nullable_val")); + for (uint32_t i = 0; i < n_rows; i++) { + SCOPED_TRACE(i); + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + snprintf(keybuf, sizeof(keybuf), kRowKeyFormat, i * 10 + delta); + + update_buf.clear(); + RowChangeListEncoder update(&update_buf); + update.AddColumnUpdate(schema_.column_by_id(col_id), col_id, &new_val); + if (new_val % 2 == 0) { + update.AddColumnUpdate(schema_.column_by_id(nullable_col_id), + nullable_col_id, nullptr); + } else { + update.AddColumnUpdate(schema_.column_by_id(nullable_col_id), + nullable_col_id, &new_val); + } + + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(Slice(keybuf)); + RowSetKeyProbe probe(rb.row()); + ProbeStats stats; + OperationResultPB result; + ASSERT_OK(rowset->MutateRow(tx.timestamp(), + probe, + RowChangeList(update_buf), + op_id_, + &stats, + &result)); + tx.Commit(); + } + } + + void DeleteRows(RowSet *rowset, int n_rows, int delta) { + char keybuf[256]; + faststring update_buf; + for (uint32_t i = 0; i < n_rows; i++) { + SCOPED_TRACE(i); + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + snprintf(keybuf, sizeof(keybuf), kRowKeyFormat, i * 10 + delta); + + update_buf.clear(); + RowChangeListEncoder update(&update_buf); + update.SetToDelete(); + + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(Slice(keybuf)); + RowSetKeyProbe probe(rb.row()); + ProbeStats stats; + OperationResultPB result; + ASSERT_OK(rowset->MutateRow(tx.timestamp(), + probe, + RowChangeList(update_buf), + op_id_, + &stats, + &result)); + tx.Commit(); + } + } + + // Iterate over the given compaction input, stringifying and dumping each + // yielded row to *out + void IterateInput(CompactionInput *input, vector *out) { + 
ASSERT_OK(DebugDumpCompactionInput(input, out)); + } + + // Flush the given CompactionInput 'input' to disk with the given snapshot. + // If 'result_rowsets' is not NULL, reopens the resulting rowset(s) and appends + // them to the vector. + void DoFlushAndReopen( + CompactionInput *input, const Schema& projection, const MvccSnapshot &snap, + int64_t roll_threshold, vector >* result_rowsets) { + // Flush with a large roll threshold so we only write a single file. + // This simplifies the test so we always need to reopen only a single rowset. + RollingDiskRowSetWriter rsw(tablet()->metadata(), projection, + BloomFilterSizing::BySizeAndFPRate(32*1024, 0.01f), + roll_threshold); + ASSERT_OK(rsw.Open()); + ASSERT_OK(FlushCompactionInput(input, snap, &rsw)); + ASSERT_OK(rsw.Finish()); + + vector > metas; + rsw.GetWrittenRowSetMetadata(&metas); + ASSERT_GE(metas.size(), 1); + for (const shared_ptr& meta : metas) { + ASSERT_TRUE(meta->HasBloomDataBlockForTests()); + } + if (result_rowsets) { + // Re-open the outputs + for (const shared_ptr& meta : metas) { + shared_ptr rs; + ASSERT_OK(DiskRowSet::Open(meta, log_anchor_registry_.get(), &rs)); + result_rowsets->push_back(rs); + } + } + } + + Status BuildCompactionInput(const MvccSnapshot& merge_snap, + const vector >& rowsets, + const Schema& projection, + gscoped_ptr* out) { + vector > merge_inputs; + for (const shared_ptr &rs : rowsets) { + gscoped_ptr input; + RETURN_NOT_OK(CompactionInput::Create(*rs, &projection, merge_snap, &input)); + merge_inputs.push_back(shared_ptr(input.release())); + } + out->reset(CompactionInput::Merge(merge_inputs, &projection)); + return Status::OK(); + } + + // Compacts a set of DRSs. + // If 'result_rowsets' is not NULL, reopens the resulting rowset(s) and appends + // them to the vector. 
+ Status CompactAndReopen(const vector >& rowsets, + const Schema& projection, int64_t roll_threshold, + vector >* result_rowsets) { + MvccSnapshot merge_snap(mvcc_); + gscoped_ptr compact_input; + RETURN_NOT_OK(BuildCompactionInput(merge_snap, rowsets, projection, &compact_input)); + DoFlushAndReopen(compact_input.get(), projection, merge_snap, roll_threshold, + result_rowsets); + return Status::OK(); + } + + // Same as above, but sets a high roll threshold so it only produces a single output. + void CompactAndReopenNoRoll(const vector >& input_rowsets, + const Schema& projection, + shared_ptr* result_rs) { + vector > result_rowsets; + CompactAndReopen(input_rowsets, projection, kLargeRollThreshold, &result_rowsets); + ASSERT_EQ(1, result_rowsets.size()); + *result_rs = result_rowsets[0]; + } + + // Flush an MRS to disk. + // If 'result_rowsets' is not NULL, reopens the resulting rowset(s) and appends + // them to the vector. + void FlushMRSAndReopen(const MemRowSet& mrs, const Schema& projection, + int64_t roll_threshold, + vector >* result_rowsets) { + MvccSnapshot snap(mvcc_); + vector > rowset_metas; + gscoped_ptr input(CompactionInput::Create(mrs, &projection, snap)); + DoFlushAndReopen(input.get(), projection, snap, roll_threshold, result_rowsets); + } + + // Same as above, but sets a high roll threshold so it only produces a single output. + void FlushMRSAndReopenNoRoll(const MemRowSet& mrs, const Schema& projection, + shared_ptr* result_rs) { + vector > rowsets; + FlushMRSAndReopen(mrs, projection, kLargeRollThreshold, &rowsets); + ASSERT_EQ(1, rowsets.size()); + *result_rs = rowsets[0]; + } + + // Test compaction where each of the input rowsets has + // each of the input schemas. The output rowset will + // have the 'projection' schema. 
+ void DoMerge(const Schema& projection, const vector& schemas) { + vector > rowsets; + + // Create one input rowset for each of the input schemas + int delta = 0; + for (const Schema& schema : schemas) { + // Create a memrowset with a bunch of rows and updates. + shared_ptr mrs(new MemRowSet(delta, schema, log_anchor_registry_.get())); + InsertRows(mrs.get(), 1000, delta); + UpdateRows(mrs.get(), 1000, delta, 1); + + // Flush it to disk and re-open it. + shared_ptr rs; + FlushMRSAndReopenNoRoll(*mrs, schema, &rs); + ASSERT_NO_FATAL_FAILURE(); + rowsets.push_back(rs); + + // Perform some updates into DMS + UpdateRows(rs.get(), 1000, delta, 2); + delta++; + } + + // Merge them. + shared_ptr result_rs; + ASSERT_NO_FATAL_FAILURE(CompactAndReopenNoRoll(rowsets, projection, &result_rs)); + + // Verify the resulting compaction output has the right number + // of rows. + rowid_t count = 0; + ASSERT_OK(result_rs->CountRows(&count)); + ASSERT_EQ(1000 * schemas.size(), count); + } + + template + void DoBenchmark() { + vector > rowsets; + + if (FLAGS_merge_benchmark_input_dir.empty()) { + // Create inputs. + for (int i = 0; i < FLAGS_merge_benchmark_num_rowsets; i++) { + // Create a memrowset with a bunch of rows and updates. + shared_ptr mrs(new MemRowSet(i, schema_, log_anchor_registry_.get())); + + for (int n = 0; n < FLAGS_merge_benchmark_num_rows_per_rowset; n++) { + + int row_key; + if (OVERLAP_INPUTS) { + // input 0: 0 3 6 9 ... + // input 1: 1 4 7 10 ... + // input 2: 2 5 8 11 ... + row_key = n * FLAGS_merge_benchmark_num_rowsets + i; + } else { + // input 0: 0 1 2 3 + // input 1: 1000 1001 1002 1003 + // ... 
+ row_key = i * FLAGS_merge_benchmark_num_rows_per_rowset + n; + } + InsertRow(mrs.get(), row_key, n); + } + shared_ptr rs; + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs); + ASSERT_NO_FATAL_FAILURE(); + rowsets.push_back(rs); + } + } else { + string tablet_id = "KuduCompactionBenchTablet"; + FsManager fs_manager(env_.get(), FLAGS_merge_benchmark_input_dir); + scoped_refptr input_meta; + ASSERT_OK(TabletMetadata::Load(&fs_manager, tablet_id, &input_meta)); + + for (const shared_ptr& meta : input_meta->rowsets()) { + shared_ptr rs; + CHECK_OK(DiskRowSet::Open(meta, log_anchor_registry_.get(), &rs)); + rowsets.push_back(rs); + } + + CHECK(!rowsets.empty()) << "No rowsets found in " << FLAGS_merge_benchmark_input_dir; + } + LOG(INFO) << "Beginning compaction"; + LOG_TIMING(INFO, "compacting " + + std::string((OVERLAP_INPUTS ? "with overlap" : "without overlap"))) { + MvccSnapshot merge_snap(mvcc_); + gscoped_ptr compact_input; + ASSERT_OK(BuildCompactionInput(merge_snap, rowsets, schema_, &compact_input)); + // Use a low target row size to increase the number of resulting rowsets. 
+ RollingDiskRowSetWriter rdrsw(tablet()->metadata(), schema_, + BloomFilterSizing::BySizeAndFPRate(32 * 1024, 0.01f), + 1024 * 1024); // 1 MB + ASSERT_OK(rdrsw.Open()); + ASSERT_OK(FlushCompactionInput(compact_input.get(), merge_snap, &rdrsw)); + ASSERT_OK(rdrsw.Finish()); + } + } + + Status GetDataDiskSpace(uint64_t* bytes_used) { + *bytes_used = 0; + return env_->Walk(fs_manager()->GetDataRootDirs().at(0), + Env::PRE_ORDER, Bind(&TestCompaction::GetDataDiskSpaceCb, + Unretained(this), bytes_used)); + } + + protected: + OpId op_id_; + + RowBuilder row_builder_; + char key_buf_[256]; + MvccManager mvcc_; + + scoped_refptr log_anchor_registry_; + + private: + + Status GetDataDiskSpaceCb(uint64_t* bytes_used, + Env::FileType type, + const string& dirname, const string& basename) { + uint64_t file_bytes_used = 0; + switch (type) { + case Env::FILE_TYPE: + RETURN_NOT_OK(env_->GetFileSizeOnDisk( + JoinPathSegments(dirname, basename), &file_bytes_used)); + *bytes_used += file_bytes_used; + break; + case Env::DIRECTORY_TYPE: + // Ignore directory space consumption; it varies from filesystem to + // filesystem and isn't interesting for this test. + break; + default: + LOG(FATAL) << "Unknown file type: " << type; + } + return Status::OK(); + } +}; + +TEST_F(TestCompaction, TestMemRowSetInput) { + // Create a memrowset with 10 rows and several updates. + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 10, 0); + UpdateRows(mrs.get(), 10, 0, 1); + UpdateRows(mrs.get(), 10, 0, 2); + + // Ensure that the compaction input yields the expected rows + // and mutations. 
+ vector out; + MvccSnapshot snap(mvcc_); + gscoped_ptr input(CompactionInput::Create(*mrs, &schema_, snap)); + IterateInput(input.get(), &out); + ASSERT_EQ(10, out.size()); + ASSERT_EQ("(string key=hello 00000000, int32 val=0, int32 nullable_val=0) " + "Undos: [@1(DELETE)] " + "Redos: [@11(SET val=1, nullable_val=1), @21(SET val=2, nullable_val=NULL)]", + out[0]); + ASSERT_EQ("(string key=hello 00000090, int32 val=9, int32 nullable_val=NULL) " + "Undos: [@10(DELETE)] " + "Redos: [@20(SET val=1, nullable_val=1), @30(SET val=2, nullable_val=NULL)]", + out[9]); +} + +TEST_F(TestCompaction, TestFlushMRSWithRolling) { + // Create a memrowset with enough rows so that, when we flush with a small + // roll threshold, we'll end up creating multiple DiskRowSets. + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 30000, 0); + + vector > rowsets; + FlushMRSAndReopen(*mrs, schema_, kSmallRollThreshold, &rowsets); + ASSERT_GT(rowsets.size(), 1); + + vector rows; + rows.reserve(30000 / 2); + rowsets[0]->DebugDump(&rows); + EXPECT_EQ("(string key=hello 00000000, int32 val=0, int32 nullable_val=0) " + "Undos: [@1(DELETE)] Redos: []", + rows[0]); + + rows.clear(); + rowsets[1]->DebugDump(&rows); + EXPECT_EQ("(string key=hello 00154700, int32 val=15470, int32 nullable_val=15470) " + "Undos: [@15471(DELETE)] Redos: []", + rows[0]); + EXPECT_EQ("(string key=hello 00154710, int32 val=15471, int32 nullable_val=NULL) " + "Undos: [@15472(DELETE)] Redos: []", + rows[1]); +} + +TEST_F(TestCompaction, TestRowSetInput) { + // Create a memrowset with a bunch of rows, flush and reopen. + shared_ptr rs; + { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 10, 0); + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs); + ASSERT_NO_FATAL_FAILURE(); + } + + // Update the rows in the rowset. + UpdateRows(rs.get(), 10, 0, 1); + UpdateRows(rs.get(), 10, 0, 2); + // Flush DMS, update some more. 
+ ASSERT_OK(rs->FlushDeltas()); + UpdateRows(rs.get(), 10, 0, 3); + UpdateRows(rs.get(), 10, 0, 4); + + // Check compaction input + vector out; + gscoped_ptr input; + ASSERT_OK(CompactionInput::Create(*rs, &schema_, MvccSnapshot(mvcc_), &input)); + IterateInput(input.get(), &out); + ASSERT_EQ(10, out.size()); + EXPECT_EQ("(string key=hello 00000000, int32 val=0, int32 nullable_val=0) " + "Undos: [@1(DELETE)] " + "Redos: [" + "@11(SET val=1, nullable_val=1), " + "@21(SET val=2, nullable_val=NULL), " + "@31(SET val=3, nullable_val=3), " + "@41(SET val=4, nullable_val=NULL)]", + out[0]); + EXPECT_EQ("(string key=hello 00000090, int32 val=9, int32 nullable_val=NULL) " + "Undos: [@10(DELETE)] " + "Redos: [" + "@20(SET val=1, nullable_val=1), " + "@30(SET val=2, nullable_val=NULL), " + "@40(SET val=3, nullable_val=3), " + "@50(SET val=4, nullable_val=NULL)]", + out[9]); +} + +// Tests that the same rows, duplicated in three DRSs, ghost in two of them +// appears only once on the compaction output +TEST_F(TestCompaction, TestDuplicatedGhostRowsDontSurviveCompaction) { + shared_ptr rs1; + { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 10, 0); + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs1); + ASSERT_NO_FATAL_FAILURE(); + } + // Now delete the rows, this will make the rs report them as deleted and + // so we would reinsert them into the MRS. 
+ DeleteRows(rs1.get(), 10, 0); + + shared_ptr rs2; + { + shared_ptr mrs(new MemRowSet(1, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 10, 0); + UpdateRows(mrs.get(), 10, 0, 1); + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs2); + ASSERT_NO_FATAL_FAILURE(); + } + DeleteRows(rs2.get(), 10, 0); + + shared_ptr rs3; + { + shared_ptr mrs(new MemRowSet(1, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 10, 0); + UpdateRows(mrs.get(), 10, 0, 2); + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs3); + ASSERT_NO_FATAL_FAILURE(); + } + + shared_ptr result; + vector > all_rss; + all_rss.push_back(rs3); + all_rss.push_back(rs1); + all_rss.push_back(rs2); + + SeedRandom(); + // Shuffle the row sets to make sure we test different orderings + std::random_shuffle(all_rss.begin(), all_rss.end()); + + // Now compact all the drs and make sure we don't get duplicated keys on the output + CompactAndReopenNoRoll(all_rss, schema_, &result); + + gscoped_ptr input; + ASSERT_OK(CompactionInput::Create(*result, + &schema_, + MvccSnapshot::CreateSnapshotIncludingAllTransactions(), + &input)); + vector out; + IterateInput(input.get(), &out); + ASSERT_EQ(out.size(), 10); + EXPECT_EQ("(string key=hello 00000000, int32 val=2, int32 nullable_val=NULL) " + "Undos: [@61(SET val=0, nullable_val=0), @51(DELETE)] " + "Redos: []", out[0]); + EXPECT_EQ("(string key=hello 00000090, int32 val=2, int32 nullable_val=NULL) " + "Undos: [@70(SET val=9, nullable_val=NULL), @60(DELETE)] " + "Redos: []", out[9]); +} + +// Test case which doesn't do any merging -- just compacts +// a single input rowset (which may be the memrowset) into a single +// output rowset (on disk). +TEST_F(TestCompaction, TestOneToOne) { + // Create a memrowset with a bunch of rows and updates. + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 1000, 0); + UpdateRows(mrs.get(), 1000, 0, 1); + MvccSnapshot snap(mvcc_); + + // Flush it to disk and re-open. 
+ shared_ptr rs; + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs); + ASSERT_NO_FATAL_FAILURE(); + + // Update the rows with some updates that weren't in the snapshot. + UpdateRows(mrs.get(), 1000, 0, 2); + + // Catch the updates that came in after the snapshot flush was made. + MvccSnapshot snap2(mvcc_); + gscoped_ptr input(CompactionInput::Create(*mrs, &schema_, snap2)); + + // Add some more updates which come into the new rowset while the "reupdate" is happening. + UpdateRows(rs.get(), 1000, 0, 3); + + string dummy_name = ""; + + ASSERT_OK(ReupdateMissedDeltas(dummy_name, input.get(), snap, snap2, { rs })); + + // If we look at the contents of the DiskRowSet now, we should see the "re-updated" data. + vector out; + ASSERT_OK(CompactionInput::Create(*rs, &schema_, MvccSnapshot(mvcc_), &input)); + IterateInput(input.get(), &out); + ASSERT_EQ(1000, out.size()); + EXPECT_EQ("(string key=hello 00000000, int32 val=1, int32 nullable_val=1) " + "Undos: [@1001(SET val=0, nullable_val=0), @1(DELETE)] " + "Redos: [@2001(SET val=2, nullable_val=NULL), " + "@3001(SET val=3, nullable_val=3)]", out[0]); + + // And compact (1 input to 1 output) + MvccSnapshot snap3(mvcc_); + gscoped_ptr compact_input; + ASSERT_OK(CompactionInput::Create(*rs, &schema_, snap3, &compact_input)); + DoFlushAndReopen(compact_input.get(), schema_, snap3, kLargeRollThreshold, nullptr); +} + +// Test merging two row sets and the second one has updates, KUDU-102 +// We re-create the conditions by providing two DRS that are both the input and the +// output of a compaction, and trying to merge two MRS. 
+TEST_F(TestCompaction, TestKUDU102) { + // Create 2 row sets, flush them + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs.get(), 10, 0); + shared_ptr rs; + FlushMRSAndReopenNoRoll(*mrs, schema_, &rs); + ASSERT_NO_FATAL_FAILURE(); + + shared_ptr mrs_b(new MemRowSet(1, schema_, log_anchor_registry_.get())); + InsertRows(mrs_b.get(), 10, 100); + MvccSnapshot snap(mvcc_); + shared_ptr rs_b; + FlushMRSAndReopenNoRoll(*mrs_b, schema_, &rs_b); + ASSERT_NO_FATAL_FAILURE(); + + // Update all the rows in the second row set + UpdateRows(mrs_b.get(), 10, 100, 2); + + // Catch the updates that came in after the snapshot flush was made. + // Note that we are merging two MRS, it's a hack + MvccSnapshot snap2(mvcc_); + vector > merge_inputs; + merge_inputs.push_back( + shared_ptr(CompactionInput::Create(*mrs, &schema_, snap2))); + merge_inputs.push_back( + shared_ptr(CompactionInput::Create(*mrs_b, &schema_, snap2))); + gscoped_ptr input(CompactionInput::Merge(merge_inputs, &schema_)); + + string dummy_name = ""; + + // This would fail without KUDU-102 + ASSERT_OK(ReupdateMissedDeltas(dummy_name, input.get(), snap, snap2, { rs, rs_b })); +} + + +// Test compacting when all of the inputs and the output have the same schema +TEST_F(TestCompaction, TestMerge) { + vector schemas; + schemas.push_back(schema_); + schemas.push_back(schema_); + schemas.push_back(schema_); + DoMerge(schemas.back(), schemas); +} + +// test compacting when the inputs have different base schemas +TEST_F(TestCompaction, TestMergeMultipleSchemas) { + vector schemas; + SchemaBuilder builder(schema_); + schemas.push_back(schema_); + + // Add an int column with default + int32_t default_c2 = 10; + CHECK_OK(builder.AddColumn("c2", INT32, false, &default_c2, &default_c2)); + schemas.push_back(builder.Build()); + + // add a string column with default + Slice default_c3("Hello World"); + CHECK_OK(builder.AddColumn("c3", STRING, false, &default_c3, &default_c3)); + 
schemas.push_back(builder.Build()); + + DoMerge(schemas.back(), schemas); +} + +// Test MergeCompactionInput against MemRowSets. This behavior isn't currently +// used (we never compact in-memory), but this is a regression test for a bug +// encountered during development where the first row of each MRS got dropped. +TEST_F(TestCompaction, TestMergeMRS) { + shared_ptr mrs_a(new MemRowSet(0, schema_, log_anchor_registry_.get())); + InsertRows(mrs_a.get(), 10, 0); + + shared_ptr mrs_b(new MemRowSet(1, schema_, log_anchor_registry_.get())); + InsertRows(mrs_b.get(), 10, 1); + + MvccSnapshot snap(mvcc_); + vector > merge_inputs; + merge_inputs.push_back( + shared_ptr(CompactionInput::Create(*mrs_a, &schema_, snap))); + merge_inputs.push_back( + shared_ptr(CompactionInput::Create(*mrs_b, &schema_, snap))); + gscoped_ptr input(CompactionInput::Merge(merge_inputs, &schema_)); + + vector out; + IterateInput(input.get(), &out); + ASSERT_EQ(out.size(), 20); + EXPECT_EQ("(string key=hello 00000000, int32 val=0, int32 nullable_val=0) " + "Undos: [@1(DELETE)] Redos: []", out[0]); + EXPECT_EQ("(string key=hello 00000091, int32 val=9, int32 nullable_val=NULL) " + "Undos: [@20(DELETE)] Redos: []", out[19]); +} + +#ifdef NDEBUG +// Benchmark for the compaction merge input for the case where the inputs +// contain non-overlapping data. In this case the merge can be optimized +// to be block-wise. 
+TEST_F(TestCompaction, BenchmarkMergeWithoutOverlap) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipped: must enable slow tests."; + return; + } + ASSERT_NO_FATAL_FAILURE(DoBenchmark()); +} + +// Benchmark for the compaction merge input when the inputs are entirely +// overlapping (i.e the inputs become fully interleaved in the output) +TEST_F(TestCompaction, BenchmarkMergeWithOverlap) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipped: must enable slow tests."; + return; + } + ASSERT_NO_FATAL_FAILURE(DoBenchmark()); +} +#endif + +TEST_F(TestCompaction, TestCompactionFreesDiskSpace) { + // On RHEL 6.4 with an ext4 filesystem mounted as ext3, it was observed + // that freshly created files report st_blocks=0 via stat(2) for several + // seconds. This appears to be some buggy interaction with ext4 delalloc. + // + // Enabling data block fsync appears to work around the problem. We do + // that here and not for all tests because: + // 1. fsync is expensive, and + // 2. This is the only test that cares about disk space usage and can't + // explicitly fsync() after writing new files. + + FLAGS_enable_data_block_fsync = true; + + { + // We must force the LocalTabletWriter out of scope before measuring + // disk space usage. Otherwise some deleted blocks are kept open for + // reading and aren't properly deallocated by the block manager. + LocalTabletWriter writer(tablet().get(), &client_schema()); + KuduPartialRow row(&client_schema()); + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 10; j++) { + int val = (i * 10) + j; + ASSERT_OK(row.SetStringCopy("key", Substitute("hello $0", val))); + ASSERT_OK(row.SetInt32("val", val)); + ASSERT_OK(writer.Insert(row)); + } + ASSERT_OK(tablet()->Flush()); + } + } + + uint64_t bytes_before; + ASSERT_NO_FATAL_FAILURE(GetDataDiskSpace(&bytes_before)); + + ASSERT_OK(tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + + // Block deletion may happen asynchronously, so let's loop for a bit until + // the space becomes free. 
+ MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(30)); + while (true) { + uint64_t bytes_after; + ASSERT_NO_FATAL_FAILURE(GetDataDiskSpace(&bytes_after)); + LOG(INFO) << Substitute("Data disk space: $0 (before), $1 (after) ", + bytes_before, bytes_after); + if (bytes_after < bytes_before) { + break; + } else if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) { + FAIL() << "Timed out waiting for compaction to reduce data block disk " + << "space usage"; + } + SleepFor(MonoDelta::FromMilliseconds(200)); + } +} + +// Regression test for KUDU-1237, a bug in which empty flushes or compactions +// would result in orphaning near-empty cfile blocks on the disk. +TEST_F(TestCompaction, TestEmptyFlushDoesntLeakBlocks) { + if (FLAGS_block_manager != "log") { + LOG(WARNING) << "Test requires the log block manager"; + return; + } + + // Fetch the metric for the number of on-disk blocks, so we can later verify + // that we actually remove data. + fs::LogBlockManager* lbm = down_cast( + harness_->fs_manager()->block_manager()); + + int64_t before_count = lbm->CountBlocksForTests(); + ASSERT_OK(tablet()->Flush()); + int64_t after_count = lbm->CountBlocksForTests(); + + ASSERT_EQ(after_count, before_count); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/compaction.cc b/src/kudu/tablet/compaction.cc new file mode 100644 index 000000000000..3a99ef76bc05 --- /dev/null +++ b/src/kudu/tablet/compaction.cc @@ -0,0 +1,1021 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/compaction.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/tablet/delta_store.h" +#include "kudu/tablet/delta_tracker.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/util/debug/trace_event.h" + +using std::shared_ptr; +using std::unordered_set; +using strings::Substitute; + +namespace kudu { +namespace tablet { + +namespace { + +// CompactionInput yielding rows and mutations from a MemRowSet. 
+class MemRowSetCompactionInput : public CompactionInput { + public: + MemRowSetCompactionInput(const MemRowSet& memrowset, + const MvccSnapshot& snap, + const Schema* projection) + : iter_(memrowset.NewIterator(projection, snap)), + arena_(32*1024, 128*1024), + has_more_blocks_(false) { + } + + virtual Status Init() OVERRIDE { + RETURN_NOT_OK(iter_->Init(NULL)); + has_more_blocks_ = iter_->HasNext(); + return Status::OK(); + } + + virtual bool HasMoreBlocks() OVERRIDE { + return has_more_blocks_; + } + + virtual Status PrepareBlock(vector *block) OVERRIDE { + int num_in_block = iter_->remaining_in_leaf(); + block->resize(num_in_block); + + // Realloc the internal block storage if we don't have enough space to + // copy the whole leaf node's worth of data into it. + if (PREDICT_FALSE(!row_block_ || num_in_block > row_block_->nrows())) { + row_block_.reset(new RowBlock(iter_->schema(), num_in_block, nullptr)); + } + + arena_.Reset(); + RowChangeListEncoder undo_encoder(&buffer_); + for (int i = 0; i < num_in_block; i++) { + // TODO: A copy is performed to make all CompactionInputRow have the same schema + CompactionInputRow &input_row = block->at(i); + input_row.row.Reset(row_block_.get(), i); + Timestamp insertion_timestamp; + RETURN_NOT_OK(iter_->GetCurrentRow(&input_row.row, + reinterpret_cast(NULL), + &input_row.redo_head, + &arena_, + &insertion_timestamp)); + + // Materialize MRSRow undo insert (delete) + undo_encoder.SetToDelete(); + input_row.undo_head = Mutation::CreateInArena(&arena_, + insertion_timestamp, + undo_encoder.as_changelist()); + undo_encoder.Reset(); + iter_->Next(); + } + + has_more_blocks_ = iter_->HasNext(); + return Status::OK(); + } + + Arena* PreparedBlockArena() OVERRIDE { return &arena_; } + + virtual Status FinishBlock() OVERRIDE { + return Status::OK(); + } + + virtual const Schema &schema() const OVERRIDE { + return iter_->schema(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(MemRowSetCompactionInput); + gscoped_ptr row_block_; + + 
gscoped_ptr iter_; + + // Arena used to store the projected undo/redo mutations of the current block. + Arena arena_; + + faststring buffer_; + + bool has_more_blocks_; +}; + +//////////////////////////////////////////////////////////// + +// CompactionInput yielding rows and mutations from an on-disk DiskRowSet. +class DiskRowSetCompactionInput : public CompactionInput { + public: + DiskRowSetCompactionInput(gscoped_ptr base_iter, + shared_ptr redo_delta_iter, + shared_ptr undo_delta_iter) + : base_iter_(base_iter.Pass()), + redo_delta_iter_(std::move(redo_delta_iter)), + undo_delta_iter_(std::move(undo_delta_iter)), + arena_(32 * 1024, 128 * 1024), + block_(base_iter_->schema(), kRowsPerBlock, &arena_), + redo_mutation_block_(kRowsPerBlock, reinterpret_cast(NULL)), + undo_mutation_block_(kRowsPerBlock, reinterpret_cast(NULL)), + first_rowid_in_block_(0) {} + + virtual Status Init() OVERRIDE { + ScanSpec spec; + spec.set_cache_blocks(false); + RETURN_NOT_OK(base_iter_->Init(&spec)); + RETURN_NOT_OK(redo_delta_iter_->Init(&spec)); + RETURN_NOT_OK(redo_delta_iter_->SeekToOrdinal(0)); + RETURN_NOT_OK(undo_delta_iter_->Init(&spec)); + RETURN_NOT_OK(undo_delta_iter_->SeekToOrdinal(0)); + return Status::OK(); + } + + virtual bool HasMoreBlocks() OVERRIDE { + return base_iter_->HasNext(); + } + + virtual Status PrepareBlock(vector *block) OVERRIDE { + RETURN_NOT_OK(base_iter_->NextBlock(&block_)); + std::fill(redo_mutation_block_.begin(), redo_mutation_block_.end(), + reinterpret_cast(NULL)); + std::fill(undo_mutation_block_.begin(), undo_mutation_block_.end(), + reinterpret_cast(NULL)); + RETURN_NOT_OK(redo_delta_iter_->PrepareBatch( + block_.nrows(), DeltaIterator::PREPARE_FOR_COLLECT)); + RETURN_NOT_OK(redo_delta_iter_->CollectMutations(&redo_mutation_block_, block_.arena())); + RETURN_NOT_OK(undo_delta_iter_->PrepareBatch( + block_.nrows(), DeltaIterator::PREPARE_FOR_COLLECT)); + RETURN_NOT_OK(undo_delta_iter_->CollectMutations(&undo_mutation_block_, 
block_.arena())); + + block->resize(block_.nrows()); + for (int i = 0; i < block_.nrows(); i++) { + CompactionInputRow &input_row = block->at(i); + input_row.row.Reset(&block_, i); + input_row.redo_head = redo_mutation_block_[i]; + input_row.undo_head = undo_mutation_block_[i]; + } + + first_rowid_in_block_ += block_.nrows(); + return Status::OK(); + } + + virtual Arena* PreparedBlockArena() OVERRIDE { return &arena_; } + + virtual Status FinishBlock() OVERRIDE { + return Status::OK(); + } + + virtual const Schema &schema() const OVERRIDE { + return base_iter_->schema(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(DiskRowSetCompactionInput); + gscoped_ptr base_iter_; + shared_ptr redo_delta_iter_; + shared_ptr undo_delta_iter_; + + Arena arena_; + + // The current block of data which has come from the input iterator + RowBlock block_; + vector redo_mutation_block_; + vector undo_mutation_block_; + + rowid_t first_rowid_in_block_; + + enum { + kRowsPerBlock = 100 + }; +}; + +class MergeCompactionInput : public CompactionInput { + private: + // State kept for each of the inputs. + struct MergeState { + MergeState() : + pending_idx(0) + {} + + ~MergeState() { + STLDeleteElements(&dominated); + } + + bool empty() const { + return pending_idx >= pending.size(); + } + + const CompactionInputRow &next() const { + return pending[pending_idx]; + } + + void pop_front() { + pending_idx++; + } + + void Reset() { + pending.clear(); + pending_idx = 0; + } + + // Return true if the current block of this input fully dominates + // the current block of the other input -- i.e that the last + // row of this block is less than the first row of the other block. + // In this case, we can remove the other input from the merge until + // this input's current block has been exhausted. 
+ bool Dominates(const MergeState &other, const Schema &schema) const { + DCHECK(!empty()); + DCHECK(!other.empty()); + + return schema.Compare(pending.back().row, other.next().row) < 0; + } + + shared_ptr input; + vector pending; + int pending_idx; + + vector dominated; + }; + + public: + MergeCompactionInput(const vector > &inputs, + const Schema* schema) + : schema_(schema) { + for (const shared_ptr &input : inputs) { + gscoped_ptr state(new MergeState); + state->input = input; + states_.push_back(state.release()); + } + } + + virtual ~MergeCompactionInput() { + STLDeleteElements(&states_); + } + + virtual Status Init() OVERRIDE { + for (MergeState *state : states_) { + RETURN_NOT_OK(state->input->Init()); + } + + // Pull the first block of rows from each input. + RETURN_NOT_OK(ProcessEmptyInputs()); + return Status::OK(); + } + + virtual bool HasMoreBlocks() OVERRIDE { + // Return true if any of the input blocks has more rows pending + // or more blocks which have yet to be pulled. + for (MergeState *state : states_) { + if (!state->empty() || + state->input->HasMoreBlocks()) { + return true; + } + } + + return false; + } + + virtual Status PrepareBlock(vector *block) OVERRIDE { + CHECK(!states_.empty()); + + block->clear(); + + while (true) { + int smallest_idx = -1; + CompactionInputRow smallest; + + // Iterate over the inputs to find the one with the smallest next row. + // It may seem like an O(n lg k) merge using a heap would be more efficient, + // but some benchmarks indicated that the simpler code path of the O(n k) merge + // actually ends up a bit faster. + for (int i = 0; i < states_.size(); i++) { + MergeState *state = states_[i]; + + if (state->empty()) { + prepared_block_arena_ = state->input->PreparedBlockArena(); + // If any of our inputs runs out of pending entries, then we can't keep + // merging -- this input may have further blocks to process. + // Rather than pulling another block here, stop the loop. 
If it's truly + // out of blocks, then FinishBlock() will remove this input entirely. + return Status::OK(); + } + + if (smallest_idx < 0) { + smallest_idx = i; + smallest = state->next(); + continue; + } + int row_comp = schema_->Compare(state->next().row, smallest.row); + if (row_comp < 0) { + smallest_idx = i; + smallest = state->next(); + continue; + } + // If we found two duplicated rows, we want the row with the highest + // live version. If they're equal, that can only be because they're both + // dead, in which case it doesn't matter. + // TODO: this is going to change with historical REINSERT handling. + if (PREDICT_FALSE(row_comp == 0)) { + int mutation_comp = CompareLatestLiveVersion(state->next(), smallest); + if (mutation_comp > 0) { + // If the previous smallest row has a highest version that is lower + // than this one, discard it. + states_[smallest_idx]->pop_front(); + smallest_idx = i; + smallest = state->next(); + continue; + } else { + // .. otherwise pop the other one. + // + // NOTE: If they're equal, then currently that means that both versions are + // ghosts. Once we handle REINSERTS, we'll have to figure out which one "comes + // first" and deal with this properly. For now, we can just pick arbitrarily. + states_[i]->pop_front(); + continue; + } + } + } + DCHECK_GE(smallest_idx, 0); + + states_[smallest_idx]->pop_front(); + block->push_back(smallest); + } + + return Status::OK(); + } + + virtual Arena* PreparedBlockArena() OVERRIDE { return prepared_block_arena_; } + + virtual Status FinishBlock() OVERRIDE { + return ProcessEmptyInputs(); + } + + virtual const Schema &schema() const OVERRIDE { + return *schema_; + } + + private: + DISALLOW_COPY_AND_ASSIGN(MergeCompactionInput); + + // Look through our current set of inputs. For any that are empty, + // pull the next block into its pending list. If there is no next + // block, remove it from our input set. + // + // Postcondition: every input has a non-empty pending list. 
+ Status ProcessEmptyInputs() { + int j = 0; + for (int i = 0; i < states_.size(); i++) { + MergeState *state = states_[i]; + states_[j++] = state; + + if (!state->empty()) { + continue; + } + + RETURN_NOT_OK(state->input->FinishBlock()); + + // If an input is fully exhausted, no need to consider it + // in the merge anymore. + if (!state->input->HasMoreBlocks()) { + + // Any inputs that were dominated by the last block of this input + // need to be re-added into the merge. + states_.insert(states_.end(), state->dominated.begin(), state->dominated.end()); + state->dominated.clear(); + delete state; + j--; + continue; + } + + state->Reset(); + RETURN_NOT_OK(state->input->PrepareBlock(&state->pending)); + + // Now that this input has moved to its next block, it's possible that + // it no longer dominates the inputs in it 'dominated' list. Re-check + // all of those dominance relations and remove any that are no longer + // valid. + for (auto it = state->dominated.begin(); it != state->dominated.end(); ++it) { + MergeState *dominated = *it; + if (!state->Dominates(*dominated, *schema_)) { + states_.push_back(dominated); + it = state->dominated.erase(it); + --it; + } + } + } + // We may have removed exhausted states as we iterated through the + // array, so resize them away. + states_.resize(j); + + // Check pairs of states to see if any have dominance relations. + // This algorithm is probably not the most efficient, but it's the + // most obvious, and this doesn't ever show up in the profiler as + // much of a hot spot. + check_dominance: + for (int i = 0; i < states_.size(); i++) { + for (int j = i + 1; j < states_.size(); j++) { + if (TryInsertIntoDominanceList(states_[i], states_[j])) { + states_.erase(states_.begin() + j); + // Since we modified the vector, re-start iteration from the + // top. 
+ goto check_dominance; + } else if (TryInsertIntoDominanceList(states_[j], states_[i])) { + states_.erase(states_.begin() + i); + // Since we modified the vector, re-start iteration from the + // top. + goto check_dominance; + } + } + } + + return Status::OK(); + } + + bool TryInsertIntoDominanceList(MergeState *dominator, MergeState *candidate) { + if (dominator->Dominates(*candidate, *schema_)) { + dominator->dominated.push_back(candidate); + return true; + } else { + return false; + } + } + + // Compare the mutations of two duplicated rows. + // Returns -1 if latest_version(left) < latest_version(right) + static int CompareLatestLiveVersion(const CompactionInputRow& left, + const CompactionInputRow& right) { + if (left.redo_head == nullptr) { + // left must still be alive + DCHECK(right.redo_head != nullptr); + return 1; + } + if (right.redo_head == nullptr) { + DCHECK(left.redo_head != nullptr); + return -1; + } + + // Duplicated rows have disjoint histories, we don't need to get the latest + // mutation, the first one should be enough for the sake of determining the most recent + // row, but in debug mode do get the latest to make sure one of the rows is a ghost. + const Mutation* left_latest = left.redo_head; + const Mutation* right_latest = right.redo_head; + int ret = left_latest->timestamp().CompareTo(right_latest->timestamp()); +#ifndef NDEBUG + AdvanceToLastInList(&left_latest); + AdvanceToLastInList(&right_latest); + int debug_ret = left_latest->timestamp().CompareTo(right_latest->timestamp()); + if (debug_ret != 0) { + // If in fact both rows were deleted at the same time, this is OK -- we could + // have a case like TestRandomAccess.TestFuzz3, in which a single batch + // DELETED from the DRS, INSERTed into MRS, and DELETED from MRS. In that case, + // the timestamp of the last REDO will be the same and we can pick whichever + // we like. 
+ CHECK_EQ(ret, debug_ret); + } +#endif + return ret; + } + + static void AdvanceToLastInList(const Mutation** m) { + while ((*m)->next() != nullptr) { + *m = (*m)->next(); + } + } + + const Schema* schema_; + vector states_; + Arena* prepared_block_arena_; +}; + +} // anonymous namespace + +//////////////////////////////////////////////////////////// + +Status CompactionInput::Create(const DiskRowSet &rowset, + const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) { + CHECK(projection->has_column_ids()); + + // Assertion which checks for an earlier bug where the compaction snapshot + // chosen was too early. This resulted in UNDO files being mistakenly + // identified as REDO files and corruption ensued. If the assertion fails, + // the process crashes; only unrelated I/O-related errors are returned. + RETURN_NOT_OK_PREPEND(rowset.delta_tracker_->CheckSnapshotComesAfterAllUndos(snap), + "Could not open UNDOs"); + + shared_ptr base_cwise(rowset.base_data_->NewIterator(projection)); + gscoped_ptr base_iter(new MaterializingIterator(base_cwise)); + // Creates a DeltaIteratorMerger that will only include part of the redo deltas, + // since 'snap' will be after the snapshot of the last flush/compaction, + // i.e. past all undo deltas's max transaction ID. + shared_ptr redo_deltas; + RETURN_NOT_OK_PREPEND(rowset.delta_tracker_->NewDeltaIterator(projection, snap, &redo_deltas), + "Could not open REDOs"); + // Creates a DeltaIteratorMerger that will only include undo deltas, since + // MvccSnapshot::CreateSnapshotIncludingNoTransactions() excludes all redo + // deltas's min transaction ID. 
+ shared_ptr undo_deltas; + RETURN_NOT_OK_PREPEND(rowset.delta_tracker_->NewDeltaIterator(projection, + MvccSnapshot::CreateSnapshotIncludingNoTransactions(), + &undo_deltas), "Could not open UNDOs"); + + out->reset(new DiskRowSetCompactionInput(base_iter.Pass(), redo_deltas, undo_deltas)); + return Status::OK(); +} + +CompactionInput *CompactionInput::Create(const MemRowSet &memrowset, + const Schema* projection, + const MvccSnapshot &snap) { + CHECK(projection->has_column_ids()); + return new MemRowSetCompactionInput(memrowset, snap, projection); +} + +CompactionInput *CompactionInput::Merge(const vector > &inputs, + const Schema* schema) { + CHECK(schema->has_column_ids()); + return new MergeCompactionInput(inputs, schema); +} + + +Status RowSetsInCompaction::CreateCompactionInput(const MvccSnapshot &snap, + const Schema* schema, + shared_ptr *out) const { + CHECK(schema->has_column_ids()); + + vector > inputs; + for (const shared_ptr &rs : rowsets_) { + gscoped_ptr input; + RETURN_NOT_OK_PREPEND(rs->NewCompactionInput(schema, snap, &input), + Substitute("Could not create compaction input for rowset $0", + rs->ToString())); + inputs.push_back(shared_ptr(input.release())); + } + + if (inputs.size() == 1) { + out->swap(inputs[0]); + } else { + out->reset(CompactionInput::Merge(inputs, schema)); + } + + return Status::OK(); +} + +void RowSetsInCompaction::DumpToLog() const { + LOG(INFO) << "Selected " << rowsets_.size() << " rowsets to compact:"; + // Dump the selected rowsets to the log, and collect corresponding iterators. 
+ for (const shared_ptr &rs : rowsets_) { + LOG(INFO) << rs->ToString() << "(current size on disk: ~" + << rs->EstimateOnDiskSize() << " bytes)"; + } +} + + +Status ApplyMutationsAndGenerateUndos(const MvccSnapshot& snap, + const CompactionInputRow& src_row, + const Schema* base_schema, + Mutation** new_undo_head, + Mutation** new_redo_head, + Arena* arena, + RowBlockRow* dst_row, + bool* is_garbage_collected, + uint64_t* num_rows_history_truncated) { + // TODO actually perform garbage collection (KUDU-236). + // Right now we persist all mutations. + *is_garbage_collected = false; + + const Schema* dst_schema = dst_row->schema(); + + bool is_deleted = false; + + #define ERROR_LOG_CONTEXT \ + "Source Row: " << dst_schema->DebugRow(src_row.row) << \ + " Redo Mutations: " << Mutation::StringifyMutationList(*base_schema, src_row.redo_head) << \ + " Undo Mutations: " << Mutation::StringifyMutationList(*base_schema, src_row.undo_head) << \ + "\nDest Row: " << dst_schema->DebugRow(*dst_row) << \ + " Redo Mutations: " << Mutation::StringifyMutationList(*dst_schema, redo_head) << \ + " Undo Mutations: " << Mutation::StringifyMutationList(*dst_schema, undo_head) + + faststring dst; + RowChangeListEncoder undo_encoder(&dst); + + // Const cast this away here since we're ever only going to point to it + // which doesn't actually mutate it and having Mutation::set_next() + // take a non-const value is required in other places. + Mutation* undo_head = const_cast(src_row.undo_head); + Mutation* redo_head = nullptr; + + for (const Mutation *redo_mut = src_row.redo_head; + redo_mut != nullptr; + redo_mut = redo_mut->next()) { + + // Skip anything not committed. 
+ if (!snap.IsCommitted(redo_mut->timestamp())) { + continue; + } + + undo_encoder.Reset(); + + Mutation* current_undo; + DVLOG(3) << " @" << redo_mut->timestamp() << ": " + << redo_mut->changelist().ToString(*base_schema); + + RowChangeListDecoder redo_decoder(redo_mut->changelist()); + Status s = redo_decoder.Init(); + if (PREDICT_FALSE(!s.ok())) { + LOG(ERROR) << "Unable to decode changelist. " << ERROR_LOG_CONTEXT; + return s; + } + + if (redo_decoder.is_update()) { + DCHECK(!is_deleted) << "Got UPDATE for deleted row. " << ERROR_LOG_CONTEXT; + + s = redo_decoder.ApplyRowUpdate(dst_row, + reinterpret_cast(NULL), &undo_encoder); + if (PREDICT_FALSE(!s.ok())) { + LOG(ERROR) << "Unable to apply update/create undo: " << s.ToString() + << "\n" << ERROR_LOG_CONTEXT; + return s; + } + + // If all of the updates were for columns that we aren't projecting, we don't + // need to push them into the UNDO file. + if (undo_encoder.is_empty()) { + continue; + } + + // create the UNDO mutation in the provided arena. + current_undo = Mutation::CreateInArena(arena, redo_mut->timestamp(), + undo_encoder.as_changelist()); + + // In the case where the previous undo was NULL just make this one + // the head. + if (undo_head == nullptr) { + undo_head = current_undo; + } else { + current_undo->set_next(undo_head); + undo_head = current_undo; + } + + + } else if (redo_decoder.is_delete() || redo_decoder.is_reinsert()) { + redo_decoder.TwiddleDeleteStatus(&is_deleted); + + if (redo_decoder.is_reinsert()) { + // Right now when a REINSERT mutation is found it is treated as a new insert and it + // clears the whole row history before it. + + // Copy the reinserted row over. + Slice reinserted_slice; + RETURN_NOT_OK(redo_decoder.GetReinsertedRowSlice(*dst_schema, &reinserted_slice)); + ConstContiguousRow reinserted(dst_schema, reinserted_slice.data()); + // No need to copy into an arena -- can refer to the mutation's arena. 
+ Arena* null_arena = nullptr; + RETURN_NOT_OK(CopyRow(reinserted, dst_row, null_arena)); + + // Create an undo for the REINSERT + undo_encoder.SetToDelete(); + // Reset the UNDO head, losing all previous undos. + undo_head = Mutation::CreateInArena(arena, + redo_mut->timestamp(), + undo_encoder.as_changelist()); + + // Also reset the previous redo head since it stored the delete which was nullified + // by this reinsert + redo_head = nullptr; + + if ((*num_rows_history_truncated)++ == 0) { + LOG(WARNING) << "Found REINSERT REDO truncating row history for " + << ERROR_LOG_CONTEXT << " Note: this warning will appear " + "only for the first truncated row"; + } + + if (PREDICT_FALSE(VLOG_IS_ON(2))) { + VLOG(2) << "Found REINSERT REDO, cannot create UNDO for it, resetting row history " + " under snap: " << snap.ToString() << ERROR_LOG_CONTEXT; + } + } else { + // Delete mutations are left as redos + undo_encoder.SetToDelete(); + // Encode the DELETE as a redo + redo_head = Mutation::CreateInArena(arena, + redo_mut->timestamp(), + undo_encoder.as_changelist()); + } + } else { + LOG(FATAL) << "Unknown mutation type!" 
<< ERROR_LOG_CONTEXT; + } + } + + *new_undo_head = undo_head; + *new_redo_head = redo_head; + + return Status::OK(); + + #undef ERROR_LOG_CONTEXT +} + +Status FlushCompactionInput(CompactionInput* input, + const MvccSnapshot& snap, + RollingDiskRowSetWriter* out) { + RETURN_NOT_OK(input->Init()); + vector rows; + + DCHECK(out->schema().has_column_ids()); + + RowBlock block(out->schema(), 100, nullptr); + + uint64_t num_rows_history_truncated = 0; + + while (input->HasMoreBlocks()) { + RETURN_NOT_OK(input->PrepareBlock(&rows)); + + int n = 0; + for (const CompactionInputRow &input_row : rows) { + RETURN_NOT_OK(out->RollIfNecessary()); + + const Schema* schema = input_row.row.schema(); + DCHECK_SCHEMA_EQ(*schema, out->schema()); + DCHECK(schema->has_column_ids()); + + RowBlockRow dst_row = block.row(n); + RETURN_NOT_OK(CopyRow(input_row.row, &dst_row, reinterpret_cast(NULL))); + + DVLOG(2) << "Input Row: " << dst_row.schema()->DebugRow(dst_row) << + " RowId: " << input_row.row.row_index() << + " Undo Mutations: " << Mutation::StringifyMutationList(*schema, input_row.undo_head) << + " Redo Mutations: " << Mutation::StringifyMutationList(*schema, input_row.redo_head); + + // Collect the new UNDO/REDO mutations. + Mutation* new_undos_head = nullptr; + Mutation* new_redos_head = nullptr; + + bool is_garbage_collected; + RETURN_NOT_OK(ApplyMutationsAndGenerateUndos(snap, + input_row, + schema, + &new_undos_head, + &new_redos_head, + input->PreparedBlockArena(), + &dst_row, + &is_garbage_collected, + &num_rows_history_truncated)); + + // Whether this row was garbage collected + if (is_garbage_collected) { + DVLOG(2) << "Garbage Collected!"; + // Don't flush the row. + continue; + } + + rowid_t index_in_current_drs_; + + // We should always have UNDO deltas, until we implement delta GC. For now, + // this is a convenient assertion to catch bugs like KUDU-632. 
+ CHECK(new_undos_head != nullptr) << + "Writing an output row with no UNDOs: " + "Input Row: " << dst_row.schema()->DebugRow(dst_row) << + " RowId: " << input_row.row.row_index() << + " Undo Mutations: " << Mutation::StringifyMutationList(*schema, input_row.undo_head) << + " Redo Mutations: " << Mutation::StringifyMutationList(*schema, input_row.redo_head); + out->AppendUndoDeltas(dst_row.row_index(), new_undos_head, &index_in_current_drs_); + + if (new_redos_head != nullptr) { + out->AppendRedoDeltas(dst_row.row_index(), new_redos_head, &index_in_current_drs_); + } + + DVLOG(2) << "Output Row: " << dst_row.schema()->DebugRow(dst_row) << + " RowId: " << index_in_current_drs_ + << " Undo Mutations: " << Mutation::StringifyMutationList(*schema, new_undos_head) + << " Redo Mutations: " << Mutation::StringifyMutationList(*schema, new_redos_head); + + n++; + if (n == block.nrows()) { + RETURN_NOT_OK(out->AppendBlock(block)); + n = 0; + } + } + + if (n > 0) { + block.Resize(n); + RETURN_NOT_OK(out->AppendBlock(block)); + } + + RETURN_NOT_OK(input->FinishBlock()); + } + + if (num_rows_history_truncated > 0) { + LOG(WARNING) << "Total " << num_rows_history_truncated + << " rows lost some history due to REINSERT after DELETE"; + } + return Status::OK(); +} + +Status ReupdateMissedDeltas(const string &tablet_name, + CompactionInput *input, + const MvccSnapshot &snap_to_exclude, + const MvccSnapshot &snap_to_include, + const RowSetVector &output_rowsets) { + TRACE_EVENT0("tablet", "ReupdateMissedDeltas"); + RETURN_NOT_OK(input->Init()); + + VLOG(1) << "Re-updating missed deltas between snapshot " << + snap_to_exclude.ToString() << " and " << snap_to_include.ToString(); + + // Collect the delta trackers that we'll push the updates into. + deque delta_trackers; + for (const shared_ptr &rs : output_rowsets) { + delta_trackers.push_back(down_cast(rs.get())->delta_tracker()); + } + + // The map of updated delta trackers, indexed by id. 
+ unordered_set updated_trackers; + + // When we apply the updates to the new DMS, there is no need to anchor them + // since these stores are not yet part of the tablet. + const consensus::OpId max_op_id = consensus::MaximumOpId(); + + // The rowid where the current (front) delta tracker starts. + int64_t delta_tracker_base_row = 0; + + // TODO: on this pass, we don't actually need the row data, just the + // updates. So, this can be made much faster. + vector rows; + const Schema* schema = &input->schema(); + const Schema key_schema(input->schema().CreateKeyProjection()); + + // Arena and projector to store/project row keys for missed delta updates + Arena arena(1024, 1024*1024); + RowProjector key_projector(schema, &key_schema); + RETURN_NOT_OK(key_projector.Init()); + faststring buf; + + rowid_t row_idx = 0; + while (input->HasMoreBlocks()) { + RETURN_NOT_OK(input->PrepareBlock(&rows)); + + for (const CompactionInputRow &row : rows) { + DVLOG(2) << "Revisiting row: " << schema->DebugRow(row.row) << + " Redo Mutations: " << Mutation::StringifyMutationList(*schema, row.redo_head) << + " Undo Mutations: " << Mutation::StringifyMutationList(*schema, row.undo_head); + + for (const Mutation *mut = row.redo_head; + mut != nullptr; + mut = mut->next()) { + RowChangeListDecoder decoder(mut->changelist()); + RETURN_NOT_OK(decoder.Init()); + + if (snap_to_exclude.IsCommitted(mut->timestamp())) { + // This update was already taken into account in the first phase of the + // compaction. + continue; + } + + // We should never see a REINSERT in an input RowSet which was not + // caught in the original flush. REINSERT only occurs when an INSERT is + // done to a row when a ghost is already present for that row in + // MemRowSet. If the ghost is in a disk RowSet, it is ignored and the + // new row is inserted in the MemRowSet instead. + // + // At the beginning of a compaction/flush, a new empty MRS is swapped in for + // the one to be flushed. 
Therefore, any INSERT that happens _after_ this swap + // is made will not trigger a REINSERT: it sees the row as "deleted" in the + // snapshotted MRS, and insert triggers an INSERT into the new MRS. + // + // Any INSERT that happened _before_ the swap-out would create a + // REINSERT in the MRS to be flushed, but it would also be considered as + // part of the MvccSnapshot which we flush from ('snap_to_exclude' here) + // and therefore won't make it to this point in the code. + CHECK(!decoder.is_reinsert()) + << "Shouldn't see REINSERT missed by first flush pass in compaction." + << " snap_to_exclude=" << snap_to_exclude.ToString() + << " row=" << schema->DebugRow(row.row) + << " mutations=" << Mutation::StringifyMutationList(*schema, row.redo_head); + + if (!snap_to_include.IsCommitted(mut->timestamp())) { + // The mutation was inserted after the DuplicatingRowSet was swapped in. + // Therefore, it's already present in the output rowset, and we don't need + // to copy it in. + + DVLOG(2) << "Skipping already-duplicated delta for row " << row_idx + << " @" << mut->timestamp() << ": " << mut->changelist().ToString(*schema); + continue; + } + + // Otherwise, this is an update that arrived after the snapshot for the first + // pass, but before the DuplicatingRowSet was swapped in. We need to transfer + // this over to the output rowset. + DVLOG(1) << "Flushing missed delta for row " << row_idx + << " @" << mut->timestamp() << ": " << mut->changelist().ToString(*schema); + + DeltaTracker *cur_tracker = delta_trackers.front(); + + // The index on the input side isn't necessarily the index on the output side: + // we may have output several small DiskRowSets, so we need to find the index + // relative to the current one. 
+ int64_t idx_in_delta_tracker = row_idx - delta_tracker_base_row; + while (idx_in_delta_tracker >= cur_tracker->num_rows()) { + // If the current index is higher than the total number of rows in the current + // DeltaTracker, that means we're now processing the next one in the list. + // Pop the current front tracker, and make the indexes relative to the next + // in the list. + delta_tracker_base_row += cur_tracker->num_rows(); + idx_in_delta_tracker -= cur_tracker->num_rows(); + DCHECK_GE(idx_in_delta_tracker, 0); + delta_trackers.pop_front(); + cur_tracker = delta_trackers.front(); + } + + gscoped_ptr result(new OperationResultPB); + Status s = cur_tracker->Update(mut->timestamp(), + idx_in_delta_tracker, + mut->changelist(), + max_op_id, + result.get()); + DCHECK(s.ok()) << "Failed update on compaction for row " << row_idx + << " @" << mut->timestamp() << ": " << mut->changelist().ToString(*schema); + if (s.ok()) { + // Update the set of delta trackers with the one we've just updated. + InsertIfNotPresent(&updated_trackers, cur_tracker); + } + } + + // TODO when garbage collection kicks in we need to take care that + // CGed rows do not increment this. + row_idx++; + } + + RETURN_NOT_OK(input->FinishBlock()); + } + + + // Flush the trackers that got updated, this will make sure that all missed deltas + // get flushed before we update the tablet's metadata at the end of compaction/flush. + // Note that we don't flush the metadata here, as to we will update the metadata + // at the end of the compaction/flush. + // + // TODO: there should be a more elegant way of preventing metadata flush at this point + // using pinning, or perhaps a builder interface for new rowset metadata objects. + // See KUDU-204. 
+ + { + TRACE_EVENT0("tablet", "Flushing missed deltas"); + for (DeltaTracker* tracker : updated_trackers) { + VLOG(1) << "Flushing DeltaTracker updated with missed deltas..."; + RETURN_NOT_OK_PREPEND(tracker->Flush(DeltaTracker::NO_FLUSH_METADATA), + "Could not flush delta tracker after missed delta update"); + } + } + + return Status::OK(); +} + + +Status DebugDumpCompactionInput(CompactionInput *input, vector *lines) { + RETURN_NOT_OK(input->Init()); + vector rows; + + while (input->HasMoreBlocks()) { + RETURN_NOT_OK(input->PrepareBlock(&rows)); + + for (const CompactionInputRow &input_row : rows) { + const Schema* schema = input_row.row.schema(); + LOG_STRING(INFO, lines) << schema->DebugRow(input_row.row) << + " Undos: " + Mutation::StringifyMutationList(*schema, input_row.undo_head) << + " Redos: " + Mutation::StringifyMutationList(*schema, input_row.redo_head); + } + + RETURN_NOT_OK(input->FinishBlock()); + } + return Status::OK(); +} + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/compaction.h b/src/kudu/tablet/compaction.h new file mode 100644 index 000000000000..99a6ddadb5aa --- /dev/null +++ b/src/kudu/tablet/compaction.h @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_COMPACTION_H +#define KUDU_TABLET_COMPACTION_H + +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/common/iterator.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/memrowset.h" + +namespace kudu { +namespace tablet { +struct CompactionInputRow; +class WriteTransactionState; + +// Interface for an input feeding into a compaction or flush. +class CompactionInput { + public: + // Create an input which reads from the given rowset, yielding base rows + // prior to the given snapshot. + // + // NOTE: For efficiency, this doesn't currently filter the mutations to only + // include those committed in the given snapshot. It does, however, filter out + // rows that weren't inserted prior to this snapshot. Users of this input still + // need to call snap.IsCommitted() on each mutation. + // + // TODO: can we make the above less messy? + static Status Create(const DiskRowSet &rowset, + const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out); + + // Create an input which reads from the given memrowset, yielding base rows and updates + // prior to the given snapshot. + static CompactionInput *Create(const MemRowSet &memrowset, + const Schema* projection, + const MvccSnapshot &snap); + + // Create an input which merges several other compaction inputs. The inputs are merged + // in key-order according to the given schema. All inputs must have matching schemas. + static CompactionInput *Merge(const vector > &inputs, + const Schema *schema); + + virtual Status Init() = 0; + virtual Status PrepareBlock(vector *block) = 0; + + // Returns the arena for this compaction input corresponding to the last + // prepared block. This must be called *after* PrepareBlock() as if this + // is a MergeCompactionInput only then will the right arena be selected. 
+ virtual Arena* PreparedBlockArena() = 0; + virtual Status FinishBlock() = 0; + + virtual bool HasMoreBlocks() = 0; + virtual const Schema &schema() const = 0; + + virtual ~CompactionInput() {} +}; + +// The set of rowsets which are taking part in a given compaction. +class RowSetsInCompaction { + public: + void AddRowSet(const std::shared_ptr &rowset, + const std::shared_ptr &lock) { + CHECK(lock->owns_lock()); + + locks_.push_back(lock); + rowsets_.push_back(rowset); + } + + // Create the appropriate compaction input for this compaction -- either a merge + // of all the inputs, or the single input if there was only one. + // + // 'schema' is the schema for the output of the compaction, and must remain valid + // for the lifetime of the returned CompactionInput. + Status CreateCompactionInput(const MvccSnapshot &snap, + const Schema* schema, + std::shared_ptr *out) const; + + // Dump a log message indicating the chosen rowsets. + void DumpToLog() const; + + const RowSetVector &rowsets() const { return rowsets_; } + + size_t num_rowsets() const { + return rowsets_.size(); + } + + private: + typedef vector > LockVector; + + RowSetVector rowsets_; + LockVector locks_; +}; + +// One row yielded by CompactionInput::PrepareBlock. +struct CompactionInputRow { + // The compaction input base row. + RowBlockRow row; + // The current redo head for this row, may be null if the base row has no mutations. + const Mutation* redo_head; + // The current undo head for this row, may be null if all undos were garbage collected. + const Mutation* undo_head; +}; + +// Function shared by flushes, compactions and major delta compactions. Applies all the REDO +// mutations from 'src_row' to the 'dst_row', and generates the related UNDO mutations. Some +// handling depends on the nature of the operation being performed: +// - Flush: Applies all the REDOs to all the columns. +// - Compaction: Applies all the REDOs to all the columns. 
+// - Major delta compaction: Applies only the REDOs that have corresponding columns in the schema +// belonging to 'dst_row'. Those that don't belong to that schema are +// ignored. +// +// Currently, 'is_garbage_collected' is always false (KUDU-236). +Status ApplyMutationsAndGenerateUndos(const MvccSnapshot& snap, + const CompactionInputRow& src_row, + const Schema* base_schema, + Mutation** new_undo_head, + Mutation** new_redo_head, + Arena* arena, + RowBlockRow* dst_row, + bool* is_garbage_collected, + uint64_t* num_rows_history_truncated); + + +// Iterate through this compaction input, flushing all rows to the given RollingDiskRowSetWriter. +// The 'snap' argument should match the MvccSnapshot used to create the compaction input. +// +// After return of this function, this CompactionInput object is "used up" and will +// no longer be useful. +Status FlushCompactionInput(CompactionInput *input, + const MvccSnapshot &snap, + RollingDiskRowSetWriter *out); + +// Iterate through this compaction input, finding any mutations which came between +// snap_to_exclude and snap_to_include (ie those transactions that were not yet +// committed in 'snap_to_exclude' but _are_ committed in 'snap_to_include'). For +// each such mutation, propagate it into the compaction's output rowsets. +// +// The output rowsets passed in must be non-overlapping and in ascending key order: +// typically they are the resulting rowsets from a RollingDiskRowSetWriter. +// +// After return of this function, this CompactionInput object is "used up" and will +// yield no further rows. +Status ReupdateMissedDeltas(const string &tablet_name, + CompactionInput *input, + const MvccSnapshot &snap_to_exclude, + const MvccSnapshot &snap_to_include, + const RowSetVector &output_rowsets); + +// Dump the given compaction input to 'lines' or LOG(INFO) if it is NULL. +// This consumes all of the input in the compaction input. 
+Status DebugDumpCompactionInput(CompactionInput *input, vector *lines); + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/compaction.txt b/src/kudu/tablet/compaction.txt new file mode 100644 index 000000000000..49a8c8596c0f --- /dev/null +++ b/src/kudu/tablet/compaction.txt @@ -0,0 +1,95 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This document explains the mechanics of performing a rowset flush/compaction. +For details explaining how compactions are selected, see compaction-policy.txt. +NOTE: this does not describe anything about flushing delta stores to delta files! + +Compaction design notes +------------------------------------------------------------ + +Goal: Take two or more RowSets with overlapping key ranges, and merge +them into a new RowSet, while updates are concurrently being applied. +The output RowSet should also garbage collect (i.e reclaim storage from) +any rows which were deleted in the old RowSets. + +------------------------------ + +Let's start with the simple example of compacting from 1 input rowset to +1 output rowset. This has the effect of removing GC-able data and +applying updates. 
The compaction has two main phases: + + + "flush_snap" + | + | + before v +<----------| + Phase 1: + merging/flushing + |-----------| + Phase 2: migrate + deltas + |---------------| + compaction + complete + |-----------> + +|-------------- time -----------------------------> + + +System steady state: + - Updates are applied only to the "source RowSet" + +Transition into Phase 1: + - Create a snapshot iterator to merge the input RowSets, and save the + associated MVCC snapshot state. + +Phase 1: merge/flush data: + - Use the iterator created above to create a new set of data for the output + RowSet. This will reflect any updates or deletes which arrived prior to the + start of phase 1, but no updates or deletes which arrive during either + phase of the compaction. + + - Any mutations which arrive during this phase are applied only to the input + RowSets' delta tracking structures. Because the merge operates on a snapshot, + it will not take these into account in the output RowSet. + +Phase 2: migrate deltas from phase 1 + - Any mutations which arrive during this phase should be applied to both the + input RowSet and the output RowSet. This is simple to do by duplicating + the key lookup into the output RowSet's key column when the update arrives. + This is implemented by swapping in a "DuplicatingRowSet" implementation which + forwards updates to both the input and output rowsets. + + - Any reads during this phase must be served from the input RowSet, since the + output RowSet is missing the deltas which arrived during the merge phase. + + - Because the merge output ignored any mutations which arrived during phase 1, + we must now 'migrate' those mutations to the output RowSet. This can be done + efficiently by collecting all of the deltas which were not included in the + snapshot iterator, and applying them to the output rowset's delta tracker. 
+ + +End of Phase 2: swap RowSets + - After Phase 2, the two RowSets have logically identical data, and they may + be atomically swapped. Once the output RowSet has been swapped in, new updates + only need to be applied to the output RowSet, and the old RowSet may be dropped. + +Extending to multiple RowSets +------------------------------ + +The above algorithm can be extended to multiple RowSets equally well. At the beginning +of the compaction, each RowSet is snapshotted, and a snapshot iterator created. A merge +iterator then performs the merge of all of the snapshots in ascending key order. + diff --git a/src/kudu/tablet/compaction_policy-test.cc b/src/kudu/tablet/compaction_policy-test.cc new file mode 100644 index 000000000000..6a72cf7d2242 --- /dev/null +++ b/src/kudu/tablet/compaction_policy-test.cc @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/util/test_util.h" +#include "kudu/tablet/mock-rowsets.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/rowset_tree.h" +#include "kudu/tablet/compaction_policy.h" + +using std::shared_ptr; +using std::unordered_set; + +namespace kudu { +namespace tablet { + +// Simple test for budgeted compaction: with three rowsets which +// mostly overlap, and an high budget, they should all be selected. +TEST(TestCompactionPolicy, TestBudgetedSelection) { + RowSetVector vec; + vec.push_back(shared_ptr(new MockDiskRowSet("C", "c"))); + vec.push_back(shared_ptr(new MockDiskRowSet("B", "a"))); + vec.push_back(shared_ptr(new MockDiskRowSet("A", "b"))); + + RowSetTree tree; + ASSERT_OK(tree.Reset(vec)); + + const int kBudgetMb = 1000; // enough to select all + BudgetedCompactionPolicy policy(kBudgetMb); + + unordered_set picked; + double quality = 0; + ASSERT_OK(policy.PickRowSets(tree, &picked, &quality, nullptr)); + ASSERT_EQ(3, picked.size()); + ASSERT_GE(quality, 1.0); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/compaction_policy.cc b/src/kudu/tablet/compaction_policy.cc new file mode 100644 index 000000000000..c1d642e2a2c2 --- /dev/null +++ b/src/kudu/tablet/compaction_policy.cc @@ -0,0 +1,347 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/compaction_policy.h" + +#include + +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/rowset_info.h" +#include "kudu/tablet/rowset_tree.h" +#include "kudu/tablet/svg_dump.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/knapsack_solver.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +using std::vector; + +DEFINE_int32(budgeted_compaction_target_rowset_size, 32*1024*1024, + "The target size for DiskRowSets during flush/compact when the " + "budgeted compaction policy is used"); +TAG_FLAG(budgeted_compaction_target_rowset_size, experimental); +TAG_FLAG(budgeted_compaction_target_rowset_size, advanced); + +namespace kudu { +namespace tablet { + +// Adjust the result downward slightly for wider solutions. +// Consider this input: +// +// |-----A----||----C----| +// |-----B----| +// +// where A, B, and C are all 1MB, and the budget is 10MB. +// +// Without this tweak, the solution {A, B, C} has the exact same +// solution value as {A, B}, since both compactions would yield a +// tablet with average height 1. Since both solutions fit within +// the budget, either would be a valid pick, and it would be up +// to chance which solution would be selected. +// Intuitively, though, there's no benefit to including "C" in the +// compaction -- it just uses up some extra IO. If we slightly +// penalize wider solutions as a tie-breaker, then we'll pick {A, B} +// here. 
+static const double kSupportAdjust = 1.01; + +//////////////////////////////////////////////////////////// +// BudgetedCompactionPolicy +//////////////////////////////////////////////////////////// + +BudgetedCompactionPolicy::BudgetedCompactionPolicy(int budget) + : size_budget_mb_(budget) { + CHECK_GT(budget, 0); +} + +uint64_t BudgetedCompactionPolicy::target_rowset_size() const { + CHECK_GT(FLAGS_budgeted_compaction_target_rowset_size, 0); + return FLAGS_budgeted_compaction_target_rowset_size; +} + +// Returns in min-key and max-key sorted order +void BudgetedCompactionPolicy::SetupKnapsackInput(const RowSetTree &tree, + vector* min_key, + vector* max_key) { + RowSetInfo::CollectOrdered(tree, min_key, max_key); + + if (min_key->size() < 2) { + // require at least 2 rowsets to compact + min_key->clear(); + max_key->clear(); + return; + } +} + +namespace { + +struct CompareByDescendingDensity { + bool operator()(const RowSetInfo& a, const RowSetInfo& b) const { + return a.density() > b.density(); + } +}; + +struct KnapsackTraits { + typedef RowSetInfo item_type; + typedef double value_type; + static int get_weight(const RowSetInfo &item) { + return item.size_mb(); + } + static value_type get_value(const RowSetInfo &item) { + return item.width(); + } +}; + +// Dereference-then-compare comparator +template +struct DerefCompare { + template + bool operator()(T* a, T* b) const { + static const Compare comp = Compare(); + return comp(*a, *b); + } +}; + +// Incremental calculator for the upper bound on a knapsack solution, +// given a set of items. The upper bound is computed by solving the +// simpler "fractional knapsack problem" -- i.e the related problem +// in which each input may be fractionally put in the knapsack, instead +// of all-or-nothing. The fractional knapsack problem has a very efficient +// solution: sort by descending density and greedily choose elements +// until the budget is reached. 
The last element to be chosen may be +// partially included in the knapsack. +// +// Because this greedy solution only depends on sorting, it can be computed +// incrementally as items are considered by maintaining a min-heap, ordered +// by the density of the input elements. We need only maintain enough elements +// to satisfy the budget, making this logarithmic in the budget and linear +// in the number of elements added. +class UpperBoundCalculator { + public: + explicit UpperBoundCalculator(int max_weight) + : total_weight_(0), + total_value_(0), + max_weight_(max_weight), + topdensity_(MathLimits::kNegInf) { + } + + void Add(const RowSetInfo& candidate) { + // No need to add if less dense than the top and have no more room + if (total_weight_ >= max_weight_ && + candidate.density() <= topdensity_) + return; + + fractional_solution_.push_back(&candidate); + std::push_heap(fractional_solution_.begin(), fractional_solution_.end(), + DerefCompare()); + + total_weight_ += candidate.size_mb(); + total_value_ += candidate.width(); + const RowSetInfo& top = *fractional_solution_.front(); + if (total_weight_ - top.size_mb() >= max_weight_) { + total_weight_ -= top.size_mb(); + total_value_ -= top.width(); + std::pop_heap(fractional_solution_.begin(), fractional_solution_.end(), + DerefCompare()); + fractional_solution_.pop_back(); + } + topdensity_ = fractional_solution_.front()->density(); + } + + // Compute the upper-bound to the 0-1 knapsack problem with the elements + // added so far. 
+ double ComputeUpperBound() const { + int excess_weight = total_weight_ - max_weight_; + if (excess_weight <= 0) { + return total_value_; + } + + const RowSetInfo& top = *fractional_solution_.front(); + double fraction_of_top_to_remove = static_cast(excess_weight) / top.size_mb(); + DCHECK_GT(fraction_of_top_to_remove, 0); + return total_value_ - fraction_of_top_to_remove * top.width(); + } + + void clear() { + fractional_solution_.clear(); + total_weight_ = 0; + total_value_ = 0; + } + + private: + + // Store pointers to RowSetInfo rather than whole copies in order + // to allow for fast swapping in the heap. + vector fractional_solution_; + int total_weight_; + double total_value_; + int max_weight_; + double topdensity_; +}; + +} // anonymous namespace + +Status BudgetedCompactionPolicy::PickRowSets(const RowSetTree &tree, + unordered_set* picked, + double* quality, + std::vector* log) { + vector asc_min_key, asc_max_key; + SetupKnapsackInput(tree, &asc_min_key, &asc_max_key); + if (asc_max_key.empty()) { + if (log) { + LOG_STRING(INFO, log) << "No rowsets to compact"; + } + // nothing to compact. + return Status::OK(); + } + + UpperBoundCalculator ub_calc(size_budget_mb_); + KnapsackSolver solver; + + // The best set of rowsets chosen so far + unordered_set best_chosen; + // The value attained by the 'best_chosen' solution. + double best_optimal = 0; + + vector chosen_indexes; + vector inrange_candidates; + inrange_candidates.reserve(asc_min_key.size()); + vector upper_bounds; + + for (const RowSetInfo& cc_a : asc_min_key) { + chosen_indexes.clear(); + inrange_candidates.clear(); + ub_calc.clear(); + upper_bounds.clear(); + + double ab_min = cc_a.cdf_min_key(); + double ab_max = cc_a.cdf_max_key(); + + // Collect all other candidates which would not expand the support to the + // left of this one. 
Because these are sorted by ascending max key, we can + // easily ensure that whenever we add a 'cc_b' to our candidate list for the + // knapsack problem, we've already included all rowsets which fall in the + // range from cc_a.min to cc_b.max. + // + // For example: + // + // |-----A----| + // |-----B----| + // |----C----| + // |--------D-------| + // + // We process in the order: A, B, C, D. + // + // This saves us from having to iterate through the list again to find all + // such rowsets. + // + // Additionally, each knapsack problem builds on the previous knapsack + // problem by adding just a single rowset, meaning that we can reuse the + // existing dynamic programming state to incrementally update the solution, + // rather than having to rebuild from scratch. + for (const RowSetInfo& cc_b : asc_max_key) { + if (cc_b.cdf_min_key() < ab_min) { + // Would expand support to the left. + // TODO: possible optimization here: binary search to skip to the first + // cc_b with cdf_max_key() > cc_a.cdf_min_key() + continue; + } + inrange_candidates.push_back(cc_b); + + // While we're iterating, also calculate the upper bound for the solution + // on the set within the [ab_min, ab_max] output range. + ab_max = std::max(cc_b.cdf_max_key(), ab_max); + double union_width = ab_max - ab_min; + + ub_calc.Add(cc_b); + upper_bounds.push_back(ub_calc.ComputeUpperBound() - union_width * kSupportAdjust); + } + if (inrange_candidates.empty()) continue; + // If the best upper bound across this whole range is worse than our current + // optimal, we can short circuit all the knapsack-solving. + if (*std::max_element(upper_bounds.begin(), upper_bounds.end()) < best_optimal) continue; + + solver.Reset(size_budget_mb_, &inrange_candidates); + + ab_max = cc_a.cdf_max_key(); + + int i = 0; + while (solver.ProcessNext()) { + // If this candidate's upper bound is worse than the optimal, we don't + // need to look at it. 
+ const RowSetInfo& item = inrange_candidates[i]; + double upper_bound = upper_bounds[i]; + i++; + if (upper_bound < best_optimal) continue; + + std::pair best_with_this_item = solver.GetSolution(); + double best_value = best_with_this_item.second; + + ab_max = std::max(item.cdf_max_key(), ab_max); + DCHECK_GE(ab_max, ab_min); + double solution = best_value - (ab_max - ab_min) * kSupportAdjust; + DCHECK_LE(solution, upper_bound + 0.0001); + + if (solution > best_optimal) { + solver.TracePath(best_with_this_item, &chosen_indexes); + best_optimal = solution; + } + } + + // If we came up with a new solution, replace. + if (!chosen_indexes.empty()) { + best_chosen.clear(); + for (int i : chosen_indexes) { + best_chosen.insert(inrange_candidates[i].rowset()); + } + } + } + + // Log the input and output of the selection. + if (VLOG_IS_ON(1) || log != nullptr) { + LOG_STRING(INFO, log) << "Budgeted compaction selection:"; + for (RowSetInfo &cand : asc_min_key) { + const char *checkbox = "[ ]"; + if (ContainsKey(best_chosen, cand.rowset())) { + checkbox = "[x]"; + } + LOG_STRING(INFO, log) << " " << checkbox << " " << cand.ToString(); + } + LOG_STRING(INFO, log) << "Solution value: " << best_optimal; + } + + *quality = best_optimal; + + if (best_optimal <= 0) { + VLOG(1) << "Best compaction available makes things worse. Not compacting."; + return Status::OK(); + } + + picked->swap(best_chosen); + DumpCompactionSVG(asc_min_key, *picked); + + return Status::OK(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/compaction_policy.h b/src/kudu/tablet/compaction_policy.h new file mode 100644 index 000000000000..7f660eee92e6 --- /dev/null +++ b/src/kudu/tablet/compaction_policy.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_COMPACTION_POLICY_H +#define KUDU_TABLET_COMPACTION_POLICY_H + +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tablet { + +class RowSet; +class RowSetTree; + +class RowSetInfo; + +// A Compaction Policy is responsible for picking which files in a tablet +// should be compacted together. +class CompactionPolicy { + public: + CompactionPolicy() {} + virtual ~CompactionPolicy() {} + + // Select a set of RowSets to compact out of 'tree'. + // + // Callers are responsible for externally synchronizing selection within a + // given Tablet. This will only select rowsets whose compact_flush_lock + // is unlocked, but will not itself take the lock. Hence no other threads + // should lock or unlock the rowsets' compact_flush_lock while this method + // is running. + // + // *quality is set to represent how effective the compaction will be on + // reducing IO in the tablet. TODO: determine the units/ranges of this thing. + // + // If 'log' is not NULL, then a verbose log of the compaction selection + // process will be appended to it. + virtual Status PickRowSets(const RowSetTree &tree, + std::unordered_set* picked, + double* quality, + std::vector* log) = 0; + + // Return the size at which flush/compact should "roll" to new files. 
Some + // compaction policies may prefer to deal with small constant-size files + // whereas others may prefer large ones. + virtual uint64_t target_rowset_size() const { + return 1024 * 1024 * 1024; // no rolling + } + + private: + DISALLOW_COPY_AND_ASSIGN(CompactionPolicy); +}; + +// Compaction policy which, given a size budget for a compaction, and a workload, +// tries to pick a set of RowSets which fit into that budget and minimize the +// future cost of operations on the tablet. +// +// See src/kudu/tablet/compaction-policy.txt for details. +class BudgetedCompactionPolicy : public CompactionPolicy { + public: + explicit BudgetedCompactionPolicy(int size_budget_mb); + + virtual Status PickRowSets(const RowSetTree &tree, + std::unordered_set* picked, + double* quality, + std::vector* log) OVERRIDE; + + virtual uint64_t target_rowset_size() const OVERRIDE; + + private: + void SetupKnapsackInput(const RowSetTree &tree, + std::vector* min_key, + std::vector* max_key); + + size_t size_budget_mb_; +}; + +} // namespace tablet +} // namespace kudu +#endif diff --git a/src/kudu/tablet/composite-pushdown-test.cc b/src/kudu/tablet/composite-pushdown-test.cc new file mode 100644 index 000000000000..47d4d312d959 --- /dev/null +++ b/src/kudu/tablet/composite-pushdown-test.cc @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tablet { + +const char* const kTestHostnames[] = { "foo", "foobar", "baz", nullptr }; + +class CompositePushdownTest : public KuduTabletTest { + public: + CompositePushdownTest() + : KuduTabletTest(Schema({ ColumnSchema("year", INT16), + ColumnSchema("month", INT8), + ColumnSchema("day", INT8), + ColumnSchema("hostname", STRING), + ColumnSchema("data", STRING) }, + 4)) { + } + + virtual void SetUp() OVERRIDE { + KuduTabletTest::SetUp(); + + FillTestTablet(); + } + + void FillTestTablet() { + uint32_t nrows = 10 * 12 * 28; + int i = 0; + + LocalTabletWriter writer(tablet().get(), &client_schema_); + KuduPartialRow row(&client_schema_); + for (int16_t year = 2000; year <= 2010; year++) { + for (int8_t month = 1; month <= 12; month++) { + for (int8_t day = 1; day <= 28; day++) { + for (int host_idx = 0; kTestHostnames[host_idx] != nullptr; host_idx++) { + CHECK_OK(row.SetInt16(0, year)); + CHECK_OK(row.SetInt8(1, month)); + CHECK_OK(row.SetInt8(2, day)); + CHECK_OK(row.SetStringCopy(3, kTestHostnames[host_idx])); + CHECK_OK(row.SetStringCopy(4, StringPrintf("%d/%02d/%02d-%s", year, month, day, + kTestHostnames[host_idx]))); + ASSERT_OK_FAST(writer.Insert(row)); + + if (i == nrows * 9 / 10) { + ASSERT_OK(tablet()->Flush()); + } + ++i; + } + } + } + } + } + + // Helper function for sorting returned results by the 'data' 
field. + // This is needed as "2" is lexicographically greater than "12" which means + // that, e.g., comparing "(int16 year=2001, int8 month=2, int8 day=7, string + // data=2001/02/07)" to "(int16 year=2001, int8 month=12, int8 + // day=7, string data=2001/12/07)" would be semantically incorrect if + // the comparison was on the whole string vs the last portion of the + // string ("2001/02/01" vs. "2001/12/01") + struct SuffixComparator { + bool operator()(const string &a, const string &b) { + string s_a = a.substr(a.find("data=")); + string s_b = b.substr(b.find("data=")); + return s_a < s_b; + } + }; + + void ScanTablet(ScanSpec *spec, vector *results, const char *descr) { + SCOPED_TRACE(descr); + + gscoped_ptr iter; + ASSERT_OK(tablet()->NewRowIterator(client_schema_, &iter)); + ASSERT_OK(iter->Init(spec)); + ASSERT_TRUE(spec->predicates().empty()) << "Should have accepted all predicates"; + LOG_TIMING(INFO, descr) { + ASSERT_OK(IterateToStringList(iter.get(), results)); + } + std::sort(results->begin(), results->end(), SuffixComparator()); + for (const string &str : *results) { + VLOG(1) << str; + } + } +}; + +TEST_F(CompositePushdownTest, TestPushDownExactEquality) { + ScanSpec spec; + int16_t year = 2001; + int8_t month = 9; + int8_t day = 7; + Slice host(kTestHostnames[0]); + ColumnRangePredicate pred_year(schema_.column(0), &year, &year); + ColumnRangePredicate pred_month(schema_.column(1), &month, &month); + ColumnRangePredicate pred_day(schema_.column(2), &day, &day); + ColumnRangePredicate pred_host(schema_.column(3), &host, &host); + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month); + spec.AddPredicate(pred_day); + spec.AddPredicate(pred_host); + vector results; + + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Exact match using compound key")); + ASSERT_EQ(1, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=7, " + "string hostname=foo, string data=2001/09/07-foo)", + results.front()); +} + + +// Test for "host 
<= 'foo'" which should reject 'foobaz'. +// Regression test for a bug in an earlier implementation of predicate pushdown. +TEST_F(CompositePushdownTest, TestPushDownStringInequality) { + ScanSpec spec; + int16_t year = 2001; + int8_t month = 9; + int8_t day = 7; + Slice host("foo"); + ColumnRangePredicate pred_year(schema_.column(0), &year, &year); + ColumnRangePredicate pred_month(schema_.column(1), &month, &month); + ColumnRangePredicate pred_day(schema_.column(2), &day, &day); + ColumnRangePredicate pred_host(schema_.column(3), nullptr, &host); + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month); + spec.AddPredicate(pred_day); + spec.AddPredicate(pred_host); + vector results; + + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Exact match using compound key")); + ASSERT_EQ(2, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=7, " + "string hostname=baz, string data=2001/09/07-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=7, " + "string hostname=foo, string data=2001/09/07-foo)", + results.back()); +} + + +TEST_F(CompositePushdownTest, TestPushDownDateEquality) { + ScanSpec spec; + int16_t year = 2001; + int8_t month = 9; + int8_t day = 7; + ColumnRangePredicate pred_year(schema_.column(0), &year, &year); + ColumnRangePredicate pred_month(schema_.column(1), &month, &month); + ColumnRangePredicate pred_day(schema_.column(2), &day, &day); + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month); + spec.AddPredicate(pred_day); + vector results; + + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Exact match using compound key")); + ASSERT_EQ(3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=7, " + "string hostname=baz, string data=2001/09/07-baz)", + results[0]); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=7, " + "string hostname=foo, string data=2001/09/07-foo)", + results[1]); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=7, " + "string 
hostname=foobar, string data=2001/09/07-foobar)", + results[2]); +} + +TEST_F(CompositePushdownTest, TestPushDownPrefixEquality) { + int16_t year = 2001; + int8_t month = 9; + ColumnRangePredicate pred_year(schema_.column(0), &year, &year); + ColumnRangePredicate pred_month(schema_.column(1), &month, &month); + + { + ScanSpec spec; + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, + "Prefix match using 2/3 of a compound key")); + ASSERT_EQ(28 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=1, " + "string hostname=baz, string data=2001/09/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=28, " + "string hostname=foobar, string data=2001/09/28-foobar)", + results.back()); + } + + { + ScanSpec spec; + spec.AddPredicate(pred_year); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, + "Prefix match using 1/3 of a compound key")); + ASSERT_EQ(28 * 12 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=1, int8 day=1, " + "string hostname=baz, string data=2001/01/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=2, int8 day=1, " + "string hostname=baz, string data=2001/02/01-baz)", + results[28 * 3]); + EXPECT_EQ("(int16 year=2001, int8 month=12, int8 day=28, " + "string hostname=foobar, string data=2001/12/28-foobar)", + results.back()); + } +} + +TEST_F(CompositePushdownTest, TestPushDownPrefixEqualitySuffixInequality) { + int16_t year = 2001; + int8_t month_l = 9; + int8_t month_u = 11; + int8_t day_l = 1; + int8_t day_u = 15; + + ColumnRangePredicate pred_year(schema_.column(0), &year, &year); + + ColumnRangePredicate pred_month_eq(schema_.column(1), &month_l, &month_l); + ColumnRangePredicate pred_month_ge_le(schema_.column(1), &month_l, &month_u); + ColumnRangePredicate pred_month_le(schema_.column(1), nullptr, &month_l); + + ColumnRangePredicate 
pred_day_ge_le(schema_.column(2), &day_l, &day_u); + ColumnRangePredicate pred_day_ge(schema_.column(2), &day_l, nullptr); + ColumnRangePredicate pred_day_le(schema_.column(2), nullptr, &day_u); + + { + // year=2001, month=9, day >= 1 && day <= 15 + ScanSpec spec; + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month_eq); + spec.AddPredicate(pred_day_ge_le); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix equality, suffix inequality")); + ASSERT_EQ(15 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=1, " + "string hostname=baz, string data=2001/09/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=15, " + "string hostname=foobar, string data=2001/09/15-foobar)", + results.back()); + } + + { + // year=2001, month=9, day >= 1 + ScanSpec spec; + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month_eq); + spec.AddPredicate(pred_day_ge); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix equality, suffix inequality")); + ASSERT_EQ(28 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=1, " + "string hostname=baz, string data=2001/09/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=28, " + "string hostname=foobar, string data=2001/09/28-foobar)", + results.back()); + } + + { + // year=2001, month=9, day <= 15 + ScanSpec spec; + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month_eq); + spec.AddPredicate(pred_day_le); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix equality, suffix inequality")); + ASSERT_EQ(15 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=1, " + "string hostname=baz, string data=2001/09/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=15, " + "string hostname=foobar, string data=2001/09/15-foobar)", + results.back()); + } + + { + // year=2001, month >= 9 && 
month <= 11 + ScanSpec spec; + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month_ge_le); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix equality, suffix inequality")); + ASSERT_EQ(3 * 28 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=1, " + "string hostname=baz, string data=2001/09/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=11, int8 day=28, " + "string hostname=foobar, string data=2001/11/28-foobar)", + results.back()); + } + + { + // year=2001, month <= 9 + ScanSpec spec; + spec.AddPredicate(pred_year); + spec.AddPredicate(pred_month_le); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix equality, suffix inequality")); + ASSERT_EQ(9 * 28 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=1, int8 day=1, " + "string hostname=baz, string data=2001/01/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2001, int8 month=9, int8 day=28, " + "string hostname=foobar, string data=2001/09/28-foobar)", + results.back()); + } +} + +TEST_F(CompositePushdownTest, TestPushdownPrefixInequality) { + + int16_t year_2001 = 2001; + int16_t year_2003 = 2003; + { + // year >= 2001 && year <= 2003 + ColumnRangePredicate pred_year(schema_.column(0), &year_2001, &year_2003); + ScanSpec spec; + spec.AddPredicate(pred_year); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix inequality")); + ASSERT_EQ(3 * 12 * 28 * 3, results.size()); + EXPECT_EQ("(int16 year=2001, int8 month=1, int8 day=1, " + "string hostname=baz, string data=2001/01/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2003, int8 month=12, int8 day=28, " + "string hostname=foobar, string data=2003/12/28-foobar)", + results.back()); + } + + { + // year >= 2001 + ColumnRangePredicate pred_year(schema_.column(0), &year_2001, nullptr); + ScanSpec spec; + spec.AddPredicate(pred_year); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, 
&results, "Prefix inequality")); + ASSERT_EQ(10 * 12 * 28 * 3, results.size()); + // Needed because results from memrowset are returned first and memrowset begins + // with last 10% of the keys (e.g., last few years) + EXPECT_EQ("(int16 year=2001, int8 month=1, int8 day=1, " + "string hostname=baz, string data=2001/01/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2010, int8 month=12, int8 day=28, " + "string hostname=foobar, string data=2010/12/28-foobar)", + results.back()); + } + + { + // year <= 2003 + ColumnRangePredicate pred_year(schema_.column(0), nullptr, &year_2003); + ScanSpec spec; + spec.AddPredicate(pred_year); + vector results; + ASSERT_NO_FATAL_FAILURE(ScanTablet(&spec, &results, "Prefix inequality")); + ASSERT_EQ(4 * 12 * 28 * 3, results.size()); + EXPECT_EQ("(int16 year=2000, int8 month=1, int8 day=1, " + "string hostname=baz, string data=2000/01/01-baz)", + results.front()); + EXPECT_EQ("(int16 year=2003, int8 month=12, int8 day=28, " + "string hostname=foobar, string data=2003/12/28-foobar)", + results.back()); + } +} + + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/concurrent_btree.h b/src/kudu/tablet/concurrent_btree.h new file mode 100644 index 000000000000..1d3be267021e --- /dev/null +++ b/src/kudu/tablet/concurrent_btree.h @@ -0,0 +1,1800 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This file implements a concurrent in-memory B-tree similar to the one +// described in the MassTree paper; +// "Cache Craftiness for Fast Multicore Key-Value Storage" +// Mao, Kohler, and Morris +// Eurosys 2012 +// +// This implementation is only the B-tree component, and not the "trie of trees" +// which make up their full data structure. In addition to this, there are +// some other key differences: +// - We do not support removal of elements from the tree -- in the Kudu memrowset +// use case, we use a deletion bit to indicate a removed record, and end up +// actually removing the storage at compaction time. +// - We do not support updating elements in the tree. Because we use MVCC, we +// only append new entries. A limited form of update is allowed in that data +// may be modified so long as the size is not changed. In that case, it is +// up to the user to provide concurrency control of the update (eg by using +// atomic operations or external locking) +// - The leaf nodes are linked together with a "next" pointer. 
This makes +// scanning simpler (the Masstree implementation avoids this because it +// complicates the removal operation) +#ifndef KUDU_TABLET_CONCURRENT_BTREE_H +#define KUDU_TABLET_CONCURRENT_BTREE_H + +#include +#include +#include +#include +#include + +#include "kudu/util/inline_slice.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/status.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/port.h" + +//#define TRAVERSE_PREFETCH +#define SCAN_PREFETCH + + +// Define the following to get an ugly printout on each node split +// to see how much of the node was actually being used. +// #define DEBUG_DUMP_SPLIT_STATS + +namespace kudu { namespace tablet { +namespace btree { + +// All CBTree implementation classes are templatized on a traits +// structure which customizes the implementation at compile-time. +// +// This default implementation should be reasonable for most usage. +struct BTreeTraits { + enum TraitConstants { + // Number of bytes used per internal node. + internal_node_size = 4 * CACHELINE_SIZE, + + // Number of bytes used by a leaf node. + leaf_node_size = 4 * CACHELINE_SIZE, + + // Tests can set this trait to a non-zero value, which inserts + // some pause-loops in key parts of the code to try to simulate + // races. + debug_raciness = 0 + }; + typedef ThreadSafeArena ArenaType; +}; + +template +inline void PrefetchMemory(const T *addr) { + int size = std::min(sizeof(T), 4 * CACHELINE_SIZE); + + for (int i = 0; i < size; i += CACHELINE_SIZE) { + prefetch(reinterpret_cast(addr) + i, PREFETCH_HINT_T0); + } +} + +// Utility function that, when Traits::debug_raciness is non-zero +// (i.e only in debug code), will spin for some amount of time +// related to that setting. +// This can be used when trying to debug race conditions, but +// will compile away in production code. 
+template +void DebugRacyPoint() { + if (Traits::debug_raciness > 0) { + boost::detail::yield(Traits::debug_raciness); + } +} + +template class NodeBase; +template class InternalNode; +template class LeafNode; +template class PreparedMutation; +template class CBTree; +template class CBTreeIterator; + +typedef base::subtle::Atomic64 AtomicVersion; + +struct VersionField { + public: + static AtomicVersion StableVersion(volatile AtomicVersion *version) { + for (int loop_count = 0; true; loop_count++) { + AtomicVersion v_acq = base::subtle::Acquire_Load(version); + if (PREDICT_TRUE(!IsLocked(v_acq))) { + return v_acq; + } + boost::detail::yield(loop_count++); + } + } + + static void Lock(volatile AtomicVersion *version) { + int loop_count = 0; + + while (true) { + AtomicVersion v_acq = base::subtle::Acquire_Load(version); + if (PREDICT_TRUE(!IsLocked(v_acq))) { + AtomicVersion v_locked = SetLockBit(v_acq, 1); + if (PREDICT_TRUE(base::subtle::Acquire_CompareAndSwap(version, v_acq, v_locked) == v_acq)) { + return; + } + } + // Either was already locked by someone else, or CAS failed. + boost::detail::yield(loop_count++); + } + } + + static void Unlock(volatile AtomicVersion *version) { + // NoBarrier should be OK here, because no one else modifies the + // version while we have it locked. + AtomicVersion v = base::subtle::NoBarrier_Load(version); + + DCHECK(v & BTREE_LOCK_MASK); + + // If splitting, increment the splitting field + v += ((v & BTREE_SPLITTING_MASK) >> BTREE_SPLITTING_BIT) << BTREE_VSPLIT_SHIFT; + // If inserting, increment the insert field + v += ((v & BTREE_INSERTING_MASK) >> BTREE_INSERTING_BIT) << BTREE_VINSERT_SHIFT; + + // Get rid of the lock, flags and any overflow into the unused section. 
+ v = SetLockBit(v, 0); + v &= ~(BTREE_UNUSED_MASK | BTREE_INSERTING_MASK | BTREE_SPLITTING_MASK); + + base::subtle::Release_Store(version, v); + } + + static uint64_t GetVSplit(AtomicVersion v) { + return v & BTREE_VSPLIT_MASK; + } + static uint64_t GetVInsert(AtomicVersion v) { + return (v & BTREE_VINSERT_MASK) >> BTREE_VINSERT_SHIFT; + } + static void SetSplitting(volatile AtomicVersion *v) { + base::subtle::Release_Store(v, *v | BTREE_SPLITTING_MASK); + } + static void SetInserting(volatile AtomicVersion *v) { + base::subtle::Release_Store(v, *v | BTREE_INSERTING_MASK); + } + static void SetLockedInsertingNoBarrier(volatile AtomicVersion *v) { + *v = VersionField::BTREE_LOCK_MASK | VersionField::BTREE_INSERTING_MASK; + } + + // Return true if the two version fields differ in more + // than just the lock status. + static bool IsDifferent(AtomicVersion v1, AtomicVersion v2) { + return PREDICT_FALSE((v1 & ~BTREE_LOCK_MASK) != (v2 & ~BTREE_LOCK_MASK)); + } + + // Return true if a split has occurred between the two versions + // or is currently in progress + static bool HasSplit(AtomicVersion v1, AtomicVersion v2) { + return PREDICT_FALSE((v1 & (BTREE_VSPLIT_MASK | BTREE_SPLITTING_MASK)) != + (v2 & (BTREE_VSPLIT_MASK | BTREE_SPLITTING_MASK))); + } + + static inline bool IsLocked(AtomicVersion v) { + return v & BTREE_LOCK_MASK; + } + static inline bool IsSplitting(AtomicVersion v) { + return v & BTREE_SPLITTING_MASK; + } + static inline bool IsInserting(AtomicVersion v) { + return v & BTREE_INSERTING_MASK; + } + + static string Stringify(AtomicVersion v) { + return StringPrintf("[flags=%c%c%c vins=%" PRIu64 " vsplit=%" PRIu64 "]", + (v & BTREE_LOCK_MASK) ? 'L':' ', + (v & BTREE_SPLITTING_MASK) ? 'S':' ', + (v & BTREE_INSERTING_MASK) ? 
'I':' ', + GetVInsert(v), + GetVSplit(v)); + } + + private: + enum { + BTREE_LOCK_BIT = 63, + BTREE_SPLITTING_BIT = 62, + BTREE_INSERTING_BIT = 61, + BTREE_VINSERT_SHIFT = 27, + BTREE_VSPLIT_SHIFT = 0, + +#define BB(x) BOOST_BINARY(x) + BTREE_LOCK_MASK = + BB(10000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), + BTREE_SPLITTING_MASK = + BB(01000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), + BTREE_INSERTING_MASK = + BB(00100000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), + + // There is one unused byte between the single-bit fields and the + // incremented fields. This allows us to efficiently increment the + // fields and avoid an extra instruction or two, since we don't need + // to worry about overflow. If vsplit overflows into vinsert, that's + // not a problem, since the vsplit change always results in a retry. + // If we roll over into this unused bit, we'll mask it out. + BTREE_UNUSED_MASK = + BB(00010000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), + BTREE_VINSERT_MASK = + BB(00001111 11111111 11111111 11111111 11111100 00000000 00000000 00000000 ), + BTREE_VSPLIT_MASK = + BB(00000000 00000000 00000000 00000000 00000011 11111111 11111111 11111111 ), +#undef BB + }; + + //Undeclared constructor - this is just static utilities. + VersionField(); + + static AtomicVersion SetLockBit(AtomicVersion v, int lock) { + DCHECK(lock == 0 || lock == 1); + v = v & ~BTREE_LOCK_MASK; + COMPILE_ASSERT(sizeof(AtomicVersion) == 8, must_use_64bit_version); + v |= (uint64_t)lock << BTREE_LOCK_BIT; + return v; + } +}; + +// Slice-like class for representing pointers to values in leaf nodes. +// This is used in preference to a normal Slice only so that it can have +// the same API as InlineSlice, and because it takes up less space +// inside of the tree leaves themselves. +// +// Stores the length of its data as the first sizeof(uintptr_t) bytes of +// the pointed-to data. 
+class ValueSlice { + private: + // We have to use a word-size field to store the length of the slice so + // that the user's data starts at a word-aligned address. + // Otherwise, the user could not use atomic operations on pointers inside + // their value (eg the mutation linked list in an MRS). + typedef uintptr_t size_type; + public: + Slice as_slice() const { + return Slice(ptr_ + sizeof(size_type), + *reinterpret_cast(ptr_)); + } + + // Set this slice to a copy of 'src', allocated from alloc_arena. + // The copy will be word-aligned. No memory ordering is implied. + template + void set(const Slice& src, ArenaType* alloc_arena) { + uint8_t* in_arena = reinterpret_cast( + alloc_arena->AllocateBytesAligned(src.size() + sizeof(size_type), + sizeof(uint8_t*))); + // No special CAS/etc are necessary here, since anyone calling this holds the + // lock on the row. Concurrent readers never try to follow this pointer until + // they've gotten a consistent snapshot. + // + // (This is different than the keys, where concurrent tree traversers may + // actually try to follow the key indirection pointers from InlineSlice + // without copying a snapshot first). 
+ DCHECK_LE(src.size(), MathLimits::kMax) + << "Slice too large for btree"; + size_type size = src.size(); + memcpy(in_arena, &size, sizeof(size)); + memcpy(in_arena + sizeof(size), src.data(), src.size()); + ptr_ = const_cast(in_arena); + } + + private: + const uint8_t* ptr_; +} PACKED; + +// Return the index of the first entry in the array which is +// >= the given value +template +size_t FindInSliceArray(const InlineSlice *array, ssize_t num_entries, + const Slice &key, bool *exact) { + DCHECK_GE(num_entries, 0); + + if (PREDICT_FALSE(num_entries == 0)) { + *exact = false; + return 0; + } + + size_t left = 0; + size_t right = num_entries - 1; + + while (left < right) { + int mid = (left + right + 1) / 2; + // TODO: inline slices with more than 8 bytes will store a prefix of the + // slice inline, which we could use to short circuit some of these comparisons. + int compare = array[mid].as_slice().compare(key); + if (compare < 0) { // mid < key + left = mid; + } else if (compare > 0) { // mid > search + right = mid - 1; + } else { // mid == search + *exact = true; + return mid; + } + } + + int compare = array[left].as_slice().compare(key); + *exact = compare == 0; + if (compare < 0) { // key > left + left++; + } + return left; +} + + +template +static void InsertInSliceArray(ISlice *array, size_t num_entries, + const Slice &src, size_t idx, + ArenaType *arena) { + DCHECK_LT(idx, num_entries); + for (size_t i = num_entries - 1; i > idx; i--) { + array[i] = array[i - 1]; + } + array[idx].set(src, arena); +} + + +template +class NodeBase { + public: + AtomicVersion StableVersion() { + return VersionField::StableVersion(&version_); + } + + AtomicVersion AcquireVersion() { + return base::subtle::Acquire_Load(&version_); + } + + void Lock() { + VersionField::Lock(&version_); + } + + bool IsLocked() { + return VersionField::IsLocked(version_); + } + + void Unlock() { + VersionField::Unlock(&version_); + } + + void SetSplitting() { + VersionField::SetSplitting(&version_); 
+ } + + void SetInserting() { + VersionField::SetInserting(&version_); + } + + // Return the parent node for this node, with the lock acquired. + InternalNode *GetLockedParent() { + while (true) { + InternalNode *ret = parent_; + if (ret == NULL) { + return NULL; + } + + ret->Lock(); + + if (PREDICT_FALSE(parent_ != ret)) { + // My parent changed after accomplishing the lock + ret->Unlock(); + continue; + } + + return ret; + } + } + + protected: + friend class CBTree; + + NodeBase() : version_(0), parent_(NULL) + {} + + public: + volatile AtomicVersion version_; + + // parent_ field is protected not by this node's lock, but by + // the parent's lock. This allows reassignment of the parent_ + // field to occur after a split without gathering locks for all + // the children. + InternalNode *parent_; + + private: + DISALLOW_COPY_AND_ASSIGN(NodeBase); +} PACKED; + + + +// Wrapper around a void pointer, which encodes the type +// of the pointed-to object in its most-significant-bit. +// The pointer may reference either an internal node or a +// leaf node. +// This assumes that the most significant bit of all valid pointers is +// 0, so that that bit can be used as storage. This is true on x86, where +// pointers are truly only 48-bit. 
+template +struct NodePtr { + enum NodeType { + INTERNAL_NODE, + LEAF_NODE + }; + + + NodePtr() : p_(NULL) {} + + NodePtr(InternalNode *p) { // NOLINT(runtime/explicit) + uintptr_t p_int = reinterpret_cast(p); + DCHECK(!(p_int & kDiscriminatorBit)) << "Pointer must not use most significant bit"; + p_ = p; + } + + NodePtr(LeafNode *p) { // NOLINT(runtime/explicit) + uintptr_t p_int = reinterpret_cast(p); + DCHECK(!(p_int & kDiscriminatorBit)) << "Pointer must not use most significant bit"; + p_ = reinterpret_cast(p_int | kDiscriminatorBit); + } + + NodeType type() { + DCHECK(p_ != NULL); + if (reinterpret_cast(p_) & kDiscriminatorBit) { + return LEAF_NODE; + } else { + return INTERNAL_NODE; + } + } + + bool is_null() { + return p_ == NULL; + } + + InternalNode *internal_node_ptr() { + DCHECK_EQ(type(), INTERNAL_NODE); + return reinterpret_cast *>(p_); + } + + LeafNode *leaf_node_ptr() { + DCHECK_EQ(type(), LEAF_NODE); + return reinterpret_cast *>( + reinterpret_cast(p_) & (~kDiscriminatorBit)); + } + + NodeBase *base_ptr() { + DCHECK(!is_null()); + return reinterpret_cast *>( + reinterpret_cast(p_) & (~kDiscriminatorBit)); + } + + void *p_; + + private: + enum { + kDiscriminatorBit = (1L << (sizeof(uintptr_t) * 8 - 1)) + }; +} PACKED; + +enum InsertStatus { + INSERT_SUCCESS, + INSERT_FULL, + INSERT_DUPLICATE +}; + +//////////////////////////////////////////////////////////// +// Internal node +//////////////////////////////////////////////////////////// + +template +class PACKED InternalNode : public NodeBase { + public: + + // Construct a new internal node, containing the given children. + // This also reassigns the parent pointer of the children. + // Because other accessors of the tree may follow the children's + // parent pointers back up to discover a new root, and the parent + // pointers are covered by their parent's lock, this requires that + // the new internal node node is constructed in LOCKED state. 
+ InternalNode(const Slice &split_key, + NodePtr lchild, + NodePtr rchild, + typename Traits::ArenaType* arena) + : num_children_(0) { + DCHECK_EQ(lchild.type(), rchild.type()) + << "Only expect to create a new internal node on account of a " + << "split: child nodes should have same type"; + + // Just assign the version, instead of using the proper ->Lock() + // since we don't need a CAS here. + VersionField::SetLockedInsertingNoBarrier(&this->version_); + + keys_[0].set(split_key, arena); + DCHECK_GT(split_key.size(), 0); + child_pointers_[0] = lchild; + child_pointers_[1] = rchild; + ReassignParent(lchild); + ReassignParent(rchild); + + num_children_ = 2; + } + + // Insert a new entry to the internal node. + // + // This is typically called after one of its child nodes has split. + InsertStatus Insert(const Slice &key, NodePtr right_child, + typename Traits::ArenaType* arena) { + DCHECK(this->IsLocked()); + CHECK_GT(key.size(), 0); + + bool exact; + size_t idx = Find(key, &exact); + CHECK(!exact) + << "Trying to insert duplicate key " << key.ToDebugString() + << " into an internal node! Internal node keys should result " + << " from splits and therefore be unique."; + + if (PREDICT_FALSE(num_children_ == kFanout)) { + return INSERT_FULL; + } + + // About to modify this node - flag it so that concurrent + // readers will retry. + this->SetInserting(); + + // Insert the key and child pointer in the right spot in the list + int new_num_children = num_children_ + 1; + InsertInSliceArray(keys_, new_num_children, key, idx, arena); + for (int i = new_num_children - 1; i > idx + 1; i--) { + child_pointers_[i] = child_pointers_[i - 1]; + } + child_pointers_[idx + 1] = right_child; + + base::subtle::Release_Store(reinterpret_cast( + &num_children_), new_num_children); + + ReassignParent(right_child); + + return INSERT_SUCCESS; + } + + // Return the node index responsible for the given key. 
+ // For example, if the key is less than the first discriminating + // node, returns 0. If it is between 0 and 1, returns 1, etc. + size_t Find(const Slice &key, bool *exact) { + return FindInSliceArray(keys_, key_count(), key, exact); + } + + // Find the child whose subtree may contain the given key. + // Note that this result may be an invalid or incorrect pointer if the + // caller has not locked the node, in which case OCC should be + // used to verify it after its usage. + NodePtr FindChild(const Slice &key) { + bool exact; + size_t idx = Find(key, &exact); + if (exact) { + idx++; + } + return child_pointers_[idx]; + } + + Slice GetKey(size_t idx) const { + DCHECK_LT(idx, key_count()); + return keys_[idx].as_slice(); + } + + // Truncates the node, removing entries from the right to reduce + // to the new size. Also compacts the underlying storage so that all + // free space is contiguous, allowing for new inserts. + void Truncate(size_t new_num_keys) { + DCHECK(this->IsLocked()); + DCHECK(VersionField::IsSplitting(this->version_)); + DCHECK_GT(new_num_keys, 0); + + DCHECK_LT(new_num_keys, key_count()); + num_children_ = new_num_keys + 1; + + #ifndef NDEBUG + // This loop isn't necessary for correct operation, but nulling the pointers + // might help us catch bugs in debug mode. + for (int i = 0; i < num_children_; i++) { + DCHECK(!child_pointers_[i].is_null()); + } + for (int i = num_children_; i < kFanout; i++) { + // reset to NULL + child_pointers_[i] = NodePtr(); + } + #endif + } + + string ToString() const { + string ret("["); + for (int i = 0; i < num_children_; i++) { + if (i > 0) { + ret.append(", "); + } + Slice k = keys_[i].as_slice(); + ret.append(k.ToDebugString()); + } + ret.append("]"); + return ret; + } + + private: + friend class CBTree; + + void ReassignParent(NodePtr child) { + child.base_ptr()->parent_ = this; + } + + int key_count() const { + // The node uses N keys to separate N+1 child pointers. 
+ DCHECK_GT(num_children_, 0); + return num_children_ - 1; + } + + typedef InlineSlice KeyInlineSlice; + + enum SpaceConstants { + constant_overhead = sizeof(NodeBase) // base class + + sizeof(uint32_t), // num_children_ + keyptr_space = Traits::internal_node_size - constant_overhead, + kFanout = keyptr_space / (sizeof(KeyInlineSlice) + sizeof(NodePtr)) + }; + + // This ordering of members ensures KeyInlineSlices are properly aligned + // for atomic ops + KeyInlineSlice keys_[kFanout]; + NodePtr child_pointers_[kFanout]; + uint32_t num_children_; +} PACKED; + +//////////////////////////////////////////////////////////// +// Leaf node +//////////////////////////////////////////////////////////// + +template +class LeafNode : public NodeBase { + public: + // Construct a new leaf node. + // If initially_locked is true, then the new node is created + // with LOCKED and INSERTING set. + explicit LeafNode(bool initially_locked) + : next_(NULL), + num_entries_(0) { + if (initially_locked) { + // Just assign the version, instead of using the proper ->Lock() + // since we don't need a CAS here. + VersionField::SetLockedInsertingNoBarrier(&this->version_); + } + } + + int num_entries() const { return num_entries_; } + + void PrepareMutation(PreparedMutation *ret) { + DCHECK(this->IsLocked()); + ret->leaf_ = this; + ret->idx_ = Find(ret->key(), &ret->exists_); + } + + // Insert a new entry into this leaf node. + InsertStatus Insert(PreparedMutation *mut, const Slice &val) { + DCHECK_EQ(this, mut->leaf()); + DCHECK(this->IsLocked()); + + if (PREDICT_FALSE(mut->exists())) { + return INSERT_DUPLICATE; + } + + return InsertNew(mut->idx(), mut->key(), val, mut->arena()); + } + + // Insert an entry at the given index, which is guaranteed to be + // new. 
+ InsertStatus InsertNew(size_t idx, const Slice &key, const Slice &val, + typename Traits::ArenaType* arena) { + if (PREDICT_FALSE(num_entries_ == kMaxEntries)) { + // Full due to metadata + return INSERT_FULL; + } + + DCHECK_LT(idx, kMaxEntries); + + this->SetInserting(); + + // The following inserts should always succeed because we + // verified that there is space available above. + num_entries_++; + InsertInSliceArray(keys_, num_entries_, key, idx, arena); + DebugRacyPoint(); + InsertInSliceArray(vals_, num_entries_, val, idx, arena); + + return INSERT_SUCCESS; + } + + // Find the index of the first key which is >= the given + // search key. + // If the comparison is equal, then sets *exact to true. + // If no keys in the leaf are >= the given search key, + // then returns the size of the leaf node. + // + // Note that, if the lock is not held, this may return + // bogus results, in which case OCC must be used to verify. + size_t Find(const Slice &key, bool *exact) const { + return FindInSliceArray(keys_, num_entries_, key, exact); + } + + // Get the slice corresponding to the nth key. + // + // If the caller does not hold the lock, then this Slice + // may point to arbitrary data, and the result should be only + // trusted when verified by checking for conflicts. + Slice GetKey(size_t idx) const { + return keys_[idx].as_slice(); + } + + // Get the slice corresponding to the nth key and value. + // + // If the caller does not hold the lock, then this Slice + // may point to arbitrary data, and the result should be only + // trusted when verified by checking for conflicts. + // + // NOTE: the value slice may include an *invalid pointer*, not + // just invalid data, so any readers should check for conflicts + // before accessing the value slice. + // The key, on the other hand, will always be a valid pointer, but + // may be invalid data. 
+ void Get(size_t idx, Slice *k, ValueSlice *v) const { + *k = keys_[idx].as_slice(); + *v = vals_[idx]; + } + + // Truncates the node, removing entries from the right to reduce + // to the new size. + // Caller must hold the node's lock with the SPLITTING flag set. + void Truncate(size_t new_num_entries) { + DCHECK(this->IsLocked()); + DCHECK(VersionField::IsSplitting(this->version_)); + + DCHECK_LT(new_num_entries, num_entries_); + num_entries_ = new_num_entries; + } + + string ToString() const { + string ret; + for (int i = 0; i < num_entries_; i++) { + if (i > 0) { + ret.append(", "); + } + Slice k = keys_[i].as_slice(); + Slice v = vals_[i].as_slice(); + ret.append("["); + ret.append(k.ToDebugString()); + ret.append("="); + ret.append(v.ToDebugString()); + ret.append("]"); + } + return ret; + } + + private: + friend class CBTree; + friend class InternalNode; + friend class CBTreeIterator; + + typedef InlineSlice KeyInlineSlice; + + // It is necessary to name this enum so that DCHECKs can use its + // constants (the macros may attempt to specialize templates + // with the constants, which require a named type). + enum SpaceConstants { + constant_overhead = sizeof(NodeBase) // base class + + sizeof(LeafNode*) // next_ + + sizeof(uint8_t), // num_entries_ + kv_space = Traits::leaf_node_size - constant_overhead, + kMaxEntries = kv_space / (sizeof(KeyInlineSlice) + sizeof(ValueSlice)) + }; + + // This ordering of members keeps KeyInlineSlices so pointers are aligned + LeafNode* next_; + KeyInlineSlice keys_[kMaxEntries]; + ValueSlice vals_[kMaxEntries]; + uint8_t num_entries_; +} PACKED; + + +//////////////////////////////////////////////////////////// +// Tree API +//////////////////////////////////////////////////////////// + +// A "scoped" object which holds a lock on a leaf node. +// Instances should be prepared with CBTree::PrepareMutation() +// and then used with a further Insert() call. 
+template +class PreparedMutation { + public: + // Construct a PreparedMutation. + // + // The data referred to by the 'key' Slice passed in themust remain + // valid for the lifetime of the PreparedMutation object. + explicit PreparedMutation(Slice key) + : key_(std::move(key)), tree_(NULL), leaf_(NULL), needs_unlock_(false) {} + + ~PreparedMutation() { + UnPrepare(); + } + + void Reset(const Slice& key) { + UnPrepare(); + key_ = key; + } + + // Prepare a mutation against the given tree. + // + // This prepared mutation may then be used with Insert(). + // In between preparing and executing the insert, the leaf node remains + // locked, so callers should endeavour to keep the critical section short. + // + // If the returned PreparedMutation object is not used with + // Insert(), it will be automatically unlocked by its destructor. + void Prepare(CBTree *tree) { + CHECK(!prepared()); + this->tree_ = tree; + this->arena_ = tree->arena_.get(); + tree->PrepareMutation(this); + needs_unlock_ = true; + } + + bool Insert(const Slice &val) { + CHECK(prepared()); + return tree_->Insert(this, val); + } + + // Return a slice referencing the existing data in the row. + // + // This is mutable data, but the size may not be changed. + // This can be used for updating in place if the new data + // has the same size as the original data. + Slice current_mutable_value() { + CHECK(prepared()); + Slice k; + ValueSlice v; + leaf_->Get(idx_, &k, &v); + leaf_->SetInserting(); + return v.as_slice(); + } + + // Accessors + + bool prepared() const { + return tree_ != NULL; + } + + // Return the key that was prepared. + const Slice &key() const { return key_; } + + CBTree *tree() const { + return tree_; + } + + LeafNode *leaf() const { + return CHECK_NOTNULL(leaf_); + } + + // Return true if the key that was prepared already exists. 
+ bool exists() const { + return exists_; + } + + const size_t idx() const { + return idx_; + } + + typename Traits::ArenaType* arena() { + return arena_; + } + + private: + friend class CBTree; + friend class LeafNode; + friend class TestCBTree; + + DISALLOW_COPY_AND_ASSIGN(PreparedMutation); + + void mark_done() { + // set leaf_ back to NULL without unlocking it, + // since the caller will unlock it. + needs_unlock_ = false; + } + + void UnPrepare() { + if (leaf_ != NULL && needs_unlock_) { + leaf_->Unlock(); + needs_unlock_ = false; + } + tree_ = NULL; + } + + Slice key_; + CBTree *tree_; + + // The arena where inserted data may be copied if the data is too + // large to fit entirely within a tree node. + typename Traits::ArenaType* arena_; + + LeafNode *leaf_; + + size_t idx_; + bool exists_; + bool needs_unlock_; +}; + + +template +class CBTree { + public: + CBTree() + : arena_(new typename Traits::ArenaType(512*1024, 4*1024*1024)), + root_(NewLeaf(false)), + frozen_(false) { + } + + explicit CBTree(std::shared_ptr arena) + : arena_(std::move(arena)), root_(NewLeaf(false)), frozen_(false) {} + + ~CBTree() { + RecursiveDelete(root_); + } + + + // Convenience API to insert an item. + // + // Returns true if successfully inserted, false if an item with the given + // key already exists. + // + // More advanced users can use the PreparedMutation class instead. + bool Insert(const Slice &key, const Slice &val) { + PreparedMutation mutation(key); + mutation.Prepare(this); + return mutation.Insert(val); + } + + void DebugPrint() const { + AtomicVersion v; + DebugPrint(StableRoot(&v), NULL, 0); + CHECK_EQ(root_.base_ptr()->AcquireVersion(), v) + << "Concurrent modification during DebugPrint not allowed"; + } + + enum GetResult { + GET_SUCCESS, + GET_NOT_FOUND, + GET_TOO_BIG + }; + + // Get a copy of the given key, storing the result in the + // provided buffer. 
+ // Returns SUCCESS and sets *buf_len on success + // Returns NOT_FOUND if no such key is found + // Returns TOO_BIG if the key is too large to fit in the provided buffer. + // In this case, sets *buf_len to the required buffer size. + // + // TODO: this call probably won't be necessary in the final implementation + GetResult GetCopy(const Slice &key, char *buf, size_t *buf_len) const { + size_t in_buf_len = *buf_len; + + retry_from_root: + { + AtomicVersion version; + LeafNode *leaf = CHECK_NOTNULL(TraverseToLeaf(key, &version)); + + DebugRacyPoint(); + + retry_in_leaf: + { + GetResult ret; + Slice key_in_node; + ValueSlice val_in_node; + bool exact; + size_t idx = leaf->Find(key, &exact); + DebugRacyPoint(); + + if (!exact) { + ret = GET_NOT_FOUND; + } else { + leaf->Get(idx, &key_in_node, &val_in_node); + ret = GET_SUCCESS; + } + + // Got some kind of result, but may be based on racy data. + // Verify it. + AtomicVersion new_version = leaf->StableVersion(); + if (VersionField::HasSplit(version, new_version)) { + goto retry_from_root; + } else if (VersionField::IsDifferent(version, new_version)) { + version = new_version; + goto retry_in_leaf; + } + + // If we found a matching key earlier, and the read of the node + // wasn't racy, we can safely work with the ValueSlice. + if (ret == GET_SUCCESS) { + Slice val = val_in_node.as_slice(); + *buf_len = val.size(); + + if (PREDICT_FALSE(val.size() > in_buf_len)) { + ret = GET_TOO_BIG; + } else { + memcpy(buf, val.data(), val.size()); + } + } + return ret; + } + } + } + + // Returns true if the given key is contained in the tree. + // TODO: unit test + bool ContainsKey(const Slice &key) const { + bool ret; + + retry_from_root: + { + AtomicVersion version; + LeafNode *leaf = CHECK_NOTNULL(TraverseToLeaf(key, &version)); + + DebugRacyPoint(); + + retry_in_leaf: + { + leaf->Find(key, &ret); + DebugRacyPoint(); + + // Got some kind of result, but may be based on racy data. + // Verify it. 
+ AtomicVersion new_version = leaf->StableVersion(); + if (VersionField::HasSplit(version, new_version)) { + goto retry_from_root; + } else if (VersionField::IsDifferent(version, new_version)) { + version = new_version; + goto retry_in_leaf; + } + return ret; + } + } + } + + CBTreeIterator *NewIterator() const { + return new CBTreeIterator(this, frozen_); + } + + // Return the current number of elements in the tree. + // + // Note that this requires iterating through the entire tree, + // so it is not very efficient. + size_t count() const { + gscoped_ptr > iter(NewIterator()); + bool exact; + iter->SeekAtOrAfter(Slice(""), &exact); + size_t count = 0; + while (iter->IsValid()) { + count++; + iter->Next(); + } + return count; + } + + // Return true if this tree contains no elements + bool empty() const { + NodePtr root = root_; + switch (root.type()) { + case NodePtr::INTERNAL_NODE: + // If there's already an internal node, then we've inserted some data. + // Because we don't remove, this means we definitely have data. + return false; + case NodePtr::LEAF_NODE: + return root.leaf_node_ptr()->num_entries() == 0; + default: + CHECK(0) << "bad type"; + return true; + } + } + + size_t estimate_memory_usage() const { + return arena_->memory_footprint(); + } + + // Mark the tree as frozen. + // Once frozen, no further mutations may occur without triggering a CHECK + // violation. But, new iterators created after this point can scan more + // efficiently. 
+ void Freeze() { + frozen_ = true; + } + + private: + friend class PreparedMutation; + friend class CBTreeIterator; + + DISALLOW_COPY_AND_ASSIGN(CBTree); + + NodePtr StableRoot(AtomicVersion *stable_version) const { + while (true) { + NodePtr node = root_; + NodeBase *node_base = node.base_ptr(); + *stable_version = node_base->StableVersion(); + + if (PREDICT_TRUE(node_base->parent_ == NULL)) { + // Found a good root + return node; + } else { + // root has been swapped out + root_ = node_base->parent_; + } + } + } + + LeafNode *TraverseToLeaf(const Slice &key, + AtomicVersion *stable_version) const { + retry_from_root: + AtomicVersion version = 0; + NodePtr node = StableRoot(&version); + NodeBase *node_base = node.base_ptr(); + + while (node.type() != NodePtr::LEAF_NODE) { +#ifdef TRAVERSE_PREFETCH + PrefetchMemory(node.internal_node_ptr()); +#endif + retry_in_node: + int num_children = node.internal_node_ptr()->num_children_; + NodePtr child = node.internal_node_ptr()->FindChild(key); + NodeBase *child_base = NULL; + + AtomicVersion child_version = -1; + + if (PREDICT_TRUE(!child.is_null())) { + child_base = child.base_ptr(); + child_version = child_base->StableVersion(); + } + AtomicVersion new_node_version = node_base->AcquireVersion(); + + if (VersionField::IsDifferent(version, new_node_version)) { + new_node_version = node_base->StableVersion(); + + if (VersionField::HasSplit(version, new_node_version)) { + goto retry_from_root; + } else { + version = new_node_version; + goto retry_in_node; + } + } + int new_children = node.internal_node_ptr()->num_children_; + DCHECK(!child.is_null()) + << "should have changed versions when child was NULL: " + << "old version: " << VersionField::Stringify(version) + << " new version: " << VersionField::Stringify(new_node_version) + << " version now: " << VersionField::Stringify(node_base->AcquireVersion()) + << " num_children: " << num_children << " -> " << new_children; + + node = child; + node_base = child_base; + version 
= child_version; + } +#ifdef TRAVERSE_PREFETCH + PrefetchMemory(node.leaf_node_ptr()); +#endif + *stable_version = version; + return node.leaf_node_ptr(); + } + + void DebugRacyPoint() const { + btree::DebugRacyPoint(); + } + + // Dump the tree. + // Requires that there are no concurrent modifications/ + void DebugPrint(NodePtr node, + InternalNode *expected_parent, + int indent) const { + + std::string buf; + switch (node.type()) { + case NodePtr::LEAF_NODE: + { + LeafNode *leaf = node.leaf_node_ptr(); + SStringPrintf(&buf, "%*sLEAF %p: ", indent, "", leaf); + buf.append(leaf->ToString()); + LOG(INFO) << buf; + CHECK_EQ(leaf->parent_, expected_parent) << "failed for " << leaf; + break; + } + case NodePtr::INTERNAL_NODE: + { + InternalNode *inode = node.internal_node_ptr(); + + SStringPrintf(&buf, "%*sINTERNAL %p: ", indent, "", inode); + LOG(INFO) << buf; + + for (int i = 0; i < inode->num_children_; i++) { + DebugPrint(inode->child_pointers_[i], inode, indent + 4); + if (i < inode->key_count()) { + SStringPrintf(&buf, "%*sKEY ", indent + 2, ""); + buf.append(inode->GetKey(i).ToDebugString()); + LOG(INFO) << buf; + } + } + CHECK_EQ(inode->parent_, expected_parent) << "failed for " << inode; + break; + } + default: + CHECK(0) << "bad node type"; + } + } + + void RecursiveDelete(NodePtr node) { + switch (node.type()) { + case NodePtr::LEAF_NODE: + FreeLeaf(node.leaf_node_ptr()); + break; + case NodePtr::INTERNAL_NODE: + { + InternalNode *inode = node.internal_node_ptr(); + for (int i = 0; i < inode->num_children_; i++) { + RecursiveDelete(inode->child_pointers_[i]); + inode->child_pointers_[i] = NodePtr(); + } + FreeInternalNode(inode); + break; + } + default: + CHECK(0); + } + } + + void PrepareMutation(PreparedMutation *mutation) { + DCHECK_EQ(mutation->tree(), this); + while (true) { + AtomicVersion stable_version; + LeafNode *lnode = TraverseToLeaf(mutation->key(), &stable_version); + + lnode->Lock(); + if (VersionField::HasSplit(lnode->AcquireVersion(), 
stable_version)) { + // Retry traversal due to a split + lnode->Unlock(); + continue; + } + + lnode->PrepareMutation(mutation); + return; + } + } + + // Inserts the given key/value into the prepared leaf node. + // If the leaf node is already full, handles splitting it and + // propagating splits up the tree. + // + // Precondition: + // 'node' is locked + // Postcondition: + // 'node' is unlocked + bool Insert(PreparedMutation *mutation, + const Slice &val) { + CHECK(!frozen_); + CHECK_NOTNULL(mutation); + DCHECK_EQ(mutation->tree(), this); + + LeafNode *node = mutation->leaf(); + DCHECK(node->IsLocked()); + + // After this function, the prepared mutation cannot be used + // again. + mutation->mark_done(); + + switch (node->Insert(mutation, val)) { + case INSERT_SUCCESS: + node->Unlock(); + return true; + case INSERT_DUPLICATE: + node->Unlock(); + return false; + case INSERT_FULL: + return SplitLeafAndInsertUp(mutation, val); + // SplitLeafAndInsertUp takes care of unlocking + default: + CHECK(0) << "Unexpected result"; + break; + } + CHECK(0) << "should not get here"; + return false; + } + + // Splits the node 'node', returning the newly created right-sibling + // internal node 'new_inode'. + // + // Locking conditions: + // Precondition: + // node is locked + // Postcondition: + // node is still locked and marked SPLITTING + // new_inode is locked and marked INSERTING + InternalNode *SplitInternalNode(InternalNode *node, + faststring *separator_key) { + DCHECK(node->IsLocked()); + //VLOG(2) << "splitting internal node " << node->GetKey(0).ToString(); + + // TODO: simplified implementation doesn't deal with splitting + // when there are very small internal nodes. + CHECK_GT(node->key_count(), 2) + << "TODO: currently only support splitting nodes with >2 keys"; + + // TODO: can we share code better between the node types here? + // perhaps by making this part of NodeBase, wrapping the K,V slice pair + // in a struct type, etc? + + // Pick the split point. 
The split point is the key which + // will be moved up into the parent node. + int split_point = node->key_count() / 2; + Slice sep_slice = node->GetKey(split_point); + DCHECK_GT(sep_slice.size(), 0) << + "got bad split key when splitting: " << node->ToString(); + + separator_key->assign_copy(sep_slice.data(), sep_slice.size()); + + // Example split: + // [ 0, 1, 2 ] + // / | | \ . + // [A] [B] [C] [D] + // + // split_point = 3/2 = 1 + // separator_key = 1 + // + // =====> + // + // [ 1 ] + // / | + // [ 0 ] [ 2 ] + // / | | \ . + // [A] [B] [C] [D] + // + + NodePtr separator_ptr; + + InternalNode *new_inode = NewInternalNode( + node->GetKey(split_point + 1), + node->child_pointers_[split_point + 1], + node->child_pointers_[split_point + 2]); + + // The new inode is constructed in locked and INSERTING state. + + // Copy entries to the new right-hand node. + for (int i = split_point + 2; i < node->key_count(); i++) { + Slice k = node->GetKey(i); + DCHECK_GT(k.size(), 0); + NodePtr child = node->child_pointers_[i + 1]; + DCHECK(!child.is_null()); + + // TODO: this could be done more efficiently since we know that + // these inserts are coming in sorted order. + CHECK_EQ(INSERT_SUCCESS, new_inode->Insert(k, child, arena_.get())); + } + + // Up to this point, we haven't modified the left node, so concurrent + // reads were consistent. But, now we're about to actually mutate, + // so set the flag. + node->SetSplitting(); + + // Truncate the left node to remove the keys which have been + // moved to the right node + node->Truncate(split_point); + return new_inode; + } + + // Split the given leaf node 'node', creating a new node + // with the higher half of the elements. + // + // N.B: the new node is initially locked, but doesn't have the + // SPLITTING flag. This function sets the SPLITTING flag before + // modifying it. 
+ void SplitLeafNode(LeafNode *node, + LeafNode **new_node) { + DCHECK(node->IsLocked()); + +#ifdef DEBUG_DUMP_SPLIT_STATS + do { + size_t key_size = 0, val_size = 0; + for (size_t i = 0; i < node->num_entries(); i++) { + Slice k, v; + node->Get(i, &k, &v); + key_size += k.size(); + val_size += v.size(); + } + LOG(INFO) << "split leaf. entries=" << node->num_entries() + << " keysize=" << key_size + << " valsize=" << val_size; + } while (0); +#endif + + LeafNode *new_leaf = NewLeaf(true); + new_leaf->next_ = node->next_; + + // Copy half the keys from node into the new leaf + int copy_start = node->num_entries() / 2; + CHECK_GT(copy_start, 0) << + "Trying to split a node with 0 or 1 entries"; + + std::copy(node->keys_ + copy_start, node->keys_ + node->num_entries(), + new_leaf->keys_); + std::copy(node->vals_ + copy_start, node->vals_ + node->num_entries(), + new_leaf->vals_); + new_leaf->num_entries_ = node->num_entries() - copy_start; + + // Truncate the left node to remove the keys which have been + // moved to the right node. + node->SetSplitting(); + node->next_ = new_leaf; + node->Truncate(copy_start); + *new_node = new_leaf; + } + + + // Splits a leaf node which is full, adding the new sibling + // node to the tree. + // This recurses upward splitting internal nodes as necessary. + // The node should be locked on entrance to the function + // and will be unlocked upon exit. + bool SplitLeafAndInsertUp(PreparedMutation *mutation, + const Slice &val) { + LeafNode *node = mutation->leaf(); + Slice key = mutation->key_; + + // Leaf node should already be locked at this point + DCHECK(node->IsLocked()); + + //DebugPrint(); + + LeafNode *new_leaf; + SplitLeafNode(node, &new_leaf); + + // The new leaf node is returned still locked. + DCHECK(new_leaf->IsLocked()); + + // Insert the key that we were originally trying to insert in the + // correct side post-split. + Slice split_key = new_leaf->GetKey(0); + LeafNode *dst_leaf = (key.compare(split_key) < 0) ? 
node : new_leaf; + // Re-prepare the mutation after the split. + dst_leaf->PrepareMutation(mutation); + + CHECK_EQ(INSERT_SUCCESS, dst_leaf->Insert(mutation, val)) + << "node split at " << split_key.ToDebugString() + << " did not result in enough space for key " << key.ToDebugString() + << " in left node"; + + // Insert the new node into the parents. + PropagateSplitUpward(node, new_leaf, split_key); + + // NB: No ned to unlock nodes here, since it is done by the upward + // propagation path ('ascend' label in Figure 5 in the masstree paper) + + return true; + } + + // Assign the parent pointer of 'right', and insert it into the tree + // by propagating splits upward. + // Locking: + // Precondition: + // left and right are both locked + // left is marked SPLITTING + // Postcondition: + // parent is non-null + // parent is marked INSERTING + // left and right are unlocked + void PropagateSplitUpward(NodePtr left_ptr, NodePtr right_ptr, + const Slice &split_key) { + NodeBase *left = left_ptr.base_ptr(); + NodeBase *right = right_ptr.base_ptr(); + + DCHECK(left->IsLocked()); + DCHECK(right->IsLocked()); + + InternalNode *parent = left->GetLockedParent(); + if (parent == NULL) { + // Node is the root - make new parent node + parent = NewInternalNode(split_key, left_ptr, right_ptr); + // Constructor also reassigns parents. + // root_ will be updated lazily by next traverser + left->Unlock(); + right->Unlock(); + parent->Unlock(); + return; + } + + // Parent exists. 
Try to insert + switch (parent->Insert(split_key, right_ptr, arena_.get())) { + case INSERT_SUCCESS: + { + VLOG(3) << "Inserted new entry into internal node " + << parent << " for " << split_key.ToDebugString(); + left->Unlock(); + right->Unlock(); + parent->Unlock(); + return; + } + case INSERT_FULL: + { + // Split the node in two + faststring sep_key(0); + InternalNode *new_inode = SplitInternalNode(parent, &sep_key); + + DCHECK(new_inode->IsLocked()); + DCHECK(parent->IsLocked()) << "original should still be locked"; + + // Insert the new entry into the appropriate half. + Slice inode_split(sep_key); + InternalNode *dst_inode = + (split_key.compare(inode_split) < 0) ? parent : new_inode; + + VLOG(2) << "Split internal node " << parent << " for insert of " + << split_key.ToDebugString() << "[" << right << "]" + << " (split at " << inode_split.ToDebugString() << ")"; + + CHECK_EQ(INSERT_SUCCESS, dst_inode->Insert(split_key, right_ptr, arena_.get())); + + left->Unlock(); + right->Unlock(); + PropagateSplitUpward(parent, new_inode, inode_split); + break; + } + default: + CHECK(0); + } + } + + LeafNode *NewLeaf(bool locked) { + void *mem = CHECK_NOTNULL(arena_->AllocateBytesAligned(sizeof(LeafNode), + sizeof(AtomicVersion))); + return new (mem) LeafNode(locked); + } + + InternalNode *NewInternalNode(const Slice &split_key, + NodePtr lchild, + NodePtr rchild) { + void *mem = CHECK_NOTNULL(arena_->AllocateBytesAligned(sizeof(InternalNode), + sizeof(AtomicVersion))); + return new (mem) InternalNode(split_key, lchild, rchild, arena_.get()); + } + + void FreeLeaf(LeafNode *leaf) { + leaf->~LeafNode(); + // No need to actually free, since it came from the arena + } + + void FreeInternalNode(InternalNode *node) { + node->~InternalNode(); + // No need to actually free, since it came from the arena + } + + std::shared_ptr arena_; + + // marked 'mutable' because readers will lazy-update the root + // when they encounter a stale root pointer. 
+ mutable NodePtr root_; + + // If true, the tree is no longer mutable. Once a tree becomes + // frozen, it may not be un-frozen. If an iterator is created on + // a frozen tree, it will be more efficient. + bool frozen_; +}; + +template +class CBTreeIterator { + public: + bool SeekToStart() { + bool exact; + return SeekAtOrAfter(Slice(""), &exact); + } + + bool SeekAtOrAfter(const Slice &key, bool *exact) { + SeekToLeaf(key); + SeekInLeaf(key, exact); + return IsValid(); + } + + bool IsValid() const { + return seeked_; + } + + bool Next() { + DCHECK(seeked_); + idx_in_leaf_++; + if (idx_in_leaf_ < leaf_to_scan_->num_entries()) { + return true; + } else { + return SeekNextLeaf(); + } + } + + void GetCurrentEntry(Slice *key, Slice *val) const { + DCHECK(seeked_); + ValueSlice val_slice; + leaf_to_scan_->Get(idx_in_leaf_, key, &val_slice); + *val = val_slice.as_slice(); + } + + Slice GetCurrentKey() const { + DCHECK(seeked_); + return leaf_to_scan_->GetKey(idx_in_leaf_); + } + + //////////////////////////////////////////////////////////// + // Advanced functions which expose some of the internal state + // of the iterator, allowing for limited "rewind" capability + // within a given leaf. + // + // Single leaf nodes are the unit of "snapshotting" of this iterator. + // Hence, within a leaf node, the caller may rewind arbitrarily, but once + // moving to the next leaf node, there is no way to go back to the prior + // leaf node without losing consistency. + //////////////////////////////////////////////////////////// + + // Return the number of entries, including the current one, remaining + // in the leaf. + // For example, if the leaf has three entries [A, B, C], and GetCurrentEntry + // would return 'A', then this will return 3. + size_t remaining_in_leaf() const { + DCHECK(seeked_); + return leaf_to_scan_->num_entries() - idx_in_leaf_; + } + + // Return the index of the iterator inside the current leaf node. 
+ size_t index_in_leaf() const { + return idx_in_leaf_; + } + + // Rewind the iterator to the given index in the current leaf node, + // which was probably saved off from a previous call to + // remaining_in_leaf(). + // + // If Next() was called more times than remaining_in_leaf(), then + // this call will not be successful. + void RewindToIndexInLeaf(size_t new_index_in_leaf) { + DCHECK(seeked_); + DCHECK_LT(new_index_in_leaf, leaf_to_scan_->num_entries()); + idx_in_leaf_ = new_index_in_leaf; + } + + // Get the key at a specific leaf node + Slice GetKeyInLeaf(size_t idx) const { + DCHECK(seeked_); + return leaf_to_scan_->GetKey(idx); + } + + // Get the given indexed entry in the current leaf node. + void GetEntryInLeaf(size_t idx, Slice *key, Slice *val) { + DCHECK(seeked_); + DCHECK_LT(idx, leaf_to_scan_->num_entries()); + leaf_to_scan_->Get(idx, key, val); + } + + private: + friend class CBTree; + + CBTreeIterator(const CBTree *tree, + bool tree_frozen) : + tree_(tree), + tree_frozen_(tree_frozen), + seeked_(false), + idx_in_leaf_(-1), + leaf_copy_(false), + leaf_to_scan_(&leaf_copy_) + {} + + bool SeekInLeaf(const Slice &key, bool *exact) { + DCHECK(seeked_); + idx_in_leaf_ = leaf_to_scan_->Find(key, exact); + if (idx_in_leaf_ == leaf_to_scan_->num_entries()) { + // not found in leaf, seek to start of next leaf if it exists. + return SeekNextLeaf(); + } + return true; + } + + + void SeekToLeaf(const Slice &key) { + retry_from_root: + { + AtomicVersion version; + LeafNode *leaf = tree_->TraverseToLeaf(key, &version); +#ifdef SCAN_PREFETCH + PrefetchMemory(leaf->next_); +#endif + + // If the tree is frozen, we don't need to follow optimistic concurrency. 
+ if (tree_frozen_) { + leaf_to_scan_ = leaf; + seeked_ = true; + return; + } + + retry_in_leaf: + { + memcpy(&leaf_copy_, leaf, sizeof(leaf_copy_)); + + AtomicVersion new_version = leaf->StableVersion(); + if (VersionField::HasSplit(version, new_version)) { + goto retry_from_root; + } else if (VersionField::IsDifferent(version, new_version)) { + version = new_version; + goto retry_in_leaf; + } + // Got a consistent snapshot copy of the leaf node into + // leaf_copy_ + leaf_to_scan_ = &leaf_copy_; + } + } + seeked_ = true; + } + + bool SeekNextLeaf() { + DCHECK(seeked_); + LeafNode *next = leaf_to_scan_->next_; + if (PREDICT_FALSE(next == NULL)) { + seeked_ = false; + return false; + } +#ifdef SCAN_PREFETCH + PrefetchMemory(next->next_); +#endif + + // If the tree is frozen, we don't need to play optimistic concurrency + // games or make a defensive copy. + if (tree_frozen_) { + leaf_to_scan_ = next; + idx_in_leaf_ = 0; + return true; + } + + while (true) { + AtomicVersion version = next->StableVersion(); + memcpy(&leaf_copy_, next, sizeof(leaf_copy_)); + AtomicVersion new_version = next->StableVersion(); + if (VersionField::IsDifferent(new_version, version)) { + version = new_version; + } else { + idx_in_leaf_ = 0; + leaf_to_scan_ = &leaf_copy_; + return true; + } + } + } + + const CBTree *tree_; + + // If true, the tree we are scanning is completely frozen and we don't + // need to perform optimistic concurrency control or copies for safety. + bool tree_frozen_; + + bool seeked_; + size_t idx_in_leaf_; + + + LeafNode leaf_copy_; + LeafNode *leaf_to_scan_; +}; + +} // namespace btree +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/delta_applier.cc b/src/kudu/tablet/delta_applier.cc new file mode 100644 index 000000000000..7439b101ce91 --- /dev/null +++ b/src/kudu/tablet/delta_applier.cc @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/delta_applier.h" + +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/tablet/delta_store.h" +#include "kudu/util/status.h" + +using std::shared_ptr; +using std::string; + +namespace kudu { +namespace tablet { + + // Construct. The base_iter and delta_iter should not be Initted. 
+DeltaApplier::DeltaApplier(shared_ptr base_iter, + shared_ptr delta_iter) + : base_iter_(std::move(base_iter)), + delta_iter_(std::move(delta_iter)), + first_prepare_(true) {} + +DeltaApplier::~DeltaApplier() { +} + +Status DeltaApplier::Init(ScanSpec *spec) { + RETURN_NOT_OK(base_iter_->Init(spec)); + RETURN_NOT_OK(delta_iter_->Init(spec)); + return Status::OK(); +} + + +string DeltaApplier::ToString() const { + string s; + s.append("DeltaApplier("); + s.append(base_iter_->ToString()); + s.append(" + "); + s.append(delta_iter_->ToString()); + s.append(")"); + return s; +} + +const Schema &DeltaApplier::schema() const { + return base_iter_->schema(); +} + +void DeltaApplier::GetIteratorStats(std::vector* stats) const { + return base_iter_->GetIteratorStats(stats); +} + +bool DeltaApplier::HasNext() const { + return base_iter_->HasNext(); +} + +Status DeltaApplier::PrepareBatch(size_t *nrows) { + // The initial seek is deferred from Init() into the first PrepareBatch() + // because it requires a loaded delta file, and we don't want to require + // that at Init() time. + if (first_prepare_) { + RETURN_NOT_OK(delta_iter_->SeekToOrdinal(base_iter_->cur_ordinal_idx())); + first_prepare_ = false; + } + RETURN_NOT_OK(base_iter_->PrepareBatch(nrows)); + RETURN_NOT_OK(delta_iter_->PrepareBatch(*nrows, DeltaIterator::PREPARE_FOR_APPLY)); + return Status::OK(); +} + +Status DeltaApplier::FinishBatch() { + return base_iter_->FinishBatch(); +} + +Status DeltaApplier::InitializeSelectionVector(SelectionVector *sel_vec) { + DCHECK(!first_prepare_) << "PrepareBatch() must be called at least once"; + RETURN_NOT_OK(base_iter_->InitializeSelectionVector(sel_vec)); + return delta_iter_->ApplyDeletes(sel_vec); +} + +Status DeltaApplier::MaterializeColumn(size_t col_idx, ColumnBlock *dst) { + DCHECK(!first_prepare_) << "PrepareBatch() must be called at least once"; + + // Copy the base data. 
+ RETURN_NOT_OK(base_iter_->MaterializeColumn(col_idx, dst)); + + // Apply all the updates for this column. + RETURN_NOT_OK(delta_iter_->ApplyUpdates(col_idx, dst)); + return Status::OK(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_applier.h b/src/kudu/tablet/delta_applier.h new file mode 100644 index 000000000000..5877978235df --- /dev/null +++ b/src/kudu/tablet/delta_applier.h @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTA_APPLIER_H +#define KUDU_TABLET_DELTA_APPLIER_H + +#include +#include +#include + +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" +#include "kudu/tablet/cfile_set.h" + +namespace kudu { +namespace tablet { + +class DeltaIterator; + +//////////////////////////////////////////////////////////// +// Delta-applying iterators +//////////////////////////////////////////////////////////// + +// A DeltaApplier takes in a base ColumnwiseIterator along with a a +// DeltaIterator. It is responsible for applying the updates coming +// from the delta iterator to the results of the base iterator. 
+class DeltaApplier : public ColumnwiseIterator { + public: + virtual Status Init(ScanSpec *spec) OVERRIDE; + Status PrepareBatch(size_t *nrows) OVERRIDE; + + Status FinishBatch() OVERRIDE; + + bool HasNext() const OVERRIDE; + + std::string ToString() const OVERRIDE; + + const Schema &schema() const OVERRIDE; + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE; + + // Initialize the selection vector for the current batch. + // This processes DELETEs -- any deleted rows are set to 0 in 'sel_vec'. + // All other rows are set to 1. + virtual Status InitializeSelectionVector(SelectionVector *sel_vec) OVERRIDE; + + Status MaterializeColumn(size_t col_idx, ColumnBlock *dst) OVERRIDE; + private: + friend class DeltaTracker; + + FRIEND_TEST(TestMajorDeltaCompaction, TestCompact); + + DISALLOW_COPY_AND_ASSIGN(DeltaApplier); + + // Construct. The base_iter and delta_iter should not be Initted. + DeltaApplier(std::shared_ptr base_iter, + std::shared_ptr delta_iter); + virtual ~DeltaApplier(); + + std::shared_ptr base_iter_; + std::shared_ptr delta_iter_; + + bool first_prepare_; +}; + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_DELTA_APPLIER_H */ diff --git a/src/kudu/tablet/delta_compaction-test.cc b/src/kudu/tablet/delta_compaction-test.cc new file mode 100644 index 000000000000..6d67d3e66c92 --- /dev/null +++ b/src/kudu/tablet/delta_compaction-test.cc @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/delta_compaction.h" +#include "kudu/tablet/delta_iterator_merger.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/algorithm.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/path_util.h" +#include "kudu/util/status.h" +#include "kudu/util/auto_release_pool.h" + +DEFINE_int32(num_rows, 2100, "the first row to update"); +DEFINE_int32(num_delta_files, 3, "number of delta files"); + +using std::is_sorted; +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { +namespace tablet { + +using fs::ReadableBlock; +using fs::WritableBlock; + +class TestDeltaCompaction : public KuduTest { + public: + TestDeltaCompaction() + : deltafile_idx_(0), + schema_(CreateSchema()) { + } + + static Schema CreateSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddColumn("val", UINT32)); + return builder.Build(); + } + + Status GetDeltaFileWriter(gscoped_ptr* dfw, + BlockId* block_id) const { + gscoped_ptr block; + RETURN_NOT_OK(fs_manager_->CreateNewBlock(&block)); + *block_id = block->id(); + dfw->reset(new DeltaFileWriter(block.Pass())); + RETURN_NOT_OK((*dfw)->Start()); + return Status::OK(); + } + + Status GetDeltaFileReader(const BlockId& block_id, + shared_ptr* dfr) const { + gscoped_ptr block; + 
RETURN_NOT_OK(fs_manager_->OpenBlock(block_id, &block)); + shared_ptr delta_reader; + return DeltaFileReader::Open(block.Pass(), block_id, dfr, REDO); + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + SeedRandom(); + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + } + + protected: + int64_t deltafile_idx_; + Schema schema_; + gscoped_ptr fs_manager_; +}; + +TEST_F(TestDeltaCompaction, TestMergeMultipleSchemas) { + vector schemas; + SchemaBuilder builder(schema_); + schemas.push_back(builder.Build()); + + // Add an int column with default + uint32_t default_c2 = 10; + ASSERT_OK(builder.AddColumn("c2", UINT32, false, &default_c2, &default_c2)); + schemas.push_back(builder.Build()); + + // add a string column with default + Slice default_c3("Hello World"); + ASSERT_OK(builder.AddColumn("c3", STRING, false, &default_c3, &default_c3)); + schemas.push_back(builder.Build()); + + vector > inputs; + + faststring buf; + int row_id = 0; + int curr_timestamp = 0; + int deltafile_idx = 0; + for (const Schema& schema : schemas) { + // Write the Deltas + BlockId block_id; + gscoped_ptr dfw; + ASSERT_OK(GetDeltaFileWriter(&dfw, &block_id)); + + // Generate N updates with the new schema, some of them are on existing + // rows others are on new rows (see kNumUpdates and kNumMultipleUpdates). + // Each column will be updated with value composed by delta file id + // and update number (see update_value assignment). 
+ size_t kNumUpdates = 10; + size_t kNumMultipleUpdates = kNumUpdates / 2; + DeltaStats stats; + for (size_t i = 0; i < kNumUpdates; ++i) { + buf.clear(); + RowChangeListEncoder update(&buf); + for (size_t col_idx = schema.num_key_columns(); col_idx < schema.num_columns(); ++col_idx) { + ColumnId col_id = schema.column_id(col_idx); + DCHECK_GE(col_id, 0); + + stats.IncrUpdateCount(col_id, 1); + const ColumnSchema& col_schema = schema.column(col_idx); + int update_value = deltafile_idx * 100 + i; + switch (col_schema.type_info()->physical_type()) { + case UINT32: + { + uint32_t u32_val = update_value; + update.AddColumnUpdate(col_schema, col_id, &u32_val); + } + break; + case BINARY: + { + string s = boost::lexical_cast(update_value); + Slice str_val(s); + update.AddColumnUpdate(col_schema, col_id, &str_val); + } + break; + default: + FAIL() << "Type " << DataType_Name(col_schema.type_info()->type()) << " Not Supported"; + break; + } + } + + // To simulate multiple updates on the same row, the first N updates + // of this new schema will always be on rows [0, 1, 2, ...] while the + // others will be on new rows. (N is tunable by changing kNumMultipleUpdates) + DeltaKey key((i < kNumMultipleUpdates) ? 
i : row_id, Timestamp(curr_timestamp)); + RowChangeList row_changes = update.as_changelist(); + ASSERT_OK(dfw->AppendDelta(key, row_changes)); + ASSERT_OK(stats.UpdateStats(key.timestamp(), row_changes)); + curr_timestamp++; + row_id++; + } + + ASSERT_OK(dfw->WriteDeltaStats(stats)); + ASSERT_OK(dfw->Finish()); + shared_ptr dfr; + ASSERT_OK(GetDeltaFileReader(block_id, &dfr)); + inputs.push_back(dfr); + deltafile_idx++; + } + + // Merge + MvccSnapshot snap(MvccSnapshot::CreateSnapshotIncludingAllTransactions()); + const Schema& merge_schema = schemas.back(); + shared_ptr merge_iter; + ASSERT_OK(DeltaIteratorMerger::Create(inputs, &merge_schema, + snap, &merge_iter)); + gscoped_ptr dfw; + BlockId block_id; + ASSERT_OK(GetDeltaFileWriter(&dfw, &block_id)); + ASSERT_OK(WriteDeltaIteratorToFile(merge_iter.get(), + ITERATE_OVER_ALL_ROWS, + dfw.get())); + ASSERT_OK(dfw->Finish()); + + shared_ptr dfr; + ASSERT_OK(GetDeltaFileReader(block_id, &dfr)); + DeltaIterator* raw_iter; + ASSERT_OK(dfr->NewDeltaIterator(&merge_schema, snap, &raw_iter)); + gscoped_ptr scoped_iter(raw_iter); + + vector results; + ASSERT_OK(DebugDumpDeltaIterator(REDO, scoped_iter.get(), merge_schema, + ITERATE_OVER_ALL_ROWS, &results)); + for (const string &str : results) { + VLOG(1) << str; + } + ASSERT_TRUE(is_sorted(results.begin(), results.end())); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_compaction.cc b/src/kudu/tablet/delta_compaction.cc new file mode 100644 index 000000000000..8defcfb4fab9 --- /dev/null +++ b/src/kudu/tablet/delta_compaction.cc @@ -0,0 +1,351 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/delta_compaction.h" + +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/common/columnblock.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/tablet/compaction.h" +#include "kudu/tablet/delta_key.h" +#include "kudu/tablet/deltamemstore.h" +#include "kudu/tablet/multi_column_writer.h" +#include "kudu/tablet/mvcc.h" + +using std::shared_ptr; + +namespace kudu { + +using cfile::CFileIterator; +using cfile::CFileReader; +using cfile::IndexTreeIterator; +using fs::WritableBlock; +using std::vector; +using strings::Substitute; + +namespace tablet { + +namespace { + +const size_t kRowsPerBlock = 100; // Number of rows per block of columns + +} // anonymous namespace + +// TODO: can you major-delta-compact a new column after an alter table in order +// to materialize it? should write a test for this. 
+MajorDeltaCompaction::MajorDeltaCompaction( + FsManager* fs_manager, const Schema& base_schema, CFileSet* base_data, + shared_ptr delta_iter, + vector > included_stores, + const vector& col_ids) + : fs_manager_(fs_manager), + base_schema_(base_schema), + column_ids_(col_ids), + base_data_(base_data), + included_stores_(std::move(included_stores)), + delta_iter_(std::move(delta_iter)), + redo_delta_mutations_written_(0), + undo_delta_mutations_written_(0), + state_(kInitialized) { + CHECK(!col_ids.empty()); +} + +MajorDeltaCompaction::~MajorDeltaCompaction() { +} + +string MajorDeltaCompaction::ColumnNamesToString() const { + std::string result; + for (ColumnId col_id : column_ids_) { + int col_idx = base_schema_.find_column_by_id(col_id); + if (col_idx != Schema::kColumnNotFound) { + result += base_schema_.column_by_id(col_id).ToString() + " "; + } else { + result += Substitute("[deleted column id $0] ", col_id); + } + } + return result; +} + +Status MajorDeltaCompaction::FlushRowSetAndDeltas() { + CHECK_EQ(state_, kInitialized); + + shared_ptr old_base_data_cwise(base_data_->NewIterator(&partial_schema_)); + gscoped_ptr old_base_data_rwise(new MaterializingIterator(old_base_data_cwise)); + + ScanSpec spec; + spec.set_cache_blocks(false); + RETURN_NOT_OK_PREPEND( + old_base_data_rwise->Init(&spec), + "Unable to open iterator for specified columns (" + partial_schema_.ToString() + ")"); + + RETURN_NOT_OK(delta_iter_->Init(&spec)); + RETURN_NOT_OK(delta_iter_->SeekToOrdinal(0)); + + Arena arena(32 * 1024, 128 * 1024); + RowBlock block(partial_schema_, kRowsPerBlock, &arena); + + DVLOG(1) << "Applying deltas and rewriting columns (" << partial_schema_.ToString() << ")"; + DeltaStats redo_stats; + DeltaStats undo_stats; + uint64_t num_rows_history_truncated = 0; + size_t nrows = 0; + // We know that we're reading everything from disk so we're including all transactions. 
+ MvccSnapshot snap = MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + while (old_base_data_rwise->HasNext()) { + + // 1) Get the next batch of base data for the columns we're compacting. + arena.Reset(); + RETURN_NOT_OK(old_base_data_rwise->NextBlock(&block)); + size_t n = block.nrows(); + + // 2) Fetch all the REDO mutations. + vector redo_mutation_block(kRowsPerBlock, reinterpret_cast(NULL)); + RETURN_NOT_OK(delta_iter_->PrepareBatch(n, DeltaIterator::PREPARE_FOR_COLLECT)); + RETURN_NOT_OK(delta_iter_->CollectMutations(&redo_mutation_block, block.arena())); + + // 3) Apply new UNDO mutations for the current block. The REDO mutations are picked up + // at step 6). + vector input_rows; + input_rows.resize(block.nrows()); + for (int i = 0; i < block.nrows(); i++) { + CompactionInputRow &input_row = input_rows.at(i); + input_row.row.Reset(&block, i); + input_row.redo_head = redo_mutation_block[i]; + input_row.undo_head = nullptr; + + RowBlockRow dst_row = block.row(i); + RETURN_NOT_OK(CopyRow(input_row.row, &dst_row, reinterpret_cast(NULL))); + + Mutation* new_undos_head = nullptr; + // We're ignoring the result from new_redos_head because we'll find them later at step 5). + Mutation* new_redos_head = nullptr; + + bool is_garbage_collected; + + RETURN_NOT_OK(ApplyMutationsAndGenerateUndos(snap, + input_row, + &base_schema_, + &new_undos_head, + &new_redos_head, + &arena, + &dst_row, + &is_garbage_collected, + &num_rows_history_truncated)); + + VLOG(2) << "Output Row: " << dst_row.schema()->DebugRow(dst_row) + << " Undo Mutations: " << Mutation::StringifyMutationList(partial_schema_, new_undos_head) + << " Redo Mutations: " << Mutation::StringifyMutationList(partial_schema_, new_redos_head); + + // We only create a new undo delta file if we need to. 
+ if (new_undos_head != nullptr && !new_undo_delta_writer_) { + RETURN_NOT_OK(OpenUndoDeltaFileWriter()); + } + for (const Mutation *mut = new_undos_head; mut != nullptr; mut = mut->next()) { + DeltaKey undo_key(nrows + dst_row.row_index(), mut->timestamp()); + RETURN_NOT_OK(new_undo_delta_writer_->AppendDelta(undo_key, mut->changelist())); + undo_stats.UpdateStats(mut->timestamp(), mut->changelist()); + undo_delta_mutations_written_++; + } + } + + // 4) Write the new base data. + RETURN_NOT_OK(base_data_writer_->AppendBlock(block)); + + // 5) Remove the columns that we're compacting from the delta flush, but keep all the + // delete mutations. + arena.Reset(); + vector out; + RETURN_NOT_OK(delta_iter_->FilterColumnIdsAndCollectDeltas(column_ids_, &out, &arena)); + + // We only create a new redo delta file if we need to. + if (!out.empty() && !new_redo_delta_writer_) { + RETURN_NOT_OK(OpenRedoDeltaFileWriter()); + } + + // 6) Write the deltas we're not compacting back into a delta file. 
+ for (const DeltaKeyAndUpdate& key_and_update : out) { + RowChangeList update(key_and_update.cell); + RETURN_NOT_OK_PREPEND(new_redo_delta_writer_->AppendDelta(key_and_update.key, update), + "Failed to append a delta"); + WARN_NOT_OK(redo_stats.UpdateStats(key_and_update.key.timestamp(), update), + "Failed to update stats"); + } + redo_delta_mutations_written_ += out.size(); + nrows += n; + } + + RETURN_NOT_OK(base_data_writer_->Finish()); + + if (redo_delta_mutations_written_ > 0) { + RETURN_NOT_OK(new_redo_delta_writer_->WriteDeltaStats(redo_stats)); + RETURN_NOT_OK(new_redo_delta_writer_->Finish()); + } + + if (undo_delta_mutations_written_ > 0) { + RETURN_NOT_OK(new_undo_delta_writer_->WriteDeltaStats(undo_stats)); + RETURN_NOT_OK(new_undo_delta_writer_->Finish()); + } + + DVLOG(1) << "Applied all outstanding deltas for columns " + << partial_schema_.ToString() + << ", and flushed the resulting rowsets and a total of " + << redo_delta_mutations_written_ + << " REDO delta mutations and " + << undo_delta_mutations_written_ + << " UNDO delta mutations to disk."; + + state_ = kFinished; + return Status::OK(); +} + +Status MajorDeltaCompaction::OpenBaseDataWriter() { + CHECK(!base_data_writer_); + + gscoped_ptr w(new MultiColumnWriter(fs_manager_, &partial_schema_)); + RETURN_NOT_OK(w->Open()); + base_data_writer_.swap(w); + return Status::OK(); +} + +Status MajorDeltaCompaction::OpenRedoDeltaFileWriter() { + gscoped_ptr block; + RETURN_NOT_OK_PREPEND(fs_manager_->CreateNewBlock(&block), + "Unable to create REDO delta output block"); + new_redo_delta_block_ = block->id(); + new_redo_delta_writer_.reset(new DeltaFileWriter(block.Pass())); + return new_redo_delta_writer_->Start(); +} + +Status MajorDeltaCompaction::OpenUndoDeltaFileWriter() { + gscoped_ptr block; + RETURN_NOT_OK_PREPEND(fs_manager_->CreateNewBlock(&block), + "Unable to create UNDO delta output block"); + new_undo_delta_block_ = block->id(); + new_undo_delta_writer_.reset(new 
DeltaFileWriter(block.Pass())); + return new_undo_delta_writer_->Start(); +} + +Status MajorDeltaCompaction::Compact() { + CHECK_EQ(state_, kInitialized); + + LOG(INFO) << "Starting major delta compaction for columns " << ColumnNamesToString(); + RETURN_NOT_OK(base_schema_.CreateProjectionByIdsIgnoreMissing(column_ids_, &partial_schema_)); + + for (const shared_ptr& ds : included_stores_) { + LOG(INFO) << "Preparing to major compact delta file: " << ds->ToString(); + } + + // We defer on calling OpenNewDeltaBlock since we might not need to flush. + RETURN_NOT_OK(OpenBaseDataWriter()); + RETURN_NOT_OK(FlushRowSetAndDeltas()); + LOG(INFO) << "Finished major delta compaction of columns " << + ColumnNamesToString(); + return Status::OK(); +} + +Status MajorDeltaCompaction::CreateMetadataUpdate( + RowSetMetadataUpdate* update) { + CHECK(update); + CHECK_EQ(state_, kFinished); + + vector compacted_delta_blocks; + for (const shared_ptr& store : included_stores_) { + DeltaFileReader* dfr = down_cast(store.get()); + compacted_delta_blocks.push_back(dfr->block_id()); + } + + vector new_delta_blocks; + if (redo_delta_mutations_written_ > 0) { + new_delta_blocks.push_back(new_redo_delta_block_); + } + + update->ReplaceRedoDeltaBlocks(compacted_delta_blocks, + new_delta_blocks); + + if (undo_delta_mutations_written_ > 0) { + update->SetNewUndoBlock(new_undo_delta_block_); + } + + // Replace old column blocks with new ones + RowSetMetadata::ColumnIdToBlockIdMap new_column_blocks; + base_data_writer_->GetFlushedBlocksByColumnId(&new_column_blocks); + + // NOTE: in the case that one of the columns being compacted is deleted, + // we may have fewer elements in new_column_blocks compared to 'column_ids'. + // For those deleted columns, we just remove the old column data. 
+ CHECK_LE(new_column_blocks.size(), column_ids_.size()); + + for (ColumnId col_id : column_ids_) { + BlockId new_block; + if (FindCopy(new_column_blocks, col_id, &new_block)) { + update->ReplaceColumnId(col_id, new_block); + } else { + // The column has been deleted. + // If the base data has a block for this column, we need to remove it. + // NOTE: It's possible that the base data has no data for this column in the + // case that the column was added and removed in succession after the base + // data was flushed. + CHECK_EQ(base_schema_.find_column_by_id(col_id), Schema::kColumnNotFound) + << "major compaction removing column " << col_id << " but still present in Schema!"; + if (base_data_->has_data_for_column_id(col_id)) { + update->RemoveColumnId(col_id); + } + } + } + + return Status::OK(); +} + +// We're called under diskrowset's component_lock_ and delta_tracker's compact_flush_lock_ +// so both AtomicUpdateStores calls can be done separately and still be seen as one atomic +// operation. +Status MajorDeltaCompaction::UpdateDeltaTracker(DeltaTracker* tracker) { + CHECK_EQ(state_, kFinished); + vector new_delta_blocks; + // We created a new delta block only if we had deltas to write back. We still need to update + // the tracker so that it removes the included_stores_. + if (redo_delta_mutations_written_ > 0) { + new_delta_blocks.push_back(new_redo_delta_block_); + } + RETURN_NOT_OK(tracker->AtomicUpdateStores(included_stores_, + new_delta_blocks, + REDO)); + + // We only call AtomicUpdateStores() if we wrote UNDOs, we're not removing stores so we don't + // need to call it otherwise. 
+ if (undo_delta_mutations_written_ > 0) { + vector new_undo_blocks; + new_undo_blocks.push_back(new_undo_delta_block_); + return tracker->AtomicUpdateStores(SharedDeltaStoreVector(), + new_undo_blocks, + UNDO); + } else { + return Status::OK(); + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_compaction.h b/src/kudu/tablet/delta_compaction.h new file mode 100644 index 000000000000..d8571f14d665 --- /dev/null +++ b/src/kudu/tablet/delta_compaction.h @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_DELTA_COMPACTION_H +#define KUDU_TABLET_DELTA_COMPACTION_H + +#include +#include +#include +#include +#include + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/tablet/deltafile.h" + +namespace kudu { + +namespace metadata { +class RowSetMetadata; +} // namespace metadata + +namespace tablet { + +class CFileSet; +class DeltaMemStore; +class DeltaKey; +class MultiColumnWriter; + +// Handles major delta compaction: applying deltas to specific columns +// of a DiskRowSet, writing out an updated DiskRowSet without re-writing the +// unchanged columns (see RowSetColumnUpdater), and writing out a new +// deltafile which does not contain the deltas applied to the specific rows. +class MajorDeltaCompaction { + public: + // Creates a new major delta compaction. The given 'base_data' should already + // be open and must remain valid for the lifetime of this object. + // 'delta_iter' must not be initialized. + // 'col_ids' determines which columns of 'base_schema' should be compacted. + // + // TODO: is base_schema supposed to be the same as base_data->schema()? how about + // in an ALTER scenario? + MajorDeltaCompaction( + FsManager* fs_manager, const Schema& base_schema, CFileSet* base_data, + std::shared_ptr delta_iter, + std::vector > included_stores, + const std::vector& col_ids); + ~MajorDeltaCompaction(); + + // Executes the compaction. + // This has no effect on the metadata of the tablet, etc. + Status Compact(); + + // After a compaction is successful, prepares a metadata update which: + // 1) swaps out the old columns for the new ones + // 2) removes the compacted deltas + // 3) adds the new REDO delta which contains any uncompacted deltas + Status CreateMetadataUpdate(RowSetMetadataUpdate* update); + + // Apply the changes to the given delta tracker. + Status UpdateDeltaTracker(DeltaTracker* tracker); + + private: + std::string ColumnNamesToString() const; + + // Opens a writer for the base data. 
+ Status OpenBaseDataWriter(); + + // Opens a writer for the REDO delta file, won't be called if we don't need to write + // back REDO delta mutations. + Status OpenRedoDeltaFileWriter(); + + // Opens a writer for the UNDO delta file, won't be called if we don't need to write + // back UNDO delta mutations. + Status OpenUndoDeltaFileWriter(); + + // Reads the current base data, applies the deltas, and then writes the new base data. + // A new delta file is written if not all columns were selected for compaction and some + // deltas need to be written back into a delta file. + Status FlushRowSetAndDeltas(); + + FsManager* const fs_manager_; + + // TODO: doc me + const Schema base_schema_; + + // The computed partial schema which includes only the columns being + // compacted. + Schema partial_schema_; + + // The column ids to compact. + const std::vector column_ids_; + + // Inputs: + //----------------- + + // The base data into which deltas are being compacted. + CFileSet* const base_data_; + + // The DeltaStores from which deltas are being read. + const SharedDeltaStoreVector included_stores_; + + // The merged view of the deltas from included_stores_. + const std::shared_ptr delta_iter_; + + // Outputs: + gscoped_ptr base_data_writer_; + // The following two may not be initialized if we don't need to write a delta file. 
+ gscoped_ptr new_redo_delta_writer_; + BlockId new_redo_delta_block_; + + gscoped_ptr new_undo_delta_writer_; + BlockId new_undo_delta_block_; + + size_t redo_delta_mutations_written_; + size_t undo_delta_mutations_written_; + + enum State { + kInitialized = 1, + kFinished = 2, + }; + State state_; +}; + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/delta_iterator_merger.cc b/src/kudu/tablet/delta_iterator_merger.cc new file mode 100644 index 000000000000..9ce37fd492a9 --- /dev/null +++ b/src/kudu/tablet/delta_iterator_merger.cc @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/delta_iterator_merger.h" + +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/deltafile.h" + +namespace kudu { +namespace tablet { + +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; + +DeltaIteratorMerger::DeltaIteratorMerger( + vector > iters) + : iters_(std::move(iters)) {} + +Status DeltaIteratorMerger::Init(ScanSpec *spec) { + for (const shared_ptr &iter : iters_) { + RETURN_NOT_OK(iter->Init(spec)); + } + return Status::OK(); +} + +Status DeltaIteratorMerger::SeekToOrdinal(rowid_t idx) { + for (const shared_ptr &iter : iters_) { + RETURN_NOT_OK(iter->SeekToOrdinal(idx)); + } + return Status::OK(); +} + +Status DeltaIteratorMerger::PrepareBatch(size_t nrows, PrepareFlag flag) { + for (const shared_ptr &iter : iters_) { + RETURN_NOT_OK(iter->PrepareBatch(nrows, flag)); + } + return Status::OK(); +} + +Status DeltaIteratorMerger::ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) { + for (const shared_ptr &iter : iters_) { + RETURN_NOT_OK(iter->ApplyUpdates(col_to_apply, dst)); + } + return Status::OK(); +} + +Status DeltaIteratorMerger::ApplyDeletes(SelectionVector *sel_vec) { + for (const shared_ptr &iter : iters_) { + RETURN_NOT_OK(iter->ApplyDeletes(sel_vec)); + } + return Status::OK(); +} + +Status DeltaIteratorMerger::CollectMutations(vector *dst, Arena *arena) { + for (const shared_ptr &iter : iters_) { + RETURN_NOT_OK(iter->CollectMutations(dst, arena)); + } + // TODO: do we need to do some kind of sorting here to deal with out-of-order + // timestamps? 
+ return Status::OK(); +} + +struct DeltaKeyUpdateComparator { + bool operator() (const DeltaKeyAndUpdate& a, const DeltaKeyAndUpdate &b) { + return a.key.CompareTo(b.key) < 0; + } +}; + +Status DeltaIteratorMerger::FilterColumnIdsAndCollectDeltas( + const vector& col_ids, + vector* out, + Arena* arena) { + for (const shared_ptr& iter : iters_) { + RETURN_NOT_OK(iter->FilterColumnIdsAndCollectDeltas(col_ids, out, arena)); + } + // We use a stable sort here since an input may include multiple deltas for the + // same row at the same timestamp, in the case of a user batch which had several + // mutations for the same row. Stable sort preserves the user-provided ordering. + std::stable_sort(out->begin(), out->end(), DeltaKeyUpdateComparator()); + return Status::OK(); +} + +bool DeltaIteratorMerger::HasNext() { + for (const shared_ptr& iter : iters_) { + if (iter->HasNext()) { + return true; + } + } + + return false; +} + +string DeltaIteratorMerger::ToString() const { + string ret; + ret.append("DeltaIteratorMerger("); + + bool first = true; + for (const shared_ptr &iter : iters_) { + if (!first) { + ret.append(", "); + } + first = false; + + ret.append(iter->ToString()); + } + ret.append(")"); + return ret; +} + + +Status DeltaIteratorMerger::Create( + const vector > &stores, + const Schema* projection, + const MvccSnapshot &snapshot, + shared_ptr* out) { + vector > delta_iters; + + for (const shared_ptr &store : stores) { + DeltaIterator* raw_iter; + Status s = store->NewDeltaIterator(projection, snapshot, &raw_iter); + if (s.IsNotFound()) { + continue; + } + RETURN_NOT_OK_PREPEND(s, Substitute("Could not create iterator for store $0", + store->ToString())); + + delta_iters.push_back(shared_ptr(raw_iter)); + } + + if (delta_iters.size() == 1) { + // If we only have one input to the "merge", we can just directly + // return that iterator. 
+ *out = delta_iters[0]; + } else { + *out = shared_ptr(new DeltaIteratorMerger(delta_iters)); + } + return Status::OK(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_iterator_merger.h b/src/kudu/tablet/delta_iterator_merger.h new file mode 100644 index 000000000000..e703447769a2 --- /dev/null +++ b/src/kudu/tablet/delta_iterator_merger.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTA_ITERATOR_MERGER_H +#define KUDU_TABLET_DELTA_ITERATOR_MERGER_H + +#include +#include +#include + +#include "kudu/tablet/delta_store.h" + +namespace kudu { + +class ScanSpec; + +namespace tablet { + +// DeltaIterator that simply combines together other DeltaIterators, +// applying deltas from each in order. +class DeltaIteratorMerger : public DeltaIterator { + public: + // Create a new DeltaIterator which combines the deltas from + // all of the input delta stores. + // + // If only one store is input, this will automatically return an unwrapped + // iterator for greater efficiency. 
+ static Status Create( + const std::vector > &stores, + const Schema* projection, + const MvccSnapshot &snapshot, + std::shared_ptr* out); + + //////////////////////////////////////////////////////////// + // Implementations of DeltaIterator + //////////////////////////////////////////////////////////// + virtual Status Init(ScanSpec *spec) OVERRIDE; + virtual Status SeekToOrdinal(rowid_t idx) OVERRIDE; + virtual Status PrepareBatch(size_t nrows, PrepareFlag flag) OVERRIDE; + virtual Status ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) OVERRIDE; + virtual Status ApplyDeletes(SelectionVector *sel_vec) OVERRIDE; + virtual Status CollectMutations(vector *dst, Arena *arena) OVERRIDE; + virtual Status FilterColumnIdsAndCollectDeltas(const std::vector& col_ids, + vector* out, + Arena* arena) OVERRIDE; + virtual bool HasNext() OVERRIDE; + virtual std::string ToString() const OVERRIDE; + + private: + explicit DeltaIteratorMerger(vector > iters); + + std::vector > iters_; +}; + +} // namespace tablet +} // namespace kudu + +#endif // KUDU_TABLET_DELTA_ITERATOR_MERGER_H diff --git a/src/kudu/tablet/delta_key.cc b/src/kudu/tablet/delta_key.cc new file mode 100644 index 000000000000..2bfce8f2bd6b --- /dev/null +++ b/src/kudu/tablet/delta_key.cc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/delta_key.h" + +#include + +namespace kudu { +namespace tablet { + +const char* DeltaType_Name(DeltaType t) { + switch (t) { + case UNDO: + return "UNDO"; + case REDO: + return "REDO"; + default: + LOG(DFATAL) << "Unknown delta type: " << t; + } + return "UNKNOWN"; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_key.h b/src/kudu/tablet/delta_key.h new file mode 100644 index 000000000000..d5338b16c44d --- /dev/null +++ b/src/kudu/tablet/delta_key.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTA_KEY_H +#define KUDU_TABLET_DELTA_KEY_H + +#include +#include "kudu/common/rowid.h" +#include "kudu/gutil/endian.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tablet { + +// The type of the delta. +enum DeltaType { + // REDO delta files contain the mutations that were applied + // since the base data was last flushed/compacted. REDO deltas + // are sorted by increasing transaction timestamp. 
+ REDO, + // UNDO delta files contain the mutations that were applied + // prior to the time the base data was last/flushed compacted + // and allow to execute point-in-time snapshot scans. UNDO + // deltas are sorted by decreasing transaction timestamp. + UNDO +}; + +const char* DeltaType_Name(DeltaType t); + +// Each entry in the delta memrowset or delta files is keyed by the rowid +// which has been updated, as well as the timestamp which performed the update. +class DeltaKey { + public: + DeltaKey() : + row_idx_(-1) + {} + + DeltaKey(rowid_t id, Timestamp timestamp) + : row_idx_(id), timestamp_(std::move(timestamp)) {} + + // Encode this key into the given buffer. + // + // The encoded form of a DeltaKey is guaranteed to share the same sort + // order as the DeltaKey itself when compared using memcmp(), so it may + // be used as a string key in indexing structures, etc. + void EncodeTo(faststring *dst) const { + EncodeRowId(dst, row_idx_); + timestamp_.EncodeTo(dst); + } + + + // Decode a DeltaKey object from its serialized form. + // + // The slice 'key' should contain the encoded key at its beginning, and may + // contain further data after that. + // The 'key' slice is mutated so that, upon return, the decoded key has been removed from + // its beginning. + Status DecodeFrom(Slice *key) { + Slice orig(*key); + if (!PREDICT_TRUE(DecodeRowId(key, &row_idx_))) { + return Status::Corruption("Bad delta key: bad rowid", orig.ToDebugString(20)); + } + + if (!PREDICT_TRUE(timestamp_.DecodeFrom(key))) { + return Status::Corruption("Bad delta key: bad timestamp", orig.ToDebugString(20)); + } + return Status::OK(); + } + + string ToString() const { + return strings::Substitute("(row $0@tx$1)", row_idx_, timestamp_.ToString()); + } + + // Compare this key to another key. 
Delta keys are sorted by ascending rowid, + // then ascending timestamp, except if this is an undo delta key, in which case the + // the keys are sorted by ascending rowid and then by _descending_ timestamp so that + // the transaction closer to the base data comes first. + template + int CompareTo(const DeltaKey &other) const; + + rowid_t row_idx() const { return row_idx_; } + + const Timestamp ×tamp() const { return timestamp_; } + + private: + // The row which has been updated. + rowid_t row_idx_; + + // The timestamp of the transaction which applied the update. + Timestamp timestamp_; +}; + +template<> +inline int DeltaKey::CompareTo(const DeltaKey &other) const { + if (row_idx_ < other.row_idx_) { + return -1; + } else if (row_idx_ > other.row_idx_) { + return 1; + } + + return timestamp_.CompareTo(other.timestamp_); +} + +template<> +inline int DeltaKey::CompareTo(const DeltaKey &other) const { + if (row_idx_ < other.row_idx_) { + return -1; + } else if (row_idx_ > other.row_idx_) { + return 1; + } + + return other.timestamp_.CompareTo(timestamp_); +} + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/delta_stats.cc b/src/kudu/tablet/delta_stats.cc new file mode 100644 index 000000000000..4594ca603aeb --- /dev/null +++ b/src/kudu/tablet/delta_stats.cc @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/tablet/delta_stats.h" + +#include +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +using std::vector; + +namespace tablet { + +DeltaStats::DeltaStats() + : delete_count_(0), + max_timestamp_(Timestamp::kMin), + min_timestamp_(Timestamp::kMax) { +} + +void DeltaStats::IncrUpdateCount(ColumnId col_id, int64_t update_count) { + DCHECK_GE(col_id, 0); + update_counts_by_col_id_[col_id] += update_count; +} + +void DeltaStats::IncrDeleteCount(int64_t delete_count) { + delete_count_ += delete_count; +} + +Status DeltaStats::UpdateStats(const Timestamp& timestamp, + const RowChangeList& update) { + // Decode the update, incrementing the update count for each of the + // columns we find present. 
+ RowChangeListDecoder update_decoder(update); + RETURN_NOT_OK(update_decoder.Init()); + if (PREDICT_FALSE(update_decoder.is_delete())) { + IncrDeleteCount(1); + } else if (PREDICT_TRUE(update_decoder.is_update())) { + vector col_ids; + RETURN_NOT_OK(update_decoder.GetIncludedColumnIds(&col_ids)); + for (ColumnId col_id : col_ids) { + IncrUpdateCount(col_id, 1); + } + } // Don't handle re-inserts + + if (min_timestamp_.CompareTo(timestamp) > 0) { + min_timestamp_ = timestamp; + } + if (max_timestamp_.CompareTo(timestamp) < 0) { + max_timestamp_ = timestamp; + } + + return Status::OK(); +} + +string DeltaStats::ToString() const { + string ret = strings::Substitute( + "ts range=[$0, $1]", + min_timestamp_.ToString(), + max_timestamp_.ToString()); + ret.append(", update_counts_by_col_id=["); + ret.append(JoinKeysAndValuesIterator(update_counts_by_col_id_.begin(), + update_counts_by_col_id_.end(), + ":", ",")); + ret.append(")"); + return ret; +} + + +void DeltaStats::ToPB(DeltaStatsPB* pb) const { + pb->Clear(); + pb->set_delete_count(delete_count_); + typedef std::pair entry; + for (const entry& e : update_counts_by_col_id_) { + DeltaStatsPB::ColumnStats* stats = pb->add_column_stats(); + stats->set_col_id(e.first); + stats->set_update_count(e.second); + } + + pb->set_max_timestamp(max_timestamp_.ToUint64()); + pb->set_min_timestamp(min_timestamp_.ToUint64()); +} + +Status DeltaStats::InitFromPB(const DeltaStatsPB& pb) { + delete_count_ = pb.delete_count(); + update_counts_by_col_id_.clear(); + for (const DeltaStatsPB::ColumnStats stats : pb.column_stats()) { + IncrUpdateCount(ColumnId(stats.col_id()), stats.update_count()); + } + RETURN_NOT_OK(max_timestamp_.FromUint64(pb.max_timestamp())); + RETURN_NOT_OK(min_timestamp_.FromUint64(pb.min_timestamp())); + return Status::OK(); +} + +void DeltaStats::AddColumnIdsWithUpdates(std::set* col_ids) const { + typedef std::pair entry; + for (const entry& e : update_counts_by_col_id_) { + if (e.second > 0) { + 
col_ids->insert(e.first); + } + } +} + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_stats.h b/src/kudu/tablet/delta_stats.h new file mode 100644 index 000000000000..d10f4b29f23c --- /dev/null +++ b/src/kudu/tablet/delta_stats.h @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTA_STATS_H +#define KUDU_TABLET_DELTA_STATS_H + +#include +#include + +#include +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/map-util.h" +#include "kudu/common/row_changelist.h" +#include "kudu/tablet/mvcc.h" + +namespace kudu { + +namespace tablet { + +class DeltaStatsPB; + +// A wrapper class for describing data statistics. +class DeltaStats { + public: + DeltaStats(); + + // Increment update count for column 'col_id' by 'update_count'. + void IncrUpdateCount(ColumnId col_id, int64_t update_count); + + // Increment the per-store delete count by 'delete_count'. + void IncrDeleteCount(int64_t delete_count); + + // Increment delete and update counts based on changes contained in + // 'update'. 
+ Status UpdateStats(const Timestamp& timestamp, + const RowChangeList& update); + + // Return the number of deletes in the current delta store. + int64_t delete_count() const { return delete_count_; } + + // Returns number of updates for a given column. + int64_t update_count_for_col_id(ColumnId col_id) const { + return FindWithDefault(update_counts_by_col_id_, col_id, 0); + } + + // Returns the maximum transaction id of any mutation in a delta file. + Timestamp max_timestamp() const { + return max_timestamp_; + } + + // Returns the minimum transaction id of any mutation in a delta file. + Timestamp min_timestamp() const { + return min_timestamp_; + } + + // Set the maximum transaction id of any mutation in a delta file. + void set_max_timestamp(const Timestamp& timestamp) { + max_timestamp_ = timestamp; + } + + // Set the minimum transaction id in of any mutation in a delta file. + void set_min_timestamp(const Timestamp& timestamp) { + min_timestamp_ = timestamp; + } + + std::string ToString() const; + + // Convert this object to the protobuf which is stored in the DeltaFile footer. + void ToPB(DeltaStatsPB* pb) const; + + // Load this object from the protobuf which is stored in the DeltaFile footer. + Status InitFromPB(const DeltaStatsPB& pb); + + // For each column which has at least one update, add that column's ID to the + // set 'col_ids'. + void AddColumnIdsWithUpdates(std::set* col_ids) const; + + private: + std::unordered_map update_counts_by_col_id_; + uint64_t delete_count_; + Timestamp max_timestamp_; + Timestamp min_timestamp_; +}; + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/delta_store.cc b/src/kudu/tablet/delta_store.cc new file mode 100644 index 000000000000..029015ce5754 --- /dev/null +++ b/src/kudu/tablet/delta_store.cc @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/delta_store.h" + +#include + +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/deltafile.h" + +namespace kudu { +namespace tablet { + +using std::shared_ptr; +using std::string; +using strings::Substitute; + +string DeltaKeyAndUpdate::Stringify(DeltaType type, const Schema& schema) const { + return StrCat(Substitute("($0 delta key=$1, change_list=$2)", + type == UNDO ? 
"UNDO" : "REDO", + StringPrintf("%06u@tx%06u", key.row_idx(), + atoi(key.timestamp().ToString().c_str())), + RowChangeList(cell).ToString(schema))); + +} + +Status DebugDumpDeltaIterator(DeltaType type, + DeltaIterator* iter, + const Schema& schema, + size_t nrows, + vector* out) { + ScanSpec spec; + spec.set_cache_blocks(false); + RETURN_NOT_OK(iter->Init(&spec)); + RETURN_NOT_OK(iter->SeekToOrdinal(0)); + + const size_t kRowsPerBlock = 100; + + Arena arena(32 * 1024, 128 * 1024); + for (size_t i = 0; iter->HasNext(); ) { + size_t n; + if (nrows > 0) { + if (i >= nrows) { + break; + } + n = std::min(kRowsPerBlock, nrows - i); + } else { + n = kRowsPerBlock; + } + + arena.Reset(); + + RETURN_NOT_OK(iter->PrepareBatch(n, DeltaIterator::PREPARE_FOR_COLLECT)); + vector cells; + RETURN_NOT_OK(iter->FilterColumnIdsAndCollectDeltas( + vector(), + &cells, + &arena)); + for (const DeltaKeyAndUpdate& cell : cells) { + LOG_STRING(INFO, out) << cell.Stringify(type, schema); + } + + i += n; + } + return Status::OK(); +} + +template +Status WriteDeltaIteratorToFile(DeltaIterator* iter, + size_t nrows, + DeltaFileWriter* out) { + ScanSpec spec; + spec.set_cache_blocks(false); + RETURN_NOT_OK(iter->Init(&spec)); + RETURN_NOT_OK(iter->SeekToOrdinal(0)); + + const size_t kRowsPerBlock = 100; + DeltaStats stats; + Arena arena(32 * 1024, 128 * 1024); + for (size_t i = 0; iter->HasNext(); ) { + size_t n; + if (nrows > 0) { + if (i >= nrows) { + break; + } + n = std::min(kRowsPerBlock, nrows - i); + } else { + n = kRowsPerBlock; + } + + arena.Reset(); + + RETURN_NOT_OK(iter->PrepareBatch(n, DeltaIterator::PREPARE_FOR_COLLECT)); + vector cells; + RETURN_NOT_OK(iter->FilterColumnIdsAndCollectDeltas(vector(), + &cells, + &arena)); + for (const DeltaKeyAndUpdate& cell : cells) { + RowChangeList rcl(cell.cell); + RETURN_NOT_OK(out->AppendDelta(cell.key, rcl)); + RETURN_NOT_OK(stats.UpdateStats(cell.key.timestamp(), rcl)); + } + + i += n; + } + RETURN_NOT_OK(out->WriteDeltaStats(stats)); + 
return Status::OK(); +} + +template +Status WriteDeltaIteratorToFile(DeltaIterator* iter, + size_t nrows, + DeltaFileWriter* out); + +template +Status WriteDeltaIteratorToFile(DeltaIterator* iter, + size_t nrows, + DeltaFileWriter* out); + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_store.h b/src/kudu/tablet/delta_store.h new file mode 100644 index 000000000000..e4dafe6d4ddf --- /dev/null +++ b/src/kudu/tablet/delta_store.h @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTA_STORE_H +#define KUDU_TABLET_DELTA_STORE_H + +#include +#include +#include + +#include "kudu/common/columnblock.h" +#include "kudu/common/schema.h" +#include "kudu/util/status.h" +#include "kudu/tablet/mutation.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/tablet/delta_key.h" +#include "kudu/tablet/delta_stats.h" +#include "kudu/tablet/tablet_metadata.h" + +namespace kudu { + +class ScanSpec; +class SelectionVector; + +namespace tablet { + +class DeltaIterator; +class DeltaFileWriter; + +// Interface for the pieces of the system that track deltas/updates. +// This is implemented by DeltaMemStore and by DeltaFileReader. 
+class DeltaStore { + public: + // Performs any post-construction work for the DeltaStore, which may + // include additional I/O. + virtual Status Init() = 0; + + // Whether this delta store was initialized or not. + virtual bool Initted() = 0; + + // Create a DeltaIterator for the given projection. + // + // The projection corresponds to whatever scan is currently ongoing. + // All RowBlocks passed to this DeltaIterator must have this same schema. + // + // 'snapshot' is the MVCC state which determines which transactions + // should be considered committed (and thus applied by the iterator). + // + // Returns Status::OK and sets 'iterator' to the new DeltaIterator, or + // returns Status::NotFound if the mutations within this delta store + // cannot include 'snap'. + virtual Status NewDeltaIterator(const Schema *projection, + const MvccSnapshot &snap, + DeltaIterator** iterator) const = 0; + + // Set *deleted to true if the latest update for the given row is a deletion. + virtual Status CheckRowDeleted(rowid_t row_idx, bool *deleted) const = 0; + + // Get the store's estimated size in bytes. + virtual uint64_t EstimateSize() const = 0; + + virtual std::string ToString() const = 0; + + // TODO remove this once we don't need to have delta_stats for both DMS and DFR. Currently + // DeltaTracker#GetColumnsIdxWithUpdates() needs to filter out DMS from the redo list but it + // can't without RTTI. + virtual const DeltaStats& delta_stats() const = 0; + + virtual ~DeltaStore() {} +}; + +typedef std::vector > SharedDeltaStoreVector; + +// Iterator over deltas. +// For each rowset, this iterator is constructed alongside the base data iterator, +// and used to apply any updates which haven't been yet compacted into the base +// (i.e. 
those edits in the DeltaMemStore or in delta files) +// +// Typically this is used as follows: +// +// Open iterator, seek to particular point in file +// RowBlock rowblock; +// foreach RowBlock in base data { +// clear row block +// CHECK_OK(iter->PrepareBatch(rowblock.size())); +// ... read column 0 from base data into row block ... +// CHECK_OK(iter->ApplyUpdates(0, rowblock.column(0)) +// ... check predicates for column ... +// ... read another column from base data... +// CHECK_OK(iter->ApplyUpdates(1, rowblock.column(1))) +// ... +// } + +struct DeltaKeyAndUpdate { + DeltaKey key; + Slice cell; + + std::string Stringify(DeltaType type, const Schema& schema) const; +}; + +class DeltaIterator { + public: + // Initialize the iterator. This must be called once before any other + // call. + virtual Status Init(ScanSpec *spec) = 0; + + // Seek to a particular ordinal position in the delta data. This cancels any prepared + // block, and must be called at least once prior to PrepareBatch(). + virtual Status SeekToOrdinal(rowid_t idx) = 0; + + // Argument to PrepareBatch(). See below. + enum PrepareFlag { + PREPARE_FOR_APPLY, + PREPARE_FOR_COLLECT + }; + + // Prepare to apply deltas to a block of rows. This takes a consistent snapshot + // of all updates to the next 'nrows' rows, so that subsequent calls to + // ApplyUpdates() will not cause any "tearing"/non-atomicity. + // + // 'flag' denotes whether the batch will be used for collecting mutations or + // for applying them. Some implementations may choose to prepare differently. + // + // Each time this is called, the iterator is advanced by the full length + // of the previously prepared block. + virtual Status PrepareBatch(size_t nrows, PrepareFlag flag) = 0; + + // Apply the snapshotted updates to one of the columns. + // 'dst' must be the same length as was previously passed to PrepareBatch() + // Must have called PrepareBatch() with flag = PREPARE_FOR_APPLY. 
+ virtual Status ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) = 0; + + // Apply any deletes to the given selection vector. + // Rows which have been deleted in the associated MVCC snapshot are set to + // 0 in the selection vector so that they don't show up in the output. + // Must have called PrepareBatch() with flag = PREPARE_FOR_APPLY. + virtual Status ApplyDeletes(SelectionVector *sel_vec) = 0; + + // Collect the mutations associated with each row in the current prepared batch. + // + // Each entry in the vector will be treated as a singly linked list of Mutation + // objects. If there are no mutations for that row, the entry will be unmodified. + // If there are mutations, they will be appended at the tail of the linked list + // (i.e in ascending timestamp order) + // + // The Mutation objects will be allocated out of the provided Arena, which must be non-NULL. + // Must have called PrepareBatch() with flag = PREPARE_FOR_COLLECT. + virtual Status CollectMutations(vector *dst, Arena *arena) = 0; + + // Iterate through all deltas, adding deltas for columns not + // specified in 'col_ids' to 'out'. + // + // The delta objects will be allocated out the provided Arena which + // must be non-NULL. + // Must have called PrepareBatch() with flag = PREPARE_FOR_COLLECT. + virtual Status FilterColumnIdsAndCollectDeltas(const std::vector& col_ids, + vector* out, + Arena* arena) = 0; + + // Returns true if there are any more rows left in this iterator. + virtual bool HasNext() = 0; + + // Return a string representation suitable for debug printouts. + virtual std::string ToString() const = 0; + + virtual ~DeltaIterator() {} +}; + +enum { + ITERATE_OVER_ALL_ROWS = 0 +}; + +// Dumps contents of 'iter' to 'out', line-by-line. Used to unit test +// minor delta compaction. +// +// If nrows is 0, all rows will be dumped. 
+Status DebugDumpDeltaIterator(DeltaType type, + DeltaIterator* iter, + const Schema& schema, + size_t nrows, + vector* out); + +// Writes the contents of 'iter' to 'out', block by block. Used by +// minor delta compaction. +// +// If nrows is 0, all rows will be dumped. +template +Status WriteDeltaIteratorToFile(DeltaIterator* iter, + size_t nrows, + DeltaFileWriter* out); + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/delta_tracker.cc b/src/kudu/tablet/delta_tracker.cc new file mode 100644 index 000000000000..41ba17abb761 --- /dev/null +++ b/src/kudu/tablet/delta_tracker.cc @@ -0,0 +1,532 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/delta_tracker.h" + +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/delta_applier.h" +#include "kudu/tablet/delta_compaction.h" +#include "kudu/tablet/delta_iterator_merger.h" +#include "kudu/tablet/delta_store.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tablet { + +using fs::ReadableBlock; +using fs::WritableBlock; +using std::shared_ptr; +using std::string; +using strings::Substitute; + +DeltaTracker::DeltaTracker(shared_ptr rowset_metadata, + rowid_t num_rows, + log::LogAnchorRegistry* log_anchor_registry, + shared_ptr parent_tracker) + : rowset_metadata_(std::move(rowset_metadata)), + num_rows_(num_rows), + open_(false), + log_anchor_registry_(log_anchor_registry), + parent_tracker_(std::move(parent_tracker)) {} + +Status DeltaTracker::OpenDeltaReaders(const vector& blocks, + vector >* stores, + DeltaType type) { + FsManager* fs = rowset_metadata_->fs_manager(); + for (const BlockId& block_id : blocks) { + gscoped_ptr block; + Status s = fs->OpenBlock(block_id, &block); + if (!s.ok()) { + LOG(ERROR) << "Failed to open " << DeltaType_Name(type) + << " delta file " << block_id.ToString() << ": " + << s.ToString(); + return s; + } + + shared_ptr dfr; + s = DeltaFileReader::OpenNoInit(block.Pass(), block_id, &dfr, type); + if (!s.ok()) { + LOG(ERROR) << "Failed to open " << DeltaType_Name(type) + << " delta file reader " << block_id.ToString() << ": " + << s.ToString(); + return s; + } + + VLOG(1) << "Successfully opened " << DeltaType_Name(type) + << " delta file " << block_id.ToString(); + stores->push_back(dfr); + } + return Status::OK(); +} + + +// Open any previously flushed DeltaFiles in this rowset +Status DeltaTracker::Open() { + CHECK(redo_delta_stores_.empty()) << "should call before opening any 
readers"; + CHECK(undo_delta_stores_.empty()) << "should call before opening any readers"; + CHECK(!open_); + + RETURN_NOT_OK(OpenDeltaReaders(rowset_metadata_->redo_delta_blocks(), + &redo_delta_stores_, + REDO)); + RETURN_NOT_OK(OpenDeltaReaders(rowset_metadata_->undo_delta_blocks(), + &undo_delta_stores_, + UNDO)); + + // the id of the first DeltaMemStore is the max id of the current ones +1 + dms_.reset(new DeltaMemStore(rowset_metadata_->last_durable_redo_dms_id() + 1, + rowset_metadata_->id(), + log_anchor_registry_, + parent_tracker_)); + open_ = true; + return Status::OK(); +} + +Status DeltaTracker::MakeDeltaIteratorMergerUnlocked(size_t start_idx, size_t end_idx, + const Schema* projection, + vector > *target_stores, + vector *target_blocks, + std::shared_ptr *out) { + CHECK(open_); + CHECK_LE(start_idx, end_idx); + CHECK_LT(end_idx, redo_delta_stores_.size()); + vector > inputs; + for (size_t idx = start_idx; idx <= end_idx; ++idx) { + shared_ptr &delta_store = redo_delta_stores_[idx]; + + // In DEBUG mode, the following asserts that the object is of the right type + // (using RTTI) + ignore_result(down_cast(delta_store.get())); + shared_ptr dfr = std::static_pointer_cast(delta_store); + + LOG(INFO) << "Preparing to minor compact delta file: " << dfr->ToString(); + + inputs.push_back(delta_store); + target_stores->push_back(delta_store); + target_blocks->push_back(dfr->block_id()); + } + RETURN_NOT_OK(DeltaIteratorMerger::Create( + inputs, projection, + MvccSnapshot::CreateSnapshotIncludingAllTransactions(), out)); + return Status::OK(); +} + +namespace { + +string JoinDeltaStoreStrings(const SharedDeltaStoreVector& stores) { + vector strings; + for (const shared_ptr& store : stores) { + strings.push_back(store->ToString()); + } + return ::JoinStrings(strings, ","); +} + +} // anonymous namespace + +Status DeltaTracker::AtomicUpdateStores(const SharedDeltaStoreVector& to_remove, + const vector& new_delta_blocks, + DeltaType type) { + 
SharedDeltaStoreVector new_stores; + RETURN_NOT_OK_PREPEND(OpenDeltaReaders(new_delta_blocks, &new_stores, type), + "Unable to open delta blocks"); + + lock_guard lock(&component_lock_); + SharedDeltaStoreVector* stores_to_update = + type == REDO ? &redo_delta_stores_ : &undo_delta_stores_; + SharedDeltaStoreVector::iterator start_it; + // TODO this is hacky, we do this because UNDOs don't currently get replaced and we need to + // front-load them. When we start GCing UNDO files (KUDU-236) we'll need to be able to atomically + // replace them too, and in their right order. + if (!to_remove.empty()) { + start_it = + std::find(stores_to_update->begin(), stores_to_update->end(), to_remove[0]); + + auto end_it = start_it; + for (const shared_ptr& ds : to_remove) { + if (end_it == stores_to_update->end() || *end_it != ds) { + return Status::InvalidArgument( + strings::Substitute("Cannot find deltastore sequence <$0> in <$1>", + JoinDeltaStoreStrings(to_remove), + JoinDeltaStoreStrings(*stores_to_update))); + } + ++end_it; + } + // Remove the old stores + stores_to_update->erase(start_it, end_it); + } else { + start_it = stores_to_update->begin(); + } + + // Insert the new store + stores_to_update->insert(start_it, new_stores.begin(), new_stores.end()); + + VLOG(1) << "New " << DeltaType_Name(type) << " stores: " + << JoinDeltaStoreStrings(*stores_to_update); + return Status::OK(); +} + +Status DeltaTracker::Compact() { + return CompactStores(0, -1); +} + +Status DeltaTracker::CompactStores(int start_idx, int end_idx) { + // Prevent concurrent compactions or a compaction concurrent with a flush + // + // TODO(perf): this could be more fine grained + lock_guard l(&compact_flush_lock_); + if (CountRedoDeltaStores() <= 1) { + return Status::OK(); + } + + if (end_idx == -1) { + end_idx = redo_delta_stores_.size() - 1; + } + + CHECK_LE(start_idx, end_idx); + CHECK_LT(end_idx, redo_delta_stores_.size()); + CHECK(open_); + + // Open a writer for the new destination delta block 
+ FsManager* fs = rowset_metadata_->fs_manager(); + gscoped_ptr block; + RETURN_NOT_OK_PREPEND(fs->CreateNewBlock(&block), + "Could not allocate delta block"); + BlockId new_block_id(block->id()); + + // Merge and compact the stores and write and output to "data_writer" + vector > compacted_stores; + vector compacted_blocks; + RETURN_NOT_OK(DoCompactStores(start_idx, end_idx, block.Pass(), + &compacted_stores, &compacted_blocks)); + + // Update delta_stores_, removing the compacted delta files and inserted the new + RETURN_NOT_OK(AtomicUpdateStores(compacted_stores, { new_block_id }, REDO)); + LOG(INFO) << "Opened delta block for read: " << new_block_id.ToString(); + + // Update the metadata accordingly + RowSetMetadataUpdate update; + update.ReplaceRedoDeltaBlocks(compacted_blocks, { new_block_id }); + // TODO: need to have some error handling here -- if we somehow can't persist the + // metadata, do we end up losing data on recovery? + CHECK_OK(rowset_metadata_->CommitUpdate(update)); + + Status s = rowset_metadata_->Flush(); + if (!s.ok()) { + // TODO: again need to figure out some way of making this safe. Should we be + // writing the metadata _ahead_ of the actual store swap? Probably. + LOG(FATAL) << "Unable to commit delta data block metadata for " + << new_block_id.ToString() << ": " << s.ToString(); + return s; + } + + return Status::OK(); +} + +Status DeltaTracker::DoCompactStores(size_t start_idx, size_t end_idx, + gscoped_ptr block, + vector > *compacted_stores, + vector *compacted_blocks) { + shared_ptr inputs_merge; + + // Currently, DeltaFile iterators ignore the passed-in projection in + // FilterColumnIdsAndCollectDeltas(). So, we just pass an empty schema here. + // If this changes in the future, we'll have to pass in the current tablet + // schema here. 
+ Schema empty_schema; + RETURN_NOT_OK(MakeDeltaIteratorMergerUnlocked(start_idx, end_idx, &empty_schema, compacted_stores, + compacted_blocks, &inputs_merge)); + LOG(INFO) << "Compacting " << (end_idx - start_idx + 1) << " delta files."; + DeltaFileWriter dfw(block.Pass()); + RETURN_NOT_OK(dfw.Start()); + RETURN_NOT_OK(WriteDeltaIteratorToFile(inputs_merge.get(), + ITERATE_OVER_ALL_ROWS, + &dfw)); + RETURN_NOT_OK(dfw.Finish()); + LOG(INFO) << "Succesfully compacted the specified delta files."; + return Status::OK(); +} + +void DeltaTracker::CollectStores(vector > *deltas) const { + lock_guard lock(&component_lock_); + deltas->assign(undo_delta_stores_.begin(), undo_delta_stores_.end()); + deltas->insert(deltas->end(), redo_delta_stores_.begin(), redo_delta_stores_.end()); + deltas->push_back(dms_); +} + +Status DeltaTracker::CheckSnapshotComesAfterAllUndos(const MvccSnapshot& snap) const { + std::vector > undos; + { + lock_guard lock(&component_lock_); + undos = undo_delta_stores_; + } + for (const shared_ptr& undo : undos) { + DeltaFileReader* dfr = down_cast(undo.get()); + + // Even though IsRelevantForSnapshot() is safe to call without + // initializing the reader, the assertion being tested by this function + // will probably fail without real delta stats. 
+ RETURN_NOT_OK(dfr->Init()); + + CHECK(!dfr->IsRelevantForSnapshot(snap)) + << "Invalid snapshot " << snap.ToString() + << " does not come after undo file " << undo->ToString() + << " with stats: " << dfr->delta_stats().ToString(); + } + + return Status::OK(); +} + +Status DeltaTracker::NewDeltaIterator(const Schema* schema, + const MvccSnapshot& snap, + shared_ptr* out) const { + std::vector > stores; + CollectStores(&stores); + return DeltaIteratorMerger::Create(stores, schema, snap, out); +} + +Status DeltaTracker::NewDeltaFileIterator( + const Schema* schema, + const MvccSnapshot& snap, + DeltaType type, + vector >* included_stores, + shared_ptr* out) const { + { + lock_guard lock(&component_lock_); + // TODO perf: is this really needed? Will check + // DeltaIteratorMerger::Create() + if (type == UNDO) { + *included_stores = undo_delta_stores_; + } else if (type == REDO) { + *included_stores = redo_delta_stores_; + } else { + LOG(FATAL); + } + } + + // Verify that we're only merging files and not DeltaMemStores. + // TODO: we need to somehow ensure this doesn't fail - we have to somehow coordinate + // minor delta compaction against delta flush. Add a test case here to trigger this + // condition. + for (const shared_ptr& store : *included_stores) { + ignore_result(down_cast(store.get())); + } + + return DeltaIteratorMerger::Create(*included_stores, schema, snap, out); +} + +Status DeltaTracker::WrapIterator(const shared_ptr &base, + const MvccSnapshot &mvcc_snap, + gscoped_ptr* out) const { + shared_ptr iter; + RETURN_NOT_OK(NewDeltaIterator(&base->schema(), mvcc_snap, &iter)); + + out->reset(new DeltaApplier(base, iter)); + return Status::OK(); +} + + +Status DeltaTracker::Update(Timestamp timestamp, + rowid_t row_idx, + const RowChangeList &update, + const consensus::OpId& op_id, + OperationResultPB* result) { + // TODO: can probably lock this more fine-grained. 
+ shared_lock lock(&component_lock_); + DCHECK_LT(row_idx, num_rows_); + + Status s = dms_->Update(timestamp, row_idx, update, op_id); + if (s.ok()) { + MemStoreTargetPB* target = result->add_mutated_stores(); + target->set_rs_id(rowset_metadata_->id()); + target->set_dms_id(dms_->id()); + } + return s; +} + +Status DeltaTracker::CheckRowDeleted(rowid_t row_idx, bool *deleted, + ProbeStats* stats) const { + shared_lock lock(&component_lock_); + + DCHECK_LT(row_idx, num_rows_); + + *deleted = false; + // Check if the row has a deletion in DeltaMemStore. + RETURN_NOT_OK(dms_->CheckRowDeleted(row_idx, deleted)); + if (*deleted) { + return Status::OK(); + } + + // Then check backwards through the list of trackers. + for (auto ds = redo_delta_stores_.crbegin(); ds != redo_delta_stores_.crend(); ds++) { + stats->deltas_consulted++; + RETURN_NOT_OK((*ds)->CheckRowDeleted(row_idx, deleted)); + if (*deleted) { + return Status::OK(); + } + } + + return Status::OK(); +} + +Status DeltaTracker::FlushDMS(DeltaMemStore* dms, + shared_ptr* dfr, + MetadataFlushType flush_type) { + // Open file for write. 
+ FsManager* fs = rowset_metadata_->fs_manager(); + gscoped_ptr writable_block; + RETURN_NOT_OK_PREPEND(fs->CreateNewBlock(&writable_block), + "Unable to allocate new delta data writable_block"); + BlockId block_id(writable_block->id()); + + DeltaFileWriter dfw(writable_block.Pass()); + RETURN_NOT_OK_PREPEND(dfw.Start(), + Substitute("Unable to start writing to delta block $0", + block_id.ToString())); + + gscoped_ptr stats; + RETURN_NOT_OK(dms->FlushToFile(&dfw, &stats)); + RETURN_NOT_OK(dfw.Finish()); + LOG(INFO) << "Flushed delta block: " << block_id.ToString(); + + // Now re-open for read + gscoped_ptr readable_block; + RETURN_NOT_OK(fs->OpenBlock(block_id, &readable_block)); + RETURN_NOT_OK(DeltaFileReader::OpenNoInit(readable_block.Pass(), block_id, dfr, REDO)); + LOG(INFO) << "Reopened delta block for read: " << block_id.ToString(); + + RETURN_NOT_OK(rowset_metadata_->CommitRedoDeltaDataBlock(dms->id(), block_id)); + if (flush_type == FLUSH_METADATA) { + RETURN_NOT_OK_PREPEND(rowset_metadata_->Flush(), + Substitute("Unable to commit Delta block metadata for: $0", + block_id.ToString())); + } + return Status::OK(); +} + +Status DeltaTracker::Flush(MetadataFlushType flush_type) { + lock_guard l(&compact_flush_lock_); + + // First, swap out the old DeltaMemStore a new one, + // and add it to the list of delta stores to be reflected + // in reads. + shared_ptr old_dms; + size_t count; + { + // Lock the component_lock_ in exclusive mode. + // This shuts out any concurrent readers or writers. + lock_guard lock(&component_lock_); + + count = dms_->Count(); + + // Swap the DeltaMemStore to use the new schema + old_dms = dms_; + dms_.reset(new DeltaMemStore(old_dms->id() + 1, rowset_metadata_->id(), + log_anchor_registry_, parent_tracker_)); + + if (count == 0) { + // No need to flush if there are no deltas. + // Ensure that the DeltaMemStore is using the latest schema. 
+ return Status::OK(); + } + + redo_delta_stores_.push_back(old_dms); + } + + LOG(INFO) << "Flushing " << count << " deltas from DMS " << old_dms->id() << "..."; + + // Now, actually flush the contents of the old DMS. + // TODO: need another lock to prevent concurrent flushers + // at some point. + shared_ptr dfr; + Status s = FlushDMS(old_dms.get(), &dfr, flush_type); + CHECK(s.ok()) + << "Failed to flush DMS: " << s.ToString() + << "\nTODO: need to figure out what to do with error handling " + << "if this fails -- we end up with a DeltaMemStore permanently " + << "in the store list. For now, abort."; + + + // Now, re-take the lock and swap in the DeltaFileReader in place of + // of the DeltaMemStore + { + lock_guard lock(&component_lock_); + size_t idx = redo_delta_stores_.size() - 1; + + CHECK_EQ(redo_delta_stores_[idx], old_dms) + << "Another thread modified the delta store list during flush"; + redo_delta_stores_[idx] = dfr; + } + + return Status::OK(); + + // TODO: wherever we write stuff, we should write to a tmp path + // and rename to final path! 
+} + +size_t DeltaTracker::DeltaMemStoreSize() const { + shared_lock lock(&component_lock_); + return dms_->memory_footprint(); +} + +bool DeltaTracker::DeltaMemStoreEmpty() const { + shared_lock lock(&component_lock_); + return dms_->Empty(); +} + +int64_t DeltaTracker::MinUnflushedLogIndex() const { + shared_lock lock(&component_lock_); + return dms_->MinLogIndex(); +} + +size_t DeltaTracker::CountRedoDeltaStores() const { + shared_lock lock(&component_lock_); + return redo_delta_stores_.size(); +} + +uint64_t DeltaTracker::EstimateOnDiskSize() const { + shared_lock lock(&component_lock_); + uint64_t size = 0; + for (const shared_ptr& ds : redo_delta_stores_) { + size += ds->EstimateSize(); + } + return size; +} + +void DeltaTracker::GetColumnIdsWithUpdates(std::vector* col_ids) const { + shared_lock lock(&component_lock_); + + set column_ids_with_updates; + for (const shared_ptr& ds : redo_delta_stores_) { + // We won't force open files just to read their stats. + if (!ds->Initted()) { + continue; + } + + ds->delta_stats().AddColumnIdsWithUpdates(&column_ids_with_updates); + } + col_ids->assign(column_ids_with_updates.begin(), column_ids_with_updates.end()); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/delta_tracker.h b/src/kudu/tablet/delta_tracker.h new file mode 100644 index 000000000000..9c636f8e6579 --- /dev/null +++ b/src/kudu/tablet/delta_tracker.h @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTATRACKER_H +#define KUDU_TABLET_DELTATRACKER_H + +#include +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/rowid.h" +#include "kudu/gutil/macros.h" +#include "kudu/server/metadata.h" +#include "kudu/tablet/delta_store.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MemTracker; + +namespace consensus { +class OpId; +} + +namespace log { +class LogAnchorRegistry; +} + +namespace metadata { +class RowSetMetadata; +} + +namespace tablet { + +class DeltaMemStore; +class DeltaFileReader; +class OperationResultPB; +class MemStoreTargetPB; +struct ProbeStats; + +// The DeltaTracker is the part of a DiskRowSet which is responsible for +// tracking modifications against the base data. It consists of a set of +// DeltaStores which each contain a set of mutations against the base data. +// These DeltaStores may be on disk (DeltaFileReader) or in-memory (DeltaMemStore). +// +// This class is also responsible for flushing the in-memory deltas to disk. +class DeltaTracker { + public: + enum MetadataFlushType { + FLUSH_METADATA, + NO_FLUSH_METADATA + }; + + DeltaTracker(std::shared_ptr rowset_metadata, + rowid_t num_rows, log::LogAnchorRegistry* log_anchor_registry, + std::shared_ptr parent_tracker); + + Status WrapIterator(const std::shared_ptr &base, + const MvccSnapshot &mvcc_snap, + gscoped_ptr* out) const; + + // TODO: this shouldn't need to return a shared_ptr, but there is some messiness + // where this has bled around. 
+ // + // 'schema' is the schema of the rows that are being read by the client. + // It must remain valid for the lifetime of the returned iterator. + Status NewDeltaIterator(const Schema* schema, + const MvccSnapshot& snap, + std::shared_ptr* out) const; + + // Like NewDeltaIterator() but only includes file based stores, does not include + // the DMS. + // Returns the delta stores being merged in *included_stores. + Status NewDeltaFileIterator( + const Schema* schema, + const MvccSnapshot &snap, + DeltaType type, + std::vector >* included_stores, + std::shared_ptr* out) const; + + // CHECKs that the given snapshot includes all of the UNDO stores in this + // delta tracker. If this is not the case, crashes the process. This is + // used as an assertion during compaction, where we always expect the + // compaction snapshot to be in the future relative to any UNDOs. + // + // Returns a bad status in the event of an I/O related error. + Status CheckSnapshotComesAfterAllUndos(const MvccSnapshot& snap) const; + + Status Open(); + + // Flushes the current DeltaMemStore and replaces it with a new one. + // Caller selects whether to also have the RowSetMetadata (and consequently + // the TabletMetadata) flushed. + // + // NOTE: 'flush_type' should almost always be set to 'FLUSH_METADATA', or else + // delta stores might become unrecoverable. TODO: see KUDU-204 to clean this up + // a bit. + Status Flush(MetadataFlushType flush_type); + + // Update the given row in the database. + // Copies the data, as well as any referenced values into a local arena. + // "result" tracks the status of the update as well as which data + // structure(s) it ended up at. + Status Update(Timestamp timestamp, + rowid_t row_idx, + const RowChangeList &update, + const consensus::OpId& op_id, + OperationResultPB* result); + + // Check if the given row has been deleted -- i.e if the most recent + // delta for this row is a deletion. + // + // Sets *deleted to true if so; otherwise sets it to false. 
+ Status CheckRowDeleted(rowid_t row_idx, bool *deleted, ProbeStats* stats) const; + + // Compacts all deltafiles + // + // TODO keep metadata in the delta stores to indicate whether or not + // a minor (or -- when implemented -- major) compaction is warranted + // and if so, compact the stores. + Status Compact(); + + // Performs minor compaction on all delta files between index + // "start_idx" and "end_idx" (inclusive) and writes this to a + // new delta block. If "end_idx" is set to -1, then delta files at + // all indexes starting with "start_idx" will be compacted. + Status CompactStores(int start_idx, int end_idx); + + // Replace the subsequence of stores that matches 'stores_to_replace' with + // delta file readers corresponding to 'new_delta_blocks', which may be empty. + Status AtomicUpdateStores(const SharedDeltaStoreVector& stores_to_replace, + const std::vector& new_delta_blocks, + DeltaType type); + + // Return the number of rows encompassed by this DeltaTracker. Note that + // this is _not_ the number of updated rows, but rather the number of rows + // in the associated CFileSet base data. All updates must have a rowid + // strictly less than num_rows(). + int64_t num_rows() const { return num_rows_; } + + // Get the delta MemStore's size in bytes, including pre-allocation. + size_t DeltaMemStoreSize() const; + + // Returns true if the DMS has no entries. This doesn't rely on the size. + bool DeltaMemStoreEmpty() const; + + // Get the minimum log index for this tracker's DMS, -1 if it wasn't set. + int64_t MinUnflushedLogIndex() const; + + // Return the number of redo delta stores, not including the DeltaMemStore. + size_t CountRedoDeltaStores() const; + + uint64_t EstimateOnDiskSize() const; + + // Retrieves the list of column indexes that currently have updates. 
+ void GetColumnIdsWithUpdates(std::vector* col_ids) const; + + Mutex* compact_flush_lock() { + return &compact_flush_lock_; + } + + private: + friend class DiskRowSet; + + DISALLOW_COPY_AND_ASSIGN(DeltaTracker); + + FRIEND_TEST(TestRowSet, TestRowSetUpdate); + FRIEND_TEST(TestRowSet, TestDMSFlush); + FRIEND_TEST(TestRowSet, TestMakeDeltaIteratorMergerUnlocked); + FRIEND_TEST(TestRowSet, TestCompactStores); + FRIEND_TEST(TestMajorDeltaCompaction, TestCompact); + + Status OpenDeltaReaders(const std::vector& blocks, + std::vector >* stores, + DeltaType type); + + Status FlushDMS(DeltaMemStore* dms, + std::shared_ptr* dfr, + MetadataFlushType flush_type); + + // This collects all undo and redo stores. + void CollectStores(vector > *stores) const; + + // Performs the actual compaction. Results of compaction are written to "block", + // while delta stores that underwent compaction are appended to "compacted_stores", while + // their corresponding block ids are appended to "compacted_blocks". + // + // NOTE: the caller of this method should acquire or already hold an + // exclusive lock on 'compact_flush_lock_' before calling this + // method in order to protect 'redo_delta_stores_'. + Status DoCompactStores(size_t start_idx, size_t end_idx, + gscoped_ptr block, + vector > *compacted_stores, + std::vector* compacted_blocks); + + // Creates a merge delta iterator and captures the delta stores and + // delta blocks under compaction into 'target_stores' and + // 'target_blocks', respectively. The merge iterator is stored in + // 'out'; 'out' is valid until this instance of DeltaTracker + // is destroyed. + // + // NOTE: the caller of this method must first acquire or already + // hold a lock on 'compact_flush_lock_'in order to guard against a + // race on 'redo_delta_stores_'. 
+ Status MakeDeltaIteratorMergerUnlocked(size_t start_idx, size_t end_idx, + const Schema* schema, + vector > *target_stores, + vector *target_blocks, + std::shared_ptr *out); + + std::shared_ptr rowset_metadata_; + + // The number of rows in the DiskRowSet that this tracker is associated with. + // This is just used for assertions to make sure that we don't update a row + // which doesn't exist. + rowid_t num_rows_; + + bool open_; + + log::LogAnchorRegistry* log_anchor_registry_; + + std::shared_ptr parent_tracker_; + + // The current DeltaMemStore into which updates should be written. + std::shared_ptr dms_; + // The set of tracked REDO delta stores, in increasing timestamp order. + SharedDeltaStoreVector redo_delta_stores_; + // The set of tracked UNDO delta stores, in decreasing timestamp order. + SharedDeltaStoreVector undo_delta_stores_; + + // read-write lock protecting dms_ and {redo,undo}_delta_stores_. + // - Readers and mutators take this lock in shared mode. + // - Flushers take this lock in exclusive mode before they modify the + // structure of the rowset. + // + // TODO(perf): convert this to a reader-biased lock to avoid any cacheline + // contention between threads. + mutable rw_spinlock component_lock_; + + // Exclusive lock that ensures that only one flush or compaction can run + // at a time. Protects delta_stores_. NOTE: this lock cannot be acquired + // while component_lock is held: otherwise, Flush and Compaction threads + // (that both first acquire this lock and then component_lock) will deadlock. 
+ // + // TODO(perf): this needs to be more fine grained + mutable Mutex compact_flush_lock_; +}; + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/deltafile-test.cc b/src/kudu/tablet/deltafile-test.cc new file mode 100644 index 000000000000..3005f7f9e445 --- /dev/null +++ b/src/kudu/tablet/deltafile-test.cc @@ -0,0 +1,372 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/fs/fs-test-util.h" +#include "kudu/tablet/delta_store.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/delta_tracker.h" +#include "kudu/gutil/algorithm.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/util/memenv/memenv.h" +#include "kudu/util/test_macros.h" + +DECLARE_int32(deltafile_default_block_size); +DECLARE_bool(log_block_manager_test_hole_punching); +DEFINE_int32(first_row_to_update, 10000, "the first row to update"); +DEFINE_int32(last_row_to_update, 100000, "the last row to update"); +DEFINE_int32(n_verify, 1, "number of times to verify the updates" + "(useful for benchmarks"); + +using std::is_sorted; +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +using fs::CountingReadableBlock; +using fs::ReadableBlock; +using fs::WritableBlock; + +// Test path to write delta file to (in in-memory environment) +const char kTestPath[] = "/tmp/test"; + +class TestDeltaFile : public ::testing::Test { + public: + TestDeltaFile() : + env_(NewMemEnv(Env::Default())), + schema_(CreateSchema()), + arena_(1024, 1024) { + // Can't check on-disk file size with a memenv. + FLAGS_log_block_manager_test_hole_punching = false; + } + + public: + void SetUp() OVERRIDE { + fs_manager_.reset(new FsManager(env_.get(), kTestPath)); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + } + + static Schema CreateSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddColumn("val", UINT32)); + return builder.Build(); + } + + void WriteTestFile(int min_timestamp = 0, int max_timestamp = 0) { + gscoped_ptr block; + ASSERT_OK(fs_manager_->CreateNewBlock(&block)); + test_block_ = block->id(); + DeltaFileWriter dfw(block.Pass()); + ASSERT_OK(dfw.Start()); + + // Update even numbered rows. 
+ faststring buf; + + DeltaStats stats; + for (int i = FLAGS_first_row_to_update; i <= FLAGS_last_row_to_update; i += 2) { + for (int timestamp = min_timestamp; timestamp <= max_timestamp; timestamp++) { + buf.clear(); + RowChangeListEncoder update(&buf); + uint32_t new_val = timestamp + i; + update.AddColumnUpdate(schema_.column(0), schema_.column_id(0), &new_val); + DeltaKey key(i, Timestamp(timestamp)); + RowChangeList rcl(buf); + ASSERT_OK_FAST(dfw.AppendDelta(key, rcl)); + ASSERT_OK_FAST(stats.UpdateStats(key.timestamp(), rcl)); + } + } + ASSERT_OK(dfw.WriteDeltaStats(stats)); + ASSERT_OK(dfw.Finish()); + } + + + void DoTestRoundTrip() { + // First write the file. + WriteTestFile(); + + // Then iterate back over it, applying deltas to a fake row block. + for (int i = 0; i < FLAGS_n_verify; i++) { + VerifyTestFile(); + } + } + + Status OpenDeltaFileReader(const BlockId& block_id, shared_ptr* out) { + gscoped_ptr block; + RETURN_NOT_OK(fs_manager_->OpenBlock(block_id, &block)); + return DeltaFileReader::Open(block.Pass(), block_id, out, REDO); + } + + // TODO handle UNDO deltas + Status OpenDeltaFileIterator(const BlockId& block_id, gscoped_ptr* out) { + shared_ptr reader; + RETURN_NOT_OK(OpenDeltaFileReader(block_id, &reader)); + return OpenDeltaFileIteratorFromReader(REDO, reader, out); + } + + Status OpenDeltaFileIteratorFromReader(DeltaType type, + const shared_ptr& reader, + gscoped_ptr* out) { + MvccSnapshot snap = type == REDO ? 
+ MvccSnapshot::CreateSnapshotIncludingAllTransactions() : + MvccSnapshot::CreateSnapshotIncludingNoTransactions(); + DeltaIterator* raw_iter; + RETURN_NOT_OK(reader->NewDeltaIterator(&schema_, snap, &raw_iter)); + out->reset(raw_iter); + return Status::OK(); + } + + void VerifyTestFile() { + shared_ptr reader; + ASSERT_OK(OpenDeltaFileReader(test_block_, &reader)); + ASSERT_EQ(((FLAGS_last_row_to_update - FLAGS_first_row_to_update) / 2) + 1, + reader->delta_stats().update_count_for_col_id(schema_.column_id(0))); + ASSERT_EQ(0, reader->delta_stats().delete_count()); + gscoped_ptr it; + Status s = OpenDeltaFileIteratorFromReader(REDO, reader, &it); + if (s.IsNotFound()) { + FAIL() << "Iterator fell outside of the range of an include-all snapshot"; + } + ASSERT_OK(s); + ASSERT_OK(it->Init(nullptr)); + + RowBlock block(schema_, 100, &arena_); + + // Iterate through the faked table, starting with batches that + // come before all of the updates, and extending a bit further + // past the updates, to ensure that nothing breaks on the boundaries. + ASSERT_OK(it->SeekToOrdinal(0)); + + int start_row = 0; + while (start_row < FLAGS_last_row_to_update + 10000) { + block.ZeroMemory(); + arena_.Reset(); + + ASSERT_OK_FAST(it->PrepareBatch(block.nrows(), DeltaIterator::PREPARE_FOR_APPLY)); + ColumnBlock dst_col = block.column_block(0); + ASSERT_OK_FAST(it->ApplyUpdates(0, &dst_col)); + + for (int i = 0; i < block.nrows(); i++) { + uint32_t row = start_row + i; + bool should_be_updated = (row >= FLAGS_first_row_to_update) && + (row <= FLAGS_last_row_to_update) && + (row % 2 == 0); + + DCHECK_EQ(block.row(i).cell_ptr(0), dst_col.cell_ptr(i)); + uint32_t updated_val = *schema_.ExtractColumnFromRow(block.row(i), 0); + VLOG(2) << "row " << row << ": " << updated_val; + uint32_t expected_val = should_be_updated ? 
row : 0; + // Don't use ASSERT_EQ, since it's slow (records positive results, not just negative) + if (updated_val != expected_val) { + FAIL() << "failed on row " << row << + ": expected " << expected_val << ", got " << updated_val; + } + } + + start_row += block.nrows(); + } + } + + protected: + gscoped_ptr env_; + gscoped_ptr fs_manager_; + Schema schema_; + Arena arena_; + BlockId test_block_; +}; + +TEST_F(TestDeltaFile, TestDumpDeltaFileIterator) { + WriteTestFile(); + + gscoped_ptr it; + Status s = OpenDeltaFileIterator(test_block_, &it); + if (s.IsNotFound()) { + FAIL() << "Iterator fell outside of the range of an include-all snapshot"; + } + ASSERT_OK(s); + vector it_contents; + ASSERT_OK(DebugDumpDeltaIterator(REDO, + it.get(), + schema_, + ITERATE_OVER_ALL_ROWS, + &it_contents)); + for (const string& str : it_contents) { + VLOG(1) << str; + } + ASSERT_TRUE(is_sorted(it_contents.begin(), it_contents.end())); + ASSERT_EQ(it_contents.size(), (FLAGS_last_row_to_update - FLAGS_first_row_to_update) / 2 + 1); +} + +TEST_F(TestDeltaFile, TestWriteDeltaFileIteratorToFile) { + WriteTestFile(); + gscoped_ptr it; + Status s = OpenDeltaFileIterator(test_block_, &it); + if (s.IsNotFound()) { + FAIL() << "Iterator fell outside of the range of an include-all snapshot"; + } + ASSERT_OK(s); + + gscoped_ptr block; + ASSERT_OK(fs_manager_->CreateNewBlock(&block)); + BlockId block_id(block->id()); + DeltaFileWriter dfw(block.Pass()); + ASSERT_OK(dfw.Start()); + ASSERT_OK(WriteDeltaIteratorToFile(it.get(), + ITERATE_OVER_ALL_ROWS, + &dfw)); + ASSERT_OK(dfw.Finish()); + + + // If delta stats are incorrect, then a Status::NotFound would be + // returned. 
+ + ASSERT_OK(OpenDeltaFileIterator(block_id, &it)); + vector it_contents; + ASSERT_OK(DebugDumpDeltaIterator(REDO, + it.get(), + schema_, + ITERATE_OVER_ALL_ROWS, + &it_contents)); + for (const string& str : it_contents) { + VLOG(1) << str; + } + ASSERT_TRUE(is_sorted(it_contents.begin(), it_contents.end())); + ASSERT_EQ(it_contents.size(), (FLAGS_last_row_to_update - FLAGS_first_row_to_update) / 2 + 1); +} + +TEST_F(TestDeltaFile, TestRoundTripTinyDeltaBlocks) { + // Set block size small, so that we get good coverage + // of the case where multiple delta blocks correspond to a + // single underlying data block. + google::FlagSaver saver; + FLAGS_deltafile_default_block_size = 256; + DoTestRoundTrip(); +} + +TEST_F(TestDeltaFile, TestRoundTrip) { + DoTestRoundTrip(); +} + +TEST_F(TestDeltaFile, TestCollectMutations) { + WriteTestFile(); + + { + gscoped_ptr it; + Status s = OpenDeltaFileIterator(test_block_, &it); + if (s.IsNotFound()) { + FAIL() << "Iterator fell outside of the range of an include-all snapshot"; + } + ASSERT_OK(s); + + ASSERT_OK(it->Init(nullptr)); + ASSERT_OK(it->SeekToOrdinal(0)); + + vector mutations; + mutations.resize(100); + + int start_row = 0; + while (start_row < FLAGS_last_row_to_update + 10000) { + std::fill(mutations.begin(), mutations.end(), reinterpret_cast(NULL)); + + arena_.Reset(); + ASSERT_OK_FAST(it->PrepareBatch(mutations.size(), DeltaIterator::PREPARE_FOR_COLLECT)); + ASSERT_OK(it->CollectMutations(&mutations, &arena_)); + + for (int i = 0; i < mutations.size(); i++) { + Mutation *mut_head = mutations[i]; + if (mut_head != nullptr) { + rowid_t row = start_row + i; + string str = Mutation::StringifyMutationList(schema_, mut_head); + VLOG(1) << "Mutation on row " << row << ": " << str; + } + } + + start_row += mutations.size(); + } + } + +} + +TEST_F(TestDeltaFile, TestSkipsDeltasOutOfRange) { + WriteTestFile(10, 20); + shared_ptr reader; + ASSERT_OK(OpenDeltaFileReader(test_block_, &reader)); + + gscoped_ptr iter; + + // should 
skip + MvccSnapshot snap1(Timestamp(9)); + ASSERT_FALSE(snap1.MayHaveCommittedTransactionsAtOrAfter(Timestamp(10))); + DeltaIterator* raw_iter = nullptr; + Status s = reader->NewDeltaIterator(&schema_, snap1, &raw_iter); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(raw_iter == nullptr); + + // should include + raw_iter = nullptr; + MvccSnapshot snap2(Timestamp(15)); + ASSERT_OK(reader->NewDeltaIterator(&schema_, snap2, &raw_iter)); + ASSERT_TRUE(raw_iter != nullptr); + iter.reset(raw_iter); + + // should include + raw_iter = nullptr; + MvccSnapshot snap3(Timestamp(21)); + ASSERT_OK(reader->NewDeltaIterator(&schema_, snap3, &raw_iter)); + ASSERT_TRUE(raw_iter != nullptr); + iter.reset(raw_iter); +} + +TEST_F(TestDeltaFile, TestLazyInit) { + WriteTestFile(); + + // Open it using a "counting" readable block. + gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(test_block_, &block)); + size_t bytes_read = 0; + gscoped_ptr count_block( + new CountingReadableBlock(block.Pass(), &bytes_read)); + + // Lazily opening the delta file should not trigger any reads. + shared_ptr reader; + ASSERT_OK(DeltaFileReader::OpenNoInit( + count_block.Pass(), test_block_, &reader, REDO)); + ASSERT_EQ(0, bytes_read); + + // But initializing it should (only the first time). + ASSERT_OK(reader->Init()); + ASSERT_GT(bytes_read, 0); + size_t bytes_read_after_init = bytes_read; + ASSERT_OK(reader->Init()); + ASSERT_EQ(bytes_read_after_init, bytes_read); + + // And let's test non-lazy open for good measure; it should yield the + // same number of bytes read. 
+ ASSERT_OK(fs_manager_->OpenBlock(test_block_, &block)); + bytes_read = 0; + count_block.reset(new CountingReadableBlock(block.Pass(), &bytes_read)); + ASSERT_OK(DeltaFileReader::Open(count_block.Pass(), test_block_, &reader, REDO)); + ASSERT_EQ(bytes_read_after_init, bytes_read); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/deltafile.cc b/src/kudu/tablet/deltafile.cc new file mode 100644 index 000000000000..79965f03e666 --- /dev/null +++ b/src/kudu/tablet/deltafile.cc @@ -0,0 +1,842 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/deltafile.h" + +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/cfile/binary_plain_block.h" +#include "kudu/cfile/block_encodings.h" +#include "kudu/cfile/block_handle.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/tablet/mutation.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/pb_util.h" + +DECLARE_bool(cfile_lazy_open); +DEFINE_int32(deltafile_default_block_size, 32*1024, + "Block size for delta files. In the future, this may become configurable " + "on a per-table basis."); +TAG_FLAG(deltafile_default_block_size, experimental); + +using std::shared_ptr; + +namespace kudu { + +using cfile::BlockHandle; +using cfile::BlockPointer; +using cfile::IndexTreeIterator; +using cfile::BinaryPlainBlockDecoder; +using cfile::CFileReader; +using fs::ReadableBlock; +using fs::ScopedWritableBlockCloser; +using fs::WritableBlock; + +namespace tablet { + +const char * const DeltaFileReader::kDeltaStatsEntryName = "deltafilestats"; + +namespace { + +} // namespace + +DeltaFileWriter::DeltaFileWriter(gscoped_ptr block) +#ifndef NDEBUG + : has_appended_(false) +#endif +{ // NOLINT(*) + cfile::WriterOptions opts; + opts.write_validx = true; + opts.storage_attributes.cfile_block_size = FLAGS_deltafile_default_block_size; + opts.storage_attributes.encoding = PLAIN_ENCODING; + writer_.reset(new cfile::CFileWriter(opts, GetTypeInfo(BINARY), false, block.Pass())); +} + + +Status DeltaFileWriter::Start() { + return writer_->Start(); +} + +Status DeltaFileWriter::Finish() { + ScopedWritableBlockCloser closer; + RETURN_NOT_OK(FinishAndReleaseBlock(&closer)); + return closer.CloseBlocks(); +} + +Status DeltaFileWriter::FinishAndReleaseBlock(ScopedWritableBlockCloser* closer) { + return 
writer_->FinishAndReleaseBlock(closer); +} + +Status DeltaFileWriter::DoAppendDelta(const DeltaKey &key, + const RowChangeList &delta) { + Slice delta_slice(delta.slice()); + + // See TODO in RowChangeListEncoder::SetToReinsert + CHECK(!delta.is_reinsert()) + << "TODO: REINSERT deltas cannot currently be written to disk " + << "since they don't have a standalone encoded form."; + + tmp_buf_.clear(); + + // Write the encoded form of the key to the file. + key.EncodeTo(&tmp_buf_); + + tmp_buf_.append(delta_slice.data(), delta_slice.size()); + Slice tmp_buf_slice(tmp_buf_); + + return writer_->AppendEntries(&tmp_buf_slice, 1); +} + +template<> +Status DeltaFileWriter::AppendDelta( + const DeltaKey &key, const RowChangeList &delta) { + +#ifndef NDEBUG + // Sanity check insertion order in debug mode. + if (has_appended_) { + DCHECK(last_key_.CompareTo(key) <= 0) + << "must insert redo deltas in sorted order (ascending key, then ascending ts): " + << "got key " << key.ToString() << " after " + << last_key_.ToString(); + } + has_appended_ = true; + last_key_ = key; +#endif + + return DoAppendDelta(key, delta); +} + +template<> +Status DeltaFileWriter::AppendDelta( + const DeltaKey &key, const RowChangeList &delta) { + +#ifndef NDEBUG + // Sanity check insertion order in debug mode. 
+ if (has_appended_) { + DCHECK(last_key_.CompareTo(key) <= 0) + << "must insert undo deltas in sorted order (ascending key, then descending ts): " + << "got key " << key.ToString() << " after " + << last_key_.ToString(); + } + has_appended_ = true; + last_key_ = key; +#endif + + return DoAppendDelta(key, delta); +} + +Status DeltaFileWriter::WriteDeltaStats(const DeltaStats& stats) { + DeltaStatsPB delta_stats_pb; + stats.ToPB(&delta_stats_pb); + + faststring buf; + if (!pb_util::SerializeToString(delta_stats_pb, &buf)) { + return Status::IOError("Unable to serialize DeltaStatsPB", delta_stats_pb.DebugString()); + } + + writer_->AddMetadataPair(DeltaFileReader::kDeltaStatsEntryName, buf.ToString()); + return Status::OK(); +} + + +//////////////////////////////////////////////////////////// +// Reader +//////////////////////////////////////////////////////////// + +Status DeltaFileReader::Open(gscoped_ptr block, + const BlockId& block_id, + shared_ptr* reader_out, + DeltaType delta_type) { + shared_ptr df_reader; + RETURN_NOT_OK(DeltaFileReader::OpenNoInit(block.Pass(), + block_id, &df_reader, delta_type)); + RETURN_NOT_OK(df_reader->Init()); + + *reader_out = df_reader; + return Status::OK(); +} + +Status DeltaFileReader::OpenNoInit(gscoped_ptr block, + const BlockId& block_id, + shared_ptr* reader_out, + DeltaType delta_type) { + gscoped_ptr cf_reader; + RETURN_NOT_OK(CFileReader::OpenNoInit(block.Pass(), + cfile::ReaderOptions(), &cf_reader)); + gscoped_ptr df_reader(new DeltaFileReader(block_id, + cf_reader.release(), + delta_type)); + if (!FLAGS_cfile_lazy_open) { + RETURN_NOT_OK(df_reader->Init()); + } + + reader_out->reset(df_reader.release()); + + return Status::OK(); +} + +DeltaFileReader::DeltaFileReader(BlockId block_id, CFileReader *cf_reader, + DeltaType delta_type) + : reader_(cf_reader), + block_id_(std::move(block_id)), + delta_type_(delta_type) {} + +Status DeltaFileReader::Init() { + return init_once_.Init(&DeltaFileReader::InitOnce, this); +} + 
+Status DeltaFileReader::InitOnce() { + // Fully open the CFileReader if it was lazily opened earlier. + // + // If it's already initialized, this is a no-op. + RETURN_NOT_OK(reader_->Init()); + + if (!reader_->has_validx()) { + return Status::Corruption("file does not have a value index!"); + } + + // Initialize delta file stats + RETURN_NOT_OK(ReadDeltaStats()); + return Status::OK(); +} + +Status DeltaFileReader::ReadDeltaStats() { + string filestats_pb_buf; + if (!reader_->GetMetadataEntry(kDeltaStatsEntryName, &filestats_pb_buf)) { + return Status::Corruption("missing delta stats from the delta file metadata"); + } + + DeltaStatsPB deltastats_pb; + if (!deltastats_pb.ParseFromString(filestats_pb_buf)) { + return Status::Corruption("unable to parse the delta stats protobuf"); + } + gscoped_ptrstats(new DeltaStats()); + RETURN_NOT_OK(stats->InitFromPB(deltastats_pb)); + delta_stats_.swap(stats); + return Status::OK(); +} + +bool DeltaFileReader::IsRelevantForSnapshot(const MvccSnapshot& snap) const { + if (!init_once_.initted()) { + // If we're not initted, it means we have no delta stats and must + // assume that this file is relevant for every snapshot. + return true; + } + if (delta_type_ == REDO) { + return snap.MayHaveCommittedTransactionsAtOrAfter(delta_stats_->min_timestamp()); + } + if (delta_type_ == UNDO) { + return snap.MayHaveUncommittedTransactionsAtOrBefore(delta_stats_->max_timestamp()); + } + LOG(DFATAL) << "Cannot reach here"; + return false; +} + +Status DeltaFileReader::NewDeltaIterator(const Schema *projection, + const MvccSnapshot &snap, + DeltaIterator** iterator) const { + if (IsRelevantForSnapshot(snap)) { + if (VLOG_IS_ON(2)) { + if (!init_once_.initted()) { + VLOG(2) << (delta_type_ == REDO ? 
"REDO" : "UNDO") << " delta " << ToString() + << "has no delta stats" + << ": can't cull for " << snap.ToString(); + } else if (delta_type_ == REDO) { + VLOG(2) << "REDO delta " << ToString() + << " has min ts " << delta_stats_->min_timestamp().ToString() + << ": can't cull for " << snap.ToString(); + } else { + VLOG(2) << "UNDO delta " << ToString() + << " has max ts " << delta_stats_->max_timestamp().ToString() + << ": can't cull for " << snap.ToString(); + } + } + + // Ugly cast, but it lets the iterator fully initialize the reader + // during its first seek. + *iterator = new DeltaFileIterator( + const_cast(this)->shared_from_this(), projection, snap, delta_type_); + return Status::OK(); + } else { + VLOG(2) << "Culling " + << ((delta_type_ == REDO) ? "REDO":"UNDO") + << " delta " << ToString() << " for " << snap.ToString(); + return Status::NotFound("MvccSnapshot outside the range of this delta."); + } +} + +Status DeltaFileReader::CheckRowDeleted(rowid_t row_idx, bool *deleted) const { + MvccSnapshot snap_all(MvccSnapshot::CreateSnapshotIncludingAllTransactions()); + + // TODO: would be nice to avoid allocation here, but we don't want to + // duplicate all the logic from NewDeltaIterator. So, we'll heap-allocate + // for now. + Schema empty_schema; + DeltaIterator* raw_iter; + Status s = NewDeltaIterator(&empty_schema, snap_all, &raw_iter); + if (s.IsNotFound()) { + *deleted = false; + return Status::OK(); + } + RETURN_NOT_OK(s); + + gscoped_ptr iter(raw_iter); + + ScanSpec spec; + RETURN_NOT_OK(iter->Init(&spec)); + RETURN_NOT_OK(iter->SeekToOrdinal(row_idx)); + RETURN_NOT_OK(iter->PrepareBatch(1, DeltaIterator::PREPARE_FOR_APPLY)); + + // TODO: this does an allocation - can we stack-allocate the bitmap + // and make SelectionVector able to "release" its buffer? 
+ SelectionVector sel_vec(1); + sel_vec.SetAllTrue(); + RETURN_NOT_OK(iter->ApplyDeletes(&sel_vec)); + *deleted = !sel_vec.IsRowSelected(0); + return Status::OK(); +} + +uint64_t DeltaFileReader::EstimateSize() const { + return reader_->file_size(); +} + + +//////////////////////////////////////////////////////////// +// DeltaFileIterator +//////////////////////////////////////////////////////////// + +DeltaFileIterator::DeltaFileIterator(shared_ptr dfr, + const Schema *projection, + MvccSnapshot snap, DeltaType delta_type) + : dfr_(std::move(dfr)), + projection_(projection), + mvcc_snap_(std::move(snap)), + prepared_idx_(0xdeadbeef), + prepared_count_(0), + prepared_(false), + exhausted_(false), + initted_(false), + delta_type_(delta_type), + cache_blocks_(CFileReader::CACHE_BLOCK) {} + +Status DeltaFileIterator::Init(ScanSpec *spec) { + DCHECK(!initted_) << "Already initted"; + + if (spec) { + cache_blocks_ = spec->cache_blocks() ? CFileReader::CACHE_BLOCK : + CFileReader::DONT_CACHE_BLOCK; + } + + initted_ = true; + return Status::OK(); +} + +Status DeltaFileIterator::SeekToOrdinal(rowid_t idx) { + DCHECK(initted_) << "Must call Init()"; + + // Finish the initialization of any lazily-initialized state. + RETURN_NOT_OK(dfr_->Init()); + if (!index_iter_) { + index_iter_.reset(IndexTreeIterator::Create( + dfr_->cfile_reader().get(), + dfr_->cfile_reader()->validx_root())); + } + + tmp_buf_.clear(); + DeltaKey(idx, Timestamp(0)).EncodeTo(&tmp_buf_); + Slice key_slice(tmp_buf_); + + Status s = index_iter_->SeekAtOrBefore(key_slice); + if (PREDICT_FALSE(s.IsNotFound())) { + // Seeking to a value before the first value in the file + // will return NotFound, due to the way the index seek + // works. We need to special-case this and have the + // iterator seek all the way down its leftmost branches + // to get the correct result. 
+ s = index_iter_->SeekToFirst(); + } + RETURN_NOT_OK(s); + + prepared_idx_ = idx; + prepared_count_ = 0; + prepared_ = false; + delta_blocks_.clear(); + exhausted_ = false; + return Status::OK(); +} + +Status DeltaFileIterator::ReadCurrentBlockOntoQueue() { + DCHECK(initted_) << "Must call Init()"; + DCHECK(index_iter_) << "Must call SeekToOrdinal()"; + + gscoped_ptr pdb(new PreparedDeltaBlock()); + BlockPointer dblk_ptr = index_iter_->GetCurrentBlockPointer(); + RETURN_NOT_OK(dfr_->cfile_reader()->ReadBlock( + dblk_ptr, cache_blocks_, &pdb->block_)); + + // The data has been successfully read. Finish creating the decoder. + pdb->prepared_block_start_idx_ = 0; + pdb->block_ptr_ = dblk_ptr; + + // Decode the block. + pdb->decoder_.reset(new BinaryPlainBlockDecoder(pdb->block_.data())); + RETURN_NOT_OK(pdb->decoder_->ParseHeader()); + + RETURN_NOT_OK(GetFirstRowIndexInCurrentBlock(&pdb->first_updated_idx_)); + RETURN_NOT_OK(GetLastRowIndexInDecodedBlock(*pdb->decoder_, &pdb->last_updated_idx_)); + + #ifndef NDEBUG + VLOG(2) << "Read delta block which updates " << + pdb->first_updated_idx_ << " through " << + pdb->last_updated_idx_; + #endif + + delta_blocks_.push_back(pdb.release()); + return Status::OK(); +} + +Status DeltaFileIterator::GetFirstRowIndexInCurrentBlock(rowid_t *idx) { + DCHECK(index_iter_) << "Must call SeekToOrdinal()"; + + Slice index_entry = index_iter_->GetCurrentKey(); + DeltaKey k; + RETURN_NOT_OK(k.DecodeFrom(&index_entry)); + *idx = k.row_idx(); + return Status::OK(); +} + +Status DeltaFileIterator::GetLastRowIndexInDecodedBlock(const BinaryPlainBlockDecoder &dec, + rowid_t *idx) { + DCHECK_GT(dec.Count(), 0); + Slice s(dec.string_at_index(dec.Count() - 1)); + DeltaKey k; + RETURN_NOT_OK(k.DecodeFrom(&s)); + *idx = k.row_idx(); + return Status::OK(); +} + + +string DeltaFileIterator::PreparedDeltaBlock::ToString() const { + return StringPrintf("%d-%d (%s)", first_updated_idx_, last_updated_idx_, + block_ptr_.ToString().c_str()); +} + +Status 
DeltaFileIterator::PrepareBatch(size_t nrows, PrepareFlag flag) { + DCHECK(initted_) << "Must call Init()"; + DCHECK(index_iter_) << "Must call SeekToOrdinal()"; + + CHECK_GT(nrows, 0); + + rowid_t start_row = prepared_idx_ + prepared_count_; + rowid_t stop_row = start_row + nrows - 1; + + // Remove blocks from our list which are no longer relevant to the range + // being prepared. + while (!delta_blocks_.empty() && + delta_blocks_.front().last_updated_idx_ < start_row) { + delta_blocks_.pop_front(); + } + + while (!exhausted_) { + rowid_t next_block_rowidx; + RETURN_NOT_OK(GetFirstRowIndexInCurrentBlock(&next_block_rowidx)); + VLOG(2) << "Current delta block starting at row " << next_block_rowidx; + + if (next_block_rowidx > stop_row) { + break; + } + + RETURN_NOT_OK(ReadCurrentBlockOntoQueue()); + + Status s = index_iter_->Next(); + if (s.IsNotFound()) { + exhausted_ = true; + break; + } + RETURN_NOT_OK(s); + } + + if (!delta_blocks_.empty()) { + PreparedDeltaBlock &block = delta_blocks_.front(); + int i = 0; + for (i = block.prepared_block_start_idx_; + i < block.decoder_->Count(); + i++) { + Slice s(block.decoder_->string_at_index(i)); + DeltaKey key; + RETURN_NOT_OK(key.DecodeFrom(&s)); + if (key.row_idx() >= start_row) break; + } + block.prepared_block_start_idx_ = i; + } + + #ifndef NDEBUG + VLOG(2) << "Done preparing deltas for " << start_row << "-" << stop_row + << ": row block spans " << delta_blocks_.size() << " delta blocks"; + #endif + prepared_idx_ = start_row; + prepared_count_ = nrows; + prepared_ = true; + return Status::OK(); +} + +template +Status DeltaFileIterator::VisitMutations(Visitor *visitor) { + DCHECK(prepared_) << "must Prepare"; + + rowid_t start_row = prepared_idx_; + + for (PreparedDeltaBlock &block : delta_blocks_) { + BinaryPlainBlockDecoder &bpd = *block.decoder_; + DVLOG(2) << "Visiting delta block " << block.first_updated_idx_ << "-" + << block.last_updated_idx_ << " for row block starting at " << start_row; + + if 
(PREDICT_FALSE(start_row > block.last_updated_idx_)) { + // The block to be updated completely falls after this delta block: + // <-- delta block --> <-- delta block --> + // <-- block to update --> + // This can happen because we don't know the block's last entry until after + // we queued it in PrepareBatch(). We could potentially remove it at that + // point during the prepare step, but for now just skip it here. + continue; + } + + rowid_t previous_rowidx = MathLimits::kMax; + bool continue_visit = true; + for (int i = block.prepared_block_start_idx_; i < bpd.Count(); i++) { + Slice slice = bpd.string_at_index(i); + + // Decode and check the ID of the row we're going to update. + DeltaKey key; + RETURN_NOT_OK(key.DecodeFrom(&slice)); + rowid_t row_idx = key.row_idx(); + + // Check if the previous visitor notified us we don't need to apply more + // mutations to this row and skip if we don't. + if (row_idx == previous_rowidx && !continue_visit) { + continue; + } else { + previous_rowidx = row_idx; + continue_visit = true; + } + + // Check that the delta is within the block we're currently processing. + if (row_idx >= start_row + prepared_count_) { + // Delta is for a row which comes after the block we're processing. + return Status::OK(); + } else if (row_idx < start_row) { + // Delta is for a row which comes before the block we're processing. + continue; + } + RETURN_NOT_OK(visitor->Visit(key, slice, &continue_visit)); + if (VLOG_IS_ON(3)) { + RowChangeList rcl(slice); + DVLOG(3) << "Visited delta for key: " << key.ToString() << " Mut: " + << rcl.ToString(*projection_) << " Continue?: " + << (continue_visit ? "TRUE" : "FALSE"); + } + } + } + + return Status::OK(); +} + +// Returns whether a REDO mutation with 'timestamp' is relevant under 'snap'. +// If snap cannot include any mutations with a higher timestamp 'continue_visit' is +// set to false, it's set to true otherwise. 
+inline bool IsRedoRelevant(const MvccSnapshot& snap, + const Timestamp& timestamp, + bool* continue_visit) { + *continue_visit = true; + if (!snap.IsCommitted(timestamp)) { + if (!snap.MayHaveCommittedTransactionsAtOrAfter(timestamp)) { + *continue_visit = false; + } + return false; + } + return true; +} + +// Returns whether an UNDO mutation with 'timestamp' is relevant under 'snap'. +// If snap cannot include any mutations with a lower timestamp 'continue_visit' is +// set to false, it's set to true otherwise. +inline bool IsUndoRelevant(const MvccSnapshot& snap, + const Timestamp& timestamp, + bool* continue_visit) { + *continue_visit = true; + if (snap.IsCommitted(timestamp)) { + if (!snap.MayHaveUncommittedTransactionsAtOrBefore(timestamp)) { + *continue_visit = false; + } + return false; + } + return true; +} + +template +struct ApplyingVisitor { + + Status Visit(const DeltaKey &key, const Slice &deltas, bool* continue_visit); + + inline Status ApplyMutation(const DeltaKey &key, const Slice &deltas) { + int64_t rel_idx = key.row_idx() - dfi->prepared_idx_; + DCHECK_GE(rel_idx, 0); + + // TODO: this code looks eerily similar to DMSIterator::ApplyUpdates! + // I bet it can be combined. + + const Schema* schema = dfi->projection_; + RowChangeListDecoder decoder((RowChangeList(deltas))); + RETURN_NOT_OK(decoder.Init()); + if (decoder.is_update()) { + return decoder.ApplyToOneColumn(rel_idx, dst, *schema, col_to_apply, dst->arena()); + } else if (decoder.is_delete()) { + // If it's a DELETE, then it will be processed by DeletingVisitor. 
+ return Status::OK(); + } else { + dfi->FatalUnexpectedDelta(key, deltas, "Expect only UPDATE or DELETE deltas on disk"); + } + return Status::OK(); + } + + DeltaFileIterator *dfi; + size_t col_to_apply; + ColumnBlock *dst; +}; + +template<> +inline Status ApplyingVisitor::Visit(const DeltaKey& key, + const Slice& deltas, + bool* continue_visit) { + if (IsRedoRelevant(dfi->mvcc_snap_, key.timestamp(), continue_visit)) { + DVLOG(3) << "Applied redo delta"; + return ApplyMutation(key, deltas); + } + DVLOG(3) << "Redo delta uncommitted, skipped applying."; + return Status::OK(); +} + +template<> +inline Status ApplyingVisitor::Visit(const DeltaKey& key, + const Slice& deltas, + bool* continue_visit) { + if (IsUndoRelevant(dfi->mvcc_snap_, key.timestamp(), continue_visit)) { + DVLOG(3) << "Applied undo delta"; + return ApplyMutation(key, deltas); + } + DVLOG(3) << "Undo delta committed, skipped applying."; + return Status::OK(); +} + +Status DeltaFileIterator::ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) { + DCHECK_LE(prepared_count_, dst->nrows()); + + if (delta_type_ == REDO) { + DVLOG(3) << "Applying REDO mutations to " << col_to_apply; + ApplyingVisitor visitor = {this, col_to_apply, dst}; + return VisitMutations(&visitor); + } else { + DVLOG(3) << "Applying UNDO mutations to " << col_to_apply; + ApplyingVisitor visitor = {this, col_to_apply, dst}; + return VisitMutations(&visitor); + } +} + +// Visitor which applies deletes to the selection vector. +template +struct DeletingVisitor { + + Status Visit(const DeltaKey &key, const Slice &deltas, bool* continue_visit); + + inline Status ApplyDelete(const DeltaKey &key, const Slice &deltas) { + int64_t rel_idx = key.row_idx() - dfi->prepared_idx_; + DCHECK_GE(rel_idx, 0); + + RowChangeListDecoder decoder((RowChangeList(deltas))); + RETURN_NOT_OK(decoder.Init()); + if (decoder.is_update()) { + DVLOG(3) << "Didn't delete row (update)"; + // If this is an update the row must be selected. 
+ DCHECK(sel_vec->IsRowSelected(rel_idx)); + return Status::OK(); + } else if (decoder.is_delete()) { + DVLOG(3) << "Row deleted"; + sel_vec->SetRowUnselected(rel_idx); + } else { + dfi->FatalUnexpectedDelta(key, deltas, "Expect only UPDATE or DELETE deltas on disk"); + } + return Status::OK(); + } + + DeltaFileIterator *dfi; + SelectionVector *sel_vec; +}; + +template<> +inline Status DeletingVisitor::Visit(const DeltaKey& key, + const Slice& deltas, + bool* continue_visit) { + if (IsRedoRelevant(dfi->mvcc_snap_, key.timestamp(), continue_visit)) { + return ApplyDelete(key, deltas); + } + return Status::OK(); +} + +template<> +inline Status DeletingVisitor::Visit(const DeltaKey& key, + const Slice& deltas, bool* + continue_visit) { + if (IsUndoRelevant(dfi->mvcc_snap_, key.timestamp(), continue_visit)) { + return ApplyDelete(key, deltas); + } + return Status::OK(); +} + + +Status DeltaFileIterator::ApplyDeletes(SelectionVector *sel_vec) { + DCHECK_LE(prepared_count_, sel_vec->nrows()); + if (delta_type_ == REDO) { + DVLOG(3) << "Applying REDO deletes"; + DeletingVisitor visitor = { this, sel_vec}; + return VisitMutations(&visitor); + } else { + DVLOG(3) << "Applying UNDO deletes"; + DeletingVisitor visitor = { this, sel_vec}; + return VisitMutations(&visitor); + } +} + +// Visitor which, for each mutation, appends it into a ColumnBlock of +// Mutation *s. See CollectMutations() +// Each mutation is projected into the iterator schema, if required. 
+template +struct CollectingVisitor { + + Status Visit(const DeltaKey &key, const Slice &deltas, bool* continue_visit); + + Status Collect(const DeltaKey &key, const Slice &deltas) { + int64_t rel_idx = key.row_idx() - dfi->prepared_idx_; + DCHECK_GE(rel_idx, 0); + + RowChangeList changelist(deltas); + Mutation *mutation = Mutation::CreateInArena(dst_arena, key.timestamp(), changelist); + mutation->AppendToList(&dst->at(rel_idx)); + + return Status::OK(); + } + + DeltaFileIterator *dfi; + vector *dst; + Arena *dst_arena; +}; + +template<> +inline Status CollectingVisitor::Visit(const DeltaKey& key, + const Slice& deltas, + bool* continue_visit) { + if (IsRedoRelevant(dfi->mvcc_snap_, key.timestamp(), continue_visit)) { + return Collect(key, deltas); + } + return Status::OK(); +} + +template<> +inline Status CollectingVisitor::Visit(const DeltaKey& key, + const Slice& deltas, bool* + continue_visit) { + if (IsUndoRelevant(dfi->mvcc_snap_, key.timestamp(), continue_visit)) { + return Collect(key, deltas); + } + return Status::OK(); +} + +Status DeltaFileIterator::CollectMutations(vector *dst, Arena *dst_arena) { + DCHECK_LE(prepared_count_, dst->size()); + if (delta_type_ == REDO) { + CollectingVisitor visitor = {this, dst, dst_arena}; + return VisitMutations(&visitor); + } else { + CollectingVisitor visitor = {this, dst, dst_arena}; + return VisitMutations(&visitor); + } +} + +bool DeltaFileIterator::HasNext() { + return !exhausted_ || !delta_blocks_.empty(); +} + +string DeltaFileIterator::ToString() const { + return "DeltaFileIterator(" + dfr_->ToString() + ")"; +} + +struct FilterAndAppendVisitor { + + Status Visit(const DeltaKey& key, const Slice& deltas, bool* continue_visit) { + + // FilterAndAppendVisitor visitor visits all mutations. 
+ *continue_visit = true; + + faststring buf; + RowChangeListEncoder enc(&buf); + RETURN_NOT_OK( + RowChangeListDecoder::RemoveColumnIdsFromChangeList(RowChangeList(deltas), + col_ids, + &enc)); + if (enc.is_initialized()) { + RowChangeList rcl = enc.as_changelist(); + DeltaKeyAndUpdate upd; + upd.key = key; + CHECK(arena->RelocateSlice(rcl.slice(), &upd.cell)); + out->push_back(upd); + } + // if enc.is_initialized() return false, that means deltas only + // contained the specified columns. + return Status::OK(); + } + + const DeltaFileIterator* dfi; + const vector& col_ids; + vector* out; + Arena* arena; +}; + +Status DeltaFileIterator::FilterColumnIdsAndCollectDeltas( + const vector& col_ids, + vector* out, + Arena* arena) { + FilterAndAppendVisitor visitor = {this, col_ids, out, arena}; + return VisitMutations(&visitor); +} + +void DeltaFileIterator::FatalUnexpectedDelta(const DeltaKey &key, const Slice &deltas, + const string &msg) { + LOG(FATAL) << "Saw unexpected delta type in deltafile " << dfr_->ToString() << ": " + << " rcl=" << RowChangeList(deltas).ToString(*projection_) + << " key=" << key.ToString() << " (" << msg << ")"; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/deltafile.h b/src/kudu/tablet/deltafile.h new file mode 100644 index 000000000000..89df8c52b1fa --- /dev/null +++ b/src/kudu/tablet/deltafile.h @@ -0,0 +1,314 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_DELTAFILE_H +#define KUDU_TABLET_DELTAFILE_H + +#include +#include +#include +#include + +#include "kudu/cfile/block_handle.h" +#include "kudu/cfile/cfile_reader.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/index_btree.h" +#include "kudu/common/columnblock.h" +#include "kudu/common/schema.h" +#include "kudu/fs/block_id.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/tablet/deltamemstore.h" +#include "kudu/tablet/delta_key.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/once.h" + +namespace kudu { + +class ScanSpec; + +namespace cfile { +class BinaryPlainBlockDecoder; +} // namespace cfile + +namespace tablet { + +class DeltaFileIterator; +class DeltaKey; +template +struct ApplyingVisitor; +template +struct CollectingVisitor; +template +struct DeletingVisitor; + +class DeltaFileWriter { + public: + // Construct a new delta file writer. + // + // The writer takes ownership of the block and will Close it in Finish(). + explicit DeltaFileWriter(gscoped_ptr block); + + Status Start(); + + // Closes the delta file, including the underlying writable block. + Status Finish(); + + // Closes the delta file, releasing the underlying block to 'closer'. + Status FinishAndReleaseBlock(fs::ScopedWritableBlockCloser* closer); + + // Append a given delta to the file. This must be called in ascending order + // of (key, timestamp) for REDOS and ascending order of key, descending order + // of timestamp for UNDOS. 
+ template + Status AppendDelta(const DeltaKey &key, const RowChangeList &delta); + + Status WriteDeltaStats(const DeltaStats& stats); + + private: + Status DoAppendDelta(const DeltaKey &key, const RowChangeList &delta); + + gscoped_ptr writer_; + + // Buffer used as a temporary for storing the serialized form + // of the deltas + faststring tmp_buf_; + + #ifndef NDEBUG + // The index of the previously written row. + // This is used in debug mode to make sure that rows are appended + // in order. + DeltaKey last_key_; + bool has_appended_; + #endif + + DISALLOW_COPY_AND_ASSIGN(DeltaFileWriter); +}; + +class DeltaFileReader : public DeltaStore, + public std::enable_shared_from_this { + public: + static const char * const kDeltaStatsEntryName; + + // Fully open a delta file using a previously opened block. + // + // After this call, the delta reader is safe for use. + static Status Open(gscoped_ptr file, + const BlockId& block_id, + std::shared_ptr* reader_out, + DeltaType delta_type); + + // Lazily opens a delta file using a previously opened block. A lazy open + // does not incur additional I/O, nor does it validate the contents of + // the delta file. + // + // Init() must be called before using the file's stats. + static Status OpenNoInit(gscoped_ptr file, + const BlockId& block_id, + std::shared_ptr* reader_out, + DeltaType delta_type); + + virtual Status Init() OVERRIDE; + + virtual bool Initted() OVERRIDE { + return init_once_.initted(); + } + + // See DeltaStore::NewDeltaIterator(...) 
+ Status NewDeltaIterator(const Schema *projection, + const MvccSnapshot &snap, + DeltaIterator** iterator) const OVERRIDE; + + // See DeltaStore::CheckRowDeleted + virtual Status CheckRowDeleted(rowid_t row_idx, bool *deleted) const OVERRIDE; + + virtual uint64_t EstimateSize() const OVERRIDE; + + const BlockId& block_id() const { return block_id_; } + + virtual const DeltaStats& delta_stats() const OVERRIDE { + DCHECK(init_once_.initted()); + return *delta_stats_; + } + + virtual std::string ToString() const OVERRIDE { + return reader_->ToString(); + } + + // Returns true if this delta file may include any deltas which need to be + // applied when scanning the given snapshot, or if the file has not yet + // been fully initialized. + bool IsRelevantForSnapshot(const MvccSnapshot& snap) const; + + private: + friend class DeltaFileIterator; + + DISALLOW_COPY_AND_ASSIGN(DeltaFileReader); + + const std::shared_ptr &cfile_reader() const { + return reader_; + } + + DeltaFileReader(BlockId block_id, cfile::CFileReader *cf_reader, + DeltaType delta_type); + + // Callback used in 'init_once_' to initialize this delta file. + Status InitOnce(); + + Status ReadDeltaStats(); + + std::shared_ptr reader_; + gscoped_ptr delta_stats_; + + const BlockId block_id_; + + // The type of this delta, i.e. UNDO or REDO. + const DeltaType delta_type_; + + KuduOnceDynamic init_once_; +}; + +// Iterator over the deltas contained in a delta file. +// +// See DeltaIterator for details. 
+class DeltaFileIterator : public DeltaIterator { + public: + Status Init(ScanSpec *spec) OVERRIDE; + + Status SeekToOrdinal(rowid_t idx) OVERRIDE; + Status PrepareBatch(size_t nrows, PrepareFlag flag) OVERRIDE; + Status ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) OVERRIDE; + Status ApplyDeletes(SelectionVector *sel_vec) OVERRIDE; + Status CollectMutations(vector *dst, Arena *arena) OVERRIDE; + Status FilterColumnIdsAndCollectDeltas(const std::vector& col_ids, + vector* out, + Arena* arena) OVERRIDE; + string ToString() const OVERRIDE; + virtual bool HasNext() OVERRIDE; + + private: + friend class DeltaFileReader; + friend struct ApplyingVisitor; + friend struct ApplyingVisitor; + friend struct CollectingVisitor; + friend struct CollectingVisitor; + friend struct DeletingVisitor; + friend struct DeletingVisitor; + friend struct FilterAndAppendVisitor; + + DISALLOW_COPY_AND_ASSIGN(DeltaFileIterator); + + // PrepareBatch() will read forward all blocks from the deltafile + // which overlap with the block being prepared, enqueueing them onto + // the 'delta_blocks_' deque. The prepared blocks are then used to + // actually apply deltas in ApplyUpdates(). + struct PreparedDeltaBlock { + // The pointer from which this block was read. This is only used for + // logging, etc. + cfile::BlockPointer block_ptr_; + + // Handle to the block, so it doesn't get freed from underneath us. + cfile::BlockHandle block_; + + // The block decoder, to avoid having to re-parse the block header + // on every ApplyUpdates() call + gscoped_ptr decoder_; + + // The first row index for which there is an update in this delta block. + rowid_t first_updated_idx_; + + // The last row index for which there is an update in this delta block. + rowid_t last_updated_idx_; + + // Within this block, the index of the update which is the first one that + // needs to be consulted. 
This allows deltas to be skipped at the beginning + // of the block when the row block starts towards the end of the delta block. + // For example: + // <-- delta block ----> + // <--- prepared row block ---> + // Here, we can skip a bunch of deltas at the beginning of the delta block + // which we know don't apply to the prepared row block. + rowid_t prepared_block_start_idx_; + + // Return a string description of this prepared block, for logging. + string ToString() const; + }; + + + // The passed 'projection' and 'dfr' must remain valid for the lifetime + // of the iterator. + DeltaFileIterator(std::shared_ptr dfr, + const Schema *projection, MvccSnapshot snap, + DeltaType delta_type); + + // Determine the row index of the first update in the block currently + // pointed to by index_iter_. + Status GetFirstRowIndexInCurrentBlock(rowid_t *idx); + + // Determine the last updated row index contained in the given decoded block. + static Status GetLastRowIndexInDecodedBlock( + const cfile::BinaryPlainBlockDecoder &dec, rowid_t *idx); + + // Read the current block of data from the current position in the file + // onto the end of the delta_blocks_ queue. + Status ReadCurrentBlockOntoQueue(); + + // Visit all mutations in the currently prepared row range with the specified + // visitor class. + template + Status VisitMutations(Visitor *visitor); + + // Log a FATAL error message about a bad delta. + void FatalUnexpectedDelta(const DeltaKey &key, const Slice &deltas, const string &msg); + + std::shared_ptr dfr_; + + // Schema used during projection. + const Schema* projection_; + + // The MVCC state which determines which deltas should be applied. + const MvccSnapshot mvcc_snap_; + + gscoped_ptr index_iter_; + + // TODO: add better comments here. + rowid_t prepared_idx_; + uint32_t prepared_count_; + bool prepared_; + bool exhausted_; + bool initted_; + + // After PrepareBatch(), the set of delta blocks in the delta file + // which correspond to prepared_block_. 
+ boost::ptr_deque delta_blocks_; + + // Temporary buffer used in seeking. + faststring tmp_buf_; + + // Temporary buffer used for RowChangeList projection. + faststring delta_buf_; + + // The type of this delta iterator, i.e. UNDO or REDO. + const DeltaType delta_type_; + + CFileReader::CacheControl cache_blocks_; +}; + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/deltamemstore-test.cc b/src/kudu/tablet/deltamemstore-test.cc new file mode 100644 index 000000000000..60a789075e8b --- /dev/null +++ b/src/kudu/tablet/deltamemstore-test.cc @@ -0,0 +1,504 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/deltamemstore.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/mutation.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(benchmark_num_passes, 100, "Number of passes to apply deltas in the benchmark"); + +using std::shared_ptr; +using std::unordered_set; + +namespace kudu { +namespace tablet { + +using fs::WritableBlock; + +class TestDeltaMemStore : public KuduTest { + public: + TestDeltaMemStore() + : op_id_(consensus::MaximumOpId()), + schema_(CreateSchema()), + dms_(new DeltaMemStore(0, 0, new log::LogAnchorRegistry())), + mvcc_(scoped_refptr( + server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp))) { + } + + void SetUp() OVERRIDE { + KuduTest::SetUp(); + + fs_manager_.reset(new FsManager(env_.get(), GetTestPath("fs_root"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + } + + static Schema CreateSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddColumn("col1", STRING)); + CHECK_OK(builder.AddColumn("col2", STRING)); + CHECK_OK(builder.AddColumn("col3", UINT32)); + return builder.Build(); + } + + template + void UpdateIntsAtIndexes(const Iterable &indexes_to_update) { + faststring buf; + RowChangeListEncoder update(&buf); + + for (uint32_t idx_to_update : indexes_to_update) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + update.Reset(); + uint32_t new_val = idx_to_update * 10; + update.AddColumnUpdate(schema_.column(kIntColumn), + schema_.column_id(kIntColumn), &new_val); + + CHECK_OK(dms_->Update(tx.timestamp(), idx_to_update, RowChangeList(buf), op_id_)); + tx.Commit(); + } + } + + void ApplyUpdates(const 
MvccSnapshot &snapshot, + uint32_t row_idx, + size_t col_idx, + ColumnBlock *cb) { + ColumnSchema col_schema(schema_.column(col_idx)); + Schema single_col_projection({ col_schema }, + { schema_.column_id(col_idx) }, + 0); + + DeltaIterator* raw_iter; + Status s = dms_->NewDeltaIterator(&single_col_projection, snapshot, &raw_iter); + if (s.IsNotFound()) { + return; + } + ASSERT_OK(s); + gscoped_ptr iter(raw_iter); + ASSERT_OK(iter->Init(nullptr)); + ASSERT_OK(iter->SeekToOrdinal(row_idx)); + ASSERT_OK(iter->PrepareBatch(cb->nrows(), DeltaIterator::PREPARE_FOR_APPLY)); + ASSERT_OK(iter->ApplyUpdates(0, cb)); + } + + + protected: + static const int kStringColumn = 1; + static const int kIntColumn = 2; + + consensus::OpId op_id_; + + const Schema schema_; + shared_ptr dms_; + MvccManager mvcc_; + gscoped_ptr fs_manager_; +}; + +static void GenerateRandomIndexes(uint32_t range, uint32_t count, + unordered_set *out) { + CHECK_LE(count, range / 2) << + "this will be too slow unless count is much smaller than range"; + out->clear(); + + for (int i = 0; i < count; i++) { + bool inserted = false; + do { + inserted = out->insert(random() % range).second; + } while (!inserted); + } +} + +TEST_F(TestDeltaMemStore, TestUpdateCount) { + uint32_t n_rows = 1000; + faststring update_buf; + + RowChangeListEncoder update(&update_buf); + for (uint32_t idx = 0; idx < n_rows; idx++) { + update.Reset(); + if (idx % 4 == 0) { + char buf[256] = "update buf"; + Slice s(buf); + update.AddColumnUpdate(schema_.column(kStringColumn), + schema_.column_id(kStringColumn), &s); + } + if (idx % 2 == 0) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + uint32_t new_val = idx * 10; + update.AddColumnUpdate(schema_.column(kIntColumn), + schema_.column_id(kIntColumn), &new_val); + ASSERT_OK_FAST(dms_->Update(tx.timestamp(), idx, RowChangeList(update_buf), op_id_)); + tx.Commit(); + } + } + + + // Flush the delta file so that the stats get updated. 
+ gscoped_ptr block; + ASSERT_OK(fs_manager_->CreateNewBlock(&block)); + DeltaFileWriter dfw(block.Pass()); + ASSERT_OK(dfw.Start()); + gscoped_ptr stats; + dms_->FlushToFile(&dfw, &stats); + + ASSERT_EQ(n_rows / 2, stats->update_count_for_col_id(schema_.column_id(kIntColumn))); + ASSERT_EQ(n_rows / 4, stats->update_count_for_col_id(schema_.column_id(kStringColumn))); +} + +TEST_F(TestDeltaMemStore, TestDMSSparseUpdates) { + + int n_rows = 1000; + + // Update 100 random rows out of the 1000. + srand(12345); + unordered_set indexes_to_update; + GenerateRandomIndexes(n_rows, 100, &indexes_to_update); + UpdateIntsAtIndexes(indexes_to_update); + ASSERT_EQ(100, dms_->Count()); + + // Now apply the updates from the DMS back to an array + ScopedColumnBlock read_back(1000); + for (int i = 0; i < 1000; i++) { + read_back[i] = 0xDEADBEEF; + } + MvccSnapshot snap(mvcc_); + ApplyUpdates(snap, 0, kIntColumn, &read_back); + + // And verify that only the rows that we updated are modified within + // the array. + for (int i = 0; i < 1000; i++) { + // If this wasn't one of the ones we updated, expect our marker + if (indexes_to_update.find(i) == indexes_to_update.end()) { + // If this wasn't one of the ones we updated, expect our marker + ASSERT_EQ(0xDEADBEEF, read_back[i]); + } else { + // Otherwise expect the updated value + ASSERT_EQ(i * 10, read_back[i]); + } + } +} + +// Performance test for KUDU-749: zipfian workloads can cause a lot +// of updates to a single row. This benchmark updates a single row many +// times and times how long it takes to apply those updates during +// the read path. 
+TEST_F(TestDeltaMemStore, BenchmarkManyUpdatesToOneRow) { + const int kNumRows = 1000; + const int kNumUpdates = 10000; + const int kIdxToUpdate = 10; + const int kStringDataSize = 1000; + + for (int i = 0; i < kNumUpdates; i++) { + faststring buf; + RowChangeListEncoder update(&buf); + + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + string str(kStringDataSize, 'x'); + Slice s(str); + update.AddColumnUpdate(schema_.column(kStringColumn), + schema_.column_id(kStringColumn), &s); + CHECK_OK(dms_->Update(tx.timestamp(), kIdxToUpdate, RowChangeList(buf), op_id_)); + tx.Commit(); + } + + MvccSnapshot snap(mvcc_); + LOG_TIMING(INFO, "Applying updates") { + for (int i = 0; i < FLAGS_benchmark_num_passes; i++) { + ScopedColumnBlock strings(kNumRows); + for (int i = 0; i < kNumRows; i++) { + strings[i] = Slice(); + } + + ApplyUpdates(snap, 0, kStringColumn, &strings); + } + } +} + +// Test when a slice column has been updated multiple times in the +// memrowset that the referred to values properly end up in the +// right arena. +TEST_F(TestDeltaMemStore, TestReUpdateSlice) { + faststring update_buf; + RowChangeListEncoder update(&update_buf); + + // Update a cell, taking care that the buffer we use to perform + // the update gets cleared after usage. This ensures that the + // underlying data is properly copied into the DMS arena. 
+ { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + char buf[256] = "update 1"; + Slice s(buf); + update.AddColumnUpdate(schema_.column(0), + schema_.column_id(0), &s); + ASSERT_OK_FAST(dms_->Update(tx.timestamp(), 123, RowChangeList(update_buf), op_id_)); + memset(buf, 0xff, sizeof(buf)); + tx.Commit(); + } + MvccSnapshot snapshot_after_first_update(mvcc_); + + // Update the same cell again with a different value + { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + char buf[256] = "update 2"; + Slice s(buf); + update.Reset(); + update.AddColumnUpdate(schema_.column(0), + schema_.column_id(0), &s); + ASSERT_OK_FAST(dms_->Update(tx.timestamp(), 123, RowChangeList(update_buf), op_id_)); + memset(buf, 0xff, sizeof(buf)); + tx.Commit(); + } + MvccSnapshot snapshot_after_second_update(mvcc_); + + // Ensure we end up with a second entry for the cell, at the + // new timestamp + ASSERT_EQ(2, dms_->Count()); + + // Ensure that we ended up with the right data, and that the old MVCC snapshot + // yields the correct old value. + ScopedColumnBlock read_back(1); + ApplyUpdates(snapshot_after_first_update, 123, 0, &read_back); + ASSERT_EQ("update 1", read_back[0].ToString()); + + ApplyUpdates(snapshot_after_second_update, 123, 0, &read_back); + ASSERT_EQ("update 2", read_back[0].ToString()); +} + +// Test that if two updates come in with out-of-order transaction IDs, +// the one with the higher transaction ID ends up winning. +// +// This is important during flushing when updates against the old rowset +// are carried forward, but may fall behind newer transactions. 
+TEST_F(TestDeltaMemStore, TestOutOfOrderTxns) { + faststring update_buf; + RowChangeListEncoder update(&update_buf); + + { + ScopedTransaction tx1(&mvcc_); + ScopedTransaction tx2(&mvcc_); + + tx2.StartApplying(); + Slice s("update 2"); + update.AddColumnUpdate(schema_.column(kStringColumn), + schema_.column_id(kStringColumn), &s); + ASSERT_OK(dms_->Update(tx2.timestamp(), 123, RowChangeList(update_buf), op_id_)); + tx2.Commit(); + + + tx1.StartApplying(); + update.Reset(); + s = Slice("update 1"); + update.AddColumnUpdate(schema_.column(kStringColumn), + schema_.column_id(kStringColumn), &s); + ASSERT_OK(dms_->Update(tx1.timestamp(), 123, RowChangeList(update_buf), op_id_)); + tx1.Commit(); + } + + // Ensure we end up two entries for the cell. + ASSERT_EQ(2, dms_->Count()); + + // Ensure that we ended up with the right data. + ScopedColumnBlock read_back(1); + ApplyUpdates(MvccSnapshot(mvcc_), 123, kStringColumn, &read_back); + ASSERT_EQ("update 2", read_back[0].ToString()); +} + +TEST_F(TestDeltaMemStore, TestDMSBasic) { + faststring update_buf; + RowChangeListEncoder update(&update_buf); + + char buf[256]; + for (uint32_t i = 0; i < 1000; i++) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + update.Reset(); + + uint32_t val = i * 10; + update.AddColumnUpdate(schema_.column(kIntColumn), + schema_.column_id(kIntColumn), &val); + + snprintf(buf, sizeof(buf), "hello %d", i); + Slice s(buf); + update.AddColumnUpdate(schema_.column(kStringColumn), + schema_.column_id(kStringColumn), &s); + + ASSERT_OK_FAST(dms_->Update(tx.timestamp(), i, RowChangeList(update_buf), op_id_)); + tx.Commit(); + } + + ASSERT_EQ(1000, dms_->Count()); + + // Read back the values and check correctness. 
+ MvccSnapshot snap(mvcc_); + ScopedColumnBlock read_back(1000); + ScopedColumnBlock read_back_slices(1000); + ApplyUpdates(snap, 0, kIntColumn, &read_back); + ApplyUpdates(snap, 0, kStringColumn, &read_back_slices); + + // When reading back the slice, do so into a different buffer - + // otherwise if the slice references weren't properly copied above, + // we'd be writing our comparison value into the same buffer that + // we're comparing against! + char buf2[256]; + for (uint32_t i = 0; i < 1000; i++) { + ASSERT_EQ(i * 10, read_back[i]) << "failed at iteration " << i; + snprintf(buf2, sizeof(buf2), "hello %d", i); + Slice s(buf2); + ASSERT_EQ(0, s.compare(read_back_slices[i])); + } + + + // Update the same rows again, with new transactions. Even though + // the same rows are updated, new entries should be added because + // these are separate transactions and we need to maintain the + // old ones for snapshot consistency purposes. + for (uint32_t i = 0; i < 1000; i++) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + update.Reset(); + + uint32_t val = i * 20; + update.AddColumnUpdate(schema_.column(kIntColumn), + schema_.column_id(kIntColumn), &val); + ASSERT_OK_FAST(dms_->Update(tx.timestamp(), i, RowChangeList(update_buf), op_id_)); + tx.Commit(); + } + + ASSERT_EQ(2000, dms_->Count()); +} + +TEST_F(TestDeltaMemStore, TestIteratorDoesUpdates) { + unordered_set to_update; + for (uint32_t i = 0; i < 1000; i++) { + to_update.insert(i); + } + UpdateIntsAtIndexes(to_update); + ASSERT_EQ(1000, dms_->Count()); + + // TODO: test snapshot reads from different points + MvccSnapshot snap(mvcc_); + ScopedColumnBlock block(100); + + DeltaIterator* raw_iter; + Status s = dms_->NewDeltaIterator(&schema_, snap, &raw_iter); + if (s.IsNotFound()) { + FAIL() << "Iterator fell outside of the range of the snapshot"; + } + ASSERT_OK(s); + + gscoped_ptr iter(down_cast(raw_iter)); + ASSERT_OK(iter->Init(nullptr)); + + int block_start_row = 50; + 
ASSERT_OK(iter->SeekToOrdinal(block_start_row)); + ASSERT_OK(iter->PrepareBatch(block.nrows(), DeltaIterator::PREPARE_FOR_APPLY)); + ASSERT_OK(iter->ApplyUpdates(kIntColumn, &block)); + + for (int i = 0; i < 100; i++) { + int actual_row = block_start_row + i; + ASSERT_EQ(actual_row * 10, block[i]) << "at row " << actual_row; + } + + // Apply the next block + block_start_row += block.nrows(); + ASSERT_OK(iter->PrepareBatch(block.nrows(), DeltaIterator::PREPARE_FOR_APPLY)); + ASSERT_OK(iter->ApplyUpdates(kIntColumn, &block)); + for (int i = 0; i < 100; i++) { + int actual_row = block_start_row + i; + ASSERT_EQ(actual_row * 10, block[i]) << "at row " << actual_row; + } +} + +TEST_F(TestDeltaMemStore, TestCollectMutations) { + Arena arena(1024, 1024); + + // Update rows 5 and 12 + vector to_update; + to_update.push_back(5); + to_update.push_back(12); + UpdateIntsAtIndexes(to_update); + + ASSERT_EQ(2, dms_->Count()); + + MvccSnapshot snap(mvcc_); + + const int kBatchSize = 10; + vector mutations; + mutations.resize(kBatchSize); + + DeltaIterator* raw_iter; + Status s = dms_->NewDeltaIterator(&schema_, snap, &raw_iter); + if (s.IsNotFound()) { + FAIL() << "Iterator fell outside of the range of the snapshot"; + } + ASSERT_OK(s); + + gscoped_ptr iter(down_cast(raw_iter)); + + ASSERT_OK(iter->Init(nullptr)); + ASSERT_OK(iter->SeekToOrdinal(0)); + ASSERT_OK(iter->PrepareBatch(kBatchSize, DeltaIterator::PREPARE_FOR_COLLECT)); + ASSERT_OK(iter->CollectMutations(&mutations, &arena)); + + // Only row 5 is updated, everything else should be NULL. + for (int i = 0; i < kBatchSize; i++) { + string str = Mutation::StringifyMutationList(schema_, mutations[i]); + VLOG(1) << "row " << i << ": " << str; + if (i != 5) { + EXPECT_EQ("[]", str); + } else { + EXPECT_EQ("[@1(SET col3=50)]", str); + } + } + + // Collect the next batch of 10. 
+ arena.Reset(); + std::fill(mutations.begin(), mutations.end(), reinterpret_cast(NULL)); + ASSERT_OK(iter->PrepareBatch(kBatchSize, DeltaIterator::PREPARE_FOR_COLLECT)); + ASSERT_OK(iter->CollectMutations(&mutations, &arena)); + + // Only row 2 is updated, everything else should be NULL. + for (int i = 0; i < 10; i++) { + string str = Mutation::StringifyMutationList(schema_, mutations[i]); + VLOG(1) << "row " << i << ": " << str; + if (i != 2) { + EXPECT_EQ("[]", str); + } else { + EXPECT_EQ("[@2(SET col3=120)]", str); + } + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/deltamemstore.cc b/src/kudu/tablet/deltamemstore.cc new file mode 100644 index 000000000000..6e0966952f87 --- /dev/null +++ b/src/kudu/tablet/deltamemstore.cc @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/consensus/consensus.pb.h" +#include "kudu/gutil/port.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/deltamemstore.h" +#include "kudu/tablet/delta_tracker.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/tablet/tablet.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tablet { + +using log::LogAnchorRegistry; +using std::shared_ptr; +using strings::Substitute; + +//////////////////////////////////////////////////////////// +// DeltaMemStore implementation +//////////////////////////////////////////////////////////// + +static const int kInitialArenaSize = 16; +static const int kMaxArenaBufferSize = 5*1024*1024; + +DeltaMemStore::DeltaMemStore(int64_t id, + int64_t rs_id, + LogAnchorRegistry* log_anchor_registry, + const shared_ptr& parent_tracker) + : id_(id), + rs_id_(rs_id), + anchorer_(log_anchor_registry, Substitute("Rowset-$0/DeltaMemStore-$1", rs_id_, id_)), + disambiguator_sequence_number_(0) { + if (parent_tracker) { + CHECK(MemTracker::FindTracker(Tablet::kDMSMemTrackerId, + &mem_tracker_, + parent_tracker)); + } else { + mem_tracker_ = MemTracker::GetRootTracker(); + } + allocator_.reset(new MemoryTrackingBufferAllocator( + HeapBufferAllocator::Get(), mem_tracker_)); + arena_.reset(new ThreadSafeMemoryTrackingArena( + kInitialArenaSize, kMaxArenaBufferSize, allocator_)); + tree_.reset(new DMSTree(arena_)); +} + +Status DeltaMemStore::Init() { + return Status::OK(); +} + +Status DeltaMemStore::Update(Timestamp timestamp, + rowid_t row_idx, + const RowChangeList &update, + const consensus::OpId& op_id) { + DeltaKey key(row_idx, timestamp); + + faststring buf; + + key.EncodeTo(&buf); + + Slice key_slice(buf); + btree::PreparedMutation mutation(key_slice); + mutation.Prepare(tree_.get()); + if (PREDICT_FALSE(mutation.exists())) { + // We already have a delta for this row at the same timestamp. 
+ // Try again with a disambiguating sequence number appended to the key. + int seq = disambiguator_sequence_number_.Increment(); + PutMemcmpableVarint64(&buf, seq); + key_slice = Slice(buf); + mutation.Reset(key_slice); + mutation.Prepare(tree_.get()); + CHECK(!mutation.exists()) + << "Appended a sequence number but still hit a duplicate " + << "for rowid " << row_idx << " at timestamp " << timestamp; + } + if (PREDICT_FALSE(!mutation.Insert(update.slice()))) { + return Status::IOError("Unable to insert into tree"); + } + + anchorer_.AnchorIfMinimum(op_id.index()); + + return Status::OK(); +} + +Status DeltaMemStore::FlushToFile(DeltaFileWriter *dfw, + gscoped_ptr* stats_ret) { + gscoped_ptr stats(new DeltaStats()); + + gscoped_ptr iter(tree_->NewIterator()); + iter->SeekToStart(); + while (iter->IsValid()) { + Slice key_slice, val; + iter->GetCurrentEntry(&key_slice, &val); + DeltaKey key; + RETURN_NOT_OK(key.DecodeFrom(&key_slice)); + RowChangeList rcl(val); + RETURN_NOT_OK_PREPEND(dfw->AppendDelta(key, rcl), "Failed to append delta"); + stats->UpdateStats(key.timestamp(), rcl); + iter->Next(); + } + RETURN_NOT_OK(dfw->WriteDeltaStats(*stats)); + + stats_ret->swap(stats); + return Status::OK(); +} + +Status DeltaMemStore::NewDeltaIterator(const Schema *projection, + const MvccSnapshot &snap, + DeltaIterator** iterator) const { + *iterator = new DMSIterator(shared_from_this(), projection, snap); + return Status::OK(); +} + +Status DeltaMemStore::CheckRowDeleted(rowid_t row_idx, bool *deleted) const { + *deleted = false; + + DeltaKey key(row_idx, Timestamp(0)); + faststring buf; + key.EncodeTo(&buf); + Slice key_slice(buf); + + bool exact; + + // TODO: can we avoid the allocation here? + gscoped_ptr iter(tree_->NewIterator()); + if (!iter->SeekAtOrAfter(key_slice, &exact)) { + return Status::OK(); + } + + while (iter->IsValid()) { + // Iterate forward until reaching an entry with a larger row idx. 
+ Slice key_slice, v; + iter->GetCurrentEntry(&key_slice, &v); + RETURN_NOT_OK(key.DecodeFrom(&key_slice)); + DCHECK_GE(key.row_idx(), row_idx); + if (key.row_idx() != row_idx) break; + + RowChangeList val(v); + // Mutation is for the target row, check deletion status. + RowChangeListDecoder decoder((RowChangeList(v))); + decoder.InitNoSafetyChecks(); + decoder.TwiddleDeleteStatus(deleted); + + iter->Next(); + } + + return Status::OK(); +} + +void DeltaMemStore::DebugPrint() const { + tree_->DebugPrint(); +} + +//////////////////////////////////////////////////////////// +// DMSIterator +//////////////////////////////////////////////////////////// + +DMSIterator::DMSIterator(const shared_ptr& dms, + const Schema* projection, MvccSnapshot snapshot) + : dms_(dms), + mvcc_snapshot_(std::move(snapshot)), + iter_(dms->tree_->NewIterator()), + initted_(false), + prepared_idx_(0), + prepared_count_(0), + prepared_for_(NOT_PREPARED), + seeked_(false), + projection_(projection) {} + +Status DMSIterator::Init(ScanSpec *spec) { + initted_ = true; + return Status::OK(); +} + +Status DMSIterator::SeekToOrdinal(rowid_t row_idx) { + faststring buf; + DeltaKey key(row_idx, Timestamp(0)); + key.EncodeTo(&buf); + + bool exact; /* unused */ + iter_->SeekAtOrAfter(Slice(buf), &exact); + prepared_idx_ = row_idx; + prepared_count_ = 0; + prepared_for_ = NOT_PREPARED; + seeked_ = true; + return Status::OK(); +} + +Status DMSIterator::PrepareBatch(size_t nrows, PrepareFlag flag) { + // This current implementation copies the whole batch worth of deltas + // into a buffer local to this iterator, after filtering out deltas which + // aren't yet committed in the current MVCC snapshot. The theory behind + // this approach is the following: + + // Each batch needs to be processed once per column, meaning that unless + // we make a local copy, we'd have to reset the CBTree iterator back to the + // start of the batch and re-iterate for each column. 
CBTree iterators make + // local copies as they progress in order to shield from concurrent mutation, + // so with N columns, we'd end up making N copies of the data. Making a local + // copy here is instead a single copy of the data, so is likely faster. + CHECK(seeked_); + DCHECK(initted_) << "must init"; + rowid_t start_row = prepared_idx_ + prepared_count_; + rowid_t stop_row = start_row + nrows - 1; + + if (updates_by_col_.empty()) { + updates_by_col_.resize(projection_->num_columns()); + } + for (UpdatesForColumn& ufc : updates_by_col_) { + ufc.clear(); + } + deletes_and_reinserts_.clear(); + prepared_deltas_.clear(); + + while (iter_->IsValid()) { + Slice key_slice, val; + iter_->GetCurrentEntry(&key_slice, &val); + DeltaKey key; + RETURN_NOT_OK(key.DecodeFrom(&key_slice)); + DCHECK_GE(key.row_idx(), start_row); + if (key.row_idx() > stop_row) break; + + if (!mvcc_snapshot_.IsCommitted(key.timestamp())) { + // The transaction which applied this update is not yet committed + // in this iterator's MVCC snapshot. Hence, skip it. + iter_->Next(); + continue; + } + + if (flag == PREPARE_FOR_APPLY) { + RowChangeListDecoder decoder((RowChangeList(val))); + decoder.InitNoSafetyChecks(); + if (decoder.is_delete() || decoder.is_reinsert()) { + DeleteOrReinsert dor; + dor.row_id = key.row_idx(); + dor.exists = decoder.is_reinsert(); + deletes_and_reinserts_.push_back(dor); + } else { + DCHECK(decoder.is_update()); + while (decoder.HasNext()) { + RowChangeListDecoder::DecodedUpdate dec; + RETURN_NOT_OK(decoder.DecodeNext(&dec)); + int col_idx; + const void* col_val; + RETURN_NOT_OK(dec.Validate(*projection_, &col_idx, &col_val)); + if (col_idx == -1) { + // This column isn't being projected. + continue; + } + int col_size = projection_->column(col_idx).type_info()->size(); + + // If we already have an earlier update for the same column, we can + // just overwrite that one. 
+ if (updates_by_col_[col_idx].empty() || + updates_by_col_[col_idx].back().row_id != key.row_idx()) { + updates_by_col_[col_idx].push_back(ColumnUpdate()); + } + + ColumnUpdate& cu = updates_by_col_[col_idx].back(); + cu.row_id = key.row_idx(); + if (col_val == nullptr) { + cu.new_val_ptr = nullptr; + } else { + memcpy(cu.new_val_buf, col_val, col_size); + // NOTE: we're constructing a pointer here to an element inside the deque. + // This is safe because deques never invalidate pointers to their elements. + cu.new_val_ptr = cu.new_val_buf; + } + } + } + } else { + DCHECK_EQ(flag, PREPARE_FOR_COLLECT); + PreparedDelta d; + d.key = key; + d.val = val; + prepared_deltas_.push_back(d); + } + + iter_->Next(); + } + prepared_idx_ = start_row; + prepared_count_ = nrows; + prepared_for_ = flag == PREPARE_FOR_APPLY ? PREPARED_FOR_APPLY : PREPARED_FOR_COLLECT; + return Status::OK(); +} + +Status DMSIterator::ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) { + DCHECK_EQ(prepared_for_, PREPARED_FOR_APPLY); + DCHECK_EQ(prepared_count_, dst->nrows()); + + const ColumnSchema* col_schema = &projection_->column(col_to_apply); + for (const ColumnUpdate& cu : updates_by_col_[col_to_apply]) { + int32_t idx_in_block = cu.row_id - prepared_idx_; + DCHECK_GE(idx_in_block, 0); + SimpleConstCell src(col_schema, cu.new_val_ptr); + ColumnBlock::Cell dst_cell = dst->cell(idx_in_block); + RETURN_NOT_OK(CopyCell(src, &dst_cell, dst->arena())); + } + + return Status::OK(); +} + + +Status DMSIterator::ApplyDeletes(SelectionVector *sel_vec) { + DCHECK_EQ(prepared_for_, PREPARED_FOR_APPLY); + DCHECK_EQ(prepared_count_, sel_vec->nrows()); + + for (const DeleteOrReinsert& dor : deletes_and_reinserts_) { + uint32_t idx_in_block = dor.row_id - prepared_idx_; + if (!dor.exists) { + sel_vec->SetRowUnselected(idx_in_block); + } + } + + return Status::OK(); +} + + +Status DMSIterator::CollectMutations(vector *dst, Arena *arena) { + DCHECK_EQ(prepared_for_, PREPARED_FOR_COLLECT); + for (const 
PreparedDelta& src : prepared_deltas_) { + DeltaKey key = src.key;; + RowChangeList changelist(src.val); + uint32_t rel_idx = key.row_idx() - prepared_idx_; + + Mutation *mutation = Mutation::CreateInArena(arena, key.timestamp(), changelist); + mutation->AppendToList(&dst->at(rel_idx)); + } + return Status::OK(); +} + +Status DMSIterator::FilterColumnIdsAndCollectDeltas(const vector& col_ids, + vector* out, + Arena* arena) { + LOG(DFATAL) << "Attempt to call FilterColumnIdsAndCollectDeltas on DMS" << GetStackTrace(); + return Status::InvalidArgument("FilterColumsAndAppend() is not supported by DMSIterator"); +} + +bool DMSIterator::HasNext() { + // TODO implement this if we ever want to include DeltaMemStore in minor + // delta compaction. + LOG(FATAL) << "Unimplemented"; + return false; +} + +string DMSIterator::ToString() const { + return "DMSIterator"; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/deltamemstore.h b/src/kudu/tablet/deltamemstore.h new file mode 100644 index 000000000000..cd81875c0cc7 --- /dev/null +++ b/src/kudu/tablet/deltamemstore.h @@ -0,0 +1,280 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_DELTAMEMSTORE_H +#define KUDU_TABLET_DELTAMEMSTORE_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/columnblock.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/tablet/concurrent_btree.h" +#include "kudu/tablet/delta_key.h" +#include "kudu/tablet/delta_tracker.h" +#include "kudu/tablet/delta_stats.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/atomic.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { + +class MemTracker; +class RowChangeList; + +namespace tablet { + +class DeltaFileWriter; +class DeltaStats; +class DMSIterator; +class Mutation; + +struct DMSTreeTraits : public btree::BTreeTraits { + typedef ThreadSafeMemoryTrackingArena ArenaType; +}; + +// In-memory storage for data which has been recently updated. +// This essentially tracks a 'diff' per row, which contains the +// modified columns. + +class DeltaMemStore : public DeltaStore, + public std::enable_shared_from_this { + public: + DeltaMemStore(int64_t id, int64_t rs_id, + log::LogAnchorRegistry* log_anchor_registry, + const std::shared_ptr& parent_tracker = std::shared_ptr()); + + virtual Status Init() OVERRIDE; + + virtual bool Initted() OVERRIDE { + return true; + } + + // Update the given row in the database. + // Copies the data, as well as any referenced values into this DMS's local + // arena. + Status Update(Timestamp timestamp, rowid_t row_idx, + const RowChangeList &update, + const consensus::OpId& op_id); + + size_t Count() const { + return tree_->count(); + } + + bool Empty() const { + return tree_->empty(); + } + + // Dump a debug version of the tree to the logs. This is not thread-safe, so + // is only really useful in unit tests. + void DebugPrint() const; + + // Flush the DMS to the given file writer. 
+ // Returns statistics in *stats. + Status FlushToFile(DeltaFileWriter *dfw, + gscoped_ptr* stats); + + // Create an iterator for applying deltas from this DMS. + // + // The projection passed here must be the same as the schema of any + // RowBlocks which are passed in, or else bad things will happen. + // + // 'snapshot' is the MVCC state which determines which transactions + // should be considered committed (and thus applied by the iterator). + // + // Returns Status::OK and sets 'iterator' to the new DeltaIterator, or + // returns Status::NotFound if the mutations within this delta store + // cannot include 'snap'. + virtual Status NewDeltaIterator(const Schema *projection, + const MvccSnapshot &snap, + DeltaIterator** iterator) const OVERRIDE; + + virtual Status CheckRowDeleted(rowid_t row_idx, bool *deleted) const OVERRIDE; + + virtual uint64_t EstimateSize() const OVERRIDE { + return memory_footprint(); + } + + const int64_t id() const { return id_; } + + typedef btree::CBTree DMSTree; + typedef btree::CBTreeIterator DMSTreeIter; + + size_t memory_footprint() const { + return arena_->memory_footprint(); + } + + virtual std::string ToString() const OVERRIDE { + return "DMS"; + } + + // Get the minimum log index for this DMS, -1 if it wasn't set. + int64_t MinLogIndex() const { + return anchorer_.minimum_log_index(); + } + + // The returned stats will always be empty, and the number of columns unset. + virtual const DeltaStats& delta_stats() const OVERRIDE { + return delta_stats_; + } + + private: + friend class DMSIterator; + + const DMSTree& tree() const { + return *tree_; + } + + const int64_t id_; // DeltaMemStore ID. + const int64_t rs_id_; // Rowset ID. 
+ + std::shared_ptr mem_tracker_; + std::shared_ptr allocator_; + + std::shared_ptr arena_; + + // Concurrent B-Tree storing -> RowChangeList + gscoped_ptr tree_; + + log::MinLogIndexAnchorer anchorer_; + + const DeltaStats delta_stats_; + + // It's possible for multiple mutations to apply to the same row + // in the same timestamp (e.g. if a batch contains multiple updates for that + // row). In that case, we need to append a sequence number to the delta key + // in the underlying tree, so that the later operations will sort after + // the earlier ones. This atomic integer serves to provide such a sequence + // number, and is only used in the case that such a collision occurs. + AtomicInt disambiguator_sequence_number_; + + DISALLOW_COPY_AND_ASSIGN(DeltaMemStore); +}; + +// Iterator over the deltas currently in the delta memstore. +// This iterator is a wrapper around the underlying tree iterator +// which snapshots sets of deltas on a per-block basis, and allows +// the caller to then apply the deltas column-by-column. This supports +// column-by-column predicate evaluation, and lazily loading columns +// only after predicates have passed. +// +// See DeltaStore for more details on usage and the implemented +// functions. 
+class DMSIterator : public DeltaIterator { + public: + Status Init(ScanSpec *spec) OVERRIDE; + + Status SeekToOrdinal(rowid_t row_idx) OVERRIDE; + + Status PrepareBatch(size_t nrows, PrepareFlag flag) OVERRIDE; + + Status ApplyUpdates(size_t col_to_apply, ColumnBlock *dst) OVERRIDE; + + Status ApplyDeletes(SelectionVector *sel_vec) OVERRIDE; + + Status CollectMutations(vector *dst, Arena *arena) OVERRIDE; + + Status FilterColumnIdsAndCollectDeltas(const vector& col_ids, + vector* out, + Arena* arena) OVERRIDE; + + string ToString() const OVERRIDE; + + virtual bool HasNext() OVERRIDE; + + private: + DISALLOW_COPY_AND_ASSIGN(DMSIterator); + FRIEND_TEST(TestDeltaMemStore, TestIteratorDoesUpdates); + FRIEND_TEST(TestDeltaMemStore, TestCollectMutations); + friend class DeltaMemStore; + + // Initialize the iterator. + // The projection passed here must be the same as the schema of any + // RowBlocks which are passed in, or else bad things will happen. + // The pointer must also remain valid for the lifetime of the iterator. + DMSIterator(const std::shared_ptr &dms, + const Schema *projection, MvccSnapshot snapshot); + + const std::shared_ptr dms_; + + // MVCC state which allows us to ignore uncommitted transactions. + const MvccSnapshot mvcc_snapshot_; + + gscoped_ptr iter_; + + bool initted_; + + // The index at which the last PrepareBatch() call was made + rowid_t prepared_idx_; + + // The number of rows for which the last PrepareBatch() call was made + uint32_t prepared_count_; + + // Whether there are prepared blocks built through PrepareBatch(). + enum PreparedFor { + NOT_PREPARED, + PREPARED_FOR_APPLY, + PREPARED_FOR_COLLECT + }; + PreparedFor prepared_for_; + + // True if SeekToOrdinal() been called at least once. + bool seeked_; + + // The schema of the row blocks that will be passed to PrepareBatch(), etc. 
+ const Schema* projection_; + + // State when prepared_for_ == PREPARED_FOR_APPLY + // ------------------------------------------------------------ + struct ColumnUpdate { + rowid_t row_id; + void* new_val_ptr; + uint8_t new_val_buf[16]; + }; + typedef std::deque UpdatesForColumn; + std::vector updates_by_col_; + struct DeleteOrReinsert { + rowid_t row_id; + bool exists; + }; + std::deque deletes_and_reinserts_; + + // State when prepared_for_ == PREPARED_FOR_COLLECT + // ------------------------------------------------------------ + struct PreparedDelta { + DeltaKey key; + Slice val; + }; + std::deque prepared_deltas_; + + // Temporary buffer used for RowChangeList projection. + faststring delta_buf_; + +}; + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/diskrowset-test-base.h b/src/kudu/tablet/diskrowset-test-base.h new file mode 100644 index 000000000000..1ef1b80505ed --- /dev/null +++ b/src/kudu/tablet/diskrowset-test-base.h @@ -0,0 +1,338 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_LAYER_TEST_BASE_H +#define KUDU_TABLET_LAYER_TEST_BASE_H + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/scan_spec.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/util/env.h" +#include "kudu/util/logging.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(roundtrip_num_rows, 10000, + "Number of rows to use for the round-trip test"); +DEFINE_int32(n_read_passes, 10, + "number of times to read data for perf test"); + +namespace kudu { +namespace tablet { + +using std::unordered_set; + +class TestRowSet : public KuduRowSetTest { + public: + TestRowSet() + : KuduRowSetTest(CreateTestSchema()), + n_rows_(FLAGS_roundtrip_num_rows), + op_id_(consensus::MaximumOpId()), + mvcc_(scoped_refptr( + server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp))) { + CHECK_GT(n_rows_, 0); + } + + protected: + static Schema CreateTestSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddKeyColumn("key", STRING)); + CHECK_OK(builder.AddColumn("val", UINT32)); + return builder.BuildWithoutIds(); + } + + static Schema CreateProjection(const Schema& schema, + const vector& cols) { + vector col_schemas; + vector col_ids; + for (const string& col : cols) { + int idx = schema.find_column(col); + CHECK_GE(idx, 0); + col_schemas.push_back(schema.column(idx)); + col_ids.push_back(schema.column_id(idx)); + } + return Schema(col_schemas, col_ids, 0); + } + + void BuildRowKey(RowBuilder *rb, int row_idx) { + char buf[256]; + FormatKey(row_idx, buf, sizeof(buf)); + rb->AddString(Slice(buf)); + } + + // Write out a test rowset with n_rows_ 
rows. + // The data in the rowset looks like: + // ("hello <00n>", ) + // ... where n is the index of the row in the rowset + // or 0 if 'zero_vals' is true. + // The string values are padded out to 15 digits + void WriteTestRowSet(int n_rows = 0, bool zero_vals = false) { + DiskRowSetWriter drsw(rowset_meta_.get(), &schema_, + BloomFilterSizing::BySizeAndFPRate(32*1024, 0.01f)); + DoWriteTestRowSet(n_rows, &drsw, zero_vals); + } + + template + void DoWriteTestRowSet(int n_rows, WriterClass *writer, + bool zero_vals = false) { + if (n_rows == 0) { + n_rows = n_rows_; + } + + // Write rows into a new DiskRowSet. + LOG_TIMING(INFO, "Writing rowset") { + CHECK_OK(writer->Open()); + + char buf[256]; + RowBuilder rb(schema_); + for (int i = 0; i < n_rows; i++) { + CHECK_OK(writer->RollIfNecessary()); + rb.Reset(); + FormatKey(i, buf, sizeof(buf)); + rb.AddString(Slice(buf)); + rb.AddUint32(zero_vals ? 0 : i); + CHECK_OK(WriteRow(rb.data(), writer)); + } + CHECK_OK(writer->Finish()); + } + } + + // Picks some number of rows from the given rowset and updates + // them. Stores the indexes of the updated rows in *updated. + void UpdateExistingRows(DiskRowSet *rs, float update_ratio, + unordered_set *updated) { + int to_update = static_cast(n_rows_ * update_ratio); + faststring update_buf; + RowChangeListEncoder update(&update_buf); + for (int i = 0; i < to_update; i++) { + uint32_t idx_to_update = random() % n_rows_; + uint32_t new_val = idx_to_update * 5; + update.Reset(); + update.AddColumnUpdate(schema_.column(1), schema_.column_id(1), &new_val); + OperationResultPB result; + CHECK_OK(MutateRow(rs, + idx_to_update, + RowChangeList(update_buf), + &result)); + CHECK_EQ(1, result.mutated_stores_size()); + CHECK_EQ(rs->metadata()->id(), result.mutated_stores(0).rs_id()); + if (updated != NULL) { + updated->insert(idx_to_update); + } + } + } + + // Delete the row with the given identifier. 
+ Status DeleteRow(DiskRowSet *rs, uint32_t row_idx, OperationResultPB* result) { + faststring update_buf; + RowChangeListEncoder update(&update_buf); + update.Reset(); + update.SetToDelete(); + + return MutateRow(rs, row_idx, RowChangeList(update_buf), result); + } + + Status UpdateRow(DiskRowSet *rs, + uint32_t row_idx, + uint32_t new_val, + OperationResultPB* result) { + faststring update_buf; + RowChangeListEncoder update(&update_buf); + update.Reset(); + update.AddColumnUpdate(schema_.column(1), schema_.column_id(1), &new_val); + + return MutateRow(rs, row_idx, RowChangeList(update_buf), result); + } + + // Mutate the given row. + Status MutateRow(DiskRowSet *rs, + uint32_t row_idx, + const RowChangeList &mutation, + OperationResultPB* result) { + RowBuilder rb(schema_.CreateKeyProjection()); + BuildRowKey(&rb, row_idx); + RowSetKeyProbe probe(rb.row()); + + ProbeStats stats; + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + Status s = rs->MutateRow(tx.timestamp(), probe, mutation, op_id_, &stats, result); + tx.Commit(); + return s; + } + + Status CheckRowPresent(const DiskRowSet &rs, uint32_t row_idx, bool *present) { + RowBuilder rb(schema_.CreateKeyProjection()); + BuildRowKey(&rb, row_idx); + RowSetKeyProbe probe(rb.row()); + ProbeStats stats; + return rs.CheckRowPresent(probe, present, &stats); + } + + // Verify the contents of the given rowset. + // Updated rows (those whose index is present in 'updated') should have + // a 'val' column equal to idx*5. + // Other rows should have val column equal to idx. 
+ void VerifyUpdates(const DiskRowSet &rs, const unordered_set &updated) { + LOG_TIMING(INFO, "Reading updated rows with row iter") { + VerifyUpdatesWithRowIter(rs, updated); + } + } + + void VerifyUpdatesWithRowIter(const DiskRowSet &rs, + const unordered_set &updated) { + Schema proj_val = CreateProjection(schema_, { "val" }); + MvccSnapshot snap = MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + gscoped_ptr row_iter; + CHECK_OK(rs.NewRowIterator(&proj_val, snap, &row_iter)); + CHECK_OK(row_iter->Init(NULL)); + Arena arena(1024, 1024*1024); + int batch_size = 10000; + RowBlock dst(proj_val, batch_size, &arena); + + int i = 0; + while (row_iter->HasNext()) { + arena.Reset(); + CHECK_OK(row_iter->NextBlock(&dst)); + VerifyUpdatedBlock(proj_val.ExtractColumnFromRow(dst.row(0), 0), + i, dst.nrows(), updated); + i += dst.nrows(); + } + } + + void VerifyUpdatedBlock(const uint32_t *from_file, int start_row, size_t n_rows, + const unordered_set &updated) { + for (int j = 0; j < n_rows; j++) { + uint32_t idx_in_file = start_row + j; + int expected; + if (updated.count(idx_in_file) > 0) { + expected = idx_in_file * 5; + } else { + expected = idx_in_file; + } + + if (from_file[j] != expected) { + FAIL() << "Incorrect value at idx " << idx_in_file + << ": expected=" << expected << " got=" << from_file[j]; + } + } + } + + // Perform a random read of the given row key, + // asserting that the result matches 'expected_val'. 
+ void VerifyRandomRead(const DiskRowSet& rs, const Slice& row_key, + const string& expected_val) { + Arena arena(256, 1024); + ScanSpec spec; + ColumnRangePredicate pred(schema_.column(0), &row_key, &row_key); + spec.AddPredicate(pred); + RangePredicateEncoder enc(&schema_, &arena); + enc.EncodeRangePredicates(&spec, true); + + MvccSnapshot snap = MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + gscoped_ptr row_iter; + CHECK_OK(rs.NewRowIterator(&schema_, snap, &row_iter)); + CHECK_OK(row_iter->Init(&spec)); + vector rows; + IterateToStringList(row_iter.get(), &rows); + string result = JoinStrings(rows, "\n"); + ASSERT_EQ(expected_val, result); + } + + // Iterate over a DiskRowSet, dumping occasional rows to the console, + // using the given schema as a projection. + static void IterateProjection(const DiskRowSet &rs, const Schema &schema, + int expected_rows, bool do_log = true) { + MvccSnapshot snap = MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + gscoped_ptr row_iter; + CHECK_OK(rs.NewRowIterator(&schema, snap, &row_iter)); + CHECK_OK(row_iter->Init(NULL)); + + int batch_size = 1000; + Arena arena(1024, 1024*1024); + RowBlock dst(schema, batch_size, &arena); + + int i = 0; + int log_interval = expected_rows/20 / batch_size; + while (row_iter->HasNext()) { + arena.Reset(); + CHECK_OK(row_iter->NextBlock(&dst)); + i += dst.nrows(); + + if (do_log) { + KLOG_EVERY_N(INFO, log_interval) << "Got row: " << schema.DebugRow(dst.row(0)); + } + } + + EXPECT_EQ(expected_rows, i); + } + + void BenchmarkIterationPerformance(const DiskRowSet &rs, + const string &log_message) { + Schema proj_val = CreateProjection(schema_, { "val" }); + LOG_TIMING(INFO, log_message + " (val column only)") { + for (int i = 0; i < FLAGS_n_read_passes; i++) { + IterateProjection(rs, proj_val, n_rows_, false); + } + } + + Schema proj_key = CreateProjection(schema_, { "key" }); + LOG_TIMING(INFO, log_message + " (key string column only)") { + for (int i = 0; i < 
FLAGS_n_read_passes; i++) { + IterateProjection(rs, proj_key, n_rows_, false); + } + } + + LOG_TIMING(INFO, log_message + " (both columns)") { + for (int i = 0; i < FLAGS_n_read_passes; i++) { + IterateProjection(rs, schema_, n_rows_, false); + } + } + } + + Status OpenTestRowSet(std::shared_ptr *rowset) { + return DiskRowSet::Open(rowset_meta_, new log::LogAnchorRegistry(), rowset); + } + + void FormatKey(int i, char *buf, size_t buf_len) { + snprintf(buf, buf_len, "hello %015d", i); + } + + size_t n_rows_; + consensus::OpId op_id_; // Generally a "fake" OpId for these tests. + MvccManager mvcc_; +}; + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/diskrowset-test.cc b/src/kudu/tablet/diskrowset-test.cc new file mode 100644 index 000000000000..657bb4252359 --- /dev/null +++ b/src/kudu/tablet/diskrowset-test.cc @@ -0,0 +1,540 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/algorithm.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/tablet/delta_compaction.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/diskrowset-test-base.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/util/env.h" +#include "kudu/util/status.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" + +DEFINE_double(update_fraction, 0.1f, "fraction of rows to update"); +DECLARE_bool(cfile_lazy_open); +DECLARE_int32(cfile_default_block_size); +DECLARE_double(tablet_delta_store_major_compact_min_ratio); +DECLARE_int32(tablet_delta_store_minor_compact_max); + +using std::is_sorted; +using std::shared_ptr; +using std::unordered_set; + +namespace kudu { +namespace tablet { + +// TODO: add test which calls CopyNextRows on an iterator with no more +// rows - i think it segfaults! + +// Test round-trip writing and reading back a rowset with +// multiple columns. Does not test any modifications. +TEST_F(TestRowSet, TestRowSetRoundTrip) { + WriteTestRowSet(); + + // Now open the DiskRowSet for read + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // First iterate over all columns + LOG_TIMING(INFO, "Iterating over all columns") { + IterateProjection(*rs, schema_, n_rows_); + } + + // Now iterate only over the key column + Schema proj_key; + ASSERT_OK(schema_.CreateProjectionByNames({ "key" }, &proj_key)); + + LOG_TIMING(INFO, "Iterating over only key column") { + IterateProjection(*rs, proj_key, n_rows_); + } + + + // Now iterate only over the non-key column + Schema proj_val; + ASSERT_OK(schema_.CreateProjectionByNames({ "val" }, &proj_val)); + LOG_TIMING(INFO, "Iterating over only val column") { + IterateProjection(*rs, proj_val, n_rows_); + } + + // Test that CheckRowPresent returns correct results + ProbeStats stats; + + // 1. 
Check a key which comes before all keys in rowset + { + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(Slice("h")); + RowSetKeyProbe probe(rb.row()); + bool present; + ASSERT_OK(rs->CheckRowPresent(probe, &present, &stats)); + ASSERT_FALSE(present); + } + + // 2. Check a key which comes after all keys in rowset + { + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(Slice("z")); + RowSetKeyProbe probe(rb.row()); + bool present; + ASSERT_OK(rs->CheckRowPresent(probe, &present, &stats)); + ASSERT_FALSE(present); + } + + // 3. Check a key which is not present, but comes between present + // keys + { + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(Slice("hello 00000000000049x")); + RowSetKeyProbe probe(rb.row()); + bool present; + ASSERT_OK(rs->CheckRowPresent(probe, &present, &stats)); + ASSERT_FALSE(present); + } + + // 4. Check a key which is present + { + char buf[256]; + RowBuilder rb(schema_.CreateKeyProjection()); + FormatKey(49, buf, sizeof(buf)); + rb.AddString(Slice(buf)); + RowSetKeyProbe probe(rb.row()); + bool present; + ASSERT_OK(rs->CheckRowPresent(probe, &present, &stats)); + ASSERT_TRUE(present); + } +} + +// Test writing a rowset, and then updating some rows in it. +TEST_F(TestRowSet, TestRowSetUpdate) { + WriteTestRowSet(); + + // Now open the DiskRowSet for read + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Add an update to the delta tracker for a number of keys + // which exist. 
These updates will change the value to + // equal idx*5 (whereas in the original data, value = idx) + unordered_set updated; + UpdateExistingRows(rs.get(), FLAGS_update_fraction, &updated); + ASSERT_EQ(static_cast(n_rows_ * FLAGS_update_fraction), + rs->delta_tracker_->dms_->Count()); + + // Try to add a mutation for a key not in the file (but which falls + // between two valid keys) + faststring buf; + RowChangeListEncoder enc(&buf); + enc.SetToDelete(); + + Timestamp timestamp(0); + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(Slice("hello 00000000000049x")); + RowSetKeyProbe probe(rb.row()); + + OperationResultPB result; + ProbeStats stats; + Status s = rs->MutateRow(timestamp, probe, enc.as_changelist(), op_id_, &stats, &result); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(0, result.mutated_stores_size()); + + // Now read back the value column, and verify that the updates + // are visible. + VerifyUpdates(*rs, updated); +} + +TEST_F(TestRowSet, TestRandomRead) { + // Write 100 rows. + WriteTestRowSet(100); + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Read un-updated row. + VerifyRandomRead(*rs, "hello 000000000000050", + "(string key=hello 000000000000050, uint32 val=50)"); + NO_FATALS(); + + // Update the row. + OperationResultPB result; + ASSERT_OK(UpdateRow(rs.get(), 50, 12345, &result)); + + // Read it again -- should see the updated value. + VerifyRandomRead(*rs, "hello 000000000000050", + "(string key=hello 000000000000050, uint32 val=12345)"); + NO_FATALS(); + + // Try to read a row which comes before the first key. + // This should return no rows. + VerifyRandomRead(*rs, "aaaaa", ""); + NO_FATALS(); + + // Same with a row which falls between keys. + VerifyRandomRead(*rs, "hello 000000000000050_between_keys", ""); + NO_FATALS(); + + // And a row which falls after the last key. + VerifyRandomRead(*rs, "hello 000000000000101", ""); + NO_FATALS(); +} + +// Test Delete() support within a DiskRowSet. 
+TEST_F(TestRowSet, TestDelete) { + // Write and open a DiskRowSet with 2 rows. + WriteTestRowSet(2); + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + MvccSnapshot snap_before_delete(mvcc_); + + // Delete one of the two rows + OperationResultPB result; + ASSERT_OK(DeleteRow(rs.get(), 0, &result)); + ASSERT_EQ(1, result.mutated_stores_size()); + ASSERT_EQ(0L, result.mutated_stores(0).rs_id()); + ASSERT_EQ(0L, result.mutated_stores(0).dms_id()); + MvccSnapshot snap_after_delete(mvcc_); + + vector rows; + Status s; + + for (int i = 0; i < 2; i++) { + // Reading the MVCC snapshot prior to deletion should show the row. + ASSERT_OK(DumpRowSet(*rs, schema_, snap_before_delete, &rows)); + ASSERT_EQ(2, rows.size()); + EXPECT_EQ("(string key=hello 000000000000000, uint32 val=0)", rows[0]); + EXPECT_EQ("(string key=hello 000000000000001, uint32 val=1)", rows[1]); + + // Reading the MVCC snapshot after the deletion should hide the row. + ASSERT_OK(DumpRowSet(*rs, schema_, snap_after_delete, &rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ("(string key=hello 000000000000001, uint32 val=1)", rows[0]); + + // Trying to delete or update the same row again should fail. + OperationResultPB result; + s = DeleteRow(rs.get(), 0, &result); + ASSERT_TRUE(s.IsNotFound()) << "bad status: " << s.ToString(); + ASSERT_EQ(0, result.mutated_stores_size()); + result.Clear(); + s = UpdateRow(rs.get(), 0, 12345, &result); + ASSERT_TRUE(s.IsNotFound()) << "bad status: " << s.ToString(); + ASSERT_EQ(0, result.mutated_stores_size()); + + // CheckRowPresent should return false. + bool present; + ASSERT_OK(CheckRowPresent(*rs, 0, &present)); + EXPECT_FALSE(present); + + if (i == 1) { + // Flush DMS. The second pass through the loop will re-verify that the + // externally visible state of the layer has not changed. + // deletions now in a DeltaFile. 
+ ASSERT_OK(rs->FlushDeltas()); + } + } +} + + +TEST_F(TestRowSet, TestDMSFlush) { + WriteTestRowSet(); + + unordered_set updated; + + // Now open the DiskRowSet for read + { + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Add an update to the delta tracker for a number of keys + // which exist. These updates will change the value to + // equal idx*5 (whereas in the original data, value = idx) + UpdateExistingRows(rs.get(), FLAGS_update_fraction, &updated); + ASSERT_EQ(static_cast(n_rows_ * FLAGS_update_fraction), + rs->delta_tracker_->dms_->Count()); + + ASSERT_OK(rs->FlushDeltas()); + + // Check that the DiskRowSet's DMS has now been emptied. + ASSERT_EQ(0, rs->delta_tracker_->dms_->Count()); + + // Now read back the value column, and verify that the updates + // are visible. + SCOPED_TRACE("before reopen"); + VerifyUpdates(*rs, updated); + } + + LOG(INFO) << "Reopening rowset ==============="; + // Close and re-open the rowset and ensure that the updates were + // persistent. + { + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Now read back the value column, and verify that the updates + // are visible. + SCOPED_TRACE("after reopen"); + VerifyUpdates(*rs, updated); + } +} + +// Test that when a single row is updated multiple times, we can query the +// historical values using MVCC, even after it is flushed. +TEST_F(TestRowSet, TestFlushedUpdatesRespectMVCC) { + const Slice key_slice("row"); + + // Write a single row into a new DiskRowSet. + LOG_TIMING(INFO, "Writing rowset") { + DiskRowSetWriter drsw(rowset_meta_.get(), &schema_, + BloomFilterSizing::BySizeAndFPRate(32*1024, 0.01f)); + + ASSERT_OK(drsw.Open()); + + RowBuilder rb(schema_); + rb.AddString(key_slice); + rb.AddUint32(1); + ASSERT_OK_FAST(WriteRow(rb.data(), &drsw)); + ASSERT_OK(drsw.Finish()); + } + + + // Reopen the rowset. + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Take a snapshot of the pre-update state. 
+ vector snaps; + snaps.push_back(MvccSnapshot(mvcc_)); + + + // Update the single row multiple times, taking an MVCC snapshot + // after each update. + faststring update_buf; + RowChangeListEncoder update(&update_buf); + for (uint32_t i = 2; i <= 5; i++) { + { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + update.Reset(); + update.AddColumnUpdate(schema_.column(1), schema_.column_id(1), &i); + RowBuilder rb(schema_.CreateKeyProjection()); + rb.AddString(key_slice); + RowSetKeyProbe probe(rb.row()); + OperationResultPB result; + ProbeStats stats; + ASSERT_OK_FAST(rs->MutateRow(tx.timestamp(), + probe, + RowChangeList(update_buf), + op_id_, + &stats, + &result)); + ASSERT_EQ(1, result.mutated_stores_size()); + ASSERT_EQ(0L, result.mutated_stores(0).rs_id()); + ASSERT_EQ(0L, result.mutated_stores(0).dms_id()); + tx.Commit(); + } + snaps.push_back(MvccSnapshot(mvcc_)); + } + + // Ensure that MVCC is respected by reading the value at each of the stored + // snapshots. + ASSERT_EQ(5, snaps.size()); + for (int i = 0; i < 5; i++) { + SCOPED_TRACE(i); + gscoped_ptr iter; + ASSERT_OK(rs->NewRowIterator(&schema_, snaps[i], &iter)); + string data = InitAndDumpIterator(iter.Pass()); + EXPECT_EQ(StringPrintf("(string key=row, uint32 val=%d)", i + 1), data); + } + + // Flush deltas to disk and ensure that the historical versions are still + // accessible. + ASSERT_OK(rs->FlushDeltas()); + + for (int i = 0; i < 5; i++) { + SCOPED_TRACE(i); + gscoped_ptr iter; + ASSERT_OK(rs->NewRowIterator(&schema_, snaps[i], &iter)); + string data = InitAndDumpIterator(iter.Pass()); + EXPECT_EQ(StringPrintf("(string key=row, uint32 val=%d)", i + 1), data); + } + +} + +// Similar to TestDMSFlush above, except does not actually verify +// the results (since the verification step is expensive). Additionally, +// loops the "read" side of the benchmark a number of times, so that +// the speed of applying deltas during read can be micro-benchmarked. 
+// +// This is most usefully run with an invocation like: +// ./rowset-test --gtest_filter=\*Performance --roundtrip_num_rows=1000000 +// --n_read_passes=1000 --update_fraction=0.01 +TEST_F(TestRowSet, TestDeltaApplicationPerformance) { + WriteTestRowSet(); + + // Now open the DiskRowSet for read + { + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + BenchmarkIterationPerformance(*rs.get(), + StringPrintf("Reading %zd rows prior to updates %d times", + n_rows_, FLAGS_n_read_passes)); + + UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr); + + BenchmarkIterationPerformance(*rs.get(), + StringPrintf("Reading %zd rows with %.2f%% updates %d times (updates in DMS)", + n_rows_, FLAGS_update_fraction * 100.0f, + FLAGS_n_read_passes)); + ASSERT_OK(rs->FlushDeltas()); + + BenchmarkIterationPerformance(*rs.get(), + StringPrintf("Reading %zd rows with %.2f%% updates %d times (updates on disk)", + n_rows_, FLAGS_update_fraction * 100.0f, + FLAGS_n_read_passes)); + } +} + +TEST_F(TestRowSet, TestRollingDiskRowSetWriter) { + // Set small block size so that we can roll frequently. Otherwise + // we couldn't output such small files. + google::FlagSaver saver; + FLAGS_cfile_default_block_size = 4096; + + RollingDiskRowSetWriter writer(tablet()->metadata(), schema_, + BloomFilterSizing::BySizeAndFPRate(32*1024, 0.01f), + 64 * 1024); // roll every 64KB + DoWriteTestRowSet(10000, &writer); + + // Should have rolled 4 times. 
+ vector > metas; + writer.GetWrittenRowSetMetadata(&metas); + EXPECT_EQ(4, metas.size()); + for (const shared_ptr& meta : metas) { + ASSERT_TRUE(meta->HasDataForColumnIdForTests(schema_.column_id(0))); + } +} + +TEST_F(TestRowSet, TestMakeDeltaIteratorMergerUnlocked) { + WriteTestRowSet(); + + // Now open the DiskRowSet for read + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr); + ASSERT_OK(rs->FlushDeltas()); + DeltaTracker *dt = rs->delta_tracker(); + int num_stores = dt->redo_delta_stores_.size(); + vector > compacted_stores; + vector compacted_blocks; + shared_ptr merge_iter; + ASSERT_OK(dt->MakeDeltaIteratorMergerUnlocked(0, num_stores - 1, &schema_, + &compacted_stores, + &compacted_blocks, &merge_iter)); + vector results; + ASSERT_OK(DebugDumpDeltaIterator(REDO, merge_iter.get(), schema_, + ITERATE_OVER_ALL_ROWS, + &results)); + for (const string &str : results) { + VLOG(1) << str; + } + ASSERT_EQ(compacted_stores.size(), num_stores); + ASSERT_EQ(compacted_blocks.size(), num_stores); + ASSERT_TRUE(is_sorted(results.begin(), results.end())); +} + +void BetweenZeroAndOne(double to_check) { + ASSERT_LT(0, to_check); + ASSERT_GT(1, to_check); +} + +TEST_F(TestRowSet, TestCompactStores) { + // With this setting, we want major compactions to basically always have a score. + FLAGS_tablet_delta_store_major_compact_min_ratio = 0.0001; + // With this setting, the perf improvement will be 0 until we have two files, at which point + // it will be the expected ratio, then with three files we get the maximum improvement. + FLAGS_tablet_delta_store_minor_compact_max = 3; + // Turning this off so that we can call DeltaStoresCompactionPerfImprovementScore without having + // to open the files after creating them. 
+ FLAGS_cfile_lazy_open = false; + + + WriteTestRowSet(); + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + ASSERT_EQ(0, rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MINOR_DELTA_COMPACTION)); + ASSERT_EQ(0, rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MAJOR_DELTA_COMPACTION)); + + // Write a first delta file. + UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr); + ASSERT_OK(rs->FlushDeltas()); + // One file isn't enough for minor compactions, but a major compaction can run. + ASSERT_EQ(0, rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MINOR_DELTA_COMPACTION)); + BetweenZeroAndOne(rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MAJOR_DELTA_COMPACTION)); + + // Write a second delta file. + UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr); + ASSERT_OK(rs->FlushDeltas()); + // Two files is enough for all delta compactions. + BetweenZeroAndOne(rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MINOR_DELTA_COMPACTION)); + BetweenZeroAndOne(rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MAJOR_DELTA_COMPACTION)); + + // Write a third delta file. + UpdateExistingRows(rs.get(), FLAGS_update_fraction, nullptr); + ASSERT_OK(rs->FlushDeltas()); + // We're hitting the max for minor compactions but not for major compactions. + ASSERT_EQ(1, rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MINOR_DELTA_COMPACTION)); + BetweenZeroAndOne(rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MAJOR_DELTA_COMPACTION)); + + // Compact the deltafiles + DeltaTracker *dt = rs->delta_tracker(); + int num_stores = dt->redo_delta_stores_.size(); + VLOG(1) << "Number of stores before compaction: " << num_stores; + ASSERT_EQ(num_stores, 3); + ASSERT_OK(dt->CompactStores(0, num_stores - 1)); + num_stores = dt->redo_delta_stores_.size(); + VLOG(1) << "Number of stores after compaction: " << num_stores; + ASSERT_EQ(1, num_stores); + // Back to one store, can't minor compact. 
+ ASSERT_EQ(0, rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MINOR_DELTA_COMPACTION)); + BetweenZeroAndOne(rs->DeltaStoresCompactionPerfImprovementScore(RowSet::MAJOR_DELTA_COMPACTION)); + + // Verify that the resulting deltafile is valid + vector > compacted_stores; + vector compacted_blocks; + shared_ptr merge_iter; + ASSERT_OK(dt->MakeDeltaIteratorMergerUnlocked(0, num_stores - 1, &schema_, + &compacted_stores, + &compacted_blocks, &merge_iter)); + vector results; + ASSERT_OK(DebugDumpDeltaIterator(REDO, merge_iter.get(), schema_, + ITERATE_OVER_ALL_ROWS, + &results)); + for (const string &str : results) { + VLOG(1) << str; + } + ASSERT_TRUE(is_sorted(results.begin(), results.end())); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc new file mode 100644 index 000000000000..55decd88170e --- /dev/null +++ b/src/kudu/tablet/diskrowset.cc @@ -0,0 +1,742 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/common/iterator.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/cfile/bloomfile.h" +#include "kudu/cfile/cfile_writer.h" +#include "kudu/cfile/type_encodings.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/tablet/compaction.h" +#include "kudu/tablet/delta_store.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/delta_compaction.h" +#include "kudu/tablet/multi_column_writer.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +DEFINE_int32(tablet_delta_store_minor_compact_max, 1000, + "How many delta stores are required before forcing a minor delta compaction " + "(Advanced option)"); +TAG_FLAG(tablet_delta_store_minor_compact_max, experimental); + +DEFINE_double(tablet_delta_store_major_compact_min_ratio, 0.1f, + "Minimum ratio of sizeof(deltas) to sizeof(base data) before a major compaction " + "can run (Advanced option)"); +TAG_FLAG(tablet_delta_store_major_compact_min_ratio, experimental); + +DEFINE_int32(default_composite_key_index_block_size_bytes, 4096, + "Block size used for composite key indexes."); +TAG_FLAG(default_composite_key_index_block_size_bytes, experimental); + +namespace kudu { +namespace tablet { + +using cfile::BloomFileWriter; +using fs::ScopedWritableBlockCloser; +using fs::WritableBlock; +using log::LogAnchorRegistry; +using std::shared_ptr; +using std::string; + +const char *DiskRowSet::kMinKeyMetaEntryName = "min_key"; +const char *DiskRowSet::kMaxKeyMetaEntryName = "max_key"; + +DiskRowSetWriter::DiskRowSetWriter(RowSetMetadata* rowset_metadata, + const Schema* schema, + BloomFilterSizing bloom_sizing) + : rowset_metadata_(rowset_metadata), + schema_(schema), + bloom_sizing_(std::move(bloom_sizing)), + 
finished_(false), + written_count_(0) { + CHECK(schema->has_column_ids()); +} + +Status DiskRowSetWriter::Open() { + TRACE_EVENT0("tablet", "DiskRowSetWriter::Open"); + + FsManager* fs = rowset_metadata_->fs_manager(); + col_writer_.reset(new MultiColumnWriter(fs, schema_)); + RETURN_NOT_OK(col_writer_->Open()); + + // Open bloom filter. + RETURN_NOT_OK(InitBloomFileWriter()); + + if (schema_->num_key_columns() > 1) { + // Open ad-hoc index writer + RETURN_NOT_OK(InitAdHocIndexWriter()); + } + + return Status::OK(); +} + +Status DiskRowSetWriter::InitBloomFileWriter() { + TRACE_EVENT0("tablet", "DiskRowSetWriter::InitBloomFileWriter"); + gscoped_ptr block; + FsManager* fs = rowset_metadata_->fs_manager(); + RETURN_NOT_OK_PREPEND(fs->CreateNewBlock(&block), + "Couldn't allocate a block for bloom filter"); + rowset_metadata_->set_bloom_block(block->id()); + + bloom_writer_.reset(new cfile::BloomFileWriter(block.Pass(), bloom_sizing_)); + RETURN_NOT_OK(bloom_writer_->Start()); + return Status::OK(); +} + +Status DiskRowSetWriter::InitAdHocIndexWriter() { + TRACE_EVENT0("tablet", "DiskRowSetWriter::InitAdHocIndexWriter"); + gscoped_ptr block; + FsManager* fs = rowset_metadata_->fs_manager(); + RETURN_NOT_OK_PREPEND(fs->CreateNewBlock(&block), + "Couldn't allocate a block for compoound index"); + + rowset_metadata_->set_adhoc_index_block(block->id()); + + // TODO: allow options to be configured, perhaps on a per-column + // basis as part of the schema. For now use defaults. + // + // Also would be able to set encoding here, or do something smart + // to figure out the encoding on the fly. 
+ cfile::WriterOptions opts; + + // Index the composite key by value + opts.write_validx = true; + + // no need to index positions + opts.write_posidx = false; + + opts.storage_attributes.encoding = PREFIX_ENCODING; + opts.storage_attributes.compression = LZ4; + opts.storage_attributes.cfile_block_size = FLAGS_default_composite_key_index_block_size_bytes; + + // Create the CFile writer for the ad-hoc index. + ad_hoc_index_writer_.reset(new cfile::CFileWriter( + opts, + GetTypeInfo(BINARY), + false, + block.Pass())); + return ad_hoc_index_writer_->Start(); + +} + +Status DiskRowSetWriter::AppendBlock(const RowBlock &block) { + DCHECK_EQ(block.schema().num_columns(), schema_->num_columns()); + CHECK(!finished_); + + // If this is the very first block, encode the first key and save it as metadata + // in the index column. + if (written_count_ == 0) { + Slice enc_key = schema_->EncodeComparableKey(block.row(0), &last_encoded_key_); + key_index_writer()->AddMetadataPair(DiskRowSet::kMinKeyMetaEntryName, enc_key); + last_encoded_key_.clear(); + } + + // Write the batch to each of the columns + RETURN_NOT_OK(col_writer_->AppendBlock(block)); + +#ifndef NDEBUG + faststring prev_key; +#endif + + // Write the batch to the bloom and optionally the ad-hoc index + for (size_t i = 0; i < block.nrows(); i++) { +#ifndef NDEBUG + prev_key.assign_copy(last_encoded_key_.data(), last_encoded_key_.size()); +#endif + + // TODO: performance might be better if we actually batch this - + // encode a bunch of key slices, then pass them all in one go. + RowBlockRow row = block.row(i); + // Insert the encoded key into the bloom. 
+ Slice enc_key = schema_->EncodeComparableKey(row, &last_encoded_key_); + RETURN_NOT_OK(bloom_writer_->AppendKeys(&enc_key, 1)); + + // Write the batch to the ad hoc index if we're using one + if (ad_hoc_index_writer_ != nullptr) { + RETURN_NOT_OK(ad_hoc_index_writer_->AppendEntries(&enc_key, 1)); + } + +#ifndef NDEBUG + CHECK_LT(Slice(prev_key).compare(enc_key), 0) + << enc_key.ToDebugString() << " appended to file not > previous key " + << Slice(prev_key).ToDebugString(); +#endif + } + + written_count_ += block.nrows(); + + return Status::OK(); +} + +Status DiskRowSetWriter::Finish() { + TRACE_EVENT0("tablet", "DiskRowSetWriter::Finish"); + ScopedWritableBlockCloser closer; + RETURN_NOT_OK(FinishAndReleaseBlocks(&closer)); + return closer.CloseBlocks(); +} + +Status DiskRowSetWriter::FinishAndReleaseBlocks(ScopedWritableBlockCloser* closer) { + TRACE_EVENT0("tablet", "DiskRowSetWriter::FinishAndReleaseBlocks"); + CHECK(!finished_); + + if (written_count_ == 0) { + finished_ = true; + return Status::Aborted("no data written"); + } + + // Save the last encoded (max) key + CHECK_GT(last_encoded_key_.size(), 0); + Slice last_enc_slice(last_encoded_key_); + Slice first_enc_slice(key_index_writer()->GetMetaValueOrDie(DiskRowSet::kMinKeyMetaEntryName)); + CHECK_LE(first_enc_slice.compare(last_enc_slice), 0) + << "First Key not <= Last key: first_key=" << first_enc_slice.ToDebugString() + << " last_key=" << last_enc_slice.ToDebugString(); + key_index_writer()->AddMetadataPair(DiskRowSet::kMaxKeyMetaEntryName, last_enc_slice); + + // Finish writing the columns themselves. + RETURN_NOT_OK(col_writer_->FinishAndReleaseBlocks(closer)); + + // Put the column data blocks in the metadata. 
+ RowSetMetadata::ColumnIdToBlockIdMap flushed_blocks; + col_writer_->GetFlushedBlocksByColumnId(&flushed_blocks); + rowset_metadata_->SetColumnDataBlocks(flushed_blocks); + + if (ad_hoc_index_writer_ != nullptr) { + Status s = ad_hoc_index_writer_->FinishAndReleaseBlock(closer); + if (!s.ok()) { + LOG(WARNING) << "Unable to Finish ad hoc index writer: " << s.ToString(); + return s; + } + } + + // Finish bloom. + Status s = bloom_writer_->FinishAndReleaseBlock(closer); + if (!s.ok()) { + LOG(WARNING) << "Unable to Finish bloom filter writer: " << s.ToString(); + return s; + } + + finished_ = true; + return Status::OK(); +} + +cfile::CFileWriter *DiskRowSetWriter::key_index_writer() { + return ad_hoc_index_writer_ ? ad_hoc_index_writer_.get() : col_writer_->writer_for_col_idx(0); +} + +size_t DiskRowSetWriter::written_size() const { + size_t size = 0; + + if (col_writer_) { + size += col_writer_->written_size(); + } + + if (bloom_writer_) { + size += bloom_writer_->written_size(); + } + + if (ad_hoc_index_writer_) { + size += ad_hoc_index_writer_->written_size(); + } + + return size; +} + +DiskRowSetWriter::~DiskRowSetWriter() { +} + +RollingDiskRowSetWriter::RollingDiskRowSetWriter( + TabletMetadata* tablet_metadata, const Schema& schema, + BloomFilterSizing bloom_sizing, size_t target_rowset_size) + : state_(kInitialized), + tablet_metadata_(DCHECK_NOTNULL(tablet_metadata)), + schema_(schema), + bloom_sizing_(std::move(bloom_sizing)), + target_rowset_size_(target_rowset_size), + row_idx_in_cur_drs_(0), + can_roll_(false), + written_count_(0), + written_size_(0) { + CHECK(schema.has_column_ids()); +} + +Status RollingDiskRowSetWriter::Open() { + TRACE_EVENT0("tablet", "RollingDiskRowSetWriter::Open"); + CHECK_EQ(state_, kInitialized); + + RETURN_NOT_OK(RollWriter()); + state_ = kStarted; + return Status::OK(); +} + +Status RollingDiskRowSetWriter::RollWriter() { + TRACE_EVENT0("tablet", "RollingDiskRowSetWriter::RollWriter"); + // Close current writer if it is open 
+ RETURN_NOT_OK(FinishCurrentWriter()); + + RETURN_NOT_OK(tablet_metadata_->CreateRowSet(&cur_drs_metadata_, schema_)); + + cur_writer_.reset(new DiskRowSetWriter(cur_drs_metadata_.get(), &schema_, bloom_sizing_)); + RETURN_NOT_OK(cur_writer_->Open()); + + FsManager* fs = tablet_metadata_->fs_manager(); + gscoped_ptr undo_data_block; + gscoped_ptr redo_data_block; + RETURN_NOT_OK(fs->CreateNewBlock(&undo_data_block)); + RETURN_NOT_OK(fs->CreateNewBlock(&redo_data_block)); + cur_undo_ds_block_id_ = undo_data_block->id(); + cur_redo_ds_block_id_ = redo_data_block->id(); + cur_undo_writer_.reset(new DeltaFileWriter(undo_data_block.Pass())); + cur_redo_writer_.reset(new DeltaFileWriter(redo_data_block.Pass())); + cur_undo_delta_stats.reset(new DeltaStats()); + cur_redo_delta_stats.reset(new DeltaStats()); + + row_idx_in_cur_drs_ = 0; + can_roll_ = false; + + RETURN_NOT_OK(cur_undo_writer_->Start()); + return cur_redo_writer_->Start(); +} + +Status RollingDiskRowSetWriter::RollIfNecessary() { + DCHECK_EQ(state_, kStarted); + if (can_roll_ && cur_writer_->written_size() > target_rowset_size_) { + RETURN_NOT_OK(RollWriter()); + } + return Status::OK(); +} + +Status RollingDiskRowSetWriter::AppendBlock(const RowBlock &block) { + DCHECK_EQ(state_, kStarted); + RETURN_NOT_OK(cur_writer_->AppendBlock(block)); + + written_count_ += block.nrows(); + + row_idx_in_cur_drs_ += block.nrows(); + can_roll_ = true; + return Status::OK(); +} + +Status RollingDiskRowSetWriter::AppendUndoDeltas(rowid_t row_idx_in_block, + Mutation* undo_delta_head, + rowid_t* row_idx) { + return AppendDeltas(row_idx_in_block, undo_delta_head, + row_idx, + cur_undo_writer_.get(), + cur_undo_delta_stats.get()); +} + +Status RollingDiskRowSetWriter::AppendRedoDeltas(rowid_t row_idx_in_block, + Mutation* redo_delta_head, + rowid_t* row_idx) { + return AppendDeltas(row_idx_in_block, redo_delta_head, + row_idx, + cur_redo_writer_.get(), + cur_redo_delta_stats.get()); +} + +template +Status 
RollingDiskRowSetWriter::AppendDeltas(rowid_t row_idx_in_block, + Mutation* delta_head, + rowid_t* row_idx, + DeltaFileWriter* writer, + DeltaStats* delta_stats) { + can_roll_ = false; + + *row_idx = row_idx_in_cur_drs_ + row_idx_in_block; + for (const Mutation *mut = delta_head; mut != nullptr; mut = mut->next()) { + DeltaKey undo_key(*row_idx, mut->timestamp()); + RETURN_NOT_OK(writer->AppendDelta(undo_key, mut->changelist())); + delta_stats->UpdateStats(mut->timestamp(), mut->changelist()); + } + return Status::OK(); +} + +Status RollingDiskRowSetWriter::FinishCurrentWriter() { + TRACE_EVENT0("tablet", "RollingDiskRowSetWriter::FinishCurrentWriter"); + if (!cur_writer_) { + return Status::OK(); + } + CHECK_EQ(state_, kStarted); + + Status writer_status = cur_writer_->FinishAndReleaseBlocks(&block_closer_); + + // If no rows were written (e.g. due to an empty flush or a compaction with all rows + // deleted), FinishAndReleaseBlocks(...) returns Aborted. In that case, we don't + // generate a RowSetMetadata. + if (writer_status.IsAborted()) { + CHECK_EQ(cur_writer_->written_count(), 0); + } else { + RETURN_NOT_OK(writer_status); + CHECK_GT(cur_writer_->written_count(), 0); + + cur_undo_writer_->WriteDeltaStats(*cur_undo_delta_stats); + cur_redo_writer_->WriteDeltaStats(*cur_redo_delta_stats); + + RETURN_NOT_OK(cur_undo_writer_->FinishAndReleaseBlock(&block_closer_)); + RETURN_NOT_OK(cur_redo_writer_->FinishAndReleaseBlock(&block_closer_)); + + // If the writer is not null _AND_ we've written something to the undo + // delta store commit the undo delta block. + if (cur_undo_writer_.get() != nullptr && + cur_undo_delta_stats->min_timestamp().CompareTo(Timestamp::kMax) != 0) { + cur_drs_metadata_->CommitUndoDeltaDataBlock(cur_undo_ds_block_id_); + } + + // If the writer is not null _AND_ we've written something to the redo + // delta store commit the redo delta block. 
+ if (cur_redo_writer_.get() != nullptr && + cur_redo_delta_stats->min_timestamp().CompareTo(Timestamp::kMax) != 0) { + cur_drs_metadata_->CommitRedoDeltaDataBlock(0, cur_redo_ds_block_id_); + } else { + // TODO: KUDU-678: the block will get orphaned here, since we're not putting + // it in the metadata, nor deleting it. + } + + written_size_ += cur_writer_->written_size(); + + written_drs_metas_.push_back(cur_drs_metadata_); + } + + cur_writer_.reset(nullptr); + cur_undo_writer_.reset(nullptr); + cur_redo_writer_.reset(nullptr); + + cur_drs_metadata_.reset(); + + return Status::OK(); +} + +Status RollingDiskRowSetWriter::Finish() { + TRACE_EVENT0("tablet", "RollingDiskRowSetWriter::Finish"); + DCHECK_EQ(state_, kStarted); + + RETURN_NOT_OK(FinishCurrentWriter()); + RETURN_NOT_OK(block_closer_.CloseBlocks()); + + state_ = kFinished; + return Status::OK(); +} + +void RollingDiskRowSetWriter::GetWrittenRowSetMetadata(RowSetMetadataVector* metas) const { + CHECK_EQ(state_, kFinished); + metas->assign(written_drs_metas_.begin(), written_drs_metas_.end()); +} + +RollingDiskRowSetWriter::~RollingDiskRowSetWriter() { +} + +//////////////////////////////////////////////////////////// +// Reader +//////////////////////////////////////////////////////////// + +Status DiskRowSet::Open(const shared_ptr& rowset_metadata, + log::LogAnchorRegistry* log_anchor_registry, + shared_ptr *rowset, + const shared_ptr& parent_tracker) { + shared_ptr rs(new DiskRowSet(rowset_metadata, log_anchor_registry, parent_tracker)); + + RETURN_NOT_OK(rs->Open()); + + rowset->swap(rs); + return Status::OK(); +} + +DiskRowSet::DiskRowSet(shared_ptr rowset_metadata, + LogAnchorRegistry* log_anchor_registry, + shared_ptr parent_tracker) + : rowset_metadata_(std::move(rowset_metadata)), + open_(false), + log_anchor_registry_(log_anchor_registry), + parent_tracker_(std::move(parent_tracker)) {} + +Status DiskRowSet::Open() { + TRACE_EVENT0("tablet", "DiskRowSet::Open"); + gscoped_ptr new_base(new 
CFileSet(rowset_metadata_)); + RETURN_NOT_OK(new_base->Open()); + base_data_.reset(new_base.release()); + + rowid_t num_rows; + RETURN_NOT_OK(base_data_->CountRows(&num_rows)); + delta_tracker_.reset(new DeltaTracker(rowset_metadata_, num_rows, + log_anchor_registry_, + parent_tracker_)); + RETURN_NOT_OK(delta_tracker_->Open()); + + open_ = true; + + return Status::OK(); +} + +Status DiskRowSet::FlushDeltas() { + TRACE_EVENT0("tablet", "DiskRowSet::FlushDeltas"); + return delta_tracker_->Flush(DeltaTracker::FLUSH_METADATA); +} + +Status DiskRowSet::MinorCompactDeltaStores() { + TRACE_EVENT0("tablet", "DiskRowSet::MinorCompactDeltaStores"); + return delta_tracker_->Compact(); +} + +Status DiskRowSet::MajorCompactDeltaStores() { + vector col_ids; + delta_tracker_->GetColumnIdsWithUpdates(&col_ids); + + if (col_ids.empty()) { + return Status::OK(); + } + + return MajorCompactDeltaStoresWithColumnIds(col_ids); +} + +Status DiskRowSet::MajorCompactDeltaStoresWithColumnIds(const vector& col_ids) { + TRACE_EVENT0("tablet", "DiskRowSet::MajorCompactDeltaStores"); + boost::lock_guard l(*delta_tracker()->compact_flush_lock()); + + // TODO: do we need to lock schema or anything here? + gscoped_ptr compaction; + RETURN_NOT_OK(NewMajorDeltaCompaction(col_ids, &compaction)); + + RETURN_NOT_OK(compaction->Compact()); + + // Update and flush the metadata. This needs to happen before we make the new files visible to + // prevent inconsistencies after a server crash. + RowSetMetadataUpdate update; + RETURN_NOT_OK(compaction->CreateMetadataUpdate(&update)); + RETURN_NOT_OK(rowset_metadata_->CommitUpdate(update)); + RETURN_NOT_OK(rowset_metadata_->Flush()); + + // Make the new base data and delta files visible. 
+ gscoped_ptr new_base(new CFileSet(rowset_metadata_)); + RETURN_NOT_OK(new_base->Open()); + { + boost::lock_guard lock(component_lock_); + RETURN_NOT_OK(compaction->UpdateDeltaTracker(delta_tracker_.get())); + base_data_.reset(new_base.release()); + } + return Status::OK(); +} + +Status DiskRowSet::NewMajorDeltaCompaction(const vector& col_ids, + gscoped_ptr* out) const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + + const Schema* schema = &rowset_metadata_->tablet_schema(); + + vector > included_stores; + shared_ptr delta_iter; + RETURN_NOT_OK(delta_tracker_->NewDeltaFileIterator( + schema, + MvccSnapshot::CreateSnapshotIncludingAllTransactions(), + REDO, + &included_stores, + &delta_iter)); + + out->reset(new MajorDeltaCompaction(rowset_metadata_->fs_manager(), + *schema, + base_data_.get(), + delta_iter, + included_stores, + col_ids)); + return Status::OK(); +} + +Status DiskRowSet::NewRowIterator(const Schema *projection, + const MvccSnapshot &mvcc_snap, + gscoped_ptr* out) const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + + shared_ptr base_iter(base_data_->NewIterator(projection)); + gscoped_ptr col_iter; + RETURN_NOT_OK(delta_tracker_->WrapIterator(base_iter, mvcc_snap, &col_iter)); + + out->reset(new MaterializingIterator( + shared_ptr(col_iter.release()))); + return Status::OK(); +} + +Status DiskRowSet::NewCompactionInput(const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const { + return CompactionInput::Create(*this, projection, snap, out); +} + +Status DiskRowSet::MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &update, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB* result) { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + + rowid_t row_idx; + RETURN_NOT_OK(base_data_->FindRow(probe, &row_idx, stats)); + + // It's possible that the row key exists in this DiskRowSet, but it has + // 
in fact been Deleted already. Check with the delta tracker to be sure. + bool deleted; + RETURN_NOT_OK(delta_tracker_->CheckRowDeleted(row_idx, &deleted, stats)); + if (deleted) { + return Status::NotFound("row not found"); + } + + RETURN_NOT_OK(delta_tracker_->Update(timestamp, row_idx, update, op_id, result)); + + return Status::OK(); +} + +Status DiskRowSet::CheckRowPresent(const RowSetKeyProbe &probe, + bool* present, + ProbeStats* stats) const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + + rowid_t row_idx; + RETURN_NOT_OK(base_data_->CheckRowPresent(probe, present, &row_idx, stats)); + if (!*present) { + // If it wasn't in the base data, then it's definitely not in the rowset. + return Status::OK(); + } + + // Otherwise it might be in the base data but deleted. + bool deleted = false; + RETURN_NOT_OK(delta_tracker_->CheckRowDeleted(row_idx, &deleted, stats)); + *present = !deleted; + return Status::OK(); +} + +Status DiskRowSet::CountRows(rowid_t *count) const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + + return base_data_->CountRows(count); +} + +Status DiskRowSet::GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + return base_data_->GetBounds(min_encoded_key, max_encoded_key); +} + +uint64_t DiskRowSet::EstimateBaseDataDiskSize() const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + return base_data_->EstimateOnDiskSize(); +} + +uint64_t DiskRowSet::EstimateDeltaDiskSize() const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + return delta_tracker_->EstimateOnDiskSize(); +} + +uint64_t DiskRowSet::EstimateOnDiskSize() const { + DCHECK(open_); + boost::shared_lock lock(component_lock_.get_lock()); + return EstimateBaseDataDiskSize() + EstimateDeltaDiskSize(); +} + +size_t DiskRowSet::DeltaMemStoreSize() const { + DCHECK(open_); + return 
delta_tracker_->DeltaMemStoreSize(); +} + +bool DiskRowSet::DeltaMemStoreEmpty() const { + DCHECK(open_); + return delta_tracker_->DeltaMemStoreEmpty(); +} + +int64_t DiskRowSet::MinUnflushedLogIndex() const { + DCHECK(open_); + return delta_tracker_->MinUnflushedLogIndex(); +} + +size_t DiskRowSet::CountDeltaStores() const { + DCHECK(open_); + return delta_tracker_->CountRedoDeltaStores(); +} + + + +// In this implementation, the returned improvement score is 0 if there aren't any redo files to +// compact or if the base data is empty. After this, with a max score of 1: +// - Major compactions: the score will be the result of sizeof(deltas)/sizeof(base data), unless +// it is smaller than tablet_delta_store_major_compact_min_ratio or if the +// delta files are only composed of deletes, in which case the score is +// brought down to zero. +// - Minor compactions: the score will be zero if there's only 1 redo file, else it will be the +// result of redo_files_count/tablet_delta_store_minor_compact_max. The +// latter is meant to be high since minor compactions don't give us much, so +// we only consider it a gain if it gets rid of many tiny files. +double DiskRowSet::DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) const { + DCHECK(open_); + double perf_improv = 0; + size_t store_count = CountDeltaStores(); + uint64_t base_data_size = EstimateBaseDataDiskSize(); + + if (store_count == 0) { + return perf_improv; + } + + if (type == RowSet::MAJOR_DELTA_COMPACTION) { + vector col_ids_with_updates; + delta_tracker_->GetColumnIdsWithUpdates(&col_ids_with_updates); + // If we have files but no updates, we don't want to major compact. 
+ if (!col_ids_with_updates.empty()) { + double ratio = static_cast(EstimateDeltaDiskSize()) / base_data_size; + if (ratio >= FLAGS_tablet_delta_store_major_compact_min_ratio) { + perf_improv = ratio; + } + } + } else if (type == RowSet::MINOR_DELTA_COMPACTION) { + if (store_count > 1) { + perf_improv = static_cast(store_count) / FLAGS_tablet_delta_store_minor_compact_max; + } + } else { + LOG(FATAL) << "Unknown delta compaction type " << type; + } + return std::min(1.0, perf_improv); +} + +Status DiskRowSet::DebugDump(vector *lines) { + // Using CompactionInput to dump our data is an easy way of seeing all the + // rows and deltas. + gscoped_ptr input; + RETURN_NOT_OK(NewCompactionInput(&rowset_metadata_->tablet_schema(), + MvccSnapshot::CreateSnapshotIncludingAllTransactions(), + &input)); + return DebugDumpCompactionInput(input.get(), lines); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/diskrowset.h b/src/kudu/tablet/diskrowset.h new file mode 100644 index 000000000000..0bc04308d002 --- /dev/null +++ b/src/kudu/tablet/diskrowset.h @@ -0,0 +1,413 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// A DiskRowSet is a horizontal slice of a Kudu tablet. 
+// Each DiskRowSet contains data for a a disjoint set of keys. +// See src/kudu/tablet/README for a detailed description. + +#ifndef KUDU_TABLET_DISKROWSET_H_ +#define KUDU_TABLET_DISKROWSET_H_ + +#include +#include +#include +#include +#include + +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/fs/block_manager.h" +#include "kudu/gutil/macros.h" +#include "kudu/tablet/delta_key.h" +#include "kudu/tablet/rowset_metadata.h" +#include "kudu/tablet/rowset.h" +#include "kudu/util/atomic.h" +#include "kudu/util/bloom_filter.h" +#include "kudu/util/locks.h" + +namespace kudu { + +class FsManager; +class MemTracker; +class RowBlock; +class RowChangeList; + +namespace cfile { +class BloomFileWriter; +class CFileWriter; +} + +namespace log { +class LogAnchorRegistry; +} + +namespace tablet { + +class CFileSet; +class DeltaFileWriter; +class DeltaStats; +class DeltaTracker; +class MultiColumnWriter; +class Mutation; +class OperationResultPB; + +class DiskRowSetWriter { + public: + // TODO: document ownership of rowset_metadata + DiskRowSetWriter(RowSetMetadata* rowset_metadata, const Schema* schema, + BloomFilterSizing bloom_sizing); + + ~DiskRowSetWriter(); + + Status Open(); + + // The block is written to all column writers as well as the bloom filter, + // if configured. + // Rows must be appended in ascending order. + Status AppendBlock(const RowBlock &block); + + // Closes the CFiles and their underlying writable blocks. + // If no rows were written, returns Status::Aborted(). + Status Finish(); + + // Closes the CFiles, releasing the underlying blocks to 'closer'. + // If no rows were written, returns Status::Aborted(). + Status FinishAndReleaseBlocks(fs::ScopedWritableBlockCloser* closer); + + // The base DiskRowSetWriter never rolls. This method is necessary for tests + // which are templatized on the writer type. 
+ Status RollIfNecessary() { return Status::OK(); } + + rowid_t written_count() const { + CHECK(finished_); + return written_count_; + } + + // Return the total number of bytes written so far to this DiskRowSet. + // Additional bytes may be written by "Finish()", but this should provide + // a reasonable estimate for the total data size. + size_t written_size() const; + + const Schema& schema() const { return *schema_; } + + private: + DISALLOW_COPY_AND_ASSIGN(DiskRowSetWriter); + + Status InitBloomFileWriter(); + + // Initializes the index writer required for compound keys + // this index is written to a new file instead of embedded in the col_* files + Status InitAdHocIndexWriter(); + + // Return the cfile::Writer responsible for writing the key index. + // (the ad-hoc writer for composite keys, otherwise the key column writer) + cfile::CFileWriter *key_index_writer(); + + RowSetMetadata *rowset_metadata_; + const Schema* const schema_; + + BloomFilterSizing bloom_sizing_; + + bool finished_; + rowid_t written_count_; + gscoped_ptr col_writer_; + gscoped_ptr bloom_writer_; + gscoped_ptr ad_hoc_index_writer_; + + // The last encoded key written. + faststring last_encoded_key_; +}; + + +// Wrapper around DiskRowSetWriter which "rolls" to a new DiskRowSet after +// a certain amount of data has been written. Each output rowset is suffixed +// with ".N" where N starts at 0 and increases as new rowsets are generated. +// +// See AppendBlock(...) for important usage information. +class RollingDiskRowSetWriter { + public: + // Create a new rolling writer. The given 'tablet_metadata' must stay valid + // for the lifetime of this writer, and is used to construct the new rowsets + // that this RollingDiskRowSetWriter creates. 
+ RollingDiskRowSetWriter(TabletMetadata* tablet_metadata, const Schema& schema, + BloomFilterSizing bloom_sizing, + size_t target_rowset_size); + ~RollingDiskRowSetWriter(); + + Status Open(); + + // The block is written to all column writers as well as the bloom filter, + // if configured. + // Rows must be appended in ascending order. + // + // NOTE: data must be appended in a particular order: for each set of rows + // you must append deltas using the APIs below *before* appending the block + // of rows that they correspond to. This ensures that the output delta files + // and data files are aligned. + Status AppendBlock(const RowBlock &block); + + // Appends a sequence of REDO deltas for the same row to the current + // redo delta file. 'row_idx_in_next_block' is the positional index after + // the last written block. The 'row_idx_in_drs' out parameter will be set + // with the row index from the start of the DiskRowSet currently being written. + Status AppendRedoDeltas(rowid_t row_idx_in_next_block, + Mutation* redo_deltas, + rowid_t* row_idx_in_drs); + + // Appends a sequence of UNDO deltas for the same row to the current + // undo delta file. 'row_idx_in_next_block' is the positional index after + // the last written block. The 'row_idx_in_drs' out parameter will be set + // with the row index from the start of the DiskRowSet currently being written. + Status AppendUndoDeltas(rowid_t row_idx_in_next_block, + Mutation* undo_deltas, + rowid_t* row_idx_in_drs); + + // Try to roll the output, if we've passed the configured threshold. This will + // only roll if called immediately after an AppendBlock() call. The implementation + // of AppendBlock() doesn't call it automatically, because it doesn't know if there + // is any more data to be appended. It is safe to call this in other circumstances -- + // it will be ignored if it is not a good time to roll. 
+ Status RollIfNecessary(); + + Status Finish(); + + int64_t written_count() const { return written_count_; } + + const Schema &schema() const { return schema_; } + + // Return the set of rowset paths that were written by this writer. + // This must only be called after Finish() returns an OK result. + void GetWrittenRowSetMetadata(RowSetMetadataVector* metas) const; + + uint64_t written_size() const { return written_size_; } + + private: + Status RollWriter(); + + // Close the current DRS and delta writers, releasing their finished blocks + // into block_closer_. + Status FinishCurrentWriter(); + + template + Status AppendDeltas(rowid_t row_idx_in_block, + Mutation* delta_head, + rowid_t* row_idx, + DeltaFileWriter* writer, + DeltaStats* delta_stats); + + enum State { + kInitialized, + kStarted, + kFinished + }; + State state_; + + TabletMetadata* tablet_metadata_; + const Schema schema_; + std::shared_ptr cur_drs_metadata_; + const BloomFilterSizing bloom_sizing_; + const size_t target_rowset_size_; + + gscoped_ptr cur_writer_; + + // A delta writer to store the undos for each DRS + gscoped_ptr cur_undo_writer_; + gscoped_ptr cur_undo_delta_stats; + // a delta writer to store the redos for each DRS + gscoped_ptr cur_redo_writer_; + gscoped_ptr cur_redo_delta_stats; + BlockId cur_undo_ds_block_id_; + BlockId cur_redo_ds_block_id_; + + uint64_t row_idx_in_cur_drs_; + + // True when we are allowed to roll. We can only roll when the delta writers + // and data writers are aligned (i.e. just after we've appended a new block of data). + bool can_roll_; + + // RowSetMetadata objects for diskrowsets which have been successfully + // written out. + RowSetMetadataVector written_drs_metas_; + + int64_t written_count_; + uint64_t written_size_; + + // Syncs and closes all outstanding blocks when the rolling writer is + // destroyed. 
+ fs::ScopedWritableBlockCloser block_closer_; + + DISALLOW_COPY_AND_ASSIGN(RollingDiskRowSetWriter); +}; + +//////////////////////////////////////////////////////////// +// DiskRowSet +//////////////////////////////////////////////////////////// + +class MajorDeltaCompaction; +class RowSetColumnUpdater; + +class DiskRowSet : public RowSet { + public: + static const char *kMinKeyMetaEntryName; + static const char *kMaxKeyMetaEntryName; + + // Open a rowset from disk. + // If successful, sets *rowset to the newly open rowset + static Status Open(const std::shared_ptr& rowset_metadata, + log::LogAnchorRegistry* log_anchor_registry, + std::shared_ptr *rowset, + const std::shared_ptr& parent_tracker = + std::shared_ptr()); + + //////////////////////////////////////////////////////////// + // "Management" functions + //////////////////////////////////////////////////////////// + + // Flush all accumulated delta data to disk. + Status FlushDeltas() OVERRIDE; + + // Perform delta store minor compaction. + // This compacts the delta files down to a single one. + // If there is already only a single delta file, this does nothing. + Status MinorCompactDeltaStores() OVERRIDE; + + //////////////////////////////////////////////////////////// + // RowSet implementation + //////////////////////////////////////////////////////////// + + //////////////////// + // Updates + //////////////////// + + // Update the given row. + // 'key' should be the key portion of the row -- i.e a contiguous + // encoding of the key columns. + Status MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &update, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB* result) OVERRIDE; + + Status CheckRowPresent(const RowSetKeyProbe &probe, + bool *present, + ProbeStats* stats) const OVERRIDE; + + //////////////////// + // Read functions. 
+ //////////////////// + virtual Status NewRowIterator(const Schema *projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const OVERRIDE; + + virtual Status NewCompactionInput(const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const OVERRIDE; + + // Count the number of rows in this rowset. + Status CountRows(rowid_t *count) const OVERRIDE; + + // See RowSet::GetBounds(...) + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const OVERRIDE; + + // Estimate the number of bytes on-disk for the base data. + uint64_t EstimateBaseDataDiskSize() const; + + // Estimate the number of bytes on-disk for the delta stores. + uint64_t EstimateDeltaDiskSize() const; + + // Estimate the total number of bytes on-disk, excluding the bloom files and the ad hoc index. + // TODO Offer a version that has the real total disk space usage. + uint64_t EstimateOnDiskSize() const OVERRIDE; + + size_t DeltaMemStoreSize() const OVERRIDE; + + bool DeltaMemStoreEmpty() const OVERRIDE; + + int64_t MinUnflushedLogIndex() const OVERRIDE; + + size_t CountDeltaStores() const; + + double DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) const OVERRIDE; + + // Major compacts all the delta files for all the columns. 
+ Status MajorCompactDeltaStores(); + + boost::mutex *compact_flush_lock() OVERRIDE { + return &compact_flush_lock_; + } + + DeltaTracker *delta_tracker() { + return DCHECK_NOTNULL(delta_tracker_.get()); + } + + std::shared_ptr metadata() OVERRIDE { + return rowset_metadata_; + } + + std::string ToString() const OVERRIDE { + return rowset_metadata_->ToString(); + } + + virtual Status DebugDump(std::vector *out = NULL) OVERRIDE; + + private: + FRIEND_TEST(TestRowSet, TestRowSetUpdate); + FRIEND_TEST(TestRowSet, TestDMSFlush); + FRIEND_TEST(TestCompaction, TestOneToOne); + + friend class CompactionInput; + friend class Tablet; + + DiskRowSet(std::shared_ptr rowset_metadata, + log::LogAnchorRegistry* log_anchor_registry, + std::shared_ptr parent_tracker); + + Status Open(); + + // Create a new major delta compaction object to compact the specified columns. + Status NewMajorDeltaCompaction(const std::vector& col_ids, + gscoped_ptr* out) const; + + // Major compacts all the delta files for the specified columns. + Status MajorCompactDeltaStoresWithColumnIds(const std::vector& col_ids); + + std::shared_ptr rowset_metadata_; + + bool open_; + + log::LogAnchorRegistry* log_anchor_registry_; + + std::shared_ptr parent_tracker_; + + // Base data for this rowset. + mutable percpu_rwlock component_lock_; + std::shared_ptr base_data_; + gscoped_ptr delta_tracker_; + + // Lock governing this rowset's inclusion in a compact/flush. If locked, + // no other compactor will attempt to include this rowset. 
+ boost::mutex compact_flush_lock_; + + DISALLOW_COPY_AND_ASSIGN(DiskRowSet); +}; + +} // namespace tablet +} // namespace kudu + +#endif // KUDU_TABLET_DISKROWSET_H_ diff --git a/src/kudu/tablet/local_tablet_writer.h b/src/kudu/tablet/local_tablet_writer.h new file mode 100644 index 000000000000..3a6a6450c1c6 --- /dev/null +++ b/src/kudu/tablet/local_tablet_writer.h @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_LOCAL_TABLET_WRITER_H +#define KUDU_TABLET_LOCAL_TABLET_WRITER_H + +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_operations.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/tablet/row_op.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/gutil/macros.h" + +namespace kudu { +namespace tablet { + +// Helper class to write directly into a local tablet, without going +// through TabletPeer, consensus, etc. +// +// This is useful for unit-testing the Tablet code paths with no consensus +// implementation or thread pools. 
+class LocalTabletWriter { + public: + struct Op { + Op(RowOperationsPB::Type type, + const KuduPartialRow* row) + : type(type), + row(row) { + } + + RowOperationsPB::Type type; + const KuduPartialRow* row; + }; + + explicit LocalTabletWriter(Tablet* tablet, + const Schema* client_schema) + : tablet_(tablet), + client_schema_(client_schema) { + CHECK(!client_schema->has_column_ids()); + CHECK_OK(SchemaToPB(*client_schema, req_.mutable_schema())); + } + + ~LocalTabletWriter() {} + + Status Insert(const KuduPartialRow& row) { + return Write(RowOperationsPB::INSERT, row); + } + + Status Delete(const KuduPartialRow& row) { + return Write(RowOperationsPB::DELETE, row); + } + + Status Update(const KuduPartialRow& row) { + return Write(RowOperationsPB::UPDATE, row); + } + + // Perform a write against the local tablet. + // Returns a bad Status if the applied operation had a per-row error. + Status Write(RowOperationsPB::Type type, + const KuduPartialRow& row) { + vector ops; + ops.push_back(Op(type, &row)); + return WriteBatch(ops); + } + + Status WriteBatch(const std::vector& ops) { + req_.mutable_row_operations()->Clear(); + RowOperationsPBEncoder encoder(req_.mutable_row_operations()); + + for (const Op& op : ops) { + encoder.Add(op.type, *op.row); + } + + tx_state_.reset(new WriteTransactionState(NULL, &req_, NULL)); + + RETURN_NOT_OK(tablet_->DecodeWriteOperations(client_schema_, tx_state_.get())); + RETURN_NOT_OK(tablet_->AcquireRowLocks(tx_state_.get())); + tablet_->StartTransaction(tx_state_.get()); + + // Create a "fake" OpId and set it in the TransactionState for anchoring. + tx_state_->mutable_op_id()->CopyFrom(consensus::MaximumOpId()); + tablet_->ApplyRowOperations(tx_state_.get()); + + tx_state_->ReleaseTxResultPB(&result_); + tx_state_->Commit(); + tx_state_->release_row_locks(); + tx_state_->ReleaseSchemaLock(); + + // Return the status of first failed op. 
+ int op_idx = 0; + for (const OperationResultPB& result : result_.ops()) { + if (result.has_failed_status()) { + return StatusFromPB(result.failed_status()) + .CloneAndPrepend(ops[op_idx].row->ToString()); + break; + } + op_idx++; + } + return Status::OK(); + } + + // Return the result of the last row operation run against the tablet. + const OperationResultPB& last_op_result() { + CHECK_GE(result_.ops_size(), 1); + return result_.ops(result_.ops_size() - 1); + } + + private: + Tablet* const tablet_; + const Schema* client_schema_; + + TxResultPB result_; + tserver::WriteRequestPB req_; + gscoped_ptr tx_state_; + + DISALLOW_COPY_AND_ASSIGN(LocalTabletWriter); +}; + + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_LOCAL_TABLET_WRITER_H */ diff --git a/src/kudu/tablet/lock_manager-test.cc b/src/kudu/tablet/lock_manager-test.cc new file mode 100644 index 000000000000..28a7a35c1f0f --- /dev/null +++ b/src/kudu/tablet/lock_manager-test.cc @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/tablet/lock_manager.h" +#include "kudu/util/env.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +using std::vector; +using std::shared_ptr; + +DEFINE_int32(num_test_threads, 10, "number of stress test client threads"); +DEFINE_int32(num_iterations, 1000, "number of iterations per client thread"); + +namespace kudu { +namespace tablet { + +static const TransactionState* kFakeTransaction = + reinterpret_cast(0xdeadbeef); + +class LockManagerTest : public KuduTest { + public: + void VerifyAlreadyLocked(const Slice& key) { + LockEntry *entry; + ASSERT_EQ(LockManager::LOCK_BUSY, + lock_manager_.TryLock(key, kFakeTransaction, LockManager::LOCK_EXCLUSIVE, &entry)); + } + + LockManager lock_manager_; +}; + +TEST_F(LockManagerTest, TestLockUnlockSingleRow) { + Slice key_a("a"); + ScopedRowLock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + ScopedRowLock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + ScopedRowLock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); +} + +// Test if the same transaction locks the same row multiple times. 
+TEST_F(LockManagerTest, TestMultipleLockSameRow) { + Slice key_a("a"); + ScopedRowLock first_lock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + ASSERT_EQ(LockManager::LOCK_ACQUIRED, first_lock.GetLockStatusForTests()); + VerifyAlreadyLocked(key_a); + + { + ScopedRowLock second_lock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + ASSERT_EQ(LockManager::LOCK_ACQUIRED, second_lock.GetLockStatusForTests()); + VerifyAlreadyLocked(key_a); + } + + ASSERT_EQ(LockManager::LOCK_ACQUIRED, first_lock.GetLockStatusForTests()); + VerifyAlreadyLocked(key_a); +} + +TEST_F(LockManagerTest, TestLockUnlockMultipleRows) { + Slice key_a("a"), key_b("b"); + for (int i = 0; i < 3; ++i) { + ScopedRowLock l1(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + ScopedRowLock l2(&lock_manager_, kFakeTransaction, key_b, LockManager::LOCK_EXCLUSIVE); + VerifyAlreadyLocked(key_a); + VerifyAlreadyLocked(key_b); + } +} + +TEST_F(LockManagerTest, TestRelockSameRow) { + Slice key_a("a"); + ScopedRowLock row_lock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + VerifyAlreadyLocked(key_a); +} + +TEST_F(LockManagerTest, TestMoveLock) { + // Acquire a lock. + Slice key_a("a"); + ScopedRowLock row_lock(&lock_manager_, kFakeTransaction, key_a, LockManager::LOCK_EXCLUSIVE); + ASSERT_TRUE(row_lock.acquired()); + + // Move it to a new instance. 
+ ScopedRowLock moved_lock(row_lock.Pass()); + ASSERT_TRUE(moved_lock.acquired()); + ASSERT_FALSE(row_lock.acquired()); +} + +class LmTestResource { + public: + explicit LmTestResource(const Slice* id) + : id_(id), + owner_(0), + is_owned_(false) { + } + + const Slice* id() const { + return id_; + } + + void acquire(uint64_t tid) { + boost::unique_lock lock(lock_); + CHECK(!is_owned_); + CHECK_EQ(0, owner_); + owner_ = tid; + is_owned_ = true; + } + + void release(uint64_t tid) { + boost::unique_lock lock(lock_); + CHECK(is_owned_); + CHECK_EQ(tid, owner_); + owner_ = 0; + is_owned_ = false; + } + + private: + DISALLOW_COPY_AND_ASSIGN(LmTestResource); + + const Slice* id_; + boost::mutex lock_; + uint64_t owner_; + bool is_owned_; +}; + +class LmTestThread { + public: + LmTestThread(LockManager* manager, vector keys, + const vector resources) + : manager_(manager), keys_(std::move(keys)), resources_(resources) {} + + void Start() { + CHECK_OK(kudu::Thread::Create("test", "test", &LmTestThread::Run, this, &thread_)); + } + + void Run() { + tid_ = Env::Default()->gettid(); + const TransactionState* my_txn = reinterpret_cast(tid_); + + std::sort(keys_.begin(), keys_.end()); + for (int i = 0; i < FLAGS_num_iterations; i++) { + std::vector > locks; + // TODO: We don't have an API for multi-row + for (const Slice* key : keys_) { + locks.push_back(shared_ptr( + new ScopedRowLock(manager_, my_txn, + *key, LockManager::LOCK_EXCLUSIVE))); + } + + for (LmTestResource* r : resources_) { + r->acquire(tid_); + } + for (LmTestResource* r : resources_) { + r->release(tid_); + } + } + } + + void Join() { + CHECK_OK(ThreadJoiner(thread_.get()). + warn_after_ms(1000). + warn_every_ms(5000). 
+ Join()); + thread_ = nullptr; + } + + private: + DISALLOW_COPY_AND_ASSIGN(LmTestThread); + LockManager* manager_; + vector keys_; + const vector resources_; + uint64_t tid_; + scoped_refptr thread_; +}; + +static void runPerformanceTest(const char *test_type, + vector > *threads) { + Stopwatch sw(Stopwatch::ALL_THREADS); + sw.start(); + for (const shared_ptr& t : *threads) { + t->Start(); + } + + for (const shared_ptr& t : *threads) { + t->Join(); + } + sw.stop(); + + float num_cycles = FLAGS_num_iterations; + num_cycles *= FLAGS_num_test_threads; + + float cycles_per_second = num_cycles / sw.elapsed().wall_seconds(); + float user_cpu_micros_per_cycle = + (sw.elapsed().user / 1000.0) / cycles_per_second; + float sys_cpu_micros_per_cycle = + (sw.elapsed().system / 1000.0) / cycles_per_second; + LOG(INFO) << "*** testing with " << FLAGS_num_test_threads << " threads, " + << FLAGS_num_iterations << " iterations."; + LOG(INFO) << test_type << " Lock/Unlock cycles per second: " + << cycles_per_second; + LOG(INFO) << test_type << " User CPU per lock/unlock cycle: " + << user_cpu_micros_per_cycle << "us"; + LOG(INFO) << test_type << " Sys CPU per lock/unlock cycle: " + << sys_cpu_micros_per_cycle << "us"; +} + +// Test running a bunch of threads at once that want an overlapping set of +// resources. 
+TEST_F(LockManagerTest, TestContention) { + Slice slice_a("a"); + LmTestResource resource_a(&slice_a); + Slice slice_b("b"); + LmTestResource resource_b(&slice_b); + Slice slice_c("c"); + LmTestResource resource_c(&slice_c); + vector > threads; + for (int i = 0; i < FLAGS_num_test_threads; ++i) { + vector resources; + if (i % 3 == 0) { + resources.push_back(&resource_a); + resources.push_back(&resource_b); + } else if (i % 3 == 1) { + resources.push_back(&resource_b); + resources.push_back(&resource_c); + } else { + resources.push_back(&resource_c); + resources.push_back(&resource_a); + } + vector keys; + for (vector::const_iterator r = resources.begin(); + r != resources.end(); ++r) { + keys.push_back((*r)->id()); + } + threads.push_back(shared_ptr( + new LmTestThread(&lock_manager_, keys, resources))); + } + runPerformanceTest("Contended", &threads); +} + +// Test running a bunch of threads at once that want different +// resources. +TEST_F(LockManagerTest, TestUncontended) { + vector slice_strings; + for (int i = 0; i < FLAGS_num_test_threads; i++) { + slice_strings.push_back(StringPrintf("slice%03d", i)); + } + vector slices; + for (int i = 0; i < FLAGS_num_test_threads; i++) { + slices.push_back(Slice(slice_strings[i])); + } + vector > resources; + for (int i = 0; i < FLAGS_num_test_threads; i++) { + resources.push_back( + shared_ptr(new LmTestResource(&slices[i]))); + } + vector > threads; + for (int i = 0; i < FLAGS_num_test_threads; ++i) { + vector k; + k.push_back(&slices[i]); + vector r; + r.push_back(resources[i].get()); + threads.push_back(shared_ptr( + new LmTestThread(&lock_manager_, k, r))); + } + runPerformanceTest("Uncontended", &threads); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/lock_manager.cc b/src/kudu/tablet/lock_manager.cc new file mode 100644 index 000000000000..0f17dfc50096 --- /dev/null +++ b/src/kudu/tablet/lock_manager.cc @@ -0,0 +1,399 @@ +// Licensed to the Apache Software Foundation (ASF) under 
one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/tablet/lock_manager.h" +#include "kudu/util/locks.h" +#include "kudu/util/semaphore.h" + +namespace kudu { +namespace tablet { + +class TransactionState; + +// ============================================================================ +// LockTable +// ============================================================================ + +// The entry returned to a thread which has taken a lock. +// Callers should generally use ScopedRowLock (see below). 
+class LockEntry { + public: + explicit LockEntry(const Slice& key) + : sem(1), + recursion_(0) { + key_hash_ = util_hash::CityHash64(reinterpret_cast(key.data()), key.size()); + key_ = key; + refs_ = 1; + } + + bool Equals(const Slice& key, uint64_t hash) const { + return key_hash_ == hash && key_ == key; + } + + std::string ToString() const { + return key_.ToDebugString(); + } + + // Mutex used by the LockManager + Semaphore sem; + int recursion_; + + private: + friend class LockTable; + friend class LockManager; + + void CopyKey() { + key_buf_.assign_copy(key_.data(), key_.size()); + key_ = Slice(key_buf_); + } + + // Pointer to the next entry in the same hash table bucket + LockEntry *ht_next_; + + // Hash of the key, used to lookup the hash table bucket + uint64_t key_hash_; + + // key of the entry, used to compare the entries + Slice key_; + + // number of users that are referencing this object + uint64_t refs_; + + // buffer of the key, allocated on insertion by CopyKey() + faststring key_buf_; + + // The transaction currently holding the lock + const TransactionState* holder_; +}; + +class LockTable { + private: + struct Bucket { + simple_spinlock lock; + // First entry chained from this bucket, or NULL if the bucket is empty. + LockEntry *chain_head; + Bucket() : chain_head(nullptr) {} + }; + + public: + LockTable() : mask_(0), size_(0), item_count_(0) { + Resize(); + } + + ~LockTable() { + // Sanity checks: The table shouldn't be destructed when there are any entries in it. 
+ DCHECK_EQ(0, NoBarrier_Load(&(item_count_))) << "There are some unreleased locks"; + for (size_t i = 0; i < size_; ++i) { + for (LockEntry *p = buckets_[i].chain_head; p != nullptr; p = p->ht_next_) { + DCHECK(p == nullptr) << "The entry " << p->ToString() << " was not released"; + } + } + } + + LockEntry *GetLockEntry(const Slice &key); + void ReleaseLockEntry(LockEntry *entry); + + private: + Bucket *FindBucket(uint64_t hash) const { + return &(buckets_[hash & mask_]); + } + + // Return a pointer to slot that points to a lock entry that + // matches key/hash. If there is no such lock entry, return a + // pointer to the trailing slot in the corresponding linked list. + LockEntry **FindSlot(Bucket *bucket, const Slice& key, uint64_t hash) const { + LockEntry **node = &(bucket->chain_head); + while (*node && !(*node)->Equals(key, hash)) { + node = &((*node)->ht_next_); + } + return node; + } + + // Return a pointer to slot that points to a lock entry that + // matches the specified 'entry'. + // If there is no such lock entry, NULL is returned. 
+ LockEntry **FindEntry(Bucket *bucket, LockEntry *entry) const { + for (LockEntry **node = &(bucket->chain_head); *node != nullptr; node = &((*node)->ht_next_)) { + if (*node == entry) { + return node; + } + } + return nullptr; + } + + void Resize(); + + private: + // table rwlock used as write on resize + percpu_rwlock lock_; + // size - 1 used to lookup the bucket (hash & mask_) + uint64_t mask_; + // number of buckets in the table + uint64_t size_; + // table buckets + gscoped_array buckets_; + // number of items in the table + base::subtle::Atomic64 item_count_; +}; + +LockEntry *LockTable::GetLockEntry(const Slice& key) { + auto new_entry = new LockEntry(key); + LockEntry *old_entry; + + { + boost::shared_lock table_rdlock(lock_.get_lock()); + Bucket *bucket = FindBucket(new_entry->key_hash_); + { + boost::lock_guard bucket_lock(bucket->lock); + LockEntry **node = FindSlot(bucket, new_entry->key_, new_entry->key_hash_); + old_entry = *node; + if (old_entry != nullptr) { + old_entry->refs_++; + } else { + new_entry->ht_next_ = nullptr; + new_entry->CopyKey(); + *node = new_entry; + } + } + } + + if (old_entry != nullptr) { + delete new_entry; + return old_entry; + } + + if (base::subtle::NoBarrier_AtomicIncrement(&item_count_, 1) > size_) { + boost::unique_lock table_wrlock(lock_, boost::try_to_lock); + // if we can't take the lock, means that someone else is resizing. 
+ // (The percpu_rwlock try_lock waits for readers to complete) + if (table_wrlock.owns_lock()) { + Resize(); + } + } + + return new_entry; +} + +void LockTable::ReleaseLockEntry(LockEntry *entry) { + bool removed = false; + { + boost::lock_guard table_rdlock(lock_.get_lock()); + Bucket *bucket = FindBucket(entry->key_hash_); + { + boost::lock_guard bucket_lock(bucket->lock); + LockEntry **node = FindEntry(bucket, entry); + if (node != nullptr) { + // ASSUMPTION: There are few updates, so locking the same row at the same time is rare + // TODO: Move out this if we're going with the TryLock + if (--entry->refs_ > 0) + return; + + *node = entry->ht_next_; + removed = true; + } + } + } + + DCHECK(removed) << "Unable to find LockEntry on release"; + base::subtle::NoBarrier_AtomicIncrement(&item_count_, -1); + delete entry; +} + +void LockTable::Resize() { + // Calculate a new table size + size_t new_size = 16; + while (new_size < item_count_) { + new_size <<= 1; + } + + if (PREDICT_FALSE(size_ >= new_size)) + return; + + // Allocate a new bucket list + gscoped_array new_buckets(new Bucket[new_size]); + size_t new_mask = new_size - 1; + + // Copy entries + for (size_t i = 0; i < size_; ++i) { + LockEntry *p = buckets_[i].chain_head; + while (p != nullptr) { + LockEntry *next = p->ht_next_; + + // Insert Entry + Bucket *bucket = &(new_buckets[p->key_hash_ & new_mask]); + p->ht_next_ = bucket->chain_head; + bucket->chain_head = p; + + p = next; + } + } + + // Swap the bucket + mask_ = new_mask; + size_ = new_size; + buckets_.swap(new_buckets); +} + +// ============================================================================ +// ScopedRowLock +// ============================================================================ + +ScopedRowLock::ScopedRowLock(LockManager *manager, + const TransactionState* tx, + const Slice &key, + LockManager::LockMode mode) + : manager_(DCHECK_NOTNULL(manager)), + acquired_(false) { + ls_ = manager_->Lock(key, tx, mode, &entry_); + + if 
(ls_ == LockManager::LOCK_ACQUIRED) { + acquired_ = true; + } else { + // the lock might already have been acquired by this transaction so + // simply check that we didn't get a LOCK_BUSY status (we should have waited) + CHECK_NE(ls_, LockManager::LOCK_BUSY); + } +} + +ScopedRowLock::ScopedRowLock(RValue other) { + TakeState(other.object); +} + +ScopedRowLock& ScopedRowLock::operator=(RValue other) { + TakeState(other.object); + return *this; +} + +void ScopedRowLock::TakeState(ScopedRowLock* other) { + manager_ = other->manager_; + acquired_ = other->acquired_; + entry_ = other->entry_; + ls_ = other->ls_; + + other->acquired_ = false; + other->entry_ = nullptr; +} + +ScopedRowLock::~ScopedRowLock() { + Release(); +} + +void ScopedRowLock::Release() { + if (entry_) { + manager_->Release(entry_, ls_); + acquired_ = false; + entry_ = nullptr; + } +} + +// ============================================================================ +// LockManager +// ============================================================================ + +LockManager::LockManager() + : locks_(new LockTable()) { +} + +LockManager::~LockManager() { + delete locks_; +} + +LockManager::LockStatus LockManager::Lock(const Slice& key, + const TransactionState* tx, + LockManager::LockMode mode, + LockEntry** entry) { + *entry = locks_->GetLockEntry(key); + + // We expect low contention, so just try to try_lock first. This is faster + // than a timed_lock, since we don't have to do a syscall to get the current + // time. + if (!(*entry)->sem.TryAcquire()) { + // If the current holder of this lock is the same transaction just return + // a LOCK_ALREADY_ACQUIRED status without actually acquiring the mutex. + // + // + // NOTE: This is not a problem for the current way locks are managed since + // they are obtained and released in bulk (all locks for a transaction are + // obtained and released at the same time). 
If at any time in the future + // we opt to perform more fine grained locking, possibly letting transactions + // release a portion of the locks they no longer need, this no longer is OK. + if (ANNOTATE_UNPROTECTED_READ((*entry)->holder_) == tx) { + // TODO: this is likely to be problematic even today: if you issue two + // UPDATEs for the same row in the same transaction, we can get: + // "deltamemstore.cc:74] Check failed: !mutation.exists() Already have an entry ..." + (*entry)->recursion_++; + return LOCK_ACQUIRED; + } + + // If we couldn't immediately acquire the lock, do a timed lock so we can + // warn if it takes a long time. + // TODO: would be nice to hook in some histogram metric about lock acquisition + // time. + int waited_seconds = 0; + while (!(*entry)->sem.TimedAcquire(MonoDelta::FromSeconds(1))) { + const TransactionState* cur_holder = ANNOTATE_UNPROTECTED_READ((*entry)->holder_); + LOG(WARNING) << "Waited " << (++waited_seconds) << " seconds to obtain row lock on key " + << key.ToDebugString() << " cur holder: " << cur_holder; + // TODO: add RPC trace annotation here. Above warning should also include an RPC + // trace ID. 
+ // TODO: would be nice to also include some info about the blocking transaction, + // but it's a bit tricky to do in a non-racy fashion (the other transaction may + // complete at any point) + } + } + + (*entry)->holder_ = tx; + return LOCK_ACQUIRED; +} + +LockManager::LockStatus LockManager::TryLock(const Slice& key, + const TransactionState* tx, + LockManager::LockMode mode, + LockEntry **entry) { + *entry = locks_->GetLockEntry(key); + bool locked = (*entry)->sem.TryAcquire(); + if (!locked) { + locks_->ReleaseLockEntry(*entry); + return LOCK_BUSY; + } + (*entry)->holder_ = tx; + return LOCK_ACQUIRED; +} + +void LockManager::Release(LockEntry *lock, LockStatus ls) { + DCHECK_NOTNULL(lock)->holder_ = nullptr; + if (ls == LOCK_ACQUIRED) { + if (lock->recursion_ > 0) { + lock->recursion_--; + } else { + lock->sem.Release(); + } + } + locks_->ReleaseLockEntry(lock); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/lock_manager.h b/src/kudu/tablet/lock_manager.h new file mode 100644 index 000000000000..6bfaa6762c8a --- /dev/null +++ b/src/kudu/tablet/lock_manager.h @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
#ifndef KUDU_TABLET_LOCK_MANAGER_H
#define KUDU_TABLET_LOCK_MANAGER_H

#include "kudu/gutil/macros.h"
#include "kudu/gutil/move.h"
#include "kudu/util/slice.h"

namespace kudu { namespace tablet {

class LockManager;
class LockTable;
class LockEntry;
class TransactionState;

// Super-simple lock manager implementation. This only supports exclusive
// locks, and makes no attempt to prevent deadlocks if a single thread
// takes multiple locks.
//
// In the future when we want to support multi-row transactions of some kind
// we'll have to implement a proper lock manager with all its trappings,
// but this should be enough for the single-row use case.
class LockManager {
 public:
  LockManager();
  ~LockManager();

  // Result of a lock attempt.
  enum LockStatus {
    LOCK_ACQUIRED = 0,  // the caller now holds the lock
    LOCK_BUSY = 1,      // TryLock only: another transaction holds the lock
  };

  enum LockMode {
    LOCK_EXCLUSIVE
  };

 private:
  // Locking is only exposed through ScopedRowLock (and the test fixture);
  // callers never call Lock/Release directly.
  friend class ScopedRowLock;
  friend class LockManagerTest;

  // Block until the lock on 'key' is acquired for 'tx' (re-entrant for the
  // same 'tx'); on return, *entry holds the table entry to pass to Release().
  LockStatus Lock(const Slice& key, const TransactionState* tx,
                  LockMode mode, LockEntry **entry);
  // Non-blocking variant: returns LOCK_BUSY instead of waiting.
  LockStatus TryLock(const Slice& key, const TransactionState* tx,
                     LockMode mode, LockEntry **entry);
  // Release a lock previously returned by Lock()/TryLock().
  // 'ls' must be the status that the acquisition returned.
  void Release(LockEntry *lock, LockStatus ls);

  // Owned hash table of outstanding lock entries.
  LockTable *locks_;

  DISALLOW_COPY_AND_ASSIGN(LockManager);
};


// Hold a lock on a given row, for the scope of this object.
// Usage:
//   {
//     ScopedRowLock(&manager, my_encoded_row_key, LOCK_EXCLUSIVE);
//     .. do stuff with the row ..
//   }
//   // lock is released when the object exits its scope.
//
// This class emulates C++11 move constructors and thus can be
// copied by using the special '.Pass()' function. For example:
//
//   void DoSomething(ScopedRowLock l) {
//     // l owns the lock and will release at the end of this function
//   }
//   ScopedRowLock my_lock(&manager, ...);
//   DoSomething(l.Pass());
//   CHECK(!l.acquired()); // doesn't own lock anymore, since it Pass()ed
class ScopedRowLock {
  MOVE_ONLY_TYPE_FOR_CPP_03(ScopedRowLock, RValue);
 public:

  // Construct an initially-unlocked lock holder.
  // You can later assign this to actually hold a lock using
  // the emulated move-constructor:
  //   ScopedRowLock l;
  //   l = ScopedRowLock(...); // use the ctor below
  // or
  //   l = other_row_lock.Pass();
  ScopedRowLock()
    : manager_(NULL),
      acquired_(false),
      entry_(NULL) {
  }

  // Lock row in the given LockManager. The 'key' slice must remain
  // valid and un-changed for the duration of this object's lifetime.
  ScopedRowLock(LockManager *manager, const TransactionState* ctx,
                const Slice &key, LockManager::LockMode mode);

  // Emulated Move constructor
  ScopedRowLock(RValue other); // NOLINT(runtime/explicit)
  ScopedRowLock& operator=(RValue other);

  // Release the lock early, before this object goes out of scope.
  // Safe to call on an unlocked/moved-from holder (no-op).
  void Release();

  // True iff this object currently owns the lock.
  bool acquired() const { return acquired_; }

  // Status returned by the underlying Lock() call; test-only accessor.
  LockManager::LockStatus GetLockStatusForTests() { return ls_; }

  ~ScopedRowLock();

 private:
  // Steal lock ownership from 'other' (move-emulation helper).
  void TakeState(ScopedRowLock* other);

  LockManager *manager_;

  bool acquired_;          // whether this holder owns the lock
  LockEntry *entry_;       // table entry; NULL when not holding
  LockManager::LockStatus ls_;  // status from the acquiring Lock() call
};

} // namespace tablet
} // namespace kudu
#endif
diff --git a/src/kudu/tablet/maintenance_manager-test.cc b/src/kudu/tablet/maintenance_manager-test.cc new file mode 100644 index 000000000000..5bb57660e39d --- /dev/null +++ b/src/kudu/tablet/maintenance_manager-test.cc @@ -0,0 +1,286 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +using kudu::tablet::MaintenanceManagerStatusPB; +using std::shared_ptr; +using std::vector; +using strings::Substitute; + +METRIC_DEFINE_entity(test); +METRIC_DEFINE_gauge_uint32(test, maintenance_ops_running, + "Number of Maintenance Operations Running", + kudu::MetricUnit::kMaintenanceOperations, + "The number of background maintenance operations currently running."); +METRIC_DEFINE_histogram(test, maintenance_op_duration, + "Maintenance Operation Duration", + kudu::MetricUnit::kSeconds, "", 60000000LU, 2); + +namespace kudu { + +const int kHistorySize = 4; + +class MaintenanceManagerTest : public KuduTest { + public: + MaintenanceManagerTest() { + test_tracker_ = MemTracker::CreateTracker(1000, "test"); + MaintenanceManager::Options options; + options.num_threads = 2; + options.polling_interval_ms = 1; + options.history_size = kHistorySize; + options.parent_mem_tracker = test_tracker_; + manager_.reset(new MaintenanceManager(options)); + manager_->Init(); + } + ~MaintenanceManagerTest() { + manager_->Shutdown(); + } + + protected: + shared_ptr 
test_tracker_; + shared_ptr manager_; +}; + +// Just create the MaintenanceManager and then shut it down, to make sure +// there are no race conditions there. +TEST_F(MaintenanceManagerTest, TestCreateAndShutdown) { +} + +enum TestMaintenanceOpState { + OP_DISABLED, + OP_RUNNABLE, + OP_RUNNING, + OP_FINISHED, +}; + +class TestMaintenanceOp : public MaintenanceOp { + public: + TestMaintenanceOp(const std::string& name, + IOUsage io_usage, + TestMaintenanceOpState state, + const shared_ptr& tracker) + : MaintenanceOp(name, io_usage), + state_change_cond_(&lock_), + state_(state), + consumption_(tracker, 500), + logs_retained_bytes_(0), + perf_improvement_(0), + metric_entity_(METRIC_ENTITY_test.Instantiate(&metric_registry_, "test")), + maintenance_op_duration_(METRIC_maintenance_op_duration.Instantiate(metric_entity_)), + maintenance_ops_running_(METRIC_maintenance_ops_running.Instantiate(metric_entity_, 0)) { + } + + virtual ~TestMaintenanceOp() {} + + virtual bool Prepare() OVERRIDE { + lock_guard guard(&lock_); + if (state_ != OP_RUNNABLE) { + return false; + } + state_ = OP_RUNNING; + state_change_cond_.Broadcast(); + DLOG(INFO) << "Prepared op " << name(); + return true; + } + + virtual void Perform() OVERRIDE { + DLOG(INFO) << "Performing op " << name(); + lock_guard guard(&lock_); + CHECK_EQ(OP_RUNNING, state_); + state_ = OP_FINISHED; + state_change_cond_.Broadcast(); + } + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE { + lock_guard guard(&lock_); + stats->set_runnable(state_ == OP_RUNNABLE); + stats->set_ram_anchored(consumption_.consumption()); + stats->set_logs_retained_bytes(logs_retained_bytes_); + stats->set_perf_improvement(perf_improvement_); + } + + void Enable() { + lock_guard guard(&lock_); + DCHECK((state_ == OP_DISABLED) || (state_ == OP_FINISHED)); + state_ = OP_RUNNABLE; + state_change_cond_.Broadcast(); + } + + void WaitForState(TestMaintenanceOpState state) { + lock_guard guard(&lock_); + while (true) { + if (state_ == 
state) { + return; + } + state_change_cond_.Wait(); + } + } + + bool WaitForStateWithTimeout(TestMaintenanceOpState state, int ms) { + MonoDelta to_wait = MonoDelta::FromMilliseconds(ms); + lock_guard guard(&lock_); + while (true) { + if (state_ == state) { + return true; + } + if (!state_change_cond_.TimedWait(to_wait)) { + return false; + } + } + } + + void set_ram_anchored(uint64_t ram_anchored) { + lock_guard guard(&lock_); + consumption_.Reset(ram_anchored); + } + + void set_logs_retained_bytes(uint64_t logs_retained_bytes) { + lock_guard guard(&lock_); + logs_retained_bytes_ = logs_retained_bytes; + } + + void set_perf_improvement(uint64_t perf_improvement) { + lock_guard guard(&lock_); + perf_improvement_ = perf_improvement; + } + + virtual scoped_refptr DurationHistogram() const OVERRIDE { + return maintenance_op_duration_; + } + + virtual scoped_refptr > RunningGauge() const OVERRIDE { + return maintenance_ops_running_; + } + + private: + Mutex lock_; + ConditionVariable state_change_cond_; + enum TestMaintenanceOpState state_; + ScopedTrackedConsumption consumption_; + uint64_t logs_retained_bytes_; + uint64_t perf_improvement_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + scoped_refptr maintenance_op_duration_; + scoped_refptr > maintenance_ops_running_; +}; + +// Create an op and wait for it to start running. Unregister it while it is +// running and verify that UnregisterOp waits for it to finish before +// proceeding. 
+TEST_F(MaintenanceManagerTest, TestRegisterUnregister) { + TestMaintenanceOp op1("1", MaintenanceOp::HIGH_IO_USAGE, OP_DISABLED, test_tracker_); + op1.set_ram_anchored(1001); + manager_->RegisterOp(&op1); + scoped_refptr thread; + CHECK_OK(Thread::Create("TestThread", "TestRegisterUnregister", + boost::bind(&TestMaintenanceOp::Enable, &op1), &thread)); + op1.WaitForState(OP_FINISHED); + manager_->UnregisterOp(&op1); + ThreadJoiner(thread.get()).Join(); +} + +// Test that we'll run an operation that doesn't improve performance when memory +// pressure gets high. +TEST_F(MaintenanceManagerTest, TestMemoryPressure) { + TestMaintenanceOp op("op", MaintenanceOp::HIGH_IO_USAGE, OP_RUNNABLE, test_tracker_); + op.set_ram_anchored(100); + manager_->RegisterOp(&op); + + // At first, we don't want to run this, since there is no perf_improvement. + CHECK_EQ(false, op.WaitForStateWithTimeout(OP_FINISHED, 20)); + + // set the ram_anchored by the high mem op so high that we'll have to run it. + scoped_refptr thread; + CHECK_OK(Thread::Create("TestThread", "MaintenanceManagerTest", + boost::bind(&TestMaintenanceOp::set_ram_anchored, &op, 1100), &thread)); + op.WaitForState(OP_FINISHED); + manager_->UnregisterOp(&op); + ThreadJoiner(thread.get()).Join(); +} + +// Test that ops are prioritized correctly when we add log retention. 
+TEST_F(MaintenanceManagerTest, TestLogRetentionPrioritization) { + manager_->Shutdown(); + + TestMaintenanceOp op1("op1", MaintenanceOp::LOW_IO_USAGE, OP_RUNNABLE, test_tracker_); + op1.set_ram_anchored(0); + op1.set_logs_retained_bytes(100); + + TestMaintenanceOp op2("op2", MaintenanceOp::HIGH_IO_USAGE, OP_RUNNABLE, test_tracker_); + op2.set_ram_anchored(100); + op2.set_logs_retained_bytes(100); + + TestMaintenanceOp op3("op3", MaintenanceOp::HIGH_IO_USAGE, OP_RUNNABLE, test_tracker_); + op3.set_ram_anchored(200); + op3.set_logs_retained_bytes(100); + + manager_->RegisterOp(&op1); + manager_->RegisterOp(&op2); + manager_->RegisterOp(&op3); + + // We want to do the low IO op first since it clears up some log retention. + ASSERT_EQ(&op1, manager_->FindBestOp()); + + manager_->UnregisterOp(&op1); + + // Low IO is taken care of, now we find the op clears the most log retention and ram. + ASSERT_EQ(&op3, manager_->FindBestOp()); + + manager_->UnregisterOp(&op3); + + ASSERT_EQ(&op2, manager_->FindBestOp()); + + manager_->UnregisterOp(&op2); +} + +// Test adding operations and make sure that the history of recently completed operations +// is correct in that it wraps around and doesn't grow. +TEST_F(MaintenanceManagerTest, TestCompletedOpsHistory) { + for (int i = 0; i < 5; i++) { + string name = Substitute("op$0", i); + TestMaintenanceOp op(name, MaintenanceOp::HIGH_IO_USAGE, OP_RUNNABLE, test_tracker_); + op.set_perf_improvement(1); + op.set_ram_anchored(100); + manager_->RegisterOp(&op); + + CHECK_EQ(true, op.WaitForStateWithTimeout(OP_FINISHED, 200)); + manager_->UnregisterOp(&op); + + MaintenanceManagerStatusPB status_pb; + manager_->GetMaintenanceManagerStatusDump(&status_pb); + // The size should be at most the history_size. + ASSERT_GE(kHistorySize, status_pb.completed_operations_size()); + // See that we have the right name, even if we wrap around. 
+ ASSERT_EQ(name, status_pb.completed_operations(i % 4).name()); + } +} + +} // namespace kudu diff --git a/src/kudu/tablet/maintenance_manager.cc b/src/kudu/tablet/maintenance_manager.cc new file mode 100644 index 000000000000..14877bcc4acf --- /dev/null +++ b/src/kudu/tablet/maintenance_manager.cc @@ -0,0 +1,412 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/maintenance_manager.h" + +#include +#include +#include +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/debug/trace_logging.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/thread.h" + +using std::pair; +using std::shared_ptr; +using strings::Substitute; + +DEFINE_int32(maintenance_manager_num_threads, 1, + "Size of the maintenance manager thread pool. Beyond a value of '1', one thread is " + "reserved for emergency flushes. 
For spinning disks, the number of threads should " + "not be above the number of devices."); +TAG_FLAG(maintenance_manager_num_threads, stable); + +DEFINE_int32(maintenance_manager_polling_interval_ms, 250, + "Polling interval for the maintenance manager scheduler, " + "in milliseconds."); +TAG_FLAG(maintenance_manager_polling_interval_ms, hidden); + +DEFINE_int32(maintenance_manager_history_size, 8, + "Number of completed operations the manager is keeping track of."); +TAG_FLAG(maintenance_manager_history_size, hidden); + +DEFINE_bool(enable_maintenance_manager, true, + "Enable the maintenance manager, runs compaction and tablet cleaning tasks."); +TAG_FLAG(enable_maintenance_manager, unsafe); + +namespace kudu { + +using kudu::tablet::MaintenanceManagerStatusPB; +using kudu::tablet::MaintenanceManagerStatusPB_CompletedOpPB; +using kudu::tablet::MaintenanceManagerStatusPB_MaintenanceOpPB; + +MaintenanceOpStats::MaintenanceOpStats() { + Clear(); +} + +void MaintenanceOpStats::Clear() { + valid_ = false; + runnable_ = false; + ram_anchored_ = 0; + logs_retained_bytes_ = 0; + perf_improvement_ = 0; +} + +MaintenanceOp::MaintenanceOp(std::string name, IOUsage io_usage) + : name_(std::move(name)), running_(0), io_usage_(io_usage) {} + +MaintenanceOp::~MaintenanceOp() { + CHECK(!manager_.get()) << "You must unregister the " << name_ + << " Op before destroying it."; +} + +void MaintenanceOp::Unregister() { + CHECK(manager_.get()) << "Op " << name_ << " was never registered."; + manager_->UnregisterOp(this); +} + +const MaintenanceManager::Options MaintenanceManager::DEFAULT_OPTIONS = { + 0, + 0, + 0, + shared_ptr(), +}; + +MaintenanceManager::MaintenanceManager(const Options& options) + : num_threads_(options.num_threads <= 0 ? + FLAGS_maintenance_manager_num_threads : options.num_threads), + cond_(&lock_), + shutdown_(false), + running_ops_(0), + polling_interval_ms_(options.polling_interval_ms <= 0 ? 
+ FLAGS_maintenance_manager_polling_interval_ms : + options.polling_interval_ms), + completed_ops_count_(0), + parent_mem_tracker_(!options.parent_mem_tracker ? + MemTracker::GetRootTracker() : options.parent_mem_tracker) { + CHECK_OK(ThreadPoolBuilder("MaintenanceMgr").set_min_threads(num_threads_) + .set_max_threads(num_threads_).Build(&thread_pool_)); + uint32_t history_size = options.history_size == 0 ? + FLAGS_maintenance_manager_history_size : + options.history_size; + completed_ops_.resize(history_size); +} + +MaintenanceManager::~MaintenanceManager() { + Shutdown(); +} + +Status MaintenanceManager::Init() { + RETURN_NOT_OK(Thread::Create("maintenance", "maintenance_scheduler", + boost::bind(&MaintenanceManager::RunSchedulerThread, this), + &monitor_thread_)); + return Status::OK(); +} + +void MaintenanceManager::Shutdown() { + { + lock_guard guard(&lock_); + if (shutdown_) { + return; + } + shutdown_ = true; + cond_.Broadcast(); + } + if (monitor_thread_.get()) { + CHECK_OK(ThreadJoiner(monitor_thread_.get()).Join()); + monitor_thread_.reset(); + thread_pool_->Shutdown(); + } +} + +void MaintenanceManager::RegisterOp(MaintenanceOp* op) { + lock_guard guard(&lock_); + CHECK(!op->manager_.get()) << "Tried to register " << op->name() + << ", but it was already registered."; + pair val + (ops_.insert(OpMapTy::value_type(op, MaintenanceOpStats()))); + CHECK(val.second) + << "Tried to register " << op->name() + << ", but it already exists in ops_."; + op->manager_ = shared_from_this(); + op->cond_.reset(new ConditionVariable(&lock_)); + VLOG_AND_TRACE("maintenance", 1) << "Registered " << op->name(); +} + +void MaintenanceManager::UnregisterOp(MaintenanceOp* op) { + { + lock_guard guard(&lock_); + CHECK(op->manager_.get() == this) << "Tried to unregister " << op->name() + << ", but it is not currently registered with this maintenance manager."; + auto iter = ops_.find(op); + CHECK(iter != ops_.end()) << "Tried to unregister " << op->name() + << ", but it was 
never registered"; + // While the op is running, wait for it to be finished. + if (iter->first->running_ > 0) { + VLOG_AND_TRACE("maintenance", 1) << "Waiting for op " << op->name() << " to finish so " + << "we can unregister it."; + } + while (iter->first->running_ > 0) { + op->cond_->Wait(); + iter = ops_.find(op); + CHECK(iter != ops_.end()) << "Tried to unregister " << op->name() + << ", but another thread unregistered it while we were " + << "waiting for it to complete"; + } + ops_.erase(iter); + } + LOG(INFO) << "Unregistered op " << op->name(); + op->cond_.reset(); + // Remove the op's shared_ptr reference to us. This might 'delete this'. + op->manager_.reset(); +} + +void MaintenanceManager::RunSchedulerThread() { + MonoDelta polling_interval = MonoDelta::FromMilliseconds(polling_interval_ms_); + + unique_lock guard(&lock_); + while (true) { + // Loop until we are shutting down or it is time to run another op. + cond_.TimedWait(polling_interval); + if (shutdown_) { + VLOG_AND_TRACE("maintenance", 1) << "Shutting down maintenance manager."; + return; + } + + // Find the best op. + MaintenanceOp* op = FindBestOp(); + if (!op) { + VLOG_AND_TRACE("maintenance", 2) << "No maintenance operations look worth doing."; + continue; + } + + // Prepare the maintenance operation. + op->running_++; + running_ops_++; + guard.unlock(); + bool ready = op->Prepare(); + guard.lock(); + if (!ready) { + LOG(INFO) << "Prepare failed for " << op->name() + << ". Re-running scheduler."; + op->running_--; + op->cond_->Signal(); + continue; + } + + // Run the maintenance operation. + Status s = thread_pool_->SubmitFunc(boost::bind( + &MaintenanceManager::LaunchOp, this, op)); + CHECK(s.ok()); + } +} + +// Finding the best operation goes through four filters: +// - If there's an Op that we can run quickly that frees log retention, we run it. 
+// - If we've hit the overall process memory limit (note: this includes memory that the Ops cannot +// free), we run the Op with the highest RAM usage. +// - If there are Ops that retain logs, we run the one that has the highest retention (and if many +// qualify, then we run the one that also frees up the most RAM). +// - Finally, if there's nothing else that we really need to do, we run the Op that will improve +// performance the most. +// +// The reason it's done this way is that we want to prioritize limiting the amount of resources we +// hold on to. Low IO Ops go first since we can quickly run them, then we can look at memory usage. +// Reversing those can starve the low IO Ops when the system is under intense memory pressure. +// +// In the third priority we're at a point where nothing's urgent and there's nothing we can run +// quickly. +// TODO We currently optimize for freeing log retention but we could consider having some sort of +// sliding priority between log retention and RAM usage. For example, is an Op that frees +// 128MB of log retention and 12MB of RAM always better than an op that frees 12MB of log retention +// and 128MB of RAM? Maybe a more holistic approach would be better. +MaintenanceOp* MaintenanceManager::FindBestOp() { + TRACE_EVENT0("maintenance", "MaintenanceManager::FindBestOp"); + + if (!FLAGS_enable_maintenance_manager) { + VLOG_AND_TRACE("maintenance", 1) << "Maintenance manager is disabled. 
Doing nothing"; + return nullptr; + } + size_t free_threads = num_threads_ - running_ops_; + if (free_threads == 0) { + VLOG_AND_TRACE("maintenance", 1) << "there are no free threads, so we can't run anything."; + return nullptr; + } + + int64_t low_io_most_logs_retained_bytes = 0; + MaintenanceOp* low_io_most_logs_retained_bytes_op = nullptr; + + uint64_t most_mem_anchored = 0; + MaintenanceOp* most_mem_anchored_op = nullptr; + + int64_t most_logs_retained_bytes = 0; + int64_t most_logs_retained_bytes_ram_anchored = 0; + MaintenanceOp* most_logs_retained_bytes_op = nullptr; + + double best_perf_improvement = 0; + MaintenanceOp* best_perf_improvement_op = nullptr; + for (OpMapTy::value_type &val : ops_) { + MaintenanceOp* op(val.first); + MaintenanceOpStats& stats(val.second); + // Update op stats. + stats.Clear(); + op->UpdateStats(&stats); + if (!stats.valid() || !stats.runnable()) { + continue; + } + if (stats.logs_retained_bytes() > low_io_most_logs_retained_bytes && + op->io_usage_ == MaintenanceOp::LOW_IO_USAGE) { + low_io_most_logs_retained_bytes_op = op; + low_io_most_logs_retained_bytes = stats.logs_retained_bytes(); + } + + if (stats.ram_anchored() > most_mem_anchored) { + most_mem_anchored_op = op; + most_mem_anchored = stats.ram_anchored(); + } + // We prioritize ops that can free more logs, but when it's the same we pick the one that + // also frees up the most memory. 
+ if (stats.logs_retained_bytes() > 0 && + (stats.logs_retained_bytes() > most_logs_retained_bytes || + (stats.logs_retained_bytes() == most_logs_retained_bytes && + stats.ram_anchored() > most_logs_retained_bytes_ram_anchored))) { + most_logs_retained_bytes_op = op; + most_logs_retained_bytes = stats.logs_retained_bytes(); + most_logs_retained_bytes_ram_anchored = stats.ram_anchored(); + } + if ((!best_perf_improvement_op) || + (stats.perf_improvement() > best_perf_improvement)) { + best_perf_improvement_op = op; + best_perf_improvement = stats.perf_improvement(); + } + } + + // Look at ops that we can run quickly that free up log retention. + if (low_io_most_logs_retained_bytes_op) { + if (low_io_most_logs_retained_bytes > 0) { + VLOG_AND_TRACE("maintenance", 1) + << "Performing " << low_io_most_logs_retained_bytes_op->name() << ", " + << "because it can free up more logs " + << "at " << low_io_most_logs_retained_bytes + << " bytes with a low IO cost"; + return low_io_most_logs_retained_bytes_op; + } + } + + // Look at free memory. If it is dangerously low, we must select something + // that frees memory-- the op with the most anchored memory. + double capacity_pct; + if (parent_mem_tracker_->AnySoftLimitExceeded(&capacity_pct)) { + if (!most_mem_anchored_op) { + string msg = StringPrintf("we have exceeded our soft memory limit " + "(current capacity is %.2f%%). However, there are no ops currently " + "runnable which would free memory.", capacity_pct); + LOG(INFO) << msg; + return nullptr; + } + VLOG_AND_TRACE("maintenance", 1) << "we have exceeded our soft memory limit " + << "(current capacity is " << capacity_pct << "%). 
Running the op " + << "which anchors the most memory: " << most_mem_anchored_op->name(); + return most_mem_anchored_op; + } + + if (most_logs_retained_bytes_op) { + VLOG_AND_TRACE("maintenance", 1) + << "Performing " << most_logs_retained_bytes_op->name() << ", " + << "because it can free up more logs " << "at " << most_logs_retained_bytes + << " bytes"; + return most_logs_retained_bytes_op; + } + + if (best_perf_improvement_op) { + if (best_perf_improvement > 0) { + VLOG_AND_TRACE("maintenance", 1) << "Performing " << best_perf_improvement_op->name() << ", " + << "because it had the best perf_improvement score, " + << "at " << best_perf_improvement; + return best_perf_improvement_op; + } + } + return nullptr; +} + +void MaintenanceManager::LaunchOp(MaintenanceOp* op) { + MonoTime start_time(MonoTime::Now(MonoTime::FINE)); + op->RunningGauge()->Increment(); + LOG_TIMING(INFO, Substitute("running $0", op->name())) { + TRACE_EVENT1("maintenance", "MaintenanceManager::LaunchOp", + "name", op->name()); + op->Perform(); + } + op->RunningGauge()->Decrement(); + MonoTime end_time(MonoTime::Now(MonoTime::FINE)); + MonoDelta delta(end_time.GetDeltaSince(start_time)); + lock_guard guard(&lock_); + + CompletedOp& completed_op = completed_ops_[completed_ops_count_ % completed_ops_.size()]; + completed_op.name = op->name(); + completed_op.duration = delta; + completed_op.start_mono_time = start_time; + completed_ops_count_++; + + op->DurationHistogram()->Increment(delta.ToMilliseconds()); + + running_ops_--; + op->running_--; + op->cond_->Signal(); +} + +void MaintenanceManager::GetMaintenanceManagerStatusDump(MaintenanceManagerStatusPB* out_pb) { + DCHECK(out_pb != nullptr); + lock_guard guard(&lock_); + MaintenanceOp* best_op = FindBestOp(); + for (MaintenanceManager::OpMapTy::value_type& val : ops_) { + MaintenanceManagerStatusPB_MaintenanceOpPB* op_pb = out_pb->add_registered_operations(); + MaintenanceOp* op(val.first); + MaintenanceOpStats& stat(val.second); + 
op_pb->set_name(op->name()); + op_pb->set_running(op->running()); + op_pb->set_runnable(stat.runnable()); + op_pb->set_ram_anchored_bytes(stat.ram_anchored()); + op_pb->set_logs_retained_bytes(stat.logs_retained_bytes()); + op_pb->set_perf_improvement(stat.perf_improvement()); + + if (best_op == op) { + out_pb->mutable_best_op()->CopyFrom(*op_pb); + } + } + + for (const CompletedOp& completed_op : completed_ops_) { + if (!completed_op.name.empty()) { + MaintenanceManagerStatusPB_CompletedOpPB* completed_pb = out_pb->add_completed_operations(); + completed_pb->set_name(completed_op.name); + completed_pb->set_duration_millis(completed_op.duration.ToMilliseconds()); + + MonoDelta delta(MonoTime::Now(MonoTime::FINE).GetDeltaSince(completed_op.start_mono_time)); + completed_pb->set_secs_since_start(delta.ToSeconds()); + } + } +} + +} // namespace kudu diff --git a/src/kudu/tablet/maintenance_manager.h b/src/kudu/tablet/maintenance_manager.h new file mode 100644 index 000000000000..1d1a6094f933 --- /dev/null +++ b/src/kudu/tablet/maintenance_manager.h @@ -0,0 +1,280 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_MAINTENANCE_MANAGER_H +#define KUDU_TABLET_MAINTENANCE_MANAGER_H + +#include + +#include +#include +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/condition_variable.h" +#include "kudu/util/monotime.h" +#include "kudu/util/mutex.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadpool.h" + +namespace kudu { + +template +class AtomicGauge; +class Histogram; +class MaintenanceManager; +class MemTracker; + +class MaintenanceOpStats { + public: + MaintenanceOpStats(); + + // Zero all stats. They are invalid until the first setter is called. + void Clear(); + + bool runnable() const { + DCHECK(valid_); + return runnable_; + } + + void set_runnable(bool runnable) { + UpdateLastModified(); + runnable_ = runnable; + } + + uint64_t ram_anchored() const { + DCHECK(valid_); + return ram_anchored_; + } + + void set_ram_anchored(uint64_t ram_anchored) { + UpdateLastModified(); + ram_anchored_ = ram_anchored; + } + + int64_t logs_retained_bytes() const { + DCHECK(valid_); + return logs_retained_bytes_; + } + + void set_logs_retained_bytes(int64_t logs_retained_bytes) { + UpdateLastModified(); + logs_retained_bytes_ = logs_retained_bytes; + } + + double perf_improvement() const { + DCHECK(valid_); + return perf_improvement_; + } + + void set_perf_improvement(double perf_improvement) { + UpdateLastModified(); + perf_improvement_ = perf_improvement; + } + + const MonoTime& last_modified() const { + DCHECK(valid_); + return last_modified_; + } + + bool valid() const { + return valid_; + } + + private: + void UpdateLastModified() { + valid_ = true; + last_modified_ = MonoTime::Now(MonoTime::FINE); + } + + // True if these stats are valid. + bool valid_; + + // True if this op can be run now. + bool runnable_; + + // The approximate amount of memory that not doing this operation keeps + // around. 
This number is used to decide when to start freeing memory, so it + // should be fairly accurate. May be 0. + uint64_t ram_anchored_; + + // The approximate amount of disk space that not doing this operation keeps us from GCing from + // the logs. May be 0. + int64_t logs_retained_bytes_; + + // The estimated performance improvement-- how good it is to do this on some + // absolute scale (yet TBD). + double perf_improvement_; + + // The last time that the stats were modified. + MonoTime last_modified_; +}; + +// MaintenanceOp objects represent background operations that the +// MaintenanceManager can schedule. Once a MaintenanceOp is registered, the +// manager will periodically poll it for statistics. The registrant is +// responsible for managing the memory associated with the MaintenanceOp object. +// Op objects should be unregistered before being de-allocated. +class MaintenanceOp { + public: + friend class MaintenanceManager; + + // General indicator of how much IO the Op will use. + enum IOUsage { + LOW_IO_USAGE, // Low impact operations like removing a file, updating metadata. + HIGH_IO_USAGE // Everything else. + }; + + explicit MaintenanceOp(std::string name, IOUsage io_usage); + virtual ~MaintenanceOp(); + + // Unregister this op, if it is currently registered. + void Unregister(); + + // Update the op statistics. This will be called every scheduling period + // (about a few times a second), so it should not be too expensive. It's + // possible for the returned statistics to be invalid; the caller should + // call MaintenanceOpStats::valid() before using them. This will be run + // under the MaintenanceManager lock. + virtual void UpdateStats(MaintenanceOpStats* stats) = 0; + + // Prepare to perform the operation. This will be run without holding the + // maintenance manager lock. It should be short, since it is run from the + // context of the maintenance op scheduler thread rather than a worker thread. 
+ // If this returns false, we will abort the operation. + virtual bool Prepare() = 0; + + // Perform the operation. This will be run without holding the maintenance + // manager lock, and may take a long time. + virtual void Perform() = 0; + + // Returns the histogram for this op that tracks duration. Cannot be NULL. + virtual scoped_refptr DurationHistogram() const = 0; + + // Returns the gauge for this op that tracks when this op is running. Cannot be NULL. + virtual scoped_refptr > RunningGauge() const = 0; + + uint32_t running() { return running_; } + + std::string name() const { return name_; } + + IOUsage io_usage() const { return io_usage_; } + + private: + DISALLOW_COPY_AND_ASSIGN(MaintenanceOp); + + // The name of the operation. Op names must be unique. + const std::string name_; + + // The number of times that this op is currently running. + uint32_t running_; + + // Condition variable which the UnregisterOp function can wait on. + // + // Note: 'cond_' is used with the MaintenanceManager's mutex. As such, + // it only exists when the op is registered. + gscoped_ptr cond_; + + // The MaintenanceManager with which this op is registered, or null + // if it is not registered. + std::shared_ptr manager_; + + IOUsage io_usage_; +}; + +struct MaintenanceOpComparator { + bool operator() (const MaintenanceOp* lhs, + const MaintenanceOp* rhs) const { + return lhs->name().compare(rhs->name()) < 0; + } +}; + +// Holds the information regarding a recently completed operation. +struct CompletedOp { + std::string name; + MonoDelta duration; + MonoTime start_mono_time; +}; + +// The MaintenanceManager manages the scheduling of background operations such +// as flushes or compactions. It runs these operations in the background, in a +// thread pool. It uses information provided in MaintenanceOpStats objects to +// decide which operations, if any, to run. 
+class MaintenanceManager : public std::enable_shared_from_this { + public: + struct Options { + int32_t num_threads; + int32_t polling_interval_ms; + uint32_t history_size; + std::shared_ptr parent_mem_tracker; + }; + + explicit MaintenanceManager(const Options& options); + ~MaintenanceManager(); + + Status Init(); + void Shutdown(); + + // Register an op with the manager. + void RegisterOp(MaintenanceOp* op); + + // Unregister an op with the manager. + // If the Op is currently running, it will not be interrupted. However, this + // function will block until the Op is finished. + void UnregisterOp(MaintenanceOp* op); + + void GetMaintenanceManagerStatusDump(tablet::MaintenanceManagerStatusPB* out_pb); + + static const Options DEFAULT_OPTIONS; + + private: + FRIEND_TEST(MaintenanceManagerTest, TestLogRetentionPrioritization); + typedef std::map OpMapTy; + + void RunSchedulerThread(); + + // find the best op, or null if there is nothing we want to run + MaintenanceOp* FindBestOp(); + + void LaunchOp(MaintenanceOp* op); + + const int32_t num_threads_; + OpMapTy ops_; // registered operations + Mutex lock_; + scoped_refptr monitor_thread_; + gscoped_ptr thread_pool_; + ConditionVariable cond_; + bool shutdown_; + uint64_t running_ops_; + int32_t polling_interval_ms_; + // Vector used as a circular buffer for recently completed ops. Elements need to be added at + // the completed_ops_count_ % the vector's size and then the count needs to be incremented. 
+ std::vector completed_ops_; + int64_t completed_ops_count_; + std::shared_ptr parent_mem_tracker_; + + DISALLOW_COPY_AND_ASSIGN(MaintenanceManager); +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/major_delta_compaction-test.cc b/src/kudu/tablet/major_delta_compaction-test.cc new file mode 100644 index 000000000000..1f5c69d6d827 --- /dev/null +++ b/src/kudu/tablet/major_delta_compaction-test.cc @@ -0,0 +1,364 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/common/partial_row.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/tablet/delta_compaction.h" +#include "kudu/tablet/local_tablet_writer.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/tablet/diskrowset-test-base.h" +#include "kudu/util/test_util.h" +#include "kudu/gutil/algorithm.h" + +using std::shared_ptr; +using std::unordered_set; + +namespace kudu { +namespace tablet { + +using strings::Substitute; + +class TestMajorDeltaCompaction : public KuduRowSetTest { + public: + TestMajorDeltaCompaction() : + KuduRowSetTest(Schema({ ColumnSchema("key", STRING), + ColumnSchema("val1", INT32), + ColumnSchema("val2", STRING), + ColumnSchema("val3", INT32), + ColumnSchema("val4", STRING) }, 1)), + mvcc_(scoped_refptr( + server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp))) { + } + + struct ExpectedRow { + string key; + int32_t val1; + string val2; + int32_t val3; + string val4; + + string Formatted() const { + return strings::Substitute( + "(string key=$0, int32 val1=$1, string val2=$2, int32 val3=$3, string val4=$4)", + key, val1, val2, val3, val4); + } + }; + + virtual void SetUp() OVERRIDE { + KuduRowSetTest::SetUp(); + } + + // Insert data into tablet_, setting up equivalent state in + // expected_state_. 
+ void WriteTestTablet(int nrows) { + LocalTabletWriter writer(tablet().get(), &client_schema_); + KuduPartialRow ins_row(&client_schema_); + + for (int i = 0; i < nrows; i++) { + ExpectedRow row; + row.key = StringPrintf("hello %08d", i); + row.val1 = i * 2; + row.val2 = StringPrintf("a %08d", i * 2); + row.val3 = i * 10; + row.val4 = StringPrintf("b %08d", i * 10); + + int col = 0; + CHECK_OK(ins_row.SetString(col++, row.key)); + CHECK_OK(ins_row.SetInt32(col++, row.val1)); + CHECK_OK(ins_row.SetString(col++, row.val2)); + CHECK_OK(ins_row.SetInt32(col++, row.val3)); + CHECK_OK(ins_row.SetString(col++, row.val4)); + ASSERT_OK_FAST(writer.Insert(ins_row)); + expected_state_.push_back(row); + } + } + + // Delete the data that was inserted and clear the expected state, end to front. + void DeleteRows(int nrows) { + LocalTabletWriter writer(tablet().get(), &client_schema_); + KuduPartialRow del_row(&client_schema_); + + for (int i = nrows - 1; i >= 0; i--) { + CHECK_OK(del_row.SetString(0, expected_state_[i].key)); + ASSERT_OK(writer.Delete(del_row)); + expected_state_.pop_back(); + } + ASSERT_EQ(expected_state_.size(), 0); + } + + // Update the data, touching only odd or even rows based on the + // value of 'even'. + // Makes corresponding updates in expected_state_. + void UpdateRows(int nrows, bool even) { + LocalTabletWriter writer(tablet().get(), &client_schema_); + KuduPartialRow prow(&client_schema_); + for (int idx = 0; idx < nrows; idx++) { + ExpectedRow* row = &expected_state_[idx]; + if ((idx % 2 == 0) == even) { + // Set key + CHECK_OK(prow.SetString(0, row->key)); + + // Update the data + row->val1 *= 2; + row->val3 *= 2; + row->val4.append("[U]"); + + // Apply the updates. + CHECK_OK(prow.SetInt32(1, row->val1)); + CHECK_OK(prow.SetInt32(3, row->val3)); + CHECK_OK(prow.SetString(4, row->val4)); + ASSERT_OK(writer.Update(prow)); + } + } + } + + // Verify that the data seen by scanning the tablet matches the data in + // expected_state_. 
+ void VerifyData() { + MvccSnapshot snap(*tablet()->mvcc_manager()); + VerifyDataWithMvccAndExpectedState(snap, expected_state_); + } + + void VerifyDataWithMvccAndExpectedState(MvccSnapshot& snap, + const vector& passed_expected_state) { + gscoped_ptr row_iter; + ASSERT_OK(tablet()->NewRowIterator(client_schema_, snap, + Tablet::UNORDERED, &row_iter)); + ASSERT_OK(row_iter->Init(nullptr)); + + vector results; + ASSERT_OK(IterateToStringList(row_iter.get(), &results)); + VLOG(1) << "Results of iterating over the updated materialized rows:"; + ASSERT_EQ(passed_expected_state.size(), results.size()); + for (int i = 0; i < results.size(); i++) { + SCOPED_TRACE(Substitute("row $0", i)); + const string& str = results[i]; + const ExpectedRow& expected = passed_expected_state[i]; + ASSERT_EQ(expected.Formatted(), str); + } + } + + MvccManager mvcc_; + vector expected_state_; +}; + +// Tests a major delta compaction run. +// Verifies that the output rowset accurately reflects the mutations, but keeps the +// unchanged columns intact. +TEST_F(TestMajorDeltaCompaction, TestCompact) { + const int kNumRows = 100; + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); + ASSERT_OK(tablet()->Flush()); + + vector > all_rowsets; + tablet()->GetRowSetsForTests(&all_rowsets); + + shared_ptr rs = all_rowsets.front(); + + vector col_ids_to_compact = { schema_.column_id(1), + schema_.column_id(3), + schema_.column_id(4) }; + + // We'll run a few rounds of update/compact to make sure + // that we don't get into some funny state (regression test for + // an earlier bug). + // We first compact all the columns, then for each other round we do one less, + // so that we test a few combinations. + for (int i = 0; i < 3; i++) { + SCOPED_TRACE(Substitute("Update/compact round $0", i)); + // Update the even rows and verify. + ASSERT_NO_FATAL_FAILURE(UpdateRows(kNumRows, false)); + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + // Flush the deltas, make sure data stays the same. 
+ ASSERT_OK(tablet()->FlushBiggestDMS()); + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + // Update the odd rows and flush deltas + ASSERT_NO_FATAL_FAILURE(UpdateRows(kNumRows, true)); + ASSERT_OK(tablet()->FlushBiggestDMS()); + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + // Major compact some columns. + vector col_ids; + for (int col_index = 0; col_index < col_ids_to_compact.size() - i; col_index++) { + col_ids.push_back(col_ids_to_compact[col_index]); + } + ASSERT_OK(tablet()->DoMajorDeltaCompaction(col_ids, rs)); + + ASSERT_NO_FATAL_FAILURE(VerifyData()); + } +} + +// Verify that we do issue UNDO files and that we can read them. +TEST_F(TestMajorDeltaCompaction, TestUndos) { + const int kNumRows = 100; + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); + ASSERT_OK(tablet()->Flush()); + + vector > all_rowsets; + tablet()->GetRowSetsForTests(&all_rowsets); + + shared_ptr rs = all_rowsets.front(); + + MvccSnapshot snap(*tablet()->mvcc_manager()); + + // Verify the old data and grab a copy of the old state. + ASSERT_NO_FATAL_FAILURE(VerifyDataWithMvccAndExpectedState(snap, expected_state_)); + vector old_state(expected_state_.size()); + std::copy(expected_state_.begin(), expected_state_.end(), old_state.begin()); + + // Flush the DMS, make sure we still see the old data. + ASSERT_NO_FATAL_FAILURE(UpdateRows(kNumRows, false)); + ASSERT_OK(tablet()->FlushBiggestDMS()); + ASSERT_NO_FATAL_FAILURE(VerifyDataWithMvccAndExpectedState(snap, old_state)); + + // Major compact, check we still have the old data. + vector col_ids_to_compact = { schema_.column_id(1), + schema_.column_id(3), + schema_.column_id(4) }; + ASSERT_OK(tablet()->DoMajorDeltaCompaction(col_ids_to_compact, rs)); + ASSERT_NO_FATAL_FAILURE(VerifyDataWithMvccAndExpectedState(snap, old_state)); + + // Test adding three updates per row to three REDO files. 
+ for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + ASSERT_NO_FATAL_FAILURE(UpdateRows(kNumRows, false)); + } + ASSERT_OK(tablet()->FlushBiggestDMS()); + } + + // To complicate things further, only major compact two columns, then verify we can read the old + // and the new data. + col_ids_to_compact.pop_back(); + ASSERT_OK(tablet()->DoMajorDeltaCompaction(col_ids_to_compact, rs)); + ASSERT_NO_FATAL_FAILURE(VerifyDataWithMvccAndExpectedState(snap, old_state)); + ASSERT_NO_FATAL_FAILURE(VerifyData()); +} + +// Test that the delete REDO mutations are written back and not filtered out. +TEST_F(TestMajorDeltaCompaction, TestCarryDeletesOver) { + const int kNumRows = 100; + + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); + ASSERT_OK(tablet()->Flush()); + + vector > all_rowsets; + tablet()->GetRowSetsForTests(&all_rowsets); + shared_ptr rs = all_rowsets.front(); + + ASSERT_NO_FATAL_FAILURE(UpdateRows(kNumRows, false)); + ASSERT_OK(tablet()->FlushBiggestDMS()); + + MvccSnapshot updates_snap(*tablet()->mvcc_manager()); + vector old_state(expected_state_.size()); + std::copy(expected_state_.begin(), expected_state_.end(), old_state.begin()); + + ASSERT_NO_FATAL_FAILURE(DeleteRows(kNumRows)); + ASSERT_OK(tablet()->FlushBiggestDMS()); + + vector col_ids_to_compact = { schema_.column_id(4) }; + ASSERT_OK(tablet()->DoMajorDeltaCompaction(col_ids_to_compact, rs)); + + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + ASSERT_NO_FATAL_FAILURE(VerifyDataWithMvccAndExpectedState(updates_snap, old_state)); +} + +// Verify that reinserts only happen in the MRS and not down into the DRS. This test serves as a +// way to document how things work, and if they change then we'll know that our assumptions have +// changed. +TEST_F(TestMajorDeltaCompaction, TestReinserts) { + const int kNumRows = 100; + + // Reinsert all the rows directly in the MRS. + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); // 1st batch. 
+ ASSERT_NO_FATAL_FAILURE(DeleteRows(kNumRows)); // Delete 1st batch. + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); // 2nd batch. + ASSERT_OK(tablet()->Flush()); + + // Update those rows, we'll try to read them at the end. + ASSERT_NO_FATAL_FAILURE(UpdateRows(kNumRows, false)); // Update 2nd batch. + vector old_state(expected_state_.size()); + std::copy(expected_state_.begin(), expected_state_.end(), old_state.begin()); + MvccSnapshot second_batch_inserts(*tablet()->mvcc_manager()); + + vector > all_rowsets; + tablet()->GetRowSetsForTests(&all_rowsets); + ASSERT_EQ(1, all_rowsets.size()); + + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + // Delete the rows (will go into the DMS) then reinsert them (will go in a new MRS), then flush + // the DMS with the deletes so that we can major compact them. + ASSERT_NO_FATAL_FAILURE(DeleteRows(kNumRows)); // Delete 2nd batch. + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); // 3rd batch. + ASSERT_OK(tablet()->FlushBiggestDMS()); + + // At this point, here's the layout (the 1st batch was discarded during the first flush): + // MRS: 3rd batch of inserts. + // RS1: UNDO DF: Deletes for the 2nd batch. + // DS: Base data for the 2nd batch. + // REDO DF: Updates and deletes for the 2nd. + + // Now we'll push some of the updates down. + shared_ptr rs = all_rowsets.front(); + vector col_ids_to_compact = { schema_.column_id(4) }; + ASSERT_OK(tablet()->DoMajorDeltaCompaction(col_ids_to_compact, rs)); + + // The data we'll see here is the 3rd batch of inserts, doesn't have updates. + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + // Test that the 3rd batch of inserts goes into a new RS, even though it's the same row keys. + ASSERT_OK(tablet()->Flush()); + all_rowsets.clear(); + tablet()->GetRowSetsForTests(&all_rowsets); + ASSERT_EQ(2, all_rowsets.size()); + + // Verify the 3rd batch. + ASSERT_NO_FATAL_FAILURE(VerifyData()); + + // Verify the updates in the second batch are still readable, from the first RS. 
+ ASSERT_NO_FATAL_FAILURE(VerifyDataWithMvccAndExpectedState(second_batch_inserts, old_state)); +} + +// Verify that we won't schedule a major compaction when files are just composed of deletes. +TEST_F(TestMajorDeltaCompaction, TestJustDeletes) { + const int kNumRows = 100; + + ASSERT_NO_FATAL_FAILURE(WriteTestTablet(kNumRows)); + ASSERT_OK(tablet()->Flush()); + ASSERT_NO_FATAL_FAILURE(DeleteRows(kNumRows)); + ASSERT_OK(tablet()->FlushBiggestDMS()); + + shared_ptr rs; + ASSERT_EQ(0, + tablet()->GetPerfImprovementForBestDeltaCompact(RowSet::MAJOR_DELTA_COMPACTION, &rs)); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/memrowset-test.cc b/src/kudu/tablet/memrowset-test.cc new file mode 100644 index 000000000000..4247dc0d9e4a --- /dev/null +++ b/src/kudu/tablet/memrowset-test.cc @@ -0,0 +1,523 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/common/row.h" +#include "kudu/common/scan_spec.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/memrowset.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_macros.h" + +DECLARE_bool(enable_data_block_fsync); +DEFINE_int32(roundtrip_num_rows, 10000, + "Number of rows to use for the round-trip test"); +DEFINE_int32(num_scan_passes, 1, + "Number of passes to run the scan portion of the round-trip test"); + +namespace kudu { +namespace tablet { + +using consensus::OpId; +using log::LogAnchorRegistry; +using std::shared_ptr; + +class TestMemRowSet : public ::testing::Test { + public: + TestMemRowSet() + : op_id_(consensus::MaximumOpId()), + log_anchor_registry_(new LogAnchorRegistry()), + schema_(CreateSchema()), + key_schema_(schema_.CreateKeyProjection()), + mvcc_(scoped_refptr( + server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp))) { + FLAGS_enable_data_block_fsync = false; // Keep unit tests fast. + } + + static Schema CreateSchema() { + SchemaBuilder builder; + CHECK_OK(builder.AddKeyColumn("key", STRING)); + CHECK_OK(builder.AddColumn("val", UINT32)); + return builder.Build(); + } + + protected: + // Check that the given row in the memrowset contains the given data. 
+ void CheckValue(const shared_ptr &mrs, string key, + const string &expected_row) { + gscoped_ptr iter(mrs->NewIterator()); + ASSERT_OK(iter->Init(nullptr)); + + Slice keystr_slice(key); + Slice key_slice(reinterpret_cast(&keystr_slice), sizeof(Slice)); + + bool exact; + ASSERT_OK(iter->SeekAtOrAfter(key_slice, &exact)); + ASSERT_TRUE(exact) << "unable to seek to key " << key; + ASSERT_TRUE(iter->HasNext()); + + vector out; + ASSERT_OK(IterateToStringList(iter.get(), &out, 1)); + ASSERT_EQ(1, out.size()); + ASSERT_EQ(expected_row, out[0]) << "bad result for key " << key; + } + + Status CheckRowPresent(const MemRowSet &mrs, + const string &key, bool *present) { + RowBuilder rb(key_schema_); + rb.AddString(Slice(key)); + RowSetKeyProbe probe(rb.row()); + ProbeStats stats; + + return mrs.CheckRowPresent(probe, present, &stats); + } + + Status InsertRows(MemRowSet *mrs, int num_rows) { + RowBuilder rb(schema_); + char keybuf[256]; + for (uint32_t i = 0; i < num_rows; i++) { + rb.Reset(); + snprintf(keybuf, sizeof(keybuf), "hello %d", i); + rb.AddString(Slice(keybuf)); + rb.AddUint32(i); + RETURN_NOT_OK(mrs->Insert(Timestamp(i), rb.row(), op_id_)); + } + + return Status::OK(); + } + + Status InsertRow(MemRowSet *mrs, const string &key, uint32_t val) { + ScopedTransaction tx(&mvcc_); + RowBuilder rb(schema_); + rb.AddString(key); + rb.AddUint32(val); + tx.StartApplying(); + Status s = mrs->Insert(tx.timestamp(), rb.row(), op_id_); + tx.Commit(); + return s; + } + + Status UpdateRow(MemRowSet *mrs, + const string &key, + uint32_t new_val, + OperationResultPB* result) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + + mutation_buf_.clear(); + RowChangeListEncoder update(&mutation_buf_); + update.AddColumnUpdate(schema_.column(1), schema_.column_id(1), &new_val); + + RowBuilder rb(key_schema_); + rb.AddString(Slice(key)); + RowSetKeyProbe probe(rb.row()); + ProbeStats stats; + Status s = mrs->MutateRow(tx.timestamp(), + probe, + RowChangeList(mutation_buf_), + 
op_id_, + &stats, + result); + tx.Commit(); + return s; + } + + Status DeleteRow(MemRowSet *mrs, const string &key, OperationResultPB* result) { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + + mutation_buf_.clear(); + RowChangeListEncoder update(&mutation_buf_); + update.SetToDelete(); + + RowBuilder rb(key_schema_); + rb.AddString(Slice(key)); + RowSetKeyProbe probe(rb.row()); + ProbeStats stats; + Status s = mrs->MutateRow(tx.timestamp(), + probe, + RowChangeList(mutation_buf_), + op_id_, + &stats, + result); + tx.Commit(); + return s; + } + + int ScanAndCount(MemRowSet* mrs, const MvccSnapshot& snap) { + gscoped_ptr iter(mrs->NewIterator(&schema_, snap)); + CHECK_OK(iter->Init(NULL)); + + Arena arena(1024, 256*1024); + RowBlock block(schema_, 100, &arena); + int fetched = 0; + while (iter->HasNext()) { + CHECK_OK(iter->NextBlock(&block)); + fetched += block.selection_vector()->CountSelected(); + } + return fetched; + } + + OpId op_id_; + scoped_refptr log_anchor_registry_; + + faststring mutation_buf_; + const Schema schema_; + const Schema key_schema_; + MvccManager mvcc_; +}; + + +TEST_F(TestMemRowSet, TestInsertAndIterate) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + ASSERT_OK(InsertRow(mrs.get(), "hello world", 12345)); + ASSERT_OK(InsertRow(mrs.get(), "goodbye world", 54321)); + + ASSERT_EQ(2, mrs->entry_count()); + + gscoped_ptr iter(mrs->NewIterator()); + ASSERT_OK(iter->Init(nullptr)); + + // The first row returned from the iterator should + // be "goodbye" because 'g' sorts before 'h' + ASSERT_TRUE(iter->HasNext()); + MRSRow row = iter->GetCurrentRow(); + EXPECT_EQ("(string key=goodbye world, uint32 val=54321)", schema_.DebugRow(row)); + + // Next row should be 'hello world' + ASSERT_TRUE(iter->Next()); + ASSERT_TRUE(iter->HasNext()); + row = iter->GetCurrentRow(); + EXPECT_EQ("(string key=hello world, uint32 val=12345)", schema_.DebugRow(row)); + + ASSERT_FALSE(iter->Next()); + ASSERT_FALSE(iter->HasNext()); 
+} + +TEST_F(TestMemRowSet, TestInsertAndIterateCompoundKey) { + + SchemaBuilder builder; + ASSERT_OK(builder.AddKeyColumn("key1", STRING)); + ASSERT_OK(builder.AddKeyColumn("key2", INT32)); + ASSERT_OK(builder.AddColumn("val", UINT32)); + Schema compound_key_schema = builder.Build(); + + shared_ptr mrs(new MemRowSet(0, compound_key_schema, log_anchor_registry_.get())); + + RowBuilder rb(compound_key_schema); + { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + rb.AddString(string("hello world")); + rb.AddInt32(1); + rb.AddUint32(12345); + Status row1 = mrs->Insert(tx.timestamp(), rb.row(), op_id_); + ASSERT_OK(row1); + tx.Commit(); + } + + { + ScopedTransaction tx2(&mvcc_); + tx2.StartApplying(); + rb.Reset(); + rb.AddString(string("goodbye world")); + rb.AddInt32(2); + rb.AddUint32(54321); + Status row2 = mrs->Insert(tx2.timestamp(), rb.row(), op_id_); + ASSERT_OK(row2); + tx2.Commit(); + } + + { + ScopedTransaction tx3(&mvcc_); + tx3.StartApplying(); + rb.Reset(); + rb.AddString(string("goodbye world")); + rb.AddInt32(1); + rb.AddUint32(12345); + Status row3 = mrs->Insert(tx3.timestamp(), rb.row(), op_id_); + ASSERT_OK(row3); + tx3.Commit(); + } + + ASSERT_EQ(3, mrs->entry_count()); + + gscoped_ptr iter(mrs->NewIterator()); + ASSERT_OK(iter->Init(nullptr)); + + // The first row returned from the iterator should + // be "goodbye" (row3) sorted on the second key + ASSERT_TRUE(iter->HasNext()); + MRSRow row = iter->GetCurrentRow(); + EXPECT_EQ("(string key1=goodbye world, int32 key2=1, uint32 val=12345)", + compound_key_schema.DebugRow(row)); + + // Next row should be "goodbye" (row2) + ASSERT_TRUE(iter->Next()); + ASSERT_TRUE(iter->HasNext()); + row = iter->GetCurrentRow(); + EXPECT_EQ("(string key1=goodbye world, int32 key2=2, uint32 val=54321)", + compound_key_schema.DebugRow(row)); + + // Next row should be 'hello world' (row1) + ASSERT_TRUE(iter->Next()); + ASSERT_TRUE(iter->HasNext()); + row = iter->GetCurrentRow(); + EXPECT_EQ("(string key1=hello 
world, int32 key2=1, uint32 val=12345)", + compound_key_schema.DebugRow(row)); + + ASSERT_FALSE(iter->Next()); + ASSERT_FALSE(iter->HasNext()); +} + +// Test that inserting duplicate key data fails with Status::AlreadyPresent +TEST_F(TestMemRowSet, TestInsertDuplicate) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + ASSERT_OK(InsertRow(mrs.get(), "hello world", 12345)); + Status s = InsertRow(mrs.get(), "hello world", 12345); + ASSERT_TRUE(s.IsAlreadyPresent()) << "bad status: " << s.ToString(); +} + +// Test for updating rows in memrowset +TEST_F(TestMemRowSet, TestUpdate) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + ASSERT_OK(InsertRow(mrs.get(), "hello world", 1)); + + // Validate insertion + CheckValue(mrs, "hello world", "(string key=hello world, uint32 val=1)"); + + // Update a key which exists. + OperationResultPB result; + ASSERT_OK(UpdateRow(mrs.get(), "hello world", 2, &result)); + ASSERT_EQ(1, result.mutated_stores_size()); + ASSERT_EQ(0L, result.mutated_stores(0).mrs_id()); + + // Validate the updated value + CheckValue(mrs, "hello world", "(string key=hello world, uint32 val=2)"); + + // Try to update a key which doesn't exist - should return NotFound + result.Clear(); + Status s = UpdateRow(mrs.get(), "does not exist", 3, &result); + ASSERT_TRUE(s.IsNotFound()) << "bad status: " << s.ToString(); + ASSERT_EQ(0, result.mutated_stores_size()); +} + +// Test which inserts many rows into memrowset and checks for their +// existence +TEST_F(TestMemRowSet, TestInsertCopiesToArena) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + ASSERT_OK(InsertRows(mrs.get(), 100)); + // Validate insertion + char keybuf[256]; + for (uint32_t i = 0; i < 100; i++) { + snprintf(keybuf, sizeof(keybuf), "hello %d", i); + CheckValue(mrs, keybuf, + StringPrintf("(string key=%s, uint32 val=%d)", keybuf, i)); + } +} + +TEST_F(TestMemRowSet, TestDelete) { + const char kRowKey[] = "hello 
world"; + bool present; + + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + // Insert row. + ASSERT_OK(InsertRow(mrs.get(), kRowKey, 1)); + MvccSnapshot snapshot_before_delete(mvcc_); + + // CheckRowPresent should return true + ASSERT_OK(CheckRowPresent(*mrs, kRowKey, &present)); + EXPECT_TRUE(present); + + // Delete it. + OperationResultPB result; + ASSERT_OK(DeleteRow(mrs.get(), kRowKey, &result)); + ASSERT_EQ(1, result.mutated_stores_size()); + ASSERT_EQ(0L, result.mutated_stores(0).mrs_id()); + + MvccSnapshot snapshot_after_delete(mvcc_); + + // CheckRowPresent should return false + ASSERT_OK(CheckRowPresent(*mrs, kRowKey, &present)); + EXPECT_FALSE(present); + + // Trying to Delete again or Update should get an error. + result.Clear(); + Status s = DeleteRow(mrs.get(), kRowKey, &result); + ASSERT_TRUE(s.IsNotFound()) << "Unexpected status: " << s.ToString(); + ASSERT_EQ(0, result.mutated_stores_size()); + + result.Clear(); + s = UpdateRow(mrs.get(), kRowKey, 12345, &result); + ASSERT_TRUE(s.IsNotFound()) << "Unexpected status: " << s.ToString(); + ASSERT_EQ(0, result.mutated_stores_size()); + + // Re-insert a new row with the same key. + ASSERT_OK(InsertRow(mrs.get(), kRowKey, 2)); + MvccSnapshot snapshot_after_reinsert(mvcc_); + + // CheckRowPresent should now return true + ASSERT_OK(CheckRowPresent(*mrs, kRowKey, &present)); + EXPECT_TRUE(present); + + // Verify the MVCC contents of the memrowset. + // NOTE: the REINSERT has timestamp 4 because of the two failed attempts + // at mutating the deleted row above -- each of them grabs a timestamp even + // though it doesn't actually make any successful mutations. + vector rows; + ASSERT_OK(mrs->DebugDump(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ("@1: row (string key=hello world, uint32 val=1) mutations=" + "[@2(DELETE), " + "@5(REINSERT (string key=hello world, uint32 val=2))]", + rows[0]); + + // Verify that iterating the rowset at the first snapshot shows the row. 
+ ASSERT_OK(DumpRowSet(*mrs, schema_, snapshot_before_delete, &rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ("(string key=hello world, uint32 val=1)", rows[0]); + + // Verify that iterating the rowset at the snapshot where it's deleted + // doesn't show the row. + ASSERT_OK(DumpRowSet(*mrs, schema_, snapshot_after_delete, &rows)); + ASSERT_EQ(0, rows.size()); + + // Verify that iterating the rowset after it's re-inserted shows the row. + ASSERT_OK(DumpRowSet(*mrs, schema_, snapshot_after_reinsert, &rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ("(string key=hello world, uint32 val=2)", rows[0]); +} + +// Test for basic operations. +// Can operate as a benchmark by setting --roundtrip_num_rows to a high value like 10M +TEST_F(TestMemRowSet, TestMemRowSetInsertCountAndScan) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + LOG_TIMING(INFO, "Inserting rows") { + ASSERT_OK(InsertRows(mrs.get(), FLAGS_roundtrip_num_rows)); + } + + LOG_TIMING(INFO, "Counting rows") { + int count = mrs->entry_count(); + ASSERT_EQ(FLAGS_roundtrip_num_rows, count); + } + + for (int i = 0; i < FLAGS_num_scan_passes; i++) { + LOG_TIMING(INFO, "Scanning rows where none are committed") { + ASSERT_EQ(0, ScanAndCount(mrs.get(), MvccSnapshot(Timestamp(0)))); + } + + LOG_TIMING(INFO, "Scanning rows where all are committed") { + ASSERT_EQ(FLAGS_roundtrip_num_rows, + ScanAndCount(mrs.get(), + MvccSnapshot(Timestamp(FLAGS_roundtrip_num_rows + 1)))); + } + } +} +// Test that scanning at past MVCC snapshots will hide rows which are +// not committed in that snapshot. 
+TEST_F(TestMemRowSet, TestInsertionMVCC) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + vector snapshots; + + // Insert 5 rows in tx 0 through 4 + for (uint32_t i = 0; i < 5; i++) { + { + ScopedTransaction tx(&mvcc_); + tx.StartApplying(); + RowBuilder rb(schema_); + char keybuf[256]; + rb.Reset(); + snprintf(keybuf, sizeof(keybuf), "tx%d", i); + rb.AddString(Slice(keybuf)); + rb.AddUint32(i); + ASSERT_OK_FAST(mrs->Insert(tx.timestamp(), rb.row(), op_id_)); + tx.Commit(); + } + + // Transaction is committed. Save the snapshot after this commit. + snapshots.push_back(MvccSnapshot(mvcc_)); + } + LOG(INFO) << "MemRowSet after inserts:"; + ASSERT_OK(mrs->DebugDump()); + + ASSERT_EQ(5, snapshots.size()); + for (int i = 0; i < 5; i++) { + SCOPED_TRACE(i); + // Each snapshot 'i' is taken after row 'i' was committed. + vector rows; + ASSERT_OK(kudu::tablet::DumpRowSet(*mrs, schema_, snapshots[i], &rows)); + ASSERT_EQ(1 + i, rows.size()); + string expected = StringPrintf("(string key=tx%d, uint32 val=%d)", i, i); + ASSERT_EQ(expected, rows[i]); + } +} + +// Test that updates respect MVCC -- i.e. that scanning with a past MVCC snapshot +// will yield old versions of a row which has been updated. +TEST_F(TestMemRowSet, TestUpdateMVCC) { + shared_ptr mrs(new MemRowSet(0, schema_, log_anchor_registry_.get())); + + // Insert a row ("myrow", 0) + ASSERT_OK(InsertRow(mrs.get(), "my row", 0)); + + vector snapshots; + // First snapshot is after insertion + snapshots.push_back(MvccSnapshot(mvcc_)); + + // Update the row 5 times (setting its int column to increasing ints 1-5) + for (uint32_t i = 1; i <= 5; i++) { + OperationResultPB result; + ASSERT_OK(UpdateRow(mrs.get(), "my row", i, &result)); + ASSERT_EQ(1, result.mutated_stores_size()); + ASSERT_EQ(0L, result.mutated_stores(0).mrs_id()); + + // Transaction is committed. Save the snapshot after this commit. 
+ snapshots.push_back(MvccSnapshot(mvcc_)); + } + + LOG(INFO) << "MemRowSet after updates:"; + ASSERT_OK(mrs->DebugDump()); + + // Validate that each snapshot returns the expected value + ASSERT_EQ(6, snapshots.size()); + for (int i = 0; i <= 5; i++) { + SCOPED_TRACE(i); + vector rows; + ASSERT_OK(kudu::tablet::DumpRowSet(*mrs, schema_, snapshots[i], &rows)); + ASSERT_EQ(1, rows.size()); + + string expected = StringPrintf("(string key=my row, uint32 val=%d)", i); + LOG(INFO) << "Reading with snapshot " << snapshots[i].ToString() << ": " + << rows[0]; + EXPECT_EQ(expected, rows[0]); + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/memrowset.cc b/src/kudu/tablet/memrowset.cc new file mode 100644 index 000000000000..55f9b3a650ab --- /dev/null +++ b/src/kudu/tablet/memrowset.cc @@ -0,0 +1,630 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/memrowset.h" + +#include +#include +#include +#include + +#include "kudu/codegen/compilation_manager.h" +#include "kudu/codegen/row_projector.h" +#include "kudu/common/common.pb.h" +#include "kudu/common/generic_iterators.h" +#include "kudu/common/row.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/tablet/compaction.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/mem_tracker.h" + +DEFINE_bool(mrs_use_codegen, true, "whether the memrowset should use code " + "generation for iteration"); +TAG_FLAG(mrs_use_codegen, hidden); + +using std::pair; +using std::shared_ptr; + +namespace kudu { namespace tablet { + +using consensus::OpId; +using log::LogAnchorRegistry; +using strings::Substitute; + +static const int kInitialArenaSize = 16; +static const int kMaxArenaBufferSize = 8*1024*1024; + +bool MRSRow::IsGhost() const { + bool is_ghost = false; + for (const Mutation *mut = header_->redo_head; + mut != nullptr; + mut = mut->next()) { + RowChangeListDecoder decoder(mut->changelist()); + Status s = decoder.Init(); + if (!PREDICT_TRUE(s.ok())) { + LOG(FATAL) << "Failed to decode: " << mut->changelist().ToString(*schema()) + << " (" << s.ToString() << ")"; + } + if (decoder.is_delete()) { + DCHECK(!is_ghost); + is_ghost = true; + } else if (decoder.is_reinsert()) { + DCHECK(is_ghost); + is_ghost = false; + } + } + return is_ghost; +} + +namespace { + +shared_ptr CreateMemTrackerForMemRowSet( + int64_t id, const shared_ptr& parent_tracker) { + string mem_tracker_id = Substitute("MemRowSet-$0", id); + return MemTracker::CreateTracker(-1, mem_tracker_id, parent_tracker); +} + +} // anonymous namespace + +MemRowSet::MemRowSet(int64_t id, + const Schema &schema, + LogAnchorRegistry* log_anchor_registry, + const shared_ptr& parent_tracker) + : id_(id), + schema_(schema), + parent_tracker_(parent_tracker), + 
mem_tracker_(CreateMemTrackerForMemRowSet(id, parent_tracker)), + allocator_(new MemoryTrackingBufferAllocator(HeapBufferAllocator::Get(), mem_tracker_)), + arena_(new ThreadSafeMemoryTrackingArena(kInitialArenaSize, kMaxArenaBufferSize, + allocator_)), + tree_(arena_), + debug_insert_count_(0), + debug_update_count_(0), + has_logged_throttling_(false), + anchorer_(log_anchor_registry, Substitute("MemRowSet-$0", id_)) { + CHECK(schema.has_column_ids()); + ANNOTATE_BENIGN_RACE(&debug_insert_count_, "insert count isnt accurate"); + ANNOTATE_BENIGN_RACE(&debug_update_count_, "update count isnt accurate"); +} + +MemRowSet::~MemRowSet() { + mem_tracker_->UnregisterFromParent(); +} + +Status MemRowSet::DebugDump(vector *lines) { + gscoped_ptr iter(NewIterator()); + RETURN_NOT_OK(iter->Init(NULL)); + while (iter->HasNext()) { + MRSRow row = iter->GetCurrentRow(); + LOG_STRING(INFO, lines) + << "@" << row.insertion_timestamp() << ": row " + << schema_.DebugRow(row) + << " mutations=" << Mutation::StringifyMutationList(schema_, row.header_->redo_head) + << std::endl; + iter->Next(); + } + + return Status::OK(); +} + + +Status MemRowSet::Insert(Timestamp timestamp, + const ConstContiguousRow& row, + const OpId& op_id) { + CHECK(row.schema()->has_column_ids()); + DCHECK_SCHEMA_EQ(schema_, *row.schema()); + + { + faststring enc_key_buf; + schema_.EncodeComparableKey(row, &enc_key_buf); + Slice enc_key(enc_key_buf); + + btree::PreparedMutation mutation(enc_key); + mutation.Prepare(&tree_); + + // TODO: for now, the key ends up stored doubly -- + // once encoded in the btree key, and again in the value + // (unencoded). + // That's not very memory-efficient! + + if (mutation.exists()) { + // It's OK for it to exist if it's just a "ghost" row -- i.e the + // row is deleted. + MRSRow ms_row(this, mutation.current_mutable_value()); + if (!ms_row.IsGhost()) { + return Status::AlreadyPresent("entry already present in memrowset"); + } + + // Insert a "reinsert" mutation. 
+ return Reinsert(timestamp, row, &ms_row); + } + + // Copy the non-encoded key onto the stack since we need + // to mutate it when we relocate its Slices into our arena. + DEFINE_MRSROW_ON_STACK(this, mrsrow, mrsrow_slice); + mrsrow.header_->insertion_timestamp = timestamp; + mrsrow.header_->redo_head = nullptr; + RETURN_NOT_OK(mrsrow.CopyRow(row, arena_.get())); + + CHECK(mutation.Insert(mrsrow_slice)) + << "Expected to be able to insert, since the prepared mutation " + << "succeeded!"; + } + + anchorer_.AnchorIfMinimum(op_id.index()); + + debug_insert_count_++; + return Status::OK(); +} + +Status MemRowSet::Reinsert(Timestamp timestamp, const ConstContiguousRow& row, MRSRow *ms_row) { + DCHECK_SCHEMA_EQ(schema_, *row.schema()); + + // TODO(perf): This path makes some unnecessary copies that could be reduced, + // but let's assume that REINSERT is really rare and code for clarity over speed + // here. + + // Make a copy of the row, and relocate any of its indirected data into + // our Arena. + DEFINE_MRSROW_ON_STACK(this, row_copy, row_copy_slice); + RETURN_NOT_OK(row_copy.CopyRow(row, arena_.get())); + + // Encode the REINSERT mutation from the relocated row copy. + faststring buf; + RowChangeListEncoder encoder(&buf); + encoder.SetToReinsert(row_copy.row_slice()); + + // Move the REINSERT mutation itself into our Arena. + Mutation *mut = Mutation::CreateInArena(arena_.get(), timestamp, encoder.as_changelist()); + + // Append the mutation into the row's mutation list. + // This function has "release" semantics which ensures that the memory writes + // for the mutation are fully published before any concurrent reader sees + // the appended mutation. 
+ mut->AppendToListAtomic(&ms_row->header_->redo_head); + return Status::OK(); +} + +Status MemRowSet::MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &delta, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB *result) { + { + btree::PreparedMutation mutation(probe.encoded_key_slice()); + mutation.Prepare(&tree_); + + if (!mutation.exists()) { + return Status::NotFound("not in memrowset"); + } + + MRSRow row(this, mutation.current_mutable_value()); + + // If the row exists, it may still be a "ghost" row -- i.e a row + // that's been deleted. If that's the case, we should treat it as + // NotFound. + if (row.IsGhost()) { + return Status::NotFound("not in memrowset (ghost)"); + } + + // Append to the linked list of mutations for this row. + Mutation *mut = Mutation::CreateInArena(arena_.get(), timestamp, delta); + + // This function has "release" semantics which ensures that the memory writes + // for the mutation are fully published before any concurrent reader sees + // the appended mutation. + mut->AppendToListAtomic(&row.header_->redo_head); + + MemStoreTargetPB* target = result->add_mutated_stores(); + target->set_mrs_id(id_); + } + + stats->mrs_consulted++; + + anchorer_.AnchorIfMinimum(op_id.index()); + debug_update_count_++; + return Status::OK(); +} + +Status MemRowSet::CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + ProbeStats* stats) const { + // Use a PreparedMutation here even though we don't plan to mutate. Even though + // this takes a lock rather than an optimistic copy, it should be a very short + // critical section, and this call is only made on updates, which are rare. 
+ + stats->mrs_consulted++; + + btree::PreparedMutation mutation(probe.encoded_key_slice()); + mutation.Prepare(const_cast(&tree_)); + + if (!mutation.exists()) { + *present = false; + return Status::OK(); + } + + // TODO(perf): using current_mutable_value() will actually change the data's + // version number, even though we're not going to do any mutation. This would + // make concurrent readers retry, even though they don't have to (we aren't + // actually mutating anything here!) + MRSRow row(this, mutation.current_mutable_value()); + + // If the row exists, it may still be a "ghost" row -- i.e a row + // that's been deleted. If that's the case, we should treat it as + // NotFound. + *present = !row.IsGhost(); + return Status::OK(); +} + +MemRowSet::Iterator *MemRowSet::NewIterator(const Schema *projection, + const MvccSnapshot &snap) const { + return new MemRowSet::Iterator(shared_from_this(), tree_.NewIterator(), + projection, snap); +} + +MemRowSet::Iterator *MemRowSet::NewIterator() const { + // TODO: can we kill this function? should be only used by tests? 
+ return NewIterator(&schema(), MvccSnapshot::CreateSnapshotIncludingAllTransactions()); +} + +Status MemRowSet::NewRowIterator(const Schema *projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const { + out->reset(NewIterator(projection, snap)); + return Status::OK(); +} + +Status MemRowSet::NewCompactionInput(const Schema* projection, + const MvccSnapshot& snap, + gscoped_ptr* out) const { + out->reset(CompactionInput::Create(*this, projection, snap)); + return Status::OK(); +} + +Status MemRowSet::GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const { + return Status::NotSupported(""); +} + +// Virtual interface allows two possible row projector implementations +class MemRowSet::Iterator::MRSRowProjector { + public: + typedef RowProjector::ProjectionIdxMapping ProjectionIdxMapping; + virtual ~MRSRowProjector() {} + virtual Status ProjectRowForRead(const MRSRow& src_row, + RowBlockRow* dst_row, + Arena* arena) = 0; + virtual Status ProjectRowForRead(const ConstContiguousRow& src_row, + RowBlockRow* dst_row, + Arena* arena) = 0; + virtual const vector& base_cols_mapping() const = 0; + virtual const vector& adapter_cols_mapping() const = 0; + virtual Status Init() = 0; +}; + +namespace { + +typedef MemRowSet::Iterator::MRSRowProjector MRSRowProjector; + +template +class MRSRowProjectorImpl : public MRSRowProjector { + public: + explicit MRSRowProjectorImpl(gscoped_ptr actual) + : actual_(actual.Pass()) {} + + Status Init() override { return actual_->Init(); } + + Status ProjectRowForRead(const MRSRow& src_row, RowBlockRow* dst_row, + Arena* arena) override { + return actual_->ProjectRowForRead(src_row, dst_row, arena); + } + Status ProjectRowForRead(const ConstContiguousRow& src_row, + RowBlockRow* dst_row, + Arena* arena) override { + return actual_->ProjectRowForRead(src_row, dst_row, arena); + } + + const vector& base_cols_mapping() const override { + return actual_->base_cols_mapping(); + } + const vector& adapter_cols_mapping() const 
override { + return actual_->adapter_cols_mapping(); + } + + private: + gscoped_ptr actual_; +}; + +// If codegen is enabled, then generates a codegen::RowProjector; +// otherwise makes a regular one. +gscoped_ptr GenerateAppropriateProjector( + const Schema* base, const Schema* projection) { + // Attempt code-generated implementation + if (FLAGS_mrs_use_codegen) { + gscoped_ptr actual; + if (codegen::CompilationManager::GetSingleton()->RequestRowProjector( + base, projection, &actual)) { + return gscoped_ptr( + new MRSRowProjectorImpl(actual.Pass())); + } + } + + // Proceed with default implementation + gscoped_ptr actual(new RowProjector(base, projection)); + return gscoped_ptr( + new MRSRowProjectorImpl(actual.Pass())); +} + +} // anonymous namespace + +MemRowSet::Iterator::Iterator(const std::shared_ptr& mrs, + MemRowSet::MSBTIter* iter, + const Schema* projection, MvccSnapshot mvcc_snap) + : memrowset_(mrs), + iter_(iter), + mvcc_snap_(std::move(mvcc_snap)), + projection_(projection), + projector_( + GenerateAppropriateProjector(&mrs->schema_nonvirtual(), projection)), + delta_projector_(&mrs->schema_nonvirtual(), projection), + state_(kUninitialized) { + // TODO: various code assumes that a newly constructed iterator + // is pointed at the beginning of the dataset. This causes a redundant + // seek. Could make this lazy instead, or change the semantics so that + // a seek is required (probably the latter) + iter_->SeekToStart(); +} + +MemRowSet::Iterator::~Iterator() {} + +Status MemRowSet::Iterator::Init(ScanSpec *spec) { + DCHECK_EQ(state_, kUninitialized); + + RETURN_NOT_OK(projector_->Init()); + RETURN_NOT_OK(delta_projector_.Init()); + + if (spec && spec->lower_bound_key()) { + bool exact; + const Slice &lower_bound = spec->lower_bound_key()->encoded_key(); + if (!iter_->SeekAtOrAfter(lower_bound, &exact)) { + // Lower bound is after the end of the key range, no rows will + // pass the predicate so we can stop the scan right away. 
+ state_ = kFinished; + return Status::OK(); + } + } + + if (spec && spec->exclusive_upper_bound_key()) { + const Slice &upper_bound = spec->exclusive_upper_bound_key()->encoded_key(); + exclusive_upper_bound_.reset(upper_bound); + } + + state_ = kScanning; + return Status::OK(); +} + +Status MemRowSet::Iterator::SeekAtOrAfter(const Slice &key, bool *exact) { + DCHECK_NE(state_, kUninitialized) << "not initted"; + + if (key.size() > 0) { + ConstContiguousRow row_slice(&memrowset_->schema(), key); + memrowset_->schema().EncodeComparableKey(row_slice, &tmp_buf); + } else { + // Seeking to empty key shouldn't try to run any encoding. + tmp_buf.resize(0); + } + + if (iter_->SeekAtOrAfter(Slice(tmp_buf), exact) || + key.size() == 0) { + return Status::OK(); + } else { + return Status::NotFound("no match in memrowset"); + } +} + +Status MemRowSet::Iterator::NextBlock(RowBlock *dst) { + // TODO: add dcheck that dst->schema() matches our schema + // also above TODO applies to a lot of other CopyNextRows cases + + DCHECK_NE(state_, kUninitialized) << "not initted"; + if (PREDICT_FALSE(!iter_->IsValid())) { + dst->Resize(0); + return Status::NotFound("end of iter"); + } + if (PREDICT_FALSE(state_ != kScanning)) { + dst->Resize(0); + return Status::OK(); + } + if (PREDICT_FALSE(dst->row_capacity() == 0)) { + return Status::OK(); + } + + // Reset rowblock arena to eventually reach appropriate buffer size. + // Always allocating the full capacity is only a problem for the last block. 
+ dst->Resize(dst->row_capacity()); + if (dst->arena()) { + dst->arena()->Reset(); + } + + // Fill + dst->selection_vector()->SetAllTrue(); + size_t fetched; + RETURN_NOT_OK(FetchRows(dst, &fetched)); + DCHECK_LE(0, fetched); + DCHECK_LE(fetched, dst->nrows()); + + // Clear unreached bits by resizing + dst->Resize(fetched); + + return Status::OK(); +} + +Status MemRowSet::Iterator::FetchRows(RowBlock* dst, size_t* fetched) { + *fetched = 0; + do { + Slice k, v; + RowBlockRow dst_row = dst->row(*fetched); + + // Copy the row into the destination, including projection + // and relocating slices. + // TODO: can we share some code here with CopyRowToArena() from row.h + // or otherwise put this elsewhere? + iter_->GetCurrentEntry(&k, &v); + MRSRow row(memrowset_.get(), v); + + if (mvcc_snap_.IsCommitted(row.insertion_timestamp())) { + if (has_upper_bound() && out_of_bounds(k)) { + state_ = kFinished; + break; + } else { + RETURN_NOT_OK(projector_->ProjectRowForRead(row, &dst_row, dst->arena())); + + // Roll-forward MVCC for committed updates. + RETURN_NOT_OK(ApplyMutationsToProjectedRow( + row.header_->redo_head, &dst_row, dst->arena())); + } + } else { + // This row was not yet committed in the current MVCC snapshot + dst->selection_vector()->SetRowUnselected(*fetched); + + // In debug mode, fill the row data for easy debugging + #ifndef NDEBUG + if (state_ != kFinished) { + dst_row.OverwriteWithPattern("MVCCMVCCMVCCMVCCMVCCMVCC" + "MVCCMVCCMVCCMVCCMVCCMVCC" + "MVCCMVCCMVCCMVCCMVCCMVCC"); + } + #endif + } + + ++*fetched; + } while (iter_->Next() && *fetched < dst->nrows()); + + return Status::OK(); +} + +Status MemRowSet::Iterator::ApplyMutationsToProjectedRow( + const Mutation *mutation_head, RowBlockRow *dst_row, Arena *dst_arena) { + // Fast short-circuit the likely case of a row which was inserted and never + // updated. 
+ if (PREDICT_TRUE(mutation_head == nullptr)) { + return Status::OK(); + } + + bool is_deleted = false; + + for (const Mutation *mut = mutation_head; + mut != nullptr; + mut = mut->next_) { + if (!mvcc_snap_.IsCommitted(mut->timestamp_)) { + // Transaction which wasn't committed yet in the reader's snapshot. + continue; + } + + // Apply the mutation. + + // Check if it's a deletion. + RowChangeListDecoder decoder(mut->changelist()); + RETURN_NOT_OK(decoder.Init()); + if (decoder.is_delete()) { + decoder.TwiddleDeleteStatus(&is_deleted); + } else if (decoder.is_reinsert()) { + decoder.TwiddleDeleteStatus(&is_deleted); + + Slice reinserted_slice; + RETURN_NOT_OK(decoder.GetReinsertedRowSlice(memrowset_->schema_nonvirtual(), + &reinserted_slice)); + ConstContiguousRow reinserted(&memrowset_->schema_nonvirtual(), + reinserted_slice); + RETURN_NOT_OK(projector_->ProjectRowForRead(reinserted, dst_row, dst_arena)); + } else { + DCHECK(decoder.is_update()); + + // TODO: this is slow, since it makes multiple passes through the rowchangelist. + // Instead, we should keep the backwards mapping of columns. + for (const RowProjector::ProjectionIdxMapping& mapping : projector_->base_cols_mapping()) { + RowChangeListDecoder decoder(mut->changelist()); + RETURN_NOT_OK(decoder.Init()); + ColumnBlock dst_col = dst_row->column_block(mapping.first); + RETURN_NOT_OK(decoder.ApplyToOneColumn(dst_row->row_index(), &dst_col, + memrowset_->schema_nonvirtual(), + mapping.second, dst_arena)); + } + + // TODO: Handle Delta Apply on projector_.adapter_cols_mapping() + DCHECK_EQ(projector_->adapter_cols_mapping().size(), 0) << "alter type is not supported"; + } + } + + // If the most recent mutation seen for the row was a DELETE, then set the selection + // vector bit to 0, so it doesn't show up in the results. + if (is_deleted) { + dst_row->SetRowUnselected(); + } + + return Status::OK(); +} + +// Copy the current MRSRow to the 'dst_row' provided using the iterator projection schema. 
+Status MemRowSet::Iterator::GetCurrentRow(RowBlockRow* dst_row, + Arena* row_arena, + const Mutation** redo_head, + Arena* mutation_arena, + Timestamp* insertion_timestamp) { + + DCHECK(redo_head != nullptr); + + // Get the row from the MemRowSet. It may have a different schema from the iterator projection. + const MRSRow src_row = GetCurrentRow(); + + *insertion_timestamp = src_row.insertion_timestamp(); + + // Project the RowChangeList if required + *redo_head = src_row.redo_head(); + if (!delta_projector_.is_identity()) { + DCHECK(mutation_arena != nullptr); + + Mutation *prev_redo = nullptr; + *redo_head = nullptr; + for (const Mutation *mut = src_row.redo_head(); mut != nullptr; mut = mut->next()) { + RETURN_NOT_OK(RowChangeListDecoder::ProjectUpdate(delta_projector_, + mut->changelist(), + &delta_buf_)); + + // The projection resulted in an empty mutation (e.g. update of a removed column) + if (delta_buf_.size() == 0) continue; + + Mutation *mutation = Mutation::CreateInArena(mutation_arena, + mut->timestamp(), + RowChangeList(delta_buf_)); + if (prev_redo != nullptr) { + prev_redo->set_next(mutation); + } else { + *redo_head = mutation; + } + prev_redo = mutation; + } + } + + // Project the Row + return projector_->ProjectRowForRead(src_row, dst_row, row_arena); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/memrowset.h b/src/kudu/tablet/memrowset.h new file mode 100644 index 000000000000..dee7db28d9d8 --- /dev/null +++ b/src/kudu/tablet/memrowset.h @@ -0,0 +1,515 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_MEMROWSET_H +#define KUDU_TABLET_MEMROWSET_H + +#include +#include +#include +#include + +#include "kudu/common/scan_spec.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/tablet/concurrent_btree.h" +#include "kudu/tablet/mutation.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/memory/memory.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MemTracker; + +namespace tablet { + +// +// Implementation notes: +// -------------------------- +// The MemRowSet is a concurrent b-tree which stores newly inserted data which +// has not yet been flushed to on-disk rowsets. In order to provide snapshot +// consistency, data is never updated in-place in the memrowset after insertion. +// Rather, a chain of mutations hangs off each row, acting as a per-row "redo log". +// +// Each row is stored in exactly one CBTree entry. Its key is the encoded form +// of the row's primary key, such that the entries sort correctly using the default +// lexicographic comparator. The value for each row is an instance of MRSRow. +// +// NOTE: all allocations done by the MemRowSet are done inside its associated +// thread-safe arena, and then freed in bulk when the MemRowSet is destructed. + +class MemRowSet; + +// The value stored in the CBTree for a single row. 
+class MRSRow { + public: + typedef ContiguousRowCell Cell; + + MRSRow(const MemRowSet *memrowset, const Slice &s) { + DCHECK_GE(s.size(), sizeof(Header)); + row_slice_ = s; + header_ = reinterpret_cast
    (row_slice_.mutable_data()); + row_slice_.remove_prefix(sizeof(Header)); + memrowset_ = memrowset; + } + + const Schema* schema() const; + + Timestamp insertion_timestamp() const { return header_->insertion_timestamp; } + + Mutation* redo_head() { return header_->redo_head; } + const Mutation* redo_head() const { return header_->redo_head; } + + const Slice &row_slice() const { return row_slice_; } + + const uint8_t* row_data() const { return row_slice_.data(); } + + bool is_null(size_t col_idx) const { + return ContiguousRowHelper::is_null(*schema(), row_slice_.data(), col_idx); + } + + void set_null(size_t col_idx, bool is_null) const { + ContiguousRowHelper::SetCellIsNull(*schema(), + const_cast(row_slice_.data()), col_idx, is_null); + } + + const uint8_t *cell_ptr(size_t col_idx) const { + return ContiguousRowHelper::cell_ptr(*schema(), row_slice_.data(), col_idx); + } + + uint8_t *mutable_cell_ptr(size_t col_idx) const { + return const_cast(cell_ptr(col_idx)); + } + + const uint8_t *nullable_cell_ptr(size_t col_idx) const { + return ContiguousRowHelper::nullable_cell_ptr(*schema(), row_slice_.data(), col_idx); + } + + Cell cell(size_t col_idx) const { + return Cell(this, col_idx); + } + + // Return true if this row is a "ghost" -- i.e its most recent mutation is + // a deletion. + // + // NOTE: this call is O(n) in the number of mutations, since it has to walk + // the linked list all the way to the end, checking if each mutation is a + // DELETE or REINSERT. We expect the list is usually short (low-update use + // cases) but if this becomes a bottleneck, we could cache the 'ghost' status + // as a bit inside the row header. + bool IsGhost() const; + + private: + friend class MemRowSet; + + template + Status CopyRow(const ConstContiguousRow& row, ArenaType *arena) { + // the representation of the MRSRow and ConstContiguousRow is the same. + // so, instead of using CopyRow we can just do a memcpy. 
+ memcpy(row_slice_.mutable_data(), row.row_data(), row_slice_.size()); + // Copy any referred-to memory to arena. + return kudu::RelocateIndirectDataToArena(this, arena); + } + + struct Header { + // Timestamp for the transaction which inserted this row. If a scanner with an + // older snapshot sees this row, it will be ignored. + Timestamp insertion_timestamp; + + // Pointer to the first mutation which has been applied to this row. Each + // mutation is an instance of the Mutation class, making up a singly-linked + // list for any mutations applied to the row. + Mutation* redo_head; + }; + + Header *header_; + + // Actual row data. + Slice row_slice_; + + const MemRowSet *memrowset_; +}; + +struct MSBTreeTraits : public btree::BTreeTraits { + typedef ThreadSafeMemoryTrackingArena ArenaType; +}; + +// Define an MRSRow instance using on-stack storage. +// This defines an array on the stack which is sized correctly for an MRSRow::Header +// plus a single row of the given schema, then constructs an MRSRow object which +// points into that stack storage. +#define DEFINE_MRSROW_ON_STACK(memrowset, varname, slice_name) \ + size_t varname##_size = sizeof(MRSRow::Header) + \ + ContiguousRowHelper::row_size((memrowset)->schema_nonvirtual()); \ + uint8_t varname##_storage[varname##_size]; \ + Slice slice_name(varname##_storage, varname##_size); \ + ContiguousRowHelper::InitNullsBitmap((memrowset)->schema_nonvirtual(), slice_name); \ + MRSRow varname(memrowset, slice_name); + + +// In-memory storage for data currently being written to the tablet. +// This is a holding area for inserts, currently held in row form +// (i.e not columnar) +// +// The data is kept sorted. 
+class MemRowSet : public RowSet, + public std::enable_shared_from_this { + public: + class Iterator; + + MemRowSet(int64_t id, + const Schema &schema, + log::LogAnchorRegistry* log_anchor_registry, + const std::shared_ptr& parent_tracker = + std::shared_ptr()); + + ~MemRowSet(); + + // Insert a new row into the memrowset. + // + // The provided 'row' must have the same memrowset's Schema. + // (TODO: Different schema are not yet supported) + // + // After insert, the row and any referred-to memory (eg for strings) + // have been copied into this MemRowSet's internal storage, and thus + // the provided memory buffer may safely be re-used or freed. + // + // Returns Status::OK unless allocation fails. + Status Insert(Timestamp timestamp, + const ConstContiguousRow& row, + const consensus::OpId& op_id); + + + // Update or delete an existing row in the memrowset. + // + // Returns Status::NotFound if the row doesn't exist. + virtual Status MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &delta, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB *result) OVERRIDE; + + // Return the number of entries in the memrowset. + // NOTE: this requires iterating all data, and is thus + // not very fast. + uint64_t entry_count() const { + return tree_.count(); + } + + // Conform entry_count to RowSet + Status CountRows(rowid_t *count) const OVERRIDE { + *count = entry_count(); + return Status::OK(); + } + + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const OVERRIDE; + + uint64_t EstimateOnDiskSize() const OVERRIDE { + return 0; + } + + boost::mutex *compact_flush_lock() OVERRIDE { + return &compact_flush_lock_; + } + + // MemRowSets are never available for compaction, currently. + virtual bool IsAvailableForCompaction() OVERRIDE { + return false; + } + + // Return true if there are no entries in the memrowset. 
+ bool empty() const { + return tree_.empty(); + } + + // TODO: unit test me + Status CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + ProbeStats* stats) const OVERRIDE; + + // Return the memory footprint of this memrowset. + // Note that this may be larger than the sum of the data + // inserted into the memrowset, due to arena and data structure + // overhead. + size_t memory_footprint() const { + return arena_->memory_footprint(); + } + + // Return an iterator over the items in this memrowset. + // + // NOTE: for this function to work, there must be a shared_ptr + // referring to this MemRowSet. Otherwise, this will throw + // a C++ exception and all bets are off. + // + // TODO: clarify the consistency of this iterator in the method doc + Iterator *NewIterator() const; + Iterator *NewIterator(const Schema *projection, + const MvccSnapshot &snap) const; + + // Alias to conform to DiskRowSet interface + virtual Status NewRowIterator(const Schema* projection, + const MvccSnapshot& snap, + gscoped_ptr* out) const OVERRIDE; + + // Create compaction input. + virtual Status NewCompactionInput(const Schema* projection, + const MvccSnapshot& snap, + gscoped_ptr* out) const OVERRIDE; + + // Return the Schema for the rows in this memrowset. + const Schema &schema() const { + return schema_; + } + + // Same as schema(), but non-virtual method + const Schema& schema_nonvirtual() const { + return schema_; + } + + int64_t mrs_id() const { + return id_; + } + + std::shared_ptr metadata() OVERRIDE { + return std::shared_ptr( + reinterpret_cast(NULL)); + } + + // Dump the contents of the memrowset to the given vector. + // If 'lines' is NULL, dumps to LOG(INFO). + // + // This dumps every row, so should only be used in tests, etc. + virtual Status DebugDump(vector *lines = NULL) OVERRIDE; + + string ToString() const OVERRIDE { + return string("memrowset"); + } + + // Mark the memrowset as frozen. 
See CBTree::Freeze() + void Freeze() { + tree_.Freeze(); + } + + uint64_t debug_insert_count() const { + return debug_insert_count_; + } + uint64_t debug_update_count() const { + return debug_update_count_; + } + + size_t DeltaMemStoreSize() const OVERRIDE { return 0; } + + bool DeltaMemStoreEmpty() const OVERRIDE { return true; } + + int64_t MinUnflushedLogIndex() const OVERRIDE { + return anchorer_.minimum_log_index(); + } + + double DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) const OVERRIDE { + return 0; + } + + Status FlushDeltas() OVERRIDE { return Status::OK(); } + + Status MinorCompactDeltaStores() OVERRIDE { return Status::OK(); } + + private: + friend class Iterator; + + // Perform a "Reinsert" -- handle an insertion into a row which was previously + // inserted and deleted, but still has an entry in the MemRowSet. + Status Reinsert(Timestamp timestamp, + const ConstContiguousRow& row_data, + MRSRow *row); + + typedef btree::CBTree MSBTree; + + int64_t id_; + + const Schema schema_; + std::shared_ptr parent_tracker_; + std::shared_ptr mem_tracker_; + std::shared_ptr allocator_; + std::shared_ptr arena_; + + typedef btree::CBTreeIterator MSBTIter; + + MSBTree tree_; + + // Approximate counts of mutations. This variable is updated non-atomically, + // so it cannot be relied upon to be in any way accurate. It's only used + // as a sanity check during flush. + volatile uint64_t debug_insert_count_; + volatile uint64_t debug_update_count_; + + boost::mutex compact_flush_lock_; + + Atomic32 has_logged_throttling_; + + log::MinLogIndexAnchorer anchorer_; + + DISALLOW_COPY_AND_ASSIGN(MemRowSet); +}; + +// An iterator through in-memory data stored in a MemRowSet. +// This holds a reference to the MemRowSet, and so the memrowset +// must not be freed while this iterator is outstanding. +// +// This iterator is not a full snapshot, but individual rows +// are consistent, and it is safe to iterate during concurrent +// mutation. 
The consistency guarantee is that it will return +// at least all rows that were present at the time of construction, +// and potentially more. Each row will be at least as current as +// the time of construction, and potentially more current. +class MemRowSet::Iterator : public RowwiseIterator { + public: + class MRSRowProjector; + + virtual ~Iterator(); + + virtual Status Init(ScanSpec *spec) OVERRIDE; + + Status SeekAtOrAfter(const Slice &key, bool *exact); + + virtual Status NextBlock(RowBlock *dst) OVERRIDE; + + bool has_upper_bound() const { + return exclusive_upper_bound_.is_initialized(); + } + + bool out_of_bounds(const Slice &key) const { + DCHECK(has_upper_bound()) << "No upper bound set!"; + + return key.compare(*exclusive_upper_bound_) >= 0; + } + + size_t remaining_in_leaf() const { + DCHECK_NE(state_, kUninitialized) << "not initted"; + return iter_->remaining_in_leaf(); + } + + virtual bool HasNext() const OVERRIDE { + DCHECK_NE(state_, kUninitialized) << "not initted"; + return state_ != kFinished && iter_->IsValid(); + } + + // NOTE: This method will return a MRSRow with the MemRowSet schema. + // The row is NOT projected using the schema specified to the iterator. + const MRSRow GetCurrentRow() const { + DCHECK_NE(state_, kUninitialized) << "not initted"; + Slice dummy, mrsrow_data; + iter_->GetCurrentEntry(&dummy, &mrsrow_data); + return MRSRow(memrowset_.get(), mrsrow_data); + } + + // Copy the current MRSRow to the 'dst_row' provided using the iterator projection schema. 
+ Status GetCurrentRow(RowBlockRow* dst_row, + Arena* row_arena, + const Mutation** redo_head, + Arena* mutation_arena, + Timestamp* insertion_timestamp); + + bool Next() { + DCHECK_NE(state_, kUninitialized) << "not initted"; + return iter_->Next(); + } + + string ToString() const OVERRIDE { + return "memrowset iterator"; + } + + const Schema& schema() const OVERRIDE { + return *projection_; + } + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE { + // Currently we do not expose any non-disk related statistics in + // IteratorStats. However, callers of GetIteratorStats expected + // an IteratorStats object for every column; vector::resize() is + // used as it will also fill the 'stats' with new instances of + // IteratorStats. + stats->resize(schema().num_columns()); + } + + private: + friend class MemRowSet; + + enum ScanState { + // Enumerated constants to indicate the iterator state: + kUninitialized = 0, + kScanning = 1, // We may continue fetching and returning values. + kFinished = 2 // We either know we can never reach the lower bound, or + // we've exceeded the upper bound. + }; + + DISALLOW_COPY_AND_ASSIGN(Iterator); + + Iterator(const std::shared_ptr &mrs, + MemRowSet::MSBTIter *iter, const Schema *projection, + MvccSnapshot mvcc_snap); + + // Various helper functions called while getting the next RowBlock + Status FetchRows(RowBlock* dst, size_t* fetched); + Status ApplyMutationsToProjectedRow(const Mutation *mutation_head, + RowBlockRow *dst_row, + Arena *dst_arena); + + const std::shared_ptr memrowset_; + gscoped_ptr iter_; + + // The MVCC snapshot which determines which rows and mutations are visible to + // this iterator. + const MvccSnapshot mvcc_snap_; + + // Mapping from projected column index back to memrowset column index. + // Relies on the MRSRowProjector interface to abstract from the two + // different implementations of the RowProjector, which may change + // at runtime (using vs. not using code generation). 
+ const Schema* const projection_; + gscoped_ptr projector_; + DeltaProjector delta_projector_; + + // Temporary buffer used for RowChangeList projection. + faststring delta_buf_; + + size_t prepared_count_; + + // Temporary local buffer used for seeking to hold the encoded + // seek target. + faststring tmp_buf; + + // State of the scanner: indicates whether we should keep scanning/fetching, + // whether we've scanned the last batch, or whether we've reached the upper bounds + // or will never reach the lower bounds (no more rows can be returned) + ScanState state_; + + // Pushed down encoded upper bound key, if any + boost::optional exclusive_upper_bound_; +}; + +inline const Schema* MRSRow::schema() const { + return &memrowset_->schema_nonvirtual(); +} + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/metadata-test.cc b/src/kudu/tablet/metadata-test.cc new file mode 100644 index 000000000000..a404e6ddb2ab --- /dev/null +++ b/src/kudu/tablet/metadata-test.cc @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/tablet/rowset_metadata.h" +#include "kudu/tablet/tablet_metadata.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +using std::vector; +using std::string; +using strings::Substitute; + +namespace kudu { +namespace tablet { + +class MetadataTest : public KuduTest { + public: + MetadataTest() { + all_blocks_ = { BlockId(1), BlockId(2), BlockId(3), BlockId(4) }; + + tablet_meta_ = new TabletMetadata(nullptr, "fake-tablet"); + CHECK_OK(RowSetMetadata::CreateNew(tablet_meta_.get(), 0, &meta_)); + for (int i = 0; i < all_blocks_.size(); i++) { + CHECK_OK(meta_->CommitRedoDeltaDataBlock(i, all_blocks_[i])); + } + CHECK_EQ(4, meta_->redo_delta_blocks().size()); + } + + protected: + vector all_blocks_; + scoped_refptr tablet_meta_; + gscoped_ptr meta_; +}; + +// Swap out some deltas from the middle of the list +TEST_F(MetadataTest, RSMD_TestReplaceDeltas_1) { + vector to_replace; + to_replace.push_back(BlockId(2)); + to_replace.push_back(BlockId(3)); + + ASSERT_OK(meta_->CommitUpdate( + RowSetMetadataUpdate() + .ReplaceRedoDeltaBlocks(to_replace, { BlockId(123) }))); + ASSERT_EQ(vector({ BlockId(1), BlockId(123), BlockId(4) }), + meta_->redo_delta_blocks()); +} + +// Swap out some deltas from the beginning of the list +TEST_F(MetadataTest, RSMD_TestReplaceDeltas_2) { + vector to_replace; + to_replace.push_back(BlockId(1)); + to_replace.push_back(BlockId(2)); + + ASSERT_OK(meta_->CommitUpdate( + RowSetMetadataUpdate() + .ReplaceRedoDeltaBlocks(to_replace, { BlockId(123) }))); + ASSERT_EQ(vector({ BlockId(123), BlockId(3), BlockId(4) }), + meta_->redo_delta_blocks()); +} + +// Swap out some deltas from the end of the list +TEST_F(MetadataTest, RSMD_TestReplaceDeltas_3) { + vector to_replace; + to_replace.push_back(BlockId(3)); + to_replace.push_back(BlockId(4)); + + 
ASSERT_OK(meta_->CommitUpdate( + RowSetMetadataUpdate() + .ReplaceRedoDeltaBlocks(to_replace, { BlockId(123) }))); + ASSERT_EQ(vector({ BlockId(1), BlockId(2), BlockId(123) }), + meta_->redo_delta_blocks()); +} + +// Swap out a non-contiguous list, check error. +TEST_F(MetadataTest, RSMD_TestReplaceDeltas_Bad_NonContiguous) { + vector to_replace; + to_replace.push_back(BlockId(2)); + to_replace.push_back(BlockId(4)); + + Status s = meta_->CommitUpdate( + RowSetMetadataUpdate() + .ReplaceRedoDeltaBlocks(to_replace, { BlockId(123) })); + EXPECT_EQ(Substitute("Invalid argument: Cannot find subsequence <$0> in <$1>", + BlockId::JoinStrings(to_replace), + BlockId::JoinStrings(all_blocks_)), + s.ToString()); + + // Should be unchanged + EXPECT_EQ(all_blocks_, meta_->redo_delta_blocks()); +} + +// Swap out a list which contains an invalid element, check error. +TEST_F(MetadataTest, RSMD_TestReplaceDeltas_Bad_DoesntExist) { + vector to_replace; + to_replace.push_back(BlockId(555)); + + Status s = meta_->CommitUpdate( + RowSetMetadataUpdate() + .ReplaceRedoDeltaBlocks(to_replace, { BlockId(123) })); + EXPECT_EQ(Substitute("Invalid argument: Cannot find subsequence <$0> in <$1>", + BlockId::JoinStrings(to_replace), + BlockId::JoinStrings(all_blocks_)), + s.ToString()); + + // Should be unchanged + EXPECT_EQ(all_blocks_, meta_->redo_delta_blocks()); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/metadata.proto b/src/kudu/tablet/metadata.proto new file mode 100644 index 000000000000..8e71d94a0ab2 --- /dev/null +++ b/src/kudu/tablet/metadata.proto @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.tablet; + +option java_package = "org.kududb.tablet"; + +import "kudu/common/common.proto"; +import "kudu/consensus/opid.proto"; +import "kudu/fs/fs.proto"; + +// ============================================================================ +// Tablet Metadata +// ============================================================================ + +message ColumnDataPB { + required BlockIdPB block = 2; + // REMOVED: optional ColumnSchemaPB OBSOLETE_schema = 3; + optional int32 column_id = 4; +} + +message DeltaDataPB { + required BlockIdPB block = 2; +} + +message RowSetDataPB { + required uint64 id = 1; + required int64 last_durable_dms_id = 2; + repeated ColumnDataPB columns = 3; + repeated DeltaDataPB redo_deltas = 4; + repeated DeltaDataPB undo_deltas = 5; + optional BlockIdPB bloom_block = 6; + optional BlockIdPB adhoc_index_block = 7; +} + +// State flags indicating whether the tablet is in the middle of being copied +// and is therefore not possible to bring up, whether it has been deleted, or +// whether the data is in a usable state. +enum TabletDataState { + TABLET_DATA_UNKNOWN = 999; + + // The tablet is set to TABLET_DATA_COPYING state when in the middle of + // remote bootstrap while copying data files from a remote peer. If a tablet + // server crashes with a tablet in this state, the tablet must be deleted and + // the remote bootstrap process must be restarted for that tablet. + TABLET_DATA_COPYING = 0; + + // Fresh empty tablets and successfully copied tablets are set to the + // TABLET_DATA_READY state. 
+ TABLET_DATA_READY = 1; + + // This tablet is in the process of being deleted. + // The tablet server should "roll forward" the deletion during boot, + // rather than trying to load the tablet. + TABLET_DATA_DELETED = 2; + + // The tablet has been deleted, and now just consists of a "tombstone". + TABLET_DATA_TOMBSTONED = 3; +} + +// The super-block keeps track of the tablet data blocks. +// A tablet contains one or more RowSets, which contain +// a set of blocks (one for each column), a set of delta blocks +// and optionally a block containing the bloom filter +// and a block containing the compound-keys. +message TabletSuperBlockPB { + // Table ID of the table this tablet is part of. + required bytes table_id = 1; + + // Tablet Id + required bytes tablet_id = 2; + + // The latest durable MemRowSet id + required int64 last_durable_mrs_id = 3; + + // DEPRECATED. + optional bytes start_key = 4; + + // DEPRECATED. + optional bytes end_key = 5; + + // The partition of the table. + optional PartitionPB partition = 13; + + // Tablet RowSets + repeated RowSetDataPB rowsets = 6; + + // The latest schema + // TODO: maybe this should be TableSchemaPB? Need to actually put those attributes + // into use throughout the code. Using the simpler one for now. + required string table_name = 7; + required SchemaPB schema = 8; + required uint32 schema_version = 9; + + // The partition schema of the table. + optional PartitionSchemaPB partition_schema = 14; + + // The current state of the tablet's data. + optional TabletDataState tablet_data_state = 10 [ default = TABLET_DATA_UNKNOWN ]; + + // Blocks that became orphans after flushing this superblock. In other + // words, the set difference of the blocks belonging to the previous + // superblock and this one. + // + // It's always safe to delete the blocks found here. + repeated BlockIdPB orphaned_blocks = 11; + + // For tablets that have been tombstoned, stores the last OpId stored in the + // WAL before tombstoning. 
+ // Only relevant for TOMBSTONED tablets. + optional consensus.OpId tombstone_last_logged_opid = 12; +} + +// The enum of tablet states. +// Tablet states are sent in TabletReports and kept in TabletPeer. +enum TabletStatePB { + UNKNOWN = 999; + + // Tablet has not yet started. + NOT_STARTED = 5; + + // Indicates the Tablet is bootstrapping, i.e. that the Tablet is not + // available for RPC. + BOOTSTRAPPING = 0; + + // Once the configuration phase is over Peers are in RUNNING state. In this + // state Peers are available for client RPCs. + RUNNING = 1; + + // The tablet failed to for some reason. TabletPeer::error() will return + // the reason for the failure. + FAILED = 2; + + // The Tablet is shutting down, and will not accept further requests. + QUIESCING = 3; + + // The Tablet has been stopped. + SHUTDOWN = 4; +} diff --git a/src/kudu/tablet/mock-rowsets.h b/src/kudu/tablet/mock-rowsets.h new file mode 100644 index 000000000000..ca9b84bc58e8 --- /dev/null +++ b/src/kudu/tablet/mock-rowsets.h @@ -0,0 +1,173 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_MOCK_ROWSETS_H +#define KUDU_TABLET_MOCK_ROWSETS_H + +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/rowset_metadata.h" + +namespace kudu { +namespace tablet { + +// Mock implementation of RowSet which just aborts on every call. +class MockRowSet : public RowSet { + public: + virtual Status CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + ProbeStats* stats) const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual Status MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &update, + const consensus::OpId& op_id_, + ProbeStats* stats, + OperationResultPB *result) OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual Status NewRowIterator(const Schema *projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual Status NewCompactionInput(const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual Status CountRows(rowid_t *count) const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual std::string ToString() const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return ""; + } + virtual Status DebugDump(vector *lines = NULL) OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual Status Delete() { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + virtual uint64_t EstimateOnDiskSize() const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return 0; + } + virtual boost::mutex *compact_flush_lock() OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return NULL; + } + virtual std::shared_ptr metadata() OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return std::shared_ptr( + reinterpret_cast(NULL)); + } + + virtual size_t 
DeltaMemStoreSize() const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return 0; + } + + virtual bool DeltaMemStoreEmpty() const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return 0; + } + + virtual int64_t MinUnflushedLogIndex() const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return -1; + } + + virtual double DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) + const OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return 0; + } + + virtual Status FlushDeltas() OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + + virtual Status MinorCompactDeltaStores() OVERRIDE { + LOG(FATAL) << "Unimplemented"; + return Status::OK(); + } + + virtual bool IsAvailableForCompaction() OVERRIDE { + return true; + } +}; + +// Mock which implements GetBounds() with constant provided bonuds. +class MockDiskRowSet : public MockRowSet { + public: + MockDiskRowSet(std::string first_key, std::string last_key, + int size = 1000000) + : first_key_(std::move(first_key)), + last_key_(std::move(last_key)), + size_(size) {} + + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const OVERRIDE { + *min_encoded_key = Slice(first_key_); + *max_encoded_key = Slice(last_key_); + return Status::OK(); + } + + virtual uint64_t EstimateOnDiskSize() const OVERRIDE { + return size_; + } + + virtual std::string ToString() const OVERRIDE { + return strings::Substitute("mock[$0, $1]", + Slice(first_key_).ToDebugString(), + Slice(last_key_).ToDebugString()); + } + + private: + const std::string first_key_; + const std::string last_key_; + const uint64_t size_; +}; + +// Mock which acts like a MemRowSet and has no known bounds. 
+class MockMemRowSet : public MockRowSet { + public: + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const OVERRIDE { + return Status::NotSupported(""); + } + + private: + const std::string first_key_; + const std::string last_key_; +}; + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_MOCK_ROWSETS_H */ diff --git a/src/kudu/tablet/mt-diskrowset-test.cc b/src/kudu/tablet/mt-diskrowset-test.cc new file mode 100644 index 000000000000..98b3dc4108f4 --- /dev/null +++ b/src/kudu/tablet/mt-diskrowset-test.cc @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/tablet/diskrowset-test-base.h" + +DEFINE_int32(num_threads, 2, "Number of threads to test"); + +using std::shared_ptr; +using std::unordered_set; + +namespace kudu { +namespace tablet { + +class TestMultiThreadedRowSet : public TestRowSet { + public: + void RowSetUpdateThread(DiskRowSet *rs) { + unordered_set updated; + UpdateExistingRows(rs, 0.5f, &updated); + } + + void FlushThread(DiskRowSet *rs) { + for (int i = 0; i < 10; i++) { + CHECK_OK(rs->FlushDeltas()); + } + } + + void StartUpdaterThreads(boost::ptr_vector *threads, + DiskRowSet *rs, + int n_threads) { + for (int i = 0; i < n_threads; i++) { + threads->push_back(new boost::thread( + &TestMultiThreadedRowSet::RowSetUpdateThread, this, + rs)); + } + } + + void StartFlushThread(boost::ptr_vector *threads, + DiskRowSet *rs) { + threads->push_back(new boost::thread( + &TestMultiThreadedRowSet::FlushThread, this, rs)); + } + + void JoinThreads(boost::ptr_vector *threads) { + for (boost::thread &thr : *threads) { + thr.join(); + } + } +}; + + +TEST_F(TestMultiThreadedRowSet, TestMTUpdate) { + if (2 == FLAGS_num_threads) { + if (AllowSlowTests()) { + FLAGS_num_threads = 16; + } + } + + WriteTestRowSet(); + + // Re-open the rowset + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Spawn a bunch of threads, each of which will do updates. + boost::ptr_vector threads; + StartUpdaterThreads(&threads, rs.get(), FLAGS_num_threads); + + JoinThreads(&threads); +} + +TEST_F(TestMultiThreadedRowSet, TestMTUpdateAndFlush) { + if (2 == FLAGS_num_threads) { + if (AllowSlowTests()) { + FLAGS_num_threads = 16; + } + } + + WriteTestRowSet(); + + // Re-open the rowset + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + // Spawn a bunch of threads, each of which will do updates. 
+ boost::ptr_vector threads; + StartUpdaterThreads(&threads, rs.get(), FLAGS_num_threads); + StartFlushThread(&threads, rs.get()); + + JoinThreads(&threads); + + // TODO: test that updates were successful -- collect the updated + // row lists from all the threads, and verify them. +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/mt-rowset_delta_compaction-test.cc b/src/kudu/tablet/mt-rowset_delta_compaction-test.cc new file mode 100644 index 000000000000..c9e8004d6eaf --- /dev/null +++ b/src/kudu/tablet/mt-rowset_delta_compaction-test.cc @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/thread.h" +#include "kudu/tablet/diskrowset-test-base.h" + +enum { + kDefaultNumSecondsPerThread = 1, + kDefaultNumFlushThreads = 4, + kDefaultNumCompactionThreads = 4, +}; + +DEFINE_int32(num_update_threads, 1, "Number of updater threads"); +DEFINE_int32(num_flush_threads, kDefaultNumFlushThreads, "Number of flusher threads"); +DEFINE_int32(num_compaction_threads, kDefaultNumCompactionThreads, "Number of compaction threads"); +DEFINE_int32(num_seconds_per_thread, kDefaultNumSecondsPerThread, + "Minimum number of seconds each thread should work"); + +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +using base::subtle::Release_Store; +using base::subtle::Release_Load; +using base::subtle::NoBarrier_Load; + +class TestMultiThreadedRowSetDeltaCompaction : public TestRowSet { + public: + + TestMultiThreadedRowSetDeltaCompaction() + : TestRowSet(), + update_counter_(0), + should_run_(1) { + } + + // This thread read the value of an atomic integer, updates all rows + // in 'rs' to the value + 1, and then sets the atomic integer back + // to value + 1. This is done so that the verifying threads knows the + // latest expected value of the row (simply calling AtomicIncrement + // won't work as a thread setting a value n+1 is not guaranteed to finish + // before a thread setting value n). 
+ void RowSetUpdateThread(DiskRowSet *rs) { + while (ShouldRun()) { + uint32_t val = Release_Load(&update_counter_); + UpdateRowSet(rs, val + 1); + if (ShouldRun()) { + Release_Store(&update_counter_, val + 1); + } + } + } + + void RowSetFlushThread(DiskRowSet *rs) { + while (ShouldRun()) { + if (rs->CountDeltaStores() < 5) { + CHECK_OK(rs->FlushDeltas()); + } else { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + } + } + + void RowSetDeltaCompactionThread(DiskRowSet *rs) { + while (ShouldRun()) { + CHECK_OK(rs->MinorCompactDeltaStores()); + } + } + + void ReadVerify(DiskRowSet *rs) { + Arena arena(1024, 1024*1024); + RowBlock dst(schema_, 1000, &arena); + gscoped_ptr iter; + ASSERT_OK(rs->NewRowIterator(&schema_, + MvccSnapshot::CreateSnapshotIncludingAllTransactions(), + &iter)); + uint32_t expected = NoBarrier_Load(&update_counter_); + ASSERT_OK(iter->Init(nullptr)); + while (iter->HasNext()) { + ASSERT_OK_FAST(iter->NextBlock(&dst)); + size_t n = dst.nrows(); + ASSERT_GT(n, 0); + for (size_t j = 0; j < n; j++) { + uint32_t val = *schema_.ExtractColumnFromRow(dst.row(j), 1); + ASSERT_GE(val, expected); + } + } + } + + void StartThreads(DiskRowSet *rs) { + for (int i = 0; i < FLAGS_num_update_threads; i++) { + scoped_refptr thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("log_writer$0", i), + &TestMultiThreadedRowSetDeltaCompaction::RowSetUpdateThread, this, rs, &thread)); + update_threads_.push_back(thread); + } + for (int i = 0; i < FLAGS_num_flush_threads; i++) { + scoped_refptr thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("delta_flush$0", i), + &TestMultiThreadedRowSetDeltaCompaction::RowSetFlushThread, this, rs, &thread)); + flush_threads_.push_back(thread); + } + for (int i = 0; i < FLAGS_num_compaction_threads; i++) { + scoped_refptr thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("delta_compaction$0", i), + &TestMultiThreadedRowSetDeltaCompaction::RowSetDeltaCompactionThread, this, rs, 
&thread)); + compaction_threads_.push_back(thread); + } + } + + void JoinThreads() { + for (const auto& thread : update_threads_) { + ASSERT_OK(ThreadJoiner(thread.get()).Join()); + } + for (const auto& thread : flush_threads_) { + ASSERT_OK(ThreadJoiner(thread.get()).Join()); + } + for (const auto& thread : compaction_threads_) { + ASSERT_OK(ThreadJoiner(thread.get()).Join()); + } + for (const auto& thread : alter_schema_threads_) { + ASSERT_OK(ThreadJoiner(thread.get()).Join()); + } + } + + void WriteTestRowSetWithZeros() { + WriteTestRowSet(0, true); + } + + void UpdateRowSet(DiskRowSet *rs, uint32_t value) { + for (uint32_t idx = 0; idx < n_rows_ && ShouldRun(); idx++) { + OperationResultPB result; + ASSERT_OK_FAST(UpdateRow(rs, idx, value, &result)); + } + } + + void TestUpdateAndVerify() { + WriteTestRowSetWithZeros(); + shared_ptr rs; + ASSERT_OK(OpenTestRowSet(&rs)); + + StartThreads(rs.get()); + SleepFor(MonoDelta::FromSeconds(FLAGS_num_seconds_per_thread)); + base::subtle::NoBarrier_Store(&should_run_, 0); + ASSERT_NO_FATAL_FAILURE(JoinThreads()); + + ASSERT_NO_FATAL_FAILURE(ReadVerify(rs.get())); + } + + bool ShouldRun() const { + return NoBarrier_Load(&should_run_); + } + + protected: + + Atomic32 update_counter_; + Atomic32 should_run_; + vector > update_threads_; + vector > flush_threads_; + vector > compaction_threads_; + vector > alter_schema_threads_; +}; + +static void SetupFlagsForSlowTests() { + if (kDefaultNumSecondsPerThread == FLAGS_num_seconds_per_thread) { + FLAGS_num_seconds_per_thread = 40; + } + if (kDefaultNumFlushThreads == FLAGS_num_flush_threads) { + FLAGS_num_flush_threads = 8; + } + if (kDefaultNumCompactionThreads == FLAGS_num_compaction_threads) { + FLAGS_num_compaction_threads = 8; + } +} + +TEST_F(TestMultiThreadedRowSetDeltaCompaction, TestMTUpdateAndCompact) { + if (AllowSlowTests()) { + SetupFlagsForSlowTests(); + } + + TestUpdateAndVerify(); +} + +} // namespace tablet +} // namespace kudu diff --git 
a/src/kudu/tablet/mt-tablet-test.cc b/src/kudu/tablet/mt-tablet-test.cc new file mode 100644 index 000000000000..41815309b995 --- /dev/null +++ b/src/kudu/tablet/mt-tablet-test.cc @@ -0,0 +1,470 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/codegen/compilation_manager.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/local_tablet_writer.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/test_graph.h" +#include "kudu/util/thread.h" + +DECLARE_double(tablet_delta_store_major_compact_min_ratio); +DECLARE_int32(tablet_delta_store_minor_compact_max); +DEFINE_int32(num_insert_threads, 8, "Number of inserting threads to launch"); +DEFINE_int32(num_counter_threads, 8, "Number of counting threads to launch"); +DEFINE_int32(num_summer_threads, 1, "Number of summing threads to launch"); +DEFINE_int32(num_updater_threads, 1, "Number of updating threads to launch"); +DEFINE_int32(num_slowreader_threads, 1, "Number of 'slow' reader threads to launch"); +DEFINE_int32(num_flush_threads, 1, "Number of flusher reader threads to launch"); +DEFINE_int32(num_compact_threads, 1, "Number of compactor threads to launch"); +DEFINE_int32(num_flush_delta_threads, 1, "Number of delta flusher reader threads to launch"); +DEFINE_int32(num_minor_compact_deltas_threads, 1, + "Number of delta minor compactor threads to launch"); +DEFINE_int32(num_major_compact_deltas_threads, 1, + "Number of delta major compactor threads to launch"); + +DEFINE_int64(inserts_per_thread, 1000, + "Number of rows inserted by each inserter thread"); +DEFINE_int32(tablet_test_flush_threshold_mb, 0, "Minimum memrowset size to flush"); +DEFINE_double(flusher_backoff, 2.0f, "Ratio to backoff the flusher thread"); +DEFINE_int32(flusher_initial_frequency_ms, 30, "Number of ms to wait between flushes"); + +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +template +class MultiThreadedTabletTest : public TabletTestBase { + // Import some names from superclass, since C++ is stingy about + // letting us refer to the members otherwise. 
+ typedef TabletTestBase superclass; + using superclass::schema_; + using superclass::client_schema_; + using superclass::tablet; + using superclass::setup_; + public: + virtual void SetUp() { + superclass::SetUp(); + + // Warm up code cache with all the projections we'll be using. + gscoped_ptr iter; + CHECK_OK(tablet()->NewRowIterator(client_schema_, &iter)); + uint64_t count; + CHECK_OK(tablet()->CountRows(&count)); + const Schema* schema = tablet()->schema(); + ColumnSchema valcol = schema->column(schema->find_column("val")); + valcol_projection_ = Schema({ valcol }, 0); + CHECK_OK(tablet()->NewRowIterator(valcol_projection_, &iter)); + codegen::CompilationManager::GetSingleton()->Wait(); + + ts_collector_.StartDumperThread(); + } + + MultiThreadedTabletTest() + : running_insert_count_(FLAGS_num_insert_threads), + ts_collector_(::testing::UnitTest::GetInstance()->current_test_info()->test_case_name()) { + } + + void InsertThread(int tid) { + CountDownOnScopeExit dec_count(&running_insert_count_); + shared_ptr inserts = ts_collector_.GetTimeSeries("inserted"); + + // TODO: add a test where some of the inserts actually conflict + // on the same row. + + uint64_t max_rows = this->ClampRowCount(FLAGS_inserts_per_thread * FLAGS_num_insert_threads) + / FLAGS_num_insert_threads; + + if (max_rows < FLAGS_inserts_per_thread) { + LOG(WARNING) << "Clamping the inserts per thread to " << max_rows << " to prevent overflow"; + } + + this->InsertTestRows(tid * max_rows, + max_rows, 0, + inserts.get()); + } + + void UpdateThread(int tid) { + const Schema &schema = schema_; + + shared_ptr updates = ts_collector_.GetTimeSeries("updated"); + + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + + Arena tmp_arena(1024, 1024); + RowBlock block(schema_, 1, &tmp_arena); + faststring update_buf; + + uint64_t updates_since_last_report = 0; + int col_idx = schema.num_key_columns() == 1 ? 
2 : 3; + LOG(INFO) << "Update thread using schema: " << schema.ToString(); + + KuduPartialRow row(&client_schema_); + + while (running_insert_count_.count() > 0) { + gscoped_ptr iter; + CHECK_OK(tablet()->NewRowIterator(client_schema_, &iter)); + CHECK_OK(iter->Init(NULL)); + + while (iter->HasNext() && running_insert_count_.count() > 0) { + tmp_arena.Reset(); + CHECK_OK(iter->NextBlock(&block)); + CHECK_EQ(block.nrows(), 1); + + if (!block.selection_vector()->IsRowSelected(0)) { + // Don't try to update rows which aren't visible yet -- + // this will crash, since the data in row_slice isn't even copied. + continue; + } + + + RowBlockRow rb_row = block.row(0); + if (rand() % 10 == 7) { + // Increment the "val" + const int32_t *old_val = schema.ExtractColumnFromRow(rb_row, col_idx); + // Issue an update. In the NullableValue setup, many of the rows start with + // NULL here, so we have to check for it. + int32_t new_val; + if (old_val != nullptr) { + new_val = *old_val + 1; + } else { + new_val = 0; + } + + // Rebuild the key by extracting the cells from the row + setup_.BuildRowKeyFromExistingRow(&row, rb_row); + CHECK_OK(row.SetInt32(col_idx, new_val)); + CHECK_OK(writer.Update(row)); + + if (++updates_since_last_report >= 10) { + updates->AddValue(updates_since_last_report); + updates_since_last_report = 0; + } + } + } + } + } + + // Thread which repeatedly issues CountRows() and makes sure + // that the count doesn't go ever down. + void CountThread(int tid) { + rowid_t last_count = 0; + while (running_insert_count_.count() > 0) { + uint64_t count; + CHECK_OK(tablet()->CountRows(&count)); + ASSERT_GE(count, last_count); + last_count = count; + } + } + + // Thread which iterates slowly over the first 10% of the data. + // This is meant to test that outstanding iterators don't end up + // trying to reference already-freed memrowset memory. 
+ void SlowReaderThread(int tid) { + Arena arena(32*1024, 256*1024); + RowBlock block(schema_, 1, &arena); + + uint64_t max_rows = this->ClampRowCount(FLAGS_inserts_per_thread * FLAGS_num_insert_threads) + / FLAGS_num_insert_threads; + + int max_iters = FLAGS_num_insert_threads * max_rows / 10; + + while (running_insert_count_.count() > 0) { + gscoped_ptr iter; + CHECK_OK(tablet()->NewRowIterator(client_schema_, &iter)); + CHECK_OK(iter->Init(NULL)); + + for (int i = 0; i < max_iters && iter->HasNext(); i++) { + CHECK_OK(iter->NextBlock(&block)); + + if (running_insert_count_.WaitFor(MonoDelta::FromMilliseconds(1))) { + return; + } + } + } + } + + void SummerThread(int tid) { + shared_ptr scanned_ts = ts_collector_.GetTimeSeries( + "scanned"); + + while (running_insert_count_.count() > 0) { + CountSum(scanned_ts); + } + } + + uint64_t CountSum(const shared_ptr &scanned_ts) { + Arena arena(1024, 1024); // unused, just scanning ints + + static const int kBufInts = 1024*1024 / 8; + RowBlock block(valcol_projection_, kBufInts, &arena); + ColumnBlock column = block.column_block(0); + + uint64_t count_since_report = 0; + + uint64_t sum = 0; + + gscoped_ptr iter; + CHECK_OK(tablet()->NewRowIterator(valcol_projection_, &iter)); + CHECK_OK(iter->Init(NULL)); + + while (iter->HasNext()) { + arena.Reset(); + CHECK_OK(iter->NextBlock(&block)); + + for (size_t j = 0; j < block.nrows(); j++) { + sum += *reinterpret_cast(column.cell_ptr(j)); + } + count_since_report += block.nrows(); + + // Report metrics if enough time has passed + if (count_since_report > 100) { + if (scanned_ts.get()) { + scanned_ts->AddValue(count_since_report); + } + count_since_report = 0; + } + } + + if (scanned_ts.get()) { + scanned_ts->AddValue(count_since_report); + } + + return sum; + } + + + + void FlushThread(int tid) { + // Start off with a very short wait time between flushes. 
+ // But, especially in debug mode, this will only allow a few + // rows to get inserted between each flush, and the test will take + // quite a while. So, after every flush, we double the wait time below. + int wait_time = FLAGS_flusher_initial_frequency_ms; + while (running_insert_count_.count() > 0) { + + if (tablet()->MemRowSetSize() > FLAGS_tablet_test_flush_threshold_mb * 1024 * 1024) { + CHECK_OK(tablet()->Flush()); + } else { + LOG(INFO) << "Not flushing, memrowset not very full"; + } + + if (tablet()->DeltaMemStoresSize() > FLAGS_tablet_test_flush_threshold_mb * 1024 * 1024) { + CHECK_OK(tablet()->FlushBiggestDMS()); + } + + // Wait, unless the inserters are all done. + running_insert_count_.WaitFor(MonoDelta::FromMilliseconds(wait_time)); + wait_time *= FLAGS_flusher_backoff; + } + } + + void FlushDeltasThread(int tid) { + int wait_time = 100; + while (running_insert_count_.count() > 0) { + CHECK_OK(tablet()->FlushBiggestDMS()); + + // Wait, unless the inserters are all done. + running_insert_count_.WaitFor(MonoDelta::FromMilliseconds(wait_time)); + } + } + + void MinorCompactDeltasThread(int tid) { + CompactDeltas(RowSet::MINOR_DELTA_COMPACTION); + } + + void MajorCompactDeltasThread(int tid) { + CompactDeltas(RowSet::MAJOR_DELTA_COMPACTION); + } + + void CompactDeltas(RowSet::DeltaCompactionType type) { + int wait_time = 100; + while (running_insert_count_.count() > 0) { + CHECK_OK(tablet()->CompactWorstDeltas(type)); + + // Wait, unless the inserters are all done. + running_insert_count_.WaitFor(MonoDelta::FromMilliseconds(wait_time)); + } + } + + void CompactThread(int tid) { + int wait_time = 100; + while (running_insert_count_.count() > 0) { + CHECK_OK(tablet()->Compact(Tablet::COMPACT_NO_FLAGS)); + + // Wait, unless the inserters are all done. + running_insert_count_.WaitFor(MonoDelta::FromMilliseconds(wait_time)); + } + } + + // Thread which cycles between inserting and deleting a test row, each time + // with a different value. 
+ void DeleteAndReinsertCycleThread(int tid) { + int32_t iteration = 0; + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + + while (running_insert_count_.count() > 0) { + for (int i = 0; i < 100; i++) { + CHECK_OK(this->InsertTestRow(&writer, tid, iteration++)); + CHECK_OK(this->DeleteTestRow(&writer, tid)); + } + } + } + + // Thread which continuously sends updates at the same row, ignoring any + // "not found" errors that might come back. This is used simultaneously with + // DeleteAndReinsertCycleThread to check for races where we might accidentally + // succeed in UPDATING a ghost row. + void StubbornlyUpdateSameRowThread(int tid) { + int32_t iteration = 0; + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + while (running_insert_count_.count() > 0) { + for (int i = 0; i < 100; i++) { + Status s = this->UpdateTestRow(&writer, tid, iteration++); + if (!s.ok() && !s.IsNotFound()) { + // We expect "not found", but not any other errors. + CHECK_OK(s); + } + } + } + } + + // Thread which wakes up periodically and collects metrics like memrowset + // size, etc. Eventually we should have a metrics system to collect things + // like this, but for now, this is what we've got. + void CollectStatisticsThread(int tid) { + shared_ptr num_rowsets_ts = ts_collector_.GetTimeSeries( + "num_rowsets"); + shared_ptr memrowset_size_ts = ts_collector_.GetTimeSeries( + "memrowset_kb"); + + while (running_insert_count_.count() > 0) { + num_rowsets_ts->SetValue(tablet()->num_rowsets()); + memrowset_size_ts->SetValue(tablet()->MemRowSetSize() / 1024); + + // Wait, unless the inserters are all done. 
+ running_insert_count_.WaitFor(MonoDelta::FromMilliseconds(250)); + } + } + + template + void StartThreads(int n_threads, const FunctionType &function) { + for (int i = 0; i < n_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("test$0", i), + function, this, i, &new_thread)); + threads_.push_back(new_thread); + } + } + + void JoinThreads() { + for (scoped_refptr thr : threads_) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } + } + + std::vector > threads_; + CountDownLatch running_insert_count_; + + // Projection with only an int column. + // This is provided by both harnesses. + Schema valcol_projection_; + + TimeSeriesCollector ts_collector_; +}; + + +TYPED_TEST_CASE(MultiThreadedTabletTest, TabletTestHelperTypes); + + +TYPED_TEST(MultiThreadedTabletTest, DoTestAllAtOnce) { + if (1000 == FLAGS_inserts_per_thread) { + if (AllowSlowTests()) { + FLAGS_inserts_per_thread = 50000; + } + } + + // Spawn a bunch of threads, each of which will do updates. 
+ this->StartThreads(1, &TestFixture::CollectStatisticsThread); + this->StartThreads(FLAGS_num_insert_threads, &TestFixture::InsertThread); + this->StartThreads(FLAGS_num_counter_threads, &TestFixture::CountThread); + this->StartThreads(FLAGS_num_summer_threads, &TestFixture::SummerThread); + this->StartThreads(FLAGS_num_flush_threads, &TestFixture::FlushThread); + this->StartThreads(FLAGS_num_compact_threads, &TestFixture::CompactThread); + this->StartThreads(FLAGS_num_flush_delta_threads, &TestFixture::FlushDeltasThread); + this->StartThreads(FLAGS_num_minor_compact_deltas_threads, + &TestFixture::MinorCompactDeltasThread); + this->StartThreads(FLAGS_num_major_compact_deltas_threads, + &TestFixture::MajorCompactDeltasThread); + this->StartThreads(FLAGS_num_slowreader_threads, &TestFixture::SlowReaderThread); + this->StartThreads(FLAGS_num_updater_threads, &TestFixture::UpdateThread); + this->JoinThreads(); + LOG_TIMING(INFO, "Summing int32 column") { + uint64_t sum = this->CountSum(shared_ptr()); + LOG(INFO) << "Sum = " << sum; + } + + uint64_t max_rows = this->ClampRowCount(FLAGS_inserts_per_thread * FLAGS_num_insert_threads) + / FLAGS_num_insert_threads; + + this->VerifyTestRows(0, max_rows * FLAGS_num_insert_threads); +} + +// Start up a bunch of threads which repeatedly insert and delete the same +// row, while flushing and compacting. This checks various concurrent handling +// of DELETE/REINSERT during flushes. 
+TYPED_TEST(MultiThreadedTabletTest, DeleteAndReinsert) { + google::FlagSaver saver; + FLAGS_flusher_backoff = 1.0f; + FLAGS_flusher_initial_frequency_ms = 1; + FLAGS_tablet_delta_store_major_compact_min_ratio = 0.01f; + FLAGS_tablet_delta_store_minor_compact_max = 10; + this->StartThreads(FLAGS_num_flush_threads, &TestFixture::FlushThread); + this->StartThreads(FLAGS_num_compact_threads, &TestFixture::CompactThread); + this->StartThreads(FLAGS_num_flush_delta_threads, &TestFixture::FlushDeltasThread); + this->StartThreads(FLAGS_num_minor_compact_deltas_threads, + &TestFixture::MinorCompactDeltasThread); + this->StartThreads(FLAGS_num_major_compact_deltas_threads, + &TestFixture::MajorCompactDeltasThread); + this->StartThreads(10, &TestFixture::DeleteAndReinsertCycleThread); + this->StartThreads(10, &TestFixture::StubbornlyUpdateSameRowThread); + + // Run very quickly in dev builds, longer in slow builds. + float runtime_seconds = AllowSlowTests() ? 2 : 0.1; + Stopwatch sw; + sw.start(); + while (sw.elapsed().wall < runtime_seconds * NANOS_PER_SECOND && + !this->HasFatalFailure()) { + SleepFor(MonoDelta::FromMicroseconds(5000)); + } + + // This is sort of a hack -- the flusher thread stops when it sees this + // countdown latch go to 0. + this->running_insert_count_.Reset(0); + this->JoinThreads(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/multi_column_writer.cc b/src/kudu/tablet/multi_column_writer.cc new file mode 100644 index 000000000000..0632415eb648 --- /dev/null +++ b/src/kudu/tablet/multi_column_writer.cc @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/multi_column_writer.h" + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/schema.h" +#include "kudu/fs/block_id.h" +#include "kudu/gutil/stl_util.h" + +namespace kudu { +namespace tablet { + +using cfile::CFileWriter; +using fs::ScopedWritableBlockCloser; +using fs::WritableBlock; + +MultiColumnWriter::MultiColumnWriter(FsManager* fs, + const Schema* schema) + : fs_(fs), + schema_(schema), + finished_(false) { +} + +MultiColumnWriter::~MultiColumnWriter() { + STLDeleteElements(&cfile_writers_); +} + +Status MultiColumnWriter::Open() { + CHECK(cfile_writers_.empty()); + + // Open columns. + for (int i = 0; i < schema_->num_columns(); i++) { + const ColumnSchema &col = schema_->column(i); + + // TODO: allow options to be configured, perhaps on a per-column + // basis as part of the schema. For now use defaults. + // + // Also would be able to set encoding here, or do something smart + // to figure out the encoding on the fly. + cfile::WriterOptions opts; + + // Index all columns by ordinal position, so we can match up + // the corresponding rows. + opts.write_posidx = true; + + /// Set the column storage attributes. + opts.storage_attributes = col.attributes(); + + // If the schema has a single PK and this is the PK col + if (i == 0 && schema_->num_key_columns() == 1) { + opts.write_validx = true; + } + + // Open file for write. 
+ gscoped_ptr block; + RETURN_NOT_OK_PREPEND(fs_->CreateNewBlock(&block), + "Unable to open output file for column " + col.ToString()); + BlockId block_id(block->id()); + + // Create the CFile writer itself. + gscoped_ptr writer(new CFileWriter( + opts, + col.type_info(), + col.is_nullable(), + block.Pass())); + RETURN_NOT_OK_PREPEND(writer->Start(), + "Unable to Start() writer for column " + col.ToString()); + + LOG(INFO) << "Opened CFile writer for column " << col.ToString(); + cfile_writers_.push_back(writer.release()); + block_ids_.push_back(block_id); + } + + return Status::OK(); +} + +Status MultiColumnWriter::AppendBlock(const RowBlock& block) { + for (int i = 0; i < schema_->num_columns(); i++) { + ColumnBlock column = block.column_block(i); + if (column.is_nullable()) { + RETURN_NOT_OK(cfile_writers_[i]->AppendNullableEntries(column.null_bitmap(), + column.data(), column.nrows())); + } else { + RETURN_NOT_OK(cfile_writers_[i]->AppendEntries(column.data(), column.nrows())); + } + } + return Status::OK(); +} + +Status MultiColumnWriter::Finish() { + ScopedWritableBlockCloser closer; + RETURN_NOT_OK(FinishAndReleaseBlocks(&closer)); + return closer.CloseBlocks(); +} + +Status MultiColumnWriter::FinishAndReleaseBlocks(ScopedWritableBlockCloser* closer) { + CHECK(!finished_); + for (int i = 0; i < schema_->num_columns(); i++) { + CFileWriter *writer = cfile_writers_[i]; + Status s = writer->FinishAndReleaseBlock(closer); + if (!s.ok()) { + LOG(WARNING) << "Unable to Finish writer for column " << + schema_->column(i).ToString() << ": " << s.ToString(); + return s; + } + } + finished_ = true; + return Status::OK(); +} + +void MultiColumnWriter::GetFlushedBlocksByColumnId(std::map* ret) const { + CHECK(finished_); + ret->clear(); + for (int i = 0; i < schema_->num_columns(); i++) { + (*ret)[schema_->column_id(i)] = block_ids_[i]; + } +} + +size_t MultiColumnWriter::written_size() const { + size_t size = 0; + for (const CFileWriter *writer : cfile_writers_) { + 
size += writer->written_size(); + } + return size; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/multi_column_writer.h b/src/kudu/tablet/multi_column_writer.h new file mode 100644 index 000000000000..71a0691a1752 --- /dev/null +++ b/src/kudu/tablet/multi_column_writer.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_MULTI_COLUMN_WRITER_H +#define KUDU_TABLET_MULTI_COLUMN_WRITER_H + +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/macros.h" + +namespace kudu { + +class RowBlock; +class Schema; + +namespace cfile { +class CFileWriter; +} // namespace cfile + +namespace fs { +class ScopedWritableBlockCloser; +} // namespace fs + +namespace tablet { + +// Wrapper which writes several columns in parallel corresponding to some +// Schema. +class MultiColumnWriter { + public: + MultiColumnWriter(FsManager* fs, + const Schema* schema); + + virtual ~MultiColumnWriter(); + + // Open and start writing the columns. + Status Open(); + + // Append the given block to the output columns. + // + // Note that the selection vector here is ignored. 
+ Status AppendBlock(const RowBlock& block); + + // Close the in-progress files. + // + // The file's blocks may be retrieved using FlushedBlocks(). + Status Finish(); + + // Close the in-progress CFiles, releasing the underlying writable blocks + // to 'closer'. + Status FinishAndReleaseBlocks(fs::ScopedWritableBlockCloser* closer); + + // Return the number of bytes written so far. + size_t written_size() const; + + cfile::CFileWriter* writer_for_col_idx(int i) { + DCHECK_LT(i, cfile_writers_.size()); + return cfile_writers_[i]; + } + + // Return the block IDs of the written columns, keyed by column ID. + // + // REQUIRES: Finish() already called. + void GetFlushedBlocksByColumnId(std::map* ret) const; + + private: + FsManager* const fs_; + const Schema* const schema_; + + bool finished_; + + std::vector cfile_writers_; + std::vector block_ids_; + + DISALLOW_COPY_AND_ASSIGN(MultiColumnWriter); +}; + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_MULTI_COLUMN_WRITER_H */ diff --git a/src/kudu/tablet/mutation.cc b/src/kudu/tablet/mutation.cc new file mode 100644 index 000000000000..96b970c72737 --- /dev/null +++ b/src/kudu/tablet/mutation.cc @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/tablet/mutation.h" +#include + +namespace kudu { +namespace tablet { + +string Mutation::StringifyMutationList(const Schema &schema, const Mutation *head) { + string ret; + + ret.append("["); + + bool first = true; + while (head != nullptr) { + if (!first) { + ret.append(", "); + } + first = false; + + StrAppend(&ret, "@", head->timestamp().ToString(), "("); + ret.append(head->changelist().ToString(schema)); + ret.append(")"); + + head = head->next(); + } + + ret.append("]"); + return ret; +} + + +void Mutation::AppendToListAtomic(Mutation **list) { + DoAppendToList(list); +} + +void Mutation::AppendToList(Mutation **list) { + DoAppendToList(list); +} + +namespace { +template +inline void Store(Mutation** pointer, Mutation* val); + +template<> +inline void Store(Mutation** pointer, Mutation* val) { + Release_Store(reinterpret_cast(pointer), + reinterpret_cast(val)); +} + +template<> +inline void Store(Mutation** pointer, Mutation* val) { + *pointer = val; +} +} // anonymous namespace + +template +inline void Mutation::DoAppendToList(Mutation **list) { + next_ = nullptr; + if (*list == nullptr) { + Store(list, this); + } else { + // Find tail and append. + Mutation *tail = *list; + while (tail->next_ != nullptr) { + tail = tail->next_; + } + Store(&tail->next_, this); + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/mutation.h b/src/kudu/tablet/mutation.h new file mode 100644 index 000000000000..6e29586ea105 --- /dev/null +++ b/src/kudu/tablet/mutation.h @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_MUTATION_H +#define KUDU_TABLET_MUTATION_H + +#include + +#include "kudu/common/row_changelist.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/slice.h" +#include "kudu/tablet/mvcc.h" + +namespace kudu { +namespace tablet { + +// A single mutation associated with a row. +// This object also acts as a node in a linked list connected to other +// mutations in the row. +// +// This is a variable-length object. +class Mutation { + public: + Mutation() { } + + // Create a new Mutation object with a copy of the given changelist. + // The object is allocated from the provided Arena. + template + static Mutation *CreateInArena( + ArenaType *arena, Timestamp timestamp, const RowChangeList &rcl); + + RowChangeList changelist() const { + return RowChangeList(Slice(changelist_data_, changelist_size_)); + } + + Timestamp timestamp() const { return timestamp_; } + const Mutation *next() const { return next_; } + void set_next(Mutation *next) { + next_ = next; + } + + // Return a stringified version of the given list of mutations. + // This should only be used for debugging/logging. + static string StringifyMutationList(const Schema &schema, const Mutation *head); + + // Append this mutation to the list at the given pointer. 
+ void AppendToListAtomic(Mutation **list); + + // Same as above, except that this version implies "Release" memory semantics + // (see atomicops.h). The pointer as well as all of the mutations in the list + // must be word-aligned. + void AppendToList(Mutation **list); + + private: + friend class MSRow; + friend class MemRowSet; + + template + void DoAppendToList(Mutation **list); + + DISALLOW_COPY_AND_ASSIGN(Mutation); + + // The transaction ID which made this mutation. If this transaction is not + // committed in the snapshot of the reader, this mutation should be ignored. + Timestamp timestamp_; + + // Link to the next mutation on this row + Mutation *next_; + + uint32_t changelist_size_; + + // The actual encoded RowChangeList + char changelist_data_[0]; +}; + +template +inline Mutation *Mutation::CreateInArena( + ArenaType *arena, Timestamp timestamp, const RowChangeList &rcl) { + DCHECK(!rcl.is_null()); + + size_t size = sizeof(Mutation) + rcl.slice().size(); + void *storage = arena->AllocateBytesAligned(size, BASE_PORT_H_ALIGN_OF(Mutation)); + CHECK(storage) << "failed to allocate storage from arena"; + auto ret = new (storage) Mutation(); + ret->timestamp_ = timestamp; + ret->next_ = NULL; + ret->changelist_size_ = rcl.slice().size(); + memcpy(ret->changelist_data_, rcl.slice().data(), rcl.slice().size()); + return ret; +} + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/mvcc-test.cc b/src/kudu/tablet/mvcc-test.cc new file mode 100644 index 000000000000..13504ea4f3ec --- /dev/null +++ b/src/kudu/tablet/mvcc-test.cc @@ -0,0 +1,619 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/server/hybrid_clock.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/monotime.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tablet { + +using server::Clock; +using server::HybridClock; + +class MvccTest : public KuduTest { + public: + MvccTest() + : clock_( + server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp)) { + } + + void WaitForSnapshotAtTSThread(MvccManager* mgr, Timestamp ts) { + MvccSnapshot s; + CHECK_OK(mgr->WaitForCleanSnapshotAtTimestamp(ts, &s, MonoTime::Max())); + CHECK(s.is_clean()) << "verifying postcondition"; + boost::lock_guard lock(lock_); + result_snapshot_.reset(new MvccSnapshot(s)); + } + + bool HasResultSnapshot() { + boost::lock_guard lock(lock_); + return result_snapshot_ != nullptr; + } + + protected: + scoped_refptr clock_; + + mutable simple_spinlock lock_; + gscoped_ptr result_snapshot_; +}; + +TEST_F(MvccTest, TestMvccBasic) { + MvccManager mgr(clock_.get()); + MvccSnapshot snap; + + // Initial state should not have any committed transactions. + mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed={T|T < 1}]", snap.ToString()); + ASSERT_FALSE(snap.IsCommitted(Timestamp(1))); + ASSERT_FALSE(snap.IsCommitted(Timestamp(2))); + + // Start timestamp 1 + Timestamp t = mgr.StartTransaction(); + ASSERT_EQ(1, t.value()); + + // State should still have no committed transactions, since 1 is in-flight. 
+ mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed={T|T < 1}]", snap.ToString()); + ASSERT_FALSE(snap.IsCommitted(Timestamp(1))); + ASSERT_FALSE(snap.IsCommitted(Timestamp(2))); + + // Mark timestamp 1 as "applying" + mgr.StartApplyingTransaction(t); + + // This should not change the set of committed transactions. + ASSERT_FALSE(snap.IsCommitted(Timestamp(1))); + + // Commit timestamp 1 + mgr.CommitTransaction(t); + + // State should show 0 as committed, 1 as uncommitted. + mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed={T|T < 1 or (T in {1})}]", snap.ToString()); + ASSERT_TRUE(snap.IsCommitted(Timestamp(1))); + ASSERT_FALSE(snap.IsCommitted(Timestamp(2))); +} + +TEST_F(MvccTest, TestMvccMultipleInFlight) { + MvccManager mgr(clock_.get()); + MvccSnapshot snap; + + // Start timestamp 1, timestamp 2 + Timestamp t1 = mgr.StartTransaction(); + ASSERT_EQ(1, t1.value()); + Timestamp t2 = mgr.StartTransaction(); + ASSERT_EQ(2, t2.value()); + + // State should still have no committed transactions, since both are in-flight. + + mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed={T|T < 1}]", snap.ToString()); + ASSERT_FALSE(snap.IsCommitted(t1)); + ASSERT_FALSE(snap.IsCommitted(t2)); + + // Commit timestamp 2 + mgr.StartApplyingTransaction(t2); + mgr.CommitTransaction(t2); + + // State should show 2 as committed, 1 as uncommitted. + mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed=" + "{T|T < 1 or (T in {2})}]", + snap.ToString()); + ASSERT_FALSE(snap.IsCommitted(t1)); + ASSERT_TRUE(snap.IsCommitted(t2)); + + // Start another transaction. This gets timestamp 3 + Timestamp t3 = mgr.StartTransaction(); + ASSERT_EQ(3, t3.value()); + + // State should show 2 as committed, 1 and 4 as uncommitted. 
+ mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed=" + "{T|T < 1 or (T in {2})}]", + snap.ToString()); + ASSERT_FALSE(snap.IsCommitted(t1)); + ASSERT_TRUE(snap.IsCommitted(t2)); + ASSERT_FALSE(snap.IsCommitted(t3)); + + // Commit 3 + mgr.StartApplyingTransaction(t3); + mgr.CommitTransaction(t3); + + // 2 and 3 committed + mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed=" + "{T|T < 1 or (T in {2,3})}]", + snap.ToString()); + ASSERT_FALSE(snap.IsCommitted(t1)); + ASSERT_TRUE(snap.IsCommitted(t2)); + ASSERT_TRUE(snap.IsCommitted(t3)); + + // Commit 1 + mgr.StartApplyingTransaction(t1); + mgr.CommitTransaction(t1); + + // all committed + mgr.TakeSnapshot(&snap); + ASSERT_EQ("MvccSnapshot[committed={T|T < 3 or (T in {3})}]", snap.ToString()); + ASSERT_TRUE(snap.IsCommitted(t1)); + ASSERT_TRUE(snap.IsCommitted(t2)); + ASSERT_TRUE(snap.IsCommitted(t3)); +} + +TEST_F(MvccTest, TestOutOfOrderTxns) { + scoped_refptr hybrid_clock(new HybridClock()); + ASSERT_OK(hybrid_clock->Init()); + MvccManager mgr(hybrid_clock); + + // Start a normal non-commit-wait txn. + Timestamp normal_txn = mgr.StartTransaction(); + + MvccSnapshot s1(mgr); + + // Start a transaction as if it were using commit-wait (i.e. started in future) + Timestamp cw_txn = mgr.StartTransactionAtLatest(); + + // Commit the original txn + mgr.StartApplyingTransaction(normal_txn); + mgr.CommitTransaction(normal_txn); + + // Start a new txn + Timestamp normal_txn_2 = mgr.StartTransaction(); + + // The old snapshot should not have either txn + EXPECT_FALSE(s1.IsCommitted(normal_txn)); + EXPECT_FALSE(s1.IsCommitted(normal_txn_2)); + + // A new snapshot should have only the first transaction + MvccSnapshot s2(mgr); + EXPECT_TRUE(s2.IsCommitted(normal_txn)); + EXPECT_FALSE(s2.IsCommitted(normal_txn_2)); + + // Commit the commit-wait one once it is time. 
+ ASSERT_OK(hybrid_clock->WaitUntilAfter(cw_txn, MonoTime::Max())); + mgr.StartApplyingTransaction(cw_txn); + mgr.CommitTransaction(cw_txn); + + // A new snapshot at this point should still think that normal_txn_2 is uncommitted + MvccSnapshot s3(mgr); + EXPECT_FALSE(s3.IsCommitted(normal_txn_2)); +} + +// Tests starting transaction at a point-in-time in the past and committing them. +// This is disconnected from the current time (whatever is returned from clock->Now()) +// for replication/bootstrap. +TEST_F(MvccTest, TestOfflineTransactions) { + MvccManager mgr(clock_.get()); + + // set the clock to some time in the "future" + ASSERT_OK(clock_->Update(Timestamp(100))); + + // now start a transaction in the "past" + ASSERT_OK(mgr.StartTransactionAtTimestamp(Timestamp(50))); + + ASSERT_EQ(mgr.GetCleanTimestamp().CompareTo(Timestamp::kInitialTimestamp), 0); + + // and committing this transaction "offline" this + // should not advance the MvccManager 'all_committed_before_' + // watermark. + mgr.StartApplyingTransaction(Timestamp(50)); + mgr.OfflineCommitTransaction(Timestamp(50)); + + // Now take a snaphsot. + MvccSnapshot snap1; + mgr.TakeSnapshot(&snap1); + + // Because we did not advance the watermark, even though the only + // in-flight transaction was committed at time 50, a transaction at + // time 40 should still be considered uncommitted. + ASSERT_FALSE(snap1.IsCommitted(Timestamp(40))); + + // Now advance the watermark to the last committed transaction. 
+ mgr.OfflineAdjustSafeTime(Timestamp(50)); + + ASSERT_EQ(mgr.GetCleanTimestamp().CompareTo(Timestamp(50)), 0); + + MvccSnapshot snap2; + mgr.TakeSnapshot(&snap2); + + ASSERT_TRUE(snap2.IsCommitted(Timestamp(40))); +} + +TEST_F(MvccTest, TestScopedTransaction) { + MvccManager mgr(clock_.get()); + MvccSnapshot snap; + + { + ScopedTransaction t1(&mgr); + ScopedTransaction t2(&mgr); + + ASSERT_EQ(1, t1.timestamp().value()); + ASSERT_EQ(2, t2.timestamp().value()); + + t1.StartApplying(); + t1.Commit(); + + mgr.TakeSnapshot(&snap); + ASSERT_TRUE(snap.IsCommitted(t1.timestamp())); + ASSERT_FALSE(snap.IsCommitted(t2.timestamp())); + } + + // t2 going out of scope aborts it. + mgr.TakeSnapshot(&snap); + ASSERT_TRUE(snap.IsCommitted(Timestamp(1))); + ASSERT_FALSE(snap.IsCommitted(Timestamp(2))); +} + +TEST_F(MvccTest, TestPointInTimeSnapshot) { + MvccSnapshot snap(Timestamp(10)); + + ASSERT_TRUE(snap.IsCommitted(Timestamp(1))); + ASSERT_TRUE(snap.IsCommitted(Timestamp(9))); + ASSERT_FALSE(snap.IsCommitted(Timestamp(10))); + ASSERT_FALSE(snap.IsCommitted(Timestamp(11))); +} + +TEST_F(MvccTest, TestMayHaveCommittedTransactionsAtOrAfter) { + MvccSnapshot snap; + snap.all_committed_before_ = Timestamp(10); + snap.committed_timestamps_.push_back(11); + snap.committed_timestamps_.push_back(13); + snap.none_committed_at_or_after_ = Timestamp(14); + + ASSERT_TRUE(snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(9))); + ASSERT_TRUE(snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(10))); + ASSERT_TRUE(snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(12))); + ASSERT_TRUE(snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(13))); + ASSERT_FALSE(snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(14))); + ASSERT_FALSE(snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(15))); + + // Test for "all committed" snapshot + MvccSnapshot all_committed = + MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + ASSERT_TRUE( + 
all_committed.MayHaveCommittedTransactionsAtOrAfter(Timestamp(1))); + ASSERT_TRUE( + all_committed.MayHaveCommittedTransactionsAtOrAfter(Timestamp(12345))); + + // And "none committed" snapshot + MvccSnapshot none_committed = + MvccSnapshot::CreateSnapshotIncludingNoTransactions(); + ASSERT_FALSE( + none_committed.MayHaveCommittedTransactionsAtOrAfter(Timestamp(1))); + ASSERT_FALSE( + none_committed.MayHaveCommittedTransactionsAtOrAfter(Timestamp(12345))); + + // Test for a "clean" snapshot + MvccSnapshot clean_snap(Timestamp(10)); + ASSERT_TRUE(clean_snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(9))); + ASSERT_FALSE(clean_snap.MayHaveCommittedTransactionsAtOrAfter(Timestamp(10))); +} + +TEST_F(MvccTest, TestMayHaveUncommittedTransactionsBefore) { + MvccSnapshot snap; + snap.all_committed_before_ = Timestamp(10); + snap.committed_timestamps_.push_back(11); + snap.committed_timestamps_.push_back(13); + snap.none_committed_at_or_after_ = Timestamp(14); + + ASSERT_FALSE(snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(9))); + ASSERT_TRUE(snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(10))); + ASSERT_TRUE(snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(11))); + ASSERT_TRUE(snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(13))); + ASSERT_TRUE(snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(14))); + ASSERT_TRUE(snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(15))); + + // Test for "all committed" snapshot + MvccSnapshot all_committed = + MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + ASSERT_FALSE( + all_committed.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(1))); + ASSERT_FALSE( + all_committed.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(12345))); + + // And "none committed" snapshot + MvccSnapshot none_committed = + MvccSnapshot::CreateSnapshotIncludingNoTransactions(); + ASSERT_TRUE( + none_committed.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(1))); + ASSERT_TRUE( + 
none_committed.MayHaveUncommittedTransactionsAtOrBefore( + Timestamp(12345))); + + // Test for a "clean" snapshot + MvccSnapshot clean_snap(Timestamp(10)); + ASSERT_FALSE(clean_snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(9))); + ASSERT_TRUE(clean_snap.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(10))); + + // Test for the case where we have a single transaction in flight. Since this is + // also the earliest transaction, all_committed_before_ is equal to the txn's + // ts, but when it gets committed we can't advance all_committed_before_ past it + // because there is no other transaction to advance it to. In this case we should + // still report that there can't be any uncommitted transactions before. + MvccSnapshot snap2; + snap2.all_committed_before_ = Timestamp(10); + snap2.committed_timestamps_.push_back(10); + + ASSERT_FALSE(snap2.MayHaveUncommittedTransactionsAtOrBefore(Timestamp(10))); +} + +TEST_F(MvccTest, TestAreAllTransactionsCommitted) { + MvccManager mgr(clock_.get()); + + // start several transactions and take snapshots along the way + Timestamp tx1 = mgr.StartTransaction(); + Timestamp tx2 = mgr.StartTransaction(); + Timestamp tx3 = mgr.StartTransaction(); + + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(1))); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(2))); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(3))); + + // commit tx3, should all still report as having as having uncommitted + // transactions. 
+ mgr.StartApplyingTransaction(tx3); + mgr.CommitTransaction(tx3); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(1))); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(2))); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(3))); + + // commit tx1, first snap with in-flights should now report as all committed + // and remaining snaps as still having uncommitted transactions + mgr.StartApplyingTransaction(tx1); + mgr.CommitTransaction(tx1); + ASSERT_TRUE(mgr.AreAllTransactionsCommitted(Timestamp(1))); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(2))); + ASSERT_FALSE(mgr.AreAllTransactionsCommitted(Timestamp(3))); + + // Now they should all report as all committed. + mgr.StartApplyingTransaction(tx2); + mgr.CommitTransaction(tx2); + ASSERT_TRUE(mgr.AreAllTransactionsCommitted(Timestamp(1))); + ASSERT_TRUE(mgr.AreAllTransactionsCommitted(Timestamp(2))); + ASSERT_TRUE(mgr.AreAllTransactionsCommitted(Timestamp(3))); +} + +TEST_F(MvccTest, TestWaitForCleanSnapshot_SnapWithNoInflights) { + MvccManager mgr(clock_.get()); + boost::thread waiting_thread = boost::thread( + &MvccTest::WaitForSnapshotAtTSThread, this, &mgr, clock_->Now()); + + // join immediately. 
+ waiting_thread.join(); + ASSERT_TRUE(HasResultSnapshot()); +} + +TEST_F(MvccTest, TestWaitForCleanSnapshot_SnapWithInFlights) { + + MvccManager mgr(clock_.get()); + + Timestamp tx1 = mgr.StartTransaction(); + Timestamp tx2 = mgr.StartTransaction(); + + boost::thread waiting_thread = boost::thread( + &MvccTest::WaitForSnapshotAtTSThread, this, &mgr, clock_->Now()); + + ASSERT_FALSE(HasResultSnapshot()); + mgr.StartApplyingTransaction(tx1); + mgr.CommitTransaction(tx1); + ASSERT_FALSE(HasResultSnapshot()); + mgr.StartApplyingTransaction(tx2); + mgr.CommitTransaction(tx2); + waiting_thread.join(); + ASSERT_TRUE(HasResultSnapshot()); +} + +TEST_F(MvccTest, TestWaitForApplyingTransactionsToCommit) { + MvccManager mgr(clock_.get()); + + Timestamp tx1 = mgr.StartTransaction(); + Timestamp tx2 = mgr.StartTransaction(); + + // Wait should return immediately, since we have no transactions "applying" + // yet. + mgr.WaitForApplyingTransactionsToCommit(); + + mgr.StartApplyingTransaction(tx1); + + boost::thread waiting_thread = boost::thread( + &MvccManager::WaitForApplyingTransactionsToCommit, &mgr); + while (mgr.GetNumWaitersForTests() == 0) { + SleepFor(MonoDelta::FromMilliseconds(5)); + } + ASSERT_EQ(mgr.GetNumWaitersForTests(), 1); + + // Aborting the other transaction shouldn't affect our waiter. + mgr.AbortTransaction(tx2); + ASSERT_EQ(mgr.GetNumWaitersForTests(), 1); + + // Committing our transaction should wake the waiter. 
+ mgr.CommitTransaction(tx1); + ASSERT_EQ(mgr.GetNumWaitersForTests(), 0); + waiting_thread.join(); +} + +TEST_F(MvccTest, TestWaitForCleanSnapshot_SnapAtTimestampWithInFlights) { + + MvccManager mgr(clock_.get()); + + // Transactions with timestamp 1 through 3 + Timestamp tx1 = mgr.StartTransaction(); + Timestamp tx2 = mgr.StartTransaction(); + Timestamp tx3 = mgr.StartTransaction(); + + // Start a thread waiting for transactions with ts <= 2 to commit + boost::thread waiting_thread = boost::thread( + &MvccTest::WaitForSnapshotAtTSThread, this, &mgr, tx2); + ASSERT_FALSE(HasResultSnapshot()); + + // Commit tx 1 - thread should still wait. + mgr.StartApplyingTransaction(tx1); + mgr.CommitTransaction(tx1); + SleepFor(MonoDelta::FromMilliseconds(1)); + ASSERT_FALSE(HasResultSnapshot()); + + // Commit tx 3 - thread should still wait. + mgr.StartApplyingTransaction(tx3); + mgr.CommitTransaction(tx3); + SleepFor(MonoDelta::FromMilliseconds(1)); + ASSERT_FALSE(HasResultSnapshot()); + + // Commit tx 2 - thread can now continue + mgr.StartApplyingTransaction(tx2); + mgr.CommitTransaction(tx2); + waiting_thread.join(); + ASSERT_TRUE(HasResultSnapshot()); +} + +// Test that if we abort a transaction we don't advance the safe time and don't +// add the transaction to the committed set. +TEST_F(MvccTest, TestTxnAbort) { + + MvccManager mgr(clock_.get()); + + // Transactions with timestamps 1 through 3 + Timestamp tx1 = mgr.StartTransaction(); + Timestamp tx2 = mgr.StartTransaction(); + Timestamp tx3 = mgr.StartTransaction(); + + // Now abort tx1, this shouldn't move the clean time and the transaction + // shouldn't be reported as committed. 
+ mgr.AbortTransaction(tx1); + ASSERT_EQ(mgr.GetCleanTimestamp().CompareTo(Timestamp::kInitialTimestamp), 0); + ASSERT_FALSE(mgr.cur_snap_.IsCommitted(tx1)); + + // Committing tx3 shouldn't advance the clean time since it is not the earliest + // in-flight, but it should advance 'no_new_transactions_at_or_before_', the "safe" + // time, to 3. + mgr.StartApplyingTransaction(tx3); + mgr.CommitTransaction(tx3); + ASSERT_TRUE(mgr.cur_snap_.IsCommitted(tx3)); + ASSERT_EQ(mgr.no_new_transactions_at_or_before_.CompareTo(tx3), 0); + + // Committing tx2 should advance the clean time to 3. + mgr.StartApplyingTransaction(tx2); + mgr.CommitTransaction(tx2); + ASSERT_TRUE(mgr.cur_snap_.IsCommitted(tx2)); + ASSERT_EQ(mgr.GetCleanTimestamp().CompareTo(tx3), 0); +} + +// This tests for a bug we were observing, where a clean snapshot would not +// coalesce to the latest timestamp, for offline transactions. +TEST_F(MvccTest, TestCleanTimeCoalescingOnOfflineTransactions) { + + MvccManager mgr(clock_.get()); + clock_->Update(Timestamp(20)); + + CHECK_OK(mgr.StartTransactionAtTimestamp(Timestamp(10))); + CHECK_OK(mgr.StartTransactionAtTimestamp(Timestamp(15))); + mgr.OfflineAdjustSafeTime(Timestamp(15)); + + mgr.StartApplyingTransaction(Timestamp(15)); + mgr.OfflineCommitTransaction(Timestamp(15)); + + mgr.StartApplyingTransaction(Timestamp(10)); + mgr.OfflineCommitTransaction(Timestamp(10)); + ASSERT_EQ(mgr.cur_snap_.ToString(), "MvccSnapshot[committed={T|T < 15 or (T in {15})}]"); +} + +// Various death tests which ensure that we can only transition in one of the following +// valid ways: +// +// - Start() -> StartApplying() -> Commit() +// - Start() -> Abort() +// +// Any other transition should fire a CHECK failure. 
+TEST_F(MvccTest, TestIllegalStateTransitionsCrash) { + MvccManager mgr(clock_.get()); + MvccSnapshot snap; + + EXPECT_DEATH({ + mgr.StartApplyingTransaction(Timestamp(1)); + }, "Cannot mark timestamp 1 as APPLYING: not in the in-flight map"); + + // Depending whether this is a DEBUG or RELEASE build, the error message + // could be different for this case -- the "future timestamp" check is only + // run in DEBUG builds. + EXPECT_DEATH({ + mgr.CommitTransaction(Timestamp(1)); + }, + "Trying to commit a transaction with a future timestamp|" + "Trying to remove timestamp which isn't in the in-flight set: 1"); + + clock_->Update(Timestamp(20)); + + EXPECT_DEATH({ + mgr.CommitTransaction(Timestamp(1)); + }, "Trying to remove timestamp which isn't in the in-flight set: 1"); + + // Start a transaction, and try committing it without having moved to "Applying" + // state. + Timestamp t = mgr.StartTransaction(); + EXPECT_DEATH({ + mgr.CommitTransaction(t); + }, "Trying to commit a transaction which never entered APPLYING state"); + + // Aborting should succeed, since we never moved to Applying. + mgr.AbortTransaction(t); + + // Aborting a second time should fail + EXPECT_DEATH({ + mgr.AbortTransaction(t); + }, "Trying to remove timestamp which isn't in the in-flight set: 21"); + + // Start a new transaction. This time, mark it as Applying. + t = mgr.StartTransaction(); + mgr.StartApplyingTransaction(t); + + // Can only call StartApplying once. + EXPECT_DEATH({ + mgr.StartApplyingTransaction(t); + }, "Cannot mark timestamp 22 as APPLYING: wrong state: 1"); + + // Cannot Abort() a transaction once we start applying it. + EXPECT_DEATH({ + mgr.AbortTransaction(t); + }, "transaction with timestamp 22 cannot be aborted in state 1"); + + // We can commit it successfully. 
+ mgr.CommitTransaction(t); +} + +TEST_F(MvccTest, TestWaitUntilCleanDeadline) { + MvccManager mgr(clock_.get()); + + // Transactions with timestamp 1 through 3 + Timestamp tx1 = mgr.StartTransaction(); + + // Wait until the 'tx1' timestamp is clean -- this won't happen because the + // transaction isn't committed yet. + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(10)); + MvccSnapshot snap; + Status s = mgr.WaitForCleanSnapshotAtTimestamp(tx1, &snap, deadline); + ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/mvcc.cc b/src/kudu/tablet/mvcc.cc new file mode 100644 index 000000000000..f972379585c1 --- /dev/null +++ b/src/kudu/tablet/mvcc.cc @@ -0,0 +1,584 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/stopwatch.h" + +namespace kudu { namespace tablet { + +MvccManager::MvccManager(const scoped_refptr& clock) + : no_new_transactions_at_or_before_(Timestamp::kMin), + earliest_in_flight_(Timestamp::kMax), + clock_(clock) { + cur_snap_.all_committed_before_ = Timestamp::kInitialTimestamp; + cur_snap_.none_committed_at_or_after_ = Timestamp::kInitialTimestamp; +} + +Timestamp MvccManager::StartTransaction() { + while (true) { + Timestamp now = clock_->Now(); + boost::lock_guard l(lock_); + if (PREDICT_TRUE(InitTransactionUnlocked(now))) { + return now; + } + } + // dummy return to avoid compiler warnings + LOG(FATAL) << "Unreachable, added to avoid compiler warning."; + return Timestamp::kInvalidTimestamp; +} + +Timestamp MvccManager::StartTransactionAtLatest() { + boost::lock_guard l(lock_); + Timestamp now_latest = clock_->NowLatest(); + while (PREDICT_FALSE(!InitTransactionUnlocked(now_latest))) { + now_latest = clock_->NowLatest(); + } + + // If in debug mode enforce that transactions have monotonically increasing + // timestamps at all times +#ifndef NDEBUG + if (!timestamps_in_flight_.empty()) { + Timestamp max(std::max_element(timestamps_in_flight_.begin(), + timestamps_in_flight_.end())->first); + CHECK_EQ(max.value(), now_latest.value()); + } +#endif + + return now_latest; +} + +Status MvccManager::StartTransactionAtTimestamp(Timestamp timestamp) { + boost::lock_guard l(lock_); + if (PREDICT_FALSE(cur_snap_.IsCommitted(timestamp))) { + return Status::IllegalState( + strings::Substitute("Timestamp: $0 is already committed. 
Current Snapshot: $1", + timestamp.value(), cur_snap_.ToString())); + } + if (!InitTransactionUnlocked(timestamp)) { + return Status::IllegalState( + strings::Substitute("There is already a transaction with timestamp: $0 in flight.", + timestamp.value())); + } + return Status::OK(); +} + +void MvccManager::StartApplyingTransaction(Timestamp timestamp) { + boost::lock_guard l(lock_); + auto it = timestamps_in_flight_.find(timestamp.value()); + if (PREDICT_FALSE(it == timestamps_in_flight_.end())) { + LOG(FATAL) << "Cannot mark timestamp " << timestamp.ToString() << " as APPLYING: " + << "not in the in-flight map."; + } + + TxnState cur_state = it->second; + if (PREDICT_FALSE(cur_state != RESERVED)) { + LOG(FATAL) << "Cannot mark timestamp " << timestamp.ToString() << " as APPLYING: " + << "wrong state: " << cur_state; + } + + it->second = APPLYING; +} + +bool MvccManager::InitTransactionUnlocked(const Timestamp& timestamp) { + // Ensure that we didn't mark the given timestamp as "safe" in between + // acquiring the time and taking the lock. This allows us to acquire timestamps + // outside of the MVCC lock. + if (PREDICT_FALSE(no_new_transactions_at_or_before_.CompareTo(timestamp) >= 0)) { + return false; + } + // Since transactions only commit once they are in the past, and new + // transactions always start either in the current time or the future, + // we should never be trying to start a new transaction at the same time + // as an already-committed one. 
+ DCHECK(!cur_snap_.IsCommitted(timestamp)) + << "Trying to start a new txn at already-committed timestamp " + << timestamp.ToString() + << " cur_snap_: " << cur_snap_.ToString(); + + if (timestamp.CompareTo(earliest_in_flight_) < 0) { + earliest_in_flight_ = timestamp; + } + + return InsertIfNotPresent(&timestamps_in_flight_, timestamp.value(), RESERVED); +} + +void MvccManager::CommitTransaction(Timestamp timestamp) { + boost::lock_guard l(lock_); + bool was_earliest = false; + CommitTransactionUnlocked(timestamp, &was_earliest); + + // No more transactions will start with a ts that is lower than or equal + // to 'timestamp', so we adjust the snapshot accordingly. + if (no_new_transactions_at_or_before_.CompareTo(timestamp) < 0) { + no_new_transactions_at_or_before_ = timestamp; + } + + if (was_earliest) { + // If this transaction was the earliest in-flight, we might have to adjust + // the "clean" timestamp. + AdjustCleanTime(); + } +} + +void MvccManager::AbortTransaction(Timestamp timestamp) { + boost::lock_guard l(lock_); + + // Remove from our in-flight list. + TxnState old_state = RemoveInFlightAndGetStateUnlocked(timestamp); + CHECK_EQ(old_state, RESERVED) << "transaction with timestamp " << timestamp.ToString() + << " cannot be aborted in state " << old_state; + + // If we're aborting the earliest transaction that was in flight, + // update our cached value. + if (earliest_in_flight_.CompareTo(timestamp) == 0) { + AdvanceEarliestInFlightTimestamp(); + } +} + +void MvccManager::OfflineCommitTransaction(Timestamp timestamp) { + boost::lock_guard l(lock_); + + // Commit the transaction, but do not adjust 'all_committed_before_', that will + // be done with a separate OfflineAdjustCurSnap() call. 
+ bool was_earliest = false; + CommitTransactionUnlocked(timestamp, &was_earliest); + + if (was_earliest + && no_new_transactions_at_or_before_.CompareTo(timestamp) >= 0) { + // If this transaction was the earliest in-flight, we might have to adjust + // the "clean" timestamp. + AdjustCleanTime(); + } +} + +MvccManager::TxnState MvccManager::RemoveInFlightAndGetStateUnlocked(Timestamp ts) { + DCHECK(lock_.is_locked()); + + auto it = timestamps_in_flight_.find(ts.value()); + if (it == timestamps_in_flight_.end()) { + LOG(FATAL) << "Trying to remove timestamp which isn't in the in-flight set: " + << ts.ToString(); + } + TxnState state = it->second; + timestamps_in_flight_.erase(it); + return state; +} + +void MvccManager::CommitTransactionUnlocked(Timestamp timestamp, + bool* was_earliest_in_flight) { + DCHECK(clock_->IsAfter(timestamp)) + << "Trying to commit a transaction with a future timestamp: " + << timestamp.ToString() << ". Current time: " << clock_->Stringify(clock_->Now()); + + *was_earliest_in_flight = earliest_in_flight_ == timestamp; + + // Remove from our in-flight list. + TxnState old_state = RemoveInFlightAndGetStateUnlocked(timestamp); + CHECK_EQ(old_state, APPLYING) + << "Trying to commit a transaction which never entered APPLYING state: " + << timestamp.ToString() << " state=" << old_state; + + // Add to snapshot's committed list + cur_snap_.AddCommittedTimestamp(timestamp); + + // If we're committing the earliest transaction that was in flight, + // update our cached value. 
+ if (*was_earliest_in_flight) { + AdvanceEarliestInFlightTimestamp(); + } +} + +void MvccManager::AdvanceEarliestInFlightTimestamp() { + if (timestamps_in_flight_.empty()) { + earliest_in_flight_ = Timestamp::kMax; + } else { + earliest_in_flight_ = Timestamp(std::min_element(timestamps_in_flight_.begin(), + timestamps_in_flight_.end())->first); + } +} + +void MvccManager::OfflineAdjustSafeTime(Timestamp safe_time) { + boost::lock_guard l(lock_); + + // No more transactions will start with a ts that is lower than or equal + // to 'safe_time', so we adjust the snapshot accordingly. + if (no_new_transactions_at_or_before_.CompareTo(safe_time) < 0) { + no_new_transactions_at_or_before_ = safe_time; + } + + AdjustCleanTime(); +} + +// Remove any elements from 'v' which are < the given watermark. +static void FilterTimestamps(std::vector* v, + Timestamp::val_type watermark) { + int j = 0; + for (const auto& ts : *v) { + if (ts >= watermark) { + (*v)[j++] = ts; + } + } + v->resize(j); +} + +void MvccManager::AdjustCleanTime() { + // There are two possibilities: + // + // 1) We still have an in-flight transaction earlier than 'no_new_transactions_at_or_before_'. + // In this case, we update the watermark to that transaction's timestamp. + // + // 2) There are no in-flight transactions earlier than 'no_new_transactions_at_or_before_'. + // (There may still be in-flight transactions with future timestamps due to + // commit-wait transactions which start in the future). In this case, we update + // the watermark to 'no_new_transactions_at_or_before_', since we know that no new + // transactions can start with an earlier timestamp. + // + // In either case, we have to add the newly committed ts only if it remains higher + // than the new watermark. 
+ + if (earliest_in_flight_.CompareTo(no_new_transactions_at_or_before_) < 0) { + cur_snap_.all_committed_before_ = earliest_in_flight_; + } else { + cur_snap_.all_committed_before_ = no_new_transactions_at_or_before_; + } + + // Filter out any committed timestamps that now fall below the watermark + FilterTimestamps(&cur_snap_.committed_timestamps_, cur_snap_.all_committed_before_.value()); + + // it may also have unblocked some waiters. + // Check if someone is waiting for transactions to be committed. + if (PREDICT_FALSE(!waiters_.empty())) { + auto iter = waiters_.begin(); + while (iter != waiters_.end()) { + WaitingState* waiter = *iter; + if (IsDoneWaitingUnlocked(*waiter)) { + iter = waiters_.erase(iter); + waiter->latch->CountDown(); + continue; + } + iter++; + } + } +} + +Status MvccManager::WaitUntil(WaitFor wait_for, Timestamp ts, + const MonoTime& deadline) const { + TRACE_EVENT2("tablet", "MvccManager::WaitUntil", + "wait_for", wait_for == ALL_COMMITTED ? "all_committed" : "none_applying", + "ts", ts.ToUint64()) + + CountDownLatch latch(1); + WaitingState waiting_state; + { + waiting_state.timestamp = ts; + waiting_state.latch = &latch; + waiting_state.wait_for = wait_for; + + boost::lock_guard l(lock_); + if (IsDoneWaitingUnlocked(waiting_state)) return Status::OK(); + waiters_.push_back(&waiting_state); + } + if (waiting_state.latch->WaitUntil(deadline)) { + return Status::OK(); + } + // We timed out. We need to clean up our entry in the waiters_ array. + + boost::lock_guard l(lock_); + // It's possible that while we were re-acquiring the lock, we did get + // notified. In that case, we have no cleanup to do. + if (waiting_state.latch->count() == 0) { + return Status::OK(); + } + + waiters_.erase(std::find(waiters_.begin(), waiters_.end(), &waiting_state)); + return Status::TimedOut(strings::Substitute( + "Timed out waiting for all transactions with ts < $0 to $1", + clock_->Stringify(ts), + wait_for == ALL_COMMITTED ? 
"commit" : "finish applying")); +} + +bool MvccManager::IsDoneWaitingUnlocked(const WaitingState& waiter) const { + switch (waiter.wait_for) { + case ALL_COMMITTED: + return AreAllTransactionsCommittedUnlocked(waiter.timestamp); + case NONE_APPLYING: + return !AnyApplyingAtOrBeforeUnlocked(waiter.timestamp); + } + LOG(FATAL); // unreachable +} + +bool MvccManager::AreAllTransactionsCommittedUnlocked(Timestamp ts) const { + if (timestamps_in_flight_.empty()) { + // If nothing is in-flight, then check the clock. If the timestamp is in the past, + // we know that no new uncommitted transactions may start before this ts. + return ts.CompareTo(clock_->Now()) <= 0; + } + // If some transactions are in flight, then check the in-flight list. + return !cur_snap_.MayHaveUncommittedTransactionsAtOrBefore(ts); +} + +bool MvccManager::AnyApplyingAtOrBeforeUnlocked(Timestamp ts) const { + for (const InFlightMap::value_type entry : timestamps_in_flight_) { + if (entry.first <= ts.value()) { + return true; + } + } + return false; +} + +void MvccManager::TakeSnapshot(MvccSnapshot *snap) const { + boost::lock_guard l(lock_); + *snap = cur_snap_; +} + +Status MvccManager::WaitForCleanSnapshotAtTimestamp(Timestamp timestamp, + MvccSnapshot *snap, + const MonoTime& deadline) const { + TRACE_EVENT0("tablet", "MvccManager::WaitForCleanSnapshotAtTimestamp"); + RETURN_NOT_OK(clock_->WaitUntilAfterLocally(timestamp, deadline)); + RETURN_NOT_OK(WaitUntil(ALL_COMMITTED, timestamp, deadline)); + *snap = MvccSnapshot(timestamp); + return Status::OK(); +} + +void MvccManager::WaitForCleanSnapshot(MvccSnapshot* snap) const { + CHECK_OK(WaitForCleanSnapshotAtTimestamp(clock_->Now(), snap, MonoTime::Max())); +} + +void MvccManager::WaitForApplyingTransactionsToCommit() const { + TRACE_EVENT0("tablet", "MvccManager::WaitForApplyingTransactionsToCommit"); + + // Find the highest timestamp of an APPLYING transaction. 
+ Timestamp wait_for = Timestamp::kMin; + { + boost::lock_guard l(lock_); + for (const InFlightMap::value_type entry : timestamps_in_flight_) { + if (entry.second == APPLYING) { + wait_for = Timestamp(std::max(entry.first, wait_for.value())); + } + } + } + + // Wait until there are no transactions applying with that timestamp + // or below. It's possible that we're a bit conservative here - more transactions + // may enter the APPLYING set while we're waiting, but we will eventually + // succeed. + if (wait_for == Timestamp::kMin) { + // None were APPLYING: we can just return. + return; + } + CHECK_OK(WaitUntil(NONE_APPLYING, wait_for, MonoTime::Max())); +} + +bool MvccManager::AreAllTransactionsCommitted(Timestamp ts) const { + boost::lock_guard l(lock_); + return AreAllTransactionsCommittedUnlocked(ts); +} + +int MvccManager::CountTransactionsInFlight() const { + boost::lock_guard l(lock_); + return timestamps_in_flight_.size(); +} + +Timestamp MvccManager::GetCleanTimestamp() const { + boost::lock_guard l(lock_); + return cur_snap_.all_committed_before_; +} + +void MvccManager::GetApplyingTransactionsTimestamps(std::vector* timestamps) const { + boost::lock_guard l(lock_); + timestamps->reserve(timestamps_in_flight_.size()); + for (const InFlightMap::value_type entry : timestamps_in_flight_) { + if (entry.second == APPLYING) { + timestamps->push_back(Timestamp(entry.first)); + } + } +} + +MvccManager::~MvccManager() { + CHECK(waiters_.empty()); +} + +//////////////////////////////////////////////////////////// +// MvccSnapshot +//////////////////////////////////////////////////////////// + +MvccSnapshot::MvccSnapshot() + : all_committed_before_(Timestamp::kInitialTimestamp), + none_committed_at_or_after_(Timestamp::kInitialTimestamp) { +} + +MvccSnapshot::MvccSnapshot(const MvccManager &manager) { + manager.TakeSnapshot(this); +} + +MvccSnapshot::MvccSnapshot(const Timestamp& timestamp) + : all_committed_before_(timestamp), + 
none_committed_at_or_after_(timestamp) { + } + +MvccSnapshot MvccSnapshot::CreateSnapshotIncludingAllTransactions() { + return MvccSnapshot(Timestamp::kMax); +} + +MvccSnapshot MvccSnapshot::CreateSnapshotIncludingNoTransactions() { + return MvccSnapshot(Timestamp::kMin); +} + +bool MvccSnapshot::IsCommittedFallback(const Timestamp& timestamp) const { + for (const Timestamp::val_type& v : committed_timestamps_) { + if (v == timestamp.value()) return true; + } + + return false; +} + +bool MvccSnapshot::MayHaveCommittedTransactionsAtOrAfter(const Timestamp& timestamp) const { + return timestamp.CompareTo(none_committed_at_or_after_) < 0; +} + +bool MvccSnapshot::MayHaveUncommittedTransactionsAtOrBefore(const Timestamp& timestamp) const { + // The snapshot may have uncommitted transactions before 'timestamp' if: + // - 'all_committed_before_' comes before 'timestamp' + // - 'all_committed_before_' is precisely 'timestamp' but 'timestamp' isn't in the + // committed set. + return timestamp.CompareTo(all_committed_before_) > 0 || + (timestamp.CompareTo(all_committed_before_) == 0 && !IsCommittedFallback(timestamp)); +} + +std::string MvccSnapshot::ToString() const { + string ret("MvccSnapshot[committed={T|"); + + if (committed_timestamps_.size() == 0) { + StrAppend(&ret, "T < ", all_committed_before_.ToString(),"}]"); + return ret; + } + StrAppend(&ret, "T < ", all_committed_before_.ToString(), + " or (T in {"); + + bool first = true; + for (Timestamp::val_type t : committed_timestamps_) { + if (!first) { + ret.push_back(','); + } + first = false; + StrAppend(&ret, t); + } + ret.append("})}]"); + return ret; +} + +void MvccSnapshot::AddCommittedTimestamps(const std::vector& timestamps) { + for (const Timestamp& ts : timestamps) { + AddCommittedTimestamp(ts); + } +} + +void MvccSnapshot::AddCommittedTimestamp(Timestamp timestamp) { + if (IsCommitted(timestamp)) return; + + committed_timestamps_.push_back(timestamp.value()); + + // If this is a new upper bound commit 
mark, update it. + if (none_committed_at_or_after_.CompareTo(timestamp) <= 0) { + none_committed_at_or_after_ = Timestamp(timestamp.value() + 1); + } +} + +//////////////////////////////////////////////////////////// +// ScopedTransaction +//////////////////////////////////////////////////////////// +ScopedTransaction::ScopedTransaction(MvccManager *mgr, TimestampAssignmentType assignment_type) + : done_(false), + manager_(DCHECK_NOTNULL(mgr)), + assignment_type_(assignment_type) { + + switch (assignment_type_) { + case NOW: { + timestamp_ = mgr->StartTransaction(); + break; + } + case NOW_LATEST: { + timestamp_ = mgr->StartTransactionAtLatest(); + break; + } + default: { + LOG(FATAL) << "Illegal TransactionAssignmentType. Only NOW and NOW_LATEST are supported" + " by this ctor."; + } + } +} + +ScopedTransaction::ScopedTransaction(MvccManager *mgr, Timestamp timestamp) + : done_(false), + manager_(DCHECK_NOTNULL(mgr)), + assignment_type_(PRE_ASSIGNED), + timestamp_(timestamp) { + CHECK_OK(mgr->StartTransactionAtTimestamp(timestamp)); +} + +ScopedTransaction::~ScopedTransaction() { + if (!done_) { + Abort(); + } +} + +void ScopedTransaction::StartApplying() { + manager_->StartApplyingTransaction(timestamp_); +} + +void ScopedTransaction::Commit() { + switch (assignment_type_) { + case NOW: + case NOW_LATEST: { + manager_->CommitTransaction(timestamp_); + break; + } + case PRE_ASSIGNED: { + manager_->OfflineCommitTransaction(timestamp_); + break; + } + default: { + LOG(FATAL) << "Unexpected transaction assignment type."; + } + } + + done_ = true; +} + +void ScopedTransaction::Abort() { + manager_->AbortTransaction(timestamp_); + done_ = true; +} + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/mvcc.h b/src/kudu/tablet/mvcc.h new file mode 100644 index 000000000000..ffa736ac5501 --- /dev/null +++ b/src/kudu/tablet/mvcc.h @@ -0,0 +1,466 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_MVCC_H +#define KUDU_TABLET_MVCC_H + +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/server/clock.h" +#include "kudu/util/locks.h" + +namespace kudu { +class CountDownLatch; +namespace tablet { +class MvccManager; + +using std::string; + +// A snapshot of the current MVCC state, which can determine whether +// a transaction ID should be considered visible. +class MvccSnapshot { + public: + MvccSnapshot(); + + // Create a snapshot with the current state of the given manager + explicit MvccSnapshot(const MvccManager &manager); + + // Create a snapshot at a specific Timestamp. + // + // This snapshot considers all transactions with lower timestamps to + // be committed, and those with higher timestamps to be uncommitted. + explicit MvccSnapshot(const Timestamp& timestamp); + + // Create a snapshot which considers all transactions as committed. + // This is mostly useful in test contexts. + static MvccSnapshot CreateSnapshotIncludingAllTransactions(); + + // Creates a snapshot which considers no transactions committed. + static MvccSnapshot CreateSnapshotIncludingNoTransactions(); + + // Return true if the given transaction ID should be considered committed + // in this snapshot. 
+ inline bool IsCommitted(const Timestamp& timestamp) const { + // Inline the most likely path, in which our watermarks determine + // whether a transaction is committed. + if (PREDICT_TRUE(timestamp.CompareTo(all_committed_before_) < 0)) { + return true; + } + if (PREDICT_TRUE(timestamp.CompareTo(none_committed_at_or_after_) >= 0)) { + return false; + } + // Out-of-line the unlikely case which involves more complex (loopy) code. + return IsCommittedFallback(timestamp); + } + + // Returns true if this snapshot may have any committed transactions with ID + // equal to or higher than the provided 'timestamp'. + // This is mostly useful to avoid scanning REDO deltas in certain cases. + // If MayHaveCommittedTransactionsAtOrAfter(delta_stats.min) returns true + // it means that there might be transactions that need to be applied in the + // context of this snapshot; otherwise no scanning is necessary. + bool MayHaveCommittedTransactionsAtOrAfter(const Timestamp& timestamp) const; + + // Returns true if this snapshot may have any uncommitted transactions with ID + // equal to or lower than the provided 'timestamp'. + // This is mostly useful to avoid scanning UNDO deltas in certain cases. + // If MayHaveUncommittedTransactionsAtOrBefore(delta_stats.max) returns false it + // means that all UNDO delta transactions are committed in the context of this + // snapshot and no scanning is necessary; otherwise there might be some + // transactions that need to be undone. + bool MayHaveUncommittedTransactionsAtOrBefore(const Timestamp& timestamp) const; + + // Return a string representation of the set of committed transactions + // in this snapshot, suitable for debug printouts. + string ToString() const; + + // Return true if the snapshot is considered 'clean'. 
A clean snapshot is one + // which is determined only by a timestamp -- the snapshot considers all + // transactions with timestamps less than some timestamp to be committed, + // and all other transactions to be uncommitted. + bool is_clean() const { + return committed_timestamps_.empty(); + } + + // Consider the given list of timestamps to be committed in this snapshot, + // even if they weren't when the snapshot was constructed. + // This is used in the flush path, where the set of commits going into a + // flushed file may not be a consistent snapshot from the MVCC point of view, + // yet we need to construct a scanner that accurately represents that set. + void AddCommittedTimestamps(const std::vector& timestamps); + + private: + friend class MvccManager; + FRIEND_TEST(MvccTest, TestMayHaveCommittedTransactionsAtOrAfter); + FRIEND_TEST(MvccTest, TestMayHaveUncommittedTransactionsBefore); + FRIEND_TEST(MvccTest, TestWaitUntilAllCommitted_SnapAtTimestampWithInFlights); + + bool IsCommittedFallback(const Timestamp& timestamp) const; + + void AddCommittedTimestamp(Timestamp timestamp); + + // Summary rule: + // A transaction T is committed if and only if: + // T < all_committed_before_ or + // or committed_timestamps_.contains(T) + // + // In ASCII form, where 'C' represents a committed transaction, + // and 'U' represents an uncommitted one: + // + // CCCCCCCCCCCCCCCCCUUUUUCUUUCU + // | \___\___ committed_timestamps_ + // | + // \- all_committed_before_ + + + // A transaction ID below which all transactions have been committed. + // For any timestamp X, if X < all_committed_timestamp_, then X is committed. + Timestamp all_committed_before_; + + // A transaction ID at or beyond which no transactions have been committed. + // For any timestamp X, if X >= none_committed_after_, then X is uncommitted. + // This is equivalent to max(committed_timestamps_) + 1, but since + // that vector is unsorted, we cache it. 
+ Timestamp none_committed_at_or_after_; + + // The set of transactions higher than all_committed_before_timestamp_ which + // are committed in this snapshot. + // It might seem like using an unordered_set<> or a set<> would be faster here, + // but in practice, this list tends to be stay pretty small, and is only + // rarely consulted (most data will be culled by 'all_committed_before_' + // or none_committed_at_or_after_. So, using the compact vector structure fits + // the whole thing on one or two cache lines, and it ends up going faster. + std::vector committed_timestamps_; + +}; + +// Coordinator of MVCC transactions. Threads wishing to make updates use +// the MvccManager to obtain a unique timestamp, usually through the ScopedTransaction +// class defined below. +// +// MVCC is used to defer updates until commit time, and allow iterators to +// operate on a snapshot which contains only committed transactions. +// +// There are two valid paths for a transaction: +// +// 1) StartTransaction() -> StartApplyingTransaction() -> CommitTransaction() +// or +// 2) StartTransaction() -> AbortTransaction() +// +// When a transaction is started, a timestamp is assigned. The manager will +// never assign a timestamp if there is already another transaction with +// the same timestamp in flight or previously committed. +// +// When a transaction is ready to start making changes to in-memory data, +// it should transition to APPLYING state by calling StartApplyingTransaction(). +// At this point, the transaction should apply its in-memory operations and +// must commit in a bounded amount of time (i.e it should not wait on external +// input such as an RPC from another host). +// +// NOTE: we do not support "rollback" of in-memory edits. Thus, once we call +// StartApplyingTransaction(), the transaction _must_ commit. +// +class MvccManager { + public: + explicit MvccManager(const scoped_refptr& clock); + + // Begin a new transaction, assigning it a transaction ID. 
+ // Callers should generally prefer using the ScopedTransaction class defined + // below, which will automatically finish the transaction when it goes out + // of scope. + Timestamp StartTransaction(); + + // The same as the above but but starts the transaction at the latest possible + // time, i.e. now + max_error. Returns Timestamp::kInvalidTimestamp if it was + // not possible to obtain the latest time. + Timestamp StartTransactionAtLatest(); + + // Begins a new transaction, which is assigned the provided timestamp. + // Returns Status::OK() if the transaction was started successfully or + // Status::IllegalState() if the provided timestamp is already considered + // committed, e.g. if timestamp < 'all_committed_before_'. + Status StartTransactionAtTimestamp(Timestamp timestamp); + + // Mark that the transaction with the given timestamp is starting to apply + // its writes to in-memory stores. This must be called before CommitTransaction(). + // If this is called, then AbortTransaction(timestamp) must never be called. + void StartApplyingTransaction(Timestamp timestamp); + + // Commit the given transaction. + // + // If the transaction is not currently in-flight, this will trigger an + // assertion error. It is an error to commit the same transaction more + // than once. + // + // This should be used for 'true' online transaction processing on LEADER + // replicas and not for delayed processing on FOLLOWER/LEARNER replicas or + // on bootstrap, as this advances 'all_committed_before_' to clock_->Now() + // when possible. + // + // The transaction must already have been marked as 'APPLYING' by calling + // StartApplyingTransaction(), or else this logs a FATAL error. + void CommitTransaction(Timestamp timestamp); + + // Abort the given transaction. + // + // If the transaction is not currently in-flight, this will trigger an + // assertion error. It is an error to abort the same transaction more + // than once. 
+ // + // This makes sure that the transaction with 'timestamp' is removed from + // the in-flight set but without advancing the safe time since a new + // transaction with a lower timestamp might be executed later. + // + // The transaction must not have been marked as 'APPLYING' by calling + // StartApplyingTransaction(), or else this logs a FATAL error. + void AbortTransaction(Timestamp timestamp); + + // Same as commit transaction but does not advance 'all_committed_before_'. + // Used for bootstrap and delayed processing in FOLLOWERS/LEARNERS. + // + // The transaction must already have been marked as 'APPLYING' by calling + // StartApplyingTransaction(), or else this logs a FATAL error. + void OfflineCommitTransaction(Timestamp timestamp); + + // Used in conjunction with OfflineCommitTransaction() so that the mvcc + // manager can trim state. + void OfflineAdjustSafeTime(Timestamp safe_time); + + // Take a snapshot of the current MVCC state, which indicates which + // transactions have been committed at the time of this call. + void TakeSnapshot(MvccSnapshot *snapshot) const; + + // Take a snapshot of the MVCC state at 'timestamp' (i.e which includes + // all transactions which have a lower timestamp) + // + // If there are any in-flight transactions at a lower timestamp, waits for + // them to complete before returning. Hence, we guarantee that, upon return, + // snapshot->is_clean(). + // + // TODO(KUDU-689): this may currently block forever, stalling scanner threads + // and potentially blocking tablet shutdown. + // + // REQUIRES: 'timestamp' must be in the past according to the configured + // clock. + Status WaitForCleanSnapshotAtTimestamp(Timestamp timestamp, + MvccSnapshot* snapshot, + const MonoTime& deadline) const WARN_UNUSED_RESULT; + + // Take a snapshot at the current timestamp, and then wait for any + // currently running transactions at an earlier timestamp to finish. 
+ // + // The returned snapshot acts as a "barrier": + // - all transactions which started prior to this call are included in + // snapshot + // - no transactions which start after the call returns will be included + // in snapshot + // - snapshot->is_clean() is guaranteed + // + // Note that transactions are not blocked during this call. + void WaitForCleanSnapshot(MvccSnapshot* snapshot) const; + + // Wait for all operations that are currently APPLYING to commit. + // + // NOTE: this does _not_ guarantee that no transactions are APPLYING upon + // return -- just that those that were APPLYING at call time are finished + // upon return. + void WaitForApplyingTransactionsToCommit() const; + + bool AreAllTransactionsCommitted(Timestamp ts) const; + + // Return the number of transactions in flight.. + int CountTransactionsInFlight() const; + + // Returns the earliest possible timestamp for an uncommitted transaction. + // All timestamps before this one are guaranteed to be committed. + Timestamp GetCleanTimestamp() const; + + // Return the timestamps of all transactions which are currently 'APPLYING' + // (i.e. those which have started to apply their operations to in-memory data + // structures). Other transactions may have reserved their timestamps via + // StartTransaction() but not yet been applied. + // + // These transactions are guaranteed to eventually Commit() -- i.e. they will + // never Abort(). 
+ void GetApplyingTransactionsTimestamps(std::vector* timestamps) const; + + ~MvccManager(); + + private: + friend class MvccTest; + FRIEND_TEST(MvccTest, TestAreAllTransactionsCommitted); + FRIEND_TEST(MvccTest, TestTxnAbort); + FRIEND_TEST(MvccTest, TestCleanTimeCoalescingOnOfflineTransactions); + FRIEND_TEST(MvccTest, TestWaitForApplyingTransactionsToCommit); + + enum TxnState { + RESERVED, + APPLYING + }; + + bool InitTransactionUnlocked(const Timestamp& timestamp); + + enum WaitFor { + ALL_COMMITTED, + NONE_APPLYING + }; + + struct WaitingState { + Timestamp timestamp; + CountDownLatch* latch; + WaitFor wait_for; + }; + + // Returns true if all transactions before the given timestamp are committed. + // + // If 'ts' is not in the past, it's still possible that new transactions could + // start with a lower timestamp after this returns. + bool AreAllTransactionsCommittedUnlocked(Timestamp ts) const; + + // Return true if there is any APPLYING operation with a timestamp + // less than or equal to 'ts'. + bool AnyApplyingAtOrBeforeUnlocked(Timestamp ts) const; + + // Waits until all transactions before the given time are committed. + Status WaitUntil(WaitFor wait_for, Timestamp ts, + const MonoTime& deadline) const WARN_UNUSED_RESULT; + + // Return true if the condition that the given waiter is waiting on has + // been achieved. + bool IsDoneWaitingUnlocked(const WaitingState& waiter) const; + + // Commits the given transaction. + // Sets *was_earliest to true if this was the earliest in-flight transaction. + void CommitTransactionUnlocked(Timestamp timestamp, + bool* was_earliest); + + // Remove the timestamp 'ts' from the in-flight map. + // FATALs if the ts is not in the in-flight map. + // Returns its state. + TxnState RemoveInFlightAndGetStateUnlocked(Timestamp ts); + + // Adjusts the clean time, i.e. 
the timestamp such that all transactions with + // lower timestamps are committed or aborted, based on which transactions are + // currently in flight and on what is the latest value of 'no_new_transactions_at_or_before_'. + void AdjustCleanTime(); + + // Advances the earliest in-flight timestamp, based on which transactions are + // currently in-flight. Usually called when the previous earliest transaction + // commits or aborts. + void AdvanceEarliestInFlightTimestamp(); + + int GetNumWaitersForTests() const { + lock_guard l(&lock_); + return waiters_.size(); + } + + typedef simple_spinlock LockType; + mutable LockType lock_; + + MvccSnapshot cur_snap_; + + // The set of timestamps corresponding to currently in-flight transactions. + typedef std::unordered_map InFlightMap; + InFlightMap timestamps_in_flight_; + + // A transaction ID below which all transactions are either committed or in-flight, + // meaning no new transactions will be started with a timestamp that is equal + // to or lower than this one. + Timestamp no_new_transactions_at_or_before_; + + // The minimum timestamp in timestamps_in_flight_, or Timestamp::kMax + // if that set is empty. This is cached in order to avoid having to iterate + // over timestamps_in_flight_ on every commit. + Timestamp earliest_in_flight_; + + scoped_refptr clock_; + mutable std::vector waiters_; + + DISALLOW_COPY_AND_ASSIGN(MvccManager); +}; + +// A scoped handle to a running transaction. +// When this object goes out of scope, the transaction is automatically +// committed. +class ScopedTransaction { + public: + + // How to assign the timestamp to this transaction: + // NOW - Based on the value obtained from clock_->Now(). + // NOW_LATEST - Based on the value obtained from clock_->NowLatest(). + // PRE_ASSIGNED - Based on the value passed in the ctor. + enum TimestampAssignmentType { + NOW, + NOW_LATEST, + PRE_ASSIGNED + }; + + // Create a new transaction from the given MvccManager. 
+ // If 'latest' is true this transaction will use MvccManager::StartTransactionAtLatest() + // instead of MvccManager::StartTransaction(). + // + // The MvccManager must remain valid for the lifetime of this object. + explicit ScopedTransaction(MvccManager *manager, TimestampAssignmentType assignment_type = NOW); + + // Like the ctor above but starts the transaction at a pre-defined timestamp. + // When this transaction is committed it will use MvccManager::OfflineCommitTransaction() + // so this is appropriate for offline replaying of transactions for replica catch-up or + // bootstrap. + explicit ScopedTransaction(MvccManager *manager, Timestamp timestamp); + + // Commit the transaction referenced by this scoped object, if it hasn't + // already been committed. + ~ScopedTransaction(); + + Timestamp timestamp() const { + return timestamp_; + } + + // Mark that this transaction is about to begin applying its modifications to + // in-memory stores. + // + // This must be called before Commit(). Abort() may not be called after this + // method. + void StartApplying(); + + // Commit the in-flight transaction. + // + // Requires that StartApplying() has been called. + void Commit(); + + // Abort the in-flight transaction. + // + // Requires that StartApplying() has NOT been called. + void Abort(); + + private: + bool done_; + MvccManager * const manager_; + TimestampAssignmentType assignment_type_; + Timestamp timestamp_; + + DISALLOW_COPY_AND_ASSIGN(ScopedTransaction); +}; + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/row_op.cc b/src/kudu/tablet/row_op.cc new file mode 100644 index 000000000000..28dfeb962077 --- /dev/null +++ b/src/kudu/tablet/row_op.cc @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/common/wire_protocol.h" +#include "kudu/tablet/row_op.h" +#include "kudu/tablet/tablet.pb.h" + +namespace kudu { +namespace tablet { + +RowOp::RowOp(DecodedRowOperation decoded_op) + : decoded_op(std::move(decoded_op)) {} + +RowOp::~RowOp() { +} + +void RowOp::SetFailed(const Status& s) { + DCHECK(!result) << result->DebugString(); + result.reset(new OperationResultPB()); + StatusToPB(s, result->mutable_failed_status()); +} + +void RowOp::SetInsertSucceeded(int mrs_id) { + DCHECK(!result) << result->DebugString(); + result.reset(new OperationResultPB()); + result->add_mutated_stores()->set_mrs_id(mrs_id); +} + +void RowOp::SetMutateSucceeded(gscoped_ptr result) { + DCHECK(!this->result) << result->DebugString(); + this->result = result.Pass(); +} + +string RowOp::ToString(const Schema& schema) const { + return decoded_op.ToString(schema); +} + +void RowOp::SetAlreadyFlushed() { + DCHECK(!result) << result->DebugString(); + result.reset(new OperationResultPB()); + result->set_flushed(true); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/row_op.h b/src/kudu/tablet/row_op.h new file mode 100644 index 000000000000..8ba2071e9b92 --- /dev/null +++ b/src/kudu/tablet/row_op.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_ROW_OP_H +#define KUDU_TABLET_ROW_OP_H + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/common/row_operations.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/lock_manager.h" + +namespace kudu { + +class Schema; + +namespace tablet { + +// Structure tracking the progress of a single row operation within a WriteTransaction. +struct RowOp { + public: + explicit RowOp(DecodedRowOperation decoded_op); + ~RowOp(); + + // Functions to set the result of the mutation. + // Only one of the following functions must be called, + // at most once. + void SetFailed(const Status& s); + void SetInsertSucceeded(int mrs_id); + void SetMutateSucceeded(gscoped_ptr result); + void SetAlreadyFlushed(); + + bool has_row_lock() const { + return row_lock.acquired(); + } + + std::string ToString(const Schema& schema) const; + + // The original operation as decoded from the client request. + DecodedRowOperation decoded_op; + + // The key probe structure contains the row key in both key-encoded and + // ContiguousRow formats, bloom probe structure, etc. This is set during + // the "prepare" phase. + gscoped_ptr key_probe; + + // The row lock which has been acquired for this row. Set during the "prepare" + // phase.
+ ScopedRowLock row_lock; + + // The result of the operation, after Apply. + gscoped_ptr result; +}; + + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_ROW_OP_H */ + diff --git a/src/kudu/tablet/rowset.cc b/src/kudu/tablet/rowset.cc new file mode 100644 index 000000000000..247393c8a55c --- /dev/null +++ b/src/kudu/tablet/rowset.cc @@ -0,0 +1,235 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/rowset.h" + +#include +#include +#include +#include + +#include "kudu/common/generic_iterators.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/rowset_metadata.h" + +using std::shared_ptr; +using strings::Substitute; + +namespace kudu { namespace tablet { + +DuplicatingRowSet::DuplicatingRowSet(RowSetVector old_rowsets, + RowSetVector new_rowsets) + : old_rowsets_(std::move(old_rowsets)), + new_rowsets_(std::move(new_rowsets)) { + CHECK_GT(old_rowsets_.size(), 0); + CHECK_GT(new_rowsets_.size(), 0); +} + +DuplicatingRowSet::~DuplicatingRowSet() { +} + +// Stringify the given list of rowsets into 'dst'. 
+static void AppendRowSetStrings(const RowSetVector &rowsets, string *dst) { + bool first = true; + dst->append("["); + for (const shared_ptr &rs : rowsets) { + if (!first) { + dst->append(", "); + } + first = false; + dst->append(rs->ToString()); + } + dst->append("]"); +} + +string DuplicatingRowSet::ToString() const { + string ret; + ret.append("DuplicatingRowSet("); + + AppendRowSetStrings(old_rowsets_, &ret); + ret.append(" -> "); + AppendRowSetStrings(new_rowsets_, &ret); + ret.append(")"); + return ret; +} + +Status DuplicatingRowSet::NewRowIterator(const Schema *projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const { + // Use the original rowset. + if (old_rowsets_.size() == 1) { + return old_rowsets_[0]->NewRowIterator(projection, snap, out); + } else { + // Union between them + + vector > iters; + for (const shared_ptr &rowset : old_rowsets_) { + gscoped_ptr iter; + RETURN_NOT_OK_PREPEND(rowset->NewRowIterator(projection, snap, &iter), + Substitute("Could not create iterator for rowset $0", + rowset->ToString())); + iters.push_back(shared_ptr(iter.release())); + } + + out->reset(new UnionIterator(iters)); + return Status::OK(); + } +} + +Status DuplicatingRowSet::NewCompactionInput(const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const { + LOG(FATAL) << "duplicating rowsets do not act as compaction input"; + return Status::OK(); +} + + +Status DuplicatingRowSet::MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &update, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB* result) { + // Duplicate the update to both the relevant input rowset and the output rowset. + // + // It's crucial to do the mutation against the input side first, due to the potential + // for a race during flush: the output rowset may not yet hold a DELETE which + // is present in the input rowset. 
In that case, the UPDATE against the output rowset would + // succeed whereas it can't be applied to the input rowset. So, we update the input rowset first, + // and if it succeeds, propagate to the output. + + // First mutate the relevant input rowset. + bool updated = false; + for (const shared_ptr &rowset : old_rowsets_) { + Status s = rowset->MutateRow(timestamp, probe, update, op_id, stats, result); + if (s.ok()) { + updated = true; + break; + } else if (!s.IsNotFound()) { + LOG(ERROR) << "Unable to update key " + << probe.schema()->CreateKeyProjection().DebugRow(probe.row_key()) + << " (failed on rowset " << rowset->ToString() << "): " + << s.ToString(); + return s; + } + } + + if (!updated) { + return Status::NotFound("not found in any compaction input"); + } + + // If it succeeded there, we also need to mirror into the new rowset. + int mirrored_count = 0; + for (const shared_ptr &new_rowset : new_rowsets_) { + Status s = new_rowset->MutateRow(timestamp, probe, update, op_id, stats, result); + if (s.ok()) { + mirrored_count++; + #ifdef NDEBUG + // In non-DEBUG builds, we can break as soon as we find the correct + // rowset to mirror to. In a DEBUG build, though, we keep looking + // through all, and make sure that we only update in one of them. + break; + #endif + } else if (!s.IsNotFound()) { + LOG(FATAL) << "Unable to mirror update to rowset " << new_rowset->ToString() + << " for key: " << probe.schema()->CreateKeyProjection().DebugRow(probe.row_key()) + << ": " << s.ToString(); + } + // IsNotFound is OK - it might be in a different one. 
+ } + CHECK_EQ(mirrored_count, 1) + << "Updated row in compaction input, but didn't mirror in exactly 1 new rowset: " + << probe.schema()->CreateKeyProjection().DebugRow(probe.row_key()); + return Status::OK(); +} + +Status DuplicatingRowSet::CheckRowPresent(const RowSetKeyProbe &probe, + bool *present, ProbeStats* stats) const { + *present = false; + for (const shared_ptr &rowset : old_rowsets_) { + RETURN_NOT_OK(rowset->CheckRowPresent(probe, present, stats)); + if (*present) { + return Status::OK(); + } + } + return Status::OK(); +} + +Status DuplicatingRowSet::CountRows(rowid_t *count) const { + int64_t accumulated_count = 0; + for (const shared_ptr &rs : new_rowsets_) { + rowid_t this_count; + RETURN_NOT_OK(rs->CountRows(&this_count)); + accumulated_count += this_count; + } + + CHECK_LT(accumulated_count, std::numeric_limits::max()) + << "TODO: should make sure this is 64-bit safe - probably not right now" + << " because rowid_t is only 32-bit."; + *count = accumulated_count; + return Status::OK(); +} + +Status DuplicatingRowSet::GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const { + // The range out of the output rowset always spans the full range + // of the input rowsets, since no new rows can be inserted. + // The output rowsets are in ascending order, so their total range + // spans the range [front().min, back().max]. + Slice junk; + RETURN_NOT_OK(new_rowsets_.front()->GetBounds(min_encoded_key, &junk)); + RETURN_NOT_OK(new_rowsets_.back()->GetBounds(&junk, max_encoded_key)); + return Status::OK(); +} + +uint64_t DuplicatingRowSet::EstimateOnDiskSize() const { + // The actual value of this doesn't matter, since it won't be selected + // for compaction. 
+ uint64_t size = 0; + for (const shared_ptr &rs : new_rowsets_) { + size += rs->EstimateOnDiskSize(); + } + return size; +} + +shared_ptr DuplicatingRowSet::metadata() { + return shared_ptr(reinterpret_cast(NULL)); +} + +Status DuplicatingRowSet::DebugDump(vector *lines) { + int i = 1; + for (const shared_ptr &rs : old_rowsets_) { + LOG_STRING(INFO, lines) << "Duplicating rowset input " << ToString() << " " + << i << "/" << old_rowsets_.size() << ":"; + RETURN_NOT_OK(rs->DebugDump(lines)); + i++; + } + i = 1; + for (const shared_ptr &rs : new_rowsets_) { + LOG_STRING(INFO, lines) << "Duplicating rowset output " << ToString() << " " + << i << "/" << new_rowsets_.size() << ":"; + RETURN_NOT_OK(rs->DebugDump(lines)); + i++; + } + + return Status::OK(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/rowset.h b/src/kudu/tablet/rowset.h new file mode 100644 index 000000000000..3a301d89252e --- /dev/null +++ b/src/kudu/tablet/rowset.h @@ -0,0 +1,331 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_ROWSET_H +#define KUDU_TABLET_ROWSET_H + +#include +#include +#include +#include + +#include "kudu/cfile/cfile_util.h" +#include "kudu/common/iterator.h" +#include "kudu/common/rowid.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/util/bloom_filter.h" +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { + +class RowChangeList; + +namespace consensus { +class OpId; +} + +namespace tablet { + +class CompactionInput; +class OperationResultPB; +class MvccSnapshot; +class RowSetKeyProbe; +class RowSetMetadata; +struct ProbeStats; + +class RowSet { + public: + enum DeltaCompactionType { + MAJOR_DELTA_COMPACTION, + MINOR_DELTA_COMPACTION + }; + + // Check if a given row key is present in this rowset. + // Sets *present and returns Status::OK, unless an error + // occurs. + // + // If the row was once present in this rowset, but no longer present + // due to a DELETE, then this should set *present = false, as if + // it were never there. + virtual Status CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + ProbeStats* stats) const = 0; + + // Update/delete a row in this rowset. + // The 'update_schema' is the client schema used to encode the 'update' RowChangeList. + // + // If the row does not exist in this rowset, returns + // Status::NotFound(). + virtual Status MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &update, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB* result) = 0; + + // Return a new RowIterator for this rowset, with the given projection. + // The projection schema must remain valid for the lifetime of the iterator. + // The iterator will return rows/updates which were committed as of the time of + // 'snap'. + // The returned iterator is not Initted. 
+ virtual Status NewRowIterator(const Schema *projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const = 0; + + // Create the input to be used for a compaction. + // The provided 'projection' is for the compaction output. Each row + // will be projected into this Schema. + virtual Status NewCompactionInput(const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const = 0; + + // Count the number of rows in this rowset. + virtual Status CountRows(rowid_t *count) const = 0; + + // Return the bounds for this RowSet. 'min_encoded_key' and 'max_encoded_key' + // are set to the first and last encoded keys for this RowSet. The storage + // for these slices is part of the RowSet and only guaranteed to stay valid + // until the RowSet is destroyed. + // + // In the case that the rowset is still mutable (eg MemRowSet), this may + // return Status::NotImplemented. + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const = 0; + + // Return a displayable string for this rowset. + virtual string ToString() const = 0; + + // Dump the full contents of this rowset, for debugging. + // This is very verbose so only useful within unit tests. + virtual Status DebugDump(vector *lines = NULL) = 0; + + // Estimate the number of bytes on-disk + virtual uint64_t EstimateOnDiskSize() const = 0; + + // Return the lock used for including this DiskRowSet in a compaction. + // This prevents multiple compactions and flushes from trying to include + // the same rowset. + virtual boost::mutex *compact_flush_lock() = 0; + + // Returns the metadata associated with this rowset. + virtual std::shared_ptr metadata() = 0; + + // Get the size of the delta's MemStore + virtual size_t DeltaMemStoreSize() const = 0; + + virtual bool DeltaMemStoreEmpty() const = 0; + + // Get the minimum log index corresponding to unflushed data in this row set. 
+ virtual int64_t MinUnflushedLogIndex() const = 0; + + // Get the performance improvement that running a minor or major delta compaction would give. + // The returned score ranges between 0 and 1 inclusively. + virtual double DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) const = 0; + + // Flush the DMS if there's one + virtual Status FlushDeltas() = 0; + + // Compact delta stores if more than one. + virtual Status MinorCompactDeltaStores() = 0; + + virtual ~RowSet() {} + + // Return true if this RowSet is available for compaction, based on + // the current state of the compact_flush_lock. This should only be + // used under the Tablet's compaction selection lock, or else the + // lock status may change at any point. + virtual bool IsAvailableForCompaction() { + // Try to obtain the lock. If we don't succeed, it means the rowset + // was already locked for compaction by some other compactor thread, + // or it is a RowSet type which can't be used as a compaction input. + // + // We can be sure that our check here will remain true until after + // the compaction selection has finished because only one thread + // makes compaction selection at a time on a given Tablet due to + // Tablet::compact_select_lock_. + boost::mutex::scoped_try_lock try_lock(*compact_flush_lock()); + return try_lock.owns_lock(); + } + +}; + +// Used often enough, may as well typedef it. +typedef vector > RowSetVector; +// Structure which caches an encoded and hashed key, suitable +// for probing against rowsets. +class RowSetKeyProbe { + public: + // row_key: a reference to the key portion of a row in memory + // to probe for. + // + // NOTE: row_key is not copied and must be valid for the lifetime + // of this object. 
+ explicit RowSetKeyProbe(ConstContiguousRow row_key) + : row_key_(std::move(row_key)) { + encoded_key_ = EncodedKey::FromContiguousRow(row_key_); + bloom_probe_ = BloomKeyProbe(encoded_key_slice()); + } + + // RowSetKeyProbes are usually allocated on the stack, which means that we + // must copy it if we require it later (e.g. Table::Mutate()). + // + // Still, the ConstContiguousRow row_key_ remains a reference to the data + // underlying the original RowsetKeyProbe and is not copied. + explicit RowSetKeyProbe(const RowSetKeyProbe& probe) + : row_key_(probe.row_key_) { + encoded_key_ = EncodedKey::FromContiguousRow(row_key_); + bloom_probe_ = BloomKeyProbe(encoded_key_slice()); + } + + const ConstContiguousRow& row_key() const { return row_key_; } + + // Pointer to the key which has been encoded to be contiguous + // and lexicographically comparable + const Slice &encoded_key_slice() const { return encoded_key_->encoded_key(); } + + // Return the cached structure used to query bloom filters. + const BloomKeyProbe &bloom_probe() const { return bloom_probe_; } + + // The schema containing the key. + const Schema* schema() const { return row_key_.schema(); } + + const EncodedKey &encoded_key() const { + return *encoded_key_; + } + + private: + const ConstContiguousRow row_key_; + gscoped_ptr encoded_key_; + BloomKeyProbe bloom_probe_; +}; + +// Statistics collected during row operations, counting how many times +// various structures had to be consulted to perform the operation. +// +// These eventually propagate into tablet-scoped metrics, and when we +// have RPC tracing capability, we could also stringify them into the +// trace to understand why an RPC may have been slow. +struct ProbeStats { + ProbeStats() + : blooms_consulted(0), + keys_consulted(0), + deltas_consulted(0), + mrs_consulted(0) { + } + + // Incremented for each bloom filter consulted. + int blooms_consulted; + + // Incremented for each key cfile consulted. 
+ int keys_consulted; + + // Incremented for each delta file consulted. + int deltas_consulted; + + // Incremented for each MemRowSet consulted. + int mrs_consulted; +}; + +// RowSet which is used during the middle of a flush or compaction. +// It consists of a set of one or more input rowsets, and a single +// output rowset. All mutations are duplicated to the appropriate input +// rowset as well as the output rowset. All reads are directed to the +// union of the input rowsets. +// +// See compaction.txt for a little more detail on how this is used. +class DuplicatingRowSet : public RowSet { + public: + DuplicatingRowSet(RowSetVector old_rowsets, RowSetVector new_rowsets); + + virtual Status MutateRow(Timestamp timestamp, + const RowSetKeyProbe &probe, + const RowChangeList &update, + const consensus::OpId& op_id, + ProbeStats* stats, + OperationResultPB* result) OVERRIDE; + + Status CheckRowPresent(const RowSetKeyProbe &probe, bool *present, + ProbeStats* stats) const OVERRIDE; + + virtual Status NewRowIterator(const Schema *projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const OVERRIDE; + + virtual Status NewCompactionInput(const Schema* projection, + const MvccSnapshot &snap, + gscoped_ptr* out) const OVERRIDE; + + Status CountRows(rowid_t *count) const OVERRIDE; + + virtual Status GetBounds(Slice *min_encoded_key, + Slice *max_encoded_key) const OVERRIDE; + + uint64_t EstimateOnDiskSize() const OVERRIDE; + + string ToString() const OVERRIDE; + + virtual Status DebugDump(vector *lines = NULL) OVERRIDE; + + std::shared_ptr metadata() OVERRIDE; + + // A flush-in-progress rowset should never be selected for compaction. 
+ boost::mutex *compact_flush_lock() OVERRIDE { + LOG(FATAL) << "Cannot be compacted"; + return NULL; + } + + virtual bool IsAvailableForCompaction() OVERRIDE { + return false; + } + + ~DuplicatingRowSet(); + + size_t DeltaMemStoreSize() const OVERRIDE { return 0; } + + bool DeltaMemStoreEmpty() const OVERRIDE { return true; } + + double DeltaStoresCompactionPerfImprovementScore(DeltaCompactionType type) const OVERRIDE { + return 0; + } + + int64_t MinUnflushedLogIndex() const OVERRIDE { return -1; } + + Status FlushDeltas() OVERRIDE { + // It's important that DuplicatingRowSet does not FlushDeltas. This prevents + // a bug where we might end up with out-of-order deltas. See the long + // comment in Tablet::Flush(...) + return Status::OK(); + } + + Status MinorCompactDeltaStores() OVERRIDE { return Status::OK(); } + + private: + friend class Tablet; + + DISALLOW_COPY_AND_ASSIGN(DuplicatingRowSet); + + RowSetVector old_rowsets_; + RowSetVector new_rowsets_; +}; + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/rowset_info.cc b/src/kudu/tablet/rowset_info.cc new file mode 100644 index 000000000000..5e45f6ab51ce --- /dev/null +++ b/src/kudu/tablet/rowset_info.cc @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/rowset_info.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include "kudu/gutil/algorithm.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/endian.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/rowset_tree.h" +#include "kudu/util/slice.h" + +using std::shared_ptr; +using std::unordered_map; +using std::vector; + +// Enforce a minimum size of 1MB, since otherwise the knapsack algorithm +// will always pick up small rowsets no matter what. +static const int kMinSizeMb = 1; + +namespace kudu { +namespace tablet { + +namespace { + +// Less-than comparison by minimum key (both by actual key slice and cdf) +bool LessCDFAndRSMin(const RowSetInfo& a, const RowSetInfo& b) { + Slice amin, bmin, max; + a.rowset()->GetBounds(&amin, &max); + b.rowset()->GetBounds(&bmin, &max); + return a.cdf_min_key() < b.cdf_min_key() && amin.compare(bmin) < 0; +} + +// Less-than comparison by maximum key (both by actual key slice and cdf) +bool LessCDFAndRSMax(const RowSetInfo& a, const RowSetInfo& b) { + Slice amax, bmax, min; + a.rowset()->GetBounds(&min, &amax); + b.rowset()->GetBounds(&min, &bmax); + return a.cdf_max_key() < b.cdf_max_key() && amax.compare(bmax) < 0; +} + +// Debug-checks that min <= imin <= imax <= max +void DCheckInside(const Slice& min, const Slice& max, + const Slice& imin, const Slice& imax) { + DCHECK_LE(min.compare(max), 0); + DCHECK_LE(imin.compare(imax), 0); + DCHECK_LE(min.compare(imin), 0); + DCHECK_LE(imax.compare(max), 0); +} + +// Return the number of bytes of common prefix shared by 'min' and 'max' +int CommonPrefix(const Slice& min, const Slice& max) { + int min_len = std::min(min.size(), max.size()); + int common_prefix = 0; + while (common_prefix < min_len && + min[common_prefix] == max[common_prefix]) { 
+ ++common_prefix; + } + return common_prefix; +} + +void DCheckCommonPrefix(const Slice& min, const Slice& imin, + const Slice& imax, int common_prefix) { + DCHECK_EQ(memcmp(min.data(), imin.data(), common_prefix), 0) + << "slices should share common prefix:\n" + << "\t" << min.ToDebugString() << "\n" + << "\t" << imin.ToDebugString(); + DCHECK_EQ(memcmp(min.data(), imax.data(), common_prefix), 0) + << "slices should share common prefix:\n" + << "\t" << min.ToDebugString() << "\n" + << "\t" << imin.ToDebugString(); +} + +uint64_t SliceTailToInt(const Slice& slice, int start) { + uint64_t ret = 0; + DCHECK_GE(start, 0); + DCHECK_LE(start, slice.size()); + memcpy(&ret, &slice.data()[start], std::min(slice.size() - start, sizeof(ret))); + ret = BigEndian::ToHost64(ret); + return ret; +} + +// Finds fraction (imin, imax) takes up of rs->GetBounds(). +// Requires that (imin, imax) is contained in rs->GetBounds(). +double StringFractionInRange(const RowSet* rs, + const Slice& imin, + const Slice& imax) { + Slice min, max; + if (!rs->GetBounds(&min, &max).ok()) { + VLOG(2) << "Ignoring " << rs->ToString() << " in CDF calculation"; + return 0; + } + DCheckInside(min, max, imin, imax); + + int common_prefix = CommonPrefix(min, max); + DCheckCommonPrefix(min, imin, imax, common_prefix); + + // Convert the remaining portion of each string to an integer. + uint64_t min_int = SliceTailToInt(min, common_prefix); + uint64_t max_int = SliceTailToInt(max, common_prefix); + uint64_t imin_int = SliceTailToInt(imin, common_prefix); + uint64_t imax_int = SliceTailToInt(imax, common_prefix); + + // Compute how far between min and max the query point falls. 
+ if (min_int == max_int) return 0; + return static_cast(imax_int - imin_int) / (max_int - min_int); +} + +// Typedef needed to use boost foreach macro +typedef unordered_map::value_type RowSetRowSetInfoPair; + +// Computes the "width" of an interval [prev, next] according to the amount +// of data estimated to be inside the interval, where this is calculated by +// multiplying the fraction that the interval takes up in the keyspace of +// each rowset by the rowset's size (assumes distribution of rows is somewhat +// uniform). +// Requires: [prev, next] contained in each rowset in "active" +double WidthByDataSize(const Slice& prev, const Slice& next, + const unordered_map& active) { + double weight = 0; + + for (const RowSetRowSetInfoPair& rsi : active) { + RowSet* rs = rsi.first; + double fraction = StringFractionInRange(rs, prev, next); + weight += rs->EstimateOnDiskSize() * fraction; + } + + return weight; +} + + +void CheckCollectOrderedCorrectness(const vector& min_key, + const vector& max_key, + double total_width) { + CHECK_GE(total_width, 0); + CHECK_EQ(min_key.size(), max_key.size()); + if (!min_key.empty()) { + CHECK_EQ(min_key.front().cdf_min_key(), 0.0f); + CHECK_EQ(max_key.back().cdf_max_key(), total_width); + } + DCHECK(std::is_sorted(min_key.begin(), min_key.end(), LessCDFAndRSMin)); + DCHECK(std::is_sorted(max_key.begin(), max_key.end(), LessCDFAndRSMax)); +} + +} // anonymous namespace + +// RowSetInfo class --------------------------------------------------- + +void RowSetInfo::Collect(const RowSetTree& tree, vector* rsvec) { + rsvec->reserve(tree.all_rowsets().size()); + for (const shared_ptr& ptr : tree.all_rowsets()) { + rsvec->push_back(RowSetInfo(ptr.get(), 0)); + } +} + +void RowSetInfo::CollectOrdered(const RowSetTree& tree, + vector* min_key, + vector* max_key) { + // Resize + size_t len = tree.all_rowsets().size(); + min_key->reserve(min_key->size() + len); + max_key->reserve(max_key->size() + len); + + // The collection process works as 
follows: + // For each sorted endpoint, first we identify whether it is a + // start or stop endpoint. + // + // At a start point, the associated rowset is added to the + // "active" rowset mapping, allowing us to keep track of the index + // of the rowset's RowSetInfo in the min_key vector. + // + // At a stop point, the rowset is removed from the "active" map. + // Note that the "active" map allows access to the incomplete + // RowSetInfo that the RowSet maps to. + // + // The algorithm keeps track of its state - a "sliding window" + // across the keyspace - by maintaining the previous key and current + // value of the total width traversed over the intervals. + Slice prev; + unordered_map active; + double total_width = 0.0f; + + // We need to filter out the rowsets that aren't available before we process the endpoints, + // else there's a race since we see endpoints twice and a delta compaction might finish in + // between. + RowSetVector available_rowsets; + for (const shared_ptr rs : tree.all_rowsets()) { + if (rs->IsAvailableForCompaction()) { + available_rowsets.push_back(rs); + } + } + + RowSetTree available_rs_tree; + available_rs_tree.Reset(available_rowsets); + for (const RowSetTree::RSEndpoint& rse : + available_rs_tree.key_endpoints()) { + RowSet* rs = rse.rowset_; + const Slice& next = rse.slice_; + double interval_width = WidthByDataSize(prev, next, active); + + // Increment active rowsets in min_key by the interval_width. + for (const RowSetRowSetInfoPair& rsi : active) { + RowSetInfo& cdf_rs = *rsi.second; + cdf_rs.cdf_max_key_ += interval_width; + } + + // Move sliding window + total_width += interval_width; + prev = next; + + // Add/remove current RowSetInfo + if (rse.endpoint_ == RowSetTree::START) { + min_key->push_back(RowSetInfo(rs, total_width)); + // Store reference from vector. This is safe b/c of reserve() above. 
+ active.insert(std::make_pair(rs, &min_key->back())); + } else if (rse.endpoint_ == RowSetTree::STOP) { + // If not in active set, then STOP before START in endpoint tree + RowSetInfo* cdf_rs = CHECK_NOTNULL(active[rs]); + CHECK_EQ(cdf_rs->rowset(), rs) << "Inconsistent key interval tree."; + CHECK_EQ(active.erase(rs), 1); + max_key->push_back(*cdf_rs); + } else { + LOG(FATAL) << "Undefined RowSet endpoint type.\n" + << "\tExpected either RowSetTree::START=" << RowSetTree::START + << " or RowSetTree::STOP=" << RowSetTree::STOP << ".\n" + << "\tRecieved:\n" + << "\t\tRowSet=" << rs->ToString() << "\n" + << "\t\tKey=" << next << "\n" + << "\t\tEndpointType=" << rse.endpoint_; + } + } + + CheckCollectOrderedCorrectness(*min_key, *max_key, total_width); + + FinalizeCDFVector(min_key, total_width); + FinalizeCDFVector(max_key, total_width); +} + +RowSetInfo::RowSetInfo(RowSet* rs, double init_cdf) + : rowset_(rs), + size_mb_(std::max(implicit_cast(rs->EstimateOnDiskSize() / 1024 / 1024), + kMinSizeMb)), + cdf_min_key_(init_cdf), + cdf_max_key_(init_cdf) { +} + +void RowSetInfo::FinalizeCDFVector(vector* vec, + double quot) { + if (quot == 0) return; + for (RowSetInfo& cdf_rs : *vec) { + CHECK_GT(cdf_rs.size_mb_, 0) << "Expected file size to be at least 1MB " + << "for RowSet " << cdf_rs.rowset_->ToString() + << ", was " << cdf_rs.rowset_->EstimateOnDiskSize() + << " bytes."; + cdf_rs.cdf_min_key_ /= quot; + cdf_rs.cdf_max_key_ /= quot; + cdf_rs.density_ = (cdf_rs.cdf_max_key() - cdf_rs.cdf_min_key()) + / cdf_rs.size_mb_; + } +} + +string RowSetInfo::ToString() const { + string ret; + ret.append(rowset_->ToString()); + StringAppendF(&ret, "(% 3dM) [%.04f, %.04f]", size_mb_, + cdf_min_key_, cdf_max_key_); + Slice min, max; + if (rowset_->GetBounds(&min, &max).ok()) { + ret.append(" [").append(min.ToDebugString()); + ret.append(",").append(max.ToDebugString()); + ret.append("]"); + } + return ret; +} + +bool RowSetInfo::Intersects(const RowSetInfo &other) const { + if 
(other.cdf_min_key() > cdf_max_key()) return false; + if (other.cdf_max_key() < cdf_min_key()) return false; + return true; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/rowset_info.h b/src/kudu/tablet/rowset_info.h new file mode 100644 index 000000000000..df3a63ae5a09 --- /dev/null +++ b/src/kudu/tablet/rowset_info.h @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_ROWSET_INFO_H_ +#define KUDU_TABLET_ROWSET_INFO_H_ + +#include +#include + +namespace kudu { +namespace tablet { + +class RowSet; +class RowSetTree; + +// Class used to cache some computed statistics on a RowSet used +// during evaluation of budgeted compaction policy. +// +// Class is immutable. +class RowSetInfo { + public: + + // Appends the rowsets in no order without the cdf values set. + static void Collect(const RowSetTree& tree, std::vector* rsvec); + // Appends the rowsets in min-key and max-key sorted order, with + // cdf values set. + static void CollectOrdered(const RowSetTree& tree, + std::vector* min_key, + std::vector* max_key); + + int size_mb() const { return size_mb_; } + + // Return the value of the CDF at the minimum key of this candidate. 
+ double cdf_min_key() const { return cdf_min_key_; } + // Return the value of the CDF at the maximum key of this candidate. + double cdf_max_key() const { return cdf_max_key_; } + + // Return the "width" of the candidate rowset. + // + // This is an estimate of the percentage of the tablet data which + // is spanned by this RowSet, calculated by integrating the + // probability distribution function across this rowset's keyrange. + double width() const { + return cdf_max_key_ - cdf_min_key_; + } + + double density() const { return density_; } + + RowSet* rowset() const { return rowset_; } + + std::string ToString() const; + + // Return true if this candidate overlaps the other candidate in + // the computed cdf interval. To check intersection in key space, + // use this instance's rowset()->GetBounds(). + // The two intersection results may not agree because of floating + // point error in the cdf calculation. + bool Intersects(const RowSetInfo& other) const; + + private: + explicit RowSetInfo(RowSet* rs, double init_cdf); + + static void FinalizeCDFVector(std::vector* vec, + double quot); + + RowSet* rowset_; + int size_mb_; + double cdf_min_key_, cdf_max_key_; + double density_; +}; + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/rowset_metadata.cc b/src/kudu/tablet/rowset_metadata.cc new file mode 100644 index 000000000000..042cdef58784 --- /dev/null +++ b/src/kudu/tablet/rowset_metadata.cc @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/rowset_metadata.h" + +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/map-util.h" + +using strings::Substitute; + +namespace kudu { +namespace tablet { + +// ============================================================================ +// RowSet Metadata +// ============================================================================ +Status RowSetMetadata::Load(TabletMetadata* tablet_metadata, + const RowSetDataPB& pb, + gscoped_ptr* metadata) { + gscoped_ptr ret(new RowSetMetadata(tablet_metadata)); + RETURN_NOT_OK(ret->InitFromPB(pb)); + metadata->reset(ret.release()); + return Status::OK(); +} + +Status RowSetMetadata::CreateNew(TabletMetadata* tablet_metadata, + int64_t id, + gscoped_ptr* metadata) { + metadata->reset(new RowSetMetadata(tablet_metadata, id)); + return Status::OK(); +} + +Status RowSetMetadata::Flush() { + return tablet_metadata_->Flush(); +} + +Status RowSetMetadata::InitFromPB(const RowSetDataPB& pb) { + CHECK(!initted_); + + id_ = pb.id(); + + // Load Bloom File + if (pb.has_bloom_block()) { + bloom_block_ = BlockId::FromPB(pb.bloom_block()); + } + + // Load AdHoc Index File + if (pb.has_adhoc_index_block()) { + adhoc_index_block_ = BlockId::FromPB(pb.adhoc_index_block()); + } + + // Load Column Files + for (const ColumnDataPB& col_pb : pb.columns()) { + ColumnId col_id = ColumnId(col_pb.column_id()); + blocks_by_col_id_[col_id] = BlockId::FromPB(col_pb.block()); + } + + // Load redo delta files + for (const 
DeltaDataPB& redo_delta_pb : pb.redo_deltas()) { + redo_delta_blocks_.push_back(BlockId::FromPB(redo_delta_pb.block())); + } + + last_durable_redo_dms_id_ = pb.last_durable_dms_id(); + + // Load undo delta files + for (const DeltaDataPB& undo_delta_pb : pb.undo_deltas()) { + undo_delta_blocks_.push_back(BlockId::FromPB(undo_delta_pb.block())); + } + + initted_ = true; + return Status::OK(); +} + +void RowSetMetadata::ToProtobuf(RowSetDataPB *pb) { + pb->set_id(id_); + + lock_guard l(&lock_); + + // Write Column Files + for (const ColumnIdToBlockIdMap::value_type& e : blocks_by_col_id_) { + ColumnId col_id = e.first; + const BlockId& block_id = e.second; + + ColumnDataPB *col_data = pb->add_columns(); + block_id.CopyToPB(col_data->mutable_block()); + col_data->set_column_id(col_id); + } + + // Write Delta Files + pb->set_last_durable_dms_id(last_durable_redo_dms_id_); + + for (const BlockId& redo_delta_block : redo_delta_blocks_) { + DeltaDataPB *redo_delta_pb = pb->add_redo_deltas(); + redo_delta_block.CopyToPB(redo_delta_pb->mutable_block()); + } + + for (const BlockId& undo_delta_block : undo_delta_blocks_) { + DeltaDataPB *undo_delta_pb = pb->add_undo_deltas(); + undo_delta_block.CopyToPB(undo_delta_pb->mutable_block()); + } + + // Write Bloom File + if (!bloom_block_.IsNull()) { + bloom_block_.CopyToPB(pb->mutable_bloom_block()); + } + + // Write AdHoc Index + if (!adhoc_index_block_.IsNull()) { + adhoc_index_block_.CopyToPB(pb->mutable_adhoc_index_block()); + } +} + +const string RowSetMetadata::ToString() const { + return Substitute("RowSet($0)", id_); +} + +void RowSetMetadata::SetColumnDataBlocks(const ColumnIdToBlockIdMap& blocks) { + lock_guard l(&lock_); + blocks_by_col_id_ = blocks; +} + +Status RowSetMetadata::CommitRedoDeltaDataBlock(int64_t dms_id, + const BlockId& block_id) { + lock_guard l(&lock_); + last_durable_redo_dms_id_ = dms_id; + redo_delta_blocks_.push_back(block_id); + return Status::OK(); +} + +Status 
RowSetMetadata::CommitUndoDeltaDataBlock(const BlockId& block_id) { + lock_guard l(&lock_); + undo_delta_blocks_.push_back(block_id); + return Status::OK(); +} + +Status RowSetMetadata::CommitUpdate(const RowSetMetadataUpdate& update) { + vector removed; + { + lock_guard l(&lock_); + + for (const RowSetMetadataUpdate::ReplaceDeltaBlocks rep : + update.replace_redo_blocks_) { + CHECK(!rep.to_remove.empty()); + + auto start_it = std::find(redo_delta_blocks_.begin(), + redo_delta_blocks_.end(), rep.to_remove[0]); + + auto end_it = start_it; + for (const BlockId& b : rep.to_remove) { + if (end_it == redo_delta_blocks_.end() || *end_it != b) { + return Status::InvalidArgument( + Substitute("Cannot find subsequence <$0> in <$1>", + BlockId::JoinStrings(rep.to_remove), + BlockId::JoinStrings(redo_delta_blocks_))); + } + ++end_it; + } + + removed.insert(removed.end(), start_it, end_it); + redo_delta_blocks_.erase(start_it, end_it); + redo_delta_blocks_.insert(start_it, rep.to_add.begin(), rep.to_add.end()); + } + + // Add new redo blocks + for (const BlockId& b : update.new_redo_blocks_) { + redo_delta_blocks_.push_back(b); + } + + if (!update.new_undo_block_.IsNull()) { + // Front-loading to keep the UNDO files in their natural order. + undo_delta_blocks_.insert(undo_delta_blocks_.begin(), update.new_undo_block_); + } + + for (const ColumnIdToBlockIdMap::value_type& e : update.cols_to_replace_) { + // If we are major-compacting deltas into a column which previously had no + // base-data (e.g. because it was newly added), then there will be no original + // block there to replace. + BlockId old_block_id; + if (UpdateReturnCopy(&blocks_by_col_id_, e.first, e.second, &old_block_id)) { + removed.push_back(old_block_id); + } + } + + for (ColumnId col_id : update.col_ids_to_remove_) { + BlockId old = FindOrDie(blocks_by_col_id_, col_id); + CHECK_EQ(1, blocks_by_col_id_.erase(col_id)); + removed.push_back(old); + } + } + + // Should only be NULL in tests. 
+ if (tablet_metadata()) { + tablet_metadata()->AddOrphanedBlocks(removed); + } + return Status::OK(); +} + +vector RowSetMetadata::GetAllBlocks() { + vector blocks; + lock_guard l(&lock_); + if (!adhoc_index_block_.IsNull()) { + blocks.push_back(adhoc_index_block_); + } + if (!bloom_block_.IsNull()) { + blocks.push_back(bloom_block_); + } + AppendValuesFromMap(blocks_by_col_id_, &blocks); + + blocks.insert(blocks.end(), + undo_delta_blocks_.begin(), undo_delta_blocks_.end()); + blocks.insert(blocks.end(), + redo_delta_blocks_.begin(), redo_delta_blocks_.end()); + return blocks; +} + +RowSetMetadataUpdate::RowSetMetadataUpdate() { +} + +RowSetMetadataUpdate::~RowSetMetadataUpdate() { +} + +RowSetMetadataUpdate& RowSetMetadataUpdate::ReplaceColumnId(ColumnId col_id, + const BlockId& block_id) { + InsertOrDie(&cols_to_replace_, col_id, block_id); + return *this; +} + +RowSetMetadataUpdate& RowSetMetadataUpdate::RemoveColumnId(ColumnId col_id) { + col_ids_to_remove_.push_back(col_id); + return *this; +} + +RowSetMetadataUpdate& RowSetMetadataUpdate::ReplaceRedoDeltaBlocks( + const std::vector& to_remove, + const std::vector& to_add) { + + ReplaceDeltaBlocks rdb = { to_remove, to_add }; + replace_redo_blocks_.push_back(rdb); + return *this; +} + +RowSetMetadataUpdate& RowSetMetadataUpdate::SetNewUndoBlock(const BlockId& undo_block) { + new_undo_block_ = undo_block; + return *this; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/rowset_metadata.h b/src/kudu/tablet/rowset_metadata.h new file mode 100644 index 000000000000..d84f6084deb7 --- /dev/null +++ b/src/kudu/tablet/rowset_metadata.h @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_ROWSET_METADATA_H +#define KUDU_TABLET_ROWSET_METADATA_H + +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/tablet/tablet_metadata.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/env.h" +#include "kudu/util/locks.h" + +namespace kudu { + +namespace tools { +class FsTool; +} // namespace tools + +namespace tablet { + +class RowSetMetadataUpdate; +class TabletMetadata; + +// Keeps track of the RowSet data blocks. +// +// On each tablet MemRowSet flush, a new RowSetMetadata is created, +// and the DiskRowSetWriter will create and write the "immutable" blocks for +// columns, bloom filter and adHoc-Index. +// +// Once the flush is completed and all the blocks are written, +// the RowSetMetadata will be flushed. Currently, there is only a block +// containing all the tablet metadata, so flushing the RowSetMetadata will +// trigger a full TabletMetadata flush. 
+// +// Metadata writeback can be lazy: usage should generally be: +// +// 1) create new files on disk (durably) +// 2) change in-memory state to point to new files +// 3) make corresponding change in RowSetMetadata in-memory +// 4) trigger asynchronous flush +// +// callback: when metadata has been written: +// 1) remove old data files from disk +// 2) remove log anchors corresponding to previously in-memory data +// +class RowSetMetadata { + public: + typedef std::map ColumnIdToBlockIdMap; + + // Create a new RowSetMetadata + static Status CreateNew(TabletMetadata* tablet_metadata, + int64_t id, + gscoped_ptr* metadata); + + // Load metadata from a protobuf which was previously read from disk. + static Status Load(TabletMetadata* tablet_metadata, + const RowSetDataPB& pb, + gscoped_ptr* metadata); + + Status Flush(); + + const std::string ToString() const; + + int64_t id() const { return id_; } + + const Schema& tablet_schema() const { + return tablet_metadata_->schema(); + } + + void set_bloom_block(const BlockId& block_id) { + lock_guard l(&lock_); + DCHECK(bloom_block_.IsNull()); + bloom_block_ = block_id; + } + + void set_adhoc_index_block(const BlockId& block_id) { + lock_guard l(&lock_); + DCHECK(adhoc_index_block_.IsNull()); + adhoc_index_block_ = block_id; + } + + void SetColumnDataBlocks(const ColumnIdToBlockIdMap& blocks_by_col_id); + + Status CommitRedoDeltaDataBlock(int64_t dms_id, const BlockId& block_id); + + Status CommitUndoDeltaDataBlock(const BlockId& block_id); + + BlockId bloom_block() const { + lock_guard l(&lock_); + return bloom_block_; + } + + BlockId adhoc_index_block() const { + lock_guard l(&lock_); + return adhoc_index_block_; + } + + bool has_adhoc_index_block() const { + lock_guard l(&lock_); + return !adhoc_index_block_.IsNull(); + } + + BlockId column_data_block_for_col_id(ColumnId col_id) { + lock_guard l(&lock_); + return FindOrDie(blocks_by_col_id_, col_id); + } + + ColumnIdToBlockIdMap GetColumnBlocksById() const { + lock_guard 
l(&lock_); + return blocks_by_col_id_; + } + + vector redo_delta_blocks() const { + lock_guard l(&lock_); + return redo_delta_blocks_; + } + + vector undo_delta_blocks() const { + lock_guard l(&lock_); + return undo_delta_blocks_; + } + + TabletMetadata *tablet_metadata() const { return tablet_metadata_; } + + int64_t last_durable_redo_dms_id() const { + lock_guard l(&lock_); + return last_durable_redo_dms_id_; + } + + void SetLastDurableRedoDmsIdForTests(int64_t redo_dms_id) { + lock_guard l(&lock_); + last_durable_redo_dms_id_ = redo_dms_id; + } + + bool HasDataForColumnIdForTests(ColumnId col_id) const { + BlockId b; + lock_guard l(&lock_); + if (!FindCopy(blocks_by_col_id_, col_id, &b)) return false; + return fs_manager()->BlockExists(b); + } + + bool HasBloomDataBlockForTests() const { + lock_guard l(&lock_); + return !bloom_block_.IsNull() && fs_manager()->BlockExists(bloom_block_); + } + + FsManager *fs_manager() const { return tablet_metadata_->fs_manager(); } + + // Atomically commit a set of changes to this object. + // + // On success, calls TabletMetadata::AddOrphanedBlocks() on the removed blocks. + Status CommitUpdate(const RowSetMetadataUpdate& update); + + std::vector GetAllBlocks(); + + private: + friend class TabletMetadata; + friend class kudu::tools::FsTool; + + typedef simple_spinlock LockType; + + explicit RowSetMetadata(TabletMetadata *tablet_metadata) + : tablet_metadata_(tablet_metadata), + initted_(false), + last_durable_redo_dms_id_(kNoDurableMemStore) { + } + + RowSetMetadata(TabletMetadata *tablet_metadata, + int64_t id) + : tablet_metadata_(DCHECK_NOTNULL(tablet_metadata)), + initted_(true), + id_(id), + last_durable_redo_dms_id_(kNoDurableMemStore) { + } + + Status InitFromPB(const RowSetDataPB& pb); + + void ToProtobuf(RowSetDataPB *pb); + + TabletMetadata* const tablet_metadata_; + bool initted_; + int64_t id_; + + // Protects the below mutable fields. 
+ mutable LockType lock_; + + BlockId bloom_block_; + BlockId adhoc_index_block_; + + // Map of column ID to block ID. + ColumnIdToBlockIdMap blocks_by_col_id_; + std::vector redo_delta_blocks_; + std::vector undo_delta_blocks_; + + int64_t last_durable_redo_dms_id_; + + DISALLOW_COPY_AND_ASSIGN(RowSetMetadata); +}; + +// A set of updates to be made to a RowSetMetadata object. +// Updates can be collected here, and then atomically applied to a RowSetMetadata +// using the CommitUpdate() function. +class RowSetMetadataUpdate { + public: + RowSetMetadataUpdate(); + ~RowSetMetadataUpdate(); + + // Replace the subsequence of redo delta blocks with the new (compacted) delta blocks. + // The replaced blocks must be a contiguous subsequence of the full list, + // since delta files cannot overlap in time. + // 'to_add' may be empty, in which case the blocks in to_remove are simply removed + // with no replacement. + RowSetMetadataUpdate& ReplaceRedoDeltaBlocks(const std::vector& to_remove, + const std::vector& to_add); + + // Replace the CFile for the given column ID. + RowSetMetadataUpdate& ReplaceColumnId(ColumnId col_id, const BlockId& block_id); + + // Remove the CFile for the given column ID. + RowSetMetadataUpdate& RemoveColumnId(ColumnId col_id); + + // Add a new UNDO delta block to the list of UNDO files. + // We'll need to replace them instead when we start GCing. 
+ RowSetMetadataUpdate& SetNewUndoBlock(const BlockId& undo_block); + + private: + friend class RowSetMetadata; + RowSetMetadata::ColumnIdToBlockIdMap cols_to_replace_; + std::vector col_ids_to_remove_; + std::vector new_redo_blocks_; + + struct ReplaceDeltaBlocks { + std::vector to_remove; + std::vector to_add; + }; + std::vector replace_redo_blocks_; + BlockId new_undo_block_; + + DISALLOW_COPY_AND_ASSIGN(RowSetMetadataUpdate); +}; + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_ROWSET_METADATA_H */ diff --git a/src/kudu/tablet/rowset_tree-test.cc b/src/kudu/tablet/rowset_tree-test.cc new file mode 100644 index 000000000000..e3488be9ce4e --- /dev/null +++ b/src/kudu/tablet/rowset_tree-test.cc @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/tablet/mock-rowsets.h" +#include "kudu/tablet/rowset.h" +#include "kudu/tablet/rowset_tree.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +using std::shared_ptr; +using std::string; +using std::unordered_set; + +namespace kudu { namespace tablet { + +class TestRowSetTree : public KuduTest { +}; + +namespace { + +// Generates random rowsets with keys between 0 and 10000 +static RowSetVector GenerateRandomRowSets(int num_sets) { + RowSetVector vec; + for (int i = 0; i < num_sets; i++) { + int min = rand() % 9000; + int max = min + 1000; + + vec.push_back(shared_ptr(new MockDiskRowSet(StringPrintf("%04d", min), + StringPrintf("%04d", max)))); + } + return vec; +} + +} // anonymous namespace + +TEST_F(TestRowSetTree, TestTree) { + RowSetVector vec; + vec.push_back(shared_ptr(new MockDiskRowSet("0", "5"))); + vec.push_back(shared_ptr(new MockDiskRowSet("3", "5"))); + vec.push_back(shared_ptr(new MockDiskRowSet("5", "9"))); + vec.push_back(shared_ptr(new MockMemRowSet())); + + RowSetTree tree; + ASSERT_OK(tree.Reset(vec)); + + // "2" overlaps 0-5 and the MemRowSet. 
+ vector out; + tree.FindRowSetsWithKeyInRange("2", &out); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(vec[3].get(), out[0]); // MemRowSet + ASSERT_EQ(vec[0].get(), out[1]); + + // "4" overlaps 0-5, 3-5, and the MemRowSet + out.clear(); + tree.FindRowSetsWithKeyInRange("4", &out); + ASSERT_EQ(3, out.size()); + ASSERT_EQ(vec[3].get(), out[0]); // MemRowSet + ASSERT_EQ(vec[0].get(), out[1]); + ASSERT_EQ(vec[1].get(), out[2]); + + // interval (2,4) overlaps 0-5, 3-5 and the MemRowSet + out.clear(); + tree.FindRowSetsIntersectingInterval("3", "4", &out); + ASSERT_EQ(3, out.size()); + ASSERT_EQ(vec[3].get(), out[0]); + ASSERT_EQ(vec[0].get(), out[1]); + ASSERT_EQ(vec[1].get(), out[2]); + + // interval (0,2) overlaps 0-5 and the MemRowSet + out.clear(); + tree.FindRowSetsIntersectingInterval("0", "2", &out); + ASSERT_EQ(2, out.size()); + ASSERT_EQ(vec[3].get(), out[0]); + ASSERT_EQ(vec[0].get(), out[1]); + + // interval (5,7) overlaps 0-5, 3-5, 5-9 and the MemRowSet + out.clear(); + tree.FindRowSetsIntersectingInterval("5", "7", &out); + ASSERT_EQ(4, out.size()); + ASSERT_EQ(vec[3].get(), out[0]); + ASSERT_EQ(vec[0].get(), out[1]); + ASSERT_EQ(vec[1].get(), out[2]); + ASSERT_EQ(vec[2].get(), out[3]); +} + +TEST_F(TestRowSetTree, TestPerformance) { + const int kNumRowSets = 200; + const int kNumQueries = AllowSlowTests() ? 1000000 : 10000; + SeedRandom(); + + // Create a bunch of rowsets, each of which spans about 10% of the "row space". + // The row space here is 4-digit 0-padded numbers. 
+ RowSetVector vec = GenerateRandomRowSets(kNumRowSets); + + RowSetTree tree; + ASSERT_OK(tree.Reset(vec)); + + LOG_TIMING(INFO, StringPrintf("Querying rowset %d times", kNumQueries)) { + vector out; + char buf[32]; + for (int i = 0; i < kNumQueries; i++) { + out.clear(); + int query = rand() % 10000; + snprintf(buf, arraysize(buf), "%04d", query); + tree.FindRowSetsWithKeyInRange(Slice(buf, 4), &out); + } + } +} + +TEST_F(TestRowSetTree, TestEndpointsConsistency) { + const int kNumRowSets = 1000; + RowSetVector vec = GenerateRandomRowSets(kNumRowSets); + // Add pathological one-key rows + for (int i = 0; i < 10; ++i) { + vec.push_back(shared_ptr(new MockDiskRowSet(StringPrintf("%04d", 11000), + StringPrintf("%04d", 11000)))); + } + vec.push_back(shared_ptr(new MockDiskRowSet(StringPrintf("%04d", 12000), + StringPrintf("%04d", 12000)))); + // Make tree + RowSetTree tree; + ASSERT_OK(tree.Reset(vec)); + // Keep track of "currently open" intervals defined by the endpoints + unordered_set open; + // Keep track of all rowsets that have been visited + unordered_set visited; + + Slice prev; + for (const RowSetTree::RSEndpoint& rse : tree.key_endpoints()) { + RowSet* rs = rse.rowset_; + enum RowSetTree::EndpointType ept = rse.endpoint_; + const Slice& slice = rse.slice_; + + ASSERT_TRUE(rs != nullptr) << "RowSetTree has an endpoint with no rowset"; + ASSERT_TRUE(!slice.empty()) << "RowSetTree has an endpoint with no key"; + + if (!prev.empty()) { + ASSERT_LE(prev.compare(slice), 0); + } + + Slice min, max; + ASSERT_OK(rs->GetBounds(&min, &max)); + if (ept == RowSetTree::START) { + ASSERT_EQ(min.data(), slice.data()); + ASSERT_EQ(min.size(), slice.size()); + ASSERT_TRUE(InsertIfNotPresent(&open, rs)); + ASSERT_TRUE(InsertIfNotPresent(&visited, rs)); + } else if (ept == RowSetTree::STOP) { + ASSERT_EQ(max.data(), slice.data()); + ASSERT_EQ(max.size(), slice.size()); + ASSERT_TRUE(open.erase(rs) == 1); + } else { + FAIL() << "No such endpoint type exists"; + } + } +} + +} // 
namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/rowset_tree.cc b/src/kudu/tablet/rowset_tree.cc new file mode 100644 index 000000000000..69ca35a65632 --- /dev/null +++ b/src/kudu/tablet/rowset_tree.cc @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/rowset_tree.h" + +#include +#include +#include +#include +#include + +#include "kudu/gutil/stl_util.h" +#include "kudu/tablet/rowset.h" +#include "kudu/util/interval_tree.h" +#include "kudu/util/interval_tree-inl.h" +#include "kudu/util/slice.h" + +using std::vector; +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +namespace { + +// Lexicographic, first by slice, then by rowset pointer, then by start/stop +bool RSEndpointBySliceCompare(const RowSetTree::RSEndpoint& a, + const RowSetTree::RSEndpoint& b) { + int slice_cmp = a.slice_.compare(b.slice_); + if (slice_cmp) return slice_cmp < 0; + ptrdiff_t rs_cmp = a.rowset_ - b.rowset_; + if (rs_cmp) return rs_cmp < 0; + if (a.endpoint_ != b.endpoint_) return a.endpoint_ == RowSetTree::START; + return false; +} + +} // anonymous namespace + +// Entry for use in the interval tree. 
+struct RowSetWithBounds { + RowSet *rowset; + string min_key; + string max_key; +}; + +// Traits struct for IntervalTree. +struct RowSetIntervalTraits { + typedef Slice point_type; + typedef RowSetWithBounds *interval_type; + + static Slice get_left(const RowSetWithBounds *rs) { + return Slice(rs->min_key); + } + + static Slice get_right(const RowSetWithBounds *rs) { + return Slice(rs->max_key); + } + + static int compare(const Slice &a, const Slice &b) { + return a.compare(b); + } +}; + +RowSetTree::RowSetTree() + : initted_(false) { +} + +Status RowSetTree::Reset(const RowSetVector &rowsets) { + CHECK(!initted_); + std::vector entries; + RowSetVector unbounded; + ElementDeleter deleter(&entries); + entries.reserve(rowsets.size()); + std::vector endpoints; + endpoints.reserve(rowsets.size()*2); + + // Iterate over each of the provided RowSets, fetching their + // bounds and adding them to the local vectors. + for (const shared_ptr &rs : rowsets) { + gscoped_ptr rsit(new RowSetWithBounds()); + rsit->rowset = rs.get(); + Slice min_key, max_key; + Status s = rs->GetBounds(&min_key, &max_key); + if (s.IsNotSupported()) { + // This rowset is a MemRowSet, for which the bounds change as more + // data gets inserted. Therefore we can't put it in the static + // interval tree -- instead put it on the list which is consulted + // on every access. + unbounded.push_back(rs); + continue; + } else if (!s.ok()) { + LOG(WARNING) << "Unable to construct RowSetTree: " + << rs->ToString() << " unable to determine its bounds: " + << s.ToString(); + return s; + } + DCHECK_LE(min_key.compare(max_key), 0) + << "Rowset min must be <= max: " << rs->ToString(); + // Load into key endpoints. 
+ endpoints.push_back(RSEndpoint(rsit->rowset, START, min_key)); + endpoints.push_back(RSEndpoint(rsit->rowset, STOP, max_key)); + + // Load bounds and save entry + rsit->min_key = min_key.ToString(); + rsit->max_key = max_key.ToString(); + entries.push_back(rsit.release()); + } + + // Sort endpoints + std::sort(endpoints.begin(), endpoints.end(), RSEndpointBySliceCompare); + + // Install the vectors into the object. + entries_.swap(entries); + unbounded_rowsets_.swap(unbounded); + tree_.reset(new IntervalTree(entries_)); + key_endpoints_.swap(endpoints); + all_rowsets_.assign(rowsets.begin(), rowsets.end()); + initted_ = true; + + return Status::OK(); +} + +void RowSetTree::FindRowSetsIntersectingInterval(const Slice &lower_bound, + const Slice &upper_bound, + vector *rowsets) const { + DCHECK(initted_); + + // All rowsets with unknown bounds need to be checked. + for (const shared_ptr &rs : unbounded_rowsets_) { + rowsets->push_back(rs.get()); + } + + // perf TODO: make it possible to query using raw Slices + // instead of copying to strings here + RowSetWithBounds query; + query.min_key = lower_bound.ToString(); + query.max_key = upper_bound.ToString(); + + vector from_tree; + from_tree.reserve(all_rowsets_.size()); + tree_->FindIntersectingInterval(&query, &from_tree); + rowsets->reserve(rowsets->size() + from_tree.size()); + for (RowSetWithBounds *rs : from_tree) { + rowsets->push_back(rs->rowset); + } +} + +void RowSetTree::FindRowSetsWithKeyInRange(const Slice &encoded_key, + vector *rowsets) const { + DCHECK(initted_); + + // All rowsets with unknown bounds need to be checked. + for (const shared_ptr &rs : unbounded_rowsets_) { + rowsets->push_back(rs.get()); + } + + // Query the interval tree to efficiently find rowsets with known bounds + // whose ranges overlap the probe key. 
+ vector from_tree; + from_tree.reserve(all_rowsets_.size()); + tree_->FindContainingPoint(encoded_key, &from_tree); + rowsets->reserve(rowsets->size() + from_tree.size()); + for (RowSetWithBounds *rs : from_tree) { + rowsets->push_back(rs->rowset); + } +} + +RowSetTree::~RowSetTree() { + STLDeleteElements(&entries_); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/rowset_tree.h b/src/kudu/tablet/rowset_tree.h new file mode 100644 index 000000000000..dbb518a1d3ec --- /dev/null +++ b/src/kudu/tablet/rowset_tree.h @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_ROWSET_MANAGER_H +#define KUDU_TABLET_ROWSET_MANAGER_H + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/status.h" +#include "kudu/tablet/rowset.h" + +namespace kudu { + +template +class IntervalTree; + +namespace tablet { + +struct RowSetIntervalTraits; +struct RowSetWithBounds; + +// Class which encapsulates the set of rowsets which are active for a given +// Tablet. This provides efficient lookup by key for RowSets which may overlap +// that key range. 
+// +// Additionally, the rowset tree maintains information about the implicit +// intervals generated by the row sets (for instance, if a tablet has +// rowsets [0, 2] and [1, 3] it has three implicit contiguous intervals: +// [0, 1], [1, 2], and [2, 3]. +class RowSetTree { + public: + // An RSEndpoint is a POD which associates a rowset, an EndpointType + // (either the START or STOP of an interval), and the key at which the + // endpoint is located. + enum EndpointType { + START, + STOP + }; + struct RSEndpoint { + RSEndpoint(RowSet *rowset, EndpointType endpoint, Slice slice) + : rowset_(rowset), endpoint_(endpoint), slice_(std::move(slice)) {} + + RowSet* rowset_; + enum EndpointType endpoint_; + Slice slice_; + }; + + RowSetTree(); + Status Reset(const RowSetVector &rowsets); + ~RowSetTree(); + + // Return all RowSets whose range may contain the given encoded key. + // + // The returned pointers are guaranteed to be valid at least until this + // RowSetTree object is Reset(). + void FindRowSetsWithKeyInRange(const Slice &encoded_key, + std::vector *rowsets) const; + + void FindRowSetsIntersectingInterval(const Slice &lower_bound, + const Slice &upper_bound, + std::vector *rowsets) const; + + const RowSetVector &all_rowsets() const { return all_rowsets_; } + + // Iterates over RowSetTree::RSEndpoint, guaranteed to be ordered and for + // any rowset to appear exactly twice, once at its start slice and once at + // its stop slice, equivalent to its GetBounds() values. + const std::vector& key_endpoints() const { return key_endpoints_; } + + private: + // Interval tree of the rowsets. Used to efficiently find rowsets which might contain + // a probe row. + gscoped_ptr > tree_; + + // Ordered map of all the interval endpoints, holding the implicit contiguous + // intervals + // TODO map to usage statistics as well. See KUDU-??? + std::vector key_endpoints_; + + // Container for all of the entries in tree_. 
IntervalTree does + // not itself manage memory, so this provides a simple way to enumerate + // all the entry structs and free them in the destructor. + std::vector entries_; + + // All of the rowsets which were put in this RowSetTree. + RowSetVector all_rowsets_; + + // Rowsets for which the bounds are unknown -- e.g because they + // are mutable (MemRowSets). + // + // These have to be consulted for every access, so are not + // stored in the interval tree. + RowSetVector unbounded_rowsets_; + + bool initted_; +}; + +} // namespace tablet +} // namespace kudu +#endif diff --git a/src/kudu/tablet/schema-change.txt b/src/kudu/tablet/schema-change.txt new file mode 100644 index 000000000000..6bbe1754d9a5 --- /dev/null +++ b/src/kudu/tablet/schema-change.txt @@ -0,0 +1,107 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +============================================================ +Schema Changes +============================================================ + +Column IDs +------------------------------ +Internal to a Schema, and not exposed to the user, each column in a schema has +a unique identifier. The identifiers are integers which are not re-used, +and serve to distinguish an old column from a new one in the case that they +have the same name. 
+ +For example: + +> CREATE TABLE x (col_a int, col_b int); +> INSERT INTO x VALUES (1, 1); +> ALTER TABLE x DROP COLUMN col_b; +> ALTER TABLE x ADD COLUMN col_b int not null default 999; + +In this case, although the Schema at the end of the sequence looks the same +as the one at the beginning, the correct data is: + +> SELECT * from x; + col_a | col_b +------------------ + 1 | 999 + +In other words, we cannot re-materialize data from the old 'col_b' into the new +'col_b'. + +If we were to dump the initial schema and the new schema, we would see that although +the two 'col_b's have the same name, they would have different column IDs. + +Column IDs are internal to the server and not sent by the user on RPCs. Clients +specify columns by name. This is because we expect a client to continue to make +queries like "select sum(col_b) from x;" without any refresh of the schema, even +if the column is dropped and re-added with new data. + +Schemas specified in RPCs +------------------------------ + +When the user makes an RPC to read or write from a tablet, the RPC specifies only +the names, types, and nullability of the columns. Internal to the server, we map +the names to the internal IDs. + +If the user specifies a column name which does not exist in the latest schema, +it is considered an error. + +If the type or nullability does not match, we also currently consider it an error. +In the future, we may be able to adapt the data to the requested type (eg promote +smaller to larger integers on read, promote non-null data to a nullable read, etc). + +Handling varying schemas at read time +------------------------------ + + Tablet + |---- MemRowSet + |---- DiskRowSet N + |-------- CFileSet + |-------- Delta Tracker + |------------ Delta Memstore + |------------ Delta File N + +Because the Schema of a table may change over time, different rowsets may have +been written with different schemas. 
At read time, the server determines a Schema +for the read based on the current metadata of the tablet. This Schema determines +what to do as the read path encounters older data which was inserted prior to +the schema change and thus may be missing some columns. + +For each column in the read schema which is not present in the data, that column +may be treated in one of two ways: + + 1) In the case that the new column has a "read default" in the metadata, that + value is materialized for each cell. + 2) If no "read default" is present, then the column must be nullable. In that + case, a column of NULLs is materialized. + +Currently, Kudu does not handle type changes. In the future, we may also need to +add type adapters to convert older data to the new type. + +When reading delta files, updates to columns which have since been removed are +ignored. Updates to new columns are applied on top of the materialized default +column data. + +Compaction +------------------------------ +Each CFileSet and DeltaFile has a schema associated to describe the data in it. +On compaction, CFileSet/DeltaFiles with different schemas may be aggregated into a new file. +This new file will have the latest schema and all the rows must be projected. + +In the case of CFiles, the projection affects only the new columns, where the read default +value will be written as data, or in case of "alter type" where the "encoding" is changed. + +In the case of DeltaFiles, the projection is essential since the RowChangeList is serialized +with no hint of the schema used. This means that you can read a RowChangeList only if you +know the exact serialization schema. diff --git a/src/kudu/tablet/svg_dump.cc b/src/kudu/tablet/svg_dump.cc new file mode 100644 index 000000000000..56af07955ce2 --- /dev/null +++ b/src/kudu/tablet/svg_dump.cc @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/svg_dump.h" + +#include +#include + +#include +#include +#include +#include +#include + +#include "kudu/common/encoded_key.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/tablet/rowset_info.h" +#include "kudu/util/flag_tags.h" + +using std::ostream; +using std::unordered_set; +using std::vector; + +namespace kudu { +namespace tablet { + +// Flag to dump SVGs of every compaction decision. +// +// After dumping, these may be converted to an animation using a series of +// commands like: +// $ for x in compaction-*svg ; do convert $x $x.png ; done +// $ mencoder mf://compaction*png -mf fps=1 -ovc lavc -o compactions.avi + +DEFINE_string(compaction_policy_dump_svgs_pattern, "", + "File path into which to dump SVG visualization of " + "selected compactions. This is mostly useful in " + "the context of unit tests and benchmarks. " + "The special string 'TIME' will be substituted " + "with the compaction selection timestamp."); +TAG_FLAG(compaction_policy_dump_svgs_pattern, hidden); + +namespace { + +// Organize the input rowsets into rows for presentation. 
This simply +// distributes 'rowsets' into separate vectors in 'rows' such that +// within any given row, none of the rowsets overlap in keyspace. +void OrganizeSVGRows(const vector& candidates, + vector >* rows) { + rows->push_back(vector()); + + for (const RowSetInfo &candidate : candidates) { + // Slot into the first row of the output which fits it + bool found_slot = false; + for (vector &row : *rows) { + // If this candidate doesn't intersect any other candidates in this + // row, we can put it here. + bool fits_in_row = true; + for (const RowSetInfo *already_in_row : row) { + if (candidate.Intersects(*already_in_row)) { + fits_in_row = false; + break; + } + } + if (fits_in_row) { + row.push_back(&candidate); + found_slot = true; + break; + } + } + + // If we couldn't find a spot in any existing row, add a new row + // to the bottom of the SVG. + if (!found_slot) { + vector new_row; + new_row.push_back(&candidate); + rows->push_back(new_row); + } + } +} + +void DumpSVG(const vector& candidates, + const unordered_set& picked, + ostream* outptr) { + CHECK(outptr) << "Dump SVG expects an ostream"; + CHECK(outptr->good()) << "Dump SVG expects a good ostream"; + using std::endl; + ostream& out = *outptr; + + vector > svg_rows; + OrganizeSVGRows(candidates, &svg_rows); + + const char *kPickedColor = "#f66"; + const char *kDefaultColor = "#666"; + const double kTotalWidth = 1200; + const int kRowHeight = 15; + const double kHeaderHeight = 60; + const double kTotalHeight = kRowHeight * svg_rows.size() + kHeaderHeight; + + out << "" << endl; + + // Background + out << "" << endl; + + for (int row_index = 0; row_index < svg_rows.size(); row_index++) { + const vector &row = svg_rows[row_index]; + + int y = kRowHeight * row_index + kHeaderHeight; + for (const RowSetInfo *cand : row) { + bool was_picked = ContainsKey(picked, cand->rowset()); + const char *color = was_picked ? 
kPickedColor : kDefaultColor; + + double x = cand->cdf_min_key() * kTotalWidth; + double width = cand->width() * kTotalWidth; + out << StringPrintf("", + x, y, width, kRowHeight, color) << endl; + out << StringPrintf("%dMB", + x, y + kRowHeight, width, kRowHeight, cand->size_mb()) << endl; + } + } + + out << "" << endl; +} + +void PrintXMLHeader(ostream* o) { + CHECK(o) << "XML header printer expects an ostream"; + CHECK(o->good()) << "XML header printer expects a good ostream"; + *o << "" << std::endl; + *o << "" << std::endl; +} + +// Prepares ofstream to default dump location. +// In case any of the preparation fails or default pattern is empty, +// NULL is returned. +gscoped_ptr PrepareOstream() { + using std::ofstream; + gscoped_ptr out; + // Get default file name + const string &pattern = FLAGS_compaction_policy_dump_svgs_pattern; + if (pattern.empty()) return gscoped_ptr(); + const string path = StringReplace(pattern, "TIME", StringPrintf("%ld", time(nullptr)), true); + + // Open + out.reset(new ofstream(path.c_str())); + if (!out->is_open()) { + LOG(WARNING) << "Could not dump compaction output to " << path << ": file open failed"; + return gscoped_ptr(); + } + + return out.PassAs(); +} + +} // anonymous namespace + +void DumpCompactionSVG(const vector& candidates, + const unordered_set& picked, + ostream* out, + bool print_xml) { + // Get the desired pointer to the ostream + gscoped_ptr dfl; + if (!out) { + dfl = PrepareOstream(); + out = dfl.get(); + if (!out) return; + } + + // Print out with the correct ostream + LOG(INFO) << "Dumping SVG of DiskRowSetLayout with" + << (print_xml ? 
"" : "out") << " XML header"; + if (print_xml) { + PrintXMLHeader(out); + } + DumpSVG(candidates, picked, out); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/svg_dump.h b/src/kudu/tablet/svg_dump.h new file mode 100644 index 000000000000..caba77767728 --- /dev/null +++ b/src/kudu/tablet/svg_dump.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_COMPACTION_SVG_DUMP_H_ +#define KUDU_TABLET_COMPACTION_SVG_DUMP_H_ + +#include +#include +#include + +namespace kudu { +namespace tablet { + +class RowSet; + +class RowSetInfo; + +// Dump an SVG file which represents the candidates +// for compaction, highlighting the ones that were selected. +// Dumps in to parameter ostream. If ostream is null, then default ostream +// specified as a flag is used (see svg_dump.cc). +// The last optional parameter controls whether to print an XML header in +// the file. If true, prints the header (xml tag and DOCTYPE). Otherwise, only +// the ... section is printed. 
+void DumpCompactionSVG(const std::vector& candidates, + const std::unordered_set& picked, + std::ostream* out = NULL, + bool print_xml = true); + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/tablet-harness.h b/src/kudu/tablet/tablet-harness.h new file mode 100644 index 000000000000..cb81fae69844 --- /dev/null +++ b/src/kudu/tablet/tablet-harness.h @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TABLET_PEER_HARNESS_H +#define KUDU_TABLET_TABLET_PEER_HARNESS_H + +#include +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/server/logical_clock.h" +#include "kudu/server/metadata.h" +#include "kudu/tablet/tablet.h" +#include "kudu/util/env.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" + +using std::string; +using std::vector; + +namespace kudu { +namespace tablet { + +// Creates a default partition schema and partition for a table. +// +// The provided schema must include column IDs. +// +// The partition schema will have no hash components, and a single range +// component over the primary key columns. 
The partition will cover the +// entire partition-key space. +static std::pair CreateDefaultPartition(const Schema& schema) { + // Create a default partition schema. + PartitionSchema partition_schema; + CHECK_OK(PartitionSchema::FromPB(PartitionSchemaPB(), schema, &partition_schema)); + + // Create the tablet partitions. + vector partitions; + CHECK_OK(partition_schema.CreatePartitions(vector(), schema, &partitions)); + CHECK_EQ(1, partitions.size()); + return std::make_pair(partition_schema, partitions[0]); +} + +class TabletHarness { + public: + struct Options { + explicit Options(string root_dir) + : env(Env::Default()), + tablet_id("test_tablet_id"), + root_dir(std::move(root_dir)), + enable_metrics(true) {} + + Env* env; + string tablet_id; + string root_dir; + bool enable_metrics; + }; + + TabletHarness(const Schema& schema, Options options) + : options_(std::move(options)), schema_(schema) {} + + Status Create(bool first_time) { + std::pair partition(CreateDefaultPartition(schema_)); + + // Build the Tablet + fs_manager_.reset(new FsManager(options_.env, options_.root_dir)); + if (first_time) { + RETURN_NOT_OK(fs_manager_->CreateInitialFileSystemLayout()); + } + RETURN_NOT_OK(fs_manager_->Open()); + + scoped_refptr metadata; + RETURN_NOT_OK(TabletMetadata::LoadOrCreate(fs_manager_.get(), + options_.tablet_id, + "KuduTableTest", + schema_, + partition.first, + partition.second, + TABLET_DATA_READY, + &metadata)); + if (options_.enable_metrics) { + metrics_registry_.reset(new MetricRegistry()); + } + + clock_ = server::LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp); + tablet_.reset(new Tablet(metadata, + clock_, + std::shared_ptr(), + metrics_registry_.get(), + new log::LogAnchorRegistry())); + return Status::OK(); + } + + Status Open() { + RETURN_NOT_OK(tablet_->Open()); + tablet_->MarkFinishedBootstrapping(); + return Status::OK(); + } + + server::Clock* clock() const { + return clock_.get(); + } + + const std::shared_ptr& tablet() { + return 
tablet_; + } + + FsManager* fs_manager() { + return fs_manager_.get(); + } + + MetricRegistry* metrics_registry() { + return metrics_registry_.get(); + } + + private: + Options options_; + + gscoped_ptr metrics_registry_; + + scoped_refptr clock_; + Schema schema_; + gscoped_ptr fs_manager_; + std::shared_ptr tablet_; +}; + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_TABLET_PEER_HARNESS_H */ diff --git a/src/kudu/tablet/tablet-pushdown-test.cc b/src/kudu/tablet/tablet-pushdown-test.cc new file mode 100644 index 000000000000..b02e225c0304 --- /dev/null +++ b/src/kudu/tablet/tablet-pushdown-test.cc @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tablet { + +enum Setup { + ALL_IN_MEMORY, + SPLIT_MEMORY_DISK, + ALL_ON_DISK +}; + +class TabletPushdownTest : public KuduTabletTest, + public ::testing::WithParamInterface { + public: + TabletPushdownTest() + : KuduTabletTest(Schema({ ColumnSchema("key", INT32), + ColumnSchema("int_val", INT32), + ColumnSchema("string_val", STRING) }, 1)) { + } + + virtual void SetUp() OVERRIDE { + KuduTabletTest::SetUp(); + + FillTestTablet(); + } + + void FillTestTablet() { + RowBuilder rb(client_schema_); + + nrows_ = 2100; + if (AllowSlowTests()) { + nrows_ = 100000; + } + + LocalTabletWriter writer(tablet().get(), &client_schema_); + KuduPartialRow row(&client_schema_); + for (int64_t i = 0; i < nrows_; i++) { + CHECK_OK(row.SetInt32(0, i)); + CHECK_OK(row.SetInt32(1, i * 10)); + CHECK_OK(row.SetStringCopy(2, StringPrintf("%08" PRId64, i))); + ASSERT_OK_FAST(writer.Insert(row)); + + if (i == 205 && GetParam() == SPLIT_MEMORY_DISK) { + ASSERT_OK(tablet()->Flush()); + } + } + + if (GetParam() == ALL_ON_DISK) { + ASSERT_OK(tablet()->Flush()); + } + } + + // The predicates tested in the various test cases all yield + // the same set of rows. Run the scan and verify that the + // expected rows are returned. 
+ void TestScanYieldsExpectedResults(ScanSpec spec) { + gscoped_ptr iter; + ASSERT_OK(tablet()->NewRowIterator(client_schema_, &iter)); + ASSERT_OK(iter->Init(&spec)); + ASSERT_TRUE(spec.predicates().empty()) << "Should have accepted all predicates"; + + vector results; + LOG_TIMING(INFO, "Filtering by int value") { + ASSERT_OK(IterateToStringList(iter.get(), &results)); + } + std::sort(results.begin(), results.end()); + for (const string &str : results) { + LOG(INFO) << str; + } + ASSERT_EQ(11, results.size()); + ASSERT_EQ("(int32 key=200, int32 int_val=2000, string string_val=00000200)", + results[0]); + ASSERT_EQ("(int32 key=210, int32 int_val=2100, string string_val=00000210)", + results[10]); + + int expected_blocks_from_disk; + int expected_rows_from_disk; + bool check_stats = true; + switch (GetParam()) { + case ALL_IN_MEMORY: + expected_blocks_from_disk = 0; + expected_rows_from_disk = 0; + break; + case SPLIT_MEMORY_DISK: + expected_blocks_from_disk = 1; + expected_rows_from_disk = 206; + break; + case ALL_ON_DISK: + // If AllowSlowTests() is true and all data is on disk + // (vs. first 206 rows -- containing the values we're looking + // for -- on disk and the rest in-memory), then the number + // of blocks and rows we will scan through can't be easily + // determined (as it depends on default cfile block size, the + // size of cfile header, and how much data each column takes + // up). + if (AllowSlowTests()) { + check_stats = false; + } else { + // If AllowSlowTests() is false, then all of the data fits + // into a single cfile. 
+ expected_blocks_from_disk = 1; + expected_rows_from_disk = nrows_; + } + break; + } + if (check_stats) { + vector stats; + iter->GetIteratorStats(&stats); + for (const IteratorStats& col_stats : stats) { + EXPECT_EQ(expected_blocks_from_disk, col_stats.data_blocks_read_from_disk); + EXPECT_EQ(expected_rows_from_disk, col_stats.cells_read_from_disk); + } + } + } + + // Test that a scan with an empty projection and the given spec + // returns the expected number of rows. The rows themselves + // should be empty. + void TestCountOnlyScanYieldsExpectedResults(ScanSpec spec) { + Schema empty_schema(std::vector(), 0); + gscoped_ptr iter; + ASSERT_OK(tablet()->NewRowIterator(empty_schema, &iter)); + ASSERT_OK(iter->Init(&spec)); + ASSERT_TRUE(spec.predicates().empty()) << "Should have accepted all predicates"; + + vector results; + ASSERT_OK(IterateToStringList(iter.get(), &results)); + ASSERT_EQ(11, results.size()); + for (const string& result : results) { + ASSERT_EQ("()", result); + } + } + private: + uint64_t nrows_; +}; + +TEST_P(TabletPushdownTest, TestPushdownIntKeyRange) { + ScanSpec spec; + int32_t lower = 200; + int32_t upper = 210; + ColumnRangePredicate pred0(schema_.column(0), &lower, &upper); + spec.AddPredicate(pred0); + + TestScanYieldsExpectedResults(spec); + TestCountOnlyScanYieldsExpectedResults(spec); +} + +TEST_P(TabletPushdownTest, TestPushdownIntValueRange) { + // Push down a double-ended range on the integer value column. + + ScanSpec spec; + int32_t lower = 2000; + int32_t upper = 2100; + ColumnRangePredicate pred1(schema_.column(1), &lower, &upper); + spec.AddPredicate(pred1); + + TestScanYieldsExpectedResults(spec); + + // TODO: support non-key predicate pushdown on columns which aren't + // part of the projection. The following line currently would crash. + // TestCountOnlyScanYieldsExpectedResults(spec); + + // TODO: collect IO statistics per column, verify that most of the string blocks + // were not read. 
+} + +INSTANTIATE_TEST_CASE_P(AllMemory, TabletPushdownTest, ::testing::Values(ALL_IN_MEMORY)); +INSTANTIATE_TEST_CASE_P(SplitMemoryDisk, TabletPushdownTest, ::testing::Values(SPLIT_MEMORY_DISK)); +INSTANTIATE_TEST_CASE_P(AllDisk, TabletPushdownTest, ::testing::Values(ALL_ON_DISK)); + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet-schema-test.cc b/src/kudu/tablet/tablet-schema-test.cc new file mode 100644 index 000000000000..d87e598dbde1 --- /dev/null +++ b/src/kudu/tablet/tablet-schema-test.cc @@ -0,0 +1,299 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/transactions/alter_schema_transaction.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" + +using strings::Substitute; + +namespace kudu { +namespace tablet { + +class TestTabletSchema : public KuduTabletTest { + public: + TestTabletSchema() + : KuduTabletTest(CreateBaseSchema()) { + } + + void InsertRows(const Schema& schema, size_t first_key, size_t nrows) { + for (size_t i = first_key; i < nrows; ++i) { + InsertRow(schema, i); + + // Half of the rows will be on disk + // and the other half in the MemRowSet + if (i == (nrows / 2)) { + ASSERT_OK(tablet()->Flush()); + } + } + } + + void InsertRow(const Schema& schema, size_t key) { + LocalTabletWriter writer(tablet().get(), &schema); + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32(0, key)); + CHECK_OK(row.SetInt32(1, key)); + ASSERT_OK(writer.Insert(row)); + } + + void DeleteRow(const Schema& schema, size_t key) { + LocalTabletWriter writer(tablet().get(), &schema); + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32(0, key)); + ASSERT_OK(writer.Delete(row)); + } + + void MutateRow(const Schema& schema, size_t key, size_t col_idx, int32_t new_val) { + LocalTabletWriter writer(tablet().get(), &schema); + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32(0, key)); + CHECK_OK(row.SetInt32(col_idx, new_val)); + ASSERT_OK(writer.Update(row)); + } + + void VerifyTabletRows(const Schema& projection, + const std::vector >& keys) { + typedef std::pair StringPair; + + vector rows; + ASSERT_OK(DumpTablet(*tablet(), projection, &rows)); + for (const string& row : rows) { + bool found = false; + for (const StringPair& k : keys) { + if (row.find(k.first) != string::npos) { + ASSERT_STR_CONTAINS(row, k.second); + found = true; + break; + } + } + ASSERT_TRUE(found); + 
} + } + + private: + Schema CreateBaseSchema() { + return Schema({ ColumnSchema("key", INT32), + ColumnSchema("c1", INT32) }, 1); + } +}; + +// Read from a tablet using a projection schema with columns not present in +// the original schema. Verify that the server reject the request. +TEST_F(TestTabletSchema, TestRead) { + const size_t kNumRows = 10; + Schema projection({ ColumnSchema("key", INT32), + ColumnSchema("c2", INT64), + ColumnSchema("c3", STRING) }, + 1); + + InsertRows(client_schema_, 0, kNumRows); + + gscoped_ptr iter; + ASSERT_OK(tablet()->NewRowIterator(projection, &iter)); + + Status s = iter->Init(nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.message().ToString(), + "Some columns are not present in the current schema: c2, c3"); +} + +// Write to the tablet using different schemas, +// and verifies that the read and write defauls are respected. +TEST_F(TestTabletSchema, TestWrite) { + const size_t kNumBaseRows = 10; + + // Insert some rows with the base schema + InsertRows(client_schema_, 0, kNumBaseRows); + + // Add one column with a default value + const int32_t c2_write_default = 5; + const int32_t c2_read_default = 7; + + SchemaBuilder builder(tablet()->metadata()->schema()); + ASSERT_OK(builder.AddColumn("c2", INT32, false, &c2_read_default, &c2_write_default)); + AlterSchema(builder.Build()); + Schema s2 = builder.BuildWithoutIds(); + + // Insert with base/old schema + size_t s2Key = kNumBaseRows + 1; + InsertRow(client_schema_, s2Key); + + // Verify the default value + std::vector > keys; + keys.push_back(std::pair(Substitute("key=$0", s2Key), + Substitute("c2=$0", c2_write_default))); + keys.push_back(std::pair("", Substitute("c2=$0", c2_read_default))); + VerifyTabletRows(s2, keys); + + // Delete the row + DeleteRow(s2, s2Key); + + // Verify the default value + VerifyTabletRows(s2, keys); + + // Re-Insert with base/old schema + InsertRow(client_schema_, s2Key); + VerifyTabletRows(s2, keys); + + // Try compact all 
(different schemas) + ASSERT_OK(tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + VerifyTabletRows(s2, keys); +} + +// Verify that the RowChangeList projection works for reinsert mutation +TEST_F(TestTabletSchema, TestReInsert) { + // Insert some rows with the base schema + size_t s1Key = 0; + InsertRow(client_schema_, s1Key); + DeleteRow(client_schema_, s1Key); + InsertRow(client_schema_, s1Key); + + // Add one column with a default value + const int32_t c2_write_default = 5; + const int32_t c2_read_default = 7; + + SchemaBuilder builder(tablet()->metadata()->schema()); + ASSERT_OK(builder.AddColumn("c2", INT32, false, &c2_read_default, &c2_write_default)); + AlterSchema(builder.Build()); + Schema s2 = builder.BuildWithoutIds(); + + // Insert with base/old schema + size_t s2Key = 1; + InsertRow(client_schema_, s2Key); + + // Verify the default value + std::vector > keys; + keys.push_back(std::pair(Substitute("key=$0", s1Key), + Substitute("c2=$0", c2_read_default))); + keys.push_back(std::pair(Substitute("key=$0", s2Key), + Substitute("c2=$0", c2_write_default))); + VerifyTabletRows(s2, keys); + + // Try compact all (different schemas) + ASSERT_OK(tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + VerifyTabletRows(s2, keys); +} + +// Write to the table using a projection schema with a renamed field. 
+TEST_F(TestTabletSchema, TestRenameProjection) { + std::vector > keys; + + // Insert with the base schema + InsertRow(client_schema_, 1); + + // Switch schema to s2 + SchemaBuilder builder(tablet()->metadata()->schema()); + ASSERT_OK(builder.RenameColumn("c1", "c1_renamed")); + AlterSchema(builder.Build()); + Schema s2 = builder.BuildWithoutIds(); + + // Insert with the s2 schema after AlterSchema(s2) + InsertRow(s2, 2); + + // Read and verify using the s2 schema + keys.clear(); + for (int i = 1; i <= 4; ++i) { + keys.push_back(std::pair(Substitute("key=$0", i), + Substitute("c1_renamed=$0", i))); + } + VerifyTabletRows(s2, keys); + + // Delete the first two rows + DeleteRow(s2, /* key= */ 1); + + // Alter the remaining row + MutateRow(s2, /* key= */ 2, /* col_idx= */ 1, /* new_val= */ 6); + + // Read and verify using the s2 schema + keys.clear(); + keys.push_back(std::pair("key=2", "c1_renamed=6")); + VerifyTabletRows(s2, keys); +} + +// Verify that removing a column and re-adding it will not result in making old data visible +TEST_F(TestTabletSchema, TestDeleteAndReAddColumn) { + std::vector > keys; + + // Insert and Mutate with the base schema + InsertRow(client_schema_, 1); + MutateRow(client_schema_, /* key= */ 1, /* col_idx= */ 1, /* new_val= */ 2); + + keys.clear(); + keys.push_back(std::pair("key=1", "c1=2")); + VerifyTabletRows(client_schema_, keys); + + // Switch schema to s2 + SchemaBuilder builder(tablet()->metadata()->schema()); + ASSERT_OK(builder.RemoveColumn("c1")); + // NOTE this new 'c1' will have a different id from the previous one + // so the data added to the previous 'c1' will not be visible. 
+ ASSERT_OK(builder.AddNullableColumn("c1", INT32)); + AlterSchema(builder.Build()); + Schema s2 = builder.BuildWithoutIds(); + + // Verify that the new 'c1' have the default value + keys.clear(); + keys.push_back(std::pair("key=1", "c1=NULL")); + VerifyTabletRows(s2, keys); +} + +// Verify modifying an empty MemRowSet +TEST_F(TestTabletSchema, TestModifyEmptyMemRowSet) { + std::vector > keys; + + // Switch schema to s2 + SchemaBuilder builder(tablet()->metadata()->schema()); + ASSERT_OK(builder.AddNullableColumn("c2", INT32)); + AlterSchema(builder.Build()); + Schema s2 = builder.BuildWithoutIds(); + + // Verify we can insert some new data. + // Inserts the row "(2, 2, 2)" + LocalTabletWriter writer(tablet().get(), &s2); + KuduPartialRow row(&s2); + CHECK_OK(row.SetInt32(0, 2)); + CHECK_OK(row.SetInt32(1, 2)); + CHECK_OK(row.SetInt32(2, 2)); + ASSERT_OK(writer.Insert(row)); + + vector rows; + ASSERT_OK(DumpTablet(*tablet(), s2, &rows)); + EXPECT_EQ("(int32 key=2, int32 c1=2, int32 c2=2)", rows[0]); + + // Update some columns. + MutateRow(s2, /* key= */ 2, /* col_idx= */ 2, /* new_val= */ 3); + ASSERT_OK(DumpTablet(*tablet(), s2, &rows)); + EXPECT_EQ("(int32 key=2, int32 c1=2, int32 c2=3)", rows[0]); + + MutateRow(s2, /* key= */ 2, /* col_idx= */ 1, /* new_val= */ 4); + ASSERT_OK(DumpTablet(*tablet(), s2, &rows)); + EXPECT_EQ("(int32 key=2, int32 c1=4, int32 c2=3)", rows[0]); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet-test-base.h b/src/kudu/tablet/tablet-test-base.h new file mode 100644 index 000000000000..6d726cc15af8 --- /dev/null +++ b/src/kudu/tablet/tablet-test-base.h @@ -0,0 +1,475 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TABLET_TEST_BASE_H +#define KUDU_TABLET_TABLET_TEST_BASE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row.h" +#include "kudu/common/scan_spec.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/env.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_graph.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/test_util.h" +#include "kudu/tablet/local_tablet_writer.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/gutil/strings/numbers.h" + +using std::unordered_set; +using strings::Substitute; + +namespace kudu { +namespace tablet { + +// The base class takes as a template argument a "setup" class +// which can customize the schema for the tests. This way we can +// get coverage on various schemas without duplicating test code. 
+struct StringKeyTestSetup { + static Schema CreateSchema() { + return Schema({ ColumnSchema("key", STRING), + ColumnSchema("key_idx", INT32), + ColumnSchema("val", INT32) }, + 1); + } + + void BuildRowKey(KuduPartialRow *row, int64_t key_idx) { + // This is called from multiple threads, so can't move this buffer + // to be a class member. However, it's likely to get inlined anyway + // and loop-hosted. + char buf[256]; + FormatKey(buf, sizeof(buf), key_idx); + CHECK_OK(row->SetStringCopy(0, Slice(buf))); + } + + // builds a row key from an existing row for updates + void BuildRowKeyFromExistingRow(KuduPartialRow *row, const RowBlockRow& src_row) { + CHECK_OK(row->SetStringCopy(0, *reinterpret_cast(src_row.cell_ptr(0)))); + } + + void BuildRow(KuduPartialRow *row, int64_t key_idx, int32_t val = 0) { + BuildRowKey(row, key_idx); + CHECK_OK(row->SetInt32(1, key_idx)); + CHECK_OK(row->SetInt32(2, val)); + } + + static void FormatKey(char *buf, size_t buf_size, int64_t key_idx) { + snprintf(buf, buf_size, "hello %" PRId64, key_idx); + } + + string FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + char buf[256]; + FormatKey(buf, sizeof(buf), key_idx); + + return Substitute( + "(string key=$0, int32 key_idx=$1, int32 val=$2)", + buf, key_idx, val); + } + + // Slices can be arbitrarily large + // but in practice tests won't overflow a uint64_t + uint64_t GetMaxRows() const { + return std::numeric_limits::max() - 1; + } +}; + +// Setup for testing composite keys +struct CompositeKeyTestSetup { + static Schema CreateSchema() { + return Schema({ ColumnSchema("key1", STRING), + ColumnSchema("key2", INT32), + ColumnSchema("key_idx", INT32), + ColumnSchema("val", INT32) }, + 2); + } + + // builds a row key from an existing row for updates + void BuildRowKeyFromExistingRow(KuduPartialRow *row, const RowBlockRow& src_row) { + CHECK_OK(row->SetStringCopy(0, *reinterpret_cast(src_row.cell_ptr(0)))); + CHECK_OK(row->SetInt32(1, *reinterpret_cast(src_row.cell_ptr(1)))); 
+ } + + static void FormatKey(char *buf, size_t buf_size, int64_t key_idx) { + snprintf(buf, buf_size, "hello %" PRId64, key_idx); + } + + string FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + char buf[256]; + FormatKey(buf, sizeof(buf), key_idx); + return Substitute( + "(string key1=$0, int32 key2=$1, int32 val=$2, int32 val=$3)", + buf, key_idx, key_idx, val); + } + + // Slices can be arbitrarily large + // but in practice tests won't overflow a uint64_t + uint64_t GetMaxRows() const { + return std::numeric_limits::max() - 1; + } +}; + +// Setup for testing integer keys +template +struct IntKeyTestSetup { + static Schema CreateSchema() { + return Schema({ ColumnSchema("key", Type), + ColumnSchema("key_idx", INT32), + ColumnSchema("val", INT32) }, 1); + } + + void BuildRowKey(KuduPartialRow *row, int64_t i) { + CHECK(false) << "Unsupported type"; + } + + // builds a row key from an existing row for updates + template + void BuildRowKeyFromExistingRow(KuduPartialRow *dst_row, const RowType& row) { + CHECK(false) << "Unsupported type"; + } + + void BuildRow(KuduPartialRow *row, int64_t key_idx, + int32_t val = 0) { + BuildRowKey(row, key_idx); + CHECK_OK(row->SetInt32(1, key_idx)); + CHECK_OK(row->SetInt32(2, val)); + } + + string FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + CHECK(false) << "Unsupported type"; + return ""; + } + + uint64_t GetMaxRows() const { + return std::numeric_limits::cpp_type>::max() - 1; + } +}; + +template<> +void IntKeyTestSetup::BuildRowKey(KuduPartialRow *row, int64_t i) { + CHECK_OK(row->SetInt8(0, (int8_t) i * (i % 2 == 0 ? -1 : 1))); +} + +template<> +void IntKeyTestSetup::BuildRowKey(KuduPartialRow *row, int64_t i) { + CHECK_OK(row->SetInt16(0, (int16_t) i * (i % 2 == 0 ? -1 : 1))); +} + +template<> +void IntKeyTestSetup::BuildRowKey(KuduPartialRow *row, int64_t i) { + CHECK_OK(row->SetInt32(0, (int32_t) i * (i % 2 == 0 ? 
-1 : 1))); +} + +template<> +void IntKeyTestSetup::BuildRowKey(KuduPartialRow *row, int64_t i) { + CHECK_OK(row->SetInt64(0, (int64_t) i * (i % 2 == 0 ? -1 : 1))); +} + +template<> template +void IntKeyTestSetup::BuildRowKeyFromExistingRow(KuduPartialRow *row, + const RowType& src_row) { + CHECK_OK(row->SetInt8(0, *reinterpret_cast(src_row.cell_ptr(0)))); +} + +template<> template +void IntKeyTestSetup::BuildRowKeyFromExistingRow(KuduPartialRow *row, + const RowType& src_row) { + CHECK_OK(row->SetInt16(0, *reinterpret_cast(src_row.cell_ptr(0)))); +} +template<> template +void IntKeyTestSetup::BuildRowKeyFromExistingRow(KuduPartialRow *row, + const RowType& src_row) { + CHECK_OK(row->SetInt32(0, *reinterpret_cast(src_row.cell_ptr(0)))); +} + +template<> template +void IntKeyTestSetup::BuildRowKeyFromExistingRow(KuduPartialRow *row, + const RowType& src_row) { + CHECK_OK(row->SetInt64(0, *reinterpret_cast(src_row.cell_ptr(0)))); +} + +template<> +string IntKeyTestSetup::FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + return Substitute( + "(int8 key=$0, int32 key_idx=$1, int32 val=$2)", + (key_idx % 2 == 0) ? -key_idx : key_idx, key_idx, val); +} + +template<> +string IntKeyTestSetup::FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + return Substitute( + "(int16 key=$0, int32 key_idx=$1, int32 val=$2)", + (key_idx % 2 == 0) ? -key_idx : key_idx, key_idx, val); +} + +template<> +string IntKeyTestSetup::FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + return Substitute( + "(int32 key=$0, int32 key_idx=$1, int32 val=$2)", + (key_idx % 2 == 0) ? -key_idx : key_idx, key_idx, val); +} + +template<> +string IntKeyTestSetup::FormatDebugRow(int64_t key_idx, int32_t val, bool updated) { + return Substitute( + "(int64 key=$0, int32 key_idx=$1, int32 val=$2)", + (key_idx % 2 == 0) ? 
-key_idx : key_idx, key_idx, val); +} + +// Setup for testing nullable columns +struct NullableValueTestSetup { + static Schema CreateSchema() { + return Schema({ ColumnSchema("key", INT32), + ColumnSchema("key_idx", INT32), + ColumnSchema("val", INT32, true) }, 1); + } + + void BuildRowKey(KuduPartialRow *row, int64_t i) { + CHECK_OK(row->SetInt32(0, (int32_t)i)); + } + + // builds a row key from an existing row for updates + template + void BuildRowKeyFromExistingRow(KuduPartialRow *row, const RowType& src_row) { + CHECK_OK(row->SetInt32(0, *reinterpret_cast(src_row.cell_ptr(0)))); + } + + void BuildRow(KuduPartialRow *row, int64_t key_idx, int32_t val = 0) { + BuildRowKey(row, key_idx); + CHECK_OK(row->SetInt32(1, key_idx)); + if (ShouldInsertAsNull(key_idx)) { + CHECK_OK(row->SetNull(2)); + } else { + CHECK_OK(row->SetInt32(2, val)); + } + } + + string FormatDebugRow(int64_t key_idx, int64_t val, bool updated) { + if (!updated && ShouldInsertAsNull(key_idx)) { + return Substitute( + "(int32 key=$0, int32 key_idx=$1, int32 val=NULL)", + (int32_t)key_idx, key_idx); + } + + return Substitute( + "(int32 key=$0, int32 key_idx=$1, int32 val=$2)", + (int32_t)key_idx, key_idx, val); + } + + static bool ShouldInsertAsNull(int64_t key_idx) { + return (key_idx & 2) != 0; + } + + uint64_t GetMaxRows() const { + return std::numeric_limits::max() - 1; + } +}; + +// Use this with TYPED_TEST_CASE from gtest +typedef ::testing::Types< + StringKeyTestSetup, + IntKeyTestSetup, + IntKeyTestSetup, + IntKeyTestSetup, + IntKeyTestSetup, + NullableValueTestSetup + > TabletTestHelperTypes; + +template +class TabletTestBase : public KuduTabletTest { + public: + TabletTestBase() : + KuduTabletTest(TESTSETUP::CreateSchema()), + setup_(), + max_rows_(setup_.GetMaxRows()), + arena_(1024, 4*1024*1024) + {} + + // Inserts "count" rows. 
+ void InsertTestRows(int64_t first_row, + int64_t count, + int32_t val, + TimeSeries *ts = NULL) { + + LocalTabletWriter writer(tablet().get(), &client_schema_); + KuduPartialRow row(&client_schema_); + + uint64_t inserted_since_last_report = 0; + for (int64_t i = first_row; i < first_row + count; i++) { + setup_.BuildRow(&row, i, val); + CHECK_OK(writer.Insert(row)); + + if ((inserted_since_last_report++ > 100) && ts) { + ts->AddValue(static_cast(inserted_since_last_report)); + inserted_since_last_report = 0; + } + } + + if (ts) { + ts->AddValue(static_cast(inserted_since_last_report)); + } + } + + // Inserts a single test row within a transaction. + Status InsertTestRow(LocalTabletWriter* writer, + int64_t key_idx, + int32_t val) { + KuduPartialRow row(&client_schema_); + setup_.BuildRow(&row, key_idx, val); + return writer->Insert(row); + } + + Status UpdateTestRow(LocalTabletWriter* writer, + int64_t key_idx, + int32_t new_val) { + KuduPartialRow row(&client_schema_); + setup_.BuildRowKey(&row, key_idx); + + // select the col to update (the third if there is only one key + // or the fourth if there are two col keys). + int col_idx = schema_.num_key_columns() == 1 ? 2 : 3; + CHECK_OK(row.SetInt32(col_idx, new_val)); + return writer->Update(row); + } + + Status UpdateTestRowToNull(LocalTabletWriter* writer, + int64_t key_idx) { + KuduPartialRow row(&client_schema_); + setup_.BuildRowKey(&row, key_idx); + + // select the col to update (the third if there is only one key + // or the fourth if there are two col keys). + int col_idx = schema_.num_key_columns() == 1 ? 
2 : 3; + CHECK_OK(row.SetNull(col_idx)); + return writer->Update(row); + } + + Status DeleteTestRow(LocalTabletWriter* writer, int64_t key_idx) { + KuduPartialRow row(&client_schema_); + setup_.BuildRowKey(&row, key_idx); + return writer->Delete(row); + } + + template + void VerifyRow(const RowType& row, int64_t key_idx, int32_t val) { + ASSERT_EQ(setup_.FormatDebugRow(key_idx, val, false), schema_.DebugRow(row)); + } + + void VerifyTestRows(int64_t first_row, uint64_t expected_count) { + gscoped_ptr iter; + ASSERT_OK(tablet()->NewRowIterator(client_schema_, &iter)); + ASSERT_OK(iter->Init(NULL)); + int batch_size = std::max( + (size_t)1, std::min((size_t)(expected_count / 10), + 4*1024*1024 / schema_.byte_size())); + Arena arena(32*1024, 256*1024); + RowBlock block(schema_, batch_size, &arena); + + if (expected_count > INT_MAX) { + LOG(INFO) << "Not checking rows for duplicates -- duplicates expected since " + << "there were more than " << INT_MAX << " rows inserted."; + return; + } + + // Keep a bitmap of which rows have been seen from the requested + // range. + std::vector seen_rows; + seen_rows.resize(expected_count); + + while (iter->HasNext()) { + ASSERT_OK_FAST(iter->NextBlock(&block)); + + RowBlockRow rb_row = block.row(0); + if (VLOG_IS_ON(2)) { + VLOG(2) << "Fetched batch of " << block.nrows() << "\n" + << "First row: " << schema_.DebugRow(rb_row); + } + + for (int i = 0; i < block.nrows(); i++) { + rb_row.Reset(&block, i); + int32_t key_idx = *schema_.ExtractColumnFromRow(rb_row, 1); + if (key_idx >= first_row && key_idx < first_row + expected_count) { + size_t rel_idx = key_idx - first_row; + if (seen_rows[rel_idx]) { + FAIL() << "Saw row " << key_idx << " twice!\n" + << "Row: " << schema_.DebugRow(rb_row); + } + seen_rows[rel_idx] = true; + } + } + } + + // Verify that all the rows were seen. 
+ for (int i = 0; i < expected_count; i++) { + ASSERT_EQ(true, seen_rows[i]) << "Never saw row: " << (i + first_row); + } + LOG(INFO) << "Successfully verified " << expected_count << "rows"; + } + + // Iterate through the full table, stringifying the resulting rows + // into the given vector. This is only useful in tests which insert + // a very small number of rows. + Status IterateToStringList(vector *out) { + gscoped_ptr iter; + RETURN_NOT_OK(this->tablet()->NewRowIterator(this->client_schema_, &iter)); + RETURN_NOT_OK(iter->Init(NULL)); + return kudu::tablet::IterateToStringList(iter.get(), out); + } + + // Return the number of rows in the tablet. + uint64_t TabletCount() const { + uint64_t count; + CHECK_OK(tablet()->CountRows(&count)); + return count; + } + + // because some types are small we need to + // make sure that we don't overflow the type on inserts + // or else we get errors because the key already exists + uint64_t ClampRowCount(uint64_t proposal) const { + uint64_t num_rows = min(max_rows_, proposal); + if (num_rows < proposal) { + LOG(WARNING) << "Clamping max rows to " << num_rows << " to prevent overflow"; + } + return num_rows; + } + + TESTSETUP setup_; + + const uint64_t max_rows_; + + Arena arena_; +}; + + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/tablet-test-util.h b/src/kudu/tablet/tablet-test-util.h new file mode 100644 index 000000000000..1b2553856a5a --- /dev/null +++ b/src/kudu/tablet/tablet-test-util.h @@ -0,0 +1,270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TABLET_TEST_UTIL_H +#define KUDU_TABLET_TABLET_TEST_UTIL_H + +#include +#include +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/tablet/row_op.h" +#include "kudu/tablet/tablet-harness.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/transactions/alter_schema_transaction.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" + +DECLARE_bool(enable_data_block_fsync); + +namespace kudu { +namespace tablet { + +using consensus::RaftConfigPB; +using std::string; +using std::vector; + +class KuduTabletTest : public KuduTest { + public: + explicit KuduTabletTest(const Schema& schema) + : schema_(schema.CopyWithColumnIds()), + client_schema_(schema) { + // Keep unit tests fast, but only if no one has set the flag explicitly. + if (google::GetCommandLineFlagInfoOrDie("enable_data_block_fsync").is_default) { + FLAGS_enable_data_block_fsync = false; + } + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + SetUpTestTablet(); + } + + void CreateTestTablet(const string& root_dir = "") { + string dir = root_dir.empty() ? 
GetTestPath("fs_root") : root_dir; + TabletHarness::Options opts(dir); + opts.enable_metrics = true; + bool first_time = harness_ == NULL; + harness_.reset(new TabletHarness(schema_, opts)); + CHECK_OK(harness_->Create(first_time)); + } + + void SetUpTestTablet(const string& root_dir = "") { + CreateTestTablet(root_dir); + CHECK_OK(harness_->Open()); + } + + void TabletReOpen(const string& root_dir = "") { + SetUpTestTablet(root_dir); + } + + const Schema &schema() const { + return schema_; + } + + const Schema &client_schema() const { + return client_schema_; + } + + server::Clock* clock() { + return harness_->clock(); + } + + FsManager* fs_manager() { + return harness_->fs_manager(); + } + + void AlterSchema(const Schema& schema) { + tserver::AlterSchemaRequestPB req; + req.set_schema_version(tablet()->metadata()->schema_version() + 1); + + AlterSchemaTransactionState tx_state(NULL, &req, NULL); + ASSERT_OK(tablet()->CreatePreparedAlterSchema(&tx_state, &schema)); + ASSERT_OK(tablet()->AlterSchema(&tx_state)); + tx_state.Finish(); + } + + const std::shared_ptr& tablet() const { + return harness_->tablet(); + } + + TabletHarness* harness() { + return harness_.get(); + } + + protected: + const Schema schema_; + const Schema client_schema_; + + gscoped_ptr harness_; +}; + +class KuduRowSetTest : public KuduTabletTest { + public: + explicit KuduRowSetTest(const Schema& schema) + : KuduTabletTest(schema) { + } + + virtual void SetUp() OVERRIDE { + KuduTabletTest::SetUp(); + ASSERT_OK(tablet()->metadata()->CreateRowSet(&rowset_meta_, + SchemaBuilder(schema_).Build())); + } + + Status FlushMetadata() { + return tablet()->metadata()->Flush(); + } + + protected: + std::shared_ptr rowset_meta_; +}; + +static inline Status IterateToStringList(RowwiseIterator *iter, + vector *out, + int limit = INT_MAX) { + out->clear(); + Schema schema = iter->schema(); + Arena arena(1024, 1024); + RowBlock block(schema, 100, &arena); + int fetched = 0; + while (iter->HasNext() && fetched < 
limit) { + RETURN_NOT_OK(iter->NextBlock(&block)); + for (size_t i = 0; i < block.nrows() && fetched < limit; i++) { + if (block.selection_vector()->IsRowSelected(i)) { + out->push_back(schema.DebugRow(block.row(i))); + fetched++; + } + } + } + return Status::OK(); +} + +// Performs snapshot reads, under each of the snapshots in 'snaps', and stores +// the results in 'collected_rows'. +static inline void CollectRowsForSnapshots(Tablet* tablet, + const Schema& schema, + const vector& snaps, + vector* >* collected_rows) { + for (const MvccSnapshot& snapshot : snaps) { + DVLOG(1) << "Snapshot: " << snapshot.ToString(); + gscoped_ptr iter; + ASSERT_OK(tablet->NewRowIterator(schema, + snapshot, + Tablet::UNORDERED, + &iter)); + ASSERT_OK(iter->Init(NULL)); + auto collector = new vector(); + ASSERT_OK(IterateToStringList(iter.get(), collector)); + for (const auto& mrs : *collector) { + DVLOG(1) << "Got from MRS: " << mrs; + } + collected_rows->push_back(collector); + } +} + +// Performs snapshot reads, under each of the snapshots in 'snaps', and verifies that +// the results match the ones in 'expected_rows'. +static inline void VerifySnapshotsHaveSameResult(Tablet* tablet, + const Schema& schema, + const vector& snaps, + const vector* >& expected_rows) { + int idx = 0; + // Now iterate again and make sure we get the same thing. 
+ for (const MvccSnapshot& snapshot : snaps) { + DVLOG(1) << "Snapshot: " << snapshot.ToString(); + gscoped_ptr iter; + ASSERT_OK(tablet->NewRowIterator(schema, + snapshot, + Tablet::UNORDERED, + &iter)); + ASSERT_OK(iter->Init(NULL)); + vector collector; + ASSERT_OK(IterateToStringList(iter.get(), &collector)); + ASSERT_EQ(collector.size(), expected_rows[idx]->size()); + + for (int i = 0; i < expected_rows[idx]->size(); i++) { + DVLOG(1) << "Got from DRS: " << collector[i]; + DVLOG(1) << "Expected: " << (*expected_rows[idx])[i]; + ASSERT_EQ((*expected_rows[idx])[i], collector[i]); + } + idx++; + } +} + +// Construct a new iterator from the given rowset, and dump +// all of its results into 'out'. The previous contents +// of 'out' are cleared. +static inline Status DumpRowSet(const RowSet &rs, + const Schema &projection, + const MvccSnapshot &snap, + vector *out, + int limit = INT_MAX) { + gscoped_ptr iter; + RETURN_NOT_OK(rs.NewRowIterator(&projection, snap, &iter)); + RETURN_NOT_OK(iter->Init(NULL)); + RETURN_NOT_OK(IterateToStringList(iter.get(), out, limit)); + return Status::OK(); +} + +// Take an un-initialized iterator, Init() it, and iterate through all of its rows. +// The resulting string contains a line per entry. +static inline string InitAndDumpIterator(gscoped_ptr iter) { + CHECK_OK(iter->Init(NULL)); + + vector out; + CHECK_OK(IterateToStringList(iter.get(), &out)); + return JoinStrings(out, "\n"); +} + +// Dump all of the rows of the tablet into the given vector. +static inline Status DumpTablet(const Tablet& tablet, + const Schema& projection, + vector* out) { + gscoped_ptr iter; + RETURN_NOT_OK(tablet.NewRowIterator(projection, &iter)); + RETURN_NOT_OK(iter->Init(NULL)); + std::vector rows; + RETURN_NOT_OK(IterateToStringList(iter.get(), &rows)); + std::sort(rows.begin(), rows.end()); + out->swap(rows); + return Status::OK(); +} + +// Write a single row to the given RowSetWriter (which may be of the rolling +// or non-rolling variety). 
+template +static Status WriteRow(const Slice &row_slice, RowSetWriterClass *writer) { + const Schema &schema = writer->schema(); + DCHECK_EQ(row_slice.size(), schema.byte_size()); + + RowBlock block(schema, 1, NULL); + ConstContiguousRow row(&schema, row_slice.data()); + RowBlockRow dst_row = block.row(0); + RETURN_NOT_OK(CopyRow(row, &dst_row, reinterpret_cast(NULL))); + + return writer->AppendBlock(block); +} + +} // namespace tablet +} // namespace kudu +#endif diff --git a/src/kudu/tablet/tablet-test.cc b/src/kudu/tablet/tablet-test.cc new file mode 100644 index 000000000000..f0d539779288 --- /dev/null +++ b/src/kudu/tablet/tablet-test.cc @@ -0,0 +1,962 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/row.h" +#include "kudu/common/scan_spec.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/local_tablet_writer.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/slice.h" +#include "kudu/util/test_macros.h" + +using std::shared_ptr; +using std::unordered_set; + +namespace kudu { +namespace tablet { + +using fs::ReadableBlock; + +DEFINE_int32(testflush_num_inserts, 1000, + "Number of rows inserted in TestFlush"); +DEFINE_int32(testiterator_num_inserts, 1000, + "Number of rows inserted in TestRowIterator/TestInsert"); +DEFINE_int32(testcompaction_num_rows, 1000, + "Number of rows per rowset in TestCompaction"); + +template +class TestTablet : public TabletTestBase { + typedef SETUP Type; + + public: + // Verify that iteration doesn't fail + void CheckCanIterate() { + vector out_rows; + ASSERT_OK(this->IterateToStringList(&out_rows)); + } + +}; +TYPED_TEST_CASE(TestTablet, TabletTestHelperTypes); + +TYPED_TEST(TestTablet, TestFlush) { + // Insert 1000 rows into memrowset + uint64_t max_rows = this->ClampRowCount(FLAGS_testflush_num_inserts); + this->InsertTestRows(0, max_rows, 0); + + // Flush it. + ASSERT_OK(this->tablet()->Flush()); + TabletMetadata* tablet_meta = this->tablet()->metadata(); + + // Make sure the files were created as expected. 
+ RowSetMetadata* rowset_meta = tablet_meta->GetRowSetForTests(0); + CHECK(rowset_meta) << "No row set found"; + ASSERT_TRUE(rowset_meta->HasDataForColumnIdForTests(this->schema_.column_id(0))); + ASSERT_TRUE(rowset_meta->HasDataForColumnIdForTests(this->schema_.column_id(1))); + ASSERT_TRUE(rowset_meta->HasDataForColumnIdForTests(this->schema_.column_id(2))); + ASSERT_TRUE(rowset_meta->HasBloomDataBlockForTests()); + + // check that undo deltas are present + vector undo_blocks = rowset_meta->undo_delta_blocks(); + ASSERT_EQ(1, undo_blocks.size()); + + // Read the undo delta, we should get one undo mutation (delete) for each row. + gscoped_ptr block; + ASSERT_OK(this->fs_manager()->OpenBlock(undo_blocks[0], &block)); + + shared_ptr dfr; + ASSERT_OK(DeltaFileReader::Open(block.Pass(), undo_blocks[0], &dfr, UNDO)); + // Assert there were 'max_rows' deletions in the undo delta (one for each inserted row) + ASSERT_EQ(dfr->delta_stats().delete_count(), max_rows); +} + +// Test that historical data for a row is maintained even after the row +// is flushed from the memrowset. +TYPED_TEST(TestTablet, TestInsertsAndMutationsAreUndoneWithMVCCAfterFlush) { + // Insert 5 rows into the memrowset. + // After the first one, each time we insert a new row we mutate + // the previous one. 
+ + // Take snapshots after each operation + vector snaps; + snaps.push_back(MvccSnapshot(*this->tablet()->mvcc_manager())); + + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + for (int i = 0; i < 5; i++) { + this->InsertTestRows(i, 1, 0); + DVLOG(1) << "Inserted row=" << i << ", row_idx=" << i << ", val=0"; + MvccSnapshot ins_snaphsot(*this->tablet()->mvcc_manager()); + snaps.push_back(ins_snaphsot); + LOG(INFO) << "After Insert Snapshot: " << ins_snaphsot.ToString(); + if (i > 0) { + ASSERT_OK(this->UpdateTestRow(&writer, i - 1, i)); + DVLOG(1) << "Mutated row=" << i - 1 << ", row_idx=" << i - 1 << ", val=" << i; + MvccSnapshot mut_snaphsot(*this->tablet()->mvcc_manager()); + snaps.push_back(mut_snaphsot); + DVLOG(1) << "After Mutate Snapshot: " << mut_snaphsot.ToString(); + } + } + + // Collect the expected rows from the MRS, where there are no + // undos + vector* > expected_rows; + CollectRowsForSnapshots(this->tablet().get(), this->client_schema_, + snaps, &expected_rows); + + // Flush the tablet + ASSERT_OK(this->tablet()->Flush()); + + // Now verify that with undos we get the same thing. + VerifySnapshotsHaveSameResult(this->tablet().get(), this->client_schema_, + snaps, expected_rows); + + // Do some more work and flush/compact + // take a snapshot and mutate the rows so that we have undos and + // redos + snaps.push_back(MvccSnapshot(*this->tablet()->mvcc_manager())); +// + for (int i = 0; i < 4; i++) { + ASSERT_OK(this->UpdateTestRow(&writer, i, i + 10)); + DVLOG(1) << "Mutated row=" << i << ", row_idx=" << i << ", val=" << i + 10; + MvccSnapshot mut_snaphsot(*this->tablet()->mvcc_manager()); + snaps.push_back(mut_snaphsot); + DVLOG(1) << "After Mutate Snapshot: " << mut_snaphsot.ToString(); + } + + // also throw a delete in there. 
+ ASSERT_OK(this->DeleteTestRow(&writer, 4)); + MvccSnapshot delete_snaphsot(*this->tablet()->mvcc_manager()); + snaps.push_back(delete_snaphsot); + DVLOG(1) << "After Delete Snapshot: " << delete_snaphsot.ToString(); + + // Collect the expected rows now that we have undos and redos + STLDeleteElements(&expected_rows); + CollectRowsForSnapshots(this->tablet().get(), this->client_schema_, + snaps, &expected_rows); + + // now flush and the compact everything + ASSERT_OK(this->tablet()->Flush()); + ASSERT_OK(this->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + + // Now verify that with undos and redos we get the same thing. + VerifySnapshotsHaveSameResult(this->tablet().get(), this->client_schema_, + snaps, expected_rows); + + STLDeleteElements(&expected_rows); +} + +// This tests KUDU-165, a regression where multiple old ghost rows were appearing in +// compaction outputs and sometimes would be selected as the most recent version +// of the row. +// In particular this makes sure that when there is a ghost row in one row set +// and a live one on another the live one is the only one that survives compaction. +TYPED_TEST(TestTablet, TestGhostRowsOnDiskRowSets) { + // Create a few INSERT/DELETE pairs on-disk by writing and flushing. + // Each of the resulting rowsets has a single row which is a "ghost" since its + // redo data has the DELETE. + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + + for (int i = 0; i < 3; i++) { + CHECK_OK(this->InsertTestRow(&writer, 0, 0)); + this->DeleteTestRow(&writer, 0); + ASSERT_OK(this->tablet()->Flush()); + } + + // Create one more rowset on disk which has just an INSERT (ie a non-ghost row). + CHECK_OK(this->InsertTestRow(&writer, 0, 0)); + ASSERT_OK(this->tablet()->Flush()); + + // Compact. This should result in a rowset with just one row in it. + ASSERT_OK(this->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + + // Should still be able to update, since the row is live. 
+ ASSERT_OK(this->UpdateTestRow(&writer, 0, 1)); +} + +// Test that inserting a row which already exists causes an AlreadyPresent +// error +TYPED_TEST(TestTablet, TestInsertDuplicateKey) { + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + + CHECK_OK(this->InsertTestRow(&writer, 12345, 0)); + ASSERT_FALSE(writer.last_op_result().has_failed_status()); + + // Insert again, should fail! + Status s = this->InsertTestRow(&writer, 12345, 0); + ASSERT_STR_CONTAINS(s.ToString(), "entry already present in memrowset"); + + ASSERT_EQ(1, this->TabletCount()); + + // Flush, and make sure that inserting duplicate still fails + ASSERT_OK(this->tablet()->Flush()); + + ASSERT_EQ(1, this->TabletCount()); + + s = this->InsertTestRow(&writer, 12345, 0); + ASSERT_STR_CONTAINS(s.ToString(), "key already present"); + ASSERT_EQ(1, this->TabletCount()); +} + + +// Test flushes and compactions dealing with deleted rows. +TYPED_TEST(TestTablet, TestDeleteWithFlushAndCompact) { + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + CHECK_OK(this->InsertTestRow(&writer, 0, 0)); + ASSERT_OK(this->DeleteTestRow(&writer, 0)); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).mrs_id()); + + // The row is deleted, so we shouldn't see it in the iterator. + vector rows; + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(0, rows.size()); + + // Flush the tablet and make sure the data doesn't re-appear. + ASSERT_OK(this->tablet()->Flush()); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(0, rows.size()); + + // Re-inserting should succeed. This will reinsert into the MemRowSet. + // Set the int column to '1' this time, so we can differentiate the two + // versions of the row. + CHECK_OK(this->InsertTestRow(&writer, 0, 1)); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 1, false), rows[0]); + + // Flush again, so the DiskRowSet has the row. 
+ ASSERT_OK(this->tablet()->Flush()); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 1, false), rows[0]); + + // Delete it again, now that it's in DRS. + ASSERT_OK(this->DeleteTestRow(&writer, 0)); + ASSERT_EQ(1, writer.last_op_result().mutated_stores_size()); + ASSERT_EQ(1L, writer.last_op_result().mutated_stores(0).rs_id()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).dms_id()); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(0, rows.size()); + + // We now have an INSERT in the MemRowSet and the + // deleted row in the DiskRowSet. The new version + // of the row has '2' in the int column. + CHECK_OK(this->InsertTestRow(&writer, 0, 2)); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 2, false), rows[0]); + + // Flush - now we have the row in two different DRSs. + ASSERT_OK(this->tablet()->Flush()); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 2, false), rows[0]); + + // Compaction should succeed even with the duplicate rows. + ASSERT_OK(this->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 2, false), rows[0]); +} + +// Test flushes dealing with REINSERT mutations in the MemRowSet. +TYPED_TEST(TestTablet, TestFlushWithReinsert) { + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + // Insert, delete, and re-insert a row in the MRS. + + CHECK_OK(this->InsertTestRow(&writer, 0, 0)); + ASSERT_OK(this->DeleteTestRow(&writer, 0)); + ASSERT_EQ(1, writer.last_op_result().mutated_stores_size()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).mrs_id()); + CHECK_OK(this->InsertTestRow(&writer, 0, 1)); + + // Flush the tablet and make sure the data persists. 
+ ASSERT_OK(this->tablet()->Flush()); + vector rows; + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 1, false), rows[0]); +} + +// Test flushes dealing with REINSERT mutations if they arrive in the middle +// of a flush. +TYPED_TEST(TestTablet, TestReinsertDuringFlush) { + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + // Insert/delete/insert/delete in MemRowStore. + + CHECK_OK(this->InsertTestRow(&writer, 0, 0)); + ASSERT_OK(this->DeleteTestRow(&writer, 0)); + ASSERT_EQ(1, writer.last_op_result().mutated_stores_size()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).mrs_id()); + + CHECK_OK(this->InsertTestRow(&writer, 0, 1)); + ASSERT_OK(this->DeleteTestRow(&writer, 0)); + + ASSERT_EQ(1, writer.last_op_result().mutated_stores_size()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).mrs_id()); + + // During the snapshot flush, insert/delete/insert some more during the flush. + class MyCommonHooks : public Tablet::FlushCompactCommonHooks { + public: + explicit MyCommonHooks(TestFixture *test) : test_(test) {} + + Status PostWriteSnapshot() OVERRIDE { + LocalTabletWriter writer(test_->tablet().get(), &test_->client_schema()); + test_->InsertTestRow(&writer, 0, 1); + CHECK_OK(test_->DeleteTestRow(&writer, 0)); + CHECK_EQ(1, writer.last_op_result().mutated_stores_size()); + CHECK_EQ(1L, writer.last_op_result().mutated_stores(0).mrs_id()); + test_->InsertTestRow(&writer, 0, 2); + CHECK_OK(test_->DeleteTestRow(&writer, 0)); + CHECK_EQ(1, writer.last_op_result().mutated_stores_size()); + CHECK_EQ(1L, writer.last_op_result().mutated_stores(0).mrs_id()); + test_->InsertTestRow(&writer, 0, 3); + return Status::OK(); + } + + private: + TestFixture *test_; + }; + shared_ptr common_hooks( + reinterpret_cast(new MyCommonHooks(this))); + this->tablet()->SetFlushCompactCommonHooksForTests(common_hooks); + + // Flush the tablet and make sure the data persists. 
+ ASSERT_OK(this->tablet()->Flush()); + vector rows; + ASSERT_OK(this->IterateToStringList(&rows)); + ASSERT_EQ(1, rows.size()); + EXPECT_EQ(this->setup_.FormatDebugRow(0, 3, false), rows[0]); +} + +// Test iterating over a tablet which contains data +// in the memrowset as well as two rowsets. This simple test +// only puts one row in each with no updates. +TYPED_TEST(TestTablet, TestRowIteratorSimple) { + const int kInRowSet1 = 1; + const int kInRowSet2 = 2; + const int kInMemRowSet = 3; + + // Put a row in disk rowset 1 (insert and flush) + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + CHECK_OK(this->InsertTestRow(&writer, kInRowSet1, 0)); + ASSERT_OK(this->tablet()->Flush()); + + // Put a row in disk rowset 2 (insert and flush) + CHECK_OK(this->InsertTestRow(&writer, kInRowSet2, 0)); + ASSERT_OK(this->tablet()->Flush()); + + // Put a row in memrowset + CHECK_OK(this->InsertTestRow(&writer, kInMemRowSet, 0)); + + // Now iterate the tablet and make sure the rows show up + gscoped_ptr iter; + ASSERT_OK(this->tablet()->NewRowIterator(this->client_schema_, &iter)); + ASSERT_OK(iter->Init(nullptr)); + + ASSERT_TRUE(iter->HasNext()); + + RowBlock block(this->schema_, 100, &this->arena_); + + // First call to CopyNextRows should fetch the whole memrowset. 
+ ASSERT_OK_FAST(iter->NextBlock(&block)); + ASSERT_EQ(1, block.nrows()) << "should get only the one row from memrowset"; + this->VerifyRow(block.row(0), kInMemRowSet, 0); + + // Next, should fetch the older rowset + ASSERT_TRUE(iter->HasNext()); + ASSERT_OK(iter->NextBlock(&block)); + ASSERT_EQ(1, block.nrows()) << "should get only the one row from rowset 1"; + this->VerifyRow(block.row(0), kInRowSet1, 0); + + // Next, should fetch the newer rowset + ASSERT_TRUE(iter->HasNext()); + ASSERT_OK(iter->NextBlock(&block)); + ASSERT_EQ(1, block.nrows()) << "should get only the one row from rowset 2"; + this->VerifyRow(block.row(0), kInRowSet2, 0); + + ASSERT_FALSE(iter->HasNext()); +} + +TYPED_TEST(TestTablet, TestRowIteratorOrdered) { + // Create interleaved keys in each rowset, so they are clearly not in order + const int kNumRows = 128; + const int kNumBatches = 4; + LOG(INFO) << "Schema: " << this->schema_.ToString(); + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + for (int i = 0; i < kNumBatches; i++) { + ASSERT_OK(this->tablet()->Flush()); + for (int j = 0; j < kNumRows; j++) { + if (j % kNumBatches == i) { + LOG(INFO) << "Inserting row " << j; + CHECK_OK(this->InsertTestRow(&writer, 654321+j, j)); + } + } + } + + MvccSnapshot snap(*this->tablet()->mvcc_manager()); + // Iterate through with a few different block sizes. + for (int numBlocks = 1; numBlocks < 5; numBlocks*=2) { + const int rowsPerBlock = kNumRows / numBlocks; + // Make a new ordered iterator for the current snapshot. + gscoped_ptr iter; + + ASSERT_OK(this->tablet()->NewRowIterator(this->client_schema_, snap, Tablet::ORDERED, &iter)); + ASSERT_OK(iter->Init(nullptr)); + + // Iterate the tablet collecting rows. 
+ vector > rows; + for (int i = 0; i < numBlocks; i++) { + RowBlock block(this->schema_, rowsPerBlock, &this->arena_); + ASSERT_TRUE(iter->HasNext()); + ASSERT_OK(iter->NextBlock(&block)); + ASSERT_EQ(rowsPerBlock, block.nrows()) << "unexpected number of rows returned"; + for (int j = 0; j < rowsPerBlock; j++) { + RowBlockRow row = block.row(j); + shared_ptr encoded(new faststring()); + this->client_schema_.EncodeComparableKey(row, encoded.get()); + rows.push_back(encoded); + } + } + // Verify the collected rows, checking that they are sorted. + for (int j = 1; j < rows.size(); j++) { + // Use the schema for comparison, since this test is run with different schemas. + ASSERT_LT((*rows[j-1]).ToString(), (*rows[j]).ToString()); + } + ASSERT_FALSE(iter->HasNext()); + ASSERT_EQ(kNumRows, rows.size()); + } +} + + +template +bool TestSetupExpectsNulls(int32_t key_idx) { + return false; +} + +template<> +bool TestSetupExpectsNulls(int32_t key_idx) { + // If it's a row that the test updates, then we should expect null + // based on whether it updated to NULL or away from NULL. + bool should_update = (key_idx % 2 == 1); + if (should_update) { + return (key_idx % 10 == 1); + } + + // Otherwise, expect whatever was inserted. + return NullableValueTestSetup::ShouldInsertAsNull(key_idx); +} + +// Test iterating over a tablet which has a memrowset +// and several rowsets, each with many rows of data. 
+TYPED_TEST(TestTablet, TestRowIteratorComplex) { + + uint64_t max_rows = this->ClampRowCount(FLAGS_testiterator_num_inserts); + + // Put a row in disk rowset 1 (insert and flush) + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + for (int32_t i = 0; i < max_rows; i++) { + ASSERT_OK_FAST(this->InsertTestRow(&writer, i, 0)); + + if (i % 300 == 0) { + LOG(INFO) << "Flushing after " << i << " rows inserted"; + ASSERT_OK(this->tablet()->Flush()); + } + } + LOG(INFO) << "Successfully inserted " << max_rows << " rows"; + + // At this point, we should have several rowsets as well + // as some data in memrowset. + + // Update a subset of the rows + for (int32_t i = 0; i < max_rows; i++) { + bool should_update = (i % 2 == 1); + if (!should_update) continue; + + bool set_to_null = TestSetupExpectsNulls(i); + if (set_to_null) { + this->UpdateTestRowToNull(&writer, i); + } else { + ASSERT_OK_FAST(this->UpdateTestRow(&writer, i, i)); + } + } + + // Now iterate the tablet and make sure the rows show up. + gscoped_ptr iter; + const Schema& schema = this->client_schema_; + ASSERT_OK(this->tablet()->NewRowIterator(schema, &iter)); + ASSERT_OK(iter->Init(nullptr)); + LOG(INFO) << "Created iter: " << iter->ToString(); + + vector seen(max_rows, false); + int seen_count = 0; + + RowBlock block(schema, 100, &this->arena_); + while (iter->HasNext()) { + this->arena_.Reset(); + ASSERT_OK(iter->NextBlock(&block)); + LOG(INFO) << "Fetched batch of " << block.nrows(); + for (size_t i = 0; i < block.nrows(); i++) { + SCOPED_TRACE(schema.DebugRow(block.row(i))); + // Verify that we see each key exactly once. 
+ int32_t key_idx = *schema.ExtractColumnFromRow(block.row(i), 1); + if (seen[key_idx]) { + FAIL() << "Saw row " << key_idx << " multiple times"; + } + seen[key_idx] = true; + seen_count++; + + // Verify that we see the correctly updated value + const int32_t* val = schema.ExtractColumnFromRow(block.row(i), 2); + + bool set_to_null = TestSetupExpectsNulls(key_idx); + bool should_update = (key_idx % 2 == 1); + if (val == nullptr) { + ASSERT_TRUE(set_to_null); + } else if (should_update) { + ASSERT_EQ(key_idx, *val); + } else { + ASSERT_EQ(0, *val); + } + } + } + + ASSERT_EQ(seen_count, max_rows) + << "expected to see all inserted data through iterator."; +} + +// Test that, when a tablet has flushed data and is +// reopened, that the data persists +TYPED_TEST(TestTablet, TestInsertsPersist) { + uint64_t max_rows = this->ClampRowCount(FLAGS_testiterator_num_inserts); + + this->InsertTestRows(0, max_rows, 0); + ASSERT_EQ(max_rows, this->TabletCount()); + + // Flush it. + ASSERT_OK(this->tablet()->Flush()); + + ASSERT_EQ(max_rows, this->TabletCount()); + + // Close and re-open tablet + this->TabletReOpen(); + + // Ensure that rows exist + ASSERT_EQ(max_rows, this->TabletCount()); + this->VerifyTestRows(0, max_rows); + + // TODO: add some more data, re-flush +} + +// Test that when a row has been updated many times, it always yields +// the most recent value. +TYPED_TEST(TestTablet, TestMultipleUpdates) { + // Insert and update several times in MemRowSet + LocalTabletWriter writer(this->tablet().get(), &this->client_schema_); + CHECK_OK(this->InsertTestRow(&writer, 0, 0)); + ASSERT_OK(this->UpdateTestRow(&writer, 0, 1)); + ASSERT_EQ(1, writer.last_op_result().mutated_stores_size()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).mrs_id()); + ASSERT_OK(this->UpdateTestRow(&writer, 0, 2)); + ASSERT_OK(this->UpdateTestRow(&writer, 0, 3)); + + // Should see most recent value. 
+ vector out_rows; + ASSERT_OK(this->IterateToStringList(&out_rows)); + ASSERT_EQ(1, out_rows.size()); + ASSERT_EQ(this->setup_.FormatDebugRow(0, 3, false), out_rows[0]); + + // Flush it. + ASSERT_OK(this->tablet()->Flush()); + + // Should still see most recent value. + ASSERT_OK(this->IterateToStringList(&out_rows)); + ASSERT_EQ(1, out_rows.size()); + ASSERT_EQ(this->setup_.FormatDebugRow(0, 3, false), out_rows[0]); + + // Update the row a few times in DeltaMemStore + ASSERT_OK(this->UpdateTestRow(&writer, 0, 4)); + ASSERT_EQ(1, writer.last_op_result().mutated_stores_size()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).rs_id()); + ASSERT_EQ(0L, writer.last_op_result().mutated_stores(0).dms_id()); + ASSERT_OK(this->UpdateTestRow(&writer, 0, 5)); + ASSERT_OK(this->UpdateTestRow(&writer, 0, 6)); + + // Should still see most recent value. + ASSERT_OK(this->IterateToStringList(&out_rows)); + ASSERT_EQ(1, out_rows.size()); + ASSERT_EQ(this->setup_.FormatDebugRow(0, 6, false), out_rows[0]); + + + // Force a compaction after adding a new rowset with one row. + CHECK_OK(this->InsertTestRow(&writer, 1, 0)); + ASSERT_OK(this->tablet()->Flush()); + ASSERT_EQ(2, this->tablet()->num_rowsets()); + + ASSERT_OK(this->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + ASSERT_EQ(1, this->tablet()->num_rowsets()); + + // Should still see most recent value. 
+ ASSERT_OK(this->IterateToStringList(&out_rows)); + ASSERT_EQ(2, out_rows.size()); + ASSERT_EQ(this->setup_.FormatDebugRow(0, 6, false), out_rows[0]); + ASSERT_EQ(this->setup_.FormatDebugRow(1, 0, false), out_rows[1]); +} + + + +TYPED_TEST(TestTablet, TestCompaction) { + uint64_t max_rows = this->ClampRowCount(FLAGS_testcompaction_num_rows); + + uint64_t n_rows = max_rows / 3; + // Create three rowsets by inserting and flushing + LOG_TIMING(INFO, "Inserting rows") { + this->InsertTestRows(0, n_rows, 0); + + LOG_TIMING(INFO, "Flushing rows") { + ASSERT_OK(this->tablet()->Flush()); + } + + // first MemRowSet had id 0, current one should be 1 + ASSERT_EQ(1, this->tablet()->CurrentMrsIdForTests()); + ASSERT_TRUE( + this->tablet()->metadata()->GetRowSetForTests(0)->HasDataForColumnIdForTests( + this->schema_.column_id(0))); + } + + LOG_TIMING(INFO, "Inserting rows") { + this->InsertTestRows(n_rows, n_rows, 0); + + LOG_TIMING(INFO, "Flushing rows") { + ASSERT_OK(this->tablet()->Flush()); + } + + // previous MemRowSet had id 1, current one should be 2 + ASSERT_EQ(2, this->tablet()->CurrentMrsIdForTests()); + ASSERT_TRUE( + this->tablet()->metadata()->GetRowSetForTests(1)->HasDataForColumnIdForTests( + this->schema_.column_id(0))); + } + + LOG_TIMING(INFO, "Inserting rows") { + this->InsertTestRows(n_rows * 2, n_rows, 0); + + LOG_TIMING(INFO, "Flushing rows") { + ASSERT_OK(this->tablet()->Flush()); + } + + // previous MemRowSet had id 2, current one should be 3 + ASSERT_EQ(3, this->tablet()->CurrentMrsIdForTests()); + ASSERT_TRUE( + this->tablet()->metadata()->GetRowSetForTests(2)->HasDataForColumnIdForTests( + this->schema_.column_id(0))); + } + + // Issue compaction + LOG_TIMING(INFO, "Compacting rows") { + + ASSERT_OK(this->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + // Compaction does not swap the memrowsets so we should still get 3 + ASSERT_EQ(3, this->tablet()->CurrentMrsIdForTests()); + ASSERT_EQ(n_rows * 3, this->TabletCount()); + + const RowSetMetadata 
*rowset_meta = this->tablet()->metadata()->GetRowSetForTests(3); + ASSERT_TRUE(rowset_meta != nullptr); + ASSERT_TRUE(rowset_meta->HasDataForColumnIdForTests(this->schema_.column_id(0))); + ASSERT_TRUE(rowset_meta->HasBloomDataBlockForTests()); + } + + // Old rowsets should not exist anymore + for (int i = 0; i <= 2; i++) { + const RowSetMetadata *rowset_meta = this->tablet()->metadata()->GetRowSetForTests(i); + ASSERT_TRUE(rowset_meta == nullptr); + } +} + +enum MutationType { + MRS_MUTATION, + DELTA_MUTATION, + DUPLICATED_MUTATION +}; + +// Hook used by the Test*WithConcurrentMutation tests. +// +// Every time one of these hooks triggers, it inserts a row starting +// at row 20 (and increasing), and updates a row starting at row 10 +// (and increasing). +template +class MyCommonHooks : public Tablet::FlushCompactCommonHooks { + public: + explicit MyCommonHooks(TestFixture *test, bool flushed) + : test_(test), + flushed_(flushed), + i_(0) { + } + Status DoHook(MutationType expected_mutation_type) { + LocalTabletWriter writer(test_->tablet().get(), &test_->client_schema()); + RETURN_NOT_OK(test_->DeleteTestRow(&writer, i_)); + + switch (expected_mutation_type) { + case MRS_MUTATION: + CHECK_EQ(1, writer.last_op_result().mutated_stores_size()); + CHECK(writer.last_op_result().mutated_stores(0).has_mrs_id()); + break; + case DELTA_MUTATION: + CHECK_EQ(1, writer.last_op_result().mutated_stores_size()); + CHECK(writer.last_op_result().mutated_stores(0).has_rs_id()); + CHECK(writer.last_op_result().mutated_stores(0).has_dms_id()); + break; + case DUPLICATED_MUTATION: + CHECK_EQ(2, writer.last_op_result().mutated_stores_size()); + break; + } + RETURN_NOT_OK(test_->UpdateTestRow(&writer, 10 + i_, 1000 + i_)); + test_->InsertTestRows(20 + i_, 1, 0); + test_->CheckCanIterate(); + i_++; + return Status::OK(); + } + + virtual Status PostTakeMvccSnapshot() OVERRIDE { + // before we flush we update the MemRowSet afterwards we update the + // DeltaMemStore + if (!flushed_) { + 
return DoHook(MRS_MUTATION); + } else { + return DoHook(DELTA_MUTATION); + } + } + virtual Status PostWriteSnapshot() OVERRIDE { + if (!flushed_) { + return DoHook(MRS_MUTATION); + } else { + return DoHook(DELTA_MUTATION); + } + } + virtual Status PostSwapInDuplicatingRowSet() OVERRIDE { + return DoHook(DUPLICATED_MUTATION); + } + virtual Status PostReupdateMissedDeltas() OVERRIDE { + return DoHook(DUPLICATED_MUTATION); + } + virtual Status PostSwapNewRowSet() OVERRIDE { + return DoHook(DELTA_MUTATION); + } + protected: + TestFixture *test_; + bool flushed_; + int i_; +}; + +template +class MyFlushHooks : public Tablet::FlushFaultHooks, public MyCommonHooks { + public: + explicit MyFlushHooks(TestFixture *test, bool flushed) : + MyCommonHooks(test, flushed) {} + virtual Status PostSwapNewMemRowSet() { return this->DoHook(MRS_MUTATION); } +}; + +template +class MyCompactHooks : public Tablet::CompactionFaultHooks, public MyCommonHooks { + public: + explicit MyCompactHooks(TestFixture *test, bool flushed) : + MyCommonHooks(test, flushed) {} + Status PostSelectIterators() { return this->DoHook(DELTA_MUTATION); } +}; + +// Test for Flush with concurrent update, delete and insert during the +// various phases. +TYPED_TEST(TestTablet, TestFlushWithConcurrentMutation) { + this->InsertTestRows(0, 7, 0); // 0-6 inclusive: these rows will be deleted + this->InsertTestRows(10, 7, 0); // 10-16 inclusive: these rows will be updated + // Rows 20-26 inclusive will be inserted during the flush + + // Inject hooks which mutate those rows and add more rows at + // each key stage of flushing. + shared_ptr > hooks(new MyFlushHooks(this, false)); + this->tablet()->SetFlushHooksForTests(hooks); + this->tablet()->SetFlushCompactCommonHooksForTests(hooks); + + // First hook before we do the Flush + ASSERT_OK(hooks->DoHook(MRS_MUTATION)); + + // Then do the flush with the hooks enabled. + ASSERT_OK(this->tablet()->Flush()); + + // Now verify that the results saw all the mutated_stores. 
+ vector out_rows; + ASSERT_OK(this->IterateToStringList(&out_rows)); + std::sort(out_rows.begin(), out_rows.end()); + + vector expected_rows; + expected_rows.push_back(this->setup_.FormatDebugRow(10, 1000, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(11, 1001, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(12, 1002, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(13, 1003, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(14, 1004, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(15, 1005, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(16, 1006, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(20, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(21, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(22, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(23, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(24, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(25, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(26, 0, false)); + + std::sort(expected_rows.begin(), expected_rows.end()); + + // Verify that all the inserts and updates arrived and persisted. + LOG(INFO) << "Expected: " << JoinStrings(expected_rows, "\n"); + + // Verify that all the inserts and updates arrived and persisted. + LOG(INFO) << "Results: " << JoinStrings(out_rows, "\n"); + + ASSERT_EQ(expected_rows.size(), out_rows.size()); + vector::const_iterator exp_it = expected_rows.begin(); + for (vector::const_iterator out_it = out_rows.begin(); out_it!= out_rows.end();) { + ASSERT_EQ(*out_it, *exp_it); + out_it++; + exp_it++; + } +} + +// Test for compaction with concurrent update and insert during the +// various phases. +TYPED_TEST(TestTablet, TestCompactionWithConcurrentMutation) { + // Create three rowsets by inserting and flushing. 
+ // The rows from these layers will get updated or deleted during the flush: + // - rows 0-6 inclusive will be deleted + // - rows 10-16 inclusive will be updated + + this->InsertTestRows(0, 2, 0); // rows 0-1 + this->InsertTestRows(10, 2, 0); // rows 10-11 + ASSERT_OK(this->tablet()->Flush()); + + this->InsertTestRows(2, 2, 0); // rows 2-3 + this->InsertTestRows(12, 2, 0); // rows 12-13 + ASSERT_OK(this->tablet()->Flush()); + + this->InsertTestRows(4, 3, 0); // rows 4-6 + this->InsertTestRows(14, 3, 0); // rows 14-16 + ASSERT_OK(this->tablet()->Flush()); + + // Rows 20-26 inclusive will be inserted during the flush. + + shared_ptr > hooks(new MyCompactHooks(this, true)); + this->tablet()->SetCompactionHooksForTests(hooks); + this->tablet()->SetFlushCompactCommonHooksForTests(hooks); + + // First hook pre-compaction. + ASSERT_OK(hooks->DoHook(DELTA_MUTATION)); + + // Issue compaction + ASSERT_OK(this->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + + // Grab the resulting data into a vector. 
+ vector out_rows; + ASSERT_OK(this->IterateToStringList(&out_rows)); + std::sort(out_rows.begin(), out_rows.end()); + + vector expected_rows; + expected_rows.push_back(this->setup_.FormatDebugRow(10, 1000, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(11, 1001, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(12, 1002, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(13, 1003, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(14, 1004, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(15, 1005, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(16, 1006, true)); + expected_rows.push_back(this->setup_.FormatDebugRow(20, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(21, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(22, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(23, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(24, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(25, 0, false)); + expected_rows.push_back(this->setup_.FormatDebugRow(26, 0, false)); + + std::sort(expected_rows.begin(), expected_rows.end()); + + ASSERT_EQ(expected_rows.size(), out_rows.size()); + + // Verify that all the inserts and updates arrived and persisted. + LOG(INFO) << "Expected: " << JoinStrings(expected_rows, "\n"); + + // Verify that all the inserts and updates arrived and persisted. 
+ LOG(INFO) << "Results: " << JoinStrings(out_rows, "\n"); + + vector::const_iterator exp_it = expected_rows.begin(); + for (vector::const_iterator out_it = out_rows.begin(); out_it!= out_rows.end();) { + ASSERT_EQ(*out_it, *exp_it); + out_it++; + exp_it++; + } +} + +// Test that metrics behave properly during tablet initialization +TYPED_TEST(TestTablet, TestMetricsInit) { + // Create a tablet, but do not open it + this->CreateTestTablet(); + MetricRegistry* registry = this->harness()->metrics_registry(); + std::stringstream out; + JsonWriter writer(&out, JsonWriter::PRETTY); + ASSERT_OK(registry->WriteAsJson(&writer, { "*" }, MetricJsonOptions())); + // Open tablet, should still work + this->harness()->Open(); + ASSERT_OK(registry->WriteAsJson(&writer, { "*" }, MetricJsonOptions())); +} + +// Test that we find the correct log segment size for different indexes. +TEST(TestTablet, TestGetLogRetentionSizeForIndex) { + std::map idx_size_map; + // We build a map that represents 3 logs. The key is the index where that log ends, and the value + // is its size. + idx_size_map[3] = 1; + idx_size_map[6] = 10; + idx_size_map[9] = 100; + + // The default value should return a size of 0. + int64_t min_log_index = -1; + ASSERT_EQ(Tablet::GetLogRetentionSizeForIndex(min_log_index, idx_size_map), 0); + + // A value at the beginning of the first segment retains all the logs. + min_log_index = 1; + ASSERT_EQ(Tablet::GetLogRetentionSizeForIndex(min_log_index, idx_size_map), 111); + + // A value at the end of the first segment also retains everything. + min_log_index = 3; + ASSERT_EQ(Tablet::GetLogRetentionSizeForIndex(min_log_index, idx_size_map), 111); + + // Beginning of second segment, only retain that one and the next. + min_log_index = 4; + ASSERT_EQ(Tablet::GetLogRetentionSizeForIndex(min_log_index, idx_size_map), 110); + + // Beginning of third segment, only retain that one. 
+ min_log_index = 7; + ASSERT_EQ(Tablet::GetLogRetentionSizeForIndex(min_log_index, idx_size_map), 100); + + // A value after all the passed segments, doesn't retain anything. + min_log_index = 10; + ASSERT_EQ(Tablet::GetLogRetentionSizeForIndex(min_log_index, idx_size_map), 0); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet.cc b/src/kudu/tablet/tablet.cc new file mode 100644 index 000000000000..88f841718280 --- /dev/null +++ b/src/kudu/tablet/tablet.cc @@ -0,0 +1,1777 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/cfile/cfile_writer.h" +#include "kudu/common/iterator.h" +#include "kudu/common/row_changelist.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/scan_spec.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/compaction.h" +#include "kudu/tablet/compaction_policy.h" +#include "kudu/tablet/delta_compaction.h" +#include "kudu/tablet/diskrowset.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/row_op.h" +#include "kudu/tablet/rowset_info.h" +#include "kudu/tablet/rowset_tree.h" +#include "kudu/tablet/svg_dump.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tablet/tablet_mm_ops.h" +#include "kudu/tablet/transactions/alter_schema_transaction.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/util/bloom_filter.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/env.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/trace.h" +#include "kudu/util/url-coding.h" + +DEFINE_bool(tablet_do_dup_key_checks, true, + "Whether to check primary keys for duplicate on insertion. 
" + "Use at your own risk!"); +TAG_FLAG(tablet_do_dup_key_checks, unsafe); + +DEFINE_int32(tablet_compaction_budget_mb, 128, + "Budget for a single compaction"); +TAG_FLAG(tablet_compaction_budget_mb, experimental); + +DEFINE_int32(tablet_bloom_block_size, 4096, + "Block size of the bloom filters used for tablet keys."); +TAG_FLAG(tablet_bloom_block_size, advanced); + +DEFINE_double(tablet_bloom_target_fp_rate, 0.01f, + "Target false-positive rate (between 0 and 1) to size tablet key bloom filters. " + "A lower false positive rate may reduce the number of disk seeks required " + "in heavy insert workloads, at the expense of more space and RAM " + "required for bloom filters."); +TAG_FLAG(tablet_bloom_target_fp_rate, advanced); + +METRIC_DEFINE_entity(tablet); +METRIC_DEFINE_gauge_size(tablet, memrowset_size, "MemRowSet Memory Usage", + kudu::MetricUnit::kBytes, + "Size of this tablet's memrowset"); +METRIC_DEFINE_gauge_size(tablet, on_disk_size, "Tablet Size On Disk", + kudu::MetricUnit::kBytes, + "Size of this tablet on disk."); + +using std::shared_ptr; +using std::string; +using std::unordered_set; +using std::vector; + +namespace kudu { +namespace tablet { + +using kudu::MaintenanceManager; +using consensus::OpId; +using consensus::MaximumOpId; +using log::LogAnchorRegistry; +using strings::Substitute; +using base::subtle::Barrier_AtomicIncrement; + +static CompactionPolicy *CreateCompactionPolicy() { + return new BudgetedCompactionPolicy(FLAGS_tablet_compaction_budget_mb); +} + +//////////////////////////////////////////////////////////// +// TabletComponents +//////////////////////////////////////////////////////////// + +TabletComponents::TabletComponents(shared_ptr mrs, + shared_ptr rs_tree) + : memrowset(std::move(mrs)), rowsets(std::move(rs_tree)) {} + +//////////////////////////////////////////////////////////// +// Tablet +//////////////////////////////////////////////////////////// + +const char* Tablet::kDMSMemTrackerId = "DeltaMemStores"; + 
+Tablet::Tablet(const scoped_refptr& metadata, + const scoped_refptr& clock, + const shared_ptr& parent_mem_tracker, + MetricRegistry* metric_registry, + const scoped_refptr& log_anchor_registry) + : key_schema_(metadata->schema().CreateKeyProjection()), + metadata_(metadata), + log_anchor_registry_(log_anchor_registry), + mem_tracker_(MemTracker::CreateTracker( + -1, Substitute("tablet-$0", tablet_id()), + parent_mem_tracker)), + dms_mem_tracker_(MemTracker::CreateTracker( + -1, kDMSMemTrackerId, mem_tracker_)), + next_mrs_id_(0), + clock_(clock), + mvcc_(clock), + rowsets_flush_sem_(1), + state_(kInitialized) { + CHECK(schema()->has_column_ids()); + compaction_policy_.reset(CreateCompactionPolicy()); + + if (metric_registry) { + MetricEntity::AttributeMap attrs; + // TODO(KUDU-745): table_id is apparently not set in the metadata. + attrs["table_id"] = metadata_->table_id(); + attrs["table_name"] = metadata_->table_name(); + attrs["partition"] = metadata_->partition_schema().PartitionDebugString(metadata_->partition(), + *schema()); + metric_entity_ = METRIC_ENTITY_tablet.Instantiate(metric_registry, tablet_id(), attrs); + metrics_.reset(new TabletMetrics(metric_entity_)); + METRIC_memrowset_size.InstantiateFunctionGauge( + metric_entity_, Bind(&Tablet::MemRowSetSize, Unretained(this))) + ->AutoDetach(&metric_detacher_); + METRIC_on_disk_size.InstantiateFunctionGauge( + metric_entity_, Bind(&Tablet::EstimateOnDiskSize, Unretained(this))) + ->AutoDetach(&metric_detacher_); + } +} + +Tablet::~Tablet() { + Shutdown(); + dms_mem_tracker_->UnregisterFromParent(); + mem_tracker_->UnregisterFromParent(); +} + +Status Tablet::Open() { + TRACE_EVENT0("tablet", "Tablet::Open"); + boost::lock_guard lock(component_lock_); + CHECK_EQ(state_, kInitialized) << "already open"; + CHECK(schema()->has_column_ids()); + + next_mrs_id_ = metadata_->last_durable_mrs_id() + 1; + + RowSetVector rowsets_opened; + + // open the tablet row-sets + for (const shared_ptr& rowset_meta : 
metadata_->rowsets()) { + shared_ptr rowset; + Status s = DiskRowSet::Open(rowset_meta, log_anchor_registry_.get(), &rowset, mem_tracker_); + if (!s.ok()) { + LOG(ERROR) << "Failed to open rowset " << rowset_meta->ToString() << ": " + << s.ToString(); + return s; + } + + rowsets_opened.push_back(rowset); + } + + shared_ptr new_rowset_tree(new RowSetTree()); + CHECK_OK(new_rowset_tree->Reset(rowsets_opened)); + // now that the current state is loaded, create the new MemRowSet with the next id + shared_ptr new_mrs(new MemRowSet(next_mrs_id_++, *schema(), + log_anchor_registry_.get(), + mem_tracker_)); + components_ = new TabletComponents(new_mrs, new_rowset_tree); + + state_ = kBootstrapping; + return Status::OK(); +} + +void Tablet::MarkFinishedBootstrapping() { + CHECK_EQ(state_, kBootstrapping); + state_ = kOpen; +} + +void Tablet::Shutdown() { + UnregisterMaintenanceOps(); + + boost::lock_guard lock(component_lock_); + components_ = nullptr; + state_ = kShutdown; + + // In the case of deleting a tablet, we still keep the metadata around after + // ShutDown(), and need to flush the metadata to indicate that the tablet is deleted. + // During that flush, we don't want metadata to call back into the Tablet, so we + // have to unregister the pre-flush callback. + metadata_->SetPreFlushCallback(Bind(DoNothingStatusClosure)); +} + +Status Tablet::GetMappedReadProjection(const Schema& projection, + Schema *mapped_projection) const { + const Schema* cur_schema = schema(); + return cur_schema->GetMappedReadProjection(projection, mapped_projection); +} + +BloomFilterSizing Tablet::bloom_sizing() const { + return BloomFilterSizing::BySizeAndFPRate(FLAGS_tablet_bloom_block_size, + FLAGS_tablet_bloom_target_fp_rate); +} + +Status Tablet::NewRowIterator(const Schema &projection, + gscoped_ptr *iter) const { + // Yield current rows. 
+ MvccSnapshot snap(mvcc_); + return NewRowIterator(projection, snap, Tablet::UNORDERED, iter); +} + + +Status Tablet::NewRowIterator(const Schema &projection, + const MvccSnapshot &snap, + const OrderMode order, + gscoped_ptr *iter) const { + CHECK_EQ(state_, kOpen); + if (metrics_) { + metrics_->scans_started->Increment(); + } + VLOG(2) << "Created new Iterator under snap: " << snap.ToString(); + iter->reset(new Iterator(this, projection, snap, order)); + return Status::OK(); +} + +Status Tablet::DecodeWriteOperations(const Schema* client_schema, + WriteTransactionState* tx_state) { + TRACE_EVENT0("tablet", "Tablet::DecodeWriteOperations"); + + DCHECK_EQ(tx_state->row_ops().size(), 0); + + // Acquire the schema lock in shared mode, so that the schema doesn't + // change while this transaction is in-flight. + tx_state->AcquireSchemaLock(&schema_lock_); + + // The Schema needs to be held constant while any transactions are between + // PREPARE and APPLY stages + TRACE("PREPARE: Decoding operations"); + vector ops; + + // Decode the ops + RowOperationsPBDecoder dec(&tx_state->request()->row_operations(), + client_schema, + schema(), + tx_state->arena()); + RETURN_NOT_OK(dec.DecodeOperations(&ops)); + + // Create RowOp objects for each + vector row_ops; + ops.reserve(ops.size()); + for (const DecodedRowOperation& op : ops) { + row_ops.push_back(new RowOp(op)); + } + + // Important to set the schema before the ops -- we need the + // schema in order to stringify the ops. 
+ tx_state->set_schema_at_decode_time(schema()); + tx_state->swap_row_ops(&row_ops); + + return Status::OK(); +} + +Status Tablet::AcquireRowLocks(WriteTransactionState* tx_state) { + TRACE_EVENT1("tablet", "Tablet::AcquireRowLocks", + "num_locks", tx_state->row_ops().size()); + TRACE("PREPARE: Acquiring locks for $0 operations", tx_state->row_ops().size()); + for (RowOp* op : tx_state->row_ops()) { + RETURN_NOT_OK(AcquireLockForOp(tx_state, op)); + } + TRACE("PREPARE: locks acquired"); + return Status::OK(); +} + +Status Tablet::CheckRowInTablet(const ConstContiguousRow& row) const { + bool contains_row; + RETURN_NOT_OK(metadata_->partition_schema().PartitionContainsRow(metadata_->partition(), + row, + &contains_row)); + + if (PREDICT_FALSE(!contains_row)) { + return Status::NotFound( + Substitute("Row not in tablet partition. Partition: '$0', row: '$1'.", + metadata_->partition_schema().PartitionDebugString(metadata_->partition(), + *schema()), + metadata_->partition_schema().RowDebugString(row))); + } + return Status::OK(); +} + +Status Tablet::AcquireLockForOp(WriteTransactionState* tx_state, RowOp* op) { + ConstContiguousRow row_key(&key_schema_, op->decoded_op.row_data); + op->key_probe.reset(new tablet::RowSetKeyProbe(row_key)); + RETURN_NOT_OK(CheckRowInTablet(row_key)); + + ScopedRowLock row_lock(&lock_manager_, + tx_state, + op->key_probe->encoded_key_slice(), + LockManager::LOCK_EXCLUSIVE); + op->row_lock = row_lock.Pass(); + return Status::OK(); +} + +void Tablet::StartTransaction(WriteTransactionState* tx_state) { + gscoped_ptr mvcc_tx; + + // If the state already has a timestamp then we're replaying a transaction that occurred + // before a crash or at another node... + if (tx_state->has_timestamp()) { + mvcc_tx.reset(new ScopedTransaction(&mvcc_, tx_state->timestamp())); + + // ... otherwise this is a new transaction and we must assign a new timestamp. 
We either + // assign a timestamp in the future, if the consistency mode is COMMIT_WAIT, or we assign + // one in the present if the consistency mode is any other one. + } else if (tx_state->external_consistency_mode() == COMMIT_WAIT) { + mvcc_tx.reset(new ScopedTransaction(&mvcc_, ScopedTransaction::NOW_LATEST)); + } else { + mvcc_tx.reset(new ScopedTransaction(&mvcc_, ScopedTransaction::NOW)); + } + tx_state->SetMvccTxAndTimestamp(mvcc_tx.Pass()); +} + +Status Tablet::InsertUnlocked(WriteTransactionState *tx_state, + RowOp* insert) { + const TabletComponents* comps = DCHECK_NOTNULL(tx_state->tablet_components()); + + CHECK(state_ == kOpen || state_ == kBootstrapping); + // make sure that the WriteTransactionState has the component lock and that + // there the RowOp has the row lock. + DCHECK(insert->has_row_lock()) << "RowOp must hold the row lock."; + DCHECK_EQ(tx_state->schema_at_decode_time(), schema()) << "Raced against schema change"; + DCHECK(tx_state->op_id().IsInitialized()) << "TransactionState OpId needed for anchoring"; + + ProbeStats stats; + + // Submit the stats before returning from this function + ProbeStatsSubmitter submitter(stats, metrics_.get()); + + // First, ensure that it is a unique key by checking all the open RowSets. + if (FLAGS_tablet_do_dup_key_checks) { + vector to_check; + comps->rowsets->FindRowSetsWithKeyInRange(insert->key_probe->encoded_key_slice(), + &to_check); + + for (const RowSet *rowset : to_check) { + bool present = false; + RETURN_NOT_OK(rowset->CheckRowPresent(*insert->key_probe, &present, &stats)); + if (PREDICT_FALSE(present)) { + Status s = Status::AlreadyPresent("key already present"); + if (metrics_) { + metrics_->insertions_failed_dup_key->Increment(); + } + insert->SetFailed(s); + return s; + } + } + } + + Timestamp ts = tx_state->timestamp(); + ConstContiguousRow row(schema(), insert->decoded_op.row_data); + + // TODO: the Insert() call below will re-encode the key, which is a + // waste. 
Should pass through the KeyProbe structure perhaps. + + // Now try to insert into memrowset. The memrowset itself will return + // AlreadyPresent if it has already been inserted there. + Status s = comps->memrowset->Insert(ts, row, tx_state->op_id()); + if (PREDICT_TRUE(s.ok())) { + insert->SetInsertSucceeded(comps->memrowset->mrs_id()); + } else { + if (s.IsAlreadyPresent() && metrics_) { + metrics_->insertions_failed_dup_key->Increment(); + } + insert->SetFailed(s); + } + return s; +} + +Status Tablet::MutateRowUnlocked(WriteTransactionState *tx_state, + RowOp* mutate) { + DCHECK(tx_state != nullptr) << "you must have a WriteTransactionState"; + DCHECK(tx_state->op_id().IsInitialized()) << "TransactionState OpId needed for anchoring"; + DCHECK_EQ(tx_state->schema_at_decode_time(), schema()); + + gscoped_ptr result(new OperationResultPB()); + + const TabletComponents* comps = DCHECK_NOTNULL(tx_state->tablet_components()); + + // Validate the update. + RowChangeListDecoder rcl_decoder(mutate->decoded_op.changelist); + Status s = rcl_decoder.Init(); + if (rcl_decoder.is_reinsert()) { + // REINSERT mutations are the byproduct of an INSERT on top of a ghost + // row, not something the user is allowed to specify on their own. + s = Status::InvalidArgument("User may not specify REINSERT mutations"); + } + if (!s.ok()) { + mutate->SetFailed(s); + return s; + } + + Timestamp ts = tx_state->timestamp(); + + ProbeStats stats; + // Submit the stats before returning from this function + ProbeStatsSubmitter submitter(stats, metrics_.get()); + + // First try to update in memrowset. + s = comps->memrowset->MutateRow(ts, + *mutate->key_probe, + mutate->decoded_op.changelist, + tx_state->op_id(), + &stats, + result.get()); + if (s.ok()) { + mutate->SetMutateSucceeded(result.Pass()); + return s; + } + if (!s.IsNotFound()) { + mutate->SetFailed(s); + return s; + } + + // Next, check the disk rowsets. 
+ + // TODO: could iterate the rowsets in a smart order + // based on recent statistics - eg if a rowset is getting + // updated frequently, pick that one first. + vector to_check; + comps->rowsets->FindRowSetsWithKeyInRange(mutate->key_probe->encoded_key_slice(), + &to_check); + for (RowSet *rs : to_check) { + s = rs->MutateRow(ts, + *mutate->key_probe, + mutate->decoded_op.changelist, + tx_state->op_id(), + &stats, + result.get()); + if (s.ok()) { + mutate->SetMutateSucceeded(result.Pass()); + return s; + } + if (!s.IsNotFound()) { + mutate->SetFailed(s); + return s; + } + } + + s = Status::NotFound("key not found"); + mutate->SetFailed(s); + return s; +} + +void Tablet::StartApplying(WriteTransactionState* tx_state) { + boost::shared_lock lock(component_lock_); + tx_state->StartApplying(); + tx_state->set_tablet_components(components_); +} + +void Tablet::ApplyRowOperations(WriteTransactionState* tx_state) { + StartApplying(tx_state); + for (RowOp* row_op : tx_state->row_ops()) { + ApplyRowOperation(tx_state, row_op); + } +} + +void Tablet::ApplyRowOperation(WriteTransactionState* tx_state, + RowOp* row_op) { + switch (row_op->decoded_op.type) { + case RowOperationsPB::INSERT: + ignore_result(InsertUnlocked(tx_state, row_op)); + return; + + case RowOperationsPB::UPDATE: + case RowOperationsPB::DELETE: + ignore_result(MutateRowUnlocked(tx_state, row_op)); + return; + + default: + LOG(FATAL) << RowOperationsPB::Type_Name(row_op->decoded_op.type); + } +} + +void Tablet::ModifyRowSetTree(const RowSetTree& old_tree, + const RowSetVector& rowsets_to_remove, + const RowSetVector& rowsets_to_add, + RowSetTree* new_tree) { + RowSetVector post_swap; + + // O(n^2) diff algorithm to collect the set of rowsets excluding + // the rowsets that were included in the compaction + int num_removed = 0; + + for (const shared_ptr &rs : old_tree.all_rowsets()) { + // Determine if it should be removed + bool should_remove = false; + for (const shared_ptr &to_remove : rowsets_to_remove) 
{ + if (to_remove == rs) { + should_remove = true; + num_removed++; + break; + } + } + if (!should_remove) { + post_swap.push_back(rs); + } + } + + CHECK_EQ(num_removed, rowsets_to_remove.size()); + + // Then push the new rowsets on the end of the new list + std::copy(rowsets_to_add.begin(), + rowsets_to_add.end(), + std::back_inserter(post_swap)); + + CHECK_OK(new_tree->Reset(post_swap)); +} + +void Tablet::AtomicSwapRowSets(const RowSetVector &old_rowsets, + const RowSetVector &new_rowsets) { + boost::lock_guard lock(component_lock_); + AtomicSwapRowSetsUnlocked(old_rowsets, new_rowsets); +} + +void Tablet::AtomicSwapRowSetsUnlocked(const RowSetVector &to_remove, + const RowSetVector &to_add) { + DCHECK(component_lock_.is_locked()); + + shared_ptr new_tree(new RowSetTree()); + ModifyRowSetTree(*components_->rowsets, + to_remove, to_add, new_tree.get()); + + components_ = new TabletComponents(components_->memrowset, new_tree); +} + +Status Tablet::DoMajorDeltaCompaction(const vector& col_ids, + shared_ptr input_rs) { + CHECK_EQ(state_, kOpen); + Status s = down_cast(input_rs.get()) + ->MajorCompactDeltaStoresWithColumnIds(col_ids); + return s; +} + +Status Tablet::Flush() { + TRACE_EVENT1("tablet", "Tablet::Flush", "id", tablet_id()); + boost::lock_guard lock(rowsets_flush_sem_); + return FlushUnlocked(); +} + +Status Tablet::FlushUnlocked() { + TRACE_EVENT0("tablet", "Tablet::FlushUnlocked"); + RowSetsInCompaction input; + shared_ptr old_mrs; + { + // Create a new MRS with the latest schema. + boost::lock_guard lock(component_lock_); + RETURN_NOT_OK(ReplaceMemRowSetUnlocked(&input, &old_mrs)); + } + + // Wait for any in-flight transactions to finish against the old MRS + // before we flush it. + mvcc_.WaitForApplyingTransactionsToCommit(); + + // Note: "input" should only contain old_mrs. 
+ return FlushInternal(input, old_mrs); +} + +Status Tablet::ReplaceMemRowSetUnlocked(RowSetsInCompaction *compaction, + shared_ptr *old_ms) { + *old_ms = components_->memrowset; + // Mark the memrowset rowset as locked, so compactions won't consider it + // for inclusion in any concurrent compactions. + shared_ptr ms_lock( + new boost::mutex::scoped_try_lock(*((*old_ms)->compact_flush_lock()))); + CHECK(ms_lock->owns_lock()); + + // Add to compaction. + compaction->AddRowSet(*old_ms, ms_lock); + + shared_ptr new_mrs(new MemRowSet(next_mrs_id_++, *schema(), log_anchor_registry_.get(), + mem_tracker_)); + shared_ptr new_rst(new RowSetTree()); + ModifyRowSetTree(*components_->rowsets, + RowSetVector(), // remove nothing + { *old_ms }, // add the old MRS + new_rst.get()); + + // Swap it in + components_ = new TabletComponents(new_mrs, new_rst); + return Status::OK(); +} + +Status Tablet::FlushInternal(const RowSetsInCompaction& input, + const shared_ptr& old_ms) { + CHECK(state_ == kOpen || state_ == kBootstrapping); + + // Step 1. Freeze the old memrowset by blocking readers and swapping + // it in as a new rowset, replacing it with an empty one. + // + // At this point, we have already swapped in a new empty rowset, and + // any new inserts are going into that one. 'old_ms' is effectively + // frozen -- no new inserts should arrive after this point. + // + // NOTE: updates and deletes may still arrive into 'old_ms' at this point. + // + // TODO(perf): there's a memrowset.Freeze() call which we might be able to + // use to improve iteration performance during the flush. The old design + // used this, but not certain whether it's still doable with the new design. 
+ + uint64_t start_insert_count = old_ms->debug_insert_count(); + int64_t mrs_being_flushed = old_ms->mrs_id(); + + if (flush_hooks_) { + RETURN_NOT_OK_PREPEND(flush_hooks_->PostSwapNewMemRowSet(), + "PostSwapNewMemRowSet hook failed"); + } + + LOG(INFO) << "Flush: entering stage 1 (old memrowset already frozen for inserts)"; + input.DumpToLog(); + LOG(INFO) << "Memstore in-memory size: " << old_ms->memory_footprint() << " bytes"; + + RETURN_NOT_OK(DoCompactionOrFlush(input, mrs_being_flushed)); + + // Sanity check that no insertions happened during our flush. + CHECK_EQ(start_insert_count, old_ms->debug_insert_count()) + << "Sanity check failed: insertions continued in memrowset " + << "after flush was triggered! Aborting to prevent dataloss."; + + return Status::OK(); +} + +Status Tablet::CreatePreparedAlterSchema(AlterSchemaTransactionState *tx_state, + const Schema* schema) { + if (!key_schema_.KeyEquals(*schema)) { + return Status::InvalidArgument("Schema keys cannot be altered", + schema->CreateKeyProjection().ToString()); + } + + if (!schema->has_column_ids()) { + // this probably means that the request is not from the Master + return Status::InvalidArgument("Missing Column IDs"); + } + + // Alter schema must run when no reads/writes are in progress. + // However, compactions and flushes can continue to run in parallel + // with the schema change, + tx_state->AcquireSchemaLock(&schema_lock_); + + tx_state->set_schema(schema); + return Status::OK(); +} + +Status Tablet::AlterSchema(AlterSchemaTransactionState *tx_state) { + DCHECK(key_schema_.KeyEquals(*DCHECK_NOTNULL(tx_state->schema()))) << + "Schema keys cannot be altered"; + + // Prevent any concurrent flushes. Otherwise, we run into issues where + // we have an MRS in the rowset tree, and we can't alter its schema + // in-place. + boost::lock_guard lock(rowsets_flush_sem_); + + RowSetsInCompaction input; + shared_ptr old_ms; + { + // If the current version >= new version, there is nothing to do. 
+ bool same_schema = schema()->Equals(*tx_state->schema()); + if (metadata_->schema_version() >= tx_state->schema_version()) { + LOG(INFO) << "Already running schema version " << metadata_->schema_version() + << " got alter request for version " << tx_state->schema_version(); + return Status::OK(); + } + + LOG(INFO) << "Alter schema from " << schema()->ToString() + << " version " << metadata_->schema_version() + << " to " << tx_state->schema()->ToString() + << " version " << tx_state->schema_version(); + DCHECK(schema_lock_.is_locked()); + metadata_->SetSchema(*tx_state->schema(), tx_state->schema_version()); + if (tx_state->has_new_table_name()) { + metadata_->SetTableName(tx_state->new_table_name()); + if (metric_entity_) { + metric_entity_->SetAttribute("table_name", tx_state->new_table_name()); + } + } + + // If the current schema and the new one are equal, there is nothing to do. + if (same_schema) { + return metadata_->Flush(); + } + } + + + // Replace the MemRowSet + { + boost::lock_guard lock(component_lock_); + RETURN_NOT_OK(ReplaceMemRowSetUnlocked(&input, &old_ms)); + } + + // TODO(KUDU-915): ideally we would release the schema_lock here so that + // we don't block access to the tablet while we flush the MRS. + // However, doing so opens up some subtle issues with the ordering of + // the alter's COMMIT message against the COMMIT messages of other + // writes. A "big hammer" fix has been applied here to hold the lock + // all the way until the COMMIT message has been appended to the WAL. + + // Flush the old MemRowSet + return FlushInternal(input, old_ms); +} + +Status Tablet::RewindSchemaForBootstrap(const Schema& new_schema, + int64_t schema_version) { + CHECK_EQ(state_, kBootstrapping); + + // We know that the MRS should be empty at this point, because we + // rewind the schema before replaying any operations. So, we just + // swap in a new one with the correct schema, rather than attempting + // to flush. 
+ LOG(INFO) << "Rewinding schema during bootstrap to " << new_schema.ToString(); + + metadata_->SetSchema(new_schema, schema_version); + { + boost::lock_guard lock(component_lock_); + + shared_ptr old_mrs = components_->memrowset; + shared_ptr old_rowsets = components_->rowsets; + CHECK(old_mrs->empty()); + int64_t old_mrs_id = old_mrs->mrs_id(); + // We have to reset the components here before creating the new MemRowSet, + // or else the new MRS will end up trying to claim the same MemTracker ID + // as the old one. + components_.reset(); + old_mrs.reset(); + shared_ptr new_mrs(new MemRowSet(old_mrs_id, new_schema, + log_anchor_registry_.get(), mem_tracker_)); + components_ = new TabletComponents(new_mrs, old_rowsets); + } + return Status::OK(); +} + +void Tablet::SetCompactionHooksForTests( + const shared_ptr &hooks) { + compaction_hooks_ = hooks; +} + +void Tablet::SetFlushHooksForTests( + const shared_ptr &hooks) { + flush_hooks_ = hooks; +} + +void Tablet::SetFlushCompactCommonHooksForTests( + const shared_ptr &hooks) { + common_hooks_ = hooks; +} + +int32_t Tablet::CurrentMrsIdForTests() const { + boost::shared_lock lock(component_lock_); + return components_->memrowset->mrs_id(); +} + +//////////////////////////////////////////////////////////// +// CompactRowSetsOp +//////////////////////////////////////////////////////////// + +CompactRowSetsOp::CompactRowSetsOp(Tablet* tablet) + : MaintenanceOp(Substitute("CompactRowSetsOp($0)", tablet->tablet_id()), + MaintenanceOp::HIGH_IO_USAGE), + last_num_mrs_flushed_(0), + last_num_rs_compacted_(0), + tablet_(tablet) { +} + +void CompactRowSetsOp::UpdateStats(MaintenanceOpStats* stats) { + boost::lock_guard l(lock_); + + // Any operation that changes the on-disk row layout invalidates the + // cached stats. 
+ TabletMetrics* metrics = tablet_->metrics(); + if (metrics) { + uint64_t new_num_mrs_flushed = metrics->flush_mrs_duration->TotalCount(); + uint64_t new_num_rs_compacted = metrics->compact_rs_duration->TotalCount(); + if (prev_stats_.valid() && + new_num_mrs_flushed == last_num_mrs_flushed_ && + new_num_rs_compacted == last_num_rs_compacted_) { + *stats = prev_stats_; + return; + } else { + last_num_mrs_flushed_ = new_num_mrs_flushed; + last_num_rs_compacted_ = new_num_rs_compacted; + } + } + + tablet_->UpdateCompactionStats(&prev_stats_); + *stats = prev_stats_; +} + +bool CompactRowSetsOp::Prepare() { + boost::lock_guard l(lock_); + // Invalidate the cached stats so that another section of the tablet can + // be compacted concurrently. + // + // TODO: we should acquire the rowset compaction locks here. Otherwise, until + // Compact() acquires them, the maintenance manager may compute the same + // stats for this op and run it again, even though Perform() will end up + // performing a much less fruitful compaction. See KUDU-790 for more details. 
+ prev_stats_.Clear(); + return true; +} + +void CompactRowSetsOp::Perform() { + WARN_NOT_OK(tablet_->Compact(Tablet::COMPACT_NO_FLAGS), + Substitute("Compaction failed on $0", tablet_->tablet_id())); +} + +scoped_refptr CompactRowSetsOp::DurationHistogram() const { + return tablet_->metrics()->compact_rs_duration; +} + +scoped_refptr > CompactRowSetsOp::RunningGauge() const { + return tablet_->metrics()->compact_rs_running; +} + +//////////////////////////////////////////////////////////// +// MinorDeltaCompactionOp +//////////////////////////////////////////////////////////// + +MinorDeltaCompactionOp::MinorDeltaCompactionOp(Tablet* tablet) + : MaintenanceOp(Substitute("MinorDeltaCompactionOp($0)", tablet->tablet_id()), + MaintenanceOp::HIGH_IO_USAGE), + last_num_mrs_flushed_(0), + last_num_dms_flushed_(0), + last_num_rs_compacted_(0), + last_num_rs_minor_delta_compacted_(0), + tablet_(tablet) { +} + +void MinorDeltaCompactionOp::UpdateStats(MaintenanceOpStats* stats) { + boost::lock_guard l(lock_); + + // Any operation that changes the number of REDO files invalidates the + // cached stats. 
+ TabletMetrics* metrics = tablet_->metrics(); + if (metrics) { + uint64_t new_num_mrs_flushed = metrics->flush_mrs_duration->TotalCount(); + uint64_t new_num_dms_flushed = metrics->flush_dms_duration->TotalCount(); + uint64_t new_num_rs_compacted = metrics->compact_rs_duration->TotalCount(); + uint64_t new_num_rs_minor_delta_compacted = + metrics->delta_minor_compact_rs_duration->TotalCount(); + if (prev_stats_.valid() && + new_num_mrs_flushed == last_num_mrs_flushed_ && + new_num_dms_flushed == last_num_dms_flushed_ && + new_num_rs_compacted == last_num_rs_compacted_ && + new_num_rs_minor_delta_compacted == last_num_rs_minor_delta_compacted_) { + *stats = prev_stats_; + return; + } else { + last_num_mrs_flushed_ = new_num_mrs_flushed; + last_num_dms_flushed_ = new_num_dms_flushed; + last_num_rs_compacted_ = new_num_rs_compacted; + last_num_rs_minor_delta_compacted_ = new_num_rs_minor_delta_compacted; + } + } + + double perf_improv = tablet_->GetPerfImprovementForBestDeltaCompact( + RowSet::MINOR_DELTA_COMPACTION, nullptr); + prev_stats_.set_perf_improvement(perf_improv); + prev_stats_.set_runnable(perf_improv > 0); + *stats = prev_stats_; +} + +bool MinorDeltaCompactionOp::Prepare() { + boost::lock_guard l(lock_); + // Invalidate the cached stats so that another rowset in the tablet can + // be delta compacted concurrently. + // + // TODO: See CompactRowSetsOp::Prepare(). 
+ prev_stats_.Clear(); + return true; +} + +void MinorDeltaCompactionOp::Perform() { + WARN_NOT_OK(tablet_->CompactWorstDeltas(RowSet::MINOR_DELTA_COMPACTION), + Substitute("Minor delta compaction failed on $0", tablet_->tablet_id())); +} + +scoped_refptr MinorDeltaCompactionOp::DurationHistogram() const { + return tablet_->metrics()->delta_minor_compact_rs_duration; +} + +scoped_refptr > MinorDeltaCompactionOp::RunningGauge() const { + return tablet_->metrics()->delta_minor_compact_rs_running; +} + +//////////////////////////////////////////////////////////// +// MajorDeltaCompactionOp +//////////////////////////////////////////////////////////// + +MajorDeltaCompactionOp::MajorDeltaCompactionOp(Tablet* tablet) + : MaintenanceOp(Substitute("MajorDeltaCompactionOp($0)", tablet->tablet_id()), + MaintenanceOp::HIGH_IO_USAGE), + last_num_mrs_flushed_(0), + last_num_dms_flushed_(0), + last_num_rs_compacted_(0), + last_num_rs_minor_delta_compacted_(0), + last_num_rs_major_delta_compacted_(0), + tablet_(tablet) { +} + +void MajorDeltaCompactionOp::UpdateStats(MaintenanceOpStats* stats) { + boost::lock_guard l(lock_); + + // Any operation that changes the size of the on-disk data invalidates the + // cached stats. 
+ TabletMetrics* metrics = tablet_->metrics(); + if (metrics) { + int64_t new_num_mrs_flushed = metrics->flush_mrs_duration->TotalCount(); + int64_t new_num_dms_flushed = metrics->flush_dms_duration->TotalCount(); + int64_t new_num_rs_compacted = metrics->compact_rs_duration->TotalCount(); + int64_t new_num_rs_minor_delta_compacted = + metrics->delta_minor_compact_rs_duration->TotalCount(); + int64_t new_num_rs_major_delta_compacted = + metrics->delta_major_compact_rs_duration->TotalCount(); + if (prev_stats_.valid() && + new_num_mrs_flushed == last_num_mrs_flushed_ && + new_num_dms_flushed == last_num_dms_flushed_ && + new_num_rs_compacted == last_num_rs_compacted_ && + new_num_rs_minor_delta_compacted == last_num_rs_minor_delta_compacted_ && + new_num_rs_major_delta_compacted == last_num_rs_major_delta_compacted_) { + *stats = prev_stats_; + return; + } else { + last_num_mrs_flushed_ = new_num_mrs_flushed; + last_num_dms_flushed_ = new_num_dms_flushed; + last_num_rs_compacted_ = new_num_rs_compacted; + last_num_rs_minor_delta_compacted_ = new_num_rs_minor_delta_compacted; + last_num_rs_major_delta_compacted_ = new_num_rs_major_delta_compacted; + } + } + + double perf_improv = tablet_->GetPerfImprovementForBestDeltaCompact( + RowSet::MAJOR_DELTA_COMPACTION, nullptr); + prev_stats_.set_perf_improvement(perf_improv); + prev_stats_.set_runnable(perf_improv > 0); + *stats = prev_stats_; +} + +bool MajorDeltaCompactionOp::Prepare() { + boost::lock_guard l(lock_); + // Invalidate the cached stats so that another rowset in the tablet can + // be delta compacted concurrently. + // + // TODO: See CompactRowSetsOp::Prepare(). 
+ prev_stats_.Clear(); + return true; +} + +void MajorDeltaCompactionOp::Perform() { + WARN_NOT_OK(tablet_->CompactWorstDeltas(RowSet::MAJOR_DELTA_COMPACTION), + Substitute("Major delta compaction failed on $0", tablet_->tablet_id())); +} + +scoped_refptr MajorDeltaCompactionOp::DurationHistogram() const { + return tablet_->metrics()->delta_major_compact_rs_duration; +} + +scoped_refptr > MajorDeltaCompactionOp::RunningGauge() const { + return tablet_->metrics()->delta_major_compact_rs_running; +} + +//////////////////////////////////////////////////////////// +// Tablet +//////////////////////////////////////////////////////////// + +Status Tablet::PickRowSetsToCompact(RowSetsInCompaction *picked, + CompactFlags flags) const { + CHECK_EQ(state_, kOpen); + // Grab a local reference to the current RowSetTree. This is to avoid + // holding the component_lock_ for too long. See the comment on component_lock_ + // in tablet.h for details on why that would be bad. + shared_ptr rowsets_copy; + { + boost::shared_lock lock(component_lock_); + rowsets_copy = components_->rowsets; + } + + boost::lock_guard compact_lock(compact_select_lock_); + CHECK_EQ(picked->num_rowsets(), 0); + + unordered_set picked_set; + + if (flags & FORCE_COMPACT_ALL) { + // Compact all rowsets, regardless of policy. + for (const shared_ptr& rs : rowsets_copy->all_rowsets()) { + if (rs->IsAvailableForCompaction()) { + picked_set.insert(rs.get()); + } + } + } else { + // Let the policy decide which rowsets to compact. + double quality = 0; + RETURN_NOT_OK(compaction_policy_->PickRowSets(*rowsets_copy, &picked_set, &quality, NULL)); + VLOG(2) << "Compaction quality: " << quality; + } + + boost::shared_lock lock(component_lock_); + for (const shared_ptr& rs : components_->rowsets->all_rowsets()) { + if (picked_set.erase(rs.get()) == 0) { + // Not picked. 
+ continue; + } + + // Grab the compact_flush_lock: this prevents any other concurrent + // compaction from selecting this same rowset, and also ensures that + // we don't select a rowset which is currently in the middle of being + // flushed. + shared_ptr lock( + new boost::mutex::scoped_try_lock(*rs->compact_flush_lock())); + CHECK(lock->owns_lock()) << rs->ToString() << " appeared available for " + "compaction when inputs were selected, but was unable to lock its " + "compact_flush_lock to prepare for compaction."; + + // Push the lock on our scoped list, so we unlock when done. + picked->AddRowSet(rs, lock); + } + + // When we iterated through the current rowsets, we should have found all of the + // rowsets that we picked. If we didn't, that implies that some other thread swapped + // them out while we were making our selection decision -- that's not possible + // since we only picked rowsets that were marked as available for compaction. + if (!picked_set.empty()) { + for (const RowSet* not_found : picked_set) { + LOG(ERROR) << "Rowset selected for compaction but not available anymore: " + << not_found->ToString(); + } + LOG(FATAL) << "Was unable to find all rowsets selected for compaction"; + } + return Status::OK(); +} + +void Tablet::GetRowSetsForTests(RowSetVector* out) { + shared_ptr rowsets_copy; + { + boost::shared_lock lock(component_lock_); + rowsets_copy = components_->rowsets; + } + for (const shared_ptr& rs : rowsets_copy->all_rowsets()) { + out->push_back(rs); + } +} + +void Tablet::RegisterMaintenanceOps(MaintenanceManager* maint_mgr) { + CHECK_EQ(state_, kOpen); + DCHECK(maintenance_ops_.empty()); + + gscoped_ptr rs_compact_op(new CompactRowSetsOp(this)); + maint_mgr->RegisterOp(rs_compact_op.get()); + maintenance_ops_.push_back(rs_compact_op.release()); + + gscoped_ptr minor_delta_compact_op(new MinorDeltaCompactionOp(this)); + maint_mgr->RegisterOp(minor_delta_compact_op.get()); + maintenance_ops_.push_back(minor_delta_compact_op.release()); + 
+ gscoped_ptr major_delta_compact_op(new MajorDeltaCompactionOp(this)); + maint_mgr->RegisterOp(major_delta_compact_op.get()); + maintenance_ops_.push_back(major_delta_compact_op.release()); +} + +void Tablet::UnregisterMaintenanceOps() { + for (MaintenanceOp* op : maintenance_ops_) { + op->Unregister(); + } + STLDeleteElements(&maintenance_ops_); +} + +Status Tablet::FlushMetadata(const RowSetVector& to_remove, + const RowSetMetadataVector& to_add, + int64_t mrs_being_flushed) { + RowSetMetadataIds to_remove_meta; + for (const shared_ptr& rowset : to_remove) { + // Skip MemRowSet & DuplicatingRowSets which don't have metadata. + if (rowset->metadata().get() == nullptr) { + continue; + } + to_remove_meta.insert(rowset->metadata()->id()); + } + + return metadata_->UpdateAndFlush(to_remove_meta, to_add, mrs_being_flushed); +} + +Status Tablet::DoCompactionOrFlush(const RowSetsInCompaction &input, int64_t mrs_being_flushed) { + const char *op_name = + (mrs_being_flushed == TabletMetadata::kNoMrsFlushed) ? "Compaction" : "Flush"; + TRACE_EVENT2("tablet", "Tablet::DoCompactionOrFlush", + "tablet_id", tablet_id(), + "op", op_name); + + MvccSnapshot flush_snap(mvcc_); + LOG(INFO) << op_name << ": entering phase 1 (flushing snapshot). 
Phase 1 snapshot: " + << flush_snap.ToString(); + + if (common_hooks_) { + RETURN_NOT_OK_PREPEND(common_hooks_->PostTakeMvccSnapshot(), + "PostTakeMvccSnapshot hook failed"); + } + + shared_ptr merge; + RETURN_NOT_OK(input.CreateCompactionInput(flush_snap, schema(), &merge)); + + RollingDiskRowSetWriter drsw(metadata_.get(), merge->schema(), bloom_sizing(), + compaction_policy_->target_rowset_size()); + RETURN_NOT_OK_PREPEND(drsw.Open(), "Failed to open DiskRowSet for flush"); + RETURN_NOT_OK_PREPEND(FlushCompactionInput(merge.get(), flush_snap, &drsw), + "Flush to disk failed"); + RETURN_NOT_OK_PREPEND(drsw.Finish(), "Failed to finish DRS writer"); + + if (common_hooks_) { + RETURN_NOT_OK_PREPEND(common_hooks_->PostWriteSnapshot(), + "PostWriteSnapshot hook failed"); + } + + // Though unlikely, it's possible that all of the input rows were actually + // GCed in this compaction. In that case, we don't actually want to reopen. + bool gced_all_input = drsw.written_count() == 0; + if (gced_all_input) { + LOG(INFO) << op_name << " resulted in no output rows (all input rows " + << "were GCed!) Removing all input rowsets."; + + // Write out the new Tablet Metadata and remove old rowsets. + // TODO: Consensus catch-up may want to preserve the compaction inputs. + RETURN_NOT_OK_PREPEND(FlushMetadata(input.rowsets(), + RowSetMetadataVector(), + mrs_being_flushed), + "Failed to flush new tablet metadata"); + + AtomicSwapRowSets(input.rowsets(), RowSetVector()); + + return Status::OK(); + } + + // The RollingDiskRowSet writer wrote out one or more RowSets as the + // output. Open these into 'new_rowsets'. 
+ vector > new_disk_rowsets; + RowSetMetadataVector new_drs_metas; + drsw.GetWrittenRowSetMetadata(&new_drs_metas); + + if (metrics_.get()) metrics_->bytes_flushed->IncrementBy(drsw.written_size()); + CHECK(!new_drs_metas.empty()); + { + TRACE_EVENT0("tablet", "Opening compaction results"); + for (const shared_ptr& meta : new_drs_metas) { + shared_ptr new_rowset; + Status s = DiskRowSet::Open(meta, log_anchor_registry_.get(), &new_rowset, mem_tracker_); + if (!s.ok()) { + LOG(WARNING) << "Unable to open snapshot " << op_name << " results " + << meta->ToString() << ": " << s.ToString(); + return s; + } + new_disk_rowsets.push_back(new_rowset); + } + } + + // Setup for Phase 2: Start duplicating any new updates into the new on-disk + // rowsets. + // + // During Phase 1, we may have missed some updates which came into the input + // rowsets while we were writing. So, we can't immediately start reading from + // the on-disk rowsets alone. Starting here, we continue to read from the + // original rowset(s), but mirror updates to both the input and the output + // data. + // + // It's crucial that, during the rest of the compaction, we do not allow the + // output rowsets to flush their deltas to disk. This is to avoid the following + // bug: + // - during phase 1, timestamp 1 updates a flushed row. This is only reflected in the + // input rowset. (ie it is a "missed delta") + // - during phase 2, timestamp 2 updates the same row. This is reflected in both the + // input and output, because of the DuplicatingRowSet. + // - now suppose the output rowset were allowed to flush deltas. This would create the + // first DeltaFile for the output rowset, with only timestamp 2. + // - Now we run the "ReupdateMissedDeltas", and copy over the first transaction to the output + // DMS, which later flushes. + // The end result would be that redos[0] has timestamp 2, and redos[1] has timestamp 1. 
+ // This breaks an invariant that the redo files are time-ordered, and would we would probably + // reapply the deltas in the wrong order on the read path. + // + // The way that we avoid this case is that DuplicatingRowSet's FlushDeltas method is a + // no-op. + LOG(INFO) << op_name << ": entering phase 2 (starting to duplicate updates " + << "in new rowsets)"; + shared_ptr inprogress_rowset( + new DuplicatingRowSet(input.rowsets(), new_disk_rowsets)); + + // The next step is to swap in the DuplicatingRowSet, and at the same time, determine an + // MVCC snapshot which includes all of the transactions that saw a pre-DuplicatingRowSet + // version of components_. + MvccSnapshot non_duplicated_txns_snap; + vector applying_during_swap; + { + TRACE_EVENT0("tablet", "Swapping DuplicatingRowSet"); + // Taking component_lock_ in write mode ensures that no new transactions + // can StartApplying() (or snapshot components_) during this block. + boost::lock_guard lock(component_lock_); + AtomicSwapRowSetsUnlocked(input.rowsets(), { inprogress_rowset }); + + // NOTE: transactions may *commit* in between these two lines. + // We need to make sure all such transactions end up in the + // 'applying_during_swap' list, the 'non_duplicated_txns_snap' snapshot, + // or both. Thus it's crucial that these next two lines are in this order! + mvcc_.GetApplyingTransactionsTimestamps(&applying_during_swap); + non_duplicated_txns_snap = MvccSnapshot(mvcc_); + } + + // All transactions committed in 'non_duplicated_txns_snap' saw the pre-swap components_. + // Additionally, any transactions that were APPLYING during the above block by definition + // _started_ doing so before the swap. Hence those transactions also need to get included in + // non_duplicated_txns_snap. To do so, we wait for them to commit, and then + // manually include them into our snapshot. 
+ if (VLOG_IS_ON(1) && !applying_during_swap.empty()) { + VLOG(1) << "Waiting for " << applying_during_swap.size() << " mid-APPLY txns to commit " + << "before finishing compaction..."; + for (const Timestamp& ts : applying_during_swap) { + VLOG(1) << " " << ts.value(); + } + } + + // This wait is a little bit conservative - technically we only need to wait for + // those transactions in 'applying_during_swap', but MVCC doesn't implement the + // ability to wait for a specific set. So instead we wait for all currently applying -- + // a bit more than we need, but still correct. + mvcc_.WaitForApplyingTransactionsToCommit(); + + // Then we want to consider all those transactions that were in-flight when we did the + // swap as committed in 'non_duplicated_txns_snap'. + non_duplicated_txns_snap.AddCommittedTimestamps(applying_during_swap); + + if (common_hooks_) { + RETURN_NOT_OK_PREPEND(common_hooks_->PostSwapInDuplicatingRowSet(), + "PostSwapInDuplicatingRowSet hook failed"); + } + + // Phase 2. Here we re-scan the compaction input, copying those missed updates into the + // new rowset's DeltaTracker. + LOG(INFO) << op_name << " Phase 2: carrying over any updates which arrived during Phase 1"; + LOG(INFO) << "Phase 2 snapshot: " << non_duplicated_txns_snap.ToString(); + RETURN_NOT_OK_PREPEND( + input.CreateCompactionInput(non_duplicated_txns_snap, schema(), &merge), + Substitute("Failed to create $0 inputs", op_name).c_str()); + + // Update the output rowsets with the deltas that came in in phase 1, before we swapped + // in the DuplicatingRowSets. This will perform a flush of the updated DeltaTrackers + // in the end so that the data that is reported in the log as belonging to the input + // rowsets is flushed. 
+ RETURN_NOT_OK_PREPEND(ReupdateMissedDeltas(metadata_->tablet_id(), + merge.get(), + flush_snap, + non_duplicated_txns_snap, + new_disk_rowsets), + Substitute("Failed to re-update deltas missed during $0 phase 1", + op_name).c_str()); + + if (common_hooks_) { + RETURN_NOT_OK_PREPEND(common_hooks_->PostReupdateMissedDeltas(), + "PostReupdateMissedDeltas hook failed"); + } + + // ------------------------------ + // Flush was successful. + + // Write out the new Tablet Metadata and remove old rowsets. + RETURN_NOT_OK_PREPEND(FlushMetadata(input.rowsets(), new_drs_metas, mrs_being_flushed), + "Failed to flush new tablet metadata"); + + // Replace the compacted rowsets with the new on-disk rowsets, making them visible now that + // their metadata was written to disk. + AtomicSwapRowSets({ inprogress_rowset }, new_disk_rowsets); + + LOG(INFO) << op_name << " successful on " << drsw.written_count() + << " rows " << "(" << drsw.written_size() << " bytes)"; + + if (common_hooks_) { + RETURN_NOT_OK_PREPEND(common_hooks_->PostSwapNewRowSet(), + "PostSwapNewRowSet hook failed"); + } + + return Status::OK(); +} + +Status Tablet::Compact(CompactFlags flags) { + CHECK_EQ(state_, kOpen); + + RowSetsInCompaction input; + // Step 1. Capture the rowsets to be merged + RETURN_NOT_OK_PREPEND(PickRowSetsToCompact(&input, flags), + "Failed to pick rowsets to compact"); + if (input.num_rowsets() < 2) { + VLOG(1) << "Not enough rowsets to run compaction! 
Aborting..."; + return Status::OK(); + } + LOG(INFO) << "Compaction: stage 1 complete, picked " + << input.num_rowsets() << " rowsets to compact"; + if (compaction_hooks_) { + RETURN_NOT_OK_PREPEND(compaction_hooks_->PostSelectIterators(), + "PostSelectIterators hook failed"); + } + + input.DumpToLog(); + + return DoCompactionOrFlush(input, + TabletMetadata::kNoMrsFlushed); +} + +void Tablet::UpdateCompactionStats(MaintenanceOpStats* stats) { + + // TODO: use workload statistics here to find out how "hot" the tablet has + // been in the last 5 minutes, and somehow scale the compaction quality + // based on that, so we favor hot tablets. + double quality = 0; + unordered_set picked_set_ignored; + + shared_ptr rowsets_copy; + { + boost::shared_lock lock(component_lock_); + rowsets_copy = components_->rowsets; + } + + { + boost::lock_guard compact_lock(compact_select_lock_); + WARN_NOT_OK(compaction_policy_->PickRowSets(*rowsets_copy, &picked_set_ignored, &quality, NULL), + Substitute("Couldn't determine compaction quality for $0", tablet_id())); + } + + VLOG(1) << "Best compaction for " << tablet_id() << ": " << quality; + + stats->set_runnable(quality >= 0); + stats->set_perf_improvement(quality); +} + + +Status Tablet::DebugDump(vector *lines) { + boost::shared_lock lock(component_lock_); + + LOG_STRING(INFO, lines) << "Dumping tablet:"; + LOG_STRING(INFO, lines) << "---------------------------"; + + LOG_STRING(INFO, lines) << "MRS " << components_->memrowset->ToString() << ":"; + RETURN_NOT_OK(components_->memrowset->DebugDump(lines)); + + for (const shared_ptr &rs : components_->rowsets->all_rowsets()) { + LOG_STRING(INFO, lines) << "RowSet " << rs->ToString() << ":"; + RETURN_NOT_OK(rs->DebugDump(lines)); + } + + return Status::OK(); +} + +Status Tablet::CaptureConsistentIterators( + const Schema *projection, + const MvccSnapshot &snap, + const ScanSpec *spec, + vector > *iters) const { + boost::shared_lock lock(component_lock_); + + // Construct all the 
iterators locally first, so that if we fail + // in the middle, we don't modify the output arguments. + vector > ret; + + // Grab the memrowset iterator. + gscoped_ptr ms_iter; + RETURN_NOT_OK(components_->memrowset->NewRowIterator(projection, snap, &ms_iter)); + ret.push_back(shared_ptr(ms_iter.release())); + + // Cull row-sets in the case of key-range queries. + if (spec != nullptr && spec->lower_bound_key() && spec->exclusive_upper_bound_key()) { + // TODO : support open-ended intervals + // TODO: the upper bound key is exclusive, but the RowSetTree function takes + // an inclusive interval. So, we might end up fetching one more rowset than + // necessary. + vector interval_sets; + components_->rowsets->FindRowSetsIntersectingInterval( + spec->lower_bound_key()->encoded_key(), + spec->exclusive_upper_bound_key()->encoded_key(), + &interval_sets); + for (const RowSet *rs : interval_sets) { + gscoped_ptr row_it; + RETURN_NOT_OK_PREPEND(rs->NewRowIterator(projection, snap, &row_it), + Substitute("Could not create iterator for rowset $0", + rs->ToString())); + ret.push_back(shared_ptr(row_it.release())); + } + ret.swap(*iters); + return Status::OK(); + } + + // If there are no encoded predicates or they represent an open-ended range, then + // fall back to grabbing all rowset iterators + for (const shared_ptr &rs : components_->rowsets->all_rowsets()) { + gscoped_ptr row_it; + RETURN_NOT_OK_PREPEND(rs->NewRowIterator(projection, snap, &row_it), + Substitute("Could not create iterator for rowset $0", + rs->ToString())); + ret.push_back(shared_ptr(row_it.release())); + } + + // Swap results into the parameters. + ret.swap(*iters); + return Status::OK(); +} + +Status Tablet::CountRows(uint64_t *count) const { + // First grab a consistent view of the components of the tablet. + scoped_refptr comps; + GetComponents(&comps); + + // Now sum up the counts. 
+ *count = comps->memrowset->entry_count(); + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + rowid_t l_count; + RETURN_NOT_OK(rowset->CountRows(&l_count)); + *count += l_count; + } + + return Status::OK(); +} + +size_t Tablet::MemRowSetSize() const { + scoped_refptr comps; + GetComponents(&comps); + + if (comps) { + return comps->memrowset->memory_footprint(); + } + return 0; +} + +bool Tablet::MemRowSetEmpty() const { + scoped_refptr comps; + GetComponents(&comps); + + return comps->memrowset->empty(); +} + +size_t Tablet::MemRowSetLogRetentionSize(const MaxIdxToSegmentMap& max_idx_to_segment_size) const { + scoped_refptr comps; + GetComponents(&comps); + + return GetLogRetentionSizeForIndex(comps->memrowset->MinUnflushedLogIndex(), + max_idx_to_segment_size); +} + +size_t Tablet::EstimateOnDiskSize() const { + scoped_refptr comps; + GetComponents(&comps); + + if (!comps) return 0; + + size_t ret = 0; + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + ret += rowset->EstimateOnDiskSize(); + } + + return ret; +} + +size_t Tablet::DeltaMemStoresSize() const { + scoped_refptr comps; + GetComponents(&comps); + + size_t ret = 0; + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + ret += rowset->DeltaMemStoreSize(); + } + + return ret; +} + +bool Tablet::DeltaMemRowSetEmpty() const { + scoped_refptr comps; + GetComponents(&comps); + + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + if (!rowset->DeltaMemStoreEmpty()) { + return false; + } + } + + return true; +} + +void Tablet::GetInfoForBestDMSToFlush(const MaxIdxToSegmentMap& max_idx_to_segment_size, + int64_t* mem_size, int64_t* retention_size) const { + shared_ptr rowset = FindBestDMSToFlush(max_idx_to_segment_size); + + if (rowset) { + *retention_size = GetLogRetentionSizeForIndex(rowset->MinUnflushedLogIndex(), + max_idx_to_segment_size); + *mem_size = rowset->DeltaMemStoreSize(); + } else { + *retention_size = 0; + *mem_size = 0; + } +} + 
+Status Tablet::FlushDMSWithHighestRetention(const MaxIdxToSegmentMap& + max_idx_to_segment_size) const { + shared_ptr rowset = FindBestDMSToFlush(max_idx_to_segment_size); + if (rowset) { + return rowset->FlushDeltas(); + } + return Status::OK(); +} + +shared_ptr Tablet::FindBestDMSToFlush(const MaxIdxToSegmentMap& + max_idx_to_segment_size) const { + scoped_refptr comps; + GetComponents(&comps); + int64_t mem_size = 0; + int64_t retention_size = 0; + shared_ptr best_dms; + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + if (rowset->DeltaMemStoreEmpty()) { + continue; + } + int64_t size = GetLogRetentionSizeForIndex(rowset->MinUnflushedLogIndex(), + max_idx_to_segment_size); + if ((size > retention_size) || + (size == retention_size && + (rowset->DeltaMemStoreSize() > mem_size))) { + mem_size = rowset->DeltaMemStoreSize(); + retention_size = size; + best_dms = rowset; + } + } + return best_dms; +} + +int64_t Tablet::GetLogRetentionSizeForIndex(int64_t min_log_index, + const MaxIdxToSegmentMap& max_idx_to_segment_size) { + if (max_idx_to_segment_size.size() == 0 || min_log_index == -1) { + return 0; + } + int64_t total_size = 0; + for (const MaxIdxToSegmentMap::value_type& entry : max_idx_to_segment_size) { + if (min_log_index > entry.first) { + continue; // We're not in this segment, probably someone else is retaining it. + } + total_size += entry.second; + } + return total_size; +} + +Status Tablet::FlushBiggestDMS() { + CHECK_EQ(state_, kOpen); + scoped_refptr comps; + GetComponents(&comps); + + int64_t max_size = -1; + shared_ptr biggest_drs; + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + int64_t current = rowset->DeltaMemStoreSize(); + if (current > max_size) { + max_size = current; + biggest_drs = rowset; + } + } + return max_size > 0 ? 
biggest_drs->FlushDeltas() : Status::OK(); +} + +Status Tablet::CompactWorstDeltas(RowSet::DeltaCompactionType type) { + CHECK_EQ(state_, kOpen); + shared_ptr rs; + // We're required to grab the rowset's compact_flush_lock under the compact_select_lock_. + shared_ptr lock; + double perf_improv; + { + // We only want to keep the selection lock during the time we look at rowsets to compact. + // The returned rowset is guaranteed to be available to lock since locking must be done + // under this lock. + boost::lock_guard compact_lock(compact_select_lock_); + perf_improv = GetPerfImprovementForBestDeltaCompactUnlocked(type, &rs); + if (rs) { + lock.reset(new boost::mutex::scoped_try_lock(*rs->compact_flush_lock())); + CHECK(lock->owns_lock()); + } else { + return Status::OK(); + } + } + + // We just released compact_select_lock_ so other compactions can select and run, but the + // rowset is ours. + DCHECK(perf_improv != 0); + if (type == RowSet::MINOR_DELTA_COMPACTION) { + RETURN_NOT_OK_PREPEND(rs->MinorCompactDeltaStores(), + "Failed minor delta compaction on " + rs->ToString()); + } else if (type == RowSet::MAJOR_DELTA_COMPACTION) { + RETURN_NOT_OK_PREPEND(down_cast(rs.get())->MajorCompactDeltaStores(), + "Failed major delta compaction on " + rs->ToString()); + } + return Status::OK(); +} + +double Tablet::GetPerfImprovementForBestDeltaCompact(RowSet::DeltaCompactionType type, + shared_ptr* rs) const { + boost::lock_guard compact_lock(compact_select_lock_); + return GetPerfImprovementForBestDeltaCompactUnlocked(type, rs); +} + +double Tablet::GetPerfImprovementForBestDeltaCompactUnlocked(RowSet::DeltaCompactionType type, + shared_ptr* rs) const { + boost::mutex::scoped_try_lock cs_lock(compact_select_lock_); + DCHECK(!cs_lock.owns_lock()); + scoped_refptr comps; + GetComponents(&comps); + double worst_delta_perf = 0; + shared_ptr worst_rs; + for (const shared_ptr &rowset : comps->rowsets->all_rowsets()) { + if (!rowset->IsAvailableForCompaction()) { + continue; + } 
+ double perf_improv = rowset->DeltaStoresCompactionPerfImprovementScore(type); + if (perf_improv > worst_delta_perf) { + worst_rs = rowset; + worst_delta_perf = perf_improv; + } + } + if (rs && worst_delta_perf > 0) { + *rs = worst_rs; + } + return worst_delta_perf; +} + +size_t Tablet::num_rowsets() const { + boost::shared_lock lock(component_lock_); + return components_->rowsets->all_rowsets().size(); +} + +void Tablet::PrintRSLayout(ostream* o) { + shared_ptr rowsets_copy; + { + boost::shared_lock lock(component_lock_); + rowsets_copy = components_->rowsets; + } + boost::lock_guard compact_lock(compact_select_lock_); + // Run the compaction policy in order to get its log and highlight those + // rowsets which would be compacted next. + vector log; + unordered_set picked; + double quality; + Status s = compaction_policy_->PickRowSets(*rowsets_copy, &picked, &quality, &log); + if (!s.ok()) { + *o << "Error: " << EscapeForHtmlToString(s.ToString()); + return; + } + + if (!picked.empty()) { + *o << "

    "; + *o << "Highlighted rowsets indicate those that would be compacted next if a " + << "compaction were to run on this tablet."; + *o << "

    "; + } + + vector min, max; + RowSetInfo::CollectOrdered(*rowsets_copy, &min, &max); + DumpCompactionSVG(min, picked, o, false); + + *o << "

    Compaction policy log

    " << std::endl; + + *o << "
    " << std::endl;
    +  for (const string& s : log) {
    +    *o << EscapeForHtmlToString(s) << std::endl;
    +  }
    +  *o << "
    " << std::endl; +} + +//////////////////////////////////////////////////////////// +// Tablet::Iterator +//////////////////////////////////////////////////////////// + +Tablet::Iterator::Iterator(const Tablet* tablet, const Schema& projection, + MvccSnapshot snap, const OrderMode order) + : tablet_(tablet), + projection_(projection), + snap_(std::move(snap)), + order_(order), + arena_(256, 4096), + encoder_(&tablet_->key_schema(), &arena_) {} + +Tablet::Iterator::~Iterator() {} + +Status Tablet::Iterator::Init(ScanSpec *spec) { + DCHECK(iter_.get() == nullptr); + + RETURN_NOT_OK(tablet_->GetMappedReadProjection(projection_, &projection_)); + + vector > iters; + if (spec != nullptr) { + VLOG(3) << "Before encoding range preds: " << spec->ToString(); + encoder_.EncodeRangePredicates(spec, true); + VLOG(3) << "After encoding range preds: " << spec->ToString(); + } + + RETURN_NOT_OK(tablet_->CaptureConsistentIterators( + &projection_, snap_, spec, &iters)); + + switch (order_) { + case ORDERED: + iter_.reset(new MergeIterator(projection_, iters)); + break; + case UNORDERED: + default: + iter_.reset(new UnionIterator(iters)); + break; + } + + RETURN_NOT_OK(iter_->Init(spec)); + return Status::OK(); +} + +bool Tablet::Iterator::HasNext() const { + DCHECK(iter_.get() != nullptr) << "Not initialized!"; + return iter_->HasNext(); +} + +Status Tablet::Iterator::NextBlock(RowBlock *dst) { + DCHECK(iter_.get() != nullptr) << "Not initialized!"; + return iter_->NextBlock(dst); +} + +string Tablet::Iterator::ToString() const { + string s; + s.append("tablet iterator: "); + if (iter_.get() == nullptr) { + s.append("NULL"); + } else { + s.append(iter_->ToString()); + } + return s; +} + +void Tablet::Iterator::GetIteratorStats(vector* stats) const { + iter_->GetIteratorStats(stats); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet.h b/src/kudu/tablet/tablet.h new file mode 100644 index 000000000000..3a05f64984ed --- /dev/null +++ 
b/src/kudu/tablet/tablet.h @@ -0,0 +1,630 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TABLET_H +#define KUDU_TABLET_TABLET_H + +#include +#include +#include +#include +#include + +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/predicate_encoder.h" +#include "kudu/common/schema.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/tablet/rowset_metadata.h" +#include "kudu/tablet/tablet_metadata.h" +#include "kudu/tablet/lock_manager.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/tablet/rowset.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" +#include "kudu/util/semaphore.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MemTracker; +class MetricEntity; +class RowChangeList; +class UnionIterator; + +namespace log { +class LogAnchorRegistry; +} + +namespace server { +class Clock; +} + +class MaintenanceManager; +class MaintenanceOp; +class MaintenanceOpStats; + +namespace tablet { + +class AlterSchemaTransactionState; +class CompactionPolicy; +class MemRowSet; +class MvccSnapshot; +struct RowOp; +class RowSetsInCompaction; +class RowSetTree; 
+struct TabletComponents; +struct TabletMetrics; +class WriteTransactionState; + +class Tablet { + public: + typedef std::map MaxIdxToSegmentMap; + friend class CompactRowSetsOp; + friend class FlushMRSOp; + + class CompactionFaultHooks; + class FlushCompactCommonHooks; + class FlushFaultHooks; + class Iterator; + + // Create a new tablet. + // + // If 'metric_registry' is non-NULL, then this tablet will create a 'tablet' entity + // within the provided registry. Otherwise, no metrics are collected. + Tablet(const scoped_refptr& metadata, + const scoped_refptr& clock, + const std::shared_ptr& parent_mem_tracker, + MetricRegistry* metric_registry, + const scoped_refptr& log_anchor_registry); + + ~Tablet(); + + // Open the tablet. + // Upon completion, the tablet enters the kBootstrapping state. + Status Open(); + + // Mark that the tablet has finished bootstrapping. + // This transitions from kBootstrapping to kOpen state. + void MarkFinishedBootstrapping(); + + void Shutdown(); + + // Decode the Write (insert/mutate) operations from within a user's + // request. + Status DecodeWriteOperations(const Schema* client_schema, + WriteTransactionState* tx_state); + + // Acquire locks for each of the operations in the given txn. + // + // Note that, if this fails, it's still possible that the transaction + // state holds _some_ of the locks. In that case, we expect that + // the transaction will still clean them up when it is aborted (or + // otherwise destructed). + Status AcquireRowLocks(WriteTransactionState* tx_state); + + // Finish the Prepare phase of a write transaction. + // + // Starts an MVCC transaction and assigns a timestamp for the transaction. + // This also snapshots the current set of tablet components into the transaction + // state. + // + // This should always be done _after_ any relevant row locks are acquired + // (using CreatePreparedInsert/CreatePreparedMutate). This ensures that, + // within each row, timestamps only move forward. 
If we took a timestamp before + // getting the row lock, we could have the following situation: + // + // Thread 1 | Thread 2 + // ---------------------- + // Start tx 1 | + // | Start tx 2 + // | Obtain row lock + // | Update row + // | Commit tx 2 + // Obtain row lock | + // Delete row | + // Commit tx 1 + // + // This would cause the mutation list to look like: @t1: DELETE, @t2: UPDATE + // which is invalid, since we expect to be able to be able to replay mutations + // in increasing timestamp order on a given row. + // + // This requirement is basically two-phase-locking: the order in which row locks + // are acquired for transactions determines their serialization order. If/when + // we support multi-node serializable transactions, we'll have to acquire _all_ + // row locks (across all nodes) before obtaining a timestamp. + // + // TODO: rename this to something like "FinishPrepare" or "StartApply", since + // it's not the first thing in a transaction! + void StartTransaction(WriteTransactionState* tx_state); + + // Insert a new row into the tablet. + // + // The provided 'data' slice should have length equivalent to this + // tablet's Schema.byte_size(). + // + // After insert, the row and any referred-to memory (eg for strings) + // have been copied into internal memory, and thus the provided memory + // buffer may safely be re-used or freed. + // + // Returns Status::AlreadyPresent() if an entry with the same key is already + // present in the tablet. + // Returns Status::OK unless allocation fails. + // + // Acquires the row lock for the given operation, setting it in the + // RowOp struct. This also sets the row op's RowSetKeyProbe. + Status AcquireLockForOp(WriteTransactionState* tx_state, + RowOp* op); + + // Signal that the given transaction is about to Apply. + void StartApplying(WriteTransactionState* tx_state); + + // Apply all of the row operations associated with this transaction. 
+ void ApplyRowOperations(WriteTransactionState* tx_state); + + // Apply a single row operation, which must already be prepared. + // The result is set back into row_op->result + void ApplyRowOperation(WriteTransactionState* tx_state, + RowOp* row_op); + + // Create a new row iterator which yields the rows as of the current MVCC + // state of this tablet. + // The returned iterator is not initialized. + Status NewRowIterator(const Schema &projection, + gscoped_ptr *iter) const; + + // Whether the iterator should return results in order. + enum OrderMode { + UNORDERED = 0, + ORDERED = 1 + }; + + // Create a new row iterator for some historical snapshot. + Status NewRowIterator(const Schema &projection, + const MvccSnapshot &snap, + const OrderMode order, + gscoped_ptr *iter) const; + + // Flush the current MemRowSet for this tablet to disk. This swaps + // in a new (initially empty) MemRowSet in its place. + // + // This doesn't flush any DeltaMemStores for any existing RowSets. + // To do that, call FlushBiggestDMS() for example. + Status Flush(); + + // Prepares the transaction context for the alter schema operation. + // An error will be returned if the specified schema is invalid (e.g. + // key mismatch, or missing IDs) + Status CreatePreparedAlterSchema(AlterSchemaTransactionState *tx_state, + const Schema* schema); + + // Apply the Schema of the specified transaction. + // This operation will trigger a flush on the current MemRowSet. + Status AlterSchema(AlterSchemaTransactionState* tx_state); + + // Rewind the schema to an earlier version than is written in the on-disk + // metadata. This is done during bootstrap to roll the schema back to the + // point in time where the logs-to-be-replayed begin, so we can then decode + // the operations in the log with the correct schema. 
+ // + // REQUIRES: state_ == kBootstrapping + Status RewindSchemaForBootstrap(const Schema& schema, + int64_t schema_version); + + // Prints current RowSet layout, taking a snapshot of the current RowSet interval + // tree. Also prints the log of the compaction algorithm as evaluated + // on the current layout. + void PrintRSLayout(std::ostream* o); + + // Flags to change the behavior of compaction. + enum CompactFlag { + COMPACT_NO_FLAGS = 0, + + // Force the compaction to include all rowsets, regardless of the + // configured compaction policy. This is currently only used in + // tests. + FORCE_COMPACT_ALL = 1 << 0 + }; + typedef int CompactFlags; + + Status Compact(CompactFlags flags); + + // Update the statistics for performing a compaction. + void UpdateCompactionStats(MaintenanceOpStats* stats); + + // Returns the exact current size of the MRS, in bytes. A value greater than 0 doesn't imply + // that the MRS has data, only that it has allocated that amount of memory. + // This method takes a read lock on component_lock_ and is thread-safe. + size_t MemRowSetSize() const; + + // Returns true if the MRS is empty, else false. Doesn't rely on size and + // actually verifies that the MRS has no elements. + // This method takes a read lock on component_lock_ and is thread-safe. + bool MemRowSetEmpty() const; + + // Returns the size in bytes for the MRS's log retention. + size_t MemRowSetLogRetentionSize(const MaxIdxToSegmentMap& max_idx_to_segment_size) const; + + // Estimate the total on-disk size of this tablet, in bytes. + size_t EstimateOnDiskSize() const; + + // Get the total size of all the DMS + size_t DeltaMemStoresSize() const; + + // Same as MemRowSetEmpty(), but for the DMS. + bool DeltaMemRowSetEmpty() const; + + // Fills in the in-memory size and retention size in bytes for the DMS with the + // highest retention. 
+ void GetInfoForBestDMSToFlush(const MaxIdxToSegmentMap& max_idx_to_segment_size, + int64_t* mem_size, int64_t* retention_size) const; + + // Flushes the DMS with the highest retention. + Status FlushDMSWithHighestRetention(const MaxIdxToSegmentMap& max_idx_to_segment_size) const; + + // Flush only the biggest DMS + Status FlushBiggestDMS(); + + // Finds the RowSet which has the most separate delta files and + // issues a minor delta compaction. + Status CompactWorstDeltas(RowSet::DeltaCompactionType type); + + // Get the highest performance improvement that would come from compacting the delta stores + // of one of the rowsets. If the returned performance improvement is 0, or if 'rs' is NULL, + // then 'rs' isn't set. Callers who already own compact_select_lock_ + // can call GetPerfImprovementForBestDeltaCompactUnlocked(). + double GetPerfImprovementForBestDeltaCompact(RowSet::DeltaCompactionType type, + std::shared_ptr* rs) const; + + // Same as GetPerfImprovementForBestDeltaCompact(), but doesn't take a lock on + // compact_select_lock_. + double GetPerfImprovementForBestDeltaCompactUnlocked(RowSet::DeltaCompactionType type, + std::shared_ptr* rs) const; + + // Return the current number of rowsets in the tablet. + size_t num_rowsets() const; + + // Attempt to count the total number of rows in the tablet. + // This is not super-efficient since it must iterate over the + // memrowset in the current implementation. + Status CountRows(uint64_t *count) const; + + + // Verbosely dump this entire tablet to the logs. This is only + // really useful when debugging unit tests failures where the tablet + // has a very small number of rows. + Status DebugDump(vector *lines = NULL); + + const Schema* schema() const { + return &metadata_->schema(); + } + + // Returns a reference to the key projection of the tablet schema. + // The schema keys are immutable. + const Schema& key_schema() const { return key_schema_; } + + // Return the MVCC manager for this tablet. 
+ MvccManager* mvcc_manager() { return &mvcc_; } + + // Return the Lock Manager for this tablet + LockManager* lock_manager() { return &lock_manager_; } + + const TabletMetadata *metadata() const { return metadata_.get(); } + TabletMetadata *metadata() { return metadata_.get(); } + + void SetCompactionHooksForTests(const std::shared_ptr &hooks); + void SetFlushHooksForTests(const std::shared_ptr &hooks); + void SetFlushCompactCommonHooksForTests( + const std::shared_ptr &hooks); + + // Returns the current MemRowSet id, for tests. + // This method takes a read lock on component_lock_ and is thread-safe. + int32_t CurrentMrsIdForTests() const; + + // Runs a major delta major compaction on columns with specified IDs. + // NOTE: RowSet must presently be a DiskRowSet. (Perhaps the API should be + // a shared_ptr API for now?) + // + // TODO: Handle MVCC to support MemRowSet and handle deltas in DeltaMemStore + Status DoMajorDeltaCompaction(const std::vector& column_ids, + std::shared_ptr input_rowset); + + // Method used by tests to retrieve all rowsets of this table. This + // will be removed once code for selecting the appropriate RowSet is + // finished and delta files is finished is part of Tablet class. + void GetRowSetsForTests(vector >* out); + + // Register the maintenance ops associated with this tablet + void RegisterMaintenanceOps(MaintenanceManager* maintenance_manager); + + // Unregister the maintenance ops associated with this tablet. + // This method is not thread safe. + void UnregisterMaintenanceOps(); + + const std::string& tablet_id() const { return metadata_->tablet_id(); } + + // Return the metrics for this tablet. + // May be NULL in unit tests, etc. + TabletMetrics* metrics() { return metrics_.get(); } + + // Return handle to the metric entity of this tablet. + const scoped_refptr& GetMetricEntity() const { return metric_entity_; } + + // Returns a reference to this tablet's memory tracker. 
+ const std::shared_ptr& mem_tracker() const { return mem_tracker_; } + + static const char* kDMSMemTrackerId; + private: + friend class Iterator; + friend class TabletPeerTest; + FRIEND_TEST(TestTablet, TestGetLogRetentionSizeForIndex); + + Status FlushUnlocked(); + + // A version of Insert that does not acquire locks and instead assumes that + // they were already acquired. Requires that handles for the relevant locks + // and MVCC transaction are present in the transaction state. + Status InsertUnlocked(WriteTransactionState *tx_state, + RowOp* insert); + + // A version of MutateRow that does not acquire locks and instead assumes + // they were already acquired. Requires that handles for the relevant locks + // and MVCC transaction are present in the transaction state. + Status MutateRowUnlocked(WriteTransactionState *tx_state, + RowOp* mutate); + + // Capture a set of iterators which, together, reflect all of the data in the tablet. + // + // These iterators are not true snapshot iterators, but they are safe against + // concurrent modification. They will include all data that was present at the time + // of creation, and potentially newer data. + // + // The returned iterators are not Init()ed. + // 'projection' must remain valid and unchanged for the lifetime of the returned iterators. 
+ Status CaptureConsistentIterators(const Schema *projection, + const MvccSnapshot &snap, + const ScanSpec *spec, + vector > *iters) const; + + Status PickRowSetsToCompact(RowSetsInCompaction *picked, + CompactFlags flags) const; + + Status DoCompactionOrFlush(const RowSetsInCompaction &input, + int64_t mrs_being_flushed); + + Status FlushMetadata(const RowSetVector& to_remove, + const RowSetMetadataVector& to_add, + int64_t mrs_being_flushed); + + static void ModifyRowSetTree(const RowSetTree& old_tree, + const RowSetVector& rowsets_to_remove, + const RowSetVector& rowsets_to_add, + RowSetTree* new_tree); + + // Swap out a set of rowsets, atomically replacing them with the new rowset + // under the lock. + void AtomicSwapRowSets(const RowSetVector &to_remove, + const RowSetVector &to_add); + + // Same as the above, but without taking the lock. This should only be used + // in cases where the lock is already held. + void AtomicSwapRowSetsUnlocked(const RowSetVector &to_remove, + const RowSetVector &to_add); + + void GetComponents(scoped_refptr* comps) const { + boost::shared_lock lock(component_lock_); + *comps = components_; + } + + // Create a new MemRowSet, replacing the current one. + // The 'old_ms' pointer will be set to the current MemRowSet set before the replacement. + // If the MemRowSet is not empty it will be added to the 'compaction' input + // and the MemRowSet compaction lock will be taken to prevent the inclusion + // in any concurrent compactions. + Status ReplaceMemRowSetUnlocked(RowSetsInCompaction *compaction, + std::shared_ptr *old_ms); + + // TODO: Document me. + Status FlushInternal(const RowSetsInCompaction& input, + const std::shared_ptr& old_ms); + + BloomFilterSizing bloom_sizing() const; + + // Convert the specified read client schema (without IDs) to a server schema (with IDs) + // This method is used by NewRowIterator(). 
+ Status GetMappedReadProjection(const Schema& projection, + Schema *mapped_projection) const; + + Status CheckRowInTablet(const ConstContiguousRow& probe) const; + + // Helper method to find the rowset that has the DMS with the highest retention. + std::shared_ptr FindBestDMSToFlush( + const MaxIdxToSegmentMap& max_idx_to_segment_size) const; + + // Helper method to find how many bytes this index retains. + static int64_t GetLogRetentionSizeForIndex(int64_t min_log_index, + const MaxIdxToSegmentMap& max_idx_to_segment_size); + + // Lock protecting schema_ and key_schema_. + // + // Writers take this lock in shared mode before decoding and projecting + // their requests. They hold the lock until after APPLY. + // + // Readers take this lock in shared mode only long enough to copy the + // current schema into the iterator, after which all projection is taken + // care of based on that copy. + // + // On an AlterSchema, this is taken in exclusive mode during Prepare() and + // released after the schema change has been applied. + mutable rw_semaphore schema_lock_; + + const Schema key_schema_; + + scoped_refptr metadata_; + + // Lock protecting access to the 'components_' member (i.e the rowsets in the tablet) + // + // Shared mode: + // - Writers take this in shared mode at the same time as they obtain an MVCC timestamp + // and capture a reference to components_. This ensures that we can use the MVCC timestamp + // to determine which writers are writing to which components during compaction. + // - Readers take this in shared mode while capturing their iterators. This ensures that + // they see a consistent view when racing against flush/compact. + // + // Exclusive mode: + // - Flushes/compactions take this lock in order to lock out concurrent updates when + // swapping in a new memrowset. + // + // NOTE: callers should avoid taking this lock for a long time, even in shared mode. 
+ // This is because the lock has some concept of fairness -- if, while a long reader + // is active, a writer comes along, then all future short readers will be blocked. + // TODO: now that this is single-threaded again, we should change it to rw_spinlock + mutable rw_spinlock component_lock_; + + // The current components of the tablet. These should always be read + // or swapped under the component_lock. + scoped_refptr components_; + + scoped_refptr log_anchor_registry_; + std::shared_ptr mem_tracker_; + std::shared_ptr dms_mem_tracker_; + + scoped_refptr metric_entity_; + gscoped_ptr metrics_; + FunctionGaugeDetacher metric_detacher_; + + int64_t next_mrs_id_; + + // A pointer to the server's clock. + scoped_refptr clock_; + + MvccManager mvcc_; + LockManager lock_manager_; + + gscoped_ptr compaction_policy_; + + + // Lock protecting the selection of rowsets for compaction. + // Only one thread may run the compaction selection algorithm at a time + // so that they don't both try to select the same rowset. + mutable boost::mutex compact_select_lock_; + + // We take this lock when flushing the tablet's rowsets in Tablet::Flush. We + // don't want to have two flushes in progress at once, in case the one which + // started earlier completes after the one started later. + mutable Semaphore rowsets_flush_sem_; + + enum State { + kInitialized, + kBootstrapping, + kOpen, + kShutdown + }; + State state_; + + // Fault hooks. In production code, these will always be NULL. + std::shared_ptr compaction_hooks_; + std::shared_ptr flush_hooks_; + std::shared_ptr common_hooks_; + + std::vector maintenance_ops_; + + DISALLOW_COPY_AND_ASSIGN(Tablet); +}; + + +// Hooks used in test code to inject faults or other code into interesting +// parts of the compaction code. 
+class Tablet::CompactionFaultHooks { + public: + virtual Status PostSelectIterators() { return Status::OK(); } + virtual ~CompactionFaultHooks() {} +}; + +class Tablet::FlushCompactCommonHooks { + public: + virtual Status PostTakeMvccSnapshot() { return Status::OK(); } + virtual Status PostWriteSnapshot() { return Status::OK(); } + virtual Status PostSwapInDuplicatingRowSet() { return Status::OK(); } + virtual Status PostReupdateMissedDeltas() { return Status::OK(); } + virtual Status PostSwapNewRowSet() { return Status::OK(); } + virtual ~FlushCompactCommonHooks() {} +}; + +// Hooks used in test code to inject faults or other code into interesting +// parts of the Flush() code. +class Tablet::FlushFaultHooks { + public: + virtual Status PostSwapNewMemRowSet() { return Status::OK(); } + virtual ~FlushFaultHooks() {} +}; + +class Tablet::Iterator : public RowwiseIterator { + public: + virtual ~Iterator(); + + virtual Status Init(ScanSpec *spec) OVERRIDE; + + virtual bool HasNext() const OVERRIDE; + + virtual Status NextBlock(RowBlock *dst) OVERRIDE; + + std::string ToString() const OVERRIDE; + + const Schema &schema() const OVERRIDE { + return projection_; + } + + virtual void GetIteratorStats(std::vector* stats) const OVERRIDE; + + private: + friend class Tablet; + + DISALLOW_COPY_AND_ASSIGN(Iterator); + + Iterator(const Tablet* tablet, const Schema& projection, MvccSnapshot snap, + const OrderMode order); + + const Tablet *tablet_; + Schema projection_; + const MvccSnapshot snap_; + const OrderMode order_; + gscoped_ptr iter_; + + // TODO: we could probably share an arena with the Scanner object inside the + // tserver, but piping it in would require changing a lot of call-sites. + Arena arena_; + RangePredicateEncoder encoder_; +}; + +// Structure which represents the components of the tablet's storage. +// This structure is immutable -- a transaction can grab it and be sure +// that it won't change. 
+struct TabletComponents : public RefCountedThreadSafe { + TabletComponents(std::shared_ptr mrs, + std::shared_ptr rs_tree); + const std::shared_ptr memrowset; + const std::shared_ptr rowsets; +}; + +} // namespace tablet +} // namespace kudu + +#endif diff --git a/src/kudu/tablet/tablet.proto b/src/kudu/tablet/tablet.proto new file mode 100644 index 000000000000..ae02e35ad0b5 --- /dev/null +++ b/src/kudu/tablet/tablet.proto @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.tablet; + +option java_package = "org.kududb.tablet"; + +import "kudu/common/common.proto"; +import "kudu/common/wire_protocol.proto"; +import "kudu/tablet/metadata.proto"; + +// Stores the id of the MemRowSet (for inserts or mutations against MRS) +// or of the (row set, delta ID) pair for mutations against a DiskRowSet. +message MemStoreTargetPB { + // -1 defaults here are so that, if a caller forgets to check has_mrs_id(), + // they won't accidentally see real-looking (i.e 0) IDs. + + // Either this field... + optional int64 mrs_id = 1 [ default = -1]; + + // ... or both of the following fields are set. 
+ optional int64 rs_id = 2 [ default = -1 ]; + optional int64 dms_id = 3 [ default = -1 ]; +} + +// Stores the result of an Insert or Mutate. +message OperationResultPB { + + // set on replay if this operation was already flushed. + optional bool flushed = 1 [ default = false ]; + + // set if this particular operation failed + optional kudu.AppStatusPB failed_status = 2; + + // The stores that the operation affected. + // For INSERTs, this will always be just one store. + // For MUTATE, it may be more than one if the mutation arrived during + // a compaction. + repeated MemStoreTargetPB mutated_stores = 3; +} + +// The final result of a transaction, including the result of each individual +// operation. +message TxResultPB { + // all the operations in this transaction + repeated OperationResultPB ops = 1; +} + +// Delta statistics for a flushed deltastore +message DeltaStatsPB { + // Number of deletes (deletes result in deletion of an entire row) + required int64 delete_count = 1; + + // REMOVED: replaced by column_stats, which maps by column ID, + // whereas this older version mapped by index. + // repeated int64 per_column_update_count = 2; + + // The min Timestamp that was stored in this delta. + required fixed64 min_timestamp = 3; + // The max Timestamp that was stored in this delta. + required fixed64 max_timestamp = 4; + + // Per-column statistics about this delta file. + message ColumnStats { + // The column ID. + required int32 col_id = 1; + // The number of updates which refer to this column ID. + optional int64 update_count = 2 [ default = 0 ]; + } + repeated ColumnStats column_stats = 5; +} + +message TabletStatusPB { + required string tablet_id = 1; + required string table_name = 2; + optional TabletStatePB state = 3 [ default = UNKNOWN ]; + optional tablet.TabletDataState tablet_data_state = 8 [ default = TABLET_DATA_UNKNOWN ]; + required string last_status = 4; + // DEPRECATED. + optional bytes start_key = 5; + // DEPRECATED. 
+ optional bytes end_key = 6; + optional PartitionPB partition = 9; + optional int64 estimated_on_disk_size = 7; +} + +// Used to present the maintenance manager's internal state. +message MaintenanceManagerStatusPB { + message MaintenanceOpPB { + required string name = 1; + // Number of times this operation is currently running. + required uint32 running = 2; + required bool runnable = 3; + required uint64 ram_anchored_bytes = 4; + required int64 logs_retained_bytes = 5; + required double perf_improvement = 6; + } + + message CompletedOpPB { + required string name = 1; + required int32 duration_millis = 2; + // Number of seconds since this operation started. + required int32 secs_since_start = 3; + } + + // The next operation that would run. + optional MaintenanceOpPB best_op = 1; + + // List of all the operations. + repeated MaintenanceOpPB registered_operations = 2; + + // This list isn't in order of anything. Can contain the same operation mutiple times. + repeated CompletedOpPB completed_operations = 3; +} diff --git a/src/kudu/tablet/tablet_bootstrap-test.cc b/src/kudu/tablet/tablet_bootstrap-test.cc new file mode 100644 index 000000000000..d4863d1ca158 --- /dev/null +++ b/src/kudu/tablet/tablet_bootstrap-test.cc @@ -0,0 +1,555 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/consensus/log-test-base.h" + +#include + +#include "kudu/common/iterator.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/consensus-test-util.h" +#include "kudu/server/logical_clock.h" +#include "kudu/server/metadata.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/tablet/tablet_metadata.h" + +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { + +namespace log { + +extern const char* kTestTable; +extern const char* kTestTablet; + +} // namespace log + +namespace tablet { + +using consensus::ConsensusBootstrapInfo; +using consensus::ConsensusMetadata; +using consensus::kMinimumTerm; +using consensus::MakeOpId; +using consensus::OpId; +using consensus::ReplicateMsg; +using consensus::ReplicateRefPtr; +using consensus::make_scoped_refptr_replicate; +using log::Log; +using log::LogAnchorRegistry; +using log::LogTestBase; +using log::ReadableLogSegment; +using server::Clock; +using server::LogicalClock; +using tserver::WriteRequestPB; + +class BootstrapTest : public LogTestBase { + protected: + + void SetUp() OVERRIDE { + LogTestBase::SetUp(); + } + + Status LoadTestTabletMetadata(int mrs_id, int delta_id, scoped_refptr* meta) { + Schema schema = SchemaBuilder(schema_).Build(); + std::pair partition = CreateDefaultPartition(schema); + + RETURN_NOT_OK(TabletMetadata::LoadOrCreate(fs_manager_.get(), + log::kTestTablet, + log::kTestTable, + schema, + partition.first, + partition.second, + TABLET_DATA_READY, + meta)); + (*meta)->SetLastDurableMrsIdForTests(mrs_id); + if ((*meta)->GetRowSetForTests(0) != nullptr) { + (*meta)->GetRowSetForTests(0)->SetLastDurableRedoDmsIdForTests(delta_id); + } + return 
(*meta)->Flush(); + } + + Status PersistTestTabletMetadataState(TabletDataState state) { + scoped_refptr meta; + RETURN_NOT_OK(LoadTestTabletMetadata(-1, -1, &meta)); + meta->set_tablet_data_state(state); + RETURN_NOT_OK(meta->Flush()); + return Status::OK(); + } + + Status RunBootstrapOnTestTablet(const scoped_refptr& meta, + shared_ptr* tablet, + ConsensusBootstrapInfo* boot_info) { + gscoped_ptr listener(new TabletStatusListener(meta)); + scoped_refptr log_anchor_registry(new LogAnchorRegistry()); + // Now attempt to recover the log + RETURN_NOT_OK(BootstrapTablet( + meta, + scoped_refptr(LogicalClock::CreateStartingAt(Timestamp::kInitialTimestamp)), + shared_ptr(), + NULL, + listener.get(), + tablet, + &log_, + log_anchor_registry, + boot_info)); + + return Status::OK(); + } + + Status BootstrapTestTablet(int mrs_id, + int delta_id, + shared_ptr* tablet, + ConsensusBootstrapInfo* boot_info) { + scoped_refptr meta; + RETURN_NOT_OK_PREPEND(LoadTestTabletMetadata(mrs_id, delta_id, &meta), + "Unable to load test tablet metadata"); + + consensus::RaftConfigPB config; + config.set_local(true); + config.add_peers()->set_permanent_uuid(meta->fs_manager()->uuid()); + config.set_opid_index(consensus::kInvalidOpIdIndex); + + gscoped_ptr cmeta; + RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(meta->fs_manager(), meta->tablet_id(), + meta->fs_manager()->uuid(), + config, kMinimumTerm, &cmeta), + "Unable to create consensus metadata"); + + RETURN_NOT_OK_PREPEND(RunBootstrapOnTestTablet(meta, tablet, boot_info), + "Unable to bootstrap test tablet"); + return Status::OK(); + } + + void IterateTabletRows(const Tablet* tablet, + vector* results) { + gscoped_ptr iter; + // TODO: there seems to be something funny with timestamps in this test. + // Unless we explicitly scan at a snapshot including all timestamps, we don't + // see the bootstrapped operation. This is likely due to KUDU-138 -- perhaps + // we aren't properly setting up the clock after bootstrap. 
+ MvccSnapshot snap = MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + ASSERT_OK(tablet->NewRowIterator(schema_, snap, Tablet::UNORDERED, &iter)); + ASSERT_OK(iter->Init(nullptr)); + ASSERT_OK(IterateToStringList(iter.get(), results)); + for (const string& result : *results) { + VLOG(1) << result; + } + } +}; + +// Tests a normal bootstrap scenario +TEST_F(BootstrapTest, TestBootstrap) { + BuildLog(); + + AppendReplicateBatch(MakeOpId(1, current_index_)); + ASSERT_OK(RollLog()); + + AppendCommit(MakeOpId(1, current_index_)); + + shared_ptr tablet; + ConsensusBootstrapInfo boot_info; + ASSERT_OK(BootstrapTestTablet(-1, -1, &tablet, &boot_info)); + + vector results; + IterateTabletRows(tablet.get(), &results); + ASSERT_EQ(1, results.size()); +} + +// Tests attempting a local bootstrap of a tablet that was in the middle of a +// remote bootstrap before "crashing". +TEST_F(BootstrapTest, TestIncompleteRemoteBootstrap) { + BuildLog(); + + ASSERT_OK(PersistTestTabletMetadataState(TABLET_DATA_COPYING)); + shared_ptr tablet; + ConsensusBootstrapInfo boot_info; + Status s = BootstrapTestTablet(-1, -1, &tablet, &boot_info); + ASSERT_TRUE(s.IsCorruption()) << "Expected corruption: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "TabletMetadata bootstrap state is TABLET_DATA_COPYING"); + LOG(INFO) << "State is still TABLET_DATA_COPYING, as expected: " << s.ToString(); +} + +// Tests the KUDU-141 scenario: bootstrap when there is +// an orphaned commit after a log roll. +// The test simulates the following scenario: +// +// 1) 'Replicate A' is written to Segment_1, which is anchored +// on MemRowSet_1. +// 2) Segment_1 is rolled, 'Commit A' is written to Segment_2. +// 3) MemRowSet_1 is flushed, releasing all anchors. +// 4) Segment_1 is garbage collected. +// 5) We crash, requiring a recovery of Segment_2 which now contains +// the orphan 'Commit A'. 
+TEST_F(BootstrapTest, TestOrphanCommit) { + BuildLog(); + + OpId opid = MakeOpId(1, current_index_); + + // Step 1) Write a REPLICATE to the log, and roll it. + AppendReplicateBatch(opid); + ASSERT_OK(RollLog()); + + // Step 2) Write the corresponding COMMIT in the second segment. + AppendCommit(opid); + + { + shared_ptr tablet; + ConsensusBootstrapInfo boot_info; + + // Step 3) Apply the operations in the log to the tablet and flush + // the tablet to disk. + ASSERT_OK(BootstrapTestTablet(-1, -1, &tablet, &boot_info)); + ASSERT_OK(tablet->Flush()); + + // Create a new log segment. + ASSERT_OK(RollLog()); + + // Step 4) Create an orphanned commit by first adding a commit to + // the newly rolled logfile, and then by removing the previous + // commits. + AppendCommit(opid); + log::SegmentSequence segments; + ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments)); + fs_manager_->env()->DeleteFile(segments[0]->path()); + } + { + shared_ptr tablet; + ConsensusBootstrapInfo boot_info; + + // Note: when GLOG_v=1, the test logs should include 'Ignoring + // orphan commit: op_type: WRITE_OP...' line. + ASSERT_OK(BootstrapTestTablet(2, 1, &tablet, &boot_info)); + + // Confirm that the legitimate data (from Step 3) is still there. + vector results; + IterateTabletRows(tablet.get(), &results); + ASSERT_EQ(1, results.size()); + ASSERT_EQ("(int32 key=1, int32 int_val=0, string string_val=this is a test insert)", + results[0]); + ASSERT_EQ(2, tablet->metadata()->last_durable_mrs_id()); + } +} + +// Tests this scenario: +// Orphan COMMIT with id <= current mrs id, followed by a REPLICATE +// message with mrs_id > current mrs_id, and a COMMIT message for that +// REPLICATE message. +// +// This should result in the orphan COMMIT being ignored, but the last +// REPLICATE/COMMIT messages ending up in the tablet. 
+// NOTE(review): template arguments in this block were reconstructed — the
+// patch text had all "<...>" spans stripped. Verify against upstream Kudu.
+TEST_F(BootstrapTest, TestNonOrphansAfterOrphanCommit) {
+  BuildLog();
+
+  OpId opid = MakeOpId(1, current_index_);
+
+  AppendReplicateBatch(opid);
+  ASSERT_OK(RollLog());
+
+  AppendCommit(opid);
+
+  log::SegmentSequence segments;
+  ASSERT_OK(log_->GetLogReader()->GetSegmentsSnapshot(&segments));
+  // Check the Status: the orphaning below depends on this delete succeeding.
+  ASSERT_OK(fs_manager_->env()->DeleteFile(segments[0]->path()));
+
+  current_index_ += 2;
+
+  opid = MakeOpId(1, current_index_);
+
+  AppendReplicateBatch(opid);
+  AppendCommit(opid, 2, 1, 0);
+
+  shared_ptr<Tablet> tablet;
+  ConsensusBootstrapInfo boot_info;
+  ASSERT_OK(BootstrapTestTablet(1, 0, &tablet, &boot_info));
+
+  // Confirm that the legitimate data is there.
+  vector<string> results;
+  IterateTabletRows(tablet.get(), &results);
+  ASSERT_EQ(1, results.size());
+
+  // 'key=3' means the REPLICATE message was inserted when current_id_ was 3, meaning
+  // that only the non-orphan commit went in.
+  ASSERT_EQ("(int32 key=3, int32 int_val=0, string string_val=this is a test insert)",
+            results[0]);
+}
+
+// Test for where the server crashes in between REPLICATE and COMMIT.
+// Bootstrap should not replay the operation, but should return it in
+// the ConsensusBootstrapInfo
+TEST_F(BootstrapTest, TestOrphanedReplicate) {
+  BuildLog();
+
+  // Append a REPLICATE with no commit
+  int replicate_index = current_index_++;
+
+  OpId opid = MakeOpId(1, replicate_index);
+
+  AppendReplicateBatch(opid);
+
+  // Bootstrap the tablet. It shouldn't replay anything.
+  ConsensusBootstrapInfo boot_info;
+  shared_ptr<Tablet> tablet;
+  ASSERT_OK(BootstrapTestTablet(0, 0, &tablet, &boot_info));
+
+  // Table should be empty because we didn't replay the REPLICATE
+  vector<string> results;
+  IterateTabletRows(tablet.get(), &results);
+  ASSERT_EQ(0, results.size());
+
+  // The consensus bootstrap info should include the orphaned REPLICATE.
+  ASSERT_EQ(1, boot_info.orphaned_replicates.size());
+  ASSERT_STR_CONTAINS(boot_info.orphaned_replicates[0]->ShortDebugString(),
+                      "this is a test mutate");
+
+  // And it should also include the latest opids.
+  EXPECT_EQ("term: 1 index: 1", boot_info.last_id.ShortDebugString());
+}
+
+// Bootstrap should fail if no ConsensusMetadata file exists.
+TEST_F(BootstrapTest, TestMissingConsensusMetadata) {
+  BuildLog();
+
+  scoped_refptr<TabletMetadata> meta;
+  ASSERT_OK(LoadTestTabletMetadata(-1, -1, &meta));
+
+  shared_ptr<Tablet> tablet;
+  ConsensusBootstrapInfo boot_info;
+  Status s = RunBootstrapOnTestTablet(meta, &tablet, &boot_info);
+
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_STR_CONTAINS(s.ToString(), "Unable to load Consensus metadata");
+}
+
+// Tests that a REPLICATE in an old term that overwrites previously-logged
+// entries (Raft log truncation) leaves the overwriting op pending and keeps
+// only the data from the committed op.
+TEST_F(BootstrapTest, TestOperationOverwriting) {
+  BuildLog();
+
+  OpId opid = MakeOpId(1, 1);
+
+  // Append a replicate in term 1
+  AppendReplicateBatch(opid);
+
+  // Append a commit for op 1.1
+  AppendCommit(opid);
+
+  // Now append replicates for 4.2 and 4.3
+  AppendReplicateBatch(MakeOpId(4, 2));
+  AppendReplicateBatch(MakeOpId(4, 3));
+
+  ASSERT_OK(RollLog());
+  // And overwrite with 3.2
+  AppendReplicateBatch(MakeOpId(3, 2), true);
+
+  // When bootstrapping we should apply ops 1.1 and get 3.2 as pending.
+  ConsensusBootstrapInfo boot_info;
+  shared_ptr<Tablet> tablet;
+  ASSERT_OK(BootstrapTestTablet(-1, -1, &tablet, &boot_info));
+
+  ASSERT_EQ(boot_info.orphaned_replicates.size(), 1);
+  ASSERT_OPID_EQ(boot_info.orphaned_replicates[0]->id(), MakeOpId(3, 2));
+
+  // Confirm that the legitimate data is there.
+  vector<string> results;
+  IterateTabletRows(tablet.get(), &results);
+  ASSERT_EQ(1, results.size());
+
+  ASSERT_EQ("(int32 key=1, int32 int_val=0, string string_val=this is a test insert)",
+            results[0]);
+}
+
+// Tests that when we have out-of-order commits that touch the same rows, operations are
+// still applied and in the correct order.
+// NOTE(review): template arguments in this block were reconstructed — the
+// patch text had all "<...>" spans stripped. Verify against upstream Kudu.
+TEST_F(BootstrapTest, TestOutOfOrderCommits) {
+  BuildLog();
+
+  consensus::ReplicateRefPtr replicate = consensus::make_scoped_refptr_replicate(
+      new consensus::ReplicateMsg());
+  replicate->get()->set_op_type(consensus::WRITE_OP);
+  tserver::WriteRequestPB* batch_request = replicate->get()->mutable_write_request();
+  ASSERT_OK(SchemaToPB(schema_, batch_request->mutable_schema()));
+  batch_request->set_tablet_id(log::kTestTablet);
+
+  // This appends Insert(1) with op 10.10
+  OpId insert_opid = MakeOpId(10, 10);
+  replicate->get()->mutable_id()->CopyFrom(insert_opid);
+  replicate->get()->set_timestamp(clock_->Now().ToUint64());
+  AddTestRowToPB(RowOperationsPB::INSERT, schema_, 10, 1,
+                 "this is a test insert", batch_request->mutable_row_operations());
+  AppendReplicateBatch(replicate, true);
+
+  // This appends Mutate(1) with op 10.11
+  OpId mutate_opid = MakeOpId(10, 11);
+  batch_request->mutable_row_operations()->Clear();
+  replicate->get()->mutable_id()->CopyFrom(mutate_opid);
+  replicate->get()->set_timestamp(clock_->Now().ToUint64());
+  AddTestRowToPB(RowOperationsPB::UPDATE, schema_,
+                 10, 2, "this is a test mutate",
+                 batch_request->mutable_row_operations());
+  AppendReplicateBatch(replicate, true);
+
+  // Now commit the mutate before the insert (in the log).
+  gscoped_ptr<consensus::CommitMsg> mutate_commit(new consensus::CommitMsg);
+  mutate_commit->set_op_type(consensus::WRITE_OP);
+  mutate_commit->mutable_commited_op_id()->CopyFrom(mutate_opid);
+  TxResultPB* result = mutate_commit->mutable_result();
+  OperationResultPB* mutate = result->add_ops();
+  MemStoreTargetPB* target = mutate->add_mutated_stores();
+  target->set_mrs_id(1);
+
+  AppendCommit(mutate_commit.Pass());
+
+  gscoped_ptr<consensus::CommitMsg> insert_commit(new consensus::CommitMsg);
+  insert_commit->set_op_type(consensus::WRITE_OP);
+  insert_commit->mutable_commited_op_id()->CopyFrom(insert_opid);
+  result = insert_commit->mutable_result();
+  OperationResultPB* insert = result->add_ops();
+  target = insert->add_mutated_stores();
+  target->set_mrs_id(1);
+
+  AppendCommit(insert_commit.Pass());
+
+  ConsensusBootstrapInfo boot_info;
+  shared_ptr<Tablet> tablet;
+  ASSERT_OK(BootstrapTestTablet(-1, -1, &tablet, &boot_info));
+
+  // Confirm that both operations were applied.
+  vector<string> results;
+  IterateTabletRows(tablet.get(), &results);
+  ASSERT_EQ(1, results.size());
+
+  ASSERT_EQ("(int32 key=10, int32 int_val=2, string string_val=this is a test mutate)",
+            results[0]);
+}
+
+// Tests that when we have two consecutive replicates but the commit message for the
+// first one is missing, both appear as pending in ConsensusInfo.
+TEST_F(BootstrapTest, TestMissingCommitMessage) { + BuildLog(); + + consensus::ReplicateRefPtr replicate = consensus::make_scoped_refptr_replicate( + new consensus::ReplicateMsg()); + replicate->get()->set_op_type(consensus::WRITE_OP); + tserver::WriteRequestPB* batch_request = replicate->get()->mutable_write_request(); + ASSERT_OK(SchemaToPB(schema_, batch_request->mutable_schema())); + batch_request->set_tablet_id(log::kTestTablet); + + // This appends Insert(1) with op 10.10 + OpId insert_opid = MakeOpId(10, 10); + replicate->get()->mutable_id()->CopyFrom(insert_opid); + replicate->get()->set_timestamp(clock_->Now().ToUint64()); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 10, 1, + "this is a test insert", batch_request->mutable_row_operations()); + AppendReplicateBatch(replicate, true); + + // This appends Mutate(1) with op 10.11 + OpId mutate_opid = MakeOpId(10, 11); + batch_request->mutable_row_operations()->Clear(); + replicate->get()->mutable_id()->CopyFrom(mutate_opid); + replicate->get()->set_timestamp(clock_->Now().ToUint64()); + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, + 10, 2, "this is a test mutate", + batch_request->mutable_row_operations()); + AppendReplicateBatch(replicate, true); + + // Now commit the mutate before the insert (in the log). + gscoped_ptr mutate_commit(new consensus::CommitMsg); + mutate_commit->set_op_type(consensus::WRITE_OP); + mutate_commit->mutable_commited_op_id()->CopyFrom(mutate_opid); + TxResultPB* result = mutate_commit->mutable_result(); + OperationResultPB* mutate = result->add_ops(); + MemStoreTargetPB* target = mutate->add_mutated_stores(); + target->set_mrs_id(1); + + AppendCommit(mutate_commit.Pass()); + + ConsensusBootstrapInfo boot_info; + shared_ptr tablet; + ASSERT_OK(BootstrapTestTablet(-1, -1, &tablet, &boot_info)); + ASSERT_EQ(boot_info.orphaned_replicates.size(), 2); + ASSERT_OPID_EQ(boot_info.last_committed_id, mutate_opid); + + // Confirm that no operation was applied. 
+ vector results; + IterateTabletRows(tablet.get(), &results); + ASSERT_EQ(0, results.size()); +} + +// Test that we do not crash when a consensus-only operation has a timestamp +// that is higher than a timestamp assigned to a write operation that follows +// it in the log. +TEST_F(BootstrapTest, TestConsensusOnlyOperationOutOfOrderTimestamp) { + BuildLog(); + + // Append NO_OP. + ReplicateRefPtr noop_replicate = make_scoped_refptr_replicate(new ReplicateMsg()); + noop_replicate->get()->set_op_type(consensus::NO_OP); + *noop_replicate->get()->mutable_id() = MakeOpId(1, 1); + noop_replicate->get()->set_timestamp(2); + + AppendReplicateBatch(noop_replicate, true); + + // Append WRITE_OP with higher OpId and lower timestamp. + ReplicateRefPtr write_replicate = make_scoped_refptr_replicate(new ReplicateMsg()); + write_replicate->get()->set_op_type(consensus::WRITE_OP); + WriteRequestPB* batch_request = write_replicate->get()->mutable_write_request(); + ASSERT_OK(SchemaToPB(schema_, batch_request->mutable_schema())); + batch_request->set_tablet_id(log::kTestTablet); + *write_replicate->get()->mutable_id() = MakeOpId(1, 2); + write_replicate->get()->set_timestamp(1); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1, 1, "foo", + batch_request->mutable_row_operations()); + + AppendReplicateBatch(write_replicate, true); + + // Now commit in OpId order. + // NO_OP... + gscoped_ptr mutate_commit(new consensus::CommitMsg); + mutate_commit->set_op_type(consensus::NO_OP); + *mutate_commit->mutable_commited_op_id() = noop_replicate->get()->id(); + + AppendCommit(mutate_commit.Pass()); + + // ...and WRITE_OP... 
+ mutate_commit.reset(new consensus::CommitMsg); + mutate_commit->set_op_type(consensus::WRITE_OP); + *mutate_commit->mutable_commited_op_id() = write_replicate->get()->id(); + TxResultPB* result = mutate_commit->mutable_result(); + OperationResultPB* mutate = result->add_ops(); + MemStoreTargetPB* target = mutate->add_mutated_stores(); + target->set_mrs_id(1); + + AppendCommit(mutate_commit.Pass()); + + ConsensusBootstrapInfo boot_info; + shared_ptr tablet; + ASSERT_OK(BootstrapTestTablet(-1, -1, &tablet, &boot_info)); + ASSERT_EQ(boot_info.orphaned_replicates.size(), 0); + ASSERT_OPID_EQ(boot_info.last_committed_id, write_replicate->get()->id()); + + // Confirm that the insert op was applied. + vector results; + IterateTabletRows(tablet.get(), &results); + ASSERT_EQ(1, results.size()); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_bootstrap.cc b/src/kudu/tablet/tablet_bootstrap.cc new file mode 100644 index 000000000000..b47b49923fd0 --- /dev/null +++ b/src/kudu/tablet/tablet_bootstrap.cc @@ -0,0 +1,1478 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/tablet_bootstrap.h" + +#include +#include +#include +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/walltime.h" +#include "kudu/server/clock.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/server/metadata.h" +#include "kudu/tablet/lock_manager.h" +#include "kudu/tablet/row_op.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/transactions/alter_schema_transaction.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/fault_injection.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" +#include "kudu/util/logging.h" +#include "kudu/util/path_util.h" +#include "kudu/util/stopwatch.h" + +DEFINE_bool(skip_remove_old_recovery_dir, false, + "Skip removing WAL recovery dir after startup. (useful for debugging)"); +TAG_FLAG(skip_remove_old_recovery_dir, hidden); + +DEFINE_double(fault_crash_during_log_replay, 0.0, + "Fraction of the time when the tablet will crash immediately " + "after processing a log entry during log replay. 
" + "(For testing only!)"); +TAG_FLAG(fault_crash_during_log_replay, unsafe); + +DECLARE_int32(max_clock_sync_error_usec); + +namespace kudu { +namespace tablet { + +using boost::shared_lock; +using consensus::ALTER_SCHEMA_OP; +using consensus::CHANGE_CONFIG_OP; +using consensus::ChangeConfigRecordPB; +using consensus::CommitMsg; +using consensus::ConsensusBootstrapInfo; +using consensus::ConsensusMetadata; +using consensus::ConsensusRound; +using consensus::MinimumOpId; +using consensus::NO_OP; +using consensus::OperationType; +using consensus::OperationType_Name; +using consensus::OpId; +using consensus::OpIdEquals; +using consensus::OpIdEqualsFunctor; +using consensus::OpIdHashFunctor; +using consensus::OpIdToString; +using consensus::RaftConfigPB; +using consensus::ReplicateMsg; +using consensus::WRITE_OP; +using log::Log; +using log::LogAnchorRegistry; +using log::LogEntryPB; +using log::LogOptions; +using log::LogReader; +using log::ReadableLogSegment; +using server::Clock; +using std::map; +using std::shared_ptr; +using std::string; +using std::unordered_map; +using strings::Substitute; +using tserver::AlterSchemaRequestPB; +using tserver::WriteRequestPB; + +struct ReplayState; + +// Information from the tablet metadata which indicates which data was +// flushed prior to this restart. +// +// We take a snapshot of this information at the beginning of the bootstrap +// process so that we can allow compactions and flushes to run during bootstrap +// without confusing our tracking of flushed stores. +class FlushedStoresSnapshot { + public: + FlushedStoresSnapshot() {} + Status InitFrom(const TabletMetadata& meta); + + bool WasStoreAlreadyFlushed(const MemStoreTargetPB& target) const; + + private: + int64_t last_durable_mrs_id_; + unordered_map flushed_dms_by_drs_id_; + + DISALLOW_COPY_AND_ASSIGN(FlushedStoresSnapshot); +}; + +// Bootstraps an existing tablet by opening the metadata from disk, and rebuilding soft +// state by playing log segments. 
A bootstrapped tablet can then be added to an existing +// consensus configuration as a LEARNER, which will bring its state up to date with the +// rest of the consensus configuration, or it can start serving the data itself, after it +// has been appointed LEADER of that particular consensus configuration. +// +// NOTE: this does not handle pulling data from other replicas in the cluster. That +// is handled by the 'RemoteBootstrap' classes, which copy blocks and metadata locally +// before invoking this local bootstrap functionality. +// +// TODO Because the table that is being rebuilt is never flushed/compacted, consensus +// is only set on the tablet after bootstrap, when we get to flushes/compactions though +// we need to set it before replay or we won't be able to re-rebuild. +class TabletBootstrap { + public: + TabletBootstrap(const scoped_refptr& meta, + const scoped_refptr& clock, + shared_ptr mem_tracker, + MetricRegistry* metric_registry, + TabletStatusListener* listener, + const scoped_refptr& log_anchor_registry); + + // Plays the log segments, rebuilding the portion of the Tablet's soft + // state that is present in the log (additional soft state may be present + // in other replicas). + // A successful call will yield the rebuilt tablet and the rebuilt log. + Status Bootstrap(shared_ptr* rebuilt_tablet, + scoped_refptr* rebuilt_log, + ConsensusBootstrapInfo* results); + + private: + + // Opens the tablet. + // Sets '*has_blocks' to true if there was any data on disk for this tablet. + Status OpenTablet(bool* has_blocks); + + // Checks if a previous log recovery directory exists. If so, it deletes any + // files in the log dir and sets 'needs_recovery' to true, meaning that the + // previous recovery attempt should be retried from the recovery dir. + // + // Otherwise, if there is a log directory with log files in it, renames that + // log dir to the log recovery dir and creates a new, empty log dir so that + // log replay can proceed. 
'needs_recovery' is also returned as true in this + // case. + // + // If no log segments are found, 'needs_recovery' is set to false. + Status PrepareRecoveryDir(bool* needs_recovery); + + // Opens the latest log segments for the Tablet that will allow to rebuild + // the tablet's soft state. If there are existing log segments in the tablet's + // log directly they are moved to a "log-recovery" directory which is deleted + // when the replay process is completed (as they have been duplicated in the + // current log directory). + // + // If a "log-recovery" directory is already present, we will continue to replay + // from the "log-recovery" directory. Tablet metadata is updated once replay + // has finished from the "log-recovery" directory. + Status OpenLogReaderInRecoveryDir(); + + // Opens a new log in the tablet's log directory. + // The directory is expected to be clean. + Status OpenNewLog(); + + // Finishes bootstrap, setting 'rebuilt_log' and 'rebuilt_tablet'. + Status FinishBootstrap(const string& message, + scoped_refptr* rebuilt_log, + shared_ptr* rebuilt_tablet); + + // Plays the log segments into the tablet being built. + // The process of playing the segments generates a new log that can be continued + // later on when then tablet is rebuilt and starts accepting writes from clients. + Status PlaySegments(ConsensusBootstrapInfo* results); + + // Append the given commit message to the log. + // Does not support writing a TxResult. + Status AppendCommitMsg(const CommitMsg& commit_msg); + + Status PlayWriteRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg); + + Status PlayAlterSchemaRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg); + + Status PlayChangeConfigRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg); + + Status PlayNoOpRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg); + + // Plays operations, skipping those that have already been flushed. 
+ Status PlayRowOperations(WriteTransactionState* tx_state, + const SchemaPB& schema_pb, + const RowOperationsPB& ops_pb, + const TxResultPB& result); + + // Pass through all of the decoded operations in tx_state. For + // each op: + // - if it was previously failed, mark as failed + // - if it previously succeeded but was flushed, mark as skipped + // - otherwise, re-apply to the tablet being bootstrapped. + Status FilterAndApplyOperations(WriteTransactionState* tx_state, + const TxResultPB& orig_result); + + // Filter a single insert operation, setting it to failed if + // it was already flushed. + Status FilterInsert(WriteTransactionState* tx_state, + RowOp* op, + const OperationResultPB& op_result); + + // Filter a single mutate operation, setting it to failed if + // it was already flushed. + Status FilterMutate(WriteTransactionState* tx_state, + RowOp* op, + const OperationResultPB& op_result); + + // Returns whether all the stores that are referred to in the commit + // message are already flushed. + bool AreAllStoresAlreadyFlushed(const CommitMsg& commit); + + // Returns whether there is any store that is referred to in the commit + // message that is already flushed. + bool AreAnyStoresAlreadyFlushed(const CommitMsg& commit); + + void DumpReplayStateToLog(const ReplayState& state); + + // Handlers for each type of message seen in the log during replay. + Status HandleEntry(ReplayState* state, LogEntryPB* entry); + Status HandleReplicateMessage(ReplayState* state, LogEntryPB* replicate_entry); + Status HandleCommitMessage(ReplayState* state, LogEntryPB* commit_entry); + Status ApplyCommitMessage(ReplayState* state, LogEntryPB* commit_entry); + Status HandleEntryPair(LogEntryPB* replicate_entry, LogEntryPB* commit_entry); + + // Checks that an orphaned commit message is actually irrelevant, i.e that the + // data stores it refers to are already flushed. 
+ Status CheckOrphanedCommitAlreadyFlushed(const CommitMsg& commit); + + // Decodes a Timestamp from the provided string and updates the clock + // with it. + Status UpdateClock(uint64_t timestamp); + + // Removes the recovery directory and all files contained therein. + // Intended to be invoked after log replay successfully completes. + Status RemoveRecoveryDir(); + + // Return a log prefix string in the standard "T xxx P yyy" format. + string LogPrefix() const; + + scoped_refptr meta_; + scoped_refptr clock_; + shared_ptr mem_tracker_; + MetricRegistry* metric_registry_; + TabletStatusListener* listener_; + gscoped_ptr tablet_; + const scoped_refptr log_anchor_registry_; + scoped_refptr log_; + gscoped_ptr log_reader_; + + Arena arena_; + + gscoped_ptr cmeta_; + + // Statistics on the replay of entries in the log. + struct Stats { + Stats() + : ops_read(0), + ops_overwritten(0), + ops_committed(0), + inserts_seen(0), + inserts_ignored(0), + mutations_seen(0), + mutations_ignored(0), + orphaned_commits(0) { + } + + string ToString() const { + return Substitute("ops{read=$0 overwritten=$1 applied=$2} " + "inserts{seen=$3 ignored=$4} " + "mutations{seen=$5 ignored=$6} " + "orphaned_commits=$7", + ops_read, ops_overwritten, ops_committed, + inserts_seen, inserts_ignored, + mutations_seen, mutations_ignored, + orphaned_commits); + } + + // Number of REPLICATE messages read from the log + int ops_read; + // Number of REPLICATE messages which were overwritten by later entries. + int ops_overwritten; + // Number of REPLICATE messages for which a matching COMMIT was found. + int ops_committed; + + // Number inserts/mutations seen and ignored. + int inserts_seen, inserts_ignored; + int mutations_seen, mutations_ignored; + + // Number of COMMIT messages for which a corresponding REPLICATE was not found. + int orphaned_commits; + }; + Stats stats_; + + // Snapshot of which stores were flushed prior to restart. 
+ FlushedStoresSnapshot flushed_stores_; + + DISALLOW_COPY_AND_ASSIGN(TabletBootstrap); +}; + +TabletStatusListener::TabletStatusListener(const scoped_refptr& meta) + : meta_(meta), + last_status_("") { +} + +const string TabletStatusListener::tablet_id() const { + return meta_->tablet_id(); +} + +const string TabletStatusListener::table_name() const { + return meta_->table_name(); +} + +const Partition& TabletStatusListener::partition() const { + return meta_->partition(); +} + +const Schema& TabletStatusListener::schema() const { + return meta_->schema(); +} + +TabletStatusListener::~TabletStatusListener() { +} + +void TabletStatusListener::StatusMessage(const string& status) { + LOG(INFO) << "T " << tablet_id() << " P " << meta_->fs_manager()->uuid() << ": " + << status; + boost::lock_guard l(lock_); + last_status_ = status; +} + +Status BootstrapTablet(const scoped_refptr& meta, + const scoped_refptr& clock, + const shared_ptr& mem_tracker, + MetricRegistry* metric_registry, + TabletStatusListener* listener, + shared_ptr* rebuilt_tablet, + scoped_refptr* rebuilt_log, + const scoped_refptr& log_anchor_registry, + ConsensusBootstrapInfo* consensus_info) { + TRACE_EVENT1("tablet", "BootstrapTablet", + "tablet_id", meta->tablet_id()); + TabletBootstrap bootstrap(meta, clock, mem_tracker, + metric_registry, listener, log_anchor_registry); + RETURN_NOT_OK(bootstrap.Bootstrap(rebuilt_tablet, rebuilt_log, consensus_info)); + // This is necessary since OpenNewLog() initially disables sync. + RETURN_NOT_OK((*rebuilt_log)->ReEnableSyncIfRequired()); + return Status::OK(); +} + +static string DebugInfo(const string& tablet_id, + int segment_seqno, + int entry_idx, + const string& segment_path, + const LogEntryPB& entry) { + // Truncate the debug string to a reasonable length for logging. + // Otherwise, glog will truncate for us and we may miss important + // information which came after this long string. 
+ string debug_str = entry.ShortDebugString(); + if (debug_str.size() > 500) { + debug_str.resize(500); + debug_str.append("..."); + } + return Substitute("Debug Info: Error playing entry $0 of segment $1 of tablet $2. " + "Segment path: $3. Entry: $4", entry_idx, segment_seqno, tablet_id, + segment_path, debug_str); +} + +TabletBootstrap::TabletBootstrap( + const scoped_refptr& meta, + const scoped_refptr& clock, shared_ptr mem_tracker, + MetricRegistry* metric_registry, TabletStatusListener* listener, + const scoped_refptr& log_anchor_registry) + : meta_(meta), + clock_(clock), + mem_tracker_(std::move(mem_tracker)), + metric_registry_(metric_registry), + listener_(listener), + log_anchor_registry_(log_anchor_registry), + arena_(256 * 1024, 4 * 1024 * 1024) {} + +Status TabletBootstrap::Bootstrap(shared_ptr* rebuilt_tablet, + scoped_refptr* rebuilt_log, + ConsensusBootstrapInfo* consensus_info) { + string tablet_id = meta_->tablet_id(); + + // Replay requires a valid Consensus metadata file to exist in order to + // compare the committed consensus configuration seqno with the log entries and also to persist + // committed but unpersisted changes. + RETURN_NOT_OK_PREPEND(ConsensusMetadata::Load(meta_->fs_manager(), tablet_id, + meta_->fs_manager()->uuid(), &cmeta_), + "Unable to load Consensus metadata"); + + // Make sure we don't try to locally bootstrap a tablet that was in the middle + // of a remote bootstrap. It's likely that not all files were copied over + // successfully. 
+ TabletDataState tablet_data_state = meta_->tablet_data_state(); + if (tablet_data_state != TABLET_DATA_READY) { + return Status::Corruption("Unable to locally bootstrap tablet " + tablet_id + ": " + + "TabletMetadata bootstrap state is " + + TabletDataState_Name(tablet_data_state)); + } + + meta_->PinFlush(); + + listener_->StatusMessage("Bootstrap starting."); + + if (VLOG_IS_ON(1)) { + TabletSuperBlockPB super_block; + RETURN_NOT_OK(meta_->ToSuperBlock(&super_block)); + VLOG_WITH_PREFIX(1) << "Tablet Metadata: " << super_block.DebugString(); + } + + RETURN_NOT_OK(flushed_stores_.InitFrom(*meta_.get())); + + bool has_blocks; + RETURN_NOT_OK(OpenTablet(&has_blocks)); + + bool needs_recovery; + RETURN_NOT_OK(PrepareRecoveryDir(&needs_recovery)); + if (needs_recovery) { + RETURN_NOT_OK(OpenLogReaderInRecoveryDir()); + } + + // This is a new tablet, nothing left to do. + if (!has_blocks && !needs_recovery) { + LOG_WITH_PREFIX(INFO) << "No blocks or log segments found. Creating new log."; + RETURN_NOT_OK_PREPEND(OpenNewLog(), "Failed to open new log"); + RETURN_NOT_OK(FinishBootstrap("No bootstrap required, opened a new log", + rebuilt_log, + rebuilt_tablet)); + consensus_info->last_id = MinimumOpId(); + consensus_info->last_committed_id = MinimumOpId(); + return Status::OK(); + } + + // If there were blocks, there must be segments to replay. This is required + // by Raft, since we always need to know the term and index of the last + // logged op in order to vote, know how to respond to AppendEntries(), etc. + if (has_blocks && !needs_recovery) { + return Status::IllegalState(Substitute("Tablet $0: Found rowsets but no log " + "segments could be found.", + tablet_id)); + } + + // Before playing any segments we set the safe and clean times to 'kMin' so that + // the MvccManager will accept all transactions that we replay as uncommitted. 
+ tablet_->mvcc_manager()->OfflineAdjustSafeTime(Timestamp::kMin); + RETURN_NOT_OK_PREPEND(PlaySegments(consensus_info), "Failed log replay. Reason"); + + // Flush the consensus metadata once at the end to persist our changes, if any. + cmeta_->Flush(); + + RETURN_NOT_OK(RemoveRecoveryDir()); + RETURN_NOT_OK(FinishBootstrap("Bootstrap complete.", rebuilt_log, rebuilt_tablet)); + + return Status::OK(); +} + +Status TabletBootstrap::FinishBootstrap(const string& message, + scoped_refptr* rebuilt_log, + shared_ptr* rebuilt_tablet) { + // Add a callback to TabletMetadata that makes sure that each time we flush the metadata + // we also wait for in-flights to finish and for their wal entry to be fsynced. + // This might be a bit conservative in some situations but it will prevent us from + // ever flushing the metadata referring to tablet data blocks containing data whose + // commit entries are not durable, a pre-requisite for recovery. + meta_->SetPreFlushCallback( + Bind(&FlushInflightsToLogCallback::WaitForInflightsAndFlushLog, + make_scoped_refptr(new FlushInflightsToLogCallback(tablet_.get(), + log_)))); + tablet_->MarkFinishedBootstrapping(); + RETURN_NOT_OK(tablet_->metadata()->UnPinFlush()); + listener_->StatusMessage(message); + rebuilt_tablet->reset(tablet_.release()); + rebuilt_log->swap(log_); + return Status::OK(); +} + +Status TabletBootstrap::OpenTablet(bool* has_blocks) { + gscoped_ptr tablet(new Tablet(meta_, + clock_, + mem_tracker_, + metric_registry_, + log_anchor_registry_)); + // doing nothing for now except opening a tablet locally. 
+ LOG_TIMING_PREFIX(INFO, LogPrefix(), "opening tablet") { + RETURN_NOT_OK(tablet->Open()); + } + *has_blocks = tablet->num_rowsets() != 0; + tablet_.reset(tablet.release()); + return Status::OK(); +} + +Status TabletBootstrap::PrepareRecoveryDir(bool* needs_recovery) { + *needs_recovery = false; + + FsManager* fs_manager = tablet_->metadata()->fs_manager(); + string tablet_id = tablet_->metadata()->tablet_id(); + string log_dir = fs_manager->GetTabletWalDir(tablet_id); + + // If the recovery directory exists, then we crashed mid-recovery. + // Throw away any logs from the previous recovery attempt and restart the log + // replay process from the beginning using the same recovery dir as last time. + string recovery_path = fs_manager->GetTabletWalRecoveryDir(tablet_id); + if (fs_manager->Exists(recovery_path)) { + LOG_WITH_PREFIX(INFO) << "Previous recovery directory found at " << recovery_path << ": " + << "Replaying log files from this location instead of " << log_dir; + + // Since we have a recovery directory, clear out the log_dir by recursively + // deleting it and creating a new one so that we don't end up with remnants + // of old WAL segments or indexes after replay. + if (fs_manager->env()->FileExists(log_dir)) { + LOG_WITH_PREFIX(INFO) << "Deleting old log files from previous recovery attempt in " + << log_dir; + RETURN_NOT_OK_PREPEND(fs_manager->env()->DeleteRecursively(log_dir), + "Could not recursively delete old log dir " + log_dir); + } + + RETURN_NOT_OK_PREPEND(fs_manager->CreateDirIfMissing(log_dir), + "Failed to create log directory " + log_dir); + + *needs_recovery = true; + return Status::OK(); + } + + // If we made it here, there was no pre-existing recovery dir. + // Now we look for log files in log_dir, and if we find any then we rename + // the whole log_dir to a recovery dir and return needs_recovery = true. 
+ RETURN_NOT_OK_PREPEND(fs_manager->CreateDirIfMissing(log_dir), + "Failed to create log dir"); + + vector children; + RETURN_NOT_OK_PREPEND(fs_manager->ListDir(log_dir, &children), + "Couldn't list log segments."); + for (const string& child : children) { + if (!log::IsLogFileName(child)) { + continue; + } + + string source_path = JoinPathSegments(log_dir, child); + string dest_path = JoinPathSegments(recovery_path, child); + LOG_WITH_PREFIX(INFO) << "Will attempt to recover log segment " << source_path + << " to " << dest_path; + *needs_recovery = true; + } + + if (*needs_recovery) { + // Atomically rename the log directory to the recovery directory + // and then re-create the log directory. + LOG_WITH_PREFIX(INFO) << "Moving log directory " << log_dir << " to recovery directory " + << recovery_path << " in preparation for log replay"; + RETURN_NOT_OK_PREPEND(fs_manager->env()->RenameFile(log_dir, recovery_path), + Substitute("Could not move log directory $0 to recovery dir $1", + log_dir, recovery_path)); + RETURN_NOT_OK_PREPEND(fs_manager->env()->CreateDir(log_dir), + "Failed to recreate log directory " + log_dir); + } + return Status::OK(); +} + +Status TabletBootstrap::OpenLogReaderInRecoveryDir() { + VLOG_WITH_PREFIX(1) << "Opening log reader in log recovery dir " + << meta_->fs_manager()->GetTabletWalRecoveryDir(tablet_->tablet_id()); + // Open the reader. + RETURN_NOT_OK_PREPEND(LogReader::OpenFromRecoveryDir(tablet_->metadata()->fs_manager(), + tablet_->metadata()->tablet_id(), + tablet_->GetMetricEntity().get(), + &log_reader_), + "Could not open LogReader. 
Reason"); + return Status::OK(); +} + +Status TabletBootstrap::RemoveRecoveryDir() { + FsManager* fs_manager = tablet_->metadata()->fs_manager(); + string recovery_path = fs_manager->GetTabletWalRecoveryDir(tablet_->metadata()->tablet_id()); + CHECK(fs_manager->Exists(recovery_path)) + << "Tablet WAL recovery dir " << recovery_path << " does not exist."; + + LOG_WITH_PREFIX(INFO) << "Preparing to delete log recovery files and directory " << recovery_path; + + string tmp_path = Substitute("$0-$1", recovery_path, GetCurrentTimeMicros()); + LOG_WITH_PREFIX(INFO) << "Renaming log recovery dir from " << recovery_path + << " to " << tmp_path; + RETURN_NOT_OK_PREPEND(fs_manager->env()->RenameFile(recovery_path, tmp_path), + Substitute("Could not rename old recovery dir from: $0 to: $1", + recovery_path, tmp_path)); + + if (FLAGS_skip_remove_old_recovery_dir) { + LOG_WITH_PREFIX(INFO) << "--skip_remove_old_recovery_dir enabled. NOT deleting " << tmp_path; + return Status::OK(); + } + LOG_WITH_PREFIX(INFO) << "Deleting all files from renamed log recovery directory " << tmp_path; + RETURN_NOT_OK_PREPEND(fs_manager->env()->DeleteRecursively(tmp_path), + "Could not remove renamed recovery dir " + tmp_path); + LOG_WITH_PREFIX(INFO) << "Completed deletion of old log recovery files and directory " + << tmp_path; + return Status::OK(); +} + +Status TabletBootstrap::OpenNewLog() { + OpId init; + init.set_term(0); + init.set_index(0); + RETURN_NOT_OK(Log::Open(LogOptions(), + tablet_->metadata()->fs_manager(), + tablet_->tablet_id(), + *tablet_->schema(), + tablet_->metadata()->schema_version(), + tablet_->GetMetricEntity(), + &log_)); + // Disable sync temporarily in order to speed up appends during the + // bootstrap process. + log_->DisableSync(); + return Status::OK(); +} + +typedef map OpIndexToEntryMap; + +// State kept during replay. 
+struct ReplayState { + ReplayState() + : prev_op_id(MinimumOpId()), + committed_op_id(MinimumOpId()) { + } + + ~ReplayState() { + STLDeleteValues(&pending_replicates); + STLDeleteValues(&pending_commits); + } + + // Return true if 'b' is allowed to immediately follow 'a' in the log. + static bool IsValidSequence(const OpId& a, const OpId& b) { + if (a.term() == 0 && a.index() == 0) { + // Not initialized - can start with any opid. + return true; + } + + // Within the same term, we should never skip entries. + // We can, however go backwards (see KUDU-783 for an example) + if (b.term() == a.term() && + b.index() > a.index() + 1) { + return false; + } + + return true; + } + + // Return a Corruption status if 'id' seems to be out-of-sequence in the log. + Status CheckSequentialReplicateId(const ReplicateMsg& msg) { + DCHECK(msg.has_id()); + if (PREDICT_FALSE(!IsValidSequence(prev_op_id, msg.id()))) { + string op_desc = Substitute("$0 REPLICATE (Type: $1)", + OpIdToString(msg.id()), + OperationType_Name(msg.op_type())); + return Status::Corruption( + Substitute("Unexpected opid following opid $0. 
Operation: $1", + OpIdToString(prev_op_id), + op_desc)); + } + + prev_op_id = msg.id(); + return Status::OK(); + } + + void UpdateCommittedOpId(const OpId& id) { + if (id.index() > committed_op_id.index()) { + committed_op_id = id; + } + } + + void AddEntriesToStrings(const OpIndexToEntryMap& entries, vector* strings) const { + for (const OpIndexToEntryMap::value_type& map_entry : entries) { + LogEntryPB* entry = DCHECK_NOTNULL(map_entry.second); + strings->push_back(Substitute(" $0", entry->ShortDebugString())); + } + } + + void DumpReplayStateToStrings(vector* strings) const { + strings->push_back(Substitute("ReplayState: Previous OpId: $0, Committed OpId: $1, " + "Pending Replicates: $2, Pending Commits: $3", OpIdToString(prev_op_id), + OpIdToString(committed_op_id), pending_replicates.size(), pending_commits.size())); + if (!pending_replicates.empty()) { + strings->push_back("Dumping REPLICATES: "); + AddEntriesToStrings(pending_replicates, strings); + } + if (!pending_commits.empty()) { + strings->push_back("Dumping COMMITS: "); + AddEntriesToStrings(pending_commits, strings); + } + } + + // The last replicate message's ID. + OpId prev_op_id; + + // The last operation known to be committed. + // All other operations with lower IDs are also committed. + OpId committed_op_id; + + // REPLICATE log entries whose corresponding COMMIT record has + // not yet been seen. Keyed by index. + OpIndexToEntryMap pending_replicates; + + // COMMIT log entries which couldn't be applied immediately. + OpIndexToEntryMap pending_commits; +}; + +// Handle the given log entry. If OK is returned, then takes ownership of 'entry'. +// Otherwise, caller frees. 
+Status TabletBootstrap::HandleEntry(ReplayState* state, LogEntryPB* entry) { + if (VLOG_IS_ON(1)) { + VLOG_WITH_PREFIX(1) << "Handling entry: " << entry->ShortDebugString(); + } + + switch (entry->type()) { + case log::REPLICATE: + RETURN_NOT_OK(HandleReplicateMessage(state, entry)); + break; + case log::COMMIT: + // check the unpaired ops for the matching replicate msg, abort if not found + RETURN_NOT_OK(HandleCommitMessage(state, entry)); + break; + default: + return Status::Corruption(Substitute("Unexpected log entry type: $0", entry->type())); + } + MAYBE_FAULT(FLAGS_fault_crash_during_log_replay); + return Status::OK(); +} + +// Takes ownership of 'replicate_entry' on OK status. +Status TabletBootstrap::HandleReplicateMessage(ReplayState* state, LogEntryPB* replicate_entry) { + stats_.ops_read++; + + const ReplicateMsg& replicate = replicate_entry->replicate(); + RETURN_NOT_OK(state->CheckSequentialReplicateId(replicate)); + DCHECK(replicate.has_timestamp()); + CHECK_OK(UpdateClock(replicate.timestamp())); + + // Append the replicate message to the log as is + RETURN_NOT_OK(log_->Append(replicate_entry)); + + int64_t index = replicate_entry->replicate().id().index(); + + LogEntryPB** existing_entry_ptr = InsertOrReturnExisting( + &state->pending_replicates, index, replicate_entry); + + // If there was a entry with the same index we're overwriting then we need to delete + // that entry and all entries with higher indexes. 
+ if (existing_entry_ptr) { + LogEntryPB* existing_entry = *existing_entry_ptr; + + auto iter = state->pending_replicates.lower_bound(index); + DCHECK(OpIdEquals((*iter).second->replicate().id(), existing_entry->replicate().id())); + + LogEntryPB* last_entry = (*state->pending_replicates.rbegin()).second; + + LOG_WITH_PREFIX(INFO) << "Overwriting operations starting at: " + << existing_entry->replicate().id() + << " up to: " << last_entry->replicate().id() + << " with operation: " << replicate_entry->replicate().id(); + + while (iter != state->pending_replicates.end()) { + delete (*iter).second; + state->pending_replicates.erase(iter++); + stats_.ops_overwritten++; + } + + InsertOrDie(&state->pending_replicates, index, replicate_entry); + } + return Status::OK(); +} + +// Takes ownership of 'commit_entry' on OK status. +Status TabletBootstrap::HandleCommitMessage(ReplayState* state, LogEntryPB* commit_entry) { + DCHECK(commit_entry->has_commit()) << "Not a commit message: " << commit_entry->DebugString(); + + // Match up the COMMIT record with the original entry that it's applied to. + const OpId& committed_op_id = commit_entry->commit().commited_op_id(); + state->UpdateCommittedOpId(committed_op_id); + + // If there are no pending replicates, or if this commit's index is lower than the + // the first pending replicate on record this is likely an orphaned commit. + if (state->pending_replicates.empty() || + (*state->pending_replicates.begin()).first > committed_op_id.index()) { + VLOG_WITH_PREFIX(2) << "Found orphaned commit for " << committed_op_id; + RETURN_NOT_OK(CheckOrphanedCommitAlreadyFlushed(commit_entry->commit())); + stats_.orphaned_commits++; + delete commit_entry; + return Status::OK(); + } + + // If this commit does not correspond to the first replicate message in the pending + // replicates set we keep it to apply later... 
+ if ((*state->pending_replicates.begin()).first != committed_op_id.index()) { + if (!ContainsKey(state->pending_replicates, committed_op_id.index())) { + return Status::Corruption(Substitute("Could not find replicate for commit: $0", + commit_entry->ShortDebugString())); + } + VLOG_WITH_PREFIX(2) << "Adding pending commit for " << committed_op_id; + InsertOrDie(&state->pending_commits, committed_op_id.index(), commit_entry); + return Status::OK(); + } + + // ... if it does, we apply it and all the commits that immediately follow in the sequence. + OpId last_applied = commit_entry->commit().commited_op_id(); + RETURN_NOT_OK(ApplyCommitMessage(state, commit_entry)); + delete commit_entry; + + auto iter = state->pending_commits.begin(); + while (iter != state->pending_commits.end()) { + if ((*iter).first == last_applied.index() + 1) { + gscoped_ptr buffered_commit_entry((*iter).second); + state->pending_commits.erase(iter++); + last_applied = buffered_commit_entry->commit().commited_op_id(); + RETURN_NOT_OK(ApplyCommitMessage(state, buffered_commit_entry.get())); + continue; + } + break; + } + + return Status::OK(); +} + +bool TabletBootstrap::AreAllStoresAlreadyFlushed(const CommitMsg& commit) { + for (const OperationResultPB& op_result : commit.result().ops()) { + for (const MemStoreTargetPB& mutated_store : op_result.mutated_stores()) { + if (!flushed_stores_.WasStoreAlreadyFlushed(mutated_store)) { + return false; + } + } + } + return true; +} + +bool TabletBootstrap::AreAnyStoresAlreadyFlushed(const CommitMsg& commit) { + for (const OperationResultPB& op_result : commit.result().ops()) { + for (const MemStoreTargetPB& mutated_store : op_result.mutated_stores()) { + if (flushed_stores_.WasStoreAlreadyFlushed(mutated_store)) { + return true; + } + } + } + return false; +} + +Status TabletBootstrap::CheckOrphanedCommitAlreadyFlushed(const CommitMsg& commit) { + if (!AreAllStoresAlreadyFlushed(commit)) { + TabletSuperBlockPB super; + 
WARN_NOT_OK(meta_->ToSuperBlock(&super), LogPrefix() + "Couldn't build TabletSuperBlockPB"); + return Status::Corruption(Substitute("CommitMsg was orphaned but it referred to " + "unflushed stores. Commit: $0. TabletMetadata: $1", commit.ShortDebugString(), + super.ShortDebugString())); + } + return Status::OK(); +} + +Status TabletBootstrap::ApplyCommitMessage(ReplayState* state, LogEntryPB* commit_entry) { + + const OpId& committed_op_id = commit_entry->commit().commited_op_id(); + VLOG_WITH_PREFIX(2) << "Applying commit for " << committed_op_id; + gscoped_ptr pending_replicate_entry; + + // They should also have an associated replicate index (it may have been in a + // deleted log segment though). + pending_replicate_entry.reset(EraseKeyReturnValuePtr(&state->pending_replicates, + committed_op_id.index())); + + if (pending_replicate_entry != nullptr) { + // We found a replicate with the same index, make sure it also has the same + // term. + if (!OpIdEquals(committed_op_id, pending_replicate_entry->replicate().id())) { + string error_msg = Substitute("Committed operation's OpId: $0 didn't match the" + "commit message's committed OpId: $1. Pending operation: $2, Commit message: $3", + pending_replicate_entry->replicate().id().ShortDebugString(), + committed_op_id.ShortDebugString(), + pending_replicate_entry->replicate().ShortDebugString(), + commit_entry->commit().ShortDebugString()); + LOG_WITH_PREFIX(DFATAL) << error_msg; + return Status::Corruption(error_msg); + } + RETURN_NOT_OK(HandleEntryPair(pending_replicate_entry.get(), commit_entry)); + stats_.ops_committed++; + } else { + stats_.orphaned_commits++; + RETURN_NOT_OK(CheckOrphanedCommitAlreadyFlushed(commit_entry->commit())); + } + + return Status::OK(); +} + +// Never deletes 'replicate_entry' or 'commit_entry'. +Status TabletBootstrap::HandleEntryPair(LogEntryPB* replicate_entry, LogEntryPB* commit_entry) { + const char* error_fmt = "Failed to play $0 request. 
ReplicateMsg: { $1 }, CommitMsg: { $2 }"; + +#define RETURN_NOT_OK_REPLAY(ReplayMethodName, replicate, commit) \ + RETURN_NOT_OK_PREPEND(ReplayMethodName(replicate, commit), \ + Substitute(error_fmt, OperationType_Name(op_type), \ + replicate->ShortDebugString(), commit.ShortDebugString())) + + ReplicateMsg* replicate = replicate_entry->mutable_replicate(); + const CommitMsg& commit = commit_entry->commit(); + OperationType op_type = commit.op_type(); + + switch (op_type) { + case WRITE_OP: + RETURN_NOT_OK_REPLAY(PlayWriteRequest, replicate, commit); + break; + + case ALTER_SCHEMA_OP: + RETURN_NOT_OK_REPLAY(PlayAlterSchemaRequest, replicate, commit); + break; + + case CHANGE_CONFIG_OP: + RETURN_NOT_OK_REPLAY(PlayChangeConfigRequest, replicate, commit); + break; + + case NO_OP: + RETURN_NOT_OK_REPLAY(PlayNoOpRequest, replicate, commit); + break; + + default: + return Status::IllegalState(Substitute("Unsupported commit entry type: $0", + commit.op_type())); + } + +#undef RETURN_NOT_OK_REPLAY + + // Non-tablet operations should not advance the safe time, because they are + // not started serially and so may have timestamps that are out of order. + if (op_type == NO_OP || op_type == CHANGE_CONFIG_OP) { + return Status::OK(); + } + + // Handle safe time advancement: + // + // If this operation has an external consistency mode other than COMMIT_WAIT, we know that no + // future transaction will have a timestamp that is lower than it, so we can just advance the + // safe timestamp to this operation's timestamp. + // + // If the hybrid clock is disabled, all transactions will fall into this category. + Timestamp safe_time; + if (replicate->write_request().external_consistency_mode() != COMMIT_WAIT) { + safe_time = Timestamp(replicate->timestamp()); + // ... else we set the safe timestamp to be the transaction's timestamp minus the maximum clock + // error. 
This opens the door for problems if the flags changed across reboots, but this is + // unlikely and the problem would manifest itself immediately and clearly (mvcc would complain + // the operation is already committed, with a CHECK failure). + } else { + DCHECK(clock_->SupportsExternalConsistencyMode(COMMIT_WAIT)) << "The provided clock does not" + "support COMMIT_WAIT external consistency mode."; + safe_time = server::HybridClock::AddPhysicalTimeToTimestamp( + Timestamp(replicate->timestamp()), + MonoDelta::FromMicroseconds(-FLAGS_max_clock_sync_error_usec)); + } + tablet_->mvcc_manager()->OfflineAdjustSafeTime(safe_time); + + return Status::OK(); +} + +void TabletBootstrap::DumpReplayStateToLog(const ReplayState& state) { + // Dump the replay state, this will log the pending replicates as well as the pending commits, + // which might be useful for debugging. + vector state_dump; + state.DumpReplayStateToStrings(&state_dump); + for (const string& string : state_dump) { + LOG_WITH_PREFIX(INFO) << string; + } +} + +Status TabletBootstrap::PlaySegments(ConsensusBootstrapInfo* consensus_info) { + ReplayState state; + log::SegmentSequence segments; + RETURN_NOT_OK(log_reader_->GetSegmentsSnapshot(&segments)); + + // The first thing to do is to rewind the tablet's schema back to the schema + // as of the point in time where the logs begin. We must replay the writes + // in the logs with the correct point-in-time schema. + if (!segments.empty()) { + const scoped_refptr& segment = segments[0]; + // Set the point-in-time schema for the tablet based on the log header. 
+ Schema pit_schema; + RETURN_NOT_OK_PREPEND(SchemaFromPB(segment->header().schema(), &pit_schema), + "Couldn't decode log segment schema"); + RETURN_NOT_OK_PREPEND(tablet_->RewindSchemaForBootstrap( + pit_schema, segment->header().schema_version()), + "couldn't set point-in-time schema"); + } + + // We defer opening the log until here, so that we properly reproduce the + // point-in-time schema from the log we're reading into the log we're + // writing. + RETURN_NOT_OK_PREPEND(OpenNewLog(), "Failed to open new log"); + + int segment_count = 0; + for (const scoped_refptr& segment : segments) { + vector entries; + ElementDeleter deleter(&entries); + // TODO: Optimize this to not read the whole thing into memory? + Status read_status = segment->ReadEntries(&entries); + for (int entry_idx = 0; entry_idx < entries.size(); ++entry_idx) { + LogEntryPB* entry = entries[entry_idx]; + Status s = HandleEntry(&state, entry); + if (!s.ok()) { + DumpReplayStateToLog(state); + RETURN_NOT_OK_PREPEND(s, DebugInfo(tablet_->tablet_id(), + segment->header().sequence_number(), + entry_idx, segment->path(), + *entry)); + } + + + // If HandleEntry returns OK, then it has taken ownership of the entry. + // So, we have to remove it from the entries vector to avoid it getting + // freed by ElementDeleter. + entries[entry_idx] = nullptr; + } + + // If the LogReader failed to read for some reason, we'll still try to + // replay as many entries as possible, and then fail with Corruption. + // TODO: this is sort of scary -- why doesn't LogReader expose an + // entry-by-entry iterator-like API instead? Seems better to avoid + // exposing the idea of segments to callers. 
+ if (PREDICT_FALSE(!read_status.ok())) { + return Status::Corruption(Substitute("Error reading Log Segment of tablet $0: $1 " + "(Read up to entry $2 of segment $3, in path $4)", + tablet_->tablet_id(), + read_status.ToString(), + entries.size(), + segment->header().sequence_number(), + segment->path())); + } + + // TODO: could be more granular here and log during the segments as well, + // plus give info about number of MB processed, but this is better than + // nothing. + listener_->StatusMessage(Substitute("Bootstrap replayed $0/$1 log segments. " + "Stats: $2. Pending: $3 replicates", + segment_count + 1, log_reader_->num_segments(), + stats_.ToString(), + state.pending_replicates.size())); + segment_count++; + } + + // If we have non-applied commits they all must belong to pending operations and + // they should only pertain to unflushed stores. + if (!state.pending_commits.empty()) { + for (const OpIndexToEntryMap::value_type& entry : state.pending_commits) { + if (!ContainsKey(state.pending_replicates, entry.first)) { + DumpReplayStateToLog(state); + return Status::Corruption("Had orphaned commits at the end of replay."); + } + if (AreAnyStoresAlreadyFlushed(entry.second->commit())) { + DumpReplayStateToLog(state); + TabletSuperBlockPB super; + WARN_NOT_OK(meta_->ToSuperBlock(&super), "Couldn't build TabletSuperBlockPB."); + return Status::Corruption(Substitute("CommitMsg was pending but it referred to " + "flushed stores. Commit: $0. TabletMetadata: $1", + entry.second->commit().ShortDebugString(), super.ShortDebugString())); + } + } + } + + // Note that we don't pass the information contained in the pending commits along with + // ConsensusBootstrapInfo. We know that this is safe as they must refer to unflushed + // stores (we make doubly sure above). + // + // Example/Explanation: + // Say we have two different operations that touch the same row, one insert and one + // mutate. 
Since we use Early Lock Release the commit for the second (mutate) operation + // might end up in the log before the insert's commit. This wouldn't matter since + // we replay in order, but a corner case here is that we might crash before we + // write the commit for the insert, meaning it might not be present at all. + // + // One possible log for this situation would be: + // - Replicate 10.10 (insert) + // - Replicate 10.11 (mutate) + // - Commit 10.11 (mutate) + // ~CRASH while Commit 10.10 is in-flight~ + // + // We can't replay 10.10 during bootstrap because we haven't seen its commit, but + // since we can't replay out-of-order we won't replay 10.11 either, in fact we'll + // pass them both as "pending" to consensus to be applied again. + // + // The reason why it is safe to simply disregard 10.11's commit is that we know that + // it must refer only to unflushed stores. We know this because one important flush/compact + // pre-condition is: + // - No flush will become visible on reboot (meaning we won't durably update the tablet + // metadata), unless the snapshot under which the flush/compact was performed has no + // in-flight transactions and all the messages that are in-flight to the log are durable. + // + // In our example this means that if we had flushed/compacted after 10.10 was applied + // (meaning losing the commit message would lead to corruption as we might re-apply it) + // then the commit for 10.10 would be durable. Since it isn't then no flush/compaction + // occurred after 10.10 was applied and thus we can disregard the commit message for + // 10.11 and simply apply both 10.10 and 10.11 as if we hadn't applied them before. + // + // This generalizes to: + // - If a committed replicate message with index Y is missing a commit message, + // no later committed replicate message (with index > Y) is visible across reboots + // in the tablet data. + + DumpReplayStateToLog(state); + + // Set up the ConsensusBootstrapInfo structure for the caller. 
+ for (OpIndexToEntryMap::value_type& e : state.pending_replicates) { + consensus_info->orphaned_replicates.push_back(e.second->release_replicate()); + } + consensus_info->last_id = state.prev_op_id; + consensus_info->last_committed_id = state.committed_op_id; + + return Status::OK(); +} + +Status TabletBootstrap::AppendCommitMsg(const CommitMsg& commit_msg) { + LogEntryPB commit_entry; + commit_entry.set_type(log::COMMIT); + CommitMsg* commit = commit_entry.mutable_commit(); + commit->CopyFrom(commit_msg); + return log_->Append(&commit_entry); +} + +Status TabletBootstrap::PlayWriteRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg) { + DCHECK(replicate_msg->has_timestamp()); + WriteRequestPB* write = replicate_msg->mutable_write_request(); + + WriteTransactionState tx_state(nullptr, write, nullptr); + tx_state.mutable_op_id()->CopyFrom(replicate_msg->id()); + tx_state.set_timestamp(Timestamp(replicate_msg->timestamp())); + + tablet_->StartTransaction(&tx_state); + tablet_->StartApplying(&tx_state); + + // Use committed OpId for mem store anchoring. + tx_state.mutable_op_id()->CopyFrom(replicate_msg->id()); + + if (write->has_row_operations()) { + // TODO: get rid of redundant params below - they can be gotten from the Request + RETURN_NOT_OK(PlayRowOperations(&tx_state, + write->schema(), + write->row_operations(), + commit_msg.result())); + } + + // Append the commit msg to the log but replace the result with the new one. 
+ LogEntryPB commit_entry; + commit_entry.set_type(log::COMMIT); + CommitMsg* commit = commit_entry.mutable_commit(); + commit->CopyFrom(commit_msg); + tx_state.ReleaseTxResultPB(commit->mutable_result()); + RETURN_NOT_OK(log_->Append(&commit_entry)); + + return Status::OK(); +} + +Status TabletBootstrap::PlayAlterSchemaRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg) { + AlterSchemaRequestPB* alter_schema = replicate_msg->mutable_alter_schema_request(); + + // Decode schema + Schema schema; + RETURN_NOT_OK(SchemaFromPB(alter_schema->schema(), &schema)); + + AlterSchemaTransactionState tx_state(nullptr, alter_schema, nullptr); + + // TODO(KUDU-860): we should somehow distinguish if an alter table failed on its original + // attempt (e.g due to being an invalid request, or a request with a too-early + // schema version). + + RETURN_NOT_OK(tablet_->CreatePreparedAlterSchema(&tx_state, &schema)); + + // Apply the alter schema to the tablet + RETURN_NOT_OK_PREPEND(tablet_->AlterSchema(&tx_state), "Failed to AlterSchema:"); + + // Also update the log information. Normally, the AlterSchema() call above + // takes care of this, but our new log isn't hooked up to the tablet yet. + log_->SetSchemaForNextLogSegment(schema, tx_state.schema_version()); + + return AppendCommitMsg(commit_msg); +} + +Status TabletBootstrap::PlayChangeConfigRequest(ReplicateMsg* replicate_msg, + const CommitMsg& commit_msg) { + ChangeConfigRecordPB* change_config = replicate_msg->mutable_change_config_record(); + RaftConfigPB config = change_config->new_config(); + + int64_t cmeta_opid_index = cmeta_->committed_config().opid_index(); + if (replicate_msg->id().index() > cmeta_opid_index) { + DCHECK(!config.has_opid_index()); + config.set_opid_index(replicate_msg->id().index()); + VLOG_WITH_PREFIX(1) << "WAL replay found Raft configuration with log index " + << config.opid_index() + << " that is greater than the committed config's index " + << cmeta_opid_index + << ". 
Applying this configuration change."; + cmeta_->set_committed_config(config); + // We flush once at the end of bootstrap. + } else { + VLOG_WITH_PREFIX(1) << "WAL replay found Raft configuration with log index " + << replicate_msg->id().index() + << ", which is less than or equal to the committed " + << "config's index " << cmeta_opid_index << ". " + << "Skipping application of this config change."; + } + + return AppendCommitMsg(commit_msg); +} + +Status TabletBootstrap::PlayNoOpRequest(ReplicateMsg* replicate_msg, const CommitMsg& commit_msg) { + return AppendCommitMsg(commit_msg); +} + +Status TabletBootstrap::PlayRowOperations(WriteTransactionState* tx_state, + const SchemaPB& schema_pb, + const RowOperationsPB& ops_pb, + const TxResultPB& result) { + Schema inserts_schema; + RETURN_NOT_OK_PREPEND(SchemaFromPB(schema_pb, &inserts_schema), + "Couldn't decode client schema"); + + arena_.Reset(); + + RETURN_NOT_OK_PREPEND(tablet_->DecodeWriteOperations(&inserts_schema, tx_state), + Substitute("Could not decode row operations: $0", + ops_pb.ShortDebugString())); + CHECK_EQ(tx_state->row_ops().size(), result.ops_size()); + + // Run AcquireRowLocks, Apply, etc! + RETURN_NOT_OK_PREPEND(tablet_->AcquireRowLocks(tx_state), + "Failed to acquire row locks"); + + RETURN_NOT_OK(FilterAndApplyOperations(tx_state, result)); + + return Status::OK(); +} + +Status TabletBootstrap::FilterAndApplyOperations(WriteTransactionState* tx_state, + const TxResultPB& orig_result) { + int32_t op_idx = 0; + for (RowOp* op : tx_state->row_ops()) { + const OperationResultPB& orig_op_result = orig_result.ops(op_idx++); + + // check if the operation failed in the original transaction + if (PREDICT_FALSE(orig_op_result.has_failed_status())) { + Status status = StatusFromPB(orig_op_result.failed_status()); + if (VLOG_IS_ON(1)) { + VLOG_WITH_PREFIX(1) << "Skipping operation that originally resulted in error. 
OpId: " + << tx_state->op_id().DebugString() << " op index: " + << op_idx - 1 << " original error: " + << status.ToString(); + } + op->SetFailed(status); + continue; + } + + // Check if it should be filtered out because it's already flushed. + switch (op->decoded_op.type) { + case RowOperationsPB::INSERT: + stats_.inserts_seen++; + if (!orig_op_result.flushed()) { + RETURN_NOT_OK(FilterInsert(tx_state, op, orig_op_result)); + } else { + op->SetAlreadyFlushed(); + stats_.inserts_ignored++; + continue; + } + break; + case RowOperationsPB::UPDATE: + case RowOperationsPB::DELETE: + stats_.mutations_seen++; + if (!orig_op_result.flushed()) { + RETURN_NOT_OK(FilterMutate(tx_state, op, orig_op_result)); + } else { + op->SetAlreadyFlushed(); + stats_.mutations_ignored++; + continue; + } + break; + default: + LOG_WITH_PREFIX(FATAL) << "Bad op type: " << op->decoded_op.type; + break; + } + if (op->result != nullptr) { + continue; + } + + // Actually apply it. + tablet_->ApplyRowOperation(tx_state, op); + DCHECK(op->result != nullptr); + + // We expect that the above Apply() will always succeed, because we're + // applying an operation that we know succeeded before the server + // restarted. If it doesn't succeed, something is wrong and we are + // diverging from our prior state, so bail. 
+ if (op->result->has_failed_status()) { + return Status::Corruption("Operation which previously succeeded failed " + "during log replay", + Substitute("Op: $0\nFailure: $1", + op->ToString(*tablet_->schema()), + op->result->failed_status().ShortDebugString())); + } + } + return Status::OK(); +} + +Status TabletBootstrap::FilterInsert(WriteTransactionState* tx_state, + RowOp* op, + const OperationResultPB& op_result) { + DCHECK_EQ(op->decoded_op.type, RowOperationsPB::INSERT); + + if (PREDICT_FALSE(op_result.mutated_stores_size() != 1 || + !op_result.mutated_stores(0).has_mrs_id())) { + return Status::Corruption(Substitute("Insert operation result must have an mrs_id: $0", + op_result.ShortDebugString())); + } + // check if the insert is already flushed + if (flushed_stores_.WasStoreAlreadyFlushed(op_result.mutated_stores(0))) { + if (VLOG_IS_ON(1)) { + VLOG_WITH_PREFIX(1) << "Skipping insert that was already flushed. OpId: " + << tx_state->op_id().DebugString() + << " flushed to: " << op_result.mutated_stores(0).mrs_id() + << " latest durable mrs id: " + << tablet_->metadata()->last_durable_mrs_id(); + } + + op->SetAlreadyFlushed(); + stats_.inserts_ignored++; + } + return Status::OK(); +} + +Status TabletBootstrap::FilterMutate(WriteTransactionState* tx_state, + RowOp* op, + const OperationResultPB& op_result) { + DCHECK(op->decoded_op.type == RowOperationsPB::UPDATE || + op->decoded_op.type == RowOperationsPB::DELETE) + << RowOperationsPB::Type_Name(op->decoded_op.type); + + int num_mutated_stores = op_result.mutated_stores_size(); + if (PREDICT_FALSE(num_mutated_stores == 0 || num_mutated_stores > 2)) { + return Status::Corruption(Substitute("Mutations must have one or two mutated_stores: $0", + op_result.ShortDebugString())); + } + + // The mutation may have been duplicated, so we'll check whether any of the + // output targets was "unflushed". 
+ int num_unflushed_stores = 0; + for (const MemStoreTargetPB& mutated_store : op_result.mutated_stores()) { + if (!flushed_stores_.WasStoreAlreadyFlushed(mutated_store)) { + num_unflushed_stores++; + } else { + if (VLOG_IS_ON(1)) { + string mutation = op->decoded_op.changelist.ToString(*tablet_->schema()); + VLOG_WITH_PREFIX(1) << "Skipping mutation to " << mutated_store.ShortDebugString() + << " that was already flushed. " + << "OpId: " << tx_state->op_id().DebugString(); + } + } + } + + if (num_unflushed_stores == 0) { + // The mutation was fully flushed. + op->SetFailed(Status::AlreadyPresent("Update was already flushed.")); + stats_.mutations_ignored++; + return Status::OK(); + } + + if (num_unflushed_stores == 2) { + // 18:47 < dralves> off the top of my head, if we crashed before writing the meta + // at the end of a flush/compation then both mutations could + // potentually be considered unflushed + // This case is not currently covered by any tests -- we need to add test coverage + // for this. See KUDU-218. It's likely the correct behavior is just to apply the edit, + // ie not fatal below. 
+ LOG_WITH_PREFIX(DFATAL) << "TODO: add test coverage for case where op is unflushed " + << "in both duplicated targets"; + } + + return Status::OK(); +} + +Status TabletBootstrap::UpdateClock(uint64_t timestamp) { + Timestamp ts; + RETURN_NOT_OK(ts.FromUint64(timestamp)); + RETURN_NOT_OK(clock_->Update(ts)); + return Status::OK(); +} + +string TabletBootstrap::LogPrefix() const { + return Substitute("T $0 P $1: ", meta_->tablet_id(), meta_->fs_manager()->uuid()); +} + +Status FlushedStoresSnapshot::InitFrom(const TabletMetadata& meta) { + CHECK(flushed_dms_by_drs_id_.empty()) << "already initted"; + last_durable_mrs_id_ = meta.last_durable_mrs_id(); + for (const shared_ptr& rsmd : meta.rowsets()) { + if (!InsertIfNotPresent(&flushed_dms_by_drs_id_, rsmd->id(), + rsmd->last_durable_redo_dms_id())) { + return Status::Corruption(Substitute( + "Duplicate DRS ID $0 in tablet metadata. " + "Found DRS $0 with last durable redo DMS ID $1 while trying to " + "initialize DRS $0 with last durable redo DMS ID $2", + rsmd->id(), + flushed_dms_by_drs_id_[rsmd->id()], + rsmd->last_durable_redo_dms_id())); + } + } + return Status::OK(); +} + +bool FlushedStoresSnapshot::WasStoreAlreadyFlushed(const MemStoreTargetPB& target) const { + if (target.has_mrs_id()) { + DCHECK(!target.has_rs_id()); + DCHECK(!target.has_dms_id()); + + // The original mutation went to the MRS. It is flushed if it went to an MRS + // with a lower ID than the latest flushed one. + return target.mrs_id() <= last_durable_mrs_id_; + } else { + // The original mutation went to a DRS's delta store. + int64_t last_durable_dms_id; + if (!FindCopy(flushed_dms_by_drs_id_, target.rs_id(), &last_durable_dms_id)) { + // if we have no data about this RowSet, then it must have been flushed and + // then deleted. + // TODO: how do we avoid a race where we get an update on a rowset before + // it is persisted? add docs about the ordering of flush. 
+ return true; + } + + // If the original rowset that we applied the edit to exists, check whether + // the edit was in a flushed DMS or a live one. + if (target.dms_id() <= last_durable_dms_id) { + return true; + } + + return false; + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_bootstrap.h b/src/kudu/tablet/tablet_bootstrap.h new file mode 100644 index 000000000000..be3353c70d1f --- /dev/null +++ b/src/kudu/tablet/tablet_bootstrap.h @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TABLET_TABLET_BOOTSTRAP_H_
+#define KUDU_TABLET_TABLET_BOOTSTRAP_H_
+
+#include <boost/thread/shared_mutex.hpp>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "kudu/common/schema.h"
+#include "kudu/consensus/log.pb.h"
+#include "kudu/gutil/gscoped_ptr.h"
+#include "kudu/gutil/ref_counted.h"
+#include "kudu/util/status.h"
+
+namespace kudu {
+
+class MetricRegistry;
+class Partition;
+class PartitionSchema;
+
+namespace log {
+class Log;
+class LogAnchorRegistry;
+}
+
+namespace consensus {
+struct ConsensusBootstrapInfo;
+} // namespace consensus
+
+namespace server {
+class Clock;
+}
+
+namespace tablet {
+class Tablet;
+class TabletMetadata;
+
+// A listener for logging the tablet related statuses as well as
+// piping it into the web UI.
+class TabletStatusListener {
+ public:
+  explicit TabletStatusListener(const scoped_refptr<TabletMetadata>& meta);
+
+  ~TabletStatusListener();
+
+  void StatusMessage(const std::string& status);
+
+  const std::string tablet_id() const;
+
+  const std::string table_name() const;
+
+  const Partition& partition() const;
+
+  const Schema& schema() const;
+
+  std::string last_status() const {
+    boost::shared_lock<boost::shared_mutex> l(lock_);
+    return last_status_;
+  }
+
+ private:
+  mutable boost::shared_mutex lock_;
+
+  scoped_refptr<TabletMetadata> meta_;
+  std::string last_status_;
+
+  DISALLOW_COPY_AND_ASSIGN(TabletStatusListener);
+};
+
+extern const char* kLogRecoveryDir;
+
+// Bootstraps a tablet, initializing it with the provided metadata. If the tablet
+// has blocks and log segments, this method rebuilds the soft state by replaying
+// the Log.
+//
+// This is a synchronous method, but is typically called within a thread pool by
+// TSTabletManager.
+Status BootstrapTablet(const scoped_refptr<TabletMetadata>& meta,
+                       const scoped_refptr<server::Clock>& clock,
+                       const std::shared_ptr<MemTracker>& mem_tracker,
+                       MetricRegistry* metric_registry,
+                       TabletStatusListener* status_listener,
+                       std::shared_ptr<tablet::Tablet>* rebuilt_tablet,
+                       scoped_refptr<log::Log>* rebuilt_log,
+                       const scoped_refptr<log::LogAnchorRegistry>& log_anchor_registry,
+                       consensus::ConsensusBootstrapInfo* consensus_info);
+
+} // namespace tablet
+} // namespace kudu
+
+#endif /* KUDU_TABLET_TABLET_BOOTSTRAP_H_ */
diff --git a/src/kudu/tablet/tablet_metadata-test.cc b/src/kudu/tablet/tablet_metadata-test.cc
new file mode 100644
index 000000000000..a56ac80c44e9
--- /dev/null
+++ b/src/kudu/tablet/tablet_metadata-test.cc
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+
+#include "kudu/common/schema.h"
+#include "kudu/common/wire_protocol-test-util.h"
+#include "kudu/fs/fs_manager.h"
+#include "kudu/gutil/ref_counted.h"
+#include "kudu/tablet/local_tablet_writer.h"
+#include "kudu/tablet/tablet-test-util.h"
+
+namespace kudu {
+namespace tablet {
+
+class TestTabletMetadata : public KuduTabletTest {
+ public:
+  TestTabletMetadata()
+    : KuduTabletTest(GetSimpleTestSchema()) {
+  }
+
+  virtual void SetUp() OVERRIDE {
+    KuduTabletTest::SetUp();
+    writer_.reset(new LocalTabletWriter(harness_->tablet().get(),
+                                        &client_schema_));
+  }
+
+  void BuildPartialRow(int key, int intval, const char* strval,
+                       gscoped_ptr<KuduPartialRow>* row);
+
+ protected:
+  gscoped_ptr<LocalTabletWriter> writer_;
+};
+
+void TestTabletMetadata::BuildPartialRow(int key, int intval, const char* strval,
+                                         gscoped_ptr<KuduPartialRow>* row) {
+  row->reset(new KuduPartialRow(&client_schema_));
+  CHECK_OK((*row)->SetInt32(0, key));
+  CHECK_OK((*row)->SetInt32(1, intval));
+  CHECK_OK((*row)->SetStringCopy(2, strval));
+}
+
+// Test that loading & storing the superblock results in an equivalent file.
+TEST_F(TestTabletMetadata, TestLoadFromSuperBlock) {
+  // Write some data to the tablet and flush.
+  gscoped_ptr<KuduPartialRow> row;
+  BuildPartialRow(0, 0, "foo", &row);
+  writer_->Insert(*row);
+  ASSERT_OK(harness_->tablet()->Flush());
+
+  // Create one more rowset. Write and flush.
+  BuildPartialRow(1, 1, "bar", &row);
+  writer_->Insert(*row);
+  ASSERT_OK(harness_->tablet()->Flush());
+
+  // Shut down the tablet.
+  harness_->tablet()->Shutdown();
+
+  TabletMetadata* meta = harness_->tablet()->metadata();
+
+  // Dump the superblock to a PB. Save the PB to the side.
+  TabletSuperBlockPB superblock_pb_1;
+  ASSERT_OK(meta->ToSuperBlock(&superblock_pb_1));
+
+  // Load the superblock PB back into the TabletMetadata.
+  ASSERT_OK(meta->ReplaceSuperBlock(superblock_pb_1));
+
+  // Dump the tablet metadata to a superblock PB again, and save it.
+  TabletSuperBlockPB superblock_pb_2;
+  ASSERT_OK(meta->ToSuperBlock(&superblock_pb_2));
+
+  // Compare the 2 dumped superblock PBs.
+  ASSERT_EQ(superblock_pb_1.SerializeAsString(),
+            superblock_pb_2.SerializeAsString())
+    << superblock_pb_1.DebugString()
+    << superblock_pb_2.DebugString();
+
+  LOG(INFO) << "Superblocks match:\n"
+            << superblock_pb_1.DebugString();
+}
+
+
+} // namespace tablet
+} // namespace kudu
diff --git a/src/kudu/tablet/tablet_metadata.cc b/src/kudu/tablet/tablet_metadata.cc
new file mode 100644
index 000000000000..92a3961a6f7e
--- /dev/null
+++ b/src/kudu/tablet/tablet_metadata.cc
@@ -0,0 +1,636 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "kudu/tablet/tablet_metadata.h"
+
+#include <algorithm>
+#include <boost/optional.hpp>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "kudu/common/wire_protocol.h"
+#include "kudu/consensus/opid.pb.h"
+#include "kudu/consensus/opid_util.h"
+#include "kudu/gutil/atomicops.h"
+#include "kudu/gutil/bind.h"
+#include "kudu/gutil/dynamic_annotations.h"
+#include "kudu/gutil/map-util.h"
+#include "kudu/gutil/stl_util.h"
+#include "kudu/gutil/strings/substitute.h"
+#include "kudu/server/metadata.h"
+#include "kudu/tablet/rowset_metadata.h"
+#include "kudu/util/debug/trace_event.h"
+#include "kudu/util/logging.h"
+#include "kudu/util/pb_util.h"
+#include "kudu/util/status.h"
+#include "kudu/util/flag_tags.h"
+#include "kudu/util/trace.h"
+
+DEFINE_bool(enable_tablet_orphaned_block_deletion, true,
+            "Whether to enable deletion of orphaned blocks from disk. "
+            "Note: This is only exposed for debugging purposes!");
+TAG_FLAG(enable_tablet_orphaned_block_deletion, advanced);
+TAG_FLAG(enable_tablet_orphaned_block_deletion, hidden);
+TAG_FLAG(enable_tablet_orphaned_block_deletion, runtime);
+
+using std::shared_ptr;
+
+using base::subtle::Barrier_AtomicIncrement;
+using strings::Substitute;
+
+using kudu::consensus::MinimumOpId;
+using kudu::consensus::OpId;
+using kudu::consensus::RaftConfigPB;
+
+namespace kudu {
+namespace tablet {
+
+const int64 kNoDurableMemStore = -1;
+
+// ============================================================================
+// Tablet Metadata
+// ============================================================================
+
+Status TabletMetadata::CreateNew(FsManager* fs_manager,
+                                 const string& tablet_id,
+                                 const string& table_name,
+                                 const Schema& schema,
+                                 const PartitionSchema& partition_schema,
+                                 const Partition& partition,
+                                 const TabletDataState& initial_tablet_data_state,
+                                 scoped_refptr<TabletMetadata>* metadata) {
+
+  // Verify that no existing tablet exists with the same ID.
+  if (fs_manager->env()->FileExists(fs_manager->GetTabletMetadataPath(tablet_id))) {
+    return Status::AlreadyPresent("Tablet already exists", tablet_id);
+  }
+
+  scoped_refptr<TabletMetadata> ret(new TabletMetadata(fs_manager,
+                                                       tablet_id,
+                                                       table_name,
+                                                       schema,
+                                                       partition_schema,
+                                                       partition,
+                                                       initial_tablet_data_state));
+  RETURN_NOT_OK(ret->Flush());
+  metadata->swap(ret);
+  return Status::OK();
+}
+
+Status TabletMetadata::Load(FsManager* fs_manager,
+                            const string& tablet_id,
+                            scoped_refptr<TabletMetadata>* metadata) {
+  scoped_refptr<TabletMetadata> ret(new TabletMetadata(fs_manager, tablet_id));
+  RETURN_NOT_OK(ret->LoadFromDisk());
+  metadata->swap(ret);
+  return Status::OK();
+}
+
+Status TabletMetadata::LoadOrCreate(FsManager* fs_manager,
+                                    const string& tablet_id,
+                                    const string& table_name,
+                                    const Schema& schema,
+                                    const PartitionSchema& partition_schema,
+                                    const Partition& partition,
+                                    const TabletDataState& initial_tablet_data_state,
+                                    scoped_refptr<TabletMetadata>* metadata) {
+  Status s = Load(fs_manager, tablet_id, metadata);
+  if (s.ok()) {
+    if (!(*metadata)->schema().Equals(schema)) {
+      return Status::Corruption(Substitute("Schema on disk ($0) does not "
+        "match expected schema ($1)", (*metadata)->schema().ToString(),
+        schema.ToString()));
+    }
+    return Status::OK();
+  } else if (s.IsNotFound()) {
+    return CreateNew(fs_manager, tablet_id, table_name, schema,
+                     partition_schema, partition, initial_tablet_data_state,
+                     metadata);
+  } else {
+    return s;
+  }
+}
+
+void TabletMetadata::CollectBlockIdPBs(const TabletSuperBlockPB& superblock,
+                                       std::vector<BlockIdPB>* block_ids) {
+  for (const RowSetDataPB& rowset : superblock.rowsets()) {
+    for (const ColumnDataPB& column : rowset.columns()) {
+      block_ids->push_back(column.block());
+    }
+    for (const DeltaDataPB& redo : rowset.redo_deltas()) {
+      block_ids->push_back(redo.block());
+    }
+    for (const DeltaDataPB& undo : rowset.undo_deltas()) {
+      block_ids->push_back(undo.block());
+    }
+    if (rowset.has_bloom_block()) {
+      block_ids->push_back(rowset.bloom_block());
+    }
+    if (rowset.has_adhoc_index_block()) {
+      block_ids->push_back(rowset.adhoc_index_block());
+    }
+  }
+}
+
+Status TabletMetadata::DeleteTabletData(TabletDataState delete_type,
+                                        const boost::optional<OpId>& last_logged_opid) {
+  CHECK(delete_type == TABLET_DATA_DELETED ||
+        delete_type == TABLET_DATA_TOMBSTONED)
+      << "DeleteTabletData() called with unsupported delete_type on tablet "
+      << tablet_id_ << ": " << TabletDataState_Name(delete_type)
+      << " (" << delete_type << ")";
+
+  // First add all of our blocks to the orphan list
+  // and clear our rowsets. This serves to erase all the data.
+  //
+  // We also set the state in our persisted metadata to indicate that
+  // we have been deleted.
+  {
+    boost::lock_guard<LockType> l(data_lock_);
+    for (const shared_ptr<RowSetMetadata>& rsmd : rowsets_) {
+      AddOrphanedBlocksUnlocked(rsmd->GetAllBlocks());
+    }
+    rowsets_.clear();
+    tablet_data_state_ = delete_type;
+    if (last_logged_opid) {
+      tombstone_last_logged_opid_ = *last_logged_opid;
+    }
+  }
+
+  // Flushing will sync the new tablet_data_state_ to disk and will now also
+  // delete all the data.
+  RETURN_NOT_OK(Flush());
+
+  // Re-sync to disk one more time.
+  // This call will typically re-sync with an empty orphaned blocks list
+  // (unless deleting any orphans failed during the last Flush()), so that we
+  // don't try to re-delete the deleted orphaned blocks on every startup.
+  return Flush();
+}
+
+Status TabletMetadata::DeleteSuperBlock() {
+  boost::lock_guard<LockType> l(data_lock_);
+  if (!orphaned_blocks_.empty()) {
+    return Status::InvalidArgument("The metadata for tablet " + tablet_id_ +
+                                   " still references orphaned blocks. "
+                                   "Call DeleteTabletData() first");
+  }
+  if (tablet_data_state_ != TABLET_DATA_DELETED) {
+    return Status::IllegalState(
+        Substitute("Tablet $0 is not in TABLET_DATA_DELETED state. "
+                   "Call DeleteTabletData(TABLET_DATA_DELETED) first. "
+                   "Tablet data state: $1 ($2)",
+                   tablet_id_,
+                   TabletDataState_Name(tablet_data_state_),
+                   tablet_data_state_));
+  }
+
+  string path = fs_manager_->GetTabletMetadataPath(tablet_id_);
+  RETURN_NOT_OK_PREPEND(fs_manager_->env()->DeleteFile(path),
+                        "Unable to delete superblock for tablet " + tablet_id_);
+  return Status::OK();
+}
+
+TabletMetadata::TabletMetadata(FsManager* fs_manager, string tablet_id,
+                               string table_name, const Schema& schema,
+                               PartitionSchema partition_schema,
+                               Partition partition,
+                               const TabletDataState& tablet_data_state)
+    : state_(kNotWrittenYet),
+      tablet_id_(std::move(tablet_id)),
+      partition_(std::move(partition)),
+      fs_manager_(fs_manager),
+      next_rowset_idx_(0),
+      last_durable_mrs_id_(kNoDurableMemStore),
+      schema_(new Schema(schema)),
+      schema_version_(0),
+      table_name_(std::move(table_name)),
+      partition_schema_(std::move(partition_schema)),
+      tablet_data_state_(tablet_data_state),
+      tombstone_last_logged_opid_(MinimumOpId()),
+      num_flush_pins_(0),
+      needs_flush_(false),
+      pre_flush_callback_(Bind(DoNothingStatusClosure)) {
+  CHECK(schema_->has_column_ids());
+  CHECK_GT(schema_->num_key_columns(), 0);
+}
+
+TabletMetadata::~TabletMetadata() {
+  STLDeleteElements(&old_schemas_);
+  delete schema_;
+}
+
+TabletMetadata::TabletMetadata(FsManager* fs_manager, string tablet_id)
+    : state_(kNotLoadedYet),
+      tablet_id_(std::move(tablet_id)),
+      fs_manager_(fs_manager),
+      next_rowset_idx_(0),
+      schema_(nullptr),
+      tombstone_last_logged_opid_(MinimumOpId()),
+      num_flush_pins_(0),
+      needs_flush_(false),
+      pre_flush_callback_(Bind(DoNothingStatusClosure)) {}
+
+Status TabletMetadata::LoadFromDisk() {
+  TRACE_EVENT1("tablet", "TabletMetadata::LoadFromDisk",
+               "tablet_id", tablet_id_);
+
+  CHECK_EQ(state_, kNotLoadedYet);
+
+  TabletSuperBlockPB superblock;
+  RETURN_NOT_OK(ReadSuperBlockFromDisk(&superblock));
+  RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(superblock),
+                        "Failed to load data from superblock protobuf");
+  state_ = kInitialized;
+  return Status::OK();
+}
+
+Status TabletMetadata::LoadFromSuperBlock(const TabletSuperBlockPB& superblock) {
+  vector<BlockId> orphaned_blocks;
+
+  VLOG(2) << "Loading TabletMetadata from SuperBlockPB:" << std::endl
+          << superblock.DebugString();
+
+  {
+    boost::lock_guard<LockType> l(data_lock_);
+
+    // Verify that the tablet id matches with the one in the protobuf
+    if (superblock.tablet_id() != tablet_id_) {
+      return Status::Corruption("Expected id=" + tablet_id_ +
+                                " found " + superblock.tablet_id(),
+                                superblock.DebugString());
+    }
+
+    table_id_ = superblock.table_id();
+    last_durable_mrs_id_ = superblock.last_durable_mrs_id();
+
+    table_name_ = superblock.table_name();
+
+    uint32_t schema_version = superblock.schema_version();
+    gscoped_ptr<Schema> schema(new Schema());
+    RETURN_NOT_OK_PREPEND(SchemaFromPB(superblock.schema(), schema.get()),
+                          "Failed to parse Schema from superblock " +
+                          superblock.ShortDebugString());
+    SetSchemaUnlocked(schema.Pass(), schema_version);
+
+    // This check provides backwards compatibility with the
+    // flexible-partitioning changes introduced in KUDU-818.
+    if (superblock.has_partition()) {
+      RETURN_NOT_OK(PartitionSchema::FromPB(superblock.partition_schema(),
+                                            *schema_, &partition_schema_));
+      Partition::FromPB(superblock.partition(), &partition_);
+    } else {
+      // This clause may be removed after compatibility with tables created
+      // before KUDU-818 is not needed.
+      RETURN_NOT_OK(PartitionSchema::FromPB(PartitionSchemaPB(), *schema_, &partition_schema_));
+      PartitionPB partition;
+      if (!superblock.has_start_key() || !superblock.has_end_key()) {
+        return Status::Corruption(
+            "tablet superblock must contain either a partition or start and end primary keys",
+            superblock.ShortDebugString());
+      }
+      partition.set_partition_key_start(superblock.start_key());
+      partition.set_partition_key_end(superblock.end_key());
+      Partition::FromPB(partition, &partition_);
+    }
+
+    tablet_data_state_ = superblock.tablet_data_state();
+
+    rowsets_.clear();
+    for (const RowSetDataPB& rowset_pb : superblock.rowsets()) {
+      gscoped_ptr<RowSetMetadata> rowset_meta;
+      RETURN_NOT_OK(RowSetMetadata::Load(this, rowset_pb, &rowset_meta));
+      next_rowset_idx_ = std::max(next_rowset_idx_, rowset_meta->id() + 1);
+      rowsets_.push_back(shared_ptr<RowSetMetadata>(rowset_meta.release()));
+    }
+
+    for (const BlockIdPB& block_pb : superblock.orphaned_blocks()) {
+      orphaned_blocks.push_back(BlockId::FromPB(block_pb));
+    }
+    AddOrphanedBlocksUnlocked(orphaned_blocks);
+
+    if (superblock.has_tombstone_last_logged_opid()) {
+      tombstone_last_logged_opid_ = superblock.tombstone_last_logged_opid();
+    } else {
+      tombstone_last_logged_opid_ = MinimumOpId();
+    }
+  }
+
+  // Now is a good time to clean up any orphaned blocks that may have been
+  // left behind from a crash just after replacing the superblock.
+  if (!fs_manager()->read_only()) {
+    DeleteOrphanedBlocks(orphaned_blocks);
+  }
+
+  return Status::OK();
+}
+
+Status TabletMetadata::UpdateAndFlush(const RowSetMetadataIds& to_remove,
+                                      const RowSetMetadataVector& to_add,
+                                      int64_t last_durable_mrs_id) {
+  {
+    boost::lock_guard<LockType> l(data_lock_);
+    RETURN_NOT_OK(UpdateUnlocked(to_remove, to_add, last_durable_mrs_id));
+  }
+  return Flush();
+}
+
+void TabletMetadata::AddOrphanedBlocks(const vector<BlockId>& blocks) {
+  boost::lock_guard<LockType> l(data_lock_);
+  AddOrphanedBlocksUnlocked(blocks);
+}
+
+void TabletMetadata::AddOrphanedBlocksUnlocked(const vector<BlockId>& blocks) {
+  DCHECK(data_lock_.is_locked());
+  orphaned_blocks_.insert(blocks.begin(), blocks.end());
+}
+
+void TabletMetadata::DeleteOrphanedBlocks(const vector<BlockId>& blocks) {
+  if (PREDICT_FALSE(!FLAGS_enable_tablet_orphaned_block_deletion)) {
+    LOG_WITH_PREFIX(WARNING) << "Not deleting " << blocks.size()
+        << " block(s) from disk. Block deletion disabled via "
+        << "--enable_tablet_orphaned_block_deletion=false";
+    return;
+  }
+
+  vector<BlockId> deleted;
+  for (const BlockId& b : blocks) {
+    Status s = fs_manager()->DeleteBlock(b);
+    // If we get NotFound, then the block was actually successfully
+    // deleted before. So, we can remove it from our orphaned block list
+    // as if it was a success.
+    if (!s.ok() && !s.IsNotFound()) {
+      WARN_NOT_OK(s, Substitute("Could not delete block $0", b.ToString()));
+      continue;
+    }
+
+    deleted.push_back(b);
+  }
+
+  // Remove the successfully-deleted blocks from the set.
+  {
+    boost::lock_guard<LockType> l(data_lock_);
+    for (const BlockId& b : deleted) {
+      orphaned_blocks_.erase(b);
+    }
+  }
+}
+
+void TabletMetadata::PinFlush() {
+  boost::lock_guard<LockType> l(data_lock_);
+  CHECK_GE(num_flush_pins_, 0);
+  num_flush_pins_++;
+  VLOG(1) << "Number of flush pins: " << num_flush_pins_;
+}
+
+Status TabletMetadata::UnPinFlush() {
+  boost::unique_lock<LockType> l(data_lock_);
+  CHECK_GT(num_flush_pins_, 0);
+  num_flush_pins_--;
+  if (needs_flush_) {
+    l.unlock();
+    RETURN_NOT_OK(Flush());
+  }
+  return Status::OK();
+}
+
+Status TabletMetadata::Flush() {
+  TRACE_EVENT1("tablet", "TabletMetadata::Flush",
+               "tablet_id", tablet_id_);
+
+  MutexLock l_flush(flush_lock_);
+  vector<BlockId> orphaned;
+  TabletSuperBlockPB pb;
+  {
+    boost::lock_guard<LockType> l(data_lock_);
+    CHECK_GE(num_flush_pins_, 0);
+    if (num_flush_pins_ > 0) {
+      needs_flush_ = true;
+      LOG(INFO) << "Not flushing: waiting for " << num_flush_pins_ << " pins to be released.";
+      return Status::OK();
+    }
+    needs_flush_ = false;
+
+    RETURN_NOT_OK(ToSuperBlockUnlocked(&pb, rowsets_));
+
+    // Make a copy of the orphaned blocks list which corresponds to the superblock
+    // that we're writing. It's important to take this local copy to avoid a race
+    // in which another thread may add new orphaned blocks to the 'orphaned_blocks_'
+    // set while we're in the process of writing the new superblock to disk. We don't
+    // want to accidentally delete those blocks before that next metadata update
+    // is persisted. See KUDU-701 for details.
+    orphaned.assign(orphaned_blocks_.begin(), orphaned_blocks_.end());
+  }
+  pre_flush_callback_.Run();
+  RETURN_NOT_OK(ReplaceSuperBlockUnlocked(pb));
+  TRACE("Metadata flushed");
+  l_flush.Unlock();
+
+  // Now that the superblock is written, try to delete the orphaned blocks.
+  //
+  // If we crash just before the deletion, we'll retry when reloading from
+  // disk; the orphaned blocks were persisted as part of the superblock.
+  DeleteOrphanedBlocks(orphaned);
+
+  return Status::OK();
+}
+
+Status TabletMetadata::UpdateUnlocked(
+    const RowSetMetadataIds& to_remove,
+    const RowSetMetadataVector& to_add,
+    int64_t last_durable_mrs_id) {
+  DCHECK(data_lock_.is_locked());
+  CHECK_NE(state_, kNotLoadedYet);
+  if (last_durable_mrs_id != kNoMrsFlushed) {
+    DCHECK_GE(last_durable_mrs_id, last_durable_mrs_id_);
+    last_durable_mrs_id_ = last_durable_mrs_id;
+  }
+
+  RowSetMetadataVector new_rowsets = rowsets_;
+  auto it = new_rowsets.begin();
+  while (it != new_rowsets.end()) {
+    if (ContainsKey(to_remove, (*it)->id())) {
+      AddOrphanedBlocksUnlocked((*it)->GetAllBlocks());
+      it = new_rowsets.erase(it);
+    } else {
+      it++;
+    }
+  }
+
+  for (const shared_ptr<RowSetMetadata>& meta : to_add) {
+    new_rowsets.push_back(meta);
+  }
+  rowsets_ = new_rowsets;
+
+  TRACE("TabletMetadata updated");
+  return Status::OK();
+}
+
+Status TabletMetadata::ReplaceSuperBlock(const TabletSuperBlockPB &pb) {
+  {
+    MutexLock l(flush_lock_);
+    RETURN_NOT_OK_PREPEND(ReplaceSuperBlockUnlocked(pb), "Unable to replace superblock");
+  }
+
+  RETURN_NOT_OK_PREPEND(LoadFromSuperBlock(pb),
+                        "Failed to load data from superblock protobuf");
+
+  return Status::OK();
+}
+
+Status TabletMetadata::ReplaceSuperBlockUnlocked(const TabletSuperBlockPB &pb) {
+  flush_lock_.AssertAcquired();
+
+  string path = fs_manager_->GetTabletMetadataPath(tablet_id_);
+  RETURN_NOT_OK_PREPEND(pb_util::WritePBContainerToPath(
+                            fs_manager_->env(), path, pb,
+                            pb_util::OVERWRITE, pb_util::SYNC),
+                        Substitute("Failed to write tablet metadata $0", tablet_id_));
+
+  return Status::OK();
+}
+
+Status TabletMetadata::ReadSuperBlockFromDisk(TabletSuperBlockPB* superblock) const {
+  string path = fs_manager_->GetTabletMetadataPath(tablet_id_);
+  RETURN_NOT_OK_PREPEND(
+      pb_util::ReadPBContainerFromPath(fs_manager_->env(), path, superblock),
+      Substitute("Could not load tablet metadata from $0", path));
+  return Status::OK();
+}
+
+Status TabletMetadata::ToSuperBlock(TabletSuperBlockPB* super_block) const {
+  // acquire the lock so that rowsets_ doesn't get changed until we're finished.
+  boost::lock_guard<LockType> l(data_lock_);
+  return ToSuperBlockUnlocked(super_block, rowsets_);
+}
+
+Status TabletMetadata::ToSuperBlockUnlocked(TabletSuperBlockPB* super_block,
+                                            const RowSetMetadataVector& rowsets) const {
+  DCHECK(data_lock_.is_locked());
+  // Convert to protobuf
+  TabletSuperBlockPB pb;
+  pb.set_table_id(table_id_);
+  pb.set_tablet_id(tablet_id_);
+  partition_.ToPB(pb.mutable_partition());
+  pb.set_last_durable_mrs_id(last_durable_mrs_id_);
+  pb.set_schema_version(schema_version_);
+  partition_schema_.ToPB(pb.mutable_partition_schema());
+  pb.set_table_name(table_name_);
+
+  for (const shared_ptr<RowSetMetadata>& meta : rowsets) {
+    meta->ToProtobuf(pb.add_rowsets());
+  }
+
+  DCHECK(schema_->has_column_ids());
+  RETURN_NOT_OK_PREPEND(SchemaToPB(*schema_, pb.mutable_schema()),
+                        "Couldn't serialize schema into superblock");
+
+  pb.set_tablet_data_state(tablet_data_state_);
+  if (!OpIdEquals(tombstone_last_logged_opid_, MinimumOpId())) {
+    *pb.mutable_tombstone_last_logged_opid() = tombstone_last_logged_opid_;
+  }
+
+  for (const BlockId& block_id : orphaned_blocks_) {
+    block_id.CopyToPB(pb.mutable_orphaned_blocks()->Add());
+  }
+
+  super_block->Swap(&pb);
+  return Status::OK();
+}
+
+Status TabletMetadata::CreateRowSet(shared_ptr<RowSetMetadata> *rowset,
+                                    const Schema& schema) {
+  AtomicWord rowset_idx = Barrier_AtomicIncrement(&next_rowset_idx_, 1) - 1;
+  gscoped_ptr<RowSetMetadata> scoped_rsm;
+  RETURN_NOT_OK(RowSetMetadata::CreateNew(this, rowset_idx, &scoped_rsm));
+  rowset->reset(DCHECK_NOTNULL(scoped_rsm.release()));
+  return Status::OK();
+}
+
+const RowSetMetadata *TabletMetadata::GetRowSetForTests(int64_t id) const {
+  for (const shared_ptr<RowSetMetadata>& rowset_meta : rowsets_) {
+    if (rowset_meta->id() == id) {
+      return rowset_meta.get();
+    }
+  }
+  return nullptr;
+}
+
+RowSetMetadata *TabletMetadata::GetRowSetForTests(int64_t id) {
+  boost::lock_guard<LockType> l(data_lock_);
+  for (const shared_ptr<RowSetMetadata>& rowset_meta : rowsets_) {
+    if (rowset_meta->id() == id) {
+      return rowset_meta.get();
+    }
+  }
+  return nullptr;
+}
+
+void TabletMetadata::SetSchema(const Schema& schema, uint32_t version) {
+  gscoped_ptr<Schema> new_schema(new Schema(schema));
+  boost::lock_guard<LockType> l(data_lock_);
+  SetSchemaUnlocked(new_schema.Pass(), version);
+}
+
+void TabletMetadata::SetSchemaUnlocked(gscoped_ptr<Schema> new_schema, uint32_t version) {
+  DCHECK(new_schema->has_column_ids());
+
+  Schema* old_schema = schema_;
+  // "Release" barrier ensures that, when we publish the new Schema object,
+  // all of its initialization is also visible.
+  base::subtle::Release_Store(reinterpret_cast<AtomicWord*>(&schema_),
+                              reinterpret_cast<AtomicWord>(new_schema.release()));
+  if (PREDICT_TRUE(old_schema)) {
+    old_schemas_.push_back(old_schema);
+  }
+  schema_version_ = version;
+}
+
+void TabletMetadata::SetTableName(const string& table_name) {
+  boost::lock_guard<LockType> l(data_lock_);
+  table_name_ = table_name;
+}
+
+string TabletMetadata::table_name() const {
+  boost::lock_guard<LockType> l(data_lock_);
+  DCHECK_NE(state_, kNotLoadedYet);
+  return table_name_;
+}
+
+uint32_t TabletMetadata::schema_version() const {
+  boost::lock_guard<LockType> l(data_lock_);
+  DCHECK_NE(state_, kNotLoadedYet);
+  return schema_version_;
+}
+
+void TabletMetadata::set_tablet_data_state(TabletDataState state) {
+  boost::lock_guard<LockType> l(data_lock_);
+  tablet_data_state_ = state;
+}
+
+string TabletMetadata::LogPrefix() const {
+  return Substitute("T $0 P $1: ", tablet_id_, fs_manager_->uuid());
+}
+
+TabletDataState TabletMetadata::tablet_data_state() const {
+  boost::lock_guard<LockType> l(data_lock_);
+  return tablet_data_state_;
+}
+
+} // namespace tablet
+} // namespace kudu
diff --git a/src/kudu/tablet/tablet_metadata.h b/src/kudu/tablet/tablet_metadata.h
new file mode 100644
index 000000000000..e956ea612c58
--- /dev/null
+++ b/src/kudu/tablet/tablet_metadata.h
@@ -0,0 +1,349 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or 
more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TABLET_METADATA_H +#define KUDU_TABLET_TABLET_METADATA_H + +#include +#include +#include +#include +#include + +#include "kudu/common/partition.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/opid.pb.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/tablet/metadata.pb.h" +#include "kudu/util/mutex.h" +#include "kudu/util/status.h" +#include "kudu/util/status_callback.h" + +namespace kudu { +namespace tablet { + +class RowSetMetadata; +class RowSetMetadataUpdate; + +typedef std::vector > RowSetMetadataVector; +typedef std::unordered_set RowSetMetadataIds; + +extern const int64 kNoDurableMemStore; + +// Manages the "blocks tracking" for the specified tablet. +// +// TabletMetadata is owned by the Tablet. As new blocks are written to store +// the tablet's data, the Tablet calls Flush() to persist the block list +// on disk. +// +// At startup, the TSTabletManager will load a TabletMetadata for each +// super block found in the tablets/ directory, and then instantiate +// tablets from this data. 
+class TabletMetadata : public RefCountedThreadSafe { + public: + // Create metadata for a new tablet. This assumes that the given superblock + // has not been written before, and writes out the initial superblock with + // the provided parameters. + static Status CreateNew(FsManager* fs_manager, + const std::string& tablet_id, + const std::string& table_name, + const Schema& schema, + const PartitionSchema& partition_schema, + const Partition& partition, + const TabletDataState& initial_tablet_data_state, + scoped_refptr* metadata); + + // Load existing metadata from disk. + static Status Load(FsManager* fs_manager, + const std::string& tablet_id, + scoped_refptr* metadata); + + // Try to load an existing tablet. If it does not exist, create it. + // If it already existed, verifies that the schema of the tablet matches the + // provided 'schema'. + // + // This is mostly useful for tests which instantiate tablets directly. + static Status LoadOrCreate(FsManager* fs_manager, + const std::string& tablet_id, + const std::string& table_name, + const Schema& schema, + const PartitionSchema& partition_schema, + const Partition& partition, + const TabletDataState& initial_tablet_data_state, + scoped_refptr* metadata); + + static void CollectBlockIdPBs(const TabletSuperBlockPB& superblock, + std::vector* block_ids); + + const std::string& tablet_id() const { + DCHECK_NE(state_, kNotLoadedYet); + return tablet_id_; + } + + // Returns the partition of the tablet. + const Partition& partition() const { + return partition_; + } + + std::string table_id() const { + DCHECK_NE(state_, kNotLoadedYet); + return table_id_; + } + + std::string table_name() const; + + uint32_t schema_version() const; + + void SetSchema(const Schema& schema, uint32_t version); + + void SetTableName(const std::string& table_name); + + // Return a reference to the current schema. + // This pointer will be valid until the TabletMetadata is destructed, + // even if the schema is changed. 
+ const Schema& schema() const { + const Schema* s = reinterpret_cast( + base::subtle::Acquire_Load(reinterpret_cast(&schema_))); + return *s; + } + + // Returns the partition schema of the tablet's table. + const PartitionSchema& partition_schema() const { + return partition_schema_; + } + + // Set / get the remote bootstrap / tablet data state. + void set_tablet_data_state(TabletDataState state); + TabletDataState tablet_data_state() const; + + // Increments flush pin count by one: if flush pin count > 0, + // metadata will _not_ be flushed to disk during Flush(). + void PinFlush(); + + // Decrements flush pin count by one: if flush pin count is zero, + // metadata will be flushed to disk during the next call to Flush() + // or -- if Flush() had been called after a call to PinFlush() but + // before this method was called -- Flush() will be called inside + // this method. + Status UnPinFlush(); + + Status Flush(); + + // Updates the metadata in the following ways: + // 1. Adds rowsets from 'to_add'. + // 2. Removes rowsets from 'to_remove'. + // 3. Adds orphaned blocks from 'to_remove'. + // 4. Updates the last durable MRS ID from 'last_durable_mrs_id', + // assuming it's not kNoMrsFlushed. + static const int64_t kNoMrsFlushed = -1; + Status UpdateAndFlush(const RowSetMetadataIds& to_remove, + const RowSetMetadataVector& to_add, + int64_t last_durable_mrs_id); + + // Adds the blocks referenced by 'block_ids' to 'orphaned_blocks_'. + // + // This set will be written to the on-disk metadata in any subsequent + // flushes. + // + // Blocks are removed from this set after they are successfully deleted + // in a call to DeleteOrphanedBlocks(). + void AddOrphanedBlocks(const std::vector& block_ids); + + // Mark the superblock to be in state 'delete_type', sync it to disk, and + // then delete all of the rowsets in this tablet. + // The metadata (superblock) is not deleted. For that, call DeleteSuperBlock(). 
+ // + // 'delete_type' must be one of TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED. + // 'last_logged_opid' should be set to the last opid in the log, if any is known. + // If 'last_logged_opid' is not set, then the current value of + // last_logged_opid is not modified. This is important for roll-forward of + // partially-tombstoned tablets during crash recovery. + // + // Returns only once all data has been removed. + Status DeleteTabletData(TabletDataState delete_type, + const boost::optional& last_logged_opid); + + // Permanently deletes the superblock from the disk. + // DeleteTabletData() must first be called and the tablet data state must be + // TABLET_DATA_DELETED. + // Returns Status::InvalidArgument if the list of orphaned blocks is not empty. + // Returns Status::IllegalState if the tablet data state is not TABLET_DATA_DELETED. + Status DeleteSuperBlock(); + + // Create a new RowSetMetadata for this tablet. + // Does not add the new rowset to the list of rowsets. Use one of the Update() + // calls to do so. + Status CreateRowSet(std::shared_ptr *rowset, const Schema& schema); + + const RowSetMetadataVector& rowsets() const { return rowsets_; } + + FsManager *fs_manager() const { return fs_manager_; } + + int64_t last_durable_mrs_id() const { return last_durable_mrs_id_; } + + void SetLastDurableMrsIdForTests(int64_t mrs_id) { last_durable_mrs_id_ = mrs_id; } + + void SetPreFlushCallback(StatusClosure callback) { pre_flush_callback_ = callback; } + + consensus::OpId tombstone_last_logged_opid() const { return tombstone_last_logged_opid_; } + + // Loads the currently-flushed superblock from disk into the given protobuf. + Status ReadSuperBlockFromDisk(TabletSuperBlockPB* superblock) const; + + // Sets *super_block to the serialized form of the current metadata. + Status ToSuperBlock(TabletSuperBlockPB* super_block) const; + + // Fully replace a superblock (used for bootstrap). 
+ Status ReplaceSuperBlock(const TabletSuperBlockPB &pb); + + // ========================================================================== + // Stuff used by the tests + // ========================================================================== + const RowSetMetadata *GetRowSetForTests(int64_t id) const; + + RowSetMetadata *GetRowSetForTests(int64_t id); + + private: + friend class RefCountedThreadSafe; + friend class MetadataTest; + + // Compile time assert that no one deletes TabletMetadata objects. + ~TabletMetadata(); + + // Constructor for creating a new tablet. + // + // TODO: get rid of this many-arg constructor in favor of just passing in a + // SuperBlock, which already contains all of these fields. + TabletMetadata(FsManager* fs_manager, std::string tablet_id, + std::string table_name, const Schema& schema, + PartitionSchema partition_schema, Partition partition, + const TabletDataState& tablet_data_state); + + // Constructor for loading an existing tablet. + TabletMetadata(FsManager* fs_manager, std::string tablet_id); + + void SetSchemaUnlocked(gscoped_ptr schema, uint32_t version); + + Status LoadFromDisk(); + + // Update state of metadata to that of the given superblock PB. + Status LoadFromSuperBlock(const TabletSuperBlockPB& superblock); + + Status ReadSuperBlock(TabletSuperBlockPB *pb); + + // Fully replace superblock. + // Requires 'flush_lock_'. + Status ReplaceSuperBlockUnlocked(const TabletSuperBlockPB &pb); + + // Requires 'data_lock_'. + Status UpdateUnlocked(const RowSetMetadataIds& to_remove, + const RowSetMetadataVector& to_add, + int64_t last_durable_mrs_id); + + // Requires 'data_lock_'. + Status ToSuperBlockUnlocked(TabletSuperBlockPB* super_block, + const RowSetMetadataVector& rowsets) const; + + // Requires 'data_lock_'. + void AddOrphanedBlocksUnlocked(const std::vector& block_ids); + + // Deletes the provided 'blocks' on disk. + // + // All blocks that are successfully deleted are removed from the + // 'orphaned_blocks_' set. 
+ // + // Failures are logged, but are not fatal. + void DeleteOrphanedBlocks(const std::vector& blocks); + + // Return standard "T xxx P yyy" log prefix. + std::string LogPrefix() const; + + enum State { + kNotLoadedYet, + kNotWrittenYet, + kInitialized + }; + State state_; + + // Lock protecting the underlying data. + typedef simple_spinlock LockType; + mutable LockType data_lock_; + + // Lock protecting flushing the data to disk. + // If taken together with 'data_lock_', must be acquired first. + mutable Mutex flush_lock_; + + const std::string tablet_id_; + std::string table_id_; + + Partition partition_; + + FsManager* const fs_manager_; + RowSetMetadataVector rowsets_; + + base::subtle::Atomic64 next_rowset_idx_; + + int64_t last_durable_mrs_id_; + + // The current schema version. This is owned by this class. + // We don't use gscoped_ptr so that we can do an atomic swap. + Schema* schema_; + uint32_t schema_version_; + std::string table_name_; + PartitionSchema partition_schema_; + + // Previous values of 'schema_'. + // These are currently kept alive forever, under the assumption that + // a given tablet won't have thousands of "alter table" calls. + // They are kept alive so that callers of schema() don't need to + // worry about reference counting or locking. + std::vector old_schemas_; + + // Protected by 'data_lock_'. + std::unordered_set orphaned_blocks_; + + // The current state of remote bootstrap for the tablet. + TabletDataState tablet_data_state_; + + // Record of the last opid logged by the tablet before it was last + // tombstoned. Has no meaning for non-tombstoned tablets. + consensus::OpId tombstone_last_logged_opid_; + + // If this counter is > 0 then Flush() will not write any data to + // disk. + int32_t num_flush_pins_; + + // Set if Flush() is called when num_flush_pins_ is > 0; if true, + // then next UnPinFlush will call Flush() again to ensure the + // metadata is persisted. 
+ bool needs_flush_; + + // A callback that, if set, is called before this metadata is flushed + // to disk. + StatusClosure pre_flush_callback_; + + DISALLOW_COPY_AND_ASSIGN(TabletMetadata); +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_TABLET_METADATA_H */ diff --git a/src/kudu/tablet/tablet_metrics.cc b/src/kudu/tablet/tablet_metrics.cc new file mode 100644 index 000000000000..6a6d87de46bf --- /dev/null +++ b/src/kudu/tablet/tablet_metrics.cc @@ -0,0 +1,270 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/tablet/tablet_metrics.h" + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/metrics.h" +#include "kudu/util/trace.h" + +// Tablet-specific metrics. 
+METRIC_DEFINE_counter(tablet, rows_inserted, "Rows Inserted", + kudu::MetricUnit::kRows, + "Number of rows inserted into this tablet since service start"); +METRIC_DEFINE_counter(tablet, rows_updated, "Rows Updated", + kudu::MetricUnit::kRows, + "Number of row update operations performed on this tablet since service start"); +METRIC_DEFINE_counter(tablet, rows_deleted, "Rows Deleted", + kudu::MetricUnit::kRows, + "Number of row delete operations performed on this tablet since service start"); + +METRIC_DEFINE_counter(tablet, scanner_rows_returned, "Scanner Rows Returned", + kudu::MetricUnit::kRows, + "Number of rows returned by scanners to clients. This count " + "is measured after predicates are applied, and thus is not " + "a reflection of the amount of work being done by scanners."); +METRIC_DEFINE_counter(tablet, scanner_cells_returned, "Scanner Cells Returned", + kudu::MetricUnit::kCells, + "Number of table cells returned by scanners to clients. This count " + "is measured after predicates are applied, and thus is not " + "a reflection of the amount of work being done by scanners."); +METRIC_DEFINE_counter(tablet, scanner_bytes_returned, "Scanner Bytes Returned", + kudu::MetricUnit::kBytes, + "Number of bytes returned by scanners to clients. This count " + "is measured after predicates are applied and the data is decoded " + "for consumption by clients, and thus is not " + "a reflection of the amount of work being done by scanners."); + + +METRIC_DEFINE_counter(tablet, scanner_rows_scanned, "Scanner Rows Scanned", + kudu::MetricUnit::kRows, + "Number of rows processed by scan requests. This is measured " + "as a raw count prior to application of predicates, deleted data," + "or MVCC-based filtering. 
Thus, this is a better measure of actual " + "table rows that have been processed by scan operations compared " + "to the Scanner Rows Returned metric."); + +METRIC_DEFINE_counter(tablet, scanner_cells_scanned_from_disk, "Scanner Cells Scanned From Disk", + kudu::MetricUnit::kCells, + "Number of table cells processed by scan requests. This is measured " + "as a raw count prior to application of predicates, deleted data," + "or MVCC-based filtering. Thus, this is a better measure of actual " + "table cells that have been processed by scan operations compared " + "to the Scanner Cells Returned metric.\n" + "Note that this only counts data that has been flushed to disk, " + "and does not include data read from in-memory stores. However, it" + "includes both cache misses and cache hits."); + +METRIC_DEFINE_counter(tablet, scanner_bytes_scanned_from_disk, "Scanner Bytes Scanned From Disk", + kudu::MetricUnit::kBytes, + "Number of bytes read by scan requests. This is measured " + "as a raw count prior to application of predicates, deleted data," + "or MVCC-based filtering. Thus, this is a better measure of actual " + "IO that has been caused by scan operations compared " + "to the Scanner Bytes Returned metric.\n" + "Note that this only counts data that has been flushed to disk, " + "and does not include data read from in-memory stores. 
However, it" + "includes both cache misses and cache hits."); + + +METRIC_DEFINE_counter(tablet, insertions_failed_dup_key, "Duplicate Key Inserts", + kudu::MetricUnit::kRows, + "Number of inserts which failed because the key already existed"); +METRIC_DEFINE_counter(tablet, scans_started, "Scans Started", + kudu::MetricUnit::kScanners, + "Number of scanners which have been started on this tablet"); + +METRIC_DEFINE_counter(tablet, bloom_lookups, "Bloom Filter Lookups", + kudu::MetricUnit::kProbes, + "Number of times a bloom filter was consulted"); +METRIC_DEFINE_counter(tablet, key_file_lookups, "Key File Lookups", + kudu::MetricUnit::kProbes, + "Number of times a key cfile was consulted"); +METRIC_DEFINE_counter(tablet, delta_file_lookups, "Delta File Lookups", + kudu::MetricUnit::kProbes, + "Number of times a delta file was consulted"); +METRIC_DEFINE_counter(tablet, mrs_lookups, "MemRowSet Lookups", + kudu::MetricUnit::kProbes, + "Number of times a MemRowSet was consulted."); +METRIC_DEFINE_counter(tablet, bytes_flushed, "Bytes Flushed", + kudu::MetricUnit::kBytes, + "Amount of data that has been flushed to disk by this tablet."); + +METRIC_DEFINE_histogram(tablet, bloom_lookups_per_op, "Bloom Lookups per Operation", + kudu::MetricUnit::kProbes, + "Tracks the number of bloom filter lookups performed by each " + "operation. A single operation may perform several bloom filter " + "lookups if the tablet is not fully compacted. High frequency of " + "high values may indicate that compaction is falling behind.", + 20, 2); + +METRIC_DEFINE_histogram(tablet, key_file_lookups_per_op, "Key Lookups per Operation", + kudu::MetricUnit::kProbes, + "Tracks the number of key file lookups performed by each " + "operation. 
A single operation may perform several key file " + "lookups if the tablet is not fully compacted and if bloom filters " + "are not effectively culling lookups.", 20, 2); + +METRIC_DEFINE_histogram(tablet, delta_file_lookups_per_op, "Delta File Lookups per Operation", + kudu::MetricUnit::kProbes, + "Tracks the number of delta file lookups performed by each " + "operation. A single operation may perform several delta file " + "lookups if the tablet is not fully compacted. High frequency of " + "high values may indicate that compaction is falling behind.", 20, 2); + +METRIC_DEFINE_histogram(tablet, write_op_duration_client_propagated_consistency, + "Write Op Duration with Propagated Consistency", + kudu::MetricUnit::kMicroseconds, + "Duration of writes to this tablet with external consistency set to CLIENT_PROPAGATED.", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, write_op_duration_commit_wait_consistency, + "Write Op Duration with Commit-Wait Consistency", + kudu::MetricUnit::kMicroseconds, + "Duration of writes to this tablet with external consistency set to COMMIT_WAIT.", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, commit_wait_duration, + "Commit-Wait Duration", + kudu::MetricUnit::kMicroseconds, + "Time spent waiting for COMMIT_WAIT external consistency writes for this tablet.", + 60000000LU, 2); + +METRIC_DEFINE_histogram(tablet, snapshot_read_inflight_wait_duration, + "Time Waiting For Snapshot Reads", + kudu::MetricUnit::kMicroseconds, + "Time spent waiting for in-flight writes to complete for READ_AT_SNAPSHOT scans.", + 60000000LU, 2); + +METRIC_DEFINE_gauge_uint32(tablet, flush_dms_running, + "DeltaMemStore Flushes Running", + kudu::MetricUnit::kMaintenanceOperations, + "Number of delta memstore flushes currently running."); + +METRIC_DEFINE_gauge_uint32(tablet, flush_mrs_running, + "MemRowSet Flushes Running", + kudu::MetricUnit::kMaintenanceOperations, + "Number of MemRowSet flushes currently running."); + 
+METRIC_DEFINE_gauge_uint32(tablet, compact_rs_running, + "RowSet Compactions Running", + kudu::MetricUnit::kMaintenanceOperations, + "Number of RowSet compactions currently running."); + +METRIC_DEFINE_gauge_uint32(tablet, delta_minor_compact_rs_running, + "Minor Delta Compactions Running", + kudu::MetricUnit::kMaintenanceOperations, + "Number of delta minor compactions currently running."); + +METRIC_DEFINE_gauge_uint32(tablet, delta_major_compact_rs_running, + "Major Delta Compactions Running", + kudu::MetricUnit::kMaintenanceOperations, + "Number of delta major compactions currently running."); + +METRIC_DEFINE_histogram(tablet, flush_dms_duration, + "DeltaMemStore Flush Duration", + kudu::MetricUnit::kMilliseconds, + "Time spent flushing DeltaMemStores.", 60000LU, 1); + +METRIC_DEFINE_histogram(tablet, flush_mrs_duration, + "MemRowSet Flush Duration", + kudu::MetricUnit::kMilliseconds, + "Time spent flushing MemRowSets.", 60000LU, 1); + +METRIC_DEFINE_histogram(tablet, compact_rs_duration, + "RowSet Compaction Duration", + kudu::MetricUnit::kMilliseconds, + "Time spent compacting RowSets.", 60000LU, 1); + +METRIC_DEFINE_histogram(tablet, delta_minor_compact_rs_duration, + "Minor Delta Compaction Duration", + kudu::MetricUnit::kMilliseconds, + "Time spent minor delta compacting.", 60000LU, 1); + +METRIC_DEFINE_histogram(tablet, delta_major_compact_rs_duration, + "Major Delta Compaction Duration", + kudu::MetricUnit::kSeconds, + "Seconds spent major delta compacting.", 60000000LU, 2); + +METRIC_DEFINE_counter(tablet, leader_memory_pressure_rejections, + "Leader Memory Pressure Rejections", + kudu::MetricUnit::kRequests, + "Number of RPC requests rejected due to memory pressure while LEADER."); + +using strings::Substitute; + +namespace kudu { +namespace tablet { + +#define MINIT(x) x(METRIC_##x.Instantiate(entity)) +#define GINIT(x) x(METRIC_##x.Instantiate(entity, 0)) +TabletMetrics::TabletMetrics(const scoped_refptr& entity) + : MINIT(rows_inserted), + 
MINIT(rows_updated), + MINIT(rows_deleted), + MINIT(insertions_failed_dup_key), + MINIT(scanner_rows_returned), + MINIT(scanner_cells_returned), + MINIT(scanner_bytes_returned), + MINIT(scanner_rows_scanned), + MINIT(scanner_cells_scanned_from_disk), + MINIT(scanner_bytes_scanned_from_disk), + MINIT(scans_started), + MINIT(bloom_lookups), + MINIT(key_file_lookups), + MINIT(delta_file_lookups), + MINIT(mrs_lookups), + MINIT(bytes_flushed), + MINIT(bloom_lookups_per_op), + MINIT(key_file_lookups_per_op), + MINIT(delta_file_lookups_per_op), + MINIT(commit_wait_duration), + MINIT(snapshot_read_inflight_wait_duration), + MINIT(write_op_duration_client_propagated_consistency), + MINIT(write_op_duration_commit_wait_consistency), + GINIT(flush_dms_running), + GINIT(flush_mrs_running), + GINIT(compact_rs_running), + GINIT(delta_minor_compact_rs_running), + GINIT(delta_major_compact_rs_running), + MINIT(flush_dms_duration), + MINIT(flush_mrs_duration), + MINIT(compact_rs_duration), + MINIT(delta_minor_compact_rs_duration), + MINIT(delta_major_compact_rs_duration), + MINIT(leader_memory_pressure_rejections) { +} +#undef MINIT +#undef GINIT + +void TabletMetrics::AddProbeStats(const ProbeStats& stats) { + bloom_lookups->IncrementBy(stats.blooms_consulted); + key_file_lookups->IncrementBy(stats.keys_consulted); + delta_file_lookups->IncrementBy(stats.deltas_consulted); + mrs_lookups->IncrementBy(stats.mrs_consulted); + + bloom_lookups_per_op->Increment(stats.blooms_consulted); + key_file_lookups_per_op->Increment(stats.keys_consulted); + delta_file_lookups_per_op->Increment(stats.deltas_consulted); + + TRACE("ProbeStats: bloom_lookups=$0,key_file_lookups=$1," + "delta_file_lookups=$2,mrs_lookups=$3", + stats.blooms_consulted, stats.keys_consulted, + stats.deltas_consulted, stats.mrs_consulted); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_metrics.h b/src/kudu/tablet/tablet_metrics.h new file mode 100644 index 000000000000..f8c60cb34c03 
--- /dev/null +++ b/src/kudu/tablet/tablet_metrics.h @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TABLET_METRICS_H +#define KUDU_TABLET_TABLET_METRICS_H + +#include "kudu/gutil/macros.h" +#include "kudu/tablet/rowset.h" + +namespace kudu { + +class Counter; +template +class AtomicGauge; +class Histogram; +class MetricEntity; + +namespace tablet { + +struct ProbeStats; + +// Container for all metrics specific to a single tablet. 
+struct TabletMetrics { + explicit TabletMetrics(const scoped_refptr& metric_entity); + + void AddProbeStats(const ProbeStats& stats); + + // Operation rates + scoped_refptr rows_inserted; + scoped_refptr rows_updated; + scoped_refptr rows_deleted; + scoped_refptr insertions_failed_dup_key; + scoped_refptr scanner_rows_returned; + scoped_refptr scanner_cells_returned; + scoped_refptr scanner_bytes_returned; + scoped_refptr scanner_rows_scanned; + scoped_refptr scanner_cells_scanned_from_disk; + scoped_refptr scanner_bytes_scanned_from_disk; + scoped_refptr scans_started; + + // Probe stats + scoped_refptr bloom_lookups; + scoped_refptr key_file_lookups; + scoped_refptr delta_file_lookups; + scoped_refptr mrs_lookups; + scoped_refptr bytes_flushed; + + scoped_refptr bloom_lookups_per_op; + scoped_refptr key_file_lookups_per_op; + scoped_refptr delta_file_lookups_per_op; + + scoped_refptr commit_wait_duration; + scoped_refptr snapshot_read_inflight_wait_duration; + scoped_refptr write_op_duration_client_propagated_consistency; + scoped_refptr write_op_duration_commit_wait_consistency; + + scoped_refptr > flush_dms_running; + scoped_refptr > flush_mrs_running; + scoped_refptr > compact_rs_running; + scoped_refptr > delta_minor_compact_rs_running; + scoped_refptr > delta_major_compact_rs_running; + + scoped_refptr flush_dms_duration; + scoped_refptr flush_mrs_duration; + scoped_refptr compact_rs_duration; + scoped_refptr delta_minor_compact_rs_duration; + scoped_refptr delta_major_compact_rs_duration; + + scoped_refptr leader_memory_pressure_rejections; +}; + +class ProbeStatsSubmitter { + public: + ProbeStatsSubmitter(const ProbeStats& stats, TabletMetrics* metrics) + : stats_(stats), + metrics_(metrics) { + } + + ~ProbeStatsSubmitter() { + if (metrics_) { + metrics_->AddProbeStats(stats_); + } + } + + private: + const ProbeStats& stats_; + TabletMetrics* const metrics_; + + DISALLOW_COPY_AND_ASSIGN(ProbeStatsSubmitter); +}; + +} // namespace tablet +} // namespace 
kudu +#endif /* KUDU_TABLET_TABLET_METRICS_H */ diff --git a/src/kudu/tablet/tablet_mm_ops-test.cc b/src/kudu/tablet/tablet_mm_ops-test.cc new file mode 100644 index 000000000000..421f18255853 --- /dev/null +++ b/src/kudu/tablet/tablet_mm_ops-test.cc @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tablet/tablet_mm_ops.h" +#include "kudu/tablet/tablet-test-base.h" + +namespace kudu { +namespace tablet { + +class KuduTabletMmOpsTest : public TabletTestBase> { + protected: + typedef TabletTestBase > Superclass; + + KuduTabletMmOpsTest() + : Superclass(), + next_time_(MonoTime::Now(MonoTime::FINE)) { + } + + virtual void SetUp() OVERRIDE { + Superclass::SetUp(); + TabletMetrics* metrics = tablet()->metrics(); + all_possible_metrics_.push_back(metrics->flush_mrs_duration); + all_possible_metrics_.push_back(metrics->flush_dms_duration); + all_possible_metrics_.push_back(metrics->compact_rs_duration); + all_possible_metrics_.push_back(metrics->delta_minor_compact_rs_duration); + all_possible_metrics_.push_back(metrics->delta_major_compact_rs_duration); + } + + // Functions that call MaintenanceOp::UpdateStats() first sleep for a nominal + // amount of time, to ensure the "before" and "after" timestamps are unique + // if the stats are modified. + void StatsShouldChange(MaintenanceOp* op) { + SleepFor(MonoDelta::FromMilliseconds(1)); + op->UpdateStats(&stats_); + ASSERT_TRUE(next_time_.ComesBefore(stats_.last_modified())); + next_time_ = stats_.last_modified(); + } + + void StatsShouldNotChange(MaintenanceOp* op) { + SleepFor(MonoDelta::FromMilliseconds(1)); + op->UpdateStats(&stats_); + ASSERT_TRUE(next_time_.Equals(stats_.last_modified())); + next_time_ = stats_.last_modified(); + } + + void TestFirstCall(MaintenanceOp* op) { + // The very first call to UpdateStats() will update the stats, but + // subsequent calls are cached. 
+ NO_FATALS(StatsShouldChange(op)); + NO_FATALS(StatsShouldNotChange(op)); + NO_FATALS(StatsShouldNotChange(op)); + } + + void TestAffectedMetrics(MaintenanceOp* op, + const unordered_set< + scoped_refptr, + ScopedRefPtrHashFunctor, + ScopedRefPtrEqualToFunctor >& metrics) { + for (const scoped_refptr& c : all_possible_metrics_) { + c->Increment(1); // value doesn't matter + if (ContainsKey(metrics, c)) { + NO_FATALS(StatsShouldChange(op)); + } + NO_FATALS(StatsShouldNotChange(op)); + NO_FATALS(StatsShouldNotChange(op)); + } + } + + MaintenanceOpStats stats_; + MonoTime next_time_; + vector > all_possible_metrics_; +}; + +TEST_F(KuduTabletMmOpsTest, TestCompactRowSetsOpCacheStats) { + CompactRowSetsOp op(tablet().get()); + NO_FATALS(TestFirstCall(&op)); + NO_FATALS(TestAffectedMetrics(&op, { tablet()->metrics()->flush_mrs_duration, + tablet()->metrics()->compact_rs_duration })); +} + +TEST_F(KuduTabletMmOpsTest, TestMinorDeltaCompactionOpCacheStats) { + MinorDeltaCompactionOp op(tablet().get()); + NO_FATALS(TestFirstCall(&op)); + NO_FATALS(TestAffectedMetrics(&op, { tablet()->metrics()->flush_mrs_duration, + tablet()->metrics()->flush_dms_duration, + tablet()->metrics()->compact_rs_duration, + tablet()->metrics()->delta_minor_compact_rs_duration })); +} + +TEST_F(KuduTabletMmOpsTest, TestMajorDeltaCompactionOpCacheStats) { + MajorDeltaCompactionOp op(tablet().get()); + NO_FATALS(TestFirstCall(&op)); + NO_FATALS(TestAffectedMetrics(&op, { tablet()->metrics()->flush_mrs_duration, + tablet()->metrics()->flush_dms_duration, + tablet()->metrics()->compact_rs_duration, + tablet()->metrics()->delta_minor_compact_rs_duration, + tablet()->metrics()->delta_major_compact_rs_duration })); +} +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_mm_ops.h b/src/kudu/tablet/tablet_mm_ops.h new file mode 100644 index 000000000000..cecf4439f667 --- /dev/null +++ b/src/kudu/tablet/tablet_mm_ops.h @@ -0,0 +1,120 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_TABLET_MM_OPS_H_ +#define KUDU_TABLET_TABLET_MM_OPS_H_ + +#include "kudu/tablet/maintenance_manager.h" + +namespace kudu { + +class Histogram; +template +class AtomicGauge; + +namespace tablet { + +// MaintenanceOp for rowset compaction. +// +// This periodically invokes the tablet's CompactionPolicy to select a compaction. The +// compaction policy's "quality" is used as a proxy for the performance improvement which +// is exposed back to the maintenance manager. As compactions become more fruitful (i.e. +// more overlapping rowsets), the perf_improvement score goes up, increasing priority +// with which a compaction on this tablet will be selected by the maintenance manager. 
+class CompactRowSetsOp : public MaintenanceOp { + public: + explicit CompactRowSetsOp(Tablet* tablet); + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE; + + virtual bool Prepare() OVERRIDE; + + virtual void Perform() OVERRIDE; + + virtual scoped_refptr DurationHistogram() const OVERRIDE; + + virtual scoped_refptr > RunningGauge() const OVERRIDE; + + private: + mutable simple_spinlock lock_; + MaintenanceOpStats prev_stats_; + uint64_t last_num_mrs_flushed_; + uint64_t last_num_rs_compacted_; + Tablet* const tablet_; +}; + +// MaintenanceOp to run minor compaction on delta stores. +// +// There is only one MinorDeltaCompactionOp per tablet, so it picks the RowSet that needs the most +// work. The RS we end up compacting in Perform() can be different than the one reported in +// UpdateStats, we just pick the worst each time. +class MinorDeltaCompactionOp : public MaintenanceOp { + public: + explicit MinorDeltaCompactionOp(Tablet* tablet); + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE; + + virtual bool Prepare() OVERRIDE; + + virtual void Perform() OVERRIDE; + + virtual scoped_refptr DurationHistogram() const OVERRIDE; + + virtual scoped_refptr > RunningGauge() const OVERRIDE; + + private: + mutable simple_spinlock lock_; + MaintenanceOpStats prev_stats_; + uint64_t last_num_mrs_flushed_; + uint64_t last_num_dms_flushed_; + uint64_t last_num_rs_compacted_; + uint64_t last_num_rs_minor_delta_compacted_; + Tablet* const tablet_; +}; + +// MaintenanceOp to run major compaction on delta stores. +// +// It functions just like MinorDeltaCompactionOp does, except it runs major compactions. 
+class MajorDeltaCompactionOp : public MaintenanceOp { + public: + explicit MajorDeltaCompactionOp(Tablet* tablet); + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE; + + virtual bool Prepare() OVERRIDE; + + virtual void Perform() OVERRIDE; + + virtual scoped_refptr DurationHistogram() const OVERRIDE; + + virtual scoped_refptr > RunningGauge() const OVERRIDE; + + private: + mutable simple_spinlock lock_; + MaintenanceOpStats prev_stats_; + uint64_t last_num_mrs_flushed_; + uint64_t last_num_dms_flushed_; + uint64_t last_num_rs_compacted_; + uint64_t last_num_rs_minor_delta_compacted_; + uint64_t last_num_rs_major_delta_compacted_; + Tablet* const tablet_; +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_TABLET_MM_OPS_H_ */ diff --git a/src/kudu/tablet/tablet_peer-test.cc b/src/kudu/tablet/tablet_peer-test.cc new file mode 100644 index 000000000000..915c7bcf0f8f --- /dev/null +++ b/src/kudu/tablet/tablet_peer-test.cc @@ -0,0 +1,568 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/timestamp.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/clock.h" +#include "kudu/server/logical_clock.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/transactions/transaction.h" +#include "kudu/tablet/transactions/transaction_driver.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/tablet_peer_mm_ops.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/threadpool.h" + +METRIC_DECLARE_entity(tablet); + +DECLARE_int32(log_min_seconds_to_retain); + +namespace kudu { +namespace tablet { + +using consensus::CommitMsg; +using consensus::Consensus; +using consensus::ConsensusBootstrapInfo; +using consensus::ConsensusMetadata; +using consensus::MakeOpId; +using consensus::MinimumOpId; +using consensus::OpId; +using consensus::OpIdEquals; +using consensus::RaftPeerPB; +using consensus::WRITE_OP; +using log::Log; +using log::LogAnchorRegistry; +using log::LogOptions; +using rpc::Messenger; +using server::Clock; +using server::LogicalClock; +using std::shared_ptr; +using std::string; +using strings::Substitute; +using tserver::WriteRequestPB; +using tserver::WriteResponsePB; + +static Schema GetTestSchema() { + return Schema({ ColumnSchema("key", INT32) }, 1); +} + +class TabletPeerTest : public KuduTabletTest { + public: + TabletPeerTest() + : KuduTabletTest(GetTestSchema()), + 
insert_counter_(0), + delete_counter_(0) { + } + + virtual void SetUp() OVERRIDE { + KuduTabletTest::SetUp(); + + ASSERT_OK(ThreadPoolBuilder("apply").Build(&apply_pool_)); + + rpc::MessengerBuilder builder(CURRENT_TEST_NAME()); + ASSERT_OK(builder.Build(&messenger_)); + + metric_entity_ = METRIC_ENTITY_tablet.Instantiate(&metric_registry_, "test-tablet"); + + RaftPeerPB config_peer; + config_peer.set_permanent_uuid(tablet()->metadata()->fs_manager()->uuid()); + config_peer.set_member_type(RaftPeerPB::VOTER); + + // "Bootstrap" and start the TabletPeer. + tablet_peer_.reset( + new TabletPeer(make_scoped_refptr(tablet()->metadata()), + config_peer, + apply_pool_.get(), + Bind(&TabletPeerTest::TabletPeerStateChangedCallback, + Unretained(this), + tablet()->tablet_id()))); + + // Make TabletPeer use the same LogAnchorRegistry as the Tablet created by the harness. + // TODO: Refactor TabletHarness to allow taking a LogAnchorRegistry, while also providing + // TabletMetadata for consumption by TabletPeer before Tablet is instantiated. 
+ tablet_peer_->log_anchor_registry_ = tablet()->log_anchor_registry_; + + RaftConfigPB config; + config.set_local(true); + config.add_peers()->CopyFrom(config_peer); + config.set_opid_index(consensus::kInvalidOpIdIndex); + + gscoped_ptr cmeta; + ASSERT_OK(ConsensusMetadata::Create(tablet()->metadata()->fs_manager(), + tablet()->tablet_id(), + tablet()->metadata()->fs_manager()->uuid(), + config, + consensus::kMinimumTerm, &cmeta)); + + scoped_refptr log; + ASSERT_OK(Log::Open(LogOptions(), fs_manager(), tablet()->tablet_id(), + *tablet()->schema(), tablet()->metadata()->schema_version(), + metric_entity_.get(), &log)); + + tablet_peer_->SetBootstrapping(); + ASSERT_OK(tablet_peer_->Init(tablet(), + clock(), + messenger_, + log, + metric_entity_)); + } + + Status StartPeer(const ConsensusBootstrapInfo& info) { + RETURN_NOT_OK(tablet_peer_->Start(info)); + + return Status::OK(); + } + + void TabletPeerStateChangedCallback(const string& tablet_id, const string& reason) { + LOG(INFO) << "Tablet peer state changed for tablet " << tablet_id << ". Reason: " << reason; + } + + virtual void TearDown() OVERRIDE { + tablet_peer_->Shutdown(); + apply_pool_->Shutdown(); + KuduTabletTest::TearDown(); + } + + protected: + // Generate monotonic sequence of key column integers. + Status GenerateSequentialInsertRequest(WriteRequestPB* write_req) { + Schema schema(GetTestSchema()); + write_req->set_tablet_id(tablet()->tablet_id()); + CHECK_OK(SchemaToPB(schema, write_req->mutable_schema())); + + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32("key", insert_counter_++)); + + RowOperationsPBEncoder enc(write_req->mutable_row_operations()); + enc.Add(RowOperationsPB::INSERT, row); + return Status::OK(); + } + + // Generate monotonic sequence of deletions, starting with 0. + // Will assert if you try to delete more rows than you inserted. 
+ Status GenerateSequentialDeleteRequest(WriteRequestPB* write_req) { + CHECK_LT(delete_counter_, insert_counter_); + Schema schema(GetTestSchema()); + write_req->set_tablet_id(tablet()->tablet_id()); + CHECK_OK(SchemaToPB(schema, write_req->mutable_schema())); + + KuduPartialRow row(&schema); + CHECK_OK(row.SetInt32("key", delete_counter_++)); + + RowOperationsPBEncoder enc(write_req->mutable_row_operations()); + enc.Add(RowOperationsPB::DELETE, row); + return Status::OK(); + } + + Status ExecuteWriteAndRollLog(TabletPeer* tablet_peer, const WriteRequestPB& req) { + gscoped_ptr resp(new WriteResponsePB()); + auto tx_state = new WriteTransactionState(tablet_peer, &req, resp.get()); + + CountDownLatch rpc_latch(1); + tx_state->set_completion_callback(gscoped_ptr( + new LatchTransactionCompletionCallback(&rpc_latch, resp.get())).Pass()); + + CHECK_OK(tablet_peer->SubmitWrite(tx_state)); + rpc_latch.Wait(); + CHECK(!resp->has_error()) + << "\nReq:\n" << req.DebugString() << "Resp:\n" << resp->DebugString(); + + // Roll the log after each write. + // Usually the append thread does the roll and no additional sync is required. However in + // this test the thread that is appending is not the same thread that is rolling the log + // so we must make sure the Log's queue is flushed before we roll or we might have a race + // between the appender thread and the thread executing the test. + CHECK_OK(tablet_peer->log_->WaitUntilAllFlushed()); + CHECK_OK(tablet_peer->log_->AllocateSegmentAndRollOver()); + return Status::OK(); + } + + // Execute insert requests and roll log after each one. + Status ExecuteInsertsAndRollLogs(int num_inserts) { + for (int i = 0; i < num_inserts; i++) { + gscoped_ptr req(new WriteRequestPB()); + RETURN_NOT_OK(GenerateSequentialInsertRequest(req.get())); + RETURN_NOT_OK(ExecuteWriteAndRollLog(tablet_peer_.get(), *req)); + } + + return Status::OK(); + } + + // Execute delete requests and roll log after each one. 
+ Status ExecuteDeletesAndRollLogs(int num_deletes) { + for (int i = 0; i < num_deletes; i++) { + gscoped_ptr req(new WriteRequestPB()); + CHECK_OK(GenerateSequentialDeleteRequest(req.get())); + CHECK_OK(ExecuteWriteAndRollLog(tablet_peer_.get(), *req)); + } + + return Status::OK(); + } + + void AssertNoLogAnchors() { + // Make sure that there are no registered anchors in the registry + CHECK_EQ(0, tablet_peer_->log_anchor_registry()->GetAnchorCountForTests()); + int64_t earliest_index = -1; + // And that there are no in-flight transactions (which are implicit + // anchors) by comparing the TabletPeer's earliest needed OpId and the last + // entry in the log; if they match there is nothing in flight. + tablet_peer_->GetEarliestNeededLogIndex(&earliest_index); + OpId last_log_opid; + tablet_peer_->log_->GetLatestEntryOpId(&last_log_opid); + CHECK_EQ(earliest_index, last_log_opid.index()) + << "Found unexpected anchor: " << earliest_index + << " Last log entry: " << last_log_opid.ShortDebugString(); + } + + // Assert that the Log GC() anchor is earlier than the latest OpId in the Log. + void AssertLogAnchorEarlierThanLogLatest() { + int64_t earliest_index = -1; + tablet_peer_->GetEarliestNeededLogIndex(&earliest_index); + OpId last_log_opid; + tablet_peer_->log_->GetLatestEntryOpId(&last_log_opid); + CHECK_LT(earliest_index, last_log_opid.index()) + << "Expected valid log anchor, got earliest opid: " << earliest_index + << " (expected any value earlier than last log id: " << last_log_opid.ShortDebugString() + << ")"; + } + + // We disable automatic log GC. Don't leak those changes. + google::FlagSaver flag_saver_; + + int32_t insert_counter_; + int32_t delete_counter_; + MetricRegistry metric_registry_; + scoped_refptr metric_entity_; + shared_ptr messenger_; + scoped_refptr tablet_peer_; + gscoped_ptr apply_pool_; +}; + +// A Transaction that waits on the apply_continue latch inside of Apply(). 
+class DelayedApplyTransaction : public WriteTransaction { + public: + DelayedApplyTransaction(CountDownLatch* apply_started, + CountDownLatch* apply_continue, + WriteTransactionState* state) + : WriteTransaction(state, consensus::LEADER), + apply_started_(DCHECK_NOTNULL(apply_started)), + apply_continue_(DCHECK_NOTNULL(apply_continue)) { + } + + virtual Status Apply(gscoped_ptr* commit_msg) OVERRIDE { + apply_started_->CountDown(); + LOG(INFO) << "Delaying apply..."; + apply_continue_->Wait(); + LOG(INFO) << "Apply proceeding"; + return WriteTransaction::Apply(commit_msg); + } + + private: + CountDownLatch* apply_started_; + CountDownLatch* apply_continue_; + DISALLOW_COPY_AND_ASSIGN(DelayedApplyTransaction); +}; + +// Ensure that Log::GC() doesn't delete logs when the MRS has an anchor. +TEST_F(TabletPeerTest, TestMRSAnchorPreventsLogGC) { + FLAGS_log_min_seconds_to_retain = 0; + ConsensusBootstrapInfo info; + ASSERT_OK(StartPeer(info)); + + Log* log = tablet_peer_->log_.get(); + int32_t num_gced; + + AssertNoLogAnchors(); + + log::SegmentSequence segments; + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_EQ(1, segments.size()); + ASSERT_OK(ExecuteInsertsAndRollLogs(3)); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(4, segments.size()); + + AssertLogAnchorEarlierThanLogLatest(); + ASSERT_GT(tablet_peer_->log_anchor_registry()->GetAnchorCountForTests(), 0); + + // Ensure nothing gets deleted. + int64_t min_log_index = -1; + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(0, num_gced) << "earliest needed: " << min_log_index; + + // Flush MRS as needed to ensure that we don't have OpId anchors in the MRS. + tablet_peer_->tablet()->Flush(); + AssertNoLogAnchors(); + + // The first two segments should be deleted. + // The last is anchored due to the commit in the last segment being the last + // OpId in the log. 
+ tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(2, num_gced) << "earliest needed: " << min_log_index; + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(2, segments.size()); +} + +// Ensure that Log::GC() doesn't delete logs when the DMS has an anchor. +TEST_F(TabletPeerTest, TestDMSAnchorPreventsLogGC) { + FLAGS_log_min_seconds_to_retain = 0; + ConsensusBootstrapInfo info; + ASSERT_OK(StartPeer(info)); + + Log* log = tablet_peer_->log_.get(); + int32_t num_gced; + + AssertNoLogAnchors(); + + log::SegmentSequence segments; + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_EQ(1, segments.size()); + ASSERT_OK(ExecuteInsertsAndRollLogs(2)); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(3, segments.size()); + + // Flush MRS & GC log so the next mutation goes into a DMS. + ASSERT_OK(tablet_peer_->tablet()->Flush()); + int64_t min_log_index = -1; + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + // We will only GC 1, and have 1 left because the earliest needed OpId falls + // back to the latest OpId written to the Log if no anchors are set. + ASSERT_EQ(1, num_gced); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(2, segments.size()); + AssertNoLogAnchors(); + + OpId id; + log->GetLatestEntryOpId(&id); + LOG(INFO) << "Before: " << id.ShortDebugString(); + + + // We currently have no anchors and the last operation in the log is 0.3 + // Before the below was ExecuteDeletesAndRollLogs(1) but that was breaking + // what I think is a wrong assertion. + // I.e. since 0.4 is the last operation that we know is in memory 0.4 is the + // last anchor we expect _and_ it's the last op in the log. + // Only if we apply two operations is the last anchored operation and the + // last operation in the log different. + + // Execute a mutation. 
+ ASSERT_OK(ExecuteDeletesAndRollLogs(2)); + AssertLogAnchorEarlierThanLogLatest(); + ASSERT_GT(tablet_peer_->log_anchor_registry()->GetAnchorCountForTests(), 0); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(4, segments.size()); + + // Execute another couple inserts, but Flush it so it doesn't anchor. + ASSERT_OK(ExecuteInsertsAndRollLogs(2)); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(6, segments.size()); + + // Ensure the delta and last insert remain in the logs, anchored by the delta. + // Note that this will allow GC of the 2nd insert done above. + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(1, num_gced); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(5, segments.size()); + + // Flush DMS to release the anchor. + tablet_peer_->tablet()->FlushBiggestDMS(); + + // Verify no anchors after Flush(). + AssertNoLogAnchors(); + + // We should only hang onto one segment due to no anchors. + // The last log OpId is the commit in the last segment, so it only anchors + // that segment, not the previous, because it's not the first OpId in the + // segment. + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(3, num_gced); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(2, segments.size()); +} + +// Ensure that Log::GC() doesn't compact logs with OpIds of active transactions. 
+TEST_F(TabletPeerTest, TestActiveTransactionPreventsLogGC) { + FLAGS_log_min_seconds_to_retain = 0; + ConsensusBootstrapInfo info; + ASSERT_OK(StartPeer(info)); + + Log* log = tablet_peer_->log_.get(); + int32_t num_gced; + + AssertNoLogAnchors(); + + log::SegmentSequence segments; + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + + ASSERT_EQ(1, segments.size()); + ASSERT_OK(ExecuteInsertsAndRollLogs(4)); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(5, segments.size()); + + // Flush MRS as needed to ensure that we don't have OpId anchors in the MRS. + ASSERT_EQ(1, tablet_peer_->log_anchor_registry()->GetAnchorCountForTests()); + tablet_peer_->tablet()->Flush(); + + // Verify no anchors after Flush(). + AssertNoLogAnchors(); + + // Now create a long-lived Transaction that hangs during Apply(). + // Allow other transactions to go through. Logs should be populated, but the + // long-lived Transaction should prevent the log from being deleted since it + // is in-flight. + CountDownLatch rpc_latch(1); + CountDownLatch apply_started(1); + CountDownLatch apply_continue(1); + gscoped_ptr req(new WriteRequestPB()); + gscoped_ptr resp(new WriteResponsePB()); + { + // Long-running mutation. + ASSERT_OK(GenerateSequentialDeleteRequest(req.get())); + auto tx_state = new WriteTransactionState(tablet_peer_.get(), req.get(), resp.get()); + + tx_state->set_completion_callback(gscoped_ptr( + new LatchTransactionCompletionCallback(&rpc_latch, resp.get())).Pass()); + + gscoped_ptr transaction(new DelayedApplyTransaction(&apply_started, + &apply_continue, + tx_state)); + + scoped_refptr driver; + ASSERT_OK(tablet_peer_->NewLeaderTransactionDriver(transaction.PassAs(), + &driver)); + + ASSERT_OK(driver->ExecuteAsync()); + apply_started.Wait(); + ASSERT_TRUE(driver->GetOpId().IsInitialized()) + << "By the time a transaction is applied, it should have an Opid"; + // The apply will hang until we CountDown() the continue latch. 
+ // Now, roll the log. Below, we execute a few more insertions with rolling. + ASSERT_OK(log->AllocateSegmentAndRollOver()); + } + + ASSERT_EQ(1, tablet_peer_->txn_tracker_.GetNumPendingForTests()); + // The log anchor is currently equal to the latest OpId written to the Log + // because we are delaying the Commit message with the CountDownLatch. + + // GC the first four segments created by the inserts. + int64_t min_log_index = -1; + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(4, num_gced); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(2, segments.size()); + + // We use mutations here, since an MRS Flush() quiesces the tablet, and we + // want to ensure the only thing "anchoring" is the TransactionTracker. + ASSERT_OK(ExecuteDeletesAndRollLogs(3)); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(5, segments.size()); + ASSERT_EQ(1, tablet_peer_->log_anchor_registry()->GetAnchorCountForTests()); + tablet_peer_->tablet()->FlushBiggestDMS(); + ASSERT_EQ(0, tablet_peer_->log_anchor_registry()->GetAnchorCountForTests()); + ASSERT_EQ(1, tablet_peer_->txn_tracker_.GetNumPendingForTests()); + + AssertLogAnchorEarlierThanLogLatest(); + + // Try to GC(), nothing should be deleted due to the in-flight transaction. + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(0, num_gced); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(5, segments.size()); + + // Now we release the transaction and wait for everything to complete. + // We fully quiesce and flush, which should release all anchors. 
+ ASSERT_EQ(1, tablet_peer_->txn_tracker_.GetNumPendingForTests()); + apply_continue.CountDown(); + rpc_latch.Wait(); + tablet_peer_->txn_tracker_.WaitForAllToFinish(); + ASSERT_EQ(0, tablet_peer_->txn_tracker_.GetNumPendingForTests()); + tablet_peer_->tablet()->FlushBiggestDMS(); + AssertNoLogAnchors(); + + // All should be deleted except the two last segments. + tablet_peer_->GetEarliestNeededLogIndex(&min_log_index); + ASSERT_OK(log->GC(min_log_index, &num_gced)); + ASSERT_EQ(3, num_gced); + ASSERT_OK(log->GetLogReader()->GetSegmentsSnapshot(&segments)); + ASSERT_EQ(2, segments.size()); +} + +TEST_F(TabletPeerTest, TestGCEmptyLog) { + ConsensusBootstrapInfo info; + tablet_peer_->Start(info); + // We don't wait on consensus on purpose. + ASSERT_OK(tablet_peer_->RunLogGC()); +} + +TEST_F(TabletPeerTest, TestFlushOpsPerfImprovements) { + MaintenanceOpStats stats; + + // Just on the threshold and not enough time has passed for a time-based flush. + stats.set_ram_anchored(64 * 1024 * 1024); + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 1); + ASSERT_EQ(0.0, stats.perf_improvement()); + stats.Clear(); + + // Just on the threshold and enough time has passed, we'll have a low improvement. + stats.set_ram_anchored(64 * 1024 * 1024); + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 3 * 60 * 1000); + ASSERT_GT(stats.perf_improvement(), 0.01); + stats.Clear(); + + // Way over the threshold, number is much higher than 1. + stats.set_ram_anchored(128 * 1024 * 1024); + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 1); + ASSERT_LT(1.0, stats.perf_improvement()); + stats.Clear(); + + // Below the threshold but have been there a long time, closing in to 1.0. 
+ stats.set_ram_anchored(30 * 1024 * 1024); + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(&stats, 60 * 50 * 1000); + ASSERT_LT(0.7, stats.perf_improvement()); + ASSERT_GT(1.0, stats.perf_improvement()); + stats.Clear(); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_peer.cc b/src/kudu/tablet/tablet_peer.cc new file mode 100644 index 000000000000..1c18187ab5d2 --- /dev/null +++ b/src/kudu/tablet/tablet_peer.cc @@ -0,0 +1,652 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/tablet_peer.h" + +#include +#include +#include +#include + +#include + +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/local_consensus.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/consensus/raft_consensus.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/tablet/transactions/transaction_driver.h" +#include "kudu/tablet/transactions/alter_schema_transaction.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tablet/tablet_peer_mm_ops.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/util/logging.h" +#include "kudu/util/metrics.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/trace.h" + +using std::shared_ptr; + +namespace kudu { +namespace tablet { + +METRIC_DEFINE_histogram(tablet, op_prepare_queue_length, "Operation Prepare Queue Length", + MetricUnit::kTasks, + "Number of operations waiting to be prepared within this tablet. " + "High queue lengths indicate that the server is unable to process " + "operations as fast as they are being written to the WAL.", + 10000, 2); + +METRIC_DEFINE_histogram(tablet, op_prepare_queue_time, "Operation Prepare Queue Time", + MetricUnit::kMicroseconds, + "Time that operations spent waiting in the prepare queue before being " + "processed. 
High queue times indicate that the server is unable to " + "process operations as fast as they are being written to the WAL.", + 10000000, 2); + +METRIC_DEFINE_histogram(tablet, op_prepare_run_time, "Operation Prepare Run Time", + MetricUnit::kMicroseconds, + "Time that operations spent being prepared in the tablet. " + "High values may indicate that the server is under-provisioned or " + "that operations are experiencing high contention with one another for " + "locks.", + 10000000, 2); + +using consensus::Consensus; +using consensus::ConsensusBootstrapInfo; +using consensus::ConsensusMetadata; +using consensus::ConsensusOptions; +using consensus::ConsensusRound; +using consensus::LocalConsensus; +using consensus::OpId; +using consensus::RaftConfigPB; +using consensus::RaftPeerPB; +using consensus::RaftConsensus; +using consensus::ALTER_SCHEMA_OP; +using consensus::WRITE_OP; +using log::Log; +using log::LogAnchorRegistry; +using rpc::Messenger; +using strings::Substitute; +using tserver::TabletServerErrorPB; + +// ============================================================================ +// Tablet Peer +// ============================================================================ +TabletPeer::TabletPeer(const scoped_refptr& meta, + const consensus::RaftPeerPB& local_peer_pb, + ThreadPool* apply_pool, + Callback mark_dirty_clbk) + : meta_(meta), + tablet_id_(meta->tablet_id()), + local_peer_pb_(local_peer_pb), + state_(NOT_STARTED), + status_listener_(new TabletStatusListener(meta)), + apply_pool_(apply_pool), + log_anchor_registry_(new LogAnchorRegistry()), + mark_dirty_clbk_(std::move(mark_dirty_clbk)) {} + +TabletPeer::~TabletPeer() { + boost::lock_guard lock(lock_); + // We should either have called Shutdown(), or we should have never called + // Init(). + CHECK(!tablet_) + << "TabletPeer not fully shut down. 
State: " + << TabletStatePB_Name(state_); +} + +Status TabletPeer::Init(const shared_ptr& tablet, + const scoped_refptr& clock, + const shared_ptr& messenger, + const scoped_refptr& log, + const scoped_refptr& metric_entity) { + + DCHECK(tablet) << "A TabletPeer must be provided with a Tablet"; + DCHECK(log) << "A TabletPeer must be provided with a Log"; + + RETURN_NOT_OK(ThreadPoolBuilder("prepare").set_max_threads(1).Build(&prepare_pool_)); + prepare_pool_->SetQueueLengthHistogram( + METRIC_op_prepare_queue_length.Instantiate(metric_entity)); + prepare_pool_->SetQueueTimeMicrosHistogram( + METRIC_op_prepare_queue_time.Instantiate(metric_entity)); + prepare_pool_->SetRunTimeMicrosHistogram( + METRIC_op_prepare_run_time.Instantiate(metric_entity)); + + { + boost::lock_guard lock(lock_); + CHECK_EQ(BOOTSTRAPPING, state_); + tablet_ = tablet; + clock_ = clock; + messenger_ = messenger; + log_ = log; + + ConsensusOptions options; + options.tablet_id = meta_->tablet_id(); + + TRACE("Creating consensus instance"); + + gscoped_ptr cmeta; + RETURN_NOT_OK(ConsensusMetadata::Load(meta_->fs_manager(), tablet_id_, + meta_->fs_manager()->uuid(), &cmeta)); + + if (cmeta->committed_config().local()) { + consensus_.reset(new LocalConsensus(options, + cmeta.Pass(), + meta_->fs_manager()->uuid(), + clock_, + this, + log_.get())); + } else { + consensus_ = RaftConsensus::Create(options, + cmeta.Pass(), + local_peer_pb_, + metric_entity, + clock_, + this, + messenger_, + log_.get(), + tablet_->mem_tracker(), + mark_dirty_clbk_); + } + } + + if (tablet_->metrics() != nullptr) { + TRACE("Starting instrumentation"); + txn_tracker_.StartInstrumentation(tablet_->GetMetricEntity()); + } + txn_tracker_.StartMemoryTracking(tablet_->mem_tracker()); + + TRACE("TabletPeer::Init() finished"); + VLOG(2) << "T " << tablet_id() << " P " << consensus_->peer_uuid() << ": Peer Initted"; + return Status::OK(); +} + +Status TabletPeer::Start(const ConsensusBootstrapInfo& bootstrap_info) { + lock_guard 
l(&state_change_lock_); + TRACE("Starting consensus"); + + VLOG(2) << "T " << tablet_id() << " P " << consensus_->peer_uuid() << ": Peer starting"; + + VLOG(2) << "RaftConfig before starting: " << consensus_->CommittedConfig().DebugString(); + + RETURN_NOT_OK(consensus_->Start(bootstrap_info)); + { + boost::lock_guard lock(lock_); + CHECK_EQ(state_, BOOTSTRAPPING); + state_ = RUNNING; + } + + // Because we changed the tablet state, we need to re-report the tablet to the master. + mark_dirty_clbk_.Run("Started TabletPeer"); + + return Status::OK(); +} + +const consensus::RaftConfigPB TabletPeer::RaftConfig() const { + CHECK(consensus_) << "consensus is null"; + return consensus_->CommittedConfig(); +} + +void TabletPeer::Shutdown() { + + LOG(INFO) << "Initiating TabletPeer shutdown for tablet: " << tablet_id_; + + { + unique_lock lock(&lock_); + if (state_ == QUIESCING || state_ == SHUTDOWN) { + lock.unlock(); + WaitUntilShutdown(); + return; + } + state_ = QUIESCING; + } + + lock_guard l(&state_change_lock_); + // Even though Tablet::Shutdown() also unregisters its ops, we have to do it here + // to ensure that any currently running operation finishes before we proceed with + // the rest of the shutdown sequence. In particular, a maintenance operation could + // indirectly end up calling into the log, which we are about to shut down. + if (tablet_) tablet_->UnregisterMaintenanceOps(); + UnregisterMaintenanceOps(); + + if (consensus_) consensus_->Shutdown(); + + // TODO: KUDU-183: Keep track of the pending tasks and send an "abort" message. 
+ LOG_SLOW_EXECUTION(WARNING, 1000, + Substitute("TabletPeer: tablet $0: Waiting for Transactions to complete", tablet_id())) { + txn_tracker_.WaitForAllToFinish(); + } + + if (prepare_pool_) { + prepare_pool_->Shutdown(); + } + + if (log_) { + WARN_NOT_OK(log_->Close(), "Error closing the Log."); + } + + if (VLOG_IS_ON(1)) { + VLOG(1) << "TabletPeer: tablet " << tablet_id() << " shut down!"; + } + + if (tablet_) { + tablet_->Shutdown(); + } + + // Only mark the peer as SHUTDOWN when all other components have shut down. + { + boost::lock_guard lock(lock_); + // Release mem tracker resources. + consensus_.reset(); + tablet_.reset(); + state_ = SHUTDOWN; + } +} + +void TabletPeer::WaitUntilShutdown() { + while (true) { + { + boost::lock_guard lock(lock_); + if (state_ == SHUTDOWN) { + return; + } + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } +} + +Status TabletPeer::CheckRunning() const { + { + boost::lock_guard lock(lock_); + if (state_ != RUNNING) { + return Status::IllegalState(Substitute("The tablet is not in a running state: $0", + TabletStatePB_Name(state_))); + } + } + return Status::OK(); +} + +Status TabletPeer::WaitUntilConsensusRunning(const MonoDelta& timeout) { + MonoTime start(MonoTime::Now(MonoTime::FINE)); + + int backoff_exp = 0; + const int kMaxBackoffExp = 8; + while (true) { + bool has_consensus = false; + TabletStatePB cached_state; + { + boost::lock_guard lock(lock_); + cached_state = state_; + if (consensus_) { + has_consensus = true; // consensus_ is a set-once object. + } + } + if (cached_state == QUIESCING || cached_state == SHUTDOWN) { + return Status::IllegalState( + Substitute("The tablet is already shutting down or shutdown. 
State: $0", + TabletStatePB_Name(cached_state))); + } + if (cached_state == RUNNING && has_consensus && consensus_->IsRunning()) { + break; + } + MonoTime now(MonoTime::Now(MonoTime::FINE)); + MonoDelta elapsed(now.GetDeltaSince(start)); + if (elapsed.MoreThan(timeout)) { + return Status::TimedOut(Substitute("Consensus is not running after waiting for $0. State; $1", + elapsed.ToString(), TabletStatePB_Name(cached_state))); + } + SleepFor(MonoDelta::FromMilliseconds(1 << backoff_exp)); + backoff_exp = std::min(backoff_exp + 1, kMaxBackoffExp); + } + return Status::OK(); +} + +Status TabletPeer::SubmitWrite(WriteTransactionState *state) { + RETURN_NOT_OK(CheckRunning()); + + gscoped_ptr transaction(new WriteTransaction(state, consensus::LEADER)); + scoped_refptr driver; + RETURN_NOT_OK(NewLeaderTransactionDriver(transaction.PassAs(), &driver)); + return driver->ExecuteAsync(); +} + +Status TabletPeer::SubmitAlterSchema(gscoped_ptr state) { + RETURN_NOT_OK(CheckRunning()); + + gscoped_ptr transaction( + new AlterSchemaTransaction(state.release(), consensus::LEADER)); + scoped_refptr driver; + RETURN_NOT_OK(NewLeaderTransactionDriver(transaction.PassAs(), &driver)); + return driver->ExecuteAsync(); +} + +void TabletPeer::GetTabletStatusPB(TabletStatusPB* status_pb_out) const { + boost::lock_guard lock(lock_); + DCHECK(status_pb_out != nullptr); + DCHECK(status_listener_.get() != nullptr); + status_pb_out->set_tablet_id(status_listener_->tablet_id()); + status_pb_out->set_table_name(status_listener_->table_name()); + status_pb_out->set_last_status(status_listener_->last_status()); + status_listener_->partition().ToPB(status_pb_out->mutable_partition()); + status_pb_out->set_state(state_); + status_pb_out->set_tablet_data_state(meta_->tablet_data_state()); + if (tablet_) { + status_pb_out->set_estimated_on_disk_size(tablet_->EstimateOnDiskSize()); + } +} + +Status TabletPeer::RunLogGC() { + if (!CheckRunning().ok()) { + return Status::OK(); + } + int64_t min_log_index; 
+ int32_t num_gced; + GetEarliestNeededLogIndex(&min_log_index); + Status s = log_->GC(min_log_index, &num_gced); + if (!s.ok()) { + s = s.CloneAndPrepend("Unexpected error while running Log GC from TabletPeer"); + LOG(ERROR) << s.ToString(); + } + return Status::OK(); +} + +string TabletPeer::HumanReadableState() const { + boost::lock_guard lock(lock_); + TabletDataState data_state = meta_->tablet_data_state(); + // If failed, any number of things could have gone wrong. + if (state_ == FAILED) { + return Substitute("$0 ($1): $2", TabletStatePB_Name(state_), + TabletDataState_Name(data_state), + error_.ToString()); + // If it's remotely bootstrapping, or tombstoned, that is the important thing + // to show. + } else if (data_state != TABLET_DATA_READY) { + return TabletDataState_Name(data_state); + } + // Otherwise, the tablet's data is in a "normal" state, so we just display + // the runtime state (BOOTSTRAPPING, RUNNING, etc). + return TabletStatePB_Name(state_); +} + +void TabletPeer::GetInFlightTransactions(Transaction::TraceType trace_type, + vector* out) const { + vector > pending_transactions; + txn_tracker_.GetPendingTransactions(&pending_transactions); + for (const scoped_refptr& driver : pending_transactions) { + if (driver->state() != nullptr) { + consensus::TransactionStatusPB status_pb; + status_pb.mutable_op_id()->CopyFrom(driver->GetOpId()); + switch (driver->tx_type()) { + case Transaction::WRITE_TXN: + status_pb.set_tx_type(consensus::WRITE_OP); + break; + case Transaction::ALTER_SCHEMA_TXN: + status_pb.set_tx_type(consensus::ALTER_SCHEMA_OP); + break; + } + status_pb.set_description(driver->ToString()); + int64_t running_for_micros = + MonoTime::Now(MonoTime::FINE).GetDeltaSince(driver->start_time()).ToMicroseconds(); + status_pb.set_running_for_micros(running_for_micros); + if (trace_type == Transaction::TRACE_TXNS) { + status_pb.set_trace_buffer(driver->trace()->DumpToString(true)); + } + out->push_back(status_pb); + } + } +} + +void 
TabletPeer::GetEarliestNeededLogIndex(int64_t* min_index) const { + // First, we anchor on the last OpId in the Log to establish a lower bound + // and avoid racing with the other checks. This limits the Log GC candidate + // segments before we check the anchors. + { + OpId last_log_op; + log_->GetLatestEntryOpId(&last_log_op); + *min_index = last_log_op.index(); + } + + // If we never have written to the log, no need to proceed. + if (*min_index == 0) return; + + // Next, we interrogate the anchor registry. + // Returns OK if minimum known, NotFound if no anchors are registered. + { + int64_t min_anchor_index; + Status s = log_anchor_registry_->GetEarliestRegisteredLogIndex(&min_anchor_index); + if (PREDICT_FALSE(!s.ok())) { + DCHECK(s.IsNotFound()) << "Unexpected error calling LogAnchorRegistry: " << s.ToString(); + } else { + *min_index = std::min(*min_index, min_anchor_index); + } + } + + // Next, interrogate the TransactionTracker. + vector > pending_transactions; + txn_tracker_.GetPendingTransactions(&pending_transactions); + for (const scoped_refptr& driver : pending_transactions) { + OpId tx_op_id = driver->GetOpId(); + // A transaction which doesn't have an opid hasn't been submitted for replication yet and + // thus has no need to anchor the log. 
+ if (tx_op_id.IsInitialized()) { + *min_index = std::min(*min_index, tx_op_id.index()); + } + } +} + +Status TabletPeer::GetMaxIndexesToSegmentSizeMap(MaxIdxToSegmentSizeMap* idx_size_map) const { + RETURN_NOT_OK(CheckRunning()); + int64_t min_op_idx; + GetEarliestNeededLogIndex(&min_op_idx); + log_->GetMaxIndexesToSegmentSizeMap(min_op_idx, idx_size_map); + return Status::OK(); +} + +Status TabletPeer::GetGCableDataSize(int64_t* retention_size) const { + RETURN_NOT_OK(CheckRunning()); + int64_t min_op_idx; + GetEarliestNeededLogIndex(&min_op_idx); + log_->GetGCableDataSize(min_op_idx, retention_size); + return Status::OK(); +} + +Status TabletPeer::StartReplicaTransaction(const scoped_refptr& round) { + { + boost::lock_guard lock(lock_); + if (state_ != RUNNING && state_ != BOOTSTRAPPING) { + return Status::IllegalState(TabletStatePB_Name(state_)); + } + } + + consensus::ReplicateMsg* replicate_msg = round->replicate_msg(); + DCHECK(replicate_msg->has_timestamp()); + gscoped_ptr transaction; + switch (replicate_msg->op_type()) { + case WRITE_OP: + { + DCHECK(replicate_msg->has_write_request()) << "WRITE_OP replica" + " transaction must receive a WriteRequestPB"; + transaction.reset(new WriteTransaction( + new WriteTransactionState(this, &replicate_msg->write_request()), + consensus::REPLICA)); + break; + } + case ALTER_SCHEMA_OP: + { + DCHECK(replicate_msg->has_alter_schema_request()) << "ALTER_SCHEMA_OP replica" + " transaction must receive an AlterSchemaRequestPB"; + transaction.reset( + new AlterSchemaTransaction( + new AlterSchemaTransactionState(this, &replicate_msg->alter_schema_request(), + nullptr), + consensus::REPLICA)); + break; + } + default: + LOG(FATAL) << "Unsupported Operation Type"; + } + + // TODO(todd) Look at wiring the stuff below on the driver + TransactionState* state = transaction->state(); + state->set_consensus_round(round); + Timestamp ts(replicate_msg->timestamp()); + state->set_timestamp(ts); + clock_->Update(ts); + + scoped_refptr 
driver; + RETURN_NOT_OK(NewReplicaTransactionDriver(transaction.Pass(), &driver)); + + // Unretained is required to avoid a refcount cycle. + state->consensus_round()->SetConsensusReplicatedCallback( + Bind(&TransactionDriver::ReplicationFinished, Unretained(driver.get()))); + + RETURN_NOT_OK(driver->ExecuteAsync()); + return Status::OK(); +} + +Status TabletPeer::NewLeaderTransactionDriver(gscoped_ptr transaction, + scoped_refptr* driver) { + scoped_refptr tx_driver = new TransactionDriver( + &txn_tracker_, + consensus_.get(), + log_.get(), + prepare_pool_.get(), + apply_pool_, + &txn_order_verifier_); + RETURN_NOT_OK(tx_driver->Init(transaction.Pass(), consensus::LEADER)); + driver->swap(tx_driver); + + return Status::OK(); +} + +Status TabletPeer::NewReplicaTransactionDriver(gscoped_ptr transaction, + scoped_refptr* driver) { + scoped_refptr tx_driver = new TransactionDriver( + &txn_tracker_, + consensus_.get(), + log_.get(), + prepare_pool_.get(), + apply_pool_, + &txn_order_verifier_); + RETURN_NOT_OK(tx_driver->Init(transaction.Pass(), consensus::REPLICA)); + driver->swap(tx_driver); + + return Status::OK(); +} + +void TabletPeer::RegisterMaintenanceOps(MaintenanceManager* maint_mgr) { + // Taking state_change_lock_ ensures that we don't shut down concurrently with + // this last start-up task. 
+ lock_guard l(&state_change_lock_); + + if (state() != RUNNING) { + LOG(WARNING) << "Not registering maintenance operations for " << tablet_ + << ": tablet not in RUNNING state"; + return; + } + + DCHECK(maintenance_ops_.empty()); + + gscoped_ptr mrs_flush_op(new FlushMRSOp(this)); + maint_mgr->RegisterOp(mrs_flush_op.get()); + maintenance_ops_.push_back(mrs_flush_op.release()); + + gscoped_ptr dms_flush_op(new FlushDeltaMemStoresOp(this)); + maint_mgr->RegisterOp(dms_flush_op.get()); + maintenance_ops_.push_back(dms_flush_op.release()); + + gscoped_ptr log_gc(new LogGCOp(this)); + maint_mgr->RegisterOp(log_gc.get()); + maintenance_ops_.push_back(log_gc.release()); + + tablet_->RegisterMaintenanceOps(maint_mgr); +} + +void TabletPeer::UnregisterMaintenanceOps() { + DCHECK(state_change_lock_.is_locked()); + for (MaintenanceOp* op : maintenance_ops_) { + op->Unregister(); + } + STLDeleteElements(&maintenance_ops_); +} + +Status FlushInflightsToLogCallback::WaitForInflightsAndFlushLog() { + // This callback is triggered prior to any TabletMetadata flush. + // The guarantee that we are trying to enforce is this: + // + // If an operation has been flushed to stable storage (eg a DRS or DeltaFile) + // then its COMMIT message must be present in the log. + // + // The purpose for this is so that, during bootstrap, we can accurately identify + // whether each operation has been flushed. If we don't see a COMMIT message for + // an operation, then we assume it was not completely applied and needs to be + // re-applied. Thus, if we had something on disk but with no COMMIT message, + // we'd attempt to double-apply the write, resulting in an error (eg trying to + // delete an already-deleted row). + // + // So, to enforce this property, we do two steps: + // + // 1) Wait for any operations which are already mid-Apply() to Commit() in MVCC. 
+ // + // Because the operations always enqueue their COMMIT message to the log + // before calling Commit(), this ensures that any in-flight operations have + // their commit messages "en route". + // + // NOTE: we only wait for those operations that have started their Apply() phase. + // Any operations which haven't yet started applying haven't made any changes + // to in-memory state: thus, they obviously couldn't have made any changes to + // on-disk storage either (data can only get to the disk by going through an in-memory + // store). Only those that have started Apply() could have potentially written some + // data which is now on disk. + // + // Perhaps more importantly, if we waited on operations that hadn't started their + // Apply() phase, we might be waiting forever -- for example, if a follower has been + // partitioned from its leader, it may have operations sitting around in flight + // for quite a long time before eventually aborting or committing. This would + // end up blocking all flushes if we waited on it. + // + // 2) Flush the log + // + // This ensures that the above-mentioned commit messages are not just enqueued + // to the log, but also on disk. 
+ VLOG(1) << "T " << tablet_->metadata()->tablet_id() + << ": Waiting for in-flight transactions to commit."; + LOG_SLOW_EXECUTION(WARNING, 200, "Committing in-flights took a long time.") { + tablet_->mvcc_manager()->WaitForApplyingTransactionsToCommit(); + } + VLOG(1) << "T " << tablet_->metadata()->tablet_id() + << ": Waiting for the log queue to be flushed."; + LOG_SLOW_EXECUTION(WARNING, 200, "Flushing the Log queue took a long time.") { + RETURN_NOT_OK(log_->WaitUntilAllFlushed()); + } + return Status::OK(); +} + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_peer.h b/src/kudu/tablet/tablet_peer.h new file mode 100644 index 000000000000..56a92816d9bc --- /dev/null +++ b/src/kudu/tablet/tablet_peer.h @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_TABLET_TABLET_PEER_H_ +#define KUDU_TABLET_TABLET_PEER_H_ + +#include +#include +#include +#include + +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/log.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/transaction_order_verifier.h" +#include "kudu/tablet/transactions/transaction_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/semaphore.h" + +namespace kudu { + +namespace log { +class LogAnchorRegistry; +} + +namespace rpc { +class Messenger; +} + +namespace tserver { +class CatchUpServiceTest; +} + +class MaintenanceManager; +class MaintenanceOp; + +namespace tablet { +class LeaderTransactionDriver; +class ReplicaTransactionDriver; +class TabletPeer; +class TabletStatusPB; +class TabletStatusListener; +class TransactionDriver; + +// A peer in a tablet consensus configuration, which coordinates writes to tablets. +// Each time Write() is called this class appends a new entry to a replicated +// state machine through a consensus algorithm, which makes sure that other +// peers see the same updates in the same order. In addition to this, this +// class also splits the work and coordinates multi-threaded execution. +class TabletPeer : public RefCountedThreadSafe, + public consensus::ReplicaTransactionFactory { + public: + typedef std::map MaxIdxToSegmentSizeMap; + + TabletPeer(const scoped_refptr& meta, + const consensus::RaftPeerPB& local_peer_pb, ThreadPool* apply_pool, + Callback mark_dirty_clbk); + + // Initializes the TabletPeer, namely creating the Log and initializing + // Consensus. + Status Init(const std::shared_ptr& tablet, + const scoped_refptr& clock, + const std::shared_ptr& messenger, + const scoped_refptr& log, + const scoped_refptr& metric_entity); + + // Starts the TabletPeer, making it available for Write()s. 
If this + // TabletPeer is part of a consensus configuration this will connect it to other peers + // in the consensus configuration. + Status Start(const consensus::ConsensusBootstrapInfo& info); + + // Shutdown this tablet peer. + // If a shutdown is already in progress, blocks until that shutdown is complete. + void Shutdown(); + + // Check that the tablet is in a RUNNING state. + Status CheckRunning() const; + + // Wait until the tablet is in a RUNNING state or if there's a timeout. + // TODO have a way to wait for any state? + Status WaitUntilConsensusRunning(const MonoDelta& timeout); + + // Submits a write to a tablet and executes it asynchronously. + // The caller is expected to build and pass a TrasactionContext that points + // to the RPC WriteRequest, WriteResponse, RpcContext and to the tablet's + // MvccManager. + Status SubmitWrite(WriteTransactionState *tx_state); + + // Called by the tablet service to start an alter schema transaction. + // + // The transaction contains all the information required to execute the + // AlterSchema operation and send the response back. + // + // If the returned Status is OK, the response to the client will be sent + // asynchronously. Otherwise the tablet service will have to send the response directly. + // + // The AlterSchema operation is taking the tablet component lock in exclusive mode + // meaning that no other operation on the tablet can be executed while the + // AlterSchema is in progress. + Status SubmitAlterSchema(gscoped_ptr tx_state); + + void GetTabletStatusPB(TabletStatusPB* status_pb_out) const; + + // Used by consensus to create and start a new ReplicaTransaction. 
+ virtual Status StartReplicaTransaction( + const scoped_refptr& round) OVERRIDE; + + consensus::Consensus* consensus() { + boost::lock_guard lock(lock_); + return consensus_.get(); + } + + scoped_refptr shared_consensus() const { + boost::lock_guard lock(lock_); + return consensus_; + } + + Tablet* tablet() const { + boost::lock_guard lock(lock_); + return tablet_.get(); + } + + std::shared_ptr shared_tablet() const { + boost::lock_guard lock(lock_); + return tablet_; + } + + const TabletStatePB state() const { + boost::lock_guard lock(lock_); + return state_; + } + + // Returns the current Raft configuration. + const consensus::RaftConfigPB RaftConfig() const; + + // If any peers in the consensus configuration lack permanent uuids, get them via an + // RPC call and update. + // TODO: move this to raft_consensus.h. + Status UpdatePermanentUuids(); + + TabletStatusListener* status_listener() const { + return status_listener_.get(); + } + + // Sets the tablet to a BOOTSTRAPPING state, indicating it is starting up. + void SetBootstrapping() { + boost::lock_guard lock(lock_); + CHECK_EQ(NOT_STARTED, state_); + state_ = BOOTSTRAPPING; + } + + // sets the tablet state to FAILED additionally setting the error to the provided + // one. + void SetFailed(const Status& error) { + boost::lock_guard lock(lock_); + state_ = FAILED; + error_ = error; + } + + // Returns the error that occurred, when state is FAILED. + Status error() const { + boost::lock_guard lock(lock_); + return error_; + } + + // Returns a human-readable string indicating the state of the tablet. + // Typically this looks like "NOT_STARTED", "TABLET_DATA_COPYING", + // etc. For use in places like the Web UI. + std::string HumanReadableState() const; + + // Adds list of transactions in-flight at the time of the call to + // 'out'. TransactionStatusPB objects are used to allow this method + // to be used by both the web-UI and ts-cli. 
+ void GetInFlightTransactions(Transaction::TraceType trace_type, + std::vector* out) const; + + // Returns the minimum known log index that is in-memory or in-flight. + // Used for selection of log segments to delete during Log GC. + void GetEarliestNeededLogIndex(int64_t* log_index) const; + + // Returns a map of log index -> segment size, of all the segments that currently cannot be GCed + // because in-memory structures have anchors in them. + // + // Returns a non-ok status if the tablet isn't running. + Status GetMaxIndexesToSegmentSizeMap(MaxIdxToSegmentSizeMap* idx_size_map) const; + + // Returns the amount of bytes that would be GC'd if RunLogGC() was called. + // + // Returns a non-ok status if the tablet isn't running. + Status GetGCableDataSize(int64_t* retention_size) const; + + // Return a pointer to the Log. + // TabletPeer keeps a reference to Log after Init(). + log::Log* log() const { + return log_.get(); + } + + server::Clock* clock() { + return clock_.get(); + } + + const scoped_refptr& log_anchor_registry() const { + return log_anchor_registry_; + } + + // Returns the tablet_id of the tablet managed by this TabletPeer. + // Returns the correct tablet_id even if the underlying tablet is not available + // yet. + const std::string& tablet_id() const { return tablet_id_; } + + // Convenience method to return the permanent_uuid of this peer. + std::string permanent_uuid() const { return tablet_->metadata()->fs_manager()->uuid(); } + + Status NewLeaderTransactionDriver(gscoped_ptr transaction, + scoped_refptr* driver); + + Status NewReplicaTransactionDriver(gscoped_ptr transaction, + scoped_refptr* driver); + + // Tells the tablet's log to garbage collect. + Status RunLogGC(); + + // Register the maintenance ops associated with this peer's tablet, also invokes + // Tablet::RegisterMaintenanceOps(). + void RegisterMaintenanceOps(MaintenanceManager* maintenance_manager); + + // Unregister the maintenance ops associated with this peer's tablet. 
+ // This method is not thread safe. + void UnregisterMaintenanceOps(); + + // Return pointer to the transaction tracker for this peer. + const TransactionTracker* transaction_tracker() const { return &txn_tracker_; } + + const scoped_refptr& tablet_metadata() const { + return meta_; + } + + private: + friend class RefCountedThreadSafe; + friend class TabletPeerTest; + FRIEND_TEST(TabletPeerTest, TestMRSAnchorPreventsLogGC); + FRIEND_TEST(TabletPeerTest, TestDMSAnchorPreventsLogGC); + FRIEND_TEST(TabletPeerTest, TestActiveTransactionPreventsLogGC); + + ~TabletPeer(); + + // Wait until the TabletPeer is fully in SHUTDOWN state. + void WaitUntilShutdown(); + + // After bootstrap is complete and consensus is setup this initiates the transactions + // that were not complete on bootstrap. + // Not implemented yet. See .cc file. + Status StartPendingTransactions(consensus::RaftPeerPB::Role my_role, + const consensus::ConsensusBootstrapInfo& bootstrap_info); + + const scoped_refptr meta_; + + const std::string tablet_id_; + + const consensus::RaftPeerPB local_peer_pb_; + + TabletStatePB state_; + Status error_; + TransactionTracker txn_tracker_; + TransactionOrderVerifier txn_order_verifier_; + scoped_refptr log_; + std::shared_ptr tablet_; + std::shared_ptr messenger_; + scoped_refptr consensus_; + gscoped_ptr status_listener_; + simple_spinlock prepare_replicate_lock_; + + // Lock protecting state_ as well as smart pointers to collaborating + // classes such as tablet_ and consensus_. + mutable simple_spinlock lock_; + + // Lock taken during Init/Shutdown which ensures that only a single thread + // attempts to perform major lifecycle operations (Init/Shutdown) at once. + // This must be acquired before acquiring lock_ if they are acquired together. + // We don't just use lock_ since the lifecycle operations may take a while + // and we'd like other threads to be able to quickly poll the state_ variable + // during them in order to reject RPCs, etc. 
+ mutable simple_spinlock state_change_lock_; + + // IMPORTANT: correct execution of PrepareTask assumes that 'prepare_pool_' + // is single-threaded, moving to a multi-tablet setup where multiple TabletPeers + // use the same 'prepare_pool_' needs to enforce that, for a single + // TabletPeer, PrepareTasks are executed *serially*. + // TODO move the prepare pool to TabletServer. + gscoped_ptr prepare_pool_; + + // Pool that executes apply tasks for transactions. This is a multi-threaded + // pool, constructor-injected by either the Master (for system tables) or + // the Tablet server. + ThreadPool* apply_pool_; + + scoped_refptr clock_; + + scoped_refptr log_anchor_registry_; + + // Function to mark this TabletPeer's tablet as dirty in the TSTabletManager. + // This function must be called any time the cluster membership or cluster + // leadership changes. + Callback mark_dirty_clbk_; + + // List of maintenance operations for the tablet that need information that only the peer + // can provide. + std::vector maintenance_ops_; + + DISALLOW_COPY_AND_ASSIGN(TabletPeer); +}; + +// A callback to wait for the in-flight transactions to complete and to flush +// the Log when they do. +// Tablet is passed as a raw pointer as this callback is set in TabletMetadata and +// were we to keep the tablet as a shared_ptr a circular dependency would occur: +// callback->tablet->metadata->callback. Since the tablet indirectly owns this +// callback we know that is must still be alive when it fires. 
+class FlushInflightsToLogCallback : public RefCountedThreadSafe { + public: + FlushInflightsToLogCallback(Tablet* tablet, + const scoped_refptr& log) + : tablet_(tablet), + log_(log) {} + + Status WaitForInflightsAndFlushLog(); + + private: + Tablet* tablet_; + scoped_refptr log_; +}; + + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_TABLET_PEER_H_ */ diff --git a/src/kudu/tablet/tablet_peer_mm_ops.cc b/src/kudu/tablet/tablet_peer_mm_ops.cc new file mode 100644 index 000000000000..9c25d620e76d --- /dev/null +++ b/src/kudu/tablet/tablet_peer_mm_ops.cc @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/tablet_peer_mm_ops.h" + +#include +#include +#include + +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" + +DEFINE_int32(flush_threshold_mb, 64, + "Size at which MemRowSet flushes are triggered. 
" + "A MRS can still flush below this threshold if it if hasn't flushed in a while"); +TAG_FLAG(flush_threshold_mb, experimental); + +METRIC_DEFINE_gauge_uint32(tablet, log_gc_running, + "Log GCs Running", + kudu::MetricUnit::kOperations, + "Number of log GC operations currently running."); +METRIC_DEFINE_histogram(tablet, log_gc_duration, + "Log GC Duration", + kudu::MetricUnit::kMilliseconds, + "Time spent garbage collecting the logs.", 60000LU, 1); + +namespace kudu { +namespace tablet { + +using std::map; +using strings::Substitute; + +// How long we wait before considering a time-based flush. +const double kFlushDueToTimeMs = 2 * 60 * 1000; +// Upper bound for how long it takes to reach "full perf improvement" in time-based flushing. +const double kFlushUpperBoundMs = 60 * 60 * 1000; + +// +// FlushOpPerfImprovementPolicy. +// + +void FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush(MaintenanceOpStats* stats, + double elapsed_ms) { + if (stats->ram_anchored() > FLAGS_flush_threshold_mb * 1024 * 1024) { + // If we're over the user-specified flush threshold, then consider the perf + // improvement to be 1 for every extra MB. This produces perf_improvement results + // which are much higher than any compaction would produce, and means that, when + // there is an MRS over threshold, a flush will almost always be selected instead of + // a compaction. That's not necessarily a good thing, but in the absence of better + // heuristics, it will do for now. + double extra_mb = + static_cast(FLAGS_flush_threshold_mb - (stats->ram_anchored()) / (1024 * 1024)); + stats->set_perf_improvement(extra_mb); + } else if (elapsed_ms > kFlushDueToTimeMs) { + // Even if we aren't over the threshold, consider flushing if we haven't flushed + // in a long time. But, don't give it a large perf_improvement score. We should + // only do this if we really don't have much else to do, and if we've already waited a bit. 
+ // The following will give an improvement that's between 0.0 and 1.0, gradually growing + // as 'elapsed_ms' approaches 'kFlushUpperBoundMs'. + double perf = elapsed_ms / kFlushUpperBoundMs; + if (perf > 1.0) { + perf = 1.0; + } + stats->set_perf_improvement(perf); + } +} + +// +// FlushMRSOp. +// + +void FlushMRSOp::UpdateStats(MaintenanceOpStats* stats) { + boost::lock_guard l(lock_); + + map max_idx_to_segment_size; + if (tablet_peer_->tablet()->MemRowSetEmpty() || + !tablet_peer_->GetMaxIndexesToSegmentSizeMap(&max_idx_to_segment_size).ok()) { + return; + } + + { + boost::unique_lock lock(tablet_peer_->tablet()->rowsets_flush_sem_, + boost::defer_lock); + stats->set_runnable(lock.try_lock()); + } + + stats->set_ram_anchored(tablet_peer_->tablet()->MemRowSetSize()); + stats->set_logs_retained_bytes( + tablet_peer_->tablet()->MemRowSetLogRetentionSize(max_idx_to_segment_size)); + + // TODO: use workload statistics here to find out how "hot" the tablet has + // been in the last 5 minutes. + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush( + stats, + time_since_flush_.elapsed().wall_millis()); +} + +bool FlushMRSOp::Prepare() { + // Try to acquire the rowsets_flush_sem_. If we can't, the Prepare step + // fails. This also implies that only one instance of FlushMRSOp can be + // running at once. + return tablet_peer_->tablet()->rowsets_flush_sem_.try_lock(); +} + +void FlushMRSOp::Perform() { + CHECK(!tablet_peer_->tablet()->rowsets_flush_sem_.try_lock()); + + tablet_peer_->tablet()->FlushUnlocked(); + + { + boost::lock_guard l(lock_); + time_since_flush_.start(); + } + tablet_peer_->tablet()->rowsets_flush_sem_.unlock(); +} + +scoped_refptr FlushMRSOp::DurationHistogram() const { + return tablet_peer_->tablet()->metrics()->flush_mrs_duration; +} + +scoped_refptr > FlushMRSOp::RunningGauge() const { + return tablet_peer_->tablet()->metrics()->flush_mrs_running; +} + +// +// FlushDeltaMemStoresOp. 
+// + +void FlushDeltaMemStoresOp::UpdateStats(MaintenanceOpStats* stats) { + boost::lock_guard l(lock_); + int64_t dms_size; + int64_t retention_size; + map max_idx_to_segment_size; + if (tablet_peer_->tablet()->DeltaMemRowSetEmpty() || + !tablet_peer_->GetMaxIndexesToSegmentSizeMap(&max_idx_to_segment_size).ok()) { + return; + } + tablet_peer_->tablet()->GetInfoForBestDMSToFlush(max_idx_to_segment_size, + &dms_size, &retention_size); + + stats->set_ram_anchored(dms_size); + stats->set_runnable(true); + stats->set_logs_retained_bytes(retention_size); + + FlushOpPerfImprovementPolicy::SetPerfImprovementForFlush( + stats, + time_since_flush_.elapsed().wall_millis()); +} + +void FlushDeltaMemStoresOp::Perform() { + map max_idx_to_segment_size; + if (!tablet_peer_->GetMaxIndexesToSegmentSizeMap(&max_idx_to_segment_size).ok()) { + LOG(WARNING) << "Won't flush deltas since tablet shutting down: " << tablet_peer_->tablet_id(); + return; + } + WARN_NOT_OK(tablet_peer_->tablet()->FlushDMSWithHighestRetention(max_idx_to_segment_size), + Substitute("Failed to flush DMS on $0", + tablet_peer_->tablet()->tablet_id())); + { + boost::lock_guard l(lock_); + time_since_flush_.start(); + } +} + +scoped_refptr FlushDeltaMemStoresOp::DurationHistogram() const { + return tablet_peer_->tablet()->metrics()->flush_dms_duration; +} + +scoped_refptr > FlushDeltaMemStoresOp::RunningGauge() const { + return tablet_peer_->tablet()->metrics()->flush_dms_running; +} + +// +// LogGCOp. 
+// + +LogGCOp::LogGCOp(TabletPeer* tablet_peer) + : MaintenanceOp(StringPrintf("LogGCOp(%s)", tablet_peer->tablet()->tablet_id().c_str()), + MaintenanceOp::LOW_IO_USAGE), + tablet_peer_(tablet_peer), + log_gc_duration_(METRIC_log_gc_duration.Instantiate( + tablet_peer->tablet()->GetMetricEntity())), + log_gc_running_(METRIC_log_gc_running.Instantiate( + tablet_peer->tablet()->GetMetricEntity(), 0)), + sem_(1) {} + +void LogGCOp::UpdateStats(MaintenanceOpStats* stats) { + int64_t retention_size; + + if (!tablet_peer_->GetGCableDataSize(&retention_size).ok()) { + return; + } + + stats->set_logs_retained_bytes(retention_size); + stats->set_runnable(sem_.GetValue() == 1); +} + +bool LogGCOp::Prepare() { + return sem_.try_lock(); +} + +void LogGCOp::Perform() { + CHECK(!sem_.try_lock()); + + tablet_peer_->RunLogGC(); + + sem_.unlock(); +} + +scoped_refptr LogGCOp::DurationHistogram() const { + return log_gc_duration_; +} + +scoped_refptr > LogGCOp::RunningGauge() const { + return log_gc_running_; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/tablet_peer_mm_ops.h b/src/kudu/tablet/tablet_peer_mm_ops.h new file mode 100644 index 000000000000..a4b475d9c4c6 --- /dev/null +++ b/src/kudu/tablet/tablet_peer_mm_ops.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_TABLET_PEER_MM_OPS_H_ +#define KUDU_TABLET_TABLET_PEER_MM_OPS_H_ + +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/util/stopwatch.h" + +namespace kudu { + +class Histogram; +template +class AtomicGauge; + +namespace tablet { + +class FlushOpPerfImprovementPolicy { + public: + ~FlushOpPerfImprovementPolicy() {} + + // Sets the performance improvement based on the anchored ram if it's over the threshold, + // else it will set it based on how long it has been since the last flush. + static void SetPerfImprovementForFlush(MaintenanceOpStats* stats, double elapsed_ms); + + private: + FlushOpPerfImprovementPolicy() {} +}; + +// Maintenance op for MRS flush. Only one can happen at a time. +class FlushMRSOp : public MaintenanceOp { + public: + explicit FlushMRSOp(TabletPeer* tablet_peer) + : MaintenanceOp(StringPrintf("FlushMRSOp(%s)", tablet_peer->tablet()->tablet_id().c_str()), + MaintenanceOp::HIGH_IO_USAGE), + tablet_peer_(tablet_peer) { + time_since_flush_.start(); + } + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE; + + virtual bool Prepare() OVERRIDE; + + virtual void Perform() OVERRIDE; + + virtual scoped_refptr DurationHistogram() const OVERRIDE; + + virtual scoped_refptr > RunningGauge() const OVERRIDE; + + private: + // Lock protecting time_since_flush_. + mutable simple_spinlock lock_; + Stopwatch time_since_flush_; + + TabletPeer *const tablet_peer_; +}; + +// Maintenance op for DMS flush. +// Reports stats for all the DMS this tablet contains but only flushes one in Perform(). 
+class FlushDeltaMemStoresOp : public MaintenanceOp { + public: + explicit FlushDeltaMemStoresOp(TabletPeer* tablet_peer) + : MaintenanceOp(StringPrintf("FlushDeltaMemStoresOp(%s)", + tablet_peer->tablet()->tablet_id().c_str()), + MaintenanceOp::HIGH_IO_USAGE), + tablet_peer_(tablet_peer) { + time_since_flush_.start(); + } + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE; + + virtual bool Prepare() OVERRIDE { + return true; + } + + virtual void Perform() OVERRIDE; + + virtual scoped_refptr DurationHistogram() const OVERRIDE; + + virtual scoped_refptr > RunningGauge() const OVERRIDE; + + private: + // Lock protecting time_since_flush_ + mutable simple_spinlock lock_; + Stopwatch time_since_flush_; + + TabletPeer *const tablet_peer_; +}; + +// Maintenance task that runs log GC. Reports log retention that represents the amount of data +// that can be GC'd. +// +// Only one LogGC op can run at a time. +class LogGCOp : public MaintenanceOp { + public: + explicit LogGCOp(TabletPeer* tablet_peer); + + virtual void UpdateStats(MaintenanceOpStats* stats) OVERRIDE; + + virtual bool Prepare() OVERRIDE; + + virtual void Perform() OVERRIDE; + + virtual scoped_refptr DurationHistogram() const OVERRIDE; + + virtual scoped_refptr > RunningGauge() const OVERRIDE; + + private: + TabletPeer *const tablet_peer_; + scoped_refptr log_gc_duration_; + scoped_refptr > log_gc_running_; + mutable Semaphore sem_; +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_TABLET_PEER_MM_OPS_H_ */ diff --git a/src/kudu/tablet/tablet_random_access-test.cc b/src/kudu/tablet/tablet_random_access-test.cc new file mode 100644 index 000000000000..bb7b6cd39e0a --- /dev/null +++ b/src/kudu/tablet/tablet_random_access-test.cc @@ -0,0 +1,567 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/gutil/casts.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet-test-base.h" +#include "kudu/util/stopwatch.h" + +DEFINE_int32(keyspace_size, 3000, "number of unique row keys to insert/mutate"); +DEFINE_int32(runtime_seconds, 1, "number of seconds to run the test"); +DEFINE_int32(sleep_between_background_ops_ms, 100, + "number of milliseconds to sleep between flushing or compacting"); +DEFINE_int32(update_delete_ratio, 4, "ratio of update:delete when mutating existing rows"); + +DECLARE_int32(deltafile_default_block_size); + +using std::string; +using std::vector; + +enum TestOp { + TEST_INSERT, + TEST_UPDATE, + TEST_DELETE, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + TEST_FLUSH_DELTAS, + TEST_MINOR_COMPACT_DELTAS, + TEST_MAJOR_COMPACT_DELTAS, + TEST_COMPACT_TABLET, + TEST_NUM_OP_TYPES // max value for enum +}; +MAKE_ENUM_LIMITS(TestOp, TEST_INSERT, TEST_NUM_OP_TYPES); + +namespace kudu { +namespace tablet { + +const char* TestOp_names[] = { + "TEST_INSERT", + "TEST_UPDATE", + "TEST_DELETE", + "TEST_FLUSH_OPS", + "TEST_FLUSH_TABLET", + "TEST_FLUSH_DELTAS", + "TEST_MINOR_COMPACT_DELTAS", + "TEST_MAJOR_COMPACT_DELTAS", + "TEST_COMPACT_TABLET" +}; + +// Test which does only random operations against a tablet, including update and random +// get (ie scans with equal 
lower and upper bounds). +// +// The test maintains an in-memory copy of the expected state of the tablet, and uses only +// a single thread, so that it's easy to verify that the tablet always matches the expected +// state. +class TestRandomAccess : public KuduTabletTest { + public: + TestRandomAccess() + : KuduTabletTest(Schema({ ColumnSchema("key", INT32), + ColumnSchema("val", INT32, true) }, 1)), + done_(1) { + OverrideFlagForSlowTests("keyspace_size", "30000"); + OverrideFlagForSlowTests("runtime_seconds", "10"); + OverrideFlagForSlowTests("sleep_between_background_ops_ms", "1000"); + + // Set a small block size to increase chances that a single update will span + // multiple delta blocks. + FLAGS_deltafile_default_block_size = 1024; + expected_tablet_state_.resize(FLAGS_keyspace_size); + } + + virtual void SetUp() OVERRIDE { + KuduTabletTest::SetUp(); + writer_.reset(new LocalTabletWriter(tablet().get(), &client_schema_)); + } + + // Pick a random row of the table, verify its current state, and then + // modify it in some way. The modifications may include multiple mutations + // to the same row in a single batch (eg insert/update/delete). + // + // The mutations are always valid. For example: + // - inserting if it doesn't exist yet + // - perform an update or delete the row if it does exist. + // + // TODO: should add a version of this test which also tries invalid operations + // and validates the correct errors. + void DoRandomBatch() { + int key = rand() % expected_tablet_state_.size(); + string& cur_val = expected_tablet_state_[key]; + + // Check that a read yields what we expect. + string val_in_table = GetRow(key); + ASSERT_EQ("(" + cur_val + ")", val_in_table); + + vector pending; + for (int i = 0; i < 3; i++) { + int new_val = rand(); + if (cur_val.empty()) { + // If there is no row, then insert one. 
+ cur_val = InsertRow(key, new_val, &pending); + } else { + if (new_val % (FLAGS_update_delete_ratio + 1) == 0) { + cur_val = DeleteRow(key, &pending); + } else { + cur_val = MutateRow(key, new_val, &pending); + } + } + } + CHECK_OK(writer_->WriteBatch(pending)); + for (LocalTabletWriter::Op op : pending) { + delete op.row; + } + } + + void DoRandomBatches() { + int op_count = 0; + Stopwatch s; + s.start(); + while (s.elapsed().wall_seconds() < FLAGS_runtime_seconds) { + for (int i = 0; i < 100; i++) { + ASSERT_NO_FATAL_FAILURE(DoRandomBatch()); + op_count++; + } + } + LOG(INFO) << "Ran " << op_count << " ops " + << "(" << (op_count / s.elapsed().wall_seconds()) << " ops/sec)"; + } + + // Wakes up periodically to perform a flush or compaction. + void BackgroundOpThread() { + int n_flushes = 0; + while (!done_.WaitFor(MonoDelta::FromMilliseconds(FLAGS_sleep_between_background_ops_ms))) { + CHECK_OK(tablet()->Flush()); + ++n_flushes; + switch (n_flushes % 3) { + case 0: + CHECK_OK(tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + break; + case 1: + CHECK_OK(tablet()->CompactWorstDeltas(RowSet::MAJOR_DELTA_COMPACTION)); + break; + case 2: + CHECK_OK(tablet()->CompactWorstDeltas(RowSet::MINOR_DELTA_COMPACTION)); + break; + } + } + } + + // Adds an insert for the given key/value pair to 'ops', returning the new stringified + // value of the row. + string InsertRow(int key, int val, vector* ops) { + gscoped_ptr row(new KuduPartialRow(&client_schema_)); + CHECK_OK(row->SetInt32(0, key)); + if (val & 1) { + CHECK_OK(row->SetNull(1)); + } else { + CHECK_OK(row->SetInt32(1, val)); + } + string ret = row->ToString(); + ops->push_back(LocalTabletWriter::Op(RowOperationsPB::INSERT, row.release())); + return ret; + } + + // Adds an update of the given key/value pair to 'ops', returning the new stringified + // value of the row. 
+ string MutateRow(int key, uint32_t new_val, vector* ops) { + gscoped_ptr row(new KuduPartialRow(&client_schema_)); + CHECK_OK(row->SetInt32(0, key)); + if (new_val & 1) { + CHECK_OK(row->SetNull(1)); + } else { + CHECK_OK(row->SetInt32(1, new_val)); + } + string ret = row->ToString(); + ops->push_back(LocalTabletWriter::Op(RowOperationsPB::UPDATE, row.release())); + return ret; + } + + // Adds a delete of the given row to 'ops', returning an empty string (indicating that + // the row no longer exists). + string DeleteRow(int key, vector* ops) { + gscoped_ptr row(new KuduPartialRow(&client_schema_)); + CHECK_OK(row->SetInt32(0, key)); + ops->push_back(LocalTabletWriter::Op(RowOperationsPB::DELETE, row.release())); + return ""; + } + + // Random-read the given row, returning its current value. + // If the row doesn't exist, returns "()". + string GetRow(int key) { + ScanSpec spec; + const Schema& schema = this->client_schema_; + gscoped_ptr iter; + CHECK_OK(this->tablet()->NewRowIterator(schema, &iter)); + ColumnRangePredicate pred_one(schema.column(0), &key, &key); + spec.AddPredicate(pred_one); + CHECK_OK(iter->Init(&spec)); + + string ret = "()"; + int n_results = 0; + + Arena arena(1024, 4*1024*1024); + RowBlock block(schema, 100, &arena); + while (iter->HasNext()) { + arena.Reset(); + CHECK_OK(iter->NextBlock(&block)); + for (int i = 0; i < block.nrows(); i++) { + if (!block.selection_vector()->IsRowSelected(i)) { + continue; + } + // We expect to only get exactly one result per read. + CHECK_EQ(n_results, 0) + << "Already got result when looking up row " + << key << ": " << ret + << " and now have new matching row: " + << schema.DebugRow(block.row(i)) + << " iterator: " << iter->ToString(); + ret = schema.DebugRow(block.row(i)); + n_results++; + } + } + return ret; + } + + protected: + void RunFuzzCase(const vector& ops, + int update_multiplier); + + // The current expected state of the tablet. 
+ vector expected_tablet_state_; + + // Latch triggered when the main thread is finished performing + // operations. This stops the compact/flush thread. + CountDownLatch done_; + + gscoped_ptr writer_; +}; + +TEST_F(TestRandomAccess, Test) { + scoped_refptr flush_thread; + CHECK_OK(Thread::Create("test", "flush", + boost::bind(&TestRandomAccess::BackgroundOpThread, this), + &flush_thread)); + + DoRandomBatches(); + done_.CountDown(); + flush_thread->Join(); +} + + +void GenerateTestCase(vector* ops, int len) { + bool exists = false; + bool ops_pending = false; + bool data_in_mrs = false; + bool worth_compacting = false; + bool data_in_dms = false; + ops->clear(); + while (ops->size() < len) { + TestOp r = tight_enum_cast(rand() % enum_limits::max_enumerator); + switch (r) { + case TEST_INSERT: + if (exists) continue; + ops->push_back(TEST_INSERT); + exists = true; + ops_pending = true; + data_in_mrs = true; + break; + case TEST_UPDATE: + if (!exists) continue; + ops->push_back(TEST_UPDATE); + ops_pending = true; + if (!data_in_mrs) { + data_in_dms = true; + } + break; + case TEST_DELETE: + if (!exists) continue; + ops->push_back(TEST_DELETE); + ops_pending = true; + exists = false; + if (!data_in_mrs) { + data_in_dms = true; + } + break; + case TEST_FLUSH_OPS: + if (ops_pending) { + ops->push_back(TEST_FLUSH_OPS); + ops_pending = false; + } + break; + case TEST_FLUSH_TABLET: + if (data_in_mrs) { + if (ops_pending) { + ops->push_back(TEST_FLUSH_OPS); + ops_pending = false; + } + ops->push_back(TEST_FLUSH_TABLET); + data_in_mrs = false; + worth_compacting = true; + } + break; + case TEST_COMPACT_TABLET: + if (worth_compacting) { + if (ops_pending) { + ops->push_back(TEST_FLUSH_OPS); + ops_pending = false; + } + ops->push_back(TEST_COMPACT_TABLET); + worth_compacting = false; + } + break; + case TEST_FLUSH_DELTAS: + if (data_in_dms) { + if (ops_pending) { + ops->push_back(TEST_FLUSH_OPS); + ops_pending = false; + } + ops->push_back(TEST_FLUSH_DELTAS); + data_in_dms = 
false; + } + break; + case TEST_MAJOR_COMPACT_DELTAS: + ops->push_back(TEST_MAJOR_COMPACT_DELTAS); + break; + case TEST_MINOR_COMPACT_DELTAS: + ops->push_back(TEST_MINOR_COMPACT_DELTAS); + break; + default: + LOG(FATAL); + } + } +} + +string DumpTestCase(const vector& ops) { + vector names; + for (TestOp test_op : ops) { + names.push_back(TestOp_names[test_op]); + } + return JoinStrings(names, ",\n"); +} + +void TestRandomAccess::RunFuzzCase(const vector& test_ops, + int update_multiplier = 1) { + LOG(INFO) << "test case: " << DumpTestCase(test_ops); + + LocalTabletWriter writer(tablet().get(), &client_schema_); + vector ops; + + string cur_val = ""; + string pending_val = ""; + + int i = 0; + for (TestOp test_op : test_ops) { + string val_in_table = GetRow(1); + ASSERT_EQ("(" + cur_val + ")", val_in_table); + + i++; + LOG(INFO) << TestOp_names[test_op]; + switch (test_op) { + case TEST_INSERT: + pending_val = InsertRow(1, i, &ops); + break; + case TEST_UPDATE: + for (int j = 0; j < update_multiplier; j++) { + pending_val = MutateRow(1, i, &ops); + } + break; + case TEST_DELETE: + pending_val = DeleteRow(1, &ops); + break; + case TEST_FLUSH_OPS: + ASSERT_OK(writer.WriteBatch(ops)); + for (LocalTabletWriter::Op op : ops) { + delete op.row; + } + ops.clear(); + cur_val = pending_val; + break; + case TEST_FLUSH_TABLET: + ASSERT_OK(tablet()->Flush()); + break; + case TEST_FLUSH_DELTAS: + ASSERT_OK(tablet()->FlushBiggestDMS()); + break; + case TEST_MAJOR_COMPACT_DELTAS: + ASSERT_OK(tablet()->CompactWorstDeltas(RowSet::MAJOR_DELTA_COMPACTION)); + break; + case TEST_MINOR_COMPACT_DELTAS: + ASSERT_OK(tablet()->CompactWorstDeltas(RowSet::MINOR_DELTA_COMPACTION)); + break; + case TEST_COMPACT_TABLET: + ASSERT_OK(tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + break; + default: + LOG(FATAL) << test_op; + } + } + for (LocalTabletWriter::Op op : ops) { + delete op.row; + } +} + +// Generates a random test sequence and runs it. 
+// The logs of this test are designed to easily be copy-pasted and create +// more specific test cases like TestFuzz below. +TEST_F(TestRandomAccess, TestFuzz) { + SeedRandom(); + vector test_ops; + GenerateTestCase(&test_ops, 500); + RunFuzzCase(test_ops); +} + +// Generates a random test case, but the UPDATEs are all repeated 1000 times. +// This results in very large batches which are likely to span multiple delta blocks +// when flushed. +TEST_F(TestRandomAccess, TestFuzzHugeBatches) { + SeedRandom(); + vector test_ops; + GenerateTestCase(&test_ops, AllowSlowTests() ? 1000 : 50); + RunFuzzCase(test_ops, 1000); +} + +// A particular test case which previously failed TestFuzz. +TEST_F(TestRandomAccess, TestFuzz1) { + TestOp test_ops[] = { + // Get an inserted row in a DRS. + TEST_INSERT, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + + // DELETE in DMS, INSERT in MRS and flush again. + TEST_DELETE, + TEST_INSERT, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + + // State: + // RowSet RowSet(0): + // (int32 key=1, int32 val=NULL) Undos: [@1(DELETE)] Redos (in DMS): [@2 DELETE] + // RowSet RowSet(1): + // (int32 key=1, int32 val=NULL) Undos: [@2(DELETE)] Redos: [] + + TEST_COMPACT_TABLET, + }; + RunFuzzCase(vector(test_ops, test_ops + arraysize(test_ops))); +} + +// A particular test case which previously failed TestFuzz. 
+TEST_F(TestRandomAccess, TestFuzz2) { + TestOp test_ops[] = { + TEST_INSERT, + TEST_DELETE, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + // (int32 key=1, int32 val=NULL) + // Undo Mutations: [@1(DELETE)] + // Redo Mutations: [@1(DELETE)] + + TEST_INSERT, + TEST_DELETE, + TEST_INSERT, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + // (int32 key=1, int32 val=NULL) + // Undo Mutations: [@2(DELETE)] + // Redo Mutations: [] + + TEST_COMPACT_TABLET, + // Output Row: (int32 key=1, int32 val=NULL) + // Undo Mutations: [@1(DELETE)] + // Redo Mutations: [@1(DELETE)] + + TEST_DELETE, + TEST_FLUSH_OPS, + TEST_COMPACT_TABLET + }; + RunFuzzCase(vector(test_ops, test_ops + arraysize(test_ops))); +} + +// A particular test case which previously failed TestFuzz. +TEST_F(TestRandomAccess, TestFuzz3) { + TestOp test_ops[] = { + TEST_INSERT, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + // Output Row: (int32 key=1, int32 val=NULL) + // Undo Mutations: [@1(DELETE)] + // Redo Mutations: [] + + TEST_DELETE, + // Adds a @2 DELETE to DMS for above row. + + TEST_INSERT, + TEST_DELETE, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + // (int32 key=1, int32 val=NULL) + // Undo Mutations: [@2(DELETE)] + // Redo Mutations: [@2(DELETE)] + + // Compaction input: + // Row 1: (int32 key=1, int32 val=NULL) + // Undo Mutations: [@2(DELETE)] + // Redo Mutations: [@2(DELETE)] + // Row 2: (int32 key=1, int32 val=NULL) + // Undo Mutations: [@1(DELETE)] + // Redo Mutations: [@2(DELETE)] + + TEST_COMPACT_TABLET, + }; + RunFuzzCase(vector(test_ops, test_ops + arraysize(test_ops))); +} + +// A particular test case which previously failed TestFuzz. 
+TEST_F(TestRandomAccess, TestFuzz4) { + TestOp test_ops[] = { + TEST_INSERT, + TEST_FLUSH_OPS, + TEST_COMPACT_TABLET, + TEST_DELETE, + TEST_FLUSH_OPS, + TEST_COMPACT_TABLET, + TEST_INSERT, + TEST_UPDATE, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + TEST_DELETE, + TEST_INSERT, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + TEST_UPDATE, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + TEST_UPDATE, + TEST_DELETE, + TEST_INSERT, + TEST_DELETE, + TEST_FLUSH_OPS, + TEST_FLUSH_TABLET, + TEST_COMPACT_TABLET + }; + RunFuzzCase(vector(test_ops, test_ops + arraysize(test_ops))); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transaction_order_verifier.cc b/src/kudu/tablet/transaction_order_verifier.cc new file mode 100644 index 000000000000..57b9b59309f5 --- /dev/null +++ b/src/kudu/tablet/transaction_order_verifier.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include "kudu/tablet/transaction_order_verifier.h" + +namespace kudu { +namespace tablet { + +TransactionOrderVerifier::TransactionOrderVerifier() + : prev_idx_(0), + prev_prepare_phys_timestamp_(0) { +} + +TransactionOrderVerifier::~TransactionOrderVerifier() { +} + +void TransactionOrderVerifier::CheckApply(int64_t op_idx, + MicrosecondsInt64 prepare_phys_timestamp) { + DFAKE_SCOPED_LOCK(fake_lock_); + + if (prev_idx_ != 0) { + // We need to allow skips because certain ops (like NO_OP) don't have an + // Apply() phase and are not managed by Transactions. + CHECK_GE(op_idx, prev_idx_ + 1) << "Should apply operations in monotonic index order"; + CHECK_GE(prepare_phys_timestamp, prev_prepare_phys_timestamp_) + << "Prepare phases should have executed in the same order as the op indexes. " + << "op_idx=" << op_idx; + } + prev_idx_ = op_idx; + prev_prepare_phys_timestamp_ = prepare_phys_timestamp; +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transaction_order_verifier.h b/src/kudu/tablet/transaction_order_verifier.h new file mode 100644 index 000000000000..8c41a32b1cee --- /dev/null +++ b/src/kudu/tablet/transaction_order_verifier.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TABLET_TRANSACTION_ORDER_VERIFIER_H +#define KUDU_TABLET_TRANSACTION_ORDER_VERIFIER_H + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/walltime.h" +#include "kudu/gutil/threading/thread_collision_warner.h" + +namespace kudu { +namespace tablet { + +// Simple class which verifies the invariant that we eventually submit +// each operation to be applied in increasing operation index order, and that +// these operations have gone through the Prepare() phase in that same +// order. +// +// This currently runs even in release builds. If we ever find it to be a +// bottleneck, we could run it only in DEBUG builds, but it is extraordinarily +// simple and thus should not be problematic. +// +// NOTE ON SYNCHRONIZATION +// ------------------------ +// This class is not thread-safe, because the synchronization is handled externally. It is +// always called for an operation after both its PREPARE and REPLICATE phases are complete +// (i.e before it is submitted to be applied). This may occur on a number of different +// threads -- eg the prepare pool thread, an RPC handler handling UpdateConsensus() calls +// on a replica, or another thread when the leader receives a response from one of its +// replicas. However, we can ensure that there are no concurrent calls into this class +// based on the following logic: +// - CheckApply(N) only runs after both Prepare(N) and Replicate(N) are complete, on either +// the thread that called Prepare(N) or Replicate(N). +// - Prepare(N-1) always completes before Prepare(N) because Prepare is single-threaded. +// - Replicate(N-1) always completes before Replicate(N), as ensured by the consensus +// implementation. +// - Therefore, both Prepare(N-1) and Replicate(N-1) have completed, and therefore CheckApply(N-1) +// has completed. +// - Therefore, CheckApply(N-1) is not concurrent with CheckApply(N). 
+// +// Note that some of the assumptions above are implementation-dependent, not algorithmic +// properties of Raft. In particular, we currently trigger ReplicateFinished() in strict +// order, but it would still be correct to do it from a threadpool. If we change the +// implementation, the implementation of this verifier class will need to change +// accordingly. +// +// Because the above reasoning is somewhat complex, and the assumptions may change in the +// future, this class uses a DFAKE_MUTEX. This way, if we break any assumptions, we'll +// hopefully see the bug with an assertion error if not from a TSAN failure. +class TransactionOrderVerifier { + public: + TransactionOrderVerifier(); + ~TransactionOrderVerifier(); + + // Verify that it would be correct to apply an operation with the given + // index and prepare timestamp. This ensures that the indexes are increasing + // one by one (with no gaps) and that the prepare timestamps are also increasing. + // + // NOTE: the 'timestamp' here is a local system monotonic timestamp, not + // a Kudu Timestamp. We are enforcing/verifying a local ordering property, + // so local real time is what matters. + // + // If the checks fail, the server is FATALed. + void CheckApply(int64_t op_idx, + MicrosecondsInt64 prepare_phys_timestamp); + + private: + DFAKE_MUTEX(fake_lock_); + + int64_t prev_idx_; + MicrosecondsInt64 prev_prepare_phys_timestamp_; + + DISALLOW_COPY_AND_ASSIGN(TransactionOrderVerifier); +}; + +} // namespace tablet +} // namespace kudu +#endif /* KUDU_TABLET_TRANSACTION_ORDER_VERIFIER_H */ diff --git a/src/kudu/tablet/transactions/alter_schema_transaction.cc b/src/kudu/tablet/transactions/alter_schema_transaction.cc new file mode 100644 index 000000000000..9defd8342b0a --- /dev/null +++ b/src/kudu/tablet/transactions/alter_schema_transaction.cc @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/tablet/transactions/alter_schema_transaction.h" + +#include "kudu/common/wire_protocol.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace tablet { + +using boost::bind; +using consensus::ReplicateMsg; +using consensus::CommitMsg; +using consensus::ALTER_SCHEMA_OP; +using consensus::DriverType; +using strings::Substitute; +using tserver::TabletServerErrorPB; +using tserver::AlterSchemaRequestPB; +using tserver::AlterSchemaResponsePB; + +string AlterSchemaTransactionState::ToString() const { + return Substitute("AlterSchemaTransactionState " + "[timestamp=$0, schema=$1, request=$2]", + timestamp().ToString(), + schema_ == nullptr ? "(none)" : schema_->ToString(), + request_ == nullptr ? 
"(none)" : request_->ShortDebugString()); +} + +void AlterSchemaTransactionState::AcquireSchemaLock(rw_semaphore* l) { + TRACE("Acquiring schema lock in exclusive mode"); + schema_lock_ = std::unique_lock(*l); + TRACE("Acquired schema lock"); +} + +void AlterSchemaTransactionState::ReleaseSchemaLock() { + CHECK(schema_lock_.owns_lock()); + schema_lock_ = std::unique_lock(); + TRACE("Released schema lock"); +} + + +AlterSchemaTransaction::AlterSchemaTransaction(AlterSchemaTransactionState* state, + DriverType type) + : Transaction(state, type, Transaction::ALTER_SCHEMA_TXN), + state_(state) { +} + +void AlterSchemaTransaction::NewReplicateMsg(gscoped_ptr* replicate_msg) { + replicate_msg->reset(new ReplicateMsg); + (*replicate_msg)->set_op_type(ALTER_SCHEMA_OP); + (*replicate_msg)->mutable_alter_schema_request()->CopyFrom(*state()->request()); +} + +Status AlterSchemaTransaction::Prepare() { + TRACE("PREPARE ALTER-SCHEMA: Starting"); + + // Decode schema + gscoped_ptr schema(new Schema); + Status s = SchemaFromPB(state_->request()->schema(), schema.get()); + if (!s.ok()) { + state_->completion_callback()->set_error(s, TabletServerErrorPB::INVALID_SCHEMA); + return s; + } + + Tablet* tablet = state_->tablet_peer()->tablet(); + RETURN_NOT_OK(tablet->CreatePreparedAlterSchema(state(), schema.get())); + + state_->AddToAutoReleasePool(schema.release()); + + TRACE("PREPARE ALTER-SCHEMA: finished"); + return s; +} + +Status AlterSchemaTransaction::Start() { + if (!state_->has_timestamp()) { + state_->set_timestamp(state_->tablet_peer()->clock()->Now()); + } + TRACE("START. 
Timestamp: $0", server::HybridClock::GetPhysicalValueMicros(state_->timestamp())); + return Status::OK(); +} + +Status AlterSchemaTransaction::Apply(gscoped_ptr* commit_msg) { + TRACE("APPLY ALTER-SCHEMA: Starting"); + + Tablet* tablet = state_->tablet_peer()->tablet(); + RETURN_NOT_OK(tablet->AlterSchema(state())); + state_->tablet_peer()->log() + ->SetSchemaForNextLogSegment(*DCHECK_NOTNULL(state_->schema()), + state_->schema_version()); + + commit_msg->reset(new CommitMsg()); + (*commit_msg)->set_op_type(ALTER_SCHEMA_OP); + return Status::OK(); +} + +void AlterSchemaTransaction::Finish(TransactionResult result) { + if (PREDICT_FALSE(result == Transaction::ABORTED)) { + TRACE("AlterSchemaCommitCallback: transaction aborted"); + state()->Finish(); + return; + } + + // The schema lock was acquired by Tablet::CreatePreparedAlterSchema. + // Normally, we would release it in tablet.cc after applying the operation, + // but currently we need to wait until after the COMMIT message is logged + // to release this lock as a workaround for KUDU-915. See the TODO in + // Tablet::AlterSchema(). + state()->ReleaseSchemaLock(); + + DCHECK_EQ(result, Transaction::COMMITTED); + // Now that all of the changes have been applied and the commit is durable + // make the changes visible to readers. + TRACE("AlterSchemaCommitCallback: making alter schema visible"); + state()->Finish(); +} + +string AlterSchemaTransaction::ToString() const { + return Substitute("AlterSchemaTransaction [state=$0]", state_->ToString()); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transactions/alter_schema_transaction.h b/src/kudu/tablet/transactions/alter_schema_transaction.h new file mode 100644 index 000000000000..4e77f7b9dfac --- /dev/null +++ b/src/kudu/tablet/transactions/alter_schema_transaction.h @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_ALTER_SCHEMA_TRANSACTION_H_ +#define KUDU_TABLET_ALTER_SCHEMA_TRANSACTION_H_ + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/tablet/transactions/transaction.h" +#include "kudu/util/locks.h" + +namespace kudu { + +class Schema; + +namespace consensus { +class Consensus; +} + +namespace tablet { + +// Transaction Context for the AlterSchema operation. +// Keeps track of the Transaction states (request, result, ...) 
+class AlterSchemaTransactionState : public TransactionState { + public: + ~AlterSchemaTransactionState() { + } + + AlterSchemaTransactionState(TabletPeer* tablet_peer, + const tserver::AlterSchemaRequestPB* request, + tserver::AlterSchemaResponsePB* response) + : TransactionState(tablet_peer), + schema_(NULL), + request_(request), + response_(response) { + } + + const tserver::AlterSchemaRequestPB* request() const OVERRIDE { return request_; } + tserver::AlterSchemaResponsePB* response() OVERRIDE { return response_; } + + void set_schema(const Schema* schema) { schema_ = schema; } + const Schema* schema() const { return schema_; } + + std::string new_table_name() const { + return request_->new_table_name(); + } + + bool has_new_table_name() const { + return request_->has_new_table_name(); + } + + uint32_t schema_version() const { + return request_->schema_version(); + } + + void AcquireSchemaLock(rw_semaphore* l); + + // Release the acquired schema lock. + // Crashes if the lock was not already acquired. + void ReleaseSchemaLock(); + + // Note: request_ and response_ are set to NULL after this method returns. + void Finish() { + // Make the request NULL since after this transaction commits + // the request may be deleted at any moment. + request_ = NULL; + response_ = NULL; + } + + virtual std::string ToString() const OVERRIDE; + + private: + DISALLOW_COPY_AND_ASSIGN(AlterSchemaTransactionState); + + // The new (target) Schema. + const Schema* schema_; + + // The original RPC request and response. + const tserver::AlterSchemaRequestPB *request_; + tserver::AlterSchemaResponsePB *response_; + + // The lock held on the tablet's schema_lock_. + std::unique_lock schema_lock_; +}; + +// Executes the alter schema transaction,. 
+class AlterSchemaTransaction : public Transaction { + public: + AlterSchemaTransaction(AlterSchemaTransactionState* tx_state, consensus::DriverType type); + + virtual AlterSchemaTransactionState* state() OVERRIDE { return state_.get(); } + virtual const AlterSchemaTransactionState* state() const OVERRIDE { return state_.get(); } + + void NewReplicateMsg(gscoped_ptr* replicate_msg) OVERRIDE; + + // Executes a Prepare for the alter schema transaction. + // + // TODO: need a schema lock? + + virtual Status Prepare() OVERRIDE; + + // Starts the AlterSchemaTransaction by assigning it a timestamp. + virtual Status Start() OVERRIDE; + + // Executes an Apply for the alter schema transaction + virtual Status Apply(gscoped_ptr* commit_msg) OVERRIDE; + + // Actually commits the transaction. + virtual void Finish(TransactionResult result) OVERRIDE; + + virtual std::string ToString() const OVERRIDE; + + private: + gscoped_ptr state_; + DISALLOW_COPY_AND_ASSIGN(AlterSchemaTransaction); +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_ALTER_SCHEMA_TRANSACTION_H_ */ diff --git a/src/kudu/tablet/transactions/transaction.cc b/src/kudu/tablet/transactions/transaction.cc new file mode 100644 index 000000000000..508d8f11fc04 --- /dev/null +++ b/src/kudu/tablet/transactions/transaction.cc @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tablet/transactions/transaction.h" + +namespace kudu { +namespace tablet { + +using consensus::DriverType; + +Transaction::Transaction(TransactionState* state, DriverType type, TransactionType tx_type) + : state_(state), + type_(type), + tx_type_(tx_type) { +} + +TransactionState::TransactionState(TabletPeer* tablet_peer) + : tablet_peer_(tablet_peer), + completion_clbk_(new TransactionCompletionCallback()), + timestamp_error_(0), + arena_(32 * 1024, 4 * 1024 * 1024), + external_consistency_mode_(CLIENT_PROPAGATED) { +} + +TransactionState::~TransactionState() { +} + +TransactionCompletionCallback::TransactionCompletionCallback() + : code_(tserver::TabletServerErrorPB::UNKNOWN_ERROR) { +} + +void TransactionCompletionCallback::set_error(const Status& status, + tserver::TabletServerErrorPB::Code code) { + status_ = status; + code_ = code; +} + +void TransactionCompletionCallback::set_error(const Status& status) { + status_ = status; +} + +bool TransactionCompletionCallback::has_error() const { + return !status_.ok(); +} + +const Status& TransactionCompletionCallback::status() const { + return status_; +} + +const tserver::TabletServerErrorPB::Code TransactionCompletionCallback::error_code() const { + return code_; +} + +void TransactionCompletionCallback::TransactionCompleted() {} + +TransactionCompletionCallback::~TransactionCompletionCallback() {} + +TransactionMetrics::TransactionMetrics() + : successful_inserts(0), + successful_updates(0), + successful_deletes(0), + commit_wait_duration_usec(0) { +} + +void 
TransactionMetrics::Reset() { + successful_inserts = 0; + successful_updates = 0; + successful_deletes = 0; + commit_wait_duration_usec = 0; +} + + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transactions/transaction.h b/src/kudu/tablet/transactions/transaction.h new file mode 100644 index 000000000000..9d2aa253347f --- /dev/null +++ b/src/kudu/tablet/transactions/transaction.h @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_TRANSACTION_H_ +#define KUDU_TABLET_TRANSACTION_H_ + +#include + +#include "kudu/common/timestamp.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.h" +#include "kudu/util/auto_release_pool.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { + +namespace tablet { +class TabletPeer; +class TransactionCompletionCallback; +class TransactionState; + +// All metrics associated with a Transaction. +struct TransactionMetrics { + TransactionMetrics(); + void Reset(); + int successful_inserts; + int successful_updates; + int successful_deletes; + uint64_t commit_wait_duration_usec; +}; + +// Base class for transactions. 
+// There are different implementations for different types (Write, AlterSchema, etc.). +// TransactionDriver implementations use Transactions along with Consensus to execute +// and replicate operations in a consensus configuration. +class Transaction { + public: + + enum TransactionType { + WRITE_TXN, + ALTER_SCHEMA_TXN, + }; + + enum TraceType { + NO_TRACE_TXNS = 0, + TRACE_TXNS = 1 + }; + + enum TransactionResult { + COMMITTED, + ABORTED + }; + + Transaction(TransactionState* state, consensus::DriverType type, TransactionType tx_type); + + // Returns the TransactionState for this transaction. + virtual TransactionState* state() { return state_; } + virtual const TransactionState* state() const { return state_; } + + // Returns whether this transaction is being executed on the leader or on a + // replica. + consensus::DriverType type() const { return type_; } + + // Returns this transaction's type. + TransactionType tx_type() const { return tx_type_; } + + // Builds the ReplicateMsg for this transaction. + virtual void NewReplicateMsg(gscoped_ptr* replicate_msg) = 0; + + // Executes the prepare phase of this transaction, the actual actions + // of this phase depend on the transaction type, but usually are limited + // to what can be done without actually changing data structures and without + // side-effects. + virtual Status Prepare() = 0; + + // Actually starts a transaction, assigning a timestamp to the transaction. + // LEADER replicas execute this in or right after Prepare(), while FOLLOWER/LEARNER + // replicas execute this right before the Apply() phase as the transaction's + // timestamp is only available on the LEADER's commit message. + // Once Started(), state might have leaked to other replicas/local log and the + // transaction can't be cancelled without issuing an abort message. 
+ virtual Status Start() = 0; + + // Executes the Apply() phase of the transaction, the actual actions of + // this phase depend on the transaction type, but usually this is the + // method where data-structures are changed. + virtual Status Apply(gscoped_ptr* commit_msg) = 0; + + // Executed after Apply() but before the commit is submitted to consensus. + // Some transactions use this to perform pre-commit actions (e.g. write + // transactions perform early lock release on this hook). + // Default implementation does nothing. + virtual void PreCommit() {} + + // Executed after the transaction has been applied and the commit message has + // been appended to the log (though it might not be durable yet), or if the + // transaction was aborted. + // Implementations are expected to perform cleanup on this method, the driver + // will reply to the client after this method call returns. + // 'result' will be either COMMITTED or ABORTED, letting implementations + // know what was the final status of the transaction. + virtual void Finish(TransactionResult result) {} + + // Each implementation should have its own ToString() method. + virtual std::string ToString() const = 0; + + virtual ~Transaction() {} + + private: + // A private version of this transaction's transaction state so that + // we can use base TransactionState methods on destructors. + TransactionState* state_; + const consensus::DriverType type_; + const TransactionType tx_type_; +}; + +class TransactionState { + public: + + // Returns the request PB associated with this transaction. May be NULL if + // the transaction's state has been reset. + virtual const google::protobuf::Message* request() const { return NULL; } + + // Returns the response PB associated with this transaction, or NULL. + // This will only return a non-null object for leader-side transactions. 
+ virtual google::protobuf::Message* response() { return NULL; } + + // Sets the ConsensusRound for this transaction, if this transaction is + // being executed through the consensus system. + void set_consensus_round(const scoped_refptr& consensus_round) { + consensus_round_ = consensus_round; + op_id_ = consensus_round_->id(); + } + + // Returns the ConsensusRound being used, if this transaction is being + // executed through the consensus system or NULL if it's not. + consensus::ConsensusRound* consensus_round() { + return consensus_round_.get(); + } + + TabletPeer* tablet_peer() const { + return tablet_peer_; + } + + // Return metrics related to this transaction. + const TransactionMetrics& metrics() const { + return tx_metrics_; + } + + TransactionMetrics* mutable_metrics() { + return &tx_metrics_; + } + + void set_completion_callback(gscoped_ptr completion_clbk) { + completion_clbk_.reset(completion_clbk.release()); + } + + // Returns the completion callback. + TransactionCompletionCallback* completion_callback() { + return DCHECK_NOTNULL(completion_clbk_.get()); + } + + // Sets a heap object to be managed by this transaction's AutoReleasePool. + template + T* AddToAutoReleasePool(T* t) { + return pool_.Add(t); + } + + // Sets an array heap object to be managed by this transaction's AutoReleasePool. + template + T* AddArrayToAutoReleasePool(T* t) { + return pool_.AddArray(t); + } + + // Return the arena associated with this transaction. + // NOTE: this is not a thread-safe arena! + Arena* arena() { + return &arena_; + } + + // Each implementation should have its own ToString() method. 
+ virtual std::string ToString() const = 0; + + // Sets the timestamp for the transaction + virtual void set_timestamp(const Timestamp& timestamp) { + // make sure we set the timestamp only once + lock_guard l(&txn_state_lock_); + DCHECK_EQ(timestamp_, Timestamp::kInvalidTimestamp); + timestamp_ = timestamp; + } + + Timestamp timestamp() const { + lock_guard l(&txn_state_lock_); + DCHECK(timestamp_ != Timestamp::kInvalidTimestamp); + return timestamp_; + } + + bool has_timestamp() const { + lock_guard l(&txn_state_lock_); + return timestamp_ != Timestamp::kInvalidTimestamp; + } + + consensus::OpId* mutable_op_id() { + return &op_id_; + } + + const consensus::OpId& op_id() const { + return op_id_; + } + + ExternalConsistencyMode external_consistency_mode() const { + return external_consistency_mode_; + } + + protected: + explicit TransactionState(TabletPeer* tablet_peer); + virtual ~TransactionState(); + + TransactionMetrics tx_metrics_; + + // The tablet peer that is coordinating this transaction. + TabletPeer* const tablet_peer_; + + // Optional callback to be called once the transaction completes. + gscoped_ptr completion_clbk_; + + AutoReleasePool pool_; + + // This transaction's timestamp. Protected by txn_state_lock_. + Timestamp timestamp_; + + // The clock error when timestamp_ was read. + uint64_t timestamp_error_; + + Arena arena_; + + // This OpId stores the canonical "anchor" OpId for this transaction. + consensus::OpId op_id_; + + scoped_refptr consensus_round_; + + // The defined consistency mode for this transaction. + ExternalConsistencyMode external_consistency_mode_; + + // Lock that protects access to transaction state. + mutable simple_spinlock txn_state_lock_; +}; + +// A parent class for the callback that gets called when transactions +// complete. +// +// This must be set in the TransactionState if the transaction initiator is to +// be notified of when a transaction completes. 
The callback belongs to the +// transaction context and is deleted along with it. +// +// NOTE: this is a concrete class so that we can use it as a default implementation +// which avoids callers having to keep checking for NULL. +class TransactionCompletionCallback { + public: + + TransactionCompletionCallback(); + + // Allows to set an error for this transaction and a mapping to a server level code. + // Calling this method does not mean the transaction is completed. + void set_error(const Status& status, tserver::TabletServerErrorPB::Code code); + + void set_error(const Status& status); + + bool has_error() const; + + const Status& status() const; + + const tserver::TabletServerErrorPB::Code error_code() const; + + // Subclasses should override this. + virtual void TransactionCompleted(); + + virtual ~TransactionCompletionCallback(); + + protected: + Status status_; + tserver::TabletServerErrorPB::Code code_; +}; + +// TransactionCompletionCallback implementation that can be waited on. +// Helper to make async transactions, sync. +// This is templated to accept any response PB that has a TabletServerError +// 'error' field and to set the error before performing the latch countdown. +// The callback does *not* take ownership of either latch or response. +template +class LatchTransactionCompletionCallback : public TransactionCompletionCallback { + public: + explicit LatchTransactionCompletionCallback(CountDownLatch* latch, + ResponsePB* response) + : latch_(DCHECK_NOTNULL(latch)), + response_(DCHECK_NOTNULL(response)) { + } + + virtual void TransactionCompleted() OVERRIDE { + if (!status_.ok()) { + StatusToPB(status_, response_->mutable_error()->mutable_status()); + } + latch_->CountDown(); + } + + private: + CountDownLatch* latch_; + ResponsePB* response_; +}; + +// A transaction completion callback that takes a StatusCallback and simply +// calls it with the transaction status when it completes. 
+class StatusTransactionCompletionCallback : public TransactionCompletionCallback { + public: + explicit StatusTransactionCompletionCallback(StatusCallback callback) + : callback_(std::move(callback)) {} + + virtual void TransactionCompleted() OVERRIDE { + callback_.Run(status()); + } + private: + StatusCallback callback_; +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_TRANSACTION_H_ */ diff --git a/src/kudu/tablet/transactions/transaction_driver.cc b/src/kudu/tablet/transactions/transaction_driver.cc new file mode 100644 index 000000000000..4a63fe33ff52 --- /dev/null +++ b/src/kudu/tablet/transactions/transaction_driver.cc @@ -0,0 +1,485 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/transactions/transaction_driver.h" + +#include "kudu/consensus/consensus.h" +#include "kudu/gutil/strings/strcat.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/transactions/transaction_tracker.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/logging.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace tablet { + +using consensus::CommitMsg; +using consensus::Consensus; +using consensus::ConsensusRound; +using consensus::ReplicateMsg; +using consensus::CommitMsg; +using consensus::DriverType; +using log::Log; +using std::shared_ptr; + +static const char* kTimestampFieldName = "timestamp"; + + +//////////////////////////////////////////////////////////// +// TransactionDriver +//////////////////////////////////////////////////////////// + +TransactionDriver::TransactionDriver(TransactionTracker *txn_tracker, + Consensus* consensus, + Log* log, + ThreadPool* prepare_pool, + ThreadPool* apply_pool, + TransactionOrderVerifier* order_verifier) + : txn_tracker_(txn_tracker), + consensus_(consensus), + log_(log), + prepare_pool_(prepare_pool), + apply_pool_(apply_pool), + order_verifier_(order_verifier), + trace_(new Trace()), + start_time_(MonoTime::Now(MonoTime::FINE)), + replication_state_(NOT_REPLICATING), + prepare_state_(NOT_PREPARED) { + if (Trace::CurrentTrace()) { + Trace::CurrentTrace()->AddChildTrace(trace_.get()); + } +} + +Status TransactionDriver::Init(gscoped_ptr transaction, + DriverType type) { + transaction_ = transaction.Pass(); + + if (type == consensus::REPLICA) { + boost::lock_guard lock(opid_lock_); + op_id_copy_ = transaction_->state()->op_id(); + DCHECK(op_id_copy_.IsInitialized()); + replication_state_ = REPLICATING; + } else { + DCHECK_EQ(type, consensus::LEADER); + gscoped_ptr replicate_msg; + transaction_->NewReplicateMsg(&replicate_msg); + if (consensus_) { // sometimes NULL in tests + // Unretained is 
required to avoid a refcount cycle. + mutable_state()->set_consensus_round( + consensus_->NewRound(replicate_msg.Pass(), + Bind(&TransactionDriver::ReplicationFinished, Unretained(this)))); + } + } + + RETURN_NOT_OK(txn_tracker_->Add(this)); + + return Status::OK(); +} + +consensus::OpId TransactionDriver::GetOpId() { + boost::lock_guard lock(opid_lock_); + return op_id_copy_; +} + +const TransactionState* TransactionDriver::state() const { + return transaction_ != nullptr ? transaction_->state() : nullptr; +} + +TransactionState* TransactionDriver::mutable_state() { + return transaction_ != nullptr ? transaction_->state() : nullptr; +} + +Transaction::TransactionType TransactionDriver::tx_type() const { + return transaction_->tx_type(); +} + +string TransactionDriver::ToString() const { + boost::lock_guard lock(lock_); + return ToStringUnlocked(); +} + +string TransactionDriver::ToStringUnlocked() const { + string ret = StateString(replication_state_, prepare_state_); + if (transaction_ != nullptr) { + ret += " " + transaction_->ToString(); + } else { + ret += "[unknown txn]"; + } + return ret; +} + + +Status TransactionDriver::ExecuteAsync() { + VLOG_WITH_PREFIX(4) << "ExecuteAsync()"; + TRACE_EVENT_FLOW_BEGIN0("txn", "ExecuteAsync", this); + ADOPT_TRACE(trace()); + + Status s; + if (replication_state_ == NOT_REPLICATING) { + // We're a leader transaction. Before submitting, check that we are the leader and + // determine the current term. 
+ s = consensus_->CheckLeadershipAndBindTerm(mutable_state()->consensus_round()); + } + + if (s.ok()) { + s = prepare_pool_->SubmitClosure( + Bind(&TransactionDriver::PrepareAndStartTask, Unretained(this))); + } + + if (!s.ok()) { + HandleFailure(s); + } + + // TODO: make this return void + return Status::OK(); +} + +void TransactionDriver::PrepareAndStartTask() { + TRACE_EVENT_FLOW_END0("txn", "PrepareAndStartTask", this); + Status prepare_status = PrepareAndStart(); + if (PREDICT_FALSE(!prepare_status.ok())) { + HandleFailure(prepare_status); + } +} + +Status TransactionDriver::PrepareAndStart() { + TRACE_EVENT1("txn", "PrepareAndStart", "txn", this); + VLOG_WITH_PREFIX(4) << "PrepareAndStart()"; + // Actually prepare and start the transaction. + prepare_physical_timestamp_ = GetMonoTimeMicros(); + RETURN_NOT_OK(transaction_->Prepare()); + + RETURN_NOT_OK(transaction_->Start()); + + + // Only take the lock long enough to take a local copy of the + // replication state and set our prepare state. This ensures that + // exactly one of Replicate/Prepare callbacks will trigger the apply + // phase. + ReplicationState repl_state_copy; + { + boost::lock_guard lock(lock_); + CHECK_EQ(prepare_state_, NOT_PREPARED); + prepare_state_ = PREPARED; + repl_state_copy = replication_state_; + } + + switch (repl_state_copy) { + case NOT_REPLICATING: + { + + // Set the timestamp in the message, now that it's prepared. + transaction_->state()->consensus_round()->replicate_msg()->set_timestamp( + transaction_->state()->timestamp().ToUint64()); + + VLOG_WITH_PREFIX(4) << "Triggering consensus repl"; + // Trigger the consensus replication. 
+ + { + boost::lock_guard lock(lock_); + replication_state_ = REPLICATING; + } + Status s = consensus_->Replicate(mutable_state()->consensus_round()); + + if (PREDICT_FALSE(!s.ok())) { + boost::lock_guard lock(lock_); + CHECK_EQ(replication_state_, REPLICATING); + transaction_status_ = s; + replication_state_ = REPLICATION_FAILED; + return s; + } + break; + } + case REPLICATING: + { + // Already replicating - nothing to trigger + break; + } + case REPLICATION_FAILED: + DCHECK(!transaction_status_.ok()); + FALLTHROUGH_INTENDED; + case REPLICATED: + { + // We can move on to apply. + // Note that ApplyAsync() will handle the error status in the + // REPLICATION_FAILED case. + return ApplyAsync(); + } + } + + return Status::OK(); +} + +void TransactionDriver::HandleFailure(const Status& s) { + VLOG_WITH_PREFIX(2) << "Failed transaction: " << s.ToString(); + CHECK(!s.ok()); + TRACE("HandleFailure($0)", s.ToString()); + + ReplicationState repl_state_copy; + + { + boost::lock_guard lock(lock_); + transaction_status_ = s; + repl_state_copy = replication_state_; + } + + + switch (repl_state_copy) { + case NOT_REPLICATING: + case REPLICATION_FAILED: + { + VLOG_WITH_PREFIX(1) << "Transaction " << ToString() << " failed prior to " + "replication success: " << s.ToString(); + transaction_->Finish(Transaction::ABORTED); + mutable_state()->completion_callback()->set_error(transaction_status_); + mutable_state()->completion_callback()->TransactionCompleted(); + txn_tracker_->Release(this); + return; + } + + case REPLICATING: + case REPLICATED: + { + LOG_WITH_PREFIX(FATAL) << "Cannot cancel transactions that have already replicated" + << ": " << transaction_status_.ToString() + << " transaction:" << ToString(); + } + } +} + +void TransactionDriver::ReplicationFinished(const Status& status) { + { + boost::lock_guard op_id_lock(opid_lock_); + // TODO: it's a bit silly that we have three copies of the opid: + // one here, one in ConsensusRound, and one in TransactionState. 
+ + op_id_copy_ = DCHECK_NOTNULL(mutable_state()->consensus_round())->id(); + DCHECK(op_id_copy_.IsInitialized()); + mutable_state()->mutable_op_id()->CopyFrom(op_id_copy_); + } + + PrepareState prepare_state_copy; + { + boost::lock_guard lock(lock_); + CHECK_EQ(replication_state_, REPLICATING); + if (status.ok()) { + replication_state_ = REPLICATED; + } else { + replication_state_ = REPLICATION_FAILED; + transaction_status_ = status; + } + prepare_state_copy = prepare_state_; + } + + // If we have prepared and replicated, we're ready + // to move ahead and apply this operation. + // Note that if we set the state to REPLICATION_FAILED above, + // ApplyAsync() will actually abort the transaction, i.e. + // ApplyTask() will never be called and the transaction will never + // be applied to the tablet. + if (prepare_state_copy == PREPARED) { + // We likely need to do cleanup if this fails so for now just + // CHECK_OK + CHECK_OK(ApplyAsync()); + } +} + +void TransactionDriver::Abort(const Status& status) { + CHECK(!status.ok()); + + ReplicationState repl_state_copy; + { + boost::lock_guard lock(lock_); + repl_state_copy = replication_state_; + transaction_status_ = status; + } + + // If the state is not NOT_REPLICATING we abort immediately and the transaction + // will never be replicated. + // In any other state we just set the transaction status, if the transaction's + // Apply hasn't started yet this prevents it from starting, but if it has then + // the transaction runs to completion. + if (repl_state_copy == NOT_REPLICATING) { + HandleFailure(status); + } +} + +Status TransactionDriver::ApplyAsync() { + { + boost::unique_lock lock(lock_); + DCHECK_EQ(prepare_state_, PREPARED); + if (transaction_status_.ok()) { + DCHECK_EQ(replication_state_, REPLICATED); + order_verifier_->CheckApply(op_id_copy_.index(), + prepare_physical_timestamp_); + // Now that the transaction is committed in consensus advance the safe time. 
+ if (transaction_->state()->external_consistency_mode() != COMMIT_WAIT) { + transaction_->state()->tablet_peer()->tablet()->mvcc_manager()-> + OfflineAdjustSafeTime(transaction_->state()->timestamp()); + } + } else { + DCHECK_EQ(replication_state_, REPLICATION_FAILED); + DCHECK(!transaction_status_.ok()); + lock.unlock(); + HandleFailure(transaction_status_); + return Status::OK(); + } + } + + TRACE_EVENT_FLOW_BEGIN0("txn", "ApplyTask", this); + return apply_pool_->SubmitClosure(Bind(&TransactionDriver::ApplyTask, Unretained(this))); +} + +void TransactionDriver::ApplyTask() { + TRACE_EVENT_FLOW_END0("txn", "ApplyTask", this); + ADOPT_TRACE(trace()); + + { + boost::lock_guard lock(lock_); + DCHECK_EQ(replication_state_, REPLICATED); + DCHECK_EQ(prepare_state_, PREPARED); + } + + // We need to ref-count ourself, since Commit() may run very quickly + // and end up calling Finalize() while we're still in this code. + scoped_refptr ref(this); + + { + gscoped_ptr commit_msg; + CHECK_OK(transaction_->Apply(&commit_msg)); + commit_msg->mutable_commited_op_id()->CopyFrom(op_id_copy_); + SetResponseTimestamp(transaction_->state(), transaction_->state()->timestamp()); + + // If the client requested COMMIT_WAIT as the external consistency mode + // calculate the latest that the prepare timestamp could be and wait + // until now.earliest > prepare_latest. Only after this are the locks + // released. + if (mutable_state()->external_consistency_mode() == COMMIT_WAIT) { + // TODO: only do this on the leader side + TRACE("APPLY: Commit Wait."); + // If we can't commit wait and have already applied we might have consistency + // issues if we still reply to the client that the operation was a success. + // On the other hand we don't have rollbacks as of yet thus we can't undo the + // the apply either, so we just CHECK_OK for now. 
+ CHECK_OK(CommitWait()); + } + + transaction_->PreCommit(); + { + TRACE_EVENT1("txn", "AsyncAppendCommit", "txn", this); + CHECK_OK(log_->AsyncAppendCommit(commit_msg.Pass(), Bind(DoNothingStatusCB))); + } + Finalize(); + } +} + +void TransactionDriver::SetResponseTimestamp(TransactionState* transaction_state, + const Timestamp& timestamp) { + google::protobuf::Message* response = transaction_state->response(); + if (response) { + const google::protobuf::FieldDescriptor* ts_field = + response->GetDescriptor()->FindFieldByName(kTimestampFieldName); + response->GetReflection()->SetUInt64(response, ts_field, timestamp.ToUint64()); + } +} + +Status TransactionDriver::CommitWait() { + MonoTime before = MonoTime::Now(MonoTime::FINE); + DCHECK(mutable_state()->external_consistency_mode() == COMMIT_WAIT); + // TODO: we could plumb the RPC deadline in here, and not bother commit-waiting + // if the deadline is already expired. + RETURN_NOT_OK( + mutable_state()->tablet_peer()->clock()->WaitUntilAfter(mutable_state()->timestamp(), + MonoTime::Max())); + mutable_state()->mutable_metrics()->commit_wait_duration_usec = + MonoTime::Now(MonoTime::FINE).GetDeltaSince(before).ToMicroseconds(); + return Status::OK(); +} + +void TransactionDriver::Finalize() { + ADOPT_TRACE(trace()); + // TODO: this is an ugly hack so that the Release() call doesn't delete the + // object while we still hold the lock. 
+ scoped_refptr ref(this); + boost::lock_guard lock(lock_); + transaction_->Finish(Transaction::COMMITTED); + mutable_state()->completion_callback()->TransactionCompleted(); + txn_tracker_->Release(this); +} + + +std::string TransactionDriver::StateString(ReplicationState repl_state, + PrepareState prep_state) { + string state_str; + switch (repl_state) { + case NOT_REPLICATING: + StrAppend(&state_str, "NR-"); // For Not Replicating + break; + case REPLICATING: + StrAppend(&state_str, "R-"); // For Replicating + break; + case REPLICATION_FAILED: + StrAppend(&state_str, "RF-"); // For Replication Failed + break; + case REPLICATED: + StrAppend(&state_str, "RD-"); // For Replication Done + break; + default: + LOG(DFATAL) << "Unexpected replication state: " << repl_state; + } + switch (prep_state) { + case PREPARED: + StrAppend(&state_str, "P"); + break; + case NOT_PREPARED: + StrAppend(&state_str, "NP"); + break; + default: + LOG(DFATAL) << "Unexpected prepare state: " << prep_state; + } + return state_str; +} + +std::string TransactionDriver::LogPrefix() const { + + ReplicationState repl_state_copy; + PrepareState prep_state_copy; + string ts_string; + + { + boost::lock_guard lock(lock_); + repl_state_copy = replication_state_; + prep_state_copy = prepare_state_; + ts_string = state()->has_timestamp() ? state()->timestamp().ToString() : "No timestamp"; + } + + string state_str = StateString(repl_state_copy, prep_state_copy); + // We use the tablet and the peer (T, P) to identify ts and tablet and the timestamp (Ts) to + // (help) identify the transaction. The state string (S) describes the state of the transaction. + return strings::Substitute("T $0 P $1 S $2 Ts $3: ", + // consensus_ is NULL in some unit tests. + PREDICT_TRUE(consensus_) ? consensus_->tablet_id() : "(unknown)", + PREDICT_TRUE(consensus_) ? 
consensus_->peer_uuid() : "(unknown)", + state_str, + ts_string); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transactions/transaction_driver.h b/src/kudu/tablet/transactions/transaction_driver.h new file mode 100644 index 000000000000..2deefb819d42 --- /dev/null +++ b/src/kudu/tablet/transactions/transaction_driver.h @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_TRANSACTION_DRIVER_H_ +#define KUDU_TABLET_TRANSACTION_DRIVER_H_ + +#include + +#include "kudu/consensus/consensus.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/walltime.h" +#include "kudu/tablet/transactions/transaction.h" +#include "kudu/util/status.h" +#include "kudu/util/trace.h" + +namespace kudu { +class ThreadPool; + +namespace log { +class Log; +} // namespace log + +namespace tablet { +class TransactionOrderVerifier; +class TransactionTracker; + +// Base class for transaction drivers. +// +// TransactionDriver classes encapsulate the logic of coordinating the execution of +// an operation. 
The exact triggering of the methods differs based on whether the +// operation is being executed on a leader or replica, but the general flow is: +// +// 1 - Init() is called on a newly created driver object. +// If the driver is instantiated from a REPLICA, then we know that +// the operation is already "REPLICATING" (and thus we don't need to +// trigger replication ourself later on). +// +// 2 - ExecuteAsync() is called. This submits PrepareAndStartTask() to prepare_pool_ +// and returns immediately. +// +// 3 - PrepareAndStartTask() calls Prepare() and Start() on the transaction. +// +// Once successfully prepared, if we have not yet replicated (i.e we are leader), +// also triggers consensus->Replicate() and changes the replication state to +// REPLICATING. +// +// On the other hand, if we have already successfully replicated (eg we are the +// follower and ConsensusCommitted() has already been called, then we can move +// on to ApplyAsync(). +// +// 4 - The Consensus implementation calls ConsensusCommitted() +// +// This is triggered by consensus when the commit index moves past our own +// OpId. On followers, this can happen before Prepare() finishes, and thus +// we have to check whether we have already done step 3. On leaders, we +// don't start the consensus round until after Prepare, so this check always +// passes. +// +// If Prepare() has already completed, then we trigger ApplyAsync(). +// +// 5 - ApplyAsync() submits ApplyTask() to the apply_pool_. +// ApplyTask() calls transaction_->Apply(). +// +// When Apply() is called, changes are made to the in-memory data structures. These +// changes are not visible to clients yet. After Apply() completes, a CommitMsg +// is enqueued to the WAL in order to store information about the operation result +// and provide correct recovery. 
+// +// After the commit message has been enqueued in the Log, the driver executes Finalize() +// which, in turn, makes transactions make their changes visible to other transactions. +// After this step the driver replies to the client if needed and the transaction +// is completed. +// In-mem data structures that contain the changes made by the transaction can now +// be made durable. +// +// [1] - see 'Implementation Techniques for Main Memory Database Systems', DeWitt et. al. +// +// This class is thread safe. +class TransactionDriver : public RefCountedThreadSafe { + + public: + // Construct TransactionDriver. TransactionDriver does not take ownership + // of any of the objects pointed to in the constructor's arguments. + TransactionDriver(TransactionTracker* txn_tracker, + consensus::Consensus* consensus, + log::Log* log, + ThreadPool* prepare_pool, + ThreadPool* apply_pool, + TransactionOrderVerifier* order_verifier); + + // Perform any non-constructor initialization. Sets the transaction + // that will be executed. + Status Init(gscoped_ptr transaction, + consensus::DriverType driver); + + // Returns the OpId of the transaction being executed or an uninitialized + // OpId if none has been assigned. Returns a copy and thus should not + // be used in tight loops. + consensus::OpId GetOpId(); + + // Submits the transaction for execution. + // The returned status acknowledges any error on the submission process. + // The transaction will be replied to asynchronously. + Status ExecuteAsync(); + + // Aborts the transaction, if possible. Since transactions are executed in + // multiple stages by multiple executors it might not be possible to stop + // the transaction immediately, but this will make sure it is aborted + // at the next synchronization point. 
+ void Abort(const Status& status); + + // Callback from Consensus when replication is complete, and thus the operation + // is considered "committed" from the consensus perspective (ie it will be + // applied on every node, and not ever truncated from the state machine history). + // If status is anything different from OK() we don't proceed with the apply. + // + // see comment in the interface for an important TODO. + void ReplicationFinished(const Status& status); + + std::string ToString() const; + + std::string ToStringUnlocked() const; + + std::string LogPrefix() const; + + // Returns the type of the transaction being executed by this driver. + Transaction::TransactionType tx_type() const; + + // Returns the state of the transaction being executed by this driver. + const TransactionState* state() const; + + const MonoTime& start_time() const { return start_time_; } + + Trace* trace() { return trace_.get(); } + + private: + friend class RefCountedThreadSafe; + enum ReplicationState { + // The operation has not yet been sent to consensus for replication + NOT_REPLICATING, + + // Replication has been triggered (either because we are the leader and triggered it, + // or because we are a follower and we started this operation in response to a + // leader's call) + REPLICATING, + + // Replication has failed, and we are certain that no other may have received the + // operation (ie we failed before even sending the request off of our node). + REPLICATION_FAILED, + + // Replication has succeeded. + REPLICATED + }; + + enum PrepareState { + NOT_PREPARED, + PREPARED + }; + + ~TransactionDriver() {} + + // The task submitted to the prepare threadpool to prepare and start + // the transaction. If PrepareAndStart() fails, calls HandleFailure. + void PrepareAndStartTask(); + // Actually prepare and start. + Status PrepareAndStart(); + + // Submits ApplyTask to the apply pool. 
+ Status ApplyAsync(); + + // Calls Transaction::Apply() followed by Consensus::Commit() with the + // results from the Apply(). + void ApplyTask(); + + // Sleeps until the transaction is allowed to commit based on the + // requested consistency mode. + Status CommitWait(); + + // Handle a failure in any of the stages of the operation. + // In some cases, this will end the operation and call its callback. + // In others, where we can't recover, this will FATAL. + void HandleFailure(const Status& s); + + // Called on Transaction::Apply() after the CommitMsg has been successfully + // appended to the WAL. + void Finalize(); + + // Returns the mutable state of the transaction being executed by + // this driver. + TransactionState* mutable_state(); + + // Return a short string indicating where the transaction currently is in the + // state machine. + static std::string StateString(ReplicationState repl_state, + PrepareState prep_state); + + // Sets the timestamp on the response PB, if there is one. + void SetResponseTimestamp(TransactionState* transaction_state, + const Timestamp& timestamp); + + TransactionTracker* const txn_tracker_; + consensus::Consensus* const consensus_; + log::Log* const log_; + ThreadPool* const prepare_pool_; + ThreadPool* const apply_pool_; + TransactionOrderVerifier* const order_verifier_; + + Status transaction_status_; + + // Lock that synchronizes access to the transaction's state. + mutable simple_spinlock lock_; + + // A copy of the transaction's OpId, set when the transaction first + // receives one from Consensus and uninitialized until then. + // TODO(todd): we have three separate copies of this now -- in TransactionState, + // CommitMsg, and here... we should be able to consolidate! + consensus::OpId op_id_copy_; + + // Lock that protects access to the driver's copy of the op_id, specifically. 
+ // GetOpId() is the only method expected to be called by threads outside + // of the control of the driver, so we use a special lock to control access + // otherwise callers would block for a long time for long running transactions. + mutable simple_spinlock opid_lock_; + + // The transaction to be executed by this driver. + gscoped_ptr transaction_; + + // Trace object for tracing any transactions started by this driver. + scoped_refptr trace_; + + const MonoTime start_time_; + + ReplicationState replication_state_; + PrepareState prepare_state_; + + // The system monotonic time when the operation was prepared. + // This is used for debugging only, not any actual operation ordering. + MicrosecondsInt64 prepare_physical_timestamp_; + + DISALLOW_COPY_AND_ASSIGN(TransactionDriver); +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_TRANSACTION_DRIVER_H_ */ diff --git a/src/kudu/tablet/transactions/transaction_tracker-test.cc b/src/kudu/tablet/transactions/transaction_tracker-test.cc new file mode 100644 index 000000000000..6b4515700d77 --- /dev/null +++ b/src/kudu/tablet/transactions/transaction_tracker-test.cc @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/tablet/transactions/transaction_driver.h" +#include "kudu/tablet/transactions/transaction_tracker.h" +#include "kudu/tablet/transactions/transaction.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DECLARE_int64(tablet_transaction_memory_limit_mb); + +METRIC_DECLARE_entity(tablet); + +METRIC_DECLARE_gauge_uint64(all_transactions_inflight); +METRIC_DECLARE_gauge_uint64(write_transactions_inflight); +METRIC_DECLARE_gauge_uint64(alter_schema_transactions_inflight); +METRIC_DECLARE_counter(transaction_memory_pressure_rejections); + +using std::shared_ptr; +using std::vector; + +namespace kudu { +namespace tablet { + +class TransactionTrackerTest : public KuduTest { + public: + class NoOpTransactionState : public TransactionState { + public: + NoOpTransactionState() : TransactionState(nullptr) {} + virtual const google::protobuf::Message* request() const OVERRIDE { return &req_; } + virtual std::string ToString() const OVERRIDE { return "NoOpTransactionState"; } + private: + consensus::ReplicateMsg req_; + }; + class NoOpTransaction : public Transaction { + public: + explicit NoOpTransaction(NoOpTransactionState* state) + : Transaction(state, consensus::LEADER, Transaction::WRITE_TXN), + state_(state) { + } + + virtual void NewReplicateMsg(gscoped_ptr* replicate_msg) OVERRIDE { + replicate_msg->reset(new consensus::ReplicateMsg()); + } + + virtual Status Prepare() OVERRIDE { return Status::OK(); } + virtual Status Start() OVERRIDE { return Status::OK(); } + virtual Status Apply(gscoped_ptr* commit_msg) OVERRIDE { + return Status::OK(); + } + virtual std::string ToString() const OVERRIDE { + return "NoOp"; + } + private: + gscoped_ptr state_; + }; + + TransactionTrackerTest() + : entity_(METRIC_ENTITY_tablet.Instantiate(®istry_, "test")) { + 
tracker_.StartInstrumentation(entity_); + } + + void RunTransactionsThread(CountDownLatch* finish_latch); + + Status AddDrivers(int num_drivers, + vector >* drivers) { + vector > local_drivers; + for (int i = 0; i < num_drivers; i++) { + scoped_refptr driver(new TransactionDriver(&tracker_, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr)); + gscoped_ptr tx(new NoOpTransaction(new NoOpTransactionState)); + RETURN_NOT_OK(driver->Init(tx.PassAs(), consensus::LEADER)); + local_drivers.push_back(driver); + } + + for (const scoped_refptr& d : local_drivers) { + drivers->push_back(d); + } + return Status::OK(); + } + + MetricRegistry registry_; + scoped_refptr entity_; + TransactionTracker tracker_; +}; + +TEST_F(TransactionTrackerTest, TestGetPending) { + ASSERT_EQ(0, tracker_.GetNumPendingForTests()); + vector > drivers; + ASSERT_OK(AddDrivers(1, &drivers)); + scoped_refptr driver = drivers[0]; + ASSERT_EQ(1, tracker_.GetNumPendingForTests()); + + vector > pending_transactions; + tracker_.GetPendingTransactions(&pending_transactions); + ASSERT_EQ(1, pending_transactions.size()); + ASSERT_EQ(driver.get(), pending_transactions.front().get()); + + // And mark the transaction as failed, which will cause it to unregister itself. + driver->Abort(Status::Aborted("")); + + ASSERT_EQ(0, tracker_.GetNumPendingForTests()); +} + +// Thread which starts a bunch of transactions and later stops them all. +void TransactionTrackerTest::RunTransactionsThread(CountDownLatch* finish_latch) { + const int kNumTransactions = 100; + // Start a bunch of transactions. + vector > drivers; + ASSERT_OK(AddDrivers(kNumTransactions, &drivers)); + + // Wait for the main thread to tell us to proceed. + finish_latch->Wait(); + + // Sleep a tiny bit to give the main thread a chance to get into the + // WaitForAllToFinish() call. 
+ SleepFor(MonoDelta::FromMilliseconds(1)); + + // Finish all the transactions + for (const scoped_refptr& driver : drivers) { + // And mark the transaction as failed, which will cause it to unregister itself. + driver->Abort(Status::Aborted("")); + } +} + +// Regression test for KUDU-384 (thread safety issue with TestWaitForAllToFinish) +TEST_F(TransactionTrackerTest, TestWaitForAllToFinish) { + CountDownLatch finish_latch(1); + scoped_refptr thr; + CHECK_OK(Thread::Create("test", "txn-thread", + &TransactionTrackerTest::RunTransactionsThread, this, &finish_latch, + &thr)); + + // Wait for the txns to start. + while (tracker_.GetNumPendingForTests() == 0) { + SleepFor(MonoDelta::FromMilliseconds(1)); + } + + // Allow the thread to proceed, and then wait for it to abort all the + // transactions. + finish_latch.CountDown(); + tracker_.WaitForAllToFinish(); + + CHECK_OK(ThreadJoiner(thr.get()).Join()); + ASSERT_EQ(tracker_.GetNumPendingForTests(), 0); +} + +static void CheckMetrics(const scoped_refptr& entity, + int expected_num_writes, + int expected_num_alters, + int expected_num_rejections) { + ASSERT_EQ(expected_num_writes + expected_num_alters, down_cast*>( + entity->FindOrNull(METRIC_all_transactions_inflight).get())->value()); + ASSERT_EQ(expected_num_writes, down_cast*>( + entity->FindOrNull(METRIC_write_transactions_inflight).get())->value()); + ASSERT_EQ(expected_num_alters, down_cast*>( + entity->FindOrNull(METRIC_alter_schema_transactions_inflight).get())->value()); + ASSERT_EQ(expected_num_rejections, down_cast( + entity->FindOrNull(METRIC_transaction_memory_pressure_rejections).get())->value()); +} + +// Basic testing for metrics. Note that the NoOpTransactions we use in this +// test are all write transactions. 
+TEST_F(TransactionTrackerTest, TestMetrics) { + NO_FATALS(CheckMetrics(entity_, 0, 0, 0)); + + vector > drivers; + ASSERT_OK(AddDrivers(3, &drivers)); + NO_FATALS(CheckMetrics(entity_, 3, 0, 0)); + + drivers[0]->Abort(Status::Aborted("")); + NO_FATALS(CheckMetrics(entity_, 2, 0, 0)); + + drivers[1]->Abort(Status::Aborted("")); + drivers[2]->Abort(Status::Aborted("")); + NO_FATALS(CheckMetrics(entity_, 0, 0, 0)); +} + +// Check that the tracker's consumption is very close (but not quite equal to) +// the defined transaction memory limit. +static void CheckMemTracker(const shared_ptr& t) { + int64_t val = t->consumption(); + uint64_t defined_limit = + FLAGS_tablet_transaction_memory_limit_mb * 1024 * 1024; + ASSERT_GT(val, (defined_limit * 99) / 100); + ASSERT_LE(val, defined_limit); +} + +// Test that if too many transactions are added, eventually the tracker starts +// rejecting new ones. +TEST_F(TransactionTrackerTest, TestTooManyTransactions) { + FLAGS_tablet_transaction_memory_limit_mb = 1; + shared_ptr t = MemTracker::CreateTracker(-1, "test"); + tracker_.StartMemoryTracking(t); + + // Fill up the tracker. + // + // It's difficult to anticipate exactly how many drivers we can add (each + // carries an empty ReplicateMsg), so we'll just add as many as possible + // and check that when we fail, it's because we've hit the limit. + Status s; + vector > drivers; + for (int i = 0; s.ok();i++) { + s = AddDrivers(1, &drivers); + } + + LOG(INFO) << "Added " << drivers.size() << " drivers"; + ASSERT_TRUE(s.IsServiceUnavailable()); + ASSERT_STR_CONTAINS(s.ToString(), "exceeded its limit"); + NO_FATALS(CheckMetrics(entity_, drivers.size(), 0, 1)); + NO_FATALS(CheckMemTracker(t)); + + ASSERT_TRUE(AddDrivers(1, &drivers).IsServiceUnavailable()); + NO_FATALS(CheckMetrics(entity_, drivers.size(), 0, 2)); + NO_FATALS(CheckMemTracker(t)); + + // If we abort one transaction, we should be able to add one more. 
+ drivers.back()->Abort(Status::Aborted("")); + drivers.pop_back(); + NO_FATALS(CheckMemTracker(t)); + ASSERT_OK(AddDrivers(1, &drivers)); + NO_FATALS(CheckMemTracker(t)); + + // Clean up. + for (const scoped_refptr& driver : drivers) { + driver->Abort(Status::Aborted("")); + } +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transactions/transaction_tracker.cc b/src/kudu/tablet/transactions/transaction_tracker.cc new file mode 100644 index 000000000000..ca21cd58527e --- /dev/null +++ b/src/kudu/tablet/transactions/transaction_tracker.cc @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/transactions/transaction_tracker.h" + +#include +#include +#include + + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/transactions/transaction_driver.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" + +DEFINE_int64(tablet_transaction_memory_limit_mb, 64, + "Maximum amount of memory that may be consumed by all in-flight " + "transactions belonging to a particular tablet. When this limit " + "is reached, new transactions will be rejected and clients will " + "be forced to retry them. If -1, transaction memory tracking is " + "disabled."); +TAG_FLAG(tablet_transaction_memory_limit_mb, advanced); + +METRIC_DEFINE_gauge_uint64(tablet, all_transactions_inflight, + "Transactions In Flight", + kudu::MetricUnit::kTransactions, + "Number of transactions currently in-flight, including any type."); +METRIC_DEFINE_gauge_uint64(tablet, write_transactions_inflight, + "Write Transactions In Flight", + kudu::MetricUnit::kTransactions, + "Number of write transactions currently in-flight"); +METRIC_DEFINE_gauge_uint64(tablet, alter_schema_transactions_inflight, + "Alter Schema Transactions In Flight", + kudu::MetricUnit::kTransactions, + "Number of alter schema transactions currently in-flight"); + +METRIC_DEFINE_counter(tablet, transaction_memory_pressure_rejections, + "Transaction Memory Pressure Rejections", + kudu::MetricUnit::kTransactions, + "Number of transactions rejected because the tablet's " + "transaction memory limit was reached."); + +using std::shared_ptr; +using std::vector; + +namespace kudu { +namespace tablet { + +using strings::Substitute; + +#define MINIT(x) x(METRIC_##x.Instantiate(entity)) +#define GINIT(x) x(METRIC_##x.Instantiate(entity, 0)) +TransactionTracker::Metrics::Metrics(const scoped_refptr& entity) + : 
GINIT(all_transactions_inflight), + GINIT(write_transactions_inflight), + GINIT(alter_schema_transactions_inflight), + MINIT(transaction_memory_pressure_rejections) { +} +#undef GINIT +#undef MINIT + +TransactionTracker::State::State() + : memory_footprint(0) { +} + +TransactionTracker::TransactionTracker() { +} + +TransactionTracker::~TransactionTracker() { + lock_guard l(&lock_); + CHECK_EQ(pending_txns_.size(), 0); + if (mem_tracker_) { + mem_tracker_->UnregisterFromParent(); + } +} + +Status TransactionTracker::Add(TransactionDriver* driver) { + int64_t driver_mem_footprint = driver->state()->request()->SpaceUsed(); + if (mem_tracker_ && !mem_tracker_->TryConsume(driver_mem_footprint)) { + if (metrics_) { + metrics_->transaction_memory_pressure_rejections->Increment(); + } + + // May be null in unit tests. + TabletPeer* peer = driver->state()->tablet_peer(); + + string msg = Substitute( + "Transaction failed, tablet $0 transaction memory consumption ($1) " + "has exceeded its limit ($2) or the limit of an ancestral tracker", + peer ? peer->tablet()->tablet_id() : "(unknown)", + mem_tracker_->consumption(), mem_tracker_->limit()); + + KLOG_EVERY_N_SECS(WARNING, 1) << msg << THROTTLE_MSG; + + return Status::ServiceUnavailable(msg); + } + + IncrementCounters(*driver); + + // Cache the transaction memory footprint so we needn't refer to the request + // again, as it may disappear between now and then. 
+ State st; + st.memory_footprint = driver_mem_footprint; + lock_guard l(&lock_); + InsertOrDie(&pending_txns_, driver, st); + return Status::OK(); +} + +void TransactionTracker::IncrementCounters(const TransactionDriver& driver) const { + if (!metrics_) { + return; + } + + metrics_->all_transactions_inflight->Increment(); + switch (driver.tx_type()) { + case Transaction::WRITE_TXN: + metrics_->write_transactions_inflight->Increment(); + break; + case Transaction::ALTER_SCHEMA_TXN: + metrics_->alter_schema_transactions_inflight->Increment(); + break; + } +} + +void TransactionTracker::DecrementCounters(const TransactionDriver& driver) const { + if (!metrics_) { + return; + } + + DCHECK_GT(metrics_->all_transactions_inflight->value(), 0); + metrics_->all_transactions_inflight->Decrement(); + switch (driver.tx_type()) { + case Transaction::WRITE_TXN: + DCHECK_GT(metrics_->write_transactions_inflight->value(), 0); + metrics_->write_transactions_inflight->Decrement(); + break; + case Transaction::ALTER_SCHEMA_TXN: + DCHECK_GT(metrics_->alter_schema_transactions_inflight->value(), 0); + metrics_->alter_schema_transactions_inflight->Decrement(); + break; + } +} + +void TransactionTracker::Release(TransactionDriver* driver) { + DecrementCounters(*driver); + + State st; + { + // Remove the transaction from the map, retaining the state for use + // below. + lock_guard l(&lock_); + st = FindOrDie(pending_txns_, driver); + if (PREDICT_FALSE(pending_txns_.erase(driver) != 1)) { + LOG(FATAL) << "Could not remove pending transaction from map: " + << driver->ToStringUnlocked(); + } + } + + if (mem_tracker_) { + mem_tracker_->Release(st.memory_footprint); + } +} + +void TransactionTracker::GetPendingTransactions( + vector >* pending_out) const { + DCHECK(pending_out->empty()); + lock_guard l(&lock_); + for (const TxnMap::value_type& e : pending_txns_) { + // Increments refcount of each transaction. 
+ pending_out->push_back(e.first); + } +} + +int TransactionTracker::GetNumPendingForTests() const { + lock_guard l(&lock_); + return pending_txns_.size(); +} + +void TransactionTracker::WaitForAllToFinish() const { + // Wait indefinitely. + CHECK_OK(WaitForAllToFinish(MonoDelta::FromNanoseconds(std::numeric_limits::max()))); +} + +Status TransactionTracker::WaitForAllToFinish(const MonoDelta& timeout) const { + const int complain_ms = 1000; + int wait_time = 250; + int num_complaints = 0; + MonoTime start_time = MonoTime::Now(MonoTime::FINE); + while (1) { + vector > txns; + GetPendingTransactions(&txns); + + if (txns.empty()) { + break; + } + + MonoDelta diff = MonoTime::Now(MonoTime::FINE).GetDeltaSince(start_time); + if (diff.MoreThan(timeout)) { + return Status::TimedOut(Substitute("Timed out waiting for all transactions to finish. " + "$0 transactions pending. Waited for $1", + txns.size(), diff.ToString())); + } + int64_t waited_ms = diff.ToMilliseconds(); + if (waited_ms / complain_ms > num_complaints) { + LOG(WARNING) << Substitute("TransactionTracker waiting for $0 outstanding transactions to" + " complete now for $1 ms", txns.size(), waited_ms); + num_complaints++; + } + wait_time = std::min(wait_time * 5 / 4, 1000000); + + LOG(INFO) << "Dumping currently running transactions: "; + for (scoped_refptr driver : txns) { + LOG(INFO) << driver->ToString(); + } + SleepFor(MonoDelta::FromMicroseconds(wait_time)); + } + return Status::OK(); +} + +void TransactionTracker::StartInstrumentation( + const scoped_refptr& metric_entity) { + metrics_.reset(new Metrics(metric_entity)); +} + +void TransactionTracker::StartMemoryTracking( + const shared_ptr& parent_mem_tracker) { + if (FLAGS_tablet_transaction_memory_limit_mb != -1) { + mem_tracker_ = MemTracker::CreateTracker( + FLAGS_tablet_transaction_memory_limit_mb * 1024 * 1024, + "txn_tracker", + parent_mem_tracker); + } +} + +} // namespace tablet +} // namespace kudu diff --git 
a/src/kudu/tablet/transactions/transaction_tracker.h b/src/kudu/tablet/transactions/transaction_tracker.h new file mode 100644 index 000000000000..88a910318035 --- /dev/null +++ b/src/kudu/tablet/transactions/transaction_tracker.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TABLET_TRANSACTION_TRACKER_H_ +#define KUDU_TABLET_TRANSACTION_TRACKER_H_ + +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/tablet/transactions/transaction.h" +#include "kudu/util/locks.h" + +namespace kudu { + +template +class AtomicGauge; +class Counter; +class MemTracker; +class MetricEntity; + +namespace tablet { +class TransactionDriver; + +// Each TabletPeer has a TransactionTracker which keeps track of pending transactions. +// Each "LeaderTransaction" will register itself by calling Add(). +// It will remove itself by calling Release(). +class TransactionTracker { + public: + TransactionTracker(); + ~TransactionTracker(); + + // Adds a transaction to the set of tracked transactions. + // + // In the event that the tracker's memory limit is exceeded, returns a + // ServiceUnavailable status. 
+ Status Add(TransactionDriver* driver); + + // Removes the txn from the pending list. + // Also triggers the deletion of the Transaction object, if its refcount == 0. + void Release(TransactionDriver* driver); + + // Populates list of currently-running transactions into 'pending_out' vector. + void GetPendingTransactions(std::vector >* pending_out) const; + + // Returns number of pending transactions. + int GetNumPendingForTests() const; + + void WaitForAllToFinish() const; + Status WaitForAllToFinish(const MonoDelta& timeout) const; + + void StartInstrumentation(const scoped_refptr& metric_entity); + void StartMemoryTracking(const std::shared_ptr& parent_mem_tracker); + + private: + struct Metrics { + explicit Metrics(const scoped_refptr& entity); + + scoped_refptr > all_transactions_inflight; + scoped_refptr > write_transactions_inflight; + scoped_refptr > alter_schema_transactions_inflight; + + scoped_refptr transaction_memory_pressure_rejections; + }; + + // Increments relevant metric counters. + void IncrementCounters(const TransactionDriver& driver) const; + + // Decrements relevant metric counters. + void DecrementCounters(const TransactionDriver& driver) const; + + mutable simple_spinlock lock_; + + // Per-transaction state that is tracked along with the transaction itself. + struct State { + State(); + + // Approximate memory footprint of the transaction. + int64_t memory_footprint; + }; + + // Protected by 'lock_'. 
+ typedef std::unordered_map, + State, + ScopedRefPtrHashFunctor, + ScopedRefPtrEqualToFunctor > TxnMap; + TxnMap pending_txns_; + + gscoped_ptr metrics_; + + std::shared_ptr mem_tracker_; + + DISALLOW_COPY_AND_ASSIGN(TransactionTracker); +}; + +} // namespace tablet +} // namespace kudu + +#endif // KUDU_TABLET_TRANSACTION_TRACKER_H_ diff --git a/src/kudu/tablet/transactions/write_transaction.cc b/src/kudu/tablet/transactions/write_transaction.cc new file mode 100644 index 000000000000..ebb74d821e3a --- /dev/null +++ b/src/kudu/tablet/transactions/write_transaction.cc @@ -0,0 +1,375 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tablet/transactions/write_transaction.h" + +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/common/row_operations.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/walltime.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/tablet/row_op.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/trace.h" + +DEFINE_int32(tablet_inject_latency_on_apply_write_txn_ms, 0, + "How much latency to inject when a write transaction is applied. " + "For testing only!"); +TAG_FLAG(tablet_inject_latency_on_apply_write_txn_ms, unsafe); +TAG_FLAG(tablet_inject_latency_on_apply_write_txn_ms, runtime); + +namespace kudu { +namespace tablet { + +using boost::bind; +using consensus::ReplicateMsg; +using consensus::CommitMsg; +using consensus::DriverType; +using consensus::WRITE_OP; +using tserver::TabletServerErrorPB; +using tserver::WriteRequestPB; +using tserver::WriteResponsePB; +using strings::Substitute; + +WriteTransaction::WriteTransaction(WriteTransactionState* state, DriverType type) + : Transaction(state, type, Transaction::WRITE_TXN), + state_(state) { + start_time_ = MonoTime::Now(MonoTime::FINE); +} + +void WriteTransaction::NewReplicateMsg(gscoped_ptr* replicate_msg) { + replicate_msg->reset(new ReplicateMsg); + (*replicate_msg)->set_op_type(WRITE_OP); + (*replicate_msg)->mutable_write_request()->CopyFrom(*state()->request()); +} + +Status WriteTransaction::Prepare() { + TRACE_EVENT0("txn", "WriteTransaction::Prepare"); + TRACE("PREPARE: Starting"); + // Decode everything first so that we give up if something major is wrong. 
+ Schema client_schema; + RETURN_NOT_OK_PREPEND(SchemaFromPB(state_->request()->schema(), &client_schema), + "Cannot decode client schema"); + if (client_schema.has_column_ids()) { + // TODO: we have this kind of code a lot - add a new SchemaFromPB variant which + // does this check inline. + Status s = Status::InvalidArgument("User requests should not have Column IDs"); + state_->completion_callback()->set_error(s, TabletServerErrorPB::INVALID_SCHEMA); + return s; + } + + Tablet* tablet = state()->tablet_peer()->tablet(); + + Status s = tablet->DecodeWriteOperations(&client_schema, state()); + if (!s.ok()) { + // TODO: is MISMATCHED_SCHEMA always right here? probably not. + state()->completion_callback()->set_error(s, TabletServerErrorPB::MISMATCHED_SCHEMA); + return s; + } + + // Now acquire row locks and prepare everything for apply + RETURN_NOT_OK(tablet->AcquireRowLocks(state())); + + TRACE("PREPARE: finished."); + return Status::OK(); +} + +Status WriteTransaction::Start() { + TRACE_EVENT0("txn", "WriteTransaction::Start"); + TRACE("Start()"); + state_->tablet_peer()->tablet()->StartTransaction(state_.get()); + TRACE("Timestamp: $0", state_->tablet_peer()->clock()->Stringify(state_->timestamp())); + return Status::OK(); +} + +// FIXME: Since this is called as a void in a thread-pool callback, +// it seems pointless to return a Status! 
+Status WriteTransaction::Apply(gscoped_ptr* commit_msg) { + TRACE_EVENT0("txn", "WriteTransaction::Apply"); + TRACE("APPLY: Starting"); + + if (PREDICT_FALSE( + ANNOTATE_UNPROTECTED_READ(FLAGS_tablet_inject_latency_on_apply_write_txn_ms) > 0)) { + TRACE("Injecting $0ms of latency due to --tablet_inject_latency_on_apply_write_txn_ms", + FLAGS_tablet_inject_latency_on_apply_write_txn_ms); + SleepFor(MonoDelta::FromMilliseconds(FLAGS_tablet_inject_latency_on_apply_write_txn_ms)); + } + + Tablet* tablet = state()->tablet_peer()->tablet(); + + tablet->ApplyRowOperations(state()); + + // Add per-row errors to the result, update metrics. + int i = 0; + for (const RowOp* op : state()->row_ops()) { + if (state()->response() != nullptr && op->result->has_failed_status()) { + // Replicas disregard the per row errors, for now + // TODO check the per-row errors against the leader's, at least in debug mode + WriteResponsePB::PerRowErrorPB* error = state()->response()->add_per_row_errors(); + error->set_row_index(i); + error->mutable_error()->CopyFrom(op->result->failed_status()); + } + + state()->UpdateMetricsForOp(*op); + i++; + } + + // Create the Commit message + commit_msg->reset(new CommitMsg()); + state()->ReleaseTxResultPB((*commit_msg)->mutable_result()); + (*commit_msg)->set_op_type(WRITE_OP); + + return Status::OK(); +} + +void WriteTransaction::PreCommit() { + TRACE_EVENT0("txn", "WriteTransaction::PreCommit"); + TRACE("PRECOMMIT: Releasing row and schema locks"); + // Perform early lock release after we've applied all changes + state()->release_row_locks(); + state()->ReleaseSchemaLock(); +} + +void WriteTransaction::Finish(TransactionResult result) { + TRACE_EVENT0("txn", "WriteTransaction::Finish"); + if (PREDICT_FALSE(result == Transaction::ABORTED)) { + TRACE("FINISH: aborting transaction"); + state()->Abort(); + return; + } + + DCHECK_EQ(result, Transaction::COMMITTED); + // Now that all of the changes have been applied and the commit is durable + // make the 
changes visible to readers. + TRACE("FINISH: making edits visible"); + state()->Commit(); + + TabletMetrics* metrics = state_->tablet_peer()->tablet()->metrics(); + if (metrics) { + // TODO: should we change this so it's actually incremented by the + // Tablet code itself instead of this wrapper code? + metrics->rows_inserted->IncrementBy(state_->metrics().successful_inserts); + metrics->rows_updated->IncrementBy(state_->metrics().successful_updates); + metrics->rows_deleted->IncrementBy(state_->metrics().successful_deletes); + + if (type() == consensus::LEADER) { + if (state()->external_consistency_mode() == COMMIT_WAIT) { + metrics->commit_wait_duration->Increment(state_->metrics().commit_wait_duration_usec); + } + uint64_t op_duration_usec = + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start_time_).ToMicroseconds(); + switch (state()->external_consistency_mode()) { + case CLIENT_PROPAGATED: + metrics->write_op_duration_client_propagated_consistency->Increment(op_duration_usec); + break; + case COMMIT_WAIT: + metrics->write_op_duration_commit_wait_consistency->Increment(op_duration_usec); + break; + case UNKNOWN_EXTERNAL_CONSISTENCY_MODE: + break; + } + } + } +} + +string WriteTransaction::ToString() const { + MonoTime now(MonoTime::Now(MonoTime::FINE)); + MonoDelta d = now.GetDeltaSince(start_time_); + WallTime abs_time = WallTime_Now() - d.ToSeconds(); + string abs_time_formatted; + StringAppendStrftime(&abs_time_formatted, "%Y-%m-%d %H:%M:%S", (time_t)abs_time, true); + return Substitute("WriteTransaction [type=$0, start_time=$1, state=$2]", + DriverType_Name(type()), abs_time_formatted, state_->ToString()); +} + +WriteTransactionState::WriteTransactionState(TabletPeer* tablet_peer, + const tserver::WriteRequestPB *request, + tserver::WriteResponsePB *response) + : TransactionState(tablet_peer), + request_(request), + response_(response), + mvcc_tx_(nullptr), + schema_at_decode_time_(nullptr) { + if (request) { + external_consistency_mode_ = 
request->external_consistency_mode(); + } else { + external_consistency_mode_ = CLIENT_PROPAGATED; + } +} + + +void WriteTransactionState::SetMvccTxAndTimestamp(gscoped_ptr mvcc_tx) { + DCHECK(!mvcc_tx_) << "Mvcc transaction already started/set."; + if (has_timestamp()) { + DCHECK_EQ(timestamp(), mvcc_tx->timestamp()); + } else { + set_timestamp(mvcc_tx->timestamp()); + } + mvcc_tx_ = mvcc_tx.Pass(); +} + +void WriteTransactionState::set_tablet_components( + const scoped_refptr& components) { + DCHECK(!tablet_components_) << "Already set"; + DCHECK(components); + tablet_components_ = components; +} + +void WriteTransactionState::AcquireSchemaLock(rw_semaphore* schema_lock) { + TRACE("Acquiring schema lock in shared mode"); + shared_lock temp(schema_lock); + schema_lock_.swap(temp); + TRACE("Acquired schema lock"); +} + +void WriteTransactionState::ReleaseSchemaLock() { + shared_lock temp; + schema_lock_.swap(temp); + TRACE("Released schema lock"); +} + +void WriteTransactionState::StartApplying() { + CHECK_NOTNULL(mvcc_tx_.get())->StartApplying(); +} + +void WriteTransactionState::Abort() { + if (mvcc_tx_.get() != nullptr) { + // Abort the transaction. + mvcc_tx_->Abort(); + } + mvcc_tx_.reset(); + + release_row_locks(); + ReleaseSchemaLock(); + + // After committing, we may respond to the RPC and delete the + // original request, so null them out here. + ResetRpcFields(); +} +void WriteTransactionState::Commit() { + if (mvcc_tx_.get() != nullptr) { + // Commit the transaction. + mvcc_tx_->Commit(); + } + mvcc_tx_.reset(); + + // After committing, we may respond to the RPC and delete the + // original request, so null them out here. 
+ ResetRpcFields(); +} + +void WriteTransactionState::ReleaseTxResultPB(TxResultPB* result) const { + result->Clear(); + result->mutable_ops()->Reserve(row_ops_.size()); + for (RowOp* op : row_ops_) { + result->mutable_ops()->AddAllocated(CHECK_NOTNULL(op->result.release())); + } +} + +void WriteTransactionState::UpdateMetricsForOp(const RowOp& op) { + if (op.result->has_failed_status()) { + return; + } + switch (op.decoded_op.type) { + case RowOperationsPB::INSERT: + tx_metrics_.successful_inserts++; + break; + case RowOperationsPB::UPDATE: + tx_metrics_.successful_updates++; + break; + case RowOperationsPB::DELETE: + tx_metrics_.successful_deletes++; + break; + case RowOperationsPB::UNKNOWN: + case RowOperationsPB::SPLIT_ROW: + break; + } +} + +void WriteTransactionState::release_row_locks() { + // free the row locks + for (RowOp* op : row_ops_) { + op->row_lock.Release(); + } +} + +WriteTransactionState::~WriteTransactionState() { + Reset(); +} + +void WriteTransactionState::Reset() { + // We likely shouldn't Commit() here. See KUDU-625. + Commit(); + tx_metrics_.Reset(); + timestamp_ = Timestamp::kInvalidTimestamp; + tablet_components_ = nullptr; + schema_at_decode_time_ = nullptr; +} + +void WriteTransactionState::ResetRpcFields() { + lock_guard l(&txn_state_lock_); + request_ = nullptr; + response_ = nullptr; + STLDeleteElements(&row_ops_); +} + +string WriteTransactionState::ToString() const { + string ts_str; + if (has_timestamp()) { + ts_str = timestamp().ToString(); + } else { + ts_str = ""; + } + + // Stringify the actual row operations (eg INSERT/UPDATE/etc) + // NOTE: we'll eventually need to gate this by some flag if we want to avoid + // user data escaping into the log. See KUDU-387. 
+ string row_ops_str = "["; + { + lock_guard l(&txn_state_lock_); + const size_t kMaxToStringify = 3; + for (int i = 0; i < std::min(row_ops_.size(), kMaxToStringify); i++) { + if (i > 0) { + row_ops_str.append(", "); + } + row_ops_str.append(row_ops_[i]->ToString(*DCHECK_NOTNULL(schema_at_decode_time_))); + } + if (row_ops_.size() > kMaxToStringify) { + row_ops_str.append(", ..."); + } + row_ops_str.append("]"); + } + + return Substitute("WriteTransactionState $0 [op_id=($1), ts=$2, rows=$3]", + this, + op_id().ShortDebugString(), + ts_str, + row_ops_str); +} + +} // namespace tablet +} // namespace kudu diff --git a/src/kudu/tablet/transactions/write_transaction.h b/src/kudu/tablet/transactions/write_transaction.h new file mode 100644 index 000000000000..6ec52e2d69f9 --- /dev/null +++ b/src/kudu/tablet/transactions/write_transaction.h @@ -0,0 +1,277 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_TABLET_WRITE_TRANSACTION_H_ +#define KUDU_TABLET_WRITE_TRANSACTION_H_ + +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/tablet/lock_manager.h" +#include "kudu/tablet/mvcc.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/tablet/transactions/transaction.h" +#include "kudu/util/locks.h" + +namespace kudu { +struct DecodedRowOperation; +class ConstContiguousRow; +class RowwiseRowBlockPB; + +namespace consensus { +class Consensus; +} + +namespace tserver { +class WriteRequestPB; +class WriteResponsePB; +} + +namespace tablet { +struct RowOp; +class RowSetKeyProbe; +struct TabletComponents; + +// A TransactionState for a batch of inserts/mutates. This class holds and +// owns most everything related to a transaction, including: +// - A RowOp structure for each of the rows being inserted or mutated, which itself +// contains: +// - decoded/projected data +// - row lock reference +// - result of this particular insert/mutate operation, once executed +// - the Replicate and Commit PB messages +// +// All the transaction related pointers are owned by this class +// and destroyed on Reset() or by the destructor. +// +// IMPORTANT: All the acquired locks will not be released unless the TransactionState +// is either destroyed or Reset() or release_locks() is called. Beware of this +// or else there will be lock leaks. +// +// Used when logging to WAL in that we keep track of where inserts/updates +// were applied and add that information to the commit message that is stored +// on the WAL. +// +// NOTE: this class isn't thread safe. +class WriteTransactionState : public TransactionState { + public: + WriteTransactionState(TabletPeer* tablet_peer = NULL, + const tserver::WriteRequestPB *request = NULL, + tserver::WriteResponsePB *response = NULL); + virtual ~WriteTransactionState(); + + // Returns the result of this transaction in its protocol buffers form. 
+ // The transaction result holds information on exactly which memory stores + // were mutated in the context of this transaction and can be used to + // perform recovery. + // + // This releases part of the state of the transaction, and will crash + // if called more than once. + void ReleaseTxResultPB(TxResultPB* result) const; + + // Returns the original client request for this transaction, if there was + // one. + const tserver::WriteRequestPB *request() const OVERRIDE { + return request_; + } + + // Returns the prepared response to the client that will be sent when this + // transaction is completed, if this transaction was started by a client. + tserver::WriteResponsePB *response() OVERRIDE { + return response_; + } + + // Set the MVCC transaction associated with this Write operation. + // This must be called exactly once, during the PREPARE phase just + // after the MvccManager has assigned a timestamp. + // This also copies the timestamp from the MVCC transaction into the + // WriteTransactionState object. + void SetMvccTxAndTimestamp(gscoped_ptr mvcc_tx); + + // Set the Tablet components that this transaction will write into. + // Called exactly once at the beginning of Apply, before applying its + // in-memory edits. + void set_tablet_components(const scoped_refptr& components); + + // Take a shared lock on the given schema lock. + // This is required prior to decoding rows so that the schema does + // not change in between performing the projection and applying + // the writes. + void AcquireSchemaLock(rw_semaphore* schema_lock); + + // Release the already-acquired schema lock. 
+ void ReleaseSchemaLock(); + + + void set_schema_at_decode_time(const Schema* schema) { + lock_guard l(&txn_state_lock_); + schema_at_decode_time_ = schema; + } + + const Schema* schema_at_decode_time() const { + lock_guard l(&txn_state_lock_); + return schema_at_decode_time_; + } + + const TabletComponents* tablet_components() const { + return tablet_components_.get(); + } + + // Notifies the MVCC manager that this operation is about to start applying + // its in-memory edits. After this method is called, the transaction _must_ + // Commit() within a bounded amount of time (there may be other threads + // blocked on it). + void StartApplying(); + + // Commits the Mvcc transaction and releases the component lock. After + // this method is called all the inserts and mutations will become + // visible to other transactions. + // + // Only one of Commit() or Abort() should be called. + // REQUIRES: StartApplying() was called. + // + // Note: request_ and response_ are set to NULL after this method returns. + void Commit(); + + // Aborts the mvcc transaction and releases the component lock. + // Only one of Commit() or Abort() should be called. + // + // REQUIRES: StartApplying() must never have been called. + void Abort(); + + // Returns all the prepared row writes for this transaction. Usually called + // on the apply phase to actually make changes to the tablet. + const std::vector& row_ops() const { + return row_ops_; + } + + void swap_row_ops(std::vector* new_ops) { + lock_guard l(&txn_state_lock_); + row_ops_.swap(*new_ops); + } + + void UpdateMetricsForOp(const RowOp& op); + + // Releases all the row locks acquired by this transaction. + void release_row_locks(); + + // Resets this TransactionState, releasing all locks, destroying all prepared + // writes, clearing the transaction result _and_ committing the current Mvcc + // transaction. 
+ void Reset(); + + virtual std::string ToString() const OVERRIDE; + + private: + // Reset the RPC request, response, and row_ops_ (which refers to data + // from the request). + void ResetRpcFields(); + + // pointers to the rpc context, request and response, lifecycle + // is managed by the rpc subsystem. These pointers may be NULL if the + // transaction was not initiated by an RPC call. + const tserver::WriteRequestPB* request_; + tserver::WriteResponsePB* response_; + + // The row operations which are decoded from the request during PREPARE + // Protected by superclass's txn_state_lock_. + std::vector row_ops_; + + // The MVCC transaction, set up during PREPARE phase + gscoped_ptr mvcc_tx_; + + // The tablet components, acquired at the same time as mvcc_tx_ is set. + scoped_refptr tablet_components_; + + // A lock held on the tablet's schema. Prevents concurrent schema change + // from racing with a write. + shared_lock schema_lock_; + + // The Schema of the tablet when the transaction was first decoded. + // This is verified at APPLY time to ensure we don't have races against + // schema change. + // Protected by superclass's txn_state_lock_. + const Schema* schema_at_decode_time_; + + DISALLOW_COPY_AND_ASSIGN(WriteTransactionState); +}; + +// Executes a write transaction. +class WriteTransaction : public Transaction { + public: + WriteTransaction(WriteTransactionState* tx_state, consensus::DriverType type); + + virtual WriteTransactionState* state() OVERRIDE { return state_.get(); } + virtual const WriteTransactionState* state() const OVERRIDE { return state_.get(); } + + void NewReplicateMsg(gscoped_ptr* replicate_msg) OVERRIDE; + + // Executes a Prepare for a write transaction + // + // Decodes the operations in the request PB and acquires row locks for each of the + // affected rows. This results in adding 'RowOp' objects for each of the operations + // into the WriteTransactionState. 
+ virtual Status Prepare() OVERRIDE; + + // Actually starts the Mvcc transaction and assigns a timestamp to this transaction. + virtual Status Start() OVERRIDE; + + // Executes an Apply for a write transaction. + // + // Actually applies inserts/mutates into the tablet. After these start being + // applied, the transaction must run to completion as there is currently no + // means of undoing an update. + // + // After completing the inserts/mutates, the row locks and the mvcc transaction + // can be released, allowing other transactions to update the same rows. + // However the component lock must not be released until the commit msg, which + // indicates where each of the inserts/mutates were applied, is persisted to + // stable storage. Because of this ApplyTask must enqueue a CommitTask before + // releasing both the row locks and deleting the MvccTransaction as we need to + // make sure that Commits that touch the same set of rows are persisted in + // order, for recovery. + // This, of course, assumes that commits are executed in the same order they + // are placed in the queue (but not necessarily in the same order of the + // original requests) which is already a requirement of the consensus + // algorithm. + virtual Status Apply(gscoped_ptr* commit_msg) OVERRIDE; + + // Releases the row locks (Early Lock Release). + virtual void PreCommit() OVERRIDE; + + // If result == COMMITTED, commits the mvcc transaction and updates + // the metrics, if result == ABORTED aborts the mvcc transaction. 
+ virtual void Finish(TransactionResult result) OVERRIDE; + + virtual std::string ToString() const OVERRIDE; + + private: + // this transaction's start time + MonoTime start_time_; + + gscoped_ptr state_; + + private: + DISALLOW_COPY_AND_ASSIGN(WriteTransaction); +}; + +} // namespace tablet +} // namespace kudu + +#endif /* KUDU_TABLET_WRITE_TRANSACTION_H_ */ diff --git a/src/kudu/tablet/triggering-maintenance-ops.txt b/src/kudu/tablet/triggering-maintenance-ops.txt new file mode 100644 index 000000000000..b9922fe515ab --- /dev/null +++ b/src/kudu/tablet/triggering-maintenance-ops.txt @@ -0,0 +1,211 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +=============================================================================== +Maintenance Op Scheduling +=============================================================================== + +For the purpose of this document, "maintenance operations" are any background +processes that Kudu runs in the course of normal operation. The +MaintenanceManager must schedule these operations intelligently to keep the +system operating smoothly. Partly, this is a tradeoff between current +performance and future performance. For example, running a compaction will +spend some I/O now in order to speed up insertions later. Partly, this is a +matter of performing necessary tasks that, if left undone, would compromise the +stability of the system. For example, if we never flushed MemRowSets, we would +eventually run out of memory. 
As memory gets low, admissions control will slow +the pace of new requests getting accepted. + + +Decision Criteria +=============================================================================== +The most important things that we need to weigh in order to make good decisions +are: +1. memory usage +2. tablet statistics +3. the age of memrowsets + +Some other criteria that we considered, but rejected for v1 include: +1. free disk space. +2. load-balancing between disks or disksets which will be touched by +maintenance operations + +Free disk space should not be an issue in most competently administered setups. +We may revisit this later, but for the initial version, it is best to assume we +have enough space. + +We can't consider disk-based scheduling right now since we don't have support +for multiple disks yet. + + +Memory usage +------------------------------------------------------------------------------- +Memory usage can be broken down into a few buckets: +1. System overhead (C++ data structures, operating system overheads, and so +forth). +2. MemRowSets +3. The LRU block cache + +We assume that #1 is relatively constant. The maintenance op scheduler can +make tradeoffs between #2 and #3 by deciding to flush certain MemRowSets to +disk. + +We want to keep the total amount of memory held by #1, #2 and #3 from growing +too large. For now, our goal is to keep this sum relatively constant. We have +not yet implemented giving memory held by tcmalloc back to the operating system. + + +Tablet Statistics +------------------------------------------------------------------------------- +If we know that a tablet's workload is scan-heavy (rather than insert-heavy), +we may wish to do a major delta compaction for that tablet to speed up scans. +It's probably smarter to do compactions on tables that are heavily used, than +on obscure tables that don't see much traffic. 
+ +This is probably the most difficult information source to make effective use +of, simply because it involves many workload-dependent assumptions and +heuristics. + + +The Age of MemRowSet objects +------------------------------------------------------------------------------- +MemRowSet and DeltaMemRowSet objects must be flushed to disk when they get too +old. If we don't do this, the write-ahead log (WAL) will grow without bound. +This growth would waste disk space and slow startup to a crawl, since the +entire WAL must be traversed during the startup process. + +We should embed a WAL op id in each MemRowSet and DeltaMemRowSet. The +scheduler will look more favorably on the flushing of a MemRowSet as it ages. +After the operation id falls too far behind, it will try to flush the MemRowSet +no matter what. + + +Maintenance Operation types +=============================================================================== + +Maintenance operations to reduce memory usage +---------------------------------------- + +These operations spend some I/O or CPU in order to free up memory usage. They +may also incur further performance costs after completion. These cannot be +delayed indefinitely, as RAM is a finite resource. + + +MemStore Flush +------------------------------ +Cost: +- Sequential I/O now (writing the actual memstore contents to disk) +- Sequential I/O later (frequent small flushes will cost more compactions down the road) + +Benefit: +- RAM: frees up memory + +Other/wash: +- At first glance, flushing might seem to increase cost of further insert/updates + because it adds a new RowSet. However, because memstores are not compressed in + any way, typically the newly flushed RowSet will be much smaller on disk than the + memstore that it came from. This means that, even if we have to cache the whole + result RowSet in the block cache, we're making much more effective use of RAM and + thus may _reduce_ the total number of actual I/Os. 
+ + +DeltaMemStore Flush +------------------------------ +Basically the same costs as MemStore flush + +Additional benefits: +TODO: flushing may also speed up scans substantially. Need to run experiments on this -- +how much better is scanning a static cached file compared to scanning the equivalent +memstore. Maybe an order of magnitude. + + +LRU cache eviction +------------------------------ +Cost: slower reads, slower inserts if evicting key columns or blooms +Benefit: frees RAM + + + + +Maintenance operations to manage future performance +---------------------------------------- + +These operations expend some kind of I/O and CPU now in order to improve the performance +of the system after they complete. They are only ever "necessary" in that if we put them +off forever, the system will slow to a crawl eventually. + + +Merging Compaction +------------------------------ +Cost: +- Sequential I/O now (reading input, re-writing output) + +Benefit: +- reduce the number of RowSets: speeds up inserts, updates. Speeds up short scans where blooms don't apply. + + +Minor Delta Compaction +------------------------------ +Cost: +- Sequential I/O (reading input, re-writing output) + +Benefit: +- Speeds up scans -- fewer delta trackers to apply +- May save disk space (eg when snapshot isolation is implemented, old version updates may be discarded) + + +Major delta compaction +------------------------------ +Cost: +- Sequential I/O (reading input, re-writing output) + +Benefit: +- Speeds up scans -- fewer delta trackers to apply, fewer total rows with deltas to apply. +- Save disk space (eg when snapshot isolation is implemented, old version updates may be discarded) + +Relevant metrics: +- for each column, % of rows in RowSet which have been updated +- for each column, % of deltas which could be fully merged +- workload: scan heavy vs insert/update heavy? 
+ + +Implementation Considerations +=============================================================================== +Each tablet creates several MaintenanceOp objects, representing the various +maintenance operations which can be performed on it. It registers these +operations with the MaintenanceManager. + +The MaintenanceManager has a main thread which periodically polls the +registered MaintenanceOp objects and determines whether it should execute any +of them. The default polling interval is 250 ms, but this is configurable. +Access to the MaintenanceOp is assumed to be thread-safe. It's important to +note that the scheduler can choose any op available to it. It is not bound to +execute operations on a first-come, first-served basis. + +If the MaintenanceManager decides to execute one of these operations, it will +run it in a thread-pool of configurable size. We assume that maintenance +operations are blocking and require a thread context. If the operation fails, +the MaintenanceManager will log a warning message and re-trigger the main +thread. The failed MaintenanceOp will not be retried until a configurable +grace period has expired. + +The MaintenanceOp has various fields indicating how much memory it will +probably free, how much CPU it will use, and so forth. It also has a field +which marks it as not currently executable. For example, this may be used by +some Ops that don't want multiple instances of themselves to run concurrently. + +We want to keep at least one thread free to run flush operations, so that we +don't ever get into a situation where we need to free up memory, but all the +maintenance op threads are working on compactions or other operations. +Hopefully, most compactions will be reasonably short, so that we won't have to +schedule long compactions differently than short ones. 
diff --git a/src/kudu/tools/CMakeLists.txt b/src/kudu/tools/CMakeLists.txt new file mode 100644 index 000000000000..1a6a57015998 --- /dev/null +++ b/src/kudu/tools/CMakeLists.txt @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(LINK_LIBS + kudu_client + log + consensus + tserver + kudu_common + kudu_fs + kudu_util + gutil + cfile + tablet + ${KUDU_BASE_LIBS} +) + +add_library(kudu_tools_util + data_gen_util.cc) +target_link_libraries(kudu_tools_util + ${LINK_LIBS}) + +add_executable(create-demo-table create-demo-table.cc) +target_link_libraries(create-demo-table + ${LINK_LIBS}) + +add_executable(insert-generated-rows insert-generated-rows.cc) +target_link_libraries(insert-generated-rows + kudu_tools_util + ${LINK_LIBS}) + +add_executable(kudu-admin kudu-admin.cc) +target_link_libraries(kudu-admin + ${LINK_LIBS}) + +add_executable(kudu-ts-cli ts-cli.cc) +target_link_libraries(kudu-ts-cli + ${LINK_LIBS}) + +add_library(fs_tool fs_tool.cc) +target_link_libraries(fs_tool + gutil + kudu_common + server_common + consensus + tablet) + +add_executable(kudu-fs_list fs_list-tool.cc) +target_link_libraries(kudu-fs_list + ${LINK_LIBS} + fs_tool) + +add_executable(kudu-fs_dump fs_dump-tool.cc) 
+target_link_libraries(kudu-fs_dump + ${LINK_LIBS} + fs_tool) + +add_library(ksck + ksck.cc + ksck_remote.cc +) +target_link_libraries(ksck + master_proto + server_base_proto + tserver_proto + tserver_service_proto + ${KUDU_BASE_LIBS} +) + +add_executable(kudu-ksck kudu-ksck.cc) +target_link_libraries(kudu-ksck + ksck +) + +add_executable(kudu-pbc-dump pbc-dump.cc) +target_link_libraries(kudu-pbc-dump + ${LINK_LIBS} +) + +set(KUDU_TEST_LINK_LIBS + ksck + kudu_tools_util + integration-tests + ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(ksck-test) +ADD_KUDU_TEST(ksck_remote-test) +ADD_KUDU_TEST(kudu-admin-test) +ADD_KUDU_TEST_DEPENDENCIES(kudu-admin-test + kudu-admin) +ADD_KUDU_TEST(kudu-ts-cli-test) +ADD_KUDU_TEST_DEPENDENCIES(kudu-ts-cli-test + kudu-ts-cli) + diff --git a/src/kudu/tools/README.systemtap b/src/kudu/tools/README.systemtap new file mode 100644 index 000000000000..d2ecb2ee34c6 --- /dev/null +++ b/src/kudu/tools/README.systemtap @@ -0,0 +1,53 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SystemTap +--------- + +SystemTap is a robust system for tracing kernel and user events. + + +Requirements +------------ +On Ubuntu systems (written against 14.04): +1. Install the 'systemtap' package. +2. Optionally install the 'systemtap-doc' package for some useful manpages + (like 'stapprobes' and 'stapfuncs'). +3. Run stap-prep. It'll probably tell you to do the following: + a. Install the debug symbols for your kernel. 
Follow the instructions at + https://wiki.ubuntu.com/DebuggingProgramCrash#Debug_Symbol_Packages to + install the right package. It should be named something like + linux-image-3.13.0-36-generic-dbgsym. + b. Add yourself to both the 'stapusr' and 'stapdev' groups. This isn't + strictly required; you can run SystemTap scripts as root. + +On RHEL systems (written against CentOS 6.4): +1. Install the 'systemtap' package. +2. Install the debug symbols for your kernel. Follow the instructions at + http://fendertech.blogspot.com/2013/04/centos-install-kernel-debuginfo.html + to add the appropriate repo, then install the right package. It should be + named something like kernel-debuginfo-2.6.32-358.23.2.el6.x86_64. +3. Add yourself to both the 'stapusr' and 'stapdev' groups. This isn't + strictly required; you can run SystemTap scripts as root. + +Running +------- +Some SystemTap scripts have a hashbang so that they can be run directly. +Others need to be run with the 'stap' binary. + +Useful options include: + -o FILE: send standard output to the given file. + -c CMD: run the command, set the probe target to the command's PID, and exit + when it finishes. Relevant for scripts that filter on target(). + -x PID: set the probe target to PID. Relevant for scripts that filter on + target(). diff --git a/src/kudu/tools/create-demo-table.cc b/src/kudu/tools/create-demo-table.cc new file mode 100644 index 000000000000..d43da3fe5ae0 --- /dev/null +++ b/src/kudu/tools/create-demo-table.cc @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Simple tool to send an CREATE TABLE request for one of the demo tablets. +// This will eventually be replaced by a proper shell -- just a quick +// hack for easy demo purposes. + +#include +#include +#include +#include + +#include "kudu/benchmarks/tpch/tpch-schemas.h" +#include "kudu/benchmarks/ycsb-schema.h" +#include "kudu/client/client.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/twitter-demo/twitter-schema.h" +#include "kudu/util/env.h" +#include "kudu/util/faststring.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" + +using kudu::client::KuduClient; +using kudu::client::KuduClientBuilder; +using kudu::client::KuduSchema; +using kudu::client::KuduTableCreator; +using kudu::client::sp::shared_ptr; +using kudu::rpc::RpcController; +using std::string; +using std::vector; + +DEFINE_string(master_address, "localhost", + "Comma separated list of master addresses to run against."); + +static const char* const kTwitterTabletId = "twitter"; +static const char* const kTPCH1TabletId = "tpch1"; +static const char* const kYCSBTabletId = "ycsb"; + +namespace kudu { + +void PrintUsage(char** argv) { + std::cerr << "usage: " << argv[0] << " " + << kTwitterTabletId << "|" + << kTPCH1TabletId << "|" + << kYCSBTabletId + << std::endl; +} + +string LoadFile(const string& path) { + faststring buf; + CHECK_OK(ReadFileToString(Env::Default(), path, &buf)); + return buf.ToString(); +} + +// TODO: refactor this and the associated constants into 
some sort of +// demo-tables.h class in a src/demos/ directory. +Status GetDemoSchema(const string& table_name, KuduSchema* schema) { + if (table_name == kTwitterTabletId) { + *schema = twitter_demo::CreateTwitterSchema(); + } else if (table_name == kTPCH1TabletId) { + *schema = tpch::CreateLineItemSchema(); + } else if (table_name == kYCSBTabletId) { + *schema = kudu::CreateYCSBSchema(); + } else { + return Status::InvalidArgument("Invalid demo table name", table_name); + } + return Status::OK(); +} + +static int CreateDemoTable(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + if (argc != 2) { + PrintUsage(argv); + return 1; + } + InitGoogleLoggingSafe(argv[0]); + FLAGS_logtostderr = true; + + string table_name = argv[1]; + + vector addrs = strings::Split(FLAGS_master_address, ","); + CHECK(!addrs.empty()) << "At least one master address must be specified!"; + + KuduSchema schema; + CHECK_OK(GetDemoSchema(table_name, &schema)); + + // Set up client. + shared_ptr client; + CHECK_OK(KuduClientBuilder() + .master_server_addrs(addrs) + .Build(&client)); + + gscoped_ptr table_creator(client->NewTableCreator()); + CHECK_OK(table_creator->table_name(table_name) + .schema(&schema) + .Create()); + return 0; +} + +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::CreateDemoTable(argc, argv); +} diff --git a/src/kudu/tools/data_gen_util.cc b/src/kudu/tools/data_gen_util.cc new file mode 100644 index 000000000000..bb13d6ce8a8a --- /dev/null +++ b/src/kudu/tools/data_gen_util.cc @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/tools/data_gen_util.h" + +#include "kudu/client/schema.h" +#include "kudu/common/partial_row.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/util/random.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tools { + +void WriteValueToColumn(const client::KuduSchema& schema, + int col_idx, + uint64_t value, + KuduPartialRow* row) { + client::KuduColumnSchema::DataType type = schema.Column(col_idx).type(); + char buf[kFastToBufferSize]; + switch (type) { + case client::KuduColumnSchema::INT8: + CHECK_OK(row->SetInt8(col_idx, value)); + break; + case client::KuduColumnSchema::INT16: + CHECK_OK(row->SetInt16(col_idx, value)); + break; + case client::KuduColumnSchema::INT32: + CHECK_OK(row->SetInt32(col_idx, value)); + break; + case client::KuduColumnSchema::INT64: + CHECK_OK(row->SetInt64(col_idx, value)); + break; + case client::KuduColumnSchema::FLOAT: + CHECK_OK(row->SetFloat(col_idx, value / 123.0)); + break; + case client::KuduColumnSchema::DOUBLE: + CHECK_OK(row->SetDouble(col_idx, value / 123.0)); + break; + case client::KuduColumnSchema::STRING: + CHECK_OK(row->SetStringCopy(col_idx, FastHex64ToBuffer(value, buf))); + break; + case client::KuduColumnSchema::BOOL: + CHECK_OK(row->SetBool(col_idx, value)); + break; + default: + LOG(FATAL) << "Unexpected data type: " << type; + } +} + +void GenerateDataForRow(const client::KuduSchema& schema, uint64_t record_id, + Random* random, KuduPartialRow* row) { + for (int col_idx = 0; col_idx < schema.num_columns(); col_idx++) { + // We randomly generate the inserted 
data, except for the first column, + // which is always based on a monotonic "record id". + uint64_t value; + if (col_idx == 0) { + value = record_id; + } else { + value = random->Next64(); + } + WriteValueToColumn(schema, col_idx, value, row); + } +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/data_gen_util.h b/src/kudu/tools/data_gen_util.h new file mode 100644 index 000000000000..826543c5240b --- /dev/null +++ b/src/kudu/tools/data_gen_util.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Utility functions for generating data for use by tools and tests. + +#ifndef KUDU_TOOLS_DATA_GEN_UTIL_H_ +#define KUDU_TOOLS_DATA_GEN_UTIL_H_ + +#include + +namespace kudu { +class KuduPartialRow; +class Random; + +namespace client { +class KuduSchema; +} // namespace client + +namespace tools { + +// Detect the type of the given column and coerce the given number value in +// 'value' to the data type of that column. +// At the time of this writing, we only support ints, bools, and strings. +// For the numbers / bool, the value is truncated to fit the data type. +// For the string, we encode the number as hex. 
+void WriteValueToColumn(const client::KuduSchema& schema, + int col_idx, + uint64_t value, + KuduPartialRow* row); + +// Generate row data for an arbitrary schema. Initial column value determined +// by the value of 'record_id'. +void GenerateDataForRow(const client::KuduSchema& schema, uint64_t record_id, + Random* random, KuduPartialRow* row); + +} // namespace tools +} // namespace kudu + +#endif // KUDU_TOOLS_DATA_GEN_UTIL_H_ diff --git a/src/kudu/tools/fs_dump-tool.cc b/src/kudu/tools/fs_dump-tool.cc new file mode 100644 index 000000000000..fe4561493e6b --- /dev/null +++ b/src/kudu/tools/fs_dump-tool.cc @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Tool to dump tablets, rowsets, and blocks + +#include "kudu/tools/fs_tool.h" + +#include +#include +#include +#include + +#include +#include + +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" + +DEFINE_int32(nrows, 0, "Number of rows to dump"); +DEFINE_bool(metadata_only, false, "Whether just to dump the block metadata, " + "when printing blocks."); + +/* + TODO: support specifying start and end keys + + DEFINE_string(start_key, "", "Start key for rows to dump"); + DEFINE_string(end_key, "", "Start key for rows to dump"); +*/ + +DEFINE_bool(headers_only, false, "Don't dump contents, dump headers only"); + +namespace kudu { +namespace tools { + +using std::string; +using std::vector; +using strings::Substitute; + +namespace { + +enum CommandType { + DUMP_TABLET_BLOCKS, + DUMP_TABLET_DATA, + DUMP_ROWSET, + DUMP_CFILE_BLOCK, + PRINT_TABLET_META, + PRINT_UUID, +}; + +struct CommandHandler { + CommandType type_; + string name_; + string desc_; + + CommandHandler(CommandType type, string name, string desc) + : type_(type), name_(std::move(name)), desc_(std::move(desc)) {} +}; + +const vector kCommandHandlers = { + CommandHandler(DUMP_TABLET_DATA, "dump_tablet_data", + "Dump a tablet's data (requires a tablet id)"), + CommandHandler(DUMP_TABLET_BLOCKS, "dump_tablet_blocks", + "Dump a tablet's constituent blocks (requires a tablet id)"), + CommandHandler(DUMP_ROWSET, "dump_rowset", + "Dump a rowset (requires a tablet id and an index)"), + CommandHandler(DUMP_CFILE_BLOCK, "dump_block", + "Dump a cfile block (requires a block id)"), + CommandHandler(PRINT_TABLET_META, "print_meta", + "Print a tablet metadata (requires a tablet id)"), + CommandHandler(PRINT_UUID, "print_uuid", + "Print the UUID (master or TS) to whom the data belongs") }; + +void PrintUsageToStream(const std::string& prog_name, std::ostream* out) { + *out << "Usage: " << prog_name + << " [-headers_only] 
[-nrows ] " + << "-fs_wal_dir -fs_data_dirs " + << std::endl << std::endl; + *out << "Commands: " << std::endl; + for (const CommandHandler& handler : kCommandHandlers) { + *out << handler.name_ << ": " << handler.desc_ << std::endl; + } +} +void Usage(const string& prog_name, const string& msg) { + std::cerr << "Error " << prog_name << ": " << msg << std::endl; + PrintUsageToStream(prog_name, &std::cerr); +} + +bool ValidateCommand(int argc, char** argv, CommandType* out) { + if (argc < 2) { + Usage(argv[0], "At least one command must be specified!"); + return false; + } + for (const CommandHandler& handler : kCommandHandlers) { + if (argv[1] == handler.name_) { + *out = handler.type_; + return true; + } + } + Usage("Invalid command specified: ", argv[1]); + return false; +} + +} // anonymous namespace + +static int FsDumpToolMain(int argc, char** argv) { + FLAGS_logtostderr = 1; + std::stringstream usage_str; + PrintUsageToStream(argv[0], &usage_str); + google::SetUsageMessage(usage_str.str()); + ParseCommandLineFlags(&argc, &argv, true); + InitGoogleLoggingSafe(argv[0]); + + CommandType cmd; + if (!ValidateCommand(argc, argv, &cmd)) { + return 2; + } + + FsTool fs_tool(FLAGS_headers_only ? 
FsTool::HEADERS_ONLY : FsTool::MAXIMUM); + CHECK_OK(fs_tool.Init()); + + DumpOptions opts; + // opts.start_key = FLAGS_start_key; + // opts.end_key = FLAGS_end_key; + opts.nrows = FLAGS_nrows; + opts.metadata_only = FLAGS_metadata_only; + + switch (cmd) { + case DUMP_TABLET_DATA: + case DUMP_TABLET_BLOCKS: + { + if (argc < 3) { + Usage(argv[0], + Substitute("dump_tablet requires tablet id: $0 " + "dump_tablet ", + argv[0])); + return 2; + } + if (cmd == DUMP_TABLET_DATA) { + CHECK_OK(fs_tool.DumpTabletData(argv[2])); + } else if (cmd == DUMP_TABLET_BLOCKS) { + CHECK_OK(fs_tool.DumpTabletBlocks(argv[2], opts, 0)); + } + break; + } + + case DUMP_ROWSET: { + if (argc < 4) { + Usage(argv[0], + Substitute("dump_rowset requires tablet id and rowset index: $0" + "dump_rowset ", + argv[0])); + return 2; + } + uint32_t rowset_idx; + CHECK(safe_strtou32(argv[3], &rowset_idx)) + << "Invalid index specified: " << argv[2]; + CHECK_OK(fs_tool.DumpRowSet(argv[2], rowset_idx, opts, 0)); + break; + } + case DUMP_CFILE_BLOCK: { + if (argc < 3) { + Usage(argv[0], + Substitute("dump_block requires a block id: $0" + "dump_block ", argv[0])); + return 2; + } + CHECK_OK(fs_tool.DumpCFileBlock(argv[2], opts, 0)); + break; + } + case PRINT_TABLET_META: { + if (argc < 3) { + Usage(argv[0], Substitute("print_meta requires a tablet id: $0" + "print_meta ", argv[0])); + return 2; + } + CHECK_OK(fs_tool.PrintTabletMeta(argv[2], 0)); + break; + } + case PRINT_UUID: { + if (argc < 2) { + Usage(argv[0], Substitute("$0 print_uuid", argv[0])); + return 2; + } + CHECK_OK(fs_tool.PrintUUID(0)); + break; + } + } + + return 0; +} + +} // namespace tools +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::tools::FsDumpToolMain(argc, argv); +} diff --git a/src/kudu/tools/fs_list-tool.cc b/src/kudu/tools/fs_list-tool.cc new file mode 100644 index 000000000000..228967ceb577 --- /dev/null +++ b/src/kudu/tools/fs_list-tool.cc @@ -0,0 +1,153 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Tool to list local files and directories + +#include "kudu/tools/fs_tool.h" + +#include +#include +#include + +#include +#include + +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" + +DEFINE_bool(verbose, false, + "Print additional information (e.g., log segment headers)"); + +namespace kudu { +namespace tools { + +using std::string; +using std::vector; + +namespace { + +enum CommandType { + FS_TREE = 1, + LIST_LOGS = 2, + LIST_TABLETS = 3, + LIST_BLOCKS = 4 +}; + +// TODO: extract and generalized the "verb" handling code with other +// tools such that it can be shared with other tools. + +struct CommandHandler { + CommandType type_; + string name_; + string desc_; + + CommandHandler(CommandType type, string name, string desc) + : type_(type), name_(std::move(name)), desc_(std::move(desc)) {} +}; + +const vector kCommandHandlers = { + CommandHandler(FS_TREE, "tree", "Print out a file system tree." ), + CommandHandler(LIST_LOGS, "list_logs", + "List file system logs (optionally accepts a tablet id)."), + CommandHandler(LIST_TABLETS, "list_tablets", "List tablets." 
), + CommandHandler(LIST_BLOCKS, "list_blocks", + "List block for tablet (optionally accepts a tablet id).") }; + +void PrintUsageToStream(const string& prog_name, std::ostream* out) { + *out << "Usage: " << prog_name << " [-verbose] " + << "-fs_wal_dir -fs_data_dirs [option] " + << std::endl << std::endl + << "Commands: " << std::endl; + for (const CommandHandler& handler : kCommandHandlers) { + *out << handler.name_ << ": " << handler.desc_ << std::endl; + } +} + +void Usage(const string& prog_name, const string& msg) { + std::cerr << "Error " << prog_name << ": " << msg << std::endl + << std::endl; + PrintUsageToStream(prog_name, &std::cerr); +} + +bool ValidateCommand(int argc, char** argv, CommandType* out) { + if (argc < 2) { + Usage(argv[0], "At least one command must be specified!"); + return false; + } + for (const CommandHandler& handler : kCommandHandlers) { + if (argv[1] == handler.name_) { + *out = handler.type_; + return true; + } + } + Usage("Invalid command specified ", argv[1]); + return false; +} + +} // anonymous namespace + +static int FsListToolMain(int argc, char** argv) { + FLAGS_logtostderr = 1; + std::stringstream usage_str; + PrintUsageToStream(argv[0], &usage_str); + google::SetUsageMessage(usage_str.str()); + ParseCommandLineFlags(&argc, &argv, true); + InitGoogleLoggingSafe(argv[0]); + + CommandType cmd; + if (!ValidateCommand(argc, argv, &cmd)) { + return 2; + } + + FsTool fs_tool(FLAGS_verbose ? 
FsTool::HEADERS_ONLY : FsTool::MINIMUM); + CHECK_OK_PREPEND(fs_tool.Init(), "Error initializing file system tool"); + + switch (cmd) { + case FS_TREE: { + CHECK_OK(fs_tool.FsTree()); + break; + } + case LIST_LOGS: { + if (argc > 2) { + CHECK_OK(fs_tool.ListLogSegmentsForTablet(argv[2])); + } else { + CHECK_OK(fs_tool.ListAllLogSegments()); + } + break; + } + case LIST_TABLETS: { + CHECK_OK(fs_tool.ListAllTablets()); + break; + } + case LIST_BLOCKS: { + if (argc > 2) { + CHECK_OK(fs_tool.ListBlocksForTablet(argv[2])); + } else { + CHECK_OK(fs_tool.ListBlocksForAllTablets()); + } + } + } + + return 0; +} + +} // namespace tools +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::tools::FsListToolMain(argc, argv); +} diff --git a/src/kudu/tools/fs_tool.cc b/src/kudu/tools/fs_tool.cc new file mode 100644 index 000000000000..f2c2ae8138cd --- /dev/null +++ b/src/kudu/tools/fs_tool.cc @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tools/fs_tool.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "kudu/cfile/cfile_reader.h" +#include "kudu/common/rowblock.h" +#include "kudu/common/row_changelist.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/tablet/cfile_set.h" +#include "kudu/tablet/deltafile.h" +#include "kudu/tablet/tablet.h" +#include "kudu/util/env.h" +#include "kudu/util/logging.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tools { + +using cfile::CFileIterator; +using cfile::CFileReader; +using cfile::DumpIterator; +using cfile::DumpIteratorOptions; +using cfile::ReaderOptions; +using fs::ReadableBlock; +using log::LogReader; +using log::ReadableLogSegment; +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; +using tablet::CFileSet; +using tablet::DeltaFileReader; +using tablet::DeltaIterator; +using tablet::DeltaKeyAndUpdate; +using tablet::DeltaType; +using tablet::MvccSnapshot; +using tablet::RowSetMetadata; +using tablet::Tablet; +using tablet::TabletMetadata; + +static const char* const kSeparatorLine = + "----------------------------------------------------------------------\n"; + +namespace { +string Indent(int indent) { + return string(indent, ' '); +} + +string IndentString(const string& s, int indent) { + return Indent(indent) + StringReplace(s, "\n", "\n" + Indent(indent), true); +} +} // anonymous namespace + +FsTool::FsTool(DetailLevel detail_level) + : initialized_(false), + detail_level_(detail_level) { +} + +FsTool::~FsTool() { +} + +Status FsTool::Init() { + CHECK(!initialized_) << "Already initialized"; + // Allow read-only access to live blocks. 
+ FsManagerOpts opts; + opts.read_only = true; + fs_manager_.reset(new FsManager(Env::Default(), opts)); + RETURN_NOT_OK(fs_manager_->Open()); + + LOG(INFO) << "Opened file system with uuid: " << fs_manager_->uuid(); + + initialized_ = true; + return Status::OK(); +} + +Status FsTool::FsTree() { + DCHECK(initialized_); + + fs_manager_->DumpFileSystemTree(std::cout); + return Status::OK(); +} + +Status FsTool::ListAllLogSegments() { + DCHECK(initialized_); + + string wals_dir = fs_manager_->GetWalsRootDir(); + if (!fs_manager_->Exists(wals_dir)) { + return Status::Corruption(Substitute( + "root log directory '$0' does not exist", wals_dir)); + } + + std::cout << "Root log directory: " << wals_dir << std::endl; + + vector children; + RETURN_NOT_OK_PREPEND(fs_manager_->ListDir(wals_dir, &children), + "Could not list log directories"); + for (const string& child : children) { + if (HasPrefixString(child, ".")) { + // Hidden files or ./.. + VLOG(1) << "Ignoring hidden file in root log directory " << child; + continue; + } + string path = JoinPathSegments(wals_dir, child); + if (HasSuffixString(child, FsManager::kWalsRecoveryDirSuffix)) { + std::cout << "Log recovery dir found: " << path << std::endl; + } else { + std::cout << "Log directory: " << path << std::endl; + } + RETURN_NOT_OK(ListSegmentsInDir(path)); + } + return Status::OK(); +} + +Status FsTool::ListLogSegmentsForTablet(const string& tablet_id) { + DCHECK(initialized_); + + string tablet_wal_dir = fs_manager_->GetTabletWalDir(tablet_id); + if (!fs_manager_->Exists(tablet_wal_dir)) { + return Status::NotFound(Substitute("tablet '$0' has no logs in wals dir '$1'", + tablet_id, tablet_wal_dir)); + } + std::cout << "Tablet WAL dir found: " << tablet_wal_dir << std::endl; + RETURN_NOT_OK(ListSegmentsInDir(tablet_wal_dir)); + string recovery_dir = fs_manager_->GetTabletWalRecoveryDir(tablet_id); + if (fs_manager_->Exists(recovery_dir)) { + std::cout << "Recovery dir found: " << recovery_dir << std::endl; + 
RETURN_NOT_OK(ListSegmentsInDir(recovery_dir)); + } + return Status::OK(); +} + + +Status FsTool::ListAllTablets() { + DCHECK(initialized_); + + vector tablets; + RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablets)); + for (const string& tablet : tablets) { + if (detail_level_ >= HEADERS_ONLY) { + std::cout << "Tablet: " << tablet << std::endl; + RETURN_NOT_OK(PrintTabletMeta(tablet, 2)); + } else { + std::cout << "\t" << tablet << std::endl; + } + } + return Status::OK(); +} + +Status FsTool::ListSegmentsInDir(const string& segments_dir) { + vector segments; + RETURN_NOT_OK_PREPEND(fs_manager_->ListDir(segments_dir, &segments), + "Unable to list log segments"); + std::cout << "Segments in " << segments_dir << ":" << std::endl; + for (const string& segment : segments) { + if (!log::IsLogFileName(segment)) { + continue; + } + if (detail_level_ >= HEADERS_ONLY) { + std::cout << "Segment: " << segment << std::endl; + string path = JoinPathSegments(segments_dir, segment); + RETURN_NOT_OK(PrintLogSegmentHeader(path, 2)); + } else { + std::cout << "\t" << segment << std::endl; + } + } + return Status::OK(); +} + +Status FsTool::PrintLogSegmentHeader(const string& path, + int indent) { + scoped_refptr segment; + Status s = ReadableLogSegment::Open(fs_manager_->env(), + path, + &segment); + + if (s.IsUninitialized()) { + LOG(ERROR) << path << " is not initialized: " << s.ToString(); + return Status::OK(); + } + if (s.IsCorruption()) { + LOG(ERROR) << path << " is corrupt: " << s.ToString(); + return Status::OK(); + } + RETURN_NOT_OK_PREPEND(s, "Unexpected error reading log segment " + path); + + std::cout << Indent(indent) << "Size: " + << HumanReadableNumBytes::ToStringWithoutRounding(segment->file_size()) + << std::endl; + std::cout << Indent(indent) << "Header: " << std::endl; + std::cout << IndentString(segment->header().DebugString(), indent); + return Status::OK(); +} + +Status FsTool::PrintTabletMeta(const string& tablet_id, int indent) { + scoped_refptr meta; + 
RETURN_NOT_OK(TabletMetadata::Load(fs_manager_.get(), tablet_id, &meta)); + + const Schema& schema = meta->schema(); + + std::cout << Indent(indent) << "Partition: " + << meta->partition_schema().PartitionDebugString(meta->partition(), meta->schema()) + << std::endl; + std::cout << Indent(indent) << "Table name: " << meta->table_name() + << " Table id: " << meta->table_id() << std::endl; + std::cout << Indent(indent) << "Schema (version=" << meta->schema_version() << "): " + << schema.ToString() << std::endl; + + tablet::TabletSuperBlockPB pb; + RETURN_NOT_OK_PREPEND(meta->ToSuperBlock(&pb), "Could not get superblock"); + std::cout << "Superblock:\n" << pb.DebugString() << std::endl; + + return Status::OK(); +} + +Status FsTool::ListBlocksForAllTablets() { + DCHECK(initialized_); + + vector tablets; + RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablets)); + for (string tablet : tablets) { + RETURN_NOT_OK(ListBlocksForTablet(tablet)); + } + return Status::OK(); +} + +Status FsTool::ListBlocksForTablet(const string& tablet_id) { + DCHECK(initialized_); + + scoped_refptr meta; + RETURN_NOT_OK(TabletMetadata::Load(fs_manager_.get(), tablet_id, &meta)); + + if (meta->rowsets().empty()) { + std::cout << "No rowsets found on disk for tablet " << tablet_id << std::endl; + return Status::OK(); + } + + std::cout << "Listing all data blocks in tablet " << tablet_id << ":" << std::endl; + + Schema schema = meta->schema(); + + size_t idx = 0; + for (const shared_ptr& rs_meta : meta->rowsets()) { + std::cout << "Rowset " << idx++ << std::endl; + RETURN_NOT_OK(ListBlocksInRowSet(schema, *rs_meta)); + } + + return Status::OK(); +} + +Status FsTool::ListBlocksInRowSet(const Schema& schema, + const RowSetMetadata& rs_meta) { + RowSetMetadata::ColumnIdToBlockIdMap col_blocks = rs_meta.GetColumnBlocksById(); + for (const RowSetMetadata::ColumnIdToBlockIdMap::value_type& e : col_blocks) { + ColumnId col_id = e.first; + const BlockId& block_id = e.second; + std::cout << "Column block for 
column ID " << col_id; + int col_idx = schema.find_column_by_id(col_id); + if (col_idx != -1) { + std::cout << " (" << schema.column(col_idx).ToString() << ")"; + } + std::cout << ": "; + std::cout << block_id.ToString() << std::endl; + } + + for (const BlockId& block : rs_meta.undo_delta_blocks()) { + std::cout << "UNDO: " << block.ToString() << std::endl; + } + + for (const BlockId& block : rs_meta.redo_delta_blocks()) { + std::cout << "REDO: " << block.ToString() << std::endl; + } + + return Status::OK(); +} + +Status FsTool::DumpTabletBlocks(const std::string& tablet_id, + const DumpOptions& opts, + int indent) { + DCHECK(initialized_); + + scoped_refptr meta; + RETURN_NOT_OK(TabletMetadata::Load(fs_manager_.get(), tablet_id, &meta)); + + if (meta->rowsets().empty()) { + std::cout << Indent(indent) << "No rowsets found on disk for tablet " + << tablet_id << std::endl; + return Status::OK(); + } + + Schema schema = meta->schema(); + + size_t idx = 0; + for (const shared_ptr& rs_meta : meta->rowsets()) { + std::cout << std::endl << Indent(indent) << "Dumping rowset " << idx++ + << std::endl << Indent(indent) << kSeparatorLine; + RETURN_NOT_OK(DumpRowSetInternal(meta->schema(), rs_meta, opts, indent + 2)); + } + return Status::OK(); +} + +Status FsTool::DumpTabletData(const std::string& tablet_id) { + DCHECK(initialized_); + + scoped_refptr meta; + RETURN_NOT_OK(TabletMetadata::Load(fs_manager_.get(), tablet_id, &meta)); + + scoped_refptr reg(new log::LogAnchorRegistry()); + Tablet t(meta, scoped_refptr(nullptr), shared_ptr(), + nullptr, reg.get()); + RETURN_NOT_OK_PREPEND(t.Open(), "Couldn't open tablet"); + vector lines; + RETURN_NOT_OK_PREPEND(t.DebugDump(&lines), "Couldn't dump tablet"); + for (const string& line : lines) { + std::cout << line << std::endl; + } + return Status::OK(); +} + +Status FsTool::DumpRowSet(const string& tablet_id, + int64_t rowset_id, + const DumpOptions& opts, + int indent) { + DCHECK(initialized_); + + scoped_refptr meta; + 
RETURN_NOT_OK(TabletMetadata::Load(fs_manager_.get(), tablet_id, &meta)); + + for (const shared_ptr& rs_meta : meta->rowsets()) { + if (rs_meta->id() == rowset_id) { + return DumpRowSetInternal(meta->schema(), rs_meta, opts, indent); + } + } + + return Status::InvalidArgument( + Substitute("Could not find rowset $0 in tablet id $1", rowset_id, tablet_id)); +} + +Status FsTool::DumpRowSetInternal(const Schema& schema, + const shared_ptr& rs_meta, + const DumpOptions& opts, + int indent) { + tablet::RowSetDataPB pb; + rs_meta->ToProtobuf(&pb); + + std::cout << Indent(indent) << "RowSet metadata: " << pb.DebugString() << std::endl + << std::endl; + + RowSetMetadata::ColumnIdToBlockIdMap col_blocks = rs_meta->GetColumnBlocksById(); + for (const RowSetMetadata::ColumnIdToBlockIdMap::value_type& e : col_blocks) { + ColumnId col_id = e.first; + const BlockId& block_id = e.second; + + std::cout << Indent(indent) << "Dumping column block " << block_id << " for column id " + << col_id; + int col_idx = schema.find_column_by_id(col_id); + if (col_idx != -1) { + std::cout << "( " << schema.column(col_idx).ToString() << ")"; + } + std::cout << ":" << std::endl; + std::cout << Indent(indent) << kSeparatorLine; + if (opts.metadata_only) continue; + RETURN_NOT_OK(DumpCFileBlockInternal(block_id, opts, indent)); + std::cout << std::endl; + } + + for (const BlockId& block : rs_meta->undo_delta_blocks()) { + std::cout << Indent(indent) << "Dumping undo delta block " << block << ":" << std::endl + << Indent(indent) << kSeparatorLine; + RETURN_NOT_OK(DumpDeltaCFileBlockInternal(schema, + rs_meta, + block, + tablet::UNDO, + opts, + indent, + opts.metadata_only)); + std::cout << std::endl; + } + + for (const BlockId& block : rs_meta->redo_delta_blocks()) { + std::cout << Indent(indent) << "Dumping redo delta block " << block << ":" << std::endl + << Indent(indent) << kSeparatorLine; + RETURN_NOT_OK(DumpDeltaCFileBlockInternal(schema, + rs_meta, + block, + tablet::REDO, + opts, + indent, + 
opts.metadata_only)); + std::cout << std::endl; + } + + return Status::OK(); +} + +Status FsTool::DumpCFileBlock(const std::string& block_id_str, + const DumpOptions &opts, + int indent) { + uint64_t numeric_id; + if (!safe_strtou64(block_id_str, &numeric_id) && + !safe_strtou64_base(block_id_str, &numeric_id, 16)) { + return Status::InvalidArgument(Substitute("block '$0' could not be parsed", + block_id_str)); + } + BlockId block_id(numeric_id); + if (!fs_manager_->BlockExists(block_id)) { + return Status::NotFound(Substitute("block '$0' does not exist", block_id_str)); + } + return DumpCFileBlockInternal(block_id, opts, indent); +} + +Status FsTool::PrintUUID(int indent) { + std::cout << Indent(indent) << fs_manager_->uuid() << std::endl; + return Status::OK(); +} + +Status FsTool::DumpCFileBlockInternal(const BlockId& block_id, + const DumpOptions& opts, + int indent) { + gscoped_ptr block; + RETURN_NOT_OK(fs_manager_->OpenBlock(block_id, &block)); + gscoped_ptr reader; + RETURN_NOT_OK(CFileReader::Open(block.Pass(), ReaderOptions(), &reader)); + + std::cout << Indent(indent) << "CFile Header: " + << reader->header().ShortDebugString() << std::endl; + std::cout << Indent(indent) << reader->footer().num_values() + << " values:" << std::endl; + + gscoped_ptr it; + RETURN_NOT_OK(reader->NewIterator(&it, CFileReader::DONT_CACHE_BLOCK)); + RETURN_NOT_OK(it->SeekToFirst()); + DumpIteratorOptions iter_opts; + iter_opts.nrows = opts.nrows; + iter_opts.print_rows = detail_level_ > HEADERS_ONLY; + return DumpIterator(*reader, it.get(), &std::cout, iter_opts, indent + 2); +} + +Status FsTool::DumpDeltaCFileBlockInternal(const Schema& schema, + const shared_ptr& rs_meta, + const BlockId& block_id, + DeltaType delta_type, + const DumpOptions& opts, + int indent, + bool metadata_only) { + // Open the delta reader + gscoped_ptr readable_block; + RETURN_NOT_OK(fs_manager_->OpenBlock(block_id, &readable_block)); + shared_ptr delta_reader; + 
RETURN_NOT_OK(DeltaFileReader::Open(readable_block.Pass(), + block_id, + &delta_reader, + delta_type)); + + std::cout << Indent(indent) << "Delta stats: " << delta_reader->delta_stats().ToString() + << std::endl; + if (metadata_only) { + return Status::OK(); + } + + // Create the delta iterator. + // TODO: see if it's worth re-factoring NewDeltaIterator to return a + // gscoped_ptr that can then be released if we need a raw or shared + // pointer. + DeltaIterator* raw_iter; + + MvccSnapshot snap_all; + if (delta_type == tablet::REDO) { + snap_all = MvccSnapshot::CreateSnapshotIncludingAllTransactions(); + } else if (delta_type == tablet::UNDO) { + snap_all = MvccSnapshot::CreateSnapshotIncludingNoTransactions(); + } + + Status s = delta_reader->NewDeltaIterator(&schema, snap_all, &raw_iter); + + if (s.IsNotFound()) { + std::cout << "Empty delta block." << std::endl; + return Status::OK(); + } + RETURN_NOT_OK(s); + + // NewDeltaIterator returns Status::OK() iff a new DeltaIterator is created. Thus, + // it's safe to have a gscoped_ptr take possesion of 'raw_iter' here. + gscoped_ptr delta_iter(raw_iter); + RETURN_NOT_OK(delta_iter->Init(NULL)); + RETURN_NOT_OK(delta_iter->SeekToOrdinal(0)); + + // TODO: it's awkward that whenever we want to iterate over deltas we also + // need to open the CFileSet for the rowset. Ideally, we should use information stored + // in the footer/store additional information in the footer as to make it feasible + // iterate over all deltas using a DeltaFileIterator alone. + shared_ptr cfileset(new CFileSet(rs_meta)); + RETURN_NOT_OK(cfileset->Open()); + gscoped_ptr cfileset_iter(cfileset->NewIterator(&schema)); + + RETURN_NOT_OK(cfileset_iter->Init(NULL)); + + const size_t kRowsPerBlock = 100; + size_t nrows = 0; + size_t ndeltas = 0; + Arena arena(32 * 1024, 128 * 1024); + RowBlock block(schema, kRowsPerBlock, &arena); + + // See tablet/delta_compaction.cc to understand why this loop is structured the way + // it is. 
+ while (cfileset_iter->HasNext()) { + size_t n; + if (opts.nrows > 0) { + // Note: number of deltas may not equal the number of rows, but + // since this is a CLI tool (and the nrows option exists + // primarily to limit copious output) it's okay not to be + // exact here. + size_t remaining = opts.nrows - nrows; + if (remaining == 0) break; + n = std::min(remaining, kRowsPerBlock); + } else { + n = kRowsPerBlock; + } + + arena.Reset(); + cfileset_iter->PrepareBatch(&n); + + block.Resize(n); + + RETURN_NOT_OK(delta_iter->PrepareBatch(n, DeltaIterator::PREPARE_FOR_COLLECT)); + vector out; + RETURN_NOT_OK(delta_iter->FilterColumnIdsAndCollectDeltas(vector(), + &out, + &arena)); + for (const DeltaKeyAndUpdate& upd : out) { + if (detail_level_ > HEADERS_ONLY) { + std::cout << Indent(indent) << upd.key.ToString() << " " + << RowChangeList(upd.cell).ToString(schema) << std::endl; + ++ndeltas; + } + } + RETURN_NOT_OK(cfileset_iter->FinishBatch()); + + nrows += n; + } + + VLOG(1) << "Processed " << ndeltas << " deltas, for total of " << nrows << " possible rows."; + return Status::OK(); +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/fs_tool.h b/src/kudu/tools/fs_tool.h new file mode 100644 index 000000000000..5c85c17b289c --- /dev/null +++ b/src/kudu/tools/fs_tool.h @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Shared fields and methods for querying local files and directories +#ifndef KUDU_TOOLS_FS_TOOL_H +#define KUDU_TOOLS_FS_TOOL_H + +#include +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/status.h" +#include "kudu/tablet/delta_key.h" + +namespace kudu { + +class FsManager; +class Schema; +class BlockId; +class RandomAccessFile; + +namespace tablet { +class TabletMetadata; +class RowSetMetadata; +} + +namespace tools { + +struct DumpOptions { + std::string start_key; + std::string end_key; + size_t nrows; + bool metadata_only; + + DumpOptions() + : start_key(""), + end_key(""), + nrows(0), + metadata_only(false) { + } +}; + +class FsTool { + public: + + enum DetailLevel { + MINIMUM = 0, // Minimum amount of information + HEADERS_ONLY = 1, // Tablet/segment headers only + MAXIMUM = 2, + }; + + explicit FsTool(DetailLevel detail_level); + ~FsTool(); + + Status Init(); + + // Prints out the file system tree. + Status FsTree(); + + // Lists all log segments in the root WALs directory. + Status ListAllLogSegments(); + + // Lists all log segments for tablet 'tablet_id'. + Status ListLogSegmentsForTablet(const std::string& tablet_id); + + // Lists all tablets in a tablet server's local file system. + Status ListAllTablets(); + + // Prints the header for a log segment residing in 'path'. + Status PrintLogSegmentHeader(const std::string& path, int indent); + + // Lists blocks for a tablet organized by rowset. 
+ Status ListBlocksForTablet(const std::string& tablet_id); + + // Lists blocks for all tablets. + Status ListBlocksForAllTablets(); + + // Prints the tablet metadata for a tablet 'tablet_id'. + Status PrintTabletMeta(const std::string& tablet_id, int indent); + + // Dumps the blocks that make up a tablet, rowset by rowset. This ends up + // outputting on a column-by-column basis, as close as possible to the raw + // storage. See also: DumpRowSet(). + Status DumpTabletBlocks(const std::string& tablet_id, + const DumpOptions& opts, + int indent); + + // Dump the data stored in a tablet. The output here is much more readable + // than DumpTabletBlocks, since it reconstructs rows and associates undo/redo deltas + // with those rows. + Status DumpTabletData(const std::string& tablet_id); + + // Dumps column blocks, all types of delta blocks for a given + // rowset. + Status DumpRowSet(const std::string& tablet_id, + int64_t rowset_id, + const DumpOptions& opts, + int indent); + + Status DumpCFileBlock(const std::string& block_id, + const DumpOptions& opts, + int indent); + + // Prints the server's UUID to whom the data belongs and nothing else. 
+ Status PrintUUID(int indent); + private: + Status ListSegmentsInDir(const std::string& segments_dir); + + Status ListBlocksInRowSet(const Schema& schema, + const tablet::RowSetMetadata& rs_meta); + + Status DumpRowSetInternal(const Schema& schema, + const std::shared_ptr& rs_meta, + const DumpOptions& opts, + int indent); + + Status DumpCFileBlockInternal(const BlockId& block_id, + const DumpOptions& opts, + int indent); + + Status DumpDeltaCFileBlockInternal(const Schema& schema, + const std::shared_ptr& rs_meta, + const BlockId& block_id, + tablet::DeltaType delta_type, + const DumpOptions& opts, + int indent, + bool metadata_only); + + Status OpenBlockAsFile(const BlockId& block_id, + uint64_t* file_size, + std::shared_ptr* block_reader); + + bool initialized_; + const DetailLevel detail_level_; + gscoped_ptr fs_manager_; +}; + +} // namespace tools +} // namespace kudu + +#endif // KUDU_TOOLS_FS_TOOL_H diff --git a/src/kudu/tools/insert-generated-rows.cc b/src/kudu/tools/insert-generated-rows.cc new file mode 100644 index 000000000000..8e5985786477 --- /dev/null +++ b/src/kudu/tools/insert-generated-rows.cc @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Simple tool to insert "random junk" rows into an arbitrary table. +// First column is in ascending order, the rest are random data. +// Helps make things like availability demos a little easier. + +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tools/data_gen_util.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" + +DEFINE_string(master_address, "localhost", + "Comma separated list of master addresses to run against."); + +namespace kudu { +namespace tools { + +using std::string; +using std::vector; + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduColumnSchema; +using client::KuduInsert; +using client::KuduSchema; +using client::KuduSession; +using client::KuduTable; +using client::sp::shared_ptr; + +void PrintUsage(char** argv) { + std::cerr << "usage: " << argv[0] << " [--master_address localhost] " + << std::endl; +} + +static int WriteRandomDataToTable(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + if (argc != 2) { + PrintUsage(argv); + return 1; + } + InitGoogleLoggingSafe(argv[0]); + FLAGS_logtostderr = true; + + string table_name = argv[1]; + + vector addrs = strings::Split(FLAGS_master_address, ","); + CHECK(!addrs.empty()) << "At least one master address must be specified!"; + + // Set up client. + LOG(INFO) << "Connecting to Kudu Master..."; + shared_ptr client; + CHECK_OK(KuduClientBuilder() + .master_server_addrs(addrs) + .Build(&client)); + + LOG(INFO) << "Opening table..."; + shared_ptr table; + CHECK_OK(client->OpenTable(table_name, &table)); + KuduSchema schema = table->schema(); + + shared_ptr session = client->NewSession(); + session->SetTimeoutMillis(5000); // Time out after 5 seconds. 
+ CHECK_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + + Random random(GetRandomSeed32()); + + LOG(INFO) << "Inserting random rows..."; + for (uint64_t record_id = 0; true; ++record_id) { + + gscoped_ptr insert(table->NewInsert()); + KuduPartialRow* row = insert->mutable_row(); + GenerateDataForRow(schema, record_id, &random, row); + + LOG(INFO) << "Inserting record: " << row->ToString(); + CHECK_OK(session->Apply(insert.release())); + Status s = session->Flush(); + if (PREDICT_FALSE(!s.ok())) { + std::vector errors; + ElementDeleter d(&errors); + bool overflow; + session->GetPendingErrors(&errors, &overflow); + CHECK(!overflow); + for (const client::KuduError* e : errors) { + if (e->status().IsAlreadyPresent()) { + LOG(WARNING) << "Ignoring insert error: " << e->status().ToString(); + } else { + LOG(FATAL) << "Unexpected insert error: " << e->status().ToString(); + } + } + continue; + } + LOG(INFO) << "OK"; + } + + return 0; +} + +} // namespace tools +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::tools::WriteRandomDataToTable(argc, argv); +} diff --git a/src/kudu/tools/ksck-test.cc b/src/kudu/tools/ksck-test.cc new file mode 100644 index 000000000000..5cbdee000610 --- /dev/null +++ b/src/kudu/tools/ksck-test.cc @@ -0,0 +1,258 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tools/ksck.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tools { + +using std::shared_ptr; +using std::static_pointer_cast; +using std::string; +using std::unordered_map; +using std::vector; + +class MockKsckTabletServer : public KsckTabletServer { + public: + explicit MockKsckTabletServer(const string& uuid) + : KsckTabletServer(uuid), + connect_status_(Status::OK()), + address_("") { + } + + virtual Status Connect() const OVERRIDE { + return connect_status_; + } + + virtual void RunTabletChecksumScanAsync( + const std::string& tablet_id, + const Schema& schema, + const ChecksumOptions& options, + const ReportResultCallback& callback) OVERRIDE { + callback.Run(Status::OK(), 0); + } + + virtual Status CurrentTimestamp(uint64_t* timestamp) const OVERRIDE { + *timestamp = 0; + return Status::OK(); + } + + virtual const std::string& address() const OVERRIDE { + return address_; + } + + // Public because the unit tests mutate this variable directly. + Status connect_status_; + + private: + const string address_; +}; + +class MockKsckMaster : public KsckMaster { + public: + MockKsckMaster() + : connect_status_(Status::OK()) { + } + + virtual Status Connect() const OVERRIDE { + return connect_status_; + } + + virtual Status RetrieveTabletServers(TSMap* tablet_servers) OVERRIDE { + *tablet_servers = tablet_servers_; + return Status::OK(); + } + + virtual Status RetrieveTablesList(vector>* tables) OVERRIDE { + tables->assign(tables_.begin(), tables_.end()); + return Status::OK(); + } + + virtual Status RetrieveTabletsList(const shared_ptr& table) OVERRIDE { + return Status::OK(); + } + + // Public because the unit tests mutate these variables directly. 
+ Status connect_status_; + TSMap tablet_servers_; + vector> tables_; +}; + +class KsckTest : public KuduTest { + public: + KsckTest() + : master_(new MockKsckMaster()), + cluster_(new KsckCluster(static_pointer_cast(master_))), + ksck_(new Ksck(cluster_)) { + unordered_map> tablet_servers; + for (int i = 0; i < 3; i++) { + string name = strings::Substitute("$0", i); + shared_ptr ts(new MockKsckTabletServer(name)); + InsertOrDie(&tablet_servers, ts->uuid(), ts); + } + master_->tablet_servers_.swap(tablet_servers); + } + + protected: + void CreateDefaultAssignmentPlan(int tablets_count) { + while (tablets_count > 0) { + for (const KsckMaster::TSMap::value_type& entry : master_->tablet_servers_) { + if (tablets_count-- == 0) return; + assignment_plan_.push_back(entry.second->uuid()); + } + } + } + + void CreateOneTableOneTablet() { + CreateDefaultAssignmentPlan(1); + + shared_ptr tablet(new KsckTablet("1")); + CreateAndFillTablet(tablet, 1, true); + + CreateAndAddTable({ tablet }, "test", 1); + } + + void CreateOneSmallReplicatedTable() { + int num_replicas = 3; + int num_tablets = 3; + vector> tablets; + CreateDefaultAssignmentPlan(num_replicas * num_tablets); + for (int i = 0; i < num_tablets; i++) { + shared_ptr tablet(new KsckTablet(boost::lexical_cast(i))); + CreateAndFillTablet(tablet, num_replicas, true); + tablets.push_back(tablet); + } + + CreateAndAddTable(tablets, "test", num_replicas); + } + + void CreateOneOneTabletReplicatedBrokenTable() { + // We're placing only two replicas, the 3rd goes nowhere. 
+ CreateDefaultAssignmentPlan(2); + + shared_ptr tablet(new KsckTablet("1")); + CreateAndFillTablet(tablet, 2, false); + + CreateAndAddTable({ tablet }, "test", 3); + } + + void CreateAndAddTable(vector> tablets, + const string& name, int num_replicas) { + shared_ptr table(new KsckTable(name, Schema(), num_replicas)); + table->set_tablets(tablets); + + vector> tables = { table }; + master_->tables_.assign(tables.begin(), tables.end()); + } + + void CreateAndFillTablet(shared_ptr& tablet, int num_replicas, bool has_leader) { + vector> replicas; + if (has_leader) { + CreateReplicaAndAdd(replicas, true); + num_replicas--; + } + for (int i = 0; i < num_replicas; i++) { + CreateReplicaAndAdd(replicas, false); + } + tablet->set_replicas(replicas); + } + + void CreateReplicaAndAdd(vector>& replicas, bool is_leader) { + shared_ptr replica(new KsckTabletReplica(assignment_plan_.back(), + is_leader, !is_leader)); + assignment_plan_.pop_back(); + replicas.push_back(replica); + } + + shared_ptr master_; + shared_ptr cluster_; + shared_ptr ksck_; + // This is used as a stack. First the unit test is responsible to create a plan to follow, that + // is the order in which each replica of each tablet will be assigned, starting from the end. + // So if you have 2 tablets with num_replicas=3 and 3 tablet servers, then to distribute evenly + // you should have a list that looks like ts1,ts2,ts3,ts3,ts2,ts1 so that the two LEADERS, which + // are assigned first, end up on ts1 and ts3. 
+ vector assignment_plan_; +}; + +TEST_F(KsckTest, TestMasterOk) { + ASSERT_OK(ksck_->CheckMasterRunning()); +} + +TEST_F(KsckTest, TestMasterUnavailable) { + Status error = Status::NetworkError("Network failure"); + master_->connect_status_ = error; + ASSERT_TRUE(ksck_->CheckMasterRunning().IsNetworkError()); +} + +TEST_F(KsckTest, TestTabletServersOk) { + ASSERT_OK(ksck_->CheckMasterRunning()); + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + ASSERT_OK(ksck_->CheckTabletServersRunning()); +} + +TEST_F(KsckTest, TestBadTabletServer) { + ASSERT_OK(ksck_->CheckMasterRunning()); + Status error = Status::NetworkError("Network failure"); + static_pointer_cast(master_->tablet_servers_.begin()->second) + ->connect_status_ = error; + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + Status s = ksck_->CheckTabletServersRunning(); + ASSERT_TRUE(s.IsNetworkError()) << "Status returned: " << s.ToString(); +} + +TEST_F(KsckTest, TestZeroTableCheck) { + ASSERT_OK(ksck_->CheckMasterRunning()); + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + ASSERT_OK(ksck_->CheckTabletServersRunning()); + ASSERT_OK(ksck_->CheckTablesConsistency()); +} + +TEST_F(KsckTest, TestOneTableCheck) { + CreateOneTableOneTablet(); + ASSERT_OK(ksck_->CheckMasterRunning()); + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + ASSERT_OK(ksck_->CheckTabletServersRunning()); + ASSERT_OK(ksck_->CheckTablesConsistency()); +} + +TEST_F(KsckTest, TestOneSmallReplicatedTable) { + CreateOneSmallReplicatedTable(); + ASSERT_OK(ksck_->CheckMasterRunning()); + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + ASSERT_OK(ksck_->CheckTabletServersRunning()); + ASSERT_OK(ksck_->CheckTablesConsistency()); +} + +TEST_F(KsckTest, TestOneOneTabletBrokenTable) { + CreateOneOneTabletReplicatedBrokenTable(); + ASSERT_OK(ksck_->CheckMasterRunning()); + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + ASSERT_OK(ksck_->CheckTabletServersRunning()); + ASSERT_TRUE(ksck_->CheckTablesConsistency().IsCorruption()); +} + +} // namespace tools +} // 
namespace kudu diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc new file mode 100644 index 000000000000..b1ff84af3d60 --- /dev/null +++ b/src/kudu/tools/ksck.cc @@ -0,0 +1,497 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tools/ksck.h" + +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/blocking_queue.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" + +namespace kudu { +namespace tools { + +using std::cerr; +using std::cout; +using std::endl; +using std::ostream; +using std::shared_ptr; +using std::string; +using std::unordered_map; +using strings::Substitute; + +DEFINE_int32(checksum_timeout_sec, 120, + "Maximum total seconds to wait for a checksum scan to complete " + "before timing out."); +DEFINE_int32(checksum_scan_concurrency, 4, + "Number of concurrent checksum scans to execute per tablet server."); +DEFINE_bool(checksum_snapshot, true, "Should the checksum scanner use a snapshot scan"); +DEFINE_uint64(checksum_snapshot_timestamp, ChecksumOptions::kCurrentTimestamp, + "timestamp to use for snapshot checksum scans, defaults to 0, which " + "uses the current timestamp of a tablet server involved in the scan"); + +// Print an informational message to cerr. +static ostream& Info() { + cerr << "INFO: "; + return cerr; +} + +// Print a warning message to cerr. +static ostream& Warn() { + cerr << "WARNING: "; + return cerr; +} + +// Print an error message to cerr. 
+static ostream& Error() { + cerr << "ERROR: "; + return cerr; +} + +ChecksumOptions::ChecksumOptions() + : timeout(MonoDelta::FromSeconds(FLAGS_checksum_timeout_sec)), + scan_concurrency(FLAGS_checksum_scan_concurrency), + use_snapshot(FLAGS_checksum_snapshot), + snapshot_timestamp(FLAGS_checksum_snapshot_timestamp) { +} + +ChecksumOptions::ChecksumOptions(MonoDelta timeout, int scan_concurrency, + bool use_snapshot, uint64_t snapshot_timestamp) + : timeout(std::move(timeout)), + scan_concurrency(scan_concurrency), + use_snapshot(use_snapshot), + snapshot_timestamp(snapshot_timestamp) {} + +const uint64_t ChecksumOptions::kCurrentTimestamp = 0; + +KsckCluster::~KsckCluster() { +} + +Status KsckCluster::FetchTableAndTabletInfo() { + RETURN_NOT_OK(master_->Connect()); + RETURN_NOT_OK(RetrieveTablesList()); + RETURN_NOT_OK(RetrieveTabletServers()); + for (const shared_ptr& table : tables()) { + RETURN_NOT_OK(RetrieveTabletsList(table)); + } + return Status::OK(); +} + +// Gets the list of tablet servers from the Master. +Status KsckCluster::RetrieveTabletServers() { + return master_->RetrieveTabletServers(&tablet_servers_); +} + +// Gets the list of tables from the Master. 
+Status KsckCluster::RetrieveTablesList() { + return master_->RetrieveTablesList(&tables_); +} + +Status KsckCluster::RetrieveTabletsList(const shared_ptr& table) { + return master_->RetrieveTabletsList(table); +} + +Status Ksck::CheckMasterRunning() { + VLOG(1) << "Connecting to the Master"; + Status s = cluster_->master()->Connect(); + if (s.ok()) { + Info() << "Connected to the Master" << endl; + } + return s; +} + +Status Ksck::FetchTableAndTabletInfo() { + return cluster_->FetchTableAndTabletInfo(); +} + +Status Ksck::CheckTabletServersRunning() { + VLOG(1) << "Getting the Tablet Servers list"; + int servers_count = cluster_->tablet_servers().size(); + VLOG(1) << Substitute("List of $0 Tablet Servers retrieved", servers_count); + + if (servers_count == 0) { + return Status::NotFound("No tablet servers found"); + } + + int bad_servers = 0; + VLOG(1) << "Connecting to all the Tablet Servers"; + for (const KsckMaster::TSMap::value_type& entry : cluster_->tablet_servers()) { + Status s = ConnectToTabletServer(entry.second); + if (!s.ok()) { + bad_servers++; + } + } + if (bad_servers == 0) { + Info() << Substitute("Connected to all $0 Tablet Servers", servers_count) << endl; + return Status::OK(); + } else { + Warn() << Substitute("Connected to $0 Tablet Servers, $1 weren't reachable", + servers_count - bad_servers, bad_servers) << endl; + return Status::NetworkError("Not all Tablet Servers are reachable"); + } +} + +Status Ksck::ConnectToTabletServer(const shared_ptr& ts) { + VLOG(1) << "Going to connect to Tablet Server: " << ts->uuid(); + Status s = ts->Connect(); + if (s.ok()) { + VLOG(1) << "Connected to Tablet Server: " << ts->uuid(); + } else { + Warn() << Substitute("Unable to connect to Tablet Server $0 because $1", + ts->uuid(), s.ToString()) << endl; + } + return s; +} + +Status Ksck::CheckTablesConsistency() { + VLOG(1) << "Getting the tables list"; + int tables_count = cluster_->tables().size(); + VLOG(1) << Substitute("List of $0 tables retrieved", 
tables_count); + + if (tables_count == 0) { + Info() << "The cluster doesn't have any tables" << endl; + return Status::OK(); + } + + VLOG(1) << "Verifying each table"; + int bad_tables_count = 0; + for (const shared_ptr &table : cluster_->tables()) { + if (!VerifyTable(table)) { + bad_tables_count++; + } + } + if (bad_tables_count == 0) { + Info() << Substitute("The metadata for $0 tables is HEALTHY", tables_count) << endl; + return Status::OK(); + } else { + Warn() << Substitute("$0 out of $1 tables are not in a healthy state", + bad_tables_count, tables_count) << endl; + return Status::Corruption(Substitute("$0 tables are bad", bad_tables_count)); + } +} + +// Class to act as a collector of scan results. +// Provides thread-safe accessors to update and read a hash table of results. +class ChecksumResultReporter : public RefCountedThreadSafe { + public: + typedef std::pair ResultPair; + typedef std::unordered_map ReplicaResultMap; + typedef std::unordered_map TabletResultMap; + + // Initialize reporter with the number of replicas being queried. + explicit ChecksumResultReporter(int num_tablet_replicas) + : responses_(num_tablet_replicas) { + } + + // Write an entry to the result map indicating a response from the remote. + void ReportResult(const std::string& tablet_id, + const std::string& replica_uuid, + const Status& status, + uint64_t checksum) { + lock_guard guard(&lock_); + unordered_map& replica_results = + LookupOrInsert(&checksums_, tablet_id, unordered_map()); + InsertOrDie(&replica_results, replica_uuid, ResultPair(status, checksum)); + responses_.CountDown(); + } + + // Blocks until either the number of results plus errors reported equals + // num_tablet_replicas (from the constructor), or until the timeout expires, + // whichever comes first. + // Returns false if the timeout expired before all responses came in. + // Otherwise, returns true. 
+ bool WaitFor(const MonoDelta& timeout) const { return responses_.WaitFor(timeout); } + + // Returns true iff all replicas have reported in. + bool AllReported() const { return responses_.count() == 0; } + + // Get reported results. + TabletResultMap checksums() const { + lock_guard guard(&lock_); + return checksums_; + } + + private: + friend class RefCountedThreadSafe; + ~ChecksumResultReporter() {} + + // Report either a success or error response. + void HandleResponse(const std::string& tablet_id, const std::string& replica_uuid, + const Status& status, uint64_t checksum); + + CountDownLatch responses_; + mutable simple_spinlock lock_; // Protects 'checksums_'. + // checksums_ is an unordered_map of { tablet_id : { replica_uuid : checksum } }. + TabletResultMap checksums_; +}; + +// Queue of tablet replicas for an individual tablet server. +typedef shared_ptr > > TabletQueue; + +// A callback function which records the result of a tablet replica's checksum, +// and then checks if the tablet server has any more tablets to checksum. If so, +// a new async checksum scan is started. 
+void TabletServerChecksumCallback( + const scoped_refptr& reporter, + const shared_ptr& tablet_server, + const TabletQueue& queue, + const std::string& tablet_id, + const ChecksumOptions& options, + const Status& status, + uint64_t checksum) { + reporter->ReportResult(tablet_id, tablet_server->uuid(), status, checksum); + + std::pair table_tablet; + if (queue->BlockingGet(&table_tablet)) { + const Schema& table_schema = table_tablet.first; + const std::string& tablet_id = table_tablet.second; + ReportResultCallback callback = Bind(&TabletServerChecksumCallback, + reporter, + tablet_server, + queue, + tablet_id, + options); + tablet_server->RunTabletChecksumScanAsync(tablet_id, table_schema, options, callback); + } +} + +Status Ksck::ChecksumData(const vector& tables, + const vector& tablets, + const ChecksumOptions& opts) { + const unordered_set tables_filter(tables.begin(), tables.end()); + const unordered_set tablets_filter(tablets.begin(), tablets.end()); + + // Copy options so that local modifications can be made and passed on. 
+ ChecksumOptions options = opts; + + typedef unordered_map, shared_ptr> TabletTableMap; + TabletTableMap tablet_table_map; + + int num_tablet_replicas = 0; + for (const shared_ptr& table : cluster_->tables()) { + VLOG(1) << "Table: " << table->name(); + if (!tables_filter.empty() && !ContainsKey(tables_filter, table->name())) continue; + for (const shared_ptr& tablet : table->tablets()) { + VLOG(1) << "Tablet: " << tablet->id(); + if (!tablets_filter.empty() && !ContainsKey(tablets_filter, tablet->id())) continue; + InsertOrDie(&tablet_table_map, tablet, table); + num_tablet_replicas += tablet->replicas().size(); + } + } + if (num_tablet_replicas == 0) { + string msg = "No tablet replicas found."; + if (!tables.empty() || !tablets.empty()) { + msg += " Filter: "; + if (!tables.empty()) { + msg += "tables=" + JoinStrings(tables, ",") + "."; + } + if (!tablets.empty()) { + msg += "tablets=" + JoinStrings(tablets, ",") + "."; + } + } + return Status::NotFound(msg); + } + + // Map of tablet servers to tablet queue. + typedef unordered_map, TabletQueue> TabletServerQueueMap; + + TabletServerQueueMap tablet_server_queues; + scoped_refptr reporter(new ChecksumResultReporter(num_tablet_replicas)); + + // Create a queue of checksum callbacks grouped by the tablet server. + for (const TabletTableMap::value_type& entry : tablet_table_map) { + const shared_ptr& tablet = entry.first; + const shared_ptr& table = entry.second; + for (const shared_ptr& replica : tablet->replicas()) { + const shared_ptr& ts = + FindOrDie(cluster_->tablet_servers(), replica->ts_uuid()); + + const TabletQueue& queue = + LookupOrInsertNewSharedPtr(&tablet_server_queues, ts, num_tablet_replicas); + CHECK_EQ(QUEUE_SUCCESS, queue->Put(make_pair(table->schema(), tablet->id()))); + } + } + + if (options.use_snapshot && options.snapshot_timestamp == ChecksumOptions::kCurrentTimestamp) { + // Set the snapshot timestamp to the current timestamp of an arbitrary tablet server. 
+ tablet_server_queues.begin()->first->CurrentTimestamp(&options.snapshot_timestamp); + Info() << "Using snapshot timestamp: " << options.snapshot_timestamp << endl; + } + + // Kick off checksum scans in parallel. For each tablet server, we start + // scan_concurrency scans. Each callback then initiates one additional + // scan when it returns if the queue for that TS is not empty. + for (const TabletServerQueueMap::value_type& entry : tablet_server_queues) { + const shared_ptr& tablet_server = entry.first; + const TabletQueue& queue = entry.second; + queue->Shutdown(); // Ensures that BlockingGet() will not block. + for (int i = 0; i < options.scan_concurrency; i++) { + std::pair table_tablet; + if (queue->BlockingGet(&table_tablet)) { + const Schema& table_schema = table_tablet.first; + const std::string& tablet_id = table_tablet.second; + ReportResultCallback callback = Bind(&TabletServerChecksumCallback, + reporter, + tablet_server, + queue, + tablet_id, + options); + tablet_server->RunTabletChecksumScanAsync(tablet_id, table_schema, options, callback); + } + } + } + + bool timed_out = false; + if (!reporter->WaitFor(options.timeout)) { + timed_out = true; + } + ChecksumResultReporter::TabletResultMap checksums = reporter->checksums(); + + int num_errors = 0; + int num_mismatches = 0; + int num_results = 0; + for (const shared_ptr& table : cluster_->tables()) { + bool printed_table_name = false; + for (const shared_ptr& tablet : table->tablets()) { + if (ContainsKey(checksums, tablet->id())) { + if (!printed_table_name) { + printed_table_name = true; + cout << "-----------------------" << endl; + cout << table->name() << endl; + cout << "-----------------------" << endl; + } + bool seen_first_replica = false; + uint64_t first_checksum = 0; + + for (const ChecksumResultReporter::ReplicaResultMap::value_type& r : + FindOrDie(checksums, tablet->id())) { + const string& replica_uuid = r.first; + + shared_ptr ts = FindOrDie(cluster_->tablet_servers(), replica_uuid); 
+ const ChecksumResultReporter::ResultPair& result = r.second; + const Status& status = result.first; + uint64_t checksum = result.second; + string status_str = (status.ok()) ? Substitute("Checksum: $0", checksum) + : Substitute("Error: $0", status.ToString()); + cout << Substitute("T $0 P $1 ($2): $3", tablet->id(), ts->uuid(), ts->address(), + status_str) << endl; + if (!status.ok()) { + num_errors++; + } else if (!seen_first_replica) { + seen_first_replica = true; + first_checksum = checksum; + } else if (checksum != first_checksum) { + num_mismatches++; + Error() << ">> Mismatch found in table " << table->name() + << " tablet " << tablet->id() << endl; + } + num_results++; + } + } + } + if (printed_table_name) cout << endl; + } + if (num_results != num_tablet_replicas) { + CHECK(timed_out) << Substitute("Unexpected error: only got $0 out of $1 replica results", + num_results, num_tablet_replicas); + return Status::TimedOut(Substitute("Checksum scan did not complete within the timeout of $0: " + "Received results for $1 out of $2 expected replicas", + options.timeout.ToString(), num_results, + num_tablet_replicas)); + } + if (num_mismatches != 0) { + return Status::Corruption(Substitute("$0 checksum mismatches were detected", num_mismatches)); + } + if (num_errors != 0) { + return Status::Aborted(Substitute("$0 errors were detected", num_errors)); + } + + return Status::OK(); +} + +bool Ksck::VerifyTable(const shared_ptr& table) { + bool good_table = true; + vector > tablets = table->tablets(); + int tablets_count = tablets.size(); + if (tablets_count == 0) { + Warn() << Substitute("Table $0 has 0 tablets", table->name()) << endl; + return false; + } + int table_num_replicas = table->num_replicas(); + VLOG(1) << Substitute("Verifying $0 tablets for table $1 configured with num_replicas = $2", + tablets_count, table->name(), table_num_replicas); + int bad_tablets_count = 0; + // TODO check if the tablets are contiguous and in order. 
+ for (const shared_ptr &tablet : tablets) { + if (!VerifyTablet(tablet, table_num_replicas)) { + bad_tablets_count++; + } + } + if (bad_tablets_count == 0) { + Info() << Substitute("Table $0 is HEALTHY", table->name()) << endl; + } else { + Warn() << Substitute("Table $0 has $1 bad tablets", table->name(), bad_tablets_count) << endl; + good_table = false; + } + return good_table; +} + +bool Ksck::VerifyTablet(const shared_ptr& tablet, int table_num_replicas) { + vector > replicas = tablet->replicas(); + bool good_tablet = true; + if (replicas.size() != table_num_replicas) { + Warn() << Substitute("Tablet $0 has $1 instead of $2 replicas", + tablet->id(), replicas.size(), table_num_replicas) << endl; + // We only fail the "goodness" check if the tablet is under-replicated. + if (replicas.size() < table_num_replicas) { + good_tablet = false; + } + } + int leaders_count = 0; + int followers_count = 0; + for (const shared_ptr replica : replicas) { + if (replica->is_leader()) { + VLOG(1) << Substitute("Replica at $0 is a LEADER", replica->ts_uuid()); + leaders_count++; + } else if (replica->is_follower()) { + VLOG(1) << Substitute("Replica at $0 is a FOLLOWER", replica->ts_uuid()); + followers_count++; + } + } + if (leaders_count == 0) { + Warn() << Substitute("Tablet $0 doesn't have a leader", tablet->id()) << endl; + good_tablet = false; + } + VLOG(1) << Substitute("Tablet $0 has $1 leader and $2 followers", + tablet->id(), leaders_count, followers_count); + return good_tablet; +} + +Status Ksck::CheckAssignments() { + // TODO + return Status::NotSupported("CheckAssignments hasn't been implemented"); +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h new file mode 100644 index 000000000000..f9d25a369e3d --- /dev/null +++ b/src/kudu/tools/ksck.h @@ -0,0 +1,311 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Ksck, a tool to run a Kudu System Check. + +#ifndef KUDU_TOOLS_KSCK_H +#define KUDU_TOOLS_KSCK_H + +#include +#include +#include +#include +#include + +#include "kudu/common/schema.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { +class MonoDelta; +namespace tools { + +// Options for checksum scans. +struct ChecksumOptions { + public: + + ChecksumOptions(); + + ChecksumOptions(MonoDelta timeout, + int scan_concurrency, + bool use_snapshot, + uint64_t snapshot_timestamp); + + // The maximum total time to wait for results to come back from all replicas. + MonoDelta timeout; + + // The maximum number of concurrent checksum scans to run per tablet server. + int scan_concurrency; + + // Whether to use a snapshot checksum scanner. + bool use_snapshot; + + // The snapshot timestamp to use for snapshot checksum scans. + uint64_t snapshot_timestamp; + + // A timestamp indicicating that the current time should be used for a checksum snapshot. + static const uint64_t kCurrentTimestamp; +}; + +// Representation of a tablet replica on a tablet server. 
+class KsckTabletReplica { + public: + KsckTabletReplica(const std::string ts_uuid, const bool is_leader, const bool is_follower) + : is_leader_(is_leader), + is_follower_(is_follower), + ts_uuid_(ts_uuid) { + } + + const bool& is_leader() const { + return is_leader_; + } + + const bool& is_follower() const { + return is_follower_; + } + + const std::string& ts_uuid() const { + return ts_uuid_; + } + + private: + const bool is_leader_; + const bool is_follower_; + const std::string ts_uuid_; + DISALLOW_COPY_AND_ASSIGN(KsckTabletReplica); +}; + +// Representation of a tablet belonging to a table. The tablet is composed of replicas. +class KsckTablet { + public: + // TODO add start/end keys, stale. + explicit KsckTablet(std::string id) : id_(std::move(id)) {} + + const std::string& id() const { + return id_; + } + + const std::vector >& replicas() const { + return replicas_; + } + + void set_replicas(std::vector >& replicas) { + replicas_.assign(replicas.begin(), replicas.end()); + } + private: + const std::string id_; + std::vector> replicas_; + DISALLOW_COPY_AND_ASSIGN(KsckTablet); +}; + +// Representation of a table. Composed of tablets. +class KsckTable { + public: + KsckTable(std::string name, const Schema& schema, int num_replicas) + : name_(std::move(name)), schema_(schema), num_replicas_(num_replicas) {} + + const std::string& name() const { + return name_; + } + + const Schema& schema() const { + return schema_; + } + + int num_replicas() const { + return num_replicas_; + } + + void set_tablets(std::vector>& tablets) { + tablets_.assign(tablets.begin(), tablets.end()); + } + + std::vector >& tablets() { + return tablets_; + } + + private: + const std::string name_; + const Schema schema_; + const int num_replicas_; + std::vector> tablets_; + DISALLOW_COPY_AND_ASSIGN(KsckTable); +}; + +typedef Callback ReportResultCallback; + +// The following two classes must be extended in order to communicate with their respective +// components. 
The two main use cases envisioned for this are: +// - To be able to mock a cluster to more easily test the Ksck checks. +// - To be able to communicate with a real Kudu cluster. + +// Class that must be extended to represent a tablet server. +class KsckTabletServer { + public: + explicit KsckTabletServer(std::string uuid) : uuid_(std::move(uuid)) {} + virtual ~KsckTabletServer() { } + + // Connects to the configured Tablet Server. + virtual Status Connect() const = 0; + + virtual Status CurrentTimestamp(uint64_t* timestamp) const = 0; + + // Executes a checksum scan on the associated tablet, and runs the callback + // with the result. The callback must be threadsafe and non-blocking. + virtual void RunTabletChecksumScanAsync( + const std::string& tablet_id, + const Schema& schema, + const ChecksumOptions& options, + const ReportResultCallback& callback) = 0; + + virtual const std::string& uuid() const { + return uuid_; + } + + virtual const std::string& address() const = 0; + + private: + const std::string uuid_; + DISALLOW_COPY_AND_ASSIGN(KsckTabletServer); +}; + +// Class that must be extended to represent a master. +class KsckMaster { + public: + // Map of KsckTabletServer objects keyed by tablet server permanent_uuid. + typedef std::unordered_map > TSMap; + + KsckMaster() { } + virtual ~KsckMaster() { } + + // Connects to the configured Master. + virtual Status Connect() const = 0; + + // Gets the list of Tablet Servers from the Master and stores it in the passed + // map, which is keyed on server permanent_uuid. + // 'tablet_servers' is only modified if this method returns OK. + virtual Status RetrieveTabletServers(TSMap* tablet_servers) = 0; + + // Gets the list of tables from the Master and stores it in the passed vector. + // tables is only modified if this method returns OK. + virtual Status RetrieveTablesList( + std::vector >* tables) = 0; + + // Gets the list of tablets for the specified table and stores the list in it. 
+ // The table's tablet list is only modified if this method returns OK. + virtual Status RetrieveTabletsList(const std::shared_ptr& table) = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(KsckMaster); +}; + +// Class used to communicate with the cluster. It bootstraps this by using the provided master. +class KsckCluster { + public: + explicit KsckCluster(std::shared_ptr master) + : master_(std::move(master)) {} + ~KsckCluster(); + + // Fetches list of tables, tablets, and tablet servers from the master and + // populates the full list in cluster_->tables(). + Status FetchTableAndTabletInfo(); + + const std::shared_ptr& master() { + return master_; + } + + const std::unordered_map >& tablet_servers() { + return tablet_servers_; + } + + const std::vector >& tables() { + return tables_; + } + + private: + // Gets the list of tablet servers from the Master. + Status RetrieveTabletServers(); + + // Gets the list of tables from the Master. + Status RetrieveTablesList(); + + // Fetch the list of tablets for the given table from the Master. + Status RetrieveTabletsList(const std::shared_ptr& table); + + const std::shared_ptr master_; + std::unordered_map > tablet_servers_; + std::vector > tables_; + DISALLOW_COPY_AND_ASSIGN(KsckCluster); +}; + +// Externally facing class to run checks against the provided cluster. +class Ksck { + public: + explicit Ksck(std::shared_ptr cluster) + : cluster_(std::move(cluster)) {} + ~Ksck() {} + + // Verifies that it can connect to the Master. + Status CheckMasterRunning(); + + // Populates all the cluster table and tablet info from the Master. + Status FetchTableAndTabletInfo(); + + // Verifies that it can connect to all the Tablet Servers reported by the master. + // Must first call FetchTableAndTabletInfo(). + Status CheckTabletServersRunning(); + + // Establishes a connection with the specified Tablet Server. + // Must first call FetchTableAndTabletInfo(). 
+ Status ConnectToTabletServer(const std::shared_ptr& ts); + + // Verifies that all the tables have contiguous tablets and that each tablet has enough replicas + // and a leader. + // Must first call FetchTableAndTabletInfo(). + Status CheckTablesConsistency(); + + // Verifies data checksums on all tablets by doing a scan of the database on each replica. + // If tables is not empty, checks only the named tables. + // If tablets is not empty, checks only the specified tablets. + // If both are specified, takes the intersection. + // If both are empty, all tables and tablets are checked. + // Must first call FetchTableAndTabletInfo(). + Status ChecksumData(const std::vector& tables, + const std::vector& tablets, + const ChecksumOptions& options); + + // Verifies that the assignments reported by the master are the same reported by the + // Tablet Servers. + // Must first call FetchTableAndTabletInfo(). + Status CheckAssignments(); + + private: + bool VerifyTable(const std::shared_ptr& table); + bool VerifyTableWithTimeout(const std::shared_ptr& table, + const MonoDelta& timeout, + const MonoDelta& retry_interval); + bool VerifyTablet(const std::shared_ptr& tablet, int table_num_replicas); + + const std::shared_ptr cluster_; + DISALLOW_COPY_AND_ASSIGN(Ksck); +}; +} // namespace tools +} // namespace kudu + +#endif // KUDU_TOOLS_KSCK_H diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc new file mode 100644 index 000000000000..579f5713a3ae --- /dev/null +++ b/src/kudu/tools/ksck_remote-test.cc @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/mini_cluster.h" +#include "kudu/master/mini_master.h" +#include "kudu/tools/data_gen_util.h" +#include "kudu/tools/ksck_remote.h" +#include "kudu/util/monotime.h" +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +DECLARE_int32(heartbeat_interval_ms); + +namespace kudu { +namespace tools { + +using client::KuduColumnSchema; +using client::KuduInsert; +using client::KuduSchemaBuilder; +using client::KuduSession; +using client::KuduTable; +using client::KuduTableCreator; +using client::sp::shared_ptr; +using std::static_pointer_cast; +using std::string; +using std::vector; +using strings::Substitute; + +static const char *kTableName = "ksck-test-table"; + +class RemoteKsckTest : public KuduTest { + public: + RemoteKsckTest() + : random_(SeedRandom()) { + KuduSchemaBuilder b; + b.AddColumn("key")->Type(KuduColumnSchema::INT32)->NotNull()->PrimaryKey(); + b.AddColumn("int_val")->Type(KuduColumnSchema::INT32)->NotNull(); + CHECK_OK(b.Build(&schema_)); + } + + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + // Speed up testing, saves about 700ms per TEST_F. + FLAGS_heartbeat_interval_ms = 10; + + MiniClusterOptions opts; + opts.num_tablet_servers = 3; + mini_cluster_.reset(new MiniCluster(env_.get(), opts)); + ASSERT_OK(mini_cluster_->Start()); + + master_rpc_addr_ = mini_cluster_->mini_master()->bound_rpc_addr(); + + // Connect to the cluster. 
+ ASSERT_OK(client::KuduClientBuilder() + .add_master_server_addr(master_rpc_addr_.ToString()) + .Build(&client_)); + + // Create one table. + gscoped_ptr table_creator(client_->NewTableCreator()); + ASSERT_OK(table_creator->table_name(kTableName) + .schema(&schema_) + .num_replicas(3) + .split_rows(GenerateSplitRows()) + .Create()); + // Make sure we can open the table. + ASSERT_OK(client_->OpenTable(kTableName, &client_table_)); + + ASSERT_OK(RemoteKsckMaster::Build(master_rpc_addr_, &master_)); + cluster_.reset(new KsckCluster(master_)); + ksck_.reset(new Ksck(cluster_)); + } + + virtual void TearDown() OVERRIDE { + if (mini_cluster_) { + mini_cluster_->Shutdown(); + mini_cluster_.reset(); + } + KuduTest::TearDown(); + } + + // Writes rows to the table until the continue_writing flag is set to false. + // + // Public for use with boost::bind. + void GenerateRowWritesLoop(CountDownLatch* started_writing, + const AtomicBool& continue_writing, + Promise* promise) { + shared_ptr table; + Status status; + status = client_->OpenTable(kTableName, &table); + if (!status.ok()) { + promise->Set(status); + } + shared_ptr session(client_->NewSession()); + session->SetTimeoutMillis(10000); + status = session->SetFlushMode(KuduSession::MANUAL_FLUSH); + if (!status.ok()) { + promise->Set(status); + } + + for (uint64_t i = 0; continue_writing.Load(); i++) { + gscoped_ptr insert(table->NewInsert()); + GenerateDataForRow(table->schema(), i, &random_, insert->mutable_row()); + status = session->Apply(insert.release()); + if (!status.ok()) { + promise->Set(status); + } + status = session->Flush(); + if (!status.ok()) { + promise->Set(status); + } + started_writing->CountDown(1); + } + promise->Set(Status::OK()); + } + + protected: + // Generate a set of split rows for tablets used in this test. 
+ vector GenerateSplitRows() { + vector split_rows; + vector split_nums = { 33, 66 }; + for (int i : split_nums) { + KuduPartialRow* row = schema_.NewRow(); + CHECK_OK(row->SetInt32(0, i)); + split_rows.push_back(row); + } + return split_rows; + } + + Status GenerateRowWrites(uint64_t num_rows) { + shared_ptr table; + RETURN_NOT_OK(client_->OpenTable(kTableName, &table)); + shared_ptr session(client_->NewSession()); + session->SetTimeoutMillis(10000); + RETURN_NOT_OK(session->SetFlushMode(KuduSession::MANUAL_FLUSH)); + for (uint64_t i = 0; i < num_rows; i++) { + VLOG(1) << "Generating write for row id " << i; + gscoped_ptr insert(table->NewInsert()); + GenerateDataForRow(table->schema(), i, &random_, insert->mutable_row()); + RETURN_NOT_OK(session->Apply(insert.release())); + + if (i > 0 && i % 1000 == 0) { + RETURN_NOT_OK(session->Flush()); + } + } + RETURN_NOT_OK(session->Flush()); + return Status::OK(); + } + + std::shared_ptr ksck_; + shared_ptr client_; + + private: + Sockaddr master_rpc_addr_; + std::shared_ptr mini_cluster_; + client::KuduSchema schema_; + shared_ptr client_table_; + std::shared_ptr master_; + std::shared_ptr cluster_; + Random random_; +}; + +TEST_F(RemoteKsckTest, TestMasterOk) { + ASSERT_OK(ksck_->CheckMasterRunning()); +} + +TEST_F(RemoteKsckTest, TestTabletServersOk) { + LOG(INFO) << "Fetching table and tablet info..."; + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + LOG(INFO) << "Checking tablet servers are running..."; + ASSERT_OK(ksck_->CheckTabletServersRunning()); +} + +TEST_F(RemoteKsckTest, TestTableConsistency) { + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(30)); + Status s; + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + s = ksck_->CheckTablesConsistency(); + if (s.ok()) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_OK(s); +} + +TEST_F(RemoteKsckTest, TestChecksum) { + uint64_t 
num_writes = 100; + LOG(INFO) << "Generating row writes..."; + ASSERT_OK(GenerateRowWrites(num_writes)); + + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromSeconds(30)); + Status s; + while (MonoTime::Now(MonoTime::FINE).ComesBefore(deadline)) { + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + s = ksck_->ChecksumData(vector(), + vector(), + ChecksumOptions(MonoDelta::FromSeconds(1), 16, false, 0)); + if (s.ok()) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(10)); + } + ASSERT_OK(s); +} + +TEST_F(RemoteKsckTest, TestChecksumTimeout) { + uint64_t num_writes = 10000; + LOG(INFO) << "Generating row writes..."; + ASSERT_OK(GenerateRowWrites(num_writes)); + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + // Use an impossibly low timeout value of zero! + Status s = ksck_->ChecksumData(vector(), + vector(), + ChecksumOptions(MonoDelta::FromNanoseconds(0), 16, false, 0)); + ASSERT_TRUE(s.IsTimedOut()) << "Expected TimedOut Status, got: " << s.ToString(); +} + +TEST_F(RemoteKsckTest, TestChecksumSnapshot) { + CountDownLatch started_writing(1); + AtomicBool continue_writing(true); + Promise promise; + scoped_refptr writer_thread; + + Thread::Create("RemoteKsckTest", "TestChecksumSnapshot", + &RemoteKsckTest::GenerateRowWritesLoop, this, + &started_writing, boost::cref(continue_writing), &promise, + &writer_thread); + CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30))); + + uint64_t ts = client_->GetLatestObservedTimestamp(); + MonoTime start(MonoTime::Now(MonoTime::FINE)); + MonoTime deadline = start; + deadline.AddDelta(MonoDelta::FromSeconds(30)); + Status s; + // TODO: We need to loop here because safe time is not yet implemented. + // Remove this loop when that is done. See KUDU-1056. 
+ while (true) { + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + Status s = ksck_->ChecksumData(vector(), vector(), + ChecksumOptions(MonoDelta::FromSeconds(10), 16, true, ts)); + if (s.ok()) break; + if (deadline.ComesBefore(MonoTime::Now(MonoTime::FINE))) break; + SleepFor(MonoDelta::FromMilliseconds(10)); + } + if (!s.ok()) { + LOG(WARNING) << Substitute("Timed out after $0 waiting for ksck to become consistent on TS $1. " + "Status: $2", + MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToString(), + ts, s.ToString()); + EXPECT_OK(s); // To avoid ASAN complaints due to thread reading the CountDownLatch. + } + continue_writing.Store(false); + ASSERT_OK(promise.Get()); + writer_thread->Join(); +} + +// Test that followers & leader wait until safe time to respond to a snapshot +// scan at current timestamp. TODO: Safe time not yet implemented. See KUDU-1056. +TEST_F(RemoteKsckTest, DISABLED_TestChecksumSnapshotCurrentTimestamp) { + CountDownLatch started_writing(1); + AtomicBool continue_writing(true); + Promise promise; + scoped_refptr writer_thread; + + Thread::Create("RemoteKsckTest", "TestChecksumSnapshot", + &RemoteKsckTest::GenerateRowWritesLoop, this, + &started_writing, boost::cref(continue_writing), &promise, + &writer_thread); + CHECK(started_writing.WaitFor(MonoDelta::FromSeconds(30))); + + ASSERT_OK(ksck_->FetchTableAndTabletInfo()); + ASSERT_OK(ksck_->ChecksumData(vector(), vector(), + ChecksumOptions(MonoDelta::FromSeconds(10), 16, true, + ChecksumOptions::kCurrentTimestamp))); + continue_writing.Store(false); + ASSERT_OK(promise.Get()); + writer_thread->Join(); +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc new file mode 100644 index 000000000000..76c464ceec89 --- /dev/null +++ b/src/kudu/tools/ksck_remote.cc @@ -0,0 +1,342 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tools/ksck_remote.h" + +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" + +DEFINE_bool(checksum_cache_blocks, false, "Should the checksum scanners cache the read blocks"); +DEFINE_int64(timeout_ms, 1000 * 60, "RPC timeout in milliseconds"); +DEFINE_int64(tablets_batch_size_max, 100, "How many tablets to get from the Master per RPC"); + +namespace kudu { +namespace tools { + +static const std::string kMessengerName = "ksck"; + +using rpc::Messenger; +using rpc::MessengerBuilder; +using rpc::RpcController; +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; + +MonoDelta GetDefaultTimeout() { + return MonoDelta::FromMilliseconds(FLAGS_timeout_ms); +} + +Status RemoteKsckTabletServer::Connect() const { + tserver::PingRequestPB req; + tserver::PingResponsePB resp; + RpcController rpc; + rpc.set_timeout(GetDefaultTimeout()); + return ts_proxy_->Ping(req, &resp, &rpc); +} + +Status RemoteKsckTabletServer::CurrentTimestamp(uint64_t* timestamp) const { + server::ServerClockRequestPB req; + server::ServerClockResponsePB resp; + RpcController rpc; + 
rpc.set_timeout(GetDefaultTimeout()); + RETURN_NOT_OK(generic_proxy_->ServerClock(req, &resp, &rpc)); + CHECK(resp.has_timestamp()); + *timestamp = resp.timestamp(); + return Status::OK(); +} + +class ChecksumStepper; + +// Simple class to act as a callback in order to collate results from parallel +// checksum scans. +class ChecksumCallbackHandler { + public: + explicit ChecksumCallbackHandler(ChecksumStepper* const stepper) + : stepper(DCHECK_NOTNULL(stepper)) { + } + + // Invoked by an RPC completion callback. Simply calls back into the stepper. + // Then the call to the stepper returns, deletes 'this'. + void Run(); + + private: + ChecksumStepper* const stepper; +}; + +// Simple class to have a "conversation" over multiple requests to a server +// to carry out a multi-part checksum scan. +// If any errors or timeouts are encountered, the checksum operation fails. +// After the ChecksumStepper reports its results to the reporter, it deletes itself. +class ChecksumStepper { + public: + ChecksumStepper(string tablet_id, const Schema& schema, string server_uuid, + ChecksumOptions options, ReportResultCallback callback, + shared_ptr proxy) + : schema_(schema), + tablet_id_(std::move(tablet_id)), + server_uuid_(std::move(server_uuid)), + options_(std::move(options)), + reporter_callback_(std::move(callback)), + proxy_(std::move(proxy)), + call_seq_id_(0), + checksum_(0) { + DCHECK(proxy_); + } + + void Start() { + Status s = SchemaToColumnPBs(schema_, &cols_, + SCHEMA_PB_WITHOUT_IDS | SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES); + if (!s.ok()) { + reporter_callback_.Run(s, 0); + } else { + SendRequest(kNewRequest); + } + } + + void HandleResponse() { + gscoped_ptr deleter(this); + Status s = rpc_.status(); + if (s.ok() && resp_.has_error()) { + s = StatusFromPB(resp_.error().status()); + } + if (!s.ok()) { + reporter_callback_.Run(s, 0); + return; // Deletes 'this'. + } + + DCHECK(resp_.has_checksum()); + checksum_ = resp_.checksum(); + + // Report back with results. 
+ if (!resp_.has_more_results()) { + reporter_callback_.Run(s, checksum_); + return; // Deletes 'this'. + } + + // We're not done scanning yet. Fetch the next chunk. + if (resp_.has_scanner_id()) { + scanner_id_ = resp_.scanner_id(); + } + SendRequest(kContinueRequest); + ignore_result(deleter.release()); // We have more work to do. + } + + private: + enum RequestType { + kNewRequest, + kContinueRequest + }; + + void SendRequest(RequestType type) { + switch (type) { + case kNewRequest: { + req_.set_call_seq_id(call_seq_id_); + req_.mutable_new_request()->mutable_projected_columns()->CopyFrom(cols_); + req_.mutable_new_request()->set_tablet_id(tablet_id_); + req_.mutable_new_request()->set_cache_blocks(FLAGS_checksum_cache_blocks); + if (options_.use_snapshot) { + req_.mutable_new_request()->set_read_mode(READ_AT_SNAPSHOT); + req_.mutable_new_request()->set_snap_timestamp(options_.snapshot_timestamp); + } + rpc_.set_timeout(GetDefaultTimeout()); + break; + } + case kContinueRequest: { + req_.Clear(); + resp_.Clear(); + rpc_.Reset(); + + req_.set_call_seq_id(++call_seq_id_); + DCHECK(!scanner_id_.empty()); + req_.mutable_continue_request()->set_scanner_id(scanner_id_); + req_.mutable_continue_request()->set_previous_checksum(checksum_); + break; + } + default: + LOG(FATAL) << "Unknown type"; + break; + } + gscoped_ptr handler(new ChecksumCallbackHandler(this)); + rpc::ResponseCallback cb = boost::bind(&ChecksumCallbackHandler::Run, handler.get()); + proxy_->ChecksumAsync(req_, &resp_, &rpc_, cb); + ignore_result(handler.release()); + } + + const Schema schema_; + google::protobuf::RepeatedPtrField cols_; + + const string tablet_id_; + const string server_uuid_; + const ChecksumOptions options_; + const ReportResultCallback reporter_callback_; + const shared_ptr proxy_; + + uint32_t call_seq_id_; + string scanner_id_; + uint64_t checksum_; + tserver::ChecksumRequestPB req_; + tserver::ChecksumResponsePB resp_; + RpcController rpc_; +}; + +void 
ChecksumCallbackHandler::Run() { + stepper->HandleResponse(); + delete this; +} + +void RemoteKsckTabletServer::RunTabletChecksumScanAsync( + const string& tablet_id, + const Schema& schema, + const ChecksumOptions& options, + const ReportResultCallback& callback) { + gscoped_ptr stepper( + new ChecksumStepper(tablet_id, schema, uuid(), options, callback, ts_proxy_)); + stepper->Start(); + ignore_result(stepper.release()); // Deletes self on callback. +} + +Status RemoteKsckMaster::Connect() const { + master::PingRequestPB req; + master::PingResponsePB resp; + RpcController rpc; + rpc.set_timeout(GetDefaultTimeout()); + return proxy_->Ping(req, &resp, &rpc); +} + +Status RemoteKsckMaster::Build(const Sockaddr& address, shared_ptr* master) { + shared_ptr messenger; + MessengerBuilder builder(kMessengerName); + RETURN_NOT_OK(builder.Build(&messenger)); + master->reset(new RemoteKsckMaster(address, messenger)); + return Status::OK(); +} + +Status RemoteKsckMaster::RetrieveTabletServers(TSMap* tablet_servers) { + master::ListTabletServersRequestPB req; + master::ListTabletServersResponsePB resp; + RpcController rpc; + + rpc.set_timeout(GetDefaultTimeout()); + RETURN_NOT_OK(proxy_->ListTabletServers(req, &resp, &rpc)); + tablet_servers->clear(); + for (const master::ListTabletServersResponsePB_Entry& e : resp.servers()) { + HostPortPB addr = e.registration().rpc_addresses(0); + vector addresses; + RETURN_NOT_OK(ParseAddressList(HostPort(addr.host(), addr.port()).ToString(), + tserver::TabletServer::kDefaultPort, &addresses)); + shared_ptr ts( + new RemoteKsckTabletServer(e.instance_id().permanent_uuid(), addresses[0], messenger_)); + InsertOrDie(tablet_servers, ts->uuid(), ts); + } + return Status::OK(); +} + +Status RemoteKsckMaster::RetrieveTablesList(vector >* tables) { + master::ListTablesRequestPB req; + master::ListTablesResponsePB resp; + RpcController rpc; + + rpc.set_timeout(GetDefaultTimeout()); + RETURN_NOT_OK(proxy_->ListTables(req, &resp, &rpc)); + if 
(resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + vector > tables_temp; + for (const master::ListTablesResponsePB_TableInfo& info : resp.tables()) { + Schema schema; + int num_replicas; + RETURN_NOT_OK(GetTableInfo(info.name(), &schema, &num_replicas)); + shared_ptr table(new KsckTable(info.name(), schema, num_replicas)); + tables_temp.push_back(table); + } + tables->assign(tables_temp.begin(), tables_temp.end()); + return Status::OK(); +} + +Status RemoteKsckMaster::RetrieveTabletsList(const shared_ptr& table) { + vector > tablets; + bool more_tablets = true; + string last_key; + while (more_tablets) { + GetTabletsBatch(table->name(), &last_key, tablets, &more_tablets); + } + + table->set_tablets(tablets); + return Status::OK(); +} + +Status RemoteKsckMaster::GetTabletsBatch(const string& table_name, + string* last_partition_key, + vector >& tablets, + bool* more_tablets) { + master::GetTableLocationsRequestPB req; + master::GetTableLocationsResponsePB resp; + RpcController rpc; + + req.mutable_table()->set_table_name(table_name); + req.set_max_returned_locations(FLAGS_tablets_batch_size_max); + req.set_partition_key_start(*last_partition_key); + + rpc.set_timeout(GetDefaultTimeout()); + RETURN_NOT_OK(proxy_->GetTableLocations(req, &resp, &rpc)); + for (const master::TabletLocationsPB& locations : resp.tablet_locations()) { + shared_ptr tablet(new KsckTablet(locations.tablet_id())); + vector > replicas; + for (const master::TabletLocationsPB_ReplicaPB& replica : locations.replicas()) { + bool is_leader = replica.role() == consensus::RaftPeerPB::LEADER; + bool is_follower = replica.role() == consensus::RaftPeerPB::FOLLOWER; + replicas.push_back(shared_ptr( + new KsckTabletReplica(replica.ts_info().permanent_uuid(), is_leader, is_follower))); + } + tablet->set_replicas(replicas); + tablets.push_back(tablet); + } + if (resp.tablet_locations_size() != 0) { + *last_partition_key = (resp.tablet_locations().end() - 
1)->partition().partition_key_end(); + } else { + return Status::NotFound(Substitute( + "The Master returned 0 tablets for GetTableLocations of table $0 at start key $1", + table_name, *(last_partition_key))); + } + if (last_partition_key->empty()) { + *more_tablets = false; + } + return Status::OK(); +} + +Status RemoteKsckMaster::GetTableInfo(const string& table_name, Schema* schema, int* num_replicas) { + master::GetTableSchemaRequestPB req; + master::GetTableSchemaResponsePB resp; + RpcController rpc; + + req.mutable_table()->set_table_name(table_name); + + rpc.set_timeout(GetDefaultTimeout()); + RETURN_NOT_OK(proxy_->GetTableSchema(req, &resp, &rpc)); + + RETURN_NOT_OK(SchemaFromPB(resp.schema(), schema)); + *num_replicas = resp.num_replicas(); + return Status::OK(); +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/ksck_remote.h b/src/kudu/tools/ksck_remote.h new file mode 100644 index 000000000000..441403807bc3 --- /dev/null +++ b/src/kudu/tools/ksck_remote.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef KUDU_TOOLS_KSCK_REMOTE_H +#define KUDU_TOOLS_KSCK_REMOTE_H + +#include +#include +#include + +#include "kudu/master/master.h" +#include "kudu/master/master.proxy.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/server_base.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tools/ksck.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/util/net/sockaddr.h" + +namespace kudu { + +class Schema; + +namespace tools { + +// This implementation connects to a Tablet Server via RPC. +class RemoteKsckTabletServer : public KsckTabletServer { + public: + explicit RemoteKsckTabletServer(const std::string& id, + const Sockaddr& address, + const std::shared_ptr& messenger) + : KsckTabletServer(id), + address_(address.ToString()), + messenger_(messenger), + generic_proxy_(new server::GenericServiceProxy(messenger, address)), + ts_proxy_(new tserver::TabletServerServiceProxy(messenger, address)) { + } + + virtual Status Connect() const OVERRIDE; + + virtual Status CurrentTimestamp(uint64_t* timestamp) const OVERRIDE; + + virtual void RunTabletChecksumScanAsync( + const std::string& tablet_id, + const Schema& schema, + const ChecksumOptions& options, + const ReportResultCallback& callback) OVERRIDE; + + + virtual const std::string& address() const OVERRIDE { + return address_; + } + + private: + const std::string address_; + const std::shared_ptr messenger_; + const std::shared_ptr generic_proxy_; + const std::shared_ptr ts_proxy_; +}; + +// This implementation connects to a Master via RPC. 
+class RemoteKsckMaster : public KsckMaster { + public: + + static Status Build(const Sockaddr& address, std::shared_ptr* master); + + virtual ~RemoteKsckMaster() { } + + virtual Status Connect() const OVERRIDE; + + virtual Status RetrieveTabletServers(TSMap* tablet_servers) OVERRIDE; + + virtual Status RetrieveTablesList(std::vector >* tables) OVERRIDE; + + virtual Status RetrieveTabletsList(const std::shared_ptr& table) OVERRIDE; + + private: + + explicit RemoteKsckMaster(const Sockaddr& address, + const std::shared_ptr& messenger) + : messenger_(messenger), + proxy_(new master::MasterServiceProxy(messenger, address)) { + } + + Status GetTableInfo(const std::string& table_name, Schema* schema, int* num_replicas); + + // Used to get a batch of tablets from the master, passing a pointer to the + // seen last key that will be used as the new start key. The + // last_partition_key is updated to point at the new last key that came in + // the batch. + Status GetTabletsBatch(const std::string& table_name, std::string* last_partition_key, + std::vector >& tablets, bool* more_tablets); + + std::shared_ptr messenger_; + std::shared_ptr proxy_; +}; + +} // namespace tools +} // namespace kudu + +#endif // KUDU_TOOLS_KSCK_REMOTE_H diff --git a/src/kudu/tools/kudu-admin-test.cc b/src/kudu/tools/kudu-admin-test.cc new file mode 100644 index 000000000000..87cf951486b9 --- /dev/null +++ b/src/kudu/tools/kudu-admin-test.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Tests for the kudu-admin command-line tool. + +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/integration-tests/ts_itest-base.h" +#include "kudu/util/subprocess.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tools { + +using client::KuduClient; +using client::KuduClientBuilder; +using client::sp::shared_ptr; +using itest::TabletServerMap; +using itest::TServerDetails; +using strings::Substitute; + +static const char* const kAdminToolName = "kudu-admin"; + +class AdminCliTest : public tserver::TabletServerIntegrationTestBase { + protected: + // Figure out where the admin tool is. + string GetAdminToolPath() const; +}; + +string AdminCliTest::GetAdminToolPath() const { + string exe; + CHECK_OK(Env::Default()->GetExecutablePath(&exe)); + string binroot = DirName(exe); + string tool_path = JoinPathSegments(binroot, kAdminToolName); + CHECK(Env::Default()->FileExists(tool_path)) << "kudu-admin tool not found at " << tool_path; + return tool_path; +} + +// Test kudu-admin config change while running a workload. +// 1. Instantiate external mini cluster with 3 TS. +// 2. Create table with 2 replicas. +// 3. Invoke kudu-admin CLI to invoke a config change. +// 4. Wait until the new server bootstraps. +// 5. Profit! 
+TEST_F(AdminCliTest, TestChangeConfig) { + FLAGS_num_tablet_servers = 3; + FLAGS_num_replicas = 2; + + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + BuildAndStart(ts_flags, master_flags); + + vector tservers; + AppendValuesFromMap(tablet_servers_, &tservers); + ASSERT_EQ(FLAGS_num_tablet_servers, tservers.size()); + + TabletServerMap active_tablet_servers; + TabletServerMap::const_iterator iter = tablet_replicas_.find(tablet_id_); + TServerDetails* leader = iter->second; + TServerDetails* follower = (++iter)->second; + InsertOrDie(&active_tablet_servers, leader->uuid(), leader); + InsertOrDie(&active_tablet_servers, follower->uuid(), follower); + + TServerDetails* new_node = nullptr; + for (TServerDetails* ts : tservers) { + if (!ContainsKey(active_tablet_servers, ts->uuid())) { + new_node = ts; + break; + } + } + ASSERT_TRUE(new_node != nullptr); + + // Elect the leader (still only a consensus config size of 2). + ASSERT_OK(StartElection(leader, tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(30), active_tablet_servers, + tablet_id_, 1)); + + TestWorkload workload(cluster_.get()); + workload.set_table_name(kTableId); + workload.set_timeout_allowed(true); + workload.set_write_timeout_millis(10000); + workload.set_num_replicas(FLAGS_num_replicas); + workload.set_num_write_threads(1); + workload.set_write_batch_size(1); + workload.Setup(); + workload.Start(); + + // Wait until the Master knows about the leader tserver. 
+ TServerDetails* master_observed_leader; + ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &master_observed_leader)); + ASSERT_EQ(leader->uuid(), master_observed_leader->uuid()); + + LOG(INFO) << "Adding tserver with uuid " << new_node->uuid() << " as VOTER..."; + string exe_path = GetAdminToolPath(); + string arg_str = Substitute("$0 -master_addresses $1 change_config $2 ADD_SERVER $3 VOTER", + exe_path, + cluster_->master()->bound_rpc_addr().ToString(), + tablet_id_, new_node->uuid()); + ASSERT_OK(Subprocess::Call(arg_str)); + + InsertOrDie(&active_tablet_servers, new_node->uuid(), new_node); + ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(active_tablet_servers.size(), + leader, tablet_id_, + MonoDelta::FromSeconds(10))); + + workload.StopAndJoin(); + int num_batches = workload.batches_completed(); + + LOG(INFO) << "Waiting for replicas to agree..."; + // Wait for all servers to replicate everything up through the last write op. + // Since we don't batch, there should be at least # rows inserted log entries, + // plus the initial leader's no-op, plus 1 for + // the added replica for a total == #rows + 2. + int min_log_index = num_batches + 2; + ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(30), + active_tablet_servers, tablet_id_, + min_log_index)); + + int rows_inserted = workload.rows_inserted(); + LOG(INFO) << "Number of rows inserted: " << rows_inserted; + + ClusterVerifier v(cluster_.get()); + NO_FATALS(v.CheckCluster()); + NO_FATALS(v.CheckRowCount(kTableId, ClusterVerifier::AT_LEAST, rows_inserted)); + + // Now remove the server once again. 
+ LOG(INFO) << "Removing tserver with uuid " << new_node->uuid() << " from the config..."; + arg_str = Substitute("$0 -master_addresses $1 change_config $2 REMOVE_SERVER $3", + exe_path, + cluster_->master()->bound_rpc_addr().ToString(), + tablet_id_, new_node->uuid()); + + ASSERT_OK(Subprocess::Call(arg_str)); + + ASSERT_EQ(1, active_tablet_servers.erase(new_node->uuid())); + ASSERT_OK(WaitUntilCommittedConfigNumVotersIs(active_tablet_servers.size(), + leader, tablet_id_, + MonoDelta::FromSeconds(10))); +} + +TEST_F(AdminCliTest, TestDeleteTable) { + FLAGS_num_tablet_servers = 1; + FLAGS_num_replicas = 1; + + vector ts_flags, master_flags; + BuildAndStart(ts_flags, master_flags); + string master_address = cluster_->master()->bound_rpc_addr().ToString(); + + shared_ptr client; + CHECK_OK(KuduClientBuilder() + .add_master_server_addr(master_address) + .Build(&client)); + + // Default table that gets created; + string table_name = "TestTable"; + + string exe_path = GetAdminToolPath(); + string arg_str = Substitute("$0 -master_addresses $1 delete_table $2", + exe_path, + master_address, + table_name); + + ASSERT_OK(Subprocess::Call(arg_str)); + + vector tables; + ASSERT_OK(client->ListTables(&tables)); + ASSERT_TRUE(tables.empty()); +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/kudu-admin.cc b/src/kudu/tools/kudu-admin.cc new file mode 100644 index 000000000000..9d81dfd3c823 --- /dev/null +++ b/src/kudu/tools/kudu-admin.cc @@ -0,0 +1,426 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Tool to administer a cluster from the CLI. + +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.h" +#include "kudu/master/master.pb.h" +#include "kudu/master/master.proxy.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/env.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/string_case.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc_controller.h" + +DEFINE_string(master_addresses, "localhost", + "Comma-separated list of Kudu Master server addresses"); +DEFINE_int64(timeout_ms, 1000 * 60, "RPC timeout in milliseconds"); + +#define EXIT_NOT_OK_PREPEND(status, msg) \ + do { \ + Status _s = (status); \ + if (PREDICT_FALSE(!_s.ok())) { \ + std::cerr << _s.CloneAndPrepend(msg).ToString() << std::endl; \ + google::ShowUsageWithFlagsRestrict(g_progname, __FILE__); \ + exit(1); \ + } \ + } while (0) + +namespace kudu { +namespace tools { + +using std::ostringstream; +using std::string; +using std::vector; + +using google::protobuf::RepeatedPtrField; + +using client::KuduClient; +using client::KuduClientBuilder; +using client::KuduTabletServer; +using consensus::ConsensusServiceProxy; +using consensus::RaftPeerPB; +using 
master::ListTabletServersRequestPB; +using master::ListTabletServersResponsePB; +using master::MasterServiceProxy; +using master::TabletLocationsPB; +using master::TSInfoPB; +using rpc::Messenger; +using rpc::MessengerBuilder; +using rpc::RpcController; +using strings::Split; +using strings::Substitute; + +const char* const kChangeConfigOp = "change_config"; +const char* const kListTablesOp = "list_tables"; +const char* const kDeleteTableOp = "delete_table"; +static const char* g_progname = nullptr; + +class ClusterAdminClient { + public: + // Creates an admin client for host/port combination e.g., + // "localhost" or "127.0.0.1:7050". + ClusterAdminClient(std::string addrs, int64_t timeout_millis); + + // Initialized the client and connects to the specified tablet + // server. + Status Init(); + + // Change the configuration of the specified tablet. + Status ChangeConfig(const string& tablet_id, + const string& change_type, + const string& peer_uuid, + const boost::optional& member_type); + + // List all the tables. + Status ListTables(); + + // Delete a single table by name. + Status DeleteTable(const string& table_name); + + private: + // Fetch the locations of the replicas for a given tablet from the Master. + Status GetTabletLocations(const std::string& tablet_id, + TabletLocationsPB* locations); + + // Fetch information about the location of the tablet leader from the Master. + Status GetTabletLeader(const std::string& tablet_id, TSInfoPB* ts_info); + + // Fetch the latest list of tablet servers from the Master. + Status ListTabletServers(RepeatedPtrField* servers); + + // Look up the RPC address of the server with the specified UUID from the Master. 
+ Status GetFirstRpcAddressForTS(const std::string& uuid, HostPort* hp); + + const std::string master_addr_list_; + const MonoDelta timeout_; + + bool initted_; + std::shared_ptr messenger_; + gscoped_ptr master_proxy_; + client::sp::shared_ptr kudu_client_; + + DISALLOW_COPY_AND_ASSIGN(ClusterAdminClient); +}; + +ClusterAdminClient::ClusterAdminClient(string addrs, int64_t timeout_millis) + : master_addr_list_(std::move(addrs)), + timeout_(MonoDelta::FromMilliseconds(timeout_millis)), + initted_(false) {} + +Status ClusterAdminClient::Init() { + CHECK(!initted_); + + // Build master proxy. + // TODO: Support multi-master by adding replica lookup support to the client. + vector master_addr_strings = Split(master_addr_list_, ","); + if (master_addr_strings.size() != 1) { + return Status::InvalidArgument("This tool does not yet support multiple masters. " + "Please specify only the leader master address in " + "-master_addresses"); + } + + HostPort master_hostport; + RETURN_NOT_OK(master_hostport.ParseString(master_addr_strings[0], + master::Master::kDefaultPort)); + MessengerBuilder builder("kudu-admin"); + RETURN_NOT_OK(builder.Build(&messenger_)); + vector master_addrs; + RETURN_NOT_OK(master_hostport.ResolveAddresses(&master_addrs)); + CHECK(!master_addrs.empty()) << "Unable to resolve IP address for master host: " + << master_hostport.ToString(); + master_proxy_.reset(new MasterServiceProxy(messenger_, master_addrs[0])); + + CHECK_OK(KuduClientBuilder() + .add_master_server_addr(master_addr_list_) + .default_admin_operation_timeout(timeout_) + .Build(&kudu_client_)); + + initted_ = true; + return Status::OK(); +} + +Status ClusterAdminClient::ChangeConfig(const string& tablet_id, + const string& change_type, + const string& peer_uuid, + const boost::optional& member_type) { + CHECK(initted_); + + // Parse the change type. 
+ consensus::ChangeConfigType cc_type = consensus::UNKNOWN_CHANGE; + string uppercase_change_type; + ToUpperCase(change_type, &uppercase_change_type); + if (!consensus::ChangeConfigType_Parse(uppercase_change_type, &cc_type) || + cc_type == consensus::UNKNOWN_CHANGE) { + return Status::InvalidArgument("Unsupported change_type", change_type); + } + + RaftPeerPB peer_pb; + peer_pb.set_permanent_uuid(peer_uuid); + + // Parse the optional fields. + if (member_type) { + RaftPeerPB::MemberType member_type_val; + string uppercase_member_type; + ToUpperCase(*member_type, &uppercase_member_type); + if (!RaftPeerPB::MemberType_Parse(uppercase_member_type, &member_type_val)) { + return Status::InvalidArgument("Unrecognized member_type", *member_type); + } + peer_pb.set_member_type(member_type_val); + } + + // Validate the existence of the optional fields. + if (!member_type && (cc_type == consensus::ADD_SERVER || cc_type == consensus::CHANGE_ROLE)) { + return Status::InvalidArgument("Must specify member_type when adding " + "a server or changing a role"); + } + + // Look up RPC address of peer if adding as a new server. + if (cc_type == consensus::ADD_SERVER) { + HostPort host_port; + RETURN_NOT_OK(GetFirstRpcAddressForTS(peer_uuid, &host_port)); + RETURN_NOT_OK(HostPortToPB(host_port, peer_pb.mutable_last_known_addr())); + } + + // Look up the location of the tablet leader from the Master. 
+ TSInfoPB leader_ts_info; + RETURN_NOT_OK(GetTabletLeader(tablet_id, &leader_ts_info)); + CHECK_GT(leader_ts_info.rpc_addresses_size(), 0) << leader_ts_info.ShortDebugString(); + + HostPort leader_hostport; + RETURN_NOT_OK(HostPortFromPB(leader_ts_info.rpc_addresses(0), &leader_hostport)); + vector leader_addrs; + RETURN_NOT_OK(leader_hostport.ResolveAddresses(&leader_addrs)); + CHECK(!leader_addrs.empty()) << "Unable to resolve IP address for tablet leader host: " + << leader_hostport.ToString(); + gscoped_ptr consensus_proxy( + new ConsensusServiceProxy(messenger_, leader_addrs[0])); + + consensus::ChangeConfigRequestPB req; + consensus::ChangeConfigResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout_); + + req.set_dest_uuid(leader_ts_info.permanent_uuid()); + req.set_tablet_id(tablet_id); + req.set_type(cc_type); + *req.mutable_server() = peer_pb; + + RETURN_NOT_OK(consensus_proxy->ChangeConfig(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + return Status::OK(); +} + +Status ClusterAdminClient::GetTabletLocations(const string& tablet_id, + TabletLocationsPB* locations) { + rpc::RpcController rpc; + rpc.set_timeout(timeout_); + master::GetTabletLocationsRequestPB req; + *req.add_tablet_ids() = tablet_id; + master::GetTabletLocationsResponsePB resp; + RETURN_NOT_OK(master_proxy_->GetTabletLocations(req, &resp, &rpc)); + + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + if (resp.errors_size() > 0) { + // This tool only needs to support one-by-one requests for tablet + // locations, so we only look at the first error. + return StatusFromPB(resp.errors(0).status()); + } + + // Same as above, no batching, and we already got past the error checks. 
+ CHECK_EQ(1, resp.tablet_locations_size()) << resp.ShortDebugString(); + + *locations = resp.tablet_locations(0); + return Status::OK(); +} + +Status ClusterAdminClient::GetTabletLeader(const string& tablet_id, + TSInfoPB* ts_info) { + TabletLocationsPB locations; + RETURN_NOT_OK(GetTabletLocations(tablet_id, &locations)); + CHECK_EQ(tablet_id, locations.tablet_id()) << locations.ShortDebugString(); + bool found = false; + for (const TabletLocationsPB::ReplicaPB& replica : locations.replicas()) { + if (replica.role() == RaftPeerPB::LEADER) { + *ts_info = replica.ts_info(); + found = true; + break; + } + } + if (!found) { + return Status::NotFound("No leader replica found for tablet", tablet_id); + } + return Status::OK(); +} + +Status ClusterAdminClient::ListTabletServers( + RepeatedPtrField* servers) { + + rpc::RpcController rpc; + rpc.set_timeout(timeout_); + ListTabletServersRequestPB req; + ListTabletServersResponsePB resp; + RETURN_NOT_OK(master_proxy_->ListTabletServers(req, &resp, &rpc)); + + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + servers->Swap(resp.mutable_servers()); + return Status::OK(); +} + +Status ClusterAdminClient::GetFirstRpcAddressForTS(const std::string& uuid, HostPort* hp) { + RepeatedPtrField servers; + RETURN_NOT_OK(ListTabletServers(&servers)); + for (const ListTabletServersResponsePB::Entry& server : servers) { + if (server.instance_id().permanent_uuid() == uuid) { + if (!server.has_registration() || server.registration().rpc_addresses_size() == 0) { + break; + } + RETURN_NOT_OK(HostPortFromPB(server.registration().rpc_addresses(0), hp)); + return Status::OK(); + } + } + + return Status::NotFound(Substitute("Server with UUID $0 has no RPC address " + "registered with the Master", uuid)); +} + +Status ClusterAdminClient::ListTables() { + vector tables; + RETURN_NOT_OK(kudu_client_->ListTables(&tables)); + for (const string& table : tables) { + std::cout << table << std::endl; + } + return Status::OK(); 
+} + +Status ClusterAdminClient::DeleteTable(const string& table_name) { + vector tables; + RETURN_NOT_OK(kudu_client_->DeleteTable(table_name)); + std::cout << "Deleted table " << table_name << std::endl; + return Status::OK(); +} + +static void SetUsage(const char* argv0) { + ostringstream str; + + str << argv0 << " [-master_addresses server1,server2,server3] \n" + << " must be one of:\n" + << " " << kChangeConfigOp << " " + << " " + << "[VOTER|NON_VOTER]" << std::endl + << " " << kListTablesOp << std::endl + << " " << kDeleteTableOp << " "; + google::SetUsageMessage(str.str()); +} + +static string GetOp(int argc, char** argv) { + if (argc < 2) { + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + exit(1); + } + + return argv[1]; +} + +static int ClusterAdminCliMain(int argc, char** argv) { + g_progname = argv[0]; + FLAGS_logtostderr = 1; + SetUsage(argv[0]); + ParseCommandLineFlags(&argc, &argv, true); + InitGoogleLoggingSafe(argv[0]); + const string addrs = FLAGS_master_addresses; + + string op = GetOp(argc, argv); + + ClusterAdminClient client(addrs, FLAGS_timeout_ms); + + EXIT_NOT_OK_PREPEND(client.Init(), "Unable to establish connection to " + addrs); + + if (op == kChangeConfigOp) { + if (argc < 5) { + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + exit(1); + } + string tablet_id = argv[2]; + string change_type = argv[3]; + string peer_uuid = argv[4]; + boost::optional member_type; + if (argc > 5) { + member_type = argv[5]; + } + Status s = client.ChangeConfig(tablet_id, change_type, peer_uuid, member_type); + if (!s.ok()) { + std::cerr << "Unable to change config: " << s.ToString() << std::endl; + return 1; + } + } else if (op == kListTablesOp) { + Status s = client.ListTables(); + if (!s.ok()) { + std::cerr << "Unable to list tables: " << s.ToString() << std::endl; + return 1; + } + } else if (op == kDeleteTableOp) { + if (argc < 3) { + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + exit(1); + } + string table_name = argv[2]; + 
Status s = client.DeleteTable(table_name); + if (!s.ok()) { + std::cerr << "Unable to delete table " << table_name << ": " << s.ToString() << std::endl; + return 1; + } + } else { + std::cerr << "Invalid operation: " << op << std::endl; + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + exit(1); + } + + return 0; +} + +} // namespace tools +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::tools::ClusterAdminCliMain(argc, argv); +} diff --git a/src/kudu/tools/kudu-ksck.cc b/src/kudu/tools/kudu-ksck.cc new file mode 100644 index 000000000000..a663750d2d36 --- /dev/null +++ b/src/kudu/tools/kudu-ksck.cc @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Command line tool to run Ksck against a cluster. Defaults to running against a local Master +// on the default RPC port. It verifies that all the reported Tablet Servers are running and that +// the tablets are in a consistent state. 
+ +#include +#include +#include + +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tools/ksck_remote.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" + +#define PUSH_PREPEND_NOT_OK(s, statuses, msg) do { \ + ::kudu::Status _s = (s); \ + if (PREDICT_FALSE(!_s.ok())) { \ + statuses->push_back(string(msg) + ": " + _s.ToString()); \ + } \ +} while (0); + +using std::cerr; +using std::cout; +using std::endl; +using std::shared_ptr; +using std::vector; +using strings::Substitute; + +DEFINE_string(master_address, "", + "Address of master server to run against."); + +DEFINE_bool(checksum_scan, false, + "Perform a checksum scan on data in the cluster."); + +DEFINE_string(tables, "", + "Tables to check (comma-separated list of names). " + "If not specified, checks all tables."); + +DEFINE_string(tablets, "", + "Tablets to check (comma-separated list of IDs) " + "If not specified, checks all tablets."); + +namespace kudu { +namespace tools { + +static string GetKsckUsage(const char* progname) { + string msg = Substitute("Usage: $0 --master_address= \n\n", progname); + msg += "Check the health of a Kudu cluster.\n\n" + "By default, ksck checks that master and tablet server processes are running,\n" + "and that table metadata is consistent. Use the 'checksum' flag to check that\n" + "tablet data is consistent (also see the 'tables' and 'tablets' flags below).\n" + "Use the 'checksum_snapshot' along with 'checksum' if the table or tablets are\n" + "actively receiving inserts or updates."; + return msg; +} + +// Run ksck. +// Error information is appended to the provided vector. +// If the vector is empty upon completion, ksck ran successfully. 
+static void RunKsck(vector* error_messages) { + vector master_addrs; + PUSH_PREPEND_NOT_OK(ParseAddressList(FLAGS_master_address, + master::Master::kDefaultPort, + &master_addrs), + error_messages, "Unable to parse master address"); + + shared_ptr master; + PUSH_PREPEND_NOT_OK(RemoteKsckMaster::Build(master_addrs[0], &master), + error_messages, "Unable to build KsckMaster"); + if (!error_messages->empty()) return; + shared_ptr cluster(new KsckCluster(master)); + shared_ptr ksck(new Ksck(cluster)); + + // This is required for everything below. + PUSH_PREPEND_NOT_OK(ksck->CheckMasterRunning(), error_messages, + "Master aliveness check error"); + if (!error_messages->empty()) return; + + // This is also required for everything below. + PUSH_PREPEND_NOT_OK(ksck->FetchTableAndTabletInfo(), error_messages, + "Error fetching the cluster metadata from the Master server"); + if (!error_messages->empty()) return; + + PUSH_PREPEND_NOT_OK(ksck->CheckTabletServersRunning(), error_messages, + "Tablet server aliveness check error"); + + // TODO: Add support for tables / tablets filter in the consistency check. + PUSH_PREPEND_NOT_OK(ksck->CheckTablesConsistency(), error_messages, + "Table consistency check error"); + + if (FLAGS_checksum_scan) { + vector tables = strings::Split(FLAGS_tables, ",", strings::SkipEmpty()); + vector tablets = strings::Split(FLAGS_tablets, ",", strings::SkipEmpty()); + PUSH_PREPEND_NOT_OK(ksck->ChecksumData(tables, tablets, ChecksumOptions()), + error_messages, "Checksum scan error"); + } +} + +} // namespace tools +} // namespace kudu + +int main(int argc, char** argv) { + google::SetUsageMessage(kudu::tools::GetKsckUsage(argv[0])); + if (argc < 2) { + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + exit(1); + } + kudu::ParseCommandLineFlags(&argc, &argv, true); + FLAGS_logtostderr = true; + kudu::InitGoogleLoggingSafe(argv[0]); + + vector error_messages; + kudu::tools::RunKsck(&error_messages); + + // All good. 
+ if (error_messages.empty()) { + cout << "OK" << endl; + return 0; + } + + // Something went wrong. + cerr << "==================" << endl; + cerr << "Errors:" << endl; + cerr << "==================" << endl; + for (const string& s : error_messages) { + cerr << s << endl; + } + cerr << endl; + cerr << "FAILED" << endl; + return 1; +} diff --git a/src/kudu/tools/kudu-ts-cli-test.cc b/src/kudu/tools/kudu-ts-cli-test.cc new file mode 100644 index 000000000000..2a65de03d15a --- /dev/null +++ b/src/kudu/tools/kudu-ts-cli-test.cc @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Tests for the kudu-admin command-line tool. 
+ +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/integration-tests/cluster_itest_util.h" +#include "kudu/integration-tests/external_mini_cluster-itest-base.h" +#include "kudu/integration-tests/test_workload.h" +#include "kudu/util/path_util.h" +#include "kudu/util/subprocess.h" + +using boost::assign::list_of; +using kudu::itest::TabletServerMap; +using kudu::itest::TServerDetails; +using strings::Split; +using strings::Substitute; + +namespace kudu { +namespace tools { + +static const char* const kTsCliToolName = "kudu-ts-cli"; + +class KuduTsCliTest : public ExternalMiniClusterITestBase { + protected: + // Figure out where the admin tool is. + string GetTsCliToolPath() const; +}; + +string KuduTsCliTest::GetTsCliToolPath() const { + string exe; + CHECK_OK(Env::Default()->GetExecutablePath(&exe)); + string binroot = DirName(exe); + string tool_path = JoinPathSegments(binroot, kTsCliToolName); + CHECK(Env::Default()->FileExists(tool_path)) << "kudu-admin tool not found at " << tool_path; + return tool_path; +} + +// Test deleting a tablet. +TEST_F(KuduTsCliTest, TestDeleteTablet) { + MonoDelta timeout = MonoDelta::FromSeconds(30); + vector ts_flags, master_flags; + ts_flags.push_back("--enable_leader_failure_detection=false"); + master_flags.push_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); + NO_FATALS(StartCluster(ts_flags, master_flags)); + + TestWorkload workload(cluster_.get()); + workload.Setup(); // Easy way to create a new tablet. 
+ + vector tablets; + for (const itest::TabletServerMap::value_type& entry : ts_map_) { + TServerDetails* ts = entry.second; + ASSERT_OK(itest::WaitForNumTabletsOnTS(ts, 1, timeout, &tablets)); + } + string tablet_id = tablets[0].tablet_status().tablet_id(); + + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + ASSERT_OK(itest::WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], + tablet_id, timeout)); + } + + string exe_path = GetTsCliToolPath(); + vector argv; + argv.push_back(exe_path); + argv.push_back("--server_address"); + argv.push_back(cluster_->tablet_server(0)->bound_rpc_addr().ToString()); + argv.push_back("delete_tablet"); + argv.push_back(tablet_id); + argv.push_back("Deleting for kudu-ts-cli-test"); + ASSERT_OK(Subprocess::Call(argv)); + + ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(0, tablet_id, tablet::TABLET_DATA_TOMBSTONED)); + TServerDetails* ts = ts_map_[cluster_->tablet_server(0)->uuid()]; + ASSERT_OK(itest::WaitUntilTabletInState(ts, tablet_id, tablet::SHUTDOWN, timeout)); +} + +} // namespace tools +} // namespace kudu diff --git a/src/kudu/tools/parse_debug_refcounted.pl b/src/kudu/tools/parse_debug_refcounted.pl new file mode 100755 index 000000000000..03ee9afefe59 --- /dev/null +++ b/src/kudu/tools/parse_debug_refcounted.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl +###################################################################### +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +###################################################################### +# Tool to parse the output of "debug" refcounted objects. +# This is helpful for tracking down ref-counting leaks. +# Generate compatible logs by making your refcounted object inherit +# from kudu::DebugRefCountedThreadSafe. +###################################################################### +use strict; +use warnings; + +my $verbose = 0; +while ($ARGV[0] =~ /^-v/) { + shift; + $verbose++; +} +my $infile = shift; + +if (!defined $infile || $infile eq '-h' || $infile eq '-help' || $infile eq '--help') { + die "Usage: $0 [-v] input.log\n"; +} + +open(FILE, "< $infile") or die "Error: unable to open input file ($infile) for read: $!"; +my $content = ''; +read(FILE, $content, -s FILE) or die "Error: unable to read input file ($infile): $!"; +close FILE; + +my @lines = split /\n/, $content; +chomp @lines; + +# First, find all the objects we are interested in. +# The really interesting ones have a mismatch in their Inc/Dec counts. + +my @incdec = grep { /Incremented ref|Decrementing ref/ } @lines; +my %counts = (); # map of address -> final ref count +foreach my $line (@incdec) { + if ($line =~ /(Incremented|Decrementing) ref on (0x[a-z0-9]+):/) { + my $op = $1; + my $addr = $2; + if ($op eq 'Incremented') { + $counts{$addr}++; + } else { + $counts{$addr}--; + } + } +} + +my @bad_addrs = (); +foreach my $addr (sort keys %counts) { + if ($counts{$addr} != 0) { + push @bad_addrs, $addr; + } +} + +# Print all the interesting stack traces of the relevant addresses. 
+foreach my $addr (@bad_addrs) { + print "Address $addr has bad ref count: " . ($counts{$addr}) . "\n"; + # Parse the stack traces: + # Find a debug line and grab thru the next two lines that contain "kudu", + # which should be kudu::DebugRefCountedThreadSafe frame through the likely + # "interesting" frame where the scoped_refptr was instantiated or destroyed. + my @matches = $content =~ /([^\n]*(?:Incremented|Decrementing) ref on $addr:\n.*?[^\n]+kudu[^\n]+\n.*?[^\n]+kudu[^\n]+\n)/smg; + foreach (@matches) { + if ($verbose) { + # In "verbose" mode, print a slightly larger trace. + print; + } else { + # Regular mode... we just try to get the frame that caused the ref change. + chomp; + s/\n.*\n//sm; # Only keep the first and last lines. + s/\s+\@\s+/ /; # Make it look a little nicer. + print "$_\n"; + } + } +} diff --git a/src/kudu/tools/pbc-dump.cc b/src/kudu/tools/pbc-dump.cc new file mode 100644 index 000000000000..33f0e3c50c89 --- /dev/null +++ b/src/kudu/tools/pbc-dump.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/env.h" +#include "kudu/util/flags.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/status.h" + +using kudu::Status; +using std::cerr; +using std::endl; +using std::string; + +DEFINE_bool(oneline, false, "print each protobuf on a single line"); +TAG_FLAG(oneline, stable); + +namespace kudu { +namespace pb_util { + +Status DumpPBContainerFile(const string& filename) { + Env* env = Env::Default(); + gscoped_ptr reader; + RETURN_NOT_OK(env->NewRandomAccessFile(filename, &reader)); + ReadablePBContainerFile pb_reader(reader.Pass()); + RETURN_NOT_OK(pb_reader.Init()); + RETURN_NOT_OK(pb_reader.Dump(&std::cout, FLAGS_oneline)); + + return Status::OK(); +} + +} // namespace pb_util +} // namespace kudu + +int main(int argc, char **argv) { + kudu::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + if (argc != 2) { + cerr << "usage: " << argv[0] << " [--oneline] " << endl; + return 2; + } + + Status s = kudu::pb_util::DumpPBContainerFile(argv[1]); + if (s.ok()) { + return 0; + } else { + cerr << s.ToString() << endl; + return 1; + } +} diff --git a/src/kudu/tools/trace_io.stp b/src/kudu/tools/trace_io.stp new file mode 100755 index 000000000000..dab377af75eb --- /dev/null +++ b/src/kudu/tools/trace_io.stp @@ -0,0 +1,242 @@ +#!/usr/bin/stap +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Systemtap script for tracing the filesystem-related system calls issued +// by a particular process, optionally scoped to within a particular path +// subtree (i.e. a path and all paths below it). +// +// Effectively like 'strace -fT', except that fds are automatically resolved +// to file names. +// +// Invoke as follows: +// - stap trace_io.stp -x [path] +// To start tracing an existing process. +// - stap trace_io.stp -c [path] +// To run a command and trace it. +// +// See README.systemtap for prerequisite information. + +// Maps (pid, fd) ==> filename. +global fds + +// Maps (pid, filename) ==> reference_count. +global references + +// Path to filter all filenames against. +// +// Sadly, SystemTap doesn't provide a way to test if a string starts with a +// prefix that isn't a string literal. Regex matching requires that the +// regex be a string literal (not a variable), and the "string library" only +// provides a substring matching method. We could write our own in embedded +// C, but that means the script must be run in "unsafe" mode. Given those +// options we use substring matching, opening us up to false positives. +global filter_path + +global running + +// Add a reference to [fd, filename] for the current process. +function start_tracking (fd, filename) { + fds[pid(), fd] = filename + if ([pid(), filename] in references) { + references[pid(), filename]++ + } else { + references[pid(), filename] = 1 + } +} + +// Remove a reference from [fd, filename] for the current process. 
Returns +// 1 if the pair has no more references, 0 otherwise. +function stop_tracking (fd, filename) { + delete fds[pid(), fd] + + references[pid(), filename]-- + if (references[pid(), filename] == 0) { + delete references[pid(), filename] + return 1 + } + return 0 +} + +probe begin { + filter_path = argc ? argv[1] : "" + running = 0 + + if (target()) { + printf("[%d] %s: beginning tracing of PID %d with path \"%s\"\n", + pid(), ctime(gettimeofday_s()), target(), filter_path) + running = 1 + } else { + printf("Need to supply a target on the stap command line\n") + printf("Pass either -x (to trace an existing process)") + printf(" or -c (to trace a new one)\n"); + exit() + } +} + +probe end { + if (running) { + printf("[%d] %s: ending tracing of PID %d\n", + pid(), ctime(gettimeofday_s()), target()) + running = 0 + } +} + +probe syscall.open.return { + // The target() corresponds to the pid provided to stap(1) via -x, or the + // pid of the command run via -c. + // + // See 'man stap' for more details. + filename = user_string_quoted($filename) + if (pid() == target() && isinstr(filename, filter_path)) { + if ($flags & 64) { + argstr = sprintf("%s, %s, %#o", filename, _sys_open_flag_str($flags), + $mode) + } else { + argstr = sprintf("%s, %s", filename, _sys_open_flag_str($flags)) + } + if ($return >= 0) { + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: open %s (%d us)\n", + tid(), ctime(gettimeofday_s()), argstr, time) + start_tracking($return, filename) + } + } +} + +probe syscall.close.return { + if ([pid(), $fd] in fds) { + filename = fds[pid(), $fd] + if (stop_tracking($fd, filename)) { + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: close %s (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, time) + } + } +} + +probe syscall.dup.return { + oldfd = $fildes + newfd = $return + if (newfd >= 0) { + if ([pid(), oldfd] in fds) { + // oldfd is duplicated to newfd, update the tracking. 
+ start_tracking(newfd, fds[pid(), oldfd]) + } + } +} + +// For some reason probes on syscall.dup2 never trigger, but probes on the +// raw kernel function do. +probe kernel.function("sys_dup2").return { + oldfd = $oldfd + newfd = $return + if (newfd >= 0 && newfd != oldfd) { + // If newfd is already open, dup2() closes it. + if ([pid(), newfd] in fds) { + stop_tracking(newfd, fds[pid(), newfd]) + } + + // oldfd is duplicated to newfd. + if ([pid(), oldfd] in fds) { + start_tracking(newfd, fds[pid(), oldfd]) + } + } +} + +// The constants here can be found in linux/fs.h, but are hard-coded to +// avoid usage of embedded C in systemtap. +// +// See `man 2 sync_file_range` for more information. +function _sys_sync_file_range_flag_str:string (f:long) { + retval = "" + if (f & 1) { + retval = retval . "|WAIT_BEFORE" + } + if (f & 2) { + retval = retval . "|WRITE" + } + if (f & 4) { + retval = retval . "|WAIT_AFTER" + } + if (retval == "") { + return "0" + } else { + // Trim the extra pipe at the beginning. + return substr(retval, 1, strlen(retval) - 1) + } +} + +// As of Ubuntu 14.04 we have to refer to sync_file_range by kernel function +// name because there's no syscall alias defined. 
+probe kernel.function("sys_sync_file_range").return { + if ([pid(), $fd] in fds) { + filename = fds[pid(), $fd] + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: sync_file_range %s %d %d %s -> %d (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, $offset, $nbytes, + _sys_sync_file_range_flag_str($flags), $return, time) + } +} + +probe syscall.read.return { + if ([pid(), $fd] in fds) { + filename = fds[pid(), $fd] + argstr = sprintf("%p, %d", $buf, $count) + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: read %s %s -> %d (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, argstr, $return, time) + } +} + +probe syscall.write.return { + if ([pid(), $fd] in fds) { + filename = fds[pid(), $fd] + argstr = sprintf("%p, %d", $buf, $count) + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: write %s %s -> %d (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, argstr, $return, time) + } +} + +probe syscall.fsync.return { + if ([pid(), $fd] in fds) { + filename = fds[pid(), $fd] + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: fsync %s -> %d (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, $return, time) + } +} + +probe syscall.fdatasync.return { + if ([pid(), $fd] in fds) { + filename = fds[pid(), $fd] + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: fdatasync %s -> %d (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, $return, time) + } +} + +probe syscall.unlink.return { + filename = user_string_quoted($pathname) + if (pid() == target() && isinstr(filename, filter_path)) { + time = gettimeofday_us() - @entry(gettimeofday_us()) + printf("[%d] %s: unlink %s -> %d (%d us)\n", + tid(), ctime(gettimeofday_s()), filename, $return, time) + } +} diff --git a/src/kudu/tools/ts-cli.cc b/src/kudu/tools/ts-cli.cc new file mode 100644 index 000000000000..80bcda33eec7 --- /dev/null +++ b/src/kudu/tools/ts-cli.cc @@ -0,0 +1,493 @@ +// Licensed 
to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Tool to query tablet server operational data + +#include +#include +#include +#include +#include + +#include "kudu/client/row_result.h" +#include "kudu/client/scanner-internal.h" +#include "kudu/common/partition.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/env.h" +#include "kudu/util/faststring.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/rpc_controller.h" + +using kudu::client::KuduRowResult; +using kudu::HostPort; +using kudu::rpc::Messenger; +using kudu::rpc::MessengerBuilder; +using kudu::rpc::RpcController; +using kudu::server::ServerStatusPB; +using kudu::Sockaddr; +using kudu::client::KuduScanBatch; +using kudu::tablet::TabletStatusPB; +using kudu::tserver::DeleteTabletRequestPB; +using 
kudu::tserver::DeleteTabletResponsePB; +using kudu::tserver::ListTabletsRequestPB; +using kudu::tserver::ListTabletsResponsePB; +using kudu::tserver::NewScanRequestPB; +using kudu::tserver::ScanRequestPB; +using kudu::tserver::ScanResponsePB; +using kudu::tserver::TabletServerAdminServiceProxy; +using kudu::tserver::TabletServerServiceProxy; +using std::ostringstream; +using std::shared_ptr; +using std::string; +using std::vector; + +const char* const kListTabletsOp = "list_tablets"; +const char* const kAreTabletsRunningOp = "are_tablets_running"; +const char* const kSetFlagOp = "set_flag"; +const char* const kDumpTabletOp = "dump_tablet"; +const char* const kDeleteTabletOp = "delete_tablet"; +const char* const kCurrentTimestamp = "current_timestamp"; +const char* const kStatus = "status"; + +DEFINE_string(server_address, "localhost", + "Address of server to run against"); +DEFINE_int64(timeout_ms, 1000 * 60, "RPC timeout in milliseconds"); + +DEFINE_bool(force, false, "If true, allows the set_flag command to set a flag " + "which is not explicitly marked as runtime-settable. Such flag changes may be " + "simply ignored on the server, or may cause the server to crash."); + +// Check that the value of argc matches what's expected, otherwise return a +// non-zero exit code. Should be used in main(). +#define CHECK_ARGC_OR_RETURN_WITH_USAGE(op, expected) \ + do { \ + const string& _op = (op); \ + const int _expected = (expected); \ + if (argc != _expected) { \ + /* We substract 2 from _expected because we don't want to count argv[0] or [1]. */ \ + std::cerr << "Invalid number of arguments for " << _op \ + << ": expected " << (_expected - 2) << " arguments" << std::endl; \ + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); \ + return 2; \ + } \ + } while (0); + +// Invoke 'to_call' and check its result. If it failed, print 'to_prepend' and +// the error to cerr and return a non-zero exit code. Should be used in main(). 
+#define RETURN_NOT_OK_PREPEND_FROM_MAIN(to_call, to_prepend) \ + do { \ + ::kudu::Status s = (to_call); \ + if (!s.ok()) { \ + std::cerr << (to_prepend) << ": " << s.ToString() << std::endl; \ + return 1; \ + } \ + } while (0); + +namespace kudu { +namespace tools { + +typedef ListTabletsResponsePB::StatusAndSchemaPB StatusAndSchemaPB; + +class TsAdminClient { + public: + // Creates an admin client for host/port combination e.g., + // "localhost" or "127.0.0.1:7050". + TsAdminClient(std::string addr, int64_t timeout_millis); + + // Initialized the client and connects to the specified tablet + // server. + Status Init(); + + // Sets 'tablets' a list of status information for all tablets on a + // given tablet server. + Status ListTablets(std::vector* tablets); + + + // Sets the gflag 'flag' to 'val' on the remote server via RPC. + // If 'force' is true, allows setting flags even if they're not marked as + // safe to change at runtime. + Status SetFlag(const string& flag, const string& val, + bool force); + + // Get the schema for the given tablet. + Status GetTabletSchema(const std::string& tablet_id, SchemaPB* schema); + + // Dump the contents of the given tablet, in key order, to the console. + Status DumpTablet(const std::string& tablet_id); + + // Delete a tablet replica from the specified peer. + // The 'reason' string is passed to the tablet server, used for logging. + Status DeleteTablet(const std::string& tablet_id, + const std::string& reason); + + // Sets timestamp to the value of the tablet server's current timestamp. 
+ Status CurrentTimestamp(uint64_t* timestamp); + + // Get the server status + Status GetStatus(ServerStatusPB* pb); + private: + std::string addr_; + vector addrs_; + MonoDelta timeout_; + bool initted_; + shared_ptr generic_proxy_; + gscoped_ptr ts_proxy_; + gscoped_ptr ts_admin_proxy_; + shared_ptr messenger_; + + DISALLOW_COPY_AND_ASSIGN(TsAdminClient); +}; + +TsAdminClient::TsAdminClient(string addr, int64_t timeout_millis) + : addr_(std::move(addr)), + timeout_(MonoDelta::FromMilliseconds(timeout_millis)), + initted_(false) {} + +Status TsAdminClient::Init() { + CHECK(!initted_); + + HostPort host_port; + RETURN_NOT_OK(host_port.ParseString(addr_, tserver::TabletServer::kDefaultPort)); + MessengerBuilder builder("ts-cli"); + RETURN_NOT_OK(builder.Build(&messenger_)); + + RETURN_NOT_OK(host_port.ResolveAddresses(&addrs_)) + + generic_proxy_.reset(new server::GenericServiceProxy(messenger_, addrs_[0])); + ts_proxy_.reset(new TabletServerServiceProxy(messenger_, addrs_[0])); + ts_admin_proxy_.reset(new TabletServerAdminServiceProxy(messenger_, addrs_[0])); + + initted_ = true; + + VLOG(1) << "Connected to " << addr_; + + return Status::OK(); +} + +Status TsAdminClient::ListTablets(vector* tablets) { + CHECK(initted_); + + ListTabletsRequestPB req; + ListTabletsResponsePB resp; + RpcController rpc; + + rpc.set_timeout(timeout_); + RETURN_NOT_OK(ts_proxy_->ListTablets(req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + tablets->assign(resp.status_and_schema().begin(), resp.status_and_schema().end()); + + return Status::OK(); +} + +Status TsAdminClient::SetFlag(const string& flag, const string& val, + bool force) { + server::SetFlagRequestPB req; + server::SetFlagResponsePB resp; + RpcController rpc; + + rpc.set_timeout(timeout_); + req.set_flag(flag); + req.set_value(val); + req.set_force(force); + + RETURN_NOT_OK(generic_proxy_->SetFlag(req, &resp, &rpc)); + switch (resp.result()) { + case 
server::SetFlagResponsePB::SUCCESS: + return Status::OK(); + case server::SetFlagResponsePB::NOT_SAFE: + return Status::RemoteError(resp.msg() + " (use --force flag to allow anyway)"); + default: + return Status::RemoteError(resp.ShortDebugString()); + } +} + +Status TsAdminClient::GetTabletSchema(const std::string& tablet_id, + SchemaPB* schema) { + VLOG(1) << "Fetching schema for tablet " << tablet_id; + vector tablets; + RETURN_NOT_OK(ListTablets(&tablets)); + for (const StatusAndSchemaPB& pair : tablets) { + if (pair.tablet_status().tablet_id() == tablet_id) { + *schema = pair.schema(); + return Status::OK(); + } + } + return Status::NotFound("Cannot find tablet", tablet_id); +} + +Status TsAdminClient::DumpTablet(const std::string& tablet_id) { + SchemaPB schema_pb; + RETURN_NOT_OK(GetTabletSchema(tablet_id, &schema_pb)); + Schema schema; + RETURN_NOT_OK(SchemaFromPB(schema_pb, &schema)); + kudu::client::KuduSchema client_schema(schema); + + ScanRequestPB req; + ScanResponsePB resp; + + NewScanRequestPB* new_req = req.mutable_new_scan_request(); + RETURN_NOT_OK(SchemaToColumnPBs( + schema, new_req->mutable_projected_columns(), + SCHEMA_PB_WITHOUT_IDS | SCHEMA_PB_WITHOUT_STORAGE_ATTRIBUTES)); + new_req->set_tablet_id(tablet_id); + new_req->set_cache_blocks(false); + new_req->set_order_mode(ORDERED); + new_req->set_read_mode(READ_AT_SNAPSHOT); + + vector rows; + while (true) { + RpcController rpc; + rpc.set_timeout(timeout_); + RETURN_NOT_OK_PREPEND(ts_proxy_->Scan(req, &resp, &rpc), + "Scan() failed"); + + if (resp.has_error()) { + return Status::IOError("Failed to read: ", resp.error().ShortDebugString()); + } + + rows.clear(); + KuduScanBatch::Data results; + RETURN_NOT_OK(results.Reset(&rpc, + &schema, + &client_schema, + make_gscoped_ptr(resp.release_data()))); + results.ExtractRows(&rows); + for (const KuduRowResult& r : rows) { + std::cout << r.ToString() << std::endl; + } + + // The first response has a scanner ID. 
We use this for all subsequent + // responses. + if (resp.has_scanner_id()) { + req.set_scanner_id(resp.scanner_id()); + req.clear_new_scan_request(); + } + req.set_call_seq_id(req.call_seq_id() + 1); + if (!resp.has_more_results()) { + break; + } + } + return Status::OK(); +} + +Status TsAdminClient::DeleteTablet(const string& tablet_id, + const string& reason) { + ServerStatusPB status_pb; + RETURN_NOT_OK(GetStatus(&status_pb)); + + DeleteTabletRequestPB req; + DeleteTabletResponsePB resp; + RpcController rpc; + + req.set_tablet_id(tablet_id); + req.set_dest_uuid(status_pb.node_instance().permanent_uuid()); + req.set_reason(reason); + req.set_delete_type(tablet::TABLET_DATA_TOMBSTONED); + rpc.set_timeout(timeout_); + RETURN_NOT_OK_PREPEND(ts_admin_proxy_->DeleteTablet(req, &resp, &rpc), + "DeleteTablet() failed"); + + if (resp.has_error()) { + return Status::IOError("Failed to delete tablet: ", + resp.error().ShortDebugString()); + } + return Status::OK(); +} + +Status TsAdminClient::CurrentTimestamp(uint64_t* timestamp) { + server::ServerClockRequestPB req; + server::ServerClockResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout_); + RETURN_NOT_OK(generic_proxy_->ServerClock(req, &resp, &rpc)); + CHECK(resp.has_timestamp()) << resp.DebugString(); + *timestamp = resp.timestamp(); + return Status::OK(); +} + +Status TsAdminClient::GetStatus(ServerStatusPB* pb) { + server::GetStatusRequestPB req; + server::GetStatusResponsePB resp; + RpcController rpc; + rpc.set_timeout(timeout_); + RETURN_NOT_OK(generic_proxy_->GetStatus(req, &resp, &rpc)); + CHECK(resp.has_status()) << resp.DebugString(); + pb->Swap(resp.mutable_status()); + return Status::OK(); +} + +namespace { + +void SetUsage(const char* argv0) { + ostringstream str; + + str << argv0 << " [--server_address=] \n" + << " must be one of:\n" + << " " << kListTabletsOp << "\n" + << " " << kAreTabletsRunningOp << "\n" + << " " << kSetFlagOp << " [-force] \n" + << " " << kDumpTabletOp << " \n" + << " " 
<< kDeleteTabletOp << " \n" + << " " << kCurrentTimestamp << "\n" + << " " << kStatus; + google::SetUsageMessage(str.str()); +} + +string GetOp(int argc, char** argv) { + if (argc < 2) { + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + exit(1); + } + + return argv[1]; +} + +} // anonymous namespace + +static int TsCliMain(int argc, char** argv) { + FLAGS_logtostderr = 1; + SetUsage(argv[0]); + ParseCommandLineFlags(&argc, &argv, true); + InitGoogleLoggingSafe(argv[0]); + const string addr = FLAGS_server_address; + + string op = GetOp(argc, argv); + + TsAdminClient client(addr, FLAGS_timeout_ms); + + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.Init(), + "Unable to establish connection to " + addr); + + // TODO add other operations here... + if (op == kListTabletsOp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 2); + + vector tablets; + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.ListTablets(&tablets), + "Unable to list tablets on " + addr); + for (const StatusAndSchemaPB& status_and_schema : tablets) { + Schema schema; + RETURN_NOT_OK_PREPEND_FROM_MAIN(SchemaFromPB(status_and_schema.schema(), &schema), + "Unable to deserialize schema from " + addr); + PartitionSchema partition_schema; + RETURN_NOT_OK_PREPEND_FROM_MAIN(PartitionSchema::FromPB(status_and_schema.partition_schema(), + schema, &partition_schema), + "Unable to deserialize partition schema from " + addr); + + + TabletStatusPB ts = status_and_schema.tablet_status(); + + Partition partition; + Partition::FromPB(ts.partition(), &partition); + + string state = tablet::TabletStatePB_Name(ts.state()); + std::cout << "Tablet id: " << ts.tablet_id() << std::endl; + std::cout << "State: " << state << std::endl; + std::cout << "Table name: " << ts.table_name() << std::endl; + std::cout << "Partition: " << partition_schema.PartitionDebugString(partition, schema) + << std::endl; + if (ts.has_estimated_on_disk_size()) { + std::cout << "Estimated on disk size: " << + 
HumanReadableNumBytes::ToString(ts.estimated_on_disk_size()) << std::endl; + } + std::cout << "Schema: " << schema.ToString() << std::endl; + } + } else if (op == kAreTabletsRunningOp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 2); + + vector tablets; + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.ListTablets(&tablets), + "Unable to list tablets on " + addr); + bool all_running = true; + for (const StatusAndSchemaPB& status_and_schema : tablets) { + TabletStatusPB ts = status_and_schema.tablet_status(); + if (ts.state() != tablet::RUNNING) { + std::cout << "Tablet id: " << ts.tablet_id() << " is " + << tablet::TabletStatePB_Name(ts.state()) << std::endl; + all_running = false; + } + } + + if (all_running) { + std::cout << "All tablets are running" << std::endl; + } else { + std::cout << "Not all tablets are running" << std::endl; + return 1; + } + } else if (op == kSetFlagOp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 4); + + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.SetFlag(argv[2], argv[3], FLAGS_force), + "Unable to set flag"); + + } else if (op == kDumpTabletOp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 3); + + string tablet_id = argv[2]; + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.DumpTablet(tablet_id), + "Unable to dump tablet"); + } else if (op == kDeleteTabletOp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 4); + + string tablet_id = argv[2]; + string reason = argv[3]; + + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.DeleteTablet(tablet_id, reason), + "Unable to delete tablet"); + } else if (op == kCurrentTimestamp) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 2); + + uint64_t timestamp; + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.CurrentTimestamp(×tamp), + "Unable to get timestamp"); + std::cout << timestamp << std::endl; + } else if (op == kStatus) { + CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 2); + + ServerStatusPB status; + RETURN_NOT_OK_PREPEND_FROM_MAIN(client.GetStatus(&status), + "Unable to get status"); + std::cout << status.DebugString() << std::endl; + } else { + std::cerr << "Invalid 
operation: " << op << std::endl; + google::ShowUsageWithFlagsRestrict(argv[0], __FILE__); + return 2; + } + + return 0; +} + +} // namespace tools +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::tools::TsCliMain(argc, argv); +} diff --git a/src/kudu/tserver/CMakeLists.txt b/src/kudu/tserver/CMakeLists.txt new file mode 100644 index 000000000000..6b104b2b04a7 --- /dev/null +++ b/src/kudu/tserver/CMakeLists.txt @@ -0,0 +1,172 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +######################################### +# remote_bootstrap_proto +######################################### + +KRPC_GENERATE( + REMOTE_BOOTSTRAP_KRPC_SRCS REMOTE_BOOTSTRAP_KRPC_HDRS REMOTE_BOOTSTRAP_KRPC_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES remote_bootstrap.proto) +set(REMOTE_BOOTSTRAP_KRPC_LIBS + consensus_proto + krpc + protobuf + rpc_header_proto + tablet_proto + wire_protocol_proto) +ADD_EXPORTABLE_LIBRARY(remote_bootstrap_proto + SRCS ${REMOTE_BOOTSTRAP_KRPC_SRCS} + DEPS ${REMOTE_BOOTSTRAP_KRPC_LIBS} + NONLINK_DEPS ${REMOTE_BOOTSTRAP_KRPC_TGTS}) + +######################################### +# tserver_proto +######################################### + +PROTOBUF_GENERATE_CPP( + TSERVER_PROTO_SRCS TSERVER_PROTO_HDRS TSERVER_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES tserver.proto) +set(TSERVER_PROTO_LIBS + kudu_common_proto + consensus_metadata_proto + tablet_proto + wire_protocol_proto) +ADD_EXPORTABLE_LIBRARY(tserver_proto + SRCS ${TSERVER_PROTO_SRCS} + DEPS ${TSERVER_PROTO_LIBS} + NONLINK_DEPS ${TSERVER_PROTO_TGTS}) + +######################################### +# tserver_admin_proto +######################################### + +KRPC_GENERATE( + TSERVER_ADMIN_KRPC_SRCS TSERVER_ADMIN_KRPC_HDRS TSERVER_ADMIN_KRPC_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES tserver_admin.proto) +set(TSERVER_ADMIN_KRPC_LIBS + krpc + protobuf + rpc_header_proto + tserver_proto + wire_protocol_proto) +ADD_EXPORTABLE_LIBRARY(tserver_admin_proto + SRCS ${TSERVER_ADMIN_KRPC_SRCS} + DEPS ${TSERVER_ADMIN_KRPC_LIBS} + NONLINK_DEPS ${TSERVER_ADMIN_KRPC_TGTS}) + +######################################### +# tserver_service_proto +######################################### + +KRPC_GENERATE( + TSERVER_KRPC_SRCS TSERVER_KRPC_HDRS TSERVER_KRPC_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. 
+ PROTO_FILES tserver_service.proto) +set(TSERVER_KRPC_LIBS + krpc + kudu_common_proto + protobuf + remote_bootstrap_proto + rpc_header_proto + tserver_proto + wire_protocol_proto) +ADD_EXPORTABLE_LIBRARY(tserver_service_proto + SRCS ${TSERVER_KRPC_SRCS} + DEPS ${TSERVER_KRPC_LIBS} + NONLINK_DEPS ${TSERVER_KRPC_TGTS}) + +######################################### +# tserver +######################################### + +set(TSERVER_SRCS + heartbeater.cc + mini_tablet_server.cc + remote_bootstrap_client.cc + remote_bootstrap_service.cc + remote_bootstrap_session.cc + scanner_metrics.cc + scanners.cc + tablet_server.cc + tablet_server_options.cc + tablet_service.cc + ts_tablet_manager.cc + tserver-path-handlers.cc +) + +add_library(tserver ${TSERVER_SRCS}) +target_link_libraries(tserver + protobuf + tserver_proto + tserver_admin_proto + tserver_service_proto + remote_bootstrap_proto + master_rpc + master_proto + consensus_proto + log_proto + log + consensus + krpc + server_common + server_process + tablet) + +######################################### +# kudu-tserver +######################################### + +add_executable(kudu-tserver tablet_server_main.cc) +target_link_libraries(kudu-tserver + tserver + ${KUDU_BASE_LIBS}) + +######################################### +# tserver_test_util +######################################### + +set(TSERVER_TEST_UTIL_SRCS + tablet_server_test_util.cc +) + +add_library(tserver_test_util ${TSERVER_TEST_UTIL_SRCS}) +target_link_libraries(tserver_test_util + tserver) + +######################################### +# tserver tests +######################################### + +set(KUDU_TEST_LINK_LIBS + tserver + tserver_test_util + ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(remote_bootstrap_client-test) +ADD_KUDU_TEST(remote_bootstrap_session-test) +ADD_KUDU_TEST(remote_bootstrap_service-test) +ADD_KUDU_TEST(tablet_server-test) +ADD_KUDU_TEST(tablet_server-stress-test RUN_SERIAL true) +ADD_KUDU_TEST(scanners-test) 
+ADD_KUDU_TEST(ts_tablet_manager-test) diff --git a/src/kudu/tserver/heartbeater.cc b/src/kudu/tserver/heartbeater.cc new file mode 100644 index 000000000000..e70c31a3e22f --- /dev/null +++ b/src/kudu/tserver/heartbeater.cc @@ -0,0 +1,464 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tserver/heartbeater.h" + +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.h" +#include "kudu/master/master_rpc.h" +#include "kudu/master/master.proxy.h" +#include "kudu/server/webserver.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/tablet_server_options.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/thread.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/status.h" + +DEFINE_int32(heartbeat_rpc_timeout_ms, 15000, + "Timeout used for the TS->Master heartbeat RPCs."); +TAG_FLAG(heartbeat_rpc_timeout_ms, advanced); + +DEFINE_int32(heartbeat_interval_ms, 1000, + "Interval at which the TS heartbeats to the master."); +TAG_FLAG(heartbeat_interval_ms, advanced); + +DEFINE_int32(heartbeat_max_failures_before_backoff, 3, + "Maximum number of consecutive heartbeat failures until the " + "Tablet Server backs off to the normal heartbeat interval, " + "rather than retrying."); +TAG_FLAG(heartbeat_max_failures_before_backoff, advanced); + +using google::protobuf::RepeatedPtrField; +using kudu::HostPortPB; +using kudu::consensus::RaftPeerPB; +using kudu::master::GetLeaderMasterRpc; +using kudu::master::ListMastersResponsePB; +using kudu::master::Master; +using kudu::master::MasterServiceProxy; +using kudu::rpc::RpcController; +using std::shared_ptr; +using strings::Substitute; + +namespace kudu { +namespace tserver { + +namespace { + +// Creates a proxy to 'hostport'. 
+Status MasterServiceProxyForHostPort(const HostPort& hostport, + const shared_ptr& messenger, + gscoped_ptr* proxy) { + vector addrs; + RETURN_NOT_OK(hostport.ResolveAddresses(&addrs)); + if (addrs.size() > 1) { + LOG(WARNING) << "Master address '" << hostport.ToString() << "' " + << "resolves to " << addrs.size() << " different addresses. Using " + << addrs[0].ToString(); + } + proxy->reset(new MasterServiceProxy(messenger, addrs[0])); + return Status::OK(); +} + +} // anonymous namespace + +// Most of the actual logic of the heartbeater is inside this inner class, +// to avoid having too many dependencies from the header itself. +// +// This is basically the "PIMPL" pattern. +class Heartbeater::Thread { + public: + Thread(const TabletServerOptions& opts, TabletServer* server); + + Status Start(); + Status Stop(); + void TriggerASAP(); + + private: + void RunThread(); + Status FindLeaderMaster(const MonoTime& deadline, + HostPort* leader_hostport); + Status ConnectToMaster(); + int GetMinimumHeartbeatMillis() const; + int GetMillisUntilNextHeartbeat() const; + Status DoHeartbeat(); + Status SetupRegistration(master::TSRegistrationPB* reg); + void SetupCommonField(master::TSToMasterCommonPB* common); + bool IsCurrentThread() const; + + // The hosts/ports of masters that we may heartbeat to. + // + // We keep the HostPort around rather than a Sockaddr because the + // masters may change IP addresses, and we'd like to re-resolve on + // every new attempt at connecting. + vector master_addrs_; + + // Index of the master we last succesfully obtained the master + // consensus configuration information from. + int last_locate_master_idx_; + + // The server for which we are heartbeating. + TabletServer* const server_; + + // The actual running thread (NULL before it is started) + scoped_refptr thread_; + + // Host and port of the most recent leader master. + HostPort leader_master_hostport_; + + // Current RPC proxy to the leader master. 
+ gscoped_ptr proxy_; + + // The most recent response from a heartbeat. + master::TSHeartbeatResponsePB last_hb_response_; + + // True once at least one heartbeat has been sent. + bool has_heartbeated_; + + // The number of heartbeats which have failed in a row. + // This is tracked so as to back-off heartbeating. + int consecutive_failed_heartbeats_; + + // Mutex/condition pair to trigger the heartbeater thread + // to either heartbeat early or exit. + Mutex mutex_; + ConditionVariable cond_; + + // Protected by mutex_. + bool should_run_; + bool heartbeat_asap_; + + DISALLOW_COPY_AND_ASSIGN(Thread); +}; + +//////////////////////////////////////////////////////////// +// Heartbeater +//////////////////////////////////////////////////////////// + +Heartbeater::Heartbeater(const TabletServerOptions& opts, TabletServer* server) + : thread_(new Thread(opts, server)) { +} +Heartbeater::~Heartbeater() { + WARN_NOT_OK(Stop(), "Unable to stop heartbeater thread"); +} + +Status Heartbeater::Start() { return thread_->Start(); } +Status Heartbeater::Stop() { return thread_->Stop(); } +void Heartbeater::TriggerASAP() { thread_->TriggerASAP(); } + +//////////////////////////////////////////////////////////// +// Heartbeater::Thread +//////////////////////////////////////////////////////////// + +Heartbeater::Thread::Thread(const TabletServerOptions& opts, TabletServer* server) + : master_addrs_(opts.master_addresses), + last_locate_master_idx_(0), + server_(server), + has_heartbeated_(false), + consecutive_failed_heartbeats_(0), + cond_(&mutex_), + should_run_(false), + heartbeat_asap_(false) { + CHECK(!master_addrs_.empty()); +} + +namespace { +void LeaderMasterCallback(HostPort* dst_hostport, + Synchronizer* sync, + const Status& status, + const HostPort& result) { + if (status.ok()) { + *dst_hostport = result; + } + sync->StatusCB(status); +} +} // anonymous namespace + +Status Heartbeater::Thread::FindLeaderMaster(const MonoTime& deadline, + HostPort* leader_hostport) { + 
Status s = Status::OK(); + if (master_addrs_.size() == 1) { + // "Shortcut" the process when a single master is specified. + *leader_hostport = master_addrs_[0]; + return Status::OK(); + } + vector master_sock_addrs; + for (const HostPort& master_addr : master_addrs_) { + vector addrs; + Status s = master_addr.ResolveAddresses(&addrs); + if (!s.ok()) { + LOG(WARNING) << "Unable to resolve address '" << master_addr.ToString() + << "': " << s.ToString(); + continue; + } + if (addrs.size() > 1) { + LOG(WARNING) << "Master address '" << master_addr.ToString() << "' " + << "resolves to " << addrs.size() << " different addresses. Using " + << addrs[0].ToString(); + } + master_sock_addrs.push_back(addrs[0]); + } + if (master_sock_addrs.empty()) { + return Status::NotFound("unable to resolve any of the master addresses!"); + } + Synchronizer sync; + scoped_refptr rpc(new GetLeaderMasterRpc( + Bind(&LeaderMasterCallback, + leader_hostport, + &sync), + master_sock_addrs, + deadline, + server_->messenger())); + rpc->SendRpc(); + return sync.Wait(); +} + +Status Heartbeater::Thread::ConnectToMaster() { + vector addrs; + MonoTime deadline = MonoTime::Now(MonoTime::FINE); + deadline.AddDelta(MonoDelta::FromMilliseconds(FLAGS_heartbeat_rpc_timeout_ms)); + // TODO send heartbeats without tablet reports to non-leader masters. + RETURN_NOT_OK(FindLeaderMaster(deadline, &leader_master_hostport_)); + gscoped_ptr new_proxy; + MasterServiceProxyForHostPort(leader_master_hostport_, + server_->messenger(), + &new_proxy); + RETURN_NOT_OK(leader_master_hostport_.ResolveAddresses(&addrs)); + + // Ping the master to verify that it's alive. 
+ master::PingRequestPB req; + master::PingResponsePB resp; + RpcController rpc; + rpc.set_timeout(MonoDelta::FromMilliseconds(FLAGS_heartbeat_rpc_timeout_ms)); + RETURN_NOT_OK_PREPEND(new_proxy->Ping(req, &resp, &rpc), + Substitute("Failed to ping master at $0", addrs[0].ToString())); + LOG(INFO) << "Connected to a leader master server at " << leader_master_hostport_.ToString(); + proxy_.reset(new_proxy.release()); + return Status::OK(); +} + +void Heartbeater::Thread::SetupCommonField(master::TSToMasterCommonPB* common) { + common->mutable_ts_instance()->CopyFrom(server_->instance_pb()); +} + +Status Heartbeater::Thread::SetupRegistration(master::TSRegistrationPB* reg) { + reg->Clear(); + + vector addrs; + RETURN_NOT_OK(CHECK_NOTNULL(server_->rpc_server())->GetBoundAddresses(&addrs)); + RETURN_NOT_OK_PREPEND(AddHostPortPBs(addrs, reg->mutable_rpc_addresses()), + "Failed to add RPC addresses to registration"); + + addrs.clear(); + RETURN_NOT_OK_PREPEND(CHECK_NOTNULL(server_->web_server())->GetBoundAddresses(&addrs), + "Unable to get bound HTTP addresses"); + RETURN_NOT_OK_PREPEND(AddHostPortPBs(addrs, reg->mutable_http_addresses()), + "Failed to add HTTP addresses to registration"); + return Status::OK(); +} + +int Heartbeater::Thread::GetMinimumHeartbeatMillis() const { + // If we've failed a few heartbeats in a row, back off to the normal + // interval, rather than retrying in a loop. + if (consecutive_failed_heartbeats_ == FLAGS_heartbeat_max_failures_before_backoff) { + LOG(WARNING) << "Failed " << consecutive_failed_heartbeats_ <<" heartbeats " + << "in a row: no longer allowing fast heartbeat attempts."; + } + + return consecutive_failed_heartbeats_ > FLAGS_heartbeat_max_failures_before_backoff ? + FLAGS_heartbeat_interval_ms : 0; +} + +int Heartbeater::Thread::GetMillisUntilNextHeartbeat() const { + // When we first start up, heartbeat immediately. 
+ if (!has_heartbeated_) { + return GetMinimumHeartbeatMillis(); + } + + // If the master needs something from us, we should immediately + // send another heartbeat with that info, rather than waiting for the interval. + if (last_hb_response_.needs_reregister() || + last_hb_response_.needs_full_tablet_report()) { + return GetMinimumHeartbeatMillis(); + } + + return FLAGS_heartbeat_interval_ms; +} + +Status Heartbeater::Thread::DoHeartbeat() { + if (PREDICT_FALSE(server_->fail_heartbeats_for_tests())) { + return Status::IOError("failing all heartbeats for tests"); + } + + CHECK(IsCurrentThread()); + + if (!proxy_) { + VLOG(1) << "No valid master proxy. Connecting..."; + RETURN_NOT_OK(ConnectToMaster()); + DCHECK(proxy_); + } + + master::TSHeartbeatRequestPB req; + + SetupCommonField(req.mutable_common()); + if (last_hb_response_.needs_reregister()) { + LOG(INFO) << "Registering TS with master..."; + RETURN_NOT_OK_PREPEND(SetupRegistration(req.mutable_registration()), + "Unable to set up registration"); + } + + if (last_hb_response_.needs_full_tablet_report()) { + LOG(INFO) << "Sending a full tablet report to master..."; + server_->tablet_manager()->GenerateFullTabletReport( + req.mutable_tablet_report()); + } else { + VLOG(2) << "Sending an incremental tablet report to master..."; + server_->tablet_manager()->GenerateIncrementalTabletReport( + req.mutable_tablet_report()); + } + req.set_num_live_tablets(server_->tablet_manager()->GetNumLiveTablets()); + + RpcController rpc; + rpc.set_timeout(MonoDelta::FromSeconds(10)); + + VLOG(2) << "Sending heartbeat:\n" << req.DebugString(); + master::TSHeartbeatResponsePB resp; + RETURN_NOT_OK_PREPEND(proxy_->TSHeartbeat(req, &resp, &rpc), + "Failed to send heartbeat"); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + VLOG(2) << "Received heartbeat response:\n" << resp.DebugString(); + if (!resp.leader_master()) { + // If the master is no longer a leader, reset proxy so that we can + // determine 
the master and attempt to heartbeat during in the + // next heartbeat interval. + proxy_.reset(); + return Status::ServiceUnavailable("master is no longer the leader"); + } + last_hb_response_.Swap(&resp); + + + // TODO: Handle TSHeartbeatResponsePB (e.g. deleted tablets and schema changes) + server_->tablet_manager()->MarkTabletReportAcknowledged(req.tablet_report()); + + return Status::OK(); +} + +void Heartbeater::Thread::RunThread() { + CHECK(IsCurrentThread()); + VLOG(1) << "Heartbeat thread starting"; + + // Set up a fake "last heartbeat response" which indicates that we + // need to register -- since we've never registered before, we know + // this to be true. This avoids an extra + // heartbeat/response/heartbeat cycle. + last_hb_response_.set_needs_reregister(true); + last_hb_response_.set_needs_full_tablet_report(true); + + while (true) { + MonoTime next_heartbeat = MonoTime::Now(MonoTime::FINE); + next_heartbeat.AddDelta(MonoDelta::FromMilliseconds(GetMillisUntilNextHeartbeat())); + + // Wait for either the heartbeat interval to elapse, or for an "ASAP" heartbeat, + // or for the signal to shut down. + { + MutexLock l(mutex_); + while (true) { + MonoDelta remaining = next_heartbeat.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + if (remaining.ToMilliseconds() <= 0 || + heartbeat_asap_ || + !should_run_) { + break; + } + cond_.TimedWait(remaining); + } + + heartbeat_asap_ = false; + + if (!should_run_) { + VLOG(1) << "Heartbeat thread finished"; + return; + } + } + + Status s = DoHeartbeat(); + if (!s.ok()) { + LOG(WARNING) << "Failed to heartbeat to " << leader_master_hostport_.ToString() + << ": " << s.ToString(); + consecutive_failed_heartbeats_++; + if (master_addrs_.size() > 1) { + // If we encountered a network error (e.g., connection + // refused) and there's more than one master available, try + // determining the leader master again. 
+ if (s.IsNetworkError() || + consecutive_failed_heartbeats_ == FLAGS_heartbeat_max_failures_before_backoff) { + proxy_.reset(); + } + } + continue; + } + consecutive_failed_heartbeats_ = 0; + has_heartbeated_ = true; + } +} + +bool Heartbeater::Thread::IsCurrentThread() const { + return thread_.get() == kudu::Thread::current_thread(); +} + +Status Heartbeater::Thread::Start() { + CHECK(thread_ == nullptr); + + should_run_ = true; + return kudu::Thread::Create("heartbeater", "heartbeat", + &Heartbeater::Thread::RunThread, this, &thread_); +} + +Status Heartbeater::Thread::Stop() { + if (!thread_) { + return Status::OK(); + } + + { + MutexLock l(mutex_); + should_run_ = false; + cond_.Signal(); + } + RETURN_NOT_OK(ThreadJoiner(thread_.get()).Join()); + thread_ = nullptr; + return Status::OK(); +} + +void Heartbeater::Thread::TriggerASAP() { + MutexLock l(mutex_); + heartbeat_asap_ = true; + cond_.Signal(); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/heartbeater.h b/src/kudu/tserver/heartbeater.h new file mode 100644 index 000000000000..2529ce28b203 --- /dev/null +++ b/src/kudu/tserver/heartbeater.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_HEARTBEATER_H +#define KUDU_TSERVER_HEARTBEATER_H + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace tserver { + +class TabletServer; +struct TabletServerOptions; + +// Component of the Tablet Server which is responsible for heartbeating to the +// leader master. +// +// TODO: send heartbeats to non-leader masters. +class Heartbeater { + public: + Heartbeater(const TabletServerOptions& options, TabletServer* server); + Status Start(); + Status Stop(); + + // Trigger a heartbeat as soon as possible, even if the normal + // heartbeat interval has not expired. + void TriggerASAP(); + + ~Heartbeater(); + + private: + class Thread; + gscoped_ptr thread_; + DISALLOW_COPY_AND_ASSIGN(Heartbeater); +}; + +} // namespace tserver +} // namespace kudu +#endif /* KUDU_TSERVER_HEARTBEATER_H */ diff --git a/src/kudu/tserver/mini_tablet_server.cc b/src/kudu/tserver/mini_tablet_server.cc new file mode 100644 index 000000000000..6f835adfa644 --- /dev/null +++ b/src/kudu/tserver/mini_tablet_server.cc @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tserver/mini_tablet_server.h" + +#include + +#include + +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/metadata.h" +#include "kudu/server/rpc_server.h" +#include "kudu/server/webserver.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/tablet-test-util.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/log.pb.h" +#include "kudu/consensus/consensus.h" +#include "kudu/consensus/consensus.pb.h" +#include "kudu/consensus/local_consensus.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +using std::pair; + +using kudu::consensus::Consensus; +using kudu::consensus::ConsensusOptions; +using kudu::consensus::OpId; +using kudu::consensus::RaftPeerPB; +using kudu::consensus::RaftConfigPB; +using kudu::log::Log; +using kudu::log::LogOptions; +using strings::Substitute; + +DECLARE_bool(rpc_server_allow_ephemeral_ports); + +namespace kudu { +namespace tserver { + +MiniTabletServer::MiniTabletServer(const string& fs_root, + uint16_t rpc_port) + : started_(false) { + + // Start RPC server on loopback. 
+ FLAGS_rpc_server_allow_ephemeral_ports = true; + opts_.rpc_opts.rpc_bind_addresses = Substitute("127.0.0.1:$0", rpc_port); + opts_.webserver_opts.port = 0; + opts_.fs_opts.wal_path = fs_root; + opts_.fs_opts.data_paths = { fs_root }; +} + +MiniTabletServer::~MiniTabletServer() { +} + +Status MiniTabletServer::Start() { + CHECK(!started_); + + gscoped_ptr server(new TabletServer(opts_)); + RETURN_NOT_OK(server->Init()); + RETURN_NOT_OK(server->Start()); + + server_.swap(server); + started_ = true; + return Status::OK(); +} + +Status MiniTabletServer::WaitStarted() { + return server_->WaitInited(); +} + +void MiniTabletServer::Shutdown() { + if (started_) { + server_->Shutdown(); + server_.reset(); + } + started_ = false; +} + +Status MiniTabletServer::Restart() { + CHECK(started_); + opts_.rpc_opts.rpc_bind_addresses = Substitute("127.0.0.1:$0", bound_rpc_addr().port()); + opts_.webserver_opts.port = bound_http_addr().port(); + Shutdown(); + RETURN_NOT_OK(Start()); + return Status::OK(); +} + +RaftConfigPB MiniTabletServer::CreateLocalConfig() const { + CHECK(started_) << "Must Start()"; + RaftConfigPB config; + config.set_local(true); + RaftPeerPB* peer = config.add_peers(); + peer->set_permanent_uuid(server_->instance_pb().permanent_uuid()); + peer->set_member_type(RaftPeerPB::VOTER); + peer->mutable_last_known_addr()->set_host(bound_rpc_addr().host()); + peer->mutable_last_known_addr()->set_port(bound_rpc_addr().port()); + return config; +} + +Status MiniTabletServer::AddTestTablet(const std::string& table_id, + const std::string& tablet_id, + const Schema& schema) { + return AddTestTablet(table_id, tablet_id, schema, CreateLocalConfig()); +} + +Status MiniTabletServer::AddTestTablet(const std::string& table_id, + const std::string& tablet_id, + const Schema& schema, + const RaftConfigPB& config) { + CHECK(started_) << "Must Start()"; + Schema schema_with_ids = SchemaBuilder(schema).Build(); + pair partition = tablet::CreateDefaultPartition(schema_with_ids); + 
+ return server_->tablet_manager()->CreateNewTablet( + table_id, tablet_id, partition.second, table_id, + schema_with_ids, partition.first, config, nullptr); +} + +void MiniTabletServer::FailHeartbeats() { + server_->set_fail_heartbeats_for_tests(true); +} + +const Sockaddr MiniTabletServer::bound_rpc_addr() const { + CHECK(started_); + return server_->first_rpc_address(); +} + +const Sockaddr MiniTabletServer::bound_http_addr() const { + CHECK(started_); + return server_->first_http_address(); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/mini_tablet_server.h b/src/kudu/tserver/mini_tablet_server.h new file mode 100644 index 000000000000..4c205fdb2703 --- /dev/null +++ b/src/kudu/tserver/mini_tablet_server.h @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_MINI_TABLET_SERVER_H +#define KUDU_TSERVER_MINI_TABLET_SERVER_H + +#include "kudu/common/schema.h" +#include "kudu/gutil/macros.h" +#include "kudu/tserver/tablet_server_options.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +#include + +namespace kudu { + +class FsManager; + +namespace consensus { +class RaftConfigPB; +} // namespace consensus + +namespace tserver { + +class TabletServer; + +// An in-process tablet server meant for use in test cases. +class MiniTabletServer { + public: + MiniTabletServer(const std::string& fs_root, uint16_t rpc_port); + ~MiniTabletServer(); + + // Return the options which will be used to start the tablet server. + // If you wish to make changes to these options, they need to be made + // before calling Start(), or else they will have no effect. + TabletServerOptions* options() { return &opts_; } + + // Start a tablet server running on the loopback interface and + // an ephemeral port. To determine the address that the server + // bound to, call MiniTabletServer::bound_addr(). + // The TS will be initialized asynchronously and then started. + Status Start(); + + // Waits for the tablet server to be fully initialized, including + // having all tablets bootstrapped. + Status WaitStarted(); + + void Shutdown(); + + // Restart a tablet server on the same RPC and webserver ports. + Status Restart(); + + // Add a new tablet to the test server, use the default consensus configuration. + // + // Requires that the server has already been started with Start(). + Status AddTestTablet(const std::string& table_id, + const std::string& tablet_id, + const Schema& schema); + + // Add a new tablet to the test server and specify the consensus configuration + // for the tablet. 
+ Status AddTestTablet(const std::string& table_id, + const std::string& tablet_id, + const Schema& schema, + const consensus::RaftConfigPB& config); + + // Create a RaftConfigPB which should be used to create a local-only + // tablet on the given tablet server. + consensus::RaftConfigPB CreateLocalConfig() const; + + const Sockaddr bound_rpc_addr() const; + const Sockaddr bound_http_addr() const; + + const TabletServer* server() const { return server_.get(); } + TabletServer* server() { return server_.get(); } + + bool is_started() const { return started_; } + + void FailHeartbeats(); + + private: + bool started_; + + TabletServerOptions opts_; + + gscoped_ptr fs_manager_; + gscoped_ptr server_; +}; + +} // namespace tserver +} // namespace kudu +#endif diff --git a/src/kudu/tserver/remote_bootstrap-test-base.h b/src/kudu/tserver/remote_bootstrap-test-base.h new file mode 100644 index 000000000000..8792c95c105f --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap-test-base.h @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_REMOTE_BOOTSTRAP_TEST_BASE_H_ +#define KUDU_TSERVER_REMOTE_BOOTSTRAP_TEST_BASE_H_ + +#include "kudu/tserver/tablet_server-test-base.h" + +#include + +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/block_manager.h" +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/tablet/metadata.pb.h" +#include "kudu/tserver/remote_bootstrap.pb.h" +#include "kudu/util/crc.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace tserver { + +using consensus::MinimumOpId; + +// Number of times to roll the log. +static const int kNumLogRolls = 2; + +class RemoteBootstrapTest : public TabletServerTestBase { + public: + virtual void SetUp() OVERRIDE { + TabletServerTestBase::SetUp(); + StartTabletServer(); + // Prevent logs from being deleted out from under us until / unless we want + // to test that we are anchoring correctly. Since GenerateTestData() does a + // Flush(), Log GC is allowed to eat the logs before we get around to + // starting a remote bootstrap session. + tablet_peer_->log_anchor_registry()->Register( + MinimumOpId().index(), CURRENT_TEST_NAME(), &anchor_); + ASSERT_NO_FATAL_FAILURE(GenerateTestData()); + } + + virtual void TearDown() OVERRIDE { + ASSERT_OK(tablet_peer_->log_anchor_registry()->Unregister(&anchor_)); + TabletServerTestBase::TearDown(); + } + + protected: + // Grab the first column block we find in the SuperBlock. + static BlockId FirstColumnBlockId(const tablet::TabletSuperBlockPB& superblock) { + const tablet::RowSetDataPB& rowset = superblock.rowsets(0); + const tablet::ColumnDataPB& column = rowset.columns(0); + const BlockIdPB& block_id_pb = column.block(); + return BlockId::FromPB(block_id_pb); + } + + // Check that the contents and CRC32C of a DataChunkPB are equal to a local buffer. 
+ static void AssertDataEqual(const uint8_t* local, int64_t size, const DataChunkPB& remote) { + ASSERT_EQ(size, remote.data().size()); + ASSERT_TRUE(strings::memeq(local, remote.data().data(), size)); + uint32_t crc32 = crc::Crc32c(local, size); + ASSERT_EQ(crc32, remote.crc32()); + } + + // Generate the test data for the tablet and do the flushing we assume will be + // done in the unit tests for remote bootstrap. + void GenerateTestData() { + const int kIncr = 50; + LOG_TIMING(INFO, "Loading test data") { + for (int row_id = 0; row_id < kNumLogRolls * kIncr; row_id += kIncr) { + InsertTestRowsRemote(0, row_id, kIncr); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + ASSERT_OK(tablet_peer_->log()->AllocateSegmentAndRollOver()); + } + } + } + + // Return the permananent_uuid of the local service. + const std::string GetLocalUUID() const { + return tablet_peer_->permanent_uuid(); + } + + const std::string& GetTabletId() const { + return tablet_peer_->tablet()->tablet_id(); + } + + // Read a block file from the file system fully into memory and return a + // Slice pointing to it. + Status ReadLocalBlockFile(FsManager* fs_manager, const BlockId& block_id, + faststring* scratch, Slice* slice) { + gscoped_ptr block; + RETURN_NOT_OK(fs_manager->OpenBlock(block_id, &block)); + + uint64_t size = 0; + RETURN_NOT_OK(block->Size(&size)); + scratch->resize(size); + RETURN_NOT_OK(block->Read(0, size, slice, scratch->data())); + + // Since the mmap will go away on return, copy the data into scratch. 
+ if (slice->data() != scratch->data()) { + memcpy(scratch->data(), slice->data(), slice->size()); + *slice = Slice(scratch->data(), slice->size()); + } + return Status::OK(); + } + + log::LogAnchor anchor_; +}; + +} // namespace tserver +} // namespace kudu + +#endif // KUDU_TSERVER_REMOTE_BOOTSTRAP_TEST_BASE_H_ diff --git a/src/kudu/tserver/remote_bootstrap.proto b/src/kudu/tserver/remote_bootstrap.proto new file mode 100644 index 000000000000..9fa652204dd0 --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap.proto @@ -0,0 +1,201 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.tserver; + +option java_package = "org.kududb.tserver"; + +import "kudu/common/wire_protocol.proto"; +import "kudu/consensus/metadata.proto"; +import "kudu/fs/fs.proto"; +import "kudu/rpc/rpc_header.proto"; +import "kudu/tablet/metadata.proto"; + +// RaftConfig remote bootstrap RPC calls. +service RemoteBootstrapService { + // Establish a remote bootstrap session. + rpc BeginRemoteBootstrapSession(BeginRemoteBootstrapSessionRequestPB) + returns (BeginRemoteBootstrapSessionResponsePB); + + // Check whether the specified session is active. 
+ rpc CheckSessionActive(CheckRemoteBootstrapSessionActiveRequestPB) + returns (CheckRemoteBootstrapSessionActiveResponsePB); + + // Fetch data (blocks, logs) from the server. + rpc FetchData(FetchDataRequestPB) + returns (FetchDataResponsePB); + + // End a remote bootstrap session, allow server to release resources. + rpc EndRemoteBootstrapSession(EndRemoteBootstrapSessionRequestPB) + returns (EndRemoteBootstrapSessionResponsePB); +} + +// Remote bootstrap-specific errors use this protobuf. +message RemoteBootstrapErrorPB { + extend kudu.rpc.ErrorStatusPB { + optional RemoteBootstrapErrorPB remote_bootstrap_error_ext = 102; + } + + enum Code { + // An error which has no more specific error code. + // The code and message in 'status' may reveal more details. + // + // RPCs should avoid returning this, since callers will not be + // able to easily parse the error. + UNKNOWN_ERROR = 1; + + // The specified remote bootstrap session either never existed or has expired. + NO_SESSION = 2; + + // Unknown tablet. + TABLET_NOT_FOUND = 3; + + // Unknown data block. + BLOCK_NOT_FOUND = 4; + + // Unknown WAL segment. + WAL_SEGMENT_NOT_FOUND = 5; + + // Invalid request. Possibly missing parameters. + INVALID_REMOTE_BOOTSTRAP_REQUEST = 6; + + // Error reading or transferring data. + IO_ERROR = 7; + } + + // The error code. + required Code code = 1 [ default = UNKNOWN_ERROR ]; + + // The Status object for the error. This will include a textual + // message that may be more useful to present in log messages, etc, + // though its error code is less specific. + required AppStatusPB status = 2; +} + +message BeginRemoteBootstrapSessionRequestPB { + // permanent_uuid of the requesting peer. + required bytes requestor_uuid = 1; + + // tablet_id of the tablet the requester desires to bootstrap from. + required bytes tablet_id = 2; +} + +message BeginRemoteBootstrapSessionResponsePB { + // Opaque session id assigned by the server. 
+ // No guarantees are made as to the format of the session id. + required bytes session_id = 1; + + // Maximum session idle timeout between requests. + // Learners will have to start over again if they reach this timeout. + // A value of 0 means there is no timeout. + required uint64 session_idle_timeout_millis = 2; + + // Active superblock at the time of the request. + required tablet.TabletSuperBlockPB superblock = 3; + + // Identifiers for the WAL segments available for download. + // Each WAL segment is keyed by its sequence number. + repeated uint64 wal_segment_seqnos = 4; + + // A snapshot of the committed Consensus state at the time that the + // remote bootstrap session was started. + required consensus.ConsensusStatePB initial_committed_cstate = 5; +} + +message CheckRemoteBootstrapSessionActiveRequestPB { + // Valid Session ID returned by a BeginRemoteBootstrapSession() RPC call. + required bytes session_id = 1; + + // Set keepalive to true to reset the session timeout timer. + optional bool keepalive = 2 [default = false]; +} + +message CheckRemoteBootstrapSessionActiveResponsePB { + // Whether the given session id represents an active remote bootstrap session. + required bool session_is_active = 1; +} + +// A "union" type that allows the same RPC call to fetch different types of +// data (data blocks or log files). +message DataIdPB { + enum IdType { + UNKNOWN = 0; + BLOCK = 1; + LOG_SEGMENT = 2; + } + + // Indicator whether it's a block or log segment id. + required IdType type = 1; + + // Exactly one of these must be set. + optional BlockIdPB block_id = 2; // To fetch a block. + optional uint64 wal_segment_seqno = 3; // To fetch a log segment. +} + +message FetchDataRequestPB { + // Valid Session ID returned by a BeginRemoteBootstrapSession() RPC call. + required bytes session_id = 1; + + // The server will use this ID to determine the key and type of data + // that was requested. 
+ required DataIdPB data_id = 2; + + // Offset into data to start reading from. + // If not specified, the server will send the data from offset 0. + optional uint64 offset = 3 [default = 0]; + + // Maximum length of the chunk of data to return. + // If max_length is not specified, or if the server's max is less than the + // requested max, the server will use its own max. + optional int64 max_length = 4 [default = 0]; +} + +// A chunk of data (a slice of a block, file, etc). +message DataChunkPB { + // Offset into the complete data block or file that 'data' starts at. + required uint64 offset = 1; + + // Actual bytes of data from the data block, starting at 'offset'. + required bytes data = 2; + + // CRC32C of the bytes contained in 'data'. + required fixed32 crc32 = 3; + + // Full length, in bytes, of the complete data block or file on the server. + // The number of bytes returned in 'data' can certainly be less than this. + required int64 total_data_length = 4; +} + +message FetchDataResponsePB { + // The server will automatically release the resources (i.e. close file, free + // read buffers) for a given data resource after the last byte is read. + // So, per-resource, chunks are optimized to be fetched in-order. + required DataChunkPB chunk = 1; +} + +message EndRemoteBootstrapSessionRequestPB { + required bytes session_id = 1; + + // Set to true if bootstrap is successful. + required bool is_success = 2; + + // Client-provided error message. The server will log this error so that an + // admin can identify when bad things are happening with remote bootstrap. 
+ optional AppStatusPB error = 3; +} + +message EndRemoteBootstrapSessionResponsePB { +} diff --git a/src/kudu/tserver/remote_bootstrap_client-test.cc b/src/kudu/tserver/remote_bootstrap_client-test.cc new file mode 100644 index 000000000000..bccff45811af --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_client-test.cc @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/remote_bootstrap-test-base.h" + +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tserver/remote_bootstrap_client.h" +#include "kudu/util/env_util.h" + +using std::shared_ptr; + +namespace kudu { +namespace tserver { + +using consensus::GetRaftConfigLeader; +using consensus::RaftPeerPB; +using tablet::TabletMetadata; +using tablet::TabletStatusListener; + +class RemoteBootstrapClientTest : public RemoteBootstrapTest { + public: + virtual void SetUp() OVERRIDE { + RemoteBootstrapTest::SetUp(); + + fs_manager_.reset(new FsManager(Env::Default(), GetTestPath("client_tablet"))); + ASSERT_OK(fs_manager_->CreateInitialFileSystemLayout()); + ASSERT_OK(fs_manager_->Open()); + + tablet_peer_->WaitUntilConsensusRunning(MonoDelta::FromSeconds(10.0)); + rpc::MessengerBuilder(CURRENT_TEST_NAME()).Build(&messenger_); + client_.reset(new RemoteBootstrapClient(GetTabletId(), + fs_manager_.get(), + messenger_, + fs_manager_->uuid())); + ASSERT_OK(GetRaftConfigLeader(tablet_peer_->consensus() + ->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED), &leader_)); + + HostPort host_port; + HostPortFromPB(leader_.last_known_addr(), &host_port); + ASSERT_OK(client_->Start(leader_.permanent_uuid(), host_port, &meta_)); + } + + protected: + Status CompareFileContents(const string& path1, const string& path2); + + gscoped_ptr fs_manager_; + shared_ptr messenger_; + gscoped_ptr client_; + scoped_refptr meta_; + RaftPeerPB leader_; +}; + +Status RemoteBootstrapClientTest::CompareFileContents(const string& path1, const string& path2) { + shared_ptr file1, file2; + RETURN_NOT_OK(env_util::OpenFileForRandom(fs_manager_->env(), path1, &file1)); + RETURN_NOT_OK(env_util::OpenFileForRandom(fs_manager_->env(), path2, &file2)); + + uint64_t size1, size2; + RETURN_NOT_OK(file1->Size(&size1)); + RETURN_NOT_OK(file2->Size(&size2)); + if (size1 != size2) { + return Status::Corruption("Sizes 
of files don't match", + strings::Substitute("$0 vs $1 bytes", size1, size2)); + } + + Slice slice1, slice2; + faststring scratch1, scratch2; + scratch1.resize(size1); + scratch2.resize(size2); + RETURN_NOT_OK(env_util::ReadFully(file1.get(), 0, size1, &slice1, scratch1.data())); + RETURN_NOT_OK(env_util::ReadFully(file2.get(), 0, size2, &slice2, scratch2.data())); + int result = strings::fastmemcmp_inlined(slice1.data(), slice2.data(), size1); + if (result != 0) { + return Status::Corruption("Files do not match"); + } + return Status::OK(); +} + +// Basic begin / end remote bootstrap session. +TEST_F(RemoteBootstrapClientTest, TestBeginEndSession) { + TabletStatusListener listener(meta_); + ASSERT_OK(client_->FetchAll(&listener)); + ASSERT_OK(client_->Finish()); +} + +// Basic data block download unit test. +TEST_F(RemoteBootstrapClientTest, TestDownloadBlock) { + TabletStatusListener listener(meta_); + BlockId block_id = FirstColumnBlockId(*client_->superblock_); + Slice slice; + faststring scratch; + + // Ensure the block wasn't there before (it shouldn't be, we use our own FsManager dir). + Status s; + s = ReadLocalBlockFile(fs_manager_.get(), block_id, &scratch, &slice); + ASSERT_TRUE(s.IsNotFound()) << "Expected block not found: " << s.ToString(); + + // Check that the client downloaded the block and verification passed. + BlockId new_block_id; + ASSERT_OK(client_->DownloadBlock(block_id, &new_block_id)); + + // Ensure it placed the block where we expected it to. + s = ReadLocalBlockFile(fs_manager_.get(), block_id, &scratch, &slice); + ASSERT_TRUE(s.IsNotFound()) << "Expected block not found: " << s.ToString(); + ASSERT_OK(ReadLocalBlockFile(fs_manager_.get(), new_block_id, &scratch, &slice)); +} + +// Basic WAL segment download unit test. 
+TEST_F(RemoteBootstrapClientTest, TestDownloadWalSegment) { + ASSERT_OK(fs_manager_->CreateDirIfMissing(fs_manager_->GetTabletWalDir(GetTabletId()))); + + uint64_t seqno = client_->wal_seqnos_[0]; + string path = fs_manager_->GetWalSegmentFileName(GetTabletId(), seqno); + + ASSERT_FALSE(fs_manager_->Exists(path)); + ASSERT_OK(client_->DownloadWAL(seqno)); + ASSERT_TRUE(fs_manager_->Exists(path)); + + log::SegmentSequence local_segments; + ASSERT_OK(tablet_peer_->log()->GetLogReader()->GetSegmentsSnapshot(&local_segments)); + const scoped_refptr& segment = local_segments[0]; + string server_path = segment->path(); + + // Compare the downloaded file with the source file. + ASSERT_OK(CompareFileContents(path, server_path)); +} + +// Ensure that we detect data corruption at the per-transfer level. +TEST_F(RemoteBootstrapClientTest, TestVerifyData) { + string good = "This is a known good string"; + string bad = "This is a known bad! string"; + const int kGoodOffset = 0; + const int kBadOffset = 1; + const int64_t kDataTotalLen = std::numeric_limits::max(); // Ignored. + + // Create a known-good PB. + DataChunkPB valid_chunk; + valid_chunk.set_offset(0); + valid_chunk.set_data(good); + valid_chunk.set_crc32(crc::Crc32c(good.data(), good.length())); + valid_chunk.set_total_data_length(kDataTotalLen); + + // Make sure we work on the happy case. + ASSERT_OK(client_->VerifyData(kGoodOffset, valid_chunk)); + + // Test unexpected offset. + DataChunkPB bad_offset = valid_chunk; + bad_offset.set_offset(kBadOffset); + Status s; + s = client_->VerifyData(kGoodOffset, bad_offset); + ASSERT_TRUE(s.IsInvalidArgument()) << "Bad offset expected: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Offset did not match"); + LOG(INFO) << "Expected error returned: " << s.ToString(); + + // Test bad checksum. 
+ DataChunkPB bad_checksum = valid_chunk; + bad_checksum.set_data(bad); + s = client_->VerifyData(kGoodOffset, bad_checksum); + ASSERT_TRUE(s.IsCorruption()) << "Invalid checksum expected: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "CRC32 does not match"); + LOG(INFO) << "Expected error returned: " << s.ToString(); +} + +namespace { + +vector GetAllSortedBlocks(const tablet::TabletSuperBlockPB& sb) { + vector data_blocks; + + for (const tablet::RowSetDataPB& rowset : sb.rowsets()) { + for (const tablet::DeltaDataPB& redo : rowset.redo_deltas()) { + data_blocks.push_back(BlockId::FromPB(redo.block())); + } + for (const tablet::DeltaDataPB& undo : rowset.undo_deltas()) { + data_blocks.push_back(BlockId::FromPB(undo.block())); + } + for (const tablet::ColumnDataPB& column : rowset.columns()) { + data_blocks.push_back(BlockId::FromPB(column.block())); + } + if (rowset.has_bloom_block()) { + data_blocks.push_back(BlockId::FromPB(rowset.bloom_block())); + } + if (rowset.has_adhoc_index_block()) { + data_blocks.push_back(BlockId::FromPB(rowset.adhoc_index_block())); + } + } + + std::sort(data_blocks.begin(), data_blocks.end(), BlockIdCompare()); + return data_blocks; +} + +} // anonymous namespace + +TEST_F(RemoteBootstrapClientTest, TestDownloadAllBlocks) { + // Download all the blocks. + ASSERT_OK(client_->DownloadBlocks()); + + // Verify that the new superblock reflects the changes in block IDs. + // + // As long as block IDs are generated with UUIDs or something equally + // unique, there's no danger of a block in the new superblock somehow + // being assigned the same ID as a block in the existing superblock. 
+ vector old_data_blocks = GetAllSortedBlocks(*client_->superblock_.get()); + vector new_data_blocks = GetAllSortedBlocks(*client_->new_superblock_.get()); + vector result; + std::set_intersection(old_data_blocks.begin(), old_data_blocks.end(), + new_data_blocks.begin(), new_data_blocks.end(), + std::back_inserter(result), BlockIdCompare()); + ASSERT_TRUE(result.empty()); + ASSERT_EQ(old_data_blocks.size(), new_data_blocks.size()); + + // Verify that the old blocks aren't found. We're using a different + // FsManager than 'tablet_peer', so the only way an old block could end + // up in ours is due to a remote bootstrap client bug. + for (const BlockId& block_id : old_data_blocks) { + gscoped_ptr block; + Status s = fs_manager_->OpenBlock(block_id, &block); + ASSERT_TRUE(s.IsNotFound()) << "Expected block not found: " << s.ToString(); + } + // And the new blocks are all present. + for (const BlockId& block_id : new_data_blocks) { + gscoped_ptr block; + ASSERT_OK(fs_manager_->OpenBlock(block_id, &block)); + } +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/remote_bootstrap_client.cc b/src/kudu/tserver/remote_bootstrap_client.cc new file mode 100644 index 000000000000..bac717eaf56b --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_client.cc @@ -0,0 +1,552 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tserver/remote_bootstrap_client.h" + +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/block_manager.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/walltime.h" +#include "kudu/rpc/messenger.h" +#include "kudu/rpc/transfer.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/remote_bootstrap.pb.h" +#include "kudu/tserver/remote_bootstrap.proxy.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/crc.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/net/net_util.h" + +DEFINE_int32(remote_bootstrap_begin_session_timeout_ms, 3000, + "Tablet server RPC client timeout for BeginRemoteBootstrapSession calls. " + "Also used for EndRemoteBootstrapSession calls."); +TAG_FLAG(remote_bootstrap_begin_session_timeout_ms, hidden); + +DEFINE_bool(remote_bootstrap_save_downloaded_metadata, false, + "Save copies of the downloaded remote bootstrap files for debugging purposes. 
" + "Note: This is only intended for debugging and should not be normally used!"); +TAG_FLAG(remote_bootstrap_save_downloaded_metadata, advanced); +TAG_FLAG(remote_bootstrap_save_downloaded_metadata, hidden); +TAG_FLAG(remote_bootstrap_save_downloaded_metadata, runtime); + +// RETURN_NOT_OK_PREPEND() with a remote-error unwinding step. +#define RETURN_NOT_OK_UNWIND_PREPEND(status, controller, msg) \ + RETURN_NOT_OK_PREPEND(UnwindRemoteError(status, controller), msg) + +namespace kudu { +namespace tserver { + +using consensus::ConsensusMetadata; +using consensus::ConsensusStatePB; +using consensus::OpId; +using consensus::RaftConfigPB; +using consensus::RaftPeerPB; +using env_util::CopyFile; +using fs::WritableBlock; +using rpc::Messenger; +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; +using tablet::ColumnDataPB; +using tablet::DeltaDataPB; +using tablet::RowSetDataPB; +using tablet::TabletDataState; +using tablet::TabletDataState_Name; +using tablet::TabletMetadata; +using tablet::TabletStatusListener; +using tablet::TabletSuperBlockPB; + +RemoteBootstrapClient::RemoteBootstrapClient(std::string tablet_id, + FsManager* fs_manager, + shared_ptr messenger, + string client_permanent_uuid) + : tablet_id_(std::move(tablet_id)), + fs_manager_(fs_manager), + messenger_(std::move(messenger)), + permanent_uuid_(std::move(client_permanent_uuid)), + started_(false), + downloaded_wal_(false), + downloaded_blocks_(false), + replace_tombstoned_tablet_(false), + status_listener_(nullptr), + session_idle_timeout_millis_(0), + start_time_micros_(0) {} + +RemoteBootstrapClient::~RemoteBootstrapClient() { + // Note: Ending the remote bootstrap session releases anchors on the remote. 
+ WARN_NOT_OK(EndRemoteSession(), "Unable to close remote bootstrap session"); +} + +Status RemoteBootstrapClient::SetTabletToReplace(const scoped_refptr& meta, + int64_t caller_term) { + CHECK_EQ(tablet_id_, meta->tablet_id()); + TabletDataState data_state = meta->tablet_data_state(); + if (data_state != tablet::TABLET_DATA_TOMBSTONED) { + return Status::IllegalState(Substitute("Tablet $0 not in tombstoned state: $1 ($2)", + tablet_id_, + TabletDataState_Name(data_state), + data_state)); + } + + replace_tombstoned_tablet_ = true; + meta_ = meta; + + int64_t last_logged_term = meta->tombstone_last_logged_opid().term(); + if (last_logged_term > caller_term) { + return Status::InvalidArgument( + Substitute("Leader has term $0 but the last log entry written by the tombstoned replica " + "for tablet $1 has higher term $2. Refusing remote bootstrap from leader", + caller_term, tablet_id_, last_logged_term)); + } + + // Load the old consensus metadata, if it exists. + gscoped_ptr cmeta; + Status s = ConsensusMetadata::Load(fs_manager_, tablet_id_, + fs_manager_->uuid(), &cmeta); + if (s.IsNotFound()) { + // The consensus metadata was not written to disk, possibly due to a failed + // remote bootstrap. + return Status::OK(); + } + RETURN_NOT_OK(s); + cmeta_.swap(cmeta); + return Status::OK(); +} + +Status RemoteBootstrapClient::Start(const string& bootstrap_peer_uuid, + const HostPort& bootstrap_peer_addr, + scoped_refptr* meta) { + CHECK(!started_); + start_time_micros_ = GetCurrentTimeMicros(); + + Sockaddr addr; + RETURN_NOT_OK(SockaddrFromHostPort(bootstrap_peer_addr, &addr)); + if (addr.IsWildcard()) { + return Status::InvalidArgument("Invalid wildcard address to remote bootstrap from", + Substitute("$0 (resolved to $1)", + bootstrap_peer_addr.host(), addr.host())); + } + LOG_WITH_PREFIX(INFO) << "Beginning remote bootstrap session" + << " from remote peer at address " << bootstrap_peer_addr.ToString(); + + // Set up an RPC proxy for the RemoteBootstrapService. 
+ proxy_.reset(new RemoteBootstrapServiceProxy(messenger_, addr)); + + BeginRemoteBootstrapSessionRequestPB req; + req.set_requestor_uuid(permanent_uuid_); + req.set_tablet_id(tablet_id_); + + rpc::RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds( + FLAGS_remote_bootstrap_begin_session_timeout_ms)); + + // Begin the remote bootstrap session with the remote peer. + BeginRemoteBootstrapSessionResponsePB resp; + RETURN_NOT_OK_UNWIND_PREPEND(proxy_->BeginRemoteBootstrapSession(req, &resp, &controller), + controller, + "Unable to begin remote bootstrap session"); + + if (resp.superblock().tablet_data_state() != tablet::TABLET_DATA_READY) { + Status s = Status::IllegalState("Remote peer (" + bootstrap_peer_uuid + ")" + + " is currently remotely bootstrapping itself!", + resp.superblock().ShortDebugString()); + LOG_WITH_PREFIX(WARNING) << s.ToString(); + return s; + } + + session_id_ = resp.session_id(); + session_idle_timeout_millis_ = resp.session_idle_timeout_millis(); + superblock_.reset(resp.release_superblock()); + superblock_->set_tablet_data_state(tablet::TABLET_DATA_COPYING); + wal_seqnos_.assign(resp.wal_segment_seqnos().begin(), resp.wal_segment_seqnos().end()); + remote_committed_cstate_.reset(resp.release_initial_committed_cstate()); + + Schema schema; + RETURN_NOT_OK_PREPEND(SchemaFromPB(superblock_->schema(), &schema), + "Cannot deserialize schema from remote superblock"); + + if (replace_tombstoned_tablet_) { + // Also validate the term of the bootstrap source peer, in case they are + // different. This is a sanity check that protects us in case a bug or + // misconfiguration causes us to attempt to bootstrap from an out-of-date + // source peer, even after passing the term check from the caller in + // SetTabletToReplace(). 
+ int64_t last_logged_term = meta_->tombstone_last_logged_opid().term(); + if (last_logged_term > remote_committed_cstate_->current_term()) { + return Status::InvalidArgument( + Substitute("Tablet $0: Bootstrap source has term $1 but " + "tombstoned replica has last-logged opid with higher term $2. " + "Refusing remote bootstrap from source peer $3", + tablet_id_, + remote_committed_cstate_->current_term(), + last_logged_term, + bootstrap_peer_uuid)); + } + + // This will flush to disk, but we set the data state to COPYING above. + RETURN_NOT_OK_PREPEND(meta_->ReplaceSuperBlock(*superblock_), + "Remote bootstrap unable to replace superblock on tablet " + + tablet_id_); + } else { + + Partition partition; + Partition::FromPB(superblock_->partition(), &partition); + PartitionSchema partition_schema; + RETURN_NOT_OK(PartitionSchema::FromPB(superblock_->partition_schema(), + schema, &partition_schema)); + + // Create the superblock on disk. + RETURN_NOT_OK(TabletMetadata::CreateNew(fs_manager_, tablet_id_, + superblock_->table_name(), + schema, + partition_schema, + partition, + tablet::TABLET_DATA_COPYING, + &meta_)); + } + + started_ = true; + if (meta) { + *meta = meta_; + } + return Status::OK(); +} + +Status RemoteBootstrapClient::FetchAll(TabletStatusListener* status_listener) { + CHECK(started_); + status_listener_ = CHECK_NOTNULL(status_listener); + + // Download all the files (serially, for now, but in parallel in the future). + RETURN_NOT_OK(DownloadBlocks()); + RETURN_NOT_OK(DownloadWALs()); + + return Status::OK(); +} + +Status RemoteBootstrapClient::Finish() { + CHECK(meta_); + CHECK(started_); + CHECK(downloaded_wal_); + CHECK(downloaded_blocks_); + + RETURN_NOT_OK(WriteConsensusMetadata()); + + // Replace tablet metadata superblock. This will set the tablet metadata state + // to TABLET_DATA_READY, since we checked above that the response + // superblock is in a valid state to bootstrap from. + LOG_WITH_PREFIX(INFO) << "Remote bootstrap complete. 
Replacing tablet superblock."; + UpdateStatusMessage("Replacing tablet superblock"); + new_superblock_->set_tablet_data_state(tablet::TABLET_DATA_READY); + RETURN_NOT_OK(meta_->ReplaceSuperBlock(*new_superblock_)); + + if (FLAGS_remote_bootstrap_save_downloaded_metadata) { + string meta_path = fs_manager_->GetTabletMetadataPath(tablet_id_); + string meta_copy_path = Substitute("$0.copy.$1.tmp", meta_path, start_time_micros_); + RETURN_NOT_OK_PREPEND(CopyFile(Env::Default(), meta_path, meta_copy_path, + WritableFileOptions()), + "Unable to make copy of tablet metadata"); + } + + return Status::OK(); +} + +// Decode the remote error into a human-readable Status object. +Status RemoteBootstrapClient::ExtractRemoteError(const rpc::ErrorStatusPB& remote_error) { + if (PREDICT_TRUE(remote_error.HasExtension(RemoteBootstrapErrorPB::remote_bootstrap_error_ext))) { + const RemoteBootstrapErrorPB& error = + remote_error.GetExtension(RemoteBootstrapErrorPB::remote_bootstrap_error_ext); + return StatusFromPB(error.status()).CloneAndPrepend("Received error code " + + RemoteBootstrapErrorPB::Code_Name(error.code()) + " from remote service"); + } else { + return Status::InvalidArgument("Unable to decode remote bootstrap RPC error message", + remote_error.ShortDebugString()); + } +} + +// Enhance a RemoteError Status message with additional details from the remote. 
+Status RemoteBootstrapClient::UnwindRemoteError(const Status& status, + const rpc::RpcController& controller) { + if (!status.IsRemoteError()) { + return status; + } + Status extension_status = ExtractRemoteError(*controller.error_response()); + return status.CloneAndAppend(extension_status.ToString()); +} + +void RemoteBootstrapClient::UpdateStatusMessage(const string& message) { + if (status_listener_ != nullptr) { + status_listener_->StatusMessage("RemoteBootstrap: " + message); + } +} + +Status RemoteBootstrapClient::EndRemoteSession() { + if (!started_) { + return Status::OK(); + } + + rpc::RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds( + FLAGS_remote_bootstrap_begin_session_timeout_ms)); + + EndRemoteBootstrapSessionRequestPB req; + req.set_session_id(session_id_); + req.set_is_success(true); + EndRemoteBootstrapSessionResponsePB resp; + RETURN_NOT_OK_UNWIND_PREPEND(proxy_->EndRemoteBootstrapSession(req, &resp, &controller), + controller, + "Failure ending remote bootstrap session"); + + return Status::OK(); +} + +Status RemoteBootstrapClient::DownloadWALs() { + CHECK(started_); + + // Delete and recreate WAL dir if it already exists, to ensure stray files are + // not kept from previous bootstraps and runs. + string path = fs_manager_->GetTabletWalDir(tablet_id_); + if (fs_manager_->env()->FileExists(path)) { + RETURN_NOT_OK(fs_manager_->env()->DeleteRecursively(path)); + } + RETURN_NOT_OK(fs_manager_->env()->CreateDir(path)); + RETURN_NOT_OK(fs_manager_->env()->SyncDir(DirName(path))); // fsync() parent dir. + + // Download the WAL segments. + int num_segments = wal_seqnos_.size(); + LOG_WITH_PREFIX(INFO) << "Starting download of " << num_segments << " WAL segments..."; + uint64_t counter = 0; + for (uint64_t seg_seqno : wal_seqnos_) { + UpdateStatusMessage(Substitute("Downloading WAL segment with seq. 
number $0 ($1/$2)", + seg_seqno, counter + 1, num_segments)); + RETURN_NOT_OK(DownloadWAL(seg_seqno)); + ++counter; + } + + downloaded_wal_ = true; + return Status::OK(); +} + +Status RemoteBootstrapClient::DownloadBlocks() { + CHECK(started_); + + // Count up the total number of blocks to download. + int num_blocks = 0; + for (const RowSetDataPB& rowset : superblock_->rowsets()) { + num_blocks += rowset.columns_size(); + num_blocks += rowset.redo_deltas_size(); + num_blocks += rowset.undo_deltas_size(); + if (rowset.has_bloom_block()) { + num_blocks++; + } + if (rowset.has_adhoc_index_block()) { + num_blocks++; + } + } + + // Download each block, writing the new block IDs into the new superblock + // as each block downloads. + gscoped_ptr new_sb(new TabletSuperBlockPB()); + new_sb->CopyFrom(*superblock_); + int block_count = 0; + LOG_WITH_PREFIX(INFO) << "Starting download of " << num_blocks << " data blocks..."; + for (RowSetDataPB& rowset : *new_sb->mutable_rowsets()) { + for (ColumnDataPB& col : *rowset.mutable_columns()) { + RETURN_NOT_OK(DownloadAndRewriteBlock(col.mutable_block(), + &block_count, num_blocks)); + } + for (DeltaDataPB& redo : *rowset.mutable_redo_deltas()) { + RETURN_NOT_OK(DownloadAndRewriteBlock(redo.mutable_block(), + &block_count, num_blocks)); + } + for (DeltaDataPB& undo : *rowset.mutable_undo_deltas()) { + RETURN_NOT_OK(DownloadAndRewriteBlock(undo.mutable_block(), + &block_count, num_blocks)); + } + if (rowset.has_bloom_block()) { + RETURN_NOT_OK(DownloadAndRewriteBlock(rowset.mutable_bloom_block(), + &block_count, num_blocks)); + } + if (rowset.has_adhoc_index_block()) { + RETURN_NOT_OK(DownloadAndRewriteBlock(rowset.mutable_adhoc_index_block(), + &block_count, num_blocks)); + } + } + + // The orphaned physical block ids at the remote have no meaning to us. 
+ new_sb->clear_orphaned_blocks(); + new_superblock_.swap(new_sb); + + downloaded_blocks_ = true; + + return Status::OK(); +} + +Status RemoteBootstrapClient::DownloadWAL(uint64_t wal_segment_seqno) { + VLOG_WITH_PREFIX(1) << "Downloading WAL segment with seqno " << wal_segment_seqno; + DataIdPB data_id; + data_id.set_type(DataIdPB::LOG_SEGMENT); + data_id.set_wal_segment_seqno(wal_segment_seqno); + string dest_path = fs_manager_->GetWalSegmentFileName(tablet_id_, wal_segment_seqno); + + WritableFileOptions opts; + opts.sync_on_close = true; + gscoped_ptr writer; + RETURN_NOT_OK_PREPEND(fs_manager_->env()->NewWritableFile(opts, dest_path, &writer), + "Unable to open file for writing"); + RETURN_NOT_OK_PREPEND(DownloadFile(data_id, writer.get()), + Substitute("Unable to download WAL segment with seq. number $0", + wal_segment_seqno)); + return Status::OK(); +} + +Status RemoteBootstrapClient::WriteConsensusMetadata() { + // If we didn't find a previous consensus meta file, create one. + if (!cmeta_) { + gscoped_ptr cmeta; + return ConsensusMetadata::Create(fs_manager_, tablet_id_, fs_manager_->uuid(), + remote_committed_cstate_->config(), + remote_committed_cstate_->current_term(), + &cmeta); + } + + // Otherwise, update the consensus metadata to reflect the config and term + // sent by the remote bootstrap source. 
+ cmeta_->MergeCommittedConsensusStatePB(*remote_committed_cstate_); + RETURN_NOT_OK(cmeta_->Flush()); + + if (FLAGS_remote_bootstrap_save_downloaded_metadata) { + string cmeta_path = fs_manager_->GetConsensusMetadataPath(tablet_id_); + string cmeta_copy_path = Substitute("$0.copy.$1.tmp", cmeta_path, start_time_micros_); + RETURN_NOT_OK_PREPEND(CopyFile(Env::Default(), cmeta_path, cmeta_copy_path, + WritableFileOptions()), + "Unable to make copy of consensus metadata"); + } + + return Status::OK(); +} + +Status RemoteBootstrapClient::DownloadAndRewriteBlock(BlockIdPB* block_id, + int* block_count, int num_blocks) { + BlockId old_block_id(BlockId::FromPB(*block_id)); + UpdateStatusMessage(Substitute("Downloading block $0 ($1/$2)", + old_block_id.ToString(), *block_count, + num_blocks)); + BlockId new_block_id; + RETURN_NOT_OK_PREPEND(DownloadBlock(old_block_id, &new_block_id), + "Unable to download block with id " + old_block_id.ToString()); + + new_block_id.CopyToPB(block_id); + (*block_count)++; + return Status::OK(); +} + +Status RemoteBootstrapClient::DownloadBlock(const BlockId& old_block_id, + BlockId* new_block_id) { + VLOG_WITH_PREFIX(1) << "Downloading block with block_id " << old_block_id.ToString(); + + gscoped_ptr block; + RETURN_NOT_OK_PREPEND(fs_manager_->CreateNewBlock(&block), + "Unable to create new block"); + + DataIdPB data_id; + data_id.set_type(DataIdPB::BLOCK); + old_block_id.CopyToPB(data_id.mutable_block_id()); + RETURN_NOT_OK_PREPEND(DownloadFile(data_id, block.get()), + Substitute("Unable to download block $0", + old_block_id.ToString())); + + *new_block_id = block->id(); + RETURN_NOT_OK_PREPEND(block->Close(), "Unable to close block"); + return Status::OK(); +} + +template +Status RemoteBootstrapClient::DownloadFile(const DataIdPB& data_id, + Appendable* appendable) { + uint64_t offset = 0; + int32_t max_length = FLAGS_rpc_max_message_size - 1024; // Leave 1K for message headers. 
+ + rpc::RpcController controller; + controller.set_timeout(MonoDelta::FromMilliseconds(session_idle_timeout_millis_)); + FetchDataRequestPB req; + + bool done = false; + while (!done) { + controller.Reset(); + req.set_session_id(session_id_); + req.mutable_data_id()->CopyFrom(data_id); + req.set_offset(offset); + req.set_max_length(max_length); + + FetchDataResponsePB resp; + RETURN_NOT_OK_UNWIND_PREPEND(proxy_->FetchData(req, &resp, &controller), + controller, + "Unable to fetch data from remote"); + + // Sanity-check for corruption. + RETURN_NOT_OK_PREPEND(VerifyData(offset, resp.chunk()), + Substitute("Error validating data item $0", data_id.ShortDebugString())); + + // Write the data. + RETURN_NOT_OK(appendable->Append(resp.chunk().data())); + + if (offset + resp.chunk().data().size() == resp.chunk().total_data_length()) { + done = true; + } + offset += resp.chunk().data().size(); + } + + return Status::OK(); +} + +Status RemoteBootstrapClient::VerifyData(uint64_t offset, const DataChunkPB& chunk) { + // Verify the offset is what we expected. + if (offset != chunk.offset()) { + return Status::InvalidArgument("Offset did not match what was asked for", + Substitute("$0 vs $1", offset, chunk.offset())); + } + + // Verify the checksum. 
+ uint32_t crc32 = crc::Crc32c(chunk.data().data(), chunk.data().length()); + if (PREDICT_FALSE(crc32 != chunk.crc32())) { + return Status::Corruption( + Substitute("CRC32 does not match at offset $0 size $1: $2 vs $3", + offset, chunk.data().size(), crc32, chunk.crc32())); + } + return Status::OK(); +} + +string RemoteBootstrapClient::LogPrefix() { + return Substitute("T $0 P $1: Remote bootstrap client: ", tablet_id_, permanent_uuid_); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/remote_bootstrap_client.h b/src/kudu/tserver/remote_bootstrap_client.h new file mode 100644 index 000000000000..df95038fbf92 --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_client.h @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_REMOTE_BOOTSTRAP_CLIENT_H +#define KUDU_TSERVER_REMOTE_BOOTSTRAP_CLIENT_H + +#include +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" + +namespace kudu { + +class BlockId; +class BlockIdPB; +class FsManager; +class HostPort; + +namespace consensus { +class ConsensusMetadata; +class ConsensusStatePB; +class RaftConfigPB; +class RaftPeerPB; +} // namespace consensus + +namespace rpc { +class ErrorStatusPB; +class Messenger; +class RpcController; +} // namespace rpc + +namespace tablet { +class TabletMetadata; +class TabletPeer; +class TabletStatusListener; +class TabletSuperBlockPB; +} // namespace tablet + +namespace tserver { +class DataIdPB; +class DataChunkPB; +class RemoteBootstrapServiceProxy; + +// Client class for using remote bootstrap to copy a tablet from another host. +// This class is not thread-safe. +// +// TODO: +// * Parallelize download of blocks and WAL segments. +// +class RemoteBootstrapClient { + public: + + // Construct the remote bootstrap client. + // 'fs_manager' and 'messenger' must remain valid until this object is destroyed. + // 'client_permanent_uuid' is the permanent UUID of the caller server. + RemoteBootstrapClient(std::string tablet_id, FsManager* fs_manager, + std::shared_ptr messenger, + std::string client_permanent_uuid); + + // Attempt to clean up resources on the remote end by sending an + // EndRemoteBootstrapSession() RPC + ~RemoteBootstrapClient(); + + // Pass in the existing metadata for a tombstoned tablet, which will be + // replaced if validation checks pass in Start(). + // 'meta' is the metadata for the tombstoned tablet and 'caller_term' is the + // term provided by the caller (assumed to be the current leader of the + // consensus config) for validation purposes. 
+ // If the consensus metadata exists on disk for this tablet, and if + // 'caller_term' is lower than the current term stored in that consensus + // metadata, then this method will fail with a Status::InvalidArgument error. + Status SetTabletToReplace(const scoped_refptr& meta, + int64_t caller_term); + + // Start up a remote bootstrap session to bootstrap from the specified + // bootstrap peer. Place a new superblock indicating that remote bootstrap is + // in progress. If the 'metadata' pointer is passed as NULL, it is ignored, + // otherwise the TabletMetadata object resulting from the initial remote + // bootstrap response is returned. + // TODO: Rename these parameters to bootstrap_source_*. + Status Start(const std::string& bootstrap_peer_uuid, + const HostPort& bootstrap_peer_addr, + scoped_refptr* metadata); + + // Runs a "full" remote bootstrap, copying the physical layout of a tablet + // from the leader of the specified consensus configuration. + Status FetchAll(tablet::TabletStatusListener* status_listener); + + // After downloading all files successfully, write out the completed + // replacement superblock. + Status Finish(); + + private: + FRIEND_TEST(RemoteBootstrapClientTest, TestBeginEndSession); + FRIEND_TEST(RemoteBootstrapClientTest, TestDownloadBlock); + FRIEND_TEST(RemoteBootstrapClientTest, TestVerifyData); + FRIEND_TEST(RemoteBootstrapClientTest, TestDownloadWalSegment); + FRIEND_TEST(RemoteBootstrapClientTest, TestDownloadAllBlocks); + + // Extract the embedded Status message from the given ErrorStatusPB. + // The given ErrorStatusPB must extend RemoteBootstrapErrorPB. + static Status ExtractRemoteError(const rpc::ErrorStatusPB& remote_error); + + static Status UnwindRemoteError(const Status& status, const rpc::RpcController& controller); + + // Update the bootstrap StatusListener with a message. + // The string "RemoteBootstrap: " will be prepended to each message. 
+ void UpdateStatusMessage(const std::string& message); + + // End the remote bootstrap session. + Status EndRemoteSession(); + + // Download all WAL files sequentially. + Status DownloadWALs(); + + // Download a single WAL file. + // Assumes the WAL directories have already been created. + // WAL file is opened with options so that it will fsync() on close. + Status DownloadWAL(uint64_t wal_segment_seqno); + + // Write out the Consensus Metadata file based on the ConsensusStatePB + // downloaded as part of initiating the remote bootstrap session. + Status WriteConsensusMetadata(); + + // Download all blocks belonging to a tablet sequentially. + // + // Blocks are given new IDs upon creation. On success, 'new_superblock_' + // is populated to reflect the new block IDs and should be used in lieu + // of 'superblock_' henceforth. + Status DownloadBlocks(); + + // Download the block specified by 'block_id'. + // + // On success: + // - 'block_id' is set to the new ID of the downloaded block. + // - 'block_count' is incremented. + Status DownloadAndRewriteBlock(BlockIdPB* block_id, int* block_count, int num_blocks); + + // Download a single block. + // Data block is opened with options so that it will fsync() on close. + // + // On success, 'new_block_id' is set to the new ID of the downloaded block. + Status DownloadBlock(const BlockId& old_block_id, BlockId* new_block_id); + + // Download a single remote file. The block and WAL implementations delegate + // to this method when downloading files. + // + // An Appendable is typically a WritableBlock (block) or WritableFile (WAL). + // + // Only used in one compilation unit, otherwise the implementation would + // need to be in the header. + template + Status DownloadFile(const DataIdPB& data_id, Appendable* appendable); + + Status VerifyData(uint64_t offset, const DataChunkPB& resp); + + // Return standard log prefix. + std::string LogPrefix(); + + // Set-once members. 
+ const std::string tablet_id_; + FsManager* const fs_manager_; + const std::shared_ptr messenger_; + const std::string permanent_uuid_; + + // State flags that enforce the progress of remote bootstrap. + bool started_; // Session started. + bool downloaded_wal_; // WAL segments downloaded. + bool downloaded_blocks_; // Data blocks downloaded. + + // Session-specific data items. + bool replace_tombstoned_tablet_; + + // Local tablet metadata file. + scoped_refptr meta_; + + // Local Consensus metadata file. This may initially be NULL if this is + // bootstrapping a new replica (rather than replacing an old one). + gscoped_ptr cmeta_; + + tablet::TabletStatusListener* status_listener_; + std::shared_ptr proxy_; + std::string session_id_; + uint64_t session_idle_timeout_millis_; + gscoped_ptr superblock_; + gscoped_ptr new_superblock_; + gscoped_ptr remote_committed_cstate_; + std::vector wal_seqnos_; + int64_t start_time_micros_; + + DISALLOW_COPY_AND_ASSIGN(RemoteBootstrapClient); +}; + +} // namespace tserver +} // namespace kudu +#endif /* KUDU_TSERVER_REMOTE_BOOTSTRAP_CLIENT_H */ diff --git a/src/kudu/tserver/remote_bootstrap_service-test.cc b/src/kudu/tserver/remote_bootstrap_service-test.cc new file mode 100644 index 000000000000..0ab2c1b0cbe4 --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_service-test.cc @@ -0,0 +1,459 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/tserver/remote_bootstrap-test-base.h" + +#include +#include + +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/rpc/rpc_header.pb.h" +#include "kudu/rpc/transfer.h" +#include "kudu/tserver/remote_bootstrap.pb.h" +#include "kudu/tserver/tserver_service.pb.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/util/crc.h" +#include "kudu/util/env_util.h" +#include "kudu/util/monotime.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +#define ASSERT_REMOTE_ERROR(status, err, code, str) \ + ASSERT_NO_FATAL_FAILURE(AssertRemoteError(status, err, code, str)) + +DECLARE_uint64(remote_bootstrap_idle_timeout_ms); +DECLARE_uint64(remote_bootstrap_timeout_poll_period_ms); + +namespace kudu { +namespace tserver { + +using consensus::MaximumOpId; +using consensus::MinimumOpId; +using consensus::OpIdEquals; +using env_util::ReadFully; +using log::ReadableLogSegment; +using rpc::ErrorStatusPB; +using rpc::RpcController; + +class RemoteBootstrapServiceTest : public RemoteBootstrapTest { + public: + RemoteBootstrapServiceTest() { + // Poll for session expiration every 10 ms for the session timeout test. 
+ FLAGS_remote_bootstrap_timeout_poll_period_ms = 10; + } + + protected: + void SetUp() OVERRIDE { + RemoteBootstrapTest::SetUp(); + remote_bootstrap_proxy_.reset( + new RemoteBootstrapServiceProxy(client_messenger_, mini_server_->bound_rpc_addr())); + } + + Status DoBeginRemoteBootstrapSession(const string& tablet_id, + const string& requestor_uuid, + BeginRemoteBootstrapSessionResponsePB* resp, + RpcController* controller) { + controller->set_timeout(MonoDelta::FromSeconds(1.0)); + BeginRemoteBootstrapSessionRequestPB req; + req.set_tablet_id(tablet_id); + req.set_requestor_uuid(requestor_uuid); + return UnwindRemoteError( + remote_bootstrap_proxy_->BeginRemoteBootstrapSession(req, resp, controller), controller); + } + + Status DoBeginValidRemoteBootstrapSession(string* session_id, + tablet::TabletSuperBlockPB* superblock = nullptr, + uint64_t* idle_timeout_millis = nullptr, + vector* sequence_numbers = nullptr) { + BeginRemoteBootstrapSessionResponsePB resp; + RpcController controller; + RETURN_NOT_OK(DoBeginRemoteBootstrapSession(GetTabletId(), GetLocalUUID(), &resp, &controller)); + *session_id = resp.session_id(); + if (superblock) { + *superblock = resp.superblock(); + } + if (idle_timeout_millis) { + *idle_timeout_millis = resp.session_idle_timeout_millis(); + } + if (sequence_numbers) { + sequence_numbers->assign(resp.wal_segment_seqnos().begin(), resp.wal_segment_seqnos().end()); + } + return Status::OK(); + } + + Status DoCheckSessionActive(const string& session_id, + CheckRemoteBootstrapSessionActiveResponsePB* resp, + RpcController* controller) { + controller->set_timeout(MonoDelta::FromSeconds(1.0)); + CheckRemoteBootstrapSessionActiveRequestPB req; + req.set_session_id(session_id); + return UnwindRemoteError( + remote_bootstrap_proxy_->CheckSessionActive(req, resp, controller), controller); + } + + Status DoFetchData(const string& session_id, const DataIdPB& data_id, + uint64_t* offset, int64_t* max_length, + FetchDataResponsePB* resp, + 
RpcController* controller) { + controller->set_timeout(MonoDelta::FromSeconds(1.0)); + FetchDataRequestPB req; + req.set_session_id(session_id); + req.mutable_data_id()->CopyFrom(data_id); + if (offset) { + req.set_offset(*offset); + } + if (max_length) { + req.set_max_length(*max_length); + } + return UnwindRemoteError( + remote_bootstrap_proxy_->FetchData(req, resp, controller), controller); + } + + Status DoEndRemoteBootstrapSession(const string& session_id, bool is_success, + const Status* error_msg, + EndRemoteBootstrapSessionResponsePB* resp, + RpcController* controller) { + controller->set_timeout(MonoDelta::FromSeconds(1.0)); + EndRemoteBootstrapSessionRequestPB req; + req.set_session_id(session_id); + req.set_is_success(is_success); + if (error_msg) { + StatusToPB(*error_msg, req.mutable_error()); + } + return UnwindRemoteError( + remote_bootstrap_proxy_->EndRemoteBootstrapSession(req, resp, controller), controller); + } + + // Decode the remote error into a Status object. + Status ExtractRemoteError(const ErrorStatusPB* remote_error) { + const RemoteBootstrapErrorPB& error = + remote_error->GetExtension(RemoteBootstrapErrorPB::remote_bootstrap_error_ext); + return StatusFromPB(error.status()); + } + + // Enhance a RemoteError Status message with additional details from the remote. 
+ Status UnwindRemoteError(Status status, const RpcController* controller) { + if (!status.IsRemoteError()) { + return status; + } + Status remote_error = ExtractRemoteError(controller->error_response()); + return status.CloneAndPrepend(remote_error.ToString()); + } + + void AssertRemoteError(Status status, const ErrorStatusPB* remote_error, + const RemoteBootstrapErrorPB::Code app_code, + const string& status_code_string) { + ASSERT_TRUE(status.IsRemoteError()) << "Unexpected status code: " << status.ToString() + << ", app code: " + << RemoteBootstrapErrorPB::Code_Name(app_code) + << ", status code string: " << status_code_string; + const Status app_status = ExtractRemoteError(remote_error); + const RemoteBootstrapErrorPB& error = + remote_error->GetExtension(RemoteBootstrapErrorPB::remote_bootstrap_error_ext); + ASSERT_EQ(app_code, error.code()) << error.ShortDebugString(); + ASSERT_EQ(status_code_string, app_status.CodeAsString()) << app_status.ToString(); + LOG(INFO) << app_status.ToString(); + } + + // Return BlockId in format suitable for a FetchData() call. + static DataIdPB AsDataTypeId(const BlockId& block_id) { + DataIdPB data_id; + data_id.set_type(DataIdPB::BLOCK); + block_id.CopyToPB(data_id.mutable_block_id()); + return data_id; + } + + gscoped_ptr remote_bootstrap_proxy_; +}; + +// Test beginning and ending a remote bootstrap session. +TEST_F(RemoteBootstrapServiceTest, TestSimpleBeginEndSession) { + string session_id; + tablet::TabletSuperBlockPB superblock; + uint64_t idle_timeout_millis; + vector segment_seqnos; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id, + &superblock, + &idle_timeout_millis, + &segment_seqnos)); + // Basic validation of returned params. + ASSERT_FALSE(session_id.empty()); + ASSERT_EQ(FLAGS_remote_bootstrap_idle_timeout_ms, idle_timeout_millis); + ASSERT_TRUE(superblock.IsInitialized()); + // We should have number of segments = number of rolls + 1 (due to the active segment). 
+ ASSERT_EQ(kNumLogRolls + 1, segment_seqnos.size()); + + EndRemoteBootstrapSessionResponsePB resp; + RpcController controller; + ASSERT_OK(DoEndRemoteBootstrapSession(session_id, true, nullptr, &resp, &controller)); +} + +// Test starting two sessions. The current implementation will silently only create one. +TEST_F(RemoteBootstrapServiceTest, TestBeginTwice) { + // Second time through should silently succeed. + for (int i = 0; i < 2; i++) { + string session_id; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id)); + ASSERT_FALSE(session_id.empty()); + } +} + +// Test bad session id error condition. +TEST_F(RemoteBootstrapServiceTest, TestInvalidSessionId) { + vector bad_session_ids; + bad_session_ids.push_back("hodor"); + bad_session_ids.push_back(GetLocalUUID()); + + // Fetch a block for a non-existent session. + for (const string& session_id : bad_session_ids) { + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.set_type(DataIdPB::BLOCK); + data_id.mutable_block_id()->set_id(1); + Status status = DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), RemoteBootstrapErrorPB::NO_SESSION, + Status::NotFound("").CodeAsString()); + } + + // End a non-existent session. + for (const string& session_id : bad_session_ids) { + EndRemoteBootstrapSessionResponsePB resp; + RpcController controller; + Status status = DoEndRemoteBootstrapSession(session_id, true, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), RemoteBootstrapErrorPB::NO_SESSION, + Status::NotFound("").CodeAsString()); + } +} + +// Test bad tablet id error condition. 
+TEST_F(RemoteBootstrapServiceTest, TestInvalidTabletId) { + BeginRemoteBootstrapSessionResponsePB resp; + RpcController controller; + Status status = + DoBeginRemoteBootstrapSession("some-unknown-tablet", GetLocalUUID(), &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), RemoteBootstrapErrorPB::TABLET_NOT_FOUND, + Status::NotFound("").CodeAsString()); +} + +// Test DataIdPB validation. +TEST_F(RemoteBootstrapServiceTest, TestInvalidBlockOrOpId) { + string session_id; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id)); + + // Invalid BlockId. + { + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.set_type(DataIdPB::BLOCK); + data_id.mutable_block_id()->set_id(1); + Status status = DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), + RemoteBootstrapErrorPB::BLOCK_NOT_FOUND, + Status::NotFound("").CodeAsString()); + } + + // Invalid Segment Sequence Number for log fetch. + { + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.set_type(DataIdPB::LOG_SEGMENT); + data_id.set_wal_segment_seqno(31337); + Status status = DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), + RemoteBootstrapErrorPB::WAL_SEGMENT_NOT_FOUND, + Status::NotFound("").CodeAsString()); + } + + // Empty data type with BlockId. + // The RPC system will not let us send the required type field. 
+ { + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.mutable_block_id()->set_id(1); + Status status = DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller); + ASSERT_TRUE(status.IsInvalidArgument()); + } + + // Empty data type id (no BlockId, no Segment Sequence Number); + { + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.set_type(DataIdPB::LOG_SEGMENT); + Status status = DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), + RemoteBootstrapErrorPB::INVALID_REMOTE_BOOTSTRAP_REQUEST, + Status::InvalidArgument("").CodeAsString()); + } + + // Both BlockId and Segment Sequence Number in the same "union" PB (illegal). + { + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.set_type(DataIdPB::BLOCK); + data_id.mutable_block_id()->set_id(1); + data_id.set_wal_segment_seqno(0); + Status status = DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), + RemoteBootstrapErrorPB::INVALID_REMOTE_BOOTSTRAP_REQUEST, + Status::InvalidArgument("").CodeAsString()); + } +} + +// Test invalid file offset error condition. +TEST_F(RemoteBootstrapServiceTest, TestFetchInvalidBlockOffset) { + string session_id; + tablet::TabletSuperBlockPB superblock; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id, &superblock)); + + FetchDataResponsePB resp; + RpcController controller; + // Impossible offset. + uint64_t offset = std::numeric_limits::max(); + Status status = DoFetchData(session_id, AsDataTypeId(FirstColumnBlockId(superblock)), + &offset, nullptr, &resp, &controller); + ASSERT_REMOTE_ERROR(status, controller.error_response(), + RemoteBootstrapErrorPB::INVALID_REMOTE_BOOTSTRAP_REQUEST, + Status::InvalidArgument("").CodeAsString()); +} + +// Test that we are able to fetch an entire block. 
+TEST_F(RemoteBootstrapServiceTest, TestFetchBlockAtOnce) { + string session_id; + tablet::TabletSuperBlockPB superblock; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id, &superblock)); + + // Local. + BlockId block_id = FirstColumnBlockId(superblock); + Slice local_data; + faststring scratch; + ASSERT_OK(ReadLocalBlockFile(mini_server_->server()->fs_manager(), block_id, + &scratch, &local_data)); + + // Remote. + FetchDataResponsePB resp; + RpcController controller; + ASSERT_OK(DoFetchData(session_id, AsDataTypeId(block_id), nullptr, nullptr, &resp, &controller)); + + AssertDataEqual(local_data.data(), local_data.size(), resp.chunk()); +} + +// Test that we are able to incrementally fetch blocks. +TEST_F(RemoteBootstrapServiceTest, TestFetchBlockIncrementally) { + string session_id; + tablet::TabletSuperBlockPB superblock; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id, &superblock)); + + BlockId block_id = FirstColumnBlockId(superblock); + Slice local_data; + faststring scratch; + ASSERT_OK(ReadLocalBlockFile(mini_server_->server()->fs_manager(), block_id, + &scratch, &local_data)); + + // Grab the remote data in several chunks. + int64_t block_size = local_data.size(); + int64_t max_chunk_size = block_size / 5; + uint64_t offset = 0; + while (offset < block_size) { + FetchDataResponsePB resp; + RpcController controller; + ASSERT_OK(DoFetchData(session_id, AsDataTypeId(block_id), + &offset, &max_chunk_size, &resp, &controller)); + int64_t returned_bytes = resp.chunk().data().size(); + ASSERT_LE(returned_bytes, max_chunk_size); + AssertDataEqual(local_data.data() + offset, returned_bytes, resp.chunk()); + offset += returned_bytes; + } +} + +// Test that we are able to fetch log segments. 
+TEST_F(RemoteBootstrapServiceTest, TestFetchLog) { + string session_id; + tablet::TabletSuperBlockPB superblock; + uint64_t idle_timeout_millis; + vector segment_seqnos; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id, + &superblock, + &idle_timeout_millis, + &segment_seqnos)); + + ASSERT_EQ(kNumLogRolls + 1, segment_seqnos.size()); + uint64_t seg_seqno = *segment_seqnos.begin(); + + // Fetch the remote data. + FetchDataResponsePB resp; + RpcController controller; + DataIdPB data_id; + data_id.set_type(DataIdPB::LOG_SEGMENT); + data_id.set_wal_segment_seqno(seg_seqno); + ASSERT_OK(DoFetchData(session_id, data_id, nullptr, nullptr, &resp, &controller)); + + // Fetch the local data. + log::SegmentSequence local_segments; + ASSERT_OK(tablet_peer_->log()->GetLogReader()->GetSegmentsSnapshot(&local_segments)); + + uint64_t first_seg_seqno = (*local_segments.begin())->header().sequence_number(); + + + ASSERT_EQ(seg_seqno, first_seg_seqno) + << "Expected equal sequence numbers: " << seg_seqno + << " and " << first_seg_seqno; + const scoped_refptr& segment = local_segments[0]; + faststring scratch; + int64_t size = segment->file_size(); + scratch.resize(size); + Slice slice; + ASSERT_OK(ReadFully(segment->readable_file().get(), 0, size, &slice, scratch.data())); + + AssertDataEqual(slice.data(), slice.size(), resp.chunk()); +} + +// Test that the remote bootstrap session timeout works properly. +TEST_F(RemoteBootstrapServiceTest, TestSessionTimeout) { + // This flag should be seen by the service due to TSO. + // We have also reduced the timeout polling frequency in SetUp(). + FLAGS_remote_bootstrap_idle_timeout_ms = 1; // Expire the session almost immediately. + + // Start session. 
+ string session_id; + ASSERT_OK(DoBeginValidRemoteBootstrapSession(&session_id)); + + MonoTime start_time = MonoTime::Now(MonoTime::FINE); + CheckRemoteBootstrapSessionActiveResponsePB resp; + + do { + RpcController controller; + ASSERT_OK(DoCheckSessionActive(session_id, &resp, &controller)); + if (!resp.session_is_active()) { + break; + } + SleepFor(MonoDelta::FromMilliseconds(1)); // 1 ms + } while (MonoTime::Now(MonoTime::FINE).GetDeltaSince(start_time).ToSeconds() < 10); + + ASSERT_FALSE(resp.session_is_active()) << "Remote bootstrap session did not time out!"; +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/remote_bootstrap_service.cc b/src/kudu/tserver/remote_bootstrap_service.cc new file mode 100644 index 000000000000..eb9ab8943fd4 --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_service.cc @@ -0,0 +1,359 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/remote_bootstrap_service.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/log.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/map-util.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/tserver/remote_bootstrap_session.h" +#include "kudu/tserver/tablet_peer_lookup.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/util/crc.h" +#include "kudu/util/fault_injection.h" +#include "kudu/util/flag_tags.h" + +// Note, this macro assumes the existence of a local var named 'context'. +#define RPC_RETURN_APP_ERROR(app_err, message, s) \ + do { \ + SetupErrorAndRespond(context, app_err, message, s); \ + return; \ + } while (false) + +#define RPC_RETURN_NOT_OK(expr, app_err, message) \ + do { \ + Status s = (expr); \ + if (!s.ok()) { \ + RPC_RETURN_APP_ERROR(app_err, message, s); \ + } \ + } while (false) + +DEFINE_uint64(remote_bootstrap_idle_timeout_ms, 180000, + "Amount of time without activity before a remote bootstrap " + "session will expire, in millis"); +TAG_FLAG(remote_bootstrap_idle_timeout_ms, hidden); + +DEFINE_uint64(remote_bootstrap_timeout_poll_period_ms, 10000, + "How often the remote_bootstrap service polls for expired " + "remote bootstrap sessions, in millis"); +TAG_FLAG(remote_bootstrap_timeout_poll_period_ms, hidden); + +DEFINE_double(fault_crash_on_handle_rb_fetch_data, 0.0, + "Fraction of the time when the tablet will crash while " + "servicing a RemoteBootstrapService FetchData() RPC call. 
" + "(For testing only!)"); +TAG_FLAG(fault_crash_on_handle_rb_fetch_data, unsafe); + +namespace kudu { +namespace tserver { + +using crc::Crc32c; +using strings::Substitute; +using tablet::TabletPeer; + +static void SetupErrorAndRespond(rpc::RpcContext* context, + RemoteBootstrapErrorPB::Code code, + const string& message, + const Status& s) { + LOG(WARNING) << "Error handling RemoteBootstrapService RPC request from " + << context->requestor_string() << ": " + << s.ToString(); + RemoteBootstrapErrorPB error; + StatusToPB(s, error.mutable_status()); + error.set_code(code); + context->RespondApplicationError(RemoteBootstrapErrorPB::remote_bootstrap_error_ext.number(), + message, error); +} + +RemoteBootstrapServiceImpl::RemoteBootstrapServiceImpl( + FsManager* fs_manager, + TabletPeerLookupIf* tablet_peer_lookup, + const scoped_refptr& metric_entity) + : RemoteBootstrapServiceIf(metric_entity), + fs_manager_(CHECK_NOTNULL(fs_manager)), + tablet_peer_lookup_(CHECK_NOTNULL(tablet_peer_lookup)), + shutdown_latch_(1) { + CHECK_OK(Thread::Create("remote-bootstrap", "rb-session-exp", + &RemoteBootstrapServiceImpl::EndExpiredSessions, this, + &session_expiration_thread_)); +} + +void RemoteBootstrapServiceImpl::BeginRemoteBootstrapSession( + const BeginRemoteBootstrapSessionRequestPB* req, + BeginRemoteBootstrapSessionResponsePB* resp, + rpc::RpcContext* context) { + const string& requestor_uuid = req->requestor_uuid(); + const string& tablet_id = req->tablet_id(); + + // For now, we use the requestor_uuid with the tablet id as the session id, + // but there is no guarantee this will not change in the future. 
+ const string session_id = Substitute("$0-$1", requestor_uuid, tablet_id); + + scoped_refptr tablet_peer; + RPC_RETURN_NOT_OK(tablet_peer_lookup_->GetTabletPeer(tablet_id, &tablet_peer), + RemoteBootstrapErrorPB::TABLET_NOT_FOUND, + Substitute("Unable to find specified tablet: $0", tablet_id)); + + scoped_refptr session; + { + boost::lock_guard l(sessions_lock_); + if (!FindCopy(sessions_, session_id, &session)) { + LOG(INFO) << "Beginning new remote bootstrap session on tablet " << tablet_id + << " from peer " << requestor_uuid << " at " << context->requestor_string() + << ": session id = " << session_id; + session.reset(new RemoteBootstrapSession(tablet_peer, session_id, + requestor_uuid, fs_manager_)); + RPC_RETURN_NOT_OK(session->Init(), + RemoteBootstrapErrorPB::UNKNOWN_ERROR, + Substitute("Error initializing remote bootstrap session for tablet $0", + tablet_id)); + InsertOrDie(&sessions_, session_id, session); + } else { + LOG(INFO) << "Re-initializing existing remote bootstrap session on tablet " << tablet_id + << " from peer " << requestor_uuid << " at " << context->requestor_string() + << ": session id = " << session_id; + RPC_RETURN_NOT_OK(session->Init(), + RemoteBootstrapErrorPB::UNKNOWN_ERROR, + Substitute("Error initializing remote bootstrap session for tablet $0", + tablet_id)); + } + ResetSessionExpirationUnlocked(session_id); + } + + resp->set_session_id(session_id); + resp->set_session_idle_timeout_millis(FLAGS_remote_bootstrap_idle_timeout_ms); + resp->mutable_superblock()->CopyFrom(session->tablet_superblock()); + resp->mutable_initial_committed_cstate()->CopyFrom(session->initial_committed_cstate()); + + for (const scoped_refptr& segment : session->log_segments()) { + resp->add_wal_segment_seqnos(segment->header().sequence_number()); + } + + context->RespondSuccess(); +} + +void RemoteBootstrapServiceImpl::CheckSessionActive( + const CheckRemoteBootstrapSessionActiveRequestPB* req, + CheckRemoteBootstrapSessionActiveResponsePB* resp, + 
rpc::RpcContext* context) { + const string& session_id = req->session_id(); + + // Look up and validate remote bootstrap session. + scoped_refptr session; + boost::lock_guard l(sessions_lock_); + RemoteBootstrapErrorPB::Code app_error; + Status status = FindSessionUnlocked(session_id, &app_error, &session); + if (status.ok()) { + if (req->keepalive()) { + ResetSessionExpirationUnlocked(session_id); + } + resp->set_session_is_active(true); + context->RespondSuccess(); + return; + } else if (app_error == RemoteBootstrapErrorPB::NO_SESSION) { + resp->set_session_is_active(false); + context->RespondSuccess(); + return; + } else { + RPC_RETURN_NOT_OK(status, app_error, + Substitute("Error trying to check whether session $0 is active", session_id)); + } +} + +void RemoteBootstrapServiceImpl::FetchData(const FetchDataRequestPB* req, + FetchDataResponsePB* resp, + rpc::RpcContext* context) { + const string& session_id = req->session_id(); + + // Look up and validate remote bootstrap session. + scoped_refptr session; + { + boost::lock_guard l(sessions_lock_); + RemoteBootstrapErrorPB::Code app_error; + RPC_RETURN_NOT_OK(FindSessionUnlocked(session_id, &app_error, &session), + app_error, "No such session"); + ResetSessionExpirationUnlocked(session_id); + } + + MAYBE_FAULT(FLAGS_fault_crash_on_handle_rb_fetch_data); + + uint64_t offset = req->offset(); + int64_t client_maxlen = req->max_length(); + + const DataIdPB& data_id = req->data_id(); + RemoteBootstrapErrorPB::Code error_code = RemoteBootstrapErrorPB::UNKNOWN_ERROR; + RPC_RETURN_NOT_OK(ValidateFetchRequestDataId(data_id, &error_code, session), + error_code, "Invalid DataId"); + + DataChunkPB* data_chunk = resp->mutable_chunk(); + string* data = data_chunk->mutable_data(); + int64_t total_data_length = 0; + if (data_id.type() == DataIdPB::BLOCK) { + // Fetching a data block chunk. 
+ const BlockId& block_id = BlockId::FromPB(data_id.block_id()); + RPC_RETURN_NOT_OK(session->GetBlockPiece(block_id, offset, client_maxlen, + data, &total_data_length, &error_code), + error_code, "Unable to get piece of data block"); + } else { + // Fetching a log segment chunk. + uint64_t segment_seqno = data_id.wal_segment_seqno(); + RPC_RETURN_NOT_OK(session->GetLogSegmentPiece(segment_seqno, offset, client_maxlen, + data, &total_data_length, &error_code), + error_code, "Unable to get piece of log segment"); + } + + data_chunk->set_total_data_length(total_data_length); + data_chunk->set_offset(offset); + + // Calculate checksum. + uint32_t crc32 = Crc32c(data->data(), data->length()); + data_chunk->set_crc32(crc32); + + context->RespondSuccess(); +} + +void RemoteBootstrapServiceImpl::EndRemoteBootstrapSession( + const EndRemoteBootstrapSessionRequestPB* req, + EndRemoteBootstrapSessionResponsePB* resp, + rpc::RpcContext* context) { + { + boost::lock_guard l(sessions_lock_); + RemoteBootstrapErrorPB::Code app_error; + LOG(INFO) << "Request end of remote bootstrap session " << req->session_id() + << " received from " << context->requestor_string(); + RPC_RETURN_NOT_OK(DoEndRemoteBootstrapSessionUnlocked(req->session_id(), &app_error), + app_error, "No such session"); + } + context->RespondSuccess(); +} + +void RemoteBootstrapServiceImpl::Shutdown() { + shutdown_latch_.CountDown(); + session_expiration_thread_->Join(); + + // Destroy all remote bootstrap sessions. 
+ vector session_ids; + for (const MonoTimeMap::value_type& entry : session_expirations_) { + session_ids.push_back(entry.first); + } + for (const string& session_id : session_ids) { + LOG(INFO) << "Destroying remote bootstrap session " << session_id << " due to service shutdown"; + RemoteBootstrapErrorPB::Code app_error; + CHECK_OK(DoEndRemoteBootstrapSessionUnlocked(session_id, &app_error)); + } +} + +Status RemoteBootstrapServiceImpl::FindSessionUnlocked( + const string& session_id, + RemoteBootstrapErrorPB::Code* app_error, + scoped_refptr* session) const { + if (!FindCopy(sessions_, session_id, session)) { + *app_error = RemoteBootstrapErrorPB::NO_SESSION; + return Status::NotFound( + Substitute("Remote bootstrap session with Session ID \"$0\" not found", session_id)); + } + return Status::OK(); +} + +Status RemoteBootstrapServiceImpl::ValidateFetchRequestDataId( + const DataIdPB& data_id, + RemoteBootstrapErrorPB::Code* app_error, + const scoped_refptr& session) const { + if (PREDICT_FALSE(data_id.has_block_id() && data_id.has_wal_segment_seqno())) { + *app_error = RemoteBootstrapErrorPB::INVALID_REMOTE_BOOTSTRAP_REQUEST; + return Status::InvalidArgument( + Substitute("Only one of BlockId or segment sequence number are required, " + "but both were specified. DataTypeID: $0", data_id.ShortDebugString())); + } else if (PREDICT_FALSE(!data_id.has_block_id() && !data_id.has_wal_segment_seqno())) { + *app_error = RemoteBootstrapErrorPB::INVALID_REMOTE_BOOTSTRAP_REQUEST; + return Status::InvalidArgument( + Substitute("Only one of BlockId or segment sequence number are required, " + "but neither were specified. 
DataTypeID: $0", data_id.ShortDebugString())); + } + + if (data_id.type() == DataIdPB::BLOCK) { + if (PREDICT_FALSE(!data_id.has_block_id())) { + return Status::InvalidArgument("block_id must be specified for type == BLOCK", + data_id.ShortDebugString()); + } + } else { + if (PREDICT_FALSE(!data_id.wal_segment_seqno())) { + return Status::InvalidArgument( + "segment sequence number must be specified for type == LOG_SEGMENT", + data_id.ShortDebugString()); + } + } + + return Status::OK(); +} + +void RemoteBootstrapServiceImpl::ResetSessionExpirationUnlocked(const std::string& session_id) { + MonoTime expiration(MonoTime::Now(MonoTime::FINE)); + expiration.AddDelta(MonoDelta::FromMilliseconds(FLAGS_remote_bootstrap_idle_timeout_ms)); + InsertOrUpdate(&session_expirations_, session_id, expiration); +} + +Status RemoteBootstrapServiceImpl::DoEndRemoteBootstrapSessionUnlocked( + const std::string& session_id, + RemoteBootstrapErrorPB::Code* app_error) { + scoped_refptr session; + RETURN_NOT_OK(FindSessionUnlocked(session_id, app_error, &session)); + // Remove the session from the map. + // It will get destroyed once there are no outstanding refs. 
+ LOG(INFO) << "Ending remote bootstrap session " << session_id << " on tablet " + << session->tablet_id() << " with peer " << session->requestor_uuid(); + CHECK_EQ(1, sessions_.erase(session_id)); + CHECK_EQ(1, session_expirations_.erase(session_id)); + + return Status::OK(); +} + +void RemoteBootstrapServiceImpl::EndExpiredSessions() { + do { + boost::lock_guard l(sessions_lock_); + MonoTime now = MonoTime::Now(MonoTime::FINE); + + vector expired_session_ids; + for (const MonoTimeMap::value_type& entry : session_expirations_) { + const string& session_id = entry.first; + const MonoTime& expiration = entry.second; + if (expiration.ComesBefore(now)) { + expired_session_ids.push_back(session_id); + } + } + for (const string& session_id : expired_session_ids) { + LOG(INFO) << "Remote bootstrap session " << session_id + << " has expired. Terminating session."; + RemoteBootstrapErrorPB::Code app_error; + CHECK_OK(DoEndRemoteBootstrapSessionUnlocked(session_id, &app_error)); + } + } while (!shutdown_latch_.WaitFor(MonoDelta::FromMilliseconds( + FLAGS_remote_bootstrap_timeout_poll_period_ms))); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/remote_bootstrap_service.h b/src/kudu/tserver/remote_bootstrap_service.h new file mode 100644 index 000000000000..897b221b3878 --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_service.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TSERVER_REMOTE_BOOTSTRAP_SERVICE_H_ +#define KUDU_TSERVER_REMOTE_BOOTSTRAP_SERVICE_H_ + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/tserver/remote_bootstrap.service.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" +#include "kudu/util/thread.h" + +namespace kudu { +class FsManager; + +namespace log { +class ReadableLogSegment; +} // namespace log + +namespace tserver { + +class RemoteBootstrapSession; +class TabletPeerLookupIf; + +class RemoteBootstrapServiceImpl : public RemoteBootstrapServiceIf { + public: + RemoteBootstrapServiceImpl(FsManager* fs_manager, + TabletPeerLookupIf* tablet_peer_lookup, + const scoped_refptr& metric_entity); + + virtual void BeginRemoteBootstrapSession(const BeginRemoteBootstrapSessionRequestPB* req, + BeginRemoteBootstrapSessionResponsePB* resp, + rpc::RpcContext* context) OVERRIDE; + + virtual void CheckSessionActive(const CheckRemoteBootstrapSessionActiveRequestPB* req, + CheckRemoteBootstrapSessionActiveResponsePB* resp, + rpc::RpcContext* context) OVERRIDE; + + virtual void FetchData(const FetchDataRequestPB* req, + FetchDataResponsePB* resp, + rpc::RpcContext* context) OVERRIDE; + + virtual void EndRemoteBootstrapSession(const EndRemoteBootstrapSessionRequestPB* req, + EndRemoteBootstrapSessionResponsePB* resp, + rpc::RpcContext* context) OVERRIDE; + + virtual void Shutdown() OVERRIDE; + + private: + typedef 
std::unordered_map > SessionMap; + typedef std::unordered_map MonoTimeMap; + + // Look up session in session map. + Status FindSessionUnlocked(const std::string& session_id, + RemoteBootstrapErrorPB::Code* app_error, + scoped_refptr* session) const; + + // Validate the data identifier in a FetchData request. + Status ValidateFetchRequestDataId(const DataIdPB& data_id, + RemoteBootstrapErrorPB::Code* app_error, + const scoped_refptr& session) const; + + // Take note of session activity; Re-update the session timeout deadline. + void ResetSessionExpirationUnlocked(const std::string& session_id); + + // Destroy the specified remote bootstrap session. + Status DoEndRemoteBootstrapSessionUnlocked(const std::string& session_id, + RemoteBootstrapErrorPB::Code* app_error); + + // The timeout thread periodically checks whether sessions are expired and + // removes them from the map. + void EndExpiredSessions(); + + FsManager* fs_manager_; + TabletPeerLookupIf* tablet_peer_lookup_; + + // Protects sessions_ and session_expirations_ maps. + mutable simple_spinlock sessions_lock_; + SessionMap sessions_; + MonoTimeMap session_expirations_; + + // Session expiration thread. + // TODO: this is a hack, replace with some kind of timer impl. See KUDU-286. + CountDownLatch shutdown_latch_; + scoped_refptr session_expiration_thread_; +}; + +} // namespace tserver +} // namespace kudu + +#endif // KUDU_TSERVER_REMOTE_BOOTSTRAP_SERVICE_H_ diff --git a/src/kudu/tserver/remote_bootstrap_session-test.cc b/src/kudu/tserver/remote_bootstrap_session-test.cc new file mode 100644 index 000000000000..04b876b7221b --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_session-test.cc @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/tablet/tablet-test-util.h" + +#include +#include +#include + +#include "kudu/common/partial_row.h" +#include "kudu/common/row_operations.h" +#include "kudu/common/schema.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/block_id.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/messenger.h" +#include "kudu/tserver/remote_bootstrap_session.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/util/crc.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" +#include "kudu/util/threadpool.h" + +METRIC_DECLARE_entity(tablet); + +using std::shared_ptr; +using std::string; + +namespace kudu { +namespace tserver { + +using consensus::ConsensusMetadata; +using consensus::OpId; +using consensus::RaftConfigPB; +using consensus::RaftPeerPB; +using fs::ReadableBlock; +using log::Log; +using log::LogOptions; +using log::LogAnchorRegistry; +using rpc::Messenger; +using rpc::MessengerBuilder; +using strings::Substitute; +using tablet::ColumnDataPB; +using tablet::DeltaDataPB; +using tablet::KuduTabletTest; +using tablet::RowSetDataPB; +using tablet::TabletPeer; +using tablet::TabletSuperBlockPB; +using 
tablet::WriteTransactionState; + +class RemoteBootstrapTest : public KuduTabletTest { + public: + RemoteBootstrapTest() + : KuduTabletTest(Schema({ ColumnSchema("key", STRING), + ColumnSchema("val", INT32) }, 1)) { + CHECK_OK(ThreadPoolBuilder("test-exec").Build(&apply_pool_)); + } + + virtual void SetUp() OVERRIDE { + KuduTabletTest::SetUp(); + SetUpTabletPeer(); + ASSERT_NO_FATAL_FAILURE(PopulateTablet()); + InitSession(); + } + + virtual void TearDown() OVERRIDE { + session_.reset(); + tablet_peer_->Shutdown(); + KuduTabletTest::TearDown(); + } + + protected: + void SetUpTabletPeer() { + scoped_refptr log; + CHECK_OK(Log::Open(LogOptions(), fs_manager(), tablet()->tablet_id(), + *tablet()->schema(), + 0, // schema_version + NULL, &log)); + + scoped_refptr metric_entity = + METRIC_ENTITY_tablet.Instantiate(&metric_registry_, CURRENT_TEST_NAME()); + + RaftPeerPB config_peer; + config_peer.set_permanent_uuid(fs_manager()->uuid()); + config_peer.set_member_type(RaftPeerPB::VOTER); + + tablet_peer_.reset( + new TabletPeer(tablet()->metadata(), + config_peer, + apply_pool_.get(), + Bind(&RemoteBootstrapTest::TabletPeerStateChangedCallback, + Unretained(this), + tablet()->tablet_id()))); + + // TODO similar to code in tablet_peer-test, consider refactor. 
+ RaftConfigPB config; + config.set_local(true); + config.add_peers()->CopyFrom(config_peer); + config.set_opid_index(consensus::kInvalidOpIdIndex); + + gscoped_ptr cmeta; + CHECK_OK(ConsensusMetadata::Create(tablet()->metadata()->fs_manager(), + tablet()->tablet_id(), fs_manager()->uuid(), + config, consensus::kMinimumTerm, &cmeta)); + + shared_ptr messenger; + MessengerBuilder mbuilder(CURRENT_TEST_NAME()); + mbuilder.Build(&messenger); + + log_anchor_registry_.reset(new LogAnchorRegistry()); + tablet_peer_->SetBootstrapping(); + CHECK_OK(tablet_peer_->Init(tablet(), + clock(), + messenger, + log, + metric_entity)); + consensus::ConsensusBootstrapInfo boot_info; + CHECK_OK(tablet_peer_->Start(boot_info)); + + ASSERT_OK(tablet_peer_->WaitUntilConsensusRunning(MonoDelta::FromSeconds(2))); + } + + void TabletPeerStateChangedCallback(const string& tablet_id, const string& reason) { + LOG(INFO) << "Tablet peer state changed for tablet " << tablet_id << ". Reason: " << reason; + } + + void PopulateTablet() { + for (int32_t i = 0; i < 1000; i++) { + WriteRequestPB req; + req.set_tablet_id(tablet_peer_->tablet_id()); + ASSERT_OK(SchemaToPB(client_schema_, req.mutable_schema())); + RowOperationsPB* data = req.mutable_row_operations(); + RowOperationsPBEncoder enc(data); + KuduPartialRow row(&client_schema_); + + string key = Substitute("key$0", i); + ASSERT_OK(row.SetString(0, key)); + ASSERT_OK(row.SetInt32(1, i)); + enc.Add(RowOperationsPB::INSERT, row); + + WriteResponsePB resp; + CountDownLatch latch(1); + + auto state = new WriteTransactionState(tablet_peer_.get(), &req, &resp); + state->set_completion_callback(gscoped_ptr( + new tablet::LatchTransactionCompletionCallback(&latch, &resp)).Pass()); + ASSERT_OK(tablet_peer_->SubmitWrite(state)); + latch.Wait(); + ASSERT_FALSE(resp.has_error()) << "Request failed: " << resp.error().ShortDebugString(); + ASSERT_EQ(0, resp.per_row_errors_size()) << "Insert error: " << resp.ShortDebugString(); + } + 
ASSERT_OK(tablet()->Flush()); + } + + void InitSession() { + session_.reset(new RemoteBootstrapSession(tablet_peer_.get(), "TestSession", "FakeUUID", + fs_manager())); + CHECK_OK(session_->Init()); + } + + // Read the specified BlockId, via the RemoteBootstrapSession, into a file. + // 'path' will be populated with the name of the file used. + // 'file' will be set to point to the SequentialFile containing the data. + void FetchBlockToFile(const BlockId& block_id, + string* path, + gscoped_ptr* file) { + string data; + int64_t block_file_size = 0; + RemoteBootstrapErrorPB::Code error_code; + CHECK_OK(session_->GetBlockPiece(block_id, 0, 0, &data, &block_file_size, &error_code)); + if (block_file_size > 0) { + CHECK_GT(data.size(), 0); + } + + // Write the file to a temporary location. + WritableFileOptions opts; + string path_template = GetTestPath(Substitute("test_block_$0.tmp.XXXXXX", block_id.ToString())); + gscoped_ptr writable_file; + CHECK_OK(Env::Default()->NewTempWritableFile(opts, path_template, path, &writable_file)); + CHECK_OK(writable_file->Append(Slice(data.data(), data.size()))); + CHECK_OK(writable_file->Close()); + + CHECK_OK(Env::Default()->NewSequentialFile(*path, file)); + } + + MetricRegistry metric_registry_; + scoped_refptr log_anchor_registry_; + gscoped_ptr apply_pool_; + scoped_refptr tablet_peer_; + scoped_refptr session_; +}; + +// Ensure that the serialized SuperBlock included in the RemoteBootstrapSession is +// equal to the serialized live superblock (on a quiesced tablet). +TEST_F(RemoteBootstrapTest, TestSuperBlocksEqual) { + // Compare content of superblocks. 
+ faststring session_buf; + faststring tablet_buf; + + { + const TabletSuperBlockPB& session_superblock = session_->tablet_superblock(); + int size = session_superblock.ByteSize(); + session_buf.resize(size); + uint8_t* session_dst = session_buf.data(); + session_dst = session_superblock.SerializeWithCachedSizesToArray(session_dst); + } + + { + TabletSuperBlockPB tablet_superblock; + ASSERT_OK(tablet()->metadata()->ToSuperBlock(&tablet_superblock)); + int size = tablet_superblock.ByteSize(); + tablet_buf.resize(size); + uint8_t* tablet_dst = tablet_buf.data(); + tablet_dst = tablet_superblock.SerializeWithCachedSizesToArray(tablet_dst); + } + + ASSERT_EQ(session_buf.size(), tablet_buf.size()); + int size = tablet_buf.size(); + ASSERT_EQ(0, strings::fastmemcmp_inlined(session_buf.data(), tablet_buf.data(), size)); +} + +// Test fetching all files from tablet server, ensure the checksums for each +// chunk and the total file sizes match. +TEST_F(RemoteBootstrapTest, TestBlocksEqual) { + TabletSuperBlockPB tablet_superblock; + ASSERT_OK(tablet()->metadata()->ToSuperBlock(&tablet_superblock)); + for (int i = 0; i < tablet_superblock.rowsets_size(); i++) { + const RowSetDataPB& rowset = tablet_superblock.rowsets(i); + for (int j = 0; j < rowset.columns_size(); j++) { + const ColumnDataPB& column = rowset.columns(j); + const BlockIdPB& block_id_pb = column.block(); + BlockId block_id = BlockId::FromPB(block_id_pb); + + string path; + gscoped_ptr file; + FetchBlockToFile(block_id, &path, &file); + uint64_t session_block_size = 0; + ASSERT_OK(Env::Default()->GetFileSize(path, &session_block_size)); + faststring buf; + buf.resize(session_block_size); + Slice data; + ASSERT_OK(file->Read(session_block_size, &data, buf.data())); + uint32_t session_crc = crc::Crc32c(data.data(), data.size()); + LOG(INFO) << "session block file has size of " << session_block_size + << " and CRC32C of " << session_crc << ": " << path; + + gscoped_ptr tablet_block; + 
ASSERT_OK(fs_manager()->OpenBlock(block_id, &tablet_block)); + uint64_t tablet_block_size = 0; + ASSERT_OK(tablet_block->Size(&tablet_block_size)); + buf.resize(tablet_block_size); + ASSERT_OK(tablet_block->Read(0, tablet_block_size, &data, buf.data())); + uint32_t tablet_crc = crc::Crc32c(data.data(), data.size()); + LOG(INFO) << "tablet block file has size of " << tablet_block_size + << " and CRC32C of " << tablet_crc + << ": " << block_id; + + // Compare the blocks. + ASSERT_EQ(tablet_block_size, session_block_size); + ASSERT_EQ(tablet_crc, session_crc); + } + } +} + +// Ensure that blocks are still readable through the open session even +// after they've been deleted. +TEST_F(RemoteBootstrapTest, TestBlocksAreFetchableAfterBeingDeleted) { + TabletSuperBlockPB tablet_superblock; + ASSERT_OK(tablet()->metadata()->ToSuperBlock(&tablet_superblock)); + + // Gather all the blocks. + vector data_blocks; + for (const RowSetDataPB& rowset : tablet_superblock.rowsets()) { + for (const DeltaDataPB& redo : rowset.redo_deltas()) { + data_blocks.push_back(BlockId::FromPB(redo.block())); + } + for (const DeltaDataPB& undo : rowset.undo_deltas()) { + data_blocks.push_back(BlockId::FromPB(undo.block())); + } + for (const ColumnDataPB& column : rowset.columns()) { + data_blocks.push_back(BlockId::FromPB(column.block())); + } + if (rowset.has_bloom_block()) { + data_blocks.push_back(BlockId::FromPB(rowset.bloom_block())); + } + if (rowset.has_adhoc_index_block()) { + data_blocks.push_back(BlockId::FromPB(rowset.adhoc_index_block())); + } + } + + // Delete them. + for (const BlockId& block_id : data_blocks) { + ASSERT_OK(fs_manager()->DeleteBlock(block_id)); + } + + // Read them back. 
+ for (const BlockId& block_id : data_blocks) { + ASSERT_TRUE(session_->IsBlockOpenForTests(block_id)); + string data; + RemoteBootstrapErrorPB::Code error_code; + int64_t piece_size; + ASSERT_OK(session_->GetBlockPiece(block_id, 0, 0, + &data, &piece_size, &error_code)); + } +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/remote_bootstrap_session.cc b/src/kudu/tserver/remote_bootstrap_session.cc new file mode 100644 index 000000000000..c0c6aa158cab --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_session.cc @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/remote_bootstrap_session.h" + +#include + +#include "kudu/consensus/log.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/fs/block_manager.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/type_traits.h" +#include "kudu/rpc/transfer.h" +#include "kudu/server/metadata.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/trace.h" + +namespace kudu { +namespace tserver { + +using consensus::MinimumOpId; +using consensus::OpId; +using fs::ReadableBlock; +using log::LogAnchorRegistry; +using log::ReadableLogSegment; +using std::shared_ptr; +using strings::Substitute; +using tablet::ColumnDataPB; +using tablet::DeltaDataPB; +using tablet::RowSetDataPB; +using tablet::TabletMetadata; +using tablet::TabletPeer; +using tablet::TabletSuperBlockPB; + +RemoteBootstrapSession::RemoteBootstrapSession( + const scoped_refptr& tablet_peer, std::string session_id, + std::string requestor_uuid, FsManager* fs_manager) + : tablet_peer_(tablet_peer), + session_id_(std::move(session_id)), + requestor_uuid_(std::move(requestor_uuid)), + fs_manager_(fs_manager), + blocks_deleter_(&blocks_), + logs_deleter_(&logs_) {} + +RemoteBootstrapSession::~RemoteBootstrapSession() { + // No lock taken in the destructor, should only be 1 thread with access now. + CHECK_OK(UnregisterAnchorIfNeededUnlocked()); +} + +Status RemoteBootstrapSession::Init() { + // Take locks to support re-initialization of the same session. + boost::lock_guard l(session_lock_); + RETURN_NOT_OK(UnregisterAnchorIfNeededUnlocked()); + + STLDeleteValues(&blocks_); + STLDeleteValues(&logs_); + blocks_.clear(); + logs_.clear(); + + const string& tablet_id = tablet_peer_->tablet_id(); + + // Prevent log GC while we grab log segments and Tablet metadata. 
+ string anchor_owner_token = Substitute("RemoteBootstrap-$0", session_id_); + tablet_peer_->log_anchor_registry()->Register( + MinimumOpId().index(), anchor_owner_token, &log_anchor_); + + // Read the SuperBlock from disk. + const scoped_refptr& metadata = tablet_peer_->tablet_metadata(); + RETURN_NOT_OK_PREPEND(metadata->ReadSuperBlockFromDisk(&tablet_superblock_), + Substitute("Unable to access superblock for tablet $0", + tablet_id)); + + // Anchor the data blocks by opening them and adding them to the cache. + // + // All subsequent requests should reuse the opened blocks. + vector data_blocks; + TabletMetadata::CollectBlockIdPBs(tablet_superblock_, &data_blocks); + for (const BlockIdPB& block_id : data_blocks) { + LOG(INFO) << "Opening block " << block_id.DebugString(); + RETURN_NOT_OK(OpenBlockUnlocked(BlockId::FromPB(block_id))); + } + + // Get the latest opid in the log at this point in time so we can re-anchor. + OpId last_logged_opid; + tablet_peer_->log()->GetLatestEntryOpId(&last_logged_opid); + + // Get the current segments from the log, including the active segment. + // The Log doesn't add the active segment to the log reader's list until + // a header has been written to it (but it will not have a footer). + RETURN_NOT_OK(tablet_peer_->log()->GetLogReader()->GetSegmentsSnapshot(&log_segments_)); + for (const scoped_refptr& segment : log_segments_) { + RETURN_NOT_OK(OpenLogSegmentUnlocked(segment->header().sequence_number())); + } + LOG(INFO) << "Got snapshot of " << log_segments_.size() << " log segments"; + + // Look up the committed consensus state. + // We do this after snapshotting the log to avoid a scenario where the latest + // entry in the log has a term higher than the term stored in the consensus + // metadata, which will results in a CHECK failure on RaftConsensus init. 
+ scoped_refptr consensus = tablet_peer_->shared_consensus(); + if (!consensus) { + tablet::TabletStatePB tablet_state = tablet_peer_->state(); + return Status::IllegalState(Substitute("Unable to initialize remote bootstrap session " + "for tablet $0. Consensus is not available. Tablet state: $1 ($2)", + tablet_id, tablet::TabletStatePB_Name(tablet_state), tablet_state)); + } + initial_committed_cstate_ = consensus->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED); + + // Re-anchor on the highest OpId that was in the log right before we + // snapshotted the log segments. This helps ensure that we don't end up in a + // remote bootstrap loop due to a follower falling too far behind the + // leader's log when remote bootstrap is slow. The remote controls when + // this anchor is released by ending the remote bootstrap session. + RETURN_NOT_OK(tablet_peer_->log_anchor_registry()->UpdateRegistration( + last_logged_opid.index(), anchor_owner_token, &log_anchor_)); + + return Status::OK(); +} + +const std::string& RemoteBootstrapSession::tablet_id() const { + return tablet_peer_->tablet_id(); +} + +const std::string& RemoteBootstrapSession::requestor_uuid() const { + return requestor_uuid_; +} + +// Determine the length of the data chunk to return to the client. +static int64_t DetermineReadLength(int64_t bytes_remaining, int64_t requested_len) { + // Determine the size of the chunks we want to read. + // Choose "system max" as a multiple of typical HDD block size (4K) with 4K to + // spare for other stuff in the message, like headers, other protobufs, etc. + const int32_t kSpareBytes = 4096; + const int32_t kDiskSectorSize = 4096; + int32_t system_max_chunk_size = + ((FLAGS_rpc_max_message_size - kSpareBytes) / kDiskSectorSize) * kDiskSectorSize; + CHECK_GT(system_max_chunk_size, 0) << "rpc_max_message_size is too low to transfer data: " + << FLAGS_rpc_max_message_size; + + // The min of the {requested, system} maxes is the effective max. 
+ int64_t maxlen = (requested_len > 0) ? std::min(requested_len, system_max_chunk_size) : + system_max_chunk_size; + return std::min(bytes_remaining, maxlen); +} + +// Calculate the size of the data to return given a maximum client message +// length, the file itself, and the offset into the file to be read from. +static Status GetResponseDataSize(int64_t total_size, + uint64_t offset, int64_t client_maxlen, + RemoteBootstrapErrorPB::Code* error_code, int64_t* data_size) { + // If requested offset is off the end of the data, bail. + if (offset >= total_size) { + *error_code = RemoteBootstrapErrorPB::INVALID_REMOTE_BOOTSTRAP_REQUEST; + return Status::InvalidArgument( + Substitute("Requested offset ($0) is beyond the data size ($1)", + offset, total_size)); + } + + int64_t bytes_remaining = total_size - offset; + + *data_size = DetermineReadLength(bytes_remaining, client_maxlen); + DCHECK_GT(*data_size, 0); + if (client_maxlen > 0) { + DCHECK_LE(*data_size, client_maxlen); + } + + return Status::OK(); +} + +// Read a chunk of a file into a buffer. +// data_name provides a string for the block/log to be used in error messages. +template +static Status ReadFileChunkToBuf(const Info* info, + uint64_t offset, int64_t client_maxlen, + const string& data_name, + string* data, int64_t* file_size, + RemoteBootstrapErrorPB::Code* error_code) { + int64_t response_data_size = 0; + RETURN_NOT_OK_PREPEND(GetResponseDataSize(info->size, offset, client_maxlen, error_code, + &response_data_size), + Substitute("Error reading $0", data_name)); + + Stopwatch chunk_timer(Stopwatch::THIS_THREAD); + chunk_timer.start(); + + // Writing into a std::string buffer is basically guaranteed to work on C++11, + // however any modern compiler should be compatible with it. + // Violates the API contract, but avoids excessive copies. 
+ data->resize(response_data_size); + uint8_t* buf = reinterpret_cast(const_cast(data->data())); + Slice slice; + Status s = info->ReadFully(offset, response_data_size, &slice, buf); + if (PREDICT_FALSE(!s.ok())) { + s = s.CloneAndPrepend( + Substitute("Unable to read existing file for $0", data_name)); + LOG(WARNING) << s.ToString(); + *error_code = RemoteBootstrapErrorPB::IO_ERROR; + return s; + } + // Figure out if Slice points to buf or if Slice points to the mmap. + // If it points to the mmap then copy into buf. + if (slice.data() != buf) { + memcpy(buf, slice.data(), slice.size()); + } + chunk_timer.stop(); + TRACE("Remote bootstrap: $0: $1 total bytes read. Total time elapsed: $2", + data_name, response_data_size, chunk_timer.elapsed().ToString()); + + *file_size = info->size; + return Status::OK(); +} + +Status RemoteBootstrapSession::GetBlockPiece(const BlockId& block_id, + uint64_t offset, int64_t client_maxlen, + string* data, int64_t* block_file_size, + RemoteBootstrapErrorPB::Code* error_code) { + ImmutableReadableBlockInfo* block_info; + RETURN_NOT_OK(FindBlock(block_id, &block_info, error_code)); + + RETURN_NOT_OK(ReadFileChunkToBuf(block_info, offset, client_maxlen, + Substitute("block $0", block_id.ToString()), + data, block_file_size, error_code)); + + // Note: We do not eagerly close the block, as doing so may delete the + // underlying data if this was its last reader and it had been previously + // marked for deletion. This would be a problem for parallel readers in + // the same session; they would not be able to find the block. 
+ + return Status::OK(); +} + +Status RemoteBootstrapSession::GetLogSegmentPiece(uint64_t segment_seqno, + uint64_t offset, int64_t client_maxlen, + std::string* data, int64_t* block_file_size, + RemoteBootstrapErrorPB::Code* error_code) { + ImmutableRandomAccessFileInfo* file_info; + RETURN_NOT_OK(FindLogSegment(segment_seqno, &file_info, error_code)); + RETURN_NOT_OK(ReadFileChunkToBuf(file_info, offset, client_maxlen, + Substitute("log segment $0", segment_seqno), + data, block_file_size, error_code)); + + // Note: We do not eagerly close log segment files, since we share ownership + // of the LogSegment objects with the Log itself. + + return Status::OK(); +} + +bool RemoteBootstrapSession::IsBlockOpenForTests(const BlockId& block_id) const { + boost::lock_guard l(session_lock_); + return ContainsKey(blocks_, block_id); +} + +// Add a file to the cache and populate the given ImmutableRandomAcccessFileInfo +// object with the file ref and size. +template +static Status AddImmutableFileToMap(Collection* const cache, + const Key& key, + const Readable& readable, + uint64_t size) { + // Sanity check for 0-length files. + if (size == 0) { + return Status::Corruption("Found 0-length object"); + } + + // Looks good, add it to the cache. 
+ typedef typename Collection::mapped_type InfoPtr; + typedef typename base::remove_pointer::type Info; + InsertOrDie(cache, key, new Info(readable, size)); + + return Status::OK(); +} + +Status RemoteBootstrapSession::OpenBlockUnlocked(const BlockId& block_id) { + DCHECK(session_lock_.is_locked()); + + gscoped_ptr block; + Status s = fs_manager_->OpenBlock(block_id, &block); + if (PREDICT_FALSE(!s.ok())) { + LOG(WARNING) << "Unable to open requested (existing) block file: " + << block_id.ToString() << ": " << s.ToString(); + return s.CloneAndPrepend(Substitute("Unable to open block file for block $0", + block_id.ToString())); + } + + uint64_t size; + s = block->Size(&size); + if (PREDICT_FALSE(!s.ok())) { + return s.CloneAndPrepend("Unable to get size of block"); + } + + s = AddImmutableFileToMap(&blocks_, block_id, block.get(), size); + if (!s.ok()) { + s = s.CloneAndPrepend(Substitute("Error accessing data for block $0", block_id.ToString())); + LOG(DFATAL) << "Data block disappeared: " << s.ToString(); + } else { + ignore_result(block.release()); + } + return s; +} + +Status RemoteBootstrapSession::FindBlock(const BlockId& block_id, + ImmutableReadableBlockInfo** block_info, + RemoteBootstrapErrorPB::Code* error_code) { + Status s; + boost::lock_guard l(session_lock_); + if (!FindCopy(blocks_, block_id, block_info)) { + *error_code = RemoteBootstrapErrorPB::BLOCK_NOT_FOUND; + s = Status::NotFound("Block not found", block_id.ToString()); + } + return s; +} + +Status RemoteBootstrapSession::OpenLogSegmentUnlocked(uint64_t segment_seqno) { + DCHECK(session_lock_.is_locked()); + + scoped_refptr log_segment; + int position = -1; + if (!log_segments_.empty()) { + position = segment_seqno - log_segments_[0]->header().sequence_number(); + } + if (position < 0 || position >= log_segments_.size()) { + return Status::NotFound(Substitute("Segment with sequence number $0 not found", + segment_seqno)); + } + log_segment = log_segments_[position]; + 
CHECK_EQ(log_segment->header().sequence_number(), segment_seqno); + + uint64_t size = log_segment->readable_up_to(); + Status s = AddImmutableFileToMap(&logs_, segment_seqno, log_segment->readable_file(), size); + if (!s.ok()) { + s = s.CloneAndPrepend( + Substitute("Error accessing data for log segment with seqno $0", + segment_seqno)); + LOG(INFO) << s.ToString(); + } + return s; +} + +Status RemoteBootstrapSession::FindLogSegment(uint64_t segment_seqno, + ImmutableRandomAccessFileInfo** file_info, + RemoteBootstrapErrorPB::Code* error_code) { + boost::lock_guard l(session_lock_); + if (!FindCopy(logs_, segment_seqno, file_info)) { + *error_code = RemoteBootstrapErrorPB::WAL_SEGMENT_NOT_FOUND; + return Status::NotFound(Substitute("Segment with sequence number $0 not found", + segment_seqno)); + } + return Status::OK(); +} + +Status RemoteBootstrapSession::UnregisterAnchorIfNeededUnlocked() { + return tablet_peer_->log_anchor_registry()->UnregisterIfAnchored(&log_anchor_); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/remote_bootstrap_session.h b/src/kudu/tserver/remote_bootstrap_session.h new file mode 100644 index 000000000000..8bd991029af6 --- /dev/null +++ b/src/kudu/tserver/remote_bootstrap_session.h @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TSERVER_REMOTE_BOOTSTRAP_SESSION_H_ +#define KUDU_TSERVER_REMOTE_BOOTSTRAP_SESSION_H_ + +#include +#include +#include +#include + +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/log_util.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/fs/block_id.h" +#include "kudu/fs/block_manager.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/tserver/remote_bootstrap.pb.h" +#include "kudu/util/env_util.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +namespace kudu { + +class FsManager; + +namespace tablet { +class TabletPeer; +} // namespace tablet + +namespace tserver { + +class TabletPeerLookupIf; + +// Caches file size and holds a shared_ptr reference to a RandomAccessFile. +// Assumes that the file underlying the RandomAccessFile is immutable. +struct ImmutableRandomAccessFileInfo { + std::shared_ptr readable; + int64_t size; + + ImmutableRandomAccessFileInfo(std::shared_ptr readable, + int64_t size) + : readable(std::move(readable)), size(size) {} + + Status ReadFully(uint64_t offset, int64_t size, Slice* data, uint8_t* scratch) const { + return env_util::ReadFully(readable.get(), offset, size, data, scratch); + } +}; + +// Caches block size and holds an exclusive reference to a ReadableBlock. +// Assumes that the block underlying the ReadableBlock is immutable. 
+struct ImmutableReadableBlockInfo { + gscoped_ptr readable; + int64_t size; + + ImmutableReadableBlockInfo(fs::ReadableBlock* readable, + int64_t size) + : readable(readable), + size(size) { + } + + Status ReadFully(uint64_t offset, int64_t size, Slice* data, uint8_t* scratch) const { + return readable->Read(offset, size, data, scratch); + } +}; + +// A potential Learner must establish a RemoteBootstrapSession with the leader in order +// to fetch the needed superblock, blocks, and log segments. +// This class is refcounted to make it easy to remove it from the session map +// on expiration while it is in use by another thread. +class RemoteBootstrapSession : public RefCountedThreadSafe { + public: + RemoteBootstrapSession(const scoped_refptr& tablet_peer, + std::string session_id, std::string requestor_uuid, + FsManager* fs_manager); + + // Initialize the session, including anchoring files (TODO) and fetching the + // tablet superblock and list of WAL segments. + Status Init(); + + // Return ID of tablet corresponding to this session. + const std::string& tablet_id() const; + + // Return UUID of the requestor that initiated this session. + const std::string& requestor_uuid() const; + + // Open block for reading, if it's not already open, and read some of it. + // If maxlen is 0, we use a system-selected length for the data piece. + // *data is set to a std::string containing the data. Ownership of this object + // is passed to the caller. A string is used because the RPC interface is + // sending data serialized as protobuf and we want to minimize copying. + // On error, Status is set to a non-OK value and error_code is filled in. + // + // This method is thread-safe. + Status GetBlockPiece(const BlockId& block_id, + uint64_t offset, int64_t client_maxlen, + std::string* data, int64_t* block_file_size, + RemoteBootstrapErrorPB::Code* error_code); + + // Get a piece of a log segment. 
+ // The behavior and params are very similar to GetBlockPiece(), but this one + // is only for sending WAL segment files. + Status GetLogSegmentPiece(uint64_t segment_seqno, + uint64_t offset, int64_t client_maxlen, + std::string* data, int64_t* log_file_size, + RemoteBootstrapErrorPB::Code* error_code); + + const tablet::TabletSuperBlockPB& tablet_superblock() const { return tablet_superblock_; } + + const consensus::ConsensusStatePB& initial_committed_cstate() const { + return initial_committed_cstate_; + } + + const log::SegmentSequence& log_segments() const { return log_segments_; } + + // Check if a block is currently open. + bool IsBlockOpenForTests(const BlockId& block_id) const; + + private: + friend class RefCountedThreadSafe; + + typedef std::unordered_map BlockMap; + typedef std::unordered_map LogMap; + + ~RemoteBootstrapSession(); + + // Open the block and add it to the block map. + Status OpenBlockUnlocked(const BlockId& block_id); + + // Look up cached block information. + Status FindBlock(const BlockId& block_id, + ImmutableReadableBlockInfo** block_info, + RemoteBootstrapErrorPB::Code* error_code); + + // Snapshot the log segment's length and put it into segment map. + Status OpenLogSegmentUnlocked(uint64_t segment_seqno); + + // Look up log segment in cache or log segment map. + Status FindLogSegment(uint64_t segment_seqno, + ImmutableRandomAccessFileInfo** file_info, + RemoteBootstrapErrorPB::Code* error_code); + + // Unregister log anchor, if it's registered. + Status UnregisterAnchorIfNeededUnlocked(); + + scoped_refptr tablet_peer_; + const std::string session_id_; + const std::string requestor_uuid_; + FsManager* const fs_manager_; + + mutable simple_spinlock session_lock_; + + BlockMap blocks_; // Protected by session_lock_. + LogMap logs_; // Protected by session_lock_. 
+ ValueDeleter blocks_deleter_; + ValueDeleter logs_deleter_; + + tablet::TabletSuperBlockPB tablet_superblock_; + + consensus::ConsensusStatePB initial_committed_cstate_; + + // The sequence of log segments that will be sent in the course of this + // session. + log::SegmentSequence log_segments_; + + log::LogAnchor log_anchor_; + + DISALLOW_COPY_AND_ASSIGN(RemoteBootstrapSession); +}; + +} // namespace tserver +} // namespace kudu + +#endif // KUDU_TSERVER_REMOTE_BOOTSTRAP_SESSION_H_ diff --git a/src/kudu/tserver/scanner_metrics.cc b/src/kudu/tserver/scanner_metrics.cc new file mode 100644 index 000000000000..f68395521183 --- /dev/null +++ b/src/kudu/tserver/scanner_metrics.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/scanner_metrics.h" + +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" + +METRIC_DEFINE_counter(server, scanners_expired, + "Scanners Expired", + kudu::MetricUnit::kScanners, + "Number of scanners that have expired since service start"); + +METRIC_DEFINE_histogram(server, scanner_duration, + "Scanner Duration", + kudu::MetricUnit::kMicroseconds, + "Histogram of the duration of active scanners on this tablet.", + 60000000LU, 2); + +namespace kudu { + +namespace tserver { + +ScannerMetrics::ScannerMetrics(const scoped_refptr& metric_entity) + : scanners_expired( + METRIC_scanners_expired.Instantiate(metric_entity)), + scanner_duration(METRIC_scanner_duration.Instantiate(metric_entity)) { +} + +void ScannerMetrics::SubmitScannerDuration(const MonoTime& time_started) { + scanner_duration->Increment( + MonoTime::Now(MonoTime::COARSE).GetDeltaSince(time_started).ToMicroseconds()); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/scanner_metrics.h b/src/kudu/tserver/scanner_metrics.h new file mode 100644 index 000000000000..41a67b7a203c --- /dev/null +++ b/src/kudu/tserver/scanner_metrics.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_SCANNER_METRICS_H +#define KUDU_TSERVER_SCANNER_METRICS_H + +#include "kudu/gutil/ref_counted.h" + +namespace kudu { + +class MetricEntity; +class Counter; +class Histogram; +class MonoTime; + +namespace tserver { + +// Keeps track of scanner related metrics for a given ScannerManager +// instance. +struct ScannerMetrics { + explicit ScannerMetrics(const scoped_refptr& metric_entity); + + // Adds the the number of microseconds that have passed since + // 'time_started' to 'scanner_duration' histogram. + void SubmitScannerDuration(const MonoTime& time_started); + + // Keeps track of the total number of scanners that have been + // expired since the start of service. + scoped_refptr scanners_expired; + + // Keeps track of the duration of scanners. + scoped_refptr scanner_duration; +}; + +} // namespace tserver +} // namespace kudu + +#endif // KUDU_TSERVER_SCANNER_METRICS_H diff --git a/src/kudu/tserver/scanners-test.cc b/src/kudu/tserver/scanners-test.cc new file mode 100644 index 000000000000..68cfe4c97081 --- /dev/null +++ b/src/kudu/tserver/scanners-test.cc @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/scanners.h" + +#include + +#include +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/scanner_metrics.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" + +DECLARE_int32(scanner_ttl_ms); + +namespace kudu { + +using tablet::TabletPeer; + +namespace tserver { + +using std::vector; + +TEST(ScannersTest, TestManager) { + scoped_refptr null_peer(nullptr); + ScannerManager mgr(nullptr); + + // Create two scanners, make sure their ids are different. + SharedScanner s1, s2; + mgr.NewScanner(null_peer, "", &s1); + mgr.NewScanner(null_peer, "", &s2); + ASSERT_NE(s1->id(), s2->id()); + + // Check that they're both registered. + SharedScanner result; + ASSERT_TRUE(mgr.LookupScanner(s1->id(), &result)); + ASSERT_EQ(result.get(), s1.get()); + + ASSERT_TRUE(mgr.LookupScanner(s2->id(), &result)); + ASSERT_EQ(result.get(), s2.get()); + + // Check that looking up a bad scanner returns false. + ASSERT_FALSE(mgr.LookupScanner("xxx", &result)); + + // Remove the scanners. + ASSERT_TRUE(mgr.UnregisterScanner(s1->id())); + ASSERT_TRUE(mgr.UnregisterScanner(s2->id())); + + // Removing a missing scanner should return false. 
+ ASSERT_FALSE(mgr.UnregisterScanner("xxx")); +} + +TEST(ScannerTest, TestExpire) { + scoped_refptr null_peer(nullptr); + FLAGS_scanner_ttl_ms = 100; + MetricRegistry registry; + ScannerManager mgr(METRIC_ENTITY_server.Instantiate(®istry, "test")); + SharedScanner s1, s2; + mgr.NewScanner(null_peer, "", &s1); + mgr.NewScanner(null_peer, "", &s2); + SleepFor(MonoDelta::FromMilliseconds(200)); + s2->UpdateAccessTime(); + mgr.RemoveExpiredScanners(); + ASSERT_EQ(1, mgr.CountActiveScanners()); + ASSERT_EQ(1, mgr.metrics_->scanners_expired->value()); + vector active_scanners; + mgr.ListScanners(&active_scanners); + ASSERT_EQ(s2->id(), active_scanners[0]->id()); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/scanners.cc b/src/kudu/tserver/scanners.cc new file mode 100644 index 000000000000..eaf2ecc203c3 --- /dev/null +++ b/src/kudu/tserver/scanners.cc @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/scanners.h" + +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/scan_spec.h" +#include "kudu/gutil/hash/string_hash.h" +#include "kudu/gutil/map-util.h" +#include "kudu/tserver/scanner_metrics.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/thread.h" +#include "kudu/util/metrics.h" + +DEFINE_int32(scanner_ttl_ms, 60000, + "Number of milliseconds of inactivity allowed for a scanner" + "before it may be expired"); +TAG_FLAG(scanner_ttl_ms, advanced); +DEFINE_int32(scanner_gc_check_interval_us, 5 * 1000L *1000L, // 5 seconds + "Number of microseconds in the interval at which we remove expired scanners"); +TAG_FLAG(scanner_ttl_ms, hidden); + +// TODO: would be better to scope this at a tablet level instead of +// server level. +METRIC_DEFINE_gauge_size(server, active_scanners, + "Active Scanners", + kudu::MetricUnit::kScanners, + "Number of scanners that are currently active"); + +namespace kudu { + +using tablet::TabletPeer; + +namespace tserver { + +ScannerManager::ScannerManager(const scoped_refptr& metric_entity) + : shutdown_(false) { + if (metric_entity) { + metrics_.reset(new ScannerMetrics(metric_entity)); + METRIC_active_scanners.InstantiateFunctionGauge( + metric_entity, Bind(&ScannerManager::CountActiveScanners, + Unretained(this))) + ->AutoDetach(&metric_detacher_); + } + for (size_t i = 0; i < kNumScannerMapStripes; i++) { + scanner_maps_.push_back(new ScannerMapStripe()); + } +} + +ScannerManager::~ScannerManager() { + { + boost::lock_guard l(shutdown_lock_); + shutdown_ = true; + shutdown_cv_.notify_all(); + } + if (removal_thread_.get() != nullptr) { + CHECK_OK(ThreadJoiner(removal_thread_.get()).Join()); + } + STLDeleteElements(&scanner_maps_); +} + +Status ScannerManager::StartRemovalThread() { + RETURN_NOT_OK(Thread::Create("scanners", "removal_thread", + &ScannerManager::RunRemovalThread, this, + &removal_thread_)); + return Status::OK(); +} + +void 
ScannerManager::RunRemovalThread() { + while (true) { + // Loop until we are shutdown. + { + boost::unique_lock l(shutdown_lock_); + if (shutdown_) { + return; + } + boost::system_time wtime = boost::get_system_time() + + boost::posix_time::microseconds(FLAGS_scanner_gc_check_interval_us); + shutdown_cv_.timed_wait(l, wtime); + } + RemoveExpiredScanners(); + } +} + +ScannerManager::ScannerMapStripe& ScannerManager::GetStripeByScannerId(const string& scanner_id) { + size_t slot = HashStringThoroughly(scanner_id.data(), scanner_id.size()) % kNumScannerMapStripes; + return *scanner_maps_[slot]; +} + +void ScannerManager::NewScanner(const scoped_refptr& tablet_peer, + const std::string& requestor_string, + SharedScanner* scanner) { + // Keep trying to generate a unique ID until we get one. + bool success = false; + while (!success) { + // TODO(security): are these UUIDs predictable? If so, we should + // probably generate random numbers instead, since we can safely + // just retry until we avoid a collission. 
+ string id = oid_generator_.Next(); + scanner->reset(new Scanner(id, tablet_peer, requestor_string, metrics_.get())); + + ScannerMapStripe& stripe = GetStripeByScannerId(id); + boost::lock_guard l(stripe.lock_); + success = InsertIfNotPresent(&stripe.scanners_by_id_, id, *scanner); + } +} + +bool ScannerManager::LookupScanner(const string& scanner_id, SharedScanner* scanner) { + ScannerMapStripe& stripe = GetStripeByScannerId(scanner_id); + boost::shared_lock l(stripe.lock_); + return FindCopy(stripe.scanners_by_id_, scanner_id, scanner); +} + +bool ScannerManager::UnregisterScanner(const string& scanner_id) { + ScannerMapStripe& stripe = GetStripeByScannerId(scanner_id); + boost::lock_guard l(stripe.lock_); + return stripe.scanners_by_id_.erase(scanner_id) > 0; +} + +size_t ScannerManager::CountActiveScanners() const { + size_t total = 0; + for (const ScannerMapStripe* e : scanner_maps_) { + boost::shared_lock l(e->lock_); + total += e->scanners_by_id_.size(); + } + return total; +} + +void ScannerManager::ListScanners(std::vector* scanners) { + for (const ScannerMapStripe* stripe : scanner_maps_) { + boost::shared_lock l(stripe->lock_); + for (const ScannerMapEntry& se : stripe->scanners_by_id_) { + scanners->push_back(se.second); + } + } +} + +void ScannerManager::RemoveExpiredScanners() { + MonoDelta scanner_ttl = MonoDelta::FromMilliseconds(FLAGS_scanner_ttl_ms); + + for (ScannerMapStripe* stripe : scanner_maps_) { + boost::lock_guard l(stripe->lock_); + for (auto it = stripe->scanners_by_id_.begin(); it != stripe->scanners_by_id_.end();) { + SharedScanner& scanner = it->second; + MonoDelta time_live = + scanner->TimeSinceLastAccess(MonoTime::Now(MonoTime::COARSE)); + if (time_live.MoreThan(scanner_ttl)) { + // TODO: once we have a metric for the number of scanners expired, make this a + // VLOG(1). 
+ LOG(INFO) << "Expiring scanner id: " << it->first << ", of tablet " << scanner->tablet_id() + << ", after " << time_live.ToMicroseconds() + << " us of inactivity, which is > TTL (" + << scanner_ttl.ToMicroseconds() << " us)."; + it = stripe->scanners_by_id_.erase(it); + if (metrics_) { + metrics_->scanners_expired->Increment(); + } + } else { + ++it; + } + } + } +} + +Scanner::Scanner(string id, const scoped_refptr& tablet_peer, + string requestor_string, ScannerMetrics* metrics) + : id_(std::move(id)), + tablet_peer_(tablet_peer), + requestor_string_(std::move(requestor_string)), + call_seq_id_(0), + start_time_(MonoTime::Now(MonoTime::COARSE)), + metrics_(metrics), + arena_(1024, 1024 * 1024) { + UpdateAccessTime(); +} + +Scanner::~Scanner() { + if (metrics_) { + metrics_->SubmitScannerDuration(start_time_); + } +} + +void Scanner::UpdateAccessTime() { + boost::lock_guard l(lock_); + last_access_time_ = MonoTime::Now(MonoTime::COARSE); +} + +void Scanner::Init(gscoped_ptr iter, + gscoped_ptr spec) { + boost::lock_guard l(lock_); + CHECK(!iter_) << "Already initialized"; + iter_.reset(iter.release()); + spec_.reset(spec.release()); +} + +const ScanSpec& Scanner::spec() const { + return *spec_; +} + +void Scanner::GetIteratorStats(vector* stats) const { + iter_->GetIteratorStats(stats); +} + + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/scanners.h b/src/kudu/tserver/scanners.h new file mode 100644 index 000000000000..49c2073d59da --- /dev/null +++ b/src/kudu/tserver/scanners.h @@ -0,0 +1,329 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TSERVER_SCANNERS_H +#define KUDU_TSERVER_SCANNERS_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/common/iterator_stats.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/util/auto_release_pool.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/oid_generator.h" + +namespace kudu { + +class MetricEntity; +class RowwiseIterator; +class ScanSpec; +class Schema; +class Status; +class Thread; + +struct IteratorStats; + +namespace tserver { + +class Scanner; +struct ScannerMetrics; +typedef std::shared_ptr SharedScanner; + +// Manages the live scanners within a Tablet Server. +// +// When a scanner is created by a client, it is assigned a unique scanner ID. +// The client may then use this ID to fetch more rows from the scanner +// or close it. +// +// Since scanners keep resources on the server, the manager periodically +// removes any scanners which have not been accessed since a configurable TTL. +class ScannerManager { + public: + explicit ScannerManager(const scoped_refptr& metric_entity); + ~ScannerManager(); + + // Starts the expired scanner removal thread. + Status StartRemovalThread(); + + // Create a new scanner with a unique ID, inserting it into the map. + void NewScanner(const scoped_refptr& tablet_peer, + const std::string& requestor_string, + SharedScanner* scanner); + + // Lookup the given scanner by its ID. 
+ // Returns true if the scanner is found successfully. + bool LookupScanner(const std::string& scanner_id, SharedScanner* scanner); + + // Unregister the given scanner by its ID. + // Returns true if unregistered successfully. + bool UnregisterScanner(const std::string& scanner_id); + + // Return the number of scanners currently active. + // Note this method will not return accurate value + // if under concurrent modifications. + size_t CountActiveScanners() const; + + // List all active scanners. + // Note this method will not return a consistent view + // of all active scanners if under concurrent modifications. + void ListScanners(std::vector* scanners); + + // Iterate through scanners and remove any which are past their TTL. + void RemoveExpiredScanners(); + + private: + FRIEND_TEST(ScannerTest, TestExpire); + + enum { + kNumScannerMapStripes = 32 + }; + + typedef std::unordered_map ScannerMap; + + typedef std::pair ScannerMapEntry; + + struct ScannerMapStripe { + // Lock protecting the scanner map. + mutable boost::shared_mutex lock_; + // Map of the currently active scanners. + ScannerMap scanners_by_id_; + }; + + // Periodically call RemoveExpiredScanners(). + void RunRemovalThread(); + + ScannerMapStripe& GetStripeByScannerId(const string& scanner_id); + + // (Optional) scanner metrics for this instance. + gscoped_ptr metrics_; + + // If true, removal thread should shut itself down. Protected + // by 'shutdown_lock_' and 'shutdown_cv_'. + bool shutdown_; + mutable boost::mutex shutdown_lock_; + boost::condition_variable shutdown_cv_; + + std::vector scanner_maps_; + + // Generator for scanner IDs. + ObjectIdGenerator oid_generator_; + + // Thread to remove expired scanners. + scoped_refptr removal_thread_; + + FunctionGaugeDetacher metric_detacher_; + + DISALLOW_COPY_AND_ASSIGN(ScannerManager); +}; + +// RAII wrapper to unregister a scanner upon scope exit. 
+class ScopedUnregisterScanner { + public: + ScopedUnregisterScanner(ScannerManager* mgr, std::string id) + : mgr_(mgr), id_(std::move(id)), cancelled_(false) {} + + ~ScopedUnregisterScanner() { + if (!cancelled_) { + mgr_->UnregisterScanner(id_); + } + } + + // Do not unregister the scanner when the scope is exited. + void Cancel() { + cancelled_ = true; + } + + private: + ScannerManager* const mgr_; + const std::string id_; + bool cancelled_; +}; + +// An open scanner on the server side. +class Scanner { + public: + explicit Scanner(std::string id, + const scoped_refptr& tablet_peer, + std::string requestor_string, ScannerMetrics* metrics); + ~Scanner(); + + // Attach an actual iterator and a ScanSpec to this Scanner. + // Takes ownership of 'iter' and 'spec'. + void Init(gscoped_ptr iter, + gscoped_ptr spec); + + // Return true if the scanner has been initialized (i.e has an iterator). + // Once a Scanner is initialized, it is safe to assume that iter() and spec() + // return non-NULL for the lifetime of the Scanner object. + bool IsInitialized() const { + boost::lock_guard l(lock_); + return iter_ != NULL; + } + + RowwiseIterator* iter() { + return DCHECK_NOTNULL(iter_.get()); + } + + const RowwiseIterator* iter() const { + return DCHECK_NOTNULL(iter_.get()); + } + + // Update the last-access time to the current time, + // delaying the expiration of the Scanner for another TTL + // period. + void UpdateAccessTime(); + + // Return the auto-release pool which will be freed when this scanner + // closes. This can be used as a storage area for the ScanSpec and any + // associated data (eg storage for its predicates). + AutoReleasePool* autorelease_pool() { + return &autorelease_pool_; + } + + Arena* arena() { + return &arena_; + } + + const std::string& id() const { return id_; } + + // Return the ScanSpec associated with this Scanner. + const ScanSpec& spec() const; + + const std::string tablet_id() const { + // scanners-test passes a null tablet_peer. 
+ return tablet_peer_ ? tablet_peer_->tablet_id() : "null tablet"; + } + + const scoped_refptr& tablet_peer() const { return tablet_peer_; } + + const std::string& requestor_string() const { return requestor_string_; } + + // Returns the current call sequence ID of the scanner. + uint32_t call_seq_id() const { + boost::lock_guard l(lock_); + return call_seq_id_; + } + + // Increments the call sequence ID. + void IncrementCallSeqId() { + boost::lock_guard l(lock_); + call_seq_id_ += 1; + } + + // Return the delta from the last time this scan was updated to 'now'. + MonoDelta TimeSinceLastAccess(const MonoTime& now) const { + boost::lock_guard l(lock_); + return now.GetDeltaSince(last_access_time_); + } + + // Returns the time this scan was started. + const MonoTime& start_time() const { return start_time_; } + + // Associate a projection schema with the Scanner. The scanner takes + // ownership of 'client_projection_schema'. + // + // Note: 'client_projection_schema' is set if the client's + // projection is a subset of the iterator's schema -- the iterator's + // schema needs to include all columns that have predicates, whereas + // the client may not want to project all of them. + void set_client_projection_schema(gscoped_ptr client_projection_schema) { + client_projection_schema_.swap(client_projection_schema); + } + + // Returns request's projection schema if it differs from the schema + // used by the iterator (which must contain all columns used as + // predicates). Returns NULL if the iterator's schema is the same as + // the projection schema. + // See the note about 'set_client_projection_schema' above. + const Schema* client_projection_schema() const { return client_projection_schema_.get(); } + + // Get per-column stats for each iterator. 
+ void GetIteratorStats(std::vector* stats) const; + + const IteratorStats& already_reported_stats() const { + return already_reported_stats_; + } + void set_already_reported_stats(const IteratorStats& stats) { + already_reported_stats_ = stats; + } + + private: + friend class ScannerManager; + + // The unique ID of this scanner. + const std::string id_; + + // Tablet associated with the scanner. + const scoped_refptr tablet_peer_; + + // Information about the requestor. Populated from + // RpcContext::requestor_string(). + const std::string requestor_string_; + + // The last time that the scanner was accessed. + MonoTime last_access_time_; + + // The current call sequence ID. + uint32_t call_seq_id_; + + // Protects last_access_time_ call_seq_id_, iter_, and spec_. + mutable simple_spinlock lock_; + + // The time the scanner was started. + const MonoTime start_time_; + + // (Optional) scanner metrics struct, for recording scanner's duration. + ScannerMetrics* metrics_; + + // A summary of the statistics already reported to the metrics system + // for this scanner. This allows us to report the metrics incrementally + // as the scanner proceeds. + IteratorStats already_reported_stats_; + + // The spec used by 'iter_' + gscoped_ptr spec_; + + // Stores the request's projection schema, if it differs from the + // schema used by the iterator. + gscoped_ptr client_projection_schema_; + + gscoped_ptr iter_; + + AutoReleasePool autorelease_pool_; + + // Arena used for allocations which must last as long as the scanner + // itself. This is _not_ used for row data, which is scoped to a single RPC + // response. 
+ Arena arena_; + + DISALLOW_COPY_AND_ASSIGN(Scanner); +}; + + +} // namespace tserver +} // namespace kudu + +#endif diff --git a/src/kudu/tserver/tablet_peer_lookup.h b/src/kudu/tserver/tablet_peer_lookup.h new file mode 100644 index 000000000000..ceb087c60467 --- /dev/null +++ b/src/kudu/tserver/tablet_peer_lookup.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TSERVER_TABLET_PEER_LOOKUP_H_ +#define KUDU_TSERVER_TABLET_PEER_LOOKUP_H_ + +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/status.h" + +namespace kudu { + +class HostPort; +class NodeInstancePB; + +namespace consensus { +class StartRemoteBootstrapRequestPB; +} // namespace consensus + +namespace tablet { +class TabletPeer; +} // namespace tablet + +namespace tserver { + +// Pure virtual interface that provides an abstraction for something that +// contains and manages TabletPeers. This interface is implemented on both +// tablet servers and master servers. +// TODO: Rename this interface. 
+class TabletPeerLookupIf { + public: + virtual Status GetTabletPeer(const std::string& tablet_id, + scoped_refptr* tablet_peer) const = 0; + + virtual const NodeInstancePB& NodeInstance() const = 0; + + virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req) = 0; +}; + +} // namespace tserver +} // namespace kudu + +#endif // KUDU_TSERVER_TABLET_PEER_LOOKUP_H_ diff --git a/src/kudu/tserver/tablet_server-stress-test.cc b/src/kudu/tserver/tablet_server-stress-test.cc new file mode 100644 index 000000000000..d9c29311030b --- /dev/null +++ b/src/kudu/tserver/tablet_server-stress-test.cc @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/tablet_server-test-base.h" + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/stopwatch.h" + +DEFINE_int32(num_inserter_threads, 8, "Number of inserter threads to run"); +DEFINE_int32(num_inserts_per_thread, 0, "Number of inserts from each thread"); +DECLARE_bool(enable_maintenance_manager); + +METRIC_DEFINE_histogram(test, insert_latency, + "Insert Latency", + kudu::MetricUnit::kMicroseconds, + "TabletServer single threaded insert latency.", + 10000000, + 2); + +namespace kudu { +namespace tserver { + +class TSStressTest : public TabletServerTestBase { + public: + TSStressTest() + : start_latch_(FLAGS_num_inserter_threads) { + + if (FLAGS_num_inserts_per_thread == 0) { + FLAGS_num_inserts_per_thread = AllowSlowTests() ? 100000 : 1000; + } + + // Re-enable the maintenance manager which is disabled by default + // in TS tests. We want to stress the whole system including + // flushes, etc. + FLAGS_enable_maintenance_manager = true; + } + + virtual void SetUp() OVERRIDE { + TabletServerTestBase::SetUp(); + StartTabletServer(); + + histogram_ = METRIC_insert_latency.Instantiate(ts_test_metric_entity_); + } + + void StartThreads() { + for (int i = 0; i < FLAGS_num_inserter_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("test$0", i), + &TSStressTest::InserterThread, this, i, &new_thread)); + threads_.push_back(new_thread); + } + } + + void JoinThreads() { + for (scoped_refptr thr : threads_) { + CHECK_OK(ThreadJoiner(thr.get()).Join()); + } + } + + void InserterThread(int thread_idx); + + protected: + scoped_refptr histogram_; + CountDownLatch start_latch_; + std::vector > threads_; +}; + +void TSStressTest::InserterThread(int thread_idx) { + // Wait for all the threads to be ready before we start. 
+ start_latch_.CountDown(); + start_latch_.Wait(); + LOG(INFO) << "Starting inserter thread " << thread_idx << " complete"; + + uint64_t max_rows = FLAGS_num_inserts_per_thread; + int start_row = thread_idx * max_rows; + for (int i = start_row; i < start_row + max_rows ; i++) { + MonoTime before = MonoTime::Now(MonoTime::FINE); + InsertTestRowsRemote(thread_idx, i, 1); + MonoTime after = MonoTime::Now(MonoTime::FINE); + MonoDelta delta = after.GetDeltaSince(before); + histogram_->Increment(delta.ToMicroseconds()); + } + LOG(INFO) << "Inserter thread " << thread_idx << " complete"; +} + +TEST_F(TSStressTest, TestMTInserts) { + StartThreads(); + Stopwatch s(Stopwatch::ALL_THREADS); + s.start(); + JoinThreads(); + s.stop(); + int num_rows = (FLAGS_num_inserter_threads * FLAGS_num_inserts_per_thread); + LOG(INFO) << "Inserted " << num_rows << " rows in " << s.elapsed().wall_millis() << " ms"; + LOG(INFO) << "Throughput: " << (num_rows * 1000 / s.elapsed().wall_millis()) << " rows/sec"; + LOG(INFO) << "CPU efficiency: " << (num_rows / s.elapsed().user_cpu_seconds()) << " rows/cpusec"; + + + // Generate the JSON. + std::stringstream out; + JsonWriter writer(&out, JsonWriter::PRETTY); + ASSERT_OK(histogram_->WriteAsJson(&writer, MetricJsonOptions())); + + LOG(INFO) << out.str(); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/tablet_server-test-base.h b/src/kudu/tserver/tablet_server-test-base.h new file mode 100644 index 000000000000..1a1e8368f286 --- /dev/null +++ b/src/kudu/tserver/tablet_server-test-base.h @@ -0,0 +1,475 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_TSERVER_TABLET_SERVER_TEST_BASE_H_ +#define KUDU_TSERVER_TABLET_SERVER_TEST_BASE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol-test-util.h" +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/consensus/log_reader.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tablet/local_tablet_writer.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/mini_tablet_server.h" +#include "kudu/tserver/remote_bootstrap.proxy.h" +#include "kudu/tserver/scanners.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/tablet_server_test_util.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_graph.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(rpc_timeout, 1000, "Timeout for RPC calls, in seconds"); +DEFINE_int32(num_updater_threads, 1, "Number of updating threads to launch"); +DECLARE_bool(log_force_fsync_all); +DECLARE_bool(enable_maintenance_manager); +DECLARE_bool(enable_data_block_fsync); +DECLARE_int32(heartbeat_rpc_timeout_ms); + +METRIC_DEFINE_entity(test); + +namespace kudu { +namespace tserver { + +class 
TabletServerTestBase : public KuduTest { + public: + typedef pair KeyValue; + + TabletServerTestBase() + : schema_(GetSimpleTestSchema()), + ts_test_metric_entity_(METRIC_ENTITY_test.Instantiate( + &ts_test_metric_registry_, "ts_server-test")) { + + // Disable the maintenance ops manager since we want to trigger our own + // maintenance operations at predetermined times. + FLAGS_enable_maintenance_manager = false; + + // Decrease heartbeat timeout: we keep re-trying heartbeats when a + // single master server fails due to a network error. Decreasing + // the hearbeat timeout to 1 second speeds up unit tests which + // purposefully specify non-running Master servers. + FLAGS_heartbeat_rpc_timeout_ms = 1000; + + // Keep unit tests fast, but only if no one has set the flag explicitly. + if (google::GetCommandLineFlagInfoOrDie("enable_data_block_fsync").is_default) { + FLAGS_enable_data_block_fsync = false; + } + } + + // Starts the tablet server, override to start it later. + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + + key_schema_ = schema_.CreateKeyProjection(); + rb_.reset(new RowBuilder(schema_)); + + rpc::MessengerBuilder bld("Client"); + ASSERT_OK(bld.Build(&client_messenger_)); + } + + virtual void StartTabletServer() { + // Start server with an invalid master address, so it never successfully + // heartbeats, even if there happens to be a master running on this machine. + mini_server_.reset(new MiniTabletServer(GetTestPath("TabletServerTest-fsroot"), 0)); + mini_server_->options()->master_addresses.clear(); + mini_server_->options()->master_addresses.push_back(HostPort("255.255.255.255", 1)); + CHECK_OK(mini_server_->Start()); + + // Set up a tablet inside the server. + CHECK_OK(mini_server_->AddTestTablet(kTableId, kTabletId, schema_)); + CHECK(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet_peer_)); + + // Creating a tablet is async, we wait here instead of having to handle errors later. 
+ CHECK_OK(WaitForTabletRunning(kTabletId)); + + // Connect to it. + ResetClientProxies(); + } + + Status WaitForTabletRunning(const char *tablet_id) { + scoped_refptr tablet_peer; + RETURN_NOT_OK(mini_server_->server()->tablet_manager()->GetTabletPeer(tablet_id, &tablet_peer)); + return tablet_peer->WaitUntilConsensusRunning(MonoDelta::FromSeconds(10)); + } + + void UpdateTestRowRemote(int tid, + int64_t row_idx, + int32_t new_val, + TimeSeries *ts = NULL) { + + WriteRequestPB req; + req.set_tablet_id(kTabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + WriteResponsePB resp; + rpc::RpcController controller; + controller.set_timeout(MonoDelta::FromSeconds(FLAGS_rpc_timeout)); + string new_string_val(strings::Substitute("mutated$0", row_idx)); + + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, row_idx, new_val, new_string_val, + req.mutable_row_operations()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error())<< resp.ShortDebugString(); + ASSERT_EQ(0, resp.per_row_errors_size()); + if (ts) { + ts->AddValue(1); + } + } + + void ResetClientProxies() { + CreateTsClientProxies(mini_server_->bound_rpc_addr(), + client_messenger_, + &proxy_, &admin_proxy_, &consensus_proxy_, &generic_proxy_); + } + + // Inserts 'num_rows' test rows directly into the tablet (i.e not via RPC) + void InsertTestRowsDirect(int64_t start_row, uint64_t num_rows) { + tablet::LocalTabletWriter writer(tablet_peer_->tablet(), &schema_); + KuduPartialRow row(&schema_); + for (int64_t i = 0; i < num_rows; i++) { + BuildTestRow(start_row + i, &row); + CHECK_OK(writer.Insert(row)); + } + } + + // Inserts 'num_rows' test rows remotely into the tablet (i.e via RPC) + // Rows are grouped in batches of 'count'/'num_batches' size. + // Batch size defaults to 1. 
+ void InsertTestRowsRemote(int tid, + int64_t first_row, + uint64_t count, + uint64_t num_batches = -1, + TabletServerServiceProxy* proxy = NULL, + string tablet_id = kTabletId, + vector* write_timestamps_collector = NULL, + TimeSeries *ts = NULL, + bool string_field_defined = true) { + + if (!proxy) { + proxy = proxy_.get(); + } + + if (num_batches == -1) { + num_batches = count; + } + + WriteRequestPB req; + req.set_tablet_id(tablet_id); + + WriteResponsePB resp; + rpc::RpcController controller; + + RowOperationsPB* data = req.mutable_row_operations(); + + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + uint64_t inserted_since_last_report = 0; + for (int i = 0; i < num_batches; ++i) { + + // reset the controller and the request + controller.Reset(); + controller.set_timeout(MonoDelta::FromSeconds(FLAGS_rpc_timeout)); + data->Clear(); + + uint64_t first_row_in_batch = first_row + (i * count / num_batches); + uint64_t last_row_in_batch = first_row_in_batch + count / num_batches; + + for (int j = first_row_in_batch; j < last_row_in_batch; j++) { + string str_val = strings::Substitute("original$0", j); + const char* cstr_val = str_val.c_str(); + if (!string_field_defined) { + cstr_val = NULL; + } + AddTestRowWithNullableStringToPB(RowOperationsPB::INSERT, schema_, j, j, + cstr_val, data); + } + CHECK_OK(DCHECK_NOTNULL(proxy)->Write(req, &resp, &controller)); + if (write_timestamps_collector) { + write_timestamps_collector->push_back(resp.timestamp()); + } + + if (resp.has_error() || resp.per_row_errors_size() > 0) { + LOG(FATAL) << "Failed to insert batch " + << first_row_in_batch << "-" << last_row_in_batch + << ": " << resp.DebugString(); + } + + inserted_since_last_report += count / num_batches; + if ((inserted_since_last_report > 100) && ts) { + ts->AddValue(static_cast(inserted_since_last_report)); + inserted_since_last_report = 0; + } + } + + if (ts) { + ts->AddValue(static_cast(inserted_since_last_report)); + } + } + + // Delete specified test row 
range. + void DeleteTestRowsRemote(int64_t first_row, + uint64_t count, + TabletServerServiceProxy* proxy = NULL, + string tablet_id = kTabletId) { + if (!proxy) { + proxy = proxy_.get(); + } + + WriteRequestPB req; + WriteResponsePB resp; + rpc::RpcController controller; + + req.set_tablet_id(tablet_id); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + RowOperationsPB* ops = req.mutable_row_operations(); + for (int64_t rowid = first_row; rowid < first_row + count; rowid++) { + AddTestKeyToPB(RowOperationsPB::DELETE, schema_, rowid, ops); + } + + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()) << resp.ShortDebugString(); + } + + void BuildTestRow(int index, KuduPartialRow* row) { + ASSERT_OK(row->SetInt32(0, index)); + ASSERT_OK(row->SetInt32(1, index * 2)); + ASSERT_OK(row->SetStringCopy(2, StringPrintf("hello %d", index))); + } + + void DrainScannerToStrings(const string& scanner_id, + const Schema& projection, + vector* results, + TabletServerServiceProxy* proxy = NULL, + uint32_t call_seq_id = 1) { + + if (!proxy) { + proxy = proxy_.get(); + } + + rpc::RpcController rpc; + rpc.set_timeout(MonoDelta::FromSeconds(FLAGS_rpc_timeout)); + ScanRequestPB req; + ScanResponsePB resp; + req.set_scanner_id(scanner_id); + + // NOTE: we do not sort the results here, since this function is used + // by test cases which are verifying the server side's ability to + // do ordered scans. 
+ do { + rpc.Reset(); + req.set_batch_size_bytes(10000); + req.set_call_seq_id(call_seq_id); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(DCHECK_NOTNULL(proxy)->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + + StringifyRowsFromResponse(projection, rpc, resp, results); + call_seq_id += 1; + } while (resp.has_more_results()); + } + + void StringifyRowsFromResponse(const Schema& projection, + const rpc::RpcController& rpc, + ScanResponsePB& resp, + vector* results) { + RowwiseRowBlockPB* rrpb = resp.mutable_data(); + Slice direct, indirect; // sidecar data buffers + ASSERT_OK(rpc.GetSidecar(rrpb->rows_sidecar(), &direct)); + if (rrpb->has_indirect_data_sidecar()) { + ASSERT_OK(rpc.GetSidecar(rrpb->indirect_data_sidecar(), + &indirect)); + } + vector rows; + ASSERT_OK(ExtractRowsFromRowBlockPB(projection, *rrpb, + indirect, &direct, &rows)); + VLOG(1) << "Round trip got " << rows.size() << " rows"; + for (const uint8_t* row_ptr : rows) { + ConstContiguousRow row(&projection, row_ptr); + results->push_back(projection.DebugRow(row)); + } + } + + void ShutdownTablet() { + if (mini_server_.get()) { + // The tablet peer must be destroyed before the TS, otherwise data + // blocks may be destroyed after their owning block manager. + tablet_peer_.reset(); + mini_server_->Shutdown(); + mini_server_.reset(); + } + } + + Status ShutdownAndRebuildTablet() { + ShutdownTablet(); + + // Start server. 
+ mini_server_.reset(new MiniTabletServer(GetTestPath("TabletServerTest-fsroot"), 0)); + mini_server_->options()->master_addresses.clear(); + mini_server_->options()->master_addresses.push_back(HostPort("255.255.255.255", 1)); + // this should open the tablet created on StartTabletServer() + RETURN_NOT_OK(mini_server_->Start()); + RETURN_NOT_OK(mini_server_->WaitStarted()); + + if (!mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet_peer_)) { + return Status::NotFound("Tablet was not found"); + } + // Connect to it. + ResetClientProxies(); + + // Opening a tablet is async, we wait here instead of having to handle errors later. + RETURN_NOT_OK(WaitForTabletRunning(kTabletId)); + return Status::OK(); + + } + + // Verifies that a set of expected rows (key, value) is present in the tablet. + void VerifyRows(const Schema& schema, const vector& expected) { + gscoped_ptr iter; + ASSERT_OK(tablet_peer_->tablet()->NewRowIterator(schema, &iter)); + ASSERT_OK(iter->Init(NULL)); + + int batch_size = std::max( + (size_t)1, std::min((size_t)(expected.size() / 10), + 4*1024*1024 / schema.byte_size())); + + Arena arena(32*1024, 256*1024); + RowBlock block(schema, batch_size, &arena); + + int count = 0; + while (iter->HasNext()) { + ASSERT_OK_FAST(iter->NextBlock(&block)); + RowBlockRow rb_row = block.row(0); + for (int i = 0; i < block.nrows(); i++) { + if (block.selection_vector()->IsRowSelected(i)) { + rb_row.Reset(&block, i); + VLOG(1) << "Verified row " << schema.DebugRow(rb_row); + ASSERT_LT(count, expected.size()) << "Got more rows than expected!"; + ASSERT_EQ(expected[count].first, *schema.ExtractColumnFromRow(rb_row, 0)); + ASSERT_EQ(expected[count].second, *schema.ExtractColumnFromRow(rb_row, 1)); + count++; + } + } + } + ASSERT_EQ(count, expected.size()); + } + + // Verifies that a simple scan request fails with the specified error code/message. 
+ void VerifyScanRequestFailure(const Schema& projection, + TabletServerErrorPB::Code expected_code, + const char *expected_message) { + ScanRequestPB req; + ScanResponsePB resp; + rpc::RpcController rpc; + + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(expected_code, resp.error().code()); + ASSERT_STR_CONTAINS(resp.error().status().message(), expected_message); + } + } + + // Open a new scanner which scans all of the columns in the table. + void OpenScannerWithAllColumns(ScanResponsePB* resp) { + ScanRequestPB req; + rpc::RpcController rpc; + + // Set up a new request with no predicates, all columns. + const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); // so it won't return data right away + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, resp, &rpc)); + SCOPED_TRACE(resp->DebugString()); + ASSERT_FALSE(resp->has_error()); + ASSERT_TRUE(resp->has_more_results()); + } + } + + protected: + static const char* kTableId; + static const char* kTabletId; + + const Schema schema_; + Schema key_schema_; + gscoped_ptr rb_; + + std::shared_ptr client_messenger_; + + gscoped_ptr mini_server_; + scoped_refptr tablet_peer_; + gscoped_ptr proxy_; + gscoped_ptr admin_proxy_; + gscoped_ptr consensus_proxy_; + gscoped_ptr generic_proxy_; + + MetricRegistry ts_test_metric_registry_; + scoped_refptr ts_test_metric_entity_; + + void* shared_region_; +}; + +const char* TabletServerTestBase::kTableId = 
"TestTable"; +const char* TabletServerTestBase::kTabletId = "TestTablet"; + +} // namespace tserver +} // namespace kudu + + +#endif /* KUDU_TSERVER_TABLET_SERVER_TEST_BASE_H_ */ diff --git a/src/kudu/tserver/tablet_server-test.cc b/src/kudu/tserver/tablet_server-test.cc new file mode 100644 index 000000000000..5d5464371465 --- /dev/null +++ b/src/kudu/tserver/tablet_server-test.cc @@ -0,0 +1,2277 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/tserver/tablet_server-test-base.h" + +#include "kudu/consensus/log-test-base.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/master/master.pb.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/server/server_base.pb.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/util/crc.h" +#include "kudu/util/curl_util.h" +#include "kudu/util/url-coding.h" + +using kudu::consensus::RaftConfigPB; +using kudu::consensus::RaftPeerPB; +using kudu::rpc::Messenger; +using kudu::rpc::MessengerBuilder; +using kudu::rpc::RpcController; +using kudu::server::Clock; +using kudu::server::HybridClock; +using kudu::tablet::Tablet; +using kudu::tablet::TabletPeer; +using std::shared_ptr; +using std::string; +using strings::Substitute; + +DEFINE_int32(single_threaded_insert_latency_bench_warmup_rows, 100, + "Number of rows to insert in the warmup phase of the single threaded" + " tablet server insert latency micro-benchmark"); + +DEFINE_int32(single_threaded_insert_latency_bench_insert_rows, 1000, + "Number of rows to insert in the testing phase of the single threaded" + " tablet server insert latency micro-benchmark"); + +DECLARE_int32(scanner_batch_size_rows); +DECLARE_int32(metrics_retirement_age_ms); +DECLARE_string(block_manager); + +// Declare these metrics prototypes for simpler unit testing of their behavior. +METRIC_DECLARE_counter(rows_inserted); +METRIC_DECLARE_counter(rows_updated); +METRIC_DECLARE_counter(rows_deleted); +METRIC_DECLARE_gauge_uint64(log_block_manager_blocks_under_management); + +namespace kudu { +namespace tserver { + +class TabletServerTest : public TabletServerTestBase { + public: + // Starts the tablet server, override to start it later. 
+ virtual void SetUp() OVERRIDE { + TabletServerTestBase::SetUp(); + StartTabletServer(); + } + + void DoOrderedScanTest(const Schema& projection, const string& expected_rows_as_string); +}; + +TEST_F(TabletServerTest, TestPingServer) { + // Ping the server. + PingRequestPB req; + PingResponsePB resp; + RpcController controller; + ASSERT_OK(proxy_->Ping(req, &resp, &controller)); +} + +TEST_F(TabletServerTest, TestServerClock) { + server::ServerClockRequestPB req; + server::ServerClockResponsePB resp; + RpcController controller; + + ASSERT_OK(generic_proxy_->ServerClock(req, &resp, &controller)); + ASSERT_GT(mini_server_->server()->clock()->Now().ToUint64(), resp.timestamp()); +} + +TEST_F(TabletServerTest, TestSetFlags) { + server::GenericServiceProxy proxy( + client_messenger_, mini_server_->bound_rpc_addr()); + + server::SetFlagRequestPB req; + server::SetFlagResponsePB resp; + + // Set an invalid flag. + { + RpcController controller; + req.set_flag("foo"); + req.set_value("bar"); + ASSERT_OK(proxy.SetFlag(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + EXPECT_EQ(server::SetFlagResponsePB::NO_SUCH_FLAG, resp.result()); + EXPECT_TRUE(resp.msg().empty()); + } + + // Set a valid flag to a valid value. + { + int32_t old_val = FLAGS_metrics_retirement_age_ms; + RpcController controller; + req.set_flag("metrics_retirement_age_ms"); + req.set_value("12345"); + ASSERT_OK(proxy.SetFlag(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + EXPECT_EQ(server::SetFlagResponsePB::SUCCESS, resp.result()); + EXPECT_EQ(resp.msg(), "metrics_retirement_age_ms set to 12345\n"); + EXPECT_EQ(Substitute("$0", old_val), resp.old_value()); + EXPECT_EQ(12345, FLAGS_metrics_retirement_age_ms); + } + + // Set a valid flag to an invalid value. 
+ { + RpcController controller; + req.set_flag("metrics_retirement_age_ms"); + req.set_value("foo"); + ASSERT_OK(proxy.SetFlag(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + EXPECT_EQ(server::SetFlagResponsePB::BAD_VALUE, resp.result()); + EXPECT_EQ(resp.msg(), "Unable to set flag: bad value"); + EXPECT_EQ(12345, FLAGS_metrics_retirement_age_ms); + } + + // Try setting a flag which isn't runtime-modifiable + { + RpcController controller; + req.set_flag("tablet_do_dup_key_checks"); + req.set_value("true"); + ASSERT_OK(proxy.SetFlag(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + EXPECT_EQ(server::SetFlagResponsePB::NOT_SAFE, resp.result()); + } + + // Try again, but with the force flag. + { + RpcController controller; + req.set_flag("tablet_do_dup_key_checks"); + req.set_value("true"); + req.set_force(true); + ASSERT_OK(proxy.SetFlag(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + EXPECT_EQ(server::SetFlagResponsePB::SUCCESS, resp.result()); + } +} + +TEST_F(TabletServerTest, TestWebPages) { + EasyCurl c; + faststring buf; + string addr = mini_server_->bound_http_addr().ToString(); + + // Tablets page should list tablet. + ASSERT_OK(c.FetchURL(Substitute("http://$0/tablets", addr), + &buf)); + ASSERT_STR_CONTAINS(buf.ToString(), kTabletId); + ASSERT_STR_CONTAINS(buf.ToString(), "range: [(<start>), (<end>))"); + + // Tablet page should include the schema. + ASSERT_OK(c.FetchURL(Substitute("http://$0/tablet?id=$1", addr, kTabletId), + &buf)); + ASSERT_STR_CONTAINS(buf.ToString(), "key"); + ASSERT_STR_CONTAINS(buf.ToString(), "string NULLABLE"); + + // Test fetching metrics. + // Fetching metrics has the side effect of retiring metrics, but not in a single pass. + // So, we check a couple of times in a loop -- thus, if we had a bug where one of these + // metrics was accidentally un-referenced too early, we'd cause it to get retired. 
+ // If the metrics survive several passes of fetching, then we are pretty sure they will + // stick around properly for the whole lifetime of the server. + FLAGS_metrics_retirement_age_ms = 0; + for (int i = 0; i < 3; i++) { + SCOPED_TRACE(i); + ASSERT_OK(c.FetchURL(strings::Substitute("http://$0/jsonmetricz", addr, kTabletId), + &buf)); + + // Check that the tablet entry shows up. + ASSERT_STR_CONTAINS(buf.ToString(), "\"type\": \"tablet\""); + ASSERT_STR_CONTAINS(buf.ToString(), "\"id\": \"TestTablet\""); + ASSERT_STR_CONTAINS(buf.ToString(), "\"partition\": \"range: [(), ())\""); + + + // Check entity attributes. + ASSERT_STR_CONTAINS(buf.ToString(), "\"table_name\": \"TestTable\""); + + // Check for the existence of some particular metrics for which we've had early-retirement + // bugs in the past. + ASSERT_STR_CONTAINS(buf.ToString(), "hybrid_clock_timestamp"); + ASSERT_STR_CONTAINS(buf.ToString(), "active_scanners"); + ASSERT_STR_CONTAINS(buf.ToString(), "threads_started"); + ASSERT_STR_CONTAINS(buf.ToString(), "code_cache_queries"); +#ifdef TCMALLOC_ENABLED + ASSERT_STR_CONTAINS(buf.ToString(), "tcmalloc_max_total_thread_cache_bytes"); +#endif + ASSERT_STR_CONTAINS(buf.ToString(), "glog_info_messages"); + } + + // Smoke-test the tracing infrastructure. 
+ ASSERT_OK(c.FetchURL( + Substitute("http://$0/tracing/json/get_buffer_percent_full", addr, kTabletId), + &buf)); + ASSERT_EQ(buf.ToString(), "0"); + + string enable_req_json = "{\"categoryFilter\":\"*\", \"useContinuousTracing\": \"true\"," + " \"useSampling\": \"false\"}"; + string req_b64; + Base64Escape(enable_req_json, &req_b64); + + ASSERT_OK(c.FetchURL(Substitute("http://$0/tracing/json/begin_recording?$1", + addr, + req_b64), &buf)); + ASSERT_EQ(buf.ToString(), ""); + ASSERT_OK(c.FetchURL(Substitute("http://$0/tracing/json/end_recording", addr), + &buf)); + ASSERT_STR_CONTAINS(buf.ToString(), "__metadata"); + ASSERT_OK(c.FetchURL(Substitute("http://$0/tracing/json/categories", addr), + &buf)); + ASSERT_STR_CONTAINS(buf.ToString(), "\"rpc\""); + + // Smoke test the pprof contention profiler handler. + ASSERT_OK(c.FetchURL(Substitute("http://$0/pprof/contention?seconds=1", addr), + &buf)); + ASSERT_STR_CONTAINS(buf.ToString(), "discarded samples = 0"); +#if defined(__linux__) + // The executable name appears as part of the dump of /proc/self/maps, which + // only exists on Linux. + ASSERT_STR_CONTAINS(buf.ToString(), "tablet_server-test"); +#endif +} + +TEST_F(TabletServerTest, TestInsert) { + WriteRequestPB req; + + req.set_tablet_id(kTabletId); + + WriteResponsePB resp; + RpcController controller; + + scoped_refptr tablet; + ASSERT_TRUE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + scoped_refptr rows_inserted = + METRIC_rows_inserted.Instantiate(tablet->tablet()->GetMetricEntity()); + ASSERT_EQ(0, rows_inserted->value()); + tablet.reset(); + + // Send a bad insert which has an empty schema. This should result + // in an error. 
+ { + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1234, 5678, "hello world via RPC", + req.mutable_row_operations()); + + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::MISMATCHED_SCHEMA, resp.error().code()); + Status s = StatusFromPB(resp.error().status()); + EXPECT_TRUE(s.IsInvalidArgument()); + ASSERT_STR_CONTAINS(s.ToString(), + "Client missing required column: key[int32 NOT NULL]"); + req.clear_row_operations(); + } + + // Send an empty request with the correct schema. + // This should succeed and do nothing. + { + controller.Reset(); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + req.clear_row_operations(); + } + + // Send an actual row insert. + { + controller.Reset(); + RowOperationsPB* data = req.mutable_row_operations(); + data->Clear(); + + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1234, 5678, + "hello world via RPC", data); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + req.clear_row_operations(); + ASSERT_EQ(1, rows_inserted->value()); + } + + // Send a batch with multiple rows, one of which is a duplicate of + // the above insert. This should generate one error into per_row_errors. 
+ { + controller.Reset(); + RowOperationsPB* data = req.mutable_row_operations(); + data->Clear(); + + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1, 1, "ceci n'est pas une dupe", data); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 2, 1, "also not a dupe key", data); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1234, 1, "I am a duplicate key", data); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()) << resp.ShortDebugString(); + ASSERT_EQ(1, resp.per_row_errors().size()); + ASSERT_EQ(2, resp.per_row_errors().Get(0).row_index()); + Status s = StatusFromPB(resp.per_row_errors().Get(0).error()); + ASSERT_STR_CONTAINS(s.ToString(), "Already present"); + ASSERT_EQ(3, rows_inserted->value()); // This counter only counts successful inserts. + } + + // get the clock's current timestamp + Timestamp now_before = mini_server_->server()->clock()->Now(); + + rows_inserted = nullptr; + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(schema_, { KeyValue(1, 1), KeyValue(2, 1), KeyValue(1234, 5678) }); + + // get the clock's timestamp after replay + Timestamp now_after = mini_server_->server()->clock()->Now(); + + // make sure 'now_after' is greater than or equal to 'now_before' + ASSERT_GE(now_after.value(), now_before.value()); +} + +TEST_F(TabletServerTest, TestExternalConsistencyModes_ClientPropagated) { + WriteRequestPB req; + req.set_tablet_id(kTabletId); + WriteResponsePB resp; + RpcController controller; + + scoped_refptr tablet; + ASSERT_TRUE( + mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, + &tablet)); + scoped_refptr rows_inserted = + METRIC_rows_inserted.Instantiate(tablet->tablet()->GetMetricEntity()); + ASSERT_EQ(0, rows_inserted->value()); + + // get the current time + Timestamp current = mini_server_->server()->clock()->Now(); + // advance current to some time in the future. 
we do 5 secs to make + // sure this timestamp will still be in the future when it reaches the + // server. + current = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(current) + 5000000); + + // Send an actual row insert. + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1234, 5678, "hello world via RPC", + req.mutable_row_operations()); + + // set the external consistency mode and the timestamp + req.set_external_consistency_mode(CLIENT_PROPAGATED); + + req.set_propagated_timestamp(current.ToUint64()); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + req.clear_row_operations(); + ASSERT_EQ(1, rows_inserted->value()); + + // make sure the server returned a write timestamp where only + // the logical value was increased since he should have updated + // its clock with the client's value. + Timestamp write_timestamp(resp.timestamp()); + + ASSERT_EQ(HybridClock::GetPhysicalValueMicros(current), + HybridClock::GetPhysicalValueMicros(write_timestamp)); + + ASSERT_EQ(HybridClock::GetLogicalValue(current) + 1, + HybridClock::GetLogicalValue(write_timestamp)); +} + +TEST_F(TabletServerTest, TestExternalConsistencyModes_CommitWait) { + WriteRequestPB req; + req.set_tablet_id(kTabletId); + WriteResponsePB resp; + RpcController controller; + HybridClock* hclock = down_cast(mini_server_->server()->clock()); + + scoped_refptr tablet; + ASSERT_TRUE( + mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, + &tablet)); + scoped_refptr rows_inserted = + METRIC_rows_inserted.Instantiate( + tablet->tablet()->GetMetricEntity()); + ASSERT_EQ(0, rows_inserted->value()); + + // get current time, with and without error + Timestamp now_before; + uint64_t error_before; + hclock->NowWithError(&now_before, &error_before); + + uint64_t now_before_usec = 
HybridClock::GetPhysicalValueMicros(now_before); + LOG(INFO) << "Submitting write with commit wait at: " << now_before_usec << " us +- " + << error_before << " us"; + + // Send an actual row insert. + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1234, 5678, "hello world via RPC", + req.mutable_row_operations()); + + // set the external consistency mode to COMMIT_WAIT + req.set_external_consistency_mode(COMMIT_WAIT); + + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + req.clear_row_operations(); + ASSERT_EQ(1, rows_inserted->value()); + + // Two things must have happened. + // 1 - The write timestamp must be greater than 'now_before' + // 2 - The write must have taken at least 'error_before' to complete (two + // times more in average). + + Timestamp now_after; + uint64_t error_after; + hclock->NowWithError(&now_after, &error_after); + + Timestamp write_timestamp(resp.timestamp()); + + uint64_t write_took = HybridClock::GetPhysicalValueMicros(now_after) - + HybridClock::GetPhysicalValueMicros(now_before); + + LOG(INFO) << "Write applied at: " << HybridClock::GetPhysicalValueMicros(write_timestamp) + << " us, current time: " << HybridClock::GetPhysicalValueMicros(now_after) + << " us, write took: " << write_took << " us"; + + ASSERT_GT(write_timestamp.value(), now_before.value()); + + // see HybridClockTest.TestWaitUntilAfter_TestCase2 + if (error_after >= error_before) { + ASSERT_GE(write_took, 2 * error_before); + } else { + ASSERT_GE(write_took, error_before); + } +} + + +TEST_F(TabletServerTest, TestInsertAndMutate) { + + scoped_refptr tablet; + ASSERT_TRUE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + scoped_refptr rows_inserted = + METRIC_rows_inserted.Instantiate(tablet->tablet()->GetMetricEntity()); + scoped_refptr rows_updated = + 
METRIC_rows_updated.Instantiate(tablet->tablet()->GetMetricEntity()); + scoped_refptr rows_deleted = + METRIC_rows_deleted.Instantiate(tablet->tablet()->GetMetricEntity()); + ASSERT_EQ(0, rows_inserted->value()); + ASSERT_EQ(0, rows_updated->value()); + ASSERT_EQ(0, rows_deleted->value()); + tablet.reset(); + + RpcController controller; + + { + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kTabletId); + RowOperationsPB* data = req.mutable_row_operations(); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 1, 1, "original1", data); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 2, 2, "original2", data); + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 3, 3, "original3", data); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()) << resp.ShortDebugString(); + ASSERT_EQ(0, resp.per_row_errors().size()); + ASSERT_EQ(3, rows_inserted->value()); + ASSERT_EQ(0, rows_updated->value()); + controller.Reset(); + } + + // Try and mutate the rows inserted above + { + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kTabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, 1, 2, "mutation1", + req.mutable_row_operations()); + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, 2, 3, "mutation2", + req.mutable_row_operations()); + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, 3, 4, "mutation3", + req.mutable_row_operations()); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()) << resp.ShortDebugString(); + ASSERT_EQ(0, resp.per_row_errors().size()); + ASSERT_EQ(3, rows_inserted->value()); + ASSERT_EQ(3, rows_updated->value()); + controller.Reset(); + } + + // Try and mutate a non existent row key (should get an 
error) + { + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kTabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, 1234, 2, "mutated", + req.mutable_row_operations()); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()) << resp.ShortDebugString(); + ASSERT_EQ(1, resp.per_row_errors().size()); + ASSERT_EQ(3, rows_updated->value()); + controller.Reset(); + } + + // Try and delete 1 row + { + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kTabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + AddTestKeyToPB(RowOperationsPB::DELETE, schema_, 1, req.mutable_row_operations()); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error())<< resp.ShortDebugString(); + ASSERT_EQ(0, resp.per_row_errors().size()); + ASSERT_EQ(3, rows_updated->value()); + ASSERT_EQ(1, rows_deleted->value()); + controller.Reset(); + } + + // Now try and mutate a row we just deleted, we should get an error + { + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kTabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, 1, 2, "mutated1", + req.mutable_row_operations()); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error())<< resp.ShortDebugString(); + ASSERT_EQ(1, resp.per_row_errors().size()); + controller.Reset(); + } + + ASSERT_EQ(3, rows_inserted->value()); + ASSERT_EQ(3, rows_updated->value()); + + // At this point, we have two rows left (row key 2 and 3). 
+ VerifyRows(schema_, { KeyValue(2, 3), KeyValue(3, 4) }); + + // Do a mixed operation (some insert, update, and delete, some of which fail) + { + WriteRequestPB req; + WriteResponsePB resp; + req.set_tablet_id(kTabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + RowOperationsPB* ops = req.mutable_row_operations(); + // op 0: Mutate row 1, which doesn't exist. This should fail. + AddTestRowToPB(RowOperationsPB::UPDATE, schema_, 1, 3, "mutate_should_fail", ops); + // op 1: Insert a new row 4 (succeeds) + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 4, 4, "new row 4", ops); + // op 2: Delete a non-existent row 5 (should fail) + AddTestKeyToPB(RowOperationsPB::DELETE, schema_, 5, ops); + // op 3: Insert a new row 6 (succeeds) + AddTestRowToPB(RowOperationsPB::INSERT, schema_, 6, 6, "new row 6", ops); + + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error())<< resp.ShortDebugString(); + ASSERT_EQ(2, resp.per_row_errors().size()); + EXPECT_EQ("row_index: 0 error { code: NOT_FOUND message: \"key not found\" }", + resp.per_row_errors(0).ShortDebugString()); + EXPECT_EQ("row_index: 2 error { code: NOT_FOUND message: \"key not found\" }", + resp.per_row_errors(1).ShortDebugString()); + controller.Reset(); + } + + // get the clock's current timestamp + Timestamp now_before = mini_server_->server()->clock()->Now(); + + rows_inserted = nullptr; + rows_updated = nullptr; + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(schema_, { KeyValue(2, 3), KeyValue(3, 4), KeyValue(4, 4), KeyValue(6, 6) }); + + // get the clock's timestamp after replay + Timestamp now_after = mini_server_->server()->clock()->Now(); + + // make sure 'now_after' is greater that or equal to 'now_before' + ASSERT_GE(now_after.value(), now_before.value()); +} + +// Test that passing a schema with fields not present in the tablet schema +// throws an exception. 
+TEST_F(TabletServerTest, TestInvalidWriteRequest_BadSchema) {
+  // Build a client-side schema containing a column the tablet does not have.
+  // The ID-less variant simulates a normal client request; the with-IDs
+  // variant simulates a (forbidden) request that carries column IDs.
+  SchemaBuilder schema_builder(schema_);
+  ASSERT_OK(schema_builder.AddColumn("col_doesnt_exist", INT32));
+  Schema bad_schema_with_ids = schema_builder.Build();
+  Schema bad_schema = schema_builder.BuildWithoutIds();
+
+  // Send a row insert with an extra column
+  {
+    WriteRequestPB req;
+    WriteResponsePB resp;
+    RpcController controller;
+
+    req.set_tablet_id(kTabletId);
+    RowOperationsPB* data = req.mutable_row_operations();
+    ASSERT_OK(SchemaToPB(bad_schema, req.mutable_schema()));
+
+    KuduPartialRow row(&bad_schema);
+    CHECK_OK(row.SetInt32("key", 1234));
+    CHECK_OK(row.SetInt32("int_val", 5678));
+    CHECK_OK(row.SetStringCopy("string_val", "hello world via RPC"));
+    CHECK_OK(row.SetInt32("col_doesnt_exist", 91011));
+    RowOperationsPBEncoder enc(data);
+    enc.Add(RowOperationsPB::INSERT, row);
+
+    SCOPED_TRACE(req.DebugString());
+    ASSERT_OK(proxy_->Write(req, &resp, &controller));
+    SCOPED_TRACE(resp.DebugString());
+    // The server should reject the write with MISMATCHED_SCHEMA rather
+    // than crash or silently drop the unknown column.
+    ASSERT_TRUE(resp.has_error());
+    ASSERT_EQ(TabletServerErrorPB::MISMATCHED_SCHEMA, resp.error().code());
+    ASSERT_STR_CONTAINS(resp.error().status().message(),
+                        "Client provided column col_doesnt_exist[int32 NOT NULL]"
+                        " not present in tablet");
+  }
+
+  // Send a row mutation with an extra column and IDs
+  {
+    WriteRequestPB req;
+    WriteResponsePB resp;
+    RpcController controller;
+
+    req.set_tablet_id(kTabletId);
+    ASSERT_OK(SchemaToPB(bad_schema_with_ids, req.mutable_schema()));
+
+    AddTestKeyToPB(RowOperationsPB::UPDATE, bad_schema_with_ids, 1,
+                   req.mutable_row_operations());
+    SCOPED_TRACE(req.DebugString());
+    ASSERT_OK(proxy_->Write(req, &resp, &controller));
+    SCOPED_TRACE(resp.DebugString());
+    // Column IDs are assigned server-side; a request carrying them is
+    // rejected outright as INVALID_SCHEMA.
+    ASSERT_TRUE(resp.has_error());
+    ASSERT_EQ(TabletServerErrorPB::INVALID_SCHEMA, resp.error().code());
+    ASSERT_STR_CONTAINS(resp.error().status().message(),
+                        "User requests should not have Column IDs");
+  }
+}
+
+// Executes mutations each time a Tablet goes through a
compaction/flush
+// lifecycle hook. This allows to create mutations of all possible types
+// deterministically. The purpose is to make sure such mutations are replayed
+// correctly on tablet bootstrap.
+class MyCommonHooks : public Tablet::FlushCompactCommonHooks,
+                      public Tablet::FlushFaultHooks,
+                      public Tablet::CompactionFaultHooks {
+ public:
+  explicit MyCommonHooks(TabletServerTest* test)
+      : test_(test),
+        iteration_(0) {}
+
+  // Issues a remote UPDATE of 'key' to 'new_int_val' through the owning
+  // test fixture. Each hook below calls this with a distinct key so the
+  // resulting row values encode which lifecycle phase produced them.
+  Status DoHook(int32_t key, int32_t new_int_val) {
+    test_->UpdateTestRowRemote(0, key, new_int_val);
+    return Status::OK();
+  }
+
+  // This should go in pre-flush and get flushed
+  virtual Status PostSwapNewMemRowSet() OVERRIDE {
+    return DoHook(1, 10 + iteration_);
+  }
+  // This should go in after the flush, but before
+  // the duplicating row set, i.e., this should appear as
+  // a missed delta.
+  virtual Status PostTakeMvccSnapshot() OVERRIDE {
+    return DoHook(2, 20 + iteration_);
+  }
+  // This too should appear as a missed delta.
+  virtual Status PostWriteSnapshot() OVERRIDE {
+    return DoHook(3, 30 + iteration_);
+  }
+  // This should appear as a duplicated mutation
+  virtual Status PostSwapInDuplicatingRowSet() OVERRIDE {
+    return DoHook(4, 40 + iteration_);
+  }
+  // This too should appear as a duplicated mutation
+  virtual Status PostReupdateMissedDeltas() OVERRIDE {
+    return DoHook(5, 50 + iteration_);
+  }
+  // This should go into the new delta.
+  virtual Status PostSwapNewRowSet() OVERRIDE {
+    return DoHook(6, 60 + iteration_);
+  }
+  // This should go in pre-flush (only on compactions)
+  virtual Status PostSelectIterators() OVERRIDE {
+    return DoHook(7, 70 + iteration_);
+  }
+  // Bumps the value offset so a second flush/compaction round produces
+  // distinguishable mutations (11, 21, ... instead of 10, 20, ...).
+  void increment_iteration() {
+    iteration_++;
+  }
+ protected:
+  TabletServerTest* test_;   // owning fixture; used to issue remote updates
+  int iteration_;            // added to each hook's base value
+};
+
+// Tests performing mutations that are going to the initial MRS
+// or to a DMS, when the MRS is flushed. This also tests that the
+// log produced on recovery allows to re-recover the original state.
+TEST_F(TabletServerTest, TestRecoveryWithMutationsWhileFlushing) {
+
+  InsertTestRowsRemote(0, 1, 7);
+
+  // NOTE(review): template argument appears lost in extraction here —
+  // presumably shared_ptr<MyCommonHooks>; confirm against upstream.
+  shared_ptr hooks(new MyCommonHooks(this));
+
+  tablet_peer_->tablet()->SetFlushHooksForTests(hooks);
+  tablet_peer_->tablet()->SetCompactionHooksForTests(hooks);
+  tablet_peer_->tablet()->SetFlushCompactCommonHooksForTests(hooks);
+
+  // Flushing fires the hooks above, which inject one remote UPDATE per
+  // lifecycle phase (keys 1-6; key 7 is compaction-only and stays at 7).
+  ASSERT_OK(tablet_peer_->tablet()->Flush());
+
+  // Shutdown the tserver and try and rebuild the tablet from the log
+  // produced on recovery (recovery flushed no state, but produced a new
+  // log).
+  ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet());
+  VerifyRows(schema_, { KeyValue(1, 10),
+                        KeyValue(2, 20),
+                        KeyValue(3, 30),
+                        KeyValue(4, 40),
+                        KeyValue(5, 50),
+                        KeyValue(6, 60),
+                        // the last hook only fires on compaction
+                        // so this isn't mutated
+                        KeyValue(7, 7) });
+
+  // Shutdown and rebuild again to test that the log generated during
+  // the previous recovery allows to perform recovery again.
+  ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet());
+  VerifyRows(schema_, { KeyValue(1, 10),
+                        KeyValue(2, 20),
+                        KeyValue(3, 30),
+                        KeyValue(4, 40),
+                        KeyValue(5, 50),
+                        KeyValue(6, 60),
+                        KeyValue(7, 7) });
+}
+
+// Tests performing mutations that are going to a DMS or to the following
+// DMS, when the initial one is flushed.
+TEST_F(TabletServerTest, TestRecoveryWithMutationsWhileFlushingAndCompacting) { + + InsertTestRowsRemote(0, 1, 7); + + shared_ptr hooks(new MyCommonHooks(this)); + + tablet_peer_->tablet()->SetFlushHooksForTests(hooks); + tablet_peer_->tablet()->SetCompactionHooksForTests(hooks); + tablet_peer_->tablet()->SetFlushCompactCommonHooksForTests(hooks); + + // flush the first time + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(schema_, { KeyValue(1, 10), + KeyValue(2, 20), + KeyValue(3, 30), + KeyValue(4, 40), + KeyValue(5, 50), + KeyValue(6, 60), + KeyValue(7, 7) }); + hooks->increment_iteration(); + + // set the hooks on the new tablet + tablet_peer_->tablet()->SetFlushHooksForTests(hooks); + tablet_peer_->tablet()->SetCompactionHooksForTests(hooks); + tablet_peer_->tablet()->SetFlushCompactCommonHooksForTests(hooks); + + // insert an additional row so that we can flush + InsertTestRowsRemote(0, 8, 1); + + // flush an additional MRS so that we have two DiskRowSets and then compact + // them making sure that mutations executed mid compaction are replayed as + // expected + ASSERT_OK(tablet_peer_->tablet()->Flush()); + VerifyRows(schema_, { KeyValue(1, 11), + KeyValue(2, 21), + KeyValue(3, 31), + KeyValue(4, 41), + KeyValue(5, 51), + KeyValue(6, 61), + KeyValue(7, 7), + KeyValue(8, 8) }); + + hooks->increment_iteration(); + ASSERT_OK(tablet_peer_->tablet()->Compact(Tablet::FORCE_COMPACT_ALL)); + + // get the clock's current timestamp + Timestamp now_before = mini_server_->server()->clock()->Now(); + + // Shutdown the tserver and try and rebuild the tablet from the log + // produced on recovery (recovery flushed no state, but produced a new + // log). 
+ ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(schema_, { KeyValue(1, 11), + KeyValue(2, 22), + KeyValue(3, 32), + KeyValue(4, 42), + KeyValue(5, 52), + KeyValue(6, 62), + KeyValue(7, 72), + KeyValue(8, 8) }); + + // get the clock's timestamp after replay + Timestamp now_after = mini_server_->server()->clock()->Now(); + + // make sure 'now_after' is greater than or equal to 'now_before' + ASSERT_GE(now_after.value(), now_before.value()); +} + +#define ANFF ASSERT_NO_FATAL_FAILURE + +// Regression test for KUDU-176. Ensures that after a major delta compaction, +// restarting properly recovers the tablet. +TEST_F(TabletServerTest, TestKUDU_176_RecoveryAfterMajorDeltaCompaction) { + + // Flush a DRS with 1 rows. + ASSERT_NO_FATAL_FAILURE(InsertTestRowsRemote(0, 1, 1)); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + ANFF(VerifyRows(schema_, { KeyValue(1, 1) })); + + // Update it, flush deltas. + ANFF(UpdateTestRowRemote(0, 1, 2)); + ASSERT_OK(tablet_peer_->tablet()->FlushBiggestDMS()); + ANFF(VerifyRows(schema_, { KeyValue(1, 2) })); + + // Major compact deltas. + { + vector > rsets; + tablet_peer_->tablet()->GetRowSetsForTests(&rsets); + vector col_ids = { tablet_peer_->tablet()->schema()->column_id(1), + tablet_peer_->tablet()->schema()->column_id(2) }; + ASSERT_OK(tablet_peer_->tablet()->DoMajorDeltaCompaction(col_ids, rsets[0])) + } + + // Verify that data is still the same. + ANFF(VerifyRows(schema_, { KeyValue(1, 2) })); + + // Verify that data remains after a restart. + ASSERT_OK(ShutdownAndRebuildTablet()); + ANFF(VerifyRows(schema_, { KeyValue(1, 2) })); +} + +// Regression test for KUDU-177. Ensures that after a major delta compaction, +// rows that were in the old DRS's DMS are properly replayed. +TEST_F(TabletServerTest, TestKUDU_177_RecoveryOfDMSEditsAfterMajorDeltaCompaction) { + // Flush a DRS with 1 rows. 
+ ANFF(InsertTestRowsRemote(0, 1, 1)); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + ANFF(VerifyRows(schema_, { KeyValue(1, 1) })); + + // Update it, flush deltas. + ANFF(UpdateTestRowRemote(0, 1, 2)); + ASSERT_OK(tablet_peer_->tablet()->FlushBiggestDMS()); + + // Update it again, so this last update is in the DMS. + ANFF(UpdateTestRowRemote(0, 1, 3)); + ANFF(VerifyRows(schema_, { KeyValue(1, 3) })); + + // Major compact deltas. This doesn't include the DMS, but the old + // DMS should "move over" to the output of the delta compaction. + { + vector > rsets; + tablet_peer_->tablet()->GetRowSetsForTests(&rsets); + vector col_ids = { tablet_peer_->tablet()->schema()->column_id(1), + tablet_peer_->tablet()->schema()->column_id(2) }; + ASSERT_OK(tablet_peer_->tablet()->DoMajorDeltaCompaction(col_ids, rsets[0])); + } + // Verify that data is still the same. + ANFF(VerifyRows(schema_, { KeyValue(1, 3) })); + + // Verify that the update remains after a restart. + ASSERT_OK(ShutdownAndRebuildTablet()); + ANFF(VerifyRows(schema_, { KeyValue(1, 3) })); +} + +TEST_F(TabletServerTest, TestClientGetsErrorBackWhenRecoveryFailed) { + ANFF(InsertTestRowsRemote(0, 1, 7)); + + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + // Save the log path before shutting down the tablet (and destroying + // the tablet peer). + string log_path = tablet_peer_->log()->ActiveSegmentPathForTests(); + ShutdownTablet(); + + ASSERT_OK(log::CorruptLogFile(env_.get(), log_path, + log::FLIP_BYTE, 300)); + + ASSERT_FALSE(ShutdownAndRebuildTablet().ok()); + + // Connect to it. + CreateTsClientProxies(mini_server_->bound_rpc_addr(), + client_messenger_, + &proxy_, &admin_proxy_, &consensus_proxy_, &generic_proxy_); + + WriteRequestPB req; + req.set_tablet_id(kTabletId); + + WriteResponsePB resp; + rpc::RpcController controller; + + // We're expecting the write to fail. 
+ ASSERT_OK(DCHECK_NOTNULL(proxy_.get())->Write(req, &resp, &controller)); + ASSERT_EQ(TabletServerErrorPB::TABLET_NOT_RUNNING, resp.error().code()); + ASSERT_STR_CONTAINS(resp.error().status().message(), "Tablet not RUNNING: FAILED"); +} + +TEST_F(TabletServerTest, TestScan) { + int num_rows = AllowSlowTests() ? 10000 : 1000; + InsertTestRowsDirect(0, num_rows); + + ScanResponsePB resp; + ASSERT_NO_FATAL_FAILURE(OpenScannerWithAllColumns(&resp)); + + // Ensure that the scanner ID came back and got inserted into the + // ScannerManager map. + string scanner_id = resp.scanner_id(); + ASSERT_TRUE(!scanner_id.empty()); + { + SharedScanner junk; + ASSERT_TRUE(mini_server_->server()->scanner_manager()->LookupScanner(scanner_id, &junk)); + } + + // Drain all the rows from the scanner. + vector results; + ASSERT_NO_FATAL_FAILURE( + DrainScannerToStrings(resp.scanner_id(), schema_, &results)); + ASSERT_EQ(num_rows, results.size()); + + KuduPartialRow row(&schema_); + for (int i = 0; i < num_rows; i++) { + BuildTestRow(i, &row); + string expected = "(" + row.ToString() + ")"; + ASSERT_EQ(expected, results[i]); + } + + // Since the rows are drained, the scanner should be automatically removed + // from the scanner manager. + { + SharedScanner junk; + ASSERT_FALSE(mini_server_->server()->scanner_manager()->LookupScanner(scanner_id, &junk)); + } +} + +TEST_F(TabletServerTest, TestScannerOpenWhenServerShutsDown) { + InsertTestRowsDirect(0, 1); + + ScanResponsePB resp; + ASSERT_NO_FATAL_FAILURE(OpenScannerWithAllColumns(&resp)); + + // Scanner is now open. The test will now shut down the TS with the scanner still + // out there. Due to KUDU-161 this used to fail, since the scanner (and thus the MRS) + // stayed open longer than the anchor registry +} + +TEST_F(TabletServerTest, TestSnapshotScan) { + int num_rows = AllowSlowTests() ? 1000 : 100; + int num_batches = AllowSlowTests() ? 
100 : 10; + vector write_timestamps_collector; + + // perform a series of writes and collect the timestamps + InsertTestRowsRemote(0, 0, num_rows, num_batches, nullptr, + kTabletId, &write_timestamps_collector); + + // now perform snapshot scans. + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + int batch_idx = 1; + for (uint64_t write_timestamp : write_timestamps_collector) { + req.Clear(); + resp.Clear(); + rpc.Reset(); + // Set up a new request with no predicates, all columns. + const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + scan->set_read_mode(READ_AT_SNAPSHOT); + + // Decode and re-encode the timestamp. Note that a snapshot at 'write_timestamp' + // does not include the written rows, so we increment that timestamp by one + // to make sure we get those rows back + Timestamp read_timestamp(write_timestamp); + read_timestamp = Timestamp(read_timestamp.value() + 1); + scan->set_snap_timestamp(read_timestamp.ToUint64()); + + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + ASSERT_TRUE(resp.has_more_results()); + // Drain all the rows from the scanner. 
+ vector results; + ASSERT_NO_FATAL_FAILURE(DrainScannerToStrings(resp.scanner_id(), schema_, &results)); + // on each scan we should get (num_rows / num_batches) * batch_idx rows back + int expected_num_rows = (num_rows / num_batches) * batch_idx; + ASSERT_EQ(expected_num_rows, results.size()); + + if (VLOG_IS_ON(2)) { + VLOG(2) << "Scanner: " << resp.scanner_id() << " performing a snapshot read at: " + << read_timestamp.ToString() << " got back: "; + for (const string& result : results) { + VLOG(2) << result; + } + } + + // assert that the first and last rows were the expected ones + ASSERT_EQ("(int32 key=0, int32 int_val=0, string string_val=original0)", results[0]); + ASSERT_EQ(Substitute("(int32 key=$0, int32 int_val=$0, string string_val=original$0)", + (batch_idx * (num_rows / num_batches) - 1)), results[results.size() - 1]); + batch_idx++; + } +} + +TEST_F(TabletServerTest, TestSnapshotScan_WithoutSnapshotTimestamp) { + vector write_timestamps_collector; + // perform a write + InsertTestRowsRemote(0, 0, 1, 1, nullptr, kTabletId, &write_timestamps_collector); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + // Set up a new request with no predicates, all columns. 
+ const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); // so it won't return data right away + scan->set_read_mode(READ_AT_SNAPSHOT); + + Timestamp now = mini_server_->server()->clock()->Now(); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // make sure that the snapshot timestamp that was selected is >= now + ASSERT_GE(resp.snap_timestamp(), now.ToUint64()); +} + +// Tests that a snapshot in the future (beyond the current time plus maximum +// synchronization error) fails as an invalid snapshot. +TEST_F(TabletServerTest, TestSnapshotScan_SnapshotInTheFutureFails) { + vector write_timestamps_collector; + // perform a write + InsertTestRowsRemote(0, 0, 1, 1, nullptr, kTabletId, &write_timestamps_collector); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + // Set up a new request with no predicates, all columns. + const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); // so it won't return data right away + scan->set_read_mode(READ_AT_SNAPSHOT); + + Timestamp read_timestamp(write_timestamps_collector[0]); + // Increment the write timestamp by 60 secs: the server will definitely consider + // this in the future. 
+ read_timestamp = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(read_timestamp) + 60000000); + scan->set_snap_timestamp(read_timestamp.ToUint64()); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::INVALID_SNAPSHOT, resp.error().code()); + } +} + + +// Test tserver shutdown with an active scanner open. +TEST_F(TabletServerTest, TestSnapshotScan_OpenScanner) { + vector write_timestamps_collector; + // Write and flush and write, so we have some rows in MRS and DRS + InsertTestRowsRemote(0, 0, 100, 2, nullptr, kTabletId, &write_timestamps_collector); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + InsertTestRowsRemote(0, 100, 100, 2, nullptr, kTabletId, &write_timestamps_collector); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + // Set up a new request with no predicates, all columns. + const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); + scan->set_read_mode(READ_AT_SNAPSHOT); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + // Intentionally do not drain the scanner at the end, to leave it open. + // This tests tablet server shutdown with an active scanner. +} + + +// Test retrying a snapshot scan using last_row. +TEST_F(TabletServerTest, TestSnapshotScan_LastRow) { + // Set the internal batching within the tserver to be small. Otherwise, + // even though we use a small batch size in our request, we'd end up reading + // many rows at a time. + FLAGS_scanner_batch_size_rows = 5; + const int num_rows = AllowSlowTests() ? 
1000 : 100; + const int num_batches = AllowSlowTests() ? 10 : 5; + const int batch_size = num_rows / num_batches; + + // Generate some interleaved rows + for (int i = 0; i < batch_size; i++) { + ASSERT_OK(tablet_peer_->tablet()->Flush()); + for (int j = 0; j < num_rows; j++) { + if (j % batch_size == i) { + InsertTestRowsDirect(j, 1); + } + } + } + + // Remove all the key columns from the projection. + // This makes sure the scanner adds them in for sorting but removes them before returning + // to the client. + SchemaBuilder sb(schema_); + for (int i = 0; i < schema_.num_key_columns(); i++) { + sb.RemoveColumn(schema_.column(i).name()); + } + const Schema& projection = sb.BuildWithoutIds(); + + // Scan the whole tablet with a few different batch sizes. + for (int i = 1; i < 10000; i *= 2) { + ScanResponsePB resp; + ScanRequestPB req; + RpcController rpc; + + // Set up a new snapshot scan without a specified timestamp. + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + scan->set_read_mode(READ_AT_SNAPSHOT); + scan->set_order_mode(ORDERED); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + vector results; + do { + rpc.Reset(); + // Send the call. + { + SCOPED_TRACE(req.DebugString()); + req.set_batch_size_bytes(i); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + // Save the rows into 'results' vector. + StringifyRowsFromResponse(projection, rpc, resp, &results); + // Retry the scan, setting the last_row_key and snapshot based on the response. 
+ scan->set_last_primary_key(resp.last_primary_key()); + scan->set_snap_timestamp(resp.snap_timestamp()); + } while (resp.has_more_results()); + + ASSERT_EQ(num_rows, results.size()); + + // Verify that we get the rows back in order. + KuduPartialRow row(&projection); + for (int j = 0; j < num_rows; j++) { + ASSERT_OK(row.SetInt32(0, j * 2)); + ASSERT_OK(row.SetStringCopy(1, StringPrintf("hello %d", j))); + string expected = "(" + row.ToString() + ")"; + ASSERT_EQ(expected, results[j]); + } + } +} + + +// Tests that a read in the future succeeds if a propagated_timestamp (that is even +// further in the future) follows along. Also tests that the clock was updated so +// that no writes will ever have a timestamp post this snapshot. +TEST_F(TabletServerTest, TestSnapshotScan_SnapshotInTheFutureWithPropagatedTimestamp) { + vector write_timestamps_collector; + // perform a write + InsertTestRowsRemote(0, 0, 1, 1, nullptr, kTabletId, &write_timestamps_collector); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + // Set up a new request with no predicates, all columns. + const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); // so it won't return data right away + scan->set_read_mode(READ_AT_SNAPSHOT); + + Timestamp read_timestamp(write_timestamps_collector[0]); + // increment the write timestamp by 5 secs, the server will definitely consider + // this in the future. + read_timestamp = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(read_timestamp) + 5000000); + scan->set_snap_timestamp(read_timestamp.ToUint64()); + + // send a propagated timestamp that is an additional 100 msecs into the future. 
+ Timestamp propagated_timestamp = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(read_timestamp) + 100000); + scan->set_propagated_timestamp(propagated_timestamp.ToUint64()); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // make sure the server's current clock returns a value that is larger than the + // propagated timestamp. It should have the same physical time, but higher + // logical time (due to various calls to clock.Now() when processing the request). + Timestamp now = mini_server_->server()->clock()->Now(); + + ASSERT_EQ(HybridClock::GetPhysicalValueMicros(propagated_timestamp), + HybridClock::GetPhysicalValueMicros(now)); + + ASSERT_GT(HybridClock::GetLogicalValue(now), + HybridClock::GetLogicalValue(propagated_timestamp)); + + vector results; + ASSERT_NO_FATAL_FAILURE(DrainScannerToStrings(resp.scanner_id(), schema_, &results)); + ASSERT_EQ(1, results.size()); + ASSERT_EQ("(int32 key=0, int32 int_val=0, string string_val=original0)", results[0]); +} + + +// Test that a read in the future fails, even if a propagated_timestamp is sent along, +// if the read_timestamp is beyond the propagated_timestamp. +TEST_F(TabletServerTest, TestSnapshotScan__SnapshotInTheFutureBeyondPropagatedTimestampFails) { + vector write_timestamps_collector; + // perform a write + InsertTestRowsRemote(0, 0, 1, 1, nullptr, kTabletId, &write_timestamps_collector); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + // Set up a new request with no predicates, all columns. 
+ const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); // so it won't return data right away + scan->set_read_mode(READ_AT_SNAPSHOT); + + Timestamp read_timestamp(write_timestamps_collector[0]); + // increment the write timestamp by 60 secs, the server will definitely consider + // this in the future. + read_timestamp = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(read_timestamp) + 60000000); + scan->set_snap_timestamp(read_timestamp.ToUint64()); + + // send a propagated timestamp that is an less than the read timestamp (but still + // in the future as far the server is concerned). + Timestamp propagated_timestamp = HybridClock::TimestampFromMicroseconds( + HybridClock::GetPhysicalValueMicros(read_timestamp) - 100000); + scan->set_propagated_timestamp(propagated_timestamp.ToUint64()); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::INVALID_SNAPSHOT, resp.error().code()); + } +} + +TEST_F(TabletServerTest, TestScanWithStringPredicates) { + InsertTestRowsDirect(0, 100); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(SchemaToColumnPBs(schema_, scan->mutable_projected_columns())); + + // Set up a range predicate: "hello 50" < string_val <= "hello 59" + ColumnRangePredicatePB* pred = scan->add_range_predicates(); + pred->mutable_column()->CopyFrom(scan->projected_columns(2)); + + pred->set_lower_bound("hello 50"); + pred->set_upper_bound("hello 59"); + + // Send the call + { + 
SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // Drain all the rows from the scanner. + vector results; + ASSERT_NO_FATAL_FAILURE( + DrainScannerToStrings(resp.scanner_id(), schema_, &results)); + ASSERT_EQ(10, results.size()); + ASSERT_EQ("(int32 key=50, int32 int_val=100, string string_val=hello 50)", results[0]); + ASSERT_EQ("(int32 key=59, int32 int_val=118, string string_val=hello 59)", results[9]); +} + +TEST_F(TabletServerTest, TestScanWithPredicates) { + // TODO: need to test adding a predicate on a column which isn't part of the + // projection! I don't think we implemented this at the tablet layer yet, + // but should do so. + + int num_rows = AllowSlowTests() ? 10000 : 1000; + InsertTestRowsDirect(0, num_rows); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(SchemaToColumnPBs(schema_, scan->mutable_projected_columns())); + + // Set up a range predicate: 51 <= key <= 100 + ColumnRangePredicatePB* pred = scan->add_range_predicates(); + pred->mutable_column()->CopyFrom(scan->projected_columns(0)); + + int32_t lower_bound_int = 51; + int32_t upper_bound_int = 100; + pred->mutable_lower_bound()->append(reinterpret_cast(&lower_bound_int), + sizeof(lower_bound_int)); + pred->mutable_upper_bound()->append(reinterpret_cast(&upper_bound_int), + sizeof(upper_bound_int)); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // Drain all the rows from the scanner. 
+ vector results; + ASSERT_NO_FATAL_FAILURE( + DrainScannerToStrings(resp.scanner_id(), schema_, &results)); + ASSERT_EQ(50, results.size()); +} + +TEST_F(TabletServerTest, TestScanWithEncodedPredicates) { + InsertTestRowsDirect(0, 100); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(SchemaToColumnPBs(schema_, scan->mutable_projected_columns())); + + // Set up a range predicate: 51 <= key <= 60 + // using encoded keys + int32_t start_key_int = 51; + int32_t stop_key_int = 60; + EncodedKeyBuilder ekb(&schema_); + ekb.AddColumnKey(&start_key_int); + gscoped_ptr start_encoded(ekb.BuildEncodedKey()); + + ekb.Reset(); + ekb.AddColumnKey(&stop_key_int); + gscoped_ptr stop_encoded(ekb.BuildEncodedKey()); + + scan->mutable_start_primary_key()->assign( + reinterpret_cast(start_encoded->encoded_key().data()), + start_encoded->encoded_key().size()); + scan->mutable_stop_primary_key()->assign( + reinterpret_cast(stop_encoded->encoded_key().data()), + stop_encoded->encoded_key().size()); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // Drain all the rows from the scanner. 
+ vector results; + ASSERT_NO_FATAL_FAILURE( + DrainScannerToStrings(resp.scanner_id(), schema_, &results)); + ASSERT_EQ(9, results.size()); + EXPECT_EQ("(int32 key=51, int32 int_val=102, string string_val=hello 51)", + results.front()); + EXPECT_EQ("(int32 key=59, int32 int_val=118, string string_val=hello 59)", + results.back()); +} + + +// Test requesting more rows from a scanner which doesn't exist +TEST_F(TabletServerTest, TestBadScannerID) { + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + req.set_scanner_id("does-not-exist"); + + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::SCANNER_EXPIRED, resp.error().code()); +} + +// Test passing a scanner ID, but also filling in some of the NewScanRequest +// field. +TEST_F(TabletServerTest, TestInvalidScanRequest_NewScanAndScannerID) { + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + req.set_batch_size_bytes(0); // so it won't return data right away + req.set_scanner_id("x"); + SCOPED_TRACE(req.DebugString()); + Status s = proxy_->Scan(req, &resp, &rpc); + ASSERT_FALSE(s.ok()); + ASSERT_STR_CONTAINS(s.ToString(), "Must not pass both a scanner_id and new_scan_request"); +} + + +// Test that passing a projection with fields not present in the tablet schema +// throws an exception. +TEST_F(TabletServerTest, TestInvalidScanRequest_BadProjection) { + const Schema projection({ ColumnSchema("col_doesnt_exist", INT32) }, 0); + VerifyScanRequestFailure(projection, + TabletServerErrorPB::MISMATCHED_SCHEMA, + "Some columns are not present in the current schema: col_doesnt_exist"); +} + +// Test that passing a projection with mismatched type/nullability throws an exception. 
+TEST_F(TabletServerTest, TestInvalidScanRequest_BadProjectionTypes) { + Schema projection; + + // Verify mismatched nullability for the not-null int field + ASSERT_OK( + projection.Reset({ ColumnSchema("int_val", INT32, true) }, // should be NOT NULL + 0)); + VerifyScanRequestFailure(projection, + TabletServerErrorPB::MISMATCHED_SCHEMA, + "The column 'int_val' must have type int32 NOT " + "NULL found int32 NULLABLE"); + + // Verify mismatched nullability for the nullable string field + ASSERT_OK( + projection.Reset({ ColumnSchema("string_val", STRING, false) }, // should be NULLABLE + 0)); + VerifyScanRequestFailure(projection, + TabletServerErrorPB::MISMATCHED_SCHEMA, + "The column 'string_val' must have type string " + "NULLABLE found string NOT NULL"); + + // Verify mismatched type for the not-null int field + ASSERT_OK( + projection.Reset({ ColumnSchema("int_val", INT16, false) }, // should be INT32 NOT NULL + 0)); + VerifyScanRequestFailure(projection, + TabletServerErrorPB::MISMATCHED_SCHEMA, + "The column 'int_val' must have type int32 NOT " + "NULL found int16 NOT NULL"); + + // Verify mismatched type for the nullable string field + ASSERT_OK(projection.Reset( + { ColumnSchema("string_val", INT32, true) }, // should be STRING NULLABLE + 0)); + VerifyScanRequestFailure(projection, + TabletServerErrorPB::MISMATCHED_SCHEMA, + "The column 'string_val' must have type string " + "NULLABLE found int32 NULLABLE"); +} + +// Test that passing a projection with Column IDs throws an exception. +// Column IDs are assigned to the user request schema on the tablet server +// based on the latest schema. +TEST_F(TabletServerTest, TestInvalidScanRequest_WithIds) { + const Schema* projection = tablet_peer_->tablet()->schema(); + ASSERT_TRUE(projection->has_column_ids()); + VerifyScanRequestFailure(*projection, + TabletServerErrorPB::INVALID_SCHEMA, + "User requests should not have Column IDs"); +} + +// Test scanning a tablet that has no entries. 
+TEST_F(TabletServerTest, TestScan_NoResults) { + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + // Set up a new request with no predicates, all columns. + const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + + // Because there are no entries, we should immediately return "no results". + ASSERT_FALSE(resp.has_more_results()); + } +} + +// Test scanning a tablet that has no entries. +TEST_F(TabletServerTest, TestScan_InvalidScanSeqId) { + InsertTestRowsDirect(0, 10); + + ScanRequestPB req; + ScanResponsePB resp; + RpcController rpc; + + { + // Set up a new scan request with no predicates, all columns. 
+ const Schema& projection = schema_; + NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + req.set_batch_size_bytes(0); // so it won't return data right away + + // Create the scanner + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + ASSERT_FALSE(resp.has_error()); + ASSERT_TRUE(resp.has_more_results()); + } + + string scanner_id = resp.scanner_id(); + resp.Clear(); + + { + // Continue the scan with an invalid sequence ID + req.Clear(); + rpc.Reset(); + req.set_scanner_id(scanner_id); + req.set_batch_size_bytes(0); // so it won't return data right away + req.set_call_seq_id(42); // should be 1 + + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::INVALID_SCAN_CALL_SEQ_ID, resp.error().code()); + } +} + +void TabletServerTest::DoOrderedScanTest(const Schema& projection, + const string& expected_rows_as_string) { + InsertTestRowsDirect(0, 10); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + InsertTestRowsDirect(10, 10); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + InsertTestRowsDirect(20, 10); + + ScanResponsePB resp; + ScanRequestPB req; + RpcController rpc; + + // Set up a new snapshot scan without a specified timestamp. 
+ NewScanRequestPB* scan = req.mutable_new_scan_request(); + scan->set_tablet_id(kTabletId); + ASSERT_OK(SchemaToColumnPBs(projection, scan->mutable_projected_columns())); + req.set_call_seq_id(0); + scan->set_read_mode(READ_AT_SNAPSHOT); + scan->set_order_mode(ORDERED); + + { + SCOPED_TRACE(req.DebugString()); + req.set_batch_size_bytes(0); // so it won't return data right away + ASSERT_OK(proxy_->Scan(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + vector results; + ASSERT_NO_FATAL_FAILURE( + DrainScannerToStrings(resp.scanner_id(), projection, &results)); + + ASSERT_EQ(30, results.size()); + + for (int i = 0; i < results.size(); ++i) { + ASSERT_EQ(results[i], Substitute(expected_rows_as_string, i, i * 2)); + } +} + +// Tests for KUDU-967. This test creates multiple row sets and then performs an ordered +// scan including the key columns in the projection but without marking them as keys. +// Without a fix for KUDU-967 the scan will often return out-of-order results. +TEST_F(TabletServerTest, TestOrderedScan_ProjectionWithKeyColumnsInOrder) { + // Build a projection with all the columns, but don't mark the key columns as such. + SchemaBuilder sb; + for (int i = 0; i < schema_.num_columns(); i++) { + sb.AddColumn(schema_.column(i), false); + } + const Schema& projection = sb.BuildWithoutIds(); + DoOrderedScanTest(projection, "(int32 key=$0, int32 int_val=$1, string string_val=hello $0)"); +} + +// Same as above but doesn't add the key columns to the projection. +TEST_F(TabletServerTest, TestOrderedScan_ProjectionWithoutKeyColumns) { + // Build a projection without the key columns. 
+ SchemaBuilder sb; + for (int i = schema_.num_key_columns(); i < schema_.num_columns(); i++) { + sb.AddColumn(schema_.column(i), false); + } + const Schema& projection = sb.BuildWithoutIds(); + DoOrderedScanTest(projection, "(int32 int_val=$1, string string_val=hello $0)"); +} + +// Same as above but creates a projection with the order of columns reversed. +TEST_F(TabletServerTest, TestOrderedScan_ProjectionWithKeyColumnsOutOfOrder) { + // Build a projection with the order of the columns reversed. + SchemaBuilder sb; + for (int i = schema_.num_columns() - 1; i >= 0; i--) { + sb.AddColumn(schema_.column(i), false); + } + const Schema& projection = sb.BuildWithoutIds(); + DoOrderedScanTest(projection, "(string string_val=hello $0, int32 int_val=$1, int32 key=$0)"); +} + +TEST_F(TabletServerTest, TestAlterSchema) { + AlterSchemaRequestPB req; + AlterSchemaResponsePB resp; + RpcController rpc; + + InsertTestRowsRemote(0, 0, 2); + + // Add one column with a default value + const int32_t c2_write_default = 5; + const int32_t c2_read_default = 7; + SchemaBuilder builder(schema_); + ASSERT_OK(builder.AddColumn("c2", INT32, false, &c2_read_default, &c2_write_default)); + Schema s2 = builder.Build(); + + req.set_dest_uuid(mini_server_->server()->fs_manager()->uuid()); + req.set_tablet_id(kTabletId); + req.set_schema_version(1); + ASSERT_OK(SchemaToPB(s2, req.mutable_schema())); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(admin_proxy_->AlterSchema(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + { + InsertTestRowsRemote(0, 2, 2); + scoped_refptr tablet; + ASSERT_TRUE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + ASSERT_OK(tablet->tablet()->Flush()); + } + + const Schema projection({ ColumnSchema("key", INT32), (ColumnSchema("c2", INT32)) }, 1); + + // Try recovering from the original log + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(projection, 
{ KeyValue(0, 7), + KeyValue(1, 7), + KeyValue(2, 5), + KeyValue(3, 5) }); + + // Try recovering from the log generated on recovery + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(projection, { KeyValue(0, 7), + KeyValue(1, 7), + KeyValue(2, 5), + KeyValue(3, 5) }); +} + +// Adds a new column with no "write default", and then restarts the tablet +// server. Inserts that were made before the new column was added should +// still replay properly during bootstrap. +// +// Regression test for KUDU-181. +TEST_F(TabletServerTest, TestAlterSchema_AddColWithoutWriteDefault) { + AlterSchemaRequestPB req; + AlterSchemaResponsePB resp; + RpcController rpc; + + InsertTestRowsRemote(0, 0, 2); + + // Add a column with a read-default but no write-default. + const uint32_t c2_read_default = 7; + SchemaBuilder builder(schema_); + ASSERT_OK(builder.AddColumn("c2", INT32, false, &c2_read_default, nullptr)); + Schema s2 = builder.Build(); + + req.set_dest_uuid(mini_server_->server()->fs_manager()->uuid()); + req.set_tablet_id(kTabletId); + req.set_schema_version(1); + ASSERT_OK(SchemaToPB(s2, req.mutable_schema())); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(admin_proxy_->AlterSchema(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // Verify that the old data picked up the read default. 
+ + const Schema projection({ ColumnSchema("key", INT32), ColumnSchema("c2", INT32) }, 1); + VerifyRows(projection, { KeyValue(0, 7), KeyValue(1, 7) }); + + // Try recovering from the original log + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(projection, { KeyValue(0, 7), KeyValue(1, 7) }); + + // Try recovering from the log generated on recovery + ASSERT_NO_FATAL_FAILURE(ShutdownAndRebuildTablet()); + VerifyRows(projection, { KeyValue(0, 7), KeyValue(1, 7) }); +} + +TEST_F(TabletServerTest, TestCreateTablet_TabletExists) { + CreateTabletRequestPB req; + CreateTabletResponsePB resp; + RpcController rpc; + + req.set_dest_uuid(mini_server_->server()->fs_manager()->uuid()); + req.set_table_id("testtb"); + req.set_tablet_id(kTabletId); + PartitionPB* partition = req.mutable_partition(); + partition->set_partition_key_start(" "); + partition->set_partition_key_end(" "); + req.set_table_name("testtb"); + req.mutable_config()->CopyFrom(mini_server_->CreateLocalConfig()); + + Schema schema = SchemaBuilder(schema_).Build(); + ASSERT_OK(SchemaToPB(schema, req.mutable_schema())); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(admin_proxy_->CreateTablet(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::TABLET_ALREADY_EXISTS, resp.error().code()); + } +} + +TEST_F(TabletServerTest, TestDeleteTablet) { + scoped_refptr tablet; + + // Verify that the tablet exists + ASSERT_TRUE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + + // Fetch the metric for the number of on-disk blocks, so we can later verify + // that we actually remove data. 
+ scoped_refptr > ondisk = + METRIC_log_block_manager_blocks_under_management.Instantiate( + mini_server_->server()->metric_entity(), 0); + const int block_count_before_flush = ondisk->value(); + if (FLAGS_block_manager == "log") { + ASSERT_EQ(block_count_before_flush, 0); + } + + // Put some data in the tablet. We flush and insert more rows to ensure that + // there is data both in the MRS and on disk. + ASSERT_NO_FATAL_FAILURE(InsertTestRowsRemote(0, 1, 1)); + ASSERT_OK(tablet_peer_->tablet()->Flush()); + ASSERT_NO_FATAL_FAILURE(InsertTestRowsRemote(0, 2, 1)); + + const int block_count_after_flush = ondisk->value(); + if (FLAGS_block_manager == "log") { + ASSERT_GT(block_count_after_flush, block_count_before_flush); + } + + // Drop any local references to the tablet from within this test, + // so that when we delete it on the server, it's not held alive + // by the test code. + tablet_peer_.reset(); + tablet.reset(); + + DeleteTabletRequestPB req; + DeleteTabletResponsePB resp; + RpcController rpc; + + req.set_dest_uuid(mini_server_->server()->fs_manager()->uuid()); + req.set_tablet_id(kTabletId); + req.set_delete_type(tablet::TABLET_DATA_DELETED); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(admin_proxy_->DeleteTablet(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_FALSE(resp.has_error()); + } + + // Verify that the tablet is removed from the tablet map + ASSERT_FALSE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + + // Verify that fetching metrics doesn't crash. Regression test for KUDU-638. + EasyCurl c; + faststring buf; + ASSERT_OK(c.FetchURL(strings::Substitute( + "http://$0/jsonmetricz", + mini_server_->bound_http_addr().ToString()), + &buf)); + + // Verify data was actually removed. + // TODO(KUDU-678): this should be 0 but we leak an empty delta block. 
+ const int block_count_after_delete = ondisk->value(); + if (FLAGS_block_manager == "log") { + ASSERT_EQ(block_count_after_delete, 1); + } + + // Verify that after restarting the TS, the tablet is still not in the tablet manager. + // This ensures that the on-disk metadata got removed. + Status s = ShutdownAndRebuildTablet(); + ASSERT_TRUE(s.IsNotFound()) << s.ToString(); + ASSERT_FALSE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); +} + +TEST_F(TabletServerTest, TestDeleteTablet_TabletNotCreated) { + DeleteTabletRequestPB req; + DeleteTabletResponsePB resp; + RpcController rpc; + + req.set_dest_uuid(mini_server_->server()->fs_manager()->uuid()); + req.set_tablet_id("NotPresentTabletId"); + req.set_delete_type(tablet::TABLET_DATA_DELETED); + + // Send the call + { + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(admin_proxy_->DeleteTablet(req, &resp, &rpc)); + SCOPED_TRACE(resp.DebugString()); + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::TABLET_NOT_FOUND, resp.error().code()); + } +} + +// Test that with concurrent requests to delete the same tablet, one wins and +// the other fails, with no assertion failures. Regression test for KUDU-345. 
+TEST_F(TabletServerTest, TestConcurrentDeleteTablet) { + // Verify that the tablet exists + scoped_refptr tablet; + ASSERT_TRUE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + + static const int kNumDeletes = 2; + RpcController rpcs[kNumDeletes]; + DeleteTabletResponsePB responses[kNumDeletes]; + CountDownLatch latch(kNumDeletes); + + DeleteTabletRequestPB req; + req.set_dest_uuid(mini_server_->server()->fs_manager()->uuid()); + req.set_tablet_id(kTabletId); + req.set_delete_type(tablet::TABLET_DATA_DELETED); + + for (int i = 0; i < kNumDeletes; i++) { + SCOPED_TRACE(req.DebugString()); + admin_proxy_->DeleteTabletAsync(req, &responses[i], &rpcs[i], + boost::bind(&CountDownLatch::CountDown, &latch)); + } + latch.Wait(); + + int num_success = 0; + for (int i = 0; i < kNumDeletes; i++) { + ASSERT_TRUE(rpcs[i].finished()); + LOG(INFO) << "STATUS " << i << ": " << rpcs[i].status().ToString(); + LOG(INFO) << "RESPONSE " << i << ": " << responses[i].DebugString(); + if (!responses[i].has_error()) { + num_success++; + } + } + + // Verify that the tablet is removed from the tablet map + ASSERT_FALSE(mini_server_->server()->tablet_manager()->LookupTablet(kTabletId, &tablet)); + ASSERT_EQ(1, num_success); +} + +TEST_F(TabletServerTest, TestInsertLatencyMicroBenchmark) { + METRIC_DEFINE_entity(test); + METRIC_DEFINE_histogram(test, insert_latency, + "Insert Latency", + MetricUnit::kMicroseconds, + "TabletServer single threaded insert latency.", + 10000000, + 2); + + scoped_refptr histogram = METRIC_insert_latency.Instantiate(ts_test_metric_entity_); + + uint64_t warmup = AllowSlowTests() ? + FLAGS_single_threaded_insert_latency_bench_warmup_rows : 10; + + for (int i = 0; i < warmup; i++) { + InsertTestRowsRemote(0, i, 1); + } + + uint64_t max_rows = AllowSlowTests() ? 
+ FLAGS_single_threaded_insert_latency_bench_insert_rows : 100; + + MonoTime start = MonoTime::Now(MonoTime::FINE); + + for (int i = warmup; i < warmup + max_rows; i++) { + MonoTime before = MonoTime::Now(MonoTime::FINE); + InsertTestRowsRemote(0, i, 1); + MonoTime after = MonoTime::Now(MonoTime::FINE); + MonoDelta delta = after.GetDeltaSince(before); + histogram->Increment(delta.ToMicroseconds()); + } + + MonoTime end = MonoTime::Now(MonoTime::FINE); + double throughput = ((max_rows - warmup) * 1.0) / end.GetDeltaSince(start).ToSeconds(); + + // Generate the JSON. + std::stringstream out; + JsonWriter writer(&out, JsonWriter::PRETTY); + ASSERT_OK(histogram->WriteAsJson(&writer, MetricJsonOptions())); + + LOG(INFO) << "Throughput: " << throughput << " rows/sec."; + LOG(INFO) << out.str(); +} + +// Simple test to ensure we can destroy an RpcServer in different states of +// initialization before Start()ing it. +TEST_F(TabletServerTest, TestRpcServerCreateDestroy) { + RpcServerOptions opts; + { + RpcServer server1(opts); + } + { + RpcServer server2(opts); + MessengerBuilder mb("foo"); + shared_ptr messenger; + ASSERT_OK(mb.Build(&messenger)); + ASSERT_OK(server2.Init(messenger)); + } +} + +TEST_F(TabletServerTest, TestWriteOutOfBounds) { + const char *tabletId = "TestWriteOutOfBoundsTablet"; + Schema schema = SchemaBuilder(schema_).Build(); + + PartitionSchema partition_schema; + CHECK_OK(PartitionSchema::FromPB(PartitionSchemaPB(), schema, &partition_schema)); + + KuduPartialRow start_row(&schema); + ASSERT_OK(start_row.SetInt32("key", 10)); + + KuduPartialRow end_row(&schema); + ASSERT_OK(end_row.SetInt32("key", 20)); + + vector partitions; + ASSERT_OK(partition_schema.CreatePartitions({ start_row, end_row }, schema, &partitions)); + + ASSERT_EQ(3, partitions.size()); + + ASSERT_OK(mini_server_->server()->tablet_manager()->CreateNewTablet( + "TestWriteOutOfBoundsTable", tabletId, + partitions[1], + tabletId, schema, partition_schema, + 
mini_server_->CreateLocalConfig(), nullptr)); + + ASSERT_OK(WaitForTabletRunning(tabletId)); + + WriteRequestPB req; + WriteResponsePB resp; + RpcController controller; + req.set_tablet_id(tabletId); + ASSERT_OK(SchemaToPB(schema_, req.mutable_schema())); + + vector ops = { RowOperationsPB::INSERT, RowOperationsPB::UPDATE }; + + for (const RowOperationsPB::Type &op : ops) { + RowOperationsPB* data = req.mutable_row_operations(); + AddTestRowToPB(op, schema_, 20, 1, "1", data); + SCOPED_TRACE(req.DebugString()); + ASSERT_OK(proxy_->Write(req, &resp, &controller)); + SCOPED_TRACE(resp.DebugString()); + + ASSERT_TRUE(resp.has_error()); + ASSERT_EQ(TabletServerErrorPB::UNKNOWN_ERROR, resp.error().code()); + Status s = StatusFromPB(resp.error().status()); + EXPECT_TRUE(s.IsNotFound()); + ASSERT_STR_CONTAINS(s.ToString(), + "Not found: Row not in tablet partition"); + data->Clear(); + controller.Reset(); + } +} + +static uint32_t CalcTestRowChecksum(int32_t key, uint8_t string_field_defined = true) { + crc::Crc* crc = crc::GetCrc32cInstance(); + uint64_t row_crc = 0; + + string strval = strings::Substitute("original$0", key); + uint32_t index = 0; + crc->Compute(&index, sizeof(index), &row_crc, nullptr); + crc->Compute(&key, sizeof(int32_t), &row_crc, nullptr); + + index = 1; + crc->Compute(&index, sizeof(index), &row_crc, nullptr); + crc->Compute(&key, sizeof(int32_t), &row_crc, nullptr); + + index = 2; + crc->Compute(&index, sizeof(index), &row_crc, nullptr); + crc->Compute(&string_field_defined, sizeof(string_field_defined), &row_crc, nullptr); + if (string_field_defined) { + crc->Compute(strval.c_str(), strval.size(), &row_crc, nullptr); + } + return static_cast(row_crc); +} + +// Simple test to check that our checksum scans work as expected. 
+TEST_F(TabletServerTest, TestChecksumScan) { + uint64_t total_crc = 0; + + ChecksumRequestPB req; + req.mutable_new_request()->set_tablet_id(kTabletId); + req.mutable_new_request()->set_read_mode(READ_LATEST); + req.set_call_seq_id(0); + ASSERT_OK(SchemaToColumnPBs(schema_, req.mutable_new_request()->mutable_projected_columns(), + SCHEMA_PB_WITHOUT_IDS)); + ChecksumRequestPB new_req = req; // Cache "new" request. + + ChecksumResponsePB resp; + RpcController controller; + ASSERT_OK(proxy_->Checksum(req, &resp, &controller)); + + // No rows. + ASSERT_EQ(total_crc, resp.checksum()); + ASSERT_FALSE(resp.has_more_results()); + + // First row. + int32_t key = 1; + InsertTestRowsRemote(0, key, 1); + controller.Reset(); + ASSERT_OK(proxy_->Checksum(req, &resp, &controller)); + total_crc += CalcTestRowChecksum(key); + uint64_t first_crc = total_crc; // Cache first record checksum. + + ASSERT_FALSE(resp.has_error()) << resp.error().DebugString(); + ASSERT_EQ(total_crc, resp.checksum()); + ASSERT_FALSE(resp.has_more_results()); + + // Second row (null string field). + key = 2; + InsertTestRowsRemote(0, key, 1, 1, nullptr, kTabletId, nullptr, nullptr, false); + controller.Reset(); + ASSERT_OK(proxy_->Checksum(req, &resp, &controller)); + total_crc += CalcTestRowChecksum(key, false); + + ASSERT_FALSE(resp.has_error()) << resp.error().DebugString(); + ASSERT_EQ(total_crc, resp.checksum()); + ASSERT_FALSE(resp.has_more_results()); + + // Now test the same thing, but with a scan requiring 2 passes (one per row). + FLAGS_scanner_batch_size_rows = 1; + req.set_batch_size_bytes(1); + controller.Reset(); + ASSERT_OK(proxy_->Checksum(req, &resp, &controller)); + string scanner_id = resp.scanner_id(); + ASSERT_TRUE(resp.has_more_results()); + uint64_t agg_checksum = resp.checksum(); + + // Second row. 
+ req.clear_new_request(); + req.mutable_continue_request()->set_scanner_id(scanner_id); + req.mutable_continue_request()->set_previous_checksum(agg_checksum); + req.set_call_seq_id(1); + controller.Reset(); + ASSERT_OK(proxy_->Checksum(req, &resp, &controller)); + ASSERT_EQ(total_crc, resp.checksum()); + ASSERT_FALSE(resp.has_more_results()); + + // Finally, delete row 2, so we're back to the row 1 checksum. + ASSERT_NO_FATAL_FAILURE(DeleteTestRowsRemote(key, 1)); + FLAGS_scanner_batch_size_rows = 100; + req = new_req; + controller.Reset(); + ASSERT_OK(proxy_->Checksum(req, &resp, &controller)); + ASSERT_NE(total_crc, resp.checksum()); + ASSERT_EQ(first_crc, resp.checksum()); + ASSERT_FALSE(resp.has_more_results()); +} + +class DelayFsyncLogHook : public log::Log::LogFaultHooks { + public: + DelayFsyncLogHook() : log_latch1_(1), test_latch1_(1) {} + + Status PostAppend() override { + test_latch1_.CountDown(); + log_latch1_.Wait(); + log_latch1_.Reset(1); + return Status::OK(); + } + + void Continue() { + test_latch1_.Wait(); + log_latch1_.CountDown(); + } + + private: + CountDownLatch log_latch1_; + CountDownLatch test_latch1_; +}; + +namespace { + +void DeleteOneRowAsync(TabletServerTest* test) { + test->DeleteTestRowsRemote(10, 1); +} + +void CompactAsync(Tablet* tablet, CountDownLatch* flush_done_latch) { + CHECK_OK(tablet->Compact(Tablet::FORCE_COMPACT_ALL)); + flush_done_latch->CountDown(); +} + +} // namespace + +// Tests that in flight transactions are committed and that commit messages +// are durable before a compaction is allowed to flush the tablet metadata. +// +// This test is in preparation for KUDU-120 and should pass before and after +// it, but was also confirmed to fail if the pre-conditions it tests for +// fail. That is if KUDU-120 is implemented without these pre-requisites +// this test is confirmed to fail. +TEST_F(TabletServerTest, TestKudu120PreRequisites) { + + // Insert a few rows... + InsertTestRowsRemote(0, 0, 10); + // ... 
now flush ... + ASSERT_OK(tablet_peer_->tablet()->Flush()); + // ... insert a few rows... + InsertTestRowsRemote(0, 10, 10); + // ... and flush again so that we have two disk row sets. + ASSERT_OK(tablet_peer_->tablet()->Flush()); + + // Add a hook so that we can make the log wait right after an append + // (before the callback is triggered). + log::Log* log = tablet_peer_->log(); + shared_ptr log_hook(new DelayFsyncLogHook); + log->SetLogFaultHooksForTests(log_hook); + + // Now start a transaction (delete) and stop just before commit. + scoped_refptr thread1; + CHECK_OK(kudu::Thread::Create("DeleteThread", "DeleteThread", + DeleteOneRowAsync, this, &thread1)); + + // Wait for the replicate message to arrive and continue. + log_hook->Continue(); + // Wait a few msecs to make sure that the transaction is + // trying to commit. + usleep(100* 1000); // 100 msecs + + // Now start a compaction before letting the commit message go through. + scoped_refptr flush_thread; + CountDownLatch flush_done_latch(1); + CHECK_OK(kudu::Thread::Create("CompactThread", "CompactThread", + CompactAsync, + tablet_peer_->tablet(), + &flush_done_latch, + &flush_thread)); + + // At this point we have both a compaction and a transaction going on. + // If we allow the transaction to return before the commit message is + // durable (KUDU-120) that means that the mvcc transaction will no longer + // be in flight at this moment, nonetheless since we're blocking the WAL + // and not allowing the commit message to go through, the compaction should + // be forced to wait. + // + // We are thus testing two conditions: + // - That in-flight transactions are committed. + // - That commit messages for transactions that were in flight are durable. + // + // If these pre-conditions are not met, i.e. 
if the compaction is not forced + // to wait here for the conditions to be true, then the below assertion + // will fail, since the transaction's commit write callback will only + // return when we allow it (in log_hook->Continue()); + CHECK(!flush_done_latch.WaitFor(MonoDelta::FromMilliseconds(300))); + + // Now let the rest go through. + log_hook->Continue(); + log_hook->Continue(); + flush_done_latch.Wait(); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/tablet_server.cc b/src/kudu/tserver/tablet_server.cc new file mode 100644 index 000000000000..14841e7b488b --- /dev/null +++ b/src/kudu/tserver/tablet_server.cc @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tserver/tablet_server.h" + +#include +#include +#include + +#include "kudu/cfile/block_cache.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/rpc/service_if.h" +#include "kudu/server/rpc_server.h" +#include "kudu/server/webserver.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tserver/heartbeater.h" +#include "kudu/tserver/scanners.h" +#include "kudu/tserver/tablet_service.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/tserver/tserver-path-handlers.h" +#include "kudu/tserver/remote_bootstrap_service.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +using kudu::rpc::ServiceIf; +using kudu::tablet::TabletPeer; +using std::shared_ptr; +using std::vector; + +namespace kudu { +namespace tserver { + +TabletServer::TabletServer(const TabletServerOptions& opts) + : ServerBase("TabletServer", opts, "kudu.tabletserver"), + initted_(false), + fail_heartbeats_for_tests_(false), + opts_(opts), + tablet_manager_(new TSTabletManager(fs_manager_.get(), this, metric_registry())), + scanner_manager_(new ScannerManager(metric_entity())), + path_handlers_(new TabletServerPathHandlers(this)), + maintenance_manager_(new MaintenanceManager(MaintenanceManager::DEFAULT_OPTIONS)) { +} + +TabletServer::~TabletServer() { + Shutdown(); +} + +string TabletServer::ToString() const { + // TODO: include port numbers, etc. 
+ return "TabletServer"; +} + +Status TabletServer::ValidateMasterAddressResolution() const { + for (const HostPort& master_addr : opts_.master_addresses) { + RETURN_NOT_OK_PREPEND(master_addr.ResolveAddresses(NULL), + strings::Substitute( + "Couldn't resolve master service address '$0'", + master_addr.ToString())); + } + return Status::OK(); +} + +Status TabletServer::Init() { + CHECK(!initted_); + + cfile::BlockCache::GetSingleton()->StartInstrumentation(metric_entity()); + + // Validate that the passed master address actually resolves. + // We don't validate that we can connect at this point -- it should + // be allowed to start the TS and the master in whichever order -- + // our heartbeat thread will loop until successfully connecting. + RETURN_NOT_OK(ValidateMasterAddressResolution()); + + RETURN_NOT_OK(ServerBase::Init()); + RETURN_NOT_OK(path_handlers_->Register(web_server_.get())); + + heartbeater_.reset(new Heartbeater(opts_, this)); + + RETURN_NOT_OK_PREPEND(tablet_manager_->Init(), + "Could not init Tablet Manager"); + + RETURN_NOT_OK_PREPEND(scanner_manager_->StartRemovalThread(), + "Could not start expired Scanner removal thread"); + + initted_ = true; + return Status::OK(); +} + +Status TabletServer::WaitInited() { + return tablet_manager_->WaitForAllBootstrapsToFinish(); +} + +Status TabletServer::Start() { + CHECK(initted_); + + gscoped_ptr ts_service(new TabletServiceImpl(this)); + gscoped_ptr admin_service(new TabletServiceAdminImpl(this)); + gscoped_ptr consensus_service(new ConsensusServiceImpl(metric_entity(), + tablet_manager_.get())); + gscoped_ptr remote_bootstrap_service( + new RemoteBootstrapServiceImpl(fs_manager_.get(), tablet_manager_.get(), metric_entity())); + + RETURN_NOT_OK(ServerBase::RegisterService(ts_service.Pass())); + RETURN_NOT_OK(ServerBase::RegisterService(admin_service.Pass())); + RETURN_NOT_OK(ServerBase::RegisterService(consensus_service.Pass())); + 
RETURN_NOT_OK(ServerBase::RegisterService(remote_bootstrap_service.Pass())); + RETURN_NOT_OK(ServerBase::Start()); + + RETURN_NOT_OK(heartbeater_->Start()); + RETURN_NOT_OK(maintenance_manager_->Init()); + + google::FlushLogFiles(google::INFO); // Flush the startup messages. + + return Status::OK(); +} + +void TabletServer::Shutdown() { + LOG(INFO) << "TabletServer shutting down..."; + + if (initted_) { + maintenance_manager_->Shutdown(); + WARN_NOT_OK(heartbeater_->Stop(), "Failed to stop TS Heartbeat thread"); + ServerBase::Shutdown(); + tablet_manager_->Shutdown(); + } + + LOG(INFO) << "TabletServer shut down complete. Bye!"; +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/tablet_server.h b/src/kudu/tserver/tablet_server.h new file mode 100644 index 000000000000..b8f3b7754486 --- /dev/null +++ b/src/kudu/tserver/tablet_server.h @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_TABLET_SERVER_H +#define KUDU_TSERVER_TABLET_SERVER_H + +#include +#include +#include + +#include "kudu/consensus/metadata.pb.h" +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/server/server_base.h" +#include "kudu/server/webserver_options.h" +#include "kudu/tserver/tablet_server_options.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MaintenanceManager; + +namespace tserver { + +class Heartbeater; +class ScannerManager; +class TabletServerPathHandlers; +class TSTabletManager; + +class TabletServer : public server::ServerBase { + public: + // TODO: move this out of this header, since clients want to use this + // constant as well. + static const uint16_t kDefaultPort = 7050; + static const uint16_t kDefaultWebPort = 8050; + + explicit TabletServer(const TabletServerOptions& opts); + ~TabletServer(); + + // Initializes the tablet server, including the bootstrapping of all + // existing tablets. + // Some initialization tasks are asynchronous, such as the bootstrapping + // of tablets. Caller can block, waiting for the initialization to fully + // complete by calling WaitInited(). + Status Init(); + + // Waits for the tablet server to complete the initialization. 
+ Status WaitInited(); + + Status Start(); + void Shutdown(); + + std::string ToString() const; + + TSTabletManager* tablet_manager() { return tablet_manager_.get(); } + + ScannerManager* scanner_manager() { return scanner_manager_.get(); } + + Heartbeater* heartbeater() { return heartbeater_.get(); } + + void set_fail_heartbeats_for_tests(bool fail_heartbeats_for_tests) { + base::subtle::NoBarrier_Store(&fail_heartbeats_for_tests_, 1); + } + + bool fail_heartbeats_for_tests() const { + return base::subtle::NoBarrier_Load(&fail_heartbeats_for_tests_); + } + + MaintenanceManager* maintenance_manager() { + return maintenance_manager_.get(); + } + + private: + friend class TabletServerTestBase; + + Status ValidateMasterAddressResolution() const; + + bool initted_; + + // If true, all heartbeats will be seen as failed. + Atomic32 fail_heartbeats_for_tests_; + + // The options passed at construction time. + const TabletServerOptions opts_; + + // Manager for tablets which are available on this server. + gscoped_ptr tablet_manager_; + + // Manager for open scanners from clients. + // This is always non-NULL. It is scoped only to minimize header + // dependencies. + gscoped_ptr scanner_manager_; + + // Thread responsible for heartbeating to the master. + gscoped_ptr heartbeater_; + + // Webserver path handlers + gscoped_ptr path_handlers_; + + // The maintenance manager for this tablet server + std::shared_ptr maintenance_manager_; + + DISALLOW_COPY_AND_ASSIGN(TabletServer); +}; + +} // namespace tserver +} // namespace kudu +#endif diff --git a/src/kudu/tserver/tablet_server_main.cc b/src/kudu/tserver/tablet_server_main.cc new file mode 100644 index 000000000000..8e35ae51316b --- /dev/null +++ b/src/kudu/tserver/tablet_server_main.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/flags.h" +#include "kudu/util/init.h" +#include "kudu/util/logging.h" + +using kudu::tserver::TabletServer; + +DECLARE_string(rpc_bind_addresses); +DECLARE_int32(rpc_num_service_threads); +DECLARE_int32(webserver_port); + +namespace kudu { +namespace tserver { + +static int TabletServerMain(int argc, char** argv) { + InitKuduOrDie(); + + // Reset some default values before parsing gflags. 
+ FLAGS_rpc_bind_addresses = strings::Substitute("0.0.0.0:$0", + TabletServer::kDefaultPort); + FLAGS_rpc_num_service_threads = 20; + FLAGS_webserver_port = TabletServer::kDefaultWebPort; + + ParseCommandLineFlags(&argc, &argv, true); + if (argc != 1) { + std::cerr << "usage: " << argv[0] << std::endl; + return 1; + } + InitGoogleLoggingSafe(argv[0]); + + TabletServerOptions opts; + TabletServer server(opts); + LOG(INFO) << "Initializing tablet server..."; + CHECK_OK(server.Init()); + + LOG(INFO) << "Starting tablet server..."; + CHECK_OK(server.Start()); + + LOG(INFO) << "Tablet server successfully started."; + while (true) { + SleepFor(MonoDelta::FromSeconds(60)); + } + + return 0; +} + +} // namespace tserver +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::tserver::TabletServerMain(argc, argv); +} diff --git a/src/kudu/tserver/tablet_server_options.cc b/src/kudu/tserver/tablet_server_options.cc new file mode 100644 index 000000000000..70b6fc407734 --- /dev/null +++ b/src/kudu/tserver/tablet_server_options.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tserver/tablet_server_options.h" + +#include +#include + +#include "kudu/master/master.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/flag_tags.h" + +namespace kudu { +namespace tserver { + +DEFINE_string(tserver_master_addrs, "127.0.0.1:7051", + "Comma separated addresses of the masters which the " + "tablet server should connect to. The masters do not " + "read this flag -- configure the masters separately " + "using 'rpc_bind_addresses'."); +TAG_FLAG(tserver_master_addrs, stable); + + +TabletServerOptions::TabletServerOptions() { + rpc_opts.default_port = TabletServer::kDefaultPort; + + Status s = HostPort::ParseStrings(FLAGS_tserver_master_addrs, + master::Master::kDefaultPort, + &master_addresses); + if (!s.ok()) { + LOG(FATAL) << "Couldn't parse tablet_server_master_addrs flag: " << s.ToString(); + } +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/tablet_server_options.h b/src/kudu/tserver/tablet_server_options.h new file mode 100644 index 000000000000..1ca13897acf1 --- /dev/null +++ b/src/kudu/tserver/tablet_server_options.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_TABLET_SERVER_OPTIONS_H +#define KUDU_TSERVER_TABLET_SERVER_OPTIONS_H + +#include + +#include "kudu/server/server_base_options.h" +#include "kudu/util/net/net_util.h" + +namespace kudu { +namespace tserver { + +// Options for constructing a tablet server. +// These are filled in by gflags by default -- see the .cc file for +// the list of options and corresponding flags. +// +// This allows tests to easily start miniclusters with different +// tablet servers having different options. +struct TabletServerOptions : public kudu::server::ServerBaseOptions { + TabletServerOptions(); + + std::vector master_addresses; +}; + +} // namespace tserver +} // namespace kudu +#endif /* KUDU_TSERVER_TABLET_SERVER_OPTIONS_H */ diff --git a/src/kudu/tserver/tablet_server_test_util.cc b/src/kudu/tserver/tablet_server_test_util.cc new file mode 100644 index 000000000000..5caf5071b84c --- /dev/null +++ b/src/kudu/tserver/tablet_server_test_util.cc @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tserver/tablet_server_test_util.h" + +#include "kudu/consensus/consensus.proxy.h" +#include "kudu/rpc/messenger.h" +#include "kudu/server/server_base.proxy.h" +#include "kudu/tserver/tserver_admin.proxy.h" +#include "kudu/tserver/tserver_service.proxy.h" + +namespace kudu { +namespace tserver { + +using consensus::ConsensusServiceProxy; +using rpc::Messenger; +using std::shared_ptr; + +void CreateTsClientProxies(const Sockaddr& addr, + const shared_ptr& messenger, + gscoped_ptr* proxy, + gscoped_ptr* admin_proxy, + gscoped_ptr* consensus_proxy, + gscoped_ptr* generic_proxy) { + proxy->reset(new TabletServerServiceProxy(messenger, addr)); + admin_proxy->reset(new TabletServerAdminServiceProxy(messenger, addr)); + consensus_proxy->reset(new ConsensusServiceProxy(messenger, addr)); + generic_proxy->reset(new server::GenericServiceProxy(messenger, addr)); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/tablet_server_test_util.h b/src/kudu/tserver/tablet_server_test_util.h new file mode 100644 index 000000000000..6901c1e146d8 --- /dev/null +++ b/src/kudu/tserver/tablet_server_test_util.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_TABLET_SERVER_TEST_UTIL_H_ +#define KUDU_TSERVER_TABLET_SERVER_TEST_UTIL_H_ + +#include + +#include "kudu/gutil/gscoped_ptr.h" + +namespace kudu { +class Sockaddr; + +namespace consensus { +class ConsensusServiceProxy; +} + +namespace rpc { +class Messenger; +} + +namespace server { +class GenericServiceProxy; +} + +namespace tserver { +class TabletServerAdminServiceProxy; +class TabletServerServiceProxy; + +// Create tablet server client proxies for tests. +void CreateTsClientProxies(const Sockaddr& addr, + const std::shared_ptr& messenger, + gscoped_ptr* proxy, + gscoped_ptr* admin_proxy, + gscoped_ptr* consensus_proxy, + gscoped_ptr* generic_proxy); + +} // namespace tserver +} // namespace kudu + +#endif // KUDU_TSERVER_TABLET_SERVER_TEST_UTIL_H_ diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc new file mode 100644 index 000000000000..10d8f3b7c0af --- /dev/null +++ b/src/kudu/tserver/tablet_service.cc @@ -0,0 +1,1697 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/tserver/tablet_service.h" + +#include +#include +#include +#include +#include + +#include "kudu/common/iterator.h" +#include "kudu/common/schema.h" +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus.h" +#include "kudu/gutil/bind.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/rpc/rpc_context.h" +#include "kudu/rpc/rpc_sidecar.h" +#include "kudu/server/hybrid_clock.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tserver/remote_bootstrap_service.h" +#include "kudu/tablet/metadata.pb.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tablet/tablet_metrics.h" +#include "kudu/tablet/transactions/alter_schema_transaction.h" +#include "kudu/tablet/transactions/write_transaction.h" +#include "kudu/tserver/scanners.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/crc.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/faststring.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/logging.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" +#include "kudu/util/status_callback.h" +#include "kudu/util/trace.h" + +DEFINE_int32(scanner_default_batch_size_bytes, 1024 * 1024, + "The default size for batches of scan results"); +TAG_FLAG(scanner_default_batch_size_bytes, advanced); +TAG_FLAG(scanner_default_batch_size_bytes, runtime); + +DEFINE_int32(scanner_max_batch_size_bytes, 8 * 1024 * 1024, + "The maximum batch size that a client may request for " + "scan results."); +TAG_FLAG(scanner_max_batch_size_bytes, advanced); +TAG_FLAG(scanner_max_batch_size_bytes, runtime); + +DEFINE_int32(scanner_batch_size_rows, 100, + "The number of rows to batch for servicing scan requests."); +TAG_FLAG(scanner_batch_size_rows, advanced); 
+TAG_FLAG(scanner_batch_size_rows, runtime); + +// Fault injection flags. +DEFINE_int32(scanner_inject_latency_on_each_batch_ms, 0, + "If set, the scanner will pause the specified number of milliesconds " + "before reading each batch of data on the tablet server. " + "Used for tests."); +TAG_FLAG(scanner_inject_latency_on_each_batch_ms, unsafe); + +DECLARE_int32(memory_limit_warn_threshold_percentage); + +namespace kudu { +namespace tserver { + +using consensus::ChangeConfigRequestPB; +using consensus::ChangeConfigResponsePB; +using consensus::CONSENSUS_CONFIG_ACTIVE; +using consensus::CONSENSUS_CONFIG_COMMITTED; +using consensus::Consensus; +using consensus::ConsensusConfigType; +using consensus::ConsensusRequestPB; +using consensus::ConsensusResponsePB; +using consensus::GetLastOpIdRequestPB; +using consensus::GetNodeInstanceRequestPB; +using consensus::GetNodeInstanceResponsePB; +using consensus::LeaderStepDownRequestPB; +using consensus::LeaderStepDownResponsePB; +using consensus::RunLeaderElectionRequestPB; +using consensus::RunLeaderElectionResponsePB; +using consensus::StartRemoteBootstrapRequestPB; +using consensus::StartRemoteBootstrapResponsePB; +using consensus::VoteRequestPB; +using consensus::VoteResponsePB; + +using google::protobuf::RepeatedPtrField; +using rpc::RpcContext; +using std::shared_ptr; +using std::vector; +using strings::Substitute; +using tablet::AlterSchemaTransactionState; +using tablet::Tablet; +using tablet::TabletPeer; +using tablet::TabletStatusPB; +using tablet::TransactionCompletionCallback; +using tablet::WriteTransactionState; + +namespace { + +// Lookup the given tablet, ensuring that it both exists and is RUNNING. +// If it is not, responds to the RPC associated with 'context' after setting +// resp->mutable_error() to indicate the failure reason. +// +// Returns true if successful. 
+template +bool LookupTabletPeerOrRespond(TabletPeerLookupIf* tablet_manager, + const string& tablet_id, + RespClass* resp, + rpc::RpcContext* context, + scoped_refptr* peer) { + if (PREDICT_FALSE(!tablet_manager->GetTabletPeer(tablet_id, peer).ok())) { + SetupErrorAndRespond(resp->mutable_error(), + Status::NotFound("Tablet not found"), + TabletServerErrorPB::TABLET_NOT_FOUND, context); + return false; + } + + // Check RUNNING state. + tablet::TabletStatePB state = (*peer)->state(); + if (PREDICT_FALSE(state != tablet::RUNNING)) { + Status s = Status::IllegalState("Tablet not RUNNING", + tablet::TabletStatePB_Name(state)); + if (state == tablet::FAILED) { + s = s.CloneAndAppend((*peer)->error().ToString()); + } + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::TABLET_NOT_RUNNING, context); + return false; + } + return true; +} + +template +bool CheckUuidMatchOrRespond(TabletPeerLookupIf* tablet_manager, + const char* method_name, + const ReqClass* req, + RespClass* resp, + rpc::RpcContext* context) { + const string& local_uuid = tablet_manager->NodeInstance().permanent_uuid(); + if (PREDICT_FALSE(!req->has_dest_uuid())) { + // Maintain compat in release mode, but complain. + string msg = Substitute("$0: Missing destination UUID in request from $1: $2", + method_name, context->requestor_string(), req->ShortDebugString()); +#ifdef NDEBUG + KLOG_EVERY_N(ERROR, 100) << msg; +#else + LOG(DFATAL) << msg; +#endif + return true; + } + if (PREDICT_FALSE(req->dest_uuid() != local_uuid)) { + Status s = Status::InvalidArgument(Substitute("$0: Wrong destination UUID requested. " + "Local UUID: $1. 
Requested UUID: $2", + method_name, local_uuid, req->dest_uuid())); + LOG(WARNING) << s.ToString() << ": from " << context->requestor_string() + << ": " << req->ShortDebugString(); + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::WRONG_SERVER_UUID, context); + return false; + } + return true; +} + +template +bool GetConsensusOrRespond(const scoped_refptr& tablet_peer, + RespClass* resp, + rpc::RpcContext* context, + scoped_refptr* consensus) { + *consensus = tablet_peer->shared_consensus(); + if (!*consensus) { + Status s = Status::ServiceUnavailable("Consensus unavailable. Tablet not running"); + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::TABLET_NOT_RUNNING, context); + return false; + } + return true; +} + +Status GetTabletRef(const scoped_refptr& tablet_peer, + shared_ptr* tablet, + TabletServerErrorPB::Code* error_code) { + *DCHECK_NOTNULL(tablet) = tablet_peer->shared_tablet(); + if (PREDICT_FALSE(!*tablet)) { + *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; + return Status::IllegalState("Tablet is not running"); + } + return Status::OK(); +} + +template +void HandleUnknownError(const Status& s, RespType* resp, RpcContext* context) { + resp->Clear(); + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); +} + +template +void HandleResponse(const ReqType* req, RespType* resp, + RpcContext* context, const Status& s) { + if (PREDICT_FALSE(!s.ok())) { + HandleUnknownError(s, resp, context); + return; + } + context->RespondSuccess(); +} + +template +static StatusCallback BindHandleResponse(const ReqType* req, RespType* resp, RpcContext* context) { + return Bind(&HandleResponse, req, resp, context); +} + +} // namespace + +typedef ListTabletsResponsePB::StatusAndSchemaPB StatusAndSchemaPB; + +static void SetupErrorAndRespond(TabletServerErrorPB* error, + const Status& s, + TabletServerErrorPB::Code code, + rpc::RpcContext* context) { + // Generic "service 
unavailable" errors will cause the client to retry later. + if (code == TabletServerErrorPB::UNKNOWN_ERROR && s.IsServiceUnavailable()) { + context->RespondRpcFailure(rpc::ErrorStatusPB::ERROR_SERVER_TOO_BUSY, s); + return; + } + + StatusToPB(s, error->mutable_status()); + error->set_code(code); + // TODO: rename RespondSuccess() to just "Respond" or + // "SendResponse" since we use it for application-level error + // responses, and this just looks confusing! + context->RespondSuccess(); +} + +template +void HandleErrorResponse(const ReqType* req, RespType* resp, RpcContext* context, + const boost::optional& error_code, + const Status& s) { + resp->Clear(); + if (error_code) { + SetupErrorAndRespond(resp->mutable_error(), s, *error_code, context); + } else { + HandleUnknownError(s, resp, context); + } +} + +// A transaction completion callback that responds to the client when transactions +// complete and sets the client error if there is one to set. +template +class RpcTransactionCompletionCallback : public TransactionCompletionCallback { + public: + RpcTransactionCompletionCallback(rpc::RpcContext* context, + Response* response) + : context_(context), + response_(response) {} + + virtual void TransactionCompleted() OVERRIDE { + if (!status_.ok()) { + SetupErrorAndRespond(get_error(), status_, code_, context_); + } else { + context_->RespondSuccess(); + } + }; + + private: + + TabletServerErrorPB* get_error() { + return response_->mutable_error(); + } + + rpc::RpcContext* context_; + Response* response_; + tablet::TransactionState* state_; +}; + +// Generic interface to handle scan results. +class ScanResultCollector { + public: + virtual void HandleRowBlock(const Schema* client_projection_schema, + const RowBlock& row_block) = 0; + + // Returns number of times HandleRowBlock() was called. + virtual int BlocksProcessed() const = 0; + + // Returns number of bytes which will be returned in the response. 
+ virtual int64_t ResponseSize() const = 0; + + // Returns the last processed row's primary key. + virtual const faststring& last_primary_key() const = 0; + + // Return the number of rows actually returned to the client. + virtual int64_t NumRowsReturned() const = 0; +}; + +namespace { + +// Given a RowBlock, set last_primary_key to the primary key of the last selected row +// in the RowBlock. If no row is selected, last_primary_key is not set. +void SetLastRow(const RowBlock& row_block, faststring* last_primary_key) { + // Find the last selected row and save its encoded key. + const SelectionVector* sel = row_block.selection_vector(); + if (sel->AnySelected()) { + for (int i = sel->nrows() - 1; i >= 0; i--) { + if (sel->IsRowSelected(i)) { + RowBlockRow last_row = row_block.row(i); + const Schema* schema = last_row.schema(); + schema->EncodeComparableKey(last_row, last_primary_key); + break; + } + } + } +} + +} // namespace + +// Copies the scan result to the given row block PB and data buffers. +// +// This implementation is used in the common case where a client is running +// a scan and the data needs to be returned to the client. +// +// (This is in contrast to some other ScanResultCollector implementation that +// might do an aggregation or gather some other types of statistics via a +// server-side scan and thus never need to return the actual data.) 
+class ScanResultCopier : public ScanResultCollector { + public: + ScanResultCopier(RowwiseRowBlockPB* rowblock_pb, faststring* rows_data, faststring* indirect_data) + : rowblock_pb_(DCHECK_NOTNULL(rowblock_pb)), + rows_data_(DCHECK_NOTNULL(rows_data)), + indirect_data_(DCHECK_NOTNULL(indirect_data)), + blocks_processed_(0), + num_rows_returned_(0) { + } + + virtual void HandleRowBlock(const Schema* client_projection_schema, + const RowBlock& row_block) OVERRIDE { + blocks_processed_++; + num_rows_returned_ += row_block.selection_vector()->CountSelected(); + SerializeRowBlock(row_block, rowblock_pb_, client_projection_schema, + rows_data_, indirect_data_); + SetLastRow(row_block, &last_primary_key_); + } + + virtual int BlocksProcessed() const OVERRIDE { return blocks_processed_; } + + // Returns number of bytes buffered to return. + virtual int64_t ResponseSize() const OVERRIDE { + return rows_data_->size() + indirect_data_->size(); + } + + virtual const faststring& last_primary_key() const OVERRIDE { + return last_primary_key_; + } + + virtual int64_t NumRowsReturned() const OVERRIDE { + return num_rows_returned_; + } + + private: + RowwiseRowBlockPB* const rowblock_pb_; + faststring* const rows_data_; + faststring* const indirect_data_; + int blocks_processed_; + int64_t num_rows_returned_; + faststring last_primary_key_; + + DISALLOW_COPY_AND_ASSIGN(ScanResultCopier); +}; + +// Checksums the scan result. 
+class ScanResultChecksummer : public ScanResultCollector { + public: + ScanResultChecksummer() + : crc_(crc::GetCrc32cInstance()), + agg_checksum_(0), + blocks_processed_(0) { + } + + virtual void HandleRowBlock(const Schema* client_projection_schema, + const RowBlock& row_block) OVERRIDE { + blocks_processed_++; + if (!client_projection_schema) { + client_projection_schema = &row_block.schema(); + } + + size_t nrows = row_block.nrows(); + for (size_t i = 0; i < nrows; i++) { + if (!row_block.selection_vector()->IsRowSelected(i)) continue; + uint32_t row_crc = CalcRowCrc32(*client_projection_schema, row_block.row(i)); + agg_checksum_ += row_crc; + } + // Find the last selected row and save its encoded key. + SetLastRow(row_block, &encoded_last_row_); + } + + virtual int BlocksProcessed() const OVERRIDE { return blocks_processed_; } + + // Returns a constant -- we only return checksum based on a time budget. + virtual int64_t ResponseSize() const OVERRIDE { return sizeof(agg_checksum_); } + + virtual const faststring& last_primary_key() const OVERRIDE { return encoded_last_row_; } + + virtual int64_t NumRowsReturned() const OVERRIDE { + return 0; + } + + // Accessors for initializing / setting the checksum. + void set_agg_checksum(uint64_t value) { agg_checksum_ = value; } + uint64_t agg_checksum() const { return agg_checksum_; } + + private: + // Calculates a CRC32C for the given row. + uint32_t CalcRowCrc32(const Schema& projection, const RowBlockRow& row) { + tmp_buf_.clear(); + + for (size_t j = 0; j < projection.num_columns(); j++) { + uint32_t col_index = static_cast(j); // For the CRC. + tmp_buf_.append(&col_index, sizeof(col_index)); + ColumnBlockCell cell = row.cell(j); + if (cell.is_nullable()) { + uint8_t is_defined = cell.is_null() ? 
0 : 1; + tmp_buf_.append(&is_defined, sizeof(is_defined)); + if (!is_defined) continue; + } + if (cell.typeinfo()->physical_type() == BINARY) { + const Slice* data = reinterpret_cast(cell.ptr()); + tmp_buf_.append(data->data(), data->size()); + } else { + tmp_buf_.append(cell.ptr(), cell.size()); + } + } + + uint64_t row_crc = 0; + crc_->Compute(tmp_buf_.data(), tmp_buf_.size(), &row_crc, nullptr); + return static_cast(row_crc); // CRC32 only uses the lower 32 bits. + } + + + faststring tmp_buf_; + crc::Crc* const crc_; + uint64_t agg_checksum_; + int blocks_processed_; + faststring encoded_last_row_; + + DISALLOW_COPY_AND_ASSIGN(ScanResultChecksummer); +}; + +// Return the batch size to use for a given request, after clamping +// the user-requested request within the server-side allowable range. +// This is only a hint, really more of a threshold since returned bytes +// may exceed this limit, but hopefully only by a little bit. +static size_t GetMaxBatchSizeBytesHint(const ScanRequestPB* req) { + if (!req->has_batch_size_bytes()) { + return FLAGS_scanner_default_batch_size_bytes; + } + + return std::min(req->batch_size_bytes(), + implicit_cast(FLAGS_scanner_max_batch_size_bytes)); +} + +TabletServiceImpl::TabletServiceImpl(TabletServer* server) + : TabletServerServiceIf(server->metric_entity()), + server_(server) { +} + +void TabletServiceImpl::Ping(const PingRequestPB* req, + PingResponsePB* resp, + rpc::RpcContext* context) { + context->RespondSuccess(); +} + +TabletServiceAdminImpl::TabletServiceAdminImpl(TabletServer* server) + : TabletServerAdminServiceIf(server->metric_entity()), + server_(server) { +} + +void TabletServiceAdminImpl::AlterSchema(const AlterSchemaRequestPB* req, + AlterSchemaResponsePB* resp, + rpc::RpcContext* context) { + if (!CheckUuidMatchOrRespond(server_->tablet_manager(), "AlterSchema", req, resp, context)) { + return; + } + DVLOG(3) << "Received Alter Schema RPC: " << req->DebugString(); + + scoped_refptr tablet_peer; + if 
(!LookupTabletPeerOrRespond(server_->tablet_manager(), req->tablet_id(), resp, context, + &tablet_peer)) { + return; + } + + uint32_t schema_version = tablet_peer->tablet_metadata()->schema_version(); + + // If the schema was already applied, respond as succeded + if (schema_version == req->schema_version()) { + // Sanity check, to verify that the tablet should have the same schema + // specified in the request. + Schema req_schema; + Status s = SchemaFromPB(req->schema(), &req_schema); + if (!s.ok()) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::INVALID_SCHEMA, context); + return; + } + + Schema tablet_schema = tablet_peer->tablet_metadata()->schema(); + if (req_schema.Equals(tablet_schema)) { + context->RespondSuccess(); + return; + } + + schema_version = tablet_peer->tablet_metadata()->schema_version(); + if (schema_version == req->schema_version()) { + LOG(ERROR) << "The current schema does not match the request schema." + << " version=" << schema_version + << " current-schema=" << tablet_schema.ToString() + << " request-schema=" << req_schema.ToString() + << " (corruption)"; + SetupErrorAndRespond(resp->mutable_error(), + Status::Corruption("got a different schema for the same version number"), + TabletServerErrorPB::MISMATCHED_SCHEMA, context); + return; + } + } + + // If the current schema is newer than the one in the request reject the request. + if (schema_version > req->schema_version()) { + SetupErrorAndRespond(resp->mutable_error(), + Status::InvalidArgument("Tablet has a newer schema"), + TabletServerErrorPB::TABLET_HAS_A_NEWER_SCHEMA, context); + return; + } + + gscoped_ptr tx_state( + new AlterSchemaTransactionState(tablet_peer.get(), req, resp)); + + tx_state->set_completion_callback(gscoped_ptr( + new RpcTransactionCompletionCallback(context, + resp)).Pass()); + + // Submit the alter schema op. The RPC will be responded to asynchronously. 
+ Status s = tablet_peer->SubmitAlterSchema(tx_state.Pass()); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } +} + +void TabletServiceAdminImpl::CreateTablet(const CreateTabletRequestPB* req, + CreateTabletResponsePB* resp, + rpc::RpcContext* context) { + if (!CheckUuidMatchOrRespond(server_->tablet_manager(), "CreateTablet", req, resp, context)) { + return; + } + TRACE_EVENT1("tserver", "CreateTablet", + "tablet_id", req->tablet_id()); + + Schema schema; + Status s = SchemaFromPB(req->schema(), &schema); + DCHECK(schema.has_column_ids()); + if (!s.ok()) { + SetupErrorAndRespond(resp->mutable_error(), + Status::InvalidArgument("Invalid Schema."), + TabletServerErrorPB::INVALID_SCHEMA, context); + return; + } + + PartitionSchema partition_schema; + s = PartitionSchema::FromPB(req->partition_schema(), schema, &partition_schema); + if (!s.ok()) { + SetupErrorAndRespond(resp->mutable_error(), + Status::InvalidArgument("Invalid PartitionSchema."), + TabletServerErrorPB::INVALID_SCHEMA, context); + return; + } + + Partition partition; + Partition::FromPB(req->partition(), &partition); + + LOG(INFO) << "Processing CreateTablet for tablet " << req->tablet_id() + << " (table=" << req->table_name() + << " [id=" << req->table_id() << "]), partition=" + << partition_schema.PartitionDebugString(partition, schema); + VLOG(1) << "Full request: " << req->DebugString(); + + s = server_->tablet_manager()->CreateNewTablet(req->table_id(), + req->tablet_id(), + partition, + req->table_name(), + schema, + partition_schema, + req->config(), + nullptr); + if (PREDICT_FALSE(!s.ok())) { + TabletServerErrorPB::Code code; + if (s.IsAlreadyPresent()) { + code = TabletServerErrorPB::TABLET_ALREADY_EXISTS; + } else { + code = TabletServerErrorPB::UNKNOWN_ERROR; + } + SetupErrorAndRespond(resp->mutable_error(), s, code, context); + return; + } + context->RespondSuccess(); +} + +void 
TabletServiceAdminImpl::DeleteTablet(const DeleteTabletRequestPB* req, + DeleteTabletResponsePB* resp, + rpc::RpcContext* context) { + if (!CheckUuidMatchOrRespond(server_->tablet_manager(), "DeleteTablet", req, resp, context)) { + return; + } + TRACE_EVENT2("tserver", "DeleteTablet", + "tablet_id", req->tablet_id(), + "reason", req->reason()); + + tablet::TabletDataState delete_type = tablet::TABLET_DATA_UNKNOWN; + if (req->has_delete_type()) { + delete_type = req->delete_type(); + } + LOG(INFO) << "Processing DeleteTablet for tablet " << req->tablet_id() + << " with delete_type " << TabletDataState_Name(delete_type) + << (req->has_reason() ? (" (" + req->reason() + ")") : "") + << " from " << context->requestor_string(); + VLOG(1) << "Full request: " << req->DebugString(); + + boost::optional cas_config_opid_index_less_or_equal; + if (req->has_cas_config_opid_index_less_or_equal()) { + cas_config_opid_index_less_or_equal = req->cas_config_opid_index_less_or_equal(); + } + boost::optional error_code; + Status s = server_->tablet_manager()->DeleteTablet(req->tablet_id(), + delete_type, + cas_config_opid_index_less_or_equal, + &error_code); + if (PREDICT_FALSE(!s.ok())) { + HandleErrorResponse(req, resp, context, error_code, s); + return; + } + context->RespondSuccess(); +} + +void TabletServiceImpl::Write(const WriteRequestPB* req, + WriteResponsePB* resp, + rpc::RpcContext* context) { + TRACE_EVENT1("tserver", "TabletServiceImpl::Write", + "tablet_id", req->tablet_id()); + DVLOG(3) << "Received Write RPC: " << req->DebugString(); + + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(server_->tablet_manager(), req->tablet_id(), resp, context, + &tablet_peer)) { + return; + } + + shared_ptr tablet; + TabletServerErrorPB::Code error_code; + Status s = GetTabletRef(tablet_peer, &tablet, &error_code); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, error_code, context); + return; + } + + // Check for memory pressure; don't 
bother doing any additional work if we've + // exceeded the limit. + double capacity_pct; + if (tablet->mem_tracker()->AnySoftLimitExceeded(&capacity_pct)) { + tablet->metrics()->leader_memory_pressure_rejections->Increment(); + string msg = StringPrintf( + "Soft memory limit exceeded (at %.2f%% of capacity)", + capacity_pct); + if (capacity_pct >= FLAGS_memory_limit_warn_threshold_percentage) { + KLOG_EVERY_N_SECS(WARNING, 1) << "Rejecting Write request: " << msg << THROTTLE_MSG; + } else { + KLOG_EVERY_N_SECS(INFO, 1) << "Rejecting Write request: " << msg << THROTTLE_MSG; + } + SetupErrorAndRespond(resp->mutable_error(), Status::ServiceUnavailable(msg), + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + + if (!server_->clock()->SupportsExternalConsistencyMode(req->external_consistency_mode())) { + Status s = Status::NotSupported("The configured clock does not support the" + " required consistency mode."); + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + + auto tx_state = new WriteTransactionState(tablet_peer.get(), req, resp); + + // If the client sent us a timestamp, decode it and update the clock so that all future + // timestamps are greater than the passed timestamp. + if (req->has_propagated_timestamp()) { + Timestamp ts(req->propagated_timestamp()); + s = server_->clock()->Update(ts); + } + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + + tx_state->set_completion_callback(gscoped_ptr( + new RpcTransactionCompletionCallback(context, + resp)).Pass()); + + // Submit the write. The RPC will be responded to asynchronously. 
+ s = tablet_peer->SubmitWrite(tx_state); + + // Check that we could submit the write + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + } + return; +} + +ConsensusServiceImpl::ConsensusServiceImpl(const scoped_refptr& metric_entity, + TabletPeerLookupIf* tablet_manager) + : ConsensusServiceIf(metric_entity), + tablet_manager_(tablet_manager) { +} + +ConsensusServiceImpl::~ConsensusServiceImpl() { +} + +void ConsensusServiceImpl::UpdateConsensus(const ConsensusRequestPB* req, + ConsensusResponsePB* resp, + rpc::RpcContext* context) { + DVLOG(3) << "Received Consensus Update RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "UpdateConsensus", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, &tablet_peer)) { + return; + } + + tablet_peer->permanent_uuid(); + + // Submit the update directly to the TabletPeer's Consensus instance. + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + Status s = consensus->Update(req, resp); + if (PREDICT_FALSE(!s.ok())) { + // Clear the response first, since a partially-filled response could + // result in confusing a caller, or in having missing required fields + // in embedded optional messages. 
+ resp->Clear(); + + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + context->RespondSuccess(); +} + +void ConsensusServiceImpl::RequestConsensusVote(const VoteRequestPB* req, + VoteResponsePB* resp, + rpc::RpcContext* context) { + DVLOG(3) << "Received Consensus Request Vote RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "RequestConsensusVote", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, &tablet_peer)) { + return; + } + + // Submit the vote request directly to the consensus instance. + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + Status s = consensus->RequestVote(req, resp); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + context->RespondSuccess(); +} + +void ConsensusServiceImpl::ChangeConfig(const ChangeConfigRequestPB* req, + ChangeConfigResponsePB* resp, + RpcContext* context) { + DVLOG(3) << "Received ChangeConfig RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "ChangeConfig", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, + &tablet_peer)) { + return; + } + + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + boost::optional error_code; + Status s = consensus->ChangeConfig(*req, BindHandleResponse(req, resp, context), &error_code); + if (PREDICT_FALSE(!s.ok())) { + HandleErrorResponse(req, resp, context, error_code, s); + return; + } + // The success case is handled when the callback fires. 
+} + +void ConsensusServiceImpl::GetNodeInstance(const GetNodeInstanceRequestPB* req, + GetNodeInstanceResponsePB* resp, + rpc::RpcContext* context) { + DVLOG(3) << "Received Get Node Instance RPC: " << req->DebugString(); + resp->mutable_node_instance()->CopyFrom(tablet_manager_->NodeInstance()); + context->RespondSuccess(); +} + +void ConsensusServiceImpl::RunLeaderElection(const RunLeaderElectionRequestPB* req, + RunLeaderElectionResponsePB* resp, + rpc::RpcContext* context) { + DVLOG(3) << "Received Run Leader Election RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "RunLeaderElection", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, &tablet_peer)) { + return; + } + + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + Status s = consensus->StartElection( + consensus::Consensus::ELECT_EVEN_IF_LEADER_IS_ALIVE); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + context->RespondSuccess(); +} + +void ConsensusServiceImpl::LeaderStepDown(const LeaderStepDownRequestPB* req, + LeaderStepDownResponsePB* resp, + RpcContext* context) { + DVLOG(3) << "Received Leader stepdown RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "LeaderStepDown", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, &tablet_peer)) { + return; + } + + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + Status s = consensus->StepDown(resp); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + context->RespondSuccess(); +} + +void 
ConsensusServiceImpl::GetLastOpId(const consensus::GetLastOpIdRequestPB *req, + consensus::GetLastOpIdResponsePB *resp, + rpc::RpcContext *context) { + DVLOG(3) << "Received GetLastOpId RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "GetLastOpId", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, &tablet_peer)) { + return; + } + + if (tablet_peer->state() != tablet::RUNNING) { + SetupErrorAndRespond(resp->mutable_error(), + Status::ServiceUnavailable("Tablet Peer not in RUNNING state"), + TabletServerErrorPB::TABLET_NOT_RUNNING, context); + return; + } + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + if (PREDICT_FALSE(req->opid_type() == consensus::UNKNOWN_OPID_TYPE)) { + HandleUnknownError(Status::InvalidArgument("Invalid opid_type specified to GetLastOpId()"), + resp, context); + return; + } + Status s = consensus->GetLastOpId(req->opid_type(), resp->mutable_opid()); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + context->RespondSuccess(); +} + +void ConsensusServiceImpl::GetConsensusState(const consensus::GetConsensusStateRequestPB *req, + consensus::GetConsensusStateResponsePB *resp, + rpc::RpcContext *context) { + DVLOG(3) << "Received GetConsensusState RPC: " << req->DebugString(); + if (!CheckUuidMatchOrRespond(tablet_manager_, "GetConsensusState", req, resp, context)) { + return; + } + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(tablet_manager_, req->tablet_id(), resp, context, &tablet_peer)) { + return; + } + + scoped_refptr consensus; + if (!GetConsensusOrRespond(tablet_peer, resp, context, &consensus)) return; + ConsensusConfigType type = req->type(); + if (PREDICT_FALSE(type != CONSENSUS_CONFIG_ACTIVE && type != CONSENSUS_CONFIG_COMMITTED)) { + 
HandleUnknownError( + Status::InvalidArgument(Substitute("Unsupported ConsensusConfigType $0 ($1)", + ConsensusConfigType_Name(type), type)), + resp, context); + return; + } + *resp->mutable_cstate() = consensus->ConsensusState(req->type()); + context->RespondSuccess(); +} + +void ConsensusServiceImpl::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB* req, + StartRemoteBootstrapResponsePB* resp, + rpc::RpcContext* context) { + if (!CheckUuidMatchOrRespond(tablet_manager_, "StartRemoteBootstrap", req, resp, context)) { + return; + } + Status s = tablet_manager_->StartRemoteBootstrap(*req); + if (!s.ok()) { + SetupErrorAndRespond(resp->mutable_error(), s, + TabletServerErrorPB::UNKNOWN_ERROR, + context); + return; + } + context->RespondSuccess(); +} + +void TabletServiceImpl::ScannerKeepAlive(const ScannerKeepAliveRequestPB *req, + ScannerKeepAliveResponsePB *resp, + rpc::RpcContext *context) { + DCHECK(req->has_scanner_id()); + SharedScanner scanner; + if (!server_->scanner_manager()->LookupScanner(req->scanner_id(), &scanner)) { + resp->mutable_error()->set_code(TabletServerErrorPB::SCANNER_EXPIRED); + StatusToPB(Status::NotFound("Scanner not found"), + resp->mutable_error()->mutable_status()); + return; + } + scanner->UpdateAccessTime(); + context->RespondSuccess(); +} + + +void TabletServiceImpl::Scan(const ScanRequestPB* req, + ScanResponsePB* resp, + rpc::RpcContext* context) { + TRACE_EVENT0("tserver", "TabletServiceImpl::Scan"); + // Validate the request: user must pass a new_scan_request or + // a scanner ID, but not both. 
+ if (PREDICT_FALSE(req->has_scanner_id() && + req->has_new_scan_request())) { + context->RespondFailure(Status::InvalidArgument( + "Must not pass both a scanner_id and new_scan_request")); + return; + } + + size_t batch_size_bytes = GetMaxBatchSizeBytesHint(req); + gscoped_ptr rows_data(new faststring(batch_size_bytes * 11 / 10)); + gscoped_ptr indirect_data(new faststring(batch_size_bytes * 11 / 10)); + RowwiseRowBlockPB data; + ScanResultCopier collector(&data, rows_data.get(), indirect_data.get()); + + bool has_more_results = false; + TabletServerErrorPB::Code error_code; + if (req->has_new_scan_request()) { + const NewScanRequestPB& scan_pb = req->new_scan_request(); + scoped_refptr tablet_peer; + if (!LookupTabletPeerOrRespond(server_->tablet_manager(), scan_pb.tablet_id(), resp, context, + &tablet_peer)) { + return; + } + string scanner_id; + Timestamp scan_timestamp; + Status s = HandleNewScanRequest(tablet_peer.get(), req, context, + &collector, &scanner_id, &scan_timestamp, &has_more_results, + &error_code); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, error_code, context); + return; + } + + // Only set the scanner id if we have more results. 
+ if (has_more_results) { + resp->set_scanner_id(scanner_id); + } + if (scan_timestamp != Timestamp::kInvalidTimestamp) { + resp->set_snap_timestamp(scan_timestamp.ToUint64()); + } + } else if (req->has_scanner_id()) { + Status s = HandleContinueScanRequest(req, &collector, &has_more_results, &error_code); + if (PREDICT_FALSE(!s.ok())) { + SetupErrorAndRespond(resp->mutable_error(), s, error_code, context); + return; + } + } else { + context->RespondFailure(Status::InvalidArgument( + "Must pass either a scanner_id or new_scan_request")); + return; + } + resp->set_has_more_results(has_more_results); + + DVLOG(2) << "Blocks processed: " << collector.BlocksProcessed(); + if (collector.BlocksProcessed() > 0) { + resp->mutable_data()->CopyFrom(data); + + // Add sidecar data to context and record the returned indices. + int rows_idx; + CHECK_OK(context->AddRpcSidecar(make_gscoped_ptr( + new rpc::RpcSidecar(rows_data.Pass())), &rows_idx)); + resp->mutable_data()->set_rows_sidecar(rows_idx); + + // Add indirect data as a sidecar, if applicable. + if (indirect_data->size() > 0) { + int indirect_idx; + CHECK_OK(context->AddRpcSidecar(make_gscoped_ptr( + new rpc::RpcSidecar(indirect_data.Pass())), &indirect_idx)); + resp->mutable_data()->set_indirect_data_sidecar(indirect_idx); + } + + // Set the last row found by the collector. + // We could have an empty batch if all the remaining rows are filtered by the predicate, + // in which case do not set the last row. 
+ const faststring& last = collector.last_primary_key(); + if (last.length() > 0) { + resp->set_last_primary_key(last.ToString()); + } + } + + context->RespondSuccess(); +} + +void TabletServiceImpl::ListTablets(const ListTabletsRequestPB* req, + ListTabletsResponsePB* resp, + rpc::RpcContext* context) { + vector > peers; + server_->tablet_manager()->GetTabletPeers(&peers); + RepeatedPtrField* peer_status = resp->mutable_status_and_schema(); + for (const scoped_refptr& peer : peers) { + StatusAndSchemaPB* status = peer_status->Add(); + peer->GetTabletStatusPB(status->mutable_tablet_status()); + CHECK_OK(SchemaToPB(peer->status_listener()->schema(), + status->mutable_schema())); + peer->tablet_metadata()->partition_schema().ToPB(status->mutable_partition_schema()); + } + context->RespondSuccess(); +} + +void TabletServiceImpl::Checksum(const ChecksumRequestPB* req, + ChecksumResponsePB* resp, + rpc::RpcContext* context) { + VLOG(1) << "Full request: " << req->DebugString(); + + // Validate the request: user must pass a new_scan_request or + // a scanner ID, but not both. + if (PREDICT_FALSE(req->has_new_request() && + req->has_continue_request())) { + context->RespondFailure(Status::InvalidArgument( + "Must not pass both a scanner_id and new_scan_request")); + return; + } + + // Convert ChecksumRequestPB to a ScanRequestPB. 
// NOTE(review): the lines down to RespondSuccess() are the tail of a method whose
// signature appears earlier in the file; it drives a checksum scan via a
// ScanResultChecksummer, so it is presumably TabletServiceImpl::Checksum —
// confirm against the declaration in tablet_service.h.
// NOTE(review): template arguments in this extract (e.g. scoped_refptr<...>,
// static_cast<...>) appear to have been stripped by the patch extraction and
// must be restored from the upstream source before compiling.
  // Re-pack the checksum request's scan parameters into an ordinary
  // ScanRequestPB so the shared scan-handling helpers below can be reused.
  ScanRequestPB scan_req;
  if (req->has_call_seq_id()) scan_req.set_call_seq_id(req->call_seq_id());
  if (req->has_batch_size_bytes()) scan_req.set_batch_size_bytes(req->batch_size_bytes());
  if (req->has_close_scanner()) scan_req.set_close_scanner(req->close_scanner());

  ScanResultChecksummer collector;
  bool has_more = false;
  // Set by HandleNewScanRequest/HandleContinueScanRequest only on failure;
  // read only on the failure paths below.
  TabletServerErrorPB::Code error_code;
  if (req->has_new_request()) {
    // First round-trip: open a server-side scanner and checksum the first batch.
    scan_req.mutable_new_scan_request()->CopyFrom(req->new_request());
    const NewScanRequestPB& new_req = req->new_request();
    scoped_refptr tablet_peer;
    if (!LookupTabletPeerOrRespond(server_->tablet_manager(), new_req.tablet_id(), resp, context,
                                   &tablet_peer)) {
      // LookupTabletPeerOrRespond has already sent the RPC error response.
      return;
    }

    string scanner_id;
    Timestamp snap_timestamp;
    Status s = HandleNewScanRequest(tablet_peer.get(), &scan_req, context,
                                    &collector, &scanner_id, &snap_timestamp, &has_more,
                                    &error_code);
    if (PREDICT_FALSE(!s.ok())) {
      SetupErrorAndRespond(resp->mutable_error(), s, error_code, context);
      return;
    }
    // Hand the scanner id back so the client can continue the checksum scan.
    resp->set_scanner_id(scanner_id);
    if (snap_timestamp != Timestamp::kInvalidTimestamp) {
      resp->set_snap_timestamp(snap_timestamp.ToUint64());
    }
  } else if (req->has_continue_request()) {
    // Subsequent round-trip: resume the existing scanner, seeding the
    // accumulator with the checksum computed so far.
    const ContinueChecksumRequestPB& continue_req = req->continue_request();
    collector.set_agg_checksum(continue_req.previous_checksum());
    scan_req.set_scanner_id(continue_req.scanner_id());
    Status s = HandleContinueScanRequest(&scan_req, &collector, &has_more, &error_code);
    if (PREDICT_FALSE(!s.ok())) {
      SetupErrorAndRespond(resp->mutable_error(), s, error_code, context);
      return;
    }
  } else {
    context->RespondFailure(Status::InvalidArgument(
        "Must pass either new_request or continue_request"));
    return;
  }

  resp->set_checksum(collector.agg_checksum());
  resp->set_has_more_results(has_more);

  context->RespondSuccess();
}

// No teardown needed here; server-owned resources are released elsewhere.
void TabletServiceImpl::Shutdown() {
}

// Extract a void* pointer suitable for use in a ColumnRangePredicate from the
// user-specified protobuf field.
// This validates that the pb_value has the correct length, copies the data into
// 'arena', and sets *result to point to it.
// Returns bad status if the user-specified value is the wrong length.
static Status ExtractPredicateValue(const ColumnSchema& schema,
                                    const string& pb_value,
                                    Arena* arena,
                                    const void** result) {
  // Copy the data from the protobuf into the Arena.
  // The copy is needed because the predicate may outlive the request protobuf.
  uint8_t* data_copy = static_cast(arena->AllocateBytes(pb_value.size()));
  memcpy(data_copy, &pb_value[0], pb_value.size());

  // If the type is of variable length, then we need to return a pointer to a Slice
  // element pointing to the string. Otherwise, just verify that the provided
  // value was the right size.
  if (schema.type_info()->physical_type() == BINARY) {
    // NOTE(review): NewObject's template argument (presumably Slice) was
    // stripped by extraction — confirm against upstream.
    *result = arena->NewObject(data_copy, pb_value.size());
  } else {
    // TODO: add test case for this invalid request
    size_t expected_size = schema.type_info()->size();
    if (pb_value.size() != expected_size) {
      return Status::InvalidArgument(
          StringPrintf("Bad predicate on %s. Expected value size %zd, got %zd",
                       schema.ToString().c_str(), expected_size, pb_value.size()));
    }
    *result = data_copy;
  }

  return Status::OK();
}

// Apply the encoded primary-key bounds from 'scan_pb' to 'spec'.
// Decoded keys are allocated on the scanner's arena and handed to the
// scanner's autorelease pool so they live as long as the scanner.
static Status DecodeEncodedKeyRange(const NewScanRequestPB& scan_pb,
                                    const Schema& tablet_schema,
                                    const SharedScanner& scanner,
                                    ScanSpec* spec) {
  gscoped_ptr start, stop;
  if (scan_pb.has_start_primary_key()) {
    RETURN_NOT_OK_PREPEND(EncodedKey::DecodeEncodedString(
                            tablet_schema, scanner->arena(),
                            scan_pb.start_primary_key(), &start),
                          "Invalid scan start key");
  }

  if (scan_pb.has_stop_primary_key()) {
    RETURN_NOT_OK_PREPEND(EncodedKey::DecodeEncodedString(
                            tablet_schema, scanner->arena(),
                            scan_pb.stop_primary_key(), &stop),
                          "Invalid scan stop key");
  }

  // For a resumed ORDERED scan, the client sends back the last key it saw.
  if (scan_pb.order_mode() == ORDERED && scan_pb.has_last_primary_key()) {
    if (start) {
      return Status::InvalidArgument("Cannot specify both a start key and a last key");
    }
    // Set the start key to the last key from a previous scan result.
    RETURN_NOT_OK_PREPEND(EncodedKey::DecodeEncodedString(tablet_schema, scanner->arena(),
                                                          scan_pb.last_primary_key(), &start),
                          "Failed to decode last primary key");
    // Increment the start key, so we don't return the last row again.
    RETURN_NOT_OK_PREPEND(EncodedKey::IncrementEncodedKey(tablet_schema, &start, scanner->arena()),
                          "Failed to increment encoded last row key");
  }

  if (start) {
    // Lower bound is inclusive; ownership moves to the scanner's pool.
    spec->SetLowerBoundKey(start.get());
    scanner->autorelease_pool()->Add(start.release());
  }
  if (stop) {
    // Upper bound is exclusive.
    spec->SetExclusiveUpperBoundKey(stop.get());
    scanner->autorelease_pool()->Add(stop.release());
  }

  return Status::OK();
}

// Build the ScanSpec for a new scan from the request protobuf: column range
// predicates first, then encoded primary-key bounds. Columns referenced by
// predicates (and, for ORDERED scans, key columns) that are absent from the
// client projection are appended to 'missing_cols' so the caller can add them
// to the physical projection.
static Status SetupScanSpec(const NewScanRequestPB& scan_pb,
                            const Schema& tablet_schema,
                            const Schema& projection,
                            vector* missing_cols,
                            gscoped_ptr* spec,
                            const SharedScanner& scanner) {
  gscoped_ptr ret(new ScanSpec);
  ret->set_cache_blocks(scan_pb.cache_blocks());

  // Tracks names already appended to 'missing_cols' to avoid duplicates.
  unordered_set missing_col_names;

  // First the column range predicates.
  for (const ColumnRangePredicatePB& pred_pb : scan_pb.range_predicates()) {
    if (!pred_pb.has_lower_bound() && !pred_pb.has_upper_bound()) {
      return Status::InvalidArgument(
          string("Invalid predicate ") + pred_pb.ShortDebugString() +
          ": has no lower or upper bound.");
    }
    ColumnSchema col(ColumnSchemaFromPB(pred_pb.column()));
    if (projection.find_column(col.name()) == -1 &&
        !ContainsKey(missing_col_names, col.name())) {
      missing_cols->push_back(col);
      InsertOrDie(&missing_col_names, col.name());
    }

    const void* lower_bound = nullptr;
    const void* upper_bound = nullptr;
    if (pred_pb.has_lower_bound()) {
      const void* val;
      RETURN_NOT_OK(ExtractPredicateValue(col, pred_pb.lower_bound(),
                                          scanner->arena(),
                                          &val));
      lower_bound = val;
    } else {
      lower_bound = nullptr;
    }
    if (pred_pb.has_upper_bound()) {
      const void* val;
      RETURN_NOT_OK(ExtractPredicateValue(col, pred_pb.upper_bound(),
                                          scanner->arena(),
                                          &val));
      upper_bound = val;
    } else {
      upper_bound = nullptr;
    }

    ColumnRangePredicate pred(col, lower_bound, upper_bound);
    if (VLOG_IS_ON(3)) {
      VLOG(3) << "Parsed predicate " << pred.ToString() << " from " << scan_pb.ShortDebugString();
    }
    ret->AddPredicate(pred);
  }

  // When doing an ordered scan, we need to include the key columns to be able to encode
  // the last row key for the scan response.
  if (scan_pb.order_mode() == kudu::ORDERED &&
      projection.num_key_columns() != tablet_schema.num_key_columns()) {
    for (int i = 0; i < tablet_schema.num_key_columns(); i++) {
      const ColumnSchema &col = tablet_schema.column(i);
      if (projection.find_column(col.name()) == -1 &&
          !ContainsKey(missing_col_names, col.name())) {
        missing_cols->push_back(col);
        InsertOrDie(&missing_col_names, col.name());
      }
    }
  }
  // Then any encoded key range predicates.
  RETURN_NOT_OK(DecodeEncodedKeyRange(scan_pb, tablet_schema, scanner, ret.get()));

  spec->swap(ret);
  return Status::OK();
}

// Start a new scan.
// Registers a scanner, validates the projection and read/order mode, builds
// the scan spec and row iterator, and (when the client hinted a non-zero
// batch size) immediately fetches the first batch via
// HandleContinueScanRequest. On failure, *error_code is set and the scanner
// is unregistered by 'unreg_scanner'.
Status TabletServiceImpl::HandleNewScanRequest(TabletPeer* tablet_peer,
                                               const ScanRequestPB* req,
                                               const RpcContext* rpc_context,
                                               ScanResultCollector* result_collector,
                                               std::string* scanner_id,
                                               Timestamp* snap_timestamp,
                                               bool* has_more_results,
                                               TabletServerErrorPB::Code* error_code) {
  DCHECK(result_collector != nullptr);
  DCHECK(error_code != nullptr);
  DCHECK(req->has_new_scan_request());
  const NewScanRequestPB& scan_pb = req->new_scan_request();
  TRACE_EVENT1("tserver", "TabletServiceImpl::HandleNewScanRequest",
               "tablet_id", scan_pb.tablet_id());

  const Schema& tablet_schema = tablet_peer->tablet_metadata()->schema();

  SharedScanner scanner;
  server_->scanner_manager()->NewScanner(tablet_peer,
                                         rpc_context->requestor_string(),
                                         &scanner);

  // If we early-exit out of this function, automatically unregister
  // the scanner.
  ScopedUnregisterScanner unreg_scanner(server_->scanner_manager(), scanner->id());

  // Create the user's requested projection.
  // TODO: add test cases for bad projections including 0 columns
  Schema projection;
  Status s = ColumnPBsToSchema(scan_pb.projected_columns(), &projection);
  if (PREDICT_FALSE(!s.ok())) {
    *error_code = TabletServerErrorPB::INVALID_SCHEMA;
    return s;
  }

  // Clients address columns by name; column IDs are a server-internal concept.
  if (projection.has_column_ids()) {
    *error_code = TabletServerErrorPB::INVALID_SCHEMA;
    return Status::InvalidArgument("User requests should not have Column IDs");
  }

  if (scan_pb.order_mode() == ORDERED) {
    // Ordered scans must be at a snapshot so that we perform a serializable read (which can be
    // resumed). Otherwise, this would be read committed isolation, which is not resumable.
    if (scan_pb.read_mode() != READ_AT_SNAPSHOT) {
      *error_code = TabletServerErrorPB::INVALID_SNAPSHOT;
      return Status::InvalidArgument("Cannot do an ordered scan that is not a snapshot read");
    }
  }

  gscoped_ptr spec(new ScanSpec);

  // Missing columns will contain the columns that are not mentioned in the client
  // projection but are actually needed for the scan, such as columns referred to by
  // predicates or key columns (if this is an ORDERED scan).
  vector missing_cols;
  s = SetupScanSpec(scan_pb, tablet_schema, projection, &missing_cols, &spec, scanner);
  if (PREDICT_FALSE(!s.ok())) {
    *error_code = TabletServerErrorPB::INVALID_SCAN_SPEC;
    return s;
  }

  // Store the original projection.
  // The scanner keeps the client's projection so results can be trimmed back
  // to exactly what the client asked for.
  gscoped_ptr orig_projection(new Schema(projection));
  scanner->set_client_projection_schema(orig_projection.Pass());

  // Build a new projection with the projection columns and the missing columns. Make
  // sure to set whether the column is a key column appropriately.
  SchemaBuilder projection_builder;
  vector projection_columns = projection.columns();
  for (const ColumnSchema& col : missing_cols) {
    projection_columns.push_back(col);
  }
  for (const ColumnSchema& col : projection_columns) {
    CHECK_OK(projection_builder.AddColumn(col, tablet_schema.is_key_column(col.name())));
  }
  projection = projection_builder.BuildWithoutIds();

  gscoped_ptr iter;
  // Preset the error code for when creating the iterator on the tablet fails
  TabletServerErrorPB::Code tmp_error_code = TabletServerErrorPB::MISMATCHED_SCHEMA;

  shared_ptr tablet;
  RETURN_NOT_OK(GetTabletRef(tablet_peer, &tablet, error_code));
  {
    TRACE("Creating iterator");
    TRACE_EVENT0("tserver", "Create iterator");

    switch (scan_pb.read_mode()) {
      case UNKNOWN_READ_MODE: {
        *error_code = TabletServerErrorPB::INVALID_SCAN_SPEC;
        s = Status::NotSupported("Unknown read mode.");
        return s;
      }
      case READ_LATEST: {
        s = tablet->NewRowIterator(projection, &iter);
        break;
      }
      case READ_AT_SNAPSHOT: {
        s = HandleScanAtSnapshot(scan_pb, rpc_context, projection, tablet, &iter, snap_timestamp);
        if (!s.ok()) {
          tmp_error_code = TabletServerErrorPB::INVALID_SNAPSHOT;
        }
      }
      // NOTE(review): this TRACE sits inside the switch body after the last
      // case; it is reached only by fall-through from READ_AT_SNAPSHOT — the
      // READ_LATEST 'break' jumps past it. Looks like it was meant to go
      // after the switch; confirm against upstream before moving it.
      TRACE("Iterator created");
    }
  }

  if (PREDICT_TRUE(s.ok())) {
    TRACE_EVENT0("tserver", "iter->Init");
    s = iter->Init(spec.get());
  }

  TRACE("Iterator init: $0", s.ToString());

  if (PREDICT_FALSE(s.IsInvalidArgument())) {
    // An invalid projection returns InvalidArgument above.
    // TODO: would be nice if we threaded these more specific
    // error codes throughout Kudu.
    *error_code = tmp_error_code;
    return s;
  } else if (PREDICT_FALSE(!s.ok())) {
    LOG(WARNING) << "Error setting up scanner with request " << req->ShortDebugString();
    *error_code = TabletServerErrorPB::UNKNOWN_ERROR;
    return s;
  }

  *has_more_results = iter->HasNext();
  TRACE("has_more: $0", *has_more_results);
  if (!*has_more_results) {
    // If there are no more rows, we can short circuit some work and respond immediately.
    VLOG(1) << "No more rows, short-circuiting out without creating a server-side scanner.";
    return Status::OK();
  }

  // From here on the scanner owns the iterator and spec; keep it registered.
  scanner->Init(iter.Pass(), spec.Pass());
  unreg_scanner.Cancel();
  *scanner_id = scanner->id();

  VLOG(1) << "Started scanner " << scanner->id() << ": " << scanner->iter()->ToString();

  size_t batch_size_bytes = GetMaxBatchSizeBytesHint(req);
  if (batch_size_bytes > 0) {
    TRACE("Continuing scan request");
    // TODO: instead of copying the pb, instead split HandleContinueScanRequest
    // and call the second half directly
    ScanRequestPB continue_req(*req);
    continue_req.set_scanner_id(scanner->id());
    RETURN_NOT_OK(HandleContinueScanRequest(&continue_req, result_collector, has_more_results,
                                            error_code));
  } else {
    // Increment the scanner call sequence ID. HandleContinueScanRequest handles
    // this in the non-empty scan case.
    scanner->IncrementCallSeqId();
  }
  return Status::OK();
}

// Continue an existing scan request.
// Looks up the scanner by id, validates the call sequence number, pulls row
// blocks into 'result_collector' until either the response-size hint or a
// ~500ms time budget is hit, then updates per-tablet scan metrics. The
// scanner is unregistered when the scan is exhausted or explicitly closed.
Status TabletServiceImpl::HandleContinueScanRequest(const ScanRequestPB* req,
                                                    ScanResultCollector* result_collector,
                                                    bool* has_more_results,
                                                    TabletServerErrorPB::Code* error_code) {
  DCHECK(req->has_scanner_id());
  TRACE_EVENT1("tserver", "TabletServiceImpl::HandleContinueScanRequest",
               "scanner_id", req->scanner_id());

  size_t batch_size_bytes = GetMaxBatchSizeBytesHint(req);

  // TODO: need some kind of concurrency control on these scanner objects
  // in case multiple RPCs hit the same scanner at the same time. Probably
  // just a trylock and fail the RPC if it contends.
  SharedScanner scanner;
  if (!server_->scanner_manager()->LookupScanner(req->scanner_id(), &scanner)) {
    if (batch_size_bytes == 0 && req->close_scanner()) {
      // A request to close a non-existent scanner.
      // Treated as success so close is idempotent (e.g. after expiry).
      return Status::OK();
    } else {
      *error_code = TabletServerErrorPB::SCANNER_EXPIRED;
      return Status::NotFound("Scanner not found");
    }
  }

  // If we early-exit out of this function, automatically unregister the scanner.
  ScopedUnregisterScanner unreg_scanner(server_->scanner_manager(), scanner->id());

  VLOG(2) << "Found existing scanner " << scanner->id() << " for request: "
          << req->ShortDebugString();
  TRACE("Found scanner $0", scanner->id());

  // A zero batch size plus close_scanner means "just close it"; the scanner
  // is removed by unreg_scanner on return.
  if (batch_size_bytes == 0 && req->close_scanner()) {
    *has_more_results = false;
    return Status::OK();
  }

  // Reject out-of-order or duplicate continuation calls.
  if (req->call_seq_id() != scanner->call_seq_id()) {
    *error_code = TabletServerErrorPB::INVALID_SCAN_CALL_SEQ_ID;
    return Status::InvalidArgument("Invalid call sequence ID in scan request");
  }
  scanner->IncrementCallSeqId();
  scanner->UpdateAccessTime();

  RowwiseIterator* iter = scanner->iter();

  // TODO: could size the RowBlock based on the user's requested batch size?
  // If people had really large indirect objects, we would currently overshoot
  // their requested batch size by a lot.
  Arena arena(32 * 1024, 1 * 1024 * 1024);
  RowBlock block(scanner->iter()->schema(),
                 FLAGS_scanner_batch_size_rows, &arena);

  // TODO: in the future, use the client timeout to set a budget. For now,
  // just use a half second, which should be plenty to amortize call overhead.
  int budget_ms = 500;
  MonoTime deadline = MonoTime::Now(MonoTime::COARSE);
  deadline.AddDelta(MonoDelta::FromMilliseconds(budget_ms));

  int64_t rows_scanned = 0;
  while (iter->HasNext()) {
    // Test hook: simulate slow batches.
    if (PREDICT_FALSE(FLAGS_scanner_inject_latency_on_each_batch_ms > 0)) {
      SleepFor(MonoDelta::FromMilliseconds(FLAGS_scanner_inject_latency_on_each_batch_ms));
    }

    Status s = iter->NextBlock(&block);
    if (PREDICT_FALSE(!s.ok())) {
      // NOTE(review): this message reads like a progress statement but is
      // logged on failure — it presumably should start with "Error"; left
      // unchanged here since log text is runtime behavior.
      LOG(WARNING) << "Copying rows from internal iterator for request " << req->ShortDebugString();
      *error_code = TabletServerErrorPB::UNKNOWN_ERROR;
      return s;
    }

    if (PREDICT_TRUE(block.nrows() > 0)) {
      // Count the number of rows scanned, regardless of predicates or deletions.
      // The collector will separately count the number of rows actually returned to
      // the client.
      rows_scanned += block.nrows();
      result_collector->HandleRowBlock(scanner->client_projection_schema(), block);
    }

    int64_t response_size = result_collector->ResponseSize();

    if (VLOG_IS_ON(2)) {
      // This may be fairly expensive if row block size is small
      TRACE("Copied block (nrows=$0), new size=$1", block.nrows(), response_size);
    }

    // TODO: should check if RPC got cancelled, once we implement RPC cancellation.
    MonoTime now = MonoTime::Now(MonoTime::COARSE);
    if (PREDICT_FALSE(!now.ComesBefore(deadline))) {
      TRACE("Deadline expired - responding early");
      break;
    }

    if (response_size >= batch_size_bytes) {
      break;
    }
  }

  // Update metrics based on this scan request.
  scoped_refptr tablet_peer = scanner->tablet_peer();
  shared_ptr tablet;
  RETURN_NOT_OK(GetTabletRef(tablet_peer, &tablet, error_code));

  // First, the number of rows/cells/bytes actually returned to the user.
  tablet->metrics()->scanner_rows_returned->IncrementBy(
      result_collector->NumRowsReturned());
  tablet->metrics()->scanner_cells_returned->IncrementBy(
      result_collector->NumRowsReturned() * scanner->client_projection_schema()->num_columns());
  tablet->metrics()->scanner_bytes_returned->IncrementBy(
      result_collector->ResponseSize());

  // Then the number of rows/cells/bytes actually processed. Here we have to dig
  // into the per-column iterator stats, sum them up, and then subtract out the
  // total that we already reported in a previous scan.
  vector stats_by_col;
  scanner->GetIteratorStats(&stats_by_col);
  IteratorStats total_stats;
  for (const IteratorStats& stats : stats_by_col) {
    total_stats.AddStats(stats);
  }
  // Report only the delta since the previous continuation of this scanner.
  IteratorStats delta_stats = total_stats;
  delta_stats.SubtractStats(scanner->already_reported_stats());
  scanner->set_already_reported_stats(total_stats);

  tablet->metrics()->scanner_rows_scanned->IncrementBy(
      rows_scanned);
  tablet->metrics()->scanner_cells_scanned_from_disk->IncrementBy(
      delta_stats.cells_read_from_disk);
  tablet->metrics()->scanner_bytes_scanned_from_disk->IncrementBy(
      delta_stats.bytes_read_from_disk);

  scanner->UpdateAccessTime();
  *has_more_results = !req->close_scanner() && iter->HasNext();
  if (*has_more_results) {
    // Keep the scanner registered for the next continuation call.
    unreg_scanner.Cancel();
  } else {
    VLOG(2) << "Scanner " << scanner->id() << " complete: removing...";
  }

  return Status::OK();
}

// Set up a READ_AT_SNAPSHOT scan: pick (or validate) the snapshot timestamp,
// wait for in-flight transactions below it to commit, and create a row
// iterator over the resulting MVCC snapshot. On success, *snap_timestamp is
// the timestamp actually used (which is echoed back to the client).
Status TabletServiceImpl::HandleScanAtSnapshot(const NewScanRequestPB& scan_pb,
                                               const RpcContext* rpc_context,
                                               const Schema& projection,
                                               const shared_ptr& tablet,
                                               gscoped_ptr* iter,
                                               Timestamp* snap_timestamp) {

  // TODO check against the earliest boundary (i.e. how early can we go) right
  // now we're keeping all undos/redos forever!

  // If the client sent a timestamp update our clock with it.
  if (scan_pb.has_propagated_timestamp()) {
    Timestamp propagated_timestamp(scan_pb.propagated_timestamp());

    // Update the clock so that we never generate snapshots lower that
    // 'propagated_timestamp'. If 'propagated_timestamp' is lower than
    // 'now' this call has no effect. If 'propagated_timestamp' is too much
    // into the future this will fail and we abort.
    RETURN_NOT_OK(server_->clock()->Update(propagated_timestamp));
  }

  Timestamp tmp_snap_timestamp;

  // If the client provided no snapshot timestamp we take the current clock
  // time as the snapshot timestamp.
  if (!scan_pb.has_snap_timestamp()) {
    tmp_snap_timestamp = server_->clock()->Now();
    // ... else we use the client provided one, but make sure it is not too far
    // in the future as to be invalid.
  } else {
    tmp_snap_timestamp.FromUint64(scan_pb.snap_timestamp());
    Timestamp max_allowed_ts;
    Status s = server_->clock()->GetGlobalLatest(&max_allowed_ts);
    if (!s.ok()) {
      // The clock implementation cannot bound global time (e.g. no HLC),
      // so client-chosen snapshot timestamps cannot be validated.
      return Status::NotSupported("Snapshot scans not supported on this server",
                                  s.ToString());
    }
    if (tmp_snap_timestamp.CompareTo(max_allowed_ts) > 0) {
      return Status::InvalidArgument(
          Substitute("Snapshot time $0 in the future. Max allowed timestamp is $1",
                     server_->clock()->Stringify(tmp_snap_timestamp),
                     server_->clock()->Stringify(max_allowed_ts)));
    }
  }

  tablet::MvccSnapshot snap;

  // Wait for the in-flights in the snapshot to be finished.
  // We'll use the client-provided deadline, but not if it's more than 5 seconds from
  // now -- it's better to make the client retry than hold RPC threads busy.
  //
  // TODO(KUDU-1127): even this may not be sufficient -- perhaps we should check how long it
  // has been since the MVCC manager was able to advance its safe time. If it has been
  // a long time, it's likely that the majority of voters for this tablet are down
  // and some writes are "stuck" and therefore won't be committed.
  MonoTime client_deadline = rpc_context->GetClientDeadline();
  // Subtract a little bit from the client deadline so that it's more likely we actually
  // have time to send our response sent back before it times out.
  client_deadline.AddDelta(MonoDelta::FromMilliseconds(-10));

  // Cap the wait at 5 seconds regardless of the client's deadline.
  MonoTime deadline = MonoTime::Now(MonoTime::FINE);
  deadline.AddDelta(MonoDelta::FromSeconds(5));
  if (client_deadline.ComesBefore(deadline)) {
    deadline = client_deadline;
  }

  TRACE("Waiting for operations in snapshot to commit");
  MonoTime before = MonoTime::Now(MonoTime::FINE);
  RETURN_NOT_OK_PREPEND(
      tablet->mvcc_manager()->WaitForCleanSnapshotAtTimestamp(
          tmp_snap_timestamp, &snap, deadline),
      "could not wait for desired snapshot timestamp to be consistent");

  uint64_t duration_usec = MonoTime::Now(MonoTime::FINE).GetDeltaSince(before).ToMicroseconds();
  tablet->metrics()->snapshot_read_inflight_wait_duration->Increment(duration_usec);
  TRACE("All operations in snapshot committed. Waited for $0 microseconds", duration_usec);

  tablet::Tablet::OrderMode order;
  switch (scan_pb.order_mode()) {
    case UNORDERED: order = tablet::Tablet::UNORDERED; break;
    case ORDERED: order = tablet::Tablet::ORDERED; break;
    // LOG(FATAL) aborts, so 'order' cannot be read uninitialized below.
    default: LOG(FATAL) << "Unexpected order mode.";
  }
  RETURN_NOT_OK(tablet->NewRowIterator(projection, snap, order, iter));
  *snap_timestamp = tmp_snap_timestamp;
  return Status::OK();
}

} // namespace tserver
} // namespace kudu
diff --git a/src/kudu/tserver/tablet_service.h b/src/kudu/tserver/tablet_service.h
new file mode 100644
index 000000000000..8f5846e1584d
--- /dev/null
+++ b/src/kudu/tserver/tablet_service.h
@@ -0,0 +1,170 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef KUDU_TSERVER_TABLET_SERVICE_H
#define KUDU_TSERVER_TABLET_SERVICE_H

// NOTE(review): the angle-bracket system headers below were stripped by the
// patch extraction (likely <memory>, <string>, <vector>) — restore from
// upstream.
#include
#include
#include

#include "kudu/consensus/consensus.service.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/tserver/tserver_admin.service.h"
#include "kudu/tserver/tserver_service.service.h"

namespace kudu {
class RowwiseIterator;
class Schema;
class Status;
class Timestamp;

namespace tablet {
class Tablet;
class TabletPeer;
class TransactionState;
} // namespace tablet

namespace tserver {

class ScanResultCollector;
class TabletPeerLookupIf;
class TabletServer;

// RPC service implementing the data-path tablet server interface:
// reads (Scan/Checksum), writes, and tablet listing.
class TabletServiceImpl : public TabletServerServiceIf {
 public:
  explicit TabletServiceImpl(TabletServer* server);

  virtual void Ping(const PingRequestPB* req,
                    PingResponsePB* resp,
                    rpc::RpcContext* context) OVERRIDE;

  virtual void Write(const WriteRequestPB* req, WriteResponsePB* resp,
                     rpc::RpcContext* context) OVERRIDE;

  virtual void Scan(const ScanRequestPB* req,
                    ScanResponsePB* resp,
                    rpc::RpcContext* context) OVERRIDE;

  virtual void ScannerKeepAlive(const ScannerKeepAliveRequestPB *req,
                                ScannerKeepAliveResponsePB *resp,
                                rpc::RpcContext *context) OVERRIDE;

  virtual void ListTablets(const ListTabletsRequestPB* req,
                           ListTabletsResponsePB* resp,
                           rpc::RpcContext* context) OVERRIDE;

  virtual void Checksum(const ChecksumRequestPB* req,
                        ChecksumResponsePB* resp,
                        rpc::RpcContext* context) OVERRIDE;

  virtual void Shutdown() OVERRIDE;

 private:
  // Scan implementation helpers; see tablet_service.cc for contracts.
  Status HandleNewScanRequest(tablet::TabletPeer* tablet_peer,
                              const ScanRequestPB* req,
                              const rpc::RpcContext* rpc_context,
                              ScanResultCollector* result_collector,
                              std::string* scanner_id,
                              Timestamp* snap_timestamp,
                              bool* has_more_results,
                              TabletServerErrorPB::Code* error_code);

  Status HandleContinueScanRequest(const ScanRequestPB* req,
                                   ScanResultCollector* result_collector,
                                   bool* has_more_results,
                                   TabletServerErrorPB::Code* error_code);

  Status HandleScanAtSnapshot(const NewScanRequestPB& scan_pb,
                              const rpc::RpcContext* rpc_context,
                              const Schema& projection,
                              const std::shared_ptr& tablet,
                              gscoped_ptr* iter,
                              Timestamp* snap_timestamp);

  TabletServer* server_;
};

// Administrative operations on tablets (create/delete/alter schema).
class TabletServiceAdminImpl : public TabletServerAdminServiceIf {
 public:
  explicit TabletServiceAdminImpl(TabletServer* server);
  virtual void CreateTablet(const CreateTabletRequestPB* req,
                            CreateTabletResponsePB* resp,
                            rpc::RpcContext* context) OVERRIDE;

  virtual void DeleteTablet(const DeleteTabletRequestPB* req,
                            DeleteTabletResponsePB* resp,
                            rpc::RpcContext* context) OVERRIDE;

  virtual void AlterSchema(const AlterSchemaRequestPB* req,
                           AlterSchemaResponsePB* resp,
                           rpc::RpcContext* context) OVERRIDE;

 private:
  TabletServer* server_;
};

// Raft consensus RPC endpoints; dispatches to per-tablet consensus instances
// via the TabletPeerLookupIf.
class ConsensusServiceImpl : public consensus::ConsensusServiceIf {
 public:
  ConsensusServiceImpl(const scoped_refptr& metric_entity,
                       TabletPeerLookupIf* tablet_manager_);

  virtual ~ConsensusServiceImpl();

  virtual void UpdateConsensus(const consensus::ConsensusRequestPB *req,
                               consensus::ConsensusResponsePB *resp,
                               rpc::RpcContext *context) OVERRIDE;

  virtual void RequestConsensusVote(const consensus::VoteRequestPB* req,
                                    consensus::VoteResponsePB* resp,
                                    rpc::RpcContext* context) OVERRIDE;

  virtual void ChangeConfig(const consensus::ChangeConfigRequestPB* req,
                            consensus::ChangeConfigResponsePB* resp,
                            rpc::RpcContext* context) OVERRIDE;

  virtual void GetNodeInstance(const consensus::GetNodeInstanceRequestPB* req,
                               consensus::GetNodeInstanceResponsePB* resp,
                               rpc::RpcContext* context) OVERRIDE;

  virtual void RunLeaderElection(const consensus::RunLeaderElectionRequestPB* req,
                                 consensus::RunLeaderElectionResponsePB* resp,
                                 rpc::RpcContext* context) OVERRIDE;

  virtual void LeaderStepDown(const consensus::LeaderStepDownRequestPB* req,
                              consensus::LeaderStepDownResponsePB* resp,
                              rpc::RpcContext* context) OVERRIDE;

  virtual void GetLastOpId(const consensus::GetLastOpIdRequestPB *req,
                           consensus::GetLastOpIdResponsePB *resp,
                           rpc::RpcContext *context) OVERRIDE;

  virtual void GetConsensusState(const consensus::GetConsensusStateRequestPB *req,
                                 consensus::GetConsensusStateResponsePB *resp,
                                 rpc::RpcContext *context) OVERRIDE;

  virtual void StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB* req,
                                    consensus::StartRemoteBootstrapResponsePB* resp,
                                    rpc::RpcContext* context) OVERRIDE;

 private:
  TabletPeerLookupIf* tablet_manager_;
};

} // namespace tserver
} // namespace kudu

#endif
diff --git a/src/kudu/tserver/ts_tablet_manager-test.cc b/src/kudu/tserver/ts_tablet_manager-test.cc
new file mode 100644
index 000000000000..b0585410d934
--- /dev/null
+++ b/src/kudu/tserver/ts_tablet_manager-test.cc
@@ -0,0 +1,241 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "kudu/tserver/ts_tablet_manager.h"

// NOTE(review): the angle-bracket system headers below were stripped by the
// patch extraction (likely <string> and a gtest/gmock header) — restore from
// upstream.
#include
#include

#include "kudu/common/partition.h"
#include "kudu/common/schema.h"
#include "kudu/consensus/metadata.pb.h"
#include "kudu/fs/fs_manager.h"
#include "kudu/master/master.pb.h"
#include "kudu/tablet/tablet_peer.h"
#include "kudu/tablet/tablet-test-util.h"
#include "kudu/tserver/mini_tablet_server.h"
#include "kudu/tserver/tablet_server.h"
#include "kudu/util/test_util.h"

// Wrappers so that a failing assertion inside the helper aborts the
// enclosing test case as well.
#define ASSERT_REPORT_HAS_UPDATED_TABLET(report, tablet_id) \
  ASSERT_NO_FATAL_FAILURE(AssertReportHasUpdatedTablet(report, tablet_id))

#define ASSERT_MONOTONIC_REPORT_SEQNO(report_seqno, tablet_report) \
  ASSERT_NO_FATAL_FAILURE(AssertMonotonicReportSeqno(report_seqno, tablet_report))

namespace kudu {
namespace tserver {

using consensus::kInvalidOpIdIndex;
using consensus::RaftConfigPB;
using master::ReportedTabletPB;
using master::TabletReportPB;
using tablet::TabletPeer;

static const char* const kTabletId = "my-tablet-id";


// Test fixture that runs a MiniTabletServer (with heartbeats disabled so the
// tests control tablet-report generation) and exercises TSTabletManager.
class TsTabletManagerTest : public KuduTest {
 public:
  TsTabletManagerTest()
    : schema_({ ColumnSchema("key", UINT32) }, 1) {
  }

  virtual void SetUp() OVERRIDE {
    KuduTest::SetUp();

    mini_server_.reset(
        new MiniTabletServer(GetTestPath("TsTabletManagerTest-fsroot"), 0));
    ASSERT_OK(mini_server_->Start());
    // Reports are generated manually in the tests, not via heartbeats.
    mini_server_->FailHeartbeats();

    config_ = mini_server_->CreateLocalConfig();

    tablet_manager_ = mini_server_->server()->tablet_manager();
    fs_manager_ = mini_server_->server()->fs_manager();
  }

  // Create a single-replica tablet covering the default (whole-keyspace)
  // partition and wait up to 2s for its consensus to start.
  Status CreateNewTablet(const std::string& tablet_id,
                         const Schema& schema,
                         scoped_refptr* out_tablet_peer) {
    Schema full_schema = SchemaBuilder(schema).Build();
    std::pair partition = tablet::CreateDefaultPartition(full_schema);

    scoped_refptr tablet_peer;
    RETURN_NOT_OK(tablet_manager_->CreateNewTablet(tablet_id, tablet_id, partition.second,
                                                   tablet_id,
                                                   full_schema, partition.first,
                                                   config_,
                                                   &tablet_peer));
    if (out_tablet_peer) {
      (*out_tablet_peer) = tablet_peer;
    }

    return tablet_peer->WaitUntilConsensusRunning(MonoDelta::FromMilliseconds(2000));
  }

 protected:
  gscoped_ptr mini_server_;
  // Owned by mini_server_.
  FsManager* fs_manager_;
  // Owned by mini_server_.
  TSTabletManager* tablet_manager_;

  Schema schema_;
  RaftConfigPB config_;
};

// Create a tablet, restart the server, and verify the tablet is reloaded
// from disk.
TEST_F(TsTabletManagerTest, TestCreateTablet) {
  // Create a new tablet.
  scoped_refptr peer;
  ASSERT_OK(CreateNewTablet(kTabletId, schema_, &peer));
  ASSERT_EQ(kTabletId, peer->tablet()->tablet_id());
  peer.reset();

  // Re-load the tablet manager from the filesystem.
  LOG(INFO) << "Shutting down tablet manager";
  mini_server_->Shutdown();
  LOG(INFO) << "Restarting tablet manager";
  mini_server_.reset(
      new MiniTabletServer(GetTestPath("TsTabletManagerTest-fsroot"), 0));
  ASSERT_OK(mini_server_->Start());
  ASSERT_OK(mini_server_->WaitStarted());
  tablet_manager_ = mini_server_->server()->tablet_manager();

  // Ensure that the tablet got re-loaded and re-opened off disk.
  ASSERT_TRUE(tablet_manager_->LookupTablet(kTabletId, &peer));
  ASSERT_EQ(kTabletId, peer->tablet()->tablet_id());
}

// Assert the report's sequence number is strictly greater than *report_seqno,
// then record it for the next call.
static void AssertMonotonicReportSeqno(int64_t* report_seqno,
                                       const TabletReportPB &report) {
  ASSERT_LT(*report_seqno, report.sequence_number());
  *report_seqno = report.sequence_number();
}

// Assert that 'report' contains an updated tablet with the given id whose
// committed consensus state describes a single local voter as leader.
static void AssertReportHasUpdatedTablet(const TabletReportPB& report,
                                         const string& tablet_id) {
  ASSERT_GE(report.updated_tablets_size(), 0);
  bool found_tablet = false;
  for (ReportedTabletPB reported_tablet : report.updated_tablets()) {
    if (reported_tablet.tablet_id() == tablet_id) {
      found_tablet = true;
      ASSERT_TRUE(reported_tablet.has_committed_consensus_state());
      ASSERT_TRUE(reported_tablet.committed_consensus_state().has_current_term())
          << reported_tablet.ShortDebugString();
      ASSERT_TRUE(reported_tablet.committed_consensus_state().has_leader_uuid())
          << reported_tablet.ShortDebugString();
      ASSERT_TRUE(reported_tablet.committed_consensus_state().has_config());
      const RaftConfigPB& committed_config = reported_tablet.committed_consensus_state().config();
      ASSERT_EQ(kInvalidOpIdIndex, committed_config.opid_index());
      ASSERT_EQ(1, committed_config.peers_size());
      ASSERT_TRUE(committed_config.peers(0).has_permanent_uuid())
          << reported_tablet.ShortDebugString();
      ASSERT_EQ(committed_config.peers(0).permanent_uuid(),
                reported_tablet.committed_consensus_state().leader_uuid())
          << reported_tablet.ShortDebugString();
    }
  }
  ASSERT_TRUE(found_tablet);
}

// Exercise full vs. incremental tablet reports and the effect of (not)
// acknowledging a report.
TEST_F(TsTabletManagerTest, TestTabletReports) {
  TabletReportPB report;
  int64_t seqno = -1;

  // Generate a tablet report before any tablets are loaded. Should be empty.
  tablet_manager_->GenerateFullTabletReport(&report);
  ASSERT_FALSE(report.is_incremental());
  ASSERT_EQ(0, report.updated_tablets().size());
  ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report);
  tablet_manager_->MarkTabletReportAcknowledged(report);

  // Another report should now be incremental, but with no changes.
  tablet_manager_->GenerateIncrementalTabletReport(&report);
  ASSERT_TRUE(report.is_incremental());
  ASSERT_EQ(0, report.updated_tablets().size());
  ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report);
  tablet_manager_->MarkTabletReportAcknowledged(report);

  // Create a tablet and do another incremental report - should include the tablet.
  ASSERT_OK(CreateNewTablet("tablet-1", schema_, nullptr));
  // The tablet is marked dirty asynchronously, so poll until it shows up.
  int updated_tablets = 0;
  while (updated_tablets != 1) {
    tablet_manager_->GenerateIncrementalTabletReport(&report);
    updated_tablets = report.updated_tablets().size();
    ASSERT_TRUE(report.is_incremental());
    ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report);
  }

  ASSERT_REPORT_HAS_UPDATED_TABLET(report, "tablet-1");

  // If we don't acknowledge the report, and ask for another incremental report,
  // it should include the tablet again.
  tablet_manager_->GenerateIncrementalTabletReport(&report);
  ASSERT_TRUE(report.is_incremental());
  ASSERT_EQ(1, report.updated_tablets().size());
  ASSERT_REPORT_HAS_UPDATED_TABLET(report, "tablet-1");
  ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report);

  // Now acknowledge the last report, and further incrementals should be empty.
  tablet_manager_->MarkTabletReportAcknowledged(report);
  tablet_manager_->GenerateIncrementalTabletReport(&report);
  ASSERT_TRUE(report.is_incremental());
  ASSERT_EQ(0, report.updated_tablets().size());
  ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report);
  tablet_manager_->MarkTabletReportAcknowledged(report);

  // Create a second tablet, and ensure the incremental report shows it.
  ASSERT_OK(CreateNewTablet("tablet-2", schema_, nullptr));

  // Wait up to 10 seconds to get a tablet report from tablet-2.
  // TabletPeer does not mark tablets dirty until after it commits the
  // initial configuration change, so there is also a window for tablet-1 to
  // have been marked dirty since the last report.
  MonoDelta timeout(MonoDelta::FromSeconds(10));
  MonoTime start(MonoTime::Now(MonoTime::FINE));
  report.Clear();
  while (true) {
    bool found_tablet_2 = false;
    tablet_manager_->GenerateIncrementalTabletReport(&report);
    ASSERT_TRUE(report.is_incremental()) << report.ShortDebugString();
    ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report) << report.ShortDebugString();
    for (const ReportedTabletPB& reported_tablet : report.updated_tablets()) {
      if (reported_tablet.tablet_id() == "tablet-2") {
        found_tablet_2  = true;
        break;
      }
    }
    if (found_tablet_2) break;
    MonoDelta elapsed(MonoTime::Now(MonoTime::FINE).GetDeltaSince(start));
    ASSERT_TRUE(elapsed.LessThan(timeout)) << "Waited too long for tablet-2 to be marked dirty: "
                                           << elapsed.ToString() << ". "
                                           << "Latest report: " << report.ShortDebugString();
    SleepFor(MonoDelta::FromMilliseconds(10));
  }

  tablet_manager_->MarkTabletReportAcknowledged(report);

  // Asking for a full tablet report should re-report both tablets
  tablet_manager_->GenerateFullTabletReport(&report);
  ASSERT_FALSE(report.is_incremental());
  ASSERT_EQ(2, report.updated_tablets().size());
  ASSERT_REPORT_HAS_UPDATED_TABLET(report, "tablet-1");
  ASSERT_REPORT_HAS_UPDATED_TABLET(report, "tablet-2");
  ASSERT_MONOTONIC_REPORT_SEQNO(&seqno, report);
}

} // namespace tserver
} // namespace kudu
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
new file mode 100644
index 000000000000..2e89e8948c48
--- /dev/null
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -0,0 +1,998 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
+ +#include "kudu/tserver/ts_tablet_manager.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/consensus/consensus_meta.h" +#include "kudu/consensus/log.h" +#include "kudu/consensus/metadata.pb.h" +#include "kudu/consensus/opid_util.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/fs/fs_manager.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/master/master.pb.h" +#include "kudu/tablet/metadata.pb.h" +#include "kudu/tablet/tablet.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet_metadata.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/heartbeater.h" +#include "kudu/tserver/remote_bootstrap_client.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/fault_injection.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/trace.h" + +DEFINE_int32(num_tablets_to_open_simultaneously, 0, + "Number of threads available to open tablets during startup. If this " + "is set to 0 (the default), then the number of bootstrap threads will " + "be set based on the number of data directories. 
If the data directories " + "are on some very fast storage device such as SSD or a RAID array, it " + "may make sense to manually tune this."); +TAG_FLAG(num_tablets_to_open_simultaneously, advanced); + +DEFINE_int32(tablet_start_warn_threshold_ms, 500, + "If a tablet takes more than this number of millis to start, issue " + "a warning with a trace."); +TAG_FLAG(tablet_start_warn_threshold_ms, hidden); + +DEFINE_double(fault_crash_after_blocks_deleted, 0.0, + "Fraction of the time when the tablet will crash immediately " + "after deleting the data blocks during tablet deletion. " + "(For testing only!)"); +TAG_FLAG(fault_crash_after_blocks_deleted, unsafe); + +DEFINE_double(fault_crash_after_wal_deleted, 0.0, + "Fraction of the time when the tablet will crash immediately " + "after deleting the WAL segments during tablet deletion. " + "(For testing only!)"); +TAG_FLAG(fault_crash_after_wal_deleted, unsafe); + +DEFINE_double(fault_crash_after_cmeta_deleted, 0.0, + "Fraction of the time when the tablet will crash immediately " + "after deleting the consensus metadata during tablet deletion. " + "(For testing only!)"); +TAG_FLAG(fault_crash_after_cmeta_deleted, unsafe); + +DEFINE_double(fault_crash_after_rb_files_fetched, 0.0, + "Fraction of the time when the tablet will crash immediately " + "after fetching the files during a remote bootstrap but before " + "marking the superblock as TABLET_DATA_READY. " + "(For testing only!)"); +TAG_FLAG(fault_crash_after_rb_files_fetched, unsafe); + +namespace kudu { +namespace tserver { + +METRIC_DEFINE_histogram(server, op_apply_queue_length, "Operation Apply Queue Length", + MetricUnit::kTasks, + "Number of operations waiting to be applied to the tablet. 
" + "High queue lengths indicate that the server is unable to process " + "operations as fast as they are being written to the WAL.", + 10000, 2); + +METRIC_DEFINE_histogram(server, op_apply_queue_time, "Operation Apply Queue Time", + MetricUnit::kMicroseconds, + "Time that operations spent waiting in the apply queue before being " + "processed. High queue times indicate that the server is unable to " + "process operations as fast as they are being written to the WAL.", + 10000000, 2); + +METRIC_DEFINE_histogram(server, op_apply_run_time, "Operation Apply Run Time", + MetricUnit::kMicroseconds, + "Time that operations spent being applied to the tablet. " + "High values may indicate that the server is under-provisioned or " + "that operations consist of very large batches.", + 10000000, 2); + +using consensus::ConsensusMetadata; +using consensus::ConsensusStatePB; +using consensus::OpId; +using consensus::RaftConfigPB; +using consensus::RaftPeerPB; +using consensus::StartRemoteBootstrapRequestPB; +using log::Log; +using master::ReportedTabletPB; +using master::TabletReportPB; +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; +using tablet::Tablet; +using tablet::TABLET_DATA_COPYING; +using tablet::TABLET_DATA_DELETED; +using tablet::TABLET_DATA_READY; +using tablet::TABLET_DATA_TOMBSTONED; +using tablet::TabletDataState; +using tablet::TabletMetadata; +using tablet::TabletPeer; +using tablet::TabletStatusListener; +using tablet::TabletStatusPB; +using tserver::RemoteBootstrapClient; + +TSTabletManager::TSTabletManager(FsManager* fs_manager, + TabletServer* server, + MetricRegistry* metric_registry) + : fs_manager_(fs_manager), + server_(server), + next_report_seq_(0), + metric_registry_(metric_registry), + state_(MANAGER_INITIALIZING) { + + CHECK_OK(ThreadPoolBuilder("apply").Build(&apply_pool_)); + apply_pool_->SetQueueLengthHistogram( + METRIC_op_apply_queue_length.Instantiate(server_->metric_entity())); + 
apply_pool_->SetQueueTimeMicrosHistogram( + METRIC_op_apply_queue_time.Instantiate(server_->metric_entity())); + apply_pool_->SetRunTimeMicrosHistogram( + METRIC_op_apply_run_time.Instantiate(server_->metric_entity())); +} + +TSTabletManager::~TSTabletManager() { +} + +Status TSTabletManager::Init() { + CHECK_EQ(state(), MANAGER_INITIALIZING); + + // Start the threadpool we'll use to open tablets. + // This has to be done in Init() instead of the constructor, since the + // FsManager isn't initialized until this point. + int max_bootstrap_threads = FLAGS_num_tablets_to_open_simultaneously; + if (max_bootstrap_threads == 0) { + // Default to the number of disks. + max_bootstrap_threads = fs_manager_->GetDataRootDirs().size(); + } + RETURN_NOT_OK(ThreadPoolBuilder("tablet-bootstrap") + .set_max_threads(max_bootstrap_threads) + .Build(&open_tablet_pool_)); + + // Search for tablets in the metadata dir. + vector tablet_ids; + RETURN_NOT_OK(fs_manager_->ListTabletIds(&tablet_ids)); + + InitLocalRaftPeerPB(); + + vector > metas; + + // First, load all of the tablet metadata. We do this before we start + // submitting the actual OpenTablet() tasks so that we don't have to compete + // for disk resources, etc, with bootstrap processes and running tablets. + for (const string& tablet_id : tablet_ids) { + scoped_refptr meta; + RETURN_NOT_OK_PREPEND(OpenTabletMeta(tablet_id, &meta), + "Failed to open tablet metadata for tablet: " + tablet_id); + if (PREDICT_FALSE(meta->tablet_data_state() != TABLET_DATA_READY)) { + RETURN_NOT_OK(HandleNonReadyTabletOnStartup(meta)); + continue; + } + metas.push_back(meta); + } + + // Now submit the "Open" task for each. 
+ for (const scoped_refptr& meta : metas) { + scoped_refptr deleter; + { + boost::lock_guard lock(lock_); + CHECK_OK(StartTabletStateTransitionUnlocked(meta->tablet_id(), "opening tablet", &deleter)); + } + + scoped_refptr tablet_peer = CreateAndRegisterTabletPeer(meta, NEW_PEER); + RETURN_NOT_OK(open_tablet_pool_->SubmitFunc(boost::bind(&TSTabletManager::OpenTablet, + this, meta, deleter))); + } + + { + boost::lock_guard lock(lock_); + state_ = MANAGER_RUNNING; + } + + return Status::OK(); +} + +Status TSTabletManager::WaitForAllBootstrapsToFinish() { + CHECK_EQ(state(), MANAGER_RUNNING); + + open_tablet_pool_->Wait(); + + Status s = Status::OK(); + + boost::shared_lock shared_lock(lock_); + for (const TabletMap::value_type& entry : tablet_map_) { + if (entry.second->state() == tablet::FAILED) { + if (s.ok()) { + s = entry.second->error(); + } + } + } + + return s; +} + +Status TSTabletManager::CreateNewTablet(const string& table_id, + const string& tablet_id, + const Partition& partition, + const string& table_name, + const Schema& schema, + const PartitionSchema& partition_schema, + RaftConfigPB config, + scoped_refptr* tablet_peer) { + CHECK_EQ(state(), MANAGER_RUNNING); + + // If the consensus configuration is specified to use local consensus, verify that the peer + // matches up with our local info. + if (config.local()) { + CHECK_EQ(1, config.peers_size()); + CHECK_EQ(server_->instance_pb().permanent_uuid(), config.peers(0).permanent_uuid()); + } + + // Set the initial opid_index for a RaftConfigPB to -1. + config.set_opid_index(consensus::kInvalidOpIdIndex); + + scoped_refptr deleter; + { + // acquire the lock in exclusive mode as we'll add a entry to the + // transition_in_progress_ set if the lookup fails. + boost::lock_guard lock(lock_); + TRACE("Acquired tablet manager lock"); + + // Sanity check that the tablet isn't already registered. 
+ scoped_refptr junk; + if (LookupTabletUnlocked(tablet_id, &junk)) { + return Status::AlreadyPresent("Tablet already registered", tablet_id); + } + + // Sanity check that the tablet's creation isn't already in progress + RETURN_NOT_OK(StartTabletStateTransitionUnlocked(tablet_id, "creating tablet", &deleter)); + } + + // Create the metadata. + TRACE("Creating new metadata..."); + scoped_refptr meta; + RETURN_NOT_OK_PREPEND( + TabletMetadata::CreateNew(fs_manager_, + tablet_id, + table_name, + schema, + partition_schema, + partition, + TABLET_DATA_READY, + &meta), + "Couldn't create tablet metadata"); + + // We must persist the consensus metadata to disk before starting a new + // tablet's TabletPeer and Consensus implementation. + gscoped_ptr cmeta; + RETURN_NOT_OK_PREPEND(ConsensusMetadata::Create(fs_manager_, tablet_id, fs_manager_->uuid(), + config, consensus::kMinimumTerm, &cmeta), + "Unable to create new ConsensusMeta for tablet " + tablet_id); + scoped_refptr new_peer = CreateAndRegisterTabletPeer(meta, NEW_PEER); + + // We can run this synchronously since there is nothing to bootstrap. + RETURN_NOT_OK(open_tablet_pool_->SubmitFunc(boost::bind(&TSTabletManager::OpenTablet, + this, meta, deleter))); + + if (tablet_peer) { + *tablet_peer = new_peer; + } + return Status::OK(); +} + +// If 'expr' fails, log a message, tombstone the given tablet, and return the +// error status. +#define TOMBSTONE_NOT_OK(expr, meta, msg) \ + do { \ + Status _s = (expr); \ + if (PREDICT_FALSE(!_s.ok())) { \ + LogAndTombstone((meta), (msg), _s); \ + return _s; \ + } \ + } while (0) + +Status TSTabletManager::CheckLeaderTermNotLower(const string& tablet_id, + int64_t leader_term, + int64_t last_logged_term) { + if (PREDICT_FALSE(leader_term < last_logged_term)) { + Status s = Status::InvalidArgument( + Substitute("Leader has replica of tablet $0 with term $1 " + "lower than last logged term $2 on local replica. 
Rejecting " + "remote bootstrap request", + tablet_id, + leader_term, last_logged_term)); + LOG(WARNING) << LogPrefix(tablet_id) << "Remote boostrap: " << s.ToString(); + return s; + } + return Status::OK(); +} + +Status TSTabletManager::StartRemoteBootstrap(const StartRemoteBootstrapRequestPB& req) { + const string& tablet_id = req.tablet_id(); + const string& bootstrap_peer_uuid = req.bootstrap_peer_uuid(); + HostPort bootstrap_peer_addr; + RETURN_NOT_OK(HostPortFromPB(req.bootstrap_peer_addr(), &bootstrap_peer_addr)); + int64_t leader_term = req.caller_term(); + + const string kLogPrefix = LogPrefix(tablet_id); + + scoped_refptr old_tablet_peer; + scoped_refptr meta; + bool replacing_tablet = false; + scoped_refptr deleter; + { + boost::lock_guard lock(lock_); + if (LookupTabletUnlocked(tablet_id, &old_tablet_peer)) { + meta = old_tablet_peer->tablet_metadata(); + replacing_tablet = true; + } + RETURN_NOT_OK(StartTabletStateTransitionUnlocked(tablet_id, "remote bootstrapping tablet", + &deleter)); + } + + if (replacing_tablet) { + // Make sure the existing tablet peer is shut down and tombstoned. + TabletDataState data_state = meta->tablet_data_state(); + switch (data_state) { + case TABLET_DATA_COPYING: + // This should not be possible due to the transition_in_progress_ "lock". + LOG(FATAL) << LogPrefix(tablet_id) << " Remote bootstrap: " + << "Found tablet in TABLET_DATA_COPYING state during StartRemoteBootstrap()"; + case TABLET_DATA_TOMBSTONED: { + int64_t last_logged_term = meta->tombstone_last_logged_opid().term(); + RETURN_NOT_OK(CheckLeaderTermNotLower(tablet_id, leader_term, last_logged_term)); + break; + } + case TABLET_DATA_READY: { + Log* log = old_tablet_peer->log(); + if (!log) { + return Status::IllegalState("Log unavailable. 
Tablet is not running", tablet_id); + } + OpId last_logged_opid; + log->GetLatestEntryOpId(&last_logged_opid); + int64_t last_logged_term = last_logged_opid.term(); + RETURN_NOT_OK(CheckLeaderTermNotLower(tablet_id, leader_term, last_logged_term)); + + // Tombstone the tablet and store the last-logged OpId. + old_tablet_peer->Shutdown(); + // TODO: Because we begin shutdown of the tablet after we check our + // last-logged term against the leader's term, there may be operations + // in flight and it may be possible for the same check in the remote + // bootstrap client Start() method to fail. This will leave the replica in + // a tombstoned state, and then the leader with the latest log entries + // will simply remote boostrap this replica again. We could try to + // check again after calling Shutdown(), and if the check fails, try to + // reopen the tablet. For now, we live with the (unlikely) race. + RETURN_NOT_OK_PREPEND(DeleteTabletData(meta, TABLET_DATA_TOMBSTONED, last_logged_opid), + Substitute("Unable to delete on-disk data from tablet $0", + tablet_id)); + break; + } + default: + return Status::IllegalState( + Substitute("Found tablet in unsupported state for remote bootstrap. " + "Tablet: $0, tablet data state: $1", + tablet_id, TabletDataState_Name(data_state))); + } + } + + string init_msg = kLogPrefix + Substitute("Initiating remote bootstrap from Peer $0 ($1)", + bootstrap_peer_uuid, bootstrap_peer_addr.ToString()); + LOG(INFO) << init_msg; + TRACE(init_msg); + + gscoped_ptr rb_client( + new RemoteBootstrapClient(tablet_id, fs_manager_, server_->messenger(), + fs_manager_->uuid())); + + // Download and persist the remote superblock in TABLET_DATA_COPYING state. 
+ if (replacing_tablet) {
+ RETURN_NOT_OK(rb_client->SetTabletToReplace(meta, leader_term));
+ }
+ RETURN_NOT_OK(rb_client->Start(bootstrap_peer_uuid, bootstrap_peer_addr, &meta));
+
+ // From this point onward, the superblock is persisted in TABLET_DATA_COPYING
+ // state, and we need to tombstone the tablet if additional steps prior to
+ // getting to a TABLET_DATA_READY state fail.
+
+ // Registering a non-initialized TabletPeer offers visibility through the Web UI.
+ RegisterTabletPeerMode mode = replacing_tablet ? REPLACEMENT_PEER : NEW_PEER;
+ scoped_refptr tablet_peer = CreateAndRegisterTabletPeer(meta, mode);
+ string peer_str = bootstrap_peer_uuid + " (" + bootstrap_peer_addr.ToString() + ")";
+
+ // Download all of the remote files.
+ TOMBSTONE_NOT_OK(rb_client->FetchAll(tablet_peer->status_listener()), meta,
+ "Remote bootstrap: Unable to fetch data from remote peer " +
+ bootstrap_peer_uuid + " (" + bootstrap_peer_addr.ToString() + ")");
+
+ MAYBE_FAULT(FLAGS_fault_crash_after_rb_files_fetched);
+
+ // Write out the last files to make the new replica visible and update the
+ // TabletDataState in the superblock to TABLET_DATA_READY.
+ TOMBSTONE_NOT_OK(rb_client->Finish(), meta, "Remote bootstrap: Failure calling Finish()");
+
+ // We run this asynchronously. We don't tombstone the tablet if this fails,
+ // because if we were to fail to open the tablet, on next startup, it's in a
+ // valid fully-copied state.
+ RETURN_NOT_OK(open_tablet_pool_->SubmitFunc(boost::bind(&TSTabletManager::OpenTablet,
+ this, meta, deleter)));
+ return Status::OK();
+}
+
+// Create and register a new TabletPeer, given tablet metadata. 
+scoped_refptr TSTabletManager::CreateAndRegisterTabletPeer(
+ const scoped_refptr& meta, RegisterTabletPeerMode mode) {
+ scoped_refptr tablet_peer(
+ new TabletPeer(meta,
+ local_peer_pb_,
+ apply_pool_.get(),
+ Bind(&TSTabletManager::MarkTabletDirty, Unretained(this), meta->tablet_id())));
+ RegisterTablet(meta->tablet_id(), tablet_peer, mode);
+ return tablet_peer;
+}
+
+Status TSTabletManager::DeleteTablet(
+ const string& tablet_id,
+ TabletDataState delete_type,
+ const boost::optional& cas_config_opid_index_less_or_equal,
+ boost::optional* error_code) {
+
+ if (delete_type != TABLET_DATA_DELETED && delete_type != TABLET_DATA_TOMBSTONED) {
+ return Status::InvalidArgument("DeleteTablet() requires an argument that is one of "
+ "TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED",
+ Substitute("Given: $0 ($1)",
+ TabletDataState_Name(delete_type), delete_type));
+ }
+
+ TRACE("Deleting tablet $0", tablet_id);
+
+ scoped_refptr tablet_peer;
+ scoped_refptr deleter;
+ {
+ // Acquire the lock in exclusive mode as we'll add an entry to the
+ // transition_in_progress_ map.
+ boost::lock_guard lock(lock_);
+ TRACE("Acquired tablet manager lock");
+ RETURN_NOT_OK(CheckRunningUnlocked(error_code));
+
+ if (!LookupTabletUnlocked(tablet_id, &tablet_peer)) {
+ *error_code = TabletServerErrorPB::TABLET_NOT_FOUND;
+ return Status::NotFound("Tablet not found", tablet_id);
+ }
+ // Sanity check that the tablet's deletion isn't already in progress
+ Status s = StartTabletStateTransitionUnlocked(tablet_id, "deleting tablet", &deleter);
+ if (PREDICT_FALSE(!s.ok())) {
+ *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING;
+ return s;
+ }
+ }
+
+ // If the tablet is already deleted, the CAS check isn't possible because
+ // consensus and therefore the log is not available. 
+ TabletDataState data_state = tablet_peer->tablet_metadata()->tablet_data_state(); + bool tablet_deleted = (data_state == TABLET_DATA_DELETED || data_state == TABLET_DATA_TOMBSTONED); + + // They specified an "atomic" delete. Check the committed config's opid_index. + // TODO: There's actually a race here between the check and shutdown, but + // it's tricky to fix. We could try checking again after the shutdown and + // restarting the tablet if the local replica committed a higher config + // change op during that time, or potentially something else more invasive. + if (cas_config_opid_index_less_or_equal && !tablet_deleted) { + scoped_refptr consensus = tablet_peer->shared_consensus(); + if (!consensus) { + *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; + return Status::IllegalState("Consensus not available. Tablet shutting down"); + } + RaftConfigPB committed_config = consensus->CommittedConfig(); + if (committed_config.opid_index() > *cas_config_opid_index_less_or_equal) { + *error_code = TabletServerErrorPB::CAS_FAILED; + return Status::IllegalState(Substitute("Request specified cas_config_opid_index_less_or_equal" + " of $0 but the committed config has opid_index of $1", + *cas_config_opid_index_less_or_equal, + committed_config.opid_index())); + } + } + + tablet_peer->Shutdown(); + + boost::optional opt_last_logged_opid; + if (tablet_peer->log()) { + OpId last_logged_opid; + tablet_peer->log()->GetLatestEntryOpId(&last_logged_opid); + opt_last_logged_opid = last_logged_opid; + } + + Status s = DeleteTabletData(tablet_peer->tablet_metadata(), delete_type, opt_last_logged_opid); + if (PREDICT_FALSE(!s.ok())) { + s = s.CloneAndPrepend(Substitute("Unable to delete on-disk data from tablet $0", + tablet_id)); + LOG(WARNING) << s.ToString(); + tablet_peer->SetFailed(s); + return s; + } + + tablet_peer->status_listener()->StatusMessage("Deleted tablet blocks from disk"); + + // We only remove DELETED tablets from the tablet map. 
+ if (delete_type == TABLET_DATA_DELETED) { + boost::lock_guard lock(lock_); + RETURN_NOT_OK(CheckRunningUnlocked(error_code)); + CHECK_EQ(1, tablet_map_.erase(tablet_id)) << tablet_id; + } + + return Status::OK(); +} + +string TSTabletManager::LogPrefix(const string& tablet_id) const { + return "T " + tablet_id + " P " + fs_manager_->uuid() + ": "; +} + +Status TSTabletManager::CheckRunningUnlocked( + boost::optional* error_code) const { + if (state_ == MANAGER_RUNNING) { + return Status::OK(); + } + *error_code = TabletServerErrorPB::TABLET_NOT_RUNNING; + return Status::ServiceUnavailable(Substitute("Tablet Manager is not running: $0", + TSTabletManagerStatePB_Name(state_))); +} + +Status TSTabletManager::StartTabletStateTransitionUnlocked( + const string& tablet_id, + const string& reason, + scoped_refptr* deleter) { + DCHECK(lock_.is_write_locked()); + if (!InsertIfNotPresent(&transition_in_progress_, tablet_id, reason)) { + return Status::IllegalState( + Substitute("State transition of tablet $0 already in progress: $1", + tablet_id, transition_in_progress_[tablet_id])); + } + deleter->reset(new TransitionInProgressDeleter(&transition_in_progress_, &lock_, tablet_id)); + return Status::OK(); +} + +Status TSTabletManager::OpenTabletMeta(const string& tablet_id, + scoped_refptr* metadata) { + LOG(INFO) << "Loading metadata for tablet " << tablet_id; + TRACE("Loading metadata..."); + scoped_refptr meta; + RETURN_NOT_OK_PREPEND(TabletMetadata::Load(fs_manager_, tablet_id, &meta), + strings::Substitute("Failed to load tablet metadata for tablet id $0", + tablet_id)); + TRACE("Metadata loaded"); + metadata->swap(meta); + return Status::OK(); +} + +void TSTabletManager::OpenTablet(const scoped_refptr& meta, + const scoped_refptr& deleter) { + string tablet_id = meta->tablet_id(); + TRACE_EVENT1("tserver", "TSTabletManager::OpenTablet", + "tablet_id", tablet_id); + + scoped_refptr tablet_peer; + CHECK(LookupTablet(tablet_id, &tablet_peer)) + << "Tablet not registered 
prior to OpenTabletAsync call: " << tablet_id; + + shared_ptr tablet; + scoped_refptr log; + + LOG(INFO) << LogPrefix(tablet_id) << "Bootstrapping tablet"; + TRACE("Bootstrapping tablet"); + + consensus::ConsensusBootstrapInfo bootstrap_info; + Status s; + LOG_TIMING_PREFIX(INFO, LogPrefix(tablet_id), "bootstrapping tablet") { + // TODO: handle crash mid-creation of tablet? do we ever end up with a + // partially created tablet here? + tablet_peer->SetBootstrapping(); + s = BootstrapTablet(meta, + scoped_refptr(server_->clock()), + server_->mem_tracker(), + metric_registry_, + tablet_peer->status_listener(), + &tablet, + &log, + tablet_peer->log_anchor_registry(), + &bootstrap_info); + if (!s.ok()) { + LOG(ERROR) << LogPrefix(tablet_id) << "Tablet failed to bootstrap: " + << s.ToString(); + tablet_peer->SetFailed(s); + return; + } + } + + MonoTime start(MonoTime::Now(MonoTime::FINE)); + LOG_TIMING_PREFIX(INFO, LogPrefix(tablet_id), "starting tablet") { + TRACE("Initializing tablet peer"); + s = tablet_peer->Init(tablet, + scoped_refptr(server_->clock()), + server_->messenger(), + log, + tablet->GetMetricEntity()); + + if (!s.ok()) { + LOG(ERROR) << LogPrefix(tablet_id) << "Tablet failed to init: " + << s.ToString(); + tablet_peer->SetFailed(s); + return; + } + + TRACE("Starting tablet peer"); + s = tablet_peer->Start(bootstrap_info); + if (!s.ok()) { + LOG(ERROR) << LogPrefix(tablet_id) << "Tablet failed to start: " + << s.ToString(); + tablet_peer->SetFailed(s); + return; + } + + tablet_peer->RegisterMaintenanceOps(server_->maintenance_manager()); + } + + int elapsed_ms = MonoTime::Now(MonoTime::FINE).GetDeltaSince(start).ToMilliseconds(); + if (elapsed_ms > FLAGS_tablet_start_warn_threshold_ms) { + LOG(WARNING) << LogPrefix(tablet_id) << "Tablet startup took " << elapsed_ms << "ms"; + if (Trace::CurrentTrace()) { + LOG(WARNING) << LogPrefix(tablet_id) << "Trace:" << std::endl + << Trace::CurrentTrace()->DumpToString(true); + } + } +} + +void 
TSTabletManager::Shutdown() {
+ {
+ boost::lock_guard lock(lock_);
+ switch (state_) {
+ case MANAGER_QUIESCING: {
+ VLOG(1) << "Tablet manager shut down already in progress..";
+ return;
+ }
+ case MANAGER_SHUTDOWN: {
+ VLOG(1) << "Tablet manager has already been shut down.";
+ return;
+ }
+ case MANAGER_INITIALIZING:
+ case MANAGER_RUNNING: {
+ LOG(INFO) << "Shutting down tablet manager...";
+ state_ = MANAGER_QUIESCING;
+ break;
+ }
+ default: {
+ LOG(FATAL) << "Invalid state: " << TSTabletManagerStatePB_Name(state_);
+ }
+ }
+ }
+
+ // Shut down the bootstrap pool, so that no new tablets are registered after this point.
+ open_tablet_pool_->Shutdown();
+
+ // Take a snapshot of the peers list -- that way we don't have to hold
+ // on to the lock while shutting them down, which might cause a lock
+ // inversion. (see KUDU-308 for example).
+ vector > peers_to_shutdown;
+ GetTabletPeers(&peers_to_shutdown);
+
+ for (const scoped_refptr& peer : peers_to_shutdown) {
+ peer->Shutdown();
+ }
+
+ // Shut down the apply pool.
+ apply_pool_->Shutdown();
+
+ {
+ boost::lock_guard l(lock_);
+ // We don't expect anyone else to be modifying the map after we start the
+ // shut down process.
+ CHECK_EQ(tablet_map_.size(), peers_to_shutdown.size())
+ << "Map contents changed during shutdown!";
+ tablet_map_.clear();
+
+ state_ = MANAGER_SHUTDOWN;
+ }
+}
+
+void TSTabletManager::RegisterTablet(const std::string& tablet_id,
+ const scoped_refptr& tablet_peer,
+ RegisterTabletPeerMode mode) {
+ boost::lock_guard lock(lock_);
+ // If we are replacing a tablet peer, we delete the existing one first. 
+ if (mode == REPLACEMENT_PEER && tablet_map_.erase(tablet_id) != 1) { + LOG(FATAL) << "Unable to remove previous tablet peer " << tablet_id << ": not registered!"; + } + if (!InsertIfNotPresent(&tablet_map_, tablet_id, tablet_peer)) { + LOG(FATAL) << "Unable to register tablet peer " << tablet_id << ": already registered!"; + } + + LOG(INFO) << "Registered tablet " << tablet_id; +} + +bool TSTabletManager::LookupTablet(const string& tablet_id, + scoped_refptr* tablet_peer) const { + boost::shared_lock shared_lock(lock_); + return LookupTabletUnlocked(tablet_id, tablet_peer); +} + +bool TSTabletManager::LookupTabletUnlocked(const string& tablet_id, + scoped_refptr* tablet_peer) const { + const scoped_refptr* found = FindOrNull(tablet_map_, tablet_id); + if (!found) { + return false; + } + *tablet_peer = *found; + return true; +} + +Status TSTabletManager::GetTabletPeer(const string& tablet_id, + scoped_refptr* tablet_peer) const { + if (!LookupTablet(tablet_id, tablet_peer)) { + return Status::NotFound("Tablet not found", tablet_id); + } + TabletDataState data_state = (*tablet_peer)->tablet_metadata()->tablet_data_state(); + if (data_state != TABLET_DATA_READY) { + return Status::IllegalState("Tablet data state not TABLET_DATA_READY: " + + TabletDataState_Name(data_state), + tablet_id); + } + return Status::OK(); +} + +const NodeInstancePB& TSTabletManager::NodeInstance() const { + return server_->instance_pb(); +} + +void TSTabletManager::GetTabletPeers(vector >* tablet_peers) const { + boost::shared_lock shared_lock(lock_); + AppendValuesFromMap(tablet_map_, tablet_peers); +} + +void TSTabletManager::MarkTabletDirty(const std::string& tablet_id, const std::string& reason) { + boost::lock_guard lock(lock_); + MarkDirtyUnlocked(tablet_id, reason); +} + +int TSTabletManager::GetNumDirtyTabletsForTests() const { + boost::shared_lock lock(lock_); + return dirty_tablets_.size(); +} + +int TSTabletManager::GetNumLiveTablets() const { + int count = 0; + 
boost::shared_lock lock(lock_); + for (const auto& entry : tablet_map_) { + tablet::TabletStatePB state = entry.second->state(); + if (state == tablet::BOOTSTRAPPING || + state == tablet::RUNNING) { + count++; + } + } + return count; +} + +void TSTabletManager::MarkDirtyUnlocked(const std::string& tablet_id, const std::string& reason) { + TabletReportState* state = FindOrNull(dirty_tablets_, tablet_id); + if (state != nullptr) { + CHECK_GE(next_report_seq_, state->change_seq); + state->change_seq = next_report_seq_; + } else { + TabletReportState state; + state.change_seq = next_report_seq_; + InsertOrDie(&dirty_tablets_, tablet_id, state); + } + VLOG(2) << LogPrefix(tablet_id) << "Marking dirty. Reason: " << reason + << ". Will report this tablet to the Master in the next heartbeat " + << "as part of report #" << next_report_seq_; + server_->heartbeater()->TriggerASAP(); +} + +void TSTabletManager::InitLocalRaftPeerPB() { + DCHECK_EQ(state(), MANAGER_INITIALIZING); + local_peer_pb_.set_permanent_uuid(fs_manager_->uuid()); + Sockaddr addr = server_->first_rpc_address(); + HostPort hp; + CHECK_OK(HostPortFromSockaddrReplaceWildcard(addr, &hp)); + CHECK_OK(HostPortToPB(hp, local_peer_pb_.mutable_last_known_addr())); +} + +void TSTabletManager::CreateReportedTabletPB(const string& tablet_id, + const scoped_refptr& tablet_peer, + ReportedTabletPB* reported_tablet) { + reported_tablet->set_tablet_id(tablet_id); + reported_tablet->set_state(tablet_peer->state()); + reported_tablet->set_tablet_data_state(tablet_peer->tablet_metadata()->tablet_data_state()); + if (tablet_peer->state() == tablet::FAILED) { + AppStatusPB* error_status = reported_tablet->mutable_error(); + StatusToPB(tablet_peer->error(), error_status); + } + reported_tablet->set_schema_version(tablet_peer->tablet_metadata()->schema_version()); + + // We cannot get consensus state information unless the TabletPeer is running. 
+ scoped_refptr consensus = tablet_peer->shared_consensus(); + if (consensus) { + *reported_tablet->mutable_committed_consensus_state() = + consensus->ConsensusState(consensus::CONSENSUS_CONFIG_COMMITTED); + } +} + +void TSTabletManager::GenerateIncrementalTabletReport(TabletReportPB* report) { + boost::shared_lock shared_lock(lock_); + report->Clear(); + report->set_sequence_number(next_report_seq_++); + report->set_is_incremental(true); + for (const DirtyMap::value_type& dirty_entry : dirty_tablets_) { + const string& tablet_id = dirty_entry.first; + scoped_refptr* tablet_peer = FindOrNull(tablet_map_, tablet_id); + if (tablet_peer) { + // Dirty entry, report on it. + CreateReportedTabletPB(tablet_id, *tablet_peer, report->add_updated_tablets()); + } else { + // Removed. + report->add_removed_tablet_ids(tablet_id); + } + } +} + +void TSTabletManager::GenerateFullTabletReport(TabletReportPB* report) { + boost::shared_lock shared_lock(lock_); + report->Clear(); + report->set_is_incremental(false); + report->set_sequence_number(next_report_seq_++); + for (const TabletMap::value_type& entry : tablet_map_) { + CreateReportedTabletPB(entry.first, entry.second, report->add_updated_tablets()); + } + dirty_tablets_.clear(); +} + +void TSTabletManager::MarkTabletReportAcknowledged(const TabletReportPB& report) { + boost::lock_guard l(lock_); + + int32_t acked_seq = report.sequence_number(); + CHECK_LT(acked_seq, next_report_seq_); + + // Clear the "dirty" state for any tablets which have not changed since + // this report. + auto it = dirty_tablets_.begin(); + while (it != dirty_tablets_.end()) { + const TabletReportState& state = it->second; + if (state.change_seq <= acked_seq) { + // This entry has not changed since this tablet report, we no longer need + // to track it as dirty. If it becomes dirty again, it will be re-added + // with a higher sequence number. 
+ it = dirty_tablets_.erase(it); + } else { + ++it; + } + } +} + +Status TSTabletManager::HandleNonReadyTabletOnStartup(const scoped_refptr& meta) { + const string& tablet_id = meta->tablet_id(); + TabletDataState data_state = meta->tablet_data_state(); + CHECK(data_state == TABLET_DATA_DELETED || + data_state == TABLET_DATA_TOMBSTONED || + data_state == TABLET_DATA_COPYING) + << "Unexpected TabletDataState in tablet " << tablet_id << ": " + << TabletDataState_Name(data_state) << " (" << data_state << ")"; + + if (data_state == TABLET_DATA_COPYING) { + // We tombstone tablets that failed to remotely bootstrap. + data_state = TABLET_DATA_TOMBSTONED; + } + + // Roll forward deletions, as needed. + LOG(INFO) << LogPrefix(tablet_id) << "Tablet Manager startup: Rolling forward tablet deletion " + << "of type " << TabletDataState_Name(data_state); + // Passing no OpId will retain the last_logged_opid that was previously in the metadata. + RETURN_NOT_OK(DeleteTabletData(meta, data_state, boost::none)); + + // We only delete the actual superblock of a TABLET_DATA_DELETED tablet on startup. + // TODO: Consider doing this after a fixed delay, instead of waiting for a restart. + // See KUDU-941. + if (data_state == TABLET_DATA_DELETED) { + LOG(INFO) << LogPrefix(tablet_id) << "Deleting tablet superblock"; + return meta->DeleteSuperBlock(); + } + + // Register TOMBSTONED tablets so that they get reported to the Master, which + // allows us to permanently delete replica tombstones when a table gets + // deleted. 
+ if (data_state == TABLET_DATA_TOMBSTONED) { + CreateAndRegisterTabletPeer(meta, NEW_PEER); + } + + return Status::OK(); +} + +Status TSTabletManager::DeleteTabletData(const scoped_refptr& meta, + TabletDataState data_state, + const boost::optional& last_logged_opid) { + const string& tablet_id = meta->tablet_id(); + LOG(INFO) << LogPrefix(tablet_id) << "Deleting tablet data with delete state " + << TabletDataState_Name(data_state); + CHECK(data_state == TABLET_DATA_DELETED || + data_state == TABLET_DATA_TOMBSTONED) + << "Unexpected data_state to delete tablet " << meta->tablet_id() << ": " + << TabletDataState_Name(data_state) << " (" << data_state << ")"; + + // Note: Passing an unset 'last_logged_opid' will retain the last_logged_opid + // that was previously in the metadata. + RETURN_NOT_OK(meta->DeleteTabletData(data_state, last_logged_opid)); + LOG(INFO) << LogPrefix(tablet_id) << "Tablet deleted. Last logged OpId: " + << meta->tombstone_last_logged_opid(); + MAYBE_FAULT(FLAGS_fault_crash_after_blocks_deleted); + + RETURN_NOT_OK(Log::DeleteOnDiskData(meta->fs_manager(), meta->tablet_id())); + MAYBE_FAULT(FLAGS_fault_crash_after_wal_deleted); + + // We do not delete the superblock or the consensus metadata when tombstoning + // a tablet. + if (data_state == TABLET_DATA_TOMBSTONED) { + return Status::OK(); + } + + // Only TABLET_DATA_DELETED tablets get this far. + RETURN_NOT_OK(ConsensusMetadata::DeleteOnDiskData(meta->fs_manager(), meta->tablet_id())); + MAYBE_FAULT(FLAGS_fault_crash_after_cmeta_deleted); + + return Status::OK(); +} + +void TSTabletManager::LogAndTombstone(const scoped_refptr& meta, + const std::string& msg, + const Status& s) { + const string& tablet_id = meta->tablet_id(); + const string kLogPrefix = "T " + tablet_id + " P " + fs_manager_->uuid() + ": "; + LOG(WARNING) << kLogPrefix << msg << ": " << s.ToString(); + + // Tombstone the tablet when remote bootstrap fails. 
+ LOG(INFO) << kLogPrefix << "Tombstoning tablet after failed remote bootstrap"; + Status delete_status = DeleteTabletData(meta, TABLET_DATA_TOMBSTONED, boost::optional()); + if (PREDICT_FALSE(!delete_status.ok())) { + // This failure should only either indicate a bug or an IO error. + LOG(FATAL) << kLogPrefix << "Failed to tombstone tablet after remote bootstrap: " + << delete_status.ToString(); + } +} + +TransitionInProgressDeleter::TransitionInProgressDeleter( + TransitionInProgressMap* map, rw_spinlock* lock, string entry) + : in_progress_(map), lock_(lock), entry_(std::move(entry)) {} + +TransitionInProgressDeleter::~TransitionInProgressDeleter() { + boost::lock_guard lock(*lock_); + CHECK(in_progress_->erase(entry_)); +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h new file mode 100644 index 000000000000..2d631425b930 --- /dev/null +++ b/src/kudu/tserver/ts_tablet_manager.h @@ -0,0 +1,366 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_TS_TABLET_MANAGER_H +#define KUDU_TSERVER_TS_TABLET_MANAGER_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/tserver/tablet_peer_lookup.h" +#include "kudu/tserver/tserver_admin.pb.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" +#include "kudu/util/status.h" +#include "kudu/util/threadpool.h" + +namespace kudu { + +class PartitionSchema; +class FsManager; +class HostPort; +class Partition; +class Schema; + +namespace consensus { +class RaftConfigPB; +} // namespace consensus + +namespace master { +class ReportedTabletPB; +class TabletReportPB; +} // namespace master + +namespace tablet { +class TabletMetadata; +class TabletPeer; +class TabletStatusPB; +class TabletStatusListener; +} + +namespace tserver { +class TabletServer; + +// Map of tablet id -> transition reason string. +typedef std::unordered_map TransitionInProgressMap; + +class TransitionInProgressDeleter; + +// Keeps track of the tablets hosted on the tablet server side. +// +// TODO: will also be responsible for keeping the local metadata about +// which tablets are hosted on this server persistent on disk, as well +// as re-opening all the tablets at startup, etc. +class TSTabletManager : public tserver::TabletPeerLookupIf { + public: + // Construct the tablet manager. + // 'fs_manager' must remain valid until this object is destructed. + TSTabletManager(FsManager* fs_manager, + TabletServer* server, + MetricRegistry* metric_registry); + + virtual ~TSTabletManager(); + + // Load all tablet metadata blocks from disk, and open their respective tablets. + // Upon return of this method all existing tablets are registered, but + // the bootstrap is performed asynchronously. + Status Init(); + + // Waits for all the bootstraps to complete. + // Returns Status::OK if all tablets bootstrapped successfully. 
If + // the bootstrap of any tablet failed returns the failure reason for + // the first tablet whose bootstrap failed. + Status WaitForAllBootstrapsToFinish(); + + // Shut down all of the tablets, gracefully flushing before shutdown. + void Shutdown(); + + // Create a new tablet and register it with the tablet manager. The new tablet + // is persisted on disk and opened before this method returns. + // + // If tablet_peer is non-NULL, the newly created tablet will be returned. + // + // If another tablet already exists with this ID, logs a DFATAL + // and returns a bad Status. + Status CreateNewTablet(const std::string& table_id, + const std::string& tablet_id, + const Partition& partition, + const std::string& table_name, + const Schema& schema, + const PartitionSchema& partition_schema, + consensus::RaftConfigPB config, + scoped_refptr* tablet_peer); + + // Delete the specified tablet. + // 'delete_type' must be one of TABLET_DATA_DELETED or TABLET_DATA_TOMBSTONED + // or else returns Status::IllegalArgument. + // 'cas_config_opid_index_less_or_equal' is optionally specified to enable an + // atomic DeleteTablet operation that only occurs if the latest committed + // raft config change op has an opid_index equal to or less than the specified + // value. If not, 'error_code' is set to CAS_FAILED and a non-OK Status is + // returned. + Status DeleteTablet(const std::string& tablet_id, + tablet::TabletDataState delete_type, + const boost::optional& cas_config_opid_index_less_or_equal, + boost::optional* error_code); + + // Lookup the given tablet peer by its ID. + // Returns true if the tablet is found successfully. + bool LookupTablet(const std::string& tablet_id, + scoped_refptr* tablet_peer) const; + + // Same as LookupTablet but doesn't acquired the shared lock. 
+ bool LookupTabletUnlocked(const std::string& tablet_id, + scoped_refptr* tablet_peer) const; + + virtual Status GetTabletPeer(const std::string& tablet_id, + scoped_refptr* tablet_peer) const + OVERRIDE; + + virtual const NodeInstancePB& NodeInstance() const OVERRIDE; + + // Initiate remote bootstrap of the specified tablet. + // See the StartRemoteBootstrap() RPC declaration in consensus.proto for details. + // Currently this runs the entire procedure synchronously. + // TODO: KUDU-921: Run this procedure on a background thread. + virtual Status StartRemoteBootstrap(const consensus::StartRemoteBootstrapRequestPB& req) OVERRIDE; + + // Generate an incremental tablet report. + // + // This will report any tablets which have changed since the last acknowleged + // tablet report. Once the report is successfully transferred, call + // MarkTabletReportAcknowledged() to clear the incremental state. Otherwise, the + // next tablet report will continue to include the same tablets until one + // is acknowleged. + // + // This is thread-safe to call along with tablet modification, but not safe + // to call from multiple threads at the same time. + void GenerateIncrementalTabletReport(master::TabletReportPB* report); + + // Generate a full tablet report and reset any incremental state tracking. + void GenerateFullTabletReport(master::TabletReportPB* report); + + // Mark that the master successfully received and processed the given + // tablet report. This uses the report sequence number to "un-dirty" any + // tablets which have not changed since the acknowledged report. + void MarkTabletReportAcknowledged(const master::TabletReportPB& report); + + // Get all of the tablets currently hosted on this server. + void GetTabletPeers(std::vector >* tablet_peers) const; + + // Marks tablet with 'tablet_id' dirty. + // Used for state changes outside of the control of TsTabletManager, such as consensus role + // changes. 
+ void MarkTabletDirty(const std::string& tablet_id, const std::string& reason); + + // Returns the number of tablets in the "dirty" map, for use by unit tests. + int GetNumDirtyTabletsForTests() const; + + // Return the number of tablets in RUNNING or BOOTSTRAPPING state. + int GetNumLiveTablets() const; + + Status RunAllLogGC(); + + private: + FRIEND_TEST(TsTabletManagerTest, TestPersistBlocks); + + // Flag specified when registering a TabletPeer. + enum RegisterTabletPeerMode { + NEW_PEER, + REPLACEMENT_PEER + }; + + // Each tablet report is assigned a sequence number, so that subsequent + // tablet reports only need to re-report those tablets which have + // changed since the last report. Each tablet tracks the sequence + // number at which it became dirty. + struct TabletReportState { + uint32_t change_seq; + }; + typedef std::unordered_map DirtyMap; + + // Standard log prefix, given a tablet id. + std::string LogPrefix(const std::string& tablet_id) const; + + // Returns Status::OK() iff state_ == MANAGER_RUNNING. + Status CheckRunningUnlocked(boost::optional* error_code) const; + + // Registers the start of a tablet state transition by inserting the tablet + // id and reason string into the transition_in_progress_ map. + // 'reason' is a string included in the Status return when there is + // contention indicating why the tablet is currently already transitioning. + // Returns IllegalState if the tablet is already "locked" for a state + // transition by some other operation. + // On success, returns OK and populates 'deleter' with an object that removes + // the map entry on destruction. + Status StartTabletStateTransitionUnlocked(const std::string& tablet_id, + const std::string& reason, + scoped_refptr* deleter); + + // Open a tablet meta from the local file system by loading its superblock. + Status OpenTabletMeta(const std::string& tablet_id, + scoped_refptr* metadata); + + // Open a tablet whose metadata has already been loaded/created. 
+ // This method does not return anything as it can be run asynchronously. + // Upon completion of this method the tablet should be initialized and running. + // If something wrong happened on bootstrap/initialization the relevant error + // will be set on TabletPeer along with the state set to FAILED. + // + // The tablet must be registered and an entry corresponding to this tablet + // must be put into the transition_in_progress_ map before calling this + // method. A TransitionInProgressDeleter must be passed as 'deleter' into + // this method in order to remove that transition-in-progress entry when + // opening the tablet is complete (in either a success or a failure case). + void OpenTablet(const scoped_refptr& meta, + const scoped_refptr& deleter); + + // Open a tablet whose metadata has already been loaded. + void BootstrapAndInitTablet(const scoped_refptr& meta, + scoped_refptr* peer); + + // Add the tablet to the tablet map. + // 'mode' specifies whether to expect an existing tablet to exist in the map. + // If mode == NEW_PEER but a tablet with the same name is already registered, + // or if mode == REPLACEMENT_PEER but a tablet with the same name is not + // registered, a FATAL message is logged, causing a process crash. + // Calls to this method are expected to be externally synchronized, typically + // using the transition_in_progress_ map. + void RegisterTablet(const std::string& tablet_id, + const scoped_refptr& tablet_peer, + RegisterTabletPeerMode mode); + + // Create and register a new TabletPeer, given tablet metadata. + // Calls RegisterTablet() with the given 'mode' parameter after constructing + // the TablerPeer object. See RegisterTablet() for details about the + // semantics of 'mode' and the locking requirements. + scoped_refptr CreateAndRegisterTabletPeer( + const scoped_refptr& meta, + RegisterTabletPeerMode mode); + + // Helper to generate the report for a single tablet. 
+ void CreateReportedTabletPB(const std::string& tablet_id, + const scoped_refptr& tablet_peer, + master::ReportedTabletPB* reported_tablet); + + // Mark that the provided TabletPeer's state has changed. That should be taken into + // account in the next report. + // + // NOTE: requires that the caller holds the lock. + void MarkDirtyUnlocked(const std::string& tablet_id, const std::string& reason); + + // Handle the case on startup where we find a tablet that is not in + // TABLET_DATA_READY state. Generally, we tombstone the replica. + Status HandleNonReadyTabletOnStartup(const scoped_refptr& meta); + + // Delete the tablet using the specified delete_type as the final metadata + // state. Deletes the on-disk data, as well as all WAL segments. + Status DeleteTabletData(const scoped_refptr& meta, + tablet::TabletDataState delete_type, + const boost::optional& last_logged_opid); + + // Return Status::IllegalState if leader_term < last_logged_term. + // Helper function for use with remote bootstrap. + Status CheckLeaderTermNotLower(const std::string& tablet_id, + int64_t leader_term, + int64_t last_logged_term); + + // Print a log message using the given info and tombstone the specified + // tablet. If tombstoning the tablet fails, a FATAL error is logged, resulting + // in a crash. + void LogAndTombstone(const scoped_refptr& meta, + const std::string& msg, + const Status& s); + + TSTabletManagerStatePB state() const { + boost::shared_lock lock(lock_); + return state_; + } + + // Initializes the RaftPeerPB for the local peer. + // Guaranteed to include both uuid and last_seen_addr fields. + // Crashes with an invariant check if the RPC server is not currently in a + // running state. + void InitLocalRaftPeerPB(); + + FsManager* const fs_manager_; + + TabletServer* server_; + + consensus::RaftPeerPB local_peer_pb_; + + typedef std::unordered_map > TabletMap; + + // Lock protecting tablet_map_, dirty_tablets_, state_, and + // transition_in_progress_. 
+ mutable rw_spinlock lock_; + + // Map from tablet ID to tablet + TabletMap tablet_map_; + + // Map of tablet ids -> reason strings where the keys are tablets whose + // bootstrap, creation, or deletion is in-progress + TransitionInProgressMap transition_in_progress_; + + // Tablets to include in the next incremental tablet report. + // When a tablet is added/removed/added locally and needs to be + // reported to the master, an entry is added to this map. + DirtyMap dirty_tablets_; + + // Next tablet report seqno. + int32_t next_report_seq_; + + MetricRegistry* metric_registry_; + + TSTabletManagerStatePB state_; + + // Thread pool used to open the tablets async, whether bootstrap is required or not. + gscoped_ptr open_tablet_pool_; + + // Thread pool for apply transactions, shared between all tablets. + gscoped_ptr apply_pool_; + + DISALLOW_COPY_AND_ASSIGN(TSTabletManager); +}; + +// Helper to delete the transition-in-progress entry from the corresponding set +// when tablet boostrap, create, and delete operations complete. +class TransitionInProgressDeleter : public RefCountedThreadSafe { + public: + TransitionInProgressDeleter(TransitionInProgressMap* map, rw_spinlock* lock, + string entry); + + private: + friend class RefCountedThreadSafe; + ~TransitionInProgressDeleter(); + + TransitionInProgressMap* const in_progress_; + rw_spinlock* const lock_; + const std::string entry_; +}; + +} // namespace tserver +} // namespace kudu +#endif /* KUDU_TSERVER_TS_TABLET_MANAGER_H */ diff --git a/src/kudu/tserver/tserver-path-handlers.cc b/src/kudu/tserver/tserver-path-handlers.cc new file mode 100644 index 000000000000..96dc5e9e80f3 --- /dev/null +++ b/src/kudu/tserver/tserver-path-handlers.cc @@ -0,0 +1,570 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/tserver/tserver-path-handlers.h" + +#include +#include +#include +#include +#include + +#include "kudu/consensus/log_anchor_registry.h" +#include "kudu/consensus/quorum_util.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/server/webui_util.h" +#include "kudu/tablet/maintenance_manager.h" +#include "kudu/tablet/tablet.pb.h" +#include "kudu/tablet/tablet_bootstrap.h" +#include "kudu/tablet/tablet_peer.h" +#include "kudu/tserver/scanners.h" +#include "kudu/tserver/tablet_server.h" +#include "kudu/tserver/ts_tablet_manager.h" +#include "kudu/util/url-coding.h" + +using kudu::consensus::GetConsensusRole; +using kudu::consensus::CONSENSUS_CONFIG_COMMITTED; +using kudu::consensus::ConsensusStatePB; +using kudu::consensus::RaftPeerPB; +using kudu::consensus::TransactionStatusPB; +using kudu::tablet::MaintenanceManagerStatusPB; +using kudu::tablet::MaintenanceManagerStatusPB_CompletedOpPB; +using kudu::tablet::MaintenanceManagerStatusPB_MaintenanceOpPB; +using kudu::tablet::Tablet; +using kudu::tablet::TabletPeer; +using kudu::tablet::TabletStatusPB; +using kudu::tablet::Transaction; +using std::endl; +using std::shared_ptr; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace 
tserver { + +TabletServerPathHandlers::~TabletServerPathHandlers() { +} + +Status TabletServerPathHandlers::Register(Webserver* server) { + server->RegisterPathHandler( + "/scans", "Scans", + boost::bind(&TabletServerPathHandlers::HandleScansPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + server->RegisterPathHandler( + "/tablets", "Tablets", + boost::bind(&TabletServerPathHandlers::HandleTabletsPage, this, _1, _2), + true /* styled */, true /* is_on_nav_bar */); + server->RegisterPathHandler( + "/tablet", "", + boost::bind(&TabletServerPathHandlers::HandleTabletPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + server->RegisterPathHandler( + "/transactions", "", + boost::bind(&TabletServerPathHandlers::HandleTransactionsPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + server->RegisterPathHandler( + "/tablet-rowsetlayout-svg", "", + boost::bind(&TabletServerPathHandlers::HandleTabletSVGPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + server->RegisterPathHandler( + "/tablet-consensus-status", "", + boost::bind(&TabletServerPathHandlers::HandleConsensusStatusPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + server->RegisterPathHandler( + "/log-anchors", "", + boost::bind(&TabletServerPathHandlers::HandleLogAnchorsPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + server->RegisterPathHandler( + "/dashboards", "Dashboards", + boost::bind(&TabletServerPathHandlers::HandleDashboardsPage, this, _1, _2), + true /* styled */, true /* is_on_nav_bar */); + server->RegisterPathHandler( + "/maintenance-manager", "", + boost::bind(&TabletServerPathHandlers::HandleMaintenanceManagerPage, this, _1, _2), + true /* styled */, false /* is_on_nav_bar */); + + return Status::OK(); +} + +void TabletServerPathHandlers::HandleTransactionsPage(const Webserver::WebRequest& req, + std::stringstream* output) { + bool as_text = ContainsKey(req.parsed_args, 
"raw"); + + vector > peers; + tserver_->tablet_manager()->GetTabletPeers(&peers); + + string arg = FindWithDefault(req.parsed_args, "include_traces", "false"); + Transaction::TraceType trace_type = ParseLeadingBoolValue( + arg.c_str(), false) ? Transaction::TRACE_TXNS : Transaction::NO_TRACE_TXNS; + + if (!as_text) { + *output << "

    Transactions

    \n"; + *output << "\n"; + *output << " " + "\n"; + } + + for (const scoped_refptr& peer : peers) { + vector inflight; + + if (peer->tablet() == nullptr) { + continue; + } + + peer->GetInFlightTransactions(trace_type, &inflight); + for (const TransactionStatusPB& inflight_tx : inflight) { + string total_time_str = Substitute("$0 us.", inflight_tx.running_for_micros()); + string description; + if (trace_type == Transaction::TRACE_TXNS) { + description = Substitute("$0, Trace: $1", + inflight_tx.description(), inflight_tx.trace_buffer()); + } else { + description = inflight_tx.description(); + } + + if (!as_text) { + (*output) << Substitute( + "\n", + EscapeForHtmlToString(peer->tablet_id()), + EscapeForHtmlToString(inflight_tx.op_id().ShortDebugString()), + OperationType_Name(inflight_tx.tx_type()), + total_time_str, + EscapeForHtmlToString(description)); + } else { + (*output) << "Tablet: " << peer->tablet_id() << endl; + (*output) << "Op ID: " << inflight_tx.op_id().ShortDebugString() << endl; + (*output) << "Type: " << OperationType_Name(inflight_tx.tx_type()) << endl; + (*output) << "Running: " << total_time_str; + (*output) << description << endl; + (*output) << endl; + } + } + } + + if (!as_text) { + *output << "
    Tablet idOp IdTransaction Type" + "Total time in-flightDescription
    $0$1$2$3$4
    \n"; + } +} + +namespace { +string TabletLink(const string& id) { + return Substitute("$1", + UrlEncodeToString(id), + EscapeForHtmlToString(id)); +} + +bool CompareByTabletId(const scoped_refptr& a, + const scoped_refptr& b) { + return a->tablet_id() < b->tablet_id(); +} + +} // anonymous namespace + +void TabletServerPathHandlers::HandleTabletsPage(const Webserver::WebRequest& req, + std::stringstream *output) { + vector > peers; + tserver_->tablet_manager()->GetTabletPeers(&peers); + std::sort(peers.begin(), peers.end(), &CompareByTabletId); + + *output << "

    Tablets

    \n"; + *output << "\n"; + *output << " " + "" + "\n"; + for (const scoped_refptr& peer : peers) { + TabletStatusPB status; + peer->GetTabletStatusPB(&status); + string id = status.tablet_id(); + string table_name = status.table_name(); + string tablet_id_or_link; + if (peer->tablet() != nullptr) { + tablet_id_or_link = TabletLink(id); + } else { + tablet_id_or_link = EscapeForHtmlToString(id); + } + string n_bytes = ""; + if (status.has_estimated_on_disk_size()) { + n_bytes = HumanReadableNumBytes::ToString(status.estimated_on_disk_size()); + } + string partition = peer->tablet_metadata() + ->partition_schema() + .PartitionDebugString(peer->status_listener()->partition(), + peer->tablet_metadata()->schema()); + + // TODO: would be nice to include some other stuff like memory usage + scoped_refptr consensus = peer->shared_consensus(); + (*output) << Substitute( + // Table name, tablet id, partition + "" + // State, on-disk size, consensus configuration, last status + "\n", + EscapeForHtmlToString(table_name), // $0 + tablet_id_or_link, // $1 + EscapeForHtmlToString(partition), // $2 + EscapeForHtmlToString(peer->HumanReadableState()), n_bytes, // $3, $4 + consensus ? ConsensusStatePBToHtml(consensus->ConsensusState(CONSENSUS_CONFIG_COMMITTED)) + : "", // $5 + EscapeForHtmlToString(status.last_status())); // $6 + } + *output << "
    Table nameTablet IDPartitionStateOn-disk sizeRaftConfigLast status
    $0$1$2$3$4$5$6
    \n"; +} + +namespace { + +bool CompareByMemberType(const RaftPeerPB& a, const RaftPeerPB& b) { + if (!a.has_member_type()) return false; + if (!b.has_member_type()) return true; + return a.member_type() < b.member_type(); +} + +} // anonymous namespace + +string TabletServerPathHandlers::ConsensusStatePBToHtml(const ConsensusStatePB& cstate) const { + std::stringstream html; + + html << "
      \n"; + std::vector sorted_peers; + sorted_peers.assign(cstate.config().peers().begin(), cstate.config().peers().end()); + std::sort(sorted_peers.begin(), sorted_peers.end(), &CompareByMemberType); + for (const RaftPeerPB& peer : sorted_peers) { + string peer_addr_or_uuid = + peer.has_last_known_addr() ? peer.last_known_addr().host() : peer.permanent_uuid(); + peer_addr_or_uuid = EscapeForHtmlToString(peer_addr_or_uuid); + string role_name = RaftPeerPB::Role_Name(GetConsensusRole(peer.permanent_uuid(), cstate)); + string formatted = Substitute("$0: $1", role_name, peer_addr_or_uuid); + // Make the local peer bold. + if (peer.permanent_uuid() == tserver_->instance_pb().permanent_uuid()) { + formatted = Substitute("$0", formatted); + } + + html << Substitute("
    • $0
    • \n", formatted); + } + html << "
    \n"; + return html.str(); +} + +namespace { + +bool GetTabletID(const Webserver::WebRequest& req, string* id, std::stringstream *out) { + if (!FindCopy(req.parsed_args, "id", id)) { + // TODO: webserver should give a way to return a non-200 response code + (*out) << "Tablet missing 'id' argument"; + return false; + } + return true; +} + +bool GetTabletPeer(TabletServer* tserver, const Webserver::WebRequest& req, + scoped_refptr* peer, const string& tablet_id, + std::stringstream *out) { + if (!tserver->tablet_manager()->LookupTablet(tablet_id, peer)) { + (*out) << "Tablet " << EscapeForHtmlToString(tablet_id) << " not found"; + return false; + } + return true; +} + +bool TabletBootstrapping(const scoped_refptr& peer, const string& tablet_id, + std::stringstream* out) { + if (peer->state() == tablet::BOOTSTRAPPING) { + (*out) << "Tablet " << EscapeForHtmlToString(tablet_id) << " is still bootstrapping"; + return false; + } + return true; +} + +// Returns true if the tablet_id was properly specified, the +// tablet is found, and is in a non-bootstrapping state. +bool LoadTablet(TabletServer* tserver, + const Webserver::WebRequest& req, + string* tablet_id, scoped_refptr* peer, + std::stringstream* out) { + if (!GetTabletID(req, tablet_id, out)) return false; + if (!GetTabletPeer(tserver, req, peer, *tablet_id, out)) return false; + if (!TabletBootstrapping(*peer, *tablet_id, out)) return false; + return true; +} + +} // anonymous namespace + +void TabletServerPathHandlers::HandleTabletPage(const Webserver::WebRequest& req, + std::stringstream *output) { + string tablet_id; + scoped_refptr peer; + if (!LoadTablet(tserver_, req, &tablet_id, &peer, output)) return; + + string table_name = peer->tablet_metadata()->table_name(); + + *output << "

    Tablet " << EscapeForHtmlToString(tablet_id) << "

    \n"; + + // Output schema in tabular format. + *output << "

    Schema

    \n"; + const Schema& schema = peer->tablet_metadata()->schema(); + HtmlOutputSchemaTable(schema, output); + + *output << "

    Other Tablet Info Pages

    " << endl; + + // List of links to various tablet-specific info pages + *output << "
      "; + + // Link to output svg of current DiskRowSet layout over keyspace. + *output << "
    • " << Substitute("$1", + UrlEncodeToString(tablet_id), + "Rowset Layout Diagram") + << "
    • " << endl; + + // Link to consensus status page. + *output << "
    • " << Substitute("$1", + UrlEncodeToString(tablet_id), + "Consensus Status") + << "
    • " << endl; + + // Log anchors info page. + *output << "
    • " << Substitute("$1", + UrlEncodeToString(tablet_id), + "Tablet Log Anchors") + << "
    • " << endl; + + // End list + *output << "
    \n"; +} + +void TabletServerPathHandlers::HandleTabletSVGPage(const Webserver::WebRequest& req, + std::stringstream* output) { + string id; + scoped_refptr peer; + if (!LoadTablet(tserver_, req, &id, &peer, output)) return; + shared_ptr tablet = peer->shared_tablet(); + if (!tablet) { + *output << "Tablet " << EscapeForHtmlToString(id) << " not running"; + return; + } + + *output << "

    Rowset Layout Diagram for Tablet " + << TabletLink(id) << "

    \n"; + tablet->PrintRSLayout(output); + +} + +void TabletServerPathHandlers::HandleLogAnchorsPage(const Webserver::WebRequest& req, + std::stringstream* output) { + string tablet_id; + scoped_refptr peer; + if (!LoadTablet(tserver_, req, &tablet_id, &peer, output)) return; + + *output << "

    Log Anchors for Tablet " << EscapeForHtmlToString(tablet_id) << "

    " + << std::endl; + + string dump = peer->log_anchor_registry()->DumpAnchorInfo(); + *output << "
    " << EscapeForHtmlToString(dump) << "
    " << std::endl; +} + +void TabletServerPathHandlers::HandleConsensusStatusPage(const Webserver::WebRequest& req, + std::stringstream* output) { + string id; + scoped_refptr peer; + if (!LoadTablet(tserver_, req, &id, &peer, output)) return; + scoped_refptr consensus = peer->shared_consensus(); + if (!consensus) { + *output << "Tablet " << EscapeForHtmlToString(id) << " not running"; + return; + } + consensus->DumpStatusHtml(*output); +} + +void TabletServerPathHandlers::HandleScansPage(const Webserver::WebRequest& req, + std::stringstream* output) { + *output << "

    Scans

    \n"; + *output << "\n"; + *output << "" + "" + "\n"; + + vector scanners; + tserver_->scanner_manager()->ListScanners(&scanners); + for (const SharedScanner& scanner : scanners) { + *output << ScannerToHtml(*scanner); + } + *output << "
    Tablet idScanner idTotal time in-flightTime since last updateRequestorIterator StatsPushed down key predicatesOther predicates
    "; +} + +string TabletServerPathHandlers::ScannerToHtml(const Scanner& scanner) const { + std::stringstream html; + uint64_t time_in_flight_us = + MonoTime::Now(MonoTime::COARSE).GetDeltaSince(scanner.start_time()).ToMicroseconds(); + uint64_t time_since_last_access_us = + scanner.TimeSinceLastAccess(MonoTime::Now(MonoTime::COARSE)).ToMicroseconds(); + + html << Substitute("$0$1$2 us.$3 us.$4", + EscapeForHtmlToString(scanner.tablet_id()), // $0 + EscapeForHtmlToString(scanner.id()), // $1 + time_in_flight_us, time_since_last_access_us, // $2, $3 + EscapeForHtmlToString(scanner.requestor_string())); // $4 + + + if (!scanner.IsInitialized()) { + html << "<not yet initialized>"; + return html.str(); + } + + const Schema* projection = &scanner.iter()->schema(); + + vector stats; + scanner.GetIteratorStats(&stats); + CHECK_EQ(stats.size(), projection->num_columns()); + html << Substitute("$0", IteratorStatsToHtml(*projection, stats)); + scoped_refptr tablet_peer; + if (!tserver_->tablet_manager()->LookupTablet(scanner.tablet_id(), &tablet_peer)) { + html << Substitute("Tablet $0 is no longer valid.\n", + scanner.tablet_id()); + } else { + string range_pred_str; + vector other_preds; + const ScanSpec& spec = scanner.spec(); + if (spec.lower_bound_key() || spec.exclusive_upper_bound_key()) { + range_pred_str = EncodedKey::RangeToString(spec.lower_bound_key(), + spec.exclusive_upper_bound_key()); + } + for (const ColumnRangePredicate& pred : scanner.spec().predicates()) { + other_preds.push_back(pred.ToString()); + } + string other_pred_str = JoinStrings(other_preds, "\n"); + html << Substitute("$0$1\n", + EscapeForHtmlToString(range_pred_str), + EscapeForHtmlToString(other_pred_str)); + } + return html.str(); +} + +string TabletServerPathHandlers::IteratorStatsToHtml(const Schema& projection, + const vector& stats) const { + std::stringstream html; + html << "\n"; + html << "" + << "" + << "" + << "" + << "\n"; + for (size_t idx = 0; idx < stats.size(); idx++) { + // 
We use 'title' attributes so that if the user hovers over the value, they get a + // human-readable tooltip. + html << Substitute("" + "" + "" + "" + "" + "\n", + EscapeForHtmlToString(projection.column(idx).name()), // $0 + HumanReadableInt::ToString(stats[idx].data_blocks_read_from_disk), // $1 + stats[idx].data_blocks_read_from_disk, // $2 + HumanReadableNumBytes::ToString(stats[idx].bytes_read_from_disk), // $3 + stats[idx].bytes_read_from_disk, // $4 + HumanReadableInt::ToString(stats[idx].cells_read_from_disk), // $5 + stats[idx].cells_read_from_disk); // $6 + } + html << "
    ColumnBlocks read from diskBytes read from diskCells read from disk
    $0$2$4$6
    \n"; + return html.str(); +} + +void TabletServerPathHandlers::HandleDashboardsPage(const Webserver::WebRequest& req, + std::stringstream* output) { + + *output << "

    Dashboards

    \n"; + *output << "\n"; + *output << " \n"; + *output << GetDashboardLine("scans", "Scans", "List of scanners that are currently running."); + *output << GetDashboardLine("transactions", "Transactions", "List of transactions that are " + "currently running."); + *output << GetDashboardLine("maintenance-manager", "Maintenance Manager", + "List of operations that are currently running and those " + "that are registered."); +} + +string TabletServerPathHandlers::GetDashboardLine(const std::string& link, + const std::string& text, + const std::string& desc) { + return Substitute(" \n", + EscapeForHtmlToString(link), + EscapeForHtmlToString(text), + EscapeForHtmlToString(desc)); +} + +void TabletServerPathHandlers::HandleMaintenanceManagerPage(const Webserver::WebRequest& req, + std::stringstream* output) { + MaintenanceManager* manager = tserver_->maintenance_manager(); + MaintenanceManagerStatusPB pb; + manager->GetMaintenanceManagerStatusDump(&pb); + if (ContainsKey(req.parsed_args, "raw")) { + *output << pb.DebugString(); + return; + } + + int ops_count = pb.registered_operations_size(); + + *output << "

    Maintenance Manager state

    \n"; + *output << "

    Running operations

    \n"; + *output << "
    DashboardDescription
    $1$2
    \n"; + *output << " \n"; + for (int i = 0; i < ops_count; i++) { + MaintenanceManagerStatusPB_MaintenanceOpPB op_pb = pb.registered_operations(i); + if (op_pb.running() > 0) { + *output << Substitute("\n", + EscapeForHtmlToString(op_pb.name()), + op_pb.running()); + } + } + *output << "
    NameInstances running
    $0$1
    \n"; + + *output << "

    Recent completed operations

    \n"; + *output << "\n"; + *output << " \n"; + for (int i = 0; i < pb.completed_operations_size(); i++) { + MaintenanceManagerStatusPB_CompletedOpPB op_pb = pb.completed_operations(i); + *output << Substitute("\n", + EscapeForHtmlToString(op_pb.name()), + HumanReadableElapsedTime::ToShortString( + op_pb.duration_millis() / 1000.0), + HumanReadableElapsedTime::ToShortString( + op_pb.secs_since_start())); + } + *output << "
    NameDurationTime since op started
    $0$1$2
    \n"; + + *output << "

    Non-running operations

    \n"; + *output << "\n"; + *output << " \n" + << " \n"; + for (int i = 0; i < ops_count; i++) { + MaintenanceManagerStatusPB_MaintenanceOpPB op_pb = pb.registered_operations(i); + if (op_pb.running() == 0) { + *output << Substitute("\n", + EscapeForHtmlToString(op_pb.name()), + op_pb.runnable(), + HumanReadableNumBytes::ToString(op_pb.ram_anchored_bytes()), + HumanReadableNumBytes::ToString(op_pb.logs_retained_bytes()), + op_pb.perf_improvement()); + } + } + *output << "
    NameRunnableRAM anchoredLogs retainedPerf
    $0$1$2$3$4
    \n"; +} + +} // namespace tserver +} // namespace kudu diff --git a/src/kudu/tserver/tserver-path-handlers.h b/src/kudu/tserver/tserver-path-handlers.h new file mode 100644 index 000000000000..6cd1f2ac9745 --- /dev/null +++ b/src/kudu/tserver/tserver-path-handlers.h @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TSERVER_TSERVER_PATH_HANDLERS_H +#define KUDU_TSERVER_TSERVER_PATH_HANDLERS_H + +#include "kudu/gutil/macros.h" +#include "kudu/server/webserver.h" +#include +#include +#include + +namespace kudu { + +class Schema; +struct IteratorStats; + +namespace consensus { +class ConsensusStatePB; +} // namespace consensus + +namespace tserver { + +class TabletServer; +class Scanner; + +class TabletServerPathHandlers { + public: + explicit TabletServerPathHandlers(TabletServer* tserver) + : tserver_(tserver) { + } + + ~TabletServerPathHandlers(); + + Status Register(Webserver* server); + + private: + void HandleScansPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleTabletsPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleTabletPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleTransactionsPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleTabletSVGPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleLogAnchorsPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleConsensusStatusPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleDashboardsPage(const Webserver::WebRequest& req, + std::stringstream* output); + void HandleMaintenanceManagerPage(const Webserver::WebRequest& req, + std::stringstream* output); + std::string ConsensusStatePBToHtml(const consensus::ConsensusStatePB& cstate) const; + std::string ScannerToHtml(const Scanner& scanner) const; + std::string IteratorStatsToHtml(const Schema& projection, + const std::vector& stats) const; + std::string GetDashboardLine(const std::string& link, + const std::string& text, const std::string& desc); + + TabletServer* tserver_; + + DISALLOW_COPY_AND_ASSIGN(TabletServerPathHandlers); +}; + +} // namespace tserver +} // namespace kudu +#endif /* KUDU_TSERVER_TSERVER_PATH_HANDLERS_H */ diff --git 
a/src/kudu/tserver/tserver.proto b/src/kudu/tserver/tserver.proto new file mode 100644 index 000000000000..1762b220666a --- /dev/null +++ b/src/kudu/tserver/tserver.proto @@ -0,0 +1,329 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.tserver; + +option java_package = "org.kududb.tserver"; + +import "kudu/common/common.proto"; +import "kudu/common/wire_protocol.proto"; +import "kudu/tablet/tablet.proto"; + +// Tablet-server specific errors use this protobuf. +message TabletServerErrorPB { + enum Code { + // An error which has no more specific error code. + // The code and message in 'status' may reveal more details. + // + // RPCs should avoid returning this, since callers will not be + // able to easily parse the error. + UNKNOWN_ERROR = 1; + + // The schema provided for a request was not well-formed. + INVALID_SCHEMA = 2; + + // The row data provided for a request was not well-formed. + INVALID_ROW_BLOCK = 3; + + // The mutations or mutation keys provided for a request were + // not well formed. + INVALID_MUTATION = 4; + + // The schema provided for a request didn't match the actual + // schema of the tablet. 
+ MISMATCHED_SCHEMA = 5; + + // The requested tablet_id is not currently hosted on this server. + TABLET_NOT_FOUND = 6; + + // A request was made against a scanner ID that was either never + // created or has expired. + SCANNER_EXPIRED = 7; + + // An invalid scan was specified -- e.g the values passed for + // predicates were incorrect sizes. + INVALID_SCAN_SPEC = 8; + + // The provided configuration was not well-formed and/or + // had a sequence number that was below the current config. + INVALID_CONFIG = 9; + + // On a create tablet request, signals that the tablet already exists. + TABLET_ALREADY_EXISTS = 10; + + // If the tablet has a newer schema than the requested one the "alter" + // request will be rejected with this error. + TABLET_HAS_A_NEWER_SCHEMA = 11; + + // The tablet is hosted on this server, but not in RUNNING state. + TABLET_NOT_RUNNING = 12; + + // Client requested a snapshot read but the snapshot was invalid. + INVALID_SNAPSHOT = 13; + + // An invalid scan call sequence ID was specified. + INVALID_SCAN_CALL_SEQ_ID = 14; + + // This tserver is not the leader of the consensus configuration. + NOT_THE_LEADER = 15; + + // The destination UUID in the request does not match this server. + WRONG_SERVER_UUID = 16; + + // The compare-and-swap specified by an atomic RPC operation failed. + CAS_FAILED = 17; + } + + // The error code. + required Code code = 1 [ default = UNKNOWN_ERROR ]; + + // The Status object for the error. This will include a textual + // message that may be more useful to present in log messages, etc, + // though its error code is less specific. + required AppStatusPB status = 2; +} + + +message PingRequestPB { +} + +message PingResponsePB { +} + +// A batched set of insert/mutate requests. +message WriteRequestPB { + required bytes tablet_id = 1; + + // The schema as seen by the client. This may be out-of-date, in which case + // it will be projected to the current schema automatically, with defaults/NULLs + // being filled in. 
+ optional SchemaPB schema = 2; + + // Operations to perform (insert/update/delete) + optional RowOperationsPB row_operations = 3; + + // The required consistency mode for this write. + optional ExternalConsistencyMode external_consistency_mode = 4 [default = CLIENT_PROPAGATED]; + + // A timestamp obtained by the client from a previous request. + // TODO crypto sign this and propagate the signature along with + // the timestamp. + optional fixed64 propagated_timestamp = 5; +} + +message WriteResponsePB { + // If the entire WriteResponsePB request failed, the error status that + // caused the failure. This type of error is triggered for + // cases such as the tablet not being on this server, or the + // schema not matching. If any error specific to a given row + // occurs, this error will be recorded in per_row_errors below, + // even if all rows failed. + optional TabletServerErrorPB error = 1; + + // If errors occurred with particular row operations, then the errors + // for those operations will be passed back in 'per_row_errors'. + message PerRowErrorPB { + // The index of the row in the incoming batch. + required int32 row_index = 1; + // The error that occurred. + required AppStatusPB error = 2; + } + repeated PerRowErrorPB per_row_errors = 2; + + // The timestamp chosen by the server for this write. + // TODO KUDU-611 propagate timestamps with server signature. + optional fixed64 timestamp = 3; +} + +// A list tablets request +message ListTabletsRequestPB { +} + +// A list tablets response +message ListTabletsResponsePB { + optional TabletServerErrorPB error = 1; + + message StatusAndSchemaPB { + required tablet.TabletStatusPB tablet_status = 1; + required SchemaPB schema = 2; + optional PartitionSchemaPB partition_schema = 3; + } + + repeated StatusAndSchemaPB status_and_schema = 2; +} + +// A range predicate on one of the columns in the underlying +// data. 
+message ColumnRangePredicatePB { + required ColumnSchemaPB column = 1; + + // These bounds should be encoded as follows: + // - STRING values: simply the exact string value for the bound. + // - other type: the canonical x86 in-memory representation -- eg for + // uint32s, a little-endian value. + // + // Note that this predicate type should not be used for NULL data -- + // NULL is defined to neither be greater than or less than other values + // for the comparison operator. We will eventually add a special + // predicate type for null-ness. + optional bytes lower_bound = 2; + optional bytes upper_bound = 3; +} + +// List of predicates used by the Java client. Will rapidly evolve into something more reusable +// as a way to pass scanner configurations. +message ColumnRangePredicateListPB { + repeated ColumnRangePredicatePB range_predicates = 1; +} + +message NewScanRequestPB { + // The tablet to scan. + required bytes tablet_id = 1; + + // The maximum number of rows to scan. + // The scanner will automatically stop yielding results and close + // itself after reaching this number of result rows. + optional uint64 limit = 2; + + // Any column range predicates to enforce. + repeated ColumnRangePredicatePB range_predicates = 3; + + // Encoded primary key to begin scanning at (inclusive). + optional bytes start_primary_key = 8; + // Encoded primary key to stop scanning at (exclusive). + optional bytes stop_primary_key = 9; + + // Which columns to select. + // if this is an empty list, no data will be returned, but the num_rows + // field of the returned RowBlock will indicate how many rows passed + // the predicates. Note that in some cases, the scan may still require + // multiple round-trips, and the caller must aggregate the counts. + repeated ColumnSchemaPB projected_columns = 4; + + // The read mode for this scan request. + // See common.proto for further information about read modes. 
+ optional ReadMode read_mode = 5 [default = READ_LATEST]; + + // The requested snapshot timestamp. This is only used + // when the read mode is set to READ_AT_SNAPSHOT. + optional fixed64 snap_timestamp = 6; + + // Sent by clients which previously executed CLIENT_PROPAGATED writes. + // This updates the server's time so that no transaction will be assigned + // a timestamp lower than or equal to 'previous_known_timestamp' + optional fixed64 propagated_timestamp = 7; + + // Whether data blocks will be cached when read from the files or discarded after use. + // Disable this to lower cache churn when doing large scans. + optional bool cache_blocks = 10 [default = true]; + + // Whether to order the returned rows by primary key. + // This is used for scanner fault-tolerance. + optional OrderMode order_mode = 11 [default = UNORDERED]; + + // If retrying a scan, the final primary key retrieved in the previous scan + // attempt. If set, this will take precedence over the `start_primary_key` + // field, and functions as an exclusive start primary key. + optional bytes last_primary_key = 12; +} + +// A scan request. Initially, it should specify a scan. Later on, you +// can use the scanner id returned to fetch result batches with a different +// scan request. +// +// The scanner will remain open if there are more results, and it's not +// asked to be closed explicitly. Some errors on the Tablet Server may +// close the scanner automatically if the scanner state becomes +// inconsistent. +// +// Clients may choose to retry scan requests that fail to complete (due to, for +// example, a timeout or network error). If a scan request completes with an +// error result, the scanner should be closed by the client. +// +// You can fetch the results and ask the scanner to be closed to save +// a trip if you are not interested in remaining results. +// +// This is modeled somewhat after HBase's scanner API. 
+message ScanRequestPB { + // If continuing an existing scan, then you must set scanner_id. + // Otherwise, you must set 'new_scan_request'. + optional bytes scanner_id = 1; + optional NewScanRequestPB new_scan_request = 2; + + // The sequence ID of this call. The sequence ID should start at 0 + // with the request for a new scanner, and after each successful request, + // the client should increment it by 1. When retrying a request, the client + // should _not_ increment this value. If the server detects that the client + // missed a chunk of rows from the middle of a scan, it will respond with an + // error. + optional uint32 call_seq_id = 3; + + // The maximum number of bytes to send in the response. + // This is a hint, not a requirement: the server may send + // arbitrarily fewer or more bytes than requested. + optional uint32 batch_size_bytes = 4; + + // If set, the server will close the scanner after responding to + // this request, regardless of whether all rows have been delivered. + // In order to simply close a scanner without selecting any rows, you + // may set batch_size_bytes to 0 in conjunction with setting this flag. + optional bool close_scanner = 5; +} + +message ScanResponsePB { + // The error, if an error occurred with this request. + optional TabletServerErrorPB error = 1; + + // When a scanner is created, returns the scanner ID which may be used + // to pull new rows from the scanner. + optional bytes scanner_id = 2; + + // Set to true to indicate that there may be further results to be fetched + // from this scanner. If the scanner has no more results, then the scanner + // ID will become invalid and cannot continue to be used. + // + // Note that if a scan returns no results, then the initial response from + // the first RPC may return false in this flag, in which case there will + // be no scanner ID assigned. + optional bool has_more_results = 3; + + // The block of returned rows. 
+ // + // NOTE: the schema-related fields will not be present in this row block. + // The schema will match the schema requested by the client when it created + // the scanner. + optional RowwiseRowBlockPB data = 4; + + // The snapshot timestamp at which the scan was executed. This is only set + // in the first response (i.e. the response to the request that had + // 'new_scan_request' set) and only for READ_AT_SNAPSHOT scans. + optional fixed64 snap_timestamp = 6; + + // If this is a fault-tolerant scanner, this is set to the encoded primary + // key of the last row returned in the response. + optional bytes last_primary_key = 7; +} + +// A scanner keep-alive request. +// Updates the scanner access time, increasing its time-to-live. +message ScannerKeepAliveRequestPB { + required bytes scanner_id = 1; +} + +message ScannerKeepAliveResponsePB { + // The error, if an error occurred with this request. + optional TabletServerErrorPB error = 1; +} \ No newline at end of file diff --git a/src/kudu/tserver/tserver_admin.proto b/src/kudu/tserver/tserver_admin.proto new file mode 100644 index 000000000000..e971af85c52c --- /dev/null +++ b/src/kudu/tserver/tserver_admin.proto @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.tserver; + +option java_package = "org.kududb.tserver"; + +import "kudu/common/common.proto"; +import "kudu/consensus/metadata.proto"; +import "kudu/tablet/metadata.proto"; +import "kudu/tserver/tserver.proto"; + +message AlterSchemaRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 5; + + required bytes tablet_id = 1; + + // TODO: Replace with the table descriptor + // (Schema, Column IDs, Column Attributes) + required SchemaPB schema = 2; + + required uint32 schema_version = 3; + optional string new_table_name = 4; +} + +message AlterSchemaResponsePB { + optional TabletServerErrorPB error = 1; + + // The timestamp chosen by the server for this alter schema operation. + // TODO KUDU-611 propagate timestamps with server signature. + optional fixed64 timestamp = 2; +} + +// A create tablet request. +message CreateTabletRequestPB { + // UUID of server this request is addressed to. + optional bytes dest_uuid = 8; + + required bytes table_id = 1; + required bytes tablet_id = 2; + // DEPRECATED. + optional bytes start_key = 3; + // DEPRECATED. + optional bytes end_key = 4; + // The partition of the tablet. + optional PartitionPB partition = 9; + + required string table_name = 5; + required SchemaPB schema = 6; + // The partition schema of the table which the tablet belongs to. + optional PartitionSchemaPB partition_schema = 10; + + // Initial consensus configuration for the tablet. + required consensus.RaftConfigPB config = 7; +} + +message CreateTabletResponsePB { + optional TabletServerErrorPB error = 1; +} + +// A delete tablet request. +message DeleteTabletRequestPB { + // UUID of server this request is addressed to. 
+ optional bytes dest_uuid = 4; + + required bytes tablet_id = 1; + + // Reason the tablet is being deleted (for logging purposes) + optional string reason = 2; + + // Must be one of TABLET_DATA_DELETED (for table deletes) or + // TABLET_DATA_TOMBSTONED (for replica retirement). + optional tablet.TabletDataState delete_type = 3 [ default = TABLET_DATA_TOMBSTONED ]; + + // The highest allowed OpId index of the latest known committed config. + // This optional parameter is here to provide an atomic (compare-and-swap) + // DeleteTablet operation. If this parameter is specified, the DeleteTablet() + // operation will succeed only if the committed config has an opid_index that + // is less than or equal to this value. + // See also the definition of RaftConfigPB. + // Note: At the time of this writing, there is a small race between checking + // the value of the committed config opid index and shutting down the tablet + // for deletion. See comments in ts_tablet_manager.cc + optional int64 cas_config_opid_index_less_or_equal = 5; +} + +message DeleteTabletResponsePB { + optional TabletServerErrorPB error = 1; +} + +// Enum of the server's Tablet Manager state: currently this is only +// used for assertions, but this can also be sent to the master. +enum TSTabletManagerStatePB { + UNKNOWN = 999; + + // Indicates that Tablet Manager is initializing. + MANAGER_INITIALIZING = 0; + + // Indicates that Tablet Manager is running and can create new + // tablets. + MANAGER_RUNNING = 1; + + // Indicates that tablet manager is shutting down and no new tablets + // can be created. + MANAGER_QUIESCING = 2; + + // Tablet Manager has shutdown. + MANAGER_SHUTDOWN = 3; +} + +service TabletServerAdminService { + // Create a new, empty tablet with the specified parameters. Only used for + // brand-new tablets, not for "moves". + rpc CreateTablet(CreateTabletRequestPB) returns (CreateTabletResponsePB); + + // Delete a tablet replica. 
+ rpc DeleteTablet(DeleteTabletRequestPB) returns (DeleteTabletResponsePB); + + // Alter a tablet's schema. + rpc AlterSchema(AlterSchemaRequestPB) returns (AlterSchemaResponsePB); +} diff --git a/src/kudu/tserver/tserver_service.proto b/src/kudu/tserver/tserver_service.proto new file mode 100644 index 000000000000..b6010fd30982 --- /dev/null +++ b/src/kudu/tserver/tserver_service.proto @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu.tserver; + +option java_package = "org.kududb.tserver"; + +import "kudu/tserver/tserver.proto"; + +service TabletServerService { + + rpc Ping(PingRequestPB) returns (PingResponsePB); + rpc Write(WriteRequestPB) returns (WriteResponsePB); + rpc Scan(ScanRequestPB) returns (ScanResponsePB); + rpc ScannerKeepAlive(ScannerKeepAliveRequestPB) returns (ScannerKeepAliveResponsePB); + rpc ListTablets(ListTabletsRequestPB) returns (ListTabletsResponsePB); + + // Run full-scan data checksum on a tablet to verify data integrity. + // + // TODO: Consider refactoring this as a scan that runs a checksum aggregation + // function. 
+ rpc Checksum(ChecksumRequestPB) + returns (ChecksumResponsePB); +} + +message ChecksumRequestPB { + // Only one of 'new_request' or 'continue_request' should be specified. + optional NewScanRequestPB new_request = 1; + optional ContinueChecksumRequestPB continue_request = 2; + + // See documentation for ScanRequestPB for info about these fields. + optional uint32 call_seq_id = 3; + optional uint32 batch_size_bytes = 4; + optional bool close_scanner = 5; +} + +message ContinueChecksumRequestPB { + // Scanner ID returned from a previous request. + required bytes scanner_id = 1; + + // Checksum returned from a previous request. + required uint64 previous_checksum = 2; +} + +message ChecksumResponsePB { + // Error message, if any. + optional TabletServerErrorPB error = 1; + + // The (possibly partial) checksum of the tablet data. + // This checksum is only complete if 'has_more_results' is false. + optional uint64 checksum = 2; + + // See documentation for ScanResponsePB for info about these fields. + optional bytes scanner_id = 3; + optional bool has_more_results = 4; + optional fixed64 snap_timestamp = 5; +} diff --git a/src/kudu/twitter-demo/CMakeLists.txt b/src/kudu/twitter-demo/CMakeLists.txt new file mode 100644 index 000000000000..5c261e29f27a --- /dev/null +++ b/src/kudu/twitter-demo/CMakeLists.txt @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +find_library(LIBOAUTH_LIBRARY NAMES oauth) +if(NOT LIBOAUTH_LIBRARY) + message(WARNING "liboauth not found on system. Skipping twitter demo") +else() + + add_library(twitter_demo + oauth.cc + parser.cc + insert_consumer.cc + twitter_streamer.cc) + + target_link_libraries(twitter_demo + gutil + kudu_util + kudu_test_util) + + target_link_libraries(twitter_demo + kudu_client + ${LIBOAUTH_LIBRARY} + ${CURL_LIBRARIES} + ${KUDU_BASE_LIBS}) + + # Require that the tserver protobuf code is generated first + add_dependencies(twitter_demo + tserver_proto) + + add_executable(ingest_firehose ingest_firehose.cc) + target_link_libraries(ingest_firehose + twitter_demo) + + # Tests + ADD_KUDU_TEST(oauth-test) + # parser-test relies on symlinked data files which we can't currently copy correctly + # to the cluster. + ADD_KUDU_TEST(parser-test LABELS no_dist_test) + if(NOT "${NO_TESTS}") + target_link_libraries(oauth-test + twitter_demo) + target_link_libraries(parser-test + twitter_demo) + execute_process(COMMAND ln -sf ${CMAKE_CURRENT_SOURCE_DIR}/example-tweets.txt + ${EXECUTABLE_OUTPUT_PATH}) + execute_process(COMMAND ln -sf ${CMAKE_CURRENT_SOURCE_DIR}/example-deletes.txt + ${EXECUTABLE_OUTPUT_PATH}) + endif() + +endif() # library checks diff --git a/src/kudu/twitter-demo/README b/src/kudu/twitter-demo/README new file mode 100644 index 000000000000..9d54c6e2b9a9 --- /dev/null +++ b/src/kudu/twitter-demo/README @@ -0,0 +1,24 @@ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This directory contains a demo which ingests the Twitter firehose +into Kudu. + +Building this requires some dependencies which are not in our thirdparty/ +directory: + Ubuntu: apt-get -y install liboauth-dev libcurl-dev + RHEL6: yum -y install liboauth-devel curl-devel + +By default, the demo uses the KuduProject twitter account's API keys +to connect to the sample firehose. Use the available command-line +flags to use a different account. diff --git a/src/kudu/twitter-demo/example-deletes.txt b/src/kudu/twitter-demo/example-deletes.txt new file mode 100644 index 000000000000..640f1874b810 --- /dev/null +++ b/src/kudu/twitter-demo/example-deletes.txt @@ -0,0 +1,163 @@ +{"delete":{"status":{"id":267057648111853568,"user_id":606114977,"id_str":"267057648111853568","user_id_str":"606114977"}}} +{"delete":{"status":{"id":365610819137970176,"user_id":1491343880,"id_str":"365610819137970176","user_id_str":"1491343880"}}} +{"delete":{"status":{"id":365600266294272002,"user_id":1655650302,"id_str":"365600266294272002","user_id_str":"1655650302"}}} +{"delete":{"status":{"id":272060894937235456,"user_id":294010962,"id_str":"272060894937235456","user_id_str":"294010962"}}} +{"delete":{"status":{"id":267023015743733761,"user_id":316010929,"id_str":"267023015743733761","user_id_str":"316010929"}}} +{"delete":{"status":{"id":365511472878596097,"user_id":370760011,"id_str":"365511472878596097","user_id_str":"370760011"}}} +{"delete":{"status":{"id":365399585625083904,"user_id":1274118158,"id_str":"365399585625083904","user_id_str":"1274118158"}}} 
+{"delete":{"status":{"id":365612903728021504,"user_id":381239891,"id_str":"365612903728021504","user_id_str":"381239891"}}} +{"delete":{"status":{"id":361066433372884995,"user_id":118787650,"id_str":"361066433372884995","user_id_str":"118787650"}}} +{"delete":{"status":{"id":364961268244819968,"user_id":1212003986,"id_str":"364961268244819968","user_id_str":"1212003986"}}} +{"delete":{"status":{"id":365422641676419072,"user_id":1449650708,"id_str":"365422641676419072","user_id_str":"1449650708"}}} +{"delete":{"status":{"id":361473696109568001,"user_id":750382290,"id_str":"361473696109568001","user_id_str":"750382290"}}} +{"delete":{"status":{"id":354267604992528387,"user_id":560763818,"id_str":"354267604992528387","user_id_str":"560763818"}}} +{"delete":{"status":{"id":234395352986447874,"user_id":584161811,"id_str":"234395352986447874","user_id_str":"584161811"}}} +{"delete":{"status":{"id":307906499051671553,"user_id":1131831090,"id_str":"307906499051671553","user_id_str":"1131831090"}}} +{"delete":{"status":{"id":361679875481927681,"user_id":702099402,"id_str":"361679875481927681","user_id_str":"702099402"}}} +{"delete":{"status":{"id":234370497515765760,"user_id":584161811,"id_str":"234370497515765760","user_id_str":"584161811"}}} +{"delete":{"status":{"id":364278599152443392,"user_id":45978919,"id_str":"364278599152443392","user_id_str":"45978919"}}} +{"delete":{"status":{"id":343144877028147202,"user_id":471589368,"id_str":"343144877028147202","user_id_str":"471589368"}}} +{"delete":{"status":{"id":334010391749332993,"user_id":471589368,"id_str":"334010391749332993","user_id_str":"471589368"}}} +{"delete":{"status":{"id":365611838345457664,"user_id":363033026,"id_str":"365611838345457664","user_id_str":"363033026"}}} +{"delete":{"status":{"id":365612907897176065,"user_id":1122270540,"id_str":"365612907897176065","user_id_str":"1122270540"}}} 
+{"delete":{"status":{"id":265755727782621184,"user_id":117731627,"id_str":"265755727782621184","user_id_str":"117731627"}}} +{"delete":{"status":{"id":310588303491620866,"user_id":471589368,"id_str":"310588303491620866","user_id_str":"471589368"}}} +{"delete":{"status":{"id":310012329079230465,"user_id":471589368,"id_str":"310012329079230465","user_id_str":"471589368"}}} +{"delete":{"status":{"id":352916745490673664,"user_id":1287517861,"id_str":"352916745490673664","user_id_str":"1287517861"}}} +{"delete":{"status":{"id":234360674489352193,"user_id":584161811,"id_str":"234360674489352193","user_id_str":"584161811"}}} +{"delete":{"status":{"id":313055527305359360,"user_id":402465302,"id_str":"313055527305359360","user_id_str":"402465302"}}} +{"delete":{"status":{"id":267181916946579456,"user_id":606114977,"id_str":"267181916946579456","user_id_str":"606114977"}}} +{"delete":{"status":{"id":85303805117349889,"user_id":261218323,"id_str":"85303805117349889","user_id_str":"261218323"}}} +{"delete":{"status":{"id":84382282953146368,"user_id":261218323,"id_str":"84382282953146368","user_id_str":"261218323"}}} +{"delete":{"status":{"id":308412227277430784,"user_id":471589368,"id_str":"308412227277430784","user_id_str":"471589368"}}} +{"delete":{"status":{"id":365607040078450688,"user_id":1650893678,"id_str":"365607040078450688","user_id_str":"1650893678"}}} +{"delete":{"status":{"id":365608864592691200,"user_id":1116595795,"id_str":"365608864592691200","user_id_str":"1116595795"}}} +{"delete":{"status":{"id":365560307155869696,"user_id":970784143,"id_str":"365560307155869696","user_id_str":"970784143"}}} +{"delete":{"status":{"id":343060730876149760,"user_id":995216024,"id_str":"343060730876149760","user_id_str":"995216024"}}} +{"delete":{"status":{"id":365612912104046592,"user_id":363303529,"id_str":"365612912104046592","user_id_str":"363303529"}}} +{"delete":{"status":{"id":365352911376621568,"user_id":23126868,"id_str":"365352911376621568","user_id_str":"23126868"}}} 
+{"delete":{"status":{"id":365612433957588992,"user_id":445619455,"id_str":"365612433957588992","user_id_str":"445619455"}}} +{"delete":{"status":{"id":365576128041201666,"user_id":187011137,"id_str":"365576128041201666","user_id_str":"187011137"}}} +{"delete":{"status":{"id":364881962344792064,"user_id":389703571,"id_str":"364881962344792064","user_id_str":"389703571"}}} +{"delete":{"status":{"id":234300540736118784,"user_id":584161811,"id_str":"234300540736118784","user_id_str":"584161811"}}} +{"delete":{"status":{"id":258368467898605568,"user_id":199020443,"id_str":"258368467898605568","user_id_str":"199020443"}}} +{"delete":{"status":{"id":363778331935649792,"user_id":597343507,"id_str":"363778331935649792","user_id_str":"597343507"}}} +{"delete":{"status":{"id":365608025739886592,"user_id":630746058,"id_str":"365608025739886592","user_id_str":"630746058"}}} +{"delete":{"status":{"id":314896608691101697,"user_id":294466600,"id_str":"314896608691101697","user_id_str":"294466600"}}} +{"delete":{"status":{"id":355389484864503810,"user_id":1495601640,"id_str":"355389484864503810","user_id_str":"1495601640"}}} +{"delete":{"status":{"id":338626156192481282,"user_id":1143664200,"id_str":"338626156192481282","user_id_str":"1143664200"}}} +{"delete":{"status":{"id":302552201568604160,"user_id":555211712,"id_str":"302552201568604160","user_id_str":"555211712"}}} +{"delete":{"status":{"id":365609359515987969,"user_id":86393029,"id_str":"365609359515987969","user_id_str":"86393029"}}} +{"delete":{"status":{"id":307013628203237377,"user_id":346960220,"id_str":"307013628203237377","user_id_str":"346960220"}}} +{"delete":{"status":{"id":365612928860291072,"user_id":882516884,"id_str":"365612928860291072","user_id_str":"882516884"}}} +{"delete":{"status":{"id":300877814440275968,"user_id":555211712,"id_str":"300877814440275968","user_id_str":"555211712"}}} 
+{"delete":{"status":{"id":190221450777604096,"user_id":274804222,"id_str":"190221450777604096","user_id_str":"274804222"}}} +{"delete":{"status":{"id":78450173965115392,"user_id":261218323,"id_str":"78450173965115392","user_id_str":"261218323"}}} +{"delete":{"status":{"id":211993210422181891,"user_id":245030912,"id_str":"211993210422181891","user_id_str":"245030912"}}} +{"delete":{"status":{"id":360183674962051073,"user_id":1360590798,"id_str":"360183674962051073","user_id_str":"1360590798"}}} +{"delete":{"status":{"id":245174718452338688,"user_id":777174108,"id_str":"245174718452338688","user_id_str":"777174108"}}} +{"delete":{"status":{"id":299251351429476352,"user_id":555211712,"id_str":"299251351429476352","user_id_str":"555211712"}}} +{"delete":{"status":{"id":190148453106585603,"user_id":274804222,"id_str":"190148453106585603","user_id_str":"274804222"}}} +{"delete":{"status":{"id":188852253778644996,"user_id":44686062,"id_str":"188852253778644996","user_id_str":"44686062"}}} +{"delete":{"status":{"id":234076464235020290,"user_id":584161811,"id_str":"234076464235020290","user_id_str":"584161811"}}} +{"delete":{"status":{"id":234072672601010178,"user_id":584161811,"id_str":"234072672601010178","user_id_str":"584161811"}}} +{"delete":{"status":{"id":365609753809928192,"user_id":368521768,"id_str":"365609753809928192","user_id_str":"368521768"}}} +{"delete":{"status":{"id":365083746124300288,"user_id":1212003986,"id_str":"365083746124300288","user_id_str":"1212003986"}}} +{"delete":{"status":{"id":205046444737040385,"user_id":393230991,"id_str":"205046444737040385","user_id_str":"393230991"}}} +{"delete":{"status":{"id":255834902757507073,"user_id":199020443,"id_str":"255834902757507073","user_id_str":"199020443"}}} +{"delete":{"status":{"id":365609753772171264,"user_id":348682214,"id_str":"365609753772171264","user_id_str":"348682214"}}} +{"delete":{"status":{"id":234060467167965184,"user_id":584161811,"id_str":"234060467167965184","user_id_str":"584161811"}}} 
+{"delete":{"status":{"id":365609816699318273,"user_id":433471019,"id_str":"365609816699318273","user_id_str":"433471019"}}} +{"delete":{"status":{"id":234055022973964288,"user_id":584161811,"id_str":"234055022973964288","user_id_str":"584161811"}}} +{"delete":{"status":{"id":365612920492662785,"user_id":340870559,"id_str":"365612920492662785","user_id_str":"340870559"}}} +{"delete":{"status":{"id":306632571524087808,"user_id":346960220,"id_str":"306632571524087808","user_id_str":"346960220"}}} +{"delete":{"status":{"id":365606624829771776,"user_id":100525053,"id_str":"365606624829771776","user_id_str":"100525053"}}} +{"delete":{"status":{"id":365612513632591872,"user_id":807637609,"id_str":"365612513632591872","user_id_str":"807637609"}}} +{"delete":{"status":{"id":306443022533529600,"user_id":346960220,"id_str":"306443022533529600","user_id_str":"346960220"}}} +{"delete":{"status":{"id":210524453241765888,"user_id":245030912,"id_str":"210524453241765888","user_id_str":"245030912"}}} +{"delete":{"status":{"id":210384103424536576,"user_id":245030912,"id_str":"210384103424536576","user_id_str":"245030912"}}} +{"delete":{"status":{"id":321354784064016384,"user_id":221576287,"id_str":"321354784064016384","user_id_str":"221576287"}}} +{"delete":{"status":{"id":365612815618281473,"user_id":1209234702,"id_str":"365612815618281473","user_id_str":"1209234702"}}} +{"delete":{"status":{"id":321342712848719872,"user_id":221576287,"id_str":"321342712848719872","user_id_str":"221576287"}}} +{"delete":{"status":{"id":365612765303422978,"user_id":333604624,"id_str":"365612765303422978","user_id_str":"333604624"}}} +{"delete":{"status":{"id":365506699739672576,"user_id":140727979,"id_str":"365506699739672576","user_id_str":"140727979"}}} +{"delete":{"status":{"id":306409069634265088,"user_id":1131831090,"id_str":"306409069634265088","user_id_str":"1131831090"}}} 
+{"delete":{"status":{"id":365493336699707395,"user_id":1542349333,"id_str":"365493336699707395","user_id_str":"1542349333"}}} +{"delete":{"status":{"id":157494588507045888,"user_id":402465302,"id_str":"157494588507045888","user_id_str":"402465302"}}} +{"delete":{"status":{"id":189845511098994690,"user_id":274804222,"id_str":"189845511098994690","user_id_str":"274804222"}}} +{"delete":{"status":{"id":365608201917431808,"user_id":338597198,"id_str":"365608201917431808","user_id_str":"338597198"}}} +{"delete":{"status":{"id":301451423269416962,"user_id":1027610744,"id_str":"301451423269416962","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":365606561906835457,"user_id":169962619,"id_str":"365606561906835457","user_id_str":"169962619"}}} +{"delete":{"status":{"id":345667956405071872,"user_id":26394490,"id_str":"345667956405071872","user_id_str":"26394490"}}} +{"delete":{"status":{"id":298208144146956289,"user_id":555211712,"id_str":"298208144146956289","user_id_str":"555211712"}}} +{"delete":{"status":{"id":177410414739587072,"user_id":395393883,"id_str":"177410414739587072","user_id_str":"395393883"}}} +{"delete":{"status":{"id":298180965065433088,"user_id":555211712,"id_str":"298180965065433088","user_id_str":"555211712"}}} +{"delete":{"status":{"id":365612878541234176,"user_id":39330283,"id_str":"365612878541234176","user_id_str":"39330283"}}} +{"delete":{"status":{"id":364204355747971072,"user_id":206718088,"id_str":"364204355747971072","user_id_str":"206718088"}}} +{"delete":{"status":{"id":233749992207175680,"user_id":584161811,"id_str":"233749992207175680","user_id_str":"584161811"}}} +{"delete":{"status":{"id":365608583582334976,"user_id":634726367,"id_str":"365608583582334976","user_id_str":"634726367"}}} +{"delete":{"status":{"id":365612538806800384,"user_id":1564813370,"id_str":"365612538806800384","user_id_str":"1564813370"}}} +{"delete":{"status":{"id":74431149937659904,"user_id":261218323,"id_str":"74431149937659904","user_id_str":"261218323"}}} 
+{"delete":{"status":{"id":365612865979293698,"user_id":53005087,"id_str":"365612865979293698","user_id_str":"53005087"}}} +{"delete":{"status":{"id":303649385374416896,"user_id":346960220,"id_str":"303649385374416896","user_id_str":"346960220"}}} +{"delete":{"status":{"id":138626289669124096,"user_id":402465302,"id_str":"138626289669124096","user_id_str":"402465302"}}} +{"delete":{"status":{"id":204059738906963968,"user_id":393230991,"id_str":"204059738906963968","user_id_str":"393230991"}}} +{"delete":{"status":{"id":173956493475979264,"user_id":341296761,"id_str":"173956493475979264","user_id_str":"341296761"}}} +{"delete":{"status":{"id":306250919199272960,"user_id":1131831090,"id_str":"306250919199272960","user_id_str":"1131831090"}}} +{"delete":{"status":{"id":363358431748423680,"user_id":63078428,"id_str":"363358431748423680","user_id_str":"63078428"}}} +{"delete":{"status":{"id":306950562644119553,"user_id":221576287,"id_str":"306950562644119553","user_id_str":"221576287"}}} +{"delete":{"status":{"id":365612706587357184,"user_id":928439221,"id_str":"365612706587357184","user_id_str":"928439221"}}} +{"delete":{"status":{"id":277600110903431168,"user_id":941366263,"id_str":"277600110903431168","user_id_str":"941366263"}}} +{"delete":{"status":{"id":297249124934352896,"user_id":555211712,"id_str":"297249124934352896","user_id_str":"555211712"}}} +{"delete":{"status":{"id":288491451724214273,"user_id":26394490,"id_str":"288491451724214273","user_id_str":"26394490"}}} +{"delete":{"status":{"id":365612933083959296,"user_id":1654400730,"id_str":"365612933083959296","user_id_str":"1654400730"}}} +{"delete":{"status":{"id":365611955798552578,"user_id":436873994,"id_str":"365611955798552578","user_id_str":"436873994"}}} +{"delete":{"status":{"id":365260959654223876,"user_id":1243710523,"id_str":"365260959654223876","user_id_str":"1243710523"}}} +{"delete":{"status":{"id":208095544369881089,"user_id":245030912,"id_str":"208095544369881089","user_id_str":"245030912"}}} 
+{"delete":{"status":{"id":365612874342727680,"user_id":606634592,"id_str":"365612874342727680","user_id_str":"606634592"}}} +{"delete":{"status":{"id":365104965129216000,"user_id":1212003986,"id_str":"365104965129216000","user_id_str":"1212003986"}}} +{"delete":{"status":{"id":312244768354729984,"user_id":470658712,"id_str":"312244768354729984","user_id_str":"470658712"}}} +{"delete":{"status":{"id":314252682368012288,"user_id":294466600,"id_str":"314252682368012288","user_id_str":"294466600"}}} +{"delete":{"status":{"id":302179353129779200,"user_id":346960220,"id_str":"302179353129779200","user_id_str":"346960220"}}} +{"delete":{"status":{"id":203685195948363776,"user_id":393230991,"id_str":"203685195948363776","user_id_str":"393230991"}}} +{"delete":{"status":{"id":365322074874605568,"user_id":102906056,"id_str":"365322074874605568","user_id_str":"102906056"}}} +{"delete":{"status":{"id":296716771262623746,"user_id":555211712,"id_str":"296716771262623746","user_id_str":"555211712"}}} +{"delete":{"status":{"id":365499477152366592,"user_id":402752207,"id_str":"365499477152366592","user_id_str":"402752207"}}} +{"delete":{"status":{"id":299702075556757505,"user_id":1027610744,"id_str":"299702075556757505","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":302324597674614784,"user_id":256832640,"id_str":"302324597674614784","user_id_str":"256832640"}}} +{"delete":{"status":{"id":365612974997639168,"user_id":286261924,"id_str":"365612974997639168","user_id_str":"286261924"}}} +{"delete":{"status":{"id":269708876092411904,"user_id":606114977,"id_str":"269708876092411904","user_id_str":"606114977"}}} +{"delete":{"status":{"id":314154254636023810,"user_id":294466600,"id_str":"314154254636023810","user_id_str":"294466600"}}} +{"delete":{"status":{"id":270113148265185281,"user_id":606114977,"id_str":"270113148265185281","user_id_str":"606114977"}}} 
+{"delete":{"status":{"id":361875267146358786,"user_id":434843551,"id_str":"361875267146358786","user_id_str":"434843551"}}} +{"delete":{"status":{"id":206581014745718784,"user_id":245030912,"id_str":"206581014745718784","user_id_str":"245030912"}}} +{"delete":{"status":{"id":216265532675067904,"user_id":26394490,"id_str":"216265532675067904","user_id_str":"26394490"}}} +{"delete":{"status":{"id":299354023813857280,"user_id":1027610744,"id_str":"299354023813857280","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":365525246956154880,"user_id":725675066,"id_str":"365525246956154880","user_id_str":"725675066"}}} +{"delete":{"status":{"id":203606674387378177,"user_id":393230991,"id_str":"203606674387378177","user_id_str":"393230991"}}} +{"delete":{"status":{"id":275146321457979392,"user_id":941366263,"id_str":"275146321457979392","user_id_str":"941366263"}}} +{"delete":{"status":{"id":365612979196149760,"user_id":270213507,"id_str":"365612979196149760","user_id_str":"270213507"}}} +{"delete":{"status":{"id":297928170169040896,"user_id":1027610744,"id_str":"297928170169040896","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":226772461302185984,"user_id":614667649,"id_str":"226772461302185984","user_id_str":"614667649"}}} +{"delete":{"status":{"id":361764747223384064,"user_id":237247083,"id_str":"361764747223384064","user_id_str":"237247083"}}} +{"delete":{"status":{"id":233698997829775361,"user_id":584161811,"id_str":"233698997829775361","user_id_str":"584161811"}}} +{"delete":{"status":{"id":365612949856976897,"user_id":286888170,"id_str":"365612949856976897","user_id_str":"286888170"}}} +{"delete":{"status":{"id":205681726607982593,"user_id":245030912,"id_str":"205681726607982593","user_id_str":"245030912"}}} +{"delete":{"status":{"id":205681214898700290,"user_id":245030912,"id_str":"205681214898700290","user_id_str":"245030912"}}} 
+{"delete":{"status":{"id":365518246658580480,"user_id":772118754,"id_str":"365518246658580480","user_id_str":"772118754"}}} +{"delete":{"status":{"id":37158119284932608,"user_id":22129513,"id_str":"37158119284932608","user_id_str":"22129513"}}} +{"delete":{"status":{"id":296420556960583681,"user_id":1027610744,"id_str":"296420556960583681","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":274056641262002176,"user_id":941366263,"id_str":"274056641262002176","user_id_str":"941366263"}}} +{"delete":{"status":{"id":347000927187791872,"user_id":1507093165,"id_str":"347000927187791872","user_id_str":"1507093165"}}} +{"delete":{"status":{"id":344948813611532289,"user_id":237946894,"id_str":"344948813611532289","user_id_str":"237946894"}}} +{"delete":{"status":{"id":365612962431508480,"user_id":477337365,"id_str":"365612962431508480","user_id_str":"477337365"}}} +{"delete":{"status":{"id":364789167575674880,"user_id":1151685355,"id_str":"364789167575674880","user_id_str":"1151685355"}}} +{"delete":{"status":{"id":305424679059988480,"user_id":309709804,"id_str":"305424679059988480","user_id_str":"309709804"}}} +{"delete":{"status":{"id":203593395208331264,"user_id":393230991,"id_str":"203593395208331264","user_id_str":"393230991"}}} +{"delete":{"status":{"id":295773195489910784,"user_id":1027610744,"id_str":"295773195489910784","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":247122377056600064,"user_id":316010929,"id_str":"247122377056600064","user_id_str":"316010929"}}} +{"delete":{"status":{"id":365609565062049792,"user_id":365660522,"id_str":"365609565062049792","user_id_str":"365660522"}}} +{"delete":{"status":{"id":313921894359379968,"user_id":294466600,"id_str":"313921894359379968","user_id_str":"294466600"}}} +{"delete":{"status":{"id":301954202861727745,"user_id":346960220,"id_str":"301954202861727745","user_id_str":"346960220"}}} 
+{"delete":{"status":{"id":295353949643497472,"user_id":1027610744,"id_str":"295353949643497472","user_id_str":"1027610744"}}} +{"delete":{"status":{"id":203317825258000384,"user_id":393230991,"id_str":"203317825258000384","user_id_str":"393230991"}}} diff --git a/src/kudu/twitter-demo/example-tweets.txt b/src/kudu/twitter-demo/example-tweets.txt new file mode 100644 index 000000000000..fc577ac215bd --- /dev/null +++ b/src/kudu/twitter-demo/example-tweets.txt @@ -0,0 +1,505 @@ +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217605238784,"id_str":"365611217605238784","text":"@JaredLeto u look like fish!","source":"web","truncated":false,"in_reply_to_status_id":365590910525911040,"in_reply_to_status_id_str":"365590910525911040","in_reply_to_user_id":27711339,"in_reply_to_user_id_str":"27711339","in_reply_to_screen_name":"JaredLeto","user":{"id":328473738,"id_str":"328473738","name":"Ula M.","screen_name":"tomula483","location":"Szczecin","url":null,"description":null,"protected":false,"followers_count":12,"friends_count":55,"listed_count":0,"created_at":"Sun Jul 03 12:53:42 +0000 
2011","favourites_count":64,"utc_offset":7200,"time_zone":"Warsaw","geo_enabled":true,"verified":false,"statuses_count":261,"lang":"pl","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2907863190\/71913cdd9e92ac2c7db48d76e5b036bb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2907863190\/71913cdd9e92ac2c7db48d76e5b036bb_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"JaredLeto","name":"JARED LETO","id":27711339,"id_str":"27711339","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217601040384,"id_str":"365611217601040384","text":"@preetskapreet i already knew that...","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":364568088831721473,"in_reply_to_status_id_str":"364568088831721473","in_reply_to_user_id":526734170,"in_reply_to_user_id_str":"526734170","in_reply_to_screen_name":"preetskapreet","user":{"id":103636562,"id_str":"103636562","name":"\u2600 h a r v i t \u0a74","screen_name":"itsharvit","location":"COLE WORLD","url":"http:\/\/hiddensunsets.tumblr.com\/","description":"happiness. dreamer. 
i love god \u0a74 and i live for the music & moments. oh and i love j cole, kid cudi & the maine. instagram;harvitgill #allabout18 #celtics","protected":false,"followers_count":626,"friends_count":304,"listed_count":67,"created_at":"Sun Jan 10 18:47:18 +0000 2010","favourites_count":1534,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":14750,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DE4063","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/767224620\/c8abbd3e1f2f5489a8e628ce0788b8fd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/767224620\/c8abbd3e1f2f5489a8e628ce0788b8fd.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000163005610\/d1db2a626b77a937ca99c73033567352_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000163005610\/d1db2a626b77a937ca99c73033567352_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/103636562\/1357330434","profile_link_color":"55BA84","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"00FF51","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"preetskapreet","name":"SWIZZY PREET","id":526734170,"id_str":"526734170","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217592664064,"id_str":"365611217592664064","text":"@selenabtch <333 ofuscamos as inimigas 
ioddkf","source":"web","truncated":false,"in_reply_to_status_id":365609863348371458,"in_reply_to_status_id_str":"365609863348371458","in_reply_to_user_id":1628683730,"in_reply_to_user_id_str":"1628683730","in_reply_to_screen_name":"selenabtch","user":{"id":619520802,"id_str":"619520802","name":"1 M\u00caS DE SIEGE \u2654","screen_name":"ops_zarry","location":"London ","url":"http:\/\/pudim.com.br","description":"monie \u2661 vic \u2661 lah \u2661 jujubs \u2661 paula matos \u2661 selena \u2661 fl\u00e1 \u2661 lolla \u2661 grazi \u2661 nath \u2661 rafa \u2661","protected":false,"followers_count":1124,"friends_count":1023,"listed_count":0,"created_at":"Tue Jun 26 21:33:26 +0000 2012","favourites_count":158,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":6977,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047181213\/fbf82571a09ad8c426cac8c97e5a1664.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047181213\/fbf82571a09ad8c426cac8c97e5a1664.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259667631\/fc6a4d51ca02ef63404c0913c7884bfa_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259667631\/fc6a4d51ca02ef63404c0913c7884bfa_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/619520802\/1375962482","profile_link_color":"111EAD","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_ment
ions":[{"screen_name":"selenabtch","name":"laus","id":1628683730,"id_str":"1628683730","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217588465664,"id_str":"365611217588465664","text":"@pleasefilipe hahahhaha","source":"web","truncated":false,"in_reply_to_status_id":365610934288384001,"in_reply_to_status_id_str":"365610934288384001","in_reply_to_user_id":1339764966,"in_reply_to_user_id_str":"1339764966","in_reply_to_screen_name":"pleasefilipe","user":{"id":158503687,"id_str":"158503687","name":"Vict\u00f3ria","screen_name":"locaporsophied","location":"","url":"https:\/\/twitter.com\/sophiaabrahao\/status\/347748097889284096","description":"Tepenico,nico,nico...","protected":false,"followers_count":1731,"friends_count":1422,"listed_count":1,"created_at":"Tue Jun 22 21:58:34 +0000 2010","favourites_count":1056,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":25415,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041688451\/475a3c2abd6d7d346075ada536cda7e5.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041688451\/475a3c2abd6d7d346075ada536cda7e5.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000246663393\/0514008d1393b1a9a6cd60093ebd4959_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000246663393\/0514008d1393b1a9a6cd60093ebd4959_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/158503687\/1375373203","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFCCEB","profile_text_color":"FF00D5","profile_use_background_image":true,"default_profile":false,"default_p
rofile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"pleasefilipe","name":"Julie e Vick","id":1339764966,"id_str":"1339764966","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217605246976,"id_str":"365611217605246976","text":"@JohnnyBGoode77 @lnsomni0 s\u00ed, me encant\u00f3","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610752880553987,"in_reply_to_status_id_str":"365610752880553987","in_reply_to_user_id":250319865,"in_reply_to_user_id_str":"250319865","in_reply_to_screen_name":"JohnnyBGoode77","user":{"id":296471716,"id_str":"296471716","name":"Pati","screen_name":"TuiteraMx","location":"","url":null,"description":"Esta soy yo, la lavandera de mi ropa ajena. 
Mastuerzo dixit","protected":false,"followers_count":5803,"friends_count":4975,"listed_count":43,"created_at":"Tue May 10 20:28:37 +0000 2011","favourites_count":20022,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":86844,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/887155720\/deebcae8569aa99810b98c77ab395cf3.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/887155720\/deebcae8569aa99810b98c77ab395cf3.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3762983582\/500cde9354ab64d2861997d601206884_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3762983582\/500cde9354ab64d2861997d601206884_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/296471716\/1370524598","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"JohnnyBGoode77","name":"Johnny Rotten","id":250319865,"id_str":"250319865","indices":[0,15]},{"screen_name":"lnsomni0","name":"Nocturna...","id":464084771,"id_str":"464084771","indices":[16,25]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217617829888,"id_str":"365611217617829888","text":"RT @pataxula24: De botellon con los amigos de @AsocJuvAlhucema http:\/\/t.co\/wM8NkOU2XS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" 
rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":621188704,"id_str":"621188704","name":"Antonio B.C.","screen_name":"aburca1289","location":"jaen","url":null,"description":null,"protected":false,"followers_count":241,"friends_count":346,"listed_count":1,"created_at":"Thu Jun 28 18:02:49 +0000 2012","favourites_count":5,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1485,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme19\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme19\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000164404708\/cba974d2aa585360efdb28f731070a50_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000164404708\/cba974d2aa585360efdb28f731070a50_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/621188704\/1372774425","profile_link_color":"0099CC","profile_sidebar_border_color":"FFF8AD","profile_sidebar_fill_color":"F6FFD1","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:09:15 +0000 2013","id":365595587111948288,"id_str":"365595587111948288","text":"De botellon con los amigos de @AsocJuvAlhucema http:\/\/t.co\/wM8NkOU2XS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1528891176,"id_str":"1528891176","name":"Jose Manuel Moreno ","screen_name":"pataxula24","location":"","url":null,"description":null,"protected":false,"followers_count":82,"friends_count":186,"listed_count":0,"created_at":"Tue Jun 18 21:38:54 +0000 2013","favourites_count":3,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":35,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000014584943\/642f71a8c4e644d228e244938eecaf67_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000014584943\/642f71a8c4e644d228e244938eecaf67_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":5,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"AsocJuvAlhucema","name":"Asoc. Juv. 
Alhucema","id":537670153,"id_str":"537670153","indices":[30,46]}],"media":[{"id":365595587116142593,"id_str":"365595587116142593","indices":[47,69],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLbP92CEAE05_q.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLbP92CEAE05_q.jpg","url":"http:\/\/t.co\/wM8NkOU2XS","display_url":"pic.twitter.com\/wM8NkOU2XS","expanded_url":"http:\/\/twitter.com\/pataxula24\/status\/365595587111948288\/photo\/1","type":"photo","sizes":{"large":{"w":1024,"h":576,"resize":"fit"},"small":{"w":340,"h":191,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":338,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"pataxula24","name":"Jose Manuel Moreno ","id":1528891176,"id_str":"1528891176","indices":[3,14]},{"screen_name":"AsocJuvAlhucema","name":"Asoc. Juv. Alhucema","id":537670153,"id_str":"537670153","indices":[46,62]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217609428993,"id_str":"365611217609428993","text":"Finally I saw her again, she had disapeared","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":911075358,"id_str":"911075358","name":"J&G","screen_name":"_joanna_xoxo","location":"North Carolina","url":null,"description":"Prince Royce \u2764 \/ In love \u221e\/ IG:_joannaa_xo r\u03bfyc\u03b5\u03b7\u03b1\u03c4\u03b9c\u03b1 \/ ~PlanetRoyce~","protected":false,"followers_count":767,"friends_count":705,"listed_count":2,"created_at":"Sun Oct 28 20:47:17 +0000 
2012","favourites_count":109,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":6730,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000240499579\/a5d659535b107fa23ad0f38100fbabcf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000240499579\/a5d659535b107fa23ad0f38100fbabcf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/911075358\/1375664446","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217584259072,"id_str":"365611217584259072","text":"@Laurenszuhaj @MalwinaJozwiak ur rude","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611022876291072,"in_reply_to_status_id_str":"365611022876291072","in_reply_to_user_id":1038074576,"in_reply_to_user_id_str":"1038074576","in_reply_to_screen_name":"Laurenszuhaj","user":{"id":213052827,"id_str":"213052827","name":"Someone, Somewhere","screen_name":"Heather_Skye_","location":"Scotland","url":null,"description":"Lord Of The Rings, that is 
all.","protected":false,"followers_count":1065,"friends_count":920,"listed_count":0,"created_at":"Sun Nov 07 20:50:00 +0000 2010","favourites_count":1529,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":8035,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000225351955\/4d10a3af1109ce9603a240ca39554a07_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000225351955\/4d10a3af1109ce9603a240ca39554a07_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/213052827\/1372875958","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Laurenszuhaj","name":"Satan ","id":1038074576,"id_str":"1038074576","indices":[0,13]},{"screen_name":"MalwinaJozwiak","name":"malwina","id":393390356,"id_str":"393390356","indices":[14,29]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"et"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217617821697,"id_str":"365611217617821697","text":"@TThassio vai cola na expo 
nego?","source":"web","truncated":false,"in_reply_to_status_id":365611087766360065,"in_reply_to_status_id_str":"365611087766360065","in_reply_to_user_id":335097974,"in_reply_to_user_id_str":"335097974","in_reply_to_screen_name":"TThassio","user":{"id":227374028,"id_str":"227374028","name":"mary jane ","screen_name":"mariferreiraevc","location":"Cruzeiro ","url":null,"description":"http:\/\/instagram.com\/mariferreiraevc","protected":false,"followers_count":344,"friends_count":149,"listed_count":0,"created_at":"Thu Dec 16 17:38:16 +0000 2010","favourites_count":239,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":16443,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"F50A31","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000037995191\/c4155ad156ccb2d8e0fce974d05cc52b.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000037995191\/c4155ad156ccb2d8e0fce974d05cc52b.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000184110001\/c63a8b97dbe8aa4fb1faacb7a38d8cef_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000184110001\/c63a8b97dbe8aa4fb1faacb7a38d8cef_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/227374028\/1374704344","profile_link_color":"990DF7","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TThassio","name":"Thassio 
Carvalho","id":335097974,"id_str":"335097974","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217584259073,"id_str":"365611217584259073","text":"@LawerChacal eso es tongo jajaja","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365609713238409217,"in_reply_to_status_id_str":"365609713238409217","in_reply_to_user_id":287447203,"in_reply_to_user_id_str":"287447203","in_reply_to_screen_name":"LawerChacal","user":{"id":479736037,"id_str":"479736037","name":"Andrea Iba\u00f1ez ","screen_name":"andreiita_007","location":"Madrid","url":null,"description":"Como bien ves el tiempo nos va poniendo aprueba vuelo con el balance nuevo de mis bambas nuevas..","protected":false,"followers_count":121,"friends_count":127,"listed_count":1,"created_at":"Tue Jan 31 18:18:25 +0000 2012","favourites_count":128,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":2125,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000155420889\/e562b048fd5d444e50a0ab5f2bb68de8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000155420889\/e562b048fd5d444e50a0ab5f2bb68de8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/479736037\/1369922681","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false
,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"LawerChacal","name":"Lawer Chacal Clik","id":287447203,"id_str":"287447203","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217609437185,"id_str":"365611217609437185","text":"RT @MOHAMAD019: \u0645\u0646 \u0645\u062d\u0627\u0633\u0646 \u062a\u0648\u064a\u062a\u0631 :\n\n\u0644\u0627 \u064a\u0643\u0634\u0641 \u0639\u062f\u062f \u0632\u064a\u0627\u0631\u062a\u0646\u0627 \u0644\u0640 \u062d\u0633\u0627\u0628 \u0645\u0646 \u0646\u064f\u062d\u0650\u0628\u0652 !!","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":100830573,"id_str":"100830573","name":"\u0627\u062c\u0645\u0644 \u0627\u0644\u062a\u063a\u0631\u064a\u062f\u0627\u062a","screen_name":"Don_mohamad","location":"","url":null,"description":"\u062d\u0633\u0627\u0628 \u0644\u0639\u0645\u0644 \u0631\u064a\u062a\u0648\u064a\u062a \u0644\u0644\u062a\u063a\u0631\u064a\u062f\u0627\u062a \u0627\u0644\u062c\u0645\u064a\u0644\u0647 \u0648\u0627\u0644\u0645\u0641\u064a\u062f\u0647 ..","protected":false,"followers_count":4586,"friends_count":3078,"listed_count":6,"created_at":"Thu Dec 31 19:33:11 +0000 
2009","favourites_count":18,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":1122,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3354092145\/381917afca5618db40ae9775026c5aa5_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3354092145\/381917afca5618db40ae9775026c5aa5_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/100830573\/1362764536","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:01:36 +0000 2013","id":365608761974456322,"id_str":"365608761974456322","text":"\u0645\u0646 \u0645\u062d\u0627\u0633\u0646 \u062a\u0648\u064a\u062a\u0631 :\n\n\u0644\u0627 \u064a\u0643\u0634\u0641 \u0639\u062f\u062f \u0632\u064a\u0627\u0631\u062a\u0646\u0627 \u0644\u0640 \u062d\u0633\u0627\u0628 \u0645\u0646 \u0646\u064f\u062d\u0650\u0628\u0652 !!","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":258417961,"id_str":"258417961","name":"\u0645\u062d\u0645\u062f 
\u0627\u0644\u062d\u062c\u0631\u0641\u24c2","screen_name":"MOHAMAD019","location":"Kuwait.","url":null,"description":"\u0623\u0645\u0646\u064a\u062a\u064a \u0627\u0644\u0648\u062d\u064a\u062f\u0647 \u0627\u0646 \u0627\u0644\u0632\u0645\u0646 \u064a\u0631\u062c\u0639 \u0644\u0644\u0648\u0631\u0627\u0621 \u0644\u062a\u0635\u062d\u064a\u062d \u0645\u0627\u0636\u064a \u062d\u064a\u0627\u062a\u064a \u0641\u0642\u0637.","protected":false,"followers_count":7252,"friends_count":119,"listed_count":6,"created_at":"Sun Feb 27 17:47:43 +0000 2011","favourites_count":6,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":6411,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000094014461\/ce3a53da33bc85767bbda77cff9d7274_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000094014461\/ce3a53da33bc85767bbda77cff9d7274_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/258417961\/1357347455","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":3,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"ar"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MOHAMAD019","name":"\u0645\u062d\u0645\u062f 
\u0627\u0644\u062d\u062c\u0631\u0641\u24c2","id":258417961,"id_str":"258417961","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217592651776,"id_str":"365611217592651776","text":"What are you looking at bucko? http:\/\/t.co\/lz8bvmt7He","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":502235296,"id_str":"502235296","name":"Ryan","screen_name":"r_eady22","location":"","url":null,"description":"Best player in the valley.","protected":false,"followers_count":259,"friends_count":133,"listed_count":0,"created_at":"Fri Feb 24 23:17:31 +0000 2012","favourites_count":46,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":522,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/344513261574670499\/a81f2fa625fe26b4800df8b05052557e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/344513261574670499\/a81f2fa625fe26b4800df8b05052557e_normal.jpeg","profile_link_color":"93A644","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entit
ies":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365611217596846081,"id_str":"365611217596846081","indices":[31,53],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpdx7CEAEk7pn.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpdx7CEAEk7pn.jpg","url":"http:\/\/t.co\/lz8bvmt7He","display_url":"pic.twitter.com\/lz8bvmt7He","expanded_url":"http:\/\/twitter.com\/r_eady22\/status\/365611217592651776\/photo\/1","type":"photo","sizes":{"medium":{"w":600,"h":451,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":340,"h":255,"resize":"fit"},"large":{"w":1024,"h":769,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:21 +0000 2013","id":365611217596866560,"id_str":"365611217596866560","text":"Audit\u00f3rio do Terceiro Encontro de Casais - Ig. Brasil Para Cristo - Ipatinga -MG. Ministra\u00e7\u00e3o @silmarcoelho http:\/\/t.co\/iIdxA70yVH","source":"\u003ca href=\"http:\/\/www.apple.com\" rel=\"nofollow\"\u003ePhotos on iOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":79531723,"id_str":"79531723","name":"Pastor Moreti","screen_name":"pr_moreti","location":"Rio de Janeiro - Brasil","url":"http:\/\/www.pastormoreti.com.br","description":"Um pregador formado nas adversidades, que aprendeu a depender de Deus e ser tolerante. 
Que procura ouvir e incentivar as pessoas a encontrarem a felicidade.","protected":false,"followers_count":287,"friends_count":87,"listed_count":1,"created_at":"Sat Oct 03 18:21:12 +0000 2009","favourites_count":2,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":5339,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"EBEBEB","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/77104388\/Por_do_sol_twetter.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/77104388\/Por_do_sol_twetter.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2491736810\/n2jypj34wzwmwjtvzmjj_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2491736810\/n2jypj34wzwmwjtvzmjj_normal.png","profile_link_color":"990000","profile_sidebar_border_color":"DFDFDF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"silmarcoelho","name":"Silmar 
Coelho","id":69507933,"id_str":"69507933","indices":[94,107]}],"media":[{"id":365611217605255168,"id_str":"365611217605255168","indices":[108,130],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpdx9CYAABwRc.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpdx9CYAABwRc.jpg","url":"http:\/\/t.co\/iIdxA70yVH","display_url":"pic.twitter.com\/iIdxA70yVH","expanded_url":"http:\/\/twitter.com\/pr_moreti\/status\/365611217596866560\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":960,"h":720,"resize":"fit"},"medium":{"w":600,"h":450,"resize":"fit"},"small":{"w":340,"h":255,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221778964480,"id_str":"365611221778964480","text":"Chick rides big cock with anal #anal #porn #freeporn http:\/\/t.co\/VPQcScbM08","source":"\u003ca href=\"http:\/\/hunthot.com\" rel=\"nofollow\"\u003eHunthot.com\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1423843104,"id_str":"1423843104","name":"hunthot.com","screen_name":"hunthotCom","location":"","url":"http:\/\/hunthot.com","description":"JUST hunt&hot!!!","protected":false,"followers_count":30,"friends_count":157,"listed_count":0,"created_at":"Sun May 12 19:24:43 +0000 
2013","favourites_count":1,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":5819,"lang":"pl","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000006967014\/865cc6ee04901d404265ab7bb7f589ff.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000006967014\/865cc6ee04901d404265ab7bb7f589ff.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000012731225\/5e070bb34665fed33731f32dd159b8aa_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000012731225\/5e070bb34665fed33731f32dd159b8aa_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1423843104\/1371589496","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"anal","indices":[31,36]},{"text":"porn","indices":[38,43]},{"text":"freeporn","indices":[44,53]}],"urls":[{"url":"http:\/\/t.co\/VPQcScbM08","expanded_url":"http:\/\/tinyurl.com\/kaxqgdj","display_url":"tinyurl.com\/kaxqgdj","indices":[54,76]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221786959873,"id_str":"365611221786959873","text":"You can't let black folks hold nun","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":248815262,"id_str":"248815262","name":"_TAKEOFF\u270c\u2122","screen_name":"A1_NIGGA_4LIFE","location":"#Inyobitch","url":null,"description":null,"protected":false,"followers_count":592,"friends_count":822,"listed_count":0,"created_at":"Mon Feb 07 19:49:34 +0000 2011","favourites_count":76,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"verified":false,"statuses_count":13463,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/317517400\/imagesCAGKAI60.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/317517400\/imagesCAGKAI60.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000210506845\/978c83b500ae19c080d8949493ff5be7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000210506845\/978c83b500ae19c080d8949493ff5be7_normal.jpeg","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221778567171,"id_str":"365611221778567171","text":"reryssy atiradora de elite, estou no 
ch\u00e3o","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":94933646,"id_str":"94933646","name":"musa do proletariado","screen_name":"luaishot","location":"","url":"http:\/\/instagram.com\/luaishot","description":"yv\u00e2nava lua, psicod\u00e9lica, 666, recife, gostosa.","protected":false,"followers_count":392,"friends_count":195,"listed_count":28,"created_at":"Sun Dec 06 04:28:14 +0000 2009","favourites_count":332,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":28669,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0D0D0D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000012941896\/c81658b1431fe728f56023e4ac182136.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000012941896\/c81658b1431fe728f56023e4ac182136.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000166519943\/e8cb602a3949a5e909a43e6038676636_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000166519943\/e8cb602a3949a5e909a43e6038676636_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/94933646\/1361819020","profile_link_color":"FF8419","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"14CCBD","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221786963968,"id_str":"365611221786963968","text":"\u0645\u062c\u0646\u0648\u0646 \u0634\u0627\u0641\u0643 \u0648\u0627\u0635\u0628\u062d \u0627\u0644\u064a\u0648\u0645 \u0639\u0627\u0642\u0644 \u0648\u0639\u0627\u0642\u0644 \u062a\u0631\u0649 \u0641\u064a \u063a\u064a\u0628\u062a\u0643 \u0635\u0627\u0631 \u0645\u062c\u0646\u0648\u0646","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1635968832,"id_str":"1635968832","name":"7hmooody1403","screen_name":"ALbdnawi","location":"","url":null,"description":null,"protected":false,"followers_count":134,"friends_count":243,"listed_count":0,"created_at":"Wed Jul 31 17:34:58 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":57,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000249211322\/94c78177455ed0da0e827beaf6d98377_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000249211322\/94c78177455ed0da0e827beaf6d98377_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1635968832\/1375775515","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":
null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221786951680,"id_str":"365611221786951680","text":"Bir cevabin varmi?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1052395614,"id_str":"1052395614","name":"EL\u0130ZAAAAA","screen_name":"bayankediiii","location":"MERS\u0130N","url":"https:\/\/www.facebook.com\/ezgi.sozer.35","description":"D\u00fcn tarih oldu yar\u0131n ise bilmece bug\u00fcn sana hediye.Numaram\u0131 de\u011fi\u015ftirmedim hala 1 numaray\u0131m. MessegaMe Pin; GC 219 XVK","protected":false,"followers_count":258,"friends_count":138,"listed_count":0,"created_at":"Tue Jan 01 11:33:06 +0000 
2013","favourites_count":1379,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":true,"verified":false,"statuses_count":10509,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044904409\/ffe5c52551e6041e82d4ce38e4fe501e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044904409\/ffe5c52551e6041e82d4ce38e4fe501e.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244683336\/9de76a27030012d941391af33643c4f5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244683336\/9de76a27030012d941391af33643c4f5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1052395614\/1375702160","profile_link_color":"0099B9","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221786959874,"id_str":"365611221786959874","text":"\u0639\u0637\u0646\u064a \u0639\u0644\u064a \u0628\u0639\u0636 \u0627\u0644\u062a\u063a\u0627\u0631\u064a\u062f \u0631\u062a\u0648\u064a\u062a\n \u0639\u0634\u0627\u0646 \u0627\u062d\u0633\u0646 \u0627\u0646\u0643 \u0645\u0639\u064a \u0627\u0645\u062a\u0648\u0644\u0639\n\n\u0644\u0627 \u062a\u0628\u062e\u0644 \u0627\u0644\u0631\u062a\u0648\u064a\u062a \u0627\u0646 \u0643\u0627\u0646 \u0645\u0627\u062c\u064a\u062a\n \u0643\u0644 \u0627\u0644\u0639\u0631\u0628 
\u062a\u0642\u0631\u0628 \u0628\u0639\u0636 \u0648\u0627\u062a\u062f\u0644\u0639 #\u0631\u062a\u0648\u064a\u062a","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":301315504,"id_str":"301315504","name":"\u0627\u0644\u0640\u0632\u0651\u064a\u0640A\u0142_\u017e\u00e9\u00e8\u00f1\u0640\u0640\u0646\u0652","screen_name":"al_zeen","location":"","url":null,"description":"\u0645\u064f\u0647\u0631\u0629 \u0648 \u062a\u0644\u0639\u0628 \u0641\u064a \u062d\u0634\u0649 \u0643\u0644 \u062e\u064a\u0627\u0644 .. \u0639\u0634\u0627\u0642\u0647\u0627 \u0648\u0627\u062c\u062f \u060c \u0648 \u0644\u0627 \u0623\u062d\u062f\u0646 \u0641\u062a\u0646\u0647\u0627! ((\u0627\u0644\u062e\u0627\u0635 \u0645\u063a\u0644\u0642 \u0644\u0623\u0634\u0639\u0627\u0631\u064d \u0622\u062e\u0631 ))","protected":false,"followers_count":3890,"friends_count":2299,"listed_count":2,"created_at":"Thu May 19 08:06:27 +0000 
2011","favourites_count":209,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":22655,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000231849915\/ccfdd1bf555b9743edbc787c05f36ec7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000231849915\/ccfdd1bf555b9743edbc787c05f36ec7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/301315504\/1375477404","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"\u0631\u062a\u0648\u064a\u062a","indices":[125,131]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221803741184,"id_str":"365611221803741184","text":"Or that random ass swing set in the middle of fucking nowhere","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":310805318,"id_str":"310805318","name":"\u00a9hloe","screen_name":"ofmiceandchl0","location":"Newcastle","url":null,"description":"I like a lot of bands, tattoos, coffee and 
Harry Potter\u270c","protected":false,"followers_count":287,"friends_count":546,"listed_count":16,"created_at":"Sat Jun 04 11:24:58 +0000 2011","favourites_count":22,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":23485,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046755551\/1a05f787be98030e9316e3a431b0e474.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046755551\/1a05f787be98030e9316e3a431b0e474.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260950646\/be325bf148e5da31d6d610f2f457d17f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260950646\/be325bf148e5da31d6d610f2f457d17f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/310805318\/1375622640","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221782761472,"id_str":"365611221782761472","text":"\u3010\u5b9a\u671f\u3011\u8150\u5973\u5b50\u3067\u3059\uff01\u30de\u30ae\/\u9ed2\u30d0\u30b9\/\u30d8\u30bf\u30ea\u30a2\/\u5fa9\u6d3b\/\u305d\u306e\u4ed6\u8af8\u3005\u3002\u6c17\u8efd\u306b\u7d61\u3093\u3067\u304f\u308c\u308b\u3068\u559c\u3076\u3088\u3001\u5fc3\u306e\u4e2d\u3067\u3002\u30b3\u30df\u30e5\u969c\u306fTwitter\u3067\u3082\u6cbb\u3089\u306a\u3044(\u00b4\uff1b\u03c9\uff1b`)\u3067\u3082\u9003\u3052\u3061\u3083\u30c0\u30e1\u3060\u9003\u3052\u3061\u3083\u30c0\u30e1\u3060\u3002\u30d5\u30a9\u30ed\u30fc\u30df\u30fc\u306a\u306e\u3060\u3088\u3002\u30aa\u30ca\u30b7\u30e3\u30b9\uff01","source":"\u003ca href=\"http:\/\/twittbot.net\/\" rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":870011654,"id_str":"870011654","name":"\u305f\u304b\u3089@\u3084\u308b\u6c17\u30b9\u30a4\u30c3\u30c1OFF","screen_name":"rose_mottoyare","location":"\u3042\u306a\u305f\u306e\u5fc3\u306e\u306a\u304b\u306b...","url":null,"description":"\u30cb\u30b8\u30f2\u30bf\/\u250c(\u250c ^o^)\u2510\uff1c\u7981\u65ad\u306e\u30a8\u30c7\u30f3\/\u9ad8\u7dd1\/\u30a2\u30ea\u30d0\u30d0\u541b\u3092\u8ab0\u304b\u62b1\u3044\u3066\/\u53f3\u30d0\u30d0\/\u6211\u304c\u7956\u56fd\u306e\u70ba\u306a\u3089\u6b7b\u306d\u308b\/27\u304f\u3093\u306f\u6c38\u9060\/\u30a8\u30f4\u30a1\/\u304f\u3045\u3045\u3045rrrr\u3053\u3063\u3061\u3043\u3043\u30a4\u30a8\u30a2\/\u3068\u304b\u3044\u3063\u3066\u57fa\u672c\u96d1\u98df\/\u30ea\u30d6\u30a1\u30a4\u5175\u9577\u306b\u8abf\u6559\u3055\u308c\u968a\/\u30b3\u30df\u30e5\u969c\/\uff0a\u4e16\u754c\u306f\u611b\u3067\u6ea2\u308c\u3066\u308b\uff0a","protected":false,"followers_count":23,"friends_count":61,"listed_count":1,"created_at":"Tue Oct 09 14:43:34 +0000 
2012","favourites_count":13,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":790,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/344513261573852781\/4b1c56493712b5a8a893be858ada06d3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/344513261573852781\/4b1c56493712b5a8a893be858ada06d3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/870011654\/1366091993","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221795344384,"id_str":"365611221795344384","text":"Dari tadi malam sakit perut :'","source":"\u003ca href=\"http:\/\/www.snaptwit.com\" rel=\"nofollow\"\u003eSnaptwit\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":990234434,"id_str":"990234434","name":"Restu yuriana","screen_name":"Restuyuriana","location":"D U R I, riau indonesia ","url":null,"description":"Mama papa (\u007b\u007d) (\u02c6\u25bd\u02c6) :) @smp3mandau IX.8 
(\u02c6\u25bd\u02c6)","protected":false,"followers_count":407,"friends_count":438,"listed_count":0,"created_at":"Wed Dec 05 05:10:58 +0000 2012","favourites_count":15,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4580,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259182393\/58c5db71d24791610b6cdc98366e043d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259182393\/58c5db71d24791610b6cdc98366e043d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/990234434\/1371303995","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221795340288,"id_str":"365611221795340288","text":"la tomaron por loca en la aldea por comerse las piedras. Cuando todo qued\u00f3 arrasado x el hurancan ella segu\u00eda all\u00ed, sonriendo. 
BN :)","source":"\u003ca href=\"http:\/\/sinproject.net\/tweecha\/\" rel=\"nofollow\"\u003etweecha sinProject\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":602985599,"id_str":"602985599","name":"coral rico sanchez","screen_name":"Coralrs","location":"","url":null,"description":null,"protected":false,"followers_count":66,"friends_count":90,"listed_count":0,"created_at":"Fri Jun 08 19:01:48 +0000 2012","favourites_count":6,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":389,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2600380263\/38Pcg56I_normal","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2600380263\/38Pcg56I_normal","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/602985599\/1359592759","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221791145985,"id_str":"365611221791145985","text":"Gave you all. I had And you tossed it in the trash. 
You tossed it in the trash, you did.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1259574804,"id_str":"1259574804","name":"sulljin\u2022","screen_name":"fxsjin","location":"","url":null,"description":"thanks for everything.","protected":false,"followers_count":2467,"friends_count":2269,"listed_count":1,"created_at":"Mon Mar 11 14:26:02 +0000 2013","favourites_count":435,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":false,"verified":false,"statuses_count":20785,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040542161\/052d10458da26d72c909134aebeef035.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040542161\/052d10458da26d72c909134aebeef035.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000246903183\/2c939e1cf5dd3fe895d614b80010655a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000246903183\/2c939e1cf5dd3fe895d614b80010655a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1259574804\/1375843497","profile_link_color":"A38B4B","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221782773761,"id_str":"365611221782773761","text":"RT @boronology: \u30b2\u30fc\u30e0\u6a5f\u3092\u5076\u7136\u8e0f\u3093\u3060\u304b\u306e\u3088\u3046\u306b\u88c5\u3063\u3066\u58ca\u305d\u3046\u3068\u3059\u308b\u89aa\u3092\u30b2\u30fc\u30e0\u30ad\u30e5\u30fc\u30d6\u3067\u6bb4\u308a\u7d9a\u3051\u308b\u3068\u6b7b\u306c\u3002\u3061\u306a\u307f\u306b\u305d\u308c\u3067\u3082\u30b2\u30fc\u30e0\u30ad\u30e5\u30fc\u30d6\u306f\u58ca\u308c\u306a\u3044","source":"\u003ca href=\"http:\/\/tapbots.com\/tweetbot\" rel=\"nofollow\"\u003eTweetbot for iOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":83123433,"id_str":"83123433","name":"\u79cb\u6708\u785d\u5b50 \uff20PSO2 Ship10","screen_name":"Lily_vitroiris","location":"\u5ddd\u8d8a","url":null,"description":"FtX\u3082\u3069\u304d\u3002\u30d0\u30a4\u30bb\u30af\u30b7\u30e5\u30a2\u30eb\u3002\u5973\u5b50\u529b\uff1f\u4f55\u305d\u308c\u7f8e\u5473\u3057\u3044\u306e\uff1f\nAS\u7684\u601d\u8003\u306a\u5de5\u5b66\u5f92\u3002\u30a2\u30ca\u30ed\u30b0\u56de\u8def\u5c02\u653b\u3002\u97f3\u30b2\u30fc\u3001\u81ea\u4f5cPC\u3001\u96fb\u5b50\u5de5\u4f5c\u3001LTspice\u3002\nPSO2 : Ship10\u30ca\u30a6\u30b7\u30ba Hu50\/Te45\/Fo30\/Br25","protected":false,"followers_count":2560,"friends_count":740,"listed_count":253,"created_at":"Sat Oct 17 12:40:33 +0000 
2009","favourites_count":45256,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":true,"verified":false,"statuses_count":212147,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/193009032\/tomoneko_1920x1200_ul.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/193009032\/tomoneko_1920x1200_ul.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000092721711\/ce5e736cf7da52f32314138475fe6350_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000092721711\/ce5e736cf7da52f32314138475fe6350_normal.png","profile_link_color":"B40B43","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:50 +0000 2013","id":365611087523090433,"id_str":"365611087523090433","text":"\u30b2\u30fc\u30e0\u6a5f\u3092\u5076\u7136\u8e0f\u3093\u3060\u304b\u306e\u3088\u3046\u306b\u88c5\u3063\u3066\u58ca\u305d\u3046\u3068\u3059\u308b\u89aa\u3092\u30b2\u30fc\u30e0\u30ad\u30e5\u30fc\u30d6\u3067\u6bb4\u308a\u7d9a\u3051\u308b\u3068\u6b7b\u306c\u3002\u3061\u306a\u307f\u306b\u305d\u308c\u3067\u3082\u30b2\u30fc\u30e0\u30ad\u30e5\u30fc\u30d6\u306f\u58ca\u308c\u306a\u3044","source":"\u003ca href=\"http:\/\/mikutter.hachune.net\/\" 
rel=\"nofollow\"\u003emikutter\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":344723014,"id_str":"344723014","name":"Amazon\u30c6\u30ed\u3067\u4e3b\u98df\u304c\u7d20\u9eba\u306b","screen_name":"boronology","location":"Skype\uff1aboronji65536","url":"http:\/\/boronology.blogspot.jp\/","description":"\u307c\u308d\u3093\u3058\u30fb\u30b6\u30fb\u30d7\u30ed\u30b0\u30e9\u30df\u30f3\u30b0\u3067\u304d\u306a\u3044\u30d5\u30ea\u30fc\u30bf\u30fc\u66f8\u5e97\u54e1\u30028\u6708\u672b\u3067\u66f8\u5e97\u54e1\u306f\u8f9e\u3081\u3066\u5225\u306e\u4ed5\u4e8b\u3055\u304c\u3059\u4e88\u5b9a\u3002\r\nAmazon Wish list \u2192 http:\/\/t.co\/FcWyD4HsbF\r\nGitHub\u2192 https:\/\/t.co\/3oxIQhKmDq\r\nTumblr\u2192http:\/\/t.co\/IKOMkeDI9a","protected":false,"followers_count":2055,"friends_count":983,"listed_count":216,"created_at":"Fri Jul 29 13:58:13 +0000 2011","favourites_count":20982,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":83736,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2701105097\/40323e6ba14ebd774c42ea10950d57d3_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2701105097\/40323e6ba14ebd774c42ea10950d57d3_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null
,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"ja"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"boronology","name":"Amazon\u30c6\u30ed\u3067\u4e3b\u98df\u304c\u7d20\u9eba\u306b","id":344723014,"id_str":"344723014","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221816324097,"id_str":"365611221816324097","text":"I hope the best for you & the decisions you make.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1559261942,"id_str":"1559261942","name":"Kiari Murphy","screen_name":"Kiari_20","location":"","url":null,"description":null,"protected":false,"followers_count":27,"friends_count":90,"listed_count":0,"created_at":"Mon Jul 01 00:36:58 +0000 
2013","favourites_count":56,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":360,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000212830817\/7b44df886c599a087896653bbb93fdcf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000212830817\/7b44df886c599a087896653bbb93fdcf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1559261942\/1375894162","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221816324096,"id_str":"365611221816324096","text":"@Shade_Sheist Ima be in L.A. 
next week, let's link up!?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":17513897,"in_reply_to_user_id_str":"17513897","in_reply_to_screen_name":"Shade_Sheist","user":{"id":970444670,"id_str":"970444670","name":"The Chosen Juan","screen_name":"JuanGHOMIE","location":"714 to the 316","url":null,"description":"Ambitions of wealthier better life, real thoughts from a #RealMind #iFly #Investor #Traveling #MusicCritic #ApperalCritic #DeepThinking'sWhatIDo #iRepCali. #714","protected":false,"followers_count":175,"friends_count":219,"listed_count":1,"created_at":"Sun Nov 25 17:35:30 +0000 2012","favourites_count":1582,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4602,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000069803763\/c7c887b2d8232b38d681041899e26f61_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000069803763\/c7c887b2d8232b38d681041899e26f61_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/970444670\/1357875753","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Sh
ade_Sheist","name":"Shade Sheist\u2122","id":17513897,"id_str":"17513897","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221795348480,"id_str":"365611221795348480","text":"\u65e5\u304c\u51fa\u3066\u304d\u305f","source":"\u003ca href=\"http:\/\/fairchildblog.img.jugem.jp\/20130626_678745.jpg\" rel=\"nofollow\"\u003eRUNXtter\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1337397708,"id_str":"1337397708","name":"\u3042\u3093\u305a","screen_name":"anzunyaa","location":"\u3048\u308d\u306b\u3083\u3093\u306e\u5ac1","url":null,"description":"\u3042\u3093\u305a\u306b\u3083\u3042\u3002KONAMI\u306e\u97f3\u30b2\u30fc\u306f\u3060\u3044\u305f\u3044\u3084\u308a\u307e\u3059\uff08\u96d1\u98df\u7cfb\uff09 \u898f\u5236\u57a2\u2192@anzumit W\u898f\u5236\u2192@anzumitu \u3046\u308b\u3055\u3044\u306e\u3067\u30d5\u30a9\u30ed\u30fc\u3059\u308b\u3068\u304d\u306f\u6ce8\u610f\u3057\u3066\u304f\u3060\u3055\u3044","protected":false,"followers_count":431,"friends_count":332,"listed_count":46,"created_at":"Mon Apr 08 19:24:16 +0000 
2013","favourites_count":55859,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":48166,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000219181195\/6f058d9d850dea87628dd57bad3d195b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000219181195\/6f058d9d850dea87628dd57bad3d195b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1337397708\/1365696869","profile_link_color":"93A644","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221778567168,"id_str":"365611221778567168","text":"Jd kpn kita kumpul?ayu raya ke3 plg supen\"@Jessica_Rosadi: Minal 'aidin walfaidzin @DinaSAndriani @ReskykaAyu @fellaaleste @langka_pipi\"","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":174960683,"id_str":"174960683","name":"Sundari","screen_name":"sunDin_Ak","location":"Sei Penuh - Jambi - 
Indonesia","url":"http:\/\/twitter.com\/sunDin_Ak","description":"NTSDLL | Mathematic's Education STAIN Kerinci","protected":false,"followers_count":849,"friends_count":402,"listed_count":1,"created_at":"Thu Aug 05 07:30:29 +0000 2010","favourites_count":31,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":16104,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/680251280\/72b158a456f42ca9b90e1fae26f1f270.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/680251280\/72b158a456f42ca9b90e1fae26f1f270.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241494915\/f35d6c9668b3e5840328af7ebddf8aa5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241494915\/f35d6c9668b3e5840328af7ebddf8aa5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/174960683\/1375124803","profile_link_color":"FF0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Jessica_Rosadi","name":"Jessica R Frastica","id":1589995980,"id_str":"1589995980","indices":[42,57]},{"screen_name":"DinaSAndriani","name":"DINASeptiaANDRIANI\u2122","id":469063606,"id_str":"469063606","indices":[84,98]},{"screen_name":"ReskykaAyu","name":"Ayu Reskyka Putri.A","id":176002217,"id_str":"176002217","indices":[99,110]},{"screen_name":"fellaaleste","name":"Fella Lestesia 
Vina","id":426371858,"id_str":"426371858","indices":[111,123]},{"screen_name":"langka_pipi","name":"Nopiii","id":237711892,"id_str":"237711892","indices":[124,136]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221807927296,"id_str":"365611221807927296","text":"\"@Cris9Cristina: Qu\u00e9 mejor s\u00e1bana que t\u00fa.\"","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":507874573,"id_str":"507874573","name":"irie","screen_name":"crisloppez","location":"ROAD TO ZION","url":"http:\/\/ask.fm\/crisloppez","description":"Simpatizante de la cultura Rastafari. Comunista marxista y libertaria. Roja. \u00bfPerroflauta...? Idealista. Futura fil\u00f3sofa, espero querido Wert.","protected":false,"followers_count":697,"friends_count":401,"listed_count":2,"created_at":"Tue Feb 28 20:26:07 +0000 
2012","favourites_count":4397,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":36137,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000007503857\/83c027e09d3e0ae8fccbb798e2639cef.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000007503857\/83c027e09d3e0ae8fccbb798e2639cef.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000076919968\/90c808fd91eaa10de8621c02a94449af_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000076919968\/90c808fd91eaa10de8621c02a94449af_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/507874573\/1375563877","profile_link_color":"FF0303","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Cris9Cristina","name":"Cris\u2020ina","id":556048253,"id_str":"556048253","indices":[1,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221816328193,"id_str":"365611221816328193","text":"as soon as I heard Bless The Broken Road I said to Em that it was Rascal Flatts, I just love them","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":269883839,"id_str":"269883839","name":"gabs","screen_name":"gablaaajb","location":"","url":"http:\/\/lif3-happens.tumblr.com","description":"Better to be hated, than loved for what you're not.","protected":false,"followers_count":324,"friends_count":286,"listed_count":0,"created_at":"Mon Mar 21 16:35:25 +0000 2011","favourites_count":31,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":18631,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"E9F4CA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046676765\/f1128e06812c1e5f51eba325ea8c14cf.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046676765\/f1128e06812c1e5f51eba325ea8c14cf.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261056508\/9b4643ebea60e453d7d671e99785aea6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261056508\/9b4643ebea60e453d7d671e99785aea6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/269883839\/1375899834","profile_link_color":"4FA3DB","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"BBBE9F","profile_text_color":"89B59C","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221782765569,"id_str":"365611221782765569","text":"RT @_sofiamendes: @PsiuLums_ vai \u00f4 obesa! JSOSJSOSJS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":444013607,"id_str":"444013607","name":"Pequena \u270c","screen_name":"PsiuLums_","location":"Far Far Way","url":null,"description":"S\u00f3 \u00e9 feliz quem sabe o que quer \u270c SPFC \u2764\u2764","protected":false,"followers_count":191,"friends_count":150,"listed_count":0,"created_at":"Thu Dec 22 20:00:23 +0000 2011","favourites_count":300,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":6880,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000039745529\/1abdcd3bca01cc07b7c73d98e3a4c0d9.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000039745529\/1abdcd3bca01cc07b7c73d98e3a4c0d9.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261788598\/cfb52bec2ee5d566724254e021315a17_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261788598\/cfb52bec2ee5d566724254e021315a17_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/444013607\/1376001838","profile_link_color":"FF0A0A","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,
"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:11 +0000 2013","id":365611173443420160,"id_str":"365611173443420160","text":"@PsiuLums_ vai \u00f4 obesa! JSOSJSOSJS","source":"web","truncated":false,"in_reply_to_status_id":365610062439391232,"in_reply_to_status_id_str":"365610062439391232","in_reply_to_user_id":444013607,"in_reply_to_user_id_str":"444013607","in_reply_to_screen_name":"PsiuLums_","user":{"id":545216922,"id_str":"545216922","name":"Am\u00f4 \u2661","screen_name":"_sofiamendes","location":"Brasil","url":null,"description":"Pisciana, 13 anos. Melhor amiga \u2665 Uma hist\u00f3ria escrita pelo dedo de Deus \u266b Eu te quero s\u00f3 pra mim, como as ondas s\u00e3o do mar! Eu te amo @_MatheusSantan2 \u2661","protected":false,"followers_count":384,"friends_count":556,"listed_count":0,"created_at":"Wed Apr 04 13:37:50 +0000 2012","favourites_count":16,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":10755,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"141717","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047454825\/9805ec0b9ceda7a955748968b451ac74.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047454825\/9805ec0b9ceda7a955748968b451ac74.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000227394306\/649308b1e0d7b6df426ece37edc32c74_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000227394306\/649308b1e0d7b6df426ece37edc32c74_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/545216922\/1375880288","profile_link_color":"9005FA","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"follow
ing":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":{"id":"68e019afec7d0ba5","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/68e019afec7d0ba5.json","place_type":"city","name":"S\u00e3o Paulo","full_name":"S\u00e3o Paulo, S\u00e3o Paulo","country_code":"BR","country":"Brasil","bounding_box":{"type":"Polygon","coordinates":[[[-46.826038999999994,-24.008813999999997],[-46.826038999999994,-23.356792],[-46.365052,-23.356792],[-46.365052,-24.008813999999997]]]},"attributes":{}},"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"PsiuLums_","name":"Pequena \u270c","id":444013607,"id_str":"444013607","indices":[0,10]}]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_sofiamendes","name":"Am\u00f4 \u2661","id":545216922,"id_str":"545216922","indices":[3,16]},{"screen_name":"PsiuLums_","name":"Pequena \u270c","id":444013607,"id_str":"444013607","indices":[18,28]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221778575360,"id_str":"365611221778575360","text":"tren #mtvhottest One Direction","source":"\u003ca href=\"http:\/\/www.samsungmobile.com\" rel=\"nofollow\"\u003eSamsung Mobile\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1428232459,"id_str":"1428232459","name":"MyNaughtyBoyIsZayn","screen_name":"267Patrick","location":"","url":null,"description":"#Directioner #UltrAslan\r\nJustinBieber-BridgitMendler-AustinMahone-SelenaGomez-DemiLovato-MileyCyrus-AvrilLavigne","protected":false,"followers_count":1094,"friends_count":1088,"listed_count":0,"created_at":"Tue May 14 16:08:26 +0000 
2013","favourites_count":598,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3923,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035308608\/32404e043b870772d337a6af11d2e884.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035308608\/32404e043b870772d337a6af11d2e884.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000209161178\/d5145c0e25d051a67c326596f3df76a1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000209161178\/d5145c0e25d051a67c326596f3df76a1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1428232459\/1375114663","profile_link_color":"AB2CEB","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"mtvhottest","indices":[5,16]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221786959872,"id_str":"365611221786959872","text":"\u7dca\u6025\u5730\u9707\u901f\u5831\u3067\u300cYahoo! 
JAPAN\u300d\u306b\u30a2\u30af\u30bb\u30b9\u6025\u5897--3.11\u3092\u8d85\u3048\u308b\u9ad8\u8ca0\u8377 http:\/\/t.co\/6fpDDD74uz @cnet_japan\u3055\u3093\u304b\u3089\n\n\u4f55\u304b\u3042\u3063\u305f\u3089\u3001\u307e\u305a\u306fYahoo\u3063\u3066\u3053\u3068\u306a\u306e\u304b\u306a\u30fb\u30fb","source":"\u003ca href=\"http:\/\/twitter.com\/tweetbutton\" rel=\"nofollow\"\u003eTweet Button\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":6305592,"id_str":"6305592","name":"SINZY@\u610f\u5fd7\u306e\u306a\u3044\u30dc\u30fc\u30eb\u306f\u6253\u305f\u308c\u308b","screen_name":"sinzy","location":"Jingu Stadium JAPAN","url":"http:\/\/www.youtube.com\/user\/sinzychiba","description":"\u76ee\u6307\u305b \u30a2\u30b8\u30a2No.1! Go! Go! SWALLOWS!! \u30b9\u30ef\u30ed\u30fc\u30ba\u597d\u304d\uff06\u8857\u304c\u597d\u304d\u306e\u30b3\u30f3\u30d4\u30e5\u30fc\u30bf\u5c4b\u3055\u3093\u517c\u307e\u3061\u89b3\u5bdf\u7814\u7a76\u5bb6\u3002\u8da3\u5473\u306f\u6563\u6b69\u3002\u597d\u304d\u306a\u30b2\u30fc\u30e0\u306f\u30b8\u30e3\u30f3\u30b1\u30f3\u3002\u751f\u6d3b\u306e\u5fc3\u60c5\u306f\u3001Simple is Best! 
Google+ \u3082\u306f\u3058\u3081\u307e\u3057\u305f\u3002\u3053\u3063\u3061\u3082\u30d5\u30a9\u30ed\u30fc\u3057\u3066\u306d\uff01 http:\/\/goo.gl\/W2MM0","protected":false,"followers_count":1522,"friends_count":1981,"listed_count":73,"created_at":"Fri May 25 07:16:07 +0000 2007","favourites_count":33,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":67751,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000024264296\/5ed45a6ec3d92ee4cead25e885444148.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000024264296\/5ed45a6ec3d92ee4cead25e885444148.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1774615778\/y3181-3244-9070073_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1774615778\/y3181-3244-9070073_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/6305592\/1370791176","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"AEC1E8","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/6fpDDD74uz","expanded_url":"http:\/\/japan.cnet.com\/news\/service\/35035751\/","display_url":"japan.cnet.com\/news\/service\/3\u2026","indices":[42,64]}],"user_mentions":[{"screen_name":"cnet_japan","name":"CNET Japan","id":17081623,"id_str":"17081623","indices":[65,76]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221782769664,"id_str":"365611221782769664","text":"Por fin, feriaaaaaa!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":808683384,"id_str":"808683384","name":"Laura Garc\u00eda ","screen_name":"LauraGarciaV94","location":"Navalmoral-Granja de thosa ","url":null,"description":"Estudiante de ADE y CCT en Badajoz y Ruque\u00f1a. Ten la virtud de saber esperar, porque todo lo que tenga que ser ser\u00e1 :)","protected":false,"followers_count":259,"friends_count":224,"listed_count":0,"created_at":"Fri Sep 07 11:25:41 +0000 2012","favourites_count":161,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":1253,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2891017093\/a4ba26f6b604de4eaf07d832483bcef1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2891017093\/a4ba26f6b604de4eaf07d832483bcef1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/808683384\/1351941020","profile_link_color":"CC3366","profile_sidebar_border_color":"DBE9ED","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions
":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221778567170,"id_str":"365611221778567170","text":"@upsmyhazza fback?(:","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":201476608,"in_reply_to_user_id_str":"201476608","in_reply_to_screen_name":"upsmyhazza","user":{"id":396639427,"id_str":"396639427","name":"\u2654 Queen \u2654","screen_name":"_itsnearlyover","location":"","url":null,"description":"\u2727\u2726 who says you're not perfect who says you're not worth it who says you're not pretty who says you're not beautiful.. who says? \u2727\u2726","protected":false,"followers_count":560,"friends_count":565,"listed_count":27,"created_at":"Sun Oct 23 15:24:57 +0000 2011","favourites_count":44,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":1432,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045501879\/fb4e2ee2111cfc6f7ef9d2122b5bc0c5.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045501879\/fb4e2ee2111cfc6f7ef9d2122b5bc0c5.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248325724\/e4d6546985d04d004f987c7b8db995cc_normal.gif","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248325724\/e4d6546985d04d004f987c7b8db995cc_normal.gif","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/396639427\/1375759200","profile_link_color":"BC7DF0","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"no
tifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"upsmyhazza","name":"ashton","id":201476608,"id_str":"201476608","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221795356672,"id_str":"365611221795356672","text":"\u30fb\u30fb\u30fb\u305d\u3093\u306a\u6016\u3044\u9854\u3057\u3066 \u3069\u3046\u3057\u305f\u306e\u30fb\u30fb\u30fb\uff1f","source":"\u003ca href=\"http:\/\/twittbot.net\/\" rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1250889236,"id_str":"1250889236","name":"\u904e\u53bb\u306e\u6b8b\u50cf","screen_name":"lucimogmog_yami","location":"\u6614\u306e\u5f71\u306e\u4e2d","url":null,"description":null,"protected":false,"followers_count":0,"friends_count":0,"listed_count":1,"created_at":"Fri Mar 08 06:06:16 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":7134,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/809156021\/a7999c086125346c817fa9292803e5db.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/809156021\/a7999c086125346c817fa9292803e5db.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3352183710\/55cad739608cce32fbd87d798980cf40_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3352183710\/55cad739608cce32fbd87d798980cf40_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1250889236\/1362723257","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221803732994,"id_str":"365611221803732994","text":"\u304a\u306f\u3088\u3046\u3054\u3056\u3044\u307e\u305b\u3093 \u3084\u3063\u3068\u7720\u304f\u306a\u3063\u3066\u304d\u305f\u304b\u3089\u304a\u3084\u3059\u307f\u306a\u3055\u3044","source":"\u003ca href=\"http:\/\/janetter.net\/\" rel=\"nofollow\"\u003eJanetter Pro for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":167992055,"id_str":"167992055","name":"\u30ca\u30a4\u30fc\u30f4","screen_name":"naive_SDVX","location":"","url":"http:\/\/www.pixiv.net\/member.php?id=6437903","description":"\u3000","protected":false,"followers_count":7221,"friends_count":608,"listed_count":177,"created_at":"Sun Jul 18 02:51:02 +0000 2010","favourites_count":11214,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":155147,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044042847\/05a054bed4292c8634ae9163f378cf66.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044042847\/05a054bed4292c8634ae9163f378cf66.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000239298476\/05dfad411568a187f405e6a30881e154_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000239298476\/05dfad411568a187f405e6a30881e154_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/167992055\/1374227748","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221782765568,"id_str":"365611221782765568","text":"Aqui qee me voy pa la cama! # BN","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1479864445,"id_str":"1479864445","name":"\u10e6Biebernator\u10e6","screen_name":"GirlKidrahul","location":"Theatre Avon.","url":null,"description":"Nadie me avergonzar\u00e1 de ser belieber, al contrario, estoy orgullosa de ser fan de un chico que cree en sus sue\u00f1os. @justinbieber","protected":false,"followers_count":342,"friends_count":676,"listed_count":0,"created_at":"Mon Jun 03 14:32:29 +0000 2013","favourites_count":168,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3725,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000185749491\/9c6b90f7f50ac496b824a71df1bea153_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000185749491\/9c6b90f7f50ac496b824a71df1bea153_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1479864445\/1370640499","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"u
rls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221791145984,"id_str":"365611221791145984","text":"@Teamo_cocacola 51128298 o\/\/","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610551688183808,"in_reply_to_status_id_str":"365610551688183808","in_reply_to_user_id":303903281,"in_reply_to_user_id_str":"303903281","in_reply_to_screen_name":"Teamo_cocacola","user":{"id":299796154,"id_str":"299796154","name":"Te pe ache i ese.","screen_name":"Tephisss_","location":"Guatemala","url":null,"description":"Si seguir a Dios fuera tan f\u00e1cil, la recompensa no ser\u00eda tan grande; Luz y Vida\u2665; s\u00faper fan\u00e1tica del Chocolate\u2665.","protected":false,"followers_count":251,"friends_count":238,"listed_count":0,"created_at":"Mon May 16 17:55:05 +0000 2011","favourites_count":177,"utc_offset":-21600,"time_zone":"Mountain Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":11803,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"15D2EB","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/792802998\/c3d90855114eece350a3aeda1c262bd0.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/792802998\/c3d90855114eece350a3aeda1c262bd0.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000003363486\/62add827597e331a93dce79f0dd6b5ca_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000003363486\/62add827597e331a93dce79f0dd6b5ca_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/299796154\/1361060938","profile_link_color":"26E314","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"D817FF","profile_text_color":"6BFC17","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Teamo_cocacola","name":"De Pa\u00f9lTorres, \u2665.","id":303903281,"id_str":"303903281","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221791162368,"id_str":"365611221791162368","text":"People are awesome!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":29725871,"id_str":"29725871","name":"Chelsea Gockel","screen_name":"bellacd32","location":"Denver, 
CO","url":null,"description":"Lover of all that life is. Aspiring to do beautiful, magical things through inspiring others.","protected":false,"followers_count":142,"friends_count":201,"listed_count":2,"created_at":"Wed Apr 08 14:26:54 +0000 2009","favourites_count":15,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":321,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000046340601\/a3805154b5f086828c040e86d73b586a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000046340601\/a3805154b5f086828c040e86d73b586a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/29725871\/1374683360","profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221803732993,"id_str":"365611221803732993","text":"truce","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":325039314,"id_str":"325039314","name":"Paul D","screen_name":"PDonnison","location":"liverpool","url":null,"description":"Stop moaning.","protected":false,"followers_count":185,"friends_count":284,"listed_count":0,"created_at":"Mon Jun 27 16:53:52 +0000 2011","favourites_count":255,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":2037,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000059225027\/0a771ba6e56e481c7bac5975b9c599ff_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000059225027\/0a771ba6e56e481c7bac5975b9c599ff_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/325039314\/1373918032","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221812117504,"id_str":"365611221812117504","text":"@_Louieeeeeeeeee hoe you know i got robbed fah my iPhone","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" 
rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611044401459201,"in_reply_to_status_id_str":"365611044401459201","in_reply_to_user_id":499259105,"in_reply_to_user_id_str":"499259105","in_reply_to_screen_name":"_Louieeeeeeeeee","user":{"id":1013449405,"id_str":"1013449405","name":"Truey\u2708","screen_name":"Ayoo_Tyrone","location":"IG:Ballout_Truey","url":null,"description":null,"protected":false,"followers_count":332,"friends_count":316,"listed_count":0,"created_at":"Sat Dec 15 15:40:48 +0000 2012","favourites_count":255,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":9868,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"030303","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/861221065\/ab41fb9abdf6d1f380b48036f0e25449.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/861221065\/ab41fb9abdf6d1f380b48036f0e25449.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262062303\/a39f46c40ac48041a2d761a01bfda359_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262062303\/a39f46c40ac48041a2d761a01bfda359_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1013449405\/1373927862","profile_link_color":"B30000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_Louieeeeeeeeee","name":"S I N G L E 
","id":499259105,"id_str":"499259105","indices":[0,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221803741185,"id_str":"365611221803741185","text":"@algetheclown WHAT?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610881637302272,"in_reply_to_status_id_str":"365610881637302272","in_reply_to_user_id":21496705,"in_reply_to_user_id_str":"21496705","in_reply_to_screen_name":"algetheclown","user":{"id":414503048,"id_str":"414503048","name":"Tori Powell","screen_name":"t_powww","location":"","url":null,"description":"The only person you should try to be better than is the person you were yesterday. Instagram- @t_powww","protected":false,"followers_count":556,"friends_count":424,"listed_count":0,"created_at":"Thu Nov 17 03:57:16 +0000 2011","favourites_count":2635,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":9627,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/543020445\/glitter.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/543020445\/glitter.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000143699901\/24443050f5a059625d6a1b4d34c79076_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000143699901\/24443050f5a059625d6a1b4d34c79076_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/414503048\/1373683459","profile_link_color":"E8378D","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":fal
se,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"algetheclown","name":"Mikey Jr","id":21496705,"id_str":"21496705","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221807923201,"id_str":"365611221807923201","text":"@LUHANEKSO90 gak gitu juga;-;)\/","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365479040024444928,"in_reply_to_status_id_str":"365479040024444928","in_reply_to_user_id":738172880,"in_reply_to_user_id_str":"738172880","in_reply_to_screen_name":"LUHANEKSO90","user":{"id":802019378,"id_str":"802019378","name":"Janda Galak","screen_name":"fxjsvj","location":"KaumLabil;F\u0394VOR;SemetonBali","url":null,"description":"\uc815\uc218\uc815 [\ud06c\ub9ac\uc2a4\ud0c8 \uc815] \u3161 94\ub77c\uc778 \u3161 \ud5ec\ub85c!\ub9c8\ub9ac \ubca8\ud14c\ub9cc :3 [PJT74] Duo Jungdet with @JungKrys_B","protected":false,"followers_count":1617,"friends_count":1531,"listed_count":1,"created_at":"Tue Sep 04 08:04:40 +0000 
2012","favourites_count":115,"utc_offset":28800,"time_zone":"Beijing","geo_enabled":false,"verified":false,"statuses_count":16774,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000006268427\/2b42b7f9021ce507ad92bdd700aa8183.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000006268427\/2b42b7f9021ce507ad92bdd700aa8183.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000221692788\/bf1f41974fe424fb56d15656a608afa5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000221692788\/bf1f41974fe424fb56d15656a608afa5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/802019378\/1375305332","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"LUHANEKSO90","name":"GeHan","id":738172880,"id_str":"738172880","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221812129792,"id_str":"365611221812129792","text":"@kidrauhlsmusic6 oooooooooomggg","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web 
(M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610276743159809,"in_reply_to_status_id_str":"365610276743159809","in_reply_to_user_id":1289963725,"in_reply_to_user_id_str":"1289963725","in_reply_to_screen_name":"KidrauhlsMusic6","user":{"id":1641570583,"id_str":"1641570583","name":"avalanna justin 333","screen_name":"avalannajustin3","location":"all around the world","url":null,"description":"I love u beliebers. I love you so much Justin Bieber!","protected":false,"followers_count":35,"friends_count":112,"listed_count":0,"created_at":"Fri Aug 02 23:18:31 +0000 2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":194,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_2_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_2_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"KidrauhlsMusic6","name":"TODAY!!!!!!","id":1289963725,"id_str":"1289963725","indices":[0,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221786955777,"id_str":"365611221786955777","text":"RT\u201c@_LivingLavish01: 
\ud83d\ude1a\ud83d\udca8\ud83c\udf43\u201d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610935236300801,"in_reply_to_status_id_str":"365610935236300801","in_reply_to_user_id":441833359,"in_reply_to_user_id_str":"441833359","in_reply_to_screen_name":"_LivingLavish01","user":{"id":237134647,"id_str":"237134647","name":"Dreads2Legit","screen_name":"Exuberant_One","location":"Doing me\u2122","url":null,"description":"You dont gotta run and tell nobody they already know ....I am ME \u261d#Real IG: Kyy_b","protected":false,"followers_count":1109,"friends_count":1489,"listed_count":2,"created_at":"Wed Jan 12 04:29:09 +0000 2011","favourites_count":36,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":50572,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256500380\/50037f9061af2a17d227001ac8a6b0b1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256500380\/50037f9061af2a17d227001ac8a6b0b1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/237134647\/1373396775","profile_link_color":"0084B4","profile_sidebar_border_color":"A9C5F5","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_LivingLavish01","n
ame":"Black Beauty\u2122","id":441833359,"id_str":"441833359","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"sv"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221778579456,"id_str":"365611221778579456","text":"@Roro_65Playuhh I'm playing the game. Text me","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611104702963712,"in_reply_to_status_id_str":"365611104702963712","in_reply_to_user_id":211394148,"in_reply_to_user_id_str":"211394148","in_reply_to_screen_name":"Roro_65Playuhh","user":{"id":303521880,"id_str":"303521880","name":"Layla Marie","screen_name":"SincerelyTiny_","location":"MikeyG \u2764","url":null,"description":"every saint has a past, every sinner has a future.","protected":false,"followers_count":1519,"friends_count":1072,"listed_count":1,"created_at":"Mon May 23 01:08:09 +0000 2011","favourites_count":362,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":77034,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/675288227\/c69d262fe6800e4224e4e718e63ccbb4.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/675288227\/c69d262fe6800e4224e4e718e63ccbb4.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000243260991\/709da08c937019b9d66b985b7352f818_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000243260991\/709da08c937019b9d66b985b7352f818_normal.jpeg","profile_link_color":"93A644","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"f
ollowing":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Roro_65Playuhh","name":"RoyalKlan|63","id":211394148,"id_str":"211394148","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221803732992,"id_str":"365611221803732992","text":"Weird people are my kind if people","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1279086366,"id_str":"1279086366","name":"thursday","screen_name":"tottyanna_","location":"","url":"http:\/\/ask.fm\/xttnxx","description":"miles away","protected":false,"followers_count":172,"friends_count":98,"listed_count":1,"created_at":"Tue Mar 19 00:20:23 +0000 
2013","favourites_count":1810,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":6434,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042239135\/c212fa959c9455ec5168ec81db6a9260.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042239135\/c212fa959c9455ec5168ec81db6a9260.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000243398911\/4a74bd8f46b5ffb4c0473722376790ae_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000243398911\/4a74bd8f46b5ffb4c0473722376790ae_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1279086366\/1375040649","profile_link_color":"F0B4D1","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221816328192,"id_str":"365611221816328192","text":"@Daq_design \uc548\ub155\ud558\uc138\uc694 \ud558\ub85c\ub85c \ud301\uc2a4\ud130 \uc785\ub2c8\ub2e4^^\n\uce74\ud1a1 \uce5c\ucd94\ud6c4 \uac00\uc871\uacf5\uc6d0 \uc774\uc6a9\ud558\uc2dc\uba74 \uc8fc\ub825\ud53d(\ud578\ub514,\uc624\ubc84\uc5b8\ub354) \ub4dc\ub9bd\ub2c8\ub2e4.\n\uce74\ud1a1ID : Lambor11\n\ube14\ub85c\uadf8 : http:\/\/t.co\/SHzUbbP3yM","source":"\u003ca href=\"http:\/\/twitaddons.com\" 
rel=\"nofollow\"\u003etwitaddons\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":218786327,"in_reply_to_user_id_str":"218786327","in_reply_to_screen_name":"Daq_design","user":{"id":1595045299,"id_str":"1595045299","name":"\ud558\ub85c\ub85c","screen_name":"haroro100","location":"","url":null,"description":"\u2605 All Sports \/ Under,Over \/ Handy \ubb34\ub8ccTip \uc81c\uacf5 \u2605 http:\/\/blog.naver.com\/dede11kk \ud504\ub85c\uc81d\ud2b8 \uc804\ubb38 \ud53d\uc2a4\ud130","protected":false,"followers_count":0,"friends_count":0,"listed_count":2,"created_at":"Mon Jul 15 05:02:13 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":65,"lang":"ko","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000136299793\/a66da422630e4e065a97e5920bf9c505_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000136299793\/a66da422630e4e065a97e5920bf9c505_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/SHzUbbP3yM","expanded_url":"http:\/\/blog.naver.com\/dede11kk","display_url":"blog.naver.com\/dede11kk","indices":[91,113]}],"user_mentions":[{"screen_name":"Daq_design","name":"1\/29\uc77c \ubd80\ud130 
\uad70\uc778","id":218786327,"id_str":"218786327","indices":[0,11]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ko"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221782769665,"id_str":"365611221782769665","text":"@SonicHomicide happy birthday :-)","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610223077031936,"in_reply_to_status_id_str":"365610223077031936","in_reply_to_user_id":897539276,"in_reply_to_user_id_str":"897539276","in_reply_to_screen_name":"SonicHomicide","user":{"id":939459444,"id_str":"939459444","name":"Baron Shuttlecock","screen_name":"shinybellpiece","location":"Deepest Sprayberry","url":null,"description":"Mister Prince Fucking Charming Shuttlecock. Metal, horror, alcohol, loathing, endearing misanthropy.","protected":false,"followers_count":1342,"friends_count":1910,"listed_count":12,"created_at":"Sat Nov 10 16:55:14 +0000 
2012","favourites_count":8909,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":31966,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000233163360\/05ba89422ab0b572705fe479f5962404_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000233163360\/05ba89422ab0b572705fe479f5962404_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/939459444\/1364507227","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[39.18813,-77.2589516]},"coordinates":{"type":"Point","coordinates":[-77.2589516,39.18813]},"place":{"id":"0b8f86488a6ebab7","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/0b8f86488a6ebab7.json","place_type":"city","name":"Germantown","full_name":"Germantown, MD","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-77.302544,39.145393],[-77.302544,39.206756],[-77.220329,39.206756],[-77.220329,39.145393]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SonicHomicide","name":"Toxic Avenger.","id":897539276,"id_str":"897539276","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221778567169,"id_str":"365611221778567169","text":"\u3010\u5b9a\u671f\u3011\u4e8c\u6b21\u5143\u306f\u9178\u7d20","source":"\u003ca href=\"http:\/\/twittbot.net\/\" rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":891965516,"id_str":"891965516","name":"\u307f\u3043\u306b\u3083\u3093\u306f\u5e30\u7701\u306e\u305f\u308116\u65e5\u307e\u3067\u30c4\u30a4\u6e1b","screen_name":"heppoko13","location":"\u8ab0\u304b\u306e\u96a3( \u02d8\u03c9\u02d8 )","url":null,"description":"\u4e2d2\u270c\u3057\u3087\u305f\u3053\u3093\u25c7\u8150\u5973\u5b50\u25c7\u5984\u60f3\u3001\u304a\u7d75\u63cf\u304d\u306e\u65e5\u3005\u3002\r\nH\u00d7H\u3001\u9280\u9b42\u3001Free!\u306a\u3069\u611b\u3057\u3066\u307e\u3059(\/\/\u25dc\u25d2\u25dd\/\/)\u4e0b\u30c4\u30a4\u591a\u3081\r\n\u8a73\u7d30\u261ehttp:\/\/twpf.jp\/heppoko13\u3000\u898f\u5236\u57a2\u261e@heppoko13_2\u3000\u30b3\u30fc\u30e9bot\u261e@colacola_bot","protected":false,"followers_count":275,"friends_count":376,"listed_count":1,"created_at":"Fri Oct 19 22:26:26 +0000 
2012","favourites_count":59,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":6823,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme4\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme4\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000126533693\/43d006ee9172015c18c0a40a82a2fd69_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000126533693\/43d006ee9172015c18c0a40a82a2fd69_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/891965516\/1368623378","profile_link_color":"0099B9","profile_sidebar_border_color":"5ED4DC","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221795352576,"id_str":"365611221795352576","text":"RT @MubzStar: \u201c@_aminaaden: The amount of guys I've seen with them red Jordan's. 
I actually can't stand them anymore \ud83d\ude37\u201d Most Jordan's are s\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":91151069,"id_str":"91151069","name":"aminaa","screen_name":"_aminaaden","location":"","url":null,"description":"london.","protected":false,"followers_count":1674,"friends_count":463,"listed_count":31,"created_at":"Thu Nov 19 17:39:14 +0000 2009","favourites_count":1452,"utc_offset":3600,"time_zone":"Casablanca","geo_enabled":true,"verified":false,"statuses_count":30074,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/714335879\/4100273d4bac43c73588595880ed70c7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/714335879\/4100273d4bac43c73588595880ed70c7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259852454\/f395053c0df833328719cba9766e41b1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259852454\/f395053c0df833328719cba9766e41b1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/91151069\/1375791271","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:12 +0000 
2013","id":365611178623385600,"id_str":"365611178623385600","text":"\u201c@_aminaaden: The amount of guys I've seen with them red Jordan's. I actually can't stand them anymore \ud83d\ude37\u201d Most Jordan's are shit tbh","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365607995230519296,"in_reply_to_status_id_str":"365607995230519296","in_reply_to_user_id":91151069,"in_reply_to_user_id_str":"91151069","in_reply_to_screen_name":"_aminaaden","user":{"id":519452954,"id_str":"519452954","name":"\u0645\u0628\u0627\u0631\u0643","screen_name":"MubzStar","location":"East London","url":null,"description":"Free Waseem Free Kasim","protected":false,"followers_count":223,"friends_count":170,"listed_count":0,"created_at":"Fri Mar 09 12:41:51 +0000 2012","favourites_count":66,"utc_offset":3600,"time_zone":"Casablanca","geo_enabled":true,"verified":false,"statuses_count":2134,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/640774567\/46dyjiexu06ksuojur2c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/640774567\/46dyjiexu06ksuojur2c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000129634899\/6925ecd49072e42ed64d0a38438d5116_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000129634899\/6925ecd49072e42ed64d0a38438d5116_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/519452954\/1362345885","profile_link_color":"FF0000","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notificat
ions":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_aminaaden","name":"aminaa","id":91151069,"id_str":"91151069","indices":[1,12]}]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MubzStar","name":"\u0645\u0628\u0627\u0631\u0643","id":519452954,"id_str":"519452954","indices":[3,12]},{"screen_name":"_aminaaden","name":"aminaa","id":91151069,"id_str":"91151069","indices":[15,26]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221786955776,"id_str":"365611221786955776","text":"@kicktheanicka my drafts are all @ replying jack barakat linking him to smut hA","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610946099548161,"in_reply_to_status_id_str":"365610946099548161","in_reply_to_user_id":1199051972,"in_reply_to_user_id_str":"1199051972","in_reply_to_screen_name":"kicktheanicka","user":{"id":609516604,"id_str":"609516604","name":"kate","screen_name":"pj_licorice","location":"","url":"http:\/\/m.fanfiction.net\/s\/8545380\/1\/Dipper-goes-to-Taco-Bell","description":"*jack barakat voice* eggcellent","protected":false,"followers_count":482,"friends_count":1065,"listed_count":1,"created_at":"Fri Jun 15 23:16:19 +0000 
2012","favourites_count":2971,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":8130,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258052025\/dcecf96798fbf70b66f841f5871463c4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258052025\/dcecf96798fbf70b66f841f5871463c4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/609516604\/1375737494","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"kicktheanicka","name":"princess zelda","id":1199051972,"id_str":"1199051972","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221791150080,"id_str":"365611221791150080","text":"\u3010\u5b9a\u671f\u3011\u30b9\u30ab\u30a4\u30d7\u3001LINE\u3001\u3084\u3063\u3066\u307e\u3059\u3002ID\u306a\u3069\u6c17\u8efd\u306b\u805e\u3044\u3066\u304f\u3060\u3055\u3044\u3002","source":"\u003ca href=\"http:\/\/twittbot.net\/\" 
rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":390549578,"id_str":"390549578","name":"Riin@\u308a\u3044\u3093","screen_name":"reinforce05","location":"\u7bb1\u306e\u4e2d","url":"http:\/\/com.nicovideo.jp\/community\/co1715977","description":"\u8ab0\u304bSA\u306eSR\u6559\u3048\u3066\u304f\u3060\u3055\u3044\uff08\u5207\u5b9f\u3000\u6c17\u8efd\u306b\u30d5\u30a9\u30ed\u30fc\u3057\u3066\u304f\u3060\u3055\u3044\u3000http:\/\/twpf.jp\/reinforce05","protected":false,"followers_count":219,"friends_count":196,"listed_count":6,"created_at":"Fri Oct 14 04:37:42 +0000 2011","favourites_count":353,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":true,"verified":false,"statuses_count":29297,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3058641956\/4caebe8d8afbe0a3dd127bcfea25ae86_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3058641956\/4caebe8d8afbe0a3dd127bcfea25ae86_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 
2013","id":365611221812125696,"id_str":"365611221812125696","text":"I have Bonfire on my island! Now my island is even more awesome! http:\/\/t.co\/uDALiQzoiT #android, #androidgames, #gameinsight","source":"\u003ca href=\"http:\/\/bit.ly\/tribez_itw\" rel=\"nofollow\"\u003eThe Tribez for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1450024976,"id_str":"1450024976","name":"dan bjorn","screen_name":"DanBjorndahl","location":"","url":null,"description":null,"protected":false,"followers_count":2,"friends_count":5,"listed_count":0,"created_at":"Wed May 22 22:27:34 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":52,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_1_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_1_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"android","indices":[88,96]},{"text":"androidgames","indices":[98,111]},{"text":"gameinsight","indices":[113,125]}],"urls":[{"url":"http:\/\/t.co\/uDALiQzoiT","expanded_url":"http:\/\/gigam.es\/htw_Tribez","di
splay_url":"gigam.es\/htw_Tribez","indices":[65,87]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221816315904,"id_str":"365611221816315904","text":"@buraunko \u304a\u306f\u3088\u3046\u3002\u79c1\u306f\u307e\u3060\u7720\u3044\u308f\u3041","source":"\u003ca href=\"http:\/\/twitter.com\/tsubaki4913\" rel=\"nofollow\"\u003e\u535a\u9e97\u795e\u793e\u306e\u5c4b\u6839\u88cf\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611108985348097,"in_reply_to_status_id_str":"365611108985348097","in_reply_to_user_id":875538710,"in_reply_to_user_id_str":"875538710","in_reply_to_screen_name":"buraunko","user":{"id":166041632,"id_str":"166041632","name":"\u30ab\u30ca\u30fb\u30a2\u30ca\u30d9\u30e9\u30eb","screen_name":"Anaberal_bot","location":"\u535a\u9e97\u795e\u793e\u306e\u5c4b\u6839\u88cf","url":null,"description":"\u6771\u65b9\u5922\u6642\u7a7a\u3088\u308a\u30ab\u30ca\u30fb\u30a2\u30ca\u30d9\u30e9\u30eb\u306e\u975e\u516c\u5f0fbot\u3067\u3059\u3002\r\n\u8aac\u660e\u9801\u2192http:\/\/anaberalbot.blog.fc2.com\/","protected":false,"followers_count":855,"friends_count":387,"listed_count":65,"created_at":"Tue Jul 13 05:01:59 +0000 
2010","favourites_count":0,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":224504,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1726610045\/111226b_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1726610045\/111226b_normal.jpg","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"buraunko","name":"\u30f4\u30e9\u30a6\u30f3\u7ba1@\u30b7\u30e3\u30eb\u30ed\u30c3\u515a\u54e1","id":875538710,"id_str":"875538710","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221782761473,"id_str":"365611221782761473","text":"@tkareempowell try massage & icing 2x a day. Hope this helps and you feel better soon. 
Sleep well!","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365289713650769920,"in_reply_to_status_id_str":"365289713650769920","in_reply_to_user_id":21793041,"in_reply_to_user_id_str":"21793041","in_reply_to_screen_name":"tkareempowell","user":{"id":431011689,"id_str":"431011689","name":"Blanketbooster","screen_name":"Blanketbooster","location":"Portland, OR","url":"http:\/\/www.blanketbooster.com","description":"We free your feet for a more comfortable night's sleep with the original self-assembling blanket lift. \r\n#neuropathy #arthritis #gout #fibro #footpain","protected":false,"followers_count":45,"friends_count":52,"listed_count":1,"created_at":"Wed Dec 07 20:30:36 +0000 2011","favourites_count":50,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":256,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/879851224\/9f4d2f54e12f2201a572fe7e16d933b7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/879851224\/9f4d2f54e12f2201a572fe7e16d933b7.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3719598100\/d9703dd7b3a80a1aece2ec96b0657487_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3719598100\/d9703dd7b3a80a1aece2ec96b0657487_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/431011689\/1369752217","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":nul
l,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"tkareempowell","name":"T. Kareem Powell","id":21793041,"id_str":"21793041","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221791154176,"id_str":"365611221791154176","text":"\u041f\u043e\u043f\u0440\u043e\u0431\u0443\u0435\u043c, \u043f\u043e\u0441\u043c\u043e\u0442\u0440\u0438\u043c \u0447\u0442\u043e \u0442\u0443\u0442 \u043d\u0430\u0448\u043b\u043e\u0441\u044c http:\/\/t.co\/kEwjac216c \u0443\u0433\u043e\u043b\u043e\u0432\u043d\u044b\u0439 \u043a\u043e\u0434\u0435\u043a\u0441 \u0440\u0444 2011 \u0430\u0432\u0430\u0440\u0438\u044f","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web (M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":537196214,"id_str":"537196214","name":"Anton S","screen_name":"ShotAes","location":"","url":null,"description":null,"protected":false,"followers_count":1,"friends_count":1,"listed_count":0,"created_at":"Mon Mar 26 13:06:18 +0000 
2012","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":47,"lang":"ru","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3599397309\/02b95bd35033820e889129f8a4472e03_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3599397309\/02b95bd35033820e889129f8a4472e03_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/537196214\/1367416859","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/kEwjac216c","expanded_url":"http:\/\/bet.gougouluntan.net\/instruction-7136.html","display_url":"bet.gougouluntan.net\/instruction-71\u2026","indices":[37,59]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ru"} +{"created_at":"Thu Aug 08 23:11:22 +0000 2013","id":365611221816324098,"id_str":"365611221816324098","text":"@Almondii_Joii (12) Slots Open For Background Extras In Drake's Billboard Topper, View This Photo For The Info >> 
http:\/\/t.co\/5t4oEL6LdF","source":"web","truncated":false,"in_reply_to_status_id":365611026806345731,"in_reply_to_status_id_str":"365611026806345731","in_reply_to_user_id":1458410586,"in_reply_to_user_id_str":"1458410586","in_reply_to_screen_name":"Almondii_Joii","user":{"id":1641088304,"id_str":"1641088304","name":"Hannah preston","screen_name":"Boardelleo797","location":"","url":null,"description":"Punctual xbox lover | Amateur xbox nerd | Friendly bacon entrepreneur :) 78819 lover","protected":false,"followers_count":1,"friends_count":0,"listed_count":0,"created_at":"Fri Aug 02 18:45:46 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":6,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"352726","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme5\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme5\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000231413914\/b03d846194782fa3808eb9671829506e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000231413914\/b03d846194782fa3808eb9671829506e_normal.jpeg","profile_link_color":"D02B55","profile_sidebar_border_color":"829D5E","profile_sidebar_fill_color":"99CC33","profile_text_color":"3E4415","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Almondii_Joii","name":"King 
E","id":1458410586,"id_str":"1458410586","indices":[0,14]}],"media":[{"id":365611221824712705,"id_str":"365611221824712705","indices":[120,142],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpeBrCMAE8pOX.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpeBrCMAE8pOX.jpg","url":"http:\/\/t.co\/5t4oEL6LdF","display_url":"pic.twitter.com\/5t4oEL6LdF","expanded_url":"http:\/\/twitter.com\/Boardelleo797\/status\/365611221816324098\/photo\/1","type":"photo","sizes":{"medium":{"w":500,"h":333,"resize":"fit"},"small":{"w":340,"h":226,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":500,"h":333,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981267976,"id_str":"365611225981267976","text":"' \u0645\u0646 \u0628\u064a\u0646 \u0623\u0634\u0628\u0627\u0647\u0643 \u0627\u0644\u0623\u0631\u0628\u0639\u064a\u0646 \u0623\u0631\u064a\u062f\u0643 \u0623\u0646\u062a \u0648\u062a\u0628\u0627 \u0644\u0645\u0627 \u062a\u0628\u0642\u0649.!!! 
'\"\"","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1591873394,"id_str":"1591873394","name":"3mm\u0251\u0280\u026a","screen_name":"3mmari7","location":"Dxb","url":null,"description":null,"protected":false,"followers_count":241,"friends_count":519,"listed_count":0,"created_at":"Sat Jul 13 21:27:00 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":121,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000219362996\/e89d7ec473b12eca46d0f9bbaaa9f133_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000219362996\/e89d7ec473b12eca46d0f9bbaaa9f133_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1591873394\/1375269034","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981255681,"id_str":"365611225981255681","text":"Soit je fume et je me fais cramer 
encore une fois, soit je fume pas et j'\u00e9vite l'embrouille","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":127557566,"id_str":"127557566","name":"Guizmette. \u0394","screen_name":"GrnMelissa","location":"@LauraRnx_ \u2665","url":"http:\/\/ask.fm\/GeeketteMelissa","description":"#TeamGuizmo #TeamOM #TeamBarca","protected":false,"followers_count":919,"friends_count":130,"listed_count":8,"created_at":"Mon Mar 29 15:56:25 +0000 2010","favourites_count":295,"utc_offset":7200,"time_zone":"Paris","geo_enabled":true,"verified":false,"statuses_count":73553,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/768825527\/a3dc528ca12fad54089b5eb28a8fc0c0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/768825527\/a3dc528ca12fad54089b5eb28a8fc0c0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000240945001\/971b0ee9eabfa38527b1f214a5f6b436_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000240945001\/971b0ee9eabfa38527b1f214a5f6b436_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/127557566\/1375447322","profile_link_color":"2FC2EF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"fav
orited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225972867072,"id_str":"365611225972867072","text":"Un loquito que no conozco me quiere hacer una recarga, JAJAJJA","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1529294982,"id_str":"1529294982","name":"Loreley Menoni","screen_name":"MenoniLoreley","location":"","url":null,"description":null,"protected":false,"followers_count":147,"friends_count":206,"listed_count":0,"created_at":"Wed Jun 19 01:18:09 +0000 2013","favourites_count":71,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1081,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000013613773\/24580ea114e182dd0569421ac4c7cd96_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000013613773\/24580ea114e182dd0569421ac4c7cd96_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1529294982\/1371755464","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 
08 23:11:23 +0000 2013","id":365611225972871168,"id_str":"365611225972871168","text":"Eciee hahaha;;)Niisakk: Morning jg fa @usyafira\"","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1391952667,"id_str":"1391952667","name":"Ulfa","screen_name":"USyafira","location":"South Sumatera,Indonesia","url":null,"description":"03sept98|ILoveAllah_ILoveMyParents_ILoveMyFamily (\u02c6\u25bd\u02c6\u0283\u01aa)-Dance is MyWorld (\u02d8\u03b5\u02d8\u0283\u01aa)|@man3plg pin:32E091D2 Fllbck?just mention\u263a","protected":false,"followers_count":491,"friends_count":1060,"listed_count":0,"created_at":"Tue Apr 30 12:22:53 +0000 2013","favourites_count":46,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3458,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262028984\/0df3a6e0bfffea9dc90a898cd093cadf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262028984\/0df3a6e0bfffea9dc90a898cd093cadf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1391952667\/1375682718","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,
"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"USyafira","name":"Ulfa","id":1391952667,"id_str":"1391952667","indices":[38,47]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981259776,"id_str":"365611225981259776","text":"Idk why everybody says its only classes not schedules you have those classes no matter what lol","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":858548360,"id_str":"858548360","name":"marmar","screen_name":"xoxomarria","location":"","url":null,"description":"Inlove with my bestfriend, 41012\u2764","protected":false,"followers_count":146,"friends_count":210,"listed_count":0,"created_at":"Tue Oct 02 19:02:27 +0000 
2012","favourites_count":1794,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":4992,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000193137718\/e213926055814c25b5eddeda0e4904f3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000193137718\/e213926055814c25b5eddeda0e4904f3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/858548360\/1374527364","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981255680,"id_str":"365611225981255680","text":"\u0641\u0642\u0637 \u0641\u064a \u0628\u0631\u064a\u0637\u0627\u0646\u064a\u0627 : \"\u0627\u0630\u0627 \u0623\u062d\u062f \u0639\u0635\u0628 \u0639\u0644\u064a\u0643 \u064a\u0642\u0648\u0644 \" \u0634\u0648\u0641 \u0623\u0646\u0627 \u0645 \u0627\u0628\u064a \u0623\u062d\u0644\u0641 \u0628\u0633 \u0623\u0642\u0633\u0645 \u0628\u0627\u0644\u0644\u0647 \u263a!","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":939298308,"id_str":"939298308","name":"\u0633\u0627\u0631\u0647 \u062c. \u0627\u0644\u0646\u0627\u0635\u0631\u2661 ","screen_name":"SarahAlnasser_","location":"Q8 \u062f\u0627\u0631 \u0633\u0644\u0648\u0649","url":"http:\/\/ask.fm\/SweerjH","description":"- \u0627\u0644\u0644\u0647\u0645 \u0644\u0627 \u062a\u062a\u0648\u0641\u0627\u0646\u064a \u0625\u0644\u0627 \u0648\u0623\u0646\u0627 \u0633\u0627\u062c\u062f\u0647 \u0644\u0643 \u0648\u0623\u062d\u0633\u0646 \u062e\u0627\u062a\u0645\u062a\u064a ! \u0627\u0644\u0637\u0628 \u0637\u0645\u0648\u062d\u064a \u0648\u0627\u0644\u0648\u0639\u062f \u0642\u062f\u0627\u0645 \u2600","protected":false,"followers_count":475,"friends_count":57,"listed_count":0,"created_at":"Sat Nov 10 15:35:06 +0000 2012","favourites_count":265,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":15873,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000207319864\/d67729ca1b70dda69b19ceaff070adfc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000207319864\/d67729ca1b70dda69b19ceaff070adfc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/939298308\/1375509582","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_reque
st_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225993850881,"id_str":"365611225993850881","text":"Lincoln Ca Plumbers: Whats Needed To Be a DIY Home Plumber - Folsom, CA http:\/\/t.co\/qFCnJTRvG3","source":"\u003ca href=\"http:\/\/www.ajaymatharu.com\/\" rel=\"nofollow\"\u003eTweet Old Post\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1651265738,"id_str":"1651265738","name":"Pro Lincoln Plumbers","screen_name":"Lincolnplumbing","location":"Lincoln Ca","url":"http:\/\/lincolnproplumbing.com","description":null,"protected":false,"followers_count":1,"friends_count":27,"listed_count":0,"created_at":"Tue Aug 06 21:17:47 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":63,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251908950\/f9f49c01a5a96b6fead2ec06939f3a99_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251908950\/f9f49c01a5a96b6fead2ec06939f3a99_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1651265738\/1375824996","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/qFCnJTRvG3","expanded_url":"http:\/\/lincolnproplumbing.com\/plumbers-folsom-ca\/","display_url":"lincolnproplumbing.com\/plumbers-folso\u2026","indices":[72,94]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985466369,"id_str":"365611225985466369","text":"spanish 2 is way harder","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":197870531,"id_str":"197870531","name":"brat 
.","screen_name":"BxtchItsHolly","location":"","url":null,"description":"pretty girl , with a mean hustle .","protected":false,"followers_count":91,"friends_count":91,"listed_count":0,"created_at":"Sat Oct 02 17:56:05 +0000 2010","favourites_count":188,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":7008,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/747449292\/e659baa32d86e37c78888118179b5e2c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/747449292\/e659baa32d86e37c78888118179b5e2c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258114442\/15cefb6d20668a585f5b4142dd6d503b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258114442\/15cefb6d20668a585f5b4142dd6d503b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/197870531\/1371926194","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981267972,"id_str":"365611225981267972","text":"RT @KlaudiaHonza: Buenas noches mis lindas LQM\u2764\ud83d\udc8b@PaulaPC_PS @Smith_Navarro @ClubHablemonos @Abhiedabss @TefisLorena @kren0660","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":744238825,"id_str":"744238825","name":"SilviaEsLaMejor\u2661","screen_name":"Smith_Navarro","location":"Venezuela.","url":null,"description":"Silvia Navarro LA MUJER Q MAS AMO EN ESTE MUNDO\u2661 Sonya Smith LA MAS HERMOSA DE TODAS\u2665","protected":false,"followers_count":993,"friends_count":1365,"listed_count":5,"created_at":"Wed Aug 08 02:23:02 +0000 2012","favourites_count":1596,"utc_offset":-16200,"time_zone":"Caracas","geo_enabled":true,"verified":false,"statuses_count":26145,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000023990072\/fc45d0a48b2743ee616ab6ae66fe966e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000023990072\/fc45d0a48b2743ee616ab6ae66fe966e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258460546\/f74fd84f1d5de43d6de21c00c6c627a4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258460546\/f74fd84f1d5de43d6de21c00c6c627a4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/744238825\/1375573385","profile_link_color":"2C6EBF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:49:53 +0000 2013","id":365605812913188864,"id_str":"365605812913188864","text":"Buenas noches mis lindas 
LQM\u2764\ud83d\udc8b@PaulaPC_PS @Smith_Navarro @ClubHablemonos @Abhiedabss @TefisLorena @kren0660","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":353053815,"id_str":"353053815","name":"Klaudia\u2661","screen_name":"KlaudiaHonza","location":"","url":"http:\/\/www.facebook.com\/profile.php?id=100001564174583","description":"Adoro mi rubisita hermosa @Sonya_Smith\u2764 #OrgulloSmithFan \u2764Amo el teatro y la actuaci\u00f3n,Soy una chica sincera con sue\u00f1os","protected":false,"followers_count":637,"friends_count":600,"listed_count":1,"created_at":"Thu Aug 11 13:46:59 +0000 2011","favourites_count":6640,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":40028,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"008000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045710336\/5a0de3c0773647c68827093d6d416a3d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045710336\/5a0de3c0773647c68827093d6d416a3d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244706874\/eb32ceb71527c5a493e53a4bf74921df_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244706874\/eb32ceb71527c5a493e53a4bf74921df_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/353053815\/1370909345","profile_link_color":"008000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_coun
t":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"PaulaPC_PS","name":"Paula ","id":238150039,"id_str":"238150039","indices":[30,41]},{"screen_name":"Smith_Navarro","name":"SilviaEsLaMejor\u2661","id":744238825,"id_str":"744238825","indices":[42,56]},{"screen_name":"ClubHablemonos","name":"Fan Club Hablemonos ","id":991013942,"id_str":"991013942","indices":[57,72]},{"screen_name":"Abhiedabss","name":"Abhie Agapay \u2665","id":236228592,"id_str":"236228592","indices":[73,84]},{"screen_name":"TefisLorena","name":"Estefany Espinoza","id":407937772,"id_str":"407937772","indices":[85,97]},{"screen_name":"kren0660","name":"Karen Castillo","id":170869221,"id_str":"170869221","indices":[98,107]}]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"KlaudiaHonza","name":"Klaudia\u2661","id":353053815,"id_str":"353053815","indices":[3,16]},{"screen_name":"PaulaPC_PS","name":"Paula ","id":238150039,"id_str":"238150039","indices":[48,59]},{"screen_name":"Smith_Navarro","name":"SilviaEsLaMejor\u2661","id":744238825,"id_str":"744238825","indices":[60,74]},{"screen_name":"ClubHablemonos","name":"Fan Club Hablemonos ","id":991013942,"id_str":"991013942","indices":[75,90]},{"screen_name":"Abhiedabss","name":"Abhie Agapay \u2665","id":236228592,"id_str":"236228592","indices":[91,102]},{"screen_name":"TefisLorena","name":"Estefany Espinoza","id":407937772,"id_str":"407937772","indices":[103,115]},{"screen_name":"kren0660","name":"Karen Castillo","id":170869221,"id_str":"170869221","indices":[116,125]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225977061379,"id_str":"365611225977061379","text":"RT @factsonfemales: someone explain to me how parents can scream at you until you're crying and then act like nothing happened 20 minutes l\u2026","source":"\u003ca 
href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":541196864,"id_str":"541196864","name":"LilyClaire","screen_name":"LilyClaire_","location":"Under The Sea","url":"http:\/\/let-me-be-lily.tumblr.com","description":"\u2764God | FHS '16 | swim | volleyball | mermaid","protected":false,"followers_count":261,"friends_count":652,"listed_count":0,"created_at":"Fri Mar 30 23:11:55 +0000 2012","favourites_count":220,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"verified":false,"statuses_count":9140,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EB1753","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/889773443\/314eab3b69491822e4d5df664b43fab7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/889773443\/314eab3b69491822e4d5df664b43fab7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000214911106\/9477bb20fc120ab8a2ea4b11dfbc45e7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000214911106\/9477bb20fc120ab8a2ea4b11dfbc45e7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/541196864\/1369766140","profile_link_color":"1EA3EB","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"BFEEE2","profile_text_color":"99C5D3","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:37 +0000 
2013","id":365611033341067264,"id_str":"365611033341067264","text":"someone explain to me how parents can scream at you until you're crying and then act like nothing happened 20 minutes later","source":"\u003ca href=\"http:\/\/bufferapp.com\" rel=\"nofollow\"\u003eBuffer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":203021499,"id_str":"203021499","name":"Females Be Like","screen_name":"factsonfemales","location":"","url":null,"description":"Like OMG girl, call me now.","protected":false,"followers_count":1138620,"friends_count":77,"listed_count":3146,"created_at":"Fri Oct 15 10:43:02 +0000 2010","favourites_count":95,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":26121,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/660568963\/3r0g2gd11z7kw176z2bn.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/660568963\/3r0g2gd11z7kw176z2bn.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000040920779\/834e819bc1ac68cbcf5d2ee4dff99590_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000040920779\/834e819bc1ac68cbcf5d2ee4dff99590_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/203021499\/1369627905","profile_link_color":"185F96","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":31,"
entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"factsonfemales","name":"Females Be Like","id":203021499,"id_str":"203021499","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226006437889,"id_str":"365611226006437889","text":"Bon vasi je vais au lit #L","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":923455946,"id_str":"923455946","name":"Chaton. \u2661","screen_name":"LesChatooon","location":"","url":null,"description":"L'amour \u00e7a tue, la haine \u00e7a maintient en vie. \u2206","protected":false,"followers_count":164,"friends_count":406,"listed_count":0,"created_at":"Sat Nov 03 15:38:03 +0000 
2012","favourites_count":3,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":4478,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"B0196F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/874666108\/14a5cfa914368cb68a1b384320430ee9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/874666108\/14a5cfa914368cb68a1b384320430ee9.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247252460\/079ae489fd7175e4a8084456c4de1ce1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247252460\/079ae489fd7175e4a8084456c4de1ce1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/923455946\/1375908859","profile_link_color":"E31CCC","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"L","indices":[24,26]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226010628096,"id_str":"365611226010628096","text":"n\u00e3o lembrava de como o renan \u00e9 chato cara, mdss","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":358798238,"id_str":"358798238","name":"gaby","screen_name":"pqgabee","location":"","url":null,"description":"eu to fissurada!","protected":false,"followers_count":656,"friends_count":126,"listed_count":0,"created_at":"Sat 
Aug 20 14:33:35 +0000 2011","favourites_count":36,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":15368,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047451900\/129060af0d9c551854e7905a3edb99a0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047451900\/129060af0d9c551854e7905a3edb99a0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000245319534\/cb590dd458b378fa9ce88cbc972b129d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000245319534\/cb590dd458b378fa9ce88cbc972b129d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/358798238\/1376001483","profile_link_color":"B8128F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDFFCD","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985466371,"id_str":"365611225985466371","text":"RT @lndirecto: Ese miedo de volver a caer en lo mismo en aquello que te cost\u00f3 tanto salir.","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":299675084,"id_str":"299675084","name":"\u00ab 
\u00bb","screen_name":"irenelpzvera","location":"M\u00e1laga","url":null,"description":"\u00abGuardar\u00e9 en mis ojos tu \u00faltima mirada.\u00bb Una chica llena de cicatrices.","protected":false,"followers_count":550,"friends_count":479,"listed_count":2,"created_at":"Mon May 16 14:15:33 +0000 2011","favourites_count":313,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":false,"verified":false,"statuses_count":30567,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000030580803\/9ef33393f6df48d780b388d7275c47e5.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000030580803\/9ef33393f6df48d780b388d7275c47e5.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000204589679\/d684e38e55f035eab604b82af452a233_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000204589679\/d684e38e55f035eab604b82af452a233_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/299675084\/1375093522","profile_link_color":"2FC2EF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 18:53:57 +0000 2013","id":365546438018605057,"id_str":"365546438018605057","text":"Ese miedo de volver a caer en lo mismo en aquello que te cost\u00f3 tanto salir.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":419893737,"id_str":"419893737","name":"Sir 
Muse","screen_name":"lndirecto","location":"flavors.me\/lndirecto","url":"http:\/\/instagram.com\/indirecto\/","description":"En mi teclado hay muchos tweets, s\u00f3lo tengo que saber descifrarlos. \u2654 Keep Calm And Love Indirecto.\n Contacto: indirectomuse@gmail.com","protected":false,"followers_count":380514,"friends_count":178535,"listed_count":1277,"created_at":"Wed Nov 23 22:47:11 +0000 2011","favourites_count":42158,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":21535,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/836975659\/6d2dcb51140d20e8718ada759cfb34e2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/836975659\/6d2dcb51140d20e8718ada759cfb34e2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000193356560\/823a7247b8d5184f6d1652ee69209059_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000193356560\/823a7247b8d5184f6d1652ee69209059_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/419893737\/1367102632","profile_link_color":"148C86","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C06078","profile_text_color":"A8C090","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2202,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"lndirecto","name":"Sir 
Muse","id":419893737,"id_str":"419893737","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981267969,"id_str":"365611225981267969","text":"@bradffs my favourite dragon teacher! omfg being squashed against he cupboard at the front!!! \ud83d\ude02\ud83d\ude02","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610821386125312,"in_reply_to_status_id_str":"365610821386125312","in_reply_to_user_id":36476444,"in_reply_to_user_id_str":"36476444","in_reply_to_screen_name":"bradffs","user":{"id":38687353,"id_str":"38687353","name":"jess","screen_name":"omgjesss_","location":"swindon","url":"http:\/\/att4ckk.tumblr.com","description":null,"protected":false,"followers_count":449,"friends_count":97,"listed_count":1,"created_at":"Fri May 08 15:39:53 +0000 2009","favourites_count":1227,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":20226,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/596434663\/bqq3hvtubt1n51cm8onj.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/596434663\/bqq3hvtubt1n51cm8onj.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000181583769\/6d9ec9d7cb150c2e90ffffde4882431e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000181583769\/6d9ec9d7cb150c2e90ffffde4882431e_normal.jpeg","profile_link_color":"0C899C","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_r
equest_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"bradffs","name":"bambi","id":36476444,"id_str":"36476444","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226006433792,"id_str":"365611226006433792","text":"I'm bouta listen to my music & go to sleep.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":222292556,"id_str":"222292556","name":"\u271d a l e x i s.","screen_name":"iLoveMiiCheeks","location":"","url":null,"description":"LISTEN before you TALK . THINK before you REACT . WAIT before you JUDGE . 
&& TRY before you QUIT .","protected":false,"followers_count":349,"friends_count":379,"listed_count":0,"created_at":"Fri Dec 03 00:26:59 +0000 2010","favourites_count":45,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":7613,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF2346","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/729734413\/75054483cd5142cf696ab63a47b1239e.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/729734413\/75054483cd5142cf696ab63a47b1239e.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000111680326\/bddd16f10ac463e6ae4b44628eb9218b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000111680326\/bddd16f10ac463e6ae4b44628eb9218b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/222292556\/1373432580","profile_link_color":"D07002","profile_sidebar_border_color":"253336","profile_sidebar_fill_color":"2AA98E","profile_text_color":"9ABB76","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985458177,"id_str":"365611225985458177","text":"@oliviaramirez_ yeah yeah","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611162185900032,"in_reply_to_status_id_str":"365611162185900032","in_reply_to_user_id":970987698,"in_reply_to_user_id_str":"970987698","in_reply_to_screen_name":"oliviaramirez_","user":{"id":734314681,"id_str":"734314681","name":"lauuur.","screen_name":"laurenkara_","location":"","url":null,"description":"\u271devery one of you deserve to be happy.\u271d http:\/\/www.youtube.com\/watch?v=HjmZ3mKaYTM","protected":false,"followers_count":256,"friends_count":630,"listed_count":0,"created_at":"Fri Aug 03 06:29:48 +0000 2012","favourites_count":1405,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":4318,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000027402974\/f11dcd40c69ea975828253286499b5bd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000027402974\/f11dcd40c69ea975828253286499b5bd.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256945632\/0e55ad1abab6c5872df25831d37dcb4d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256945632\/0e55ad1abab6c5872df25831d37dcb4d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/734314681\/1375603544","profile_link_color":"0084B4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"oliviaramirez_","name":"olivia.","id":970987698,"i
d_str":"970987698","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981263874,"id_str":"365611225981263874","text":"RT @CarlaAgostinaO: Me enferma la gente ignorante que dice que el Omb\u00fa es el \u00e1rbol de la provincia de La Pampa. Es el Calden, mierdas.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":159584944,"id_str":"159584944","name":"Bufarini","screen_name":"BufaaGonzalez","location":"C\u00f3rdoba - Nueva C\u00f3rdoba","url":null,"description":"Estudiante de la Facultad de Lenguas - Universidad Nacional de C\u00f3rdoba - C\u00f3rdoba Capital. Al Handball me dedico ahora! http:\/\/www.facebook.com\/BufaaG","protected":false,"followers_count":457,"friends_count":545,"listed_count":0,"created_at":"Fri Jun 25 20:02:50 +0000 2010","favourites_count":91,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":2057,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"96DE11","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/698810633\/724a6e947b86dbfe9c27e18d1a76f66a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/698810633\/724a6e947b86dbfe9c27e18d1a76f66a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2196153762\/180478_1830951101873_1483240660_32002152_547780_n_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2196153762\/180478_1830951101873_1483240660_32002152_547780_n_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/159584944\/1351706070","profile_link_color":"4DC21F","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6"
,"profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Aug 05 02:33:48 +0000 2013","id":364212613980622848,"id_str":"364212613980622848","text":"Me enferma la gente ignorante que dice que el Omb\u00fa es el \u00e1rbol de la provincia de La Pampa. Es el Calden, mierdas.","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":617635529,"id_str":"617635529","name":"Cali","screen_name":"CarlaAgostinaO","location":"","url":null,"description":"Futura Azafata. Si, voy a vivir en las nubes (como acostumbro).","protected":false,"followers_count":350,"friends_count":253,"listed_count":0,"created_at":"Mon Jun 25 01:08:53 +0000 2012","favourites_count":61,"utc_offset":-10800,"time_zone":"Buenos 
Aires","geo_enabled":true,"verified":false,"statuses_count":5715,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000015388241\/c2b8d8e2e3d8dfdf6531640f393141d6.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000015388241\/c2b8d8e2e3d8dfdf6531640f393141d6.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248892944\/e6df35245038ed9504f2c17248b07bc3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248892944\/e6df35245038ed9504f2c17248b07bc3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/617635529\/1372919519","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":3,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CarlaAgostinaO","name":"Cali","id":617635529,"id_str":"617635529","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225989644289,"id_str":"365611225989644289","text":"RT @SrMikeWazowski: RT si t\u00fa tambi\u00e9n quieres esta funda para tu m\u00f3vil. 
http:\/\/t.co\/ZErR1VWJSM","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1633173241,"id_str":"1633173241","name":"soria the only pro","screen_name":"gonzalosoria00","location":"","url":null,"description":"Dando guerra desde el 2000, vivo dia a dia acompa\u00f1ado de mis amigos, si me caigo me levanto, si tu sonries yo sonrio, en lo bueno y en lo malo","protected":false,"followers_count":47,"friends_count":61,"listed_count":0,"created_at":"Tue Jul 30 15:14:10 +0000 2013","favourites_count":11,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":410,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252244744\/c7100142442fd38e9b7a6a597af7108a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252244744\/c7100142442fd38e9b7a6a597af7108a_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:09:08 +0000 2013","id":365610658592604160,"id_str":"365610658592604160","text":"RT si t\u00fa tambi\u00e9n quieres esta funda para tu m\u00f3vil. 
http:\/\/t.co\/ZErR1VWJSM","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":592202860,"id_str":"592202860","name":"MIKE WAZOWSKI \u1d34\u1d30","screen_name":"SrMikeWazowski","location":"tuparodiaentwitter@gmail.com","url":"http:\/\/es.favstar.fm\/users\/SrMikeWazowski","description":"Soy tan rom\u00e1ntico que a veces pienso que deber\u00eda casarme conmigo mismo. #FollowBack\u2003\u2003\u2003\u2003\u2003\u2003 \u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003 \u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2003\u2193\u2193 PUBLICIDAD \u2193\u2193\u2003\u2003\u2003\u2193\u2193 Mejores Tweets \u2193\u2193","protected":false,"followers_count":103910,"friends_count":88441,"listed_count":112,"created_at":"Sun May 27 22:41:24 +0000 
2012","favourites_count":11790,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":false,"verified":false,"statuses_count":4343,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000006264343\/3b97570e5b22ed29f99ea3620b3cd49a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000006264343\/3b97570e5b22ed29f99ea3620b3cd49a.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261470279\/8f97dff5af8a356d94e84336bc8c2216_normal.gif","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261470279\/8f97dff5af8a356d94e84336bc8c2216_normal.gif","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/592202860\/1369673072","profile_link_color":"FF0F0F","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"0E73AD","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":55,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365610658462572544,"id_str":"365610658462572544","indices":[51,73],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLo9O_CIAAE49K.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLo9O_CIAAE49K.jpg","url":"http:\/\/t.co\/ZErR1VWJSM","display_url":"pic.twitter.com\/ZErR1VWJSM","expanded_url":"http:\/\/twitter.com\/SrMikeWazowski\/status\/365610658592604160\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":500,"h":500,"resize":"fit"},"medium":{"w":500,"h":500,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"es"},"retweet_count"
:0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SrMikeWazowski","name":"MIKE WAZOWSKI \u1d34\u1d30","id":592202860,"id_str":"592202860","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985449985,"id_str":"365611225985449985","text":"@B2BLMNHYK ga susah ga susah.-.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365609067869257728,"in_reply_to_status_id_str":"365609067869257728","in_reply_to_user_id":1268332200,"in_reply_to_user_id_str":"1268332200","in_reply_to_screen_name":"B2BLMNHYK","user":{"id":1147375069,"id_str":"1147375069","name":"miwn","screen_name":"93minah_","location":"9RS","url":null,"description":"D","protected":false,"followers_count":5343,"friends_count":5185,"listed_count":5,"created_at":"Mon Feb 04 08:06:43 +0000 2013","favourites_count":293,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":false,"verified":false,"statuses_count":43627,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000013520048\/906deab8d8a82b38a3310379bae6eae5.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000013520048\/906deab8d8a82b38a3310379bae6eae5.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260518468\/2c7642f85465b6e4dfc4515e5fc7ea32_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260518468\/2c7642f85465b6e4dfc4515e5fc7ea32_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1147375069\/1375978192","profile_link_color":"35B1C4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","pr
ofile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"B2BLMNHYK","name":"Hyeok","id":1268332200,"id_str":"1268332200","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225993850880,"id_str":"365611225993850880","text":"@shalabalabadude @ArryanJR @fadhildjaja RT","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365423720505937921,"in_reply_to_status_id_str":"365423720505937921","in_reply_to_user_id":281982895,"in_reply_to_user_id_str":"281982895","in_reply_to_screen_name":"shalabalabadude","user":{"id":44589140,"id_str":"44589140","name":"kumail afzal","screen_name":"kumail_afzal","location":"Indonesia","url":null,"description":"The Red devils 4 ever (MU), Lazuardi GIS (x-4) 2015, Just Enjoy Life :D,chodiveron","protected":false,"followers_count":221,"friends_count":209,"listed_count":1,"created_at":"Thu Jun 04 10:36:11 +0000 2009","favourites_count":6,"utc_offset":-25200,"time_zone":"Pacific Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":599,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/738813524\/f840562fab48f5ee645a03f3b5d3c505.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/738813524\/f840562fab48f5ee645a03f3b5d3c505.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000219709956\/8d7938a26eaeb660ccdf227cefe9267a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000219709956\/8d7938a26eaeb660ccdf227cefe9267a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/44589140\/1374766872","profile_link_color":"CC3366","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"shalabalabadude","name":"\u0641\u0647\u0645\u064a \u0627\u0644\u062c\u0641\u0631\u064a","id":281982895,"id_str":"281982895","indices":[0,16]},{"screen_name":"ArryanJR","name":"ArryanJR (J)","id":99461450,"id_str":"99461450","indices":[17,26]},{"screen_name":"fadhildjaja","name":"Fadhil D","id":36328361,"id_str":"36328361","indices":[27,39]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226010615808,"id_str":"365611226010615808","text":"No se murio \u00d6","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":448479951,"id_str":"448479951","name":"Me llamo paola\u221e","screen_name":"Paoglez99B","location":"","url":null,"description":"Sigueme;) Adam Levine me sigue.... Tu porque no? \u2764No dedico tweets solo escribo para el que los quiera leer. CHICA COMUN Y NORMAL QUE AMA LA VIDA","protected":false,"followers_count":183,"friends_count":389,"listed_count":0,"created_at":"Wed Dec 28 01:53:07 +0000 2011","favourites_count":885,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":6229,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"F3D987","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/878642905\/0a3b1d2eafa8f28b37787a47b34027d9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/878642905\/0a3b1d2eafa8f28b37787a47b34027d9.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3781614401\/2aa95638518e61f75f39fb53db17bdcf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3781614401\/2aa95638518e61f75f39fb53db17bdcf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/448479951\/1372397289","profile_link_color":"D5B44B","profile_sidebar_border_color":"53AB8D","profile_sidebar_fill_color":"291C0D","profile_text_color":"53AB8D","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} 
+{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225989660673,"id_str":"365611225989660673","text":"Prepare to east java again","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":338338914,"id_str":"338338914","name":"Raditya_R","screen_name":"radityabamer","location":"Semarang-malang","url":null,"description":"don't judge me based my timeline . It's just twitter, so take it easy","protected":false,"followers_count":172,"friends_count":266,"listed_count":0,"created_at":"Tue Jul 19 12:45:34 +0000 2011","favourites_count":2,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":1791,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/322503895\/Untitled.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/322503895\/Untitled.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000236653651\/d6d46a90b310ca9206d0c2d4d9acf7dc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000236653651\/d6d46a90b310ca9206d0c2d4d9acf7dc_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"fil
ter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225989660672,"id_str":"365611225989660672","text":"@WhoAreYouN0W @K0eki3M0nsterr Dat is zo'n slim idee omg","source":"web","truncated":false,"in_reply_to_status_id":365611141851922432,"in_reply_to_status_id_str":"365611141851922432","in_reply_to_user_id":91561807,"in_reply_to_user_id_str":"91561807","in_reply_to_screen_name":"WhoAreYouN0W","user":{"id":404877852,"id_str":"404877852","name":"'Wendy c: \u2665","screen_name":"WendyDePendy","location":"","url":"http:\/\/WendyDePendy.tumblr.com\/","description":"\u2665 @K0eki3M0nsterr \u2665 @NikkieDePanda \u2665 @PepVL \u2665","protected":false,"followers_count":169,"friends_count":167,"listed_count":0,"created_at":"Fri Nov 04 14:52:26 +0000 2011","favourites_count":300,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":18338,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"00FF44","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000033312534\/0a4810f8638f3c4ab19c7c7a501460f7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000033312534\/0a4810f8638f3c4ab19c7c7a501460f7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000228021006\/c15b78ac1c00ecb0f9f2cfeb4b65831b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000228021006\/c15b78ac1c00ecb0f9f2cfeb4b65831b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/404877852\/1374791360","profile_link_color":"FF8FE1","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordin
ates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"WhoAreYouN0W","name":"Jake","id":91561807,"id_str":"91561807","indices":[0,13]},{"screen_name":"K0eki3M0nsterr","name":"Yara' ","id":444710597,"id_str":"444710597","indices":[14,29]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225977061376,"id_str":"365611225977061376","text":"@nikkiwiz101 Okay. @ninjatheurgist @Earthgiver @Soph_Titanheart what were your subjects?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610755988533249,"in_reply_to_status_id_str":"365610755988533249","in_reply_to_user_id":700901550,"in_reply_to_user_id_str":"700901550","in_reply_to_screen_name":"nikkiwiz101","user":{"id":1035014628,"id_str":"1035014628","name":"Eri ^-^","screen_name":"EriFrostheart","location":"Trapped in Connor's Basement","url":"http:\/\/thehelpfulthaumaturge.webs.com","description":"Lvl 64 Ice, Hugger, #Twizard, Quite Random :D, SLAPPER :D. @Soph_Titanheart's sis. 
\u2665Followers are my dawlings.","protected":false,"followers_count":209,"friends_count":69,"listed_count":0,"created_at":"Tue Dec 25 15:33:40 +0000 2012","favourites_count":570,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":9883,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/870984873\/3c3dab83bde1d89e0b6065db523ef089.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/870984873\/3c3dab83bde1d89e0b6065db523ef089.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000215224542\/f97110aa91deeb37d9acaa3f3c06e853_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000215224542\/f97110aa91deeb37d9acaa3f3c06e853_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1035014628\/1375543881","profile_link_color":"088253","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"nikkiwiz101","name":"\u13c1\u13a5\u13e6\u13e6\u13a5_\u0500\u0467_\u15f7O\u15f0\u15f7","id":700901550,"id_str":"700901550","indices":[0,12]},{"screen_name":"ninjatheurgist","name":"Sophia Spirithunter","id":774906745,"id_str":"774906745","indices":[19,34]},{"screen_name":"Earthgiver","name":"Nicole Earthgiver","id":1571901342,"id_str":"1571901342","indices":[35,46]},{"screen_name":"Soph_Titanheart","name":"Sophia 
Titanheart","id":465703584,"id_str":"465703584","indices":[47,63]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225993842689,"id_str":"365611225993842689","text":"\u00bfC\u00f3mo pod\u00e9is tener tantos seguidores con tan pocos tweets? No me explico.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":472966266,"id_str":"472966266","name":"Rous ","screen_name":"OnlyLiveOnce_","location":"En todas partes","url":"http:\/\/favstar.fm\/users\/OnlyLiveOnce_","description":"Aqu\u00ed estoy para hacerte re\u00edr una vez m\u00e1s. Nadie es perfecto, solo t\u00fa cuando sonr\u00edes. T\u00fa f\u00edate de mi, no de perfumes atractivos. La vida no son problemas.","protected":false,"followers_count":1019,"friends_count":431,"listed_count":13,"created_at":"Tue Jan 24 14:04:24 +0000 
2012","favourites_count":28903,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":false,"verified":false,"statuses_count":40859,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/803506998\/f35c0d0eb79da568357ddbf78fe6d8be.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/803506998\/f35c0d0eb79da568357ddbf78fe6d8be.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000188596062\/02b70342ebeea0c4a5a6b6b1a1f0afdd_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000188596062\/02b70342ebeea0c4a5a6b6b1a1f0afdd_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/472966266\/1352697535","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225977061378,"id_str":"365611225977061378","text":"@FerchMtzRod la verdad si esta todo as\u00ed bien padre lo d la moda. 
Ya despu\u00e9s me dir\u00e1s como te fue ...","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610410977665024,"in_reply_to_status_id_str":"365610410977665024","in_reply_to_user_id":121667706,"in_reply_to_user_id_str":"121667706","in_reply_to_screen_name":"FerchMtzRod","user":{"id":176291158,"id_str":"176291158","name":"moka","screen_name":"oscargarciae","location":"Toluca edo de mex","url":null,"description":"Loco por coviccion y ser feliz una adicion xD...","protected":false,"followers_count":105,"friends_count":236,"listed_count":0,"created_at":"Mon Aug 09 03:42:41 +0000 2010","favourites_count":67,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":1202,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3372898995\/1a430b0a4f05b79af2ab80c8b7ad07e5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3372898995\/1a430b0a4f05b79af2ab80c8b7ad07e5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/176291158\/1363135454","profile_link_color":"0084B4","profile_sidebar_border_color":"A8C7F7","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"FerchMtzRod","name":"Jos\u00e9 Fernando 
\uf8ff","id":121667706,"id_str":"121667706","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985458176,"id_str":"365611225985458176","text":"RT @Cristinalovebob: +Sigues so\u00f1ando conmigo? \n-Pues no se, ya no me acuerdo de lo que sue\u00f1o.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":349204659,"id_str":"349204659","name":"\u2020SERIAL KILLER\u2020","screen_name":"MarinaDmgz","location":"Perdida en tu perfecta piel.","url":null,"description":"Pride. WILD. \u2654 \u043c\u03c3\u03b7\u03b9\u00a2\u03b1 \u200e\u03b7\u03b1\u044f\u03b1\u200e\u03b7\u05e0\u03c3 \u2654 #HIGUAIN [13] Everytime I close my eyes, It's like a Dark Paradise. 
Gamer.","protected":false,"followers_count":494,"friends_count":380,"listed_count":0,"created_at":"Fri Aug 05 18:44:03 +0000 2011","favourites_count":3322,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":20408,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"318F22","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/801167790\/b3312296e359ad425e144e79c887b5e3.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/801167790\/b3312296e359ad425e144e79c887b5e3.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000219956266\/6c802a5a653bf90ce54b8780077de248_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000219956266\/6c802a5a653bf90ce54b8780077de248_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/349204659\/1364465506","profile_link_color":"B3240E","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:06 +0000 2013","id":365611154149621761,"id_str":"365611154149621761","text":"+Sigues so\u00f1ando conmigo? 
\n-Pues no se, ya no me acuerdo de lo que sue\u00f1o.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":156588094,"id_str":"156588094","name":"\u0421\u044fis\u0442iiiiiiiii :)","screen_name":"Cristinalovebob","location":"El pa\u00eds de las maravillas \u2665","url":null,"description":"Conocedora de nada y aprendiz de todo. Coleccionista de abrazos y sonrisas. Experta en dormir muchas horas seguidas. Estudiante de enfermer\u00eda en la UNEX","protected":false,"followers_count":432,"friends_count":421,"listed_count":0,"created_at":"Thu Jun 17 09:54:53 +0000 2010","favourites_count":1491,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":9006,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"D3D9DB","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000010180372\/d2fd593113f6e7489242cfbda26ae405.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000010180372\/d2fd593113f6e7489242cfbda26ae405.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000077471077\/d8790c5442d9d76e7d04775520919114_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000077471077\/d8790c5442d9d76e7d04775520919114_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/156588094\/1375456555","profile_link_color":"A177AB","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"2D1E29","profile_text_color":"DB6995","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notification
s":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Cristinalovebob","name":"\u0421\u044fis\u0442iiiiiiiii :)","id":156588094,"id_str":"156588094","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002227200,"id_str":"365611226002227200","text":"\"@_daijak: LMFAO RT @CommonLightskin: Lightskin girls be like... http:\/\/t.co\/ewwLlix23t\" lowkey tho","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":194710654,"id_str":"194710654","name":"Faux Pos","screen_name":"__RETRO11","location":"Practicing in the gym ","url":"http:\/\/jordansandjesus.tumblr.com\/","description":"#ShoeCollector They say practice makes perfect but the bible says perfect doesn't exist.","protected":false,"followers_count":386,"friends_count":98,"listed_count":3,"created_at":"Fri Sep 24 20:25:51 +0000 
2010","favourites_count":35,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":19680,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1F0C01","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/415300196\/air-jordan-socks-1.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/415300196\/air-jordan-socks-1.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000231931548\/b415bc6e5ab8322105546181ccfd55a2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000231931548\/b415bc6e5ab8322105546181ccfd55a2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/194710654\/1375478833","profile_link_color":"75231C","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"F06000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_daijak","name":"Lil Sean Don","id":108100247,"id_str":"108100247","indices":[1,9]},{"screen_name":"CommonLightskin","name":"Lightskin 
Ed.","id":1214346157,"id_str":"1214346157","indices":[20,36]}],"media":[{"id":365607902372847616,"id_str":"365607902372847616","indices":[65,87],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLmczwCYAAlvu8.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLmczwCYAAlvu8.jpg","url":"http:\/\/t.co\/ewwLlix23t","display_url":"pic.twitter.com\/ewwLlix23t","expanded_url":"http:\/\/twitter.com\/CommonLightskin\/status\/365607902368653312\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"large":{"w":608,"h":608,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"}},"source_status_id":365607902368653312,"source_status_id_str":"365607902368653312"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002227201,"id_str":"365611226002227201","text":"@noviangginii mbb yak, iyaa dimaafin noppp:* morning keboo:3\u007b\u007d","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365231524586602496,"in_reply_to_status_id_str":"365231524586602496","in_reply_to_user_id":720007027,"in_reply_to_user_id_str":"720007027","in_reply_to_screen_name":"noviangginii","user":{"id":1026267396,"id_str":"1026267396","name":"Syaw~","screen_name":"fitsyawalia","location":"PixieDust\u2600","url":null,"description":"Revizweladies'12 \u2022 85'15\u2660| Moody\u25b2\u25bc| Astroboy&Astrogirl addicted\u2605","protected":false,"followers_count":526,"friends_count":507,"listed_count":0,"created_at":"Fri Dec 21 12:16:57 +0000 
2012","favourites_count":53,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":10610,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000010073691\/bcec6c9e5552f8042f666eea0b475c25.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000010073691\/bcec6c9e5552f8042f666eea0b475c25.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244947235\/2408b6d0cb3731d7affc3d8336351f95_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244947235\/2408b6d0cb3731d7affc3d8336351f95_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1026267396\/1373260956","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"noviangginii","name":"NoviaAnggini.","id":720007027,"id_str":"720007027","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981267974,"id_str":"365611225981267974","text":"RT @ellbox: can we create something beautiful and destroy it","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":326956530,"id_str":"326956530","name":"fat jamie ","screen_name":"idkjami3","location":"uk","url":null,"description":"i probably 
fancy you","protected":false,"followers_count":1901,"friends_count":1822,"listed_count":0,"created_at":"Thu Jun 30 19:22:42 +0000 2011","favourites_count":708,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":15161,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/850452673\/e8b4477bfb7cc7f23d967eb2200b3354.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/850452673\/e8b4477bfb7cc7f23d967eb2200b3354.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000222175279\/a293acc3e25d819c55e30a3092867507_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000222175279\/a293acc3e25d819c55e30a3092867507_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/326956530\/1372798163","profile_link_color":"E817B4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"92818B","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:09:34 +0000 2013","id":365610766444933121,"id_str":"365610766444933121","text":"can we create something beautiful and destroy it","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":358091519,"id_str":"358091519","name":"","screen_name":"ellbox","location":"","url":null,"description":"pierce the veil are my life \u2665 @kieranjss_ 
20.11.2011 my baby.","protected":false,"followers_count":5327,"friends_count":3025,"listed_count":2,"created_at":"Fri Aug 19 11:00:17 +0000 2011","favourites_count":1849,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":13382,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/832576012\/8fa6df444db8d4eb1f28e159d4e87cee.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/832576012\/8fa6df444db8d4eb1f28e159d4e87cee.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000249782775\/0c0a06b5b492b94f8ab5e5ceef463b0b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000249782775\/0c0a06b5b492b94f8ab5e5ceef463b0b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/358091519\/1375815934","profile_link_color":"080808","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ellbox","name":"","id":358091519,"id_str":"358091519","indices":[3,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226010632192,"id_str":"365611226010632192","text":"RT @Sara_saruxa: THIS IS TRUE! 
Only Mahomies (: http:\/\/t.co\/MotWFstGk3","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":930734832,"id_str":"930734832","name":"Martha \u2661 ","screen_name":"Mahomie_1D_Luv","location":"Foolish 4's HomeState","url":null,"description":"\u2764 Proud Mahomie \u2764 | ACon \u2764 | Villa \u2764 | Belieber \u2764 | Millertary \u2764 | Beaster \u2764 | | Branson| \u2764 ( ( Zach & Robert Follow ) ) \u2764 |","protected":false,"followers_count":2871,"friends_count":2171,"listed_count":2,"created_at":"Tue Nov 06 23:20:29 +0000 2012","favourites_count":703,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":17964,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131114","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041067941\/bd965de92a56fb1d288ff1913ad0ebe8.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041067941\/bd965de92a56fb1d288ff1913ad0ebe8.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261346809\/0417e557a84780066823ea390be9950e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261346809\/0417e557a84780066823ea390be9950e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/930734832\/1375988828","profile_link_color":"E00DB6","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"FF0000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 
00:38:07 +0000 2013","id":365270664522899456,"id_str":"365270664522899456","text":"THIS IS TRUE! Only Mahomies (: http:\/\/t.co\/MotWFstGk3","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1413038768,"id_str":"1413038768","name":"Sara Mahone","screen_name":"Sara_saruxa","location":"Mahomieland","url":null,"description":"Mahomie \u2764\r\n*Haters gonna hate, Mahomies gonna love*\r\nAUNT LISA FOLLOWS ME! :)","protected":false,"followers_count":963,"friends_count":667,"listed_count":4,"created_at":"Wed May 08 14:46:14 +0000 2013","favourites_count":1731,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":4490,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000004456998\/4f27a230c2dc76d1562cb981e0e03ccd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000004456998\/4f27a230c2dc76d1562cb981e0e03ccd.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000235716797\/0b862087a20d67927cf3f0adb6173858_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000235716797\/0b862087a20d67927cf3f0adb6173858_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1413038768\/1375307113","profile_link_color":"0099B9","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":8,"entities":{"hashtags":[],"urls":[],"user_mentions
":[],"media":[{"id":365270664527093761,"id_str":"365270664527093761","indices":[31,53],"media_url":"http:\/\/pbs.twimg.com\/media\/BRGzu-4CIAEXGAp.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRGzu-4CIAEXGAp.jpg","url":"http:\/\/t.co\/MotWFstGk3","display_url":"pic.twitter.com\/MotWFstGk3","expanded_url":"http:\/\/twitter.com\/Sara_saruxa\/status\/365270664522899456\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"large":{"w":600,"h":600,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Sara_saruxa","name":"Sara Mahone","id":1413038768,"id_str":"1413038768","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981263872,"id_str":"365611225981263872","text":"@nyagarek \u304a\u306f\u30fc","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610940873453570,"in_reply_to_status_id_str":"365610940873453570","in_reply_to_user_id":97700056,"in_reply_to_user_id_str":"97700056","in_reply_to_screen_name":"nyagarek","user":{"id":141180859,"id_str":"141180859","name":"\u9ad8\u539f@\u4e0a\u534a\u30b7\u30f3\u58eb","screen_name":"KoU_GR","location":"\u51a5\u571f\u306e\u7f8a\u3001\u63a7\u3048\u5ba4","url":"http:\/\/s.ameblo.jp\/ikunoon","description":"\u3060\u3044\u3055\u304f\u3068\u304b\u3063\u304d\u30fc\u3068\u7027\u7530\u3068IKUO\u3055\u3093\u3068\u304a\u308a\u3087\u30fc\u3068\u30b7\u30f3\u3061\u3083\u3093\u3068\u85cd\u3061\u3083\u3093\u3092\u3053\u3088\u306a\u304f\u611b\u3059\u308b\u3001\u8150\u3063\u305f\u30df\u30eb\u30af\u30c6\u30a3\u30fc\u3002\r\n\r\n\uff79\uff9e\uff70\uff91\u5168\u822c\/GRANRODEO\/T.M.Revolution\u30fba.b.s\/\u5996\u7cbe\u5e1d\u570b\/\uff83\uff9e\uff9d\uff76\uff9a\/\uff7b\uff9d\uff8e\uff97\/FLOW\/\uff95\uff86\uff7f\uff9e\uff9d\/V\u7cfb\n\n\u73fe\u5728\u85cd\u4e95\u30a8\u30a4\u30eb\u63a8\u3057\u4e2d(\u0e05'\u03c9'\u0e05)\n\n\n\u6027\u5225\uff1a\u3053\u3046\u3067\u3059\u3002","protected":false,"followers_count":682,"friends_count":861,"listed_count":56,"created_at":"Fri May 07 11:19:58 +0000 
2010","favourites_count":3114,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":156035,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000184004442\/71ea5ff433a5946764becc8aec3d7298_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000184004442\/71ea5ff433a5946764becc8aec3d7298_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"nyagarek","name":"\u5982\u6708\u3000\u6d41\uff20\u6c42\u8077\u4e2d","id":97700056,"id_str":"97700056","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226006429696,"id_str":"365611226006429696","text":"RT @bubblebuttbabes: Yes please... http:\/\/t.co\/IyO5bwM2ZH","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":383386513,"id_str":"383386513","name":"Pinkie pie. \u2665","screen_name":"RAINBOWFUCKER_","location":"Lollipopland. 
\u2665","url":"http:\/\/mygunyourheadyeah.tumblr.com\/","description":"\u2020 insta: rainb0wunicorn_ | snapchat: rainbowkoekje| 100513 @FeelTheShit \u2665| Schatje ik hou super veel van je en ik wil jou nooit meer kwijt , Xx je hipster :$ |","protected":false,"followers_count":307,"friends_count":182,"listed_count":0,"created_at":"Sat Oct 01 19:53:56 +0000 2011","favourites_count":62,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":41217,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/769022150\/598942829dc7dcad3a9fda53e8a3af9d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/769022150\/598942829dc7dcad3a9fda53e8a3af9d.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000245335679\/927aebf441f943750a3c9169469bd5c4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000245335679\/927aebf441f943750a3c9169469bd5c4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/383386513\/1374790151","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Fri Aug 02 22:32:19 +0000 2013","id":363427064931557376,"id_str":"363427064931557376","text":"Yes please... 
http:\/\/t.co\/IyO5bwM2ZH","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1452889219,"id_str":"1452889219","name":"Bubble Butt Babes","screen_name":"bubblebuttbabes","location":"","url":null,"description":"We all love bubble butts and here they are! Also follow our other accounts @athletic_babes. Email: athleticbabes@gmail.com","protected":false,"followers_count":60234,"friends_count":67,"listed_count":70,"created_at":"Fri May 24 00:41:35 +0000 2013","favourites_count":3,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":128,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000130347145\/5629c8b4192d03830ee1b2b9e9f7982f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000130347145\/5629c8b4192d03830ee1b2b9e9f7982f_normal.jpeg","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":350,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":363427064939945984,"id_str":"363427064939945984","indices":[14,36],"media_url":"http:\/\/pbs.twimg.com\/media\/BQsm_XOCIAAw3lY.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BQsm_XOCIAAw3lY.jpg","url":"http:\/\/t.co\/IyO5bwM2ZH",
"display_url":"pic.twitter.com\/IyO5bwM2ZH","expanded_url":"http:\/\/twitter.com\/bubblebuttbabes\/status\/363427064931557376\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":500,"h":749,"resize":"fit"},"small":{"w":340,"h":509,"resize":"fit"},"large":{"w":500,"h":749,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"bubblebuttbabes","name":"Bubble Butt Babes","id":1452889219,"id_str":"1452889219","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225998041088,"id_str":"365611225998041088","text":"@rivera1_dayana jajajajaja eso dices","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611107018211331,"in_reply_to_status_id_str":"365611107018211331","in_reply_to_user_id":1151964529,"in_reply_to_user_id_str":"1151964529","in_reply_to_screen_name":"rivera1_dayana","user":{"id":63489936,"id_str":"63489936","name":"\u273fWilne\u270c","screen_name":"wilnelis","location":"Puerto Rico","url":null,"description":"\u273f vive y deja vivir ... 
\u273f","protected":false,"followers_count":157,"friends_count":213,"listed_count":0,"created_at":"Thu Aug 06 17:35:22 +0000 2009","favourites_count":597,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2952,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"7A536E","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000039756835\/d6793de225ed0a4a08d73aae0a8078df.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000039756835\/d6793de225ed0a4a08d73aae0a8078df.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000119545421\/f2936f9c86244af646c60c19f4e79fda_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000119545421\/f2936f9c86244af646c60c19f4e79fda_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/63489936\/1375303929","profile_link_color":"5F3D56","profile_sidebar_border_color":"EFE4C4","profile_sidebar_fill_color":"D0C180","profile_text_color":"908448","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"rivera1_dayana","name":"\u062f\u0627\u062c\u0627\u0646\u0627 \u0631\u064a\u0628\u064a\u0631\u0627","id":1151964529,"id_str":"1151964529","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225998032896,"id_str":"365611225998032896","text":"RT @karolinshrikii: \u05d7\u05d5\u05e1\u05e8 \u05d9\u05d7\u05e1 \u05d2\u05d5\u05e8\u05e8 \u05d9\u05d7\u05e1!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" 
rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1168163294,"id_str":"1168163294","name":"Eliron Giny ","screen_name":"eginy7","location":"","url":null,"description":null,"protected":false,"followers_count":174,"friends_count":141,"listed_count":0,"created_at":"Mon Feb 11 07:33:14 +0000 2013","favourites_count":488,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":5422,"lang":"he","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000160835595\/393f00cd37f4f51d932aa808035d7a9e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000160835595\/393f00cd37f4f51d932aa808035d7a9e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1168163294\/1371261205","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:11 +0000 2013","id":365611174911426561,"id_str":"365611174911426561","text":"\u05d7\u05d5\u05e1\u05e8 \u05d9\u05d7\u05e1 \u05d2\u05d5\u05e8\u05e8 \u05d9\u05d7\u05e1!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1246264166,"id_str":"1246264166","name":"karolin_shriki","screen_name":"karolinshrikii","location":"","url":null,"description":null,"protected":false,"followers_count":192,"friends_count":175,"listed_count":0,"created_at":"Wed Mar 06 14:47:24 +0000 2013","favourites_count":1677,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":2837,"lang":"he","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000050651661\/537a5f03e8ce3aa5b405adc03de9c12d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000050651661\/537a5f03e8ce3aa5b405adc03de9c12d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1246264166\/1374594423","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[31.97233777,34.77146378]},"coordinates":{"type":"Point","coordinates":[34.77146378,31.97233777]},"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"he"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"karolinshrikii","name":"karolin_shriki","id":1246264166,"id_str":"1246264166","indices":[3,1
8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"he"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985449984,"id_str":"365611225985449984","text":"@irresistibIou sigh alrighty alves I've gots to go now. You have a good day, muffin! Love ya x","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610869880651777,"in_reply_to_status_id_str":"365610869880651777","in_reply_to_user_id":619999569,"in_reply_to_user_id_str":"619999569","in_reply_to_screen_name":"irresistibIou","user":{"id":619999569,"id_str":"619999569","name":"alvie \u2741","screen_name":"irresistibIou","location":"","url":null,"description":"he is my \u263c my \u263e and all my \u2606's","protected":false,"followers_count":1660,"friends_count":66,"listed_count":10,"created_at":"Wed Jun 27 11:43:55 +0000 2012","favourites_count":1744,"utc_offset":10800,"time_zone":"Helsinki","geo_enabled":false,"verified":false,"statuses_count":17745,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"F26A6A","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028827169\/fb179108f9f6aabd0e66934241166371.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028827169\/fb179108f9f6aabd0e66934241166371.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000250969251\/0bc7b49b67fd9eec28a1da0b21656538_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000250969251\/0bc7b49b67fd9eec28a1da0b21656538_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/619999569\/1375785700","profile_link_color":"F26A6A","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_ima
ge":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"irresistibIou","name":"alvie \u2741","id":619999569,"id_str":"619999569","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981255682,"id_str":"365611225981255682","text":"Po oi me keeo deee traankii kee ti reveenta jajajajaja","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1527759438,"id_str":"1527759438","name":"\u2660\u266c. SiiLviia :S \u266c\u2660","screen_name":"laamorenotah","location":"","url":null,"description":"Mi viidaa, mis reglas :) con Laa caabeza bieen alttaa To Loo maaLo se descaansa :) \u00dd Lo qe siGo Lo consiGo aunkee aya fueGo en el caamino:) !:$","protected":false,"followers_count":69,"friends_count":151,"listed_count":0,"created_at":"Tue Jun 18 13:27:30 +0000 
2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1047,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000246203010\/254e70f8837f73b17b6a403ce1bd08f3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000246203010\/254e70f8837f73b17b6a403ce1bd08f3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1527759438\/1375878249","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225998045184,"id_str":"365611225998045184","text":"I hate you all http:\/\/t.co\/6e3kKIsomj","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":753830256,"id_str":"753830256","name":"jessica \u2661","screen_name":"lou_ash_xx","location":"","url":null,"description":"Find what makes you happy. 
\u2741","protected":false,"followers_count":561,"friends_count":161,"listed_count":1,"created_at":"Sun Aug 12 21:39:13 +0000 2012","favourites_count":1969,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"verified":false,"statuses_count":14106,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242572570\/1973db6a913ab838641df4c688390289_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242572570\/1973db6a913ab838641df4c688390289_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/753830256\/1375988517","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/6e3kKIsomj","expanded_url":"http:\/\/media.tumblr.com\/2b4cead58e7fe8c991f15d692771699d\/tumblr_inline_mn0u2mT3eV1qz4rgp.gif","display_url":"media.tumblr.com\/2b4cead58e7fe8\u2026","indices":[15,37]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225993854977,"id_str":"365611225993854977","text":"@PaulaxVengeance 
http:\/\/t.co\/VgjnRWNWuy","source":"web","truncated":false,"in_reply_to_status_id":365601941973237760,"in_reply_to_status_id_str":"365601941973237760","in_reply_to_user_id":269510964,"in_reply_to_user_id_str":"269510964","in_reply_to_screen_name":"PaulaxVengeance","user":{"id":121990157,"id_str":"121990157","name":"Mr. Crowley","screen_name":"RockNFuckinRoll","location":"Paradise City","url":"http:\/\/ask.fm\/RockNFuckinRoll","description":"Hodor.","protected":false,"followers_count":215,"friends_count":306,"listed_count":1,"created_at":"Thu Mar 11 07:10:01 +0000 2010","favourites_count":350,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":true,"verified":false,"statuses_count":10186,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/350873027\/guitar_8_105454.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/350873027\/guitar_8_105454.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000009897585\/05585bdd7d5172db9f9c7df980490c19_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000009897585\/05585bdd7d5172db9f9c7df980490c19_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/121990157\/1373893677","profile_link_color":"83989C","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"282424","profile_text_color":"637274","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/VgjnRWNWuy","expanded_url":"http:\/\/gifatron.com\/wp-content\/uploads\/2013\/03\/tyrion-approves-game-of-thrones.gif","display_url":"gifatron.com\/wp-conte
nt\/upl\u2026","indices":[17,39]}],"user_mentions":[{"screen_name":"PaulaxVengeance","name":"Paula Vengeance","id":269510964,"id_str":"269510964","indices":[0,16]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225989656576,"id_str":"365611225989656576","text":"@Raul_Jungmann Secretaria Nacional dos Povos Ind\u00edgenas http:\/\/t.co\/SVOgrzvC4u \u2026 \u2026 \u2026, http:\/\/t.co\/Jq0cJcAfkI","source":"web","truncated":false,"in_reply_to_status_id":365610918282948608,"in_reply_to_status_id_str":"365610918282948608","in_reply_to_user_id":74014934,"in_reply_to_user_id_str":"74014934","in_reply_to_screen_name":"Raul_Jungmann","user":{"id":34951438,"id_str":"34951438","name":"helio","screen_name":"Heliosblog","location":"BRAS\u00cdLIA","url":"http:\/\/helioaraujosilva.wordpress.com\/","description":"Voc\u00ea j\u00e1 entrou no H\u00e9lio\u2019s Blog hoje?","protected":false,"followers_count":45,"friends_count":209,"listed_count":0,"created_at":"Fri Apr 24 15:19:56 +0000 
2009","favourites_count":9,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":5310,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"BADFCD","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme12\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme12\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/921770921\/H_lio_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/921770921\/H_lio_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/34951438\/1358612664","profile_link_color":"FF0000","profile_sidebar_border_color":"F2E195","profile_sidebar_fill_color":"FFF7CC","profile_text_color":"0C3E53","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/SVOgrzvC4u","expanded_url":"http:\/\/www.avaaz.org\/po\/petition\/Secretaria_Nacional_dos_Povos_Indigenas\/?fgsJddb&pv=0","display_url":"avaaz.org\/po\/petition\/Se\u2026","indices":[55,77]},{"url":"http:\/\/t.co\/Jq0cJcAfkI","expanded_url":"http:\/\/wp.me\/p1ecQj-1Eb","display_url":"wp.me\/p1ecQj-1Eb","indices":[85,107]}],"user_mentions":[{"screen_name":"Raul_Jungmann","name":"Raul Jungmann","id":74014934,"id_str":"74014934","indices":[0,14]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226006437888,"id_str":"365611226006437888","text":"Wah ? :o RT @reza_sansan: S @iqbalmsahid RT Agustina_NF: Bisa sih kalo tau mimin siapa mah.. 
:p RT @1SNAPTU: jangan lupain minpa ya :) Agust","source":"\u003ca href=\"http:\/\/www.writelonger.com\" rel=\"nofollow\"\u003eWrite Longer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":302384299,"id_str":"302384299","name":"Agustina Nur Fauziah","screen_name":"Agustina_NF","location":"Bandung Barat","url":"http:\/\/facebook.com\/agustina.nf","description":"29 August 1996 | Mom is my everything | Obsessed by guitarist of Overboard Band | Biology Education UPI '13 ^_^","protected":false,"followers_count":339,"friends_count":214,"listed_count":1,"created_at":"Sat May 21 03:07:26 +0000 2011","favourites_count":16,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":true,"verified":false,"statuses_count":3965,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"571D55","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/820847125\/04867641dde120bdde2429a1969ae49b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/820847125\/04867641dde120bdde2429a1969ae49b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000253580151\/a07d06dcd33651200c6b9a1a59c2f53b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000253580151\/a07d06dcd33651200c6b9a1a59c2f53b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/302384299\/1363931582","profile_link_color":"D625E6","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"D97CEB","profile_text_color":"62C7E0","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entiti
es":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"reza_sansan","name":"reza laksmana","id":520007545,"id_str":"520007545","indices":[12,24]},{"screen_name":"iqbalmsahid","name":"Iqbal Muhamad Sahid","id":301836732,"id_str":"301836732","indices":[28,40]},{"screen_name":"1SNAPTU","name":"SNAPTU (IPA 1 SABA)","id":1349107447,"id_str":"1349107447","indices":[99,107]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225989644290,"id_str":"365611225989644290","text":"@connyblah @VirginMobile_cl yooooooo, suelto de raja? cu\u00e1ndo? XDDDD","source":"web","truncated":false,"in_reply_to_status_id":365610891036721154,"in_reply_to_status_id_str":"365610891036721154","in_reply_to_user_id":25782524,"in_reply_to_user_id_str":"25782524","in_reply_to_screen_name":"connyblah","user":{"id":118531444,"id_str":"118531444","name":"yerkolivio","screen_name":"yerkolivio","location":"Providencia, Santiago. ","url":"http:\/\/yerkolivio.tumblr.com","description":"25 a\u00f1os; introvertido, pinturita, mentiroso y sincero; descubra en cual versi\u00f3n me presento ante usted. Le busco la 5ta pata al gato. 
No soy wn, me hago.","protected":false,"followers_count":1352,"friends_count":847,"listed_count":40,"created_at":"Mon Mar 01 00:10:21 +0000 2010","favourites_count":1163,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":144480,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000205559898\/1cfe507041c9ab49f225eae5cd07347f_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000205559898\/1cfe507041c9ab49f225eae5cd07347f_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/118531444\/1363135299","profile_link_color":"2FC2EF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"connyblah","name":"Conny B.","id":25782524,"id_str":"25782524","indices":[0,10]},{"screen_name":"VirginMobile_cl","name":"Virgin Mobile Chile","id":393470224,"id_str":"393470224","indices":[11,27]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225989644288,"id_str":"365611225989644288","text":"\ud83d\ude24","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":545770125,"id_str":"545770125","name":"erikaaah\u262f\u262e","screen_name":"erikamoe44","location":"","url":null,"description":"A.Vargas\u2765\u221e\u262a","protected":false,"followers_count":363,"friends_count":212,"listed_count":0,"created_at":"Thu Apr 05 04:34:10 +0000 2012","favourites_count":4891,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":12482,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1E2224","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000243240844\/2070edb514a0a90e92c4d95ade23926d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000243240844\/2070edb514a0a90e92c4d95ade23926d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/545770125\/1375673679","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981272064,"id_str":"365611225981272064","text":"RT @MotivacionesF: \"Dejar de jugar al f\u00fatbol por miedo hacerlo mal es como vivir con miedo a morir\" - 
Schweinsteiger","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":370750739,"id_str":"370750739","name":"miguelodigas","screen_name":"2Miguelpina","location":"Orihuela","url":"http:\/\/www.tuenti.com\/#m=Profile&func=index","description":"Seguro, sincero y romantico. Me gusta hacer reir. El mejor antidoto la sonrisa. Como dice mi padre solo debes llorar por las cosas importantes.","protected":false,"followers_count":357,"friends_count":193,"listed_count":0,"created_at":"Fri Sep 09 15:28:19 +0000 2011","favourites_count":120,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":10313,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/580631834\/xgm6tji1b1xc95k0u1m1.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/580631834\/xgm6tji1b1xc95k0u1m1.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000204099822\/e19bf2135b74f43655a49efded549869_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000204099822\/e19bf2135b74f43655a49efded549869_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/370750739\/1355091367","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_a
t":"Thu Aug 08 19:56:22 +0000 2013","id":365562148623093760,"id_str":"365562148623093760","text":"\"Dejar de jugar al f\u00fatbol por miedo hacerlo mal es como vivir con miedo a morir\" - Schweinsteiger","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":604988212,"id_str":"604988212","name":"Motivaciones F\u00fatbol ","screen_name":"MotivacionesF","location":"","url":null,"description":"Frases motivadoras del f\u00fatbol para que puedas seguir tus sue\u00f1os.","protected":false,"followers_count":226579,"friends_count":2,"listed_count":287,"created_at":"Mon Jun 11 01:03:20 +0000 2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":10359,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/854135684\/0ed10937b2c121eab0053060116193dd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/854135684\/0ed10937b2c121eab0053060116193dd.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3582846140\/60b8cdeaab3a4ea9551497e4e89b8ed9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3582846140\/60b8cdeaab3a4ea9551497e4e89b8ed9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/604988212\/1372880558","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":747,"entities":{"hashtags":[],"urls":[]
,"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MotivacionesF","name":"Motivaciones F\u00fatbol ","id":604988212,"id_str":"604988212","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985466368,"id_str":"365611225985466368","text":"RT @nunaissexy: gabbs? se tiveres a ver este tweet, \u00e9 obv que o meu sorriso elumina o mundo eu brilho demasiado bc sou fab fab fab","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1610552244,"id_str":"1610552244","name":"us till infinity.","screen_name":"iwannabe_ollg","location":"","url":null,"description":"gabriela | jdb | smg | rdp | \nhttp:\/\/its-2-much-pain.tumblr.com\/","protected":false,"followers_count":178,"friends_count":217,"listed_count":1,"created_at":"Sun Jul 21 13:53:14 +0000 
2013","favourites_count":343,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":2069,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"F9FCFC","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047257111\/a96af482d1e1c21590eb93335310209c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047257111\/a96af482d1e1c21590eb93335310209c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261633138\/4903ff70a2c05213edfe7c05f045a6ed_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261633138\/4903ff70a2c05213edfe7c05f045a6ed_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1610552244\/1375980478","profile_link_color":"5C1778","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:40 +0000 2013","id":365611045521334273,"id_str":"365611045521334273","text":"gabbs? 
se tiveres a ver este tweet, \u00e9 obv que o meu sorriso elumina o mundo eu brilho demasiado bc sou fab fab fab","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1627976346,"id_str":"1627976346","name":"sou do meu bf.","screen_name":"nunaissexy","location":"jdb ddl dob","url":"http:\/\/foreveer-togetheer.tumblr.com\/","description":"bf's boyfriend mav sbf","protected":false,"followers_count":98,"friends_count":149,"listed_count":0,"created_at":"Sun Jul 28 14:26:34 +0000 2013","favourites_count":363,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2498,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"F9FBFC","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000039162659\/201c60d258d8542a74f1feb580093938.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000039162659\/201c60d258d8542a74f1feb580093938.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256080723\/25375f008027dc6736a89c6ee5f4620f_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256080723\/25375f008027dc6736a89c6ee5f4620f_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1627976346\/1375888400","profile_link_color":"B6BBBD","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet
_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"nunaissexy","name":"sou do meu bf.","id":1627976346,"id_str":"1627976346","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226010619904,"id_str":"365611226010619904","text":"@commonrauhl fa male..","source":"web","truncated":false,"in_reply_to_status_id":365609508057260032,"in_reply_to_status_id_str":"365609508057260032","in_reply_to_user_id":356163087,"in_reply_to_user_id_str":"356163087","in_reply_to_screen_name":"commonrauhl","user":{"id":1058758190,"id_str":"1058758190","name":"sexyrus. ","screen_name":"milescigarettes","location":"","url":"http:\/\/jar-of-spring.tumblr.com\/","description":"\u2020 justin bieber and miley cyrus.\u2020","protected":false,"followers_count":2410,"friends_count":2017,"listed_count":1,"created_at":"Thu Jan 03 20:49:47 +0000 2013","favourites_count":1045,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":22754,"lang":"it","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/856655312\/0f0d82d24834eb717a106328a9c58bb4.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/856655312\/0f0d82d24834eb717a106328a9c58bb4.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252123866\/e8966d91d6d548aac32218bb3e3d1430_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252123866\/e8966d91d6d548aac32218bb3e3d1430_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1058758190\/1375828015","profile_link_color":"0A0A0A","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":fals
e,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"commonrauhl","name":"\u7231","id":356163087,"id_str":"356163087","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"it"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225981267973,"id_str":"365611225981267973","text":"\u671d\u304b\u3089\u795e\u306b\u3043\u3055\u307e\u4e2d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":864378128,"id_str":"864378128","name":"\u3061\u3087\u3053\u3048\u308a","screen_name":"eri_kaeru","location":"","url":null,"description":"Free!\u4e2d\u6bd2\u3002\u8150\u3067\u3059\u304c\u57fa\u672c\u96d1\u98df\u3002\u58f0\u512a\u3055\u3093\u2026\u7279\u306b\u798f\u5c71\u6f64\u3055\u3093\u5927\u597d\u304d\u3067\u3059\u3002\uff25\uff36\uff2f\uff2c.\uff2b.\u30a4\u30af\u30b5\u30ac.\u9ed2\u30d0\u30b9.\u56e0\u5e61.\u9032\u6483.\u6226\u52c7\u3002.\u30ab\u30fc\u30cb\u30f4\u30a1\u30eb.\u30ac\u30eb\u30ac\u30f3.\u9b54\u738b\u3055\u307e\u2026\u3002\u30e1\u30ac\u30cd\u306f\u6b63\u7fa9\u3002\u30ed\u30dc\u30c3\u30c8\u5927\u597d\u304d\u3002\u4e0d\u61ab\u840c\u3048\u3002","protected":false,"followers_count":52,"friends_count":205,"listed_count":0,"created_at":"Sat Oct 06 08:56:13 +0000 
2012","favourites_count":499,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":7060,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000207989562\/1acc1137482afe81a7ae08d4e08e9c8c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000207989562\/1acc1137482afe81a7ae08d4e08e9c8c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/864378128\/1375198789","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225972879360,"id_str":"365611225972879360","text":"\"Aim for my heart,if u feel like....\" who wants to dance wth me? (-.-)","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web (M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":702999596,"id_str":"702999596","name":"Mrs.Aizen","screen_name":"Kaytoure","location":"kampala uganda","url":null,"description":"Living,loving and laughing,save yo emotions for yo teddybear. God and family. 
#TeamUG #TeamAfrica","protected":false,"followers_count":417,"friends_count":368,"listed_count":2,"created_at":"Wed Jul 18 12:48:09 +0000 2012","favourites_count":53,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":9575,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000030061026\/9d101430a9b0db193a1f99cad7ff6de5.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000030061026\/9d101430a9b0db193a1f99cad7ff6de5.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000156374252\/bfb439debd6d5d68f4c5d92d9ddc791b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000156374252\/bfb439debd6d5d68f4c5d92d9ddc791b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/702999596\/1375265624","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002239489,"id_str":"365611226002239489","text":"@TimJMiller I grew a beak....","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611063204515840,"in_reply_to_status_id_str":"365611063204515840","in_reply_to_user_id":569580951,"in_reply_to_user_id_str":"569580951","in_reply_to_screen_name":"TimJMiller","user":{"id":875836178,"id_str":"875836178","name":"Justin Ward","screen_name":"Robert9Ward","location":"","url":null,"description":null,"protected":false,"followers_count":428,"friends_count":424,"listed_count":0,"created_at":"Fri Oct 12 15:21:56 +0000 2012","favourites_count":1648,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2289,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000174118289\/3e4529b3a2c406b303fe7b16a6341d2c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000174118289\/3e4529b3a2c406b303fe7b16a6341d2c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/875836178\/1374552989","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TimJMiller","name":"Tim Miller","id":569580951,"id_str":"569580951","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 
2013","id":365611225998036992,"id_str":"365611225998036992","text":"@MaggaVqz jajaja q onda se la re banca jajaja","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610887823912960,"in_reply_to_status_id_str":"365610887823912960","in_reply_to_user_id":408923436,"in_reply_to_user_id_str":"408923436","in_reply_to_screen_name":"MaggaVqz","user":{"id":362152139,"id_str":"362152139","name":"- YaniL - 22 \u2661 \u265b \u200f","screen_name":"Yanilflorencia","location":"","url":null,"description":"De boedo Vengo A boedo Vuelvo \u2665 eternamente Enamorada de #SanLorenzo-socia N- 106703 Administradora de @CuervosT . Una loca amante de #Guasones \/ Futura Abogada","protected":false,"followers_count":356,"friends_count":299,"listed_count":1,"created_at":"Thu Aug 25 23:02:41 +0000 2011","favourites_count":375,"utc_offset":-10800,"time_zone":"Buenos Aires","geo_enabled":true,"verified":false,"statuses_count":11217,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"F31F1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/821600811\/fc55265d0df56c8e7d2456b90059a004.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/821600811\/fc55265d0df56c8e7d2456b90059a004.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000254899875\/f63e71ab0c5b0089051f332465121d90_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000254899875\/f63e71ab0c5b0089051f332465121d90_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/362152139\/1374898775","profile_link_color":"0329F9","profile_sidebar_border_color":"CBE499","profile_sidebar_fill_color":"99C8E4","profile_text_color":"AE67C2","profile_use_background_image":true,"default_profile":false,"default_profile_ima
ge":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[-34.6930519,-58.6250299]},"coordinates":{"type":"Point","coordinates":[-58.6250299,-34.6930519]},"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MaggaVqz","name":"Meel ~B.J ~","id":408923436,"id_str":"408923436","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226010624000,"id_str":"365611226010624000","text":"RT @gaynycdad: Many #Winners Announced! New #Giveaways Are Up! http:\/\/t.co\/JSXrisKxZv #contests #sweepstakes","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":17917326,"id_str":"17917326","name":"The Frugal Free Gal","screen_name":"FrugalFreeGal","location":"","url":"http:\/\/thefrugalfreegal.com","description":"Sharing the Most Up-to-Date Free Samples, Coupons, Money Saving Tips, Great Deals, Giveaways, Blogging Tips, and more!","protected":false,"followers_count":17144,"friends_count":9296,"listed_count":120,"created_at":"Sat Dec 06 05:26:36 +0000 2008","favourites_count":75,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":18863,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3756799507\/d5f9f866671034030b8a85527f75ca5b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3756799507\/d5f9f866671034030b8a85527f75ca5b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/17917326\/1348802272","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 14:45:39 +0000 2013","id":365483953248935936,"id_str":"365483953248935936","text":"Many #Winners Announced! New #Giveaways Are Up! http:\/\/t.co\/JSXrisKxZv #contests #sweepstakes","source":"\u003ca href=\"http:\/\/twitter.com\/tweetbutton\" rel=\"nofollow\"\u003eTweet Button\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":109604578,"id_str":"109604578","name":"gaynycdad","screen_name":"gaynycdad","location":"\u00dcT: 40.733845,-73.980998","url":"http:\/\/www.gaynycdad.com","description":"Blogger! Parenting: I talk about my kid, adoption, and parenting. I review everything and giveaway even more. 
I am a SAHD.\r\nhttp:\/\/www.gaynycdad.com","protected":false,"followers_count":10692,"friends_count":8348,"listed_count":232,"created_at":"Fri Jan 29 16:40:59 +0000 2010","favourites_count":28,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":22596,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/741257892\/n791417777_881062_6363_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/741257892\/n791417777_881062_6363_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":22,"entities":{"hashtags":[{"text":"Winners","indices":[5,13]},{"text":"Giveaways","indices":[29,39]},{"text":"contests","indices":[71,80]},{"text":"sweepstakes","indices":[81,93]}],"urls":[{"url":"http:\/\/t.co\/JSXrisKxZv","expanded_url":"http:\/\/www.gaynycdad.com\/2013\/08\/many-winners-announced-new-giveaways-are-up-50.html","display_url":"gaynycdad.com\/2013\/08\/many-w\u2026","indices":[48,70]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"Winners","indices":[20,28]},{"text":"Giveaways","indices":[44,54]},{"text":"contests","indices":[86,95]},{"text":"sweepstakes","indices":[96,108]}],"urls":[],"user_mentions":[{"screen_name":"gaynycdad","name":"gaynycdad","id":10
9604578,"id_str":"109604578","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225972871169,"id_str":"365611225972871169","text":"RT @laprincesa_blog: Un Sharkini!! En serio, un SHARKini! xDDD http:\/\/t.co\/fPf6mhtvXR","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":205310696,"id_str":"205310696","name":"Gilda","screen_name":"MonikBur","location":"","url":null,"description":"Me he metido aqu\u00ed y ahora no encuentro la salida, si alguien la sabe...por DM!","protected":false,"followers_count":217,"friends_count":335,"listed_count":9,"created_at":"Wed Oct 20 15:49:36 +0000 2010","favourites_count":330,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":7601,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/344513261568838349\/b0985c44a883b8357de9a69e20065d4f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/344513261568838349\/b0985c44a883b8357de9a69e20065d4f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/205310696\/1348070915","profile_link_color":"CC3366","profile_sidebar_border_color":"DBE9ED","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"
notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 18:48:51 +0000 2013","id":365545157807972352,"id_str":"365545157807972352","text":"Un Sharkini!! En serio, un SHARKini! xDDD http:\/\/t.co\/fPf6mhtvXR","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":531663711,"id_str":"531663711","name":"Gat\u00fabela","screen_name":"laprincesa_blog","location":"Sobre el tejado de zinc","url":"http:\/\/bloglaprincesaprometida.blogspot.com.es\/","description":"Mientras busco trabajo en #publicidad y #cine escribo en http:\/\/t.co\/cw5bV5XoK8 y \r\nhttp:\/\/t.co\/bLXg8S73wa \r\n #blogger \r\nelbloglaprincesaprometida@gmail.com","protected":false,"followers_count":614,"friends_count":926,"listed_count":17,"created_at":"Tue Mar 20 19:43:26 +0000 
2012","favourites_count":1756,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":8035,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/665523885\/c9dc7291b82334dc5eb501812518329d.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/665523885\/c9dc7291b82334dc5eb501812518329d.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2633875111\/a5e0e25e02c81b1e62c15ad6bb36ddf1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2633875111\/a5e0e25e02c81b1e62c15ad6bb36ddf1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/531663711\/1348237800","profile_link_color":"990094","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":14,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365545157812166656,"id_str":"365545157812166656","indices":[42,64],"media_url":"http:\/\/pbs.twimg.com\/media\/BRKtYmDCIAAI5Yq.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRKtYmDCIAAI5Yq.jpg","url":"http:\/\/t.co\/fPf6mhtvXR","display_url":"pic.twitter.com\/fPf6mhtvXR","expanded_url":"http:\/\/twitter.com\/laprincesa_blog\/status\/365545157807972352\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"large":{"w":612,"h":612,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"fr"},"retweet_count":0,"entities":{"hashtags":[],"urls
":[],"user_mentions":[{"screen_name":"laprincesa_blog","name":"Gat\u00fabela","id":531663711,"id_str":"531663711","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002243585,"id_str":"365611226002243585","text":"@sabrinasalemi que tal si te disfrazas de marinerita","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":108716880,"in_reply_to_user_id_str":"108716880","in_reply_to_screen_name":"sabrinasalemi","user":{"id":1636985466,"id_str":"1636985466","name":"daniel farfan","screen_name":"eulicesf82","location":"Venezuela","url":null,"description":null,"protected":false,"followers_count":1,"friends_count":6,"listed_count":0,"created_at":"Thu Aug 01 03:25:49 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000223198168\/e63e740645ea5255aaf362e43e615dcc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000223198168\/e63e740645ea5255aaf362e43e615dcc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1636985466\/1375328840","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":nul
l,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"sabrinasalemi","name":"Sabrina Salemi","id":108716880,"id_str":"108716880","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002243584,"id_str":"365611226002243584","text":"RT @Agusponce_Ok: Anshi, no entiendo de (?): _.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1317402709,"id_str":"1317402709","name":"Mike Wasauski ","screen_name":"AngiiCorridoni","location":"","url":null,"description":"Hola Tarada..Aunque me pelees y me digas de Todo ..Yo Te Quiero Trola ..Yo el Mejor de todos *Nahue*","protected":false,"followers_count":446,"friends_count":379,"listed_count":1,"created_at":"Sat Mar 30 23:59:34 +0000 
2013","favourites_count":105,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":6309,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045342225\/b65b41bc057e51924a7c959fb9fab19e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045342225\/b65b41bc057e51924a7c959fb9fab19e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000182457690\/b7e1660fcb9282a63d2d8fa4045efdd9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000182457690\/b7e1660fcb9282a63d2d8fa4045efdd9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1317402709\/1373912503","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:07:25 +0000 2013","id":365610226969358336,"id_str":"365610226969358336","text":"Anshi, no entiendo de (?): _.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":789521628,"id_str":"789521628","name":"NiicoMeQuiere :3","screen_name":"Agusponce_Ok","location":"","url":null,"description":"Gugus, te Amo Herrmosa :3 sos MIA. 
Anshii Manda Agus OBDC.\r\n AMO CON MI VIDA A RIVER PLATE\u2665 By: Agus \r\n.","protected":false,"followers_count":326,"friends_count":318,"listed_count":0,"created_at":"Wed Aug 29 15:02:39 +0000 2012","favourites_count":101,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":18586,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000024069016\/71649687b4243876d6dc922653645ae4.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000024069016\/71649687b4243876d6dc922653645ae4.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000119288790\/fa6aa32658f8ffa6f9b3539f1ba7c31c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000119288790\/fa6aa32658f8ffa6f9b3539f1ba7c31c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/789521628\/1375183894","profile_link_color":"FF0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Agusponce_Ok","name":"NiicoMeQuiere :3","id":789521628,"id_str":"789521628","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 
2013","id":365611225972875264,"id_str":"365611225972875264","text":"\u304a\u306f\u3088\u3046\n\u90e8\u6d3b\u304c\u3093\u3070\u308a\u307e\u3063\u3059\u308b","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":895241707,"id_str":"895241707","name":"\u897f\u753a","screen_name":"24machi","location":"\u30a6\u30a9\u30fc\u30eb\u30fb\u30ab\u30c0\u30a4\u653b\u7565\u4e2d","url":"http:\/\/twpf.jp\/24machi","description":"\u30a4\u30ca\u30a4\u30ec\u3001\u30dc\u30ab\u30ed\u3068\u304b\u304c\u597d\u304d\u3067\u3059 \u30a2\u30a4\u30b3\u30f3\u30fb\u30d8\u30c3\u30c0\u30fc\u306f\u81ea\u4f5c \u3042\u3068\u305f\u307e\u306bTL\u57cb\u3081\u308b\u304b\u3082\u3067\u3059( \u00b4 \u25bd ` )\uff89","protected":false,"followers_count":200,"friends_count":181,"listed_count":5,"created_at":"Sun Oct 21 12:09:19 +0000 
2012","favourites_count":81,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":8949,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"E6E6FA","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme6\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme6\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000243126455\/ff45783fa4034a1ce8d5a8d14da65710_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000243126455\/ff45783fa4034a1ce8d5a8d14da65710_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/895241707\/1372167207","profile_link_color":"2F4F4F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"A0C5C7","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226006425600,"id_str":"365611226006425600","text":"@Cookieisahippie I always have time for you bb \ud83d\ude0d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611011899793408,"in_reply_to_status_id_str":"365611011899793408","in_reply_to_user_id":1228583976,"in_reply_to_user_id_str":"1228583976","in_reply_to_screen_name":"Cookieisahippie","user":{"id":1225530788,"id_str":"1225530788","name":"Gabriela\u270c","screen_name":"gabyellaa_","location":"","url":null,"description":"Dance accepts 
me.","protected":false,"followers_count":118,"friends_count":125,"listed_count":0,"created_at":"Wed Feb 27 16:28:31 +0000 2013","favourites_count":289,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1078,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000138735055\/e2bfcaf66137eab25a84e3384efce201_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000138735055\/e2bfcaf66137eab25a84e3384efce201_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1225530788\/1375922424","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Cookieisahippie","name":"V-NA$TY","id":1228583976,"id_str":"1228583976","indices":[0,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002227202,"id_str":"365611226002227202","text":"@chrisroetter I did not!!! 
OMG","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365592498959163392,"in_reply_to_status_id_str":"365592498959163392","in_reply_to_user_id":191734213,"in_reply_to_user_id_str":"191734213","in_reply_to_screen_name":"chrisroetter","user":{"id":282288314,"id_str":"282288314","name":"\u029a\u03ca\u025e pu(tao) xx \u029a\u03ca\u025e","screen_name":"hentao_","location":"tao's armpit","url":"http:\/\/www.asianfanfics.com\/profile\/view_author_stories\/265573\/L","description":"Kii \u2022 Huang Zitao is a puta \u2022 multifandom \u2022 \u2764 @xiu_xiu_chan \u2764","protected":false,"followers_count":637,"friends_count":367,"listed_count":7,"created_at":"Thu Apr 14 22:51:21 +0000 2011","favourites_count":12377,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":74524,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043380782\/cabd101183631a21be380d1109a1b18a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043380782\/cabd101183631a21be380d1109a1b18a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261372480\/19a42b2f76d8d6a1368d066538d288b2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261372480\/19a42b2f76d8d6a1368d066538d288b2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/282288314\/1375496993","profile_link_color":"2DC2A4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"BCB8E0","profile_text_color":"B23EF0","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":nul
l,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"chrisroetter","name":"\/\/","id":191734213,"id_str":"191734213","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225972875265,"id_str":"365611225972875265","text":"@de_melendi @SuhSandryta19 yo hablo en general si os dais x aludidas no es ni problema","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365580101624139776,"in_reply_to_status_id_str":"365580101624139776","in_reply_to_user_id":1039742005,"in_reply_to_user_id_str":"1039742005","in_reply_to_screen_name":"de_melendi","user":{"id":1218787020,"id_str":"1218787020","name":"rafa","screen_name":"raafaa_92","location":"","url":null,"description":null,"protected":false,"followers_count":134,"friends_count":258,"listed_count":0,"created_at":"Mon Feb 25 14:31:35 +0000 
2013","favourites_count":14,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1265,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3328142763\/7788c32590c272c0c5bfaabfaa50c0de_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3328142763\/7788c32590c272c0c5bfaabfaa50c0de_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"de_melendi","name":"Sonr\u00ede Melendi.","id":1039742005,"id_str":"1039742005","indices":[0,11]},{"screen_name":"SuhSandryta19","name":"Coraz\u00f3n de pe\u00f3n. 
","id":826003172,"id_str":"826003172","indices":[12,26]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611226002235392,"id_str":"365611226002235392","text":"TA LOKITO VO?","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":418703303,"id_str":"418703303","name":"Agostina","screen_name":"agovilte","location":"Argentina","url":null,"description":"\u2666SIN VIDA SOCIAL\u2666 Directioner\u00ba\u00ba Amante del Hockey\u2665","protected":false,"followers_count":215,"friends_count":368,"listed_count":0,"created_at":"Tue Nov 22 13:45:32 +0000 2011","favourites_count":45,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":5875,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000024259351\/a592a3997433ce2d4f725a31915f6d25.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000024259351\/a592a3997433ce2d4f725a31915f6d25.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000227265606\/b90b8d115609368da0d5ab23b98beeb9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000227265606\/b90b8d115609368da0d5ab23b98beeb9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/418703303\/1375396125","profile_link_color":"62CCBE","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":n
ull,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:23 +0000 2013","id":365611225985454080,"id_str":"365611225985454080","text":"RT @LethiciaTorres: Pra n\u00f3s, todo amor do mundo. Am\u00e9m!","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":137506140,"id_str":"137506140","name":"thalinda","screen_name":"omgthalita","location":"","url":null,"description":"Imposs\u00edvel \u00e9 uma palavra muito grande que gente\r\npequena usa pra tentar nos oprimir","protected":false,"followers_count":260,"friends_count":150,"listed_count":0,"created_at":"Mon Apr 26 23:29:43 +0000 2010","favourites_count":715,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":9892,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046652544\/f9f46c98d6a582abec94852a075516aa.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046652544\/f9f46c98d6a582abec94852a075516aa.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000194989086\/71d6719413ca182bf36758bae65d050c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000194989086\/71d6719413ca182bf36758bae65d050c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/137506140\/1375897294","profile_link_color":"0BB3AB","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"
following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:06:45 +0000 2013","id":365610058249273344,"id_str":"365610058249273344","text":"Pra n\u00f3s, todo amor do mundo. Am\u00e9m!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":85196541,"id_str":"85196541","name":"\u00e9 lele p crlh \u2206","screen_name":"LethiciaTorres","location":"","url":"https:\/\/www.facebook.com\/lethicia.torres.37","description":"\u201cSe voc\u00ea ainda n\u00e3o desistiu \u00e9 porque, de alguma forma, voc\u00ea ainda quer acreditar.\u201d \u2014 Renato Russo.","protected":false,"followers_count":218,"friends_count":354,"listed_count":2,"created_at":"Mon Oct 26 00:05:55 +0000 
2009","favourites_count":130,"utc_offset":7200,"time_zone":"Paris","geo_enabled":true,"verified":false,"statuses_count":9992,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/803702532\/bb36033a47e0dd9642a2b9edce79cdee.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/803702532\/bb36033a47e0dd9642a2b9edce79cdee.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000255441938\/40c4ea572837e5577e2ec826794ef15e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000255441938\/40c4ea572837e5577e2ec826794ef15e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/85196541\/1375892975","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"050202","profile_text_color":"C20A5D","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"LethiciaTorres","name":"\u00e9 lele p crlh \u2206","id":85196541,"id_str":"85196541","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230183956480,"id_str":"365611230183956480","text":"Je vais me tuer moi meme \u00e0 force de dire aux gens qui demande de l'aide ''je gere 
tkt''","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":75264118,"id_str":"75264118","name":"Keep Calm I'm Here","screen_name":"OsmanMahdi1","location":"Dakar City","url":"https:\/\/soundcloud.com\/osmanmahdi","description":"Viens... :) Punchliner \u00e0 temps partiel ... #YbmCrew #ArsenalFan","protected":false,"followers_count":174,"friends_count":314,"listed_count":0,"created_at":"Fri Sep 18 12:09:28 +0000 2009","favourites_count":190,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":6017,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/712246734\/d470ab39cd179a622e68dd45a1117d6e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/712246734\/d470ab39cd179a622e68dd45a1117d6e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3038952478\/f5f44cb38456ba023d6766a5125aaa49_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3038952478\/f5f44cb38456ba023d6766a5125aaa49_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/75264118\/1360366104","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:24 +0000 
2013","id":365611230171377664,"id_str":"365611230171377664","text":"\"Vou guardar seu segredo, mas n\u00e3o posso ficar com voc\u00ea Stefan\" voce \u00e9 idiota? tem probleminha? quer apanhar?","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":181328351,"id_str":"181328351","name":"renachenha","screen_name":"reh_parker","location":"Hogwarts","url":"http:\/\/animespir.it\/939105","description":"fallen | tw | sg | hp | writer | jd | TCD","protected":false,"followers_count":348,"friends_count":423,"listed_count":1,"created_at":"Sat Aug 21 22:41:25 +0000 2010","favourites_count":124,"utc_offset":-21600,"time_zone":"Chihuahua","geo_enabled":false,"verified":false,"statuses_count":4033,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000038069240\/def63266afb00a3165613667011d5aea.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000038069240\/def63266afb00a3165613667011d5aea.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248063465\/9f671f5a1357407d327ad017ee5992c9_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248063465\/9f671f5a1357407d327ad017ee5992c9_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/181328351\/1375142521","profile_link_color":"F57373","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F6FFD1","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[]
,"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230175559680,"id_str":"365611230175559680","text":"pfvr","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":80091984,"id_str":"80091984","name":"SYNYSTRO ","screen_name":"Bruno_Carsten","location":"Blumenau-SC","url":null,"description":"Public Enemy No.1","protected":false,"followers_count":159,"friends_count":149,"listed_count":0,"created_at":"Mon Oct 05 18:31:48 +0000 2009","favourites_count":26,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":21394,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043368851\/af2a0c60ad7b578a167740f4b8195592.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043368851\/af2a0c60ad7b578a167740f4b8195592.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247960346\/72eb8fd13a8d6c7f75672d6f8c7b3573_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247960346\/72eb8fd13a8d6c7f75672d6f8c7b3573_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/80091984\/1370900837","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_menti
ons":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230183960576,"id_str":"365611230183960576","text":"#ArgentinoComo un buen asado","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1407436927,"id_str":"1407436927","name":"Pachuuuu","screen_name":"parodi_ignacio","location":"","url":null,"description":"CABJ-ARQUERO TITULAR INDISCUTIDO DE MORODO FC","protected":false,"followers_count":259,"friends_count":382,"listed_count":0,"created_at":"Mon May 06 10:46:15 +0000 2013","favourites_count":32,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":1211,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/862781269\/4f12bdae2a023972019f8af7303c4356.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/862781269\/4f12bdae2a023972019f8af7303c4356.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3683001283\/f15e7d90e6c2320331ac3b6d7349137a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3683001283\/f15e7d90e6c2320331ac3b6d7349137a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1407436927\/1374906100","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"ArgentinoComo","indic
es":[0,14]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230175576064,"id_str":"365611230175576064","text":"Estoy lejos de olvidarte porque eres muy especial.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":546327072,"id_str":"546327072","name":"Ween'","screen_name":"EsaaWeen","location":"","url":null,"description":"Soy loca' divertida' exagerada' risue\u00f1a' loca' & mas loca xD .. Solo sigueme ;)","protected":false,"followers_count":103,"friends_count":147,"listed_count":0,"created_at":"Thu Apr 05 19:58:41 +0000 2012","favourites_count":171,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":4487,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/803889535\/be9dd654e250aef647d2d93945676ef4.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/803889535\/be9dd654e250aef647d2d93945676ef4.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3674799301\/81d7fa2927b8c0e7abce775f12a49b68_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3674799301\/81d7fa2927b8c0e7abce775f12a49b68_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/546327072\/1368848970","profile_link_color":"FF0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place"
:null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230188142592,"id_str":"365611230188142592","text":"Saying my prayers ... Y'all are all there! \u2764","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":364782138,"id_str":"364782138","name":"Bella Montes","screen_name":"B_3lla_boo","location":"","url":null,"description":"#latina #soyloquesoy #childofGod #90'sborn","protected":false,"followers_count":198,"friends_count":171,"listed_count":1,"created_at":"Tue Aug 30 10:02:21 +0000 2011","favourites_count":52,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":3562,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247121110\/61cca0adc11a6e586a1597c9aad32f8f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247121110\/61cca0adc11a6e586a1597c9aad32f8f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/364782138\/1375349129","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"
geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196535297,"id_str":"365611230196535297","text":"I must be the only one that thinks wagon wheel is uber gay","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":514896681,"id_str":"514896681","name":"Jacob","screen_name":"JacobSayers09","location":"","url":null,"description":null,"protected":false,"followers_count":305,"friends_count":267,"listed_count":1,"created_at":"Sun Mar 04 23:48:31 +0000 2012","favourites_count":1076,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":8257,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3783605308\/269a9f0dd7149069ffa58105cb9418ad_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3783605308\/269a9f0dd7149069ffa58105cb9418ad_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/514896681\/1356292430","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordin
ates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230171369473,"id_str":"365611230171369473","text":"A\u015fk kap\u0131y\u0131 \u00e7almaz... \u0130\u00e7eri girer, ba\u015f k\u00f6\u015feye kurulur.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":583143280,"id_str":"583143280","name":"Salih Bayat","screen_name":"saliihbayat","location":"\u0130stanbul","url":"http:\/\/www.facebook.com\/saliih.bayat","description":"00:00\r 08.06. tarihinde iki defa do\u011fan,fanatik Fenerbah\u00e7e'li,Atat\u00fcrk\u00e7\u00fc, \u0130kizler burcu, Selin'e, \u0131slak keke, basketbola, futbola ve uykuya a\u015f\u0131k olan biri.","protected":false,"followers_count":130,"friends_count":54,"listed_count":0,"created_at":"Thu May 17 21:09:15 +0000 
2012","favourites_count":99,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":true,"verified":false,"statuses_count":4855,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000112504234\/10241e6489f474e63bacebf40082292c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000112504234\/10241e6489f474e63bacebf40082292c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/583143280\/1373813836","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230200741888,"id_str":"365611230200741888","text":"Minha cabe\u00e7a vai explodir, socorro!","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":390428879,"id_str":"390428879","name":"- BeeH ","screen_name":"Hey_Beeh","location":"","url":null,"description":"Pisciana, Idiota, Sincera, Ciumenta ...Segue? 
Sigo De Volta *--* Pergunta: http:\/\/t.co\/WpS7AKdNd2 Face: http:\/\/t.co\/xNonKWOZcw Instagram: http:\/\/t.co\/75dgvE5cBn","protected":false,"followers_count":515,"friends_count":396,"listed_count":0,"created_at":"Thu Oct 13 23:43:42 +0000 2011","favourites_count":0,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":false,"verified":false,"statuses_count":11769,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000021567580\/5df5b58536d4eaad3a112030238bd85e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000021567580\/5df5b58536d4eaad3a112030238bd85e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000177764964\/521fbc5ff859f767f793814b976c7c09_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000177764964\/521fbc5ff859f767f793814b976c7c09_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/390428879\/1369534927","profile_link_color":"0099CC","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230188154880,"id_str":"365611230188154880","text":"\u9234\u9e7f\u3055\u3093\u2026\u592a\u5dfb\u3055\u3093\u4ee5\u5916\u3044\u306a\u3044\u3093\u304b\u76f8\u624b","source":"\u003ca href=\"https:\/\/twitter.com\/TheWorld_JP\" rel=\"nofollow\"\u003eTheWorld for 
iOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":105034250,"id_str":"105034250","name":"\u4e0b\u7d75","screen_name":"emi1211","location":"\u6771\u4eac","url":null,"description":"2009\u5e746\u6708\u306b\u9577\u7537\u307e\u3063\u3061\u30012013\u5e743\u6708\u306b\u6b21\u7537\u3082\u3063\u3061\u51fa\u7523\u3002\u6c17\u4ed8\u3044\u305f\u3089\u7537\u51502\u4eba\u306e\u6bcd\u3002\u304a\u6c17\u697d\u6bcd\u3055\u3093\u90e8\u9580\u6240\u5c5e\u3002\u4e00\u5fdcWM\u90e8\u9580\u306b\u3082\u6240\u5c5e\u3002\u80b2\u4f11\u4e2d\u3002\n\u30de\u30f3\u30ac\u3068\u829d\u5c45\u3068\u97f3\u697d\u5927\u597d\u304d\u3002\u5e03\u3068\u30d3\u30fc\u30ba\u3082\u597d\u304d\u3002\u751f\u6d3b\u5782\u308c\u6d41\u3057\u7cfb\u3002\n\u30d5\u30a9\u30ed\u30ea\u30e0\u306f\u304a\u6c17\u8efd\u306b\u3002\u80b2\u5150\u95a2\u9023\u306f\u30d5\u30a9\u30ed\u30fc\u8fd4\u3057\u3057\u3066\u3044\u308b\u3064\u3082\u308a\u3067\u3059\u304c\u3001\u6c17\u4ed8\u304b\u306a\u3044\u4e8b\u3082\u591a\u3005\u2026\u3059\u3044\u307e\u305b\u3093\u3002","protected":false,"followers_count":211,"friends_count":231,"listed_count":13,"created_at":"Fri Jan 15 04:20:17 +0000 
2010","favourites_count":273,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":19132,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"EBEBEB","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2635438118\/05953a5358d1596f62a3c9cbce85fe5c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2635438118\/05953a5358d1596f62a3c9cbce85fe5c_normal.jpeg","profile_link_color":"990000","profile_sidebar_border_color":"DFDFDF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196535298,"id_str":"365611230196535298","text":"- Ai que filme chato man\u00e9...\n- Eu n\u00e3o t\u00f4 nem prestando aten\u00e7\u00e3o nele.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1281521191,"id_str":"1281521191","name":"O Vagabundo","screen_name":"tspereirarj","location":"","url":"http:\/\/instagram.com\/tspereirarj","description":"O mundo \u00e9 o mesmo. Com menos raz\u00f5es para se viver. 
- Capit\u00e3o Jack Sparrow.","protected":false,"followers_count":67,"friends_count":50,"listed_count":0,"created_at":"Tue Mar 19 21:02:46 +0000 2013","favourites_count":841,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":5741,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035223399\/c834866fcb148caa8ee4ba29cc76ff9c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035223399\/c834866fcb148caa8ee4ba29cc76ff9c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000199587107\/6d38db527e3b9c7b897e3a81bfefda8d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000199587107\/6d38db527e3b9c7b897e3a81bfefda8d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1281521191\/1375906660","profile_link_color":"0783D6","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230167175168,"id_str":"365611230167175168","text":"@tekla_h Thanks for sharing the safe space policy! 
Looks good, no real feedback, so cross-posting here: there's an \"an reading\" grammero.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":198602326,"in_reply_to_user_id_str":"198602326","in_reply_to_screen_name":"tekla_h","user":{"id":15707258,"id_str":"15707258","name":"Vincent Gable","screen_name":"VTPG","location":"San Francisco, CA","url":"http:\/\/vincentgable.com","description":"Austin Texas to San Francisco transplant.\r\nDTS Engineer at Apple \uf8ff","protected":false,"followers_count":694,"friends_count":314,"listed_count":31,"created_at":"Sun Aug 03 03:16:17 +0000 2008","favourites_count":3456,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":15542,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"E8E8E8","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1667453527\/crop_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1667453527\/crop_normal.png","profile_link_color":"0A27A4","profile_sidebar_border_color":"888888","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":{"id":"3ec0643c0bb2f318","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/3ec0643c0bb2f318.json","place_type":"city","name":"West Santa Clara","full_name":"West Santa Clara, CA","country_code":"US","country":"United 
States","bounding_box":{"type":"Polygon","coordinates":[[[-122.202653,37.200495],[-122.202653,37.42499],[-122.028565,37.42499],[-122.028565,37.200495]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"tekla_h","name":"Tekla Hawkins","id":198602326,"id_str":"198602326","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230183960577,"id_str":"365611230183960577","text":"Contu y su candadito :)","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web (M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":223495686,"id_str":"223495686","name":"Mica \u2665","screen_name":"micaaelavecchio","location":"Villa Urquiza","url":null,"description":"La familia, los amigos y las peque\u00f1as cosas... 
Las Pastillas del Abuelo \u2665 :\u2022)","protected":false,"followers_count":160,"friends_count":176,"listed_count":0,"created_at":"Mon Dec 06 15:13:03 +0000 2010","favourites_count":73,"utc_offset":-10800,"time_zone":"Buenos Aires","geo_enabled":true,"verified":false,"statuses_count":5713,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000032284565\/0339c0d9c05e64655e89e2acf43e6f40.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000032284565\/0339c0d9c05e64655e89e2acf43e6f40.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3532687259\/4dc0dfbecbe7b8c68e88605f5fc4a242_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3532687259\/4dc0dfbecbe7b8c68e88605f5fc4a242_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/223495686\/1368574918","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230167183360,"id_str":"365611230167183360","text":"Seberapa jagokah kamu memasak? 
\u2014 Sejago masak air http:\/\/t.co\/TlnlA1HJbX","source":"\u003ca href=\"http:\/\/ask.fm\/\" rel=\"nofollow\"\u003eAsk.fm\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":319682147,"id_str":"319682147","name":"AnggitaKhanzaNabilla","screen_name":"anggitakhanza","location":"","url":null,"description":"Allah always with me. Yandikafauzan's mine\u2665","protected":false,"followers_count":382,"friends_count":450,"listed_count":0,"created_at":"Sat Jun 18 15:04:46 +0000 2011","favourites_count":11,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":13774,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/716552925\/495df7a19a9774d8353e37205d3d4b33.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/716552925\/495df7a19a9774d8353e37205d3d4b33.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260204788\/11bfe43bf5186d11500a5baccbf73ce0_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260204788\/11bfe43bf5186d11500a5baccbf73ce0_normal.jpeg","profile_link_color":"FF0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/TlnlA1HJbX","expanded_url":"http:\/\/ask.fm\/a\/5jia3o48","display_url":"ask.fm\/a\/5jia3o48","indices":[50,72]}],"user_mentions":[]},"favo
rited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230200741889,"id_str":"365611230200741889","text":"\u3069\u30fc\u3067\u3082\u3044\u3044\u3084","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1630171009,"id_str":"1630171009","name":"Tomaso-Azoth@\uff8d(O\u0434O\uff8d","screen_name":"tomaso_Cocytus","location":"\u3068\u304a\u3044\u304f\u306b","url":null,"description":"\u30af\u30bd\u30cd\u30df \u30a2\u30a4\u30b3\u30f3\u306f\u6c34\u7389\u3055\u3093 @cyanoptila26 \u30d8\u30c3\u30c0\u30fc\u306f\u304a\u306b\u304e\u308a\u541b @Ryooooo_07","protected":false,"followers_count":46,"friends_count":51,"listed_count":1,"created_at":"Mon Jul 29 11:27:59 +0000 
2013","favourites_count":179,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1000,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000223625416\/c169744ae20c6a5f26181427a9b0eb87_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000223625416\/c169744ae20c6a5f26181427a9b0eb87_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1630171009\/1375097351","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230204932099,"id_str":"365611230204932099","text":"bu bayram uyumak sevap dediler uyuduk :)","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":777812508,"id_str":"777812508","name":"seyma caglar","screen_name":"symacglr","location":"","url":null,"description":"Pamukkale Universitesi- Fizik Tedavi ve Rehabilitasyon","protected":false,"followers_count":80,"friends_count":80,"listed_count":0,"created_at":"Fri Aug 24 09:42:10 +0000 
2012","favourites_count":23,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":true,"verified":false,"statuses_count":129,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/692837118\/7714e72c8aa179a40e18e2801760d176.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/692837118\/7714e72c8aa179a40e18e2801760d176.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000177823470\/36dfdb52bd7ca5c1f7ceda09f249d462_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000177823470\/36dfdb52bd7ca5c1f7ceda09f249d462_normal.jpeg","profile_link_color":"2FC2EF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179758080,"id_str":"365611230179758080","text":"Sister Nadz In Her Preaching Segment ... 
Lool Okay","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":460760237,"id_str":"460760237","name":"IG:ChrispinDqaf \u2122 ","screen_name":"ChrispinDqaf","location":"","url":"https:\/\/www.facebook.com\/Crispianahldo","description":"\u2764 I'll FollowBack jh","protected":false,"followers_count":913,"friends_count":805,"listed_count":0,"created_at":"Wed Jan 11 02:59:59 +0000 2012","favourites_count":213,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"verified":false,"statuses_count":14266,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000240285912\/f473cd9d46d36e9d759dd5b86eceef85_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000240285912\/f473cd9d46d36e9d759dd5b86eceef85_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/460760237\/1375673857","profile_link_color":"0084B4","profile_sidebar_border_color":"A8C7F7","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 
2013","id":365611230175563776,"id_str":"365611230175563776","text":"@GARI_507 Dale click al boton Seguir en nuestro perfil para poder atenderte mejor","source":"\u003ca href=\"http:\/\/www.cableonda.com\" rel=\"nofollow\"\u003eapp Team Cable Onda\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":115496839,"in_reply_to_user_id_str":"115496839","in_reply_to_screen_name":"GARI_507","user":{"id":541171240,"id_str":"541171240","name":"Team CableOnda","screen_name":"TeamCableOnda","location":"Panam\u00e1","url":"http:\/\/www.cableonda.com\/","description":"Servicio al cliente + soporte t\u00e9cnico + estamos para ayudarle!\r\n de Lunes a Viernes de\r\n8:00am a 8:00pm y S\u00e1bados - Domingos\r\n8:00am a 4:00pm","protected":false,"followers_count":7286,"friends_count":3709,"listed_count":36,"created_at":"Fri Mar 30 22:21:28 +0000 2012","favourites_count":1,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":16646,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/525327180\/cable_onda-twitter--------05.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/525327180\/cable_onda-twitter--------05.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1995710595\/CableOnda_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1995710595\/CableOnda_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"re
tweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"GARI_507","name":"Gustavo A. Reyes I.","id":115496839,"id_str":"115496839","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230175571968,"id_str":"365611230175571968","text":"RT @thehereafter: Portland. @CrystalHotelPDX for three more night. 7-10pm. Zero admission dollars. Excellent food available. Also alcohol a\u2026","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":174530607,"id_str":"174530607","name":"Kelsi E.","screen_name":"KelsiBSides","location":"Oklahoma City, OK","url":"http:\/\/www.brightersides.com","description":"i blog. i eat. i sleep. i tweet. basically.","protected":false,"followers_count":1201,"friends_count":382,"listed_count":28,"created_at":"Wed Aug 04 04:18:05 +0000 
2010","favourites_count":1196,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":21824,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"B4B4B4","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/402551728\/stripe_37c9fcfc9f91dcf86d224f87cbb1ac7f.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/402551728\/stripe_37c9fcfc9f91dcf86d224f87cbb1ac7f.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000189200401\/3510651e0d570f59f43a2f02b6c50bb8_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000189200401\/3510651e0d570f59f43a2f02b6c50bb8_normal.png","profile_link_color":"333333","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:05 +0000 2013","id":365611148147568640,"id_str":"365611148147568640","text":"Portland. @CrystalHotelPDX for three more night. 7-10pm. Zero admission dollars. Excellent food available. 
Also alcohol and sweet tunes.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":24641853,"id_str":"24641853","name":"John Elliott","screen_name":"thehereafter","location":"California","url":"http:\/\/www.thehereafterishere.com","description":"Time is an illusion.","protected":false,"followers_count":482,"friends_count":292,"listed_count":25,"created_at":"Mon Mar 16 03:21:16 +0000 2009","favourites_count":51,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":1058,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/114848329\/05-American-in-Love_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/114848329\/05-American-in-Love_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/24641853\/1369031653","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CrystalHotelPDX","name":"CrystalHotelPDX","id":358476824,"id_str":"358476824","indices":[10,26]}]},"favorited":false,"retweeted":false,"lang":"en"},"retwee
t_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"thehereafter","name":"John Elliott","id":24641853,"id_str":"24641853","indices":[3,16]},{"screen_name":"CrystalHotelPDX","name":"CrystalHotelPDX","id":358476824,"id_str":"358476824","indices":[28,44]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179770368,"id_str":"365611230179770368","text":"Do I come to the ville this weekend or next weekend? Hmmm.. @Schlotskii @curlyBERNs","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":342159658,"id_str":"342159658","name":"Zachariah Green","screen_name":"prettyboyzach10","location":"The Ville","url":null,"description":"Get Yo Grindsauce On","protected":false,"followers_count":478,"friends_count":564,"listed_count":1,"created_at":"Mon Jul 25 15:43:03 +0000 2011","favourites_count":1940,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":4902,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000205488252\/9aabc72d6d650aa445339eb957378041_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000205488252\/9aabc72d6d650aa445339eb957378041_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/342159658\/1358968746","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Schlotskii","name":"Saint Christopher","id":363157633,"id_str":"363157633","indices":[61,72]},{"screen_name":"curlyBERNs","name":"Alex Bern","id":630804735,"id_str":"630804735","indices":[73,84]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230183952384,"id_str":"365611230183952384","text":"RT @MafiaBernasconi: Amarte, odiarte, nunca es suficiente.","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for 
iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1651773715,"id_str":"1651773715","name":"JoaquinOchoaTeAmo","screen_name":"Joako_Sexy","location":"","url":null,"description":"Ochoista siempre !! Hermoso? @Joakota8 Sexi? @Joakota8 Las Ochoistas seguimos esperando ese #ChapeFuerteContraLaParedDe@Joakota8","protected":false,"followers_count":39,"friends_count":160,"listed_count":0,"created_at":"Wed Aug 07 01:52:51 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":392,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256947880\/bacbf10b8b4f557acd2ca5f614c46197_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256947880\/bacbf10b8b4f557acd2ca5f614c46197_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:18:44 +0000 2013","id":365597974979559425,"id_str":"365597974979559425","text":"Amarte, odiarte, nunca es suficiente.","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for 
iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1576114734,"id_str":"1576114734","name":"Manuelito \u0784","screen_name":"MafiaBernasconi","location":"","url":null,"description":"Y a pesar de todo, seguimos sonriendo gracias a el.","protected":false,"followers_count":2301,"friends_count":1492,"listed_count":1,"created_at":"Sun Jul 07 22:02:48 +0000 2013","favourites_count":629,"utc_offset":-10800,"time_zone":"Buenos Aires","geo_enabled":false,"verified":false,"statuses_count":5962,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000036080234\/cd444e9cac4ec7cee35eee9261ea775d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000036080234\/cd444e9cac4ec7cee35eee9261ea775d.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247518469\/1f1ff4e30842982f507a40e66f9536f9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247518469\/1f1ff4e30842982f507a40e66f9536f9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1576114734\/1374795091","profile_link_color":"403636","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":3,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MafiaBernasconi","name"
:"Manuelito \u0784","id":1576114734,"id_str":"1576114734","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179758081,"id_str":"365611230179758081","text":"Me lo imagine asiendo comida , y me dio risa #quelindo","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1390568828,"id_str":"1390568828","name":"YuvizaBernal","screen_name":"YuvizaB","location":"","url":null,"description":null,"protected":false,"followers_count":187,"friends_count":195,"listed_count":0,"created_at":"Mon Apr 29 22:06:12 +0000 2013","favourites_count":271,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3192,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DB2884","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000030147348\/46f3d9b575039db0e41678f8550f891d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000030147348\/46f3d9b575039db0e41678f8550f891d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000164482981\/eebd6ba46b15ee94ee32d9956519656c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000164482981\/eebd6ba46b15ee94ee32d9956519656c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1390568828\/1369003832","profile_link_color":"44C238","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":nul
l,"retweet_count":0,"entities":{"hashtags":[{"text":"quelindo","indices":[45,54]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230204928000,"id_str":"365611230204928000","text":"@32coloradogrl @DaisyD113 ,,hey chicks how do I work some mojo to get a m&g with kip he is gonna b at state fair this week?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":606563993,"in_reply_to_user_id_str":"606563993","in_reply_to_screen_name":"32coloradogrl","user":{"id":63601897,"id_str":"63601897","name":"jim moseley","screen_name":"jsam242448","location":"","url":null,"description":null,"protected":false,"followers_count":17,"friends_count":52,"listed_count":0,"created_at":"Fri Aug 07 01:36:05 +0000 2009","favourites_count":21,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1199,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000162675112\/59ec16b08fb806957027e95ae3f9a1a4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000162675112\/59ec16b08fb806957027e95ae3f9a1a4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/63601897\/1368584404","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"follo
wing":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"32coloradogrl","name":"Tiffany\u2728","id":606563993,"id_str":"606563993","indices":[0,14]},{"screen_name":"DaisyD113","name":"\u26beDaisy-KipMooreFan","id":617700902,"id_str":"617700902","indices":[15,25]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230200741890,"id_str":"365611230200741890","text":"RT @ddlovato: Sang a metal version of Happy Birthday to my mom. She said it's not complete without my pig squeals. #SomeThingsNeverChange..\u2026","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":718624129,"id_str":"718624129","name":"Rockvato","screen_name":"Giulovato","location":"","url":null,"description":"But please don't catch me..","protected":false,"followers_count":1252,"friends_count":1425,"listed_count":0,"created_at":"Thu Jul 26 19:38:40 +0000 
2012","favourites_count":5677,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":12749,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"020F0D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000029776494\/392cc484502b718c2c31bf808ad834f5.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000029776494\/392cc484502b718c2c31bf808ad834f5.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000193574827\/6b5333e6c4a9c9a5169729aa0a155949_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000193574827\/6b5333e6c4a9c9a5169729aa0a155949_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/718624129\/1374867998","profile_link_color":"020F0D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 19:18:48 +0000 2013","id":365552692568920065,"id_str":"365552692568920065","text":"Sang a metal version of Happy Birthday to my mom. She said it's not complete without my pig squeals. #SomeThingsNeverChange..Happy bday mom!","source":"\u003ca href=\"http:\/\/www.echofon.com\/\" rel=\"nofollow\"\u003eEchofon\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":21111883,"id_str":"21111883","name":"demetria lovato","screen_name":"ddlovato","location":"DALLAS\/LA","url":"http:\/\/www.facebook.com\/DemiLovato","description":"New album DEMI feat. 
Made in the USA and Heart Attack available NOW!!! Download here - http:\/\/smarturl.it\/dliTunesa1","protected":false,"followers_count":16508929,"friends_count":222,"listed_count":101971,"created_at":"Tue Feb 17 18:02:08 +0000 2009","favourites_count":33,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":true,"statuses_count":9647,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045520748\/9970951508a34db1e9a46bd5f475c07d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045520748\/9970951508a34db1e9a46bd5f475c07d.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248460922\/9d6e8d8a24650372e314f485265eb40c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248460922\/9d6e8d8a24650372e314f485265eb40c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/21111883\/1375761090","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"B9BEB8","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":11506,"entities":{"hashtags":[{"text":"SomeThingsNeverChange","indices":[101,123]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"SomeThingsNeverChange","indices":[115,137]}],"urls":[],"user_mentions":[{"screen_name":"ddlovato","name":"demetria lovato","id":21111883,"id_str":"21111883","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu 
Aug 08 23:11:24 +0000 2013","id":365611230183964672,"id_str":"365611230183964672","text":"Como se que esta jugando a la play ni lo molesto. Ya se lo que es con su querida ps.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":481505711,"id_str":"481505711","name":"MinionDeFran\u2665","screen_name":"AnttoCastagna","location":"","url":"https:\/\/www.facebook.com\/antto.central2","description":"*Forever trusting who we are and nothing else matters* Lo lindo de mi vida es el saber que la gobierna tu ser","protected":false,"followers_count":430,"friends_count":360,"listed_count":0,"created_at":"Thu Feb 02 21:01:13 +0000 2012","favourites_count":36,"utc_offset":-10800,"time_zone":"Buenos Aires","geo_enabled":true,"verified":false,"statuses_count":17118,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042715421\/6cc229b745bad838a2ca22586951dce4.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042715421\/6cc229b745bad838a2ca22586951dce4.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242188815\/dd7b813b65c598b7429b710d399faa75_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242188815\/dd7b813b65c598b7429b710d399faa75_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/481505711\/1373672556","profile_link_color":"DE0BBE","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_imag
e":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230204919809,"id_str":"365611230204919809","text":"https:\/\/t.co\/ZX16l9qHdg falo nd, s\u00f3 observo a burrice e dou risada hgjdcgjghj","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":425264589,"id_str":"425264589","name":"tay","screen_name":"gutsdrew","location":"30.11","url":"http:\/\/a.tumblr.com\/tumblr_mg3arj07pd1qjn6b1o1.mp3","description":"i need your love","protected":false,"followers_count":1885,"friends_count":1131,"listed_count":2,"created_at":"Wed Nov 30 19:45:04 +0000 2011","favourites_count":139,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":false,"verified":false,"statuses_count":46715,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/818926273\/565f1d2ff429fb44049cd28e61ab1c61.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/818926273\/565f1d2ff429fb44049cd28e61ab1c61.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000236680580\/5a07a5be9cf18db64daf99618f3510a9_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000236680580\/5a07a5be9cf18db64daf99618f3510a9_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/425264589\/1366081307","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","pr
ofile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/ZX16l9qHdg","expanded_url":"https:\/\/fbcdn-sphotos-d-a.akamaihd.net\/hphotos-ak-ash3\/577229_390602151040086_2092078560_n.png","display_url":"fbcdn-sphotos-d-a.akamaihd.net\/hphotos-ak-ash\u2026","indices":[0,23]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230175576065,"id_str":"365611230175576065","text":"According to Datalogix, followers exposed to Promoted Tweets purchased 29 percent more from that brand than... http:\/\/t.co\/c0cowB3pQs","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":877505964,"id_str":"877505964","name":"Hylton Chilchik","screen_name":"UltimITech","location":"Sydney, Australia","url":"https:\/\/www.facebook.com\/UltimITech","description":"Tech Info,Cloud solutions, Comprehensive I.T. 
support.Remote IT support,data backup & recovery,network architecture & support,virus eradication","protected":false,"followers_count":72,"friends_count":267,"listed_count":1,"created_at":"Sat Oct 13 10:24:38 +0000 2012","favourites_count":8,"utc_offset":39600,"time_zone":"New Caledonia","geo_enabled":false,"verified":false,"statuses_count":1468,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/683558333\/5219ed54b27ffb83facadcaf43bd5e54.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/683558333\/5219ed54b27ffb83facadcaf43bd5e54.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2712090494\/61b473a45f0a624ef728a3ebb184b0fe_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2712090494\/61b473a45f0a624ef728a3ebb184b0fe_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/c0cowB3pQs","expanded_url":"http:\/\/fb.me\/CX7ewy3R","display_url":"fb.me\/CX7ewy3R","indices":[111,133]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196531202,"id_str":"365611230196531202","text":"test\n08\/08\/2013 18:11:00\nWind Dir: 61\ufffd\nWind Spd: 2mph\nTemp: 81.6\ufffdF\nRaw Barom: 29.02in\nDew Point: 76.7\ufffdF\nDailyRain: 0.18in","source":"\u003ca href=\"http:\/\/twittercounter.com\" rel=\"nofollow\"\u003eThe Visitor 
Widget\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":107594885,"id_str":"107594885","name":"Jeff Bright","screen_name":"BrokenArrowWX","location":"Broken Arrow, OK","url":"http:\/\/www.baweather.com","description":"Weather reports for the areas surrounding Broken Arrow, Oklahoma","protected":false,"followers_count":65,"friends_count":0,"listed_count":1,"created_at":"Sat Jan 23 02:47:37 +0000 2010","favourites_count":0,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":25041,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/649416835\/weather-picture-photo-lightning-storm-Damgaard_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/649416835\/weather-picture-photo-lightning-storm-Damgaard_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 
2013","id":365611230204932097,"id_str":"365611230204932097","text":"\u5e03\u56e3\u3067\u3054\u308d\u3054\u308d\u3057\u3066\u308b\u306e\u306f\u5272\u3068\u6c17\u6301\u3061\u826f\u304b\u3063\u305f\u3067\u3059\u3051\u3069\u306d\uff3f(\u3000_\u00b4\u03c9`)_\u610f\u5473\u306a\u3044\u306a\u30fc\u7b11","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":388711976,"id_str":"388711976","name":"\u3082\u3075","screen_name":"luinaru","location":"","url":null,"description":"\u3082\u3075\u3082\u3075(\u03c3\u03c9-)\uff61\u043e\uff9f","protected":false,"followers_count":22,"friends_count":22,"listed_count":0,"created_at":"Tue Oct 11 06:41:13 +0000 2011","favourites_count":254,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":7394,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"9AE4E8","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme16\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme16\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3408188485\/b677aa63666e1ef266185e40f814eb7f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3408188485\/b677aa63666e1ef266185e40f814eb7f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/388711976\/1350106008","profile_link_color":"0084B4","profile_sidebar_border_color":"BDDCAD","profile_sidebar_fill_color":"DDFFCC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":nu
ll,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196535299,"id_str":"365611230196535299","text":"RT @maequerodemi: odeio\nque\napertam \nminhas\nbuchechas","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":264919902,"id_str":"264919902","name":"Milene M\u00fcller Reis","screen_name":"milenereis_","location":"","url":null,"description":"http:\/\/instagram.com\/milenemuller_","protected":false,"followers_count":722,"friends_count":272,"listed_count":0,"created_at":"Sat Mar 12 17:42:04 +0000 2011","favourites_count":312,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":25398,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"030303","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000020598152\/08a984678e0bde31a9c9d0bc1abec9a9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000020598152\/08a984678e0bde31a9c9d0bc1abec9a9.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000227088790\/2a4a9f1760df97e4ead99656885b1a8e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000227088790\/2a4a9f1760df97e4ead99656885b1a8e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/264919902\/1375666739","profile_link_color":"F272DB","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notif
ications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:35:10 +0000 2013","id":365602112589139968,"id_str":"365602112589139968","text":"odeio\nque\napertam \nminhas\nbuchechas","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":414891222,"id_str":"414891222","name":"robin","screen_name":"maequerodemi","location":"victoria's \u2661","url":null,"description":"fuck the world i love helo","protected":false,"followers_count":2283,"friends_count":1822,"listed_count":2,"created_at":"Thu Nov 17 16:34:51 +0000 2011","favourites_count":526,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":true,"verified":false,"statuses_count":23048,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F9F9F9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044500064\/64a1eb646e8fa65f92515f2b0068c097.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044500064\/64a1eb646e8fa65f92515f2b0068c097.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261804445\/4535ade74b4d725c5f80ef24547bf48c_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261804445\/4535ade74b4d725c5f80ef24547bf48c_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/414891222\/1375998993","profile_link_color":"C5C6C7","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":7,"enti
ties":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"maequerodemi","name":"robin","id":414891222,"id_str":"414891222","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230204928001,"id_str":"365611230204928001","text":"\u3010\u5b9a\u671f\u3011\u9244\u64ec\u57a2\u4f5c\u308a\u307e\u3057\u305f\uff01\u7d19\u7aef\u3055\u3093\u4e8c\u6b21\u3068\u9244\u9053\u4e00\u6b21\u3054\u3061\u3083\u307e\u305c\u30db\u30e2\u306b\u3087\u305f\u3042\u308a\u3067\u5984\u60f3\u3070\u304b\u308a\u3057\u3066\u308b\u57a2\u3067\u3059\u304c\u8208\u5473\u3042\u308b\u65b9\u306f\u30d5\u30a9\u30ed\u30fc\u304a\u6c17\u8efd\u306b\u2192@itigo_tetug","source":"\u003ca href=\"http:\/\/twittbot.net\/\" rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":213681511,"id_str":"213681511","name":"AGO\u3060\u3044\u3075\u304f","screen_name":"RA____YU","location":"\u9ad8\u5d0e\u3055\u3093\u3084\u5ddd\u8d8a\u3055\u3093\u306e\u6cbf\u7dda","url":"http:\/\/www.pixiv.net\/member.php?id=1714155","description":"\u9752\u9244\u306e\u65e5.\u5149\u3068\u5ddd.\u8d8a\u306b\u304a\u71b1\u3002\u30c6\u30cb\u30d7\u30ea\u3001\u6771\u65b9\u3082\u304a\u71b1\u3067\u3059\u3002\u6642\u3005\u9032\u6483\u3068\u30b8\u30e3\u30f3\u30d7\u7cfb\u25a0\u597dCP\u7b49\u8a73\u3057\u3044\u3053\u3068\u306f\u3064\u3044\u30d7\u30ed\u2192http:\/\/twpf.jp\/RA____YU\u25a0\u30c6\u30cb\u57a2\u2192@rayu_TE\u9280\u30aa\u30d5\u57a2\u2192@itigo_off","protected":false,"followers_count":700,"friends_count":706,"listed_count":23,"created_at":"Tue Nov 09 14:43:16 +0000 
2010","favourites_count":6330,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":69496,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/776757059\/f5ecaba9cc3913bfc83097a36ccb6b61.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/776757059\/f5ecaba9cc3913bfc83097a36ccb6b61.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000197302763\/8559f111f0b33ca8929555b8e28861f3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000197302763\/8559f111f0b33ca8929555b8e28861f3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/213681511\/1368236539","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"itigo_tetug","name":"\u82fa\u3060\u3044\u3075\u304f@\u9244\u64ec\u5984\u60f3\u57a2","id":1055443212,"id_str":"1055443212","indices":[64,76]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230192345088,"id_str":"365611230192345088","text":"@TreToCold12 is you finna come back..? 
Or you want to just shoot to where we going?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611102928769025,"in_reply_to_status_id_str":"365611102928769025","in_reply_to_user_id":257200681,"in_reply_to_user_id_str":"257200681","in_reply_to_screen_name":"TreToCold12","user":{"id":465101350,"id_str":"465101350","name":"Man Of The Year","screen_name":"iamTheRealOCP","location":"studio working \u270f","url":"http:\/\/soundcloud.com\/therealocp","description":"#TEAMNOTHOTS ONE MAN TEAM!! i only trust 3 boaa. #OTF #JTG Follow my big sister @_ThugLife02","protected":false,"followers_count":458,"friends_count":286,"listed_count":2,"created_at":"Sun Jan 15 23:48:51 +0000 2012","favourites_count":702,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":20564,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/856778540\/cb539f05313d7fd93dac394d310f8266.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/856778540\/cb539f05313d7fd93dac394d310f8266.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258074491\/f2045d50c15236f5ae97720f4c4d75fb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258074491\/f2045d50c15236f5ae97720f4c4d75fb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/465101350\/1375933865","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"plac
e":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TreToCold12","name":"_The_Kidd","id":257200681,"id_str":"257200681","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230175571969,"id_str":"365611230175571969","text":"@Whitey_4 hahahahaa(x","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610515071905792,"in_reply_to_status_id_str":"365610515071905792","in_reply_to_user_id":345578979,"in_reply_to_user_id_str":"345578979","in_reply_to_screen_name":"Whitey_4","user":{"id":285404577,"id_str":"285404577","name":"Aysha","screen_name":"aaaayshaaaa","location":"CA","url":null,"description":"I was told skys the limit so I'm reaching up","protected":false,"followers_count":398,"friends_count":373,"listed_count":0,"created_at":"Thu Apr 21 02:57:08 +0000 2011","favourites_count":5016,"utc_offset":-25200,"time_zone":"Pacific Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":9416,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/781649881\/92f720023bcab1c18966b9decd7a67b6.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/781649881\/92f720023bcab1c18966b9decd7a67b6.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000105673486\/a589d2e3c6b4a3f2c7d6731f05b54990_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000105673486\/a589d2e3c6b4a3f2c7d6731f05b54990_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/285404577\/1375059788","profile_link_color":"CCAEAE","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"F7F7F7","profile_text_color":"121212","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Whitey_4","name":"Agent Blanco","id":345578979,"id_str":"345578979","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230171377665,"id_str":"365611230171377665","text":"I voted for @FallOutBoy to win Best Rock Video at the 2013 MTV Video Music Awards #vma http:\/\/t.co\/sbasRG2uR5","source":"\u003ca href=\"http:\/\/twitter.com\/tweetbutton\" rel=\"nofollow\"\u003eTweet 
Button\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1257566766,"id_str":"1257566766","name":"Mrs.Stypayhorlikson","screen_name":"SummerLove1D_22","location":"Hagerstown","url":null,"description":"Im almost 12 my birthday is June 22 2001 and Im in love with One Direction","protected":false,"followers_count":9,"friends_count":18,"listed_count":0,"created_at":"Sun Mar 10 17:45:42 +0000 2013","favourites_count":99,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"verified":false,"statuses_count":166,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000130643107\/223e3db5deee03a480890359704c5e53_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000130643107\/223e3db5deee03a480890359704c5e53_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1257566766\/1364153406","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"vma","indices":[82,86]}],"urls":[{"url":"http:\/\/t.co\/sbasRG2uR5","expanded_url":"http:\/\/www.mtv.com\/ontv\/vma\/2013\/best-rock-video\/","display_url":"mtv.com\/ontv\/vma\/2013\/\u2026","indices":[87,109]}],"user_mentions":[{"screen_name":"fallou
tboy","name":"Fall Out Boy","id":16212952,"id_str":"16212952","indices":[12,23]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230175576066,"id_str":"365611230175576066","text":"Acho q meu pai vai deixar eu ir no aniver da Lu","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":342305530,"id_str":"342305530","name":"P\u00e2mela Abreu ","screen_name":"p_pamis","location":"","url":"http:\/\/instagram.com\/p_pamis","description":"http:\/\/maybe--yes.tumblr.com","protected":false,"followers_count":337,"friends_count":264,"listed_count":1,"created_at":"Mon Jul 25 20:10:27 +0000 2011","favourites_count":19,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":32287,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"0F0F0F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045425771\/5f49bdb48a74c247d43f5952688965a6.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045425771\/5f49bdb48a74c247d43f5952688965a6.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000249678219\/aeda447f256787aa4770102e776f3aa8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000249678219\/aeda447f256787aa4770102e776f3aa8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/342305530\/1375226575","profile_link_color":"0C99CC","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F058DE","profile_text_color":"EB84E4","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_s
ent":null,"notifications":null},"geo":null,"coordinates":null,"place":{"id":"894146230dd1d42d","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/894146230dd1d42d.json","place_type":"city","name":"Porto Alegre","full_name":"Porto Alegre, Rio Grande do Sul","country_code":"BR","country":"Brasil","bounding_box":{"type":"Polygon","coordinates":[[[-51.3061478,-30.2688069],[-51.3061478,-29.9306357],[-51.012471,-29.9306357],[-51.012471,-30.2688069]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196535296,"id_str":"365611230196535296","text":"@dannybwoi49 @numiiigoesrawrz just letting you guys know cause a lot of bronies don't accept when people don't want to be bronies. Idc realy","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610936733671424,"in_reply_to_status_id_str":"365610936733671424","in_reply_to_user_id":82157444,"in_reply_to_user_id_str":"82157444","in_reply_to_screen_name":"dannybwoi49","user":{"id":1412486018,"id_str":"1412486018","name":"Nick Steele","screen_name":"PullsenMoosic","location":"Agawam, MA","url":null,"description":"Most of my music comes from a morbidly aggressive feline species. Me and @evmiester98 like to make people laugh. Ponies are cool. 
#N7Pride #CertifiedKREW","protected":false,"followers_count":287,"friends_count":514,"listed_count":1,"created_at":"Wed May 08 10:19:58 +0000 2013","favourites_count":988,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":3483,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247189236\/59b978fd5997c25cf8f5ca083b60c778_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247189236\/59b978fd5997c25cf8f5ca083b60c778_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1412486018\/1375037721","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"dannybwoi49","name":"Danny","id":82157444,"id_str":"82157444","indices":[0,12]},{"screen_name":"numiiigoesrawrz","name":"Angiecakez","id":1146876042,"id_str":"1146876042","indices":[13,29]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179766272,"id_str":"365611230179766272","text":"RT @Fix0Bieber: S\u00edgueme y te sigo, doy follow back <3.","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" 
rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1127324624,"id_str":"1127324624","name":"\u2003","screen_name":"JustinPromises","location":"\u2661 Te quiero Vale \u2661","url":null,"description":"\u263a all love ever does is breaks, and burns, and ends\u263a","protected":false,"followers_count":1803,"friends_count":1307,"listed_count":2,"created_at":"Mon Jan 28 07:40:47 +0000 2013","favourites_count":26,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":202,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044346309\/f6a87128064a63b5484722c8a3e9a10a.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044346309\/f6a87128064a63b5484722c8a3e9a10a.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241172577\/7c4866579fa7556665436c72300abe6d_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241172577\/7c4866579fa7556665436c72300abe6d_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1127324624\/1375640775","profile_link_color":"90B9D4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:57 +0000 2013","id":365611114555379712,"id_str":"365611114555379712","text":"S\u00edgueme y te sigo, doy follow back 
<3.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1350279512,"id_str":"1350279512","name":"Diana.","screen_name":"Fix0Bieber","location":"JB&1D\u2665","url":null,"description":"11. \u00abLos mejores amigos si existen\u00bb Paola\/Julio\/Andrea\u2665","protected":false,"followers_count":1367,"friends_count":1103,"listed_count":5,"created_at":"Sat Apr 13 21:31:02 +0000 2013","favourites_count":1691,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":5393,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045368469\/e9d0804c1fe6fc7606e4a77e31097761.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045368469\/e9d0804c1fe6fc7606e4a77e31097761.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247480671\/027517318f225787b0237689c35686bf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247480671\/027517318f225787b0237689c35686bf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1350279512\/1375745868","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7A7A7A","profile_text_color":"B3B3B3","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Fix0Bieber
","name":"Diana.","id":1350279512,"id_str":"1350279512","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230171365377,"id_str":"365611230171365377","text":"@gma_natau \u304a\u306f\u30fc\u30fc\u3088\uff3c(^o^)\uff0f\uff3c(^o^)\uff0f\u2661","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610406569459713,"in_reply_to_status_id_str":"365610406569459713","in_reply_to_user_id":1266441366,"in_reply_to_user_id_str":"1266441366","in_reply_to_screen_name":"gma_natau","user":{"id":1295344352,"id_str":"1295344352","name":"\u304d\u3063\u305f\u3093@11.12\u65e5\u6771\u4eac\u884c\u304d\u307e\u3059w","screen_name":"kittan_mh","location":"","url":null,"description":"\u305f\u3060\u306e\u8150\u5973\u5b50(\u30fb\u03c9\u30fb)\u5c11\u3057\u5909\u614b\u3002\u6210\u4eba\u6e08\u3002\u5909\u614b\u767a\u8a00\u901a\u5e38\u88c5\u5099\u3002 \u30bb\u30ab\u30b3\u30a4\/\u7d14\u30ed\u30de\/\u751f\u5f92\u4f1a\u9577\u306b\u5fe0\u544a\/\u604b\u3059\u308b\u66b4\u541b\/\u9177\u304f\u3057\u306a\u3044\u3067 \/\/\u30e8\u30cd\u30c0\u30b3\u30a6\u3055\u3093\/\u306d\u3053\u7530\u7c73\u8535\u3055\u3093\/\u8170\u4e43\u3055\u3093\/\u685c\u8cc0\u3081\u3044\u3055\u3093(\uff9f\u2200\uff9f)\u597d\u307f\u304c\u540c\u3058\u4eba\u3088\u308d\u3057\u304f( \u00b4 \u25bd ` )\uff89\u304a\u4ef2\u9593\u6b53\u8fce","protected":false,"followers_count":66,"friends_count":107,"listed_count":0,"created_at":"Sun Mar 24 13:50:29 +0000 
2013","favourites_count":570,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":4969,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3590904688\/4c9065c1e2c968e6580f3494b8c63ea4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3590904688\/4c9065c1e2c968e6580f3494b8c63ea4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1295344352\/1364803939","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"gma_natau","name":"\uff0a\u30b7\u30ed\u30af\u30ed\uff0a\uff72\uff6a\uff6a\uff6a\uff76\uff9e\uff67\uff67\uff67\u4e2d","id":1266441366,"id_str":"1266441366","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179766274,"id_str":"365611230179766274","text":"\u0627\u062d\u062f \u0627\u0644\u063a\u0631\u0628\u0627\u0621 \u0647\u0645\u0633 \u0644\u064a \u0642\u0627\u0626\u0644\u0627 \u0639\u064a\u0646\u0627\u0643 \u0628\u064e \u0644\u0645\u0639\u0629 \u0622\u0644\u0627\u0634\u062a\u064a\u0627\u0642 \u0641\u0627\u062a\u0646\u0647 \u062d\u062f \u0627\u0644\u0648\u062c\u0639 <\/3\"","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1602358909,"id_str":"1602358909","name":"*\u0628\u0646\u062a \u0645\u062d\u0645\u062f...~","screen_name":"ho0dho0d123491","location":"","url":null,"description":"\u0642\u064e\u0640\u062f \u064a\u064e\u062a\u062e\u0627\u064e\u0640\u0649\u0651 \u0643\u064f\u0644\u0651 \u0634\u064e\u0626\u0621 \u0639\u064e\u0646\u0643\u064e \u061b \u0648\u064e \u064a\u064e\u0628\u0642\u0649\u0651 \u0645\u064e\u0639\u0643\u064e \u0627\u0644\u0644\u0647 \u061b \u0641\u064e\u0643\u064f\u0646 \u0645\u064e\u0639\u064e #\u0627\u0644\u0644\u0647 \u061b \u064a\u064e\u0628\u0642\u0649\u0651 \u0643\u064f\u0644\u0651 \u0634\u064e\u0626\u0621 \u0645\u064e\u0639\u0643\u064e \u2665","protected":false,"followers_count":504,"friends_count":1333,"listed_count":0,"created_at":"Thu Jul 18 02:13:22 +0000 2013","favourites_count":236,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1542,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259290494\/1f420e689abd8b91de362e9f382e4bb8_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259290494\/1f420e689abd8b91de362e9f382e4bb8_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1602358909\/1374871207","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following
":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230204932098,"id_str":"365611230204932098","text":"Por falar em skate, quero comprar um","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":330747740,"id_str":"330747740","name":"Cavalcante","screen_name":"arixtin","location":"Pernambucano","url":null,"description":"Free your mind!","protected":false,"followers_count":474,"friends_count":289,"listed_count":0,"created_at":"Thu Jul 07 02:56:01 +0000 2011","favourites_count":231,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":66291,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035794349\/3c773ec227ce2333f520bb886dcd2b60.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035794349\/3c773ec227ce2333f520bb886dcd2b60.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000250002950\/0fbf565a84fc27b3ea7407698dbe6154_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000250002950\/0fbf565a84fc27b3ea7407698dbe6154_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/330747740\/1375795380","profile_link_color":"060B70","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_tex
t_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230171381760,"id_str":"365611230171381760","text":"\u671d\u304b\u3089\u751f\u7269\u3068\u304b","source":"\u003ca href=\"http:\/\/www.docodemo.jp\/twil\/\" rel=\"nofollow\"\u003eTwil2 (Tweet Anytime, Anywhere by Mail)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":805905990,"id_str":"805905990","name":"\u512a\u862d\uff20\u590f\u82e6\u624b","screen_name":"xoxowxyz_1009","location":"moon cold 3\u306e\uff11","url":"http:\/\/ameblo.jp\/nnnxx-winter\/","description":"\u203b\u512a\u862d(\u3046\u3089\u3093)\u306f\u672c\u540d\u3058\u3083\u3042\u308a\u307e\u305b\u3093\r\n\u81ea\u5206\u78e8\u304d\u4e2d\u306e\u53d7\u9a13\u751f\r\n\u30a2\u30a4\u30b3\u30f3\u306f\u3044\u3063\u3055@\u30ca\u30ea*\u3064\u3050\u307fetc\uff0e\u3055\u3093\uff08http:\/\/t.co\/gJpzdZUc1Y\uff09\u306b\u63cf\u3044\u3066\u3044\u305f\u3060\u304d\u307e\u3057\u305f\u30fc\u3042\u308a\u304c\u3068\u3046\u266a\r\n\u304f\u308f\u3057\u304f\u306fhttp:\/\/t.co\/YICkUkhsjh\r\n\u30d6\u30ed\u30b0\u3084\u3063\u3066\u308b\u304b\u3089\u305d\u3063\u3061\u3082\u3088\u308d\u3057\u304f\u306d","protected":false,"followers_count":573,"friends_count":611,"listed_count":2,"created_at":"Thu Sep 06 02:54:28 +0000 
2012","favourites_count":66,"utc_offset":32400,"time_zone":"Sapporo","geo_enabled":false,"verified":false,"statuses_count":14258,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/805846312\/47187403623384ca3b07fb985f144987.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/805846312\/47187403623384ca3b07fb985f144987.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3685158102\/c2e4bf0ff8be3a2b66a29ca25ef6e44c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3685158102\/c2e4bf0ff8be3a2b66a29ca25ef6e44c_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230200733696,"id_str":"365611230200733696","text":"Crystal I Got A Game For Us To Play..","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":403742570,"id_str":"403742570","name":"jUss ME.","screen_name":"DezzyRawKidd","location":"","url":null,"description":"jUsss Me","protected":false,"followers_count":386,"friends_count":314,"listed_count":0,"created_at":"Wed Nov 02 23:01:28 +0000 
2011","favourites_count":30,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":28642,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000253102953\/8b18937897bdc431f36886ea75954cda_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000253102953\/8b18937897bdc431f36886ea75954cda_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/403742570\/1375843229","profile_link_color":"FF0000","profile_sidebar_border_color":"65B0DA","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230192336897,"id_str":"365611230192336897","text":"RT @Diegosandoval5: OREO OREO OREO OREO OREO OREO http:\/\/t.co\/MDwczX5ZH3","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":68980051,"id_str":"68980051","name":"(\u25d5\u203f\u25d5)","screen_name":"NoodlesEater","location":"\u262f","url":"http:\/\/fffiou.tumblr.com\/","description":"You may say that i'm a dreamer but i'm not the only one . 
\u262e&\u2764 #Potterhead #TEAMNOODLES #TeamMaladroite","protected":false,"followers_count":140,"friends_count":141,"listed_count":2,"created_at":"Wed Aug 26 13:45:04 +0000 2009","favourites_count":130,"utc_offset":7200,"time_zone":"Paris","geo_enabled":true,"verified":false,"statuses_count":20276,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/446555319\/424699_330314347005546_211540222216293_840256_1540237589_n.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/446555319\/424699_330314347005546_211540222216293_840256_1540237589_n.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3441802901\/a0ba4a48f10c9e5fe710f79d4c8595e5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3441802901\/a0ba4a48f10c9e5fe710f79d4c8595e5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/68980051\/1371247312","profile_link_color":"09639E","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C1EBE3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 18:58:52 +0000 2013","id":365547676722421760,"id_str":"365547676722421760","text":"OREO OREO OREO OREO OREO OREO http:\/\/t.co\/MDwczX5ZH3","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":248686053,"id_str":"248686053","name":"Diego","screen_name":"Diegosandoval5","location":"La esquina del 
infinito","url":"http:\/\/diegosandovalvalle.jimdo.com\/","description":"Aries. Vivo en lo fr\u00e1gil de la locura. Insoportablemente Vivo. Pecador por Hobby. Le canto baladas al diablo y a la muerte. Si te ofendes es tu problema.","protected":false,"followers_count":581,"friends_count":603,"listed_count":0,"created_at":"Mon Feb 07 14:29:48 +0000 2011","favourites_count":4009,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":33281,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042540849\/97bc8dd3543a2dbdb9343355fe976740.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042540849\/97bc8dd3543a2dbdb9343355fe976740.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260782122\/175d7493197547b2601eb586e8289e0f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260782122\/175d7493197547b2601eb586e8289e0f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/248686053\/1375885757","profile_link_color":"B50D0D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":7,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365547676730810368,"id_str":"365547676730810368","indices":[30,52],"media_url":"http:\/\/pbs.twimg.com\/media\/BRKvrNwCYAANrKo.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRKvrNwCYAANrKo.jpg","url":"http:\/\/t.co\/MDwczX5ZH3","display_url":"pic.twitter.com\/MDwczX5ZH3","exp
anded_url":"http:\/\/twitter.com\/Diegosandoval5\/status\/365547676722421760\/photo\/1","type":"photo","sizes":{"small":{"w":320,"h":246,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":320,"h":246,"resize":"fit"},"medium":{"w":320,"h":246,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"lang":"it"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Diegosandoval5","name":"Diego","id":248686053,"id_str":"248686053","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"it"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230188158977,"id_str":"365611230188158977","text":"RT @memonaa_: #FF pour ma ch\u00e9rie @RealChanty la meilleure de toute, je t'aime. \u2764","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":456174289,"id_str":"456174289","name":"Real Chanty","screen_name":"RealChanty","location":"Transylvanie","url":"http:\/\/www.facebook.com\/realchantymusic","description":"Chanteuse\/Singer\/Vampire - Working on my 1st album with @MMCLabel and @HKCorp_Officiel. 
Obsessed with video games, tattoos, Heath Ledger & Grimm's fairy tales.","protected":false,"followers_count":3316,"friends_count":905,"listed_count":17,"created_at":"Thu Jan 05 23:33:03 +0000 2012","favourites_count":13736,"utc_offset":7200,"time_zone":"Paris","geo_enabled":false,"verified":false,"statuses_count":22563,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/753079816\/51f1858d05c962eaeb32886519f29f8d.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/753079816\/51f1858d05c962eaeb32886519f29f8d.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251911785\/ec0999c1b03e3761ee777180598e26de_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251911785\/ec0999c1b03e3761ee777180598e26de_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/456174289\/1375322145","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FAD5FA","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:09:06 +0000 2013","id":365595551019958273,"id_str":"365595551019958273","text":"#FF pour ma ch\u00e9rie @RealChanty la meilleure de toute, je t'aime. 
\u2764","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":624077899,"id_str":"624077899","name":"\u2112\u2134\u03bd\u212f\u2661","screen_name":"memonaa_","location":"","url":null,"description":"While there's life, there's hope\u2661 A et L\u2661. matt | baba | tal | pleyers | chanty | justin Vampire diaries, best.","protected":false,"followers_count":763,"friends_count":758,"listed_count":2,"created_at":"Sun Jul 01 20:47:47 +0000 2012","favourites_count":1543,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":15857,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/883433983\/a7e4655fd0505c856ab4460576c680b7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/883433983\/a7e4655fd0505c856ab4460576c680b7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000250025088\/a7ed94f6c7814b6ee203d35d3896434d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000250025088\/a7ed94f6c7814b6ee203d35d3896434d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/624077899\/1375742834","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[{"text":"FF","indices":[0,3]}],"
urls":[],"user_mentions":[{"screen_name":"RealChanty","name":"Real Chanty","id":456174289,"id_str":"456174289","indices":[19,30]}]},"favorited":false,"retweeted":false,"lang":"fr"},"retweet_count":0,"entities":{"hashtags":[{"text":"FF","indices":[14,17]}],"urls":[],"user_mentions":[{"screen_name":"memonaa_","name":"\u2112\u2134\u03bd\u212f\u2661","id":624077899,"id_str":"624077899","indices":[3,12]},{"screen_name":"RealChanty","name":"Real Chanty","id":456174289,"id_str":"456174289","indices":[33,44]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196531200,"id_str":"365611230196531200","text":"Hungryyyyy","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":345478726,"id_str":"345478726","name":"JESS. 
","screen_name":"Jessicaaa_V","location":"LONDON | England","url":null,"description":"\u262c Hey :)| instagram: @_jess_v ||I think you should follow me ;) x","protected":false,"followers_count":3360,"friends_count":2840,"listed_count":7,"created_at":"Sat Jul 30 17:28:01 +0000 2011","favourites_count":48,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":37434,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/411656309\/urban-art-city-of-london-thames.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/411656309\/urban-art-city-of-london-thames.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3549123355\/d2ae6c0f3a283504dd3f19b155686b67_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3549123355\/d2ae6c0f3a283504dd3f19b155686b67_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/345478726\/1375115981","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"vi"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230192336896,"id_str":"365611230192336896","text":"Tomando cocacola. 
https:\/\/t.co\/sM6Iq8J2Jp","source":"\u003ca href=\"http:\/\/vine.co\" rel=\"nofollow\"\u003eVine - Make a Scene\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":188325684,"id_str":"188325684","name":"\u00bfeso es caca? mola.","screen_name":"payasosuicida","location":"","url":"http:\/\/www.youtube.com\/user\/mystrangeblog?feature=mhee","description":"\u25b2 Quiero cambiar el mundo \u25b2","protected":false,"followers_count":190,"friends_count":132,"listed_count":1,"created_at":"Wed Sep 08 13:31:16 +0000 2010","favourites_count":10,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":false,"verified":false,"statuses_count":5562,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/844223283\/fb4c4f50b01726cd77576853d685c8b5.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/844223283\/fb4c4f50b01726cd77576853d685c8b5.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000108935821\/3717759b37cdfa020d6834f9d5684a1b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000108935821\/3717759b37cdfa020d6834f9d5684a1b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/188325684\/1375137860","profile_link_color":"000203","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/sM6Iq8J2Jp","expanded_url":"https
:\/\/vine.co\/v\/hhqrMw6V2tF","display_url":"vine.co\/v\/hhqrMw6V2tF","indices":[18,41]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230200737792,"id_str":"365611230200737792","text":"28.OOO voor @AnoukILOVEYOU_x & @EricMuller65 :)","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":520531362,"id_str":"520531362","name":"kim \u2661","screen_name":"JustKim_x","location":"06'09'03`Opa\u271d ","url":null,"description":"\u3163 B\u043ebbels met @Daniel_Thijssen \u2662\u3163#stamAghotonakienastan w @oguzhanx38\u3163ArianaGrande\u3163NeymarJR11\u3163Hayy x @JELLExROODE\u3163I love you classmate x @BRAMROODE","protected":false,"followers_count":414,"friends_count":314,"listed_count":0,"created_at":"Sat Mar 10 16:24:06 +0000 
2012","favourites_count":1269,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":28000,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"FC4E88","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/865825583\/6a0939ae6e259e437d9c29c9aff2cfcb.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/865825583\/6a0939ae6e259e437d9c29c9aff2cfcb.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000155706885\/644ada991214305325eed8b85c893a12_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000155706885\/644ada991214305325eed8b85c893a12_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/520531362\/1375207824","profile_link_color":"F7D2DD","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EEAEE0","profile_text_color":"FBB3F0","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"AnoukILOVEYOU_x","name":"Anouk","id":308350221,"id_str":"308350221","indices":[12,28]},{"screen_name":"EricMuller65","name":"Eric ","id":409347019,"id_str":"409347019","indices":[36,49]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230171369472,"id_str":"365611230171369472","text":"@nega_rockwood \u304a\u3072\u3083\u30fc","source":"\u003ca href=\"http:\/\/yubitter.com\/\" 
rel=\"nofollow\"\u003eyubitter\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610265649229824,"in_reply_to_status_id_str":"365610265649229824","in_reply_to_user_id":327817062,"in_reply_to_user_id_str":"327817062","in_reply_to_screen_name":"nega_rockwood","user":{"id":423219530,"id_str":"423219530","name":"\u30bf\u30ab","screen_name":"takaso2","location":"\u95a2\u897f","url":"http:\/\/www.pixiv.net\/member.php?id=4351401","description":"\u9ad8\u6c38\u3072\u306a\u3053\u5148\u751f\/\u5d0e\u8c37\u306f\u308b\u3072\u5148\u751f\/\u30bf\u30a4\u30d0\u30cb\/\u9ed2\u30d0\u30b9\/FF\u96f6\u5f0f\/\u305d\u306e\u4ed6\u3001\u8272\u3005\u6f2b\u753b\u3001BL\u542b\u3080\u3002\u65e5\u5e38\u306e\u4ed6\u611b\u306a\u3044\u3064\u3076\u3084\u304d\u591a\u6570\u3002\u7a81\u7136\u5b9f\u6cc1\u59cb\u3081\u305f\u308a\u3001\u8150\u767a\u8a00\u3082\u3042\u308b\u306e\u3067\u3001\u3054\u6ce8\u610f\u3092\u3002\u304a\u6c17\u8efd\u306b\u8a71\u3057\u304b\u3051\u3066\u4e0b\u3055\u3044\u3002\u6700\u8fd1\u3001\u9ed2\u30d0\u30b9\u30c4\u30a4\u30fc\u30c8\u304c\u304b\u306a\u308a\u591a\u3044\u3067\u3059\u3002\u8272\u3005\u96d1\u98df\u3067\u3059\u3002\u5730\u96f7\u3042\u308b\u65b9\u3054\u6ce8\u610f\u3067\u3059\u3002\u9ed2\u30d0\u30b9\u5984\u60f3\u57a2\u2192\u3010@takakurobasu\u3011","protected":false,"followers_count":193,"friends_count":269,"listed_count":4,"created_at":"Mon Nov 28 07:08:19 +0000 
2011","favourites_count":606,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":76541,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme18\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme18\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247998359\/5e8dfbc38e34aee70816251ae87a3120_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247998359\/5e8dfbc38e34aee70816251ae87a3120_normal.jpeg","profile_link_color":"038543","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"nega_rockwood","name":"\u3057\u3052\u306e\u03b2\u7248","id":327817062,"id_str":"327817062","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230192340992,"id_str":"365611230192340992","text":"@KayyMarieeeee gotta grow up","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610865317265409,"in_reply_to_status_id_str":"365610865317265409","in_reply_to_user_id":620029742,"in_reply_to_user_id_str":"620029742","in_reply_to_screen_name":"KayyMarieeeee","user":{"id":219491797,"id_str":"219491797","name":"[T]he [S]tandard \u2122","screen_name":"iD_RobHer","location":"Miami - Gainesville, Fl","url":null,"description":"Future CEO | 
\u0391\u039b\u03a6\u0391 \u03a6\u0399 \u0391\u039b\u03a6\u0391 | #TTOTS #FOOLIES ELECTRICAL ENGINEERING YA BISHHHHH","protected":false,"followers_count":928,"friends_count":916,"listed_count":3,"created_at":"Thu Nov 25 00:52:00 +0000 2010","favourites_count":3607,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":42315,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/454777539\/spring_phirst-1.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/454777539\/spring_phirst-1.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000235706233\/1384b47709def3065e10db97750295b4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000235706233\/1384b47709def3065e10db97750295b4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/219491797\/1374724865","profile_link_color":"DEAF24","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"KayyMarieeeee","name":"Kadejiah Reaves ","id":620029742,"id_str":"620029742","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179762177,"id_str":"365611230179762177","text":"!! 
\u201c@TerryLime: My mood is 1000% killed.\u201d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611126140059648,"in_reply_to_status_id_str":"365611126140059648","in_reply_to_user_id":340347370,"in_reply_to_user_id_str":"340347370","in_reply_to_screen_name":"TerryLime","user":{"id":762858266,"id_str":"762858266","name":"Ming Lee","screen_name":"Mia_LaBoricua","location":"off","url":null,"description":"fuck","protected":false,"followers_count":467,"friends_count":372,"listed_count":0,"created_at":"Fri Aug 17 03:00:43 +0000 2012","favourites_count":2037,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":23127,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"BADFCD","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/695959527\/c2addf6cdd166c2939f5d908b64db977.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/695959527\/c2addf6cdd166c2939f5d908b64db977.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000232861327\/1eeffa6d014717cb28ff86c8337bd76c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000232861327\/1eeffa6d014717cb28ff86c8337bd76c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/762858266\/1374075956","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TerryLime","name":"Biggus 
Dickus.","id":340347370,"id_str":"340347370","indices":[4,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230188158976,"id_str":"365611230188158976","text":"cherish, jessica , & nyla not... ima ask abri & yas ffrfr..","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":422967017,"id_str":"422967017","name":"Mind Over Matter.\u2661","screen_name":"Pink_ChynaDolls","location":"me & myBolos.\u2661","url":null,"description":"class of 2k4teen. \u2661","protected":false,"followers_count":1905,"friends_count":772,"listed_count":0,"created_at":"Sun Nov 27 22:30:05 +0000 2011","favourites_count":330,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":false,"verified":false,"statuses_count":28002,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/651888802\/8ftph8350n636vebuxfz.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/651888802\/8ftph8350n636vebuxfz.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000127262329\/b0e9d46f72b22db001d6c6686b64ead6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000127262329\/b0e9d46f72b22db001d6c6686b64ead6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/422967017\/1367202479","profile_link_color":"F5C400","profile_sidebar_border_color":"FFCC00","profile_sidebar_fill_color":"FF3C00","profile_text_color":"FF8400","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"f
ollowing":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230167183361,"id_str":"365611230167183361","text":"RT @MOADI_20: \u0643\u0627\u0646 \u0647\u0648 \u0631\u0648\u0627\u064a\u062a\u064a \u0627\u0644\u062a\u064a \u0627\u0633\u062a\u0645\u062a\u0639 \u0628\u0642\u0631\u0627\u0621\u0629 \u062a\u0641\u0627\u0635\u064a\u0644\u0647 \u0648\u0627\u0635\u0628\u062d \u0647\u0627\u0645\u0634\u0627\u064b \u0641\u064a \u062f\u0641\u062a\u0631 \u0627\u062d\u062f\u0627\u062b\u064a \u0644\u0627 \u064a\u0634\u062f\u0646\u064a \u0644\u0644\u0645\u0631\u0648\u0631 \u0628\u0647 \u062d\u062a\u0649 . http:\/\/t.co\/msc50Myy4u","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1413378602,"id_str":"1413378602","name":"\u0641\u064a\u0635\u0644 \u0627\u0644\u0642\u0631\u0634\u064a","screen_name":"ffaa_1434","location":"","url":null,"description":null,"protected":false,"followers_count":78,"friends_count":114,"listed_count":0,"created_at":"Wed May 08 17:23:55 +0000 
2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":270,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242749334\/1a935fa8aca3b2a972697ebd0963492b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242749334\/1a935fa8aca3b2a972697ebd0963492b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1413378602\/1375898187","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Feb 24 05:59:04 +0000 2013","id":305557435089842176,"id_str":"305557435089842176","text":"\u0643\u0627\u0646 \u0647\u0648 \u0631\u0648\u0627\u064a\u062a\u064a \u0627\u0644\u062a\u064a \u0627\u0633\u062a\u0645\u062a\u0639 \u0628\u0642\u0631\u0627\u0621\u0629 \u062a\u0641\u0627\u0635\u064a\u0644\u0647 \u0648\u0627\u0635\u0628\u062d \u0647\u0627\u0645\u0634\u0627\u064b \u0641\u064a \u062f\u0641\u062a\u0631 \u0627\u062d\u062f\u0627\u062b\u064a \u0644\u0627 \u064a\u0634\u062f\u0646\u064a \u0644\u0644\u0645\u0631\u0648\u0631 \u0628\u0647 \u062d\u062a\u0649 . 
http:\/\/t.co\/msc50Myy4u","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":331666702,"id_str":"331666702","name":"\u0645\u064f\u0648\u0636\u064a\u0651 \u0639\u064e\u0628\u062f\u064f\u0627\u0644\u0644\u0647.","screen_name":"MOADI_20","location":"\u0627\u0644\u0634\u064e\u0631\u0642\u064a\u0647 | \u0627\u0633\u0643\u064f\u0646\u0651 \u0645\u064e\u0644\u0627\u0645\u064e\u062d\u0647.","url":"http:\/\/sayat.me\/moadi","description":"+ \u0645\u064f\u062a\u0628\u0644\u0651\u062f\u0647 : \u062d\u062a\u0649 \u0627\u0646\u0651\u064a \u0623\u0643\u062a\u0628\u064f \u0643\u062b\u064a\u0631\u0627\u064b \u0639\u064e\u0646 \u0627\u0644\u0641\u0642\u062f\u0652 \u0648\u0627\u0644\u062d\u0646\u0650\u064a\u0646 \u062f\u0648\u0646\u064e \u062a\u063a\u064a\u0651\u0631 \u0641\u064a \u0645\u0644\u0627\u0645\u0650\u062d \u0648\u064e\u062c\u0647\u064a\u0650 .","protected":false,"followers_count":69568,"friends_count":641,"listed_count":120,"created_at":"Fri Jul 08 14:49:44 +0000 
2011","favourites_count":84,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":4215,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"0A0B12","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/774263356\/79223e703e1c575559b7fd4c79e40b18.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/774263356\/79223e703e1c575559b7fd4c79e40b18.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000125420704\/e5cac4b7a5a47a3e96982ce5ddb8e3ac_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000125420704\/e5cac4b7a5a47a3e96982ce5ddb8e3ac_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/331666702\/1366295331","profile_link_color":"0D0E0F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":466,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":305557435094036481,"id_str":"305557435094036481","indices":[94,116],"media_url":"http:\/\/pbs.twimg.com\/media\/BD2O3klCcAE3j8G.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BD2O3klCcAE3j8G.jpg","url":"http:\/\/t.co\/msc50Myy4u","display_url":"pic.twitter.com\/msc50Myy4u","expanded_url":"http:\/\/twitter.com\/MOADI_20\/status\/305557435089842176\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":500,"h":329,"resize":"fit"},"large":{"w":500,"h":329,"resize":"fit"},"small":{"w":340,"h":224,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"ar"},"retweet_count":0,"entities":{"hashtag
s":[],"urls":[],"user_mentions":[{"screen_name":"MOADI_20","name":"\u0645\u064f\u0648\u0636\u064a\u0651 \u0639\u064e\u0628\u062f\u064f\u0627\u0644\u0644\u0647.","id":331666702,"id_str":"331666702","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230196531201,"id_str":"365611230196531201","text":"@raylovscb oi ovo, td bem?","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":318655837,"in_reply_to_user_id_str":"318655837","in_reply_to_screen_name":"raylovscb","user":{"id":904972081,"id_str":"904972081","name":"q ","screen_name":"fuckcbrown","location":"raylovscb\u2665","url":null,"description":"\u262f cuz i'm so shy, i'm fucking with you \u262f","protected":false,"followers_count":866,"friends_count":706,"listed_count":0,"created_at":"Fri Oct 26 00:25:53 +0000 2012","favourites_count":1338,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":true,"verified":false,"statuses_count":17296,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDEDED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035011543\/90b708c8315edd8d9ade5b7e65a1246e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035011543\/90b708c8315edd8d9ade5b7e65a1246e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261852985\/607d1bc43ce3f613eb79123dcad79c6a_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261852985\/607d1bc43ce3f613eb79123dcad79c6a_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/904972081\/1375999527","profile_link_color":"030303","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_
image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"raylovscb","name":"k","id":318655837,"id_str":"318655837","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230179762178,"id_str":"365611230179762178","text":"Yup. RT @AdamSchefter: Agree or disagree with this headline? http:\/\/t.co\/4dCnStLR1L","source":"\u003ca href=\"http:\/\/www.echofon.com\/\" rel=\"nofollow\"\u003eEchofon\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":249407593,"id_str":"249407593","name":"Lance Vance","screen_name":"ThisIsntXav","location":"Long Island","url":null,"description":"Volume tweeter. Crystal Palace FC. Eph. 
Yup.","protected":false,"followers_count":533,"friends_count":322,"listed_count":0,"created_at":"Wed Feb 09 00:14:33 +0000 2011","favourites_count":492,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":28406,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/724606035\/412e9b3743c295a944024f971f093cc2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/724606035\/412e9b3743c295a944024f971f093cc2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247333090\/5b05e0ba7d03849a4e2e611d43dea3f3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247333090\/5b05e0ba7d03849a4e2e611d43dea3f3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/249407593\/1370173846","profile_link_color":"93A644","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/4dCnStLR1L","expanded_url":"http:\/\/instagram.com\/p\/cxSgQfFOUK\/","display_url":"instagram.com\/p\/cxSgQfFOUK\/","indices":[61,83]}],"user_mentions":[{"screen_name":"AdamSchefter","name":"Adam Schefter","id":51263592,"id_str":"51263592","indices":[8,21]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230171365376,"id_str":"365611230171365376","text":"ard Dujaun dang lol","source":"\u003ca 
href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":708569234,"id_str":"708569234","name":"'Inami","screen_name":"Mondroo_","location":"","url":null,"description":"VI.XVII.XCIV , 8\/16\/12 8:42pm @realtamiaworld Mentioned me!!!","protected":false,"followers_count":420,"friends_count":309,"listed_count":0,"created_at":"Sat Jul 21 07:32:57 +0000 2012","favourites_count":126,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":14369,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/654464634\/rl9iaxnyiuzuy3lm6x5g.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/654464634\/rl9iaxnyiuzuy3lm6x5g.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000217565935\/e3cf5d7b8cbcaf8eabb0546fa1669d2a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000217565935\/e3cf5d7b8cbcaf8eabb0546fa1669d2a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/708569234\/1375937049","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:24 +0000 
2013","id":365611230179766273,"id_str":"365611230179766273","text":"RT @_demilolvato: esse comercial do mc donalds \u00e9 muito tosco","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":836304967,"id_str":"836304967","name":"isa ","screen_name":"whyhoyne","location":"","url":"http:\/\/whyhoyne.tumblr.com","description":"all is gay","protected":false,"followers_count":671,"friends_count":667,"listed_count":1,"created_at":"Thu Sep 20 20:28:11 +0000 2012","favourites_count":845,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":33112,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000006925340\/bdf9a1a7ffa7c8044a4dcc10587a6b29.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000006925340\/bdf9a1a7ffa7c8044a4dcc10587a6b29.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000193548587\/612720137e2372839e31eda4d432af94_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000193548587\/612720137e2372839e31eda4d432af94_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/836304967\/1374881452","profile_link_color":"B093C7","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:27 +0000 2013","id":365610990190067712,"id_str":"365610990190067712","text":"esse comercial do mc 
donalds \u00e9 muito tosco","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":605863435,"id_str":"605863435","name":"georgia","screen_name":"_demilolvato","location":"","url":"https:\/\/twitter.com\/justinbieber\/status\/363620860537868288","description":null,"protected":false,"followers_count":2077,"friends_count":1912,"listed_count":0,"created_at":"Tue Jun 12 00:12:24 +0000 2012","favourites_count":116,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":false,"verified":false,"statuses_count":35781,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046886711\/a44d6c49071316d240b31908a67e1bc3.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046886711\/a44d6c49071316d240b31908a67e1bc3.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257589686\/8214023cb90fb9895d8fc7a0a0091487_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257589686\/8214023cb90fb9895d8fc7a0a0091487_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/605863435\/1375921880","profile_link_color":"FF4794","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_demilol
vato","name":"georgia","id":605863435,"id_str":"605863435","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:24 +0000 2013","id":365611230192345089,"id_str":"365611230192345089","text":"me mata assim http:\/\/t.co\/bB05M2HV2x","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1654153386,"id_str":"1654153386","name":"b","screen_name":"shippobilla","location":"100% el guaje ","url":null,"description":"\u2661 not love as the highest scorer of Spain? \u2661","protected":false,"followers_count":25,"friends_count":108,"listed_count":0,"created_at":"Thu Aug 08 00:24:09 +0000 2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":23,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257666819\/04ae8261c3a29bef4ab2f8da9bda9198_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257666819\/04ae8261c3a29bef4ab2f8da9bda9198_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1654153386\/1376001454","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinat
es":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365611230196539393,"id_str":"365611230196539393","indices":[14,36],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpeg3CIAE0z3X.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpeg3CIAE0z3X.jpg","url":"http:\/\/t.co\/bB05M2HV2x","display_url":"pic.twitter.com\/bB05M2HV2x","expanded_url":"http:\/\/twitter.com\/shippobilla\/status\/365611230192345089\/photo\/1","type":"photo","sizes":{"medium":{"w":500,"h":750,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":500,"h":750,"resize":"fit"},"small":{"w":340,"h":510,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234361475074,"id_str":"365611234361475074","text":"y aunque he pagado los impuestos de esta bancarrota que es vivir sin ti","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1616621640,"id_str":"1616621640","name":"Nicolas Andrew","screen_name":"NicolasAndrew5","location":"","url":null,"description":null,"protected":false,"followers_count":8,"friends_count":8,"listed_count":0,"created_at":"Wed Jul 24 02:49:47 +0000 
2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":118,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"0FDBC0","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000031749629\/4cf9e4d1db280624c46b11643d0c0807.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000031749629\/4cf9e4d1db280624c46b11643d0c0807.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000178918293\/7541f9eacdfcd821c23d54147227a00d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000178918293\/7541f9eacdfcd821c23d54147227a00d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1616621640\/1374634664","profile_link_color":"83E316","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382454784,"id_str":"365611234382454784","text":"\u304a\u304a\u304a\u3001\u3044\u304d\u306a\u308a\u611b\u5de5\u5927\u540d\u96fb\u30db\u30fc\u30e0\u30e9\u30f3\uff01","source":"\u003ca href=\"http:\/\/twipple.jp\/\" rel=\"nofollow\"\u003e\u3064\u3044\u3063\u3077\u308b for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":823537279,"id_str":"823537279","name":"\u3061\u306a","screen_name":"lovegs1202","location":"\u9752\u5cf0\u3068\u9ed2\u30d0\u30b9\u773c\u93e1\u30ba\u306b\u56f2\u307e\u308c\u968a","url":"http:\/\/twpf.jp\/lovegs1202","description":"\u9ed2\u30d0\u30b9\u30e1\u30a4\u30f3\u6642\u3005\u305d\u306e\u4ed6\u57a2\u3002\u9ed2\u30d0\u30b9\u5927\u597d\u304d\u3002\u65e5\u5411\/\u706b\u795e\/\u4f0a\u6708\/\u6c34\u6238\u90e8\/\u9ec4\u702c\/\u7dd1\u9593\/\u9ad8\u5c3e\/\u9752\u5cf0\/\u4eca\u5409\u597d\u304d\u3002\u4ed6\uff80\uff72\uff8a\uff9e\uff86\/\u3068\u304d\u30e1\u30e2GS\/APH\/\u4e09\u570b\u6226\u56fd\u7121\u53cc\/BSR\/\u4ed6\uff79\uff9e\uff70\uff91\/\uff71\uff86\uff92\/\u58f0\u512a\u597d\u304d\u3002\u9065\u304b\u6614\u306b\u6210\u4eba\u6e08\u306e\u5922\u597d\u304d\u3002\u30c4\u30a4\u30d7\u30ed\u4f5c\u3063\u3066\u307f\u305f","protected":false,"followers_count":45,"friends_count":51,"listed_count":1,"created_at":"Fri Sep 14 15:59:13 +0000 
2012","favourites_count":741,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":2916,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000214079494\/e5d2cd7521d6a006150c07dc2f464c1b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000214079494\/e5d2cd7521d6a006150c07dc2f464c1b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/823537279\/1375283168","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382450694,"id_str":"365611234382450694","text":"These dumb tigers weather pisses me off","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":547869332,"id_str":"547869332","name":"Harvey","screen_name":"hunter_keller","location":"","url":"http:\/\/25.media.tumblr.com\/a235df4cecb65c767b103b003b6f5585\/tumblr_mhed47zt0f1rmi3q1o1_500.gif","description":"Now stay with me cause im quite quick 5 
6 7 8 :)\nNiall's crazy Mofo","protected":false,"followers_count":248,"friends_count":529,"listed_count":0,"created_at":"Sat Apr 07 20:00:30 +0000 2012","favourites_count":1191,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2046,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258319592\/c744c0af34f55d1a0b5823fb641a2d34_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258319592\/c744c0af34f55d1a0b5823fb641a2d34_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/547869332\/1375934612","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234365669376,"id_str":"365611234365669376","text":"RT @notaslimboy: Masalahnya adalah, ketidaksetujuan dan pandangan terhadap sebuah nilai selalu dipandang sebagai bentuk penghinaan.","source":"\u003ca href=\"http:\/\/www.tweetcaster.com\" rel=\"nofollow\"\u003eTweetCaster for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1562867076,"id_str":"1562867076","name":"pujo","screen_name":"pujo2613","location":"jambi ","url":"http:\/\/pujobuanamotor.wordpress.com","description":"mantan pembalap yg gak smpet jdi idola cwek cwek || salah satu mahasiswa unja ekstensi jurusan ekonomi yg anti akutansi || barca ofc","protected":false,"followers_count":31,"friends_count":59,"listed_count":0,"created_at":"Tue Jul 02 11:07:36 +0000 2013","favourites_count":9,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":533,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000218604790\/6d2753c1be17d0982b4841f90b958ad7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000218604790\/6d2753c1be17d0982b4841f90b958ad7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1562867076\/1375025535","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 19:30:30 +0000 2013","id":365555639948292097,"id_str":"365555639948292097","text":"Masalahnya adalah, ketidaksetujuan dan pandangan terhadap sebuah nilai selalu dipandang sebagai bentuk 
penghinaan.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":19190361,"id_str":"19190361","name":"Sammy (Sam D. Putra)","screen_name":"notaslimboy","location":"Bekasi, Jawa Barat, Indonesia","url":"http:\/\/samdputra.wordpress.com\/","description":"Stand-up Comedian Tahan Banting, Computer Programmer, Pengamat Hal Gak Penting cp: sam.d.putra@gmail.com","protected":false,"followers_count":45212,"friends_count":1361,"listed_count":217,"created_at":"Mon Jan 19 16:50:24 +0000 2009","favourites_count":1156,"utc_offset":25200,"time_zone":"Jakarta","geo_enabled":false,"verified":false,"statuses_count":77792,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000097438166\/742ee3975d190e7d1f1d7a0e8d4739c7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000097438166\/742ee3975d190e7d1f1d7a0e8d4739c7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/19190361\/1356432422","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":13,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lan
g":"id"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"notaslimboy","name":"Sammy (Sam D. Putra)","id":19190361,"id_str":"19190361","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234378260480,"id_str":"365611234378260480","text":"Gn bitches","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":236959580,"id_str":"236959580","name":"Jouke ","screen_name":"Joukedehaan","location":"Welsrijp","url":null,"description":"in a relationship with @jildouuu_ \u2665. She's mine, she's the one.","protected":false,"followers_count":311,"friends_count":158,"listed_count":2,"created_at":"Tue Jan 11 19:17:49 +0000 
2011","favourites_count":23,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":15829,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/800973149\/7097c7f7a8d82486825ebf14963f78cd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/800973149\/7097c7f7a8d82486825ebf14963f78cd.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3507154415\/793e9486ff4ad81602e44101269e0968_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3507154415\/793e9486ff4ad81602e44101269e0968_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/236959580\/1375385336","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234365681664,"id_str":"365611234365681664","text":"Was that racist ?","source":"\u003ca href=\"http:\/\/www.echofon.com\/\" rel=\"nofollow\"\u003eEchofon\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":192310584,"id_str":"192310584","name":"Presidential Kid","screen_name":"KWilliams_35","location":"Lonely Loner","url":null,"description":"What if today was the rapture, and you completely tarnished\nThe truth will set 
you free, so to me be completely honest","protected":false,"followers_count":310,"friends_count":339,"listed_count":1,"created_at":"Sat Sep 18 19:38:37 +0000 2010","favourites_count":8,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":5572,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/887642034\/eb24076f3e172a0e21969fed7f14868b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/887642034\/eb24076f3e172a0e21969fed7f14868b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3579233977\/f1a53b59b24f08e135f394da6fd3f65d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3579233977\/f1a53b59b24f08e135f394da6fd3f65d_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382446594,"id_str":"365611234382446594","text":"\u3053\u306e\u3070\u3059\u30ab\u30fc\u30c6\u30f3\u306a\u3044\u3084\u3093","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1464855757,"id_str":"1464855757","name":"\u3072\u304b\u308b\u3093","screen_name":"hixlav","location":"","url":null,"description":null,"protected":false,"followers_count":84,"friends_count":70,"listed_count":1,"created_at":"Tue May 28 14:33:47 +0000 2013","favourites_count":8,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":832,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259469176\/fe0c7b2ea168af55d7e10e787875db88_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259469176\/fe0c7b2ea168af55d7e10e787875db88_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1464855757\/1374589289","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382446595,"id_str":"365611234382446595","text":"I just got an inbox saying 'oryt' do i reply hi, yes, what do i fricken say","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" 
rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":67332849,"id_str":"67332849","name":"Faith","screen_name":"ffffaithf","location":"England","url":null,"description":"Start a conversation with me about good music and I will instantly like you.\nRHCP . Nirvana . The Smiths","protected":false,"followers_count":1420,"friends_count":1250,"listed_count":0,"created_at":"Thu Aug 20 14:48:24 +0000 2009","favourites_count":570,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":7807,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"709397","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/704386133\/0157c3f3b332468fb82c742c5c4fb8aa.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/704386133\/0157c3f3b332468fb82c742c5c4fb8aa.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000221851275\/34b425b89e58d47bfa65dfc33dc09b07_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000221851275\/34b425b89e58d47bfa65dfc33dc09b07_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/67332849\/1375307821","profile_link_color":"005EFF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E3834F","profile_text_color":"753011","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 
+0000 2013","id":365611234386657280,"id_str":"365611234386657280","text":"\u0644\u0642\u0637\u0629 \u0631\u0627\u0626\u0639\u0629 \u0641\u064a \u0627\u0644\u0648\u0642\u062a \u0627\u0644\u0645\u0646\u0627\u0633\u0628 !!\n\n\u0644\u0644\u0645\u0632\u064a\u062f \u0645\u0646 \u0627\u0644\u0635\u0648\u0631 \u0627\u0644\u0645\u0645\u064a\u0632\u0629 \n\u062a\u0627\u0628\u0639\u0648\u0646\u0627 \u0639\u0644\u0649 \u0627\u0644\u0635\u0641\u062d\u0629 \u0627\u0644\u0631\u0627\u0626\u0639\u0629 ==> Travel with us \u2665 ^_^ http:\/\/t.co\/sJfjcssgOJ","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1593707054,"id_str":"1593707054","name":"World of innovation","screen_name":"ibtikarworld","location":"Jordan - Irbid","url":"https:\/\/www.facebook.com\/ibtikarworld","description":"Follow us for innovations pictures ^_^","protected":false,"followers_count":67,"friends_count":2,"listed_count":0,"created_at":"Sun Jul 14 15:58:13 +0000 
2013","favourites_count":8,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":4640,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216186161\/b03b2e6ee2df5b5cd870b6ddbb167ddf_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216186161\/b03b2e6ee2df5b5cd870b6ddbb167ddf_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1593707054\/1375215973","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/sJfjcssgOJ","expanded_url":"http:\/\/fb.me\/JvdrXIRr","display_url":"fb.me\/JvdrXIRr","indices":[113,135]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234365685760,"id_str":"365611234365685760","text":"@Bossypants16 your account got hacked. 
Change your password","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":54683262,"in_reply_to_user_id_str":"54683262","in_reply_to_screen_name":"Bossypants16","user":{"id":182267524,"id_str":"182267524","name":"Cintia Guerrero","screen_name":"cinchachu","location":"","url":null,"description":"\u03a3\u039b\u0391 Fall 11' #3 'Kalama AKA Phat Pika' @sla_utd |EMAC\/MIS Double Major| @UTDLULACCouncil | ALPFA | @utdallasmc MPA | Creator of @Chalking4Cancer w\/ @thile1991","protected":false,"followers_count":437,"friends_count":431,"listed_count":19,"created_at":"Tue Aug 24 06:19:17 +0000 2010","favourites_count":860,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":5686,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258322617\/49da0eeb4c4701ae1cb099437d490b0d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258322617\/49da0eeb4c4701ae1cb099437d490b0d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/182267524\/1375054713","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Bossypants16","name":"Lanie 
Lopez","id":54683262,"id_str":"54683262","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234378252288,"id_str":"365611234378252288","text":"Ow maj gawd justin is doowdjes","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":239423165,"id_str":"239423165","name":" '","screen_name":"_persianproud","location":"Hitler","url":null,"description":"Waarom lees je dit ? Doe iets nuttigs met je tijd \u270c","protected":false,"followers_count":545,"friends_count":267,"listed_count":2,"created_at":"Mon Jan 17 15:55:43 +0000 2011","favourites_count":38,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":33524,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"D8E1E6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000038859527\/7c8a9a4cfe6f3c52b28563e449e36c1b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000038859527\/7c8a9a4cfe6f3c52b28563e449e36c1b.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000214008277\/42eb7949fedd7988ede6a91b0c35b9e6_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000214008277\/42eb7949fedd7988ede6a91b0c35b9e6_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/239423165\/1375962025","profile_link_color":"FF4747","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":false,"default_profile":false,"default_profile_image":fals
e,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234361491456,"id_str":"365611234361491456","text":"RT @kidrauhlJDB___: Well let me tell you a story about a girl and a boy he fell in love with his bestfriend \n#mtvhottest Justin Bieber","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":420594752,"id_str":"420594752","name":"SOON \u2665","screen_name":"Aureliestyler","location":"","url":"http:\/\/justin-drew-bieber4ever.com","description":"#BELIEBER #BelieveTour \u2665 10.04.2013 Belgium \n@justinbieber #WeSupportYouJustin \u2665\u2665\n#teamSOON #promoBELIEVE3DInBelgium","protected":false,"followers_count":273,"friends_count":991,"listed_count":0,"created_at":"Thu Nov 24 20:56:20 +0000 
2011","favourites_count":14,"utc_offset":7200,"time_zone":"Paris","geo_enabled":false,"verified":false,"statuses_count":1800,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/865919174\/d77488bc206fcfe31d22fa1a91bd6b78.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/865919174\/d77488bc206fcfe31d22fa1a91bd6b78.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000219698698\/a21baf423d429a23ce80f49d9c057d9b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000219698698\/a21baf423d429a23ce80f49d9c057d9b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/420594752\/1372512680","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:09:56 +0000 2013","id":365610860795805696,"id_str":"365610860795805696","text":"Well let me tell you a story about a girl and a boy he fell in love with his bestfriend \n#mtvhottest Justin Bieber","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1654417315,"id_str":"1654417315","name":"Belieber\u2764","screen_name":"kidrauhlJDB___","location":"justin's world ","url":null,"description":"I couldnt ask for a better idol.\nFollow your dreams & Never say Never.\n- 
Justin Bieber \nBelieber since 09.\u2764\n|Believe tour july 31 ,2013| \u2764","protected":false,"followers_count":167,"friends_count":151,"listed_count":1,"created_at":"Thu Aug 08 02:41:47 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":342,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258039500\/33ea8301497e218ad61560085f0955a4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258039500\/33ea8301497e218ad61560085f0955a4_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[{"text":"mtvhottest","indices":[89,100]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"mtvhottest","indices":[109,120]}],"urls":[],"user_mentions":[{"screen_name":"kidrauhlJDB___","name":"Belieber\u2764","id":1654417315,"id_str":"1654417315","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234365677568,"id_str":"365611234365677568","text":"@PinedaGabriela awhh thanks aya te miro ehh(:","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":436116771,"in_reply_to_user_id_str":"436116771","in_reply_to_screen_name":"PinedaGabriela","user":{"id":799004634,"id_str":"799004634","name":"August 10\u2764","screen_name":"alexandrag08","location":"","url":null,"description":"Corridos\/Banda\/ Norte\u00f1as\u2764 Sophmore | 15 years young Y arriba Jalisco \u2764","protected":false,"followers_count":360,"friends_count":209,"listed_count":0,"created_at":"Sun Sep 02 20:41:17 +0000 2012","favourites_count":1843,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":8158,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000218640140\/dec2a253b30fb2afc28c7bc4eef081c1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000218640140\/dec2a253b30fb2afc28c7bc4eef081c1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/799004634\/1372969543","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"PinedaGabriela","name":"Gabby","id":436116771,"id_str":"436116771","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:25 +0000 
2013","id":365611234399240192,"id_str":"365611234399240192","text":"\u672c\u7530\uff90\uff97\uff9d\u304d\u305f","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":100268849,"id_str":"100268849","name":"\u3051\u3093\u305f\u308d","screen_name":"punisawa","location":"\u798f\u4e95\u5e02","url":null,"description":"\u3069\u30fc\u3082\u3001\u3051\u3093\u305f\u308d\u3067\u3059\u3002","protected":false,"followers_count":184,"friends_count":206,"listed_count":6,"created_at":"Tue Dec 29 16:56:32 +0000 2009","favourites_count":11,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":10670,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1066555660\/P1000485_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1066555660\/P1000485_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 
2013","id":365611234390843392,"id_str":"365611234390843392","text":"\u4e5d\u6708\u304b\u3089\u306f\u5fdc\u63f4\u56e3\u3042\u308b\u3057\u3001\u516b\u6708\u4e2d\u306b\u304b\u305b\u3050\u304b","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":986779220,"id_str":"986779220","name":"\u3055\u3063\u3074\u30fc@\u3070\u308b\u3081\u305f\u3093\u306f\u3041\u306f\u3041","screen_name":"one_yusaku","location":"","url":null,"description":"\u30d2\u30e3\u30c3\u30cf\u30fc\uff01\u4eca\u9031\u306e\u964d\u81e8\u30c0\u30f3\u30b8\u30e7\u30f3\u306f\u3055\u3063\u3074\u30fc\u964d\u81e8\u3060\u305c\uff01\u307f\u3093\u306a\u4ffa\u304c\u30c9\u30ed\u30c3\u30d7\u3059\u308b\u307e\u3067\u77f3\u5165\u308c\u307e\u304f\u308a\u3084\u304c\u308c\uff01\/\u30d1\u30ba\u30c9\u30e9\/\u30b5\u30e0\u30cd\u4e8c\u6b21\/\u30aa\u30ca\u30cb\u30fc\u9b54\u4eba\/\u30b9\u30cd\u30fc\u30af","protected":false,"followers_count":163,"friends_count":249,"listed_count":2,"created_at":"Mon Dec 03 14:53:00 +0000 
2012","favourites_count":581,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2563,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000146548341\/f3b2554fe48d86d4afcca8ba0f3f33b6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000146548341\/f3b2554fe48d86d4afcca8ba0f3f33b6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/986779220\/1373297243","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382446596,"id_str":"365611234382446596","text":"|\u0631\u064a\u0641 \u062f\u0645\u0634\u0642|: \n\n\u0627\u0644\u062c\u064a\u0634 \u0627\u0644\u0639\u0631\u0628\u064a \u0627\u0644\u0633\u0648\u0631\u064a \u064a\u0633\u062a\u0647\u062f\u0641 \u0628\u0627\u0644\u0623\u0633\u0644\u062d\u0629 \u0627\u0644\u062b\u0642\u064a\u0644\u0629 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u0645\u062c\u0645\u0648\u0639\u0627\u062a \u0627\u0644\u0625\u0631\u0647\u0627\u0628\u064a\u0629 \u0641\u064a \u062d\u064a \u0627\u0644\u0642\u0627\u0628\u0648\u0646 \u0645\u0627 \u0623\u062f\u0649 \u0625\u0644\u0649... 
http:\/\/t.co\/8a2IiQc74L","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":966317670,"id_str":"966317670","name":"\u0631\u064a\u0627\u0636 \u0633\u0648\u0631\u064a","screen_name":"R_Syri","location":"","url":null,"description":null,"protected":false,"followers_count":16,"friends_count":33,"listed_count":0,"created_at":"Fri Nov 23 15:34:22 +0000 2012","favourites_count":0,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":1687,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2886008295\/851314ef09061c1994db2d5c783d6e8e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2886008295\/851314ef09061c1994db2d5c783d6e8e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/966317670\/1353685516","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/8a2IiQc74L","expanded_url":"http:\/\/fb.me\/6q9dN3zny","display_url":"fb.me\/6q9dN3zny","indices":[112,134]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_leve
l":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382450691,"id_str":"365611234382450691","text":"RT @_analinzmeier: Felizes s\u00e3o aqueles que quando fogem algu\u00e9m vai atr\u00e1s.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":146133155,"id_str":"146133155","name":"Mariane","screen_name":"mariane_q_t","location":"","url":"http:\/\/sooooo-what.tumblr.com\/","description":"http:\/\/instagram.com\/marianetrombini","protected":false,"followers_count":228,"friends_count":24,"listed_count":7,"created_at":"Thu May 20 17:36:11 +0000 2010","favourites_count":184,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":1210,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"CF7E6A","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme2\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme2\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000143624037\/62fe61f7509a79e3cdaec571c8499db9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000143624037\/62fe61f7509a79e3cdaec571c8499db9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/146133155\/1375790204","profile_link_color":"FA5E0A","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DAECF4","profile_text_color":"663B12","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:55 +0000 
2013","id":365611106078699522,"id_str":"365611106078699522","text":"Felizes s\u00e3o aqueles que quando fogem algu\u00e9m vai atr\u00e1s.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1062950036,"id_str":"1062950036","name":"Ana Linzmeier","screen_name":"_analinzmeier","location":"","url":null,"description":null,"protected":false,"followers_count":421,"friends_count":333,"listed_count":0,"created_at":"Sat Jan 05 12:58:42 +0000 2013","favourites_count":512,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":12172,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"696969","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/757665818\/e2ea9c21348db90a1a209ef44c2e5220.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/757665818\/e2ea9c21348db90a1a209ef44c2e5220.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000018093025\/3173aff859683f55adf021744f51e097_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000018093025\/3173aff859683f55adf021744f51e097_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1062950036\/1372976530","profile_link_color":"5E5C5E","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_ment
ions":[{"screen_name":"_analinzmeier","name":"Ana Linzmeier","id":1062950036,"id_str":"1062950036","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234399232000,"id_str":"365611234399232000","text":"\u062a\u0623\u0643\u062f\u064a \u0645\u0646 \u062c\u0645\u0627\u0644 \u0634\u0643\u0644 \u062c\u0633\u0645\u0643 \u0642\u0628\u0644 \u0645\u0627 \u062a\u0644\u0628\u0633\u064a\u0646 \u0636\u064a\u0642 \u263a!","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1218291600,"id_str":"1218291600","name":" \u043c\u03b1w\u03b1dd\u03b1\u043d \u2661","screen_name":"medo__1998","location":"\u03b1\u03b9 \u043c\u03b1di\u0438\u03b1\u043d","url":null,"description":"\u2650 H.B.D 2 ME =D","protected":false,"followers_count":249,"friends_count":310,"listed_count":0,"created_at":"Mon Feb 25 12:24:30 +0000 
2013","favourites_count":290,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":4701,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/799993288\/c6b386665075185576354648f7cf03a8.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/799993288\/c6b386665075185576354648f7cf03a8.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256958397\/6e815091e2c2133b27206755bc730272_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256958397\/6e815091e2c2133b27206755bc730272_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1218291600\/1373393511","profile_link_color":"91C4C4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"none","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382446592,"id_str":"365611234382446592","text":"@owlcity can you survive in a jungle? 
#OCTMS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":15265990,"in_reply_to_user_id_str":"15265990","in_reply_to_screen_name":"owlcity","user":{"id":258568165,"id_str":"258568165","name":"Megan","screen_name":"dearoceantown","location":"Umbrella Beach","url":null,"description":"Thanks, I stole them from the president.","protected":false,"followers_count":223,"friends_count":218,"listed_count":0,"created_at":"Mon Feb 28 00:46:48 +0000 2011","favourites_count":415,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":2950,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"020812","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/845847107\/6e05a48d548fcc6a2ebda6658d7ea447.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/845847107\/6e05a48d548fcc6a2ebda6658d7ea447.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251713263\/1bea9cf88b9663f1b0100c9c6517396f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251713263\/1bea9cf88b9663f1b0100c9c6517396f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/258568165\/1360107744","profile_link_color":"17406C","profile_sidebar_border_color":"2280A9","profile_sidebar_fill_color":"020812","profile_text_color":"2280A9","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"OCTMS","indices":[38,44]}],"urls":[],"user_mentions":[{"screen_name":"owlcity
","name":"Owl City","id":15265990,"id_str":"15265990","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234395029505,"id_str":"365611234395029505","text":"La concha de la lora Leila me cague toda pelotuda ma\u00f1ana te voy a decir de todo, te quiero\u2665 ah.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":770261256,"id_str":"770261256","name":"\u03c1\u03b1\u03c5\u2113\u03b9 \u0c9e","screen_name":"PauliBobalini","location":"","url":null,"description":"Las \u00fanicas mentiras, que se pueden hacer realidad son aquellas llamadas sue\u00f1os .","protected":false,"followers_count":200,"friends_count":401,"listed_count":0,"created_at":"Mon Aug 20 21:31:33 +0000 2012","favourites_count":14,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2138,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EBC3D2","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/855844708\/61250e4ace145a708b4874891c39c9e4.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/855844708\/61250e4ace145a708b4874891c39c9e4.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3642490274\/39f4137d2bc2bf0adcff9aaa65d2434b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3642490274\/39f4137d2bc2bf0adcff9aaa65d2434b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/770261256\/1367179929","profile_link_color":"B300AD","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false
,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234374062080,"id_str":"365611234374062080","text":"RT @thxOneDirection: a anima\u00e7\u00e3o do hazza \u00e9 contagiante http:\/\/t.co\/s055IdMzkC","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":790022658,"id_str":"790022658","name":"Vivih","screen_name":"kissin1D","location":"Brasil - RJ","url":null,"description":"\u2665 One Direction stole my heart \u2665 ~ Just Believe In Your Dreams ~","protected":false,"followers_count":1323,"friends_count":1460,"listed_count":1,"created_at":"Wed Aug 29 19:45:51 +0000 
2012","favourites_count":401,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":true,"verified":false,"statuses_count":13397,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FCFCFC","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000039600443\/b5278927d0f82c3b3204004d5b01f15f.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000039600443\/b5278927d0f82c3b3204004d5b01f15f.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216109173\/b92054f033a75c46f6df012488642c54_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216109173\/b92054f033a75c46f6df012488642c54_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/790022658\/1375214597","profile_link_color":"9C17B0","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:08:39 +0000 2013","id":365610537675014144,"id_str":"365610537675014144","text":"a anima\u00e7\u00e3o do hazza \u00e9 contagiante http:\/\/t.co\/s055IdMzkC","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":913073622,"id_str":"913073622","name":"Baby nanas \u2654","screen_name":"thxOneDirection","location":"Brazil \u2708 London","url":"http:\/\/animespirit.com.br\/fanfics\/historia\/fanfiction-idolos-one-direction-dark-angel-981603","description":"Leeroy, 
Hmm!!","protected":false,"followers_count":1740,"friends_count":1113,"listed_count":1,"created_at":"Mon Oct 29 19:28:23 +0000 2012","favourites_count":168,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":false,"verified":false,"statuses_count":9048,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000031491248\/5a19b227e1888444e9e2c583a76f38ab.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000031491248\/5a19b227e1888444e9e2c583a76f38ab.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000177350538\/dc1c299e0221bafba38289e8e76da59b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000177350538\/dc1c299e0221bafba38289e8e76da59b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/913073622\/1374611142","profile_link_color":"91A7BF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365610537679208450,"id_str":"365610537679208450","indices":[34,56],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLo2NCCQAIeVL4.png","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLo2NCCQAIeVL4.png","url":"http:\/\/t.co\/s055IdMzkC","display_url":"pic.twitter.com\/s055IdMzkC","expanded_url":"http:\/\/twitter.com\/thxOneDirection\/status\/365610537675014144\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":248,"resize":"fit"},"medium":{"w":403,"h":294,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large
":{"w":403,"h":294,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"thxOneDirection","name":"Baby nanas \u2654","id":913073622,"id_str":"913073622","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234369875968,"id_str":"365611234369875968","text":"\ud83d\ude02\u201c@Tera2Cockyyy: But S\/O to the bitches that CANT put up a picture of \"HIM\" cuz he another bitch nigga\ud83d\udc6b\ud83d\udd10\ud83d\ude29\ud83d\ude02\ud83d\ude02\ud83d\ude02\u201d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610961454891010,"in_reply_to_status_id_str":"365610961454891010","in_reply_to_user_id":363292243,"in_reply_to_user_id_str":"363292243","in_reply_to_screen_name":"Tera2Cockyyy","user":{"id":276150047,"id_str":"276150047","name":"\u26a1DAE\u26a1","screen_name":"juicydaee","location":"","url":null,"description":"B.I.T.E *","protected":false,"followers_count":2405,"friends_count":2150,"listed_count":3,"created_at":"Sat Apr 02 19:17:45 +0000 2011","favourites_count":460,"utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":79412,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043610098\/39c503b41b4f84eb9385755727ae0057.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043610098\/39c503b41b4f84eb9385755727ae0057.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000228659541\/945c8142c522eb8a16f473996509d6a5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000228659541\/945c8142c522eb8a16f473996509d6a5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/276150047\/1375221102","profile_link_color":"93A644","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Tera2Cockyyy","name":"BombTera\u26a1","id":363292243,"id_str":"363292243","indices":[2,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234386657282,"id_str":"365611234386657282","text":"\u308f\u3089\u308f\u3082\u60aa\u304b\u3063\u305f\u3068\u601d\u3063\u3066\u3044\u308b\u306e\u3060\u3002\u305d\u308d\u305d\u308d\u6a5f\u5acc\u3092\u76f4\u3057\u3066\u304f\u308c\u306a\u3044\u304b\uff1f","source":"\u003ca href=\"http:\/\/twittbot.net\/\" 
rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":714408536,"id_str":"714408536","name":"\u767d\u9280","screen_name":"xshiroganex_bot","location":"","url":null,"description":"\u98f4\u3068\u97ad\u306e\u5272\u5408\u304c\uff13:\uff17\u3068\u3044\u3046\u3064\u3093\u3067\u308c\u4e5d\u5c3e\u72d0\u3055\u3093\u3002\u597d\u304d\u306a\u3082\u306e\u306f\u548c\u83d3\u5b50\u3002\n\u6700\u8fd1\u306f\u7f8a\u7fae\u304c\u304a\u6c17\u306b\u5165\u308a\u3002\n\u307e\u3060\u307e\u3060\u8a66\u904b\u8ee2\u4e2d\n\u304a\u308a\u304d\u3083\u3089bot\n\u898b\u305a\u77e5\u3089\u305a\u306e\u4eba\u306f\uff8c\uff9e\uff9b\uff6f\uff78\u5bfe\u8c61\u306b\u306a\u308a\u307e\u3059\u306e\u3067\u3054\u6ce8\u610f\u3092","protected":false,"followers_count":4,"friends_count":3,"listed_count":0,"created_at":"Tue Jul 24 14:50:40 +0000 2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2440,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2739127975\/2851e8bf0c783d419b1fe75d17a9f7f2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2739127975\/2851e8bf0c783d419b1fe75d17a9f7f2_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"co
ntributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234361483266,"id_str":"365611234361483266","text":"My back, neck & feet is killing me!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":274969649,"id_str":"274969649","name":"imcuteeeee\u2122","screen_name":"_kenishamonique","location":"Me,Myself & I ","url":null,"description":"18 \r\nGemini\r\nSingle\r\nChicago\r\nI LOVE ME SOME @iamjacquees\r\nInstagram:@_kenishamonique","protected":false,"followers_count":836,"friends_count":832,"listed_count":1,"created_at":"Thu Mar 31 11:23:28 +0000 2011","favourites_count":1160,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":24792,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000023237180\/b68a76f92ab48ed772154248beaee16c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000023237180\/b68a76f92ab48ed772154248beaee16c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241153175\/f8e6a63b0c2c5ea9e05f3293b850438c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241153175\/f8e6a63b0c2c5ea9e05f3293b850438c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/274969649\/1375905670","profile_link_color":"131516","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FA2335","profile_text_color":"111411","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382450689,"id_str":"365611234382450689","text":"@junjunmjgirly \u3042\uff01\u3058\u3085\u3093\u3058\u3085\u3093\u304c\u7740\u3066\u308b\u304b\u304d\u6c37\u30ab\u30e9\u30fc\u306e\u670d\u3001\u9ec4\u8272\u3060\uff01\u30ec\u30e2\u30f3\u5473\uff01\n\u2606\u301c\uff08\u309d\u3002\u2202\uff09\uff1c\u30ec\u30e2\u30f3\u5473\u597d\u304d\u3057\u30fc\uff01","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":246711127,"in_reply_to_user_id_str":"246711127","in_reply_to_screen_name":"junjunmjgirly","user":{"id":1591415948,"id_str":"1591415948","name":"\u3086\u3067\u305f\u307e","screen_name":"gbgbchoco3","location":"","url":null,"description":"\u3053\u3063\u3061\u3082\uff5e @gbgbchoco","protected":false,"followers_count":20,"friends_count":96,"listed_count":0,"created_at":"Sat Jul 13 16:58:35 +0000 2013","favourites_count":105,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":265,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000137681551\/e7e795ecd230ca8a51193c563e2e1110_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000137681551\/e7e795ecd230ca8a51193c563e2e1110_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"junjunmjgirly","name":"\u6b4c\u5e83\u5834 \u6df3","id":246711127,"id_str":"246711127","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382450693,"id_str":"365611234382450693","text":"RT @CFinlaysonMusic: Parents can only give 
good advice or put them on the right paths, but the final forming of a person's character lies i\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1155936728,"id_str":"1155936728","name":"Dorrie McClinton","screen_name":"DorrieMcClinton","location":"south jersey","url":null,"description":"The Lord Jesus Christ is the Truth","protected":false,"followers_count":29,"friends_count":61,"listed_count":0,"created_at":"Thu Feb 07 03:53:29 +0000 2013","favourites_count":86,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":840,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000109285120\/08c8186024a59de396381f07d316a036_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000109285120\/08c8186024a59de396381f07d316a036_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 11:17:04 +0000 2013","id":365431459890204672,"id_str":"365431459890204672","text":"Parents can only give good advice or put them on the right paths, but the final forming of a person's character lies 
in their own hands.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":263653301,"id_str":"263653301","name":"Chris_Finlayson","screen_name":"CFinlaysonMusic","location":"Youtube.com\/jessicadrown78","url":"http:\/\/www.facebook.com\/chrisfinlaysonmusic","description":"Contempary Christian Artist \u2022 Worship Minister \u2022 Facebook fan page: 'ChrisFinlaysonMusic' \u2022 Downloads: ReverbNation:'ChrisFinlayson' \u2022 (FLORIDA)","protected":false,"followers_count":13776,"friends_count":14200,"listed_count":30,"created_at":"Thu Mar 10 13:31:33 +0000 2011","favourites_count":3146,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":2399,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/835192057\/9cc887705875bf4c44975101825a175d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/835192057\/9cc887705875bf4c44975101825a175d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000229561920\/dcb3bb5ce1e2f0b4f6cca47a9fa58226_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000229561920\/dcb3bb5ce1e2f0b4f6cca47a9fa58226_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/263653301\/1375994482","profile_link_color":"2FC2EF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":n
ull},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":48,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CFinlaysonMusic","name":"Chris_Finlayson","id":263653301,"id_str":"263653301","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382462976,"id_str":"365611234382462976","text":"@caritonoir asi me gusta! jajajaja ma\u00f1ana te cuento de todos los que me entere","source":"web","truncated":false,"in_reply_to_status_id":365609211285094402,"in_reply_to_status_id_str":"365609211285094402","in_reply_to_user_id":212203712,"in_reply_to_user_id_str":"212203712","in_reply_to_screen_name":"caritonoir","user":{"id":284220244,"id_str":"284220244","name":"Luc\u00eda \u2655","screen_name":"PeqqeWy","location":"","url":"http:\/\/facebook.com\/Lucia.Siguenza","description":"El pincha es mi amor heredado\u2665 - la m\u00fasica es la paz de mi alma - y llevo en mi coraz\u00f3n al tesoro mas preciado que tuve en la vida.","protected":false,"followers_count":88,"friends_count":98,"listed_count":2,"created_at":"Mon Apr 18 21:40:23 +0000 
2011","favourites_count":51,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":1781,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/441279487\/Rosas_Brillantes_800.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/441279487\/Rosas_Brillantes_800.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000227282376\/3d2738985453835b6cb3ed48482b6300_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000227282376\/3d2738985453835b6cb3ed48482b6300_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/284220244\/1375396445","profile_link_color":"E304AF","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"caritonoir","name":"Caro Noir","id":212203712,"id_str":"212203712","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234365677569,"id_str":"365611234365677569","text":"@BaudotBaptiste #GrosPointCommun #Frat\u00e9JusquauxGenoux","source":"web","truncated":false,"in_reply_to_status_id":365611015116816384,"in_reply_to_status_id_str":"365611015116816384","in_reply_to_user_id":338553356,"in_reply_to_user_id_str":"338553356","in_reply_to_screen_name":"BaudotBaptiste","user":{"id":879908114,"id_str":"879908114","name":"Doreyyy Yallah 
#NK","screen_name":"DoreyA_Nk","location":"Dijon","url":"http:\/\/instagram.com\/anaisnk13","description":"Handball Handball\r\n& Nikola Karabatic.","protected":false,"followers_count":124,"friends_count":67,"listed_count":0,"created_at":"Sun Oct 14 11:25:42 +0000 2012","favourites_count":221,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":7044,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000024324957\/59454e4f7861218f862ed08932fa79cb.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000024324957\/59454e4f7861218f862ed08932fa79cb.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000239269858\/48fb3f280dc4716fc0cb70c3a7828829_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000239269858\/48fb3f280dc4716fc0cb70c3a7828829_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/879908114\/1373897042","profile_link_color":"0EB8F0","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"GrosPointCommun","indices":[16,32]},{"text":"Frat\u00e9JusquauxGenoux","indices":[33,53]}],"urls":[],"user_mentions":[{"screen_name":"BaudotBaptiste","name":"L'intello","id":338553356,"id_str":"338553356","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234369867776,"id_str":"365611234369867776","text":"morning 
'-')\/","source":"\u003ca href=\"http:\/\/www.myplume.com\/\" rel=\"nofollow\"\u003ePlume\u00a0for\u00a0Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1574116819,"id_str":"1574116819","name":"Eyoungie~","screen_name":"PeperoEyoungie","location":"","url":null,"description":"[V] @PeperoRoleplay - roleplayer of Eyoung's AfterSchool - single - forever vampire","protected":false,"followers_count":149,"friends_count":156,"listed_count":0,"created_at":"Sun Jul 07 02:13:44 +0000 2013","favourites_count":11,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2872,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260113293\/1c077d7a0de90caace82395972d4814f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260113293\/1c077d7a0de90caace82395972d4814f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1574116819\/1375794577","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"sl"} +{"created_at":"Thu Aug 08 23:11:25 +0000 
2013","id":365611234361475073,"id_str":"365611234361475073","text":"\"@Brilliant_Ads: The most famous brand from each state,in US http:\/\/t.co\/KMAGGQbtic\" @noahkostiuk. @austinframe check out Florida!!!! #miami","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1094001326,"id_str":"1094001326","name":"Nate Renooy","screen_name":"NateRenooy12","location":"","url":null,"description":null,"protected":false,"followers_count":123,"friends_count":154,"listed_count":0,"created_at":"Wed Jan 16 03:28:30 +0000 2013","favourites_count":874,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":683,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3605285835\/8386e34357be0add9c48fedfc1cccfaf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3605285835\/8386e34357be0add9c48fedfc1cccfaf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1094001326\/1359144116","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"miami","indices":[134,140]}],"urls":[],"user_mentions":[{"sc
reen_name":"Brilliant_Ads","name":"Brilliant Ads","id":564686965,"id_str":"564686965","indices":[1,15]},{"screen_name":"noahkostiuk","name":"Noah Kostiuk","id":1121127782,"id_str":"1121127782","indices":[85,97]},{"screen_name":"austinframe","name":"Austin Frame","id":739955024,"id_str":"739955024","indices":[99,111]}],"media":[{"id":364536303867539457,"id_str":"364536303867539457","indices":[61,83],"media_url":"http:\/\/pbs.twimg.com\/media\/BQ8X1kPCMAEMq7m.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BQ8X1kPCMAEMq7m.jpg","url":"http:\/\/t.co\/KMAGGQbtic","display_url":"pic.twitter.com\/KMAGGQbtic","expanded_url":"http:\/\/twitter.com\/Brilliant_Ads\/status\/364536303863345152\/photo\/1","type":"photo","sizes":{"medium":{"w":540,"h":430,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":540,"h":430,"resize":"fit"},"small":{"w":340,"h":271,"resize":"fit"}},"source_status_id":364536303863345152,"source_status_id_str":"364536303863345152"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390851586,"id_str":"365611234390851586","text":"@SucksMeHazza Qual seu apelido? 
Eu sou do Cear\u00e1 e vc??","source":"web","truncated":false,"in_reply_to_status_id":365610852965027841,"in_reply_to_status_id_str":"365610852965027841","in_reply_to_user_id":1571083884,"in_reply_to_user_id_str":"1571083884","in_reply_to_screen_name":"SucksMeHazza","user":{"id":1400071513,"id_str":"1400071513","name":"Mrs.Tomlinson","screen_name":"LittledoTommo","location":"Londres,s\u00f3 em sonho!","url":null,"description":"AMO voc\u00eas muito mesmo - Aly-\u00c2ngela-Karla -Cris S2 You\u2019re everything I see in my dreams.","protected":false,"followers_count":849,"friends_count":952,"listed_count":0,"created_at":"Fri May 03 15:52:49 +0000 2013","favourites_count":845,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":4818,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FAF7F7","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000033139264\/3ccdfdda162f321a5f515e534c84b388.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000033139264\/3ccdfdda162f321a5f515e534c84b388.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216093583\/982fd889c7d3915756f2b6d02016ec1b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216093583\/982fd889c7d3915756f2b6d02016ec1b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1400071513\/1374778138","profile_link_color":"F21111","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SucksM
eHazza","name":"Mrs. Styles \u2764","id":1571083884,"id_str":"1571083884","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234386653184,"id_str":"365611234386653184","text":"@P4nd44s kkkkkkkkkkkkk Comigo n\u00e3o, eu sempre me dou bem com os hunter! Aqui s\u00f3 esperando Outubro u.u","source":"web","truncated":false,"in_reply_to_status_id":365610848137388033,"in_reply_to_status_id_str":"365610848137388033","in_reply_to_user_id":475393060,"in_reply_to_user_id_str":"475393060","in_reply_to_screen_name":"P4nd44s","user":{"id":217532231,"id_str":"217532231","name":"Srta. Ackles :3","screen_name":"anastaackles","location":"Brasil","url":"https:\/\/www.facebook.com\/anastacia.thais","description":"I'm Hunter. I love Supernatural. Carry on my Wayward son \u2665\u2665","protected":false,"followers_count":28,"friends_count":106,"listed_count":0,"created_at":"Fri Nov 19 19:56:55 +0000 2010","favourites_count":2,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":1123,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"8A2D40","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046083040\/eab6fe9addecbc72ba3f582ce7b42314.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046083040\/eab6fe9addecbc72ba3f582ce7b42314.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242827346\/4eb6d9fefc49022a4dffa3b5b25f281d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242827346\/4eb6d9fefc49022a4dffa3b5b25f281d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/217532231\/1363872185","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"7AC3EE","profil
e_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"P4nd44s","name":"Carol","id":475393060,"id_str":"475393060","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234395033600,"id_str":"365611234395033600","text":"RT @SexyBuzz69: RT\/F @vagasstits @HDhotties @Pornpica @DukeXx3 @Dippysrift @TooHot4FaceBook @2Kavitha @LoveGirlsAlot @Juspixforyou http:\/\/t\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1318221991,"id_str":"1318221991","name":"Pussy Lovers","screen_name":"napson_gede","location":"","url":null,"description":null,"protected":false,"followers_count":99,"friends_count":154,"listed_count":0,"created_at":"Sun Mar 31 11:35:08 +0000 
2013","favourites_count":418,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4643,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3482338298\/1abbdf162d21c25e929fe5a8459d526c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3482338298\/1abbdf162d21c25e929fe5a8459d526c_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:02:21 +0000 2013","id":365593852658188291,"id_str":"365593852658188291","text":"RT\/F @vagasstits @HDhotties @Pornpica @DukeXx3 @Dippysrift @TooHot4FaceBook @2Kavitha @LoveGirlsAlot @Juspixforyou http:\/\/t.co\/R6HwnY8jHL","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":999868676,"id_str":"999868676","name":"Sexy","screen_name":"SexyBuzz69","location":"","url":null,"description":"Adults Only 18+","protected":false,"followers_count":3356,"friends_count":775,"listed_count":21,"created_at":"Sun Dec 09 18:31:25 +0000 
2012","favourites_count":1386,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":3517,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/883951883\/956d95f09ccb9985ab49b0ad7575a441.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/883951883\/956d95f09ccb9985ab49b0ad7575a441.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000033237301\/0d9fbc356c9cea77fc33351f3a651896_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000033237301\/0d9fbc356c9cea77fc33351f3a651896_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/999868676\/1371854514","profile_link_color":"000000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"vagasstits","name":"vagasstits","id":1441559863,"id_str":"1441559863","indices":[5,16]},{"screen_name":"HDhotties","name":"Hotties in HD","id":150048901,"id_str":"150048901","indices":[17,27]},{"screen_name":"Pornpica","name":"PornPica.com\u2122","id":902010354,"id_str":"902010354","indices":[28,37]},{"screen_name":"DukeXx3","name":"Duke","id":1013707122,"id_str":"1013707122","indices":[38,46]},{"screen_name":"Dippysrift","name":"Dippy","id":254600653,"id_str":"254600653","indices":[47,58]},{"screen_name":"TooHot4FaceBook","name":"Too Hot 4 FaceBook","id":250672259,"id_str":"250672259","indices":[59,75]},{"screen_name":"2Kavitha","name":"kavitha 
","id":1172342766,"id_str":"1172342766","indices":[76,85]},{"screen_name":"LoveGirlsAlot","name":"LoveGirlsAlot","id":464388539,"id_str":"464388539","indices":[86,100]},{"screen_name":"Juspixforyou","name":"Serina","id":1306147225,"id_str":"1306147225","indices":[101,114]}],"media":[{"id":365593852666576897,"id_str":"365593852666576897","indices":[115,137],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLZrAhCAAEkHDB.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLZrAhCAAEkHDB.jpg","url":"http:\/\/t.co\/R6HwnY8jHL","display_url":"pic.twitter.com\/R6HwnY8jHL","expanded_url":"http:\/\/twitter.com\/SexyBuzz69\/status\/365593852658188291\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":736,"h":1075,"resize":"fit"},"small":{"w":340,"h":497,"resize":"fit"},"medium":{"w":600,"h":876,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"lang":"und"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SexyBuzz69","name":"Sexy","id":999868676,"id_str":"999868676","indices":[3,14]},{"screen_name":"vagasstits","name":"vagasstits","id":1441559863,"id_str":"1441559863","indices":[21,32]},{"screen_name":"HDhotties","name":"Hotties in HD","id":150048901,"id_str":"150048901","indices":[33,43]},{"screen_name":"Pornpica","name":"PornPica.com\u2122","id":902010354,"id_str":"902010354","indices":[44,53]},{"screen_name":"DukeXx3","name":"Duke","id":1013707122,"id_str":"1013707122","indices":[54,62]},{"screen_name":"Dippysrift","name":"Dippy","id":254600653,"id_str":"254600653","indices":[63,74]},{"screen_name":"TooHot4FaceBook","name":"Too Hot 4 FaceBook","id":250672259,"id_str":"250672259","indices":[75,91]},{"screen_name":"2Kavitha","name":"kavitha 
","id":1172342766,"id_str":"1172342766","indices":[92,101]},{"screen_name":"LoveGirlsAlot","name":"LoveGirlsAlot","id":464388539,"id_str":"464388539","indices":[102,116]},{"screen_name":"Juspixforyou","name":"Serina","id":1306147225,"id_str":"1306147225","indices":[117,130]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234395029506,"id_str":"365611234395029506","text":"RT @IlkayErylmaz: Elini \u00f6pmemem i\u00e7in direten insanlar\u0131 da anlam\u0131yorum lan b\u0131rak \u00f6peyim \u0131ste ya\u015fl\u0131s\u0131n","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":710513784,"id_str":"710513784","name":"\u015eeyma","screen_name":"SeymaEryilmaz","location":"Isparta","url":null,"description":"O\u011flak burcu,fazlas\u0131yla inat\u00e7\u0131 bir de k\u0131skan\u00e7,kahve ve \u00e7ikolata tutkunu,Dexter sever. 
#MustafaKemalATAT\u00dcRK #Fenerbah\u00e7e a\u015f\u0131\u011f\u0131.","protected":false,"followers_count":122,"friends_count":101,"listed_count":0,"created_at":"Sun Jul 22 10:56:54 +0000 2012","favourites_count":267,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":true,"verified":false,"statuses_count":1727,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000017962375\/67ac1db9f723a7157b3ba8573ca62d0c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000017962375\/67ac1db9f723a7157b3ba8573ca62d0c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000193854403\/ad7c8715553df19b9f68929dd64a27b0_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000193854403\/ad7c8715553df19b9f68929dd64a27b0_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/710513784\/1375726096","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 10:47:08 +0000 2013","id":365423929021575169,"id_str":"365423929021575169","text":"Elini \u00f6pmemem i\u00e7in direten insanlar\u0131 da anlam\u0131yorum lan b\u0131rak \u00f6peyim \u0131ste ya\u015fl\u0131s\u0131n","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web 
(M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":558839653,"id_str":"558839653","name":"\u0130lkay","screen_name":"IlkayErylmaz","location":"","url":null,"description":"Tak\u0131nt\u0131l\u0131 ko\u00e7 burcu. #Galatasaray - #ATAT\u00dcRK","protected":false,"followers_count":233,"friends_count":227,"listed_count":0,"created_at":"Fri Apr 20 18:15:40 +0000 2012","favourites_count":88,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":1834,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"050505","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044524341\/1162dba84422ff4678752ccd6dc68db6.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044524341\/1162dba84422ff4678752ccd6dc68db6.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242261411\/fc8d39884df6f816d73cea89bcd7faba_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242261411\/fc8d39884df6f816d73cea89bcd7faba_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/558839653\/1374798780","profile_link_color":"590675","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"tr"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"IlkayErylmaz","name":"\u013
0lkay","id":558839653,"id_str":"558839653","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390839298,"id_str":"365611234390839298","text":"@Deer_EXOLuhan JIDAT LU SEKSI DIKATA MAKE BIKINI HA?!","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610880160899072,"in_reply_to_status_id_str":"365610880160899072","in_reply_to_user_id":979864566,"in_reply_to_user_id_str":"979864566","in_reply_to_screen_name":"Deer_EXOLuhan","user":{"id":1353293024,"id_str":"1353293024","name":"chami","screen_name":"7thChamomile","location":"","url":null,"description":"hello im cheerful genie from girls generation. dont be shock if i love food but i never getting fat","protected":false,"followers_count":140,"friends_count":142,"listed_count":0,"created_at":"Mon Apr 15 02:42:35 +0000 2013","favourites_count":24,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":false,"verified":false,"statuses_count":3947,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/868352887\/74135751cf83b5769f289e19526ac914.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/868352887\/74135751cf83b5769f289e19526ac914.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251335148\/b14ec5b2dab5f20f3687f4a3fe5b9b5a_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251335148\/b14ec5b2dab5f20f3687f4a3fe5b9b5a_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1353293024\/1375205038","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_col
or":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Deer_EXOLuhan","name":"Abang Luhan","id":979864566,"id_str":"979864566","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382450692,"id_str":"365611234382450692","text":"Internet ta r\u00e1pida, que milagre.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":239571073,"id_str":"239571073","name":"Pra\u00e7a carai.","screen_name":"UmaOgrinha","location":"","url":null,"description":null,"protected":false,"followers_count":1634,"friends_count":551,"listed_count":3,"created_at":"Mon Jan 17 23:03:34 +0000 
2011","favourites_count":1728,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":true,"verified":false,"statuses_count":85203,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"E1005F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/822705374\/179cf357fcb834cb181eb26c1abf1830.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/822705374\/179cf357fcb834cb181eb26c1abf1830.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000049248094\/1a71f89d9c10c234495ef3893fe4b313_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000049248094\/1a71f89d9c10c234495ef3893fe4b313_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/239571073\/1373757185","profile_link_color":"080808","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"39343B","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234369863681,"id_str":"365611234369863681","text":"RT @TweetLikeAGirI: Me the first day of school http:\/\/t.co\/wolOKxslZ6","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":942715129,"id_str":"942715129","name":"Emilaaa(:","screen_name":"Emila_Pilipovic","location":"","url":null,"description":null,"protected":false,"followers_count":457,"friends_count":465,"listed_count":0,"created_at":"Mon Nov 12 02:51:10 +0000 2012","favourites_count":1598,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":12623,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000194637064\/4f353270dcbc6b771ee409645d71ca97_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000194637064\/4f353270dcbc6b771ee409645d71ca97_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/942715129\/1372208508","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:32:05 +0000 2013","id":365601333027426304,"id_str":"365601333027426304","text":"Me the first day of school http:\/\/t.co\/wolOKxslZ6","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":891826837,"id_str":"891826837","name":"Tweet Like A Girl","screen_name":"TweetLikeAGirI","location":"","url":null,"description":"Omg, follow me betch. \r\n\r\n\r\nSubmit tweets & business: tweetlikeafemale@gmail.com\r\n\r\n(Parody Account)","protected":false,"followers_count":623110,"friends_count":1,"listed_count":293,"created_at":"Fri Oct 19 20:54:40 +0000 2012","favourites_count":328,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":false,"verified":false,"statuses_count":508,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2936815825\/77cae1a5c0b8b9b097f988e55ce2a063_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2936815825\/77cae1a5c0b8b9b097f988e55ce2a063_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/891826837\/1375052557","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1963,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365601333031620608,"id_str":"365601333031620608","indices":[27,49],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLgebDCUAAtIvr.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLgebDCUAAtIvr.jpg","url":"htt
p:\/\/t.co\/wolOKxslZ6","display_url":"pic.twitter.com\/wolOKxslZ6","expanded_url":"http:\/\/twitter.com\/TweetLikeAGirI\/status\/365601333027426304\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":213,"resize":"fit"},"medium":{"w":598,"h":375,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":598,"h":375,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TweetLikeAGirI","name":"Tweet Like A Girl","id":891826837,"id_str":"891826837","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234386657281,"id_str":"365611234386657281","text":"@megstiyell @molstiel @deansalcohol @mishaztiel HUG ME WITH THE ARMS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610962105024515,"in_reply_to_status_id_str":"365610962105024515","in_reply_to_user_id":1576041872,"in_reply_to_user_id_str":"1576041872","in_reply_to_screen_name":"megstiyell","user":{"id":1050821455,"id_str":"1050821455","name":"marknado (\u0942\u2022\u1d17\u2022\u0942\u2741)","screen_name":"jullegrino","location":"DANVILLE","url":"http:\/\/tumblr.com\/blog\/daddy-luci","description":"if you listen closely you can hear me crying over mark pellegrino's face.","protected":false,"followers_count":341,"friends_count":357,"listed_count":4,"created_at":"Mon Dec 31 17:33:43 +0000 2012","favourites_count":3040,"utc_offset":-10800,"time_zone":"Atlantic Time 
(Canada)","geo_enabled":false,"verified":false,"statuses_count":4324,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242447409\/8f859455b5024bbef9f35af522a56d27_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242447409\/8f859455b5024bbef9f35af522a56d27_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1050821455\/1374676230","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"megstiyell","name":"gAYBRIEL \u2661","id":1576041872,"id_str":"1576041872","indices":[0,11]},{"screen_name":"molstiel","name":"\u2661crowley\u2661","id":1643637127,"id_str":"1643637127","indices":[12,21]},{"screen_name":"deansalcohol","name":"\u273f lisanne \u273f","id":1616251285,"id_str":"1616251285","indices":[22,35]},{"screen_name":"mishaztiel","name":"kurofer\u2606Al\u2606","id":1420862892,"id_str":"1420862892","indices":[36,47]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390839297,"id_str":"365611234390839297","text":"@stillint0drew esse meu urso \u00e9 mt 
dl\u00e7","source":"web","truncated":false,"in_reply_to_status_id":365610703450681344,"in_reply_to_status_id_str":"365610703450681344","in_reply_to_user_id":1143881984,"in_reply_to_user_id_str":"1143881984","in_reply_to_screen_name":"stillint0drew","user":{"id":410077176,"id_str":"410077176","name":"vict\u00f3ria","screen_name":"jbangerz","location":"\u2661 victor \u2661","url":null,"description":"rafa tudao (ex thoughtmiley xoxo)","protected":false,"followers_count":1986,"friends_count":1087,"listed_count":21,"created_at":"Fri Nov 11 16:16:22 +0000 2011","favourites_count":330,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":false,"verified":false,"statuses_count":57703,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045373689\/58e96f69bf1a255bd527bc851dc63a30.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045373689\/58e96f69bf1a255bd527bc851dc63a30.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258136985\/23d75ff515abb702131f2cf4ccc1f46c_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258136985\/23d75ff515abb702131f2cf4ccc1f46c_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/410077176\/1375931252","profile_link_color":"524618","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"stillint0drew","name":"Victor","id":1143881984,"id_str":"1143881984","indices":[0,14]}]},"favorited":fal
se,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390839296,"id_str":"365611234390839296","text":"@darthkniight :P","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611174445842432,"in_reply_to_status_id_str":"365611174445842432","in_reply_to_user_id":725673108,"in_reply_to_user_id_str":"725673108","in_reply_to_screen_name":"darthkniight","user":{"id":1620459757,"id_str":"1620459757","name":"Female Jack Howard","screen_name":"_Sherl0cked_","location":"AmazingPhil twtd me!-02\/08\/13\u2764","url":"http:\/\/hh-bromances-specialist.tumblr.com","description":"#HorribleHistorian,#LokisArmy,#Hiddlestoner,#Danosaur,#Phillion,#TinyPlanetExplorer,#Sherlockian,#Whovian,#Jack&Dean,#Hunter,#Muser,#Avenger,#FriendOrFoe","protected":false,"followers_count":92,"friends_count":210,"listed_count":0,"created_at":"Thu Jul 25 13:59:36 +0000 
2013","favourites_count":341,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1397,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000186407512\/779c846194ef4d79d3faaff106e0576d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000186407512\/779c846194ef4d79d3faaff106e0576d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1620459757\/1375430015","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"darthkniight","name":"\u2606 anakin skywalker \u2606","id":725673108,"id_str":"725673108","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234374057985,"id_str":"365611234374057985","text":"@Erickpgarcia enserio?._. 
Que miedo:\/","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611081185509376,"in_reply_to_status_id_str":"365611081185509376","in_reply_to_user_id":1056623190,"in_reply_to_user_id_str":"1056623190","in_reply_to_screen_name":"Erickpgarcia","user":{"id":1149898692,"id_str":"1149898692","name":"enana\u2693","screen_name":"valeria_vogt","location":"nogales sonora.","url":null,"description":"que tu sonrisa sea m\u00e1s grande que tus est\u00fapidos problemas.","protected":false,"followers_count":164,"friends_count":149,"listed_count":0,"created_at":"Tue Feb 05 04:18:35 +0000 2013","favourites_count":1269,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":2567,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040232087\/e4f21d9d0e23c90d6a115717c7f088a2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040232087\/e4f21d9d0e23c90d6a115717c7f088a2.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216518281\/839d83aa5a6553e9b5c8d553bb0e5637_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216518281\/839d83aa5a6553e9b5c8d553bb0e5637_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1149898692\/1375666182","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"u
rls":[],"user_mentions":[{"screen_name":"Erickpgarcia","name":"Erick P.","id":1056623190,"id_str":"1056623190","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234395037698,"id_str":"365611234395037698","text":"RT @dounnnya: @mmbb113 @aa15355 @fo0fo012345 @A_alqabbani @138683 @mshaer9 @mooory_12 @_ohooood @lolo_alsahli \u0643\u0644 \u0639\u0627\u0627\u0627\u0627\u0627\u0627\u0627\u0645 \u0648\u0627\u0646\u062a\u0645 \u0628\u062e\u064a\u0631 \u0648\u064a\u0627\u0631\u0628\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":563678465,"id_str":"563678465","name":"\u0645\u0634\u0627\u0639\u0631 \u0631\u0642\u064a\u0642\u0629","screen_name":"mshaer9","location":"","url":null,"description":"\u0628\u064a\u0646 \u0625\u0646\u0652\u063a\u0645\u0627\u0633\u0629\u0650 \u062d\u0631\u0641\u0652 \u0648\u064e\u0644\u062c\u064e\u0629\u0650 \u0636\u064e\u062c\u064a\u062c\u0652 \u0623\u0643\u064f\u0648\u0646\u064f \u0623\u0646\u064e\u0627 \u0643\u064f\u0644\u064f \u0645\u064e\u0627\u064a\u064f\u062f\u0648\u0646\u064f \u0647\u064f\u0646\u0627 \u0628\u0650\u0650\u0642\u064e\u0644\u064e\u0645\u064a..","protected":false,"followers_count":12604,"friends_count":12251,"listed_count":6,"created_at":"Thu Apr 26 12:20:50 +0000 
2012","favourites_count":46,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":16631,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000105039564\/82efbdb60cd9475e8f8a6a8d27f16356_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000105039564\/82efbdb60cd9475e8f8a6a8d27f16356_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/563678465\/1373294128","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 17:35:01 +0000 2013","id":365164189062402049,"id_str":"365164189062402049","text":"@mmbb113 @aa15355 @fo0fo012345 @A_alqabbani @138683 @mshaer9 @mooory_12 @_ohooood @lolo_alsahli \u0643\u0644 \u0639\u0627\u0627\u0627\u0627\u0627\u0627\u0627\u0645 \u0648\u0627\u0646\u062a\u0645 \u0628\u062e\u064a\u0631 \u0648\u064a\u0627\u0631\u0628\u064a \u064a\u0639\u0637\u064a\u0643\u0645 \u0627\u0644\u0635\u062d\u0647","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365163254495981568,"in_reply_to_status_id_str":"365163254495981568","in_reply_to_user_id":1225476205,"in_reply_to_user_id_str":"1225476205","in_reply_to_screen_name":"mmbb113","user":{"id":1542653070,"id_str":"1542653070","name":"\u062f\u0646\u064a\u0627","screen_name":"dounnnya","location":"","url":null,"description":"\u200f\u0646\u0638\u0631\u062a \u0639\u064a\u0648\u0646\u0647 \u064a\u0648\u0645 \u064a\u0632\u0639\u0644 \u064a\u0627\u0627\u0644\u0628\u064a\u0647 \u062d\u062a\u0649 \u0648\u0647\u0648 \u0632\u0639\u0644\u0627\u0646 \u0632\u0627\u064a\u062f \u062d\u0644\u0627\u0647\u0627","protected":false,"followers_count":313,"friends_count":310,"listed_count":0,"created_at":"Mon Jun 24 06:56:41 +0000 2013","favourites_count":237,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3363,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258997035\/53600a94f608c27d70077f5772a8c7d8_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258997035\/53600a94f608c27d70077f5772a8c7d8_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1542653070\/1373562074","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":3,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_nam
e":"mmbb113","name":"\u0645\u0628\u0627\u0631\u0643 \u0627\u0644\u0633\u0647\u0644\u064a ","id":1225476205,"id_str":"1225476205","indices":[0,8]},{"screen_name":"aa15355","name":"\u062f\u064a\u0646\u0627 \u0627\u0644\u0639\u0646\u0632\u064a","id":930453462,"id_str":"930453462","indices":[9,17]},{"screen_name":"fo0fo012345","name":"\u0639\u0641\u0640\u0640\u0640\u0640\u0640\u0627\u0641 ","id":994093417,"id_str":"994093417","indices":[18,30]},{"screen_name":"A_alqabbani","name":"\u0639\u0628\u062f\u0627\u0644\u0644\u0647 \u0627\u0644\u0642\u0628\u0627\u0646\u064a ","id":399499433,"id_str":"399499433","indices":[31,43]},{"screen_name":"138683","name":"\u063a\u0644\u0627 \u0627\u0644\u0637\u0627\u0626\u0641","id":1329233701,"id_str":"1329233701","indices":[44,51]},{"screen_name":"mshaer9","name":"\u0645\u0634\u0627\u0639\u0631 \u0631\u0642\u064a\u0642\u0629","id":563678465,"id_str":"563678465","indices":[52,60]},{"screen_name":"mooory_12","name":"\u0645\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0648\u0631\u064a\u2022\u2022","id":1115574385,"id_str":"1115574385","indices":[61,71]},{"screen_name":"_ohooood","name":"\u0627\u062c\u0645\u0640\u0640\u0644 \u0627\u0646\u0633\u0640\u0627\u0646\u0647 ","id":776923471,"id_str":"776923471","indices":[72,81]},{"screen_name":"lolo_alsahli","name":"\u0644\u0648\u0644\u0648 \u0627\u0644\u0633\u0647\u0644\u064a 5\/5","id":1379340200,"id_str":"1379340200","indices":[82,95]}]},"favorited":false,"retweeted":false,"lang":"ar"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"dounnnya","name":"\u062f\u0646\u064a\u0627","id":1542653070,"id_str":"1542653070","indices":[3,12]},{"screen_name":"mmbb113","name":"\u0645\u0628\u0627\u0631\u0643 \u0627\u0644\u0633\u0647\u0644\u064a ","id":1225476205,"id_str":"1225476205","indices":[14,22]},{"screen_name":"aa15355","name":"\u062f\u064a\u0646\u0627 
\u0627\u0644\u0639\u0646\u0632\u064a","id":930453462,"id_str":"930453462","indices":[23,31]},{"screen_name":"fo0fo012345","name":"\u0639\u0641\u0640\u0640\u0640\u0640\u0640\u0627\u0641 ","id":994093417,"id_str":"994093417","indices":[32,44]},{"screen_name":"A_alqabbani","name":"\u0639\u0628\u062f\u0627\u0644\u0644\u0647 \u0627\u0644\u0642\u0628\u0627\u0646\u064a ","id":399499433,"id_str":"399499433","indices":[45,57]},{"screen_name":"138683","name":"\u063a\u0644\u0627 \u0627\u0644\u0637\u0627\u0626\u0641","id":1329233701,"id_str":"1329233701","indices":[58,65]},{"screen_name":"mshaer9","name":"\u0645\u0634\u0627\u0639\u0631 \u0631\u0642\u064a\u0642\u0629","id":563678465,"id_str":"563678465","indices":[66,74]},{"screen_name":"mooory_12","name":"\u0645\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0640\u0648\u0631\u064a\u2022\u2022","id":1115574385,"id_str":"1115574385","indices":[75,85]},{"screen_name":"_ohooood","name":"\u0627\u062c\u0645\u0640\u0640\u0644 \u0627\u0646\u0633\u0640\u0627\u0646\u0647 ","id":776923471,"id_str":"776923471","indices":[86,95]},{"screen_name":"lolo_alsahli","name":"\u0644\u0648\u0644\u0648 \u0627\u0644\u0633\u0647\u0644\u064a 5\/5","id":1379340200,"id_str":"1379340200","indices":[96,109]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234386653185,"id_str":"365611234386653185","text":"RT @p0larfantasy: So tired through the day but cant sleep at night, why \ud83d\ude10","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1347552631,"id_str":"1347552631","name":"AEI","screen_name":"abdullahesmael_","location":"","url":null,"description":"Go fuck 
yourself","protected":false,"followers_count":277,"friends_count":293,"listed_count":0,"created_at":"Fri Apr 12 19:15:59 +0000 2013","favourites_count":81,"utc_offset":28800,"time_zone":"Beijing","geo_enabled":true,"verified":false,"statuses_count":2998,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262067512\/a99129ee82c7ff74ecd07617f74e2e8f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262067512\/a99129ee82c7ff74ecd07617f74e2e8f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1347552631\/1372224078","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 18:40:48 +0000 2013","id":365543130600177664,"id_str":"365543130600177664","text":"So tired through the day but cant sleep at night, why \ud83d\ude10","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":283890566,"id_str":"283890566","name":"Ayuni Bazila","screen_name":"p0larfantasy","location":"","url":null,"description":"My name is Ayuni but you can call me 
Ayuni","protected":false,"followers_count":318,"friends_count":65,"listed_count":1,"created_at":"Mon Apr 18 06:33:17 +0000 2011","favourites_count":511,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":8590,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/344918034410181363\/69d8f241b686ad7f25fed041478ebd69.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/344918034410181363\/69d8f241b686ad7f25fed041478ebd69.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000185980762\/d115390438f7e96f317eaf113ba50814_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000185980762\/d115390438f7e96f317eaf113ba50814_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/283890566\/1370006945","profile_link_color":"9412EB","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F06EB3","profile_text_color":"F7EE4D","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"p0larfantasy","name":"Ayuni Bazila","id":283890566,"id_str":"283890566","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234378256384,"id_str":"365611234378256384","text":"@rookey_1004 \u3075\u3041\u3044\u3068\u3067\u3059\u30fc\u30fc\uff01\uff01","source":"\u003ca 
href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610741358800896,"in_reply_to_status_id_str":"365610741358800896","in_reply_to_user_id":7713092,"in_reply_to_user_id_str":"7713092","in_reply_to_screen_name":"rookey_1004","user":{"id":82890490,"id_str":"82890490","name":"T_K_7","screen_name":"T_K_7","location":"\u795e\u5948\u5ddd\u770c","url":"http:\/\/iddy.jp\/profile\/T_K_7\/","description":"S.C.NANA NET\/\u4f0a\u96c6\u9662\u5149\/ATLUS\/\u30a2\u30cb\u30b9\u30d1\/\u65e5\u5e38\u306e\u622f\u8a00\/\u795e\u5948\u5ddd\u770c\u6c11\/\u8d77\u5e8a\u6642\u306b\u5948\u3005\u3055\u3093\u95a2\u9023\u306e\u30ab\u30a6\u30f3\u30c8\u30c0\u30a6\u30f3\u3092post\u3057\u307e\u3059\u3002\u81ea\u5206\u304c\u53c2\u6226\u3057\u306a\u3044LIVE\u7b49\u306f\u30b9\u30eb\u30fc\u3057\u307e\u3059\u306e\u3067\u3054\u6ce8\u610f\u304f\u3060\u3055\u3044\uff57","protected":false,"followers_count":889,"friends_count":1116,"listed_count":126,"created_at":"Fri Oct 16 14:24:55 +0000 
2009","favourites_count":12,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":38969,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/63172000\/picnenga.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/63172000\/picnenga.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/557650554\/p-image_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/557650554\/p-image_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"rookey_1004","name":"\u30eb\u30a5KEY","id":7713092,"id_str":"7713092","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390847488,"id_str":"365611234390847488","text":"@yosied pagi juga yosie;D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365609398011305985,"in_reply_to_status_id_str":"365609398011305985","in_reply_to_user_id":243971030,"in_reply_to_user_id_str":"243971030","in_reply_to_screen_name":"yosied","user":{"id":544167253,"id_str":"544167253","name":"Sita Saomi","screen_name":"SITASM","location":"","url":null,"description":"Ig\/path: 
sitaasm\/sitasm","protected":false,"followers_count":1237,"friends_count":899,"listed_count":2,"created_at":"Tue Apr 03 10:38:23 +0000 2012","favourites_count":10,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":true,"verified":false,"statuses_count":63189,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000002358057\/9acc78c3f497a297d9a76e33e62e0346.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000002358057\/9acc78c3f497a297d9a76e33e62e0346.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000229645453\/5dc148615f7d730c25abb612febee492_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000229645453\/5dc148615f7d730c25abb612febee492_normal.jpeg","profile_link_color":"25F013","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"yosied","name":"Yosie Dwiana","id":243971030,"id_str":"243971030","indices":[0,7]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234361475072,"id_str":"365611234361475072","text":"RT @kismyft2xxxm: \"@marubuta121: \u4ffa\u8db3\u65cf\u306e\u307f\u306a\u3055\u30fc\u3093\uff01\n\u30ad\u30b9\u30de\u30a4\u3092 
\u4e0b\u5442\u6271\u3044\u3057\u3066\u308b\u5973\u304c\u3044\u307e\u30fc\u3059\uff01\n800RT\u3044\u3063\u305f\u3089\u571f\u4e0b\u5ea7\u5199\u30e1\u3092Twitter\u306b\u8f09\u305b\u3066\u304f\u308c\u308b\u305d\u3046\u3067\u30fc\u3059\uff01\n\u3054\u5354\u529b\u3088\u308d\u3057\u304f\uff01\n\uff03\u30e0\u30ab\u3064\u3044\u305f\u4eba\u308eRT http:\/\/t.co\/tKKa6o\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1384232198,"id_str":"1384232198","name":"\u304a\u3061\u3042\u3044\u306a\u306a\u304b","screen_name":"boku7733","location":"","url":null,"description":"\u3068\u3073\u3063\u5b50\u3001\u30bb\u30af\u30ac\u30ebfollow me \u30de\u30ea\u30a6\u30b9\u4e16\u4ee3***JFF8\u670828\u65e5\u53c2\u6226\u2606","protected":false,"followers_count":615,"friends_count":703,"listed_count":1,"created_at":"Sat Apr 27 11:35:10 +0000 
2013","favourites_count":4,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3100,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257079904\/653775551fd83c7d7f7184cfcf799df5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257079904\/653775551fd83c7d7f7184cfcf799df5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1384232198\/1375950709","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 14:21:25 +0000 2013","id":365115466257154048,"id_str":"365115466257154048","text":"\"@marubuta121: \u4ffa\u8db3\u65cf\u306e\u307f\u306a\u3055\u30fc\u3093\uff01\n\u30ad\u30b9\u30de\u30a4\u3092 \u4e0b\u5442\u6271\u3044\u3057\u3066\u308b\u5973\u304c\u3044\u307e\u30fc\u3059\uff01\n800RT\u3044\u3063\u305f\u3089\u571f\u4e0b\u5ea7\u5199\u30e1\u3092Twitter\u306b\u8f09\u305b\u3066\u304f\u308c\u308b\u305d\u3046\u3067\u30fc\u3059\uff01\n\u3054\u5354\u529b\u3088\u308d\u3057\u304f\uff01\n\uff03\u30e0\u30ab\u3064\u3044\u305f\u4eba\u308eRT http:\/\/t.co\/tKKa6ott2n\"\n\n\u610f\u5473\u308f\u304b\u3089\u3093\u3002","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":224979189,"id_str":"224979189","name":"\u305f\u3041\u307f\u3044\uff0a\u3055\u308a\u30fc","screen_name":"kismyft2xxxm","location":"\u5175\u5eab\u770c","url":null,"description":"\uff0a\u591a\u7530\u5e7c\u2192\u591a\u7530\u6771\u2192\u591a\u7530\u4e2d\u2192\u5317\u96752\u5e74(30\u56de\u751f)\uff0a\u5439\u594f\u697d\u2026\u4e2d:T.Sax(\uff7e\uff99\uff8f\uff70\u2161) \u9ad8:A.Sax(\uff94\uff8f\uff8a62)\uff0a\u85e4\u30f6\u8c37 \/ \u7389\u68ee \/ \u795e\u5bae\u5bfa\uff0a","protected":false,"followers_count":236,"friends_count":345,"listed_count":0,"created_at":"Fri Dec 10 10:24:53 +0000 2010","favourites_count":40,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":1361,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/736832350\/dc0aa3753763a3eb626180df682818b9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/736832350\/dc0aa3753763a3eb626180df682818b9.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000067089407\/71d4bb84b67dd307c72140be9379805d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000067089407\/71d4bb84b67dd307c72140be9379805d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/224979189\/1369226174","profile_link_color":"0084B4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":
null,"contributors":null,"retweet_count":433,"entities":{"hashtags":[{"text":"\u30e0\u30ab\u3064\u3044\u305f\u4eba\u308eRT","indices":[92,102]}],"urls":[],"user_mentions":[{"screen_name":"marubuta121","name":"\u3042\u3084\u308a\u30ef\u30be\u30a6\u30b9\u30ad\u30fc","id":1420694521,"id_str":"1420694521","indices":[1,13]}],"media":[{"id":365071990505234432,"id_str":"365071990505234432","indices":[103,125],"media_url":"http:\/\/pbs.twimg.com\/media\/BRD_CoeCYAAMTX9.png","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRD_CoeCYAAMTX9.png","url":"http:\/\/t.co\/tKKa6ott2n","display_url":"pic.twitter.com\/tKKa6ott2n","expanded_url":"http:\/\/twitter.com\/marubuta121\/status\/365071990496845824\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":605,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":480,"h":854,"resize":"fit"},"medium":{"w":480,"h":854,"resize":"fit"}},"source_status_id":365071990496845824,"source_status_id_str":"365071990496845824"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"ja"},"retweet_count":0,"entities":{"hashtags":[{"text":"\u30e0\u30ab\u3064\u3044\u305f\u4eba\u308eRT","indices":[110,120]}],"urls":[],"user_mentions":[{"screen_name":"kismyft2xxxm","name":"\u305f\u3041\u307f\u3044\uff0a\u3055\u308a\u30fc","id":224979189,"id_str":"224979189","indices":[3,16]},{"screen_name":"marubuta121","name":"\u3042\u3084\u308a\u30ef\u30be\u30a6\u30b9\u30ad\u30fc","id":1420694521,"id_str":"1420694521","indices":[19,31]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234399227906,"id_str":"365611234399227906","text":"@dixiechickidie @pcdunham We need to have a frank discussion about humidity, Allie.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610218400382977,"in_reply_to_status_id_str":"365610218400382977","in_reply_to_user_id":36385658,"in_reply_to_user_id_str":"36385658","in_reply_to_screen_name":"dixiechickidie","user":{"id":561336440,"id_str":"561336440","name":"Matthew","screen_name":"Matterless","location":"","url":"http:\/\/matterless.tumblr.com","description":"Mostly poison.","protected":false,"followers_count":442,"friends_count":474,"listed_count":10,"created_at":"Mon Apr 23 17:49:22 +0000 2012","favourites_count":11452,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":12269,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044860925\/54aa6eef0cd261ff13336fdb43122262.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044860925\/54aa6eef0cd261ff13336fdb43122262.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3402669649\/7efba5357594c54c08b2ad5bc9d45d46_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3402669649\/7efba5357594c54c08b2ad5bc9d45d46_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/561336440\/1361309757","profile_link_color":"2FC2EF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"dixiechickidie","name":"Allie 
Seago","id":36385658,"id_str":"36385658","indices":[0,15]},{"screen_name":"pcdunham","name":"PCD","id":26377458,"id_str":"26377458","indices":[16,25]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234374066177,"id_str":"365611234374066177","text":"@sawyerrr_ http:\/\/t.co\/ktIR4FZvpT","source":"web","truncated":false,"in_reply_to_status_id":365609403124170752,"in_reply_to_status_id_str":"365609403124170752","in_reply_to_user_id":358268825,"in_reply_to_user_id_str":"358268825","in_reply_to_screen_name":"sawyerrr_","user":{"id":1346808324,"id_str":"1346808324","name":"Monika","screen_name":"themortalswords","location":"","url":"http:\/\/thedaughterofvalentine.tumblr.com","description":"The only thing we're allowed to do is to believe that we won't regret the choice we made.","protected":false,"followers_count":92,"friends_count":144,"listed_count":0,"created_at":"Fri Apr 12 13:22:11 +0000 2013","favourites_count":304,"utc_offset":7200,"time_zone":"Warsaw","geo_enabled":true,"verified":false,"statuses_count":22524,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035911617\/764ffeeeef97111890ee554b01ab79f7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035911617\/764ffeeeef97111890ee554b01ab79f7.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259737565\/c611f7cd39ccd420296ac84921fbae46_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259737565\/c611f7cd39ccd420296ac84921fbae46_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1346808324\/1373748387","profile_link_color":"141414","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_t
ext_color":"0084B4","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/ktIR4FZvpT","expanded_url":"http:\/\/youtu.be\/AnXjDGcIAj4","display_url":"youtu.be\/AnXjDGcIAj4","indices":[11,33]}],"user_mentions":[{"screen_name":"sawyerrr_","name":"Andrzejak","id":358268825,"id_str":"358268825","indices":[0,10]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390843393,"id_str":"365611234390843393","text":"@selmapjetrovic @timmyinheaven oh god! I'll have to hear him every day!! \ud83d\ude2b\ud83d\ude09","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610623159119872,"in_reply_to_status_id_str":"365610623159119872","in_reply_to_user_id":334690852,"in_reply_to_user_id_str":"334690852","in_reply_to_screen_name":"selmapjetrovic","user":{"id":367093609,"id_str":"367093609","name":"Ash Rouse","screen_name":"AWRMAH","location":"Heart's in Eastbourne, UK.","url":null,"description":"The semi pink haired raunchy British child\/mediocre guitarist and singer in @MAHbandX and guitar tech for @ThisIsAllNowNY \nInstragram: AWRMAH","protected":false,"followers_count":1016,"friends_count":1888,"listed_count":1,"created_at":"Sat Sep 03 11:10:10 +0000 2011","favourites_count":1645,"utc_offset":-18000,"time_zone":"Mexico 
City","geo_enabled":true,"verified":false,"statuses_count":18082,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/762846831\/51db368807816e710a64e1f250fe4738.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/762846831\/51db368807816e710a64e1f250fe4738.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259410106\/213c41588010f2d19572d70e1fb66bd5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259410106\/213c41588010f2d19572d70e1fb66bd5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/367093609\/1373314699","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"selmapjetrovic","name":"Selma Pjetrovic","id":334690852,"id_str":"334690852","indices":[0,15]},{"screen_name":"timmyinheaven","name":"Timmy Rasmussen","id":88340317,"id_str":"88340317","indices":[16,30]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234395029504,"id_str":"365611234395029504","text":"@CarlosValero08 yo mas bien sobrao a maduro ya no creo en p\u00e1jaros con sida","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610527495434240,"in_reply_to_status_id_str":"365610527495434240","in_reply_to_user_id":159260410,"in_reply_to_user_id_str":"159260410","in_reply_to_screen_name":"CarlosValero08","user":{"id":87982325,"id_str":"87982325","name":" =\u00a11T4L4 R0M3R0?=","screen_name":"italaromero","location":"venezuela","url":null,"description":"vieja pero segura de lo que quiero y due\u00f1a de lo que pienso,ni la muerte me hara cambiar","protected":false,"followers_count":1792,"friends_count":1875,"listed_count":4,"created_at":"Fri Nov 06 16:58:40 +0000 2009","favourites_count":93,"utc_offset":-16200,"time_zone":"Caracas","geo_enabled":true,"verified":false,"statuses_count":9675,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/889057919\/c16572b2fda5dc728a290de20d599eca.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/889057919\/c16572b2fda5dc728a290de20d599eca.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3722668300\/39f590b4ba25348dd9f0b6fa3bea33b7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3722668300\/39f590b4ba25348dd9f0b6fa3bea33b7_normal.jpeg","profile_link_color":"0099CC","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"F6FFD1","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CarlosValero08","name":"Carlos 
Valero","id":159260410,"id_str":"159260410","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234369875969,"id_str":"365611234369875969","text":"@OswinsSouffles *takes her hand and leads her onto the floor*","source":"\u003ca href=\"http:\/\/www.osfoora.com\" rel=\"nofollow\"\u003eOsfoora for iOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611100433162240,"in_reply_to_status_id_str":"365611100433162240","in_reply_to_user_id":1575620900,"in_reply_to_user_id_str":"1575620900","in_reply_to_screen_name":"OswinsSouffles","user":{"id":542796629,"id_str":"542796629","name":"Theta","screen_name":"TeenageDoc","location":"The Academy Of Gallifrey","url":null,"description":"Live while we're young, right? I'm 219 and life is good. Cutting class, hanging with friends and just having fun! That's teens for you. (RP Young @DocWithABox)","protected":false,"followers_count":391,"friends_count":186,"listed_count":0,"created_at":"Sun Apr 01 19:36:35 +0000 
2012","favourites_count":19,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":30144,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/787240650\/ab10f200aca16101446b9052b7b5e37e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/787240650\/ab10f200aca16101446b9052b7b5e37e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3751768105\/04893e09ad2189b9c49c845a6c38ee33_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3751768105\/04893e09ad2189b9c49c845a6c38ee33_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/542796629\/1364071298","profile_link_color":"0084B4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"OswinsSouffles","name":"Oswin-Claire Oswald","id":1575620900,"id_str":"1575620900","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390843394,"id_str":"365611234390843394","text":"http:\/\/t.co\/KGDZkE2m79 \u0448\u043a\u043e\u043b\u044c\u043d\u0430\u044f \u043a\u043d\u0438\u0433\u0430 10 \u043a\u043b\u0430\u0441\u0441 http:\/\/t.co\/itpBOArWbO \u0447\u0435\u0440\u0442\u0435\u0436 \u0441\u043d\u0435\u0433\u043e\u0443\u0431\u043e\u0440\u043e\u0447\u043d\u043e\u0439 
\u043b\u0430\u043f\u0430\u0442\u044b","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":159574908,"id_str":"159574908","name":"Jordan","screen_name":"bereg0ff","location":"","url":null,"description":null,"protected":false,"followers_count":1,"friends_count":3,"listed_count":0,"created_at":"Fri Jun 25 19:27:02 +0000 2010","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2659,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_3_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_3_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/KGDZkE2m79","expanded_url":"http:\/\/aey.acba.in.ua\/cat4\/torrent-1933.html","display_url":"aey.acba.in.ua\/cat4\/torrent-1\u2026","indices":[0,22]},{"url":"http:\/\/t.co\/itpBOArWbO","expanded_url":"http:\/\/aey.acba.in.ua\/cat4\/torrent-1936.html","display_url":"aey.acba.in.ua\/cat4\/torrent-1\u2026","indices":[47,69]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ru"} 
+{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390835200,"id_str":"365611234390835200","text":"@ray_g10 you still wanna play?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365582316531236864,"in_reply_to_status_id_str":"365582316531236864","in_reply_to_user_id":597188613,"in_reply_to_user_id_str":"597188613","in_reply_to_screen_name":"NathanColeDoee_","user":{"id":597188613,"id_str":"597188613","name":"NCole.","screen_name":"NathanColeDoee_","location":"Sunny California\u2600","url":null,"description":"Promise to always give you me , the real me. http:\/\/Ask.fm\/NathanColee","protected":false,"followers_count":434,"friends_count":350,"listed_count":1,"created_at":"Sat Jun 02 05:11:59 +0000 2012","favourites_count":1316,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4665,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256602879\/389d856f13bb2a0dfe3fc69df13785d8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256602879\/389d856f13bb2a0dfe3fc69df13785d8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/597188613\/1373686355","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entiti
es":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ray_g10","name":"R\u00e1ynaldo","id":213106734,"id_str":"213106734","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234382446593,"id_str":"365611234382446593","text":"How To Install LED Backup Reverse Lights For Your Car? http:\/\/t.co\/9ZASrRGXme","source":"\u003ca href=\"http:\/\/tweetadder.com\" rel=\"nofollow\"\u003eTweetAdder v4\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":635029552,"id_str":"635029552","name":"Car & Driver","screen_name":"CSC919","location":"USA","url":null,"description":"All about automotive issues.","protected":false,"followers_count":11410,"friends_count":4513,"listed_count":62,"created_at":"Sat Jul 14 02:03:07 +0000 2012","favourites_count":1,"utc_offset":28800,"time_zone":"Beijing","geo_enabled":false,"verified":false,"statuses_count":8826,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2411646281\/7w9aa1h9zs4cljbt0c0u_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2411646281\/7w9aa1h9zs4cljbt0c0u_normal.jpeg","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count
":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/9ZASrRGXme","expanded_url":"http:\/\/ezinearticles.com\/7596229","display_url":"ezinearticles.com\/7596229","indices":[56,78]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234390859776,"id_str":"365611234390859776","text":"Te tiro el dato. http:\/\/t.co\/4x30KpW3rM","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":224080364,"id_str":"224080364","name":"Dear \u2020","screen_name":"adizalmon","location":"s t a t e of t r a n c e","url":null,"description":"Espero alegre la salida y espero no volver jam\u00e1s.","protected":false,"followers_count":358,"friends_count":274,"listed_count":2,"created_at":"Wed Dec 08 02:57:10 +0000 2010","favourites_count":430,"utc_offset":-18000,"time_zone":"Mexico 
City","geo_enabled":true,"verified":false,"statuses_count":23314,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"42AD98","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000018585635\/7f7d3da88738031ebf2e11f76baa04f4.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000018585635\/7f7d3da88738031ebf2e11f76baa04f4.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261430547\/ab3afce6d2b69324dc8ce522e0fa136e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261430547\/ab3afce6d2b69324dc8ce522e0fa136e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/224080364\/1375991867","profile_link_color":"F52377","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000B17","profile_text_color":"448668","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365611234395054080,"id_str":"365611234395054080","indices":[17,39],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpewgCYAA104T.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpewgCYAA104T.jpg","url":"http:\/\/t.co\/4x30KpW3rM","display_url":"pic.twitter.com\/4x30KpW3rM","expanded_url":"http:\/\/twitter.com\/adizalmon\/status\/365611234390859776\/photo\/1","type":"photo","sizes":{"medium":{"w":500,"h":226,"resize":"fit"},"small":{"w":340,"h":154,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":500,"h":226,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:25 +0000 
2013","id":365611234390863872,"id_str":"365611234390863872","text":"\uff3e\uff3e http:\/\/t.co\/f20zfmo5Cr #pupe http:\/\/t.co\/3O0M4z2nDI","source":"\u003ca href=\"http:\/\/pupe.ameba.jp\/\" rel=\"nofollow\"\u003epoupeegirl\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1379359908,"id_str":"1379359908","name":"\u30a2\u30d5\u30ed\u30a2\u30eb\u30d1\u30ab","screen_name":"ahuroarupaka","location":"","url":null,"description":null,"protected":false,"followers_count":4,"friends_count":6,"listed_count":0,"created_at":"Thu Apr 25 12:42:20 +0000 2013","favourites_count":42,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":59,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3572946264\/fdcd44629368a5cbbe000d5d49963038_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3572946264\/fdcd44629368a5cbbe000d5d49963038_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"pupe","indices":[26,31]}],"urls":[{"url":"http:\/\/t.co\/f20zfmo5Cr","expanded_url":"http:\/\/pupe.ameba.jp\/profile\/YP-hytpPxWxk","display_url":"pupe.ameba.jp\/profile\/YP-hyt\u2026","indices":[3,25]}],"user_mentio
ns":[],"media":[{"id":365611234399252480,"id_str":"365611234399252480","indices":[32,54],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpewhCcAAzv_N.png","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpewhCcAAzv_N.png","url":"http:\/\/t.co\/3O0M4z2nDI","display_url":"pic.twitter.com\/3O0M4z2nDI","expanded_url":"http:\/\/twitter.com\/ahuroarupaka\/status\/365611234390863872\/photo\/1","type":"photo","sizes":{"small":{"w":100,"h":280,"resize":"fit"},"thumb":{"w":100,"h":150,"resize":"crop"},"large":{"w":100,"h":280,"resize":"fit"},"medium":{"w":100,"h":280,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234361483264,"id_str":"365611234361483264","text":"yal\u0131nla e\u011fleniyoruz jdfbdjfbdfjbg :**** http:\/\/t.co\/o3pC5hXILW","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1420687838,"id_str":"1420687838","name":"SELOOO\u011e\u011e :d","screen_name":"Sellooooooooooo","location":"","url":null,"description":"Dans etmek i\u00e7in yarat\u0131lm\u0131\u015f bir varl\u0131\u011f\u0131m :** \u221eFENERBAH\u00c7E'M !!","protected":false,"followers_count":409,"friends_count":331,"listed_count":0,"created_at":"Sat May 11 13:45:37 +0000 
2013","favourites_count":635,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":1426,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042679284\/85c4bf1ad42f2c737629d74a2fd5edac.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042679284\/85c4bf1ad42f2c737629d74a2fd5edac.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259357400\/6320f7691b6042b4d251a79ed0062409_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259357400\/6320f7691b6042b4d251a79ed0062409_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1420687838\/1374932161","profile_link_color":"93A644","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365611234365677571,"id_str":"365611234365677571","indices":[40,62],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpewZCIAMtdsC.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpewZCIAMtdsC.jpg","url":"http:\/\/t.co\/o3pC5hXILW","display_url":"pic.twitter.com\/o3pC5hXILW","expanded_url":"http:\/\/twitter.com\/Sellooooooooooo\/status\/365611234361483264\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium",
"lang":"tr"} +{"created_at":"Thu Aug 08 23:11:25 +0000 2013","id":365611234386653186,"id_str":"365611234386653186","text":"- @mikeymuzik - Sorry. Fam.","source":"web","truncated":false,"in_reply_to_status_id":365611065263931395,"in_reply_to_status_id_str":"365611065263931395","in_reply_to_user_id":192310725,"in_reply_to_user_id_str":"192310725","in_reply_to_screen_name":"mikeymuzik","user":{"id":25499021,"id_str":"25499021","name":"TINYMAN","screen_name":"TINYMANMUSIC","location":"The Orphanage ","url":"http:\/\/www.tinymanmusic.com","description":"I'm Underground But Still Getting Good Reception....Amen. #ORPHGANG tinymanmusic@hotmail.co.uk","protected":false,"followers_count":2222,"friends_count":986,"listed_count":20,"created_at":"Fri Mar 20 13:00:30 +0000 2009","favourites_count":1733,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":136557,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"89014D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/885994029\/bb46c5591dcf88115fa2df645dc46bb2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/885994029\/bb46c5591dcf88115fa2df645dc46bb2.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000234437207\/afba9ead2b628fe90278cdb4a2239323_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000234437207\/afba9ead2b628fe90278cdb4a2239323_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/25499021\/1374942485","profile_link_color":"E819BF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFF03","profile_text_color":"BC19FC","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors"
:null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"mikeymuzik","name":"Mikey","id":192310725,"id_str":"192310725","indices":[2,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238564573184,"id_str":"365611238564573184","text":"RT @mrbelding: @CMPunk I hope you are enjoying where you are, what you are doing, and who you are doing it with! God knows you deserve it! \u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":485824527,"id_str":"485824527","name":"~ BITW\u10e6 21-00 \u2764","screen_name":"Taker_Sylvester","location":"France","url":null,"description":"I'm a french fan of #Undertaker \/ #HBK \/ #CMPunk \/ #RandyOrton \/ #TheRock & #JohnCena *0* and if you do not agree with that I have two words for Ya #SUCKIT \u10e6","protected":false,"followers_count":515,"friends_count":862,"listed_count":0,"created_at":"Tue Feb 07 16:29:43 +0000 
2012","favourites_count":16,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":9425,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/800896934\/73f204eb612ed493fc763f46eecccd64.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/800896934\/73f204eb612ed493fc763f46eecccd64.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261780216\/5cfafe0c53089afd209f029579573c7e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261780216\/5cfafe0c53089afd209f029579573c7e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/485824527\/1376002860","profile_link_color":"9B00B3","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sat May 25 03:26:27 +0000 2013","id":338133937945710593,"id_str":"338133937945710593","text":"@CMPunk I hope you are enjoying where you are, what you are doing, and who you are doing it with! God knows you deserve it! 
#straightedge :)","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":177345928,"in_reply_to_user_id_str":"177345928","in_reply_to_screen_name":"CMPunk","user":{"id":26910839,"id_str":"26910839","name":"Dennis Haskins","screen_name":"mrbelding","location":"Los Angeles","url":"http:\/\/twitter.com\/dennis_haskins","description":"Still chasing the dream!","protected":false,"followers_count":28852,"friends_count":2107,"listed_count":310,"created_at":"Fri Mar 27 01:16:33 +0000 2009","favourites_count":7,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":true,"statuses_count":12908,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/40631885\/twitterbg11.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/40631885\/twitterbg11.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/434350760\/200crop_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/434350760\/200crop_normal.jpg","profile_link_color":"793AA6","profile_sidebar_border_color":"F4FF21","profile_sidebar_fill_color":"FBFF19","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":280,"entities":{"hashtags":[{"text":"straightedge","indices":[124,137]}],"urls":[],"user_mentions":[{"screen_name":"CMPunk","name":"CM 
Punk","id":177345928,"id_str":"177345928","indices":[0,7]}]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"mrbelding","name":"Dennis Haskins","id":26910839,"id_str":"26910839","indices":[3,13]},{"screen_name":"CMPunk","name":"CM Punk","id":177345928,"id_str":"177345928","indices":[15,22]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555791360,"id_str":"365611238555791360","text":"Me pica la heridaaaaaaaa TT","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":78300059,"id_str":"78300059","name":"BeleniKate","screen_name":"belenica98","location":"Hogwarts","url":null,"description":"Otaku\/Parawhore\/Shawol\/Taemint\/Locket\/Potterhead\/KH\r\n Actualmente en Hogwarts, un robot twitea por mi.\r\nA.K.A: Sharen98100.\r\nMi nii-chan @neandromeda :3","protected":false,"followers_count":191,"friends_count":280,"listed_count":2,"created_at":"Tue Sep 29 12:57:24 +0000 
2009","favourites_count":1707,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":false,"verified":false,"statuses_count":12953,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF2BF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000015843105\/65ca9feee0c3c500d89d099d578e7a5b.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000015843105\/65ca9feee0c3c500d89d099d578e7a5b.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000191508471\/d586dbd6cf4e97470c72b138472a3d69_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000191508471\/d586dbd6cf4e97470c72b138472a3d69_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/78300059\/1372965792","profile_link_color":"FF5900","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"485F57","profile_text_color":"BCC491","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555791361,"id_str":"365611238555791361","text":"\ud83d\udc95\ud83d\udc36\ud83d\udc95\ud83d\udc36\ud83d\udc95\ud83d\udc36\ud83d\udc95\ud83d\udc36@\nCalum5SOS What's your all time fav song? \nPlease answer love you! 
xxx\n\ud83d\udc95\ud83d\udc36\ud83d\udc95\ud83d\udc36\ud83d\udc95\ud83d\udc36\ud83d\udc95\ud83d\udc36x11","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":814239114,"id_str":"814239114","name":"IM BACK NIGGAS","screen_name":"Styles06Lexi","location":"Different Dicks Every Night","url":null,"description":"Im a gamer. 5 seconds of winter and The direction are my fav. Me and Niall were born on the same day.","protected":false,"followers_count":326,"friends_count":166,"listed_count":2,"created_at":"Mon Sep 10 01:36:59 +0000 2012","favourites_count":2509,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":3688,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000029043630\/591b72939327e4033f60311243464005.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000029043630\/591b72939327e4033f60311243464005.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259063912\/d1da1d01c54a99fe228b70865da55cd4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259063912\/d1da1d01c54a99fe228b70865da55cd4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/814239114\/1375347750","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":nu
ll,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238559989760,"id_str":"365611238559989760","text":"Gmorning","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":223475402,"id_str":"223475402","name":"Ade kurniawan","screen_name":"adee231","location":"Team Gaduh | Jakarta ","url":null,"description":"Free in this world | berujang tanpa batas | INDIVIDU MERDEKA \\m\/","protected":false,"followers_count":374,"friends_count":248,"listed_count":0,"created_at":"Mon Dec 06 14:04:56 +0000 2010","favourites_count":5,"utc_offset":-25200,"time_zone":"Pacific Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":17710,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000019838930\/435ff85633dc1eee957da2122e8bffdd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000019838930\/435ff85633dc1eee957da2122e8bffdd.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000249547817\/e5d8463475dd02807bcaf5890b666416_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000249547817\/e5d8463475dd02807bcaf5890b666416_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/223475402\/1375599093","profile_link_color":"3C7A91","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576754689,"id_str":"365611238576754689","text":"\u304d\u3083\u3074\u308b\u30fc\u3093\u2605\u30a6\u30b6\u30b7\u30ad\u53ea\u4eca\u53c2\u4e0a\uff01","source":"\u003ca href=\"http:\/\/twittbot.net\/\" 
rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":628365876,"id_str":"628365876","name":"\u30a6\u30b6\u30b7\u30ad","screen_name":"uza_shiki_bot","location":"\u30a4\u30ed\u306e\u96a3","url":null,"description":"\u30a6\u30b6\u30b7\u30adbot\u3067\u3059\u3002\u30a4\u30ed\u5ec3\u904e\u304e\u3066\u30a6\u30b6\u3044\u3053\u3068\u3057\u304b\u8a00\u3044\u307e\u305b\u3093\u3002\u30a2\u30a4\u30b3\u30f3\u306f\u30a4\u30b1\u30e1\u30f3\u306a\u3057\u30fc\u304f\u3093\u3067\u3059\u3002\u3054\u610f\u898b\u3001\u4e0d\u5177\u5408\u7b49\u306f\u7ba1\u7406\u4eba\u307e\u3067@t_clown_w","protected":false,"followers_count":87,"friends_count":77,"listed_count":1,"created_at":"Fri Jul 06 11:15:35 +0000 2012","favourites_count":1,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":6377,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2409277634\/icon7767195107460254422619821197_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2409277634\/icon7767195107460254422619821197_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retwe
eted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238572556288,"id_str":"365611238572556288","text":"Eu s\u00f3 posso ta ficando louca...","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":408536096,"id_str":"408536096","name":"Renata Salles","screen_name":"reesalles","location":"","url":"http:\/\/3words4-you.tumblr.com","description":null,"protected":false,"followers_count":101,"friends_count":44,"listed_count":0,"created_at":"Wed Nov 09 15:11:02 +0000 2011","favourites_count":206,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":4257,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/709251751\/f259d5e1e4893e521926d8eb5faa5939.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/709251751\/f259d5e1e4893e521926d8eb5faa5939.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3002271624\/33f528d3c7347a64cbd6cd41b6251848_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3002271624\/33f528d3c7347a64cbd6cd41b6251848_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/408536096\/1352685191","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filte
r_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238568374274,"id_str":"365611238568374274","text":"At the fair getting ready to watch the derby","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":549604232,"id_str":"549604232","name":"Alexandra Williams","screen_name":"AlexandraJ2012","location":"ohio","url":null,"description":"small town country girl. loves to laugh. #TeamSingle #Pisces.","protected":false,"followers_count":368,"friends_count":746,"listed_count":0,"created_at":"Mon Apr 09 22:39:32 +0000 2012","favourites_count":1558,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":5008,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000140349140\/086e14bde4a6d05678a1f6004e4626cc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000140349140\/086e14bde4a6d05678a1f6004e4626cc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/549604232\/1357267872","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"
user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238568374275,"id_str":"365611238568374275","text":"y ahora que paso ? :(","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":815477112,"id_str":"815477112","name":"M\u2665","screen_name":"Flopybarrientos","location":"","url":null,"description":null,"protected":false,"followers_count":156,"friends_count":188,"listed_count":0,"created_at":"Mon Sep 10 15:46:31 +0000 2012","favourites_count":347,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":4946,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"010D0F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041897433\/f3debe8da3fead662909c4f810aef01d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041897433\/f3debe8da3fead662909c4f810aef01d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000221777777\/d87bd2d08f3b990b5914e4beac9fc300_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000221777777\/d87bd2d08f3b990b5914e4beac9fc300_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/815477112\/1375058537","profile_link_color":"F21111","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"99CC33","profile_text_color":"3E4415","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorit
ed":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238564175873,"id_str":"365611238564175873","text":"\u3010PCMAX\u3011\u2460\uff11\uff10\u5e74\u4ee5\u4e0a\u306e\u904b\u55b6\u5b9f\u7e3e\u304c\u3042\u308b\u2461\u4f1a\u54e1\u6570\uff14\uff10\uff10\u4e07\u4eba\u2462\u4e07\u5168\u306e\u30b5\u30dd\u30fc\u30c8\u4f53\u5236\u2463\u30b5\u30af\u30e9\u30fb\u30e4\u30e9\u30bb\u4e0d\u4f7f\u7528\n\u2464\u5927\u624b\u306a\u306e\u3067\u3001\u4fe1\u983c\u611f\u304c\u9055\u3046\u2465\u30dd\u30a4\u30f3\u30c8\u5236\u3067\u5b8c\u5168\u524d\u6255\u3044\u5236\n\u25cehttp:\/\/t.co\/7eETgM1RGz(\u767b\u9332\u7121\u6599)","source":"\u003ca href=\"http:\/\/www.twisuke.com\" rel=\"nofollow\"\u003e\u30c4\u30a4\u52a9\u3002\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1646337626,"id_str":"1646337626","name":"\u5f7c\u5973\u3053\u305dHde\u7f8e\u4eba\u3060\u306d\uff01\u78ba\u304b\u3081\u305f\uff01","screen_name":"icp4619","location":"\u3010\u76f8\u4e92\u30d5\u30a9\u30ed\u30fc\u5927\u6b53\u8fce\uff01\u3011","url":null,"description":"\u4eba\u306f\u5f7c\u5973\u3092H\u7f8e\u4eba\u3068\u8a00\u3046\u3051\u308c\u3069\u3002\u3002\u3002","protected":false,"followers_count":26,"friends_count":31,"listed_count":0,"created_at":"Sun Aug 04 23:05:42 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1924,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000245269833\/486bf72b6b373044dfbe371a32f2a2cd_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000245269833\/486bf72b6b373044dfbe371a32f2a2cd_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/7eETgM1RGz","expanded_url":"http:\/\/pcmax.jp\/?ad_id=rm190318","display_url":"pcmax.jp\/?ad_id=rm190318","indices":[80,102]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238572560384,"id_str":"365611238572560384","text":"T\u00e1 todo mundo aqui em casa com problema na perna , at\u00e9 eu .. 
estou mancando","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":396976176,"id_str":"396976176","name":"Brenda Helen .","screen_name":"BrendinhaHelen","location":"","url":null,"description":null,"protected":false,"followers_count":271,"friends_count":152,"listed_count":0,"created_at":"Mon Oct 24 01:57:27 +0000 2011","favourites_count":324,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":8048,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"352726","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/879504585\/f49b2462251383a15fc286fccaa06659.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/879504585\/f49b2462251383a15fc286fccaa06659.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3774135009\/769df639535b6a94d320d8b942136a20_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3774135009\/769df639535b6a94d320d8b942136a20_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/396976176\/1375935556","profile_link_color":"052C45","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"99CC33","profile_text_color":"3E4415","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555779072,"id_str":"365611238555779072","text":"RT @ElgatoEsmio: Get some balls and RT that. 
I saw you laugh.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":245372604,"id_str":"245372604","name":"Topcat_007","screen_name":"Topcat_007","location":"","url":null,"description":"I am here to fight evil and exchange good-natured barbs.","protected":false,"followers_count":181,"friends_count":352,"listed_count":3,"created_at":"Mon Jan 31 15:01:33 +0000 2011","favourites_count":1854,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":3902,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FAFAFA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/624038101\/zg4llp7i0w6h7edw6oh0.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/624038101\/zg4llp7i0w6h7edw6oh0.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1989267972\/tcdrag02a_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1989267972\/tcdrag02a_normal.jpg","profile_link_color":"FF0000","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Fri Aug 02 03:37:52 +0000 2013","id":363141570993733632,"id_str":"363141570993733632","text":"Get some balls and RT that. 
I saw you laugh.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":741495810,"id_str":"741495810","name":"Drunk Dreamer","screen_name":"ElgatoEsmio","location":"Columbus, Ohio ","url":"http:\/\/favstar.fm\/users\/ElgatoEsmio","description":"I drink hard because I think hard.","protected":false,"followers_count":7783,"friends_count":6018,"listed_count":315,"created_at":"Mon Aug 06 21:38:48 +0000 2012","favourites_count":81505,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":19929,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/645523471\/egilq2lnfu90tdh8zn7q.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/645523471\/egilq2lnfu90tdh8zn7q.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000236336234\/3762ce2286a91922f729a65849f31e87_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000236336234\/3762ce2286a91922f729a65849f31e87_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/741495810\/1353639171","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":115,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_cou
nt":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ElgatoEsmio","name":"Drunk Dreamer","id":741495810,"id_str":"741495810","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576766976,"id_str":"365611238576766976","text":"#MuerteAYoutube","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":822642618,"id_str":"822642618","name":"\u0105l\u0111\u0173 \u221e","screen_name":"AldiVirhuez","location":"","url":null,"description":"..................................................................................","protected":false,"followers_count":90,"friends_count":229,"listed_count":0,"created_at":"Fri Sep 14 03:57:06 +0000 2012","favourites_count":34,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":700,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"F3E5B8","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/885892466\/0a3b1d2eafa8f28b37787a47b34027d9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/885892466\/0a3b1d2eafa8f28b37787a47b34027d9.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3705580045\/1faf19b711206a809e8825fe0047b778_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3705580045\/1faf19b711206a809e8825fe0047b778_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/822642618\/1375856153","profile_link_color":"CFB2D9","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"291C0D","profile_text_color":"53AB8D","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"
follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"MuerteAYoutube","indices":[0,15]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"vi"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238564175874,"id_str":"365611238564175874","text":"The conjuring > the purge","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1084071721,"id_str":"1084071721","name":"\u0394","screen_name":"Misstesfaye_","location":"West Yorkshire","url":null,"description":"I go by carrying yourself with courage, sincerity and self respect #\u2717o #Teamfollowback","protected":false,"followers_count":945,"friends_count":1214,"listed_count":0,"created_at":"Sat Jan 12 20:36:46 +0000 
2013","favourites_count":378,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4771,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000254507014\/21a0cdd2cab648f16f97008dc465c3ac_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000254507014\/21a0cdd2cab648f16f97008dc465c3ac_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1084071721\/1375872439","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238580957184,"id_str":"365611238580957184","text":"Minions ai ai ai ai aAJDPIWHFQW\u00c7UGVQIEUBHQ\u00c7 q vontade de morder","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":133757658,"id_str":"133757658","name":"Lorena","screen_name":"lorenacaserta","location":"SP\/BR","url":"http:\/\/s-u-a-princesa.tumblr.com\/","description":"Little bad girl or naughty little angel","protected":false,"followers_count":392,"friends_count":194,"listed_count":59,"created_at":"Fri Apr 16 15:05:58 +0000 
2010","favourites_count":216,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":31474,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"D1D1D1","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045391318\/701a3fe10cc2b3a719a5b14838ff4fef.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045391318\/701a3fe10cc2b3a719a5b14838ff4fef.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000238236888\/8cb9f18186ecfe29ccdd1450129fa8c6_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000238236888\/8cb9f18186ecfe29ccdd1450129fa8c6_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/133757658\/1375747225","profile_link_color":"8BC9E8","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238589349889,"id_str":"365611238589349889","text":"Mr bean btw pml","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":410155391,"id_str":"410155391","name":"Chris Lennon","screen_name":"TheRealLendog","location":"Onthank, Kilmarnock","url":null,"description":"Glasgow 
Celtic","protected":false,"followers_count":311,"friends_count":552,"listed_count":1,"created_at":"Fri Nov 11 18:25:24 +0000 2011","favourites_count":122,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":3140,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2763233356\/3ccccf702bb79819669a0e80731aaa18_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2763233356\/3ccccf702bb79819669a0e80731aaa18_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/410155391\/1351182327","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238589341697,"id_str":"365611238589341697","text":"Estar en una relaci\u00f3n no se trata de besar, de fechas o de presumir, Se trata de estar con la persona que te hace feliz y hacerla feliz","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":256261986,"id_str":"256261986","name":"\u01b8\u0334\u04c1\u0334\u01b7 NaNi 
\u01b8\u0334\u04c1\u0334\u01b7","screen_name":"dika0503","location":"Bogot\u00e1, Bogot\u00e1, Bogot\u00e1!","url":null,"description":"I was single and I was happy to be free and fly like a bird ..but now I find a love that really worth and he is really interesting feliz animalista pastusa","protected":false,"followers_count":554,"friends_count":2001,"listed_count":3,"created_at":"Wed Feb 23 00:20:58 +0000 2011","favourites_count":1280,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":14769,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000036059777\/2ada92282d5edddb9da401e8fee0e14a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000036059777\/2ada92282d5edddb9da401e8fee0e14a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000121658200\/fd04c94d9565d6f6e3f26791452c51bb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000121658200\/fd04c94d9565d6f6e3f26791452c51bb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/256261986\/1370386885","profile_link_color":"FF0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238568378368,"id_str":"365611238568378368","text":"una hora aproximadamente estara el videochat con 
@EloyOfficial entra a http:\/\/t.co\/CyftknDocV te lo recomienda http:\/\/t.co\/qEUgkSkU5e #RT","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":166717017,"id_str":"166717017","name":"wWw.cAtRaChO88.NeT","screen_name":"catracho88_net","location":"La Ceiba, Honduras","url":"http:\/\/catracho88.NeT","description":"Parte del movimiento @Army504 Promotores oficiales de @RKEnterpriseHN | La Web De Mayor Trayectoria","protected":false,"followers_count":994,"friends_count":309,"listed_count":3,"created_at":"Wed Jul 14 20:56:49 +0000 2010","favourites_count":1,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":9352,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/783209696\/5915551879c425f4b9a16de0fb0dc299.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/783209696\/5915551879c425f4b9a16de0fb0dc299.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3213701698\/24cd54ca967f3f359342cbfbceecc426_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3213701698\/24cd54ca967f3f359342cbfbceecc426_normal.jpeg","profile_link_color":"470FFF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"1881F2","profile_text_color":"0008A8","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"RT","indices":[135,138]}],"urls":
[{"url":"http:\/\/t.co\/CyftknDocV","expanded_url":"http:\/\/iloveeloy.com","display_url":"iloveeloy.com","indices":[71,93]},{"url":"http:\/\/t.co\/qEUgkSkU5e","expanded_url":"http:\/\/www.ecuamusic.fm","display_url":"ecuamusic.fm","indices":[112,134]}],"user_mentions":[{"screen_name":"EloyOfficial","name":"Eloy","id":16845581,"id_str":"16845581","indices":[49,62]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238593540096,"id_str":"365611238593540096","text":"Ih ketawa.. Kamu lagi apa cil? @putriyramadhini: @Muhammad_Raiza Hhahaa\"","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":455580659,"id_str":"455580659","name":"Muhammad_Raiza","screen_name":"Muhammad_Raiza","location":"Indonesian","url":null,"description":"\u2022:Ria Oktasinda\u2022 #SMPN 1 CIKARANG BARAT(8.10)\/addmyfacebook:Muhammad Raiza:Follow and Mention_Ok#","protected":false,"followers_count":188,"friends_count":118,"listed_count":0,"created_at":"Thu Jan 05 08:17:18 +0000 
2012","favourites_count":35,"utc_offset":25200,"time_zone":"Jakarta","geo_enabled":true,"verified":false,"statuses_count":546,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"0040A1","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/730754573\/8f911d7e588f01d5887967996e11695b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/730754573\/8f911d7e588f01d5887967996e11695b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2844170017\/bba0af5bed2b25997344d00cad61d42c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2844170017\/bba0af5bed2b25997344d00cad61d42c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/455580659\/1375653306","profile_link_color":"FA0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"putriyramadhini","name":"Putri Y.Ramadhini \u262e\t","id":863966802,"id_str":"863966802","indices":[31,47]},{"screen_name":"Muhammad_Raiza","name":"Muhammad_Raiza","id":455580659,"id_str":"455580659","indices":[49,64]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238559985664,"id_str":"365611238559985664","text":"@_prelepotica Ma samo sam na pauzi sa vinom, bicemo in love mi opet. 
Ali naterala si me da je zavolim hehe","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365608968195813376,"in_reply_to_status_id_str":"365608968195813376","in_reply_to_user_id":520822028,"in_reply_to_user_id_str":"520822028","in_reply_to_screen_name":"_prelepotica","user":{"id":58617693,"id_str":"58617693","name":"Nada Stojanovi\u0107","screen_name":"_devojcurak","location":"Belgrade, Serbia","url":null,"description":null,"protected":false,"followers_count":189,"friends_count":138,"listed_count":0,"created_at":"Mon Jul 20 22:49:35 +0000 2009","favourites_count":2370,"utc_offset":7200,"time_zone":"Belgrade","geo_enabled":true,"verified":false,"statuses_count":2454,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"2234D4","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000009633498\/600cf4f2991bb7cc64f28ccfc0f9eaa0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000009633498\/600cf4f2991bb7cc64f28ccfc0f9eaa0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252526837\/a0a4d9dbbb598c3e2fc260d6054a209d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252526837\/a0a4d9dbbb598c3e2fc260d6054a209d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/58617693\/1375833914","profile_link_color":"48B2BA","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_
prelepotica","name":"Nevena Trifkovi\u0107","id":520822028,"id_str":"520822028","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"sl"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238580953088,"id_str":"365611238580953088","text":"Angry lino... http:\/\/t.co\/fq5Z3gOAls","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":580378728,"id_str":"580378728","name":"Ant Canavan","screen_name":"AntCanavan","location":"England","url":null,"description":"I live the life you dream of. Kinda.","protected":false,"followers_count":40,"friends_count":161,"listed_count":0,"created_at":"Mon May 14 22:29:08 +0000 2012","favourites_count":8,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":809,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/606129022\/lo216zk5mxoynm6nbp7b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/606129022\/lo216zk5mxoynm6nbp7b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2219027953\/prof_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2219027953\/prof_normal.jpg","profile_link_color":"0099CC","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"h
ttp:\/\/t.co\/fq5Z3gOAls","expanded_url":"http:\/\/fb.me\/2524DCvCz","display_url":"fb.me\/2524DCvCz","indices":[14,36]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"filter_level":"medium","lang":"tl"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238593531904,"id_str":"365611238593531904","text":"RT @HitFollowsJp: \u2665RETWEET\u2665ONLY\u2665IF\u2665YOU\u2665WANT\u2665NEW\u2665FOLLOWERS\u2665 \u2708 #TFBJP \u2708 #TeamFollowBack \u2708 #SougoFollow \u2708 #FollowBack \u2708 #HITFOLLOWSTEAM \u2708 07.48","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web (M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":415521090,"id_str":"415521090","name":"John Davison ","screen_name":"JohnGD1994","location":"Bishop Middleham","url":"http:\/\/ask.fm\/JohnGD1994","description":"John |19 | KIK: JohnDavi2013 | SKYPE: JohnDavi.2013 | Follow For A Follow Back!","protected":false,"followers_count":5221,"friends_count":5651,"listed_count":6,"created_at":"Fri Nov 18 13:03:47 +0000 
2011","favourites_count":96,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":3348,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F0F0E4","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000025305795\/b809350403eec62fe2bfce1aa280d2dd.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000025305795\/b809350403eec62fe2bfce1aa280d2dd.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000138203799\/4adcfafafa6e126b0a6a372fa72b9a69_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000138203799\/4adcfafafa6e126b0a6a372fa72b9a69_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/415521090\/1373902561","profile_link_color":"F01111","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:48:50 +0000 2013","id":365605551364771843,"id_str":"365605551364771843","text":"\u2665RETWEET\u2665ONLY\u2665IF\u2665YOU\u2665WANT\u2665NEW\u2665FOLLOWERS\u2665 \u2708 #TFBJP \u2708 #TeamFollowBack \u2708 #SougoFollow \u2708 #FollowBack \u2708 #HITFOLLOWSTEAM \u2708 07.48","source":"\u003ca href=\"https:\/\/twitter.com\/HitFollowsJp\" rel=\"nofollow\"\u003ehitfollowsjpbot\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":755368129,"id_str":"755368129","name":"HITFOLLOWSTEAM","screen_name":"HitFollowsJp","location":"Osaka - Japan 
\u00b7","url":"http:\/\/teamfairyrosejpmex.blogspot.com\/","description":"#TFBJP #TMW #R_Family #TEAMFOLLOWBACK #TMW #90sBabyFollowTrain \n #HitFollowsTeam \u25ba@teamfairyrose @OfficialTFBJP\u25c4 \u25ba#TEAMFAIRYROSE \u25c4","protected":false,"followers_count":254427,"friends_count":234014,"listed_count":904,"created_at":"Mon Aug 13 15:46:25 +0000 2012","favourites_count":2301,"utc_offset":32400,"time_zone":"Osaka","geo_enabled":false,"verified":false,"statuses_count":163833,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045503342\/5cddfae96ebb432f1b8cbbc86161ebe0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045503342\/5cddfae96ebb432f1b8cbbc86161ebe0.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3038712758\/376d6e7caba1bef6a9a67da8a970da35_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3038712758\/376d6e7caba1bef6a9a67da8a970da35_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/755368129\/1375759551","profile_link_color":"F70A1D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":272,"entities":{"hashtags":[{"text":"TFBJP","indices":[43,49]},{"text":"TeamFollowBack","indices":[52,67]},{"text":"SougoFollow","indices":[70,82]},{"text":"FollowBack","indices":[85,96]},{"text":"HITFOLLOWSTEAM","indices":[99,114]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"TFBJP","indices":[61,67]},{"t
ext":"TeamFollowBack","indices":[70,85]},{"text":"SougoFollow","indices":[88,100]},{"text":"FollowBack","indices":[103,114]},{"text":"HITFOLLOWSTEAM","indices":[117,132]}],"urls":[],"user_mentions":[{"screen_name":"HitFollowsJp","name":"HITFOLLOWSTEAM","id":755368129,"id_str":"755368129","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585151489,"id_str":"365611238585151489","text":"My arm hurt","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":59721899,"id_str":"59721899","name":"Ms.Short Stuff :)","screen_name":"_BrownBeautty","location":"North Carolina","url":null,"description":"Blessed.Independent.19 years Young.Living life. In love with the one and only Chris Brown","protected":false,"followers_count":309,"friends_count":327,"listed_count":0,"created_at":"Fri Jul 24 07:45:43 +0000 
2009","favourites_count":61,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":16550,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000080695770\/cfd40f88b422f38287e88cd1979ada22_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000080695770\/cfd40f88b422f38287e88cd1979ada22_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/59721899\/1374765296","profile_link_color":"FF0000","profile_sidebar_border_color":"65B0DA","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238559989761,"id_str":"365611238559989761","text":"RT @MarqusAllen: The Return of THE JETT SHOW \u201cShowTime\u201d http:\/\/t.co\/waGP5u4Lup via @cosignmag","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":166763725,"id_str":"166763725","name":"IG: @CosignMag","screen_name":"CosignMag","location":"Online & in stores! 
#Dallas","url":"http:\/\/CosignMag.com","description":"Founded by @COSIGN_KG! COSIGN MAGAZINE is a bi-monthly innovative publication! ART | CULTURE | MUSIC | SPORTS | EVENTS | FASHION #TheCosignLife","protected":false,"followers_count":1954,"friends_count":1227,"listed_count":36,"created_at":"Wed Jul 14 23:21:36 +0000 2010","favourites_count":203,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":20316,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/142243041\/logo.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/142243041\/logo.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000209396652\/e3cb4b226a1f412bf5a4eda3746428c8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000209396652\/e3cb4b226a1f412bf5a4eda3746428c8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/166763725\/1353466111","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:03:17 +0000 2013","id":365609185418817537,"id_str":"365609185418817537","text":"The Return of THE JETT SHOW \u201cShowTime\u201d http:\/\/t.co\/waGP5u4Lup via @cosignmag","source":"\u003ca href=\"http:\/\/twitter.com\/tweetbutton\" rel=\"nofollow\"\u003eTweet 
Button\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":313669662,"id_str":"313669662","name":"Marqus Allen","screen_name":"MarqusAllen","location":"Dallas,TX","url":null,"description":"Ready for the world","protected":false,"followers_count":125,"friends_count":174,"listed_count":0,"created_at":"Thu Jun 09 01:16:10 +0000 2011","favourites_count":7,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":1741,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/839277514\/8af01109c9e2749e67e3095d06a97d43.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/839277514\/8af01109c9e2749e67e3095d06a97d43.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000084453596\/61bc0ae4473c8054f273232a0363d13d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000084453596\/61bc0ae4473c8054f273232a0363d13d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/313669662\/1366572361","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/waGP5u4Lup","expanded_url":"http:\/\/cosignmag.com\/showtime\/","display_url":"cosignmag.com\/showtime\/","indices":[39,61]}],"user_mentions":[{"screen_name":"CosignMag","name":"IG: 
@CosignMag","id":166763725,"id_str":"166763725","indices":[66,76]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MarqusAllen","name":"Marqus Allen","id":313669662,"id_str":"313669662","indices":[3,15]},{"screen_name":"CosignMag","name":"IG: @CosignMag","id":166763725,"id_str":"166763725","indices":[83,93]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585143296,"id_str":"365611238585143296","text":"\u0627\u0644\u0628\u0627\u0631\u062d\u0647 \u0639\u0646\u062f\u064a \u063a\u0640\u062f\u0627 \u0627\u0644\u0644\u064a\u0640\u0644 \u0644\u064a\u0644\u064a\u0640\u0646\n\u0646\u0627\u064a\u0645 \u0639\u0644\u0640\u0649 \u0641\u0631\u0627\u0634\u0640\u064a \u0648\u0644\u0627\u0646\u0640\u064a \u0628\u0646\u0627\u064a\u0640\u0645\n\u0635\u0627\u062d\u064a \u0648\u0627\u0647\u0648\u062c\u0633 \u0641\u064a \u0645\u062d\u0628\u0647 \u0644\u0647\u0627 \u0633\u0646\u064a\u0646\n\u0645\u0627\u0641\u0627\u0631\u0642\u0640\u062a \u0642\u0644\u0640\u0628(\u0646) \u0645\u0648\u0644\u0640\u0639 \u0648\u0647\u0627\u064a\u0640\u0645","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":849719804,"id_str":"849719804","name":"\u0639\u0634\u0642\u064a \u0645\u0627\u062c\u062f\u064a ","screen_name":"958Soul","location":"","url":null,"description":"\u0644\u064a\u0633 \u062c\u0645\u0650\u064a\u0639 \u0645\u0622 \u0623\u0646\u062b\u0631\u0647 ,,\u0647\u064f\u0646\u0627,, \u0645\u0646 \u06aa\u062a\u0627\u0628\u0622\u062a\u064a ! 
\u0648\u0644\u06aa\u064e\u0646 \u061b \u0623\u0646\u062b\u0631\u064f \u0645\u0627 \u062a\u0631\u0648\u0642 \u0644\u0647\u064f \u0623\u0630\u0646\u064a .. \u0648 \u0645\u0627 \u062a\u0645\u064a\u0644 \u0644\u0647\u064f \u0639\u064a\u0646\u0627\u064a\u0652 .. \u0648 \u0645\u0627 \u064a\u062a\u062d\u0633\u0633\u0647\u064f \u0642\u0644\u0628\u064a \u0648 \u064a\u062b\u064a\u0631 \u0645\u0634\u0627\u0639\u0650\u0631\u064a .... \u2665 \u2665","protected":false,"followers_count":951,"friends_count":2002,"listed_count":0,"created_at":"Thu Sep 27 18:21:09 +0000 2012","favourites_count":452,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":true,"verified":false,"statuses_count":4289,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3495497001\/d0270bcbd6bfb77a434709c4f8b910ff_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3495497001\/d0270bcbd6bfb77a434709c4f8b910ff_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/849719804\/1365452525","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238564167680,"id_str":"365611238564167680","text":"RT @Judipaola: Con instagram somos todos 
fot\u00f3grafos","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":161541421,"id_str":"161541421","name":"aneet","screen_name":"AnitaPulfer","location":"","url":null,"description":null,"protected":false,"followers_count":165,"friends_count":214,"listed_count":1,"created_at":"Thu Jul 01 03:25:06 +0000 2010","favourites_count":18,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":4809,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/409401833\/127520751.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/409401833\/127520751.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3756325953\/1c665a505c9b879dfdc64adc193213a3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3756325953\/1c665a505c9b879dfdc64adc193213a3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/161541421\/1370384938","profile_link_color":"B40B43","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:17:28 +0000 2013","id":365597656640270336,"id_str":"365597656640270336","text":"Con instagram somos todos fot\u00f3grafos","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter 
for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":152763340,"id_str":"152763340","name":"Ju Di Paola","screen_name":"Judipaola","location":"","url":null,"description":"A la vida hay que hacerle el amor- Virus","protected":false,"followers_count":145,"friends_count":291,"listed_count":0,"created_at":"Sun Jun 06 20:30:12 +0000 2010","favourites_count":141,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":2317,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/345254280\/SAM_0053.JPG","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/345254280\/SAM_0053.JPG","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3234826536\/4a310957bd99aaebf230f1277674d466_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3234826536\/4a310957bd99aaebf230f1277674d466_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Judipaola","name":"Ju Di Paola","id":152763340,"id_str":"152763340","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 
2013","id":365611238589349890,"id_str":"365611238589349890","text":"RT @Telebiology: \u062e\u0644\u0627\u0635 \u064a\u0627 \u0634\u0628\u0627\u0628.. \u0631\u0628\u0639 \u0633\u0627\u0639\u0647 \u0648 \u0647\u0627\u064a\u062e\u0631\u062c\u0648\u0627.. \u0627\u0646\u0627 \u0646\u0627\u0632\u0644\u0647\u0645 \u062a\u0627\u0646\u064a \u0627\u0647\u0648","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":261854141,"id_str":"261854141","name":"EsRaa","screen_name":"soskaZzZz","location":"egypt","url":"http:\/\/favstar.fm\/users\/soskaZzZz","description":"#\u0647\u0646\u062f\u0633\u0629_\u0627\u0644\u0645\u0646\u0635\u0648\u0631\u0629 \u0647\u0628\u0642\u0649 \u0633\u0645\u0643\u0631\u064a\u0629 \u0643\u0645\u0628\u064a\u0648\u062a\u0631\u0627\u062a..\u062a\u0648\u064a\u062a\u0627\u062a\u064a +10 \u0627\u0644\u0627 \u062e\u0645\u0633\u0629.\u062d\u064a\u0627\u062a\u064a \u062a\u062a\u0644\u062e\u0635 \u0641 \u062c\u0645\u0644\u0629 \u0648\u0627\u0646\u062a \u0645\u0627\u0644 \u0645\u0627\u0645\u062a\u0643 #MiniCooper \u2665 #kitkat #Nescafe \u0645\u0639\u0644\u0642\u062a\u064a\u0646 \u0633\u0643\u0631","protected":false,"followers_count":840,"friends_count":177,"listed_count":17,"created_at":"Sun Mar 06 21:02:45 +0000 
2011","favourites_count":1356,"utc_offset":7200,"time_zone":"Cairo","geo_enabled":false,"verified":false,"statuses_count":28719,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035080405\/5964560570da4067e8cc01cb7b5a0d22.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035080405\/5964560570da4067e8cc01cb7b5a0d22.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000198493292\/6a91b9ef74ac86ad8ab7abb9dec25cf0_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000198493292\/6a91b9ef74ac86ad8ab7abb9dec25cf0_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/261854141\/1374890130","profile_link_color":"6BCAD4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"1F8994","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:04:51 +0000 2013","id":365609579960221697,"id_str":"365609579960221697","text":"\u062e\u0644\u0627\u0635 \u064a\u0627 \u0634\u0628\u0627\u0628.. \u0631\u0628\u0639 \u0633\u0627\u0639\u0647 \u0648 \u0647\u0627\u064a\u062e\u0631\u062c\u0648\u0627.. 
\u0627\u0646\u0627 \u0646\u0627\u0632\u0644\u0647\u0645 \u062a\u0627\u0646\u064a \u0627\u0647\u0648","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":262431817,"id_str":"262431817","name":"Teleb \u30c4","screen_name":"Telebiology","location":"Mansoura,Egypt","url":null,"description":null,"protected":false,"followers_count":821,"friends_count":294,"listed_count":14,"created_at":"Tue Mar 08 01:25:56 +0000 2011","favourites_count":5682,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":21156,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EEDEC7","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000004565547\/b1081a8bec0c9bc93f134eb3dff8a519.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000004565547\/b1081a8bec0c9bc93f134eb3dff8a519.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000066207352\/51307ea157630382d279b1f79119569e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000066207352\/51307ea157630382d279b1f79119569e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/262431817\/1374015740","profile_link_color":"D65B42","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C3A173","profile_text_color":"35140F","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":7,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited
":false,"retweeted":false,"lang":"ar"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Telebiology","name":"Teleb \u30c4","id":262431817,"id_str":"262431817","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238572572672,"id_str":"365611238572572672","text":"RT @rennofrz: Yesus masihkah enggkau ingin mencobai keluargaku dengan seperti ini, apakah engkau tidaksuka melihat aku tersenyum seperti du\u2026","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1524321830,"id_str":"1524321830","name":"\u30c4 ","screen_name":"SayaGaluh_","location":"","url":null,"description":"Asudahlah!","protected":false,"followers_count":65,"friends_count":112,"listed_count":0,"created_at":"Mon Jun 17 08:47:09 +0000 
2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":701,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000000793620\/3dba9db4456d3c6f6ba4b7a65fa893ea.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000000793620\/3dba9db4456d3c6f6ba4b7a65fa893ea.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000221069665\/2412505df9483367775ad93a200af851_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000221069665\/2412505df9483367775ad93a200af851_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1524321830\/1371459187","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 16:33:02 +0000 2013","id":365510978890244096,"id_str":"365510978890244096","text":"Yesus masihkah enggkau ingin mencobai keluargaku dengan seperti ini, apakah engkau tidaksuka melihat aku tersenyum seperti dulu :'(","source":"\u003ca href=\"http:\/\/ubersocial.com\" rel=\"nofollow\"\u003eUberSocial for BlackBerry\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":103127394,"id_str":"103127394","name":"Venna Passio Yunanda","screen_name":"rennofrz","location":"Your Mind","url":"http:\/\/alexanderzrenno.tumblr.com","description":"Leave the drama and bullshit 
in your life, don't be stress over it, be happy and move on \u2022 Industrial Engineering \u2022 Atma Jaya University","protected":false,"followers_count":608,"friends_count":244,"listed_count":0,"created_at":"Sat Jan 09 00:19:10 +0000 2010","favourites_count":12,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":26002,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040386046\/422b3ea044e669d55d22d9d810f397b0.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040386046\/422b3ea044e669d55d22d9d810f397b0.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252534994\/cd02466ca18f9519980a2741582c4c20_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252534994\/cd02466ca18f9519980a2741582c4c20_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/103127394\/1375269694","profile_link_color":"3FB7CC","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"id"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"rennofrz","name":"Venna Passio Yunanda","id":103127394,"id_str":"103127394","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576758785,"id_str":"365611238576758785","text":"RT 
@Follow_Trains: \u2605\uff32\uff25\uff34\uff37\uff25\uff25\uff34\u2605 ONLYIFYOUARE 100% #TEAMFOLLOWBACK(mustfollow @Follow_Trains & everyone who RTs this to gain followers. ( Follo\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1638708469,"id_str":"1638708469","name":"\u00a4\u00b0\u00b0\u00b0Sugar Plum\u00b0\u00b0\u00b0\u00a4","screen_name":"CallMe_Peggy","location":"Right by your side:)","url":null,"description":"*No Drama Queen!@Sheviyung,@Damola_TDH,@am_andre_@MarvyMall,@NickiMinaj own my HRT!!l\u00e9 boo boo@verifiedDopeBoy love him 4Eva:)he's flawless:*","protected":false,"followers_count":258,"friends_count":239,"listed_count":2,"created_at":"Thu Aug 01 19:30:01 +0000 2013","favourites_count":21,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":486,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262070631\/86b6b83dc2cfafb6a487df3ca622af2f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262070631\/86b6b83dc2cfafb6a487df3ca622af2f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1638708469\/1375541749","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":n
ull,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:05:40 +0000 2013","id":365609788249358337,"id_str":"365609788249358337","text":"\u2605\uff32\uff25\uff34\uff37\uff25\uff25\uff34\u2605 ONLYIFYOUARE 100% #TEAMFOLLOWBACK(mustfollow @Follow_Trains & everyone who RTs this to gain followers. ( Follow @christleyuy )036","source":"\u003ca href=\"http:\/\/gremln.com\" rel=\"nofollow\"\u003eGremln\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":867932791,"id_str":"867932791","name":"I Am Following Back","screen_name":"Follow_Trains","location":"Instant SHOUTOUT? Visit ","url":"http:\/\/TweetPeddler.com\/follow_trains","description":"My job is to follow you back. Retweet my tweets to gain followers. \n#Teamfollowback\nBusiness: followtrainsbusiness@yahoo.com","protected":false,"followers_count":119666,"friends_count":94684,"listed_count":404,"created_at":"Mon Oct 08 12:42:57 +0000 
2012","favourites_count":2587,"utc_offset":28800,"time_zone":"Beijing","geo_enabled":false,"verified":false,"statuses_count":75204,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2694351050\/05c45a880b54fa71f372aacb175a5b6f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2694351050\/05c45a880b54fa71f372aacb175a5b6f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/867932791\/1349701123","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":83,"entities":{"hashtags":[{"text":"TEAMFOLLOWBACK","indices":[28,43]}],"urls":[],"user_mentions":[{"screen_name":"Follow_Trains","name":"I Am Following Back","id":867932791,"id_str":"867932791","indices":[55,69]},{"screen_name":"christleyuy","name":"CHINITO","id":555965922,"id_str":"555965922","indices":[126,138]}]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"TEAMFOLLOWBACK","indices":[47,62]}],"urls":[],"user_mentions":[{"screen_name":"Follow_Trains","name":"I Am Following Back","id":867932791,"id_str":"867932791","indices":[3,17]},{"screen_name":"Follow_Trains","name":"I Am Following Back","id":867932791,"id_str":"867932791","indices":[74,88]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 
2013","id":365611238559985666,"id_str":"365611238559985666","text":"Vraiment Turfuuu lui et sa femme","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":578435037,"id_str":"578435037","name":"Tachelhit Fi B\u00e9riz","screen_name":"SoMarocaine","location":"Casablanca&B\u00e9riz","url":null,"description":"#TeamMaroc #TeamChleuh #TeamCasablanca","protected":false,"followers_count":96,"friends_count":95,"listed_count":0,"created_at":"Sat May 12 20:01:19 +0000 2012","favourites_count":635,"utc_offset":7200,"time_zone":"Paris","geo_enabled":true,"verified":false,"statuses_count":14150,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/644600812\/8j08t27f9e6pa3om03bh.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/644600812\/8j08t27f9e6pa3om03bh.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3786802688\/b22a7890549314f496eb90552858429d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3786802688\/b22a7890549314f496eb90552858429d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/578435037\/1374357784","profile_link_color":"CC3366","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:26 +0000 
2013","id":365611238568361984,"id_str":"365611238568361984","text":"@asapzvrry you're the only true friend I have","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610997383315456,"in_reply_to_status_id_str":"365610997383315456","in_reply_to_user_id":979459567,"in_reply_to_user_id_str":"979459567","in_reply_to_screen_name":"asapzvrry","user":{"id":270562550,"id_str":"270562550","name":"lil bre ","screen_name":"psycharry","location":"ig: idekbre","url":"http:\/\/pvler.tumblr.com","description":"how to be indie punk and still like one direction","protected":false,"followers_count":480,"friends_count":94,"listed_count":5,"created_at":"Tue Mar 22 21:11:13 +0000 2011","favourites_count":1824,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":24307,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0F0E0F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000019248335\/c1d13cd74b3576894baec6009c73009c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000019248335\/c1d13cd74b3576894baec6009c73009c.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000249371079\/17236ecf90d02cbc208b126d65780ef2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000249371079\/17236ecf90d02cbc208b126d65780ef2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/270562550\/1375780157","profile_link_color":"0F0D0D","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FF0A0A","profile_text_color":"FFFFFF","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"g
eo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"asapzvrry","name":"C.","id":979459567,"id_str":"979459567","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576754688,"id_str":"365611238576754688","text":"#\u0441\u0442\u0440\u043e\u0438\u0442\u0435\u043b\u044c\u0441\u0442\u0432\u043e \u0411\u044b\u0432\u0448\u0438\u0435 \u0437\u0435\u043c\u043b\u0438 \u041c\u0438\u043d\u043e\u0431\u043e\u0440\u043e\u043d\u044b \u043e\u0442\u0434\u0430\u0434\u0443\u0442 \u043f\u043e\u0434 \u0436\u0438\u043b\u0443\u044e \u0437\u0430\u0441\u0442\u0440\u043e\u0439\u043a\u0443: \u041c\u0438\u043d\u043e\u0431\u043e\u0440\u043e\u043d\u044b \u043f\u0435\u0440\u0435\u0434\u0430\u0441\u0442 \u041b\u0435\u043d\u043e\u0431\u043b... http:\/\/t.co\/eEbx7Or6Kx #\u043d\u043e\u0432\u043e\u0441\u0442\u0438","source":"\u003ca href=\"http:\/\/twitterfeed.com\" rel=\"nofollow\"\u003etwitterfeed\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1370422418,"id_str":"1370422418","name":"\u041a\u0441\u044e\u0448\u0430 \u0421\u0442\u0443\u0434\u0435\u043d\u043d\u0438\u043a\u043e\u0432\u0430","screen_name":"elizavetashivo1","location":"\u0420\u043e\u0441\u0442\u043e\u0432-\u043d\u0430-\u0414\u043e\u043d\u0443","url":null,"description":"\u0412\u0441\u0435 \u0443 \u043c\u0435\u043d\u044f \u043d\u0435 \u043a\u0430\u043a \u0443 \u043b\u044e\u0434\u0435\u0439, \u0430 \u0445\u043e\u0442\u0435\u043b\u043e\u0441\u044c \u0431\u044b...","protected":false,"followers_count":341,"friends_count":437,"listed_count":0,"created_at":"Sun Apr 21 19:44:26 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1117,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3555990762\/b5e7b927a3c5641033f6fdfd6420cfc5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3555990762\/b5e7b927a3c5641033f6fdfd6420cfc5_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"\u0441\u0442\u0440\u043e\u0438\u0442\u0435\u043b\u044c\u0441\u0442\u0432\u043e","indices":[0,14]},{"text":"\u043d\u043e\u0432\u043e\u0441\u0442\u0438","indices":[121,129]}],"urls":[{"url":"http:\/\/t.co\/eEbx7Or6Kx","expanded_url":"http:\/\/bit.ly\/14pSLUV","display_url":"bit.ly\/14pSLUV","indices":[98,120]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ru"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576766977,"id_str":"365611238576766977","text":"@HabiroLucas que convencido voce! 
hahaha","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611092380106753,"in_reply_to_status_id_str":"365611092380106753","in_reply_to_user_id":1599891721,"in_reply_to_user_id_str":"1599891721","in_reply_to_screen_name":"HabiroLucas","user":{"id":371489628,"id_str":"371489628","name":"giba","screen_name":"__girodrigues","location":"","url":null,"description":"amo a @caroolmoota_ \u2764","protected":false,"followers_count":305,"friends_count":162,"listed_count":0,"created_at":"Sat Sep 10 23:37:03 +0000 2011","favourites_count":371,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":13213,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040758658\/e3c5494dd368b9291941775995a8aa47.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040758658\/e3c5494dd368b9291941775995a8aa47.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248074883\/f0aed73d3b010b8f38afc804953578d6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248074883\/f0aed73d3b010b8f38afc804953578d6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/371489628\/1375929908","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"HabiroLucas","name":"H A B I R 
\u00c3 O \u270c","id":1599891721,"id_str":"1599891721","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585147393,"id_str":"365611238585147393","text":"Ariana Grande - The Way ft. Mac Miller: http:\/\/t.co\/wFjfqvq8P9 via @youtube #TheWay","source":"\u003ca href=\"http:\/\/twitter.com\/tweetbutton\" rel=\"nofollow\"\u003eTweet Button\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1191794822,"id_str":"1191794822","name":"M e g a n\u30c4","screen_name":"hi_im_megann13","location":"Made in the U.S.A.\u272e","url":null,"description":"\u0b90Curiosity often leads to trouble.\u0b90","protected":false,"followers_count":125,"friends_count":432,"listed_count":0,"created_at":"Mon Feb 18 02:26:02 +0000 2013","favourites_count":1058,"utc_offset":-10800,"time_zone":"Atlantic Time 
(Canada)","geo_enabled":false,"verified":false,"statuses_count":299,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028515258\/48912e97e8138a20131bc2ef97c57ea5.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028515258\/48912e97e8138a20131bc2ef97c57ea5.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000155611061\/7bf883ade382176f5424f01bdafbd56b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000155611061\/7bf883ade382176f5424f01bdafbd56b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1191794822\/1374940771","profile_link_color":"EB1181","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FC147D","profile_text_color":"EB11BF","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"TheWay","indices":[76,83]}],"urls":[{"url":"http:\/\/t.co\/wFjfqvq8P9","expanded_url":"http:\/\/youtu.be\/_sV0S8qWSy0","display_url":"youtu.be\/_sV0S8qWSy0","indices":[40,62]}],"user_mentions":[{"screen_name":"YouTube","name":"YouTube","id":10228272,"id_str":"10228272","indices":[67,75]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238589349888,"id_str":"365611238589349888","text":"@iamsrk I LOVE UU SOO MUCH DEAR <3<3<3","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":101311381,"in_reply_to_user_id_str":"101311381","in_reply_to_screen_name":"iamsrk","user":{"id":1492420298,"id_str":"1492420298","name":"K i d r a u h l ","screen_name":"juustkidding69","location":"#Believe ","url":null,"description":"I'm a crazy girl with big dreams! :)","protected":false,"followers_count":5,"friends_count":1,"listed_count":0,"created_at":"Sat Jun 08 09:17:00 +0000 2013","favourites_count":18,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":211,"lang":"de","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257898614\/48d100e6c4ba1a61d738ea81cd29f2a6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257898614\/48d100e6c4ba1a61d738ea81cd29f2a6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1492420298\/1375927542","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"iamsrk","name":"SHAH RUKH KHAN","id":101311381,"id_str":"101311381","indices":[0,7]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 
2013","id":365611238572572673,"id_str":"365611238572572673","text":"RT @evi_maia: @Vivuuka aiii sim :3 amo muito","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":83117513,"id_str":"83117513","name":"Vivuka ;3","screen_name":"Vivuuka","location":"Joinville (:","url":"http:\/\/acaminhodossonhos.tumblr.com\/","description":"Sou dessas pessoas sem gra\u00e7a. Que se irrita f\u00e1cil com barulhos, se sente t\u00edmida na multid\u00e3o, se deixa levar pelo cora\u00e7\u00e3o. *-*","protected":false,"followers_count":299,"friends_count":136,"listed_count":2,"created_at":"Sat Oct 17 12:03:01 +0000 2009","favourites_count":296,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":7790,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"0A0909","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047308037\/92af5cdd8809fa3acb087271b217c925.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047308037\/92af5cdd8809fa3acb087271b217c925.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000084320615\/e95709a59555829692ef1ec07d8ea667_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000084320615\/e95709a59555829692ef1ec07d8ea667_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/83117513\/1372895942","profile_link_color":"1C1A1B","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"f
ollow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 21:22:50 +0000 2013","id":365583908043104256,"id_str":"365583908043104256","text":"@Vivuuka aiii sim :3 amo muito","source":"web","truncated":false,"in_reply_to_status_id":365582726096625664,"in_reply_to_status_id_str":"365582726096625664","in_reply_to_user_id":83117513,"in_reply_to_user_id_str":"83117513","in_reply_to_screen_name":"Vivuuka","user":{"id":609956899,"id_str":"609956899","name":"Evi","screen_name":"evi_maia","location":"Joinville - Sc","url":null,"description":"And look how far we've come","protected":false,"followers_count":209,"friends_count":544,"listed_count":0,"created_at":"Sat Jun 16 12:32:54 +0000 2012","favourites_count":22,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":1823,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/784548684\/57192a073e210732edaba7fb307055e1.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/784548684\/57192a073e210732edaba7fb307055e1.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000143221188\/e48121b5ee2ca59ccd6241237d1d9d69_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000143221188\/e48121b5ee2ca59ccd6241237d1d9d69_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/609956899\/1372697258","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"con
tributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Vivuuka","name":"Vivuka ;3","id":83117513,"id_str":"83117513","indices":[0,8]}]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"evi_maia","name":"Evi","id":609956899,"id_str":"609956899","indices":[3,12]},{"screen_name":"Vivuuka","name":"Vivuka ;3","id":83117513,"id_str":"83117513","indices":[14,22]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555791362,"id_str":"365611238555791362","text":"\u201c@mewvymyme: \u0e21\u0e35\u0e04\u0e19\u0e17\u0e35\u0e48\u0e40\u0e18\u0e2d\u0e23\u0e31\u0e01\u0e2d\u0e22\u0e39\u0e48\u0e01\u0e48\u0e2d\u0e19\u201d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365605704448491522,"in_reply_to_status_id_str":"365605704448491522","in_reply_to_user_id":387697096,"in_reply_to_user_id_str":"387697096","in_reply_to_screen_name":"mewvymyme","user":{"id":384091039,"id_str":"384091039","name":"NAMWARN\u2022","screen_name":"namwarnchuda","location":"","url":null,"description":null,"protected":false,"followers_count":270,"friends_count":145,"listed_count":1,"created_at":"Mon Oct 03 02:01:45 +0000 
2011","favourites_count":3590,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":true,"verified":false,"statuses_count":51941,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/654987890\/lsvjm2yzsbze4ugq92qv.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/654987890\/lsvjm2yzsbze4ugq92qv.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000230302235\/2b0ebeab2f2c4e852c278583f7c5776e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000230302235\/2b0ebeab2f2c4e852c278583f7c5776e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/384091039\/1374302715","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7B71A6","profile_text_color":"62E0D6","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"mewvymyme","name":"mewvy","id":387697096,"id_str":"387697096","indices":[1,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"th"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585155584,"id_str":"365611238585155584","text":"@TomPeyton15 @ChrissyDrobish spin not spit you stupid shots","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610417365581825,"in_reply_to_status_id_str":"365610417365581825","in_reply_to_user_id":1651838040,"in_reply_to_user_id_str":"1651838040","in_reply_to_screen_name":"TomPeyton15","user":{"id":633070719,"id_str":"633070719","name":"mike wazowski","screen_name":"niiadawson","location":"","url":null,"description":"diva by day dragon slayer by night","protected":false,"followers_count":205,"friends_count":296,"listed_count":0,"created_at":"Wed Jul 11 15:38:15 +0000 2012","favourites_count":1545,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":2342,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000240935709\/e64d3679f5dee6ee5da043ad6a7aa866_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000240935709\/e64d3679f5dee6ee5da043ad6a7aa866_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/633070719\/1375586261","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TomPeyton15","name":"Tom Peyton","id":1651838040,"id_str":"1651838040","indices":[0,12]},{"screen_name":"ChrissyDrobish","name":"Chrissy 
Drobish","id":405867377,"id_str":"405867377","indices":[13,28]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238580953089,"id_str":"365611238580953089","text":"@basel_12 \u0623\u0646\u0627 \u0628\u064a\u0646 \u0627\u062f\u064a\u0643 \u0648 \u0631\u062c\u0644\u064a\u0643 \u064a\u0627 \u0641\u0646\u062f\u0645","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611111065726977,"in_reply_to_status_id_str":"365611111065726977","in_reply_to_user_id":85339394,"in_reply_to_user_id_str":"85339394","in_reply_to_screen_name":"basel_12","user":{"id":425830918,"id_str":"425830918","name":"A\u043d\u043c\u03b5\u2202 K\u03c5\u0442\u0432\u0456","screen_name":"drAkutbi","location":"Jeddah, Saudi Arabia","url":"http:\/\/dooid.com\/drkutbi","description":"Hakuna Matata and be like water my friend , engaged to the most beautiful girl @rOoOr_alsh","protected":false,"followers_count":540,"friends_count":459,"listed_count":3,"created_at":"Thu Dec 01 13:53:02 +0000 
2011","favourites_count":0,"utc_offset":10800,"time_zone":"Riyadh","geo_enabled":false,"verified":false,"statuses_count":13128,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/414921439\/fish2.br.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/414921439\/fish2.br.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000249574802\/0035b1b5e28c3539e188f6048fb7f62f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000249574802\/0035b1b5e28c3539e188f6048fb7f62f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/425830918\/1375580850","profile_link_color":"0000FF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"9CD7F2","profile_text_color":"186577","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"basel_12","name":"Dr.Basel Salama","id":85339394,"id_str":"85339394","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238564179968,"id_str":"365611238564179968","text":"RT @zaynimstyle: @ZaynieStyle bu arada sen 1\/5 mi\u015fsin aq hemde zayn takip ediyo neyse cnm bionda \u00e7ok haniyy","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":486739807,"id_str":"486739807","name":"Ily Zayn","screen_name":"ZaynieStyle","location":"\u2605 Zayn\/5 \u2605","url":null,"description":"http:\/\/31.media.tumblr.com\/398578f494dd951554662e70a6ad0c59\/tumblr_mqe70pOboK1s1etjmo5_500.gifJuly 23nd 2010 - 8:22 pm \u221e","protected":false,"followers_count":2404,"friends_count":521,"listed_count":0,"created_at":"Wed Feb 08 16:22:59 +0000 2012","favourites_count":1254,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":22146,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/820292110\/e9158434d6d2a01ec24ae0104a0dd395.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/820292110\/e9158434d6d2a01ec24ae0104a0dd395.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244560169\/7121c38a585b60d7778839e75c0c8920_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244560169\/7121c38a585b60d7778839e75c0c8920_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/486739807\/1375701959","profile_link_color":"000303","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:04 +0000 2013","id":365611146947989504,"id_str":"365611146947989504","text":"@ZaynieStyle bu arada sen 1\/5 
mi\u015fsin aq hemde zayn takip ediyo neyse cnm bionda \u00e7ok haniyy","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610771578757120,"in_reply_to_status_id_str":"365610771578757120","in_reply_to_user_id":486739807,"in_reply_to_user_id_str":"486739807","in_reply_to_screen_name":"ZaynieStyle","user":{"id":728682283,"id_str":"728682283","name":"BOK KAFALI ZAYN","screen_name":"zaynimstyle","location":"ONE ((PERFECT)) DIRECTION","url":null,"description":"@harryology1D -HER \u015eEY\u0130M | B\u00fc\u015fra -EN DE\u011eERLIM","protected":false,"followers_count":5067,"friends_count":3868,"listed_count":2,"created_at":"Tue Jul 31 15:56:53 +0000 2012","favourites_count":3192,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":26607,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000034813891\/4e347ce9a92eb05622ba9a6849d0a407.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000034813891\/4e347ce9a92eb05622ba9a6849d0a407.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261846492\/dc82b97e99caf12bfa2ad9095d26abde_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261846492\/dc82b97e99caf12bfa2ad9095d26abde_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/728682283\/1376000250","profile_link_color":"010008","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":n
ull,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ZaynieStyle","name":"Ily Zayn","id":486739807,"id_str":"486739807","indices":[0,12]}]},"favorited":false,"retweeted":false,"lang":"tr"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"zaynimstyle","name":"BOK KAFALI ZAYN","id":728682283,"id_str":"728682283","indices":[3,15]},{"screen_name":"ZaynieStyle","name":"Ily Zayn","id":486739807,"id_str":"486739807","indices":[17,29]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576754692,"id_str":"365611238576754692","text":"@carloslespaul 69,69 %","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611033882140673,"in_reply_to_status_id_str":"365611033882140673","in_reply_to_user_id":415779929,"in_reply_to_user_id_str":"415779929","in_reply_to_screen_name":"carloslespaul","user":{"id":635612311,"id_str":"635612311","name":"Doctor J.","screen_name":"DavidMartinezbj","location":"Detr\u00e1s de t\u00ed. ","url":null,"description":"Jugador y gran amante del Baloncesto. #13. Doctor J. Detroit Bad Boys. Puedes aceptar el fracaso, pero no puedes aceptar el no intentarlo. 
25.","protected":false,"followers_count":463,"friends_count":194,"listed_count":2,"created_at":"Sat Jul 14 19:41:29 +0000 2012","favourites_count":1072,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":9207,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/762882490\/0cc6d819deab2f295ccad1cbc9b35f04.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/762882490\/0cc6d819deab2f295ccad1cbc9b35f04.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000167277464\/db01d4724de212ba5390a037f5984c83_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000167277464\/db01d4724de212ba5390a037f5984c83_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/635612311\/1375486906","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"carloslespaul","name":"Toglodita","id":415779929,"id_str":"415779929","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"sl"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238593531905,"id_str":"365611238593531905","text":"RT @GreaterThn: Victoria secret pink underwear > > > > > > > > > > > >","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":959247871,"id_str":"959247871","name":"\u2661lyssa\u2661","screen_name":"__alysssaaaa","location":"Indio, CA","url":null,"description":"Just a fun and weird person who loves sushi.","protected":false,"followers_count":704,"friends_count":543,"listed_count":0,"created_at":"Tue Nov 20 01:51:22 +0000 2012","favourites_count":2834,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":13382,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244574622\/1570d262f57d381c88b49aadb7f38d38_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244574622\/1570d262f57d381c88b49aadb7f38d38_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/959247871\/1375698739","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:42:17 +0000 2013","id":365603903846359040,"id_str":"365603903846359040","text":"Victoria secret pink underwear > > > > > > > > > > > >","source":"\u003ca href=\"http:\/\/www.socialoomph.com\" 
rel=\"nofollow\"\u003eSocialOomph\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":553905461,"id_str":"553905461","name":"Greater Than \u2122","screen_name":"GreaterThn","location":"","url":null,"description":"Send us your GreaterThans and we'll post them to our site :) greaterthns@gmail.com","protected":false,"followers_count":390161,"friends_count":135,"listed_count":237,"created_at":"Sat Apr 14 23:18:07 +0000 2012","favourites_count":15,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":11654,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FCFEFF","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3022898190\/194ff9970a9a66788dd7572045dacdc5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3022898190\/194ff9970a9a66788dd7572045dacdc5_normal.jpeg","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":434,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"GreaterThn","name":"Greater Than 
\u2122","id":553905461,"id_str":"553905461","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238572568576,"id_str":"365611238572568576","text":"@8_Danielo na dice......ya estas alteraooo jajajaja","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610769217359872,"in_reply_to_status_id_str":"365610769217359872","in_reply_to_user_id":573763695,"in_reply_to_user_id_str":"573763695","in_reply_to_screen_name":"8_Danielo","user":{"id":465909784,"id_str":"465909784","name":"Meeeel\u00f610 :))","screen_name":"COterosroldan","location":"","url":"http:\/\/tuenti.com","description":"Salir, beber, el rollo de siempre, contar mil historias, hablar con la gente. Melendi mi \u00eddolo","protected":false,"followers_count":397,"friends_count":446,"listed_count":0,"created_at":"Mon Jan 16 21:25:57 +0000 
2012","favourites_count":57,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":7957,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/705970561\/544d8c2bb69592877960ced1273ce77c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/705970561\/544d8c2bb69592877960ced1273ce77c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000095119850\/4cb8d7a226912a014b6cf29ae3eb9305_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000095119850\/4cb8d7a226912a014b6cf29ae3eb9305_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/465909784\/1373104218","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"8_Danielo","name":"Dani, Daniel:)","id":573763695,"id_str":"573763695","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238568374273,"id_str":"365611238568374273","text":"De lo bueno lo mejor ;)","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1610639288,"id_str":"1610639288","name":"Tatiana 
Flamero","screen_name":"TatianaFlamero","location":"Me alegra haberte conocido :) ","url":null,"description":"Vida loca :)","protected":false,"followers_count":31,"friends_count":79,"listed_count":0,"created_at":"Sun Jul 21 14:24:00 +0000 2013","favourites_count":3,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":91,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000166035910\/675ac84b06f29b0ccf2f7bd9b8314f34_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000166035910\/675ac84b06f29b0ccf2f7bd9b8314f34_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1610639288\/1374423100","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576754690,"id_str":"365611238576754690","text":"RT @DaniVans_: Cuantos tweets tendr\u00e1n tu nombre sin haberte mencionado.","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":147780515,"id_str":"147780515","name":"440;","screen_name":"DreamwithBiebs1","location":"","url":null,"description":null,"protected":false,"followers_count":1197,"friends_count":1294,"listed_count":2,"created_at":"Tue May 25 01:30:16 +0000 2010","favourites_count":163,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":9323,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/344918034408786999\/7b00c1609c82b3df5e42eb2d0367add8.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/344918034408786999\/7b00c1609c82b3df5e42eb2d0367add8.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261929938\/f7fc545a679472129e630ae7965825a0_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261929938\/f7fc545a679472129e630ae7965825a0_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/147780515\/1375359564","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Aug 05 14:26:25 +0000 2013","id":364391947890204674,"id_str":"364391947890204674","text":"Cuantos tweets tendr\u00e1n tu nombre sin haberte mencionado.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" 
rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1169912869,"id_str":"1169912869","name":"ONCE.","screen_name":"DaniVans_","location":"con @YunSmile_","url":"http:\/\/es.favstar.fm\/users\/DaniTuSonrisa_","description":"\u00bfQuieres ser Relaciones Publicas de SHOKO LIGHT MADRID? Mencioname, o da Rt a algun tuit de shoko, te sigo y te doy informacion.","protected":false,"followers_count":3046,"friends_count":51,"listed_count":5,"created_at":"Mon Feb 11 20:07:17 +0000 2013","favourites_count":444,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":1813,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"0AF71A","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/874939327\/a6085c1f46a404a65817143652f0ee65.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/874939327\/a6085c1f46a404a65817143652f0ee65.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000096918563\/bdaea8208fc50340cfde8c078c3ca8a2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000096918563\/bdaea8208fc50340cfde8c078c3ca8a2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1169912869\/1373997471","profile_link_color":"F5082F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":96,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":
false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"DaniVans_","name":"ONCE.","id":1169912869,"id_str":"1169912869","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555791363,"id_str":"365611238555791363","text":"RT @EbissetJordan: Les meufs qui disent que la coupe a beyonc\u00e9 est nul.. PTDRR JUSTE SON VISAGE VOUS FUME, ET FERMER LA AVEC VOS TISSAGE A \u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":581875263,"id_str":"581875263","name":"JUNIOR","screen_name":"holynaay","location":"Paris","url":"http:\/\/letmeplayou.tumblr.com\/","description":null,"protected":false,"followers_count":300,"friends_count":190,"listed_count":0,"created_at":"Wed May 16 12:29:26 +0000 
2012","favourites_count":120,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":10612,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/685556841\/886eec4b3c332a3db753fc7d688bd8d9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/685556841\/886eec4b3c332a3db753fc7d688bd8d9.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262064537\/9edbfc3d64ab0dbaa722d0b8c4b457fe_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262064537\/9edbfc3d64ab0dbaa722d0b8c4b457fe_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/581875263\/1375921193","profile_link_color":"DB2568","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 13:41:52 +0000 2013","id":365467900896747521,"id_str":"365467900896747521","text":"Les meufs qui disent que la coupe a beyonc\u00e9 est nul.. 
PTDRR JUSTE SON VISAGE VOUS FUME, ET FERMER LA AVEC VOS TISSAGE A 2EUROS LA","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web (M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":561404443,"id_str":"561404443","name":"Jordan Ebisset","screen_name":"EbissetJordan","location":"Paris ","url":null,"description":"Style | Mode..","protected":false,"followers_count":2493,"friends_count":100,"listed_count":4,"created_at":"Mon Apr 23 19:25:21 +0000 2012","favourites_count":5312,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":64904,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000020348564\/a379ce54592b679b14a88ab44a2d4fa0.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000020348564\/a379ce54592b679b14a88ab44a2d4fa0.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000139417348\/d154b4e447f4b8f7ddb4da03e256dfa1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000139417348\/d154b4e447f4b8f7ddb4da03e256dfa1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/561404443\/1373811396","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":29,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lan
g":"fr"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"EbissetJordan","name":"Jordan Ebisset","id":561404443,"id_str":"561404443","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238589345792,"id_str":"365611238589345792","text":"@DariaMalfoy que tal voy","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365605094190239744,"in_reply_to_status_id_str":"365605094190239744","in_reply_to_user_id":1586285108,"in_reply_to_user_id_str":"1586285108","in_reply_to_screen_name":"DariaMalfoy","user":{"id":1472298612,"id_str":"1472298612","name":"Teddy R. Lupin","screen_name":"TeddyRsLupin","location":"","url":"https:\/\/twitter.com\/pottertty","description":"Hijo de Remus Lupin y de @NymphraTonks.Novio de @VictoireWly. Metam\u00f3rfago.Animago y Patronus:Lobo Blanco. Futuro m\u00e9dico en San Mungo. 
#Pottertty","protected":false,"followers_count":432,"friends_count":310,"listed_count":0,"created_at":"Fri May 31 14:43:19 +0000 2013","favourites_count":190,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":1792,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/883448933\/19d6c792ab8ad2d9f95755fc5315fc9a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/883448933\/19d6c792ab8ad2d9f95755fc5315fc9a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248693797\/d87f4d614f2b27befb655c333b873c05_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248693797\/d87f4d614f2b27befb655c333b873c05_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1472298612\/1375765135","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"DariaMalfoy","name":"Daria Malfoy ","id":1586285108,"id_str":"1586285108","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555787264,"id_str":"365611238555787264","text":"RT @1D_5SOS_World28: @Calum5SOS hey Calum. Guess what. Seattle wants you back. 
#5SOSAcousticSeattle #SeattleWants5SOS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":579574557,"id_str":"579574557","name":"michael c'mon","screen_name":"NotAPretzel","location":"that place that rains a lot","url":"http:\/\/fangirlfridaysbitches.tumblr.com","description":"I just want to talk to Michael Clifford about How To Train Your Dragon.","protected":false,"followers_count":955,"friends_count":1335,"listed_count":3,"created_at":"Mon May 14 03:02:31 +0000 2012","favourites_count":5861,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":23528,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"709397","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme6\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme6\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000223000373\/e4ed7f9e13b72af842ec021f160068b8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000223000373\/e4ed7f9e13b72af842ec021f160068b8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/579574557\/1372909819","profile_link_color":"FF3300","profile_sidebar_border_color":"86A4A6","profile_sidebar_fill_color":"A0C5C7","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:08:08 +0000 
2013","id":365610406414262273,"id_str":"365610406414262273","text":"@Calum5SOS hey Calum. Guess what. Seattle wants you back. #5SOSAcousticSeattle #SeattleWants5SOS","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":403255314,"in_reply_to_user_id_str":"403255314","in_reply_to_screen_name":"Calum5SOS","user":{"id":1637411550,"id_str":"1637411550","name":"WWATOURSEATTLE","screen_name":"1D_5SOS_World28","location":"The Rainy City ","url":null,"description":"one direction and 5 seconds of summer are my whole life. if you like them, we can be friends :)","protected":false,"followers_count":130,"friends_count":375,"listed_count":0,"created_at":"Thu Aug 01 08:05:02 +0000 2013","favourites_count":783,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":426,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000237645050\/c601f93b86acb787ae88c981a4a3061a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000237645050\/c601f93b86acb787ae88c981a4a3061a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1637411550\/1375388355","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_coun
t":4,"entities":{"hashtags":[{"text":"5SOSAcousticSeattle","indices":[58,78]},{"text":"SeattleWants5SOS","indices":[79,96]}],"urls":[],"user_mentions":[{"screen_name":"Calum5SOS","name":"Calum Hood","id":403255314,"id_str":"403255314","indices":[0,10]}]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"5SOSAcousticSeattle","indices":[79,99]},{"text":"SeattleWants5SOS","indices":[100,117]}],"urls":[],"user_mentions":[{"screen_name":"1D_5SOS_World28","name":"WWATOURSEATTLE","id":1637411550,"id_str":"1637411550","indices":[3,19]},{"screen_name":"Calum5SOS","name":"Calum Hood","id":403255314,"id_str":"403255314","indices":[21,31]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585139200,"id_str":"365611238585139200","text":"\u65e5\u5927NOW","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1150325342,"id_str":"1150325342","name":"YUKI","screen_name":"gitaristyuki","location":"","url":null,"description":"\u30d7\u30ed\u30ec\u30b9\/\u4eee\u9762\u30e9\u30a4\u30c0\u30fc\/JOJO\/ABC\/BREAKERZ\/\u611b\u3057\u3066\u307e\u30fc\u3059\uff01\u8da3\u5473\u5408\u3046\u4eba\u30d5\u30a9\u30ed\u30fc\u3057\u3066\u306d\u2606\u3000\u3000\u3000\u3000\u3000\u6e05\u9675\u60c5\u58311\u5e74\u3002","protected":false,"followers_count":247,"friends_count":374,"listed_count":0,"created_at":"Tue Feb 05 08:16:32 +0000 
2013","favourites_count":23,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":907,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000151433842\/f04fd76ce1ba81b8a34edb74709dd184_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000151433842\/f04fd76ce1ba81b8a34edb74709dd184_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1150325342\/1374417484","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238589337600,"id_str":"365611238589337600","text":"@saqer_4 \n\u0645\u0646 \u0627\u0644\u0639\u0627\u064a\u062f\u064a\u0646 \u0648\u0643\u0644 \u0639\u0627\u0645 \u0648\u0623\u0646\u062a \u0628\u062e\u064a\u0631","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610042025717761,"in_reply_to_status_id_str":"365610042025717761","in_reply_to_user_id":521733027,"in_reply_to_user_id_str":"521733027","in_reply_to_screen_name":"saqer_4","user":{"id":375800523,"id_str":"375800523","name":"\u062f.\u0639\u0628\u062f\u0627\u0644\u0631\u062d\u0645\u0646 \u0627\u0644\u0639\u062a\u0644","screen_name":"drabofaris","location":"","url":null,"description":"\u062f\u0643\u062a\u0648\u0631\u0627\u0647 \u060c \u0623\u062f\u0628 \u062d\u062f\u064a\u062b \u060c \u0639\u0636\u0648 \u0644\u062c\u0646\u0629 \u0627\u0644\u0634\u0639\u0631 \u0628\u0646\u0627\u062f\u064a \u0627\u0644\u0631\u064a\u0627\u0636 \u0627\u0644\u0623\u062f\u0628\u064a. \u0634\u0627\u0639\u0631 \u0648\u0635\u062f\u0631 \u0644\u064a \u062f\u064a\u0648\u0627\u0646\u0627\u0646.\n\u0623\u063a\u0631\u062f \u0643\u0645\u0627 \u0623\u0646\u0627 \u0644\u0627 \u0643\u0645\u0627 \u064a\u0631\u064a\u062f\u0648\u0646.!","protected":false,"followers_count":477,"friends_count":284,"listed_count":3,"created_at":"Sun Sep 18 19:15:56 +0000 
2011","favourites_count":8,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":7260,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3363869493\/2fc7895aa79e6eba2a70c18f862f6594_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3363869493\/2fc7895aa79e6eba2a70c18f862f6594_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/375800523\/1369431966","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"saqer_4","name":"\u0635\u0642\u0631 \u0639\u0628\u062f\u0627\u0644\u0625\u0644\u0647 \u0627\u0644\u062d\u0627\u062a\u0645 ","id":521733027,"id_str":"521733027","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238593536000,"id_str":"365611238593536000","text":"RT @Alpergul1: Bayram ba\u015flad\u0131 (@ G\u00fczelyal\u0131 Yat Liman\u0131 w\/ 11 others) [pic]: http:\/\/t.co\/XEdqLpmALH","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":297102445,"id_str":"297102445","name":"osman 
cakl\u0131","screen_name":"Osmancakli","location":"T\u00fcrkiye","url":"http:\/\/www.facebook.com\/#!\/osman.cakli","description":"Ba\u015fkas\u0131n\u0131n y\u00fcz\u00fcne at\u0131lan tokat\u0131 kendi y\u00fcz\u00fcnde hissedenler insand\u0131r.\r\nI'M CHE PULCU","protected":false,"followers_count":1050,"friends_count":610,"listed_count":3,"created_at":"Wed May 11 23:06:27 +0000 2011","favourites_count":132,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":26257,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/880546769\/c16398779816bf1d5552db487cdfefb1.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/880546769\/c16398779816bf1d5552db487cdfefb1.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000119109251\/50ff33331b97e2a6f7eb8b13b9974262_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000119109251\/50ff33331b97e2a6f7eb8b13b9974262_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/297102445\/1374521394","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 21:23:41 +0000 2013","id":365584121264742400,"id_str":"365584121264742400","text":"Bayram ba\u015flad\u0131 (@ G\u00fczelyal\u0131 Yat Liman\u0131 w\/ 11 others) [pic]: http:\/\/t.co\/XEdqLpmALH","source":"\u003ca href=\"http:\/\/foursquare.com\" 
rel=\"nofollow\"\u003efoursquare\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":457839803,"id_str":"457839803","name":"Alper G\u00fcl","screen_name":"Alpergul1","location":"Turkey- Sweden","url":null,"description":"^#BreakingBad #Supernatural #NBA \/ Bursa","protected":false,"followers_count":111,"friends_count":77,"listed_count":0,"created_at":"Sat Jan 07 21:51:41 +0000 2012","favourites_count":140,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":2533,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/838142981\/1e0463a0cea1850f2000fdca7108b91d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/838142981\/1e0463a0cea1850f2000fdca7108b91d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000202425896\/ef8da1567b78db4caaf075616266a136_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000202425896\/ef8da1567b78db4caaf075616266a136_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/457839803\/1373406843","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/XEdqLpmALH","expanded_url":"http:\/\/4sq.com\/196woEU","display_url":"4sq.com\/196woEU","indices":[60,82]}],"user_mentions":[]},"favorited":false,"retweeted":false,"
possibly_sensitive":true,"lang":"tr"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Alpergul1","name":"Alper G\u00fcl","id":457839803,"id_str":"457839803","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238572568577,"id_str":"365611238572568577","text":"@sagahigashi911 \u3053\u3046\u3059\u3051\u3042\u308a\u304c\u3068(*\uff9f\u25bd\uff9f*)\uff01\uff01","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365604265198227456,"in_reply_to_status_id_str":"365604265198227456","in_reply_to_user_id":1324554019,"in_reply_to_user_id_str":"1324554019","in_reply_to_screen_name":"sagahigashi911","user":{"id":597410102,"id_str":"597410102","name":"\u3042\u3060\u3053\u308d","screen_name":"coocchan719","location":"","url":null,"description":"\u4f1d\u7fd2\u9928 \u897f\u5357\u82f1\u5c02 wim \u3042\u3060\u3061(\u0e51*\u0c6a*\u0e51)","protected":false,"followers_count":242,"friends_count":245,"listed_count":1,"created_at":"Sat Jun 02 13:27:59 +0000 
2012","favourites_count":562,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1245,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000132001277\/d151eac6c4a1ff3e46d9fa6832231f45_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000132001277\/d151eac6c4a1ff3e46d9fa6832231f45_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/597410102\/1367132010","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"sagahigashi911","name":"\u53e4\u8cc0\u5f18\u7950","id":1324554019,"id_str":"1324554019","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238589341696,"id_str":"365611238589341696","text":"@justinbieber me salvas cada noche.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":27260086,"in_reply_to_user_id_str":"27260086","in_reply_to_screen_name":"justinbieber","user":{"id":526416116,"id_str":"526416116","name":"Alex.","screen_name":"myhugejustin","location":"","url":"http:\/\/livestotell.tumblr.com","description":"\u00ab Las calles de Par\u00eds se mor\u00edan de envidia de aquel callej\u00f3n 
oscuro donde nos bes\u00e1bamos.\u00bb 12.09.11. Panda.","protected":false,"followers_count":870,"friends_count":282,"listed_count":5,"created_at":"Fri Mar 16 13:51:23 +0000 2012","favourites_count":7769,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":true,"verified":false,"statuses_count":28459,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"E0E094","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041056030\/d6061a1316861ae240cade67512fdd58.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041056030\/d6061a1316861ae240cade67512fdd58.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000207887657\/26f52fbedb61a3d94c91e35ec0302112_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000207887657\/26f52fbedb61a3d94c91e35ec0302112_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/526416116\/1375886222","profile_link_color":"D95E68","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"justinbieber","name":"Justin Bieber","id":27260086,"id_str":"27260086","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585139201,"id_str":"365611238585139201","text":"RT @PielTatuada: Tatuaje nocturno. Rt si te gusta. 
http:\/\/t.co\/1Tp5lRNtwN","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":459648030,"id_str":"459648030","name":"Sta.mary","screen_name":"marypg11","location":"","url":null,"description":"Nunca dejes que nadie te diga que no puedes hacer algo. Si tienes un sue\u00f1o, tienes que protegerlo.En busca de la felicidad","protected":false,"followers_count":224,"friends_count":368,"listed_count":0,"created_at":"Mon Jan 09 22:18:16 +0000 2012","favourites_count":16,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1593,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/472887492\/foto__55.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/472887492\/foto__55.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3713031166\/ee9cebd600b91f67f9d08d8cc19a4eb4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3713031166\/ee9cebd600b91f67f9d08d8cc19a4eb4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/459648030\/1374418882","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:08:01 +0000 2013","id":365610379109343232,"id_str":"365610379109343232","text":"Tatuaje 
nocturno. Rt si te gusta. http:\/\/t.co\/1Tp5lRNtwN","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":258061521,"id_str":"258061521","name":"\u25ba TATUAJES \u25c4","screen_name":"PielTatuada","location":"Colombia","url":null,"description":"Mi cuerpo es mi diario, y mis tatuajes son mi historia.","protected":false,"followers_count":140226,"friends_count":37701,"listed_count":167,"created_at":"Sat Feb 26 21:47:00 +0000 2011","favourites_count":1549,"utc_offset":-18000,"time_zone":"Bogota","geo_enabled":false,"verified":false,"statuses_count":791,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"080003","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043756285\/4278e2f062a74c3cb5eac420ddcb2b3a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043756285\/4278e2f062a74c3cb5eac420ddcb2b3a.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000237707973\/7f3ec8c5da50526781098013097f3937_normal.gif","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000237707973\/7f3ec8c5da50526781098013097f3937_normal.gif","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/258061521\/1375580224","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":43,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365610379113537536,"id_str":"365610379113537536","indices":[34,56],"media_url":"http:\/\/
pbs.twimg.com\/media\/BRLos-VCIAAEvvM.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLos-VCIAAEvvM.jpg","url":"http:\/\/t.co\/1Tp5lRNtwN","display_url":"pic.twitter.com\/1Tp5lRNtwN","expanded_url":"http:\/\/twitter.com\/PielTatuada\/status\/365610379109343232\/photo\/1","type":"photo","sizes":{"medium":{"w":598,"h":272,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":598,"h":272,"resize":"fit"},"small":{"w":340,"h":155,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"PielTatuada","name":"\u25ba TATUAJES \u25c4","id":258061521,"id_str":"258061521","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238559985665,"id_str":"365611238559985665","text":"RT @jetlifeAF: What's good for tomorrow night?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":504574566,"id_str":"504574566","name":"E\u303d_","screen_name":"EmmaHobson_JL","location":"","url":null,"description":"17 HudCity NewYorkk ; #JetLife #OITNB #Equestrian","protected":false,"followers_count":243,"friends_count":310,"listed_count":0,"created_at":"Sun Feb 26 14:30:23 +0000 
2012","favourites_count":666,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":2058,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/586398335\/rr7uoatjplod8uz45bd2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/586398335\/rr7uoatjplod8uz45bd2.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252856712\/47c6111216d560483a44e670b0681328_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252856712\/47c6111216d560483a44e670b0681328_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/504574566\/1375409131","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:04 +0000 2013","id":365611146440486912,"id_str":"365611146440486912","text":"What's good for tomorrow night?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":48233067,"id_str":"48233067","name":"\u25b2v\u0119r\u00ff","screen_name":"jetlifeAF","location":"NY ","url":null,"description":"#TDE #JetLife #GoodMusic #ProERA #BeastCoast stay loud my friends - instagram\u2566\u2564\u2500 
jetlifeAF","protected":false,"followers_count":464,"friends_count":309,"listed_count":0,"created_at":"Thu Jun 18 03:41:42 +0000 2009","favourites_count":5478,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":12015,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"787A80","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/462189141\/artworks-000017989708-fkbod9-crop.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/462189141\/artworks-000017989708-fkbod9-crop.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3622987162\/691cf5521b9ae22b769e18c9db32b864_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3622987162\/691cf5521b9ae22b769e18c9db32b864_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/48233067\/1362964933","profile_link_color":"54665A","profile_sidebar_border_color":"162911","profile_sidebar_fill_color":"37B34A","profile_text_color":"C0DBCC","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"jetlifeAF","name":"\u25b2v\u0119r\u00ff","id":48233067,"id_str":"48233067","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238576754691,"id_str":"365611238576754691","text":"@abdirmo of course, as always!! 
Lol and where do you work?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610988768210945,"in_reply_to_status_id_str":"365610988768210945","in_reply_to_user_id":232338576,"in_reply_to_user_id_str":"232338576","in_reply_to_screen_name":"abdirmo","user":{"id":400473178,"id_str":"400473178","name":"Baza.","screen_name":"baza_boo","location":"Minneapolis, Minnesota","url":"https:\/\/www.facebook.com\/bazaselassie","description":"University of Minnesota | Marketing & International Business Development | Giraffe Aficionado | Daughter of the King of Kings","protected":false,"followers_count":422,"friends_count":344,"listed_count":3,"created_at":"Sat Oct 29 02:58:33 +0000 2011","favourites_count":390,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":9576,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3684084291\/8c5fe9dda171403c924289ce169562e1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3684084291\/8c5fe9dda171403c924289ce169562e1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/400473178\/1367429393","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"
user_mentions":[{"screen_name":"abdirmo","name":"Adam Rage","id":232338576,"id_str":"232338576","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238568370176,"id_str":"365611238568370176","text":"FIRED UP after my first day of Leadership Summit. So many great speakers and encouraging words. Can't wait for tomorrow! #wcagls","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":19804096,"id_str":"19804096","name":"Lindsey White","screen_name":"lindswhy","location":"Franklin, TN","url":null,"description":"Selfie taker. Coffee drinker. Donut eater. Big dreamer.","protected":false,"followers_count":271,"friends_count":286,"listed_count":2,"created_at":"Sat Jan 31 04:03:49 +0000 2009","favourites_count":2030,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":8480,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/886485392\/6e057ea7a5a54f07a5effbf8e0ea6798.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/886485392\/6e057ea7a5a54f07a5effbf8e0ea6798.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000070221732\/a97b30263205d765a44c9dd451f7ab3d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000070221732\/a97b30263205d765a44c9dd451f7ab3d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/19804096\/1374561118","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"wcagls","indices":[121,128]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238585147392,"id_str":"365611238585147392","text":"@algohany11 \u0647\u0647\u0647\u0647 \u0631\u0647\u064a\u0628","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610184384593920,"in_reply_to_status_id_str":"365610184384593920","in_reply_to_user_id":482710008,"in_reply_to_user_id_str":"482710008","in_reply_to_screen_name":"algohany11","user":{"id":751208756,"id_str":"751208756","name":"\u064a\u0648\u0645\u064a\u0627\u062a 
\u0637\u0627\u0644\u0628","screen_name":"6alleb","location":"","url":null,"description":"\u0627\u0644\u0644\u0647\u0645 \u0623\u0639\u0632 \u0627\u0644\u0627\u0633\u0644\u0627\u0645 \u0648\u0627\u0644\u0645\u0633\u0644\u0645\u064a\u0646","protected":false,"followers_count":1690,"friends_count":1692,"listed_count":2,"created_at":"Sat Aug 11 12:40:41 +0000 2012","favourites_count":63,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2074,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3419622092\/12827052ca6c1e9e57b87b27a0fae818_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3419622092\/12827052ca6c1e9e57b87b27a0fae818_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/751208756\/1364062056","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"algohany11","name":"\u0623\u0628\u0648\u062a\u0627\u0644\u0627 \u0627\u0644\u0639\u0644\u0627\u0637\u064a","id":482710008,"id_str":"482710008","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238559981568,"id_str":"365611238559981568","text":"@mario_hart @AleBaigorria1 @DianaSanchez_04 @Pantera_Zegarra @IsraDreyfus Vamos Chicos 
Seran campeones esta temporda #FuerzaVerde","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":351895595,"in_reply_to_user_id_str":"351895595","in_reply_to_screen_name":"mario_hart","user":{"id":222582608,"id_str":"222582608","name":"Stephy Trespalacios","screen_name":"StephyTa_TM","location":"ECUADOR","url":null,"description":"La vida est\u00e1 llena de obst\u00e1culos que son dif\u00edciles de superar pero con paciencia y perseverancia todo se puede y se logra ... Stephy \u2665Bb Pin 22CCC8A0 \u2665Canto","protected":false,"followers_count":21,"friends_count":218,"listed_count":0,"created_at":"Fri Dec 03 20:57:50 +0000 2010","favourites_count":0,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":195,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/835180049\/3296c0ce3a31a2e593f167e71b0f5db0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/835180049\/3296c0ce3a31a2e593f167e71b0f5db0.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000073117482\/0718ad87646220176232f117b39fd6d2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000073117482\/0718ad87646220176232f117b39fd6d2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/222582608\/1365222206","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"FuerzaVerde","indices"
:[118,130]}],"urls":[],"user_mentions":[{"screen_name":"mario_hart","name":"MARIO HART","id":351895595,"id_str":"351895595","indices":[0,11]},{"screen_name":"AleBaigorria1","name":"Alejandra Baigorria","id":516860000,"id_str":"516860000","indices":[12,26]},{"screen_name":"DianaSanchez_04","name":"Diana Sanchez","id":479204092,"id_str":"479204092","indices":[27,43]},{"screen_name":"Pantera_Zegarra","name":"David Zegarra \u270c","id":765719748,"id_str":"765719748","indices":[44,60]},{"screen_name":"IsraDreyfus","name":"Israel Dreyfus","id":1388544253,"id_str":"1388544253","indices":[61,73]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238593556480,"id_str":"365611238593556480","text":"Just added hot new product https:\/\/t.co\/XF8ZxuiHtR to my boutique https:\/\/t.co\/zfr6Ant79t. Check it out: http:\/\/t.co\/3UGwDYdCF6","source":"\u003ca href=\"https:\/\/www.kitsylane.com\" rel=\"nofollow\"\u003eKitsy Lane\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1328217343,"id_str":"1328217343","name":"Crystal Rivera","screen_name":"shyloa10","location":"","url":null,"description":null,"protected":false,"followers_count":0,"friends_count":40,"listed_count":0,"created_at":"Fri Apr 05 02:13:22 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":460,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3619056921\/306f6eea2b805180cf3bbb02605c93a5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3619056921\/306f6eea2b805180cf3bbb02605c93a5_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/XF8ZxuiHtR","expanded_url":"https:\/\/corasstyle.kitsylane.com\/index.php?file=product_detail&pId=2435","display_url":"corasstyle.kitsylane.com\/index.php?file\u2026","indices":[27,50]},{"url":"https:\/\/t.co\/zfr6Ant79t","expanded_url":"https:\/\/corasstyle.kitsylane.com\/","display_url":"corasstyle.kitsylane.com","indices":[66,89]}],"user_mentions":[],"media":[{"id":365611238597750785,"id_str":"365611238597750785","indices":[105,127],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpfAKCcAET1Rl.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpfAKCcAET1Rl.jpg","url":"http:\/\/t.co\/3UGwDYdCF6","display_url":"pic.twitter.com\/3UGwDYdCF6","expanded_url":"http:\/\/twitter.com\/shyloa10\/status\/365611238593556480\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":432,"h":432,"resize":"fit"},"m
edium":{"w":432,"h":432,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238555791364,"id_str":"365611238555791364","text":"J'ai Frooooid :$","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":477884580,"id_str":"477884580","name":"Mlle Bedji :)","screen_name":"perle_precieuse","location":"","url":null,"description":"BBM PIN : 2A14F12A .","protected":false,"followers_count":180,"friends_count":175,"listed_count":0,"created_at":"Sun Jan 29 17:33:13 +0000 2012","favourites_count":155,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":13700,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"709397","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme6\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme6\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257462957\/446ff4a351a6c748cd2d18609bd6dffb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257462957\/446ff4a351a6c748cd2d18609bd6dffb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/477884580\/1375232599","profile_link_color":"FF3300","profile_sidebar_border_color":"86A4A6","profile_sidebar_fill_color":"A0C5C7","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_
count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:26 +0000 2013","id":365611238593536001,"id_str":"365611238593536001","text":"RT @TheFunnyVines: How I feel about back to school commercials https:\/\/t.co\/tHfmSHicoM","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":391462205,"id_str":"391462205","name":"Alyssa Hughes","screen_name":"lyssssx77","location":"","url":null,"description":null,"protected":false,"followers_count":855,"friends_count":352,"listed_count":1,"created_at":"Sat Oct 15 16:09:42 +0000 2011","favourites_count":1950,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":23589,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000209616005\/47a35e47bfdf25f41197feb53493e7f7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000209616005\/47a35e47bfdf25f41197feb53493e7f7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/391462205\/1375120195","profile_link_color":"CC3366","profile_sidebar_border_color":"DBE9ED","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null
,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 02:35:43 +0000 2013","id":365300257942282243,"id_str":"365300257942282243","text":"How I feel about back to school commercials https:\/\/t.co\/tHfmSHicoM","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":612009655,"id_str":"612009655","name":"Best Vines","screen_name":"TheFunnyVines","location":"NOT Affiliated With Vine!","url":null,"description":"*Original Fan\/Parody Account* We do NOT own the Vines that are posted! They are from @VineApp users!","protected":false,"followers_count":635450,"friends_count":33,"listed_count":281,"created_at":"Mon Jun 18 21:17:51 +0000 2012","favourites_count":313,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":656,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3760779837\/f95bc9859407976e3b7e0d8cfdaaf929_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3760779837\/f95bc9859407976e3b7e0d8cfdaaf929_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/612009655\/1369869750","profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo"
:null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2824,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/tHfmSHicoM","expanded_url":"https:\/\/vine.co\/v\/h7idpbeJd07","display_url":"vine.co\/v\/h7idpbeJd07","indices":[44,67]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TheFunnyVines","name":"Best Vines","id":612009655,"id_str":"612009655","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754281472,"id_str":"365611242754281472","text":"taco making and then gym time.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":33294810,"id_str":"33294810","name":"miss mi","screen_name":"a_symphony","location":"romans 10:9-11","url":null,"description":"christian woman || songwriter || freelance stage manager || actress","protected":false,"followers_count":166,"friends_count":225,"listed_count":0,"created_at":"Sun Apr 19 19:54:46 +0000 2009","favourites_count":760,"utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":16834,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000033610156\/f68e4a3822b53bae2c8348a7a33d3c10.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000033610156\/f68e4a3822b53bae2c8348a7a33d3c10.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000141184776\/ee46237d7f2f16b977fd0f41d10b98d9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000141184776\/ee46237d7f2f16b977fd0f41d10b98d9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/33294810\/1374028764","profile_link_color":"CF0202","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"0022E0","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754281473,"id_str":"365611242754281473","text":"\ud83d\ude21 Extremely pissed off","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":733521686,"id_str":"733521686","name":"snow white\u2661","screen_name":"SaigesTink","location":"","url":null,"description":"Short sweet and to the point 
\u2020","protected":false,"followers_count":401,"friends_count":249,"listed_count":0,"created_at":"Thu Aug 02 20:49:33 +0000 2012","favourites_count":916,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":33139,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme4\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme4\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261444089\/348dc5b1fabb11a01d57bc3ed18e7d37_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261444089\/348dc5b1fabb11a01d57bc3ed18e7d37_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/733521686\/1375502969","profile_link_color":"0099B9","profile_sidebar_border_color":"5ED4DC","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758488064,"id_str":"365611242758488064","text":"Vieja de mierda #TeDetesto #Callate","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1263198788,"id_str":"1263198788","name":"BAR 
\u25bc","screen_name":"BarbaraSanes","location":"Rosario","url":null,"description":"Futura profesora de Educaci\u00f3n F\u00edsica","protected":false,"followers_count":64,"friends_count":169,"listed_count":0,"created_at":"Wed Mar 13 00:22:58 +0000 2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":359,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045509043\/f8d50a2944f5ea2837f4a937d562338d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045509043\/f8d50a2944f5ea2837f4a937d562338d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000112493218\/527007a9993f638f4ac5fda47afd71f9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000112493218\/527007a9993f638f4ac5fda47afd71f9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1263198788\/1373473132","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"TeDetesto","indices":[16,26]},{"text":"Callate","indices":[27,35]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758483968,"id_str":"365611242758483968","text":"ainda bem, pq sen\u00e3o ia ter que ir no Baile dos Pais no 
clube","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":703967671,"id_str":"703967671","name":"Ana Cardoso","screen_name":"AnaCardoso__","location":"","url":null,"description":"O ser humano inventou a linguagem para satisfazer a sua profunda necessidade de se queixar.","protected":false,"followers_count":224,"friends_count":268,"listed_count":0,"created_at":"Wed Jul 18 23:15:58 +0000 2012","favourites_count":319,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":6339,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/685323576\/3fc17bcf3fa19aa26f2384f1309b70d6.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/685323576\/3fc17bcf3fa19aa26f2384f1309b70d6.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000191904955\/77e5196ad8d91a7f61e076690dad86aa_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000191904955\/77e5196ad8d91a7f61e076690dad86aa_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/703967671\/1372297626","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:27 +0000 
2013","id":365611242771054593,"id_str":"365611242771054593","text":"\u0647\u0627\u0646\u062a \u0639\u0644\u064a\u0647.. \u0639\u0634\u0631\u0629 \u0632\u0645\u0627\u0646.. \u0631\u0627\u062d \u0627\u0644\u0648\u0641\u0649.. \u0648\u0645\u0627\u062a \u0627\u0644\u0627\u0645\u0627\u0646..\u201d","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":447930131,"id_str":"447930131","name":"\u0645\u0627\u0639\u0627\u062f \u064a\u0647\u0640\u0645\u064d","screen_name":"Rwwani","location":"","url":"http:\/\/ask.fm\/rnoush1","description":"\u0639\u0634\u0642\u062a \u0631\u062c\u0644\u0627 \u062c\u0639\u0644\u0646\u064a \u0641\u062a\u0627\u0629 \u0645\u062f\u0644\u0644\u0629 \u0642\u062f \u064a\u0643\u0648\u0646 \u0647\u0648 \u0633\u0631 \u0643\u0628\u0631\u064a\u0627\u062a\u064a \u0648\u063a\u0631\u0648\u0631\u064a \u0644\u0643\u0646 \u0627\u0644\u0623\u0643\u064a\u062f \u0627\u0646\u0647 \u0645\u0635\u062f\u0631 \u062b\u0642\u062a\u064a \u0641 \u0639\u0641\u0648\u0627 \u064a\u0627 \u0631\u062c\u0627\u0644 \u0627\u0644\u0639\u0627\u0644\u0645 \u0644\u0646 \u062a\u0634\u0628\u0647\u0648 \u0638\u0644\u0647\u2661#\u0627\u0628\u064a\u2764\u0627\u0644\u0645\u062f\u0631\u064a\u062f \u0648\u0641\u0642\u0637!\u0627\u0633\u062a\u063a\u0641\u0631\u0627\u0644\u0644\u0647 \u0648\u0627\u062a\u0648\u0628 \u0627\u0644\u064a\u0647","protected":false,"followers_count":897,"friends_count":629,"listed_count":1,"created_at":"Tue Dec 27 12:23:35 +0000 
2011","favourites_count":536,"utc_offset":10800,"time_zone":"Kuwait","geo_enabled":true,"verified":false,"statuses_count":10713,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261833262\/815197703217896a95ea35968a8d3fbc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261833262\/815197703217896a95ea35968a8d3fbc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/447930131\/1374049479","profile_link_color":"088253","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758475776,"id_str":"365611242758475776","text":"\"@TeamNikeBoi_: \u201cI really don't care who was \"\ud83d\udc48Before\" me, as long as there's no one \"During\ud83d\udc47\ud83d\udc4d\" me..\u201d\"","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":275010837,"id_str":"275010837","name":"\u2665 FMOI:MonieMulah ","screen_name":"_MonicaMonroe","location":"In Your Dreams ; 
)","url":null,"description":"Just A Young Girl Chasing A BIG Dream .. \u2665 , You Can Hate Me , But Why Knock My Hustle ? , Follow Me Or Swallow Me ;*","protected":false,"followers_count":969,"friends_count":814,"listed_count":0,"created_at":"Thu Mar 31 13:18:17 +0000 2011","favourites_count":226,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":29676,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/847269777\/7f9fef978acc068f9de754796c2b7b61.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/847269777\/7f9fef978acc068f9de754796c2b7b61.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261638684\/0fd4db0afbf707d69495fd93b872d1b6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261638684\/0fd4db0afbf707d69495fd93b872d1b6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/275010837\/1373893665","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F7F2F7","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TeamNikeBoi_","name":"Muslim1st[R.I.P BY]","id":527243026,"id_str":"527243026","indices":[1,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758479872,"id_str":"365611242758479872","text":"Hoy conoc\u00ed a.bogot\u00e1 de.una manera particular.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" 
rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":135323453,"id_str":"135323453","name":"Alejandro Hernandez","screen_name":"AlejoHndez_15","location":"","url":null,"description":"\u00a1Si uno conociera lo que tiene, con tanta claridad como conoce lo que le falta!... Mario Benedetti http:\/\/instagram.com\/alejohndez_15","protected":false,"followers_count":117,"friends_count":100,"listed_count":0,"created_at":"Wed Apr 21 00:21:38 +0000 2010","favourites_count":247,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":4998,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/876855915\/aed13dc38397de992c0d4d5712f0fbc0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/876855915\/aed13dc38397de992c0d4d5712f0fbc0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000151054227\/0ce8cc8f67b4246e0af9124d657b6ccf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000151054227\/0ce8cc8f67b4246e0af9124d657b6ccf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/135323453\/1368223767","profile_link_color":"0E34F0","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"0B0621","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filte
r_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242750091264,"id_str":"365611242750091264","text":"\u2600\u2600\u2600\u2600\u2600\u2600\n\nYOU ARE MY EVERYTHING\n\nPLEASE MAKE MY LIFE \n\nAND FOLLOW ME\n\nI LOVE YOU SO MUCH\n\nMY BIRTHDAY IS IN 9 DAYS\n\n@Harry_Styles \n\n\u2600\u2600\u2600\u2600\u2600\u2600srs","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":466937178,"id_str":"466937178","name":"HARRY!!!!","screen_name":"skeletonharry","location":"\u2661harry styles\u2661","url":null,"description":"\u2022\u2022 harry is my moon because he brightens up even the darkest nights \u2022\u2022","protected":false,"followers_count":1831,"friends_count":585,"listed_count":10,"created_at":"Tue Jan 17 23:24:26 +0000 2012","favourites_count":4020,"utc_offset":14400,"time_zone":"Abu 
Dhabi","geo_enabled":false,"verified":false,"statuses_count":43849,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000034915451\/7efba14f05471da414995883432a6ac4.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000034915451\/7efba14f05471da414995883432a6ac4.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262011157\/2cc416a3d8fddb942a3c2ae33b84e7e9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262011157\/2cc416a3d8fddb942a3c2ae33b84e7e9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/466937178\/1375582446","profile_link_color":"038543","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Harry_Styles","name":"Harry Styles","id":181561712,"id_str":"181561712","indices":[114,127]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242771054592,"id_str":"365611242771054592","text":"(Carter) You've been tasked with coming up with the most inventive way for Black Widow to be killed off in the... 
http:\/\/t.co\/2E7Dwlod5z","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1416994622,"id_str":"1416994622","name":"The MCU","screen_name":"MarvelCU","location":"","url":"http:\/\/www.Facebook.com\/marvelcinematicuniverse","description":"This is the Twitter of the Marvel Cinematic Universe, bringing you the latest and most up to date information on the Marvel Cinematic Universe movies!","protected":false,"followers_count":247,"friends_count":41,"listed_count":2,"created_at":"Fri May 10 02:39:03 +0000 2013","favourites_count":17,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":610,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3639850731\/13fc9285311f8964270cc4a53a63b01b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3639850731\/13fc9285311f8964270cc4a53a63b01b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1416994622\/1368217025","profile_link_color":"0084B4","profile_sidebar_border_color":"A8C7F7","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/2E7Dwlod5z","expanded_url":"htt
p:\/\/fb.me\/6rB1zSgiU","display_url":"fb.me\/6rB1zSgiU","indices":[114,136]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242775248897,"id_str":"365611242775248897","text":"Como nino Disfrutando de una compota =) http:\/\/t.co\/5Uh5lLYlDX","source":"\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":605812007,"id_str":"605812007","name":"Mervin Martinez","screen_name":"08Mervin","location":"","url":null,"description":"Si me preguntas si soy del barsa o del Madrid mi respuesta ser\u00e1 SOY VINOTINTO HAZTA LA MUERTE.","protected":false,"followers_count":41,"friends_count":153,"listed_count":0,"created_at":"Mon Jun 11 22:50:56 +0000 2012","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":360,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257290938\/36b9b601e072cb2080672590407c4632_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257290938\/36b9b601e072cb2080672590407c4632_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/605812007\/1364086118","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,
"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/5Uh5lLYlDX","expanded_url":"http:\/\/instagram.com\/p\/cxSZ1EDzd0\/","display_url":"instagram.com\/p\/cxSZ1EDzd0\/","indices":[40,62]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242787831809,"id_str":"365611242787831809","text":"God lets us make wrong decisions so we can learn how to make the right ones. #UnashamedImpact","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":32812725,"id_str":"32812725","name":"Ray Silva \u2020","screen_name":"lilray8748","location":"Denver, Co.","url":"http:\/\/facebook.com\/I.Am.Lil.Ray","description":"Improving my relationship with Jesus Christ everyday. A wrestling tweet may slip in every now and then. 
#Agape","protected":false,"followers_count":1483,"friends_count":1438,"listed_count":13,"created_at":"Sat Apr 18 06:17:20 +0000 2009","favourites_count":38,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":8186,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044469937\/76777593f4023cf94fc7118169e1c3b2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044469937\/76777593f4023cf94fc7118169e1c3b2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261513024\/0457c7634173df715df6d5a28b5195ee_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261513024\/0457c7634173df715df6d5a28b5195ee_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/32812725\/1375655552","profile_link_color":"7127A3","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"AD0707","profile_text_color":"EBEBEB","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"UnashamedImpact","indices":[77,93]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779455489,"id_str":"365611242779455489","text":"Sense of sore loseritues me thinks #trololol","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":38983578,"id_str":"38983578","name":"Aaron 
Bowers","screen_name":"DubleAA_","location":"Crieff, Scotland","url":null,"description":"Former Cunt of the Year (2012)","protected":false,"followers_count":108,"friends_count":259,"listed_count":1,"created_at":"Sun May 10 03:03:49 +0000 2009","favourites_count":214,"utc_offset":3600,"time_zone":"Edinburgh","geo_enabled":true,"verified":false,"statuses_count":7903,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/849188958\/f7ea4f768469f42d48099245f4ba0372.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/849188958\/f7ea4f768469f42d48099245f4ba0372.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3752455238\/3f94c46235b0e1d29503963de823b76f_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3752455238\/3f94c46235b0e1d29503963de823b76f_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/38983578\/1348365195","profile_link_color":"2FC2EF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"trololol","indices":[35,44]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779443203,"id_str":"365611242779443203","text":"I think that I'm sick hh","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":950113352,"id_str":"950113352","name":"Moon \u2661'","screen_name":"_MOON_M","location":"Dxb ","url":null,"description":"senior 13\u2665, going to 17, Kpop fan, adore choi min-ho","protected":false,"followers_count":35,"friends_count":54,"listed_count":0,"created_at":"Thu Nov 15 17:10:40 +0000 2012","favourites_count":19,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1912,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/712339078\/9aa835a7fc35ca94519af2d81e8c8590.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/712339078\/9aa835a7fc35ca94519af2d81e8c8590.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000153380039\/054b641b3024e34fe35604ffe3a6ec7b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000153380039\/054b641b3024e34fe35604ffe3a6ec7b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/950113352\/1374185084","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779443200,"id_str":"365611242779443200","text":"EU TE 
AMO \u2764 EU TE AMO \u2764 EU TE AMO \u2764 EU TE AMO \u2764 EU TE... \u2014 Eu te amo mais \u2665 Eu te amo mais \u2665 Eu te amo mais \u2665 Eu... http:\/\/t.co\/2pdOqgwaSm","source":"\u003ca href=\"http:\/\/ask.fm\/\" rel=\"nofollow\"\u003eAsk.fm\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1131935138,"id_str":"1131935138","name":"\u2607Kim\u25bfmyonsu\u00ae\u25b1","screen_name":"CheWillBeloved","location":"","url":"http:\/\/ask.fm\/CheWillBeloved","description":"\ucca0\uc5c6\ub294 \uc5b4\ub9b0\uc560\ucc98\ub7fc \uc2a4\uc2a4\ub85c \ucee8\ud2b8\ub864\uc744 \ud558\uc9c0!\ub0a8\uc790\uac00 \uc0ac\ub791\ud560 \ub54c\uc5d4 \uaf2d \ud56d\uc0c1 \uacc1\uc5d0 \uba38\ubb3c\uba74\uc11c \ub298 \ud574\uc8fc\uace0 \uc2f6\uc740\uac8c \u2665","protected":false,"followers_count":43,"friends_count":29,"listed_count":0,"created_at":"Tue Jan 29 19:33:19 +0000 
2013","favourites_count":30,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":8432,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000027054169\/13aed74981ca3b3783a2cc9db0c467c8.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000027054169\/13aed74981ca3b3783a2cc9db0c467c8.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000152403379\/90e12f5d7a25bc521b4baf1a9e673367_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000152403379\/90e12f5d7a25bc521b4baf1a9e673367_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1131935138\/1374169125","profile_link_color":"1F6FBF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/2pdOqgwaSm","expanded_url":"http:\/\/ask.fm\/a\/5fd3pd3q","display_url":"ask.fm\/a\/5fd3pd3q","indices":[116,138]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754293760,"id_str":"365611242754293760","text":"RT @avonfearlessxo: you made an account to support justin not hate on others.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":212995582,"id_str":"212995582","name":"promise","screen_name":"avoncreature","location":"dorothy means everything","url":null,"description":"April 16th was the best day of my life.","protected":false,"followers_count":3501,"friends_count":2723,"listed_count":6,"created_at":"Sun Nov 07 17:25:19 +0000 2010","favourites_count":537,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":21809,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/874144711\/7f0ec665894e1bb7af3a18aae4bfd99f.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/874144711\/7f0ec665894e1bb7af3a18aae4bfd99f.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261818067\/b0e25b9faa38b6b1ff88f25fa2178531_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261818067\/b0e25b9faa38b6b1ff88f25fa2178531_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/212995582\/1375452824","profile_link_color":"8F7A8F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:05:42 +0000 2013","id":365609795337728002,"id_str":"365609795337728002","text":"you made an account to support justin not hate on others.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" 
rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":549829102,"id_str":"549829102","name":"summer","screen_name":"avonfearlessxo","location":"anaheim, california","url":null,"description":"june 24 floor 4 row 19 seat 7","protected":false,"followers_count":17942,"friends_count":16703,"listed_count":26,"created_at":"Tue Apr 10 05:11:21 +0000 2012","favourites_count":94,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":15354,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044555724\/88b031413ed3825018b6a5cb912d79ea.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044555724\/88b031413ed3825018b6a5cb912d79ea.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257851481\/e0bcffd0e2dba15ec12f04c57ef64762_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257851481\/e0bcffd0e2dba15ec12f04c57ef64762_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/549829102\/1375431420","profile_link_color":"0A0A0A","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"avonfearle
ssxo","name":"summer","id":549829102,"id_str":"549829102","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779459584,"id_str":"365611242779459584","text":"carambaaaaa","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":427872756,"id_str":"427872756","name":"G\u00ea \u2661 ","screen_name":"DeosaGrega","location":"\u270c","url":"http:\/\/instagram.com\/deosagrega","description":"Pode passar mil anos, voc\u00ea vai me amar e vai ser pra sempre meu! Eu te amo, idiota @GeeovaneBarros \u2661","protected":false,"followers_count":1120,"friends_count":358,"listed_count":2,"created_at":"Sun Dec 04 02:33:30 +0000 2011","favourites_count":1835,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":58723,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028510517\/1beadea19f780379566141bc82b7baf2.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028510517\/1beadea19f780379566141bc82b7baf2.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000199760994\/6980bf9dbf048d8c4bca8590d8845aec_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000199760994\/6980bf9dbf048d8c4bca8590d8845aec_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/427872756\/1375470139","profile_link_color":"0099B9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":n
ull,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242766864385,"id_str":"365611242766864385","text":"RT @oathkidrauhl: VOU AVALIAR EM : [ ]Legal\u2714 [ ]Perfeito\u2730 [ ] Diamond\u272a [ ]Humilha\u265b [ ]Passa a senha \u2665 TEM QUE ME SEGUIR, SDV, S\u00d3 PEDIR #RT \u2026","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":500972949,"id_str":"500972949","name":"ROAR - lucas ","screen_name":"P3rrylovato","location":"KP .DL.PARAMORE.SG ","url":null,"description":"California Dreams Tour","protected":false,"followers_count":388,"friends_count":1039,"listed_count":0,"created_at":"Thu Feb 23 17:20:42 +0000 
2012","favourites_count":64,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":4799,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043700387\/3232eda496097521416229506fadba74.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043700387\/3232eda496097521416229506fadba74.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000232813226\/955db240ed5478a4ccab53a38e0fbda5_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000232813226\/955db240ed5478a4ccab53a38e0fbda5_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/500972949\/1375561156","profile_link_color":"242020","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:55 +0000 2013","id":365611108402339841,"id_str":"365611108402339841","text":"VOU AVALIAR EM : [ ]Legal\u2714 [ ]Perfeito\u2730 [ ] Diamond\u272a [ ]Humilha\u265b [ ]Passa a senha \u2665 TEM QUE ME SEGUIR, SDV, S\u00d3 PEDIR #RT x8","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":419104575,"id_str":"419104575","name":"maluzete","screen_name":"oathkidrauhl","location":"erin and jennette m. 
follows \u2661","url":null,"description":"i do not have tickets to meet my idol","protected":false,"followers_count":5309,"friends_count":4950,"listed_count":2,"created_at":"Tue Nov 22 23:38:56 +0000 2011","favourites_count":2172,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":false,"verified":false,"statuses_count":55730,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045380130\/63b11fa12b75f1b7804ab08232a6c521.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045380130\/63b11fa12b75f1b7804ab08232a6c521.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247554062\/1972f1ccc16ea720e6e9165be60ffc27_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247554062\/1972f1ccc16ea720e6e9165be60ffc27_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/419104575\/1375404879","profile_link_color":"6C61C2","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FC8567","profile_text_color":"FB9B87","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[{"text":"RT","indices":[117,120]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[{"text":"RT","indices":[135,138]}],"urls":[],"user_mentions":[{"screen_name":"oathkidrauhl","name":"maluzete","id":419104575,"id_str":"419104575","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:27 +0000 
2013","id":365611242771062784,"id_str":"365611242771062784","text":"she said no but ima try again tomrrow","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":353364407,"id_str":"353364407","name":"nannas votaw\u2654","screen_name":"Nannas_97","location":"Africa ","url":null,"description":"follow me dawg","protected":false,"followers_count":199,"friends_count":138,"listed_count":0,"created_at":"Thu Aug 11 23:36:03 +0000 2011","favourites_count":1150,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":5709,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000031672913\/bb38470cf05dbcf334a25fdb5af66d7d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000031672913\/bb38470cf05dbcf334a25fdb5af66d7d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000127354218\/8178235ec5948b8fb9527bb67f47aa64_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000127354218\/8178235ec5948b8fb9527bb67f47aa64_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/353364407\/1374995448","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":{"id":"9b101e0451f073b6","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/9b101e0451f073b6.json","place_type":"city","name":"Antioch","full_name":"Antioch, CA","country_code":"US","country":"United 
States","bounding_box":{"type":"Polygon","coordinates":[[[-121.861171,37.93225],[-121.861171,38.029723],[-121.732346,38.029723],[-121.732346,37.93225]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242750083072,"id_str":"365611242750083072","text":"http:\/\/t.co\/ayobGSEcoI \u0441\u0435\u0431\u0435\u0441\u0442\u043e\u0438\u043c\u043e\u0441\u0442\u044c \u043f\u0435\u0440\u0438\u043e\u0434\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u0436\u0443\u0440\u043d\u0430\u043b\u0430 http:\/\/t.co\/EIadnHXk7A \u0436\u0443\u0440\u043d\u0430\u043b instyle \u0447\u0438\u0442\u0430\u0442\u044c","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":508215566,"id_str":"508215566","name":"alvaro veloso ","screen_name":"AlvaroReborn","location":"","url":null,"description":null,"protected":false,"followers_count":3,"friends_count":185,"listed_count":0,"created_at":"Wed Feb 29 01:48:23 +0000 
2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":131,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1861399127\/images__1__normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1861399127\/images__1__normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/ayobGSEcoI","expanded_url":"http:\/\/ana.acba.in.ua\/cat5\/torrent-73.html","display_url":"ana.acba.in.ua\/cat5\/torrent-7\u2026","indices":[0,22]},{"url":"http:\/\/t.co\/EIadnHXk7A","expanded_url":"http:\/\/ana.acba.in.ua\/cat5\/torrent-76.html","display_url":"ana.acba.in.ua\/cat5\/torrent-7\u2026","indices":[60,82]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ru"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758475778,"id_str":"365611242758475778","text":"Hate group chats on facebook, so irritating","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":120900766,"id_str":"120900766","name":"_daniellecrainey\u270c","screen_name":"DanielleCrainey","location":"","url":null,"description":"..","protected":false,"followers_count":127,"friends_count":127,"listed_count":0,"created_at":"Sun Mar 07 22:59:09 +0000 2010","favourites_count":84,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":409,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000237217604\/96d9ddffda012fe28c5fc2d6e331e440_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000237217604\/96d9ddffda012fe28c5fc2d6e331e440_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/120900766\/1374887798","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242766860288,"id_str":"365611242766860288","text":"RT @reIatabIe: Emojis make flirting A LOT easier. 
\ud83d\ude09\ud83d\ude0d\ud83d\ude18\ud83d\ude03\ud83d\ude0f","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":504808791,"id_str":"504808791","name":"Megan Fulton","screen_name":"MeganFulton2","location":"","url":null,"description":"Heyy","protected":false,"followers_count":121,"friends_count":135,"listed_count":0,"created_at":"Sun Feb 26 17:57:33 +0000 2012","favourites_count":3453,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":8409,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme18\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme18\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247810705\/d57e27dcdd70852fc7e95afdbc7f649a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247810705\/d57e27dcdd70852fc7e95afdbc7f649a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/504808791\/1375938119","profile_link_color":"038543","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:00:46 +0000 2013","id":365608553802760192,"id_str":"365608553802760192","text":"Emojis make flirting A LOT easier. 
\ud83d\ude09\ud83d\ude0d\ud83d\ude18\ud83d\ude03\ud83d\ude0f","source":"\u003ca href=\"http:\/\/bufferapp.com\" rel=\"nofollow\"\u003eBuffer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":437030761,"id_str":"437030761","name":"Girl Code","screen_name":"reIatabIe","location":"","url":null,"description":"Defining the GIRL CODE in 140 characters or less. *NOT affiliated with MTV's show Girl Code*","protected":false,"followers_count":963908,"friends_count":8,"listed_count":954,"created_at":"Wed Dec 14 21:45:48 +0000 2011","favourites_count":797,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":17387,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DEDEDE","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/344918034408982886\/adc0ee7cefa3b924eb95ad1eee0b6ca0.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/344918034408982886\/adc0ee7cefa3b924eb95ad1eee0b6ca0.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000037616396\/5130d76a8cf831deda7c7c2242de9277_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000037616396\/5130d76a8cf831deda7c7c2242de9277_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/437030761\/1371183367","profile_link_color":"6965A4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"0A0A0A","profile_text_color":"9C162C","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":517,"entities":{"hashtags":[],"urls":[],"user
_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"reIatabIe","name":"Girl Code","id":437030761,"id_str":"437030761","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762682368,"id_str":"365611242762682368","text":"@mcosdrugs #Avaliando\n( ) Punhetinha\n ( ) Gozo\n ( ) Orgasmo\n ( ) Masturba\u00e7\u00e3o\n (xxxxxx) Engravidei\nPs: Amei seu banner de vdd <3","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":1325393690,"in_reply_to_user_id_str":"1325393690","in_reply_to_screen_name":"mcosdrugs","user":{"id":749788620,"id_str":"749788620","name":"F\u00e3 do Olhudo o.O","screen_name":"peterboymagia","location":"Darrenatic \u2661 Little Monster","url":"http:\/\/premonicao-de-amor.tumblr.com\/","description":"F\u00e3 de um neg\u00e3o carrotudo problem ?","protected":false,"followers_count":1122,"friends_count":757,"listed_count":0,"created_at":"Fri Aug 10 18:04:31 +0000 
2012","favourites_count":1419,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":14535,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"F4EFF7","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042948120\/4cd5d7ce9f9460709607b5806fa123fd.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042948120\/4cd5d7ce9f9460709607b5806fa123fd.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000130820389\/50380306625f0a6000823a3e1f7082ca_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000130820389\/50380306625f0a6000823a3e1f7082ca_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/749788620\/1374971397","profile_link_color":"FF0000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"Avaliando","indices":[11,21]}],"urls":[],"user_mentions":[{"screen_name":"mcosdrugs","name":"Juh","id":1325393690,"id_str":"1325393690","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758471680,"id_str":"365611242758471680","text":"RT @EdwinMegadivo: - Weed \u2665 #NeyshaHere\u2665!","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1589079122,"id_str":"1589079122","name":" Amo My Bff Marcos 
\u221e","screen_name":"NeysaSantana1","location":"- Mameyera \u2665","url":null,"description":"\u00df\u0166\u0166 @El_Baron27 \u0442\u0454 \u03b1\u043c\u03c3 \u2665 @samuel4515 & Laly \r\n@flako_loko1 Loveee'\r\n\r\n\r\n \u2665","protected":false,"followers_count":378,"friends_count":283,"listed_count":1,"created_at":"Fri Jul 12 18:09:05 +0000 2013","favourites_count":533,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4632,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF0000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045910596\/38b298bb157e507f66791209458256f7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045910596\/38b298bb157e507f66791209458256f7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251067997\/8850f8d95f43b4b3b0792dbe892118b7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251067997\/8850f8d95f43b4b3b0792dbe892118b7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1589079122\/1375809627","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:42:20 +0000 2013","id":365603912683761665,"id_str":"365603912683761665","text":"- Weed \u2665 #NeyshaHere\u2665!","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":293850348,"id_str":"293850348","name":"\u2192 
E\u2202\u03c9\u03b9\u03b7 C\u03c3\u2113\u2113\u03b9\u03b7s \u2122","screen_name":"EdwinMegadivo","location":"| @iRollinz | @YaironGarcia |","url":"http:\/\/www.facebook.com\/EdwinMegadiivo","description":"\u2665__\u2665 [ Instagram: edwinmegadivo | | | WhatsApp: 829-561-6396 ] \u2665__\u2665 [ @Laly_Reynoso | Moncerrat \u2665 | @iPeloLindo @Wamder_garcia ]","protected":false,"followers_count":1002,"friends_count":530,"listed_count":0,"created_at":"Fri May 06 02:48:57 +0000 2011","favourites_count":1297,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":false,"verified":false,"statuses_count":20777,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043030277\/e843f98f7ada8467f9f958231484894c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043030277\/e843f98f7ada8467f9f958231484894c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256373481\/fc34e44d376554975118c800628c6e62_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256373481\/fc34e44d376554975118c800628c6e62_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/293850348\/1375901551","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[{"text":"NeyshaHere","indices":[9,20]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[{"text":"NeyshaHere","indices":[28,39]}],"urls":[],"user_ment
ions":[{"screen_name":"EdwinMegadivo","name":"\u2192 E\u2202\u03c9\u03b9\u03b7 C\u03c3\u2113\u2113\u03b9\u03b7s \u2122","id":293850348,"id_str":"293850348","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779443202,"id_str":"365611242779443202","text":"RT @DATbitchPookie: I'm hella hungry.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":820667276,"id_str":"820667276","name":"VIII.XXXI. XII\u2665","screen_name":"Arriona_LaShay","location":"#NorthSide St.Louis ","url":null,"description":"Im Flexin Bitch$ |Yea I Walk Like I Talk Like I Kno That Im The Sht | R.I.P Daddy&Lasia\u2020 Miss Yall | Follow my bae\u2661 @Luh_Skuuta&Follow me on IG @arriona_lashay\u270c","protected":false,"followers_count":347,"friends_count":370,"listed_count":1,"created_at":"Thu Sep 13 01:21:58 +0000 2012","favourites_count":75,"utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":4753,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045002953\/1d7e29961a15704bfd7fe31edc96db0f.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045002953\/1d7e29961a15704bfd7fe31edc96db0f.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3768925374\/57fd2e2734b108f769fa44fa3b15691c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3768925374\/57fd2e2734b108f769fa44fa3b15691c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/820667276\/1375576217","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:05:02 +0000 2013","id":365609626441494531,"id_str":"365609626441494531","text":"I'm hella hungry.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":400647360,"id_str":"400647360","name":"\u2764 Relatable \u2764","screen_name":"DATbitchPookie","location":"","url":null,"description":"#TEAM B A D A S S (S.I.P LILD) i LOVE TK4L! 
http:\/\/ask.fm\/PookieDot Ask Ya Guh Sum Questions Doogg !!!","protected":false,"followers_count":1190,"friends_count":598,"listed_count":0,"created_at":"Sat Oct 29 10:58:08 +0000 2011","favourites_count":6210,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":23827,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000228387015\/f1978384562633d84dbbc5d3c82e7a26_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000228387015\/f1978384562633d84dbbc5d3c82e7a26_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/400647360\/1375410657","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"sv"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"DATbitchPookie","name":"\u2764 Relatable \u2764","id":400647360,"id_str":"400647360","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"sv"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762670080,"id_str":"365611242762670080","text":"RT @PlainOldVince: I be peepin shit but ion say nun\ud83d\ude36. 
Game recognize game","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":321733829,"id_str":"321733829","name":"R.I.P Dexter Wright\u2764","screen_name":"Lexi_Tyriana","location":"Atlanta , Georgia \u2708","url":null,"description":"R.I.P Dexter Wright \u2764 Rest Easy. I love & miss you. Gone but never forgotten. Blessed.","protected":false,"followers_count":475,"friends_count":287,"listed_count":0,"created_at":"Wed Jun 22 01:46:57 +0000 2011","favourites_count":296,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":25806,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000163707607\/3e18b8ae86923733e1d768c226b4fa8d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000163707607\/3e18b8ae86923733e1d768c226b4fa8d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/321733829\/1372699698","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:07:22 +0000 2013","id":365610215598587906,"id_str":"365610215598587906","text":"I be peepin shit but ion say 
nun\ud83d\ude36. Game recognize game","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":339474451,"id_str":"339474451","name":"September 14th","screen_name":"PlainOldVince","location":"Texas","url":null,"description":"If they don't know your dreams, they can't shoot'em down. Rip PIMP C #Navy #CELTICNATION","protected":false,"followers_count":1036,"friends_count":645,"listed_count":5,"created_at":"Thu Jul 21 04:51:20 +0000 2011","favourites_count":605,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":40691,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EBEBEB","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/308544031\/len.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/308544031\/len.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251453115\/b02cfc1bc4956046ec0a8fb19ce89f1a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251453115\/b02cfc1bc4956046ec0a8fb19ce89f1a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/339474451\/1354179943","profile_link_color":"095875","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"2082B3","profile_text_color":"28236B","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":6,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"
},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"PlainOldVince","name":"September 14th","id":339474451,"id_str":"339474451","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762665984,"id_str":"365611242762665984","text":"RT @Batmankillerzoo: RTsi au grec t'a dit \"Salade tomates oignons jsuis dans le KETURE\" un serveur t'a forc\u00e9 a l\u00e9cher son torse","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":289891686,"id_str":"289891686","name":"Lorenzo ","screen_name":"LorenzoTnt","location":"Saint-Etienne ","url":null,"description":null,"protected":false,"followers_count":321,"friends_count":558,"listed_count":1,"created_at":"Fri Apr 29 11:53:36 +0000 
2011","favourites_count":654,"utc_offset":-9000,"time_zone":"Newfoundland","geo_enabled":true,"verified":false,"statuses_count":13554,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261638589\/a3af227c1dc4c5cf2befc0d9e9ff5eb5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261638589\/a3af227c1dc4c5cf2befc0d9e9ff5eb5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/289891686\/1375211050","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:42 +0000 2013","id":365611053888974849,"id_str":"365611053888974849","text":"RTsi au grec t'a dit \"Salade tomates oignons jsuis dans le KETURE\" un serveur t'a forc\u00e9 a l\u00e9cher son torse","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":622062521,"id_str":"622062521","name":"Sharingan ","screen_name":"Batmankillerzoo","location":"Lille","url":null,"description":"Je ne vous aime pas non plus, niquez vos m\u00e8res.","protected":false,"followers_count":15343,"friends_count":9528,"listed_count":5,"created_at":"Fri Jun 29 16:59:14 +0000 
2012","favourites_count":4343,"utc_offset":10800,"time_zone":"Athens","geo_enabled":true,"verified":false,"statuses_count":44250,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041123079\/360042cdc33ebb19b17e9c7a820265d9.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041123079\/360042cdc33ebb19b17e9c7a820265d9.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261746394\/100dce94053c233addf9135297d65339_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261746394\/100dce94053c233addf9135297d65339_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/622062521\/1375626702","profile_link_color":"C2C0C2","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":12,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"fr"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Batmankillerzoo","name":"Sharingan ","id":622062521,"id_str":"622062521","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762670081,"id_str":"365611242762670081","text":"@soledadbarragan Y las habra,te quiero<3","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610668914769921,"in_reply_to_status_id_str":"365610668914769921","in_reply_to_user_id":283075612,"in_reply_to_user_id_str":"283075612","in_reply_to_screen_name":"soledadbarragan","user":{"id":780402530,"id_str":"780402530","name":"Never stop smiling.","screen_name":"Aitanablanco_g","location":"","url":null,"description":"15.Soy como soy,no hay m\u00e1s.Es imposible ser fuerte a todas horas. Me ves por fuera,pero no por dentro.'Apesar de todo,sonr\u00ede'. 5,Octubre.","protected":false,"followers_count":405,"friends_count":404,"listed_count":0,"created_at":"Sat Aug 25 14:18:58 +0000 2012","favourites_count":1058,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":9834,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"16E0BE","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/802916267\/871b2defcc2f6021ad2df81b59ddb986.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/802916267\/871b2defcc2f6021ad2df81b59ddb986.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247589685\/07fa246570c8b505ea4db69c99de3107_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247589685\/07fa246570c8b505ea4db69c99de3107_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/780402530\/1375968663","profile_link_color":"1FCFCF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"soledadbarragan","name":"B
e Free","id":283075612,"id_str":"283075612","indices":[0,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754293761,"id_str":"365611242754293761","text":"@officialdioni zoals altijd #perfect<3 x","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365608867918381056,"in_reply_to_status_id_str":"365608867918381056","in_reply_to_user_id":243743226,"in_reply_to_user_id_str":"243743226","in_reply_to_screen_name":"officialdioni","user":{"id":974776226,"id_str":"974776226","name":" Jenn\u00a1fer\u2665","screen_name":"xxxjenniferrr","location":"Netherlands","url":null,"description":"\u25e62Va \u25e6 proud to be a directioner\/dionizer\/maniac\/nialler\/braviour\/mofo with Mo\u00efse\u2661 \u25e6 your way is #Up \u27b9","protected":false,"followers_count":130,"friends_count":80,"listed_count":0,"created_at":"Tue Nov 27 21:39:23 +0000 
2012","favourites_count":538,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":2559,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"2DFCB4","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/859021711\/ab4d6c00fba41134ad947fb48d0039f1.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/859021711\/ab4d6c00fba41134ad947fb48d0039f1.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000043364934\/3d8a968cfb88b55b5b6f94148415f1fd_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000043364934\/3d8a968cfb88b55b5b6f94148415f1fd_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/974776226\/1374177909","profile_link_color":"E61573","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"perfect","indices":[28,36]}],"urls":[],"user_mentions":[{"screen_name":"officialdioni","name":"Dioni Jurado-Gomez","id":243743226,"id_str":"243743226","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242750087168,"id_str":"365611242750087168","text":"RT @PatriciaGledis: Asik dg ninot, nd sepi :( @Lavenia_ninot: Rame banget syg:p\"PatriciaGledis: Wah, rame paling :) @Lavenia_ninot: Npa jal\u2026","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":285615650,"id_str":"285615650","name":"Lavenia N Kaunang","screen_name":"Lavenia_ninot","location":"Indonesia,manado (kolongan)","url":null,"description":"Jesus,Your always in my heart and in my mind......:)\/\/bless my family,friend and people that I love..and in the end you can call me nia or ninot...;)","protected":false,"followers_count":226,"friends_count":164,"listed_count":1,"created_at":"Thu Apr 21 13:43:52 +0000 2011","favourites_count":44,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":true,"verified":false,"statuses_count":3490,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/745558284\/0fff87ed5e0b9ec3d32b0b17e30aeef1.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/745558284\/0fff87ed5e0b9ec3d32b0b17e30aeef1.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000174770662\/6d7796b2c68c68c7c76deefee3c84bc3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000174770662\/6d7796b2c68c68c7c76deefee3c84bc3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/285615650\/1374297990","profile_link_color":"FF00F7","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:28 +0000 
2013","id":365610996053716993,"id_str":"365610996053716993","text":"Asik dg ninot, nd sepi :( @Lavenia_ninot: Rame banget syg:p\"PatriciaGledis: Wah, rame paling :) @Lavenia_ninot: Npa jalan Sehat syg:)","source":"\u003ca href=\"http:\/\/www.sheilagank.com\" rel=\"nofollow\"\u003eTw\u0456tter for She\u0456laGank\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":77944497,"id_str":"77944497","name":"Gledis Aruperes","screen_name":"PatriciaGledis","location":"","url":null,"description":null,"protected":false,"followers_count":499,"friends_count":420,"listed_count":0,"created_at":"Mon Sep 28 06:35:01 +0000 2009","favourites_count":12,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":17701,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/888356092\/24a401202bc9bb7c1a330f59763b916a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/888356092\/24a401202bc9bb7c1a330f59763b916a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000127560944\/197b3e5e9615af3973b7c95992b5753f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000127560944\/197b3e5e9615af3973b7c95992b5753f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/77944497\/1373669454","profile_link_color":"0099B9","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"co
ntributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Lavenia_ninot","name":"Lavenia N Kaunang","id":285615650,"id_str":"285615650","indices":[26,40]},{"screen_name":"Lavenia_ninot","name":"Lavenia N Kaunang","id":285615650,"id_str":"285615650","indices":[96,110]}]},"favorited":false,"retweeted":false,"lang":"id"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"PatriciaGledis","name":"Gledis Aruperes","id":77944497,"id_str":"77944497","indices":[3,18]},{"screen_name":"Lavenia_ninot","name":"Lavenia N Kaunang","id":285615650,"id_str":"285615650","indices":[46,60]},{"screen_name":"Lavenia_ninot","name":"Lavenia N Kaunang","id":285615650,"id_str":"285615650","indices":[116,130]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242783653888,"id_str":"365611242783653888","text":"@maronmahomaho \u30d5\u30a9\u30ed\u30fc\u3042\u308a\u304c\u3068\u3046\u3054\u3056\u3044\u307e\u3059\uff01\u305c\u3072\u305c\u3072\u7d61\u3093\u3067\u304f\u3060\u3055\u3044\uff01","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":1322715565,"in_reply_to_user_id_str":"1322715565","in_reply_to_screen_name":"maronmahomaho","user":{"id":1147634898,"id_str":"1147634898","name":"\u5927\u30d0\u30ab\u4e0b\u4e4b\u4ecb","screen_name":"cfvntt689","location":"","url":null,"description":"\u540d\u63a2\u5075\u30b3\u30ca\u30f3\u304c\u5927\u597d\u304d\u904e\u304e\u3066\u9042\u306b\u5c02\u30a2\u30ab\u3092\u3064\u304f\u3063\u3066\u3057\u307e\u3044\u307e\u3057\u305f\u3002CFC\u4f1a\u54e1No.1340\u3002\u30b3\u30ca\u30af\u30e9\u306e\u7686\u3055\u3093\u3068\u8a9e\u308a\u305f\u3044\u3067\u3059\uff01\u30d5\u30a9\u30ed\u30fc\u3088\u308d\u3057\u304f\u304a\u9858\u3044\u3057\u307e\u3059\uff01\uff08\u57fa\u672c\u7684\u672c\u30a2\u30ab@pfvmkk67\u306b\u3044\u308b\u306e\u3067\u30c4\u30a4\u30fc\u30c8\u3059\u304f\u306a\u3044\u3067\u3059\u304c\uff09","protected":false,"followers_count":108,"friends_count":127,"listed_count":1,"created_at":"Mon Feb 04 10:44:15 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":413,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000160517584\/a121216166a33cfae6f90d8c6510b17a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000160517584\/a121216166a33cfae6f90d8c6510b17a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1147634898\/1368100041","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"maronmahomaho","name":"\u307e\u308d\u3055\u3093","id":1322715565,"id_str":"1322715565","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754277376,"id_str":"365611242754277376","text":"RT @EUPHORICNOUlS: READ THIS OK, IM ACTUALLY GONNA DO THIS JUST READ OK AND RETWEET IF YOU WANNA BE ON IT http:\/\/t.co\/QJ7qXfoXzF","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":447132362,"id_str":"447132362","name":"NIALL 
PLS ILY","screen_name":"imacrazymofo_x","location":"drowned in niall's eyes!","url":null,"description":"Being a Directioner doesn't mean to have 12.000 Posters and more. Being a Directioner means to love & support 1D \/ i'm so proud of @onedirection\/ 01\/05\/13 \u2665","protected":false,"followers_count":1025,"friends_count":751,"listed_count":0,"created_at":"Mon Dec 26 14:58:21 +0000 2011","favourites_count":358,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":13870,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"EBEBEB","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/859214624\/dd8b4a5f57866da19af781ae99ad287c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/859214624\/dd8b4a5f57866da19af781ae99ad287c.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000188113482\/6ecfd0e0d1bf75cfcf2c88f6f70b1c88_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000188113482\/6ecfd0e0d1bf75cfcf2c88f6f70b1c88_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/447132362\/1364169045","profile_link_color":"990000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 21:52:24 +0000 2013","id":365591347236831232,"id_str":"365591347236831232","text":"READ THIS OK, IM ACTUALLY GONNA DO THIS JUST READ OK AND RETWEET IF YOU WANNA BE ON IT http:\/\/t.co\/QJ7qXfoXzF","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":433794109,"id_str":"433794109","name":"","screen_name":"EUPHORICNOUlS","location":"","url":null,"description":"\u2661\u2661BOYS AND THEIR TOYS, LIAMS 10 INCH ROCKET\u2661\u2661","protected":false,"followers_count":3375,"friends_count":3160,"listed_count":12,"created_at":"Sun Dec 11 01:22:16 +0000 2011","favourites_count":395,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":6762,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000011446150\/25e644f86fe9ba3c6b8cb3305b92b9e7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000011446150\/25e644f86fe9ba3c6b8cb3305b92b9e7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000246938589\/05649c738f2b108ad6a21e5a309c0d04_normal.gif","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000246938589\/05649c738f2b108ad6a21e5a309c0d04_normal.gif","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/433794109\/1375489695","profile_link_color":"115C16","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":172,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365591347241025537,"id_str":"365591347241025537","indices":[87,109],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLXZLFCAAExDvw.jpg","med
ia_url_https":"https:\/\/pbs.twimg.com\/media\/BRLXZLFCAAExDvw.jpg","url":"http:\/\/t.co\/QJ7qXfoXzF","display_url":"pic.twitter.com\/QJ7qXfoXzF","expanded_url":"http:\/\/twitter.com\/EUPHORICNOUlS\/status\/365591347236831232\/photo\/1","type":"photo","sizes":{"medium":{"w":600,"h":613,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":623,"h":637,"resize":"fit"},"small":{"w":340,"h":348,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"EUPHORICNOUlS","name":"","id":433794109,"id_str":"433794109","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754277377,"id_str":"365611242754277377","text":"@n_napple \u306c\u3093\u306c\u3093\uff08\uff3e\u03c9\uff3e\uff09","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365510198372220928,"in_reply_to_status_id_str":"365510198372220928","in_reply_to_user_id":1588081483,"in_reply_to_user_id_str":"1588081483","in_reply_to_screen_name":"n_napple","user":{"id":1226869776,"id_str":"1226869776","name":"\u3042\u307e\u305b@\u56db\u5929\u7acb\u6d77\u5ec3","screen_name":"kaxm0112","location":"\u6728\u30ce\u702c\u3055\u3093\u5bb6","url":"http:\/\/twpf.jp\/kaxm0112","description":"\u8d85\u6b21\u5143\u306b\u30c6\u30cb\u30b9\u3084\u30b5\u30c3\u30ab\u30fc\u3059\u308b\u4e2d\u5b66\u751f\u3084\u3089\u6b7b\u795e\u3084\u3089\u3082\u308d\u3082\u308d\u5927\u597d\u304d\u3067\u30cf\u30b2\u305d\u3046\u306a\u6d6e\u6c17\u6027\u3002\u57fa\u672c\u30b9\u30bf\u30f3\u30b9\u306f\u91cd\u5ea6\u306e\u30c6\u30cb\u30af\u30e9\u3001\u4e09\u6b21\u3067\u306f\u6850\u5c71\u6f23\u3068\u4f50\u85e4\u6c38\u5178\u306b\u304a\u71b1\u3002 \u30a2\u30a4\u30b3\u30f3\u306f\u307d\u3093(@pon_kiyu 
)\u3088\u308a\u3002THANK YOU!!!","protected":false,"followers_count":56,"friends_count":95,"listed_count":1,"created_at":"Thu Feb 28 06:38:10 +0000 2013","favourites_count":312,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":5552,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3713495072\/c6b4f14c3089e465a87213cbc16447f8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3713495072\/c6b4f14c3089e465a87213cbc16447f8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1226869776\/1369525789","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"n_napple","name":"\u306a\u3063\u3077\u308b","id":1588081483,"id_str":"1588081483","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762670082,"id_str":"365611242762670082","text":"\"Kangta surprisingly reveals that he visits a hair loss clinic\" ...","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":180752395,"id_str":"180752395","name":"*turn it 
up*","screen_name":"INFTDESTINY","location":"","url":"http:\/\/jiminsnoona.tumblr.com\/","description":"Jessica | 18yo | KPOPPER \u270c- DB5K \u2261 INFINITE \u2261 BTS \u2261 C-REAL \u2261 EXO \u2261 B.A.P \u2261 VIXX \u2261 CROSSGENE \u2261 my english sucks~","protected":false,"followers_count":288,"friends_count":1376,"listed_count":2,"created_at":"Fri Aug 20 11:37:01 +0000 2010","favourites_count":396,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":true,"verified":false,"statuses_count":59776,"lang":"it","contributors_enabled":false,"is_translator":false,"profile_background_color":"C1FDC5","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000034064334\/5f3a325aa3c1e8fd219a4f749f45746b.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000034064334\/5f3a325aa3c1e8fd219a4f749f45746b.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000192541205\/32d56b279e4ca172fcce6a72f0e257a8_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000192541205\/32d56b279e4ca172fcce6a72f0e257a8_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/180752395\/1374866620","profile_link_color":"1AB529","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"A4E97D","profile_text_color":"B9F88E","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242787831808,"id_str":"365611242787831808","text":"RT @realmadridarabi: \u0643\u0631\u064a\u0633\u062a\u064a\u0627\u0646\u0648 
\u0631\u0648\u0646\u0627\u0644\u062f\u0648 \u0642\u062f \u0623\u0643\u0645\u0644 \u0644\u0642\u0627\u0621 \u0627\u0644\u0628\u0627\u0631\u062d\u0629 \u0636\u062f \u062a\u0634\u064a\u0644\u0633\u064a \u0648\u0642\u062f\u0645\u0647 \u062a\u0646\u0632\u0641 \u062f\u0645\u0627\u064b \u060c \u0628\u0639\u062f \u0627\u0644\u062a\u062f\u062e\u0644\u0627\u062a \u0627\u0644\u0639\u0646\u064a\u0641\u0629 \u0645\u0646 \u0644\u0627\u0639\u0628\u064a \u0627\u0644\u062e\u0635\u0645 ! http:\/\/t.co\/8\u2026","source":"\u003ca href=\"http:\/\/www.twitter.com\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":598063437,"id_str":"598063437","name":"\u0627\u0646\u0633 \u062c\u0646\u0628\u064a ","screen_name":"a721253242","location":"","url":null,"description":null,"protected":false,"followers_count":7,"friends_count":25,"listed_count":0,"created_at":"Sun Jun 03 03:38:31 +0000 
2012","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":9,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000171686474\/01b3ac3dede5364aa01037797e54926f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000171686474\/01b3ac3dede5364aa01037797e54926f_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 17:46:21 +0000 2013","id":365529426345205760,"id_str":"365529426345205760","text":"\u0643\u0631\u064a\u0633\u062a\u064a\u0627\u0646\u0648 \u0631\u0648\u0646\u0627\u0644\u062f\u0648 \u0642\u062f \u0623\u0643\u0645\u0644 \u0644\u0642\u0627\u0621 \u0627\u0644\u0628\u0627\u0631\u062d\u0629 \u0636\u062f \u062a\u0634\u064a\u0644\u0633\u064a \u0648\u0642\u062f\u0645\u0647 \u062a\u0646\u0632\u0641 \u062f\u0645\u0627\u064b \u060c \u0628\u0639\u062f \u0627\u0644\u062a\u062f\u062e\u0644\u0627\u062a \u0627\u0644\u0639\u0646\u064a\u0641\u0629 \u0645\u0646 \u0644\u0627\u0639\u0628\u064a \u0627\u0644\u062e\u0635\u0645 ! 
http:\/\/t.co\/8FuQTTRehq","source":"\u003ca href=\"http:\/\/tapbots.com\/tweetbot\" rel=\"nofollow\"\u003eTweetbot for iOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":344699554,"id_str":"344699554","name":"\u0634\u0628\u0643\u0629 \u0631\u064a\u0627\u0644 \u0645\u062f\u0631\u064a\u062f","screen_name":"realmadridarabi","location":"","url":"http:\/\/www.alrealclub.com","description":"\u0643\u0644 \u0645\u0627 \u064a\u062e\u0635 \u0631\u064a\u0627\u0644 \u0645\u062f\u0631\u064a\u062f \u0641\u064a \u062a\u063a\u0637\u064a\u0629 \u0645\u0633\u062a\u0645\u0631\u0629 \u0639\u0644\u0649 \u0645\u062f\u0627\u0631 \u0627\u0644\u0633\u0627\u0639\u0629 \u0623\u062e\u0628\u0627\u0631 \u0631\u064a\u0627\u0644 \u0645\u062f\u0631\u064a\u062f \u0623\u0648\u0644\u0627\u064b \u0628\u0623\u0648\u0644 - \u0627\u0647\u062f\u0627\u0641 \u0627\u0644\u0645\u0628\u0627\u0631\u064a\u0627\u062a - \u0635\u0648\u0631 - \u0641\u064a\u062f\u064a\u0648\u0647\u0627\u062a - \u0645\u0648\u0627\u0639\u064a\u062f \u0627\u0644\u0645\u0628\u0627\u0631\u064a\u0627\u062a - \u0627\u0644\u0645\u0624\u062a\u0645\u0631\u0627\u062a \u0627\u0644\u0635\u062d\u0641\u064a\u0629 .","protected":false,"followers_count":52097,"friends_count":2,"listed_count":250,"created_at":"Fri Jul 29 13:11:36 +0000 
2011","favourites_count":0,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":11633,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1C1C","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000034397811\/2fccf929bdde985c577e98c3f7701019.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000034397811\/2fccf929bdde985c577e98c3f7701019.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1566476632\/112211_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1566476632\/112211_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/344699554\/1374025231","profile_link_color":"252DBA","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":104,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365529426349400066,"id_str":"365529426349400066","indices":[105,127],"media_url":"http:\/\/pbs.twimg.com\/media\/BRKfE5yCIAI0COE.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRKfE5yCIAI0COE.jpg","url":"http:\/\/t.co\/8FuQTTRehq","display_url":"pic.twitter.com\/8FuQTTRehq","expanded_url":"http:\/\/twitter.com\/realmadridarabi\/status\/365529426345205760\/photo\/1","type":"photo","sizes":{"medium":{"w":598,"h":314,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":598,"h":314,"resize":"fit"},"small":{"w":340,"h":179,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"ar"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_na
me":"realmadridarabi","name":"\u0634\u0628\u0643\u0629 \u0631\u064a\u0627\u0644 \u0645\u062f\u0631\u064a\u062f","id":344699554,"id_str":"344699554","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242783653889,"id_str":"365611242783653889","text":"Oops","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":450966232,"id_str":"450966232","name":"vegas","screen_name":"vegasgrace13","location":"","url":null,"description":"yeah my real name is Vegas","protected":false,"followers_count":415,"friends_count":257,"listed_count":1,"created_at":"Fri Dec 30 21:48:51 +0000 2011","favourites_count":4210,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":false,"verified":false,"statuses_count":8294,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme11\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme11\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258923885\/2ba7cd3957d69522b289c6729a0d1688_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258923885\/2ba7cd3957d69522b289c6729a0d1688_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/450966232\/1373082797","profile_link_color":"B40B43","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notification
s":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779455488,"id_str":"365611242779455488","text":"@jccaylen #JCCAYLENFOLLOWSPREE IF YOU FOLLOW ME THEN ILL BUY YOU AS MUCH TACO BELL AS A BOY CAN EAT LETS GO :)58","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":139539038,"in_reply_to_user_id_str":"139539038","in_reply_to_screen_name":"jccaylen","user":{"id":288257523,"id_str":"288257523","name":"PLEASE JC\u2661","screen_name":"beausillusion","location":"5\/5 & ariana","url":null,"description":"my boyfriend likes batteries | 25\/05\/13 |","protected":false,"followers_count":9483,"friends_count":8313,"listed_count":12,"created_at":"Tue Apr 26 14:40:09 +0000 2011","favourites_count":73,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":19832,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043575514\/8fbbc12076a9455ef50e1139d4ac56e7.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043575514\/8fbbc12076a9455ef50e1139d4ac56e7.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257121138\/bfb9b4c423d473229f92ec13bc04a826_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257121138\/bfb9b4c423d473229f92ec13bc04a826_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/288257523\/1375914355","profile_link_color":"FCC0EF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C736B4","profile_text_color":"1BC9E0","profile
_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"JCCAYLENFOLLOWSPREE","indices":[10,30]}],"urls":[],"user_mentions":[{"screen_name":"jccaylen","name":"\u2601Jc Caylen","id":139539038,"id_str":"139539038","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779459585,"id_str":"365611242779459585","text":"@FumatoS @annajmaussa rio das ostras , foi encontrar com as amigas piranhas dela","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611068736811008,"in_reply_to_status_id_str":"365611068736811008","in_reply_to_user_id":150440039,"in_reply_to_user_id_str":"150440039","in_reply_to_screen_name":"FumatoS","user":{"id":389304040,"id_str":"389304040","name":"Guines","screen_name":"RigasAgnes","location":"","url":null,"description":"Crescer e para todos. 
Amadurecer e pra poucos...","protected":false,"followers_count":87,"friends_count":87,"listed_count":0,"created_at":"Wed Oct 12 05:47:34 +0000 2011","favourites_count":132,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1949,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000178708790\/aa517605096155d704b7391eef09f7c2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000178708790\/aa517605096155d704b7391eef09f7c2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/389304040\/1374631281","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"FumatoS","name":"Renato Santos","id":150440039,"id_str":"150440039","indices":[0,8]},{"screen_name":"annajmaussa","name":"Anna Julia","id":974930791,"id_str":"974930791","indices":[9,21]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242771070976,"id_str":"365611242771070976","text":"duas idiotas akjhdjskh","source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eMobile Web 
(M2)\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":460653410,"id_str":"460653410","name":"demetria","screen_name":"devonneincase","location":"","url":null,"description":"breathing in snowflakes","protected":false,"followers_count":4020,"friends_count":3812,"listed_count":2,"created_at":"Wed Jan 11 00:12:47 +0000 2012","favourites_count":290,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":true,"verified":false,"statuses_count":57968,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042731236\/bdcfffab1e9acaad98dcc0b8dc24886a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042731236\/bdcfffab1e9acaad98dcc0b8dc24886a.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261597596\/4ded9fc26b8b5d1d7836ed95ee031f37_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261597596\/4ded9fc26b8b5d1d7836ed95ee031f37_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/460653410\/1375994635","profile_link_color":"877D7F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762678272,"id_str":"365611242762678272","text":"@BerkAkben \u0130yi 
g\u00fcld\u00fcm lan :D","source":"web","truncated":false,"in_reply_to_status_id":365580809169666048,"in_reply_to_status_id_str":"365580809169666048","in_reply_to_user_id":757275870,"in_reply_to_user_id_str":"757275870","in_reply_to_screen_name":"BerkAkben","user":{"id":1591864874,"id_str":"1591864874","name":"Mansur Baydemir","screen_name":"mnsria","location":"Kahramanmara\u015f","url":null,"description":"Ot var m\u0131 ede ? \/ Ot sar hele ! \/ Ot bitti.","protected":false,"followers_count":77,"friends_count":104,"listed_count":0,"created_at":"Sat Jul 13 21:22:17 +0000 2013","favourites_count":170,"utc_offset":10800,"time_zone":"Istanbul","geo_enabled":false,"verified":false,"statuses_count":337,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000032636904\/2e339e5d7781329ee2b297f8cf1856a3.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000032636904\/2e339e5d7781329ee2b297f8cf1856a3.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000181978347\/14223ebb6c60b934a13080dd085f94eb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000181978347\/14223ebb6c60b934a13080dd085f94eb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1591864874\/1374722859","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"BerkAkben","name":"Daddy Naber 
?","id":757275870,"id_str":"757275870","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754281475,"id_str":"365611242754281475","text":"I wish I could have stopped by my uncles grave while I was in Texas.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":281766012,"id_str":"281766012","name":"\u24dc\u24d0\u24d6\u24d6\u24d8\u24d4 \u2693","screen_name":"MaggieeeBethh","location":"Pineville","url":null,"description":"\u0192\u03c3\u2113\u2113\u03c3\u03c9","protected":false,"followers_count":560,"friends_count":820,"listed_count":1,"created_at":"Wed Apr 13 22:55:24 +0000 2011","favourites_count":3927,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":12438,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFBB7","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/624392701\/x732ab0c971b8931fbeb818e020732bc.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/624392701\/x732ab0c971b8931fbeb818e020732bc.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000103742284\/f5994caae34842da391a64c8c804f522_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000103742284\/f5994caae34842da391a64c8c804f522_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/281766012\/1373417252","profile_link_color":"A6F6AF","profile_sidebar_border_color":"4F2958","profile_sidebar_fill_color":"5B7C8D","profile_text_color":"66B6AB","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[32.67536863,-96.17365020]},"coordinates":{"type":"Point","coordinates":[-96.17365020,32.67536863]},"place":{"id":"e0060cda70f5f341","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/e0060cda70f5f341.json","place_type":"admin","name":"Texas","full_name":"Texas, US","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-106.645646,25.837163999999998],[-106.645646,36.500704],[-93.508039,36.500704],[-93.508039,25.837163999999998]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242750095360,"id_str":"365611242750095360","text":"@iSoyDiana quedaste en la tarde?:o","source":"\u003ca 
href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365607428156440576,"in_reply_to_status_id_str":"365607428156440576","in_reply_to_user_id":353797677,"in_reply_to_user_id_str":"353797677","in_reply_to_screen_name":"iSoyDiana","user":{"id":288028631,"id_str":"288028631","name":"Yuridia\u025e","screen_name":"Yuridiagrimaldo","location":"","url":null,"description":"Secretos de mi memoria. \r\nINSTAGRAM: @yuridiagr \u2764","protected":false,"followers_count":264,"friends_count":253,"listed_count":0,"created_at":"Tue Apr 26 03:30:29 +0000 2011","favourites_count":125,"utc_offset":-18000,"time_zone":"Mexico City","geo_enabled":true,"verified":false,"statuses_count":2939,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DE4760","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047411934\/319ed256ff5116491e2ac96f1d070511.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047411934\/319ed256ff5116491e2ac96f1d070511.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000253428892\/ad964d330c0d16de68072e1b82896767_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000253428892\/ad964d330c0d16de68072e1b82896767_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/288028631\/1375725943","profile_link_color":"699900","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"iSoyDiana","name"
:"Demi","id":353797677,"id_str":"353797677","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779447296,"id_str":"365611242779447296","text":"A pleasent find in the waiting room:) #BreakingBad y u make me so anxious?!?! #jessepinkman\u2026 http:\/\/t.co\/ubybHLBEbb","source":"\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":378423797,"id_str":"378423797","name":"Keemberly Masicampo ","screen_name":"kcher805","location":"Oxnard","url":null,"description":"instagram: @11181989_keem","protected":false,"followers_count":5,"friends_count":20,"listed_count":0,"created_at":"Fri Sep 23 05:28:16 +0000 2011","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":63,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3784551185\/14221aaa0aa124df3024155a9a2f09a6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3784551185\/14221aaa0aa124df3024155a9a2f09a6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/378423797\/1370930407","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordin
ates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"BreakingBad","indices":[38,50]},{"text":"jessepinkman","indices":[78,91]}],"urls":[{"url":"http:\/\/t.co\/ubybHLBEbb","expanded_url":"http:\/\/instagram.com\/p\/cxSPWkJ1r7\/","display_url":"instagram.com\/p\/cxSPWkJ1r7\/","indices":[93,115]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242787848192,"id_str":"365611242787848192","text":"@WidyaNurulAsmar Iy deh lu mah pinter","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365427470054338560,"in_reply_to_status_id_str":"365427470054338560","in_reply_to_user_id":1046516816,"in_reply_to_user_id_str":"1046516816","in_reply_to_screen_name":"WidyaNurulAsmar","user":{"id":1535931668,"id_str":"1535931668","name":"\u2660 A F N A N \u2660","screen_name":"j_afnan2","location":"Purwakarta","url":null,"description":"persija I love \u2665","protected":false,"followers_count":18,"friends_count":18,"listed_count":0,"created_at":"Fri Jun 21 07:03:04 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":56,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"022330","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme15\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000220198160\/dd0f516c32ba7844c93a75cb9bb1acd5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000220198160\/dd0f516c32ba7844c93a75cb9bb1acd5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1535931668\/1372295262","profile_link_color":"0084B4","profile_sidebar_border_color":"A8C7F7","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"WidyaNurulAsmar","name":"Widya Nurul Asmarani","id":1046516816,"id_str":"1046516816","indices":[0,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242775252992,"id_str":"365611242775252992","text":"\u98df\u3079\u3089\u308c\u308b\u30de\u30a4\u30af\uff08\u96a0\u8a9e\uff09 #etv","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":225006412,"id_str":"225006412","name":"\u304b\u308b\u3073","screen_name":"kaluvin","location":"\u306f\u3070\u305f\u304d\u5e02","url":null,"description":"\u3068\u304d\u30e1\u30e2GS\uff13(\u5148\u8f29\u30ba\u6fc0\u840c\u3048)GS2(\u30e6\u30ad\u597d\u304d\u3059\u304e\u3066\u3069\u3046\u3057\u305f\u3089\u3044\u3044\u304b\u308f\u304b\u3089\u306a\u3044)\u3001\uff25\u30c6\u30ec\u3001\u91ce\u7403(\u4e3b\u306b\u30aa\u30ea\u30c3\u30af\u30b9\u3068\u962a\u795e)\u3001\u76f8\u68d2(\u30e9\u30e0\u30cd\u3001\u30a4\u30bf\u30df\u30f3\u597d\u304d)\u3001\u65e5\u5e38\u306e\u3069\u3046\u3067\u3082\u3044\u3044\u3053\u3068\u306a\u3069\u3064\u3076\u3084\u3044\u3066\u304a\u308a\u307e\u3059\u3002\u4e0b\u54c1\u306a\u3053\u3068\u3082\u30d0\u30f3\u30d0\u30f3\u8a00\u3063\u3061\u3083\u3046\u3088\u3002","protected":false,"followers_count":61,"friends_count":80,"listed_count":4,"created_at":"Fri Dec 10 12:22:34 +0000 
2010","favourites_count":23,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":true,"verified":false,"statuses_count":6240,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3681391397\/a40cb94966040ff495af0b901a2ef79e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3681391397\/a40cb94966040ff495af0b901a2ef79e_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"etv","indices":[13,17]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242771066880,"id_str":"365611242771066880","text":"RT @CharlieOiAy: \"Was it something I said?!\" - Man recently thrown out of a Mute group.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":805887530,"id_str":"805887530","name":"\u2022\u0391lia\u2022","screen_name":"GallifreyPotter","location":"Spamalot ","url":null,"description":"Greek.Mexican.Native American.French Irish.and Welsh. Good luck will rub off when I shakes 'ands with you! 
I respect Susan and his life choices. Je suis LOSER","protected":false,"followers_count":450,"friends_count":929,"listed_count":1,"created_at":"Thu Sep 06 02:37:51 +0000 2012","favourites_count":7281,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":9943,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"001329","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/711093832\/bc41ac88b8164a29815ccea4a0315634.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/711093832\/bc41ac88b8164a29815ccea4a0315634.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000112080701\/c922a33c56bc77d98ac73576d26aa53b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000112080701\/c922a33c56bc77d98ac73576d26aa53b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/805887530\/1376003449","profile_link_color":"004358","profile_sidebar_border_color":"F7B565","profile_sidebar_fill_color":"000B17","profile_text_color":"448668","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:11:05 +0000 2013","id":365611147824607235,"id_str":"365611147824607235","text":"\"Was it something I said?!\" - Man recently thrown out of a Mute group.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":68278282,"id_str":"68278282","name":"Obscure Sex 
Poet","screen_name":"CharlieOiAy","location":"Twin Peaks- Universe 1610","url":"http:\/\/www.facebook.com\/CharlieAshby1996","description":"Just a Sci-Fi fan babbling about life. #TESD fan. Oh, and the owls are not what they seem.","protected":false,"followers_count":866,"friends_count":1869,"listed_count":9,"created_at":"Mon Aug 24 00:32:15 +0000 2009","favourites_count":16331,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":67804,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"9AE4E8","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/887383916\/63c201cf568c5bf7c3e2e2860fd14508.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/887383916\/63c201cf568c5bf7c3e2e2860fd14508.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000181314053\/8be2b85ca1250775d49841561a3efcf7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000181314053\/8be2b85ca1250775d49841561a3efcf7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/68278282\/1375493032","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDFFCC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[52.27011163,-0.74770467]},"coordinates":{"type":"Point","coordinates":[-0.74770467,52.27011163]},"place":{"id":"368dfc439fcaa041","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/368dfc439fcaa041.json","place_type":"city","name":"Wellingborough","full_name":"Wellingborough, Northamptonshire","country_code":"GB","country":"United 
Kingdom","bounding_box":{"type":"Polygon","coordinates":[[[-0.817963,52.191568999999994],[-0.817963,52.36437],[-0.610544,52.36437],[-0.610544,52.191568999999994]]]},"attributes":{}},"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"CharlieOiAy","name":"Obscure Sex Poet","id":68278282,"id_str":"68278282","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242787831810,"id_str":"365611242787831810","text":"''RT@revistaporti \u00a1Logan Lerman est\u00e1 guap\u00edsimo! #PercyJackson #percyjacksonenmexico http:\/\/t.co\/m6V7ozJmNl'' el es P E R F E C T O<3","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":107266606,"id_str":"107266606","name":"WelcomeToMexicoLogan","screen_name":"EmsWatsonn","location":"M\u00e9xico\/London","url":"http:\/\/thebigsevenalways.tumblr.com\/","description":"Logan is My \u2112\u2134\u0475\u212f \u2764 Ems Is My Princess\u2655 #WatsonLover\u221e#Lermaniac PoooooottttttteeeerrrrHeaaaddss!!!!!!!!!Always\u03df #Wallflower\u273f#Demigod\u03a8 : )","protected":false,"followers_count":1752,"friends_count":1759,"listed_count":4,"created_at":"Fri Jan 22 01:18:33 +0000 
2010","favourites_count":2176,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":17341,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"0B7866","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000018481723\/6a5875c17b9116b3e8a35cb992f5733b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000018481723\/6a5875c17b9116b3e8a35cb992f5733b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000200244532\/e7f678fce83eb3077d76e352ea554fa5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000200244532\/e7f678fce83eb3077d76e352ea554fa5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/107266606\/1373260787","profile_link_color":"8A0E2D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"A6A6A6","profile_text_color":"09D5F0","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"PercyJackson","indices":[48,61]},{"text":"percyjacksonenmexico","indices":[62,83]}],"urls":[],"user_mentions":[{"screen_name":"revistaporti","name":"Revista Por 
Ti","id":54943253,"id_str":"54943253","indices":[4,17]}],"media":[{"id":365500317166616577,"id_str":"365500317166616577","indices":[84,106],"media_url":"http:\/\/pbs.twimg.com\/media\/BRKEmhoCQAE35VA.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRKEmhoCQAE35VA.jpg","url":"http:\/\/t.co\/m6V7ozJmNl","display_url":"pic.twitter.com\/m6V7ozJmNl","expanded_url":"http:\/\/twitter.com\/revistaporti\/status\/365500317162422273\/photo\/1","type":"photo","sizes":{"large":{"w":764,"h":1024,"resize":"fit"},"small":{"w":340,"h":456,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":804,"resize":"fit"}},"source_status_id":365500317162422273,"source_status_id_str":"365500317162422273"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242762670083,"id_str":"365611242762670083","text":"-Tanta jente hipocrita que hay en el mundo","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":420497284,"id_str":"420497284","name":"MeEstoyEnamorando:$","screen_name":"Cheka_x3","location":"","url":null,"description":"-God Controls My Life, New Life & Goals, Your Not Alone Forever Nobody checked under My:) People Ke Amoo;Kirii.ivette.Laloo,isamar,orqui \u2665","protected":false,"followers_count":259,"friends_count":189,"listed_count":0,"created_at":"Thu Nov 24 18:14:52 +0000 
2011","favourites_count":174,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":16932,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046611878\/06ee3f1b26b67073fff112eae2fdba24.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046611878\/06ee3f1b26b67073fff112eae2fdba24.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000164440639\/d23acf1684523c1785ee379aba9d3f32_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000164440639\/d23acf1684523c1785ee379aba9d3f32_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/420497284\/1375892884","profile_link_color":"FF0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242783649792,"id_str":"365611242783649792","text":"RT @Rafaahihi: Carlux::ruivinho","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":245776468,"id_str":"245776468","name":"j.k","screen_name":"lhamanoma","location":"Hogwarts","url":null,"description":"i feel infinite","protected":false,"followers_count":393,"friends_count":277,"listed_count":0,"created_at":"Tue Feb 01 13:03:43 +0000 
2011","favourites_count":233,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":30457,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"010F0F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045477280\/a68ae50738511f149cc014296a1128b8.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045477280\/a68ae50738511f149cc014296a1128b8.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000246603658\/50de417ea6b09322ded82fd2f1ed8020_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000246603658\/50de417ea6b09322ded82fd2f1ed8020_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/245776468\/1375838810","profile_link_color":"C0C7BF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"0B0917","profile_text_color":"9F19CC","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 21:51:08 +0000 2013","id":365228641493323776,"id_str":"365228641493323776","text":"Carlux::ruivinho","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":269833244,"id_str":"269833244","name":"P\u00f4nei cor de rosa :3","screen_name":"Rafaahihi","location":"","url":"https:\/\/www.facebook.com\/rafaela.demelo.1","description":"Amo a @NadaVerGabi\/ Meu amorz\u00e3o @cr4wling_\/ Meu amorzinho @Luciienes","protected":false,"followers_count":400,"friends_count":184,"listed_count":0,"created_at":"Mon Mar 21 14:45:55 +0000 
2011","favourites_count":653,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":24380,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"487A72","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046119609\/a119fda415b7b16c850982f48ba75290.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046119609\/a119fda415b7b16c850982f48ba75290.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252436541\/d0dd5c6215476143a66424e8a4557f22_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252436541\/d0dd5c6215476143a66424e8a4557f22_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/269833244\/1375755088","profile_link_color":"46524C","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"it"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Rafaahihi","name":"P\u00f4nei cor de rosa :3","id":269833244,"id_str":"269833244","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"it"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779443201,"id_str":"365611242779443201","text":"@dulcevegaluv Always my #1 no matter what level I am ok?(:","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610746702340097,"in_reply_to_status_id_str":"365610746702340097","in_reply_to_user_id":1093410426,"in_reply_to_user_id_str":"1093410426","in_reply_to_screen_name":"dulcevegaluv","user":{"id":1321592365,"id_str":"1321592365","name":"\u2665Dulce ^. #1 always","screen_name":"MSP_tamilove","location":"\u2665United States\u2665","url":"http:\/\/moviestarplanet.com","description":"\u2022Mybeauer\u2022 I am not nice or mean I can be both\u2665Bully Free\u2665","protected":false,"followers_count":920,"friends_count":888,"listed_count":2,"created_at":"Tue Apr 02 02:26:38 +0000 2013","favourites_count":1335,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":false,"verified":false,"statuses_count":20752,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"121413","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000008131485\/cc0d8aac09c29ec6a0b55e4acbecb494.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000008131485\/cc0d8aac09c29ec6a0b55e4acbecb494.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000097634579\/6e2f32d3240ae79878cb0dbd89e779a7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000097634579\/6e2f32d3240ae79878cb0dbd89e779a7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1321592365\/1373152860","profile_link_color":"EA11F5","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"dulcevegaluv",
"name":"Tami #1 \u2764","id":1093410426,"id_str":"1093410426","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242775248896,"id_str":"365611242775248896","text":"@Adam_Grozny @VainahsVeras Vot pochemu etot Kvachkov obucahal chto protiv KavKaztsev dlozhny itdi 10 Russkih :-)","source":"web","truncated":false,"in_reply_to_status_id":365609901638156288,"in_reply_to_status_id_str":"365609901638156288","in_reply_to_user_id":231699846,"in_reply_to_user_id_str":"231699846","in_reply_to_screen_name":"Adam_Grozny","user":{"id":247495419,"id_str":"247495419","name":"Chechen Center","screen_name":"ChechenCenter","location":"","url":"http:\/\/ChechenCenter.info","description":"Independent Chechen Media, Chechnya, Chechen Republic of Ichkeria, ChRI, Ichkeria","protected":false,"followers_count":1237,"friends_count":837,"listed_count":31,"created_at":"Fri Feb 04 22:42:10 +0000 2011","favourites_count":121,"utc_offset":10800,"time_zone":"Tallinn","geo_enabled":false,"verified":false,"statuses_count":8524,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/659090097\/tty4xjqvp6igcrqt6ujq.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/659090097\/tty4xjqvp6igcrqt6ujq.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1824950940\/e8145a60-eb55-4a30-ade2-873d25d18080_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1824950940\/e8145a60-eb55-4a30-ade2-873d25d18080_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/247495419\/1359132863","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"de
fault_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Adam_Grozny","name":"ADAM TAGIRI","id":231699846,"id_str":"231699846","indices":[0,12]},{"screen_name":"VainahsVeras","name":"VAINAH'S VERAS ","id":197562308,"id_str":"197562308","indices":[13,26]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"sl"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758475777,"id_str":"365611242758475777","text":"@asaasaid jajajajaaj xD","source":"web","truncated":false,"in_reply_to_status_id":351165277846769664,"in_reply_to_status_id_str":"351165277846769664","in_reply_to_user_id":973160810,"in_reply_to_user_id_str":"973160810","in_reply_to_screen_name":"Duartesillo","user":{"id":973160810,"id_str":"973160810","name":"Leo Duarte","screen_name":"Duartesillo","location":"","url":null,"description":null,"protected":false,"followers_count":98,"friends_count":30,"listed_count":0,"created_at":"Tue Nov 27 03:10:03 +0000 2012","favourites_count":1,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":50,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/732581454\/17c8496b63249508f50137221547885c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/732581454\/17c8496b63249508f50137221547885c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3326300964\/f35b6f7330747b718ca2faf52a060c03_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3326300964\/f35b6f7330747b718ca2faf52a060c03_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/973160810\/1353988187","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"asaasaid","name":"IronAsa.","id":322267471,"id_str":"322267471","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242783637504,"id_str":"365611242783637504","text":"RT @Radio1Direction: 7\/8\/13 Concert: Harry on stage! 
6 http:\/\/t.co\/RM1wPv7hBc","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":924288392,"id_str":"924288392","name":"hazza's babe\u2665","screen_name":"InesCaiires","location":"","url":null,"description":"one direction \u221e i'm a crazy mofo \u221e 26-05-2013 \u221e","protected":false,"followers_count":180,"friends_count":536,"listed_count":0,"created_at":"Sun Nov 04 00:05:20 +0000 2012","favourites_count":5,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":148,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"030303","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000034125293\/7ff15cb4a7a819df6f4080a9890771ab.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000034125293\/7ff15cb4a7a819df6f4080a9890771ab.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000192152536\/1c0684ccea94659d863ab0aed0c4cec4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000192152536\/1c0684ccea94659d863ab0aed0c4cec4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/924288392\/1375742785","profile_link_color":"41F070","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:43:02 +0000 2013","id":365604087863054338,"id_str":"365604087863054338","text":"7\/8\/13 Concert: Harry on stage! 
6 http:\/\/t.co\/RM1wPv7hBc","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":573078668,"id_str":"573078668","name":"Radio One Direction","screen_name":"Radio1Direction","location":"Broadcasting Worldwide.","url":"http:\/\/Radio1D.webs.com","description":"Forever supporting our 5 boys. The FIRST online radio station for Directioners! Saturdays; times: http:\/\/tl.gd\/krsim3","protected":false,"followers_count":11742,"friends_count":1245,"listed_count":13,"created_at":"Sun May 06 22:38:51 +0000 2012","favourites_count":2731,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":26047,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000026556680\/6aba10dce9a41a29198120913a3aff7f.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000026556680\/6aba10dce9a41a29198120913a3aff7f.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000149802005\/89e1bd25121d46cf50485fa2eea2bafb_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000149802005\/89e1bd25121d46cf50485fa2eea2bafb_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/573078668\/1374120966","profile_link_color":"E11C2E","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":4,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365604087871442944,
"id_str":"365604087871442944","indices":[34,56],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLi-xoCEAA4f5H.png","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLi-xoCEAA4f5H.png","url":"http:\/\/t.co\/RM1wPv7hBc","display_url":"pic.twitter.com\/RM1wPv7hBc","expanded_url":"http:\/\/twitter.com\/Radio1Direction\/status\/365604087863054338\/photo\/1","type":"photo","sizes":{"large":{"w":432,"h":617,"resize":"fit"},"medium":{"w":432,"h":617,"resize":"fit"},"small":{"w":340,"h":486,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Radio1Direction","name":"Radio One Direction","id":573078668,"id_str":"573078668","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242766860289,"id_str":"365611242766860289","text":"@raquidato18 claro si los tres son de chocolate..ufff el de chocolate blanco!! 
como esta madre mia te lo recomiendo..","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610650266906624,"in_reply_to_status_id_str":"365610650266906624","in_reply_to_user_id":615708696,"in_reply_to_user_id_str":"615708696","in_reply_to_screen_name":"raquidato18","user":{"id":631242403,"id_str":"631242403","name":"\u2606MILAN P.M\u2606G.PIQU3\u2606","screen_name":"erika_piquerina","location":"espa\u00f1a murcia ","url":null,"description":"\u2661MI GRAN IDOLO GERARD PIQUE NU3 EL MEJOR DE MUNDO\u2661@3gerardpique\u266cFC.barcelona\u266cpinto 30\/5\/13 sergio araujo RT11\/06\/13 \u266c","protected":false,"followers_count":689,"friends_count":939,"listed_count":2,"created_at":"Mon Jul 09 16:47:15 +0000 2012","favourites_count":61,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3130,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/665655888\/d427ec8f969a48bd717185be51035d1c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/665655888\/d427ec8f969a48bd717185be51035d1c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000198137275\/97a14c42fef7c94dd685684579a0da0c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000198137275\/97a14c42fef7c94dd685684579a0da0c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/631242403\/1375378599","profile_link_color":"E0269F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":
null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"raquidato18","name":" Raqui #18","id":615708696,"id_str":"615708696","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242775252993,"id_str":"365611242775252993","text":"RT @RealDopePosts: Homecomings\nSweatshirts\nFootball. \ud83c\udfc8 \nHaunted houses\nScary movies\nSweatpants\nCold nights\nHalloween. \n\nCan't wait for fall\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":602431541,"id_str":"602431541","name":"kay kay","screen_name":"kay_baeeee","location":"","url":null,"description":"I have no one","protected":false,"followers_count":662,"friends_count":272,"listed_count":1,"created_at":"Fri Jun 08 03:41:45 +0000 
2012","favourites_count":86,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":50688,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme18\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme18\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258334528\/32b4dcb8136753f79b92b8325fd7da70_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258334528\/32b4dcb8136753f79b92b8325fd7da70_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/602431541\/1375715897","profile_link_color":"038543","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:06:26 +0000 2013","id":365609980847587328,"id_str":"365609980847587328","text":"Homecomings\nSweatshirts\nFootball. \ud83c\udfc8 \nHaunted houses\nScary movies\nSweatpants\nCold nights\nHalloween. \n\nCan't wait for fall! \ud83c\udf42\ud83c\udf41\ud83c\udf83\ud83d\udc7b","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":50944503,"id_str":"50944503","name":"Brandon","screen_name":"RealDopePosts","location":"\u2514A","url":"http:\/\/www.realdopeposts.com","description":"one of your followers retweeted me, you peeped my avi, now you're reading my bio, twatchin & shit. 
go ahead and follow me :)","protected":false,"followers_count":574744,"friends_count":316145,"listed_count":362,"created_at":"Fri Jun 26 06:04:09 +0000 2009","favourites_count":16,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":5880,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000027539791\/0d37bc6a73930a8f13e2fadca1fa9f31.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000027539791\/0d37bc6a73930a8f13e2fadca1fa9f31.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000211917094\/5486e2eb9c827ec699803ab500e224ad_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000211917094\/5486e2eb9c827ec699803ab500e224ad_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/50944503\/1375606072","profile_link_color":"0A0A0A","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":617,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"RealDopePosts","name":"Brandon","id":50944503,"id_str":"50944503","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242779451392,"id_str":"365611242779451392","text":"RT @imSMlminG: 
\u0e2a\u0e32\u0e22\u0e01\u0e32\u0e23\u0e1a\u0e34\u0e19\u0e04\u0e34\u0e15\u0e15\u0e35\u0e49?!! \u0e19\u0e48\u0e32\u0e23\u0e31\u0e01\u0e21\u0e38\u0e49\u0e07\u0e21\u0e34\u0e49\u0e07\u0e44\u0e1b \u0e40\u0e14\u0e32\u0e44\u0e14\u0e49\u0e40\u0e25\u0e22\u0e04\u0e19\u0e17\u0e35\u0e48\u0e14\u0e39\u0e08\u0e30\u0e15\u0e37\u0e48\u0e19\u0e15\u0e32\u0e15\u0e37\u0e48\u0e19\u0e43\u0e08\u0e01\u0e31\u0e1a\u0e40\u0e17\u0e35\u0e48\u0e22\u0e27\u0e1a\u0e34\u0e19\u0e19\u0e19\u0e35\u0e49\u0e04\u0e37\u0e2d\u0e43\u0e04\u0e23.....\u0e2d\u0e35\u0e0b\u0e2d\u0e07\u0e21\u0e34\u0e19 \u0e2a\u0e35\u0e0a\u0e21\u0e1e\u0e39\u0e17\u0e31\u0e49\u0e07\u0e25\u0e33 http:\/\/t.\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":381301647,"id_str":"381301647","name":"Ai Prachayawan S.","screen_name":"prachayawan","location":"UD","url":null,"description":"| FTISLAND & VIXX & CNBLUE | PRIMADONNA \u00a4 STARLIGHT \u00a4 BOICE | \u2764 \uc774\ud64d\uae30 \u2764","protected":false,"followers_count":42,"friends_count":154,"listed_count":0,"created_at":"Wed Sep 28 02:50:52 +0000 
2011","favourites_count":135,"utc_offset":25200,"time_zone":"Bangkok","geo_enabled":true,"verified":false,"statuses_count":2011,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3345268381\/1b3afc4e67862983102a26286b2f7076_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3345268381\/1b3afc4e67862983102a26286b2f7076_normal.jpeg","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 14:42:09 +0000 2013","id":365483071161643011,"id_str":"365483071161643011","text":"\u0e2a\u0e32\u0e22\u0e01\u0e32\u0e23\u0e1a\u0e34\u0e19\u0e04\u0e34\u0e15\u0e15\u0e35\u0e49?!! 
\u0e19\u0e48\u0e32\u0e23\u0e31\u0e01\u0e21\u0e38\u0e49\u0e07\u0e21\u0e34\u0e49\u0e07\u0e44\u0e1b \u0e40\u0e14\u0e32\u0e44\u0e14\u0e49\u0e40\u0e25\u0e22\u0e04\u0e19\u0e17\u0e35\u0e48\u0e14\u0e39\u0e08\u0e30\u0e15\u0e37\u0e48\u0e19\u0e15\u0e32\u0e15\u0e37\u0e48\u0e19\u0e43\u0e08\u0e01\u0e31\u0e1a\u0e40\u0e17\u0e35\u0e48\u0e22\u0e27\u0e1a\u0e34\u0e19\u0e19\u0e19\u0e35\u0e49\u0e04\u0e37\u0e2d\u0e43\u0e04\u0e23.....\u0e2d\u0e35\u0e0b\u0e2d\u0e07\u0e21\u0e34\u0e19 \u0e2a\u0e35\u0e0a\u0e21\u0e1e\u0e39\u0e17\u0e31\u0e49\u0e07\u0e25\u0e33 http:\/\/t.co\/nS9k6we0zM","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":145615638,"id_str":"145615638","name":"\ud604\ubbfc","screen_name":"imSMlminG","location":"BKK'Thailand","url":"http:\/\/imsmlming.tumblr.com\/","description":"@imSMl","protected":false,"followers_count":1183,"friends_count":330,"listed_count":5,"created_at":"Wed May 19 12:00:43 +0000 2010","favourites_count":944,"utc_offset":-25200,"time_zone":"Pacific Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":41945,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"050505","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000035842885\/244a26ab9819150710a7f5e88b482ae9.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000035842885\/244a26ab9819150710a7f5e88b482ae9.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000255273693\/539b44241326b84bef2a978104d63184_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000255273693\/539b44241326b84bef2a978104d63184_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/145615638\/1375974828","profile_link_color":"5406BA","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"BD7BBD","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":631,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365483071170031618,"id_str":"365483071170031618","indices":[115,137],"media_url":"http:\/\/pbs.twimg.com\/media\/BRJ06rSCMAICbr5.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRJ06rSCMAICbr5.jpg","url":"http:\/\/t.co\/nS9k6we0zM","display_url":"pic.twitter.com\/nS9k6we0zM","expanded_url":"http:\/\/twitter.com\/imSMlminG\/status\/365483071161643011\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":450,"resize":"fit"},"small":{"w":340,"h":255,"resize":"fit"},"large":{"w":600,"h":450,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"th"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen
_name":"imSMlminG","name":"\ud604\ubbfc","id":145615638,"id_str":"145615638","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"th"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242758475779,"id_str":"365611242758475779","text":"@DarleneSegar break it a wee bit and have a look if its still red inside if so, chances are that its still raw inside.","source":"web","truncated":false,"in_reply_to_status_id":365447903872958464,"in_reply_to_status_id_str":"365447903872958464","in_reply_to_user_id":551634992,"in_reply_to_user_id_str":"551634992","in_reply_to_screen_name":"DarleneSegar","user":{"id":132845642,"id_str":"132845642","name":"Parvash","screen_name":"MrPotate","location":"Malaysia\/Nz","url":null,"description":"Money can't buy life - Bob Marley","protected":false,"followers_count":117,"friends_count":138,"listed_count":0,"created_at":"Wed Apr 14 09:57:11 +0000 2010","favourites_count":800,"utc_offset":43200,"time_zone":"Auckland","geo_enabled":true,"verified":false,"statuses_count":5674,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/890610041\/32fe3a8e9e92e078bbe3b88a289cb0d7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/890610041\/32fe3a8e9e92e078bbe3b88a289cb0d7.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000185663969\/59742c9cf5635449cbe5d9cc71e55e7b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000185663969\/59742c9cf5635449cbe5d9cc71e55e7b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/132845642\/1370938435","profile_link_color":"A62121","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"0F0202","profile_text_color":"56BADE","profile_use_background_image":true,"default_profile":false
,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"DarleneSegar","name":"Darlene Segar","id":551634992,"id_str":"551634992","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242766860290,"id_str":"365611242766860290","text":"@xXbstephensXx. Haha you can tilt it over!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365609834181165056,"in_reply_to_status_id_str":"365609834181165056","in_reply_to_user_id":372535885,"in_reply_to_user_id_str":"372535885","in_reply_to_screen_name":"xXbstephensXx","user":{"id":514082636,"id_str":"514082636","name":"Clay","screen_name":"Austin3Clay","location":"","url":null,"description":null,"protected":false,"followers_count":219,"friends_count":50,"listed_count":0,"created_at":"Sun Mar 04 07:56:32 +0000 
2012","favourites_count":119,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2645,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000103157329\/40c3d0357c99278c7ccb77e5664c6f8e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000103157329\/40c3d0357c99278c7ccb77e5664c6f8e_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"xXbstephensXx","name":"Red","id":372535885,"id_str":"372535885","indices":[0,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242754285568,"id_str":"365611242754285568","text":"\u7d0b\u7f8e\u3061\u3083\u309320\u6b73\u306e\u8a95\u751f\u65e5\n\u304a\u3081\u3067\u3068\u304a\uff5e\u3063(*^_^*)\n\u4eca\u5e74\u306718\u5e74\u76ee\u306e\u4ed8\u304d\u5408\u3044\u2605\n2\u6b73\u304b\u3089\u9ad8\u6821\u307e\u3067\u305a\u3063\u3068\u4e00\u7dd2\u3067\n\u793e\u4f1a\u4eba\u306b\u306a\u3063\u3066\u96e2\u308c\u3061\u3083\u3063\u305f\u3051\u3069\n\u67081\u3067\u4f1a\u3063\u3066\u308b\u4ef2\u266a\u266a\n\u672c\u97f3\u8a00\u3044\u5408\u3048\u308b\u3057\u307b\u3093\u3068\u843d\u3061\u7740\u304f\uff01\n\u611b\u68d2\u3061\u3083\u3093\u3060\u3044\u3059\u304d\uff5e\u3063\/\/\/ 
http:\/\/t.co\/39zx9Pknu7","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1041562700,"id_str":"1041562700","name":"\u306d\u3063\u3061","screen_name":"akanechin23","location":"","url":null,"description":"shinshiro \uff0a \u793e\u4f1a\u4eba \uff12\u5e74\u76ee","protected":false,"followers_count":55,"friends_count":55,"listed_count":0,"created_at":"Fri Dec 28 07:12:11 +0000 2012","favourites_count":153,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1063,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260217310\/2f69b328f63356a9367a758e38ed0fd2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260217310\/2f69b328f63356a9367a758e38ed0fd2_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365611242758479873,"id_str":"365611242758479873","indices":[116,138],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpfPqCIAESYQ-.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpfPqCIAESYQ-.jpg","url":"http:\/\/t.co
\/39zx9Pknu7","display_url":"pic.twitter.com\/39zx9Pknu7","expanded_url":"http:\/\/twitter.com\/akanechin23\/status\/365611242754285568\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":338,"resize":"fit"},"medium":{"w":557,"h":553,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":557,"h":553,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242783645696,"id_str":"365611242783645696","text":"@kw_ee \n\n\u062a\u0634\u0631\u0641\u062a \u0628\u0643 http:\/\/t.co\/6jiRiYDVaN","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":600738296,"in_reply_to_user_id_str":"600738296","in_reply_to_screen_name":"kw_ee","user":{"id":533783265,"id_str":"533783265","name":"m7arah","screen_name":"m7arah","location":"k.s.a \u0627\u0644\u0631\u064a\u0627\u0636","url":null,"description":"\u0645\u062d\u0627\u0631\u0647 .. \u0644\u0627\u062a\u062a\u0648\u0627\u062c\u062f \u0625\u0644\u0627 \u0641\u064a \u0627\u0644\u0623\u0639\u0645\u0627\u0642 ..... 
\u062f\u0627\u062e\u0644\u0647\u0627 \u0644\u0624\u0644\u0624\u0647 .....","protected":false,"followers_count":2410,"friends_count":2382,"listed_count":13,"created_at":"Fri Mar 23 00:49:03 +0000 2012","favourites_count":3320,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":34751,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3395258178\/df181e9ca87082a7036678d78c30ddcd_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3395258178\/df181e9ca87082a7036678d78c30ddcd_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/533783265\/1368732790","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"kw_ee","name":"\u0627\u0644\u0631\u0651\u064e\u0633\u064c\u0645 
\u0628\u064e\u0627\u0644\u06af\u0644\u0645\u064c\u0627\u062a\u2122","id":600738296,"id_str":"600738296","indices":[0,6]}],"media":[{"id":365611242787840001,"id_str":"365611242787840001","indices":[18,40],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpfPxCIAEKqJm.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpfPxCIAEKqJm.jpg","url":"http:\/\/t.co\/6jiRiYDVaN","display_url":"pic.twitter.com\/6jiRiYDVaN","expanded_url":"http:\/\/twitter.com\/m7arah\/status\/365611242783645696\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"large":{"w":612,"h":612,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:27 +0000 2013","id":365611242766864384,"id_str":"365611242766864384","text":"@chelseakenton_ aardappellen visserij","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611135174582273,"in_reply_to_status_id_str":"365611135174582273","in_reply_to_user_id":405801080,"in_reply_to_user_id_str":"405801080","in_reply_to_screen_name":"chelseakenton_","user":{"id":363826282,"id_str":"363826282","name":"marysse","screen_name":"maryssedegrootx","location":"ouderkerk.","url":null,"description":"ajax 7 - @yarahopelie m'n beste vriendin.","protected":false,"followers_count":166,"friends_count":158,"listed_count":0,"created_at":"Sun Aug 28 19:13:51 +0000 
2011","favourites_count":134,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":9721,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"EBEBEB","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme7\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241307496\/28e0be8cc877d9ca7cb3f3dbb01a781b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241307496\/28e0be8cc877d9ca7cb3f3dbb01a781b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/363826282\/1375622883","profile_link_color":"990000","profile_sidebar_border_color":"DFDFDF","profile_sidebar_fill_color":"F3F3F3","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"chelseakenton_","name":"CJK. 
","id":405801080,"id_str":"405801080","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246944399361,"id_str":"365611246944399361","text":"Why is virginty trending on my TL :s","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":225919480,"id_str":"225919480","name":"Vanessa Nwuka","screen_name":"VeeNwuka_","location":"","url":null,"description":"live, love, laugh\r\ninstagram : @veenwuka","protected":false,"followers_count":380,"friends_count":377,"listed_count":0,"created_at":"Sun Dec 12 20:24:53 +0000 2010","favourites_count":216,"utc_offset":3600,"time_zone":"Casablanca","geo_enabled":false,"verified":false,"statuses_count":9564,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/888427373\/7de44dc8e3c405477aa74c36a3214ba1.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/888427373\/7de44dc8e3c405477aa74c36a3214ba1.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000237604539\/3435e5d325dca88adff8b33a41ede174_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000237604539\/3435e5d325dca88adff8b33a41ede174_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/225919480\/1367530682","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_requ
est_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246956978176,"id_str":"365611246956978176","text":"amanha de manha chapinha de manha af","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":942638430,"id_str":"942638430","name":"Liam Payne","screen_name":"liampayne3d","location":"Boydirectioner","url":null,"description":"|Ke$ha| |Lady gaga| |One direction|\r\n ~ @Real_liam_payne ~","protected":false,"followers_count":22069,"friends_count":5177,"listed_count":7,"created_at":"Mon Nov 12 01:56:32 +0000 2012","favourites_count":24,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":true,"verified":false,"statuses_count":65534,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040664278\/59420982fd7513b5a890efcfd4438c5c.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040664278\/59420982fd7513b5a890efcfd4438c5c.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000220911214\/90f494b974520947460fdbef6837c31e_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000220911214\/90f494b974520947460fdbef6837c31e_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/942638430\/1375548758","profile_link_color":"163BCC","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":fals
e,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246956974082,"id_str":"365611246956974082","text":"Plus le temps passe et plus les choses deviennent de plus en plus parfaites avec elle","source":"\u003ca href=\"http:\/\/store.ovi.com\/content\/256340\" rel=\"nofollow\"\u003eTwitter for Nokia S40\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1402046352,"id_str":"1402046352","name":"\u2112ucile \u2665","screen_name":"LucileChalmette","location":"","url":null,"description":"Directioner Forever & Ever \u221e \u2665 16.05.2013 \u2661 \u2655 \u2661 \u1dab\u1d52\u1d5b\u1d49\u1d67\u2092\u1d64","protected":false,"followers_count":99,"friends_count":82,"listed_count":0,"created_at":"Sat May 04 11:29:05 +0000 
2013","favourites_count":37,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1941,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000023505229\/82a84384a3c4d0eae4978edc1997fc17.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000023505229\/82a84384a3c4d0eae4978edc1997fc17.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216048411\/fc6f97f1e8558c68d837eefb6e5d64e7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216048411\/fc6f97f1e8558c68d837eefb6e5d64e7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1402046352\/1375465328","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246965374979,"id_str":"365611246965374979","text":"23. vi date qualche soprannome? 
no.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":826908590,"id_str":"826908590","name":"serena.","screen_name":"xmjacksonsvojce","location":"Niall James Horan.","url":null,"description":"'between lots of eyes, my heart chose the ones who would have never seen me.'","protected":false,"followers_count":4330,"friends_count":3886,"listed_count":35,"created_at":"Sun Sep 16 10:53:54 +0000 2012","favourites_count":1923,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":39150,"lang":"it","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/700685237\/f1ca6ea115cbd245c7ddb181fac82d98.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/700685237\/f1ca6ea115cbd245c7ddb181fac82d98.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260427195\/da2ee3282ca9c37510d3287d9b443387_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260427195\/da2ee3282ca9c37510d3287d9b443387_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/826908590\/1375975051","profile_link_color":"1EB7D6","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filte
r_level":"medium","lang":"it"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246961180672,"id_str":"365611246961180672","text":"\u0e15\u0e37\u0e48\u0e19\u0e41\u0e25\u0e49\u0e27\u0e41\u0e15\u0e48\u0e19\u0e2d\u0e19\u0e15\u0e48\u0e2d","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":952808953,"id_str":"952808953","name":"\u0e21\u0e32\u0e23 \u0e40\u0e12\u0e48\u0e32","screen_name":"Maneetrem","location":"","url":null,"description":"\u0e0a\u0e2d\u0e1a\u0e2b\u0e38\u0e49\u0e19 \u0e0a\u0e2d\u0e1a\u0e01\u0e32\u0e23\u0e25\u0e07\u0e17\u0e38\u0e19\u0e41\u0e25\u0e30\u0e0a\u0e2d\u0e1a\u0e14\u0e39\u0e2b\u0e19\u0e31\u0e07\u0e08\u0e35\u0e19\u0e01\u0e33\u0e25\u0e31\u0e07\u0e20\u0e32\u0e22\u0e43\u0e19\u0e21\u0e32\u0e01\u0e40\u0e1b\u0e47\u0e19\u0e1e\u0e34\u0e40\u0e28\u0e29","protected":false,"followers_count":179,"friends_count":937,"listed_count":2,"created_at":"Sat Nov 17 02:45:26 +0000 
2012","favourites_count":39,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1539,"lang":"th","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244096212\/467e8ac61cdd5015bc34fdb2e5f6d499_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244096212\/467e8ac61cdd5015bc34fdb2e5f6d499_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"th"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246969556992,"id_str":"365611246969556992","text":"\u9a19\u3055\u308c\u305f\u3001\u305d\u3057\u3066\u304c\u3093\u3070\u308c","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":856603171,"id_str":"856603171","name":"\u307f\u3069\u3045\u304d","screen_name":"lvn96","location":"","url":null,"description":"\uff3c\u306a\u306b\u3065\u304d\uff1f\u307f\u305a\u304d\u3043\uff01\uff01\uff1f\uff01\uff1f(\u30ad\u30ec\u6c17\u5473\u306b)\u3044\u3084\u3044\u3084\u30fc\u3001\u307f\u3065\u304d\uff01\uff01\uff01\u2661\uff0f","protected":false,"followers_count":179,"friends_count":178,"listed_count":2,"created_at":"Mon Oct 01 16:01:28 +0000 2012","favourites_count":16753,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":21468,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000108371982\/a142f68358855e2debfd61676c7998d8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000108371982\/a142f68358855e2debfd61676c7998d8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/856603171\/1373030021","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:28 +0000 
2013","id":365611246973747200,"id_str":"365611246973747200","text":"Walked in the door after biking home, stripped and laid under the fan...best feeling ever!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1654016358,"id_str":"1654016358","name":"Love Is In The Air","screen_name":"AirForceDoll","location":"","url":null,"description":"Fiance of a US Airman. Nursing Student. Supporting him at the academy while he supports his future nurse. 20","protected":false,"followers_count":143,"friends_count":202,"listed_count":0,"created_at":"Wed Aug 07 22:50:41 +0000 2013","favourites_count":29,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":false,"verified":false,"statuses_count":15,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257234381\/1274be111795f72fe8c053a8098ec94d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257234381\/1274be111795f72fe8c053a8098ec94d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1654016358\/1375916261","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"h
ashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246961172481,"id_str":"365611246961172481","text":"http:\/\/t.co\/heoLS3yNCH \u0444\u0443\u0437\u0438\u0434\u0435\u0440\u043c \u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u044f","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":305491113,"id_str":"305491113","name":"DMargaret","screen_name":"damienloeillet","location":"Les Chano\u00ebls","url":null,"description":null,"protected":false,"followers_count":0,"friends_count":4,"listed_count":0,"created_at":"Thu May 26 09:30:09 +0000 2011","favourites_count":5,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":57,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"030103","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/753402084\/530305ee54dcd37a33244a09f7f62844.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/753402084\/530305ee54dcd37a33244a09f7f62844.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3055095337\/4386191c5c34ec1b4dcf1b4e93b02a9b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3055095337\/4386191c5c34ec1b4dcf1b4e93b02a9b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/305491113\/1357138294","profile_link_color":"FA8459","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"947974","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null
,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/heoLS3yNCH","expanded_url":"http:\/\/yhr.stayaustralia.net\/torrent-65489.html","display_url":"yhr.stayaustralia.net\/torrent-65489.\u2026","indices":[0,22]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ru"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246977957888,"id_str":"365611246977957888","text":"I'm so bored, what's my ladies up tew?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":406405782,"id_str":"406405782","name":".nej","screen_name":"JennzyFBaby","location":"","url":null,"description":"daddys girl.\n im in love. @DevinMeadows","protected":false,"followers_count":670,"friends_count":276,"listed_count":1,"created_at":"Sun Nov 06 17:25:21 +0000 2011","favourites_count":4665,"utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":13831,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"D62885","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/699823334\/219a8fabcb25ecfdcb3b38c274acc1d5.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/699823334\/219a8fabcb25ecfdcb3b38c274acc1d5.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000243191127\/84c6df8e16eef38c48dc8d82291ebcb9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000243191127\/84c6df8e16eef38c48dc8d82291ebcb9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/406405782\/1369813073","profile_link_color":"F580D6","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246965374977,"id_str":"365611246965374977","text":"I'd love for a zombie apocalypse to occur I'd get the fastest car a shit loa of guns and beer ha","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":939228086,"id_str":"939228086","name":"Bryan 
Pettersen","screen_name":"BryanGu21","location":"","url":null,"description":null,"protected":false,"followers_count":134,"friends_count":102,"listed_count":0,"created_at":"Sat Nov 10 14:54:43 +0000 2012","favourites_count":788,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2309,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000222716581\/cf155fddf7adcc78ffca5fa383f1d03f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000222716581\/cf155fddf7adcc78ffca5fa383f1d03f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/939228086\/1375306499","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246948589568,"id_str":"365611246948589568","text":"RT @ChrisGPackham: Meet one of 'The Burrowers' stars - clearly not me ! Coming soon on BBC2 . 
http:\/\/t.co\/k5lAxH0Y2L","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":390252740,"id_str":"390252740","name":"Lionheart","screen_name":"I_am_a_Leo","location":"In your dreams","url":null,"description":"I have issues with this and that. Love all animals, more than most ppl. Feed foxes. Hate the EU.","protected":false,"followers_count":2426,"friends_count":2417,"listed_count":45,"created_at":"Thu Oct 13 18:04:22 +0000 2011","favourites_count":372,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":38936,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3555524265\/3a84905977b74b80cecd7d24559358aa_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3555524265\/3a84905977b74b80cecd7d24559358aa_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/390252740\/1369609919","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 20:07:44 +0000 2013","id":365202615790862336,"id_str":"365202615790862336","text":"Meet one of 'The Burrowers' stars - clearly not me ! 
Coming soon on BBC2 . http:\/\/t.co\/k5lAxH0Y2L","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":78905469,"id_str":"78905469","name":"Chris Packham","screen_name":"ChrisGPackham","location":"","url":"http:\/\/chrispackham.co.uk","description":"Naturalist and BBC broadcaster","protected":false,"followers_count":68524,"friends_count":6,"listed_count":1123,"created_at":"Thu Oct 01 13:40:31 +0000 2009","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":9046,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/446903215\/chris_packham_twitter_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/446903215\/chris_packham_twitter_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":99,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365202615795056640,"id_str":"365202615795056640","indices":[75,97],"media_url":"http:\/\/pbs.twimg.com\/media\/BRF12BmCAAAJEUW.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRF12BmCAAAJEUW.jpg","url":"http:\/\/t.co\/k5lAxH0Y2L","display_url":"pic.twitter.com\/k5lAxH0Y2L","expanded_url":"http:\/\/twitter.com\/ChrisGPackham\/status\/365202615790862
336\/photo\/1","type":"photo","sizes":{"large":{"w":1024,"h":683,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":400,"resize":"fit"},"small":{"w":340,"h":227,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ChrisGPackham","name":"Chris Packham","id":78905469,"id_str":"78905469","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246965374978,"id_str":"365611246965374978","text":"whenever i be bored i be texting people back that i usually never text","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":377572560,"id_str":"377572560","name":"Alexus Strayer","screen_name":"lexx_strayerr","location":"IG: _lexxstrayerr","url":null,"description":"#GODfirst #MoonBaby #DoItForTED #Tlanez #FamILY #11 #FWM","protected":false,"followers_count":1014,"friends_count":832,"listed_count":2,"created_at":"Wed Sep 21 19:51:44 +0000 
2011","favourites_count":576,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":15760,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1414DE","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/547169137\/Tory-Lanez.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/547169137\/Tory-Lanez.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3487782352\/9a832c97f4bd0a1aa9bd66bdac22e16a_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3487782352\/9a832c97f4bd0a1aa9bd66bdac22e16a_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/377572560\/1370461711","profile_link_color":"1753EB","profile_sidebar_border_color":"1034D1","profile_sidebar_fill_color":"8A11ED","profile_text_color":"EB1EDD","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246956969985,"id_str":"365611246956969985","text":"Eid.. 
Morning & Night http:\/\/t.co\/MY0sFCNrxR","source":"\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":81483365,"id_str":"81483365","name":"\u0635\u0627\u0627\u062f","screen_name":"SaadxAhmad","location":"","url":null,"description":"Pursuit of Excellence.","protected":false,"followers_count":224,"friends_count":116,"listed_count":0,"created_at":"Sun Oct 11 01:15:02 +0000 2009","favourites_count":107,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":3305,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/686452333\/41ef163dc4bf0a8c22d0761e984f398a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/686452333\/41ef163dc4bf0a8c22d0761e984f398a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3770843075\/b753b3190caf688e4f9ec3b5de807974_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3770843075\/b753b3190caf688e4f9ec3b5de807974_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/81483365\/1358711355","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"FFFFFF","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/MY0sFCNrxR","expanded_url":"http:\/\/instagram.com\/p\/cxSiyOP93S\/","display_url":"instagram
.com\/p\/cxSiyOP93S\/","indices":[26,48]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246969552896,"id_str":"365611246969552896","text":"n\u00edvel de entrosamento: responde tudo rindo","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":72970840,"id_str":"72970840","name":"Evandro Souza","screen_name":"evandrouza","location":"Porto Alegre \/ RS","url":"http:\/\/Instagram.com\/evandror93","description":"Quem tem amor faz motor.\nFazedor de cagadas na empresa: vida","protected":false,"followers_count":140,"friends_count":147,"listed_count":18,"created_at":"Wed Sep 09 22:35:46 +0000 2009","favourites_count":346,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":13638,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000031137462\/c148d9956e9d87bc2641f1d984bfe613.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000031137462\/c148d9956e9d87bc2641f1d984bfe613.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000228310032\/2df8d3e3fe5df2241b3240d2f3595296_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000228310032\/2df8d3e3fe5df2241b3240d2f3595296_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/72970840\/1375670830","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"follo
wing":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246952787968,"id_str":"365611246952787968","text":"RT @vipregan: #ShorterThanBeyoncesHair Kim Kardashian and Kris Humphries's marriage.","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":607635490,"id_str":"607635490","name":"Bikelife22","screen_name":"jswif007","location":"","url":"http:\/\/wefollow.com\/jswif007","description":"# bike life","protected":false,"followers_count":695,"friends_count":289,"listed_count":1,"created_at":"Wed Jun 13 23:08:41 +0000 
2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":16,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000005592621\/2f913d1d6f88b2edd2b526340d111e0d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000005592621\/2f913d1d6f88b2edd2b526340d111e0d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000031985641\/c7a4fccafbe7eed2ed3d080af4d4af3c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000031985641\/c7a4fccafbe7eed2ed3d080af4d4af3c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/607635490\/1371959171","profile_link_color":"2FC2EF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:50:34 +0000 2013","id":365605985223589888,"id_str":"365605985223589888","text":"#ShorterThanBeyoncesHair Kim Kardashian and Kris Humphries's marriage.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":38570287,"id_str":"38570287","name":"Matt ","screen_name":"vipregan","location":"Hollywood Walk of Fame \/ NY","url":"http:\/\/www.facebook.com\/mattregan","description":"This is a story of dreams mixed with reality.\r\n\r\nUpcoming actor \/ musician \/ writer \/ poet \/ 
star.","protected":false,"followers_count":325448,"friends_count":148881,"listed_count":629,"created_at":"Fri May 08 01:15:29 +0000 2009","favourites_count":2011,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":62307,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1430BA","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/795502575\/9669ba21a1b32a380ef58cab9bd3597d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/795502575\/9669ba21a1b32a380ef58cab9bd3597d.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1718191651\/Me_Cleveland_2010_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1718191651\/Me_Cleveland_2010_normal.jpg","profile_link_color":"24F01D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"130F8A","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":23,"entities":{"hashtags":[{"text":"ShorterThanBeyoncesHair","indices":[0,24]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"ShorterThanBeyoncesHair","indices":[14,38]}],"urls":[],"user_mentions":[{"screen_name":"vipregan","name":"Matt ","id":38570287,"id_str":"38570287","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246961168385,"id_str":"365611246961168385","text":"\"@JPMinor: ...#MeMojaQue llueva.\"","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":99172900,"id_str":"99172900","name":"\u043c\u03b9\u044f\u03b9\u03b1\u043c g\u03b1\u044f\u00a2\u03b9\u03b1 \u0455\u03b9\u0454\u044f\u044f\u03b1","screen_name":"miriam_garcia_","location":"Talavera De La Reina","url":"https:\/\/docs.google.com\/file\/d\/0B6VjVCNWuBDWc1hWMkxJYVBObjQ\/edit?usp=sharing","description":"18. Espa\u00f1a, Amo Actuar(Aspirante A Actriz https:\/\/t.co\/dRB0vzA69M ) Y La Musica Skype: miriam1995gs\r\n\r\n\r\nFacebook: https:\/\/t.co\/9wk5eiuICQ","protected":false,"followers_count":288,"friends_count":1783,"listed_count":1,"created_at":"Thu Dec 24 21:35:14 +0000 2009","favourites_count":230,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":true,"verified":false,"statuses_count":11221,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/617853829\/oilru1m8s0z6ivz3sujc.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/617853829\/oilru1m8s0z6ivz3sujc.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000115633101\/2ebc0c3afdde679df99634f2e8b258db_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000115633101\/2ebc0c3afdde679df99634f2e8b258db_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/99172900\/1348675511","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":n
ull,"retweet_count":0,"entities":{"hashtags":[{"text":"MeMojaQue","indices":[14,24]}],"urls":[],"user_mentions":[{"screen_name":"JPMinor","name":"jose pablo minor","id":482546857,"id_str":"482546857","indices":[1,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982139905,"id_str":"365611246982139905","text":"http:\/\/t.co\/lB5A3YYmpd","source":"\u003ca href=\"http:\/\/vk.com\" rel=\"nofollow\"\u003evk.com\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":603547941,"id_str":"603547941","name":"\u0410\u043d\u0442\u043e\u043d \u041c\u0430\u0440\u0438\u0447\u0435\u0432","screen_name":"MarichevAntonio","location":"\u0421\u0435\u0440\u043f\u0443\u0445\u043e\u0432","url":"http:\/\/vk.com\/antony__m","description":"\u0426\u0435\u043d\u044e \u043b\u044e\u0434\u0435\u0439,\u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043d\u0435 \u0440\u0430\u0437\u043c\u0435\u043d\u0438\u0432\u0430\u044e\u0442\u0441\u044f \u0441\u043b\u043e\u0432\u0430\u043c\u0438,\u0430 \u043e\u0442\u043c\u0435\u0447\u0430\u044e\u0442\u0441\u044f \u0434\u043e\u0441\u0442\u043e\u0439\u043d\u044b\u043c\u0438 \u0434\u0435\u043b\u0430\u043c\u0438...","protected":false,"followers_count":11,"friends_count":32,"listed_count":0,"created_at":"Sat Jun 09 09:43:46 +0000 
2012","favourites_count":3,"utc_offset":14400,"time_zone":"Moscow","geo_enabled":true,"verified":false,"statuses_count":735,"lang":"ru","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000230668414\/659664bc23c3cc891c4dac16dd1747a7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000230668414\/659664bc23c3cc891c4dac16dd1747a7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/603547941\/1372598985","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/lB5A3YYmpd","expanded_url":"http:\/\/vk.cc\/1IUrIR","display_url":"vk.cc\/1IUrIR","indices":[0,22]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982135808,"id_str":"365611246982135808","text":"\u201cTemer o amor \u00e9 temer a vida, e aqueles que temem a vida j\u00e1 est\u00e3o praticamente mortos.\u201d","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1352582803,"id_str":"1352582803","name":"Linara Ribeiro","screen_name":"ribeiro_linara","location":"","url":null,"description":"A melhor 
maneira que o homem disp\u00f5e para se aperfei\u00e7oar, \u00e9 aproximar-se de Deus.","protected":false,"followers_count":8,"friends_count":184,"listed_count":0,"created_at":"Sun Apr 14 19:02:53 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":5,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241218414\/c9a5b962887f948c962e8ffc5d9eed69_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241218414\/c9a5b962887f948c962e8ffc5d9eed69_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1352582803\/1375641659","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982152192,"id_str":"365611246982152192","text":"Rocio me hace preguntas de Nacho JAJAJAJJAJA que pibaaa a a-.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":294254411,"id_str":"294254411","name":"Micaa Cirelli :B","screen_name":"MicaCirelli","location":"Del Viso","url":null,"description":"Me llamo 
Micaela Cirelli,seguime y yo te sigo(? CABJ PASION asdfas","protected":false,"followers_count":539,"friends_count":353,"listed_count":1,"created_at":"Fri May 06 20:16:04 +0000 2011","favourites_count":142,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":8682,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/804985122\/fa137e31c369a7dcbad4bef0644731f2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/804985122\/fa137e31c369a7dcbad4bef0644731f2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000143228875\/201c4094ab5c27e1283869cbf6d39d61_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000143228875\/201c4094ab5c27e1283869cbf6d39d61_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/294254411\/1373996253","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246952783872,"id_str":"365611246952783872","text":"RT @Aaqibfida: The uglier the snapchat, the closer the friendship\ud83d\udc6d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for 
Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1312274132,"id_str":"1312274132","name":"Daniel Hueg","screen_name":"Daniel_Hueg","location":"","url":null,"description":null,"protected":false,"followers_count":14,"friends_count":86,"listed_count":0,"created_at":"Fri Mar 29 01:46:01 +0000 2013","favourites_count":66,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":156,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3633403316\/a1a060fe671121233efd8cfc911f811a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3633403316\/a1a060fe671121233efd8cfc911f811a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1312274132\/1368065804","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:34:51 +0000 2013","id":365602031068647427,"id_str":"365602031068647427","text":"The uglier the snapchat, the closer the friendship\ud83d\udc6d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":132442329,"id_str":"132442329","name":"Aaqib Fida","screen_name":"Aaqibfida","location":"In left pocket of your jeans","url":null,"description":"Guy who wanna be a guitarist ,passionate cricketer, love to pridict ,heartbreaking music lover and wants to have coffee cup with almost anyone and yeah rudeboy","protected":false,"followers_count":46646,"friends_count":45808,"listed_count":11,"created_at":"Tue Apr 13 07:23:09 +0000 2010","favourites_count":136,"utc_offset":18000,"time_zone":"Islamabad","geo_enabled":false,"verified":false,"statuses_count":10639,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EEBC73","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/851326350\/e5fb8fa32a36aeffe1f6f09a5e347fee.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/851326350\/e5fb8fa32a36aeffe1f6f09a5e347fee.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3773604411\/18807f88b14ca8eaa5574b84faa4b24e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3773604411\/18807f88b14ca8eaa5574b84faa4b24e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/132442329\/1375725311","profile_link_color":"DE5E2D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"8E485F","profile_text_color":"FD7D3C","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":8,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"ent
ities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Aaqibfida","name":"Aaqib Fida","id":132442329,"id_str":"132442329","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246965374980,"id_str":"365611246965374980","text":"RT @La_Joha27: D\u00e9jame cambiarme que mi brujis viene paca! :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":987576108,"id_str":"987576108","name":"Dary Taveras","screen_name":"dary_taveras","location":"","url":null,"description":"No creas nada de lo que digo!","protected":false,"followers_count":156,"friends_count":116,"listed_count":0,"created_at":"Mon Dec 03 22:50:58 +0000 2012","favourites_count":106,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":17969,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme11\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme11\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000066480199\/19421fc568c6226f16c0bd9cbbc5ace7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000066480199\/19421fc568c6226f16c0bd9cbbc5ace7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/987576108\/1369705959","profile_link_color":"B40B43","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"follow
ing":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:44:22 +0000 2013","id":365604424976044032,"id_str":"365604424976044032","text":"D\u00e9jame cambiarme que mi brujis viene paca! :D","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":430266051,"id_str":"430266051","name":". J\u03c3\u043d\u03b1\u03b7\u03b7\u03b1; \u2661","screen_name":"La_Joha27","location":"*B\u044f\u03c5\u03b7\u03c3 M\u03b1\u044f\u0455\u2665 #M\u03c3z\u03b1\u044f\u03b9\u03b9\u0455\u0442\u03b1*","url":null,"description":"#GOD\u0192\u03b9\u044f\u0455\u0442 *C\u03b1\u2113\u2113 M\u0454 \u03b7\u03b1\u03b7\u03b1* (L\u03b1 \u03bd\u03b9d\u03b1 \u0455\u0454 \u03b1\u00a2\u03b1\u0432\u03b1 \u03c1\u03b1\u044f\u03b1 \u2113\u03c3\u0455 q\u03c5\u0454 \u2202\u0454j\u03b1\u03b7 \u2202\u0454 \u0455\u03c3\u00f1\u03b1\u044f) \r \u2502Follow Me y Pide tu Back\u2502 #D\u03c3\u043c\u03b9\u03b7\u03b9c\u03b1\u03b7H\u03c3\u03c3\u2113\u03b9g\u03b1\u03b7' \u2665 #Music","protected":false,"followers_count":666,"friends_count":224,"listed_count":0,"created_at":"Tue Dec 06 23:36:14 +0000 2011","favourites_count":294,"utc_offset":-10800,"time_zone":"Atlantic Time 
(Canada)","geo_enabled":false,"verified":false,"statuses_count":31198,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/804845694\/f9c295826c35695221dc9dc3dc2a890f.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/804845694\/f9c295826c35695221dc9dc3dc2a890f.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000246647949\/abb51c598c28cfaf6adcea962a98e82e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000246647949\/abb51c598c28cfaf6adcea962a98e82e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/430266051\/1375326119","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"La_Joha27","name":". 
J\u03c3\u043d\u03b1\u03b7\u03b7\u03b1; \u2661","id":430266051,"id_str":"430266051","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246948581376,"id_str":"365611246948581376","text":"@SergioFGF @Eric23Gonzalez @JBehem @BeaSoria1 @martaruiz_98 a pagar**","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611114039492608,"in_reply_to_status_id_str":"365611114039492608","in_reply_to_user_id":500073385,"in_reply_to_user_id_str":"500073385","in_reply_to_screen_name":"SergioFGF","user":{"id":474022019,"id_str":"474022019","name":"Jordi Garc\u00eda D\u00edaz","screen_name":"jordigarciadiaz","location":"Alicante","url":null,"description":"Jugador de Lacross Babel C.F Juvenil\/\/ Yuris","protected":false,"followers_count":219,"friends_count":210,"listed_count":0,"created_at":"Wed Jan 25 15:35:46 +0000 
2012","favourites_count":170,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":2545,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000215811026\/4759ba43ad33b6b105c1413b2364c120_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000215811026\/4759ba43ad33b6b105c1413b2364c120_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/474022019\/1371469258","profile_link_color":"ED8313","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SergioFGF","name":"Sergio Fern\u00e1ndez","id":500073385,"id_str":"500073385","indices":[0,10]},{"screen_name":"Eric23Gonzalez","name":"ericgonz\u00e1lez","id":160884942,"id_str":"160884942","indices":[11,26]},{"screen_name":"JBehem","name":" Joselito16\u2605","id":396601445,"id_str":"396601445","indices":[27,34]},{"screen_name":"BeaSoria1","name":"Bea\u2665","id":364457341,"id_str":"364457341","indices":[35,45]},{"screen_name":"martaruiz_98","name":"Marta Ruiz ","id":489559407,"id_str":"489559407","indices":[46,59]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246952779776,"id_str":"365611246952779776","text":"@XY_Chanyeol kudu musti harus kagak caya 
gue...","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365469680896454656,"in_reply_to_status_id_str":"365469680896454656","in_reply_to_user_id":1539904898,"in_reply_to_user_id_str":"1539904898","in_reply_to_screen_name":"XY_Chanyeol","user":{"id":1541062489,"id_str":"1541062489","name":"sehuna","screen_name":"XY_Sehun","location":"somewhere","url":null,"description":"[V] @XY_YaoiRP Parody account of EXO's Oh Sehun | 94liner | hannie's | my son jeje | wut!?","protected":false,"followers_count":58,"friends_count":57,"listed_count":1,"created_at":"Sun Jun 23 13:58:02 +0000 2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":5947,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252511291\/0f5b1fbe768c4ef2295a33b35059d517_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252511291\/0f5b1fbe768c4ef2295a33b35059d517_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1541062489\/1375738374","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"XY_Chanyeol","name":"pcy","id":1539904898,"id_str":"1539904898","indices":[0,
12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246944399362,"id_str":"365611246944399362","text":"Sesion subida","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":158854436,"id_str":"158854436","name":"Laurs","screen_name":"LauraBlazquezT6","location":"","url":null,"description":"SOLO Y UNICAMENTE ELLAS. @SandraGancedo27 @Merymillan15 @andreagonzalo11 @abrilriveiro98","protected":false,"followers_count":382,"friends_count":445,"listed_count":0,"created_at":"Wed Jun 23 20:54:33 +0000 2010","favourites_count":1061,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":true,"verified":false,"statuses_count":15175,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/443989757\/tutu.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/443989757\/tutu.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000210751469\/3ce7fdc13006bc8f055ca5bf21698693_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000210751469\/3ce7fdc13006bc8f055ca5bf21698693_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/158854436\/1370654636","profile_link_color":"B40B43","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"cont
ributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246952783873,"id_str":"365611246952783873","text":"avec la reprise du boulot et tout l\u00e0 euh j'\u00e9tais capoute moi","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":343084556,"id_str":"343084556","name":"need caffeine.","screen_name":"twocafelatte","location":"","url":null,"description":null,"protected":false,"followers_count":7080,"friends_count":376,"listed_count":13,"created_at":"Wed Jul 27 01:30:43 +0000 2011","favourites_count":1769,"utc_offset":7200,"time_zone":"Paris","geo_enabled":true,"verified":false,"statuses_count":65116,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000012284048\/119aa467f3b17e9e3f27b69eb6d7b368.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000012284048\/119aa467f3b17e9e3f27b69eb6d7b368.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000197126277\/2e6732af95ebeb768424d0ec1a36f3aa_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000197126277\/2e6732af95ebeb768424d0ec1a36f3aa_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/343084556\/1375632052","profile_link_color":"F59DB9","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"F598F2","profile_use_background_image":true,"default_profile":false,"default_pro
file_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246969569280,"id_str":"365611246969569280","text":"RT @Anti_FCB: Periodistas que se ven con el culo al aire y piden perd\u00f3n por manipular una vez descubierta la manipulaci\u00f3n","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1111690104,"id_str":"1111690104","name":"adrian","screen_name":"adriancarriles6","location":"","url":null,"description":null,"protected":false,"followers_count":36,"friends_count":100,"listed_count":0,"created_at":"Tue Jan 22 14:33:25 +0000 
2013","favourites_count":18,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":349,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242678691\/6552b1a7e4289792a3abffd90b6e0e1b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242678691\/6552b1a7e4289792a3abffd90b6e0e1b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1111690104\/1375665458","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 20:02:52 +0000 2013","id":365563784787537921,"id_str":"365563784787537921","text":"Periodistas que se ven con el culo al aire y piden perd\u00f3n por manipular una vez descubierta la manipulaci\u00f3n","source":"\u003ca href=\"http:\/\/ubersocial.com\" rel=\"nofollow\"\u003eUberSocial\u00a9 PRO\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":112810991,"id_str":"112810991","name":"\u24b6\u24c3\u24c9\u24be\u2248\u24bb\u24b8\u24b7\u2122","screen_name":"Anti_FCB","location":"Espa\u00f1a","url":"http:\/\/www.anti-barcelona.com","description":"Web oficial antibarcelonista, para denunciar el proteccionismo que goza el FC 
Barcelona","protected":false,"followers_count":15312,"friends_count":602,"listed_count":227,"created_at":"Tue Feb 09 19:51:12 +0000 2010","favourites_count":9,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":true,"verified":false,"statuses_count":40296,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/73870710\/ab1.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/73870710\/ab1.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241175171\/6afbb50c24db26bbbe1a297c5419aeb5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241175171\/6afbb50c24db26bbbe1a297c5419aeb5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/112810991\/1356297279","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":5,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Anti_FCB","name":"\u24b6\u24c3\u24c9\u24be\u2248\u24bb\u24b8\u24b7\u2122","id":112810991,"id_str":"112810991","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982139907,"id_str":"365611246982139907","text":"RT @Quueeenn_: I'm just not gonna care about anything and have fun this year. 
\ud83d\ude1b","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":234483272,"id_str":"234483272","name":"\u03b1\u2113\u0454\u03c7\u03b9\u0455","screen_name":"alexismguerra","location":"","url":null,"description":"15. RHS. Sophomore.","protected":false,"followers_count":449,"friends_count":290,"listed_count":1,"created_at":"Wed Jan 05 19:35:03 +0000 2011","favourites_count":2002,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":8167,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000135980761\/350aafb3489553c8ff9cafce543ee239_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000135980761\/350aafb3489553c8ff9cafce543ee239_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/234483272\/1375769183","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:09:02 +0000 2013","id":365610633422573570,"id_str":"365610633422573570","text":"I'm just not gonna care about anything and have fun this year. 
\ud83d\ude1b","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":920904458,"id_str":"920904458","name":"\u2654.","screen_name":"Quueeenn_","location":"","url":null,"description":"Be the bitch they made you, and never show them you're weak. \u2715\u262f\u2715\u262f. \n\n\nhttp:\/\/quuueeennn.tumblr.com","protected":false,"followers_count":267,"friends_count":262,"listed_count":0,"created_at":"Fri Nov 02 12:14:54 +0000 2012","favourites_count":402,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":3042,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"CDA29A","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000023134056\/278402f292f85a4b0ffa7d5d61a85164.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000023134056\/278402f292f85a4b0ffa7d5d61a85164.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259016251\/a34ae8f245df3fe5a67877aa061bcfdf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259016251\/a34ae8f245df3fe5a67877aa061bcfdf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/920904458\/1375583477","profile_link_color":"F95B78","profile_sidebar_border_color":"DE8F86","profile_sidebar_fill_color":"516474","profile_text_color":"C7AFA1","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[26.39209730,-98.84037911]},"coordinates":{"type":"Point","coordinates":[-98.84037
911,26.39209730]},"place":{"id":"b57577c7e5f0a142","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/b57577c7e5f0a142.json","place_type":"city","name":"Rio Grande City","full_name":"Rio Grande City, TX","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-98.866074,26.35404],[-98.866074,26.397867],[-98.769868,26.397867],[-98.769868,26.35404]]]},"attributes":{}},"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Quueeenn_","name":"\u2654.","id":920904458,"id_str":"920904458","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246965374976,"id_str":"365611246965374976","text":"JAJAJAJAJ","source":"\u003ca href=\"http:\/\/dickson-apps.blogspot.com\/\" rel=\"nofollow\"\u003eTweetian for Symbian\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":222001667,"id_str":"222001667","name":"C A R P s2","screen_name":"Meeeeelita","location":"","url":null,"description":"Borr\u00f3n y cuenta nueva y a seguir (\u266a S.M \u2665\r\n\r\nhttp:\/\/ask.fm\/MeliDp\r\nhttps:\/\/www.facebook.com\/meeeliD","protected":false,"followers_count":135,"friends_count":148,"listed_count":0,"created_at":"Thu Dec 02 05:27:33 +0000 
2010","favourites_count":370,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":7822,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"F3E5B8","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000011377464\/ae6e73a0bbfac5d2108590551c230463.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000011377464\/ae6e73a0bbfac5d2108590551c230463.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000184542100\/874468a03b3b7c4dd6e598ca249e7aeb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000184542100\/874468a03b3b7c4dd6e598ca249e7aeb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/222001667\/1372971201","profile_link_color":"53AB8D","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"291C0D","profile_text_color":"F3E5B8","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982139906,"id_str":"365611246982139906","text":"RT @ohiyasian: \"he's 24 months old\"\n2\nYOUR CHILD IS 2","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":254933205,"id_str":"254933205","name":"steve the shearer 
","screen_name":"s0lene_fish3r","location":"earth","url":"http:\/\/Instagram.com\/solene_fisher","description":"'never let the fear of striking out keep you from playing the game' aca-scuse me? 1\/5\/13 sound of change \u2022seen beyonce live\u2022 aca-believe it","protected":false,"followers_count":1084,"friends_count":885,"listed_count":0,"created_at":"Sun Feb 20 09:18:55 +0000 2011","favourites_count":1591,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":30683,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"BADFCD","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/812149217\/42d61c0463f5d85aa080d6c5aa2923e9.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/812149217\/42d61c0463f5d85aa080d6c5aa2923e9.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000193229171\/220d0e470308ce6028077411a724ded1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000193229171\/220d0e470308ce6028077411a724ded1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/254933205\/1375567361","profile_link_color":"EB92AC","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"D90F71","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Tue Jul 23 16:07:32 +0000 2013","id":359706352915972097,"id_str":"359706352915972097","text":"\"he's 24 months old\"\n2\nYOUR CHILD IS 2","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":600956493,"id_str":"600956493","name":"\u0633\u064a\u0627\u0646","screen_name":"ohiyasian","location":"northants. UK","url":"http:\/\/chronicmess.tumblr.com\/","description":"quote the mcdonalds menu to me while we make love. \/\/ #CFC instagram: sianhenty https:\/\/www.facebook.com\/sianhentyx","protected":false,"followers_count":12336,"friends_count":4514,"listed_count":11,"created_at":"Wed Jun 06 10:55:31 +0000 2012","favourites_count":36,"utc_offset":3600,"time_zone":"London","geo_enabled":true,"verified":false,"statuses_count":26148,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/864980364\/5e54af9d8bad6fdf6067d89ca035fb0d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/864980364\/5e54af9d8bad6fdf6067d89ca035fb0d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000254887179\/d62db84094ba326b69fcff6a726f4fa4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000254887179\/d62db84094ba326b69fcff6a726f4fa4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/600956493\/1375000374","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"303018","profile_text_color":"D8A878","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2183,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"
entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ohiyasian","name":"\u0633\u064a\u0627\u0646","id":600956493,"id_str":"600956493","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246969561088,"id_str":"365611246969561088","text":"@sexyszayn pq vc er quase perfeita ue kk","source":"web","truncated":false,"in_reply_to_status_id":365610991788105728,"in_reply_to_status_id_str":"365610991788105728","in_reply_to_user_id":218081686,"in_reply_to_user_id_str":"218081686","in_reply_to_screen_name":"sexyszayn","user":{"id":1568429670,"id_str":"1568429670","name":"Victor","screen_name":"prfvtomm0","location":"Camz. Cris. Jujubs. Vic \u2764","url":null,"description":"'Don't you worry child'","protected":false,"followers_count":248,"friends_count":233,"listed_count":0,"created_at":"Thu Jul 04 15:37:17 +0000 2013","favourites_count":135,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2348,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000015875319\/a8c51439a4c222f0873fe94f4a724c81.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000015875319\/a8c51439a4c222f0873fe94f4a724c81.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000087956549\/d727f7f1046daea29b696a6c4e78e62c_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000087956549\/d727f7f1046daea29b696a6c4e78e62c_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1568429670\/1372965779","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_pr
ofile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"sexyszayn","name":"Camz","id":218081686,"id_str":"218081686","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246956986368,"id_str":"365611246956986368","text":"@fukukucchi \u3057\u305f\u3089\u30b2\u30a4\u5bdf\u547c\u3076\u305e\u3063","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610967343706112,"in_reply_to_status_id_str":"365610967343706112","in_reply_to_user_id":225771591,"in_reply_to_user_id_str":"225771591","in_reply_to_screen_name":"fukukucchi","user":{"id":250530088,"id_str":"250530088","name":"\u3084\u304f\u3082\u3093","screen_name":"Yakumo_Hisyous","location":"\u3082\u3075\u3082\u3075\u306e\u3042\u308b\u3068\u3053\u308d","url":null,"description":"\u6771\u65b9\u3001\u578b\u6708\u3001 FPS\u3001\u3082\u3075\u3082\u3075\u304c\u597d\u304d\u306a\u7d33\u58eb\uff08\uff09\u3067\u3059 \u30de\u30a4\u30f3\u30af\u30e9\u30d5\u30c8\u3082\u3084\u3063\u3066\u3044\u307e\u3059\u304cmod\u307e\u307f\u308c\u306a\u306e\u3067\u30de\u30eb\u30c1\u30d7\u30ec\u30a4\u306f\u96e3\u3057\u3044\u304b\u3082 \u7d2b\u69d8\u3068\u9b3c\u5deb\u5973\u69d8\u304c\u611b\u3089\u3057\u304f\u3066\u4ed5\u65b9\u306a\u3044 \u203b\u8da3\u5473\u306e\u5408\u308f\u306a\u3044\u5834\u5408\u30ea\u30d5\u30a9\u30ed\u30fc\u3057\u307e\u305b\u3093","protected":false,"followers_count":954,"friends_count":1141,"listed_count":40,"created_at":"Fri Feb 11 08:18:41 +0000 
2011","favourites_count":252,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":70936,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000199213624\/480efcbbfbcf98371abe03ae35a6384f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000199213624\/480efcbbfbcf98371abe03ae35a6384f_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"fukukucchi","name":"\u30d5\u30af\u30c3\u30c1@\u4e00\u65e5\u4e00\u7d75","id":225771591,"id_str":"225771591","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246961176577,"id_str":"365611246961176577","text":"@xsarahumesbabyx @Beril_belieber true! X","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365608323103473665,"in_reply_to_status_id_str":"365608323103473665","in_reply_to_user_id":252784978,"in_reply_to_user_id_str":"252784978","in_reply_to_screen_name":"xsarahumesbabyx","user":{"id":1568656614,"id_str":"1568656614","name":"FollowMeUnionJ\u2661","screen_name":"MrsUnionJ2012","location":"In Adam Pitt's Bed! 
England!","url":null,"description":"I am a jcat and proud!Fallen in love with them ever since they auditioned so i do believe in love at first sight!District3 Followed 27-7-13!\u2661 \u2764#LAWSON#GEESE\u2764","protected":false,"followers_count":112,"friends_count":269,"listed_count":3,"created_at":"Thu Jul 04 17:36:18 +0000 2013","favourites_count":200,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1986,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000087636201\/f536c52c08b2ff1a5700b30131e54da7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000087636201\/f536c52c08b2ff1a5700b30131e54da7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1568656614\/1372976256","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"xsarahumesbabyx","name":"RyansIrishGoose x","id":252784978,"id_str":"252784978","indices":[0,16]},{"screen_name":"Beril_belieber","name":"THANK YOU LAWSON!!!!","id":473075032,"id_str":"473075032","indices":[17,32]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246977957889,"id_str":"365611246977957889","text":"\u00c1lbum de fotos: kingmborges: Awwn que 
saudade .. http:\/\/t.co\/AxkNjvtfkN","source":"\u003ca href=\"http:\/\/www.tumblr.com\/\" rel=\"nofollow\"\u003eTumblr\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1393277352,"id_str":"1393277352","name":"maria","screen_name":"minhluab","location":"may goxtosa ","url":null,"description":"eu amo a lua_blanco \u2661 conta f\u00e3","protected":false,"followers_count":381,"friends_count":341,"listed_count":0,"created_at":"Tue Apr 30 23:20:42 +0000 2013","favourites_count":17,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":6469,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047215240\/da1222ecb83320cb8cf135a229d9b9cf.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047215240\/da1222ecb83320cb8cf135a229d9b9cf.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261216612\/6009d46a491b12cb67b6d194083c555b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261216612\/6009d46a491b12cb67b6d194083c555b_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1393277352\/1375981471","profile_link_color":"B0B0B0","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/AxkNjvtfkN","expanded_url":"http:\/\/tmblr.co\/ZZYE7urneNGv","display_
url":"tmblr.co\/ZZYE7urneNGv","indices":[49,71]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982148096,"id_str":"365611246982148096","text":"@Nohlwhat dacc","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611185091002368,"in_reply_to_status_id_str":"365611185091002368","in_reply_to_user_id":1346526631,"in_reply_to_user_id_str":"1346526631","in_reply_to_screen_name":"Nohlwhat","user":{"id":781342938,"id_str":"781342938","name":"Wu-tang","screen_name":"guillaumeprovo","location":"anti directioner","url":null,"description":"Sam, je taime. Yael, david, mael, eva joanna, mathilde, loic, tib, chlo\u00e9, chlo\u00e9, quentin. Le hand et tout le reste \u2665 snoop dogg","protected":false,"followers_count":367,"friends_count":161,"listed_count":0,"created_at":"Sun Aug 26 00:07:24 +0000 
2012","favourites_count":1438,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":14345,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"2E8F0E","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000027083392\/4234b0830dad42c20584747d2ed125b6.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000027083392\/4234b0830dad42c20584747d2ed125b6.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000130066530\/eabe553902d3cfc1d4375d6823793089_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000130066530\/eabe553902d3cfc1d4375d6823793089_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/781342938\/1374086505","profile_link_color":"FA0000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Nohlwhat","name":"Nohl-Wheen. 
\u2665","id":1346526631,"id_str":"1346526631","indices":[0,9]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246944403456,"id_str":"365611246944403456","text":"@BorgesDouglas7 prof \u00e9 o Bid?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610516514742272,"in_reply_to_status_id_str":"365610516514742272","in_reply_to_user_id":254917138,"in_reply_to_user_id_str":"254917138","in_reply_to_screen_name":"BorgesDouglas7","user":{"id":290812523,"id_str":"290812523","name":"princesa","screen_name":"barcelos_lolo","location":"Brasil","url":null,"description":"instagram: lorrainybarcelos wpp: 95646463","protected":false,"followers_count":225,"friends_count":120,"listed_count":0,"created_at":"Sat Apr 30 23:58:38 +0000 2011","favourites_count":893,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":4824,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/823719982\/5a938424ff18bc38c718ca9a5023ea21.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/823719982\/5a938424ff18bc38c718ca9a5023ea21.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3257646772\/6c46693eed3bb1f406148b44eea822b1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3257646772\/6c46693eed3bb1f406148b44eea822b1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/290812523\/1360937670","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_
profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"BorgesDouglas7","name":"Douglas Borges","id":254917138,"id_str":"254917138","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246973751296,"id_str":"365611246973751296","text":"@lettucexxx \u3055\u3059\u304c\u3061\u3055\u308a\u3093\u306d\u2661","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610480728932353,"in_reply_to_status_id_str":"365610480728932353","in_reply_to_user_id":566057891,"in_reply_to_user_id_str":"566057891","in_reply_to_screen_name":"lettucexxx","user":{"id":1538361318,"id_str":"1538361318","name":"RAKU","screen_name":"RAKU88123354","location":"","url":null,"description":null,"protected":false,"followers_count":30,"friends_count":36,"listed_count":0,"created_at":"Sat Jun 22 09:55:38 +0000 
2013","favourites_count":65,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":115,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000175572272\/c4b14e5ffe9f8cbcd3d2e16d433a4d89_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000175572272\/c4b14e5ffe9f8cbcd3d2e16d433a4d89_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1538361318\/1375010558","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"lettucexxx","name":"\u4e2d\u5ddd\u3002","id":566057891,"id_str":"566057891","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246961164288,"id_str":"365611246961164288","text":"Didnt win anything fuck the scratchers -_-","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":463356700,"id_str":"463356700","name":"La_Gabbyy","screen_name":"Lovelyy__Gabby","location":"","url":null,"description":"Living Life With No 
Regrets (; #Live #Love #Laugh","protected":false,"followers_count":168,"friends_count":143,"listed_count":0,"created_at":"Sat Jan 14 01:00:54 +0000 2012","favourites_count":1199,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":18660,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000201156509\/2357530d99ef4430effc59c28b759770_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000201156509\/2357530d99ef4430effc59c28b759770_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/463356700\/1374999560","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246952775682,"id_str":"365611246952775682","text":"This Starbucks right now >> #nightmade","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":157696551,"id_str":"157696551","name":"Leah O'Steen","screen_name":"leahosteen","location":"St. 
Louis, Missouri","url":"http:\/\/www.instagram.com\/leahosteen","description":"18. senior @ holt. always & forever 01.02.13","protected":false,"followers_count":337,"friends_count":235,"listed_count":1,"created_at":"Sun Jun 20 15:49:45 +0000 2010","favourites_count":1541,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":6901,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"5AC5E8","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/367458336\/bokeh.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/367458336\/bokeh.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260746655\/32b34a34d0d6e9234aa53d88cee0eda9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260746655\/32b34a34d0d6e9234aa53d88cee0eda9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/157696551\/1375849243","profile_link_color":"ED6056","profile_sidebar_border_color":"E0DEDD","profile_sidebar_fill_color":"E0DEDD","profile_text_color":"724DE3","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"nightmade","indices":[34,44]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246956974081,"id_str":"365611246956974081","text":"@chloe_bland forget the 'please', just get me skips already","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610128810057728,"in_reply_to_status_id_str":"365610128810057728","in_reply_to_user_id":620071093,"in_reply_to_user_id_str":"620071093","in_reply_to_screen_name":"MKmeganking","user":{"id":620071093,"id_str":"620071093","name":"Megan King","screen_name":"MKmeganking","location":"Derbyshire, England","url":null,"description":null,"protected":false,"followers_count":272,"friends_count":276,"listed_count":0,"created_at":"Wed Jun 27 14:58:27 +0000 2012","favourites_count":469,"utc_offset":3600,"time_zone":"Casablanca","geo_enabled":true,"verified":false,"statuses_count":3421,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/833513051\/fb8e6a74b0d2cafe523428b6a05368d0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/833513051\/fb8e6a74b0d2cafe523428b6a05368d0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000250255172\/db215930a0879c83bb152e37c9147f9e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000250255172\/db215930a0879c83bb152e37c9147f9e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/620071093\/1375737419","profile_link_color":"A10D2F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"chloe_bland","name":"*c H l 0 
e*","id":437688163,"id_str":"437688163","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246977945600,"id_str":"365611246977945600","text":"@_K_Ziemak13_ if you woulda told me an hour ago dude","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365597596888219649,"in_reply_to_status_id_str":"365597596888219649","in_reply_to_user_id":251759790,"in_reply_to_user_id_str":"251759790","in_reply_to_screen_name":"_K_Ziemak13_","user":{"id":407525356,"id_str":"407525356","name":"Erik Siller","screen_name":"Siller_440","location":"","url":null,"description":null,"protected":false,"followers_count":166,"friends_count":148,"listed_count":0,"created_at":"Tue Nov 08 05:52:48 +0000 2011","favourites_count":356,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4250,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3372497305\/c01283a6bcf7fe39e3a3cc65b8747402_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3372497305\/c01283a6bcf7fe39e3a3cc65b8747402_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions
":[{"screen_name":"_K_Ziemak13_","name":"Kyron Ziemak","id":251759790,"id_str":"251759790","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246944387072,"id_str":"365611246944387072","text":"@Raph_Joseph oh bun you then","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611116560252928,"in_reply_to_status_id_str":"365611116560252928","in_reply_to_user_id":1106653548,"in_reply_to_user_id_str":"1106653548","in_reply_to_screen_name":"Raph_Joseph","user":{"id":1641716844,"id_str":"1641716844","name":"Enduring Love ","screen_name":"laseulerene","location":"Every where you want to be.","url":null,"description":"Bad decisions, good intentions.","protected":false,"followers_count":35,"friends_count":37,"listed_count":0,"created_at":"Sat Aug 03 01:32:05 +0000 2013","favourites_count":25,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":266,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261466136\/163d237773b9eee955d5f708216699e7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261466136\/163d237773b9eee955d5f708216699e7_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1641716844\/1376002859","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"followin
g":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Raph_Joseph","name":"#10 Raph","id":1106653548,"id_str":"1106653548","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246977945601,"id_str":"365611246977945601","text":"Ugh. Whatever.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":423348933,"id_str":"423348933","name":"Coleen","screen_name":"coleen_cookies","location":"BCD, Philippines","url":"https:\/\/soundcloud.com\/hannahshae17","description":"Keep Holding On. Keep Moving Forward","protected":false,"followers_count":367,"friends_count":187,"listed_count":0,"created_at":"Mon Nov 28 11:44:35 +0000 2011","favourites_count":1486,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":20800,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/816950579\/ac10eec29c66b0c582f09c7ef59986ef.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/816950579\/ac10eec29c66b0c582f09c7ef59986ef.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000159729673\/1537e7ad0df151a109c176b98ef1e7cc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000159729673\/1537e7ad0df151a109c176b98ef1e7cc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/423348933\/1368438025","profile_link_color":"00FFEA","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"F6FFD1","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246948585472,"id_str":"365611246948585472","text":"@Tankfantry love it :3","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365560974066974720,"in_reply_to_status_id_str":"365560974066974720","in_reply_to_user_id":33752338,"in_reply_to_user_id_str":"33752338","in_reply_to_screen_name":"Tankfantry","user":{"id":80966606,"id_str":"80966606","name":"Lixxiestix","screen_name":"Scilixx","location":"Florida","url":null,"description":"Gamer. Full frontal nerdity. 
Geeky and laid back, with a mind that tends to reside in the gutter, and a heart of gold. Own 20+ working consoles.","protected":false,"followers_count":850,"friends_count":376,"listed_count":36,"created_at":"Thu Oct 08 23:01:12 +0000 2009","favourites_count":3579,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":38824,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"352726","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme5\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme5\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/344513261571643581\/200f31fbfeac2e809009ffc1bed95705_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/344513261571643581\/200f31fbfeac2e809009ffc1bed95705_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/80966606\/1348491079","profile_link_color":"D02B55","profile_sidebar_border_color":"829D5E","profile_sidebar_fill_color":"99CC33","profile_text_color":"3E4415","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Tankfantry","name":"B.K. 
LeBlanc","id":33752338,"id_str":"33752338","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246944391168,"id_str":"365611246944391168","text":"RT @Beverly78350JO: http:\/\/t.co\/Oe8EYm5XSg R\u00e9f.: F800074 #RealEstate #HautDeGamme #Beverly #BPA14 @LaurenceParisot @HECParis @Ludovic_Jamet","source":"\u003ca href=\"http:\/\/tweetadder.com\" rel=\"nofollow\"\u003eTweetAdder v4\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1595295156,"id_str":"1595295156","name":"Beverly Andorre","screen_name":"BeverlyAndorre","location":"Andorre","url":"http:\/\/pinterest.com\/BeverlyFrance","description":"The First Digital Network of Real Estate Agencies @BeverlyFrance #Luxury - Homes & Properties - Villas - Apartments - #Andorra","protected":false,"followers_count":158,"friends_count":342,"listed_count":0,"created_at":"Mon Jul 15 08:18:11 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":138,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"30271E","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000024135293\/3fa6f20b858de96e338e322c1af684ae.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000024135293\/3fa6f20b858de96e338e322c1af684ae.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000136794057\/e7689092a0f710f8392c9c24f9daa57d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000136794057\/e7689092a0f710f8392c9c24f9daa57d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1595295156\/1373876369","profile_link_color":"C39823","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 19:36:41 +0000 2013","id":365557194458005507,"id_str":"365557194458005507","text":"http:\/\/t.co\/Oe8EYm5XSg R\u00e9f.: F800074 #RealEstate #HautDeGamme #Beverly #BPA14 @LaurenceParisot @HECParis @Ludovic_Jamet","source":"\u003ca href=\"http:\/\/tweetadder.com\" rel=\"nofollow\"\u003eTweetAdder v4\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1305512959,"id_str":"1305512959","name":"Beverly JouyenJosas","screen_name":"Beverly78350JO","location":"Yvelines, France","url":"http:\/\/pinterest.com\/Beverly78350JO","description":"Jouy-en-Josas - 
Transactions - H\u00f4tels Particuliers - Propri\u00e9t\u00e9s & Demeures - Appartements Prestige - Fnaim #Immobilier @BeverlyIDF @BeverlyFrance","protected":false,"followers_count":176,"friends_count":217,"listed_count":1,"created_at":"Tue Mar 26 20:21:41 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":380,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"30271E","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/825410630\/3fa6f20b858de96e338e322c1af684ae.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/825410630\/3fa6f20b858de96e338e322c1af684ae.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3434743297\/696f41b69d15013f5d4c6fae2861dd8d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3434743297\/696f41b69d15013f5d4c6fae2861dd8d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1305512959\/1364329355","profile_link_color":"C39823","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":12,"entities":{"hashtags":[{"text":"RealEstate","indices":[37,48]},{"text":"HautDeGamme","indices":[49,61]},{"text":"Beverly","indices":[62,70]},{"text":"BPA14","indices":[71,77]}],"urls":[{"url":"http:\/\/t.co\/Oe8EYm5XSg","expanded_url":"http:\/\/pinterest.com\/Beverly78350JO","display_url":"pinterest.com\/Beverly78350JO","indices":[0,22]}],"user_mentions":[{"screen_name":"LaurenceParisot","name":"Laurence 
Parisot","id":20095109,"id_str":"20095109","indices":[78,94]},{"screen_name":"HECParis","name":"HEC Paris","id":35766159,"id_str":"35766159","indices":[95,104]},{"screen_name":"Ludovic_Jamet","name":"Ludovic Jamet","id":294179644,"id_str":"294179644","indices":[105,119]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"fr"},"retweet_count":0,"entities":{"hashtags":[{"text":"RealEstate","indices":[57,68]},{"text":"HautDeGamme","indices":[69,81]},{"text":"Beverly","indices":[82,90]},{"text":"BPA14","indices":[91,97]}],"urls":[],"user_mentions":[{"screen_name":"Beverly78350JO","name":"Beverly JouyenJosas","id":1305512959,"id_str":"1305512959","indices":[3,18]},{"screen_name":"LaurenceParisot","name":"Laurence Parisot","id":20095109,"id_str":"20095109","indices":[98,114]},{"screen_name":"HECParis","name":"HEC Paris","id":35766159,"id_str":"35766159","indices":[115,124]},{"screen_name":"Ludovic_Jamet","name":"Ludovic Jamet","id":294179644,"id_str":"294179644","indices":[125,139]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246982139904,"id_str":"365611246982139904","text":"Cual de las dos.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":551436143,"id_str":"551436143","name":"Natalie G.","screen_name":"Natalie_pink7","location":"","url":null,"description":"SG'13 graduate. College freshman '17. 
Chasing my dreams \u221e","protected":false,"followers_count":136,"friends_count":114,"listed_count":1,"created_at":"Thu Apr 12 00:31:15 +0000 2012","favourites_count":2943,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":8116,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000233803515\/c22b4982ab255e0b69c1ee8c1b72dc1d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000233803515\/c22b4982ab255e0b69c1ee8c1b72dc1d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/551436143\/1375512336","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246944403457,"id_str":"365611246944403457","text":"@white_H_Blanc_ \u308a\u3060\u3066\u3089\u3060\u306a\u3001\u30d6\u30e9\u30f3","source":"\u003ca href=\"http:\/\/sinproject.net\/tweecha\/\" rel=\"nofollow\"\u003etweecha 
sinProject\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611049216520192,"in_reply_to_status_id_str":"365611049216520192","in_reply_to_user_id":960258595,"in_reply_to_user_id_str":"960258595","in_reply_to_screen_name":"white_H_Blanc_","user":{"id":1279583214,"id_str":"1279583214","name":"\u5239\u90a3\u30fbF\u30fb\u30bb\u30a4\u30a8\u30a4","screen_name":"S_F_S_00QW","location":"GNT-0000\u30c0\u30d6\u30eb\u30aa\u30fc\u30af\u30a2\u30f3\u30bf","url":"http:\/\/twpf.jp\/S_F_S_00QW","description":"\u305d\u306e\u77ac\u9593\u306e\u8f1d\u304d\u306b\u3059\u3079\u3066\u3092\u71c3\u3084\u3057\u5c3d\u304f\u3059\u304b\u3089\u3053\u305d\u3001\u4eba\u306f\u3001\u547d\u306e\u610f\u5473\u3092\u77e5\u308b\n\n\u8a2d\u5b9a[\u5287\u5834\u7248] (\u30c4\u30a4\u30d7\u30ed\u5fc5\u8aad)","protected":false,"followers_count":286,"friends_count":206,"listed_count":0,"created_at":"Tue Mar 19 05:26:28 +0000 2013","favourites_count":230,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":9056,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"0400FF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/851419460\/08d2974710ec84fa8cec13ad9be49407.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/851419460\/08d2974710ec84fa8cec13ad9be49407.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257247700\/5a77a15fd1562db3fbc070adc20cd57d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257247700\/5a77a15fd1562db3fbc070adc20cd57d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1279583214\/1375269263","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":fal
se,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"white_H_Blanc_","name":"\u30d6\u30e9\u30f3@\u795e\u6b21\u5143","id":960258595,"id_str":"960258595","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246961176576,"id_str":"365611246961176576","text":"@MayeeAcozta aaaa eso me gusta que siempre lo seas, asi quien sabe que loco la tendra:s","source":"web","truncated":false,"in_reply_to_status_id":365606016638922756,"in_reply_to_status_id_str":"365606016638922756","in_reply_to_user_id":377541647,"in_reply_to_user_id_str":"377541647","in_reply_to_screen_name":"MayeeAcozta","user":{"id":382332684,"id_str":"382332684","name":"Elias David ","screen_name":"Rrrido_03","location":"","url":null,"description":"SILVESTRISTA\u200f\u2661 BARCELONA\u2661","protected":false,"followers_count":846,"friends_count":569,"listed_count":0,"created_at":"Thu Sep 29 21:56:48 +0000 
2011","favourites_count":285,"utc_offset":-18000,"time_zone":"Bogota","geo_enabled":true,"verified":false,"statuses_count":30587,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"050505","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000024896431\/203dc720f33bb74a82841d7a3f83bce1.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000024896431\/203dc720f33bb74a82841d7a3f83bce1.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000187044666\/5e33ded9dbd8a9885575ce073b434af2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000187044666\/5e33ded9dbd8a9885575ce073b434af2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/382332684\/1373940272","profile_link_color":"A83641","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"4255E3","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"MayeeAcozta","name":"DeclaroMiAdiccionATw","id":377541647,"id_str":"377541647","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246952792064,"id_str":"365611246952792064","text":"21.\u00bfQuien de los 5 tiene un gemelo id\u00e9ntico?: Seev. 
#MTVHottest The Wanted","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1216361826,"id_str":"1216361826","name":"we own the night ","screen_name":"theperfwanted","location":"\u00ab \u0442\u043d\u0454 \u03c9\u03b1\u0438\u0442\u0454\u2202 \u00bb","url":null,"description":"\u00ab We try and we fall and we live another day \u00bb","protected":false,"followers_count":868,"friends_count":1051,"listed_count":1,"created_at":"Sun Feb 24 19:36:20 +0000 2013","favourites_count":58,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":6232,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/819912682\/24270b39efd57603779da9b147425797.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/819912682\/24270b39efd57603779da9b147425797.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000112255026\/caa436d589abc775d958f215c533adff_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000112255026\/caa436d589abc775d958f215c533adff_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1216361826\/1371686734","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"MTVHottest","indices":[52,63]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"f
ilter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:28 +0000 2013","id":365611246956974083,"id_str":"365611246956974083","text":"Happy hour #chilis @Fa_Paredes http:\/\/t.co\/nfoIB6X9Q4","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":191302187,"id_str":"191302187","name":"Leovana Astudillo","screen_name":"leovanastudillo","location":"Guayaquil, Ecuador","url":null,"description":null,"protected":false,"followers_count":713,"friends_count":87,"listed_count":2,"created_at":"Thu Sep 16 03:06:19 +0000 2010","favourites_count":60,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":9135,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"93F5E8","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme17\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000211169911\/ec0802da486dc1562774a1a7f13db653_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000211169911\/ec0802da486dc1562774a1a7f13db653_normal.jpeg","profile_link_color":"FA872F","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"chilis","indices":[11,18]}],"urls":[],"user_mentions":[{"screen_name":"Fa_Paredes","name
":"Fabrizzio Paredes","id":277857082,"id_str":"277857082","indices":[19,30]}],"media":[{"id":365611246587895809,"id_str":"365611246587895809","indices":[31,53],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpfd7CYAEnGvk.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpfd7CYAEnGvk.jpg","url":"http:\/\/t.co\/nfoIB6X9Q4","display_url":"pic.twitter.com\/nfoIB6X9Q4","expanded_url":"http:\/\/twitter.com\/leovanastudillo\/status\/365611246956974083\/photo\/1","type":"photo","sizes":{"large":{"w":640,"h":480,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":450,"resize":"fit"},"small":{"w":340,"h":255,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251143303168,"id_str":"365611251143303168","text":"Si pudieras tener la vista que quisieras desde tu casa, \u00bfcu\u00e1l ser\u00eda? http:\/\/t.co\/fKL5aGKcT4","source":"\u003ca href=\"http:\/\/ask.fm\/\" rel=\"nofollow\"\u003eAsk.fm\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":891357008,"id_str":"891357008","name":"Pitufa \u2665","screen_name":"AylenBorobach3","location":"","url":null,"description":"Un d\u00eda sin sonrisa es un d\u00eda perdido !!","protected":false,"followers_count":90,"friends_count":83,"listed_count":0,"created_at":"Fri Oct 19 16:45:21 +0000 
2012","favourites_count":18,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":1906,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"709397","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028844632\/fe163ca66da9abd289943ea38d9275f0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028844632\/fe163ca66da9abd289943ea38d9275f0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000182752493\/2d16bef87f2048f722e8241b4c9ae866_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000182752493\/2d16bef87f2048f722e8241b4c9ae866_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/891357008\/1370702018","profile_link_color":"FF007B","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"A0C5C7","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/fKL5aGKcT4","expanded_url":"http:\/\/ask.fm\/a\/5nb0qfma","display_url":"ask.fm\/a\/5nb0qfma","indices":[69,91]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251164278784,"id_str":"365611251164278784","text":"RT @NotADictionary: Shark Week:\n\nDef: The one week you're glad you're not a fucking seal.\n\nSent: Kevin tried to eat his brother after watch\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for 
iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1259133642,"id_str":"1259133642","name":"Cinderemma Small","screen_name":"emmasmally94","location":"Northumberland","url":null,"description":"Pantomime, clarinet, singing... A woman of many talents HA!","protected":false,"followers_count":76,"friends_count":256,"listed_count":0,"created_at":"Mon Mar 11 10:45:09 +0000 2013","favourites_count":702,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":3480,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000029794336\/9e4da821322b4c2996564d8c352cb25f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000029794336\/9e4da821322b4c2996564d8c352cb25f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1259133642\/1373634082","profile_link_color":"93A644","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 14:50:42 +0000 2013","id":365122837809332224,"id_str":"365122837809332224","text":"Shark Week:\n\nDef: The one week you're glad you're not a fucking seal.\n\nSent: Kevin tried to eat his brother after watching shark week.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" 
rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1641335952,"id_str":"1641335952","name":"Not A Dictionary","screen_name":"NotADictionary","location":"*original* 8\/02\/2013","url":null,"description":"Not your average dictionary! http:\/\/NotADictionary.com coming soon. Want us to define a word? Send it to us! KiK: humordepot\/twitterfame","protected":false,"followers_count":17973,"friends_count":4,"listed_count":2,"created_at":"Fri Aug 02 21:31:25 +0000 2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":15,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000231960620\/ea02d0f75cd1ba2e222a8aa53c940ee7_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000231960620\/ea02d0f75cd1ba2e222a8aa53c940ee7_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":498,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"NotADictionary","name":"Not A 
Dictionary","id":1641335952,"id_str":"1641335952","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"none","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142885376,"id_str":"365611251142885376","text":"Me super envidia JJAJJJAJAJJAJAJAJAJ","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":909260311,"id_str":"909260311","name":"Nare\u271d","screen_name":"NareManzur","location":"","url":"http:\/\/www.facebook.com\/nare02","description":"Todo es dif\u00edcil, hasta que se logra.\r\nTodo da miedo, hasta que se conoce.\r\nTodo importa poco, hasta que se pierde.","protected":false,"followers_count":842,"friends_count":512,"listed_count":0,"created_at":"Sun Oct 28 01:12:25 +0000 2012","favourites_count":2159,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":42212,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000038168576\/e895c0c5116121c329e1bf25418eb367.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000038168576\/e895c0c5116121c329e1bf25418eb367.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257850298\/6662819c2f32ed32e1dbf07e041680c3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257850298\/6662819c2f32ed32e1dbf07e041680c3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/909260311\/1375913246","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":f
alse,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251138707456,"id_str":"365611251138707456","text":"tava no centro c o gui hj kaka","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":303950819,"id_str":"303950819","name":"suy","screen_name":"Suycapistrano","location":"sc","url":null,"description":"hi stalker","protected":false,"followers_count":486,"friends_count":162,"listed_count":1,"created_at":"Mon May 23 17:51:32 +0000 2011","favourites_count":26,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":16603,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/823400652\/9ade52d2e5868a1dbd1be4d0c686da72.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/823400652\/9ade52d2e5868a1dbd1be4d0c686da72.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000227840447\/42dfa588a34be2942b87656063b93b02_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000227840447\/42dfa588a34be2942b87656063b93b02_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/303950819\/1375391166","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"F0578F","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_re
quest_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251147087873,"id_str":"365611251147087873","text":"TWERK #JcCaylenFollowSpree","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1571341358,"id_str":"1571341358","name":"JC CAYLEN","screen_name":"Holland_Fan","location":"Town in Illinois no one knows","url":null,"description":"JC CAYLEN DOES NOT FOLLOW. 0\/6 I miss Ricardo! YOU NEWBIES DONT KNOW WHAT ITS LIKE. SUPRISE SUNDAYS !","protected":false,"followers_count":68,"friends_count":173,"listed_count":1,"created_at":"Fri Jul 05 20:57:14 +0000 
2013","favourites_count":57,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":210,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247593352\/c37c54a01692ef50d2a51b5166a703d5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247593352\/c37c54a01692ef50d2a51b5166a703d5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1571341358\/1375801830","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"JcCaylenFollowSpree","indices":[6,26]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251151273985,"id_str":"365611251151273985","text":"\u063a\u0627\u0626\u0628\u064a \u062d\u062f\u062b\u062a \u0627\u0644\u0644\u0647 \u0639\u0646\u0643\u0643 \u0623\u062e\u0628\u0631\u062a\u0647 \u061b \u0623\u0646\u0643 \u0623\u0634\u062f \u0623\u0634\u064a\u0627\u0626\u064a \u0627\u0644\u062a\u064a \u0641\u0642\u062f\u062a\u0647\u0627 \u062c\u0645\u0627\u0644\u0627\u064b \u060c \u0648\u062f\u0639\u0648\u062a\u0647 \u0623\u064a\u0636\u0627\u064b \u0623\u0646 \u064a\u0631\u062d\u0645\u0643 \u0648\u064a\u0639\u0641 \u0639\u0646\u0643\u0643 \u0648\u0623\u0646 \u064a\u062c\u0639\u0644\u0646\u064a 
\u0648\u0625\u064a\u0627\u0643 \u0633\u0648\u064a\u0627\u064b \u0641\u064a \u0627\u0644\u062c\u0646\u0629 ..","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":868711904,"id_str":"868711904","name":"M\u03b1\u043d\u03b1 Alharbi*","screen_name":"Mema_166","location":" A\u03b9 \u043c\u0251d\u026a\u0438\u0251\u043d A\u03b9 \u043c\u03c5\u0438\u03c9\u03c9r\u0251\u04ba \u2661\u2665 ","url":null,"description":"\u0622\u062a\u0645\u0646\u0649 \u0622\u0644\u062c\u064e\u0646\u0647 \u2665\u2665~","protected":false,"followers_count":345,"friends_count":360,"listed_count":0,"created_at":"Mon Oct 08 21:22:27 +0000 2012","favourites_count":547,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":3339,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000129795154\/8933534e49c94a07a28d53f010c281fc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000129795154\/8933534e49c94a07a28d53f010c281fc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/868711904\/1375507646","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":n
ull,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251176439808,"id_str":"365611251176439808","text":"El que stalkea, encuentra..","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":128400948,"id_str":"128400948","name":"Vale Jara","screen_name":"ValeJyo","location":"","url":"http:\/\/www.facebook.com\/vale.jara.50","description":"Charata-Chaco","protected":false,"followers_count":742,"friends_count":505,"listed_count":0,"created_at":"Thu Apr 01 01:06:57 +0000 2010","favourites_count":114,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":425,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000025790330\/3f8f3e52712f0cf5fff47b042f1b6a42.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000025790330\/3f8f3e52712f0cf5fff47b042f1b6a42.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000066093800\/ab390c57037aa557c1d5ccb0b844206f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000066093800\/ab390c57037aa557c1d5ccb0b844206f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/128400948\/1373396526","profile_link_color":"0099CC","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"F6FFD1","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":f
alse,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251147079680,"id_str":"365611251147079680","text":"\"Toda c\u00e9lula procede de outra c\u00e9lula. Toda c\u00e9lula nasce de uma c\u00e9lula que a antecede, que se formou a...\" http:\/\/t.co\/WEC1CMtr26","source":"\u003ca href=\"http:\/\/www.tumblr.com\/\" rel=\"nofollow\"\u003eTumblr\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":127695649,"id_str":"127695649","name":"Amanda Antoni\u00eatto","screen_name":"amandantonietto","location":"brasil","url":"http:\/\/amandaantonietto.tumblr.com","description":"Sinto vontade de te proteger,vontade de esconder vc para que nada de mal aconte\u00e7a.Guardar vc nos meus sonhos,onde eu possa ter vc sem pensar nas consequ\u00eancias.","protected":false,"followers_count":205,"friends_count":177,"listed_count":0,"created_at":"Tue Mar 30 00:21:21 +0000 
2010","favourites_count":4,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":18538,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/397057785\/y.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/397057785\/y.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3573597430\/2d84d75f3483d782070f5b02e473c71e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3573597430\/2d84d75f3483d782070f5b02e473c71e_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"E3C1EB","profile_sidebar_fill_color":"01090D","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/WEC1CMtr26","expanded_url":"http:\/\/tmblr.co\/ZF4nWxrneNbZ","display_url":"tmblr.co\/ZF4nWxrneNbZ","indices":[106,128]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172253696,"id_str":"365611251172253696","text":"This girl going ham on my ig pictures.. 
#Relax","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":243662130,"id_str":"243662130","name":"\u0493\u03b9\u044f\u03b5&\u03b9c\u03b53 \u2661 ","screen_name":"MissTrouble06","location":"Looking for Unicorns brb .","url":"http:\/\/misstrouble06.tumblr.com\/","description":"18 year's old, no expectations, no disappointments. @AshleyWenita \u2665 IG.Keek.SnapChat ; MissTrouble06 :*","protected":false,"followers_count":1210,"friends_count":1180,"listed_count":0,"created_at":"Thu Jan 27 15:01:46 +0000 2011","favourites_count":1011,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":33376,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041157093\/a97e2d766cc398318a6f64fd3ceb38c8.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041157093\/a97e2d766cc398318a6f64fd3ceb38c8.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247894894\/63f6e533a024dfe4fe32d059eaf27a9a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247894894\/63f6e533a024dfe4fe32d059eaf27a9a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/243662130\/1375321012","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"D410E6","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"Relax","indices":[40,46]
}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251163869186,"id_str":"365611251163869186","text":"http:\/\/t.co\/CWoQ7P5sEU \u0441\u043f\u0440\u0430\u0432\u043e\u0447\u043d\u0438\u043a \u0444\u0430\u0440\u043c \u043f\u0440\u0435\u0434\u043f\u0440\u0438\u044f\u0442\u0438\u0439 \u043a\u0430\u0437\u0430\u0445\u0441\u0442\u0430\u043d\u0430 http:\/\/t.co\/IeKd7eiW50 \u0441\u043a\u0430\u0447\u0430\u0442\u0438 \u043a\u043d\u0438\u0433\u0443 \u0448\u0442\u0440\u0430\u0444\u0431\u0430\u0442","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":305517937,"id_str":"305517937","name":"\u0414\u0436\u0430\u0431\u0440\u0430\u0438\u043b","screen_name":"693_passat","location":"","url":null,"description":null,"protected":false,"followers_count":2,"friends_count":34,"listed_count":0,"created_at":"Thu May 26 10:51:49 +0000 
2011","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":987,"lang":"ru","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_4_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_4_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/CWoQ7P5sEU","expanded_url":"http:\/\/uyi.acba.in.ua\/cat5\/torrent-2656.html","display_url":"uyi.acba.in.ua\/cat5\/torrent-2\u2026","indices":[0,22]},{"url":"http:\/\/t.co\/IeKd7eiW50","expanded_url":"http:\/\/uyi.acba.in.ua\/cat5\/torrent-2657.html","display_url":"uyi.acba.in.ua\/cat5\/torrent-2\u2026","indices":[62,84]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ru"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172249600,"id_str":"365611251172249600","text":"@o8tg_1754 08\u670809\u65e5\u306b\u5165\u3063\u3066\u304b\u3089\u30ea\u30d7\u30e9\u30a4\u65705\u3092\u53d7\u3051\u53d6\u308a\u307e\u3057\u305f\u3002","source":"\u003ca href=\"http:\/\/gumu-lab.com\/replychecker\/\" 
rel=\"nofollow\"\u003e\u30ea\u30d7\u30e9\u30a4\u6570\u30c1\u30a7\u30c3\u30ab\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":443709957,"in_reply_to_user_id_str":"443709957","in_reply_to_screen_name":"o8tg_1754","user":{"id":443709957,"id_str":"443709957","name":"\u3074\u304b\u3058\u3085\u30fc","screen_name":"o8tg_1754","location":"","url":null,"description":"\u30d5\u30a9\u30ed\u30fc\u3059\u308b\u3068\u304d\u306f\u4e00\u8a00\u304f\u3060\u3055\u3044\u3002\u898f\u5236\u57a2\u3010@o8tg_1754_1 \u3011\n\u81ea\u4f5c\u30a2\u30a4\u30b3\u30f3\u3060\u3088","protected":false,"followers_count":1178,"friends_count":958,"listed_count":10,"created_at":"Thu Dec 22 13:13:07 +0000 2011","favourites_count":5112,"utc_offset":32400,"time_zone":"Osaka","geo_enabled":true,"verified":false,"statuses_count":60086,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000259401669\/62983782dccba022283d36073f70aefb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000259401669\/62983782dccba022283d36073f70aefb_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/443709957\/1375793461","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"o8tg_1754","name":"\u3074\
u304b\u3058\u3085\u30fc","id":443709957,"id_str":"443709957","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172245504,"id_str":"365611251172245504","text":"\u30ca\u30a4\u30c8\u30e1\u30a2\u597d\u304d\u3059\u304e\u3066KH\u3082\u30ca\u30a4\u30c8\u30e1\u30a2\u30b9\u30c6\u30fc\u30b8\u304b\u3089\u96e2\u308c\u3089\u308c\u306a\u3044","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":488830759,"id_str":"488830759","name":"\u3055\u308f\u3059\u306f\u5e30\u7701\u4e2d","screen_name":"foommm69","location":"from Fortunehill","url":null,"description":"\u7c92\u3042\u3093\u6b7b\u306d","protected":false,"followers_count":93,"friends_count":124,"listed_count":1,"created_at":"Fri Feb 10 22:01:35 +0000 
2012","favourites_count":79,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":3226,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260141209\/e1e31dffd39566dbfc301ccec598900c_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260141209\/e1e31dffd39566dbfc301ccec598900c_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/488830759\/1375971645","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142893571,"id_str":"365611251142893571","text":"RT @AlvaroVSantos: Boa Noite Galera...Vamos la, Rumo Tim Beta Labs...Preciso de Seguidores & Retweets... SIGA QUE TE SIGO !!! 
#RT\u00b4s 30","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":297156119,"id_str":"297156119","name":"Alvaro Valis","screen_name":"alvaro_valis","location":"","url":null,"description":null,"protected":false,"followers_count":14,"friends_count":31,"listed_count":0,"created_at":"Thu May 12 01:23:56 +0000 2011","favourites_count":0,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":474,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/567073989\/rtxwovhw1uccxxe45nwu.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/567073989\/rtxwovhw1uccxxe45nwu.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2267106478\/plp0c8xhuxxi5iybeytf_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2267106478\/plp0c8xhuxxi5iybeytf_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/297156119\/1366643233","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:02:53 +0000 2013","id":365609085057507328,"id_str":"365609085057507328","text":"Boa Noite Galera...Vamos la, Rumo Tim Beta Labs...Preciso de Seguidores & Retweets... SIGA QUE TE SIGO !!! 
#RT\u00b4s 30","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":421248437,"id_str":"421248437","name":"\u00c1lvaro Tim Beta","screen_name":"AlvaroVSantos","location":"","url":null,"description":"\u2500\u2554\u2557\u2566\u2554\u2557\u2554\u2557\u2500\u2500\u2566\u2557\u2554\u2550\u2500\u2500\u2557\u2554\u2554\u2557\u2566\u2554\u2566\u2557\u2554\u2557\u2500 \u2500\u255a\u2557\u2551\u2551\u2566\u2551\u2551\u2500\u2500\u2551\u2551\u2560\u2550\u2500\u2500\u2551\u2551\u2551\u2551\u2551\u2500\u2551\u2500\u2560\u2563\u2500 \u2500\u255a\u255d\u2569\u255a\u255d\u255a\u255d\u2500\u2500\u2569\u255d\u255a\u2550\u2500\u2500\u255a\u255d\u255a\u255d\u255a\u2550\u255a\u2500\u255d\u2569\u2500","protected":false,"followers_count":166,"friends_count":172,"listed_count":1,"created_at":"Fri Nov 25 18:04:09 +0000 2011","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":839,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000231273409\/0187753fd69497abf0d32c327123e33d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000231273409\/0187753fd69497abf0d32c327123e33d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/421248437\/1375466921","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null
,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":3,"entities":{"hashtags":[{"text":"RT","indices":[111,114]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"pt"},"retweet_count":0,"entities":{"hashtags":[{"text":"RT","indices":[130,133]}],"urls":[],"user_mentions":[{"screen_name":"AlvaroVSantos","name":"\u00c1lvaro Tim Beta","id":421248437,"id_str":"421248437","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251163869184,"id_str":"365611251163869184","text":"\u306b\u3057\u3066\u3082\u30e1\u30ac\u30b7\u30f3\u30ab\u3066","source":"\u003ca href=\"http:\/\/bit.ly\/UDldit\" rel=\"nofollow\"\u003eSaezuri\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":222465477,"id_str":"222465477","name":"\u6751\u96e8","screen_name":"__Murasame__","location":"\u79cb\u7530\u9ad8\u5c02\u6a5f\u68b0\u5de5\u5b66\u79d1\u68df3F","url":null,"description":"\u9ad8\u5c02\u30ed\u30dc\u30b3\u30f3\/\u30b7\u30ed\u30af\u30de\/\u6a5f\u68b0\/ SolidWorks\/\u65cb\u76e4\/\u6b6f\u8eca\/\u30ae\u30a2\u30dc\u8a2d\u8a08\/zoids\/\u306d\u3093\u3069\u308d\u3044\u3069\/\u30dd\u30b1\u30e2\u30f3\/\u9006\u8ee2\u88c1\u5224\/ im\uff20s\/\u82b1\u6fa4\u75c5\/\u30a2\u30ea\u30b9\u30bd\u30d5\u30c8\/\u30dd\u30cb\u30c6\/\/\r\n\u9ad8\u5c02\u30ed\u30dc\u30b3\u30f3\u3084\u3063\u3066\u305f\u5c02\u653b\u79d1\u751f\u3067\u3054\u3056\u3044\u307e\u3059\u3002\u203b\u7a00\u306b\u5909\u614bPost\u3042\u308a\u307e\u3059\u3054\u6ce8\u610f\u304f\u3060\u3055\u3044","protected":false,"followers_count":607,"friends_count":485,"listed_count":73,"created_at":"Fri Dec 03 14:10:41 +0000 
2010","favourites_count":767,"utc_offset":32400,"time_zone":"Sapporo","geo_enabled":false,"verified":false,"statuses_count":35968,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/635622183\/sgjihkluzvgwfmbnue2p.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/635622183\/sgjihkluzvgwfmbnue2p.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000191468761\/beb6048c741c31b458899b58832bc0f2_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000191468761\/beb6048c741c31b458899b58832bc0f2_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/222465477\/1370513773","profile_link_color":"3C1BBF","profile_sidebar_border_color":"5ED4DC","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251155480579,"id_str":"365611251155480579","text":"How do you get some one of your mind. No matter what you do it reminds you of them .? How fair is God to make us... 
http:\/\/t.co\/t61NeWsHvA","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":564108706,"id_str":"564108706","name":"mikey uva","screen_name":"mikeyfuva","location":"","url":null,"description":null,"protected":false,"followers_count":0,"friends_count":0,"listed_count":0,"created_at":"Thu Apr 26 22:54:20 +0000 2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":606,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_0_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_0_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/t61NeWsHvA","expanded_url":"http:\/\/fb.me\/RPgbwtb9","display_url":"fb.me\/RPgbwtb9","indices":[116,138]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251151290368,"id_str":"365611251151290368","text":"RT @footlocker: Check out 
some of the new colorways of our new Nike Hyperelite socks now available in stores! #Approved http:\/\/t.co\/n51DYQb\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1177181370,"id_str":"1177181370","name":"DeAndre Hood Grady","screen_name":"dgrady14","location":"","url":null,"description":"I'm that Hood nigga DeAndre Grady if u follow me, I will follow back","protected":false,"followers_count":52,"friends_count":126,"listed_count":0,"created_at":"Thu Feb 14 00:09:15 +0000 2013","favourites_count":268,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":584,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247442394\/e4ee2e24261a13ceb45522ed97e21c83_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247442394\/e4ee2e24261a13ceb45522ed97e21c83_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1177181370\/1375798743","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:57:55 +0000 
2013","id":365607838053179392,"id_str":"365607838053179392","text":"Check out some of the new colorways of our new Nike Hyperelite socks now available in stores! #Approved http:\/\/t.co\/n51DYQbein","source":"\u003ca href=\"http:\/\/www.apple.com\" rel=\"nofollow\"\u003eCamera on iOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":22030851,"id_str":"22030851","name":"Foot Locker","screen_name":"footlocker","location":"USA","url":"http:\/\/www.footlockerunlocked.com","description":"The source for info on the hottest kicks at Foot Locker and House of Hoops locations nationwide. Sneaker lovers welcome.","protected":false,"followers_count":354318,"friends_count":3375,"listed_count":1213,"created_at":"Thu Feb 26 16:51:45 +0000 2009","favourites_count":219,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":true,"statuses_count":27719,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045807443\/2142a98d01e22e85c9b9aeb9f5341406.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045807443\/2142a98d01e22e85c9b9aeb9f5341406.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2511185889\/1285w6dqyb2xixanxp9u_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2511185889\/1285w6dqyb2xixanxp9u_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/22030851\/1375795221","profile_link_color":"C70202","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"D90F2E","profile_text_color":"080707","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":nu
ll,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":45,"entities":{"hashtags":[{"text":"Approved","indices":[94,103]}],"urls":[],"user_mentions":[],"media":[{"id":365607837830901760,"id_str":"365607837830901760","indices":[104,126],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLmZDUCcAAuTl5.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLmZDUCcAAuTl5.jpg","url":"http:\/\/t.co\/n51DYQbein","display_url":"pic.twitter.com\/n51DYQbein","expanded_url":"http:\/\/twitter.com\/footlocker\/status\/365607838053179392\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":960,"h":960,"resize":"fit"},"medium":{"w":600,"h":600,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"Approved","indices":[110,119]}],"urls":[],"user_mentions":[{"screen_name":"footlocker","name":"Foot Locker","id":22030851,"id_str":"22030851","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251151286273,"id_str":"365611251151286273","text":"RT @ShakespeareSong: Halt! The time of hammer is upon us.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":262929551,"id_str":"262929551","name":"Andre Edward Johnson","screen_name":"SnapbackFxcker","location":"Cybertron","url":"http:\/\/www.XpensiveSociety.com","description":"Zero is gone for now. 
But for the sins of my father call me Punished 'Dre.","protected":false,"followers_count":205,"friends_count":178,"listed_count":0,"created_at":"Wed Mar 09 01:34:16 +0000 2011","favourites_count":433,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":6978,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/854915096\/cd7f0d86f6d9adb2580735b3a71b52a2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/854915096\/cd7f0d86f6d9adb2580735b3a71b52a2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000255083107\/f3ccbf8a05c49c1ab748884819c72784_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000255083107\/f3ccbf8a05c49c1ab748884819c72784_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/262929551\/1375113620","profile_link_color":"B33900","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Fri Jul 26 02:05:42 +0000 2013","id":360581664071622659,"id_str":"360581664071622659","text":"Halt! 
The time of hammer is upon us.","source":"\u003ca href=\"http:\/\/www.hootsuite.com\" rel=\"nofollow\"\u003eHootSuite\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":581033983,"id_str":"581033983","name":"Shakespeare Lyrics","screen_name":"ShakespeareSong","location":"Rack Village, wench","url":null,"description":"Songs\/movie quotes\/phrases in archaic language. Can you guess them? | Advertising\/Promo: shakespearesong@yahoo.com","protected":false,"followers_count":398282,"friends_count":79,"listed_count":613,"created_at":"Tue May 15 16:22:42 +0000 2012","favourites_count":14,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":825,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/552731753\/twitterbackground.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/552731753\/twitterbackground.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3741935615\/92a6820c83a3e70f582ee4365ea56467_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3741935615\/92a6820c83a3e70f582ee4365ea56467_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2767,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[
{"screen_name":"ShakespeareSong","name":"Shakespeare Lyrics","id":581033983,"id_str":"581033983","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251151273984,"id_str":"365611251151273984","text":"Check This Out --->: Modern Warfare 3 -... http:\/\/t.co\/wI1GDzuPmb","source":"\u003ca href=\"http:\/\/winthecustomer.com\/\" rel=\"nofollow\"\u003eWin the Customer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":422851144,"id_str":"422851144","name":"james dale law","screen_name":"JDLWellabove","location":"united kingdom","url":"http:\/\/symptomsofinfection.com","description":"When A Goal Matters Enough To A Person That Person Will Find A Way To Accomplish What At First Seemed Impossible ..Nido Quebin..","protected":false,"followers_count":57,"friends_count":272,"listed_count":0,"created_at":"Sun Nov 27 19:29:07 +0000 
2011","favourites_count":0,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":728,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0099B9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme4\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme4\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000237082983\/64e7884cd9fd2185a30a0ca89c5c4ac3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000237082983\/64e7884cd9fd2185a30a0ca89c5c4ac3_normal.jpeg","profile_link_color":"0099B9","profile_sidebar_border_color":"5ED4DC","profile_sidebar_fill_color":"95E8EC","profile_text_color":"3C3940","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/wI1GDzuPmb","expanded_url":"http:\/\/symptomsofinfection.com\/2012\/07\/19\/modern_warfare_3_-_glitches__tricks_part_1_terminal\/","display_url":"symptomsofinfection.com\/2012\/07\/19\/mod\u2026","indices":[46,68]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142885378,"id_str":"365611251142885378","text":"RT @Retwet55: \u2666 \u0644\u0645\u0640\u0640\u062f\u0629 ((60 \u062f\u0642\u064a\u0642\u0640\u0640\u0640\u0640\u0640\u0629)) \u2666\u2714\n(\u2776) \u0631\u064a\u062a\u0648\u064a\u0640\u062a \u0647\u0640\u0630\u0647 \u0627\u0644\u062a\u063a\u0631\u064a\u0640\u0640\u062f\u0629 \u2666\u2714\n(\u2777) \u0641\u0640\u0640\u0648\u0644\u0640\u0640\u0648\u0645\u0640\u0640\u064a\u2714 
\u0641\u0640\u0648\u0644\u0640\u0648\u0628\u0640\u0640\u0640\u0627\u0643 \u2666\u2714\n(\u2778) \u0636\u064a\u0641\u0640\u0640\u0640\u0648\u0627 \u0643\u0640\u0640\u0644 \u0645\u0640\u0640\u0640\u0646 \u064a\u0636\u064a\u0641\u0643\u0640\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1029700628,"id_str":"1029700628","name":"\u0645\u0628\u0627\u0631\u0643 \u0627\u0644\u0639\u0648\u064a\u0635","screen_name":"mbarkalawais","location":"","url":null,"description":"\u0645\u0640\u0640\u0646 \u062a\u063a\u064a\u0640\u0640\u0631\u064a\u062c\u064a\u0640\u0640\u0628 \u0627\u0644\u0644\u0647 \u0627\u0644\u0640\u0640\u0641 \u063a\u064a\u0640\u0640\u0631\u0647 \u0648\u0645\u0640\u0646 \u0646\u0633\u0640\u0649 \u0645\u0627\u0646\u064a \u0628\u062d\u0627\u062c\u062a\u0647 \u0644\u0627\u0630\u0643\u0631 \u064a\u0645\u0643\u0640\u0646 \u0627\u0646\u0647 \u0641\u0640\u064a \u063a\u064a\u0627\u0628\u0647 \u0627\u0644\u0640\u0640\u0641 \u062e\u064a\u0640\u0640\u0631\u0647 \u0648\u0627\u0644\u0639\u0645\u0631 \u0645\u0627\u064a\u0648\u0642\u0640\u0641 \u0639\u0644\u0649 \u063a\u064a\u0628\u0629 \u0628\u0634\u0640\u0631 \u201d","protected":false,"followers_count":5953,"friends_count":78,"listed_count":0,"created_at":"Sun Dec 23 02:36:29 +0000 
2012","favourites_count":421,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":29081,"lang":"ar","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/344513261565602365\/3a044a1a7533a5793a362fc7f4594551_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/344513261565602365\/3a044a1a7533a5793a362fc7f4594551_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1029700628\/1369490008","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:59:48 +0000 2013","id":365608309333557249,"id_str":"365608309333557249","text":"\u2666 \u0644\u0645\u0640\u0640\u062f\u0629 ((60 \u062f\u0642\u064a\u0642\u0640\u0640\u0640\u0640\u0640\u0629)) \u2666\u2714\n(\u2776) \u0631\u064a\u062a\u0648\u064a\u0640\u062a \u0647\u0640\u0630\u0647 \u0627\u0644\u062a\u063a\u0631\u064a\u0640\u0640\u062f\u0629 \u2666\u2714\n(\u2777) \u0641\u0640\u0640\u0648\u0644\u0640\u0640\u0648\u0645\u0640\u0640\u064a\u2714 \u0641\u0640\u0648\u0644\u0640\u0648\u0628\u0640\u0640\u0640\u0627\u0643 \u2666\u2714\n(\u2778) \u0636\u064a\u0641\u0640\u0640\u0640\u0648\u0627 \u0643\u0640\u0640\u0644 \u0645\u0640\u0640\u0640\u0646 \u064a\u0636\u064a\u0641\u0643\u0640\u0640\u0645 \u2666\u2714\n(108)","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter 
for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":701866404,"id_str":"701866404","name":"\u0632\u064a\u0627\u062f\u0629 \u0645\u062a\u0627\u0628\u0639\u064a\u0646 \u0627\u0644\u0639\u0631\u0628","screen_name":"Retwet55","location":"\u0627\u0644\u0648\u0637\u0646 \u0627\u0644\u0639\u0631\u0628\u064a","url":null,"description":"\u0644\u0632\u064a\u0627\u062f\u0629 \u0645\u062a\u0627\u0628\u0639\u064a\u0646\u0643\u0645 \u0636\u064a\u0641\u0648\u0627 \u0645\u0646 \u064a\u0642\u0648\u0645 \u0628\u0627\u0644\u0631\u064a\u062a\u0648\u064a\u062a \u0644\u064a\u0636\u064a\u0641\u0648\u0646\u0643\u0645","protected":false,"followers_count":196581,"friends_count":136955,"listed_count":553,"created_at":"Tue Jul 17 22:10:23 +0000 2012","favourites_count":163,"utc_offset":10800,"time_zone":"Riyadh","geo_enabled":false,"verified":false,"statuses_count":34072,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3136722907\/2796e713e3bfd5606e66ef967013fe9b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3136722907\/2796e713e3bfd5606e66ef967013fe9b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/701866404\/1354599549","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contrib
utors":null,"retweet_count":62,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"ar"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Retwet55","name":"\u0632\u064a\u0627\u062f\u0629 \u0645\u062a\u0627\u0628\u0639\u064a\u0646 \u0627\u0644\u0639\u0631\u0628","id":701866404,"id_str":"701866404","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ar"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251138695168,"id_str":"365611251138695168","text":"RT @TipsForYouDaily: A shaving cut will immediately stop bleeding if you put Chapstick on it","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":166650650,"id_str":"166650650","name":"Rawlito(:","screen_name":"Raul_Perez23","location":"The Moon","url":null,"description":"Scott Mescudi","protected":false,"followers_count":694,"friends_count":607,"listed_count":7,"created_at":"Wed Jul 14 17:45:18 +0000 2010","favourites_count":11998,"utc_offset":-21600,"time_zone":"Mountain Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":66263,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme9\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000198979767\/cf2cc2c391a36f2861301c70da48dbd9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000198979767\/cf2cc2c391a36f2861301c70da48dbd9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/166650650\/1375778710","profile_link_color":"2FC2EF","profile_sidebar_border_color":"181A1E","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:05:06 +0000 2013","id":365609644493783041,"id_str":"365609644493783041","text":"A shaving cut will immediately stop bleeding if you put Chapstick on it","source":"\u003ca href=\"http:\/\/twuffer.com\" rel=\"nofollow\"\u003eTwuffer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1308829592,"id_str":"1308829592","name":"Life Hack TIps","screen_name":"TipsForYouDaily","location":"Instagram - Tips4YouDaily","url":null,"description":"Daily hacks and tips to optimize your life! 
Free food, save $$$, cut in line, hack vending machines, live life to the fullest.","protected":false,"followers_count":472505,"friends_count":2,"listed_count":265,"created_at":"Wed Mar 27 22:53:41 +0000 2013","favourites_count":91,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":442,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000011584222\/581827aef1d4da0d3fc5b0917b513d85.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000011584222\/581827aef1d4da0d3fc5b0917b513d85.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000182904457\/cca9c116c58c96b217c198fe53c2951e_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000182904457\/cca9c116c58c96b217c198fe53c2951e_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1308829592\/1372565281","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"C0DFEC","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":164,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TipsForYouDaily","name":"Life Hack TIps","id":1308829592,"id_str":"1308829592","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142893569,"id_str":"365611251142893569","text":"RT @justinbieber: 
#tampa","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":887775234,"id_str":"887775234","name":"babi","screen_name":"jbdhugs","location":"","url":null,"description":"\u25b3 i not your baby\u25b3","protected":false,"followers_count":465,"friends_count":468,"listed_count":0,"created_at":"Thu Oct 18 00:08:31 +0000 2012","favourites_count":840,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":7150,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFF04D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000030318154\/68a306ee3ba03ed6e543e0c338392be8.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000030318154\/68a306ee3ba03ed6e543e0c338392be8.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262068243\/37c45b4ad80697126f1eaaf46eb5cd53_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262068243\/37c45b4ad80697126f1eaaf46eb5cd53_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/887775234\/1376002969","profile_link_color":"0099CC","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:36:45 +0000 2013","id":365602507680002048,"id_str":"365602507680002048","text":"#tampa","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":27260086,"id_str":"27260086","name":"Justin Bieber","screen_name":"justinbieber","location":"All Around The World","url":"http:\/\/www.youtube.com\/justinbieber","description":"#BELIEVE is on ITUNES and in STORES WORLDWIDE! - SO MUCH LOVE FOR THE FANS...you are always there for me and I will always be there for you. MUCH LOVE. thanks","protected":false,"followers_count":42796867,"friends_count":121902,"listed_count":552471,"created_at":"Sat Mar 28 16:41:22 +0000 2009","favourites_count":12,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":true,"statuses_count":23091,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/885769807\/043faf7949366ef2486c28a74311aa5d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/885769807\/043faf7949366ef2486c28a74311aa5d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3467035972\/4c978ba8510da3fb77d2d5e9ae7c93f0_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3467035972\/4c978ba8510da3fb77d2d5e9ae7c93f0_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/27260086\/1355357428","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":18417,"entities":{"hashtags":[{"text":"tampa","indices":[0,6]}],"urls":[],"
user_mentions":[]},"favorited":false,"retweeted":false,"lang":"id"},"retweet_count":0,"entities":{"hashtags":[{"text":"tampa","indices":[18,24]}],"urls":[],"user_mentions":[{"screen_name":"justinbieber","name":"Justin Bieber","id":27260086,"id_str":"27260086","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251168059394,"id_str":"365611251168059394","text":"RT @lsilenciosa: Reir con quien se ama\nes alcanzar orgasmos de ternura\nbajo la imagen sonriente de la luna","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":541370431,"id_str":"541370431","name":"EL APRENDIZ","screen_name":"monarcamanni","location":"Mexico.","url":"http:\/\/es.favstar.fm\/users\/monarcamanni","description":"Ejerciendo el oficio de cambiar en letras el sentido de la vida. 
En busca de la raz\u00f3n, encontr\u00e9 tu coraz\u00f3n.","protected":false,"followers_count":1510,"friends_count":142,"listed_count":80,"created_at":"Sat Mar 31 04:55:06 +0000 2012","favourites_count":24332,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":7373,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3533843582\/17be053e0d6502ee191ef2eecb34a18c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3533843582\/17be053e0d6502ee191ef2eecb34a18c_normal.jpeg","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:45:34 +0000 2013","id":365604726038994944,"id_str":"365604726038994944","text":"Reir con quien se ama\nes alcanzar orgasmos de ternura\nbajo la imagen sonriente de la luna","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":594307075,"id_str":"594307075","name":"Sonja","screen_name":"lsilenciosa","location":"Argentina - Buenos Aires","url":null,"description":"____Escribir es aullar sin ruido ____M.Duras","protected":false,"followers_count":1790,"friends_count":1640,"listed_count":44,"created_at":"Wed May 30 04:46:53 +0000 
2012","favourites_count":18449,"utc_offset":-10800,"time_zone":"Buenos Aires","geo_enabled":false,"verified":false,"statuses_count":22504,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047302867\/857b9bb7b259d7501532168782f8710a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047302867\/857b9bb7b259d7501532168782f8710a.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/2292873655\/onjogp90cidj1bt4xima_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/2292873655\/onjogp90cidj1bt4xima_normal.jpeg","profile_link_color":"C23019","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"lsilenciosa","name":"Sonja","id":594307075,"id_str":"594307075","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142893570,"id_str":"365611251142893570","text":"RT @JulieLauren143: You can't force yourself to like someone.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":772546940,"id_str":"772546940","name":"\u0126\u03b1\u2113\u212f\u01b4 \u20a1\u2134\u03c5\u044f\u0442\u03b7\u212f\u01b4\u2765","screen_name":"Haleyycourtney","location":"","url":null,"description":"Nobody cares hoe","protected":false,"followers_count":144,"friends_count":250,"listed_count":0,"created_at":"Wed Aug 22 00:23:34 +0000 2012","favourites_count":264,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":1062,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000199278862\/a355e8b0abdfaf925e815829165bf329_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000199278862\/a355e8b0abdfaf925e815829165bf329_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/772546940\/1375411011","profile_link_color":"FF0000","profile_sidebar_border_color":"65B0DA","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 20:31:31 +0000 2013","id":365570993743339521,"id_str":"365570993743339521","text":"You can't force yourself to like 
someone.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":419103843,"id_str":"419103843","name":"Julie Lauren","screen_name":"JulieLauren143","location":"Motown","url":null,"description":"Be the one to guide me but never hold me down.\u2693\u2728","protected":false,"followers_count":279,"friends_count":275,"listed_count":0,"created_at":"Tue Nov 22 23:37:36 +0000 2011","favourites_count":835,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":true,"verified":false,"statuses_count":4046,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"E332D7","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040728916\/762690371eaf6056ad281cbb2eddfb7e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040728916\/762690371eaf6056ad281cbb2eddfb7e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000134397864\/03c0dbf36826efb46581e56b7c728206_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000134397864\/03c0dbf36826efb46581e56b7c728206_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/419103843\/1372814420","profile_link_color":"25DB92","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E9C1F5","profile_text_color":"ED11C8","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"JulieLauren143","name":"
Julie Lauren","id":419103843,"id_str":"419103843","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251163873280,"id_str":"365611251163873280","text":"I almost just fought a little girl in Walmart. I am not in the mood.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":70544521,"id_str":"70544521","name":"-A.","screen_name":"reALLIEty_","location":"Home Sweet Home, for now.","url":"http:\/\/oolalaALLIE.tumblr.com","description":"I twerk like Miley.","protected":false,"followers_count":440,"friends_count":328,"listed_count":5,"created_at":"Tue Sep 01 00:16:49 +0000 2009","favourites_count":181,"utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":40866,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/729971714\/cdbe0eeeaa8d63181d44bbc833bebf21.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/729971714\/cdbe0eeeaa8d63181d44bbc833bebf21.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000245962392\/a6838b6c5ebae853e790040d03c5051f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000245962392\/a6838b6c5ebae853e790040d03c5051f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/70544521\/1375716746","profile_link_color":"A11097","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"456ED6","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251163860992,"id_str":"365611251163860992","text":"\u51fa\u304b\u3051\u308b\u307e\u3067\u30af\u30fc\u30e9\u30fc\u3064\u3051\u3088\u3063\u304b\u306a\u3042\u3063\u3064\u3044","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1558526102,"id_str":"1558526102","name":"\u90a3\u667a","screen_name":"yknrnrain","location":"","url":null,"description":"\u8150\u5973\u5b50\u306e\u5927\u5b66\u751f\u3002\u9ed2\u30d0\u30b9\u9032\u6483\u4e03\u970aFree\u30ec\u30a4\u30f3\u3002\u7b20\u677e\u5148\u8f29\u8679\u6751\u4e3b\u5c06\u30a8\u30eb\u30f4\u30a3\u30f3\u3055\u3093\u30ca\u30ca\u30d0\u3055\u3093\u30ab\u30df\u30e5\u307e\u3053\u3061\u3083\u3093\u51db\u3061\u3083\u3093\u597d\u304d\u3002\u30d5\u30ea\u30fc\u30a2\u30a4\u30b3\u30f3\u304a\u501f\u308a\u3057\u3066\u307e\u3059(http:\/\/www.pixiv.net\/member_illust.php?mode=medium&illust_id=36144463)","protected":false,"followers_count":35,"friends_count":60,"listed_count":3,"created_at":"Sun Jun 30 17:22:28 +0000 2013","favourites_count":718,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4697,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260392772\/fa874eb27aedcbf225d7566d2e57d995_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260392772\/fa874eb27aedcbf225d7566d2e57d995_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"en
tities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251147087874,"id_str":"365611251147087874","text":"@_tainalima7 torio, so isso","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610777840844802,"in_reply_to_status_id_str":"365610777840844802","in_reply_to_user_id":346659305,"in_reply_to_user_id_str":"346659305","in_reply_to_screen_name":"_tainalima7","user":{"id":64378692,"id_str":"64378692","name":"Pedro Cafure","screen_name":"PedroCafure","location":"","url":null,"description":"DeMolay | eu te amo muito meu amor \/ @jusbaraini_","protected":false,"followers_count":504,"friends_count":292,"listed_count":2,"created_at":"Mon Aug 10 11:02:27 +0000 2009","favourites_count":4,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":7334,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3474544796\/f2350e3bb3a90ffae400951995c63fbf_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3474544796\/f2350e3bb3a90ffae400951995c63fbf_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/64378692\/1366807614","profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":nu
ll,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"_tainalima7","name":"Tain\u00e3 Lima","id":346659305,"id_str":"346659305","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251163869185,"id_str":"365611251163869185","text":"@Iiamneeson I need to know if you won that fight against the alpha wolf in The Grey Liam! Please just tell me :( I don't think I can wait!!!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":317264802,"in_reply_to_user_id_str":"317264802","in_reply_to_screen_name":"Iiamneeson","user":{"id":1638577386,"id_str":"1638577386","name":"Josh","screen_name":"OhSwiftyy","location":"","url":null,"description":"Chelsea Fan and I Follow Back :)","protected":false,"followers_count":55,"friends_count":29,"listed_count":1,"created_at":"Thu Aug 01 18:47:48 +0000 
2013","favourites_count":2,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":217,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000254668486\/01e664a947ffb6159066c4fc03ea9af5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000254668486\/01e664a947ffb6159066c4fc03ea9af5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1638577386\/1375445860","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Iiamneeson","name":"Liam Neeson","id":317264802,"id_str":"317264802","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142893568,"id_str":"365611251142893568","text":"RT @kevonlooknanan: ALL CXC STUDENTS RESULTS ARE OUT.","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":355891135,"id_str":"355891135","name":"SaRah.. 
\u2665","screen_name":"Sarah_Eliiiza","location":"Trinidad & Tobago","url":"http:\/\/Nothing-short-of-amaziing.tumblr.com","description":"Keeping my dreams BIG & my doubts small \u2661 ; #TeamBMRS","protected":false,"followers_count":364,"friends_count":362,"listed_count":1,"created_at":"Tue Aug 16 01:37:35 +0000 2011","favourites_count":717,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":false,"verified":false,"statuses_count":15905,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"35CFAD","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/638501372\/ff0084_zebra.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/638501372\/ff0084_zebra.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000065626513\/7dacf3c8a79f51267c6b99dd5f311340_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000065626513\/7dacf3c8a79f51267c6b99dd5f311340_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/355891135\/1375417226","profile_link_color":"35CFAD","profile_sidebar_border_color":"FFAAAA","profile_sidebar_fill_color":"FFAAAA","profile_text_color":"FF5554","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:01:59 +0000 2013","id":365608860737748992,"id_str":"365608860737748992","text":"ALL CXC STUDENTS RESULTS ARE OUT.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":583130932,"id_str":"583130932","name":"IG: \u04c3evon_\u04c3!dd\u2122","screen_name":"kevonlooknanan","location":"Trinidad and 
Tobago","url":"http:\/\/kevon-kid.tumblr.com\/","description":"Madrid & Arsenal Supporter | Lace Up | #EST19XX | \r #GoHardOrGoHome | \r Whatsapp - 785 - 4349","protected":false,"followers_count":450,"friends_count":418,"listed_count":1,"created_at":"Thu May 17 20:51:19 +0000 2012","favourites_count":4725,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"verified":false,"statuses_count":18285,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"760DAB","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/875102517\/09c82fe2933cd0b738a8e3310a3ae8ce.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/875102517\/09c82fe2933cd0b738a8e3310a3ae8ce.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000167303869\/213d18e929cd1e19b674988aff976fbe_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000167303869\/213d18e929cd1e19b674988aff976fbe_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/583130932\/1369679158","profile_link_color":"640DDE","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"A55FEB","profile_text_color":"660EA8","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"kevonlooknanan","name":"IG: \u04c3evon_\u04c3!dd\u2122","id":583130932,"id_str":"583130932","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 
2013","id":365611251147079681,"id_str":"365611251147079681","text":"n\u00e3o \u00e9 bordinha Itallo, \u00e9 arma\u00e7\u00e3o, anta","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":194831704,"id_str":"194831704","name":"Lara Oliveira","screen_name":"Lara_bs","location":"Minas Gerais","url":null,"description":"Sei l\u00e1. Sis da @Gabrielirg","protected":false,"followers_count":592,"friends_count":258,"listed_count":4,"created_at":"Sat Sep 25 02:47:17 +0000 2010","favourites_count":137,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":25939,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F01D39","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000022553739\/4e5be9d2d9722ecb14908b1f767d5e1f.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000022553739\/4e5be9d2d9722ecb14908b1f767d5e1f.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000202381779\/ec36fd7896e77f075f9b77cc47178480_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000202381779\/ec36fd7896e77f075f9b77cc47178480_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/194831704\/1373407991","profile_link_color":"6B0EF5","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium",
"lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251159666688,"id_str":"365611251159666688","text":"RT @DeyaniiraCM: Quiz\u00e1 no hoy, ni ma\u00f1ana, pero alg\u00fan d\u00eda estaremos juntos. #tuiteocomonieves","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":174531107,"id_str":"174531107","name":"Naye Dalid","screen_name":"nayedalid","location":"","url":null,"description":"Me gusta sonre\u00edr :)\nSoy afortunada de tener amigos que siempre est\u00e1n para apoyarme :3","protected":false,"followers_count":146,"friends_count":137,"listed_count":1,"created_at":"Wed Aug 04 04:20:13 +0000 2010","favourites_count":2800,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":9959,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/621117861\/8yfd7ila01390h3igkkj.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/621117861\/8yfd7ila01390h3igkkj.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000179349723\/3996126f506f2861d5a96d6e9fdb0b3c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000179349723\/3996126f506f2861d5a96d6e9fdb0b3c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/174531107\/1348072232","profile_link_color":"B40B43","profile_sidebar_border_color":"CC3366","profile_sidebar_fill_color":"E5507E","profile_text_color":"362720","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow
_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:40:26 +0000 2013","id":365603434809921537,"id_str":"365603434809921537","text":"Quiz\u00e1 no hoy, ni ma\u00f1ana, pero alg\u00fan d\u00eda estaremos juntos. #tuiteocomonieves","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":143661654,"id_str":"143661654","name":"Deya\u2661 \u221e","screen_name":"DeyaniiraCM","location":"Veracruz","url":null,"description":"Follow me;*","protected":false,"followers_count":130,"friends_count":214,"listed_count":1,"created_at":"Fri May 14 02:28:16 +0000 2010","favourites_count":156,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":12694,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/274970463\/216465_1983444384036_1181024402_2402874_6244280_nL.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/274970463\/216465_1983444384036_1181024402_2402874_6244280_nL.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000177144855\/e719c9fdc00506cb1020d6f9c69064b6_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000177144855\/e719c9fdc00506cb1020d6f9c69064b6_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/143661654\/1373331337","profile_link_color":"919191","profile_sidebar_border_color":"F05B8F","profile_sidebar_fill_color":"000000","profile_text_color":"F2669B","profile_use_background_imag
e":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[{"text":"tuiteocomonieves","indices":[58,75]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[{"text":"tuiteocomonieves","indices":[75,92]}],"urls":[],"user_mentions":[{"screen_name":"DeyaniiraCM","name":"Deya\u2661 \u221e","id":143661654,"id_str":"143661654","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142893572,"id_str":"365611251142893572","text":"RT @horoscoponegro: #ESCORPIO: NUNCA olvidas una ofensa, y lo sabes, te vengas de los traidores y puedes pasar toda tu vida esperando sabor\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1270957165,"id_str":"1270957165","name":"Ivan Fernandez","screen_name":"Ivan_101190","location":"","url":null,"description":null,"protected":false,"followers_count":68,"friends_count":77,"listed_count":0,"created_at":"Fri Mar 15 23:24:32 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":138,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3384926627\/de26808df6015ac0006790a9a62702a3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3384926627\/de26808df6015ac0006790a9a62702a3_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 21:55:05 +0000 2013","id":365592023559979010,"id_str":"365592023559979010","text":"#ESCORPIO: NUNCA olvidas una ofensa, y lo sabes, te vengas de los traidores y puedes pasar toda tu vida esperando saborear ese momento...","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":781988934,"id_str":"781988934","name":"\u25b2Hor\u00f3scopo Negro\u25b2","screen_name":"horoscoponegro","location":"","url":"http:\/\/www.facebook.com\/HoroscopoNegro","description":"Te mostramos el lado oscuro de tu signo del Zodiaco. Sois perversos con almas negras. 
Tauro, vender\u00edas a tu abuela por un Ralph Lauren y lo sabes...\u25b2","protected":false,"followers_count":527986,"friends_count":12,"listed_count":445,"created_at":"Sun Aug 26 08:14:16 +0000 2012","favourites_count":13443,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":3177,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/741616861\/85ca49e6b1d7ace92165e5066963f123.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/741616861\/85ca49e6b1d7ace92165e5066963f123.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3393274687\/7d1bb3a9e630eaaa298c76b6f5a564b5_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3393274687\/7d1bb3a9e630eaaa298c76b6f5a564b5_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/781988934\/1363548812","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"252429","profile_text_color":"666666","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2302,"entities":{"hashtags":[{"text":"ESCORPIO","indices":[0,9]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[{"text":"ESCORPIO","indices":[20,29]}],"urls":[],"user_mentions":[{"screen_name":"horoscoponegro","name":"\u25b2Hor\u00f3scopo Negro\u25b2","id":781988934,"id_str":"781988934","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 
2013","id":365611251142897664,"id_str":"365611251142897664","text":"Now my red ass went to this eoc summer school the WHOLE july n I still got 2 math classes ..! Wtf<<<<<<<<","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1362776748,"id_str":"1362776748","name":"DaZsa Bitch!\u2665","screen_name":"Redddddd_23","location":"","url":null,"description":"Oooo Kill Emmm Oooo Kill Emmmm Ooooooooo , Bofl Bestfranddddd heaaaaa @___Jambo , Follow me \u2764","protected":false,"followers_count":203,"friends_count":210,"listed_count":0,"created_at":"Thu Apr 18 20:03:56 +0000 2013","favourites_count":1775,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"verified":false,"statuses_count":10847,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046113688\/74e979f0970842e0e7aa9fdab05245d3.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046113688\/74e979f0970842e0e7aa9fdab05245d3.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000206288493\/5edd82a1dcf341a106fb6079967ff061_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000206288493\/5edd82a1dcf341a106fb6079967ff061_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1362776748\/1375483242","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entitie
s":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142897665,"id_str":"365611251142897665","text":"\u2605**X-girl*\u30d6\u30e9\u30f3\u30c9\u30e0\u30c3\u30af\u672c*****************************************\u2026 http:\/\/t.co\/qb2xEOSNVX #r_blog","source":"\u003ca href=\"http:\/\/www.rakuten.co.jp\/\" rel=\"nofollow\"\u003eRakuten,Inc.\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":159065347,"id_str":"159065347","name":"\u6797\u6a8e1222","screen_name":"Leo401004","location":"","url":"http:\/\/plaza.rakuten.co.jp\/lovely1215\/","description":"\u697d\u5929\u306e\u304a\u8cb7\u3044\u3082\u306e\u60c5\u5831\u3092\u7d39\u4ecb\u3057\u3066\u3044\u307e\u3059\u3002","protected":false,"followers_count":351,"friends_count":1,"listed_count":1,"created_at":"Thu Jun 24 10:37:39 +0000 
2010","favourites_count":1,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":89307,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"B2DFDA","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme13\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1520438622\/icon13147050332631_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1520438622\/icon13147050332631_normal.jpg","profile_link_color":"93A644","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"r_blog","indices":[84,91]}],"urls":[{"url":"http:\/\/t.co\/qb2xEOSNVX","expanded_url":"http:\/\/r10.to\/hGvr4E","display_url":"r10.to\/hGvr4E","indices":[61,83]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172245505,"id_str":"365611251172245505","text":"Okay,amo mucho a Juanita\u2665.","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":860447012,"id_str":"860447012","name":"Una pelinegra mas.","screen_name":"Soit_Hereuxe_","location":"","url":"http:\/\/www.facebook.com\/karenblancoo","description":"Welcome, to my jungle bitch..c: 
PIN : 26323191","protected":false,"followers_count":332,"friends_count":249,"listed_count":0,"created_at":"Wed Oct 03 22:36:36 +0000 2012","favourites_count":815,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":8883,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000045942696\/2218cd868fe7582cd22f7d99dccd631c.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000045942696\/2218cd868fe7582cd22f7d99dccd631c.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000252068909\/d97fd967309946e072670f06e4eb6a0d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000252068909\/d97fd967309946e072670f06e4eb6a0d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/860447012\/1375647698","profile_link_color":"048560","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251176448002,"id_str":"365611251176448002","text":"RT @Iaughing: meeting someone in person for the first time and they're like http:\/\/t.co\/fAEkbmyMIU","source":"\u003ca href=\"http:\/\/blackberry.com\/twitter\" rel=\"nofollow\"\u003eTwitter for 
BlackBerry\u00ae\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":514568256,"id_str":"514568256","name":"\u2665Random\u2661Quotes\u2665","screen_name":"KittyDreyer93","location":"Cape Town, South Africa","url":"http:\/\/www.TeamFollowBack.com","description":"\u2665 I wonder if anyone thinks the same way I do. I pretend that I'm happy everyday. I feel alone all the time. I am shy and courageous. \u2665 #TeamFollowBack","protected":false,"followers_count":513,"friends_count":1106,"listed_count":4,"created_at":"Sun Mar 04 17:33:40 +0000 2012","favourites_count":291,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":3344,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000183234120\/d0560681902e8e4fcdcb573b85ca7354_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000183234120\/d0560681902e8e4fcdcb573b85ca7354_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/514568256\/1373629253","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Mon Jul 15 21:52:19 +0000 2013","id":356894019860692992,"id_str":"356894019860692992","text":"meeting someone in person for the first time 
and they're like http:\/\/t.co\/fAEkbmyMIU","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":280157793,"id_str":"280157793","name":"Ugly People Problems","screen_name":"Iaughing","location":"","url":null,"description":"\u2022 If our tweets relate to you, you're ugly \u2022 Not affiliated with The Fairly OddParents!","protected":false,"followers_count":706275,"friends_count":81,"listed_count":790,"created_at":"Sun Apr 10 19:19:22 +0000 2011","favourites_count":37,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":false,"verified":false,"statuses_count":3426,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/790383378\/ad164a3f8b560eccc2cdd39eed9035c0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/790383378\/ad164a3f8b560eccc2cdd39eed9035c0.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3756211675\/3fcea00e18038cf1cfb2707f1264bbe6_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3756211675\/3fcea00e18038cf1cfb2707f1264bbe6_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/280157793\/1368491595","profile_link_color":"EB963B","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2084,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":356894019864887296,"id_str":"356894019864887296","indices":[62,84],"media_url":"http:\/\/pbs.twimg.com
\/media\/BPPxOFyCAAAcQSs.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BPPxOFyCAAAcQSs.jpg","url":"http:\/\/t.co\/fAEkbmyMIU","display_url":"pic.twitter.com\/fAEkbmyMIU","expanded_url":"http:\/\/twitter.com\/Iaughing\/status\/356894019860692992\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":237,"resize":"fit"},"medium":{"w":500,"h":348,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":500,"h":348,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Iaughing","name":"Ugly People Problems","id":280157793,"id_str":"280157793","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251155476481,"id_str":"365611251155476481","text":"Kal Tak with Javed Chaudhry (Kya Riyasat Nakaam Ho rahi Hai) -- 8th August 2013: http:\/\/t.co\/Rtyp5vKm7J via @YouTube","source":"\u003ca href=\"http:\/\/www.google.com\/\" rel=\"nofollow\"\u003eGoogle\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":156848665,"id_str":"156848665","name":"Zeeshan Ahmad","screen_name":"zeshan793","location":"","url":null,"description":null,"protected":false,"followers_count":17,"friends_count":20,"listed_count":0,"created_at":"Fri Jun 18 03:12:58 +0000 
2010","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":15723,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1845188544\/dp_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1845188544\/dp_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/Rtyp5vKm7J","expanded_url":"http:\/\/youtu.be\/B12xkkhLkv8?a","display_url":"youtu.be\/B12xkkhLkv8?a","indices":[81,103]}],"user_mentions":[{"screen_name":"YouTube","name":"YouTube","id":10228272,"id_str":"10228272","indices":[108,116]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251168059395,"id_str":"365611251168059395","text":"@Djow973 oui je viens de comprendre Mdr ^^","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611007244120064,"in_reply_to_status_id_str":"365611007244120064","in_reply_to_user_id":114853816,"in_reply_to_user_id_str":"114853816","in_reply_to_screen_name":"Djow973","user":{"id":487812708,"id_str":"487812708","name":"Morgan De Bel ","screen_name":"MorganDeBel","location":"Paris 
","url":"http:\/\/www.facebook.com\/MorganDeBelOfficiel?ref=hl","description":"Morgan De Bel Mannequin Mod\u00e8le et Com\u00e9dien .","protected":false,"followers_count":380,"friends_count":412,"listed_count":0,"created_at":"Thu Feb 09 19:30:08 +0000 2012","favourites_count":2,"utc_offset":10800,"time_zone":"Athens","geo_enabled":false,"verified":false,"statuses_count":1222,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/746765281\/0db078b6541bf0781dc864f2f9d791d7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/746765281\/0db078b6541bf0781dc864f2f9d791d7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1815789503\/Morgan03__3__normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1815789503\/Morgan03__3__normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/487812708\/1356395619","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Djow973","name":"\u2605 Jordhan_75 \u2605","id":114853816,"id_str":"114853816","indices":[0,8]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"fr"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251138695169,"id_str":"365611251138695169","text":"TENTANDO FORMAR UMA PALAVRA LEGAL NESSA SOPA DE 
LETRINHA","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":66443429,"id_str":"66443429","name":"matheus moreira","screen_name":"mathmoreira_","location":"S\u00e3o Jos\u00e9 dos Campos - SP","url":"http:\/\/ihateupleaseloveme.tumblr.com\/","description":"http:\/\/instagram.com\/mathmoreiraa\/","protected":false,"followers_count":424,"friends_count":204,"listed_count":3,"created_at":"Mon Aug 17 18:37:10 +0000 2009","favourites_count":324,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":true,"verified":false,"statuses_count":15264,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"030303","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028259955\/da89f06f6c7b1ac90f80c144788418a5.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028259955\/da89f06f6c7b1ac90f80c144788418a5.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000240541466\/e045a60829d49655bc4c1f63f8a83bcc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000240541466\/e045a60829d49655bc4c1f63f8a83bcc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/66443429\/1375458393","profile_link_color":"0D0101","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FCFCFC","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 
2013","id":365611251176448003,"id_str":"365611251176448003","text":"RT @SpaceGhostJoee: @notoriousjdg_ \ud83d\ude33\ud83d\ude0f\ud83d\ude0b\ud83d\ude09","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":256396973,"id_str":"256396973","name":"JDG","screen_name":"notoriousjdg_","location":"","url":null,"description":"the more things seem to change, the more they stay the same","protected":false,"followers_count":775,"friends_count":638,"listed_count":0,"created_at":"Wed Feb 23 07:10:03 +0000 2011","favourites_count":2691,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"verified":false,"statuses_count":35717,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/545603389\/tumblr_m0p82ux1HO1r6jm8to1_500.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/545603389\/tumblr_m0p82ux1HO1r6jm8to1_500.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248294469\/6e93fb2267ec0cf8d4e8a69215637ca5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248294469\/6e93fb2267ec0cf8d4e8a69215637ca5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/256396973\/1375554201","profile_link_color":"F0AFBC","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":
{"created_at":"Thu Aug 08 23:11:07 +0000 2013","id":365611158180343810,"id_str":"365611158180343810","text":"@notoriousjdg_ \ud83d\ude33\ud83d\ude0f\ud83d\ude0b\ud83d\ude09","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610948851011584,"in_reply_to_status_id_str":"365610948851011584","in_reply_to_user_id":256396973,"in_reply_to_user_id_str":"256396973","in_reply_to_screen_name":"notoriousjdg_","user":{"id":292872138,"id_str":"292872138","name":"ALCAPONE","screen_name":"SpaceGhostJoee","location":"TRAP HOUSE","url":null,"description":"Smoke Dope Get Money Ball Hard \nIG: jostaytrappin","protected":false,"followers_count":234,"friends_count":374,"listed_count":1,"created_at":"Wed May 04 11:25:22 +0000 2011","favourites_count":359,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":2045,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/584871706\/GBE.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/584871706\/GBE.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/344513261571335764\/6f4ddf6c92134357dc303875d8dfff80_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/344513261571335764\/6f4ddf6c92134357dc303875d8dfff80_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_na
me":"notoriousjdg_","name":"JDG","id":256396973,"id_str":"256396973","indices":[0,14]}]},"favorited":false,"retweeted":false,"lang":"und"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SpaceGhostJoee","name":"ALCAPONE","id":292872138,"id_str":"292872138","indices":[3,18]},{"screen_name":"notoriousjdg_","name":"JDG","id":256396973,"id_str":"256396973","indices":[20,34]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172249602,"id_str":"365611251172249602","text":"@DCFCchat @Lee_oyaloper I'll be very surprised if he goes to a top 4 side. He's a talented boy, but can he \/ will he reach potential?!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610821230931969,"in_reply_to_status_id_str":"365610821230931969","in_reply_to_user_id":636017132,"in_reply_to_user_id_str":"636017132","in_reply_to_screen_name":"DCFCchat","user":{"id":416196273,"id_str":"416196273","name":"TheTrentEnd.co.uk","screen_name":"TheTrentEndBlog","location":"Nottingham","url":"http:\/\/www.TheTrentEnd.co.uk","description":"A site for Forest Fans to enjoy. Contains news and more. 
We follow and tweet back.\nThese views are my own and not those of Nottingham Forest FC","protected":false,"followers_count":2523,"friends_count":2687,"listed_count":21,"created_at":"Sat Nov 19 10:53:53 +0000 2011","favourites_count":13,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":7702,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1756971233\/twittertrentend_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1756971233\/twittertrentend_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/416196273\/1374439031","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"DCFCchat","name":"DCFCchat","id":636017132,"id_str":"636017132","indices":[0,9]},{"screen_name":"Lee_oyaloper","name":"Leetheoyaloper","id":35071315,"id_str":"35071315","indices":[10,23]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251168055296,"id_str":"365611251168055296","text":"RT @TNiCE252: Damn every time 2 celebs of the opposite sex are in a picture together y'all swear they make a good couple","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":85554084,"id_str":"85554084","name":"IStanFor-Breezy\u00b0\u2764","screen_name":"GohardforCB","location":"EARTH!!","url":"http:\/\/its--breezy-bitch.tumblr.com\/","description":"#TEAMBREEZY\u2661.TupacShakur\u2661. My perfect boyfriend\u2661 @jacoblatimore followed 26.9.11@21:58.@mombreezy followed 1.2.12@4:10 on my bday\u2661 instagram @rosesaredarkblue","protected":false,"followers_count":1156,"friends_count":1228,"listed_count":9,"created_at":"Tue Oct 27 12:04:04 +0000 2009","favourites_count":47,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":11977,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/578773446\/tj22o7o24ed3faut8scl.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/578773446\/tj22o7o24ed3faut8scl.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000245544753\/1086b53dfcde2dc8ecf1aa374cc4386e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000245544753\/1086b53dfcde2dc8ecf1aa374cc4386e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/85554084\/1356940132","profile_link_color":"F090DD","profile_sidebar_border_color":"9EBEFF","profile_sidebar_fill_color":"030303","profile_text_color":"0014F5","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:28 +0000 
2013","id":365610993113509888,"id_str":"365610993113509888","text":"Damn every time 2 celebs of the opposite sex are in a picture together y'all swear they make a good couple","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":31232875,"id_str":"31232875","name":"TNiCE","screen_name":"TNiCE252","location":" Atlanta, GA 30319","url":"http:\/\/www.youtube.com\/user\/Tnice2nice","description":"You can Follow me ...But I probably get yo ass lost because I dont know where i'm going neither. ATL \u2708 LA \u2708 TX #LakerNation (Kik & IG - Tnice252)","protected":false,"followers_count":8479,"friends_count":6710,"listed_count":68,"created_at":"Tue Apr 14 21:33:14 +0000 2009","favourites_count":816,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":false,"verified":false,"statuses_count":113103,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0F0609","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000021647532\/90b4b40dcf628b602f1249279bb8ab82.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000021647532\/90b4b40dcf628b602f1249279bb8ab82.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257438190\/2ec220ef1411068e2896b569826ad24f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257438190\/2ec220ef1411068e2896b569826ad24f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/31232875\/1374848987","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications
":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"TNiCE252","name":"TNiCE","id":31232875,"id_str":"31232875","indices":[3,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251151290369,"id_str":"365611251151290369","text":"@jacobpiazza \n\"I don't need to go to college\"\n-Jacob Piazza","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610699264761856,"in_reply_to_status_id_str":"365610699264761856","in_reply_to_user_id":363227587,"in_reply_to_user_id_str":"363227587","in_reply_to_screen_name":"jacobpiazza","user":{"id":388567071,"id_str":"388567071","name":"John Zeke Jude Deely","screen_name":"John_Deely","location":"Bethlehem, Pennsylvania","url":null,"description":"I'm just trying to do John Deely better than everyone else","protected":false,"followers_count":387,"friends_count":380,"listed_count":0,"created_at":"Mon Oct 10 23:46:24 +0000 
2011","favourites_count":281,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":4642,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000093933794\/8e630fa19f9f97bc1336bb6304012964_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000093933794\/8e630fa19f9f97bc1336bb6304012964_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"jacobpiazza","name":"Piazza","id":363227587,"id_str":"363227587","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251176448001,"id_str":"365611251176448001","text":"Hace un a\u00f1o tambi\u00e9n estabamos por Muros","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":284182199,"id_str":"284182199","name":"Carla ","screen_name":"carlacostas96","location":"","url":"http:\/\/m.youtube.com\/watch?gl=ES&hl=es&client=mv-google&v=MrFOAbzWuUA","description":"FAMILIA MART\u00cdN C\u00d3DAX, PROMOCI\u00d3N 
1999-2012. Midnight, mi rutina favorita. JOEL","protected":false,"followers_count":289,"friends_count":289,"listed_count":0,"created_at":"Mon Apr 18 19:56:55 +0000 2011","favourites_count":1150,"utc_offset":7200,"time_zone":"Madrid","geo_enabled":true,"verified":false,"statuses_count":9446,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"030303","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/766249905\/386e61a7be1624c606199b0bd4b9f721.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/766249905\/386e61a7be1624c606199b0bd4b9f721.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000241114482\/0d2eb73986751456cda5e23edb50fa4a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000241114482\/0d2eb73986751456cda5e23edb50fa4a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/284182199\/1374188634","profile_link_color":"948C8F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EDEDED","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251142885377,"id_str":"365611251142885377","text":"http:\/\/t.co\/7pLGS8kGEo 1\u0437\u0430\u043a\u043e\u043d \u043d\u044c\u044e\u0442\u043e\u043d\u0430 \u043f\u0440\u0435\u0437\u0435\u043d\u0442\u0430\u0446\u0438\u044f http:\/\/t.co\/Och0uISwgI \u0436\u0443\u0440\u043d\u0430\u043b \u0431\u0430\u0441\u043a\u0435\u0442\u0431\u043e\u043b 
\u0441\u043a\u0430\u0447\u0430\u0442\u044c","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":490408544,"id_str":"490408544","name":"\u043d\u0438\u043a \u0430\u0440\u044e\u043a\u043e\u0432","screen_name":"chetwertak","location":"","url":null,"description":null,"protected":false,"followers_count":0,"friends_count":0,"listed_count":0,"created_at":"Sun Feb 12 14:41:11 +0000 2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1038,"lang":"ru","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_3_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_3_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/7pLGS8kGEo","expanded_url":"http:\/\/uyi.acba.in.ua\/cat5\/torrent-2654.html","display_url":"uyi.acba.in.ua\/cat5\/torrent-2\u2026","indices":[0,22]},{"url":"http:\/\/t.co\/Och0uISwgI","expanded_url":"http:\/\/uyi.acba.in.ua\/cat5\/torrent-2655.html","display_url":"uyi.acba.in.ua\/cat5\/torrent-2\u2026","indices":[50,72]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sens
itive":false,"filter_level":"medium","lang":"ru"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251159662592,"id_str":"365611251159662592","text":"@HTantlinger right?! So dumb!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610772589576192,"in_reply_to_status_id_str":"365610772589576192","in_reply_to_user_id":1135529419,"in_reply_to_user_id_str":"1135529419","in_reply_to_screen_name":"HTantlinger","user":{"id":701726461,"id_str":"701726461","name":"Shae Jones","screen_name":"ShaeeeeMarieeee","location":"","url":null,"description":"Not all of us can do great things. But we can do small things with great love.","protected":false,"followers_count":1062,"friends_count":1913,"listed_count":0,"created_at":"Tue Jul 17 20:31:21 +0000 2012","favourites_count":31182,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":23159,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216932856\/df44ba404f84e0d9b3c6455d2b19d79d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216932856\/df44ba404f84e0d9b3c6455d2b19d79d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/701726461\/1375597197","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contri
butors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"HTantlinger","name":"\u2601\u210d\u03c5\u0418\u271e\u13a5\u018e\u2601","id":1135529419,"id_str":"1135529419","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251155480578,"id_str":"365611251155480578","text":"#mtvhottest Justin Bieber http:\/\/t.co\/M5G5uBmNIE dmn","source":"\u003ca href=\"http:\/\/twitter.com\/tweetbutton\" rel=\"nofollow\"\u003eTweet Button\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1656333108,"id_str":"1656333108","name":"salynasra","screen_name":"salynasra","location":"bieber","url":null,"description":"#TheKey #GF #Someday","protected":false,"followers_count":5,"friends_count":20,"listed_count":0,"created_at":"Thu Aug 08 22:52:22 +0000 
2013","favourites_count":0,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":69,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262023279\/e82920b9c6938866175c02dbc58961b3_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262023279\/e82920b9c6938866175c02dbc58961b3_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1656333108\/1376002488","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"mtvhottest","indices":[0,11]}],"urls":[{"url":"http:\/\/t.co\/M5G5uBmNIE","expanded_url":"http:\/\/www.mtv.co.uk\/hottest","display_url":"mtv.co.uk\/hottest","indices":[26,48]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"de"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251155468289,"id_str":"365611251155468289","text":"@knitmeapony Will take a fuller listen when I get a chance. 
Also love this photo of her: http:\/\/t.co\/57fs3vcztp","source":"web","truncated":false,"in_reply_to_status_id":365610186632724481,"in_reply_to_status_id_str":"365610186632724481","in_reply_to_user_id":3447831,"in_reply_to_user_id_str":"3447831","in_reply_to_screen_name":"knitmeapony","user":{"id":10397722,"id_str":"10397722","name":".","screen_name":"onthelevel","location":"Chicago","url":null,"description":"Continuing experimentation in omnipresence. Still mostly harmless.","protected":false,"followers_count":958,"friends_count":649,"listed_count":86,"created_at":"Tue Nov 20 00:00:34 +0000 2007","favourites_count":67936,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":18111,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"241C11","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/1241092\/lookintothelight1.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/1241092\/lookintothelight1.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/285057971\/twitteronthelevelsmall_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/285057971\/twitteronthelevelsmall_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/10397722\/1348364224","profile_link_color":"826341","profile_sidebar_border_color":"BBCBF5","profile_sidebar_fill_color":"E5E080","profile_text_color":"000000","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/57fs3vcztp","expanded_url":"http:\/\/flic.kr\/p\/84k9Y6","display_url":"flic.kr\/p\/84k9Y6","indices":[89,111]}],"user_mentions":[{"screen_name
":"knitmeapony","name":"LJ","id":3447831,"id_str":"3447831","indices":[0,12]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251168059392,"id_str":"365611251168059392","text":"RT @HopkinsOnTheRox: 'Grease' is the word.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":481615119,"id_str":"481615119","name":"Courtney(:","screen_name":"Courtney_R_A","location":"KANSAS","url":null,"description":"Dream big \u2728 Eat Pop-Tarts... And Kyle May is my favorite person in the entire universe!!! My heart is full!!!\u2764","protected":false,"followers_count":180,"friends_count":154,"listed_count":0,"created_at":"Fri Feb 03 00:08:55 +0000 2012","favourites_count":2341,"utc_offset":-18000,"time_zone":"Central Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":4660,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000149743617\/c911a7cca15ca7998841cd5535c39546_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000149743617\/c911a7cca15ca7998841cd5535c39546_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/481615119\/1374123153","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:00:46 +0000 2013","id":365608554448699393,"id_str":"365608554448699393","text":"'Grease' is the word.","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":174932154,"id_str":"174932154","name":"Dr. Horrible","screen_name":"HopkinsOnTheRox","location":"Raiding Erin's Fridge.","url":"http:\/\/holy-batman-at-the-disco.tumblr.com","description":"You're made of carbon. I'm made of carbon. 
We already have so much in common.","protected":false,"followers_count":286,"friends_count":577,"listed_count":0,"created_at":"Thu Aug 05 05:09:43 +0000 2010","favourites_count":5246,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":8475,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/387309626\/Supernatural_Wallpaper.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/387309626\/Supernatural_Wallpaper.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258142446\/b311b1ad392f7cc509bf028345fa2eb5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258142446\/b311b1ad392f7cc509bf028345fa2eb5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/174932154\/1372856115","profile_link_color":"FF0000","profile_sidebar_border_color":"65B0DA","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"HopkinsOnTheRox","name":"Dr. 
Horrible","id":174932154,"id_str":"174932154","indices":[3,19]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251176448000,"id_str":"365611251176448000","text":"\u5929\u306f\u4e8c\u7269\u3092\u4e0e\u3048\u306a\u3044\u3089\u3057\u3044\u3067\u3059\u304c\u3001\u79c1\u306e\u3088\u3046\u306a\u5fd8\u308c\u7269\u3060\u3089\u3051\u304b\u3089\u3059\u308b\u3068\u4e0e\u3048\u3089\u308c\u3059\u304e\u3060\u3068\u601d\u3044\u307e\u3059\uff01\uff01","source":"\u003ca href=\"http:\/\/twicca.r246.jp\/\" rel=\"nofollow\"\u003etwicca\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":145236510,"id_str":"145236510","name":"\u9921\u871c\u306e\u3069\u98f4","screen_name":"_an32_","location":"\u9921\u871c\u306e\u3069\u98f4\u306f\u30aa\u30f3\u30e9\u30a4\u30f3\u3067\u3059\u3002","url":"http:\/\/twilog.org\/_an32_","description":"\u689f\u3068\u8702\u871c\u306e\u3069\u98f4(@chukuroc)\u306e\u73a9\u5177\u3067\u3059","protected":false,"followers_count":83,"friends_count":60,"listed_count":4,"created_at":"Tue May 18 13:09:10 +0000 
2010","favourites_count":499,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"verified":false,"statuses_count":40730,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/784253446\/c958d3e4a9907d62aef35b180999f67a.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/784253446\/c958d3e4a9907d62aef35b180999f67a.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000062300735\/391524531ef5a0a1286e0c6f27eb62fc_normal.gif","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000062300735\/391524531ef5a0a1286e0c6f27eb62fc_normal.gif","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/145236510\/1375166117","profile_link_color":"93C2FF","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFEBF2","profile_text_color":"80442E","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251163873281,"id_str":"365611251163873281","text":"@cdazzle01 \ud83d\ude29\ud83d\ude2d I wish you could bring me pizza !!!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610854089113601,"in_reply_to_status_id_str":"365610854089113601","in_reply_to_user_id":411027371,"in_reply_to_user_id_str":"411027371","in_reply_to_screen_name":"cdazzle01","user":{"id":344273061,"id_str":"344273061","name":"Caitlin\u00b0","screen_name":"caitylinnn_","location":"8.0.4 !","url":null,"description":"InstaMe! C8tlinnn \u270c","protected":false,"followers_count":227,"friends_count":246,"listed_count":0,"created_at":"Thu Jul 28 20:17:34 +0000 2011","favourites_count":2018,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":8393,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/300222809\/5.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/300222809\/5.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000227735293\/6c21871438b666300d92c4460b690812_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000227735293\/6c21871438b666300d92c4460b690812_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/344273061\/1375400168","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"cdazzle01","name":"Cortney","id":411027371,"id_str":"411027371","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 
+0000 2013","id":365611251163873282,"id_str":"365611251163873282","text":"@hockeybychoice I support this use of your 18,0001 tweet. @dylanobrien","source":"web","truncated":false,"in_reply_to_status_id":365588009262919680,"in_reply_to_status_id_str":"365588009262919680","in_reply_to_user_id":15149931,"in_reply_to_user_id_str":"15149931","in_reply_to_screen_name":"hockeybychoice","user":{"id":15593412,"id_str":"15593412","name":"angie","screen_name":"angiep213","location":"","url":"http:\/\/angiedet.wordpress.com\/","description":"Detroit. Nola. Liberal. I swear a lot. Currently living between disasters. Apologies in advance.","protected":false,"followers_count":710,"friends_count":877,"listed_count":57,"created_at":"Fri Jul 25 03:10:45 +0000 2008","favourites_count":352,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":30182,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3666259345\/473e552930ea7a1bfdbae532b8952a84_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3666259345\/473e552930ea7a1bfdbae532b8952a84_normal.jpeg","profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"hockeybychoice","name":"hockeybychoice","id":15149931,"id_str":"15149931","indices":[0,15]},{"screen_name":"dylanobrie
n","name":"Dylan O'Brien","id":281766200,"id_str":"281766200","indices":[59,71]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172249601,"id_str":"365611251172249601","text":"RT @PostureoTuits: 'C\u00f3metelo que por dentro est\u00e1 perfecto' #PostureoMadres http:\/\/t.co\/SGSiKFNMOP","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":498981710,"id_str":"498981710","name":"Jose Mari Peguero ","screen_name":"JMPeguero_8","location":"Corte de Peleas.","url":null,"description":"C\u00f3mo de feliz ser\u00eda nuestra vida.","protected":false,"followers_count":297,"friends_count":229,"listed_count":0,"created_at":"Tue Feb 21 16:53:40 +0000 2012","favourites_count":39,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":false,"verified":false,"statuses_count":3204,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000031795903\/7d1600e27c0cadd63993d15effee3e3d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000031795903\/7d1600e27c0cadd63993d15effee3e3d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/498981710\/1375046338","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":f
alse,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 20:13:00 +0000 2013","id":365203947180412928,"id_str":"365203947180412928","text":"'C\u00f3metelo que por dentro est\u00e1 perfecto' #PostureoMadres http:\/\/t.co\/SGSiKFNMOP","source":"\u003ca href=\"http:\/\/www.google.es\" rel=\"nofollow\"\u003ePostureoTuits\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1387236000,"id_str":"1387236000","name":"#postureo","screen_name":"PostureoTuits","location":"Spain","url":null,"description":"Para todos aquellos que amamos el #Postureo","protected":false,"followers_count":47060,"friends_count":0,"listed_count":43,"created_at":"Sun Apr 28 15:45:10 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":151,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/854754118\/c5b16310890ede14d131e5619654d42b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/854754118\/c5b16310890ede14d131e5619654d42b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3586591919\/40e2c74b323d67324620e1d449dcbd7c_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3586591919\/40e2c74b323d67324620e1d449dcbd7c_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1387236000\/1367165877","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_imag
e":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":228,"entities":{"hashtags":[{"text":"PostureoMadres","indices":[40,55]}],"urls":[],"user_mentions":[],"media":[{"id":365203947184607232,"id_str":"365203947184607232","indices":[56,78],"media_url":"http:\/\/pbs.twimg.com\/media\/BRF3DhaCUAA_LNL.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRF3DhaCUAA_LNL.jpg","url":"http:\/\/t.co\/SGSiKFNMOP","display_url":"pic.twitter.com\/SGSiKFNMOP","expanded_url":"http:\/\/twitter.com\/PostureoTuits\/status\/365203947180412928\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":500,"h":375,"resize":"fit"},"medium":{"w":500,"h":375,"resize":"fit"},"small":{"w":340,"h":255,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[{"text":"PostureoMadres","indices":[59,74]}],"urls":[],"user_mentions":[{"screen_name":"PostureoTuits","name":"#postureo","id":1387236000,"id_str":"1387236000","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251176443904,"id_str":"365611251176443904","text":"Me every time http:\/\/t.co\/1HEMbpO4iG","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":408242587,"id_str":"408242587","name":"Rebecca\u2728","screen_name":"RebeccaSarai","location":"Los Angeles,California","url":null,"description":"Rebecca Sarai Herrera. 18. 5'8. Volleyball. Mexican. Salvadorian. CSULA. 
#Gleek","protected":false,"followers_count":76,"friends_count":92,"listed_count":1,"created_at":"Wed Nov 09 05:09:59 +0000 2011","favourites_count":845,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":5026,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000057139067\/897eae8928260499bd4e56ab14cfb584_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000057139067\/897eae8928260499bd4e56ab14cfb584_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/408242587\/1366173247","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365611251180638209,"id_str":"365611251180638209","indices":[14,36],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLpfvCCEAEBKTP.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLpfvCCEAEBKTP.jpg","url":"http:\/\/t.co\/1HEMbpO4iG","display_url":"pic.twitter.com\/1HEMbpO4iG","expanded_url":"http:\/\/twitter.com\/RebeccaSarai\/status\/365611251176443904\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"large":{"w":639,"h":639,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"mediu
m","lang":"en"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251168067586,"id_str":"365611251168067586","text":"@xamytu sem acordos, compra na msm loja vei kkkkkk oxe q mizera","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610753388064768,"in_reply_to_status_id_str":"365610753388064768","in_reply_to_user_id":460400210,"in_reply_to_user_id_str":"460400210","in_reply_to_screen_name":"xamytu","user":{"id":334598193,"id_str":"334598193","name":"claudia","screen_name":"claudiaholanda1","location":"","url":null,"description":"Comigo ninguem tira onda","protected":false,"followers_count":544,"friends_count":702,"listed_count":1,"created_at":"Wed Jul 13 10:55:03 +0000 2011","favourites_count":658,"utc_offset":-7200,"time_zone":"Greenland","geo_enabled":false,"verified":false,"statuses_count":20054,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/750768569\/145c78eaa1a7c29eb9cd2d5425b656db.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/750768569\/145c78eaa1a7c29eb9cd2d5425b656db.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000254781457\/408f923af8152462a3e07731fde1b505_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000254781457\/408f923af8152462a3e07731fde1b505_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/334598193\/1375627782","profile_link_color":"038543","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"CC1212","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordina
tes":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"xamytu","name":"avaju","id":460400210,"id_str":"460400210","indices":[0,7]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:29 +0000 2013","id":365611251172261888,"id_str":"365611251172261888","text":"Just hold on we're going home","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":548042302,"id_str":"548042302","name":"Trent Bussard","screen_name":"Trenty_B","location":"CA","url":null,"description":"Kanye West doesn't care about white people. Kendyl \u2764","protected":false,"followers_count":313,"friends_count":237,"listed_count":0,"created_at":"Sun Apr 08 01:28:16 +0000 
2012","favourites_count":2737,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":10149,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000159792290\/7ad0a121cb1d2831175a0febe01ff448_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000159792290\/7ad0a121cb1d2831175a0febe01ff448_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/548042302\/1375250506","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[35.33797211,-119.10635397]},"coordinates":{"type":"Point","coordinates":[-119.10635397,35.33797211]},"place":{"id":"960993b9cfdffda9","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/960993b9cfdffda9.json","place_type":"city","name":"Bakersfield","full_name":"Bakersfield, CA","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-119.265033,35.193979],[-119.265033,35.447975],[-118.772341,35.447975],[-118.772341,35.193979]]]},"attributes":{}},"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255354376192,"id_str":"365611255354376192","text":"Esto es una divinura! 
\ud83d\udc9c\ud83d\udc9c\ud83d\udc9c http:\/\/t.co\/xnwUlvYqVv","source":"\u003ca href=\"http:\/\/www.apple.com\" rel=\"nofollow\"\u003eiOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":711007249,"id_str":"711007249","name":"Mafe unigarro","screen_name":"mafeunigarro","location":"","url":null,"description":null,"protected":false,"followers_count":60,"friends_count":112,"listed_count":0,"created_at":"Sun Jul 22 17:31:04 +0000 2012","favourites_count":59,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":474,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3700860284\/bcf3af338bac2ea44c2ed6c044be6ed8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3700860284\/bcf3af338bac2ea44c2ed6c044be6ed8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/711007249\/1357162723","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/xnwUlvYqVv","expanded_url":"http:\/\/youtu.be\/ss2QI_EwCHQ","display_url":"youtu.be\/ss2QI_EwCHQ","indices":[26,48]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium
","lang":"es"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337193472,"id_str":"365611255337193472","text":"\u30ea\u30d7\u3067\u76ee\u899a\u3081\u305f\u3002\n\u306d\u308b\u3002","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1086709333,"id_str":"1086709333","name":"kaniko\u262a*.\u22c6\u2235\u20dd\u2661","screen_name":"exo_chinpa","location":"","url":null,"description":"EXO-K\u30fbM\u2661 We are One\u30fe(\u25cf\u2019`\u25cf)\uff89\u203c\u203c \u3044\u3064\u3082\u304a\u30cd\u30e0\u306a\u304a\u307c\u3063\u3061\u3083\u3093KAI\u306b\u3069\u3063\u3077\u308a\u30cf\u30de\u3063\u3066\u307e\u3059\u2661 K\u3063\u5b50M\u3063\u5b50\u307f\u3093\u306a\u304b\u308f\u3044\u304f\u3066\u2026\u30ab\u30a4\u30da\u30f3\u3060\u3051\u3069\u3001\u7d50\u5c40\u304a\u308b\u307a\u3093(\uff89\u2200`)\u266c \u4e5d\u5dde\u306c\u306a\u4f1a\u211629 85\uff08-86\uff09line \u203bexo\u5c02\u7528\u30a2\u30ab(\uff65\u2200\uff65)","protected":false,"followers_count":85,"friends_count":198,"listed_count":1,"created_at":"Sun Jan 13 17:23:11 +0000 
2013","favourites_count":131,"utc_offset":32400,"time_zone":"Tokyo","geo_enabled":true,"verified":false,"statuses_count":6709,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000190316937\/aacf9b5a0efa39f077a6b47b52a5746c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000190316937\/aacf9b5a0efa39f077a6b47b52a5746c_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1086709333\/1375797302","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337189376,"id_str":"365611255337189376","text":"O ganhador ser\u00e1 divulgado no dia 17 via Facebook e Twitter da p\u00e1gina.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1656307033,"id_str":"1656307033","name":"Mafia Emblem3 ","screen_name":"MafiaE3Promo","location":"","url":"http:\/\/www.facebook.com\/MafiaEmblem3","description":"Fique por dentro de todas promo\u00e7\u00f5es da Mafia E3!","protected":false,"followers_count":2,"friends_count":12,"listed_count":0,"created_at":"Thu Aug 08 
21:59:58 +0000 2013","favourites_count":0,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":10,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047467075\/817e7b09b93cf37416ca5d4aec764282.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047467075\/817e7b09b93cf37416ca5d4aec764282.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262019880\/51f23feeb1f182601f78196e03b0a606_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262019880\/51f23feeb1f182601f78196e03b0a606_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1656307033\/1376002503","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255341383681,"id_str":"365611255341383681","text":"estoy tan aburrida que le hablo a todos jjej e","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":725841109,"id_str":"725841109","name":"Cuerva \u2665","screen_name":"BeluCiorciari1","location":"","url":"http:\/\/www.facebook.com\/La.Bechuu","description":"Hincha del mas grande el CICLON, loca un poco nada 
mas.","protected":false,"followers_count":439,"friends_count":684,"listed_count":0,"created_at":"Mon Jul 30 11:07:50 +0000 2012","favourites_count":10,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2548,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"DBE9ED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000018348112\/23bcae798ba978bc4642add189bda1eb.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000018348112\/23bcae798ba978bc4642add189bda1eb.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000216322738\/3c83a47fa327f76363163a620f9b66dd_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000216322738\/3c83a47fa327f76363163a620f9b66dd_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/725841109\/1373243920","profile_link_color":"CC3366","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E6F6F9","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337201664,"id_str":"365611255337201664","text":"Kkkkkkkkkkkkkkkkk","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":585686764,"id_str":"585686764","name":"J\u00e3o","screen_name":"19365999","location":"Atalaia Local Molhes","url":"http:\/\/Instagram.com\/joaaopereiraa","description":"Quero viver at\u00e9 morrer","protected":false,"followers_count":165,"friends_count":98,"listed_count":0,"created_at":"Sun May 20 13:52:18 +0000 2012","favourites_count":629,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":6539,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"F00808","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/884428104\/e71d7259c762851c89e15ad1b1608f7d.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/884428104\/e71d7259c762851c89e15ad1b1608f7d.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261603459\/02de9d6556e6e3409874120d266b1d52_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261603459\/02de9d6556e6e3409874120d266b1d52_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/585686764\/1370096368","profile_link_color":"0A0A01","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:30 +0000 
2013","id":365611255366557696,"id_str":"365611255366557696","text":"\u3044\u3063\u305d\u3001\u6226\u3063\u3066\u307f\u308b\u3068\u3044\u3046\u306e\u3082\u826f\u3044\u304b\u3082\u3057\u308c\u306a\u3044\u3067\u3059\u306d","source":"\u003ca href=\"http:\/\/twittbot.net\/\" rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1153003310,"id_str":"1153003310","name":"\u590f\u5ddd\u771f\u6dbc(bot)","screen_name":"Masuzu__bot","location":"","url":"https:\/\/sites.google.com\/site\/xiachuanzhenliangbot\/","description":"\u4ffa\u306e\u5f7c\u5973\u3068\u5e7c\u99b4\u67d3\u304c\u4fee\u7f85\u5834\u3059\u304e\u308b \u590f\u5ddd\u771f\u6dbc \u975e\u516c\u5f0fbot\u3067\u3059\u3002\u53f0\u8a5e\u306f\u57fa\u672c\u30a2\u30cb\u30e1\u304b\u3089\u629c\u7c8b\u3057\u3066\u3044\u307e\u3059\u3001\u30d5\u30a9\u30ed\u30fc\u306f\u6975\u529b\u8fd4\u3057\u307e\u3059\u3002\r\nbot\u4f5c\u6210\u306f\u521d\u3081\u3066\u306a\u306e\u3067\u3001\u610f\u898b\u30a2\u30c9\u30d0\u30a4\u30b9\u7b49\u3042\u308c\u3070 @po_te_ma_yo_ \u307e\u3067\u304a\u9858\u3044\u81f4\u3057\u307e\u3059\u3002\u30d5\u30a9\u30ed\u30fc\u524d\u306b\u8aac\u660e\u66f8\u3092\u95b2\u89a7\u4e0b\u3055\u3044","protected":false,"followers_count":221,"friends_count":232,"listed_count":6,"created_at":"Wed Feb 06 05:25:26 +0000 
2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":12512,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/783498498\/4f18701a4e18d237ff1cc00e520b0722.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/783498498\/4f18701a4e18d237ff1cc00e520b0722.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3214525657\/07e54bf7751a072ecdd992c1564bfebc_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3214525657\/07e54bf7751a072ecdd992c1564bfebc_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1153003310\/1360148044","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353966592,"id_str":"365611255353966592","text":"Don't know what I'm gonna do yet though...","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":302112812,"id_str":"302112812","name":"Soccerstar2014","screen_name":"J_Weezy88","location":"","url":null,"description":"I aint got no worries...Soccerplayer #8...Rapper...and Just 
know I am going to be famous one day..! Point..Blank..Period!!","protected":false,"followers_count":292,"friends_count":423,"listed_count":0,"created_at":"Fri May 20 16:32:00 +0000 2011","favourites_count":28,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":1994,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000017940885\/7318072a12e1da7645e93df13a417b15_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000017940885\/7318072a12e1da7645e93df13a417b15_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353974785,"id_str":"365611255353974785","text":"Texting ruins relationships","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":328821977,"id_str":"328821977","name":"Tim TB Kasierski","screen_name":"TimKasierski","location":"","url":null,"description":"Middle aged women 
introduce me to their daughters","protected":false,"followers_count":297,"friends_count":293,"listed_count":0,"created_at":"Mon Jul 04 02:51:40 +0000 2011","favourites_count":841,"utc_offset":-18000,"time_zone":"Quito","geo_enabled":false,"verified":false,"statuses_count":7441,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/337145240\/0810111453.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/337145240\/0810111453.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000102128805\/e8f4a537a19c9c8efd1a80b7cccdf312_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000102128805\/e8f4a537a19c9c8efd1a80b7cccdf312_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/328821977\/1372078335","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337197569,"id_str":"365611255337197569","text":"RT @Xx___WhY___xX: #DoYouLoveMe ? Yes, I do. 
<3 #2ne1","source":"\u003ca href=\"http:\/\/roundteam.co\" rel=\"nofollow\"\u003eRoundTeam\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1405004989,"id_str":"1405004989","name":"Manusia biasa~ \u263a","screen_name":"AgenCIA1010","location":"Los Angeles, USA","url":null,"description":"Jadi gini , kalo lu follow gua gua doain deh lu ganteng ^_^ | Support @officialJKT48 my oshi @nabilahJKT48 \u2665","protected":false,"followers_count":1213,"friends_count":61,"listed_count":0,"created_at":"Sun May 05 12:49:42 +0000 2013","favourites_count":1,"utc_offset":-25200,"time_zone":"Arizona","geo_enabled":false,"verified":false,"statuses_count":2712,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/864844221\/262eea8b877046ffee210c0e58e8d1f0.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/864844221\/262eea8b877046ffee210c0e58e8d1f0.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000245363288\/1e82aef42de7fca3b63bb024473a8bf1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000245363288\/1e82aef42de7fca3b63bb024473a8bf1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1405004989\/1375283140","profile_link_color":"0084B4","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:30:06 +0000 
2013","id":365600836585406465,"id_str":"365600836585406465","text":"#DoYouLoveMe ? Yes, I do. <3 #2ne1","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":35271727,"id_str":"35271727","name":"LallaSbralla Madden","screen_name":"Xx___WhY___xX","location":"Roma!","url":null,"description":"Giappone. Korea.\r\nModa. Playstation.\r\nAnime. Manga.\r\n\r\nLallaSbralla_GZB","protected":false,"followers_count":34,"friends_count":124,"listed_count":1,"created_at":"Sat Apr 25 18:10:47 +0000 2009","favourites_count":2,"utc_offset":7200,"time_zone":"Rome","geo_enabled":false,"verified":false,"statuses_count":85,"lang":"it","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047472858\/a69f68c5ef8e169386d1cc5a9608be31.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047472858\/a69f68c5ef8e169386d1cc5a9608be31.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261956631\/e32aac76fcac757df15869ebee62d624_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261956631\/e32aac76fcac757df15869ebee62d624_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/35271727\/1376001373","profile_link_color":"C20E17","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"850808","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":2,"entities":{"hashtags":[{"text":"DoYouLoveMe","indices":[0,12]},{"text":"2ne1","indices":[32,37]}],"urls":
[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"vi"},"retweet_count":0,"entities":{"hashtags":[{"text":"DoYouLoveMe","indices":[19,31]},{"text":"2ne1","indices":[51,56]}],"urls":[],"user_mentions":[{"screen_name":"Xx___WhY___xX","name":"LallaSbralla Madden","id":35271727,"id_str":"35271727","indices":[3,17]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"vi"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255366561792,"id_str":"365611255366561792","text":"Son los efectos de liarse conmigo. Me stalkeas, quieres hablarme, pero algo te detiene y te dice que yo te voy a ignorar o que me molestas.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":833118504,"id_str":"833118504","name":"Sara\/Sarah","screen_name":"SariSwaan","location":"Tegan, Sara, Florence, ID.","url":null,"description":"Alg\u00fan d\u00eda el \u00e1rbol que has cortado te har\u00e1 falta para respirar. 
||Lectora||Asexual||Amante de la ortograf\u00eda||.","protected":false,"followers_count":404,"friends_count":376,"listed_count":1,"created_at":"Wed Sep 19 11:31:27 +0000 2012","favourites_count":615,"utc_offset":-18000,"time_zone":"Mexico City","geo_enabled":true,"verified":false,"statuses_count":11733,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"666166","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000047426522\/a31dd4f67cd9d5ec03fda6873f822aa2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000047426522\/a31dd4f67cd9d5ec03fda6873f822aa2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000257877435\/79dd46206db0515fe1d95944831faf9d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000257877435\/79dd46206db0515fe1d95944831faf9d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/833118504\/1376001636","profile_link_color":"4DE3C5","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDFFCC","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255370756096,"id_str":"365611255370756096","text":"So I'm giving you a chance,dont break my 
trust.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1231611781,"id_str":"1231611781","name":"\u00c0\u00f1dr\u00eb\u0101\u2764","screen_name":"clary_life","location":"Spring Lake,North Carolina","url":"http:\/\/www.facebok.com\/AndreaKayClary","description":"@shantel325 is my best friend @mebrohem is my Older sister. Daddys little girl. \u2764 You can hate but im the girl who rises above it. #Cenanation \u26a1 #thefamily #OHS","protected":false,"followers_count":271,"friends_count":540,"listed_count":0,"created_at":"Fri Mar 01 21:44:48 +0000 2013","favourites_count":575,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":2917,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme10\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000178575122\/3c8e4ea7a099256bfe4dc6cb5d8189ce_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000178575122\/3c8e4ea7a099256bfe4dc6cb5d8189ce_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1231611781\/1375469776","profile_link_color":"06606E","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} 
+{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255345594368,"id_str":"365611255345594368","text":"Indonesia menerima paradigma baru dalam hubungan internasional, yaitu relativitas kedaulatan negara di hadapan Hukum Internasional HAM","source":"\u003ca href=\"http:\/\/www.tweetdeck.com\" rel=\"nofollow\"\u003eTweetDeck\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":80246748,"id_str":"80246748","name":"Media Center","screen_name":"infobencana","location":"Indonesia","url":"http:\/\/www.mediacenter.or.id","description":null,"protected":false,"followers_count":34694,"friends_count":3908,"listed_count":246,"created_at":"Tue Oct 06 07:54:52 +0000 2009","favourites_count":248,"utc_offset":25200,"time_zone":"Jakarta","geo_enabled":true,"verified":false,"statuses_count":172916,"lang":"id","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000251911625\/44d551bfe2f81506c2987afe6a42d281_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000251911625\/44d551bfe2f81506c2987afe6a42d281_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/80246748\/1375115642","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"
urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"id"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353974786,"id_str":"365611255353974786","text":"http:\/\/t.co\/XBY4inlEHO","source":"\u003ca href=\"http:\/\/www.facebook.com\/twitter\" rel=\"nofollow\"\u003eFacebook\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":246888706,"id_str":"246888706","name":"bahaa abadi","screen_name":"babady99","location":"","url":null,"description":null,"protected":false,"followers_count":11,"friends_count":66,"listed_count":0,"created_at":"Thu Feb 03 17:33:38 +0000 2011","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":2240,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/1235868074\/image_normal.jpg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/1235868074\/image_normal.jpg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/XBY4inlEHO","expanded_url":"http:\/\/fb.me\/3dCOe2AhQ","display_url":"fb.me\/3dCOe2AhQ","indices":[0,22]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive
":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353978881,"id_str":"365611255353978881","text":"RT @Real_Liam_Payne: Checkkk out our limited edition product line we designed to support anti-bullying education with @OfficeDepot. Live Tr\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":624671858,"id_str":"624671858","name":"60 DAYS TO TMHTPERTH","screen_name":"GucciStyles_","location":"@lucy_meck is perfect \u2764","url":"http:\/\/chloe-page.tumblr.com\/","description":"Harry ur lips are so kissable ur eyes irristable ur giving me heart attack looking like u do I dont want to take it slow i just want to take u home.","protected":false,"followers_count":5652,"friends_count":3990,"listed_count":3,"created_at":"Mon Jul 02 13:37:52 +0000 
2012","favourites_count":3593,"utc_offset":28800,"time_zone":"Perth","geo_enabled":true,"verified":false,"statuses_count":29800,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000305","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/847083074\/4f299e9844605abf08b805fa65fc6ff5.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/847083074\/4f299e9844605abf08b805fa65fc6ff5.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000214279874\/0e46220cd932f2f2a02db83b4094cc29_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000214279874\/0e46220cd932f2f2a02db83b4094cc29_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/624671858\/1366364982","profile_link_color":"B40B43","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 23:11:54 +0000 2013","id":365248969372205057,"id_str":"365248969372205057","text":"Checkkk out our limited edition product line we designed to support anti-bullying education with @OfficeDepot. 
Live True!!!!!!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":158314798,"id_str":"158314798","name":"Liam Payne","screen_name":"Real_Liam_Payne","location":"UK","url":"http:\/\/www.onedirectionmusic.com","description":"Now stay with me cause im quite quick 5 6 7 8 :)","protected":false,"followers_count":11984326,"friends_count":15030,"listed_count":65638,"created_at":"Tue Jun 22 10:19:08 +0000 2010","favourites_count":31,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":true,"statuses_count":7590,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3566362915\/de1b2c9c319b01ef71994c57ef46e92e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3566362915\/de1b2c9c319b01ef71994c57ef46e92e_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/158314798\/1366882913","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":28177,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"officedepot","name":"Office 
Depot","id":18572345,"id_str":"18572345","indices":[97,109]}]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Real_Liam_Payne","name":"Liam Payne","id":158314798,"id_str":"158314798","indices":[3,19]},{"screen_name":"officedepot","name":"Office Depot","id":18572345,"id_str":"18572345","indices":[118,130]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353970689,"id_str":"365611255353970689","text":"RT @longgone_: Imagine being the owner of a blue Nissan versa #dirtylooksfordays","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":480202990,"id_str":"480202990","name":"JALEN \u303dILLER","screen_name":"JalensTweets","location":"Oregon","url":null,"description":"Everything I'm not made me everything I am \n @jalenspics","protected":false,"followers_count":178,"friends_count":46,"listed_count":1,"created_at":"Wed Feb 01 07:10:23 +0000 2012","favourites_count":1135,"utc_offset":-25200,"time_zone":"Pacific Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":3237,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/734857691\/2868acf984d76d0e33f963624c79ad47.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/734857691\/2868acf984d76d0e33f963624c79ad47.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000138611700\/e183bc89c491760c7c30246529904896_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000138611700\/e183bc89c491760c7c30246529904896_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/480202990\/1375738268","profile_link_color":"41B6E0","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EDF08B","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 20:35:48 +0000 2013","id":365572071469760512,"id_str":"365572071469760512","text":"Imagine being the owner of a blue Nissan versa #dirtylooksfordays","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":549578478,"id_str":"549578478","name":"Long Tran","screen_name":"longgone_","location":"","url":"http:\/\/gonelonger.tumblr.com","description":null,"protected":false,"followers_count":31,"friends_count":19,"listed_count":0,"created_at":"Mon Apr 09 19:46:22 +0000 
2012","favourites_count":61,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":103,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247746986\/c982dbcf661ee89a52f974e664ff07a8_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247746986\/c982dbcf661ee89a52f974e664ff07a8_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/549578478\/1375749716","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[{"text":"dirtylooksfordays","indices":[47,65]}],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[{"text":"dirtylooksfordays","indices":[62,80]}],"urls":[],"user_mentions":[{"screen_name":"longgone_","name":"Long Tran","id":549578478,"id_str":"549578478","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255366565888,"id_str":"365611255366565888","text":"RT @binladinsgooch: Balloon fest saturday#decent","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":233254513,"id_str":"233254513","name":"gab","screen_name":"gaybiejefferies","location":"bristol","url":null,"description":null,"protected":false,"followers_count":717,"friends_count":133,"listed_count":0,"created_at":"Sun Jan 02 19:08:36 +0000 2011","favourites_count":1031,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":10699,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/672155391\/8c10d12932856c1f7e572fbd71049d7e.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/672155391\/8c10d12932856c1f7e572fbd71049d7e.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244414782\/3304d77eb43c76422de8e0858bf2f73f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244414782\/3304d77eb43c76422de8e0858bf2f73f_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/233254513\/1375270495","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:02:47 +0000 2013","id":365609061577789440,"id_str":"365609061577789440","text":"Balloon fest saturday#decent","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for 
iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":607576618,"id_str":"607576618","name":"Arj","screen_name":"binladinsgooch","location":"bristol ","url":null,"description":"Follow me \nInstagram arj231","protected":false,"followers_count":89,"friends_count":96,"listed_count":0,"created_at":"Wed Jun 13 21:35:23 +0000 2012","favourites_count":63,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":781,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000106574824\/92148a241b57194d86c16a710edafaed_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000106574824\/92148a241b57194d86c16a710edafaed_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/607576618\/1373849065","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"binladinsgooch","name":"Arj","id":607576618,"id_str":"607576618","indices":[3,18]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu 
Aug 08 23:11:30 +0000 2013","id":365611255332995072,"id_str":"365611255332995072","text":"\u3010\u30d7\u30ed\u91ce\u7403\u30c1\u30fc\u30e0\u3092\u3064\u304f\u308d\u3046\uff01\u30d7\u30ec\u30a4\u4e2d\uff01\u3011\u3000\u4e00\u7dd2\u306b\u300c\u91ce\u7403\u3064\u304f\uff01\u300d\u3092\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3057\u3066\u3001\u81ea\u5206\u3060\u3051\u306e\u5922\u306e\u7403\u56e3\u3092\u3064\u304f\u308d\u3046\uff01\u3000 #\u3084\u304d\u3085\u3064\u304f http:\/\/t.co\/T7I94rjJ2i\u300008\u670809\u65e508\u6642","source":"\u003ca href=\"http:\/\/sega.jp\/\" rel=\"nofollow\"\u003e\u3084\u304d\u3085\u3064\u304f\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":452697027,"id_str":"452697027","name":"\u4e2d\u91ce \u5149","screen_name":"214meteora","location":"","url":null,"description":null,"protected":false,"followers_count":3,"friends_count":47,"listed_count":0,"created_at":"Mon Jan 02 04:30:35 +0000 
2012","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":14,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/sticky\/default_profile_images\/default_profile_6_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/sticky\/default_profile_images\/default_profile_6_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":true,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"\u3084\u304d\u3085\u3064\u304f","indices":[58,64]}],"urls":[{"url":"http:\/\/t.co\/T7I94rjJ2i","expanded_url":"http:\/\/sgap.ps\/twihp","display_url":"sgap.ps\/twihp","indices":[65,87]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337193473,"id_str":"365611255337193473","text":"RT @Savannaaah_: Finally, done with work(':","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":422386530,"id_str":"422386530","name":"Sam De La Riva","screen_name":"Sammi_bruhh","location":"","url":null,"description":"Even when winning is illogical, losing is still far 
from optional, Samantha Rosales is the one\u2764","protected":false,"followers_count":154,"friends_count":140,"listed_count":0,"created_at":"Sun Nov 27 06:01:49 +0000 2011","favourites_count":4,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":5272,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248997970\/8c4ea5b1c143ef35bc7e337b76ba1cf4_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248997970\/8c4ea5b1c143ef35bc7e337b76ba1cf4_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/422386530\/1375770904","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:04:19 +0000 2013","id":365594349142151168,"id_str":"365594349142151168","text":"Finally, done with work(':","source":"\u003ca href=\"http:\/\/www.myplume.com\/\" rel=\"nofollow\"\u003ePlume\u00a0for\u00a0Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":204147396,"id_str":"204147396","name":"Savannah Pantoja","screen_name":"Savannaaah_","location":"El Paso Tx.","url":"http:\/\/thegreatestsavannah.tumblr.com","description":"Thug life 
\ud763\u2655","protected":false,"followers_count":406,"friends_count":399,"listed_count":2,"created_at":"Mon Oct 18 01:21:00 +0000 2010","favourites_count":160,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":33483,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"642D8B","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/492456495\/38.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/492456495\/38.jpg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000242020175\/56d7502c6bab45faadd49af335420bd2_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000242020175\/56d7502c6bab45faadd49af335420bd2_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/204147396\/1374645409","profile_link_color":"FF0000","profile_sidebar_border_color":"65B0DA","profile_sidebar_fill_color":"7AC3EE","profile_text_color":"3D1957","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Savannaaah_","name":"Savannah Pantoja","id":204147396,"id_str":"204147396","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255341395968,"id_str":"365611255341395968","text":"What you find in your backyard (Part 4) http:\/\/t.co\/5tbDZ1a9at","source":"\u003ca href=\"http:\/\/instagram.com\" 
rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":236662665,"id_str":"236662665","name":"David (\u0e01\u0e38\u0e21\u0e34) Chung","screen_name":"dychung7","location":"","url":null,"description":"You truly never know.","protected":false,"followers_count":104,"friends_count":93,"listed_count":0,"created_at":"Tue Jan 11 02:56:02 +0000 2011","favourites_count":451,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":3017,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"1A1B1F","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/491030523\/DSC_0212_cropped.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/491030523\/DSC_0212_cropped.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000139075262\/f2fa24212aab769348ce50ef47872e1b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000139075262\/f2fa24212aab769348ce50ef47872e1b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/236662665\/1361219873","profile_link_color":"2FC2EF","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/5tbDZ1a9at","expanded_url":"http:\/\/instagram.com\/p\/cxSeTIMlWM\/","display_url":"instagram.com\/p\/cxSeTIMlWM\/","indices":[40,62]}],"user_mentions":[]},"favorited":false,"retweeted":false,"po
ssibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255333003264,"id_str":"365611255333003264","text":"RT @jelibonummm1: Yasin (12) 5 tl'lik bayram har\u00e7l\u0131\u011f\u0131yla oyuncak tabanca ald\u0131. O ARTIK B\u0130R MAFYA","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1336914152,"id_str":"1336914152","name":"ESMERD\u0130L","screen_name":"meliherdiil","location":"\u0130zmir","url":"https:\/\/www.facebook.com\/melih.erdil.0","description":"Ruh ya\u015f\u0131 5 , kendi 15 , tecr\u00fcbe ya\u015f\u0131 25 olan insan .Yerine g\u00f6re \u00e7ok kibar olur , yerine g\u00f6re \u00e7ok k\u00fcf\u00fcr ederim.","protected":false,"followers_count":161,"friends_count":99,"listed_count":0,"created_at":"Mon Apr 08 15:24:17 +0000 2013","favourites_count":52,"utc_offset":10800,"time_zone":"Istanbul","geo_enabled":false,"verified":false,"statuses_count":1327,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"007970","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/863475329\/148cecd16811395455c541e54e8703a7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/863475329\/148cecd16811395455c541e54e8703a7.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000034864494\/93456369a1d628171448a89d2a856a8a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000034864494\/93456369a1d628171448a89d2a856a8a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1336914152\/1374718956","profile_link_color":"FF4F03","profile_sidebar_border_color":"5D4739","profile_sidebar_fill_color":"013542","profile_text_color":"DA000B","profile_use_background_image":
true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:08:44 +0000 2013","id":365610557463728128,"id_str":"365610557463728128","text":"Yasin (12) 5 tl'lik bayram har\u00e7l\u0131\u011f\u0131yla oyuncak tabanca ald\u0131. O ARTIK B\u0130R MAFYA","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1624819027,"id_str":"1624819027","name":"jelibonummm","screen_name":"jelibonummm1","location":"","url":null,"description":null,"protected":false,"followers_count":184,"friends_count":165,"listed_count":0,"created_at":"Sat Jul 27 06:44:50 +0000 2013","favourites_count":60,"utc_offset":10800,"time_zone":"Baghdad","geo_enabled":false,"verified":false,"statuses_count":152,"lang":"tr","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044061292\/7e13600d64a76b542d6280f2f581ec85.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044061292\/7e13600d64a76b542d6280f2f581ec85.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000262055016\/461bb5ad0cccdba717a81a57fcac6d3d_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000262055016\/461bb5ad0cccdba717a81a57fcac6d3d_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1624819027\/1375307311","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":
null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"tr"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"jelibonummm1","name":"jelibonummm","id":1624819027,"id_str":"1624819027","indices":[3,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"tr"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255333011456,"id_str":"365611255333011456","text":"It's a fine line between truth and lies .. Jesus Christ never lied still was crucified","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":938090330,"id_str":"938090330","name":"Aug18\u264c","screen_name":"King__IV","location":"Baltimore*","url":null,"description":"Jaded Sky..Hazy Clouds; Dream Accounts Large Amounts ,","protected":false,"followers_count":256,"friends_count":239,"listed_count":0,"created_at":"Sat Nov 10 00:45:00 +0000 2012","favourites_count":162,"utc_offset":-14400,"time_zone":"Eastern Time (US & 
Canada)","geo_enabled":false,"verified":false,"statuses_count":4602,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000010761478\/f26a82e6a678f22462dde790ee3c7f3a.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000010761478\/f26a82e6a678f22462dde790ee3c7f3a.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000027062707\/a369aeded217ea47b926f1aa91416606_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000027062707\/a369aeded217ea47b926f1aa91416606_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/938090330\/1375843884","profile_link_color":"000000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255345582080,"id_str":"365611255345582080","text":"RT @BdotRocks: They hate to see a nigga succeed","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":343788940,"id_str":"343788940","name":"K.","screen_name":"_Korynne","location":"","url":null,"description":"What it do 
Nephew?","protected":false,"followers_count":346,"friends_count":280,"listed_count":2,"created_at":"Thu Jul 28 02:37:18 +0000 2011","favourites_count":28,"utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":25386,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"0C1413","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028499632\/7426f3b39fe696088fea62b7a9991f87.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028499632\/7426f3b39fe696088fea62b7a9991f87.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000107309526\/ddcff36f0a705c2872693f4ebe48174b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000107309526\/ddcff36f0a705c2872693f4ebe48174b_normal.jpeg","profile_link_color":"D4C51E","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E2F0C5","profile_text_color":"AB00AB","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:30 +0000 2013","id":365611002957545472,"id_str":"365611002957545472","text":"They hate to see a nigga succeed","source":"\u003ca href=\"http:\/\/www.apple.com\" rel=\"nofollow\"\u003eiOS\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":455317411,"id_str":"455317411","name":"Brody Rocks","screen_name":"BdotRocks","location":"St. Clair, Cleveland, OH","url":"http:\/\/brodyrocks.tumblr.com","description":"Praying for wisdom on this road to riches. Long live $t. 
Clair","protected":false,"followers_count":979,"friends_count":605,"listed_count":0,"created_at":"Thu Jan 05 00:05:24 +0000 2012","favourites_count":110,"utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"verified":false,"statuses_count":12427,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000028963932\/6ed4232d679e5c98d06fbc2417b7b432.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000028963932\/6ed4232d679e5c98d06fbc2417b7b432.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000222002761\/f3995820f6e82e7cb1ac1379c0b6d2c9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000222002761\/f3995820f6e82e7cb1ac1379c0b6d2c9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/455317411\/1374042044","profile_link_color":"F20909","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"000000","profile_text_color":"FAFAFA","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"BdotRocks","name":"Brody Rocks","id":455317411,"id_str":"455317411","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353982977,"id_str":"365611255353982977","text":"Passageiro e o enigma da conquista RT @YahooPerguntas: como se conquista um onibus","source":"\u003ca 
href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":124585519,"id_str":"124585519","name":"Aeron ","screen_name":"Loop_38","location":"Planeta Terra","url":"http:\/\/instagram.com\/loop_38","description":"Sou magro eu posso.","protected":false,"followers_count":368,"friends_count":246,"listed_count":17,"created_at":"Fri Mar 19 22:18:15 +0000 2010","favourites_count":248,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":23717,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"0F0701","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000009380285\/2e333e8e950f04e3a357d25f998de265.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000009380285\/2e333e8e950f04e3a357d25f998de265.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000250179105\/091991e5718eaf619d66aebde5252ce5_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000250179105\/091991e5718eaf619d66aebde5252ce5_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/124585519\/1372357381","profile_link_color":"4651CC","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"22A6E3","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"YahooPerguntas","name":"Yahoo 
Perguntas","id":1551949094,"id_str":"1551949094","indices":[38,53]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255370752000,"id_str":"365611255370752000","text":"@Orion_prime1 *smiles* ok *snuggles up close to you*","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365606595230568448,"in_reply_to_status_id_str":"365606595230568448","in_reply_to_user_id":1653190231,"in_reply_to_user_id_str":"1653190231","in_reply_to_screen_name":"Orion_prime1","user":{"id":1655981978,"id_str":"1655981978","name":"Fiara","screen_name":"Fiara__","location":"","url":null,"description":"RP|| follows back...unless your annoying","protected":false,"followers_count":23,"friends_count":2,"listed_count":0,"created_at":"Thu Aug 08 18:45:00 +0000 2013","favourites_count":0,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":52,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000261878735\/3d6614f0d0471c1e89be56b1fcac485e_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000261878735\/3d6614f0d0471c1e89be56b1fcac485e_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,
"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Orion_prime1","name":"Orion (child)","id":1653190231,"id_str":"1653190231","indices":[0,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255362371584,"id_str":"365611255362371584","text":"@Giovana_Costa9 inglaterra rs","source":"web","truncated":false,"in_reply_to_status_id":365600844298719232,"in_reply_to_status_id_str":"365600844298719232","in_reply_to_user_id":969038605,"in_reply_to_user_id_str":"969038605","in_reply_to_screen_name":"Giovana_Costa9","user":{"id":1287204734,"id_str":"1287204734","name":"Rodrigo Martins","screen_name":"roddmartins","location":"","url":null,"description":null,"protected":false,"followers_count":172,"friends_count":104,"listed_count":0,"created_at":"Thu Mar 21 23:12:49 +0000 2013","favourites_count":1475,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":false,"verified":false,"statuses_count":4443,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/868354473\/a22d692892821e8def179b7ae80570de.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/868354473\/a22d692892821e8def179b7ae80570de.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000188798847\/368dd90f35e814359df9aa08f61c4090_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000188798847\/368dd90f35e814359df9aa08f61c4090_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1287204734\/1373728025","profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"99CC33","profile_text_color":"3E4415","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":nu
ll,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Giovana_Costa9","name":"Gisenilda do Pd","id":969038605,"id_str":"969038605","indices":[0,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255345586177,"id_str":"365611255345586177","text":"@martu_milla Quedo llena de humo como si hubiera un insendio! Y eso pero habia un olor a humo! No se quien corrio el coso de la tostadora!.","source":"web","truncated":false,"in_reply_to_status_id":365609872525500416,"in_reply_to_status_id_str":"365609872525500416","in_reply_to_user_id":598571587,"in_reply_to_user_id_str":"598571587","in_reply_to_screen_name":"martu_milla","user":{"id":555417899,"id_str":"555417899","name":"\u221e\u2648\u221eV\u221e\u2648\u221e","screen_name":"SolValent","location":"En mi Casa\u00a9\u00b0","url":null,"description":"FAN 1 @proudclaryfray I love you!\r\nLa verdad es que no todos los d\u00edas tengo ganas de sonre\u00edr, hablar y actuar como si la vida fuera perfecta\u2661 100% CHIKIPEDIA!","protected":false,"followers_count":593,"friends_count":1168,"listed_count":0,"created_at":"Mon Apr 16 20:58:42 +0000 2012","favourites_count":2944,"utc_offset":-10800,"time_zone":"Buenos 
Aires","geo_enabled":true,"verified":false,"statuses_count":8436,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000043545317\/f3c06e846ceb8368002e5759f742ec3b.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000043545317\/f3c06e846ceb8368002e5759f742ec3b.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247538094\/2d532f727c1dc07e480d141b294dd0e2_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247538094\/2d532f727c1dc07e480d141b294dd0e2_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/555417899\/1375746787","profile_link_color":"16119C","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"martu_milla","name":"Martu Milla","id":598571587,"id_str":"598571587","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255341395969,"id_str":"365611255341395969","text":"Top 5: Baltimore Ravens Fantasy Players. 
http:\/\/t.co\/8kGAK0Igsc\n@SportsRantRT @SportsBlogRT @BlogsUnite","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1216313798,"id_str":"1216313798","name":"Dan Abeshouse","screen_name":"DanAbeshouse82","location":"Baltimore,MD","url":"http:\/\/www.rantsports.com\/nfl\/author\/danabeshouse\/","description":"I'm a Baltimore Ravens writer for Rant Sports. Make sure to visit my personal page here http:\/\/www.rantsports.com\/nfl\/author\/danabeshouse\/","protected":false,"followers_count":73,"friends_count":169,"listed_count":3,"created_at":"Sun Feb 24 19:08:00 +0000 2013","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":309,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3303046786\/c5e23584a20282ea1b6377f549463a3f_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3303046786\/c5e23584a20282ea1b6377f549463a3f_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/8kGAK0Igsc","expanded_url":"http:\/\/www.rantsports.com\/fantasy\/2013\/08\/08\/top-5-baltimore-ravens-fantasy-players\/","display_url":"rantsports.com\/fantasy\/2013\/0\u2
026","indices":[41,63]}],"user_mentions":[{"screen_name":"SportsRantRT","name":"Sports Rant Retweet","id":1138267494,"id_str":"1138267494","indices":[64,77]},{"screen_name":"SportsBlogRT","name":"Sports Blog RT","id":397725597,"id_str":"397725597","indices":[78,91]},{"screen_name":"BlogsUnite","name":"Sports Blogs Unite","id":486313257,"id_str":"486313257","indices":[92,103]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255341383680,"id_str":"365611255341383680","text":"RT @ForvictoRi: D-10 Rise Up Vip !!!!!!!!!!!!!!!!!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":452126764,"id_str":"452126764","name":"\u3059\u3056\u304f\u314e\u3145\u314e\u2661","screen_name":"sukhun0216","location":"\ud22c\ud53c\uc5e0\/\uc5d0\ud504\uc5d1\uc2a4\/\ube45\ubc45\/JJ\ud504\ub85c\uc81d\ud2b8","url":null,"description":"\uff19\uff19(\uff19\uff18)\uff4c\uff49\uff4e\uff45\u304f\u3093\u30da\u30f3\u306f\u308c\u3059\u029a\u2661\u025e","protected":false,"followers_count":360,"friends_count":389,"listed_count":6,"created_at":"Sun Jan 01 13:05:31 +0000 
2012","favourites_count":272,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":6529,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000114356493\/d8e46c9328bc527d0219a9d4ff018db0_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000114356493\/d8e46c9328bc527d0219a9d4ff018db0_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/452126764\/1374412267","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:10:49 +0000 2013","id":365595982349598722,"id_str":"365595982349598722","text":"D-10 Rise Up Vip !!!!!!!!!!!!!!!!!","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":957027824,"id_str":"957027824","name":"Seung Ri","screen_name":"ForvictoRi","location":"SEOUL","url":"http:\/\/Ygbigbang.com\/Seungri","description":"Once the tweet goes out, it cant be taken back kkkk","protected":false,"followers_count":835957,"friends_count":32,"listed_count":5125,"created_at":"Mon Nov 19 08:41:13 +0000 
2012","favourites_count":0,"utc_offset":32400,"time_zone":"Seoul","geo_enabled":false,"verified":true,"statuses_count":12,"lang":"ko","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000034793228\/a5d21f2c640d1aa324a4907c2318db4a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000034793228\/a5d21f2c640d1aa324a4907c2318db4a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/957027824\/1372148547","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1768,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"vi"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ForvictoRi","name":"Seung Ri","id":957027824,"id_str":"957027824","indices":[3,14]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"vi"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255362359298,"id_str":"365611255362359298","text":"\u253b\u2533|\uff2a \uff01 \u3075\u3041\u307c\u3082\u30ea\u30c4\u30a4\u30ad\u30e3\u30b9\u3057\u3066\u304f\u308b\u304b\u3089\u96e2\u8131\u266a\u4ffa\u306e\u9811\u5f35\u3063\u3066\u3093\u3060\u3088\uff57\uff57\uff57\u693f\u306e\u4e2d\u3092\u8997\u3051\u3070\u30ad\u30c1\u30ac\u30a4\u3060\u304b\u3089\u3055\uff73\uff75\uff6b\uff6bw \uff01 
\u307e\u305f\u3057\u3070\u3089\u304f\u6765\u306a\u304b\u3063\u305f\u304b\u3089\u3054\u3081\u3093\u306dw ww \u2588\u2588\u2588\u2588\u2588\u2588\uff01 http:\/\/t.co\/DbftibpSEu #\u52a3\u5316\u30b3\u30d4\u30fc","source":"\u003ca href=\"http:\/\/bit.ly\/SiHFe6\" rel=\"nofollow\"\u003erekkacopy\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1259481662,"id_str":"1259481662","name":"\u672a\u6930@\uff12\uff18\u65e5\u8a95\u751f\u65e5\u0669( '\u03c9' )\ufeed","screen_name":"Miyay167","location":"\u30c4\u30a4\u30d7\u30ed\u3092\u5236\u4f5c\u4e2d","url":null,"description":"\u672a\u6930(\u307f\u3084)\u3068\u3044\u3044\u307e\u3059\u266a\r\n\u30ea\u30d7\u306a\u3057\u30d5\u30a9\u30ed\u30fc\u306f\u8fd4\u3057\u3066\u3044\u307e\u305b\u3093\uff01\uff01\r\n\u898f\u5236\u57a2\u2192\u3010Miyay167_k\u3011\r\n\u57fa\u672c\u304d\u307e\u3050\u308c\u3067\u305f\u307e\u306b\u66b4\u8d70\u3057\u307e\u3059\uff3c( '\u03c9')\uff0f","protected":false,"followers_count":256,"friends_count":236,"listed_count":7,"created_at":"Mon Mar 11 13:35:11 +0000 
2013","favourites_count":434,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":14386,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/871349571\/91bf650f8d5635da9c8bf9f8874a4a83.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/871349571\/91bf650f8d5635da9c8bf9f8874a4a83.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000258582524\/55c0f7494da8105f9bb20f2b5d3b5a3b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000258582524\/55c0f7494da8105f9bb20f2b5d3b5a3b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1259481662\/1375944967","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[{"text":"\u52a3\u5316\u30b3\u30d4\u30fc","indices":[115,121]}],"urls":[{"url":"http:\/\/t.co\/DbftibpSEu","expanded_url":"http:\/\/rekkacopy.com","display_url":"rekkacopy.com","indices":[92,114]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337205760,"id_str":"365611255337205760","text":"\"But there are dreams that cannot be And there are storms we cannot 
weather\"","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":80189680,"id_str":"80189680","name":"Alejandro Querales","screen_name":"Aquerales","location":"Maracaibo","url":"http:\/\/www.fabulaeterrorem.blogspot.com","description":"Jehov\u00e1, m\u00fasica cl\u00e1sica, tenor l\u00edrico. Medicina, arte, autodidacta. Creador de Fabulae Terrorem @Fabulaeterrorem L'a anima ho milionaria!","protected":false,"followers_count":197,"friends_count":229,"listed_count":1,"created_at":"Tue Oct 06 02:12:55 +0000 2009","favourites_count":73,"utc_offset":-21600,"time_zone":"Mountain Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":15389,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/308923226\/caricature_depicting_franz_li_hi_1_.jpg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/308923226\/caricature_depicting_franz_li_hi_1_.jpg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3231008624\/65df09e730a1e45467168561ac0f1247_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3231008624\/65df09e730a1e45467168561ac0f1247_normal.jpeg","profile_link_color":"A11414","profile_sidebar_border_color":"9E0F0F","profile_sidebar_fill_color":"F2EAEB","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 
2013","id":365611255366553600,"id_str":"365611255366553600","text":"@Y10Abigail of course ! :)) .xx","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610655522361344,"in_reply_to_status_id_str":"365610655522361344","in_reply_to_user_id":385703750,"in_reply_to_user_id_str":"385703750","in_reply_to_screen_name":"Y10Abigail","user":{"id":594916499,"id_str":"594916499","name":"what's good?","screen_name":"Ilovemycello1","location":"Chicago","url":null,"description":"Yoooo~ veni vidi vici ~ \u00a3 the alarm -- i just really like one direction and 5sos okay ; i ship larry + ziam \u270c","protected":false,"followers_count":99,"friends_count":280,"listed_count":1,"created_at":"Wed May 30 21:19:26 +0000 2012","favourites_count":374,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":1229,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000129692285\/6fb3564b0da025382c7db3b9303fa7f1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000129692285\/6fb3564b0da025382c7db3b9303fa7f1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/594916499\/1375835146","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtag
s":[],"urls":[],"user_mentions":[{"screen_name":"Y10Abigail","name":"Directioner Forever\u2661","id":385703750,"id_str":"385703750","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pl"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255341387776,"id_str":"365611255341387776","text":"@ImThe_Ant13 fassho , wya?","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365611204221214721,"in_reply_to_status_id_str":"365611204221214721","in_reply_to_user_id":281788133,"in_reply_to_user_id_str":"281788133","in_reply_to_screen_name":"ImThe_Ant13","user":{"id":182332211,"id_str":"182332211","name":"eat , sleep , hoop .","screen_name":"naybuhoodteeuno","location":"in the gym !","url":null,"description":"#ballisnotlife ! R.I.P Kyree , R.I.P Chris , R.I.P Antwan . OG'z ! IG - naybuhoodteeuno . married to the game.","protected":false,"followers_count":685,"friends_count":416,"listed_count":0,"created_at":"Tue Aug 24 10:45:28 +0000 
2010","favourites_count":218,"utc_offset":-28800,"time_zone":"Alaska","geo_enabled":true,"verified":false,"statuses_count":25671,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/709340096\/8ae4f0186eefbbb197fc81be5f693974.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/709340096\/8ae4f0186eefbbb197fc81be5f693974.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000027799528\/5287e9669d278bd2bea164c10cff7de9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000027799528\/5287e9669d278bd2bea164c10cff7de9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/182332211\/1361809315","profile_link_color":"088253","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ImThe_Ant13","name":"The Ant ","id":281788133,"id_str":"281788133","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ht"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255362359297,"id_str":"365611255362359297","text":"@okaka_konbu \u304a\u306f\u3088\u3001\u304a\u304b\u304b\u3053\u3093\u3076\u3002\u4eca\u65e5\u3082\uff11\u65e5\u5f35\u308a\u5207\u3063\u3066\u3044\u3053\u3046\u306a\uff01","source":"\u003ca href=\"http:\/\/twittbot.net\/\" 
rel=\"nofollow\"\u003etwittbot.net\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365609733199114240,"in_reply_to_status_id_str":"365609733199114240","in_reply_to_user_id":606691441,"in_reply_to_user_id_str":"606691441","in_reply_to_screen_name":"okaka_konbu","user":{"id":800485332,"id_str":"800485332","name":"\u4e09\u5e74\u751f\u964d\u65d7bot","screen_name":"sannenfurihata","location":"\u8aa0\u51db\u9ad8\u6821","url":"http:\/\/twpf.jp\/sannenfurihata","description":"\u9ed2\/\u5b50\/\u306e\/\u30d0\/\u30b9\/\u30b1\u306b\u767b\u5834\u3059\u308b\u964d\/\u65d7\/\u5149\/\u6a39\u306e\u975e\u516c\u5f0f\u30ad\u30e3\u30e9bot\u3067\u3059\u3002\u539f\u4f5c\u3068\u306f\u9055\u3044\u4e09\u5e74\u751f\u3067\u4e3b\u5c06\u306b\u306a\u3063\u3066\u3044\u307e\u3059\u3002\u307e\u305f\u3001\u8150\u5411\u3051\u3082\u3042\u308a\u307e\u3059\u306e\u3067\u82e6\u624b\u306a\u65b9\u306f\u3054\u6ce8\u610f\u304f\u3060\u3055\u3044\u3002\u4f55\u304b\u3042\u308a\u307e\u3057\u305f\u3089DM\u307e\u3067\u3002\u307e\u305f\u3001\u5916\u56fd\u306e\u65b9\u306f\u8aac\u660e\u66f8\u3092\u4e00\u8aad\u9858\u3044\u307e\u3059\u3002","protected":false,"followers_count":2202,"friends_count":2182,"listed_count":67,"created_at":"Mon Sep 03 14:11:07 +0000 
2012","favourites_count":10,"utc_offset":32400,"time_zone":"Irkutsk","geo_enabled":false,"verified":false,"statuses_count":7606,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000260445776\/7abafcfe8d57fcffd21905ab48c9e24c_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000260445776\/7abafcfe8d57fcffd21905ab48c9e24c_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"okaka_konbu","name":"\u304a\u304b\u304b\u3053\u3093\u3076","id":606691441,"id_str":"606691441","indices":[0,12]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255349772288,"id_str":"365611255349772288","text":"RT @sec_advisor: Goodnight everyone!\n\nGOD BLESS.\n\nJust kidding...\n\nGOD DOESN'T EXIST, or bless.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":328676707,"id_str":"328676707","name":"Nearly Anonymous","screen_name":"Audible_Gasp","location":"Canada","url":null,"description":"Author full-time writer movie-buff lover of music and all forms of art. And of course: #Atheist! Shepherds of woe beware! 
Find me here: http:\/\/t.co\/GwurEYIByj","protected":false,"followers_count":243,"friends_count":419,"listed_count":7,"created_at":"Sun Jul 03 20:37:16 +0000 2011","favourites_count":138,"utc_offset":-18000,"time_zone":"Central Time (US & Canada)","geo_enabled":false,"verified":false,"statuses_count":1228,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000026364872\/fd8abb5956445a51eb9c697539883cae.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000026364872\/fd8abb5956445a51eb9c697539883cae.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000148481606\/30ed137bee0681c6cac7fb4f8884c1b1_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000148481606\/30ed137bee0681c6cac7fb4f8884c1b1_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/328676707\/1353546258","profile_link_color":"038543","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:10:59 +0000 2013","id":365611124235841538,"id_str":"365611124235841538","text":"Goodnight everyone!\n\nGOD BLESS.\n\nJust kidding...\n\nGOD DOESN'T EXIST, or bless.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":989020656,"id_str":"989020656","name":"The Secular Advisor","screen_name":"sec_advisor","location":"South 
Africa","url":"http:\/\/www.godchecker.com\/","description":"I'm an #atheist; at least I don't worship a Bronze Age torture device! I'm about #humor #EqualityForAll & #animals \/\/ Also follow @Dementera, my wife.","protected":false,"followers_count":3248,"friends_count":1679,"listed_count":52,"created_at":"Tue Dec 04 15:50:45 +0000 2012","favourites_count":1420,"utc_offset":-36000,"time_zone":"Hawaii","geo_enabled":true,"verified":false,"statuses_count":5929,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000007385791\/d35f4a25085e37219faf015fad06baeb.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000007385791\/d35f4a25085e37219faf015fad06baeb.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000250885484\/51924672736ff458ead2d8509b831ab9_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000250885484\/51924672736ff458ead2d8509b831ab9_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/989020656\/1375809577","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"sec_advisor","name":"The Secular Advisor","id":989020656,"id_str":"989020656","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 
23:11:30 +0000 2013","id":365611255362367488,"id_str":"365611255362367488","text":"RT @__RockMeLiam: @dancezi4m sdv?","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":405678534,"id_str":"405678534","name":"tori","screen_name":"dancezi4m","location":"1d sg ddl hp spn","url":null,"description":"\u25b3 nunca vi sunshine fumar maconha \/ fan account \u25b3","protected":false,"followers_count":4563,"friends_count":4455,"listed_count":1,"created_at":"Sat Nov 05 16:59:00 +0000 2011","favourites_count":2851,"utc_offset":-7200,"time_zone":"Mid-Atlantic","geo_enabled":false,"verified":false,"statuses_count":14524,"lang":"en-gb","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000040831608\/7150db378578e1596744445790e0cb96.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000040831608\/7150db378578e1596744445790e0cb96.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247062325\/9305b6bd1ef0cbe5b429a911d07b6c36_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247062325\/9305b6bd1ef0cbe5b429a911d07b6c36_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/405678534\/1375739365","profile_link_color":"0D0505","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 23:05:53 +0000 
2013","id":365609840980140032,"id_str":"365609840980140032","text":"@dancezi4m sdv?","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":405678534,"in_reply_to_user_id_str":"405678534","in_reply_to_screen_name":"dancezi4m","user":{"id":1335414457,"id_str":"1335414457","name":"6 days to my Bday","screen_name":"__RockMeLiam","location":"LA LA LAND","url":null,"description":"Our love runs deep like a Chevy!!!","protected":false,"followers_count":743,"friends_count":811,"listed_count":0,"created_at":"Mon Apr 08 00:34:30 +0000 2013","favourites_count":292,"utc_offset":-10800,"time_zone":"Brasilia","geo_enabled":true,"verified":false,"statuses_count":1087,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"EFF3F7","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000042578314\/9200546d70716837aeb1a619135b2e8d.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000042578314\/9200546d70716837aeb1a619135b2e8d.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000237680072\/aad4f712f97e38bf7a3aae5a20d069b8_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000237680072\/aad4f712f97e38bf7a3aae5a20d069b8_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1335414457\/1375665312","profile_link_color":"C481C0","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"dancezi4m","name":"tori","id":405678534,"id_str":"405678534","indices
":[0,10]}]},"favorited":false,"retweeted":false,"lang":"und"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"__RockMeLiam","name":"6 days to my Bday","id":1335414457,"id_str":"1335414457","indices":[3,16]},{"screen_name":"dancezi4m","name":"tori","id":405678534,"id_str":"405678534","indices":[18,28]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"und"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255362355200,"id_str":"365611255362355200","text":"RT @fuckch4rlie: [walks into chair] \n\nme: sorry \n\nchair: you fucking should be","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1007211524,"id_str":"1007211524","name":"\u2661 rosy","screen_name":"inderfaen","location":"somewhere in neverland","url":"http:\/\/furtherwego.tumblr.com","description":"Guds beste barn.","protected":false,"followers_count":93,"friends_count":170,"listed_count":0,"created_at":"Wed Dec 12 20:12:41 +0000 
2012","favourites_count":1017,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":2182,"lang":"no","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/825730228\/b7a06ac5d0fee77786ec561f31aee881.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/825730228\/b7a06ac5d0fee77786ec561f31aee881.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000173352490\/cc09ffa741be94a8bbc47fd1b6142564_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000173352490\/cc09ffa741be94a8bbc47fd1b6142564_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1007211524\/1374541347","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:52:25 +0000 2013","id":365606450258640896,"id_str":"365606450258640896","text":"[walks into chair] \n\nme: sorry \n\nchair: you fucking should be","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1011881286,"id_str":"1011881286","name":"what","screen_name":"fuckch4rlie","location":"ariana follows ","url":null,"description":"nobody's perfect i gotta twerk it","protected":false,"followers_count":15781,"friends_count":666,"listed_count":36,"created_at":"Fri Dec 14 20:15:49 +0000 
2012","favourites_count":960,"utc_offset":3600,"time_zone":"London","geo_enabled":false,"verified":false,"statuses_count":9890,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/738332147\/4e9b630f10a8efe92805f2fe43084924.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/738332147\/4e9b630f10a8efe92805f2fe43084924.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/3401386697\/8c38965124e599ca3f1df9a571db6508_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/3401386697\/8c38965124e599ca3f1df9a571db6508_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1011881286\/1370455462","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":48,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"fuckch4rlie","name":"what","id":1011881286,"id_str":"1011881286","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255332999168,"id_str":"365611255332999168","text":"@NORA__0504 \u79c1\u898f\u5236\u306e\u3084\u3064\u307e\u3068\u3081\u3066\u30d5\u30a9\u30ed\u30fc\u3044\u3063\u3066\u308b\u3051\u3069\u51cd\u7d50\u3055\u308c\u305f\u3053\u3068\u306a\u3044\uff57\uff57\uff57\uff57","source":"\u003ca href=\"http:\/\/twtr.jp\" rel=\"nofollow\"\u003eKeitai 
Web\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610917683146752,"in_reply_to_status_id_str":"365610917683146752","in_reply_to_user_id":1424988673,"in_reply_to_user_id_str":"1424988673","in_reply_to_screen_name":"NORA__0504","user":{"id":331988424,"id_str":"331988424","name":"\u672a\u685c(\u5947\u884c\u7a2e)@\u590f\u30b3\u30df\u4e00\u65e5\u76ee","screen_name":"12miou04","location":"\u6771\u4eac\u90fd","url":"http:\/\/www.pixiv.net\/member.php?id=1704087","description":"\u7d75\u63cf\u3044\u3066\u307e\u3059\u300218\u2191 \u5fc5\u8aad\u2192http:\/\/twpf.jp\/12miou04 \u898f\u5236\u57a2\u3010@mi1204ou\u3011 \u30a2\u30a4\u30b3\u30f3\u30d8\u30c3\u30c0\u30fc\u81ea\u4f5c","protected":false,"followers_count":1749,"friends_count":512,"listed_count":58,"created_at":"Sat Jul 09 01:38:48 +0000 2011","favourites_count":3543,"utc_offset":32400,"time_zone":"Osaka","geo_enabled":false,"verified":false,"statuses_count":61498,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/840604521\/3e15939dbdea122c20705965c3a6231f.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/840604521\/3e15939dbdea122c20705965c3a6231f.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000185829114\/c831e22bb76bbbd3a91548bc577dddae_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000185829114\/c831e22bb76bbbd3a91548bc577dddae_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/331988424\/1368621789","profile_link_color":"CCCCCC","profile_sidebar_border_color":"CCCCCC","profile_sidebar_fill_color":"000000","profile_text_color":"FFFFFF","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates
":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"NORA__0504","name":"\u5e7d\u9b3c@(ETTN\u6240\u5c5e)24\u65e5\u30ca\u30f3\u30b8\u30e3","id":1424988673,"id_str":"1424988673","indices":[0,11]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255366553601,"id_str":"365611255366553601","text":"@feastings bE NOIZR","source":"web","truncated":false,"in_reply_to_status_id":365608343399702529,"in_reply_to_status_id_str":"365608343399702529","in_reply_to_user_id":464369737,"in_reply_to_user_id_str":"464369737","in_reply_to_screen_name":"feastings","user":{"id":1023882680,"id_str":"1023882680","name":"surf's up shinji-kun","screen_name":"spacevz","location":"CA","url":"http:\/\/viazi.tumblr.com","description":"hunger like the worlf","protected":false,"followers_count":68,"friends_count":37,"listed_count":0,"created_at":"Thu Dec 20 09:18:19 +0000 2012","favourites_count":10600,"utc_offset":-25200,"time_zone":"Pacific Time (US & 
Canada)","geo_enabled":true,"verified":false,"statuses_count":18392,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C2FFE4","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000036299070\/75081cf6d681ff66631e67c38390a0f4.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000036299070\/75081cf6d681ff66631e67c38390a0f4.png","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000244465824\/9dc8fafcc50522483d1dab6a99028911_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000244465824\/9dc8fafcc50522483d1dab6a99028911_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1023882680\/1375069026","profile_link_color":"FF4A86","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"feastings","name":"i did this to myself","id":464369737,"id_str":"464369737","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"pt"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255370743808,"id_str":"365611255370743808","text":"@Bresnaa14 thanks babyyy, zal ikdoen loveu2 !!","source":"\u003ca href=\"http:\/\/ubersocial.com\" rel=\"nofollow\"\u003eUberSocial for 
BlackBerry\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365599757906870273,"in_reply_to_status_id_str":"365599757906870273","in_reply_to_user_id":1310757962,"in_reply_to_user_id_str":"1310757962","in_reply_to_screen_name":"Bresnaa14","user":{"id":621153842,"id_str":"621153842","name":"BIRTHDAAYGIIRRLLLLLL","screen_name":"MerrieK_","location":" Freshkidddd \u2661","url":null,"description":"cute net @FreshKidmusic\u2665 &I'm in love with @RonnieFlex2907 \u2665__\u2661 !","protected":false,"followers_count":642,"friends_count":380,"listed_count":0,"created_at":"Thu Jun 28 17:18:35 +0000 2012","favourites_count":29,"utc_offset":7200,"time_zone":"Amsterdam","geo_enabled":false,"verified":false,"statuses_count":73577,"lang":"nl","contributors_enabled":false,"is_translator":false,"profile_background_color":"D6229D","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000046900361\/606623a0dbfc3f3956213851bb7504f7.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000046900361\/606623a0dbfc3f3956213851bb7504f7.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000147695150\/e3ac0ba1f0d90b5b4f67e4b0b7a5c921_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000147695150\/e3ac0ba1f0d90b5b4f67e4b0b7a5c921_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/621153842\/1375923001","profile_link_color":"F71EE9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"Bresnaa14","name":"_BrE$NaA^","id":1310757962,"id_str
":"1310757962","indices":[0,10]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"nl"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255358164993,"id_str":"365611255358164993","text":"@SummerRoseLacey why does that matter lol, London raves are soo shit. And London is a cunt to drive though in the morning","source":"\u003ca href=\"http:\/\/twitter.com\/#!\/download\/ipad\" rel=\"nofollow\"\u003eTwitter for iPad\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":365610329658490881,"in_reply_to_status_id_str":"365610329658490881","in_reply_to_user_id":475046609,"in_reply_to_user_id_str":"475046609","in_reply_to_screen_name":"SummerRoseLacey","user":{"id":790155674,"id_str":"790155674","name":"parmsss","screen_name":"Parmsss","location":"England - Reading","url":null,"description":"Variety is key","protected":false,"followers_count":475,"friends_count":364,"listed_count":0,"created_at":"Wed Aug 29 21:07:48 +0000 2012","favourites_count":209,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":924,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000247781115\/1f1549e3bd14b7309d652997c69c716b_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000247781115\/1f1549e3bd14b7309d652997c69c716b_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/790155674\/1372684247","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_r
equest_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"SummerRoseLacey","name":"summer ","id":475046609,"id_str":"475046609","indices":[0,16]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255349776384,"id_str":"365611255349776384","text":"\u6cbb\u308a\u305d\u3046\u3067\u6cbb\u3063\u3066\u304f\u308c\u306d\u3047\u30fc\u306a\u3041\u30fc\u2026\n\n\u98a8\u90aa\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":579757939,"id_str":"579757939","name":"YAMA -SAN","screen_name":"113Yama","location":"","url":null,"description":"\u307b\u3093\u306e\u51fa\u6765\u5fc3\u3067\u59cb\u3081\u305f\u307f\u305f\u3044\u3067\u3059\u3088","protected":false,"followers_count":17,"friends_count":69,"listed_count":1,"created_at":"Mon May 14 10:31:49 +0000 
2012","favourites_count":1,"utc_offset":null,"time_zone":null,"geo_enabled":false,"verified":false,"statuses_count":769,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000132634729\/522bfe94d3c355ecbca4b2b6790e576b_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000132634729\/522bfe94d3c355ecbca4b2b6790e576b_normal.png","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"ja"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353982976,"id_str":"365611255353982976","text":"RT @beautstyles: I CAN ACTUALLY IMAGINE HARRY DOING THISN http:\/\/t.co\/Zxi9O2Eie6","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":18615041,"id_str":"18615041","name":"Victoria McCrory","screen_name":"VikkiMcCrory","location":"Poughkeepsie, NY","url":"https:\/\/www.facebook.com\/WLDCHLD?ref=br_tf","description":"20. NY. I sing & play guitar. Neil Patrick Harris is my spirit animal. 
@WLDCHLDmusic http:\/\/ask.fm\/vikkimccrory","protected":false,"followers_count":417,"friends_count":159,"listed_count":2,"created_at":"Sun Jan 04 22:50:59 +0000 2009","favourites_count":1019,"utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"verified":false,"statuses_count":11190,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/760037072\/9a29d137efe5360ef1f77330cc5b1bdb.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/760037072\/9a29d137efe5360ef1f77330cc5b1bdb.jpeg","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000158536873\/3ea1d46a61d6236d7a9e6ec25d7ebc71_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000158536873\/3ea1d46a61d6236d7a9e6ec25d7ebc71_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/18615041\/1375857178","profile_link_color":"009999","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Thu Aug 08 22:21:45 +0000 2013","id":365598736237330433,"id_str":"365598736237330433","text":"I CAN ACTUALLY IMAGINE HARRY DOING THISN http:\/\/t.co\/Zxi9O2Eie6","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":130204279,"id_str":"130204279","name":" \u2741 millie 
\u2741","screen_name":"beautstyles","location":"uk | IG:millieweaverxo","url":null,"description":"\u2661 cute as a button every single one of you \u2661 niam\/5.justin rt'd+follows.5\/4","protected":false,"followers_count":51618,"friends_count":7642,"listed_count":116,"created_at":"Tue Apr 06 16:43:58 +0000 2010","favourites_count":8166,"utc_offset":3600,"time_zone":"Casablanca","geo_enabled":false,"verified":false,"statuses_count":50414,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/868504533\/295e6dbca220ec403f996484fd190ec8.gif","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/868504533\/295e6dbca220ec403f996484fd190ec8.gif","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000239739280\/64c04cb9143e0aa7d9e0bc0d71c18662_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000239739280\/64c04cb9143e0aa7d9e0bc0d71c18662_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/130204279\/1375619451","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"000000","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":51,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"id":365598736241524736,"id_str":"365598736241524736","indices":[41,63],"media_url":"http:\/\/pbs.twimg.com\/media\/BRLeHRQCAAAPUvw.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/BRLeHRQCAAAPUvw.jpg","url":"http:\/\/t.co\/Zxi9O2Eie6","display_url":"pic.twitter.com\/Zxi9O2Eie6","expanded_url":"http:\/\/twitter.com\/beautstyles\/status\/365598736237330433\/pho
to\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":900,"resize":"fit"},"large":{"w":640,"h":960,"resize":"fit"},"small":{"w":340,"h":510,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"lang":"en"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"beautstyles","name":" \u2741 millie \u2741","id":130204279,"id_str":"130204279","indices":[3,15]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255337197568,"id_str":"365611255337197568","text":"RT @EsTatuaje: Fav si eres de Espa\u00f1a.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1187294178,"id_str":"1187294178","name":" I love my bitches","screen_name":"the_samuel01","location":"","url":null,"description":null,"protected":false,"followers_count":182,"friends_count":447,"listed_count":0,"created_at":"Sat Feb 16 20:02:51 +0000 
2013","favourites_count":16,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":652,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000046852191\/f922357851e7880de34ddba3e39ea8cb_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000046852191\/f922357851e7880de34ddba3e39ea8cb_normal.jpeg","profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Aug 07 07:48:26 +0000 2013","id":365016569077710849,"id_str":"365016569077710849","text":"Fav si eres de Espa\u00f1a.","source":"web","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1638812636,"id_str":"1638812636","name":"Tatuajes ","screen_name":"EsTatuaje","location":"","url":null,"description":"Contacto\/Publicidad: estatuaje@gmail.com | \u00bfTienes un tatuaje y quieres que lo publiquemos? 
env\u00edanos un correo con la foto y su significado.","protected":false,"followers_count":19923,"friends_count":8,"listed_count":8,"created_at":"Thu Aug 01 20:40:41 +0000 2013","favourites_count":3,"utc_offset":-21600,"time_zone":"Central America","geo_enabled":false,"verified":false,"statuses_count":104,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000044463759\/4c5eb4a02b0a2b2db8b60a2220486da2.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000044463759\/4c5eb4a02b0a2b2db8b60a2220486da2.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000248086408\/095d2bae16b538efcc6b57e625f5ff84_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000248086408\/095d2bae16b538efcc6b57e625f5ff84_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1638812636\/1375652917","profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":4,"entities":{"hashtags":[],"urls":[],"user_mentions":[]},"favorited":false,"retweeted":false,"lang":"es"},"retweet_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"EsTatuaje","name":"Tatuajes ","id":1638812636,"id_str":"1638812636","indices":[3,13]}]},"favorited":false,"retweeted":false,"filter_level":"medium","lang":"es"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353970688,"id_str":"365611255353970688","text":"Ooooo my brother... 
\u2764\ud83d\udc9c http:\/\/t.co\/9ywmj3srgO","source":"\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":84970027,"id_str":"84970027","name":"duda ","screen_name":"dudss_","location":"","url":"http:\/\/come-to-believe-in-love.tumblr.com\/","description":null,"protected":false,"followers_count":294,"friends_count":326,"listed_count":24,"created_at":"Sun Oct 25 00:15:55 +0000 2009","favourites_count":329,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":44048,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FF6699","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/750034479\/bbd7d130c5e28af7172184c0c69e3bf8.jpeg","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/750034479\/bbd7d130c5e28af7172184c0c69e3bf8.jpeg","profile_background_tile":true,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000170753398\/4699071b5c555ea569e64334e2f0527a_normal.jpeg","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000170753398\/4699071b5c555ea569e64334e2f0527a_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/84970027\/1356747500","profile_link_color":"3652D1","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"FFFFFF","profile_text_color":"FF3C00","profile_use_background_image":true,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"http:\/\/t.co\/9ywmj3srgO","expanded_url":"http:\/\/instagram.com\/p\/cxST-QunV4\/","display_url":"instagr
am.com\/p\/cxST-QunV4\/","indices":[23,45]}],"user_mentions":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"en"} +{"created_at":"Thu Aug 08 23:11:30 +0000 2013","id":365611255353970690,"id_str":"365611255353970690","text":"@dileyland da uma olhada https:\/\/t.co\/D3ulVL7xfI","source":"web","truncated":false,"in_reply_to_status_id":365610326839918593,"in_reply_to_status_id_str":"365610326839918593","in_reply_to_user_id":911209723,"in_reply_to_user_id_str":"911209723","in_reply_to_screen_name":"dileyland","user":{"id":191136672,"id_str":"191136672","name":"Lara","screen_name":"demibydemi","location":"","url":null,"description":"\u2661","protected":false,"followers_count":536,"friends_count":498,"listed_count":5,"created_at":"Wed Sep 15 18:00:15 +0000 2010","favourites_count":917,"utc_offset":-14400,"time_zone":"Santiago","geo_enabled":true,"verified":false,"statuses_count":21052,"lang":"pt","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http:\/\/a0.twimg.com\/profile_background_images\/378800000041248508\/ccfa123b2aaeb91c0d79ded4dd7ec103.png","profile_background_image_url_https":"https:\/\/si0.twimg.com\/profile_background_images\/378800000041248508\/ccfa123b2aaeb91c0d79ded4dd7ec103.png","profile_background_tile":false,"profile_image_url":"http:\/\/a0.twimg.com\/profile_images\/378800000256389162\/234c25ef157d877ba2d26e8a322ed265_normal.png","profile_image_url_https":"https:\/\/si0.twimg.com\/profile_images\/378800000256389162\/234c25ef157d877ba2d26e8a322ed265_normal.png","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/191136672\/1375901638","profile_link_color":"000000","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null}
,"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/D3ulVL7xfI","expanded_url":"https:\/\/pbs.twimg.com\/media\/BFayWXuCYAEecP9.jpg","display_url":"pbs.twimg.com\/media\/BFayWXuC\u2026","indices":[25,48]}],"user_mentions":[{"screen_name":"dileyland","name":"Duda","id":911209723,"id_str":"911209723","indices":[0,10]}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"medium","lang":"pt"} diff --git a/src/kudu/twitter-demo/ingest_firehose.cc b/src/kudu/twitter-demo/ingest_firehose.cc new file mode 100644 index 000000000000..9fb4c42c5da0 --- /dev/null +++ b/src/kudu/twitter-demo/ingest_firehose.cc @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/client/client.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/once.h" +#include "kudu/rpc/messenger.h" +#include "kudu/master/master.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/twitter-demo/oauth.h" +#include "kudu/twitter-demo/insert_consumer.h" +#include "kudu/twitter-demo/twitter_streamer.h" +#include "kudu/util/flags.h" +#include "kudu/util/logging.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +DEFINE_string(twitter_firehose_sink, "console", + "Where to write firehose output.\n" + "Valid values: console,rpc"); +DEFINE_string(twitter_rpc_master_address, "localhost", + "Address of master for the cluster to write to"); + +DEFINE_string(twitter_firehose_source, "api", + "Where to obtain firehose input.\n" + "Valid values: api,file"); +DEFINE_string(twitter_firehose_file, "/dev/fd/0", + "File to read firehose data from, if 'file' is configured."); + + +using std::string; + +namespace kudu { +namespace twitter_demo { + +using client::sp::shared_ptr; +using tserver::TabletServerServiceProxy; + +// Consumer which simply logs messages to the console. 
+class LoggingConsumer : public TwitterConsumer { + public: + virtual void ConsumeJSON(const Slice& json) OVERRIDE { + std::cout << json.ToString(); + } +}; + +gscoped_ptr CreateInsertConsumer() { + shared_ptr client; + CHECK_OK(client::KuduClientBuilder() + .add_master_server_addr(FLAGS_twitter_rpc_master_address) + .Build(&client)); + + gscoped_ptr ret(new InsertConsumer(client)); + CHECK_OK(ret->Init()); + return gscoped_ptr(ret.Pass()); // up-cast +} + +static void IngestFromFile(const string& file, gscoped_ptr consumer) { + std::ifstream in(file.c_str()); + CHECK(in.is_open()) << "Couldn't open " << file; + + string line; + while (std::getline(in, line)) { + consumer->ConsumeJSON(line); + } +} + +static int main(int argc, char** argv) { + // Since this is meant to be run by a user, not a daemon, + // log to stderr by default. + FLAGS_logtostderr = 1; + kudu::ParseCommandLineFlags(&argc, &argv, true); + kudu::InitGoogleLoggingSafe(argv[0]); + + gscoped_ptr consumer; + if (FLAGS_twitter_firehose_sink == "console") { + consumer.reset(new LoggingConsumer); + } else if (FLAGS_twitter_firehose_sink == "rpc") { + consumer = CreateInsertConsumer(); + } else { + LOG(FATAL) << "Unknown sink: " << FLAGS_twitter_firehose_sink; + } + + if (FLAGS_twitter_firehose_source == "api") { + TwitterStreamer streamer(consumer.get()); + CHECK_OK(streamer.Init()); + CHECK_OK(streamer.Start()); + CHECK_OK(streamer.Join()); + } else if (FLAGS_twitter_firehose_source == "file") { + IngestFromFile(FLAGS_twitter_firehose_file, consumer.Pass()); + } else { + LOG(FATAL) << "Unknown source: " << FLAGS_twitter_firehose_source; + } + return 0; +} + +} // namespace twitter_demo +} // namespace kudu + +int main(int argc, char** argv) { + return kudu::twitter_demo::main(argc, argv); +} diff --git a/src/kudu/twitter-demo/insert_consumer.cc b/src/kudu/twitter-demo/insert_consumer.cc new file mode 100644 index 000000000000..370b8cc6b5ac --- /dev/null +++ b/src/kudu/twitter-demo/insert_consumer.cc @@ 
-0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/twitter-demo/insert_consumer.h" + +#include +#include +#include +#include +#include + +#include "kudu/common/wire_protocol.h" +#include "kudu/common/row.h" +#include "kudu/common/schema.h" +#include "kudu/client/client.h" +#include "kudu/gutil/bind.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/tserver/tserver.pb.h" +#include "kudu/twitter-demo/parser.h" +#include "kudu/twitter-demo/twitter-schema.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace twitter_demo { + +using tserver::TabletServerServiceProxy; +using tserver::WriteRequestPB; +using tserver::WriteResponsePB; +using rpc::RpcController; +using kudu::client::KuduInsert; +using kudu::client::KuduClient; +using kudu::client::KuduSession; +using kudu::client::KuduStatusCallback; +using kudu::client::KuduTable; +using kudu::client::KuduTableCreator; + +FlushCB::FlushCB(InsertConsumer* consumer) + : consumer_(consumer) { +} + +FlushCB::~FlushCB() { +} + +void FlushCB::Run(const Status& status) { + consumer_->BatchFinished(status); +} + +InsertConsumer::InsertConsumer(const client::sp::shared_ptr &client) + : 
initted_(false), + schema_(CreateTwitterSchema()), + flush_cb_(this), + client_(client), + request_pending_(false) { +} + +Status InsertConsumer::Init() { + const char *kTableName = "twitter"; + Status s = client_->OpenTable(kTableName, &table_); + if (s.IsNotFound()) { + gscoped_ptr table_creator(client_->NewTableCreator()); + RETURN_NOT_OK_PREPEND(table_creator->table_name(kTableName) + .schema(&schema_) + .Create(), + "Couldn't create twitter table"); + s = client_->OpenTable(kTableName, &table_); + } + RETURN_NOT_OK_PREPEND(s, "Couldn't open twitter table"); + + session_ = client_->NewSession(); + session_->SetTimeoutMillis(1000); + CHECK_OK(session_->SetFlushMode(KuduSession::MANUAL_FLUSH)); + initted_ = true; + return Status::OK(); +} + +InsertConsumer::~InsertConsumer() { + // TODO: to be safe, we probably need to cancel any current RPC, + // or else the callback will get called on the destroyed object. + // Given this is just demo code, cutting this corner. + CHECK(!request_pending_); +} + +void InsertConsumer::BatchFinished(const Status& s) { + boost::lock_guard l(lock_); + request_pending_ = false; + if (!s.ok()) { + bool overflow; + vector errors; + ElementDeleter d(&errors); + session_->GetPendingErrors(&errors, &overflow); + for (const client::KuduError* error : errors) { + LOG(WARNING) << "Failed to insert row " << error->failed_op().ToString() + << ": " << error->status().ToString(); + } + } +} + +void InsertConsumer::ConsumeJSON(const Slice& json_slice) { + CHECK(initted_); + string json = json_slice.ToString(); + Status s = parser_.Parse(json, &event_); + if (!s.ok()) { + LOG(WARNING) << "Unable to parse JSON string: " << json << ": " << s.ToString(); + return; + } + + if (event_.type == DELETE_TWEET) { + // Not currently supported. 
+ return; + } + + string created_at = TwitterEventParser::ReformatTime(event_.tweet_event.created_at); + + gscoped_ptr ins(table_->NewInsert()); + KuduPartialRow* r = ins->mutable_row(); + CHECK_OK(r->SetInt64("tweet_id", event_.tweet_event.tweet_id)); + CHECK_OK(r->SetStringCopy("text", event_.tweet_event.text)); + CHECK_OK(r->SetStringCopy("source", event_.tweet_event.source)); + CHECK_OK(r->SetStringCopy("created_at", created_at)); + CHECK_OK(r->SetInt64("user_id", event_.tweet_event.user_id)); + CHECK_OK(r->SetStringCopy("user_name", event_.tweet_event.user_name)); + CHECK_OK(r->SetStringCopy("user_description", event_.tweet_event.user_description)); + CHECK_OK(r->SetStringCopy("user_location", event_.tweet_event.user_location)); + CHECK_OK(r->SetInt32("user_followers_count", event_.tweet_event.user_followers_count)); + CHECK_OK(r->SetInt32("user_friends_count", event_.tweet_event.user_friends_count)); + CHECK_OK(r->SetStringCopy("user_image_url", event_.tweet_event.user_image_url)); + CHECK_OK(session_->Apply(ins.release())); + + // TODO: once the auto-flush mode is implemented, switch to using that + // instead of the manual batching here + bool do_flush = false; + { + boost::lock_guard l(lock_); + if (!request_pending_) { + request_pending_ = true; + do_flush = true; + } + } + if (do_flush) { + VLOG(1) << "Sending batch of " << session_->CountBufferedOperations(); + session_->FlushAsync(&flush_cb_); + } +} + +} // namespace twitter_demo +} // namespace kudu diff --git a/src/kudu/twitter-demo/insert_consumer.h b/src/kudu/twitter-demo/insert_consumer.h new file mode 100644 index 000000000000..826ca11a4e21 --- /dev/null +++ b/src/kudu/twitter-demo/insert_consumer.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TWITTER_DEMO_INSERT_CONSUMER_H +#define KUDU_TWITTER_DEMO_INSERT_CONSUMER_H + +#include "kudu/twitter-demo/twitter_streamer.h" + +#include + +#include "kudu/client/callbacks.h" +#include "kudu/client/schema.h" +#include "kudu/client/shared_ptr.h" +#include "kudu/rpc/rpc_controller.h" +#include "kudu/tserver/tserver_service.proxy.h" +#include "kudu/twitter-demo/parser.h" +#include "kudu/util/locks.h" +#include "kudu/util/slice.h" + +namespace kudu { +namespace client { +class KuduClient; +class KuduTable; +class KuduSession; +class KuduStatusCallback; +} // namespace client + +namespace twitter_demo { + +class InsertConsumer; + +class FlushCB : public client::KuduStatusCallback { + public: + explicit FlushCB(InsertConsumer* consumer); + + virtual ~FlushCB(); + + virtual void Run(const Status& status) OVERRIDE; + private: + InsertConsumer* consumer_; +}; + +// Consumer of tweet data which parses the JSON and inserts +// into a remote tablet via RPC. 
+class InsertConsumer : public TwitterConsumer { + public: + explicit InsertConsumer( + const client::sp::shared_ptr &client); + ~InsertConsumer(); + + Status Init(); + + virtual void ConsumeJSON(const Slice& json) OVERRIDE; + + private: + friend class FlushCB; + + void BatchFinished(const Status& s); + + bool initted_; + + client::KuduSchema schema_; + FlushCB flush_cb_; + TwitterEventParser parser_; + + // Reusable object for latest event. + TwitterEvent event_; + + client::sp::shared_ptr client_; + client::sp::shared_ptr session_; + client::sp::shared_ptr table_; + + simple_spinlock lock_; + bool request_pending_; +}; + +} // namespace twitter_demo +} // namespace kudu +#endif diff --git a/src/kudu/twitter-demo/oauth-test.cc b/src/kudu/twitter-demo/oauth-test.cc new file mode 100644 index 000000000000..3b9e9229028e --- /dev/null +++ b/src/kudu/twitter-demo/oauth-test.cc @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +#include "kudu/twitter-demo/oauth.h" + +#include +#include + +#include "kudu/util/debug/leakcheck_disabler.h" + +using std::string; + +namespace kudu { +namespace twitter_demo { + +// Test case from Appendix A of the OAuth 1.0 standard: +// http://oauth.net/core/1.0/ +TEST(OAuthTest, TestSignature) { + const string kConsumerKey = "dpf43f3p2l4k3l03"; + const string kConsumerSecret = "kd94hf93k423kf44"; + const string kTokenKey = "nnch734d00sl2jdk"; + const string kTokenSecret = "pfkkdhi9sl3r4s00"; + + // Necessary to squelch a leak originating in the NSS SSL library. + debug::ScopedLeakCheckDisabler disabler; + + OAuthRequest req("GET", "http://photos.example.net/photos"); + + req.AddPair("oauth_consumer_key", kConsumerKey); + req.AddPair("oauth_token", kTokenKey); + req.AddPair("oauth_signature_method", "HMAC-SHA1"); + req.AddPair("oauth_timestamp", "1191242096"); + req.AddPair("oauth_nonce", "kllo9940pd9333jh"); + req.AddPair("oauth_version", "1.0"); + req.AddPair("file", "vacation.jpg"); + req.AddPair("size", "original"); + string base = req.SignatureBaseString(); + ASSERT_EQ(string("GET&http%3A%2F%2Fphotos.example.net%2Fphotos&file%3Dvacation.jpg%26" + "oauth_consumer_key%3Ddpf43f3p2l4k3l03%26oauth_nonce%3Dkllo9940pd9333jh%26" + "oauth_signature_method%3DHMAC-SHA1%26oauth_timestamp%3D1191242096%26" + "oauth_token%3Dnnch734d00sl2jdk%26oauth_version%3D1.0%26size%3Doriginal"), + base); + + string sig = req.Signature(kConsumerSecret, kTokenSecret); + ASSERT_EQ("tR3+Ty81lMeYAr/Fid0kMTYa/WM=", sig); +} + + +} // namespace twitter_demo +} // namespace kudu diff --git a/src/kudu/twitter-demo/oauth.cc b/src/kudu/twitter-demo/oauth.cc new file mode 100644 index 000000000000..efffcb4a0ba6 --- /dev/null +++ b/src/kudu/twitter-demo/oauth.cc @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/twitter-demo/oauth.h" + +#include +#include +#include +#include +extern "C" { +#include +} + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/util.h" + +using std::pair; +using std::string; +using std::vector; + +namespace kudu { +namespace twitter_demo { + +static string EscapeUrl(const string& str) { + gscoped_ptr enc(oauth_url_escape(str.c_str())); + return string(enc.get()); +} + +static string GenerateNonce() { + gscoped_ptr ret(oauth_gen_nonce()); + return string(ret.get()); +} + + +OAuthRequest::OAuthRequest(const string& http_method, + const string& url) + : http_method_(http_method), + url_(url) { +} + +void OAuthRequest::AddStandardOAuthFields(const string& consumer_key, + const string& token_key) { + AddPair("oauth_version", "1.0"); + AddPair("oauth_signature_method", "HMAC-SHA1"); + AddPair("oauth_nonce", GenerateNonce()); + AddPair("oauth_timestamp", boost::lexical_cast(time(NULL))); + AddPair("oauth_consumer_key", consumer_key); + AddPair("oauth_token", token_key); +} + +void OAuthRequest::AddPair(const string& key, const string& value) { + kv_pairs_.push_back(std::make_pair(key, value)); +} + +static bool ComparePair(const pair& a, + const pair& b) { + if (a.first < b.first) return true; + else if (a.first > b.first) return 
false; + + return a.second < b.second; +} + +string OAuthRequest::SignatureBaseString() const { + vector > sorted_pairs(kv_pairs_); + std::sort(sorted_pairs.begin(), sorted_pairs.end(), &ComparePair); + string ret; + ret.append(http_method_); + ret.append("&"); + ret.append(EscapeUrl(url_)); + + string kvpairs; + bool first = true; + for (const StringPair& p : sorted_pairs) { + if (!first) { + kvpairs.append("&"); + } + first = false; + kvpairs.append(p.first); + kvpairs.append("="); + kvpairs.append(EscapeUrl(p.second)); + } + ret.append("&"); + ret.append(EscapeUrl(kvpairs)); + return ret; +} + +string OAuthRequest::Signature(const string& consumer_secret, + const string& token_secret) const { + string base = SignatureBaseString(); + string key = consumer_secret + "&" + token_secret; + gscoped_ptr hmacced( + oauth_sign_hmac_sha1_raw(base.c_str(), base.size(), key.c_str(), key.size())); + CHECK(hmacced.get()); + return string(hmacced.get()); +} + +string OAuthRequest::AuthHeader(const string& consumer_secret, + const string& token_secret) const { + string sig = Signature(consumer_secret, token_secret); + + string ret = "Authorization: OAuth realm=\"\""; + for (const StringPair& p : kv_pairs_) { + if (!HasPrefixString(p.first, "oauth_")) continue; + ret.append(", "); + ret.append(p.first).append("=\"").append(EscapeUrl(p.second)).append("\""); + } + ret.append(", oauth_signature_method=\"HMAC-SHA1\""); + ret.append(", oauth_signature=\"").append(EscapeUrl(sig)).append("\""); + return ret; +} + +} // namespace twitter_demo +} // namespace kudu diff --git a/src/kudu/twitter-demo/oauth.h b/src/kudu/twitter-demo/oauth.h new file mode 100644 index 000000000000..b24f45404037 --- /dev/null +++ b/src/kudu/twitter-demo/oauth.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TWITTERDEMO_OAUTH_H +#define KUDU_TWITTERDEMO_OAUTH_H + +#include + +#include +#include +#include + +namespace kudu { +namespace twitter_demo { + +// An OpenAuth-authenticated request. See oauth-test.cc for +// usage examples. +class OAuthRequest { + private: + typedef std::pair StringPair; + + public: + OAuthRequest(const std::string& http_method, + const std::string& url); + + // Add a key-value pair to the OAauth request. + void AddPair(const std::string& key, const std::string& value); + + // Add the standard OAuth fields to the request, including + // generating a nonce and filling in the request timestamp. + void AddStandardOAuthFields(const std::string& consumer_key, + const std::string& token_key); + + // Generate the HTTP Authorization header to authenticate this request. + // This is the entire header, including the 'Authorization: ' prefix. + std::string AuthHeader(const std::string& consumer_secret, + const std::string& token_secret) const; + + private: + FRIEND_TEST(OAuthTest, TestSignature); + + std::string SignatureBaseString() const; + std::string Signature(const std::string& consumer_secret, + const std::string& token_secret) const; + + std::string http_method_; + std::string url_; + + // The entries used in the request. 
+ std::vector kv_pairs_; +}; + +} // namespace twitter_demo +} // namespace kudu +#endif diff --git a/src/kudu/twitter-demo/parser-test.cc b/src/kudu/twitter-demo/parser-test.cc new file mode 100644 index 000000000000..2870864bf641 --- /dev/null +++ b/src/kudu/twitter-demo/parser-test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/twitter-demo/parser.h" + +#include +#include + +#include "kudu/gutil/strings/split.h" +#include "kudu/util/env.h" +#include "kudu/util/path_util.h" +#include "kudu/util/test_util.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace twitter_demo { + +// Return the directory of the currently-running executable. 
+static string GetExecutableDir() { + string exec; + CHECK_OK(Env::Default()->GetExecutablePath(&exec)); + return DirName(exec); +} + +static Status LoadFile(const string& name, vector* lines) { + string path = JoinPathSegments(GetExecutableDir(), name); + faststring data; + RETURN_NOT_OK(ReadFileToString(Env::Default(), path, &data)); + + *lines = strings::Split(data.ToString(), "\n"); + return Status::OK(); +} + +static void EnsureFileParses(const char* file, TwitterEventType expected_type) { + TwitterEventParser p; + TwitterEvent event; + + SCOPED_TRACE(file); + vector jsons; + CHECK_OK(LoadFile(file, &jsons)); + + int line_number = 1; + for (const string& json : jsons) { + if (json.empty()) continue; + SCOPED_TRACE(json); + SCOPED_TRACE(line_number); + ASSERT_OK(p.Parse(json, &event)); + ASSERT_EQ(expected_type, event.type); + line_number++; + } +} + +// example-tweets.txt includes a few hundred tweets collected +// from the sample hose. +TEST(ParserTest, TestParseTweets) { + EnsureFileParses("example-tweets.txt", TWEET); +} + +// example-deletes.txt includes a few hundred deletes collected +// from the sample hose. +TEST(ParserTest, TestParseDeletes) { + EnsureFileParses("example-deletes.txt", DELETE_TWEET); +} + +TEST(ParserTest, TestReformatTime) { + ASSERT_EQ("20130814063107", TwitterEventParser::ReformatTime("Wed Aug 14 06:31:07 +0000 2013")); +} + +} // namespace twitter_demo +} // namespace kudu diff --git a/src/kudu/twitter-demo/parser.cc b/src/kudu/twitter-demo/parser.cc new file mode 100644 index 000000000000..20a82f4fb9d7 --- /dev/null +++ b/src/kudu/twitter-demo/parser.cc @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/twitter-demo/parser.h" + +#include + +#include +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/jsonreader.h" + +namespace kudu { +namespace twitter_demo { + +TwitterEventParser::TwitterEventParser() { +} + +TwitterEventParser::~TwitterEventParser() { +} + +static Status ParseDelete(const JsonReader& r, + const rapidjson::Value* delete_obj, + TwitterEvent* event) { + event->type = DELETE_TWEET; + DeleteTweetEvent* e = &event->delete_event; + + const rapidjson::Value* status_obj; + RETURN_NOT_OK(r.ExtractObject(delete_obj, "status", &status_obj)); + RETURN_NOT_OK(r.ExtractInt64(status_obj, "id", &e->tweet_id)); + RETURN_NOT_OK(r.ExtractInt64(status_obj, "user_id", &e->user_id)); + return Status::OK(); +} + +static Status ParseTweet(const JsonReader& r, + TwitterEvent* event) { + event->type = TWEET; + TweetEvent* e = &event->tweet_event; + + RETURN_NOT_OK(r.ExtractString(r.root(), "created_at", &e->created_at)); + RETURN_NOT_OK(r.ExtractInt64(r.root(), "id", &e->tweet_id)); + RETURN_NOT_OK(r.ExtractString(r.root(), "text", &e->text)); + RETURN_NOT_OK(r.ExtractString(r.root(), "source", &e->source)); + + const rapidjson::Value* user_obj; + RETURN_NOT_OK(r.ExtractObject(r.root(), "user", &user_obj)); + RETURN_NOT_OK(r.ExtractInt64(user_obj, "id", &e->user_id)); + RETURN_NOT_OK(r.ExtractString(user_obj, "name", &e->user_name)); + 
RETURN_NOT_OK(r.ExtractString(user_obj, "location", &e->user_location)); + RETURN_NOT_OK(r.ExtractString(user_obj, "description", &e->user_description)); + RETURN_NOT_OK(r.ExtractInt32(user_obj, "followers_count", &e->user_followers_count)); + RETURN_NOT_OK(r.ExtractInt32(user_obj, "friends_count", &e->user_friends_count)); + RETURN_NOT_OK(r.ExtractString(user_obj, "profile_image_url", &e->user_image_url)); + + return Status::OK(); +} + +Status TwitterEventParser::Parse(const string& json, TwitterEvent* event) { + JsonReader r(json); + RETURN_NOT_OK(r.Init()); + const rapidjson::Value* delete_obj; + Status s = r.ExtractObject(r.root(), "delete", &delete_obj); + if (s.IsNotFound()) { + return ParseTweet(r, event); + } + RETURN_NOT_OK(s); + return ParseDelete(r, delete_obj, event); +} + +string TwitterEventParser::ReformatTime(const string& twitter_time) { + struct tm t; + memset(&t, 0, sizeof(t)); + // Example: Wed Aug 14 06:31:07 +0000 2013 + char* x = strptime(twitter_time.c_str(), "%a %b %d %H:%M:%S +0000 %Y", &t); + if (*x != '\0') { + return StringPrintf("unparseable date, date=%s, leftover=%s", twitter_time.c_str(), x); + } + + char buf[100]; + size_t n = strftime(buf, arraysize(buf), "%Y%m%d%H%M%S", &t); + CHECK_GT(n, 0); + return string(buf); +} + + +} // namespace twitter_demo +} // namespace kudu diff --git a/src/kudu/twitter-demo/parser.h b/src/kudu/twitter-demo/parser.h new file mode 100644 index 000000000000..c10045072b98 --- /dev/null +++ b/src/kudu/twitter-demo/parser.h @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_TWITTER_DEMO_PARSER_H +#define KUDU_TWITTER_DEMO_PARSER_H + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace twitter_demo { + +enum TwitterEventType { + NONE = 0, + TWEET = 1, + DELETE_TWEET = 2 +}; + + +struct TweetEvent { + int64_t tweet_id; + std::string text; + std::string source; + std::string created_at; + // TODO: add geolocation + int64_t user_id; + std::string user_name; + std::string user_description; + std::string user_location; + int32_t user_followers_count; + int32_t user_friends_count; + std::string user_image_url; +}; + +struct DeleteTweetEvent { + int64_t tweet_id; + int64_t user_id; +}; + +struct TwitterEvent { + TwitterEvent() : type(NONE) {} + + // The type of event. Only one of the various events below will + // be valid, depending on this type value. + TwitterEventType type; + + // The different event types. These are separate fields rather than + // a union so that we can reuse string storage when parsing multiple + // events. 
+ + TweetEvent tweet_event; + DeleteTweetEvent delete_event; +}; + +class TwitterEventParser { + public: + TwitterEventParser(); + ~TwitterEventParser(); + + Status Parse(const std::string& json, TwitterEvent* event); + + static std::string ReformatTime(const std::string& time); + + private: + DISALLOW_COPY_AND_ASSIGN(TwitterEventParser); +}; + +} // namespace twitter_demo +} // namespace kudu +#endif diff --git a/src/kudu/twitter-demo/twitter-schema.h b/src/kudu/twitter-demo/twitter-schema.h new file mode 100644 index 000000000000..acfc85123f2a --- /dev/null +++ b/src/kudu/twitter-demo/twitter-schema.h @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Inline functions to create the Twitter schema +#ifndef KUDU_TWITTER_DEMO_TWITTER_SCHEMA_H +#define KUDU_TWITTER_DEMO_TWITTER_SCHEMA_H + +#include "kudu/client/schema.h" + +namespace kudu { +namespace twitter_demo { + +using client::KuduColumnSchema; +using client::KuduSchema; +using client::KuduSchemaBuilder; + +inline KuduSchema CreateTwitterSchema() { + KuduSchema s; + KuduSchemaBuilder b; + b.AddColumn("tweet_id")->Type(KuduColumnSchema::INT64)->NotNull()->PrimaryKey(); + b.AddColumn("text")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("source")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("created_at")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("user_id")->Type(KuduColumnSchema::INT64)->NotNull(); + b.AddColumn("user_name")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("user_description")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("user_location")->Type(KuduColumnSchema::STRING)->NotNull(); + b.AddColumn("user_followers_count")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("user_friends_count")->Type(KuduColumnSchema::INT32)->NotNull(); + b.AddColumn("user_image_url")->Type(KuduColumnSchema::STRING)->NotNull(); + CHECK_OK(b.Build(&s)); + return s; +} + +} // namespace twitter_demo +} // namespace kudu +#endif + +/* + +Schema for Impala: + +CREATE TABLE twitter ( + tweet_id bigint, + text string, + source string, + created_at string, + user_id bigint, + user_name string, + user_description string, + user_location string, + user_followers_count int, + user_friends_count int, + user_image_url string); + + +Schema for MySQL: + +CREATE TABLE twitter ( + tweet_id bigint not null primary key, + tweet_text varchar(1000) not null, + source varchar(1000) not null, + created_at varchar(1000) not null, + user_id bigint not null, + user_name varchar(1000) not null, + user_description varchar(1000) not null, + user_location varchar(1000) not null, + user_followers_count int not null, + 
user_friends_count int not null, + user_image_url varchar(1000) not null); + +*/ diff --git a/src/kudu/twitter-demo/twitter_streamer.cc b/src/kudu/twitter-demo/twitter_streamer.cc new file mode 100644 index 000000000000..3d0c14b595b3 --- /dev/null +++ b/src/kudu/twitter-demo/twitter_streamer.cc @@ -0,0 +1,202 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/twitter-demo/twitter_streamer.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/twitter-demo/oauth.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/once.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +using std::string; + +const char* kTwitterUrl = "https://stream.twitter.com/1.1/statuses/sample.json"; + +// Defaults are for the "kudu-demo" app under the "KuduProject" account. +// See https://dev.twitter.com/apps/4906821/oauth if you have credentials. 
+DEFINE_string(twitter_consumer_key, "lRXXfnhNGhFO1DdAorVJeQ", + "Twitter API consumer key"); +DEFINE_string(twitter_consumer_secret, "5Enn1Uwy3mHdhwSVrJEbd24whGiHsA2YGJ0O28E", + "Twitter API consumer secret"); +DEFINE_string(twitter_token_key, "1653869436-7QncqwFkMaOS6rWNeHpwNQZ8li1CFbJp0QNOEpE", + "Twitter API access token key"); +DEFINE_string(twitter_token_secret, "1t3UPOJc6nkThvBPcCPGAj3gHB3mB97F3zraoRkKMA", + "Twitter API access token secret"); + +namespace kudu { +namespace twitter_demo { + +//////////////////////////////////////////////////////////// +// Curl utilities +//////////////////////////////////////////////////////////// + +static void DoInitCurl() { + CHECK_EQ(0, curl_global_init(CURL_GLOBAL_ALL)); +} + +static void InitCurl() { + static GoogleOnceType once = GOOGLE_ONCE_INIT; + GoogleOnceInit(&once, &DoInitCurl); +} + +// Scope-based deleters for various libcurl types. +template +class CurlDeleter { + public: + explicit CurlDeleter(CurlType* curl) : curl_(curl) {} + ~CurlDeleter() { + if (curl_) DoFree(); + } + private: + // Will be specialized for each type. 
+ void DoFree(); + CurlType* curl_; + + DISALLOW_COPY_AND_ASSIGN(CurlDeleter); +}; + +template<> +void CurlDeleter::DoFree() { + curl_easy_cleanup(curl_); +} + +template<> +void CurlDeleter::DoFree() { + curl_slist_free_all(curl_); +} + +//////////////////////////////////////////////////////////// +// TwitterStreamer implementation +//////////////////////////////////////////////////////////// + +Status TwitterStreamer::Init() { + if (FLAGS_twitter_consumer_key.empty()) { + return Status::InvalidArgument("Missing flag", "--twitter_consumer_key"); + } + if (FLAGS_twitter_consumer_secret.empty()) { + return Status::InvalidArgument("Missing flag", "--twitter_consumer_secret"); + } + if (FLAGS_twitter_token_key.empty()) { + return Status::InvalidArgument("Missing flag", "--twitter_token_key"); + } + if (FLAGS_twitter_token_secret.empty()) { + return Status::InvalidArgument("Missing flag", "--twitter_token_secret"); + } + return Status::OK(); +} + +Status TwitterStreamer::Start() { + CHECK(!thread_.joinable()); + + thread_ = boost::thread(&TwitterStreamer::StreamThread, this); + return Status::OK(); +} + +Status TwitterStreamer::Join() { + thread_.join(); + return stream_status_; +} + +// C-style curl callback for data +size_t DataReceivedCallback(void* buffer, size_t size, size_t nmemb, void* user_ptr) { + TwitterStreamer* streamer = DCHECK_NOTNULL(reinterpret_cast(user_ptr)); + size_t total_size = size * nmemb; + Slice data(reinterpret_cast(buffer), total_size); + return streamer->DataReceived(data); +} + +void TwitterStreamer::StreamThread() { + Status s = DoStreaming(); + if (!s.ok()) { + LOG(ERROR) << "Streaming thread failed: " << s.ToString(); + boost::lock_guard l(lock_); + stream_status_ = s; + } +} + +Status TwitterStreamer::DoStreaming() { + OAuthRequest req("GET", kTwitterUrl); + req.AddStandardOAuthFields(FLAGS_twitter_consumer_key, FLAGS_twitter_token_key); + string auth_header = req.AuthHeader(FLAGS_twitter_consumer_secret, FLAGS_twitter_token_secret); + 
VLOG(1) << auth_header; + + InitCurl(); + CURL* curl = curl_easy_init(); + CurlDeleter delete_curl(curl); + if (!curl) { + return Status::NetworkError("curl_easy_init failed"); + } + CHECK(curl); + + // Disable SSL verification so we don't have to set up a + // trust store database of any kind. + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(curl, CURLOPT_URL, kTwitterUrl); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, DataReceivedCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, this); + + struct curl_slist* headers = NULL; + CurlDeleter delete_headers(headers); + headers = curl_slist_append(headers, auth_header.c_str()); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + return Status::NetworkError("curl_easy_perform failed", curl_easy_strerror(res)); + } + + return Status::OK(); +} + +size_t TwitterStreamer::DataReceived(const Slice& slice) { + recv_buf_.append(slice.data(), slice.size()); + + // Chop the received data into lines.
+ while (true) { + void* newline_ptr = memchr(recv_buf_.data(), '\n', recv_buf_.size()); + if (newline_ptr == NULL) { + // no newlines + break; + } + int newline_idx = reinterpret_cast(newline_ptr) - + reinterpret_cast(recv_buf_.data()); + + Slice line(recv_buf_.data(), newline_idx); + consumer_->ConsumeJSON(line); + + // Copy remaining data back to front of the buffer + int rem_size = recv_buf_.size() - newline_idx - 1; + memmove(recv_buf_.data(), &recv_buf_[newline_idx + 1], rem_size); + // Resize to only have the front + recv_buf_.resize(rem_size); + } + + return slice.size(); +} + +} // namespace twitter_demo +} // namespace kudu diff --git a/src/kudu/twitter-demo/twitter_streamer.h b/src/kudu/twitter-demo/twitter_streamer.h new file mode 100644 index 000000000000..7a1b3b5b4f31 --- /dev/null +++ b/src/kudu/twitter-demo/twitter_streamer.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TWITTER_DEMO_TWITTER_STREAMER_H +#define KUDU_TWITTER_DEMO_TWITTER_STREAMER_H + +#include +#include + +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" +#include "kudu/util/status.h" + +namespace kudu { +namespace twitter_demo { + +class TwitterConsumer { + public: + virtual void ConsumeJSON(const Slice& json) = 0; + virtual ~TwitterConsumer() {} +}; + +class TwitterStreamer { + public: + explicit TwitterStreamer(TwitterConsumer* consumer) + : consumer_(consumer) { + } + + Status Init(); + Status Start(); + Status Join(); + + private: + friend size_t DataReceivedCallback(void* buffer, size_t size, size_t nmemb, void* user_ptr); + void StreamThread(); + Status DoStreaming(); + size_t DataReceived(const Slice& data); + + boost::thread thread_; + boost::mutex lock_; + Status stream_status_; + + faststring recv_buf_; + + TwitterConsumer* consumer_; + + DISALLOW_COPY_AND_ASSIGN(TwitterStreamer); +}; + + +} // namespace twitter_demo +} // namespace kudu +#endif diff --git a/src/kudu/util/CMakeLists.txt b/src/kudu/util/CMakeLists.txt new file mode 100644 index 000000000000..23ac5e5c3ee6 --- /dev/null +++ b/src/kudu/util/CMakeLists.txt @@ -0,0 +1,362 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +####################################### +# histogram_proto +####################################### + +PROTOBUF_GENERATE_CPP( + HISTOGRAM_PROTO_SRCS HISTOGRAM_PROTO_HDRS HISTOGRAM_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES histogram.proto) +ADD_EXPORTABLE_LIBRARY(histogram_proto + SRCS ${HISTOGRAM_PROTO_SRCS} + DEPS protobuf + NONLINK_DEPS ${HISTOGRAM_PROTO_TGTS}) + +####################################### +# pb_util_proto +####################################### + +PROTOBUF_GENERATE_CPP( + PB_UTIL_PROTO_SRCS PB_UTIL_PROTO_HDRS PB_UTIL_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES pb_util.proto) +ADD_EXPORTABLE_LIBRARY(pb_util_proto + SRCS ${PB_UTIL_PROTO_SRCS} + DEPS protobuf + NONLINK_DEPS ${PB_UTIL_PROTO_TGTS}) + +####################################### +# version_info_proto +####################################### + +PROTOBUF_GENERATE_CPP( + VERSION_INFO_PROTO_SRCS VERSION_INFO_PROTO_HDRS VERSION_INFO_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES version_info.proto) +ADD_EXPORTABLE_LIBRARY(version_info_proto + SRCS ${VERSION_INFO_PROTO_SRCS} + DEPS protobuf + NONLINK_DEPS ${VERSION_INFO_PROTO_TGTS}) + +############################################################ +# Version stamp +############################################################ + +# Unlike CMAKE_CURRENT_BINARY_DIR, CMAKE_BINARY_DIR is always the root of +# the build directory. 
+set(VERSION_STAMP_FILE ${CMAKE_BINARY_DIR}/src/kudu/generated/version_defines.h) + +list(APPEND GEN_VERSION_INFO_COMMAND "${BUILD_SUPPORT_DIR}/gen_version_info.py") +list(APPEND GEN_VERSION_INFO_COMMAND "--version=${KUDU_VERSION_NUMBER}") +list(APPEND GEN_VERSION_INFO_COMMAND "--build-type=${CMAKE_BUILD_TYPE}") +if(KUDU_GIT_HASH) + message(STATUS "Provided git hash: ${KUDU_GIT_HASH}") + list(APPEND GEN_VERSION_INFO_COMMAND "--git-hash=${KUDU_GIT_HASH}") +endif() +list(APPEND GEN_VERSION_INFO_COMMAND "${VERSION_STAMP_FILE}") +add_custom_target(gen_version_info + COMMAND ${GEN_VERSION_INFO_COMMAND} + BYPRODUCTS "${VERSION_STAMP_FILE}") + +####################################### +# kudu_util +####################################### + +if (APPLE) + set(SEMAPHORE_CC "semaphore_macosx.cc") +else () + set(SEMAPHORE_CC "semaphore.cc") +endif() + +set(UTIL_SRCS + atomic.cc + bitmap.cc + bloom_filter.cc + bitmap.cc + cache.cc + cache_metrics.cc + coding.cc + condition_variable.cc + crc.cc + debug-util.cc + debug/trace_event_impl.cc + debug/trace_event_impl_constants.cc + debug/trace_event_synthetic_delay.cc + env.cc env_posix.cc env_util.cc + errno.cc + faststring.cc + failure_detector.cc + fault_injection.cc + flags.cc + flag_tags.cc + group_varint.cc + pstack_watcher.cc + hdr_histogram.cc + hexdump.cc + init.cc + jsonreader.cc + jsonwriter.cc + kernel_stack_watchdog.cc + locks.cc + logging.cc + malloc.cc + memcmpable_varint.cc + memory/arena.cc memory/memory.cc + memenv/memenv.cc + mem_tracker.cc + metrics.cc + monotime.cc + mutex.cc + net/dns_resolver.cc + net/net_util.cc + net/sockaddr.cc + net/socket.cc + oid_generator.cc + once.cc + os-util.cc + path_util.cc + pb_util.cc + pb_util-internal.cc + random_util.cc + resettable_heartbeater.cc + rolling_log.cc + rwc_lock.cc + ${SEMAPHORE_CC} + slice.cc + spinlock_profiling.cc + status.cc + status_callback.cc + string_case.cc + striped64.cc + subprocess.cc + sync_point.cc + test_graph.cc + thread.cc + threadlocal.cc + 
threadpool.cc + thread_restrictions.cc + trace.cc + user.cc + url-coding.cc + version_info.cc +) + +if(NOT APPLE) + set(UTIL_SRCS + ${UTIL_SRCS} + nvm_cache.cc) +endif() + +set(UTIL_LIBS + boost_system + boost_thread + crcutil + gflags + glog + gutil + histogram_proto + pb_util_proto + protobuf + version_info_proto + zlib) + +if(NOT APPLE) + set(UTIL_LIBS + ${UTIL_LIBS} + rt + vmem) +endif() + +# We use MallocExtension, but not in the exported version of the library. +set(EXPORTED_UTIL_LIBS ${UTIL_LIBS}) +if(${KUDU_TCMALLOC_AVAILABLE}) + list(APPEND UTIL_LIBS tcmalloc) +endif() + +ADD_EXPORTABLE_LIBRARY(kudu_util + SRCS ${UTIL_SRCS} + DEPS ${UTIL_LIBS} + EXPORTED_DEPS ${EXPORTED_UTIL_LIBS}) + +add_dependencies(kudu_util gen_version_info) + +####################################### +# kudu_test_util +####################################### + +add_library(kudu_test_util + test_util.cc + curl_util.cc) +target_link_libraries(kudu_test_util + ${CURL_LIBRARIES} + gflags + glog + gmock + kudu_util) + +if(NOT APPLE) +target_link_libraries(kudu_test_util + vmem) +endif() + +####################################### +# kudu_test_main +####################################### + +add_library(kudu_test_main + test_main.cc) +target_link_libraries(kudu_test_main + gflags + glog + gmock + kudu_util + kudu_test_util + dl) + +if(NOT APPLE) + target_link_libraries(kudu_test_main + rt) +endif() + +####################################### +# protoc-gen-insertions +####################################### + +add_executable(protoc-gen-insertions protoc-gen-insertions.cc) +target_link_libraries(protoc-gen-insertions gutil protobuf protoc ${KUDU_BASE_LIBS}) + +####################################### +# Unit tests +####################################### + +set(KUDU_TEST_LINK_LIBS kudu_util gutil ${KUDU_MIN_TEST_LIBS}) +ADD_KUDU_TEST(atomic-test) +ADD_KUDU_TEST(bit-util-test) +ADD_KUDU_TEST(bitmap-test) +ADD_KUDU_TEST(blocking_queue-test) +ADD_KUDU_TEST(bloom_filter-test) 
+ADD_KUDU_TEST(cache-test) +ADD_KUDU_TEST(callback_bind-test) +ADD_KUDU_TEST(countdown_latch-test) +ADD_KUDU_TEST(crc-test RUN_SERIAL true) # has a benchmark +ADD_KUDU_TEST(debug-util-test) +ADD_KUDU_TEST(env-test LABELS no_tsan) +ADD_KUDU_TEST(errno-test) +ADD_KUDU_TEST(failure_detector-test) +ADD_KUDU_TEST(flag_tags-test) +ADD_KUDU_TEST(group_varint-test) +ADD_KUDU_TEST(hash_util-test) +ADD_KUDU_TEST(hdr_histogram-test) +ADD_KUDU_TEST(inline_slice-test) +ADD_KUDU_TEST(interval_tree-test) +ADD_KUDU_TEST(jsonreader-test) +ADD_KUDU_TEST(knapsack_solver-test) +ADD_KUDU_TEST(logging-test) +ADD_KUDU_TEST(map-util-test) +ADD_KUDU_TEST(memcmpable_varint-test LABELS no_tsan) +ADD_KUDU_TEST(memenv/memenv-test) +ADD_KUDU_TEST(memory/arena-test) +ADD_KUDU_TEST(mem_tracker-test) +ADD_KUDU_TEST(metrics-test) +ADD_KUDU_TEST(monotime-test) +ADD_KUDU_TEST(mt-hdr_histogram-test RUN_SERIAL true) +ADD_KUDU_TEST(mt-metrics-test RUN_SERIAL true) +ADD_KUDU_TEST(mt-threadlocal-test RUN_SERIAL true) +ADD_KUDU_TEST(net/dns_resolver-test) +ADD_KUDU_TEST(net/net_util-test) +ADD_KUDU_TEST(object_pool-test) +ADD_KUDU_TEST(once-test) +ADD_KUDU_TEST(os-util-test) +ADD_KUDU_TEST(path_util-test) +ADD_KUDU_TEST(pstack_watcher-test) +ADD_KUDU_TEST(random-test) +ADD_KUDU_TEST(random_util-test) +ADD_KUDU_TEST(resettable_heartbeater-test) +ADD_KUDU_TEST(rle-test) +ADD_KUDU_TEST(rolling_log-test) +ADD_KUDU_TEST(rw_semaphore-test) +ADD_KUDU_TEST(rwc_lock-test) +ADD_KUDU_TEST(safe_math-test) +ADD_KUDU_TEST(slice-test) +ADD_KUDU_TEST(spinlock_profiling-test) +ADD_KUDU_TEST(stack_watchdog-test) +ADD_KUDU_TEST(status-test) +ADD_KUDU_TEST(string_case-test) +ADD_KUDU_TEST(striped64-test) +ADD_KUDU_TEST(subprocess-test) +ADD_KUDU_TEST(sync_point-test) +ADD_KUDU_TEST(thread-test) +ADD_KUDU_TEST(threadpool-test) +ADD_KUDU_TEST(trace-test) +ADD_KUDU_TEST(url-coding-test) +ADD_KUDU_TEST(user-test) + +####################################### +# jsonwriter_test_proto +####################################### + 
+PROTOBUF_GENERATE_CPP( + JSONWRITER_TEST_PROTO_SRCS JSONWRITER_TEST_PROTO_HDRS JSONWRITER_TEST_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES jsonwriter_test.proto) +add_library(jsonwriter_test_proto ${JSONWRITER_TEST_PROTO_SRCS} ${JSONWRITER_TEST_PROTO_HDRS}) +target_link_libraries(jsonwriter_test_proto + protobuf) + +####################################### +# jsonwriter-test +####################################### + +ADD_KUDU_TEST(jsonwriter-test) +if(NOT "${NO_TESTS}") + target_link_libraries(jsonwriter-test + jsonwriter_test_proto) +endif() + +####################################### +# proto_container_test_proto +####################################### + +PROTOBUF_GENERATE_CPP( + PROTO_CONTAINER_TEST_PROTO_SRCS PROTO_CONTAINER_TEST_PROTO_HDRS PROTO_CONTAINER_TEST_PROTO_TGTS + SOURCE_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../.. + BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}/../.. + PROTO_FILES + proto_container_test.proto + proto_container_test2.proto + proto_container_test3.proto) +add_library(proto_container_test_proto + ${PROTO_CONTAINER_TEST_PROTO_SRCS} + ${PROTO_CONTAINER_TEST_PROTO_HDRS}) +target_link_libraries(proto_container_test_proto + protobuf) + +####################################### +# pb_util-test +####################################### + +ADD_KUDU_TEST(pb_util-test) +if(NOT "${NO_TESTS}") + target_link_libraries(pb_util-test + proto_container_test_proto) +endif() diff --git a/src/kudu/util/alignment.h b/src/kudu/util/alignment.h new file mode 100644 index 000000000000..f4753731257f --- /dev/null +++ b/src/kudu/util/alignment.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Macros for dealing with memory alignment. +#ifndef KUDU_UTIL_ALIGNMENT_H +#define KUDU_UTIL_ALIGNMENT_H + +// Round down 'x' to the nearest 'align' boundary +#define KUDU_ALIGN_DOWN(x, align) ((x) & (-(align))) + +// Round up 'x' to the nearest 'align' boundary +#define KUDU_ALIGN_UP(x, align) (((x) + ((align) - 1)) & (-(align))) + +#endif diff --git a/src/kudu/util/async_util.h b/src/kudu/util/async_util.h new file mode 100644 index 000000000000..1e2830c69d74 --- /dev/null +++ b/src/kudu/util/async_util.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Utility functions which are handy when doing async/callback-based programming. +#ifndef KUDU_UTIL_ASYNC_UTIL_H +#define KUDU_UTIL_ASYNC_UTIL_H + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/status.h" +#include "kudu/util/status_callback.h" + +namespace kudu { + +// Simple class which can be used to make async methods synchronous. +// For example: +// Synchronizer s; +// SomeAsyncMethod(s.callback()); +// CHECK_OK(s.Wait()); +class Synchronizer { + public: + Synchronizer() + : l(1) { + } + void StatusCB(const Status& status) { + s = status; + l.CountDown(); + } + StatusCallback AsStatusCallback() { + // Synchronizers are often declared on the stack, so it doesn't make + // sense for a callback to take a reference to its synchronizer. + // + // Note: this means the returned callback _must_ go out of scope before + // its synchronizer. + return Bind(&Synchronizer::StatusCB, Unretained(this)); + } + Status Wait() { + l.Wait(); + return s; + } + Status WaitFor(const MonoDelta& delta) { + if (PREDICT_FALSE(!l.WaitFor(delta))) { + return Status::TimedOut("Timed out while waiting for the callback to be called."); + } + return s; + } + void Reset() { + l.Reset(1); + } + private: + DISALLOW_COPY_AND_ASSIGN(Synchronizer); + Status s; + CountDownLatch l; +}; + +} // namespace kudu +#endif /* KUDU_UTIL_ASYNC_UTIL_H */ diff --git a/src/kudu/util/atomic-test.cc b/src/kudu/util/atomic-test.cc new file mode 100644 index 000000000000..ad754f300991 --- /dev/null +++ b/src/kudu/util/atomic-test.cc @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/atomic.h" + +#include +#include +#include + +namespace kudu { + +using std::numeric_limits; +using std::vector; + +// TODO Add some multi-threaded tests; currently AtomicInt is just a +// wrapper around 'atomicops.h', but should the underlying +// implementation change, it would help to have tests that make sure +// invariants are preserved in a multi-threaded environment. + +template +class AtomicIntTest : public ::testing::Test { + public: + + AtomicIntTest() + : max_(numeric_limits::max()), + min_(numeric_limits::min()) { + acquire_release_ = { kMemOrderNoBarrier, kMemOrderAcquire, kMemOrderRelease }; + barrier_ = { kMemOrderNoBarrier, kMemOrderBarrier }; + } + + vector acquire_release_; + vector barrier_; + + T max_; + T min_; +}; + +typedef ::testing::Types IntTypes; +TYPED_TEST_CASE(AtomicIntTest, IntTypes); + +TYPED_TEST(AtomicIntTest, LoadStore) { + for (const MemoryOrder mem_order : this->acquire_release_) { + AtomicInt i(0); + EXPECT_EQ(0, i.Load(mem_order)); + i.Store(42, mem_order); + EXPECT_EQ(42, i.Load(mem_order)); + i.Store(this->min_, mem_order); + EXPECT_EQ(this->min_, i.Load(mem_order)); + i.Store(this->max_, mem_order); + EXPECT_EQ(this->max_, i.Load(mem_order)); + } +} + +TYPED_TEST(AtomicIntTest, SetSwapExchange) { + for (const MemoryOrder mem_order : this->acquire_release_) { + AtomicInt i(0); + EXPECT_TRUE(i.CompareAndSet(0, 5, mem_order)); + EXPECT_EQ(5, i.Load(mem_order)); + EXPECT_FALSE(i.CompareAndSet(0, 10, mem_order)); + + EXPECT_EQ(5, i.CompareAndSwap(5, this->max_, mem_order)); + 
EXPECT_EQ(this->max_, i.CompareAndSwap(42, 42, mem_order)); + EXPECT_EQ(this->max_, i.CompareAndSwap(this->max_, this->min_, mem_order)); + + EXPECT_EQ(this->min_, i.Exchange(this->max_, mem_order)); + EXPECT_EQ(this->max_, i.Load(mem_order)); + } +} + +TYPED_TEST(AtomicIntTest, MinMax) { + for (const MemoryOrder mem_order : this->acquire_release_) { + AtomicInt i(0); + + i.StoreMax(100, mem_order); + EXPECT_EQ(100, i.Load(mem_order)); + i.StoreMin(50, mem_order); + EXPECT_EQ(50, i.Load(mem_order)); + + i.StoreMax(25, mem_order); + EXPECT_EQ(50, i.Load(mem_order)); + i.StoreMin(75, mem_order); + EXPECT_EQ(50, i.Load(mem_order)); + + i.StoreMax(this->max_, mem_order); + EXPECT_EQ(this->max_, i.Load(mem_order)); + i.StoreMin(this->min_, mem_order); + EXPECT_EQ(this->min_, i.Load(mem_order)); + } +} + +TYPED_TEST(AtomicIntTest, Increment) { + for (const MemoryOrder mem_order : this->barrier_) { + AtomicInt i(0); + EXPECT_EQ(1, i.Increment(mem_order)); + EXPECT_EQ(3, i.IncrementBy(2, mem_order)); + EXPECT_EQ(3, i.IncrementBy(0, mem_order)); + } +} + +TEST(Atomic, AtomicBool) { + vector memory_orders = { kMemOrderNoBarrier, kMemOrderRelease, kMemOrderAcquire }; + for (const MemoryOrder mem_order : memory_orders) { + AtomicBool b(false); + EXPECT_FALSE(b.Load(mem_order)); + b.Store(true, mem_order); + EXPECT_TRUE(b.Load(mem_order)); + EXPECT_TRUE(b.CompareAndSet(true, false, mem_order)); + EXPECT_FALSE(b.Load(mem_order)); + EXPECT_FALSE(b.CompareAndSet(true, false, mem_order)); + EXPECT_FALSE(b.CompareAndSwap(false, true, mem_order)); + EXPECT_TRUE(b.Load(mem_order)); + EXPECT_TRUE(b.Exchange(false, mem_order)); + EXPECT_FALSE(b.Load(mem_order)); + } +} + +} // namespace kudu diff --git a/src/kudu/util/atomic.cc b/src/kudu/util/atomic.cc new file mode 100644 index 000000000000..1fdae399d36b --- /dev/null +++ b/src/kudu/util/atomic.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/atomic.h" + +#include + +#include + +namespace kudu { + +template +AtomicInt::AtomicInt(T initial_value) { + Store(initial_value, kMemOrderNoBarrier); +} + +template +void AtomicInt::FatalMemOrderNotSupported(const char* caller, + const char* requested, + const char* supported) { + LOG(FATAL) << caller << " does not support " << requested << ": only " + << supported << " are supported."; +} + +template +class AtomicInt; + +template +class AtomicInt; + +template +class AtomicInt; + +template +class AtomicInt; + +AtomicBool::AtomicBool(bool value) + : underlying_(value) { +} + +} // namespace kudu diff --git a/src/kudu/util/atomic.h b/src/kudu/util/atomic.h new file mode 100644 index 000000000000..106048e461d6 --- /dev/null +++ b/src/kudu/util/atomic.h @@ -0,0 +1,320 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_ATOMIC_H +#define KUDU_UTIL_ATOMIC_H + +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" + +namespace kudu { + +// See top-level comments in kudu/gutil/atomicops.h for further +// explanations of these levels. +enum MemoryOrder { + // Relaxed memory ordering, doesn't use any barriers. + kMemOrderNoBarrier = 0, + + // Ensures that no later memory access by the same thread can be + // reordered ahead of the operation. + kMemOrderAcquire = 1, + + // Ensures that no previous memory access by the same thread can be + // reordered after the operation. + kMemOrderRelease = 2, + + // Ensures that neither previous NOR later memory access by the same + // thread can be reordered after the operation. + kMemOrderBarrier = 3, +}; + +// Atomic integer class inspired by Impala's AtomicInt and +// std::atomic<> in C++11. +// +// NOTE: All of public operations use an implicit memory order of +// kMemOrderNoBarrier unless otherwise specified. +// +// Unlike std::atomic<>, overflowing an unsigned AtomicInt via Increment or +// IncrementBy is undefined behavior (it is also undefined for signed types, +// as always). +// +// See also: kudu/gutil/atomicops.h +template +class AtomicInt { + public: + // Initialize the underlying value to 'initial_value'. The + // initialization performs a Store with 'kMemOrderNoBarrier'. + explicit AtomicInt(T initial_value); + + // Returns the underlying value. + // + // Does not support 'kMemOrderBarrier'. 
+ T Load(MemoryOrder mem_order = kMemOrderNoBarrier) const; + + // Sets the underlying value to 'new_value'. + // + // Does not support 'kMemOrderBarrier'. + void Store(T new_value, MemoryOrder mem_order = kMemOrderNoBarrier); + + // Iff the underlying value is equal to 'expected_val', sets the + // underlying value to 'new_value' and returns true; returns false + // otherwise. + // + // Does not support 'kMemOrderBarrier'. + bool CompareAndSet(T expected_val, T new_value, MemoryOrder mem_order = kMemOrderNoBarrier); + + // Iff the underlying value is equal to 'expected_val', sets the + // underlying value to 'new_value' and returns + // 'expected_val'. Otherwise, returns the current underlying + // value. + // + // Does not support 'kMemOrderBarrier'. + T CompareAndSwap(T expected_val, T new_value, MemoryOrder mem_order = kMemOrderNoBarrier); + + // Sets the underlying value to 'new_value' iff 'new_value' is + // greater than the current underlying value. + // + // Does not support 'kMemOrderBarrier'. + void StoreMax(T new_value, MemoryOrder mem_order = kMemOrderNoBarrier); + + // Sets the underlying value to 'new_value' iff 'new_value' is less + // than the current underlying value. + // + // Does not support 'kMemOrderBarrier'. + void StoreMin(T new_value, MemoryOrder mem_order = kMemOrderNoBarrier); + + // Increments the underlying value by 1 and returns the new + // underlying value. + // + // Does not support 'kMemOrderAcquire' or 'kMemOrderRelease'. + T Increment(MemoryOrder mem_order = kMemOrderNoBarrier); + + // Increments the underlying value by 'delta' and returns the new + // underlying value. + + // Does not support 'kKemOrderAcquire' or 'kMemOrderRelease'. + T IncrementBy(T delta, MemoryOrder mem_order = kMemOrderNoBarrier); + + // Sets the underlying value to 'new_value' and returns the previous + // underlying value. + // + // Does not support 'kMemOrderBarrier'. 
+ T Exchange(T new_value, MemoryOrder mem_order = kMemOrderNoBarrier); + + private: + // If a method 'caller' doesn't support memory order described as + // 'requested', exit by doing perform LOG(FATAL) logging the method + // called, the requested memory order, and the supported memory + // orders. + static void FatalMemOrderNotSupported(const char* caller, + const char* requested = "kMemOrderBarrier", + const char* supported = + "kMemNorderNoBarrier, kMemOrderAcquire, kMemOrderRelease"); + + // The gutil/atomicops.h functions only operate on signed types. + // So, even if the user specializes on an unsigned type, we use a + // signed type internally. + typedef typename boost::make_signed::type SignedT; + SignedT value_; + + DISALLOW_COPY_AND_ASSIGN(AtomicInt); +}; + +// Adapts AtomicInt to handle boolean values. +// +// NOTE: All of public operations use an implicit memory order of +// kMemOrderNoBarrier unless otherwise specified. +// +// See AtomicInt above for documentation on individual methods. 
+class AtomicBool { + public: + explicit AtomicBool(bool value); + + bool Load(MemoryOrder m = kMemOrderNoBarrier) const { + return underlying_.Load(m); + } + void Store(bool n, MemoryOrder m = kMemOrderNoBarrier) { + underlying_.Store(static_cast(n), m); + } + bool CompareAndSet(bool e, bool n, MemoryOrder m = kMemOrderNoBarrier) { + return underlying_.CompareAndSet(static_cast(e), static_cast(n), m); + } + bool CompareAndSwap(bool e, bool n, MemoryOrder m = kMemOrderNoBarrier) { + return underlying_.CompareAndSwap(static_cast(e), static_cast(n), m); + } + bool Exchange(bool n, MemoryOrder m = kMemOrderNoBarrier) { + return underlying_.Exchange(static_cast(n), m); + } + private: + AtomicInt underlying_; + + DISALLOW_COPY_AND_ASSIGN(AtomicBool); +}; + +template +inline T AtomicInt::Load(MemoryOrder mem_order) const { + switch (mem_order) { + case kMemOrderNoBarrier: { + return base::subtle::NoBarrier_Load(&value_); + } + case kMemOrderBarrier: { + FatalMemOrderNotSupported("Load"); + break; + } + case kMemOrderAcquire: { + return base::subtle::Acquire_Load(&value_); + } + case kMemOrderRelease: { + return base::subtle::Release_Load(&value_); + } + } + abort(); // Unnecessary, but avoids gcc complaining. 
+} + +template +inline void AtomicInt::Store(T new_value, MemoryOrder mem_order) { + switch (mem_order) { + case kMemOrderNoBarrier: { + base::subtle::NoBarrier_Store(&value_, new_value); + break; + } + case kMemOrderBarrier: { + FatalMemOrderNotSupported("Store"); + break; + } + case kMemOrderAcquire: { + base::subtle::Acquire_Store(&value_, new_value); + break; + } + case kMemOrderRelease: { + base::subtle::Release_Store(&value_, new_value); + break; + } + } +} + +template +inline bool AtomicInt::CompareAndSet(T expected_val, T new_val, MemoryOrder mem_order) { + return CompareAndSwap(expected_val, new_val, mem_order) == expected_val; +} + +template +inline T AtomicInt::CompareAndSwap(T expected_val, T new_val, MemoryOrder mem_order) { + switch (mem_order) { + case kMemOrderNoBarrier: { + return base::subtle::NoBarrier_CompareAndSwap( + &value_, expected_val, new_val); + } + case kMemOrderBarrier: { + FatalMemOrderNotSupported("CompareAndSwap/CompareAndSet"); + break; + } + case kMemOrderAcquire: { + return base::subtle::Acquire_CompareAndSwap( + &value_, expected_val, new_val); + } + case kMemOrderRelease: { + return base::subtle::Release_CompareAndSwap( + &value_, expected_val, new_val); + } + } + abort(); +} + + +template +inline T AtomicInt::Increment(MemoryOrder mem_order) { + return IncrementBy(1, mem_order); +} + +template +inline T AtomicInt::IncrementBy(T delta, MemoryOrder mem_order) { + switch (mem_order) { + case kMemOrderNoBarrier: { + return base::subtle::NoBarrier_AtomicIncrement(&value_, delta); + } + case kMemOrderBarrier: { + return base::subtle::Barrier_AtomicIncrement(&value_, delta); + } + case kMemOrderAcquire: { + FatalMemOrderNotSupported("Increment/IncrementBy", + "kMemOrderAcquire", + "kMemOrderNoBarrier and kMemOrderBarrier"); + break; + } + case kMemOrderRelease: { + FatalMemOrderNotSupported("Increment/Incrementby", + "kMemOrderAcquire", + "kMemOrderNoBarrier and kMemOrderBarrier"); + break; + } + } + abort(); +} + +template +inline T 
AtomicInt::Exchange(T new_value, MemoryOrder mem_order) { + switch (mem_order) { + case kMemOrderNoBarrier: { + return base::subtle::NoBarrier_AtomicExchange(&value_, new_value); + } + case kMemOrderBarrier: { + FatalMemOrderNotSupported("Exchange"); + break; + } + case kMemOrderAcquire: { + return base::subtle::Acquire_AtomicExchange(&value_, new_value); + } + case kMemOrderRelease: { + return base::subtle::Release_AtomicExchange(&value_, new_value); + } + } + abort(); +} + +template +inline void AtomicInt::StoreMax(T new_value, MemoryOrder mem_order) { + T old_value = Load(mem_order); + while (true) { + T max_value = std::max(old_value, new_value); + T prev_value = CompareAndSwap(old_value, max_value, mem_order); + if (PREDICT_TRUE(old_value == prev_value)) { + break; + } + old_value = prev_value; + } +} + +template +inline void AtomicInt::StoreMin(T new_value, MemoryOrder mem_order) { + T old_value = Load(mem_order); + while (true) { + T min_value = std::min(old_value, new_value); + T prev_value = CompareAndSwap(old_value, min_value, mem_order); + if (PREDICT_TRUE(old_value == prev_value)) { + break; + } + old_value = prev_value; + } +} + +} // namespace kudu +#endif /* KUDU_UTIL_ATOMIC_H */ diff --git a/src/kudu/util/auto_release_pool.h b/src/kudu/util/auto_release_pool.h new file mode 100644 index 000000000000..eaed9c20ff14 --- /dev/null +++ b/src/kudu/util/auto_release_pool.h @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Simple pool of objects that will be deallocated when the pool is +// destroyed + +#ifndef KUDU_UTIL_AUTO_RELEASE_POOL_H +#define KUDU_UTIL_AUTO_RELEASE_POOL_H + +#include + +#include "kudu/gutil/spinlock.h" + +namespace kudu { + +// Thread-safe. +class AutoReleasePool { + public: + AutoReleasePool(): objects_() { } + + ~AutoReleasePool() { + for (auto& object : objects_) { + delete object; + } + } + + template + T *Add(T *t) { + base::SpinLockHolder l(&lock_); + objects_.push_back(new SpecificElement(t)); + return t; + } + + // Add an array-allocated object to the pool. This is identical to + // Add() except that it will be freed with 'delete[]' instead of 'delete'. + template + T* AddArray(T *t) { + base::SpinLockHolder l(&lock_); + objects_.push_back(new SpecificArrayElement(t)); + return t; + } + + // Donate all objects in this pool to another pool. 
+ void DonateAllTo(AutoReleasePool* dst) { + base::SpinLockHolder l(&lock_); + base::SpinLockHolder l_them(&dst->lock_); + + dst->objects_.reserve(dst->objects_.size() + objects_.size()); + dst->objects_.insert(dst->objects_.end(), objects_.begin(), objects_.end()); + objects_.clear(); + } + + private: + struct GenericElement { + virtual ~GenericElement() {} + }; + + template + struct SpecificElement : GenericElement { + explicit SpecificElement(T *t): t(t) {} + ~SpecificElement() { + delete t; + } + + T *t; + }; + + template + struct SpecificArrayElement : GenericElement { + explicit SpecificArrayElement(T *t): t(t) {} + ~SpecificArrayElement() { + delete [] t; + } + + T *t; + }; + + typedef std::vector ElementVector; + ElementVector objects_; + base::SpinLock lock_; +}; + + +} // namespace kudu +#endif diff --git a/src/kudu/util/bit-stream-utils.h b/src/kudu/util/bit-stream-utils.h new file mode 100644 index 000000000000..976bf803d1b9 --- /dev/null +++ b/src/kudu/util/bit-stream-utils.h @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_H +#define IMPALA_UTIL_BIT_STREAM_UTILS_H + +#include "kudu/gutil/port.h" +#include "kudu/util/bit-util.h" +#include "kudu/util/faststring.h" + +namespace kudu { + +// Utility class to write bit/byte streams. This class can write data to either be +// bit packed or byte aligned (and a single stream that has a mix of both). +class BitWriter { + public: + // buffer: buffer to write bits to. + explicit BitWriter(faststring *buffer) + : buffer_(buffer) { + Clear(); + } + + void Clear() { + buffered_values_ = 0; + byte_offset_ = 0; + bit_offset_ = 0; + buffer_->clear(); + } + + // Returns a pointer to the underlying buffer + faststring *buffer() const { return buffer_; } + + // The number of current bytes written, including the current byte (i.e. may include a + // fraction of a byte). Includes buffered values. + int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } + + // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit + // packed. num_bits must be <= 32. + void PutValue(uint64_t v, int num_bits); + + // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the + // extra high-order bits will be ignored. + template + void PutAligned(T v, int num_bits); + + // Write a Vlq encoded int to the buffer. The value is written byte aligned. 
+ // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity + void PutVlqInt(int32_t v); + + // Get the index to the next aligned byte and advance the underlying buffer by num_bytes. + size_t GetByteIndexAndAdvance(int num_bytes) { + uint8_t* ptr = GetNextBytePtr(num_bytes); + return ptr - buffer_->data(); + } + + // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. + uint8_t* GetNextBytePtr(int num_bytes); + + // Flushes all buffered values to the buffer. Call this when done writing to the buffer. + // If 'align' is true, buffered_values_ is reset and any future writes will be written + // to the next byte boundary. + void Flush(bool align = false); + + private: + // Bit-packed values are initially written to this variable before being memcpy'd to + // buffer_. This is faster than writing values byte by byte directly to buffer_. + uint64_t buffered_values_; + + faststring *buffer_; + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +// Utility class to read bit/byte stream. This class can read bits or bytes +// that are either byte aligned or not. It also has utilities to read multiple +// bytes in one read (e.g. encoded int). +class BitReader { + public: + // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. + BitReader(const uint8_t* buffer, int buffer_len); + + BitReader() : buffer_(NULL), max_bytes_(0) {} + + // Gets the next value from the buffer. Returns true if 'v' could be read or false if + // there are not enough bytes left. num_bits must be <= 32. + template + bool GetValue(int num_bits, T* v); + + // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a + // little-endian native type and big enough to store 'num_bytes'. The value is assumed + // to be byte-aligned so the stream will be advanced to the start of the next byte + // before 'v' is read. Returns false if there are not enough bytes left. 
+ template + bool GetAligned(int num_bytes, T* v); + + // Reads a vlq encoded int from the stream. The encoded int must start at the + // beginning of a byte. Return false if there were not enough bytes in the buffer. + bool GetVlqInt(int32_t* v); + + // Returns the number of bytes left in the stream, not including the current byte (i.e., + // there may be an additional fraction of a byte). + int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } + + // Current position in the stream, by bit. + int position() const { return byte_offset_ * 8 + bit_offset_; } + + // Rewind the stream by 'num_bits' bits + void Rewind(int num_bits); + + // Seek to a specific bit in the buffer + void SeekToBit(uint stream_position); + + // Maximum byte length of a vlq encoded int + static const int MAX_VLQ_BYTE_LEN = 5; + + bool is_initialized() const { return buffer_ != NULL; } + + private: + // Used by SeekToBit() and GetValue() to fetch the + // the next word into buffer_. + void BufferValues(); + + const uint8_t* buffer_; + int max_bytes_; + + // Bytes are memcpy'd from buffer_ and values are read from this variable. This is + // faster than reading values byte by byte directly from buffer_. + uint64_t buffered_values_; + + int byte_offset_; // Offset in buffer_ + int bit_offset_; // Offset in buffered_values_ +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/bit-stream-utils.inline.h b/src/kudu/util/bit-stream-utils.inline.h new file mode 100644 index 000000000000..bdccdb8e2c34 --- /dev/null +++ b/src/kudu/util/bit-stream-utils.inline.h @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H +#define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H + +#include + +#include "kudu/util/bit-stream-utils.h" +#include "kudu/util/alignment.h" + +namespace kudu { + +inline void BitWriter::PutValue(uint64_t v, int num_bits) { + // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases) + DCHECK_LE(num_bits, 32); + DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits; + + buffered_values_ |= v << bit_offset_; + bit_offset_ += num_bits; + + if (PREDICT_FALSE(bit_offset_ >= 64)) { + // Flush buffered_values_ and write out bits of v that did not fit + buffer_->reserve(KUDU_ALIGN_UP(byte_offset_ + 8, 8)); + buffer_->resize(byte_offset_ + 8); + DCHECK_LE(byte_offset_ + 8, buffer_->capacity()); + memcpy(buffer_->data() + byte_offset_, &buffered_values_, 8); + buffered_values_ = 0; + byte_offset_ += 8; + bit_offset_ -= 64; + buffered_values_ = v >> (num_bits - bit_offset_); + } + DCHECK_LT(bit_offset_, 64); 
+} + +inline void BitWriter::Flush(bool align) { + int num_bytes = BitUtil::Ceil(bit_offset_, 8); + buffer_->reserve(KUDU_ALIGN_UP(byte_offset_ + num_bytes, 8)); + buffer_->resize(byte_offset_ + num_bytes); + DCHECK_LE(byte_offset_ + num_bytes, buffer_->capacity()); + memcpy(buffer_->data() + byte_offset_, &buffered_values_, num_bytes); + + if (align) { + buffered_values_ = 0; + byte_offset_ += num_bytes; + bit_offset_ = 0; + } +} + +inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) { + Flush(/* align */ true); + buffer_->reserve(KUDU_ALIGN_UP(byte_offset_ + num_bytes, 8)); + buffer_->resize(byte_offset_ + num_bytes); + uint8_t* ptr = buffer_->data() + byte_offset_; + byte_offset_ += num_bytes; + DCHECK_LE(byte_offset_, buffer_->capacity()); + return ptr; +} + +template +inline void BitWriter::PutAligned(T val, int num_bytes) { + DCHECK_LE(num_bytes, sizeof(T)); + uint8_t* ptr = GetNextBytePtr(num_bytes); + memcpy(ptr, &val, num_bytes); +} + +inline void BitWriter::PutVlqInt(int32_t v) { + while ((v & 0xFFFFFF80) != 0L) { + PutAligned((v & 0x7F) | 0x80, 1); + v >>= 7; + } + PutAligned(v & 0x7F, 1); +} + + +inline BitReader::BitReader(const uint8_t* buffer, int buffer_len) + : buffer_(buffer), + max_bytes_(buffer_len), + buffered_values_(0), + byte_offset_(0), + bit_offset_(0) { + int num_bytes = std::min(8, max_bytes_); + memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes); +} + +inline void BitReader::BufferValues() { + int bytes_remaining = max_bytes_ - byte_offset_; + if (PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } +} + +template +inline bool BitReader::GetValue(int num_bits, T* v) { + // TODO: revisit this limit if necessary + DCHECK_LE(num_bits, 32); + DCHECK_LE(num_bits, sizeof(T) * 8); + + if (PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false; + + *v = 
BitUtil::TrailingBits(buffered_values_, bit_offset_ + num_bits) >> bit_offset_; + + bit_offset_ += num_bits; + if (bit_offset_ >= 64) { + byte_offset_ += 8; + bit_offset_ -= 64; + BufferValues(); + // Read bits of v that crossed into new buffered_values_ + *v |= BitUtil::TrailingBits(buffered_values_, bit_offset_) + << (num_bits - bit_offset_); + } + DCHECK_LE(bit_offset_, 64); + return true; +} + +inline void BitReader::Rewind(int num_bits) { + bit_offset_ -= num_bits; + if (bit_offset_ >= 0) { + return; + } + while (bit_offset_ < 0) { + int seek_back = std::min(byte_offset_, 8); + byte_offset_ -= seek_back; + bit_offset_ += seek_back * 8; + } + // This should only be executed *if* rewinding by 'num_bits' + // make the existing buffered_values_ invalid + DCHECK_GE(byte_offset_, 0); // Check for underflow + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); +} + +inline void BitReader::SeekToBit(uint stream_position) { + DCHECK_LE(stream_position, max_bytes_ * 8); + + int delta = stream_position - position(); + if (delta == 0) { + return; + } else if (delta < 0) { + Rewind(position() - stream_position); + } else { + bit_offset_ += delta; + while (bit_offset_ >= 64) { + byte_offset_ +=8; + bit_offset_ -= 64; + if (bit_offset_ < 64) { + // This should only be executed if seeking to + // 'stream_position' makes the existing buffered_values_ + // invalid. 
+ BufferValues(); + } + } + } +} + +template +inline bool BitReader::GetAligned(int num_bytes, T* v) { + DCHECK_LE(num_bytes, sizeof(T)); + int bytes_read = BitUtil::Ceil(bit_offset_, 8); + if (PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false; + + // Advance byte_offset to next unread byte and read num_bytes + byte_offset_ += bytes_read; + memcpy(v, buffer_ + byte_offset_, num_bytes); + byte_offset_ += num_bytes; + + // Reset buffered_values_ + bit_offset_ = 0; + int bytes_remaining = max_bytes_ - byte_offset_; + if (PREDICT_TRUE(bytes_remaining >= 8)) { + memcpy(&buffered_values_, buffer_ + byte_offset_, 8); + } else { + memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining); + } + return true; +} + +inline bool BitReader::GetVlqInt(int32_t* v) { + *v = 0; + int shift = 0; + int num_bytes = 0; + uint8_t byte = 0; + do { + if (!GetAligned(1, &byte)) return false; + *v |= (byte & 0x7F) << shift; + shift += 7; + DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN); + } while ((byte & 0x80) != 0); + return true; +} + +} // namespace kudu + +#endif diff --git a/src/kudu/util/bit-util-test.cc b/src/kudu/util/bit-util-test.cc new file mode 100644 index 000000000000..7b3e96dc71db --- /dev/null +++ b/src/kudu/util/bit-util-test.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "kudu/util/bit-util.h" + +namespace kudu { + +TEST(BitUtil, TrailingBits) { + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64), + BOOST_BINARY(1 1 1 1 1 1 1 1)); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100), + BOOST_BINARY(1 1 1 1 1 1 1 1)); + EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0); + EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63); +} + +} // namespace kudu diff --git a/src/kudu/util/bit-util.h b/src/kudu/util/bit-util.h new file mode 100644 index 000000000000..da42f5417d23 --- /dev/null +++ b/src/kudu/util/bit-util.h @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#ifndef IMPALA_BIT_UTIL_H +#define IMPALA_BIT_UTIL_H + +#include +#include "kudu/gutil/port.h" + +namespace kudu { + +// Utility class to do standard bit tricks +// TODO: is this in boost or something else like that? +class BitUtil { + public: + // Returns the ceil of value/divisor + static inline int Ceil(int value, int divisor) { + return value / divisor + (value % divisor != 0); + } + + // Returns the 'num_bits' least-significant bits of 'v'. + static inline uint64_t TrailingBits(uint64_t v, int num_bits) { + if (PREDICT_FALSE(num_bits == 0)) return 0; + if (PREDICT_FALSE(num_bits >= 64)) return v; + int n = 64 - num_bits; + return (v << n) >> n; + } +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/bitmap-test.cc b/src/kudu/util/bitmap-test.cc new file mode 100644 index 000000000000..a91e41460b42 --- /dev/null +++ b/src/kudu/util/bitmap-test.cc @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +static int ReadBackBitmap(uint8_t *bm, size_t bits, + std::vector *result) { + int iters = 0; + for (TrueBitIterator iter(bm, bits); + !iter.done(); + ++iter) { + size_t val = *iter; + result->push_back(val); + + iters++; + } + return iters; +} + +TEST(TestBitMap, TestIteration) { + uint8_t bm[8]; + memset(bm, 0, sizeof(bm)); + BitmapSet(bm, 0); + BitmapSet(bm, 8); + BitmapSet(bm, 31); + BitmapSet(bm, 32); + BitmapSet(bm, 33); + BitmapSet(bm, 63); + + EXPECT_EQ(" 0: 10000000 10000000 00000000 00000001 11000000 00000000 00000000 00000001 \n", + BitmapToString(bm, sizeof(bm) * 8)); + + std::vector read_back; + + int iters = ReadBackBitmap(bm, sizeof(bm)*8, &read_back); + ASSERT_EQ(6, iters); + ASSERT_EQ("0,8,31,32,33,63", JoinElements(read_back, ",")); +} + + +TEST(TestBitMap, TestIteration2) { + uint8_t bm[1]; + memset(bm, 0, sizeof(bm)); + BitmapSet(bm, 1); + + std::vector read_back; + + int iters = ReadBackBitmap(bm, 3, &read_back); + ASSERT_EQ(1, iters); + ASSERT_EQ("1", JoinElements(read_back, ",")); +} + +TEST(TestBitmap, TestSetAndTestBits) { + uint8_t bm[1]; + memset(bm, 0, sizeof(bm)); + + size_t num_bits = sizeof(bm) * 8; + for (size_t i = 0; i < 
num_bits; i++) { + ASSERT_FALSE(BitmapTest(bm, i)); + + BitmapSet(bm, i); + ASSERT_TRUE(BitmapTest(bm, i)); + + BitmapClear(bm, i); + ASSERT_FALSE(BitmapTest(bm, i)); + + BitmapChange(bm, i, true); + ASSERT_TRUE(BitmapTest(bm, i)); + + BitmapChange(bm, i, false); + ASSERT_FALSE(BitmapTest(bm, i)); + } + + // Set the other bit: 01010101 + for (size_t i = 0; i < num_bits; ++i) { + ASSERT_FALSE(BitmapTest(bm, i)); + if (i & 1) BitmapSet(bm, i); + } + + // Check and Clear the other bit: 0000000 + for (size_t i = 0; i < num_bits; ++i) { + ASSERT_EQ(!!(i & 1), BitmapTest(bm, i)); + if (i & 1) BitmapClear(bm, i); + } + + // Check if bits are zero and change the other to one + for (size_t i = 0; i < num_bits; ++i) { + ASSERT_FALSE(BitmapTest(bm, i)); + BitmapChange(bm, i, i & 1); + } + + // Check the bits change them again + for (size_t i = 0; i < num_bits; ++i) { + ASSERT_EQ(!!(i & 1), BitmapTest(bm, i)); + BitmapChange(bm, i, !(i & 1)); + } + + // Check the last setup + for (size_t i = 0; i < num_bits; ++i) { + ASSERT_EQ(!(i & 1), BitmapTest(bm, i)); + } +} + +TEST(TestBitMap, TestBulkSetAndTestBits) { + uint8_t bm[16]; + size_t total_size = sizeof(bm) * 8; + + // Test Bulk change bits and test bits + for (int i = 0; i < 4; ++i) { + bool value = i & 1; + size_t num_bits = total_size; + while (num_bits > 0) { + for (size_t offset = 0; offset < num_bits; ++offset) { + BitmapChangeBits(bm, 0, total_size, !value); + BitmapChangeBits(bm, offset, num_bits - offset, value); + + ASSERT_EQ(value, BitMapIsAllSet(bm, offset, num_bits)); + ASSERT_EQ(!value, BitmapIsAllZero(bm, offset, num_bits)); + + if (offset > 1) { + ASSERT_EQ(value, BitmapIsAllZero(bm, 0, offset - 1)); + ASSERT_EQ(!value, BitMapIsAllSet(bm, 0, offset - 1)); + } + + if ((offset + num_bits) < total_size) { + ASSERT_EQ(value, BitmapIsAllZero(bm, num_bits, total_size)); + ASSERT_EQ(!value, BitMapIsAllSet(bm, num_bits, total_size)); + } + } + num_bits--; + } + } +} + +TEST(TestBitMap, TestFindBit) { + uint8_t bm[16]; 
+ + size_t num_bits = sizeof(bm) * 8; + BitmapChangeBits(bm, 0, num_bits, false); + while (num_bits > 0) { + for (size_t offset = 0; offset < num_bits; ++offset) { + size_t idx; + ASSERT_FALSE(BitmapFindFirstSet(bm, offset, num_bits, &idx)); + ASSERT_TRUE(BitmapFindFirstZero(bm, offset, num_bits, &idx)); + ASSERT_EQ(idx, offset); + } + num_bits--; + } + + num_bits = sizeof(bm) * 8; + for (int i = 0; i < num_bits; ++i) { + BitmapChange(bm, i, i & 3); + } + + while (num_bits--) { + for (size_t offset = 0; offset < num_bits; ++offset) { + size_t idx; + + // Find a set bit + bool res = BitmapFindFirstSet(bm, offset, num_bits, &idx); + size_t expected_set_idx = (offset + !(offset & 3)); + bool expect_set_found = (expected_set_idx < num_bits); + ASSERT_EQ(expect_set_found, res); + if (expect_set_found) ASSERT_EQ(expected_set_idx, idx); + + // Find a zero bit + res = BitmapFindFirstZero(bm, offset, num_bits, &idx); + size_t expected_zero_idx = offset + ((offset & 3) ? (4 - (offset & 3)) : 0); + bool expect_zero_found = (expected_zero_idx < num_bits); + ASSERT_EQ(expect_zero_found, res); + if (expect_zero_found) ASSERT_EQ(expected_zero_idx, idx); + } + } +} + +TEST(TestBitMap, TestBitmapIteration) { + uint8_t bm[8]; + memset(bm, 0, sizeof(bm)); + BitmapSet(bm, 0); + BitmapSet(bm, 8); + BitmapSet(bm, 31); + BitmapSet(bm, 32); + BitmapSet(bm, 33); + BitmapSet(bm, 63); + + BitmapIterator biter(bm, sizeof(bm) * 8); + + size_t i = 0; + size_t size; + bool value = false; + bool expected_value = true; + size_t expected_sizes[] = {1, 7, 1, 22, 3, 29, 1, 0}; + while ((size = biter.Next(&value)) > 0) { + ASSERT_LT(i, 8); + ASSERT_EQ(expected_value, value); + ASSERT_EQ(expected_sizes[i], size); + expected_value = !expected_value; + i++; + } + ASSERT_EQ(expected_sizes[i], size); +} + +} // namespace kudu diff --git a/src/kudu/util/bitmap.cc b/src/kudu/util/bitmap.cc new file mode 100644 index 000000000000..e38f2e26ed4c --- /dev/null +++ b/src/kudu/util/bitmap.cc @@ -0,0 +1,132 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +void BitmapChangeBits(uint8_t *bitmap, size_t offset, size_t num_bits, bool value) { + DCHECK_GT(num_bits, 0); + + size_t start_byte = (offset >> 3); + size_t end_byte = (offset + num_bits - 1) >> 3; + int single_byte = (start_byte == end_byte); + + // Change the last bits of the first byte + size_t left = offset & 0x7; + size_t right = (single_byte) ? (left + num_bits) : 8; + uint8_t mask = ((0xff << left) & (0xff >> (8 - right))); + if (value) { + bitmap[start_byte++] |= mask; + } else { + bitmap[start_byte++] &= ~mask; + } + + // Nothing left... 
I'm done + if (single_byte) { + return; + } + + // change the middle bits + if (end_byte > start_byte) { + const uint8_t pattern8[2] = { 0x00, 0xff }; + memset(bitmap + start_byte, pattern8[value], end_byte - start_byte); + } + + // change the first bits of the last byte + right = offset + num_bits - (end_byte << 3); + mask = (0xff >> (8 - right)); + if (value) { + bitmap[end_byte] |= mask; + } else { + bitmap[end_byte] &= ~mask; + } +} + +bool BitmapFindFirst(const uint8_t *bitmap, size_t offset, size_t bitmap_size, + bool value, size_t *idx) { + const uint64_t pattern64[2] = { 0xffffffffffffffff, 0x0000000000000000 }; + const uint8_t pattern8[2] = { 0xff, 0x00 }; + size_t bit; + + DCHECK_LE(offset, bitmap_size); + + // Jump to the byte at specified offset + const uint8_t *p = bitmap + (offset >> 3); + size_t num_bits = bitmap_size - offset; + + // Find a 'value' bit at the end of the first byte + if ((bit = offset & 0x7)) { + for (; bit < 8 && num_bits > 0; ++bit) { + if (BitmapTest(p, bit) == value) { + *idx = ((p - bitmap) << 3) + bit; + return true; + } + + num_bits--; + } + + p++; + } + + // check 64bit at the time for a 'value' bit + const uint64_t *u64 = (const uint64_t *)p; + while (num_bits >= 64 && *u64 == pattern64[value]) { + num_bits -= 64; + u64++; + } + + // check 8bit at the time for a 'value' bit + p = (const uint8_t *)u64; + while (num_bits >= 8 && *p == pattern8[value]) { + num_bits -= 8; + p++; + } + + // Find a 'value' bit at the beginning of the last byte + for (bit = 0; num_bits > 0; ++bit) { + if (BitmapTest(p, bit) == value) { + *idx = ((p - bitmap) << 3) + bit; + return true; + } + num_bits--; + } + + return false; +} + +std::string BitmapToString(const uint8_t *bitmap, size_t num_bits) { + std::string s; + size_t index = 0; + while (index < num_bits) { + StringAppendF(&s, "%4zu: ", index); + for (int i = 0; i < 8 && index < num_bits; ++i) { + for (int j = 0; j < 8 && index < num_bits; ++j) { + StringAppendF(&s, "%d", BitmapTest(bitmap, 
index)); + index++; + } + StringAppendF(&s, " "); + } + StringAppendF(&s, "\n"); + } + return s; +} + +} // namespace kudu diff --git a/src/kudu/util/bitmap.h b/src/kudu/util/bitmap.h new file mode 100644 index 000000000000..1689b50d509a --- /dev/null +++ b/src/kudu/util/bitmap.h @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Utility functions for dealing with a byte array as if it were a bitmap. +#ifndef KUDU_UTIL_BITMAP_H +#define KUDU_UTIL_BITMAP_H + +#include +#include "kudu/gutil/bits.h" + +namespace kudu { + +// Return the number of bytes necessary to store the given number of bits. +inline size_t BitmapSize(size_t num_bits) { + return (num_bits + 7) / 8; +} + +// Set the given bit. +inline void BitmapSet(uint8_t *bitmap, size_t idx) { + bitmap[idx >> 3] |= 1 << (idx & 7); +} + +// Switch the given bit to the specified value. +inline void BitmapChange(uint8_t *bitmap, size_t idx, bool value) { + bitmap[idx >> 3] = (bitmap[idx >> 3] & ~(1 << (idx & 7))) | ((!!value) << (idx & 7)); +} + +// Clear the given bit. +inline void BitmapClear(uint8_t *bitmap, size_t idx) { + bitmap[idx >> 3] &= ~(1 << (idx & 7)); +} + +// Test/get the given bit. 
+inline bool BitmapTest(const uint8_t *bitmap, size_t idx) { + return bitmap[idx >> 3] & (1 << (idx & 7)); +} + +// Merge the two bitmaps using bitwise or. Both bitmaps should have at least +// n_bits valid bits. +inline void BitmapMergeOr(uint8_t *dst, const uint8_t *src, size_t n_bits) { + size_t n_bytes = BitmapSize(n_bits); + for (size_t i = 0; i < n_bytes; i++) { + *dst++ |= *src++; + } +} + +// Set bits from offset to (offset + num_bits) to the specified value +void BitmapChangeBits(uint8_t *bitmap, size_t offset, size_t num_bits, bool value); + +// Find the first bit of the specified value, starting from the specified offset. +bool BitmapFindFirst(const uint8_t *bitmap, size_t offset, size_t bitmap_size, + bool value, size_t *idx); + +// Find the first set bit in the bitmap, at the specified offset. +inline bool BitmapFindFirstSet(const uint8_t *bitmap, size_t offset, + size_t bitmap_size, size_t *idx) { + return BitmapFindFirst(bitmap, offset, bitmap_size, true, idx); +} + +// Find the first zero bit in the bitmap, at the specified offset. +inline bool BitmapFindFirstZero(const uint8_t *bitmap, size_t offset, + size_t bitmap_size, size_t *idx) { + return BitmapFindFirst(bitmap, offset, bitmap_size, false, idx); +} + +// Returns true if the bitmap contains only ones. +inline bool BitMapIsAllSet(const uint8_t *bitmap, size_t offset, size_t bitmap_size) { + DCHECK_LT(offset, bitmap_size); + size_t idx; + return !BitmapFindFirstZero(bitmap, offset, bitmap_size, &idx); +} + +// Returns true if the bitmap contains only zeros. +inline bool BitmapIsAllZero(const uint8_t *bitmap, size_t offset, size_t bitmap_size) { + DCHECK_LT(offset, bitmap_size); + size_t idx; + return !BitmapFindFirstSet(bitmap, offset, bitmap_size, &idx); +} + +std::string BitmapToString(const uint8_t *bitmap, size_t num_bits); + +// Iterator which yields ranges of set and unset bits. 
+// Example usage: +// bool value; +// size_t size; +// BitmapIterator iter(bitmap, n_bits); +// while ((size = iter.Next(&value))) { +// printf("bitmap block len=%lu value=%d\n", size, value); +// } +class BitmapIterator { + public: + BitmapIterator(const uint8_t *map, size_t num_bits) + : offset_(0), num_bits_(num_bits), map_(map) + {} + + bool done() const { + return (num_bits_ - offset_) == 0; + } + + void SeekTo(size_t bit) { + DCHECK_LE(bit, num_bits_); + offset_ = bit; + } + + size_t Next(bool *value) { + size_t len = num_bits_ - offset_; + if (PREDICT_FALSE(len == 0)) + return(0); + + *value = BitmapTest(map_, offset_); + + size_t index; + if (BitmapFindFirst(map_, offset_, num_bits_, !(*value), &index)) { + len = index - offset_; + } else { + index = num_bits_; + } + + offset_ = index; + return len; + } + + private: + size_t offset_; + size_t num_bits_; + const uint8_t *map_; +}; + +// Iterator which yields the set bits in a bitmap. +// Example usage: +// for (TrueBitIterator iter(bitmap, n_bits); +// !iter.done(); +// ++iter) { +// int next_onebit_position = *iter; +// } +class TrueBitIterator { + public: + TrueBitIterator(const uint8_t *bitmap, size_t n_bits) + : bitmap_(bitmap), + cur_byte_(0), + cur_byte_idx_(0), + n_bits_(n_bits), + n_bytes_(BitmapSize(n_bits_)), + bit_idx_(0) { + if (n_bits_ == 0) { + cur_byte_idx_ = 1; // sets done + } else { + cur_byte_ = bitmap[0]; + AdvanceToNextOneBit(); + } + } + + TrueBitIterator &operator ++() { + DCHECK(!done()); + DCHECK(cur_byte_ & 1); + cur_byte_ &= (~1); + AdvanceToNextOneBit(); + return *this; + } + + bool done() const { + return cur_byte_idx_ >= n_bytes_; + } + + size_t operator *() const { + DCHECK(!done()); + return bit_idx_; + } + + private: + void AdvanceToNextOneBit() { + while (cur_byte_ == 0) { + cur_byte_idx_++; + if (cur_byte_idx_ >= n_bytes_) return; + cur_byte_ = bitmap_[cur_byte_idx_]; + bit_idx_ = cur_byte_idx_ * 8; + } + DVLOG(2) << "Found next nonzero byte at " << cur_byte_idx_ + << " 
val=" << cur_byte_; + + DCHECK_NE(cur_byte_, 0); + int set_bit = Bits::FindLSBSetNonZero(cur_byte_); + bit_idx_ += set_bit; + cur_byte_ >>= set_bit; + } + + const uint8_t *bitmap_; + uint8_t cur_byte_; + uint8_t cur_byte_idx_; + + const size_t n_bits_; + const size_t n_bytes_; + size_t bit_idx_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/blocking_queue-test.cc b/src/kudu/util/blocking_queue-test.cc new file mode 100644 index 000000000000..876b451de5d0 --- /dev/null +++ b/src/kudu/util/blocking_queue-test.cc @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include "kudu/util/countdown_latch.h" +#include "kudu/util/blocking_queue.h" + +using std::shared_ptr; +using std::string; +using std::vector; + +namespace kudu { + +BlockingQueue test1_queue(5); + +void InsertSomeThings(void) { + ASSERT_EQ(test1_queue.Put(1), QUEUE_SUCCESS); + ASSERT_EQ(test1_queue.Put(2), QUEUE_SUCCESS); + ASSERT_EQ(test1_queue.Put(3), QUEUE_SUCCESS); +} + +TEST(BlockingQueueTest, Test1) { + boost::thread inserter_thread(InsertSomeThings); + int32_t i; + ASSERT_TRUE(test1_queue.BlockingGet(&i)); + ASSERT_EQ(1, i); + ASSERT_TRUE(test1_queue.BlockingGet(&i)); + ASSERT_EQ(2, i); + ASSERT_TRUE(test1_queue.BlockingGet(&i)); + ASSERT_EQ(3, i); +} + +TEST(BlockingQueueTest, TestBlockingDrainTo) { + BlockingQueue test_queue(3); + ASSERT_EQ(test_queue.Put(1), QUEUE_SUCCESS); + ASSERT_EQ(test_queue.Put(2), QUEUE_SUCCESS); + ASSERT_EQ(test_queue.Put(3), QUEUE_SUCCESS); + vector out; + ASSERT_TRUE(test_queue.BlockingDrainTo(&out)); + ASSERT_EQ(1, out[0]); + ASSERT_EQ(2, out[1]); + ASSERT_EQ(3, out[2]); +} + +TEST(BlockingQueueTest, TestTooManyInsertions) { + BlockingQueue test_queue(2); + ASSERT_EQ(test_queue.Put(123), QUEUE_SUCCESS); + ASSERT_EQ(test_queue.Put(123), QUEUE_SUCCESS); + ASSERT_EQ(test_queue.Put(123), QUEUE_FULL); +} + +namespace { + +struct LengthLogicalSize { + static size_t logical_size(const string& s) { + return s.length(); + } +}; + +} // anonymous namespace + +TEST(BlockingQueueTest, TestLogicalSize) { + BlockingQueue test_queue(4); + ASSERT_EQ(test_queue.Put("a"), QUEUE_SUCCESS); + ASSERT_EQ(test_queue.Put("bcd"), QUEUE_SUCCESS); + ASSERT_EQ(test_queue.Put("e"), QUEUE_FULL); +} + +TEST(BlockingQueueTest, TestNonPointerParamsMayBeNonEmptyOnDestruct) { + BlockingQueue test_queue(1); + ASSERT_EQ(test_queue.Put(123), QUEUE_SUCCESS); + // No DCHECK failure on destruct. 
+} + +#ifndef NDEBUG +TEST(BlockingQueueDeathTest, TestPointerParamsMustBeEmptyOnDestruct) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + ASSERT_DEATH({ + BlockingQueue test_queue(1); + int32_t element = 123; + ASSERT_EQ(test_queue.Put(&element), QUEUE_SUCCESS); + // Debug assertion triggered on queue destruction since type is a pointer. + }, + "BlockingQueue holds bare pointers"); +} +#endif // NDEBUG + +TEST(BlockingQueueTest, TestGetFromShutdownQueue) { + BlockingQueue test_queue(2); + ASSERT_EQ(test_queue.Put(123), QUEUE_SUCCESS); + test_queue.Shutdown(); + ASSERT_EQ(test_queue.Put(456), QUEUE_SHUTDOWN); + int64_t i; + ASSERT_TRUE(test_queue.BlockingGet(&i)); + ASSERT_EQ(123, i); + ASSERT_FALSE(test_queue.BlockingGet(&i)); +} + +TEST(BlockingQueueTest, TestGscopedPtrMethods) { + BlockingQueue test_queue(2); + gscoped_ptr input_int(new int(123)); + ASSERT_EQ(test_queue.Put(&input_int), QUEUE_SUCCESS); + gscoped_ptr output_int; + ASSERT_TRUE(test_queue.BlockingGet(&output_int)); + ASSERT_EQ(123, *output_int.get()); + test_queue.Shutdown(); +} + +class MultiThreadTest { + public: + typedef vector > thread_vec_t; + + MultiThreadTest() + : puts_(4), + blocking_puts_(4), + nthreads_(5), + queue_(nthreads_ * puts_), + num_inserters_(nthreads_), + sync_latch_(nthreads_) { + } + + void InserterThread(int arg) { + for (int i = 0; i < puts_; i++) { + ASSERT_EQ(queue_.Put(arg), QUEUE_SUCCESS); + } + sync_latch_.CountDown(); + sync_latch_.Wait(); + for (int i = 0; i < blocking_puts_; i++) { + ASSERT_TRUE(queue_.BlockingPut(arg)); + } + MutexLock guard(lock_); + if (--num_inserters_ == 0) { + queue_.Shutdown(); + } + } + + void RemoverThread() { + for (int i = 0; i < puts_ + blocking_puts_; i++) { + int32_t arg = 0; + bool got = queue_.BlockingGet(&arg); + if (!got) { + arg = -1; + } + MutexLock guard(lock_); + gotten_[arg] = gotten_[arg] + 1; + } + } + + void Run() { + for (int i = 0; i < nthreads_; i++) { + threads_.push_back(shared_ptr( + new 
boost::thread(boost::bind( + &MultiThreadTest::InserterThread, this, i)))); + threads_.push_back(shared_ptr( + new boost::thread(boost::bind( + &MultiThreadTest::RemoverThread, this)))); + } + // We add an extra thread to ensure that there aren't enough elements in + // the queue to go around. This way, we test removal after Shutdown. + threads_.push_back(shared_ptr( + new boost::thread(boost::bind( + &MultiThreadTest::RemoverThread, this)))); + for (const auto& thread : threads_) { + thread->join(); + } + // Let's check to make sure we got what we should have. + MutexLock guard(lock_); + for (int i = 0; i < nthreads_; i++) { + ASSERT_EQ(puts_ + blocking_puts_, gotten_[i]); + } + // And there were nthreads_ * (puts_ + blocking_puts_) + // elements removed, but only nthreads_ * puts_ + + // blocking_puts_ elements added. So some removers hit the + // shutdown case. + ASSERT_EQ(puts_ + blocking_puts_, gotten_[-1]); + } + + int puts_; + int blocking_puts_; + int nthreads_; + BlockingQueue queue_; + Mutex lock_; + std::map gotten_; + thread_vec_t threads_; + int num_inserters_; + CountDownLatch sync_latch_; +}; + +TEST(BlockingQueueTest, TestMultipleThreads) { + MultiThreadTest test; + test.Run(); +} + +} // namespace kudu diff --git a/src/kudu/util/blocking_queue.h b/src/kudu/util/blocking_queue.h new file mode 100644 index 000000000000..f20f787b8d4b --- /dev/null +++ b/src/kudu/util/blocking_queue.h @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_BLOCKING_QUEUE_H +#define KUDU_UTIL_BLOCKING_QUEUE_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/condition_variable.h" +#include "kudu/util/mutex.h" + +namespace kudu { + +// Return values for BlockingQueue::Put() +enum QueueStatus { + QUEUE_SUCCESS = 0, + QUEUE_SHUTDOWN = 1, + QUEUE_FULL = 2 +}; + +// Default logical length implementation: always returns 1. +struct DefaultLogicalSize { + template + static size_t logical_size(const T& /* unused */) { + return 1; + } +}; + +template +class BlockingQueue { + public: + // If T is a pointer, this will be the base type. If T is not a pointer, you + // can ignore this and the functions which make use of it. + // Template substitution failure is not an error. + typedef typename boost::remove_pointer::type T_VAL; + + explicit BlockingQueue(size_t max_size) + : shutdown_(false), + size_(0), + max_size_(max_size), + not_empty_(&lock_), + not_full_(&lock_) { + } + + // If the queue holds a bare pointer, it must be empty on destruction, since + // it may have ownership of the pointer. + ~BlockingQueue() { + DCHECK(list_.empty() || !std::is_pointer::value) + << "BlockingQueue holds bare pointers at destruction time"; + } + + // Get an element from the queue. Returns false if we were shut down prior to + // getting the element. 
+ bool BlockingGet(T *out) { + MutexLock l(lock_); + while (true) { + if (!list_.empty()) { + *out = list_.front(); + list_.pop_front(); + decrement_size_unlocked(*out); + not_full_.Signal(); + return true; + } + if (shutdown_) { + return false; + } + not_empty_.Wait(); + } + } + + // Get an element from the queue. Returns false if the queue is empty and + // we were shut down prior to getting the element. + bool BlockingGet(gscoped_ptr *out) { + T t = NULL; + bool got_element = BlockingGet(&t); + if (!got_element) { + return false; + } + out->reset(t); + return true; + } + + // Get all elements from the queue and append them to a + // vector. Returns false if shutdown prior to getting the elements. + bool BlockingDrainTo(std::vector* out) { + MutexLock l(lock_); + while (true) { + if (!list_.empty()) { + out->reserve(list_.size()); + for (const T& elt : list_) { + out->push_back(elt); + decrement_size_unlocked(elt); + } + list_.clear(); + not_full_.Signal(); + return true; + } + if (shutdown_) { + return false; + } + not_empty_.Wait(); + } + } + + // Attempts to put the given value in the queue. + // Returns: + // QUEUE_SUCCESS: if successfully inserted + // QUEUE_FULL: if the queue has reached max_size + // QUEUE_SHUTDOWN: if someone has already called Shutdown() + QueueStatus Put(const T &val) { + MutexLock l(lock_); + if (size_ >= max_size_) { + return QUEUE_FULL; + } + if (shutdown_) { + return QUEUE_SHUTDOWN; + } + list_.push_back(val); + increment_size_unlocked(val); + l.Unlock(); + not_empty_.Signal(); + return QUEUE_SUCCESS; + } + + // Returns the same as the other Put() overload above. + // If the element was inserted, the gscoped_ptr releases its contents. + QueueStatus Put(gscoped_ptr *val) { + QueueStatus s = Put(val->get()); + if (s == QUEUE_SUCCESS) { + ignore_result<>(val->release()); + } + return s; + } + + // Gets an element for the queue; if the queue is full, blocks until + // space becomes available. 
Returns false if we were shutdown prior + // to enqueueing the element. + bool BlockingPut(const T& val) { + MutexLock l(lock_); + while (true) { + if (shutdown_) { + return false; + } + if (size_ < max_size_) { + list_.push_back(val); + increment_size_unlocked(val); + l.Unlock(); + not_empty_.Signal(); + return true; + } + not_full_.Wait(); + } + } + + // Same as other BlockingPut() overload above. If the element was + // enqueued, gscoped_ptr releases its contents. + bool BlockingPut(gscoped_ptr* val) { + bool ret = Put(val->get()); + if (ret) { + ignore_result(val->release()); + } + return ret; + } + + // Shut down the queue. + // When a blocking queue is shut down, no more elements can be added to it, + // and Put() will return QUEUE_SHUTDOWN. + // Existing elements will drain out of it, and then BlockingGet will start + // returning false. + void Shutdown() { + MutexLock l(lock_); + shutdown_ = true; + not_full_.Broadcast(); + not_empty_.Broadcast(); + } + + bool empty() const { + MutexLock l(lock_); + return list_.empty(); + } + + size_t max_size() const { + return max_size_; + } + + std::string ToString() const { + std::string ret; + + MutexLock l(lock_); + for (const T& t : list_) { + ret.append(t->ToString()); + ret.append("\n"); + } + return ret; + } + + private: + + // Increments queue size. Must be called when 'lock_' is held. + void increment_size_unlocked(const T& t) { + size_ += LOGICAL_SIZE::logical_size(t); + } + + // Decrements queue size. Must be called when 'lock_' is held. 
+ void decrement_size_unlocked(const T& t) { + size_ -= LOGICAL_SIZE::logical_size(t); + } + + bool shutdown_; + size_t size_; + size_t max_size_; + mutable Mutex lock_; + ConditionVariable not_empty_; + ConditionVariable not_full_; + std::list list_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/bloom_filter-test.cc b/src/kudu/util/bloom_filter-test.cc new file mode 100644 index 000000000000..3cee98589d2a --- /dev/null +++ b/src/kudu/util/bloom_filter-test.cc @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include "kudu/util/bloom_filter.h" + +namespace kudu { + +static const int kRandomSeed = 0xdeadbeef; + +static void AddRandomKeys(int random_seed, int n_keys, BloomFilterBuilder *bf) { + srandom(random_seed); + for (int i = 0; i < n_keys; i++) { + uint64_t key = random(); + Slice key_slice(reinterpret_cast(&key), sizeof(key)); + BloomKeyProbe probe(key_slice); + bf->AddKey(probe); + } +} + +static void CheckRandomKeys(int random_seed, int n_keys, const BloomFilter &bf) { + srandom(random_seed); + for (int i = 0; i < n_keys; i++) { + uint64_t key = random(); + Slice key_slice(reinterpret_cast(&key), sizeof(key)); + BloomKeyProbe probe(key_slice); + ASSERT_TRUE(bf.MayContainKey(probe)); + } +} + +TEST(TestBloomFilter, TestInsertAndProbe) { + int n_keys = 2000; + BloomFilterBuilder bfb( + BloomFilterSizing::ByCountAndFPRate(n_keys, 0.01)); + + // Check that the desired false positive rate is achieved. + double expected_fp_rate = bfb.false_positive_rate(); + ASSERT_NEAR(expected_fp_rate, 0.01, 0.002); + + // 1% FP rate should need about 9 bits per key + ASSERT_EQ(9, bfb.n_bits() / n_keys); + + // Enter n_keys random keys into the bloom filter + AddRandomKeys(kRandomSeed, n_keys, &bfb); + + // Verify that the keys we inserted all return true when queried. + BloomFilter bf(bfb.slice(), bfb.n_hashes()); + CheckRandomKeys(kRandomSeed, n_keys, bf); + + // Query a bunch of other keys, and verify the false positive rate + // is within reasonable bounds. 
+ uint32_t num_queries = 100000; + uint32_t num_positives = 0; + for (int i = 0; i < num_queries; i++) { + uint64_t key = random(); + Slice key_slice(reinterpret_cast(&key), sizeof(key)); + BloomKeyProbe probe(key_slice); + if (bf.MayContainKey(probe)) { + num_positives++; + } + } + + double fp_rate = static_cast(num_positives) / static_cast(num_queries); + LOG(INFO) << "FP rate: " << fp_rate << " (" << num_positives << "/" << num_queries << ")"; + LOG(INFO) << "Expected FP rate: " << expected_fp_rate; + + // Actual FP rate should be within 20% of the estimated FP rate + ASSERT_NEAR(fp_rate, expected_fp_rate, 0.20*expected_fp_rate); +} + +} // namespace kudu diff --git a/src/kudu/util/bloom_filter.cc b/src/kudu/util/bloom_filter.cc new file mode 100644 index 000000000000..2b48b0daadc0 --- /dev/null +++ b/src/kudu/util/bloom_filter.cc @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/util/bloom_filter.h" +#include "kudu/util/bitmap.h" + +namespace kudu { + +static double kNaturalLog2 = 0.69314; + +static int ComputeOptimalHashCount(size_t n_bits, size_t elems) { + int n_hashes = n_bits * kNaturalLog2 / elems; + if (n_hashes < 1) n_hashes = 1; + return n_hashes; +} + +BloomFilterSizing BloomFilterSizing::ByCountAndFPRate( + size_t expected_count, double fp_rate) { + CHECK_GT(fp_rate, 0); + CHECK_LT(fp_rate, 1); + + double n_bits = -static_cast(expected_count) * log(fp_rate) + / kNaturalLog2 / kNaturalLog2; + int n_bytes = static_cast(ceil(n_bits / 8)); + CHECK_GT(n_bytes, 0) + << "expected_count: " << expected_count + << " fp_rate: " << fp_rate; + return BloomFilterSizing(n_bytes, expected_count); +} + +BloomFilterSizing BloomFilterSizing::BySizeAndFPRate(size_t n_bytes, double fp_rate) { + size_t n_bits = n_bytes * 8; + double expected_elems = -static_cast(n_bits) * kNaturalLog2 * kNaturalLog2 / + log(fp_rate); + DCHECK_GT(expected_elems, 1); + return BloomFilterSizing(n_bytes, (size_t)ceil(expected_elems)); +} + + +BloomFilterBuilder::BloomFilterBuilder(const BloomFilterSizing &sizing) + : n_bits_(sizing.n_bytes() * 8), + bitmap_(new uint8_t[sizing.n_bytes()]), + n_hashes_(ComputeOptimalHashCount(n_bits_, sizing.expected_count())), + expected_count_(sizing.expected_count()), + n_inserted_(0) { + Clear(); +} + +void BloomFilterBuilder::Clear() { + memset(&bitmap_[0], 0, n_bytes()); + n_inserted_ = 0; +} + +double BloomFilterBuilder::false_positive_rate() const { + CHECK_NE(expected_count_, 0) + << "expected_count_ not initialized: can't call this function on " + << "a BloomFilter initialized from external data"; + + return pow(1 - exp(-static_cast(n_hashes_) * expected_count_ / n_bits_), n_hashes_); +} + +BloomFilter::BloomFilter(const Slice &data, size_t n_hashes) + : n_bits_(data.size() * 8), + bitmap_(reinterpret_cast(data.data())), + n_hashes_(n_hashes) +{} + + + +} // namespace kudu diff --git 
a/src/kudu/util/bloom_filter.h b/src/kudu/util/bloom_filter.h new file mode 100644 index 000000000000..fb12022c2833 --- /dev/null +++ b/src/kudu/util/bloom_filter.h @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_BLOOM_FILTER_H +#define KUDU_UTIL_BLOOM_FILTER_H + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/slice.h" + +namespace kudu { + +// Probe calculated from a given key. This caches the calculated +// hash values which are necessary for probing into a Bloom Filter, +// so that when many bloom filters have to be consulted for a given +// key, we only need to calculate the hashes once. +// +// This is implemented based on the idea of double-hashing from the following paper: +// "Less Hashing, Same Performance: Building a Better Bloom Filter" +// Kirsch and Mitzenmacher, ESA 2006 +// http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf +// +// Currently, the implementation uses the 64-bit City Hash. +// TODO: an SSE CRC32 hash is probably ~20% faster. Come back to this +// at some point. 
+class BloomKeyProbe { + public: + // Default constructor - this is only used to instantiate an object + // and later reassign by assignment from another instance + BloomKeyProbe() {} + + // Construct a probe from the given key. + // + // NOTE: proper operation requires that the referenced memory remain + // valid for the lifetime of this object. + explicit BloomKeyProbe(const Slice &key) : key_(key) { + uint64_t h = util_hash::CityHash64( + reinterpret_cast(key.data()), + key.size()); + + // Use the top and bottom halves of the 64-bit hash + // as the two independent hash functions for mixing. + h_1_ = static_cast(h); + h_2_ = static_cast(h >> 32); + } + + const Slice &key() const { return key_; } + + // The initial hash value. See MixHash() for usage example. + uint32_t initial_hash() const { + return h_1_; + } + + // Mix the given hash function with the second calculated hash + // value. A sequence of independent hashes can be calculated + // by repeatedly calling MixHash() on its previous result. + uint32_t MixHash(uint32_t h) const { + return h + h_2_; + } + + private: + Slice key_; + + // The two hashes. + uint32_t h_1_; + uint32_t h_2_; +}; + +// Sizing parameters for the constructor to BloomFilterBuilder. +// This is simply to provide a nicer API than a bunch of overloaded +// constructors. +class BloomFilterSizing { + public: + // Size the bloom filter by a fixed size and false positive rate. + // + // Picks the number of entries to achieve the above. + static BloomFilterSizing BySizeAndFPRate(size_t n_bytes, double fp_rate); + + // Size the bloom filer by an expected count and false positive rate. + // + // Picks the number of bytes to achieve the above. 
+ static BloomFilterSizing ByCountAndFPRate(size_t expected_count, double fp_rate); + + size_t n_bytes() const { return n_bytes_; } + size_t expected_count() const { return expected_count_; } + + private: + BloomFilterSizing(size_t n_bytes, size_t expected_count) : + n_bytes_(n_bytes), + expected_count_(expected_count) + {} + + size_t n_bytes_; + size_t expected_count_; +}; + + +// Builder for a BloomFilter structure. +class BloomFilterBuilder { + public: + // Create a bloom filter. + // See BloomFilterSizing static methods to specify this argument. + explicit BloomFilterBuilder(const BloomFilterSizing &sizing); + + // Clear all entries, reset insertion count. + void Clear(); + + // Add the given key to the bloom filter. + void AddKey(const BloomKeyProbe &probe); + + // Return an estimate of the false positive rate. + double false_positive_rate() const; + + int n_bytes() const { + return n_bits_ / 8; + } + + int n_bits() const { + return n_bits_; + } + + // Return a slice view into this Bloom Filter, suitable for + // writing out to a file. + const Slice slice() const { + return Slice(&bitmap_[0], n_bytes()); + } + + // Return the number of hashes that are calculated for each entry + // in the bloom filter. + size_t n_hashes() const { return n_hashes_; } + + size_t expected_count() const { return expected_count_; } + + // Return the number of keys inserted. + size_t count() const { return n_inserted_; } + + private: + DISALLOW_COPY_AND_ASSIGN(BloomFilterBuilder); + + size_t n_bits_; + gscoped_array bitmap_; + + // The number of hash functions to compute. + size_t n_hashes_; + + // The expected number of elements, for which the bloom is optimized. + size_t expected_count_; + + // The number of elements inserted so far since the last Reset. + size_t n_inserted_; +}; + + +// Wrapper around a byte array for reading it as a bloom filter. 
+class BloomFilter { + public: + BloomFilter(const Slice &data, size_t n_hashes); + + // Return true if the filter may contain the given key. + bool MayContainKey(const BloomKeyProbe &probe) const; + + private: + friend class BloomFilterBuilder; + static uint32_t PickBit(uint32_t hash, size_t n_bits); + + size_t n_bits_; + const uint8_t *bitmap_; + + size_t n_hashes_; +}; + + +//////////////////////////////////////////////////////////// +// Inline implementations +//////////////////////////////////////////////////////////// + +inline uint32_t BloomFilter::PickBit(uint32_t hash, size_t n_bits) { + switch (n_bits) { + // Fast path for the default bloom filter block size. Bitwise math + // is much faster than division. + case 4096 * 8: + return hash & (n_bits - 1); + + default: + return hash % n_bits; + } +} + +inline void BloomFilterBuilder::AddKey(const BloomKeyProbe &probe) { + uint32_t h = probe.initial_hash(); + for (size_t i = 0; i < n_hashes_; i++) { + uint32_t bitpos = BloomFilter::PickBit(h, n_bits_); + BitmapSet(&bitmap_[0], bitpos); + h = probe.MixHash(h); + } + n_inserted_++; +} + +inline bool BloomFilter::MayContainKey(const BloomKeyProbe &probe) const { + uint32_t h = probe.initial_hash(); + + // Basic unrolling by 2s gives a small benefit here since the two bit positions + // can be calculated in parallel -- it's a 50% chance that the first will be + // set even if it's a bloom miss, in which case we can parallelize the load. 
+ int rem_hashes = n_hashes_; + while (rem_hashes >= 2) { + uint32_t bitpos1 = PickBit(h, n_bits_); + h = probe.MixHash(h); + uint32_t bitpos2 = PickBit(h, n_bits_); + h = probe.MixHash(h); + + if (!BitmapTest(&bitmap_[0], bitpos1) || + !BitmapTest(&bitmap_[0], bitpos2)) { + return false; + } + + rem_hashes -= 2; + } + + while (rem_hashes) { + uint32_t bitpos = PickBit(h, n_bits_); + if (!BitmapTest(&bitmap_[0], bitpos)) { + return false; + } + h = probe.MixHash(h); + rem_hashes--; + } + return true; +} + +} // namespace kudu + +#endif diff --git a/src/kudu/util/boost_mutex_utils.h b/src/kudu/util/boost_mutex_utils.h new file mode 100644 index 000000000000..41e475bce9e4 --- /dev/null +++ b/src/kudu/util/boost_mutex_utils.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_BOOST_MUTEX_UTILS_H +#define KUDU_BOOST_MUTEX_UTILS_H + + +// Similar to boost::lock_guard except that it takes +// a lock pointer, and checks against NULL. If the +// pointer is NULL, does nothing. Otherwise guards +// with the lock. 
+template +class lock_guard_maybe { + public: + explicit lock_guard_maybe(LockType *l) : + lock_(l) { + if (l != NULL) { + l->lock(); + } + } + + ~lock_guard_maybe() { + if (lock_ != NULL) { + lock_->unlock(); + } + } + + private: + LockType *lock_; +}; + +#endif diff --git a/src/kudu/util/cache-test.cc b/src/kudu/util/cache-test.cc new file mode 100644 index 000000000000..1aec114bb9ae --- /dev/null +++ b/src/kudu/util/cache-test.cc @@ -0,0 +1,246 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include + +#include +#include "kudu/util/cache.h" +#include "kudu/util/coding.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" + +#if defined(__linux__) +DECLARE_string(nvm_cache_path); +#endif // defined(__linux__) + +namespace kudu { + +// Conversions between numeric keys/values and the types expected by Cache. 
+static std::string EncodeKey(int k) { + faststring result; + PutFixed32(&result, k); + return result.ToString(); +} +static int DecodeKey(const Slice& k) { + assert(k.size() == 4); + return DecodeFixed32(k.data()); +} +static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { return reinterpret_cast(v); } + +class CacheTest : public KuduTest, + public ::testing::WithParamInterface, + public CacheDeleter { + public: + + // Implementation of the CacheDeleter interface + virtual void Delete(const Slice& key, void* v) OVERRIDE { + deleted_keys_.push_back(DecodeKey(key)); + deleted_values_.push_back(DecodeValue(v)); + } + std::vector deleted_keys_; + std::vector deleted_values_; + std::shared_ptr mem_tracker_; + gscoped_ptr cache_; + MetricRegistry metric_registry_; + + static const int kCacheSize = 14*1024*1024; + + virtual void SetUp() OVERRIDE { + +#if defined(__linux__) + if (google::GetCommandLineFlagInfoOrDie("nvm_cache_path").is_default) { + FLAGS_nvm_cache_path = GetTestPath("nvm-cache"); + ASSERT_OK(Env::Default()->CreateDir(FLAGS_nvm_cache_path)); + } +#endif // defined(__linux__) + + cache_.reset(NewLRUCache(GetParam(), kCacheSize, "cache_test")); + + MemTracker::FindTracker("cache_test-sharded_lru_cache", &mem_tracker_); + // Since nvm cache does not have memtracker due to the use of + // tcmalloc for this we only check for it in the DRAM case. + if (GetParam() == DRAM_CACHE) { + ASSERT_TRUE(mem_tracker_.get()); + } + + scoped_refptr entity = METRIC_ENTITY_server.Instantiate( + &metric_registry_, "test"); + cache_->SetMetrics(entity); + } + + int Lookup(int key) { + Cache::Handle* handle = cache_->Lookup(EncodeKey(key), Cache::EXPECT_IN_CACHE); + const int r = (handle == nullptr) ? 
-1 : DecodeValue(cache_->Value(handle)); + if (handle != nullptr) { + cache_->Release(handle); + } + return r; + } + + void Insert(int key, int value, int charge = 1) { + cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge, + this)); + } + + void Erase(int key) { + cache_->Erase(EncodeKey(key)); + } +}; + +#if defined(__linux__) +INSTANTIATE_TEST_CASE_P(CacheTypes, CacheTest, ::testing::Values(DRAM_CACHE, NVM_CACHE)); +#else +INSTANTIATE_TEST_CASE_P(CacheTypes, CacheTest, ::testing::Values(DRAM_CACHE)); +#endif // defined(__linux__) + +TEST_P(CacheTest, TrackMemory) { + if (mem_tracker_) { + Insert(100, 100, 1); + ASSERT_EQ(1, mem_tracker_->consumption()); + Erase(100); + ASSERT_EQ(0, mem_tracker_->consumption()); + ASSERT_EQ(1, mem_tracker_->peak_consumption()); + } +} + +TEST_P(CacheTest, HitAndMiss) { + ASSERT_EQ(-1, Lookup(100)); + + Insert(100, 101); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(200, 201); + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + Insert(100, 102); + ASSERT_EQ(102, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); + + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); +} + +TEST_P(CacheTest, Erase) { + Erase(200); + ASSERT_EQ(0, deleted_keys_.size()); + + Insert(100, 101); + Insert(200, 201); + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(201, Lookup(200)); + ASSERT_EQ(1, deleted_keys_.size()); +} + +TEST_P(CacheTest, EntriesArePinned) { + Insert(100, 101); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100), Cache::EXPECT_IN_CACHE); + ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + + Insert(100, 102); + Cache::Handle* h2 = 
cache_->Lookup(EncodeKey(100), Cache::EXPECT_IN_CACHE); + ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); + ASSERT_EQ(0, deleted_keys_.size()); + + cache_->Release(h1); + ASSERT_EQ(1, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(101, deleted_values_[0]); + + Erase(100); + ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(1, deleted_keys_.size()); + + cache_->Release(h2); + ASSERT_EQ(2, deleted_keys_.size()); + ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(102, deleted_values_[1]); +} + +TEST_P(CacheTest, EvictionPolicy) { + Insert(100, 101); + Insert(200, 201); + + const int kNumElems = 1000; + const int kSizePerElem = kCacheSize / kNumElems; + + // Frequently used entry must be kept around + for (int i = 0; i < kNumElems + 100; i++) { + Insert(1000+i, 2000+i, kSizePerElem); + ASSERT_EQ(2000+i, Lookup(1000+i)); + ASSERT_EQ(101, Lookup(100)); + } + ASSERT_EQ(101, Lookup(100)); + ASSERT_EQ(-1, Lookup(200)); +} + +TEST_P(CacheTest, HeavyEntries) { + // Add a bunch of light and heavy entries and then count the combined + // size of items still in the cache, which must be approximately the + // same as the total capacity. + const int kLight = kCacheSize/1000; + const int kHeavy = kCacheSize/100; + int added = 0; + int index = 0; + while (added < 2*kCacheSize) { + const int weight = (index & 1) ? kLight : kHeavy; + Insert(index, 1000+index, weight); + added += weight; + index++; + } + + int cached_weight = 0; + for (int i = 0; i < index; i++) { + const int weight = (i & 1 ? 
kLight : kHeavy); + int r = Lookup(i); + if (r >= 0) { + cached_weight += weight; + ASSERT_EQ(1000+i, r); + } + } + ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10); +} + +TEST_P(CacheTest, NewId) { + uint64_t a = cache_->NewId(); + uint64_t b = cache_->NewId(); + ASSERT_NE(a, b); +} + +} // namespace kudu diff --git a/src/kudu/util/cache.cc b/src/kudu/util/cache.cc new file mode 100644 index 000000000000..c9d63ce9e4d2 --- /dev/null +++ b/src/kudu/util/cache.cc @@ -0,0 +1,488 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include +#include +#include + +#include "kudu/gutil/atomic_refcount.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/atomic.h" +#include "kudu/util/cache.h" +#include "kudu/util/cache_metrics.h" +#include "kudu/util/locks.h" +#include "kudu/util/mem_tracker.h" +#include "kudu/util/metrics.h" + +#if !defined(__APPLE__) +#include "kudu/util/nvm_cache.h" +#endif + +namespace kudu { + +class MetricEntity; + +Cache::~Cache() { +} + +namespace { + +using std::shared_ptr; +using std::vector; + +typedef simple_spinlock MutexType; + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle { + void* value; + CacheDeleter* deleter; + LRUHandle* next_hash; + LRUHandle* next; + LRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + Atomic32 refs; + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + uint8_t key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. 
+class HandleTable { + public: + HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } + ~HandleTable() { delete[] list_; } + + LRUHandle* Lookup(const Slice& key, uint32_t hash) { + return *FindPointer(key, hash); + } + + LRUHandle* Insert(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h->key(), h->hash); + LRUHandle* old = *ptr; + h->next_hash = (old == nullptr ? nullptr : old->next_hash); + *ptr = h; + if (old == nullptr) { + ++elems_; + if (elems_ > length_) { + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } + } + return old; + } + + LRUHandle* Remove(const Slice& key, uint32_t hash) { + LRUHandle** ptr = FindPointer(key, hash); + LRUHandle* result = *ptr; + if (result != nullptr) { + *ptr = result->next_hash; + --elems_; + } + return result; + } + + private: + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + uint32_t length_; + uint32_t elems_; + LRUHandle** list_; + + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. 
+ LRUHandle** FindPointer(const Slice& key, uint32_t hash) { + LRUHandle** ptr = &list_[hash & (length_ - 1)]; + while (*ptr != nullptr && + ((*ptr)->hash != hash || key != (*ptr)->key())) { + ptr = &(*ptr)->next_hash; + } + return ptr; + } + + void Resize() { + uint32_t new_length = 16; + while (new_length < elems_ * 1.5) { + new_length *= 2; + } + auto new_list = new LRUHandle*[new_length]; + memset(new_list, 0, sizeof(new_list[0]) * new_length); + uint32_t count = 0; + for (uint32_t i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + LRUHandle* next = h->next_hash; + uint32_t hash = h->hash; + LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } + } + DCHECK_EQ(elems_, count); + delete[] list_; + list_ = new_list; + length_ = new_length; + } +}; + +// A single shard of sharded cache. +class LRUCache { + public: + explicit LRUCache(MemTracker* tracker); + ~LRUCache(); + + // Separate from constructor so caller can easily make an array of LRUCache + void SetCapacity(size_t capacity) { capacity_ = capacity; } + + void SetMetrics(CacheMetrics* metrics) { metrics_ = metrics; } + + // Like Cache methods, but with an extra "hash" parameter. + Cache::Handle* Insert(const Slice& key, uint32_t hash, + void* value, size_t charge, + CacheDeleter* deleter); + Cache::Handle* Lookup(const Slice& key, uint32_t hash, bool caching); + void Release(Cache::Handle* handle); + void Erase(const Slice& key, uint32_t hash); + + private: + void LRU_Remove(LRUHandle* e); + void LRU_Append(LRUHandle* e); + // Just reduce the reference count by 1. + // Return true if last reference + bool Unref(LRUHandle* e); + // Call deleter and free + void FreeEntry(LRUHandle* e); + + // Initialized before use. + size_t capacity_; + + // mutex_ protects the following state. + MutexType mutex_; + size_t usage_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. 
+ LRUHandle lru_; + + HandleTable table_; + + MemTracker* mem_tracker_; + + CacheMetrics* metrics_; +}; + +LRUCache::LRUCache(MemTracker* tracker) + : usage_(0), + mem_tracker_(tracker), + metrics_(nullptr) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; +} + +LRUCache::~LRUCache() { + for (LRUHandle* e = lru_.next; e != &lru_; ) { + LRUHandle* next = e->next; + DCHECK_EQ(e->refs, 1); // Error if caller has an unreleased handle + if (Unref(e)) { + FreeEntry(e); + } + e = next; + } +} + +bool LRUCache::Unref(LRUHandle* e) { + DCHECK_GT(ANNOTATE_UNPROTECTED_READ(e->refs), 0); + return !base::RefCountDec(&e->refs); +} + +void LRUCache::FreeEntry(LRUHandle* e) { + DCHECK_EQ(ANNOTATE_UNPROTECTED_READ(e->refs), 0); + e->deleter->Delete(e->key(), e->value); + mem_tracker_->Release(e->charge); + if (PREDICT_TRUE(metrics_)) { + metrics_->cache_usage->DecrementBy(e->charge); + metrics_->evictions->Increment(); + } + free(e); +} + +void LRUCache::LRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; + usage_ -= e->charge; +} + +void LRUCache::LRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; + usage_ += e->charge; +} + +Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash, bool caching) { + LRUHandle* e; + { + lock_guard l(&mutex_); + e = table_.Lookup(key, hash); + if (e != nullptr) { + base::RefCountInc(&e->refs); + LRU_Remove(e); + LRU_Append(e); + } + } + + // Do the metrics outside of the lock. 
+ if (metrics_) { + metrics_->lookups->Increment(); + bool was_hit = (e != nullptr); + if (was_hit) { + if (caching) { + metrics_->cache_hits_caching->Increment(); + } else { + metrics_->cache_hits->Increment(); + } + } else { + if (caching) { + metrics_->cache_misses_caching->Increment(); + } else { + metrics_->cache_misses->Increment(); + } + } + } + + return reinterpret_cast(e); +} + +void LRUCache::Release(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + bool last_reference = Unref(e); + if (last_reference) { + FreeEntry(e); + } +} + +Cache::Handle* LRUCache::Insert( + const Slice& key, uint32_t hash, void* value, size_t charge, + CacheDeleter *deleter) { + + LRUHandle* e = reinterpret_cast( + malloc(sizeof(LRUHandle)-1 + key.size())); + LRUHandle* to_remove_head = nullptr; + + e->value = value; + e->deleter = deleter; + e->charge = charge; + e->key_length = key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + memcpy(e->key_data, key.data(), key.size()); + mem_tracker_->Consume(charge); + if (PREDICT_TRUE(metrics_)) { + metrics_->cache_usage->IncrementBy(charge); + metrics_->inserts->Increment(); + } + + { + lock_guard l(&mutex_); + + LRU_Append(e); + + LRUHandle* old = table_.Insert(e); + if (old != nullptr) { + LRU_Remove(old); + if (Unref(old)) { + old->next = to_remove_head; + to_remove_head = old; + } + } + + while (usage_ > capacity_ && lru_.next != &lru_) { + LRUHandle* old = lru_.next; + LRU_Remove(old); + table_.Remove(old->key(), old->hash); + if (Unref(old)) { + old->next = to_remove_head; + to_remove_head = old; + } + } + } + + // we free the entries here outside of mutex for + // performance reasons + while (to_remove_head != nullptr) { + LRUHandle* next = to_remove_head->next; + FreeEntry(to_remove_head); + to_remove_head = next; + } + + return reinterpret_cast(e); +} + +void LRUCache::Erase(const Slice& key, uint32_t hash) { + LRUHandle* e; + bool last_reference = false; + { + 
lock_guard l(&mutex_); + e = table_.Remove(key, hash); + if (e != nullptr) { + LRU_Remove(e); + last_reference = Unref(e); + } + } + // mutex not held here + // last_reference will only be true if e != NULL + if (last_reference) { + FreeEntry(e); + } +} + +static const int kNumShardBits = 4; +static const int kNumShards = 1 << kNumShardBits; + +class ShardedLRUCache : public Cache { + private: + shared_ptr mem_tracker_; + gscoped_ptr metrics_; + vector shards_; + MutexType id_mutex_; + uint64_t last_id_; + + static inline uint32_t HashSlice(const Slice& s) { + return util_hash::CityHash64( + reinterpret_cast(s.data()), s.size()); + } + + static uint32_t Shard(uint32_t hash) { + return hash >> (32 - kNumShardBits); + } + + public: + explicit ShardedLRUCache(size_t capacity, const string& id) + : last_id_(0) { + // A cache is often a singleton, so: + // 1. We reuse its MemTracker if one already exists, and + // 2. It is directly parented to the root MemTracker. + mem_tracker_ = MemTracker::FindOrCreateTracker( + -1, strings::Substitute("$0-sharded_lru_cache", id)); + + const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards; + for (int s = 0; s < kNumShards; s++) { + gscoped_ptr shard(new LRUCache(mem_tracker_.get())); + shard->SetCapacity(per_shard); + shards_.push_back(shard.release()); + } + } + + virtual ~ShardedLRUCache() { + STLDeleteElements(&shards_); + } + + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + CacheDeleter* deleter) OVERRIDE { + const uint32_t hash = HashSlice(key); + return shards_[Shard(hash)]->Insert(key, hash, value, charge, deleter); + } + virtual Handle* Lookup(const Slice& key, CacheBehavior caching) OVERRIDE { + const uint32_t hash = HashSlice(key); + return shards_[Shard(hash)]->Lookup(key, hash, caching == EXPECT_IN_CACHE); + } + virtual void Release(Handle* handle) OVERRIDE { + LRUHandle* h = reinterpret_cast(handle); + shards_[Shard(h->hash)]->Release(handle); + } + virtual void Erase(const Slice& 
key) OVERRIDE { + const uint32_t hash = HashSlice(key); + shards_[Shard(hash)]->Erase(key, hash); + } + virtual void* Value(Handle* handle) OVERRIDE { + return reinterpret_cast(handle)->value; + } + virtual uint64_t NewId() OVERRIDE { + lock_guard l(&id_mutex_); + return ++(last_id_); + } + + virtual void SetMetrics(const scoped_refptr& entity) OVERRIDE { + metrics_.reset(new CacheMetrics(entity)); + for (LRUCache* cache : shards_) { + cache->SetMetrics(metrics_.get()); + } + } + + virtual uint8_t* Allocate(int bytes) OVERRIDE { + DCHECK_GE(bytes, 0); + return new uint8_t[bytes]; + } + + virtual void Free(uint8_t* ptr) OVERRIDE { + delete[] ptr; + } + + virtual uint8_t* MoveToHeap(uint8_t* ptr, int size) OVERRIDE { + // Our allocated pointers are always on the heap. + return ptr; + } + +}; + +} // end anonymous namespace + +Cache* NewLRUCache(CacheType type, size_t capacity, const string& id) { + switch (type) { + case DRAM_CACHE: + return new ShardedLRUCache(capacity, id); +#if !defined(__APPLE__) + case NVM_CACHE: + return NewLRUNvmCache(capacity, id); +#endif + default: + LOG(FATAL) << "Unsupported LRU cache type: " << type; + } +} + +} // namespace kudu diff --git a/src/kudu/util/cache.h b/src/kudu/util/cache.h new file mode 100644 index 000000000000..da1c9cbb8206 --- /dev/null +++ b/src/kudu/util/cache.h @@ -0,0 +1,158 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Cache is an interface that maps keys to values. It has internal +// synchronization and may be safely accessed concurrently from +// multiple threads. It may automatically evict entries to make room +// for new entries. Values have a specified charge against the cache +// capacity. For example, a cache where the values are variable +// length strings, may use the length of the string as the charge for +// the string. 
+// +// This is taken from LevelDB and evolved to fit the kudu codebase. +// +// TODO: this is pretty lock-heavy. Would be good to sub out something +// a little more concurrent. + +#ifndef KUDU_UTIL_CACHE_H_ +#define KUDU_UTIL_CACHE_H_ + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/slice.h" + +namespace kudu { + +class Cache; +struct CacheMetrics; +class MetricEntity; + +enum CacheType { + DRAM_CACHE, + NVM_CACHE +}; + +// Create a new cache with a fixed size capacity. This implementation +// of Cache uses a least-recently-used eviction policy. +Cache* NewLRUCache(CacheType type, size_t capacity, const std::string& id); + +// Callback interface for deleting a value stored in the cache. +// This is called when an inserted entry is no longer needed. +class CacheDeleter { + public: + // Delete the given 'value'. + // The key is only passed for convenenience -- the cache itself is + // responsible for managing the key's memory. + virtual void Delete(const Slice& key, void* value) = 0; + virtual ~CacheDeleter() {} +}; + +class Cache { + public: + Cache() { } + + // Destroys all existing entries by calling the "deleter" + // function that was passed to the constructor. + virtual ~Cache(); + + // Opaque handle to an entry stored in the cache. + struct Handle { }; + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // + // Returns a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + // + // Note that the 'key' Slice is copied into the internal storage of + // the cache. The caller may free or mutate the key data freely + // after this method returns. + // + // When the inserted entry is no longer needed, the cache object, key and + // value will be passed to "deleter". The deleter callback must remain + // valid until it is called. 
+ virtual Handle* Insert(const Slice& key, void* value, size_t charge, + CacheDeleter* deleter) = 0; + + // Passing EXPECT_IN_CACHE will increment the hit/miss metrics that track the number of times + // blocks were requested that the users were hoping to get the block from the cache, along with + // with the basic metrics. + // Passing NO_EXPECT_IN_CACHE will only increment the basic metrics. + // This helps in determining if we are effectively caching the blocks that matter the most. + enum CacheBehavior { + EXPECT_IN_CACHE, + NO_EXPECT_IN_CACHE + }; + + // If the cache has no mapping for "key", returns NULL. + // + // Else return a handle that corresponds to the mapping. The caller + // must call this->Release(handle) when the returned mapping is no + // longer needed. + virtual Handle* Lookup(const Slice& key, CacheBehavior caching) = 0; + + // Release a mapping returned by a previous Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void Release(Handle* handle) = 0; + + // Return the value encapsulated in a handle returned by a + // successful Lookup(). + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual void* Value(Handle* handle) = 0; + + // If the cache contains entry for key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + + // Return a new numeric id. May be used by multiple clients who are + // sharing the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + // Pass a metric entity in order to start recoding metrics. + virtual void SetMetrics(const scoped_refptr& metric_entity) = 0; + + // Allocate 'bytes' bytes from the cache's memory pool. 
+ // + // It is possible that this will return NULL if the cache is above its capacity + // and eviction fails to free up enough space for the requested allocation. + // + // NOTE: the returned memory is not automatically freed by the cache: the + // caller must either free it using Free(), or MoveToHeap() followed by + // delete[]. + virtual uint8_t* Allocate(int bytes) = 0; + + // Free 'ptr', which must have been previously allocated using 'Allocate'. + virtual void Free(uint8_t* ptr) = 0; + + // Moves 'ptr' to the normal C++ heap, if it is not already there. + // 'ptr' must have previously been allocated using Allocate(bytes). + // If 'ptr' is already on the C++ heap, then returns the same value. + // + // The returned value should be freed by the caller using the 'delete[]' + // operator. + virtual uint8_t* MoveToHeap(uint8_t* ptr, int bytes) = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(Cache); + + void LRU_Remove(Handle* e); + void LRU_Append(Handle* e); + void Unref(Handle* e); + + struct Rep; + Rep* rep_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/cache_metrics.cc b/src/kudu/util/cache_metrics.cc new file mode 100644 index 000000000000..ac2fadfdb4f2 --- /dev/null +++ b/src/kudu/util/cache_metrics.cc @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/cache_metrics.h" + +#include "kudu/util/metrics.h" + +METRIC_DEFINE_counter(server, block_cache_inserts, + "Block Cache Inserts", kudu::MetricUnit::kBlocks, + "Number of blocks inserted in the cache"); +METRIC_DEFINE_counter(server, block_cache_lookups, + "Block Cache Lookups", kudu::MetricUnit::kBlocks, + "Number of blocks looked up from the cache"); +METRIC_DEFINE_counter(server, block_cache_evictions, + "Block Cache Evictions", kudu::MetricUnit::kBlocks, + "Number of blocks evicted from the cache"); +METRIC_DEFINE_counter(server, block_cache_misses, + "Block Cache Misses", kudu::MetricUnit::kBlocks, + "Number of lookups that didn't yield a block"); +METRIC_DEFINE_counter(server, block_cache_misses_caching, + "Block Cache Misses (Caching)", kudu::MetricUnit::kBlocks, + "Number of lookups that were expecting a block that didn't yield one." + "Use this number instead of cache_misses when trying to determine how " + "efficient the cache is"); +METRIC_DEFINE_counter(server, block_cache_hits, + "Block Cache Hits", kudu::MetricUnit::kBlocks, + "Number of lookups that found a block"); +METRIC_DEFINE_counter(server, block_cache_hits_caching, + "Block Cache Hits (Caching)", kudu::MetricUnit::kBlocks, + "Number of lookups that were expecting a block that found one." 
+ "Use this number instead of cache_hits when trying to determine how " + "efficient the cache is"); + +METRIC_DEFINE_gauge_uint64(server, block_cache_usage, "Block Cache Memory Usage", + kudu::MetricUnit::kBytes, + "Memory consumed by the block cache"); + +namespace kudu { + +#define MINIT(member, x) member(METRIC_##x.Instantiate(entity)) +#define GINIT(member, x) member(METRIC_##x.Instantiate(entity, 0)) +CacheMetrics::CacheMetrics(const scoped_refptr& entity) + : MINIT(inserts, block_cache_inserts), + MINIT(lookups, block_cache_lookups), + MINIT(evictions, block_cache_evictions), + MINIT(cache_hits, block_cache_hits), + MINIT(cache_hits_caching, block_cache_hits_caching), + MINIT(cache_misses, block_cache_misses), + MINIT(cache_misses_caching, block_cache_misses_caching), + GINIT(cache_usage, block_cache_usage) { +} +#undef MINIT +#undef GINIT + +} // namespace kudu diff --git a/src/kudu/util/cache_metrics.h b/src/kudu/util/cache_metrics.h new file mode 100644 index 000000000000..47f759f225f5 --- /dev/null +++ b/src/kudu/util/cache_metrics.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_CACHE_METRICS_H +#define KUDU_UTIL_CACHE_METRICS_H + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" + +namespace kudu { + +template +class AtomicGauge; +class Counter; +class MetricEntity; + +struct CacheMetrics { + explicit CacheMetrics(const scoped_refptr& metric_entity); + + scoped_refptr inserts; + scoped_refptr lookups; + scoped_refptr evictions; + scoped_refptr cache_hits; + scoped_refptr cache_hits_caching; + scoped_refptr cache_misses; + scoped_refptr cache_misses_caching; + + scoped_refptr > cache_usage; +}; + +} // namespace kudu +#endif /* KUDU_UTIL_CACHE_METRICS_H */ diff --git a/src/kudu/util/callback_bind-test.cc b/src/kudu/util/callback_bind-test.cc new file mode 100644 index 000000000000..ded1b3a57c3c --- /dev/null +++ b/src/kudu/util/callback_bind-test.cc @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/gutil/bind.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/macros.h" + +#include + +namespace kudu { + +using std::string; + +static int Return5() { + return 5; +} + +TEST(CallbackBindTest, TestFreeFunction) { + Callback func_cb = Bind(&Return5); + ASSERT_EQ(5, func_cb.Run()); +} + +class Ref : public RefCountedThreadSafe { + public: + int Foo() { return 3; } +}; + +// Simple class that helps with verifying ref counting. +// Not thread-safe. +struct RefCountable { + RefCountable() + : refs(0) { + } + void AddRef() const { + refs++; + } + void Release() const { + refs--; + } + void Print() const { + LOG(INFO) << "Hello. Refs: " << refs; + } + + mutable int refs; + DISALLOW_COPY_AND_ASSIGN(RefCountable); +}; + +TEST(CallbackBindTest, TestClassMethod) { + scoped_refptr ref = new Ref(); + Callback ref_cb = Bind(&Ref::Foo, ref); + ref = nullptr; + ASSERT_EQ(3, ref_cb.Run()); +} + +int ReturnI(int i, const char* str) { + return i; +} + +TEST(CallbackBindTest, TestPartialBind) { + Callback cb = Bind(&ReturnI, 23); + ASSERT_EQ(23, cb.Run("hello world")); +} + +char IncrementChar(gscoped_ptr in) { + return *in + 1; +} + +TEST(CallbackBindTest, TestCallScopedPtrArg) { + // Calling a function with a gscoped_ptr argument is just like any other + // function which takes gscoped_ptr: + gscoped_ptr foo(new char('x')); + Callback)> cb = Bind(&IncrementChar); + ASSERT_EQ('y', cb.Run(foo.Pass())); +} + +TEST(CallbackBindTest, TestBindScopedPtrArg) { + // Binding a function with a gscoped_ptr argument requires using Passed() + gscoped_ptr foo(new char('x')); + Callback cb = Bind(&IncrementChar, Passed(&foo)); + ASSERT_EQ('y', cb.Run()); +} + +// Test that the ref counting functionality works. 
+TEST(CallbackBindTest, TestRefCounting) { + RefCountable countable; + { + ASSERT_EQ(0, countable.refs); + Closure cb = Bind(&RefCountable::Print, &countable); + ASSERT_EQ(1, countable.refs); + cb.Run(); + ASSERT_EQ(1, countable.refs); + } + ASSERT_EQ(0, countable.refs); +} + +} // namespace kudu diff --git a/src/kudu/util/coding-inl.h b/src/kudu/util/coding-inl.h new file mode 100644 index 000000000000..5fe0f9df6d69 --- /dev/null +++ b/src/kudu/util/coding-inl.h @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// Some portions Copyright (c) 2011 The LevelDB Authors. 
+// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef KUDU_UTIL_CODING_INL_H +#define KUDU_UTIL_CODING_INL_H + +#include +#include + +namespace kudu { + +inline uint8_t *InlineEncodeVarint32(uint8_t *dst, uint32_t v) { + // Operate on characters as unsigneds + uint8_t *ptr = dst; + static const int B = 128; + if (v < (1<<7)) { + *(ptr++) = v; + } else if (v < (1<<14)) { + *(ptr++) = v | B; + *(ptr++) = v>>7; + } else if (v < (1<<21)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = v>>14; + } else if (v < (1<<28)) { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = v>>21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v>>7) | B; + *(ptr++) = (v>>14) | B; + *(ptr++) = (v>>21) | B; + *(ptr++) = v>>28; + } + return ptr; +} + +inline void InlineEncodeFixed32(uint8_t *buf, uint32_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; +#endif +} + +inline void InlineEncodeFixed64(uint8_t *buf, uint64_t value) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + memcpy(buf, &value, sizeof(value)); +#else + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; +#endif +} + + +// Standard Put... 
routines append to a string +template +inline void InlinePutFixed32(StrType *dst, uint32_t value) { + uint8_t buf[sizeof(value)]; + InlineEncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); +} + +template +inline void InlinePutFixed64(StrType *dst, uint64_t value) { + uint8_t buf[sizeof(value)]; + InlineEncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); +} + +template +inline void InlinePutVarint32(StrType* dst, uint32_t v) { + // We resize the array and then size it back down as appropriate + // rather than using append(), since the generated code ends up + // being substantially shorter. + int old_size = dst->size(); + dst->resize(old_size + 5); + uint8_t* p = &(*dst)[old_size]; + uint8_t *ptr = InlineEncodeVarint32(p, v); + + dst->resize(old_size + ptr - p); +} + +} // namespace kudu + +#endif diff --git a/src/kudu/util/coding.cc b/src/kudu/util/coding.cc new file mode 100644 index 000000000000..bd3cfcd5eac5 --- /dev/null +++ b/src/kudu/util/coding.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "kudu/util/coding.h" +#include "kudu/util/coding-inl.h" + +namespace kudu { + +void PutVarint32(faststring* dst, uint32_t v) { + uint8_t buf[5]; + uint8_t* ptr = InlineEncodeVarint32(buf, v); + dst->append(buf, ptr - buf); +} + +uint8_t* EncodeVarint64(uint8_t* dst, uint64_t v) { + static const int B = 128; + while (v >= B) { + *(dst++) = (v & (B-1)) | B; + v >>= 7; + } + *(dst++) = static_cast(v); + return dst; +} + +void PutFixed32(faststring *dst, uint32_t value) { + InlinePutFixed32(dst, value); +} + +void PutFixed64(faststring *dst, uint64_t value) { + InlinePutFixed64(dst, value); +} + +void PutVarint64(faststring *dst, uint64_t v) { + uint8_t buf[10]; + uint8_t* ptr = EncodeVarint64(buf, v); + dst->append(buf, ptr - buf); +} + +void PutLengthPrefixedSlice(faststring* dst, const Slice& value) { + PutVarint32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +void PutFixed32LengthPrefixedSlice(faststring* dst, const Slice& value) { + PutFixed32(dst, value.size()); + dst->append(value.data(), value.size()); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +const uint8_t *GetVarint32PtrFallback(const uint8_t *p, + const uint8_t *limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *p; + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return p; + } + } + return nullptr; +} + +bool GetVarint32(Slice* input, uint32_t* value) { + const uint8_t *p = input->data(); + const uint8_t *limit = p + input->size(); + const uint8_t *q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const uint8_t *GetVarint64Ptr(const uint8_t *p, const uint8_t *limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t 
shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *p; + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return p; + } + } + return nullptr; +} + +bool GetVarint64(Slice* input, uint64_t* value) { + const uint8_t *p = input->data(); + const uint8_t *limit = p + input->size(); + const uint8_t *q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, limit - q); + return true; + } +} + +const uint8_t *GetLengthPrefixedSlice(const uint8_t *p, const uint8_t *limit, + Slice* result) { + uint32_t len = 0; + p = GetVarint32Ptr(p, limit, &len); + if (p == nullptr) return nullptr; + if (p + len > limit) return nullptr; + *result = Slice(p, len); + return p + len; +} + +bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && + input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +} // namespace kudu diff --git a/src/kudu/util/coding.h b/src/kudu/util/coding.h new file mode 100644 index 000000000000..698d92a03341 --- /dev/null +++ b/src/kudu/util/coding.h @@ -0,0 +1,110 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// Endian-neutral encoding: +// * Fixed-length numbers are encoded with least-significant byte first +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format + +#ifndef STORAGE_LEVELDB_UTIL_CODING_H_ +#define STORAGE_LEVELDB_UTIL_CODING_H_ + +#include +#include +#include + +#include "kudu/util/slice.h" +#include "kudu/util/faststring.h" + +namespace kudu { +extern void PutFixed32(faststring* dst, uint32_t value); +extern void PutFixed64(faststring* dst, uint64_t value); +extern void PutVarint32(faststring* dst, uint32_t value); +extern void PutVarint64(faststring* dst, uint64_t value); + +// Put a length-prefixed Slice into the buffer. The length prefix +// is varint-encoded. +extern void PutLengthPrefixedSlice(faststring* dst, const Slice& value); + +// Put a length-prefixed Slice into the buffer. The length prefix +// is 32-bit fixed encoded in little endian. +extern void PutFixed32LengthPrefixedSlice(faststring* dst, const Slice& value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// NULL on error. These routines only look at bytes in the range +// [p..limit-1] +extern const uint8_t *GetVarint32Ptr(const uint8_t *p,const uint8_t *limit, uint32_t* v); +extern const uint8_t *GetVarint64Ptr(const uint8_t *p,const uint8_t *limit, uint64_t* v); + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... 
that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +extern void EncodeFixed32(uint8_t *dst, uint32_t value); +extern void EncodeFixed64(uint8_t *dst, uint64_t value); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. +// REQUIRES: dst has enough space for the value being written +extern uint8_t *EncodeVarint32(uint8_t *dst, uint32_t value); +extern uint8_t *EncodeVarint64(uint8_t *dst, uint64_t value); + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. + +inline uint32_t DecodeFixed32(const uint8_t *ptr) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; +#else + return ((static_cast(static_cast(ptr[0]))) + | (static_cast(static_cast(ptr[1])) << 8) + | (static_cast(static_cast(ptr[2])) << 16) + | (static_cast(static_cast(ptr[3])) << 24)); +#endif +} + +inline uint64_t DecodeFixed64(const uint8_t *ptr) { +#if __BYTE_ORDER == __LITTLE_ENDIAN + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; +#else + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; +#endif +} + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const uint8_t *GetVarint32PtrFallback(const uint8_t *p, + const uint8_t *limit, + uint32_t* value); +inline const uint8_t *GetVarint32Ptr(const uint8_t *p, + const uint8_t *limit, + uint32_t* value) { + if (PREDICT_TRUE(p < limit)) { + uint32_t result = *p; + if (PREDICT_TRUE((result & 128) == 0)) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +} // namespace kudu + +#endif // STORAGE_LEVELDB_UTIL_CODING_H_ diff --git 
a/src/kudu/util/condition_variable.cc b/src/kudu/util/condition_variable.cc new file mode 100644 index 000000000000..13d1d36ea623 --- /dev/null +++ b/src/kudu/util/condition_variable.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/util/condition_variable.h" + +#include + +#include +#include + +#include "kudu/util/monotime.h" +#include "kudu/util/thread_restrictions.h" + +namespace kudu { + +ConditionVariable::ConditionVariable(Mutex* user_lock) + : user_mutex_(&user_lock->native_handle_) +#if !defined(NDEBUG) + , user_lock_(user_lock) +#endif +{ + int rv = 0; + // http://crbug.com/293736 + // NaCl doesn't support monotonic clock based absolute deadlines. + // On older Android platform versions, it's supported through the + // non-standard pthread_cond_timedwait_monotonic_np. Newer platform + // versions have pthread_condattr_setclock. + // Mac can use relative time deadlines. +#if !defined(__APPLE__) && !defined(OS_NACL) && \ + !(defined(OS_ANDROID) && defined(HAVE_PTHREAD_COND_TIMEDWAIT_MONOTONIC)) + pthread_condattr_t attrs; + rv = pthread_condattr_init(&attrs); + DCHECK_EQ(0, rv); + pthread_condattr_setclock(&attrs, CLOCK_MONOTONIC); + rv = pthread_cond_init(&condition_, &attrs); + pthread_condattr_destroy(&attrs); +#else + rv = pthread_cond_init(&condition_, nullptr); +#endif + DCHECK_EQ(0, rv); +} + +ConditionVariable::~ConditionVariable() { +#if defined(OS_MACOSX) + // This hack is necessary to avoid a fatal pthreads subsystem bug in the + // Darwin kernel. 
https://codereview.chromium.org/1323293005/ + { + Mutex lock; + MutexLock l(lock); + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 1; + pthread_cond_timedwait_relative_np(&condition_, lock.native_handle, &ts); + } +#endif + int rv = pthread_cond_destroy(&condition_); + DCHECK_EQ(0, rv); +} + +void ConditionVariable::Wait() const { + ThreadRestrictions::AssertWaitAllowed(); +#if !defined(NDEBUG) + user_lock_->CheckHeldAndUnmark(); +#endif + int rv = pthread_cond_wait(&condition_, user_mutex_); + DCHECK_EQ(0, rv); +#if !defined(NDEBUG) + user_lock_->CheckUnheldAndMark(); +#endif +} + +bool ConditionVariable::TimedWait(const MonoDelta& max_time) const { + ThreadRestrictions::AssertWaitAllowed(); + + // Negative delta means we've already timed out. + int64 nsecs = max_time.ToNanoseconds(); + if (nsecs < 0) { + return false; + } + + struct timespec relative_time; + max_time.ToTimeSpec(&relative_time); + +#if !defined(NDEBUG) + user_lock_->CheckHeldAndUnmark(); +#endif + +#if defined(__APPLE__) + int rv = pthread_cond_timedwait_relative_np( + &condition_, user_mutex_, &relative_time); +#else + // The timeout argument to pthread_cond_timedwait is in absolute time. + struct timespec absolute_time; +#if defined(OS_NACL) + // See comment in constructor for why this is different in NaCl. 
+ struct timeval now; + gettimeofday(&now, NULL); + absolute_time.tv_sec = now.tv_sec; + absolute_time.tv_nsec = now.tv_usec * MonoTime::kNanosecondsPerMicrosecond; +#else + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + absolute_time.tv_sec = now.tv_sec; + absolute_time.tv_nsec = now.tv_nsec; +#endif + + absolute_time.tv_sec += relative_time.tv_sec; + absolute_time.tv_nsec += relative_time.tv_nsec; + absolute_time.tv_sec += absolute_time.tv_nsec / MonoTime::kNanosecondsPerSecond; + absolute_time.tv_nsec %= MonoTime::kNanosecondsPerSecond; + DCHECK_GE(absolute_time.tv_sec, now.tv_sec); // Overflow paranoia + +#if defined(OS_ANDROID) && defined(HAVE_PTHREAD_COND_TIMEDWAIT_MONOTONIC) + int rv = pthread_cond_timedwait_monotonic_np( + &condition_, user_mutex_, &absolute_time); +#else + int rv = pthread_cond_timedwait(&condition_, user_mutex_, &absolute_time); +#endif // OS_ANDROID && HAVE_PTHREAD_COND_TIMEDWAIT_MONOTONIC +#endif // __APPLE__ + + DCHECK(rv == 0 || rv == ETIMEDOUT) + << "unexpected pthread_cond_timedwait return value: " << rv; +#if !defined(NDEBUG) + user_lock_->CheckUnheldAndMark(); +#endif + return rv == 0; +} + +void ConditionVariable::Broadcast() { + int rv = pthread_cond_broadcast(&condition_); + DCHECK_EQ(0, rv); +} + +void ConditionVariable::Signal() { + int rv = pthread_cond_signal(&condition_); + DCHECK_EQ(0, rv); +} + +} // namespace kudu diff --git a/src/kudu/util/condition_variable.h b/src/kudu/util/condition_variable.h new file mode 100644 index 000000000000..ca6e265f26a5 --- /dev/null +++ b/src/kudu/util/condition_variable.h @@ -0,0 +1,113 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// ConditionVariable wraps pthreads condition variable synchronization or, on +// Windows, simulates it. 
This functionality is very helpful for having +// several threads wait for an event, as is common with a thread pool managed +// by a master. The meaning of such an event in the (worker) thread pool +// scenario is that additional tasks are now available for processing. It is +// used in Chrome in the DNS prefetching system to notify worker threads that +// a queue now has items (tasks) which need to be tended to. A related use +// would have a pool manager waiting on a ConditionVariable, waiting for a +// thread in the pool to announce (signal) that there is now more room in a +// (bounded size) communications queue for the manager to deposit tasks, or, +// as a second example, that the queue of tasks is completely empty and all +// workers are waiting. +// +// USAGE NOTE 1: spurious signal events are possible with this and +// most implementations of condition variables. As a result, be +// *sure* to retest your condition before proceeding. The following +// is a good example of doing this correctly: +// +// while (!work_to_be_done()) Wait(...); +// +// In contrast do NOT do the following: +// +// if (!work_to_be_done()) Wait(...); // Don't do this. +// +// Especially avoid the above if you are relying on some other thread only +// issuing a signal up *if* there is work-to-do. There can/will +// be spurious signals. Recheck state on waiting thread before +// assuming the signal was intentional. Caveat caller ;-). +// +// USAGE NOTE 2: Broadcast() frees up all waiting threads at once, +// which leads to contention for the locks they all held when they +// called Wait(). This results in POOR performance. A much better +// approach to getting a lot of threads out of Wait() is to have each +// thread (upon exiting Wait()) call Signal() to free up another +// Wait'ing thread. Look at condition_variable_unittest.cc for +// both examples. +// +// Broadcast() can be used nicely during teardown, as it gets the job +// done, and leaves no sleeping threads... 
and performance is less +// critical at that point. +// +// The semantics of Broadcast() are carefully crafted so that *all* +// threads that were waiting when the request was made will indeed +// get signaled. Some implementations mess up, and don't signal them +// all, while others allow the wait to be effectively turned off (for +// a while while waiting threads come around). This implementation +// appears correct, as it will not "lose" any signals, and will guarantee +// that all threads get signaled by Broadcast(). +// +// This implementation offers support for "performance" in its selection of +// which thread to revive. Performance, in direct contrast with "fairness," +// assures that the thread that most recently began to Wait() is selected by +// Signal to revive. Fairness would (if publicly supported) assure that the +// thread that has Wait()ed the longest is selected. The default policy +// may improve performance, as the selected thread may have a greater chance of +// having some of its stack data in various CPU caches. +// +// For a discussion of the many very subtle implementation details, see the FAQ +// at the end of condition_variable_win.cc. + +#ifndef BASE_SYNCHRONIZATION_CONDITION_VARIABLE_H_ +#define BASE_SYNCHRONIZATION_CONDITION_VARIABLE_H_ + +#include + +#include "kudu/util/monotime.h" +#include "kudu/util/mutex.h" + +namespace kudu { + +class ConditionVarImpl; +class TimeDelta; + +class ConditionVariable { + public: + // Construct a cv for use with ONLY one user lock. + explicit ConditionVariable(Mutex* user_lock); + + ~ConditionVariable(); + + // Wait() releases the caller's critical section atomically as it starts to + // sleep, and the reacquires it when it is signaled. + void Wait() const; + + // Like Wait(), but only waits up to a limited amount of time. + // + // Returns true if we were Signal()'ed, or false if 'max_time' elapsed. + bool TimedWait(const MonoDelta& max_time) const; + + // Broadcast() revives all waiting threads. 
+ void Broadcast(); + // Signal() revives one waiting thread. + void Signal(); + + private: + + mutable pthread_cond_t condition_; + pthread_mutex_t* user_mutex_; + +#if !defined(NDEBUG) + Mutex* user_lock_; // Needed to adjust shadow lock state on wait. +#endif + + DISALLOW_COPY_AND_ASSIGN(ConditionVariable); +}; + +} // namespace kudu + +#endif // BASE_SYNCHRONIZATION_CONDITION_VARIABLE_H_ diff --git a/src/kudu/util/countdown_latch-test.cc b/src/kudu/util/countdown_latch-test.cc new file mode 100644 index 000000000000..cf2517c3443b --- /dev/null +++ b/src/kudu/util/countdown_latch-test.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/util/countdown_latch.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadpool.h" + +namespace kudu { + +static void DecrementLatch(CountDownLatch* latch, int amount) { + if (amount == 1) { + latch->CountDown(); + return; + } + latch->CountDown(amount); +} + +// Tests that we can decrement the latch by arbitrary amounts, as well +// as 1 by one. 
+TEST(TestCountDownLatch, TestLatch) { + + gscoped_ptr pool; + ASSERT_OK(ThreadPoolBuilder("cdl-test").set_max_threads(1).Build(&pool)); + + CountDownLatch latch(1000); + + // Decrement the count by 1 in another thread, this should not fire the + // latch. + ASSERT_OK(pool->SubmitFunc(boost::bind(DecrementLatch, &latch, 1))); + ASSERT_FALSE(latch.WaitFor(MonoDelta::FromMilliseconds(200))); + ASSERT_EQ(999, latch.count()); + + // Now decrement by 1000 this should decrement to 0 and fire the latch + // (even though 1000 is one more than the current count). + ASSERT_OK(pool->SubmitFunc(boost::bind(DecrementLatch, &latch, 1000))); + latch.Wait(); + ASSERT_EQ(0, latch.count()); +} + +// Test that resetting to zero while there are waiters lets the waiters +// continue. +TEST(TestCountDownLatch, TestResetToZero) { + CountDownLatch cdl(100); + scoped_refptr t; + ASSERT_OK(Thread::Create("test", "cdl-test", &CountDownLatch::Wait, &cdl, &t)); + + // Sleep for a bit until it's likely the other thread is waiting on the latch. + SleepFor(MonoDelta::FromMilliseconds(10)); + cdl.Reset(0); + t->Join(); +} + +} // namespace kudu diff --git a/src/kudu/util/countdown_latch.h b/src/kudu/util/countdown_latch.h new file mode 100644 index 000000000000..c7d2693a2eed --- /dev/null +++ b/src/kudu/util/countdown_latch.h @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_COUNTDOWN_LATCH_H +#define KUDU_UTIL_COUNTDOWN_LATCH_H + +#include "kudu/gutil/macros.h" +#include "kudu/util/condition_variable.h" +#include "kudu/util/monotime.h" +#include "kudu/util/mutex.h" +#include "kudu/util/thread_restrictions.h" + +namespace kudu { + +// This is a C++ implementation of the Java CountDownLatch +// class. +// See http://docs.oracle.com/javase/6/docs/api/java/util/concurrent/CountDownLatch.html +class CountDownLatch { + public: + // Initialize the latch with the given initial count. + explicit CountDownLatch(int count) + : cond_(&lock_), + count_(count) { + } + + // Decrement the count of this latch by 'amount' + // If the new count is less than or equal to zero, then all waiting threads are woken up. + // If the count is already zero, this has no effect. + void CountDown(int amount) { + DCHECK_GE(amount, 0); + MutexLock lock(lock_); + if (count_ == 0) { + return; + } + + if (amount >= count_) { + count_ = 0; + } else { + count_ -= amount; + } + + if (count_ == 0) { + // Latch has triggered. + cond_.Broadcast(); + } + } + + // Decrement the count of this latch. + // If the new count is zero, then all waiting threads are woken up. + // If the count is already zero, this has no effect. + void CountDown() { + CountDown(1); + } + + // Wait until the count on the latch reaches zero. + // If the count is already zero, this returns immediately. 
+ void Wait() const { + ThreadRestrictions::AssertWaitAllowed(); + MutexLock lock(lock_); + while (count_ > 0) { + cond_.Wait(); + } + } + + // Waits for the count on the latch to reach zero, or until 'until' time is reached. + // Returns true if the count became zero, false otherwise. + bool WaitUntil(const MonoTime& when) const { + ThreadRestrictions::AssertWaitAllowed(); + MonoDelta relative = when.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + return WaitFor(relative); + } + + // Waits for the count on the latch to reach zero, or until 'delta' time elapses. + // Returns true if the count became zero, false otherwise. + bool WaitFor(const MonoDelta& delta) const { + ThreadRestrictions::AssertWaitAllowed(); + MutexLock lock(lock_); + while (count_ > 0) { + if (!cond_.TimedWait(delta)) { + return false; + } + } + return true; + } + + // Reset the latch with the given count. This is equivalent to reconstructing + // the latch. If 'count' is 0, and there are currently waiters, those waiters + // will be triggered as if you counted down to 0. + void Reset(uint64_t count) { + MutexLock lock(lock_); + count_ = count; + if (count_ == 0) { + // Awake any waiters if we reset to 0. + cond_.Broadcast(); + } + } + + uint64_t count() const { + MutexLock lock(lock_); + return count_; + } + + private: + DISALLOW_COPY_AND_ASSIGN(CountDownLatch); + mutable Mutex lock_; + ConditionVariable cond_; + + uint64_t count_; +}; + +// Utility class which calls latch->CountDown() in its destructor. 
+class CountDownOnScopeExit { + public: + explicit CountDownOnScopeExit(CountDownLatch *latch) : latch_(latch) {} + ~CountDownOnScopeExit() { + latch_->CountDown(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(CountDownOnScopeExit); + + CountDownLatch *latch_; +}; + +} // namespace kudu +#endif diff --git a/src/kudu/util/cow_object.h b/src/kudu/util/cow_object.h new file mode 100644 index 000000000000..10c019e59325 --- /dev/null +++ b/src/kudu/util/cow_object.h @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_COW_OBJECT_H +#define KUDU_UTIL_COW_OBJECT_H + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/rwc_lock.h" + +namespace kudu { + +// An object which manages its state via copy-on-write. +// +// Access to this object can be done more conveniently using the +// CowLock template class defined below. +// +// The 'State' template parameter must be swappable using std::swap. 
+template +class CowObject { + public: + CowObject() {} + ~CowObject() {} + + void ReadLock() const { + lock_.ReadLock(); + } + + void ReadUnlock() const { + lock_.ReadUnlock(); + } + + // Lock the object for write (preventing concurrent mutators), and make a safe + // copy of the object to mutate. + void StartMutation() { + lock_.WriteLock(); + // Clone our object. + dirty_state_.reset(new State(state_)); + } + + // Abort the current mutation. This drops the write lock without applying any + // changes made to the mutable copy. + void AbortMutation() { + dirty_state_.reset(); + lock_.WriteUnlock(); + } + + // Commit the current mutation. This escalates to the "Commit" lock, which + // blocks any concurrent readers or writers, swaps in the new version of the + // State, and then drops the commit lock. + void CommitMutation() { + lock_.UpgradeToCommitLock(); + CHECK(dirty_state_); + std::swap(state_, *dirty_state_); + dirty_state_.reset(); + lock_.CommitUnlock(); + } + + // Return the current state, not reflecting any in-progress mutations. + State& state() { + DCHECK(lock_.HasReaders() || lock_.HasWriteLock()); + return state_; + } + + const State& state() const { + DCHECK(lock_.HasReaders() || lock_.HasWriteLock()); + return state_; + } + + // Returns the current dirty state (i.e reflecting in-progress mutations). + // Should only be called by a thread who previously called StartMutation(). + State* mutable_dirty() { + DCHECK(lock_.HasWriteLock()); + return DCHECK_NOTNULL(dirty_state_.get()); + } + + const State& dirty() const { + return *DCHECK_NOTNULL(dirty_state_.get()); + } + + private: + mutable RWCLock lock_; + + State state_; + gscoped_ptr dirty_state_; + + DISALLOW_COPY_AND_ASSIGN(CowObject); +}; + +// A lock-guard-like scoped object to acquire the lock on a CowObject, +// and obtain a pointer to the correct copy to read/write. +// +// Example usage: +// +// CowObject my_obj; +// { +// CowLock l(&my_obj, CowLock::READ); +// l.data().get_foo(); +// ... 
+// } +// { +// CowLock l(&my_obj, CowLock::WRITE); +// l->mutable_data()->set_foo(...); +// ... +// l.Commit(); +// } +template +class CowLock { + public: + enum LockMode { + READ, WRITE, RELEASED + }; + + // Lock in either read or write mode. + CowLock(CowObject* cow, + LockMode mode) + : cow_(cow), + mode_(mode) { + if (mode == READ) { + cow_->ReadLock(); + } else if (mode_ == WRITE) { + cow_->StartMutation(); + } else { + LOG(FATAL) << "Cannot lock in mode " << mode; + } + } + + // Lock in read mode. + // A const object may not be locked in write mode. + CowLock(const CowObject* info, + LockMode mode) + : cow_(const_cast*>(info)), + mode_(mode) { + if (mode == READ) { + cow_->ReadLock(); + } else if (mode_ == WRITE) { + LOG(FATAL) << "Cannot write-lock a const pointer"; + } else { + LOG(FATAL) << "Cannot lock in mode " << mode; + } + } + + // Commit the underlying object. + // Requires that the caller hold the lock in write mode. + void Commit() { + DCHECK_EQ(WRITE, mode_); + cow_->CommitMutation(); + mode_ = RELEASED; + } + + void Unlock() { + if (mode_ == READ) { + cow_->ReadUnlock(); + } else if (mode_ == WRITE) { + cow_->AbortMutation(); + } else { + DCHECK_EQ(RELEASED, mode_); + } + mode_ = RELEASED; + } + + // Obtain the underlying data. In WRITE mode, this returns the + // same data as mutable_data() (not the safe unchanging copy). + const State& data() const { + if (mode_ == READ) { + return cow_->state(); + } else if (mode_ == WRITE) { + return cow_->dirty(); + } else { + LOG(FATAL) << "Cannot access data after committing"; + } + } + + // Obtain the mutable data. This may only be called in WRITE mode. + State* mutable_data() { + if (mode_ == READ) { + LOG(FATAL) << "Cannot mutate data with READ lock"; + } else if (mode_ == WRITE) { + return cow_->mutable_dirty(); + } else { + LOG(FATAL) << "Cannot access data after committing"; + } + } + + bool is_write_locked() const { + return mode_ == WRITE; + } + + // Drop the lock. 
If the lock is held in WRITE mode, and the + // lock has not yet been released, aborts the mutation, restoring + // the underlying object to its original data. + ~CowLock() { + Unlock(); + } + + private: + CowObject* cow_; + LockMode mode_; + DISALLOW_COPY_AND_ASSIGN(CowLock); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_COW_OBJECT_H */ diff --git a/src/kudu/util/crc-test.cc b/src/kudu/util/crc-test.cc new file mode 100644 index 000000000000..6da023f38aa0 --- /dev/null +++ b/src/kudu/util/crc-test.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/crc.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace crc { + +using strings::Substitute; + +class CrcTest : public KuduTest { + protected: + + // Returns pointer to data which must be deleted by caller. 
+ static void GenerateBenchmarkData(const uint8_t** bufptr, size_t* buflen) { + const uint32_t kNumNumbers = 1000000; + const uint32_t kBytesPerNumber = sizeof(uint32_t); + const uint32_t kLength = kNumNumbers * kBytesPerNumber; + auto buf = new uint8_t[kLength]; + for (uint32_t i = 0; i < kNumNumbers; i++) { + memcpy(buf + (i * kBytesPerNumber), &i, kBytesPerNumber); + } + *bufptr = buf; + *buflen = kLength; + } + +}; + +// Basic functionality test. +TEST_F(CrcTest, TestCRC32C) { + const string test_data("abcdefgh"); + Crc* crc32c = GetCrc32cInstance(); + uint64_t data_crc = 0; + crc32c->Compute(test_data.data(), test_data.length(), &data_crc); + char buf[kFastToBufferSize]; + const char* output = FastHex64ToBuffer(data_crc, buf); + LOG(INFO) << "CRC32C of " << test_data << " is: 0x" << output << " (full 64 bits)"; + output = FastHex32ToBuffer(static_cast(data_crc), buf); + LOG(INFO) << "CRC32C of " << test_data << " is: 0x" << output << " (truncated 32 bits)"; + ASSERT_EQ(0xa9421b7, data_crc); // Known value from crcutil usage test program. +} + +// Simple benchmark of CRC32C throughput. +// We should expect about 8 bytes per cycle in throughput on a single core. 
+TEST_F(CrcTest, BenchmarkCRC32C) { + gscoped_ptr data; + const uint8_t* buf; + size_t buflen; + GenerateBenchmarkData(&buf, &buflen); + data.reset(buf); + Crc* crc32c = GetCrc32cInstance(); + int kNumRuns = 1000; + if (AllowSlowTests()) { + kNumRuns = 40000; + } + const uint64_t kNumBytes = kNumRuns * buflen; + Stopwatch sw; + sw.start(); + for (int i = 0; i < kNumRuns; i++) { + uint64_t cksum; + crc32c->Compute(buf, buflen, &cksum); + } + sw.stop(); + CpuTimes elapsed = sw.elapsed(); + LOG(INFO) << Substitute("$0 runs of CRC32C on $1 bytes of data (total: $2 bytes)" + " in $3 seconds; $4 bytes per millisecond, $5 bytes per nanosecond!", + kNumRuns, buflen, kNumBytes, elapsed.wall_seconds(), + (kNumBytes / elapsed.wall_millis()), + (kNumBytes / elapsed.wall)); +} + +} // namespace crc +} // namespace kudu diff --git a/src/kudu/util/crc.cc b/src/kudu/util/crc.cc new file mode 100644 index 000000000000..7be2709de467 --- /dev/null +++ b/src/kudu/util/crc.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/util/crc.h" + +#include + +#include "kudu/gutil/once.h" +#include "kudu/util/debug/leakcheck_disabler.h" + +namespace kudu { +namespace crc { + +using debug::ScopedLeakCheckDisabler; + +static GoogleOnceType crc32c_once = GOOGLE_ONCE_INIT; +static Crc* crc32c_instance = nullptr; + +static void InitCrc32cInstance() { + ScopedLeakCheckDisabler disabler; // CRC instance is never freed. + // TODO: Is initial = 0 and roll window = 4 appropriate for all cases? + crc32c_instance = crcutil_interface::CRC::CreateCrc32c(true, 0, 4, nullptr); +} + +Crc* GetCrc32cInstance() { + GoogleOnceInit(&crc32c_once, &InitCrc32cInstance); + return crc32c_instance; +} + +uint32_t Crc32c(const void* data, size_t length) { + uint64_t crc32 = 0; + GetCrc32cInstance()->Compute(data, length, &crc32); + return static_cast(crc32); // Only uses lower 32 bits. +} + +} // namespace crc +} // namespace kudu diff --git a/src/kudu/util/crc.h b/src/kudu/util/crc.h new file mode 100644 index 000000000000..3c2277a1af9a --- /dev/null +++ b/src/kudu/util/crc.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_CRC_H_ +#define KUDU_UTIL_CRC_H_ + +#include +#include + +#include + +namespace kudu { +namespace crc { + +typedef crcutil_interface::CRC Crc; + +// Returns pointer to singleton instance of CRC32C implementation. +Crc* GetCrc32cInstance(); + +// Helper function to simply calculate a CRC32C of the given data. +uint32_t Crc32c(const void* data, size_t length); + +} // namespace crc +} // namespace kudu + +#endif // KUDU_UTIL_CRC_H_ diff --git a/src/kudu/util/curl_util.cc b/src/kudu/util/curl_util.cc new file mode 100644 index 000000000000..d3265b882644 --- /dev/null +++ b/src/kudu/util/curl_util.cc @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/curl_util.h" + +#include "kudu/gutil/strings/substitute.h" + +#include +#include + +namespace kudu { + +namespace { + +inline Status TranslateError(CURLcode code) { + if (code == CURLE_OK) { + return Status::OK(); + } + return Status::NetworkError("curl error", curl_easy_strerror(code)); +} + +extern "C" { +size_t WriteCallback(void* buffer, size_t size, size_t nmemb, void* user_ptr) { + size_t real_size = size * nmemb; + faststring* buf = reinterpret_cast(user_ptr); + CHECK_NOTNULL(buf)->append(reinterpret_cast(buffer), real_size); + return real_size; +} +} // extern "C" + +} // anonymous namespace + +EasyCurl::EasyCurl() { + curl_ = curl_easy_init(); + CHECK(curl_) << "Could not init curl"; +} + +EasyCurl::~EasyCurl() { + curl_easy_cleanup(curl_); +} + +Status EasyCurl::FetchURL(const std::string& url, faststring* buf) { + return DoRequest(url, nullptr, buf); +} + +Status EasyCurl::PostToURL(const std::string& url, + const std::string& post_data, + faststring* dst) { + return DoRequest(url, &post_data, dst); +} + +Status EasyCurl::DoRequest(const std::string& url, + const std::string* post_data, + faststring* dst) { + CHECK_NOTNULL(dst)->clear(); + + RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_URL, url.c_str()))); + RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION, WriteCallback))); + RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_WRITEDATA, + static_cast(dst)))); + if (post_data) { + RETURN_NOT_OK(TranslateError(curl_easy_setopt(curl_, CURLOPT_POSTFIELDS, + post_data->c_str()))); + } + + RETURN_NOT_OK(TranslateError(curl_easy_perform(curl_))); + long rc; // NOLINT(runtime/int) curl wants a long + RETURN_NOT_OK(TranslateError(curl_easy_getinfo(curl_, CURLINFO_RESPONSE_CODE, &rc))); + if (rc != 200) { + return Status::RemoteError(strings::Substitute("HTTP $0", rc)); + } + + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/curl_util.h b/src/kudu/util/curl_util.h 
new file mode 100644 index 000000000000..b7e2da9acd3d --- /dev/null +++ b/src/kudu/util/curl_util.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_CURL_UTIL_H +#define KUDU_UTIL_CURL_UTIL_H + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +typedef void CURL; + +namespace kudu { + +class faststring; + +// Simple wrapper around curl's "easy" interface, allowing the user to +// fetch web pages into memory using a blocking API. +// +// This is not thread-safe. +class EasyCurl { + public: + EasyCurl(); + ~EasyCurl(); + + // Fetch the given URL into the provided buffer. + // Any existing data in the buffer is replaced. + Status FetchURL(const std::string& url, + faststring* dst); + + // Issue an HTTP POST to the given URL with the given data. + // Returns results in 'dst' as above. + Status PostToURL(const std::string& url, + const std::string& post_data, + faststring* dst); + + private: + // Do a request. If 'post_data' is non-NULL, does a POST. + // Otherwise, does a GET. 
+ Status DoRequest(const std::string& url, + const std::string* post_data, + faststring* dst); + CURL* curl_; + DISALLOW_COPY_AND_ASSIGN(EasyCurl); +}; + +} // namespace kudu + +#endif /* KUDU_UTIL_CURL_UTIL_H */ diff --git a/src/kudu/util/debug-util-test.cc b/src/kudu/util/debug-util-test.cc new file mode 100644 index 000000000000..a4563da79886 --- /dev/null +++ b/src/kudu/util/debug-util-test.cc @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +using std::string; +using std::vector; + +namespace kudu { + +class DebugUtilTest : public KuduTest { +}; + +TEST_F(DebugUtilTest, TestStackTrace) { + StackTrace t; + t.Collect(1); + string trace = t.Symbolize(); + ASSERT_STR_CONTAINS(trace, "kudu::DebugUtilTest_TestStackTrace_Test::TestBody"); +} + +// DumpThreadStack is only supported on Linux, since the implementation relies +// on the tgkill syscall which is not portable. 
+#if defined(__linux__) + +namespace { +void SleeperThread(CountDownLatch* l) { + // We use an infinite loop around WaitFor() instead of a normal Wait() + // so that this test passes in TSAN. Without this, we run into this TSAN + // bug which prevents the sleeping thread from handling signals: + // https://code.google.com/p/thread-sanitizer/issues/detail?id=91 + while (!l->WaitFor(MonoDelta::FromMilliseconds(10))) { + } +} + +void fake_signal_handler(int signum) {} + +bool IsSignalHandlerRegistered(int signum) { + struct sigaction cur_action; + CHECK_EQ(0, sigaction(signum, nullptr, &cur_action)); + return cur_action.sa_handler != SIG_DFL; +} +} // anonymous namespace + +TEST_F(DebugUtilTest, TestStackTraceInvalidTid) { + string s = DumpThreadStack(1); + ASSERT_STR_CONTAINS(s, "unable to deliver signal"); +} + +TEST_F(DebugUtilTest, TestStackTraceSelf) { + string s = DumpThreadStack(Thread::CurrentThreadId()); + ASSERT_STR_CONTAINS(s, "kudu::DebugUtilTest_TestStackTraceSelf_Test::TestBody()"); +} + +TEST_F(DebugUtilTest, TestStackTraceMainThread) { + string s = DumpThreadStack(getpid()); + ASSERT_STR_CONTAINS(s, "kudu::DebugUtilTest_TestStackTraceMainThread_Test::TestBody()"); +} + +TEST_F(DebugUtilTest, TestSignalStackTrace) { + CountDownLatch l(1); + scoped_refptr t; + ASSERT_OK(Thread::Create("test", "test thread", &SleeperThread, &l, &t)); + + // We have to loop a little bit because it takes a little while for the thread + // to start up and actually call our function. + string stack; + for (int i = 0; i < 10000; i++) { + stack = DumpThreadStack(t->tid()); + if (stack.find("SleeperThread") != string::npos) break; + SleepFor(MonoDelta::FromMicroseconds(100)); + } + ASSERT_STR_CONTAINS(stack, "SleeperThread"); + + // Test that we can change the signal and that the stack traces still work, + // on the new signal. + ASSERT_FALSE(IsSignalHandlerRegistered(SIGUSR1)); + ASSERT_OK(SetStackTraceSignal(SIGUSR1)); + + // Should now be registered. 
+ ASSERT_TRUE(IsSignalHandlerRegistered(SIGUSR1)); + + // SIGUSR2 should be relinquished. + ASSERT_FALSE(IsSignalHandlerRegistered(SIGUSR2)); + + // Stack traces should work using the new handler. + ASSERT_STR_CONTAINS(DumpThreadStack(t->tid()), "SleeperThread"); + + // Switch back to SIGUSR2 and ensure it changes back. + ASSERT_OK(SetStackTraceSignal(SIGUSR2)); + ASSERT_TRUE(IsSignalHandlerRegistered(SIGUSR2)); + ASSERT_FALSE(IsSignalHandlerRegistered(SIGUSR1)); + + // Stack traces should work using the new handler. + ASSERT_STR_CONTAINS(DumpThreadStack(t->tid()), "SleeperThread"); + + // Register our own signal handler on SIGUSR1, and ensure that + // we get a bad Status if we try to use it. + signal(SIGUSR1, &fake_signal_handler); + ASSERT_STR_CONTAINS(SetStackTraceSignal(SIGUSR1).ToString(), + "unable to install signal handler"); + signal(SIGUSR1, SIG_IGN); + + // Stack traces should be disabled + ASSERT_STR_CONTAINS(DumpThreadStack(t->tid()), "unable to take thread stack"); + + // Re-enable so that other tests pass. + ASSERT_OK(SetStackTraceSignal(SIGUSR2)); + + // Allow the thread to finish. + l.CountDown(); + t->Join(); +} + +// Test which dumps all known threads within this process. +// We don't validate the results in any way -- but this verifies that we can +// dump library threads such as the libc timer_thread and properly time out. +TEST_F(DebugUtilTest, TestDumpAllThreads) { + vector tids; + ASSERT_OK(ListThreads(&tids)); + for (pid_t tid : tids) { + LOG(INFO) << DumpThreadStack(tid); + } +} +#endif + +} // namespace kudu diff --git a/src/kudu/util/debug-util.cc b/src/kudu/util/debug-util.cc new file mode 100644 index 000000000000..daabc8a6bfb4 --- /dev/null +++ b/src/kudu/util/debug-util.cc @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/debug-util.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/spinlock.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/util/env.h" +#include "kudu/util/errno.h" +#include "kudu/util/monotime.h" +#include "kudu/util/thread.h" + +#if defined(__APPLE__) +typedef sig_t sighandler_t; +#endif + +// Evil hack to grab a few useful functions from glog +namespace google { + +extern int GetStackTrace(void** result, int max_depth, int skip_count); + +// Symbolizes a program counter. On success, returns true and write the +// symbol name to "out". The symbol name is demangled if possible +// (supports symbols generated by GCC 3.x or newer). Otherwise, +// returns false. +bool Symbolize(void *pc, char *out, int out_size); + +namespace glog_internal_namespace_ { +extern void DumpStackTraceToString(std::string *s); +} // namespace glog_internal_namespace_ +} // namespace google + +// The %p field width for printf() functions is two characters per byte. +// For some environments, add two extra bytes for the leading "0x". +static const int kPrintfPointerFieldWidth = 2 + 2 * sizeof(void*); + +// The signal that we'll use to communicate with our other threads. +// This can't be in used by other libraries in the process. 
+static int g_stack_trace_signum = SIGUSR2; + +// We only allow a single dumper thread to run at a time. This simplifies the synchronization +// between the dumper and the target thread. +// +// This lock also protects changes to the signal handler. +static base::SpinLock g_dumper_thread_lock(base::LINKER_INITIALIZED); + +namespace kudu { + +namespace { + +// Global structure used to communicate between the signal handler +// and a dumping thread. +struct SignalCommunication { + // The actual stack trace collected from the target thread. + StackTrace stack; + + // The current target. Signals can be delivered asynchronously, so the + // dumper thread sets this variable first before sending a signal. If + // a signal is received on a thread that doesn't match 'target_tid', it is + // ignored. + pid_t target_tid; + + // Set to 1 when the target thread has successfully collected its stack. + // The dumper thread spins waiting for this to become true. + Atomic32 result_ready; + + // Lock protecting the other members. We use a bare atomic here and a custom + // lock guard below instead of existing spinlock implementaitons because futex() + // is not signal-safe. + Atomic32 lock; + + struct Lock; +}; +SignalCommunication g_comm; + +// Pared-down SpinLock for SignalCommunication::lock. This doesn't rely on futex +// so it is async-signal safe. +struct SignalCommunication::Lock { + Lock() { + while (base::subtle::Acquire_CompareAndSwap(&g_comm.lock, 0, 1) != 0) { + sched_yield(); + } + } + ~Lock() { + base::subtle::Release_Store(&g_comm.lock, 0); + } +}; + +// Signal handler for our stack trace signal. +// We expect that the signal is only sent from DumpThreadStack() -- not by a user. +void HandleStackTraceSignal(int signum) { + SignalCommunication::Lock l; + + // Check that the dumper thread is still interested in our stack trace. 
+ // It's possible for signal delivery to be artificially delayed, in which + // case the dumper thread would have already timed out and moved on with + // its life. In that case, we don't want to race with some other thread's + // dump. + int64_t my_tid = Thread::CurrentThreadId(); + if (g_comm.target_tid != my_tid) { + return; + } + + g_comm.stack.Collect(2); + base::subtle::Release_Store(&g_comm.result_ready, 1); +} + +bool InitSignalHandlerUnlocked(int signum) { + enum InitState { + UNINITIALIZED, + INIT_ERROR, + INITIALIZED + }; + static InitState state = UNINITIALIZED; + + // If we've already registered a handler, but we're being asked to + // change our signal, unregister the old one. + if (signum != g_stack_trace_signum && state == INITIALIZED) { + struct sigaction old_act; + PCHECK(sigaction(g_stack_trace_signum, nullptr, &old_act) == 0); + if (old_act.sa_handler == &HandleStackTraceSignal) { + signal(g_stack_trace_signum, SIG_DFL); + } + } + + // If we'd previously had an error, but the signal number + // is changing, we should mark ourselves uninitialized. + if (signum != g_stack_trace_signum) { + g_stack_trace_signum = signum; + state = UNINITIALIZED; + } + + if (state == UNINITIALIZED) { + struct sigaction old_act; + PCHECK(sigaction(g_stack_trace_signum, nullptr, &old_act) == 0); + if (old_act.sa_handler != SIG_DFL && + old_act.sa_handler != SIG_IGN) { + state = INIT_ERROR; + LOG(WARNING) << "signal handler for stack trace signal " + << g_stack_trace_signum + << " is already in use: " + << "Kudu will not produce thread stack traces."; + } else { + // No one appears to be using the signal. This is racy, but there is no + // atomic swap capability. 
+ sighandler_t old_handler = signal(g_stack_trace_signum, HandleStackTraceSignal); + if (old_handler != SIG_IGN && + old_handler != SIG_DFL) { + LOG(FATAL) << "raced against another thread installing a signal handler"; + } + state = INITIALIZED; + } + } + return state == INITIALIZED; +} + +} // namespace + +Status SetStackTraceSignal(int signum) { + base::SpinLockHolder h(&g_dumper_thread_lock); + if (!InitSignalHandlerUnlocked(signum)) { + return Status::InvalidArgument("unable to install signal handler"); + } + return Status::OK(); +} + +std::string DumpThreadStack(int64_t tid) { +#if defined(__linux__) + base::SpinLockHolder h(&g_dumper_thread_lock); + + // Ensure that our signal handler is installed. We don't need any fancy GoogleOnce here + // because of the mutex above. + if (!InitSignalHandlerUnlocked(g_stack_trace_signum)) { + return ""; + } + + // Set the target TID in our communication structure, so if we end up with any + // delayed signal reaching some other thread, it will know to ignore it. + { + SignalCommunication::Lock l; + CHECK_EQ(0, g_comm.target_tid); + g_comm.target_tid = tid; + } + + // We use the raw syscall here instead of kill() to ensure that we don't accidentally + // send a signal to some other process in the case that the thread has exited and + // the TID been recycled. + if (syscall(SYS_tgkill, getpid(), tid, g_stack_trace_signum) != 0) { + { + SignalCommunication::Lock l; + g_comm.target_tid = 0; + } + return "(unable to deliver signal: process may have exited)"; + } + + // We give the thread ~1s to respond. In testing, threads typically respond within + // a few iterations of the loop, so this timeout is very conservative. + // + // The main reason that a thread would not respond is that it has blocked signals. For + // example, glibc's timer_thread doesn't respond to our signal, so we always time out + // on that one. 
+ string ret; + int i = 0; + while (!base::subtle::Acquire_Load(&g_comm.result_ready) && + i++ < 100) { + SleepFor(MonoDelta::FromMilliseconds(10)); + } + + { + SignalCommunication::Lock l; + CHECK_EQ(tid, g_comm.target_tid); + + if (!g_comm.result_ready) { + ret = "(thread did not respond: maybe it is blocking signals)"; + } else { + ret = g_comm.stack.Symbolize(); + } + + g_comm.target_tid = 0; + g_comm.result_ready = 0; + } + return ret; +#else // defined(__linux__) + return "(unsupported platform)"; +#endif +} + +Status ListThreads(vector *tids) { +#if defined(__linux__) + DIR *dir = opendir("/proc/self/task/"); + if (dir == NULL) { + return Status::IOError("failed to open task dir", ErrnoToString(errno), errno); + } + struct dirent *d; + while ((d = readdir(dir)) != NULL) { + if (d->d_name[0] != '.') { + uint32_t tid; + if (!safe_strtou32(d->d_name, &tid)) { + LOG(WARNING) << "bad tid found in procfs: " << d->d_name; + continue; + } + tids->push_back(tid); + } + } + closedir(dir); +#endif // defined(__linux__) + return Status::OK(); +} + +std::string GetStackTrace() { + std::string s; + google::glog_internal_namespace_::DumpStackTraceToString(&s); + return s; +} + +std::string GetStackTraceHex() { + char buf[1024]; + HexStackTraceToString(buf, 1024); + return std::string(buf); +} + +void HexStackTraceToString(char* buf, size_t size) { + StackTrace trace; + trace.Collect(1); + trace.StringifyToHex(buf, size); +} + +string GetLogFormatStackTraceHex() { + StackTrace trace; + trace.Collect(1); + return trace.ToLogFormatHexString(); +} + +void StackTrace::Collect(int skip_frames) { + num_frames_ = google::GetStackTrace(frames_, arraysize(frames_), skip_frames); +} + +void StackTrace::StringifyToHex(char* buf, size_t size, int flags) const { + char* dst = buf; + + // Reserve kHexEntryLength for the first iteration of the loop, 1 byte for a + // space (which we may not need if there's just one frame), and 1 for a nul + // terminator. 
+ char* limit = dst + size - kHexEntryLength - 2; + for (int i = 0; i < num_frames_ && dst < limit; i++) { + if (i != 0) { + *dst++ = ' '; + } + // See note in Symbolize() below about why we subtract 1 from each address here. + uintptr_t addr = reinterpret_cast(frames_[i]); + if (!(flags & NO_FIX_CALLER_ADDRESSES)) { + addr--; + } + FastHex64ToBuffer(addr, dst); + dst += kHexEntryLength; + } + *dst = '\0'; +} + +string StackTrace::ToHexString(int flags) const { + // Each frame requires kHexEntryLength, plus a space + // We also need one more byte at the end for '\0' + char buf[kMaxFrames * (kHexEntryLength + 1) + 1]; + StringifyToHex(buf, arraysize(buf), flags); + return string(buf); +} + +// Symbolization function borrowed from glog. +string StackTrace::Symbolize() const { + string ret; + for (int i = 0; i < num_frames_; i++) { + void* pc = frames_[i]; + + char tmp[1024]; + const char* symbol = "(unknown)"; + + // The return address 'pc' on the stack is the address of the instruction + // following the 'call' instruction. In the case of calling a function annotated + // 'noreturn', this address may actually be the first instruction of the next + // function, because the function we care about ends with the 'call'. + // So, we subtract 1 from 'pc' so that we're pointing at the 'call' instead + // of the return address. + // + // For example, compiling a C program with -O2 that simply calls 'abort()' yields + // the following disassembly: + // Disassembly of section .text: + // + // 0000000000400440
    : + // 400440: 48 83 ec 08 sub $0x8,%rsp + // 400444: e8 c7 ff ff ff callq 400410 + // + // 0000000000400449 <_start>: + // 400449: 31 ed xor %ebp,%ebp + // ... + // + // If we were to take a stack trace while inside 'abort', the return pointer + // on the stack would be 0x400449 (the first instruction of '_start'). By subtracting + // 1, we end up with 0x400448, which is still within 'main'. + // + // This also ensures that we point at the correct line number when using addr2line + // on logged stacks. + if (google::Symbolize( + reinterpret_cast(pc) - 1, tmp, sizeof(tmp))) { + symbol = tmp; + } + StringAppendF(&ret, " @ %*p %s\n", kPrintfPointerFieldWidth, pc, symbol); + } + return ret; +} + +string StackTrace::ToLogFormatHexString() const { + string ret; + for (int i = 0; i < num_frames_; i++) { + void* pc = frames_[i]; + StringAppendF(&ret, " @ %*p\n", kPrintfPointerFieldWidth, pc); + } + return ret; +} + +uint64_t StackTrace::HashCode() const { + return util_hash::CityHash64(reinterpret_cast(frames_), + sizeof(frames_[0]) * num_frames_); +} + +} // namespace kudu diff --git a/src/kudu/util/debug-util.h b/src/kudu/util/debug-util.h new file mode 100644 index 000000000000..e61f8f0783e9 --- /dev/null +++ b/src/kudu/util/debug-util.h @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_DEBUG_UTIL_H +#define KUDU_UTIL_DEBUG_UTIL_H + +#include + +#include +#include + +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/util/status.h" + +namespace kudu { + +// Return a list of all of the thread IDs currently running in this process. +// Not async-safe. +Status ListThreads(std::vector* tids); + +// Set which POSIX signal number should be used internally for triggering +// stack traces. If the specified signal handler is already in use, this +// returns an error, and stack traces will be disabled. +Status SetStackTraceSignal(int signum); + +// Return the stack trace of the given thread, stringified and symbolized. +// +// Note that the symbolization happens on the calling thread, not the target +// thread, so this is relatively low-impact on the target. +// +// This is safe to use against the current thread, the main thread, or any other +// thread. It requires that the target thread has not blocked POSIX signals. If +// it has, an error message will be returned. +// +// This function is thread-safe but coarsely synchronized: only one "dumper" thread +// may be active at a time. +std::string DumpThreadStack(int64_t tid); + +// Return the current stack trace, stringified. +std::string GetStackTrace(); + +// Return the current stack trace, in hex form. This is significantly +// faster than GetStackTrace() above, so should be used in performance-critical +// places like TRACE() calls. If you really need blazing-fast speed, though, +// use HexStackTraceToString() into a stack-allocated buffer instead -- +// this call causes a heap allocation for the std::string. +// +// Note that this is much more useful in the context of a static binary, +// since addr2line wouldn't know where shared libraries were mapped at +// runtime. 
+// +// NOTE: This inherits the same async-safety issue as HexStackTraceToString() +std::string GetStackTraceHex(); + +// This is the same as GetStackTraceHex(), except multi-line in a format that +// looks very similar to GetStackTrace() but without symbols. Because it's in +// that format, the tool stacktrace_addr2line.pl in the kudu build-support +// directory can symbolize it automatically (to the extent that addr2line(1) +// is able to find the symbols). +std::string GetLogFormatStackTraceHex(); + +// Collect the current stack trace in hex form into the given buffer. +// +// The resulting trace just includes the hex addresses, space-separated. This is suitable +// for later stringification by pasting into 'addr2line' for example. +// +// This function is not async-safe, since it uses the libc backtrace() function which +// may invoke the dynamic loader. +void HexStackTraceToString(char* buf, size_t size); + +// Efficient class for collecting and later stringifying a stack trace. +// +// Requires external synchronization. +class StackTrace { + public: + StackTrace() + : num_frames_(0) { + } + + void Reset() { + num_frames_ = 0; + } + + void CopyFrom(const StackTrace& s) { + memcpy(this, &s, sizeof(s)); + } + + bool Equals(const StackTrace& s) { + return s.num_frames_ == num_frames_ && + strings::memeq(frames_, s.frames_, + num_frames_ * sizeof(frames_[0])); + } + + // Collect and store the current stack trace. Skips the top 'skip_frames' frames + // from the stack. For example, a value of '1' will skip the 'Collect()' function + // call itself. + // + // This function is technically not async-safe. However, according to + // http://lists.nongnu.org/archive/html/libunwind-devel/2011-08/msg00054.html it is "largely + // async safe" and it would only deadlock in the case that you call it while a dynamic library + // load is in progress. 
We assume that dynamic library loads would almost always be completed + // very early in the application lifecycle, so for now, this is considered "async safe" until + // it proves to be a problem. + void Collect(int skip_frames = 1); + + + enum Flags { + // Do not fix up the addresses on the stack to try to point to the 'call' + // instructions instead of the return address. This is necessary when dumping + // addresses to be interpreted by 'pprof', which does this fix-up itself. + NO_FIX_CALLER_ADDRESSES = 1 + }; + + // Stringify the trace into the given buffer. + // The resulting output is hex addresses suitable for passing into 'addr2line' + // later. + void StringifyToHex(char* buf, size_t size, int flags = 0) const; + + // Same as above, but returning a std::string. + // This is not async-safe. + std::string ToHexString(int flags = 0) const; + + // Return a string with a symbolized backtrace in a format suitable for + // printing to a log file. + // This is not async-safe. + std::string Symbolize() const; + + // Return a string with a hex-only backtrace in the format typically used in + // log files. Similar to the format given by Symbolize(), but symbols are not + // resolved (only the hex addresses are given). + std::string ToLogFormatHexString() const; + + uint64_t HashCode() const; + + private: + enum { + // The maximum number of stack frames to collect. + kMaxFrames = 16, + + // The max number of characters any frame requires in string form. + kHexEntryLength = 16 + }; + + int num_frames_; + void* frames_[kMaxFrames]; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/debug/leak_annotations.h b/src/kudu/util/debug/leak_annotations.h new file mode 100644 index 000000000000..c13b7e8b140d --- /dev/null +++ b/src/kudu/util/debug/leak_annotations.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_DEBUG_LEAK_ANNOTATIONS_H_ +#define KUDU_UTIL_DEBUG_LEAK_ANNOTATIONS_H_ + +// API definitions from LLVM lsan_interface.h + +extern "C" { + // Allocations made between calls to __lsan_disable() and __lsan_enable() will + // be treated as non-leaks. Disable/enable pairs may be nested. + void __lsan_disable(); + void __lsan_enable(); + // The heap object into which p points will be treated as a non-leak. + void __lsan_ignore_object(const void *p); + // The user may optionally provide this function to disallow leak checking + // for the program it is linked into (if the return value is non-zero). This + // function must be defined as returning a constant value; any behavior beyond + // that is unsupported. + int __lsan_is_turned_off(); + // Calling this function makes LSan enter the leak checking phase immediately. + // Use this if normal end-of-process leak checking happens too late (e.g. if + // you have intentional memory leaks in your shutdown code). Calling this + // function overrides end-of-process leak checking; it must be called at + // most once per process. This function will terminate the process if there + // are memory leaks and the exit_code flag is non-zero. 
+ void __lsan_do_leak_check(); +} // extern "C" + +namespace kudu { +namespace debug { +class ScopedLSANDisabler { + public: + ScopedLSANDisabler() { __lsan_disable(); } + ~ScopedLSANDisabler() { __lsan_enable(); } +}; +} // namespace debug +} // namespace kudu + +#endif // KUDU_UTIL_DEBUG_LEAK_ANNOTATIONS_H_ diff --git a/src/kudu/util/debug/leakcheck_disabler.h b/src/kudu/util/debug/leakcheck_disabler.h new file mode 100644 index 000000000000..493abe4148a5 --- /dev/null +++ b/src/kudu/util/debug/leakcheck_disabler.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_DEBUG_LEAKCHECK_DISABLER_H_ +#define KUDU_UTIL_DEBUG_LEAKCHECK_DISABLER_H_ + +#include +#include "kudu/gutil/macros.h" +#include "kudu/util/debug/leak_annotations.h" + +namespace kudu { +namespace debug { + +// Scoped object that generically disables LSAN leak checking in a given scope. +// While this object is alive, calls to "new" will not be checked for leaks. 
+class ScopedLeakCheckDisabler { + public: + ScopedLeakCheckDisabler() {} + + private: + +#if defined(__has_feature) +# if __has_feature(address_sanitizer) + ScopedLSANDisabler lsan_disabler; +# endif +#endif + + DISALLOW_COPY_AND_ASSIGN(ScopedLeakCheckDisabler); +}; + +} // namespace debug +} // namespace kudu + +#endif // KUDU_UTIL_DEBUG_LEAKCHECK_DISABLER_H_ diff --git a/src/kudu/util/debug/sanitizer_scopes.h b/src/kudu/util/debug/sanitizer_scopes.h new file mode 100644 index 000000000000..2f8a5572d440 --- /dev/null +++ b/src/kudu/util/debug/sanitizer_scopes.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Wrappers around the annotations from gutil/dynamic_annotations.h, +// provided as C++-style scope guards. +#ifndef KUDU_UTIL_DEBUG_SANITIZER_SCOPES_H_ +#define KUDU_UTIL_DEBUG_SANITIZER_SCOPES_H_ + +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/macros.h" + +namespace kudu { +namespace debug { + +// Scope guard which instructs TSAN to ignore all reads and writes +// on the current thread as long as it is alive. These may be safely +// nested. 
+class ScopedTSANIgnoreReadsAndWrites { + public: + ScopedTSANIgnoreReadsAndWrites() { + ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN(); + } + ~ScopedTSANIgnoreReadsAndWrites() { + ANNOTATE_IGNORE_READS_AND_WRITES_END(); + } + private: + DISALLOW_COPY_AND_ASSIGN(ScopedTSANIgnoreReadsAndWrites); +}; + +} // namespace debug +} // namespace kudu + +#endif // KUDU_UTIL_DEBUG_SANITIZER_SCOPES_H_ diff --git a/src/kudu/util/debug/trace_event.h b/src/kudu/util/debug/trace_event.h new file mode 100644 index 000000000000..75a83e66cefb --- /dev/null +++ b/src/kudu/util/debug/trace_event.h @@ -0,0 +1,1500 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This header file defines the set of trace_event macros without specifying +// how the events actually get collected and stored. If you need to expose trace +// events to some other universe, you can copy-and-paste this file as well as +// trace_event.h, modifying the macros contained there as necessary for the +// target platform. The end result is that multiple libraries can funnel events +// through to a shared trace event collector. + +// Trace events are for tracking application performance and resource usage. +// Macros are provided to track: +// Begin and end of function calls +// Counters +// +// Events are issued against categories. Whereas LOG's +// categories are statically defined, TRACE categories are created +// implicitly with a string. For example: +// TRACE_EVENT_INSTANT0("MY_SUBSYSTEM", "SomeImportantEvent", +// TRACE_EVENT_SCOPE_THREAD) +// +// It is often the case that one trace may belong in multiple categories at the +// same time. 
The first argument to the trace can be a comma-separated list of +// categories, forming a category group, like: +// +// TRACE_EVENT_INSTANT0("input,views", "OnMouseOver", TRACE_EVENT_SCOPE_THREAD) +// +// We can enable/disable tracing of OnMouseOver by enabling/disabling either +// category. +// +// Events can be INSTANT, or can be pairs of BEGIN and END in the same scope: +// TRACE_EVENT_BEGIN0("MY_SUBSYSTEM", "SomethingCostly") +// doSomethingCostly() +// TRACE_EVENT_END0("MY_SUBSYSTEM", "SomethingCostly") +// Note: our tools can't always determine the correct BEGIN/END pairs unless +// these are used in the same scope. Use ASYNC_BEGIN/ASYNC_END macros if you +// need them to be in separate scopes. +// +// A common use case is to trace entire function scopes. This +// issues a trace BEGIN and END automatically: +// void doSomethingCostly() { +// TRACE_EVENT0("MY_SUBSYSTEM", "doSomethingCostly"); +// ... +// } +// +// Additional parameters can be associated with an event: +// void doSomethingCostly2(int howMuch) { +// TRACE_EVENT1("MY_SUBSYSTEM", "doSomethingCostly", +// "howMuch", howMuch); +// ... +// } +// +// The trace system will automatically add to this information the +// current process id, thread id, and a timestamp in microseconds. +// +// To trace an asynchronous procedure such as an IPC send/receive, use +// ASYNC_BEGIN and ASYNC_END: +// [single threaded sender code] +// static int send_count = 0; +// ++send_count; +// TRACE_EVENT_ASYNC_BEGIN0("ipc", "message", send_count); +// Send(new MyMessage(send_count)); +// [receive code] +// void OnMyMessage(send_count) { +// TRACE_EVENT_ASYNC_END0("ipc", "message", send_count); +// } +// The third parameter is a unique ID to match ASYNC_BEGIN/ASYNC_END pairs. +// ASYNC_BEGIN and ASYNC_END can occur on any thread of any traced process. +// Pointers can be used for the ID parameter, and they will be mangled +// internally so that the same pointer on two different processes will not +// match. 
For example: +// class MyTracedClass { +// public: +// MyTracedClass() { +// TRACE_EVENT_ASYNC_BEGIN0("category", "MyTracedClass", this); +// } +// ~MyTracedClass() { +// TRACE_EVENT_ASYNC_END0("category", "MyTracedClass", this); +// } +// } +// +// Trace event also supports counters, which is a way to track a quantity +// as it varies over time. Counters are created with the following macro: +// TRACE_COUNTER1("MY_SUBSYSTEM", "myCounter", g_myCounterValue); +// +// Counters are process-specific. The macro itself can be issued from any +// thread, however. +// +// Sometimes, you want to track two counters at once. You can do this with two +// counter macros: +// TRACE_COUNTER1("MY_SUBSYSTEM", "myCounter0", g_myCounterValue[0]); +// TRACE_COUNTER1("MY_SUBSYSTEM", "myCounter1", g_myCounterValue[1]); +// Or you can do it with a combined macro: +// TRACE_COUNTER2("MY_SUBSYSTEM", "myCounter", +// "bytesPinned", g_myCounterValue[0], +// "bytesAllocated", g_myCounterValue[1]); +// This indicates to the tracing UI that these counters should be displayed +// in a single graph, as a summed area chart. +// +// Since counters are in a global namespace, you may want to disambiguate with a +// unique ID, by using the TRACE_COUNTER_ID* variations. +// +// By default, trace collection is compiled in, but turned off at runtime. +// Collecting trace data is the responsibility of the embedding +// application. In Chrome's case, navigating to about:tracing will turn on +// tracing and display data collected across all active processes. +// +// +// Memory scoping note: +// Tracing copies the pointers, not the string content, of the strings passed +// in for category_group, name, and arg_names. Thus, the following code will +// cause problems: +// char* str = strdup("importantName"); +// TRACE_EVENT_INSTANT0("SUBSYSTEM", str); // BAD! 
+// free(str); // Trace system now has dangling pointer +// +// To avoid this issue with the |name| and |arg_name| parameters, use the +// TRACE_EVENT_COPY_XXX overloads of the macros at additional runtime overhead. +// Notes: The category must always be in a long-lived char* (i.e. static const). +// The |arg_values|, when used, are always deep copied with the _COPY +// macros. +// +// When are string argument values copied: +// const char* arg_values are only referenced by default: +// TRACE_EVENT1("category", "name", +// "arg1", "literal string is only referenced"); +// Use TRACE_STR_COPY to force copying of a const char*: +// TRACE_EVENT1("category", "name", +// "arg1", TRACE_STR_COPY("string will be copied")); +// std::string arg_values are always copied: +// TRACE_EVENT1("category", "name", +// "arg1", std::string("string will be copied")); +// +// +// Convertable notes: +// Converting a large data type to a string can be costly. To help with this, +// the trace framework provides an interface ConvertableToTraceFormat. If you +// inherit from it and implement the AppendAsTraceFormat method the trace +// framework will call back to your object to convert a trace output time. This +// means, if the category for the event is disabled, the conversion will not +// happen. +// +// class MyData : public kudu::debug::ConvertableToTraceFormat { +// public: +// MyData() {} +// virtual void AppendAsTraceFormat(std::string* out) const OVERRIDE { +// out->append("{\"foo\":1}"); +// } +// private: +// virtual ~MyData() {} +// DISALLOW_COPY_AND_ASSIGN(MyData); +// }; +// +// TRACE_EVENT1("foo", "bar", "data", +// scoped_refptr(new MyData())); +// +// The trace framework will take ownership if the passed pointer and it will +// be free'd when the trace buffer is flushed. +// +// Note, we only do the conversion when the buffer is flushed, so the provided +// data object should not be modified after it's passed to the trace framework. 
+// +// +// Thread Safety: +// A thread safe singleton and mutex are used for thread safety. Category +// enabled flags are used to limit the performance impact when the system +// is not enabled. +// +// TRACE_EVENT macros first cache a pointer to a category. The categories are +// statically allocated and safe at all times, even after exit. Fetching a +// category is protected by the TraceLog::lock_. Multiple threads initializing +// the static variable is safe, as they will be serialized by the lock and +// multiple calls will return the same pointer to the category. +// +// Then the category_group_enabled flag is checked. This is a unsigned char, and +// not intended to be multithread safe. It optimizes access to AddTraceEvent +// which is threadsafe internally via TraceLog::lock_. The enabled flag may +// cause some threads to incorrectly call or skip calling AddTraceEvent near +// the time of the system being enabled or disabled. This is acceptable as +// we tolerate some data loss while the system is being enabled/disabled and +// because AddTraceEvent is threadsafe internally and checks the enabled state +// again under lock. +// +// Without the use of these static category pointers and enabled flags all +// trace points would carry a significant performance cost of acquiring a lock +// and resolving the category. + +#ifndef KUDU_UTIL_DEBUG_TRACE_EVENT_H_ +#define KUDU_UTIL_DEBUG_TRACE_EVENT_H_ + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/util/debug/trace_event_impl.h" +#include "kudu/util/debug/trace_event_memory.h" +#include "kudu/gutil/walltime.h" + +// By default, const char* argument values are assumed to have long-lived scope +// and will not be copied. Use this macro to force a const char* to be copied. +#define TRACE_STR_COPY(str) \ + trace_event_internal::TraceStringWithCopy(str) + +// This will mark the trace event as disabled by default. The user will need +// to explicitly enable the event. 
+#define TRACE_DISABLED_BY_DEFAULT(name) "disabled-by-default-" name + +// By default, uint64 ID argument values are not mangled with the Process ID in +// TRACE_EVENT_ASYNC macros. Use this macro to force Process ID mangling. +#define TRACE_ID_MANGLE(id) \ + trace_event_internal::TraceID::ForceMangle(id) + +// By default, pointers are mangled with the Process ID in TRACE_EVENT_ASYNC +// macros. Use this macro to prevent Process ID mangling. +#define TRACE_ID_DONT_MANGLE(id) \ + trace_event_internal::TraceID::DontMangle(id) + +// Records a pair of begin and end events called "name" for the current +// scope, with 0, 1 or 2 associated arguments. If the category is not +// enabled, then this does nothing. +// - category and name strings must have application lifetime (statics or +// literals). They may not include " chars. +#define TRACE_EVENT0(category_group, name) \ + INTERNAL_TRACE_MEMORY(category_group, name) \ + INTERNAL_TRACE_EVENT_ADD_SCOPED(category_group, name) +#define TRACE_EVENT1(category_group, name, arg1_name, arg1_val) \ + INTERNAL_TRACE_MEMORY(category_group, name) \ + INTERNAL_TRACE_EVENT_ADD_SCOPED(category_group, name, arg1_name, arg1_val) +#define TRACE_EVENT2( \ + category_group, name, arg1_name, arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_MEMORY(category_group, name) \ + INTERNAL_TRACE_EVENT_ADD_SCOPED( \ + category_group, name, arg1_name, arg1_val, arg2_name, arg2_val) + +// Records events like TRACE_EVENT2 but uses |memory_tag| for memory tracing. +// Use this where |name| is too generic to accurately aggregate allocations. +#define TRACE_EVENT_WITH_MEMORY_TAG2( \ + category, name, memory_tag, arg1_name, arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_MEMORY(category, memory_tag) \ + INTERNAL_TRACE_EVENT_ADD_SCOPED( \ + category, name, arg1_name, arg1_val, arg2_name, arg2_val) + +// UNSHIPPED_TRACE_EVENT* are like TRACE_EVENT* except that they are not +// included in official builds. 
+ +#if OFFICIAL_BUILD +#undef TRACING_IS_OFFICIAL_BUILD +#define TRACING_IS_OFFICIAL_BUILD 1 +#elif !defined(TRACING_IS_OFFICIAL_BUILD) +#define TRACING_IS_OFFICIAL_BUILD 0 +#endif + +#if TRACING_IS_OFFICIAL_BUILD +#define UNSHIPPED_TRACE_EVENT0(category_group, name) (void)0 +#define UNSHIPPED_TRACE_EVENT1(category_group, name, arg1_name, arg1_val) \ + (void)0 +#define UNSHIPPED_TRACE_EVENT2(category_group, name, arg1_name, arg1_val, \ + arg2_name, arg2_val) (void)0 +#define UNSHIPPED_TRACE_EVENT_INSTANT0(category_group, name, scope) (void)0 +#define UNSHIPPED_TRACE_EVENT_INSTANT1(category_group, name, scope, \ + arg1_name, arg1_val) (void)0 +#define UNSHIPPED_TRACE_EVENT_INSTANT2(category_group, name, scope, \ + arg1_name, arg1_val, \ + arg2_name, arg2_val) (void)0 +#else +#define UNSHIPPED_TRACE_EVENT0(category_group, name) \ + TRACE_EVENT0(category_group, name) +#define UNSHIPPED_TRACE_EVENT1(category_group, name, arg1_name, arg1_val) \ + TRACE_EVENT1(category_group, name, arg1_name, arg1_val) +#define UNSHIPPED_TRACE_EVENT2(category_group, name, arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + TRACE_EVENT2(category_group, name, arg1_name, arg1_val, arg2_name, arg2_val) +#define UNSHIPPED_TRACE_EVENT_INSTANT0(category_group, name, scope) \ + TRACE_EVENT_INSTANT0(category_group, name, scope) +#define UNSHIPPED_TRACE_EVENT_INSTANT1(category_group, name, scope, \ + arg1_name, arg1_val) \ + TRACE_EVENT_INSTANT1(category_group, name, scope, arg1_name, arg1_val) +#define UNSHIPPED_TRACE_EVENT_INSTANT2(category_group, name, scope, \ + arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + TRACE_EVENT_INSTANT2(category_group, name, scope, arg1_name, arg1_val, \ + arg2_name, arg2_val) +#endif + +// Records a single event called "name" immediately, with 0, 1 or 2 +// associated arguments. If the category is not enabled, then this +// does nothing. +// - category and name strings must have application lifetime (statics or +// literals). They may not include " chars. 
#define TRACE_EVENT_INSTANT0(category_group, name, scope) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_INSTANT, \
        category_group, name, TRACE_EVENT_FLAG_NONE | scope)
#define TRACE_EVENT_INSTANT1(category_group, name, scope, arg1_name, arg1_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_INSTANT, \
        category_group, name, TRACE_EVENT_FLAG_NONE | scope, \
        arg1_name, arg1_val)
#define TRACE_EVENT_INSTANT2(category_group, name, scope, arg1_name, arg1_val, \
                             arg2_name, arg2_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_INSTANT, \
        category_group, name, TRACE_EVENT_FLAG_NONE | scope, \
        arg1_name, arg1_val, arg2_name, arg2_val)
#define TRACE_EVENT_COPY_INSTANT0(category_group, name, scope) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_INSTANT, \
        category_group, name, TRACE_EVENT_FLAG_COPY | scope)
#define TRACE_EVENT_COPY_INSTANT1(category_group, name, scope, \
                                  arg1_name, arg1_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_INSTANT, \
        category_group, name, TRACE_EVENT_FLAG_COPY | scope, arg1_name, \
        arg1_val)
#define TRACE_EVENT_COPY_INSTANT2(category_group, name, scope, \
                                  arg1_name, arg1_val, \
                                  arg2_name, arg2_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_INSTANT, \
        category_group, name, TRACE_EVENT_FLAG_COPY | scope, \
        arg1_name, arg1_val, arg2_name, arg2_val)

// Sets the current sample state to the given category and name (both must be
// constant strings). These states are intended for a sampling profiler.
// Implementation note: we store category and name together because we don't
// want the inconsistency/expense of storing two pointers.
// |thread_bucket| is [0..2] and is used to statically isolate samples in one
// thread from others.
// NOTE(review): the <bucket_number> template arguments below were lost in the
// import (the macros otherwise ignore their bucket_number parameter entirely,
// which only makes sense with the template argument present); restored to
// match upstream Chromium.
#define TRACE_EVENT_SET_SAMPLING_STATE_FOR_BUCKET( \
    bucket_number, category, name) \
        trace_event_internal:: \
        TraceEventSamplingStateScope<bucket_number>::Set(category "\0" name)

// Returns a current sampling state of the given bucket.
#define TRACE_EVENT_GET_SAMPLING_STATE_FOR_BUCKET(bucket_number) \
    trace_event_internal::TraceEventSamplingStateScope<bucket_number>::Current()

// Creates a scope of a sampling state of the given bucket.
//
// {  // The sampling state is set within this scope.
//    TRACE_EVENT_SAMPLING_STATE_SCOPE_FOR_BUCKET(0, "category", "name");
//    ...;
// }
#define TRACE_EVENT_SCOPED_SAMPLING_STATE_FOR_BUCKET( \
    bucket_number, category, name) \
    trace_event_internal::TraceEventSamplingStateScope<bucket_number> \
        traceEventSamplingScope(category "\0" name);

// Syntactic sugars for the sampling tracing in the main thread.
#define TRACE_EVENT_SCOPED_SAMPLING_STATE(category, name) \
    TRACE_EVENT_SCOPED_SAMPLING_STATE_FOR_BUCKET(0, category, name)
#define TRACE_EVENT_GET_SAMPLING_STATE() \
    TRACE_EVENT_GET_SAMPLING_STATE_FOR_BUCKET(0)
#define TRACE_EVENT_SET_SAMPLING_STATE(category, name) \
    TRACE_EVENT_SET_SAMPLING_STATE_FOR_BUCKET(0, category, name)


// Records a single BEGIN event called "name" immediately, with 0, 1 or 2
// associated arguments. If the category is not enabled, then this
// does nothing.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
+#define TRACE_EVENT_BEGIN0(category_group, name) \ + INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_BEGIN, \ + category_group, name, TRACE_EVENT_FLAG_NONE) +#define TRACE_EVENT_BEGIN1(category_group, name, arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_BEGIN, \ + category_group, name, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val) +#define TRACE_EVENT_BEGIN2(category_group, name, arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_BEGIN, \ + category_group, name, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val, \ + arg2_name, arg2_val) +#define TRACE_EVENT_COPY_BEGIN0(category_group, name) \ + INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_BEGIN, \ + category_group, name, TRACE_EVENT_FLAG_COPY) +#define TRACE_EVENT_COPY_BEGIN1(category_group, name, arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_BEGIN, \ + category_group, name, TRACE_EVENT_FLAG_COPY, arg1_name, arg1_val) +#define TRACE_EVENT_COPY_BEGIN2(category_group, name, arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_BEGIN, \ + category_group, name, TRACE_EVENT_FLAG_COPY, arg1_name, arg1_val, \ + arg2_name, arg2_val) + +// Similar to TRACE_EVENT_BEGINx but with a custom |at| timestamp provided. +// - |id| is used to match the _BEGIN event with the _END event. +// Events are considered to match if their category_group, name and id values +// all match. |id| must either be a pointer or an integer value up to 64 bits. +// If it's a pointer, the bits will be xored with a hash of the process ID so +// that the same pointer on two different processes will not collide. 
#define TRACE_EVENT_BEGIN_WITH_ID_TID_AND_TIMESTAMP0(category_group, \
        name, id, thread_id, timestamp) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID_TID_AND_TIMESTAMP( \
        TRACE_EVENT_PHASE_ASYNC_BEGIN, category_group, name, id, thread_id, \
        timestamp, TRACE_EVENT_FLAG_NONE)
#define TRACE_EVENT_COPY_BEGIN_WITH_ID_TID_AND_TIMESTAMP0( \
        category_group, name, id, thread_id, timestamp) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID_TID_AND_TIMESTAMP( \
        TRACE_EVENT_PHASE_ASYNC_BEGIN, category_group, name, id, thread_id, \
        timestamp, TRACE_EVENT_FLAG_COPY)

// Records a single END event for "name" immediately. If the category
// is not enabled, then this does nothing.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
#define TRACE_EVENT_END0(category_group, name) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_END, \
        category_group, name, TRACE_EVENT_FLAG_NONE)
#define TRACE_EVENT_END1(category_group, name, arg1_name, arg1_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_END, \
        category_group, name, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val)
#define TRACE_EVENT_END2(category_group, name, arg1_name, arg1_val, \
        arg2_name, arg2_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_END, \
        category_group, name, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val, \
        arg2_name, arg2_val)
#define TRACE_EVENT_COPY_END0(category_group, name) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_END, \
        category_group, name, TRACE_EVENT_FLAG_COPY)
#define TRACE_EVENT_COPY_END1(category_group, name, arg1_name, arg1_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_END, \
        category_group, name, TRACE_EVENT_FLAG_COPY, arg1_name, arg1_val)
#define TRACE_EVENT_COPY_END2(category_group, name, arg1_name, arg1_val, \
        arg2_name, arg2_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_END, \
        category_group, name, TRACE_EVENT_FLAG_COPY, arg1_name, arg1_val, \
        arg2_name, arg2_val)

// Similar to TRACE_EVENT_ENDx but with a custom |at| timestamp provided.
// - |id| is used to match the _BEGIN event with the _END event.
//   Events are considered to match if their category_group, name and id values
//   all match. |id| must either be a pointer or an integer value up to 64 bits.
//   If it's a pointer, the bits will be xored with a hash of the process ID so
//   that the same pointer on two different processes will not collide.
#define TRACE_EVENT_END_WITH_ID_TID_AND_TIMESTAMP0(category_group, \
        name, id, thread_id, timestamp) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID_TID_AND_TIMESTAMP( \
        TRACE_EVENT_PHASE_ASYNC_END, category_group, name, id, thread_id, \
        timestamp, TRACE_EVENT_FLAG_NONE)
#define TRACE_EVENT_COPY_END_WITH_ID_TID_AND_TIMESTAMP0( \
        category_group, name, id, thread_id, timestamp) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID_TID_AND_TIMESTAMP( \
        TRACE_EVENT_PHASE_ASYNC_END, category_group, name, id, thread_id, \
        timestamp, TRACE_EVENT_FLAG_COPY)

// Records the value of a counter called "name" immediately. Value
// must be representable as a 32 bit integer.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
// NOTE(review): the <int> template arguments on the static_casts below were
// lost in the import ("static_cast(value)" is not valid C++); restored per
// the 32-bit-integer contract stated above.
#define TRACE_COUNTER1(category_group, name, value) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, TRACE_EVENT_FLAG_NONE, \
        "value", static_cast<int>(value))
#define TRACE_COPY_COUNTER1(category_group, name, value) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, TRACE_EVENT_FLAG_COPY, \
        "value", static_cast<int>(value))

// Records the values of a multi-parted counter called "name" immediately.
// The UI will treat value1 and value2 as parts of a whole, displaying their
// values as a stacked-bar chart.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
#define TRACE_COUNTER2(category_group, name, value1_name, value1_val, \
        value2_name, value2_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, TRACE_EVENT_FLAG_NONE, \
        value1_name, static_cast<int>(value1_val), \
        value2_name, static_cast<int>(value2_val))
#define TRACE_COPY_COUNTER2(category_group, name, value1_name, value1_val, \
        value2_name, value2_val) \
    INTERNAL_TRACE_EVENT_ADD(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, TRACE_EVENT_FLAG_COPY, \
        value1_name, static_cast<int>(value1_val), \
        value2_name, static_cast<int>(value2_val))

// Records the value of a counter called "name" immediately. Value
// must be representable as a 32 bit integer.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
// - |id| is used to disambiguate counters with the same name. It must either
//   be a pointer or an integer value up to 64 bits. If it's a pointer, the
//   bits will be xored with a hash of the process ID so that the same pointer
//   on two different processes will not collide.
#define TRACE_COUNTER_ID1(category_group, name, id, value) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, id, TRACE_EVENT_FLAG_NONE, \
        "value", static_cast<int>(value))
#define TRACE_COPY_COUNTER_ID1(category_group, name, id, value) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, id, TRACE_EVENT_FLAG_COPY, \
        "value", static_cast<int>(value))

// Records the values of a multi-parted counter called "name" immediately.
// The UI will treat value1 and value2 as parts of a whole, displaying their
// values as a stacked-bar chart.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
// - |id| is used to disambiguate counters with the same name. It must either
//   be a pointer or an integer value up to 64 bits. If it's a pointer, the
//   bits will be xored with a hash of the process ID so that the same pointer
//   on two different processes will not collide.
#define TRACE_COUNTER_ID2(category_group, name, id, value1_name, value1_val, \
        value2_name, value2_val) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, id, TRACE_EVENT_FLAG_NONE, \
        value1_name, static_cast<int>(value1_val), \
        value2_name, static_cast<int>(value2_val))
#define TRACE_COPY_COUNTER_ID2(category_group, name, id, value1_name, \
        value1_val, value2_name, value2_val) \
    INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_COUNTER, \
        category_group, name, id, TRACE_EVENT_FLAG_COPY, \
        value1_name, static_cast<int>(value1_val), \
        value2_name, static_cast<int>(value2_val))


// Records a single ASYNC_BEGIN event called "name" immediately, with 0, 1 or 2
// associated arguments. If the category is not enabled, then this
// does nothing.
// - category and name strings must have application lifetime (statics or
//   literals). They may not include " chars.
// - |id| is used to match the ASYNC_BEGIN event with the ASYNC_END event. ASYNC
//   events are considered to match if their category_group, name and id values
//   all match. |id| must either be a pointer or an integer value up to 64 bits.
//   If it's a pointer, the bits will be xored with a hash of the process ID so
//   that the same pointer on two different processes will not collide.
//
// An asynchronous operation can consist of multiple phases. The first phase is
// defined by the ASYNC_BEGIN calls. Additional phases can be defined using the
// ASYNC_STEP_INTO or ASYNC_STEP_PAST macros. The ASYNC_STEP_INTO macro will
// annotate the block following the call. The ASYNC_STEP_PAST macro will
// annotate the block prior to the call. Note that any particular event must use
// only STEP_INTO or STEP_PAST macros; they can not mix and match. When the
// operation completes, call ASYNC_END.
+// +// An ASYNC trace typically occurs on a single thread (if not, they will only be +// drawn on the thread defined in the ASYNC_BEGIN event), but all events in that +// operation must use the same |name| and |id|. Each step can have its own +// args. +#define TRACE_EVENT_ASYNC_BEGIN0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE) +#define TRACE_EVENT_ASYNC_BEGIN1(category_group, name, id, arg1_name, \ + arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val) +#define TRACE_EVENT_ASYNC_BEGIN2(category_group, name, id, arg1_name, \ + arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, \ + arg1_name, arg1_val, arg2_name, arg2_val) +#define TRACE_EVENT_COPY_ASYNC_BEGIN0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY) +#define TRACE_EVENT_COPY_ASYNC_BEGIN1(category_group, name, id, arg1_name, \ + arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val) +#define TRACE_EVENT_COPY_ASYNC_BEGIN2(category_group, name, id, arg1_name, \ + arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val, arg2_name, arg2_val) + +// Records a single ASYNC_STEP_INTO event for |step| immediately. If the +// category is not enabled, then this does nothing. The |name| and |id| must +// match the ASYNC_BEGIN event above. The |step| param identifies this step +// within the async event. This should be called at the beginning of the next +// phase of an asynchronous operation. 
The ASYNC_BEGIN event must not have any +// ASYNC_STEP_PAST events. +#define TRACE_EVENT_ASYNC_STEP_INTO0(category_group, name, id, step) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_STEP_INTO, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, "step", step) +#define TRACE_EVENT_ASYNC_STEP_INTO1(category_group, name, id, step, \ + arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_STEP_INTO, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, "step", step, \ + arg1_name, arg1_val) + +// Records a single ASYNC_STEP_PAST event for |step| immediately. If the +// category is not enabled, then this does nothing. The |name| and |id| must +// match the ASYNC_BEGIN event above. The |step| param identifies this step +// within the async event. This should be called at the beginning of the next +// phase of an asynchronous operation. The ASYNC_BEGIN event must not have any +// ASYNC_STEP_INTO events. +#define TRACE_EVENT_ASYNC_STEP_PAST0(category_group, name, id, step) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_STEP_PAST, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, "step", step) +#define TRACE_EVENT_ASYNC_STEP_PAST1(category_group, name, id, step, \ + arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_STEP_PAST, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, "step", step, \ + arg1_name, arg1_val) + +// Records a single ASYNC_END event for "name" immediately. If the category +// is not enabled, then this does nothing. 
+#define TRACE_EVENT_ASYNC_END0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_END, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE) +#define TRACE_EVENT_ASYNC_END1(category_group, name, id, arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_END, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val) +#define TRACE_EVENT_ASYNC_END2(category_group, name, id, arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_END, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, \ + arg1_name, arg1_val, arg2_name, arg2_val) +#define TRACE_EVENT_COPY_ASYNC_END0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_END, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY) +#define TRACE_EVENT_COPY_ASYNC_END1(category_group, name, id, arg1_name, \ + arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_END, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val) +#define TRACE_EVENT_COPY_ASYNC_END2(category_group, name, id, arg1_name, \ + arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_ASYNC_END, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val, arg2_name, arg2_val) + + +// Records a single FLOW_BEGIN event called "name" immediately, with 0, 1 or 2 +// associated arguments. If the category is not enabled, then this +// does nothing. +// - category and name strings must have application lifetime (statics or +// literals). They may not include " chars. +// - |id| is used to match the FLOW_BEGIN event with the FLOW_END event. FLOW +// events are considered to match if their category_group, name and id values +// all match. |id| must either be a pointer or an integer value up to 64 bits. 
+// If it's a pointer, the bits will be xored with a hash of the process ID so +// that the same pointer on two different processes will not collide. +// FLOW events are different from ASYNC events in how they are drawn by the +// tracing UI. A FLOW defines asynchronous data flow, such as posting a task +// (FLOW_BEGIN) and later executing that task (FLOW_END). Expect FLOWs to be +// drawn as lines or arrows from FLOW_BEGIN scopes to FLOW_END scopes. Similar +// to ASYNC, a FLOW can consist of multiple phases. The first phase is defined +// by the FLOW_BEGIN calls. Additional phases can be defined using the FLOW_STEP +// macros. When the operation completes, call FLOW_END. An async operation can +// span threads and processes, but all events in that operation must use the +// same |name| and |id|. Each event can have its own args. +#define TRACE_EVENT_FLOW_BEGIN0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE) +#define TRACE_EVENT_FLOW_BEGIN1(category_group, name, id, arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val) +#define TRACE_EVENT_FLOW_BEGIN2(category_group, name, id, arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, \ + arg1_name, arg1_val, arg2_name, arg2_val) +#define TRACE_EVENT_COPY_FLOW_BEGIN0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY) +#define TRACE_EVENT_COPY_FLOW_BEGIN1(category_group, name, id, arg1_name, \ + arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val) +#define TRACE_EVENT_COPY_FLOW_BEGIN2(category_group, name, id, 
arg1_name, \ + arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_BEGIN, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val, arg2_name, arg2_val) + +// Records a single FLOW_STEP event for |step| immediately. If the category +// is not enabled, then this does nothing. The |name| and |id| must match the +// FLOW_BEGIN event above. The |step| param identifies this step within the +// async event. This should be called at the beginning of the next phase of an +// asynchronous operation. +#define TRACE_EVENT_FLOW_STEP0(category_group, name, id, step) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_STEP, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, "step", step) +#define TRACE_EVENT_FLOW_STEP1(category_group, name, id, step, \ + arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_STEP, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, "step", step, \ + arg1_name, arg1_val) +#define TRACE_EVENT_COPY_FLOW_STEP0(category_group, name, id, step) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_STEP, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, "step", step) +#define TRACE_EVENT_COPY_FLOW_STEP1(category_group, name, id, step, \ + arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_STEP, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, "step", step, \ + arg1_name, arg1_val) + +// Records a single FLOW_END event for "name" immediately. If the category +// is not enabled, then this does nothing. 
+#define TRACE_EVENT_FLOW_END0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_END, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE) +#define TRACE_EVENT_FLOW_END1(category_group, name, id, arg1_name, arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_END, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, arg1_name, arg1_val) +#define TRACE_EVENT_FLOW_END2(category_group, name, id, arg1_name, arg1_val, \ + arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_END, \ + category_group, name, id, TRACE_EVENT_FLAG_NONE, \ + arg1_name, arg1_val, arg2_name, arg2_val) +#define TRACE_EVENT_COPY_FLOW_END0(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_END, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY) +#define TRACE_EVENT_COPY_FLOW_END1(category_group, name, id, arg1_name, \ + arg1_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_END, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val) +#define TRACE_EVENT_COPY_FLOW_END2(category_group, name, id, arg1_name, \ + arg1_val, arg2_name, arg2_val) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_FLOW_END, \ + category_group, name, id, TRACE_EVENT_FLAG_COPY, \ + arg1_name, arg1_val, arg2_name, arg2_val) + +// Macros to track the life time and value of arbitrary client objects. +// See also TraceTrackableObject. 
+#define TRACE_EVENT_OBJECT_CREATED_WITH_ID(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_CREATE_OBJECT, \ + category_group, name, TRACE_ID_DONT_MANGLE(id), TRACE_EVENT_FLAG_NONE) + +#define TRACE_EVENT_OBJECT_SNAPSHOT_WITH_ID(category_group, name, id, snapshot) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_SNAPSHOT_OBJECT, \ + category_group, name, TRACE_ID_DONT_MANGLE(id), TRACE_EVENT_FLAG_NONE,\ + "snapshot", snapshot) + +#define TRACE_EVENT_OBJECT_DELETED_WITH_ID(category_group, name, id) \ + INTERNAL_TRACE_EVENT_ADD_WITH_ID(TRACE_EVENT_PHASE_DELETE_OBJECT, \ + category_group, name, TRACE_ID_DONT_MANGLE(id), TRACE_EVENT_FLAG_NONE) + +#define INTERNAL_TRACE_EVENT_CATEGORY_GROUP_ENABLED_FOR_RECORDING_MODE() \ + PREDICT_FALSE(*INTERNAL_TRACE_EVENT_UID(category_group_enabled) & \ + (kudu::debug::TraceLog::ENABLED_FOR_RECORDING | \ + kudu::debug::TraceLog::ENABLED_FOR_EVENT_CALLBACK)) + +// Macro to efficiently determine if a given category group is enabled. +#define TRACE_EVENT_CATEGORY_GROUP_ENABLED(category_group, ret) \ + do { \ + INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO(category_group); \ + if (INTERNAL_TRACE_EVENT_CATEGORY_GROUP_ENABLED_FOR_RECORDING_MODE()) { \ + *ret = true; \ + } else { \ + *ret = false; \ + } \ + } while (0) + +// Macro to efficiently determine, through polling, if a new trace has begun. +#define TRACE_EVENT_IS_NEW_TRACE(ret) \ + do { \ + static int INTERNAL_TRACE_EVENT_UID(lastRecordingNumber) = 0; \ + int num_traces_recorded = TRACE_EVENT_API_GET_NUM_TRACES_RECORDED(); \ + if (num_traces_recorded != -1 && \ + num_traces_recorded != \ + INTERNAL_TRACE_EVENT_UID(lastRecordingNumber)) { \ + INTERNAL_TRACE_EVENT_UID(lastRecordingNumber) = \ + num_traces_recorded; \ + *ret = true; \ + } else { \ + *ret = false; \ + } \ + } while (0) + +//////////////////////////////////////////////////////////////////////////////// +// Implementation specific tracing API definitions. 
+ +// Get a pointer to the enabled state of the given trace category. Only +// long-lived literal strings should be given as the category group. The +// returned pointer can be held permanently in a local static for example. If +// the unsigned char is non-zero, tracing is enabled. If tracing is enabled, +// TRACE_EVENT_API_ADD_TRACE_EVENT can be called. It's OK if tracing is disabled +// between the load of the tracing state and the call to +// TRACE_EVENT_API_ADD_TRACE_EVENT, because this flag only provides an early out +// for best performance when tracing is disabled. +// const unsigned char* +// TRACE_EVENT_API_GET_CATEGORY_GROUP_ENABLED(const char* category_group) +#define TRACE_EVENT_API_GET_CATEGORY_GROUP_ENABLED \ + kudu::debug::TraceLog::GetCategoryGroupEnabled + +// Get the number of times traces have been recorded. This is used to implement +// the TRACE_EVENT_IS_NEW_TRACE facility. +// unsigned int TRACE_EVENT_API_GET_NUM_TRACES_RECORDED() +#define TRACE_EVENT_API_GET_NUM_TRACES_RECORDED \ + kudu::debug::TraceLog::GetInstance()->GetNumTracesRecorded + +// Add a trace event to the platform tracing system. +// kudu::debug::TraceEventHandle TRACE_EVENT_API_ADD_TRACE_EVENT( +// char phase, +// const unsigned char* category_group_enabled, +// const char* name, +// uint64_t id, +// int num_args, +// const char** arg_names, +// const unsigned char* arg_types, +// const uint64_t* arg_values, +// unsigned char flags) +#define TRACE_EVENT_API_ADD_TRACE_EVENT \ + kudu::debug::TraceLog::GetInstance()->AddTraceEvent + +// Add a trace event to the platform tracing system. 
+// kudu::debug::TraceEventHandle TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_TIMESTAMP( +// char phase, +// const unsigned char* category_group_enabled, +// const char* name, +// uint64_t id, +// int thread_id, +// const MicrosecondsInt64& timestamp, +// int num_args, +// const char** arg_names, +// const unsigned char* arg_types, +// const uint64_t* arg_values, +// unsigned char flags) +#define TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP \ + kudu::debug::TraceLog::GetInstance()->AddTraceEventWithThreadIdAndTimestamp + +// Set the duration field of a COMPLETE trace event. +// void TRACE_EVENT_API_UPDATE_TRACE_EVENT_DURATION( +// const unsigned char* category_group_enabled, +// const char* name, +// kudu::debug::TraceEventHandle id) +#define TRACE_EVENT_API_UPDATE_TRACE_EVENT_DURATION \ + kudu::debug::TraceLog::GetInstance()->UpdateTraceEventDuration + +// Defines atomic operations used internally by the tracing system. +#define TRACE_EVENT_API_ATOMIC_WORD AtomicWord +#define TRACE_EVENT_API_ATOMIC_LOAD(var) base::subtle::NoBarrier_Load(&(var)) +#define TRACE_EVENT_API_ATOMIC_STORE(var, value) \ + base::subtle::NoBarrier_Store(&(var), (value)) + +// Defines visibility for classes in trace_event.h +#define TRACE_EVENT_API_CLASS_EXPORT BASE_EXPORT + +// The thread buckets for the sampling profiler. +TRACE_EVENT_API_CLASS_EXPORT extern \ + TRACE_EVENT_API_ATOMIC_WORD g_trace_state[3]; + +#define TRACE_EVENT_API_THREAD_BUCKET(thread_bucket) \ + g_trace_state[thread_bucket] + +//////////////////////////////////////////////////////////////////////////////// + +// Implementation detail: trace event macros create temporary variables +// to keep instrumentation overhead low. These macros give each temporary +// variable a unique name based on the line number to prevent name collisions. 
#define INTERNAL_TRACE_EVENT_UID3(a,b) \
    trace_event_unique_##a##b
#define INTERNAL_TRACE_EVENT_UID2(a,b) \
    INTERNAL_TRACE_EVENT_UID3(a,b)
#define INTERNAL_TRACE_EVENT_UID(name_prefix) \
    INTERNAL_TRACE_EVENT_UID2(name_prefix, __LINE__)

// Implementation detail: internal macro to create static category.
// No barriers are needed, because this code is designed to operate safely
// even when the unsigned char* points to garbage data (which may be the case
// on processors without cache coherency).
// NOTE(review): the template arguments on the two reinterpret_casts below were
// lost in the import; restored from the declared variable types -- the atomic
// word caches the |const unsigned char*| enabled-flag pointer, so the load
// converts word -> pointer and the store converts pointer -> word.
#define INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO_CUSTOM_VARIABLES( \
    category_group, atomic, category_group_enabled) \
  category_group_enabled = \
      reinterpret_cast<const unsigned char*>(TRACE_EVENT_API_ATOMIC_LOAD( \
          atomic)); \
  if (PREDICT_FALSE(!category_group_enabled)) { \
    category_group_enabled = \
        TRACE_EVENT_API_GET_CATEGORY_GROUP_ENABLED(category_group); \
    TRACE_EVENT_API_ATOMIC_STORE(atomic, \
        reinterpret_cast<TRACE_EVENT_API_ATOMIC_WORD>( \
            category_group_enabled)); \
  }

#define INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO(category_group) \
    static TRACE_EVENT_API_ATOMIC_WORD INTERNAL_TRACE_EVENT_UID(atomic) = 0; \
    const unsigned char* INTERNAL_TRACE_EVENT_UID(category_group_enabled); \
    INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO_CUSTOM_VARIABLES(category_group, \
        INTERNAL_TRACE_EVENT_UID(atomic), \
        INTERNAL_TRACE_EVENT_UID(category_group_enabled));

// Implementation detail: internal macro to create static category and add
// event if the category is enabled.
#define INTERNAL_TRACE_EVENT_ADD(phase, category_group, name, flags, ...) \
    do { \
      INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO(category_group); \
      if (INTERNAL_TRACE_EVENT_CATEGORY_GROUP_ENABLED_FOR_RECORDING_MODE()) { \
        trace_event_internal::AddTraceEvent( \
            phase, INTERNAL_TRACE_EVENT_UID(category_group_enabled), name, \
            trace_event_internal::kNoEventId, flags, ##__VA_ARGS__); \
      } \
    } while (0)

// Implementation detail: internal macro to create static category and add begin
// event if the category is enabled. Also adds the end event when the scope
// ends.
#define INTERNAL_TRACE_EVENT_ADD_SCOPED(category_group, name, ...) \
    INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO(category_group); \
    trace_event_internal::ScopedTracer INTERNAL_TRACE_EVENT_UID(tracer); \
    if (INTERNAL_TRACE_EVENT_CATEGORY_GROUP_ENABLED_FOR_RECORDING_MODE()) { \
      kudu::debug::TraceEventHandle h = trace_event_internal::AddTraceEvent( \
          TRACE_EVENT_PHASE_COMPLETE, \
          INTERNAL_TRACE_EVENT_UID(category_group_enabled), \
          name, trace_event_internal::kNoEventId, \
          TRACE_EVENT_FLAG_NONE, ##__VA_ARGS__); \
      INTERNAL_TRACE_EVENT_UID(tracer).Initialize( \
          INTERNAL_TRACE_EVENT_UID(category_group_enabled), name, h); \
    }

// Implementation detail: internal macro to create static category and add
// event if the category is enabled.
#define INTERNAL_TRACE_EVENT_ADD_WITH_ID(phase, category_group, name, id, \
        flags, ...) \
    do { \
      INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO(category_group); \
      if (INTERNAL_TRACE_EVENT_CATEGORY_GROUP_ENABLED_FOR_RECORDING_MODE()) { \
        unsigned char trace_event_flags = flags | TRACE_EVENT_FLAG_HAS_ID; \
        trace_event_internal::TraceID trace_event_trace_id( \
            id, &trace_event_flags); \
        trace_event_internal::AddTraceEvent( \
            phase, INTERNAL_TRACE_EVENT_UID(category_group_enabled), \
            name, trace_event_trace_id.data(), trace_event_flags, \
            ##__VA_ARGS__); \
      } \
    } while (0)

// Implementation detail: internal macro to create static category and add
// event if the category is enabled.
+#define INTERNAL_TRACE_EVENT_ADD_WITH_ID_TID_AND_TIMESTAMP(phase, \ + category_group, name, id, thread_id, timestamp, flags, ...) \ + do { \ + INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO(category_group); \ + if (INTERNAL_TRACE_EVENT_CATEGORY_GROUP_ENABLED_FOR_RECORDING_MODE()) { \ + unsigned char trace_event_flags = flags | TRACE_EVENT_FLAG_HAS_ID; \ + trace_event_internal::TraceID trace_event_trace_id( \ + id, &trace_event_flags); \ + trace_event_internal::AddTraceEventWithThreadIdAndTimestamp( \ + phase, INTERNAL_TRACE_EVENT_UID(category_group_enabled), \ + name, trace_event_trace_id.data(), \ + thread_id, timestamp, \ + trace_event_flags, ##__VA_ARGS__); \ + } \ + } while (0) + +// Notes regarding the following definitions: +// New values can be added and propagated to third party libraries, but existing +// definitions must never be changed, because third party libraries may use old +// definitions. + +// Phase indicates the nature of an event entry. E.g. part of a begin/end pair. +#define TRACE_EVENT_PHASE_BEGIN ('B') +#define TRACE_EVENT_PHASE_END ('E') +#define TRACE_EVENT_PHASE_COMPLETE ('X') +#define TRACE_EVENT_PHASE_INSTANT ('i') +#define TRACE_EVENT_PHASE_ASYNC_BEGIN ('S') +#define TRACE_EVENT_PHASE_ASYNC_STEP_INTO ('T') +#define TRACE_EVENT_PHASE_ASYNC_STEP_PAST ('p') +#define TRACE_EVENT_PHASE_ASYNC_END ('F') +#define TRACE_EVENT_PHASE_FLOW_BEGIN ('s') +#define TRACE_EVENT_PHASE_FLOW_STEP ('t') +#define TRACE_EVENT_PHASE_FLOW_END ('f') +#define TRACE_EVENT_PHASE_METADATA ('M') +#define TRACE_EVENT_PHASE_COUNTER ('C') +#define TRACE_EVENT_PHASE_SAMPLE ('P') +#define TRACE_EVENT_PHASE_CREATE_OBJECT ('N') +#define TRACE_EVENT_PHASE_SNAPSHOT_OBJECT ('O') +#define TRACE_EVENT_PHASE_DELETE_OBJECT ('D') + +// Flags for changing the behavior of TRACE_EVENT_API_ADD_TRACE_EVENT. 
+#define TRACE_EVENT_FLAG_NONE (static_cast(0)) +#define TRACE_EVENT_FLAG_COPY (static_cast(1 << 0)) +#define TRACE_EVENT_FLAG_HAS_ID (static_cast(1 << 1)) +#define TRACE_EVENT_FLAG_MANGLE_ID (static_cast(1 << 2)) +#define TRACE_EVENT_FLAG_SCOPE_OFFSET (static_cast(1 << 3)) + +#define TRACE_EVENT_FLAG_SCOPE_MASK (static_cast( \ + TRACE_EVENT_FLAG_SCOPE_OFFSET | (TRACE_EVENT_FLAG_SCOPE_OFFSET << 1))) + +// Type values for identifying types in the TraceValue union. +#define TRACE_VALUE_TYPE_BOOL (static_cast(1)) +#define TRACE_VALUE_TYPE_UINT (static_cast(2)) +#define TRACE_VALUE_TYPE_INT (static_cast(3)) +#define TRACE_VALUE_TYPE_DOUBLE (static_cast(4)) +#define TRACE_VALUE_TYPE_POINTER (static_cast(5)) +#define TRACE_VALUE_TYPE_STRING (static_cast(6)) +#define TRACE_VALUE_TYPE_COPY_STRING (static_cast(7)) +#define TRACE_VALUE_TYPE_CONVERTABLE (static_cast(8)) + +// Enum reflecting the scope of an INSTANT event. Must fit within +// TRACE_EVENT_FLAG_SCOPE_MASK. +#define TRACE_EVENT_SCOPE_GLOBAL (static_cast(0 << 3)) +#define TRACE_EVENT_SCOPE_PROCESS (static_cast(1 << 3)) +#define TRACE_EVENT_SCOPE_THREAD (static_cast(2 << 3)) + +#define TRACE_EVENT_SCOPE_NAME_GLOBAL ('g') +#define TRACE_EVENT_SCOPE_NAME_PROCESS ('p') +#define TRACE_EVENT_SCOPE_NAME_THREAD ('t') + +namespace trace_event_internal { + +// Specify these values when the corresponding argument of AddTraceEvent is not +// used. +const int kZeroNumArgs = 0; +const uint64_t kNoEventId = 0; + +// TraceID encapsulates an ID that can either be an integer or pointer. Pointers +// are by default mangled with the Process ID so that they are unlikely to +// collide when the same pointer is used on different processes. 
+class TraceID { + public: + class DontMangle { + public: + explicit DontMangle(const void* id) + : data_(static_cast( + reinterpret_cast(id))) {} + explicit DontMangle(uint64_t id) : data_(id) {} + explicit DontMangle(unsigned int id) : data_(id) {} + explicit DontMangle(unsigned short id) : data_(id) {} + explicit DontMangle(unsigned char id) : data_(id) {} + explicit DontMangle(long long id) + : data_(static_cast(id)) {} + explicit DontMangle(long id) + : data_(static_cast(id)) {} + explicit DontMangle(int id) + : data_(static_cast(id)) {} + explicit DontMangle(short id) + : data_(static_cast(id)) {} + explicit DontMangle(signed char id) + : data_(static_cast(id)) {} + uint64_t data() const { return data_; } + private: + uint64_t data_; + }; + + class ForceMangle { + public: + explicit ForceMangle(uint64_t id) : data_(id) {} + explicit ForceMangle(unsigned int id) : data_(id) {} + explicit ForceMangle(unsigned short id) : data_(id) {} + explicit ForceMangle(unsigned char id) : data_(id) {} + explicit ForceMangle(long long id) + : data_(static_cast(id)) {} + explicit ForceMangle(long id) + : data_(static_cast(id)) {} + explicit ForceMangle(int id) + : data_(static_cast(id)) {} + explicit ForceMangle(short id) + : data_(static_cast(id)) {} + explicit ForceMangle(signed char id) + : data_(static_cast(id)) {} + uint64_t data() const { return data_; } + private: + uint64_t data_; + }; + + TraceID(const void* id, unsigned char* flags) + : data_(static_cast( + reinterpret_cast(id))) { + *flags |= TRACE_EVENT_FLAG_MANGLE_ID; + } + TraceID(ForceMangle id, unsigned char* flags) : data_(id.data()) { + *flags |= TRACE_EVENT_FLAG_MANGLE_ID; + } + TraceID(DontMangle id, unsigned char* flags) : data_(id.data()) { + } + TraceID(uint64_t id, unsigned char* flags) + : data_(id) { (void)flags; } + TraceID(unsigned int id, unsigned char* flags) + : data_(id) { (void)flags; } + TraceID(unsigned short id, unsigned char* flags) + : data_(id) { (void)flags; } + TraceID(unsigned char 
id, unsigned char* flags) + : data_(id) { (void)flags; } + TraceID(long long id, unsigned char* flags) + : data_(static_cast(id)) { (void)flags; } + TraceID(long id, unsigned char* flags) + : data_(static_cast(id)) { (void)flags; } + TraceID(int id, unsigned char* flags) + : data_(static_cast(id)) { (void)flags; } + TraceID(short id, unsigned char* flags) + : data_(static_cast(id)) { (void)flags; } + TraceID(signed char id, unsigned char* flags) + : data_(static_cast(id)) { (void)flags; } + + uint64_t data() const { return data_; } + + private: + uint64_t data_; +}; + +// Simple union to store various types as uint64_t. +union TraceValueUnion { + bool as_bool; + uint64_t as_uint; + long long as_int; + double as_double; + const void* as_pointer; + const char* as_string; +}; + +// Simple container for const char* that should be copied instead of retained. +class TraceStringWithCopy { + public: + explicit TraceStringWithCopy(const char* str) : str_(str) {} + const char* str() const { return str_; } + private: + const char* str_; +}; + +// Define SetTraceValue for each allowed type. It stores the type and +// value in the return arguments. This allows this API to avoid declaring any +// structures so that it is portable to third_party libraries. +#define INTERNAL_DECLARE_SET_TRACE_VALUE(actual_type, \ + arg_expression, \ + union_member, \ + value_type_id) \ + static inline void SetTraceValue( \ + actual_type arg, \ + unsigned char* type, \ + uint64_t* value) { \ + TraceValueUnion type_value; \ + type_value.union_member = arg_expression; \ + *type = value_type_id; \ + *value = type_value.as_uint; \ + } +// Simpler form for int types that can be safely casted. 
+#define INTERNAL_DECLARE_SET_TRACE_VALUE_INT(actual_type, \ + value_type_id) \ + static inline void SetTraceValue( \ + actual_type arg, \ + unsigned char* type, \ + uint64_t* value) { \ + *type = value_type_id; \ + *value = static_cast(arg); \ + } + +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(uint64_t, TRACE_VALUE_TYPE_UINT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(unsigned int, TRACE_VALUE_TYPE_UINT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(unsigned short, TRACE_VALUE_TYPE_UINT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(unsigned char, TRACE_VALUE_TYPE_UINT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(long long, TRACE_VALUE_TYPE_INT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(long, TRACE_VALUE_TYPE_INT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(int, TRACE_VALUE_TYPE_INT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(short, TRACE_VALUE_TYPE_INT) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(signed char, TRACE_VALUE_TYPE_INT) +INTERNAL_DECLARE_SET_TRACE_VALUE(bool, arg, as_bool, TRACE_VALUE_TYPE_BOOL) +INTERNAL_DECLARE_SET_TRACE_VALUE(double, arg, as_double, + TRACE_VALUE_TYPE_DOUBLE) +INTERNAL_DECLARE_SET_TRACE_VALUE(const void*, arg, as_pointer, + TRACE_VALUE_TYPE_POINTER) +INTERNAL_DECLARE_SET_TRACE_VALUE(const char*, arg, as_string, + TRACE_VALUE_TYPE_STRING) +INTERNAL_DECLARE_SET_TRACE_VALUE(const TraceStringWithCopy&, arg.str(), + as_string, TRACE_VALUE_TYPE_COPY_STRING) +#if defined(__APPLE__) +INTERNAL_DECLARE_SET_TRACE_VALUE_INT(size_t, TRACE_VALUE_TYPE_UINT) +#endif + +#undef INTERNAL_DECLARE_SET_TRACE_VALUE +#undef INTERNAL_DECLARE_SET_TRACE_VALUE_INT + +// std::string version of SetTraceValue so that trace arguments can be strings. 
+static inline void SetTraceValue(const std::string& arg, + unsigned char* type, + uint64_t* value) { + TraceValueUnion type_value; + type_value.as_string = arg.c_str(); + *type = TRACE_VALUE_TYPE_COPY_STRING; + *value = type_value.as_uint; +} + +// These AddTraceEvent and AddTraceEventWithThreadIdAndTimestamp template +// functions are defined here instead of in the macro, because the arg_values +// could be temporary objects, such as std::string. In order to store +// pointers to the internal c_str and pass through to the tracing API, +// the arg_values must live throughout these procedures. + +static inline kudu::debug::TraceEventHandle +AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags, + const char* arg1_name, + const scoped_refptr& arg1_val) { + const int num_args = 1; + unsigned char arg_types[1] = { TRACE_VALUE_TYPE_CONVERTABLE }; + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, category_group_enabled, name, id, thread_id, timestamp, + num_args, &arg1_name, arg_types, NULL, &arg1_val, flags); +} + +template +static inline kudu::debug::TraceEventHandle +AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags, + const char* arg1_name, + const ARG1_TYPE& arg1_val, + const char* arg2_name, + const scoped_refptr& arg2_val) { + const int num_args = 2; + const char* arg_names[2] = { arg1_name, arg2_name }; + + unsigned char arg_types[2]; + uint64_t arg_values[2]; + SetTraceValue(arg1_val, &arg_types[0], &arg_values[0]); + arg_types[1] = TRACE_VALUE_TYPE_CONVERTABLE; + + scoped_refptr convertable_values[2]; + convertable_values[1] = arg2_val; + + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, 
category_group_enabled, name, id, thread_id, timestamp, + num_args, arg_names, arg_types, arg_values, convertable_values, flags); +} + +template +static inline kudu::debug::TraceEventHandle +AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags, + const char* arg1_name, + const scoped_refptr& arg1_val, + const char* arg2_name, + const ARG2_TYPE& arg2_val) { + const int num_args = 2; + const char* arg_names[2] = { arg1_name, arg2_name }; + + unsigned char arg_types[2]; + uint64_t arg_values[2]; + arg_types[0] = TRACE_VALUE_TYPE_CONVERTABLE; + arg_values[0] = 0; + SetTraceValue(arg2_val, &arg_types[1], &arg_values[1]); + + scoped_refptr convertable_values[2]; + convertable_values[0] = arg1_val; + + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, category_group_enabled, name, id, thread_id, timestamp, + num_args, arg_names, arg_types, arg_values, convertable_values, flags); +} + +static inline kudu::debug::TraceEventHandle +AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags, + const char* arg1_name, + const scoped_refptr& arg1_val, + const char* arg2_name, + const scoped_refptr& arg2_val) { + const int num_args = 2; + const char* arg_names[2] = { arg1_name, arg2_name }; + unsigned char arg_types[2] = + { TRACE_VALUE_TYPE_CONVERTABLE, TRACE_VALUE_TYPE_CONVERTABLE }; + scoped_refptr convertable_values[2] = + { arg1_val, arg2_val }; + + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, category_group_enabled, name, id, thread_id, timestamp, + num_args, arg_names, arg_types, NULL, convertable_values, flags); +} + +static inline kudu::debug::TraceEventHandle +AddTraceEventWithThreadIdAndTimestamp( + 
char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags) { + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, category_group_enabled, name, id, thread_id, timestamp, + kZeroNumArgs, NULL, NULL, NULL, NULL, flags); +} + +static inline kudu::debug::TraceEventHandle AddTraceEvent( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + unsigned char flags) { + int thread_id = static_cast(kudu::Thread::UniqueThreadId()); + MicrosecondsInt64 now = GetMonoTimeMicros(); + return AddTraceEventWithThreadIdAndTimestamp(phase, category_group_enabled, + name, id, thread_id, now, flags); +} + +template +static inline kudu::debug::TraceEventHandle +AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags, + const char* arg1_name, + const ARG1_TYPE& arg1_val) { + const int num_args = 1; + unsigned char arg_types[1]; + uint64_t arg_values[1]; + SetTraceValue(arg1_val, &arg_types[0], &arg_values[0]); + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, category_group_enabled, name, id, thread_id, timestamp, + num_args, &arg1_name, arg_types, arg_values, NULL, flags); +} + +template +static inline kudu::debug::TraceEventHandle AddTraceEvent( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + unsigned char flags, + const char* arg1_name, + const ARG1_TYPE& arg1_val) { + int thread_id = static_cast(kudu::Thread::UniqueThreadId()); + MicrosecondsInt64 now = GetMonoTimeMicros(); + return AddTraceEventWithThreadIdAndTimestamp(phase, category_group_enabled, + name, id, thread_id, now, flags, + arg1_name, arg1_val); +} + +template +static inline kudu::debug::TraceEventHandle 
+AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + unsigned char flags, + const char* arg1_name, + const ARG1_TYPE& arg1_val, + const char* arg2_name, + const ARG2_TYPE& arg2_val) { + const int num_args = 2; + const char* arg_names[2] = { arg1_name, arg2_name }; + unsigned char arg_types[2]; + uint64_t arg_values[2]; + SetTraceValue(arg1_val, &arg_types[0], &arg_values[0]); + SetTraceValue(arg2_val, &arg_types[1], &arg_values[1]); + return TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + phase, category_group_enabled, name, id, thread_id, timestamp, + num_args, arg_names, arg_types, arg_values, NULL, flags); +} + +template +static inline kudu::debug::TraceEventHandle AddTraceEvent( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + unsigned char flags, + const char* arg1_name, + const ARG1_TYPE& arg1_val, + const char* arg2_name, + const ARG2_TYPE& arg2_val) { + int thread_id = static_cast(kudu::Thread::UniqueThreadId()); + MicrosecondsInt64 now = GetMonoTimeMicros(); + return AddTraceEventWithThreadIdAndTimestamp(phase, category_group_enabled, + name, id, thread_id, now, flags, + arg1_name, arg1_val, + arg2_name, arg2_val); +} + +// Used by TRACE_EVENTx macros. Do not use directly. +class TRACE_EVENT_API_CLASS_EXPORT ScopedTracer { + public: + // Note: members of data_ intentionally left uninitialized. See Initialize. 
+ ScopedTracer() : p_data_(NULL) {} + + ~ScopedTracer() { + if (p_data_ && *data_.category_group_enabled) + TRACE_EVENT_API_UPDATE_TRACE_EVENT_DURATION( + data_.category_group_enabled, data_.name, data_.event_handle); + } + + void Initialize(const unsigned char* category_group_enabled, + const char* name, + kudu::debug::TraceEventHandle event_handle) { + data_.category_group_enabled = category_group_enabled; + data_.name = name; + data_.event_handle = event_handle; + p_data_ = &data_; + } + + private: + // This Data struct workaround is to avoid initializing all the members + // in Data during construction of this object, since this object is always + // constructed, even when tracing is disabled. If the members of Data were + // members of this class instead, compiler warnings occur about potential + // uninitialized accesses. + struct Data { + const unsigned char* category_group_enabled; + const char* name; + kudu::debug::TraceEventHandle event_handle; + }; + Data* p_data_; + Data data_; +}; + +// Used by TRACE_EVENT_BINARY_EFFICIENTx macro. Do not use directly. +class TRACE_EVENT_API_CLASS_EXPORT ScopedTraceBinaryEfficient { + public: + ScopedTraceBinaryEfficient(const char* category_group, const char* name); + ~ScopedTraceBinaryEfficient(); + + private: + const unsigned char* category_group_enabled_; + const char* name_; + kudu::debug::TraceEventHandle event_handle_; +}; + +// This macro generates less code then TRACE_EVENT0 but is also +// slower to execute when tracing is off. It should generally only be +// used with code that is seldom executed or conditionally executed +// when debugging. +// For now the category_group must be "gpu". +#define TRACE_EVENT_BINARY_EFFICIENT0(category_group, name) \ + trace_event_internal::ScopedTraceBinaryEfficient \ + INTERNAL_TRACE_EVENT_UID(scoped_trace)(category_group, name); + +// TraceEventSamplingStateScope records the current sampling state +// and sets a new sampling state. 
When the scope exists, it restores +// the sampling state having recorded. +template +class TraceEventSamplingStateScope { + public: + TraceEventSamplingStateScope(const char* category_and_name) { + previous_state_ = TraceEventSamplingStateScope::Current(); + TraceEventSamplingStateScope::Set(category_and_name); + } + + ~TraceEventSamplingStateScope() { + TraceEventSamplingStateScope::Set(previous_state_); + } + + static inline const char* Current() { + return reinterpret_cast(TRACE_EVENT_API_ATOMIC_LOAD( + g_trace_state[BucketNumber])); + } + + static inline void Set(const char* category_and_name) { + TRACE_EVENT_API_ATOMIC_STORE( + g_trace_state[BucketNumber], + reinterpret_cast( + const_cast(category_and_name))); + } + + private: + const char* previous_state_; +}; + +} // namespace trace_event_internal + +namespace kudu { +namespace debug { + +template class TraceScopedTrackableObject { + public: + TraceScopedTrackableObject(const char* category_group, const char* name, + IDType id) + : category_group_(category_group), + name_(name), + id_(id) { + TRACE_EVENT_OBJECT_CREATED_WITH_ID(category_group_, name_, id_); + } + + template void snapshot(ArgType snapshot) { + TRACE_EVENT_OBJECT_SNAPSHOT_WITH_ID(category_group_, name_, id_, snapshot); + } + + ~TraceScopedTrackableObject() { + TRACE_EVENT_OBJECT_DELETED_WITH_ID(category_group_, name_, id_); + } + + private: + const char* category_group_; + const char* name_; + IDType id_; + + DISALLOW_COPY_AND_ASSIGN(TraceScopedTrackableObject); +}; + +} // namespace debug +} // namespace kudu + +#endif /* KUDU_UTIL_DEBUG_TRACE_EVENT_H_ */ diff --git a/src/kudu/util/debug/trace_event_impl.cc b/src/kudu/util/debug/trace_event_impl.cc new file mode 100644 index 000000000000..b3e2e5f0bb89 --- /dev/null +++ b/src/kudu/util/debug/trace_event_impl.cc @@ -0,0 +1,2416 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. 
+// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/util/debug/trace_event_impl.h" + +#include +#include +#include +#include + +#include "kudu/gutil/bind.h" +#include "kudu/util/atomic.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/singleton.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/dynamic_annotations.h" + +#include "kudu/gutil/walltime.h" +#include "kudu/util/debug/trace_event_synthetic_delay.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/thread.h" + +DEFINE_string(trace_to_console, "", + "Trace pattern specifying which trace events should be dumped " + "directly to the console"); +TAG_FLAG(trace_to_console, experimental); + +// The thread buckets for the sampling profiler. +BASE_EXPORT TRACE_EVENT_API_ATOMIC_WORD g_trace_state[3]; + +namespace kudu { +namespace debug { + +using base::SpinLockHolder; + +using strings::SubstituteAndAppend; +using std::string; + +__thread TraceLog::PerThreadInfo* TraceLog::thread_local_info_ = nullptr; + +namespace { + +// Controls the number of trace events we will buffer in-memory +// before throwing them away. +const size_t kTraceBufferChunkSize = TraceBufferChunk::kTraceBufferChunkSize; +const size_t kTraceEventVectorBufferChunks = 256000 / kTraceBufferChunkSize; +const size_t kTraceEventRingBufferChunks = kTraceEventVectorBufferChunks / 4; +const size_t kTraceEventBatchChunks = 1000 / kTraceBufferChunkSize; +// Can store results for 30 seconds with 1 ms sampling interval. 
+const size_t kMonitorTraceEventBufferChunks = 30000 / kTraceBufferChunkSize; +// ECHO_TO_CONSOLE needs a small buffer to hold the unfinished COMPLETE events. +const size_t kEchoToConsoleTraceEventBufferChunks = 256; + +const char kSyntheticDelayCategoryFilterPrefix[] = "DELAY("; + +#define MAX_CATEGORY_GROUPS 100 + +// Parallel arrays g_category_groups and g_category_group_enabled are separate +// so that a pointer to a member of g_category_group_enabled can be easily +// converted to an index into g_category_groups. This allows macros to deal +// only with char enabled pointers from g_category_group_enabled, and we can +// convert internally to determine the category name from the char enabled +// pointer. +const char* g_category_groups[MAX_CATEGORY_GROUPS] = { + "toplevel", + "tracing already shutdown", + "tracing categories exhausted; must increase MAX_CATEGORY_GROUPS", + "__metadata"}; + +// The enabled flag is char instead of bool so that the API can be used from C. +unsigned char g_category_group_enabled[MAX_CATEGORY_GROUPS] = { 0 }; +// Indexes here have to match the g_category_groups array indexes above. +const int g_category_already_shutdown = 1; +const int g_category_categories_exhausted = 2; +const int g_category_metadata = 3; +const int g_num_builtin_categories = 4; +// Skip default categories. +AtomicWord g_category_index = g_num_builtin_categories; + +// The name of the current thread. This is used to decide if the current +// thread name has changed. We combine all the seen thread names into the +// output name for the thread. 
+__thread const char* g_current_thread_name = ""; + +static void NOTIMPLEMENTED() { + LOG(FATAL); +} + +class TraceBufferRingBuffer : public TraceBuffer { + public: + TraceBufferRingBuffer(size_t max_chunks) + : max_chunks_(max_chunks), + recyclable_chunks_queue_(new size_t[queue_capacity()]), + queue_head_(0), + queue_tail_(max_chunks), + current_iteration_index_(0), + current_chunk_seq_(1) { + chunks_.reserve(max_chunks); + for (size_t i = 0; i < max_chunks; ++i) + recyclable_chunks_queue_[i] = i; + } + + ~TraceBufferRingBuffer() { + STLDeleteElements(&chunks_); + } + + virtual gscoped_ptr GetChunk(size_t* index) OVERRIDE { + // Because the number of threads is much less than the number of chunks, + // the queue should never be empty. + DCHECK(!QueueIsEmpty()); + + *index = recyclable_chunks_queue_[queue_head_]; + queue_head_ = NextQueueIndex(queue_head_); + current_iteration_index_ = queue_head_; + + if (*index >= chunks_.size()) + chunks_.resize(*index + 1); + + TraceBufferChunk* chunk = chunks_[*index]; + chunks_[*index] = nullptr; // Put NULL in the slot of a in-flight chunk. + if (chunk) + chunk->Reset(current_chunk_seq_++); + else + chunk = new TraceBufferChunk(current_chunk_seq_++); + + return gscoped_ptr(chunk); + } + + virtual void ReturnChunk(size_t index, + gscoped_ptr chunk) OVERRIDE { + // When this method is called, the queue should not be full because it + // can contain all chunks including the one to be returned. + DCHECK(!QueueIsFull()); + DCHECK(chunk); + DCHECK_LT(index, chunks_.size()); + DCHECK(!chunks_[index]); + chunks_[index] = chunk.release(); + recyclable_chunks_queue_[queue_tail_] = index; + queue_tail_ = NextQueueIndex(queue_tail_); + } + + virtual bool IsFull() const OVERRIDE { + return false; + } + + virtual size_t Size() const OVERRIDE { + // This is approximate because not all of the chunks are full. 
+ return chunks_.size() * kTraceBufferChunkSize; + } + + virtual size_t Capacity() const OVERRIDE { + return max_chunks_ * kTraceBufferChunkSize; + } + + virtual TraceEvent* GetEventByHandle(TraceEventHandle handle) OVERRIDE { + if (handle.chunk_index >= chunks_.size()) + return nullptr; + TraceBufferChunk* chunk = chunks_[handle.chunk_index]; + if (!chunk || chunk->seq() != handle.chunk_seq) + return nullptr; + return chunk->GetEventAt(handle.event_index); + } + + virtual const TraceBufferChunk* NextChunk() OVERRIDE { + if (chunks_.empty()) + return nullptr; + + while (current_iteration_index_ != queue_tail_) { + size_t chunk_index = recyclable_chunks_queue_[current_iteration_index_]; + current_iteration_index_ = NextQueueIndex(current_iteration_index_); + if (chunk_index >= chunks_.size()) // Skip uninitialized chunks. + continue; + DCHECK(chunks_[chunk_index]); + return chunks_[chunk_index]; + } + return nullptr; + } + + virtual gscoped_ptr CloneForIteration() const OVERRIDE { + gscoped_ptr cloned_buffer(new ClonedTraceBuffer()); + for (size_t queue_index = queue_head_; queue_index != queue_tail_; + queue_index = NextQueueIndex(queue_index)) { + size_t chunk_index = recyclable_chunks_queue_[queue_index]; + if (chunk_index >= chunks_.size()) // Skip uninitialized chunks. + continue; + TraceBufferChunk* chunk = chunks_[chunk_index]; + cloned_buffer->chunks_.push_back(chunk ? chunk->Clone().release() : nullptr); + } + return cloned_buffer.PassAs(); + } + + private: + class ClonedTraceBuffer : public TraceBuffer { + public: + ClonedTraceBuffer() : current_iteration_index_(0) {} + ~ClonedTraceBuffer() { + STLDeleteElements(&chunks_); + } + + // The only implemented method. + virtual const TraceBufferChunk* NextChunk() OVERRIDE { + return current_iteration_index_ < chunks_.size() ? 
+ chunks_[current_iteration_index_++] : nullptr; + } + + virtual gscoped_ptr GetChunk(size_t* index) OVERRIDE { + NOTIMPLEMENTED(); + return gscoped_ptr(); + } + virtual void ReturnChunk(size_t index, + gscoped_ptr) OVERRIDE { + NOTIMPLEMENTED(); + } + virtual bool IsFull() const OVERRIDE { return false; } + virtual size_t Size() const OVERRIDE { return 0; } + virtual size_t Capacity() const OVERRIDE { return 0; } + virtual TraceEvent* GetEventByHandle(TraceEventHandle handle) OVERRIDE { + return nullptr; + } + virtual gscoped_ptr CloneForIteration() const OVERRIDE { + NOTIMPLEMENTED(); + return gscoped_ptr(); + } + + size_t current_iteration_index_; + vector chunks_; + }; + + bool QueueIsEmpty() const { + return queue_head_ == queue_tail_; + } + + size_t QueueSize() const { + return queue_tail_ > queue_head_ ? queue_tail_ - queue_head_ : + queue_tail_ + queue_capacity() - queue_head_; + } + + bool QueueIsFull() const { + return QueueSize() == queue_capacity() - 1; + } + + size_t queue_capacity() const { + // One extra space to help distinguish full state and empty state. + return max_chunks_ + 1; + } + + size_t NextQueueIndex(size_t index) const { + index++; + if (index >= queue_capacity()) + index = 0; + return index; + } + + size_t max_chunks_; + vector chunks_; + + gscoped_ptr recyclable_chunks_queue_; + size_t queue_head_; + size_t queue_tail_; + + size_t current_iteration_index_; + uint32 current_chunk_seq_; + + DISALLOW_COPY_AND_ASSIGN(TraceBufferRingBuffer); +}; + +class TraceBufferVector : public TraceBuffer { + public: + TraceBufferVector() + : in_flight_chunk_count_(0), + current_iteration_index_(0) { + chunks_.reserve(kTraceEventVectorBufferChunks); + } + ~TraceBufferVector() { + STLDeleteElements(&chunks_); + } + + virtual gscoped_ptr GetChunk(size_t* index) OVERRIDE { + // This function may be called when adding normal events or indirectly from + // AddMetadataEventsWhileLocked(). 
We can not DECHECK(!IsFull()) because we + // have to add the metadata events and flush thread-local buffers even if + // the buffer is full. + *index = chunks_.size(); + chunks_.push_back(nullptr); // Put NULL in the slot of a in-flight chunk. + ++in_flight_chunk_count_; + // + 1 because zero chunk_seq is not allowed. + return gscoped_ptr( + new TraceBufferChunk(static_cast(*index) + 1)); + } + + virtual void ReturnChunk(size_t index, + gscoped_ptr chunk) OVERRIDE { + DCHECK_GT(in_flight_chunk_count_, 0u); + DCHECK_LT(index, chunks_.size()); + DCHECK(!chunks_[index]); + --in_flight_chunk_count_; + chunks_[index] = chunk.release(); + } + + virtual bool IsFull() const OVERRIDE { + return chunks_.size() >= kTraceEventVectorBufferChunks; + } + + virtual size_t Size() const OVERRIDE { + // This is approximate because not all of the chunks are full. + return chunks_.size() * kTraceBufferChunkSize; + } + + virtual size_t Capacity() const OVERRIDE { + return kTraceEventVectorBufferChunks * kTraceBufferChunkSize; + } + + virtual TraceEvent* GetEventByHandle(TraceEventHandle handle) OVERRIDE { + if (handle.chunk_index >= chunks_.size()) + return nullptr; + TraceBufferChunk* chunk = chunks_[handle.chunk_index]; + if (!chunk || chunk->seq() != handle.chunk_seq) + return nullptr; + return chunk->GetEventAt(handle.event_index); + } + + virtual const TraceBufferChunk* NextChunk() OVERRIDE { + while (current_iteration_index_ < chunks_.size()) { + // Skip in-flight chunks. 
+ const TraceBufferChunk* chunk = chunks_[current_iteration_index_++]; + if (chunk) + return chunk; + } + return nullptr; + } + + virtual gscoped_ptr CloneForIteration() const OVERRIDE { + NOTIMPLEMENTED(); + return gscoped_ptr(); + } + + private: + size_t in_flight_chunk_count_; + size_t current_iteration_index_; + vector chunks_; + + DISALLOW_COPY_AND_ASSIGN(TraceBufferVector); +}; + +template +void InitializeMetadataEvent(TraceEvent* trace_event, + int thread_id, + const char* metadata_name, const char* arg_name, + const T& value) { + if (!trace_event) + return; + + int num_args = 1; + unsigned char arg_type; + uint64_t arg_value; + ::trace_event_internal::SetTraceValue(value, &arg_type, &arg_value); + trace_event->Initialize(thread_id, + MicrosecondsInt64(0), MicrosecondsInt64(0), TRACE_EVENT_PHASE_METADATA, + &g_category_group_enabled[g_category_metadata], + metadata_name, ::trace_event_internal::kNoEventId, + num_args, &arg_name, &arg_type, &arg_value, nullptr, + TRACE_EVENT_FLAG_NONE); +} + +// RAII object which marks '*dst' with a non-zero value while in scope. +// This assumes that no other threads write to '*dst'. +class MarkFlagInScope { + public: + explicit MarkFlagInScope(Atomic32* dst) + : dst_(dst) { + // We currently use Acquire_AtomicExchange here because it appears + // to be the cheapest way of getting an "Acquire_Store" barrier. Actually + // using Acquire_Store generates more assembly instructions and benchmarks + // slightly slower. + // + // TODO: it would be even faster to avoid the memory barrier here entirely, + // and do an asymmetric barrier, for example by having the flusher thread + // send a signal to every registered thread, or wait until every other thread + // has experienced at least one context switch. 
A number of options for this + // are outlined in: + // http://home.comcast.net/~pjbishop/Dave/Asymmetric-Dekker-Synchronization.txt + Atomic32 old_val = base::subtle::Acquire_AtomicExchange(dst_, 1); + DCHECK_EQ(old_val, 0); + } + ~MarkFlagInScope() { + base::subtle::Release_Store(dst_, 0); + } + + private: + Atomic32* dst_; + DISALLOW_COPY_AND_ASSIGN(MarkFlagInScope); +}; +} // anonymous namespace + +TraceLog::ThreadLocalEventBuffer* TraceLog::PerThreadInfo::AtomicTakeBuffer() { + return reinterpret_cast( + base::subtle::Acquire_AtomicExchange( + reinterpret_cast(&event_buffer_), + 0)); +} + +void TraceBufferChunk::Reset(uint32 new_seq) { + for (size_t i = 0; i < next_free_; ++i) + chunk_[i].Reset(); + next_free_ = 0; + seq_ = new_seq; +} + +TraceEvent* TraceBufferChunk::AddTraceEvent(size_t* event_index) { + DCHECK(!IsFull()); + *event_index = next_free_++; + return &chunk_[*event_index]; +} + +gscoped_ptr TraceBufferChunk::Clone() const { + gscoped_ptr cloned_chunk(new TraceBufferChunk(seq_)); + cloned_chunk->next_free_ = next_free_; + for (size_t i = 0; i < next_free_; ++i) + cloned_chunk->chunk_[i].CopyFrom(chunk_[i]); + return cloned_chunk.Pass(); +} + +// A helper class that allows the lock to be acquired in the middle of the scope +// and unlocks at the end of scope if locked. +class TraceLog::OptionalAutoLock { + public: + explicit OptionalAutoLock(base::SpinLock& lock) + : lock_(lock), + locked_(false) { + } + + ~OptionalAutoLock() { + if (locked_) + lock_.Unlock(); + } + + void EnsureAcquired() { + if (!locked_) { + lock_.Lock(); + locked_ = true; + } + } + + private: + base::SpinLock& lock_; + bool locked_; + DISALLOW_COPY_AND_ASSIGN(OptionalAutoLock); +}; + +// Use this function instead of TraceEventHandle constructor to keep the +// overhead of ScopedTracer (trace_event.h) constructor minimum. 
+void MakeHandle(uint32 chunk_seq, size_t chunk_index, size_t event_index, + TraceEventHandle* handle) { + DCHECK(chunk_seq); + DCHECK(chunk_index < (1u << 16)); + DCHECK(event_index < (1u << 16)); + handle->chunk_seq = chunk_seq; + handle->chunk_index = static_cast(chunk_index); + handle->event_index = static_cast(event_index); +} + +//////////////////////////////////////////////////////////////////////////////// +// +// TraceEvent +// +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +size_t GetAllocLength(const char* str) { return str ? strlen(str) + 1 : 0; } + +// Copies |*member| into |*buffer|, sets |*member| to point to this new +// location, and then advances |*buffer| by the amount written. +void CopyTraceEventParameter(char** buffer, + const char** member, + const char* end) { + if (*member) { + size_t written = strings::strlcpy(*buffer, *member, end - *buffer) + 1; + DCHECK_LE(static_cast(written), end - *buffer); + *member = *buffer; + *buffer += written; + } +} + +} // namespace + +TraceEvent::TraceEvent() + : duration_(-1), + thread_duration_(-1), + id_(0u), + category_group_enabled_(nullptr), + name_(nullptr), + thread_id_(0), + phase_(TRACE_EVENT_PHASE_BEGIN), + flags_(0) { + for (auto& arg_name : arg_names_) { + arg_name = nullptr; + } + memset(arg_values_, 0, sizeof(arg_values_)); +} + +TraceEvent::~TraceEvent() { +} + +void TraceEvent::CopyFrom(const TraceEvent& other) { + timestamp_ = other.timestamp_; + thread_timestamp_ = other.thread_timestamp_; + duration_ = other.duration_; + id_ = other.id_; + category_group_enabled_ = other.category_group_enabled_; + name_ = other.name_; + thread_id_ = other.thread_id_; + phase_ = other.phase_; + flags_ = other.flags_; + parameter_copy_storage_ = other.parameter_copy_storage_; + + for (int i = 0; i < kTraceMaxNumArgs; ++i) { + arg_names_[i] = other.arg_names_[i]; + arg_types_[i] = other.arg_types_[i]; + arg_values_[i] = other.arg_values_[i]; + 
convertable_values_[i] = other.convertable_values_[i];
+  }
+}
+
+// Fills in this event's fields in place. 'num_args' is clamped to
+// kTraceMaxNumArgs. When TRACE_EVENT_FLAG_COPY is set, the name and any
+// string-typed arguments are deep-copied into parameter_copy_storage_.
+void TraceEvent::Initialize(
+    int thread_id,
+    MicrosecondsInt64 timestamp,
+    MicrosecondsInt64 thread_timestamp,
+    char phase,
+    const unsigned char* category_group_enabled,
+    const char* name,
+    uint64_t id,
+    int num_args,
+    const char** arg_names,
+    const unsigned char* arg_types,
+    const uint64_t* arg_values,
+    const scoped_refptr<ConvertableToTraceFormat>* convertable_values,
+    unsigned char flags) {
+  timestamp_ = timestamp;
+  thread_timestamp_ = thread_timestamp;
+  duration_ = -1;
+  id_ = id;
+  category_group_enabled_ = category_group_enabled;
+  name_ = name;
+  thread_id_ = thread_id;
+  phase_ = phase;
+  flags_ = flags;
+
+  // Clamp num_args since it may have been set by a third_party library.
+  num_args = (num_args > kTraceMaxNumArgs) ? kTraceMaxNumArgs : num_args;
+  int i = 0;
+  for (; i < num_args; ++i) {
+    arg_names_[i] = arg_names[i];
+    arg_types_[i] = arg_types[i];
+
+    if (arg_types[i] == TRACE_VALUE_TYPE_CONVERTABLE)
+      convertable_values_[i] = convertable_values[i];
+    else
+      arg_values_[i].as_uint = arg_values[i];
+  }
+  // Clear out any remaining argument slots so stale data from a previous
+  // use of this (pooled) event cannot leak into the serialized output.
+  for (; i < kTraceMaxNumArgs; ++i) {
+    arg_names_[i] = nullptr;
+    arg_values_[i].as_uint = 0u;
+    convertable_values_[i] = nullptr;
+    arg_types_[i] = TRACE_VALUE_TYPE_UINT;
+  }
+
+  bool copy = !!(flags & TRACE_EVENT_FLAG_COPY);
+  size_t alloc_size = 0;
+  if (copy) {
+    alloc_size += GetAllocLength(name);
+    for (i = 0; i < num_args; ++i) {
+      alloc_size += GetAllocLength(arg_names_[i]);
+      if (arg_types_[i] == TRACE_VALUE_TYPE_STRING)
+        arg_types_[i] = TRACE_VALUE_TYPE_COPY_STRING;
+    }
+  }
+
+  bool arg_is_copy[kTraceMaxNumArgs];
+  for (i = 0; i < num_args; ++i) {
+    // No copying of convertable types, we retain ownership.
+    if (arg_types_[i] == TRACE_VALUE_TYPE_CONVERTABLE)
+      continue;
+
+    // We only take a copy of arg_vals if they are of type COPY_STRING.
+ arg_is_copy[i] = (arg_types_[i] == TRACE_VALUE_TYPE_COPY_STRING); + if (arg_is_copy[i]) + alloc_size += GetAllocLength(arg_values_[i].as_string); + } + + if (alloc_size) { + parameter_copy_storage_ = new RefCountedString; + parameter_copy_storage_->data().resize(alloc_size); + char* ptr = string_as_array(¶meter_copy_storage_->data()); + const char* end = ptr + alloc_size; + if (copy) { + CopyTraceEventParameter(&ptr, &name_, end); + for (i = 0; i < num_args; ++i) { + CopyTraceEventParameter(&ptr, &arg_names_[i], end); + } + } + for (i = 0; i < num_args; ++i) { + if (arg_types_[i] == TRACE_VALUE_TYPE_CONVERTABLE) + continue; + if (arg_is_copy[i]) + CopyTraceEventParameter(&ptr, &arg_values_[i].as_string, end); + } + DCHECK_EQ(end, ptr) << "Overrun by " << ptr - end; + } +} + +void TraceEvent::Reset() { + // Only reset fields that won't be initialized in Initialize(), or that may + // hold references to other objects. + duration_ = -1;; + parameter_copy_storage_ = nullptr; + for (int i = 0; i < kTraceMaxNumArgs && arg_names_[i]; ++i) + convertable_values_[i] = nullptr; +} + +void TraceEvent::UpdateDuration(const MicrosecondsInt64& now, + const MicrosecondsInt64& thread_now) { + DCHECK(duration_ == -1); + duration_ = now - timestamp_; + thread_duration_ = thread_now - thread_timestamp_; +} + +namespace { +// Escape the given string using JSON rules. +void JsonEscape(StringPiece s, string* out) { + out->reserve(out->size() + s.size() * 2); + const char* p_end = s.data() + s.size(); + for (const char* p = s.data(); p != p_end; p++) { + // Only the following characters need to be escaped, according to json.org. + // In particular, it's illegal to escape the single-quote character, and + // JSON does not support the "\x" escape sequence like C/Java. 
+    switch (*p) {
+      case '"':
+      case '\\':
+        out->push_back('\\');
+        out->push_back(*p);
+        break;
+      case '\b':
+        out->append("\\b");
+        break;
+      case '\f':
+        out->append("\\f");
+        break;
+      case '\n':
+        out->append("\\n");
+        // BUG FIX: a missing 'break' here fell through into the '\r' case,
+        // so every newline was emitted as "\n\r" in the JSON output.
+        break;
+      case '\r':
+        out->append("\\r");
+        break;
+      case '\t':
+        out->append("\\t");
+        break;
+      default:
+        out->push_back(*p);
+    }
+  }
+}
+} // anonymous namespace
+
+// static
+void TraceEvent::AppendValueAsJSON(unsigned char type,
+                                   TraceEvent::TraceValue value,
+                                   std::string* out) {
+  switch (type) {
+    case TRACE_VALUE_TYPE_BOOL:
+      *out += value.as_bool ? "true" : "false";
+      break;
+    case TRACE_VALUE_TYPE_UINT:
+      SubstituteAndAppend(out, "$0", static_cast<uint64>(value.as_uint));
+      break;
+    case TRACE_VALUE_TYPE_INT:
+      SubstituteAndAppend(out, "$0", static_cast<int64>(value.as_int));
+      break;
+    case TRACE_VALUE_TYPE_DOUBLE: {
+      // FIXME: base/json/json_writer.cc is using the same code,
+      // should be made into a common method.
+      std::string real;
+      double val = value.as_double;
+      if (MathLimits<double>::IsFinite(val)) {
+        real = strings::Substitute("$0", val);
+        // Ensure that the number has a .0 if there's no decimal or 'e'. This
+        // makes sure that when we read the JSON back, it's interpreted as a
+        // real rather than an int.
+        if (real.find('.') == std::string::npos &&
+            real.find('e') == std::string::npos &&
+            real.find('E') == std::string::npos) {
+          real.append(".0");
+        }
+        // The JSON spec requires that non-integer values in the range (-1,1)
+        // have a zero before the decimal point - ".52" is not valid, "0.52" is.
+        if (real[0] == '.') {
+          real.insert(0, "0");
+        } else if (real.length() > 1 && real[0] == '-' && real[1] == '.') {
+          // "-.1" bad "-0.1" good
+          real.insert(1, "0");
+        }
+      } else if (MathLimits<double>::IsNaN(val)) {
+        // The JSON spec doesn't allow NaN and Infinity (since these are
+        // objects in EcmaScript). Use strings instead.
+ real = "\"NaN\""; + } else if (val < 0) { + real = "\"-Infinity\""; + } else { + real = "\"Infinity\""; + } + SubstituteAndAppend(out, "$0", real); + break; + } + case TRACE_VALUE_TYPE_POINTER: + // JSON only supports double and int numbers. + // So as not to lose bits from a 64-bit pointer, output as a hex string. + StringAppendF(out, "\"0x%" PRIx64 "\"", static_cast( + reinterpret_cast( + value.as_pointer))); + break; + case TRACE_VALUE_TYPE_STRING: + case TRACE_VALUE_TYPE_COPY_STRING: + *out += "\""; + JsonEscape(value.as_string ? value.as_string : "NULL", out); + *out += "\""; + break; + default: + LOG(FATAL) << "Don't know how to print this value"; + break; + } +} + +void TraceEvent::AppendAsJSON(std::string* out) const { + int64 time_int64 = timestamp_; + int process_id = TraceLog::GetInstance()->process_id(); + // Category group checked at category creation time. + DCHECK(!strchr(name_, '"')); + StringAppendF(out, + "{\"cat\":\"%s\",\"pid\":%i,\"tid\":%i,\"ts\":%" PRId64 "," + "\"ph\":\"%c\",\"name\":\"%s\",\"args\":{", + TraceLog::GetCategoryGroupName(category_group_enabled_), + process_id, + thread_id_, + time_int64, + phase_, + name_); + + // Output argument names and values, stop at first NULL argument name. + for (int i = 0; i < kTraceMaxNumArgs && arg_names_[i]; ++i) { + if (i > 0) + *out += ","; + *out += "\""; + *out += arg_names_[i]; + *out += "\":"; + + if (arg_types_[i] == TRACE_VALUE_TYPE_CONVERTABLE) + convertable_values_[i]->AppendAsTraceFormat(out); + else + AppendValueAsJSON(arg_types_[i], arg_values_[i], out); + } + *out += "}"; + + if (phase_ == TRACE_EVENT_PHASE_COMPLETE) { + int64 duration = duration_; + if (duration != -1) + StringAppendF(out, ",\"dur\":%" PRId64, duration); + if (thread_timestamp_ >= 0) { + int64 thread_duration = thread_duration_; + if (thread_duration != -1) + StringAppendF(out, ",\"tdur\":%" PRId64, thread_duration); + } + } + + // Output tts if thread_timestamp is valid. 
+ if (thread_timestamp_ >= 0) { + int64 thread_time_int64 = thread_timestamp_; + StringAppendF(out, ",\"tts\":%" PRId64, thread_time_int64); + } + + // If id_ is set, print it out as a hex string so we don't loose any + // bits (it might be a 64-bit pointer). + if (flags_ & TRACE_EVENT_FLAG_HAS_ID) + StringAppendF(out, ",\"id\":\"0x%" PRIx64 "\"", static_cast(id_)); + + // Instant events also output their scope. + if (phase_ == TRACE_EVENT_PHASE_INSTANT) { + char scope = '?'; + switch (flags_ & TRACE_EVENT_FLAG_SCOPE_MASK) { + case TRACE_EVENT_SCOPE_GLOBAL: + scope = TRACE_EVENT_SCOPE_NAME_GLOBAL; + break; + + case TRACE_EVENT_SCOPE_PROCESS: + scope = TRACE_EVENT_SCOPE_NAME_PROCESS; + break; + + case TRACE_EVENT_SCOPE_THREAD: + scope = TRACE_EVENT_SCOPE_NAME_THREAD; + break; + } + StringAppendF(out, ",\"s\":\"%c\"", scope); + } + + *out += "}"; +} + +void TraceEvent::AppendPrettyPrinted(std::ostringstream* out) const { + *out << name_ << "["; + *out << TraceLog::GetCategoryGroupName(category_group_enabled_); + *out << "]"; + if (arg_names_[0]) { + *out << ", {"; + for (int i = 0; i < kTraceMaxNumArgs && arg_names_[i]; ++i) { + if (i > 0) + *out << ", "; + *out << arg_names_[i] << ":"; + std::string value_as_text; + + if (arg_types_[i] == TRACE_VALUE_TYPE_CONVERTABLE) + convertable_values_[i]->AppendAsTraceFormat(&value_as_text); + else + AppendValueAsJSON(arg_types_[i], arg_values_[i], &value_as_text); + + *out << value_as_text; + } + *out << "}"; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +// TraceResultBuffer +// +//////////////////////////////////////////////////////////////////////////////// + +string TraceResultBuffer::FlushTraceLogToString() { + return DoFlush(false); +} + +string TraceResultBuffer::FlushTraceLogToStringButLeaveBufferIntact() { + return DoFlush(true); +} + +string TraceResultBuffer::DoFlush(bool leave_intact) { + TraceResultBuffer buf; + TraceLog* tl = TraceLog::GetInstance(); + if 
(leave_intact) { + tl->FlushButLeaveBufferIntact(Bind(&TraceResultBuffer::Collect, Unretained(&buf))); + } else { + tl->Flush(Bind(&TraceResultBuffer::Collect, Unretained(&buf))); + } + buf.json_.append("]}\n"); + return buf.json_; +} + +TraceResultBuffer::TraceResultBuffer() + : first_(true) { +} +TraceResultBuffer::~TraceResultBuffer() { +} + +void TraceResultBuffer::Collect( + const scoped_refptr& s, + bool has_more_events) { + if (first_) { + json_.append("{\"traceEvents\": [\n"); + first_ = false; + } else if (!s->data().empty()) { + // Sometimes we get sent an empty chunk at the end, + // and we don't want to end up with an extra trailing ',' + json_.append(",\n"); + } + json_.append(s->data()); +} + +//////////////////////////////////////////////////////////////////////////////// +// +// TraceSamplingThread +// +//////////////////////////////////////////////////////////////////////////////// +class TraceBucketData; +typedef Callback TraceSampleCallback; + +class TraceBucketData { + public: + TraceBucketData(AtomicWord* bucket, + const char* name, + TraceSampleCallback callback); + ~TraceBucketData(); + + TRACE_EVENT_API_ATOMIC_WORD* bucket; + const char* bucket_name; + TraceSampleCallback callback; +}; + +// This object must be created on the IO thread. +class TraceSamplingThread { + public: + TraceSamplingThread(); + virtual ~TraceSamplingThread(); + + void ThreadMain(); + + static void DefaultSamplingCallback(TraceBucketData* bucekt_data); + + void Stop(); + + private: + friend class TraceLog; + + void GetSamples(); + // Not thread-safe. Once the ThreadMain has been called, this can no longer + // be called. + void RegisterSampleBucket(TRACE_EVENT_API_ATOMIC_WORD* bucket, + const char* const name, + TraceSampleCallback callback); + // Splits a combined "category\0name" into the two component parts. 
+ static void ExtractCategoryAndName(const char* combined, + const char** category, + const char** name); + std::vector sample_buckets_; + bool thread_running_; + AtomicBool cancellation_flag_; +}; + + +TraceSamplingThread::TraceSamplingThread() + : thread_running_(false), + cancellation_flag_(false) { +} + +TraceSamplingThread::~TraceSamplingThread() { +} + +void TraceSamplingThread::ThreadMain() { + thread_running_ = true; + const MonoDelta sleepDelta = MonoDelta::FromMicroseconds(1000); + while (!cancellation_flag_.Load()) { + SleepFor(sleepDelta); + GetSamples(); + } +} + +// static +void TraceSamplingThread::DefaultSamplingCallback( + TraceBucketData* bucket_data) { + TRACE_EVENT_API_ATOMIC_WORD category_and_name = + TRACE_EVENT_API_ATOMIC_LOAD(*bucket_data->bucket); + if (!category_and_name) + return; + const char* const combined = + reinterpret_cast(category_and_name); + const char* category_group; + const char* name; + ExtractCategoryAndName(combined, &category_group, &name); + TRACE_EVENT_API_ADD_TRACE_EVENT(TRACE_EVENT_PHASE_SAMPLE, + TraceLog::GetCategoryGroupEnabled(category_group), + name, 0, 0, nullptr, nullptr, nullptr, nullptr, 0); +} + +void TraceSamplingThread::GetSamples() { + for (auto& sample_bucket : sample_buckets_) { + TraceBucketData* bucket_data = &sample_bucket; + bucket_data->callback.Run(bucket_data); + } +} + +void TraceSamplingThread::RegisterSampleBucket( + TRACE_EVENT_API_ATOMIC_WORD* bucket, + const char* const name, + TraceSampleCallback callback) { + // Access to sample_buckets_ doesn't cause races with the sampling thread + // that uses the sample_buckets_, because it is guaranteed that + // RegisterSampleBucket is called before the sampling thread is created. 
+ DCHECK(!thread_running_); + sample_buckets_.push_back(TraceBucketData(bucket, name, callback)); +} + +// static +void TraceSamplingThread::ExtractCategoryAndName(const char* combined, + const char** category, + const char** name) { + *category = combined; + *name = &combined[strlen(combined) + 1]; +} + +void TraceSamplingThread::Stop() { + cancellation_flag_.Store(true); +} + +TraceBucketData::TraceBucketData(AtomicWord* bucket, const char* name, + TraceSampleCallback callback) + : bucket(bucket), bucket_name(name), callback(std::move(callback)) {} + +TraceBucketData::~TraceBucketData() { +} + +//////////////////////////////////////////////////////////////////////////////// +// +// TraceLog +// +//////////////////////////////////////////////////////////////////////////////// + +class TraceLog::ThreadLocalEventBuffer { + public: + ThreadLocalEventBuffer(TraceLog* trace_log); + virtual ~ThreadLocalEventBuffer(); + + TraceEvent* AddTraceEvent(TraceEventHandle* handle); + + TraceEvent* GetEventByHandle(TraceEventHandle handle) { + if (!chunk_ || handle.chunk_seq != chunk_->seq() || + handle.chunk_index != chunk_index_) + return nullptr; + + return chunk_->GetEventAt(handle.event_index); + } + + int generation() const { return generation_; } + + void Flush(int64_t tid); + + private: + // Check that the current thread is the one that constructed this trace buffer. + void CheckIsOwnerThread() const { + DCHECK_EQ(kudu::Thread::UniqueThreadId(), owner_tid_); + } + + // Since TraceLog is a leaky singleton, trace_log_ will always be valid + // as long as the thread exists. + TraceLog* trace_log_; + gscoped_ptr chunk_; + size_t chunk_index_; + int generation_; + + // The TID of the thread that constructed this event buffer. Only this thread + // may add trace events. 
+ int64_t owner_tid_; + + DISALLOW_COPY_AND_ASSIGN(ThreadLocalEventBuffer); +}; + +TraceLog::ThreadLocalEventBuffer::ThreadLocalEventBuffer(TraceLog* trace_log) + : trace_log_(trace_log), + chunk_index_(0), + generation_(trace_log->generation()), + owner_tid_(kudu::Thread::UniqueThreadId()) { +} + +TraceLog::ThreadLocalEventBuffer::~ThreadLocalEventBuffer() { +} + +TraceEvent* TraceLog::ThreadLocalEventBuffer::AddTraceEvent( + TraceEventHandle* handle) { + CheckIsOwnerThread(); + + if (chunk_ && chunk_->IsFull()) { + SpinLockHolder lock(&trace_log_->lock_); + Flush(Thread::UniqueThreadId()); + chunk_.reset(); + } + if (!chunk_) { + SpinLockHolder lock(&trace_log_->lock_); + chunk_ = trace_log_->logged_events_->GetChunk(&chunk_index_); + trace_log_->CheckIfBufferIsFullWhileLocked(); + } + if (!chunk_) + return nullptr; + + size_t event_index; + TraceEvent* trace_event = chunk_->AddTraceEvent(&event_index); + if (trace_event && handle) + MakeHandle(chunk_->seq(), chunk_index_, event_index, handle); + + return trace_event; +} + +void TraceLog::ThreadLocalEventBuffer::Flush(int64_t tid) { + DCHECK(trace_log_->lock_.IsHeld()); + + if (!chunk_) + return; + + if (trace_log_->CheckGeneration(generation_)) { + // Return the chunk to the buffer only if the generation matches. 
+ trace_log_->logged_events_->ReturnChunk(chunk_index_, chunk_.Pass()); + } +} + +// static +TraceLog* TraceLog::GetInstance() { + return Singleton::get(); +} + +TraceLog::TraceLog() + : mode_(DISABLED), + num_traces_recorded_(0), + event_callback_(0), + dispatching_to_observer_list_(false), + process_sort_index_(0), + process_id_hash_(0), + process_id_(0), + time_offset_(0), + watch_category_(0), + trace_options_(RECORD_UNTIL_FULL), + sampling_thread_handle_(nullptr), + category_filter_(CategoryFilter::kDefaultCategoryFilterString), + event_callback_category_filter_( + CategoryFilter::kDefaultCategoryFilterString), + thread_shared_chunk_index_(0), + generation_(0) { + // Trace is enabled or disabled on one thread while other threads are + // accessing the enabled flag. We don't care whether edge-case events are + // traced or not, so we allow races on the enabled flag to keep the trace + // macros fast. + ANNOTATE_BENIGN_RACE_SIZED(g_category_group_enabled, + sizeof(g_category_group_enabled), + "trace_event category enabled"); + for (int i = 0; i < MAX_CATEGORY_GROUPS; ++i) { + ANNOTATE_BENIGN_RACE(&g_category_group_enabled[i], + "trace_event category enabled"); + } + SetProcessID(static_cast(getpid())); + + string filter = FLAGS_trace_to_console; + if (!filter.empty()) { + SetEnabled(CategoryFilter(filter), RECORDING_MODE, ECHO_TO_CONSOLE); + LOG(ERROR) << "Tracing to console with CategoryFilter '" << filter << "'."; + } + + logged_events_.reset(CreateTraceBuffer()); +} + +TraceLog::~TraceLog() { +} + +const unsigned char* TraceLog::GetCategoryGroupEnabled( + const char* category_group) { + TraceLog* tracelog = GetInstance(); + if (!tracelog) { + DCHECK(!g_category_group_enabled[g_category_already_shutdown]); + return &g_category_group_enabled[g_category_already_shutdown]; + } + return tracelog->GetCategoryGroupEnabledInternal(category_group); +} + +const char* TraceLog::GetCategoryGroupName( + const unsigned char* category_group_enabled) { + // Calculate the 
index of the category group by finding + // category_group_enabled in g_category_group_enabled array. + uintptr_t category_begin = + reinterpret_cast(g_category_group_enabled); + uintptr_t category_ptr = reinterpret_cast(category_group_enabled); + DCHECK(category_ptr >= category_begin && + category_ptr < reinterpret_cast( + g_category_group_enabled + MAX_CATEGORY_GROUPS)) << + "out of bounds category pointer"; + uintptr_t category_index = + (category_ptr - category_begin) / sizeof(g_category_group_enabled[0]); + return g_category_groups[category_index]; +} + +void TraceLog::UpdateCategoryGroupEnabledFlag(int category_index) { + unsigned char enabled_flag = 0; + const char* category_group = g_category_groups[category_index]; + if (mode_ == RECORDING_MODE && + category_filter_.IsCategoryGroupEnabled(category_group)) + enabled_flag |= ENABLED_FOR_RECORDING; + else if (mode_ == MONITORING_MODE && + category_filter_.IsCategoryGroupEnabled(category_group)) + enabled_flag |= ENABLED_FOR_MONITORING; + if (event_callback_ && + event_callback_category_filter_.IsCategoryGroupEnabled(category_group)) + enabled_flag |= ENABLED_FOR_EVENT_CALLBACK; + g_category_group_enabled[category_index] = enabled_flag; +} + +void TraceLog::UpdateCategoryGroupEnabledFlags() { + int category_index = base::subtle::NoBarrier_Load(&g_category_index); + for (int i = 0; i < category_index; i++) + UpdateCategoryGroupEnabledFlag(i); +} + +void TraceLog::UpdateSyntheticDelaysFromCategoryFilter() { + ResetTraceEventSyntheticDelays(); + const CategoryFilter::StringList& delays = + category_filter_.GetSyntheticDelayValues(); + CategoryFilter::StringList::const_iterator ci; + for (ci = delays.begin(); ci != delays.end(); ++ci) { + std::list tokens = strings::Split(*ci, ";"); + if (tokens.empty()) continue; + + TraceEventSyntheticDelay* delay = + TraceEventSyntheticDelay::Lookup(tokens.front()); + tokens.pop_front(); + while (!tokens.empty()) { + std::string token = tokens.front(); + tokens.pop_front(); + 
char* duration_end; + double target_duration = strtod(token.c_str(), &duration_end); + if (duration_end != token.c_str()) { + delay->SetTargetDuration(MonoDelta::FromSeconds(target_duration)); + } else if (token == "static") { + delay->SetMode(TraceEventSyntheticDelay::STATIC); + } else if (token == "oneshot") { + delay->SetMode(TraceEventSyntheticDelay::ONE_SHOT); + } else if (token == "alternating") { + delay->SetMode(TraceEventSyntheticDelay::ALTERNATING); + } + } + } +} + +const unsigned char* TraceLog::GetCategoryGroupEnabledInternal( + const char* category_group) { + DCHECK(!strchr(category_group, '"')) << + "Category groups may not contain double quote"; + // The g_category_groups is append only, avoid using a lock for the fast path. + int current_category_index = base::subtle::Acquire_Load(&g_category_index); + + // Search for pre-existing category group. + for (int i = 0; i < current_category_index; ++i) { + if (strcmp(g_category_groups[i], category_group) == 0) { + return &g_category_group_enabled[i]; + } + } + + unsigned char* category_group_enabled = nullptr; + // This is the slow path: the lock is not held in the case above, so more + // than one thread could have reached here trying to add the same category. + // Only hold to lock when actually appending a new category, and + // check the categories groups again. + SpinLockHolder lock(&lock_); + int category_index = base::subtle::Acquire_Load(&g_category_index); + for (int i = 0; i < category_index; ++i) { + if (strcmp(g_category_groups[i], category_group) == 0) { + return &g_category_group_enabled[i]; + } + } + + // Create a new category group. + DCHECK(category_index < MAX_CATEGORY_GROUPS) << + "must increase MAX_CATEGORY_GROUPS"; + if (category_index < MAX_CATEGORY_GROUPS) { + // Don't hold on to the category_group pointer, so that we can create + // category groups with strings not known at compile time (this is + // required by SetWatchEvent). 
+ const char* new_group = strdup(category_group); + // NOTE: new_group is leaked, but this is a small finite amount of data + g_category_groups[category_index] = new_group; + DCHECK(!g_category_group_enabled[category_index]); + // Note that if both included and excluded patterns in the + // CategoryFilter are empty, we exclude nothing, + // thereby enabling this category group. + UpdateCategoryGroupEnabledFlag(category_index); + category_group_enabled = &g_category_group_enabled[category_index]; + // Update the max index now. + base::subtle::Release_Store(&g_category_index, category_index + 1); + } else { + category_group_enabled = + &g_category_group_enabled[g_category_categories_exhausted]; + } + return category_group_enabled; +} + +void TraceLog::GetKnownCategoryGroups( + std::vector* category_groups) { + SpinLockHolder lock(&lock_); + int category_index = base::subtle::NoBarrier_Load(&g_category_index); + for (int i = g_num_builtin_categories; i < category_index; i++) + category_groups->push_back(g_category_groups[i]); +} + +void TraceLog::SetEnabled(const CategoryFilter& category_filter, + Mode mode, + Options options) { + std::vector observer_list; + { + SpinLockHolder lock(&lock_); + + // Can't enable tracing when Flush() is in progress. 
+ Options old_options = trace_options(); + + if (IsEnabled()) { + if (options != old_options) { + DLOG(ERROR) << "Attempting to re-enable tracing with a different " + << "set of options."; + } + + if (mode != mode_) { + DLOG(ERROR) << "Attempting to re-enable tracing with a different mode."; + } + + category_filter_.Merge(category_filter); + UpdateCategoryGroupEnabledFlags(); + return; + } + + if (dispatching_to_observer_list_) { + DLOG(ERROR) << + "Cannot manipulate TraceLog::Enabled state from an observer."; + return; + } + + mode_ = mode; + + if (options != old_options) { + base::subtle::NoBarrier_Store(&trace_options_, options); + UseNextTraceBuffer(); + } + + num_traces_recorded_++; + + category_filter_ = CategoryFilter(category_filter); + UpdateCategoryGroupEnabledFlags(); + UpdateSyntheticDelaysFromCategoryFilter(); + + if (options & ENABLE_SAMPLING) { + sampling_thread_.reset(new TraceSamplingThread); + sampling_thread_->RegisterSampleBucket( + &g_trace_state[0], + "bucket0", + Bind(&TraceSamplingThread::DefaultSamplingCallback)); + sampling_thread_->RegisterSampleBucket( + &g_trace_state[1], + "bucket1", + Bind(&TraceSamplingThread::DefaultSamplingCallback)); + sampling_thread_->RegisterSampleBucket( + &g_trace_state[2], + "bucket2", + Bind(&TraceSamplingThread::DefaultSamplingCallback)); + + Status s = Thread::Create("tracing", "sampler", + &TraceSamplingThread::ThreadMain, + sampling_thread_.get(), + &sampling_thread_handle_); + if (!s.ok()) { + LOG(DFATAL) << "failed to create trace sampling thread: " << s.ToString(); + } + } + + dispatching_to_observer_list_ = true; + observer_list = enabled_state_observer_list_; + } + // Notify observers outside the lock in case they trigger trace events. 
+ for (const auto& observer : observer_list) + observer->OnTraceLogEnabled(); + + { + SpinLockHolder lock(&lock_); + dispatching_to_observer_list_ = false; + } +} + +CategoryFilter TraceLog::GetCurrentCategoryFilter() { + SpinLockHolder lock(&lock_); + return category_filter_; +} + +void TraceLog::SetDisabled() { + SpinLockHolder lock(&lock_); + SetDisabledWhileLocked(); +} + +void TraceLog::SetDisabledWhileLocked() { + DCHECK(lock_.IsHeld()); + + if (!IsEnabled()) + return; + + if (dispatching_to_observer_list_) { + DLOG(ERROR) + << "Cannot manipulate TraceLog::Enabled state from an observer."; + return; + } + + mode_ = DISABLED; + + if (sampling_thread_.get()) { + // Stop the sampling thread. + sampling_thread_->Stop(); + lock_.Unlock(); + sampling_thread_handle_->Join(); + lock_.Lock(); + sampling_thread_handle_.reset(); + sampling_thread_.reset(); + } + + category_filter_.Clear(); + base::subtle::NoBarrier_Store(&watch_category_, 0); + watch_event_name_ = ""; + UpdateCategoryGroupEnabledFlags(); + AddMetadataEventsWhileLocked(); + + dispatching_to_observer_list_ = true; + std::vector observer_list = + enabled_state_observer_list_; + + { + // Dispatch to observers outside the lock in case the observer triggers a + // trace event. 
+ lock_.Unlock(); + for (const auto& observer : observer_list) + observer->OnTraceLogDisabled(); + lock_.Lock(); + } + dispatching_to_observer_list_ = false; +} + +int TraceLog::GetNumTracesRecorded() { + SpinLockHolder lock(&lock_); + if (!IsEnabled()) + return -1; + return num_traces_recorded_; +} + +void TraceLog::AddEnabledStateObserver(EnabledStateObserver* listener) { + enabled_state_observer_list_.push_back(listener); +} + +void TraceLog::RemoveEnabledStateObserver(EnabledStateObserver* listener) { + auto it = std::find(enabled_state_observer_list_.begin(), + enabled_state_observer_list_.end(), listener); + if (it != enabled_state_observer_list_.end()) + enabled_state_observer_list_.erase(it); +} + +bool TraceLog::HasEnabledStateObserver(EnabledStateObserver* listener) const { + auto it = std::find(enabled_state_observer_list_.begin(), + enabled_state_observer_list_.end(), listener); + return it != enabled_state_observer_list_.end(); +} + +float TraceLog::GetBufferPercentFull() const { + SpinLockHolder lock(&lock_); + return static_cast(static_cast(logged_events_->Size()) / + logged_events_->Capacity()); +} + +bool TraceLog::BufferIsFull() const { + SpinLockHolder lock(&lock_); + return logged_events_->IsFull(); +} + +TraceBuffer* TraceLog::CreateTraceBuffer() { + Options options = trace_options(); + if (options & RECORD_CONTINUOUSLY) + return new TraceBufferRingBuffer(kTraceEventRingBufferChunks); + else if ((options & ENABLE_SAMPLING) && mode_ == MONITORING_MODE) + return new TraceBufferRingBuffer(kMonitorTraceEventBufferChunks); + else if (options & ECHO_TO_CONSOLE) + return new TraceBufferRingBuffer(kEchoToConsoleTraceEventBufferChunks); + return new TraceBufferVector(); +} + +TraceEvent* TraceLog::AddEventToThreadSharedChunkWhileLocked( + TraceEventHandle* handle, bool check_buffer_is_full) { + DCHECK(lock_.IsHeld()); + + if (thread_shared_chunk_ && thread_shared_chunk_->IsFull()) { + logged_events_->ReturnChunk(thread_shared_chunk_index_, + 
thread_shared_chunk_.Pass()); + } + + if (!thread_shared_chunk_) { + thread_shared_chunk_ = logged_events_->GetChunk( + &thread_shared_chunk_index_); + if (check_buffer_is_full) + CheckIfBufferIsFullWhileLocked(); + } + if (!thread_shared_chunk_) + return nullptr; + + size_t event_index; + TraceEvent* trace_event = thread_shared_chunk_->AddTraceEvent(&event_index); + if (trace_event && handle) { + MakeHandle(thread_shared_chunk_->seq(), thread_shared_chunk_index_, + event_index, handle); + } + return trace_event; +} + +void TraceLog::CheckIfBufferIsFullWhileLocked() { + DCHECK(lock_.IsHeld()); + if (logged_events_->IsFull()) + SetDisabledWhileLocked(); +} + +void TraceLog::SetEventCallbackEnabled(const CategoryFilter& category_filter, + EventCallback cb) { + SpinLockHolder lock(&lock_); + base::subtle::NoBarrier_Store(&event_callback_, + reinterpret_cast(cb)); + event_callback_category_filter_ = category_filter; + UpdateCategoryGroupEnabledFlags(); +}; + +void TraceLog::SetEventCallbackDisabled() { + SpinLockHolder lock(&lock_); + base::subtle::NoBarrier_Store(&event_callback_, 0); + UpdateCategoryGroupEnabledFlags(); +} + +// Flush() works as the following: +// +// We ensure by taking the global lock that we have exactly one Flusher thread +// (the caller of this function) and some number of "target" threads. We do +// not want to block the target threads, since they are running application code, +// so this implementation takes an approach based on asymmetric synchronization. +// +// For each active thread, we grab its PerThreadInfo object, which may contain +// a pointer to its active trace chunk. We use an AtomicExchange to swap this +// out for a null pointer. This ensures that, on the *next* TRACE call made by +// that thread, it will see a NULL buffer and create a _new_ trace buffer. That +// new buffer would be assigned the generation of the next collection and we don't +// have to worry about it in the current Flush(). 
+// +// However, the swap doesn't ensure that the thread doesn't already have a local copy of +// the 'event_buffer_' that we are trying to flush. So, if the thread is in the +// middle of a Trace call, we have to wait until it exits. We do that by spinning +// on the 'is_in_trace_event_' member of that thread's thread-local structure. +// +// After we've swapped the buffer pointer and waited on the thread to exit any +// concurrent Trace() call, we know that no other thread can hold a pointer to +// the trace buffer, and we can safely flush it and delete it. +void TraceLog::Flush(const TraceLog::OutputCallback& cb) { + if (IsEnabled()) { + // Can't flush when tracing is enabled because otherwise PostTask would + // - generate more trace events; + // - deschedule the calling thread on some platforms causing inaccurate + // timing of the trace events. + scoped_refptr empty_result = new RefCountedString; + if (!cb.is_null()) + cb.Run(empty_result, false); + LOG(WARNING) << "Ignored TraceLog::Flush called when tracing is enabled"; + return; + } + + int generation = this->generation(); + { + // Holding the active threads lock ensures that no thread will exit and + // delete its own PerThreadInfo object. + MutexLock l(active_threads_lock_); + for (const ActiveThreadMap::value_type& entry : active_threads_) { + int64_t tid = entry.first; + PerThreadInfo* thr_info = entry.second; + + // Swap out their buffer from their thread-local data. + // After this, any _future_ trace calls on that thread will create a new buffer + // and not use the one we obtain here. + ThreadLocalEventBuffer* buf = thr_info->AtomicTakeBuffer(); + + // If this thread hasn't traced anything since our last + // flush, we can skip it. + if (!buf) { + continue; + } + + // The buffer may still be in use by that thread if they're in a call. Sleep until + // they aren't, so we can flush/delete their old buffer. 
+ // + // It's important that we do not hold 'lock_' here, because otherwise we can get a + // deadlock: a thread may be in the middle of a trace event (is_in_trace_event_ == + // true) and waiting to take lock_, while we are holding the lock and waiting for it + // to not be in the trace event. + while (base::subtle::Acquire_Load(&thr_info->is_in_trace_event_)) { + sched_yield(); + } + + { + SpinLockHolder lock(&lock_); + buf->Flush(tid); + } + delete buf; + } + } + + { + SpinLockHolder lock(&lock_); + + if (thread_shared_chunk_) { + logged_events_->ReturnChunk(thread_shared_chunk_index_, + thread_shared_chunk_.Pass()); + } + } + + FinishFlush(generation, cb); +} + +void TraceLog::ConvertTraceEventsToTraceFormat( + gscoped_ptr logged_events, + const TraceLog::OutputCallback& flush_output_callback) { + + if (flush_output_callback.is_null()) + return; + + // The callback need to be called at least once even if there is no events + // to let the caller know the completion of flush. + bool has_more_events = true; + do { + scoped_refptr json_events_str_ptr = + new RefCountedString(); + + for (size_t i = 0; i < kTraceEventBatchChunks; ++i) { + const TraceBufferChunk* chunk = logged_events->NextChunk(); + if (!chunk) { + has_more_events = false; + break; + } + for (size_t j = 0; j < chunk->size(); ++j) { + if (i > 0 || j > 0) + json_events_str_ptr->data().append(","); + chunk->GetEventAt(j)->AppendAsJSON(&(json_events_str_ptr->data())); + } + } + + flush_output_callback.Run(json_events_str_ptr, has_more_events); + } while (has_more_events); +} + +void TraceLog::FinishFlush(int generation, + const TraceLog::OutputCallback& flush_output_callback) { + gscoped_ptr previous_logged_events; + + if (!CheckGeneration(generation)) + return; + + { + SpinLockHolder lock(&lock_); + + previous_logged_events.swap(logged_events_); + UseNextTraceBuffer(); + } + + ConvertTraceEventsToTraceFormat(previous_logged_events.Pass(), + flush_output_callback); +} + +void 
TraceLog::FlushButLeaveBufferIntact( + const TraceLog::OutputCallback& flush_output_callback) { + gscoped_ptr previous_logged_events; + { + SpinLockHolder lock(&lock_); + if (mode_ == DISABLED || (trace_options_ & RECORD_CONTINUOUSLY) == 0) { + scoped_refptr empty_result = new RefCountedString; + flush_output_callback.Run(empty_result, false); + LOG(WARNING) << "Ignored TraceLog::FlushButLeaveBufferIntact when monitoring is not enabled"; + return; + } + + AddMetadataEventsWhileLocked(); + if (thread_shared_chunk_) { + // Return the chunk to the main buffer to flush the sampling data. + logged_events_->ReturnChunk(thread_shared_chunk_index_, + thread_shared_chunk_.Pass()); + } + previous_logged_events = logged_events_->CloneForIteration().Pass(); + } + + ConvertTraceEventsToTraceFormat(previous_logged_events.Pass(), + flush_output_callback); +} + +void TraceLog::UseNextTraceBuffer() { + logged_events_.reset(CreateTraceBuffer()); + base::subtle::NoBarrier_AtomicIncrement(&generation_, 1); + thread_shared_chunk_.reset(); + thread_shared_chunk_index_ = 0; +} + +TraceEventHandle TraceLog::AddTraceEvent( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int num_args, + const char** arg_names, + const unsigned char* arg_types, + const uint64_t* arg_values, + const scoped_refptr* convertable_values, + unsigned char flags) { + int thread_id = static_cast(kudu::Thread::UniqueThreadId()); + MicrosecondsInt64 now = GetMonoTimeMicros(); + return AddTraceEventWithThreadIdAndTimestamp(phase, category_group_enabled, + name, id, thread_id, now, + num_args, arg_names, + arg_types, arg_values, + convertable_values, flags); +} + +TraceLog::PerThreadInfo* TraceLog::SetupThreadLocalBuffer() { + int64_t cur_tid = Thread::UniqueThreadId(); + + auto thr_info = new PerThreadInfo(); + thr_info->event_buffer_ = nullptr; + thr_info->is_in_trace_event_ = 0; + thread_local_info_ = thr_info; + + Thread* t = Thread::current_thread(); + if (t) { + 
t->CallAtExit(Bind(&TraceLog::ThreadExiting, Unretained(this))); + } + + { + MutexLock lock(active_threads_lock_); + InsertOrDie(&active_threads_, cur_tid, thr_info); + } + return thr_info; +} + +void TraceLog::ThreadExiting() { + PerThreadInfo* thr_info = thread_local_info_; + if (!thr_info) { + return; + } + + int64_t cur_tid = Thread::UniqueThreadId(); + + // Flush our own buffer back to the central event buffer. + // We do the atomic exchange because a flusher thread may + // also be trying to flush us at the same time, and we need to avoid + // conflict. + ThreadLocalEventBuffer* buf = thr_info->AtomicTakeBuffer(); + if (buf) { + SpinLockHolder lock(&lock_); + buf->Flush(Thread::UniqueThreadId()); + } + delete buf; + + { + MutexLock lock(active_threads_lock_); + active_threads_.erase(cur_tid); + } + delete thr_info; +} + +TraceEventHandle TraceLog::AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + int num_args, + const char** arg_names, + const unsigned char* arg_types, + const uint64_t* arg_values, + const scoped_refptr* convertable_values, + unsigned char flags) { + TraceEventHandle handle = { 0, 0, 0 }; + if (!*category_group_enabled) + return handle; + + DCHECK(name); + + if (flags & TRACE_EVENT_FLAG_MANGLE_ID) + id ^= process_id_hash_; + + MicrosecondsInt64 now = OffsetTimestamp(timestamp); + MicrosecondsInt64 thread_now = GetThreadCpuTimeMicros(); + + PerThreadInfo* thr_info = thread_local_info_; + if (PREDICT_FALSE(!thr_info)) { + thr_info = SetupThreadLocalBuffer(); + } + + // Avoid re-entrance of AddTraceEvent. This may happen in GPU process when + // ECHO_TO_CONSOLE is enabled: AddTraceEvent -> LOG(ERROR) -> + // GpuProcessLogMessageHandler -> PostPendingTask -> TRACE_EVENT ... 
+ if (base::subtle::NoBarrier_Load(&thr_info->is_in_trace_event_)) + return handle; + + MarkFlagInScope thread_is_in_trace_event(&thr_info->is_in_trace_event_); + + ThreadLocalEventBuffer* thread_local_event_buffer = + reinterpret_cast( + base::subtle::NoBarrier_Load( + reinterpret_cast(&thr_info->event_buffer_))); + + // If we have an event buffer, but it's a left-over from a previous trace, + // delete it. + if (PREDICT_FALSE(thread_local_event_buffer && + !CheckGeneration(thread_local_event_buffer->generation()))) { + // We might also race against a flusher thread, so we have to atomically + // take the buffer. + thread_local_event_buffer = thr_info->AtomicTakeBuffer(); + delete thread_local_event_buffer; + thread_local_event_buffer = nullptr; + } + + // If there is no current buffer, create one for this event. + if (PREDICT_FALSE(!thread_local_event_buffer)) { + thread_local_event_buffer = new ThreadLocalEventBuffer(this); + + base::subtle::NoBarrier_Store( + reinterpret_cast(&thr_info->event_buffer_), + reinterpret_cast(thread_local_event_buffer)); + } + + // Check and update the current thread name only if the event is for the + // current thread to avoid locks in most cases. + if (thread_id == static_cast(Thread::UniqueThreadId())) { + Thread* kudu_thr = Thread::current_thread(); + if (kudu_thr) { + const char* new_name = kudu_thr->name().c_str(); + // Check if the thread name has been set or changed since the previous + // call (if any), but don't bother if the new name is empty. Note this will + // not detect a thread name change within the same char* buffer address: we + // favor common case performance over corner case correctness. + if (PREDICT_FALSE(new_name != g_current_thread_name && + new_name && *new_name)) { + g_current_thread_name = new_name; + + SpinLockHolder thread_info_lock(&thread_info_lock_); + + auto existing_name = thread_names_.find(thread_id); + if (existing_name == thread_names_.end()) { + // This is a new thread id, and a new name. 
+ thread_names_[thread_id] = new_name; + } else { + // This is a thread id that we've seen before, but potentially with a + // new name. + std::vector existing_names = strings::Split(existing_name->second, ","); + bool found = std::find(existing_names.begin(), + existing_names.end(), + new_name) != existing_names.end(); + if (!found) { + if (existing_names.size()) + existing_name->second.push_back(','); + existing_name->second.append(new_name); + } + } + } + } + } + + std::string console_message; + if (*category_group_enabled & + (ENABLED_FOR_RECORDING | ENABLED_FOR_MONITORING)) { + TraceEvent* trace_event = thread_local_event_buffer->AddTraceEvent(&handle); + + if (trace_event) { + trace_event->Initialize(thread_id, now, thread_now, phase, + category_group_enabled, name, id, + num_args, arg_names, arg_types, arg_values, + convertable_values, flags); + +#if defined(OS_ANDROID) + trace_event->SendToATrace(); +#endif + } + + if (trace_options() & ECHO_TO_CONSOLE) { + console_message = EventToConsoleMessage( + phase == TRACE_EVENT_PHASE_COMPLETE ? TRACE_EVENT_PHASE_BEGIN : phase, + timestamp, trace_event); + } + } + + if (PREDICT_FALSE(console_message.size())) + LOG(ERROR) << console_message; + + if (PREDICT_FALSE(reinterpret_cast( + base::subtle::NoBarrier_Load(&watch_category_)) == category_group_enabled)) { + bool event_name_matches; + WatchEventCallback watch_event_callback_copy; + { + SpinLockHolder lock(&lock_); + event_name_matches = watch_event_name_ == name; + watch_event_callback_copy = watch_event_callback_; + } + if (event_name_matches) { + if (!watch_event_callback_copy.is_null()) + watch_event_callback_copy.Run(); + } + } + + if (PREDICT_FALSE(*category_group_enabled & ENABLED_FOR_EVENT_CALLBACK)) { + EventCallback event_callback = reinterpret_cast( + base::subtle::NoBarrier_Load(&event_callback_)); + if (event_callback) { + event_callback(now, + phase == TRACE_EVENT_PHASE_COMPLETE ? 
+ TRACE_EVENT_PHASE_BEGIN : phase, + category_group_enabled, name, id, + num_args, arg_names, arg_types, arg_values, + flags); + } + } + + return handle; +} + +// May be called when a COMPELETE event ends and the unfinished event has been +// recycled (phase == TRACE_EVENT_PHASE_END and trace_event == NULL). +std::string TraceLog::EventToConsoleMessage(unsigned char phase, + const MicrosecondsInt64& timestamp, + TraceEvent* trace_event) { + SpinLockHolder thread_info_lock(&thread_info_lock_); + + // The caller should translate TRACE_EVENT_PHASE_COMPLETE to + // TRACE_EVENT_PHASE_BEGIN or TRACE_EVENT_END. + DCHECK(phase != TRACE_EVENT_PHASE_COMPLETE); + + MicrosecondsInt64 duration; + int thread_id = trace_event ? + trace_event->thread_id() : Thread::UniqueThreadId(); + if (phase == TRACE_EVENT_PHASE_END) { + duration = timestamp - thread_event_start_times_[thread_id].top(); + thread_event_start_times_[thread_id].pop(); + } + + std::string thread_name = thread_names_[thread_id]; + if (thread_colors_.find(thread_name) == thread_colors_.end()) + thread_colors_[thread_name] = (thread_colors_.size() % 6) + 1; + + std::ostringstream log; + log << StringPrintf("%s: \x1b[0;3%dm", + thread_name.c_str(), + thread_colors_[thread_name]); + + size_t depth = 0; + if (thread_event_start_times_.find(thread_id) != + thread_event_start_times_.end()) + depth = thread_event_start_times_[thread_id].size(); + + for (size_t i = 0; i < depth; ++i) + log << "| "; + + if (trace_event) + trace_event->AppendPrettyPrinted(&log); + if (phase == TRACE_EVENT_PHASE_END) + log << StringPrintf(" (%.3f ms)", duration / 1000.0f); + + log << "\x1b[0;m"; + + if (phase == TRACE_EVENT_PHASE_BEGIN) + thread_event_start_times_[thread_id].push(timestamp); + + return log.str(); +} + +void TraceLog::AddTraceEventEtw(char phase, + const char* name, + const void* id, + const char* extra) { +#if defined(OS_WIN) + TraceEventETWProvider::Trace(name, phase, id, extra); +#endif + INTERNAL_TRACE_EVENT_ADD(phase, "ETW 
Trace Event", name, + TRACE_EVENT_FLAG_COPY, "id", id, "extra", extra); +} + +void TraceLog::AddTraceEventEtw(char phase, + const char* name, + const void* id, + const std::string& extra) { +#if defined(OS_WIN) + TraceEventETWProvider::Trace(name, phase, id, extra); +#endif + INTERNAL_TRACE_EVENT_ADD(phase, "ETW Trace Event", name, + TRACE_EVENT_FLAG_COPY, "id", id, "extra", extra); +} + +void TraceLog::UpdateTraceEventDuration( + const unsigned char* category_group_enabled, + const char* name, + TraceEventHandle handle) { + + PerThreadInfo* thr_info = thread_local_info_; + if (!thr_info) { + thr_info = SetupThreadLocalBuffer(); + } + + // Avoid re-entrance of AddTraceEvent. This may happen in GPU process when + // ECHO_TO_CONSOLE is enabled: AddTraceEvent -> LOG(ERROR) -> + // GpuProcessLogMessageHandler -> PostPendingTask -> TRACE_EVENT ... + if (base::subtle::NoBarrier_Load(&thr_info->is_in_trace_event_)) + return; + MarkFlagInScope thread_is_in_trace_event(&thr_info->is_in_trace_event_); + + MicrosecondsInt64 thread_now = GetThreadCpuTimeMicros(); + MicrosecondsInt64 now = OffsetNow(); + + std::string console_message; + if (*category_group_enabled & ENABLED_FOR_RECORDING) { + OptionalAutoLock lock(lock_); + + TraceEvent* trace_event = GetEventByHandleInternal(handle, &lock); + if (trace_event) { + DCHECK(trace_event->phase() == TRACE_EVENT_PHASE_COMPLETE); + trace_event->UpdateDuration(now, thread_now); +#if defined(OS_ANDROID) + trace_event->SendToATrace(); +#endif + } + + if (trace_options() & ECHO_TO_CONSOLE) { + console_message = EventToConsoleMessage(TRACE_EVENT_PHASE_END, + now, trace_event); + } + } + + if (console_message.size()) + LOG(ERROR) << console_message; + + if (*category_group_enabled & ENABLED_FOR_EVENT_CALLBACK) { + EventCallback event_callback = reinterpret_cast( + base::subtle::NoBarrier_Load(&event_callback_)); + if (event_callback) { + event_callback(now, TRACE_EVENT_PHASE_END, category_group_enabled, name, + 
                     trace_event_internal::kNoEventId, 0, nullptr, nullptr, nullptr,
+                     TRACE_EVENT_FLAG_NONE);
+    }
+  }
+}
+
+void TraceLog::SetWatchEvent(const std::string& category_name,
+                             const std::string& event_name,
+                             const WatchEventCallback& callback) {
+  const unsigned char* category = GetCategoryGroupEnabled(
+      category_name.c_str());
+  SpinLockHolder lock(&lock_);
+  base::subtle::NoBarrier_Store(&watch_category_,
+                                reinterpret_cast<AtomicWord>(category));
+  watch_event_name_ = event_name;
+  watch_event_callback_ = callback;
+}
+
+void TraceLog::CancelWatchEvent() {
+  SpinLockHolder lock(&lock_);
+  base::subtle::NoBarrier_Store(&watch_category_, 0);
+  watch_event_name_ = "";
+  watch_event_callback_.Reset();
+}
+
+void TraceLog::AddMetadataEventsWhileLocked() {
+  DCHECK(lock_.IsHeld());
+
+#if !defined(OS_NACL)  // NaCl shouldn't expose the process id.
+  InitializeMetadataEvent(AddEventToThreadSharedChunkWhileLocked(nullptr, false),
+                          0,
+                          "num_cpus", "number",
+                          base::NumCPUs());
+#endif
+
+
+  int current_thread_id = static_cast<int>(kudu::Thread::UniqueThreadId());
+  if (process_sort_index_ != 0) {
+    InitializeMetadataEvent(AddEventToThreadSharedChunkWhileLocked(nullptr, false),
+                            current_thread_id,
+                            "process_sort_index", "sort_index",
+                            process_sort_index_);
+  }
+
+  if (process_name_.size()) {
+    InitializeMetadataEvent(AddEventToThreadSharedChunkWhileLocked(nullptr, false),
+                            current_thread_id,
+                            "process_name", "name",
+                            process_name_);
+  }
+
+  if (process_labels_.size() > 0) {
+    std::vector<std::string> labels;
+    for(auto& label : process_labels_) {
+      labels.push_back(label.second);
+    }
+    InitializeMetadataEvent(AddEventToThreadSharedChunkWhileLocked(nullptr, false),
+                            current_thread_id,
+                            "process_labels", "labels",
+                            JoinStrings(labels, ","));
+  }
+
+  // Thread sort indices.
+ for(auto& sort_index : thread_sort_indices_) { + if (sort_index.second == 0) + continue; + InitializeMetadataEvent(AddEventToThreadSharedChunkWhileLocked(nullptr, false), + sort_index.first, + "thread_sort_index", "sort_index", + sort_index.second); + } + + // Thread names. + SpinLockHolder thread_info_lock(&thread_info_lock_); + for(auto& name : thread_names_) { + if (name.second.empty()) + continue; + InitializeMetadataEvent(AddEventToThreadSharedChunkWhileLocked(nullptr, false), + name.first, + "thread_name", "name", + name.second); + } +} + + +TraceEvent* TraceLog::GetEventByHandle(TraceEventHandle handle) { + return GetEventByHandleInternal(handle, nullptr); +} + +TraceEvent* TraceLog::GetEventByHandleInternal(TraceEventHandle handle, + OptionalAutoLock* lock) { + TraceLog::PerThreadInfo* thr_info = TraceLog::thread_local_info_; + + if (!handle.chunk_seq) + return nullptr; + + if (thr_info) { + ThreadLocalEventBuffer* buf = + reinterpret_cast( + base::subtle::NoBarrier_Load( + reinterpret_cast(&thr_info->event_buffer_))); + + if (buf) { + DCHECK_EQ(1, ANNOTATE_UNPROTECTED_READ(thr_info->is_in_trace_event_)); + + TraceEvent* trace_event = buf->GetEventByHandle(handle); + if (trace_event) + return trace_event; + } + } + + // The event has been out-of-control of the thread local buffer. + // Try to get the event from the main buffer with a lock. + if (lock) + lock->EnsureAcquired(); + + if (thread_shared_chunk_ && + handle.chunk_index == thread_shared_chunk_index_) { + return handle.chunk_seq == thread_shared_chunk_->seq() ? + thread_shared_chunk_->GetEventAt(handle.event_index) : nullptr; + } + + return logged_events_->GetEventByHandle(handle); +} + +void TraceLog::SetProcessID(int process_id) { + process_id_ = process_id; + // Create a FNV hash from the process ID for XORing. + // See http://isthe.com/chongo/tech/comp/fnv/ for algorithm details. 
+  uint64_t offset_basis = 14695981039346656037ull;
+  uint64_t fnv_prime = 1099511628211ull;
+  uint64_t pid = static_cast<uint64_t>(process_id_);
+  process_id_hash_ = (offset_basis ^ pid) * fnv_prime;
+}
+
+void TraceLog::SetProcessSortIndex(int sort_index) {
+  SpinLockHolder lock(&lock_);
+  process_sort_index_ = sort_index;
+}
+
+void TraceLog::SetProcessName(const std::string& process_name) {
+  SpinLockHolder lock(&lock_);
+  process_name_ = process_name;
+}
+
+void TraceLog::UpdateProcessLabel(
+    int label_id, const std::string& current_label) {
+  if(!current_label.length())
+    return RemoveProcessLabel(label_id);
+
+  SpinLockHolder lock(&lock_);
+  process_labels_[label_id] = current_label;
+}
+
+void TraceLog::RemoveProcessLabel(int label_id) {
+  SpinLockHolder lock(&lock_);
+  auto it = process_labels_.find(label_id);
+  if (it == process_labels_.end())
+    return;
+
+  process_labels_.erase(it);
+}
+
+void TraceLog::SetThreadSortIndex(int64_t thread_id, int sort_index) {
+  SpinLockHolder lock(&lock_);
+  thread_sort_indices_[static_cast<int>(thread_id)] = sort_index;
+}
+
+void TraceLog::SetTimeOffset(MicrosecondsInt64 offset) {
+  time_offset_ = offset;
+}
+
+size_t TraceLog::GetObserverCountForTest() const {
+  return enabled_state_observer_list_.size();
+}
+
+bool CategoryFilter::IsEmptyOrContainsLeadingOrTrailingWhitespace(
+    const std::string& str) {
+  return str.empty() ||
+         str.at(0) == ' ' ||
+         str.at(str.length() - 1) == ' ';
+}
+
+bool CategoryFilter::DoesCategoryGroupContainCategory(
+    const char* category_group,
+    const char* category) const {
+  DCHECK(category);
+  vector<string> pieces = strings::Split(category_group, ",");
+  for (const string& category_group_token : pieces) {
+    // Don't allow empty tokens, nor tokens with leading or trailing space.
+ DCHECK(!CategoryFilter::IsEmptyOrContainsLeadingOrTrailingWhitespace( + category_group_token)) + << "Disallowed category string"; + + if (MatchPattern(category_group_token.c_str(), category)) + return true; + } + return false; +} + +CategoryFilter::CategoryFilter(const std::string& filter_string) { + if (!filter_string.empty()) + Initialize(filter_string); + else + Initialize(CategoryFilter::kDefaultCategoryFilterString); +} + +CategoryFilter::CategoryFilter(const CategoryFilter& cf) + : included_(cf.included_), + disabled_(cf.disabled_), + excluded_(cf.excluded_), + delays_(cf.delays_) { +} + +CategoryFilter::~CategoryFilter() { +} + +CategoryFilter& CategoryFilter::operator=(const CategoryFilter& rhs) { + if (this == &rhs) + return *this; + + included_ = rhs.included_; + disabled_ = rhs.disabled_; + excluded_ = rhs.excluded_; + delays_ = rhs.delays_; + return *this; +} + +void CategoryFilter::Initialize(const std::string& filter_string) { + // Tokenize list of categories, delimited by ','. + vector tokens = strings::Split(filter_string, ","); + // Add each token to the appropriate list (included_,excluded_). + for (string category : tokens) { + // Ignore empty categories. + if (category.empty()) + continue; + // Synthetic delays are of the form 'DELAY(delay;option;option;...)'. + if (category.find(kSyntheticDelayCategoryFilterPrefix) == 0 && + category.at(category.size() - 1) == ')') { + category = category.substr( + strlen(kSyntheticDelayCategoryFilterPrefix), + category.size() - strlen(kSyntheticDelayCategoryFilterPrefix) - 1); + size_t name_length = category.find(';'); + if (name_length != std::string::npos && name_length > 0 && + name_length != category.size() - 1) { + delays_.push_back(category); + } + } else if (category.at(0) == '-') { + // Excluded categories start with '-'. + // Remove '-' from category string. 
+ category = category.substr(1); + excluded_.push_back(category); + } else if (category.compare(0, strlen(TRACE_DISABLED_BY_DEFAULT("")), + TRACE_DISABLED_BY_DEFAULT("")) == 0) { + disabled_.push_back(category); + } else { + included_.push_back(category); + } + } +} + +void CategoryFilter::WriteString(const StringList& values, + std::string* out, + bool included) const { + bool prepend_comma = !out->empty(); + int token_cnt = 0; + for (const auto& value : values) { + if (token_cnt > 0 || prepend_comma) + StringAppendF(out, ","); + StringAppendF(out, "%s%s", (included ? "" : "-"), value.c_str()); + ++token_cnt; + } +} + +void CategoryFilter::WriteString(const StringList& delays, + std::string* out) const { + bool prepend_comma = !out->empty(); + int token_cnt = 0; + for (const auto& delay : delays) { + if (token_cnt > 0 || prepend_comma) + StringAppendF(out, ","); + StringAppendF(out, "%s%s)", kSyntheticDelayCategoryFilterPrefix, + delay.c_str()); + ++token_cnt; + } +} + +std::string CategoryFilter::ToString() const { + std::string filter_string; + WriteString(included_, &filter_string, true); + WriteString(disabled_, &filter_string, true); + WriteString(excluded_, &filter_string, false); + WriteString(delays_, &filter_string); + return filter_string; +} + +bool CategoryFilter::IsCategoryGroupEnabled( + const char* category_group_name) const { + // TraceLog should call this method only as part of enabling/disabling + // categories. + StringList::const_iterator ci; + + // Check the disabled- filters and the disabled-* wildcard first so that a + // "*" filter does not include the disabled. 
+ for (ci = disabled_.begin(); ci != disabled_.end(); ++ci) { + if (DoesCategoryGroupContainCategory(category_group_name, ci->c_str())) + return true; + } + if (DoesCategoryGroupContainCategory(category_group_name, + TRACE_DISABLED_BY_DEFAULT("*"))) + return false; + + for (ci = included_.begin(); ci != included_.end(); ++ci) { + if (DoesCategoryGroupContainCategory(category_group_name, ci->c_str())) + return true; + } + + for (ci = excluded_.begin(); ci != excluded_.end(); ++ci) { + if (DoesCategoryGroupContainCategory(category_group_name, ci->c_str())) + return false; + } + // If the category group is not excluded, and there are no included patterns + // we consider this pattern enabled. + return included_.empty(); +} + +bool CategoryFilter::HasIncludedPatterns() const { + return !included_.empty(); +} + +void CategoryFilter::Merge(const CategoryFilter& nested_filter) { + // Keep included patterns only if both filters have an included entry. + // Otherwise, one of the filter was specifying "*" and we want to honour the + // broadest filter. 
+ if (HasIncludedPatterns() && nested_filter.HasIncludedPatterns()) { + included_.insert(included_.end(), + nested_filter.included_.begin(), + nested_filter.included_.end()); + } else { + included_.clear(); + } + + disabled_.insert(disabled_.end(), + nested_filter.disabled_.begin(), + nested_filter.disabled_.end()); + excluded_.insert(excluded_.end(), + nested_filter.excluded_.begin(), + nested_filter.excluded_.end()); + delays_.insert(delays_.end(), + nested_filter.delays_.begin(), + nested_filter.delays_.end()); +} + +void CategoryFilter::Clear() { + included_.clear(); + disabled_.clear(); + excluded_.clear(); +} + +const CategoryFilter::StringList& + CategoryFilter::GetSyntheticDelayValues() const { + return delays_; +} + +} // namespace debug +} // namespace kudu + +namespace trace_event_internal { + +ScopedTraceBinaryEfficient::ScopedTraceBinaryEfficient( + const char* category_group, const char* name) { + // The single atom works because for now the category_group can only be "gpu". 
+ DCHECK(strcmp(category_group, "gpu") == 0); + static TRACE_EVENT_API_ATOMIC_WORD atomic = 0; + INTERNAL_TRACE_EVENT_GET_CATEGORY_INFO_CUSTOM_VARIABLES( + category_group, atomic, category_group_enabled_); + name_ = name; + if (*category_group_enabled_) { + event_handle_ = + TRACE_EVENT_API_ADD_TRACE_EVENT_WITH_THREAD_ID_AND_TIMESTAMP( + TRACE_EVENT_PHASE_COMPLETE, category_group_enabled_, name, + trace_event_internal::kNoEventId, + static_cast(kudu::Thread::UniqueThreadId()), + GetMonoTimeMicros(), + 0, nullptr, nullptr, nullptr, nullptr, TRACE_EVENT_FLAG_NONE); + } +} + +ScopedTraceBinaryEfficient::~ScopedTraceBinaryEfficient() { + if (*category_group_enabled_) { + TRACE_EVENT_API_UPDATE_TRACE_EVENT_DURATION(category_group_enabled_, + name_, event_handle_); + } +} + +} // namespace trace_event_internal diff --git a/src/kudu/util/debug/trace_event_impl.h b/src/kudu/util/debug/trace_event_impl.h new file mode 100644 index 000000000000..73d2251f0905 --- /dev/null +++ b/src/kudu/util/debug/trace_event_impl.h @@ -0,0 +1,717 @@ +// Copyright (c) 2012 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef KUDU_UTIL_DEBUG_TRACE_EVENT_IMPL_H_ +#define KUDU_UTIL_DEBUG_TRACE_EVENT_IMPL_H_ + +#include +#include +#include +#include +#include +#include + + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/walltime.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/ref_counted_memory.h" +#include "kudu/util/atomic.h" +#include "kudu/util/condition_variable.h" +#include "kudu/util/locks.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadlocal.h" + +// Older style trace macros with explicit id and extra data +// Only these macros result in publishing data to ETW as currently implemented. 
+#define TRACE_EVENT_BEGIN_ETW(name, id, extra) \ + base::debug::TraceLog::AddTraceEventEtw( \ + TRACE_EVENT_PHASE_BEGIN, \ + name, reinterpret_cast(id), extra) + +#define TRACE_EVENT_END_ETW(name, id, extra) \ + base::debug::TraceLog::AddTraceEventEtw( \ + TRACE_EVENT_PHASE_END, \ + name, reinterpret_cast(id), extra) + +#define TRACE_EVENT_INSTANT_ETW(name, id, extra) \ + base::debug::TraceLog::AddTraceEventEtw( \ + TRACE_EVENT_PHASE_INSTANT, \ + name, reinterpret_cast(id), extra) + +template +class Singleton; + +#if defined(COMPILER_GCC) +namespace BASE_HASH_NAMESPACE { +template <> +struct hash { + std::size_t operator()(kudu::Thread* value) const { + return reinterpret_cast(value); + } +}; +} // BASE_HASH_NAMESPACE +#endif + +namespace kudu { +namespace debug { + +// For any argument of type TRACE_VALUE_TYPE_CONVERTABLE the provided +// class must implement this interface. +class ConvertableToTraceFormat : public kudu::RefCountedThreadSafe { + public: + // Append the class info to the provided |out| string. The appended + // data must be a valid JSON object. Strings must be properly quoted, and + // escaped. There is no processing applied to the content after it is + // appended. + virtual void AppendAsTraceFormat(std::string* out) const = 0; + + protected: + virtual ~ConvertableToTraceFormat() {} + + private: + friend class kudu::RefCountedThreadSafe; +}; + +struct TraceEventHandle { + uint32 chunk_seq; + uint16 chunk_index; + uint16 event_index; +}; + +const int kTraceMaxNumArgs = 2; + +class BASE_EXPORT TraceEvent { + public: + union TraceValue { + bool as_bool; + uint64_t as_uint; + long long as_int; + double as_double; + const void* as_pointer; + const char* as_string; + }; + + TraceEvent(); + ~TraceEvent(); + + // We don't need to copy TraceEvent except when TraceEventBuffer is cloned. + // Use explicit copy method to avoid accidentally misuse of copy. 
+ void CopyFrom(const TraceEvent& other); + + void Initialize( + int thread_id, + MicrosecondsInt64 timestamp, + MicrosecondsInt64 thread_timestamp, + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int num_args, + const char** arg_names, + const unsigned char* arg_types, + const uint64_t* arg_values, + const scoped_refptr* convertable_values, + unsigned char flags); + + void Reset(); + + void UpdateDuration(const MicrosecondsInt64& now, const MicrosecondsInt64& thread_now); + + // Serialize event data to JSON + void AppendAsJSON(std::string* out) const; + void AppendPrettyPrinted(std::ostringstream* out) const; + + static void AppendValueAsJSON(unsigned char type, + TraceValue value, + std::string* out); + + MicrosecondsInt64 timestamp() const { return timestamp_; } + MicrosecondsInt64 thread_timestamp() const { return thread_timestamp_; } + char phase() const { return phase_; } + int thread_id() const { return thread_id_; } + MicrosecondsInt64 duration() const { return duration_; } + MicrosecondsInt64 thread_duration() const { return thread_duration_; } + uint64_t id() const { return id_; } + unsigned char flags() const { return flags_; } + + // Exposed for unittesting: + + const kudu::RefCountedString* parameter_copy_storage() const { + return parameter_copy_storage_.get(); + } + + const unsigned char* category_group_enabled() const { + return category_group_enabled_; + } + + const char* name() const { return name_; } + +#if defined(OS_ANDROID) + void SendToATrace(); +#endif + + private: + // Note: these are ordered by size (largest first) for optimal packing. + MicrosecondsInt64 timestamp_; + MicrosecondsInt64 thread_timestamp_; + MicrosecondsInt64 duration_; + MicrosecondsInt64 thread_duration_; + // id_ can be used to store phase-specific data. 
+ uint64_t id_; + TraceValue arg_values_[kTraceMaxNumArgs]; + const char* arg_names_[kTraceMaxNumArgs]; + scoped_refptr convertable_values_[kTraceMaxNumArgs]; + const unsigned char* category_group_enabled_; + const char* name_; + scoped_refptr parameter_copy_storage_; + int thread_id_; + char phase_; + unsigned char flags_; + unsigned char arg_types_[kTraceMaxNumArgs]; + + DISALLOW_COPY_AND_ASSIGN(TraceEvent); +}; + +// TraceBufferChunk is the basic unit of TraceBuffer. +class BASE_EXPORT TraceBufferChunk { + public: + TraceBufferChunk(uint32 seq) + : next_free_(0), + seq_(seq) { + } + + void Reset(uint32 new_seq); + TraceEvent* AddTraceEvent(size_t* event_index); + bool IsFull() const { return next_free_ == kTraceBufferChunkSize; } + + uint32 seq() const { return seq_; } + size_t capacity() const { return kTraceBufferChunkSize; } + size_t size() const { return next_free_; } + + TraceEvent* GetEventAt(size_t index) { + DCHECK(index < size()); + return &chunk_[index]; + } + const TraceEvent* GetEventAt(size_t index) const { + DCHECK(index < size()); + return &chunk_[index]; + } + + gscoped_ptr Clone() const; + + static const size_t kTraceBufferChunkSize = 64; + + private: + size_t next_free_; + TraceEvent chunk_[kTraceBufferChunkSize]; + uint32 seq_; +}; + +// TraceBuffer holds the events as they are collected. +class BASE_EXPORT TraceBuffer { + public: + virtual ~TraceBuffer() {} + + virtual gscoped_ptr GetChunk(size_t *index) = 0; + virtual void ReturnChunk(size_t index, + gscoped_ptr chunk) = 0; + + virtual bool IsFull() const = 0; + virtual size_t Size() const = 0; + virtual size_t Capacity() const = 0; + virtual TraceEvent* GetEventByHandle(TraceEventHandle handle) = 0; + + // For iteration. Each TraceBuffer can only be iterated once. + virtual const TraceBufferChunk* NextChunk() = 0; + + virtual gscoped_ptr CloneForIteration() const = 0; +}; + +// TraceResultBuffer collects and converts trace fragments returned by TraceLog +// to JSON output. 
+class TraceResultBuffer { + public: + static std::string FlushTraceLogToString(); + static std::string FlushTraceLogToStringButLeaveBufferIntact(); + + private: + TraceResultBuffer(); + ~TraceResultBuffer(); + + static std::string DoFlush(bool leave_intact); + + // Callback for TraceLog::Flush + void Collect(const scoped_refptr& s, + bool has_more_events); + + bool first_; + std::string json_; +}; + +class BASE_EXPORT CategoryFilter { + public: + typedef std::vector StringList; + + // The default category filter, used when none is provided. + // Allows all categories through, except if they end in the suffix 'Debug' or + // 'Test'. + static const char* kDefaultCategoryFilterString; + + // |filter_string| is a comma-delimited list of category wildcards. + // A category can have an optional '-' prefix to make it an excluded category. + // All the same rules apply above, so for example, having both included and + // excluded categories in the same list would not be supported. + // + // Example: CategoryFilter"test_MyTest*"); + // Example: CategoryFilter("test_MyTest*,test_OtherStuff"); + // Example: CategoryFilter("-excluded_category1,-excluded_category2"); + // Example: CategoryFilter("-*,webkit"); would disable everything but webkit. + // Example: CategoryFilter("-webkit"); would enable everything but webkit. + // + // Category filters can also be used to configure synthetic delays. + // + // Example: CategoryFilter("DELAY(gpu.PresentingFrame;16)"); would make swap + // buffers always take at least 16 ms. + // Example: CategoryFilter("DELAY(gpu.PresentingFrame;16;oneshot)"); would + // make swap buffers take at least 16 ms the first time it is + // called. + // Example: CategoryFilter("DELAY(gpu.PresentingFrame;16;alternating)"); + // would make swap buffers take at least 16 ms every other time it + // is called. 
+ explicit CategoryFilter(const std::string& filter_string); + + CategoryFilter(const CategoryFilter& cf); + + ~CategoryFilter(); + + CategoryFilter& operator=(const CategoryFilter& rhs); + + // Writes the string representation of the CategoryFilter. This is a comma + // separated string, similar in nature to the one used to determine + // enabled/disabled category patterns, except here there is an arbitrary + // order, included categories go first, then excluded categories. Excluded + // categories are distinguished from included categories by the prefix '-'. + std::string ToString() const; + + // Determines whether category group would be enabled or + // disabled by this category filter. + bool IsCategoryGroupEnabled(const char* category_group) const; + + // Return a list of the synthetic delays specified in this category filter. + const StringList& GetSyntheticDelayValues() const; + + // Merges nested_filter with the current CategoryFilter + void Merge(const CategoryFilter& nested_filter); + + // Clears both included/excluded pattern lists. This would be equivalent to + // creating a CategoryFilter with an empty string, through the constructor. + // i.e: CategoryFilter(""). + // + // When using an empty filter, all categories are considered included as we + // are not excluding anything. 
+ void Clear(); + + private: + FRIEND_TEST(TraceEventTestFixture, CategoryFilter); + + static bool IsEmptyOrContainsLeadingOrTrailingWhitespace( + const std::string& str); + + void Initialize(const std::string& filter_string); + void WriteString(const StringList& values, + std::string* out, + bool included) const; + void WriteString(const StringList& delays, std::string* out) const; + bool HasIncludedPatterns() const; + + bool DoesCategoryGroupContainCategory(const char* category_group, + const char* category) const; + + StringList included_; + StringList disabled_; + StringList excluded_; + StringList delays_; +}; + +class TraceSamplingThread; + +class BASE_EXPORT TraceLog { + public: + enum Mode { + DISABLED = 0, + RECORDING_MODE, + MONITORING_MODE, + }; + + // Options determines how the trace buffer stores data. + enum Options { + // Record until the trace buffer is full. + RECORD_UNTIL_FULL = 1 << 0, + + // Record until the user ends the trace. The trace buffer is a fixed size + // and we use it as a ring buffer during recording. + RECORD_CONTINUOUSLY = 1 << 1, + + // Enable the sampling profiler in the recording mode. + ENABLE_SAMPLING = 1 << 2, + + // Echo to console. Events are discarded. + ECHO_TO_CONSOLE = 1 << 3, + }; + + // The pointer returned from GetCategoryGroupEnabledInternal() points to a + // value with zero or more of the following bits. Used in this class only. + // The TRACE_EVENT macros should only use the value as a bool. + // These values must be in sync with macro values in TraceEvent.h in Blink. + enum CategoryGroupEnabledFlags { + // Category group enabled for the recording mode. + ENABLED_FOR_RECORDING = 1 << 0, + // Category group enabled for the monitoring mode. + ENABLED_FOR_MONITORING = 1 << 1, + // Category group enabled by SetEventCallbackEnabled(). + ENABLED_FOR_EVENT_CALLBACK = 1 << 2, + }; + + static TraceLog* GetInstance(); + + // Get set of known category groups. This can change as new code paths are + // reached. 
The known category groups are inserted into |category_groups|. + void GetKnownCategoryGroups(std::vector* category_groups); + + // Retrieves a copy (for thread-safety) of the current CategoryFilter. + CategoryFilter GetCurrentCategoryFilter(); + + Options trace_options() const { + return static_cast(base::subtle::NoBarrier_Load(&trace_options_)); + } + + // Enables normal tracing (recording trace events in the trace buffer). + // See CategoryFilter comments for details on how to control what categories + // will be traced. If tracing has already been enabled, |category_filter| will + // be merged into the current category filter. + void SetEnabled(const CategoryFilter& category_filter, + Mode mode, Options options); + + // Disables normal tracing for all categories. + void SetDisabled(); + + bool IsEnabled() { return mode_ != DISABLED; } + + // The number of times we have begun recording traces. If tracing is off, + // returns -1. If tracing is on, then it returns the number of times we have + // recorded a trace. By watching for this number to increment, you can + // passively discover when a new trace has begun. This is then used to + // implement the TRACE_EVENT_IS_NEW_TRACE() primitive. + int GetNumTracesRecorded(); + +#if defined(OS_ANDROID) + void StartATrace(); + void StopATrace(); + void AddClockSyncMetadataEvent(); +#endif + + // Enabled state listeners give a callback when tracing is enabled or + // disabled. This can be used to tie into other library's tracing systems + // on-demand. + class EnabledStateObserver { + public: + // Called just after the tracing system becomes enabled, outside of the + // |lock_|. TraceLog::IsEnabled() is true at this point. + virtual void OnTraceLogEnabled() = 0; + + // Called just after the tracing system disables, outside of the |lock_|. + // TraceLog::IsEnabled() is false at this point. 
+ virtual void OnTraceLogDisabled() = 0; + }; + void AddEnabledStateObserver(EnabledStateObserver* listener); + void RemoveEnabledStateObserver(EnabledStateObserver* listener); + bool HasEnabledStateObserver(EnabledStateObserver* listener) const; + + float GetBufferPercentFull() const; + bool BufferIsFull() const; + + // Not using kudu::Callback because of its limited by 7 parameters. + // Also, using primitive type allows directly passing callback from WebCore. + // WARNING: It is possible for the previously set callback to be called + // after a call to SetEventCallbackEnabled() that replaces or a call to + // SetEventCallbackDisabled() that disables the callback. + // This callback may be invoked on any thread. + // For TRACE_EVENT_PHASE_COMPLETE events, the client will still receive pairs + // of TRACE_EVENT_PHASE_BEGIN and TRACE_EVENT_PHASE_END events to keep the + // interface simple. + typedef void (*EventCallback)(MicrosecondsInt64 timestamp, + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int num_args, + const char* const arg_names[], + const unsigned char arg_types[], + const uint64_t arg_values[], + unsigned char flags); + + // Enable tracing for EventCallback. + void SetEventCallbackEnabled(const CategoryFilter& category_filter, + EventCallback cb); + void SetEventCallbackDisabled(); + + // Flush all collected events to the given output callback. The callback will + // be called one or more times synchronously from + // the current thread with IPC-bite-size chunks. The string format is + // undefined. Use TraceResultBuffer to convert one or more trace strings to + // JSON. The callback can be null if the caller doesn't want any data. + // Due to the implementation of thread-local buffers, flush can't be + // done when tracing is enabled. If called when tracing is enabled, the + // callback will be called directly with (empty_string, false) to indicate + // the end of this unsuccessful flush. 
+ typedef kudu::Callback&, + bool has_more_events)> OutputCallback; + void Flush(const OutputCallback& cb); + void FlushButLeaveBufferIntact(const OutputCallback& flush_output_callback); + + // Called by TRACE_EVENT* macros, don't call this directly. + // The name parameter is a category group for example: + // TRACE_EVENT0("renderer,webkit", "WebViewImpl::HandleInputEvent") + static const unsigned char* GetCategoryGroupEnabled(const char* name); + static const char* GetCategoryGroupName( + const unsigned char* category_group_enabled); + + // Called by TRACE_EVENT* macros, don't call this directly. + // If |copy| is set, |name|, |arg_name1| and |arg_name2| will be deep copied + // into the event; see "Memory scoping note" and TRACE_EVENT_COPY_XXX above. + TraceEventHandle AddTraceEvent( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int num_args, + const char** arg_names, + const unsigned char* arg_types, + const uint64_t* arg_values, + const scoped_refptr* convertable_values, + unsigned char flags); + TraceEventHandle AddTraceEventWithThreadIdAndTimestamp( + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int thread_id, + const MicrosecondsInt64& timestamp, + int num_args, + const char** arg_names, + const unsigned char* arg_types, + const uint64_t* arg_values, + const scoped_refptr* convertable_values, + unsigned char flags); + static void AddTraceEventEtw(char phase, + const char* category_group, + const void* id, + const char* extra); + static void AddTraceEventEtw(char phase, + const char* category_group, + const void* id, + const std::string& extra); + + void UpdateTraceEventDuration(const unsigned char* category_group_enabled, + const char* name, + TraceEventHandle handle); + + // For every matching event, the callback will be called. 
+ typedef kudu::Callback WatchEventCallback; + void SetWatchEvent(const std::string& category_name, + const std::string& event_name, + const WatchEventCallback& callback); + // Cancel the watch event. If tracing is enabled, this may race with the + // watch event notification firing. + void CancelWatchEvent(); + + int process_id() const { return process_id_; } + + // Allow tests to inspect TraceEvents. + size_t GetEventsSize() const { return logged_events_->Size(); } + TraceEvent* GetEventByHandle(TraceEventHandle handle); + + void SetProcessID(int process_id); + + // Process sort indices, if set, override the order of a process will appear + // relative to other processes in the trace viewer. Processes are sorted first + // on their sort index, ascending, then by their name, and then tid. + void SetProcessSortIndex(int sort_index); + + // Sets the name of the process. + void SetProcessName(const std::string& process_name); + + // Processes can have labels in addition to their names. Use labels, for + // instance, to list out the web page titles that a process is handling. + void UpdateProcessLabel(int label_id, const std::string& current_label); + void RemoveProcessLabel(int label_id); + + // Thread sort indices, if set, override the order of a thread will appear + // within its process in the trace viewer. Threads are sorted first on their + // sort index, ascending, then by their name, and then tid. + void SetThreadSortIndex(int64_t tid , int sort_index); + + // Allow setting an offset between the current MicrosecondsInt64 time and the time + // that should be reported. 
+ void SetTimeOffset(MicrosecondsInt64 offset); + + size_t GetObserverCountForTest() const; + + + private: + FRIEND_TEST(TraceEventTestFixture, + TraceBufferRingBufferGetReturnChunk); + FRIEND_TEST(TraceEventTestFixture, + TraceBufferRingBufferHalfIteration); + FRIEND_TEST(TraceEventTestFixture, + TraceBufferRingBufferFullIteration); + + // This allows constructor and destructor to be private and usable only + // by the Singleton class. + friend class Singleton; + + // Enable/disable each category group based on the current mode_, + // category_filter_, event_callback_ and event_callback_category_filter_. + // Enable the category group in the enabled mode if category_filter_ matches + // the category group, or event_callback_ is not null and + // event_callback_category_filter_ matches the category group. + void UpdateCategoryGroupEnabledFlags(); + void UpdateCategoryGroupEnabledFlag(int category_index); + + // Configure synthetic delays based on the values set in the current + // category filter. 
+ void UpdateSyntheticDelaysFromCategoryFilter(); + + struct PerThreadInfo; + class OptionalAutoLock; + class ThreadLocalEventBuffer; + + TraceLog(); + ~TraceLog(); + const unsigned char* GetCategoryGroupEnabledInternal(const char* name); + void AddMetadataEventsWhileLocked(); + + TraceBuffer* trace_buffer() const { return logged_events_.get(); } + TraceBuffer* CreateTraceBuffer(); + + std::string EventToConsoleMessage(unsigned char phase, + const MicrosecondsInt64& timestamp, + TraceEvent* trace_event); + + TraceEvent* AddEventToThreadSharedChunkWhileLocked(TraceEventHandle* handle, + bool check_buffer_is_full); + void CheckIfBufferIsFullWhileLocked(); + void SetDisabledWhileLocked(); + + TraceEvent* GetEventByHandleInternal(TraceEventHandle handle, + OptionalAutoLock* lock); + + void ConvertTraceEventsToTraceFormat(gscoped_ptr logged_events, + const OutputCallback& flush_output_callback); + void FinishFlush(int generation, + const OutputCallback& flush_output_callback); + + // Called when a thread which has registered trace events is about to exit. + void ThreadExiting(); + + int generation() const { + return static_cast(base::subtle::NoBarrier_Load(&generation_)); + } + bool CheckGeneration(int generation) const { + return generation == this->generation(); + } + void UseNextTraceBuffer(); + + MicrosecondsInt64 OffsetNow() const { + return OffsetTimestamp(GetMonoTimeMicros()); + } + MicrosecondsInt64 OffsetTimestamp(const MicrosecondsInt64& timestamp) const { + return timestamp - time_offset_; + } + + // Create a new PerThreadInfo object for the current thread, + // and register it in the active_threads_ list. + PerThreadInfo* SetupThreadLocalBuffer(); + + // This lock protects TraceLog member accesses (except for members protected + // by thread_info_lock_) from arbitrary threads. + mutable base::SpinLock lock_; + // This lock protects accesses to thread_names_, thread_event_start_times_ + // and thread_colors_. 
+ base::SpinLock thread_info_lock_; + int locked_line_; + Mode mode_; + int num_traces_recorded_; + gscoped_ptr logged_events_; + AtomicWord /* EventCallback */ event_callback_; + bool dispatching_to_observer_list_; + std::vector enabled_state_observer_list_; + + std::string process_name_; + std::unordered_map process_labels_; + int process_sort_index_; + std::unordered_map thread_sort_indices_; + std::unordered_map thread_names_; + + // The following two maps are used only when ECHO_TO_CONSOLE. + std::unordered_map > thread_event_start_times_; + std::unordered_map thread_colors_; + + // XORed with TraceID to make it unlikely to collide with other processes. + uint64_t process_id_hash_; + + int process_id_; + + MicrosecondsInt64 time_offset_; + + // Allow tests to wake up when certain events occur. + WatchEventCallback watch_event_callback_; + AtomicWord /* const unsigned char* */ watch_category_; + std::string watch_event_name_; + + AtomicWord /* Options */ trace_options_; + + // Sampling thread handles. + gscoped_ptr sampling_thread_; + scoped_refptr sampling_thread_handle_; + + CategoryFilter category_filter_; + CategoryFilter event_callback_category_filter_; + + struct PerThreadInfo { + ThreadLocalEventBuffer* event_buffer_; + base::subtle::Atomic32 is_in_trace_event_; + + // Atomically take the event_buffer_ member, setting it to NULL. + // Returns the old value of the member. + ThreadLocalEventBuffer* AtomicTakeBuffer(); + }; + static __thread PerThreadInfo* thread_local_info_; + + Mutex active_threads_lock_; + // Map of PID -> PerThreadInfo + // Protected by active_threads_lock_. + typedef std::unordered_map ActiveThreadMap; + ActiveThreadMap active_threads_; + + // For events which can't be added into the thread local buffer, e.g. events + // from threads without a message loop. 
+ gscoped_ptr thread_shared_chunk_; + size_t thread_shared_chunk_index_; + + // The generation is incremented whenever tracing is enabled, and incremented + // again when the buffers are flushed. This ensures that trace events logged + // for a previous tracing session do not get accidentally flushed in the + // next tracing session. + AtomicWord generation_; + + DISALLOW_COPY_AND_ASSIGN(TraceLog); +}; + +} // namespace debug +} // namespace kudu + +#endif // KUDU_UTIL_DEBUG_TRACE_EVENT_IMPL_H_ diff --git a/src/kudu/util/debug/trace_event_impl_constants.cc b/src/kudu/util/debug/trace_event_impl_constants.cc new file mode 100644 index 000000000000..bf45ed7b2021 --- /dev/null +++ b/src/kudu/util/debug/trace_event_impl_constants.cc @@ -0,0 +1,14 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/util/debug/trace_event_impl.h" + +namespace kudu { +namespace debug { + +// Enable everything but debug and test categories by default. +const char* CategoryFilter::kDefaultCategoryFilterString = "-*Debug,-*Test"; + +} // namespace debug +} // namespace kudu diff --git a/src/kudu/util/debug/trace_event_memory.h b/src/kudu/util/debug/trace_event_memory.h new file mode 100644 index 000000000000..6d9cf8d99190 --- /dev/null +++ b/src/kudu/util/debug/trace_event_memory.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_DEBUG_TRACE_EVENT_MEMORY_H +#define KUDU_DEBUG_TRACE_EVENT_MEMORY_H + +// Stub for this part of chromium tracing we haven't yet +// imported. +// The Chromium code relies on a locally patch tcmalloc. +// See 5bc71bae28ea03689dbf50fe6baa15b574319091 in the Chromium +// repository. + +#define INTERNAL_TRACE_MEMORY(category_group, name) + +#endif /* KUDU_DEBUG_TRACE_EVENT_MEMORY_H */ diff --git a/src/kudu/util/debug/trace_event_synthetic_delay.cc b/src/kudu/util/debug/trace_event_synthetic_delay.cc new file mode 100644 index 000000000000..4669234266d9 --- /dev/null +++ b/src/kudu/util/debug/trace_event_synthetic_delay.cc @@ -0,0 +1,232 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "kudu/gutil/singleton.h" +#include "kudu/util/debug/trace_event_synthetic_delay.h" + +namespace { +const int kMaxSyntheticDelays = 32; +} // namespace + +namespace kudu { +namespace debug { + +TraceEventSyntheticDelayClock::TraceEventSyntheticDelayClock() {} +TraceEventSyntheticDelayClock::~TraceEventSyntheticDelayClock() {} + +class TraceEventSyntheticDelayRegistry : public TraceEventSyntheticDelayClock { + public: + static TraceEventSyntheticDelayRegistry* GetInstance(); + + TraceEventSyntheticDelay* GetOrCreateDelay(const char* name); + void ResetAllDelays(); + + // TraceEventSyntheticDelayClock implementation. 
+ virtual MonoTime Now() OVERRIDE; + + private: + TraceEventSyntheticDelayRegistry(); + + friend class Singleton; + + Mutex lock_; + TraceEventSyntheticDelay delays_[kMaxSyntheticDelays]; + TraceEventSyntheticDelay dummy_delay_; + base::subtle::Atomic32 delay_count_; + + DISALLOW_COPY_AND_ASSIGN(TraceEventSyntheticDelayRegistry); +}; + +TraceEventSyntheticDelay::TraceEventSyntheticDelay() + : mode_(STATIC), begin_count_(0), trigger_count_(0), clock_(nullptr) {} + +TraceEventSyntheticDelay::~TraceEventSyntheticDelay() {} + +TraceEventSyntheticDelay* TraceEventSyntheticDelay::Lookup( + const std::string& name) { + return TraceEventSyntheticDelayRegistry::GetInstance()->GetOrCreateDelay( + name.c_str()); +} + +void TraceEventSyntheticDelay::Initialize( + const std::string& name, + TraceEventSyntheticDelayClock* clock) { + name_ = name; + clock_ = clock; +} + +void TraceEventSyntheticDelay::SetTargetDuration(const MonoDelta& target_duration) { + MutexLock lock(lock_); + target_duration_ = target_duration; + trigger_count_ = 0; + begin_count_ = 0; +} + +void TraceEventSyntheticDelay::SetMode(Mode mode) { + MutexLock lock(lock_); + mode_ = mode; +} + +void TraceEventSyntheticDelay::SetClock(TraceEventSyntheticDelayClock* clock) { + MutexLock lock(lock_); + clock_ = clock; +} + +void TraceEventSyntheticDelay::Begin() { + // Note that we check for a non-zero target duration without locking to keep + // things quick for the common case when delays are disabled. Since the delay + // calculation is done with a lock held, it will always be correct. The only + // downside of this is that we may fail to apply some delays when the target + // duration changes. 
+ ANNOTATE_BENIGN_RACE(&target_duration_, "Synthetic delay duration"); + if (!target_duration_.Initialized()) + return; + + MonoTime start_time = clock_->Now(); + { + MutexLock lock(lock_); + if (++begin_count_ != 1) + return; + end_time_ = CalculateEndTimeLocked(start_time); + } +} + +void TraceEventSyntheticDelay::BeginParallel(MonoTime* out_end_time) { + // See note in Begin(). + ANNOTATE_BENIGN_RACE(&target_duration_, "Synthetic delay duration"); + if (!target_duration_.Initialized()) { + *out_end_time = MonoTime(); + return; + } + + MonoTime start_time = clock_->Now(); + { + MutexLock lock(lock_); + *out_end_time = CalculateEndTimeLocked(start_time); + } +} + +void TraceEventSyntheticDelay::End() { + // See note in Begin(). + ANNOTATE_BENIGN_RACE(&target_duration_, "Synthetic delay duration"); + if (!target_duration_.Initialized()) + return; + + MonoTime end_time; + { + MutexLock lock(lock_); + if (!begin_count_ || --begin_count_ != 0) + return; + end_time = end_time_; + } + if (end_time.Initialized()) + ApplyDelay(end_time); +} + +void TraceEventSyntheticDelay::EndParallel(const MonoTime& end_time) { + if (end_time.Initialized()) + ApplyDelay(end_time); +} + +MonoTime TraceEventSyntheticDelay::CalculateEndTimeLocked( + const MonoTime& start_time) { + if (mode_ == ONE_SHOT && trigger_count_++) + return MonoTime(); + else if (mode_ == ALTERNATING && trigger_count_++ % 2) + return MonoTime(); + MonoTime end = start_time; + end.AddDelta(target_duration_); + return end; +} + +void TraceEventSyntheticDelay::ApplyDelay(const MonoTime& end_time) { + TRACE_EVENT0("synthetic_delay", name_.c_str()); + while (clock_->Now().ComesBefore(end_time)) { + // Busy loop. 
+ } +} + +TraceEventSyntheticDelayRegistry* +TraceEventSyntheticDelayRegistry::GetInstance() { + return Singleton::get(); +} + +TraceEventSyntheticDelayRegistry::TraceEventSyntheticDelayRegistry() + : delay_count_(0) {} + +TraceEventSyntheticDelay* TraceEventSyntheticDelayRegistry::GetOrCreateDelay( + const char* name) { + // Try to find an existing delay first without locking to make the common case + // fast. + int delay_count = base::subtle::Acquire_Load(&delay_count_); + for (int i = 0; i < delay_count; ++i) { + if (!strcmp(name, delays_[i].name_.c_str())) + return &delays_[i]; + } + + MutexLock lock(lock_); + delay_count = base::subtle::Acquire_Load(&delay_count_); + for (int i = 0; i < delay_count; ++i) { + if (!strcmp(name, delays_[i].name_.c_str())) + return &delays_[i]; + } + + DCHECK(delay_count < kMaxSyntheticDelays) + << "must increase kMaxSyntheticDelays"; + if (delay_count >= kMaxSyntheticDelays) + return &dummy_delay_; + + delays_[delay_count].Initialize(std::string(name), this); + base::subtle::Release_Store(&delay_count_, delay_count + 1); + return &delays_[delay_count]; +} + +MonoTime TraceEventSyntheticDelayRegistry::Now() { + return MonoTime::Now(MonoTime::FINE); +} + +void TraceEventSyntheticDelayRegistry::ResetAllDelays() { + MutexLock lock(lock_); + int delay_count = base::subtle::Acquire_Load(&delay_count_); + for (int i = 0; i < delay_count; ++i) { + delays_[i].SetTargetDuration(MonoDelta()); + delays_[i].SetClock(this); + } +} + +void ResetTraceEventSyntheticDelays() { + TraceEventSyntheticDelayRegistry::GetInstance()->ResetAllDelays(); +} + +} // namespace debug +} // namespace kudu + +namespace trace_event_internal { + +ScopedSyntheticDelay::ScopedSyntheticDelay(const char* name, + AtomicWord* impl_ptr) + : delay_impl_(GetOrCreateDelay(name, impl_ptr)) { + delay_impl_->BeginParallel(&end_time_); +} + +ScopedSyntheticDelay::~ScopedSyntheticDelay() { + delay_impl_->EndParallel(end_time_); +} + +kudu::debug::TraceEventSyntheticDelay* 
GetOrCreateDelay( + const char* name, + AtomicWord* impl_ptr) { + kudu::debug::TraceEventSyntheticDelay* delay_impl = + reinterpret_cast( + base::subtle::Acquire_Load(impl_ptr)); + if (!delay_impl) { + delay_impl = kudu::debug::TraceEventSyntheticDelayRegistry::GetInstance() + ->GetOrCreateDelay(name); + base::subtle::Release_Store( + impl_ptr, reinterpret_cast(delay_impl)); + } + return delay_impl; +} + +} // namespace trace_event_internal diff --git a/src/kudu/util/debug/trace_event_synthetic_delay.h b/src/kudu/util/debug/trace_event_synthetic_delay.h new file mode 100644 index 000000000000..f53d5f4ebe25 --- /dev/null +++ b/src/kudu/util/debug/trace_event_synthetic_delay.h @@ -0,0 +1,162 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The synthetic delay framework makes it possible to dynamically inject +// arbitrary delays into into different parts of the codebase. This can be used, +// for instance, for testing various task scheduling algorithms. +// +// The delays are specified in terms of a target duration for a given block of +// code. If the code executes faster than the duration, the thread is made to +// sleep until the deadline is met. +// +// Code can be instrumented for delays with two sets of macros. First, for +// delays that should apply within a scope, use the following macro: +// +// TRACE_EVENT_SYNTHETIC_DELAY("cc.LayerTreeHost.DrawAndSwap"); +// +// For delaying operations that span multiple scopes, use: +// +// TRACE_EVENT_SYNTHETIC_DELAY_BEGIN("cc.Scheduler.BeginMainFrame"); +// ... +// TRACE_EVENT_SYNTHETIC_DELAY_END("cc.Scheduler.BeginMainFrame"); +// +// Here BEGIN establishes the start time for the delay and END executes the +// delay based on the remaining time. If BEGIN is called multiple times in a +// row, END should be called a corresponding number of times. Only the last +// call to END will have an effect. 
+// +// Note that a single delay may begin on one thread and end on another. This +// implies that a single delay cannot not be applied in several threads at once. + +#ifndef KUDU_UTIL_DEBUG_TRACE_EVENT_SYNTHETIC_DELAY_H_ +#define KUDU_UTIL_DEBUG_TRACE_EVENT_SYNTHETIC_DELAY_H_ + +#include "kudu/gutil/atomicops.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/monotime.h" + +// Apply a named delay in the current scope. +#define TRACE_EVENT_SYNTHETIC_DELAY(name) \ + static AtomicWord INTERNAL_TRACE_EVENT_UID(impl_ptr) = 0; \ + trace_event_internal::ScopedSyntheticDelay INTERNAL_TRACE_EVENT_UID(delay)( \ + name, &INTERNAL_TRACE_EVENT_UID(impl_ptr)); + +// Begin a named delay, establishing its timing start point. May be called +// multiple times as long as the calls to TRACE_EVENT_SYNTHETIC_DELAY_END are +// balanced. Only the first call records the timing start point. +#define TRACE_EVENT_SYNTHETIC_DELAY_BEGIN(name) \ + do { \ + static AtomicWord impl_ptr = 0; \ + trace_event_internal::GetOrCreateDelay(name, &impl_ptr)->Begin(); \ + } while (false) + +// End a named delay. The delay is applied only if this call matches the +// first corresponding call to TRACE_EVENT_SYNTHETIC_DELAY_BEGIN with the +// same delay. +#define TRACE_EVENT_SYNTHETIC_DELAY_END(name) \ + do { \ + static AtomicWord impl_ptr = 0; \ + trace_event_internal::GetOrCreateDelay(name, &impl_ptr)->End(); \ + } while (false) + +namespace kudu { +namespace debug { + +// Time source for computing delay durations. Used for testing. +class TRACE_EVENT_API_CLASS_EXPORT TraceEventSyntheticDelayClock { + public: + TraceEventSyntheticDelayClock(); + virtual ~TraceEventSyntheticDelayClock(); + virtual MonoTime Now() = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(TraceEventSyntheticDelayClock); +}; + +// Single delay point instance. +class TRACE_EVENT_API_CLASS_EXPORT TraceEventSyntheticDelay { + public: + enum Mode { + STATIC, // Apply the configured delay every time. 
+ ONE_SHOT, // Apply the configured delay just once. + ALTERNATING // Apply the configured delay every other time. + }; + + // Returns an existing named delay instance or creates a new one with |name|. + static TraceEventSyntheticDelay* Lookup(const std::string& name); + + void SetTargetDuration(const MonoDelta& target_duration); + void SetMode(Mode mode); + void SetClock(TraceEventSyntheticDelayClock* clock); + + // Begin the delay, establishing its timing start point. May be called + // multiple times as long as the calls to End() are balanced. Only the first + // call records the timing start point. + void Begin(); + + // End the delay. The delay is applied only if this call matches the first + // corresponding call to Begin() with the same delay. + void End(); + + // Begin a parallel instance of the delay. Several parallel instances may be + // active simultaneously and will complete independently. The computed end + // time for the delay is stored in |out_end_time|, which should later be + // passed to EndParallel(). + void BeginParallel(MonoTime* out_end_time); + + // End a previously started parallel delay. |end_time| is the delay end point + // computed by BeginParallel(). + void EndParallel(const MonoTime& end_time); + + private: + TraceEventSyntheticDelay(); + ~TraceEventSyntheticDelay(); + friend class TraceEventSyntheticDelayRegistry; + + void Initialize(const std::string& name, + TraceEventSyntheticDelayClock* clock); + MonoTime CalculateEndTimeLocked(const MonoTime& start_time); + void ApplyDelay(const MonoTime& end_time); + + Mutex lock_; + Mode mode_; + std::string name_; + int begin_count_; + int trigger_count_; + MonoTime end_time_; + MonoDelta target_duration_; + TraceEventSyntheticDelayClock* clock_; + + DISALLOW_COPY_AND_ASSIGN(TraceEventSyntheticDelay); +}; + +// Set the target durations of all registered synthetic delay points to zero. 
+TRACE_EVENT_API_CLASS_EXPORT void ResetTraceEventSyntheticDelays(); + +} // namespace debug +} // namespace kudu + +namespace trace_event_internal { + +// Helper class for scoped delays. Do not use directly. +class TRACE_EVENT_API_CLASS_EXPORT ScopedSyntheticDelay { + public: + explicit ScopedSyntheticDelay(const char* name, + AtomicWord* impl_ptr); + ~ScopedSyntheticDelay(); + + private: + kudu::debug::TraceEventSyntheticDelay* delay_impl_; + kudu::MonoTime end_time_; + + DISALLOW_COPY_AND_ASSIGN(ScopedSyntheticDelay); +}; + +// Helper for registering delays. Do not use directly. +TRACE_EVENT_API_CLASS_EXPORT kudu::debug::TraceEventSyntheticDelay* + GetOrCreateDelay(const char* name, AtomicWord* impl_ptr); + +} // namespace trace_event_internal + +#endif /* KUDU_UTIL_DEBUG_TRACE_EVENT_SYNTHETIC_DELAY_H_ */ diff --git a/src/kudu/util/debug/trace_logging.h b/src/kudu/util/debug/trace_logging.h new file mode 100644 index 000000000000..c497562f88d4 --- /dev/null +++ b/src/kudu/util/debug/trace_logging.h @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// This header defines the following macro: +// +// VLOG_AND_TRACE(category, vlevel) +// +// Write a log message to VLOG(vlevel) as well as the current +// trace event buffer as an "INSTANT" trace event type. If the +// given vlog level is not enabled, this will still result in a +// trace buffer entry. +// +// The provided 'category' should be a trace event category, which +// allows the users to filter which trace events to enable. +// For example: +// +// VLOG_AND_TRACE("my_subsystem", 1) << "This always shows up in trace buffers " +// << "but only shows up in the log if VLOG(1) level logging is enabled."; +// +// Most VLOG(1) level log messages are reasonable to use this macro. +// Note that there is slightly more overhead to this macro as opposed +// to just using VLOG(1). +// +// Note that, like VLOG(n), this macro avoids evaluating its arguments unless +// either trace recording or VLOG(n) is enabled. In the case that both are enabled, +// the arguments are only evaluated once. +// +#ifndef KUDU_DEBUG_TRACE_LOGGING_H +#define KUDU_DEBUG_TRACE_LOGGING_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/debug/trace_event.h" + +// The inner workings of these macros are a bit arcane: +// - We make use of the fact that a block can be embedded within a ternary expression. +// This allows us to determine whether the trace event is enabled before we decide +// to evaluate the arguments. +// - We have to use google::LogMessageVoidify so that we can put 'void(0)' on one side +// of the ternary expression and the log stream on the other. This technique is +// cribbed from glog/logging.h. +#define VLOG_AND_TRACE_INTERNAL(category, vlevel) \ + kudu::debug::TraceVLog(__FILE__, __LINE__, category, VLOG_IS_ON(vlevel)).stream() +#define VLOG_AND_TRACE(category, vlevel) \ + !( { \ + bool enabled; \ + TRACE_EVENT_CATEGORY_GROUP_ENABLED(category, &enabled); \ + enabled || VLOG_IS_ON(vlevel); \ + } ) ? 
static_cast(0) : \ + google::LogMessageVoidify() & VLOG_AND_TRACE_INTERNAL(category, vlevel) + +namespace kudu { +namespace debug { + +class TraceVLog { + public: + TraceVLog(const char* file, int line, const char* category, bool do_vlog) + : sink_(category), + google_msg_(file, line, google::GLOG_INFO, &sink_, do_vlog) { + } + + std::ostream& stream() { + return google_msg_.stream(); + } + + private: + class TraceLogSink : public google::LogSink { + public: + explicit TraceLogSink(const char* category) : category_(category) {} + void send(google::LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct ::tm* tm_time, const char* message, + size_t message_len) override { + // Rather than calling TRACE_EVENT_INSTANT here, we have to do it from + // the destructor. This is because glog holds its internal mutex while + // calling send(). So, if we try to use TRACE_EVENT here, and --trace_to_console + // is enabled, then we'd end up calling back into glog when its lock is already + // held. glog isn't re-entrant, so that causes a crash. + // + // By just storing the string here, and then emitting the trace in the dtor, + // we defer the tracing until the google::LogMessage has destructed and the + // glog lock is available again. 
+ str_ = ToString(severity, base_filename, line, + tm_time, message, message_len); + } + virtual ~TraceLogSink() { + TRACE_EVENT_INSTANT1(category_, "vlog", TRACE_EVENT_SCOPE_THREAD, + "msg", str_); + } + + private: + const char* const category_; + std::string str_; + }; + + TraceLogSink sink_; + google::LogMessage google_msg_; +}; + +} // namespace debug +} // namespace kudu +#endif /* KUDU_DEBUG_TRACE_LOGGING_H */ diff --git a/src/kudu/util/debug_ref_counted.h b/src/kudu/util/debug_ref_counted.h new file mode 100644 index 000000000000..7c2deca5f889 --- /dev/null +++ b/src/kudu/util/debug_ref_counted.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_DEBUG_REF_COUNTED_H_ +#define KUDU_UTIL_DEBUG_REF_COUNTED_H_ + +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/debug-util.h" + +namespace kudu { + +// For use in debugging. Change a ref-counted class to inherit from this, +// instead of RefCountedThreadSafe, and fill your logs with stack traces. 
+template > +class DebugRefCountedThreadSafe : public RefCountedThreadSafe { + public: + DebugRefCountedThreadSafe() {} + + void AddRef() const { + RefCountedThreadSafe::AddRef(); + LOG(INFO) << "Incremented ref on " << this << ":\n" << GetStackTrace(); + } + + void Release() const { + LOG(INFO) << "Decrementing ref on " << this << ":\n" << GetStackTrace(); + RefCountedThreadSafe::Release(); + } + + protected: + ~DebugRefCountedThreadSafe() {} + + private: + friend struct DefaultRefCountedThreadSafeTraits; + + DISALLOW_COPY_AND_ASSIGN(DebugRefCountedThreadSafe); +}; + +} // namespace kudu + +#endif // KUDU_UTIL_DEBUG_REF_COUNTED_H_ diff --git a/src/kudu/util/env-test.cc b/src/kudu/util/env-test.cc new file mode 100644 index 000000000000..03e25032b5d1 --- /dev/null +++ b/src/kudu/util/env-test.cc @@ -0,0 +1,705 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include +#include + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/malloc.h" +#include "kudu/util/memenv/memenv.h" +#include "kudu/util/path_util.h" +#include "kudu/util/status.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +#if !defined(__APPLE__) +#include +#endif // !defined(__APPLE__) +// Copied from falloc.h. Useful for older kernels that lack support for +// hole punching; fallocate(2) will return EOPNOTSUPP. +#ifndef FALLOC_FL_KEEP_SIZE +#define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ +#endif +#ifndef FALLOC_FL_PUNCH_HOLE +#define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ +#endif + +namespace kudu { + +using std::shared_ptr; +using std::string; +using std::vector; + +static const uint64_t kOneMb = 1024 * 1024; + +class TestEnv : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + CheckFallocateSupport(); + } + + // Verify that fallocate() is supported in the test directory. + // Some local file systems like ext3 do not support it, and we don't + // want to fail tests on those systems. + // + // Sets fallocate_supported_ based on the result. 
+ void CheckFallocateSupport() { + static bool checked = false; + if (checked) return; + +#if defined(__linux__) + int fd = creat(GetTestPath("check-fallocate").c_str(), S_IWUSR); + PCHECK(fd >= 0); + int err = fallocate(fd, 0, 0, 4096); + if (err != 0) { + PCHECK(errno == ENOTSUP); + } else { + fallocate_supported_ = true; + + err = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + 1024, 1024); + if (err != 0) { + PCHECK(errno == ENOTSUP); + } else { + fallocate_punch_hole_supported_ = true; + } + } + + close(fd); +#endif + + checked = true; + } + + protected: + + void VerifyTestData(const Slice& read_data, size_t offset) { + for (int i = 0; i < read_data.size(); i++) { + size_t file_offset = offset + i; + ASSERT_EQ((file_offset * 31) & 0xff, read_data[i]) << "failed at " << i; + } + } + + void MakeVectors(int num_slices, int slice_size, int num_iterations, + gscoped_ptr* data, vector >* vec) { + data->reset(new faststring[num_iterations * num_slices]); + vec->resize(num_iterations); + + int data_idx = 0; + int byte_idx = 0; + for (int vec_idx = 0; vec_idx < num_iterations; vec_idx++) { + vector& iter_vec = vec->at(vec_idx); + iter_vec.resize(num_slices); + for (int i = 0; i < num_slices; i++) { + (*data)[data_idx].resize(slice_size); + for (int j = 0; j < slice_size; j++) { + (*data)[data_idx][j] = (byte_idx * 31) & 0xff; + ++byte_idx; + } + iter_vec[i]= Slice((*data)[data_idx]); + ++data_idx; + } + } + } + + void ReadAndVerifyTestData(RandomAccessFile* raf, size_t offset, size_t n) { + gscoped_ptr scratch(new uint8_t[n]); + Slice s; + ASSERT_OK(env_util::ReadFully(raf, offset, n, &s, + scratch.get())); + ASSERT_EQ(n, s.size()); + ASSERT_NO_FATAL_FAILURE(VerifyTestData(s, offset)); + } + + void TestAppendVector(size_t num_slices, size_t slice_size, size_t iterations, + bool fast, bool pre_allocate, const WritableFileOptions& opts) { + const string kTestPath = GetTestPath("test_env_appendvec_read_append"); + shared_ptr file; + 
ASSERT_OK(env_util::OpenFileForWrite(opts, env_.get(), kTestPath, &file)); + + if (pre_allocate) { + ASSERT_OK(file->PreAllocate(num_slices * slice_size * iterations)); + ASSERT_OK(file->Sync()); + } + + gscoped_ptr data; + vector > input; + + MakeVectors(num_slices, slice_size, iterations, &data, &input); + + shared_ptr raf; + + if (!fast) { + ASSERT_OK(env_util::OpenFileForRandom(env_.get(), kTestPath, &raf)); + } + + srand(123); + + const string test_descr = strings::Substitute( + "appending a vector of slices(number of slices=$0,size of slice=$1 b) $2 times", + num_slices, slice_size, iterations); + LOG_TIMING(INFO, test_descr) { + for (int i = 0; i < iterations; i++) { + if (fast || random() % 2) { + ASSERT_OK(file->AppendVector(input[i])); + } else { + for (const Slice& slice : input[i]) { + ASSERT_OK(file->Append(slice)); + } + } + if (!fast) { + // Verify as write. Note: this requires that file is pre-allocated, otherwise + // the ReadFully() fails with EINVAL. + ASSERT_NO_FATAL_FAILURE(ReadAndVerifyTestData(raf.get(), num_slices * slice_size * i, + num_slices * slice_size)); + } + } + } + + // Verify the entire file + ASSERT_OK(file->Close()); + + if (fast) { + ASSERT_OK(env_util::OpenFileForRandom(env_.get(), kTestPath, &raf)); + } + for (int i = 0; i < iterations; i++) { + ASSERT_NO_FATAL_FAILURE(ReadAndVerifyTestData(raf.get(), num_slices * slice_size * i, + num_slices * slice_size)); + } + } + + static bool fallocate_supported_; + static bool fallocate_punch_hole_supported_; +}; + +bool TestEnv::fallocate_supported_ = false; +bool TestEnv::fallocate_punch_hole_supported_ = false; + +TEST_F(TestEnv, TestPreallocate) { + if (!fallocate_supported_) { + LOG(INFO) << "fallocate not supported, skipping test"; + return; + } + LOG(INFO) << "Testing PreAllocate()"; + string test_path = GetTestPath("test_env_wf"); + shared_ptr file; + ASSERT_OK(env_util::OpenFileForWrite(WritableFileOptions(), + env_.get(), test_path, &file)); + + // pre-allocate 1 MB + 
ASSERT_OK(file->PreAllocate(kOneMb)); + ASSERT_OK(file->Sync()); + + // the writable file size should report 0 + ASSERT_EQ(file->Size(), 0); + // but the real size of the file on disk should report 1MB + uint64_t size; + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(size, kOneMb); + + // write 1 MB + uint8_t scratch[kOneMb]; + Slice slice(scratch, kOneMb); + ASSERT_OK(file->Append(slice)); + ASSERT_OK(file->Sync()); + + // the writable file size should now report 1 MB + ASSERT_EQ(file->Size(), kOneMb); + ASSERT_OK(file->Close()); + // and the real size for the file on disk should match ony the + // written size + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(kOneMb, size); +} + +// To test consecutive pre-allocations we need higher pre-allocations since the +// mmapped regions grow in size until 2MBs (so smaller pre-allocations will easily +// be smaller than the mmapped regions size). +TEST_F(TestEnv, TestConsecutivePreallocate) { + if (!fallocate_supported_) { + LOG(INFO) << "fallocate not supported, skipping test"; + return; + } + LOG(INFO) << "Testing consecutive PreAllocate()"; + string test_path = GetTestPath("test_env_wf"); + shared_ptr file; + ASSERT_OK(env_util::OpenFileForWrite( + WritableFileOptions(), env_.get(), test_path, &file)); + + // pre-allocate 64 MB + ASSERT_OK(file->PreAllocate(64 * kOneMb)); + ASSERT_OK(file->Sync()); + + // the writable file size should report 0 + ASSERT_EQ(file->Size(), 0); + // but the real size of the file on disk should report 64 MBs + uint64_t size; + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(size, 64 * kOneMb); + + // write 1 MB + uint8_t scratch[kOneMb]; + Slice slice(scratch, kOneMb); + ASSERT_OK(file->Append(slice)); + ASSERT_OK(file->Sync()); + + // the writable file size should now report 1 MB + ASSERT_EQ(kOneMb, file->Size()); + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(64 * kOneMb, size); + + // pre-allocate 64 additional MBs + 
ASSERT_OK(file->PreAllocate(64 * kOneMb)); + ASSERT_OK(file->Sync()); + + // the writable file size should now report 1 MB + ASSERT_EQ(kOneMb, file->Size()); + // while the real file size should report 128 MB's + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(128 * kOneMb, size); + + // write another MB + ASSERT_OK(file->Append(slice)); + ASSERT_OK(file->Sync()); + + // the writable file size should now report 2 MB + ASSERT_EQ(file->Size(), 2 * kOneMb); + // while the real file size should reamin at 128 MBs + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(128 * kOneMb, size); + + // close the file (which ftruncates it to the real size) + ASSERT_OK(file->Close()); + // and the real size for the file on disk should match only the written size + ASSERT_OK(env_->GetFileSize(test_path, &size)); + ASSERT_EQ(2* kOneMb, size); + +} + +TEST_F(TestEnv, TestHolePunch) { + if (!fallocate_punch_hole_supported_) { + LOG(INFO) << "hole punching not supported, skipping test"; + return; + } + string test_path = GetTestPath("test_env_wf"); + gscoped_ptr file; + ASSERT_OK(env_->NewRWFile(test_path, &file)); + + // Write 1 MB. The size and size-on-disk both agree. + uint8_t scratch[kOneMb]; + Slice slice(scratch, kOneMb); + ASSERT_OK(file->Write(0, slice)); + ASSERT_OK(file->Sync()); + uint64_t sz; + ASSERT_OK(file->Size(&sz)); + ASSERT_EQ(kOneMb, sz); + uint64_t size_on_disk; + ASSERT_OK(env_->GetFileSizeOnDisk(test_path, &size_on_disk)); + // Some kernels and filesystems (e.g. Centos 6.6 with XFS) aggressively + // preallocate file disk space when writing to files, so the disk space may be + // greater than 1MiB. + ASSERT_LE(kOneMb, size_on_disk); + + // Punch some data out at byte marker 4096. Now the two sizes diverge. 
+ uint64_t punch_amount = 4096 * 4; + uint64_t new_size_on_disk; + ASSERT_OK(file->PunchHole(4096, punch_amount)); + ASSERT_OK(file->Size(&sz)); + ASSERT_EQ(kOneMb, sz); + ASSERT_OK(env_->GetFileSizeOnDisk(test_path, &new_size_on_disk)); + ASSERT_EQ(size_on_disk - punch_amount, new_size_on_disk); +} + +class ShortReadRandomAccessFile : public RandomAccessFile { + public: + explicit ShortReadRandomAccessFile(shared_ptr wrapped) + : wrapped_(std::move(wrapped)) {} + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + uint8_t *scratch) const OVERRIDE { + CHECK_GT(n, 0); + // Divide the requested amount of data by a small integer, + // and issue the shorter read to the underlying file. + int short_n = n / ((rand() % 3) + 1); + if (short_n == 0) { + short_n = 1; + } + + VLOG(1) << "Reading " << short_n << " instead of " << n; + + return wrapped_->Read(offset, short_n, result, scratch); + } + + virtual Status Size(uint64_t *size) const OVERRIDE { + return wrapped_->Size(size); + } + + virtual const string& filename() const OVERRIDE { return wrapped_->filename(); } + + virtual size_t memory_footprint() const OVERRIDE { + return wrapped_->memory_footprint(); + } + + private: + const shared_ptr wrapped_; +}; + +// Write 'size' bytes of data to a file, with a simple pattern stored in it. 
+static void WriteTestFile(Env* env, const string& path, size_t size) { + shared_ptr wf; + ASSERT_OK(env_util::OpenFileForWrite(env, path, &wf)); + faststring data; + data.resize(size); + for (int i = 0; i < data.size(); i++) { + data[i] = (i * 31) & 0xff; + } + ASSERT_OK(wf->Append(Slice(data))); + ASSERT_OK(wf->Close()); +} + + + +TEST_F(TestEnv, TestReadFully) { + SeedRandom(); + const string kTestPath = "test"; + const int kFileSize = 64 * 1024; + gscoped_ptr mem(NewMemEnv(Env::Default())); + + WriteTestFile(mem.get(), kTestPath, kFileSize); + ASSERT_NO_FATAL_FAILURE(); + + // Reopen for read + shared_ptr raf; + ASSERT_OK(env_util::OpenFileForRandom(mem.get(), kTestPath, &raf)); + + ShortReadRandomAccessFile sr_raf(raf); + + const int kReadLength = 10000; + Slice s; + gscoped_ptr scratch(new uint8_t[kReadLength]); + + // Verify that ReadFully reads the whole requested data. + ASSERT_OK(env_util::ReadFully(&sr_raf, 0, kReadLength, &s, scratch.get())); + ASSERT_EQ(s.data(), scratch.get()) << "Should have returned a contiguous copy"; + ASSERT_EQ(kReadLength, s.size()); + + // Verify that the data read was correct. + VerifyTestData(s, 0); + + // Verify that ReadFully fails with an IOError at EOF. 
+ Status status = env_util::ReadFully(&sr_raf, kFileSize - 100, 200, &s, scratch.get()); + ASSERT_FALSE(status.ok()); + ASSERT_TRUE(status.IsIOError()); + ASSERT_STR_CONTAINS(status.ToString(), "EOF"); +} + +TEST_F(TestEnv, TestAppendVector) { + WritableFileOptions opts; + LOG(INFO) << "Testing AppendVector() only, NO pre-allocation"; + ASSERT_NO_FATAL_FAILURE(TestAppendVector(2000, 1024, 5, true, false, opts)); + + if (!fallocate_supported_) { + LOG(INFO) << "fallocate not supported, skipping preallocated runs"; + } else { + LOG(INFO) << "Testing AppendVector() only, WITH pre-allocation"; + ASSERT_NO_FATAL_FAILURE(TestAppendVector(2000, 1024, 5, true, true, opts)); + LOG(INFO) << "Testing AppendVector() together with Append() and Read(), WITH pre-allocation"; + ASSERT_NO_FATAL_FAILURE(TestAppendVector(128, 4096, 5, false, true, opts)); + } +} + +TEST_F(TestEnv, TestGetExecutablePath) { + string p; + ASSERT_OK(Env::Default()->GetExecutablePath(&p)); + ASSERT_TRUE(HasSuffixString(p, "env-test")) << p; +} + +TEST_F(TestEnv, TestOpenEmptyRandomAccessFile) { + Env* env = Env::Default(); + string test_file = JoinPathSegments(GetTestDataDirectory(), "test_file"); + ASSERT_NO_FATAL_FAILURE(WriteTestFile(env, test_file, 0)); + gscoped_ptr readable_file; + ASSERT_OK(env->NewRandomAccessFile(test_file, &readable_file)); + uint64_t size; + ASSERT_OK(readable_file->Size(&size)); + ASSERT_EQ(0, size); +} + +TEST_F(TestEnv, TestOverwrite) { + string test_path = GetTestPath("test_env_wf"); + + // File does not exist, create it. + shared_ptr writer; + ASSERT_OK(env_util::OpenFileForWrite(env_.get(), test_path, &writer)); + + // File exists, overwrite it. + ASSERT_OK(env_util::OpenFileForWrite(env_.get(), test_path, &writer)); + + // File exists, try to overwrite (and fail). 
+ WritableFileOptions opts; + opts.mode = Env::CREATE_NON_EXISTING; + Status s = env_util::OpenFileForWrite(opts, + env_.get(), test_path, &writer); + ASSERT_TRUE(s.IsAlreadyPresent()); +} + +TEST_F(TestEnv, TestReopen) { + LOG(INFO) << "Testing reopening behavior"; + string test_path = GetTestPath("test_env_wf"); + string first = "The quick brown fox"; + string second = "jumps over the lazy dog"; + + // Create the file and write to it. + shared_ptr writer; + ASSERT_OK(env_util::OpenFileForWrite(WritableFileOptions(), + env_.get(), test_path, &writer)); + ASSERT_OK(writer->Append(first)); + ASSERT_EQ(first.length(), writer->Size()); + ASSERT_OK(writer->Close()); + + // Reopen it and append to it. + WritableFileOptions reopen_opts; + reopen_opts.mode = Env::OPEN_EXISTING; + ASSERT_OK(env_util::OpenFileForWrite(reopen_opts, + env_.get(), test_path, &writer)); + ASSERT_EQ(first.length(), writer->Size()); + ASSERT_OK(writer->Append(second)); + ASSERT_EQ(first.length() + second.length(), writer->Size()); + ASSERT_OK(writer->Close()); + + // Check that the file has both strings. 
+ shared_ptr reader; + ASSERT_OK(env_util::OpenFileForRandom(env_.get(), test_path, &reader)); + uint64_t size; + ASSERT_OK(reader->Size(&size)); + ASSERT_EQ(first.length() + second.length(), size); + Slice s; + uint8_t scratch[size]; + ASSERT_OK(env_util::ReadFully(reader.get(), 0, size, &s, scratch)); + ASSERT_EQ(first + second, s.ToString()); +} + +TEST_F(TestEnv, TestIsDirectory) { + string dir = GetTestPath("a_directory"); + ASSERT_OK(env_->CreateDir(dir)); + bool is_dir; + ASSERT_OK(env_->IsDirectory(dir, &is_dir)); + ASSERT_TRUE(is_dir); + + string not_dir = GetTestPath("not_a_directory"); + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile(not_dir, &writer)); + ASSERT_OK(env_->IsDirectory(not_dir, &is_dir)); + ASSERT_FALSE(is_dir); +} + +static Status TestWalkCb(vector* actual, + Env::FileType type, + const string& dirname, const string& basename) { + VLOG(1) << type << ":" << dirname << ":" << basename; + actual->push_back(JoinPathSegments(dirname, basename)); + return Status::OK(); +} + +static Status CreateDir(Env* env, const string& name, vector* created) { + RETURN_NOT_OK(env->CreateDir(name)); + created->push_back(name); + return Status::OK(); +} + +static Status CreateFile(Env* env, const string& name, vector* created) { + gscoped_ptr writer; + RETURN_NOT_OK(env->NewWritableFile(name, &writer)); + created->push_back(writer->filename()); + return Status::OK(); +} + +TEST_F(TestEnv, TestWalk) { + // We test with this tree: + // + // /root/ + // /root/file_1 + // /root/file_2 + // /root/dir_a/file_1 + // /root/dir_a/file_2 + // /root/dir_b/file_1 + // /root/dir_b/file_2 + // /root/dir_b/dir_c/file_1 + // /root/dir_b/dir_c/file_2 + string root = GetTestPath("root"); + string subdir_a = JoinPathSegments(root, "dir_a"); + string subdir_b = JoinPathSegments(root, "dir_b"); + string subdir_c = JoinPathSegments(subdir_b, "dir_c"); + string file_one = "file_1"; + string file_two = "file_2"; + vector expected; + ASSERT_OK(CreateDir(env_.get(), root, 
&expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(root, file_one), &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(root, file_two), &expected)); + ASSERT_OK(CreateDir(env_.get(), subdir_a, &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(subdir_a, file_one), &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(subdir_a, file_two), &expected)); + ASSERT_OK(CreateDir(env_.get(), subdir_b, &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(subdir_b, file_one), &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(subdir_b, file_two), &expected)); + ASSERT_OK(CreateDir(env_.get(), subdir_c, &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(subdir_c, file_one), &expected)); + ASSERT_OK(CreateFile(env_.get(), JoinPathSegments(subdir_c, file_two), &expected)); + + // Do the walk. + // + // Sadly, tr1/unordered_set doesn't implement equality operators, so we + // compare sorted vectors instead. + vector actual; + ASSERT_OK(env_->Walk(root, Env::PRE_ORDER, Bind(&TestWalkCb, &actual))); + sort(expected.begin(), expected.end()); + sort(actual.begin(), actual.end()); + ASSERT_EQ(expected, actual); +} + +static Status TestWalkErrorCb(int* num_calls, + Env::FileType type, + const string& dirname, const string& basename) { + (*num_calls)++; + return Status::Aborted("Returning abort status"); +} + +TEST_F(TestEnv, TestWalkCbReturnsError) { + string new_dir = GetTestPath("foo"); + string new_file = "myfile"; + ASSERT_OK(env_->CreateDir(new_dir)); + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile(JoinPathSegments(new_dir, new_file), &writer)); + int num_calls = 0; + ASSERT_TRUE(env_->Walk(new_dir, Env::PRE_ORDER, + Bind(&TestWalkErrorCb, &num_calls)).IsIOError()); + + // Once for the directory and once for the file inside it. + ASSERT_EQ(2, num_calls); +} + +TEST_F(TestEnv, TestGetBlockSize) { + uint64_t block_size; + + // Does not exist. 
+ ASSERT_TRUE(env_->GetBlockSize("does_not_exist", &block_size).IsNotFound()); + + // Try with a directory. + ASSERT_OK(env_->GetBlockSize(".", &block_size)); + ASSERT_GT(block_size, 0); + + // Try with a file. + string path = GetTestPath("foo"); + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile(path, &writer)); + ASSERT_OK(env_->GetBlockSize(path, &block_size)); + ASSERT_GT(block_size, 0); +} + +TEST_F(TestEnv, TestRWFile) { + // Create the file. + gscoped_ptr file; + ASSERT_OK(env_->NewRWFile(GetTestPath("foo"), &file)); + + // Append to it. + string kTestData = "abcde"; + ASSERT_OK(file->Write(0, kTestData)); + + // Read from it. + Slice result; + gscoped_ptr scratch(new uint8_t[kTestData.length()]); + ASSERT_OK(file->Read(0, kTestData.length(), &result, scratch.get())); + ASSERT_EQ(result, kTestData); + uint64_t sz; + ASSERT_OK(file->Size(&sz)); + ASSERT_EQ(kTestData.length(), sz); + + // Write past the end of the file and rewrite some of the interior. + ASSERT_OK(file->Write(kTestData.length() * 2, kTestData)); + ASSERT_OK(file->Write(kTestData.length(), kTestData)); + ASSERT_OK(file->Write(1, kTestData)); + string kNewTestData = "aabcdebcdeabcde"; + gscoped_ptr scratch2(new uint8_t[kNewTestData.length()]); + ASSERT_OK(file->Read(0, kNewTestData.length(), &result, scratch2.get())); + + // Retest. + ASSERT_EQ(result, kNewTestData); + ASSERT_OK(file->Size(&sz)); + ASSERT_EQ(kNewTestData.length(), sz); + + // Make sure we can't overwrite it. + RWFileOptions opts; + opts.mode = Env::CREATE_NON_EXISTING; + ASSERT_TRUE(env_->NewRWFile(opts, GetTestPath("foo"), &file).IsAlreadyPresent()); + + // Reopen it without truncating the existing data. 
  opts.mode = Env::OPEN_EXISTING;
  ASSERT_OK(env_->NewRWFile(opts, GetTestPath("foo"), &file));
  ASSERT_OK(file->Read(0, kNewTestData.length(), &result, scratch2.get()));
  ASSERT_EQ(result, kNewTestData);
}

TEST_F(TestEnv, TestCanonicalize) {
  // All of these spellings refer to the test directory itself and must
  // canonicalize to the same absolute path.
  vector synonyms = { GetTestPath("."), GetTestPath("./."), GetTestPath(".//./") };
  for (const string& synonym : synonyms) {
    string result;
    ASSERT_OK(env_->Canonicalize(synonym, &result));
    ASSERT_EQ(GetTestDataDirectory(), result);
  }

  // A trailing slash is stripped by canonicalization.
  string dir = GetTestPath("some_dir");
  ASSERT_OK(env_->CreateDir(dir));
  string result;
  ASSERT_OK(env_->Canonicalize(dir + "/", &result));
  ASSERT_EQ(dir, result);

  // Canonicalizing a non-existent entry must fail with NotFound.
  ASSERT_TRUE(env_->Canonicalize(dir + "/bar", nullptr).IsNotFound());
}

TEST_F(TestEnv, TestGetTotalRAMBytes) {
  int64_t ram = 0;
  ASSERT_OK(env_->GetTotalRAMBytes(&ram));

  // Can't test much about it.
  ASSERT_GT(ram, 0);
}

// Test that CopyFile() copies all the bytes properly.
TEST_F(TestEnv, TestCopyFile) {
  string orig_path = GetTestPath("test");
  string copy_path = orig_path + ".copy";
  const int kFileSize = 1024 * 1024 + 11; // Some odd number of bytes.

  Env* env = Env::Default();
  NO_FATALS(WriteTestFile(env, orig_path, kFileSize));
  ASSERT_OK(env_util::CopyFile(env, orig_path, copy_path, WritableFileOptions()));
  gscoped_ptr copy;
  // NOTE: '&copy' was mangled to an HTML entity in this paste; restored here.
  ASSERT_OK(env->NewRandomAccessFile(copy_path, &copy));
  NO_FATALS(ReadAndVerifyTestData(copy.get(), 0, kFileSize));
}

} // namespace kudu
diff --git a/src/kudu/util/env.cc b/src/kudu/util/env.cc
new file mode 100644
index 000000000000..2ea9622ac004
--- /dev/null
+++ b/src/kudu/util/env.cc
@@ -0,0 +1,88 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
+ +#include "kudu/util/env.h" +#include "kudu/util/faststring.h" + +namespace kudu { + +Env::~Env() { +} + +SequentialFile::~SequentialFile() { +} + +RandomAccessFile::~RandomAccessFile() { +} + +WritableFile::~WritableFile() { +} + +RWFile::~RWFile() { +} + +FileLock::~FileLock() { +} + +static Status DoWriteStringToFile(Env* env, const Slice& data, + const std::string& fname, + bool should_sync) { + gscoped_ptr file; + Status s = env->NewWritableFile(fname, &file); + if (!s.ok()) { + return s; + } + s = file->Append(data); + if (s.ok() && should_sync) { + s = file->Sync(); + } + if (s.ok()) { + s = file->Close(); + } + file.reset(); // Will auto-close if we did not close above + if (!s.ok()) { + WARN_NOT_OK(env->DeleteFile(fname), + "Failed to delete partially-written file " + fname); + } + return s; +} + +// TODO: move these utils into env_util +Status WriteStringToFile(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, false); +} + +Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname) { + return DoWriteStringToFile(env, data, fname, true); +} + +Status ReadFileToString(Env* env, const std::string& fname, faststring* data) { + data->clear(); + gscoped_ptr file; + Status s = env->NewSequentialFile(fname, &file); + if (!s.ok()) { + return s; + } + static const int kBufferSize = 8192; + gscoped_ptr scratch(new uint8_t[kBufferSize]); + while (true) { + Slice fragment; + s = file->Read(kBufferSize, &fragment, scratch.get()); + if (!s.ok()) { + break; + } + data->append(fragment.data(), fragment.size()); + if (fragment.empty()) { + break; + } + } + return s; +} + +EnvWrapper::~EnvWrapper() { +} + +} // namespace kudu diff --git a/src/kudu/util/env.h b/src/kudu/util/env.h new file mode 100644 index 000000000000..0d90ef3a60a3 --- /dev/null +++ b/src/kudu/util/env.h @@ -0,0 +1,602 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the kudu implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine-grained control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.

#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_

// NOTE(review): the system header names inside <...> were lost when this
// patch was extracted — restore them from the upstream file.
#include
#include
#include
#include

#include "kudu/gutil/callback_forward.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/util/status.h"

namespace kudu {

class FileLock;
class RandomAccessFile;
class RWFile;
class SequentialFile;
class Slice;
class WritableFile;

struct RandomAccessFileOptions;
struct RWFileOptions;
struct WritableFileOptions;

class Env {
 public:
  // Governs if/how the file is created.
  //
  // enum value                      | file exists       | file does not exist
  // --------------------------------+-------------------+--------------------
  // CREATE_IF_NON_EXISTING_TRUNCATE | opens + truncates | creates
  // CREATE_NON_EXISTING             | fails             | creates
  // OPEN_EXISTING                   | opens             | fails
  enum CreateMode {
    CREATE_IF_NON_EXISTING_TRUNCATE,
    CREATE_NON_EXISTING,
    OPEN_EXISTING
  };

  Env() { }
  virtual ~Env();

  // Return a default environment suitable for the current operating
  // system. Sophisticated users may wish to provide their own Env
  // implementation instead of relying on this default environment.
  //
  // The result of Default() belongs to kudu and must never be deleted.
  static Env* Default();

  // Create a brand new sequentially-readable file with the specified name.
  // On success, stores a pointer to the new file in *result and returns OK.
  // On failure stores NULL in *result and returns non-OK. If the file does
  // not exist, returns a non-OK status.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewSequentialFile(const std::string& fname,
                                   gscoped_ptr* result) = 0;

  // Create a brand new random access read-only file with the
  // specified name. On success, stores a pointer to the new file in
  // *result and returns OK. On failure stores NULL in *result and
  // returns non-OK. If the file does not exist, returns a non-OK
  // status.
  //
  // The returned file may be concurrently accessed by multiple threads.
  virtual Status NewRandomAccessFile(const std::string& fname,
                                     gscoped_ptr* result) = 0;

  // Like the previous NewRandomAccessFile, but allows options to be specified.
  virtual Status NewRandomAccessFile(const RandomAccessFileOptions& opts,
                                     const std::string& fname,
                                     gscoped_ptr* result) = 0;

  // Create an object that writes to a new file with the specified
  // name. Deletes any existing file with the same name and creates a
  // new file. On success, stores a pointer to the new file in
  // *result and returns OK. On failure stores NULL in *result and
  // returns non-OK.
  //
  // The returned file will only be accessed by one thread at a time.
  virtual Status NewWritableFile(const std::string& fname,
                                 gscoped_ptr* result) = 0;


  // Like the previous NewWritableFile, but allows options to be
  // specified.
  virtual Status NewWritableFile(const WritableFileOptions& opts,
                                 const std::string& fname,
                                 gscoped_ptr* result) = 0;

  // Creates a new WritableFile provided the name_template parameter.
  // The last six characters of name_template must be "XXXXXX" and these are
  // replaced with a string that makes the filename unique.
  // The resulting created filename, if successful, will be stored in the
  // created_filename out parameter.
  // The file is created with permissions 0600, that is, read plus write for
  // owner only. The implementation will create the file in a secure manner,
  // and will return an error Status if it is unable to open the file.
  virtual Status NewTempWritableFile(const WritableFileOptions& opts,
                                     const std::string& name_template,
                                     std::string* created_filename,
                                     gscoped_ptr* result) = 0;

  // Creates a new readable and writable file. If a file with the same name
  // already exists on disk, it is deleted.
  //
  // Some of the methods of the new file may be accessed concurrently,
  // while others are only safe for access by one thread at a time.
  virtual Status NewRWFile(const std::string& fname,
                           gscoped_ptr* result) = 0;

  // Like the previous NewRWFile, but allows options to be specified.
  virtual Status NewRWFile(const RWFileOptions& opts,
                           const std::string& fname,
                           gscoped_ptr* result) = 0;

  // Returns true iff the named file exists.
  virtual bool FileExists(const std::string& fname) = 0;

  // Store in *result the names of the children of the specified directory.
  // The names are relative to "dir".
  // Original contents of *result are dropped.
  virtual Status GetChildren(const std::string& dir,
                             std::vector* result) = 0;

  // Delete the named file.
  virtual Status DeleteFile(const std::string& fname) = 0;

  // Create the specified directory.
  virtual Status CreateDir(const std::string& dirname) = 0;

  // Delete the specified directory.
  virtual Status DeleteDir(const std::string& dirname) = 0;

  // Synchronize the entry for a specific directory.
  virtual Status SyncDir(const std::string& dirname) = 0;

  // Recursively delete the specified directory.
  // This should operate safely, not following any symlinks, etc.
  virtual Status DeleteRecursively(const std::string &dirname) = 0;

  // Store the logical size of fname in *file_size.
  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;

  // Store the physical size of fname in *file_size.
  //
  // This differs from GetFileSize() in that it returns the actual amount
  // of space consumed by the file, not the user-facing file size.
  virtual Status GetFileSizeOnDisk(const std::string& fname, uint64_t* file_size) = 0;

  // Store the block size of the filesystem where fname resides in
  // *block_size. fname must exist but it may be a file or a directory.
  virtual Status GetBlockSize(const std::string& fname, uint64_t* block_size) = 0;

  // Rename file src to target.
  virtual Status RenameFile(const std::string& src,
                            const std::string& target) = 0;

  // Lock the specified file. Used to prevent concurrent access to
  // the same db by multiple processes. On failure, stores NULL in
  // *lock and returns non-OK.
  //
  // On success, stores a pointer to the object that represents the
  // acquired lock in *lock and returns OK. The caller should call
  // UnlockFile(*lock) to release the lock. If the process exits,
  // the lock will be automatically released.
  //
  // If somebody else already holds the lock, finishes immediately
  // with a failure. I.e., this call does not wait for existing locks
  // to go away.
  //
  // May create the named file if it does not already exist.
  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;

  // Release the lock acquired by a previous successful call to LockFile.
  // REQUIRES: lock was returned by a successful LockFile() call
  // REQUIRES: lock has not already been unlocked.
  virtual Status UnlockFile(FileLock* lock) = 0;

  // *path is set to a temporary directory that can be used for testing. It
  // may or may not have just been created. The directory may or may not
  // differ between runs of the same process, but subsequent calls will
  // return the same directory.
  virtual Status GetTestDirectory(std::string* path) = 0;

  // Returns the number of micro-seconds since some fixed point in time. Only
  // useful for computing deltas of time.
  virtual uint64_t NowMicros() = 0;

  // Sleep/delay the thread for the prescribed number of micro-seconds.
  virtual void SleepForMicroseconds(int micros) = 0;

  // Get caller's thread id.
  virtual uint64_t gettid() = 0;

  // Return the full path of the currently running executable.
  virtual Status GetExecutablePath(std::string* path) = 0;

  // Checks if the file is a directory. Returns an error if it doesn't
  // exist, otherwise writes true or false into 'is_dir' appropriately.
  virtual Status IsDirectory(const std::string& path, bool* is_dir) = 0;

  // The kind of file found during a walk. Note that symbolic links are
  // reported as FILE_TYPE.
  enum FileType {
    DIRECTORY_TYPE,
    FILE_TYPE,
  };

  // Called for each file/directory in the walk.
  //
  // The first argument is the type of file.
  // The second is the dirname of the file.
  // The third is the basename of the file.
  //
  // Returning an error won't halt the walk, but it will cause it to return
  // with an error status when it's done.
  //
  // NOTE(review): the Callback template arguments were lost in this paste —
  // restore from upstream.
  typedef Callback WalkCallback;

  // Whether to walk directories in pre-order or post-order.
  enum DirectoryOrder {
    PRE_ORDER,
    POST_ORDER,
  };

  // Walk the filesystem subtree from 'root' down, invoking 'cb' for each
  // file or directory found, including 'root'.
  //
  // The walk will not cross filesystem boundaries. It won't change the
  // working directory, nor will it follow symbolic links.
  virtual Status Walk(const std::string& root,
                      DirectoryOrder order,
                      const WalkCallback& cb) = 0;

  // Canonicalize 'path' by applying the following conversions:
  // - Converts a relative path into an absolute one using the cwd.
  // - Converts '.' and '..' references.
  // - Resolves all symbolic links.
  //
  // All directory entries in 'path' must exist on the filesystem.
  virtual Status Canonicalize(const std::string& path, std::string* result) = 0;

  // Get the total amount of RAM installed on this machine.
  virtual Status GetTotalRAMBytes(int64_t* ram) = 0;
 private:
  // No copying allowed
  Env(const Env&);
  void operator=(const Env&);
};

// A file abstraction for reading sequentially through a file
class SequentialFile {
 public:
  SequentialFile() { }
  virtual ~SequentialFile();

  // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
  // written by this routine. Sets "*result" to the data that was
  // read (including if fewer than "n" bytes were successfully read).
  // May set "*result" to point at data in "scratch[0..n-1]", so
  // "scratch[0..n-1]" must be live when "*result" is used.
  // If an error was encountered, returns a non-OK status.
  //
  // REQUIRES: External synchronization
  virtual Status Read(size_t n, Slice* result, uint8_t *scratch) = 0;

  // Skip "n" bytes from the file. This is guaranteed to be no
  // slower than reading the same data, but may be faster.
  //
  // If end of file is reached, skipping will stop at the end of the
  // file, and Skip will return OK.
  //
  // REQUIRES: External synchronization
  virtual Status Skip(uint64_t n) = 0;

  // Returns the filename provided when the SequentialFile was constructed.
  virtual const std::string& filename() const = 0;
};

// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
 public:
  RandomAccessFile() { }
  virtual ~RandomAccessFile();

  // Read up to "n" bytes from the file starting at "offset".
  // "scratch[0..n-1]" may be written by this routine. Sets "*result"
  // to the data that was read (including if fewer than "n" bytes were
  // successfully read). May set "*result" to point at data in
  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
  // "*result" is used.
  // If an error was encountered, returns a non-OK
  // status.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      uint8_t *scratch) const = 0;

  // Returns the size of the file
  virtual Status Size(uint64_t *size) const = 0;

  // Returns the filename provided when the RandomAccessFile was constructed.
  virtual const std::string& filename() const = 0;

  // Returns the approximate memory usage of this RandomAccessFile including
  // the object itself.
  virtual size_t memory_footprint() const = 0;
};

// Creation-time options for WritableFile
struct WritableFileOptions {
  // Call Sync() during Close().
  bool sync_on_close;

  // See CreateMode for details.
  Env::CreateMode mode;

  WritableFileOptions()
    : sync_on_close(false),
      mode(Env::CREATE_IF_NON_EXISTING_TRUNCATE) { }
};

// Options specified when a file is opened for random access.
struct RandomAccessFileOptions {
  RandomAccessFileOptions() {}
};

// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
 public:
  enum FlushMode {
    FLUSH_SYNC,
    FLUSH_ASYNC
  };

  WritableFile() { }
  virtual ~WritableFile();

  // Pre-allocates 'size' bytes for the file in the underlying filesystem.
  // size bytes are added to the current pre-allocated size or to the current
  // offset, whichever is bigger. In no case is the file truncated by this
  // operation.
  virtual Status PreAllocate(uint64_t size) = 0;

  virtual Status Append(const Slice& data) = 0;

  // If possible, uses scatter-gather I/O to efficiently append
  // multiple buffers to a file. Otherwise, falls back to regular I/O.
  //
  // For implementation specific quirks and details, see comments in
  // implementation source code (e.g., env_posix.cc)
  virtual Status AppendVector(const std::vector& data_vector) = 0;

  virtual Status Close() = 0;

  // Flush all dirty data (not metadata) to disk.
  //
  // If the flush mode is synchronous, will wait for flush to finish and
  // return a meaningful status.
  virtual Status Flush(FlushMode mode) = 0;

  virtual Status Sync() = 0;

  virtual uint64_t Size() const = 0;

  // Returns the filename provided when the WritableFile was constructed.
  virtual const std::string& filename() const = 0;

 private:
  // No copying allowed
  WritableFile(const WritableFile&);
  void operator=(const WritableFile&);
};

// Creation-time options for RWFile
struct RWFileOptions {
  // Call Sync() during Close().
  bool sync_on_close;

  // See CreateMode for details.
  Env::CreateMode mode;

  RWFileOptions()
    : sync_on_close(false),
      mode(Env::CREATE_IF_NON_EXISTING_TRUNCATE) { }
};

// A file abstraction for both reading and writing. No notion of a built-in
// file offset is ever used; instead, all operations must provide an
// explicit offset.
//
// All "read" operations are safe for concurrent use by multiple threads,
// but "write" operations must be externally synchronized.
class RWFile {
 public:
  enum FlushMode {
    FLUSH_SYNC,
    FLUSH_ASYNC
  };

  RWFile() {
  }

  virtual ~RWFile();

  // Read exactly 'length' bytes from the file starting at 'offset'.
  // 'scratch[0..length-1]' may be written by this routine. Sets '*result'
  // to the data that was read. May set '*result' to point at data in
  // 'scratch[0..length-1]', which must be live when '*result' is used.
  // If an error was encountered, returns a non-OK status.
  //
  // In the event of a "short read" (fewer bytes read than were requested),
  // an IOError is returned.
  //
  // Safe for concurrent use by multiple threads.
  virtual Status Read(uint64_t offset, size_t length,
                      Slice* result, uint8_t* scratch) const = 0;

  // Writes 'data' to the file position given by 'offset'.
  virtual Status Write(uint64_t offset, const Slice& data) = 0;

  // Preallocates 'length' bytes for the file in the underlying filesystem
  // beginning at 'offset'. It is safe to preallocate the same range
  // repeatedly; this is an idempotent operation.
  //
  // In no case is the file truncated by this operation.
  virtual Status PreAllocate(uint64_t offset, size_t length) = 0;

  // Deallocates space given by 'offset' and 'length' from the file,
  // effectively "punching a hole" in it. The space will be reclaimed by
  // the filesystem and reads to that range will return zeroes. Useful
  // for making whole files sparse.
  //
  // Filesystems that don't implement this will return an error.
  virtual Status PunchHole(uint64_t offset, size_t length) = 0;

  // Flushes the range of dirty data (not metadata) given by 'offset' and
  // 'length' to disk. If length is 0, all bytes from 'offset' to the end
  // of the file are flushed.
  //
  // If the flush mode is synchronous, will wait for flush to finish and
  // return a meaningful status.
  virtual Status Flush(FlushMode mode, uint64_t offset, size_t length) = 0;

  // Synchronously flushes all dirty file data and metadata to disk. Upon
  // returning successfully, all previously issued file changes have been
  // made durable.
  virtual Status Sync() = 0;

  // Closes the file, optionally calling Sync() on it if the file was
  // created with the sync_on_close option enabled.
  virtual Status Close() = 0;

  // Retrieves the file's size.
  virtual Status Size(uint64_t* size) const = 0;

  // Returns the filename provided when the RWFile was constructed.
  virtual const std::string& filename() const = 0;

 private:
  DISALLOW_COPY_AND_ASSIGN(RWFile);
};

// Identifies a locked file.
class FileLock {
 public:
  FileLock() { }
  virtual ~FileLock();
 private:
  // No copying allowed
  FileLock(const FileLock&);
  void operator=(const FileLock&);
};

// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
                                const std::string& fname);

// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
                               faststring* data);

// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
 public:
  // Initialize an EnvWrapper that delegates all calls to *t
  explicit EnvWrapper(Env* t) : target_(t) { }
  virtual ~EnvWrapper();

  // Return the target to which this Env forwards all calls
  Env* target() const { return target_; }

  // The following text is boilerplate that forwards all methods to target()
  Status NewSequentialFile(const std::string& f, gscoped_ptr* r) OVERRIDE {
    return target_->NewSequentialFile(f, r);
  }
  Status NewRandomAccessFile(const std::string& f,
                             gscoped_ptr* r) OVERRIDE {
    return target_->NewRandomAccessFile(f, r);
  }
  Status NewRandomAccessFile(const RandomAccessFileOptions& opts,
                             const std::string& f,
                             gscoped_ptr* r) OVERRIDE {
    return target_->NewRandomAccessFile(opts, f, r);
  }
  Status NewWritableFile(const std::string& f, gscoped_ptr* r) OVERRIDE {
    return target_->NewWritableFile(f, r);
  }
  Status NewWritableFile(const WritableFileOptions& o,
                         const std::string& f,
                         gscoped_ptr* r) OVERRIDE {
    return target_->NewWritableFile(o, f, r);
  }
  Status NewTempWritableFile(const WritableFileOptions& o, const std::string& t,
                             std::string* f, gscoped_ptr* r) OVERRIDE {
    return target_->NewTempWritableFile(o, t, f, r);
  }
  Status NewRWFile(const std::string& f, gscoped_ptr* r) OVERRIDE {
    return target_->NewRWFile(f, r);
  }

  // Pass-through forwarders for the remainder of the Env interface; each
  // simply delegates to target_ with arguments unchanged.
  Status NewRWFile(const RWFileOptions& o,
                   const std::string& f,
                   gscoped_ptr* r) OVERRIDE {
    return target_->NewRWFile(o, f, r);
  }
  bool FileExists(const std::string& f) OVERRIDE { return target_->FileExists(f); }
  Status GetChildren(const std::string& dir, std::vector* r) OVERRIDE {
    return target_->GetChildren(dir, r);
  }
  Status DeleteFile(const std::string& f) OVERRIDE { return target_->DeleteFile(f); }
  Status CreateDir(const std::string& d) OVERRIDE { return target_->CreateDir(d); }
  Status SyncDir(const std::string& d) OVERRIDE { return target_->SyncDir(d); }
  Status DeleteDir(const std::string& d) OVERRIDE { return target_->DeleteDir(d); }
  Status DeleteRecursively(const std::string& d) OVERRIDE { return target_->DeleteRecursively(d); }
  Status GetFileSize(const std::string& f, uint64_t* s) OVERRIDE {
    return target_->GetFileSize(f, s);
  }
  Status GetFileSizeOnDisk(const std::string& f, uint64_t* s) OVERRIDE {
    return target_->GetFileSizeOnDisk(f, s);
  }
  Status GetBlockSize(const std::string& f, uint64_t* s) OVERRIDE {
    return target_->GetBlockSize(f, s);
  }
  Status RenameFile(const std::string& s, const std::string& t) OVERRIDE {
    return target_->RenameFile(s, t);
  }
  Status LockFile(const std::string& f, FileLock** l) OVERRIDE {
    return target_->LockFile(f, l);
  }
  Status UnlockFile(FileLock* l) OVERRIDE { return target_->UnlockFile(l); }
  virtual Status GetTestDirectory(std::string* path) OVERRIDE {
    return target_->GetTestDirectory(path);
  }
  uint64_t NowMicros() OVERRIDE {
    return target_->NowMicros();
  }
  void SleepForMicroseconds(int micros) OVERRIDE {
    target_->SleepForMicroseconds(micros);
  }
  uint64_t gettid() OVERRIDE {
    return target_->gettid();
  }
  Status GetExecutablePath(std::string* path) OVERRIDE {
    return target_->GetExecutablePath(path);
  }
  Status IsDirectory(const std::string& path, bool* is_dir) OVERRIDE {
    return target_->IsDirectory(path, is_dir);
  }
  Status Walk(const std::string&
              root,
              DirectoryOrder order,
              const WalkCallback& cb) OVERRIDE {
    return target_->Walk(root, order, cb);
  }
  Status Canonicalize(const std::string& path, std::string* result) OVERRIDE {
    return target_->Canonicalize(path, result);
  }
  Status GetTotalRAMBytes(int64_t* ram) OVERRIDE {
    return target_->GetTotalRAMBytes(ram);
  }
 private:
  // The Env to which every call is delegated.
  Env* target_;
};

} // namespace kudu

#endif  // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/src/kudu/util/env_posix.cc b/src/kudu/util/env_posix.cc
new file mode 100644
index 000000000000..d568f76b1967
--- /dev/null
+++ b/src/kudu/util/env_posix.cc
@@ -0,0 +1,1139 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// NOTE(review): the system header names inside <...> were lost when this
// patch was extracted — restore them from the upstream file.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "kudu/gutil/atomicops.h"
#include "kudu/gutil/bind.h"
#include "kudu/gutil/callback.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/util/debug/trace_event.h"
#include "kudu/util/env.h"
#include "kudu/util/errno.h"
#include "kudu/util/flag_tags.h"
#include "kudu/util/logging.h"
#include "kudu/util/malloc.h"
#include "kudu/util/monotime.h"
#include "kudu/util/path_util.h"
#include "kudu/util/slice.h"
#include "kudu/util/stopwatch.h"
#include "kudu/util/thread_restrictions.h"

#if defined(__APPLE__)
#include
#include
#else
#include
#include
#endif  // defined(__APPLE__)

// Copied from falloc.h. Useful for older kernels that lack support for
// hole punching; fallocate(2) will return EOPNOTSUPP.
#ifndef FALLOC_FL_KEEP_SIZE
#define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
#endif
#ifndef FALLOC_FL_PUNCH_HOLE
#define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */
#endif

// For platforms without fdatasync (like OS X)
#ifndef fdatasync
#define fdatasync fsync
#endif

// For platforms without unlocked_stdio (like OS X)
#ifndef fread_unlocked
#define fread_unlocked fread
#endif

// See KUDU-588 for details.
DEFINE_bool(writable_file_use_fsync, false,
            "Use fsync(2) instead of fdatasync(2) for synchronizing dirty "
            "data to disk.");
TAG_FLAG(writable_file_use_fsync, advanced);

DEFINE_bool(suicide_on_eio, true,
            "Kill the process if an I/O operation results in EIO");
TAG_FLAG(suicide_on_eio, advanced);

DEFINE_bool(never_fsync, false,
            "Never fsync() anything to disk. This is used by certain test cases to "
            "speed up runtime. This is very unsafe to use in production.");
TAG_FLAG(never_fsync, advanced);
TAG_FLAG(never_fsync, unsafe);

using base::subtle::Atomic64;
using base::subtle::Barrier_AtomicIncrement;
using std::vector;
using strings::Substitute;

static __thread uint64_t thread_local_id;
static Atomic64 cur_thread_local_id_;

namespace kudu {

namespace {

#if defined(__APPLE__)
// Simulates Linux's fallocate file preallocation API on OS X.
int fallocate(int fd, int mode, off_t offset, off_t len) {
  CHECK(mode == 0);
  off_t size = offset + len;

  struct stat stat;
  int ret = fstat(fd, &stat);
  if (ret < 0) {
    return ret;
  }

  // st_blocks is in 512-byte units regardless of the filesystem block size.
  if (stat.st_blocks * 512 < size) {
    // The offset field seems to have no effect; the file is always allocated
    // with space from 0 to the size. This is probably because OS X does not
    // support sparse files.
    fstore_t store = {F_ALLOCATECONTIG, F_PEOFPOSMODE, 0, size};
    if (fcntl(fd, F_PREALLOCATE, &store) < 0) {
      LOG(INFO) << "Unable to allocate contiguous disk space, attempting non-contiguous allocation";
      store.fst_flags = F_ALLOCATEALL;
      ret = fcntl(fd, F_PREALLOCATE, &store);
      if (ret < 0) {
        return ret;
      }
    }
  }

  if (stat.st_size < size) {
    // fcntl does not change the file size, so set it if necessary.
    return ftruncate(fd, size);
  }
  return 0;
}
#endif

// Close file descriptor when object goes out of scope.
class ScopedFdCloser {
 public:
  explicit ScopedFdCloser(int fd)
    : fd_(fd) {
  }

  ~ScopedFdCloser() {
    ThreadRestrictions::AssertIOAllowed();
    int err = ::close(fd_);
    if (PREDICT_FALSE(err != 0)) {
      PLOG(WARNING) << "Failed to close fd " << fd_;
    }
  }

 private:
  int fd_;
};

// Maps an errno value onto the corresponding Status type, preserving
// 'context' and the stringified errno in the message.
static Status IOError(const std::string& context, int err_number) {
  switch (err_number) {
    case ENOENT:
      return Status::NotFound(context, ErrnoToString(err_number), err_number);
    case EEXIST:
      return Status::AlreadyPresent(context, ErrnoToString(err_number), err_number);
    case EOPNOTSUPP:
      return Status::NotSupported(context, ErrnoToString(err_number), err_number);
    case EIO:
      if (FLAGS_suicide_on_eio) {
        // TODO: This is very, very coarse-grained. A more comprehensive
        // approach is described in KUDU-616.
        LOG(FATAL) << "Fatal I/O error, context: " << context;
      }
  }
  return Status::IOError(context, ErrnoToString(err_number), err_number);
}

// Flushes 'fd' to disk, honoring the --never_fsync and
// --writable_file_use_fsync flags.
static Status DoSync(int fd, const string& filename) {
  ThreadRestrictions::AssertIOAllowed();
  if (FLAGS_never_fsync) return Status::OK();
  if (FLAGS_writable_file_use_fsync) {
    if (fsync(fd) < 0) {
      return IOError(filename, errno);
    }
  } else {
    if (fdatasync(fd) < 0) {
      return IOError(filename, errno);
    }
  }
  return Status::OK();
}

// Opens 'filename' read-write with flags derived from 'mode'; on success the
// descriptor is returned through '*fd'.
static Status DoOpen(const string& filename, Env::CreateMode mode, int* fd) {
  ThreadRestrictions::AssertIOAllowed();
  int flags = O_RDWR;
  switch (mode) {
    case Env::CREATE_IF_NON_EXISTING_TRUNCATE:
      flags |= O_CREAT | O_TRUNC;
      break;
    case Env::CREATE_NON_EXISTING:
      flags |= O_CREAT | O_EXCL;
      break;
    case Env::OPEN_EXISTING:
      break;
    default:
      return Status::NotSupported(Substitute("Unknown create mode $0", mode));
  }
  const int f = open(filename.c_str(), flags, 0644);
  if (f < 0) {
    return IOError(filename, errno);
  }
  *fd = f;
  return Status::OK();
}

class PosixSequentialFile: public SequentialFile {
 private:
  std::string filename_;
  FILE* file_;

 public:
  PosixSequentialFile(std::string fname, FILE* f)
    : filename_(std::move(fname)), file_(f) {}
  virtual ~PosixSequentialFile() { fclose(file_); }

  virtual Status Read(size_t n, Slice* result, uint8_t* scratch) OVERRIDE {
    ThreadRestrictions::AssertIOAllowed();
    Status s;
    size_t r = fread_unlocked(scratch, 1, n, file_);
    *result = Slice(scratch, r);
    if (r < n) {
      if (feof(file_)) {
        // We leave status as ok if we hit the end of the file
      } else {
        // A partial read with an error: return a non-ok status.
        s = IOError(filename_, errno);
      }
    }
    return s;
  }

  virtual Status Skip(uint64_t n) OVERRIDE {
    TRACE_EVENT1("io", "PosixSequentialFile::Skip", "path", filename_);
    ThreadRestrictions::AssertIOAllowed();
    if (fseek(file_, n, SEEK_CUR)) {
      return IOError(filename_, errno);
    }
    return Status::OK();
  }

  virtual const string& filename() const OVERRIDE { return filename_; }
};

// pread() based random-access
class PosixRandomAccessFile: public RandomAccessFile {
 private:
  std::string filename_;
  int fd_;

 public:
  PosixRandomAccessFile(std::string fname, int fd)
    : filename_(std::move(fname)), fd_(fd) {}
  virtual ~PosixRandomAccessFile() { close(fd_); }

  virtual Status Read(uint64_t offset, size_t n, Slice* result,
                      uint8_t *scratch) const OVERRIDE {
    ThreadRestrictions::AssertIOAllowed();
    Status s;
    // NOTE(review): the cast's target type was lost in this paste —
    // restore from upstream (presumably off_t).
    ssize_t r = pread(fd_, scratch, n, static_cast(offset));
    *result = Slice(scratch, (r < 0) ? 0 : r);
    if (r < 0) {
      // An error: return a non-ok status.
      s = IOError(filename_, errno);
    }
    return s;
  }

  virtual Status Size(uint64_t *size) const OVERRIDE {
    TRACE_EVENT1("io", "PosixRandomAccessFile::Size", "path", filename_);
    ThreadRestrictions::AssertIOAllowed();
    struct stat st;
    if (fstat(fd_, &st) == -1) {
      return IOError(filename_, errno);
    }
    *size = st.st_size;
    return Status::OK();
  }

  virtual const string& filename() const OVERRIDE { return filename_; }

  virtual size_t memory_footprint() const OVERRIDE {
    return kudu_malloc_usable_size(this) + filename_.capacity();
  }
};

// Use non-memory mapped POSIX files to write data to a file.
//
// TODO (perf) investigate zeroing a pre-allocated area in
// order to further improve Sync() performance.
+class PosixWritableFile : public WritableFile { + public: + PosixWritableFile(std::string fname, int fd, uint64_t file_size, + bool sync_on_close) + : filename_(std::move(fname)), + fd_(fd), + sync_on_close_(sync_on_close), + filesize_(file_size), + pre_allocated_size_(0), + pending_sync_(false) {} + + ~PosixWritableFile() { + if (fd_ >= 0) { + WARN_NOT_OK(Close(), "Failed to close " + filename_); + } + } + + virtual Status Append(const Slice& data) OVERRIDE { + vector data_vector; + data_vector.push_back(data); + return AppendVector(data_vector); + } + + virtual Status AppendVector(const vector& data_vector) OVERRIDE { + ThreadRestrictions::AssertIOAllowed(); + static const size_t kIovMaxElements = IOV_MAX; + + Status s; + for (size_t i = 0; i < data_vector.size() && s.ok(); i += kIovMaxElements) { + size_t n = std::min(data_vector.size() - i, kIovMaxElements); + s = DoWritev(data_vector, i, n); + } + + pending_sync_ = true; + return s; + } + + virtual Status PreAllocate(uint64_t size) OVERRIDE { + TRACE_EVENT1("io", "PosixWritableFile::PreAllocate", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + uint64_t offset = std::max(filesize_, pre_allocated_size_); + if (fallocate(fd_, 0, offset, size) < 0) { + if (errno == EOPNOTSUPP) { + KLOG_FIRST_N(WARNING, 1) << "The filesystem does not support fallocate()."; + } else if (errno == ENOSYS) { + KLOG_FIRST_N(WARNING, 1) << "The kernel does not implement fallocate()."; + } else { + return IOError(filename_, errno); + } + } + pre_allocated_size_ = offset + size; + return Status::OK(); + } + + virtual Status Close() OVERRIDE { + TRACE_EVENT1("io", "PosixWritableFile::Close", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + Status s; + + // If we've allocated more space than we used, truncate to the + // actual size of the file and perform Sync(). 
+ if (filesize_ < pre_allocated_size_) { + if (ftruncate(fd_, filesize_) < 0) { + s = IOError(filename_, errno); + pending_sync_ = true; + } + } + + if (sync_on_close_) { + Status sync_status = Sync(); + if (!sync_status.ok()) { + LOG(ERROR) << "Unable to Sync " << filename_ << ": " << sync_status.ToString(); + if (s.ok()) { + s = sync_status; + } + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + return s; + } + + virtual Status Flush(FlushMode mode) OVERRIDE { + TRACE_EVENT1("io", "PosixWritableFile::Flush", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); +#if defined(__linux__) + int flags = SYNC_FILE_RANGE_WRITE; + if (mode == FLUSH_SYNC) { + flags |= SYNC_FILE_RANGE_WAIT_AFTER; + } + if (sync_file_range(fd_, 0, 0, flags) < 0) { + return IOError(filename_, errno); + } +#else + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } +#endif + return Status::OK(); + } + + virtual Status Sync() OVERRIDE { + TRACE_EVENT1("io", "PosixWritableFile::Sync", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + LOG_SLOW_EXECUTION(WARNING, 1000, Substitute("sync call for $0", filename_)) { + if (pending_sync_) { + pending_sync_ = false; + RETURN_NOT_OK(DoSync(fd_, filename_)); + } + } + return Status::OK(); + } + + virtual uint64_t Size() const OVERRIDE { + return filesize_; + } + + virtual const string& filename() const OVERRIDE { return filename_; } + + private: + + Status DoWritev(const vector& data_vector, + size_t offset, size_t n) { + ThreadRestrictions::AssertIOAllowed(); +#if defined(__linux__) + DCHECK_LE(n, IOV_MAX); + + struct iovec iov[n]; + size_t j = 0; + size_t nbytes = 0; + + for (size_t i = offset; i < offset + n; i++) { + const Slice& data = data_vector[i]; + iov[j].iov_base = const_cast(data.data()); + iov[j].iov_len = data.size(); + nbytes += data.size(); + ++j; + } + + ssize_t written = pwritev(fd_, iov, n, filesize_); + + if (PREDICT_FALSE(written == -1)) { + int err 
= errno; + return IOError(filename_, err); + } + + filesize_ += written; + + if (PREDICT_FALSE(written != nbytes)) { + return Status::IOError( + Substitute("pwritev error: expected to write $0 bytes, wrote $1 bytes instead", + nbytes, written)); + } +#else + for (size_t i = offset; i < offset + n; i++) { + const Slice& data = data_vector[i]; + ssize_t written = pwrite(fd_, data.data(), data.size(), filesize_); + if (PREDICT_FALSE(written == -1)) { + int err = errno; + return IOError("pwrite error", err); + } + + filesize_ += written; + + if (PREDICT_FALSE(written != data.size())) { + return Status::IOError( + Substitute("pwrite error: expected to write $0 bytes, wrote $1 bytes instead", + data.size(), written)); + } + } +#endif + + return Status::OK(); + } + + const std::string filename_; + int fd_; + bool sync_on_close_; + uint64_t filesize_; + uint64_t pre_allocated_size_; + + bool pending_sync_; +}; + +class PosixRWFile : public RWFile { +// is not employed. + public: + PosixRWFile(string fname, int fd, bool sync_on_close) + : filename_(std::move(fname)), + fd_(fd), + sync_on_close_(sync_on_close), + pending_sync_(false) {} + + ~PosixRWFile() { + if (fd_ >= 0) { + WARN_NOT_OK(Close(), "Failed to close " + filename_); + } + } + + virtual Status Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const OVERRIDE { + ThreadRestrictions::AssertIOAllowed(); + int rem = length; + uint8_t* dst = scratch; + while (rem > 0) { + ssize_t r = pread(fd_, dst, rem, offset); + if (r < 0) { + // An error: return a non-ok status. 
+ return IOError(filename_, errno); + } + Slice this_result(dst, r); + DCHECK_LE(this_result.size(), rem); + if (this_result.size() == 0) { + // EOF + return Status::IOError(Substitute("EOF trying to read $0 bytes at offset $1", + length, offset)); + } + dst += this_result.size(); + rem -= this_result.size(); + offset += this_result.size(); + } + DCHECK_EQ(0, rem); + *result = Slice(scratch, length); + return Status::OK(); + } + + virtual Status Write(uint64_t offset, const Slice& data) OVERRIDE { + ThreadRestrictions::AssertIOAllowed(); + ssize_t written = pwrite(fd_, data.data(), data.size(), offset); + + if (PREDICT_FALSE(written == -1)) { + int err = errno; + return IOError(filename_, err); + } + + if (PREDICT_FALSE(written != data.size())) { + return Status::IOError( + Substitute("pwrite error: expected to write $0 bytes, wrote $1 bytes instead", + data.size(), written)); + } + + pending_sync_ = true; + return Status::OK(); + } + + virtual Status PreAllocate(uint64_t offset, size_t length) OVERRIDE { + TRACE_EVENT1("io", "PosixRWFile::PreAllocate", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + if (fallocate(fd_, 0, offset, length) < 0) { + if (errno == EOPNOTSUPP) { + KLOG_FIRST_N(WARNING, 1) << "The filesystem does not support fallocate()."; + } else if (errno == ENOSYS) { + KLOG_FIRST_N(WARNING, 1) << "The kernel does not implement fallocate()."; + } else { + return IOError(filename_, errno); + } + } + return Status::OK(); + } + + virtual Status PunchHole(uint64_t offset, size_t length) OVERRIDE { +#if defined(__linux__) + TRACE_EVENT1("io", "PosixRWFile::PunchHole", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + if (fallocate(fd_, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length) < 0) { + return IOError(filename_, errno); + } + return Status::OK(); +#else + return Status::NotSupported("Hole punching not supported on this platform"); +#endif + } + + virtual Status Flush(FlushMode mode, uint64_t offset, size_t 
length) OVERRIDE { + TRACE_EVENT1("io", "PosixRWFile::Flush", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); +#if defined(__linux__) + int flags = SYNC_FILE_RANGE_WRITE; + if (mode == FLUSH_SYNC) { + flags |= SYNC_FILE_RANGE_WAIT_AFTER; + } + if (sync_file_range(fd_, offset, length, flags) < 0) { + return IOError(filename_, errno); + } +#else + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } +#endif + return Status::OK(); + } + + virtual Status Sync() OVERRIDE { + TRACE_EVENT1("io", "PosixRWFile::Sync", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + LOG_SLOW_EXECUTION(WARNING, 1000, Substitute("sync call for $0", filename())) { + if (pending_sync_) { + pending_sync_ = false; + RETURN_NOT_OK(DoSync(fd_, filename_)); + } + } + return Status::OK(); + } + + virtual Status Close() OVERRIDE { + TRACE_EVENT1("io", "PosixRWFile::Close", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + Status s; + + if (sync_on_close_) { + s = Sync(); + if (!s.ok()) { + LOG(ERROR) << "Unable to Sync " << filename_ << ": " << s.ToString(); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + return s; + } + + virtual Status Size(uint64_t* size) const OVERRIDE { + TRACE_EVENT1("io", "PosixRWFile::Size", "path", filename_); + ThreadRestrictions::AssertIOAllowed(); + struct stat st; + if (fstat(fd_, &st) == -1) { + return IOError(filename_, errno); + } + *size = st.st_size; + return Status::OK(); + } + + virtual const string& filename() const OVERRIDE { + return filename_; + } + + private: + const std::string filename_; + int fd_; + bool sync_on_close_; + bool pending_sync_; +}; + +static int LockOrUnlock(int fd, bool lock) { + ThreadRestrictions::AssertIOAllowed(); + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? 
F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + return fcntl(fd, F_SETLK, &f); +} + +class PosixFileLock : public FileLock { + public: + int fd_; +}; + +class PosixEnv : public Env { + public: + PosixEnv(); + virtual ~PosixEnv() { + fprintf(stderr, "Destroying Env::Default()\n"); + exit(1); + } + + virtual Status NewSequentialFile(const std::string& fname, + gscoped_ptr* result) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::NewSequentialFile", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + FILE* f = fopen(fname.c_str(), "r"); + if (f == nullptr) { + return IOError(fname, errno); + } else { + result->reset(new PosixSequentialFile(fname, f)); + return Status::OK(); + } + } + + virtual Status NewRandomAccessFile(const std::string& fname, + gscoped_ptr* result) OVERRIDE { + return NewRandomAccessFile(RandomAccessFileOptions(), fname, result); + } + + virtual Status NewRandomAccessFile(const RandomAccessFileOptions& opts, + const std::string& fname, + gscoped_ptr* result) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::NewRandomAccessFile", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + int fd = open(fname.c_str(), O_RDONLY); + if (fd < 0) { + return IOError(fname, errno); + } + + result->reset(new PosixRandomAccessFile(fname, fd)); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + gscoped_ptr* result) OVERRIDE { + return NewWritableFile(WritableFileOptions(), fname, result); + } + + virtual Status NewWritableFile(const WritableFileOptions& opts, + const std::string& fname, + gscoped_ptr* result) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::NewWritableFile", "path", fname); + int fd; + RETURN_NOT_OK(DoOpen(fname, opts.mode, &fd)); + return InstantiateNewWritableFile(fname, fd, opts, result); + } + + virtual Status NewTempWritableFile(const WritableFileOptions& opts, + const std::string& name_template, + std::string* created_filename, + gscoped_ptr* result) 
OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::NewTempWritableFile", "template", name_template); + ThreadRestrictions::AssertIOAllowed(); + gscoped_ptr fname(new char[name_template.size() + 1]); + ::snprintf(fname.get(), name_template.size() + 1, "%s", name_template.c_str()); + const int fd = ::mkstemp(fname.get()); + if (fd < 0) { + return IOError(Substitute("Call to mkstemp() failed on name template $0", name_template), + errno); + } + *created_filename = fname.get(); + return InstantiateNewWritableFile(*created_filename, fd, opts, result); + } + + virtual Status NewRWFile(const string& fname, + gscoped_ptr* result) OVERRIDE { + return NewRWFile(RWFileOptions(), fname, result); + } + + virtual Status NewRWFile(const RWFileOptions& opts, + const string& fname, + gscoped_ptr* result) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::NewRWFile", "path", fname); + int fd; + RETURN_NOT_OK(DoOpen(fname, opts.mode, &fd)); + result->reset(new PosixRWFile(fname, fd, opts.sync_on_close)); + return Status::OK(); + } + + virtual bool FileExists(const std::string& fname) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::FileExists", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + return access(fname.c_str(), F_OK) == 0; + } + + virtual Status GetChildren(const std::string& dir, + std::vector* result) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::GetChildren", "path", dir); + ThreadRestrictions::AssertIOAllowed(); + result->clear(); + DIR* d = opendir(dir.c_str()); + if (d == nullptr) { + return IOError(dir, errno); + } + struct dirent* entry; + // TODO: lint: Consider using readdir_r(...) instead of readdir(...) for improved thread safety. 
+ while ((entry = readdir(d)) != nullptr) { + result->push_back(entry->d_name); + } + closedir(d); + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::DeleteFile", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + Status result; + if (unlink(fname.c_str()) != 0) { + result = IOError(fname, errno); + } + return result; + }; + + virtual Status CreateDir(const std::string& name) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::CreateDir", "path", name); + ThreadRestrictions::AssertIOAllowed(); + Status result; + if (mkdir(name.c_str(), 0755) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status DeleteDir(const std::string& name) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::DeleteDir", "path", name); + ThreadRestrictions::AssertIOAllowed(); + Status result; + if (rmdir(name.c_str()) != 0) { + result = IOError(name, errno); + } + return result; + }; + + virtual Status SyncDir(const std::string& dirname) OVERRIDE { + TRACE_EVENT1("io", "SyncDir", "path", dirname); + ThreadRestrictions::AssertIOAllowed(); + if (FLAGS_never_fsync) return Status::OK(); + int dir_fd; + if ((dir_fd = open(dirname.c_str(), O_DIRECTORY|O_RDONLY)) == -1) { + return IOError(dirname, errno); + } + ScopedFdCloser fd_closer(dir_fd); + if (fsync(dir_fd) != 0) { + return IOError(dirname, errno); + } + return Status::OK(); + } + + virtual Status DeleteRecursively(const std::string &name) OVERRIDE { + return Walk(name, POST_ORDER, Bind(&PosixEnv::DeleteRecursivelyCb, + Unretained(this))); + } + + virtual Status GetFileSize(const std::string& fname, uint64_t* size) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::GetFileSize", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + s = IOError(fname, errno); + } else { + *size = sbuf.st_size; + } + return s; + } + + virtual Status GetFileSizeOnDisk(const std::string& fname, 
uint64_t* size) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::GetFileSizeOnDisk", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + s = IOError(fname, errno); + } else { + // From stat(2): + // + // The st_blocks field indicates the number of blocks allocated to + // the file, 512-byte units. (This may be smaller than st_size/512 + // when the file has holes.) + *size = sbuf.st_blocks * 512; + } + return s; + } + + virtual Status GetBlockSize(const string& fname, uint64_t* block_size) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::GetBlockSize", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + Status s; + struct stat sbuf; + if (stat(fname.c_str(), &sbuf) != 0) { + s = IOError(fname, errno); + } else { + *block_size = sbuf.st_blksize; + } + return s; + } + + virtual Status RenameFile(const std::string& src, const std::string& target) OVERRIDE { + TRACE_EVENT2("io", "PosixEnv::RenameFile", "src", src, "dst", target); + ThreadRestrictions::AssertIOAllowed(); + Status result; + if (rename(src.c_str(), target.c_str()) != 0) { + result = IOError(src, errno); + } + return result; + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::LockFile", "path", fname); + ThreadRestrictions::AssertIOAllowed(); + *lock = nullptr; + Status result; + int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + result = IOError(fname, errno); + } else if (LockOrUnlock(fd, true) == -1) { + result = IOError("lock " + fname, errno); + close(fd); + } else { + auto my_lock = new PosixFileLock; + my_lock->fd_ = fd; + *lock = my_lock; + } + return result; + } + + virtual Status UnlockFile(FileLock* lock) OVERRIDE { + TRACE_EVENT0("io", "PosixEnv::UnlockFile"); + ThreadRestrictions::AssertIOAllowed(); + PosixFileLock* my_lock = reinterpret_cast(lock); + Status result; + if (LockOrUnlock(my_lock->fd_, false) == -1) { + result = 
IOError("unlock", errno); + } + close(my_lock->fd_); + delete my_lock; + return result; + } + + virtual Status GetTestDirectory(std::string* result) OVERRIDE { + string dir; + const char* env = getenv("TEST_TMPDIR"); + if (env && env[0] != '\0') { + dir = env; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "/tmp/kudutest-%d", static_cast(geteuid())); + dir = buf; + } + // Directory may already exist + ignore_result(CreateDir(dir)); + // /tmp may be a symlink, so canonicalize the path. + return Canonicalize(dir, result); + } + + virtual uint64_t gettid() OVERRIDE { + // Platform-independent thread ID. We can't use pthread_self here, + // because that function returns a totally opaque ID, which can't be + // compared via normal means. + if (thread_local_id == 0) { + thread_local_id = Barrier_AtomicIncrement(&cur_thread_local_id_, 1); + } + return thread_local_id; + } + + virtual uint64_t NowMicros() OVERRIDE { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + virtual void SleepForMicroseconds(int micros) OVERRIDE { + ThreadRestrictions::AssertWaitAllowed(); + SleepFor(MonoDelta::FromMicroseconds(micros)); + } + + virtual Status GetExecutablePath(string* path) OVERRIDE { + uint32_t size = 64; + uint32_t len = 0; + while (true) { + gscoped_ptr buf(new char[size]); +#if defined(__linux__) + int rc = readlink("/proc/self/exe", buf.get(), size); + if (rc == -1) { + return Status::IOError("Unable to determine own executable path", "", errno); + } else if (rc >= size) { + // The buffer wasn't large enough + size *= 2; + continue; + } + len = rc; +#elif defined(__APPLE__) + if (_NSGetExecutablePath(buf.get(), &size) != 0) { + // The buffer wasn't large enough; 'size' has been updated. 
+ continue; + } + len = strlen(buf.get()); +#else +#error Unsupported platform +#endif + + path->assign(buf.get(), len); + break; + } + return Status::OK(); + } + + virtual Status IsDirectory(const string& path, bool* is_dir) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::IsDirectory", "path", path); + ThreadRestrictions::AssertIOAllowed(); + Status s; + struct stat sbuf; + if (stat(path.c_str(), &sbuf) != 0) { + s = IOError(path, errno); + } else { + *is_dir = S_ISDIR(sbuf.st_mode); + } + return s; + } + + virtual Status Walk(const string& root, DirectoryOrder order, const WalkCallback& cb) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::Walk", "path", root); + ThreadRestrictions::AssertIOAllowed(); + // Some sanity checks + CHECK_NE(root, "/"); + CHECK_NE(root, "./"); + CHECK_NE(root, "."); + CHECK_NE(root, ""); + + // FTS requires a non-const copy of the name. strdup it and free() when + // we leave scope. + gscoped_ptr name_dup(strdup(root.c_str())); + char *(paths[]) = { name_dup.get(), nullptr }; + + // FTS_NOCHDIR is important here to make this thread-safe. 
+ gscoped_ptr tree( + fts_open(paths, FTS_PHYSICAL | FTS_XDEV | FTS_NOCHDIR, nullptr)); + if (!tree.get()) { + return IOError(root, errno); + } + + FTSENT *ent = nullptr; + bool had_errors = false; + while ((ent = fts_read(tree.get())) != nullptr) { + bool doCb = false; + FileType type = DIRECTORY_TYPE; + switch (ent->fts_info) { + case FTS_D: // Directory in pre-order + if (order == PRE_ORDER) { + doCb = true; + } + break; + case FTS_DP: // Directory in post-order + if (order == POST_ORDER) { + doCb = true; + } + break; + case FTS_F: // A regular file + case FTS_SL: // A symbolic link + case FTS_SLNONE: // A broken symbolic link + case FTS_DEFAULT: // Unknown type of file + doCb = true; + type = FILE_TYPE; + break; + + case FTS_ERR: + LOG(WARNING) << "Unable to access file " << ent->fts_path + << " during walk: " << strerror(ent->fts_errno); + had_errors = true; + break; + + default: + LOG(WARNING) << "Unable to access file " << ent->fts_path + << " during walk (code " << ent->fts_info << ")"; + break; + } + if (doCb) { + if (!cb.Run(type, DirName(ent->fts_path), ent->fts_name).ok()) { + had_errors = true; + } + } + } + + if (had_errors) { + return Status::IOError(root, "One or more errors occurred"); + } + return Status::OK(); + } + + virtual Status Canonicalize(const string& path, string* result) OVERRIDE { + TRACE_EVENT1("io", "PosixEnv::Canonicalize", "path", path); + ThreadRestrictions::AssertIOAllowed(); + gscoped_ptr r(realpath(path.c_str(), nullptr)); + if (!r) { + return IOError(path, errno); + } + *result = string(r.get()); + return Status::OK(); + } + + virtual Status GetTotalRAMBytes(int64_t* ram) OVERRIDE { +#if defined(__APPLE__) + int mib[2]; + size_t length = sizeof(*ram); + + // Get the Physical memory size + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + CHECK_ERR(sysctl(mib, 2, ram, &length, nullptr, 0)) << "sysctl CTL_HW HW_MEMSIZE failed"; +#else + struct sysinfo info; + if (sysinfo(&info) < 0) { + return IOError("sysinfo() failed", errno); + } + 
*ram = info.totalram; +#endif + return Status::OK(); + } + + private: + // gscoped_ptr Deleter implementation for fts_close + struct FtsCloser { + void operator()(FTS *fts) const { + if (fts) { fts_close(fts); } + } + }; + + Status InstantiateNewWritableFile(const std::string& fname, + int fd, + const WritableFileOptions& opts, + gscoped_ptr* result) { + uint64_t file_size = 0; + if (opts.mode == OPEN_EXISTING) { + RETURN_NOT_OK(GetFileSize(fname, &file_size)); + } + result->reset(new PosixWritableFile(fname, fd, file_size, opts.sync_on_close)); + return Status::OK(); + } + + Status DeleteRecursivelyCb(FileType type, const string& dirname, const string& basename) { + string full_path = JoinPathSegments(dirname, basename); + Status s; + switch (type) { + case FILE_TYPE: + s = DeleteFile(full_path); + WARN_NOT_OK(s, "Could not delete file"); + return s; + case DIRECTORY_TYPE: + s = DeleteDir(full_path); + WARN_NOT_OK(s, "Could not delete directory"); + return s; + default: + LOG(FATAL) << "Unknown file type: " << type; + return Status::OK(); + } + } +}; + +PosixEnv::PosixEnv() {} + +} // namespace + +static pthread_once_t once = PTHREAD_ONCE_INIT; +static Env* default_env; +static void InitDefaultEnv() { default_env = new PosixEnv; } + +Env* Env::Default() { + pthread_once(&once, InitDefaultEnv); + return default_env; +} + +} // namespace kudu diff --git a/src/kudu/util/env_util.cc b/src/kudu/util/env_util.cc new file mode 100644 index 000000000000..b6b22444a15e --- /dev/null +++ b/src/kudu/util/env_util.cc @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/status.h" + +using strings::Substitute; +using std::shared_ptr; + +namespace kudu { +namespace env_util { + +Status OpenFileForWrite(Env* env, const string& path, + shared_ptr* file) { + return OpenFileForWrite(WritableFileOptions(), env, path, file); +} + +Status OpenFileForWrite(const WritableFileOptions& opts, + Env *env, const string &path, + shared_ptr *file) { + gscoped_ptr w; + RETURN_NOT_OK(env->NewWritableFile(opts, path, &w)); + file->reset(w.release()); + return Status::OK(); +} + +Status OpenFileForRandom(Env *env, const string &path, + shared_ptr *file) { + gscoped_ptr r; + RETURN_NOT_OK(env->NewRandomAccessFile(path, &r)); + file->reset(r.release()); + return Status::OK(); +} + +Status OpenFileForSequential(Env *env, const string &path, + shared_ptr *file) { + gscoped_ptr r; + RETURN_NOT_OK(env->NewSequentialFile(path, &r)); + file->reset(r.release()); + return Status::OK(); +} + +Status ReadFully(RandomAccessFile* file, uint64_t offset, size_t n, + Slice* result, uint8_t* scratch) { + + bool first_read = true; + + int rem = n; + uint8_t* dst = scratch; + while (rem > 0) { + Slice this_result; + RETURN_NOT_OK(file->Read(offset, rem, &this_result, dst)); + DCHECK_LE(this_result.size(), rem); + if (this_result.size() == 0) { + // EOF + return Status::IOError(Substitute("EOF trying to read $0 bytes at offset $1", + n, offset)); + } + + if (first_read && 
this_result.size() == n) { + // If it's the first read, we can return a zero-copy array. + *result = this_result; + return Status::OK(); + } + first_read = false; + + // Otherwise, we're going to have to do more reads and stitch + // each read together. + this_result.relocate(dst); + dst += this_result.size(); + rem -= this_result.size(); + offset += this_result.size(); + } + DCHECK_EQ(0, rem); + *result = Slice(scratch, n); + return Status::OK(); +} + +Status CreateDirIfMissing(Env* env, const string& path, bool* created) { + Status s = env->CreateDir(path); + if (created != nullptr) { + *created = s.ok(); + } + return s.IsAlreadyPresent() ? Status::OK() : s; +} + +Status CopyFile(Env* env, const string& source_path, const string& dest_path, + WritableFileOptions opts) { + gscoped_ptr source; + RETURN_NOT_OK(env->NewSequentialFile(source_path, &source)); + uint64_t size; + RETURN_NOT_OK(env->GetFileSize(source_path, &size)); + + gscoped_ptr dest; + RETURN_NOT_OK(env->NewWritableFile(opts, dest_path, &dest)); + RETURN_NOT_OK(dest->PreAllocate(size)); + + const int32_t kBufferSize = 1024 * 1024; + gscoped_ptr scratch(new uint8_t[kBufferSize]); + + uint64_t bytes_read = 0; + while (bytes_read < size) { + uint64_t max_bytes_to_read = std::min(size - bytes_read, kBufferSize); + Slice data; + RETURN_NOT_OK(source->Read(max_bytes_to_read, &data, scratch.get())); + RETURN_NOT_OK(dest->Append(data)); + bytes_read += data.size(); + } + return Status::OK(); +} + +ScopedFileDeleter::ScopedFileDeleter(Env* env, std::string path) + : env_(DCHECK_NOTNULL(env)), path_(std::move(path)), should_delete_(true) {} + +ScopedFileDeleter::~ScopedFileDeleter() { + if (should_delete_) { + bool is_dir; + Status s = env_->IsDirectory(path_, &is_dir); + WARN_NOT_OK(s, Substitute( + "Failed to determine if path is a directory: $0", path_)); + if (!s.ok()) { + return; + } + if (is_dir) { + WARN_NOT_OK(env_->DeleteDir(path_), + Substitute("Failed to remove directory: $0", path_)); + } else { + 
WARN_NOT_OK(env_->DeleteFile(path_), + Substitute("Failed to remove file: $0", path_)); + } + } +} + +} // namespace env_util +} // namespace kudu diff --git a/src/kudu/util/env_util.h b/src/kudu/util/env_util.h new file mode 100644 index 000000000000..e121f9c8ff59 --- /dev/null +++ b/src/kudu/util/env_util.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_ENV_UTIL_H +#define KUDU_UTIL_ENV_UTIL_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/env.h" + +namespace kudu { +namespace env_util { + +Status OpenFileForWrite(Env *env, const std::string &path, + std::shared_ptr *file); + +Status OpenFileForWrite(const WritableFileOptions& opts, + Env *env, const std::string &path, + std::shared_ptr *file); + +Status OpenFileForRandom(Env *env, const std::string &path, + std::shared_ptr *file); + +Status OpenFileForSequential(Env *env, const std::string &path, + std::shared_ptr *file); + +// Read exactly 'n' bytes from the given file. If fewer than 'n' bytes +// are read, returns an IOError. This differs from the underlying +// RandomAccessFile::Read(), which may return a "short read". 
+// +// Similar to RandomAccessFile::Read(), '*result' is modified to point +// to the bytes which were read. These bytes may be a copy placed in +// the 'scratch' buffer, or result may point into the underlying file +// (e.g. via mmap or other zero-copy mechanism). +// +// NOTE: even if this returns an error, some data _may_ be read into +// the provided scratch buffer, but no guarantee that that will be the +// case. +Status ReadFully(RandomAccessFile* file, uint64_t offset, size_t n, + Slice* result, uint8_t* scratch); + +// Creates the directory given by 'path', unless it already exists. +// +// If 'created' is not NULL, sets it to true if the directory was +// created, false otherwise. +Status CreateDirIfMissing(Env* env, const std::string& path, + bool* created = NULL); + +// Copy the contents of file source_path to file dest_path. +// This is not atomic, and if there is an error while reading or writing, +// a partial copy may be left in 'dest_path'. Does not fsync the parent +// directory of dest_path -- if you need durability then do that yourself. +Status CopyFile(Env* env, const std::string& source_path, const std::string& dest_path, + WritableFileOptions opts); + +// Deletes a file or directory when this object goes out of scope. +// +// The deletion may be cancelled by calling .Cancel(). +// This is typically useful for cleaning up temporary files if the +// creation of the tmp file may fail. +class ScopedFileDeleter { + public: + ScopedFileDeleter(Env* env, std::string path); + ~ScopedFileDeleter(); + + // Do not delete the file when this object goes out of scope. 
+ void Cancel() { + should_delete_ = false; + } + + private: + Env* const env_; + const std::string path_; + bool should_delete_; + + DISALLOW_COPY_AND_ASSIGN(ScopedFileDeleter); +}; + +} // namespace env_util +} // namespace kudu + +#endif diff --git a/src/kudu/util/errno-test.cc b/src/kudu/util/errno-test.cc new file mode 100644 index 000000000000..911ca14d0bde --- /dev/null +++ b/src/kudu/util/errno-test.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/errno.h" + +using std::string; + +namespace kudu { + +TEST(OsUtilTest, TestErrnoToString) { + int err = ENOENT; + + // Non-truncated result. + ASSERT_EQ("No such file or directory", ErrnoToString(err)); + + // Truncated because of a short buffer. + char buf[2]; + ErrnoToCString(err, buf, arraysize(buf)); + ASSERT_EQ("N", string(buf)); + + // Unknown error. + string expected = "Unknown error"; + ASSERT_EQ(ErrnoToString(-1).compare(0, expected.length(), expected), 0); + + // Unknown error (truncated). 
+ ErrnoToCString(-1, buf, arraysize(buf)); + ASSERT_EQ("U", string(buf)); +} + +} // namespace kudu diff --git a/src/kudu/util/errno.cc b/src/kudu/util/errno.cc new file mode 100644 index 000000000000..f08bf2f4bdde --- /dev/null +++ b/src/kudu/util/errno.cc @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/errno.h" + +#include <errno.h> +#include <string.h> + +#include "kudu/util/logging.h" + +namespace kudu { + +void ErrnoToCString(int err, char *buf, size_t buf_len) { + CHECK_GT(buf_len, 0); +#if !defined(__GLIBC__) || \ + ((_POSIX_C_SOURCE >= 200112 || _XOPEN_SOURCE >= 600) && !defined(_GNU_SOURCE)) + // Using POSIX version 'int strerror_r(...)'. 
+ int ret = strerror_r(err, buf, buf_len); + if (ret && ret != ERANGE && ret != EINVAL) { + strncpy(buf, "unknown error", buf_len); + buf[buf_len - 1] = '\0'; + } +#else + // Using GLIBC version + char* ret = strerror_r(err, buf, buf_len); + if (ret != buf) { + strncpy(buf, ret, buf_len); + buf[buf_len - 1] = '\0'; + } +#endif +} +} // namespace kudu diff --git a/src/kudu/util/errno.h b/src/kudu/util/errno.h new file mode 100644 index 000000000000..7d5416edeee8 --- /dev/null +++ b/src/kudu/util/errno.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_ERRNO_H +#define KUDU_ERRNO_H + +#include + +namespace kudu { + +void ErrnoToCString(int err, char *buf, size_t buf_len); + +// Return a string representing an errno. 
+inline static std::string ErrnoToString(int err) { + char buf[512]; + ErrnoToCString(err, buf, sizeof(buf)); + return std::string(buf); +} + +} // namespace kudu + +#endif diff --git a/src/kudu/util/failure_detector-test.cc b/src/kudu/util/failure_detector-test.cc new file mode 100644 index 000000000000..03a5789c1512 --- /dev/null +++ b/src/kudu/util/failure_detector-test.cc @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +#include +#include + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/failure_detector.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +// How often we expect a node to heartbeat to assert its "aliveness". +static const int kExpectedHeartbeatPeriodMillis = 100; + +// Number of heartbeats after which the FD will consider the node dead. +static const int kMaxMissedHeartbeats = 2; + +// Let's check for failures every 100ms on average +/- 10ms. 
+static const int kFailureMonitorMeanMillis = 100; +static const int kFailureMonitorStddevMillis = 10; + +static const char* kNodeName = "node-1"; +static const char* kTestTabletName = "test-tablet"; + +class FailureDetectorTest : public KuduTest { + public: + FailureDetectorTest() + : KuduTest(), + latch_(1), + monitor_(new RandomizedFailureMonitor(SeedRandom(), + kFailureMonitorMeanMillis, + kFailureMonitorStddevMillis)) { + } + + void FailureFunction(const std::string& name, const Status& status) { + LOG(INFO) << "Detected failure of " << name; + latch_.CountDown(); + } + + protected: + void WaitForFailure() { + latch_.Wait(); + } + + CountDownLatch latch_; + gscoped_ptr monitor_; +}; + +// Tests that we can track a node, that while we notify that we're received messages from +// that node everything is ok and that once we stop doing so the failure detection function +// gets called. +TEST_F(FailureDetectorTest, TestDetectsFailure) { + ASSERT_OK(monitor_->Start()); + + scoped_refptr detector(new TimedFailureDetector( + MonoDelta::FromMilliseconds(kExpectedHeartbeatPeriodMillis * kMaxMissedHeartbeats))); + + monitor_->MonitorFailureDetector(kTestTabletName, detector); + ASSERT_FALSE(detector->IsTracking(kNodeName)); + ASSERT_OK(detector->Track(kNodeName, + MonoTime::Now(MonoTime::FINE), + Bind(&FailureDetectorTest::FailureFunction, Unretained(this)))); + ASSERT_TRUE(detector->IsTracking(kNodeName)); + + const int kNumPeriodsToWait = 4; // Num heartbeat periods to wait for a failure. + const int kUpdatesPerPeriod = 10; // Num updates we give per period to minimize test flakiness. + + for (int i = 0; i < kNumPeriodsToWait * kUpdatesPerPeriod; i++) { + // Report in (heartbeat) to the detector. + ASSERT_OK(detector->MessageFrom(kNodeName, MonoTime::Now(MonoTime::FINE))); + + // We sleep for a fraction of heartbeat period, to minimize test flakiness. 
+ SleepFor(MonoDelta::FromMilliseconds(kExpectedHeartbeatPeriodMillis / kUpdatesPerPeriod)); + + // The latch shouldn't have counted down, since the node's been reporting that + // it's still alive. + ASSERT_EQ(1, latch_.count()); + } + + // If we stop reporting the node is alive the failure callback is eventually + // triggered and we exit. + WaitForFailure(); + + ASSERT_OK(detector->UnTrack(kNodeName)); + ASSERT_FALSE(detector->IsTracking(kNodeName)); + + ASSERT_OK(monitor_->UnmonitorFailureDetector(kTestTabletName)); + monitor_->Shutdown(); +} + +} // namespace kudu diff --git a/src/kudu/util/failure_detector.cc b/src/kudu/util/failure_detector.cc new file mode 100644 index 000000000000..426d345ab653 --- /dev/null +++ b/src/kudu/util/failure_detector.cc @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/failure_detector.h" + +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/locks.h" +#include "kudu/util/random_util.h" +#include "kudu/util/status.h" +#include "kudu/util/thread.h" + +namespace kudu { + +using std::unordered_map; +using strings::Substitute; + +const int64_t RandomizedFailureMonitor::kMinWakeUpTimeMillis = 10; + +TimedFailureDetector::TimedFailureDetector(MonoDelta failure_period) + : failure_period_(std::move(failure_period)) {} + +TimedFailureDetector::~TimedFailureDetector() { + STLDeleteValues(&nodes_); +} + +Status TimedFailureDetector::Track(const string& name, + const MonoTime& now, + const FailureDetectedCallback& callback) { + lock_guard lock(&lock_); + gscoped_ptr node(new Node); + node->permanent_name = name; + node->callback = callback; + node->last_heard_of = now; + node->status = ALIVE; + if (!InsertIfNotPresent(&nodes_, name, node.get())) { + return Status::AlreadyPresent( + Substitute("Node with name '$0' is already being monitored", name)); + } + ignore_result(node.release()); + return Status::OK(); +} + +Status TimedFailureDetector::UnTrack(const string& name) { + lock_guard lock(&lock_); + Node* node = EraseKeyReturnValuePtr(&nodes_, name); + if (PREDICT_FALSE(node == NULL)) { + return Status::NotFound(Substitute("Node with name '$0' not found", name)); + } + delete node; + return Status::OK(); +} + +bool TimedFailureDetector::IsTracking(const std::string& name) { + lock_guard lock(&lock_); + return ContainsKey(nodes_, name); +} + +Status TimedFailureDetector::MessageFrom(const std::string& name, const MonoTime& now) { + VLOG(3) << "Received message from " << name << " at " << now.ToString(); + lock_guard lock(&lock_); + Node* node = FindPtrOrNull(nodes_, name); + if (node == NULL) { + VLOG(1) << "Not tracking node: " << name; + return Status::NotFound(Substitute("Message from unknown node '$0'", name)); + 
} + node->last_heard_of = now; + node->status = ALIVE; + return Status::OK(); +} + +FailureDetector::NodeStatus TimedFailureDetector::GetNodeStatusUnlocked(const std::string& name, + const MonoTime& now) { + Node* node = FindOrDie(nodes_, name); + if (now.GetDeltaSince(node->last_heard_of).MoreThan(failure_period_)) { + node->status = DEAD; + } + return node->status; +} + +void TimedFailureDetector::CheckForFailures(const MonoTime& now) { + typedef unordered_map CallbackMap; + CallbackMap callbacks; + { + lock_guard lock(&lock_); + for (const NodeMap::value_type& entry : nodes_) { + if (GetNodeStatusUnlocked(entry.first, now) == DEAD) { + InsertOrDie(&callbacks, entry.first, entry.second->callback); + } + } + } + // Invoke failure callbacks outside of lock. + for (const CallbackMap::value_type& entry : callbacks) { + const string& node_name = entry.first; + const FailureDetectedCallback& callback = entry.second; + callback.Run(node_name, Status::RemoteError(Substitute("Node '$0' failed", node_name))); + } +} + +RandomizedFailureMonitor::RandomizedFailureMonitor(uint32_t random_seed, + int64_t period_mean_millis, + int64_t period_stddev_millis) + : period_mean_millis_(period_mean_millis), + period_stddev_millis_(period_stddev_millis), + random_(random_seed), + run_latch_(0), + shutdown_(false) { +} + +RandomizedFailureMonitor::~RandomizedFailureMonitor() { + Shutdown(); +} + +Status RandomizedFailureMonitor::Start() { + CHECK(!thread_); + run_latch_.Reset(1); + return Thread::Create("failure-monitors", "failure-monitor", + &RandomizedFailureMonitor::RunThread, + this, &thread_); +} + +void RandomizedFailureMonitor::Shutdown() { + if (!thread_) { + return; + } + + { + lock_guard l(&lock_); + if (shutdown_) { + return; + } + shutdown_ = true; + } + + run_latch_.CountDown(); + CHECK_OK(ThreadJoiner(thread_.get()).Join()); + thread_.reset(); +} + +Status RandomizedFailureMonitor::MonitorFailureDetector(const string& name, + const scoped_refptr& fd) { + lock_guard 
l(&lock_); + bool inserted = InsertIfNotPresent(&fds_, name, fd); + if (PREDICT_FALSE(!inserted)) { + return Status::AlreadyPresent(Substitute("Already monitoring failure detector '$0'", name)); + } + return Status::OK(); +} + +Status RandomizedFailureMonitor::UnmonitorFailureDetector(const string& name) { + lock_guard l(&lock_); + int count = fds_.erase(name); + if (PREDICT_FALSE(count == 0)) { + return Status::NotFound(Substitute("Failure detector '$0' not found", name)); + } + return Status::OK(); +} + +void RandomizedFailureMonitor::RunThread() { + VLOG(1) << "Failure monitor thread starting"; + + while (true) { + int64_t wait_millis = random_.Normal(period_mean_millis_, period_stddev_millis_); + if (wait_millis < kMinWakeUpTimeMillis) { + wait_millis = kMinWakeUpTimeMillis; + } + + MonoDelta wait_delta = MonoDelta::FromMilliseconds(wait_millis); + VLOG(3) << "RandomizedFailureMonitor sleeping for: " << wait_delta.ToString(); + if (run_latch_.WaitFor(wait_delta)) { + // CountDownLatch reached 0. + lock_guard lock(&lock_); + // Check if we were told to shutdown. + if (shutdown_) { + // Latch fired: exit loop. + VLOG(1) << "RandomizedFailureMonitor thread shutting down"; + return; + } + } + + // Take a copy of the FD map under the lock. + FDMap fds_copy; + { + lock_guard l(&lock_); + fds_copy = fds_; + } + + MonoTime now = MonoTime::Now(MonoTime::FINE); + for (const FDMap::value_type& entry : fds_copy) { + entry.second->CheckForFailures(now); + } + } +} + +} // namespace kudu diff --git a/src/kudu/util/failure_detector.h b/src/kudu/util/failure_detector.h new file mode 100644 index 000000000000..029001144b98 --- /dev/null +++ b/src/kudu/util/failure_detector.h @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_FAILURE_DETECTOR_H_ +#define KUDU_UTIL_FAILURE_DETECTOR_H_ + +#include +#include + +#include "kudu/gutil/callback.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/monotime.h" +#include "kudu/util/locks.h" +#include "kudu/util/random.h" +#include "kudu/util/status_callback.h" + +namespace kudu { +class MonoDelta; +class MonoTime; +class Status; +class Thread; + +// A generic interface for failure detector implementations. +// A failure detector is responsible for deciding whether a certain server is dead or alive. +class FailureDetector : public RefCountedThreadSafe { + public: + enum NodeStatus { + DEAD, + ALIVE + }; + typedef std::unordered_map StatusMap; + + typedef Callback FailureDetectedCallback; + + virtual ~FailureDetector() {} + + // Registers a node with 'name' in the failure detector. + // + // If it returns Status::OK() the failure detector will from now + // expect messages from the machine with 'name' and will trigger + // 'callback' if a failure is detected. + // + // Returns Status::AlreadyPresent() if a machine with 'name' is + // already registered in this failure detector. + virtual Status Track(const std::string& name, + const MonoTime& now, + const FailureDetectedCallback& callback) = 0; + + // Stops tracking node with 'name'. 
+ virtual Status UnTrack(const std::string& name) = 0; + + // Return true iff the named entity is currently being tracked. + virtual bool IsTracking(const std::string& name) = 0; + + // Records that a message from machine with 'name' was received at 'now'. + virtual Status MessageFrom(const std::string& name, const MonoTime& now) = 0; + + // Checks the failure status of each tracked node. If the failure criteria is + // met, the failure callback is invoked. + virtual void CheckForFailures(const MonoTime& now) = 0; +}; + +// A simple failure detector implementation that considers a node dead +// when they have not reported by a certain time interval. +class TimedFailureDetector : public FailureDetector { + public: + // Some monitorable entity. + struct Node { + std::string permanent_name; + MonoTime last_heard_of; + FailureDetectedCallback callback; + NodeStatus status; + }; + + explicit TimedFailureDetector(MonoDelta failure_period); + virtual ~TimedFailureDetector(); + + virtual Status Track(const std::string& name, + const MonoTime& now, + const FailureDetectedCallback& callback) OVERRIDE; + + virtual Status UnTrack(const std::string& name) OVERRIDE; + + virtual bool IsTracking(const std::string& name) OVERRIDE; + + virtual Status MessageFrom(const std::string& name, const MonoTime& now) OVERRIDE; + + virtual void CheckForFailures(const MonoTime& now) OVERRIDE; + + private: + typedef std::unordered_map NodeMap; + + // Check if the named failure detector has failed. + // Does not invoke the callback. + FailureDetector::NodeStatus GetNodeStatusUnlocked(const std::string& name, + const MonoTime& now); + + const MonoDelta failure_period_; + mutable simple_spinlock lock_; + NodeMap nodes_; + + DISALLOW_COPY_AND_ASSIGN(TimedFailureDetector); +}; + +// A randomized failure monitor that wakes up in normally-distributed intervals +// and runs CheckForFailures() on each failure detector it monitors. 
+// +// The wake up interval is defined by a normal distribution with the specified +// mean and standard deviation, in milliseconds, with minimum possible value +// pinned at kMinWakeUpTimeMillis. +// +// We use a random wake up interval to avoid thundering herd / lockstep problems +// when multiple nodes react to the failure of another node. +class RandomizedFailureMonitor { + public: + // The minimum time the FailureMonitor will wait. + static const int64_t kMinWakeUpTimeMillis; + + RandomizedFailureMonitor(uint32_t random_seed, + int64_t period_mean_millis, + int64_t period_std_dev_millis); + ~RandomizedFailureMonitor(); + + // Starts the failure monitor. + Status Start(); + + // Stops the failure monitor. + void Shutdown(); + + // Adds a failure detector to be monitored. + Status MonitorFailureDetector(const std::string& name, + const scoped_refptr& fd); + + // Unmonitors the failure detector with the specified name. + Status UnmonitorFailureDetector(const std::string& name); + + private: + typedef std::unordered_map > FDMap; + + // Runs the monitor thread. + void RunThread(); + + // Mean & std. deviation of random period to sleep for between checking the + // failure detectors. + const int64_t period_mean_millis_; + const int64_t period_stddev_millis_; + ThreadSafeRandom random_; + + scoped_refptr thread_; + CountDownLatch run_latch_; + + mutable simple_spinlock lock_; + FDMap fds_; + bool shutdown_; // Whether the failure monitor should shut down. + + DISALLOW_COPY_AND_ASSIGN(RandomizedFailureMonitor); +}; + +} // namespace kudu + +#endif /* KUDU_UTIL_FAILURE_DETECTOR_H_ */ diff --git a/src/kudu/util/faststring.cc b/src/kudu/util/faststring.cc new file mode 100644 index 000000000000..cf5dbd12a4b3 --- /dev/null +++ b/src/kudu/util/faststring.cc @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/faststring.h" + +#include <glog/logging.h> + +#include "kudu/gutil/gscoped_ptr.h" + +namespace kudu { + +void faststring::GrowByAtLeast(size_t count) { + // Not enough space, need to reserve more. + // Don't reserve exactly enough space for the new string -- that makes it + // too easy to write perf bugs where you get O(n^2) append. + // Instead, always expand by at least 50%. 
+ + size_t to_reserve = len_ + count; + if (len_ + count < len_ * 3 / 2) { + to_reserve = len_ * 3 / 2; + } + GrowArray(to_reserve); +} + +void faststring::GrowArray(size_t newcapacity) { + DCHECK_GE(newcapacity, capacity_); + gscoped_array newdata(new uint8_t[newcapacity]); + if (len_ > 0) { + memcpy(&newdata[0], &data_[0], len_); + } + capacity_ = newcapacity; + if (data_ != initial_data_) { + delete[] data_; + } else { + ASAN_POISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); + } + + data_ = newdata.release(); + ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); +} + + +} // namespace kudu diff --git a/src/kudu/util/faststring.h b/src/kudu/util/faststring.h new file mode 100644 index 000000000000..5bd003a74c12 --- /dev/null +++ b/src/kudu/util/faststring.h @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_FASTSTRING_H +#define KUDU_UTIL_FASTSTRING_H + +#include + +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/fastmem.h" + +namespace kudu { + +// A faststring is similar to a std::string, except that it is faster for many +// common use cases (in particular, resize() will fill with uninitialized data +// instead of memsetting to \0) +class faststring { + public: + faststring() : + data_(initial_data_), + len_(0), + capacity_(kInitialCapacity) { + } + + // Construct a string with the given capacity, in bytes. + explicit faststring(size_t capacity) + : data_(initial_data_), + len_(0), + capacity_(kInitialCapacity) { + if (capacity > capacity_) { + data_ = new uint8_t[capacity]; + capacity_ = capacity; + } + ASAN_POISON_MEMORY_REGION(data_, capacity_); + } + + ~faststring() { + ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); + if (data_ != initial_data_) { + delete[] data_; + } + } + + // Reset the valid length of the string to 0. + // + // This does not free up any memory. The capacity of the string remains unchanged. + void clear() { + resize(0); + ASAN_POISON_MEMORY_REGION(data_, capacity_); + } + + // Resize the string to the given length. + // If the new length is larger than the old length, the capacity is expanded as necessary. + // + // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are + // not cleared. + void resize(size_t newsize) { + if (newsize > capacity_) { + reserve(newsize); + } + len_ = newsize; + ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); + ASAN_UNPOISON_MEMORY_REGION(data_, len_); + } + + // Releases the underlying array; after this, the buffer is left empty. 
+ // +// NOTE: the data pointer returned by release() is not necessarily the same pointer +// that data() returned beforehand: if the contents still fit in the inline buffer, a +// freshly heap-allocated copy is handed out. The caller takes ownership of the returned +// array and must free it with delete[]. + uint8_t *release() WARN_UNUSED_RESULT { + uint8_t *ret = data_; + if (ret == initial_data_) { + ret = new uint8_t[len_]; + memcpy(ret, data_, len_); + } + len_ = 0; + capacity_ = kInitialCapacity; + data_ = initial_data_; + ASAN_POISON_MEMORY_REGION(data_, capacity_); + return ret; + } + + // Reserve space for the given total amount of data. If the current capacity is already + // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). + // + // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory + // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. + void reserve(size_t newcapacity) { + if (PREDICT_TRUE(newcapacity <= capacity_)) return; + GrowArray(newcapacity); + } + + // Append the given data to the string, resizing capacity as necessary. + void append(const void *src_v, size_t count) { + const uint8_t *src = reinterpret_cast<const uint8_t *>(src_v); + EnsureRoomForAppend(count); + ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); + + // appending short values is common enough that this + // actually helps, according to benchmarks. In theory + // memcpy_inlined should already be just as good, but this + // was ~20% faster for reading a large prefix-coded string file + // where each string was only a few chars different + if (count <= 4) { + uint8_t *p = &data_[len_]; + for (int i = 0; i < count; i++) { + *p++ = *src++; + } + } else { + strings::memcpy_inlined(&data_[len_], src, count); + } + len_ += count; + } + + // Append the given string to this string. + void append(const std::string &str) { + append(str.data(), str.size()); + } + + // Append the given character to this string. 
+ void push_back(const char byte) { + EnsureRoomForAppend(1); + ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); + data_[len_] = byte; + len_++; + } + + // Return the valid length of this string. + size_t length() const { + return len_; + } + + // Return the valid length of this string (identical to length()) + size_t size() const { + return len_; + } + + // Return the allocated capacity of this string. + size_t capacity() const { + return capacity_; + } + + // Return a pointer to the data in this string. Note that this pointer + // may be invalidated by any later non-const operation. + const uint8_t *data() const { + return &data_[0]; + } + + // Return a pointer to the data in this string. Note that this pointer + // may be invalidated by any later non-const operation. + uint8_t *data() { + return &data_[0]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + const uint8_t &at(size_t i) const { + return data_[i]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + const uint8_t &operator[](size_t i) const { + return data_[i]; + } + + // Return the given element of this string. Note that this does not perform + // any bounds checking. + uint8_t &operator[](size_t i) { + return data_[i]; + } + + // Reset the contents of this string by copying 'len' bytes from 'src'. + void assign_copy(const uint8_t *src, size_t len) { + // Reset length so that the first resize doesn't need to copy the current + // contents of the array. + len_ = 0; + resize(len); + memcpy(data(), src, len); + } + + // Reset the contents of this string by copying from the given std::string. + void assign_copy(const std::string &str) { + assign_copy(reinterpret_cast(str.c_str()), + str.size()); + } + + // Return a copy of this string as a std::string. 
+ std::string ToString() const { + return std::string(reinterpret_cast(data()), + len_); + } + + private: + DISALLOW_COPY_AND_ASSIGN(faststring); + + // If necessary, expand the buffer to fit at least 'count' more bytes. + // If the array has to be grown, it is grown by at least 50%. + void EnsureRoomForAppend(size_t count) { + if (PREDICT_TRUE(len_ + count <= capacity_)) { + return; + } + + // Call the non-inline slow path - this reduces the number of instructions + // on the hot path. + GrowByAtLeast(count); + } + + // The slow path of MakeRoomFor. Grows the buffer by either + // 'count' bytes, or 50%, whichever is more. + void GrowByAtLeast(size_t count); + + // Grow the array to the given capacity, which must be more than + // the current capacity. + void GrowArray(size_t newcapacity); + + enum { + kInitialCapacity = 32 + }; + + uint8_t* data_; + uint8_t initial_data_[kInitialCapacity]; + size_t len_; + size_t capacity_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/fault_injection.cc b/src/kudu/util/fault_injection.cc new file mode 100644 index 000000000000..ffa6cb24816d --- /dev/null +++ b/src/kudu/util/fault_injection.cc @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/fault_injection.h" + +#include +#include +#include + +#include "kudu/gutil/once.h" +#include "kudu/util/debug/leakcheck_disabler.h" +#include "kudu/util/monotime.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" + +namespace kudu { +namespace fault_injection { + +namespace { +GoogleOnceType g_random_once; +Random* g_random; + +void InitRandom() { + LOG(WARNING) << "FAULT INJECTION ENABLED!"; + LOG(WARNING) << "THIS SERVER MAY CRASH!"; + + debug::ScopedLeakCheckDisabler d; + g_random = new Random(GetRandomSeed32()); + ANNOTATE_BENIGN_RACE_SIZED(g_random, sizeof(Random), + "Racy random numbers are OK"); +} + +} // anonymous namespace + +void DoMaybeFault(const char* fault_str, double fraction) { + GoogleOnceInit(&g_random_once, InitRandom); + if (PREDICT_TRUE(g_random->NextDoubleFraction() >= fraction)) { + return; + } + + // Disable core dumps -- it's not useful to get a core dump when we're + // purposefully crashing, and some tests cause lots of server crashes + // in a loop. This avoids filling up the disk with useless cores. + struct rlimit lim; + PCHECK(getrlimit(RLIMIT_CORE, &lim) == 0); + lim.rlim_cur = 0; + PCHECK(setrlimit(RLIMIT_CORE, &lim) == 0); + + // Set coredump_filter to not dump any parts of the address space. + // Although the above disables core dumps to files, if core_pattern + // is set to a pipe rather than a file, it's not sufficient. Setting + // this pattern results in piping a very minimal dump into the core + // processor (eg abrtd), thus speeding up the crash. 
+ int f = open("/proc/self/coredump_filter", O_WRONLY); + if (f >= 0) { + write(f, "00000000", 8); + close(f); + } + + LOG(FATAL) << "Injected fault: " << fault_str; +} + +void DoInjectRandomLatency(double max_ms) { + GoogleOnceInit(&g_random_once, InitRandom); + SleepFor(MonoDelta::FromMilliseconds(g_random->NextDoubleFraction() * max_ms)); +} + +} // namespace fault_injection +} // namespace kudu diff --git a/src/kudu/util/fault_injection.h b/src/kudu/util/fault_injection.h new file mode 100644 index 000000000000..eba021b1cce5 --- /dev/null +++ b/src/kudu/util/fault_injection.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_FAULT_INJECTION_H +#define KUDU_UTIL_FAULT_INJECTION_H + +#include "kudu/gutil/macros.h" + +// With some probability, crash at the current point in the code +// by issuing LOG(FATAL). +// +// The probability is determined by the 'fraction_flag' argument. +// +// Typical usage: +// +// DEFINE_double(fault_crash_before_foo, 0.0, +// "Fraction of the time when we will crash before doing foo"); +// TAG_FLAG(fault_crash_before_foo, unsafe); +// +// This macro should be fast enough to run even in hot code paths. 
+#define MAYBE_FAULT(fraction_flag) \ + kudu::fault_injection::MaybeFault(AS_STRING(fraction_flag), fraction_flag) + +// Inject a uniformly random amount of latency between 0 and the configured +// number of milliseconds. +// +// As with above, if the flag is configured to be <= 0, then this will be evaluated +// inline and should be fast, even in hot code path. +#define MAYBE_INJECT_RANDOM_LATENCY(max_ms_flag) \ + kudu::fault_injection::MaybeInjectRandomLatency(max_ms_flag); + +// Implementation details below. +// Use the MAYBE_FAULT macro instead. +namespace kudu { +namespace fault_injection { + +// Out-of-line implementation. +void DoMaybeFault(const char* fault_str, double fraction); +void DoInjectRandomLatency(double max_latency); + +inline void MaybeFault(const char* fault_str, double fraction) { + if (PREDICT_TRUE(fraction <= 0)) return; + DoMaybeFault(fault_str, fraction); +} + +inline void MaybeInjectRandomLatency(double max_latency) { + if (PREDICT_TRUE(max_latency <= 0)) return; + DoInjectRandomLatency(max_latency); +} + +} // namespace fault_injection +} // namespace kudu +#endif /* KUDU_UTIL_FAULT_INJECTION_H */ diff --git a/src/kudu/util/flag_tags-test.cc b/src/kudu/util/flag_tags-test.cc new file mode 100644 index 000000000000..ea16b1cff2e2 --- /dev/null +++ b/src/kudu/util/flag_tags-test.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/test_util.h" + +DEFINE_int32(flag_with_no_tags, 0, "test flag that has no tags"); + +DEFINE_int32(flag_with_one_tag, 0, "test flag that has 1 tag"); +TAG_FLAG(flag_with_one_tag, stable); + +DEFINE_int32(flag_with_two_tags, 0, "test flag that has 2 tags"); +TAG_FLAG(flag_with_two_tags, evolving); +TAG_FLAG(flag_with_two_tags, unsafe); + +using std::string; +using std::unordered_set; + +namespace kudu { + +class FlagTagsTest : public KuduTest { +}; + +TEST_F(FlagTagsTest, TestTags) { + unordered_set tags; + GetFlagTags("flag_with_no_tags", &tags); + EXPECT_EQ(0, tags.size()); + + GetFlagTags("flag_with_one_tag", &tags); + EXPECT_EQ(1, tags.size()); + EXPECT_TRUE(ContainsKey(tags, "stable")); + + GetFlagTags("flag_with_two_tags", &tags); + EXPECT_EQ(2, tags.size()); + EXPECT_TRUE(ContainsKey(tags, "evolving")); + EXPECT_TRUE(ContainsKey(tags, "unsafe")); + + GetFlagTags("missing_flag", &tags); + EXPECT_EQ(0, tags.size()); +} + +} // namespace kudu diff --git a/src/kudu/util/flag_tags.cc b/src/kudu/util/flag_tags.cc new file mode 100644 index 000000000000..8b9ebe33495f --- /dev/null +++ b/src/kudu/util/flag_tags.cc @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/flag_tags.h" + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/singleton.h" + +#include +#include +#include +#include +#include + +using std::multimap; +using std::pair; +using std::string; +using std::unordered_set; +using std::vector; + +namespace kudu { +namespace flag_tags_internal { + +// Singleton registry storing the set of tags for each flag. +class FlagTagRegistry { + public: + static FlagTagRegistry* GetInstance() { + return Singleton::get(); + } + + void Tag(const string& name, const string& tag) { + tag_map_.insert(TagMap::value_type(name, tag)); + } + + void GetTags(const string& name, unordered_set* tags) { + tags->clear(); + pair range = + tag_map_.equal_range(name); + for (auto it = range.first; it != range.second; ++it) { + if (!InsertIfNotPresent(tags, it->second)) { + LOG(DFATAL) << "Flag " << name << " was tagged more than once with the tag '" + << it->second << "'"; + } + } + } + + private: + friend class Singleton; + FlagTagRegistry() {} + + typedef multimap TagMap; + TagMap tag_map_; + + DISALLOW_COPY_AND_ASSIGN(FlagTagRegistry); +}; + + +FlagTagger::FlagTagger(const char* name, const char* tag) { + FlagTagRegistry::GetInstance()->Tag(name, tag); +} + +FlagTagger::~FlagTagger() { +} + +} // namespace flag_tags_internal + +using flag_tags_internal::FlagTagRegistry; + +void GetFlagTags(const string& flag_name, + unordered_set* tags) { 
+ FlagTagRegistry::GetInstance()->GetTags(flag_name, tags); +} + +} // namespace kudu diff --git a/src/kudu/util/flag_tags.h b/src/kudu/util/flag_tags.h new file mode 100644 index 000000000000..ed2dbfd93c71 --- /dev/null +++ b/src/kudu/util/flag_tags.h @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Flag Tags provide a way to attach arbitrary textual tags to gflags in +// a global registry. Kudu uses the following flag tags: +// +// - "stable": +// These flags are considered user-facing APIs. Therefore, the +// semantics of the flag should not be changed except between major +// versions. Similarly, they must not be removed except between major +// versions. +// +// - "evolving": +// These flags are considered user-facing APIs, but are not yet +// locked down. For example, they may pertain to a newly introduced +// feature that is still being actively developed. These may be changed +// between minor versions, but should be suitably release-noted. +// +// This is the default assumed stability level, but can be tagged +// if you'd like to make it explicit. 
+// +// - "experimental": +// These flags are considered user-facing APIs, but are related to +// an experimental feature, or otherwise likely to change or be +// removed at any point. Users should not expect any compatibility +// of these flags. +// +// TODO: we should add a new flag like -unlock_experimental_flags +// which would be required if the user wants to use any of these, +// similar to the JVM's -XX:+UnlockExperimentalVMOptions. +// +// - "hidden": +// These flags are for internal use only (e.g. testing) and should +// not be included in user-facing documentation. +// +// - "advanced": +// These flags are for advanced users or debugging purposes. While +// they aren't likely to be actively harmful (see "unsafe" below), +// they're also likely to be used only rarely and should be relegated +// to more detailed sections of documentation. +// +// - "unsafe": +// These flags are for internal use only (e.g. testing), and changing +// them away from the defaults may result in arbitrarily bad things +// happening. These flags are automatically excluded from user-facing +// documentation even if they are not also marked 'hidden'. +// +// TODO: we should add a flag -unlock_unsafe_flags which would be required +// to use any of these flags. +// +// - "runtime": +// These flags can be safely changed at runtime via an RPC to the +// server. Changing a flag at runtime that does not have this tag is allowed +// only if the user specifies a "force_unsafe_change" flag in the RPC. +// +// NOTE: because gflags are simple global variables, it's important to +// think very carefully before tagging a flag with 'runtime'. In particular, +// if a string-type flag is marked 'runtime', you should never access it +// using the raw 'FLAGS_foo_bar' name. Instead, you must use the +// google::GetCommandLineFlagInfo(...) API to make a copy of the flag value +// under a lock. Otherwise, the 'std::string' instance could be mutated +// underneath the reader causing a crash. 
+// +// For primitive-type flags, we assume that reading a variable is atomic. +// That is to say that a reader will either see the old value or the new +// one, but not some invalid value. However, for the runtime change to +// have any effect, you must be sure to use the FLAGS_foo_bar variable directly +// rather than initializing some instance variable during program startup. +// +// A given flag may have zero or more tags associated with it. The system does +// not make any attempt to check integrity of the tags - for example, it allows +// you to mark a flag as both stable and unstable, even though this makes no +// real sense. Nevertheless, you should strive to meet the following requirements: +// +// - A flag should have exactly no more than one of stable/evolving/experimental +// indicating its stability. 'evolving' is considered the default. +// - A flag should have no more than one of advanced/hidden indicating visibility +// in documentation. If neither is specified, the flag will be in the main +// section of the documentation. +// - It is likely that most 'experimental' flags will also be 'advanced' or 'hidden', +// and that 'stable' flags are not likely to be 'hidden' or 'unsafe'. +// +// To add a tag to a flag, use the TAG_FLAG macro. For example: +// +// DEFINE_bool(sometimes_crash, false, "This flag makes Kudu crash a lot"); +// TAG_FLAG(sometimes_crash, unsafe); +// TAG_FLAG(sometimes_crash, runtime); +// +// To fetch the list of tags associated with a flag, use 'GetFlagTags'. + +#ifndef KUDU_UTIL_FLAG_TAGS_H +#define KUDU_UTIL_FLAG_TAGS_H + +#include "kudu/gutil/macros.h" + +#include +#include +#include + +namespace kudu { + +struct FlagTags { + enum { + stable, + evolving, + experimental, + hidden, + advanced, + unsafe, + runtime + }; +}; + +// Tag the flag 'flag_name' with the given tag 'tag'. +// +// This verifies that 'flag_name' is a valid gflag, which must be defined +// or declared above the use of the TAG_FLAG macro. 
+// +// This also validates that 'tag' is a valid flag as defined in the FlagTags +// enum above. +#define TAG_FLAG(flag_name, tag) \ + COMPILE_ASSERT(sizeof(FLAGS_##flag_name), flag_does_not_exist); \ + COMPILE_ASSERT(sizeof(::kudu::FlagTags::tag), invalid_tag); \ + namespace { \ + ::kudu::flag_tags_internal::FlagTagger t_##flag_name##_##tag( \ + AS_STRING(flag_name), AS_STRING(tag)); \ + } + +// Fetch the list of flags associated with the given flag. +// +// If the flag is invalid or has no tags, sets 'tags' to be empty. +void GetFlagTags(const std::string& flag_name, + std::unordered_set* tags); + +// ------------------------------------------------------------ +// Internal implementation details +// ------------------------------------------------------------ +namespace flag_tags_internal { + +class FlagTagger { + public: + FlagTagger(const char* name, const char* tag); + ~FlagTagger(); + + private: + DISALLOW_COPY_AND_ASSIGN(FlagTagger); +}; + +} // namespace flag_tags_internal + +} // namespace kudu +#endif /* KUDU_UTIL_FLAG_TAGS_H */ diff --git a/src/kudu/util/flags.cc b/src/kudu/util/flags.cc new file mode 100644 index 000000000000..17e6ee5ead1d --- /dev/null +++ b/src/kudu/util/flags.cc @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/flags.h" + +#include +#include +#include +#include + +#include +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/path_util.h" +#include "kudu/util/url-coding.h" +#include "kudu/util/version_info.h" + +using google::CommandLineFlagInfo; +using std::cout; +using std::endl; +using std::string; +using std::unordered_set; + +// Because every binary initializes its flags here, we use it as a convenient place +// to offer some global flags as well. +DEFINE_bool(dump_metrics_json, false, + "Dump a JSON document describing all of the metrics which may be emitted " + "by this binary."); +TAG_FLAG(dump_metrics_json, hidden); + +DEFINE_bool(enable_process_lifetime_heap_profiling, false, "Enables heap " + "profiling for the lifetime of the process. Profile output will be stored in the " + "directory specified by -heap_profile_path. Enabling this option will disable the " + "on-demand/remote server profile handlers."); +TAG_FLAG(enable_process_lifetime_heap_profiling, stable); +TAG_FLAG(enable_process_lifetime_heap_profiling, advanced); + +DEFINE_string(heap_profile_path, "", "Output path to store heap profiles. If not set " \ + "profiles are stored in /tmp/...heap."); +TAG_FLAG(heap_profile_path, stable); +TAG_FLAG(heap_profile_path, advanced); + +// Tag a bunch of the flags that we inherit from glog/gflags. + +//------------------------------------------------------------ +// GLog flags +//------------------------------------------------------------ +// Most of these are considered stable. The ones related to email are +// marked unsafe because sending email inline from a server is a pretty +// bad idea. 
+DECLARE_string(alsologtoemail); +TAG_FLAG(alsologtoemail, hidden); +TAG_FLAG(alsologtoemail, unsafe); + +// --alsologtostderr is deprecated in favor of --stderrthreshold +DECLARE_bool(alsologtostderr); +TAG_FLAG(alsologtostderr, hidden); +TAG_FLAG(alsologtostderr, runtime); + +DECLARE_bool(colorlogtostderr); +TAG_FLAG(colorlogtostderr, stable); +TAG_FLAG(colorlogtostderr, runtime); + +DECLARE_bool(drop_log_memory); +TAG_FLAG(drop_log_memory, advanced); +TAG_FLAG(drop_log_memory, runtime); + +DECLARE_string(log_backtrace_at); +TAG_FLAG(log_backtrace_at, advanced); + +DECLARE_string(log_dir); +TAG_FLAG(log_dir, stable); + +DECLARE_string(log_link); +TAG_FLAG(log_link, stable); +TAG_FLAG(log_link, advanced); + +DECLARE_bool(log_prefix); +TAG_FLAG(log_prefix, stable); +TAG_FLAG(log_prefix, advanced); +TAG_FLAG(log_prefix, runtime); + +DECLARE_int32(logbuflevel); +TAG_FLAG(logbuflevel, advanced); +TAG_FLAG(logbuflevel, runtime); +DECLARE_int32(logbufsecs); +TAG_FLAG(logbufsecs, advanced); +TAG_FLAG(logbufsecs, runtime); + +DECLARE_int32(logemaillevel); +TAG_FLAG(logemaillevel, hidden); +TAG_FLAG(logemaillevel, unsafe); + +DECLARE_string(logmailer); +TAG_FLAG(logmailer, hidden); + +DECLARE_bool(logtostderr); +TAG_FLAG(logtostderr, stable); +TAG_FLAG(logtostderr, runtime); + +DECLARE_int32(max_log_size); +TAG_FLAG(max_log_size, stable); +TAG_FLAG(max_log_size, runtime); + +DECLARE_int32(minloglevel); +TAG_FLAG(minloglevel, stable); +TAG_FLAG(minloglevel, advanced); +TAG_FLAG(minloglevel, runtime); + +DECLARE_int32(stderrthreshold); +TAG_FLAG(stderrthreshold, stable); +TAG_FLAG(stderrthreshold, advanced); +TAG_FLAG(stderrthreshold, runtime); + +DECLARE_bool(stop_logging_if_full_disk); +TAG_FLAG(stop_logging_if_full_disk, stable); +TAG_FLAG(stop_logging_if_full_disk, advanced); +TAG_FLAG(stop_logging_if_full_disk, runtime); + +DECLARE_int32(v); +TAG_FLAG(v, stable); +TAG_FLAG(v, advanced); +TAG_FLAG(v, runtime); + +DECLARE_string(vmodule); +TAG_FLAG(vmodule, stable); 
+TAG_FLAG(vmodule, advanced); + +DECLARE_bool(symbolize_stacktrace); +TAG_FLAG(symbolize_stacktrace, stable); +TAG_FLAG(symbolize_stacktrace, runtime); +TAG_FLAG(symbolize_stacktrace, advanced); + +//------------------------------------------------------------ +// GFlags flags +//------------------------------------------------------------ +DECLARE_string(flagfile); +TAG_FLAG(flagfile, stable); + +DECLARE_string(fromenv); +TAG_FLAG(fromenv, stable); +TAG_FLAG(fromenv, advanced); + +DECLARE_string(tryfromenv); +TAG_FLAG(tryfromenv, stable); +TAG_FLAG(tryfromenv, advanced); + +DECLARE_string(undefok); +TAG_FLAG(undefok, stable); +TAG_FLAG(undefok, advanced); + +DECLARE_int32(tab_completion_columns); +TAG_FLAG(tab_completion_columns, stable); +TAG_FLAG(tab_completion_columns, hidden); + +DECLARE_string(tab_completion_word); +TAG_FLAG(tab_completion_word, stable); +TAG_FLAG(tab_completion_word, hidden); + +DECLARE_bool(help); +TAG_FLAG(help, stable); + +DECLARE_bool(helpfull); +// We hide -helpfull because it's the same as -help for now. 
+TAG_FLAG(helpfull, stable); +TAG_FLAG(helpfull, hidden); + +DECLARE_string(helpmatch); +TAG_FLAG(helpmatch, stable); +TAG_FLAG(helpmatch, advanced); + +DECLARE_string(helpon); +TAG_FLAG(helpon, stable); +TAG_FLAG(helpon, advanced); + +DECLARE_bool(helppackage); +TAG_FLAG(helppackage, stable); +TAG_FLAG(helppackage, advanced); + +DECLARE_bool(helpshort); +TAG_FLAG(helpshort, stable); +TAG_FLAG(helpshort, advanced); + +DECLARE_bool(helpxml); +TAG_FLAG(helpxml, stable); +TAG_FLAG(helpxml, advanced); + +DECLARE_bool(version); +TAG_FLAG(version, stable); + +namespace kudu { +namespace { + +void AppendXMLTag(const char* tag, const string& txt, string* r) { + strings::SubstituteAndAppend(r, "<$0>$1", tag, EscapeForHtmlToString(txt)); +} + +static string DescribeOneFlagInXML(const CommandLineFlagInfo& flag) { + unordered_set tags; + GetFlagTags(flag.name, &tags); + + string r(""); + AppendXMLTag("file", flag.filename, &r); + AppendXMLTag("name", flag.name, &r); + AppendXMLTag("meaning", flag.description, &r); + AppendXMLTag("default", flag.default_value, &r); + AppendXMLTag("current", flag.current_value, &r); + AppendXMLTag("type", flag.type, &r); + AppendXMLTag("tags", JoinStrings(tags, ","), &r); + r += ""; + return r; +} + +void DumpFlagsXML() { + vector flags; + GetAllFlags(&flags); + + cout << "" << endl; + cout << "" << endl; + cout << strings::Substitute( + "$0", + EscapeForHtmlToString(BaseName(google::ProgramInvocationShortName()))) << endl; + cout << strings::Substitute( + "$0", + EscapeForHtmlToString(google::ProgramUsage())) << endl; + + for (const CommandLineFlagInfo& flag : flags) { + cout << DescribeOneFlagInXML(flag) << std::endl; + } + + cout << "" << endl; + exit(1); +} + +void ShowVersionAndExit() { + cout << VersionInfo::GetAllVersionInfo() << endl; + exit(0); +} + +} // anonymous namespace + +int ParseCommandLineFlags(int* argc, char*** argv, bool remove_flags) { + int ret = google::ParseCommandLineNonHelpFlags(argc, argv, remove_flags); + + if 
(FLAGS_helpxml) { + DumpFlagsXML(); + } else if (FLAGS_dump_metrics_json) { + MetricPrototypeRegistry::get()->WriteAsJsonAndExit(); + } else if (FLAGS_version) { + ShowVersionAndExit(); + } else { + google::HandleCommandLineHelpFlags(); + } + + if (FLAGS_heap_profile_path.empty()) { + FLAGS_heap_profile_path = strings::Substitute( + "/tmp/$0.$1", google::ProgramInvocationShortName(), getpid()); + } + +#ifdef TCMALLOC_ENABLED + if (FLAGS_enable_process_lifetime_heap_profiling) { + HeapProfilerStart(FLAGS_heap_profile_path.c_str()); + } +#endif + + return ret; +} + +} // namespace kudu diff --git a/src/kudu/util/flags.h b/src/kudu/util/flags.h new file mode 100644 index 000000000000..8cbd255c0b92 --- /dev/null +++ b/src/kudu/util/flags.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_FLAGS_H +#define KUDU_UTIL_FLAGS_H + +#include "kudu/gutil/macros.h" + +namespace kudu { + +// Looks for flags in argv and parses them. Rearranges argv to put +// flags first, or removes them entirely if remove_flags is true. +// If a flag is defined more than once in the command line or flag +// file, the last definition is used. Returns the index (into argv) +// of the first non-flag argument. 
+// +// This is a wrapper around google::ParseCommandLineFlags, but integrates +// with Kudu flag tags. For example, --helpxml will include the list of +// tags for each flag. This should be be used instead of +// google::ParseCommandLineFlags in any user-facing binary. +// +// See gflags.h for more information. +int ParseCommandLineFlags(int* argc, char*** argv, bool remove_flags); + +} // namespace kudu +#endif /* KUDU_UTIL_FLAGS_H */ diff --git a/src/kudu/util/group_varint-inl.h b/src/kudu/util/group_varint-inl.h new file mode 100644 index 000000000000..8f418b613384 --- /dev/null +++ b/src/kudu/util/group_varint-inl.h @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_GROUP_VARINT_INL_H +#define KUDU_UTIL_GROUP_VARINT_INL_H + +#include +#include +#include +#include + +#include "kudu/util/faststring.h" + +namespace kudu { +namespace coding { + +extern bool SSE_TABLE_INITTED; +extern uint8_t SSE_TABLE[256 * 16] __attribute__((aligned(16))); +extern uint8_t VARINT_SELECTOR_LENGTHS[256]; + +const uint32_t MASKS[4] = { 0xff, 0xffff, 0xffffff, 0xffffffff }; + + +// Calculate the number of bytes to encode the given unsigned int. 
+inline size_t CalcRequiredBytes32(uint32_t i) { + // | 1 because the result is undefined for the 0 case + return sizeof(uint32_t) - __builtin_clz(i|1)/8; +} + +// Decode a set of 4 group-varint encoded integers from the given pointer. +// +// Requires that there are at up to 3 extra bytes remaining in 'src' after +// the last integer. +// +// Returns a pointer following the last decoded integer. +inline const uint8_t *DecodeGroupVarInt32( + const uint8_t *src, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) { + + uint8_t a_sel = (*src & BOOST_BINARY(11 00 00 00)) >> 6; + uint8_t b_sel = (*src & BOOST_BINARY(00 11 00 00)) >> 4; + uint8_t c_sel = (*src & BOOST_BINARY(00 00 11 00)) >> 2; + uint8_t d_sel = (*src & BOOST_BINARY(00 00 00 11 )); + + src++; // skip past selector byte + + *a = *reinterpret_cast(src) & MASKS[a_sel]; + src += a_sel + 1; + + *b = *reinterpret_cast(src) & MASKS[b_sel]; + src += b_sel + 1; + + *c = *reinterpret_cast(src) & MASKS[c_sel]; + src += c_sel + 1; + + *d = *reinterpret_cast(src) & MASKS[d_sel]; + src += d_sel + 1; + + return src; +} + +// Decode total length of the encoded integers from the given pointer, +// include the tag byte. +inline size_t DecodeGroupVarInt32_GetGroupSize(const uint8_t *src) { + return VARINT_SELECTOR_LENGTHS[*src] + 1; +} + +// Decode a set of 4 group-varint encoded integers from the given pointer. +// +// Returns a pointer following the last decoded integer. 
+inline const uint8_t *DecodeGroupVarInt32_SlowButSafe( + const uint8_t *src, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) { + + // VARINT_SELECTOR_LENGTHS[] isn't initialized until SSE_TABLE_INITTED is true + DCHECK(SSE_TABLE_INITTED); + + const size_t total_len = DecodeGroupVarInt32_GetGroupSize(src); + + uint8_t safe_buf[17]; + memcpy(safe_buf, src, total_len); + DecodeGroupVarInt32(safe_buf, a, b, c, d); + return src + total_len; +} + + +inline void DoExtractM128(__m128i results, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) { +#define SSE_USE_EXTRACT_PS +#ifdef SSE_USE_EXTRACT_PS + // _mm_extract_ps turns into extractps, which is slightly faster + // than _mm_extract_epi32 (which turns into pextrd) + // Apparently pextrd involves one more micro-op + // than extractps. + // + // A uint32 cfile macro-benchmark is about 3% faster with this code path. + *a = _mm_extract_ps((__v4sf)results, 0); + *b = _mm_extract_ps((__v4sf)results, 1); + *c = _mm_extract_ps((__v4sf)results, 2); + *d = _mm_extract_ps((__v4sf)results, 3); +#else + *a = _mm_extract_epi32(results, 0); + *b = _mm_extract_epi32(results, 1); + *c = _mm_extract_epi32(results, 2); + *d = _mm_extract_epi32(results, 3); +#endif +} + +// Same as above, but uses SSE so may be faster. +// TODO: remove this and just automatically pick the right implementation at runtime. +// +// NOTE: the src buffer must be have at least 17 bytes remaining in it, so this +// code path is not usable at the end of a block. 
+inline const uint8_t *DecodeGroupVarInt32_SSE( + const uint8_t *src, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) { + + DCHECK(SSE_TABLE_INITTED); + + uint8_t sel_byte = *src++; + __m128i shuffle_mask = _mm_load_si128( + reinterpret_cast<__m128i *>(&SSE_TABLE[sel_byte * 16])); + __m128i data = _mm_loadu_si128(reinterpret_cast(src)); + + __m128i results = _mm_shuffle_epi8(data, shuffle_mask); + + // It would look like the following would be most efficient, + // since it turns into a single movdqa instruction: + // *reinterpret_cast<__m128i *>(ret) = results; + // (where ret is an aligned array of ints, which the user must pass) + // but it is actually slower than the below alternatives by a + // good amount -- even though these result in more instructions. + DoExtractM128(results, a, b, c, d); + src += VARINT_SELECTOR_LENGTHS[sel_byte]; + + return src; +} + +// Optimized function which decodes a group of uint32s from 'src' into 'ret', +// which should have enough space for 4 uint32s. During decoding, adds 'add' +// to the vector in parallel. +// +// NOTE: the src buffer must be have at least 17 bytes remaining in it, so this +// code path is not usable at the end of a block. +inline const uint8_t *DecodeGroupVarInt32_SSE_Add( + const uint8_t *src, + uint32_t *ret, + __m128i add) { + + DCHECK(SSE_TABLE_INITTED); + + uint8_t sel_byte = *src++; + __m128i shuffle_mask = _mm_load_si128( + reinterpret_cast<__m128i *>(&SSE_TABLE[sel_byte * 16])); + __m128i data = _mm_loadu_si128(reinterpret_cast(src)); + + __m128i decoded_deltas = _mm_shuffle_epi8(data, shuffle_mask); + __m128i results = _mm_add_epi32(decoded_deltas, add); + + DoExtractM128(results, &ret[0], &ret[1], &ret[2], &ret[3]); + + src += VARINT_SELECTOR_LENGTHS[sel_byte]; + return src; +} + + +// Append a set of group-varint encoded integers to the given faststring. 
+inline void AppendGroupVarInt32( + faststring *s, + uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + + uint8_t a_tag = CalcRequiredBytes32(a) - 1; + uint8_t b_tag = CalcRequiredBytes32(b) - 1; + uint8_t c_tag = CalcRequiredBytes32(c) - 1; + uint8_t d_tag = CalcRequiredBytes32(d) - 1; + + uint8_t prefix_byte = + (a_tag << 6) | + (b_tag << 4) | + (c_tag << 2) | + (d_tag); + + uint8_t size = 1 + + a_tag + 1 + + b_tag + 1 + + c_tag + 1 + + d_tag + 1; + + size_t old_size = s->size(); + + // Reserving 4 extra bytes means we can use simple + // 4-byte stores instead of variable copies here -- + // if we hang off the end of the array into the "empty" area, it's OK. + // We'll chop it back off down below. + s->resize(old_size + size + 4); + uint8_t *ptr = &((*s)[old_size]); + +#if __BYTE_ORDER != __LITTLE_ENDIAN +#error dont support big endian currently +#endif + + *ptr++ = prefix_byte; + memcpy(ptr, &a, 4); + ptr += a_tag + 1; + memcpy(ptr, &b, 4); + ptr += b_tag + 1; + memcpy(ptr, &c, 4); + ptr += c_tag + 1; + memcpy(ptr, &d, 4); + + s->resize(old_size + size); +} + +// Append a sequence of uint32s encoded using group-varint. +// +// 'frame_of_reference' is also subtracted from each integer +// before encoding. +// +// If frame_of_reference is greater than any element in the array, +// results are undefined. 
+// +// For best performance, users should already have reserved adequate +// space in 's' (CalcRequiredBytes32 can be handy here) +inline void AppendGroupVarInt32Sequence(faststring *s, uint32_t frame_of_reference, + uint32_t *ints, size_t size) { + uint32_t *p = ints; + while (size >= 4) { + AppendGroupVarInt32(s, + p[0] - frame_of_reference, + p[1] - frame_of_reference, + p[2] - frame_of_reference, + p[3] - frame_of_reference); + size -= 4; + p += 4; + } + + + uint32_t trailer[4] = {0, 0, 0, 0}; + uint32_t *trailer_p = &trailer[0]; + + if (size > 0) { + while (size > 0) { + *trailer_p++ = *p++ - frame_of_reference; + size--; + } + + AppendGroupVarInt32(s, trailer[0], trailer[1], trailer[2], trailer[3]); + } +} + + +} // namespace coding +} // namespace kudu + +#endif diff --git a/src/kudu/util/group_varint-test.cc b/src/kudu/util/group_varint-test.cc new file mode 100644 index 000000000000..62176ef28b05 --- /dev/null +++ b/src/kudu/util/group_varint-test.cc @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/util/group_varint-inl.h" +#include "kudu/util/stopwatch.h" + +namespace kudu { +namespace coding { + +extern void DumpSSETable(); + +// Encodes the given four ints as group-varint, then +// decodes and ensures the result is the same. +static void DoTestRoundTripGVI32( + uint32_t a, uint32_t b, uint32_t c, uint32_t d, + bool use_sse = false) { + faststring buf; + AppendGroupVarInt32(&buf, a, b, c, d); + + int real_size = buf.size(); + + // The implementations actually read past the group varint, + // so append some extra padding data to ensure that it's not reading + // uninitialized memory. The SSE implementation uses 128-bit reads + // and the non-SSE one uses 32-bit reads. + buf.append(string('x', use_sse ? 16 : 4)); + + uint32_t ret[4]; + + const uint8_t *end; + + if (use_sse) { + end = DecodeGroupVarInt32_SSE( + buf.data(), &ret[0], &ret[1], &ret[2], &ret[3]); + } else { + end = DecodeGroupVarInt32( + buf.data(), &ret[0], &ret[1], &ret[2], &ret[3]); + } + + ASSERT_EQ(a, ret[0]); + ASSERT_EQ(b, ret[1]); + ASSERT_EQ(c, ret[2]); + ASSERT_EQ(d, ret[3]); + ASSERT_EQ(end, buf.data() + real_size); +} + + +TEST(TestGroupVarInt, TestSSETable) { + DumpSSETable(); + faststring buf; + AppendGroupVarInt32(&buf, 0, 0, 0, 0); + DoTestRoundTripGVI32(0, 0, 0, 0, true); + DoTestRoundTripGVI32(1, 2, 3, 4, true); + DoTestRoundTripGVI32(1, 2000, 3, 200000, true); +} + +TEST(TestGroupVarInt, TestGroupVarInt) { + faststring buf; + AppendGroupVarInt32(&buf, 0, 0, 0, 0); + ASSERT_EQ(5UL, buf.size()); + ASSERT_EQ(0, memcmp("\x00\x00\x00\x00\x00", buf.data(), 5)); + buf.clear(); + + // All 1-byte + AppendGroupVarInt32(&buf, 1, 2, 3, 254); + ASSERT_EQ(5UL, buf.size()); + ASSERT_EQ(0, memcmp("\x00\x01\x02\x03\xfe", buf.data(), 5)); + buf.clear(); + + // Mixed 1-byte and 2-byte + AppendGroupVarInt32(&buf, 256, 2, 3, 65535); + ASSERT_EQ(7UL, buf.size()); + ASSERT_EQ(BOOST_BINARY(01 00 00 01), buf.at(0)); + ASSERT_EQ(256, 
*reinterpret_cast(&buf[1])); + ASSERT_EQ(2, *reinterpret_cast(&buf[3])); + ASSERT_EQ(3, *reinterpret_cast(&buf[4])); + ASSERT_EQ(65535, *reinterpret_cast(&buf[5])); +} + + +// Round-trip encode/decodes using group varint +TEST(TestGroupVarInt, TestRoundTrip) { + // A few simple tests. + DoTestRoundTripGVI32(0, 0, 0, 0); + DoTestRoundTripGVI32(1, 2, 3, 4); + DoTestRoundTripGVI32(1, 2000, 3, 200000); + + // Then a randomized test. + for (int i = 0; i < 10000; i++) { + DoTestRoundTripGVI32(random(), random(), random(), random()); + } +} + +#ifdef NDEBUG +TEST(TestGroupVarInt, EncodingBenchmark) { + int n_ints = 1000000; + + std::vector ints; + ints.reserve(n_ints); + for (int i = 0; i < n_ints; i++) { + ints.push_back(i); + } + + faststring s; + // conservative reservation + s.reserve(ints.size() * 4); + + LOG_TIMING(INFO, "Benchmark") { + for (int i = 0; i < 100; i++) { + s.clear(); + AppendGroupVarInt32Sequence(&s, 0, &ints[0], n_ints); + } + } +} +#endif +} // namespace coding +} // namespace kudu diff --git a/src/kudu/util/group_varint.cc b/src/kudu/util/group_varint.cc new file mode 100644 index 000000000000..53c362bab670 --- /dev/null +++ b/src/kudu/util/group_varint.cc @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/util/group_varint-inl.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/slice.h" + +namespace kudu { +namespace coding { + +bool SSE_TABLE_INITTED = false; +uint8_t SSE_TABLE[256 * 16] __attribute__((aligned(16))); +uint8_t VARINT_SELECTOR_LENGTHS[256]; + +__attribute__((constructor)) +static void InitializeSSETables() { + memset(SSE_TABLE, 0xff, sizeof(SSE_TABLE)); + + for (int i = 0; i < 256; i++) { + uint32_t *entry = reinterpret_cast(&SSE_TABLE[i * 16]); + + uint8_t selectors[] = { + static_cast((i & BOOST_BINARY(11 00 00 00)) >> 6), + static_cast((i & BOOST_BINARY(00 11 00 00)) >> 4), + static_cast((i & BOOST_BINARY(00 00 11 00)) >> 2), + static_cast((i & BOOST_BINARY(00 00 00 11))) }; + + // 00000000 -> + // 00 ff ff ff 01 ff ff ff 02 ff ff ff 03 ff ff ff + + // 01000100 -> + // 00 01 ff ff 02 ff ff ff 03 04 ff ff 05 ff ff ff + + uint8_t offset = 0; + + for (int j = 0; j < 4; j++) { + uint8_t num_bytes = selectors[j] + 1; + uint8_t *entry_bytes = reinterpret_cast(&entry[j]); + + for (int k = 0; k < num_bytes; k++) { + *entry_bytes++ = offset++; + } + } + + VARINT_SELECTOR_LENGTHS[i] = offset; + } + + SSE_TABLE_INITTED = true; +} + +void DumpSSETable() { + LOG(INFO) << "SSE table:\n" + << kudu::HexDump(Slice(SSE_TABLE, sizeof(SSE_TABLE))); +} + + + +} // namespace coding +} // namespace kudu diff --git a/src/kudu/util/hash_util-test.cc b/src/kudu/util/hash_util-test.cc new file mode 100644 index 000000000000..a88f275ed543 --- /dev/null +++ b/src/kudu/util/hash_util-test.cc @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/test_util.h" + +#include "kudu/util/hash_util.h" + +namespace kudu { + +// Test Murmur2 Hash64 returns the expected values for inputs. These tests are +// duplicated on the Java side to ensure that hash computations are stable +// across both platforms. +TEST(HashUtilTest, TestMurmur2Hash64) { + uint64_t hash; + + hash = HashUtil::MurmurHash2_64("ab", 2, 0); + ASSERT_EQ(7115271465109541368, hash); + + hash = HashUtil::MurmurHash2_64("abcdefg", 7, 0); + ASSERT_EQ(2601573339036254301, hash); + + hash = HashUtil::MurmurHash2_64("quick brown fox", 15, 42); + ASSERT_EQ(3575930248840144026, hash); +} + +} // namespace kudu diff --git a/src/kudu/util/hash_util.h b/src/kudu/util/hash_util.h new file mode 100644 index 000000000000..78922527f5a7 --- /dev/null +++ b/src/kudu/util/hash_util.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <stdint.h>

#ifndef KUDU_UTIL_HASH_UTIL_H
#define KUDU_UTIL_HASH_UTIL_H

namespace kudu {

/// Utility class to compute hash values.
class HashUtil {
 public:

  static const uint64_t MURMUR_PRIME = 0xc6a4a7935bd1e995;
  static const int MURMUR_R = 47;

  /// Murmur2 hash implementation returning 64-bit hashes.
  ///
  /// Hashes 'len' bytes starting at 'input', perturbed by 'seed'.
  /// The expected outputs are pinned by hash_util-test.cc, which is
  /// duplicated on the Java side to keep both platforms in sync.
  ///
  /// FIX: the <...> template arguments of the casts below had been
  /// stripped in transcription; restored (MurmurHash64A reference form).
  static uint64_t MurmurHash2_64(const void* input, int len, uint64_t seed) {
    uint64_t h = seed ^ (len * MURMUR_PRIME);

    // Mix the input eight bytes at a time. NOTE(review): this reads the
    // buffer through a uint64_t*, which assumes unaligned reads are OK on
    // the target platform (true on x86) -- confirm for other ports.
    const uint64_t* data = reinterpret_cast<const uint64_t*>(input);
    const uint64_t* end = data + (len / sizeof(uint64_t));

    while (data != end) {
      uint64_t k = *data++;
      k *= MURMUR_PRIME;
      k ^= k >> MURMUR_R;
      k *= MURMUR_PRIME;
      h ^= k;
      h *= MURMUR_PRIME;
    }

    // Fold in the 0-7 trailing bytes; the case fall-through is the
    // standard Murmur2 tail handling and is intentional.
    const uint8_t* data2 = reinterpret_cast<const uint8_t*>(data);
    switch (len & 7) {
      case 7: h ^= static_cast<uint64_t>(data2[6]) << 48;
      case 6: h ^= static_cast<uint64_t>(data2[5]) << 40;
      case 5: h ^= static_cast<uint64_t>(data2[4]) << 32;
      case 4: h ^= static_cast<uint64_t>(data2[3]) << 24;
      case 3: h ^= static_cast<uint64_t>(data2[2]) << 16;
      case 2: h ^= static_cast<uint64_t>(data2[1]) << 8;
      case 1: h ^= static_cast<uint64_t>(data2[0]);
              h *= MURMUR_PRIME;
    }

    // Final avalanche.
    h ^= h >> MURMUR_R;
    h *= MURMUR_PRIME;
    h ^= h >> MURMUR_R;
    return h;
  }
};

} // namespace kudu
#endif
// diff --git a/src/kudu/util/hdr_histogram-test.cc b/src/kudu/util/hdr_histogram-test.cc
// new file mode 100644
// index 000000000000..bf1c101ee519
// --- /dev/null
// +++ b/src/kudu/util/hdr_histogram-test.cc
// @@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include + +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +static const int kSigDigits = 2; + +class HdrHistogramTest : public KuduTest { +}; + +TEST_F(HdrHistogramTest, SimpleTest) { + uint64_t highest_val = 10000LU; + + HdrHistogram hist(highest_val, kSigDigits); + ASSERT_EQ(0, hist.CountInBucketForValue(1)); + hist.Increment(1); + ASSERT_EQ(1, hist.CountInBucketForValue(1)); + hist.IncrementBy(1, 3); + ASSERT_EQ(4, hist.CountInBucketForValue(1)); + hist.Increment(10); + ASSERT_EQ(1, hist.CountInBucketForValue(10)); + hist.Increment(20); + ASSERT_EQ(1, hist.CountInBucketForValue(20)); + ASSERT_EQ(0, hist.CountInBucketForValue(1000)); + hist.Increment(1000); + hist.Increment(1001); + ASSERT_EQ(2, hist.CountInBucketForValue(1000)); + + ASSERT_EQ(1 + 1 * 3 + 10 + 20 + 1000 + 1001, + hist.TotalSum()); +} + +TEST_F(HdrHistogramTest, TestCoordinatedOmission) { + uint64_t interval = 1000; + int loop_iters = 100; + int64_t normal_value = 10; + HdrHistogram hist(1000000LU, kSigDigits); + for (int i = 1; i <= loop_iters; i++) { + // Simulate a periodic "large value" that would exhibit coordinated + // omission were this loop to sleep on 'interval'. + int64_t value = (i % normal_value == 0) ? 
interval * 10 : normal_value; + + hist.IncrementWithExpectedInterval(value, interval); + } + ASSERT_EQ(loop_iters - (loop_iters / normal_value), + hist.CountInBucketForValue(normal_value)); + for (int i = interval; i <= interval * 10; i += interval) { + ASSERT_EQ(loop_iters / normal_value, hist.CountInBucketForValue(i)); + } +} + +static const int kExpectedSum = + 10 * 80 + 100 * 10 + 1000 * 5 + 10000 * 3 + 100000 * 1 + 1000000 * 1; +static const int kExpectedMax = 1000000; +static const int kExpectedCount = 100; +static const int kExpectedMin = 10; +static void load_percentiles(HdrHistogram* hist) { + hist->IncrementBy(10, 80); + hist->IncrementBy(100, 10); + hist->IncrementBy(1000, 5); + hist->IncrementBy(10000, 3); + hist->IncrementBy(100000, 1); + hist->IncrementBy(1000000, 1); +} + +static void validate_percentiles(HdrHistogram* hist, uint64_t specified_max) { + double expected_mean = + static_cast(kExpectedSum) / (80 + 10 + 5 + 3 + 1 + 1); + + ASSERT_EQ(kExpectedMin, hist->MinValue()); + ASSERT_EQ(kExpectedMax, hist->MaxValue()); + ASSERT_EQ(kExpectedSum, hist->TotalSum()); + ASSERT_NEAR(expected_mean, hist->MeanValue(), 0.001); + ASSERT_EQ(kExpectedCount, hist->TotalCount()); + ASSERT_EQ(10, hist->ValueAtPercentile(80)); + ASSERT_EQ(kExpectedCount, hist->ValueAtPercentile(90)); + ASSERT_EQ(hist->LowestEquivalentValue(specified_max), hist->ValueAtPercentile(99)); + ASSERT_EQ(hist->LowestEquivalentValue(specified_max), hist->ValueAtPercentile(99.99)); + ASSERT_EQ(hist->LowestEquivalentValue(specified_max), hist->ValueAtPercentile(100)); +} + +TEST_F(HdrHistogramTest, PercentileAndCopyTest) { + uint64_t specified_max = 10000; + HdrHistogram hist(specified_max, kSigDigits); + load_percentiles(&hist); + NO_FATALS(validate_percentiles(&hist, specified_max)); + + HdrHistogram copy(hist); + NO_FATALS(validate_percentiles(©, specified_max)); + + ASSERT_EQ(hist.TotalSum(), copy.TotalSum()); +} + +} // namespace kudu diff --git a/src/kudu/util/hdr_histogram.cc 
b/src/kudu/util/hdr_histogram.cc new file mode 100644 index 000000000000..43bd2a934a6b --- /dev/null +++ b/src/kudu/util/hdr_histogram.cc @@ -0,0 +1,490 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/util/hdr_histogram.h" + +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/bits.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" + +using base::subtle::Atomic64; +using base::subtle::NoBarrier_AtomicIncrement; +using base::subtle::NoBarrier_Store; +using base::subtle::NoBarrier_Load; +using base::subtle::NoBarrier_CompareAndSwap; +using strings::Substitute; + +namespace kudu { + +HdrHistogram::HdrHistogram(uint64_t highest_trackable_value, int num_significant_digits) + : highest_trackable_value_(highest_trackable_value), + num_significant_digits_(num_significant_digits), + counts_array_length_(0), + bucket_count_(0), + sub_bucket_count_(0), + sub_bucket_half_count_magnitude_(0), + sub_bucket_half_count_(0), + sub_bucket_mask_(0), + total_count_(0), + total_sum_(0), + min_value_(std::numeric_limits::max()), + max_value_(0), + counts_(nullptr) { + Init(); +} + +HdrHistogram::HdrHistogram(const HdrHistogram& other) + : 
highest_trackable_value_(other.highest_trackable_value_), + num_significant_digits_(other.num_significant_digits_), + counts_array_length_(0), + bucket_count_(0), + sub_bucket_count_(0), + sub_bucket_half_count_magnitude_(0), + sub_bucket_half_count_(0), + sub_bucket_mask_(0), + total_count_(0), + total_sum_(0), + min_value_(std::numeric_limits::max()), + max_value_(0), + counts_(nullptr) { + Init(); + + // Not a consistent snapshot but we try to roughly keep it close. + // Copy the sum and min first. + NoBarrier_Store(&total_sum_, NoBarrier_Load(&other.total_sum_)); + NoBarrier_Store(&min_value_, NoBarrier_Load(&other.min_value_)); + + uint64_t total_copied_count = 0; + // Copy the counts in order of ascending magnitude. + for (int i = 0; i < counts_array_length_; i++) { + uint64_t count = NoBarrier_Load(&other.counts_[i]); + NoBarrier_Store(&counts_[i], count); + total_copied_count += count; + } + // Copy the max observed value last. + NoBarrier_Store(&max_value_, NoBarrier_Load(&other.max_value_)); + // We must ensure the total is consistent with the copied counts. 
+ NoBarrier_Store(&total_count_, total_copied_count); +} + +bool HdrHistogram::IsValidHighestTrackableValue(uint64_t highest_trackable_value) { + return highest_trackable_value >= kMinHighestTrackableValue; +} + +bool HdrHistogram::IsValidNumSignificantDigits(int num_significant_digits) { + return num_significant_digits >= kMinValidNumSignificantDigits && + num_significant_digits <= kMaxValidNumSignificantDigits; +} + +void HdrHistogram::Init() { + // Verify parameter validity + CHECK(IsValidHighestTrackableValue(highest_trackable_value_)) << + Substitute("highest_trackable_value must be >= $0", kMinHighestTrackableValue); + CHECK(IsValidNumSignificantDigits(num_significant_digits_)) << + Substitute("num_significant_digits must be between $0 and $1", + kMinValidNumSignificantDigits, kMaxValidNumSignificantDigits); + + uint32_t largest_value_with_single_unit_resolution = + 2 * static_cast(pow(10.0, num_significant_digits_)); + + // We need to maintain power-of-two sub_bucket_count_ (for clean direct + // indexing) that is large enough to provide unit resolution to at least + // largest_value_with_single_unit_resolution. So figure out + // largest_value_with_single_unit_resolution's nearest power-of-two + // (rounded up), and use that: + + // The sub-buckets take care of the precision. + // Each sub-bucket is sized to have enough bits for the requested + // 10^precision accuracy. + int sub_bucket_count_magnitude = + Bits::Log2Ceiling(largest_value_with_single_unit_resolution); + sub_bucket_half_count_magnitude_ = + (sub_bucket_count_magnitude >= 1) ? sub_bucket_count_magnitude - 1 : 0; + + // sub_bucket_count_ is approx. 10^num_sig_digits (as a power of 2) + sub_bucket_count_ = pow(2.0, sub_bucket_half_count_magnitude_ + 1); + sub_bucket_mask_ = sub_bucket_count_ - 1; + sub_bucket_half_count_ = sub_bucket_count_ / 2; + + // The buckets take care of the magnitude. 
+ // Determine exponent range needed to support the trackable value with no + // overflow: + uint64_t trackable_value = sub_bucket_count_ - 1; + int buckets_needed = 1; + while (trackable_value < highest_trackable_value_) { + trackable_value <<= 1; + buckets_needed++; + } + bucket_count_ = buckets_needed; + + counts_array_length_ = (bucket_count_ + 1) * sub_bucket_half_count_; + counts_.reset(new Atomic64[counts_array_length_]()); // value-initialized +} + +void HdrHistogram::Increment(int64_t value) { + IncrementBy(value, 1); +} + +void HdrHistogram::IncrementBy(int64_t value, int64_t count) { + DCHECK_GE(value, 0); + DCHECK_GE(count, 0); + + // Dissect the value into bucket and sub-bucket parts, and derive index into + // counts array: + int bucket_index = BucketIndex(value); + int sub_bucket_index = SubBucketIndex(value, bucket_index); + int counts_index = CountsArrayIndex(bucket_index, sub_bucket_index); + + // Increment bucket, total, and sum. + NoBarrier_AtomicIncrement(&counts_[counts_index], count); + NoBarrier_AtomicIncrement(&total_count_, count); + NoBarrier_AtomicIncrement(&total_sum_, value * count); + + // Update min, if needed. + { + Atomic64 min_val; + while (PREDICT_FALSE(value < (min_val = MinValue()))) { + Atomic64 old_val = NoBarrier_CompareAndSwap(&min_value_, min_val, value); + if (PREDICT_TRUE(old_val == min_val)) break; // CAS success. + } + } + + // Update max, if needed. + { + Atomic64 max_val; + while (PREDICT_FALSE(value > (max_val = MaxValue()))) { + Atomic64 old_val = NoBarrier_CompareAndSwap(&max_value_, max_val, value); + if (PREDICT_TRUE(old_val == max_val)) break; // CAS success. 
+ } + } +} + +void HdrHistogram::IncrementWithExpectedInterval(int64_t value, + int64_t expected_interval_between_samples) { + Increment(value); + if (expected_interval_between_samples <= 0) { + return; + } + for (int64_t missing_value = value - expected_interval_between_samples; + missing_value >= expected_interval_between_samples; + missing_value -= expected_interval_between_samples) { + Increment(missing_value); + } +} + +//////////////////////////////////// + +int HdrHistogram::BucketIndex(uint64_t value) const { + if (PREDICT_FALSE(value > highest_trackable_value_)) { + value = highest_trackable_value_; + } + // Here we are calculating the power-of-2 magnitude of the value with a + // correction for precision in the first bucket. + // Smallest power of 2 containing value. + int pow2ceiling = Bits::Log2Ceiling64(value | sub_bucket_mask_); + return pow2ceiling - (sub_bucket_half_count_magnitude_ + 1); +} + +int HdrHistogram::SubBucketIndex(uint64_t value, int bucket_index) const { + if (PREDICT_FALSE(value > highest_trackable_value_)) { + value = highest_trackable_value_; + } + // We hack off the magnitude and are left with only the relevant precision + // portion, which gives us a direct index into the sub-bucket. TODO: Right?? 
+ return static_cast(value >> bucket_index); +} + +int HdrHistogram::CountsArrayIndex(int bucket_index, int sub_bucket_index) const { + DCHECK(sub_bucket_index < sub_bucket_count_); + DCHECK(bucket_index < bucket_count_); + DCHECK(bucket_index == 0 || (sub_bucket_index >= sub_bucket_half_count_)); + // Calculate the index for the first entry in the bucket: + // (The following is the equivalent of ((bucket_index + 1) * sub_bucket_half_count_) ): + int bucket_base_index = (bucket_index + 1) << sub_bucket_half_count_magnitude_; + // Calculate the offset in the bucket: + int offset_in_bucket = sub_bucket_index - sub_bucket_half_count_; + return bucket_base_index + offset_in_bucket; +} + +uint64_t HdrHistogram::CountAt(int bucket_index, int sub_bucket_index) const { + return counts_[CountsArrayIndex(bucket_index, sub_bucket_index)]; +} + +uint64_t HdrHistogram::CountInBucketForValue(uint64_t value) const { + int bucket_index = BucketIndex(value); + int sub_bucket_index = SubBucketIndex(value, bucket_index); + return CountAt(bucket_index, sub_bucket_index); +} + +uint64_t HdrHistogram::ValueFromIndex(int bucket_index, int sub_bucket_index) { + return static_cast(sub_bucket_index) << bucket_index; +} + +//////////////////////////////////// + +uint64_t HdrHistogram::SizeOfEquivalentValueRange(uint64_t value) const { + int bucket_index = BucketIndex(value); + int sub_bucket_index = SubBucketIndex(value, bucket_index); + uint64_t distance_to_next_value = + (1 << ((sub_bucket_index >= sub_bucket_count_) ? 
(bucket_index + 1) : bucket_index)); + return distance_to_next_value; +} + +uint64_t HdrHistogram::LowestEquivalentValue(uint64_t value) const { + int bucket_index = BucketIndex(value); + int sub_bucket_index = SubBucketIndex(value, bucket_index); + uint64_t this_value_base_level = ValueFromIndex(bucket_index, sub_bucket_index); + return this_value_base_level; +} + +uint64_t HdrHistogram::HighestEquivalentValue(uint64_t value) const { + return NextNonEquivalentValue(value) - 1; +} + +uint64_t HdrHistogram::MedianEquivalentValue(uint64_t value) const { + return (LowestEquivalentValue(value) + (SizeOfEquivalentValueRange(value) >> 1)); +} + +uint64_t HdrHistogram::NextNonEquivalentValue(uint64_t value) const { + return LowestEquivalentValue(value) + SizeOfEquivalentValueRange(value); +} + +bool HdrHistogram::ValuesAreEquivalent(uint64_t value1, uint64_t value2) const { + return (LowestEquivalentValue(value1) == LowestEquivalentValue(value2)); +} + +uint64_t HdrHistogram::MinValue() const { + if (PREDICT_FALSE(TotalCount() == 0)) return 0; + return NoBarrier_Load(&min_value_); +} + +uint64_t HdrHistogram::MaxValue() const { + if (PREDICT_FALSE(TotalCount() == 0)) return 0; + return NoBarrier_Load(&max_value_); +} + +double HdrHistogram::MeanValue() const { + uint64_t count = TotalCount(); + if (PREDICT_FALSE(count == 0)) return 0.0; + return static_cast(TotalSum()) / count; +} + +uint64_t HdrHistogram::ValueAtPercentile(double percentile) const { + uint64_t count = TotalCount(); + if (PREDICT_FALSE(count == 0)) return 0; + + double requested_percentile = std::min(percentile, 100.0); // Truncate down to 100% + uint64_t count_at_percentile = + static_cast(((requested_percentile / 100.0) * count) + 0.5); // Round + // Make sure we at least reach the first recorded entry + count_at_percentile = std::max(count_at_percentile, static_cast(1)); + + uint64_t total_to_current_iJ = 0; + for (int i = 0; i < bucket_count_; i++) { + int j = (i == 0) ? 
0 : (sub_bucket_count_ / 2); + for (; j < sub_bucket_count_; j++) { + total_to_current_iJ += CountAt(i, j); + if (total_to_current_iJ >= count_at_percentile) { + uint64_t valueAtIndex = ValueFromIndex(i, j); + return valueAtIndex; + } + } + } + + LOG(DFATAL) << "Fell through while iterating, likely concurrent modification of histogram"; + return 0; +} + +/////////////////////////////////////////////////////////////////////// +// AbstractHistogramIterator +/////////////////////////////////////////////////////////////////////// + +AbstractHistogramIterator::AbstractHistogramIterator(const HdrHistogram* histogram) + : histogram_(CHECK_NOTNULL(histogram)), + cur_iter_val_(), + histogram_total_count_(histogram_->TotalCount()), + current_bucket_index_(0), + current_sub_bucket_index_(0), + current_value_at_index_(0), + next_bucket_index_(0), + next_sub_bucket_index_(1), + next_value_at_index_(1), + prev_value_iterated_to_(0), + total_count_to_prev_index_(0), + total_count_to_current_index_(0), + total_value_to_current_index_(0), + count_at_this_value_(0), + fresh_sub_bucket_(true) { +} + +bool AbstractHistogramIterator::HasNext() const { + return total_count_to_current_index_ < histogram_total_count_; +} + +Status AbstractHistogramIterator::Next(HistogramIterationValue* value) { + if (histogram_->TotalCount() != histogram_total_count_) { + return Status::IllegalState("Concurrently modified histogram while traversing it"); + } + + // Move through the sub buckets and buckets until we hit the next reporting level: + while (!ExhaustedSubBuckets()) { + count_at_this_value_ = + histogram_->CountAt(current_bucket_index_, current_sub_bucket_index_); + if (fresh_sub_bucket_) { // Don't add unless we've incremented since last bucket... 
+ total_count_to_current_index_ += count_at_this_value_; + total_value_to_current_index_ += + count_at_this_value_ * histogram_->MedianEquivalentValue(current_value_at_index_); + fresh_sub_bucket_ = false; + } + if (ReachedIterationLevel()) { + uint64_t value_iterated_to = ValueIteratedTo(); + + // Update iterator value. + cur_iter_val_.value_iterated_to = value_iterated_to; + cur_iter_val_.value_iterated_from = prev_value_iterated_to_; + cur_iter_val_.count_at_value_iterated_to = count_at_this_value_; + cur_iter_val_.count_added_in_this_iteration_step = + (total_count_to_current_index_ - total_count_to_prev_index_); + cur_iter_val_.total_count_to_this_value = total_count_to_current_index_; + cur_iter_val_.total_value_to_this_value = total_value_to_current_index_; + cur_iter_val_.percentile = + ((100.0 * total_count_to_current_index_) / histogram_total_count_); + cur_iter_val_.percentile_level_iterated_to = PercentileIteratedTo(); + + prev_value_iterated_to_ = value_iterated_to; + total_count_to_prev_index_ = total_count_to_current_index_; + // Move the next percentile reporting level forward. 
+ IncrementIterationLevel(); + + *value = cur_iter_val_; + return Status::OK(); + } + IncrementSubBucket(); + } + return Status::IllegalState("Histogram array index out of bounds while traversing"); +} + +double AbstractHistogramIterator::PercentileIteratedTo() const { + return (100.0 * static_cast(total_count_to_current_index_)) / histogram_total_count_; +} + +double AbstractHistogramIterator::PercentileIteratedFrom() const { + return (100.0 * static_cast(total_count_to_prev_index_)) / histogram_total_count_; +} + +uint64_t AbstractHistogramIterator::ValueIteratedTo() const { + return histogram_->HighestEquivalentValue(current_value_at_index_); +} + +bool AbstractHistogramIterator::ExhaustedSubBuckets() const { + return (current_bucket_index_ >= histogram_->bucket_count_); +} + +void AbstractHistogramIterator::IncrementSubBucket() { + fresh_sub_bucket_ = true; + // Take on the next index: + current_bucket_index_ = next_bucket_index_; + current_sub_bucket_index_ = next_sub_bucket_index_; + current_value_at_index_ = next_value_at_index_; + // Figure out the next next index: + next_sub_bucket_index_++; + if (next_sub_bucket_index_ >= histogram_->sub_bucket_count_) { + next_sub_bucket_index_ = histogram_->sub_bucket_half_count_; + next_bucket_index_++; + } + next_value_at_index_ = HdrHistogram::ValueFromIndex(next_bucket_index_, next_sub_bucket_index_); +} + +/////////////////////////////////////////////////////////////////////// +// RecordedValuesIterator +/////////////////////////////////////////////////////////////////////// + +RecordedValuesIterator::RecordedValuesIterator(const HdrHistogram* histogram) + : AbstractHistogramIterator(histogram), + visited_sub_bucket_index_(-1), + visited_bucket_index_(-1) { +} + +void RecordedValuesIterator::IncrementIterationLevel() { + visited_sub_bucket_index_ = current_sub_bucket_index_; + visited_bucket_index_ = current_bucket_index_; +} + +bool RecordedValuesIterator::ReachedIterationLevel() const { + uint64_t 
current_ij_count = + histogram_->CountAt(current_bucket_index_, current_sub_bucket_index_); + return current_ij_count != 0 && + ((visited_sub_bucket_index_ != current_sub_bucket_index_) || + (visited_bucket_index_ != current_bucket_index_)); +} + +/////////////////////////////////////////////////////////////////////// +// PercentileIterator +/////////////////////////////////////////////////////////////////////// + +PercentileIterator::PercentileIterator(const HdrHistogram* histogram, + int percentile_ticks_per_half_distance) + : AbstractHistogramIterator(histogram), + percentile_ticks_per_half_distance_(percentile_ticks_per_half_distance), + percentile_level_to_iterate_to_(0.0), + percentile_level_to_iterate_from_(0.0), + reached_last_recorded_value_(false) { +} + +bool PercentileIterator::HasNext() const { + if (AbstractHistogramIterator::HasNext()) { + return true; + } + // We want one additional last step to 100% + if (!reached_last_recorded_value_ && (histogram_total_count_ > 0)) { + const_cast(this)->percentile_level_to_iterate_to_ = 100.0; + const_cast(this)->reached_last_recorded_value_ = true; + return true; + } + return false; +} + +double PercentileIterator::PercentileIteratedTo() const { + return percentile_level_to_iterate_to_; +} + + +double PercentileIterator::PercentileIteratedFrom() const { + return percentile_level_to_iterate_from_; +} + +void PercentileIterator::IncrementIterationLevel() { + percentile_level_to_iterate_from_ = percentile_level_to_iterate_to_; + // TODO: Can this expression be simplified? 
+ uint64_t percentile_reporting_ticks = percentile_ticks_per_half_distance_ * + static_cast(pow(2.0, + static_cast(log(100.0 / (100.0 - (percentile_level_to_iterate_to_))) / log(2)) + 1)); + percentile_level_to_iterate_to_ += 100.0 / percentile_reporting_ticks; +} + +bool PercentileIterator::ReachedIterationLevel() const { + if (count_at_this_value_ == 0) return false; + double current_percentile = + (100.0 * static_cast(total_count_to_current_index_)) / histogram_total_count_; + return (current_percentile >= percentile_level_to_iterate_to_); +} + +} // namespace kudu diff --git a/src/kudu/util/hdr_histogram.h b/src/kudu/util/hdr_histogram.h new file mode 100644 index 000000000000..bbfedd1e63af --- /dev/null +++ b/src/kudu/util/hdr_histogram.h @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_HDRHISTOGRAM_H_ +#define KUDU_UTIL_HDRHISTOGRAM_H_ + +// C++ (TR1) port of HdrHistogram. +// Original java implementation: http://giltene.github.io/HdrHistogram/ +// +// A High Dynamic Range (HDR) Histogram +// +// HdrHistogram supports the recording and analyzing sampled data value counts +// across a configurable integer value range with configurable value precision +// within the range. 
Value precision is expressed as the number of significant +// digits in the value recording, and provides control over value quantization +// behavior across the value range and the subsequent value resolution at any +// given level. +// +// For example, a Histogram could be configured to track the counts of observed +// integer values between 0 and 3,600,000,000 while maintaining a value +// precision of 3 significant digits across that range. Value quantization +// within the range will thus be no larger than 1/1,000th (or 0.1%) of any +// value. This example Histogram could be used to track and analyze the counts +// of observed response times ranging between 1 microsecond and 1 hour in +// magnitude, while maintaining a value resolution of 1 microsecond up to 1 +// millisecond, a resolution of 1 millisecond (or better) up to one second, and +// a resolution of 1 second (or better) up to 1,000 seconds. At it's maximum +// tracked value (1 hour), it would still maintain a resolution of 3.6 seconds +// (or better). + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/gscoped_ptr.h" + +namespace kudu { + +class AbstractHistogramIterator; +class Status; +class RecordedValuesIterator; + +// This implementation allows you to specify a range and accuracy (significant +// digits) to support in an instance of a histogram. The class takes care of +// the rest. At this time, only uint64_t values are supported. +// +// An HdrHistogram consists of a set of buckets, which bucket the magnitude of +// a value stored, and a set of sub-buckets, which implement the tunable +// precision of the storage. So if you specify 3 significant digits of +// precision, then you will get about 10^3 sub-buckets (as a power of 2) for +// each level of magnitude. Magnitude buckets are tracked in powers of 2. +// +// This class is thread-safe. 
+class HdrHistogram { + public: + // Specify the highest trackable value so that the class has a bound on the + // number of buckets, and # of significant digits (in decimal) so that the + // class can determine the granularity of those buckets. + HdrHistogram(uint64_t highest_trackable_value, int num_significant_digits); + + // Copy-construct a (non-consistent) snapshot of other. + explicit HdrHistogram(const HdrHistogram& other); + + // Validate your params before trying to construct the object. + static bool IsValidHighestTrackableValue(uint64_t highest_trackable_value); + static bool IsValidNumSignificantDigits(int num_significant_digits); + + // Record new data. + void Increment(int64_t value); + void IncrementBy(int64_t value, int64_t count); + + // Record new data, correcting for "coordinated omission". + // + // See https://groups.google.com/d/msg/mechanical-sympathy/icNZJejUHfE/BfDekfBEs_sJ + // for more details. + void IncrementWithExpectedInterval(int64_t value, + int64_t expected_interval_between_samples); + + // Fetch configuration params. + uint64_t highest_trackable_value() const { return highest_trackable_value_; } + int num_significant_digits() const { return num_significant_digits_; } + + // Get indexes into histogram based on value. + int BucketIndex(uint64_t value) const; + int SubBucketIndex(uint64_t value, int bucket_index) const; + + // Count of all events recorded. + uint64_t TotalCount() const { return base::subtle::NoBarrier_Load(&total_count_); } + + // Sum of all events recorded. + uint64_t TotalSum() const { return base::subtle::NoBarrier_Load(&total_sum_); } + + // Return number of items at index. + uint64_t CountAt(int bucket_index, int sub_bucket_index) const; + + // Return count of values in bucket with values equivalent to value. + uint64_t CountInBucketForValue(uint64_t) const; + + // Return representative value based on index. 
+ static uint64_t ValueFromIndex(int bucket_index, int sub_bucket_index); + + // Get the size (in value units) of the range of values that are equivalent + // to the given value within the histogram's resolution. Where "equivalent" + // means that value samples recorded for any two equivalent values are + // counted in a common total count. + uint64_t SizeOfEquivalentValueRange(uint64_t value) const; + + // Get the lowest value that is equivalent to the given value within the + // histogram's resolution. Where "equivalent" means that value samples + // recorded for any two equivalent values are counted in a common total + // count. + uint64_t LowestEquivalentValue(uint64_t value) const; + + // Get the highest value that is equivalent to the given value within the + // histogram's resolution. + uint64_t HighestEquivalentValue(uint64_t value) const; + + // Get a value that lies in the middle (rounded up) of the range of values + // equivalent the given value. + uint64_t MedianEquivalentValue(uint64_t value) const; + + // Get the next value that is not equivalent to the given value within the + // histogram's resolution. + uint64_t NextNonEquivalentValue(uint64_t value) const; + + // Determine if two values are equivalent with the histogram's resolution. + bool ValuesAreEquivalent(uint64_t value1, uint64_t value2) const; + + // Get the exact minimum value (may lie outside the histogram). + uint64_t MinValue() const; + + // Get the exact maximum value (may lie outside the histogram). + uint64_t MaxValue() const; + + // Get the exact mean value of all recorded values in the histogram. + double MeanValue() const; + + // Get the value at a given percentile. + // This is a percentile in percents, i.e. 99.99 percentile. + uint64_t ValueAtPercentile(double percentile) const; + + // Get the percentile at a given value + // TODO: implement + // double PercentileAtOrBelowValue(uint64_t value) const; + + // Get the count of recorded values within a range of value levels. 
+ // (inclusive to within the histogram's resolution) + // TODO: implement + //uint64_t CountBetweenValues(uint64_t low_value, uint64_t high_value) const; + + private: + friend class AbstractHistogramIterator; + + static const uint64_t kMinHighestTrackableValue = 2; + static const int kMinValidNumSignificantDigits = 1; + static const int kMaxValidNumSignificantDigits = 5; + + void Init(); + int CountsArrayIndex(int bucket_index, int sub_bucket_index) const; + + uint64_t highest_trackable_value_; + int num_significant_digits_; + int counts_array_length_; + int bucket_count_; + int sub_bucket_count_; + + // "Hot" fields in the write path. + uint8_t sub_bucket_half_count_magnitude_; + int sub_bucket_half_count_; + uint32_t sub_bucket_mask_; + + // Also hot. + base::subtle::Atomic64 total_count_; + base::subtle::Atomic64 total_sum_; + base::subtle::Atomic64 min_value_; + base::subtle::Atomic64 max_value_; + gscoped_array counts_; + + HdrHistogram& operator=(const HdrHistogram& other); // Disable assignment operator. +}; + +// Value returned from iterators. 
+struct HistogramIterationValue { + HistogramIterationValue() + : value_iterated_to(0), + value_iterated_from(0), + count_at_value_iterated_to(0), + count_added_in_this_iteration_step(0), + total_count_to_this_value(0), + total_value_to_this_value(0), + percentile(0.0), + percentile_level_iterated_to(0.0) { + } + + void Reset() { + value_iterated_to = 0; + value_iterated_from = 0; + count_at_value_iterated_to = 0; + count_added_in_this_iteration_step = 0; + total_count_to_this_value = 0; + total_value_to_this_value = 0; + percentile = 0.0; + percentile_level_iterated_to = 0.0; + } + + uint64_t value_iterated_to; + uint64_t value_iterated_from; + uint64_t count_at_value_iterated_to; + uint64_t count_added_in_this_iteration_step; + uint64_t total_count_to_this_value; + uint64_t total_value_to_this_value; + double percentile; + double percentile_level_iterated_to; +}; + +// Base class for iterating through histogram values. +// +// The underlying histogram must not be modified or destroyed while this class +// is iterating over it. +// +// This class is not thread-safe. +class AbstractHistogramIterator { + public: + // Create iterator with new histogram. + // The histogram must not be mutated while the iterator is in use. + explicit AbstractHistogramIterator(const HdrHistogram* histogram); + virtual ~AbstractHistogramIterator() { + } + + // Returns true if the iteration has more elements. + virtual bool HasNext() const; + + // Returns the next element in the iteration. + Status Next(HistogramIterationValue* value); + + virtual double PercentileIteratedTo() const; + virtual double PercentileIteratedFrom() const; + uint64_t ValueIteratedTo() const; + + protected: + // Implementations must override these methods. 
+ virtual void IncrementIterationLevel() = 0; + virtual bool ReachedIterationLevel() const = 0; + + const HdrHistogram* histogram_; + HistogramIterationValue cur_iter_val_; + + uint64_t histogram_total_count_; + + int current_bucket_index_; + int current_sub_bucket_index_; + uint64_t current_value_at_index_; + + int next_bucket_index_; + int next_sub_bucket_index_; + uint64_t next_value_at_index_; + + uint64_t prev_value_iterated_to_; + uint64_t total_count_to_prev_index_; + + uint64_t total_count_to_current_index_; + uint64_t total_value_to_current_index_; + + uint64_t count_at_this_value_; + + private: + bool ExhaustedSubBuckets() const; + void IncrementSubBucket(); + + bool fresh_sub_bucket_; + + DISALLOW_COPY_AND_ASSIGN(AbstractHistogramIterator); +}; + +// Used for iterating through all recorded histogram values using the finest +// granularity steps supported by the underlying representation. The iteration +// steps through all non-zero recorded value counts, and terminates when all +// recorded histogram values are exhausted. +// +// The underlying histogram must not be modified or destroyed while this class +// is iterating over it. +// +// This class is not thread-safe. +class RecordedValuesIterator : public AbstractHistogramIterator { + public: + explicit RecordedValuesIterator(const HdrHistogram* histogram); + + protected: + virtual void IncrementIterationLevel() OVERRIDE; + virtual bool ReachedIterationLevel() const OVERRIDE; + + private: + int visited_sub_bucket_index_; + int visited_bucket_index_; + + DISALLOW_COPY_AND_ASSIGN(RecordedValuesIterator); +}; + +// Used for iterating through histogram values according to percentile levels. +// The iteration is performed in steps that start at 0% and reduce their +// distance to 100% according to the percentileTicksPerHalfDistance parameter, +// ultimately reaching 100% when all recorded histogram values are exhausted. 
+// +// The underlying histogram must not be modified or destroyed while this class +// is iterating over it. +// +// This class is not thread-safe. +class PercentileIterator : public AbstractHistogramIterator { + public: + // TODO: Explain percentile_ticks_per_half_distance. + PercentileIterator(const HdrHistogram* histogram, + int percentile_ticks_per_half_distance); + virtual bool HasNext() const OVERRIDE; + virtual double PercentileIteratedTo() const OVERRIDE; + virtual double PercentileIteratedFrom() const OVERRIDE; + + protected: + virtual void IncrementIterationLevel() OVERRIDE; + virtual bool ReachedIterationLevel() const OVERRIDE; + + private: + int percentile_ticks_per_half_distance_; + double percentile_level_to_iterate_to_; + double percentile_level_to_iterate_from_; + bool reached_last_recorded_value_; + + DISALLOW_COPY_AND_ASSIGN(PercentileIterator); +}; + +} // namespace kudu + +#endif // KUDU_UTIL_HDRHISTOGRAM_H_ diff --git a/src/kudu/util/hexdump.cc b/src/kudu/util/hexdump.cc new file mode 100644 index 000000000000..250c7c0e7962 --- /dev/null +++ b/src/kudu/util/hexdump.cc @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/slice.h" + +namespace kudu { + +std::string HexDump(const Slice &slice) { + std::string output; + output.reserve(slice.size() * 5); + + const uint8_t *p = slice.data(); + + int rem = slice.size(); + while (rem > 0) { + const uint8_t *line_p = p; + int line_len = std::min(rem, 16); + int line_rem = line_len; + StringAppendF(&output, "%06lx: ", line_p - slice.data()); + + while (line_rem >= 2) { + StringAppendF(&output, "%02x%02x ", + p[0] & 0xff, p[1] & 0xff); + p += 2; + line_rem -= 2; + } + + if (line_rem == 1) { + StringAppendF(&output, "%02x ", + p[0] & 0xff); + p += 1; + line_rem -= 1; + } + + int padding = (16 - line_len) / 2; + + for (int i = 0; i < padding; i++) { + output.append(" "); + } + + for (int i = 0; i < line_len; i++) { + char c = line_p[i]; + if (isprint(c)) { + output.push_back(c); + } else { + output.push_back('.'); + } + } + + output.push_back('\n'); + rem -= line_len; + } + return output; +} +} // namespace kudu diff --git a/src/kudu/util/hexdump.h b/src/kudu/util/hexdump.h new file mode 100644 index 000000000000..55deed42848c --- /dev/null +++ b/src/kudu/util/hexdump.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_HEXDUMP_H +#define KUDU_UTIL_HEXDUMP_H + +#include + +namespace kudu { + +class Slice; + +// Generate an 'xxd'-style hexdump of the given slice. +// This should only be used for debugging, as the format is +// subject to change and it has not been implemented for +// speed. +std::string HexDump(const Slice &slice); + +} // namespace kudu +#endif diff --git a/src/kudu/util/high_water_mark.h b/src/kudu/util/high_water_mark.h new file mode 100644 index 000000000000..dfc30e4d5180 --- /dev/null +++ b/src/kudu/util/high_water_mark.h @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_HIGH_WATER_MARK_H +#define KUDU_UTIL_HIGH_WATER_MARK_H + +#include "kudu/gutil/macros.h" +#include "kudu/util/atomic.h" + +namespace kudu { + +// Lock-free integer that keeps track of the highest value seen. +// Similar to Impala's RuntimeProfile::HighWaterMarkCounter. +// HighWaterMark::max_value() returns the highest value seen; +// HighWaterMark::current_value() returns the current value. 
+class HighWaterMark { + public: + explicit HighWaterMark(int64_t initial_value) + : current_value_(initial_value), + max_value_(initial_value) { + } + + // Return the current value. + int64_t current_value() const { + return current_value_.Load(kMemOrderNoBarrier); + } + + // Return the max value. + int64_t max_value() const { + return max_value_.Load(kMemOrderNoBarrier); + } + + // If current value + 'delta' is <= 'max', increment current value + // by 'delta' and return true; return false otherwise. + bool TryIncrementBy(int64_t delta, int64_t max) { + while (true) { + int64_t old_val = current_value(); + int64_t new_val = old_val + delta; + if (new_val > max) { + return false; + } + if (PREDICT_TRUE(current_value_.CompareAndSet(old_val, + new_val, + kMemOrderNoBarrier))) { + UpdateMax(new_val); + return true; + } + } + } + + void IncrementBy(int64_t amount) { + UpdateMax(current_value_.IncrementBy(amount, kMemOrderNoBarrier)); + } + + void set_value(int64_t v) { + current_value_.Store(v, kMemOrderNoBarrier); + UpdateMax(v); + } + + private: + void UpdateMax(int64_t value) { + max_value_.StoreMax(value, kMemOrderNoBarrier); + } + + AtomicInt current_value_; + AtomicInt max_value_; +}; + +} // namespace kudu +#endif /* KUDU_UTIL_HIGH_WATER_MARK_H */ + + diff --git a/src/kudu/util/histogram.proto b/src/kudu/util/histogram.proto new file mode 100644 index 000000000000..f54bc9322ba0 --- /dev/null +++ b/src/kudu/util/histogram.proto @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +option java_package = "org.kududb"; + +// Captures the state of an Histogram. +message HistogramSnapshotPB { + required string type = 1; + required string name = 2; + optional string description = 3; + required string unit = 4; + optional string label = 19; + + required uint64 max_trackable_value = 5; + required int32 num_significant_digits = 6; + required uint64 total_count = 7; + optional uint64 total_sum = 18; + required uint64 min = 8; + required double mean = 9; + required uint64 percentile_75 = 10; + required uint64 percentile_95 = 11; + required uint64 percentile_99 = 12; + required uint64 percentile_99_9 = 13; + required uint64 percentile_99_99 = 14; + required uint64 max = 15; + repeated uint64 values = 16 [packed = true]; + repeated uint64 counts = 17 [packed = true]; +} + +message HistogramSnapshotsListPB { + repeated HistogramSnapshotPB histograms = 1; +} diff --git a/src/kudu/util/init.cc b/src/kudu/util/init.cc new file mode 100644 index 000000000000..9bae7c230aa1 --- /dev/null +++ b/src/kudu/util/init.cc @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/init.h" + +#include + +#include "kudu/gutil/cpu.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/status.h" + +using std::string; + +namespace kudu { + +Status BadCPUStatus(const base::CPU& cpu, const char* instruction_set) { + return Status::NotSupported(strings::Substitute( + "The CPU on this system ($0) does not support the $1 instruction " + "set which is required for running Kudu.", + cpu.cpu_brand(), instruction_set)); +} + +Status CheckCPUFlags() { + base::CPU cpu; + if (!cpu.has_sse42()) { + return BadCPUStatus(cpu, "SSE4.2"); + } + + if (!cpu.has_ssse3()) { + return BadCPUStatus(cpu, "SSSE3"); + } + + return Status::OK(); +} + +void InitKuduOrDie() { + CHECK_OK(CheckCPUFlags()); +} + +} // namespace kudu diff --git a/src/kudu/util/init.h b/src/kudu/util/init.h new file mode 100644 index 000000000000..3f7916c75fdc --- /dev/null +++ b/src/kudu/util/init.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_INIT_H +#define KUDU_UTIL_INIT_H + +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { + +// Return a NotSupported Status if the current CPU does not support the CPU flags +// required for Kudu. +Status CheckCPUFlags(); + +// Initialize Kudu, checking that the platform we are running on is supported, etc. +// Issues a FATAL log message if we fail to init. +void InitKuduOrDie(); + +} // namespace kudu +#endif /* KUDU_UTIL_INIT_H */ diff --git a/src/kudu/util/inline_slice-test.cc b/src/kudu/util/inline_slice-test.cc new file mode 100644 index 000000000000..df69028a9336 --- /dev/null +++ b/src/kudu/util/inline_slice-test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/inline_slice.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { + +template +static void TestRoundTrip(InlineSlice *slice, + Arena *arena, + size_t test_size) { + gscoped_ptr buf(new uint8_t[test_size]); + for (int i = 0; i < test_size; i++) { + buf[i] = i & 0xff; + } + + Slice test_input(buf.get(), test_size); + + slice->set(test_input, arena); + Slice ret = slice->as_slice(); + ASSERT_TRUE(ret == test_input) + << "test_size =" << test_size << "\n" + << "ret = " << ret.ToDebugString() << "\n" + << "test_input = " << test_input.ToDebugString(); + + // If the data is small enough to fit inline, then + // the returned slice should point directly into the + // InlineSlice object. + if (test_size < N) { + ASSERT_EQ(reinterpret_cast(slice) + 1, + ret.data()); + } +} + +// Sweep a variety of inputs for a given size of inline +// data +template +static void DoTest() { + Arena arena(1024, 4096); + + // Test a range of inputs both growing and shrinking + InlineSlice my_slice; + ASSERT_EQ(N, sizeof(my_slice)); + + for (size_t to_test = 0; to_test < 1000; to_test++) { + TestRoundTrip(&my_slice, &arena, to_test); + } + for (size_t to_test = 1000; to_test > 0; to_test--) { + TestRoundTrip(&my_slice, &arena, to_test); + } +} + +TEST(TestInlineSlice, Test8ByteInline) { + DoTest<8>(); +} + +TEST(TestInlineSlice, Test12ByteInline) { + DoTest<12>(); +} + +TEST(TestInlineSlice, Test16ByteInline) { + DoTest<16>(); +} + +} // namespace kudu diff --git a/src/kudu/util/inline_slice.h b/src/kudu/util/inline_slice.h new file mode 100644 index 000000000000..aeca37062903 --- /dev/null +++ b/src/kudu/util/inline_slice.h @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_INLINE_SLICE_H +#define KUDU_UTIL_INLINE_SLICE_H + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/casts.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { + +#if __BYTE_ORDER != __LITTLE_ENDIAN +#error This needs to be ported for big endian +#endif + +// Class which represents short strings inline, and stores longer ones +// by instead storing a pointer. +// +// Internal format: +// The buffer must be at least as large as a pointer (eg 8 bytes for 64-bit). +// Let ptr = bit-casting the first 8 bytes as a pointer: +// If buf_[0] < 0xff: +// buf_[0] == length of stored data +// buf_[1..1 + buf_[0]] == inline data +// If buf_[0] == 0xff: +// buf_[1..sizeof(uint8_t *)] == pointer to indirect data, minus the MSB. +// buf_[sizeof(uint8_t *)..] = unused +// TODO: we could store a prefix of the indirect data in this unused space +// in the future, which might be able to short-circuit some comparisons +// +// The indirect data which is pointed to is stored as a 4 byte length followed by +// the actual data. +// +// This class relies on the fact that the most significant bit of any x86 pointer is +// 0 (i.e pointers only use the bottom 48 bits) +// +// If ATOMIC is true, then this class has the semantics that readers will never see +// invalid pointers, even in the case of concurrent access. However, they _may_ see +// invalid *data*. 
That is to say, calling 'as_slice()' will always return a slice +// which points to a valid memory region -- the memory region may contain garbage +// but will not cause a segfault on access. +// +// These ATOMIC semantics may seem too loose to be useful, but can be used in +// optimistic concurrency control schemes -- so long as accessing the slice doesn't +// produce a segfault, it's OK to read bad data on a race because the higher-level +// concurrency control will cause a retry. +template +class InlineSlice { + private: + enum { + kPointerByteWidth = sizeof(uintptr_t), + kPointerBitWidth = kPointerByteWidth * 8, + kMaxInlineData = STORAGE_SIZE - 1 + }; + + BOOST_STATIC_ASSERT(STORAGE_SIZE >= kPointerByteWidth); + BOOST_STATIC_ASSERT(STORAGE_SIZE <= 256); + public: + InlineSlice() { + } + + inline const Slice as_slice() const ATTRIBUTE_ALWAYS_INLINE { + DiscriminatedPointer dptr = LoadValue(); + + if (dptr.is_indirect()) { + const uint8_t *indir_data = reinterpret_cast(dptr.pointer); + uint32_t len = *reinterpret_cast(indir_data); + indir_data += sizeof(uint32_t); + return Slice(indir_data, (size_t)len); + } else { + uint8_t len = dptr.discriminator; + DCHECK_LE(len, STORAGE_SIZE - 1); + return Slice(&buf_[1], len); + } + } + + template + void set(const Slice &src, ArenaType *alloc_arena) { + set(src.data(), src.size(), alloc_arena); + } + + template + void set(const uint8_t *src, size_t len, + ArenaType *alloc_arena) { + if (len <= kMaxInlineData) { + if (ATOMIC) { + // If atomic, we need to make sure that we store the discriminator + // before we copy in any data. Otherwise the data would overwrite + // part of a pointer and a reader might see an invalid address. + DiscriminatedPointer dptr; + dptr.discriminator = len; + dptr.pointer = 0; // will be overwritten + // "Acquire" ensures that the later memcpy doesn't reorder above the + // set of the discriminator bit. 
+ base::subtle::Acquire_Store(reinterpret_cast(buf_), + bit_cast(dptr)); + } else { + buf_[0] = len; + } + memcpy(&buf_[1], src, len); + + } else { + // TODO: if already indirect and the current storage has enough space, just reuse that. + + // Set up the pointed-to data before setting a pointer to it. This ensures that readers + // never see a pointer to an invalid region (i.e one without a proper length header). + void *in_arena = CHECK_NOTNULL(alloc_arena->AllocateBytes(len + sizeof(uint32_t))); + *reinterpret_cast(in_arena) = len; + memcpy(reinterpret_cast(in_arena) + sizeof(uint32_t), src, len); + set_ptr(in_arena); + } + } + + private: + struct DiscriminatedPointer { + uint8_t discriminator : 8; + uintptr_t pointer : 54; + + bool is_indirect() const { + return discriminator == 0xff; + } + }; + + DiscriminatedPointer LoadValue() const { + if (ATOMIC) { + // Load with "Acquire" semantics -- if we load a pointer, this ensures + // that we also see the pointed-to data. + uintptr_t ptr_val = base::subtle::Acquire_Load( + reinterpret_cast(buf_)); + return bit_cast(ptr_val); + } else { + DiscriminatedPointer ret; + memcpy(&ret, buf_, sizeof(ret)); + return ret; + } + } + + // Set the internal storage to be an indirect pointer to the given + // address. + void set_ptr(void *ptr) { + uintptr_t ptr_int = reinterpret_cast(ptr); + DCHECK_EQ(ptr_int >> (kPointerBitWidth - 8), 0) << + "bad pointer (should have 0x00 MSB): " << ptr; + + DiscriminatedPointer dptr; + dptr.discriminator = 0xff; + dptr.pointer = ptr_int; + + if (ATOMIC) { + // Store with "Release" semantics -- this ensures that the pointed-to data + // is visible to any readers who see this pointer. 
+ uintptr_t to_store = bit_cast(dptr); + base::subtle::Release_Store(reinterpret_cast(buf_), + to_store); + } else { + memcpy(&buf_[0], &dptr, sizeof(dptr)); + } + } + + uint8_t buf_[STORAGE_SIZE]; + +} PACKED; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/interval_tree-inl.h b/src/kudu/util/interval_tree-inl.h new file mode 100644 index 000000000000..7e88e42b776a --- /dev/null +++ b/src/kudu/util/interval_tree-inl.h @@ -0,0 +1,313 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_INTERVAL_TREE_INL_H +#define KUDU_UTIL_INTERVAL_TREE_INL_H + +#include +#include + +namespace kudu { + +template +IntervalTree::IntervalTree(const IntervalVector &intervals) + : root_(NULL) { + if (!intervals.empty()) { + root_ = CreateNode(intervals); + } +} + +template +IntervalTree::~IntervalTree() { + delete root_; +} + +template +void IntervalTree::FindContainingPoint(const point_type &query, + IntervalVector *results) const { + if (root_) { + root_->FindContainingPoint(query, results); + } +} + +template +void IntervalTree::FindIntersectingInterval(const interval_type &query, + IntervalVector *results) const { + if (root_) { + root_->FindIntersectingInterval(query, results); + } +} + +template +static bool LessThan(const typename Traits::point_type &a, + const typename Traits::point_type &b) { + return Traits::compare(a, b) < 0; +} + +// Select a split point which attempts to evenly divide 'in' into three groups: +// (a) those that are fully left of the split point +// (b) those that overlap the split point. +// (c) those that are fully right of the split point +// These three groups are stored in the output parameters '*left', '*overlapping', +// and '*right', respectively. The selected split point is stored in *split_point. +// +// For example, the input interval set: +// +// |------1-------| |-----2-----| +// |--3--| |---4--| |----5----| +// | +// Resulting split: | Partition point +// | +// +// *left: intervals 1 and 3 +// *overlapping: interval 4 +// *right: intervals 2 and 5 +template +void IntervalTree::Partition(const IntervalVector &in, + point_type *split_point, + IntervalVector *left, + IntervalVector *overlapping, + IntervalVector *right) { + CHECK(!in.empty()); + + // Pick a split point which is the median of all of the interval boundaries. 
+ std::vector endpoints; + endpoints.reserve(in.size() * 2); + for (const interval_type &interval : in) { + endpoints.push_back(Traits::get_left(interval)); + endpoints.push_back(Traits::get_right(interval)); + } + std::sort(endpoints.begin(), endpoints.end(), LessThan); + *split_point = endpoints[endpoints.size() / 2]; + + // Partition into the groups based on the determined split point. + for (const interval_type &interval : in) { + if (Traits::compare(Traits::get_right(interval), *split_point) < 0) { + // | split point + // |------------| | + // interval + left->push_back(interval); + } else if (Traits::compare(Traits::get_left(interval), *split_point) > 0) { + // | split point + // | |------------| + // interval + right->push_back(interval); + } else { + // | split point + // | + // |------------| + // interval + overlapping->push_back(interval); + } + } +} + +template +typename IntervalTree::node_type *IntervalTree::CreateNode( + const IntervalVector &intervals) { + IntervalVector left, right, overlap; + point_type split_point; + + // First partition the input intervals and select a split point + Partition(intervals, &split_point, &left, &overlap, &right); + + // Recursively subdivide the intervals which are fully left or fully + // right of the split point into subtree nodes. + node_type *left_node = !left.empty() ? CreateNode(left) : NULL; + node_type *right_node = !right.empty() ? CreateNode(right) : NULL; + + return new node_type(split_point, left_node, overlap, right_node); +} + +namespace interval_tree_internal { + +// Node in the interval tree. +template +class ITNode { + private: + // Import types. + typedef std::vector IntervalVector; + typedef typename Traits::interval_type interval_type; + typedef typename Traits::point_type point_type; + + public: + ITNode(point_type split_point, + ITNode *left, + const IntervalVector &overlap, + ITNode *right); + ~ITNode(); + + // See IntervalTree::FindContainingPoint(...) 
+ void FindContainingPoint(const point_type &query, + IntervalVector *results) const; + + // See IntervalTree::FindIntersectingInterval(...) + void FindIntersectingInterval(const interval_type &query, + IntervalVector *results) const; + + private: + // Comparators for sorting lists of intervals. + static bool SortByAscLeft(const interval_type &a, const interval_type &b); + static bool SortByDescRight(const interval_type &a, const interval_type &b); + + // Partition point of this node. + point_type split_point_; + + // Those nodes that overlap with split_point_, in ascending order by their left side. + IntervalVector overlapping_by_asc_left_; + + // Those nodes that overlap with split_point_, in descending order by their right side. + IntervalVector overlapping_by_desc_right_; + + // Tree node for intervals fully left of split_point_, or NULL. + ITNode *left_; + + // Tree node for intervals fully right of split_point_, or NULL. + ITNode *right_; + + DISALLOW_COPY_AND_ASSIGN(ITNode); +}; + +template +bool ITNode::SortByAscLeft(const interval_type &a, const interval_type &b) { + return Traits::compare(Traits::get_left(a), Traits::get_left(b)) < 0; +} + +template +bool ITNode::SortByDescRight(const interval_type &a, const interval_type &b) { + return Traits::compare(Traits::get_right(a), Traits::get_right(b)) > 0; +} + +template +ITNode::ITNode(typename Traits::point_type split_point, + ITNode *left, const IntervalVector &overlap, + ITNode *right) + : split_point_(std::move(split_point)), left_(left), right_(right) { + // Store two copies of the set of intervals which overlap the split point: + // 1) Sorted by ascending left boundary + overlapping_by_asc_left_.assign(overlap.begin(), overlap.end()); + std::sort(overlapping_by_asc_left_.begin(), overlapping_by_asc_left_.end(), SortByAscLeft); + // 2) Sorted by descending right boundary + overlapping_by_desc_right_.assign(overlap.begin(), overlap.end()); + std::sort(overlapping_by_desc_right_.begin(), 
overlapping_by_desc_right_.end(), SortByDescRight); +} + +template +ITNode::~ITNode() { + if (left_) delete left_; + if (right_) delete right_; +} + +template +void ITNode::FindContainingPoint(const point_type &query, + IntervalVector *results) const { + int cmp = Traits::compare(query, split_point_); + if (cmp < 0) { + // None of the intervals in right_ may intersect this. + if (left_ != NULL) { + left_->FindContainingPoint(query, results); + } + + // Any intervals which start before the query point and overlap the split point + // must therefore contain the query point. + for (const interval_type &interval : overlapping_by_asc_left_) { + if (Traits::compare(Traits::get_left(interval), query) <= 0) { + results->push_back(interval); + } else { + break; + } + } + } else if (cmp > 0) { + // None of the intervals in left_ may intersect this. + if (right_ != NULL) { + right_->FindContainingPoint(query, results); + } + + // Any intervals which end after the query point and overlap the split point + // must therefore contain the query point. + for (const interval_type &interval : overlapping_by_desc_right_) { + if (Traits::compare(Traits::get_right(interval), query) >= 0) { + results->push_back(interval); + } else { + break; + } + } + } else { + DCHECK_EQ(cmp, 0); + // The query is exactly our split point -- in this case we've already got + // the computed list of overlapping intervals. + results->insert(results->end(), overlapping_by_asc_left_.begin(), + overlapping_by_asc_left_.end()); + } +} + +template +void ITNode::FindIntersectingInterval(const interval_type &query, + IntervalVector *results) const { + if (Traits::compare(Traits::get_right(query), split_point_) < 0) { + // The interval is fully left of the split point. So, it may not overlap + // with any in 'right_' + if (left_ != NULL) { + left_->FindIntersectingInterval(query, results); + } + + // Any intervals whose left edge is <= the query interval's right edge + // intersect the query interval. 
+ for (const interval_type &interval : overlapping_by_asc_left_) { + if (Traits::compare(Traits::get_left(interval),Traits::get_right(query)) <= 0) { + results->push_back(interval); + } else { + break; + } + } + } else if (Traits::compare(Traits::get_left(query), split_point_) > 0) { + // The interval is fully right of the split point. So, it may not overlap + // with any in 'left_' + if (right_ != NULL) { + right_->FindIntersectingInterval(query, results); + } + + // Any intervals whose right edge is >= the query interval's left edge + // intersect the query interval. + for (const interval_type &interval : overlapping_by_desc_right_) { + if (Traits::compare(Traits::get_right(interval), Traits::get_left(query)) >= 0) { + results->push_back(interval); + } else { + break; + } + } + } else { + // The query interval contains the split point. Therefore all other intervals + // which also contain the split point are intersecting. + results->insert(results->end(), overlapping_by_asc_left_.begin(), + overlapping_by_asc_left_.end()); + + // The query interval may _also_ intersect some in either child. + if (left_ != NULL) { + left_->FindIntersectingInterval(query, results); + } + if (right_ != NULL) { + right_->FindIntersectingInterval(query, results); + } + } +} + + +} // namespace interval_tree_internal + +} // namespace kudu + +#endif diff --git a/src/kudu/util/interval_tree-test.cc b/src/kudu/util/interval_tree-test.cc new file mode 100644 index 000000000000..09c301532b97 --- /dev/null +++ b/src/kudu/util/interval_tree-test.cc @@ -0,0 +1,201 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// All rights reserved. + +#include +#include + +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/interval_tree.h" +#include "kudu/util/interval_tree-inl.h" +#include "kudu/util/test_util.h" + +using std::vector; + +namespace kudu { + +// Test harness. +class TestIntervalTree : public KuduTest { +}; + +// Simple interval class for integer intervals. +struct IntInterval { + IntInterval(int left_, int right_) : left(left_), right(right_) {} + + bool Intersects(const IntInterval &other) const { + if (other.left > right) return false; + if (left > other.right) return false; + return true; + } + + int left, right; +}; + +// Traits definition for intervals made up of ints on either end. +struct IntTraits { + typedef int point_type; + typedef IntInterval interval_type; + static point_type get_left(const IntInterval &x) { + return x.left; + } + static point_type get_right(const IntInterval &x) { + return x.right; + } + static int compare(int a, int b) { + if (a < b) return -1; + if (a > b) return 1; + return 0; + } +}; + +// Compare intervals in a consistent way - this is only used for verifying +// that the two algorithms come up with the same results. It's not necessary +// to define this to use an interval tree. 
+static bool CompareIntervals(const IntInterval &a, const IntInterval &b) {
+  if (a.left < b.left) return true;
+  if (a.left > b.left) return false;
+  if (a.right < b.right) return true;
+  if (a.right > b.right) return false;
+  return false; // equal
+}
+
+// Stringify a list of int intervals, for easy test error reporting.
+static string Stringify(const vector<IntInterval> &intervals) {
+  string ret;
+  bool first = true;
+  for (const IntInterval &interval : intervals) {
+    if (!first) {
+      ret.append(",");
+    }
+    first = false;
+    StringAppendF(&ret, "[%d, %d]", interval.left, interval.right);
+  }
+  return ret;
+}
+
+// Find any intervals in 'intervals' which contain 'query_point' by brute force.
+static void FindContainingBruteForce(const vector<IntInterval> &intervals,
+                                     int query_point,
+                                     vector<IntInterval> *results) {
+  for (const IntInterval &i : intervals) {
+    if (query_point >= i.left && query_point <= i.right) {
+      results->push_back(i);
+    }
+  }
+}
+
+
+// Find any intervals in 'intervals' which intersect 'query_interval' by brute force.
+static void FindIntersectingBruteForce(const vector<IntInterval> &intervals,
+                                       IntInterval query_interval,
+                                       vector<IntInterval> *results) {
+  for (const IntInterval &i : intervals) {
+    if (query_interval.Intersects(i)) {
+      results->push_back(i);
+    }
+  }
+}
+
+
+// Verify that IntervalTree::FindContainingPoint yields the same results as the naive
+// brute-force O(n) algorithm.
+static void VerifyFindContainingPoint(const vector all_intervals, + const IntervalTree &tree, + int query_point) { + vector results; + tree.FindContainingPoint(query_point, &results); + std::sort(results.begin(), results.end(), CompareIntervals); + + vector brute_force; + FindContainingBruteForce(all_intervals, query_point, &brute_force); + std::sort(brute_force.begin(), brute_force.end(), CompareIntervals); + + SCOPED_TRACE(Stringify(all_intervals) + StringPrintf(" (q=%d)", query_point)); + EXPECT_EQ(Stringify(brute_force), Stringify(results)); +} + +// Verify that IntervalTree::FindIntersectingInterval yields the same results as the naive +// brute-force O(n) algorithm. +static void VerifyFindIntersectingInterval(const vector all_intervals, + const IntervalTree &tree, + const IntInterval &query_interval) { + vector results; + tree.FindIntersectingInterval(query_interval, &results); + std::sort(results.begin(), results.end(), CompareIntervals); + + vector brute_force; + FindIntersectingBruteForce(all_intervals, query_interval, &brute_force); + std::sort(brute_force.begin(), brute_force.end(), CompareIntervals); + + SCOPED_TRACE(Stringify(all_intervals) + + StringPrintf(" (q=[%d,%d])", query_interval.left, query_interval.right)); + EXPECT_EQ(Stringify(brute_force), Stringify(results)); +} + + +TEST_F(TestIntervalTree, TestBasic) { + vector intervals; + intervals.push_back(IntInterval(1, 2)); + intervals.push_back(IntInterval(3, 4)); + intervals.push_back(IntInterval(1, 4)); + IntervalTree t(intervals); + + for (int i = 0; i <= 5; i++) { + VerifyFindContainingPoint(intervals, t, i); + + for (int j = i; j <= 5; j++) { + VerifyFindIntersectingInterval(intervals, t, IntInterval(i, j)); + } + } +} + +TEST_F(TestIntervalTree, TestRandomized) { + SeedRandom(); + + // Generate 100 random intervals spanning 0-200 and build an interval tree from them. 
+ vector intervals; + for (int i = 0; i < 100; i++) { + int l = rand() % 100; // NOLINT(runtime/threadsafe_fn) + int r = l + rand() % 100; // NOLINT(runtime/threadsafe_fn) + intervals.push_back(IntInterval(l, r)); + } + IntervalTree t(intervals); + + // Test that we get the correct result on every possible query. + for (int i = -1; i < 201; i++) { + VerifyFindContainingPoint(intervals, t, i); + } + + // Test that we get the correct result for random intervals + for (int i = 0; i < 100; i++) { + int l = rand() % 100; // NOLINT(runtime/threadsafe_fn) + int r = l + rand() % 100; // NOLINT(runtime/threadsafe_fn) + VerifyFindIntersectingInterval(intervals, t, IntInterval(l, r)); + } +} + +TEST_F(TestIntervalTree, TestEmpty) { + vector empty; + IntervalTree t(empty); + + VerifyFindContainingPoint(empty, t, 1); + VerifyFindIntersectingInterval(empty, t, IntInterval(1, 2)); +} + +} // namespace kudu diff --git a/src/kudu/util/interval_tree.h b/src/kudu/util/interval_tree.h new file mode 100644 index 000000000000..8a625d174c7d --- /dev/null +++ b/src/kudu/util/interval_tree.h @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Implements an Interval Tree. 
See http://en.wikipedia.org/wiki/Interval_tree +// or CLRS for a full description of the data structure. +// +// Callers of this class should also include interval_tree-inl.h for function +// definitions. +#ifndef KUDU_UTIL_INTERVAL_TREE_H +#define KUDU_UTIL_INTERVAL_TREE_H + +#include + +#include + +#include "kudu/gutil/macros.h" + +namespace kudu { + +namespace interval_tree_internal { +template +class ITNode; +} + +// Implements an Interval Tree. +// +// An Interval Tree is a data structure which stores a set of intervals and supports +// efficient searches to determine which intervals in that set overlap a query +// point or interval. These operations are O(lg n + k) where 'n' is the number of +// intervals in the tree and 'k' is the number of results returned for a given query. +// +// This particular implementation is a static tree -- intervals may not be added or +// removed once the tree is instantiated. +// +// This class also assumes that all intervals are "closed" intervals -- the intervals +// are inclusive of their start and end points. +// +// The Traits class should have the following members: +// Traits::point_type +// a typedef for what a "point" in the range is +// +// Traits::interval_type +// a typedef for an interval +// +// static point_type get_left(const interval_type &) +// static point_type get_right(const interval_type &) +// accessors which fetch the left and right bound of the interval, respectively. +// +// static int compare(const point_type &a, const point_type &b) +// return < 0 if a < b, 0 if a == b, > 0 if a > b +// +// See interval_tree-test.cc for an example Traits class for 'int' ranges. +template +class IntervalTree { + private: + // Import types from the traits class to make code more readable. + typedef typename Traits::interval_type interval_type; + typedef typename Traits::point_type point_type; + + // And some convenience types. 
+ typedef std::vector IntervalVector; + typedef interval_tree_internal::ITNode node_type; + + public: + // Construct an Interval Tree containing the given set of intervals. + explicit IntervalTree(const IntervalVector &intervals); + + ~IntervalTree(); + + // Find all intervals in the tree which contain the query point. + // The resulting intervals are added to the 'results' vector. + // The vector is not cleared first. + void FindContainingPoint(const point_type &query, + IntervalVector *results) const; + + // Find all intervals in the tree which intersect the given interval. + // The resulting intervals are added to the 'results' vector. + // The vector is not cleared first. + void FindIntersectingInterval(const interval_type &query, + IntervalVector *results) const; + + private: + static void Partition(const IntervalVector &in, + point_type *split_point, + IntervalVector *left, + IntervalVector *overlapping, + IntervalVector *right); + + // Create a node containing the given intervals, recursively splitting down the tree. + static node_type *CreateNode(const IntervalVector &intervals); + + node_type *root_; + + DISALLOW_COPY_AND_ASSIGN(IntervalTree); +}; + + +} // namespace kudu + +#endif diff --git a/src/kudu/util/jsonreader-test.cc b/src/kudu/util/jsonreader-test.cc new file mode 100644 index 000000000000..3c54cc75853a --- /dev/null +++ b/src/kudu/util/jsonreader-test.cc @@ -0,0 +1,170 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "kudu/gutil/integral_types.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/jsonreader.h" +#include "kudu/util/test_macros.h" + +using rapidjson::Value; +using std::string; +using std::vector; +using strings::Substitute; + +namespace kudu { + +TEST(JsonReaderTest, Corrupt) { + JsonReader r(""); + Status s = r.Init(); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_STR_CONTAINS( + s.ToString(), "JSON text is corrupt: Text only contains white space(s)"); +} + +TEST(JsonReaderTest, Empty) { + JsonReader r("{}"); + ASSERT_OK(r.Init()); + JsonReader r2("[]"); + ASSERT_OK(r2.Init()); + + // Not found. + ASSERT_TRUE(r.ExtractInt32(r.root(), "foo", nullptr).IsNotFound()); + ASSERT_TRUE(r.ExtractInt64(r.root(), "foo", nullptr).IsNotFound()); + ASSERT_TRUE(r.ExtractString(r.root(), "foo", nullptr).IsNotFound()); + ASSERT_TRUE(r.ExtractObject(r.root(), "foo", nullptr).IsNotFound()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "foo", nullptr).IsNotFound()); +} + +TEST(JsonReaderTest, Basic) { + JsonReader r("{ \"foo\" : \"bar\" }"); + ASSERT_OK(r.Init()); + string foo; + ASSERT_OK(r.ExtractString(r.root(), "foo", &foo)); + ASSERT_EQ("bar", foo); + + // Bad types. 
+ ASSERT_TRUE(r.ExtractInt32(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractInt64(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "foo", nullptr).IsInvalidArgument()); +} + +TEST(JsonReaderTest, LessBasic) { + string doc = Substitute( + "{ \"small\" : 1, \"big\" : $0, \"null\" : null, \"empty\" : \"\" }", kint64max); + JsonReader r(doc); + ASSERT_OK(r.Init()); + int32_t small; + ASSERT_OK(r.ExtractInt32(r.root(), "small", &small)); + ASSERT_EQ(1, small); + int64_t big; + ASSERT_OK(r.ExtractInt64(r.root(), "big", &big)); + ASSERT_EQ(kint64max, big); + string str; + ASSERT_OK(r.ExtractString(r.root(), "null", &str)); + ASSERT_EQ("", str); + ASSERT_OK(r.ExtractString(r.root(), "empty", &str)); + ASSERT_EQ("", str); + + // Bad types. + ASSERT_TRUE(r.ExtractString(r.root(), "small", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), "small", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "small", nullptr).IsInvalidArgument()); + + ASSERT_TRUE(r.ExtractInt32(r.root(), "big", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractString(r.root(), "big", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), "big", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "big", nullptr).IsInvalidArgument()); + + ASSERT_TRUE(r.ExtractInt32(r.root(), "null", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractInt64(r.root(), "null", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), "null", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "null", nullptr).IsInvalidArgument()); + + ASSERT_TRUE(r.ExtractInt32(r.root(), "empty", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractInt64(r.root(), "empty", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), "empty", 
nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "empty", nullptr).IsInvalidArgument()); +} + +TEST(JsonReaderTest, Objects) { + JsonReader r("{ \"foo\" : { \"1\" : 1 } }"); + ASSERT_OK(r.Init()); + + const Value* foo = nullptr; + ASSERT_OK(r.ExtractObject(r.root(), "foo", &foo)); + ASSERT_TRUE(foo); + + int32_t one; + ASSERT_OK(r.ExtractInt32(foo, "1", &one)); + ASSERT_EQ(1, one); + + // Bad types. + ASSERT_TRUE(r.ExtractInt32(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractInt64(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractString(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObjectArray(r.root(), "foo", nullptr).IsInvalidArgument()); +} + +TEST(JsonReaderTest, TopLevelArray) { + JsonReader r("[ { \"name\" : \"foo\" }, { \"name\" : \"bar\" } ]"); + ASSERT_OK(r.Init()); + + vector objs; + ASSERT_OK(r.ExtractObjectArray(r.root(), nullptr, &objs)); + ASSERT_EQ(2, objs.size()); + string name; + ASSERT_OK(r.ExtractString(objs[0], "name", &name)); + ASSERT_EQ("foo", name); + ASSERT_OK(r.ExtractString(objs[1], "name", &name)); + ASSERT_EQ("bar", name); + + // Bad types. + ASSERT_TRUE(r.ExtractInt32(r.root(), nullptr, NULL).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractInt64(r.root(), nullptr, NULL).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractString(r.root(), nullptr, NULL).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), nullptr, NULL).IsInvalidArgument()); +} + +TEST(JsonReaderTest, NestedArray) { + JsonReader r("{ \"foo\" : [ { \"val\" : 0 }, { \"val\" : 1 }, { \"val\" : 2 } ] }"); + ASSERT_OK(r.Init()); + + vector foo; + ASSERT_OK(r.ExtractObjectArray(r.root(), "foo", &foo)); + ASSERT_EQ(3, foo.size()); + int i = 0; + for (const Value* v : foo) { + int32_t number; + ASSERT_OK(r.ExtractInt32(v, "val", &number)); + ASSERT_EQ(i, number); + i++; + } + + // Bad types. 
+ ASSERT_TRUE(r.ExtractInt32(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractInt64(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractString(r.root(), "foo", nullptr).IsInvalidArgument()); + ASSERT_TRUE(r.ExtractObject(r.root(), "foo", nullptr).IsInvalidArgument()); +} + +} // namespace kudu diff --git a/src/kudu/util/jsonreader.cc b/src/kudu/util/jsonreader.cc new file mode 100644 index 000000000000..e39761d1868e --- /dev/null +++ b/src/kudu/util/jsonreader.cc @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "kudu/util/jsonreader.h"
+
+#include "kudu/gutil/strings/substitute.h"
+
+using rapidjson::Value;
+using std::string;
+using std::vector;
+using strings::Substitute;
+
+namespace kudu {
+
+JsonReader::JsonReader(string text) : text_(std::move(text)) {}
+
+JsonReader::~JsonReader() {
+}
+
+Status JsonReader::Init() {
+  document_.Parse<0>(text_.c_str());
+  if (document_.HasParseError()) {
+    return Status::Corruption("JSON text is corrupt", document_.GetParseError());
+  }
+  return Status::OK();
+}
+
+Status JsonReader::ExtractInt32(const Value* object,
+                                const char* field,
+                                int32_t* result) const {
+  const Value* val;
+  RETURN_NOT_OK(ExtractField(object, field, &val));
+  if (PREDICT_FALSE(!val->IsInt())) {
+    return Status::InvalidArgument(Substitute(
+        "Wrong type during field extraction: expected int32 but got $0",
+        val->GetType()));
+  }
+  // Read with the signed accessor to match the IsInt() check above;
+  // GetUint() is invalid for negative values.
+  *result = val->GetInt();
+  return Status::OK();
+}
+
+Status JsonReader::ExtractInt64(const Value* object,
+                                const char* field,
+                                int64_t* result) const {
+  const Value* val;
+  RETURN_NOT_OK(ExtractField(object, field, &val));
+  if (PREDICT_FALSE(!val->IsInt64())) {
+    return Status::InvalidArgument(Substitute(
+        "Wrong type during field extraction: expected int64 but got $0",
+        val->GetType()));
+  }
+  // Read with the signed accessor to match the IsInt64() check above;
+  // GetUint64() is invalid for negative values.
+  *result = val->GetInt64();
+  return Status::OK();
+}
+
+Status JsonReader::ExtractString(const Value* object,
+                                 const char* field,
+                                 string* result) const {
+  const Value* val;
+  RETURN_NOT_OK(ExtractField(object, field, &val));
+  if (PREDICT_FALSE(!val->IsString())) {
+    if (val->IsNull()) {
+      *result = "";
+      return Status::OK();
+    }
+    return Status::InvalidArgument(Substitute(
+        "Wrong type during field extraction: expected string but got $0",
+        val->GetType()));
+  }
+  result->assign(val->GetString());
+  return Status::OK();
+}
+
+Status JsonReader::ExtractObject(const Value* object,
+                                 const char* field,
+                                 const Value** result) const {
+  const Value* val;
+  RETURN_NOT_OK(ExtractField(object, field, &val));
+  if
(PREDICT_FALSE(!val->IsObject())) { + return Status::InvalidArgument(Substitute( + "Wrong type during field extraction: expected object but got $0", + val->GetType())); } + *result = val; + return Status::OK(); +} + +Status JsonReader::ExtractObjectArray(const Value* object, + const char* field, + vector* result) const { + const Value* val; + RETURN_NOT_OK(ExtractField(object, field, &val)); + if (PREDICT_FALSE(!val->IsArray())) { + return Status::InvalidArgument(Substitute( + "Wrong type during field extraction: expected object array but got $0", + val->GetType())); } + for (Value::ConstValueIterator iter = val->Begin(); iter != val->End(); ++iter) { + result->push_back(iter); + } + return Status::OK(); +} + +Status JsonReader::ExtractField(const Value* object, + const char* field, + const Value** result) const { + if (field && PREDICT_FALSE(!object->HasMember(field))) { + return Status::NotFound("Missing field", field); + } + *result = field ? &(*object)[field] : object; + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/jsonreader.h b/src/kudu/util/jsonreader.h new file mode 100644 index 000000000000..2d9e9829a38c --- /dev/null +++ b/src/kudu/util/jsonreader.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_JSONREADER_H_ +#define KUDU_UTIL_JSONREADER_H_ + +#include +#include +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { + +// Wraps the JSON parsing functionality of rapidjson::Document. +// +// Unlike JsonWriter, this class does not hide rapidjson internals from +// clients. That's because there's just no easy way to implement object and +// array parsing otherwise. At most, this class aspires to be a simpler +// error-handling wrapper for reading and parsing. +class JsonReader { + public: + explicit JsonReader(std::string text); + ~JsonReader(); + + Status Init(); + + // Extractor methods. + // + // If 'field' is not NULL, will look for a field with that name in the + // given object, returning Status::NotFound if it cannot be found. If + // 'field' is NULL, will try to convert 'object' directly into the + // desire type. + + Status ExtractInt32(const rapidjson::Value* object, + const char* field, + int32_t* result) const; + + Status ExtractInt64(const rapidjson::Value* object, + const char* field, + int64_t* result) const; + + Status ExtractString(const rapidjson::Value* object, + const char* field, + std::string* result) const; + + // 'result' is only valid for as long as JsonReader is alive. + Status ExtractObject(const rapidjson::Value* object, + const char* field, + const rapidjson::Value** result) const; + + // 'result' is only valid for as long as JsonReader is alive. 
+ Status ExtractObjectArray(const rapidjson::Value* object, + const char* field, + std::vector* result) const; + + const rapidjson::Value* root() const { return &document_; } + + private: + Status ExtractField(const rapidjson::Value* object, + const char* field, + const rapidjson::Value** result) const; + + std::string text_; + rapidjson::Document document_; + + DISALLOW_COPY_AND_ASSIGN(JsonReader); +}; + +} // namespace kudu + +#endif // KUDU_UTIL_JSONREADER_H_ diff --git a/src/kudu/util/jsonwriter-test.cc b/src/kudu/util/jsonwriter-test.cc new file mode 100644 index 000000000000..81e7d85e2548 --- /dev/null +++ b/src/kudu/util/jsonwriter-test.cc @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/util/jsonwriter.h" +#include "kudu/util/jsonwriter_test.pb.h" +#include "kudu/util/test_util.h" + +using jsonwriter_test::TestAllTypes; + +namespace kudu { + +class TestJsonWriter : public KuduTest {}; + +TEST_F(TestJsonWriter, TestPBEmpty) { + TestAllTypes pb; + ASSERT_EQ("{}", JsonWriter::ToJson(pb, JsonWriter::PRETTY)); +} + +TEST_F(TestJsonWriter, TestPBAllFieldTypes) { + TestAllTypes pb; + pb.set_optional_int32(1); + pb.set_optional_int64(2); + pb.set_optional_uint32(3); + pb.set_optional_uint64(4); + pb.set_optional_sint32(5); + pb.set_optional_sint64(6); + pb.set_optional_fixed32(7); + pb.set_optional_fixed64(8); + pb.set_optional_sfixed32(9); + pb.set_optional_sfixed64(10); + pb.set_optional_float(11); + pb.set_optional_double(12); + pb.set_optional_bool(true); + pb.set_optional_string("hello world"); + pb.set_optional_nested_enum(TestAllTypes::FOO); + ASSERT_EQ("{\n" + " \"optional_int32\": 1,\n" + " \"optional_int64\": 2,\n" + " \"optional_uint32\": 3,\n" + " \"optional_uint64\": 4,\n" + " \"optional_sint32\": 5,\n" + " \"optional_sint64\": 6,\n" + " \"optional_fixed32\": 7,\n" + " \"optional_fixed64\": 8,\n" + " \"optional_sfixed32\": 9,\n" + " \"optional_sfixed64\": 10,\n" + " \"optional_float\": 11,\n" + " \"optional_double\": 12,\n" + " \"optional_bool\": true,\n" + " \"optional_string\": \"hello world\",\n" + " \"optional_nested_enum\": \"FOO\"\n" + "}", JsonWriter::ToJson(pb, JsonWriter::PRETTY)); + ASSERT_EQ("{" + "\"optional_int32\":1," + "\"optional_int64\":2," + "\"optional_uint32\":3," + "\"optional_uint64\":4," + "\"optional_sint32\":5," + "\"optional_sint64\":6," + "\"optional_fixed32\":7," + "\"optional_fixed64\":8," + "\"optional_sfixed32\":9," + "\"optional_sfixed64\":10," + "\"optional_float\":11," + "\"optional_double\":12," + "\"optional_bool\":true," + "\"optional_string\":\"hello world\"," + "\"optional_nested_enum\":\"FOO\"" + "}", JsonWriter::ToJson(pb, JsonWriter::COMPACT)); + +} + 
+TEST_F(TestJsonWriter, TestPBRepeatedPrimitives) { + TestAllTypes pb; + for (int i = 0; i <= 3; i++) { + pb.add_repeated_int32(i); + } + ASSERT_EQ("{\n" + " \"repeated_int32\": [\n" + " 0,\n" + " 1,\n" + " 2,\n" + " 3\n" + " ]\n" + "}", JsonWriter::ToJson(pb, JsonWriter::PRETTY)); + ASSERT_EQ("{\"repeated_int32\":[0,1,2,3]}", + JsonWriter::ToJson(pb, JsonWriter::COMPACT)); +} + +TEST_F(TestJsonWriter, TestPBNestedMessage) { + TestAllTypes pb; + pb.add_repeated_nested_message()->set_int_field(12345); + pb.mutable_optional_nested_message()->set_int_field(54321); + ASSERT_EQ("{\n" + " \"optional_nested_message\": {\n" + " \"int_field\": 54321\n" + " },\n" + " \"repeated_nested_message\": [\n" + " {\n" + " \"int_field\": 12345\n" + " }\n" + " ]\n" + "}", JsonWriter::ToJson(pb, JsonWriter::PRETTY)); + ASSERT_EQ("{\"optional_nested_message\":{\"int_field\":54321}," + "\"repeated_nested_message\":" + "[{\"int_field\":12345}]}", + JsonWriter::ToJson(pb, JsonWriter::COMPACT)); +} + +} // namespace kudu diff --git a/src/kudu/util/jsonwriter.cc b/src/kudu/util/jsonwriter.cc new file mode 100644 index 000000000000..bc1126483185 --- /dev/null +++ b/src/kudu/util/jsonwriter.cc @@ -0,0 +1,319 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/util/jsonwriter.h" + +#include +#include + +#include +#include +#include +#include +#include + +using google::protobuf::FieldDescriptor; +using google::protobuf::Message; +using google::protobuf::Reflection; + +using std::string; +using std::stringstream; +using std::vector; + +namespace kudu { + +// Adapter to allow RapidJSON to write directly to a stringstream. +// Since Squeasel exposes a stringstream as its interface, this is needed to avoid overcopying. +class UTF8StringStreamBuffer { + public: + explicit UTF8StringStreamBuffer(std::stringstream* out); + void Put(rapidjson::UTF8<>::Ch c); + private: + std::stringstream* out_; +}; + +// rapidjson doesn't provide any common interface between the PrettyWriter and +// Writer classes. So, we create our own pure virtual interface here, and then +// use JsonWriterImpl below to make the two different rapidjson implementations +// correspond to this subclass. +class JsonWriterIf { + public: + virtual void Null() = 0; + virtual void Bool(bool b) = 0; + virtual void Int(int i) = 0; + virtual void Uint(unsigned u) = 0; + virtual void Int64(int64_t i64) = 0; + virtual void Uint64(uint64_t u64) = 0; + virtual void Double(double d) = 0; + virtual void String(const char* str, size_t length) = 0; + virtual void String(const char* str) = 0; + virtual void String(const std::string& str) = 0; + + virtual void StartObject() = 0; + virtual void EndObject() = 0; + virtual void StartArray() = 0; + virtual void EndArray() = 0; + + virtual ~JsonWriterIf() {} +}; + +// Adapts the different rapidjson Writer implementations to our virtual +// interface above. 
+template +class JsonWriterImpl : public JsonWriterIf { + public: + explicit JsonWriterImpl(stringstream* out); + + virtual void Null() OVERRIDE; + virtual void Bool(bool b) OVERRIDE; + virtual void Int(int i) OVERRIDE; + virtual void Uint(unsigned u) OVERRIDE; + virtual void Int64(int64_t i64) OVERRIDE; + virtual void Uint64(uint64_t u64) OVERRIDE; + virtual void Double(double d) OVERRIDE; + virtual void String(const char* str, size_t length) OVERRIDE; + virtual void String(const char* str) OVERRIDE; + virtual void String(const std::string& str) OVERRIDE; + + virtual void StartObject() OVERRIDE; + virtual void EndObject() OVERRIDE; + virtual void StartArray() OVERRIDE; + virtual void EndArray() OVERRIDE; + + private: + UTF8StringStreamBuffer stream_; + T writer_; + DISALLOW_COPY_AND_ASSIGN(JsonWriterImpl); +}; + +// +// JsonWriter +// + +typedef rapidjson::PrettyWriter PrettyWriterClass; +typedef rapidjson::Writer CompactWriterClass; + +JsonWriter::JsonWriter(stringstream* out, Mode m) { + switch (m) { + case PRETTY: + impl_.reset(new JsonWriterImpl(DCHECK_NOTNULL(out))); + break; + case COMPACT: + impl_.reset(new JsonWriterImpl(DCHECK_NOTNULL(out))); + break; + } +} +JsonWriter::~JsonWriter() { +} +void JsonWriter::Null() { impl_->Null(); } +void JsonWriter::Bool(bool b) { impl_->Bool(b); } +void JsonWriter::Int(int i) { impl_->Int(i); } +void JsonWriter::Uint(unsigned u) { impl_->Uint(u); } +void JsonWriter::Int64(int64_t i64) { impl_->Int64(i64); } +void JsonWriter::Uint64(uint64_t u64) { impl_->Uint64(u64); } +void JsonWriter::Double(double d) { impl_->Double(d); } +void JsonWriter::String(const char* str, size_t length) { impl_->String(str, length); } +void JsonWriter::String(const char* str) { impl_->String(str); } +void JsonWriter::String(const string& str) { impl_->String(str); } +void JsonWriter::StartObject() { impl_->StartObject(); } +void JsonWriter::EndObject() { impl_->EndObject(); } +void JsonWriter::StartArray() { impl_->StartArray(); } +void 
JsonWriter::EndArray() { impl_->EndArray(); } + +// Specializations for common primitive metric types. +template<> void JsonWriter::Value(const bool& val) { + Bool(val); +} +template<> void JsonWriter::Value(const int32_t& val) { + Int(val); +} +template<> void JsonWriter::Value(const uint32_t& val) { + Uint(val); +} +template<> void JsonWriter::Value(const int64_t& val) { + Int64(val); +} +template<> void JsonWriter::Value(const uint64_t& val) { + Uint64(val); +} +template<> void JsonWriter::Value(const double& val) { + Double(val); +} +template<> void JsonWriter::Value(const string& val) { + String(val); +} + +#if defined(__APPLE__) +template<> void JsonWriter::Value(const size_t& val) { + Uint64(val); +} +#endif + +void JsonWriter::Protobuf(const Message& pb) { + const Reflection* reflection = pb.GetReflection(); + vector fields; + reflection->ListFields(pb, &fields); + + StartObject(); + for (const FieldDescriptor* field : fields) { + String(field->name()); + if (field->is_repeated()) { + StartArray(); + for (int i = 0; i < reflection->FieldSize(pb, field); i++) { + ProtobufRepeatedField(pb, field, i); + } + EndArray(); + } else { + ProtobufField(pb, field); + } + } + EndObject(); +} + +void JsonWriter::ProtobufField(const Message& pb, const FieldDescriptor* field) { + const Reflection* reflection = pb.GetReflection(); + switch (field->cpp_type()) { + case FieldDescriptor::CPPTYPE_INT32: + Int(reflection->GetInt32(pb, field)); + break; + case FieldDescriptor::CPPTYPE_INT64: + Int64(reflection->GetInt64(pb, field)); + break; + case FieldDescriptor::CPPTYPE_UINT32: + Uint(reflection->GetUInt32(pb, field)); + break; + case FieldDescriptor::CPPTYPE_UINT64: + Uint64(reflection->GetUInt64(pb, field)); + break; + case FieldDescriptor::CPPTYPE_DOUBLE: + Double(reflection->GetDouble(pb, field)); + break; + case FieldDescriptor::CPPTYPE_FLOAT: + Double(reflection->GetFloat(pb, field)); + break; + case FieldDescriptor::CPPTYPE_BOOL: + Bool(reflection->GetBool(pb, field)); 
+ break; + case FieldDescriptor::CPPTYPE_ENUM: + String(reflection->GetEnum(pb, field)->name()); + break; + case FieldDescriptor::CPPTYPE_STRING: + String(reflection->GetString(pb, field)); + break; + case FieldDescriptor::CPPTYPE_MESSAGE: + Protobuf(reflection->GetMessage(pb, field)); + break; + default: + LOG(FATAL) << "Unknown cpp_type: " << field->cpp_type(); + } +} + +void JsonWriter::ProtobufRepeatedField(const Message& pb, const FieldDescriptor* field, int index) { + const Reflection* reflection = pb.GetReflection(); + switch (field->cpp_type()) { + case FieldDescriptor::CPPTYPE_INT32: + Int(reflection->GetRepeatedInt32(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_INT64: + Int64(reflection->GetRepeatedInt64(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_UINT32: + Uint(reflection->GetRepeatedUInt32(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_UINT64: + Uint64(reflection->GetRepeatedUInt64(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_DOUBLE: + Double(reflection->GetRepeatedDouble(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_FLOAT: + Double(reflection->GetRepeatedFloat(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_BOOL: + Bool(reflection->GetRepeatedBool(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_ENUM: + String(reflection->GetRepeatedEnum(pb, field, index)->name()); + break; + case FieldDescriptor::CPPTYPE_STRING: + String(reflection->GetRepeatedString(pb, field, index)); + break; + case FieldDescriptor::CPPTYPE_MESSAGE: + Protobuf(reflection->GetRepeatedMessage(pb, field, index)); + break; + default: + LOG(FATAL) << "Unknown cpp_type: " << field->cpp_type(); + } +} + +string JsonWriter::ToJson(const Message& pb, Mode mode) { + stringstream stream; + JsonWriter writer(&stream, mode); + writer.Protobuf(pb); + return stream.str(); +} + +// +// UTF8StringStreamBuffer +// + +UTF8StringStreamBuffer::UTF8StringStreamBuffer(std::stringstream* out) + : 
out_(DCHECK_NOTNULL(out)) { +} + +void UTF8StringStreamBuffer::Put(rapidjson::UTF8<>::Ch c) { + out_->put(c); +} + +// +// JsonWriterImpl: simply forward to the underlying implementation. +// + +template +JsonWriterImpl::JsonWriterImpl(stringstream* out) + : stream_(DCHECK_NOTNULL(out)), + writer_(stream_) { +} +template +void JsonWriterImpl::Null() { writer_.Null(); } +template +void JsonWriterImpl::Bool(bool b) { writer_.Bool(b); } +template +void JsonWriterImpl::Int(int i) { writer_.Int(i); } +template +void JsonWriterImpl::Uint(unsigned u) { writer_.Uint(u); } +template +void JsonWriterImpl::Int64(int64_t i64) { writer_.Int64(i64); } +template +void JsonWriterImpl::Uint64(uint64_t u64) { writer_.Uint64(u64); } +template +void JsonWriterImpl::Double(double d) { writer_.Double(d); } +template +void JsonWriterImpl::String(const char* str, size_t length) { writer_.String(str, length); } +template +void JsonWriterImpl::String(const char* str) { writer_.String(str); } +template +void JsonWriterImpl::String(const string& str) { writer_.String(str.c_str(), str.length()); } +template +void JsonWriterImpl::StartObject() { writer_.StartObject(); } +template +void JsonWriterImpl::EndObject() { writer_.EndObject(); } +template +void JsonWriterImpl::StartArray() { writer_.StartArray(); } +template +void JsonWriterImpl::EndArray() { writer_.EndArray(); } + +} // namespace kudu diff --git a/src/kudu/util/jsonwriter.h b/src/kudu/util/jsonwriter.h new file mode 100644 index 000000000000..4e1c9fd45d5b --- /dev/null +++ b/src/kudu/util/jsonwriter.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_JSONWRITER_H +#define KUDU_UTIL_JSONWRITER_H + +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" + +namespace google { +namespace protobuf { +class Message; +class FieldDescriptor; +} // namespace protobuf +} // namespace google + +namespace kudu { + +class JsonWriterIf; + +// Acts as a pimpl for rapidjson so that not all metrics users must bring in the +// rapidjson library, which is template-based and therefore hard to forward-declare. +// +// This class implements all the methods of rapidjson::JsonWriter, plus an +// additional convenience method for String(std::string). +// +// We take an instance of std::stringstream in the constructor because Mongoose / Squeasel +// uses std::stringstream for output buffering. +class JsonWriter { + public: + enum Mode { + // Pretty-print the JSON, with nice indentation, newlines, etc. + PRETTY, + // Print the JSON as compactly as possible. + COMPACT + }; + + JsonWriter(std::stringstream* out, Mode mode); + ~JsonWriter(); + + void Null(); + void Bool(bool b); + void Int(int i); + void Uint(unsigned u); + void Int64(int64_t i64); + void Uint64(uint64_t u64); + void Double(double d); + void String(const char* str, size_t length); + void String(const char* str); + void String(const std::string& str); + + void Protobuf(const google::protobuf::Message& message); + + template + void Value(const T& val); + + void StartObject(); + void EndObject(); + void StartArray(); + void EndArray(); + + // Convert the given protobuf to JSON format. 
+ static std::string ToJson(const google::protobuf::Message& pb, + Mode mode); + + private: + void ProtobufField(const google::protobuf::Message& pb, + const google::protobuf::FieldDescriptor* field); + void ProtobufRepeatedField(const google::protobuf::Message& pb, + const google::protobuf::FieldDescriptor* field, + int index); + + gscoped_ptr impl_; + DISALLOW_COPY_AND_ASSIGN(JsonWriter); +}; + +} // namespace kudu + +#endif // KUDU_UTIL_JSONWRITER_H diff --git a/src/kudu/util/jsonwriter_test.proto b/src/kudu/util/jsonwriter_test.proto new file mode 100644 index 000000000000..d16cc11def9c --- /dev/null +++ b/src/kudu/util/jsonwriter_test.proto @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package jsonwriter_test; + +// This proto includes every type of field in both singular and repeated +// forms. This is mostly copied from 'unittest.proto' in the protobuf source +// (hence the odd field numbers which skip some). 
+message TestAllTypes { + message NestedMessage { + optional int32 int_field = 1; + } + + enum NestedEnum { + FOO = 1; + BAR = 2; + BAZ = 3; + } + + // Singular + optional int32 optional_int32 = 1; + optional int64 optional_int64 = 2; + optional uint32 optional_uint32 = 3; + optional uint64 optional_uint64 = 4; + optional sint32 optional_sint32 = 5; + optional sint64 optional_sint64 = 6; + optional fixed32 optional_fixed32 = 7; + optional fixed64 optional_fixed64 = 8; + optional sfixed32 optional_sfixed32 = 9; + optional sfixed64 optional_sfixed64 = 10; + optional float optional_float = 11; + optional double optional_double = 12; + optional bool optional_bool = 13; + optional string optional_string = 14; + optional bytes optional_bytes = 15; + + optional NestedMessage optional_nested_message = 18; + optional NestedEnum optional_nested_enum = 21; + + // Repeated + repeated int32 repeated_int32 = 31; + repeated int64 repeated_int64 = 32; + repeated uint32 repeated_uint32 = 33; + repeated uint64 repeated_uint64 = 34; + repeated sint32 repeated_sint32 = 35; + repeated sint64 repeated_sint64 = 36; + repeated fixed32 repeated_fixed32 = 37; + repeated fixed64 repeated_fixed64 = 38; + repeated sfixed32 repeated_sfixed32 = 39; + repeated sfixed64 repeated_sfixed64 = 40; + repeated float repeated_float = 41; + repeated double repeated_double = 42; + repeated bool repeated_bool = 43; + repeated string repeated_string = 44; + repeated bytes repeated_bytes = 45; + + repeated NestedMessage repeated_nested_message = 48; + repeated NestedEnum repeated_nested_enum = 51; +} diff --git a/src/kudu/util/kernel_stack_watchdog.cc b/src/kudu/util/kernel_stack_watchdog.cc new file mode 100644 index 000000000000..486fc72f0247 --- /dev/null +++ b/src/kudu/util/kernel_stack_watchdog.cc @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/kernel_stack_watchdog.h" + +#include +#include +#include +#include + +#include "kudu/util/debug-util.h" +#include "kudu/util/env.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/thread.h" +#include "kudu/util/status.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/substitute.h" + +DEFINE_int32(hung_task_check_interval_ms, 200, + "Number of milliseconds in between checks for hung threads"); +TAG_FLAG(hung_task_check_interval_ms, hidden); + +using strings::Substitute; + +namespace kudu { + +DEFINE_STATIC_THREAD_LOCAL(KernelStackWatchdog::TLS, + KernelStackWatchdog, tls_); + +KernelStackWatchdog::KernelStackWatchdog() + : log_collector_(nullptr), + finish_(1) { + CHECK_OK(Thread::Create("kernel-watchdog", "kernel-watcher", + boost::bind(&KernelStackWatchdog::RunThread, this), + &thread_)); +} + +KernelStackWatchdog::~KernelStackWatchdog() { + finish_.CountDown(); + CHECK_OK(ThreadJoiner(thread_.get()).Join()); +} + +void KernelStackWatchdog::SaveLogsForTests(bool save_logs) { + MutexLock l(lock_); + if (save_logs) { + log_collector_.reset(new vector()); + } else { + log_collector_.reset(); + } +} + +vector KernelStackWatchdog::LoggedMessagesForTests() const { + MutexLock l(lock_); + CHECK(log_collector_) << "Must call 
SaveLogsForTests(true) first"; + return *log_collector_; +} + +void KernelStackWatchdog::Register(TLS* tls) { + int64_t tid = Thread::CurrentThreadId(); + MutexLock l(lock_); + InsertOrDie(&tls_by_tid_, tid, tls); +} + +void KernelStackWatchdog::Unregister(TLS* tls) { + int64_t tid = Thread::CurrentThreadId(); + MutexLock l(lock_); + CHECK(tls_by_tid_.erase(tid)); +} + +Status GetKernelStack(pid_t p, string* ret) { + faststring buf; + RETURN_NOT_OK(ReadFileToString(Env::Default(), Substitute("/proc/$0/stack", p), &buf)); + *ret = buf.ToString(); + return Status::OK(); +} + +void KernelStackWatchdog::RunThread() { + while (true) { + MonoDelta delta = MonoDelta::FromMilliseconds(FLAGS_hung_task_check_interval_ms); + if (finish_.WaitFor(delta)) { + // Watchdog exiting. + break; + } + + { + MutexLock l(lock_); + MicrosecondsInt64 now = GetMonoTimeMicros(); + + for (const TLSMap::value_type& map_entry : tls_by_tid_) { + pid_t p = map_entry.first; + const TLS::Data* tls = &map_entry.second->data_; + + TLS::Data tls_copy; + tls->SnapshotCopy(&tls_copy); + + for (int i = 0; i < tls_copy.depth_; i++) { + TLS::Frame* frame = &tls_copy.frames_[i]; + + int paused_ms = (now - frame->start_time_) / 1000; + if (paused_ms > frame->threshold_ms_) { + string kernel_stack; + Status s = GetKernelStack(p, &kernel_stack); + if (!s.ok()) { + // Can't read the kernel stack of the pid -- it's possible that the thread exited + // while we were iterating, so just ignore it. 
+ kernel_stack = "(could not read kernel stack)"; + } + + string user_stack = DumpThreadStack(p); + LOG_STRING(WARNING, log_collector_.get()) + << "Thread " << p << " stuck at " << frame->status_ + << " for " << paused_ms << "ms" << ":\n" + << "Kernel stack:\n" << kernel_stack << "\n" + << "User stack:\n" << user_stack; + } + } + } + } + } +} + +KernelStackWatchdog::TLS* KernelStackWatchdog::GetTLS() { + INIT_STATIC_THREAD_LOCAL(KernelStackWatchdog::TLS, tls_); + return tls_; +} + +KernelStackWatchdog::TLS::TLS() { + memset(&data_, 0, sizeof(data_)); + KernelStackWatchdog::GetInstance()->Register(this); +} + +KernelStackWatchdog::TLS::~TLS() { + KernelStackWatchdog::GetInstance()->Unregister(this); +} + +// Optimistic concurrency control approach to snapshot the value of another +// thread's TLS, even though that thread might be changing it. +// +// Called by the watchdog thread to see if a target thread is currently in the +// middle of a watched section. +void KernelStackWatchdog::TLS::Data::SnapshotCopy(Data* copy) const { + while (true) { + Atomic32 v_0 = base::subtle::Acquire_Load(&seq_lock_); + if (v_0 & 1) { + // If the value is odd, then the thread is in the middle of modifying + // its TLS, and we have to spin. + base::subtle::PauseCPU(); + continue; + } + ANNOTATE_IGNORE_READS_BEGIN(); + memcpy(copy, this, sizeof(*copy)); + ANNOTATE_IGNORE_READS_END(); + Atomic32 v_1 = base::subtle::Release_Load(&seq_lock_); + + // If the value hasn't changed since we started the copy, then + // we know that the copy was a consistent snapshot. + if (v_1 == v_0) break; + } +} + +} // namespace kudu diff --git a/src/kudu/util/kernel_stack_watchdog.h b/src/kudu/util/kernel_stack_watchdog.h new file mode 100644 index 000000000000..1a98affccb80 --- /dev/null +++ b/src/kudu/util/kernel_stack_watchdog.h @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This class defines a singleton thread which manages a map of other thread IDs to +// watch. Before performing some operation which may stall (eg IO) or which we expect +// should be short (e.g. a callback on a critical thread that should not block), threads +// may mark themselves as "watched", with a threshold beyond which they would like +// warnings to be emitted including their stack trace at that time. +// +// In the background, a separate watchdog thread periodically wakes up, and if a thread +// has been marked longer than its provided threshold, it will dump the stack trace +// of that thread (both kernel-mode and user-mode stacks). +// +// This can be useful for diagnosing I/O stalls coming from the kernel, for example. +// +// Users will typically use the macro SCOPED_WATCH_STACK. Example usage: +// +// // We expect the Write() to return in <100ms. If it takes longer than that +// // we'll see warnings indicating why it is stalled. +// { +// SCOPED_WATCH_STACK(100); +// file->Write(...); +// } +// +// If the Write call takes too long, a stack trace will be logged at WARNING level. +// Note that the threshold time parameter is not a guarantee that a stall will be +// caught by the watchdog thread. 
The watchdog only wakes up periodically to look +// for threads that have been stalled too long. For example, if the threshold is 10ms +// and the thread blocks for only 20ms, it's quite likely that the watchdog will +// have missed the event. +// +// The SCOPED_WATCH_STACK macro is designed to have minimal overhead: approximately +// equivalent to a clock_gettime() and a single 'mfence' instruction. Micro-benchmarks +// measure the cost at about 50ns per call. Thus, it may safely be used in hot code +// paths. +// +// Scopes with SCOPED_WATCH_STACK may be nested, but only up to a hard-coded limited depth +// (currently 8). +#ifndef KUDU_UTIL_KERNEL_STACK_WATCHDOG_H +#define KUDU_UTIL_KERNEL_STACK_WATCHDOG_H + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/singleton.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/mutex.h" +#include "kudu/util/monotime.h" +#include "kudu/util/threadlocal.h" + +#define SCOPED_WATCH_STACK(threshold_ms) \ + ScopedWatchKernelStack _stack_watcher(__FILE__ ":" AS_STRING(__LINE__), threshold_ms) + +namespace kudu { + +class Thread; + +// Singleton thread which implements the watchdog. +class KernelStackWatchdog { + public: + static KernelStackWatchdog* GetInstance() { + return Singleton::get(); + } + + // Instead of logging through glog, log warning messages into a vector. + // + // If 'save_logs' is true, will start saving to the vector, and forget any + // previously logged messages. + // If 'save_logs' is false, disables this functionality. + void SaveLogsForTests(bool save_logs); + + // Return any log messages saved since the last call to SaveLogsForTests(true). 
+ std::vector LoggedMessagesForTests() const; + + private: + friend class Singleton; + friend class ScopedWatchKernelStack; + + // The thread-local state which captures whether a thread should be watched by + // the watchdog. This structure is constructed as a thread-local on first use + // and destructed when the thread exits. Upon construction, the TLS structure + // registers itself with the WatchDog, and on destruction, unregisters itself. + // + // See 'seq_lock_' below for details on thread-safe operation. + struct TLS { + TLS(); + ~TLS(); + + enum Constants { + // The maximum nesting depth of SCOPED_WATCH_STACK() macros. + kMaxDepth = 8 + }; + + // Because we support nested SCOPED_WATCH_STACK() macros, we need to capture + // multiple active frames within the TLS. + struct Frame { + // The time at which this frame entered the SCOPED_WATCH_STACK section. + // We use MicrosecondsInt64 instead of MonoTime because it inlines a bit + // better. + MicrosecondsInt64 start_time_; + // The threshold of time beyond which the watchdog should emit warnings. + int threshold_ms_; + // A string explaining the state that the thread is in (typically a file:line + // string). This is expected to be static storage and is not freed. + const char* status_; + }; + + // The data within the TLS. This is a POD type so that the watchdog can easily + // copy data out of a thread's TLS. + struct Data { + Frame frames_[kMaxDepth]; + Atomic32 depth_; + + // Counter implementing a simple "sequence lock". + // + // Before modifying any data inside its TLS, the watched thread increments this value so it is + // odd. When the modifications are complete, it increments it again, making it even. + // + // To read the TLS data from a target thread, the watchdog thread waits for the value + // to become even, indicating that no write is in progress. Then, it does a potentially + // racy copy of the entire 'Data' structure. Then, it validates the value again. 
+ // If it is has not changed, then the snapshot is guaranteed to be consistent. + // + // We use this type of locking to ensure that the watched thread is as fast as possible, + // allowing us to use SCOPED_WATCH_STACK even in hot code paths. In particular, + // the watched thread is wait-free, since it doesn't need to loop or retry. In addition, the + // memory is only written by that thread, eliminating any cache-line bouncing. The watchdog + // thread may have to loop multiple times to see a consistent snapshot, but we're OK delaying + // the watchdog arbitrarily since it isn't on any critical path. + Atomic32 seq_lock_; + + // Take a consistent snapshot of this data into 'dst'. This may block if the target thread + // is currently modifying its TLS. + void SnapshotCopy(Data* dst) const; + }; + Data data_; + }; + + KernelStackWatchdog(); + ~KernelStackWatchdog(); + + // Get or create the TLS for the current thread. + static TLS* GetTLS(); + + // Register a new thread's TLS with the watchdog. + // Called by any thread the first time it enters a watched section, when its TLS + // is constructed. + void Register(TLS* tls); + + // Called when a thread's TLS is destructed (i.e. when the thread exits). + void Unregister(TLS* tls); + + // The actual watchdog loop that the watchdog thread runs. + void RunThread(); + + DECLARE_STATIC_THREAD_LOCAL(TLS, tls_); + + typedef std::unordered_map TLSMap; + TLSMap tls_by_tid_; + + // If non-NULL, warnings will be emitted into this vector instead of glog. + // Used by tests. + gscoped_ptr > log_collector_; + + // Lock protecting tls_by_tid_ and log_collector_. + mutable Mutex lock_; + + // The watchdog thread itself. + scoped_refptr thread_; + + // Signal to stop the watchdog. + CountDownLatch finish_; + + DISALLOW_COPY_AND_ASSIGN(KernelStackWatchdog); +}; + +// Scoped object which marks the current thread for watching. 
+class ScopedWatchKernelStack { + public: + // If the current scope is active more than 'threshold_ms' milliseconds, the + // watchdog thread will log a warning including the message 'label'. 'label' + // is not copied or freed. + ScopedWatchKernelStack(const char* label, int threshold_ms) { + // Rather than just using the lazy GetTLS() method, we'll first try to load + // the TLS ourselves. This is usually successful, and avoids us having to inline + // the TLS construction path at call sites. + KernelStackWatchdog::TLS* tls = KernelStackWatchdog::tls_; + if (PREDICT_FALSE(tls == NULL)) { + tls = KernelStackWatchdog::GetTLS(); + } + KernelStackWatchdog::TLS::Data* tls_data = &tls->data_; + + // "Acquire" the sequence lock. While the lock value is odd, readers will block. + // TODO: technically this barrier is stronger than we need: we are the only writer + // to this data, so it's OK to allow loads from within the critical section to + // reorder above this next line. All we need is a "StoreStore" barrier (i.e. + // prevent any stores in the critical section from getting reordered above the + // increment of the counter). However, atomicops.h doesn't provide such a barrier + // as of yet, so we'll do the slightly more expensive one for now. + base::subtle::Acquire_Store(&tls_data->seq_lock_, tls_data->seq_lock_ + 1); + + KernelStackWatchdog::TLS::Frame* frame = &tls_data->frames_[tls_data->depth_++]; + DCHECK_LE(tls_data->depth_, KernelStackWatchdog::TLS::kMaxDepth); + frame->start_time_ = GetMonoTimeMicros(); + frame->threshold_ms_ = threshold_ms; + frame->status_ = label; + + // "Release" the sequence lock. This resets the lock value to be even, so readers + // will proceed. 
+ base::subtle::Release_Store(&tls_data->seq_lock_, tls_data->seq_lock_ + 1); + } + + ~ScopedWatchKernelStack() { + KernelStackWatchdog::TLS::Data* tls = &DCHECK_NOTNULL(KernelStackWatchdog::tls_)->data_; + int d = tls->depth_; + DCHECK_GT(d, 0); + + // We don't bother with a lock/unlock, because the change we're making here is atomic. + // If we race with the watchdog, either they'll see the old depth_ or the new depth_, + // but in either case the underlying data is perfectly valid. + base::subtle::NoBarrier_Store(&tls->depth_, d - 1); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ScopedWatchKernelStack); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_KERNEL_STACK_WATCHDOG_H */ diff --git a/src/kudu/util/knapsack_solver-test.cc b/src/kudu/util/knapsack_solver-test.cc new file mode 100644 index 000000000000..703255ffadab --- /dev/null +++ b/src/kudu/util/knapsack_solver-test.cc @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include "kudu/util/knapsack_solver.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +using std::vector; + +namespace kudu { + +class TestKnapsack : public KuduTest { +}; + +// A simple test item for use with the knapsack solver. +// The real code will be solving knapsack over RowSet objects -- +// using simple value/weight pairs in the tests makes it standalone. +struct TestItem { + TestItem(double v, int w) + : value(v), weight(w) { + } + + double value; + int weight; +}; + +// A traits class to adapt the knapsack solver to TestItem. +struct TestItemTraits { + typedef TestItem item_type; + typedef double value_type; + static int get_weight(const TestItem &item) { + return item.weight; + } + static value_type get_value(const TestItem &item) { + return item.value; + } +}; + +// Generate random items into the provided vector. +static void GenerateRandomItems(int n_items, int max_weight, + vector *out) { + for (int i = 0; i < n_items; i++) { + double value = 10000.0 / (random() % 10000 + 1); + int weight = random() % max_weight; + out->push_back(TestItem(value, weight)); + } +} + +// Join and stringify the given list of ints. 
+static string JoinInts(const vector &ints) { + string ret; + for (int i = 0; i < ints.size(); i++) { + if (i > 0) { + ret.push_back(','); + } + ret.append(boost::lexical_cast(ints[i])); + } + return ret; +} + +TEST_F(TestKnapsack, Basics) { + KnapsackSolver solver; + + vector in; + in.push_back(TestItem(500, 3)); + in.push_back(TestItem(110, 1)); + in.push_back(TestItem(125, 1)); + in.push_back(TestItem(100, 1)); + + vector out; + double max_val; + + // For 1 weight, pick item 2 + solver.Solve(in, 1, &out, &max_val); + ASSERT_DOUBLE_EQ(125, max_val); + ASSERT_EQ("2", JoinInts(out)); + out.clear(); + + // For 2 weight, pick item 1, 2 + solver.Solve(in, 2, &out, &max_val); + ASSERT_DOUBLE_EQ(110 + 125, max_val); + ASSERT_EQ("2,1", JoinInts(out)); + out.clear(); + + // For 3 weight, pick item 0 + solver.Solve(in, 3, &out, &max_val); + ASSERT_DOUBLE_EQ(500, max_val); + ASSERT_EQ("0", JoinInts(out)); + out.clear(); + + // For 10 weight, pick all. + solver.Solve(in, 10, &out, &max_val); + ASSERT_DOUBLE_EQ(500 + 110 + 125 + 100, max_val); + ASSERT_EQ("3,2,1,0", JoinInts(out)); + out.clear(); +} + +// Test which generates random knapsack instances and verifies +// that the result satisfies the constraints. +TEST_F(TestKnapsack, Randomized) { + SeedRandom(); + KnapsackSolver solver; + + const int kNumTrials = AllowSlowTests() ? 200 : 1; + const int kMaxWeight = 1000; + const int kNumItems = 1000; + + for (int i = 0; i < kNumTrials; i++) { + vector in; + vector out; + GenerateRandomItems(kNumItems, kMaxWeight, &in); + double max_val; + int max_weight = random() % kMaxWeight; + solver.Solve(in, max_weight, &out, &max_val); + + // Verify that the max_val is equal to the sum of the chosen items' values. 
+ double sum_val = 0; + int sum_weight = 0; + for (int i : out) { + sum_val += in[i].value; + sum_weight += in[i].weight; + } + ASSERT_NEAR(max_val, sum_val, 0.000001); + ASSERT_LE(sum_weight, max_weight); + } +} + +#ifdef NDEBUG +TEST_F(TestKnapsack, Benchmark) { + KnapsackSolver solver; + + const int kNumTrials = 1000; + const int kMaxWeight = 1000; + const int kNumItems = 1000; + + vector in; + GenerateRandomItems(kNumItems, kMaxWeight, &in); + + LOG_TIMING(INFO, "benchmark") { + vector out; + for (int i = 0; i < kNumTrials; i++) { + out.clear(); + double max_val; + solver.Solve(in, random() % kMaxWeight, &out, &max_val); + } + } +} +#endif + +} // namespace kudu diff --git a/src/kudu/util/knapsack_solver.h b/src/kudu/util/knapsack_solver.h new file mode 100644 index 000000000000..2c370659b58b --- /dev/null +++ b/src/kudu/util/knapsack_solver.h @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_KNAPSACK_SOLVER_H +#define KUDU_UTIL_KNAPSACK_SOLVER_H + +#include +#include +#include +#include +#include "kudu/gutil/macros.h" + +namespace kudu { + +// Solver for the 0-1 knapsack problem. This uses dynamic programming +// to solve the problem exactly. 
+// +// Given a knapsack capacity of 'W' and a number of potential items 'n', +// this solver is O(nW) time and space. +// +// This implementation is cribbed from wikipedia. The only interesting +// bit here that doesn't directly match the pseudo-code is that we +// maintain the "taken" bitmap keeping track of which items were +// taken, so we can efficiently "trace back" the chosen items. +template +class KnapsackSolver { + public: + typedef typename Traits::item_type item_type; + typedef typename Traits::value_type value_type; + typedef std::pair solution_type; + + KnapsackSolver() {} + ~KnapsackSolver() {} + + // Solve a knapsack problem in one shot. Finds the set of + // items in 'items' such that their weights add up to no + // more than 'knapsack_capacity' and maximizes the sum + // of their values. + // The indexes of the chosen items are stored in 'chosen_items', + // and the maximal value is stored in 'optimal_value'. + void Solve(std::vector &items, + int knapsack_capacity, + std::vector* chosen_items, + value_type* optimal_value); + + + // The following functions are a more advanced API for solving + // knapsack problems, allowing the caller to obtain incremental + // results as each item is considered. See the implementation of + // Solve() for usage. + + // Prepare to solve a knapsack problem with the given capacity and + // item set. The vector of items must remain valid and unchanged + // until the next call to Reset(). + void Reset(int knapsack_capacity, + const std::vector* items); + + // Process the next item in 'items'. Returns false if there + // were no more items to process. + bool ProcessNext(); + + // Returns the current best solution after the most recent ProcessNext + // call. *solution is a pair of (knapsack weight used, value obtained). + solution_type GetSolution(); + + // Trace the path of item indexes used to achieve the given best + // solution as of the latest ProcessNext() call. 
+ void TracePath(const solution_type& best, + std::vector* chosen_items); + + private: + + // The state kept by the DP algorithm. + class KnapsackBlackboard { + public: + typedef std::pair solution_type; + KnapsackBlackboard() : + n_items_(0), + n_weights_(0), + cur_item_idx_(0), + best_solution_(0, 0) { + } + + void ResizeAndClear(int n_items, int max_weight); + + // Current maximum value at the given weight + value_type &max_at(int weight) { + DCHECK_GE(weight, 0); + DCHECK_LT(weight, n_weights_); + return max_value_[weight]; + } + + // Consider the next item to be put into the knapsack + // Moves the "state" of the solution forward + void Advance(value_type new_val, int new_wt); + + // How many items have been considered + int current_item_index() const { return cur_item_idx_; } + + bool item_taken(int item, int weight) const { + DCHECK_GE(weight, 0); + DCHECK_LT(weight, n_weights_); + DCHECK_GE(item, 0); + DCHECK_LT(item, n_items_); + return item_taken_[index(item, weight)]; + } + + solution_type best_solution() { return best_solution_; } + + bool done() { return cur_item_idx_ == n_items_; } + + private: + void MarkTaken(int item, int weight) { + item_taken_[index(item, weight)] = true; + } + + // If the dynamic programming matrix has more than this number of cells, + // then warn. + static const int kWarnDimension = 10000000; + + int index(int item, int weight) const { + return n_weights_ * item + weight; + } + + // vector with maximum value at the i-th position meaning that it is + // the maximum value you can get given a knapsack of weight capacity i + // while only considering items 0..cur_item_idx_-1 + std::vector max_value_; + std::vector item_taken_; // TODO: record difference vectors? 
+ int n_items_, n_weights_; + int cur_item_idx_; + // Best current solution + solution_type best_solution_; + + DISALLOW_COPY_AND_ASSIGN(KnapsackBlackboard); + }; + + KnapsackBlackboard bb_; + const std::vector* items_; + int knapsack_capacity_; + + DISALLOW_COPY_AND_ASSIGN(KnapsackSolver); +}; + +template +inline void KnapsackSolver::Reset(int knapsack_capacity, + const std::vector* items) { + DCHECK_GE(knapsack_capacity, 0); + items_ = items; + knapsack_capacity_ = knapsack_capacity; + bb_.ResizeAndClear(items->size(), knapsack_capacity); +} + +template +inline bool KnapsackSolver::ProcessNext() { + if (bb_.done()) return false; + + const item_type& item = (*items_)[bb_.current_item_index()]; + int item_weight = Traits::get_weight(item); + value_type item_value = Traits::get_value(item); + bb_.Advance(item_value, item_weight); + + return true; +} + +template +inline void KnapsackSolver::Solve(std::vector &items, + int knapsack_capacity, + std::vector* chosen_items, + value_type* optimal_value) { + Reset(knapsack_capacity, &items); + + while (ProcessNext()) { + } + + solution_type best = GetSolution(); + *optimal_value = best.second; + TracePath(best, chosen_items); +} + +template +inline typename KnapsackSolver::solution_type KnapsackSolver::GetSolution() { + return bb_.best_solution(); +} + +template +inline void KnapsackSolver::TracePath(const solution_type& best, + std::vector* chosen_items) { + chosen_items->clear(); + // Retrace back which set of items corresponded to this value. 
+ int w = best.first; + chosen_items->clear(); + for (int k = bb_.current_item_index() - 1; k >= 0; k--) { + if (bb_.item_taken(k, w)) { + const item_type& taken = (*items_)[k]; + chosen_items->push_back(k); + w -= Traits::get_weight(taken); + DCHECK_GE(w, 0); + } + } +} + +template +void KnapsackSolver::KnapsackBlackboard::ResizeAndClear(int n_items, + int max_weight) { + CHECK_GT(n_items, 0); + CHECK_GE(max_weight, 0); + + // Rather than zero-indexing the weights, we size the array from + // 0 to max_weight. This avoids having to subtract 1 every time + // we index into the array. + n_weights_ = max_weight + 1; + max_value_.resize(n_weights_); + + int dimension = index(n_items, n_weights_); + if (dimension > kWarnDimension) { + LOG(WARNING) << "Knapsack problem " << n_items << "x" << n_weights_ + << " is large: may be inefficient!"; + } + item_taken_.resize(dimension); + n_items_ = n_items; + + // Clear + std::fill(max_value_.begin(), max_value_.end(), 0); + std::fill(item_taken_.begin(), item_taken_.end(), false); + best_solution_ = std::make_pair(0, 0); + + cur_item_idx_ = 0; +} + +template +void KnapsackSolver::KnapsackBlackboard::Advance(value_type new_val, int new_wt) { + // Use the dynamic programming formula: + // Define mv(i, j) as maximum value considering items 0..i-1 with knapsack weight j + // Then: + // if j - weight(i) >= 0, then: + // mv(i, j) = max(mv(i-1, j), mv(i-1, j-weight(i)) + value(j)) + // else mv(i, j) = mv(i-1, j) + // Since the recursive formula requires an access of j-weight(i), we go in reverse. 
+ for (int j = n_weights_ - 1; j >= new_wt ; --j) { + value_type val_if_taken = max_value_[j - new_wt] + new_val; + if (max_value_[j] < val_if_taken) { + max_value_[j] = val_if_taken; + MarkTaken(cur_item_idx_, j); + // Check if new solution found + if (best_solution_.second < val_if_taken) { + best_solution_ = std::make_pair(j, val_if_taken); + } + } + } + + cur_item_idx_++; +} + +} // namespace kudu +#endif diff --git a/src/kudu/util/locks.cc b/src/kudu/util/locks.cc new file mode 100644 index 000000000000..bcb72016d849 --- /dev/null +++ b/src/kudu/util/locks.cc @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/locks.h" + +#include "kudu/util/malloc.h" + +namespace kudu { + +size_t percpu_rwlock::memory_footprint_excluding_this() const { + // Because locks_ is a dynamic array of non-trivially-destructable types, + // the returned pointer from new[] isn't guaranteed to point at the start of + // a memory block, rendering it useless for malloc_usable_size(). + // + // Rather than replace locks_ with a vector or something equivalent, we'll + // just measure the memory footprint using sizeof(), with the understanding + // that we might be inaccurate due to malloc "slop". 
+ // + // See https://code.google.com/p/address-sanitizer/issues/detail?id=395 for + // more details. + return n_cpus_ * sizeof(padded_lock); +} + +size_t percpu_rwlock::memory_footprint_including_this() const { + return kudu_malloc_usable_size(this) + memory_footprint_excluding_this(); +} + +} // namespace kudu diff --git a/src/kudu/util/locks.h b/src/kudu/util/locks.h new file mode 100644 index 000000000000..28b4092bef81 --- /dev/null +++ b/src/kudu/util/locks.h @@ -0,0 +1,331 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_LOCKS_H +#define KUDU_UTIL_LOCKS_H + +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/spinlock.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/util/errno.h" +#include "kudu/util/rw_semaphore.h" + +namespace kudu { + +using base::subtle::Acquire_CompareAndSwap; +using base::subtle::NoBarrier_Load; +using base::subtle::Release_Store; + +// Wrapper around the Google SpinLock class to adapt it to the method names +// expected by Boost. 
+class simple_spinlock { + public: + simple_spinlock() {} + + void lock() { + l_.Lock(); + } + + void unlock() { + l_.Unlock(); + } + + bool try_lock() { + return l_.TryLock(); + } + + // Return whether the lock is currently held. + // + // This state can change at any instant, so this is only really useful + // for assertions where you expect to hold the lock. The success of + // such an assertion isn't a guarantee that the current thread is the + // holder, but the failure of such an assertion _is_ a guarantee that + // the current thread is _not_ holding the lock! + bool is_locked() { + return l_.IsHeld(); + } + + private: + base::SpinLock l_; + + DISALLOW_COPY_AND_ASSIGN(simple_spinlock); +}; + +struct padded_spinlock : public simple_spinlock { + char padding[CACHELINE_SIZE - (sizeof(simple_spinlock) % CACHELINE_SIZE)]; +}; + +// Reader-writer lock. +// This is functionally equivalent to rw_semaphore in rw_semaphore.h, but should be +// used whenever the lock is expected to only be acquired on a single thread. +// It adds TSAN annotations which will detect misuse of the lock, but those +// annotations also assume that the same thread the takes the lock will unlock it. +// +// See rw_semaphore.h for documentation on the individual methods where unclear. 
+class rw_spinlock { + public: + rw_spinlock() { + ANNOTATE_RWLOCK_CREATE(this); + } + ~rw_spinlock() { + ANNOTATE_RWLOCK_DESTROY(this); + } + + void lock_shared() { + sem_.lock_shared(); + ANNOTATE_RWLOCK_ACQUIRED(this, 0); + } + + void unlock_shared() { + ANNOTATE_RWLOCK_RELEASED(this, 0); + sem_.unlock_shared(); + } + + bool try_lock() { + bool ret = sem_.try_lock(); + if (ret) { + ANNOTATE_RWLOCK_ACQUIRED(this, 1); + } + return ret; + } + + void lock() { + sem_.lock(); + ANNOTATE_RWLOCK_ACQUIRED(this, 1); + } + + void unlock() { + ANNOTATE_RWLOCK_RELEASED(this, 1); + sem_.unlock(); + } + + bool is_write_locked() const { + return sem_.is_write_locked(); + } + + bool is_locked() const { + return sem_.is_locked(); + } + + private: + rw_semaphore sem_; +}; + +// A reader-writer lock implementation which is biased for use cases where +// the write lock is taken infrequently, but the read lock is used often. +// +// Internally, this creates N underlying mutexes, one per CPU. When a thread +// wants to lock in read (shared) mode, it locks only its own CPU's mutex. When it +// wants to lock in write (exclusive) mode, it locks all CPU's mutexes. +// +// This means that in the read-mostly case, different readers will not cause any +// cacheline contention. +// +// Usage: +// percpu_rwlock mylock; +// +// // Lock shared: +// { +// boost::shared_lock lock(mylock.get_lock()); +// ... +// } +// +// // Lock exclusive: +// +// { +// boost::lock_guard lock(mylock); +// ... +// } +class percpu_rwlock { + public: + percpu_rwlock() { + errno = 0; + n_cpus_ = base::MaxCPUIndex() + 1; + CHECK_EQ(errno, 0) << ErrnoToString(errno); + CHECK_GT(n_cpus_, 0); + locks_ = new padded_lock[n_cpus_]; + } + + ~percpu_rwlock() { + delete [] locks_; + } + + rw_spinlock &get_lock() { +#if defined(__APPLE__) + // OSX doesn't have a way to get the CPU, so we'll pick a random one. 
+ int cpu = reinterpret_cast(this) % n_cpus_; +#else + int cpu = sched_getcpu(); + CHECK_LT(cpu, n_cpus_); +#endif // defined(__APPLE__) + return locks_[cpu].lock; + } + + bool try_lock() { + for (int i = 0; i < n_cpus_; i++) { + if (!locks_[i].lock.try_lock()) { + while (i--) { + locks_[i].lock.unlock(); + } + return false; + } + } + return true; + } + + // Return true if this lock is held on any CPU. + // See simple_spinlock::is_locked() for details about where this is useful. + bool is_locked() const { + for (int i = 0; i < n_cpus_; i++) { + if (locks_[i].lock.is_locked()) return true; + } + return false; + } + + void lock() { + for (int i = 0; i < n_cpus_; i++) { + locks_[i].lock.lock(); + } + } + + void unlock() { + for (int i = 0; i < n_cpus_; i++) { + locks_[i].lock.unlock(); + } + } + + // Returns the memory usage of this object without the object itself. Should + // be used when embedded inside another object. + size_t memory_footprint_excluding_this() const; + + // Returns the memory usage of this object including the object itself. + // Should be used when allocated on the heap. + size_t memory_footprint_including_this() const; + + private: + struct padded_lock { + rw_spinlock lock; + char padding[CACHELINE_SIZE - (sizeof(rw_spinlock) % CACHELINE_SIZE)]; + }; + + int n_cpus_; + padded_lock *locks_; +}; + +// Simpler version of boost::lock_guard. Only supports the basic object +// lifecycle and defers any error checking to the underlying mutex. +template +class lock_guard { + public: + explicit lock_guard(Mutex* m) + : m_(DCHECK_NOTNULL(m)) { + m_->lock(); + } + + ~lock_guard() { + m_->unlock(); + } + + private: + Mutex* m_; + DISALLOW_COPY_AND_ASSIGN(lock_guard); +}; + +// Simpler version of boost::unique_lock. Tracks lock acquisition and will +// report attempts to double lock() or unlock(). 
+template +class unique_lock { + public: + unique_lock() + : locked_(false), + m_(NULL) { + } + + explicit unique_lock(Mutex* m) + : locked_(true), + m_(m) { + m_->lock(); + } + + ~unique_lock() { + if (locked_) { + m_->unlock(); + locked_ = false; + } + } + + void lock() { + DCHECK(!locked_); + m_->lock(); + locked_ = true; + } + + void unlock() { + DCHECK(locked_); + m_->unlock(); + locked_ = false; + } + + void swap(unique_lock* other) { + DCHECK(other != NULL) << "The passed unique_lock is null"; + std::swap(locked_, other->locked_); + std::swap(m_, other->m_); + } + + private: + bool locked_; + Mutex* m_; + DISALLOW_COPY_AND_ASSIGN(unique_lock); +}; + +// Simpler version of boost::shared_lock. Defers error checking to the +// underlying mutex. +template +class shared_lock { + public: + shared_lock() + : m_(NULL) { + } + + explicit shared_lock(Mutex* m) + : m_(DCHECK_NOTNULL(m)) { + m_->lock_shared(); + } + + void swap(shared_lock& other) { + std::swap(m_,other.m_); + } + + ~shared_lock() { + if (m_ != NULL) { + m_->unlock_shared(); + } + } + + private: + Mutex* m_; + DISALLOW_COPY_AND_ASSIGN(shared_lock); +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/logging-test.cc b/src/kudu/util/logging-test.cc new file mode 100644 index 000000000000..82ce90d62277 --- /dev/null +++ b/src/kudu/util/logging-test.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/util/logging_test_util.h" +#include "kudu/util/logging.h" +#include "kudu/util/monotime.h" + +using std::string; +using std::vector; + +namespace kudu { + +// Test the KLOG_EVERY_N_SECS(...) macro. +TEST(LoggingTest, TestThrottledLogging) { + StringVectorSink sink; + ScopedRegisterSink srs(&sink); + + for (int i = 0; i < 10000; i++) { + KLOG_EVERY_N_SECS(INFO, 1) << "test" << THROTTLE_MSG; + SleepFor(MonoDelta::FromMilliseconds(1)); + if (sink.logged_msgs().size() >= 2) break; + } + const vector& msgs = sink.logged_msgs(); + ASSERT_GE(msgs.size(), 2); + + // The first log line shouldn't have a suppression count. + EXPECT_THAT(msgs[0], testing::ContainsRegex("test$")); + // The second one should have suppressed at least three digits worth of log messages. + EXPECT_THAT(msgs[1], testing::ContainsRegex("\\[suppressed [0-9]{3,} similar messages\\]")); +} + +} // namespace kudu diff --git a/src/kudu/util/logging.cc b/src/kudu/util/logging.cc new file mode 100644 index 000000000000..8edd20a7abc7 --- /dev/null +++ b/src/kudu/util/logging.cc @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kudu/util/logging.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/callback.h" +#include "kudu/gutil/spinlock.h" +#include "kudu/util/flag_tags.h" + +DEFINE_string(log_filename, "", + "Prefix of log filename - " + "full path is /.[INFO|WARN|ERROR|FATAL]"); +TAG_FLAG(log_filename, stable); + +#define PROJ_NAME "kudu" + +bool logging_initialized = false; + +using namespace std; // NOLINT(*) +using namespace boost::uuids; // NOLINT(*) + +using base::SpinLock; +using base::SpinLockHolder; + +namespace kudu { + +namespace { + +class SimpleSink : public google::LogSink { + public: + explicit SimpleSink(LoggingCallback cb) : cb_(std::move(cb)) {} + + virtual ~SimpleSink() OVERRIDE { + } + + virtual void send(google::LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct ::tm* tm_time, + const char* message, size_t message_len) OVERRIDE { + LogSeverity kudu_severity; + switch 
(severity) { + case google::INFO: + kudu_severity = SEVERITY_INFO; + break; + case google::WARNING: + kudu_severity = SEVERITY_WARNING; + break; + case google::ERROR: + kudu_severity = SEVERITY_ERROR; + break; + case google::FATAL: + kudu_severity = SEVERITY_FATAL; + break; + default: + LOG(FATAL) << "Unknown glog severity: " << severity; + } + cb_.Run(kudu_severity, full_filename, line, tm_time, message, message_len); + } + + private: + + LoggingCallback cb_; +}; + +SpinLock logging_mutex(base::LINKER_INITIALIZED); + +// There can only be a single instance of a SimpleSink. +// +// Protected by 'logging_mutex'. +SimpleSink* registered_sink = nullptr; + +// Records the logging severity after the first call to +// InitGoogleLoggingSafe{Basic}. Calls to UnregisterLoggingCallback() +// will restore stderr logging back to this severity level. +// +// Protected by 'logging_mutex'. +int initial_stderr_severity; + +void UnregisterLoggingCallbackUnlocked() { + CHECK(logging_mutex.IsHeld()); + CHECK(registered_sink); + + // Restore logging to stderr, then remove our sink. This ordering ensures + // that no log messages are missed. + google::SetStderrLogging(initial_stderr_severity); + google::RemoveLogSink(registered_sink); + delete registered_sink; + registered_sink = nullptr; +} + +} // anonymous namespace + +void InitGoogleLoggingSafe(const char* arg) { + SpinLockHolder l(&logging_mutex); + if (logging_initialized) return; + + google::InstallFailureSignalHandler(); + + if (!FLAGS_log_filename.empty()) { + for (int severity = google::INFO; severity <= google::FATAL; ++severity) { + google::SetLogSymlink(severity, FLAGS_log_filename.c_str()); + } + } + + // This forces our logging to use /tmp rather than looking for a + // temporary directory if none is specified. This is done so that we + // can reliably construct the log file name without duplicating the + // complex logic that glog uses to guess at a temporary dir. 
+ if (FLAGS_log_dir.empty()) { + FLAGS_log_dir = "/tmp"; + } + + if (!FLAGS_logtostderr) { + // Verify that a log file can be created in log_dir by creating a tmp file. + stringstream ss; + random_generator uuid_generator; + ss << FLAGS_log_dir << "/" << PROJ_NAME "_test_log." << uuid_generator(); + const string file_name = ss.str(); + ofstream test_file(file_name.c_str()); + if (!test_file.is_open()) { + stringstream error_msg; + error_msg << "Could not open file in log_dir " << FLAGS_log_dir; + perror(error_msg.str().c_str()); + // Unlock the mutex before exiting the program to avoid mutex d'tor assert. + logging_mutex.Unlock(); + exit(1); + } + remove(file_name.c_str()); + } + + google::InitGoogleLogging(arg); + + // Needs to be done after InitGoogleLogging + if (FLAGS_log_filename.empty()) { + CHECK_STRNE(google::ProgramInvocationShortName(), "UNKNOWN") + << ": must initialize gflags before glog"; + FLAGS_log_filename = google::ProgramInvocationShortName(); + } + + // File logging: on. + // Stderr logging threshold: FLAGS_stderrthreshold. + // Sink logging: off. + initial_stderr_severity = FLAGS_stderrthreshold; + logging_initialized = true; +} + +void InitGoogleLoggingSafeBasic(const char* arg) { + SpinLockHolder l(&logging_mutex); + if (logging_initialized) return; + + google::InitGoogleLogging(arg); + + // This also disables file-based logging. + google::LogToStderr(); + + // File logging: off. + // Stderr logging threshold: INFO. + // Sink logging: off. + initial_stderr_severity = google::INFO; + logging_initialized = true; +} + +void RegisterLoggingCallback(const LoggingCallback& cb) { + SpinLockHolder l(&logging_mutex); + CHECK(logging_initialized); + + if (registered_sink) { + LOG(WARNING) << "Cannot register logging callback: one already registered"; + return; + } + + // AddLogSink() claims to take ownership of the sink, but it doesn't + // really; it actually expects it to remain valid until + // google::ShutdownGoogleLogging() is called. 
+ registered_sink = new SimpleSink(cb); + google::AddLogSink(registered_sink); + + // Even when stderr logging is ostensibly off, it's still emitting + // ERROR-level stuff. This is the default. + google::SetStderrLogging(google::ERROR); + + // File logging: yes, if InitGoogleLoggingSafe() was called earlier. + // Stderr logging threshold: ERROR. + // Sink logging: on. +} + +void UnregisterLoggingCallback() { + SpinLockHolder l(&logging_mutex); + CHECK(logging_initialized); + + if (!registered_sink) { + LOG(WARNING) << "Cannot unregister logging callback: none registered"; + return; + } + + UnregisterLoggingCallbackUnlocked(); + // File logging: yes, if InitGoogleLoggingSafe() was called earlier. + // Stderr logging threshold: initial_stderr_severity. + // Sink logging: off. +} + +void GetFullLogFilename(google::LogSeverity severity, string* filename) { + stringstream ss; + ss << FLAGS_log_dir << "/" << FLAGS_log_filename << "." + << google::GetLogSeverityName(severity); + *filename = ss.str(); +} + +void ShutdownLoggingSafe() { + SpinLockHolder l(&logging_mutex); + if (!logging_initialized) return; + + if (registered_sink) { + UnregisterLoggingCallbackUnlocked(); + } + + google::ShutdownGoogleLogging(); + + logging_initialized = false; +} + +void LogCommandLineFlags() { + LOG(INFO) << "Flags (see also /varz are on debug webserver):" << endl + << google::CommandlineFlagsIntoString(); +} + +// Support for the special THROTTLE_MSG token in a log message stream. 
+ostream& operator<<(ostream &os, const PRIVATE_ThrottleMsg&) { + using google::LogMessage; +#ifdef DISABLE_RTTI + LogMessage::LogStream *log = static_cast(&os); +#else + LogMessage::LogStream *log = dynamic_cast(&os); +#endif + CHECK(log && log == log->self()) + << "You must not use COUNTER with non-glog ostream"; + int ctr = log->ctr(); + if (ctr > 0) { + os << " [suppressed " << ctr << " similar messages]"; + } + return os; +} + +} // namespace kudu diff --git a/src/kudu/util/logging.h b/src/kudu/util/logging.h new file mode 100644 index 000000000000..553e92adda55 --- /dev/null +++ b/src/kudu/util/logging.h @@ -0,0 +1,235 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef KUDU_UTIL_LOGGING_H +#define KUDU_UTIL_LOGGING_H + +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/logging_callback.h" + +//////////////////////////////////////////////////////////////////////////////// +// Throttled logging support +//////////////////////////////////////////////////////////////////////////////// + +// Logs a message throttled to appear at most once every 'n_secs' seconds to +// the given severity. +// +// The log message may include the special token 'THROTTLE_MSG' which expands +// to either an empty string or '[suppressed similar messages]'. +// +// Example usage: +// KLOG_EVERY_N_SECS(WARNING, 1) << "server is low on memory" << THROTTLE_MSG; +#define KLOG_EVERY_N_SECS(severity, n_secs) \ + static logging_internal::LogThrottler LOG_THROTTLER; \ + int num_suppressed = 0; \ + if (LOG_THROTTLER.ShouldLog(n_secs, &num_suppressed)) \ + google::LogMessage( \ + __FILE__, __LINE__, google::GLOG_ ## severity, num_suppressed, \ + &google::LogMessage::SendToLog).stream() + +namespace kudu { +enum PRIVATE_ThrottleMsg {THROTTLE_MSG}; +} // namespace kudu + +//////////////////////////////////////////////////////////////////////////////// +// Versions of glog macros for "LOG_EVERY" and "LOG_FIRST" that annotate the +// benign races on their internal static variables. +//////////////////////////////////////////////////////////////////////////////// + +// The "base" macros. 
+#define KUDU_SOME_KIND_OF_LOG_EVERY_N(severity, n, what_to_do) \ + static int LOG_OCCURRENCES = 0, LOG_OCCURRENCES_MOD_N = 0; \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES, "Logging every N is approximate"); \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES_MOD_N, "Logging every N is approximate"); \ + ++LOG_OCCURRENCES; \ + if (++LOG_OCCURRENCES_MOD_N > n) LOG_OCCURRENCES_MOD_N -= n; \ + if (LOG_OCCURRENCES_MOD_N == 1) \ + google::LogMessage( \ + __FILE__, __LINE__, google::GLOG_ ## severity, LOG_OCCURRENCES, \ + &what_to_do).stream() + +#define KUDU_SOME_KIND_OF_LOG_IF_EVERY_N(severity, condition, n, what_to_do) \ + static int LOG_OCCURRENCES = 0, LOG_OCCURRENCES_MOD_N = 0; \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES, "Logging every N is approximate"); \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES_MOD_N, "Logging every N is approximate"); \ + ++LOG_OCCURRENCES; \ + if (condition && \ + ((LOG_OCCURRENCES_MOD_N=(LOG_OCCURRENCES_MOD_N + 1) % n) == (1 % n))) \ + google::LogMessage( \ + __FILE__, __LINE__, google::GLOG_ ## severity, LOG_OCCURRENCES, \ + &what_to_do).stream() + +#define KUDU_SOME_KIND_OF_PLOG_EVERY_N(severity, n, what_to_do) \ + static int LOG_OCCURRENCES = 0, LOG_OCCURRENCES_MOD_N = 0; \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES, "Logging every N is approximate"); \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES_MOD_N, "Logging every N is approximate"); \ + ++LOG_OCCURRENCES; \ + if (++LOG_OCCURRENCES_MOD_N > n) LOG_OCCURRENCES_MOD_N -= n; \ + if (LOG_OCCURRENCES_MOD_N == 1) \ + google::ErrnoLogMessage( \ + __FILE__, __LINE__, google::GLOG_ ## severity, LOG_OCCURRENCES, \ + &what_to_do).stream() + +#define KUDU_SOME_KIND_OF_LOG_FIRST_N(severity, n, what_to_do) \ + static uint64_t LOG_OCCURRENCES = 0; \ + ANNOTATE_BENIGN_RACE(&LOG_OCCURRENCES, "Logging the first N is approximate"); \ + if (LOG_OCCURRENCES++ < n) \ + google::LogMessage( \ + __FILE__, __LINE__, google::GLOG_ ## severity, LOG_OCCURRENCES, \ + &what_to_do).stream() + +// The direct user-facing macros. 
+#define KLOG_EVERY_N(severity, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(google::GLOG_ ## severity < \ + google::NUM_SEVERITIES, \ + INVALID_REQUESTED_LOG_SEVERITY); \ + KUDU_SOME_KIND_OF_LOG_EVERY_N(severity, (n), google::LogMessage::SendToLog) + +#define KSYSLOG_EVERY_N(severity, n) \ + KUDU_SOME_KIND_OF_LOG_EVERY_N(severity, (n), google::LogMessage::SendToSyslogAndLog) + +#define KPLOG_EVERY_N(severity, n) \ + KUDU_SOME_KIND_OF_PLOG_EVERY_N(severity, (n), google::LogMessage::SendToLog) + +#define KLOG_FIRST_N(severity, n) \ + KUDU_SOME_KIND_OF_LOG_FIRST_N(severity, (n), google::LogMessage::SendToLog) + +#define KLOG_IF_EVERY_N(severity, condition, n) \ + KUDU_SOME_KIND_OF_LOG_IF_EVERY_N(severity, (condition), (n), google::LogMessage::SendToLog) + +// We also disable the un-annotated glog macros for anyone who includes this header. +#undef LOG_EVERY_N +#define LOG_EVERY_N(severity, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(false, "LOG_EVERY_N is deprecated. Please use KLOG_EVERY_N.") + +#undef SYSLOG_EVERY_N +#define SYSLOG_EVERY_N(severity, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(false, "SYSLOG_EVERY_N is deprecated. Please use KSYSLOG_EVERY_N.") + +#undef PLOG_EVERY_N +#define PLOG_EVERY_N(severity, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(false, "PLOG_EVERY_N is deprecated. Please use KPLOG_EVERY_N.") + +#undef LOG_FIRST_N +#define LOG_FIRST_N(severity, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(false, "LOG_FIRST_N is deprecated. Please use KLOG_FIRST_N.") + +#undef LOG_IF_EVERY_N +#define LOG_IF_EVERY_N(severity, condition, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(false, "LOG_IF_EVERY_N is deprecated. Please use KLOG_IF_EVERY_N.") + + + + +namespace kudu { + +// glog doesn't allow multiple invocations of InitGoogleLogging. This method conditionally +// calls InitGoogleLogging only if it hasn't been called before. +// +// It also takes care of installing the google failure signal handler. 
+void InitGoogleLoggingSafe(const char* arg); + +// Like InitGoogleLoggingSafe() but stripped down: no signal handlers are +// installed, regular logging is disabled, and log events of any severity +// will be written to stderr. +// +// These properties make it attractive for us in libraries. +void InitGoogleLoggingSafeBasic(const char* arg); + +// Demotes stderr logging to ERROR or higher and registers 'cb' as the +// recipient for all log events. +// +// Subsequent calls to RegisterLoggingCallback no-op (until the callback +// is unregistered with UnregisterLoggingCallback()). +void RegisterLoggingCallback(const LoggingCallback& cb); + +// Unregisters a callback previously registered with +// RegisterLoggingCallback() and promotes stderr logging back to all +// severities. +// +// If no callback is registered, this is a no-op. +void UnregisterLoggingCallback(); + +// Returns the full pathname of the symlink to the most recent log +// file corresponding to this severity +void GetFullLogFilename(google::LogSeverity severity, std::string* filename); + +// Shuts down the google logging library. Call before exit to ensure that log files are +// flushed. +void ShutdownLoggingSafe(); + +// Writes all command-line flags to the log at level INFO. +void LogCommandLineFlags(); + +namespace logging_internal { +// Internal implementation class used for throttling log messages. 
+class LogThrottler { + public: + LogThrottler() : num_suppressed_(0), last_ts_(0) { + ANNOTATE_BENIGN_RACE(&last_ts_, "OK to be sloppy with log throttling"); + } + + bool ShouldLog(int n_secs, int* num_suppressed) { + MicrosecondsInt64 ts = GetMonoTimeMicros(); + if (ts - last_ts_ < n_secs * 1e6) { + *num_suppressed = base::subtle::NoBarrier_AtomicIncrement(&num_suppressed_, 1); + return false; + } + last_ts_ = ts; + *num_suppressed = base::subtle::NoBarrier_AtomicExchange(&num_suppressed_, 0); + return true; + } + private: + Atomic32 num_suppressed_; + uint64_t last_ts_; +}; +} // namespace logging_internal + +std::ostream& operator<<(std::ostream &os, const PRIVATE_ThrottleMsg&); + +// Convenience macros to prefix log messages with some prefix, these are the unlocked +// versions and should not obtain a lock (if one is required to obtain the prefix). +// There must be a LogPrefixUnlocked()/LogPrefixLocked() method available in the current +// scope in order to use these macros. +#define LOG_WITH_PREFIX_UNLOCKED(severity) LOG(severity) << LogPrefixUnlocked() +#define VLOG_WITH_PREFIX_UNLOCKED(verboselevel) LOG_IF(INFO, VLOG_IS_ON(verboselevel)) \ + << LogPrefixUnlocked() + +// Same as the above, but obtain the lock. +#define LOG_WITH_PREFIX(severity) LOG(severity) << LogPrefix() +#define VLOG_WITH_PREFIX(verboselevel) LOG_IF(INFO, VLOG_IS_ON(verboselevel)) \ + << LogPrefix() + +} // namespace kudu + +#endif // KUDU_UTIL_LOGGING_H diff --git a/src/kudu/util/logging_callback.h b/src/kudu/util/logging_callback.h new file mode 100644 index 000000000000..83fb9735d456 --- /dev/null +++ b/src/kudu/util/logging_callback.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_LOGGING_CALLBACK_H +#define KUDU_UTIL_LOGGING_CALLBACK_H + +#include +#include + +#include "kudu/gutil/callback_forward.h" + +namespace kudu { + +enum LogSeverity { + SEVERITY_INFO, + SEVERITY_WARNING, + SEVERITY_ERROR, + SEVERITY_FATAL +}; + +// Callback for simple logging. +// +// 'message' is NOT terminated with an endline. +typedef Callback LoggingCallback; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/logging_test_util.h b/src/kudu/util/logging_test_util.h new file mode 100644 index 000000000000..dfa348683f66 --- /dev/null +++ b/src/kudu/util/logging_test_util.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_LOGGING_TEST_UTIL_H +#define KUDU_LOGGING_TEST_UTIL_H + +#include +#include +#include + +namespace kudu { + +// GLog sink that keeps an internal buffer of messages that have been logged. +class StringVectorSink : public google::LogSink { + public: + void send(google::LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct ::tm* tm_time, + const char* message, size_t message_len) override { + logged_msgs_.push_back(ToString(severity, base_filename, line, + tm_time, message, message_len)); + } + + const std::vector& logged_msgs() const { + return logged_msgs_; + } + + private: + std::vector logged_msgs_; +}; + +// RAII wrapper around registering a LogSink with GLog. +struct ScopedRegisterSink { + explicit ScopedRegisterSink(google::LogSink* s) : s_(s) { + google::AddLogSink(s_); + } + ~ScopedRegisterSink() { + google::RemoveLogSink(s_); + } + + google::LogSink* s_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/malloc.cc b/src/kudu/util/malloc.cc new file mode 100644 index 000000000000..3fec2dbe058f --- /dev/null +++ b/src/kudu/util/malloc.cc @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/util/malloc.h" + +#if defined(__linux__) +#include +#else +#include +#endif // defined(__linux__) + +namespace kudu { + +int64_t kudu_malloc_usable_size(const void* obj) { +#if defined(__linux__) + return malloc_usable_size(const_cast(obj)); +#else + return malloc_size(obj); +#endif // defined(__linux__) +} + +} // namespace kudu diff --git a/src/kudu/util/malloc.h b/src/kudu/util/malloc.h new file mode 100644 index 000000000000..e8a27c561f17 --- /dev/null +++ b/src/kudu/util/malloc.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_MALLOC_H +#define KUDU_UTIL_MALLOC_H + +#include + +namespace kudu { + +// Simple wrapper for malloc_usable_size(). +// +// Really just centralizes the const_cast, as this function is often called +// on const pointers (i.e. "this" in a const method). 
+int64_t kudu_malloc_usable_size(const void* obj); + +} // namespace kudu + +#endif // KUDU_UTIL_MALLOC_H diff --git a/src/kudu/util/map-util-test.cc b/src/kudu/util/map-util-test.cc new file mode 100644 index 000000000000..4001f980b1bc --- /dev/null +++ b/src/kudu/util/map-util-test.cc @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This unit test belongs in gutil, but it depends on test_main which is +// part of util. 
+#include "kudu/gutil/map-util.h" + +#include +#include + +using std::map; + +namespace kudu { + +TEST(FloorTest, TestMapUtil) { + map my_map; + + ASSERT_EQ(nullptr, FindFloorOrNull(my_map, 5)); + + my_map[5] = 5; + ASSERT_EQ(5, *FindFloorOrNull(my_map, 6)); + ASSERT_EQ(5, *FindFloorOrNull(my_map, 5)); + ASSERT_EQ(nullptr, FindFloorOrNull(my_map, 4)); + + my_map[1] = 1; + ASSERT_EQ(5, *FindFloorOrNull(my_map, 6)); + ASSERT_EQ(5, *FindFloorOrNull(my_map, 5)); + ASSERT_EQ(1, *FindFloorOrNull(my_map, 4)); + ASSERT_EQ(1, *FindFloorOrNull(my_map, 1)); + ASSERT_EQ(nullptr, FindFloorOrNull(my_map, 0)); + +} + +} // namespace kudu diff --git a/src/kudu/util/mem_tracker-test.cc b/src/kudu/util/mem_tracker-test.cc new file mode 100644 index 000000000000..2a10bed86c0b --- /dev/null +++ b/src/kudu/util/mem_tracker-test.cc @@ -0,0 +1,340 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/mem_tracker.h" + +#include +#include +#include +#include + +#include +#include + +#include "kudu/util/test_util.h" + +DECLARE_int32(memory_limit_soft_percentage); + +namespace kudu { + +using std::equal_to; +using std::hash; +using std::pair; +using std::shared_ptr; +using std::string; +using std::unordered_map; +using std::vector; + +TEST(MemTrackerTest, SingleTrackerNoLimit) { + shared_ptr t = MemTracker::CreateTracker(-1, "t"); + EXPECT_FALSE(t->has_limit()); + t->Consume(10); + EXPECT_EQ(t->consumption(), 10); + t->Consume(10); + EXPECT_EQ(t->consumption(), 20); + t->Release(15); + EXPECT_EQ(t->consumption(), 5); + EXPECT_FALSE(t->LimitExceeded()); + t->Release(5); + EXPECT_EQ(t->consumption(), 0); +} + +TEST(MemTrackerTest, SingleTrackerWithLimit) { + shared_ptr t = MemTracker::CreateTracker(11, "t"); + EXPECT_TRUE(t->has_limit()); + t->Consume(10); + EXPECT_EQ(t->consumption(), 10); + EXPECT_FALSE(t->LimitExceeded()); + t->Consume(10); + EXPECT_EQ(t->consumption(), 20); + EXPECT_TRUE(t->LimitExceeded()); + t->Release(15); + EXPECT_EQ(t->consumption(), 5); + EXPECT_FALSE(t->LimitExceeded()); + t->Release(5); +} + +TEST(MemTrackerTest, TrackerHierarchy) { + shared_ptr p = MemTracker::CreateTracker(100, "p"); + shared_ptr c1 = MemTracker::CreateTracker(80, "c1", p); + shared_ptr c2 = MemTracker::CreateTracker(50, "c2", p); + + // everything below limits + c1->Consume(60); + EXPECT_EQ(c1->consumption(), 60); + EXPECT_FALSE(c1->LimitExceeded()); + EXPECT_FALSE(c1->AnyLimitExceeded()); + EXPECT_EQ(c2->consumption(), 0); + EXPECT_FALSE(c2->LimitExceeded()); + EXPECT_FALSE(c2->AnyLimitExceeded()); + EXPECT_EQ(p->consumption(), 60); + EXPECT_FALSE(p->LimitExceeded()); + EXPECT_FALSE(p->AnyLimitExceeded()); + + // p goes over limit + c2->Consume(50); + EXPECT_EQ(c1->consumption(), 60); + EXPECT_FALSE(c1->LimitExceeded()); + EXPECT_TRUE(c1->AnyLimitExceeded()); + EXPECT_EQ(c2->consumption(), 50); + EXPECT_FALSE(c2->LimitExceeded()); + 
EXPECT_TRUE(c2->AnyLimitExceeded()); + EXPECT_EQ(p->consumption(), 110); + EXPECT_TRUE(p->LimitExceeded()); + + // c2 goes over limit, p drops below limit + c1->Release(20); + c2->Consume(10); + EXPECT_EQ(c1->consumption(), 40); + EXPECT_FALSE(c1->LimitExceeded()); + EXPECT_FALSE(c1->AnyLimitExceeded()); + EXPECT_EQ(c2->consumption(), 60); + EXPECT_TRUE(c2->LimitExceeded()); + EXPECT_TRUE(c2->AnyLimitExceeded()); + EXPECT_EQ(p->consumption(), 100); + EXPECT_FALSE(p->LimitExceeded()); + c1->Release(40); + c2->Release(60); +} + +class GcFunctionHelper { + public: + static const int NUM_RELEASE_BYTES = 1; + + explicit GcFunctionHelper(MemTracker* tracker) : tracker_(tracker) { } + + void GcFunc() { tracker_->Release(NUM_RELEASE_BYTES); } + + private: + MemTracker* tracker_; +}; + +TEST(MemTrackerTest, GcFunctions) { + shared_ptr t = MemTracker::CreateTracker(10, ""); + ASSERT_TRUE(t->has_limit()); + + t->Consume(9); + EXPECT_FALSE(t->LimitExceeded()); + + // Test TryConsume() + EXPECT_FALSE(t->TryConsume(2)); + EXPECT_EQ(t->consumption(), 9); + EXPECT_FALSE(t->LimitExceeded()); + + // Attach GcFunction that releases 1 byte + GcFunctionHelper gc_func_helper(t.get()); + t->AddGcFunction(boost::bind(&GcFunctionHelper::GcFunc, &gc_func_helper)); + EXPECT_TRUE(t->TryConsume(2)); + EXPECT_EQ(t->consumption(), 10); + EXPECT_FALSE(t->LimitExceeded()); + + // GcFunction will be called even though TryConsume() fails + EXPECT_FALSE(t->TryConsume(2)); + EXPECT_EQ(t->consumption(), 9); + EXPECT_FALSE(t->LimitExceeded()); + + // GcFunction won't be called + EXPECT_TRUE(t->TryConsume(1)); + EXPECT_EQ(t->consumption(), 10); + EXPECT_FALSE(t->LimitExceeded()); + + // Test LimitExceeded() + t->Consume(1); + EXPECT_EQ(t->consumption(), 11); + EXPECT_FALSE(t->LimitExceeded()); + EXPECT_EQ(t->consumption(), 10); + + // Add more GcFunctions, test that we only call them until the limit is no longer + // exceeded + GcFunctionHelper gc_func_helper2(t.get()); + 
t->AddGcFunction(boost::bind(&GcFunctionHelper::GcFunc, &gc_func_helper2)); + GcFunctionHelper gc_func_helper3(t.get()); + t->AddGcFunction(boost::bind(&GcFunctionHelper::GcFunc, &gc_func_helper3)); + t->Consume(1); + EXPECT_EQ(t->consumption(), 11); + EXPECT_FALSE(t->LimitExceeded()); + EXPECT_EQ(t->consumption(), 10); + t->Release(10); +} + +TEST(MemTrackerTest, STLContainerAllocator) { + shared_ptr t = MemTracker::CreateTracker(-1, "t"); + MemTrackerAllocator vec_alloc(t); + MemTrackerAllocator> map_alloc(t); + + // Simple test: use the allocator in a vector. + { + vector > v(vec_alloc); + ASSERT_EQ(0, t->consumption()); + v.reserve(5); + ASSERT_EQ(5 * sizeof(int), t->consumption()); + v.reserve(10); + ASSERT_EQ(10 * sizeof(int), t->consumption()); + } + ASSERT_EQ(0, t->consumption()); + + // Complex test: use it in an unordered_map, where it must be rebound in + // order to allocate the map's buckets. + { + unordered_map, equal_to, MemTrackerAllocator>> um( + 10, + hash(), + equal_to(), + map_alloc); + + // Don't care about the value (it depends on map internals). + ASSERT_GT(t->consumption(), 0); + } + ASSERT_EQ(0, t->consumption()); +} + +TEST(MemTrackerTest, FindFunctionsTakeOwnership) { + // In each test, ToString() would crash if the MemTracker is destroyed when + // 'm' goes out of scope. 
+ + shared_ptr ref; + { + shared_ptr m = MemTracker::CreateTracker(-1, "test"); + ASSERT_TRUE(MemTracker::FindTracker(m->id(), &ref)); + } + LOG(INFO) << ref->ToString(); + ref.reset(); + + { + shared_ptr m = MemTracker::CreateTracker(-1, "test"); + ref = MemTracker::FindOrCreateTracker(-1, m->id()); + } + LOG(INFO) << ref->ToString(); + ref.reset(); + + vector > refs; + { + shared_ptr m = MemTracker::CreateTracker(-1, "test"); + MemTracker::ListTrackers(&refs); + } + for (const shared_ptr& r : refs) { + LOG(INFO) << r->ToString(); + } + refs.clear(); +} + +TEST(MemTrackerTest, ScopedTrackedConsumption) { + shared_ptr m = MemTracker::CreateTracker(-1, "test"); + ASSERT_EQ(0, m->consumption()); + { + ScopedTrackedConsumption consumption(m, 1); + ASSERT_EQ(1, m->consumption()); + + consumption.Reset(3); + ASSERT_EQ(3, m->consumption()); + } + ASSERT_EQ(0, m->consumption()); +} + +TEST(MemTrackerTest, SoftLimitExceeded) { + const int kNumIters = 100000; + const int kMemLimit = 1000; + google::FlagSaver saver; + FLAGS_memory_limit_soft_percentage = 0; + shared_ptr m = MemTracker::CreateTracker(kMemLimit, "test"); + + // Consumption is 0; the soft limit is never exceeded. + for (int i = 0; i < kNumIters; i++) { + ASSERT_FALSE(m->SoftLimitExceeded(nullptr)); + } + + // Consumption is half of the actual limit, so we expect to exceed the soft + // limit roughly half the time. + ScopedTrackedConsumption consumption(m, kMemLimit / 2); + int exceeded_count = 0; + for (int i = 0; i < kNumIters; i++) { + double current_percentage; + if (m->SoftLimitExceeded(¤t_percentage)) { + exceeded_count++; + ASSERT_NEAR(50, current_percentage, 0.1); + } + } + double exceeded_pct = static_cast(exceeded_count) / kNumIters * 100; + ASSERT_TRUE(exceeded_pct > 47 && exceeded_pct < 52); + + // Consumption is over the limit; the soft limit is always exceeded. 
+ consumption.Reset(kMemLimit + 1); + for (int i = 0; i < kNumIters; i++) { + double current_percentage; + ASSERT_TRUE(m->SoftLimitExceeded(¤t_percentage)); + ASSERT_NEAR(100, current_percentage, 0.1); + } +} + +#ifdef TCMALLOC_ENABLED +TEST(MemTrackerTest, TcMallocRootTracker) { + shared_ptr root = MemTracker::GetRootTracker(); + + // The root tracker's consumption and tcmalloc should agree. + size_t value; + root->UpdateConsumption(); + ASSERT_TRUE(MallocExtension::instance()->GetNumericProperty( + "generic.current_allocated_bytes", &value)); + ASSERT_EQ(value, root->consumption()); + + // Explicit Consume() and Release() have no effect. + root->Consume(100); + ASSERT_EQ(value, root->consumption()); + root->Release(3); + ASSERT_EQ(value, root->consumption()); + + // But if we allocate something really big, we should see a change. + gscoped_ptr big_alloc(new char[4*1024*1024]); + // clang in release mode can optimize out the above allocation unless + // we do something with the pointer... so we just log it. + VLOG(8) << static_cast(big_alloc.get()); + root->UpdateConsumption(); + ASSERT_GT(root->consumption(), value); +} +#endif + +TEST(MemTrackerTest, UnregisterFromParent) { + shared_ptr p = MemTracker::CreateTracker(-1, "parent"); + shared_ptr c = MemTracker::CreateTracker(-1, "child", p); + vector > all; + + // Three trackers: root, parent, and child. + MemTracker::ListTrackers(&all); + ASSERT_EQ(3, all.size()); + + c->UnregisterFromParent(); + + // Now only two because the child cannot be found from the root, though it is + // still alive. + MemTracker::ListTrackers(&all); + ASSERT_EQ(2, all.size()); + shared_ptr not_found; + ASSERT_FALSE(MemTracker::FindTracker("child", ¬_found, p)); + + // We can also recreate the child with the same name without colliding + // with the old one. + shared_ptr c2 = MemTracker::CreateTracker(-1, "child", p); + + // We should still able to walk up to the root from the unregistered child + // without crashing. 
+ LOG(INFO) << c->ToString(); + + // And this should no-op. + c->UnregisterFromParent(); +} + +} // namespace kudu diff --git a/src/kudu/util/mem_tracker.cc b/src/kudu/util/mem_tracker.cc new file mode 100644 index 000000000000..1f1334736c0c --- /dev/null +++ b/src/kudu/util/mem_tracker.cc @@ -0,0 +1,579 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/mem_tracker.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/once.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/env.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/mutex.h" +#include "kudu/util/random_util.h" +#include "kudu/util/status.h" + +DEFINE_int64(memory_limit_hard_bytes, 0, + "Maximum amount of memory this daemon should use, in bytes. " + "A value of 0 autosizes based on the total system memory. 
" + "A value of -1 disables all memory limiting."); +TAG_FLAG(memory_limit_hard_bytes, stable); + +DEFINE_int32(memory_limit_soft_percentage, 60, + "Percentage of the hard memory limit that this daemon may " + "consume before memory throttling of writes begins. The greater " + "the excess, the higher the chance of throttling. In general, a " + "lower soft limit leads to smoother write latencies but " + "decreased throughput, and vice versa for a higher soft limit."); +TAG_FLAG(memory_limit_soft_percentage, advanced); + +DEFINE_int32(memory_limit_warn_threshold_percentage, 98, + "Percentage of the hard memory limit that this daemon may " + "consume before WARNING level messages are periodically logged."); +TAG_FLAG(memory_limit_warn_threshold_percentage, advanced); + +#ifdef TCMALLOC_ENABLED +DEFINE_int32(tcmalloc_max_free_bytes_percentage, 10, + "Maximum percentage of the RSS that tcmalloc is allowed to use for " + "reserved but unallocated memory."); +TAG_FLAG(tcmalloc_max_free_bytes_percentage, advanced); +#endif + +namespace kudu { + +// NOTE: this class has been adapted from Impala, so the code style varies +// somewhat from kudu. + +using std::deque; +using std::list; +using std::string; +using std::stringstream; +using std::shared_ptr; +using std::vector; + +using strings::Substitute; + +// The ancestor for all trackers. Every tracker is visible from the root down. +static shared_ptr root_tracker; +static GoogleOnceType root_tracker_once = GOOGLE_ONCE_INIT; + +// Total amount of memory from calls to Release() since the last GC. If this +// is greater than GC_RELEASE_SIZE, this will trigger a tcmalloc gc. +static Atomic64 released_memory_since_gc; + +// Validate that various flags are percentages. 
+static bool ValidatePercentage(const char* flagname, int value) { + if (value >= 0 && value <= 100) { + return true; + } + LOG(ERROR) << Substitute("$0 must be a percentage, value $1 is invalid", + flagname, value); + return false; +} +static bool dummy[] = { + google::RegisterFlagValidator(&FLAGS_memory_limit_soft_percentage, &ValidatePercentage), + google::RegisterFlagValidator(&FLAGS_memory_limit_warn_threshold_percentage, &ValidatePercentage) +#ifdef TCMALLOC_ENABLED + ,google::RegisterFlagValidator(&FLAGS_tcmalloc_max_free_bytes_percentage, &ValidatePercentage) +#endif +}; + +#ifdef TCMALLOC_ENABLED +static int64_t GetTCMallocProperty(const char* prop) { + size_t value; + if (!MallocExtension::instance()->GetNumericProperty(prop, &value)) { + LOG(DFATAL) << "Failed to get tcmalloc property " << prop; + } + return value; +} + +static int64_t GetTCMallocCurrentAllocatedBytes() { + return GetTCMallocProperty("generic.current_allocated_bytes"); +} +#endif + +void MemTracker::CreateRootTracker() { + int64_t limit = FLAGS_memory_limit_hard_bytes; + if (limit == 0) { + // If no limit is provided, we'll use 80% of system RAM. + int64_t total_ram; + CHECK_OK(Env::Default()->GetTotalRAMBytes(&total_ram)); + limit = total_ram * 4; + limit /= 5; + } + + ConsumptionFunction f; +#ifdef TCMALLOC_ENABLED + f = &GetTCMallocCurrentAllocatedBytes; +#endif + root_tracker.reset(new MemTracker(f, limit, "root", + shared_ptr())); + root_tracker->Init(); + LOG(INFO) << StringPrintf("MemTracker: hard memory limit is %.6f GB", + (static_cast(limit) / (1024.0 * 1024.0 * 1024.0))); + LOG(INFO) << StringPrintf("MemTracker: soft memory limit is %.6f GB", + (static_cast(root_tracker->soft_limit_) / + (1024.0 * 1024.0 * 1024.0))); +} + +shared_ptr MemTracker::CreateTracker(int64_t byte_limit, + const string& id, + const shared_ptr& parent) { + shared_ptr real_parent = parent ? 
parent : GetRootTracker(); + MutexLock l(real_parent->child_trackers_lock_); + return CreateTrackerUnlocked(byte_limit, id, real_parent); +} + +shared_ptr MemTracker::CreateTrackerUnlocked(int64_t byte_limit, + const string& id, + const shared_ptr& parent) { + DCHECK(parent); + shared_ptr tracker(new MemTracker(ConsumptionFunction(), byte_limit, id, parent)); + parent->AddChildTrackerUnlocked(tracker.get()); + tracker->Init(); + + return tracker; +} + +MemTracker::MemTracker(ConsumptionFunction consumption_func, int64_t byte_limit, + const string& id, shared_ptr parent) + : limit_(byte_limit), + id_(id), + descr_(Substitute("memory consumption for $0", id)), + parent_(std::move(parent)), + consumption_(0), + consumption_func_(std::move(consumption_func)), + rand_(GetRandomSeed32()), + enable_logging_(false), + log_stack_(false) { + VLOG(1) << "Creating tracker " << ToString(); + if (consumption_func_) { + UpdateConsumption(); + } + soft_limit_ = (limit_ == -1) + ? -1 : (limit_ * FLAGS_memory_limit_soft_percentage) / 100; +} + +MemTracker::~MemTracker() { + VLOG(1) << "Destroying tracker " << ToString(); + if (parent_) { + DCHECK(consumption() == 0) << "Memory tracker " << ToString() + << " has unreleased consumption " << consumption(); + parent_->Release(consumption()); + UnregisterFromParent(); + } +} + +void MemTracker::UnregisterFromParent() { + DCHECK(parent_); + MutexLock l(parent_->child_trackers_lock_); + if (child_tracker_it_ != parent_->child_trackers_.end()) { + parent_->child_trackers_.erase(child_tracker_it_); + child_tracker_it_ = parent_->child_trackers_.end(); + } +} + +string MemTracker::ToString() const { + string s; + const MemTracker* tracker = this; + while (tracker) { + if (s != "") { + s += "->"; + } + s += tracker->id(); + tracker = tracker->parent_.get(); + } + return s; +} + +bool MemTracker::FindTracker(const string& id, + shared_ptr* tracker, + const shared_ptr& parent) { + shared_ptr real_parent = parent ? 
parent : GetRootTracker(); + MutexLock l(real_parent->child_trackers_lock_); + return FindTrackerUnlocked(id, tracker, real_parent); +} + +bool MemTracker::FindTrackerUnlocked(const string& id, + shared_ptr* tracker, + const shared_ptr& parent) { + DCHECK(parent != NULL); + parent->child_trackers_lock_.AssertAcquired(); + for (MemTracker* child : parent->child_trackers_) { + if (child->id() == id) { + *tracker = child->shared_from_this(); + return true; + } + } + return false; +} + +shared_ptr MemTracker::FindOrCreateTracker(int64_t byte_limit, + const string& id, + const shared_ptr& parent) { + shared_ptr real_parent = parent ? parent : GetRootTracker(); + MutexLock l(real_parent->child_trackers_lock_); + shared_ptr found; + if (FindTrackerUnlocked(id, &found, real_parent)) { + return found; + } + return CreateTrackerUnlocked(byte_limit, id, real_parent); +} + +void MemTracker::ListTrackers(vector>* trackers) { + trackers->clear(); + deque > to_process; + to_process.push_front(GetRootTracker()); + while (!to_process.empty()) { + shared_ptr t = to_process.back(); + to_process.pop_back(); + + trackers->push_back(t); + { + MutexLock l(t->child_trackers_lock_); + for (MemTracker* child : t->child_trackers_) { + to_process.push_back(child->shared_from_this()); + } + } + } +} + +void MemTracker::UpdateConsumption() { + DCHECK(!consumption_func_.empty()); + DCHECK(parent_.get() == NULL); + consumption_.set_value(consumption_func_()); +} + +void MemTracker::Consume(int64_t bytes) { + if (bytes < 0) { + Release(-bytes); + return; + } + + if (!consumption_func_.empty()) { + UpdateConsumption(); + return; + } + if (bytes == 0) { + return; + } + if (PREDICT_FALSE(enable_logging_)) { + LogUpdate(true, bytes); + } + for (auto& tracker : all_trackers_) { + tracker->consumption_.IncrementBy(bytes); + if (!tracker->consumption_func_.empty()) { + DCHECK_GE(tracker->consumption_.current_value(), 0); + } + } +} + +bool MemTracker::TryConsume(int64_t bytes) { + if 
(!consumption_func_.empty()) { + UpdateConsumption(); + } + if (bytes <= 0) { + return true; + } + if (PREDICT_FALSE(enable_logging_)) { + LogUpdate(true, bytes); + } + + int i = 0; + // Walk the tracker tree top-down, to avoid expanding a limit on a child whose parent + // won't accommodate the change. + for (i = all_trackers_.size() - 1; i >= 0; --i) { + MemTracker *tracker = all_trackers_[i]; + if (tracker->limit_ < 0) { + tracker->consumption_.IncrementBy(bytes); + } else { + if (!tracker->consumption_.TryIncrementBy(bytes, tracker->limit_)) { + // One of the trackers failed, attempt to GC memory or expand our limit. If that + // succeeds, TryUpdate() again. Bail if either fails. + if (!tracker->GcMemory(tracker->limit_ - bytes) || + tracker->ExpandLimit(bytes)) { + if (!tracker->consumption_.TryIncrementBy( + bytes, tracker->limit_)) { + break; + } + } else { + break; + } + } + } + } + // Everyone succeeded, return. + if (i == -1) { + return true; + } + + // Someone failed, roll back the ones that succeeded. + // TODO: this doesn't roll it back completely since the max values for + // the updated trackers aren't decremented. The max values are only used + // for error reporting so this is probably okay. Rolling those back is + // pretty hard; we'd need something like 2PC. + // + // TODO: This might leave us with an allocated resource that we can't use. Do we need + // to adjust the consumption of the query tracker to stop the resource from never + // getting used by a subsequent TryConsume()? 
+ for (int j = all_trackers_.size() - 1; j > i; --j) { + all_trackers_[j]->consumption_.IncrementBy(-bytes); + } + return false; +} + +void MemTracker::Release(int64_t bytes) { + if (bytes < 0) { + Consume(-bytes); + return; + } + + if (PREDICT_FALSE(base::subtle::Barrier_AtomicIncrement(&released_memory_since_gc, bytes) > + GC_RELEASE_SIZE)) { + GcTcmalloc(); + } + + if (!consumption_func_.empty()) { + UpdateConsumption(); + return; + } + + if (bytes == 0) { + return; + } + if (PREDICT_FALSE(enable_logging_)) { + LogUpdate(false, bytes); + } + + for (auto& tracker : all_trackers_) { + tracker->consumption_.IncrementBy(-bytes); + // If a UDF calls FunctionContext::TrackAllocation() but allocates less than the + // reported amount, the subsequent call to FunctionContext::Free() may cause the + // process mem tracker to go negative until it is synced back to the tcmalloc + // metric. Don't blow up in this case. (Note that this doesn't affect non-process + // trackers since we can enforce that the reported memory usage is internally + // consistent.) + if (!tracker->consumption_func_.empty()) { + DCHECK_GE(tracker->consumption_.current_value(), 0); + } + } +} + +bool MemTracker::AnyLimitExceeded() { + for (const auto& tracker : limit_trackers_) { + if (tracker->LimitExceeded()) { + return true; + } + } + return false; +} + +bool MemTracker::LimitExceeded() { + if (PREDICT_FALSE(CheckLimitExceeded())) { + return GcMemory(limit_); + } + return false; +} + +bool MemTracker::SoftLimitExceeded(double* current_capacity_pct) { + // Did we exceed the actual limit? + if (LimitExceeded()) { + if (current_capacity_pct) { + *current_capacity_pct = + static_cast(consumption()) / limit() * 100; + } + return true; + } + + // No soft limit defined. + if (!has_limit() || limit_ == soft_limit_) { + return false; + } + + // Are we under the soft limit threshold? 
+ int64_t usage = consumption(); + if (usage < soft_limit_) { + return false; + } + + // We're over the threshold; were we randomly chosen to be over the soft limit? + if (usage + rand_.Uniform64(limit_ - soft_limit_) > limit_) { + bool exceeded = GcMemory(soft_limit_); + if (exceeded && current_capacity_pct) { + *current_capacity_pct = + static_cast(consumption()) / limit() * 100; + } + return exceeded; + } + return false; +} + +bool MemTracker::AnySoftLimitExceeded(double* current_capacity_pct) { + for (MemTracker* t : limit_trackers_) { + if (t->SoftLimitExceeded(current_capacity_pct)) { + return true; + } + } + return false; +} + +int64_t MemTracker::SpareCapacity() const { + int64_t result = std::numeric_limits::max(); + for (const auto& tracker : limit_trackers_) { + int64_t mem_left = tracker->limit() - tracker->consumption(); + result = std::min(result, mem_left); + } + return result; +} + +bool MemTracker::GcMemory(int64_t max_consumption) { + if (max_consumption < 0) { + // Impossible to GC enough memory to reach the goal. + return true; + } + + lock_guard l(&gc_lock_); + if (!consumption_func_.empty()) { + UpdateConsumption(); + } + uint64_t pre_gc_consumption = consumption(); + // Check if someone gc'd before us + if (pre_gc_consumption < max_consumption) { + return false; + } + + // Try to free up some memory + for (const auto& gc_function : gc_functions_) { + gc_function(); + if (!consumption_func_.empty()) { + UpdateConsumption(); + } + if (consumption() <= max_consumption) { + break; + } + } + + return consumption() > max_consumption; +} + +void MemTracker::GcTcmalloc() { +#ifdef TCMALLOC_ENABLED + released_memory_since_gc = 0; + TRACE_EVENT0("process", "MemTracker::GcTcmalloc"); + + // Number of bytes in the 'NORMAL' free list (i.e reserved by tcmalloc but + // not in use). + int64_t bytes_overhead = GetTCMallocProperty("tcmalloc.pageheap_free_bytes"); + // Bytes allocated by the application. 
+ int64_t bytes_used = GetTCMallocCurrentAllocatedBytes(); + + int64_t max_overhead = bytes_used * FLAGS_tcmalloc_max_free_bytes_percentage / 100.0; + if (bytes_overhead > max_overhead) { + int64_t extra = bytes_overhead - max_overhead; + while (extra > 0) { + // Release 1MB at a time, so that tcmalloc releases its page heap lock + // allowing other threads to make progress. This still disrupts the current + // thread, but is better than disrupting all. + MallocExtension::instance()->ReleaseToSystem(1024 * 1024); + extra -= 1024 * 1024; + } + } + +#else + // Nothing to do if not using tcmalloc. +#endif +} + +string MemTracker::LogUsage(const string& prefix) const { + stringstream ss; + ss << prefix << id_ << ":"; + if (CheckLimitExceeded()) { + ss << " memory limit exceeded."; + } + if (limit_ > 0) { + ss << " Limit=" << HumanReadableNumBytes::ToString(limit_); + } + ss << " Consumption=" << HumanReadableNumBytes::ToString(consumption()); + + stringstream prefix_ss; + prefix_ss << prefix << " "; + string new_prefix = prefix_ss.str(); + MutexLock l(child_trackers_lock_); + if (!child_trackers_.empty()) { + ss << "\n" << LogUsage(new_prefix, child_trackers_); + } + return ss.str(); +} + +void MemTracker::Init() { + // populate all_trackers_ and limit_trackers_ + MemTracker* tracker = this; + while (tracker) { + all_trackers_.push_back(tracker); + if (tracker->has_limit()) limit_trackers_.push_back(tracker); + tracker = tracker->parent_.get(); + } + DCHECK_GT(all_trackers_.size(), 0); + DCHECK_EQ(all_trackers_[0], this); +} + +void MemTracker::AddChildTrackerUnlocked(MemTracker* tracker) { + child_trackers_lock_.AssertAcquired(); +#ifndef NDEBUG + shared_ptr found; + CHECK(!FindTrackerUnlocked(tracker->id(), &found, shared_from_this())) + << Substitute("Duplicate memory tracker (id $0) on parent $1", + tracker->id(), ToString()); +#endif + tracker->child_tracker_it_ = child_trackers_.insert(child_trackers_.end(), tracker); +} + +void MemTracker::LogUpdate(bool 
is_consume, int64_t bytes) const { + stringstream ss; + ss << this << " " << (is_consume ? "Consume: " : "Release: ") << bytes + << " Consumption: " << consumption() << " Limit: " << limit_; + if (log_stack_) { + ss << std::endl << GetStackTrace(); + } + LOG(ERROR) << ss.str(); +} + +string MemTracker::LogUsage(const string& prefix, + const list& trackers) { + vector usage_strings; + for (const MemTracker* child : trackers) { + usage_strings.push_back(child->LogUsage(prefix)); + } + return JoinStrings(usage_strings, "\n"); +} + +shared_ptr MemTracker::GetRootTracker() { + GoogleOnceInit(&root_tracker_once, &MemTracker::CreateRootTracker); + return root_tracker; +} + +} // namespace kudu diff --git a/src/kudu/util/mem_tracker.h b/src/kudu/util/mem_tracker.h new file mode 100644 index 000000000000..645f6e581df1 --- /dev/null +++ b/src/kudu/util/mem_tracker.h @@ -0,0 +1,420 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_MEM_TRACKER_H +#define KUDU_UTIL_MEM_TRACKER_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/high_water_mark.h" +#include "kudu/util/locks.h" +#include "kudu/util/mutex.h" +#include "kudu/util/random.h" + +namespace kudu { + +class Status; +class MemTracker; + +// A MemTracker tracks memory consumption; it contains an optional limit and is +// arranged into a tree structure such that the consumption tracked by a +// MemTracker is also tracked by its ancestors. +// +// The MemTracker hierarchy is rooted in a single static MemTracker whose limit +// is set via gflag. The root MemTracker always exists, and it is the common +// ancestor to all MemTrackers. All operations that discover MemTrackers begin +// at the root and work their way down the tree, while operations that deal +// with adjusting memory consumption begin at a particular MemTracker and work +// their way up the tree to the root. The tree structure is strictly enforced: +// all MemTrackers (except the root) must have a parent, and all children +// belonging to a parent must have unique ids. +// +// When a MemTracker begins its life, it has a strong reference to its parent +// and the parent has a weak reference to it. The strong reference remains for +// the lifetime of the MemTracker, but the weak reference can be dropped via +// UnregisterFromParent(). A MemTracker in this state may continue servicing +// memory consumption operations while allowing a new MemTracker with the same +// id to be created on the old parent. +// +// By default, memory consumption is tracked via calls to Consume()/Release(), either to +// the tracker itself or to one of its descendents. Alternatively, a consumption function +// can specified, and then the function's value is used as the consumption rather than the +// tally maintained by Consume() and Release(). 
A tcmalloc function is used to track process +// memory consumption, since the process memory usage may be higher than the computed +// total memory (tcmalloc does not release deallocated memory immediately). +// +// GcFunctions can be attached to a MemTracker in order to free up memory if the limit is +// reached. If LimitExceeded() is called and the limit is exceeded, it will first call the +// GcFunctions to try to free memory and recheck the limit. For example, the process +// tracker has a GcFunction that releases any unused memory still held by tcmalloc, so +// this will be called before the process limit is reported as exceeded. GcFunctions are +// called in the order they are added, so expensive functions should be added last. +// +// This class is thread-safe. +// +// NOTE: this class has been partially ported over from Impala with +// several changes, and as a result the style differs somewhat from +// the Kudu style. +// +// Changes from Impala: +// 1) Id a string vs. a TUniqueId +// 2) There is no concept of query trackers vs. pool trackers -- trackers are instead +// associated with objects. Parent hierarchy is preserved, with the assumption that, +// e.g., a tablet server's memtracker will have as its children the tablets' memtrackers, +// which in turn will have memtrackers for their caches, logs, and so forth. +// +// TODO: this classes uses a lot of statics fields and methods, which +// isn't common in Kudu. It is probably wise to later move the +// 'registry' of trackers to a separate class, but it's better to +// start using the 'class' *first* and then change this functionality, +// depending on how MemTracker ends up being used in Kudu. +class MemTracker : public std::enable_shared_from_this { + public: + + // Signature for function that can be called to free some memory after limit is reached. + typedef boost::function GcFunction; + + ~MemTracker(); + + // Removes this tracker from its parent's children. 
This tracker retains its + // link to its parent. Must be called on a tracker with a parent. + // + // Automatically called in the MemTracker destructor, but should be called + // explicitly when an object is destroyed if that object is also the "primary + // owner" of a tracker (i.e. the object that originally created the tracker). + // This orphans the tracker so that if the object is recreated, its new + // tracker won't collide with the now orphaned tracker. + // + // Is thread-safe on the parent but not the child. Meaning, multiple trackers + // that share the same parent can all UnregisterFromParent() at the same + // time, but all UnregisterFromParent() calls on a given tracker must be + // externally synchronized. + void UnregisterFromParent(); + + // Creates and adds the tracker to the tree so that it can be retrieved with + // FindTracker/FindOrCreateTracker. + // + // byte_limit < 0 means no limit; 'id' is a used as a label for LogUsage() + // and web UI and must be unique for the given parent. Use the two-argument + // form if there is no parent. + static std::shared_ptr CreateTracker( + int64_t byte_limit, + const std::string& id, + const std::shared_ptr& parent = std::shared_ptr()); + + // If a tracker with the specified 'id' and 'parent' exists in the tree, sets + // 'tracker' to reference that instance. Use the two-argument form if there + // is no parent. Returns false if no such tracker exists. + static bool FindTracker( + const std::string& id, + std::shared_ptr* tracker, + const std::shared_ptr& parent = std::shared_ptr()); + + // If a tracker with the specified 'id' and 'parent' exists in the tree, + // returns a shared_ptr to that instance. Otherwise, creates a new + // MemTracker with the specified byte_limit, id, and parent. Use the two + // argument form if there is no parent. 
+ static std::shared_ptr FindOrCreateTracker( + int64_t byte_limit, + const std::string& id, + const std::shared_ptr& parent = std::shared_ptr()); + + // Returns a list of all the valid trackers. + static void ListTrackers(std::vector >* trackers); + + // Gets a shared_ptr to the "root" tracker, creating it if necessary. + static std::shared_ptr GetRootTracker(); + + // Updates consumption from the consumption function specified in the constructor. + // NOTE: this method will crash if 'consumption_func_' is not set. + void UpdateConsumption(); + + // Increases consumption of this tracker and its ancestors by 'bytes'. + void Consume(int64_t bytes); + + // Try to expand the limit (by asking the resource broker for more memory) by at least + // 'bytes'. Returns false if not possible, true if the request succeeded. May allocate + // more memory than was requested. + // TODO: always returns false for now, not yet implemented. + bool ExpandLimit(int64_t /* unused: bytes */) { return false; } + + // Increases consumption of this tracker and its ancestors by 'bytes' only if + // they can all consume 'bytes'. If this brings any of them over, none of them + // are updated. + // Returns true if the try succeeded. + bool TryConsume(int64_t bytes); + + // Decreases consumption of this tracker and its ancestors by 'bytes'. + void Release(int64_t bytes); + + // Returns true if a valid limit of this tracker or one of its ancestors is + // exceeded. + bool AnyLimitExceeded(); + + // If this tracker has a limit, checks the limit and attempts to free up some memory if + // the limit is exceeded by calling any added GC functions. Returns true if the limit is + // exceeded after calling the GC functions. Returns false if there is no limit. + bool LimitExceeded(); + + // Like LimitExceeded() but may also return true if the soft memory limit is exceeded. + // The greater the excess, the higher the chance that it returns true. 
+ // + // If the soft limit is exceeded and 'current_capacity_pct' is not NULL, the percentage + // of the hard limit consumed is written to it. + bool SoftLimitExceeded(double* current_capacity_pct); + + // Combines the semantics of AnyLimitExceeded() and SoftLimitExceeded(). + // + // Note: if there's more than one soft limit defined, the probability of it being + // exceeded in at least one tracker is much higher (as each soft limit check is an + // independent event). + bool AnySoftLimitExceeded(double* current_capacity_pct); + + // Returns the maximum consumption that can be made without exceeding the limit on + // this tracker or any of its parents. Returns int64_t::max() if there are no + // limits and a negative value if any limit is already exceeded. + int64_t SpareCapacity() const; + + + int64_t limit() const { return limit_; } + bool has_limit() const { return limit_ >= 0; } + const std::string& id() const { return id_; } + + // Returns the memory consumed in bytes. + int64_t consumption() const { + return consumption_.current_value(); + } + + // Note that if consumption_ is based on consumption_func_, this + // will be the max value we've recorded in consumption(), not + // necessarily the highest value consumption_func_ has ever + // reached. + int64_t peak_consumption() const { return consumption_.max_value(); } + + // Retrieve the parent tracker, or NULL If one is not set. + std::shared_ptr parent() const { return parent_; } + + // Add a function 'f' to be called if the limit is reached. + // 'f' does not need to be thread-safe as long as it is added to only one MemTracker. + // Note that 'f' must be valid for the lifetime of this MemTracker. + void AddGcFunction(GcFunction f) { + gc_functions_.push_back(f); + } + + // Logs the usage of this tracker and all of its children (recursively). 
+ std::string LogUsage(const std::string& prefix = "") const; + + void EnableLogging(bool enable, bool log_stack) { + enable_logging_ = enable; + log_stack_ = log_stack; + } + + // Returns a textual representation of the tracker that is guaranteed to be + // globally unique. + std::string ToString() const; + + private: + // Function signatures for gauge-style memory trackers (where consumption is + // periodically observed rather than explicitly tracked). + // + // Currently only used by the root tracker. + typedef boost::function ConsumptionFunction; + + // If consumption_func is not empty, uses it as the consumption value. + // Consume()/Release() can still be called. + // byte_limit < 0 means no limit + // 'id' is the label for LogUsage() and web UI. + MemTracker(ConsumptionFunction consumption_func, int64_t byte_limit, + const std::string& id, std::shared_ptr parent); + + bool CheckLimitExceeded() const { + return limit_ >= 0 && limit_ < consumption(); + } + + // If consumption is higher than max_consumption, attempts to free memory by calling any + // added GC functions. Returns true if max_consumption is still exceeded. Takes + // gc_lock. Updates metrics if initialized. + bool GcMemory(int64_t max_consumption); + + // Called when the total release memory is larger than GC_RELEASE_SIZE. + // TcMalloc holds onto released memory and very slowly (if ever) releases it back to + // the OS. This is problematic since it is memory we are not constantly tracking which + // can cause us to go way over mem limits. + void GcTcmalloc(); + + // Further initializes the tracker. + void Init(); + + // Adds tracker to child_trackers_. + // + // child_trackers_lock_ must be held. + void AddChildTrackerUnlocked(MemTracker* tracker); + + // Logs the stack of the current consume/release. Used for debugging only. 
+ void LogUpdate(bool is_consume, int64_t bytes) const; + + static std::string LogUsage(const std::string& prefix, + const std::list& trackers); + + // Variant of CreateTracker() that: + // 1. Must be called with a non-NULL parent, and + // 2. Must be called with parent->child_trackers_lock_ held. + static std::shared_ptr CreateTrackerUnlocked( + int64_t byte_limit, + const std::string& id, + const std::shared_ptr& parent); + + // Variant of FindTracker() that: + // 1. Must be called with a non-NULL parent, and + // 2. Must be called with parent->child_trackers_lock_ held. + static bool FindTrackerUnlocked( + const std::string& id, + std::shared_ptr* tracker, + const std::shared_ptr& parent); + + // Creates the root tracker. + static void CreateRootTracker(); + + // Size, in bytes, that is considered a large value for Release() (or Consume() with + // a negative value). If tcmalloc is used, this can trigger it to GC. + // A higher value will make us call into tcmalloc less often (and therefore more + // efficient). A lower value will mean our memory overhead is lower. + // TODO: this is a stopgap. + static const int64_t GC_RELEASE_SIZE = 128 * 1024L * 1024L; + + simple_spinlock gc_lock_; + + int64_t limit_; + int64_t soft_limit_; + const std::string id_; + const std::string descr_; + std::shared_ptr parent_; + + HighWaterMark consumption_; + + ConsumptionFunction consumption_func_; + + // this tracker plus all of its ancestors + std::vector all_trackers_; + // all_trackers_ with valid limits + std::vector limit_trackers_; + + // All the child trackers of this tracker. Used for error reporting and + // listing only (i.e. updating the consumption of a parent tracker does not + // update that of its children). + mutable Mutex child_trackers_lock_; + std::list child_trackers_; + + // Iterator into parent_->child_trackers_ for this object. Stored to have O(1) + // remove. 
+ std::list::iterator child_tracker_it_; + + // Functions to call after the limit is reached to free memory. + std::vector gc_functions_; + + ThreadSafeRandom rand_; + + // If true, logs to INFO every consume/release called. Used for debugging. + bool enable_logging_; + + // If true, log the stack as well. + bool log_stack_; +}; + +// An std::allocator that manipulates a MemTracker during allocation +// and deallocation. +template > +class MemTrackerAllocator : public Alloc { + public: + typedef typename Alloc::pointer pointer; + typedef typename Alloc::const_pointer const_pointer; + typedef typename Alloc::size_type size_type; + + explicit MemTrackerAllocator(std::shared_ptr mem_tracker) + : mem_tracker_(std::move(mem_tracker)) {} + + // This constructor is used for rebinding. + template + MemTrackerAllocator(const MemTrackerAllocator& allocator) + : Alloc(allocator), + mem_tracker_(allocator.mem_tracker()) { + } + + ~MemTrackerAllocator() { + } + + pointer allocate(size_type n, const_pointer hint = 0) { + // Ideally we'd use TryConsume() here to enforce the tracker's limit. + // However, that means throwing bad_alloc if the limit is exceeded, and + // it's not clear that the rest of Kudu can handle that. + mem_tracker_->Consume(n * sizeof(T)); + return Alloc::allocate(n, hint); + } + + void deallocate(pointer p, size_type n) { + Alloc::deallocate(p, n); + mem_tracker_->Release(n * sizeof(T)); + } + + // This allows an allocator to be used for a different type. + template + struct rebind { + typedef MemTrackerAllocator::other> other; + }; + + const std::shared_ptr& mem_tracker() const { return mem_tracker_; } + + private: + std::shared_ptr mem_tracker_; +}; + +// Convenience class that adds memory consumption to a tracker when declared, +// releasing it when the end of scope is reached. 
+class ScopedTrackedConsumption { + public: + ScopedTrackedConsumption(std::shared_ptr tracker, + int64_t to_consume) + : tracker_(std::move(tracker)), consumption_(to_consume) { + DCHECK(tracker_); + tracker_->Consume(consumption_); + } + + void Reset(int64_t new_consumption) { + // Consume(-x) is the same as Release(x). + tracker_->Consume(new_consumption - consumption_); + consumption_ = new_consumption; + } + + ~ScopedTrackedConsumption() { + tracker_->Release(consumption_); + } + + int64_t consumption() const { return consumption_; } + + private: + std::shared_ptr tracker_; + int64_t consumption_; +}; + +} // namespace kudu + +#endif // KUDU_UTIL_MEM_TRACKER_H diff --git a/src/kudu/util/memcmpable_varint-test.cc b/src/kudu/util/memcmpable_varint-test.cc new file mode 100644 index 000000000000..3e5b5e082550 --- /dev/null +++ b/src/kudu/util/memcmpable_varint-test.cc @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/util/hexdump.h" +#include "kudu/util/memcmpable_varint.h" +#include "kudu/util/random.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +// Add operator<< to print pairs, used in a test below. 
+// This has to be done in the 'std' namespace due to the way that +// template resolution works. +namespace std { +template +ostream &operator <<(ostream &os, const pair &pair) { + return os << "(" << pair.first << ", " << pair.second << ")"; +} +} + +namespace kudu { + +class TestMemcmpableVarint : public KuduTest { + protected: + TestMemcmpableVarint() : random_(SeedRandom()) {} + + // Random number generator that generates different length integers + // with equal probability -- i.e it is equally as likely to generate + // a number with 8 bits as it is to generate one with 64 bits. + // This is useful for testing varint implementations, where a uniform + // random is skewed towards generating longer integers. + uint64_t Rand64WithRandomBitLength() { + return random_.Next64() >> random_.Uniform(64); + } + + Random random_; +}; + +static void DoRoundTripTest(uint64_t to_encode) { + static faststring buf; + buf.clear(); + PutMemcmpableVarint64(&buf, to_encode); + + uint64_t decoded; + Slice slice(buf); + bool success = GetMemcmpableVarint64(&slice, &decoded); + ASSERT_TRUE(success); + ASSERT_EQ(to_encode, decoded); + ASSERT_TRUE(slice.empty()); +} + + +TEST_F(TestMemcmpableVarint, TestRoundTrip) { + // Test the first 100K integers + // (exercises the special cases for <= 67823 in the code) + for (int i = 0; i < 100000; i++) { + DoRoundTripTest(i); + } + + // Test a bunch of random integers (which are likely to be many bytes) + for (int i = 0; i < 100000; i++) { + DoRoundTripTest(random_.Next64()); + } +} + + +// Test that a composite key can be made up of multiple memcmpable +// varints strung together, and that the resulting key compares the +// same as the original pair of integers (i.e left-to-right). 
+TEST_F(TestMemcmpableVarint, TestCompositeKeys) { + faststring buf1; + faststring buf2; + + const int n_trials = 1000; + + for (int i = 0; i < n_trials; i++) { + buf1.clear(); + buf2.clear(); + + pair p1 = + make_pair(Rand64WithRandomBitLength(), Rand64WithRandomBitLength()); + PutMemcmpableVarint64(&buf1, p1.first); + PutMemcmpableVarint64(&buf1, p1.second); + + pair p2 = + make_pair(Rand64WithRandomBitLength(), Rand64WithRandomBitLength()); + PutMemcmpableVarint64(&buf2, p2.first); + PutMemcmpableVarint64(&buf2, p2.second); + + SCOPED_TRACE(testing::Message() << p1 << "\n" << HexDump(Slice(buf1)) + << " vs\n" << p2 << "\n" << HexDump(Slice(buf2))); + if (p1 < p2) { + ASSERT_LT(Slice(buf1).compare(Slice(buf2)), 0); + } else if (p1 > p2) { + ASSERT_GT(Slice(buf1).compare(Slice(buf2)), 0); + } else { + ASSERT_EQ(Slice(buf1).compare(Slice(buf2)), 0); + } + } +} + +// Similar to the above test, but instead of being randomized, specifically +// tests "interesting" values -- i.e values around the boundaries of where +// the encoding changes its number of bytes. 
+TEST_F(TestMemcmpableVarint, TestInterestingCompositeKeys) { + vector interesting_values = { 0, 1, 240, // 1 byte + 241, 2000, 2287, // 2 bytes + 2288, 40000, 67823, // 3 bytes + 67824, 1ULL << 23, (1ULL << 24) - 1, // 4 bytes + 1ULL << 24, 1ULL << 30, (1ULL << 32) - 1 }; // 5 bytes + + faststring buf1; + faststring buf2; + + for (uint64_t v1 : interesting_values) { + for (uint64_t v2 : interesting_values) { + buf1.clear(); + pair p1 = make_pair(v1, v2); + PutMemcmpableVarint64(&buf1, p1.first); + PutMemcmpableVarint64(&buf1, p1.second); + + for (uint64_t v3 : interesting_values) { + for (uint64_t v4 : interesting_values) { + buf2.clear(); + pair p2 = make_pair(v3, v4); + PutMemcmpableVarint64(&buf2, p2.first); + PutMemcmpableVarint64(&buf2, p2.second); + + SCOPED_TRACE(testing::Message() << p1 << "\n" << HexDump(Slice(buf1)) + << " vs\n" << p2 << "\n" << HexDump(Slice(buf2))); + if (p1 < p2) { + ASSERT_LT(Slice(buf1).compare(Slice(buf2)), 0); + } else if (p1 > p2) { + ASSERT_GT(Slice(buf1).compare(Slice(buf2)), 0); + } else { + ASSERT_EQ(Slice(buf1).compare(Slice(buf2)), 0); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////// +// Benchmarks +//////////////////////////////////////////////////////////// + +#ifdef NDEBUG +TEST_F(TestMemcmpableVarint, BenchmarkEncode) { + faststring buf; + + int sum_sizes = 0; // need to do something with results to force evaluation + + LOG_TIMING(INFO, "Encoding integers") { + for (int trial = 0; trial < 100; trial++) { + for (uint64_t i = 0; i < 1000000; i++) { + buf.clear(); + PutMemcmpableVarint64(&buf, i); + sum_sizes += buf.size(); + } + } + } + ASSERT_GT(sum_sizes, 1); // use 'sum_sizes' to avoid optimizing it out. +} + +TEST_F(TestMemcmpableVarint, BenchmarkDecode) { + faststring buf; + + // Encode 1M integers into the buffer + for (uint64_t i = 0; i < 1000000; i++) { + PutMemcmpableVarint64(&buf, i); + } + + // Decode the whole buffer 100 times. 
+ LOG_TIMING(INFO, "Decoding integers") { + uint64_t sum_vals = 0; + for (int trial = 0; trial < 100; trial++) { + Slice s(buf); + while (!s.empty()) { + uint64_t decoded; + CHECK(GetMemcmpableVarint64(&s, &decoded)); + sum_vals += decoded; + } + } + ASSERT_GT(sum_vals, 1); // use 'sum_vals' to avoid optimizing it out. + } +} + +#endif + +} // namespace kudu diff --git a/src/kudu/util/memcmpable_varint.cc b/src/kudu/util/memcmpable_varint.cc new file mode 100644 index 000000000000..e55addf2b780 --- /dev/null +++ b/src/kudu/util/memcmpable_varint.cc @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This file contains code derived from sqlite4, distributed in the public domain. +// +// A variable length integer is an encoding of 64-bit unsigned integers +// into between 1 and 9 bytes. The encoding is designed so that small +// (and common) values take much less space that larger values. Additional +// properties: +// +// * The length of the varint can be determined after examining just +// the first byte of the encoding. +// +// * Varints compare in numerical order using memcmp(). 
//
//************************************************************************
//
// Treat each byte of the encoding as an unsigned integer between 0 and 255.
// Let the bytes of the encoding be called A0, A1, A2, ..., A8.
//
// DECODE
//
// If A0 is between 0 and 240 inclusive, then the result is the value of A0.
//
// If A0 is between 241 and 248 inclusive, then the result is
// 240+256*(A0-241)+A1.
//
// If A0 is 249 then the result is 2288+256*A1+A2.
//
// If A0 is 250 then the result is A1..A3 as a 3-byte big-endian integer.
//
// If A0 is 251 then the result is A1..A4 as a 4-byte big-endian integer.
//
// If A0 is 252 then the result is A1..A5 as a 5-byte big-endian integer.
//
// If A0 is 253 then the result is A1..A6 as a 6-byte big-endian integer.
//
// If A0 is 254 then the result is A1..A7 as a 7-byte big-endian integer.
//
// If A0 is 255 then the result is A1..A8 as an 8-byte big-endian integer.
//
// ENCODE
//
// Let the input value be V.
//
// If V<=240 then output a single byte A0 equal to V.
//
// If V<=2287 then output A0 as (V-240)/256 + 241 and A1 as (V-240)%256.
//
// If V<=67823 then output A0 as 249, A1 as (V-2288)/256, and A2
// as (V-2288)%256.
//
// If V<=16777215 then output A0 as 250 and A1 through A3 as a big-endian
// 3-byte integer.
//
// If V<=4294967295 then output A0 as 251 and A1..A4 as a big-endian
// 4-byte integer.
//
// If V<=1099511627775 then output A0 as 252 and A1..A5 as a big-endian
// 5-byte integer.
//
// If V<=281474976710655 then output A0 as 253 and A1..A6 as a big-endian
// 6-byte integer.
//
// If V<=72057594037927935 then output A0 as 254 and A1..A7 as a
// big-endian 7-byte integer.
//
// Otherwise output A0 as 255 and A1..A8 as a big-endian 8-byte integer.
+// +// SUMMARY +// +// Bytes Max Value Digits +// ------- --------- --------- +// 1 240 2.3 +// 2 2287 3.3 +// 3 67823 4.8 +// 4 2**24-1 7.2 +// 5 2**32-1 9.6 +// 6 2**40-1 12.0 +// 7 2**48-1 14.4 +// 8 2**56-1 16.8 +// 9 2**64-1 19.2 +// + +#include + +#include "kudu/gutil/endian.h" +#include "kudu/util/faststring.h" +#include "kudu/util/memcmpable_varint.h" +#include "kudu/util/slice.h" + +namespace kudu { + +//////////////////////////////////////////////////////////// +// Begin code ripped from sqlite4 +//////////////////////////////////////////////////////////// + +// This function is borrowed from sqlite4/varint.c +static void varintWrite32(uint8_t *z, uint32_t y) { + z[0] = (uint8_t)(y>>24); + z[1] = (uint8_t)(y>>16); + z[2] = (uint8_t)(y>>8); + z[3] = (uint8_t)(y); +} + + +// Write a varint into z[]. The buffer z[] must be at least 9 characters +// long to accommodate the largest possible varint. Return the number of +// bytes of z[] used. +// +// This function is borrowed from sqlite4/varint.c +static size_t sqlite4PutVarint64(uint8_t *z, uint64_t x) { + uint64_t w, y; + if (x <= 240) { + z[0] = (uint8_t)x; + return 1; + } + if (x <= 2287) { + y = (uint64_t)(x - 240); + z[0] = (uint8_t)(y/256 + 241); + z[1] = (uint8_t)(y%256); + return 2; + } + if (x <= 67823) { + y = (uint64_t)(x - 2288); + z[0] = 249; + z[1] = (uint8_t)(y/256); + z[2] = (uint8_t)(y%256); + return 3; + } + y = (uint64_t)x; + w = (uint64_t)(x>>32); + if (w == 0) { + if (y <= 16777215) { + z[0] = 250; + z[1] = (uint8_t)(y>>16); + z[2] = (uint8_t)(y>>8); + z[3] = (uint8_t)(y); + return 4; + } + z[0] = 251; + varintWrite32(z+1, y); + return 5; + } + if (w <= 255) { + z[0] = 252; + z[1] = (uint8_t)w; + varintWrite32(z+2, y); + return 6; + } + if (w <= 65535) { + z[0] = 253; + z[1] = (uint8_t)(w>>8); + z[2] = (uint8_t)w; + varintWrite32(z+3, y); + return 7; + } + if (w <= 16777215) { + z[0] = 254; + z[1] = (uint8_t)(w>>16); + z[2] = (uint8_t)(w>>8); + z[3] = (uint8_t)w; + varintWrite32(z+4, y); 
+ return 8; + } + z[0] = 255; + varintWrite32(z+1, w); + varintWrite32(z+5, y); + return 9; +} + +// Decode the varint in the first n bytes z[]. Write the integer value +// into *pResult and return the number of bytes in the varint. +// +// If the decode fails because there are not enough bytes in z[] then +// return 0; +// +// Borrowed from sqlite4 varint.c +static int sqlite4GetVarint64( + const uint8_t *z, + int n, + uint64_t *pResult) { + unsigned int x; + if ( n < 1) return 0; + if (z[0] <= 240) { + *pResult = z[0]; + return 1; + } + if (z[0] <= 248) { + if ( n < 2) return 0; + *pResult = (z[0]-241)*256 + z[1] + 240; + return 2; + } + if (n < z[0]-246 ) return 0; + if (z[0] == 249) { + *pResult = 2288 + 256*z[1] + z[2]; + return 3; + } + if (z[0] == 250) { + *pResult = (z[1]<<16) + (z[2]<<8) + z[3]; + return 4; + } + x = (z[1]<<24) + (z[2]<<16) + (z[3]<<8) + z[4]; + if (z[0] == 251) { + *pResult = x; + return 5; + } + if (z[0] == 252) { + *pResult = (((uint64_t)x)<<8) + z[5]; + return 6; + } + if (z[0] == 253) { + *pResult = (((uint64_t)x)<<16) + (z[5]<<8) + z[6]; + return 7; + } + if (z[0] == 254) { + *pResult = (((uint64_t)x)<<24) + (z[5]<<16) + (z[6]<<8) + z[7]; + return 8; + } + *pResult = (((uint64_t)x)<<32) + + (0xffffffff & ((z[5]<<24) + (z[6]<<16) + (z[7]<<8) + z[8])); + return 9; +} + +//////////////////////////////////////////////////////////// +// End code ripped from sqlite4 +//////////////////////////////////////////////////////////// + +void PutMemcmpableVarint64(faststring *dst, uint64_t value) { + uint8_t buf[9]; + int used = sqlite4PutVarint64(buf, value); + DCHECK_LE(used, sizeof(buf)); + dst->append(buf, used); +} + +bool GetMemcmpableVarint64(Slice *input, uint64_t *value) { + size_t size = sqlite4GetVarint64(input->data(), input->size(), value); + input->remove_prefix(size); + return size > 0; +} + + +} // namespace kudu diff --git a/src/kudu/util/memcmpable_varint.h b/src/kudu/util/memcmpable_varint.h new file mode 100644 index 
000000000000..c1ce0711792a --- /dev/null +++ b/src/kudu/util/memcmpable_varint.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// This is an alternate varint format, borrowed from sqlite4, that differs from the +// varint in util/coding.h in that its serialized form can be compared with memcmp(), +// yielding the same result as comparing the original integers. +// +// The serialized form also has the property that multiple such varints can be strung +// together to form a composite key, which itself is memcmpable. +// +// See memcmpable_varint.cc for further description. + +#ifndef KUDU_UTIL_MEMCMPABLE_VARINT_H +#define KUDU_UTIL_MEMCMPABLE_VARINT_H + +#include "kudu/util/faststring.h" +#include "kudu/util/slice.h" + +namespace kudu { + +void PutMemcmpableVarint64(faststring *dst, uint64_t value); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. 
+bool GetMemcmpableVarint64(Slice *input, uint64_t *value); + +} // namespace kudu + +#endif diff --git a/src/kudu/util/memenv/memenv-test.cc b/src/kudu/util/memenv/memenv-test.cc new file mode 100644 index 000000000000..28f34d5be3d7 --- /dev/null +++ b/src/kudu/util/memenv/memenv-test.cc @@ -0,0 +1,312 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Modified for kudu: +// - use gtest + +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/memenv/memenv.h" +#include "kudu/util/test_macros.h" + +using std::shared_ptr; +using std::string; +using std::unordered_set; +using std::vector; + +namespace kudu { + +class MemEnvTest : public ::testing::Test { + public: + Env* env_; + + MemEnvTest() + : env_(NewMemEnv(Env::Default())) { + } + ~MemEnvTest() { + delete env_; + } +}; + +TEST_F(MemEnvTest, Basics) { + uint64_t file_size; + gscoped_ptr writable_file; + vector children; + + // Create the directory. + ASSERT_FALSE(env_->FileExists("/dir")); + ASSERT_OK(env_->CreateDir("/dir")); + ASSERT_TRUE(env_->FileExists("/dir")); + + // Check that the directory is empty. + ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); + ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + writable_file.reset(); + + // Check that the file exists. + ASSERT_TRUE(env_->FileExists("/dir/f")); + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(0, file_size); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(1, children.size()); + ASSERT_EQ("f", children[0]); + + // Write to the file. 
+ ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + ASSERT_OK(writable_file->Append("abc")); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3, file_size); + + // Check that renaming works. + ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); + ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/f")); + ASSERT_TRUE(env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3, file_size); + + // Check that opening non-existent file fails. + gscoped_ptr seq_file; + gscoped_ptr rand_file; + ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. + ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); + ASSERT_OK(env_->DeleteFile("/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0, children.size()); + ASSERT_OK(env_->DeleteDir("/dir")); + ASSERT_FALSE(env_->FileExists("/dir")); +} + +TEST_F(MemEnvTest, ReadWrite) { + Slice result; + uint8_t scratch[100]; + + ASSERT_OK(env_->CreateDir("/dir")); + + { + gscoped_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + } + + { + // Read sequentially. + gscoped_ptr seq_file; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. 
+ ASSERT_EQ(0, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0, result.size()); + } + + { + // Random reads. + gscoped_ptr rand_file; + ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); + } +} + +TEST_F(MemEnvTest, Locks) { + FileLock* lock; + + // These are no-ops, but we test they return success. + ASSERT_OK(env_->LockFile("some file", &lock)); + ASSERT_OK(env_->UnlockFile(lock)); +} + +TEST_F(MemEnvTest, Misc) { + string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + ASSERT_TRUE(!test_dir.empty()); + + gscoped_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file)); + + // These are no-ops, but we test they return success. 
+ ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush(WritableFile::FLUSH_SYNC)); + ASSERT_OK(writable_file->Flush(WritableFile::FLUSH_ASYNC)); + ASSERT_OK(writable_file->Close()); +} + +TEST_F(MemEnvTest, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + gscoped_ptr scratch(new uint8_t[kWriteSize * 2]); + + string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + gscoped_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + writable_file.reset(); + + gscoped_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file)); + ASSERT_OK(seq_file->Read(3, &result, scratch.get())); // Read "foo". + ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch.get())); + read_data.append(reinterpret_cast(result.data()), + result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); +} + +TEST_F(MemEnvTest, Overwrite) { + // File does not exist, create it. + shared_ptr writer; + ASSERT_OK(env_util::OpenFileForWrite(env_, "some file", &writer)); + + // File exists, overwrite it. + ASSERT_OK(env_util::OpenFileForWrite(env_, "some file", &writer)); + + // File exists, try to overwrite (and fail). + WritableFileOptions opts; + opts.mode = Env::CREATE_NON_EXISTING; + Status s = env_util::OpenFileForWrite(opts, + env_, "some file", &writer); + ASSERT_TRUE(s.IsAlreadyPresent()); +} + +TEST_F(MemEnvTest, Reopen) { + string first = "The quick brown fox"; + string second = "jumps over the lazy dog"; + + // Create the file and write to it. 
+ shared_ptr writer; + ASSERT_OK(env_util::OpenFileForWrite(env_, "some file", &writer)); + ASSERT_OK(writer->Append(first)); + ASSERT_EQ(first.length(), writer->Size()); + ASSERT_OK(writer->Close()); + + // Reopen it and append to it. + WritableFileOptions reopen_opts; + reopen_opts.mode = Env::OPEN_EXISTING; + ASSERT_OK(env_util::OpenFileForWrite(reopen_opts, + env_, "some file", &writer)); + ASSERT_EQ(first.length(), writer->Size()); + ASSERT_OK(writer->Append(second)); + ASSERT_EQ(first.length() + second.length(), writer->Size()); + ASSERT_OK(writer->Close()); + + // Check that the file has both strings. + shared_ptr reader; + ASSERT_OK(env_util::OpenFileForRandom(env_, "some file", &reader)); + uint64_t size; + ASSERT_OK(reader->Size(&size)); + ASSERT_EQ(first.length() + second.length(), size); + Slice s; + uint8_t scratch[size]; + ASSERT_OK(env_util::ReadFully(reader.get(), 0, size, &s, scratch)); + ASSERT_EQ(first + second, s.ToString()); +} + +TEST_F(MemEnvTest, TempFile) { + string tmpl = "foo.XXXXXX"; + string bad_tmpl = "foo.YYY"; + + string path; + gscoped_ptr file; + + // Ensure we don't accept a bad template. + Status s = env_->NewTempWritableFile(WritableFileOptions(), bad_tmpl, &path, &file); + ASSERT_TRUE(s.IsInvalidArgument()) << "Should not accept bad template: " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "must end with the string XXXXXX"); + + // Create multiple temp files, ensure no collisions. 
+ unordered_set paths; + for (int i = 0; i < 10; i++) { + ASSERT_OK(env_->NewTempWritableFile(WritableFileOptions(), tmpl, &path, &file)); + VLOG(1) << "Created temporary file at path " << path; + ASSERT_EQ(path.length(), tmpl.length()) << "Template and final path should have same length"; + ASSERT_NE(path, tmpl) << "Template and final path should differ"; + ASSERT_OK(file->Append("Hello, tempfile.\n")); + ASSERT_OK(file->Close()); + ASSERT_FALSE(ContainsKey(paths, path)) << "Created " << path << " twice!"; + InsertOrDie(&paths, path); // Will crash if we have a duplicate. + } + + // Delete the files we created. + for (const string& p : paths) { + ASSERT_OK(env_->DeleteFile(p)); + } +} + +TEST_F(MemEnvTest, TestRWFile) { + // Create the file. + gscoped_ptr file; + ASSERT_OK(env_->NewRWFile("foo", &file)); + + // Append to it. + string kTestData = "abcdefghijklmno"; + ASSERT_OK(file->Write(0, kTestData)); + + // Read from it. + Slice result; + gscoped_ptr scratch(new uint8_t[kTestData.length()]); + ASSERT_OK(file->Read(0, kTestData.length(), &result, scratch.get())); + ASSERT_EQ(result, kTestData); + + // Try to rewrite; it shouldn't work. + ASSERT_TRUE(file->Write(0, kTestData).IsNotSupported()); + + // Make sure we can't overwrite it. + RWFileOptions opts; + opts.mode = Env::CREATE_NON_EXISTING; + ASSERT_TRUE(env_->NewRWFile(opts, "foo", &file).IsAlreadyPresent()); + + // Reopen it without truncating the existing data. + opts.mode = Env::OPEN_EXISTING; + ASSERT_OK(env_->NewRWFile(opts, "foo", &file)); + ASSERT_OK(file->Read(0, kTestData.length(), &result, scratch.get())); + ASSERT_EQ(result, kTestData); +} + +} // namespace kudu diff --git a/src/kudu/util/memenv/memenv.cc b/src/kudu/util/memenv/memenv.cc new file mode 100644 index 000000000000..92b200784af5 --- /dev/null +++ b/src/kudu/util/memenv/memenv.cc @@ -0,0 +1,616 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Modified for kudu: +// - use boost mutexes instead of port mutexes + +#include +#include +#include +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/env.h" +#include "kudu/util/malloc.h" +#include "kudu/util/mutex.h" +#include "kudu/util/memenv/memenv.h" +#include "kudu/util/random.h" +#include "kudu/util/status.h" + +namespace kudu { + +namespace { + +using std::string; +using std::vector; +using strings::Substitute; + +class FileState : public RefCountedThreadSafe { + public: + // FileStates are reference counted. The initial reference count is zero + // and the caller must call Ref() at least once. + explicit FileState(string filename) + : filename_(std::move(filename)), size_(0) {} + + uint64_t Size() const { return size_; } + + Status Read(uint64_t offset, size_t n, Slice* result, uint8_t* scratch) const { + if (offset > size_) { + return Status::IOError("Offset greater than file size."); + } + const uint64_t available = size_ - offset; + if (n > available) { + n = available; + } + if (n == 0) { + *result = Slice(); + return Status::OK(); + } + + size_t block = offset / kBlockSize; + size_t block_offset = offset % kBlockSize; + + if (n <= kBlockSize - block_offset) { + // The requested bytes are all in the first block. 
+ *result = Slice(blocks_[block] + block_offset, n); + return Status::OK(); + } + + size_t bytes_to_copy = n; + uint8_t* dst = scratch; + + while (bytes_to_copy > 0) { + size_t avail = kBlockSize - block_offset; + if (avail > bytes_to_copy) { + avail = bytes_to_copy; + } + memcpy(dst, blocks_[block] + block_offset, avail); + + bytes_to_copy -= avail; + dst += avail; + block++; + block_offset = 0; + } + + *result = Slice(scratch, n); + return Status::OK(); + } + + Status PreAllocate(uint64_t size) { + auto padding = new uint8_t[size]; + // TODO optimize me + memset(&padding, 0, sizeof(uint8_t)); + Status s = AppendRaw(padding, size); + delete [] padding; + size_ -= size; + return s; + } + + Status Append(const Slice& data) { + return AppendRaw(data.data(), data.size()); + } + + Status AppendRaw(const uint8_t *src, size_t src_len) { + while (src_len > 0) { + size_t avail; + size_t offset = size_ % kBlockSize; + + if (offset != 0) { + // There is some room in the last block. + avail = kBlockSize - offset; + } else { + // No room in the last block; push new one. + blocks_.push_back(new uint8_t[kBlockSize]); + avail = kBlockSize; + } + + if (avail > src_len) { + avail = src_len; + } + memcpy(blocks_.back() + offset, src, avail); + src_len -= avail; + src += avail; + size_ += avail; + } + + return Status::OK(); + } + + const string& filename() const { return filename_; } + + size_t memory_footprint() const { + size_t size = kudu_malloc_usable_size(this); + if (blocks_.capacity() > 0) { + size += kudu_malloc_usable_size(blocks_.data()); + } + for (uint8_t* block : blocks_) { + size += kudu_malloc_usable_size(block); + } + size += filename_.capacity(); + return size; + } + + private: + friend class RefCountedThreadSafe; + + enum { kBlockSize = 8 * 1024 }; + + // Private since only Release() should be used to delete it. 
+ ~FileState() { + for (uint8_t* block : blocks_) { + delete[] block; + } + } + + const string filename_; + + // The following fields are not protected by any mutex. They are only mutable + // while the file is being written, and concurrent access is not allowed + // to writable files. + uint64_t size_; + vector blocks_; + + DISALLOW_COPY_AND_ASSIGN(FileState); +}; + +class SequentialFileImpl : public SequentialFile { + public: + explicit SequentialFileImpl(const scoped_refptr& file) + : file_(file), + pos_(0) { + } + + ~SequentialFileImpl() { + } + + virtual Status Read(size_t n, Slice* result, uint8_t* scratch) OVERRIDE { + Status s = file_->Read(pos_, n, result, scratch); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + virtual Status Skip(uint64_t n) OVERRIDE { + if (pos_ > file_->Size()) { + return Status::IOError("pos_ > file_->Size()"); + } + const size_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += n; + return Status::OK(); + } + + virtual const string& filename() const OVERRIDE { + return file_->filename(); + } + + private: + const scoped_refptr file_; + size_t pos_; +}; + +class RandomAccessFileImpl : public RandomAccessFile { + public: + explicit RandomAccessFileImpl(const scoped_refptr& file) + : file_(file) { + } + + ~RandomAccessFileImpl() { + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + uint8_t* scratch) const OVERRIDE { + return file_->Read(offset, n, result, scratch); + } + + virtual Status Size(uint64_t *size) const OVERRIDE { + *size = file_->Size(); + return Status::OK(); + } + + virtual const string& filename() const OVERRIDE { + return file_->filename(); + } + + virtual size_t memory_footprint() const OVERRIDE { + // The FileState is actually shared between multiple files, but the double + // counting doesn't matter much since MemEnv is only used in tests. 
+ return kudu_malloc_usable_size(this) + file_->memory_footprint(); + } + + private: + const scoped_refptr file_; +}; + +class WritableFileImpl : public WritableFile { + public: + explicit WritableFileImpl(const scoped_refptr& file) + : file_(file) { + } + + ~WritableFileImpl() { + } + + virtual Status PreAllocate(uint64_t size) OVERRIDE { + return file_->PreAllocate(size); + } + + virtual Status Append(const Slice& data) OVERRIDE { + return file_->Append(data); + } + + // This is a dummy implementation that simply serially appends all + // slices using regular I/O. + virtual Status AppendVector(const vector& data_vector) OVERRIDE { + for (const Slice& data : data_vector) { + RETURN_NOT_OK(file_->Append(data)); + } + return Status::OK(); + } + + virtual Status Close() OVERRIDE { return Status::OK(); } + + virtual Status Flush(FlushMode mode) OVERRIDE { return Status::OK(); } + + virtual Status Sync() OVERRIDE { return Status::OK(); } + + virtual uint64_t Size() const OVERRIDE { return file_->Size(); } + + virtual const string& filename() const OVERRIDE { + return file_->filename(); + } + + private: + const scoped_refptr file_; +}; + +class RWFileImpl : public RWFile { + public: + explicit RWFileImpl(const scoped_refptr& file) + : file_(file) { + } + + ~RWFileImpl() { + } + + virtual Status Read(uint64_t offset, size_t length, + Slice* result, uint8_t* scratch) const OVERRIDE { + return file_->Read(offset, length, result, scratch); + } + + virtual Status Write(uint64_t offset, const Slice& data) OVERRIDE { + uint64_t file_size = file_->Size(); + // TODO: Modify FileState to allow rewriting. + if (offset < file_size) { + return Status::NotSupported( + "In-memory RW file does not support random writing"); + } else if (offset > file_size) { + // Fill in the space between with zeroes. 
+ uint8_t zeroes[offset - file_size]; + memset(zeroes, 0, sizeof(zeroes)); + Slice s(zeroes, sizeof(zeroes)); + RETURN_NOT_OK(file_->Append(s)); + } + return file_->Append(data); + } + + virtual Status PreAllocate(uint64_t offset, size_t length) OVERRIDE { + return Status::OK(); + } + + virtual Status PunchHole(uint64_t offset, size_t length) OVERRIDE { + return Status::OK(); + } + + virtual Status Flush(FlushMode mode, uint64_t offset, size_t length) OVERRIDE { + return Status::OK(); + } + + virtual Status Sync() OVERRIDE { + return Status::OK(); + } + + virtual Status Close() OVERRIDE { + return Status::OK(); + } + + virtual Status Size(uint64_t* size) const OVERRIDE { + *size = file_->Size(); + return Status::OK(); + } + + virtual const string& filename() const OVERRIDE { + return file_->filename(); + } + + private: + const scoped_refptr file_; +}; + +class InMemoryEnv : public EnvWrapper { + public: + explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { } + + virtual ~InMemoryEnv() { + } + + // Partial implementation of the Env interface. 
+ virtual Status NewSequentialFile(const std::string& fname, + gscoped_ptr* result) OVERRIDE { + MutexLock lock(mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + result->reset(new SequentialFileImpl(file_map_[fname])); + return Status::OK(); + } + + virtual Status NewRandomAccessFile(const std::string& fname, + gscoped_ptr* result) OVERRIDE { + return NewRandomAccessFile(RandomAccessFileOptions(), fname, result); + } + + virtual Status NewRandomAccessFile(const RandomAccessFileOptions& opts, + const std::string& fname, + gscoped_ptr* result) OVERRIDE { + MutexLock lock(mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + result->reset(new RandomAccessFileImpl(file_map_[fname])); + return Status::OK(); + } + + virtual Status NewWritableFile(const WritableFileOptions& opts, + const std::string& fname, + gscoped_ptr* result) OVERRIDE { + gscoped_ptr wf; + RETURN_NOT_OK(CreateAndRegisterNewFile(fname, opts.mode, &wf)); + result->reset(wf.release()); + return Status::OK(); + } + + virtual Status NewWritableFile(const std::string& fname, + gscoped_ptr* result) OVERRIDE { + return NewWritableFile(WritableFileOptions(), fname, result); + } + + virtual Status NewRWFile(const RWFileOptions& opts, + const string& fname, + gscoped_ptr* result) OVERRIDE { + gscoped_ptr rwf; + RETURN_NOT_OK(CreateAndRegisterNewFile(fname, opts.mode, &rwf)); + result->reset(rwf.release()); + return Status::OK(); + } + + virtual Status NewRWFile(const string& fname, + gscoped_ptr* result) OVERRIDE { + return NewRWFile(RWFileOptions(), fname, result); + } + + virtual Status NewTempWritableFile(const WritableFileOptions& opts, + const std::string& name_template, + std::string* created_filename, + gscoped_ptr* result) OVERRIDE { + // Not very random, but InMemoryEnv is basically a test env. 
+ Random random(GetCurrentTimeMicros()); + while (true) { + string stripped; + if (!TryStripSuffixString(name_template, "XXXXXX", &stripped)) { + return Status::InvalidArgument("Name template must end with the string XXXXXX", + name_template); + } + uint32_t num = random.Next() % 999999; // Ensure it's <= 6 digits long. + string path = StringPrintf("%s%06u", stripped.c_str(), num); + + MutexLock lock(mutex_); + if (!ContainsKey(file_map_, path)) { + CreateAndRegisterNewWritableFileUnlocked(path, result); + *created_filename = path; + return Status::OK(); + } + } + // Unreachable. + } + + virtual bool FileExists(const std::string& fname) OVERRIDE { + MutexLock lock(mutex_); + return file_map_.find(fname) != file_map_.end(); + } + + virtual Status GetChildren(const std::string& dir, + vector* result) OVERRIDE { + MutexLock lock(mutex_); + result->clear(); + + for (const auto& file : file_map_) { + const std::string& filename = file.first; + + if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' && + Slice(filename).starts_with(Slice(dir))) { + result->push_back(filename.substr(dir.size() + 1)); + } + } + + return Status::OK(); + } + + virtual Status DeleteFile(const std::string& fname) OVERRIDE { + MutexLock lock(mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + DeleteFileInternal(fname); + return Status::OK(); + } + + virtual Status CreateDir(const std::string& dirname) OVERRIDE { + gscoped_ptr file; + return NewWritableFile(dirname, &file); + } + + virtual Status DeleteDir(const std::string& dirname) OVERRIDE { + return DeleteFile(dirname); + } + + virtual Status SyncDir(const std::string& dirname) OVERRIDE { + return Status::OK(); + } + + virtual Status DeleteRecursively(const std::string& dirname) OVERRIDE { + CHECK(!dirname.empty()); + string dir(dirname); + if (dir[dir.size() - 1] != '/') { + dir.push_back('/'); + } + + MutexLock lock(mutex_); + + for (auto i = 
file_map_.begin(); i != file_map_.end();) { + const std::string& filename = i->first; + + if (filename.size() >= dir.size() && Slice(filename).starts_with(Slice(dir))) { + file_map_.erase(i++); + } else { + ++i; + } + } + + return Status::OK(); + } + + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) OVERRIDE { + MutexLock lock(mutex_); + if (file_map_.find(fname) == file_map_.end()) { + return Status::IOError(fname, "File not found"); + } + + *file_size = file_map_[fname]->Size(); + return Status::OK(); + } + + virtual Status GetFileSizeOnDisk(const std::string& fname, uint64_t* file_size) OVERRIDE { + return GetFileSize(fname, file_size); + } + + virtual Status GetBlockSize(const string& fname, uint64_t* block_size) OVERRIDE { + // The default for ext3/ext4 filesystems. + *block_size = 4096; + return Status::OK(); + } + + virtual Status RenameFile(const std::string& src, + const std::string& target) OVERRIDE { + MutexLock lock(mutex_); + if (file_map_.find(src) == file_map_.end()) { + return Status::IOError(src, "File not found"); + } + + DeleteFileInternal(target); + file_map_[target] = file_map_[src]; + file_map_.erase(src); + return Status::OK(); + } + + virtual Status LockFile(const std::string& fname, FileLock** lock) OVERRIDE { + *lock = new FileLock; + return Status::OK(); + } + + virtual Status UnlockFile(FileLock* lock) OVERRIDE { + delete lock; + return Status::OK(); + } + + virtual Status GetTestDirectory(std::string* path) OVERRIDE { + *path = "/test"; + return Status::OK(); + } + + virtual Status Walk(const std::string& root, + DirectoryOrder order, + const WalkCallback& cb) OVERRIDE { + LOG(FATAL) << "Not implemented"; + } + + virtual Status Canonicalize(const string& path, string* result) OVERRIDE { + *result = path; + return Status::OK(); + } + + virtual Status GetTotalRAMBytes(int64_t* ram) OVERRIDE { + LOG(FATAL) << "Not implemented"; + } + + private: + void DeleteFileInternal(const std::string& fname) { + if 
(!ContainsKey(file_map_, fname)) { + return; + } + file_map_.erase(fname); + } + + // Create new internal representation of a writable file. + template + void CreateAndRegisterNewWritableFileUnlocked(const string& path, + gscoped_ptr* result) { + file_map_[path] = make_scoped_refptr(new FileState(path)); + result->reset(new ImplType(file_map_[path])); + } + + // Create new internal representation of a file. + template + Status CreateAndRegisterNewFile(const string& fname, + CreateMode mode, + gscoped_ptr* result) { + MutexLock lock(mutex_); + if (ContainsKey(file_map_, fname)) { + switch (mode) { + case CREATE_IF_NON_EXISTING_TRUNCATE: + DeleteFileInternal(fname); + break; // creates a new file below + case CREATE_NON_EXISTING: + return Status::AlreadyPresent(fname, "File already exists"); + case OPEN_EXISTING: + result->reset(new Type(file_map_[fname])); + return Status::OK(); + default: + return Status::NotSupported(Substitute("Unknown create mode $0", + mode)); + } + } else if (mode == OPEN_EXISTING) { + return Status::IOError(fname, "File not found"); + } + + CreateAndRegisterNewWritableFileUnlocked(fname, result); + return Status::OK(); + } + + // Map from filenames to FileState objects, representing a simple file system. + typedef std::map > FileSystem; + Mutex mutex_; + FileSystem file_map_; // Protected by mutex_. +}; + +} // namespace + +Env* NewMemEnv(Env* base_env) { + return new InMemoryEnv(base_env); +} + +} // namespace kudu diff --git a/src/kudu/util/memenv/memenv.h b/src/kudu/util/memenv/memenv.h new file mode 100644 index 000000000000..014855247702 --- /dev/null +++ b/src/kudu/util/memenv/memenv.h @@ -0,0 +1,20 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_ +#define STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_ + +namespace kudu { + +class Env; + +// Returns a new environment that stores its data in memory and delegates +// all non-file-storage tasks to base_env. The caller must delete the result +// when it is no longer needed. +// *base_env must remain live while the result is in use. +Env* NewMemEnv(Env* base_env); + +} // namespace kudu + +#endif // STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_ diff --git a/src/kudu/util/memory/arena-test.cc b/src/kudu/util/memory/arena-test.cc new file mode 100644 index 000000000000..a14e1f2e345e --- /dev/null +++ b/src/kudu/util/memory/arena-test.cc @@ -0,0 +1,190 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/memory/arena.h" +#include "kudu/util/memory/memory.h" +#include "kudu/util/mem_tracker.h" + +DEFINE_int32(num_threads, 16, "Number of threads to test"); +DEFINE_int32(allocs_per_thread, 10000, "Number of allocations each thread should do"); +DEFINE_int32(alloc_size, 4, "number of bytes in each allocation"); + +namespace kudu { + +using std::shared_ptr; + +template +static void AllocateThread(ArenaType *arena, uint8_t thread_index) { + std::vector ptrs; + ptrs.reserve(FLAGS_allocs_per_thread); + + char buf[FLAGS_alloc_size]; + memset(buf, thread_index, FLAGS_alloc_size); + + for (int i = 0; i < FLAGS_allocs_per_thread; i++) { + void *alloced = arena->AllocateBytes(FLAGS_alloc_size); + CHECK(alloced); + memcpy(alloced, buf, FLAGS_alloc_size); + ptrs.push_back(alloced); + } + + for (void *p : ptrs) { + if (memcmp(buf, p, FLAGS_alloc_size) != 0) { + FAIL() << StringPrintf("overwritten pointer at %p", p); + } + } +} + +// Non-templated function to forward to above -- simplifies +// boost::thread creation +static void AllocateThreadTSArena(ThreadSafeArena *arena, uint8_t thread_index) { + AllocateThread(arena, thread_index); +} + + +TEST(TestArena, TestSingleThreaded) { + Arena arena(128, 128); + + AllocateThread(&arena, 0); +} + + + +TEST(TestArena, TestMultiThreaded) { + CHECK(FLAGS_num_threads < 256); + + ThreadSafeArena arena(1024, 1024); + + boost::ptr_vector threads; + for (uint8_t i = 0; i < FLAGS_num_threads; i++) { + threads.push_back(new boost::thread(AllocateThreadTSArena, &arena, (uint8_t)i)); + } + + for (boost::thread &thr : threads) { + thr.join(); + } +} + +TEST(TestArena, TestAlignment) { + + ThreadSafeArena arena(1024, 1024); + for (int i = 0; i < 1000; i++) { + int alignment = 1 << (1 % 5); + + void *ret = arena.AllocateBytesAligned(5, alignment); + ASSERT_EQ(0, (uintptr_t)(ret) % alignment) << + "failed to align on " << 
alignment << "b boundary: " << + ret; + } +} + +// MemTrackers update their ancestors when consuming and releasing memory to compute +// usage totals. However, the lifetimes of parent and child trackers can be different. +// Validate that child trackers can still correctly update their parent stats even when +// the parents go out of scope. +TEST(TestArena, TestMemoryTrackerParentReferences) { + // Set up a parent and child MemTracker. + const string parent_id = "parent-id"; + const string child_id = "child-id"; + shared_ptr child_tracker; + { + shared_ptr parent_tracker = MemTracker::CreateTracker(1024, parent_id); + child_tracker = MemTracker::CreateTracker(-1, child_id, parent_tracker); + // Parent falls out of scope here. Should still be owned by the child. + } + shared_ptr allocator( + new MemoryTrackingBufferAllocator(HeapBufferAllocator::Get(), child_tracker)); + MemoryTrackingArena arena(256, 1024, allocator); + + // Try some child operations. + ASSERT_EQ(256, child_tracker->consumption()); + void *allocated = arena.AllocateBytes(256); + ASSERT_TRUE(allocated); + ASSERT_EQ(256, child_tracker->consumption()); + allocated = arena.AllocateBytes(256); + ASSERT_TRUE(allocated); + ASSERT_EQ(768, child_tracker->consumption()); +} + +TEST(TestArena, TestMemoryTrackingDontEnforce) { + shared_ptr mem_tracker = MemTracker::CreateTracker(1024, "arena-test-tracker"); + shared_ptr allocator( + new MemoryTrackingBufferAllocator(HeapBufferAllocator::Get(), mem_tracker)); + MemoryTrackingArena arena(256, 1024, allocator); + ASSERT_EQ(256, mem_tracker->consumption()); + void *allocated = arena.AllocateBytes(256); + ASSERT_TRUE(allocated); + ASSERT_EQ(256, mem_tracker->consumption()); + allocated = arena.AllocateBytes(256); + ASSERT_TRUE(allocated); + ASSERT_EQ(768, mem_tracker->consumption()); + + // In DEBUG mode after Reset() the last component of an arena is + // cleared, but is then created again; in release mode, the last + // component is not cleared. 
In either case, after Reset() + // consumption() should equal the size of the last component which + // is 512 bytes. + arena.Reset(); + ASSERT_EQ(512, mem_tracker->consumption()); + + // Allocate beyond allowed consumption. This should still go + // through, since enforce_limit is false. + allocated = arena.AllocateBytes(1024); + ASSERT_TRUE(allocated); + + ASSERT_EQ(1536, mem_tracker->consumption()); +} + +TEST(TestArena, TestMemoryTrackingEnforced) { + shared_ptr mem_tracker = MemTracker::CreateTracker(1024, "arena-test-tracker"); + shared_ptr allocator( + new MemoryTrackingBufferAllocator(HeapBufferAllocator::Get(), mem_tracker, + // enforce limit + true)); + MemoryTrackingArena arena(256, 1024, allocator); + ASSERT_EQ(256, mem_tracker->consumption()); + void *allocated = arena.AllocateBytes(256); + ASSERT_TRUE(allocated); + ASSERT_EQ(256, mem_tracker->consumption()); + allocated = arena.AllocateBytes(1024); + ASSERT_FALSE(allocated); + ASSERT_EQ(256, mem_tracker->consumption()); +} + +TEST(TestArena, TestSTLAllocator) { + Arena a(256, 256 * 1024); + typedef vector > ArenaVector; + ArenaAllocator alloc(&a); + ArenaVector v(alloc); + for (int i = 0; i < 10000; i++) { + v.push_back(i); + } + for (int i = 0; i < 10000; i++) { + ASSERT_EQ(i, v[i]); + } +} + +} // namespace kudu diff --git a/src/kudu/util/memory/arena.cc b/src/kudu/util/memory/arena.cc new file mode 100644 index 000000000000..822a4f6b18c4 --- /dev/null +++ b/src/kudu/util/memory/arena.cc @@ -0,0 +1,173 @@ +// Copyright 2010 Google Inc. All Rights Reserved +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// + +#include "kudu/util/memory/arena.h" + +#include + +#include "kudu/util/debug-util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" + +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::shared_ptr; +using std::sort; +using std::swap; + +DEFINE_int64(arena_warn_threshold_bytes, 256*1024*1024, + "Number of bytes beyond which to emit a warning for a large arena"); +TAG_FLAG(arena_warn_threshold_bytes, hidden); + +namespace kudu { + +template +ArenaBase::ArenaBase( + BufferAllocator* const buffer_allocator, + size_t initial_buffer_size, + size_t max_buffer_size) + : buffer_allocator_(buffer_allocator), + max_buffer_size_(max_buffer_size), + arena_footprint_(0), + warned_(false) { + AddComponent(CHECK_NOTNULL(NewComponent(initial_buffer_size, 0))); +} + +template +ArenaBase::ArenaBase(size_t initial_buffer_size, size_t max_buffer_size) + : buffer_allocator_(HeapBufferAllocator::Get()), + max_buffer_size_(max_buffer_size), + arena_footprint_(0), + warned_(false) { + AddComponent(CHECK_NOTNULL(NewComponent(initial_buffer_size, 0))); +} + +template +void *ArenaBase::AllocateBytesFallback(const size_t size, const size_t align) { + lock_guard lock(&component_lock_); + + // It's possible another thread raced with us and already allocated + // a new component, in which case we should try the "fast path" again + Component* cur = AcquireLoadCurrent(); + void * result = cur->AllocateBytesAligned(size, align); + if (PREDICT_FALSE(result != nullptr)) return result; + + // Really need to allocate more space. 
+ size_t next_component_size = min(2 * cur->size(), max_buffer_size_); + // But, allocate enough, even if the request is large. In this case, + // might violate the max_element_size bound. + if (next_component_size < size) { + next_component_size = size; + } + // If soft quota is exhausted we will only get the "minimal" amount of memory + // we ask for. In this case if we always use "size" as minimal, we may degrade + // to allocating a lot of tiny components, one for each string added to the + // arena. This would be very inefficient, so let's first try something between + // "size" and "next_component_size". If it fails due to hard quota being + // exhausted, we'll fall back to using "size" as minimal. + size_t minimal = (size + next_component_size) / 2; + CHECK_LE(size, minimal); + CHECK_LE(minimal, next_component_size); + // Now, just make sure we can actually get the memory. + Component* component = NewComponent(next_component_size, minimal); + if (component == nullptr) { + component = NewComponent(next_component_size, size); + } + if (!component) return nullptr; + + // Now, must succeed. The component has at least 'size' bytes. + result = component->AllocateBytesAligned(size, align); + CHECK(result != nullptr); + + // Now add it to the arena. + AddComponent(component); + + return result; +} + +template +typename ArenaBase::Component* ArenaBase::NewComponent( + size_t requested_size, + size_t minimum_size) { + Buffer* buffer = buffer_allocator_->BestEffortAllocate(requested_size, + minimum_size); + if (buffer == nullptr) return nullptr; + + CHECK_EQ(reinterpret_cast(buffer->data()) & (16 - 1), 0) + << "Components should be 16-byte aligned: " << buffer->data(); + + ASAN_POISON_MEMORY_REGION(buffer->data(), buffer->size()); + + return new Component(buffer); +} + +// LOCKING: component_lock_ must be held by the current thread. 
+template +void ArenaBase::AddComponent(ArenaBase::Component *component) { + ReleaseStoreCurrent(component); + arena_.push_back(shared_ptr(component)); + arena_footprint_ += component->size(); + if (PREDICT_FALSE(arena_footprint_ > FLAGS_arena_warn_threshold_bytes) && !warned_) { + LOG(WARNING) << "Arena " << reinterpret_cast(this) + << " footprint (" << arena_footprint_ << " bytes) exceeded warning threshold (" + << FLAGS_arena_warn_threshold_bytes << " bytes)\n" + << GetStackTrace(); + warned_ = true; + } +} + +template +void ArenaBase::Reset() { + lock_guard lock(&component_lock_); + + if (PREDICT_FALSE(arena_.size() > 1)) { + shared_ptr last = arena_.back(); + arena_.clear(); + arena_.push_back(last); + ReleaseStoreCurrent(last.get()); + } + arena_.back()->Reset(); + arena_footprint_ = arena_.back()->size(); + warned_ = false; + +#ifndef NDEBUG + // In debug mode release the last component too for (hopefully) better + // detection of memory-related bugs (invalid shallow copies, etc.). + size_t last_size = arena_.back()->size(); + arena_.clear(); + AddComponent(CHECK_NOTNULL(NewComponent(last_size, 0))); + arena_footprint_ = 0; +#endif +} + +template +size_t ArenaBase::memory_footprint() const { + lock_guard lock(&component_lock_); + return arena_footprint_; +} + +// Explicit instantiation. +template class ArenaBase; +template class ArenaBase; + + +} // namespace kudu diff --git a/src/kudu/util/memory/arena.h b/src/kudu/util/memory/arena.h new file mode 100644 index 000000000000..69c36361851c --- /dev/null +++ b/src/kudu/util/memory/arena.h @@ -0,0 +1,495 @@ +// Copyright 2010 Google Inc. All Rights Reserved +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// +// Memory arena for variable-length datatypes and STL collections. + +#ifndef KUDU_UTIL_MEMORY_ARENA_H_ +#define KUDU_UTIL_MEMORY_ARENA_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/alignment.h" +#include "kudu/util/locks.h" +#include "kudu/util/memory/memory.h" +#include "kudu/util/slice.h" + +using std::allocator; + +namespace kudu { + +template struct ArenaTraits; + +template <> struct ArenaTraits { + typedef Atomic32 offset_type; + typedef Mutex mutex_type; + typedef simple_spinlock spinlock_type; +}; + +template <> struct ArenaTraits { + typedef uint32_t offset_type; + // For non-threadsafe, we don't need any real locking. + typedef boost::signals2::dummy_mutex mutex_type; + typedef boost::signals2::dummy_mutex spinlock_type; +}; + +// A helper class for storing variable-length blobs (e.g. strings). Once a blob +// is added to the arena, its index stays fixed. No reallocation happens. +// Instead, the arena keeps a list of buffers. When it needs to grow, it +// allocates a new buffer. Each subsequent buffer is 2x larger, than its +// predecessor, until the maximum specified buffer size is reached. +// The buffers are furnished by a designated allocator. 
+// +// This class is thread-safe with the fast path lock-free. +template +class ArenaBase { + public: + // Creates a new arena, with a single buffer of size up-to + // initial_buffer_size, upper size limit for later-allocated buffers capped + // at max_buffer_size, and maximum capacity (i.e. total sizes of all buffers) + // possibly limited by the buffer allocator. The allocator might cap the + // initial allocation request arbitrarily (down to zero). As a consequence, + // arena construction never fails due to OOM. + // + // Calls to AllocateBytes() will then give out bytes from the working buffer + // until it is exhausted. Then, a subsequent working buffer will be allocated. + // The size of the next buffer is normally 2x the size of the previous buffer. + // It might be capped by the allocator, or by the max_buffer_size parameter. + ArenaBase(BufferAllocator* const buffer_allocator, + size_t initial_buffer_size, + size_t max_buffer_size); + + // Creates an arena using a default (heap) allocator with unbounded capacity. + // Discretion advised. + ArenaBase(size_t initial_buffer_size, size_t max_buffer_size); + + // Adds content of the specified Slice to the arena, and returns a + // pointer to it. The pointer is guaranteed to remain valid during the + // lifetime of the arena. The Slice object itself is not copied. The + // size information is not stored. + // (Normal use case is that the caller already has an array of Slices, + // where it keeps these pointers together with size information). + // If this request would make the arena grow and the allocator denies that, + // returns NULL and leaves the arena unchanged. + uint8_t *AddSlice(const Slice& value); + + // Same as above. 
+ void * AddBytes(const void *data, size_t len); + + // Handy wrapper for placement-new + template + T *NewObject(); + + // Handy wrapper for placement-new + template + T *NewObject(A1 arg1); + + // Handy wrapper for placement-new + template + T *NewObject(A1 arg1, A2 arg2); + + // Handy wrapper for placement-new + template + T *NewObject(A1 arg1, A2 arg2, A3 arg3); + + // Relocate the given Slice into the arena, setting 'dst' and + // returning true if successful. + // It is legal for 'dst' to be a pointer to 'src'. + // See AddSlice above for detail on memory lifetime. + bool RelocateSlice(const Slice &src, Slice *dst); + + // Reserves a blob of the specified size in the arena, and returns a pointer + // to it. The caller can then fill the allocated memory. The pointer is + // guaranteed to remain valid during the lifetime of the arena. + // If this request would make the arena grow and the allocator denies that, + // returns NULL and leaves the arena unchanged. + void* AllocateBytes(const size_t size) { + return AllocateBytesAligned(size, 1); + } + + // Allocate bytes, ensuring a specified alignment. + // NOTE: alignment MUST be a power of two, or else this will break. + void* AllocateBytesAligned(const size_t size, const size_t alignment); + + // Removes all data from the arena. (Invalidates all pointers returned by + // AddSlice and AllocateBytes). Does not cause memory allocation. + // May reduce memory footprint, as it discards all allocated buffers but + // the last one. + // Unless allocations exceed max_buffer_size, repetitive filling up and + // resetting normally lead to quickly settling memory footprint and ceasing + // buffer allocations, as the arena keeps reusing a single, large buffer. + void Reset(); + + // Returns the memory footprint of this arena, in bytes, defined as a sum of + // all buffer sizes. Always greater or equal to the total number of + // bytes allocated out of the arena. 
+ size_t memory_footprint() const; + + private: + typedef typename ArenaTraits::mutex_type mutex_type; + // Encapsulates a single buffer in the arena. + class Component; + + // Fallback for AllocateBytes non-fast-path + void* AllocateBytesFallback(const size_t size, const size_t align); + + Component* NewComponent(size_t requested_size, size_t minimum_size); + void AddComponent(Component *component); + + // Load the current component, with "Acquire" semantics (see atomicops.h) + // if the arena is meant to be thread-safe. + inline Component* AcquireLoadCurrent() { + if (THREADSAFE) { + return reinterpret_cast( + base::subtle::Acquire_Load(reinterpret_cast(¤t_))); + } else { + return current_; + } + } + + // Store the current component, with "Release" semantics (see atomicops.h) + // if the arena is meant to be thread-safe. + inline void ReleaseStoreCurrent(Component* c) { + if (THREADSAFE) { + base::subtle::Release_Store(reinterpret_cast(¤t_), + reinterpret_cast(c)); + } else { + current_ = c; + } + } + + BufferAllocator* const buffer_allocator_; + vector > arena_; + + // The current component to allocate from. + // Use AcquireLoadCurrent and ReleaseStoreCurrent to load/store. + Component* current_; + const size_t max_buffer_size_; + size_t arena_footprint_; + + // True if this Arena has already emitted a warning about surpassing + // the global warning size threshold. + bool warned_; + + // Lock covering 'slow path' allocation, when new components are + // allocated and added to the arena's list. Also covers any other + // mutation of the component data structure (eg Reset). + mutable mutex_type component_lock_; + + DISALLOW_COPY_AND_ASSIGN(ArenaBase); +}; + +// STL-compliant allocator, for use with hash_maps and other structures +// which share lifetime with an Arena. Enables memory control and improves +// performance. 
+template class ArenaAllocator { + public: + typedef T value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + pointer index(reference r) const { return &r; } + const_pointer index(const_reference r) const { return &r; } + size_type max_size() const { return size_t(-1) / sizeof(T); } + + explicit ArenaAllocator(ArenaBase* arena) : arena_(arena) { + CHECK_NOTNULL(arena_); + } + + ~ArenaAllocator() { } + + pointer allocate(size_type n, allocator::const_pointer /*hint*/ = 0) { + return reinterpret_cast(arena_->AllocateBytes(n * sizeof(T))); + } + + void deallocate(pointer p, size_type n) {} + + void construct(pointer p, const T& val) { + new(reinterpret_cast(p)) T(val); + } + + void destroy(pointer p) { p->~T(); } + + template struct rebind { + typedef ArenaAllocator other; + }; + + template ArenaAllocator(const ArenaAllocator& other) + : arena_(other.arena()) { } + + template bool operator==(const ArenaAllocator& other) const { + return arena_ == other.arena(); + } + + template bool operator!=(const ArenaAllocator& other) const { + return arena_ != other.arena(); + } + + ArenaBase *arena() const { + return arena_; + } + + private: + + ArenaBase* arena_; +}; + + +class Arena : public ArenaBase { + public: + explicit Arena(size_t initial_buffer_size, size_t max_buffer_size) : + ArenaBase(initial_buffer_size, max_buffer_size) + {} +}; + +class ThreadSafeArena : public ArenaBase { + public: + explicit ThreadSafeArena(size_t initial_buffer_size, size_t max_buffer_size) : + ArenaBase(initial_buffer_size, max_buffer_size) + {} +}; + +// Arena implementation that is integrated with MemTracker in order to +// track heap-allocated space consumed by the arena. 
+ +class MemoryTrackingArena : public ArenaBase { + public: + + MemoryTrackingArena( + size_t initial_buffer_size, + size_t max_buffer_size, + const std::shared_ptr& tracking_allocator) + : ArenaBase(tracking_allocator.get(), initial_buffer_size, max_buffer_size), + tracking_allocator_(tracking_allocator) {} + + ~MemoryTrackingArena() { + } + + private: + + // This is required in order for the Arena to survive even after tablet is shut down, + // e.g., in the case of Scanners running scanners (see tablet_server-test.cc) + std::shared_ptr tracking_allocator_; +}; + +class ThreadSafeMemoryTrackingArena : public ArenaBase { + public: + + ThreadSafeMemoryTrackingArena( + size_t initial_buffer_size, + size_t max_buffer_size, + const std::shared_ptr& tracking_allocator) + : ArenaBase(tracking_allocator.get(), initial_buffer_size, max_buffer_size), + tracking_allocator_(tracking_allocator) {} + + ~ThreadSafeMemoryTrackingArena() { + } + + private: + + // See comment in MemoryTrackingArena above. + std::shared_ptr tracking_allocator_; +}; + +// Implementation of inline and template methods + +template +class ArenaBase::Component { + public: + explicit Component(Buffer* buffer) + : buffer_(buffer), + data_(static_cast(buffer->data())), + offset_(0), + size_(buffer->size()) {} + + // Tries to reserve space in this component. Returns the pointer to the + // reserved space if successful; NULL on failure (if there's no more room). + uint8_t* AllocateBytes(const size_t size) { + return AllocateBytesAligned(size, 1); + } + + uint8_t *AllocateBytesAligned(const size_t size, const size_t alignment); + + size_t size() const { return size_; } + void Reset() { + ASAN_POISON_MEMORY_REGION(data_, size_); + offset_ = 0; + } + + private: + // Mark the given range unpoisoned in ASAN. + // This is a no-op in a non-ASAN build. 
+ void AsanUnpoison(const void* addr, size_t size); + + gscoped_ptr buffer_; + uint8_t* const data_; + typename ArenaTraits::offset_type offset_; + const size_t size_; + +#ifdef ADDRESS_SANITIZER + // Lock used around unpoisoning memory when ASAN is enabled. + // ASAN does not support concurrent unpoison calls that may overlap a particular + // memory word (8 bytes). + typedef typename ArenaTraits::spinlock_type spinlock_type; + spinlock_type asan_lock_; +#endif + DISALLOW_COPY_AND_ASSIGN(Component); +}; + + +// Thread-safe implementation +template <> +inline uint8_t *ArenaBase::Component::AllocateBytesAligned( + const size_t size, const size_t alignment) { + // Special case check the allowed alignments. Currently, we only ensure + // the allocated buffer components are 16-byte aligned, and the code path + // doesn't support larger alignment. + DCHECK(alignment == 1 || alignment == 2 || alignment == 4 || + alignment == 8 || alignment == 16) + << "bad alignment: " << alignment; + retry: + Atomic32 offset = Acquire_Load(&offset_); + + Atomic32 aligned = KUDU_ALIGN_UP(offset, alignment); + Atomic32 new_offset = aligned + size; + + if (PREDICT_TRUE(new_offset <= size_)) { + bool success = Acquire_CompareAndSwap(&offset_, offset, new_offset) == offset; + if (PREDICT_TRUE(success)) { + AsanUnpoison(data_ + aligned, size); + return data_ + aligned; + } else { + // Raced with another allocator + goto retry; + } + } else { + return NULL; + } +} + +// Non-Threadsafe implementation +template <> +inline uint8_t *ArenaBase::Component::AllocateBytesAligned( + const size_t size, const size_t alignment) { + DCHECK(alignment == 1 || alignment == 2 || alignment == 4 || + alignment == 8 || alignment == 16) + << "bad alignment: " << alignment; + size_t aligned = KUDU_ALIGN_UP(offset_, alignment); + uint8_t* destination = data_ + aligned; + size_t save_offset = offset_; + offset_ = aligned + size; + if (PREDICT_TRUE(offset_ <= size_)) { + AsanUnpoison(data_ + aligned, size); + return 
destination; + } else { + offset_ = save_offset; + return NULL; + } +} + +template +inline void ArenaBase::Component::AsanUnpoison(const void* addr, size_t size) { +#ifdef ADDRESS_SANITIZER + lock_guard l(&asan_lock_); + ASAN_UNPOISON_MEMORY_REGION(addr, size); +#endif +} + +// Fast-path allocation should get inlined, and fall-back +// to non-inline function call for allocation failure +template +inline void *ArenaBase::AllocateBytesAligned(const size_t size, const size_t align) { + void* result = AcquireLoadCurrent()->AllocateBytesAligned(size, align); + if (PREDICT_TRUE(result != NULL)) return result; + return AllocateBytesFallback(size, align); +} + +template +inline uint8_t* ArenaBase::AddSlice(const Slice& value) { + return reinterpret_cast(AddBytes(value.data(), value.size())); +} + +template +inline void *ArenaBase::AddBytes(const void *data, size_t len) { + void* destination = AllocateBytes(len); + if (destination == NULL) return NULL; + memcpy(destination, data, len); + return destination; +} + +template +inline bool ArenaBase::RelocateSlice(const Slice &src, Slice *dst) { + void* destination = AllocateBytes(src.size()); + if (destination == NULL) return false; + memcpy(destination, src.data(), src.size()); + *dst = Slice(reinterpret_cast(destination), src.size()); + return true; +} + + +template +template +inline T *ArenaBase::NewObject() { + void *mem = AllocateBytes(sizeof(T)); + if (mem == NULL) throw std::bad_alloc(); + return new (mem) T(); +} + +template +template +inline T *ArenaBase::NewObject(A1 arg1) { + void *mem = AllocateBytes(sizeof(T)); + if (mem == NULL) throw std::bad_alloc(); + return new (mem) T(arg1); +} + + +template +template +inline T *ArenaBase::NewObject(A1 arg1, A2 arg2) { + void *mem = AllocateBytes(sizeof(T)); + if (mem == NULL) throw std::bad_alloc(); + return new (mem) T(arg1, arg2); +} + +template +template +inline T *ArenaBase::NewObject(A1 arg1, A2 arg2, A3 arg3) { + void *mem = AllocateBytes(sizeof(T)); + if (mem == NULL) 
throw std::bad_alloc(); + return new (mem) T(arg1, arg2, arg3); +} + +} // namespace kudu + +#endif // KUDU_UTIL_MEMORY_ARENA_H_ diff --git a/src/kudu/util/memory/memory.cc b/src/kudu/util/memory/memory.cc new file mode 100644 index 000000000000..02065afe1a50 --- /dev/null +++ b/src/kudu/util/memory/memory.cc @@ -0,0 +1,365 @@ +// Copyright 2010 Google Inc. All Rights Reserved +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// + +#include "kudu/util/alignment.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/memory/memory.h" +#include "kudu/util/mem_tracker.h" + +#include +#include + +#include +using std::copy; +using std::max; +using std::min; +using std::reverse; +using std::sort; +using std::swap; +#include + + +namespace kudu { + +namespace { +static char dummy_buffer[0] = {}; +} + +// This function is micro-optimized a bit, since it helps debug +// mode tests run much faster. 
+#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC push_options +#pragma GCC optimize("-O3") +#endif +void OverwriteWithPattern(char* p, size_t len, StringPiece pattern) { + size_t pat_len = pattern.size(); + CHECK_LT(0, pat_len); + size_t rem = len; + const char *pat_ptr = pattern.data(); + + while (rem >= pat_len) { + memcpy(p, pat_ptr, pat_len); + p += pat_len; + rem -= pat_len; + } + + while (rem-- > 0) { + *p++ = *pat_ptr++; + } +} +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +Buffer::~Buffer() { +#if !defined(NDEBUG) && !defined(ADDRESS_SANITIZER) + // "unrolling" the string "BAD" makes for a much more efficient + // OverwriteWithPattern call in debug mode, so we can keep this + // useful bit of code without tests going slower! + // + // In ASAN mode, we don't bother with this, because when we free the memory, ASAN will + // prevent us from accessing it anyway. + OverwriteWithPattern(reinterpret_cast(data_), size_, + "BADBADBADBADBADBADBADBADBADBADBAD" + "BADBADBADBADBADBADBADBADBADBADBAD" + "BADBADBADBADBADBADBADBADBADBADBAD"); +#endif + if (allocator_ != nullptr) allocator_->FreeInternal(this); +} + +void BufferAllocator::LogAllocation(size_t requested, + size_t minimal, + Buffer* buffer) { + if (buffer == nullptr) { + LOG(WARNING) << "Memory allocation failed. " + << "Number of bytes requested: " << requested + << ", minimal: " << minimal; + return; + } + if (buffer->size() < requested) { + LOG(WARNING) << "Memory allocation was shorter than requested. " + << "Number of bytes requested to allocate: " << requested + << ", minimal: " << minimal + << ", and actually allocated: " << buffer->size(); + } +} + +// TODO(onufry) - test whether the code still tests OK if we set this to true, +// or remove this code and add a test that Google allocator does not change it's +// contract - 16-aligned in -c opt and %16 == 8 in debug. 
+DEFINE_bool(allocator_aligned_mode, false, + "Use 16-byte alignment instead of 8-byte, " + "unless explicitly specified otherwise - to boost SIMD"); +TAG_FLAG(allocator_aligned_mode, hidden); + +HeapBufferAllocator::HeapBufferAllocator() + : aligned_mode_(FLAGS_allocator_aligned_mode) { +} + +Buffer* HeapBufferAllocator::AllocateInternal( + const size_t requested, + const size_t minimal, + BufferAllocator* const originator) { + DCHECK_LE(minimal, requested); + void* data; + size_t attempted = requested; + while (true) { + data = (attempted == 0) ? &dummy_buffer[0] : Malloc(attempted); + if (data != nullptr) { + return CreateBuffer(data, attempted, originator); + } + if (attempted == minimal) return nullptr; + attempted = minimal + (attempted - minimal - 1) / 2; + } +} + +bool HeapBufferAllocator::ReallocateInternal( + const size_t requested, + const size_t minimal, + Buffer* const buffer, + BufferAllocator* const originator) { + DCHECK_LE(minimal, requested); + void* data; + size_t attempted = requested; + while (true) { + if (attempted == 0) { + if (buffer->size() > 0) free(buffer->data()); + data = &dummy_buffer[0]; + } else { + if (buffer->size() > 0) { + data = Realloc(buffer->data(), buffer->size(), attempted); + } else { + data = Malloc(attempted); + } + } + if (data != nullptr) { + UpdateBuffer(data, attempted, buffer); + return true; + } + if (attempted == minimal) return false; + attempted = minimal + (attempted - minimal - 1) / 2; + } +} + +void HeapBufferAllocator::FreeInternal(Buffer* buffer) { + if (buffer->size() > 0) free(buffer->data()); +} + +void* HeapBufferAllocator::Malloc(size_t size) { + if (aligned_mode_) { + void* data; + if (posix_memalign(&data, 16, KUDU_ALIGN_UP(size, 16))) { + return nullptr; + } + return data; + } else { + return malloc(size); + } +} + +void* HeapBufferAllocator::Realloc(void* previousData, size_t previousSize, + size_t newSize) { + if (aligned_mode_) { + void* data = Malloc(newSize); + if (data) { +// NOTE(ptab): We 
should use realloc here to avoid memmory coping, +// but it doesn't work on memory allocated by posix_memalign(...). +// realloc reallocates the memory but doesn't preserve the content. +// TODO(ptab): reiterate after some time to check if it is fixed (tcmalloc ?) + memcpy(data, previousData, min(previousSize, newSize)); + free(previousData); + return data; + } else { + return nullptr; + } + } else { + return realloc(previousData, newSize); + } +} + +Buffer* ClearingBufferAllocator::AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) { + Buffer* buffer = DelegateAllocate(delegate_, requested, minimal, + originator); + if (buffer != nullptr) memset(buffer->data(), 0, buffer->size()); + return buffer; +} + +bool ClearingBufferAllocator::ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) { + size_t offset = (buffer != nullptr ? buffer->size() : 0); + bool success = DelegateReallocate(delegate_, requested, minimal, buffer, + originator); + if (success && buffer->size() > offset) { + memset(static_cast(buffer->data()) + offset, 0, + buffer->size() - offset); + } + return success; +} + +void ClearingBufferAllocator::FreeInternal(Buffer* buffer) { + DelegateFree(delegate_, buffer); +} + +Buffer* MediatingBufferAllocator::AllocateInternal( + const size_t requested, + const size_t minimal, + BufferAllocator* const originator) { + // Allow the mediator to trim the request. 
+ size_t granted; + if (requested > 0) { + granted = mediator_->Allocate(requested, minimal); + if (granted < minimal) return nullptr; + } else { + granted = 0; + } + Buffer* buffer = DelegateAllocate(delegate_, granted, minimal, originator); + if (buffer == nullptr) { + mediator_->Free(granted); + } else if (buffer->size() < granted) { + mediator_->Free(granted - buffer->size()); + } + return buffer; +} + +bool MediatingBufferAllocator::ReallocateInternal( + const size_t requested, + const size_t minimal, + Buffer* const buffer, + BufferAllocator* const originator) { + // Allow the mediator to trim the request. Be conservative; assume that + // realloc may degenerate to malloc-memcpy-free. + size_t granted; + if (requested > 0) { + granted = mediator_->Allocate(requested, minimal); + if (granted < minimal) return false; + } else { + granted = 0; + } + size_t old_size = buffer->size(); + if (DelegateReallocate(delegate_, granted, minimal, buffer, originator)) { + mediator_->Free(granted - buffer->size() + old_size); + return true; + } else { + mediator_->Free(granted); + return false; + } +} + +void MediatingBufferAllocator::FreeInternal(Buffer* buffer) { + mediator_->Free(buffer->size()); + DelegateFree(delegate_, buffer); +} + +Buffer* MemoryStatisticsCollectingBufferAllocator::AllocateInternal( + const size_t requested, + const size_t minimal, + BufferAllocator* const originator) { + Buffer* buffer = DelegateAllocate(delegate_, requested, minimal, originator); + if (buffer != nullptr) { + memory_stats_collector_->AllocatedMemoryBytes(buffer->size()); + } else { + memory_stats_collector_->RefusedMemoryBytes(minimal); + } + return buffer; +} + +bool MemoryStatisticsCollectingBufferAllocator::ReallocateInternal( + const size_t requested, + const size_t minimal, + Buffer* const buffer, + BufferAllocator* const originator) { + const size_t old_size = buffer->size(); + bool outcome = DelegateReallocate(delegate_, requested, minimal, buffer, + originator); + if 
(buffer->size() > old_size) { + memory_stats_collector_->AllocatedMemoryBytes(buffer->size() - old_size); + } else if (buffer->size() < old_size) { + memory_stats_collector_->FreedMemoryBytes(old_size - buffer->size()); + } else if (!outcome && (minimal > buffer->size())) { + memory_stats_collector_->RefusedMemoryBytes(minimal - buffer->size()); + } + return outcome; +} + +void MemoryStatisticsCollectingBufferAllocator::FreeInternal(Buffer* buffer) { + DelegateFree(delegate_, buffer); + memory_stats_collector_->FreedMemoryBytes(buffer->size()); +} + +size_t MemoryTrackingBufferAllocator::Available() const { + return enforce_limit_ ? mem_tracker_->SpareCapacity() : std::numeric_limits::max(); +} + +bool MemoryTrackingBufferAllocator::TryConsume(int64_t bytes) { + // Calls TryConsume first, even if enforce_limit_ is false: this + // will cause mem_tracker_ to try to free up more memory by GCing. + if (!mem_tracker_->TryConsume(bytes)) { + if (enforce_limit_) { + return false; + } else { + // If enforce_limit_ is false, allocate memory anyway. 
+ mem_tracker_->Consume(bytes); + } + } + return true; +} + +Buffer* MemoryTrackingBufferAllocator::AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) { + if (TryConsume(requested)) { + Buffer* buffer = DelegateAllocate(delegate_, requested, requested, originator); + if (buffer == nullptr) { + mem_tracker_->Release(requested); + } else { + return buffer; + } + } + + if (TryConsume(minimal)) { + Buffer* buffer = DelegateAllocate(delegate_, minimal, minimal, originator); + if (buffer == nullptr) { + mem_tracker_->Release(minimal); + } + return buffer; + } + + return nullptr; +} + + +bool MemoryTrackingBufferAllocator::ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) { + LOG(FATAL) << "Not implemented"; + return false; +} + +void MemoryTrackingBufferAllocator::FreeInternal(Buffer* buffer) { + DelegateFree(delegate_, buffer); + mem_tracker_->Release(buffer->size()); +} + +} // namespace kudu diff --git a/src/kudu/util/memory/memory.h b/src/kudu/util/memory/memory.h new file mode 100644 index 000000000000..f618ea8943bc --- /dev/null +++ b/src/kudu/util/memory/memory.h @@ -0,0 +1,978 @@ +// Copyright 2010 Google Inc. All Rights Reserved +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// +// Classes for memory management, used by materializations +// (arenas, segments, and STL collections parametrized via arena allocators) +// so that memory usage can be controlled at the application level. +// +// Materializations can be parametrized by specifying an instance of a +// BufferAllocator. The allocator implements +// memory management policy (e.g. setting allocation limits). Allocators may +// be shared between multiple materializations; e.g. you can designate a +// single allocator per a single user request, thus setting bounds on memory +// usage on a per-request basis. + +#ifndef KUDU_UTIL_MEMORY_MEMORY_H_ +#define KUDU_UTIL_MEMORY_MEMORY_H_ + +#include +#include +#include +#include +#include +#include + +#include "kudu/util/boost_mutex_utils.h" +#include "kudu/util/mutex.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/logging-inl.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/singleton.h" + +using std::copy; +using std::max; +using std::min; +using std::numeric_limits; +using std::reverse; +using std::sort; +using std::swap; +using std::vector; + +namespace kudu { + +class BufferAllocator; +class MemTracker; + +void OverwriteWithPattern(char* p, size_t len, StringPiece pattern); + +// Wrapper for a block of data allocated by a BufferAllocator. Owns the block. +// (To release the block, destroy the buffer - it will then return it via the +// same allocator that has been used to create it). +class Buffer { + public: + ~Buffer(); + + void* data() const { return data_; } // The data buffer. + size_t size() const { return size_; } // In bytes. 
+ + private: + friend class BufferAllocator; + + Buffer(void* data, size_t size, BufferAllocator* allocator) + : data_(CHECK_NOTNULL(data)), + size_(size), + allocator_(allocator) { +#ifndef NDEBUG + OverwriteWithPattern(reinterpret_cast(data_), size_, + "NEWNEWNEWNEWNEWNEWNEWNEWNEWNEWNEWNEW" + "NEWNEWNEWNEWNEWNEWNEWNEWNEWNEWNEWNEW" + "NEWNEWNEWNEWNEWNEWNEWNEWNEWNEWNEWNEW"); +#endif + } + + // Called by a successful realloc. + void Update(void* new_data, size_t new_size) { +#ifndef NDEBUG + if (new_size > size_) { + OverwriteWithPattern(reinterpret_cast(new_data) + size_, + new_size - size_, "NEW"); + } +#endif + data_ = new_data; + size_ = new_size; + } + + void* data_; + size_t size_; + BufferAllocator* const allocator_; + DISALLOW_COPY_AND_ASSIGN(Buffer); +}; + +// Allocators allow applications to control memory usage. They are +// used by materializations to allocate blocks of memory arenas. +// BufferAllocator is an abstract class that defines a common contract of +// all implementations of allocators. Specific allocators provide specific +// features, e.g. enforced resource limits, thread safety, etc. +class BufferAllocator { + public: + virtual ~BufferAllocator() {} + + // Called by the user when a new block of memory is needed. The 'requested' + // parameter specifies how much memory (in bytes) the user would like to get. + // The 'minimal' parameter specifies how much he is willing to settle for. + // The allocator returns a buffer sized in the range [minimal, requested], + // or NULL if the request can't be satisfied. When the buffer is destroyed, + // its destructor calls the FreeInternal() method on its allocator. + // CAVEAT: The allocator must outlive all buffers returned by it. + // + // Corner cases: + // 1. If requested == 0, the allocator will always return a non-NULL Buffer + // with a non-NULL data pointer and zero capacity. + // 2. 
If minimal == 0, the allocator will always return a non-NULL Buffer + // with a non-NULL data pointer, possibly with zero capacity. + Buffer* BestEffortAllocate(size_t requested, size_t minimal) { + DCHECK_LE(minimal, requested); + Buffer* result = AllocateInternal(requested, minimal, this); + LogAllocation(requested, minimal, result); + return result; + } + + // Called by the user when a new block of memory is needed. Equivalent to + // BestEffortAllocate(requested, requested). + Buffer* Allocate(size_t requested) { + return BestEffortAllocate(requested, requested); + } + + // Called by the user when a previously allocated block needs to be resized. + // Mimics semantics of realloc. The 'requested' and 'minimal' + // represent the desired final buffer size, with semantics as in the Allocate. + // If the 'buffer' parameter is NULL, the call is equivalent to + // Allocate(requested, minimal). Otherwise, a reallocation of the buffer's + // data is attempted. On success, the original 'buffer' parameter is returned, + // but the buffer itself might have updated size and data. On failure, + // returns NULL, and leaves the input buffer unmodified. + // Reallocation might happen in-place, preserving the original data + // pointer, but it is not guaranteed - e.g. this function might degenerate to + // Allocate-Copy-Free. Either way, the content of the data buffer, up to the + // minimum of the new and old size, is preserved. + // + // Corner cases: + // 1. If requested == 0, the allocator will always return a non-NULL Buffer + // with a non-NULL data pointer and zero capacity. + // 2. If minimal == 0, the allocator will always return a non-NULL Buffer + // with a non-NULL data pointer, possibly with zero capacity. 
+ Buffer* BestEffortReallocate(size_t requested, + size_t minimal, + Buffer* buffer) { + DCHECK_LE(minimal, requested); + Buffer* result; + if (buffer == NULL) { + result = AllocateInternal(requested, minimal, this); + LogAllocation(requested, minimal, result); + return result; + } else { + result = ReallocateInternal(requested, minimal, buffer, this) ? + buffer : NULL; + LogAllocation(requested, minimal, buffer); + return result; + } + } + + // Called by the user when a previously allocated block needs to be resized. + // Equivalent to BestEffortReallocate(requested, requested, buffer). + Buffer* Reallocate(size_t requested, Buffer* buffer) { + return BestEffortReallocate(requested, requested, buffer); + } + + // Returns the amount of memory (in bytes) still available for this allocator. + // For unbounded allocators (like raw HeapBufferAllocator) this is the highest + // size_t value possible. + // TODO(user): consider making pure virtual. + virtual size_t Available() const { return numeric_limits::max(); } + + protected: + friend class Buffer; + + BufferAllocator() {} + + // Expose the constructor to subclasses of BufferAllocator. + Buffer* CreateBuffer(void* data, + size_t size, + BufferAllocator* allocator) { + return new Buffer(data, size, allocator); + } + + // Expose Buffer::Update to subclasses of BufferAllocator. + void UpdateBuffer(void* new_data, size_t new_size, Buffer* buffer) { + buffer->Update(new_data, new_size); + } + + // Called by chained buffer allocators. + Buffer* DelegateAllocate(BufferAllocator* delegate, + size_t requested, + size_t minimal, + BufferAllocator* originator) { + return delegate->AllocateInternal(requested, minimal, originator); + } + + // Called by chained buffer allocators. 
+ bool DelegateReallocate(BufferAllocator* delegate, + size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) { + return delegate->ReallocateInternal(requested, minimal, buffer, originator); + } + + // Called by chained buffer allocators. + void DelegateFree(BufferAllocator* delegate, Buffer* buffer) { + delegate->FreeInternal(buffer); + } + + private: + // Implemented by concrete subclasses. + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) = 0; + + // Implemented by concrete subclasses. Returns false on failure. + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) = 0; + + // Implemented by concrete subclasses. + virtual void FreeInternal(Buffer* buffer) = 0; + + // Logs a warning message if the allocation failed or if it returned less than + // the required number of bytes. + void LogAllocation(size_t required, size_t minimal, Buffer* buffer); + + DISALLOW_COPY_AND_ASSIGN(BufferAllocator); +}; + +// Allocates buffers on the heap, with no memory limits. Uses standard C +// allocation functions (malloc, realloc, free). +class HeapBufferAllocator : public BufferAllocator { + public: + virtual ~HeapBufferAllocator() {} + + // Returns a singleton instance of the heap allocator. + static HeapBufferAllocator* Get() { + return Singleton::get(); + } + + virtual size_t Available() const OVERRIDE { + return numeric_limits::max(); + } + + private: + // Allocates memory that is aligned to 16 way. + // Use if you want to boost SIMD operations on the memory area. + const bool aligned_mode_; + + friend class Singleton; + + // Always allocates 'requested'-sized buffer, or returns NULL on OOM. 
+ virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE; + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE; + + void* Malloc(size_t size); + void* Realloc(void* previousData, size_t previousSize, size_t newSize); + + virtual void FreeInternal(Buffer* buffer) OVERRIDE; + + HeapBufferAllocator(); + explicit HeapBufferAllocator(bool aligned_mode) + : aligned_mode_(aligned_mode) {} + + DISALLOW_COPY_AND_ASSIGN(HeapBufferAllocator); +}; + +// Wrapper around the delegate allocator, that clears all newly allocated +// (and reallocated) memory. +class ClearingBufferAllocator : public BufferAllocator { + public: + // Does not take ownership of the delegate. + explicit ClearingBufferAllocator(BufferAllocator* delegate) + : delegate_(delegate) {} + + virtual size_t Available() const OVERRIDE { + return delegate_->Available(); + } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE; + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE; + + virtual void FreeInternal(Buffer* buffer) OVERRIDE; + + BufferAllocator* delegate_; + DISALLOW_COPY_AND_ASSIGN(ClearingBufferAllocator); +}; + +// Abstract policy for modifying allocation requests - e.g. enforcing quotas. +class Mediator { + public: + Mediator() {} + virtual ~Mediator() {} + + // Called by an allocator when a allocation request is processed. + // Must return a value in the range [minimal, requested], or zero. Returning + // zero (if minimal is non-zero) indicates denial to allocate. Returning + // non-zero indicates that the request should be capped at that value. + virtual size_t Allocate(size_t requested, size_t minimal) = 0; + + // Called by an allocator when the specified amount (in bytes) is released. 
+ virtual void Free(size_t amount) = 0; + + // TODO(user): consider making pure virtual. + virtual size_t Available() const { return numeric_limits::max(); } +}; + +// Optionally thread-safe skeletal implementation of a 'quota' abstraction, +// providing methods to allocate resources against the quota, and return them. +template +class Quota : public Mediator { + public: + explicit Quota(bool enforced) : usage_(0), enforced_(enforced) {} + virtual ~Quota() {} + + // Returns a value in range [minimal, requested] if not exceeding remaining + // quota or if the quota is not enforced (soft quota), and adjusts the usage + // value accordingly. Otherwise, returns zero. The semantics of 'remaining + // quota' are defined by subclasses (that must supply GetQuotaInternal() + // method). + virtual size_t Allocate(size_t requested, size_t minimal) OVERRIDE; + + virtual void Free(size_t amount) OVERRIDE; + + // Returns memory still available in the quota. For unenforced Quota objects, + // you are still able to perform _minimal_ allocations when the available + // quota is 0 (or less than "minimal" param). + virtual size_t Available() const OVERRIDE { + lock_guard_maybe lock(Quota::mutex()); + const size_t quota = GetQuotaInternal(); + return (usage_ >= quota) ? 0 : (quota - usage_); + } + + // Returns the current quota value. + size_t GetQuota() const; + + // Returns the current usage value, defined as a sum of all the values + // granted by calls to Allocate, less these released via calls to Free. + size_t GetUsage() const; + + bool enforced() const { + return enforced_; + } + + protected: + // Overridden by specific implementations, to define semantics of + // the quota, i.e. the total amount of resources that the mediator will + // allocate. Called directly from GetQuota that optionally provides + // thread safety. An 'Allocate' request will succeed if + // GetUsage() + minimal <= GetQuota() or if the quota is not enforced (soft + // quota). 
+ virtual size_t GetQuotaInternal() const = 0; + + Mutex* mutex() const { return thread_safe ? &mutex_ : NULL; } + + private: + mutable Mutex mutex_; + size_t usage_; + bool enforced_; + DISALLOW_COPY_AND_ASSIGN(Quota); +}; + +// Optionally thread-safe static quota implementation (where quota is explicitly +// set to a concrete numeric value). +template +class StaticQuota : public Quota { + public: + explicit StaticQuota(size_t quota) + : Quota(true) { + SetQuota(quota); + } + StaticQuota(size_t quota, bool enforced) + : Quota(enforced) { + SetQuota(quota); + } + virtual ~StaticQuota() {} + + // Sets quota to the new value. + void SetQuota(const size_t quota); + + protected: + virtual size_t GetQuotaInternal() const { return quota_; } + + private: + size_t quota_; + DISALLOW_COPY_AND_ASSIGN(StaticQuota); +}; + +// Places resource limits on another allocator, using the specified Mediator +// (e.g. quota) implementation. +// +// If the mediator and the delegate allocator are thread-safe, this allocator +// is also thread-safe, to the extent that it will not introduce any +// state inconsistencies. However, without additional synchronization, +// allocation requests are not atomic end-to-end. This way, it is deadlock- +// resilient (even if you have cyclic relationships between allocators) and +// allows better concurrency. But, it may cause over-conservative +// allocations under memory contention, if you have multiple levels of +// mediating allocators. For example, if two requests that can't both be +// satisfied are submitted concurrently, it may happen that one of them succeeds +// but gets smaller buffer allocated than it would if the requests were strictly +// ordered. This is usually not a problem, however, as you don't really want to +// operate so close to memory limits that some of your allocations can't be +// satisfied. 
If you do have a simple, cascading graph of allocators though, +// and want to force requests be atomic end-to-end, put a +// ThreadSafeBufferAllocator at the entry point. +class MediatingBufferAllocator : public BufferAllocator { + public: + // Does not take ownership of the delegate, nor the mediator, allowing + // both to be reused. + MediatingBufferAllocator(BufferAllocator* const delegate, + Mediator* const mediator) + : delegate_(delegate), + mediator_(mediator) {} + + virtual ~MediatingBufferAllocator() {} + + virtual size_t Available() const OVERRIDE { + return min(delegate_->Available(), mediator_->Available()); + } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE; + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE; + + virtual void FreeInternal(Buffer* buffer) OVERRIDE; + + BufferAllocator* delegate_; + Mediator* const mediator_; +}; + +// Convenience non-thread-safe static memory bounds enforcer. +// Combines MediatingBufferAllocator with a StaticQuota. +class MemoryLimit : public BufferAllocator { + public: + // Creates a limiter based on the default, heap allocator. Quota is infinite. + // (Can be set using SetQuota). + MemoryLimit() + : quota_(std::numeric_limits::max()), + allocator_(HeapBufferAllocator::Get(), "a_) {} + + // Creates a limiter based on the default, heap allocator. + explicit MemoryLimit(size_t quota) + : quota_(quota), + allocator_(HeapBufferAllocator::Get(), "a_) {} + + // Creates a limiter relaying to the specified delegate allocator. + MemoryLimit(size_t quota, BufferAllocator* const delegate) + : quota_(quota), + allocator_(delegate, "a_) {} + + // Creates a (possibly non-enforcing) limiter relaying to the specified + // delegate allocator. 
+ MemoryLimit(size_t quota, bool enforced, BufferAllocator* const delegate) + : quota_(quota, enforced), + allocator_(delegate, "a_) {} + + virtual ~MemoryLimit() {} + + virtual size_t Available() const OVERRIDE { + return allocator_.Available(); + } + + size_t GetQuota() const { return quota_.GetQuota(); } + size_t GetUsage() const { return quota_.GetUsage(); } + void SetQuota(const size_t quota) { quota_.SetQuota(quota); } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE { + return DelegateAllocate(&allocator_, requested, minimal, originator); + } + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE { + return DelegateReallocate(&allocator_, requested, minimal, buffer, + originator); + } + virtual void FreeInternal(Buffer* buffer) OVERRIDE { + DelegateFree(&allocator_, buffer); + } + + StaticQuota quota_; + MediatingBufferAllocator allocator_; +}; + +// An allocator that allows to bypass the (potential) soft quota below for a +// given amount of memory usage. The goal is to make the allocation methods and +// Available() work as if the allocator below had at least bypassed_amount of +// soft quota. Of course this class doesn't allow to exceed the hard quota. +class SoftQuotaBypassingBufferAllocator : public BufferAllocator { + public: + SoftQuotaBypassingBufferAllocator(BufferAllocator* allocator, + size_t bypassed_amount) + : allocator_(std::numeric_limits::max(), allocator), + bypassed_amount_(bypassed_amount) {} + + virtual size_t Available() const OVERRIDE { + const size_t usage = allocator_.GetUsage(); + size_t available = allocator_.Available(); + if (bypassed_amount_ > usage) { + available = max(bypassed_amount_ - usage, available); + } + return available; + } + + private: + // Calculates how much to increase the minimal parameter to allocate more + // aggressively in the underlying allocator. 
This is to avoid getting only + // very small allocations when we exceed the soft quota below. The request + // with increased minimal size is more likely to fail because of exceeding + // hard quota, so we also fall back to the original minimal size. + size_t AdjustMinimal(size_t requested, size_t minimal) const { + return min(requested, max(minimal, Available())); + } + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE { + // Try increasing the "minimal" parameter to allocate more aggresively + // within the bypassed amount of soft quota. + Buffer* result = DelegateAllocate(&allocator_, + requested, + AdjustMinimal(requested, minimal), + originator); + if (result != NULL) { + return result; + } else { + return DelegateAllocate(&allocator_, + requested, + minimal, + originator); + } + } + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE { + if (DelegateReallocate(&allocator_, + requested, + AdjustMinimal(requested, minimal), + buffer, + originator)) { + return true; + } else { + return DelegateReallocate(&allocator_, + requested, + minimal, + buffer, + originator); + } + } + virtual void FreeInternal(Buffer* buffer) OVERRIDE { + DelegateFree(&allocator_, buffer); + } + + // Using MemoryLimit with "infinite" limit to get GetUsage(). + MemoryLimit allocator_; + size_t bypassed_amount_; +}; + +// An interface for a MemoryStatisticsCollector - an object which collects +// information about the memory usage of the allocator. The collector will +// gather statistics about memory usage based on information received from the +// allocator. +class MemoryStatisticsCollectorInterface { + public: + MemoryStatisticsCollectorInterface() {} + + virtual ~MemoryStatisticsCollectorInterface() {} + + // Informs the collector that the allocator granted bytes memory. 
Note that in + // the case of reallocation bytes should be the increase in total memory + // usage, not the total size of the buffer after reallocation. + virtual void AllocatedMemoryBytes(size_t bytes) = 0; + + // Informs the collector that the allocator received a request for at least + // bytes memory, and rejected it (meaning that it granted nothing). + virtual void RefusedMemoryBytes(size_t bytes) = 0; + + // Informs the collector that bytes memory have been released to the + // allocator. + virtual void FreedMemoryBytes(size_t bytes) = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(MemoryStatisticsCollectorInterface); +}; + +class MemoryStatisticsCollectingBufferAllocator : public BufferAllocator { + public: + // Does not take ownership of the delegate. + // Takes ownership of memory_stats_collector. + MemoryStatisticsCollectingBufferAllocator( + BufferAllocator* const delegate, + MemoryStatisticsCollectorInterface* const memory_stats_collector) + : delegate_(delegate), + memory_stats_collector_(memory_stats_collector) {} + + virtual ~MemoryStatisticsCollectingBufferAllocator() {} + + virtual size_t Available() const OVERRIDE { + return delegate_->Available(); + } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE; + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE; + + virtual void FreeInternal(Buffer* buffer) OVERRIDE; + + BufferAllocator* delegate_; + gscoped_ptr + memory_stats_collector_; +}; + +// BufferAllocator which uses MemTracker to keep track of and optionally +// (if a limit is set on the MemTracker) regulate memory consumption. +class MemoryTrackingBufferAllocator : public BufferAllocator { + public: + // Does not take ownership of the delegate. The delegate must remain + // valid for the lifetime of this allocator. Increments reference + // count for 'mem_tracker'. 
+ // If 'mem_tracker' has a limit and 'enforce_limit' is true, then + // the classes calling this buffer allocator (whether directly, or + // through an Arena) must be able to handle the case when allocation + // fails. If 'enforce_limit' is false (this is the default), then + // allocation will always succeed. + MemoryTrackingBufferAllocator(BufferAllocator* const delegate, + std::shared_ptr mem_tracker, + bool enforce_limit = false) + : delegate_(delegate), + mem_tracker_(std::move(mem_tracker)), + enforce_limit_(enforce_limit) {} + + virtual ~MemoryTrackingBufferAllocator() {} + + // If enforce limit is false, this always returns maximum possible value + // for int64_t (std::numeric_limits::max()). Otherwise, this + // is equivalent to calling mem_tracker_->SpareCapacity(); + virtual size_t Available() const OVERRIDE; + + private: + + // If enforce_limit_ is true, this is equivalent to calling + // mem_tracker_->TryConsume(bytes). If enforce_limit_ is false and + // mem_tracker_->TryConsume(bytes) is false, we call + // mem_tracker_->Consume(bytes) and always return true. + bool TryConsume(int64_t bytes); + + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE; + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE; + + virtual void FreeInternal(Buffer* buffer) OVERRIDE; + + BufferAllocator* delegate_; + std::shared_ptr mem_tracker_; + bool enforce_limit_; +}; + +// Synchronizes access to AllocateInternal and FreeInternal, and exposes the +// mutex for use by subclasses. Allocation requests performed through this +// allocator are atomic end-to-end. Template parameter DelegateAllocatorType +// allows to specify a subclass of BufferAllocator for the delegate, to allow +// subclasses of ThreadSafeBufferAllocator to access additional methods provided +// by the allocator subclass. 
If this is not needed, it can be set to +// BufferAllocator. +template +class ThreadSafeBufferAllocator : public BufferAllocator { + public: + // Does not take ownership of the delegate. + explicit ThreadSafeBufferAllocator(DelegateAllocatorType* delegate) + : delegate_(delegate) {} + virtual ~ThreadSafeBufferAllocator() {} + + virtual size_t Available() const OVERRIDE { + lock_guard_maybe lock(mutex()); + return delegate()->Available(); + } + + protected: + Mutex* mutex() const { return &mutex_; } + // Expose the delegate allocator, with the precise type of the allocator + // specified by the template parameter. The delegate() methods themselves + // don't give any thread-safety guarantees. Protect all uses taking the Mutex + // exposed by the mutex() method. + DelegateAllocatorType* delegate() { return delegate_; } + const DelegateAllocatorType* delegate() const { return delegate_; } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE { + lock_guard_maybe lock(mutex()); + return DelegateAllocate(delegate(), requested, minimal, originator); + } + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE { + lock_guard_maybe lock(mutex()); + return DelegateReallocate(delegate(), requested, minimal, buffer, + originator); + } + + virtual void FreeInternal(Buffer* buffer) OVERRIDE { + lock_guard_maybe lock(mutex()); + DelegateFree(delegate(), buffer); + } + + DelegateAllocatorType* delegate_; + mutable Mutex mutex_; + DISALLOW_COPY_AND_ASSIGN(ThreadSafeBufferAllocator); +}; + +// A version of ThreadSafeBufferAllocator that owns the supplied delegate +// allocator. 
+template +class OwningThreadSafeBufferAllocator + : public ThreadSafeBufferAllocator { + public: + explicit OwningThreadSafeBufferAllocator(DelegateAllocatorType* delegate) + : ThreadSafeBufferAllocator(delegate), + delegate_owned_(delegate) {} + virtual ~OwningThreadSafeBufferAllocator() {} + + private: + gscoped_ptr delegate_owned_; +}; + +class ThreadSafeMemoryLimit + : public OwningThreadSafeBufferAllocator { + public: + ThreadSafeMemoryLimit(size_t quota, bool enforced, + BufferAllocator* const delegate) + : OwningThreadSafeBufferAllocator( + new MemoryLimit(quota, enforced, delegate)) {} + virtual ~ThreadSafeMemoryLimit() {} + + size_t GetQuota() const { + lock_guard_maybe lock(mutex()); + return delegate()->GetQuota(); + } + size_t GetUsage() const { + lock_guard_maybe lock(mutex()); + return delegate()->GetUsage(); + } + void SetQuota(const size_t quota) { + lock_guard_maybe lock(mutex()); + delegate()->SetQuota(quota); + } +}; + +// A BufferAllocator that can be given ownership of many objects of given type. +// These objects will then be deleted when the buffer allocator is destroyed. +// The objects added last are deleted first (LIFO). +template +class OwningBufferAllocator : public BufferAllocator { + public: + // Doesn't take ownership of delegate. + explicit OwningBufferAllocator(BufferAllocator* const delegate) + : delegate_(delegate) {} + + virtual ~OwningBufferAllocator() { + // Delete elements starting from the end. + while (!owned_.empty()) { + OwnedType* p = owned_.back(); + owned_.pop_back(); + delete p; + } + } + + // Add to the collection of objects owned by this allocator. The object added + // last is deleted first. 
+ OwningBufferAllocator* Add(OwnedType* p) { + owned_.push_back(p); + return this; + } + + virtual size_t Available() const OVERRIDE { + return delegate_->Available(); + } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE { + return DelegateAllocate(delegate_, requested, minimal, originator); + } + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE { + return DelegateReallocate(delegate_, requested, minimal, buffer, + originator); + } + + virtual void FreeInternal(Buffer* buffer) OVERRIDE { + DelegateFree(delegate_, buffer); + } + + // Not using PointerVector here because we want to guarantee certain order of + // deleting elements (starting from the ones added last). + vector owned_; + BufferAllocator* delegate_; +}; + +// Buffer allocator that tries to guarantee the exact and consistent amount +// of memory. Uses hard MemoryLimit to enforce the upper bound but also +// guarantees consistent allocations by ignoring minimal requested amounts and +// always returning the full amount of memory requested if available. +// Allocations will fail if the memory requested would exceed the quota or if +// the underlying allocator fails to provide the memory. +class GuaranteeMemory : public BufferAllocator { + public: + // Doesn't take ownership of 'delegate'. 
+ GuaranteeMemory(size_t memory_quota, + BufferAllocator* delegate) + : limit_(memory_quota, true, delegate), + memory_guarantee_(memory_quota) {} + + virtual size_t Available() const OVERRIDE { + return memory_guarantee_ - limit_.GetUsage(); + } + + private: + virtual Buffer* AllocateInternal(size_t requested, + size_t minimal, + BufferAllocator* originator) OVERRIDE { + if (requested > Available()) { + return NULL; + } else { + return DelegateAllocate(&limit_, requested, requested, originator); + } + } + + virtual bool ReallocateInternal(size_t requested, + size_t minimal, + Buffer* buffer, + BufferAllocator* originator) OVERRIDE { + int64 additional_memory = requested - (buffer != NULL ? buffer->size() : 0); + return additional_memory <= static_cast(Available()) + && DelegateReallocate(&limit_, requested, requested, + buffer, originator); + } + + virtual void FreeInternal(Buffer* buffer) OVERRIDE { + DelegateFree(&limit_, buffer); + } + + MemoryLimit limit_; + size_t memory_guarantee_; + DISALLOW_COPY_AND_ASSIGN(GuaranteeMemory); +}; + +// Implementation of inline and template methods + +template +size_t Quota::Allocate(const size_t requested, + const size_t minimal) { + lock_guard_maybe lock(mutex()); + DCHECK_LE(minimal, requested) + << "\"minimal\" shouldn't be bigger than \"requested\""; + const size_t quota = GetQuotaInternal(); + size_t allocation; + if (usage_ > quota || minimal > quota - usage_) { + // OOQ (Out of quota). + if (!enforced() && minimal <= numeric_limits::max() - usage_) { + // The quota is unenforced and the value of "minimal" won't cause an + // overflow. Perform a minimal allocation. + allocation = minimal; + } else { + allocation = 0; + } + LOG(WARNING) << "Out of quota. Requested: " << requested + << " bytes, or at least minimal: " << minimal + << ". Current quota value is: " << quota + << " while current usage is: " << usage_ + << ". The quota is " << (enforced() ? "" : "not ") + << "enforced. " + << ((allocation == 0) ? 
"Did not allocate any memory." + : "Allocated the minimal value requested."); + } else { + allocation = min(requested, quota - usage_); + } + usage_ += allocation; + return allocation; +} + +template +void Quota::Free(size_t amount) { + lock_guard_maybe lock(mutex()); + usage_ -= amount; + // threads allocate/free memory concurrently via the same Quota object that is + // not protected with a mutex (thread_safe == false). + if (usage_ > (numeric_limits::max() - (1 << 28))) { + LOG(ERROR) << "Suspiciously big usage_ value: " << usage_ + << " (could be a result size_t wrapping around below 0, " + << "for example as a result of race condition)."; + } +} + +template +size_t Quota::GetQuota() const { + lock_guard_maybe lock(mutex()); + return GetQuotaInternal(); +} + +template +size_t Quota::GetUsage() const { + lock_guard_maybe lock(mutex()); + return usage_; +} + +template +void StaticQuota::SetQuota(const size_t quota) { + lock_guard_maybe lock(Quota::mutex()); + quota_ = quota; +} + +} // namespace kudu + +#endif // KUDU_UTIL_MEMORY_MEMORY_H_ diff --git a/src/kudu/util/metrics-test.cc b/src/kudu/util/metrics-test.cc new file mode 100644 index 000000000000..725c91c1af4b --- /dev/null +++ b/src/kudu/util/metrics-test.cc @@ -0,0 +1,310 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/map-util.h" +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/jsonreader.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/metrics.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::unordered_set; +using std::vector; + +DECLARE_int32(metrics_retirement_age_ms); + +namespace kudu { + +METRIC_DEFINE_entity(test_entity); + +class MetricsTest : public KuduTest { + public: + void SetUp() override { + KuduTest::SetUp(); + + entity_ = METRIC_ENTITY_test_entity.Instantiate(®istry_, "my-test"); + } + + protected: + MetricRegistry registry_; + scoped_refptr entity_; +}; + +METRIC_DEFINE_counter(test_entity, reqs_pending, "Requests Pending", MetricUnit::kRequests, + "Number of requests pending"); + +TEST_F(MetricsTest, SimpleCounterTest) { + scoped_refptr requests = + new Counter(&METRIC_reqs_pending); + ASSERT_EQ("Number of requests pending", requests->prototype()->description()); + ASSERT_EQ(0, requests->value()); + requests->Increment(); + ASSERT_EQ(1, requests->value()); + requests->IncrementBy(2); + ASSERT_EQ(3, requests->value()); +} + +METRIC_DEFINE_gauge_uint64(test_entity, fake_memory_usage, "Memory Usage", + MetricUnit::kBytes, "Test Gauge 1"); + +TEST_F(MetricsTest, SimpleAtomicGaugeTest) { + scoped_refptr > mem_usage = + METRIC_fake_memory_usage.Instantiate(entity_, 0); + ASSERT_EQ(METRIC_fake_memory_usage.description(), mem_usage->prototype()->description()); + ASSERT_EQ(0, mem_usage->value()); + mem_usage->IncrementBy(7); + ASSERT_EQ(7, mem_usage->value()); + mem_usage->set_value(5); + ASSERT_EQ(5, mem_usage->value()); +} + +METRIC_DEFINE_gauge_int64(test_entity, test_func_gauge, "Test Gauge", MetricUnit::kBytes, + "Test Gauge 2"); + +static int64_t MyFunction(int* metric_val) { + return (*metric_val)++; 
+} + +TEST_F(MetricsTest, SimpleFunctionGaugeTest) { + int metric_val = 1000; + scoped_refptr > gauge = + METRIC_test_func_gauge.InstantiateFunctionGauge( + entity_, Bind(&MyFunction, Unretained(&metric_val))); + + ASSERT_EQ(1000, gauge->value()); + ASSERT_EQ(1001, gauge->value()); + + gauge->DetachToCurrentValue(); + // After detaching, it should continue to return the same constant value. + ASSERT_EQ(1002, gauge->value()); + ASSERT_EQ(1002, gauge->value()); + + // Test resetting to a constant. + gauge->DetachToConstant(2); + ASSERT_EQ(2, gauge->value()); +} + +TEST_F(MetricsTest, AutoDetachToLastValue) { + int metric_val = 1000; + scoped_refptr > gauge = + METRIC_test_func_gauge.InstantiateFunctionGauge( + entity_, Bind(&MyFunction, Unretained(&metric_val))); + + ASSERT_EQ(1000, gauge->value()); + ASSERT_EQ(1001, gauge->value()); + { + FunctionGaugeDetacher detacher; + gauge->AutoDetachToLastValue(&detacher); + ASSERT_EQ(1002, gauge->value()); + ASSERT_EQ(1003, gauge->value()); + } + + ASSERT_EQ(1004, gauge->value()); + ASSERT_EQ(1004, gauge->value()); +} + +TEST_F(MetricsTest, AutoDetachToConstant) { + int metric_val = 1000; + scoped_refptr > gauge = + METRIC_test_func_gauge.InstantiateFunctionGauge( + entity_, Bind(&MyFunction, Unretained(&metric_val))); + + ASSERT_EQ(1000, gauge->value()); + ASSERT_EQ(1001, gauge->value()); + { + FunctionGaugeDetacher detacher; + gauge->AutoDetach(&detacher, 12345); + ASSERT_EQ(1002, gauge->value()); + ASSERT_EQ(1003, gauge->value()); + } + + ASSERT_EQ(12345, gauge->value()); +} + +METRIC_DEFINE_gauge_uint64(test_entity, counter_as_gauge, "Gauge exposed as Counter", + MetricUnit::kBytes, "Gauge exposed as Counter", + EXPOSE_AS_COUNTER); +TEST_F(MetricsTest, TEstExposeGaugeAsCounter) { + ASSERT_EQ(MetricType::kCounter, METRIC_counter_as_gauge.type()); +} + +METRIC_DEFINE_histogram(test_entity, test_hist, "Test Histogram", + MetricUnit::kMilliseconds, "foo", 1000000, 3); + +TEST_F(MetricsTest, SimpleHistogramTest) { + 
scoped_refptr hist = METRIC_test_hist.Instantiate(entity_); + hist->Increment(2); + hist->IncrementBy(4, 1); + ASSERT_EQ(2, hist->histogram_->MinValue()); + ASSERT_EQ(3, hist->histogram_->MeanValue()); + ASSERT_EQ(4, hist->histogram_->MaxValue()); + ASSERT_EQ(2, hist->histogram_->TotalCount()); + ASSERT_EQ(6, hist->histogram_->TotalSum()); + // TODO: Test coverage needs to be improved a lot. +} + +TEST_F(MetricsTest, JsonPrintTest) { + scoped_refptr bytes_seen = METRIC_reqs_pending.Instantiate(entity_); + bytes_seen->Increment(); + entity_->SetAttribute("test_attr", "attr_val"); + + // Generate the JSON. + std::stringstream out; + JsonWriter writer(&out, JsonWriter::PRETTY); + ASSERT_OK(entity_->WriteAsJson(&writer, { "*" }, MetricJsonOptions())); + + // Now parse it back out. + JsonReader reader(out.str()); + ASSERT_OK(reader.Init()); + + vector metrics; + ASSERT_OK(reader.ExtractObjectArray(reader.root(), "metrics", &metrics)); + ASSERT_EQ(1, metrics.size()); + string metric_name; + ASSERT_OK(reader.ExtractString(metrics[0], "name", &metric_name)); + ASSERT_EQ("reqs_pending", metric_name); + int64_t metric_value; + ASSERT_OK(reader.ExtractInt64(metrics[0], "value", &metric_value)); + ASSERT_EQ(1L, metric_value); + + const rapidjson::Value* attributes; + ASSERT_OK(reader.ExtractObject(reader.root(), "attributes", &attributes)); + string attr_value; + ASSERT_OK(reader.ExtractString(attributes, "test_attr", &attr_value)); + ASSERT_EQ("attr_val", attr_value); + + // Verify that, if we filter for a metric that isn't in this entity, we get no result. + out.str(""); + ASSERT_OK(entity_->WriteAsJson(&writer, { "not_a_matching_metric" }, MetricJsonOptions())); + ASSERT_EQ("", out.str()); +} + +// Test that metrics are retired when they are no longer referenced. 
+TEST_F(MetricsTest, RetirementTest) { + FLAGS_metrics_retirement_age_ms = 100; + + const string kMetricName = "foo"; + scoped_refptr counter = METRIC_reqs_pending.Instantiate(entity_); + ASSERT_EQ(1, entity_->UnsafeMetricsMapForTests().size()); + + // Since we hold a reference to the counter, it should not get retired. + entity_->RetireOldMetrics(); + ASSERT_EQ(1, entity_->UnsafeMetricsMapForTests().size()); + + // When we de-ref it, it should not get immediately retired, either, because + // we keep retirable metrics around for some amount of time. We try retiring + // a number of times to hit all the cases. + counter = nullptr; + for (int i = 0; i < 3; i++) { + entity_->RetireOldMetrics(); + ASSERT_EQ(1, entity_->UnsafeMetricsMapForTests().size()); + } + + // If we wait for longer than the retirement time, and call retire again, we'll + // actually retire it. + SleepFor(MonoDelta::FromMilliseconds(FLAGS_metrics_retirement_age_ms * 1.5)); + entity_->RetireOldMetrics(); + ASSERT_EQ(0, entity_->UnsafeMetricsMapForTests().size()); +} + +TEST_F(MetricsTest, TestRetiringEntities) { + ASSERT_EQ(1, registry_.num_entities()); + + // Drop the reference to our entity. + entity_.reset(); + + // Retire metrics. Since there is nothing inside our entity, it should + // retire immediately (no need to loop). + registry_.RetireOldMetrics(); + + ASSERT_EQ(0, registry_.num_entities()); +} + +// Test that we can mark a metric to never be retired. +TEST_F(MetricsTest, NeverRetireTest) { + entity_->NeverRetire(METRIC_test_hist.Instantiate(entity_)); + FLAGS_metrics_retirement_age_ms = 0; + + for (int i = 0; i < 3; i++) { + entity_->RetireOldMetrics(); + ASSERT_EQ(1, entity_->UnsafeMetricsMapForTests().size()); + } +} + +TEST_F(MetricsTest, TestInstantiatingTwice) { + // Test that re-instantiating the same entity ID returns the same object. 
+ scoped_refptr new_entity = METRIC_ENTITY_test_entity.Instantiate( + ®istry_, entity_->id()); + ASSERT_EQ(new_entity.get(), entity_.get()); +} + +TEST_F(MetricsTest, TestInstantiatingDifferentEntities) { + scoped_refptr new_entity = METRIC_ENTITY_test_entity.Instantiate( + ®istry_, "some other ID"); + ASSERT_NE(new_entity.get(), entity_.get()); +} + +TEST_F(MetricsTest, TestDumpJsonPrototypes) { + // Dump the prototype info. + std::stringstream out; + JsonWriter w(&out, JsonWriter::PRETTY); + MetricPrototypeRegistry::get()->WriteAsJson(&w); + string json = out.str(); + + // Quick sanity check for one of our metrics defined in this file. + const char* expected = + " {\n" + " \"name\": \"test_func_gauge\",\n" + " \"label\": \"Test Gauge\",\n" + " \"type\": \"gauge\",\n" + " \"unit\": \"bytes\",\n" + " \"description\": \"Test Gauge 2\",\n" + " \"entity_type\": \"test_entity\"\n" + " }"; + ASSERT_STR_CONTAINS(json, expected); + + // Parse it. + rapidjson::Document d; + d.Parse<0>(json.c_str()); + + // Ensure that we got a reasonable number of metrics. + int num_metrics = d["metrics"].Size(); + int num_entities = d["entities"].Size(); + LOG(INFO) << "Parsed " << num_metrics << " metrics and " << num_entities << " entities"; + ASSERT_GT(num_metrics, 5); + ASSERT_EQ(num_entities, 2); + + // Spot-check that some metrics were properly registered and that the JSON was properly + // formed. + unordered_set seen_metrics; + for (int i = 0; i < d["metrics"].Size(); i++) { + InsertOrDie(&seen_metrics, d["metrics"][i]["name"].GetString()); + } + ASSERT_TRUE(ContainsKey(seen_metrics, "threads_started")); + ASSERT_TRUE(ContainsKey(seen_metrics, "test_hist")); +} + +} // namespace kudu diff --git a/src/kudu/util/metrics.cc b/src/kudu/util/metrics.cc new file mode 100644 index 000000000000..2935cbc69dfe --- /dev/null +++ b/src/kudu/util/metrics.cc @@ -0,0 +1,683 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/util/metrics.h" + +#include +#include +#include + +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/singleton.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/histogram.pb.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" + +DEFINE_int32(metrics_retirement_age_ms, 120 * 1000, + "The minimum number of milliseconds a metric will be kept for after it is " + "no longer active. (Advanced option)"); +TAG_FLAG(metrics_retirement_age_ms, runtime); +TAG_FLAG(metrics_retirement_age_ms, advanced); + +// Process/server-wide metrics should go into the 'server' entity. +// More complex applications will define other entities. 
+METRIC_DEFINE_entity(server); + +namespace kudu { + +using std::string; +using std::vector; +using strings::Substitute; + +// +// MetricUnit +// + +const char* MetricUnit::Name(Type unit) { + switch (unit) { + case kCacheHits: + return "hits"; + case kCacheQueries: + return "queries"; + case kBytes: + return "bytes"; + case kRequests: + return "requests"; + case kEntries: + return "entries"; + case kRows: + return "rows"; + case kCells: + return "cells"; + case kConnections: + return "connections"; + case kOperations: + return "operations"; + case kProbes: + return "probes"; + case kNanoseconds: + return "nanoseconds"; + case kMicroseconds: + return "microseconds"; + case kMilliseconds: + return "milliseconds"; + case kSeconds: + return "seconds"; + case kThreads: + return "threads"; + case kTransactions: + return "transactions"; + case kUnits: + return "units"; + case kScanners: + return "scanners"; + case kMaintenanceOperations: + return "operations"; + case kBlocks: + return "blocks"; + case kLogBlockContainers: + return "log block containers"; + case kTasks: + return "tasks"; + case kMessages: + return "messages"; + case kContextSwitches: + return "context switches"; + default: + return "UNKNOWN UNIT"; + } +} + +// +// MetricType +// + +const char* const MetricType::kGaugeType = "gauge"; +const char* const MetricType::kCounterType = "counter"; +const char* const MetricType::kHistogramType = "histogram"; +const char* MetricType::Name(MetricType::Type type) { + switch (type) { + case kGauge: + return kGaugeType; + case kCounter: + return kCounterType; + case kHistogram: + return kHistogramType; + default: + return "UNKNOWN TYPE"; + } +} + +// +// MetricEntityPrototype +// + +MetricEntityPrototype::MetricEntityPrototype(const char* name) + : name_(name) { + MetricPrototypeRegistry::get()->AddEntity(this); +} + +MetricEntityPrototype::~MetricEntityPrototype() { +} + +scoped_refptr MetricEntityPrototype::Instantiate( + MetricRegistry* registry, + const std::string& 
id, + const MetricEntity::AttributeMap& initial_attrs) const { + return registry->FindOrCreateEntity(this, id, initial_attrs); +} + + +// +// MetricEntity +// + +MetricEntity::MetricEntity(const MetricEntityPrototype* prototype, + std::string id, AttributeMap attributes) + : prototype_(prototype), + id_(std::move(id)), + attributes_(std::move(attributes)) {} + +MetricEntity::~MetricEntity() { +} + +void MetricEntity::CheckInstantiation(const MetricPrototype* proto) const { + CHECK_STREQ(prototype_->name(), proto->entity_type()) + << "Metric " << proto->name() << " may not be instantiated entity of type " + << prototype_->name() << " (expected: " << proto->entity_type() << ")"; +} + +scoped_refptr MetricEntity::FindOrNull(const MetricPrototype& prototype) const { + lock_guard l(&lock_); + return FindPtrOrNull(metric_map_, &prototype); +} + +namespace { + +bool MatchMetricInList(const string& metric_name, + const vector& match_params) { + for (const string& param : match_params) { + // Handle wildcard. + if (param == "*") return true; + // The parameter is a substring match of the metric name. + if (metric_name.find(param) != std::string::npos) { + return true; + } + } + return false; +} + +} // anonymous namespace + + +Status MetricEntity::WriteAsJson(JsonWriter* writer, + const vector& requested_metrics, + const MetricJsonOptions& opts) const { + bool select_all = MatchMetricInList(id(), requested_metrics); + + // We want the keys to be in alphabetical order when printing, so we use an ordered map here. 
+ typedef std::map > OrderedMetricMap; + OrderedMetricMap metrics; + AttributeMap attrs; + { + // Snapshot the metrics in this registry (not guaranteed to be a consistent snapshot) + lock_guard l(&lock_); + attrs = attributes_; + for (const MetricMap::value_type& val : metric_map_) { + const MetricPrototype* prototype = val.first; + const scoped_refptr& metric = val.second; + + if (select_all || MatchMetricInList(prototype->name(), requested_metrics)) { + InsertOrDie(&metrics, prototype->name(), metric); + } + } + } + + // If we had a filter, and we didn't either match this entity or any metrics inside + // it, don't print the entity at all. + if (!requested_metrics.empty() && !select_all && metrics.empty()) { + return Status::OK(); + } + + writer->StartObject(); + + writer->String("type"); + writer->String(prototype_->name()); + + writer->String("id"); + writer->String(id_); + + writer->String("attributes"); + writer->StartObject(); + for (const AttributeMap::value_type& val : attrs) { + writer->String(val.first); + writer->String(val.second); + } + writer->EndObject(); + + writer->String("metrics"); + writer->StartArray(); + for (OrderedMetricMap::value_type& val : metrics) { + WARN_NOT_OK(val.second->WriteAsJson(writer, opts), + strings::Substitute("Failed to write $0 as JSON", val.first)); + + } + writer->EndArray(); + + writer->EndObject(); + + return Status::OK(); +} + +void MetricEntity::RetireOldMetrics() { + MonoTime now(MonoTime::Now(MonoTime::FINE)); + + lock_guard l(&lock_); + for (auto it = metric_map_.begin(); it != metric_map_.end();) { + const scoped_refptr& metric = it->second; + + if (PREDICT_TRUE(!metric->HasOneRef())) { + // The metric is still in use. Note that, in the case of "NeverRetire()", the metric + // will have a ref-count of 2 because it is reffed by the 'never_retire_metrics_' + // collection. 
+ + // Ensure that it is not marked for later retirement (this could happen in the case + // that a metric is un-reffed and then re-reffed later by looking it up from the + // registry). + metric->retire_time_ = MonoTime(); + ++it; + continue; + } + + if (!metric->retire_time_.Initialized()) { + VLOG(3) << "Metric " << it->first << " has become un-referenced. Will retire after " + << "the retention interval"; + // This is the first time we've seen this metric as retirable. + metric->retire_time_ = now; + metric->retire_time_.AddDelta(MonoDelta::FromMilliseconds( + FLAGS_metrics_retirement_age_ms)); + ++it; + continue; + } + + // If we've already seen this metric in a previous scan, check if it's + // time to retire it yet. + if (now.ComesBefore(metric->retire_time_)) { + VLOG(3) << "Metric " << it->first << " is un-referenced, but still within " + << "the retention interval"; + ++it; + continue; + } + + + VLOG(2) << "Retiring metric " << it->first; + metric_map_.erase(it++); + } +} + +void MetricEntity::NeverRetire(const scoped_refptr& metric) { + lock_guard l(&lock_); + never_retire_metrics_.push_back(metric); +} + +void MetricEntity::SetAttributes(const AttributeMap& attrs) { + lock_guard l(&lock_); + attributes_ = attrs; +} + +void MetricEntity::SetAttribute(const string& key, const string& val) { + lock_guard l(&lock_); + attributes_[key] = val; +} + +// +// MetricRegistry +// + +MetricRegistry::MetricRegistry() { +} + +MetricRegistry::~MetricRegistry() { +} + +Status MetricRegistry::WriteAsJson(JsonWriter* writer, + const vector& requested_metrics, + const MetricJsonOptions& opts) const { + EntityMap entities; + { + lock_guard l(&lock_); + entities = entities_; + } + + writer->StartArray(); + for (const EntityMap::value_type e : entities) { + WARN_NOT_OK(e.second->WriteAsJson(writer, requested_metrics, opts), + Substitute("Failed to write entity $0 as JSON", e.second->id())); + } + writer->EndArray(); + + // Rather than having a thread poll metrics 
periodically to retire old ones, + // we'll just retire them here. The only downside is that, if no one is polling + // metrics, we may end up leaving them around indefinitely; however, metrics are + // small, and one might consider it a feature: if monitoring stops polling for + // metrics, we should keep them around until the next poll. + entities.clear(); // necessary to deref metrics we just dumped before doing retirement scan. + const_cast(this)->RetireOldMetrics(); + return Status::OK(); +} + +void MetricRegistry::RetireOldMetrics() { + lock_guard l(&lock_); + for (auto it = entities_.begin(); it != entities_.end();) { + it->second->RetireOldMetrics(); + + if (it->second->num_metrics() == 0 && it->second->HasOneRef()) { + // No metrics and no external references to this entity, so we can retire it. + // Unlike retiring the metrics themselves, we don't wait for any timeout + // to retire them -- we assume that that timed retention has been satisfied + // by holding onto the metrics inside the entity. + entities_.erase(it++); + } else { + ++it; + } + } +} + +// +// MetricPrototypeRegistry +// +MetricPrototypeRegistry* MetricPrototypeRegistry::get() { + return Singleton::get(); +} + +void MetricPrototypeRegistry::AddMetric(const MetricPrototype* prototype) { + lock_guard l(&lock_); + metrics_.push_back(prototype); +} + +void MetricPrototypeRegistry::AddEntity(const MetricEntityPrototype* prototype) { + lock_guard l(&lock_); + entities_.push_back(prototype); +} + +void MetricPrototypeRegistry::WriteAsJson(JsonWriter* writer) const { + lock_guard l(&lock_); + MetricJsonOptions opts; + opts.include_schema_info = true; + writer->StartObject(); + + // Dump metric prototypes. 
+ writer->String("metrics"); + writer->StartArray(); + for (const MetricPrototype* p : metrics_) { + writer->StartObject(); + p->WriteFields(writer, opts); + writer->String("entity_type"); + writer->String(p->entity_type()); + writer->EndObject(); + } + writer->EndArray(); + + // Dump entity prototypes. + writer->String("entities"); + writer->StartArray(); + for (const MetricEntityPrototype* p : entities_) { + writer->StartObject(); + writer->String("name"); + writer->String(p->name()); + writer->EndObject(); + } + writer->EndArray(); + + writer->EndObject(); +} + +void MetricPrototypeRegistry::WriteAsJsonAndExit() const { + std::stringstream s; + JsonWriter w(&s, JsonWriter::PRETTY); + WriteAsJson(&w); + std::cout << s.str() << std::endl; + exit(0); +} + +// +// MetricPrototype +// +MetricPrototype::MetricPrototype(CtorArgs args) : args_(std::move(args)) { + MetricPrototypeRegistry::get()->AddMetric(this); +} + +void MetricPrototype::WriteFields(JsonWriter* writer, + const MetricJsonOptions& opts) const { + writer->String("name"); + writer->String(name()); + + if (opts.include_schema_info) { + writer->String("label"); + writer->String(label()); + + writer->String("type"); + writer->String(MetricType::Name(type())); + + writer->String("unit"); + writer->String(MetricUnit::Name(unit())); + + writer->String("description"); + writer->String(description()); + } +} + +// +// FunctionGaugeDetacher +// + +FunctionGaugeDetacher::FunctionGaugeDetacher() { +} + +FunctionGaugeDetacher::~FunctionGaugeDetacher() { + for (const Closure& c : callbacks_) { + c.Run(); + } +} + +scoped_refptr MetricRegistry::FindOrCreateEntity( + const MetricEntityPrototype* prototype, + const std::string& id, + const MetricEntity::AttributeMap& initial_attributes) { + lock_guard l(&lock_); + scoped_refptr e = FindPtrOrNull(entities_, id); + if (!e) { + e = new MetricEntity(prototype, id, initial_attributes); + InsertOrDie(&entities_, id, e); + } else { + e->SetAttributes(initial_attributes); + } + 
return e; +} + +// +// Metric +// +Metric::Metric(const MetricPrototype* prototype) + : prototype_(prototype) { +} + +Metric::~Metric() { +} + +// +// Gauge +// + +Status Gauge::WriteAsJson(JsonWriter* writer, + const MetricJsonOptions& opts) const { + writer->StartObject(); + + prototype_->WriteFields(writer, opts); + + writer->String("value"); + WriteValue(writer); + + writer->EndObject(); + return Status::OK(); +} + +// +// StringGauge +// + +StringGauge::StringGauge(const GaugePrototype* proto, + string initial_value) + : Gauge(proto), value_(std::move(initial_value)) {} + +std::string StringGauge::value() const { + lock_guard l(&lock_); + return value_; +} + +void StringGauge::set_value(const std::string& value) { + lock_guard l(&lock_); + value_ = value; +} + +void StringGauge::WriteValue(JsonWriter* writer) const { + writer->String(value()); +} + +// +// Counter +// +// This implementation is optimized by using a striped counter. See LongAdder for details. + +scoped_refptr CounterPrototype::Instantiate(const scoped_refptr& entity) { + return entity->FindOrCreateCounter(this); +} + +Counter::Counter(const CounterPrototype* proto) : Metric(proto) { +} + +int64_t Counter::value() const { + return value_.Value(); +} + +void Counter::Increment() { + IncrementBy(1); +} + +void Counter::IncrementBy(int64_t amount) { + value_.IncrementBy(amount); +} + +Status Counter::WriteAsJson(JsonWriter* writer, + const MetricJsonOptions& opts) const { + writer->StartObject(); + + prototype_->WriteFields(writer, opts); + + writer->String("value"); + writer->Int64(value()); + + writer->EndObject(); + return Status::OK(); +} + +///////////////////////////////////////////////// +// HistogramPrototype +///////////////////////////////////////////////// + +HistogramPrototype::HistogramPrototype(const MetricPrototype::CtorArgs& args, + uint64_t max_trackable_value, int num_sig_digits) + : MetricPrototype(args), + max_trackable_value_(max_trackable_value), + 
num_sig_digits_(num_sig_digits) { + // Better to crash at definition time that at instantiation time. + CHECK(HdrHistogram::IsValidHighestTrackableValue(max_trackable_value)) + << Substitute("Invalid max trackable value on histogram $0: $1", + args.name_, max_trackable_value); + CHECK(HdrHistogram::IsValidNumSignificantDigits(num_sig_digits)) + << Substitute("Invalid number of significant digits on histogram $0: $1", + args.name_, num_sig_digits); +} + +scoped_refptr HistogramPrototype::Instantiate( + const scoped_refptr& entity) { + return entity->FindOrCreateHistogram(this); +} + +///////////////////////////////////////////////// +// Histogram +///////////////////////////////////////////////// + +Histogram::Histogram(const HistogramPrototype* proto) + : Metric(proto), + histogram_(new HdrHistogram(proto->max_trackable_value(), proto->num_sig_digits())) { +} + +void Histogram::Increment(int64_t value) { + histogram_->Increment(value); +} + +void Histogram::IncrementBy(int64_t value, int64_t amount) { + histogram_->IncrementBy(value, amount); +} + +Status Histogram::WriteAsJson(JsonWriter* writer, + const MetricJsonOptions& opts) const { + + HistogramSnapshotPB snapshot; + RETURN_NOT_OK(GetHistogramSnapshotPB(&snapshot, opts)); + writer->Protobuf(snapshot); + return Status::OK(); +} + +Status Histogram::GetHistogramSnapshotPB(HistogramSnapshotPB* snapshot_pb, + const MetricJsonOptions& opts) const { + HdrHistogram snapshot(*histogram_); + snapshot_pb->set_name(prototype_->name()); + if (opts.include_schema_info) { + snapshot_pb->set_type(MetricType::Name(prototype_->type())); + snapshot_pb->set_label(prototype_->label()); + snapshot_pb->set_unit(MetricUnit::Name(prototype_->unit())); + snapshot_pb->set_description(prototype_->description()); + snapshot_pb->set_max_trackable_value(snapshot.highest_trackable_value()); + snapshot_pb->set_num_significant_digits(snapshot.num_significant_digits()); + } + snapshot_pb->set_total_count(snapshot.TotalCount()); + 
snapshot_pb->set_total_sum(snapshot.TotalSum()); + snapshot_pb->set_min(snapshot.MinValue()); + snapshot_pb->set_mean(snapshot.MeanValue()); + snapshot_pb->set_percentile_75(snapshot.ValueAtPercentile(75)); + snapshot_pb->set_percentile_95(snapshot.ValueAtPercentile(95)); + snapshot_pb->set_percentile_99(snapshot.ValueAtPercentile(99)); + snapshot_pb->set_percentile_99_9(snapshot.ValueAtPercentile(99.9)); + snapshot_pb->set_percentile_99_99(snapshot.ValueAtPercentile(99.99)); + snapshot_pb->set_max(snapshot.MaxValue()); + + if (opts.include_raw_histograms) { + RecordedValuesIterator iter(&snapshot); + while (iter.HasNext()) { + HistogramIterationValue value; + RETURN_NOT_OK(iter.Next(&value)); + snapshot_pb->add_values(value.value_iterated_to); + snapshot_pb->add_counts(value.count_at_value_iterated_to); + } + } + return Status::OK(); +} + +uint64_t Histogram::CountInBucketForValueForTests(uint64_t value) const { + return histogram_->CountInBucketForValue(value); +} + +uint64_t Histogram::TotalCount() const { + return histogram_->TotalCount(); +} + +uint64_t Histogram::MinValueForTests() const { + return histogram_->MinValue(); +} + +uint64_t Histogram::MaxValueForTests() const { + return histogram_->MaxValue(); +} +double Histogram::MeanValueForTests() const { + return histogram_->MeanValue(); +} + +ScopedLatencyMetric::ScopedLatencyMetric(Histogram* latency_hist) + : latency_hist_(latency_hist) { + if (latency_hist_) { + time_started_ = MonoTime::Now(MonoTime::FINE); + } +} + +ScopedLatencyMetric::~ScopedLatencyMetric() { + if (latency_hist_ != nullptr) { + MonoTime time_now = MonoTime::Now(MonoTime::FINE); + latency_hist_->Increment(time_now.GetDeltaSince(time_started_).ToMicroseconds()); + } +} + +} // namespace kudu diff --git a/src/kudu/util/metrics.h b/src/kudu/util/metrics.h new file mode 100644 index 000000000000..b55cd081dee4 --- /dev/null +++ b/src/kudu/util/metrics.h @@ -0,0 +1,1075 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_METRICS_H +#define KUDU_UTIL_METRICS_H + +///////////////////////////////////////////////////// +// Kudu Metrics +///////////////////////////////////////////////////// +// +// Summary +// ------------------------------------------------------------ +// +// This API provides a basic set of metrics primitives along the lines of the Code Hale's +// metrics library along with JSON formatted output of running metrics. +// +// The metrics system has a few main concepts in its data model: +// +// Metric Prototypes +// ----------------- +// Every metric that may be emitted is constructed from a prototype. The prototype defines +// the name of the metric, the entity it is attached to, its type, its units, and a description. +// +// Metric prototypes are defined statically using the METRIC_DEFINE_*(...) macros. This +// allows us to easily enumerate a full list of every metric that might be emitted from a +// server, thus allowing auto-generation of metric metadata for integration with +// monitoring systems such as Cloudera Manager. +// +// Metric Entity Prototypes +// ------------------------ +// The other main type in the data model is the Metric Entity. 
The most basic entity is the +// "server" entity -- metrics such as memory usage, RPC rates, etc, are typically associated +// with the server as a whole. +// +// Users of the metrics framework can define more entity types using the +// METRIC_DEFINE_entity(...) macro. +// +// MetricEntity instances +// ----------------------- +// Each defined Metric Entity Type serves as a prototype allowing instantiation of a +// MetricEntity object. Each instance then has its own unique set of metrics. For +// example, in the case of Kudu, we define a Metric Entity Type called 'tablet', and the +// Tablet Server instantiates one MetricEntity instance per tablet that it hosts. +// +// MetricEntity instances are instantiated within a MetricRegistry, and each instance is +// expected to have a unique string identifier within that registry. To continue the +// example above, a tablet entity uses its tablet ID as its unique identifier. These +// identifiers are exposed to the operator and surfaced in monitoring tools. +// +// MetricEntity instances may also carry a key-value map of string attributes. These +// attributes are directly exposed to monitoring systems via the JSON output. Monitoring +// systems may use this information to allow hierarchical aggregation beteween entities, +// display them to the user, etc. +// +// Metric instances +// ---------------- +// Given a MetricEntity instance and a Metric Prototype, one can instantiate a Metric +// instance. For example, the Kudu Tablet Server instantiates one MetricEntity instance +// for each tablet, and then instantiates the 'tablet_rows_inserted' prototype within that +// entity. Thus, each tablet then has a separate instance of the metric, allowing the end +// operator to track the metric on a per-tablet basis. +// +// +// Types of metrics +// ------------------------------------------------------------ +// Gauge: Set or get a point-in-time value. +// - string: Gauge for a string value. 
+// - Primitive types (bool, int64_t/uint64_t, double): Lock-free gauges. +// Counter: Get, reset, increment or decrement an int64_t value. +// Histogram: Increment buckets of values segmented by configurable max and precision. +// +// Gauge vs. Counter +// ------------------------------------------------------------ +// +// A Counter is a metric we expect to only monotonically increase. A +// Gauge is a metric that can decrease and increase. Use a Gauge to +// reflect a sample, e.g., the number of transaction in-flight at a +// given time; use a Counter when considering a metric over time, +// e.g., exposing the number of transactions processed since start to +// produce a metric for the number of transactions processed over some +// time period. +// +// The one exception to this rule is that occasionally it may be more convenient to +// implement a metric as a Gauge, even when it is logically a counter, due to Gauge's +// support for fetching metric values via a bound function. In that case, you can +// use the 'EXPOSE_AS_COUNTER' flag when defining the gauge prototype. For example: +// +// METRIC_DEFINE_gauge_uint64(server, threads_started, +// "Threads Started", +// kudu::MetricUnit::kThreads, +// "Total number of threads started on this server", +// kudu::EXPOSE_AS_COUNTER); +// +// +// Metrics ownership +// ------------------------------------------------------------ +// +// Metrics are reference-counted, and one of the references is always held by a metrics +// entity itself. Users of metrics should typically hold a scoped_refptr to their metrics +// within class instances, so that they also hold a reference. The one exception to this +// is FunctionGauges: see the class documentation below for a typical Gauge ownership pattern. +// +// Because the metrics entity holds a reference to the metric, this means that metrics will +// not be immediately destructed when your class instance publishing them is destructed. 
+// This is on purpose: metrics are retained for a configurable time interval even after they +// are no longer being published. The purpose of this is to allow monitoring systems, which +// only poll metrics infrequently (eg once a minute) to see the last value of a metric whose +// owner was destructed in between two polls. +// +// +// Example usage for server-level metrics +// ------------------------------------------------------------ +// +// 1) In your server class, define the top-level registry and the server entity: +// +// MetricRegistry metric_registry_; +// scoped_refptr metric_entity_; +// +// 2) In your server constructor/initialization, construct metric_entity_. This instance +// will be plumbed through into other subsystems that want to register server-level +// metrics. +// +// metric_entity_ = METRIC_ENTITY_server.Instantiate(®istry_, "some server identifier)"); +// +// 3) At the top of your .cc file where you want to emit a metric, define the metric prototype: +// +// METRIC_DEFINE_counter(server, ping_requests, "Ping Requests", kudu::MetricUnit::kRequests, +// "Number of Ping() RPC requests this server has handled since start"); +// +// 4) In your class where you want to emit metrics, define the metric instance itself: +// scoped_refptr ping_counter_; +// +// 5) In your class constructor, instantiate the metric based on the MetricEntity plumbed in: +// +// MyClass(..., const scoped_refptr& metric_entity) : +// ping_counter_(METRIC_ping_requests.Instantiate(metric_entity)) { +// } +// +// 6) Where you want to change the metric value, just use the instance variable: +// +// ping_counter_->IncrementBy(100); +// +// +// Example usage for custom entity metrics +// ------------------------------------------------------------ +// Follow the same pattern as above, but also define a metric entity somewhere. 
For example: +// +// At the top of your CC file: +// +// METRIC_DEFINE_entity(my_entity); +// METRIC_DEFINE_counter(my_entity, ping_requests, "Ping Requests", kudu::MetricUnit::kRequests, +// "Number of Ping() RPC requests this particular entity has handled since start"); +// +// In whatever class represents the entity: +// +// entity_ = METRIC_ENTITY_my_entity.Instantiate(®istry_, my_entity_id); +// +// In whatever classes emit metrics: +// +// scoped_refptr ping_requests_ = METRIC_ping_requests.Instantiate(entity); +// ping_requests_->Increment(); +// +// NOTE: at runtime, the metrics system prevents you from instantiating a metric in the +// wrong entity type. This ensures that the metadata can fully describe the set of metric-entity +// relationships. +// +// Plumbing of MetricEntity and MetricRegistry objects +// ------------------------------------------------------------ +// Generally, the rule of thumb to follow when plumbing through entities and registries is +// this: if you're creating new entities or you need to dump the registry contents +// (e.g. path handlers), pass in the registry. Otherwise, pass in the entity. +// +// =========== +// JSON output +// =========== +// +// The first-class output format for metrics is pretty-printed JSON. +// Such a format is relatively easy for humans and machines to read. +// +// The top level JSON object is an array, which contains one element per +// entity. Each entity is an object which has its type, id, and an array +// of metrics. Each metric contains its type, name, unit, description, value, +// etc. +// TODO: Output to HTML. 
+// +// Example JSON output: +// +// [ +// { +// "type": "tablet", +// "id": "e95e57ba8d4d48458e7c7d35020d4a46", +// "attributes": { +// "table_id": "12345", +// "table_name": "my_table" +// }, +// "metrics": [ +// { +// "type": "counter", +// "name": "log_reader_bytes_read", +// "label": "Log Reader Bytes Read", +// "unit": "bytes", +// "description": "Number of bytes read since tablet start", +// "value": 0 +// }, +// ... +// ] +// }, +// ... +// ] +// +///////////////////////////////////////////////////// + +#include +#include +#include +#include + +#include + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/casts.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/singleton.h" +#include "kudu/util/atomic.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" +#include "kudu/util/striped64.h" + +// Define a new entity type. +// +// The metrics subsystem itself defines the entity type 'server', but other +// entity types can be registered using this macro. +#define METRIC_DEFINE_entity(name) \ + ::kudu::MetricEntityPrototype METRIC_ENTITY_##name(#name) + +// Convenience macros to define metric prototypes. +// See the documentation at the top of this file for example usage. +#define METRIC_DEFINE_counter(entity, name, label, unit, desc) \ + ::kudu::CounterPrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc)) + +#define METRIC_DEFINE_gauge_string(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DEFINE_gauge_bool(entity, name, label, unit, desc, ...) 
\ + ::kudu::GaugePrototype METRIC_## name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DEFINE_gauge_int32(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DEFINE_gauge_uint32(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DEFINE_gauge_int64(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DEFINE_gauge_uint64(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DEFINE_gauge_double(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) + +#define METRIC_DEFINE_histogram(entity, name, label, unit, desc, max_val, num_sig_digits) \ + ::kudu::HistogramPrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc), \ + max_val, num_sig_digits) + +// The following macros act as forward declarations for entity types and metric prototypes. 
+#define METRIC_DECLARE_entity(name) \ + extern ::kudu::MetricEntityPrototype METRIC_ENTITY_##name +#define METRIC_DECLARE_counter(name) \ + extern ::kudu::CounterPrototype METRIC_##name +#define METRIC_DECLARE_gauge_string(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_gauge_bool(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_gauge_int32(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_gauge_uint32(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_gauge_int64(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_gauge_uint64(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_gauge_double(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#define METRIC_DECLARE_histogram(name) \ + extern ::kudu::HistogramPrototype METRIC_##name + +#if defined(__APPLE__) +#define METRIC_DEFINE_gauge_size(entity, name, label, unit, desc, ...) \ + ::kudu::GaugePrototype METRIC_##name( \ + ::kudu::MetricPrototype::CtorArgs(#entity, #name, label, unit, desc, ## __VA_ARGS__)) +#define METRIC_DECLARE_gauge_size(name) \ + extern ::kudu::GaugePrototype METRIC_##name +#else +#define METRIC_DEFINE_gauge_size METRIC_DEFINE_gauge_uint64 +#define METRIC_DECLARE_gauge_size METRIC_DECLARE_gauge_uint64 +#endif + +namespace kudu { + +class Counter; +class CounterPrototype; + +template +class AtomicGauge; +template +class FunctionGauge; +class Gauge; +template +class GaugePrototype; + +class Metric; +class MetricEntityPrototype; +class MetricPrototype; +class MetricRegistry; + +class HdrHistogram; +class Histogram; +class HistogramPrototype; +class HistogramSnapshotPB; + +class MetricEntity; + +} // namespace kudu + +// Forward-declare the generic 'server' entity type. +// We have to do this here below the forward declarations, but not +// in the kudu namespace. 
+METRIC_DECLARE_entity(server); + +namespace kudu { + +// Unit types to be used with metrics. +// As additional units are required, add them to this enum and also to Name(). +struct MetricUnit { + enum Type { + kCacheHits, + kCacheQueries, + kBytes, + kRequests, + kEntries, + kRows, + kCells, + kConnections, + kOperations, + kProbes, + kNanoseconds, + kMicroseconds, + kMilliseconds, + kSeconds, + kThreads, + kTransactions, + kUnits, + kScanners, + kMaintenanceOperations, + kBlocks, + kLogBlockContainers, + kTasks, + kMessages, + kContextSwitches, + }; + static const char* Name(Type unit); +}; + +class MetricType { + public: + enum Type { kGauge, kCounter, kHistogram }; + static const char* Name(Type t); + private: + static const char* const kGaugeType; + static const char* const kCounterType; + static const char* const kHistogramType; +}; + +struct MetricJsonOptions { + MetricJsonOptions() : + include_raw_histograms(false), + include_schema_info(false) { + } + + // Include the raw histogram values and counts in the JSON output. + // This allows consumers to do cross-server aggregation or window + // data over time. + // Default: false + bool include_raw_histograms; + + // Include the metrics "schema" information (i.e description, label, + // unit, etc). + // Default: false + bool include_schema_info; +}; + +class MetricEntityPrototype { + public: + explicit MetricEntityPrototype(const char* name); + ~MetricEntityPrototype(); + + const char* name() const { return name_; } + + // Find or create an entity with the given ID within the provided 'registry'. + scoped_refptr Instantiate( + MetricRegistry* registry, + const std::string& id) const { + return Instantiate(registry, id, std::unordered_map()); + } + + // If the entity already exists, then 'initial_attrs' will replace all existing + // attributes. 
+ scoped_refptr Instantiate( + MetricRegistry* registry, + const std::string& id, + const std::unordered_map& initial_attrs) const; + + private: + const char* const name_; + + DISALLOW_COPY_AND_ASSIGN(MetricEntityPrototype); +}; + +class MetricEntity : public RefCountedThreadSafe { + public: + typedef std::unordered_map > MetricMap; + typedef std::unordered_map AttributeMap; + + scoped_refptr FindOrCreateCounter(const CounterPrototype* proto); + scoped_refptr FindOrCreateHistogram(const HistogramPrototype* proto); + + template + scoped_refptr > FindOrCreateGauge(const GaugePrototype* proto, + const T& initial_value); + + template + scoped_refptr > FindOrCreateFunctionGauge(const GaugePrototype* proto, + const Callback& function); + + // Return the metric instantiated from the given prototype, or NULL if none has been + // instantiated. Primarily used by tests trying to read metric values. + scoped_refptr FindOrNull(const MetricPrototype& prototype) const; + + const std::string& id() const { return id_; } + + // See MetricRegistry::WriteAsJson() + Status WriteAsJson(JsonWriter* writer, + const std::vector& requested_metrics, + const MetricJsonOptions& opts) const; + + const MetricMap& UnsafeMetricsMapForTests() const { return metric_map_; } + + // Mark that the given metric should never be retired until the metric + // registry itself destructs. This is useful for system metrics such as + // tcmalloc, etc, which should live as long as the process itself. + void NeverRetire(const scoped_refptr& metric); + + // Scan the metrics map for metrics needing retirement, removing them as necessary. + // + // Metrics are retired when they are no longer referenced outside of the metrics system + // itself. Additionally, we only retire a metric that has been in this state for + // at least FLAGS_metrics_retirement_age_ms milliseconds. + void RetireOldMetrics(); + + // Replaces all attributes for this entity. + // Any attributes currently set, but not in 'attrs', are removed. 
+ void SetAttributes(const AttributeMap& attrs); + + // Set a particular attribute. Replaces any current value. + void SetAttribute(const std::string& key, const std::string& val); + + int num_metrics() const { + lock_guard l(&lock_); + return metric_map_.size(); + } + + private: + friend class MetricRegistry; + friend class RefCountedThreadSafe; + + MetricEntity(const MetricEntityPrototype* prototype, std::string id, + AttributeMap attributes); + ~MetricEntity(); + + // Ensure that the given metric prototype is allowed to be instantiated + // within this entity. This entity's type must match the expected entity + // type defined within the metric prototype. + void CheckInstantiation(const MetricPrototype* proto) const; + + const MetricEntityPrototype* const prototype_; + const std::string id_; + + mutable simple_spinlock lock_; + + // Map from metric name to Metric object. Protected by lock_. + MetricMap metric_map_; + + // The key/value attributes. Protected by lock_ + AttributeMap attributes_; + + // The set of metrics which should never be retired. Protected by lock_. + std::vector > never_retire_metrics_; +}; + +// Base class to allow for putting all metrics into a single container. +// See documentation at the top of this file for information on metrics ownership. +class Metric : public RefCountedThreadSafe { + public: + // All metrics must be able to render themselves as JSON. + virtual Status WriteAsJson(JsonWriter* writer, + const MetricJsonOptions& opts) const = 0; + + const MetricPrototype* prototype() const { return prototype_; } + + protected: + explicit Metric(const MetricPrototype* prototype); + virtual ~Metric(); + + const MetricPrototype* const prototype_; + + private: + friend class MetricEntity; + friend class RefCountedThreadSafe; + + // The time at which we should retire this metric if it is still un-referenced outside + // of the metrics subsystem. If this metric is not due for retirement, this member is + // uninitialized. 
+ MonoTime retire_time_; + + DISALLOW_COPY_AND_ASSIGN(Metric); +}; + +// Registry of all the metrics for a server. +// +// This aggregates the MetricEntity objects associated with the server. +class MetricRegistry { + public: + MetricRegistry(); + ~MetricRegistry(); + + scoped_refptr FindOrCreateEntity(const MetricEntityPrototype* prototype, + const std::string& id, + const MetricEntity::AttributeMap& initial_attrs); + + // Writes metrics in this registry to 'writer'. + // + // 'requested_metrics' is a set of substrings to match metric names against, + // where '*' matches all metrics. + // + // The string matching can either match an entity ID or a metric name. + // If it matches an entity ID, then all metrics for that entity will be printed. + // + // See the MetricJsonOptions struct definition above for options changing the + // output of this function. + Status WriteAsJson(JsonWriter* writer, + const std::vector& requested_metrics, + const MetricJsonOptions& opts) const; + + // For each registered entity, retires orphaned metrics. If an entity has no more + // metrics and there are no external references, entities are removed as well. + // + // See MetricEntity::RetireOldMetrics(). + void RetireOldMetrics(); + + // Return the number of entities in this registry. + int num_entities() const { + lock_guard l(&lock_); + return entities_.size(); + } + + private: + typedef std::unordered_map > EntityMap; + EntityMap entities_; + + mutable simple_spinlock lock_; + DISALLOW_COPY_AND_ASSIGN(MetricRegistry); +}; + +// Registry of all of the metric and entity prototypes that have been +// defined. +// +// Prototypes are typically defined as static variables in different compilation +// units, and their constructors register themselves here. The registry is then +// used in order to dump metrics metadata to generate a Cloudera Manager MDL +// file. +// +// This class is thread-safe. +class MetricPrototypeRegistry { + public: + // Get the singleton instance. 
+ static MetricPrototypeRegistry* get(); + + // Dump a JSON document including all of the registered entity and metric + // prototypes. + void WriteAsJson(JsonWriter* writer) const; + + // Convenience wrapper around WriteAsJson(...). This dumps the JSON information + // to stdout and then exits. + void WriteAsJsonAndExit() const; + private: + friend class Singleton; + friend class MetricPrototype; + friend class MetricEntityPrototype; + MetricPrototypeRegistry() {} + ~MetricPrototypeRegistry() {} + + // Register a metric prototype in the registry. + void AddMetric(const MetricPrototype* prototype); + + // Register a metric entity prototype in the registry. + void AddEntity(const MetricEntityPrototype* prototype); + + mutable simple_spinlock lock_; + std::vector metrics_; + std::vector entities_; + + DISALLOW_COPY_AND_ASSIGN(MetricPrototypeRegistry); +}; + +enum PrototypeFlags { + // Flag which causes a Gauge prototype to expose itself as if it + // were a counter. + EXPOSE_AS_COUNTER = 1 << 0 +}; + +class MetricPrototype { + public: + // Simple struct to aggregate the arguments common to all prototypes. + // This makes constructor chaining a little less tedious. 
+ struct CtorArgs { + CtorArgs(const char* entity_type, + const char* name, + const char* label, + MetricUnit::Type unit, + const char* description, + uint32_t flags = 0) + : entity_type_(entity_type), + name_(name), + label_(label), + unit_(unit), + description_(description), + flags_(flags) { + } + + const char* const entity_type_; + const char* const name_; + const char* const label_; + const MetricUnit::Type unit_; + const char* const description_; + const uint32_t flags_; + }; + + const char* entity_type() const { return args_.entity_type_; } + const char* name() const { return args_.name_; } + const char* label() const { return args_.label_; } + MetricUnit::Type unit() const { return args_.unit_; } + const char* description() const { return args_.description_; } + virtual MetricType::Type type() const = 0; + + // Writes the fields of this prototype to the given JSON writer. + void WriteFields(JsonWriter* writer, + const MetricJsonOptions& opts) const; + + protected: + explicit MetricPrototype(CtorArgs args); + virtual ~MetricPrototype() { + } + + const CtorArgs args_; + + private: + DISALLOW_COPY_AND_ASSIGN(MetricPrototype); +}; + +// A description of a Gauge. +template +class GaugePrototype : public MetricPrototype { + public: + explicit GaugePrototype(const MetricPrototype::CtorArgs& args) + : MetricPrototype(args) { + } + + // Instantiate a "manual" gauge. + scoped_refptr > Instantiate( + const scoped_refptr& entity, + const T& initial_value) const { + return entity->FindOrCreateGauge(this, initial_value); + } + + // Instantiate a gauge that is backed by the given callback. 
+  scoped_refptr<FunctionGauge<T> > InstantiateFunctionGauge(
+      const scoped_refptr<MetricEntity>& entity,
+      const Callback<T()>& function) const {
+    return entity->FindOrCreateFunctionGauge(this, function);
+  }
+
+  virtual MetricType::Type type() const OVERRIDE {
+    if (args_.flags_ & EXPOSE_AS_COUNTER) {
+      return MetricType::kCounter;
+    } else {
+      return MetricType::kGauge;
+    }
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(GaugePrototype);
+};
+
+// Abstract base class to provide point-in-time metric values.
+class Gauge : public Metric {
+ public:
+  explicit Gauge(const MetricPrototype* prototype)
+    : Metric(prototype) {
+  }
+  virtual ~Gauge() {}
+  virtual Status WriteAsJson(JsonWriter* w,
+                             const MetricJsonOptions& opts) const OVERRIDE;
+ protected:
+  virtual void WriteValue(JsonWriter* writer) const = 0;
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Gauge);
+};
+
+// Gauge implementation for string that uses locks to ensure thread safety.
+class StringGauge : public Gauge {
+ public:
+  StringGauge(const GaugePrototype<std::string>* proto,
+              std::string initial_value);
+  std::string value() const;
+  void set_value(const std::string& value);
+ protected:
+  virtual void WriteValue(JsonWriter* writer) const OVERRIDE;
+ private:
+  std::string value_;
+  mutable simple_spinlock lock_;  // Guards value_
+  DISALLOW_COPY_AND_ASSIGN(StringGauge);
+};
+
+// Lock-free implementation for types that are convertible to/from int64_t.
+template<typename T>
+class AtomicGauge : public Gauge {
+ public:
+  AtomicGauge(const GaugePrototype<T>* proto, T initial_value)
+    : Gauge(proto),
+      value_(initial_value) {
+  }
+  T value() const {
+    return static_cast<T>(value_.Load(kMemOrderRelease));
+  }
+  virtual void set_value(const T& value) {
+    value_.Store(static_cast<int64_t>(value), kMemOrderNoBarrier);
+  }
+  void Increment() {
+    value_.IncrementBy(1, kMemOrderNoBarrier);
+  }
+  virtual void IncrementBy(int64_t amount) {
+    value_.IncrementBy(amount, kMemOrderNoBarrier);
+  }
+  void Decrement() {
+    IncrementBy(-1);
+  }
+  void DecrementBy(int64_t amount) {
+    IncrementBy(-amount);
+  }
+
+ protected:
+  virtual void WriteValue(JsonWriter* writer) const OVERRIDE {
+    writer->Value(value());
+  }
+  AtomicInt<int64_t> value_;
+ private:
+  DISALLOW_COPY_AND_ASSIGN(AtomicGauge);
+};
+
+// Utility class to automatically detach FunctionGauges when a class destructs.
+//
+// Because FunctionGauges typically access class instance state, it's important to ensure
+// that they are detached before the class destructs. One approach is to make all
+// FunctionGauge instances be members of the class, and then call gauge_->Detach() in your
+// class's destructor. However, it's easy to forget to do this, which would lead to
+// heap-use-after-free bugs. This type of bug is easy to miss in unit tests because the
+// tests don't always poll metrics. Using a FunctionGaugeDetacher member instead makes
+// the detaching automatic and thus less error-prone.
+//
+// Example usage:
+//
+// METRIC_define_gauge_int64(my_metric, MetricUnit::kOperations, "My metric docs");
+// class MyClassWithMetrics {
+//  public:
+//   MyClassWithMetrics(const scoped_refptr<MetricEntity>& entity) {
+//     METRIC_my_metric.InstantiateFunctionGauge(entity,
+//         Bind(&MyClassWithMetrics::ComputeMyMetric, Unretained(this)))
+//       ->AutoDetach(&metric_detacher_);
+//   }
+//   ~MyClassWithMetrics() {
+//   }
+//
+//  private:
+//   int64_t ComputeMyMetric() {
+//     // Compute some metric based on instance state.
+// } +// FunctionGaugeDetacher metric_detacher_; +// }; +class FunctionGaugeDetacher { + public: + FunctionGaugeDetacher(); + ~FunctionGaugeDetacher(); + + private: + template + friend class FunctionGauge; + + void OnDestructor(const Closure& c) { + callbacks_.push_back(c); + } + + std::vector callbacks_; + + DISALLOW_COPY_AND_ASSIGN(FunctionGaugeDetacher); +}; + + +// A Gauge that calls back to a function to get its value. +// +// This metric type should be used in cases where it is difficult to keep a running +// measure of a metric, but instead would like to compute the metric value whenever it is +// requested by a user. +// +// The lifecycle should be carefully considered when using a FunctionGauge. In particular, +// the bound function needs to always be safe to run -- so if it references a particular +// non-singleton class instance, the instance must out-live the function. Typically, +// the easiest way to ensure this is to use a FunctionGaugeDetacher (see above). +template +class FunctionGauge : public Gauge { + public: + T value() const { + lock_guard l(&lock_); + return function_.Run(); + } + + virtual void WriteValue(JsonWriter* writer) const OVERRIDE { + writer->Value(value()); + } + + // Reset this FunctionGauge to return a specific value. + // This should be used during destruction. If you want a settable + // Gauge, use a normal Gauge instead of a FunctionGauge. + void DetachToConstant(T v) { + lock_guard l(&lock_); + function_ = Bind(&FunctionGauge::Return, v); + } + + // Get the current value of the gauge, and detach so that it continues to return this + // value in perpetuity. + void DetachToCurrentValue() { + T last_value = value(); + DetachToConstant(last_value); + } + + // Automatically detach this gauge when the given 'detacher' destructs. + // After detaching, the metric will return 'value' in perpetuity. 
+ void AutoDetach(FunctionGaugeDetacher* detacher, T value = T()) { + detacher->OnDestructor(Bind(&FunctionGauge::DetachToConstant, + this, value)); + } + + // Automatically detach this gauge when the given 'detacher' destructs. + // After detaching, the metric will return whatever its value was at the + // time of detaching. + // + // Note that, when using this method, you should be sure that the FunctionGaugeDetacher + // is destructed before any objects which are required by the gauge implementation. + // In typical usage (see the FunctionGaugeDetacher class documentation) this means you + // should declare the detacher member after all other class members that might be + // accessed by the gauge function implementation. + void AutoDetachToLastValue(FunctionGaugeDetacher* detacher) { + detacher->OnDestructor(Bind(&FunctionGauge::DetachToCurrentValue, + this)); + } + + private: + friend class MetricEntity; + + FunctionGauge(const GaugePrototype* proto, Callback function) + : Gauge(proto), function_(std::move(function)) {} + + static T Return(T v) { + return v; + } + + mutable simple_spinlock lock_; + Callback function_; + DISALLOW_COPY_AND_ASSIGN(FunctionGauge); +}; + +// Prototype for a counter. +class CounterPrototype : public MetricPrototype { + public: + explicit CounterPrototype(const MetricPrototype::CtorArgs& args) + : MetricPrototype(args) { + } + scoped_refptr Instantiate(const scoped_refptr& entity); + + virtual MetricType::Type type() const OVERRIDE { return MetricType::kCounter; } + + private: + DISALLOW_COPY_AND_ASSIGN(CounterPrototype); +}; + +// Simple incrementing 64-bit integer. +// Only use Counters in cases that we expect the count to only increase. For example, +// a counter is appropriate for "number of transactions processed by the server", +// but not for "number of transactions currently in flight". 
Monitoring software +// knows that counters only increase and thus can compute rates over time, rates +// across multiple servers, etc, which aren't appropriate in the case of gauges. +class Counter : public Metric { + public: + int64_t value() const; + void Increment(); + void IncrementBy(int64_t amount); + virtual Status WriteAsJson(JsonWriter* w, + const MetricJsonOptions& opts) const OVERRIDE; + + private: + FRIEND_TEST(MetricsTest, SimpleCounterTest); + FRIEND_TEST(MultiThreadedMetricsTest, CounterIncrementTest); + friend class MetricEntity; + + explicit Counter(const CounterPrototype* proto); + + LongAdder value_; + DISALLOW_COPY_AND_ASSIGN(Counter); +}; + +class HistogramPrototype : public MetricPrototype { + public: + HistogramPrototype(const MetricPrototype::CtorArgs& args, + uint64_t max_trackable_value, int num_sig_digits); + scoped_refptr Instantiate(const scoped_refptr& entity); + + uint64_t max_trackable_value() const { return max_trackable_value_; } + int num_sig_digits() const { return num_sig_digits_; } + virtual MetricType::Type type() const OVERRIDE { return MetricType::kHistogram; } + + private: + const uint64_t max_trackable_value_; + const int num_sig_digits_; + DISALLOW_COPY_AND_ASSIGN(HistogramPrototype); +}; + +class Histogram : public Metric { + public: + // Increment the histogram for the given value. + // 'value' must be non-negative. + void Increment(int64_t value); + + // Increment the histogram for the given value by the given amount. + // 'value' and 'amount' must be non-negative. + void IncrementBy(int64_t value, int64_t amount); + + // Return the total number of values added to the histogram (via Increment() + // or IncrementBy()). + uint64_t TotalCount() const; + + virtual Status WriteAsJson(JsonWriter* w, + const MetricJsonOptions& opts) const OVERRIDE; + + // Returns a snapshot of this histogram including the bucketed values and counts. 
+  Status GetHistogramSnapshotPB(HistogramSnapshotPB* snapshot,
+                                const MetricJsonOptions& opts) const;
+
+  uint64_t CountInBucketForValueForTests(uint64_t value) const;
+  uint64_t MinValueForTests() const;
+  uint64_t MaxValueForTests() const;
+  double MeanValueForTests() const;
+
+ private:
+  FRIEND_TEST(MetricsTest, SimpleHistogramTest);
+  friend class MetricEntity;
+  explicit Histogram(const HistogramPrototype* proto);
+
+  const gscoped_ptr<HdrHistogram> histogram_;
+  DISALLOW_COPY_AND_ASSIGN(Histogram);
+};
+
+// Measures a duration while in scope. Adds this duration to specified histogram on destruction.
+class ScopedLatencyMetric {
+ public:
+  // NOTE: the given histogram must live as long as this object.
+  // If 'latency_hist' is NULL, this turns into a no-op.
+  explicit ScopedLatencyMetric(Histogram* latency_hist);
+  ~ScopedLatencyMetric();
+
+ private:
+  Histogram* latency_hist_;
+  MonoTime time_started_;
+};
+
+////////////////////////////////////////////////////////////
+// Inline implementations of template methods
+////////////////////////////////////////////////////////////
+
+inline scoped_refptr<Counter> MetricEntity::FindOrCreateCounter(
+    const CounterPrototype* proto) {
+  CheckInstantiation(proto);
+  lock_guard<simple_spinlock> l(&lock_);
+  scoped_refptr<Counter> m = down_cast<Counter*>(FindPtrOrNull(metric_map_, proto).get());
+  if (!m) {
+    m = new Counter(proto);
+    InsertOrDie(&metric_map_, proto, m);
+  }
+  return m;
+}
+
+inline scoped_refptr<Histogram> MetricEntity::FindOrCreateHistogram(
+    const HistogramPrototype* proto) {
+  CheckInstantiation(proto);
+  lock_guard<simple_spinlock> l(&lock_);
+  scoped_refptr<Histogram> m = down_cast<Histogram*>(FindPtrOrNull(metric_map_, proto).get());
+  if (!m) {
+    m = new Histogram(proto);
+    InsertOrDie(&metric_map_, proto, m);
+  }
+  return m;
+}
+
+template<typename T>
+inline scoped_refptr<AtomicGauge<T> > MetricEntity::FindOrCreateGauge(
+    const GaugePrototype<T>* proto,
+    const T& initial_value) {
+  CheckInstantiation(proto);
+  lock_guard<simple_spinlock> l(&lock_);
+  scoped_refptr<AtomicGauge<T> > m = down_cast<AtomicGauge<T>*>(
+      FindPtrOrNull(metric_map_, proto).get());
+  if (!m) {
m = new AtomicGauge(proto, initial_value); + InsertOrDie(&metric_map_, proto, m); + } + return m; +} + +template +inline scoped_refptr > MetricEntity::FindOrCreateFunctionGauge( + const GaugePrototype* proto, + const Callback& function) { + CheckInstantiation(proto); + lock_guard l(&lock_); + scoped_refptr > m = down_cast*>( + FindPtrOrNull(metric_map_, proto).get()); + if (!m) { + m = new FunctionGauge(proto, function); + InsertOrDie(&metric_map_, proto, m); + } + return m; +} + +} // namespace kudu + +#endif // KUDU_UTIL_METRICS_H diff --git a/src/kudu/util/monotime-test.cc b/src/kudu/util/monotime-test.cc new file mode 100644 index 000000000000..96d7277e16c9 --- /dev/null +++ b/src/kudu/util/monotime-test.cc @@ -0,0 +1,201 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/monotime.h" + +#include +#include + +#include +#include + +#include "kudu/util/test_util.h" + +namespace kudu { + +TEST(TestMonoTime, TestMonotonicity) { + alarm(360); + MonoTime prev(MonoTime::Now(MonoTime::FINE)); + MonoTime next; + + do { + next = MonoTime::Now(MonoTime::FINE); + //LOG(INFO) << " next = " << next.ToString(); + } while (!prev.ComesBefore(next)); + ASSERT_FALSE(next.ComesBefore(prev)); + alarm(0); +} + +TEST(TestMonoTime, TestComparison) { + MonoTime now(MonoTime::Now(MonoTime::COARSE)); + MonoTime future(now); + future.AddDelta(MonoDelta::FromNanoseconds(1L)); + + ASSERT_GT(future.GetDeltaSince(now).ToNanoseconds(), 0); + ASSERT_LT(now.GetDeltaSince(future).ToNanoseconds(), 0); + ASSERT_EQ(now.GetDeltaSince(now).ToNanoseconds(), 0); + + MonoDelta nano(MonoDelta::FromNanoseconds(1L)); + MonoDelta mil(MonoDelta::FromMilliseconds(1L)); + MonoDelta sec(MonoDelta::FromSeconds(1.0)); + + ASSERT_TRUE(nano.LessThan(mil)); + ASSERT_TRUE(mil.LessThan(sec)); + ASSERT_TRUE(mil.MoreThan(nano)); + ASSERT_TRUE(sec.MoreThan(mil)); +} + +TEST(TestMonoTime, TestTimeVal) { + struct timeval tv; + tv.tv_sec = 0; + tv.tv_usec = 0; + + // Normal conversion case. + MonoDelta one_sec_one_micro(MonoDelta::FromNanoseconds(1000001000L)); + one_sec_one_micro.ToTimeVal(&tv); + ASSERT_EQ(1, tv.tv_sec); + ASSERT_EQ(1, tv.tv_usec); + + // Case where we are still positive but sub-micro. + // Round up to nearest microsecond. This is to avoid infinite timeouts + // in APIs that take a struct timeval. + MonoDelta zero_sec_one_nano(MonoDelta::FromNanoseconds(1L)); + zero_sec_one_nano.ToTimeVal(&tv); + ASSERT_EQ(0, tv.tv_sec); + ASSERT_EQ(1, tv.tv_usec); // Special case: 1ns rounds up to + + // Negative conversion case. Ensure the timeval is normalized. + // That means sec is negative and usec is positive. 
+ MonoDelta neg_micro(MonoDelta::FromMicroseconds(-1L)); + ASSERT_EQ(-1000, neg_micro.ToNanoseconds()); + neg_micro.ToTimeVal(&tv); + ASSERT_EQ(-1, tv.tv_sec); + ASSERT_EQ(999999, tv.tv_usec); + + // Case where we are still negative but sub-micro. + // Round up to nearest microsecond. This is to avoid infinite timeouts + // in APIs that take a struct timeval and for consistency. + MonoDelta zero_sec_neg_one_nano(MonoDelta::FromNanoseconds(-1L)); + zero_sec_neg_one_nano.ToTimeVal(&tv); + ASSERT_EQ(-1, tv.tv_sec); + ASSERT_EQ(999999, tv.tv_usec); +} + +TEST(TestMonoTime, TestTimeSpec) { + MonoTime one_sec_one_nano_expected(1000000001L); + struct timespec ts; + ts.tv_sec = 1; + ts.tv_nsec = 1; + MonoTime one_sec_one_nano_actual(ts); + ASSERT_EQ(0, one_sec_one_nano_expected.GetDeltaSince(one_sec_one_nano_actual).ToNanoseconds()); + + MonoDelta zero_sec_two_nanos(MonoDelta::FromNanoseconds(2L)); + zero_sec_two_nanos.ToTimeSpec(&ts); + ASSERT_EQ(0, ts.tv_sec); + ASSERT_EQ(2, ts.tv_nsec); + + // Negative conversion case. Ensure the timespec is normalized. + // That means sec is negative and nsec is positive. 
+ MonoDelta neg_nano(MonoDelta::FromNanoseconds(-1L)); + ASSERT_EQ(-1, neg_nano.ToNanoseconds()); + neg_nano.ToTimeSpec(&ts); + ASSERT_EQ(-1, ts.tv_sec); + ASSERT_EQ(999999999, ts.tv_nsec); + +} + +TEST(TestMonoTime, TestDeltas) { + alarm(360); + const MonoDelta max_delta(MonoDelta::FromSeconds(0.1)); + MonoTime prev(MonoTime::Now(MonoTime::FINE)); + MonoTime next; + MonoDelta cur_delta; + do { + next = MonoTime::Now(MonoTime::FINE); + cur_delta = next.GetDeltaSince(prev); + } while (cur_delta.LessThan(max_delta)); + alarm(0); +} + +TEST(TestMonoTime, TestDeltaConversions) { + // TODO: Reliably test MonoDelta::FromSeconds() considering floating-point rounding errors + + MonoDelta mil(MonoDelta::FromMilliseconds(500)); + ASSERT_EQ(500 * MonoTime::kNanosecondsPerMillisecond, mil.nano_delta_); + + MonoDelta micro(MonoDelta::FromMicroseconds(500)); + ASSERT_EQ(500 * MonoTime::kNanosecondsPerMicrosecond, micro.nano_delta_); + + MonoDelta nano(MonoDelta::FromNanoseconds(500)); + ASSERT_EQ(500, nano.nano_delta_); +} + +static void DoTestMonoTimePerf(MonoTime::Granularity granularity) { + const MonoDelta max_delta(MonoDelta::FromMilliseconds(500)); + uint64_t num_calls = 0; + MonoTime prev(MonoTime::Now(granularity)); + MonoTime next; + MonoDelta cur_delta; + do { + next = MonoTime::Now(granularity); + cur_delta = next.GetDeltaSince(prev); + num_calls++; + } while (cur_delta.LessThan(max_delta)); + LOG(INFO) << "DoTestMonoTimePerf(granularity=" + << ((granularity == MonoTime::FINE) ? 
"FINE" : "COARSE") + << "): " << num_calls << " in " + << max_delta.ToString() << " seconds."; +} + +TEST(TestMonoTime, TestSleepFor) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoDelta sleep = MonoDelta::FromMilliseconds(100); + SleepFor(sleep); + MonoTime end = MonoTime::Now(MonoTime::FINE); + MonoDelta actualSleep = end.GetDeltaSince(start); + ASSERT_GE(actualSleep.ToNanoseconds(), sleep.ToNanoseconds()); +} + +TEST(TestMonoTime, TestSleepForOverflow) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test because it sleeps for ~4s"; + return; + } + + // This quantity (~4s sleep) overflows a 32-bit integer such that + // the value becomes 0. + MonoTime start = MonoTime::Now(MonoTime::FINE); + MonoDelta sleep = MonoDelta::FromNanoseconds(1L << 32); + SleepFor(sleep); + MonoTime end = MonoTime::Now(MonoTime::FINE); + MonoDelta actualSleep = end.GetDeltaSince(start); + ASSERT_GE(actualSleep.ToNanoseconds(), sleep.ToNanoseconds()); +} + +TEST(TestMonoTimePerf, TestMonoTimePerfCoarse) { + alarm(360); + DoTestMonoTimePerf(MonoTime::COARSE); + alarm(0); +} + +TEST(TestMonoTimePerf, TestMonoTimePerfFine) { + alarm(360); + DoTestMonoTimePerf(MonoTime::FINE); + alarm(0); +} + +} // namespace kudu diff --git a/src/kudu/util/monotime.cc b/src/kudu/util/monotime.cc new file mode 100644 index 000000000000..51c95e9aceeb --- /dev/null +++ b/src/kudu/util/monotime.cc @@ -0,0 +1,259 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/monotime.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/thread_restrictions.h" + +namespace kudu { + +#define MAX_MONOTONIC_SECONDS \ + (((1ULL<<63) - 1ULL) /(int64_t)MonoTime::kNanosecondsPerSecond) + + +/// +/// MonoDelta +/// + +const int64_t MonoDelta::kUninitialized = kint64min; + +MonoDelta MonoDelta::FromSeconds(double seconds) { + int64_t delta = seconds * MonoTime::kNanosecondsPerSecond; + return MonoDelta(delta); +} + +MonoDelta MonoDelta::FromMilliseconds(int64_t ms) { + return MonoDelta(ms * MonoTime::kNanosecondsPerMillisecond); +} + +MonoDelta MonoDelta::FromMicroseconds(int64_t us) { + return MonoDelta(us * MonoTime::kNanosecondsPerMicrosecond); +} + +MonoDelta MonoDelta::FromNanoseconds(int64_t ns) { + return MonoDelta(ns); +} + +MonoDelta::MonoDelta() + : nano_delta_(kUninitialized) { +} + +bool MonoDelta::Initialized() const { + return nano_delta_ != kUninitialized; +} + +bool MonoDelta::LessThan(const MonoDelta &rhs) const { + DCHECK(Initialized()); + DCHECK(rhs.Initialized()); + return nano_delta_ < rhs.nano_delta_; +} + +bool MonoDelta::MoreThan(const MonoDelta &rhs) const { + DCHECK(Initialized()); + DCHECK(rhs.Initialized()); + return nano_delta_ > rhs.nano_delta_; +} + +bool MonoDelta::Equals(const MonoDelta &rhs) const { + DCHECK(Initialized()); + DCHECK(rhs.Initialized()); + return nano_delta_ == rhs.nano_delta_; +} + +std::string MonoDelta::ToString() 
const { + return StringPrintf("%.3fs", ToSeconds()); +} + +MonoDelta::MonoDelta(int64_t delta) + : nano_delta_(delta) { +} + +double MonoDelta::ToSeconds() const { + DCHECK(Initialized()); + double d(nano_delta_); + d /= MonoTime::kNanosecondsPerSecond; + return d; +} + +int64_t MonoDelta::ToNanoseconds() const { + DCHECK(Initialized()); + return nano_delta_; +} + +int64_t MonoDelta::ToMicroseconds() const { + DCHECK(Initialized()); + return nano_delta_ / MonoTime::kNanosecondsPerMicrosecond; +} + +int64_t MonoDelta::ToMilliseconds() const { + DCHECK(Initialized()); + return nano_delta_ / MonoTime::kNanosecondsPerMillisecond; +} + +void MonoDelta::ToTimeVal(struct timeval *tv) const { + DCHECK(Initialized()); + tv->tv_sec = nano_delta_ / MonoTime::kNanosecondsPerSecond; + tv->tv_usec = (nano_delta_ - (tv->tv_sec * MonoTime::kNanosecondsPerSecond)) + / MonoTime::kNanosecondsPerMicrosecond; + + // tv_usec must be between 0 and 999999. + // There is little use for negative timevals so wrap it in PREDICT_FALSE. + if (PREDICT_FALSE(tv->tv_usec < 0)) { + --(tv->tv_sec); + tv->tv_usec += 1000000; + } + + // Catch positive corner case where we "round down" and could potentially set a timeout of 0. + // Make it 1 usec. + if (PREDICT_FALSE(tv->tv_usec == 0 && tv->tv_sec == 0 && nano_delta_ > 0)) { + tv->tv_usec = 1; + } + + // Catch negative corner case where we "round down" and could potentially set a timeout of 0. + // Make it -1 usec (but normalized, so tv_usec is not negative). + if (PREDICT_FALSE(tv->tv_usec == 0 && tv->tv_sec == 0 && nano_delta_ < 0)) { + tv->tv_sec = -1; + tv->tv_usec = 999999; + } +} + + +void MonoDelta::NanosToTimeSpec(int64_t nanos, struct timespec* ts) { + ts->tv_sec = nanos / MonoTime::kNanosecondsPerSecond; + ts->tv_nsec = nanos - (ts->tv_sec * MonoTime::kNanosecondsPerSecond); + + // tv_nsec must be between 0 and 999999999. + // There is little use for negative timespecs so wrap it in PREDICT_FALSE. 
+ if (PREDICT_FALSE(ts->tv_nsec < 0)) { + --(ts->tv_sec); + ts->tv_nsec += MonoTime::kNanosecondsPerSecond; + } +} + +void MonoDelta::ToTimeSpec(struct timespec *ts) const { + DCHECK(Initialized()); + NanosToTimeSpec(nano_delta_, ts); +} + +/// +/// MonoTime +/// + +MonoTime MonoTime::Now(enum Granularity granularity) { +#if defined(__APPLE__) + return MonoTime(walltime_internal::GetMonoTimeNanos()); +# else + struct timespec ts; + clockid_t clock; + +// Older systems do not support CLOCK_MONOTONIC_COARSE +#ifdef CLOCK_MONOTONIC_COARSE + clock = (granularity == COARSE) ? CLOCK_MONOTONIC_COARSE : CLOCK_MONOTONIC; +#else + clock = CLOCK_MONOTONIC; +#endif + PCHECK(clock_gettime(clock, &ts) == 0); + return MonoTime(ts); +#endif // defined(__APPLE__) +} + +MonoTime MonoTime::Max() { + return MonoTime(std::numeric_limits::max()); +} + +MonoTime MonoTime::Min() { + return MonoTime(1); +} + +const MonoTime& MonoTime::Earliest(const MonoTime& a, const MonoTime& b) { + if (b.nanos_ < a.nanos_) { + return b; + } + return a; +} + +MonoTime::MonoTime() + : nanos_(0) { +} + +bool MonoTime::Initialized() const { + return nanos_ != 0; +} + +MonoDelta MonoTime::GetDeltaSince(const MonoTime &rhs) const { + DCHECK(Initialized()); + DCHECK(rhs.Initialized()); + int64_t delta(nanos_); + delta -= rhs.nanos_; + return MonoDelta(delta); +} + +void MonoTime::AddDelta(const MonoDelta &delta) { + DCHECK(Initialized()); + nanos_ += delta.nano_delta_; +} + +bool MonoTime::ComesBefore(const MonoTime &rhs) const { + DCHECK(Initialized()); + DCHECK(rhs.Initialized()); + return nanos_ < rhs.nanos_; +} + +std::string MonoTime::ToString() const { + return StringPrintf("%.3fs", ToSeconds()); +} + +bool MonoTime::Equals(const MonoTime& other) const { + return nanos_ == other.nanos_; +} + +MonoTime::MonoTime(const struct timespec &ts) { + // Monotonic time resets when the machine reboots. 
The 64-bit limitation + // means that we can't represent times larger than 292 years, which should be + // adequate. + CHECK_LT(ts.tv_sec, MAX_MONOTONIC_SECONDS); + nanos_ = ts.tv_sec; + nanos_ *= MonoTime::kNanosecondsPerSecond; + nanos_ += ts.tv_nsec; +} + +MonoTime::MonoTime(int64_t nanos) + : nanos_(nanos) { +} + +double MonoTime::ToSeconds() const { + double d(nanos_); + d /= MonoTime::kNanosecondsPerSecond; + return d; +} + +void SleepFor(const MonoDelta& delta) { + ThreadRestrictions::AssertWaitAllowed(); + base::SleepForNanoseconds(delta.ToNanoseconds()); +} + +} // namespace kudu diff --git a/src/kudu/util/monotime.h b/src/kudu/util/monotime.h new file mode 100644 index 000000000000..25f3056fc75d --- /dev/null +++ b/src/kudu/util/monotime.h @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_MONOTIME_H +#define KUDU_UTIL_MONOTIME_H + +#include +#include + +#ifdef KUDU_HEADERS_NO_STUBS +#include +#else +// This is a poor module interdependency, but the stubs are header-only and +// it's only for exported header builds, so we'll make an exception. 
+#include "kudu/client/stubs.h" +#endif + +#include "kudu/util/kudu_export.h" + +struct timeval; +struct timespec; + +namespace kudu { +class MonoTime; + +// Represent an elapsed duration of time -- i.e the delta between +// two MonoTime instances. +// +// A MonoDelta built with the default constructor is "uninitialized" and +// may not be used for any operation. +class KUDU_EXPORT MonoDelta { + public: + static MonoDelta FromSeconds(double seconds); + static MonoDelta FromMilliseconds(int64_t ms); + static MonoDelta FromMicroseconds(int64_t us); + static MonoDelta FromNanoseconds(int64_t ns); + MonoDelta(); + bool Initialized() const; + bool LessThan(const MonoDelta &rhs) const; + bool MoreThan(const MonoDelta &rhs) const; + bool Equals(const MonoDelta &rhs) const; + std::string ToString() const; + double ToSeconds() const; + int64_t ToMilliseconds() const; + int64_t ToMicroseconds() const; + int64_t ToNanoseconds() const; + + // Update struct timeval to current value of delta, with microsecond accuracy. + // Note that if MonoDelta::IsPositive() returns true, the struct timeval + // is guaranteed to hold a positive number as well (at least 1 microsecond). + void ToTimeVal(struct timeval *tv) const; + + // Update struct timespec to current value of delta, with nanosecond accuracy. + void ToTimeSpec(struct timespec *ts) const; + + // Convert a nanosecond value to a timespec. + static void NanosToTimeSpec(int64_t nanos, struct timespec* ts); + + private: + static const int64_t kUninitialized; + + friend class MonoTime; + FRIEND_TEST(TestMonoTime, TestDeltaConversions); + explicit MonoDelta(int64_t delta); + int64_t nano_delta_; +}; + +// Represent a particular point in time, relative to some fixed but unspecified +// reference point. +// +// This time is monotonic, meaning that if the user changes his or her system +// clock, the monotime does not change. 
+class KUDU_EXPORT MonoTime { + public: + enum Granularity { + COARSE, + FINE + }; + + static const int64_t kNanosecondsPerSecond = 1000000000L; + static const int64_t kNanosecondsPerMillisecond = 1000000L; + static const int64_t kNanosecondsPerMicrosecond = 1000L; + + static const int64_t kMicrosecondsPerSecond = 1000000L; + + // The coarse monotonic time is faster to retrieve, but "only" + // accurate to within a millisecond or two. The speed difference will + // depend on your timer hardware. + static MonoTime Now(enum Granularity granularity); + + // Return MonoTime equal to farthest possible time into the future. + static MonoTime Max(); + + // Return MonoTime equal to farthest possible time into the past. + static MonoTime Min(); + + // Return the earliest (minimum) of the two monotimes. + static const MonoTime& Earliest(const MonoTime& a, const MonoTime& b); + + MonoTime(); + bool Initialized() const; + MonoDelta GetDeltaSince(const MonoTime &rhs) const; + void AddDelta(const MonoDelta &delta); + bool ComesBefore(const MonoTime &rhs) const; + std::string ToString() const; + bool Equals(const MonoTime& other) const; + + private: + friend class MonoDelta; + FRIEND_TEST(TestMonoTime, TestTimeSpec); + FRIEND_TEST(TestMonoTime, TestDeltaConversions); + + explicit MonoTime(const struct timespec &ts); + explicit MonoTime(int64_t nanos); + double ToSeconds() const; + uint64_t nanos_; +}; + +// Sleep for a MonoDelta duration. +// +// This is preferred over sleep(3), usleep(3), and nanosleep(3). It's less prone to mixups with +// units since it uses a MonoDelta. It also ignores EINTR, so will reliably sleep at least the +// MonoDelta duration. 
+void KUDU_EXPORT SleepFor(const MonoDelta& delta); + +} // namespace kudu + +#endif diff --git a/src/kudu/util/mt-hdr_histogram-test.cc b/src/kudu/util/mt-hdr_histogram-test.cc new file mode 100644 index 000000000000..879c5e3134b7 --- /dev/null +++ b/src/kudu/util/mt-hdr_histogram-test.cc @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/hdr_histogram.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DEFINE_int32(histogram_test_num_threads, 16, + "Number of threads to spawn for mt-hdr_histogram test"); +DEFINE_uint64(histogram_test_num_increments_per_thread, 100000LU, + "Number of times to call Increment() per thread in mt-hdr_histogram test"); + +using std::vector; + +namespace kudu { + +class MtHdrHistogramTest : public KuduTest { + public: + MtHdrHistogramTest() { + num_threads_ = FLAGS_histogram_test_num_threads; + num_times_ = FLAGS_histogram_test_num_increments_per_thread; + } + + protected: + int num_threads_; + uint64_t num_times_; +}; + +// Increment a counter a bunch of times in the same bucket +static void IncrementSameHistValue(HdrHistogram* hist, uint64_t value, uint64_t times) { + for (uint64_t i = 0; i < times; i++) { + hist->Increment(value); + } +} + +TEST_F(MtHdrHistogramTest, ConcurrentWriteTest) { + const uint64_t kValue = 1LU; + + HdrHistogram hist(100000LU, 3); + + auto threads = new scoped_refptr[num_threads_]; + for (int i = 0; i < num_threads_; i++) { + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("thread-$0", i), + IncrementSameHistValue, &hist, kValue, num_times_, &threads[i])); + } + for (int i = 0; i < num_threads_; i++) { + CHECK_OK(ThreadJoiner(threads[i].get()).Join()); + } + + HdrHistogram snapshot(hist); + ASSERT_EQ(num_threads_ * num_times_, snapshot.CountInBucketForValue(kValue)); + + delete[] threads; +} + +// Copy while writing, then iterate to ensure copies are consistent. 
+TEST_F(MtHdrHistogramTest, ConcurrentCopyWhileWritingTest) { + const int kNumCopies = 10; + const uint64_t kValue = 1; + + HdrHistogram hist(100000LU, 3); + + auto threads = new scoped_refptr[num_threads_]; + for (int i = 0; i < num_threads_; i++) { + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("thread-$0", i), + IncrementSameHistValue, &hist, kValue, num_times_, &threads[i])); + } + + // This is somewhat racy but the goal is to catch this issue at least + // most of the time. At the time of this writing, before fixing a bug where + // the total count stored in a copied histogram may not match its internal + // counts (under concurrent writes), this test fails for me on 100/100 runs. + vector snapshots; + ElementDeleter deleter(&snapshots); + for (int i = 0; i < kNumCopies; i++) { + snapshots.push_back(new HdrHistogram(hist)); + SleepFor(MonoDelta::FromMicroseconds(100)); + } + for (int i = 0; i < kNumCopies; i++) { + snapshots[i]->MeanValue(); // Will crash if underlying iterator is inconsistent. + } + + for (int i = 0; i < num_threads_; i++) { + CHECK_OK(ThreadJoiner(threads[i].get()).Join()); + } + + delete[] threads; +} + +} // namespace kudu diff --git a/src/kudu/util/mt-metrics-test.cc b/src/kudu/util/mt-metrics-test.cc new file mode 100644 index 000000000000..b4512fb9ad3f --- /dev/null +++ b/src/kudu/util/mt-metrics-test.cc @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug/leakcheck_disabler.h" +#include "kudu/util/jsonwriter.h" +#include "kudu/util/metrics.h" +#include "kudu/util/monotime.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +DEFINE_int32(mt_metrics_test_num_threads, 4, + "Number of threads to spawn in mt metrics tests"); + +METRIC_DEFINE_entity(test_entity); + +namespace kudu { + +using debug::ScopedLeakCheckDisabler; +using std::vector; + +class MultiThreadedMetricsTest : public KuduTest { + public: + static void RegisterCounters(const scoped_refptr& metric_entity, + const string& name_prefix, int num_counters); + + MetricRegistry registry_; +}; + +// Call increment on a Counter a bunch of times. +static void CountWithCounter(scoped_refptr counter, int num_increments) { + for (int i = 0; i < num_increments; i++) { + counter->Increment(); + } +} + +// Helper function that spawns and then joins a bunch of threads. 
+static void RunWithManyThreads(boost::function* f, int num_threads) { + vector > threads; + for (int i = 0; i < num_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", StringPrintf("thread%d", i), + *f, &new_thread)); + threads.push_back(new_thread); + } + for (int i = 0; i < num_threads; i++) { + ASSERT_OK(ThreadJoiner(threads[i].get()).Join()); + } +} + +METRIC_DEFINE_counter(test_entity, test_counter, "Test Counter", + MetricUnit::kRequests, "Test counter"); + +// Ensure that incrementing a counter is thread-safe. +TEST_F(MultiThreadedMetricsTest, CounterIncrementTest) { + scoped_refptr counter = new Counter(&METRIC_test_counter); + int num_threads = FLAGS_mt_metrics_test_num_threads; + int num_increments = 1000; + boost::function f = + boost::bind(CountWithCounter, counter, num_increments); + RunWithManyThreads(&f, num_threads); + ASSERT_EQ(num_threads * num_increments, counter->value()); +} + +// Helper function to register a bunch of counters in a loop. +void MultiThreadedMetricsTest::RegisterCounters( + const scoped_refptr& metric_entity, + const string& name_prefix, + int num_counters) { + uint64_t tid = Env::Default()->gettid(); + for (int i = 0; i < num_counters; i++) { + // This loop purposefully leaks metrics prototypes, because the metrics system + // expects the prototypes and their names to live forever. This is the only + // place we dynamically generate them for the purposes of a test, so it's easier + // to just leak them than to figure out a way to manage lifecycle of objects that + // are typically static. + ScopedLeakCheckDisabler disabler; + + string name = strings::Substitute("$0-$1-$2", name_prefix, tid, i); + auto proto = new CounterPrototype(MetricPrototype::CtorArgs( + "test_entity", strdup(name.c_str()), "Test Counter", + MetricUnit::kOperations, "test counter")); + proto->Instantiate(metric_entity)->Increment(); + } +} + +// Ensure that adding a counter to a registry is thread-safe. 
+TEST_F(MultiThreadedMetricsTest, AddCounterToRegistryTest) { + scoped_refptr entity = METRIC_ENTITY_test_entity.Instantiate(®istry_, "my-test"); + int num_threads = FLAGS_mt_metrics_test_num_threads; + int num_counters = 1000; + boost::function f = + boost::bind(RegisterCounters, entity, "prefix", num_counters); + RunWithManyThreads(&f, num_threads); + ASSERT_EQ(num_threads * num_counters, entity->UnsafeMetricsMapForTests().size()); +} + +} // namespace kudu diff --git a/src/kudu/util/mt-threadlocal-test.cc b/src/kudu/util/mt-threadlocal-test.cc new file mode 100644 index 000000000000..c3479ff1f401 --- /dev/null +++ b/src/kudu/util/mt-threadlocal-test.cc @@ -0,0 +1,322 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/env.h" +#include "kudu/util/locks.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadlocal.h" + +using std::unordered_set; +using std::vector; +using strings::Substitute; + +namespace kudu { +namespace threadlocal { + +class ThreadLocalTest : public KuduTest {}; + +const int kTargetCounterVal = 1000000; + +class Counter; +typedef unordered_set CounterPtrSet; +typedef Mutex RegistryLockType; +typedef simple_spinlock CounterLockType; + +// Registry to provide reader access to the thread-local Counters. +// The methods are only thread-safe if the calling thread holds the lock. +class CounterRegistry { + public: + CounterRegistry() { + } + + RegistryLockType* get_lock() const { + return &lock_; + } + + bool RegisterUnlocked(Counter* counter) { + LOG(INFO) << "Called RegisterUnlocked()"; + return InsertIfNotPresent(&counters_, counter); + } + + bool UnregisterUnlocked(Counter* counter) { + LOG(INFO) << "Called UnregisterUnlocked()"; + return counters_.erase(counter) > 0; + } + + CounterPtrSet* GetCountersUnlocked() { + return &counters_; + } + + private: + mutable RegistryLockType lock_; + CounterPtrSet counters_; + DISALLOW_COPY_AND_ASSIGN(CounterRegistry); +}; + +// A simple Counter class that registers itself with a CounterRegistry. 
+class Counter {
+ public:
+  Counter(CounterRegistry* registry, int val)
+      : tid_(Env::Default()->gettid()),
+        registry_(CHECK_NOTNULL(registry)),
+        val_(val) {
+    LOG(INFO) << "Counter::Counter(): tid = " << tid_ << ", addr = " << this << ", val = " << val_;
+    boost::lock_guard<RegistryLockType> reg_lock(*registry_->get_lock());
+    CHECK(registry_->RegisterUnlocked(this));
+  }
+
+  ~Counter() {
+    LOG(INFO) << "Counter::~Counter(): tid = " << tid_ << ", addr = " << this << ", val = " << val_;
+    boost::lock_guard<RegistryLockType> reg_lock(*registry_->get_lock());
+    boost::lock_guard<CounterLockType> self_lock(lock_);
+    LOG(INFO) << tid_ << ": deleting self from registry...";
+    CHECK(registry_->UnregisterUnlocked(this));
+  }
+
+  uint64_t tid() {
+    return tid_;
+  }
+
+  CounterLockType* get_lock() const {
+    return &lock_;
+  }
+
+  void IncrementUnlocked() {
+    val_++;
+  }
+
+  int GetValueUnlocked() {
+    return val_;
+  }
+
+ private:
+  // We expect that most of the time this lock will be uncontended.
+  mutable CounterLockType lock_;
+
+  // TID of thread that constructed this object.
+  const uint64_t tid_;
+
+  // Register / unregister ourselves with this on construction / destruction.
+  CounterRegistry* const registry_;
+
+  // Current value of the counter.
+  int val_;
+
+  DISALLOW_COPY_AND_ASSIGN(Counter);
+};
+
+// Create a new THREAD_LOCAL Counter and loop an increment operation on it.
+static void RegisterCounterAndLoopIncr(CounterRegistry* registry,
+                                       CountDownLatch* counters_ready,
+                                       CountDownLatch* reader_ready,
+                                       CountDownLatch* counters_done,
+                                       CountDownLatch* reader_done) {
+  BLOCK_STATIC_THREAD_LOCAL(Counter, counter, registry, 0);
+  // Inform the reader that we are alive.
+  counters_ready->CountDown();
+  // Let the reader initialize before we start counting.
+  reader_ready->Wait();
+  // Now rock & roll on the counting loop.
+  for (int i = 0; i < kTargetCounterVal; i++) {
+    boost::lock_guard<CounterLockType> l(*counter->get_lock());
+    counter->IncrementUnlocked();
+  }
+  // Let the reader know we're ready for him to verify our counts.
+ counters_done->CountDown(); + // Wait until the reader is done before we exit the thread, which will call + // delete on the Counter. + reader_done->Wait(); +} + +// Iterate over the registered counters and their values. +static uint64_t Iterate(CounterRegistry* registry, int expected_counters) { + uint64_t sum = 0; + int seen_counters = 0; + boost::lock_guard l(*registry->get_lock()); + for (Counter* counter : *registry->GetCountersUnlocked()) { + uint64_t value; + { + boost::lock_guard l(*counter->get_lock()); + value = counter->GetValueUnlocked(); + } + LOG(INFO) << "tid " << counter->tid() << " (counter " << counter << "): " << value; + sum += value; + seen_counters++; + } + CHECK_EQ(expected_counters, seen_counters); + return sum; +} + +static void TestThreadLocalCounters(CounterRegistry* registry, const int num_threads) { + LOG(INFO) << "Starting threads..."; + vector > threads; + + CountDownLatch counters_ready(num_threads); + CountDownLatch reader_ready(1); + CountDownLatch counters_done(num_threads); + CountDownLatch reader_done(1); + for (int i = 0; i < num_threads; i++) { + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("t$0", i), + &RegisterCounterAndLoopIncr, registry, &counters_ready, &reader_ready, + &counters_done, &reader_done, &new_thread)); + threads.push_back(new_thread); + } + + // Wait for all threads to start and register their Counters. + counters_ready.Wait(); + CHECK_EQ(0, Iterate(registry, num_threads)); + LOG(INFO) << "--"; + + // Let the counters start spinning. + reader_ready.CountDown(); + + // Try to catch them in the act, just for kicks. + for (int i = 0; i < 2; i++) { + Iterate(registry, num_threads); + LOG(INFO) << "--"; + SleepFor(MonoDelta::FromMicroseconds(1)); + } + + // Wait until they're done and assure they sum up properly. 
+ counters_done.Wait(); + LOG(INFO) << "Checking Counter sums..."; + CHECK_EQ(kTargetCounterVal * num_threads, Iterate(registry, num_threads)); + LOG(INFO) << "Counter sums add up!"; + reader_done.CountDown(); + + LOG(INFO) << "Joining & deleting threads..."; + for (scoped_refptr thread : threads) { + CHECK_OK(ThreadJoiner(thread.get()).Join()); + } + LOG(INFO) << "Done."; +} + +TEST_F(ThreadLocalTest, TestConcurrentCounters) { + // Run this multiple times to ensure we don't leave remnants behind in the + // CounterRegistry. + CounterRegistry registry; + for (int i = 0; i < 3; i++) { + TestThreadLocalCounters(®istry, 8); + } +} + +// Test class that stores a string in a static thread local member. +// This class cannot be instantiated. The methods are all static. +class ThreadLocalString { + public: + static void set(std::string value); + static const std::string& get(); + private: + ThreadLocalString() { + } + DECLARE_STATIC_THREAD_LOCAL(std::string, value_); + DISALLOW_COPY_AND_ASSIGN(ThreadLocalString); +}; + +DEFINE_STATIC_THREAD_LOCAL(std::string, ThreadLocalString, value_); + +void ThreadLocalString::set(std::string value) { + INIT_STATIC_THREAD_LOCAL(std::string, value_); + *value_ = value; +} + +const std::string& ThreadLocalString::get() { + INIT_STATIC_THREAD_LOCAL(std::string, value_); + return *value_; +} + +static void RunAndAssign(CountDownLatch* writers_ready, + CountDownLatch *readers_ready, + CountDownLatch *all_done, + CountDownLatch *threads_exiting, + const std::string& in, + std::string* out) { + writers_ready->Wait(); + // Ensure it starts off as an empty string. 
+ CHECK_EQ("", ThreadLocalString::get()); + ThreadLocalString::set(in); + + readers_ready->Wait(); + out->assign(ThreadLocalString::get()); + all_done->Wait(); + threads_exiting->CountDown(); +} + +TEST_F(ThreadLocalTest, TestTLSMember) { + const int num_threads = 8; + + vector writers_ready; + vector readers_ready; + vector out_strings; + vector > threads; + + ElementDeleter writers_deleter(&writers_ready); + ElementDeleter readers_deleter(&readers_ready); + ElementDeleter out_strings_deleter(&out_strings); + + CountDownLatch all_done(1); + CountDownLatch threads_exiting(num_threads); + + LOG(INFO) << "Starting threads..."; + for (int i = 0; i < num_threads; i++) { + writers_ready.push_back(new CountDownLatch(1)); + readers_ready.push_back(new CountDownLatch(1)); + out_strings.push_back(new std::string()); + scoped_refptr new_thread; + CHECK_OK(kudu::Thread::Create("test", strings::Substitute("t$0", i), + &RunAndAssign, writers_ready[i], readers_ready[i], + &all_done, &threads_exiting, Substitute("$0", i), out_strings[i], &new_thread)); + threads.push_back(new_thread); + } + + // Unlatch the threads in order. + LOG(INFO) << "Writing to thread locals..."; + for (int i = 0; i < num_threads; i++) { + writers_ready[i]->CountDown(); + } + LOG(INFO) << "Reading from thread locals..."; + for (int i = 0; i < num_threads; i++) { + readers_ready[i]->CountDown(); + } + all_done.CountDown(); + // threads_exiting acts as a memory barrier. 
+ threads_exiting.Wait(); + for (int i = 0; i < num_threads; i++) { + ASSERT_EQ(Substitute("$0", i), *out_strings[i]); + LOG(INFO) << "Read " << *out_strings[i]; + } + + LOG(INFO) << "Joining & deleting threads..."; + for (scoped_refptr thread : threads) { + CHECK_OK(ThreadJoiner(thread.get()).Join()); + } +} + +} // namespace threadlocal +} // namespace kudu diff --git a/src/kudu/util/mutex.cc b/src/kudu/util/mutex.cc new file mode 100644 index 000000000000..5996daf79aad --- /dev/null +++ b/src/kudu/util/mutex.cc @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Portions (c) 2011 The Chromium Authors. + +#include "kudu/util/mutex.h" + +#include + +#include "kudu/util/debug-util.h" +#include "kudu/util/env.h" + +namespace kudu { + +Mutex::Mutex() +#ifndef NDEBUG + : owning_tid_(0), + stack_trace_(new StackTrace()) +#endif +{ +#ifndef NDEBUG + // In debug, setup attributes for lock error checking. + pthread_mutexattr_t mta; + int rv = pthread_mutexattr_init(&mta); + DCHECK_EQ(0, rv) << ". " << strerror(rv); + rv = pthread_mutexattr_settype(&mta, PTHREAD_MUTEX_ERRORCHECK); + DCHECK_EQ(0, rv) << ". " << strerror(rv); + rv = pthread_mutex_init(&native_handle_, &mta); + DCHECK_EQ(0, rv) << ". 
" << strerror(rv); + rv = pthread_mutexattr_destroy(&mta); + DCHECK_EQ(0, rv) << ". " << strerror(rv); +#else + // In release, go with the default lock attributes. + pthread_mutex_init(&native_handle_, NULL); +#endif +} + +Mutex::~Mutex() { + int rv = pthread_mutex_destroy(&native_handle_); + DCHECK_EQ(0, rv) << ". " << strerror(rv); +} + +bool Mutex::TryAcquire() { + int rv = pthread_mutex_trylock(&native_handle_); +#ifndef NDEBUG + DCHECK(rv == 0 || rv == EBUSY) << ". " << strerror(rv) + << ". Owner tid: " << owning_tid_ << "; Self tid: " << Env::Default()->gettid() + << "; Owner stack: " << std::endl << stack_trace_->Symbolize();; + if (rv == 0) { + CheckUnheldAndMark(); + } +#endif + return rv == 0; +} + +void Mutex::Acquire() { + int rv = pthread_mutex_lock(&native_handle_); +#ifndef NDEBUG + DCHECK_EQ(0, rv) << ". " << strerror(rv) + << ". Owner tid: " << owning_tid_ << "; Self tid: " << Env::Default()->gettid() + << "; Owner stack: " << std::endl << stack_trace_->Symbolize();; + CheckUnheldAndMark(); +#endif +} + +void Mutex::Release() { +#ifndef NDEBUG + CheckHeldAndUnmark(); +#endif + int rv = pthread_mutex_unlock(&native_handle_); + DCHECK_EQ(0, rv) << ". " << strerror(rv); +} + +#ifndef NDEBUG +void Mutex::AssertAcquired() const { + DCHECK_EQ(Env::Default()->gettid(), owning_tid_); +} + +void Mutex::CheckHeldAndUnmark() { + AssertAcquired(); + owning_tid_ = 0; + stack_trace_->Reset(); +} + +void Mutex::CheckUnheldAndMark() { + DCHECK_EQ(0, owning_tid_); + owning_tid_ = Env::Default()->gettid(); + stack_trace_->Collect(); +} + +#endif + +} // namespace kudu diff --git a/src/kudu/util/mutex.h b/src/kudu/util/mutex.h new file mode 100644 index 000000000000..fb6c53c0ffea --- /dev/null +++ b/src/kudu/util/mutex.h @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_MUTEX_H +#define KUDU_UTIL_MUTEX_H + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" + +namespace kudu { + +class StackTrace; + +// A lock built around pthread_mutex_t. Does not allow recursion. +// +// The following checks will be performed in DEBUG mode: +// Acquire(), TryAcquire() - the lock isn't already held. +// Release() - the lock is already held by this thread. +// +class Mutex { + public: + Mutex(); + ~Mutex(); + + void Acquire(); + void Release(); + bool TryAcquire(); + + void lock() { Acquire(); } + void unlock() { Release(); } + bool try_lock() { return TryAcquire(); } + +#ifndef NDEBUG + void AssertAcquired() const; +#else + void AssertAcquired() const {} +#endif + + private: + friend class ConditionVariable; + + pthread_mutex_t native_handle_; + +#ifndef NDEBUG + // Members and routines taking care of locks assertions. + void CheckHeldAndUnmark(); + void CheckUnheldAndMark(); + + // All private data is implicitly protected by native_handle_. + // Be VERY careful to only access members under that lock. + pid_t owning_tid_; + gscoped_ptr stack_trace_; +#endif + + DISALLOW_COPY_AND_ASSIGN(Mutex); +}; + +// A helper class that acquires the given Lock while the MutexLock is in scope. +class MutexLock { + public: + struct AlreadyAcquired {}; + + // Acquires 'lock' (must be unheld) and wraps around it. 
+ // + // Sample usage: + // { + // MutexLock l(lock_); // acquired + // ... + // } // released + explicit MutexLock(Mutex& lock) + : lock_(&lock), + owned_(true) { + lock_->Acquire(); + } + + // Wraps around 'lock' (must already be held by this thread). + // + // Sample usage: + // { + // lock_.Acquire(); // acquired + // ... + // MutexLock l(lock_, AlreadyAcquired()); + // ... + // } // released + MutexLock(Mutex& lock, const AlreadyAcquired&) + : lock_(&lock), + owned_(true) { + lock_->AssertAcquired(); + } + + void Lock() { + DCHECK(!owned_); + lock_->Acquire(); + owned_ = true; + } + + void Unlock() { + DCHECK(owned_); + lock_->AssertAcquired(); + lock_->Release(); + owned_ = false; + } + + ~MutexLock() { + if (owned_) { + Unlock(); + } + } + + bool OwnsLock() const { + return owned_; + } + + private: + Mutex* lock_; + bool owned_; + DISALLOW_COPY_AND_ASSIGN(MutexLock); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_MUTEX_H */ diff --git a/src/kudu/util/net/dns_resolver-test.cc b/src/kudu/util/net/dns_resolver-test.cc new file mode 100644 index 000000000000..55be2844df73 --- /dev/null +++ b/src/kudu/util/net/dns_resolver-test.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/net/dns_resolver.h" + +#include +#include +#include + +#include "kudu/gutil/strings/util.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/test_util.h" + +using std::vector; + +namespace kudu { + +class DnsResolverTest : public KuduTest { + protected: + DnsResolver resolver_; +}; + +TEST_F(DnsResolverTest, TestResolution) { + vector addrs; + Synchronizer s; + { + HostPort hp("localhost", 12345); + resolver_.ResolveAddresses(hp, &addrs, s.AsStatusCallback()); + } + ASSERT_OK(s.Wait()); + ASSERT_TRUE(!addrs.empty()); + for (const Sockaddr& addr : addrs) { + LOG(INFO) << "Address: " << addr.ToString(); + EXPECT_TRUE(HasPrefixString(addr.ToString(), "127.")); + EXPECT_TRUE(HasSuffixString(addr.ToString(), ":12345")); + } +} + +} // namespace kudu diff --git a/src/kudu/util/net/dns_resolver.cc b/src/kudu/util/net/dns_resolver.cc new file mode 100644 index 000000000000..4c37a95ed0e3 --- /dev/null +++ b/src/kudu/util/net/dns_resolver.cc @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/net/dns_resolver.h" + +#include +#include +#include +#include + +#include "kudu/util/flag_tags.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" + +DEFINE_int32(dns_num_resolver_threads, 1, "The number of threads to use for DNS resolution"); +TAG_FLAG(dns_num_resolver_threads, advanced); + +using std::vector; + +namespace kudu { + +DnsResolver::DnsResolver() { + CHECK_OK(ThreadPoolBuilder("dns-resolver") + .set_max_threads(FLAGS_dns_num_resolver_threads) + .Build(&pool_)); +} + +DnsResolver::~DnsResolver() { + pool_->Shutdown(); +} + +namespace { +static void DoResolution(const HostPort &hostport, vector* addresses, + StatusCallback cb) { + cb.Run(hostport.ResolveAddresses(addresses)); +} +} // anonymous namespace + +void DnsResolver::ResolveAddresses(const HostPort& hostport, + vector* addresses, + const StatusCallback& cb) { + Status s = pool_->SubmitFunc(boost::bind(&DoResolution, hostport, addresses, cb)); + if (!s.ok()) { + cb.Run(s); + } +} + +} // namespace kudu diff --git a/src/kudu/util/net/dns_resolver.h b/src/kudu/util/net/dns_resolver.h new file mode 100644 index 000000000000..4232174a00cb --- /dev/null +++ b/src/kudu/util/net/dns_resolver.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_NET_DNS_RESOLVER_H +#define KUDU_UTIL_NET_DNS_RESOLVER_H + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/util/async_util.h" +#include "kudu/util/status.h" + +namespace kudu { + +class HostPort; +class Sockaddr; +class ThreadPool; + +// DNS Resolver which supports async address resolution. +class DnsResolver { + public: + DnsResolver(); + ~DnsResolver(); + + // Resolve any addresses corresponding to this host:port pair. + // Note that a host may resolve to more than one IP address. + // + // 'addresses' may be NULL, in which case this function simply checks that + // the host/port pair can be resolved, without returning anything. + // + // When the result is available, or an error occurred, 'cb' is called + // with the result Status. + // + // NOTE: the callback should be fast since it is called by the DNS + // resolution thread. + // NOTE: in some rare cases, the callback may also be called inline + // from this function call, on the caller's thread. + void ResolveAddresses(const HostPort& hostport, + std::vector* addresses, + const StatusCallback& cb); + + private: + gscoped_ptr pool_; + + DISALLOW_COPY_AND_ASSIGN(DnsResolver); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_NET_DNS_RESOLVER_H */ diff --git a/src/kudu/util/net/net_util-test.cc b/src/kudu/util/net/net_util-test.cc new file mode 100644 index 000000000000..b1c33ef1f762 --- /dev/null +++ b/src/kudu/util/net/net_util-test.cc @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include + +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/socket.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class NetUtilTest : public KuduTest { + protected: + Status DoParseBindAddresses(const string& input, string* result) { + vector addrs; + RETURN_NOT_OK(ParseAddressList(input, kDefaultPort, &addrs)); + std::sort(addrs.begin(), addrs.end()); + + vector addr_strs; + for (const Sockaddr& addr : addrs) { + addr_strs.push_back(addr.ToString()); + } + *result = JoinStrings(addr_strs, ","); + return Status::OK(); + } + + static const uint16_t kDefaultPort = 7150; +}; + +TEST(SockaddrTest, Test) { + Sockaddr addr; + ASSERT_OK(addr.ParseString("1.1.1.1:12345", 12345)); + ASSERT_EQ(12345, addr.port()); +} + +TEST_F(NetUtilTest, TestParseAddresses) { + string ret; + ASSERT_OK(DoParseBindAddresses("0.0.0.0:12345", &ret)); + ASSERT_EQ("0.0.0.0:12345", ret); + + ASSERT_OK(DoParseBindAddresses("0.0.0.0", &ret)); + ASSERT_EQ("0.0.0.0:7150", ret); + + ASSERT_OK(DoParseBindAddresses("0.0.0.0:12345, 0.0.0.0:12346", &ret)); + ASSERT_EQ("0.0.0.0:12345,0.0.0.0:12346", ret); + + // Test some invalid addresses. 
+ Status s = DoParseBindAddresses("0.0.0.0:xyz", &ret); + ASSERT_STR_CONTAINS(s.ToString(), "Invalid port"); + + s = DoParseBindAddresses("0.0.0.0:100000", &ret); + ASSERT_STR_CONTAINS(s.ToString(), "Invalid port"); + + s = DoParseBindAddresses("0.0.0.0:", &ret); + ASSERT_STR_CONTAINS(s.ToString(), "Invalid port"); +} + +TEST_F(NetUtilTest, TestResolveAddresses) { + HostPort hp("localhost", 12345); + vector addrs; + ASSERT_OK(hp.ResolveAddresses(&addrs)); + ASSERT_TRUE(!addrs.empty()); + for (const Sockaddr& addr : addrs) { + LOG(INFO) << "Address: " << addr.ToString(); + EXPECT_TRUE(HasPrefixString(addr.ToString(), "127.")); + EXPECT_TRUE(HasSuffixString(addr.ToString(), ":12345")); + EXPECT_TRUE(addr.IsAnyLocalAddress()); + } + + ASSERT_OK(hp.ResolveAddresses(nullptr)); +} + +// Ensure that we are able to do a reverse DNS lookup on various IP addresses. +// The reverse lookups should never fail, but may return numeric strings. +TEST_F(NetUtilTest, TestReverseLookup) { + string host; + Sockaddr addr; + HostPort hp; + ASSERT_OK(addr.ParseString("0.0.0.0:12345", 0)); + EXPECT_EQ(12345, addr.port()); + ASSERT_OK(HostPortFromSockaddrReplaceWildcard(addr, &hp)); + EXPECT_NE("0.0.0.0", hp.host()); + EXPECT_NE("", hp.host()); + EXPECT_EQ(12345, hp.port()); + + ASSERT_OK(addr.ParseString("127.0.0.1:12345", 0)); + ASSERT_OK(HostPortFromSockaddrReplaceWildcard(addr, &hp)); + EXPECT_EQ("127.0.0.1", hp.host()); + EXPECT_EQ(12345, hp.port()); +} + +TEST_F(NetUtilTest, TestLsof) { + Socket s; + ASSERT_OK(s.Init(0)); + + Sockaddr addr; // wildcard + ASSERT_OK(s.BindAndListen(addr, 1)); + + ASSERT_OK(s.GetSocketAddress(&addr)); + ASSERT_NE(addr.port(), 0); + vector lsof_lines; + TryRunLsof(addr, &lsof_lines); + SCOPED_TRACE(JoinStrings(lsof_lines, "\n")); + + ASSERT_GE(lsof_lines.size(), 3); + ASSERT_STR_CONTAINS(lsof_lines[2], "net_util-test"); +} + +TEST_F(NetUtilTest, TestGetFQDN) { + string fqdn; + ASSERT_OK(GetFQDN(&fqdn)); + LOG(INFO) << "fqdn is " << fqdn; +} + +} // 
namespace kudu diff --git a/src/kudu/util/net/net_util.cc b/src/kudu/util/net/net_util.cc new file mode 100644 index 000000000000..7c7ed31633f5 --- /dev/null +++ b/src/kudu/util/net/net_util.cc @@ -0,0 +1,285 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/errno.h" +#include "kudu/util/faststring.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/subprocess.h" + +// Mac OS 10.9 does not appear to define HOST_NAME_MAX in unistd.h +#ifndef HOST_NAME_MAX +#define HOST_NAME_MAX 64 +#endif + +using std::unordered_set; +using std::vector; +using strings::Substitute; + +namespace kudu { + +namespace { +struct AddrinfoDeleter { + void operator()(struct addrinfo* info) { + freeaddrinfo(info); + } +}; +} + +HostPort::HostPort() + : host_(""), + port_(0) { +} + +HostPort::HostPort(std::string host, uint16_t port) + : host_(std::move(host)), port_(port) {} + +HostPort::HostPort(const Sockaddr& addr) + : host_(addr.host()), + port_(addr.port()) { +} + +Status HostPort::ParseString(const string& str, uint16_t default_port) { + std::pair p = strings::Split(str, strings::delimiter::Limit(":", 1)); + + // Strip any whitespace from the host. + StripWhiteSpace(&p.first); + + // Parse the port. + uint32_t port; + if (p.second.empty() && strcount(str, ':') == 0) { + // No port specified. 
+ port = default_port; + } else if (!SimpleAtoi(p.second, &port) || + port > 65535) { + return Status::InvalidArgument("Invalid port", str); + } + + host_.swap(p.first); + port_ = port; + return Status::OK(); +} + +Status HostPort::ResolveAddresses(vector* addresses) const { + TRACE_EVENT1("net", "HostPort::ResolveAddresses", + "host", host_); + struct addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_STREAM; + struct addrinfo* res = nullptr; + int rc; + LOG_SLOW_EXECUTION(WARNING, 200, + Substitute("resolving address for $0", host_)) { + rc = getaddrinfo(host_.c_str(), nullptr, &hints, &res); + } + if (rc != 0) { + return Status::NetworkError( + StringPrintf("Unable to resolve address '%s'", host_.c_str()), + gai_strerror(rc)); + } + gscoped_ptr scoped_res(res); + for (; res != nullptr; res = res->ai_next) { + CHECK_EQ(res->ai_family, AF_INET); + struct sockaddr_in* addr = reinterpret_cast(res->ai_addr); + addr->sin_port = htons(port_); + Sockaddr sockaddr(*addr); + if (addresses) { + addresses->push_back(sockaddr); + } + VLOG(2) << "Resolved address " << sockaddr.ToString() + << " for host/port " << ToString(); + } + return Status::OK(); +} + +Status HostPort::ParseStrings(const string& comma_sep_addrs, + uint16_t default_port, + vector* res) { + vector addr_strings = strings::Split(comma_sep_addrs, ",", strings::SkipEmpty()); + for (const string& addr_string : addr_strings) { + HostPort host_port; + RETURN_NOT_OK(host_port.ParseString(addr_string, default_port)); + res->push_back(host_port); + } + return Status::OK(); +} + +string HostPort::ToString() const { + return Substitute("$0:$1", host_, port_); +} + +string HostPort::ToCommaSeparatedString(const vector& hostports) { + vector hostport_strs; + for (const HostPort& hostport : hostports) { + hostport_strs.push_back(hostport.ToString()); + } + return JoinStrings(hostport_strs, ","); +} + +bool IsPrivilegedPort(uint16_t port) { + return port <= 1024 && 
port != 0; +} + +Status ParseAddressList(const std::string& addr_list, + uint16_t default_port, + std::vector* addresses) { + vector host_ports; + RETURN_NOT_OK(HostPort::ParseStrings(addr_list, default_port, &host_ports)); + unordered_set uniqued; + + for (const HostPort& host_port : host_ports) { + vector this_addresses; + RETURN_NOT_OK(host_port.ResolveAddresses(&this_addresses)); + + // Only add the unique ones -- the user may have specified + // some IP addresses in multiple ways + for (const Sockaddr& addr : this_addresses) { + if (InsertIfNotPresent(&uniqued, addr)) { + addresses->push_back(addr); + } else { + LOG(INFO) << "Address " << addr.ToString() << " for " << host_port.ToString() + << " duplicates an earlier resolved entry."; + } + } + } + return Status::OK(); +} + +Status GetHostname(string* hostname) { + TRACE_EVENT0("net", "GetHostname"); + char name[HOST_NAME_MAX]; + int ret = gethostname(name, HOST_NAME_MAX); + if (ret != 0) { + return Status::NetworkError("Unable to determine local hostname", + ErrnoToString(errno), + errno); + } + *hostname = name; + return Status::OK(); +} + +Status GetFQDN(string* hostname) { + TRACE_EVENT0("net", "GetFQDN"); + // Start with the non-qualified hostname + RETURN_NOT_OK(GetHostname(hostname)); + + struct addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_socktype = SOCK_DGRAM; + hints.ai_flags = AI_CANONNAME; + + struct addrinfo* result; + LOG_SLOW_EXECUTION(WARNING, 200, + Substitute("looking up canonical hostname for localhost $0", hostname)) { + TRACE_EVENT0("net", "getaddrinfo"); + int rc = getaddrinfo(hostname->c_str(), nullptr, &hints, &result); + if (rc != 0) { + return Status::NetworkError("Unable to lookup FQDN", ErrnoToString(errno), errno); + } + } + + *hostname = result->ai_canonname; + freeaddrinfo(result); + return Status::OK(); +} + +Status SockaddrFromHostPort(const HostPort& host_port, Sockaddr* addr) { + vector addrs; + RETURN_NOT_OK(host_port.ResolveAddresses(&addrs)); + if 
(addrs.empty()) { + return Status::NetworkError("Unable to resolve address", host_port.ToString()); + } + *addr = addrs[0]; + if (addrs.size() > 1) { + VLOG(1) << "Hostname " << host_port.host() << " resolved to more than one address. " + << "Using address: " << addr->ToString(); + } + return Status::OK(); +} + +Status HostPortFromSockaddrReplaceWildcard(const Sockaddr& addr, HostPort* hp) { + string host; + if (addr.IsWildcard()) { + RETURN_NOT_OK(GetFQDN(&host)); + } else { + host = addr.host(); + } + hp->set_host(host); + hp->set_port(addr.port()); + return Status::OK(); +} + +void TryRunLsof(const Sockaddr& addr, vector* log) { +#if defined(__APPLE__) + string cmd = strings::Substitute( + "lsof -n -i 'TCP:$0' -sTCP:LISTEN ; " + "for pid in $$(lsof -F p -n -i 'TCP:$0' -sTCP:LISTEN | cut -f 2 -dp) ; do" + " pstree $$pid || ps h -p $$pid;" + "done", + addr.port()); +#else + // Little inline bash script prints the full ancestry of any pid listening + // on the same port as 'addr'. We could use 'pstree -s', but that option + // doesn't exist on el6. + string cmd = strings::Substitute( + "export PATH=$$PATH:/usr/sbin ; " + "lsof -n -i 'TCP:$0' -sTCP:LISTEN ; " + "for pid in $$(lsof -F p -n -i 'TCP:$0' -sTCP:LISTEN | cut -f 2 -dp) ; do" + " while [ $$pid -gt 1 ] ; do" + " ps h -fp $$pid ;" + " stat=($$( argv = { "bash", "-c", cmd }; + string results; + Status s = Subprocess::Call(argv, &results); + if (PREDICT_FALSE(!s.ok())) { + LOG_STRING(WARNING, log) << s.ToString(); + } + LOG_STRING(WARNING, log) << results; +} + +} // namespace kudu diff --git a/src/kudu/util/net/net_util.h b/src/kudu/util/net/net_util.h new file mode 100644 index 000000000000..d0bb92e24b47 --- /dev/null +++ b/src/kudu/util/net/net_util.h @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_NET_NET_UTIL_H +#define KUDU_UTIL_NET_NET_UTIL_H + +#include +#include + +#include "kudu/util/status.h" + +namespace kudu { + +class Sockaddr; + +// A container for a host:port pair. +class HostPort { + public: + HostPort(); + HostPort(std::string host, uint16_t port); + explicit HostPort(const Sockaddr& addr); + + // Parse a "host:port" pair into this object. + // If there is no port specified in the string, then 'default_port' is used. + Status ParseString(const std::string& str, uint16_t default_port); + + // Resolve any addresses corresponding to this host:port pair. + // Note that a host may resolve to more than one IP address. + // + // 'addresses' may be NULL, in which case this function simply checks that + // the host/port pair can be resolved, without returning anything. + Status ResolveAddresses(std::vector* addresses) const; + + std::string ToString() const; + + const std::string& host() const { return host_; } + void set_host(const std::string& host) { host_ = host; } + + uint16_t port() const { return port_; } + void set_port(uint16_t port) { port_ = port; } + + // Parse a comma separated list of "host:port" pairs into a vector + // HostPort objects. If no port is specified for an entry in the + // comma separated list, 'default_port' is used for that entry's + // pair. 
+ static Status ParseStrings( + const std::string& comma_sep_addrs, uint16_t default_port, std::vector* res); + + // Takes a vector of HostPort objects and returns a comma separated + // string containing of "host:port" pairs. This method is the + // "inverse" of ParseStrings(). + static std::string ToCommaSeparatedString(const std::vector& host_ports); + + private: + std::string host_; + uint16_t port_; +}; + +// Parse and resolve the given comma-separated list of addresses. +// +// The resulting addresses will be resolved, made unique, and added to +// the 'addresses' vector. +// +// Any elements which do not include a port will be assigned 'default_port'. +Status ParseAddressList(const std::string& addr_list, + uint16_t default_port, + std::vector* addresses); + +// Return true if the given port is likely to need root privileges to bind to. +bool IsPrivilegedPort(uint16_t port); + +// Return the local machine's hostname. +Status GetHostname(std::string* hostname); + +// Return the local machine's FQDN. +Status GetFQDN(std::string* fqdn); + +// Returns a single socket address from a HostPort. +// If the hostname resolves to multiple addresses, returns the first in the +// list and logs a message in verbose mode. +Status SockaddrFromHostPort(const HostPort& host_port, Sockaddr* addr); + +// Converts the given Sockaddr into a HostPort, substituting the FQDN +// in the case that the provided address is the wildcard. +// +// In the case of other addresses, the returned HostPort will contain just the +// stringified form of the IP. +Status HostPortFromSockaddrReplaceWildcard(const Sockaddr& addr, HostPort* hp); + +// Try to run 'lsof' to determine which process is preventing binding to +// the given 'addr'. If pids can be determined, outputs full 'ps' and 'pstree' +// output for that process. +// +// Output is issued to the log at WARNING level, or appended to 'log' if it +// is non-NULL (mostly useful for testing). 
+void TryRunLsof(const Sockaddr& addr, std::vector* log = NULL); + +} // namespace kudu +#endif diff --git a/src/kudu/util/net/sockaddr.cc b/src/kudu/util/net/sockaddr.cc new file mode 100644 index 000000000000..3385e8c7bf42 --- /dev/null +++ b/src/kudu/util/net/sockaddr.cc @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/net/sockaddr.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/endian.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/stopwatch.h" + +namespace kudu { + +using strings::Substitute; + +/// +/// Sockaddr +/// +Sockaddr::Sockaddr() { + memset(&addr_, 0, sizeof(addr_)); + addr_.sin_family = AF_INET; + addr_.sin_addr.s_addr = INADDR_ANY; +} + +Sockaddr::Sockaddr(const struct sockaddr_in& addr) { + memcpy(&addr_, &addr, sizeof(struct sockaddr_in)); +} + +Status Sockaddr::ParseString(const std::string& s, uint16_t default_port) { + HostPort hp; + RETURN_NOT_OK(hp.ParseString(s, default_port)); + + if (inet_pton(AF_INET, hp.host().c_str(), &addr_.sin_addr) != 1) { + return Status::InvalidArgument("Invalid IP address", hp.host()); + } + set_port(hp.port()); + return Status::OK(); +} + +Sockaddr& Sockaddr::operator=(const struct sockaddr_in &addr) { + memcpy(&addr_, &addr, sizeof(struct sockaddr_in)); + return *this; +} + +bool Sockaddr::operator==(const Sockaddr& other) const { + return memcmp(&other.addr_, &addr_, sizeof(addr_)) == 0; +} + +bool Sockaddr::operator<(const Sockaddr &rhs) const { + return addr_.sin_addr.s_addr < rhs.addr_.sin_addr.s_addr; +} + +uint32_t Sockaddr::HashCode() const { + uint32_t ret = addr_.sin_addr.s_addr; + ret ^= (addr_.sin_port * 7919); + return ret; +} + +void Sockaddr::set_port(int port) { + addr_.sin_port = htons(port); +} + +int Sockaddr::port() const { + return ntohs(addr_.sin_port); +} + +std::string Sockaddr::host() const { + char str[INET_ADDRSTRLEN]; + ::inet_ntop(AF_INET, &addr_.sin_addr, str, INET_ADDRSTRLEN); + return str; +} + +const struct sockaddr_in& Sockaddr::addr() const { + return addr_; +} + +std::string Sockaddr::ToString() const { + char str[INET_ADDRSTRLEN]; + ::inet_ntop(AF_INET, &addr_.sin_addr, str, INET_ADDRSTRLEN); + return 
StringPrintf("%s:%d", str, port()); +} + +bool Sockaddr::IsWildcard() const { + return addr_.sin_addr.s_addr == 0; +} + +bool Sockaddr::IsAnyLocalAddress() const { + return (NetworkByteOrder::FromHost32(addr_.sin_addr.s_addr) >> 24) == 127; +} + +Status Sockaddr::LookupHostname(string* hostname) const { + char host[NI_MAXHOST]; + int flags = 0; + + int rc; + LOG_SLOW_EXECUTION(WARNING, 200, + Substitute("DNS reverse-lookup for $0", ToString())) { + rc = getnameinfo((struct sockaddr *) &addr_, sizeof(sockaddr_in), + host, NI_MAXHOST, + nullptr, 0, flags); + } + if (PREDICT_FALSE(rc != 0)) { + if (rc == EAI_SYSTEM) { + int errno_saved = errno; + return Status::NetworkError(Substitute("getnameinfo: $0", gai_strerror(rc)), + strerror(errno_saved), errno_saved); + } + return Status::NetworkError("getnameinfo", gai_strerror(rc), rc); + } + *hostname = host; + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/net/sockaddr.h b/src/kudu/util/net/sockaddr.h new file mode 100644 index 000000000000..09777f3e593f --- /dev/null +++ b/src/kudu/util/net/sockaddr.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_NET_SOCKADDR_H +#define KUDU_UTIL_NET_SOCKADDR_H + +#include +#include +#include + +#include "kudu/util/status.h" + +namespace kudu { + +/// +/// Represents a sockaddr. +/// +/// Currently only IPv4 is implemented. When IPv6 and UNIX domain are +/// implemented, this should become an abstract base class and those should be +/// multiple implementations. +/// +class Sockaddr { + public: + Sockaddr(); + explicit Sockaddr(const struct sockaddr_in &addr); + + // Parse a string IP address of the form "A.B.C.D:port", storing the result + // in this Sockaddr object. If no ':port' is specified, uses 'default_port'. + // Note that this function will not handle resolving hostnames. + // + // Returns a bad Status if the input is malformed. + Status ParseString(const std::string& s, uint16_t default_port); + + Sockaddr& operator=(const struct sockaddr_in &addr); + + bool operator==(const Sockaddr& other) const; + + // Compare the endpoints of two sockaddrs. + // The port number is ignored in this comparison. + bool operator<(const Sockaddr &rhs) const; + + uint32_t HashCode() const; + + std::string host() const; + + void set_port(int port); + int port() const; + const struct sockaddr_in& addr() const; + std::string ToString() const; + + // Returns true if the address is 0.0.0.0 + bool IsWildcard() const; + + // Returns true if the address is 127.*.*.* + bool IsAnyLocalAddress() const; + + // Does reverse DNS lookup of the address and stores it in hostname. 
+ Status LookupHostname(std::string* hostname) const; + + // the default auto-generated copy constructor is fine here + private: + struct sockaddr_in addr_; +}; + +} // namespace kudu + +// Specialize std::hash for Sockaddr +namespace std { +template<> +struct hash { + int operator()(const kudu::Sockaddr& addr) const { + return addr.HashCode(); + } +}; +} // namespace std +#endif diff --git a/src/kudu/util/net/socket.cc b/src/kudu/util/net/socket.cc new file mode 100644 index 000000000000..7a6933b358d1 --- /dev/null +++ b/src/kudu/util/net/socket.cc @@ -0,0 +1,558 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/net/socket.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/errno.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/monotime.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/net/sockaddr.h" +#include "kudu/util/random.h" +#include "kudu/util/random_util.h" +#include "kudu/util/subprocess.h" + +DEFINE_string(local_ip_for_outbound_sockets, "", + "IP to bind to when making outgoing socket connections. " + "This must be an IP address of the form A.B.C.D, not a hostname. " + "Advanced parameter, subject to change."); +TAG_FLAG(local_ip_for_outbound_sockets, experimental); + +DEFINE_bool(socket_inject_short_recvs, false, + "Inject short recv() responses which return less data than " + "requested"); +TAG_FLAG(socket_inject_short_recvs, hidden); +TAG_FLAG(socket_inject_short_recvs, unsafe); + +namespace kudu { + +Socket::Socket() + : fd_(-1) { +} + +Socket::Socket(int fd) + : fd_(fd) { +} + +void Socket::Reset(int fd) { + ignore_result(Close()); + fd_ = fd; +} + +int Socket::Release() { + int fd = fd_; + fd_ = -1; + return fd; +} + +Socket::~Socket() { + ignore_result(Close()); +} + +Status Socket::Close() { + if (fd_ < 0) + return Status::OK(); + int err, fd = fd_; + fd_ = -1; + if (::close(fd) < 0) { + err = errno; + return Status::NetworkError(std::string("close error: ") + + ErrnoToString(err), Slice(), err); + } + fd = -1; + return Status::OK(); +} + +Status Socket::Shutdown(bool shut_read, bool shut_write) { + DCHECK_GE(fd_, 0); + int flags = 0; + if (shut_read && shut_write) { + flags |= SHUT_RDWR; + } else if (shut_read) { + flags |= SHUT_RD; + } else if (shut_write) { + flags |= SHUT_WR; + } + if (::shutdown(fd_, flags) < 0) { + int err = errno; + return 
Status::NetworkError(std::string("shutdown error: ") + + ErrnoToString(err), Slice(), err); + } + return Status::OK(); +} + +int Socket::GetFd() const { + return fd_; +} + +bool Socket::IsTemporarySocketError(int err) { + return ((err == EAGAIN) || (err == EWOULDBLOCK) || (err == EINTR)); +} + +#if defined(__linux__) + +Status Socket::Init(int flags) { + int nonblocking_flag = (flags & FLAG_NONBLOCKING) ? SOCK_NONBLOCK : 0; + Reset(::socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC | nonblocking_flag, 0)); + if (fd_ < 0) { + int err = errno; + return Status::NetworkError(std::string("error opening socket: ") + + ErrnoToString(err), Slice(), err); + } + + return Status::OK(); +} + +#else + +Status Socket::Init(int flags) { + Reset(::socket(AF_INET, SOCK_STREAM, 0)); + if (fd_ < 0) { + int err = errno; + return Status::NetworkError(std::string("error opening socket: ") + + ErrnoToString(err), Slice(), err); + } + RETURN_NOT_OK(SetNonBlocking(flags & FLAG_NONBLOCKING)); + RETURN_NOT_OK(SetCloseOnExec()); + + // Disable SIGPIPE. + int set = 1; + if (setsockopt(fd_, SOL_SOCKET, SO_NOSIGPIPE, &set, sizeof(set)) == -1) { + int err = errno; + return Status::NetworkError(std::string("failed to set SO_NOSIGPIPE: ") + + ErrnoToString(err), Slice(), err); + } + + return Status::OK(); +} + +#endif // defined(__linux__) + +Status Socket::SetNoDelay(bool enabled) { + int flag = enabled ? 1 : 0; + if (setsockopt(fd_, IPPROTO_TCP, TCP_NODELAY, &flag, sizeof(flag)) == -1) { + int err = errno; + return Status::NetworkError(std::string("failed to set TCP_NODELAY: ") + + ErrnoToString(err), Slice(), err); + } + return Status::OK(); +} + +Status Socket::SetNonBlocking(bool enabled) { + int curflags = ::fcntl(fd_, F_GETFL, 0); + if (curflags == -1) { + int err = errno; + return Status::NetworkError( + StringPrintf("Failed to get file status flags on fd %d", fd_), + ErrnoToString(err), err); + } + int newflags = (enabled) ? 
(curflags | O_NONBLOCK) : (curflags & ~O_NONBLOCK); + if (::fcntl(fd_, F_SETFL, newflags) == -1) { + int err = errno; + if (enabled) { + return Status::NetworkError( + StringPrintf("Failed to set O_NONBLOCK on fd %d", fd_), + ErrnoToString(err), err); + } else { + return Status::NetworkError( + StringPrintf("Failed to clear O_NONBLOCK on fd %d", fd_), + ErrnoToString(err), err); + } + } + return Status::OK(); +} + +Status Socket::IsNonBlocking(bool* is_nonblock) const { + int curflags = ::fcntl(fd_, F_GETFL, 0); + if (curflags == -1) { + int err = errno; + return Status::NetworkError( + StringPrintf("Failed to get file status flags on fd %d", fd_), + ErrnoToString(err), err); + } + *is_nonblock = ((curflags & O_NONBLOCK) != 0); + return Status::OK(); +} + +Status Socket::SetCloseOnExec() { + int curflags = fcntl(fd_, F_GETFD, 0); + if (curflags == -1) { + int err = errno; + Reset(-1); + return Status::NetworkError(std::string("fcntl(F_GETFD) error: ") + + ErrnoToString(err), Slice(), err); + } + if (fcntl(fd_, F_SETFD, curflags | FD_CLOEXEC) == -1) { + int err = errno; + Reset(-1); + return Status::NetworkError(std::string("fcntl(F_SETFD) error: ") + + ErrnoToString(err), Slice(), err); + } + return Status::OK(); +} + +Status Socket::SetSendTimeout(const MonoDelta& timeout) { + return SetTimeout(SO_SNDTIMEO, "SO_SNDTIMEO", timeout); +} + +Status Socket::SetRecvTimeout(const MonoDelta& timeout) { + return SetTimeout(SO_RCVTIMEO, "SO_RCVTIMEO", timeout); +} + +Status Socket::SetReuseAddr(bool flag) { + int err; + int int_flag = flag ? 
1 : 0; + if (setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &int_flag, sizeof(int_flag)) == -1) { + err = errno; + return Status::NetworkError(std::string("failed to set SO_REUSEADDR: ") + + ErrnoToString(err), Slice(), err); + } + return Status::OK(); +} + +Status Socket::BindAndListen(const Sockaddr &sockaddr, + int listenQueueSize) { + RETURN_NOT_OK(SetReuseAddr(true)); + RETURN_NOT_OK(Bind(sockaddr)); + RETURN_NOT_OK(Listen(listenQueueSize)); + return Status::OK(); +} + +Status Socket::Listen(int listen_queue_size) { + if (listen(fd_, listen_queue_size)) { + int err = errno; + return Status::NetworkError("listen() error", ErrnoToString(err)); + } + return Status::OK(); +} + +Status Socket::GetSocketAddress(Sockaddr *cur_addr) const { + struct sockaddr_in sin; + socklen_t len = sizeof(sin); + DCHECK_GE(fd_, 0); + if (::getsockname(fd_, (struct sockaddr *)&sin, &len) == -1) { + int err = errno; + return Status::NetworkError(string("getsockname error: ") + + ErrnoToString(err), Slice(), err); + } + *cur_addr = sin; + return Status::OK(); +} + +Status Socket::GetPeerAddress(Sockaddr *cur_addr) const { + struct sockaddr_in sin; + socklen_t len = sizeof(sin); + DCHECK_GE(fd_, 0); + if (::getpeername(fd_, (struct sockaddr *)&sin, &len) == -1) { + int err = errno; + return Status::NetworkError(string("getpeername error: ") + + ErrnoToString(err), Slice(), err); + } + *cur_addr = sin; + return Status::OK(); +} + +Status Socket::Bind(const Sockaddr& bind_addr) { + struct sockaddr_in addr = bind_addr.addr(); + + DCHECK_GE(fd_, 0); + if (PREDICT_FALSE(::bind(fd_, (struct sockaddr*) &addr, sizeof(addr)))) { + int err = errno; + Status s = Status::NetworkError( + strings::Substitute("error binding socket to $0: $1", + bind_addr.ToString(), ErrnoToString(err)), + Slice(), err); + + if (s.IsNetworkError() && s.posix_code() == EADDRINUSE && bind_addr.port() != 0) { + TryRunLsof(bind_addr); + } + return s; + } + + return Status::OK(); +} + +Status Socket::Accept(Socket *new_conn, 
Sockaddr *remote, int flags) { + TRACE_EVENT0("net", "Socket::Accept"); + struct sockaddr_in addr; + socklen_t olen = sizeof(addr); + DCHECK_GE(fd_, 0); +#if defined(__linux__) + int accept_flags = SOCK_CLOEXEC; + if (flags & FLAG_NONBLOCKING) { + accept_flags |= SOCK_NONBLOCK; + } + new_conn->Reset(::accept4(fd_, (struct sockaddr*)&addr, + &olen, accept_flags)); + if (new_conn->GetFd() < 0) { + int err = errno; + return Status::NetworkError(std::string("accept4(2) error: ") + + ErrnoToString(err), Slice(), err); + } +#else + new_conn->Reset(::accept(fd_, (struct sockaddr*)&addr, &olen)); + if (new_conn->GetFd() < 0) { + int err = errno; + return Status::NetworkError(std::string("accept(2) error: ") + + ErrnoToString(err), Slice(), err); + } + RETURN_NOT_OK(new_conn->SetNonBlocking(flags & FLAG_NONBLOCKING)); + RETURN_NOT_OK(new_conn->SetCloseOnExec()); +#endif // defined(__linux__) + + *remote = addr; + TRACE_EVENT_INSTANT1("net", "Accepted", TRACE_EVENT_SCOPE_THREAD, + "remote", remote->ToString()); + return Status::OK(); +} + +Status Socket::BindForOutgoingConnection() { + Sockaddr bind_host; + Status s = bind_host.ParseString(FLAGS_local_ip_for_outbound_sockets, 0); + CHECK(s.ok() && bind_host.port() == 0) + << "Invalid local IP set for 'local_ip_for_outbound_sockets': '" + << FLAGS_local_ip_for_outbound_sockets << "': " << s.ToString(); + + RETURN_NOT_OK(Bind(bind_host)); + return Status::OK(); +} + +Status Socket::Connect(const Sockaddr &remote) { + TRACE_EVENT1("net", "Socket::Connect", + "remote", remote.ToString()); + if (PREDICT_FALSE(!FLAGS_local_ip_for_outbound_sockets.empty())) { + RETURN_NOT_OK(BindForOutgoingConnection()); + } + + struct sockaddr_in addr; + memcpy(&addr, &remote.addr(), sizeof(sockaddr_in)); + DCHECK_GE(fd_, 0); + if (::connect(fd_, (const struct sockaddr*)&addr, sizeof(addr)) < 0) { + int err = errno; + return Status::NetworkError(std::string("connect(2) error: ") + + ErrnoToString(err), Slice(), err); + } + return Status::OK(); +} 
+ +Status Socket::GetSockError() const { + int val = 0, ret; + socklen_t val_len = sizeof(val); + DCHECK_GE(fd_, 0); + ret = ::getsockopt(fd_, SOL_SOCKET, SO_ERROR, &val, &val_len); + if (ret) { + int err = errno; + return Status::NetworkError(std::string("getsockopt(SO_ERROR) failed: ") + + ErrnoToString(err), Slice(), err); + } + if (val != 0) { + return Status::NetworkError(ErrnoToString(val), Slice(), val); + } + return Status::OK(); +} + +Status Socket::Write(const uint8_t *buf, int32_t amt, int32_t *nwritten) { + if (amt <= 0) { + return Status::NetworkError( + StringPrintf("invalid send of %" PRId32 " bytes", + amt), Slice(), EINVAL); + } + DCHECK_GE(fd_, 0); + int res = ::send(fd_, buf, amt, MSG_NOSIGNAL); + if (res < 0) { + int err = errno; + return Status::NetworkError(std::string("write error: ") + + ErrnoToString(err), Slice(), err); + } + *nwritten = res; + return Status::OK(); +} + +Status Socket::Writev(const struct ::iovec *iov, int iov_len, + int32_t *nwritten) { + if (PREDICT_FALSE(iov_len <= 0)) { + return Status::NetworkError( + StringPrintf("writev: invalid io vector length of %d", + iov_len), + Slice(), EINVAL); + } + DCHECK_GE(fd_, 0); + + struct msghdr msg; + memset(&msg, 0, sizeof(struct msghdr)); + msg.msg_iov = const_cast(iov); + msg.msg_iovlen = iov_len; + int res = ::sendmsg(fd_, &msg, MSG_NOSIGNAL); + if (PREDICT_FALSE(res < 0)) { + int err = errno; + return Status::NetworkError(std::string("sendmsg error: ") + + ErrnoToString(err), Slice(), err); + } + + *nwritten = res; + return Status::OK(); +} + +// Mostly follows writen() from Stevens (2004) or Kerrisk (2010). 
+Status Socket::BlockingWrite(const uint8_t *buf, size_t buflen, size_t *nwritten, + const MonoTime& deadline) { + DCHECK_LE(buflen, std::numeric_limits::max()) << "Writes > INT32_MAX not supported"; + DCHECK(nwritten); + + size_t tot_written = 0; + while (tot_written < buflen) { + int32_t inc_num_written = 0; + int32_t num_to_write = buflen - tot_written; + MonoDelta timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + if (PREDICT_FALSE(timeout.ToNanoseconds() <= 0)) { + return Status::TimedOut("BlockingWrite timed out"); + } + RETURN_NOT_OK(SetSendTimeout(timeout)); + Status s = Write(buf, num_to_write, &inc_num_written); + tot_written += inc_num_written; + buf += inc_num_written; + *nwritten = tot_written; + + if (PREDICT_FALSE(!s.ok())) { + // Continue silently when the syscall is interrupted. + if (s.posix_code() == EINTR) { + continue; + } + if (s.posix_code() == EAGAIN) { + return Status::TimedOut(""); + } + return s.CloneAndPrepend("BlockingWrite error"); + } + if (PREDICT_FALSE(inc_num_written == 0)) { + // Shouldn't happen on Linux with a blocking socket. Maybe other Unices. + break; + } + } + + if (tot_written < buflen) { + return Status::IOError("Wrote zero bytes on a BlockingWrite() call", + StringPrintf("Transferred %zu of %zu bytes", tot_written, buflen)); + } + return Status::OK(); +} + +Status Socket::Recv(uint8_t *buf, int32_t amt, int32_t *nread) { + if (amt <= 0) { + return Status::NetworkError( + StringPrintf("invalid recv of %d bytes", amt), Slice(), EINVAL); + } + + // The recv() call can return fewer than the requested number of bytes. + // Especially when 'amt' is small, this is very unlikely to happen in + // the context of unit tests. So, we provide an injection hook which + // simulates the same behavior. 
+ if (PREDICT_FALSE(FLAGS_socket_inject_short_recvs && amt > 1)) { + Random r(GetRandomSeed32()); + amt = 1 + r.Uniform(amt - 1); + } + + DCHECK_GE(fd_, 0); + int res = ::recv(fd_, buf, amt, 0); + if (res <= 0) { + if (res == 0) { + return Status::NetworkError("Recv() got EOF from remote", Slice(), ESHUTDOWN); + } + int err = errno; + return Status::NetworkError(std::string("recv error: ") + + ErrnoToString(err), Slice(), err); + } + *nread = res; + return Status::OK(); +} + +// Mostly follows readn() from Stevens (2004) or Kerrisk (2010). +// One place where we deviate: we consider EOF a failure if < amt bytes are read. +Status Socket::BlockingRecv(uint8_t *buf, size_t amt, size_t *nread, const MonoTime& deadline) { + DCHECK_LE(amt, std::numeric_limits::max()) << "Reads > INT32_MAX not supported"; + DCHECK(nread); + size_t tot_read = 0; + while (tot_read < amt) { + int32_t inc_num_read = 0; + int32_t num_to_read = amt - tot_read; + MonoDelta timeout = deadline.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + if (PREDICT_FALSE(timeout.ToNanoseconds() <= 0)) { + return Status::TimedOut(""); + } + RETURN_NOT_OK(SetRecvTimeout(timeout)); + Status s = Recv(buf, num_to_read, &inc_num_read); + tot_read += inc_num_read; + buf += inc_num_read; + *nread = tot_read; + + if (PREDICT_FALSE(!s.ok())) { + // Continue silently when the syscall is interrupted. + if (s.posix_code() == EINTR) { + continue; + } + if (s.posix_code() == EAGAIN) { + return Status::TimedOut(""); + } + return s.CloneAndPrepend("BlockingRecv error"); + } + if (PREDICT_FALSE(inc_num_read == 0)) { + // EOF. 
+ break; + } + } + + if (PREDICT_FALSE(tot_read < amt)) { + return Status::IOError("Read zero bytes on a blocking Recv() call", + StringPrintf("Transferred %zu of %zu bytes", tot_read, amt)); + } + return Status::OK(); +} + +Status Socket::SetTimeout(int opt, std::string optname, const MonoDelta& timeout) { + if (PREDICT_FALSE(timeout.ToNanoseconds() < 0)) { + return Status::InvalidArgument("Timeout specified as negative to SetTimeout", + timeout.ToString()); + } + struct timeval tv; + timeout.ToTimeVal(&tv); + socklen_t optlen = sizeof(tv); + if (::setsockopt(fd_, SOL_SOCKET, opt, &tv, optlen) == -1) { + int err = errno; + return Status::NetworkError( + StringPrintf("Failed to set %s to %s", optname.c_str(), timeout.ToString().c_str()), + ErrnoToString(err), err); + } + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/net/socket.h b/src/kudu/util/net/socket.h new file mode 100644 index 000000000000..99af6c66c17d --- /dev/null +++ b/src/kudu/util/net/socket.h @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_NET_SOCKET_H +#define KUDU_UTIL_NET_SOCKET_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MonoDelta; +class MonoTime; +class Sockaddr; + +class Socket { + public: + static const int FLAG_NONBLOCKING = 0x1; + + // Create a new invalid Socket object. + Socket(); + + // Start managing a socket. + explicit Socket(int fd); + + // Close the socket. Errors will be ignored. + ~Socket(); + + // Close the Socket, checking for errors. + Status Close(); + + // call shutdown() on the socket + Status Shutdown(bool shut_read, bool shut_write); + + // Start managing a socket. + void Reset(int fd); + + // Stop managing the socket and return it. + int Release(); + + // Get the raw file descriptor, or -1 if there is no file descriptor being + // managed. + int GetFd() const; + + // Returns true if the error is temporary and will go away if we retry on + // the socket. + static bool IsTemporarySocketError(int err); + + Status Init(int flags); // See FLAG_NONBLOCKING + + // Set or clear TCP_NODELAY + Status SetNoDelay(bool enabled); + + // Set or clear O_NONBLOCK + Status SetNonBlocking(bool enabled); + Status IsNonBlocking(bool* is_nonblock) const; + + // Set SO_SENDTIMEO to the specified value. Should only be used for blocking sockets. + Status SetSendTimeout(const MonoDelta& timeout); + + // Set SO_RCVTIMEO to the specified value. Should only be used for blocking sockets. + Status SetRecvTimeout(const MonoDelta& timeout); + + // Sets SO_REUSEADDR to 'flag'. Should be used prior to Bind(). + Status SetReuseAddr(bool flag); + + // Convenience method to invoke the common sequence: + // 1) SetReuseAddr(true) + // 2) Bind() + // 3) Listen() + Status BindAndListen(const Sockaddr &sockaddr, int listen_queue_size); + + // Start listening for new connections, with the given backlog size. + // Requires that the socket has already been bound using Bind(). 
+ Status Listen(int listen_queue_size); + + // Call getsockname to get the address of this socket. + Status GetSocketAddress(Sockaddr *cur_addr) const; + + // Call getpeername to get the address of the connected peer. + Status GetPeerAddress(Sockaddr *cur_addr) const; + + // Call bind() to bind the socket to a given address. + // If bind() fails and indicates that the requested port is already in use, + // generates an informative log message by calling 'lsof' if available. + Status Bind(const Sockaddr& bind_addr); + + // Call accept(2) to get a new connection. + Status Accept(Socket *new_conn, Sockaddr *remote, int flags); + + // start connecting this socket to a remote address. + Status Connect(const Sockaddr &remote); + + // get the error status using getsockopt(2) + Status GetSockError() const; + + Status Write(const uint8_t *buf, int32_t amt, int32_t *nwritten); + + Status Writev(const struct ::iovec *iov, int iov_len, int32_t *nwritten); + + // Blocking Write call, returns IOError unless full buffer is sent. + // Underlying Socket expected to be in blocking mode. Fails if any Write() sends 0 bytes. + // Returns OK if buflen bytes were sent, otherwise IOError. + // Upon return, num_written will contain the number of bytes actually written. + // See also writen() from Stevens (2004) or Kerrisk (2010) + Status BlockingWrite(const uint8_t *buf, size_t buflen, size_t *num_written, + const MonoTime& deadline); + + Status Recv(uint8_t *buf, int32_t amt, int32_t *nread); + + // Blocking Recv call, returns IOError unless requested amt bytes are read. + // Underlying Socket expected to be in blocking mode. Fails if any Recv() reads 0 bytes. + // Returns OK if amt bytes were read, otherwise IOError. + // Upon return, nread will contain the number of bytes actually read. 
+ // See also readn() from Stevens (2004) or Kerrisk (2010) + Status BlockingRecv(uint8_t *buf, size_t amt, size_t *nread, const MonoTime& deadline); + + private: + // Called internally from SetSend/RecvTimeout(). + Status SetTimeout(int opt, std::string optname, const MonoDelta& timeout); + + // Called internally during socket setup. + Status SetCloseOnExec(); + + // Bind the socket to a local address before making an outbound connection, + // based on the value of FLAGS_local_ip_for_outbound_sockets. + Status BindForOutgoingConnection(); + + int fd_; + + DISALLOW_COPY_AND_ASSIGN(Socket); +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/nvm_cache.cc b/src/kudu/util/nvm_cache.cc new file mode 100644 index 000000000000..cce82b97a37b --- /dev/null +++ b/src/kudu/util/nvm_cache.cc @@ -0,0 +1,592 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// +// ------------------------------------------------------------ +// This file implements a cache based on the NVML library (http://pmem.io), +// specifically its "libvmem" component. This library makes it easy to program +// against persistent memory hardware by exposing an API which parallels +// malloc/free, but allocates from persistent memory instead of DRAM. +// +// We use this API to implement a cache which treats persistent memory or +// non-volatile memory as if it were a larger cheaper bank of volatile memory. We +// currently make no use of its persistence properties. +// +// Currently, we only store key/value in NVM. All other data structures such as the +// ShardedLRUCache instances, hash table, etc are in DRAM. The assumption is that +// the ratio of data stored vs overhead is quite high. + +#include "kudu/util/nvm_cache.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/atomic_refcount.h" +#include "kudu/gutil/hash/city.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/atomic.h" +#include "kudu/util/cache.h" +#include "kudu/util/cache_metrics.h" +#include "kudu/util/flags.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/locks.h" +#include "kudu/util/metrics.h" + +DEFINE_string(nvm_cache_path, "/vmem", + "The path at which the NVM cache will try to allocate its memory. " + "This can be a tmpfs or ramfs for testing purposes."); +TAG_FLAG(nvm_cache_path, experimental); + +DEFINE_int32(nvm_cache_allocation_retry_count, 10, + "The number of times that the NVM cache will retry attempts to allocate " + "memory for new entries. 
In between attempts, a cache entry will be " + "evicted."); +TAG_FLAG(nvm_cache_allocation_retry_count, advanced); +TAG_FLAG(nvm_cache_allocation_retry_count, experimental); + +DEFINE_bool(nvm_cache_simulate_allocation_failure, false, + "If true, the NVM cache will inject failures in calls to vmem_malloc " + "for testing."); +TAG_FLAG(nvm_cache_simulate_allocation_failure, unsafe); + + +namespace kudu { + +class MetricEntity; + +namespace { + +using std::shared_ptr; +using std::vector; + +typedef simple_spinlock MutexType; + +// LRU cache implementation + +// An entry is a variable length heap-allocated structure. Entries +// are kept in a circular doubly linked list ordered by access time. +struct LRUHandle { + void* value; + CacheDeleter* deleter; + LRUHandle* next_hash; + LRUHandle* next; + LRUHandle* prev; + size_t charge; // TODO(opt): Only allow uint32_t? + size_t key_length; + Atomic32 refs; + uint32_t hash; // Hash of key(); used for fast sharding and comparisons + uint8_t key_data[1]; // Beginning of key + + Slice key() const { + // For cheaper lookups, we allow a temporary Handle object + // to store a pointer to a key in "value". + if (next == this) { + return *(reinterpret_cast(value)); + } else { + return Slice(key_data, key_length); + } + } +}; + +// We provide our own simple hash table since it removes a whole bunch +// of porting hacks and is also faster than some of the built-in hash +// table implementations in some of the compiler/runtime combinations +// we have tested. E.g., readrandom speeds up by ~5% over the g++ +// 4.4.3's builtin hashtable. +class HandleTable { + public: + HandleTable() : length_(0), elems_(0), list_(NULL) { Resize(); } + ~HandleTable() { delete[] list_; } + + LRUHandle* Lookup(const Slice& key, uint32_t hash) { + return *FindPointer(key, hash); + } + + LRUHandle* Insert(LRUHandle* h) { + LRUHandle** ptr = FindPointer(h->key(), h->hash); + LRUHandle* old = *ptr; + h->next_hash = (old == NULL ? 
NULL : old->next_hash); + *ptr = h; + if (old == NULL) { + ++elems_; + if (elems_ > length_) { + // Since each cache entry is fairly large, we aim for a small + // average linked list length (<= 1). + Resize(); + } + } + return old; + } + + LRUHandle* Remove(const Slice& key, uint32_t hash) { + LRUHandle** ptr = FindPointer(key, hash); + LRUHandle* result = *ptr; + if (result != NULL) { + *ptr = result->next_hash; + --elems_; + } + return result; + } + + private: + // The table consists of an array of buckets where each bucket is + // a linked list of cache entries that hash into the bucket. + uint32_t length_; + uint32_t elems_; + LRUHandle** list_; + + // Return a pointer to slot that points to a cache entry that + // matches key/hash. If there is no such cache entry, return a + // pointer to the trailing slot in the corresponding linked list. + LRUHandle** FindPointer(const Slice& key, uint32_t hash) { + LRUHandle** ptr = &list_[hash & (length_ - 1)]; + while (*ptr != NULL && + ((*ptr)->hash != hash || key != (*ptr)->key())) { + ptr = &(*ptr)->next_hash; + } + return ptr; + } + + void Resize() { + uint32_t new_length = 16; + while (new_length < elems_ * 1.5) { + new_length *= 2; + } + LRUHandle** new_list = new LRUHandle*[new_length]; + memset(new_list, 0, sizeof(new_list[0]) * new_length); + uint32_t count = 0; + for (uint32_t i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != NULL) { + LRUHandle* next = h->next_hash; + uint32_t hash = h->hash; + LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + h->next_hash = *ptr; + *ptr = h; + h = next; + count++; + } + } + DCHECK_EQ(elems_, count); + delete[] list_; + list_ = new_list; + length_ = new_length; + } +}; + +// A single shard of sharded cache. 
+class NvmLRUCache { + public: + explicit NvmLRUCache(VMEM *vmp); + ~NvmLRUCache(); + + // Separate from constructor so caller can easily make an array of LRUCache + void SetCapacity(size_t capacity) { capacity_ = capacity; } + + void SetMetrics(CacheMetrics* metrics) { metrics_ = metrics; } + + // Like Cache methods, but with an extra "hash" parameter. + Cache::Handle* Insert(const Slice& key, uint32_t hash, + void* value, size_t charge, + CacheDeleter* deleter); + Cache::Handle* Lookup(const Slice& key, uint32_t hash, bool caching); + void Release(Cache::Handle* handle); + void Erase(const Slice& key, uint32_t hash); + void* AllocateAndRetry(size_t size); + + private: + void NvmLRU_Remove(LRUHandle* e); + void NvmLRU_Append(LRUHandle* e); + // Just reduce the reference count by 1. + // Return true if last reference + bool Unref(LRUHandle* e); + void FreeEntry(LRUHandle* e); + + // Evict the LRU item in the cache, adding it to the linked list + // pointed to by 'to_remove_head'. + void EvictOldestUnlocked(LRUHandle** to_remove_head); + + // Free all of the entries in the linked list that has to_free_head + // as its head. + void FreeLRUEntries(LRUHandle* to_free_head); + + // Wrapper around vmem_malloc which injects failures based on a flag. + void* VmemMalloc(size_t size); + + // Initialized before use. + size_t capacity_; + + // mutex_ protects the following state. + MutexType mutex_; + size_t usage_; + + // Dummy head of LRU list. + // lru.prev is newest entry, lru.next is oldest entry. 
+ LRUHandle lru_; + + HandleTable table_; + + VMEM* vmp_; + + CacheMetrics* metrics_; +}; + +NvmLRUCache::NvmLRUCache(VMEM* vmp) + : usage_(0), + vmp_(vmp), + metrics_(NULL) { + // Make empty circular linked list + lru_.next = &lru_; + lru_.prev = &lru_; +} + +NvmLRUCache::~NvmLRUCache() { + for (LRUHandle* e = lru_.next; e != &lru_; ) { + LRUHandle* next = e->next; + DCHECK_EQ(e->refs, 1); // Error if caller has an unreleased handle + if (Unref(e)) { + FreeEntry(e); + } + e = next; + } +} + +void* NvmLRUCache::VmemMalloc(size_t size) { + if (PREDICT_FALSE(FLAGS_nvm_cache_simulate_allocation_failure)) { + return NULL; + } + return vmem_malloc(vmp_, size); +} + +bool NvmLRUCache::Unref(LRUHandle* e) { + DCHECK_GT(ANNOTATE_UNPROTECTED_READ(e->refs), 0); + return !base::RefCountDec(&e->refs); +} + +void NvmLRUCache::FreeEntry(LRUHandle* e) { + DCHECK_EQ(ANNOTATE_UNPROTECTED_READ(e->refs), 0); + e->deleter->Delete(e->key(), e->value); + if (PREDICT_TRUE(metrics_)) { + metrics_->cache_usage->DecrementBy(e->charge); + metrics_->evictions->Increment(); + } + vmem_free(vmp_, e); +} + +// Allocate nvm memory. Try until successful or FLAGS_nvm_cache_allocation_retry_count +// has been exceeded. +void *NvmLRUCache::AllocateAndRetry(size_t size) { + void *tmp; + // There may be times that an allocation fails. With NVM we have + // a fixed size to allocate from. If we cannot allocate the size + // that was asked for, we will remove entries from the cache and + // retry up to the configured number of retries. If this fails, we + // return NULL, which will cause the caller to not insert anything + // into the cache. + LRUHandle *to_remove_head = NULL; + tmp = VmemMalloc(size); + + if (tmp == NULL) { + unique_lock l(&mutex_); + + int retries_remaining = FLAGS_nvm_cache_allocation_retry_count; + while (tmp == NULL && retries_remaining-- > 0 && lru_.next != &lru_) { + EvictOldestUnlocked(&to_remove_head); + + // Unlock while allocating memory. 
+ l.unlock(); + tmp = VmemMalloc(size); + l.lock(); + } + } + + // we free the entries here outside of mutex for + // performance reasons + FreeLRUEntries(to_remove_head); + return tmp; +} + +void NvmLRUCache::NvmLRU_Remove(LRUHandle* e) { + e->next->prev = e->prev; + e->prev->next = e->next; + usage_ -= e->charge; +} + +void NvmLRUCache::NvmLRU_Append(LRUHandle* e) { + // Make "e" newest entry by inserting just before lru_ + e->next = &lru_; + e->prev = lru_.prev; + e->prev->next = e; + e->next->prev = e; + usage_ += e->charge; +} + +Cache::Handle* NvmLRUCache::Lookup(const Slice& key, uint32_t hash, bool caching) { + LRUHandle* e; + { + lock_guard l(&mutex_); + e = table_.Lookup(key, hash); + if (e != NULL) { + // If an entry exists, remove the old entry from the cache + // and re-add to the end of the linked list. + base::RefCountInc(&e->refs); + NvmLRU_Remove(e); + NvmLRU_Append(e); + } + } + + // Do the metrics outside of the lock. + if (metrics_) { + metrics_->lookups->Increment(); + bool was_hit = (e != NULL); + if (was_hit) { + if (caching) { + metrics_->cache_hits_caching->Increment(); + } else { + metrics_->cache_hits->Increment(); + } + } else { + if (caching) { + metrics_->cache_misses_caching->Increment(); + } else { + metrics_->cache_misses->Increment(); + } + } + } + + return reinterpret_cast(e); +} + +void NvmLRUCache::Release(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + bool last_reference = Unref(e); + if (last_reference) { + FreeEntry(e); + } +} + +void NvmLRUCache::EvictOldestUnlocked(LRUHandle** to_remove_head) { + LRUHandle* old = lru_.next; + NvmLRU_Remove(old); + table_.Remove(old->key(), old->hash); + if (Unref(old)) { + old->next = *to_remove_head; + *to_remove_head = old; + } +} + +void NvmLRUCache::FreeLRUEntries(LRUHandle* to_free_head) { + while (to_free_head != NULL) { + LRUHandle* next = to_free_head->next; + FreeEntry(to_free_head); + to_free_head = next; + } +} + +Cache::Handle* NvmLRUCache::Insert(const 
Slice& key, uint32_t hash, + void* value, size_t charge, + CacheDeleter* deleter) { + // Account for nvm key memory. + LRUHandle* e = reinterpret_cast( + AllocateAndRetry(sizeof(LRUHandle) - 1 /* sizeof(LRUHandle::key_data) */ + key.size())); + LRUHandle* to_remove_head = NULL; + + if (!e) { + return NULL; + } + + e->value = value; + memcpy(e->key_data, key.data(), key.size()); + + // Modify the charge to the nvm cache to account for all allocations + // done from the nvm address space. In this case we allocated the value + // slice object, the key slice and the key_data structure from nvm. + e->charge = charge + key.size(); + e->hash = hash; + e->refs = 2; // One from LRUCache, one for the returned handle + e->key_length = key.size(); + e->deleter = deleter; + if (PREDICT_TRUE(metrics_)) { + metrics_->cache_usage->IncrementBy(e->charge); + metrics_->inserts->Increment(); + } + + { + lock_guard l(&mutex_); + + NvmLRU_Append(e); + + LRUHandle* old = table_.Insert(e); + if (old != NULL) { + NvmLRU_Remove(old); + if (Unref(old)) { + old->next = to_remove_head; + to_remove_head = old; + } + } + + while (usage_ > capacity_ && lru_.next != &lru_) { + EvictOldestUnlocked(&to_remove_head); + } + } + + // we free the entries here outside of mutex for + // performance reasons + FreeLRUEntries(to_remove_head); + + return reinterpret_cast(e); +} + +void NvmLRUCache::Erase(const Slice& key, uint32_t hash) { + LRUHandle* e; + bool last_reference = false; + { + lock_guard l(&mutex_); + e = table_.Remove(key, hash); + if (e != NULL) { + NvmLRU_Remove(e); + last_reference = Unref(e); + } + } + // mutex not held here + // last_reference will only be true if e != NULL + if (last_reference) { + FreeEntry(e); + } +} +static const int kNumShardBits = 4; +static const int kNumShards = 1 << kNumShardBits; + +class ShardedLRUCache : public Cache { + private: + gscoped_ptr metrics_; + vector shards_; + MutexType id_mutex_; + uint64_t last_id_; + VMEM* vmp_; + + static inline uint32_t 
HashSlice(const Slice& s) { + return util_hash::CityHash64( + reinterpret_cast(s.data()), s.size()); + } + + static uint32_t Shard(uint32_t hash) { + return hash >> (32 - kNumShardBits); + } + + public: + explicit ShardedLRUCache(size_t capacity, const string& id, VMEM* vmp) + : last_id_(0), + vmp_(vmp) { + + const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards; + for (int s = 0; s < kNumShards; s++) { + gscoped_ptr shard(new NvmLRUCache(vmp_)); + shard->SetCapacity(per_shard); + shards_.push_back(shard.release()); + } + } + + virtual ~ShardedLRUCache() { + STLDeleteElements(&shards_); + // Per the note at the top of this file, our cache is entirely volatile. + // Hence, when the cache is destructed, we delete the underlying + // VMEM pool. + vmem_delete(vmp_); + } + + virtual Handle* Insert(const Slice& key, void* value, size_t charge, + CacheDeleter* deleter) OVERRIDE { + const uint32_t hash = HashSlice(key); + return shards_[Shard(hash)]->Insert(key, hash, value, charge, deleter); + } + virtual Handle* Lookup(const Slice& key, CacheBehavior caching) OVERRIDE { + const uint32_t hash = HashSlice(key); + return shards_[Shard(hash)]->Lookup(key, hash, caching == EXPECT_IN_CACHE); + } + virtual void Release(Handle* handle) OVERRIDE { + LRUHandle* h = reinterpret_cast(handle); + shards_[Shard(h->hash)]->Release(handle); + } + virtual void Erase(const Slice& key) OVERRIDE { + const uint32_t hash = HashSlice(key); + shards_[Shard(hash)]->Erase(key, hash); + } + virtual void* Value(Handle* handle) OVERRIDE { + return reinterpret_cast(handle)->value; + } + virtual uint64_t NewId() OVERRIDE { + lock_guard l(&id_mutex_); + return ++(last_id_); + } + virtual void SetMetrics(const scoped_refptr& entity) OVERRIDE { + metrics_.reset(new CacheMetrics(entity)); + for (NvmLRUCache* cache : shards_) { + cache->SetMetrics(metrics_.get()); + } + } + virtual uint8_t* Allocate(int size) OVERRIDE { + // Try allocating from each of the shards -- if vmem is tight, + // this 
can cause eviction, so we might have better luck in different + // shards. + for (NvmLRUCache* cache : shards_) { + uint8_t* ptr = reinterpret_cast(cache->AllocateAndRetry(size)); + if (ptr) return ptr; + } + // TODO: increment a metric here on allocation failure. + return NULL; + } + virtual void Free(uint8_t *ptr) OVERRIDE { + vmem_free(vmp_, ptr); + } + virtual uint8_t* MoveToHeap(uint8_t* ptr, int size) OVERRIDE { + uint8_t* ret = new uint8_t[size]; + memcpy(ret, ptr, size); + vmem_free(vmp_, ptr); + return ret; + } + +}; + +} // end anonymous namespace + +Cache* NewLRUNvmCache(size_t capacity, const std::string& id) { + // vmem_create() will fail if the capacity is too small, but with + // an inscrutable error. So, we'll check ourselves. + CHECK_GE(capacity, VMEM_MIN_POOL) + << "configured capacity " << capacity << " bytes is less than " + << "the minimum capacity for an NVM cache: " << VMEM_MIN_POOL; + + VMEM* vmp = vmem_create(FLAGS_nvm_cache_path.c_str(), capacity); + // If we cannot create the cache pool we should not retry. + PLOG_IF(FATAL, vmp == NULL) << "Could not initialize NVM cache library in path " + << FLAGS_nvm_cache_path.c_str(); + + return new ShardedLRUCache(capacity, id, vmp); +} + +} // namespace kudu diff --git a/src/kudu/util/nvm_cache.h b/src/kudu/util/nvm_cache.h new file mode 100644 index 000000000000..38962f2c4adf --- /dev/null +++ b/src/kudu/util/nvm_cache.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_NVM_CACHE_H_ +#define KUDU_UTIL_NVM_CACHE_H_ + +#include + +namespace kudu { +class Cache; + +// Create a cache in persistent memory with the given capacity. +Cache* NewLRUNvmCache(size_t capacity, const std::string& id); + +} // namespace kudu + +#endif diff --git a/src/kudu/util/object_pool-test.cc b/src/kudu/util/object_pool-test.cc new file mode 100644 index 000000000000..d6b34ae49572 --- /dev/null +++ b/src/kudu/util/object_pool-test.cc @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include "kudu/util/object_pool.h" + +namespace kudu { + +// Simple class which maintains a count of how many objects +// are currently alive. 
+class MyClass { + public: + MyClass() { + instance_count_++; + } + + ~MyClass() { + instance_count_--; + } + + static int instance_count() { + return instance_count_; + } + + static void ResetCount() { + instance_count_ = 0; + } + + private: + static int instance_count_; +}; +int MyClass::instance_count_ = 0; + +TEST(TestObjectPool, TestPooling) { + MyClass::ResetCount(); + { + ObjectPool pool; + ASSERT_EQ(0, MyClass::instance_count()); + MyClass *a = pool.Construct(); + ASSERT_EQ(1, MyClass::instance_count()); + MyClass *b = pool.Construct(); + ASSERT_EQ(2, MyClass::instance_count()); + ASSERT_TRUE(a != b); + pool.Destroy(b); + ASSERT_EQ(1, MyClass::instance_count()); + MyClass *c = pool.Construct(); + ASSERT_EQ(2, MyClass::instance_count()); + ASSERT_TRUE(c == b) << "should reuse instance"; + pool.Destroy(c); + + ASSERT_EQ(1, MyClass::instance_count()); + } + + ASSERT_EQ(0, MyClass::instance_count()) + << "destructing pool should have cleared instances"; +} + +TEST(TestObjectPool, TestScopedPtr) { + MyClass::ResetCount(); + ASSERT_EQ(0, MyClass::instance_count()); + ObjectPool pool; + { + ObjectPool::scoped_ptr sptr( + pool.make_scoped_ptr(pool.Construct())); + ASSERT_EQ(1, MyClass::instance_count()); + } + ASSERT_EQ(0, MyClass::instance_count()); +} + +} // namespace kudu diff --git a/src/kudu/util/object_pool.h b/src/kudu/util/object_pool.h new file mode 100644 index 000000000000..147363f9ce94 --- /dev/null +++ b/src/kudu/util/object_pool.h @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Simple pool/freelist for objects of the same type, typically used +// in local context. +#ifndef KUDU_UTIL_OBJECT_POOL_H +#define KUDU_UTIL_OBJECT_POOL_H + +#include +#include +#include "kudu/gutil/manual_constructor.h" +#include "kudu/gutil/gscoped_ptr.h" + +namespace kudu { + +using base::ManualConstructor; + +template +class ReturnToPool; + +// An object pool allocates and destroys a single class of objects +// off of a free-list. +// +// Upon destruction of the pool, any objects allocated from this pool are +// destroyed, regardless of whether they have been explicitly returned to the +// pool. +// +// This class is similar to the boost::pool::object_pool, except that the boost +// implementation seems to have O(n) deallocation performance and benchmarked +// really poorly. +// +// This class is not thread-safe. +template +class ObjectPool { + public: + typedef ReturnToPool deleter_type; + typedef gscoped_ptr scoped_ptr; + + ObjectPool() : + free_list_head_(NULL), + alloc_list_head_(NULL), + deleter_(this) { + } + + ~ObjectPool() { + // Delete all objects ever allocated from this pool + ListNode *node = alloc_list_head_; + while (node != NULL) { + ListNode *tmp = node; + node = node->next_on_alloc_list; + if (!tmp->is_on_freelist) { + // Have to run the actual destructor if the user forgot to free it. + tmp->Destroy(); + } + delete tmp; + } + } + + // Construct a new object instance from the pool. 
+ T *Construct() { + ManualConstructor *obj = GetObject(); + obj->Init(); + return obj->get(); + } + + template + T *Construct(Arg1 arg1) { + ManualConstructor *obj = GetObject(); + obj->Init(arg1); + return obj->get(); + } + + // Destroy an object, running its destructor and returning it to the + // free-list. + void Destroy(T *t) { + CHECK_NOTNULL(t); + ListNode *node = static_cast( + reinterpret_cast *>(t)); + + node->Destroy(); + + DCHECK(!node->is_on_freelist); + node->is_on_freelist = true; + node->next_on_free_list = free_list_head_; + free_list_head_ = node; + } + + // Create a scoped_ptr wrapper around the given pointer which came from this + // pool. + // When the scoped_ptr goes out of scope, the object will get released back + // to the pool. + scoped_ptr make_scoped_ptr(T *ptr) { + return scoped_ptr(ptr, deleter_); + } + + private: + class ListNode : ManualConstructor { + friend class ObjectPool; + + ListNode *next_on_free_list; + ListNode *next_on_alloc_list; + + bool is_on_freelist; + }; + + + ManualConstructor *GetObject() { + if (free_list_head_ != NULL) { + ListNode *tmp = free_list_head_; + free_list_head_ = tmp->next_on_free_list; + tmp->next_on_free_list = NULL; + DCHECK(tmp->is_on_freelist); + tmp->is_on_freelist = false; + + return static_cast *>(tmp); + } + auto new_node = new ListNode(); + new_node->next_on_free_list = NULL; + new_node->next_on_alloc_list = alloc_list_head_; + new_node->is_on_freelist = false; + alloc_list_head_ = new_node; + return new_node; + } + + // Keeps track of free objects in this pool. + ListNode *free_list_head_; + + // Keeps track of all objects ever allocated by this pool. + ListNode *alloc_list_head_; + + deleter_type deleter_; +}; + +// Functor which returns the passed objects to a specific object pool. +// This can be used in conjunction with scoped_ptr to automatically release +// an object back to a pool when it goes out of scope. 
+template +class ReturnToPool { + public: + explicit ReturnToPool(ObjectPool *pool) : + pool_(pool) { + } + + inline void operator()(T *ptr) const { + pool_->Destroy(ptr); + } + + private: + ObjectPool *pool_; +}; + + +} // namespace kudu +#endif diff --git a/src/kudu/util/oid_generator.cc b/src/kudu/util/oid_generator.cc new file mode 100644 index 000000000000..09755864c8a9 --- /dev/null +++ b/src/kudu/util/oid_generator.cc @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/oid_generator.h" + +namespace kudu { + +string ObjectIdGenerator::Next() { + boost::lock_guard l(oid_lock_); + boost::uuids::uuid oid = oid_generator_(); + const uint8_t *uuid = oid.data; + return StringPrintf("%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x", + uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7], + uuid[8], uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]); +} + +} // namespace kudu diff --git a/src/kudu/util/oid_generator.h b/src/kudu/util/oid_generator.h new file mode 100644 index 000000000000..85a74127e6ea --- /dev/null +++ b/src/kudu/util/oid_generator.h @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_OID_GENERATOR_H +#define KUDU_UTIL_OID_GENERATOR_H + +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/locks.h" + +namespace kudu { + +// Generates a unique 32byte id, based on uuid v4. 
+// This class is thread safe +class ObjectIdGenerator { + public: + ObjectIdGenerator() {} + ~ObjectIdGenerator() {} + + std::string Next(); + + private: + DISALLOW_COPY_AND_ASSIGN(ObjectIdGenerator); + + typedef simple_spinlock LockType; + + LockType oid_lock_; + boost::uuids::random_generator oid_generator_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/once-test.cc b/src/kudu/util/once-test.cc new file mode 100644 index 000000000000..8ccd8b6f84e3 --- /dev/null +++ b/src/kudu/util/once-test.cc @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/once.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/thread.h" + +using std::vector; +using strings::Substitute; + +namespace kudu { + +namespace { + +struct Thing { + explicit Thing(bool should_fail) + : should_fail_(should_fail), + value_(0) { + } + + Status Init() { + return once_.Init(&Thing::InitOnce, this); + } + + Status InitOnce() { + if (should_fail_) { + return Status::IllegalState("Whoops!"); + } + value_ = 1; + return Status::OK(); + } + + const bool should_fail_; + int value_; + KuduOnceDynamic once_; +}; + +} // anonymous namespace + +TEST(TestOnce, KuduOnceDynamicTest) { + { + Thing t(false); + ASSERT_EQ(0, t.value_); + ASSERT_FALSE(t.once_.initted()); + + for (int i = 0; i < 2; i++) { + ASSERT_OK(t.Init()); + ASSERT_EQ(1, t.value_); + ASSERT_TRUE(t.once_.initted()); + } + } + + { + Thing t(true); + for (int i = 0; i < 2; i++) { + ASSERT_TRUE(t.Init().IsIllegalState()); + ASSERT_EQ(0, t.value_); + ASSERT_TRUE(t.once_.initted()); + } + } +} + +static void InitOrGetInitted(Thing* t, int i) { + if (i % 2 == 0) { + LOG(INFO) << "Thread " << i << " initting"; + t->Init(); + } else { + LOG(INFO) << "Thread " << i << " value: " << t->once_.initted(); + } +} + +TEST(TestOnce, KuduOnceDynamicThreadSafeTest) { + Thing thing(false); + + // The threads will read and write to thing.once_.initted. If access to + // it is not synchronized, TSAN will flag the access as data races. 
+ vector > threads; + for (int i = 0; i < 10; i++) { + scoped_refptr t; + ASSERT_OK(Thread::Create("test", Substitute("thread $0", i), + &InitOrGetInitted, &thing, i, &t)); + threads.push_back(t); + } + + for (const scoped_refptr& t : threads) { + t->Join(); + } +} + +} // namespace kudu diff --git a/src/kudu/util/once.cc b/src/kudu/util/once.cc new file mode 100644 index 000000000000..fada7770fdf7 --- /dev/null +++ b/src/kudu/util/once.cc @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/once.h" + +#include "kudu/util/malloc.h" + +namespace kudu { + +size_t KuduOnceDynamic::memory_footprint_excluding_this() const { + return status_.memory_footprint_excluding_this(); +} + +size_t KuduOnceDynamic::memory_footprint_including_this() const { + return kudu_malloc_usable_size(this) + memory_footprint_excluding_this(); +} + +} // namespace kudu diff --git a/src/kudu/util/once.h b/src/kudu/util/once.h new file mode 100644 index 000000000000..da26107abc6c --- /dev/null +++ b/src/kudu/util/once.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_ONCE_H +#define KUDU_UTIL_ONCE_H + +#include + +#include "kudu/gutil/once.h" +#include "kudu/util/atomic.h" +#include "kudu/util/status.h" + +namespace kudu { + +class KuduOnceDynamic; + +namespace internal { + +// Cheap, single-arg "bound callback" (similar to kudu::Callback) for use +// in KuduOnceDynamic. +template +struct MemberFunc { + KuduOnceDynamic* once; + T* instance; + Status (T::*member_func)(); +}; + +template +void InitCb(void* arg) { + MemberFunc* mf = reinterpret_cast*>(arg); + mf->once->status_ = (mf->instance->*mf->member_func)(); + mf->once->set_initted(); +} + +} // namespace internal + +// More versatile version of GoogleOnceDynamic, including the following: +// 1. Can be used with single-arg, non-static member functions. +// 2. Retains results and overall initialization state for repeated access. +// 3. Access to initialization state is safe for concurrent use. +class KuduOnceDynamic { + public: + KuduOnceDynamic() + : initted_(false) { + } + + // If the underlying GoogleOnceDynamic has yet to be invoked, invokes the + // provided member function and stores its return value. Otherwise, + // returns the stored Status. + // + // T: the type of the member passed in. 
+ template + Status Init(Status (T::*member_func)(), T* instance) { + internal::MemberFunc mf = { this, instance, member_func }; + + // Clang UBSAN doesn't like it when GoogleOnceDynamic handles the cast + // of the argument: + // + // runtime error: call to function + // kudu::cfile::BloomFileReader::InitOnceCb(kudu::cfile::BloomFileReader*) + // through pointer to incorrect function type 'void (*)(void *)' + // + // So let's do the cast ourselves, to void* here and back in InitCb(). + once_.Init(&internal::InitCb, reinterpret_cast(&mf)); + return status_; + } + + // kMemOrderAcquire ensures that loads/stores that come after initted() + // aren't reordered to come before it instead. kMemOrderRelease ensures + // the opposite (i.e. loads/stores before set_initted() aren't reordered + // to come after it). + // + // Taken together, threads can safely synchronize on initted_. + bool initted() const { return initted_.Load(kMemOrderAcquire); } + + // Returns the memory usage of this object without the object itself. Should + // be used when embedded inside another object. + size_t memory_footprint_excluding_this() const; + + // Returns the memory usage of this object including the object itself. + // Should be used when allocated on the heap. + size_t memory_footprint_including_this() const; + + private: + template + friend void internal::InitCb(void* arg); + + void set_initted() { initted_.Store(true, kMemOrderRelease); } + + AtomicBool initted_; + GoogleOnceDynamic once_; + Status status_; +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/os-util-test.cc b/src/kudu/util/os-util-test.cc new file mode 100644 index 000000000000..b7a59d52cd40 --- /dev/null +++ b/src/kudu/util/os-util-test.cc @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/os-util.h" + +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/errno.h" +#include "kudu/util/test_macros.h" + +using std::string; + +namespace kudu { + +void RunTest(const string& name, int user_ticks, int kernel_ticks, int io_wait) { + string buf = strings::Substitute(string("0 ($0) S 0 0 0 0 0 0 0") + + " 0 0 0 $1 $2 0 0 0 0 0" + + " 0 0 0 0 0 0 0 0 0 0 " + + " 0 0 0 0 0 0 0 0 0 0 " + + " 0 $3 0 0 0 0 0 0 0 0 " + + " 0 0", + name, user_ticks, kernel_ticks, io_wait); + ThreadStats stats; + string extracted_name; + ASSERT_OK(ParseStat(buf, &extracted_name, &stats)); + ASSERT_EQ(name, extracted_name); + ASSERT_EQ(user_ticks * (1e9 / sysconf(_SC_CLK_TCK)), stats.user_ns); + ASSERT_EQ(kernel_ticks * (1e9 / sysconf(_SC_CLK_TCK)), stats.kernel_ns); + ASSERT_EQ(io_wait * (1e9 / sysconf(_SC_CLK_TCK)), stats.iowait_ns); +} + +TEST(OsUtilTest, TestSelf) { + RunTest("test", 111, 222, 333); +} + +TEST(OsUtilTest, TestSelfNameWithSpace) { + RunTest("a space", 111, 222, 333); +} + +TEST(OsUtilTest, TestSelfNameWithParens) { + RunTest("a(b(c((d))e)", 111, 222, 333); +} + +} // namespace kudu diff --git a/src/kudu/util/os-util.cc b/src/kudu/util/os-util.cc new file mode 100644 index 000000000000..aac109c1f53a --- /dev/null +++ b/src/kudu/util/os-util.cc @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Imported from Impala. Changes include: +// - Namespace and imports. +// - Replaced GetStrErrMsg with ErrnoToString. +// - Replaced StringParser with strings/numbers. +// - Fixes for cpplint. +// - Fixed parsing when thread names have spaces. + +#include "kudu/util/os-util.h" + +#include +#include +#include +#include +#include + +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/errno.h" + +using std::ifstream; +using std::istreambuf_iterator; +using std::stringstream; +using strings::Split; +using strings::Substitute; + +namespace kudu { + +// Ensure that Impala compiles on earlier kernels. If the target kernel does not support +// _SC_CLK_TCK, sysconf(_SC_CLK_TCK) will return -1. +#ifndef _SC_CLK_TCK +#define _SC_CLK_TCK 2 +#endif + +static const int64_t TICKS_PER_SEC = sysconf(_SC_CLK_TCK); + +// Offsets into the ../stat file array of per-thread statistics. +// +// They are themselves offset by two because the pid and comm fields of the +// file are parsed separately. 
+static const int64_t USER_TICKS = 13 - 2; +static const int64_t KERNEL_TICKS = 14 - 2; +static const int64_t IO_WAIT = 41 - 2; + +// Largest offset we are interested in, to check we get a well formed stat file. +static const int64_t MAX_OFFSET = IO_WAIT; + +Status ParseStat(const std::string& buffer, std::string* name, ThreadStats* stats) { + DCHECK(stats != nullptr); + + // The thread name should be the only field with parentheses. But the name + // itself may contain parentheses. + size_t open_paren = buffer.find('('); + size_t close_paren = buffer.rfind(')'); + if (open_paren == string::npos || // '(' must exist + close_paren == string::npos || // ')' must exist + open_paren >= close_paren || // '(' must come before ')' + close_paren + 2 == buffer.size()) { // there must be at least two chars after ')' + return Status::IOError("Unrecognised /proc format"); + } + string extracted_name = buffer.substr(open_paren + 1, close_paren - (open_paren + 1)); + string rest = buffer.substr(close_paren + 2); + vector splits = Split(rest, " ", strings::SkipEmpty()); + if (splits.size() < MAX_OFFSET) { + return Status::IOError("Unrecognised /proc format"); + } + + int64 tmp; + if (safe_strto64(splits[USER_TICKS], &tmp)) { + stats->user_ns = tmp * (1e9 / TICKS_PER_SEC); + } + if (safe_strto64(splits[KERNEL_TICKS], &tmp)) { + stats->kernel_ns = tmp * (1e9 / TICKS_PER_SEC); + } + if (safe_strto64(splits[IO_WAIT], &tmp)) { + stats->iowait_ns = tmp * (1e9 / TICKS_PER_SEC); + } + if (name != nullptr) { + *name = extracted_name; + } + return Status::OK(); + +} + +Status GetThreadStats(int64_t tid, ThreadStats* stats) { + DCHECK(stats != nullptr); + if (TICKS_PER_SEC <= 0) { + return Status::NotSupported("ThreadStats not supported"); + } + + stringstream proc_path; + proc_path << "/proc/self/task/" << tid << "/stat"; + ifstream proc_file(proc_path.str().c_str()); + if (!proc_file.is_open()) { + return Status::IOError("Could not open ifstream"); + } + + string 
buffer((istreambuf_iterator(proc_file)), + istreambuf_iterator()); + + return ParseStat(buffer, nullptr, stats); // don't want the name +} + +bool RunShellProcess(const string& cmd, string* msg) { + DCHECK(msg != nullptr); + FILE* fp = popen(cmd.c_str(), "r"); + if (fp == nullptr) { + *msg = Substitute("Failed to execute shell cmd: '$0', error was: $1", cmd, + ErrnoToString(errno)); + return false; + } + // Read the first 1024 bytes of any output before pclose() so we have some idea of what + // happened on failure. + char buf[1024]; + size_t len = fread(buf, 1, 1024, fp); + string output; + output.assign(buf, len); + + // pclose() returns an encoded form of the sub-process' exit code. + int status = pclose(fp); + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + *msg = output; + return true; + } + + *msg = Substitute("Shell cmd: '$0' exited with an error: '$1'. Output was: '$2'", cmd, + ErrnoToString(errno), output); + return false; +} + +} // namespace kudu diff --git a/src/kudu/util/os-util.h b/src/kudu/util/os-util.h new file mode 100644 index 000000000000..ae2dee3d3149 --- /dev/null +++ b/src/kudu/util/os-util.h @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Imported from Impala. 
Changes include: +// - Namespace + imports. +// - Fixes for cpplint. +// - Fixed parsing when thread names have spaces. + +#ifndef KUDU_UTIL_OS_UTIL_H +#define KUDU_UTIL_OS_UTIL_H + +#include + +#include "kudu/util/status.h" + +namespace kudu { + +// Utility methods to read interesting values from /proc. +// TODO: Get stats for parent process. + +// Container struct for statistics read from the /proc filesystem for a thread. +struct ThreadStats { + int64_t user_ns; + int64_t kernel_ns; + int64_t iowait_ns; + + // Default constructor zeroes all members in case structure can't be filled by + // GetThreadStats. + ThreadStats() : user_ns(0), kernel_ns(0), iowait_ns(0) { } +}; + +// Populates ThreadStats object using a given buffer. The buffer is expected to +// conform to /proc//task//stat layout; an error will be returned otherwise. +// +// If 'name' is supplied, the extracted thread name will be written to it. +Status ParseStat(const std::string&buffer, std::string* name, ThreadStats* stats); + +// Populates ThreadStats object for a given thread by reading from +// /proc//task//stat. Returns OK unless the file cannot be read or is in an +// unrecognised format, or if the kernel version is not modern enough. +Status GetThreadStats(int64_t tid, ThreadStats* stats); + +// Runs a shell command. Returns false if there was any error (either failure to launch or +// non-0 exit code), and true otherwise. *msg is set to an error message including the OS +// error string, if any, and the first 1k of output if there was any error, or just the +// first 1k of output otherwise. 
+bool RunShellProcess(const std::string& cmd, std::string* msg); + +} // namespace kudu + +#endif /* KUDU_UTIL_OS_UTIL_H */ diff --git a/src/kudu/util/path_util-test.cc b/src/kudu/util/path_util-test.cc new file mode 100644 index 000000000000..d36eba60643c --- /dev/null +++ b/src/kudu/util/path_util-test.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/util/path_util.h" + +namespace kudu { + +TEST(TestPathUtil, BaseNameTest) { + ASSERT_EQ(".", BaseName("")); + ASSERT_EQ(".", BaseName(".")); + ASSERT_EQ("..", BaseName("..")); + ASSERT_EQ("/", BaseName("/")); + ASSERT_EQ("/", BaseName("//")); + ASSERT_EQ("a", BaseName("a")); + ASSERT_EQ("ab", BaseName("ab")); + ASSERT_EQ("ab", BaseName("ab/")); + ASSERT_EQ("cd", BaseName("ab/cd")); + ASSERT_EQ("ab", BaseName("/ab")); + ASSERT_EQ("ab", BaseName("/ab///")); + ASSERT_EQ("cd", BaseName("/ab/cd")); +} + +TEST(TestPathUtil, DirNameTest) { + ASSERT_EQ(".", DirName("")); + ASSERT_EQ(".", DirName(".")); + ASSERT_EQ(".", DirName("..")); + ASSERT_EQ("/", DirName("/")); +#if defined(__linux__) + // On OS X this test case returns "/", while Linux returns "//". On both + // platforms dirname(1) returns "/". 
The difference is unlikely to matter in + // practice. + ASSERT_EQ("//", DirName("//")); +#else + ASSERT_EQ("/", DirName("//")); +#endif // defined(__linux__) + ASSERT_EQ(".", DirName("a")); + ASSERT_EQ(".", DirName("ab")); + ASSERT_EQ(".", DirName("ab/")); + ASSERT_EQ("ab", DirName("ab/cd")); + ASSERT_EQ("/", DirName("/ab")); + ASSERT_EQ("/", DirName("/ab///")); + ASSERT_EQ("/ab", DirName("/ab/cd")); +} + +} // namespace kudu diff --git a/src/kudu/util/path_util.cc b/src/kudu/util/path_util.cc new file mode 100644 index 000000000000..872886b52d90 --- /dev/null +++ b/src/kudu/util/path_util.cc @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/path_util.h" + +// Use the POSIX version of dirname(3). 
+#include + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" + +#if defined(__APPLE__) +#include "kudu/util/locks.h" +#endif // defined(__APPLE__) + +using std::string; + +namespace kudu { + +std::string JoinPathSegments(const std::string &a, + const std::string &b) { + CHECK(!a.empty()) << "empty first component: " << a; + CHECK(!b.empty() && b[0] != '/') + << "second path component must be non-empty and relative: " + << b; + if (a[a.size() - 1] == '/') { + return a + b; + } else { + return a + "/" + b; + } +} + +string DirName(const string& path) { + gscoped_ptr path_copy(strdup(path.c_str())); +#if defined(__APPLE__) + static Mutex lock; + lock_guard l(&lock); +#endif // defined(__APPLE__) + return ::dirname(path_copy.get()); +} + +string BaseName(const string& path) { + gscoped_ptr path_copy(strdup(path.c_str())); + return basename(path_copy.get()); +} + +} // namespace kudu diff --git a/src/kudu/util/path_util.h b/src/kudu/util/path_util.h new file mode 100644 index 000000000000..a5307d2a0825 --- /dev/null +++ b/src/kudu/util/path_util.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Utility methods for dealing with file paths. 
+#ifndef KUDU_UTIL_PATH_UTIL_H +#define KUDU_UTIL_PATH_UTIL_H + +#include + +namespace kudu { + +// Join two path segments with the appropriate path separator, +// if necessary. +std::string JoinPathSegments(const std::string &a, + const std::string &b); + +// Return the enclosing directory of path. +// This is like dirname(3) but for C++ strings. +std::string DirName(const std::string& path); + +// Return the terminal component of a path. +// This is like basename(3) but for C++ strings. +std::string BaseName(const std::string& path); + +} // namespace kudu +#endif /* KUDU_UTIL_PATH_UTIL_H */ diff --git a/src/kudu/util/pb_util-internal.cc b/src/kudu/util/pb_util-internal.cc new file mode 100644 index 000000000000..b4393279c006 --- /dev/null +++ b/src/kudu/util/pb_util-internal.cc @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "kudu/util/pb_util-internal.h" + +namespace kudu { +namespace pb_util { +namespace internal { + +//////////////////////////////////////////// +// SequentialFileFileInputStream +//////////////////////////////////////////// + +bool SequentialFileFileInputStream::Next(const void **data, int *size) { + if (PREDICT_FALSE(!status_.ok())) { + LOG(WARNING) << "Already failed on a previous read: " << status_.ToString(); + return false; + } + + size_t available = (buffer_used_ - buffer_offset_); + if (available > 0) { + *data = buffer_.get() + buffer_offset_; + *size = available; + buffer_offset_ += available; + total_read_ += available; + return true; + } + + Slice result; + status_ = rfile_->Read(buffer_size_, &result, buffer_.get()); + if (!status_.ok()) { + LOG(WARNING) << "Read at " << buffer_offset_ << " failed: " << status_.ToString(); + return false; + } + + if (result.data() != buffer_.get()) { + memcpy(buffer_.get(), result.data(), result.size()); + } + + buffer_used_ = result.size(); + buffer_offset_ = buffer_used_; + total_read_ += buffer_used_; + *data = buffer_.get(); + *size = buffer_used_; + return buffer_used_ > 0; +} + +bool SequentialFileFileInputStream::Skip(int count) { + CHECK_GT(count, 0); + int avail = (buffer_used_ - buffer_offset_); + if (avail > count) { + buffer_offset_ += count; + total_read_ += count; + } else { + buffer_used_ = 0; + buffer_offset_ = 0; + status_ = rfile_->Skip(count - avail); + total_read_ += count - avail; + } + return status_.ok(); +} + +//////////////////////////////////////////// +// WritableFileOutputStream +//////////////////////////////////////////// + +bool WritableFileOutputStream::Next(void **data, int *size) { + if (PREDICT_FALSE(!status_.ok())) { + LOG(WARNING) << "Already failed on a previous write: " << status_.ToString(); + return false; + } + + size_t available = (buffer_size_ - buffer_offset_); + if (available > 0) { + *data = buffer_.get() + buffer_offset_; + *size = available; + buffer_offset_ += 
available; + return true; + } + + if (!Flush()) { + return false; + } + + buffer_offset_ = buffer_size_; + *data = buffer_.get(); + *size = buffer_size_; + return true; +} + +} // namespace internal +} // namespace pb_util +} // namespace kudu diff --git a/src/kudu/util/pb_util-internal.h b/src/kudu/util/pb_util-internal.h new file mode 100644 index 000000000000..35edf4e01781 --- /dev/null +++ b/src/kudu/util/pb_util-internal.h @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Classes used internally by pb_util.h. +// This header should not be included by anything but pb_util and its tests. 
+#ifndef KUDU_UTIL_PB_UTIL_INTERNAL_H +#define KUDU_UTIL_PB_UTIL_INTERNAL_H + +#include +#include +#include "kudu/util/env.h" + +namespace kudu { +namespace pb_util { +namespace internal { + +// Input Stream used by ParseFromSequentialFile() +class SequentialFileFileInputStream : public google::protobuf::io::ZeroCopyInputStream { + public: + explicit SequentialFileFileInputStream(SequentialFile *rfile, + size_t buffer_size = kDefaultBufferSize) + : buffer_used_(0), buffer_offset_(0), + buffer_size_(buffer_size), buffer_(new uint8[buffer_size_]), + total_read_(0), rfile_(rfile) { + CHECK_GT(buffer_size, 0); + } + + ~SequentialFileFileInputStream() { + } + + bool Next(const void **data, int *size) OVERRIDE; + bool Skip(int count) OVERRIDE; + + void BackUp(int count) OVERRIDE { + CHECK_GE(count, 0); + CHECK_LE(count, buffer_offset_); + buffer_offset_ -= count; + total_read_ -= count; + } + + int64 ByteCount() const OVERRIDE { + return total_read_; + } + + private: + static const size_t kDefaultBufferSize = 8192; + + Status status_; + + size_t buffer_used_; + size_t buffer_offset_; + const size_t buffer_size_; + gscoped_ptr buffer_; + + size_t total_read_; + SequentialFile *rfile_; +}; + +// Output Stream used by SerializeToWritableFile() +class WritableFileOutputStream : public google::protobuf::io::ZeroCopyOutputStream { + public: + explicit WritableFileOutputStream(WritableFile *wfile, size_t buffer_size = kDefaultBufferSize) + : buffer_offset_(0), buffer_size_(buffer_size), buffer_(new uint8[buffer_size_]), + flushed_(0), wfile_(wfile) { + CHECK_GT(buffer_size, 0); + } + + ~WritableFileOutputStream() { + } + + bool Flush() { + if (buffer_offset_ > 0) { + Slice data(buffer_.get(), buffer_offset_); + status_ = wfile_->Append(data); + flushed_ += buffer_offset_; + buffer_offset_ = 0; + } + return status_.ok(); + } + + bool Next(void **data, int *size) OVERRIDE; + + void BackUp(int count) OVERRIDE { + CHECK_GE(count, 0); + CHECK_LE(count, buffer_offset_); + 
buffer_offset_ -= count; + } + + int64 ByteCount() const OVERRIDE { + return flushed_ + buffer_offset_; + } + + private: + static const size_t kDefaultBufferSize = 8192; + + Status status_; + + size_t buffer_offset_; + const size_t buffer_size_; + gscoped_ptr buffer_; + + size_t flushed_; + WritableFile *wfile_; +}; + +} // namespace internal +} // namespace pb_util +} // namespace kudu +#endif diff --git a/src/kudu/util/pb_util-test.cc b/src/kudu/util/pb_util-test.cc new file mode 100644 index 000000000000..ba61d585b421 --- /dev/null +++ b/src/kudu/util/pb_util-test.cc @@ -0,0 +1,424 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/env_util.h" +#include "kudu/util/memenv/memenv.h" +#include "kudu/util/pb_util.h" +#include "kudu/util/pb_util-internal.h" +#include "kudu/util/proto_container_test.pb.h" +#include "kudu/util/proto_container_test2.pb.h" +#include "kudu/util/proto_container_test3.pb.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +namespace kudu { +namespace pb_util { + +using google::protobuf::FileDescriptorSet; +using internal::WritableFileOutputStream; +using std::ostringstream; +using std::shared_ptr; +using std::string; +using std::vector; + +static const char* kTestFileName = "pb_container.meta"; +static const char* kTestKeyvalName = "my-key"; +static const int kTestKeyvalValue = 1; + +class TestPBUtil : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + path_ = GetTestPath(kTestFileName); + } + + protected: + // Create a container file with expected values. + // Since this is a unit test class, and we want it to be fast, we do not + // fsync by default. + Status CreateKnownGoodContainerFile(CreateMode create = OVERWRITE, + SyncMode sync = NO_SYNC); + + // XORs the data in the specified range of the file at the given path. + Status BitFlipFileByteRange(const string& path, uint64_t offset, uint64_t length); + + void DumpPBCToString(const string& path, bool oneline_output, string* ret); + + // Output file name for most unit tests. + string path_; +}; + +Status TestPBUtil::CreateKnownGoodContainerFile(CreateMode create, SyncMode sync) { + ProtoContainerTestPB test_pb; + test_pb.set_name(kTestKeyvalName); + test_pb.set_value(kTestKeyvalValue); + return WritePBContainerToPath(env_.get(), path_, test_pb, create, sync); +} + +Status TestPBUtil::BitFlipFileByteRange(const string& path, uint64_t offset, uint64_t length) { + faststring buf; + // Read the data from disk. 
+ { + gscoped_ptr file; + RETURN_NOT_OK(env_->NewRandomAccessFile(path, &file)); + uint64_t size; + RETURN_NOT_OK(file->Size(&size)); + Slice slice; + faststring scratch; + scratch.resize(size); + RETURN_NOT_OK(env_util::ReadFully(file.get(), 0, size, &slice, scratch.data())); + buf.append(slice.data(), slice.size()); + } + + // Flip the bits. + for (uint64_t i = 0; i < length; i++) { + uint8_t* addr = buf.data() + offset + i; + *addr = ~*addr; + } + + // Write the data back to disk. + gscoped_ptr file; + RETURN_NOT_OK(env_->NewWritableFile(path, &file)); + RETURN_NOT_OK(file->Append(buf)); + RETURN_NOT_OK(file->Close()); + + return Status::OK(); +} + +TEST_F(TestPBUtil, TestWritableFileOutputStream) { + gscoped_ptr env(NewMemEnv(Env::Default())); + shared_ptr file; + ASSERT_OK(env_util::OpenFileForWrite(env.get(), "/test", &file)); + + WritableFileOutputStream stream(file.get(), 4096); + + void* buf; + int size; + + // First call should yield the whole buffer. + ASSERT_TRUE(stream.Next(&buf, &size)); + ASSERT_EQ(4096, size); + ASSERT_EQ(4096, stream.ByteCount()); + + // Backup 1000 and the next call should yield 1000 + stream.BackUp(1000); + ASSERT_EQ(3096, stream.ByteCount()); + + ASSERT_TRUE(stream.Next(&buf, &size)); + ASSERT_EQ(1000, size); + + // Another call should flush and yield a new buffer of 4096 + ASSERT_TRUE(stream.Next(&buf, &size)); + ASSERT_EQ(4096, size); + ASSERT_EQ(8192, stream.ByteCount()); + + // Should be able to backup to 7192 + stream.BackUp(1000); + ASSERT_EQ(7192, stream.ByteCount()); + + // Flushing shouldn't change written count. + ASSERT_TRUE(stream.Flush()); + ASSERT_EQ(7192, stream.ByteCount()); + + // Since we just flushed, we should get another full buffer. + ASSERT_TRUE(stream.Next(&buf, &size)); + ASSERT_EQ(4096, size); + ASSERT_EQ(7192 + 4096, stream.ByteCount()); + + ASSERT_TRUE(stream.Flush()); + + ASSERT_EQ(stream.ByteCount(), file->Size()); +} + +// Basic read/write test. 
+TEST_F(TestPBUtil, TestPBContainerSimple) { + // Exercise both the SYNC and NO_SYNC codepaths, despite the fact that we + // aren't able to observe a difference in the test. + vector modes = { SYNC, NO_SYNC }; + for (SyncMode mode : modes) { + + // Write the file. + ASSERT_OK(CreateKnownGoodContainerFile(NO_OVERWRITE, mode)); + + // Read it back, should validate and contain the expected values. + ProtoContainerTestPB test_pb; + ASSERT_OK(ReadPBContainerFromPath(env_.get(), path_, &test_pb)); + ASSERT_EQ(kTestKeyvalName, test_pb.name()); + ASSERT_EQ(kTestKeyvalValue, test_pb.value()); + + // Delete the file. + ASSERT_OK(env_->DeleteFile(path_)); + } +} + +// Corruption / various failure mode test. +TEST_F(TestPBUtil, TestPBContainerCorruption) { + // Test that we indicate when the file does not exist. + ProtoContainerTestPB test_pb; + Status s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsNotFound()) << "Should not be found: " << path_ << ": " << s.ToString(); + + // Test that an empty file looks like corruption. + { + // Create the empty file. + gscoped_ptr file; + ASSERT_OK(env_->NewWritableFile(path_, &file)); + ASSERT_OK(file->Close()); + } + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsCorruption()) << "Should be zero length: " << path_ << ": " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "File size not large enough to be valid"); + + // Test truncated file. 
+ ASSERT_OK(CreateKnownGoodContainerFile()); + uint64_t known_good_size = 0; + ASSERT_OK(env_->GetFileSize(path_, &known_good_size)); + int ret = truncate(path_.c_str(), known_good_size - 2); + if (ret != 0) { + PLOG(ERROR) << "truncate() of file " << path_ << " failed"; + FAIL(); + } + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsCorruption()) << "Should be incorrect size: " << path_ << ": " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "File size not large enough to be valid"); + + // Test corrupted magic. + ASSERT_OK(CreateKnownGoodContainerFile()); + ASSERT_OK(BitFlipFileByteRange(path_, 0, 2)); + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsCorruption()) << "Should have invalid magic: " << path_ << ": " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Invalid magic number"); + + // Test corrupted version. + ASSERT_OK(CreateKnownGoodContainerFile()); + ASSERT_OK(BitFlipFileByteRange(path_, 8, 2)); + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsNotSupported()) << "Should have unsupported version number: " << path_ << ": " + << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "we only support version 1"); + + // Test corrupted size. + ASSERT_OK(CreateKnownGoodContainerFile()); + ASSERT_OK(BitFlipFileByteRange(path_, 12, 2)); + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsCorruption()) << "Should be incorrect size: " << path_ << ": " << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "File size not large enough to be valid"); + + // Test corrupted data (looks like bad checksum). + ASSERT_OK(CreateKnownGoodContainerFile()); + ASSERT_OK(BitFlipFileByteRange(path_, 16, 2)); + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsCorruption()) << "Should be incorrect checksum: " << path_ << ": " + << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Incorrect checksum"); + + // Test corrupted checksum. 
+ ASSERT_OK(CreateKnownGoodContainerFile()); + ASSERT_OK(BitFlipFileByteRange(path_, known_good_size - 4, 2)); + s = ReadPBContainerFromPath(env_.get(), path_, &test_pb); + ASSERT_TRUE(s.IsCorruption()) << "Should be incorrect checksum: " << path_ << ": " + << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "Incorrect checksum"); +} + +TEST_F(TestPBUtil, TestMultipleMessages) { + ProtoContainerTestPB pb; + pb.set_name("foo"); + pb.set_note("bar"); + + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile(path_, &writer)); + WritablePBContainerFile pb_writer(writer.Pass()); + ASSERT_OK(pb_writer.Init(pb)); + + for (int i = 0; i < 10; i++) { + pb.set_value(i); + ASSERT_OK(pb_writer.Append(pb)); + } + ASSERT_OK(pb_writer.Close()); + + int pbs_read = 0; + gscoped_ptr reader; + ASSERT_OK(env_->NewRandomAccessFile(path_, &reader)); + ReadablePBContainerFile pb_reader(reader.Pass()); + ASSERT_OK(pb_reader.Init()); + for (int i = 0;; i++) { + ProtoContainerTestPB read_pb; + Status s = pb_reader.ReadNextPB(&read_pb); + if (s.IsEndOfFile()) { + break; + } + ASSERT_OK(s); + ASSERT_EQ(pb.name(), read_pb.name()); + ASSERT_EQ(read_pb.value(), i); + ASSERT_EQ(pb.note(), read_pb.note()); + pbs_read++; + } + ASSERT_EQ(10, pbs_read); + ASSERT_OK(pb_reader.Close()); +} + +TEST_F(TestPBUtil, TestInterleavedReadWrite) { + ProtoContainerTestPB pb; + pb.set_name("foo"); + pb.set_note("bar"); + + // Open the file for writing and reading. + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile(path_, &writer)); + WritablePBContainerFile pb_writer(writer.Pass()); + gscoped_ptr reader; + ASSERT_OK(env_->NewRandomAccessFile(path_, &reader)); + ReadablePBContainerFile pb_reader(reader.Pass()); + + // Write the header (writer) and validate it (reader). + ASSERT_OK(pb_writer.Init(pb)); + ASSERT_OK(pb_reader.Init()); + + for (int i = 0; i < 10; i++) { + // Write a message and read it back. 
+ pb.set_value(i); + ASSERT_OK(pb_writer.Append(pb)); + ProtoContainerTestPB read_pb; + ASSERT_OK(pb_reader.ReadNextPB(&read_pb)); + ASSERT_EQ(pb.name(), read_pb.name()); + ASSERT_EQ(read_pb.value(), i); + ASSERT_EQ(pb.note(), read_pb.note()); + } + + // After closing the writer, the reader should be out of data. + ASSERT_OK(pb_writer.Close()); + ASSERT_TRUE(pb_reader.ReadNextPB(nullptr).IsEndOfFile()); + ASSERT_OK(pb_reader.Close()); +} + +TEST_F(TestPBUtil, TestPopulateDescriptorSet) { + { + // No dependencies --> just one proto. + ProtoContainerTestPB pb; + FileDescriptorSet protos; + WritablePBContainerFile::PopulateDescriptorSet( + pb.GetDescriptor()->file(), &protos); + ASSERT_EQ(1, protos.file_size()); + } + { + // One direct dependency --> two protos. + ProtoContainerTest2PB pb; + FileDescriptorSet protos; + WritablePBContainerFile::PopulateDescriptorSet( + pb.GetDescriptor()->file(), &protos); + ASSERT_EQ(2, protos.file_size()); + } + { + // One direct and one indirect dependency --> three protos. 
+ ProtoContainerTest3PB pb; + FileDescriptorSet protos; + WritablePBContainerFile::PopulateDescriptorSet( + pb.GetDescriptor()->file(), &protos); + ASSERT_EQ(3, protos.file_size()); + } +} + +void TestPBUtil::DumpPBCToString(const string& path, bool oneline_output, + string* ret) { + gscoped_ptr reader; + ASSERT_OK(env_->NewRandomAccessFile(path, &reader)); + ReadablePBContainerFile pb_reader(reader.Pass()); + ASSERT_OK(pb_reader.Init()); + ostringstream oss; + ASSERT_OK(pb_reader.Dump(&oss, oneline_output)); + ASSERT_OK(pb_reader.Close()); + *ret = oss.str(); +} + +TEST_F(TestPBUtil, TestDumpPBContainer) { + const char* kExpectedOutput = + "Message 0\n" + "-------\n" + "record_one {\n" + " name: \"foo\"\n" + " value: 0\n" + "}\n" + "record_two {\n" + " record {\n" + " name: \"foo\"\n" + " value: 0\n" + " }\n" + "}\n" + "\n" + "Message 1\n" + "-------\n" + "record_one {\n" + " name: \"foo\"\n" + " value: 1\n" + "}\n" + "record_two {\n" + " record {\n" + " name: \"foo\"\n" + " value: 2\n" + " }\n" + "}\n\n"; + + const char* kExpectedOutputShort = + "0\trecord_one { name: \"foo\" value: 0 } record_two { record { name: \"foo\" value: 0 } }\n" + "1\trecord_one { name: \"foo\" value: 1 } record_two { record { name: \"foo\" value: 2 } }\n"; + + ProtoContainerTest3PB pb; + pb.mutable_record_one()->set_name("foo"); + pb.mutable_record_two()->mutable_record()->set_name("foo"); + + gscoped_ptr writer; + ASSERT_OK(env_->NewWritableFile(path_, &writer)); + WritablePBContainerFile pb_writer(writer.Pass()); + ASSERT_OK(pb_writer.Init(pb)); + + for (int i = 0; i < 2; i++) { + pb.mutable_record_one()->set_value(i); + pb.mutable_record_two()->mutable_record()->set_value(i*2); + ASSERT_OK(pb_writer.Append(pb)); + } + ASSERT_OK(pb_writer.Close()); + + string output; + DumpPBCToString(path_, false, &output); + ASSERT_STREQ(kExpectedOutput, output.c_str()); + + DumpPBCToString(path_, true, &output); + ASSERT_STREQ(kExpectedOutputShort, output.c_str()); +} + +TEST_F(TestPBUtil, 
TestOverwriteExistingPB) { + ASSERT_OK(CreateKnownGoodContainerFile(NO_OVERWRITE)); + ASSERT_TRUE(CreateKnownGoodContainerFile(NO_OVERWRITE).IsAlreadyPresent()); + ASSERT_OK(CreateKnownGoodContainerFile(OVERWRITE)); + ASSERT_OK(CreateKnownGoodContainerFile(OVERWRITE)); +} + +} // namespace pb_util +} // namespace kudu diff --git a/src/kudu/util/pb_util.cc b/src/kudu/util/pb_util.cc new file mode 100644 index 000000000000..0e89d646ffe4 --- /dev/null +++ b/src/kudu/util/pb_util.cc @@ -0,0 +1,664 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions copyright (C) 2008, Google, inc. +// +// Utilities for working with protobufs. +// Some of this code is cribbed from the protobuf source, +// but modified to work with kudu's 'faststring' instead of STL strings. 
+ +#include "kudu/util/pb_util.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/bind.h" +#include "kudu/gutil/callback.h" +#include "kudu/gutil/map-util.h" +#include "kudu/gutil/strings/escaping.h" +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/coding-inl.h" +#include "kudu/util/coding.h" +#include "kudu/util/crc.h" +#include "kudu/util/debug/sanitizer_scopes.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/env.h" +#include "kudu/util/env_util.h" +#include "kudu/util/path_util.h" +#include "kudu/util/pb_util-internal.h" +#include "kudu/util/pb_util.pb.h" +#include "kudu/util/status.h" + +using google::protobuf::Descriptor; +using google::protobuf::DescriptorPool; +using google::protobuf::DynamicMessageFactory; +using google::protobuf::FieldDescriptor; +using google::protobuf::FileDescriptor; +using google::protobuf::FileDescriptorProto; +using google::protobuf::FileDescriptorSet; +using google::protobuf::io::ArrayInputStream; +using google::protobuf::io::CodedInputStream; +using google::protobuf::Message; +using google::protobuf::MessageLite; +using google::protobuf::Reflection; +using google::protobuf::SimpleDescriptorDatabase; +using kudu::crc::Crc; +using kudu::pb_util::internal::SequentialFileFileInputStream; +using kudu::pb_util::internal::WritableFileOutputStream; +using std::deque; +using std::endl; +using std::shared_ptr; +using std::string; +using std::unordered_set; +using std::vector; +using strings::Substitute; +using strings::Utf8SafeCEscape; + +static const char* const kTmpTemplateSuffix = ".tmp.XXXXXX"; + +// Protobuf container constants. 
+static const int kPBContainerVersion = 1; +static const char kPBContainerMagic[] = "kuducntr"; +static const int kPBContainerMagicLen = 8; +static const int kPBContainerHeaderLen = + // magic number + version + kPBContainerMagicLen + sizeof(uint32_t); +static const int kPBContainerChecksumLen = sizeof(uint32_t); + +COMPILE_ASSERT((arraysize(kPBContainerMagic) - 1) == kPBContainerMagicLen, + kPBContainerMagic_does_not_match_expected_length); + +namespace kudu { +namespace pb_util { + +namespace { + +// When serializing, we first compute the byte size, then serialize the message. +// If serialization produces a different number of bytes than expected, we +// call this function, which crashes. The problem could be due to a bug in the +// protobuf implementation but is more likely caused by concurrent modification +// of the message. This function attempts to distinguish between the two and +// provide a useful error message. +void ByteSizeConsistencyError(int byte_size_before_serialization, + int byte_size_after_serialization, + int bytes_produced_by_serialization) { + CHECK_EQ(byte_size_before_serialization, byte_size_after_serialization) + << "Protocol message was modified concurrently during serialization."; + CHECK_EQ(bytes_produced_by_serialization, byte_size_before_serialization) + << "Byte size calculation and serialization were inconsistent. 
This " + "may indicate a bug in protocol buffers or it may be caused by " + "concurrent modification of the message."; + LOG(FATAL) << "This shouldn't be called if all the sizes are equal."; +} + +string InitializationErrorMessage(const char* action, + const MessageLite& message) { + // Note: We want to avoid depending on strutil in the lite library, otherwise + // we'd use: + // + // return strings::Substitute( + // "Can't $0 message of type \"$1\" because it is missing required " + // "fields: $2", + // action, message.GetTypeName(), + // message.InitializationErrorString()); + + string result; + result += "Can't "; + result += action; + result += " message of type \""; + result += message.GetTypeName(); + result += "\" because it is missing required fields: "; + result += message.InitializationErrorString(); + return result; +} + +} // anonymous namespace + +bool AppendToString(const MessageLite &msg, faststring *output) { + DCHECK(msg.IsInitialized()) << InitializationErrorMessage("serialize", msg); + return AppendPartialToString(msg, output); +} + +bool AppendPartialToString(const MessageLite &msg, faststring* output) { + int old_size = output->size(); + int byte_size = msg.ByteSize(); + + output->resize(old_size + byte_size); + + uint8* start = &((*output)[old_size]); + uint8* end = msg.SerializeWithCachedSizesToArray(start); + if (end - start != byte_size) { + ByteSizeConsistencyError(byte_size, msg.ByteSize(), end - start); + } + return true; +} + +bool SerializeToString(const MessageLite &msg, faststring *output) { + output->clear(); + return AppendToString(msg, output); +} + +bool ParseFromSequentialFile(MessageLite *msg, SequentialFile *rfile) { + SequentialFileFileInputStream istream(rfile); + return msg->ParseFromZeroCopyStream(&istream); +} + +Status ParseFromArray(MessageLite* msg, const uint8_t* data, uint32_t length) { + if (!msg->ParseFromArray(data, length)) { + return Status::Corruption("Error parsing msg", InitializationErrorMessage("parse", 
*msg)); + } + return Status::OK(); +} + +Status WritePBToPath(Env* env, const std::string& path, + const MessageLite& msg, + SyncMode sync) { + const string tmp_template = path + kTmpTemplateSuffix; + string tmp_path; + + gscoped_ptr file; + RETURN_NOT_OK(env->NewTempWritableFile(WritableFileOptions(), tmp_template, &tmp_path, &file)); + env_util::ScopedFileDeleter tmp_deleter(env, tmp_path); + + WritableFileOutputStream ostream(file.get()); + bool res = msg.SerializeToZeroCopyStream(&ostream); + if (!res || !ostream.Flush()) { + return Status::IOError("Unable to serialize PB to file"); + } + + if (sync == pb_util::SYNC) { + RETURN_NOT_OK_PREPEND(file->Sync(), "Failed to Sync() " + tmp_path); + } + RETURN_NOT_OK_PREPEND(file->Close(), "Failed to Close() " + tmp_path); + RETURN_NOT_OK_PREPEND(env->RenameFile(tmp_path, path), "Failed to rename tmp file to " + path); + tmp_deleter.Cancel(); + if (sync == pb_util::SYNC) { + RETURN_NOT_OK_PREPEND(env->SyncDir(DirName(path)), "Failed to SyncDir() parent of " + path); + } + return Status::OK(); +} + +Status ReadPBFromPath(Env* env, const std::string& path, MessageLite* msg) { + shared_ptr rfile; + RETURN_NOT_OK(env_util::OpenFileForSequential(env, path, &rfile)); + if (!ParseFromSequentialFile(msg, rfile.get())) { + return Status::IOError("Unable to parse PB from path", path); + } + return Status::OK(); +} + +static void TruncateString(string* s, int max_len) { + if (s->size() > max_len) { + s->resize(max_len); + s->append(""); + } +} + +void TruncateFields(Message* message, int max_len) { + const Reflection* reflection = message->GetReflection(); + vector fields; + reflection->ListFields(*message, &fields); + for (const FieldDescriptor* field : fields) { + if (field->is_repeated()) { + for (int i = 0; i < reflection->FieldSize(*message, field); i++) { + switch (field->cpp_type()) { + case FieldDescriptor::CPPTYPE_STRING: { + const string& s_const = reflection->GetRepeatedStringReference(*message, field, i, + nullptr); + 
TruncateString(const_cast(&s_const), max_len); + break; + } + case FieldDescriptor::CPPTYPE_MESSAGE: { + TruncateFields(reflection->MutableRepeatedMessage(message, field, i), max_len); + break; + } + default: + break; + } + } + } else { + switch (field->cpp_type()) { + case FieldDescriptor::CPPTYPE_STRING: { + const string& s_const = reflection->GetStringReference(*message, field, nullptr); + TruncateString(const_cast(&s_const), max_len); + break; + } + case FieldDescriptor::CPPTYPE_MESSAGE: { + TruncateFields(reflection->MutableMessage(message, field), max_len); + break; + } + default: + break; + } + } + } +} + +WritablePBContainerFile::WritablePBContainerFile(gscoped_ptr writer) + : closed_(false), + writer_(writer.Pass()) { +} + +WritablePBContainerFile::~WritablePBContainerFile() { + WARN_NOT_OK(Close(), "Could not Close() when destroying file"); +} + +Status WritablePBContainerFile::Init(const Message& msg) { + DCHECK(!closed_); + + faststring buf; + buf.resize(kPBContainerHeaderLen); + + // Serialize the magic. + strings::memcpy_inlined(buf.data(), kPBContainerMagic, kPBContainerMagicLen); + size_t offset = kPBContainerMagicLen; + + // Serialize the version. + InlineEncodeFixed32(buf.data() + offset, kPBContainerVersion); + offset += sizeof(uint32_t); + DCHECK_EQ(kPBContainerHeaderLen, offset) + << "Serialized unexpected number of total bytes"; + + // Serialize the supplemental header. + ContainerSupHeaderPB sup_header; + PopulateDescriptorSet(msg.GetDescriptor()->file(), + sup_header.mutable_protos()); + sup_header.set_pb_type(msg.GetTypeName()); + RETURN_NOT_OK_PREPEND(AppendMsgToBuffer(sup_header, &buf), + "Failed to prepare supplemental header for writing"); + + // Write the serialized buffer to the file. 
+ RETURN_NOT_OK_PREPEND(writer_->Append(buf), + "Failed to Append() header to file"); + return Status::OK(); +} + +Status WritablePBContainerFile::Append(const Message& msg) { + DCHECK(!closed_); + + faststring buf; + RETURN_NOT_OK_PREPEND(AppendMsgToBuffer(msg, &buf), + "Failed to prepare buffer for writing"); + RETURN_NOT_OK_PREPEND(writer_->Append(buf), "Failed to Append() data to file"); + + return Status::OK(); +} + +Status WritablePBContainerFile::Flush() { + DCHECK(!closed_); + + // TODO: Flush just the dirty bytes. + RETURN_NOT_OK_PREPEND(writer_->Flush(WritableFile::FLUSH_ASYNC), "Failed to Flush() file"); + + return Status::OK(); +} + +Status WritablePBContainerFile::Sync() { + DCHECK(!closed_); + + RETURN_NOT_OK_PREPEND(writer_->Sync(), "Failed to Sync() file"); + + return Status::OK(); +} + +Status WritablePBContainerFile::Close() { + if (!closed_) { + closed_ = true; + + RETURN_NOT_OK_PREPEND(writer_->Close(), "Failed to Close() file"); + } + + return Status::OK(); +} + +Status WritablePBContainerFile::AppendMsgToBuffer(const Message& msg, faststring* buf) { + DCHECK(msg.IsInitialized()) << InitializationErrorMessage("serialize", msg); + int data_size = msg.ByteSize(); + uint64_t bufsize = sizeof(uint32_t) + data_size + kPBContainerChecksumLen; + + // Grow the buffer to hold the new data. + size_t orig_size = buf->size(); + buf->resize(orig_size + bufsize); + uint8_t* dst = buf->data() + orig_size; + + // Serialize the data size. + InlineEncodeFixed32(dst, static_cast(data_size)); + size_t offset = sizeof(uint32_t); + + // Serialize the data. + if (PREDICT_FALSE(!msg.SerializeWithCachedSizesToArray(dst + offset))) { + return Status::IOError("Failed to serialize PB to array"); + } + offset += data_size; + + // Calculate and serialize the checksum. 
+ uint32_t checksum = crc::Crc32c(dst, offset); + InlineEncodeFixed32(dst + offset, checksum); + offset += kPBContainerChecksumLen; + + DCHECK_EQ(bufsize, offset) << "Serialized unexpected number of total bytes"; + return Status::OK(); +} + +void WritablePBContainerFile::PopulateDescriptorSet( + const FileDescriptor* desc, FileDescriptorSet* output) { + // Because we don't compile protobuf with TSAN enabled, copying the + // static PB descriptors in this function ends up triggering a lot of + // race reports. We suppress the reports, but TSAN still has to walk + // the stack, etc, and this function becomes very slow. So, we ignore + // TSAN here. + debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; + + FileDescriptorSet all_descs; + + // Tracks all schemas that have been added to 'unemitted' at one point + // or another. Is a superset of 'unemitted' and only ever grows. + unordered_set processed; + + // Tracks all remaining unemitted schemas. + deque unemitted; + + InsertOrDie(&processed, desc); + unemitted.push_front(desc); + while (!unemitted.empty()) { + const FileDescriptor* proto = unemitted.front(); + + // The current schema is emitted iff we've processed (i.e. emitted) all + // of its dependencies. + bool emit = true; + for (int i = 0; i < proto->dependency_count(); i++) { + const FileDescriptor* dep = proto->dependency(i); + if (InsertIfNotPresent(&processed, dep)) { + unemitted.push_front(dep); + emit = false; + } + } + if (emit) { + unemitted.pop_front(); + proto->CopyTo(all_descs.mutable_file()->Add()); + } + } + all_descs.Swap(output); +} + +ReadablePBContainerFile::ReadablePBContainerFile(gscoped_ptr reader) + : offset_(0), + reader_(reader.Pass()) { +} + +ReadablePBContainerFile::~ReadablePBContainerFile() { + WARN_NOT_OK(Close(), "Could not Close() when destroying file"); +} + +Status ReadablePBContainerFile::Init() { + // Read header data. 
+ Slice header; + gscoped_ptr scratch; + RETURN_NOT_OK_PREPEND(ValidateAndRead(kPBContainerHeaderLen, EOF_NOT_OK, &header, &scratch), + Substitute("Could not read header for proto container file $0", + reader_->filename())); + + // Validate magic number. + if (PREDICT_FALSE(!strings::memeq(kPBContainerMagic, header.data(), kPBContainerMagicLen))) { + string file_magic(reinterpret_cast(header.data()), kPBContainerMagicLen); + return Status::Corruption("Invalid magic number", + Substitute("Expected: $0, found: $1", + Utf8SafeCEscape(kPBContainerMagic), + Utf8SafeCEscape(file_magic))); + } + + // Validate container file version. + uint32_t version = DecodeFixed32(header.data() + kPBContainerMagicLen); + if (PREDICT_FALSE(version != kPBContainerVersion)) { + // We only support version 1. + return Status::NotSupported( + Substitute("Protobuf container has version $0, we only support version $1", + version, kPBContainerVersion)); + } + + // Read the supplemental header. + ContainerSupHeaderPB sup_header; + RETURN_NOT_OK_PREPEND(ReadNextPB(&sup_header), Substitute( + "Could not read supplemental header from proto container file $0", + reader_->filename())); + protos_.reset(sup_header.release_protos()); + pb_type_ = sup_header.pb_type(); + + return Status::OK(); +} + +Status ReadablePBContainerFile::ReadNextPB(Message* msg) { + VLOG(1) << "Reading PB from offset " << offset_; + + // Read the size from the file. EOF here is acceptable: it means we're + // out of PB entries. + Slice size; + gscoped_ptr size_scratch; + RETURN_NOT_OK_PREPEND(ValidateAndRead(sizeof(uint32_t), EOF_OK, &size, &size_scratch), + Substitute("Could not read data size from proto container file $0", + reader_->filename())); + uint32_t data_size = DecodeFixed32(size.data()); + + // Read body into buffer for checksum & parsing. 
+ Slice body; + gscoped_ptr body_scratch; + RETURN_NOT_OK_PREPEND(ValidateAndRead(data_size, EOF_NOT_OK, &body, &body_scratch), + Substitute("Could not read body from proto container file $0", + reader_->filename())); + + // Read checksum. + uint32_t expected_checksum = 0; + { + Slice encoded_checksum; + gscoped_ptr encoded_checksum_scratch; + RETURN_NOT_OK_PREPEND(ValidateAndRead(kPBContainerChecksumLen, EOF_NOT_OK, + &encoded_checksum, &encoded_checksum_scratch), + Substitute("Could not read checksum from proto container file $0", + reader_->filename())); + expected_checksum = DecodeFixed32(encoded_checksum.data()); + } + + // Validate CRC32C checksum. + Crc* crc32c = crc::GetCrc32cInstance(); + uint64_t actual_checksum = 0; + // Compute a rolling checksum over the two byte arrays (size, body). + crc32c->Compute(size.data(), size.size(), &actual_checksum); + crc32c->Compute(body.data(), body.size(), &actual_checksum); + if (PREDICT_FALSE(actual_checksum != expected_checksum)) { + return Status::Corruption(Substitute("Incorrect checksum of file $0: actually $1, expected $2", + reader_->filename(), actual_checksum, expected_checksum)); + } + + // The checksum is correct. Time to decode the body. + // + // We could compare pb_type_ against msg.GetTypeName(), but: + // 1. pb_type_ is not available when reading the supplemental header, + // 2. ParseFromArray() should fail if the data cannot be parsed into the + // provided message type. + + // To permit parsing of very large PB messages, we must use parse through a + // CodedInputStream and bump the byte limit. The SetTotalBytesLimit() docs + // say that 512MB is the shortest theoretical message length that may produce + // integer overflow warnings, so that's what we'll use. 
+ ArrayInputStream ais(body.data(), body.size()); + CodedInputStream cis(&ais); + cis.SetTotalBytesLimit(512 * 1024 * 1024, -1); + if (PREDICT_FALSE(!msg->ParseFromCodedStream(&cis))) { + return Status::IOError("Unable to parse PB from path", reader_->filename()); + } + + return Status::OK(); +} + +Status ReadablePBContainerFile::Dump(ostream* os, bool oneline) { + // Use the embedded protobuf information from the container file to + // create the appropriate kind of protobuf Message. + // + // Loading the schemas into a DescriptorDatabase (and not directly into + // a DescriptorPool) defers resolution until FindMessageTypeByName() + // below, allowing for schemas to be loaded in any order. + SimpleDescriptorDatabase db; + for (int i = 0; i < protos()->file_size(); i++) { + if (!db.Add(protos()->file(i))) { + return Status::Corruption("Descriptor not loaded", Substitute( + "Could not load descriptor for PB type $0 referenced in container file", + pb_type())); + } + } + DescriptorPool pool(&db); + const Descriptor* desc = pool.FindMessageTypeByName(pb_type()); + if (!desc) { + return Status::NotFound("Descriptor not found", Substitute( + "Could not find descriptor for PB type $0 referenced in container file", + pb_type())); + } + DynamicMessageFactory factory; + const Message* prototype = factory.GetPrototype(desc); + if (!prototype) { + return Status::NotSupported("Descriptor not supported", Substitute( + "Descriptor $0 referenced in container file not supported", + pb_type())); + } + gscoped_ptr msg(prototype->New()); + + // Dump each message in the container file. + int count = 0; + Status s; + for (s = ReadNextPB(msg.get()); + s.ok(); + s = ReadNextPB(msg.get())) { + if (oneline) { + *os << count++ << "\t" << msg->ShortDebugString() << endl; + } else { + *os << "Message " << count << endl; + *os << "-------" << endl; + *os << msg->DebugString() << endl; + count++; + } + } + return s.IsEndOfFile() ? 
s.OK() : s; +} + +Status ReadablePBContainerFile::Close() { + gscoped_ptr deleter; + deleter.swap(reader_); + return Status::OK(); +} + +Status ReadablePBContainerFile::ValidateAndRead(size_t length, EofOK eofOK, + Slice* result, gscoped_ptr* scratch) { + // Validate the read length using the file size. + uint64_t file_size; + RETURN_NOT_OK(reader_->Size(&file_size)); + if (offset_ + length > file_size) { + switch (eofOK) { + case EOF_OK: + return Status::EndOfFile("Reached end of file"); + case EOF_NOT_OK: + return Status::Corruption("File size not large enough to be valid", + Substitute("Proto container file $0: " + "tried to read $0 bytes at offset " + "$1 but file size is only $2", + reader_->filename(), length, + offset_, file_size)); + default: + LOG(FATAL) << "Unknown value for eofOK: " << eofOK; + } + } + + // Perform the read. + Slice s; + gscoped_ptr local_scratch(new uint8_t[length]); + RETURN_NOT_OK(reader_->Read(offset_, length, &s, local_scratch.get())); + + // Sanity check the result. 
+ if (PREDICT_FALSE(s.size() < length)) { + return Status::Corruption("Unexpected short read", Substitute( + "Proto container file $0: tried to read $1 bytes; got $2 bytes", + reader_->filename(), length, s.size())); + } + + *result = s; + scratch->swap(local_scratch); + offset_ += s.size(); + return Status::OK(); +} + + +Status ReadPBContainerFromPath(Env* env, const std::string& path, Message* msg) { + gscoped_ptr file; + RETURN_NOT_OK(env->NewRandomAccessFile(path, &file)); + + ReadablePBContainerFile pb_file(file.Pass()); + RETURN_NOT_OK(pb_file.Init()); + RETURN_NOT_OK(pb_file.ReadNextPB(msg)); + return pb_file.Close(); +} + +Status WritePBContainerToPath(Env* env, const std::string& path, + const Message& msg, + CreateMode create, + SyncMode sync) { + TRACE_EVENT2("io", "WritePBContainerToPath", + "path", path, + "msg_type", msg.GetTypeName()); + + if (create == NO_OVERWRITE && env->FileExists(path)) { + return Status::AlreadyPresent(Substitute("File $0 already exists", path)); + } + + const string tmp_template = path + kTmpTemplateSuffix; + string tmp_path; + + gscoped_ptr file; + RETURN_NOT_OK(env->NewTempWritableFile(WritableFileOptions(), tmp_template, &tmp_path, &file)); + env_util::ScopedFileDeleter tmp_deleter(env, tmp_path); + + WritablePBContainerFile pb_file(file.Pass()); + RETURN_NOT_OK(pb_file.Init(msg)); + RETURN_NOT_OK(pb_file.Append(msg)); + if (sync == pb_util::SYNC) { + RETURN_NOT_OK(pb_file.Sync()); + } + RETURN_NOT_OK(pb_file.Close()); + RETURN_NOT_OK_PREPEND(env->RenameFile(tmp_path, path), + "Failed to rename tmp file to " + path); + tmp_deleter.Cancel(); + if (sync == pb_util::SYNC) { + RETURN_NOT_OK_PREPEND(env->SyncDir(DirName(path)), + "Failed to SyncDir() parent of " + path); + } + return Status::OK(); +} + +} // namespace pb_util +} // namespace kudu diff --git a/src/kudu/util/pb_util.h b/src/kudu/util/pb_util.h new file mode 100644 index 000000000000..2e9f7ce6a6e9 --- /dev/null +++ b/src/kudu/util/pb_util.h @@ -0,0 +1,309 @@ +// 
Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Utilities for dealing with protocol buffers. +// These are mostly just functions similar to what are found in the protobuf +// library itself, but using kudu::faststring instances instead of STL strings. +#ifndef KUDU_UTIL_PB_UTIL_H +#define KUDU_UTIL_PB_UTIL_H + +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/faststring.h" + +namespace google { +namespace protobuf { +class FileDescriptor; +class FileDescriptorSet; +class MessageLite; +class Message; +} +} + +namespace kudu { + +class Env; +class RandomAccessFile; +class SequentialFile; +class Slice; +class Status; +class WritableFile; + +namespace pb_util { + +using google::protobuf::MessageLite; + +enum SyncMode { + SYNC, + NO_SYNC +}; + +enum CreateMode { + OVERWRITE, + NO_OVERWRITE +}; + +// See MessageLite::AppendToString +bool AppendToString(const MessageLite &msg, faststring *output); + +// See MessageLite::AppendPartialToString +bool AppendPartialToString(const MessageLite &msg, faststring *output); + +// See MessageLite::SerializeToString. 
+bool SerializeToString(const MessageLite &msg, faststring *output); + +// See MessageLite::ParseFromZeroCopyStream +// TODO: change this to return Status - differentiate IO error from bad PB +bool ParseFromSequentialFile(MessageLite *msg, SequentialFile *rfile); + +// Similar to MessageLite::ParseFromArray, with the difference that it returns +// Status::Corruption() if the message could not be parsed. +Status ParseFromArray(MessageLite* msg, const uint8_t* data, uint32_t length); + +// Load a protobuf from the given path. +Status ReadPBFromPath(Env* env, const std::string& path, MessageLite* msg); + +// Serialize a protobuf to the given path. +// +// If SyncMode SYNC is provided, ensures the changes are made durable. +Status WritePBToPath(Env* env, const std::string& path, const MessageLite& msg, SyncMode sync); + +// Truncate any 'bytes' or 'string' fields of this message to max_len. +// The text "" is appended to any such truncated fields. +void TruncateFields(google::protobuf::Message* message, int max_len); + +// A protobuf "container" has the following format (all integers in +// little-endian byte order): +// +// +// +// magic number: 8 byte string identifying the file format. +// +// Included so that we have a minimal guarantee that this file is +// of the type we expect and that we are not just reading garbage. +// +// container_version: 4 byte unsigned integer indicating the "version" of the +// container format. Must be set to 1 at this time. +// +// Included so that this file format may be extended at some +// later date while maintaining backwards compatibility. +// +// +// The remaining container fields are repeated (in a group) for each protobuf message. +// +// +// data size: 4 byte unsigned integer indicating the size of the encoded data. +// +// Included because PB messages aren't self-delimiting, and thus +// writing a stream of messages to the same file requires +// delimiting each with its size. 
+// +// See https://developers.google.com/protocol-buffers/docs/techniques?hl=zh-cn#streaming +// for more details. +// +// data: "size" bytes of protobuf data encoded according to the schema. +// +// Our payload. +// +// checksum: 4 byte unsigned integer containing the CRC32C checksum of "data". +// +// Included to ensure validity of the data on-disk. +// +// Every container must have at least one protobuf message: the +// supplemental header. It includes additional container-level information. +// See pb_util.proto for details. As a containerized PB message, the header +// is protected by a CRC32C checksum like any other message. +// +// +// It is worth describing the kinds of errors that can be detected by the +// protobuf container and the kinds that cannot. +// +// The checksums in the container are independent, not rolling. As such, +// they won't detect the disappearance or reordering of entire protobuf +// messages, which can happen if a range of the file is collapsed (see +// man fallocate(2)) or if the file is otherwise manually manipulated. +// Moreover, the checksums do not protect against corruption in the data +// size fields, though that is mitigated by validating each data size +// against the remaining number of bytes in the container. +// +// Additionally, the container does not include footers or periodic +// checkpoints. As such, it will not detect if entire protobuf messages +// are truncated. +// +// That said, all corruption or truncation of the magic number or the +// container version will be detected, as will most corruption/truncation +// of the data size, data, and checksum (subject to CRC32 limitations). +// +// These tradeoffs in error detection are reasonable given the failure +// environment that Kudu operates within. 
We tolerate failures such as +// "kill -9" of the Kudu process, machine power loss, or fsync/fdatasync +// failure, but not failures like runaway processes mangling data files +// in arbitrary ways or attackers crafting malicious data files. +// +// The one kind of failure that clients must handle is truncation of entire +// protobuf messages (see above). The protobuf container will not detect +// these failures, so clients must tolerate them in some way. +// +// For further reading on what files might look like following a normal +// filesystem failure, see: +// +// https://www.usenix.org/system/files/conference/osdi14/osdi14-paper-pillai.pdf + +// Protobuf container file opened for writing. +// +// Can be built around an existing file or a completely new file. +// +// Not thread-safe. +class WritablePBContainerFile { + public: + + // Initializes the class instance; writer must be open. + explicit WritablePBContainerFile(gscoped_ptr writer); + + // Closes the container if not already closed. + ~WritablePBContainerFile(); + + // Writes the header information to the container. + // + // 'msg' need not be populated; its type is used to "lock" the container + // to a particular protobuf message type in Append(). + Status Init(const google::protobuf::Message& msg); + + // Writes a protobuf message to the container, beginning with its size + // and ending with its CRC32 checksum. + Status Append(const google::protobuf::Message& msg); + + // Asynchronously flushes all dirty container data to the filesystem. + Status Flush(); + + // Synchronizes all dirty container data to the filesystem. + // + // Note: the parent directory is _not_ synchronized. Because the + // container file was provided during construction, we don't know whether + // it was created or reopened, and parent directory synchronization is + // only needed in the former case. + Status Sync(); + + // Closes the container. 
+ Status Close(); + + private: + FRIEND_TEST(TestPBUtil, TestPopulateDescriptorSet); + + // Write the protobuf schemas belonging to 'desc' and all of its + // dependencies to 'output'. + // + // Schemas are written in dependency order (i.e. if A depends on B which + // depends on C, the order is C, B, A). + static void PopulateDescriptorSet(const google::protobuf::FileDescriptor* desc, + google::protobuf::FileDescriptorSet* output); + + // Serialize the contents of 'msg' into 'buf' along with additional metadata + // to aid in deserialization. + Status AppendMsgToBuffer(const google::protobuf::Message& msg, faststring* buf); + + bool closed_; + + gscoped_ptr writer_; +}; + +// Protobuf container file opened for reading. +// +// Can be built around a file with existing contents or an empty file (in +// which case it's safe to interleave with WritablePBContainerFile). +class ReadablePBContainerFile { + public: + + // Initializes the class instance; reader must be open. + explicit ReadablePBContainerFile(gscoped_ptr reader); + + // Closes the file if not already closed. + ~ReadablePBContainerFile(); + + // Reads the header information from the container and validates it. + Status Init(); + + // Reads a protobuf message from the container, validating its size and + // data using a CRC32 checksum. + Status ReadNextPB(google::protobuf::Message* msg); + + // Dumps any unread protobuf messages in the container to 'os'. Each + // message's DebugString() method is invoked to produce its textual form. + // + // If 'oneline' is true, prints each message on a single line. + Status Dump(std::ostream* os, bool oneline); + + // Closes the container. + Status Close(); + + // Expected PB type and schema for each message to be read. + // + // Only valid after a successful call to Init(). 
+ const std::string& pb_type() const { return pb_type_; } + const google::protobuf::FileDescriptorSet* protos() const { + return protos_.get(); + } + + private: + enum EofOK { + EOF_OK, + EOF_NOT_OK + }; + + // Reads exactly 'length' bytes from the container file into 'scratch', + // validating the correctness of the read both before and after and + // returning a slice of the bytes in 'result'. + // + // If 'eofOK' is EOF_OK, an EOF is returned as-is. Otherwise, it is + // considered to be an invalid short read and returned as an error. + Status ValidateAndRead(size_t length, EofOK eofOK, + Slice* result, gscoped_ptr* scratch); + + size_t offset_; + + // The fully-qualified PB type name of the messages in the container. + std::string pb_type_; + + // Wrapped in a gscoped_ptr so that clients need not include PB headers. + gscoped_ptr protos_; + + gscoped_ptr reader_; +}; + +// Convenience functions for protobuf containers holding just one record. + +// Load a "containerized" protobuf from the given path. +// If the file does not exist, returns Status::NotFound(). Otherwise, may +// return other Status error codes such as Status::IOError. +Status ReadPBContainerFromPath(Env* env, const std::string& path, + google::protobuf::Message* msg); + +// Serialize a "containerized" protobuf to the given path. +// +// If create == NO_OVERWRITE and 'path' already exists, the function will fail. +// If sync == SYNC, the newly created file will be fsynced before returning. +Status WritePBContainerToPath(Env* env, const std::string& path, + const google::protobuf::Message& msg, + CreateMode create, + SyncMode sync); + +} // namespace pb_util +} // namespace kudu +#endif diff --git a/src/kudu/util/pb_util.proto b/src/kudu/util/pb_util.proto new file mode 100644 index 000000000000..b7265e295b8e --- /dev/null +++ b/src/kudu/util/pb_util.proto @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +option java_package = "org.kududb"; + +import "google/protobuf/descriptor.proto"; + +// ============================================================================ +// Protobuf container metadata +// ============================================================================ + +// Supplemental protobuf container header, after the main header (see +// pb_util.h for details). +message ContainerSupHeaderPB { + // The protobuf schema for the messages expected in this container. + // + // This schema is complete, that is, it includes all of its dependencies + // (i.e. other schemas defined in .proto files imported by this schema's + // .proto file). + required google.protobuf.FileDescriptorSet protos = 1; + + // The PB message type expected in each data entry in this container. Must + // be fully qualified (i.e. kudu.tablet.TabletSuperBlockPB). + required string pb_type = 2; +} diff --git a/src/kudu/util/promise.h b/src/kudu/util/promise.h new file mode 100644 index 000000000000..17f8cecc4fae --- /dev/null +++ b/src/kudu/util/promise.h @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_PROMISE_H +#define KUDU_UTIL_PROMISE_H + +#include "kudu/gutil/macros.h" +#include "kudu/util/countdown_latch.h" + +namespace kudu { + +// A promise boxes a value which is to be provided at some time in the future. +// A single producer calls Set(...), and any number of consumers can call Get() +// to retrieve the produced value. +// +// In Guava terms, this is a SettableFuture. +template +class Promise { + public: + Promise() : latch_(1) {} + ~Promise() {} + + // Reset the promise to be used again. + // For this to be safe, there must be some kind of external synchronization + // ensuring that no threads are still accessing the value from the previous + // incarnation of the promise. + void Reset() { + latch_.Reset(1); + val_ = T(); + } + + // Block until a value is available, and return a reference to it. + const T& Get() const { + latch_.Wait(); + return val_; + } + + // Wait for the promised value to become available with the given timeout. + // + // Returns NULL if the timeout elapses before a value is available. + // Otherwise returns a pointer to the value. This pointer's lifetime is + // tied to the lifetime of the Promise object. 
+ const T* WaitFor(const MonoDelta& delta) const { + if (latch_.WaitFor(delta)) { + return &val_; + } else { + return NULL; + } + } + + // Set the value of this promise. + // This may be called at most once. + void Set(const T& val) { + DCHECK_EQ(latch_.count(), 1) << "Already set!"; + val_ = val; + latch_.CountDown(); + } + + private: + CountDownLatch latch_; + T val_; + DISALLOW_COPY_AND_ASSIGN(Promise); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_PROMISE_H */ diff --git a/src/kudu/util/proto_container_test.proto b/src/kudu/util/proto_container_test.proto new file mode 100644 index 000000000000..663870659ff2 --- /dev/null +++ b/src/kudu/util/proto_container_test.proto @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +// Arbitrary protobuf to test writing a containerized protobuf. 
+message ProtoContainerTestPB { + required string name = 1; + required int32 value = 2; + optional string note = 3; +} diff --git a/src/kudu/util/proto_container_test2.proto b/src/kudu/util/proto_container_test2.proto new file mode 100644 index 000000000000..bcf7ed150002 --- /dev/null +++ b/src/kudu/util/proto_container_test2.proto @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +// Dependency chain: +// +// this file --> proto_container_test.proto + +import "kudu/util/proto_container_test.proto"; + +// Arbitrary protobuf that has one PB dependency. +message ProtoContainerTest2PB { + required kudu.ProtoContainerTestPB record = 1; +} diff --git a/src/kudu/util/proto_container_test3.proto b/src/kudu/util/proto_container_test3.proto new file mode 100644 index 000000000000..5cf34dd9a764 --- /dev/null +++ b/src/kudu/util/proto_container_test3.proto @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +// Dependency chain: +// +// this file --> proto_container_test.proto +// --> proto_container_test2.proto --> proto_container_test.proto + +import "kudu/util/proto_container_test.proto"; +import "kudu/util/proto_container_test2.proto"; + +// Arbitrary protobuf has two PB dependencies. +// dependency. +message ProtoContainerTest3PB { + required kudu.ProtoContainerTestPB record_one = 1; + required kudu.ProtoContainerTest2PB record_two = 2; +} diff --git a/src/kudu/util/protobuf-annotations.h b/src/kudu/util/protobuf-annotations.h new file mode 100644 index 000000000000..7fdc9614ffbe --- /dev/null +++ b/src/kudu/util/protobuf-annotations.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Simple header which is inserted into all of our generated protobuf code. +// We use this to hook protobuf code up to TSAN annotations. +#ifndef KUDU_UTIL_PROTOBUF_ANNOTATIONS_H +#define KUDU_UTIL_PROTOBUF_ANNOTATIONS_H + +#include "kudu/gutil/dynamic_annotations.h" + +// The protobuf internal headers are included before this, so we have to undefine +// the empty definitions first. +#undef GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN +#undef GOOGLE_SAFE_CONCURRENT_WRITES_END + +#define GOOGLE_SAFE_CONCURRENT_WRITES_BEGIN ANNOTATE_IGNORE_WRITES_BEGIN +#define GOOGLE_SAFE_CONCURRENT_WRITES_END ANNOTATE_IGNORE_WRITES_END + +#endif /* KUDU_UTIL_PROTOBUF_ANNOTATIONS_H */ diff --git a/src/kudu/util/protobuf_util.h b/src/kudu/util/protobuf_util.h new file mode 100644 index 000000000000..cc88eda8f76e --- /dev/null +++ b/src/kudu/util/protobuf_util.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_PROTOBUF_UTIL_H +#define KUDU_UTIL_PROTOBUF_UTIL_H + +#include + +namespace kudu { + +bool AppendPBToString(const google::protobuf::MessageLite &msg, faststring *output) { + int old_size = output->size(); + int byte_size = msg.ByteSize(); + output->resize(old_size + byte_size); + uint8* start = reinterpret_cast(output->data() + old_size); + uint8* end = msg.SerializeWithCachedSizesToArray(start); + CHECK(end - start == byte_size) + << "Error in serialization. byte_size=" << byte_size + << " new ByteSize()=" << msg.ByteSize() + << " end-start=" << (end-start); + return true; +} + +} // namespace kudu + +#endif diff --git a/src/kudu/util/protoc-gen-insertions.cc b/src/kudu/util/protoc-gen-insertions.cc new file mode 100644 index 000000000000..d8769aa3cc14 --- /dev/null +++ b/src/kudu/util/protoc-gen-insertions.cc @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Simple protoc plugin which inserts some code at the top of each generated protobuf. +// Currently, this just adds an include of protobuf-annotations.h, a file which hooks up +// the protobuf concurrency annotations to our TSAN annotations. 
+#include +#include +#include +#include +#include +#include +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/strings/strip.h" +#include "kudu/gutil/strings/substitute.h" + +using google::protobuf::io::ZeroCopyOutputStream; +using google::protobuf::io::Printer; + +namespace kudu { + +static const char* const kIncludeToInsert = "#include \"kudu/util/protobuf-annotations.h\"\n"; +static const char* const kProtoExtension = ".proto"; + +class InsertAnnotations : public ::google::protobuf::compiler::CodeGenerator { + virtual bool Generate(const google::protobuf::FileDescriptor *file, + const std::string &/*param*/, + google::protobuf::compiler::GeneratorContext *gen_context, + std::string *error) const OVERRIDE { + + // Determine the file name we will substitute into. + string path_no_extension; + if (!TryStripSuffixString(file->name(), kProtoExtension, &path_no_extension)) { + *error = strings::Substitute("file name $0 did not end in $1", file->name(), kProtoExtension); + return false; + } + string pb_file = path_no_extension + ".pb.cc"; + + // Actually insert the new #include + gscoped_ptr inserter(gen_context->OpenForInsert(pb_file, "includes")); + Printer printer(inserter.get(), '$'); + printer.Print(kIncludeToInsert); + + if (printer.failed()) { + *error = "Failed to print to output file"; + return false; + } + + return true; + } +}; + +} // namespace kudu + +int main(int argc, char *argv[]) { + kudu::InsertAnnotations generator; + return google::protobuf::compiler::PluginMain(argc, argv, &generator); +} diff --git a/src/kudu/util/pstack_watcher-test.cc b/src/kudu/util/pstack_watcher-test.cc new file mode 100644 index 000000000000..652fec2974fd --- /dev/null +++ b/src/kudu/util/pstack_watcher-test.cc @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/pstack_watcher.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/bitmap.h" +#include "kudu/util/env.h" +#include "kudu/util/errno.h" +#include "kudu/util/test_macros.h" + +using std::shared_ptr; +using std::string; +using strings::Substitute; + +namespace kudu { + +TEST(TestPstackWatcher, TestPstackWatcherCancellation) { + PstackWatcher watcher(MonoDelta::FromSeconds(1000000)); + watcher.Shutdown(); +} + +TEST(TestPstackWatcher, TestWait) { + PstackWatcher watcher(MonoDelta::FromMilliseconds(10)); + watcher.Wait(); +} + +TEST(TestPstackWatcher, TestDumpStacks) { + ASSERT_OK(PstackWatcher::DumpStacks()); +} + +static shared_ptr RedirectStdout(string *temp_path) { + string temp_dir; + CHECK_OK(Env::Default()->GetTestDirectory(&temp_dir)); + *temp_path = Substitute("$0/pstack_watcher-dump.$1.txt", + temp_dir, getpid()); + return shared_ptr( + freopen(temp_path->c_str(), "w", stdout), fclose); +} + +TEST(TestPstackWatcher, TestPstackWatcherRunning) { + string stdout_file; + int old_stdout; + CHECK_ERR(old_stdout = dup(STDOUT_FILENO)); + { + shared_ptr out_fp = RedirectStdout(&stdout_file); + PCHECK(out_fp.get()); + PstackWatcher watcher(MonoDelta::FromMilliseconds(500)); + while (watcher.IsRunning()) { + SleepFor(MonoDelta::FromMilliseconds(1)); + } + } + CHECK_ERR(dup2(old_stdout, 
STDOUT_FILENO)); + PCHECK(stdout = fdopen(STDOUT_FILENO, "w")); + + faststring contents; + CHECK_OK(ReadFileToString(Env::Default(), stdout_file, &contents)); + ASSERT_STR_CONTAINS(contents.ToString(), "BEGIN STACKS"); + CHECK_ERR(unlink(stdout_file.c_str())); + ASSERT_GE(fprintf(stdout, "%s\n", contents.ToString().c_str()), 0) + << "errno=" << errno << ": " << ErrnoToString(errno); +} + +} // namespace kudu diff --git a/src/kudu/util/pstack_watcher.cc b/src/kudu/util/pstack_watcher.cc new file mode 100644 index 000000000000..c769f0780f46 --- /dev/null +++ b/src/kudu/util/pstack_watcher.cc @@ -0,0 +1,189 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/pstack_watcher.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/env.h" +#include "kudu/util/errno.h" +#include "kudu/util/status.h" +#include "kudu/util/subprocess.h" + +namespace kudu { + +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Substitute; + +PstackWatcher::PstackWatcher(MonoDelta timeout) + : timeout_(std::move(timeout)), running_(true), cond_(&lock_) { + CHECK_OK(Thread::Create("pstack_watcher", "pstack_watcher", + boost::bind(&PstackWatcher::Run, this), &thread_)); +} + +PstackWatcher::~PstackWatcher() { + Shutdown(); +} + +void PstackWatcher::Shutdown() { + { + MutexLock guard(lock_); + running_ = false; + cond_.Broadcast(); + } + if (thread_) { + CHECK_OK(ThreadJoiner(thread_.get()).Join()); + thread_.reset(); + } +} + +bool PstackWatcher::IsRunning() const { + MutexLock guard(lock_); + return running_; +} + +void PstackWatcher::Wait() const { + MutexLock lock(lock_); + while (running_) { + cond_.Wait(); + } +} + +void PstackWatcher::Run() { + MutexLock guard(lock_); + if (!running_) return; + cond_.TimedWait(timeout_); + if (!running_) return; + + WARN_NOT_OK(DumpStacks(DUMP_FULL), "Unable to print pstack from watcher"); + running_ = false; + cond_.Broadcast(); +} + +Status PstackWatcher::HasProgram(const char* progname) { + string which("which"); + vector argv; + argv.push_back(which); + argv.push_back(progname); + Subprocess proc(which, argv); + proc.DisableStderr(); + proc.DisableStdout(); + RETURN_NOT_OK_PREPEND(proc.Start(), + Substitute("HasProgram($0): error running 'which'", progname)); + int wait_status = 0; + RETURN_NOT_OK(proc.Wait(&wait_status)); + if ((WIFEXITED(wait_status)) && (0 == WEXITSTATUS(wait_status))) { + return Status::OK(); + } + return Status::NotFound(Substitute("can't find $0: exited?=$1, status=$2", + progname, + static_cast(WIFEXITED(wait_status)), + WEXITSTATUS(wait_status))); +} + 
+Status PstackWatcher::DumpStacks(int flags) { + return DumpPidStacks(getpid(), flags); +} + +Status PstackWatcher::DumpPidStacks(pid_t pid, int flags) { + + // Prefer GDB if available; it gives us line numbers and thread names. + if (HasProgram("gdb").ok()) { + return RunGdbStackDump(pid, flags); + } + + // Otherwise, try to use pstack or gstack. + const char *progname = nullptr; + if (HasProgram("pstack").ok()) { + progname = "pstack"; + } else if (HasProgram("gstack").ok()) { + progname = "gstack"; + } + + if (!progname) { + return Status::ServiceUnavailable("Neither gdb, pstack, nor gstack appears to be installed."); + } + return RunPstack(progname, pid); +} + +Status PstackWatcher::RunGdbStackDump(pid_t pid, int flags) { + // Command: gdb -quiet -batch -nx -ex cmd1 -ex cmd2 /proc/$PID/exe $PID + string prog("gdb"); + vector argv; + argv.push_back(prog); + argv.push_back("-quiet"); + argv.push_back("-batch"); + argv.push_back("-nx"); + argv.push_back("-ex"); + argv.push_back("set print pretty on"); + argv.push_back("-ex"); + argv.push_back("info threads"); + argv.push_back("-ex"); + argv.push_back("thread apply all bt"); + if (flags & DUMP_FULL) { + argv.push_back("-ex"); + argv.push_back("thread apply all bt full"); + } + string executable; + Env* env = Env::Default(); + RETURN_NOT_OK(env->GetExecutablePath(&executable)); + argv.push_back(executable); + argv.push_back(Substitute("$0", pid)); + return RunStackDump(prog, argv); +} + +Status PstackWatcher::RunPstack(const std::string& progname, pid_t pid) { + string prog(progname); + string pid_string(Substitute("$0", pid)); + vector argv; + argv.push_back(prog); + argv.push_back(pid_string); + return RunStackDump(prog, argv); +} + +Status PstackWatcher::RunStackDump(const string& prog, const vector& argv) { + printf("************************ BEGIN STACKS **************************\n"); + if (fflush(stdout) == EOF) { + return Status::IOError("Unable to flush stdout", ErrnoToString(errno), errno); + } + Subprocess 
pstack_proc(prog, argv); + RETURN_NOT_OK_PREPEND(pstack_proc.Start(), "RunStackDump proc.Start() failed"); + if (::close(pstack_proc.ReleaseChildStdinFd()) == -1) { + return Status::IOError("Unable to close child stdin", ErrnoToString(errno), errno); + } + int ret; + RETURN_NOT_OK_PREPEND(pstack_proc.Wait(&ret), "RunStackDump proc.Wait() failed"); + if (ret == -1) { + return Status::RuntimeError("RunStackDump proc.Wait() error", ErrnoToString(errno), errno); + } + printf("************************* END STACKS ***************************\n"); + if (fflush(stdout) == EOF) { + return Status::IOError("Unable to flush stdout", ErrnoToString(errno), errno); + } + + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/pstack_watcher.h b/src/kudu/util/pstack_watcher.h new file mode 100644 index 000000000000..82390dd36c42 --- /dev/null +++ b/src/kudu/util/pstack_watcher.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_UTIL_PSTACK_WATCHER_H +#define KUDU_UTIL_PSTACK_WATCHER_H + +#include +#include + +#include "kudu/util/condition_variable.h" +#include "kudu/util/monotime.h" +#include "kudu/util/mutex.h" +#include "kudu/util/status.h" +#include "kudu/util/thread.h" + +namespace kudu { + +// PstackWatcher is an object which will pstack the current process and print +// the results to stdout. It does this after a certain timeout has occured. +class PstackWatcher { + public: + + enum Flags { + NO_FLAGS = 0, + + // Run 'thread apply all bt full', which is very verbose output + DUMP_FULL = 1 + }; + + // Static method to collect and write stack dump output to stdout of the current + // process. + static Status DumpStacks(int flags = NO_FLAGS); + + // Like the above but for any process, not just the current one. + static Status DumpPidStacks(pid_t pid, int flags = NO_FLAGS); + + // Instantiate a watcher that writes a pstack to stdout after the given + // timeout expires. + explicit PstackWatcher(MonoDelta timeout); + + ~PstackWatcher(); + + // Shut down the watcher and do not log a pstack. + // This method is not thread-safe. + void Shutdown(); + + // Test whether the watcher is still running or has shut down. + // Thread-safe. + bool IsRunning() const; + + // Wait until the timeout expires and the watcher logs a pstack. + // Thread-safe. + void Wait() const; + + private: + // Test for the existence of the given program in the system path. + static Status HasProgram(const char* progname); + + // Get a stack dump using GDB directly. + static Status RunGdbStackDump(pid_t pid, int flags); + + // Get a stack dump using the pstack or gstack program. + static Status RunPstack(const std::string& progname, pid_t pid); + + // Invoke and wait for the stack dump program. + static Status RunStackDump(const std::string& prog, const std::vector& argv); + + // Run the thread that waits for the specified duration before logging a + // pstack. 
+ void Run(); + + const MonoDelta timeout_; + bool running_; + scoped_refptr thread_; + mutable Mutex lock_; + mutable ConditionVariable cond_; +}; + +} // namespace kudu +#endif diff --git a/src/kudu/util/random-test.cc b/src/kudu/util/random-test.cc new file mode 100644 index 000000000000..54a9c66c0474 --- /dev/null +++ b/src/kudu/util/random-test.cc @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class RandomTest : public KuduTest { + public: + RandomTest() + : rng_(SeedRandom()) { + } + + protected: + Random rng_; +}; + +// Tests that after a certain number of invocations of Normal(), the +// actual mean of all samples is within the specified standard +// deviation of the target mean. 
+TEST_F(RandomTest, TestNormalDist) { + const double kMean = 5.0; + const double kStdDev = 0.01; + const int kNumIters = 100000; + + double sum = 0.0; + for (int i = 0; i < kNumIters; ++i) { + sum += rng_.Normal(kMean, kStdDev); + } + + ASSERT_LE(fabs((sum / static_cast(kNumIters)) - kMean), kStdDev); +} + +// Tests that after a large number of invocations of Next32() and Next64(), we +// have flipped all the bits we claim we should have. +// +// This is a regression test for a bug where we were incorrectly bit-shifting +// in Next64(). +// +// Note: Our RNG actually only generates 31 bits of randomness for 32 bit +// integers and 62 bits for 64 bit integers. So this test reflects that, and if +// we change the RNG algo this test should also change. +TEST_F(RandomTest, TestUseOfBits) { + uint32_t ones32 = std::numeric_limits::max(); + uint32_t zeroes32 = 0; + uint64_t ones64 = std::numeric_limits::max(); + uint64_t zeroes64 = 0; + + for (int i = 0; i < 10000000; i++) { + uint32_t r32 = rng_.Next32(); + ones32 &= r32; + zeroes32 |= r32; + + uint64_t r64 = rng_.Next64(); + ones64 &= r64; + zeroes64 |= r64; + } + + // At the end, we should have flipped 31 and 62 bits, respectively. One + // detail of the current RNG impl is that Next32() always returns a number + // with MSB set to 0, and Next64() always returns a number with the first + // two bits set to zero. + uint32_t expected_bits_31 = std::numeric_limits::max() >> 1; + uint64_t expected_bits_62 = std::numeric_limits::max() >> 2; + + ASSERT_EQ(0, ones32); + ASSERT_EQ(expected_bits_31, zeroes32); + ASSERT_EQ(0, ones64); + ASSERT_EQ(expected_bits_62, zeroes64); +} + +TEST_F(RandomTest, TestResetSeed) { + rng_.Reset(1); + uint64_t first = rng_.Next64(); + rng_.Reset(1); + uint64_t second = rng_.Next64(); + ASSERT_EQ(first, second); +} + +TEST_F(RandomTest, TestReservoirSample) { + // Use a constant seed to avoid flakiness. 
+ rng_.Reset(12345); + + vector population; + for (int i = 0; i < 100; i++) { + population.push_back(i); + } + + // Run 1000 trials selecting 5 elements. + vector results; + vector counts(population.size()); + std::unordered_set avoid; + for (int trial = 0; trial < 1000; trial++) { + rng_.ReservoirSample(population, 5, avoid, &results); + for (int result : results) { + counts[result]++; + } + } + + // We expect each element to be selected + // 50 times on average, but since it's random, it won't be exact. + // However, since we use a constant seed, this test won't be flaky. + for (int count : counts) { + ASSERT_GE(count, 25); + ASSERT_LE(count, 75); + } + + // Run again, but avoid some particular entries. + avoid.insert(3); + avoid.insert(10); + avoid.insert(20); + counts.assign(100, 0); + for (int trial = 0; trial < 1000; trial++) { + rng_.ReservoirSample(population, 5, avoid, &results); + for (int result : results) { + counts[result]++; + } + } + + // Ensure that we didn't ever pick the avoided elements. + ASSERT_EQ(0, counts[3]); + ASSERT_EQ(0, counts[10]); + ASSERT_EQ(0, counts[20]); +} + +TEST_F(RandomTest, TestReservoirSamplePopulationTooSmall) { + vector population; + for (int i = 0; i < 10; i++) { + population.push_back(i); + } + + vector results; + std::unordered_set avoid; + rng_.ReservoirSample(population, 20, avoid, &results); + ASSERT_EQ(population.size(), results.size()); + ASSERT_EQ(population, results); + + rng_.ReservoirSample(population, 10, avoid, &results); + ASSERT_EQ(population.size(), results.size()); + ASSERT_EQ(population, results); +} + +} // namespace kudu diff --git a/src/kudu/util/random.h b/src/kudu/util/random.h new file mode 100644 index 000000000000..b4f0a46447f1 --- /dev/null +++ b/src/kudu/util/random.h @@ -0,0 +1,232 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#ifndef KUDU_UTIL_RANDOM_H_ +#define KUDU_UTIL_RANDOM_H_ + +#include + +#include +#include + +#include "kudu/gutil/map-util.h" +#include "kudu/util/locks.h" + +namespace kudu { + +namespace random_internal { + +static const uint32_t M = 2147483647L; // 2^31-1 +const double kTwoPi = 6.283185307179586476925286; + +} // namespace random_internal + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. This implementation is not thread-safe. +class Random { + private: + uint32_t seed_; + public: + explicit Random(uint32_t s) { + Reset(s); + } + + // Reset the RNG to the given seed value. + void Reset(uint32_t s) { + seed_ = s & 0x7fffffffu; + // Avoid bad seeds. + if (seed_ == 0 || seed_ == random_internal::M) { + seed_ = 1; + } + } + + // Next pseudo-random 32-bit unsigned integer. + // FIXME: This currently only generates 31 bits of randomness. + // The MSB will always be zero. + uint32_t Next() { + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast((product >> 31) + (product & random_internal::M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > random_internal::M) { + seed_ -= random_internal::M; + } + return seed_; + } + + // Alias for consistency with Next64 + uint32_t Next32() { return Next(); } + + // Next pseudo-random 64-bit unsigned integer. 
+ // FIXME: This currently only generates 62 bits of randomness due to Next() + // only giving 31 bits of randomness. The 2 most significant bits will always + // be zero. + uint64_t Next64() { + uint64_t large = Next(); + // Only shift by 31 bits so we end up with zeros in MSB and not scattered + // throughout the 64-bit word. This is due to the weakness in Next() noted + // above. + large <<= 31; + large |= Next(); + return large; + } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(uint32_t n) { return Next() % n; } + + // Alias for consistency with Uniform64 + uint32_t Uniform32(uint32_t n) { return Uniform(n); } + + // Returns a uniformly distributed 64-bit value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform64(uint64_t n) { return Next64() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } + + // Creates a normal distribution variable using the + // Box-Muller transform. See: + // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform + // Adapted from WebRTC source code at: + // webrtc/trunk/modules/video_coding/main/test/test_util.cc + double Normal(double mean, double std_dev) { + double uniform1 = (Next() + 1.0) / (random_internal::M + 1.0); + double uniform2 = (Next() + 1.0) / (random_internal::M + 1.0); + return (mean + std_dev * sqrt(-2 * ::log(uniform1)) * cos(random_internal::kTwoPi * uniform2)); + } + + // Return a random number between 0.0 and 1.0 inclusive. 
+ double NextDoubleFraction() { + return Next() / static_cast(random_internal::M + 1.0); + } + + // Sample 'k' random elements from the collection 'c' into 'result', taking care not to sample any + // elements that are already present in 'avoid'. + // + // In the case that 'c' has fewer than 'k' elements then all elements in 'c' will be selected. + // + // 'c' should be an iterable STL collection such as a vector, set, or list. + // 'avoid' should be an STL-compatible set. + // + // The results are not stored in a randomized order: the order of results will + // match their order in the input collection. + template + void ReservoirSample(const Collection& c, int k, const Set& avoid, + std::vector* result) { + result->clear(); + result->reserve(k); + int i = 0; + for (const T& elem : c) { + if (ContainsKey(avoid, elem)) { + continue; + } + i++; + // Fill the reservoir if there is available space. + if (result->size() < k) { + result->push_back(elem); + continue; + } + // Otherwise replace existing elements with decreasing probability. + int j = Uniform(i); + if (j < k) { + (*result)[j] = elem; + } + } + } +}; + +// Thread-safe wrapper around Random. 
+class ThreadSafeRandom { + public: + explicit ThreadSafeRandom(uint32_t s) + : random_(s) { + } + + void Reset(uint32_t s) { + lock_guard l(&lock_); + random_.Reset(s); + } + + uint32_t Next() { + lock_guard l(&lock_); + return random_.Next(); + } + + uint32_t Next32() { + lock_guard l(&lock_); + return random_.Next32(); + } + + uint64_t Next64() { + lock_guard l(&lock_); + return random_.Next64(); + } + + uint32_t Uniform(uint32_t n) { + lock_guard l(&lock_); + return random_.Uniform(n); + } + + uint32_t Uniform32(uint32_t n) { + lock_guard l(&lock_); + return random_.Uniform32(n); + } + + uint64_t Uniform64(uint64_t n) { + lock_guard l(&lock_); + return random_.Uniform64(n); + } + + bool OneIn(int n) { + lock_guard l(&lock_); + return random_.OneIn(n); + } + + uint32_t Skewed(int max_log) { + lock_guard l(&lock_); + return random_.Skewed(max_log); + } + + double Normal(double mean, double std_dev) { + lock_guard l(&lock_); + return random_.Normal(mean, std_dev); + } + + template + void ReservoirSample(const Collection& c, int k, const Set& avoid, + std::vector* result) { + lock_guard l(&lock_); + random_.ReservoirSample(c, k, avoid, result); + } + + private: + simple_spinlock lock_; + Random random_; +}; + + + +} // namespace kudu + +#endif // KUDU_UTIL_RANDOM_H_ diff --git a/src/kudu/util/random_util-test.cc b/src/kudu/util/random_util-test.cc new file mode 100644 index 000000000000..f3eb7d5176b6 --- /dev/null +++ b/src/kudu/util/random_util-test.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/random_util.h" + +#include +#include +#include + +#include "kudu/util/random.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +class RandomUtilTest : public KuduTest { + protected: + RandomUtilTest() : rng_(SeedRandom()) {} + + Random rng_; + + static const int kLenMax = 100; + static const int kNumTrials = 100; +}; + +namespace { + +// Checks string defined at start is set to \0 everywhere but [from, to) +void CheckEmpty(char* start, int from, int to, int stop) { + DCHECK_LE(0, from); + DCHECK_LE(from, to); + DCHECK_LE(to, stop); + for (int j = 0; (j == from ? 
j = to : j) < stop; ++j) { + CHECK_EQ(start[j], '\0') << "Index " << j << " not null after defining" + << "indices [" << from << "," << to << ") of " + << "a nulled string [0," << stop << ")."; + } +} + +} // anonymous namespace + +// Makes sure that RandomString only writes the specified amount +TEST_F(RandomUtilTest, TestRandomString) { + char start[kLenMax]; + + for (int i = 0; i < kNumTrials; ++i) { + memset(start, '\0', kLenMax); + int to = rng_.Uniform(kLenMax + 1); + int from = rng_.Uniform(to + 1); + RandomString(start + from, to - from, &rng_); + CheckEmpty(start, from, to, kLenMax); + } + + // Corner case + memset(start, '\0', kLenMax); + RandomString(start, 0, &rng_); + CheckEmpty(start, 0, 0, kLenMax); +} + +} // namespace kudu diff --git a/src/kudu/util/random_util.cc b/src/kudu/util/random_util.cc new file mode 100644 index 000000000000..21a4144edf34 --- /dev/null +++ b/src/kudu/util/random_util.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/random_util.h" + +#include +#include +#include +#include +#include + +#include "kudu/util/env.h" +#include "kudu/util/random.h" +#include "kudu/gutil/walltime.h" + +namespace kudu { + +void RandomString(void* dest, size_t n, Random* rng) { + size_t i = 0; + uint32_t random = rng->Next(); + char* cdest = static_cast(dest); + static const size_t sz = sizeof(random); + if (n >= sz) { + for (i = 0; i <= n - sz; i += sz) { + memcpy(&cdest[i], &random, sizeof(random)); + random = rng->Next(); + } + } + memcpy(cdest + i, &random, n - i); +} + +uint32_t GetRandomSeed32() { + uint32_t seed = static_cast(GetCurrentTimeMicros()); + seed *= getpid(); + seed *= Env::Default()->gettid(); + return seed; +} + +} // namespace kudu diff --git a/src/kudu/util/random_util.h b/src/kudu/util/random_util.h new file mode 100644 index 000000000000..e286bbe999b7 --- /dev/null +++ b/src/kudu/util/random_util.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_RANDOM_UTIL_H +#define KUDU_UTIL_RANDOM_UTIL_H + +#include +#include + +namespace kudu { + +class Random; + +// Writes exactly n random bytes to dest using the parameter Random generator. 
+// Note RandomString() does not null-terminate its strings, though '\0' could +// be written to dest with the same probability as any other byte. +void RandomString(void* dest, size_t n, Random* rng); + +// Generate a 32-bit random seed from several sources, including timestamp, +// pid & tid. +uint32_t GetRandomSeed32(); + +} // namespace kudu + +#endif // KUDU_UTIL_RANDOM_UTIL_H diff --git a/src/kudu/util/resettable_heartbeater-test.cc b/src/kudu/util/resettable_heartbeater-test.cc new file mode 100644 index 000000000000..aeb67d7e8e3f --- /dev/null +++ b/src/kudu/util/resettable_heartbeater-test.cc @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/resettable_heartbeater.h" + +#include +#include +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/locks.h" +#include "kudu/util/monotime.h" +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +namespace kudu { + +// Number of heartbeats we want to observe before allowing the test to end. 
+static const int kNumHeartbeats = 2; + +class ResettableHeartbeaterTest : public KuduTest { + public: + ResettableHeartbeaterTest() + : KuduTest(), + latch_(kNumHeartbeats) { + } + + protected: + void CreateHeartbeater(uint64_t period_ms, const std::string& name) { + period_ms_ = period_ms; + heartbeater_.reset( + new ResettableHeartbeater(name, + MonoDelta::FromMilliseconds(period_ms), + boost::bind(&ResettableHeartbeaterTest::HeartbeatFunction, + this))); + } + + Status HeartbeatFunction() { + latch_.CountDown(); + return Status::OK(); + } + + void WaitForCountDown() { + // Wait a large multiple (in the worst case) of the required time before we + // time out and fail the test. Large to avoid test flakiness. + const uint64_t kMaxWaitMillis = period_ms_ * kNumHeartbeats * 20; + CHECK(latch_.WaitFor(MonoDelta::FromMilliseconds(kMaxWaitMillis))) + << "Failed to count down " << kNumHeartbeats << " times in " << kMaxWaitMillis + << " ms: latch count == " << latch_.count(); + } + + CountDownLatch latch_; + uint64_t period_ms_; + gscoped_ptr heartbeater_; +}; + +// Tests that if Reset() is not called the heartbeat method is called +// the expected number of times. +TEST_F(ResettableHeartbeaterTest, TestRegularHeartbeats) { + const int64_t kHeartbeatPeriodMillis = 100; // Heartbeat every 100ms. + CreateHeartbeater(kHeartbeatPeriodMillis, CURRENT_TEST_NAME()); + ASSERT_OK(heartbeater_->Start()); + WaitForCountDown(); + ASSERT_OK(heartbeater_->Stop()); +} + +// Tests that if we Reset() the heartbeater in a period smaller than +// the heartbeat period the heartbeat method never gets called. +// After we stop resetting heartbeats should resume as normal +TEST_F(ResettableHeartbeaterTest, TestResetHeartbeats) { + const int64_t kHeartbeatPeriodMillis = 800; // Heartbeat every 800ms. + const int64_t kNumResetSlicesPerPeriod = 40; // Reset 40 times per heartbeat period. + // Reset once every 800ms / 40 = 20ms. 
+ const int64_t kResetPeriodMillis = kHeartbeatPeriodMillis / kNumResetSlicesPerPeriod; + + CreateHeartbeater(kHeartbeatPeriodMillis, CURRENT_TEST_NAME()); + ASSERT_OK(heartbeater_->Start()); + // Call Reset() in a loop for 2 heartbeat periods' worth of time, with sleeps + // in-between as defined above. + for (int i = 0; i < kNumResetSlicesPerPeriod * 2; i++) { + heartbeater_->Reset(); + ASSERT_EQ(kNumHeartbeats, latch_.count()); // Ensure we haven't counted down, yet. + SleepFor(MonoDelta::FromMilliseconds(kResetPeriodMillis)); + } + WaitForCountDown(); + ASSERT_OK(heartbeater_->Stop()); +} + +} // namespace kudu diff --git a/src/kudu/util/resettable_heartbeater.cc b/src/kudu/util/resettable_heartbeater.cc new file mode 100644 index 000000000000..a1c68b95c11f --- /dev/null +++ b/src/kudu/util/resettable_heartbeater.cc @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/util/resettable_heartbeater.h" + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/locks.h" +#include "kudu/util/random.h" +#include "kudu/util/status.h" +#include "kudu/util/thread.h" + +namespace kudu { +using std::string; + +class ResettableHeartbeaterThread { + public: + ResettableHeartbeaterThread(std::string name, MonoDelta period, + HeartbeatFunction function); + + Status Start(); + Status Stop(); + void Reset(); + + private: + void RunThread(); + bool IsCurrentThread() const; + + const string name_; + + // The heartbeat period. + const MonoDelta period_; + + // The function to call to perform the heartbeat + const HeartbeatFunction function_; + + // The actual running thread (NULL before it is started) + scoped_refptr thread_; + + CountDownLatch run_latch_; + + // Whether the heartbeater should shutdown. + bool shutdown_; + + // lock that protects access to 'shutdown_' and to 'run_latch_' + // Reset() method. 
+ mutable simple_spinlock lock_; + DISALLOW_COPY_AND_ASSIGN(ResettableHeartbeaterThread); +}; + +ResettableHeartbeater::ResettableHeartbeater(const std::string& name, + MonoDelta period, + HeartbeatFunction function) + : thread_(new ResettableHeartbeaterThread(name, period, function)) { +} + +Status ResettableHeartbeater::Start() { + return thread_->Start(); +} + +Status ResettableHeartbeater::Stop() { + return thread_->Stop(); +} +void ResettableHeartbeater::Reset() { + thread_->Reset(); +} + +ResettableHeartbeater::~ResettableHeartbeater() { + WARN_NOT_OK(Stop(), "Unable to stop heartbeater thread"); +} + +ResettableHeartbeaterThread::ResettableHeartbeaterThread( + std::string name, MonoDelta period, HeartbeatFunction function) + : name_(std::move(name)), + period_(std::move(period)), + function_(std::move(function)), + run_latch_(0), + shutdown_(false) {} + +void ResettableHeartbeaterThread::RunThread() { + CHECK(IsCurrentThread()); + VLOG(1) << "Heartbeater: " << name_ << " thread starting"; + + bool prev_reset_was_manual = false; + Random rng(random()); + while (true) { + MonoDelta wait_period = period_; + if (prev_reset_was_manual) { + // When the caller does a manual reset, we randomize the subsequent wait + // timeout between period_/2 and period_. This builds in some jitter so + // multiple tablets on the same TS don't end up heartbeating in lockstep. + int64_t half_period_ms = period_.ToMilliseconds() / 2; + wait_period = MonoDelta::FromMilliseconds( + half_period_ms + + rng.NextDoubleFraction() * half_period_ms); + prev_reset_was_manual = false; + } + if (run_latch_.WaitFor(wait_period)) { + // CountDownLatch reached 0 -- this means there was a manual reset. 
+ prev_reset_was_manual = true; + lock_guard lock(&lock_); + // check if we were told to shutdown + if (shutdown_) { + // Latch fired -- exit loop + VLOG(1) << "Heartbeater: " << name_ << " thread finished"; + return; + } else { + // otherwise it's just a reset, reset the latch + // and continue; + run_latch_.Reset(1); + continue; + } + } + + Status s = function_(); + if (!s.ok()) { + LOG(WARNING)<< "Failed to heartbeat in heartbeater: " << name_ + << " Status: " << s.ToString(); + continue; + } + } +} + +bool ResettableHeartbeaterThread::IsCurrentThread() const { + return thread_.get() == kudu::Thread::current_thread(); +} + +Status ResettableHeartbeaterThread::Start() { + CHECK(thread_ == nullptr); + run_latch_.Reset(1); + return kudu::Thread::Create("heartbeater", strings::Substitute("$0-heartbeat", name_), + &ResettableHeartbeaterThread::RunThread, + this, &thread_); +} + +void ResettableHeartbeaterThread::Reset() { + if (!thread_) { + return; + } + run_latch_.CountDown(); +} + +Status ResettableHeartbeaterThread::Stop() { + if (!thread_) { + return Status::OK(); + } + + { + lock_guard l(&lock_); + if (shutdown_) { + return Status::OK(); + } + shutdown_ = true; + } + + run_latch_.CountDown(); + RETURN_NOT_OK(ThreadJoiner(thread_.get()).Join()); + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/resettable_heartbeater.h b/src/kudu/util/resettable_heartbeater.h new file mode 100644 index 000000000000..40bbe2996dd5 --- /dev/null +++ b/src/kudu/util/resettable_heartbeater.h @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_RESETTABLE_HEARTBEATER_H_ +#define KUDU_UTIL_RESETTABLE_HEARTBEATER_H_ + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" + +namespace kudu { +class MonoDelta; +class Status; +class ResettableHeartbeaterThread; + +typedef boost::function HeartbeatFunction; + +// A resettable hearbeater that takes a function and calls +// it to perform a regular heartbeat, unless Reset() is called +// in which case the heartbeater resets the heartbeat period. +// The point is to send "I'm Alive" heartbeats only if no regular +// messages are sent in the same period. +// +// TODO Eventually this should be used instead of the master heartbeater +// as it shares a lot of logic with the exception of the specific master +// stuff (and the fact that it is resettable). +// +// TODO We'll have a lot of these per server, so eventually we need +// to refactor this so that multiple heartbeaters share something like +// java's ScheduledExecutor. +// +// TODO Do something about failed hearbeats, right now this is just +// logging. Probably could take more arguments and do more of an +// exponential backoff. +// +// This class is thread safe. +class ResettableHeartbeater { + public: + ResettableHeartbeater(const std::string& name, + MonoDelta period, + HeartbeatFunction function); + + // Starts the heartbeater + Status Start(); + + // Stops the hearbeater + Status Stop(); + + // Resets the heartbeat period. 
+ // When this is called, the subsequent heartbeat has some built-in jitter and + // may trigger before a full period (as specified to the constructor). + void Reset(); + + ~ResettableHeartbeater(); + private: + gscoped_ptr thread_; + + DISALLOW_COPY_AND_ASSIGN(ResettableHeartbeater); +}; + +} // namespace kudu + +#endif /* KUDU_UTIL_RESETTABLE_HEARTBEATER_H_ */ diff --git a/src/kudu/util/rle-encoding.h b/src/kudu/util/rle-encoding.h new file mode 100644 index 000000000000..48626aae4e67 --- /dev/null +++ b/src/kudu/util/rle-encoding.h @@ -0,0 +1,536 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef IMPALA_RLE_ENCODING_H +#define IMPALA_RLE_ENCODING_H + +#include + +#include "kudu/gutil/port.h" +#include "kudu/util/bit-stream-utils.inline.h" +#include "kudu/util/bit-util.h" + +namespace kudu { + +// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs +// are sufficiently long, RLE is used, otherwise, the values are just bit-packed +// (literal encoding). +// For both types of runs, there is a byte-aligned indicator which encodes the length +// of the run and the type of the run. +// This encoding has the benefit that when there aren't any long enough runs, values +// are always decoded at fixed (can be precomputed) bit offsets OR both the value and +// the run length are byte aligned. This allows for very efficient decoding +// implementations. +// The encoding is: +// encoded-block := run* +// run := literal-run | repeated-run +// literal-run := literal-indicator < literal bytes > +// repeated-run := repeated-indicator < repeated value. padded to byte boundary > +// literal-indicator := varint_encode( number_of_groups << 1 | 1) +// repeated-indicator := varint_encode( number_of_repetitions << 1 ) +// +// Each run is preceded by a varint. The varint's least significant bit is +// used to indicate whether the run is a literal run or a repeated run. The rest +// of the varint is used to determine the length of the run (eg how many times the +// value repeats). +// +// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode +// in groups of 8), so that no matter the bit-width of the value, the sequence will end +// on a byte boundary without padding. +// Given that we know it is a multiple of 8, we store the number of 8-groups rather than +// the actual number of encoded ints. (This means that the total number of encoded values +// can not be determined from the encoded data, since the number of values in the last +// group may not be a multiple of 8). 
+// There is a break-even point when it is more storage efficient to do run length +// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes +// for both the repeated encoding or the literal encoding. This value can always +// be computed based on the bit-width. +// TODO: think about how to use this for strings. The bit packing isn't quite the same. +// +// Examples with bit-width 1 (eg encoding booleans): +// ---------------------------------------- +// 100 1s followed by 100 0s: +// <1, padded to 1 byte> <0, padded to 1 byte> +// - (total 4 bytes) +// +// alternating 1s and 0s (200 total): +// 200 ints = 25 groups of 8 +// <25 bytes of values, bitpacked> +// (total 26 bytes, 1 byte overhead) +// + +// Decoder class for RLE encoded data. +// +// NOTE: the encoded format does not have any length prefix or any other way of +// indicating that the encoded sequence ends at a certain point, so the Decoder +// methods may return some extra bits at the end before the read methods start +// to return 0/false. +template +class RleDecoder { + public: + // Create a decoder object. buffer/buffer_len is the decoded data. + // bit_width is the width of each value (before encoding). + RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) + : bit_reader_(buffer, buffer_len), + bit_width_(bit_width), + current_value_(0), + repeat_count_(0), + literal_count_(0), + rewind_state_(CANT_REWIND) { + DCHECK_GE(bit_width_, 1); + DCHECK_LE(bit_width_, 64); + } + + RleDecoder() {} + + // Skip n values, and returns the number of non-zero entries skipped. + size_t Skip(size_t to_skip); + + // Gets the next value. Returns false if there are no more. + bool Get(T* val); + + // Seek to the previous value. + void RewindOne(); + + // Gets the next run of the same 'val'. Returns 0 if there is no + // more data to be decoded. Will return a run of at most 'max_run' + // values. 
If there are more values than this, the next call to + // GetNextRun will return more from the same run. + size_t GetNextRun(T* val, size_t max_run); + + private: + bool ReadHeader(); + + enum RewindState { + REWIND_LITERAL, + REWIND_RUN, + CANT_REWIND + }; + + BitReader bit_reader_; + int bit_width_; + uint64_t current_value_; + uint32_t repeat_count_; + uint32_t literal_count_; + RewindState rewind_state_; +}; + +// Class to incrementally build the rle data. +// The encoding has two modes: encoding repeated runs and literal runs. +// If the run is sufficiently short, it is more efficient to encode as a literal run. +// This class does so by buffering 8 values at a time. If they are not all the same +// they are added to the literal run. If they are the same, they are added to the +// repeated run. When we switch modes, the previous run is flushed out. +template +class RleEncoder { + public: + // buffer: buffer to write bits to. + // bit_width: max number of bits for value. + // TODO: consider adding a min_repeated_run_length so the caller can control + // when values should be encoded as repeated runs. Currently this is derived + // based on the bit_width, which can determine a storage optimal choice. + explicit RleEncoder(faststring *buffer, int bit_width) + : bit_width_(bit_width), + bit_writer_(buffer) { + DCHECK_GE(bit_width_, 1); + DCHECK_LE(bit_width_, 64); + Clear(); + } + + // Reserve 'num_bytes' bytes for a plain encoded header, set each + // byte with 'val': this is used for the RLE-encoded data blocks in + // order to be able to able to store the initial ordinal position + // and number of elements. This is a part of RleEncoder in order to + // maintain the correct offset in 'buffer'. + void Reserve(int num_bytes, uint8_t val); + + // Encode value. This value must be representable with bit_width_ bits. + void Put(T value, size_t run_length = 1); + + // Flushes any pending values to the underlying buffer. 
+ // Returns the total number of bytes written + int Flush(); + + // Resets all the state in the encoder. + void Clear(); + + int32_t len() const { return bit_writer_.bytes_written(); } + + private: + // Flushes any buffered values. If this is part of a repeated run, this is largely + // a no-op. + // If it is part of a literal run, this will call FlushLiteralRun, which writes + // out the buffered literal values. + // If 'done' is true, the current run would be written even if it would normally + // have been buffered more. This should only be called at the end, when the + // encoder has received all values even if it would normally continue to be + // buffered. + void FlushBufferedValues(bool done); + + // Flushes literal values to the underlying buffer. If update_indicator_byte, + // then the current literal run is complete and the indicator byte is updated. + void FlushLiteralRun(bool update_indicator_byte); + + // Flushes a repeated run to the underlying buffer. + void FlushRepeatedRun(); + + // Number of bits needed to encode the value. + const int bit_width_; + + // Underlying buffer. + BitWriter bit_writer_; + + // We need to buffer at most 8 values for literals. This happens when the + // bit_width is 1 (so 8 values fit in one byte). + // TODO: generalize this to other bit widths + uint64_t buffered_values_[8]; + + // Number of values in buffered_values_ + int num_buffered_values_; + + // The current (also last) value that was written and the count of how + // many times in a row that value has been seen. This is maintained even + // if we are in a literal run. If the repeat_count_ get high enough, we switch + // to encoding repeated runs. + uint64_t current_value_; + int repeat_count_; + + // Number of literals in the current run. This does not include the literals + // that might be in buffered_values_. 
Only after we've got a group big enough + // can we decide if they should part of the literal_count_ or repeat_count_ + int literal_count_; + + // Index of a byte in the underlying buffer that stores the indicator byte. + // This is reserved as soon as we need a literal run but the value is written + // when the literal run is complete. We maintain an index rather than a pointer + // into the underlying buffer because the pointer value may become invalid if + // the underlying buffer is resized. + int literal_indicator_byte_idx_; +}; + +template +inline bool RleDecoder::ReadHeader() { + DCHECK(bit_reader_.is_initialized()); + if (PREDICT_FALSE(literal_count_ == 0 && repeat_count_ == 0)) { + // Read the next run's indicator int, it could be a literal or repeated run + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (PREDICT_FALSE(!result)) { + return false; + } + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + DCHECK_GT(literal_count_, 0); + } else { + repeat_count_ = indicator_value >> 1; + DCHECK_GT(repeat_count_, 0); + bool result = bit_reader_.GetAligned( + BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); + DCHECK(result); + } + } + return true; +} + +template +inline bool RleDecoder::Get(T* val) { + DCHECK(bit_reader_.is_initialized()); + if (PREDICT_FALSE(!ReadHeader())) { + return false; + } + + if (PREDICT_TRUE(repeat_count_ > 0)) { + *val = current_value_; + --repeat_count_; + rewind_state_ = REWIND_RUN; + } else { + DCHECK(literal_count_ > 0); + bool result = bit_reader_.GetValue(bit_width_, val); + DCHECK(result); + --literal_count_; + rewind_state_ = REWIND_LITERAL; + } + + return true; +} + +template +inline void RleDecoder::RewindOne() { + DCHECK(bit_reader_.is_initialized()); + + switch (rewind_state_) { + case CANT_REWIND: + 
LOG(FATAL) << "Can't rewind more than once after each read!"; + break; + case REWIND_RUN: + ++repeat_count_; + break; + case REWIND_LITERAL: + { + bit_reader_.Rewind(bit_width_); + ++literal_count_; + break; + } + } + + rewind_state_ = CANT_REWIND; +} + +template +inline size_t RleDecoder::GetNextRun(T* val, size_t max_run) { + DCHECK(bit_reader_.is_initialized()); + DCHECK_GT(max_run, 0); + size_t ret = 0; + size_t rem = max_run; + while (ReadHeader()) { + if (PREDICT_TRUE(repeat_count_ > 0)) { + if (PREDICT_FALSE(ret > 0 && *val != current_value_)) { + return ret; + } + *val = current_value_; + if (repeat_count_ >= rem) { + // The next run is longer than the amount of remaining data + // that the caller wants to read. Only consume it partially. + repeat_count_ -= rem; + ret += rem; + return ret; + } + ret += repeat_count_; + rem -= repeat_count_; + repeat_count_ = 0; + } else { + DCHECK(literal_count_ > 0); + if (ret == 0) { + bool has_more = bit_reader_.GetValue(bit_width_, val); + DCHECK(has_more); + literal_count_--; + ret++; + rem--; + } + + while (literal_count_ > 0) { + bool result = bit_reader_.GetValue(bit_width_, ¤t_value_); + DCHECK(result); + if (current_value_ != *val || rem == 0) { + bit_reader_.Rewind(bit_width_); + return ret; + } + ret++; + rem--; + literal_count_--; + } + } + } + return ret; + } + +template +inline size_t RleDecoder::Skip(size_t to_skip) { + DCHECK(bit_reader_.is_initialized()); + + size_t set_count = 0; + while (to_skip > 0) { + bool result = ReadHeader(); + DCHECK(result); + + if (PREDICT_TRUE(repeat_count_ > 0)) { + size_t nskip = (repeat_count_ < to_skip) ? repeat_count_ : to_skip; + repeat_count_ -= nskip; + to_skip -= nskip; + if (current_value_ != 0) { + set_count += nskip; + } + } else { + DCHECK(literal_count_ > 0); + size_t nskip = (literal_count_ < to_skip) ? 
literal_count_ : to_skip; + literal_count_ -= nskip; + to_skip -= nskip; + while (nskip--) { + T value = 0; + bool result = bit_reader_.GetValue(bit_width_, &value); + DCHECK(result); + if (value != 0) { + set_count++; + } + } + } + } + return set_count; +} + +// This function buffers input values 8 at a time. After seeing all 8 values, +// it decides whether they should be encoded as a literal or repeated run. +template +inline void RleEncoder::Put(T value, size_t run_length) { + DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); + + // TODO(perf): remove the loop and use the repeat_count_ + while (run_length--) { + if (PREDICT_TRUE(current_value_ == value)) { + ++repeat_count_; + if (repeat_count_ > 8) { + // This is just a continuation of the current run, no need to buffer the + // values. + // Note that this is the fast path for long repeated runs. + continue; + } + } else { + if (repeat_count_ >= 8) { + // We had a run that was long enough but it has ended. Flush the + // current repeated run. + DCHECK_EQ(literal_count_, 0); + FlushRepeatedRun(); + } + repeat_count_ = 1; + current_value_ = value; + } + + buffered_values_[num_buffered_values_] = value; + if (++num_buffered_values_ == 8) { + DCHECK_EQ(literal_count_ % 8, 0); + FlushBufferedValues(false); + } + } +} + +template +inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) { + if (literal_indicator_byte_idx_ < 0) { + // The literal indicator byte has not been reserved yet, get one now. + literal_indicator_byte_idx_ = bit_writer_.GetByteIndexAndAdvance(1); + DCHECK_GE(literal_indicator_byte_idx_, 0); + } + + // Write all the buffered values as bit packed literals + for (int i = 0; i < num_buffered_values_; ++i) { + bit_writer_.PutValue(buffered_values_[i], bit_width_); + } + num_buffered_values_ = 0; + + if (update_indicator_byte) { + // At this point we need to write the indicator byte for the literal run. + // We only reserve one byte, to allow for streaming writes of literal values. 
+ // The logic makes sure we flush literal runs often enough to not overrun + // the 1 byte. + int num_groups = BitUtil::Ceil(literal_count_, 8); + int32_t indicator_value = (num_groups << 1) | 1; + DCHECK_EQ(indicator_value & 0xFFFFFF00, 0); + bit_writer_.buffer()->data()[literal_indicator_byte_idx_] = indicator_value; + literal_indicator_byte_idx_ = -1; + literal_count_ = 0; + } +} + +template +inline void RleEncoder::FlushRepeatedRun() { + DCHECK_GT(repeat_count_, 0); + // The lsb of 0 indicates this is a repeated run + int32_t indicator_value = repeat_count_ << 1 | 0; + bit_writer_.PutVlqInt(indicator_value); + bit_writer_.PutAligned(current_value_, BitUtil::Ceil(bit_width_, 8)); + num_buffered_values_ = 0; + repeat_count_ = 0; +} + +// Flush the values that have been buffered. At this point we decide whether +// we need to switch between the run types or continue the current one. +template +inline void RleEncoder::FlushBufferedValues(bool done) { + if (repeat_count_ >= 8) { + // Clear the buffered values. They are part of the repeated run now and we + // don't want to flush them out as literals. + num_buffered_values_ = 0; + if (literal_count_ != 0) { + // There was a current literal run. All the values in it have been flushed + // but we still need to update the indicator byte. + DCHECK_EQ(literal_count_ % 8, 0); + DCHECK_EQ(repeat_count_, 8); + FlushLiteralRun(true); + } + DCHECK_EQ(literal_count_, 0); + return; + } + + literal_count_ += num_buffered_values_; + int num_groups = BitUtil::Ceil(literal_count_, 8); + if (num_groups + 1 >= (1 << 6)) { + // We need to start a new literal run because the indicator byte we've reserved + // cannot store more values. 
+ DCHECK_GE(literal_indicator_byte_idx_, 0); + FlushLiteralRun(true); + } else { + FlushLiteralRun(done); + } + repeat_count_ = 0; +} + +template +inline void RleEncoder::Reserve(int num_bytes, uint8_t val) { + for (int i = 0; i < num_bytes; ++i) { + bit_writer_.PutValue(val, 8); + } +} + +template +inline int RleEncoder::Flush() { + if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) { + bool all_repeat = literal_count_ == 0 && + (repeat_count_ == num_buffered_values_ || num_buffered_values_ == 0); + // There is something pending, figure out if it's a repeated or literal run + if (repeat_count_ > 0 && all_repeat) { + FlushRepeatedRun(); + } else { + literal_count_ += num_buffered_values_; + FlushLiteralRun(true); + repeat_count_ = 0; + } + } + bit_writer_.Flush(); + DCHECK_EQ(num_buffered_values_, 0); + DCHECK_EQ(literal_count_, 0); + DCHECK_EQ(repeat_count_, 0); + return bit_writer_.bytes_written(); +} + +template +inline void RleEncoder::Clear() { + current_value_ = 0; + repeat_count_ = 0; + num_buffered_values_ = 0; + literal_count_ = 0; + literal_indicator_byte_idx_ = -1; + bit_writer_.Clear(); +} + +} // namespace kudu +#endif diff --git a/src/kudu/util/rle-test.cc b/src/kudu/util/rle-test.cc new file mode 100644 index 000000000000..b220b30232e8 --- /dev/null +++ b/src/kudu/util/rle-test.cc @@ -0,0 +1,527 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +// Must come before gtest.h. 
+#include "kudu/gutil/mathlimits.h" + +#include +#include +#include +#include + +#include "kudu/util/rle-encoding.h" +#include "kudu/util/bit-stream-utils.h" +#include "kudu/util/hexdump.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::vector; + +namespace kudu { + +const int MAX_WIDTH = 32; + +class TestRle : public KuduTest {}; + +TEST(BitArray, TestBool) { + const int len_bytes = 2; + faststring buffer(len_bytes); + + BitWriter writer(&buffer); + + // Write alternating 0's and 1's + for (int i = 0; i < 8; ++i) { + writer.PutValue(i % 2, 1); + } + writer.Flush(); + EXPECT_EQ(buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); + + // Write 00110011 + for (int i = 0; i < 8; ++i) { + switch (i) { + case 0: + case 1: + case 4: + case 5: + writer.PutValue(0, 1); + break; + default: + writer.PutValue(1, 1); + break; + } + } + writer.Flush(); + + // Validate the exact bit value + EXPECT_EQ(buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); + EXPECT_EQ(buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0)); + + // Use the reader and validate + BitReader reader(buffer.data(), buffer.size()); + for (int i = 0; i < 8; ++i) { + bool val = false; + bool result = reader.GetValue(1, &val); + EXPECT_TRUE(result); + EXPECT_EQ(val, i % 2); + } + + for (int i = 0; i < 8; ++i) { + bool val = false; + bool result = reader.GetValue(1, &val); + EXPECT_TRUE(result); + switch (i) { + case 0: + case 1: + case 4: + case 5: + EXPECT_EQ(val, false); + break; + default: + EXPECT_EQ(val, true); + break; + } + } +} + +// Writes 'num_vals' values with width 'bit_width' and reads them back. +void TestBitArrayValues(int bit_width, int num_vals) { + const int kTestLen = BitUtil::Ceil(bit_width * num_vals, 8); + const uint64_t mod = bit_width == 64? 
1 : 1LL << bit_width; + + faststring buffer(kTestLen); + BitWriter writer(&buffer); + for (int i = 0; i < num_vals; ++i) { + writer.PutValue(i % mod, bit_width); + } + writer.Flush(); + EXPECT_EQ(writer.bytes_written(), kTestLen); + + BitReader reader(buffer.data(), kTestLen); + for (int i = 0; i < num_vals; ++i) { + int64_t val = 0; + bool result = reader.GetValue(bit_width, &val); + EXPECT_TRUE(result); + EXPECT_EQ(val, i % mod); + } + EXPECT_EQ(reader.bytes_left(), 0); +} + +TEST(BitArray, TestValues) { + for (int width = 1; width <= MAX_WIDTH; ++width) { + TestBitArrayValues(width, 1); + TestBitArrayValues(width, 2); + // Don't write too many values + TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096); + TestBitArrayValues(width, 1024); + } +} + +// Test some mixed values +TEST(BitArray, TestMixed) { + const int kTestLenBits = 1024; + faststring buffer(kTestLenBits / 8); + bool parity = true; + + BitWriter writer(&buffer); + for (int i = 0; i < kTestLenBits; ++i) { + if (i % 2 == 0) { + writer.PutValue(parity, 1); + parity = !parity; + } else { + writer.PutValue(i, 10); + } + } + writer.Flush(); + + parity = true; + BitReader reader(buffer.data(), buffer.size()); + for (int i = 0; i < kTestLenBits; ++i) { + bool result; + if (i % 2 == 0) { + bool val = false; + result = reader.GetValue(1, &val); + EXPECT_EQ(val, parity); + parity = !parity; + } else { + int val; + result = reader.GetValue(10, &val); + EXPECT_EQ(val, i); + } + EXPECT_TRUE(result); + } +} + +// Validates encoding of values by encoding and decoding them. If +// expected_encoding != NULL, also validates that the encoded buffer is +// exactly 'expected_encoding'. +// if expected_len is not -1, it will validate the encoded size is correct. 
+template +void ValidateRle(const vector& values, int bit_width, + uint8_t* expected_encoding, int expected_len) { + faststring buffer; + RleEncoder encoder(&buffer, bit_width); + + for (const auto& value : values) { + encoder.Put(value); + } + int encoded_len = encoder.Flush(); + + if (expected_len != -1) { + EXPECT_EQ(encoded_len, expected_len); + } + if (expected_encoding != nullptr) { + EXPECT_EQ(memcmp(buffer.data(), expected_encoding, expected_len), 0) + << "\n" + << "Expected: " << HexDump(Slice(expected_encoding, expected_len)) << "\n" + << "Got: " << HexDump(Slice(buffer)); + } + + // Verify read + RleDecoder decoder(buffer.data(), encoded_len, bit_width); + for (const auto& value : values) { + T val = 0; + bool result = decoder.Get(&val); + EXPECT_TRUE(result); + EXPECT_EQ(value, val); + } +} + +TEST(Rle, SpecificSequences) { + const int kTestLen = 1024; + uint8_t expected_buffer[kTestLen]; + vector values; + + // Test 50 0' followed by 50 1's + values.resize(100); + for (int i = 0; i < 50; ++i) { + values[i] = 0; + } + for (int i = 50; i < 100; ++i) { + values[i] = 1; + } + + // expected_buffer valid for bit width <= 1 byte + expected_buffer[0] = (50 << 1); + expected_buffer[1] = 0; + expected_buffer[2] = (50 << 1); + expected_buffer[3] = 1; + for (int width = 1; width <= 8; ++width) { + ValidateRle(values, width, expected_buffer, 4); + } + + for (int width = 9; width <= MAX_WIDTH; ++width) { + ValidateRle(values, width, nullptr, 2 * (1 + BitUtil::Ceil(width, 8))); + } + + // Test 100 0's and 1's alternating + for (int i = 0; i < 100; ++i) { + values[i] = i % 2; + } + int num_groups = BitUtil::Ceil(100, 8); + expected_buffer[0] = (num_groups << 1) | 1; + for (int i = 0; i < 100/8; ++i) { + expected_buffer[i + 1] = BOOST_BINARY(1 0 1 0 1 0 1 0); // 0xaa + } + // Values for the last 4 0 and 1's + expected_buffer[1 + 100/8] = BOOST_BINARY(0 0 0 0 1 0 1 0); // 0x0a + + // num_groups and expected_buffer only valid for bit width = 1 + ValidateRle(values, 1, 
expected_buffer, 1 + num_groups); + for (int width = 2; width <= MAX_WIDTH; ++width) { + ValidateRle(values, width, nullptr, 1 + BitUtil::Ceil(width * 100, 8)); + } +} + +// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value +// is used, otherwise alternating values are used. +void TestRleValues(int bit_width, int num_vals, int value = -1) { + const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width; + vector values; + for (int v = 0; v < num_vals; ++v) { + values.push_back((value != -1) ? value : (v % mod)); + } + ValidateRle(values, bit_width, nullptr, -1); +} + +TEST(Rle, TestValues) { + for (int width = 1; width <= MAX_WIDTH; ++width) { + TestRleValues(width, 1); + TestRleValues(width, 1024); + TestRleValues(width, 1024, 0); + TestRleValues(width, 1024, 1); + } +} + +class BitRle : public KuduTest { +}; + +// Tests all true/false values +TEST_F(BitRle, AllSame) { + const int kTestLen = 1024; + vector values; + + for (int v = 0; v < 2; ++v) { + values.clear(); + for (int i = 0; i < kTestLen; ++i) { + values.push_back(v ? true : false); + } + + ValidateRle(values, 1, nullptr, 3); + } +} + +// Test that writes out a repeated group and then a literal +// group but flush before finishing. +TEST_F(BitRle, Flush) { + vector values; + for (int i = 0; i < 16; ++i) values.push_back(1); + values.push_back(false); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); + values.push_back(true); + ValidateRle(values, 1, nullptr, -1); +} + +// Test some random sequences. +TEST_F(BitRle, Random) { + int iters = 0; + const int n_iters = AllowSlowTests() ? 
1000 : 20; + while (iters < n_iters) { + srand(iters++); + if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters; + vector values; + bool parity = 0; + for (int i = 0; i < 1000; ++i) { + int group_size = rand() % 20 + 1; // NOLINT(*) + if (group_size > 16) { + group_size = 1; + } + for (int i = 0; i < group_size; ++i) { + values.push_back(parity); + } + parity = !parity; + } + ValidateRle(values, (iters % MAX_WIDTH) + 1, nullptr, -1); + } +} + +// Test a sequence of 1 0's, 2 1's, 3 0's. etc +// e.g. 011000111100000 +TEST_F(BitRle, RepeatedPattern) { + vector values; + const int min_run = 1; + const int max_run = 32; + + for (int i = min_run; i <= max_run; ++i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + // And go back down again + for (int i = max_run; i >= min_run; --i) { + int v = i % 2; + for (int j = 0; j < i; ++j) { + values.push_back(v); + } + } + + ValidateRle(values, 1, nullptr, -1); +} + +TEST_F(TestRle, TestBulkPut) { + size_t run_length; + bool val = false; + + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + encoder.Put(true, 10); + encoder.Put(false, 7); + encoder.Put(true, 5); + encoder.Put(true, 15); + encoder.Flush(); + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_TRUE(val); + ASSERT_EQ(10, run_length); + + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_FALSE(val); + ASSERT_EQ(7, run_length); + + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_TRUE(val); + ASSERT_EQ(20, run_length); + + ASSERT_EQ(0, decoder.GetNextRun(&val, MathLimits::kMax)); +} + +TEST_F(TestRle, TestGetNextRun) { + // Repeat the test with different number of items + for (int num_items = 7; num_items < 200; num_items += 13) { + // Test different block patterns + // 1: 01010101 01010101 + // 2: 00110011 00110011 + // 3: 00011100 01110001 + // ... 
+ for (int block = 1; block <= 20; ++block) { + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + for (int j = 0; j < num_items; ++j) { + encoder.Put(!!(j & 1), block); + } + encoder.Flush(); + + RleDecoder decoder(buffer.data(), encoder.len(), 1); + size_t count = num_items * block; + for (int j = 0; j < num_items; ++j) { + size_t run_length; + bool val = false; + DCHECK_GT(count, 0); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + run_length = std::min(run_length, count); + + ASSERT_EQ(!!(j & 1), val); + ASSERT_EQ(block, run_length); + count -= run_length; + } + DCHECK_EQ(count, 0); + } + } +} + +// Generate a random bit string which consists of 'num_runs' runs, +// each with a random length between 1 and 100. Returns the number +// of values encoded (i.e the sum run length). +static size_t GenerateRandomBitString(int num_runs, faststring* enc_buf, string* string_rep) { + RleEncoder enc(enc_buf, 1); + int num_bits = 0; + for (int i = 0; i < num_runs; i++) { + int run_length = random() % 100; + bool value = static_cast(i & 1); + enc.Put(value, run_length); + string_rep->append(run_length, value ? '1' : '0'); + num_bits += run_length; + } + enc.Flush(); + return num_bits; +} + +TEST_F(TestRle, TestRoundTripRandomSequencesWithRuns) { + SeedRandom(); + + // Test the limiting function of GetNextRun. + const int kMaxToReadAtOnce = (random() % 20) + 1; + + // Generate a bunch of random bit sequences, and "round-trip" them + // through the encode/decode sequence. + for (int rep = 0; rep < 100; rep++) { + faststring buf; + string string_rep; + int num_bits = GenerateRandomBitString(10, &buf, &string_rep); + RleDecoder decoder(buf.data(), buf.size(), 1); + string roundtrip_str; + int rem_to_read = num_bits; + size_t run_len; + bool val; + while (rem_to_read > 0 && + (run_len = decoder.GetNextRun(&val, std::min(kMaxToReadAtOnce, rem_to_read))) != 0) { + ASSERT_LE(run_len, kMaxToReadAtOnce); + roundtrip_str.append(run_len, val ? 
'1' : '0'); + rem_to_read -= run_len; + } + + ASSERT_EQ(string_rep, roundtrip_str); + } +} +TEST_F(TestRle, TestSkip) { + faststring buffer(1); + RleEncoder encoder(&buffer, 1); + + // 0101010[1] 01010101 01 + // "A" + for (int j = 0; j < 18; ++j) { + encoder.Put(!!(j & 1)); + } + + // 0011[00] 11001100 11001100 11001100 11001100 + // "B" + for (int j = 0; j < 19; ++j) { + encoder.Put(!!(j & 1), 2); + } + + // 000000000000 11[1111111111] 000000000000 111111111111 + // "C" + // 000000000000 111111111111 0[00000000000] 111111111111 + // "D" + // 000000000000 111111111111 000000000000 111111111111 + for (int j = 0; j < 12; ++j) { + encoder.Put(!!(j & 1), 12); + } + encoder.Flush(); + + bool val = false; + size_t run_length; + RleDecoder decoder(buffer.data(), encoder.len(), 1); + + // position before "A" + ASSERT_EQ(3, decoder.Skip(7)); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_TRUE(val); + ASSERT_EQ(1, run_length); + + // position before "B" + ASSERT_EQ(7, decoder.Skip(14)); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_FALSE(val); + ASSERT_EQ(2, run_length); + + // position before "C" + ASSERT_EQ(18, decoder.Skip(46)); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_TRUE(val); + ASSERT_EQ(10, run_length); + + // position before "D" + ASSERT_EQ(24, decoder.Skip(49)); + run_length = decoder.GetNextRun(&val, MathLimits::kMax); + ASSERT_FALSE(val); + ASSERT_EQ(11, run_length); + + encoder.Flush(); +} +} // namespace kudu diff --git a/src/kudu/util/rolling_log-test.cc b/src/kudu/util/rolling_log-test.cc new file mode 100644 index 000000000000..6f18fa0342e2 --- /dev/null +++ b/src/kudu/util/rolling_log-test.cc @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/rolling_log.h" + +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/util/env.h" +#include "kudu/util/memenv/memenv.h" +#include "kudu/util/path_util.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::vector; +using strings::Substitute; + +namespace kudu { + +class RollingLogTest : public KuduTest { + public: + RollingLogTest() + : log_dir_(GetTestPath("log_dir")) { + } + + virtual void SetUp() OVERRIDE { + ASSERT_OK(env_->CreateDir(log_dir_)); + } + + protected: + void AssertLogCount(int expected_count, vector* children) { + vector dir_entries; + ASSERT_OK(env_->GetChildren(log_dir_, &dir_entries)); + children->clear(); + + for (const string& child : dir_entries) { + if (child == "." || child == "..") continue; + children->push_back(child); + ASSERT_TRUE(HasPrefixString(child, "rolling_log-test.")); + ASSERT_STR_CONTAINS(child, ".mylog."); + + string pid_suffix = Substitute("$0", getpid()); + ASSERT_TRUE(HasSuffixString(child, pid_suffix) || + HasSuffixString(child, pid_suffix + ".gz")) << "bad child: " << child; + } + ASSERT_EQ(children->size(), expected_count) << *children; + } + + const string log_dir_; +}; + +// Test with compression off. 
+TEST_F(RollingLogTest, TestLog) { + RollingLog log(env_.get(), log_dir_, "mylog"); + log.SetCompressionEnabled(false); + log.SetSizeLimitBytes(100); + + // Before writing anything, we shouldn't open a log file. + vector children; + NO_FATALS(AssertLogCount(0, &children)); + + // Appending some data should write a new segment. + ASSERT_OK(log.Append("Hello world\n")); + NO_FATALS(AssertLogCount(1, &children)); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(log.Append("Hello world\n")); + } + NO_FATALS(AssertLogCount(2, &children)); + + faststring data; + string path = JoinPathSegments(log_dir_, children[0]); + ASSERT_OK(ReadFileToString(env_.get(), path, &data)); + ASSERT_TRUE(HasPrefixString(data.ToString(), "Hello world\n")) + << "Data missing"; + ASSERT_LE(data.size(), 100) << "Size limit not respected"; +} + +// Test with compression on. +TEST_F(RollingLogTest, TestCompression) { + RollingLog log(env_.get(), log_dir_, "mylog"); + ASSERT_OK(log.Open()); + + StringPiece data = "Hello world\n"; + int raw_size = 0; + for (int i = 0; i < 1000; i++) { + ASSERT_OK(log.Append(data)); + raw_size += data.size(); + } + ASSERT_OK(log.Close()); + + vector children; + NO_FATALS(AssertLogCount(1, &children)); + ASSERT_TRUE(HasSuffixString(children[0], ".gz")); + + // Ensure that the output is actually gzipped. + uint64_t size; + ASSERT_OK(env_->GetFileSize(JoinPathSegments(log_dir_, children[0]), &size)); + ASSERT_LT(size, raw_size / 10); + ASSERT_GT(size, 0); +} + +} // namespace kudu diff --git a/src/kudu/util/rolling_log.cc b/src/kudu/util/rolling_log.cc new file mode 100644 index 000000000000..457ad2c285d1 --- /dev/null +++ b/src/kudu/util/rolling_log.cc @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/rolling_log.h" + +#include +#include + +#include +#include +#include +#include + +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/env.h" +#include "kudu/util/net/net_util.h" +#include "kudu/util/path_util.h" +#include "kudu/util/thread_restrictions.h" +#include "kudu/util/user.h" + +using std::ostringstream; +using std::setw; +using std::string; +using strings::Substitute; + +static const int kDefaultSizeLimitBytes = 64 * 1024 * 1024; // 64MB + +namespace kudu { + +RollingLog::RollingLog(Env* env, string log_dir, string log_name) + : env_(env), + log_dir_(std::move(log_dir)), + log_name_(std::move(log_name)), + size_limit_bytes_(kDefaultSizeLimitBytes), + compress_after_close_(true) {} + +RollingLog::~RollingLog() { + WARN_NOT_OK(Close(), "Unable to close RollingLog"); +} + +void RollingLog::SetSizeLimitBytes(int64_t size) { + CHECK_GT(size, 0); + size_limit_bytes_ = size; +} + +void RollingLog::SetCompressionEnabled(bool compress) { + compress_after_close_ = compress; +} + +string RollingLog::GetLogFileName(int sequence) const { + ostringstream str; + + // 1. Program name. + str << google::ProgramInvocationShortName(); + + // 2. Host name. + string hostname; + Status s = GetHostname(&hostname); + if (!s.ok()) { + hostname = "unknown_host"; + } + str << "." 
<< hostname; + + // 3. User name. + string user_name; + s = GetLoggedInUser(&user_name); + if (!s.ok()) { + user_name = "unknown_user"; + } + str << "." << user_name; + + // 4. Log name. + str << "." << log_name_; + + // 5. Timestamp. + // Implementation cribbed from glog/logging.cc + time_t time = static_cast(WallTime_Now()); + struct ::tm tm_time; + localtime_r(&time, &tm_time); + + str << "."; + str.fill('0'); + str << 1900+tm_time.tm_year + << setw(2) << 1+tm_time.tm_mon + << setw(2) << tm_time.tm_mday + << '-' + << setw(2) << tm_time.tm_hour + << setw(2) << tm_time.tm_min + << setw(2) << tm_time.tm_sec; + str.clear(); // resets formatting flags + + // 6. Sequence number. + str << "." << sequence; + + // 7. Pid. + str << "." << getpid(); + + return str.str(); +} + +Status RollingLog::Open() { + CHECK(!file_); + + for (int sequence = 0; ; sequence++) { + + string path = JoinPathSegments(log_dir_, + GetLogFileName(sequence)); + + WritableFileOptions opts; + // Logs aren't worth the performance cost of durability. + opts.sync_on_close = false; + opts.mode = Env::CREATE_NON_EXISTING; + + Status s = env_->NewWritableFile(opts, path, &file_); + if (s.IsAlreadyPresent()) { + // We already rolled once at this same timestamp. + // Try again with a new sequence number. 
+ continue; + } + RETURN_NOT_OK(s); + + VLOG(1) << "Rolled " << log_name_ << " log to new file: " << path; + break; + } + return Status::OK(); +} + +Status RollingLog::Close() { + if (!file_) { + return Status::OK(); + } + string path = file_->filename(); + RETURN_NOT_OK_PREPEND(file_->Close(), + Substitute("Unable to close $0", path)); + file_.reset(); + if (compress_after_close_) { + WARN_NOT_OK(CompressFile(path), "Unable to compress old log file"); + } + return Status::OK(); +} + +Status RollingLog::Append(StringPiece s) { + if (!file_) { + RETURN_NOT_OK_PREPEND(Open(), "Unable to open log"); + } + + if (file_->Size() + s.size() > size_limit_bytes_) { + RETURN_NOT_OK_PREPEND(Close(), "Unable to roll log"); + RETURN_NOT_OK_PREPEND(Open(), "Unable to roll log"); + } + RETURN_NOT_OK(file_->Append(s)); + return Status::OK(); +} + +namespace { + +Status GzClose(gzFile f) { + int err = gzclose(f); + switch (err) { + case Z_OK: + return Status::OK(); + case Z_STREAM_ERROR: + return Status::InvalidArgument("Stream not valid"); + case Z_ERRNO: + return Status::IOError("IO Error closing stream"); + case Z_MEM_ERROR: + return Status::RuntimeError("Out of memory"); + case Z_BUF_ERROR: + return Status::IOError("read ended in the middle of a stream"); + default: + return Status::IOError("Unknown zlib error", SimpleItoa(err)); + } +} + +class ScopedGzipCloser { + public: + explicit ScopedGzipCloser(gzFile f) + : file_(f) { + } + + ~ScopedGzipCloser() { + if (file_) { + WARN_NOT_OK(GzClose(file_), "Unable to close gzip stream"); + } + } + + void Cancel() { + file_ = nullptr; + } + + private: + gzFile file_; +}; +} // anonymous namespace + +// We implement CompressFile() manually using zlib APIs rather than forking +// out to '/bin/gzip' since fork() can be expensive on processes that use a large +// amount of memory. During the time of the fork, other threads could end up +// blocked. 
Implementing it using the zlib stream APIs isn't too much code +// and is less likely to be problematic. +Status RollingLog::CompressFile(const std::string& path) const { + gscoped_ptr in_file; + RETURN_NOT_OK_PREPEND(env_->NewSequentialFile(path, &in_file), + "Unable to open input file to compress"); + + string gz_path = path + ".gz"; + gzFile gzf = gzopen(gz_path.c_str(), "w"); + if (!gzf) { + return Status::IOError("Unable to open gzip stream"); + } + + ScopedGzipCloser closer(gzf); + + // Loop reading data from the input file and writing to the gzip stream. + uint8_t buf[32 * 1024]; + while (true) { + Slice result; + RETURN_NOT_OK_PREPEND(in_file->Read(arraysize(buf), &result, buf), + "Unable to read from gzip input"); + if (result.size() == 0) { + break; + } + int n = gzwrite(gzf, result.data(), result.size()); + if (n == 0) { + int errnum; + return Status::IOError("Unable to write to gzip output", + gzerror(gzf, &errnum)); + } + } + closer.Cancel(); + RETURN_NOT_OK_PREPEND(GzClose(gzf), + "Unable to close gzip output"); + + WARN_NOT_OK(env_->DeleteFile(path), + "Unable to delete gzip input file after compression"); + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/rolling_log.h b/src/kudu/util/rolling_log.h new file mode 100644 index 000000000000..eefd13858655 --- /dev/null +++ b/src/kudu/util/rolling_log.h @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_ROLLING_LOG_H +#define KUDU_UTIL_ROLLING_LOG_H + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Env; +class WritableFile; + +// A simple rolling log. +// +// This creates a log which spans multiple files in a specified directory. +// After a log file reaches a specified size threshold, it automatically rolls +// to the next file in the sequence. +// +// The files are named similarly to glog log files and use the following pattern: +// +// /...... +// log_dir: the log_dir specified in the constructor +// program-name: argv[0], as determined by google::ProgramInvocationShortName() +// hostname: the local machine hostname +// user-name: the current user name +// log-name: the log_name specified in the constructor +// timestamp: the wall clock time when the log file was created, in +// YYYYmmdd-HHMMSS fixed-length format. +// sequence: a sequence number which is used to disambiguate when the log file is +// rolled multiple times within a second +// pid: the pid of the daemon +// +// The log implementation does not ensure durability of the log or its files in any way. +// This class is not thread-safe and must be externally synchronized. +class RollingLog { + public: + RollingLog(Env* env, std::string log_dir, std::string log_name); + + ~RollingLog(); + + // Open the log. + // It is optional to call this function. Append() will automatically open + // the log as necessary if it is not open. 
+ Status Open(); + + // Set the size limit for the current and any future log files. + // + // There is no limit on the total number of previous log segments. We rely + // on system utilities to clean up old logs to maintain some size limit. + void SetSizeLimitBytes(int64_t bytes); + + // If compression is enabled, log files are compressed. + // NOTE: this requires that the passed-in Env instance is the local file system. + void SetCompressionEnabled(bool compress); + + // Append the given data to the current log file. + // + // If appending this data would cross the configured file size limit, a new file + // is created and the data is appended there. + // + // Note that this is a synchronous API and causes potentially-blocking IO on the + // current thread. However, this does not fsync() or otherwise ensure durability + // of the appended data. + Status Append(StringPiece data); + + // Close the log. + Status Close(); + + private: + std::string GetLogFileName(int sequence) const; + + // Compress the given path, writing a new file '.gz'. + Status CompressFile(const std::string& path) const; + + Env* const env_; + const std::string log_dir_; + const std::string log_name_; + + int64_t size_limit_bytes_; + + gscoped_ptr file_; + bool compress_after_close_; + + DISALLOW_COPY_AND_ASSIGN(RollingLog); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_ROLLING_LOG_H */ diff --git a/src/kudu/util/rw_semaphore-test.cc b/src/kudu/util/rw_semaphore-test.cc new file mode 100644 index 000000000000..16a08e36f440 --- /dev/null +++ b/src/kudu/util/rw_semaphore-test.cc @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "kudu/util/monotime.h" +#include "kudu/util/rw_semaphore.h" + +using boost::thread; +using std::vector; + +namespace kudu { +struct SharedState { + SharedState() : done(false), int_var(0) {} + + bool done; + int64_t int_var; + rw_semaphore sem; +}; + +// Thread which increases the value in the shared state under the write lock. +void Writer(SharedState* state) { + int i = 0; + while (true) { + boost::lock_guard l(state->sem); + state->int_var += (i++); + if (state->done) { + break; + } + } +} + +// Thread which verifies that the value in the shared state only increases. +void Reader(SharedState* state) { + int prev_val = 0; + while (true) { + boost::shared_lock l(state->sem); + // The int var should only be seen to increase. + CHECK_GE(state->int_var, prev_val); + prev_val = state->int_var; + if (state->done) { + break; + } + } +} + +// Test which verifies basic functionality of the semaphore. +// When run under TSAN this also verifies the barriers. +TEST(RWSemaphoreTest, TestBasicOperation) { + SharedState s; + vector threads; + // Start 5 readers and writers. + for (int i = 0; i < 5; i++) { + threads.push_back(new thread(Reader, &s)); + threads.push_back(new thread(Writer, &s)); + } + + // Let them contend for a short amount of time. + SleepFor(MonoDelta::FromMilliseconds(50)); + + // Signal them to stop. 
+ { + boost::lock_guard l(s.sem); + s.done = true; + } + + for (thread* t : threads) { + t->join(); + delete t; + } +} + +} // namespace kudu diff --git a/src/kudu/util/rw_semaphore.h b/src/kudu/util/rw_semaphore.h new file mode 100644 index 000000000000..6ce54622f126 --- /dev/null +++ b/src/kudu/util/rw_semaphore.h @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_RW_SEMAPHORE_H +#define KUDU_UTIL_RW_SEMAPHORE_H + +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/util/debug-util.h" + +#include "kudu/util/thread.h" + +namespace kudu { + +// Read-Write semaphore. 32bit uint that contains the number of readers. +// When someone wants to write, tries to set the 32bit, and waits until +// the readers have finished. Readers are spinning while the write flag is set. +// +// This rw-semaphore makes no attempt at fairness, though it does avoid write +// starvation (no new readers may obtain the lock if a write is waiting). +// +// Given that this is currently based only on spinning (and not futex), +// it should only be used in cases where the lock is held for very short +// time intervals. 
+// +// If the semaphore is expected to always be released from the same thread +// that acquired it, use rw_spinlock instead. +// +// In order to support easier debugging of leaked locks, this class can track +// the stack trace of the last thread to lock it in write mode. To do so, +// uncomment the following define: +// #define RW_SEMAPHORE_TRACK_HOLDER 1 +// ... and then in gdb, print the contents of the semaphore, and you should +// see the collected stack trace. +class rw_semaphore { + public: + rw_semaphore() : state_(0) { + } + ~rw_semaphore() {} + + void lock_shared() { + int loop_count = 0; + Atomic32 cur_state = base::subtle::NoBarrier_Load(&state_); + while (true) { + Atomic32 expected = cur_state & kNumReadersMask; // I expect no write lock + Atomic32 try_new_state = expected + 1; // Add me as reader + cur_state = base::subtle::Acquire_CompareAndSwap(&state_, expected, try_new_state); + if (cur_state == expected) + break; + // Either was already locked by someone else, or CAS failed. + boost::detail::yield(loop_count++); + } + } + + void unlock_shared() { + int loop_count = 0; + Atomic32 cur_state = base::subtle::NoBarrier_Load(&state_); + while (true) { + DCHECK_GT(cur_state & kNumReadersMask, 0) + << "unlock_shared() called when there are no shared locks held"; + Atomic32 expected = cur_state; // I expect a write lock and other readers + Atomic32 try_new_state = expected - 1; // Drop me as reader + cur_state = base::subtle::Release_CompareAndSwap(&state_, expected, try_new_state); + if (cur_state == expected) + break; + // Either was already locked by someone else, or CAS failed. + boost::detail::yield(loop_count++); + } + } + + // Tries to acquire a write lock, if no one else has it. + // This function retries on CAS failure and waits for readers to complete. 
+ bool try_lock() { + int loop_count = 0; + Atomic32 cur_state = base::subtle::NoBarrier_Load(&state_); + while (true) { + // someone else has already the write lock + if (cur_state & kWriteFlag) + return false; + + Atomic32 expected = cur_state & kNumReadersMask; // I expect some 0+ readers + Atomic32 try_new_state = kWriteFlag | expected; // I want to lock the other writers + cur_state = base::subtle::Acquire_CompareAndSwap(&state_, expected, try_new_state); + if (cur_state == expected) + break; + // Either was already locked by someone else, or CAS failed. + boost::detail::yield(loop_count++); + } + + WaitPendingReaders(); + RecordLockHolderStack(); + return true; + } + + void lock() { + int loop_count = 0; + Atomic32 cur_state = base::subtle::NoBarrier_Load(&state_); + while (true) { + Atomic32 expected = cur_state & kNumReadersMask; // I expect some 0+ readers + Atomic32 try_new_state = kWriteFlag | expected; // I want to lock the other writers + // Note: we use NoBarrier here because we'll do the Acquire barrier down below + // in WaitPendingReaders + cur_state = base::subtle::NoBarrier_CompareAndSwap(&state_, expected, try_new_state); + if (cur_state == expected) + break; + // Either was already locked by someone else, or CAS failed. + boost::detail::yield(loop_count++); + } + + WaitPendingReaders(); + +#ifndef NDEBUG + writer_tid_ = Thread::CurrentThreadId(); +#endif // NDEBUG + RecordLockHolderStack(); + } + + void unlock() { + // I expect to be the only writer + DCHECK_EQ(base::subtle::NoBarrier_Load(&state_), kWriteFlag); + +#ifndef NDEBUG + writer_tid_ = -1; // Invalid tid. +#endif // NDEBUG + + ResetLockHolderStack(); + // Reset: no writers & no readers. + Release_Store(&state_, 0); + } + + // Return true if the lock is currently held for write by any thread. + // See simple_semaphore::is_locked() for details about where this is useful. 
+ bool is_write_locked() const { + return base::subtle::NoBarrier_Load(&state_) & kWriteFlag; + } + + // Return true if the lock is currently held, either for read or write + // by any thread. + // See simple_semaphore::is_locked() for details about where this is useful. + bool is_locked() const { + return base::subtle::NoBarrier_Load(&state_); + } + + private: + static const uint32_t kNumReadersMask = 0x7fffffff; + static const uint32_t kWriteFlag = 1 << 31; + +#ifdef RW_SEMAPHORE_TRACK_HOLDER + StackTrace writer_stack_; + void RecordLockHolderStack() { + writer_stack_.Collect(); + } + void ResetLockHolderStack() { + writer_stack_.Reset(); + } +#else + void RecordLockHolderStack() { + } + void ResetLockHolderStack() { + } +#endif + + void WaitPendingReaders() { + int loop_count = 0; + while ((base::subtle::Acquire_Load(&state_) & kNumReadersMask) > 0) { + boost::detail::yield(loop_count++); + } + } + + private: + volatile Atomic32 state_; +#ifndef NDEBUG + int64_t writer_tid_; +#endif // NDEBUG +}; + +} // namespace kudu +#endif /* KUDU_UTIL_RW_SEMAPHORE_H */ diff --git a/src/kudu/util/rwc_lock-test.cc b/src/kudu/util/rwc_lock-test.cc new file mode 100644 index 000000000000..af825d8fcbcc --- /dev/null +++ b/src/kudu/util/rwc_lock-test.cc @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/util/rwc_lock.h" +#include "kudu/util/test_util.h" +#include "kudu/util/locks.h" + +namespace kudu { + +using base::subtle::NoBarrier_Load; +using base::subtle::Release_Store; +using std::string; +using std::vector; + +class RWCLockTest : public KuduTest {}; + +// Holds counters of how many threads hold the lock in each of the +// provided modes. +struct LockHoldersCount { + LockHoldersCount() + : num_readers(0), + num_writers(0), + num_committers(0) { + } + + // Check the invariants of the lock counts. + void CheckInvariants() { + // At no time should we have more than one writer or committer. + CHECK_LE(num_writers, 1); + CHECK_LE(num_committers, 1); + + // If we have any readers, then we should not have any committers. + if (num_readers > 0) { + CHECK_EQ(num_committers, 0); + } + } + + void AdjustReaders(int delta) { + boost::lock_guard l(lock); + num_readers += delta; + CheckInvariants(); + } + + void AdjustWriters(int delta) { + boost::lock_guard l(lock); + num_writers += delta; + CheckInvariants(); + } + + void AdjustCommitters(int delta) { + boost::lock_guard l(lock); + num_committers += delta; + CheckInvariants(); + } + + int num_readers; + int num_writers; + int num_committers; + simple_spinlock lock; +}; + +struct SharedState { + LockHoldersCount counts; + RWCLock rwc_lock; + Atomic32 stop; +}; + +void ReaderThread(SharedState* state) { + while (!NoBarrier_Load(&state->stop)) { + state->rwc_lock.ReadLock(); + state->counts.AdjustReaders(1); + state->counts.AdjustReaders(-1); + state->rwc_lock.ReadUnlock(); + } +} + +void WriterThread(SharedState* state) { + string local_str; + while (!NoBarrier_Load(&state->stop)) { + state->rwc_lock.WriteLock(); + state->counts.AdjustWriters(1); + + state->rwc_lock.UpgradeToCommitLock(); + state->counts.AdjustWriters(-1); + 
state->counts.AdjustCommitters(1); + + state->counts.AdjustCommitters(-1); + state->rwc_lock.CommitUnlock(); + } +} + + +TEST_F(RWCLockTest, TestCorrectBehavior) { + SharedState state; + Release_Store(&state.stop, 0); + + vector threads; + + const int kNumWriters = 5; + const int kNumReaders = 5; + + for (int i = 0; i < kNumWriters; i++) { + threads.push_back(new boost::thread(WriterThread, &state)); + } + for (int i = 0; i < kNumReaders; i++) { + threads.push_back(new boost::thread(ReaderThread, &state)); + } + + if (AllowSlowTests()) { + SleepFor(MonoDelta::FromSeconds(1)); + } else { + SleepFor(MonoDelta::FromMilliseconds(100)); + } + + Release_Store(&state.stop, 1); + + for (boost::thread* t : threads) { + t->join(); + delete t; + } + +} + +} // namespace kudu diff --git a/src/kudu/util/rwc_lock.cc b/src/kudu/util/rwc_lock.cc new file mode 100644 index 000000000000..efe3ccc19581 --- /dev/null +++ b/src/kudu/util/rwc_lock.cc @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/rwc_lock.h" + +#include + +#ifndef NDEBUG +#include "kudu/gutil/walltime.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/env.h" +#include "kudu/util/thread.h" +#endif // NDEBUG + +namespace kudu { + +RWCLock::RWCLock() + : no_mutators_(&lock_), + no_readers_(&lock_), + reader_count_(0), +#ifdef NDEBUG + write_locked_(false) { +#else + write_locked_(false), + last_writer_tid_(0), + last_writelock_acquire_time_(0) { + last_writer_backtrace_[0] = '\0'; +#endif // NDEBUG +} + +RWCLock::~RWCLock() { + CHECK_EQ(reader_count_, 0); +} + +void RWCLock::ReadLock() { + MutexLock l(lock_); + reader_count_++; +} + +void RWCLock::ReadUnlock() { + MutexLock l(lock_); + DCHECK_GT(reader_count_, 0); + reader_count_--; + if (reader_count_ == 0) { + no_readers_.Signal(); + } +} + +bool RWCLock::HasReaders() const { + MutexLock l(lock_); + return reader_count_ > 0; +} + +bool RWCLock::HasWriteLock() const { + MutexLock l(lock_); +#ifndef NDEBUG + return last_writer_tid_ == Thread::CurrentThreadId(); +#else + return write_locked_; +#endif +} + +void RWCLock::WriteLock() { + MutexLock l(lock_); + // Wait for any other mutations to finish. + while (write_locked_) { + no_mutators_.Wait(); + } +#ifndef NDEBUG + last_writelock_acquire_time_ = GetCurrentTimeMicros(); + last_writer_tid_ = Thread::CurrentThreadId(); + HexStackTraceToString(last_writer_backtrace_, kBacktraceBufSize); +#endif // NDEBUG + write_locked_ = true; +} + +void RWCLock::WriteUnlock() { + MutexLock l(lock_); + DCHECK(write_locked_); + write_locked_ = false; +#ifndef NDEBUG + last_writer_backtrace_[0] = '\0'; +#endif // NDEBUG + no_mutators_.Signal(); +} + +void RWCLock::UpgradeToCommitLock() { + lock_.lock(); + DCHECK(write_locked_); + while (reader_count_ > 0) { + no_readers_.Wait(); + } + DCHECK(write_locked_); + + // Leaves the lock held, which prevents any new readers + // or writers. 
+} + +void RWCLock::CommitUnlock() { + DCHECK_EQ(0, reader_count_); + write_locked_ = false; +#ifndef NDEBUG + last_writer_backtrace_[0] = '\0'; +#endif // NDEBUG + no_mutators_.Broadcast(); + lock_.unlock(); +} + +} // namespace kudu diff --git a/src/kudu/util/rwc_lock.h b/src/kudu/util/rwc_lock.h new file mode 100644 index 000000000000..6d4cb7026d47 --- /dev/null +++ b/src/kudu/util/rwc_lock.h @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_RWC_LOCK_H +#define KUDU_UTIL_RWC_LOCK_H + +#include "kudu/gutil/macros.h" +#include "kudu/util/condition_variable.h" +#include "kudu/util/mutex.h" + +namespace kudu { + +// A read-write-commit lock. +// +// This lock has three modes: read, write, and commit. +// The lock compatibility matrix is as follows: +// +// Read Write Commit +// Read X X +// Write X +// Commit +// +// An 'X' indicates that the two types of locks may be +// held at the same time. +// +// In prose: +// - Multiple threads may hold the Read lock at the same time. +// - A single thread may hold the Write lock, potentially at the +// same time as any number of readers. 
+// - A single thread may hold the Commit lock, but this lock is completely +// exclusive (no concurrent readers or writers). +// +// A typical use case for this type of lock is when a structure is read often, +// occasionally updated, and the update operation can take a long time. In this +// use case, the readers simply use ReadLock() and ReadUnlock(), while the +// writer uses a copy-on-write technique like: +// +// obj->lock.WriteLock(); +// // NOTE: cannot safely mutate obj->state directly here, since readers +// // may be concurrent! So, we make a local copy to mutate. +// my_local_copy = obj->state; +// SomeLengthyMutation(my_local_copy); +// obj->lock.UpgradeToCommitLock(); +// obj->state = my_local_copy; +// obj->lock.CommitUnlock(); +// +// This is more efficient than a standard Reader-Writer lock since the lengthy +// mutation is only protected against other concurrent mutators, and readers +// may continue to run with no contention. +// +// For the common pattern described above, the 'CowObject<>' template class defined +// in cow_object.h is more convenient than manual locking. +// +// NOTE: this implementation currently does not implement any starvation protection +// or fairness. If the read lock is being constantly acquired (i.e reader count +// never drops to 0) then UpgradeToCommitLock() may block arbitrarily long. +class RWCLock { + public: + RWCLock(); + ~RWCLock(); + + // Acquire the lock in read mode. Upon return, guarantees that: + // - Other threads may concurrently hold the lock for Read. + // - Either zero or one thread may hold the lock for Write. + // - No threads hold the lock for Commit. + void ReadLock(); + void ReadUnlock(); + + // Return true if there are any readers currently holding the lock. + // Useful for debug assertions. + bool HasReaders() const; + + // Return true if the current thread holds the write lock. + // + // In DEBUG mode this is accurate -- we track the current holder's tid. 
+ // In non-DEBUG mode, this may sometimes return true even if another thread + // is in fact the holder. + // Thus, this is only really useful in the context of a DCHECK assertion. + bool HasWriteLock() const; + + // Boost-like wrappers, so boost lock guards work + void lock_shared() { ReadLock(); } + void unlock_shared() { ReadUnlock(); } + + // Acquire the lock in write mode. Upon return, guarantees that: + // - Other threads may concurrently hold the lock for Read. + // - No other threads hold the lock for Write or Commit. + void WriteLock(); + void WriteUnlock(); + + // Boost-like wrappers + void lock() { WriteLock(); } + void unlock() { WriteUnlock(); } + + // Upgrade the lock from Write mode to Commit mode. + // Requires that the current thread holds the lock in Write mode. + // Upon return, guarantees: + // - No other thread holds the lock in any mode. + void UpgradeToCommitLock(); + void CommitUnlock(); + + private: + // Lock which protects reader_count_ and write_locked_. + // Additionally, while the commit lock is held, the + // locking thread holds this mutex, which prevents any new + // threads from obtaining the lock in any mode. + mutable Mutex lock_; + ConditionVariable no_mutators_, no_readers_; + int reader_count_; + bool write_locked_; + +#ifndef NDEBUG + static const int kBacktraceBufSize = 1024; + int64_t last_writer_tid_; + int64_t last_writelock_acquire_time_; + char last_writer_backtrace_[kBacktraceBufSize]; +#endif // NDEBUG + + DISALLOW_COPY_AND_ASSIGN(RWCLock); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_RWC_LOCK_H */ diff --git a/src/kudu/util/safe_math-test.cc b/src/kudu/util/safe_math-test.cc new file mode 100644 index 000000000000..d3a81c638ee8 --- /dev/null +++ b/src/kudu/util/safe_math-test.cc @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +// Must come before gtest.h. +#include "kudu/gutil/mathlimits.h" + +#include +#include "kudu/util/safe_math.h" + +namespace kudu { +template +static void DoTest(T a, T b, bool expected) { + SCOPED_TRACE(a); + SCOPED_TRACE(b); + bool overflow = false; + T ret = AddWithOverflowCheck(a, b, &overflow); + EXPECT_EQ(overflow, expected); + if (!overflow) { + EXPECT_EQ(ret, a + b); + } +} + +TEST(TestSafeMath, TestSignedInts) { + // Overflow above max of range. + DoTest(MathLimits::kMax - 10, 15, true); + DoTest(MathLimits::kMax - 10, 10, false); + + // Underflow around negative + DoTest(MathLimits::kMin + 10, -15, true); + DoTest(MathLimits::kMin + 10, -5, false); + +} + +TEST(TestSafeMath, TestUnsignedInts) { + // Overflow above max + DoTest(MathLimits::kMax - 10, 15, true); + DoTest(MathLimits::kMax - 10, 10, false); +} + +} // namespace kudu diff --git a/src/kudu/util/safe_math.h b/src/kudu/util/safe_math.h new file mode 100644 index 000000000000..4c126dd04fc0 --- /dev/null +++ b/src/kudu/util/safe_math.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Inline functions for doing overflow-safe operations on integers. +// These should be used when doing bounds checks on user-provided data, +// for example. +// See also: https://www.securecoding.cert.org/confluence/display/cplusplus/INT32-CPP.+Ensure+that+operations+on+signed+integers+do+not+result+in+overflow +#ifndef KUDU_UTIL_SAFE_MATH_H +#define KUDU_UTIL_SAFE_MATH_H + +#include "kudu/gutil/mathlimits.h" + +namespace kudu { + +namespace safe_math_internal { + +// Template which is specialized for signed and unsigned types separately. +template +struct WithOverflowCheck { +}; + + +// Specialization for signed types. +template +struct WithOverflowCheck { + static inline Type Add(Type a, Type b, bool *overflowed) { + // Implementation from the CERT article referenced in the file header. + *overflowed = (((a > 0) && (b > 0) && (a > (MathLimits::kMax - b))) || + ((a < 0) && (b < 0) && (a < (MathLimits::kMin - b)))); + return a + b; + } +}; + +// Specialization for unsigned types. +template +struct WithOverflowCheck { + static inline Type Add(Type a, Type b, bool *overflowed) { + Type ret = a + b; + *overflowed = ret < a; + return a + b; + } +}; + +} // namespace safe_math_internal + +// Add 'a' and 'b', and set *overflowed to true if overflow occured. 
+template +inline Type AddWithOverflowCheck(Type a, Type b, bool *overflowed) { + // Pick the right specialization based on whether Type is signed. + typedef safe_math_internal::WithOverflowCheck::kIsSigned> my_struct; + return my_struct::Add(a, b, overflowed); +} + +} // namespace kudu +#endif diff --git a/src/kudu/util/semaphore.cc b/src/kudu/util/semaphore.cc new file mode 100644 index 000000000000..985da44e5f48 --- /dev/null +++ b/src/kudu/util/semaphore.cc @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/semaphore.h" + +#include +#include +#include "kudu/gutil/walltime.h" +namespace kudu { + +Semaphore::Semaphore(int capacity) { + DCHECK_GE(capacity, 0); + if (sem_init(&sem_, 0, capacity) != 0) { + Fatal("init"); + } +} + +Semaphore::~Semaphore() { + if (sem_destroy(&sem_) != 0) { + Fatal("destroy"); + } +} + +void Semaphore::Acquire() { + while (true) { + int ret = sem_wait(&sem_); + if (ret == 0) { + // TODO: would be nice to track acquisition time, etc. 
+ return; + } + + if (errno == EINTR) continue; + Fatal("wait"); + } +} + +bool Semaphore::TryAcquire() { + int ret = sem_trywait(&sem_); + if (ret == 0) { + return true; + } + if (errno == EAGAIN || errno == EINTR) { + return false; + } + Fatal("trywait"); +} + +bool Semaphore::TimedAcquire(const MonoDelta& timeout) { + int64_t microtime = GetCurrentTimeMicros(); + microtime += timeout.ToMicroseconds(); + + struct timespec abs_timeout; + MonoDelta::NanosToTimeSpec(microtime * MonoTime::kNanosecondsPerMicrosecond, + &abs_timeout); + + while (true) { + int ret = sem_timedwait(&sem_, &abs_timeout); + if (ret == 0) return true; + if (errno == ETIMEDOUT) return false; + if (errno == EINTR) continue; + Fatal("timedwait"); + } +} + +void Semaphore::Release() { + PCHECK(sem_post(&sem_) == 0); +} + +int Semaphore::GetValue() { + int val; + PCHECK(sem_getvalue(&sem_, &val) == 0); + return val; +} + +void Semaphore::Fatal(const char* action) { + PLOG(FATAL) << "Could not " << action << " semaphore " + << reinterpret_cast(&sem_); + abort(); // unnecessary, but avoids gcc complaining +} + +} // namespace kudu diff --git a/src/kudu/util/semaphore.h b/src/kudu/util/semaphore.h new file mode 100644 index 000000000000..88a1086f322b --- /dev/null +++ b/src/kudu/util/semaphore.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_SEMAPHORE_H +#define KUDU_UTIL_SEMAPHORE_H + +#include +#if defined(__APPLE__) +#include +#include "kudu/util/atomic.h" +#endif // define(__APPLE__) + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/util/monotime.h" + +namespace kudu { + +// Wrapper for POSIX semaphores. +class Semaphore { + public: + // Initialize the semaphore with the specified capacity. + explicit Semaphore(int capacity); + ~Semaphore(); + + // Acquire the semaphore. + void Acquire(); + + // Acquire the semaphore within the given timeout. Returns true if successful. + bool TimedAcquire(const MonoDelta& timeout); + + // Try to acquire the semaphore immediately. Returns false if unsuccessful. + bool TryAcquire(); + + // Release the semaphore. + void Release(); + + // Get the current value of the semaphore. + int GetValue(); + + // Boost-compatible wrappers. + void lock() { Acquire(); } + void unlock() { Release(); } + bool try_lock() { return TryAcquire(); } + + private: +#if !defined(__APPLE__) + // Log a fatal error message. Separated out to keep the main functions + // as small as possible in terms of code size. 
+ void Fatal(const char* action) ATTRIBUTE_NORETURN; +#endif // !define(__APPLE__) + +#if defined(__APPLE__) + dispatch_semaphore_t sem_; + AtomicInt count_; +#else + sem_t sem_; +#endif // define(__APPLE__) + DISALLOW_COPY_AND_ASSIGN(Semaphore); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_SEMAPHORE_H */ diff --git a/src/kudu/util/semaphore_macosx.cc b/src/kudu/util/semaphore_macosx.cc new file mode 100644 index 000000000000..7cc5f08500fc --- /dev/null +++ b/src/kudu/util/semaphore_macosx.cc @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/semaphore.h" + +#include +#include +#include "kudu/gutil/walltime.h" + +namespace kudu { + +Semaphore::Semaphore(int capacity) + : count_(capacity) { + DCHECK_GE(capacity, 0); + sem_ = dispatch_semaphore_create(capacity); + CHECK_NOTNULL(sem_); +} + +Semaphore::~Semaphore() { + dispatch_release(sem_); +} + +void Semaphore::Acquire() { + // If the timeout is DISPATCH_TIME_FOREVER, then dispatch_semaphore_wait() + // waits forever and always returns zero. 
+ CHECK(dispatch_semaphore_wait(sem_, DISPATCH_TIME_FOREVER) == 0); + count_.IncrementBy(-1); +} + +bool Semaphore::TryAcquire() { + // The dispatch_semaphore_wait() function returns zero upon success and + // non-zero after the timeout expires. + if (dispatch_semaphore_wait(sem_, DISPATCH_TIME_NOW) == 0) { + count_.IncrementBy(-1); + return true; + } + return false; +} + +bool Semaphore::TimedAcquire(const MonoDelta& timeout) { + dispatch_time_t t = dispatch_time(DISPATCH_TIME_NOW, timeout.ToNanoseconds()); + if (dispatch_semaphore_wait(sem_, t) == 0) { + count_.IncrementBy(-1); + return true; + } + return false; +} + +void Semaphore::Release() { + dispatch_semaphore_signal(sem_); + count_.IncrementBy(1); +} + +int Semaphore::GetValue() { + return count_.Load(); +} + +} // namespace kudu diff --git a/src/kudu/util/slice-test.cc b/src/kudu/util/slice-test.cc new file mode 100644 index 000000000000..b0d3bef9b943 --- /dev/null +++ b/src/kudu/util/slice-test.cc @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/slice.h" + +#include + +#include "kudu/gutil/map-util.h" + +using std::string; + +namespace kudu { + +typedef SliceMap::type MySliceMap; + +TEST(SliceTest, TestSliceMap) { + MySliceMap my_map; + Slice a("a"); + Slice b("b"); + Slice c("c"); + + // Insertion is deliberately out-of-order; the map should restore order. + InsertOrDie(&my_map, c, 3); + InsertOrDie(&my_map, a, 1); + InsertOrDie(&my_map, b, 2); + + int expectedValue = 0; + for (const MySliceMap::value_type& pair : my_map) { + int data = 'a' + expectedValue++; + ASSERT_EQ(Slice(reinterpret_cast(&data), 1), pair.first); + ASSERT_EQ(expectedValue, pair.second); + } + + expectedValue = 0; + for (auto iter = my_map.begin(); iter != my_map.end(); iter++) { + int data = 'a' + expectedValue++; + ASSERT_EQ(Slice(reinterpret_cast(&data), 1), iter->first); + ASSERT_EQ(expectedValue, iter->second); + } +} + +} // namespace kudu diff --git a/src/kudu/util/slice.cc b/src/kudu/util/slice.cc new file mode 100644 index 000000000000..716bd4bdb72e --- /dev/null +++ b/src/kudu/util/slice.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/slice.h" + +#include "kudu/gutil/stringprintf.h" +#include "kudu/util/status.h" + +namespace kudu { + +Status Slice::check_size(size_t expected_size) const { + if (PREDICT_FALSE(size() != expected_size)) { + return Status::Corruption(StringPrintf("Unexpected Slice size. " + "Expected %zu but got %zu.", expected_size, size()), ToDebugString(100)); + } + return Status::OK(); +} + +// Return a string that contains the copy of the referenced data. +std::string Slice::ToString() const { + return std::string(reinterpret_cast(data_), size_); +} + +std::string Slice::ToDebugString(size_t max_len) const { + size_t bytes_to_print = size_; + bool abbreviated = false; + if (max_len != 0 && bytes_to_print > max_len) { + bytes_to_print = max_len; + abbreviated = true; + } + + int size = 0; + for (int i = 0; i < bytes_to_print; i++) { + if (!isgraph(data_[i])) { + size += 4; + } else { + size++; + } + } + if (abbreviated) { + size += 20; // extra padding + } + + std::string ret; + ret.reserve(size); + for (int i = 0; i < bytes_to_print; i++) { + if (!isgraph(data_[i])) { + StringAppendF(&ret, "\\x%02x", data_[i] & 0xff); + } else { + ret.push_back(data_[i]); + } + } + if (abbreviated) { + StringAppendF(&ret, "...<%zd bytes total>", size_); + } + return ret; +} + +} // namespace kudu diff --git a/src/kudu/util/slice.h b/src/kudu/util/slice.h new file mode 100644 index 000000000000..609fa6e5db99 --- /dev/null +++ b/src/kudu/util/slice.h @@ -0,0 +1,214 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Slice is a simple structure containing a pointer into some external +// storage and a size. The user of a Slice must ensure that the slice +// is not used after the corresponding external storage has been +// deallocated. 
+// +// Multiple threads can invoke const methods on a Slice without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Slice must use +// external synchronization. +// +// Slices can be built around faststrings and StringPieces using constructors +// with implicit casts. Both StringPieces and faststrings depend on a great +// deal of gutil code, so these constructors are conditionalized on +// KUDU_HEADERS_USE_RICH_SLICE. Likewise, KUDU_HEADERS_USE_RICH_SLICE controls +// whether to use gutil-based memeq/memcmp substitutes; if it is unset, Slice +// will fall back to standard memcmp. + +#ifndef KUDU_UTIL_SLICE_H_ +#define KUDU_UTIL_SLICE_H_ + +#include +#include +#include +#include +#include +#include + +#ifdef KUDU_HEADERS_USE_RICH_SLICE +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/util/faststring.h" +#endif +#include "kudu/util/kudu_export.h" + +namespace kudu { + +class Status; + +class KUDU_EXPORT Slice { + public: + // Create an empty slice. + Slice() : data_(reinterpret_cast("")), + size_(0) { } + + // Create a slice that refers to d[0,n-1]. + Slice(const uint8_t* d, size_t n) : data_(d), size_(n) { } + + // Create a slice that refers to d[0,n-1]. + Slice(const char* d, size_t n) : + data_(reinterpret_cast(d)), + size_(n) { } + + // Create a slice that refers to the contents of "s" + Slice(const std::string& s) : // NOLINT(runtime/explicit) + data_(reinterpret_cast(s.data())), + size_(s.size()) { } + + // Create a slice that refers to s[0,strlen(s)-1] + Slice(const char* s) : // NOLINT(runtime/explicit) + data_(reinterpret_cast(s)), + size_(strlen(s)) { } + +#ifdef KUDU_HEADERS_USE_RICH_SLICE + // Create a slice that refers to the contents of the faststring. + // Note that further appends to the faststring may invalidate this slice. 
+ Slice(const faststring &s) // NOLINT(runtime/explicit) + : data_(s.data()), + size_(s.size()) { + } + + Slice(const StringPiece& s) // NOLINT(runtime/explicit) + : data_(reinterpret_cast(s.data())), + size_(s.size()) { + } +#endif + + // Return a pointer to the beginning of the referenced data + const uint8_t* data() const { return data_; } + + // Return a mutable pointer to the beginning of the referenced data. + uint8_t *mutable_data() { return const_cast(data_); } + + // Return the length (in bytes) of the referenced data + size_t size() const { return size_; } + + // Return true iff the length of the referenced data is zero + bool empty() const { return size_ == 0; } + + // Return the ith byte in the referenced data. + // REQUIRES: n < size() + const uint8_t &operator[](size_t n) const { + assert(n < size()); + return data_[n]; + } + + // Change this slice to refer to an empty array + void clear() { + data_ = reinterpret_cast(""); + size_ = 0; + } + + // Drop the first "n" bytes from this slice. + void remove_prefix(size_t n) { + assert(n <= size()); + data_ += n; + size_ -= n; + } + + // Truncate the slice to "n" bytes + void truncate(size_t n) { + assert(n <= size()); + size_ = n; + } + + // Checks that this slice has size() = 'expected_size' and returns + // Status::Corruption() otherwise. + Status check_size(size_t expected_size) const; + + // Return a string that contains the copy of the referenced data. + std::string ToString() const; + + std::string ToDebugString(size_t max_len = 0) const; + + // Three-way comparison. Returns value: + // < 0 iff "*this" < "b", + // == 0 iff "*this" == "b", + // > 0 iff "*this" > "b" + int compare(const Slice& b) const; + + // Return true iff "x" is a prefix of "*this" + bool starts_with(const Slice& x) const { + return ((size_ >= x.size_) && + (MemEqual(data_, x.data_, x.size_))); + } + + // Comparator struct, useful for ordered collections (like STL maps). 
+ struct Comparator { + bool operator()(const Slice& a, const Slice& b) const { + return a.compare(b) < 0; + } + }; + + // Relocates this slice's data into 'd' provided this isn't already the + // case. It is assumed that 'd' is large enough to fit the data. + void relocate(uint8_t* d) { + if (data_ != d) { + memcpy(d, data_, size_); + data_ = d; + } + } + + private: + friend bool operator==(const Slice& x, const Slice& y); + + static bool MemEqual(const void* a, const void* b, size_t n) { +#ifdef KUDU_HEADERS_USE_RICH_SLICE + return strings::memeq(a, b, n); +#else + return memcmp(a, b, n) == 0; +#endif + } + + static int MemCompare(const void* a, const void* b, size_t n) { +#ifdef KUDU_HEADERS_USE_RICH_SLICE + return strings::fastmemcmp_inlined(a, b, n); +#else + return memcmp(a, b, n); +#endif + } + + const uint8_t* data_; + size_t size_; + + // Intentionally copyable +}; + +inline bool operator==(const Slice& x, const Slice& y) { + return ((x.size() == y.size()) && + (Slice::MemEqual(x.data(), y.data(), x.size()))); +} + +inline bool operator!=(const Slice& x, const Slice& y) { + return !(x == y); +} + +inline std::ostream& operator<<(std::ostream& o, const Slice& s) { + return o << s.ToDebugString(16); // should be enough for anyone... +} + +inline int Slice::compare(const Slice& b) const { + const int min_len = (size_ < b.size_) ? size_ : b.size_; + int r = MemCompare(data_, b.data_, min_len); + if (r == 0) { + if (size_ < b.size_) r = -1; + else if (size_ > b.size_) r = +1; + } + return r; +} + +// STL map whose keys are Slices. +// +// See sample usage in slice-test.cc. 
+template +struct SliceMap { + typedef std::map type; +}; + +} // namespace kudu + +#endif // KUDU_UTIL_SLICE_H_ diff --git a/src/kudu/util/spinlock_profiling-test.cc b/src/kudu/util/spinlock_profiling-test.cc new file mode 100644 index 000000000000..74227b29cdb3 --- /dev/null +++ b/src/kudu/util/spinlock_profiling-test.cc @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/gutil/spinlock.h" +#include "kudu/util/spinlock_profiling.h" +#include "kudu/util/test_util.h" +#include "kudu/util/trace.h" + +// Can't include gutil/synchronization_profiling.h directly as it'll +// declare a weak symbol directly in this unit test, which the runtime +// linker will prefer over equivalent strong symbols for some reason. By +// declaring the symbol without providing an empty definition, the strong +// symbols are chosen when provided via shared libraries. 
+// +// Further reading: +// - http://stackoverflow.com/questions/20658809/dynamic-loading-and-weak-symbol-resolution +// - http://notmysock.org/blog/php/weak-symbols-arent.html +namespace gutil { +extern void SubmitSpinLockProfileData(const void *, int64); +} // namespace gutil + +namespace kudu { + +class SpinLockProfilingTest : public KuduTest {}; + +TEST_F(SpinLockProfilingTest, TestSpinlockProfiling) { + scoped_refptr t(new Trace); + base::SpinLock lock; + { + ADOPT_TRACE(t.get()); + gutil::SubmitSpinLockProfileData(&lock, 4000000); + } + string result = t->DumpToString(true); + LOG(INFO) << "trace: " << result; + // We can't assert more specifically because the CyclesPerSecond + // on different machines might be different. + ASSERT_STR_CONTAINS(result, "Waited "); + ASSERT_STR_CONTAINS(result, "on lock "); + + ASSERT_GT(GetSpinLockContentionMicros(), 0); +} + +TEST_F(SpinLockProfilingTest, TestStackCollection) { + StartSynchronizationProfiling(); + base::SpinLock lock; + gutil::SubmitSpinLockProfileData(&lock, 12345); + StopSynchronizationProfiling(); + std::stringstream str; + int64_t dropped = 0; + FlushSynchronizationProfile(&str, &dropped); + string s = str.str(); + ASSERT_STR_CONTAINS(s, "12345\t1 @ "); + ASSERT_EQ(0, dropped); +} + +} // namespace kudu diff --git a/src/kudu/util/spinlock_profiling.cc b/src/kudu/util/spinlock_profiling.cc new file mode 100644 index 000000000000..f4a375fb6027 --- /dev/null +++ b/src/kudu/util/spinlock_profiling.cc @@ -0,0 +1,299 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/spinlock_profiling.h" + +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/basictypes.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/spinlock.h" +#include "kudu/gutil/strings/human_readable.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/flag_tags.h" +#include "kudu/util/metrics.h" +#include "kudu/util/striped64.h" +#include "kudu/util/trace.h" + +DEFINE_int32(lock_contention_trace_threshold_cycles, + 2000000, // 2M cycles should be about 1ms + "If acquiring a spinlock takes more than this number of " + "cycles, and a Trace is currently active, then the current " + "stack trace is logged to the trace buffer."); +TAG_FLAG(lock_contention_trace_threshold_cycles, hidden); + +METRIC_DEFINE_gauge_uint64(server, spinlock_contention_time, + "Spinlock Contention Time", kudu::MetricUnit::kMicroseconds, + "Amount of time consumed by contention on internal spinlocks since the server " + "started. If this increases rapidly, it may indicate a performance issue in Kudu " + "internals triggered by a particular workload and warrant investigation.", + kudu::EXPOSE_AS_COUNTER); + +using base::SpinLock; +using base::SpinLockHolder; + +namespace kudu { + +static const double kMicrosPerSecond = 1000000.0; + +static LongAdder* g_contended_cycles = nullptr; + +namespace { + +// Implements a very simple linear-probing hashtable of stack traces with +// a fixed number of entries. 
+// +// Threads experiencing contention record their stacks into this hashtable, +// or increment an already-existing entry. Each entry has its own lock, +// but we can "skip" an entry under contention, and spread out a single stack +// into multiple buckets if necessary. +// +// A thread collecting a profile collects stack traces out of the hash table +// and resets the counts to 0 as they are collected. +class ContentionStacks { + public: + ContentionStacks() + : dropped_samples_(0) { + } + + // Add a stack trace to the table. + void AddStack(const StackTrace& s, int64_t cycles); + + // Flush stacks from the buffer to 'out'. See the docs for FlushSynchronizationProfile() + // in spinlock_profiling.h for details on format. + // + // On return, guarantees that any stack traces that were present at the beginning of + // the call have been flushed. However, new stacks can be added concurrently with this call. + void Flush(std::stringstream* out, int64_t* dropped); + + private: + + // Collect the next sample from the underlying buffer, and set it back to 0 count + // (thus marking it as "empty"). + // + // 'iterator' serves as a way to keep track of the current position in the buffer. + // Callers should initially set it to 0, and then pass the same pointer to each + // call to CollectSample. This serves to loop through the collected samples. + bool CollectSample(uint64_t* iterator, StackTrace* s, int64_t* trip_count, int64_t* cycles); + + // Hashtable entry. + struct Entry { + Entry() : trip_count(0), + cycle_count(0) { + } + + // Protects all other entries. + SpinLock lock; + + // The number of times we've experienced contention with a stack trace equal + // to 'trace'. + // + // If this is 0, then the entry is "unclaimed" and the other fields are not + // considered valid. + int64_t trip_count; + + // The total number of cycles spent waiting at this stack trace. + int64_t cycle_count; + + // A cached hashcode of the trace. 
+ uint64_t hash; + + // The actual stack trace. + StackTrace trace; + }; + + enum { + kNumEntries = 1024, + kNumLinearProbeAttempts = 4 + }; + Entry entries_[kNumEntries]; + + // The number of samples which were dropped due to contention on this structure or + // due to the hashtable being too full. + AtomicInt dropped_samples_; +}; + +Atomic32 g_profiling_enabled = 0; +ContentionStacks* g_contention_stacks = nullptr; + +void ContentionStacks::AddStack(const StackTrace& s, int64_t cycles) { + uint64_t hash = s.HashCode(); + + // Linear probe up to 4 attempts before giving up + for (int i = 0; i < kNumLinearProbeAttempts; i++) { + Entry* e = &entries_[(hash + i) % kNumEntries]; + if (!e->lock.TryLock()) { + // If we fail to lock it, we can safely just use a different slot. + // It's OK if a single stack shows up multiple times, because pprof + // aggregates them in the end anyway. + continue; + } + + if (e->trip_count == 0) { + // It's an un-claimed slot. Claim it. + e->hash = hash; + e->trace.CopyFrom(s); + } else if (e->hash != hash || !e->trace.Equals(s)) { + // It's claimed by a different stack trace. + e->lock.Unlock(); + continue; + } + + // Contribute to the stats for this stack. + e->cycle_count += cycles; + e->trip_count++; + e->lock.Unlock(); + return; + } + + // If we failed to find a matching hashtable slot, or we hit lock contention + // trying to record our sample, add it to the dropped sample count. 
+ dropped_samples_.Increment(); +} + +void ContentionStacks::Flush(std::stringstream* out, int64_t* dropped) { + uint64_t iterator = 0; + StackTrace t; + int64_t cycles; + int64_t count; + while (g_contention_stacks->CollectSample(&iterator, &t, &count, &cycles)) { + *out << cycles << "\t" << count + << " @ " << t.ToHexString(StackTrace::NO_FIX_CALLER_ADDRESSES) + << std::endl; + } + + *dropped += dropped_samples_.Exchange(0); +} + +bool ContentionStacks::CollectSample(uint64_t* iterator, StackTrace* s, int64_t* trip_count, + int64_t* cycles) { + while (*iterator < kNumEntries) { + Entry* e = &entries_[(*iterator)++]; + SpinLockHolder l(&e->lock); + if (e->trip_count == 0) continue; + + *trip_count = e->trip_count; + *cycles = e->cycle_count; + s->CopyFrom(e->trace); + + e->trip_count = 0; + e->cycle_count = 0; + return true; + } + + // Looped through the whole array and found nothing. + return false; +} + + +void SubmitSpinLockProfileData(const void *contendedlock, int64 wait_cycles) { + bool profiling_enabled = base::subtle::Acquire_Load(&g_profiling_enabled); + bool long_wait_time = wait_cycles > FLAGS_lock_contention_trace_threshold_cycles; + // Short circuit this function quickly in the common case. + if (PREDICT_TRUE(!profiling_enabled && !long_wait_time)) { + return; + } + + static __thread bool in_func = false; + if (in_func) return; // non-re-entrant + in_func = true; + + StackTrace stack; + stack.Collect(); + + if (profiling_enabled) { + DCHECK_NOTNULL(g_contention_stacks)->AddStack(stack, wait_cycles); + } + + if (PREDICT_FALSE(long_wait_time)) { + Trace* t = Trace::CurrentTrace(); + if (t) { + double seconds = static_cast(wait_cycles) / base::CyclesPerSecond(); + char backtrace_buffer[1024]; + stack.StringifyToHex(backtrace_buffer, arraysize(backtrace_buffer)); + TRACE_TO(t, "Waited $0 on lock $1. 
stack: $2", + HumanReadableElapsedTime::ToShortString(seconds), contendedlock, + backtrace_buffer); + } + } + + LongAdder* la = reinterpret_cast( + base::subtle::Acquire_Load(reinterpret_cast(&g_contended_cycles))); + if (la) { + la->IncrementBy(wait_cycles); + } + + in_func = false; +} + +void DoInit() { + base::subtle::Release_Store(reinterpret_cast(&g_contention_stacks), + reinterpret_cast(new ContentionStacks())); + base::subtle::Release_Store(reinterpret_cast(&g_contended_cycles), + reinterpret_cast(new LongAdder())); +} + +} // anonymous namespace + +void InitSpinLockContentionProfiling() { + static GoogleOnceType once = GOOGLE_ONCE_INIT; + GoogleOnceInit(&once, DoInit); +} + + +void RegisterSpinLockContentionMetrics(const scoped_refptr& entity) { + InitSpinLockContentionProfiling(); + entity->NeverRetire( + METRIC_spinlock_contention_time.InstantiateFunctionGauge( + entity, Bind(&GetSpinLockContentionMicros))); + +} + +uint64_t GetSpinLockContentionMicros() { + int64_t wait_cycles = DCHECK_NOTNULL(g_contended_cycles)->Value(); + double micros = static_cast(wait_cycles) / base::CyclesPerSecond() + * kMicrosPerSecond; + return implicit_cast(micros); +} + +void StartSynchronizationProfiling() { + InitSpinLockContentionProfiling(); + base::subtle::Barrier_AtomicIncrement(&g_profiling_enabled, 1); +} + +void FlushSynchronizationProfile(std::stringstream* out, + int64_t* drop_count) { + CHECK_NOTNULL(g_contention_stacks)->Flush(out, drop_count); +} + +void StopSynchronizationProfiling() { + InitSpinLockContentionProfiling(); + CHECK_GE(base::subtle::Barrier_AtomicIncrement(&g_profiling_enabled, -1), 0); +} + +} // namespace kudu + +// The hook expected by gutil is in the gutil namespace. Simply forward into the +// kudu namespace so we don't need to qualify everything. 
+namespace gutil { +void SubmitSpinLockProfileData(const void *contendedlock, int64 wait_cycles) { + kudu::SubmitSpinLockProfileData(contendedlock, wait_cycles); +} +} // namespace gutil diff --git a/src/kudu/util/spinlock_profiling.h b/src/kudu/util/spinlock_profiling.h new file mode 100644 index 000000000000..15d484d0cbe9 --- /dev/null +++ b/src/kudu/util/spinlock_profiling.h @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_SPINLOCK_PROFILING_H +#define KUDU_UTIL_SPINLOCK_PROFILING_H + +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/ref_counted.h" + +namespace kudu { + +class MetricEntity; + +// Enable instrumentation of spinlock contention. +// +// Calling this method currently does nothing, except for ensuring +// that the spinlock_profiling.cc object file gets linked into your +// executable. It needs to be somewhere reachable in your code, +// just so that gcc doesn't omit the underlying module from the binary. +void InitSpinLockContentionProfiling(); + +// Return the total number of microseconds spent in spinlock contention +// since the server started. 
+uint64_t GetSpinLockContentionMicros(); + +// Register metrics in the given server entity which measure the amount of +// spinlock contention. +void RegisterSpinLockContentionMetrics(const scoped_refptr& entity); + +// Enable process-wide synchronization profiling. +// +// While profiling is enabled, spinlock contention will be recorded in a buffer. +// The caller should periodically call FlushSynchronizationProfile() to empty +// the buffer, or else profiles may be dropped. +void StartSynchronizationProfiling(); + +// Flush the current buffer of contention profile samples to the given stream. +// +// Each stack trace that has been observed results in at least one line of the +// following format: +// @ +// +// Flushing the data also clears the current buffer of trace samples. +// This may be called while synchronization profiling is enabled or after it has +// been disabled. +// +// *dropped_samples will be incremented by the number of samples which were dropped +// due to the contention buffer overflowing. If profiling is enabled during this +// call, then the 'drop_count' may be slightly out-of-date with respect to the +// returned samples. +void FlushSynchronizationProfile(std::stringstream* out, int64_t* drop_count); + +// Stop collecting contention profiles. +void StopSynchronizationProfiling(); + +} // namespace kudu +#endif /* KUDU_UTIL_SPINLOCK_PROFILING_H */ diff --git a/src/kudu/util/stack_watchdog-test.cc b/src/kudu/util/stack_watchdog-test.cc new file mode 100644 index 000000000000..a7cb06d9c04a --- /dev/null +++ b/src/kudu/util/stack_watchdog-test.cc @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/kernel_stack_watchdog.h" + +#include +#include +#include + +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::vector; +using strings::Substitute; + +DECLARE_int32(hung_task_check_interval_ms); + +namespace kudu { + +class StackWatchdogTest : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + KernelStackWatchdog::GetInstance()->SaveLogsForTests(true); + ANNOTATE_BENIGN_RACE(&FLAGS_hung_task_check_interval_ms, + "Integer flag change should be safe"); + FLAGS_hung_task_check_interval_ms = 10; + } +}; + +// The KernelStackWatchdog is only enabled on Linux, since we can't get kernel +// stack traces on other platforms. +#if defined(__linux__) +TEST_F(StackWatchdogTest, TestWatchdog) { + vector log; + { + SCOPED_WATCH_STACK(20); + for (int i = 0; i < 50; i++) { + SleepFor(MonoDelta::FromMilliseconds(100)); + log = KernelStackWatchdog::GetInstance()->LoggedMessagesForTests(); + // Wait for several samples, since it's possible that we get unlucky + // and the watchdog sees us just before or after a sleep. + if (log.size() > 5) { + break; + } + } + } + string s = JoinStrings(log, "\n"); + ASSERT_STR_CONTAINS(s, "TestWatchdog_Test::TestBody()"); + ASSERT_STR_CONTAINS(s, "nanosleep"); +} +#endif + +// Test that SCOPED_WATCH_STACK scopes can be nested. 
+TEST_F(StackWatchdogTest, TestNestedScopes) { + vector log; + int line1; + int line2; + { + SCOPED_WATCH_STACK(20); line1 = __LINE__; + { + SCOPED_WATCH_STACK(20); line2 = __LINE__; + for (int i = 0; i < 50; i++) { + SleepFor(MonoDelta::FromMilliseconds(100)); + log = KernelStackWatchdog::GetInstance()->LoggedMessagesForTests(); + if (log.size() > 3) { + break; + } + } + } + } + + // Verify that both nested scopes were collected. + string s = JoinStrings(log, "\n"); + ASSERT_STR_CONTAINS(s, Substitute("stack_watchdog-test.cc:$0", line1)); + ASSERT_STR_CONTAINS(s, Substitute("stack_watchdog-test.cc:$0", line2)); +} + +TEST_F(StackWatchdogTest, TestPerformance) { + // Reset the check interval to be reasonable. Otherwise the benchmark + // wastes a lot of CPU running the watchdog thread too often. + FLAGS_hung_task_check_interval_ms = 500; + LOG_TIMING(INFO, "1M SCOPED_WATCH_STACK()s") { + for (int i = 0; i < 1000000; i++) { + SCOPED_WATCH_STACK(100); + } + } +} +} // namespace kudu diff --git a/src/kudu/util/status-test.cc b/src/kudu/util/status-test.cc new file mode 100644 index 000000000000..afbecd2c8a3c --- /dev/null +++ b/src/kudu/util/status-test.cc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include + +#include +#include +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" + +using std::string; + +namespace kudu { + +TEST(StatusTest, TestPosixCode) { + Status ok = Status::OK(); + ASSERT_EQ(0, ok.posix_code()); + Status file_error = Status::IOError("file error", Slice(), ENOTDIR); + ASSERT_EQ(ENOTDIR, file_error.posix_code()); +} + +TEST(StatusTest, TestToString) { + Status file_error = Status::IOError("file error", Slice(), ENOTDIR); + ASSERT_EQ(string("IO error: file error (error 20)"), file_error.ToString()); +} + +TEST(StatusTest, TestClonePrepend) { + Status file_error = Status::IOError("file error", "msg2", ENOTDIR); + Status appended = file_error.CloneAndPrepend("Heading"); + ASSERT_EQ(string("IO error: Heading: file error: msg2 (error 20)"), appended.ToString()); +} + +TEST(StatusTest, TestCloneAppend) { + Status remote_error = Status::RemoteError("Application error"); + Status appended = remote_error.CloneAndAppend(Status::NotFound("Unknown tablet").ToString()); + ASSERT_EQ(string("Remote error: Application error: Not found: Unknown tablet"), + appended.ToString()); +} + +TEST(StatusTest, TestMemoryUsage) { + ASSERT_EQ(0, Status::OK().memory_footprint_excluding_this()); + ASSERT_GT(Status::IOError( + "file error", "some other thing", ENOTDIR).memory_footprint_excluding_this(), 0); +} + +TEST(StatusTest, TestMoveConstructor) { + // OK->OK move should do nothing. + { + Status src = Status::OK(); + Status dst = std::move(src); + ASSERT_OK(src); + ASSERT_OK(dst); + } + + // Moving a not-OK status into a new one should make the moved status + // "OK". 
+ { + Status src = Status::NotFound("foo"); + Status dst = std::move(src); + ASSERT_OK(src); + ASSERT_EQ("Not found: foo", dst.ToString()); + } +} + +TEST(StatusTest, TestMoveAssignment) { + // OK->Bad move should clear the source status and also make the + // destination status OK. + { + Status src = Status::OK(); + Status dst = Status::NotFound("orig dst"); + dst = std::move(src); + ASSERT_OK(src); + ASSERT_OK(dst); + } + + // Bad->Bad move. + { + Status src = Status::NotFound("orig src"); + Status dst = Status::NotFound("orig dst"); + dst = std::move(src); + ASSERT_OK(src); + ASSERT_EQ("Not found: orig src", dst.ToString()); + } + + // Bad->OK move + { + Status src = Status::NotFound("orig src"); + Status dst = Status::OK(); + dst = std::move(src); + ASSERT_OK(src); + ASSERT_EQ("Not found: orig src", dst.ToString()); + } +} + + +} // namespace kudu diff --git a/src/kudu/util/status.cc b/src/kudu/util/status.cc new file mode 100644 index 000000000000..d13eeea43f52 --- /dev/null +++ b/src/kudu/util/status.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "kudu/util/status.h" + +#include +#include + +#include "kudu/gutil/strings/fastmem.h" +#include "kudu/util/malloc.h" + +namespace kudu { + +const char* Status::CopyState(const char* state) { + uint32_t size; + strings::memcpy_inlined(&size, state, sizeof(size)); + auto result = new char[size + 7]; + strings::memcpy_inlined(result, state, size + 7); + return result; +} + +Status::Status(Code code, const Slice& msg, const Slice& msg2, + int16_t posix_code) { + assert(code != kOk); + const uint32_t len1 = msg.size(); + const uint32_t len2 = msg2.size(); + const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); + auto result = new char[size + 7]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast(code); + memcpy(result + 5, &posix_code, sizeof(posix_code)); + memcpy(result + 7, msg.data(), len1); + if (len2) { + result[7 + len1] = ':'; + result[8 + len1] = ' '; + memcpy(result + 9 + len1, msg2.data(), len2); + } + state_ = result; +} + +std::string Status::CodeAsString() const { + if (state_ == nullptr) { + return "OK"; + } + + const char* type; + switch (code()) { + case kOk: + type = "OK"; + break; + case kNotFound: + type = "Not found"; + break; + case kCorruption: + type = "Corruption"; + break; + case kNotSupported: + type = "Not implemented"; + break; + case kInvalidArgument: + type = "Invalid argument"; + break; + case kIOError: + type = "IO error"; + break; + case kAlreadyPresent: + type = "Already present"; + break; + case kRuntimeError: + type = "Runtime error"; + break; + case kNetworkError: + type = "Network error"; + break; + case kIllegalState: + type = "Illegal state"; + break; + case kNotAuthorized: + type = "Not authorized"; + break; + case kAborted: + type = "Aborted"; + break; + case kRemoteError: + type = "Remote error"; + break; + case kServiceUnavailable: + type = "Service unavailable"; + break; + case kTimedOut: + type = "Timed out"; + break; + case kUninitialized: + type = "Uninitialized"; + break; + case kConfigurationError: + type = "Configuration error"; + break; + case kIncomplete: + type = "Incomplete"; + break; + case kEndOfFile: + type = "End of file"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == nullptr) { + return result; + } + + result.append(": "); + Slice msg = message(); + result.append(reinterpret_cast(msg.data()), msg.size()); + int16_t posix = posix_code(); + if (posix != -1) { + char buf[64]; + snprintf(buf, sizeof(buf), " (error %d)", posix); + result.append(buf); + } + return result; +} + +Slice 
Status::message() const { + if (state_ == nullptr) { + return Slice(); + } + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + return Slice(state_ + 7, length); +} + +int16_t Status::posix_code() const { + if (state_ == nullptr) { + return 0; + } + int16_t posix_code; + memcpy(&posix_code, state_ + 5, sizeof(posix_code)); + return posix_code; +} + +Status Status::CloneAndPrepend(const Slice& msg) const { + return Status(code(), msg, message(), posix_code()); +} + +Status Status::CloneAndAppend(const Slice& msg) const { + return Status(code(), message(), msg, posix_code()); +} + +size_t Status::memory_footprint_excluding_this() const { + return state_ ? kudu_malloc_usable_size(state_) : 0; +} + +size_t Status::memory_footprint_including_this() const { + return kudu_malloc_usable_size(this) + memory_footprint_excluding_this(); +} +} // namespace kudu diff --git a/src/kudu/util/status.h b/src/kudu/util/status.h new file mode 100644 index 000000000000..5003be3d5cec --- /dev/null +++ b/src/kudu/util/status.h @@ -0,0 +1,359 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#ifndef KUDU_UTIL_STATUS_H_ +#define KUDU_UTIL_STATUS_H_ + +#include +#include + +#ifdef KUDU_HEADERS_NO_STUBS +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#else +#include "kudu/client/stubs.h" +#endif + +#include "kudu/util/kudu_export.h" +#include "kudu/util/slice.h" + +// Return the given status if it is not OK. 
+#define KUDU_RETURN_NOT_OK(s) do { \ + ::kudu::Status _s = (s); \ + if (PREDICT_FALSE(!_s.ok())) return _s; \ + } while (0); + +// Return the given status if it is not OK, but first clone it and +// prepend the given message. +#define KUDU_RETURN_NOT_OK_PREPEND(s, msg) do { \ + ::kudu::Status _s = (s); \ + if (PREDICT_FALSE(!_s.ok())) return _s.CloneAndPrepend(msg); \ + } while (0); + +// Return 'to_return' if 'to_call' returns a bad status. +// The substitution for 'to_return' may reference the variable +// 's' for the bad status. +#define KUDU_RETURN_NOT_OK_RET(to_call, to_return) do { \ + ::kudu::Status s = (to_call); \ + if (PREDICT_FALSE(!s.ok())) return (to_return); \ + } while (0); + +// Emit a warning if 'to_call' returns a bad status. +#define KUDU_WARN_NOT_OK(to_call, warning_prefix) do { \ + ::kudu::Status _s = (to_call); \ + if (PREDICT_FALSE(!_s.ok())) { \ + KUDU_LOG(WARNING) << (warning_prefix) << ": " << _s.ToString(); \ + } \ + } while (0); + +// Log the given status and return immediately. +#define KUDU_LOG_AND_RETURN(level, status) do { \ + ::kudu::Status _s = (status); \ + KUDU_LOG(level) << _s.ToString(); \ + return _s; \ + } while (0); + +// If 'to_call' returns a bad status, CHECK immediately with a logged message +// of 'msg' followed by the status. +#define KUDU_CHECK_OK_PREPEND(to_call, msg) do { \ + ::kudu::Status _s = (to_call); \ + KUDU_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \ + } while (0); + +// If the status is bad, CHECK immediately, appending the status to the +// logged message. +#define KUDU_CHECK_OK(s) KUDU_CHECK_OK_PREPEND(s, "Bad status") + +// This header is used in both the Kudu build as well as in builds of +// applications that use the Kudu C++ client. In the latter we need to be +// careful to "namespace" our macros, to avoid colliding or overriding with +// similarly named macros belonging to the application. +// +// KUDU_HEADERS_USE_SHORT_STATUS_MACROS handles this behavioral change. 
When +// defined, we're building Kudu and: +// 1. Non-namespaced macros are allowed and mapped to the namespaced versions +// defined above. +// 2. Namespaced versions of glog macros are mapped to the real glog macros +// (otherwise the macros are defined in the C++ client stubs). +#ifdef KUDU_HEADERS_USE_SHORT_STATUS_MACROS +#define RETURN_NOT_OK KUDU_RETURN_NOT_OK +#define RETURN_NOT_OK_PREPEND KUDU_RETURN_NOT_OK_PREPEND +#define RETURN_NOT_OK_RET KUDU_RETURN_NOT_OK_RET +#define WARN_NOT_OK KUDU_WARN_NOT_OK +#define LOG_AND_RETURN KUDU_LOG_AND_RETURN +#define CHECK_OK_PREPEND KUDU_CHECK_OK_PREPEND +#define CHECK_OK KUDU_CHECK_OK + +// These are standard glog macros. +#define KUDU_LOG LOG +#define KUDU_CHECK CHECK +#endif + +namespace kudu { + +class KUDU_EXPORT Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + +#if __cplusplus >= 201103L + // Move the specified status. + Status(Status&& s); + void operator=(Status&& s); +#endif + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. 
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kNotFound, msg, msg2, posix_code); + } + static Status Corruption(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kCorruption, msg, msg2, posix_code); + } + static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kNotSupported, msg, msg2, posix_code); + } + static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kInvalidArgument, msg, msg2, posix_code); + } + static Status IOError(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kIOError, msg, msg2, posix_code); + } + static Status AlreadyPresent(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kAlreadyPresent, msg, msg2, posix_code); + } + static Status RuntimeError(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kRuntimeError, msg, msg2, posix_code); + } + static Status NetworkError(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kNetworkError, msg, msg2, posix_code); + } + static Status IllegalState(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kIllegalState, msg, msg2, posix_code); + } + static Status NotAuthorized(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kNotAuthorized, msg, msg2, posix_code); + } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kAborted, msg, msg2, posix_code); + } + static Status RemoteError(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kRemoteError, msg, msg2, posix_code); + } + static Status ServiceUnavailable(const Slice& msg, 
const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kServiceUnavailable, msg, msg2, posix_code); + } + static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kTimedOut, msg, msg2, posix_code); + } + static Status Uninitialized(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kUninitialized, msg, msg2, posix_code); + } + static Status ConfigurationError(const Slice& msg, const Slice& msg2 = Slice(), + int16_t posix_code = -1) { + return Status(kConfigurationError, msg, msg2, posix_code); + } + static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice(), + int64_t posix_code = -1) { + return Status(kIncomplete, msg, msg2, posix_code); + } + static Status EndOfFile(const Slice& msg, const Slice& msg2 = Slice(), + int64_t posix_code = -1) { + return Status(kEndOfFile, msg, msg2, posix_code); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + // Returns true iff the status indicates a NotFound error. + bool IsNotFound() const { return code() == kNotFound; } + + // Returns true iff the status indicates a Corruption error. + bool IsCorruption() const { return code() == kCorruption; } + + // Returns true iff the status indicates a NotSupported error. + bool IsNotSupported() const { return code() == kNotSupported; } + + // Returns true iff the status indicates an IOError. + bool IsIOError() const { return code() == kIOError; } + + // Returns true iff the status indicates an InvalidArgument error + bool IsInvalidArgument() const { return code() == kInvalidArgument; } + + // Returns true iff the status indicates an AlreadyPresent error + bool IsAlreadyPresent() const { return code() == kAlreadyPresent; } + + // Returns true iff the status indicates a RuntimeError. 
+ bool IsRuntimeError() const { return code() == kRuntimeError; } + + // Returns true iff the status indicates a NetworkError. + bool IsNetworkError() const { return code() == kNetworkError; } + + // Returns true iff the status indicates a IllegalState. + bool IsIllegalState() const { return code() == kIllegalState; } + + // Returns true iff the status indicates a NotAuthorized. + bool IsNotAuthorized() const { return code() == kNotAuthorized; } + + // Returns true iff the status indicates Aborted. + bool IsAborted() const { return code() == kAborted; } + + // Returns true iff the status indicates RemoteError. + bool IsRemoteError() const { return code() == kRemoteError; } + + // Returns true iff the status indicates ServiceUnavailable. + bool IsServiceUnavailable() const { return code() == kServiceUnavailable; } + + // Returns true iff the status indicates TimedOut. + bool IsTimedOut() const { return code() == kTimedOut; } + + // Returns true iff the status indicates Uninitialized. + bool IsUninitialized() const { return code() == kUninitialized; } + + // Returns true iff the status indicates Configuration error. + bool IsConfigurationError() const { return code() == kConfigurationError; } + + // Returns true iff the status indicates Incomplete. + bool IsIncomplete() const { return code() == kIncomplete; } + + // Returns true iff the status indicates end of file. + bool IsEndOfFile() const { return code() == kEndOfFile; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + // Return a string representation of the status code, without the message + // text or posix code information. + std::string CodeAsString() const; + + // Return the message portion of the Status. This is similar to ToString, + // except that it does not include the stringified error code or posix code. + // + // For OK statuses, this returns an empty string. 
+ // + // The returned Slice is only valid as long as this Status object remains + // live and unchanged. + Slice message() const; + + // Get the POSIX code associated with this Status, or -1 if there is none. + int16_t posix_code() const; + + // Return a new Status object with the same state plus an additional leading message. + Status CloneAndPrepend(const Slice& msg) const; + + // Same as CloneAndPrepend, but appends to the message instead. + Status CloneAndAppend(const Slice& msg) const; + + // Returns the memory usage of this object without the object itself. Should + // be used when embedded inside another object. + size_t memory_footprint_excluding_this() const; + + // Returns the memory usage of this object including the object itself. + // Should be used when allocated on the heap. + size_t memory_footprint_including_this() const; + + private: + // OK status has a NULL state_. Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..6] == posix_code + // state_[7..] == message + const char* state_; + + enum Code { + kOk = 0, + kNotFound = 1, + kCorruption = 2, + kNotSupported = 3, + kInvalidArgument = 4, + kIOError = 5, + kAlreadyPresent = 6, + kRuntimeError = 7, + kNetworkError = 8, + kIllegalState = 9, + kNotAuthorized = 10, + kAborted = 11, + kRemoteError = 12, + kServiceUnavailable = 13, + kTimedOut = 14, + kUninitialized = 15, + kConfigurationError = 16, + kIncomplete = 17, + kEndOfFile = 18, + // NOTE: Remember to duplicate these constants into wire_protocol.proto and + // and to add StatusTo/FromPB ser/deser cases in wire_protocol.cc ! + // + // TODO: Move error codes into an error_code.proto or something similar. + }; + COMPILE_ASSERT(sizeof(Code) == 4, code_enum_size_is_part_of_abi); + + Code code() const { + return (state_ == NULL) ? 
kOk : static_cast(state_[4]); + } + + Status(Code code, const Slice& msg, const Slice& msg2, int16_t posix_code); + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); +} +inline void Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); + } +} + +#if __cplusplus >= 201103L +inline Status::Status(Status&& s) : state_(s.state_) { + s.state_ = nullptr; +} + +inline void Status::operator=(Status&& s) { + if (state_ != s.state_) { + delete[] state_; + state_ = s.state_; + s.state_ = nullptr; + } +} +#endif + +} // namespace kudu + +#endif // KUDU_UTIL_STATUS_H_ diff --git a/src/kudu/util/status_callback.cc b/src/kudu/util/status_callback.cc new file mode 100644 index 000000000000..0bb11a076f9d --- /dev/null +++ b/src/kudu/util/status_callback.cc @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/status.h" +#include "kudu/util/status_callback.h" + +namespace kudu { + +void DoNothingStatusCB(const Status& status) {} + +Status DoNothingStatusClosure() { return Status::OK(); } + +} // end namespace kudu diff --git a/src/kudu/util/status_callback.h b/src/kudu/util/status_callback.h new file mode 100644 index 000000000000..83b9d904fd0d --- /dev/null +++ b/src/kudu/util/status_callback.h @@ -0,0 +1,43 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_STATUS_CALLBACK_H +#define KUDU_UTIL_STATUS_CALLBACK_H + +#include "kudu/gutil/callback_forward.h" + +namespace kudu { + +class Status; + +// A callback which takes a Status. This is typically used for functions which +// produce asynchronous results and may fail. +typedef Callback StatusCallback; + +// To be used when a function signature requires a StatusCallback but none +// is needed. +extern void DoNothingStatusCB(const Status& status); + +// A closure (callback without arguments) that returns a Status indicating +// whether it was successful or not. +typedef Callback StatusClosure; + +// To be used when setting a StatusClosure is optional. 
+extern Status DoNothingStatusClosure(); + +} // namespace kudu + +#endif diff --git a/src/kudu/util/stopwatch.h b/src/kudu/util/stopwatch.h new file mode 100644 index 000000000000..a08691a7a406 --- /dev/null +++ b/src/kudu/util/stopwatch.h @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_STOPWATCH_H +#define KUDU_UTIL_STOPWATCH_H + +#include +#include +#include +#include +#include +#if defined(__APPLE__) +#include +#include +#endif // defined(__APPLE__) + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/walltime.h" + +namespace kudu { + +// Macro for logging timing of a block. Usage: +// LOG_TIMING_PREFIX_IF(INFO, FLAGS_should_record_time, "Tablet X: ", "doing some task") { +// ... some task which takes some time +// } +// If FLAGS_should_record_time is true, yields a log like: +// I1102 14:35:51.726186 23082 file.cc:167] Tablet X: Time spent doing some task: +// real 3.729s user 3.570s sys 0.150s +// The task will always execute regardless of whether the timing information is +// printed. 
+#define LOG_TIMING_PREFIX_IF(severity, condition, prefix, description) \ + for (kudu::sw_internal::LogTiming _l(__FILE__, __LINE__, google::severity, prefix, description, \ + -1, (condition)); !_l.HasRun(); _l.MarkHasRun()) + +// Conditionally log, no prefix. +#define LOG_TIMING_IF(severity, condition, description) \ + LOG_TIMING_PREFIX_IF(severity, (condition), "", (description)) + +// Always log, including prefix. +#define LOG_TIMING_PREFIX(severity, prefix, description) \ + LOG_TIMING_PREFIX_IF(severity, true, (prefix), (description)) + +// Always log, no prefix. +#define LOG_TIMING(severity, description) \ + LOG_TIMING_IF(severity, true, (description)) + +// Macro to log the time spent in the rest of the block. +#define SCOPED_LOG_TIMING(severity, description) \ + kudu::sw_internal::LogTiming VARNAME_LINENUM(_log_timing)(__FILE__, __LINE__, \ + google::severity, "", description, -1, true); + +// Scoped version of LOG_SLOW_EXECUTION() but with a prefix. +#define SCOPED_LOG_SLOW_EXECUTION_PREFIX(severity, max_expected_millis, prefix, description) \ + kudu::sw_internal::LogTiming VARNAME_LINENUM(_log_timing)(__FILE__, __LINE__, \ + google::severity, prefix, description, max_expected_millis, true) + +// Macro for logging timing of a block. Usage: +// LOG_SLOW_EXECUTION(INFO, 5, "doing some task") { +// ... some task which takes some time +// } +// when slower than 5 milliseconds, yields a log like: +// I1102 14:35:51.726186 23082 file.cc:167] Time spent doing some task: +// real 3.729s user 3.570s sys 0.150s +#define LOG_SLOW_EXECUTION(severity, max_expected_millis, description) \ + for (kudu::sw_internal::LogTiming _l(__FILE__, __LINE__, google::severity, "", description, \ + max_expected_millis, true); !_l.HasRun(); _l.MarkHasRun()) + +// Macro for vlogging timing of a block. The execution happens regardless of the vlog_level, +// it's only the logging that's affected. +// Usage: +// VLOG_TIMING(1, "doing some task") { +// ... 
some task which takes some time +// } +// Yields a log just like LOG_TIMING's. +#define VLOG_TIMING(vlog_level, description) \ + for (kudu::sw_internal::LogTiming _l(__FILE__, __LINE__, google::INFO, "", description, \ + -1, VLOG_IS_ON(vlog_level)); !_l.HasRun(); _l.MarkHasRun()) + +// Macro to log the time spent in the rest of the block. +#define SCOPED_VLOG_TIMING(vlog_level, description) \ + kudu::sw_internal::LogTiming VARNAME_LINENUM(_log_timing)(__FILE__, __LINE__, \ + google::INFO, "", description, -1, VLOG_IS_ON(vlog_level)); + +#define NANOS_PER_SECOND 1000000000.0 +#define NANOS_PER_MILLISECOND 1000000.0 + +class Stopwatch; + +typedef uint64_t nanosecond_type; + +// Structure which contains an elapsed amount of wall/user/sys time. +struct CpuTimes { + nanosecond_type wall; + nanosecond_type user; + nanosecond_type system; + + void clear() { wall = user = system = 0LL; } + + // Return a string formatted similar to the output of the "time" shell command. + std::string ToString() const { + return StringPrintf( + "real %.3fs\tuser %.3fs\tsys %.3fs", + wall_seconds(), user_cpu_seconds(), system_cpu_seconds()); + } + + double wall_millis() const { + return static_cast(wall) / NANOS_PER_MILLISECOND; + } + + double wall_seconds() const { + return static_cast(wall) / NANOS_PER_SECOND; + } + + double user_cpu_seconds() const { + return static_cast(user) / NANOS_PER_SECOND; + } + + double system_cpu_seconds() const { + return static_cast(system) / NANOS_PER_SECOND; + } +}; + +// A Stopwatch is a convenient way of timing a given operation. +// +// Wall clock time is based on a monotonic timer, so can be reliably used for +// determining durations. +// CPU time is based on the current thread's usage (not the whole process). +// +// The implementation relies on several syscalls, so should not be used for +// hot paths, but is useful for timing anything on the granularity of seconds +// or more. 
+class Stopwatch { + public: + + enum Mode { + // Collect usage only about the calling thread. + // This may not be supported on older versions of Linux. + THIS_THREAD, + // Collect usage of all threads. + ALL_THREADS + }; + + // Construct a new stopwatch. The stopwatch is initially stopped. + explicit Stopwatch(Mode mode = THIS_THREAD) + : stopped_(true), + mode_(mode) { + times_.clear(); + } + + // Start counting. If the stopwatch is already counting, then resets the + // start point at the current time. + void start() { + stopped_ = false; + GetTimes(×_); + } + + // Stop counting. If the stopwatch is already stopped, has no effect. + void stop() { + if (stopped_) return; + stopped_ = true; + + CpuTimes current; + GetTimes(¤t); + times_.wall = current.wall - times_.wall; + times_.user = current.user - times_.user; + times_.system = current.system - times_.system; + } + + // Return the elapsed amount of time. If the stopwatch is running, then returns + // the amount of time since it was started. If it is stopped, returns the amount + // of time between the most recent start/stop pair. If the stopwatch has never been + // started, the elapsed time is considered to be zero. + CpuTimes elapsed() const { + if (stopped_) return times_; + + CpuTimes current; + GetTimes(¤t); + current.wall -= times_.wall; + current.user -= times_.user; + current.system -= times_.system; + return current; + } + + // Resume a stopped stopwatch, such that the elapsed time continues to grow from + // the point where it was last stopped. 
+ // For example: + // Stopwatch s; + // s.start(); + // sleep(1); // elapsed() is now ~1sec + // s.stop(); + // sleep(1); + // s.resume(); + // sleep(1); // elapsed() is now ~2sec + void resume() { + if (!stopped_) return; + + CpuTimes current(times_); + start(); + times_.wall -= current.wall; + times_.user -= current.user; + times_.system -= current.system; + } + + bool is_stopped() const { + return stopped_; + } + + private: + void GetTimes(CpuTimes *times) const { + struct rusage usage; + struct timespec wall; + +#if defined(__APPLE__) + if (mode_ == THIS_THREAD) { + //Adapted from http://blog.kuriositaet.de/?p=257. + struct task_basic_info t_info; + mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT; + CHECK_EQ(KERN_SUCCESS, task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, + (task_info_t)&t_info, &t_info_count)); + usage.ru_utime.tv_sec = t_info.user_time.seconds; + usage.ru_utime.tv_usec = t_info.user_time.microseconds; + usage.ru_stime.tv_sec = t_info.system_time.seconds; + usage.ru_stime.tv_usec = t_info.system_time.microseconds; + } else { + CHECK_EQ(0, getrusage(RUSAGE_SELF, &usage)); + } + + mach_timespec_t ts; + walltime_internal::GetCurrentTime(&ts); + wall.tv_sec = ts.tv_sec; + wall.tv_nsec = ts.tv_nsec; +#else + CHECK_EQ(0, getrusage((mode_ == THIS_THREAD) ? RUSAGE_THREAD : RUSAGE_SELF, &usage)); + CHECK_EQ(0, clock_gettime(CLOCK_MONOTONIC, &wall)); +#endif // defined(__APPLE__) + times->wall = wall.tv_sec * 1000000000L + wall.tv_nsec; + times->user = usage.ru_utime.tv_sec * 1000000000L + usage.ru_utime.tv_usec * 1000; + times->system = usage.ru_stime.tv_sec * 1000000000L + usage.ru_stime.tv_usec * 1000; + } + + bool stopped_; + + CpuTimes times_; + Mode mode_; +}; + + +namespace sw_internal { + +// Internal class used by the LOG_TIMING macro. 
+class LogTiming { + public: + LogTiming(const char *file, int line, google::LogSeverity severity, + std::string prefix, std::string description, + int64_t max_expected_millis, bool should_print) + : file_(file), + line_(line), + severity_(severity), + prefix_(std::move(prefix)), + description_(std::move(description)), + max_expected_millis_(max_expected_millis), + should_print_(should_print), + has_run_(false) { + stopwatch_.start(); + } + + ~LogTiming() { + if (should_print_) { + Print(max_expected_millis_); + } + } + + // Allows this object to be used as the loop variable in for-loop macros. + // Call HasRun() in the conditional check in the for-loop. + bool HasRun() { + return has_run_; + } + + // Allows this object to be used as the loop variable in for-loop macros. + // Call MarkHasRun() in the "increment" section of the for-loop. + void MarkHasRun() { + has_run_ = true; + } + + private: + Stopwatch stopwatch_; + const char *file_; + const int line_; + const google::LogSeverity severity_; + const string prefix_; + const std::string description_; + const int64_t max_expected_millis_; + const bool should_print_; + bool has_run_; + + // Print if the number of expected millis exceeds the max. + // Passing a negative number implies "always print". + void Print(int64_t max_expected_millis) { + stopwatch_.stop(); + CpuTimes times = stopwatch_.elapsed(); + if (times.wall_millis() > max_expected_millis) { + google::LogMessage(file_, line_, severity_).stream() + << prefix_ << "Time spent " << description_ << ": " + << times.ToString(); + } + } + +}; + +} // namespace sw_internal +} // namespace kudu + +#endif diff --git a/src/kudu/util/string_case-test.cc b/src/kudu/util/string_case-test.cc new file mode 100644 index 000000000000..ae166f511184 --- /dev/null +++ b/src/kudu/util/string_case-test.cc @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "kudu/util/string_case.h" + +using std::string; + +namespace kudu { + +TEST(TestStringCase, TestSnakeToCamel) { + string out; + SnakeToCamelCase("foo_bar", &out); + ASSERT_EQ("FooBar", out); + + + SnakeToCamelCase("foo-bar", &out); + ASSERT_EQ("FooBar", out); + + SnakeToCamelCase("foobar", &out); + ASSERT_EQ("Foobar", out); +} + +TEST(TestStringCase, TestToUpperCase) { + string out; + ToUpperCase(string("foo"), &out); + ASSERT_EQ("FOO", out); + ToUpperCase(string("foo bar-BaZ"), &out); + ASSERT_EQ("FOO BAR-BAZ", out); +} + +TEST(TestStringCase, TestToUpperCaseInPlace) { + string in_out = "foo"; + ToUpperCase(in_out, &in_out); + ASSERT_EQ("FOO", in_out); +} + +TEST(TestStringCase, TestCapitalize) { + string word = "foo"; + Capitalize(&word); + ASSERT_EQ("Foo", word); + + word = "HiBerNATe"; + Capitalize(&word); + ASSERT_EQ("Hibernate", word); +} + +} // namespace kudu diff --git a/src/kudu/util/string_case.cc b/src/kudu/util/string_case.cc new file mode 100644 index 000000000000..141cdc504d1b --- /dev/null +++ b/src/kudu/util/string_case.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/string_case.h" + +#include +#include + +namespace kudu { + +using std::string; + +void SnakeToCamelCase(const std::string &snake_case, + std::string *camel_case) { + DCHECK_NE(camel_case, &snake_case) << "Does not support in-place operation"; + camel_case->clear(); + camel_case->reserve(snake_case.size()); + + bool uppercase_next = true; + for (char c : snake_case) { + if ((c == '_') || + (c == '-')) { + uppercase_next = true; + continue; + } + if (uppercase_next) { + camel_case->push_back(toupper(c)); + } else { + camel_case->push_back(c); + } + uppercase_next = false; + } +} + +void ToUpperCase(const std::string &string, + std::string *out) { + if (out != &string) { + *out = string; + } + + for (char& c : *out) { + c = toupper(c); + } +} + +void Capitalize(string *word) { + uint32_t size = word->size(); + if (size == 0) { + return; + } + + (*word)[0] = toupper((*word)[0]); + + for (int i = 1; i < size; i++) { + (*word)[i] = tolower((*word)[i]); + } +} + +} // namespace kudu diff --git a/src/kudu/util/string_case.h b/src/kudu/util/string_case.h new file mode 100644 index 000000000000..98f5828e87cf --- /dev/null +++ b/src/kudu/util/string_case.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Utility methods for dealing with string case. +#ifndef KUDU_UTIL_STRING_CASE_H +#define KUDU_UTIL_STRING_CASE_H + +#include + +namespace kudu { + +// Convert the given snake_case string to camel case. +// Also treats '-' in a string like a '_' +// For example: +// - 'foo_bar' -> FooBar +// - 'foo-bar' -> FooBar +// +// This function cannot operate in-place -- i.e. 'camel_case' must not +// point to 'snake_case'. +void SnakeToCamelCase(const std::string &snake_case, + std::string *camel_case); + +// Upper-case all of the characters in the given string. +// 'string' and 'out' may refer to the same string to replace in-place. +void ToUpperCase(const std::string &string, + std::string *out); + +// Capitalizes a string containing a word in place. +// For example: +// - 'hiBerNATe' -> 'Hibernate' +void Capitalize(std::string *word); + +} // namespace kudu +#endif diff --git a/src/kudu/util/striped64-test.cc b/src/kudu/util/striped64-test.cc new file mode 100644 index 000000000000..d211a0ae3583 --- /dev/null +++ b/src/kudu/util/striped64-test.cc @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/monotime.h" +#include "kudu/util/striped64.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +namespace kudu { + +// These flags are used by the multi-threaded tests, can be used for microbenchmarking. 
+DEFINE_int32(num_operations, 10*1000, "Number of operations to perform"); +DEFINE_int32(num_threads, 2, "Number of worker threads"); + +// Test some basic operations +TEST(Striped64Test, TestBasic) { + LongAdder adder; + ASSERT_EQ(adder.Value(), 0); + adder.IncrementBy(100); + ASSERT_EQ(adder.Value(), 100); + adder.Increment(); + ASSERT_EQ(adder.Value(), 101); + adder.Decrement(); + ASSERT_EQ(adder.Value(), 100); + adder.IncrementBy(-200); + ASSERT_EQ(adder.Value(), -100); + adder.Reset(); + ASSERT_EQ(adder.Value(), 0); +} + +template +class MultiThreadTest { + public: + typedef std::vector > thread_vec_t; + + MultiThreadTest(int64_t num_operations, int64_t num_threads) + : num_operations_(num_operations), + num_threads_(num_threads) { + } + + void IncrementerThread(const int64_t num) { + for (int i = 0; i < num; i++) { + adder_.Increment(); + } + } + + void DecrementerThread(const int64_t num) { + for (int i = 0; i < num; i++) { + adder_.Decrement(); + } + } + + void Run() { + // Increment + for (int i = 0; i < num_threads_; i++) { + scoped_refptr ref; + Thread::Create("Striped64", "Incrementer", &MultiThreadTest::IncrementerThread, this, + num_operations_, &ref); + threads_.push_back(ref); + } + for (const scoped_refptr &t : threads_) { + t->Join(); + } + ASSERT_EQ(num_threads_*num_operations_, adder_.Value()); + threads_.clear(); + + // Decrement back to zero + for (int i = 0; i < num_threads_; i++) { + scoped_refptr ref; + Thread::Create("Striped64", "Decrementer", &MultiThreadTest::DecrementerThread, this, + num_operations_, &ref); + threads_.push_back(ref); + } + for (const scoped_refptr &t : threads_) { + t->Join(); + } + ASSERT_EQ(0, adder_.Value()); + } + + Adder adder_; + + int64_t num_operations_; + // This is rounded down to the nearest even number + int32_t num_threads_; + thread_vec_t threads_; +}; + +// Test adder implemented by a single AtomicInt for comparison +class BasicAdder { + public: + BasicAdder() : value_(0) {} + void IncrementBy(int64_t 
x) { value_.IncrementBy(x); } + inline void Increment() { IncrementBy(1); } + inline void Decrement() { IncrementBy(-1); } + int64_t Value() { return value_.Load(); } + private: + AtomicInt value_; +}; + +void RunMultiTest(int64_t num_operations, int64_t num_threads) { + MonoTime start = MonoTime::Now(MonoTime::FINE); + MultiThreadTest basicTest(num_operations, num_threads); + basicTest.Run(); + MonoTime end1 = MonoTime::Now(MonoTime::FINE); + MultiThreadTest test(num_operations, num_threads); + test.Run(); + MonoTime end2 = MonoTime::Now(MonoTime::FINE); + MonoDelta basic = end1.GetDeltaSince(start); + MonoDelta striped = end2.GetDeltaSince(end1); + LOG(INFO) << "Basic counter took " << basic.ToMilliseconds() << "ms."; + LOG(INFO) << "Striped counter took " << striped.ToMilliseconds() << "ms."; +} + +// Compare a single-thread workload. Demonstrates the overhead of LongAdder over AtomicInt. +TEST(Striped64Test, TestSingleIncrDecr) { + OverrideFlagForSlowTests( + "num_operations", + strings::Substitute("$0", (FLAGS_num_operations * 100))); + RunMultiTest(FLAGS_num_operations, 1); +} + +// Compare a multi-threaded workload. LongAdder should show improvements here. +TEST(Striped64Test, TestMultiIncrDecr) { + OverrideFlagForSlowTests( + "num_operations", + strings::Substitute("$0", (FLAGS_num_operations * 100))); + OverrideFlagForSlowTests( + "num_threads", + strings::Substitute("$0", (FLAGS_num_threads * 4))); + RunMultiTest(FLAGS_num_operations, FLAGS_num_threads); +} + +} // namespace kudu diff --git a/src/kudu/util/striped64.cc b/src/kudu/util/striped64.cc new file mode 100644 index 000000000000..7b566e71eabc --- /dev/null +++ b/src/kudu/util/striped64.cc @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/monotime.h" +#include "kudu/util/random.h" +#include "kudu/util/striped64.h" +#include "kudu/util/threadlocal.h" + +using kudu::striped64::internal::HashCode; +using kudu::striped64::internal::Cell; + +namespace kudu { + +namespace striped64 { +namespace internal { +// +// HashCode +// + +HashCode::HashCode() { + Random r(MonoTime::Now(MonoTime::FINE).GetDeltaSince(MonoTime::Min()).ToNanoseconds()); + const uint64_t hash = r.Next64(); + code_ = (hash == 0) ? 1 : hash; // Avoid zero to allow xorShift rehash +} + +// +// Cell +// + +Cell::Cell() + : value_(0) { +} +} // namespace internal +} // namespace striped64 + +// +// Striped64 +// +const uint32_t Striped64::kNumCpus = sysconf(_SC_NPROCESSORS_ONLN); +DEFINE_STATIC_THREAD_LOCAL(HashCode, Striped64, hashcode_); + +Striped64::Striped64() + : busy_(false), + cell_buffer_(nullptr), + cells_(nullptr), + num_cells_(0) { +} + +Striped64::~Striped64() { + // Cell is a POD, so no need to destruct each one. + free(cell_buffer_); +} + +void Striped64::RetryUpdate(int64_t x, Rehash contention) { + uint64_t h = hashcode_->code_; + // There are three operations in this loop. + // + // 1. Try to add to the Cell hash table entry for the thread if the table exists. + // When there's contention, rehash to try a different Cell. + // 2. Try to initialize the hash table. + // 3. Try to update the base counter. 
+ // + // These are predicated on successful CAS operations, which is why it's all wrapped in an + // infinite retry loop. + while (true) { + int32_t n = base::subtle::Acquire_Load(&num_cells_); + if (n > 0) { + if (contention == kRehash) { + // CAS failed already, rehash before trying to increment. + contention = kNoRehash; + } else { + Cell *cell = &(cells_[(n - 1) & h]); + int64_t v = cell->value_.Load(); + if (cell->CompareAndSet(v, Fn(v, x))) { + // Successfully CAS'd the corresponding cell, done. + break; + } + } + // Rehash since we failed to CAS, either previously or just now. + h ^= h << 13; + h ^= h >> 17; + h ^= h << 5; + } else if (n == 0 && CasBusy()) { + // We think table hasn't been initialized yet, try to do so. + // Recheck preconditions, someone else might have init'd in the meantime. + n = base::subtle::Acquire_Load(&num_cells_); + if (n == 0) { + n = 1; + // Calculate the size. Nearest power of two >= NCPU. + // Also handle a negative NCPU, can happen if sysconf name is unknown + while (kNumCpus > n) { + n <<= 1; + } + // Allocate cache-aligned memory for use by the cells_ table. + int err = posix_memalign(&cell_buffer_, CACHELINE_SIZE, sizeof(Cell)*n); + CHECK_EQ(0, err) << "error calling posix_memalign" << std::endl; + // Initialize the table + cells_ = new (cell_buffer_) Cell[n]; + base::subtle::Release_Store(&num_cells_, n); + } + // End critical section + busy_.Store(0); + } else { + // Fallback to adding to the base value. + // Means the table wasn't initialized or we failed to init it. 
+ int64_t v = base_.value_.Load(); + if (CasBase(v, Fn(v, x))) { + break; + } + } + } + // Record index for next time + hashcode_->code_ = h; +} + +void Striped64::InternalReset(int64_t initialValue) { + const int32_t n = base::subtle::Acquire_Load(&num_cells_); + base_.value_.Store(initialValue); + for (int i = 0; i < n; i++) { + cells_[i].value_.Store(initialValue); + } +} + +void LongAdder::IncrementBy(int64_t x) { + INIT_STATIC_THREAD_LOCAL(HashCode, hashcode_); + // Use hash table if present. If that fails, call RetryUpdate to rehash and retry. + // If no hash table, try to CAS the base counter. If that fails, RetryUpdate to init the table. + const int32_t n = base::subtle::Acquire_Load(&num_cells_); + if (n > 0) { + Cell *cell = &(cells_[(n - 1) & hashcode_->code_]); + DCHECK_EQ(0, reinterpret_cast(cell) & (sizeof(Cell) - 1)) + << " unaligned Cell not allowed for Striped64" << std::endl; + const int64_t old = cell->value_.Load(); + if (!cell->CompareAndSet(old, old + x)) { + // When we hit a hash table contention, signal RetryUpdate to rehash. + RetryUpdate(x, kRehash); + } + } else { + int64_t b = base_.value_.Load(); + if (!base_.CompareAndSet(b, b + x)) { + // Attempt to initialize the table. No need to rehash since the contention was for the + // base counter, not the hash table. + RetryUpdate(x, kNoRehash); + } + } +} + +// +// LongAdder +// + +int64_t LongAdder::Value() const { + int64_t sum = base_.value_.Load(); + const int32_t n = base::subtle::Acquire_Load(&num_cells_); + for (int i = 0; i < n; i++) { + sum += cells_[i].value_.Load(); + } + return sum; +} + +} // namespace kudu diff --git a/src/kudu/util/striped64.h b/src/kudu/util/striped64.h new file mode 100644 index 000000000000..a7ffec709c58 --- /dev/null +++ b/src/kudu/util/striped64.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef KUDU_UTIL_STRIPED64_H_ +#define KUDU_UTIL_STRIPED64_H_ + +#include "kudu/gutil/port.h" +#include "kudu/util/atomic.h" +#include "kudu/util/threadlocal.h" + +namespace kudu { + +class Striped64; + +namespace striped64 { +namespace internal { + +struct HashCode { + public: + HashCode(); + uint64_t code_; +}; + +#define ATOMIC_INT_SIZE sizeof(AtomicInt) +// Padded POD container for AtomicInt. This prevents false sharing of cache lines. +class Cell { + public: + Cell(); + inline bool CompareAndSet(int64_t cmp, int64_t value) { + return value_.CompareAndSet(cmp, value); + } + + // Padding advice from Herb Sutter: + // http://www.drdobbs.com/parallel/eliminate-false-sharing/217500206?pgno=4 + AtomicInt value_; + char pad[CACHELINE_SIZE > ATOMIC_INT_SIZE ? + CACHELINE_SIZE - ATOMIC_INT_SIZE : 1]; + + DISALLOW_COPY_AND_ASSIGN(Cell); +} CACHELINE_ALIGNED; +#undef ATOMIC_INT_SIZE + +} // namespace internal +} // namespace striped64 + +// This set of classes is heavily derived from JSR166e, released into the public domain +// by Doug Lea and the other authors. 
+// +// See: http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/src/jsr166e/Striped64.java?view=co +// See: http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/src/jsr166e/LongAdder.java?view=co +// +// The Striped64 and LongAdder implementations here are simplified versions of what's present in +// JSR166e. However, the core ideas remain the same. +// +// Updating a single AtomicInteger in a multi-threaded environment can be quite slow: +// +// 1. False sharing of cache lines with other counters. +// 2. Cache line bouncing from high update rates, especially with many cores. +// +// These two problems are addressed by Striped64. When there is no contention, it uses CAS on a +// single base counter to store updates. However, when Striped64 detects contention +// (via a failed CAS operation), it will allocate a small, fixed size hashtable of Cells. +// A Cell is a simple POD that pads out an AtomicInt to 64 bytes to prevent +// sharing a cache line. +// +// Reading the value of a Striped64 requires traversing the hashtable to calculate the true sum. +// +// Each updating thread uses a thread-local hashcode to determine its Cell in the hashtable. +// If a thread fails to CAS its hashed Cell, it will do a lightweight rehash operation to try +// and find an uncontended bucket. Because the hashcode is thread-local, this rehash affects all +// Striped64's accessed by the thread. This is good, since contention on one Striped64 is +// indicative of contention elsewhere too. +// +// The hashtable is statically sized to the nearest power of 2 greater than or equal to the +// number of CPUs. This is sufficient, since this guarantees the existence of a perfect hash +// function. Due to the random rehashing, the threads should eventually converge to this function. +// In practice, this scheme has shown to be sufficient. 
+// +// The biggest simplification of this implementation compared to JSR166e is that we do not +// dynamically grow the table, instead immediately allocating it to the full size. +// We also do not lazily allocate each Cell, instead allocating the entire array at once. +// This means we waste some additional memory in low contention scenarios, and initial allocation +// will also be slower. Some of the micro-optimizations were also elided for readability. +class Striped64 { + public: + Striped64(); + virtual ~Striped64(); + + protected: + + enum Rehash { + kRehash, + kNoRehash + }; + + // CAS the base field. + bool CasBase(int64_t cmp, int64_t val) { return base_.CompareAndSet(cmp, val); } + + // CAS the busy field from 0 to 1 to acquire the lock. + bool CasBusy() { return busy_.CompareAndSet(0, 1); } + + // Computes the function of the current and new value. Used in RetryUpdate. + virtual int64_t Fn(int64_t current_value, int64_t new_value) = 0; + + // Handles cases of updates involving initialization, resizing, creating new Cells, and/or + // contention. See above for further explanation. + void RetryUpdate(int64_t x, Rehash to_rehash); + + // Sets base and all cells to the given value. + void InternalReset(int64_t initialValue); + + // Base value, used mainly when there is no contention, but also as a fallback during + // table initialization races. Updated via CAS. + striped64::internal::Cell base_; + + // CAS lock used when resizing and/or creating cells. + AtomicBool busy_; + + // Backing buffer for cells_, used for alignment. + void* cell_buffer_; + + // Table of cells. When non-null, size is the nearest power of 2 >= NCPU. + striped64::internal::Cell* cells_; + int32_t num_cells_; + + // Static hash code per-thread. Shared across all instances to limit thread-local pollution. + // Also, if a thread hits a collision on one Striped64, it's also likely to collide on + // other Striped64s too. 
+ DECLARE_STATIC_THREAD_LOCAL(striped64::internal::HashCode, hashcode_); + + private: + + // Number of CPUs, to place bound on table size. + static const uint32_t kNumCpus; + +}; + +// A 64-bit number optimized for high-volume concurrent updates. +// See Striped64 for a longer explanation of the inner workings. +class LongAdder : Striped64 { + public: + LongAdder() {} + void IncrementBy(int64_t x); + void Increment() { IncrementBy(1); } + void Decrement() { IncrementBy(-1); } + + // Returns the current value. + // Note this is not an atomic snapshot in the presence of concurrent updates. + int64_t Value() const; + + // Resets the counter state to zero. + void Reset() { InternalReset(0); } + + private: + int64_t Fn(int64_t current_value, int64_t new_value) override { + return current_value + new_value; + } + + DISALLOW_COPY_AND_ASSIGN(LongAdder); +}; + +} // namespace kudu + +#endif diff --git a/src/kudu/util/subprocess-test.cc b/src/kudu/util/subprocess-test.cc new file mode 100644 index 000000000000..6397f7cd808e --- /dev/null +++ b/src/kudu/util/subprocess-test.cc @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include + +#include +#include "kudu/util/subprocess.h" +#include "kudu/util/test_util.h" + +using std::string; +using std::vector; + +namespace kudu { + +class SubprocessTest : public KuduTest {}; + +TEST_F(SubprocessTest, TestSimplePipe) { + vector argv; + argv.push_back("tr"); + argv.push_back("a-z"); + argv.push_back("A-Z"); + Subprocess p("/usr/bin/tr", argv); + p.ShareParentStdout(false); + ASSERT_OK(p.Start()); + + FILE* out = fdopen(p.ReleaseChildStdinFd(), "w"); + PCHECK(out); + FILE* in = fdopen(p.from_child_stdout_fd(), "r"); + PCHECK(in); + + fprintf(out, "hello world\n"); + // We have to close 'out' or else tr won't write any output, since + // it enters a buffered mode if it detects that its input is a FIFO. + fclose(out); + + char buf[1024]; + ASSERT_EQ(buf, fgets(buf, sizeof(buf), in)); + ASSERT_STREQ("HELLO WORLD\n", &buf[0]); + + int wait_status = 0; + ASSERT_OK(p.Wait(&wait_status)); + ASSERT_TRUE(WIFEXITED(wait_status)); + ASSERT_EQ(0, WEXITSTATUS(wait_status)); +} + +TEST_F(SubprocessTest, TestErrPipe) { + vector argv; + argv.push_back("tee"); + argv.push_back("/dev/stderr"); + Subprocess p("/usr/bin/tee", argv); + p.ShareParentStderr(false); + ASSERT_OK(p.Start()); + + FILE* out = fdopen(p.ReleaseChildStdinFd(), "w"); + PCHECK(out); + + fprintf(out, "Hello, World\n"); + fclose(out); // same reasoning as above, flush to prevent tee buffering + + FILE* in = fdopen(p.from_child_stderr_fd(), "r"); + PCHECK(in); + + char buf[1024]; + ASSERT_EQ(buf, fgets(buf, sizeof(buf), in)); + ASSERT_STREQ("Hello, World\n", &buf[0]); + + int wait_status = 0; + ASSERT_OK(p.Wait(&wait_status)); + ASSERT_TRUE(WIFEXITED(wait_status)); + ASSERT_EQ(0, WEXITSTATUS(wait_status)); +} + +TEST_F(SubprocessTest, TestKill) { + vector argv; + argv.push_back("cat"); + Subprocess p("/bin/cat", argv); + ASSERT_OK(p.Start()); + + ASSERT_OK(p.Kill(SIGKILL)); + + int wait_status = 0; + ASSERT_OK(p.Wait(&wait_status)); + ASSERT_EQ(SIGKILL, WTERMSIG(wait_status)); + + // 
Test that calling Wait() a second time returns the same + // cached value instead of trying to wait on some other process + // that was assigned the same pid. + wait_status = 0; + ASSERT_OK(p.Wait(&wait_status)); + ASSERT_EQ(SIGKILL, WTERMSIG(wait_status)); +} + +} // namespace kudu diff --git a/src/kudu/util/subprocess.cc b/src/kudu/util/subprocess.cc new file mode 100644 index 000000000000..014568abfcbe --- /dev/null +++ b/src/kudu/util/subprocess.cc @@ -0,0 +1,448 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/subprocess.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#endif + +#include "kudu/gutil/once.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/strings/join.h" +#include "kudu/gutil/strings/numbers.h" +#include "kudu/gutil/strings/split.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/errno.h" +#include "kudu/util/status.h" + +using std::shared_ptr; +using std::string; +using std::vector; +using strings::Split; +using strings::Substitute; + +namespace kudu { + +namespace { + +static const char* kProcSelfFd = +#if defined(__APPLE__) + "/dev/fd"; +#else + "/proc/self/fd"; +#endif // defined(__APPLE__) + +#if defined(__linux__) +#define READDIR readdir64 +#define DIRENT dirent64 +#else +#define READDIR readdir +#define DIRENT dirent +#endif + +void DisableSigPipe() { + struct sigaction act; + + act.sa_handler = SIG_IGN; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + PCHECK(sigaction(SIGPIPE, &act, nullptr) == 0); +} + +void EnsureSigPipeDisabled() { + static GoogleOnceType once = GOOGLE_ONCE_INIT; + GoogleOnceInit(&once, &DisableSigPipe); +} + +// Since opendir() calls malloc(), this must be called before fork(). +// This function is not async-signal-safe. +Status OpenProcFdDir(DIR** dir) { + *dir = opendir(kProcSelfFd); + if (PREDICT_FALSE(dir == nullptr)) { + return Status::IOError(Substitute("opendir(\"$0\") failed", kProcSelfFd), + ErrnoToString(errno), errno); + } + return Status::OK(); +} + +// Close the directory stream opened by OpenProcFdDir(). +// This function is not async-signal-safe. 
+void CloseProcFdDir(DIR* dir) { + if (PREDICT_FALSE(closedir(dir) == -1)) { + LOG(WARNING) << "Unable to close fd dir: " + << Status::IOError(Substitute("closedir(\"$0\") failed", kProcSelfFd), + ErrnoToString(errno), errno).ToString(); + } +} + +// Close all open file descriptors other than stdin, stderr, stdout. +// Expects a directory stream created by OpenProdFdDir() as a parameter. +// This function is called after fork() and must not call malloc(). +// The rule of thumb is to only call async-signal-safe functions in such cases +// if at all possible. +void CloseNonStandardFDs(DIR* fd_dir) { + // This is implemented by iterating over the open file descriptors + // rather than using sysconf(SC_OPEN_MAX) -- the latter is error prone + // since it may not represent the highest open fd if the fd soft limit + // has changed since the process started. This should also be faster + // since iterating over all possible fds is likely to cause 64k+ syscalls + // in typical configurations. + // + // Note also that this doesn't use any of the Env utility functions, to + // make it as lean and mean as possible -- this runs in the subprocess + // after a fork, so there's some possibility that various global locks + // inside malloc() might be held, so allocating memory is a no-no. + PCHECK(fd_dir != nullptr); + int dir_fd = dirfd(fd_dir); + + struct DIRENT* ent; + // readdir64() is not reentrant (it uses a static buffer) and it also + // locks fd_dir->lock, so it must not be called in a multi-threaded + // environment and is certainly not async-signal-safe. + // However, it appears to be safe to call right after fork(), since only one + // thread exists in the child process at that time. It also does not call + // malloc() or free(). We could use readdir64_r() instead, but all that + // buys us is reentrancy, and not async-signal-safety, due to the use of + // dir->lock, so seems not worth the added complexity in lifecycle & plumbing. 
+ while ((ent = READDIR(fd_dir)) != nullptr) { + uint32_t fd; + if (!safe_strtou32(ent->d_name, &fd)) continue; + if (!(fd == STDIN_FILENO || + fd == STDOUT_FILENO || + fd == STDERR_FILENO || + fd == dir_fd)) { + close(fd); + } + } +} + +} // anonymous namespace + +Subprocess::Subprocess(string program, vector argv) + : program_(std::move(program)), + argv_(std::move(argv)), + state_(kNotStarted), + child_pid_(-1), + fd_state_(), + child_fds_() { + fd_state_[STDIN_FILENO] = PIPED; + fd_state_[STDOUT_FILENO] = SHARED; + fd_state_[STDERR_FILENO] = SHARED; + child_fds_[STDIN_FILENO] = -1; + child_fds_[STDOUT_FILENO] = -1; + child_fds_[STDERR_FILENO] = -1; +} + +Subprocess::~Subprocess() { + if (state_ == kRunning) { + LOG(WARNING) << "Child process " << child_pid_ + << "(" << JoinStrings(argv_, " ") << ") " + << " was orphaned. Sending SIGKILL..."; + WARN_NOT_OK(Kill(SIGKILL), "Failed to send SIGKILL"); + int junk = 0; + WARN_NOT_OK(Wait(&junk), "Failed to Wait()"); + } + + for (int i = 0; i < 3; ++i) { + if (fd_state_[i] == PIPED && child_fds_[i] >= 0) { + close(child_fds_[i]); + } + } +} + +void Subprocess::SetFdShared(int stdfd, bool share) { + CHECK_EQ(state_, kNotStarted); + CHECK_NE(fd_state_[stdfd], DISABLED); + fd_state_[stdfd] = share? SHARED : PIPED; +} + +void Subprocess::DisableStderr() { + CHECK_EQ(state_, kNotStarted); + fd_state_[STDERR_FILENO] = DISABLED; +} + +void Subprocess::DisableStdout() { + CHECK_EQ(state_, kNotStarted); + fd_state_[STDOUT_FILENO] = DISABLED; +} + +static void RedirectToDevNull(int fd) { + // We must not close stderr or stdout, because then when a new file descriptor + // gets opened, it might get that fd number. (We always allocate the lowest + // available file descriptor number.) Instead, we reopen that fd as + // /dev/null. 
+ int dev_null = open("/dev/null", O_WRONLY); + if (dev_null < 0) { + PLOG(WARNING) << "failed to open /dev/null"; + } else { + PCHECK(dup2(dev_null, fd)); + } +} + +#if defined(__APPLE__) +static int pipe2(int pipefd[2], int flags) { + DCHECK_EQ(O_CLOEXEC, flags); + + int new_fds[2]; + if (pipe(new_fds) == -1) { + return -1; + } + if (fcntl(new_fds[0], F_SETFD, O_CLOEXEC) == -1) { + close(new_fds[0]); + close(new_fds[1]); + return -1; + } + if (fcntl(new_fds[1], F_SETFD, O_CLOEXEC) == -1) { + close(new_fds[0]); + close(new_fds[1]); + return -1; + } + pipefd[0] = new_fds[0]; + pipefd[1] = new_fds[1]; + return 0; +} +#endif + +Status Subprocess::Start() { + CHECK_EQ(state_, kNotStarted); + EnsureSigPipeDisabled(); + + if (argv_.size() < 1) { + return Status::InvalidArgument("argv must have at least one elem"); + } + + vector argv_ptrs; + for (const string& arg : argv_) { + argv_ptrs.push_back(const_cast(arg.c_str())); + } + argv_ptrs.push_back(nullptr); + + // Pipe from caller process to child's stdin + // [0] = stdin for child, [1] = how parent writes to it + int child_stdin[2] = {-1, -1}; + if (fd_state_[STDIN_FILENO] == PIPED) { + PCHECK(pipe2(child_stdin, O_CLOEXEC) == 0); + } + // Pipe from child's stdout back to caller process + // [0] = how parent reads from child's stdout, [1] = how child writes to it + int child_stdout[2] = {-1, -1}; + if (fd_state_[STDOUT_FILENO] == PIPED) { + PCHECK(pipe2(child_stdout, O_CLOEXEC) == 0); + } + // Pipe from child's stderr back to caller process + // [0] = how parent reads from child's stderr, [1] = how child writes to it + int child_stderr[2] = {-1, -1}; + if (fd_state_[STDERR_FILENO] == PIPED) { + PCHECK(pipe2(child_stderr, O_CLOEXEC) == 0); + } + + DIR* fd_dir = nullptr; + RETURN_NOT_OK_PREPEND(OpenProcFdDir(&fd_dir), "Unable to open fd dir"); + shared_ptr fd_dir_closer(fd_dir, CloseProcFdDir); + + int ret = fork(); + if (ret == -1) { + return Status::RuntimeError("Unable to fork", ErrnoToString(errno), errno); + } + if 
(ret == 0) { // We are the child + // Send the child a SIGTERM when the parent dies. This is done as early + // as possible in the child's life to prevent any orphaning whatsoever + // (e.g. from KUDU-402). +#if defined(__linux__) + // TODO: prctl(PR_SET_PDEATHSIG) is Linux-specific, look into portable ways + // to prevent orphans when parent is killed. + prctl(PR_SET_PDEATHSIG, SIGTERM); +#endif + + // stdin + if (fd_state_[STDIN_FILENO] == PIPED) { + PCHECK(dup2(child_stdin[0], STDIN_FILENO) == STDIN_FILENO); + } + // stdout + switch (fd_state_[STDOUT_FILENO]) { + case PIPED: { + PCHECK(dup2(child_stdout[1], STDOUT_FILENO) == STDOUT_FILENO); + break; + } + case DISABLED: { + RedirectToDevNull(STDOUT_FILENO); + break; + } + default: break; + } + // stderr + switch (fd_state_[STDERR_FILENO]) { + case PIPED: { + PCHECK(dup2(child_stderr[1], STDERR_FILENO) == STDERR_FILENO); + break; + } + case DISABLED: { + RedirectToDevNull(STDERR_FILENO); + break; + } + default: break; + } + + CloseNonStandardFDs(fd_dir); + + execvp(program_.c_str(), &argv_ptrs[0]); + PLOG(WARNING) << "Couldn't exec " << program_; + _exit(errno); + } else { + // We are the parent + child_pid_ = ret; + // Close child's side of the pipes + if (fd_state_[STDIN_FILENO] == PIPED) close(child_stdin[0]); + if (fd_state_[STDOUT_FILENO] == PIPED) close(child_stdout[1]); + if (fd_state_[STDERR_FILENO] == PIPED) close(child_stderr[1]); + // Keep parent's side of the pipes + child_fds_[STDIN_FILENO] = child_stdin[1]; + child_fds_[STDOUT_FILENO] = child_stdout[0]; + child_fds_[STDERR_FILENO] = child_stderr[0]; + } + + state_ = kRunning; + return Status::OK(); +} + +Status Subprocess::DoWait(int* ret, int options) { + if (state_ == kExited) { + *ret = cached_rc_; + return Status::OK(); + } + CHECK_EQ(state_, kRunning); + + int rc = waitpid(child_pid_, ret, options); + if (rc == -1) { + return Status::RuntimeError("Unable to wait on child", + ErrnoToString(errno), + errno); + } + if ((options & WNOHANG) && rc == 
0) { + return Status::TimedOut(""); + } + + CHECK_EQ(rc, child_pid_); + child_pid_ = -1; + cached_rc_ = *ret; + state_ = kExited; + return Status::OK(); +} + +Status Subprocess::Kill(int signal) { + CHECK_EQ(state_, kRunning); + if (kill(child_pid_, signal) != 0) { + return Status::RuntimeError("Unable to kill", + ErrnoToString(errno), + errno); + } + return Status::OK(); +} + +Status Subprocess::Call(const string& arg_str) { + VLOG(2) << "Invoking command: " << arg_str; + vector argv = Split(arg_str, " "); + return Call(argv); +} + +Status Subprocess::Call(const vector& argv) { + Subprocess proc(argv[0], argv); + RETURN_NOT_OK(proc.Start()); + int retcode; + RETURN_NOT_OK(proc.Wait(&retcode)); + + if (retcode == 0) { + return Status::OK(); + } else { + return Status::RuntimeError(Substitute( + "Subprocess '$0' terminated with non-zero exit status $1", + argv[0], + retcode)); + } +} + +Status Subprocess::Call(const vector& argv, string* stdout_out) { + Subprocess p(argv[0], argv); + p.ShareParentStdout(false); + RETURN_NOT_OK_PREPEND(p.Start(), "Unable to fork " + argv[0]); + int err = close(p.ReleaseChildStdinFd()); + if (PREDICT_FALSE(err != 0)) { + return Status::IOError("Unable to close child process stdin", ErrnoToString(errno), errno); + } + + stdout_out->clear(); + char buf[1024]; + while (true) { + ssize_t n = read(p.from_child_stdout_fd(), buf, arraysize(buf)); + if (n == 0) { + // EOF + break; + } + if (n < 0) { + if (errno == EINTR) continue; + return Status::IOError("IO error reading from " + argv[0], ErrnoToString(errno), errno); + } + + stdout_out->append(buf, n); + } + + int retcode; + RETURN_NOT_OK_PREPEND(p.Wait(&retcode), "Unable to wait() for " + argv[0]); + + if (PREDICT_FALSE(retcode != 0)) { + return Status::RuntimeError(Substitute( + "Subprocess '$0' terminated with non-zero exit status $1", + argv[0], + retcode)); + } + return Status::OK(); +} + +int Subprocess::CheckAndOffer(int stdfd) const { + CHECK_EQ(state_, kRunning); + 
CHECK_EQ(fd_state_[stdfd], PIPED); + return child_fds_[stdfd]; +} + +int Subprocess::ReleaseChildFd(int stdfd) { + CHECK_EQ(state_, kRunning); + CHECK_GE(child_fds_[stdfd], 0); + CHECK_EQ(fd_state_[stdfd], PIPED); + int ret = child_fds_[stdfd]; + child_fds_[stdfd] = -1; + return ret; +} + +pid_t Subprocess::pid() const { + CHECK_EQ(state_, kRunning); + return child_pid_; +} + +} // namespace kudu diff --git a/src/kudu/util/subprocess.h b/src/kudu/util/subprocess.h new file mode 100644 index 000000000000..6889b6a8581f --- /dev/null +++ b/src/kudu/util/subprocess.h @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_SUBPROCESS_H +#define KUDU_UTIL_SUBPROCESS_H + +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/util/status.h" + +namespace kudu { + +// Wrapper around a spawned subprocess. +// +// program will be treated as an absolute path unless it begins with a dot or a +// slash. +// +// This takes care of creating pipes to/from the subprocess and offers +// basic functionality to wait on it or send signals. +// By default, child process only has stdin captured and separate from the parent. 
+// The stdout/stderr streams are shared with the parent by default. +// +// The process may only be started and waited on/killed once. +// +// Optionally, user may change parent/child stream sharing. Also, a user may disable +// a subprocess stream. A user cannot do both. +// +// Note that, when the Subprocess object is destructed, the child process +// will be forcibly SIGKILLed to avoid orphaning processes. +class Subprocess { + public: + Subprocess(std::string program, std::vector argv); + ~Subprocess(); + + // Disable subprocess stream output. Must be called before subprocess starts. + void DisableStderr(); + void DisableStdout(); + + // Share a stream with parent. Must be called before subprocess starts. + // Cannot set sharing at all if stream is disabled + void ShareParentStdin(bool share = true) { SetFdShared(STDIN_FILENO, share); } + void ShareParentStdout(bool share = true) { SetFdShared(STDOUT_FILENO, share); } + void ShareParentStderr(bool share = true) { SetFdShared(STDERR_FILENO, share); } + + // Start the subprocess. Can only be called once. + // + // Thie returns a bad Status if the fork() fails. However, + // note that if the executable path was incorrect such that + // exec() fails, this will still return Status::OK. You must + // use Wait() to check for failure. + Status Start(); + + // Wait for the subprocess to exit. The return value is the same as + // that of the waitpid() syscall. Only call after starting. + // + // NOTE: unlike the standard wait(2) call, this may be called multiple + // times. If the process has exited, it will repeatedly return the same + // exit code. + Status Wait(int* ret) { return DoWait(ret, 0); } + + // Like the above, but does not block. This returns Status::TimedOut + // immediately if the child has not exited. Otherwise returns Status::OK + // and sets *ret. Only call after starting. + // + // NOTE: unlike the standard wait(2) call, this may be called multiple + // times. 
If the process has exited, it will repeatedly return the same + // exit code. + Status WaitNoBlock(int* ret) { return DoWait(ret, WNOHANG); } + + // Send a signal to the subprocess. + // Note that this does not reap the process -- you must still Wait() + // in order to reap it. Only call after starting. + Status Kill(int signal); + + // Helper method that creates a Subprocess, issues a Start() then a Wait(). + // Expects a blank-separated list of arguments, with the first being the + // full path to the executable. + // The returned Status will only be OK if all steps were successful and + // the return code was 0. + static Status Call(const std::string& arg_str); + + // Same as above, but accepts a vector that includes the path to the + // executable as argv[0] and the arguments to the program in argv[1..n]. + static Status Call(const std::vector& argv); + + // Same as above, but collects the output from the child process stdout into + // 'stdout_out'. + static Status Call(const std::vector& argv, + std::string* stdout_out); + + // Return the pipe fd to the child's standard stream. + // Stream should not be disabled or shared. + int to_child_stdin_fd() const { return CheckAndOffer(STDIN_FILENO); } + int from_child_stdout_fd() const { return CheckAndOffer(STDOUT_FILENO); } + int from_child_stderr_fd() const { return CheckAndOffer(STDERR_FILENO); } + + // Release control of the file descriptor for the child's stream, only if piped. 
+ // Writes to this FD show up on stdin in the subprocess + int ReleaseChildStdinFd() { return ReleaseChildFd(STDIN_FILENO ); } + // Reads from this FD come from stdout of the subprocess + int ReleaseChildStdoutFd() { return ReleaseChildFd(STDOUT_FILENO); } + // Reads from this FD come from stderr of the subprocess + int ReleaseChildStderrFd() { return ReleaseChildFd(STDERR_FILENO); } + + pid_t pid() const; + + private: + void SetFdShared(int stdfd, bool share); + int CheckAndOffer(int stdfd) const; + int ReleaseChildFd(int stdfd); + Status DoWait(int* ret, int options); + + enum StreamMode {SHARED, DISABLED, PIPED}; + + std::string program_; + std::vector argv_; + + enum State { + kNotStarted, + kRunning, + kExited + }; + State state_; + int child_pid_; + enum StreamMode fd_state_[3]; + int child_fds_[3]; + + // The cached exit result code if Wait() has been called. + // Only valid if state_ == kExited. + int cached_rc_; + + DISALLOW_COPY_AND_ASSIGN(Subprocess); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_SUBPROCESS_H */ diff --git a/src/kudu/util/sync_point-test.cc b/src/kudu/util/sync_point-test.cc new file mode 100644 index 000000000000..45e5e5dc00ba --- /dev/null +++ b/src/kudu/util/sync_point-test.cc @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/sync_point.h" + +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread.h" + +using std::string; +using std::vector; + +#ifndef NDEBUG +namespace kudu { + +static void RunThread(bool *var) { + *var = true; + TEST_SYNC_POINT("first"); +} + +TEST(SyncPointTest, TestSyncPoint) { + // Set up a sync point "second" that depends on "first". + vector dependencies; + dependencies.push_back(SyncPoint::Dependency("first", "second")); + SyncPoint::GetInstance()->LoadDependency(dependencies); + SyncPoint::GetInstance()->EnableProcessing(); + + // Kick off a thread that'll process "first", but not before + // setting 'var' to true, which unblocks the main thread. + scoped_refptr thread; + bool var = false; + ASSERT_OK(kudu::Thread::Create("test", "test", + &RunThread, &var, &thread)); + + // Blocked on RunThread to process "first". + TEST_SYNC_POINT("second"); + ASSERT_TRUE(var); + + thread->Join(); +} + +} // namespace kudu +#endif // NDEBUG diff --git a/src/kudu/util/sync_point.cc b/src/kudu/util/sync_point.cc new file mode 100644 index 000000000000..7f171d732e6d --- /dev/null +++ b/src/kudu/util/sync_point.cc @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "kudu/util/sync_point.h" + +using std::string; +using std::vector; + +#ifndef NDEBUG +namespace kudu { + +SyncPoint::Dependency::Dependency(string predecessor, string successor) + : predecessor_(std::move(predecessor)), successor_(std::move(successor)) {} + +SyncPoint::SyncPoint() + : cv_(&mutex_), + enabled_(false) { +} + +SyncPoint* SyncPoint::GetInstance() { + static SyncPoint sync_point; + return &sync_point; +} + +void SyncPoint::LoadDependency(const vector& dependencies) { + successors_.clear(); + predecessors_.clear(); + cleared_points_.clear(); + for (const Dependency& dependency : dependencies) { + successors_[dependency.predecessor_].push_back(dependency.successor_); + predecessors_[dependency.successor_].push_back(dependency.predecessor_); + } +} + +bool SyncPoint::PredecessorsAllCleared(const string& point) { + for (const string& pred : predecessors_[point]) { + if (cleared_points_.count(pred) == 0) { + return false; + } + } + return true; +} + +void SyncPoint::EnableProcessing() { + MutexLock lock(mutex_); + enabled_ = true; +} + +void SyncPoint::DisableProcessing() { + MutexLock lock(mutex_); + enabled_ = false; +} + +void SyncPoint::ClearTrace() { + MutexLock lock(mutex_); + cleared_points_.clear(); +} + +void 
SyncPoint::Process(const string& point) { + MutexLock lock(mutex_); + + if (!enabled_) return; + + while (!PredecessorsAllCleared(point)) { + cv_.Wait(); + } + + cleared_points_.insert(point); + cv_.Broadcast(); +} + +} // namespace kudu +#endif // NDEBUG diff --git a/src/kudu/util/sync_point.h b/src/kudu/util/sync_point.h new file mode 100644 index 000000000000..b24686d2e776 --- /dev/null +++ b/src/kudu/util/sync_point.h @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include + +#include "kudu/util/condition_variable.h" +#include "kudu/util/mutex.h" + +#ifdef NDEBUG +#define TEST_SYNC_POINT(x) +#else + +namespace kudu { + +// This class provides facility to reproduce race conditions deterministically +// in unit tests. +// Developer could specify sync points in the codebase via TEST_SYNC_POINT. 
+// Each sync point represents a position in the execution stream of a thread. +// In the unit test, 'Happens After' relationship among sync points could be +// setup via SyncPoint::LoadDependency, to reproduce a desired interleave of +// threads execution. + +class SyncPoint { + public: + static SyncPoint* GetInstance(); + + struct Dependency { + Dependency(std::string predecessor, std::string successor); + + std::string predecessor_; + std::string successor_; + }; + // call once at the beginning of a test to setup the dependency between + // sync points + void LoadDependency(const std::vector& dependencies); + + // enable sync point processing (disabled on startup) + void EnableProcessing(); + + // disable sync point processing + void DisableProcessing(); + + // remove the execution trace of all sync points + void ClearTrace(); + + // triggered by TEST_SYNC_POINT, blocking execution until all predecessors + // are executed. + void Process(const std::string& point); + + // TODO: it might be useful to provide a function that blocks until all + // sync points are cleared. + + private: + SyncPoint(); + + bool PredecessorsAllCleared(const std::string& point); + + // successor/predecessor map loaded from LoadDependency + std::unordered_map > successors_; + std::unordered_map > predecessors_; + + Mutex mutex_; + ConditionVariable cv_; + // sync points that have been passed through + std::unordered_set cleared_points_; + bool enabled_; +}; + +} // namespace kudu + +// Use TEST_SYNC_POINT to specify sync points inside code base. +// Sync points can have happens-after depedency on other sync points, +// configured at runtime via SyncPoint::LoadDependency. This could be +// utilized to re-produce race conditions between threads. +// TEST_SYNC_POINT is no op in release build. 
+#define TEST_SYNC_POINT(x) kudu::SyncPoint::GetInstance()->Process(x) +#endif // NDEBUG diff --git a/src/kudu/util/test_graph.cc b/src/kudu/util/test_graph.cc new file mode 100644 index 000000000000..c3dc9e19a176 --- /dev/null +++ b/src/kudu/util/test_graph.cc @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/stringprintf.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/locks.h" +#include "kudu/util/status.h" +#include "kudu/util/test_graph.h" +#include "kudu/util/thread.h" + +using std::shared_ptr; +using std::string; + +namespace kudu { + +void TimeSeries::AddValue(double val) { + lock_guard l(&lock_); + val_ += val; +} + +void TimeSeries::SetValue(double val) { + lock_guard l(&lock_); + val_ = val; +} + +double TimeSeries::value() const { + lock_guard l(&lock_); + return val_; +} + +TimeSeriesCollector::~TimeSeriesCollector() { + if (started_) { + StopDumperThread(); + } +} + +shared_ptr TimeSeriesCollector::GetTimeSeries(const string &key) { + MutexLock l(series_lock_); + SeriesMap::const_iterator it = series_map_.find(key); + if (it == series_map_.end()) { + shared_ptr ts(new TimeSeries()); + series_map_[key] = ts; + return ts; + } else { + return (*it).second; + } +} + +void TimeSeriesCollector::StartDumperThread() { + LOG(INFO) << "Starting metrics dumper"; + CHECK(!started_); + exit_latch_.Reset(1); + started_ = true; + CHECK_OK(kudu::Thread::Create("time series", "dumper", + &TimeSeriesCollector::DumperThread, this, &dumper_thread_)); +} + +void TimeSeriesCollector::StopDumperThread() { + CHECK(started_); + exit_latch_.CountDown(); + CHECK_OK(ThreadJoiner(dumper_thread_.get()).Join()); + started_ = false; +} + +void TimeSeriesCollector::DumperThread() { + CHECK(started_); + WallTime start_time = WallTime_Now(); + + faststring metrics_str; + while (true) { + metrics_str.clear(); + metrics_str.append("metrics: "); + BuildMetricsString(WallTime_Now() - start_time, &metrics_str); + LOG(INFO) << metrics_str.ToString(); + + // Sleep until next dump time, or return if we should exit + if (exit_latch_.WaitFor(MonoDelta::FromMilliseconds(250))) { + return; + } + } +} + +void TimeSeriesCollector::BuildMetricsString( + WallTime time_since_start, faststring *dst_buf) const { + MutexLock 
l(series_lock_); + + dst_buf->append(StringPrintf("{ \"scope\": \"%s\", \"time\": %.3f", + scope_.c_str(), time_since_start)); + + for (SeriesMap::const_reference entry : series_map_) { + dst_buf->append(StringPrintf(", \"%s\": %.3f", + entry.first.c_str(), entry.second->value())); + } + dst_buf->append("}"); +} + + +} // namespace kudu diff --git a/src/kudu/util/test_graph.h b/src/kudu/util/test_graph.h new file mode 100644 index 000000000000..6ea49ca6d777 --- /dev/null +++ b/src/kudu/util/test_graph.h @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#ifndef KUDU_TEST_GRAPH_COLLECTOR_H +#define KUDU_TEST_GRAPH_COLLECTOR_H + +#include +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/faststring.h" +#include "kudu/util/locks.h" +#include "kudu/util/thread.h" + +namespace kudu { + +class TimeSeries { + public: + void AddValue(double val); + void SetValue(double val); + + double value() const; + + private: + friend class TimeSeriesCollector; + + DISALLOW_COPY_AND_ASSIGN(TimeSeries); + + TimeSeries() : + val_(0) + {} + + mutable simple_spinlock lock_; + double val_; +}; + +class TimeSeriesCollector { + public: + explicit TimeSeriesCollector(std::string scope) + : scope_(std::move(scope)), exit_latch_(0), started_(false) {} + + ~TimeSeriesCollector(); + + std::shared_ptr GetTimeSeries(const std::string &key); + void StartDumperThread(); + void StopDumperThread(); + + private: + DISALLOW_COPY_AND_ASSIGN(TimeSeriesCollector); + + void DumperThread(); + void BuildMetricsString(WallTime time_since_start, faststring *dst_buf) const; + + std::string scope_; + + typedef std::unordered_map > SeriesMap; + SeriesMap series_map_; + mutable Mutex series_lock_; + + scoped_refptr dumper_thread_; + + // Latch used to stop the dumper_thread_. When the thread is started, + // this is set to 1, and when the thread should exit, it is counted down. + CountDownLatch exit_latch_; + + bool started_; +}; + +} // namespace kudu +#endif diff --git a/src/kudu/util/test_macros.h b/src/kudu/util/test_macros.h new file mode 100644 index 000000000000..8ef7af5c1656 --- /dev/null +++ b/src/kudu/util/test_macros.h @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_TEST_MACROS_H +#define KUDU_UTIL_TEST_MACROS_H + +#include + +// ASSERT_NO_FATAL_FAILURE is just too long to type. +#define NO_FATALS ASSERT_NO_FATAL_FAILURE + +#define ASSERT_OK(status) do { \ + Status _s = status; \ + if (_s.ok()) { \ + SUCCEED(); \ + } else { \ + FAIL() << "Bad status: " << _s.ToString(); \ + } \ + } while (0); + +#define EXPECT_OK(status) do { \ + Status _s = status; \ + if (_s.ok()) { \ + SUCCEED(); \ + } else { \ + ADD_FAILURE() << "Bad status: " << _s.ToString(); \ + } \ + } while (0); + +// Like the above, but doesn't record successful +// tests. +#define ASSERT_OK_FAST(status) do { \ + Status _s = status; \ + if (!_s.ok()) { \ + FAIL() << "Bad status: " << _s.ToString(); \ + } \ + } while (0); + +#define ASSERT_STR_CONTAINS(str, substr) do { \ + std::string _s = (str); \ + if (_s.find((substr)) == std::string::npos) { \ + FAIL() << "Expected to find substring '" << (substr) \ + << "'. 
Got: '" << _s << "'"; \ + } \ + } while (0); + +#define ASSERT_FILE_EXISTS(env, path) do { \ + std::string _s = path; \ + ASSERT_TRUE(env->FileExists(_s)) \ + << "Expected file to exist: " << _s; \ + } while (0); + +#define ASSERT_FILE_NOT_EXISTS(env, path) do { \ + std::string _s = path; \ + ASSERT_FALSE(env->FileExists(_s)) \ + << "Expected file not to exist: " << _s; \ + } while (0); + +#define CURRENT_TEST_NAME() \ + ::testing::UnitTest::GetInstance()->current_test_info()->name() + +#define CURRENT_TEST_CASE_NAME() \ + ::testing::UnitTest::GetInstance()->current_test_info()->test_case_name() + +#endif diff --git a/src/kudu/util/test_main.cc b/src/kudu/util/test_main.cc new file mode 100644 index 000000000000..dd9af5f5d9ee --- /dev/null +++ b/src/kudu/util/test_main.cc @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "kudu/util/pstack_watcher.h" +#include "kudu/util/flags.h" +#include "kudu/util/status.h" + +DEFINE_int32(test_timeout_after, 0, + "Maximum total seconds allowed for all unit tests in the suite. Default: disabled"); + +// Start timer that kills the process if --test_timeout_after is exceeded before +// the tests complete. 
+static void CreateAndStartTimer(); + +// Gracefully kill the process. +static void KillTestOnTimeout(int signum); + +int main(int argc, char **argv) { + google::InstallFailureSignalHandler(); + // InitGoogleTest() must precede ParseCommandLineFlags(), as the former + // removes gtest-related flags from argv that would trip up the latter. + ::testing::InitGoogleTest(&argc, argv); + kudu::ParseCommandLineFlags(&argc, &argv, true); + + // Create the test-timeout timer. + CreateAndStartTimer(); + + int ret = RUN_ALL_TESTS(); + + return ret; +} + +static void CreateAndStartTimer() { + struct sigaction action; + struct itimerval timer; + + // Create the test-timeout timer. + memset(&action, 0, sizeof(action)); + action.sa_handler = &KillTestOnTimeout; + CHECK_ERR(sigaction(SIGALRM, &action, nullptr)) << "Unable to set timeout action"; + + timer.it_interval.tv_sec = 0; // No repeat. + timer.it_interval.tv_usec = 0; + timer.it_value.tv_sec = FLAGS_test_timeout_after; // Fire in timeout seconds. + timer.it_value.tv_usec = 0; + + CHECK_ERR(setitimer(ITIMER_REAL, &timer, nullptr)) << "Unable to set timeout timer"; +} + +static void KillTestOnTimeout(int signum) { + // Dump a pstack to stdout. + WARN_NOT_OK(kudu::PstackWatcher::DumpStacks(), "Unable to print pstack"); + + // ...and abort. + LOG(FATAL) << "Maximum unit test time exceeded (" << FLAGS_test_timeout_after << " sec)"; +} diff --git a/src/kudu/util/test_util.cc b/src/kudu/util/test_util.cc new file mode 100644 index 000000000000..51746ee496db --- /dev/null +++ b/src/kudu/util/test_util.cc @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/util/test_util.h" + +#include +#include + +#include "kudu/gutil/strings/strcat.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/strings/util.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/env.h" +#include "kudu/util/path_util.h" +#include "kudu/util/random.h" +#include "kudu/util/spinlock_profiling.h" + +DEFINE_string(test_leave_files, "on_failure", + "Whether to leave test files around after the test run. " + " Valid values are 'always', 'on_failure', or 'never'"); + +DEFINE_int32(test_random_seed, 0, "Random seed to use for randomized tests"); + +using std::string; +using strings::Substitute; + +namespace kudu { + +static const char* const kSlowTestsEnvVariable = "KUDU_ALLOW_SLOW_TESTS"; + +static const uint64 kTestBeganAtMicros = Env::Default()->NowMicros(); + +/////////////////////////////////////////////////// +// KuduTest +/////////////////////////////////////////////////// + +KuduTest::KuduTest() + : env_(new EnvWrapper(Env::Default())), + test_dir_(GetTestDataDirectory()) { +} + +// env passed in from subclass, for tests that run in-memory +KuduTest::KuduTest(Env *env) + : env_(env), + test_dir_(GetTestDataDirectory()) { +} + +KuduTest::~KuduTest() { + // Clean up the test directory in the destructor instead of a TearDown + // method. This is better because it ensures that the child-class + // dtor runs first -- so, if the child class is using a minicluster, etc, + // we will shut that down before we remove files underneath. 
+ if (FLAGS_test_leave_files == "always") { + LOG(INFO) << "-----------------------------------------------"; + LOG(INFO) << "--test_leave_files specified, leaving files in " << test_dir_; + } else if (FLAGS_test_leave_files == "on_failure" && HasFatalFailure()) { + LOG(INFO) << "-----------------------------------------------"; + LOG(INFO) << "Had fatal failures, leaving test files at " << test_dir_; + } else { + VLOG(1) << "Cleaning up temporary test files..."; + WARN_NOT_OK(env_->DeleteRecursively(test_dir_), + "Couldn't remove test files"); + } +} + +void KuduTest::SetUp() { + InitSpinLockContentionProfiling(); +} + +string KuduTest::GetTestPath(const string& relative_path) { + CHECK(!test_dir_.empty()) << "Call SetUp() first"; + return JoinPathSegments(test_dir_, relative_path); +} + +/////////////////////////////////////////////////// +// Test utility functions +/////////////////////////////////////////////////// + +bool AllowSlowTests() { + char *e = getenv(kSlowTestsEnvVariable); + if ((e == nullptr) || + (strlen(e) == 0) || + (strcasecmp(e, "false") == 0) || + (strcasecmp(e, "0") == 0) || + (strcasecmp(e, "no") == 0)) { + return false; + } + if ((strcasecmp(e, "true") == 0) || + (strcasecmp(e, "1") == 0) || + (strcasecmp(e, "yes") == 0)) { + return true; + } + LOG(FATAL) << "Unrecognized value for " << kSlowTestsEnvVariable << ": " << e; + return false; +} + +void OverrideFlagForSlowTests(const std::string& flag_name, + const std::string& new_value) { + // Ensure that the flag is valid. + google::GetCommandLineFlagInfoOrDie(flag_name.c_str()); + + // If we're not running slow tests, don't override it. 
+ if (!AllowSlowTests()) { + return; + } + google::SetCommandLineOptionWithMode(flag_name.c_str(), new_value.c_str(), + google::SET_FLAG_IF_DEFAULT); +} + +int SeedRandom() { + int seed; + // Initialize random seed + if (FLAGS_test_random_seed == 0) { + // Not specified by user + seed = static_cast(GetCurrentTimeMicros()); + } else { + seed = FLAGS_test_random_seed; + } + LOG(INFO) << "Using random seed: " << seed; + srand(seed); + return seed; +} + +string GetTestDataDirectory() { + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + CHECK(test_info) << "Must be running in a gtest unit test to call this function"; + string dir; + CHECK_OK(Env::Default()->GetTestDirectory(&dir)); + + // The directory name includes some strings for specific reasons: + // - program name: identifies the directory to the test invoker + // - timestamp and pid: disambiguates with prior runs of the same test + // + // e.g. "env-test.TestEnv.TestReadFully.1409169025392361-23600" + dir += Substitute("/$0.$1.$2.$3-$4", + StringReplace(google::ProgramInvocationShortName(), "/", "_", true), + StringReplace(test_info->test_case_name(), "/", "_", true), + StringReplace(test_info->name(), "/", "_", true), + kTestBeganAtMicros, + getpid()); + Status s = Env::Default()->CreateDir(dir); + CHECK(s.IsAlreadyPresent() || s.ok()) + << "Could not create directory " << dir << ": " << s.ToString(); + if (s.ok()) { + string metadata; + + StrAppend(&metadata, Substitute("PID=$0\n", getpid())); + + StrAppend(&metadata, Substitute("PPID=$0\n", getppid())); + + char* jenkins_build_id = getenv("BUILD_ID"); + if (jenkins_build_id) { + StrAppend(&metadata, Substitute("BUILD_ID=$0\n", jenkins_build_id)); + } + + CHECK_OK(WriteStringToFile(Env::Default(), metadata, + Substitute("$0/test_metadata", dir))); + } + return dir; +} + +} // namespace kudu diff --git a/src/kudu/util/test_util.h b/src/kudu/util/test_util.h new file mode 100644 index 
000000000000..a5452335676e --- /dev/null +++ b/src/kudu/util/test_util.h @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Base test class, with various utility functions. +#ifndef KUDU_UTIL_TEST_UTIL_H +#define KUDU_UTIL_TEST_UTIL_H + +#include +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/env.h" +#include "kudu/util/test_macros.h" + +namespace kudu { + +class KuduTest : public ::testing::Test { + public: + KuduTest(); + + // Env passed in from subclass, for tests that run in-memory. + explicit KuduTest(Env *env); + + virtual ~KuduTest(); + + virtual void SetUp() OVERRIDE; + + protected: + // Returns absolute path based on a unit test-specific work directory, given + // a relative path. Useful for writing test files that should be deleted after + // the test ends. + std::string GetTestPath(const std::string& relative_path); + + gscoped_ptr env_; + google::FlagSaver flag_saver_; // Reset flags on every test. + + private: + std::string test_dir_; +}; + +// Returns true if slow tests are runtime-enabled. 
+bool AllowSlowTests(); + +// Override the given gflag to the new value, only in the case that +// slow tests are enabled and the user hasn't otherwise overridden +// it on the command line. +// Example usage: +// +// OverrideFlagForSlowTests( +// "client_inserts_per_thread", +// strings::Substitute("$0", FLAGS_client_inserts_per_thread * 100)); +// +void OverrideFlagForSlowTests(const std::string& flag_name, + const std::string& new_value); + +// Call srand() with a random seed based on the current time, reporting +// that seed to the logs. The time-based seed may be overridden by passing +// --test_random_seed= from the CLI in order to reproduce a failed randomized +// test. Returns the seed. +int SeedRandom(); + +// Return a per-test directory in which to store test data. Guaranteed to +// return the same directory every time for a given unit test. +// +// May only be called from within a gtest unit test. +std::string GetTestDataDirectory(); + +} // namespace kudu +#endif diff --git a/src/kudu/util/thread-test.cc b/src/kudu/util/thread-test.cc new file mode 100644 index 000000000000..a52f2d46b2fa --- /dev/null +++ b/src/kudu/util/thread-test.cc @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/thread.h" + +#include +#include + +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/env.h" +#include "kudu/util/test_util.h" +#include "kudu/util/thread_restrictions.h" + +using std::string; + +namespace kudu { + +class ThreadTest : public KuduTest {}; + +// Join with a thread and emit warnings while waiting to join. +// This has to be manually verified. +TEST_F(ThreadTest, TestJoinAndWarn) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test in quick test mode, since this sleeps"; + return; + } + + scoped_refptr holder; + ASSERT_OK(Thread::Create("test", "sleeper thread", usleep, 1000*1000, &holder)); + ASSERT_OK(ThreadJoiner(holder.get()) + .warn_after_ms(10) + .warn_every_ms(100) + .Join()); +} + +TEST_F(ThreadTest, TestFailedJoin) { + if (!AllowSlowTests()) { + LOG(INFO) << "Skipping test in quick test mode, since this sleeps"; + return; + } + + scoped_refptr holder; + ASSERT_OK(Thread::Create("test", "sleeper thread", usleep, 1000*1000, &holder)); + Status s = ThreadJoiner(holder.get()) + .give_up_after_ms(50) + .Join(); + ASSERT_STR_CONTAINS(s.ToString(), "Timed out after 50ms joining on sleeper thread"); +} + +static void TryJoinOnSelf() { + Status s = ThreadJoiner(Thread::current_thread()).Join(); + // Use CHECK instead of ASSERT because gtest isn't thread-safe. + CHECK(s.IsInvalidArgument()); +} + +// Try to join on the thread that is currently running. +TEST_F(ThreadTest, TestJoinOnSelf) { + scoped_refptr holder; + ASSERT_OK(Thread::Create("test", "test", TryJoinOnSelf, &holder)); + holder->Join(); + // Actual assertion is done by the thread spawned above. 
+} + +TEST_F(ThreadTest, TestDoubleJoinIsNoOp) { + scoped_refptr holder; + ASSERT_OK(Thread::Create("test", "sleeper thread", usleep, 0, &holder)); + ThreadJoiner joiner(holder.get()); + ASSERT_OK(joiner.Join()); + ASSERT_OK(joiner.Join()); +} + + +namespace { + +void ExitHandler(string* s, const char* to_append) { + *s += to_append; +} + +void CallAtExitThread(string* s) { + Thread::current_thread()->CallAtExit(Bind(&ExitHandler, s, Unretained("hello 1, "))); + Thread::current_thread()->CallAtExit(Bind(&ExitHandler, s, Unretained("hello 2"))); +} + +} // anonymous namespace + +TEST_F(ThreadTest, TestCallOnExit) { + scoped_refptr holder; + string s; + ASSERT_OK(Thread::Create("test", "TestCallOnExit", CallAtExitThread, &s, &holder)); + holder->Join(); + ASSERT_EQ("hello 1, hello 2", s); +} + +// The following tests only run in debug mode, since thread restrictions are no-ops +// in release builds. +#ifndef NDEBUG +TEST_F(ThreadTest, TestThreadRestrictions_IO) { + // Default should be to allow IO + ThreadRestrictions::AssertIOAllowed(); + + ThreadRestrictions::SetIOAllowed(false); + { + ThreadRestrictions::ScopedAllowIO allow_io; + ASSERT_TRUE(Env::Default()->FileExists("/")); + } + ThreadRestrictions::SetIOAllowed(true); + + // Disallow IO - doing IO should crash the process. + ASSERT_DEATH({ + ThreadRestrictions::SetIOAllowed(false); + ignore_result(Env::Default()->FileExists("/")); + }, + "Function marked as IO-only was called from a thread that disallows IO"); +} + +TEST_F(ThreadTest, TestThreadRestrictions_Waiting) { + // Default should be to allow IO + ThreadRestrictions::AssertWaitAllowed(); + + ThreadRestrictions::SetWaitAllowed(false); + { + ThreadRestrictions::ScopedAllowWait allow_wait; + CountDownLatch l(0); + l.Wait(); + } + ThreadRestrictions::SetWaitAllowed(true); + + // Disallow waiting - blocking on a latch should crash the process. 
+ ASSERT_DEATH({ + ThreadRestrictions::SetWaitAllowed(false); + CountDownLatch l(0); + l.Wait(); + }, + "Waiting is not allowed to be used on this thread"); +} +#endif // NDEBUG + +} // namespace kudu diff --git a/src/kudu/util/thread.cc b/src/kudu/util/thread.cc new file mode 100644 index 000000000000..502620862f33 --- /dev/null +++ b/src/kudu/util/thread.cc @@ -0,0 +1,606 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Copied from Impala and adapted to Kudu. 
+ +#include "kudu/util/thread.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__linux__) +#include +#endif // defined(__linux__) + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/dynamic_annotations.h" +#include "kudu/gutil/mathlimits.h" +#include "kudu/gutil/once.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/debug-util.h" +#include "kudu/util/errno.h" +#include "kudu/util/logging.h" +#include "kudu/util/metrics.h" +#include "kudu/util/mutex.h" +#include "kudu/util/os-util.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/url-coding.h" +#include "kudu/util/web_callback_registry.h" + +using boost::bind; +using boost::mem_fn; +using std::endl; +using std::map; +using std::shared_ptr; +using std::stringstream; +using strings::Substitute; + +METRIC_DEFINE_gauge_uint64(server, threads_started, + "Threads Started", + kudu::MetricUnit::kThreads, + "Total number of threads started on this server", + kudu::EXPOSE_AS_COUNTER); + +METRIC_DEFINE_gauge_uint64(server, threads_running, + "Threads Running", + kudu::MetricUnit::kThreads, + "Current number of running threads"); + +METRIC_DEFINE_gauge_uint64(server, cpu_utime, + "User CPU Time", + kudu::MetricUnit::kMilliseconds, + "Total user CPU time of the process", + kudu::EXPOSE_AS_COUNTER); + +METRIC_DEFINE_gauge_uint64(server, cpu_stime, + "System CPU Time", + kudu::MetricUnit::kMilliseconds, + "Total system CPU time of the process", + kudu::EXPOSE_AS_COUNTER); + +METRIC_DEFINE_gauge_uint64(server, voluntary_context_switches, + "Voluntary Context Switches", + kudu::MetricUnit::kContextSwitches, + "Total voluntary context switches", + kudu::EXPOSE_AS_COUNTER); + +METRIC_DEFINE_gauge_uint64(server, involuntary_context_switches, + "Involuntary Context Switches", + kudu::MetricUnit::kContextSwitches, + "Total involuntary context switches", + kudu::EXPOSE_AS_COUNTER); + +namespace kudu { + +static uint64_t GetCpuUTime() { 
+ rusage ru; + CHECK_ERR(getrusage(RUSAGE_SELF, &ru)); + return ru.ru_utime.tv_sec * 1000UL + ru.ru_utime.tv_usec / 1000UL; +} + +static uint64_t GetCpuSTime() { + rusage ru; + CHECK_ERR(getrusage(RUSAGE_SELF, &ru)); + return ru.ru_stime.tv_sec * 1000UL + ru.ru_stime.tv_usec / 1000UL; +} + +static uint64_t GetVoluntaryContextSwitches() { + rusage ru; + CHECK_ERR(getrusage(RUSAGE_SELF, &ru)); + return ru.ru_nvcsw;; +} + +static uint64_t GetInVoluntaryContextSwitches() { + rusage ru; + CHECK_ERR(getrusage(RUSAGE_SELF, &ru)); + return ru.ru_nivcsw; +} + +class ThreadMgr; + +__thread Thread* Thread::tls_ = NULL; + +// Singleton instance of ThreadMgr. Only visible in this file, used only by Thread. +// The Thread class adds a reference to thread_manager while it is supervising a thread so +// that a race between the end of the process's main thread (and therefore the destruction +// of thread_manager) and the end of a thread that tries to remove itself from the +// manager after the destruction can be avoided. +static shared_ptr thread_manager; + +// Controls the single (lazy) initialization of thread_manager. +static GoogleOnceType once = GOOGLE_ONCE_INIT; + +// A singleton class that tracks all live threads, and groups them together for easy +// auditing. Used only by Thread. +class ThreadMgr { + public: + ThreadMgr() + : metrics_enabled_(false), + threads_started_metric_(0), + threads_running_metric_(0) { + } + + ~ThreadMgr() { + MutexLock l(lock_); + thread_categories_.clear(); + } + + static void SetThreadName(const std::string& name, int64 tid); + + Status StartInstrumentation(const scoped_refptr& metrics, WebCallbackRegistry* web); + + // Registers a thread to the supplied category. The key is a pthread_t, + // not the system TID, since pthread_t is less prone to being recycled. + void AddThread(const pthread_t& pthread_id, const string& name, const string& category, + int64_t tid); + + // Removes a thread from the supplied category. 
If the thread has + // already been removed, this is a no-op. + void RemoveThread(const pthread_t& pthread_id, const string& category); + + private: + // Container class for any details we want to capture about a thread + // TODO: Add start-time. + // TODO: Track fragment ID. + class ThreadDescriptor { + public: + ThreadDescriptor() { } + ThreadDescriptor(string category, string name, int64_t thread_id) + : name_(std::move(name)), + category_(std::move(category)), + thread_id_(thread_id) {} + + const string& name() const { return name_; } + const string& category() const { return category_; } + int64_t thread_id() const { return thread_id_; } + + private: + string name_; + string category_; + int64_t thread_id_; + }; + + // A ThreadCategory is a set of threads that are logically related. + // TODO: unordered_map is incompatible with pthread_t, but would be more + // efficient here. + typedef map ThreadCategory; + + // All thread categorys, keyed on the category name. + typedef map ThreadCategoryMap; + + // Protects thread_categories_ and metrics_enabled_ + Mutex lock_; + + // All thread categorys that ever contained a thread, even if empty + ThreadCategoryMap thread_categories_; + + // True after StartInstrumentation(..) returns + bool metrics_enabled_; + + // Counters to track all-time total number of threads, and the + // current number of running threads. + uint64_t threads_started_metric_; + uint64_t threads_running_metric_; + + // Metric callbacks. + uint64_t ReadThreadsStarted(); + uint64_t ReadThreadsRunning(); + + // Webpage callback; prints all threads by category + void ThreadPathHandler(const WebCallbackRegistry::WebRequest& args, stringstream* output); + void PrintThreadCategoryRows(const ThreadCategory& category, stringstream* output); +}; + +void ThreadMgr::SetThreadName(const string& name, int64 tid) { + // On linux we can get the thread names to show up in the debugger by setting + // the process name for the LWP. 
We don't want to do this for the main + // thread because that would rename the process, causing tools like killall + // to stop working. + if (tid == getpid()) { + return; + } + +#if defined(__linux__) + // http://0pointer.de/blog/projects/name-your-threads.html + // Set the name for the LWP (which gets truncated to 15 characters). + // Note that glibc also has a 'pthread_setname_np' api, but it may not be + // available everywhere and it's only benefit over using prctl directly is + // that it can set the name of threads other than the current thread. + int err = prctl(PR_SET_NAME, name.c_str()); +#else + int err = pthread_setname_np(name.c_str()); +#endif // defined(__linux__) + // We expect EPERM failures in sandboxed processes, just ignore those. + if (err < 0 && errno != EPERM) { + PLOG(ERROR) << "SetThreadName"; + } +} + +Status ThreadMgr::StartInstrumentation(const scoped_refptr& metrics, + WebCallbackRegistry* web) { + MutexLock l(lock_); + metrics_enabled_ = true; + + // Use function gauges here so that we can register a unique copy of these metrics in + // multiple tservers, even though the ThreadMgr is itself a singleton. 
+ metrics->NeverRetire( + METRIC_threads_started.InstantiateFunctionGauge(metrics, + Bind(&ThreadMgr::ReadThreadsStarted, Unretained(this)))); + metrics->NeverRetire( + METRIC_threads_running.InstantiateFunctionGauge(metrics, + Bind(&ThreadMgr::ReadThreadsRunning, Unretained(this)))); + metrics->NeverRetire( + METRIC_cpu_utime.InstantiateFunctionGauge(metrics, + Bind(&GetCpuUTime))); + metrics->NeverRetire( + METRIC_cpu_stime.InstantiateFunctionGauge(metrics, + Bind(&GetCpuSTime))); + metrics->NeverRetire( + METRIC_voluntary_context_switches.InstantiateFunctionGauge(metrics, + Bind(&GetVoluntaryContextSwitches))); + metrics->NeverRetire( + METRIC_involuntary_context_switches.InstantiateFunctionGauge(metrics, + Bind(&GetInVoluntaryContextSwitches))); + + WebCallbackRegistry::PathHandlerCallback thread_callback = + bind(mem_fn(&ThreadMgr::ThreadPathHandler), this, _1, _2); + DCHECK_NOTNULL(web)->RegisterPathHandler("/threadz", "Threads", thread_callback); + return Status::OK(); +} + +uint64_t ThreadMgr::ReadThreadsStarted() { + MutexLock l(lock_); + return threads_started_metric_; +} + +uint64_t ThreadMgr::ReadThreadsRunning() { + MutexLock l(lock_); + return threads_running_metric_; +} + +void ThreadMgr::AddThread(const pthread_t& pthread_id, const string& name, + const string& category, int64_t tid) { + // These annotations cause TSAN to ignore the synchronization on lock_ + // without causing the subsequent mutations to be treated as data races + // in and of themselves (that's what IGNORE_READS_AND_WRITES does). + // + // Why do we need them here and in SuperviseThread()? TSAN operates by + // observing synchronization events and using them to establish "happens + // before" relationships between threads. Where these relationships are + // not built, shared state access constitutes a data race. 
The + // synchronization events here, in RemoveThread(), and in + // SuperviseThread() may cause TSAN to establish a "happens before" + // relationship between thread functors, ignoring potential data races. + // The annotations prevent this from happening. + ANNOTATE_IGNORE_SYNC_BEGIN(); + ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN(); + { + MutexLock l(lock_); + thread_categories_[category][pthread_id] = ThreadDescriptor(category, name, tid); + if (metrics_enabled_) { + threads_running_metric_++; + threads_started_metric_++; + } + } + ANNOTATE_IGNORE_SYNC_END(); + ANNOTATE_IGNORE_READS_AND_WRITES_END(); +} + +void ThreadMgr::RemoveThread(const pthread_t& pthread_id, const string& category) { + ANNOTATE_IGNORE_SYNC_BEGIN(); + ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN(); + { + MutexLock l(lock_); + auto category_it = thread_categories_.find(category); + DCHECK(category_it != thread_categories_.end()); + category_it->second.erase(pthread_id); + if (metrics_enabled_) { + threads_running_metric_--; + } + } + ANNOTATE_IGNORE_SYNC_END(); + ANNOTATE_IGNORE_READS_AND_WRITES_END(); +} + +void ThreadMgr::PrintThreadCategoryRows(const ThreadCategory& category, + stringstream* output) { + for (const ThreadCategory::value_type& thread : category) { + ThreadStats stats; + Status status = GetThreadStats(thread.second.thread_id(), &stats); + if (!status.ok()) { + KLOG_EVERY_N(INFO, 100) << "Could not get per-thread statistics: " + << status.ToString(); + } + (*output) << "" << thread.second.name() << "" + << (static_cast(stats.user_ns) / 1e9) << "" + << (static_cast(stats.kernel_ns) / 1e9) << "" + << (static_cast(stats.iowait_ns) / 1e9) << ""; + } +} + +void ThreadMgr::ThreadPathHandler(const WebCallbackRegistry::WebRequest& req, + stringstream* output) { + MutexLock l(lock_); + vector categories_to_print; + auto category_name = req.parsed_args.find("group"); + if (category_name != req.parsed_args.end()) { + string group = EscapeForHtmlToString(category_name->second); + (*output) << "

    Thread Group: " << group << "

    " << endl; + if (group != "all") { + ThreadCategoryMap::const_iterator category = thread_categories_.find(group); + if (category == thread_categories_.end()) { + (*output) << "Thread group '" << group << "' not found" << endl; + return; + } + categories_to_print.push_back(&category->second); + (*output) << "

    " << category->first << " : " << category->second.size() + << "

    "; + } else { + for (const ThreadCategoryMap::value_type& category : thread_categories_) { + categories_to_print.push_back(&category.second); + } + (*output) << "

    All Threads :

    "; + } + + (*output) << ""; + (*output) << "" + << "" + << ""; + + for (const ThreadCategory* category : categories_to_print) { + PrintThreadCategoryRows(*category, output); + } + (*output) << "
    Thread nameCumulative User CPU(s)Cumulative Kernel CPU(s)Cumulative IO-wait(s)
    "; + } else { + (*output) << "

    Thread Groups

    "; + if (metrics_enabled_) { + (*output) << "

    " << threads_running_metric_ << " thread(s) running"; + } + (*output) << "

    All Threads

    "; + + for (const ThreadCategoryMap::value_type& category : thread_categories_) { + string category_arg; + UrlEncode(category.first, &category_arg); + (*output) << "

    " + << category.first << " : " << category.second.size() << "

    "; + } + } +} + +static void InitThreading() { + // Warm up the stack trace library. This avoids a race in libunwind initialization + // by making sure we initialize it before we start any other threads. + ignore_result(GetStackTraceHex()); + thread_manager.reset(new ThreadMgr()); +} + +Status StartThreadInstrumentation(const scoped_refptr& server_metrics, + WebCallbackRegistry* web) { + GoogleOnceInit(&once, &InitThreading); + return thread_manager->StartInstrumentation(server_metrics, web); +} + +ThreadJoiner::ThreadJoiner(Thread* thr) + : thread_(CHECK_NOTNULL(thr)), + warn_after_ms_(kDefaultWarnAfterMs), + warn_every_ms_(kDefaultWarnEveryMs), + give_up_after_ms_(kDefaultGiveUpAfterMs) { +} + +ThreadJoiner& ThreadJoiner::warn_after_ms(int ms) { + warn_after_ms_ = ms; + return *this; +} + +ThreadJoiner& ThreadJoiner::warn_every_ms(int ms) { + warn_every_ms_ = ms; + return *this; +} + +ThreadJoiner& ThreadJoiner::give_up_after_ms(int ms) { + give_up_after_ms_ = ms; + return *this; +} + +Status ThreadJoiner::Join() { + if (Thread::current_thread() && + Thread::current_thread()->tid() == thread_->tid()) { + return Status::InvalidArgument("Can't join on own thread", thread_->name_); + } + + // Early exit: double join is a no-op. 
+ if (!thread_->joinable_) { + return Status::OK(); + } + + int waited_ms = 0; + bool keep_trying = true; + while (keep_trying) { + if (waited_ms >= warn_after_ms_) { + LOG(WARNING) << Substitute("Waited for $0ms trying to join with $1 (tid $2)", + waited_ms, thread_->name_, thread_->tid_); + } + + int remaining_before_giveup = MathLimits::kMax; + if (give_up_after_ms_ != -1) { + remaining_before_giveup = give_up_after_ms_ - waited_ms; + } + + int remaining_before_next_warn = warn_every_ms_; + if (waited_ms < warn_after_ms_) { + remaining_before_next_warn = warn_after_ms_ - waited_ms; + } + + if (remaining_before_giveup < remaining_before_next_warn) { + keep_trying = false; + } + + int wait_for = std::min(remaining_before_giveup, remaining_before_next_warn); + + if (thread_->done_.WaitFor(MonoDelta::FromMilliseconds(wait_for))) { + // Unconditionally join before returning, to guarantee that any TLS + // has been destroyed (pthread_key_create() destructors only run + // after a pthread's user method has returned). 
+ int ret = pthread_join(thread_->thread_, NULL); + CHECK_EQ(ret, 0); + thread_->joinable_ = false; + return Status::OK(); + } + waited_ms += wait_for; + } + return Status::Aborted(strings::Substitute("Timed out after $0ms joining on $1", + waited_ms, thread_->name_)); +} + +Thread::~Thread() { + if (joinable_) { + int ret = pthread_detach(thread_); + CHECK_EQ(ret, 0); + } +} + +void Thread::CallAtExit(const Closure& cb) { + CHECK_EQ(Thread::current_thread(), this); + exit_callbacks_.push_back(cb); +} + +std::string Thread::ToString() const { + return Substitute("Thread $0 (name: \"$1\", category: \"$2\")", tid_, name_, category_); +} + +Status Thread::StartThread(const std::string& category, const std::string& name, + const ThreadFunctor& functor, scoped_refptr *holder) { + const string log_prefix = Substitute("$0 ($1) ", name, category); + SCOPED_LOG_SLOW_EXECUTION_PREFIX(WARNING, 500 /* ms */, log_prefix, "starting thread"); + + // Temporary reference for the duration of this function. + scoped_refptr t(new Thread(category, name, functor)); + + { + SCOPED_LOG_SLOW_EXECUTION_PREFIX(WARNING, 500 /* ms */, log_prefix, "creating pthread"); + int ret = pthread_create(&t->thread_, NULL, &Thread::SuperviseThread, t.get()); + if (ret) { + return Status::RuntimeError("Could not create thread", strerror(ret), ret); + } + } + + // The thread has been created and is now joinable. + // + // Why set this in the parent and not the child? Because only the parent + // (or someone communicating with the parent) can join, so joinable must + // be set before the parent returns. + t->joinable_ = true; + + // Optional, and only set if the thread was successfully created. + if (holder) { + *holder = t; + } + + // The tid_ member goes through the following states: + // 1 CHILD_WAITING_TID: the child has just been spawned and is waiting + // for the parent to finish writing to caller state (i.e. 'holder'). + // 2. 
PARENT_WAITING_TID: the parent has updated caller state and is now + // waiting for the child to write the tid. + // 3. : both the parent and the child are free to continue. If the + // value is INVALID_TID, the child could not discover its tid. + Release_Store(&t->tid_, PARENT_WAITING_TID); + { + SCOPED_LOG_SLOW_EXECUTION_PREFIX(WARNING, 500 /* ms */, log_prefix, + "waiting for new thread to publish its TID"); + int loop_count = 0; + while (Acquire_Load(&t->tid_) == PARENT_WAITING_TID) { + boost::detail::yield(loop_count++); + } + } + + VLOG(2) << "Started thread " << t->tid()<< " - " << category << ":" << name; + return Status::OK(); +} + +void* Thread::SuperviseThread(void* arg) { + Thread* t = static_cast(arg); + int64_t system_tid = Thread::CurrentThreadId(); + if (system_tid == -1) { + string error_msg = ErrnoToString(errno); + KLOG_EVERY_N(INFO, 100) << "Could not determine thread ID: " << error_msg; + } + string name = strings::Substitute("$0-$1", t->name(), system_tid); + + // Take an additional reference to the thread manager, which we'll need below. + GoogleOnceInit(&once, &InitThreading); + ANNOTATE_IGNORE_SYNC_BEGIN(); + shared_ptr thread_mgr_ref = thread_manager; + ANNOTATE_IGNORE_SYNC_END(); + + // Set up the TLS. + // + // We could store a scoped_refptr in the TLS itself, but as its + // lifecycle is poorly defined, we'll use a bare pointer and take an + // additional reference on t out of band, in thread_ref. + scoped_refptr thread_ref = t; + t->tls_ = t; + + // Wait until the parent has updated all caller-visible state, then write + // the TID to 'tid_', thus completing the parent<-->child handshake. 
+ int loop_count = 0; + while (Acquire_Load(&t->tid_) == CHILD_WAITING_TID) { + boost::detail::yield(loop_count++); + } + Release_Store(&t->tid_, system_tid); + + thread_manager->SetThreadName(name, t->tid()); + thread_manager->AddThread(pthread_self(), name, t->category(), t->tid()); + + // FinishThread() is guaranteed to run (even if functor_ throws an + // exception) because pthread_cleanup_push() creates a scoped object + // whose destructor invokes the provided callback. + pthread_cleanup_push(&Thread::FinishThread, t); + t->functor_(); + pthread_cleanup_pop(true); + + return NULL; +} + +void Thread::FinishThread(void* arg) { + Thread* t = static_cast(arg); + + for (Closure& c : t->exit_callbacks_) { + c.Run(); + } + + // We're here either because of the explicit pthread_cleanup_pop() in + // SuperviseThread() or through pthread_exit(). In either case, + // thread_manager is guaranteed to be live because thread_mgr_ref in + // SuperviseThread() is still live. + thread_manager->RemoveThread(pthread_self(), t->category()); + + // Signal any Joiner that we're done. + t->done_.CountDown(); + + VLOG(2) << "Ended thread " << t->tid() << " - " + << t->category() << ":" << t->name(); +} + +} // namespace kudu diff --git a/src/kudu/util/thread.h b/src/kudu/util/thread.h new file mode 100644 index 000000000000..1d33aa3b9f81 --- /dev/null +++ b/src/kudu/util/thread.h @@ -0,0 +1,343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Copied from Impala and adapted to Kudu. + +#ifndef KUDU_UTIL_THREAD_H +#define KUDU_UTIL_THREAD_H + +#include +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/async_util.h" +#include "kudu/util/status.h" + +namespace kudu { + +class MetricEntity; +class Thread; +class WebCallbackRegistry; + +// Utility to join on a thread, printing warning messages if it +// takes too long. For example: +// +// ThreadJoiner(&my_thread, "processing thread") +// .warn_after_ms(1000) +// .warn_every_ms(5000) +// .Join(); +// +// TODO: would be nice to offer a way to use ptrace() or signals to +// dump the stack trace of the thread we're trying to join on if it +// gets stuck. But, after looking for 20 minutes or so, it seems +// pretty complicated to get right. +class ThreadJoiner { + public: + explicit ThreadJoiner(Thread* thread); + + // Start emitting warnings after this many milliseconds. + // + // Default: 1000 ms. + ThreadJoiner& warn_after_ms(int ms); + + // After the warnings after started, emit another warning at the + // given interval. + // + // Default: 1000 ms. + ThreadJoiner& warn_every_ms(int ms); + + // If the thread has not stopped after this number of milliseconds, give up + // joining on it and return Status::Aborted. + // + // -1 (the default) means to wait forever trying to join. + ThreadJoiner& give_up_after_ms(int ms); + + // Join the thread, subject to the above parameters. 
If the thread joining + // fails for any reason, returns RuntimeError. If it times out, returns + // Aborted. + Status Join(); + + private: + enum { + kDefaultWarnAfterMs = 1000, + kDefaultWarnEveryMs = 1000, + kDefaultGiveUpAfterMs = -1 // forever + }; + + Thread* thread_; + + int warn_after_ms_; + int warn_every_ms_; + int give_up_after_ms_; + + DISALLOW_COPY_AND_ASSIGN(ThreadJoiner); +}; + +// Thin wrapper around pthread that can register itself with the singleton ThreadMgr +// (a private class implemented in thread.cc entirely, which tracks all live threads so +// that they may be monitored via the debug webpages). This class has a limited subset of +// boost::thread's API. Construction is almost the same, but clients must supply a +// category and a name for each thread so that they can be identified in the debug web +// UI. Otherwise, Join() is the only supported method from boost::thread. +// +// Each Thread object knows its operating system thread ID (TID), which can be used to +// attach debuggers to specific threads, to retrieve resource-usage statistics from the +// operating system, and to assign threads to resource control groups. +// +// Threads are shared objects, but in a degenerate way. They may only have +// up to two referents: the caller that created the thread (parent), and +// the thread itself (child). Moreover, the only two methods to mutate state +// (Join() and the destructor) are constrained: the child may not Join() on +// itself, and the destructor is only run when there's one referent left. +// These constraints allow us to access thread internals without any locks. +// +// TODO: Consider allowing fragment IDs as category parameters. +class Thread : public RefCountedThreadSafe { + public: + // This constructor pattern mimics that in boost::thread. There is + // one constructor for each number of arguments that the thread + // function accepts. To extend the set of acceptable signatures, add + // another constructor with . 
+ // + // In general: + // - category: string identifying the thread category to which this thread belongs, + // used for organising threads together on the debug UI. + // - name: name of this thread. Will be appended with "-" to ensure + // uniqueness. + // - F - a method type that supports operator(), and the instance passed to the + // constructor is executed immediately in a separate thread. + // - A1...An - argument types whose instances are passed to f(...) + // - holder - optional shared pointer to hold a reference to the created thread. + template + static Status Create(const std::string& category, const std::string& name, const F& f, + scoped_refptr* holder) { + return StartThread(category, name, f, holder); + } + + template + static Status Create(const std::string& category, const std::string& name, const F& f, + const A1& a1, scoped_refptr* holder) { + return StartThread(category, name, boost::bind(f, a1), holder); + } + + template + static Status Create(const std::string& category, const std::string& name, const F& f, + const A1& a1, const A2& a2, scoped_refptr* holder) { + return StartThread(category, name, boost::bind(f, a1, a2), holder); + } + + template + static Status Create(const std::string& category, const std::string& name, const F& f, + const A1& a1, const A2& a2, const A3& a3, scoped_refptr* holder) { + return StartThread(category, name, boost::bind(f, a1, a2, a3), holder); + } + + template + static Status Create(const std::string& category, const std::string& name, const F& f, + const A1& a1, const A2& a2, const A3& a3, const A4& a4, + scoped_refptr* holder) { + return StartThread(category, name, boost::bind(f, a1, a2, a3, a4), holder); + } + + template + static Status Create(const std::string& category, const std::string& name, const F& f, + const A1& a1, const A2& a2, const A3& a3, const A4& a4, const A5& a5, + scoped_refptr* holder) { + return StartThread(category, name, boost::bind(f, a1, a2, a3, a4, a5), holder); + } + + template + 
static Status Create(const std::string& category, const std::string& name, const F& f, + const A1& a1, const A2& a2, const A3& a3, const A4& a4, const A5& a5, + const A6& a6, scoped_refptr* holder) { + return StartThread(category, name, boost::bind(f, a1, a2, a3, a4, a5, a6), holder); + } + + // Emulates boost::thread and detaches. + ~Thread(); + + // Blocks until this thread finishes execution. Once this method returns, the thread + // will be unregistered with the ThreadMgr and will not appear in the debug UI. + void Join() { ThreadJoiner(this).Join(); } + + // Call the given Closure on the thread before it exits. The closures are executed + // in the order they are added. + // + // NOTE: This must only be called on the currently executing thread, to avoid having + // to reason about complicated races (eg registering a callback on an already-dead + // thread). + // + // This callback is guaranteed to be called except in the case of a process crash. + void CallAtExit(const Closure& cb); + + // The thread ID assigned to this thread by the operating system. If the OS does not + // support retrieving the tid, returns Thread::INVALID_TID. + int64_t tid() const { return tid_; } + + // Returns the thread's pthread ID. + pthread_t pthread_id() const { return thread_; } + + const std::string& name() const { return name_; } + const std::string& category() const { return category_; } + + // Return a string representation of the thread identifying information. + std::string ToString() const; + + // The current thread of execution, or NULL if the current thread isn't a kudu::Thread. + // This call is signal-safe. + static Thread* current_thread() { return tls_; } + + // Returns a unique, stable identifier for this thread. Note that this is a static + // method and thus can be used on any thread, including the main thread of the + // process. 
+ // + // In general, this should be used when a value is required that is unique to + // a thread and must work on any thread including the main process thread. + // + // NOTE: this is _not_ the TID, but rather a unique value assigned by the + // thread implementation. So, this value should not be presented to the user + // in log messages, etc. + static int64_t UniqueThreadId() { +#if defined(__linux__) + // This cast is a little bit ugly, but it is significantly faster than + // calling syscall(SYS_gettid). In particular, this speeds up some code + // paths in the tracing implementation. + return static_cast(pthread_self()); +#elif defined(__APPLE__) + uint64_t tid; + CHECK_EQ(0, pthread_threadid_np(NULL, &tid)); + return tid; +#else +#error Unsupported platform +#endif + } + + // Returns the system thread ID (tid on Linux) for the current thread. Note + // that this is a static method and thus can be used from any thread, + // including the main thread of the process. This is in contrast to + // Thread::tid(), which only works on kudu::Threads. + // + // Thread::tid() will return the same value, but the value is cached in the + // Thread object, so will be faster to call. + // + // Thread::UniqueThreadId() (or Thread::tid()) should be preferred for + // performance sensistive code, however it is only guaranteed to return a + // unique and stable thread ID, not necessarily the system thread ID. + static int64_t CurrentThreadId() { +#if defined(__linux__) + return syscall(SYS_gettid); +#else + return UniqueThreadId(); +#endif + } + + private: + friend class ThreadJoiner; + + // The various special values for tid_ that describe the various steps + // in the parent<-->child handshake. + enum { + INVALID_TID = -1, + CHILD_WAITING_TID = -2, + PARENT_WAITING_TID = -3, + }; + + // Function object that wraps the user-supplied function to run in a separate thread. 
+ typedef boost::function ThreadFunctor; + + Thread(std::string category, std::string name, ThreadFunctor functor) + : thread_(0), + category_(std::move(category)), + name_(std::move(name)), + tid_(CHILD_WAITING_TID), + functor_(std::move(functor)), + done_(1), + joinable_(false) {} + + // Library-specific thread ID. + pthread_t thread_; + + // Name and category for this thread. + const std::string category_; + const std::string name_; + + // OS-specific thread ID. Once the constructor finishes StartThread(), + // guaranteed to be set either to a non-negative integer, or to INVALID_TID. + int64_t tid_; + + // User function to be executed by this thread. + const ThreadFunctor functor_; + + // Joiners wait on this latch to be notified if the thread is done. + // + // Note that Joiners must additionally pthread_join(), otherwise certain + // resources that callers expect to be destroyed (like TLS) may still be + // alive when a Joiner finishes. + CountDownLatch done_; + + bool joinable_; + + // Thread local pointer to the current thread of execution. Will be NULL if the current + // thread is not a Thread. + static __thread Thread* tls_; + + std::vector exit_callbacks_; + + // Starts the thread running SuperviseThread(), and returns once that thread has + // initialised and its TID has been read. Waits for notification from the started + // thread that initialisation is complete before returning. On success, stores a + // reference to the thread in holder. + static Status StartThread(const std::string& category, const std::string& name, + const ThreadFunctor& functor, scoped_refptr* holder); + + // Wrapper for the user-supplied function. Invoked from the new thread, + // with the Thread as its only argument. Executes functor_, but before + // doing so registers with the global ThreadMgr and reads the thread's + // system ID. After functor_ terminates, unregisters with the ThreadMgr. + // Always returns NULL. 
+ // + // SuperviseThread() notifies StartThread() when thread initialisation is + // completed via the tid_, which is set to the new thread's system ID. + // By that point in time SuperviseThread() has also taken a reference to + // the Thread object, allowing it to safely refer to it even after the + // caller drops its reference. + // + // Additionally, StartThread() notifies SuperviseThread() when the actual + // Thread object has been assigned (SuperviseThread() is spinning during + // this time). Without this, the new thread may reference the actual + // Thread object before it has been assigned by StartThread(). See + // KUDU-11 for more details. + static void* SuperviseThread(void* arg); + + // Invoked when the user-supplied function finishes or in the case of an + // abrupt exit (i.e. pthread_exit()). Cleans up after SuperviseThread(). + static void FinishThread(void* arg); +}; + +// Registers /threadz with the debug webserver, and creates thread-tracking metrics under +// the given entity. +Status StartThreadInstrumentation(const scoped_refptr& server_metrics, + WebCallbackRegistry* web); +} // namespace kudu + +#endif /* KUDU_UTIL_THREAD_H */ diff --git a/src/kudu/util/thread_restrictions.cc b/src/kudu/util/thread_restrictions.cc new file mode 100644 index 000000000000..c6207a0175e8 --- /dev/null +++ b/src/kudu/util/thread_restrictions.cc @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "kudu/util/thread.h" +#include "kudu/util/threadlocal.h" +#include "kudu/util/thread_restrictions.h" + +#ifdef ENABLE_THREAD_RESTRICTIONS + +namespace kudu { + +namespace { + +struct LocalThreadRestrictions { + LocalThreadRestrictions() + : io_allowed(true), + wait_allowed(true), + singleton_allowed(true) { + } + + bool io_allowed; + bool wait_allowed; + bool singleton_allowed; +}; + +LocalThreadRestrictions* LoadTLS() { + BLOCK_STATIC_THREAD_LOCAL(LocalThreadRestrictions, local_thread_restrictions); + return local_thread_restrictions; +} + +} // anonymous namespace + +bool ThreadRestrictions::SetIOAllowed(bool allowed) { + bool previous_allowed = LoadTLS()->io_allowed; + LoadTLS()->io_allowed = allowed; + return previous_allowed; +} + +void ThreadRestrictions::AssertIOAllowed() { + CHECK(LoadTLS()->io_allowed) + << "Function marked as IO-only was called from a thread that " + << "disallows IO! If this thread really should be allowed to " + << "make IO calls, adjust the call to " + << "kudu::ThreadRestrictions::SetIOAllowed() in this thread's " + << "startup. " + << (Thread::current_thread() ? 
Thread::current_thread()->ToString() : "(not a kudu::Thread)"); +} + +bool ThreadRestrictions::SetWaitAllowed(bool allowed) { + bool previous_allowed = LoadTLS()->wait_allowed; + LoadTLS()->wait_allowed = allowed; + return previous_allowed; +} + +void ThreadRestrictions::AssertWaitAllowed() { + CHECK(LoadTLS()->wait_allowed) + << "Waiting is not allowed to be used on this thread to prevent " + << "server-wide latency aberrations and deadlocks. " + << (Thread::current_thread() ? Thread::current_thread()->ToString() : "(not a kudu::Thread)"); +} + +} // namespace kudu + +#endif diff --git a/src/kudu/util/thread_restrictions.h b/src/kudu/util/thread_restrictions.h new file mode 100644 index 000000000000..23f0cd52a111 --- /dev/null +++ b/src/kudu/util/thread_restrictions.h @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Some portions: Copyright (c) 2012, The Chromium Authors. +#ifndef KUDU_UTIL_THREAD_RESTRICTIONS_H +#define KUDU_UTIL_THREAD_RESTRICTIONS_H + +#include "kudu/gutil/macros.h" + +#ifndef NDEBUG +#define ENABLE_THREAD_RESTRICTIONS 1 +#endif + +namespace kudu { + +// Certain behavior is disallowed on certain threads. ThreadRestrictions helps +// enforce these rules. 
Examples of such rules:
//
// * Do not do blocking IO
// * Do not wait on synchronization variables or sleep
//
// Here's more about how the protection works:
//
// 1) If a thread should not be allowed to make IO calls, mark it:
//      ThreadRestrictions::SetIOAllowed(false);
//    By default, threads *are* allowed to make IO calls.
//    In particular, threads like RPC reactors should never do blocking IO
//    because it may stall other unrelated requests.
//
// 2) If a function makes a call that will go out to disk, check whether the
//    current thread is allowed:
//      ThreadRestrictions::AssertIOAllowed();
//
//
// Style tip: where should you put AssertIOAllowed checks?  It's best
// if you put them as close to the disk access as possible, at the
// lowest level.  This rule is simple to follow and helps catch all
// callers.  For example, if your function GoDoSomeBlockingDiskCall()
// only calls other functions in Kudu and doesn't access the underlying
// disk, you should go add the AssertIOAllowed checks in the helper functions.
class ThreadRestrictions {
 public:
  // Constructing a ScopedAllowIO temporarily allows IO for the current
  // thread.  Doing this is almost certainly always incorrect, but sometimes
  // it makes more sense to allow an exception and file a bug in the backlog
  // to improve it later.
  class ScopedAllowIO {
   public:
    ScopedAllowIO() { previous_value_ = SetIOAllowed(true); }
    ~ScopedAllowIO() { SetIOAllowed(previous_value_); }
   private:
    // Whether IO is allowed when the ScopedAllowIO was constructed.
    bool previous_value_;

    DISALLOW_COPY_AND_ASSIGN(ScopedAllowIO);
  };

  // Constructing a ScopedAllowWait temporarily allows waiting on the current
  // thread.  Doing this is almost always incorrect: consider carefully whether
  // you should instead be deferring work to a different thread.
+ class ScopedAllowWait { + public: + ScopedAllowWait() { previous_value_ = SetWaitAllowed(true); } + ~ScopedAllowWait() { SetWaitAllowed(previous_value_); } + private: + // Whether singleton use is allowed when the ScopedAllowWait was + // constructed. + bool previous_value_; + + DISALLOW_COPY_AND_ASSIGN(ScopedAllowWait); + }; + + +#if ENABLE_THREAD_RESTRICTIONS + // Set whether the current thread to make IO calls. + // Threads start out in the *allowed* state. + // Returns the previous value. + static bool SetIOAllowed(bool allowed); + + // Check whether the current thread is allowed to make IO calls, + // and FATALs if not. See the block comment above the class for + // a discussion of where to add these checks. + static void AssertIOAllowed(); + + // Set whether the current thread may wait/block. Returns the previous + // value. + static bool SetWaitAllowed(bool allowed); + + // Check whether the current thread is allowed to wait/block. + // FATALs if not. + static void AssertWaitAllowed(); +#else + // Inline the empty definitions of these functions so that they can be + // compiled out. + static bool SetIOAllowed(bool allowed) { return true; } + static void AssertIOAllowed() {} + static bool SetWaitAllowed(bool allowed) { return true; } + static void AssertWaitAllowed() {} +#endif + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ThreadRestrictions); +}; + +} // namespace kudu + +#endif /* KUDU_UTIL_THREAD_RESTRICTIONS_H */ diff --git a/src/kudu/util/threadlocal.cc b/src/kudu/util/threadlocal.cc new file mode 100644 index 000000000000..11e8e339cb1b --- /dev/null +++ b/src/kudu/util/threadlocal.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "kudu/util/threadlocal.h" + +#include + +#include + +#include "kudu/gutil/once.h" +#include "kudu/util/errno.h" + +namespace kudu { +namespace threadlocal { +namespace internal { + +// One key used by the entire process to attach destructors on thread exit. +static pthread_key_t destructors_key; + +// The above key must only be initialized once per process. +static GoogleOnceType once = GOOGLE_ONCE_INIT; + +// Call all the destructors associated with all THREAD_LOCAL instances in this +// thread. +static void InvokeDestructors(void* t) { + PerThreadDestructorList* d = reinterpret_cast(t); + while (d != nullptr) { + d->destructor(d->arg); + PerThreadDestructorList* next = d->next; + delete d; + d = next; + } +} + +// This key must be initialized only once. +static void CreateKey() { + int ret = pthread_key_create(&destructors_key, &InvokeDestructors); + // Linux supports up to 1024 keys, we will use only one for all thread locals. + CHECK_EQ(0, ret) << "pthread_key_create() failed, cannot add destructor to thread: " + << "error " << ret << ": " << ErrnoToString(ret); +} + +// Adds a destructor to the list. +void AddDestructor(PerThreadDestructorList* p) { + GoogleOnceInit(&once, &CreateKey); + + // Returns NULL if nothing is set yet. 
+ p->next = reinterpret_cast(pthread_getspecific(destructors_key)); + int ret = pthread_setspecific(destructors_key, p); + // The only time this check should fail is if we are out of memory, or if + // somehow key creation failed, which should be caught by the above CHECK. + CHECK_EQ(0, ret) << "pthread_setspecific() failed, cannot update destructor list: " + << "error " << ret << ": " << ErrnoToString(ret); +} + +} // namespace internal +} // namespace threadlocal +} // namespace kudu diff --git a/src/kudu/util/threadlocal.h b/src/kudu/util/threadlocal.h new file mode 100644 index 000000000000..2380487a73f1 --- /dev/null +++ b/src/kudu/util/threadlocal.h @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_THREADLOCAL_H_ +#define KUDU_UTIL_THREADLOCAL_H_ + +// Block-scoped static thread local implementation. +// +// Usage is similar to a C++11 thread_local. The BLOCK_STATIC_THREAD_LOCAL macro +// defines a thread-local pointer to the specified type, which is lazily +// instantiated by any thread entering the block for the first time. 
The
// constructor for the type T is invoked at macro execution time, as expected,
// and its destructor is invoked when the corresponding thread's Runnable
// returns, or when the thread exits.
//
// Inspired by Poco <http://pocoproject.org/docs/Poco.ThreadLocal.html>,
// Andrew Tomazos <http://stackoverflow.com/questions/12049684/>, and
// the C++11 thread_local API.
// NOTE(review): the two URLs above were stripped by extraction and have been
// reconstructed from memory -- verify against the upstream header.
//
// Example usage:
//
// // Invokes a 3-arg constructor on SomeClass:
// BLOCK_STATIC_THREAD_LOCAL(SomeClass, instance, arg1, arg2, arg3);
// instance->DoSomething();
//
#define BLOCK_STATIC_THREAD_LOCAL(T, t, ...) \
static __thread T* t; \
do { \
  if (PREDICT_FALSE(t == NULL)) { \
    t = new T(__VA_ARGS__); \
    threadlocal::internal::PerThreadDestructorList* dtor_list = \
        new threadlocal::internal::PerThreadDestructorList(); \
    dtor_list->destructor = threadlocal::internal::Destroy<T>; \
    dtor_list->arg = t; \
    threadlocal::internal::AddDestructor(dtor_list); \
  } \
} while (false)

// Class-scoped static thread local implementation.
//
// Very similar in implementation to the above block-scoped version, but
// requires a bit more syntax and vigilance to use properly.
//
// DECLARE_STATIC_THREAD_LOCAL(Type, instance_var_) must be placed in the
// class header, as usual for variable declarations.
//
// Because these variables are static, they must also be defined in the impl
// file with DEFINE_STATIC_THREAD_LOCAL(Type, Classname, instance_var_),
// which is very much like defining any static member, i.e. int Foo::member_.
//
// Finally, each thread must initialize the instance before using it by calling
// INIT_STATIC_THREAD_LOCAL(Type, instance_var_, ...). This is a cheap
// call, and may be invoked at the top of any method which may reference a
// thread-local variable.
//
// Due to all of these requirements, you should probably declare TLS members
// as private.
//
// Example usage:
//
// // foo.h
// #include "kudu/utils/file.h"
// class Foo {
//  public:
//   void DoSomething(std::string s);
//  private:
//   DECLARE_STATIC_THREAD_LOCAL(utils::File, file_);
// };
//
// // foo.cc
// #include "kudu/foo.h"
// DEFINE_STATIC_THREAD_LOCAL(utils::File, Foo, file_);
// void Foo::WriteToFile(std::string s) {
//   // Call constructor if necessary.
//   INIT_STATIC_THREAD_LOCAL(utils::File, file_, "/tmp/file_location.txt");
//   file_->Write(s);
// }

// Goes in the class declaration (usually in a header file).
// NOTE(review): the original comment here mentioned a "dtor" member variable,
// but the macro below declares only the pointer; the comment looks stale --
// confirm against upstream before relying on it.
#define DECLARE_STATIC_THREAD_LOCAL(T, t) \
static __thread T* t

// You must also define the instance in the .cc file.
#define DEFINE_STATIC_THREAD_LOCAL(T, Class, t) \
__thread T* Class::t

// Must be invoked at least once by each thread that will access t.
#define INIT_STATIC_THREAD_LOCAL(T, t, ...) \
do { \
  if (PREDICT_FALSE(t == NULL)) { \
    t = new T(__VA_ARGS__); \
    threadlocal::internal::PerThreadDestructorList* dtor_list = \
        new threadlocal::internal::PerThreadDestructorList(); \
    dtor_list->destructor = threadlocal::internal::Destroy<T>; \
    dtor_list->arg = t; \
    threadlocal::internal::AddDestructor(dtor_list); \
  } \
} while (false)

// Internal implementation below.

namespace kudu {
namespace threadlocal {
namespace internal {

// List of destructors for all thread locals instantiated on a given thread.
struct PerThreadDestructorList {
  void (*destructor)(void*);
  void* arg;
  PerThreadDestructorList* next;
};

// Add a destructor to the list.
void AddDestructor(PerThreadDestructorList* p);

// Destroy the passed object of type T.
template<class T>
static void Destroy(void* t) {
  // With tcmalloc, this should be pretty cheap (same thread as new).
+ delete reinterpret_cast(t); +} + +} // namespace internal +} // namespace threadlocal +} // namespace kudu + +#endif // KUDU_UTIL_THREADLOCAL_H_ diff --git a/src/kudu/util/threadpool-test.cc b/src/kudu/util/threadpool-test.cc new file mode 100644 index 000000000000..d32566291778 --- /dev/null +++ b/src/kudu/util/threadpool-test.cc @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/gutil/atomicops.h" +#include "kudu/gutil/bind.h" +#include "kudu/util/countdown_latch.h" +#include "kudu/util/metrics.h" +#include "kudu/util/promise.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/test_macros.h" +#include "kudu/util/trace.h" + +using std::shared_ptr; + +namespace kudu { + +namespace { +static Status BuildMinMaxTestPool(int min_threads, int max_threads, gscoped_ptr* pool) { + return ThreadPoolBuilder("test").set_min_threads(min_threads) + .set_max_threads(max_threads) + .Build(pool); +} +} // anonymous namespace + +TEST(TestThreadPool, TestNoTaskOpenClose) { + gscoped_ptr thread_pool; + ASSERT_OK(BuildMinMaxTestPool(4, 4, &thread_pool)); + thread_pool->Shutdown(); +} + +static void SimpleTaskMethod(int n, Atomic32 *counter) { + while (n--) { + base::subtle::NoBarrier_AtomicIncrement(counter, 1); + boost::detail::yield(n); + } +} + +class SimpleTask : public Runnable { + public: + SimpleTask(int n, Atomic32 *counter) + : n_(n), counter_(counter) { + } + + void Run() OVERRIDE { + SimpleTaskMethod(n_, counter_); + } + + private: + int n_; + Atomic32 *counter_; +}; + +TEST(TestThreadPool, TestSimpleTasks) { + gscoped_ptr thread_pool; + ASSERT_OK(BuildMinMaxTestPool(4, 4, &thread_pool)); + + Atomic32 counter(0); + std::shared_ptr task(new SimpleTask(15, &counter)); + + ASSERT_OK(thread_pool->SubmitFunc(boost::bind(&SimpleTaskMethod, 10, &counter))); + ASSERT_OK(thread_pool->Submit(task)); + ASSERT_OK(thread_pool->SubmitFunc(boost::bind(&SimpleTaskMethod, 20, &counter))); + ASSERT_OK(thread_pool->Submit(task)); + ASSERT_OK(thread_pool->SubmitClosure(Bind(&SimpleTaskMethod, 123, &counter))); + thread_pool->Wait(); + ASSERT_EQ(10 + 15 + 20 + 15 + 123, base::subtle::NoBarrier_Load(&counter)); + thread_pool->Shutdown(); +} + +static void IssueTraceStatement() { + TRACE("hello from task"); +} + +// Test that the thread-local trace is propagated to tasks +// submitted to the threadpool. 
+TEST(TestThreadPool, TestTracePropagation) { + gscoped_ptr thread_pool; + ASSERT_OK(BuildMinMaxTestPool(1, 1, &thread_pool)); + + scoped_refptr t(new Trace); + { + ADOPT_TRACE(t.get()); + ASSERT_OK(thread_pool->SubmitFunc(&IssueTraceStatement)); + } + thread_pool->Wait(); + ASSERT_STR_CONTAINS(t->DumpToString(true), "hello from task"); +} + +TEST(TestThreadPool, TestSubmitAfterShutdown) { + gscoped_ptr thread_pool; + ASSERT_OK(BuildMinMaxTestPool(1, 1, &thread_pool)); + thread_pool->Shutdown(); + Status s = thread_pool->SubmitFunc(&IssueTraceStatement); + ASSERT_EQ("Service unavailable: The pool has been shut down.", + s.ToString()); +} + +class SlowTask : public Runnable { + public: + explicit SlowTask(CountDownLatch* latch) + : latch_(latch) { + } + + void Run() OVERRIDE { + latch_->Wait(); + } + + private: + CountDownLatch* latch_; +}; + +TEST(TestThreadPool, TestThreadPoolWithNoMinimum) { + MonoDelta idle_timeout = MonoDelta::FromMilliseconds(1); + gscoped_ptr thread_pool; + ASSERT_OK(ThreadPoolBuilder("test") + .set_min_threads(0).set_max_threads(3) + .set_idle_timeout(idle_timeout).Build(&thread_pool)); + // There are no threads to start with. + ASSERT_TRUE(thread_pool->num_threads_ == 0); + // We get up to 3 threads when submitting work. + CountDownLatch latch(1); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(2, thread_pool->num_threads_); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(3, thread_pool->num_threads_); + // The 4th piece of work gets queued. 
+ ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(3, thread_pool->num_threads_); + // Finish all work + latch.CountDown(); + thread_pool->Wait(); + ASSERT_EQ(0, thread_pool->active_threads_); + thread_pool->Shutdown(); + ASSERT_EQ(0, thread_pool->num_threads_); +} + +// Regression test for a bug where a task is submitted exactly +// as a thread is about to exit. Previously this could hang forever. +TEST(TestThreadPool, TestRace) { + alarm(10); + MonoDelta idle_timeout = MonoDelta::FromMicroseconds(1); + gscoped_ptr thread_pool; + ASSERT_OK(ThreadPoolBuilder("test") + .set_min_threads(0).set_max_threads(1) + .set_idle_timeout(idle_timeout).Build(&thread_pool)); + + for (int i = 0; i < 500; i++) { + CountDownLatch l(1); + ASSERT_OK(thread_pool->SubmitFunc(boost::bind(&CountDownLatch::CountDown, &l))); + l.Wait(); + // Sleeping a different amount in each iteration makes it more likely to hit + // the bug. + SleepFor(MonoDelta::FromMicroseconds(i)); + } +} + +TEST(TestThreadPool, TestVariableSizeThreadPool) { + MonoDelta idle_timeout = MonoDelta::FromMilliseconds(1); + gscoped_ptr thread_pool; + ASSERT_OK(ThreadPoolBuilder("test") + .set_min_threads(1).set_max_threads(4) + .set_idle_timeout(idle_timeout).Build(&thread_pool)); + // There is 1 thread to start with. + ASSERT_EQ(1, thread_pool->num_threads_); + // We get up to 4 threads when submitting work. + CountDownLatch latch(1); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(1, thread_pool->num_threads_); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(2, thread_pool->num_threads_); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(3, thread_pool->num_threads_); + ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(4, thread_pool->num_threads_); + // The 5th piece of work gets queued. 
+ ASSERT_OK(thread_pool->Submit( + shared_ptr(new SlowTask(&latch)))); + ASSERT_EQ(4, thread_pool->num_threads_); + // Finish all work + latch.CountDown(); + thread_pool->Wait(); + ASSERT_EQ(0, thread_pool->active_threads_); + thread_pool->Shutdown(); + ASSERT_EQ(0, thread_pool->num_threads_); +} + +TEST(TestThreadPool, TestMaxQueueSize) { + gscoped_ptr thread_pool; + ASSERT_OK(ThreadPoolBuilder("test") + .set_min_threads(1).set_max_threads(1) + .set_max_queue_size(1).Build(&thread_pool)); + + CountDownLatch latch(1); + ASSERT_OK(thread_pool->Submit(shared_ptr(new SlowTask(&latch)))); + Status s = thread_pool->Submit(shared_ptr(new SlowTask(&latch))); + // We race against the worker thread to re-enqueue. + // If we get there first, we fail on the 2nd Submit(). + // If the worker dequeues first, we fail on the 3rd. + if (s.ok()) { + s = thread_pool->Submit(shared_ptr(new SlowTask(&latch))); + } + CHECK(s.IsServiceUnavailable()) << "Expected failure due to queue blowout:" << s.ToString(); + latch.CountDown(); + thread_pool->Wait(); + thread_pool->Shutdown(); +} + +// Test that setting a promise from another thread yields +// a value on the current thread. 
+TEST(TestThreadPool, TestPromises) { + gscoped_ptr thread_pool; + ASSERT_OK(ThreadPoolBuilder("test") + .set_min_threads(1).set_max_threads(1) + .set_max_queue_size(1).Build(&thread_pool)); + + Promise my_promise; + ASSERT_OK(thread_pool->SubmitClosure( + Bind(&Promise::Set, Unretained(&my_promise), 5))); + ASSERT_EQ(5, my_promise.Get()); + thread_pool->Shutdown(); +} + + +METRIC_DEFINE_entity(test_entity); +METRIC_DEFINE_histogram(test_entity, queue_length, "queue length", + MetricUnit::kTasks, "queue length", 1000, 1); + +METRIC_DEFINE_histogram(test_entity, queue_time, "queue time", + MetricUnit::kMicroseconds, "queue time", 1000000, 1); + +METRIC_DEFINE_histogram(test_entity, run_time, "run time", + MetricUnit::kMicroseconds, "run time", 1000, 1); + +TEST(TestThreadPool, TestMetrics) { + MetricRegistry registry; + scoped_refptr entity = METRIC_ENTITY_test_entity.Instantiate( + ®istry, "test entity"); + + gscoped_ptr thread_pool; + ASSERT_OK(ThreadPoolBuilder("test") + .set_min_threads(1).set_max_threads(1) + .Build(&thread_pool)); + + // Enable metrics for the thread pool. + scoped_refptr queue_length = METRIC_queue_length.Instantiate(entity); + scoped_refptr queue_time = METRIC_queue_time.Instantiate(entity); + scoped_refptr run_time = METRIC_run_time.Instantiate(entity); + thread_pool->SetQueueLengthHistogram(queue_length); + thread_pool->SetQueueTimeMicrosHistogram(queue_time); + thread_pool->SetRunTimeMicrosHistogram(run_time); + + int kNumItems = 500; + for (int i = 0; i < kNumItems; i++) { + ASSERT_OK(thread_pool->SubmitFunc(boost::bind(&usleep, i))); + } + + thread_pool->Wait(); + + // Check that all histograms were incremented once per submitted item. 
+ ASSERT_EQ(kNumItems, queue_length->TotalCount()); + ASSERT_EQ(kNumItems, queue_time->TotalCount()); + ASSERT_EQ(kNumItems, run_time->TotalCount()); +} + +} // namespace kudu diff --git a/src/kudu/util/threadpool.cc b/src/kudu/util/threadpool.cc new file mode 100644 index 000000000000..3c33d1665b14 --- /dev/null +++ b/src/kudu/util/threadpool.cc @@ -0,0 +1,351 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "kudu/gutil/callback.h" +#include "kudu/gutil/stl_util.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/sysinfo.h" +#include "kudu/util/metrics.h" +#include "kudu/util/thread.h" +#include "kudu/util/threadpool.h" +#include "kudu/util/trace.h" + +namespace kudu { + +using strings::Substitute; + +//////////////////////////////////////////////////////// +// FunctionRunnable +//////////////////////////////////////////////////////// + +class FunctionRunnable : public Runnable { + public: + explicit FunctionRunnable(boost::function func) : func_(std::move(func)) {} + + void Run() OVERRIDE { + func_(); + } + + private: + boost::function func_; +}; + +//////////////////////////////////////////////////////// +// ThreadPoolBuilder +//////////////////////////////////////////////////////// + +ThreadPoolBuilder::ThreadPoolBuilder(std::string name) + : name_(std::move(name)), + min_threads_(0), + max_threads_(base::NumCPUs()), + max_queue_size_(std::numeric_limits::max()), + idle_timeout_(MonoDelta::FromMilliseconds(500)) {} + +ThreadPoolBuilder& ThreadPoolBuilder::set_min_threads(int min_threads) { + CHECK_GE(min_threads, 0); + min_threads_ = min_threads; + return *this; +} + +ThreadPoolBuilder& ThreadPoolBuilder::set_max_threads(int max_threads) { + CHECK_GT(max_threads, 0); + max_threads_ = max_threads; + return *this; +} + +ThreadPoolBuilder& ThreadPoolBuilder::set_max_queue_size(int max_queue_size) { + CHECK_GT(max_queue_size, 0); + max_queue_size_ = max_queue_size; + return *this; +} + +ThreadPoolBuilder& ThreadPoolBuilder::set_idle_timeout(const MonoDelta& idle_timeout) { + idle_timeout_ = idle_timeout; + return *this; +} + +Status ThreadPoolBuilder::Build(gscoped_ptr* pool) const { + pool->reset(new ThreadPool(*this)); + RETURN_NOT_OK((*pool)->Init()); + return Status::OK(); +} + +//////////////////////////////////////////////////////// +// ThreadPool 
+//////////////////////////////////////////////////////// + +ThreadPool::ThreadPool(const ThreadPoolBuilder& builder) + : name_(builder.name_), + min_threads_(builder.min_threads_), + max_threads_(builder.max_threads_), + max_queue_size_(builder.max_queue_size_), + idle_timeout_(builder.idle_timeout_), + pool_status_(Status::Uninitialized("The pool was not initialized.")), + idle_cond_(&lock_), + no_threads_cond_(&lock_), + not_empty_(&lock_), + num_threads_(0), + active_threads_(0), + queue_size_(0) { +} + +ThreadPool::~ThreadPool() { + Shutdown(); +} + +Status ThreadPool::Init() { + MutexLock unique_lock(lock_); + if (!pool_status_.IsUninitialized()) { + return Status::NotSupported("The thread pool is already initialized"); + } + pool_status_ = Status::OK(); + for (int i = 0; i < min_threads_; i++) { + Status status = CreateThreadUnlocked(); + if (!status.ok()) { + Shutdown(); + return status; + } + } + return Status::OK(); +} + +void ThreadPool::ClearQueue() { + for (QueueEntry& e : queue_) { + if (e.trace) { + e.trace->Release(); + } + } + queue_.clear(); + queue_size_ = 0; +} + +void ThreadPool::Shutdown() { + MutexLock unique_lock(lock_); + pool_status_ = Status::ServiceUnavailable("The pool has been shut down."); + ClearQueue(); + not_empty_.Broadcast(); + + // The Runnable doesn't have Abort() so we must wait + // and hopefully the abort is done outside before calling Shutdown(). + while (num_threads_ > 0) { + no_threads_cond_.Wait(); + } +} + +Status ThreadPool::SubmitClosure(const Closure& task) { + // TODO: once all uses of boost::bind-based tasks are dead, implement this + // in a more straight-forward fashion. 
+ return SubmitFunc(boost::bind(&Closure::Run, task)); +} + +Status ThreadPool::SubmitFunc(const boost::function& func) { + return Submit(std::shared_ptr(new FunctionRunnable(func))); +} + +Status ThreadPool::Submit(const std::shared_ptr& task) { + MonoTime submit_time = MonoTime::Now(MonoTime::FINE); + + MutexLock guard(lock_); + if (PREDICT_FALSE(!pool_status_.ok())) { + return pool_status_; + } + + // Size limit check. + if (queue_size_ == max_queue_size_) { + return Status::ServiceUnavailable(Substitute("Thread pool queue is full ($0 items)", + queue_size_)); + } + + // Should we create another thread? + // We assume that each current inactive thread will grab one item from the + // queue. If it seems like we'll need another thread, we create one. + // In theory, a currently active thread could finish immediately after this + // calculation. This would mean we created a thread we didn't really need. + // However, this race is unavoidable, since we don't do the work under a lock. + // It's also harmless. + // + // Of course, we never create more than max_threads_ threads no matter what. + int inactive_threads = num_threads_ - active_threads_; + int additional_threads = (queue_size_ + 1) - inactive_threads; + if (additional_threads > 0 && num_threads_ < max_threads_) { + Status status = CreateThreadUnlocked(); + if (!status.ok()) { + if (num_threads_ == 0) { + // If we have no threads, we can't do any work. + return status; + } else { + // If we failed to create a thread, but there are still some other + // worker threads, log a warning message and continue. + LOG(WARNING) << "Thread pool failed to create thread: " + << status.ToString(); + } + } + } + + QueueEntry e; + e.runnable = task; + e.trace = Trace::CurrentTrace(); + // Need to AddRef, since the thread which submitted the task may go away, + // and we don't want the trace to be destructed while waiting in the queue. 
+ if (e.trace) { + e.trace->AddRef(); + } + e.submit_time = submit_time; + + queue_.push_back(e); + int length_at_submit = queue_size_++; + + guard.Unlock(); + not_empty_.Signal(); + + if (queue_length_histogram_) { + queue_length_histogram_->Increment(length_at_submit); + } + + return Status::OK(); +} + +void ThreadPool::Wait() { + MutexLock unique_lock(lock_); + while ((!queue_.empty()) || (active_threads_ > 0)) { + idle_cond_.Wait(); + } +} + +bool ThreadPool::WaitUntil(const MonoTime& until) { + MonoDelta relative = until.GetDeltaSince(MonoTime::Now(MonoTime::FINE)); + return WaitFor(relative); +} + +bool ThreadPool::WaitFor(const MonoDelta& delta) { + MutexLock unique_lock(lock_); + while ((!queue_.empty()) || (active_threads_ > 0)) { + if (!idle_cond_.TimedWait(delta)) { + return false; + } + } + return true; +} + + +void ThreadPool::SetQueueLengthHistogram(const scoped_refptr& hist) { + queue_length_histogram_ = hist; +} + +void ThreadPool::SetQueueTimeMicrosHistogram(const scoped_refptr& hist) { + queue_time_us_histogram_ = hist; +} + +void ThreadPool::SetRunTimeMicrosHistogram(const scoped_refptr& hist) { + run_time_us_histogram_ = hist; +} + + +void ThreadPool::DispatchThread(bool permanent) { + MutexLock unique_lock(lock_); + while (true) { + // Note: Status::Aborted() is used to indicate normal shutdown. + if (!pool_status_.ok()) { + VLOG(2) << "DispatchThread exiting: " << pool_status_.ToString(); + break; + } + + if (queue_.empty()) { + if (permanent) { + not_empty_.Wait(); + } else { + if (!not_empty_.TimedWait(idle_timeout_)) { + // After much investigation, it appears that pthread condition variables have + // a weird behavior in which they can return ETIMEDOUT from timed_wait even if + // another thread did in fact signal. Apparently after a timeout there is some + // brief period during which another thread may actually grab the internal mutex + // protecting the state, signal, and release again before we get the mutex. 
So, + // we'll recheck the empty queue case regardless. + if (queue_.empty()) { + VLOG(3) << "Releasing worker thread from pool " << name_ << " after " + << idle_timeout_.ToMilliseconds() << "ms of idle time."; + break; + } + } + } + continue; + } + + // Fetch a pending task + QueueEntry entry = queue_.front(); + queue_.pop_front(); + queue_size_--; + ++active_threads_; + + unique_lock.Unlock(); + + // Update metrics + if (queue_time_us_histogram_) { + MonoTime now(MonoTime::Now(MonoTime::FINE)); + queue_time_us_histogram_->Increment(now.GetDeltaSince(entry.submit_time).ToMicroseconds()); + } + + ADOPT_TRACE(entry.trace); + // Release the reference which was held by the queued item. + if (entry.trace) { + entry.trace->Release(); + } + // Execute the task + { + ScopedLatencyMetric m(run_time_us_histogram_.get()); + entry.runnable->Run(); + } + unique_lock.Lock(); + + if (--active_threads_ == 0) { + idle_cond_.Broadcast(); + } + } + + // It's important that we hold the lock between exiting the loop and dropping + // num_threads_. Otherwise it's possible someone else could come along here + // and add a new task just as the last running thread is about to exit. + CHECK(unique_lock.OwnsLock()); + + if (--num_threads_ == 0) { + no_threads_cond_.Broadcast(); + + // Sanity check: if we're the last thread exiting, the queue ought to be + // empty. Otherwise it will never get processed. + CHECK(queue_.empty()); + DCHECK_EQ(0, queue_size_); + } +} + +Status ThreadPool::CreateThreadUnlocked() { + // The first few threads are permanent, and do not time out. 
+ bool permanent = (num_threads_ < min_threads_); + Status s = kudu::Thread::Create("thread pool", strings::Substitute("$0 [worker]", name_), + &ThreadPool::DispatchThread, this, permanent, nullptr); + if (s.ok()) { + num_threads_++; + } + return s; +} + +} // namespace kudu diff --git a/src/kudu/util/threadpool.h b/src/kudu/util/threadpool.h new file mode 100644 index 000000000000..267b3a3f811b --- /dev/null +++ b/src/kudu/util/threadpool.h @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_THREAD_POOL_H +#define KUDU_UTIL_THREAD_POOL_H + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/callback_forward.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/macros.h" +#include "kudu/gutil/port.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/util/condition_variable.h" +#include "kudu/util/monotime.h" +#include "kudu/util/mutex.h" +#include "kudu/util/status.h" + +namespace kudu { + +class Histogram; +class ThreadPool; +class Trace; + +class Runnable { + public: + virtual void Run() = 0; + virtual ~Runnable() {} +}; + +// ThreadPool takes a lot of arguments. We provide sane defaults with a builder. 
+// +// name: Used for debugging output and default names of the worker threads. +// Since thread names are limited to 16 characters on Linux, it's good to +// choose a short name here. +// Required. +// +// min_threads: Minimum number of threads we'll have at any time. +// Default: 0. +// +// max_threads: Maximum number of threads we'll have at any time. +// Default: Number of CPUs detected on the system. +// +// max_queue_size: Maximum number of items to enqueue before returning a +// Status::ServiceUnavailable message from Submit(). +// Default: INT_MAX. +// +// timeout: How long we'll keep around an idle thread before timing it out. +// We always keep at least min_threads. +// Default: 500 milliseconds. +// +class ThreadPoolBuilder { + public: + explicit ThreadPoolBuilder(std::string name); + + // Note: We violate the style guide by returning mutable references here + // in order to provide traditional Builder pattern conveniences. + ThreadPoolBuilder& set_min_threads(int min_threads); + ThreadPoolBuilder& set_max_threads(int max_threads); + ThreadPoolBuilder& set_max_queue_size(int max_queue_size); + ThreadPoolBuilder& set_idle_timeout(const MonoDelta& idle_timeout); + + const std::string& name() const { return name_; } + int min_threads() const { return min_threads_; } + int max_threads() const { return max_threads_; } + int max_queue_size() const { return max_queue_size_; } + const MonoDelta& idle_timeout() const { return idle_timeout_; } + + // Instantiate a new ThreadPool with the existing builder arguments. + Status Build(gscoped_ptr* pool) const; + + private: + friend class ThreadPool; + const std::string name_; + int min_threads_; + int max_threads_; + int max_queue_size_; + MonoDelta idle_timeout_; + + DISALLOW_COPY_AND_ASSIGN(ThreadPoolBuilder); +}; + +// Thread pool with a variable number of threads. +// The pool can execute a class that implements the Runnable interface, or a +// boost::function, which can be obtained via boost::bind(). 
+// +// Usage Example: +// static void Func(int n) { ... } +// class Task : public Runnable { ... } +// +// gscoped_ptr thread_pool; +// CHECK_OK( +// ThreadPoolBuilder("my_pool") +// .set_min_threads(0) +// .set_max_threads(5) +// .set_max_queue_size(10) +// .set_timeout(MonoDelta::FromMilliseconds(2000)) +// .Build(&thread_pool)); +// thread_pool->Submit(shared_ptr(new Task())); +// thread_pool->Submit(boost::bind(&Func, 10)); +class ThreadPool { + public: + ~ThreadPool(); + + // Wait for the running tasks to complete and then shutdown the threads. + // All the other pending tasks in the queue will be removed. + // NOTE: That the user may implement an external abort logic for the + // runnables, that must be called before Shutdown(), if the system + // should know about the non-execution of these tasks, or the runnable + // require an explicit "abort" notification to exit from the run loop. + void Shutdown(); + + // Submit a function using the kudu Closure system. + Status SubmitClosure(const Closure& task) WARN_UNUSED_RESULT; + + // Submit a function binded using boost::bind(&FuncName, args...) + Status SubmitFunc(const boost::function& func) + WARN_UNUSED_RESULT; + + // Submit a Runnable class + Status Submit(const std::shared_ptr& task) + WARN_UNUSED_RESULT; + + // Wait until all the tasks are completed. + void Wait(); + + // Waits for the pool to reach the idle state, or until 'until' time is reached. + // Returns true if the pool reached the idle state, false otherwise. + bool WaitUntil(const MonoTime& until); + + // Waits for the pool to reach the idle state, or until 'delta' time elapses. + // Returns true if the pool reached the idle state, false otherwise. + bool WaitFor(const MonoDelta& delta); + + // Return the current number of tasks waiting in the queue. + // Typically used for metrics. 
+ int queue_length() const { + return ANNOTATE_UNPROTECTED_READ(queue_size_); + } + + // Attach a histogram which measures the queue length seen by tasks when they enter + // the thread pool's queue. + void SetQueueLengthHistogram(const scoped_refptr& hist); + + // Attach a histogram which measures the amount of time that tasks spend waiting in + // the queue. + void SetQueueTimeMicrosHistogram(const scoped_refptr& hist); + + // Attach a histogram which measures the amount of time that tasks spend running. + void SetRunTimeMicrosHistogram(const scoped_refptr& hist); + + private: + friend class ThreadPoolBuilder; + + // Create a new thread pool using a builder. + explicit ThreadPool(const ThreadPoolBuilder& builder); + + // Initialize the thread pool by starting the minimum number of threads. + Status Init(); + + // Clear all entries from queue_. Requires that lock_ is held. + void ClearQueue(); + + // Dispatcher responsible for dequeueing and executing the tasks + void DispatchThread(bool permanent); + + // Create new thread. Required that lock_ is held. + Status CreateThreadUnlocked(); + + private: + FRIEND_TEST(TestThreadPool, TestThreadPoolWithNoMinimum); + FRIEND_TEST(TestThreadPool, TestVariableSizeThreadPool); + + struct QueueEntry { + std::shared_ptr runnable; + Trace* trace; + + // Time at which the entry was submitted to the pool. 
+ MonoTime submit_time; + }; + + const std::string name_; + const int min_threads_; + const int max_threads_; + const int max_queue_size_; + const MonoDelta idle_timeout_; + + Status pool_status_; + Mutex lock_; + ConditionVariable idle_cond_; + ConditionVariable no_threads_cond_; + ConditionVariable not_empty_; + int num_threads_; + int active_threads_; + int queue_size_; + std::list queue_; + + scoped_refptr queue_length_histogram_; + scoped_refptr queue_time_us_histogram_; + scoped_refptr run_time_us_histogram_; + + DISALLOW_COPY_AND_ASSIGN(ThreadPool); +}; + +} // namespace kudu +#endif diff --git a/src/kudu/util/trace-test.cc b/src/kudu/util/trace-test.cc new file mode 100644 index 000000000000..1891dfc9f16c --- /dev/null +++ b/src/kudu/util/trace-test.cc @@ -0,0 +1,825 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include "kudu/util/trace.h" +#include "kudu/util/debug/trace_event.h" +#include "kudu/util/debug/trace_event_synthetic_delay.h" +#include "kudu/util/debug/trace_logging.h" +#include "kudu/util/stopwatch.h" +#include "kudu/util/test_util.h" + +using kudu::debug::TraceLog; +using kudu::debug::TraceResultBuffer; +using kudu::debug::CategoryFilter; +using rapidjson::Document; +using rapidjson::Value; +using std::string; +using std::vector; + +namespace kudu { + +class TraceTest : public KuduTest { +}; + +// Replace all digits in 's' with the character 'X'. +static string XOutDigits(const string& s) { + string ret; + ret.reserve(s.size()); + for (char c : s) { + if (isdigit(c)) { + ret.push_back('X'); + } else { + ret.push_back(c); + } + } + return ret; +} + +TEST_F(TraceTest, TestBasic) { + scoped_refptr t(new Trace); + TRACE_TO(t, "hello $0, $1", "world", 12345); + TRACE_TO(t, "goodbye $0, $1", "cruel world", 54321); + + string result = XOutDigits(t->DumpToString(false)); + ASSERT_EQ("XXXX XX:XX:XX.XXXXXX trace-test.cc:XX] hello world, XXXXX\n" + "XXXX XX:XX:XX.XXXXXX trace-test.cc:XX] goodbye cruel world, XXXXX\n", + result); +} + +TEST_F(TraceTest, TestAttach) { + scoped_refptr traceA(new Trace); + scoped_refptr traceB(new Trace); + { + ADOPT_TRACE(traceA.get()); + EXPECT_EQ(traceA.get(), Trace::CurrentTrace()); + { + ADOPT_TRACE(traceB.get()); + EXPECT_EQ(traceB.get(), Trace::CurrentTrace()); + TRACE("hello from traceB"); + } + EXPECT_EQ(traceA.get(), Trace::CurrentTrace()); + TRACE("hello from traceA"); + } + EXPECT_TRUE(Trace::CurrentTrace() == nullptr); + TRACE("this goes nowhere"); + + EXPECT_EQ(XOutDigits(traceA->DumpToString(false)), + "XXXX XX:XX:XX.XXXXXX trace-test.cc:XX] hello from traceA\n"); + EXPECT_EQ(XOutDigits(traceB->DumpToString(false)), + "XXXX XX:XX:XX.XXXXXX trace-test.cc:XX] hello from traceB\n"); +} + +TEST_F(TraceTest, TestChildTrace) { + scoped_refptr traceA(new Trace); + scoped_refptr 
traceB(new Trace); + ADOPT_TRACE(traceA.get()); + traceA->AddChildTrace(traceB.get()); + TRACE("hello from traceA"); + { + ADOPT_TRACE(traceB.get()); + TRACE("hello from traceB"); + } + EXPECT_EQ(XOutDigits(traceA->DumpToString(false)), + "XXXX XX:XX:XX.XXXXXX trace-test.cc:XX] hello from traceA\n" + "Related trace:\n" + "XXXX XX:XX:XX.XXXXXX trace-test.cc:XX] hello from traceB\n"); +} + +static void GenerateTraceEvents(int thread_id, + int num_events) { + for (int i = 0; i < num_events; i++) { + TRACE_EVENT1("test", "foo", "thread_id", thread_id); + } +} + +// Parse the dumped trace data and return the number of events +// found within, including only those with the "test" category. +int ParseAndReturnEventCount(const string& trace_json) { + Document d; + d.Parse<0>(trace_json.c_str()); + CHECK(d.IsObject()) << "bad json: " << trace_json; + const Value& events_json = d["traceEvents"]; + CHECK(events_json.IsArray()) << "bad json: " << trace_json; + + // Count how many of our events were seen. We have to filter out + // the metadata events. + int seen_real_events = 0; + for (int i = 0; i < events_json.Size(); i++) { + if (events_json[i]["cat"].GetString() == string("test")) { + seen_real_events++; + } + } + + return seen_real_events; +} + +TEST_F(TraceTest, TestChromeTracing) { + const int kNumThreads = 4; + const int kEventsPerThread = AllowSlowTests() ? 
1000000 : 10000; + + TraceLog* tl = TraceLog::GetInstance(); + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_CONTINUOUSLY); + + vector > threads(kNumThreads); + + Stopwatch s; + s.start(); + for (int i = 0; i < kNumThreads; i++) { + CHECK_OK(Thread::Create("test", "gen-traces", &GenerateTraceEvents, i, kEventsPerThread, + &threads[i])); + } + + for (int i = 0; i < kNumThreads; i++) { + threads[i]->Join(); + } + tl->SetDisabled(); + + int total_events = kNumThreads * kEventsPerThread; + double elapsed = s.elapsed().wall_seconds(); + + LOG(INFO) << "Trace performance: " << static_cast(total_events / elapsed) << " traces/sec"; + + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + + // Verify that the JSON contains events. It won't have exactly + // kEventsPerThread * kNumThreads because the trace buffer isn't large enough + // for that. + ASSERT_GE(ParseAndReturnEventCount(trace_json), 100); +} + +// Test that, if a thread exits before filling a full trace buffer, we still +// see its results. This is a regression test for a bug in the earlier integration +// of Chromium tracing into Kudu. +TEST_F(TraceTest, TestTraceFromExitedThread) { + TraceLog* tl = TraceLog::GetInstance(); + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_CONTINUOUSLY); + + // Generate 10 trace events in a separate thread. 
+ int kNumEvents = 10; + scoped_refptr t; + CHECK_OK(Thread::Create("test", "gen-traces", &GenerateTraceEvents, 1, kNumEvents, + &t)); + t->Join(); + tl->SetDisabled(); + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + LOG(INFO) << trace_json; + + // Verify that the buffer contains 10 trace events + ASSERT_EQ(10, ParseAndReturnEventCount(trace_json)); +} + +static void GenerateWideSpan() { + TRACE_EVENT0("test", "GenerateWideSpan"); + for (int i = 0; i < 1000; i++) { + TRACE_EVENT0("test", "InnerLoop"); + } +} + +// Test creating a trace event which contains many other trace events. +// This ensures that we can go back and update a TraceEvent which fell in +// a different trace chunk. +TEST_F(TraceTest, TestWideSpan) { + TraceLog* tl = TraceLog::GetInstance(); + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_CONTINUOUSLY); + + scoped_refptr t; + CHECK_OK(Thread::Create("test", "gen-traces", &GenerateWideSpan, &t)); + t->Join(); + tl->SetDisabled(); + + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + ASSERT_EQ(1001, ParseAndReturnEventCount(trace_json)); +} + +// Regression test for KUDU-753: faulty JSON escaping when dealing with +// single quote characters. +TEST_F(TraceTest, TestJsonEncodingString) { + TraceLog* tl = TraceLog::GetInstance(); + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_CONTINUOUSLY); + { + TRACE_EVENT1("test", "test", "arg", "this is a test with \"'\"' and characters\nand new lines"); + } + tl->SetDisabled(); + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + ASSERT_EQ(1, ParseAndReturnEventCount(trace_json)); +} + +// Generate trace events continuously until 'latch' fires. +// Increment *num_events_generated for each event generated. 
+void GenerateTracesUntilLatch(AtomicInt* num_events_generated, + CountDownLatch* latch) { + while (latch->count()) { + { + // This goes in its own scope so that the event is fully generated (with + // both its START and END times) before we do the counter increment below. + TRACE_EVENT0("test", "GenerateTracesUntilLatch"); + } + num_events_generated->Increment(); + } +} + +// Test starting and stopping tracing while a thread is running. +// This is a regression test for bugs in earlier versions of the imported +// trace code. +TEST_F(TraceTest, TestStartAndStopCollection) { + TraceLog* tl = TraceLog::GetInstance(); + + CountDownLatch latch(1); + AtomicInt num_events_generated(0); + scoped_refptr t; + CHECK_OK(Thread::Create("test", "gen-traces", &GenerateTracesUntilLatch, + &num_events_generated, &latch, &t)); + + const int num_flushes = AllowSlowTests() ? 50 : 3; + for (int i = 0; i < num_flushes; i++) { + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_CONTINUOUSLY); + + const int64_t num_events_before = num_events_generated.Load(); + SleepFor(MonoDelta::FromMilliseconds(10)); + const int64_t num_events_after = num_events_generated.Load(); + tl->SetDisabled(); + + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + // We might under-count the number of events, since we only measure the sleep, + // and tracing is enabled before and disabled after we start counting. + // We might also over-count by at most 1, because we could enable tracing + // right in between creating a trace event and incrementing the counter. + // But, we should never over-count by more than 1. 
+ int expected_events_lowerbound = num_events_after - num_events_before - 1; + int captured_events = ParseAndReturnEventCount(trace_json); + ASSERT_GE(captured_events, expected_events_lowerbound); + } + + latch.CountDown(); + t->Join(); +} + +TEST_F(TraceTest, TestChromeSampling) { + TraceLog* tl = TraceLog::GetInstance(); + tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString), + TraceLog::RECORDING_MODE, + static_cast(TraceLog::RECORD_CONTINUOUSLY | + TraceLog::ENABLE_SAMPLING)); + + for (int i = 0; i < 100; i++) { + switch (i % 3) { + case 0: + TRACE_EVENT_SET_SAMPLING_STATE("test", "state-0"); + break; + case 1: + TRACE_EVENT_SET_SAMPLING_STATE("test", "state-1"); + break; + case 2: + TRACE_EVENT_SET_SAMPLING_STATE("test", "state-2"); + break; + } + SleepFor(MonoDelta::FromMilliseconds(1)); + } + tl->SetDisabled(); + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + ASSERT_GT(ParseAndReturnEventCount(trace_json), 0); +} + +class TraceEventCallbackTest : public KuduTest { + public: + virtual void SetUp() OVERRIDE { + KuduTest::SetUp(); + ASSERT_EQ(nullptr, s_instance); + s_instance = this; + } + virtual void TearDown() OVERRIDE { + TraceLog::GetInstance()->SetDisabled(); + + // Flush the buffer so that one test doesn't end up leaving any + // extra results for the next test. + TraceResultBuffer::FlushTraceLogToString(); + + ASSERT_TRUE(!!s_instance); + s_instance = nullptr; + KuduTest::TearDown(); + + } + + protected: + void EndTraceAndFlush() { + TraceLog::GetInstance()->SetDisabled(); + string trace_json = TraceResultBuffer::FlushTraceLogToString(); + trace_doc_.Parse<0>(trace_json.c_str()); + LOG(INFO) << trace_json; + ASSERT_TRUE(trace_doc_.IsObject()); + trace_parsed_ = trace_doc_["traceEvents"]; + ASSERT_TRUE(trace_parsed_.IsArray()); + } + + void DropTracedMetadataRecords() { + // NB: rapidjson has move-semantics, like auto_ptr. 
+ Value old_trace_parsed; + old_trace_parsed = trace_parsed_; + trace_parsed_.SetArray(); + size_t old_trace_parsed_size = old_trace_parsed.Size(); + + for (size_t i = 0; i < old_trace_parsed_size; i++) { + Value value; + value = old_trace_parsed[i]; + if (value.GetType() != rapidjson::kObjectType) { + trace_parsed_.PushBack(value, trace_doc_.GetAllocator()); + continue; + } + string tmp; + if (value.HasMember("ph") && strcmp(value["ph"].GetString(), "M") == 0) { + continue; + } + + trace_parsed_.PushBack(value, trace_doc_.GetAllocator()); + } + } + + // Search through the given array for any dictionary which has a key + // or value which has 'string_to_match' as a substring. + // Returns the matching dictionary, or NULL. + static const Value* FindTraceEntry( + const Value& trace_parsed, + const char* string_to_match) { + // Scan all items + size_t trace_parsed_count = trace_parsed.Size(); + for (size_t i = 0; i < trace_parsed_count; i++) { + const Value& value = trace_parsed[i]; + if (value.GetType() != rapidjson::kObjectType) { + continue; + } + + for (Value::ConstMemberIterator it = value.MemberBegin(); + it != value.MemberEnd(); + ++it) { + if (it->name.IsString() && strstr(it->name.GetString(), string_to_match) != nullptr) { + return &value; + } + if (it->value.IsString() && strstr(it->value.GetString(), string_to_match) != nullptr) { + return &value; + } + } + } + return nullptr; + } + + // For TraceEventCallbackAndRecordingX tests. + void VerifyCallbackAndRecordedEvents(size_t expected_callback_count, + size_t expected_recorded_count) { + // Callback events. + EXPECT_EQ(expected_callback_count, collected_events_names_.size()); + for (size_t i = 0; i < collected_events_names_.size(); ++i) { + EXPECT_EQ("callback", collected_events_categories_[i]); + EXPECT_EQ("yes", collected_events_names_[i]); + } + + // Recorded events. 
+ EXPECT_EQ(expected_recorded_count, trace_parsed_.Size()); + EXPECT_TRUE(FindTraceEntry(trace_parsed_, "recording")); + EXPECT_FALSE(FindTraceEntry(trace_parsed_, "callback")); + EXPECT_TRUE(FindTraceEntry(trace_parsed_, "yes")); + EXPECT_FALSE(FindTraceEntry(trace_parsed_, "no")); + } + + void VerifyCollectedEvent(size_t i, + unsigned phase, + const string& category, + const string& name) { + EXPECT_EQ(phase, collected_events_phases_[i]); + EXPECT_EQ(category, collected_events_categories_[i]); + EXPECT_EQ(name, collected_events_names_[i]); + } + + Document trace_doc_; + Value trace_parsed_; + + vector collected_events_categories_; + vector collected_events_names_; + vector collected_events_phases_; + vector collected_events_timestamps_; + + static TraceEventCallbackTest* s_instance; + static void Callback(MicrosecondsInt64 timestamp, + char phase, + const unsigned char* category_group_enabled, + const char* name, + uint64_t id, + int num_args, + const char* const arg_names[], + const unsigned char arg_types[], + const uint64_t arg_values[], + unsigned char flags) { + s_instance->collected_events_phases_.push_back(phase); + s_instance->collected_events_categories_.push_back( + TraceLog::GetCategoryGroupName(category_group_enabled)); + s_instance->collected_events_names_.push_back(name); + s_instance->collected_events_timestamps_.push_back(timestamp); + } +}; + +TraceEventCallbackTest* TraceEventCallbackTest::s_instance; + +TEST_F(TraceEventCallbackTest, TraceEventCallback) { + TRACE_EVENT_INSTANT0("all", "before enable", TRACE_EVENT_SCOPE_THREAD); + TraceLog::GetInstance()->SetEventCallbackEnabled( + CategoryFilter("*"), Callback); + TRACE_EVENT_INSTANT0("all", "event1", TRACE_EVENT_SCOPE_GLOBAL); + TRACE_EVENT_INSTANT0("all", "event2", TRACE_EVENT_SCOPE_GLOBAL); + { + TRACE_EVENT0("all", "duration"); + TRACE_EVENT_INSTANT0("all", "event3", TRACE_EVENT_SCOPE_GLOBAL); + } + TraceLog::GetInstance()->SetEventCallbackDisabled(); + TRACE_EVENT_INSTANT0("all", "after 
callback removed", + TRACE_EVENT_SCOPE_GLOBAL); + ASSERT_EQ(5u, collected_events_names_.size()); + EXPECT_EQ("event1", collected_events_names_[0]); + EXPECT_EQ(TRACE_EVENT_PHASE_INSTANT, collected_events_phases_[0]); + EXPECT_EQ("event2", collected_events_names_[1]); + EXPECT_EQ(TRACE_EVENT_PHASE_INSTANT, collected_events_phases_[1]); + EXPECT_EQ("duration", collected_events_names_[2]); + EXPECT_EQ(TRACE_EVENT_PHASE_BEGIN, collected_events_phases_[2]); + EXPECT_EQ("event3", collected_events_names_[3]); + EXPECT_EQ(TRACE_EVENT_PHASE_INSTANT, collected_events_phases_[3]); + EXPECT_EQ("duration", collected_events_names_[4]); + EXPECT_EQ(TRACE_EVENT_PHASE_END, collected_events_phases_[4]); + for (size_t i = 1; i < collected_events_timestamps_.size(); i++) { + EXPECT_LE(collected_events_timestamps_[i - 1], + collected_events_timestamps_[i]); + } +} + +TEST_F(TraceEventCallbackTest, TraceEventCallbackWhileFull) { + TraceLog::GetInstance()->SetEnabled( + CategoryFilter("*"), + TraceLog::RECORDING_MODE, + TraceLog::RECORD_UNTIL_FULL); + do { + TRACE_EVENT_INSTANT0("all", "badger badger", TRACE_EVENT_SCOPE_GLOBAL); + } while (!TraceLog::GetInstance()->BufferIsFull()); + TraceLog::GetInstance()->SetEventCallbackEnabled(CategoryFilter("*"), + Callback); + TRACE_EVENT_INSTANT0("all", "a snake", TRACE_EVENT_SCOPE_GLOBAL); + TraceLog::GetInstance()->SetEventCallbackDisabled(); + ASSERT_EQ(1u, collected_events_names_.size()); + EXPECT_EQ("a snake", collected_events_names_[0]); +} + +// 1: Enable callback, enable recording, disable callback, disable recording. 
+TEST_F(TraceEventCallbackTest, TraceEventCallbackAndRecording1) {
+  // Neither the callback nor recording is active yet: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackEnabled(CategoryFilter("callback"),
+                                                   Callback);
+  // Callback only, filtered to the "callback" category: 1st collected event.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEnabled(
+      CategoryFilter("recording"),
+      TraceLog::RECORDING_MODE,
+      TraceLog::RECORD_UNTIL_FULL);
+  // Both active: 1st recorded event plus 2nd collected event.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackDisabled();
+  // Recording only: 2nd recorded event; the callback sees nothing more.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  EndTraceAndFlush();
+  // Everything disabled again: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+
+  DropTracedMetadataRecords();
+  ASSERT_NO_FATAL_FAILURE();
+  // Expect 2 callback events and 2 recorded events (the "yes" ones above).
+  VerifyCallbackAndRecordedEvents(2, 2);
+}
+
+// 2: Enable callback, enable recording, disable recording, disable callback.
+TEST_F(TraceEventCallbackTest, TraceEventCallbackAndRecording2) {
+  // Neither active yet: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackEnabled(CategoryFilter("callback"),
+                                                   Callback);
+  // Callback only: 1st collected event.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEnabled(
+      CategoryFilter("recording"),
+      TraceLog::RECORDING_MODE,
+      TraceLog::RECORD_UNTIL_FULL);
+  // Both active: the only recorded event, plus 2nd collected event.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  EndTraceAndFlush();
+  // Recording disabled first; the callback still collects: 3rd event.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackDisabled();
+  // Everything disabled again: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+
+  DropTracedMetadataRecords();
+  // Expect 3 callback events and 1 recorded event.
+  VerifyCallbackAndRecordedEvents(3, 1);
+}
+
+// 3: Enable recording, enable callback, disable callback, disable recording.
+TEST_F(TraceEventCallbackTest, TraceEventCallbackAndRecording3) {
+  // Neither active yet: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEnabled(
+      CategoryFilter("recording"),
+      TraceLog::RECORDING_MODE,
+      TraceLog::RECORD_UNTIL_FULL);
+  // Recording only: 1st recorded event; the callback is not yet installed.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackEnabled(CategoryFilter("callback"),
+                                                   Callback);
+  // Both active: 2nd recorded event plus the only collected event.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackDisabled();
+  // Recording only again: 3rd recorded event.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  EndTraceAndFlush();
+  // Everything disabled again: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+
+  DropTracedMetadataRecords();
+  // Expect 1 callback event and 3 recorded events.
+  VerifyCallbackAndRecordedEvents(1, 3);
+}
+
+// 4: Enable recording, enable callback, disable recording, disable callback.
+TEST_F(TraceEventCallbackTest, TraceEventCallbackAndRecording4) {
+  // Neither active yet: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEnabled(
+      CategoryFilter("recording"),
+      TraceLog::RECORDING_MODE,
+      TraceLog::RECORD_UNTIL_FULL);
+  // Recording only: 1st recorded event.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackEnabled(CategoryFilter("callback"),
+                                                   Callback);
+  // Both active: 2nd recorded event plus 1st collected event.
+  TRACE_EVENT_INSTANT0("recording", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  EndTraceAndFlush();
+  // Recording disabled first; the callback still collects: 2nd event.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "yes", TRACE_EVENT_SCOPE_GLOBAL);
+  TraceLog::GetInstance()->SetEventCallbackDisabled();
+  // Everything disabled again: both events dropped.
+  TRACE_EVENT_INSTANT0("recording", "no", TRACE_EVENT_SCOPE_GLOBAL);
+  TRACE_EVENT_INSTANT0("callback", "no", TRACE_EVENT_SCOPE_GLOBAL);
+
+  DropTracedMetadataRecords();
+  // Expect 2 callback events and 2 recorded events.
+  VerifyCallbackAndRecordedEvents(2, 2);
+}
+
+// Duration (TRACE_EVENT0) events deliver a BEGIN at entry and an END at
+// scope exit; the callback must see the ENDs in reverse (LIFO) order even
+// when recording is toggled in the middle of the nesting.
+TEST_F(TraceEventCallbackTest, TraceEventCallbackAndRecordingDuration) {
+  TraceLog::GetInstance()->SetEventCallbackEnabled(CategoryFilter("*"),
+                                                   Callback);
+  {
+    TRACE_EVENT0("callback", "duration1");
+    TraceLog::GetInstance()->SetEnabled(
+        CategoryFilter("*"),
+        TraceLog::RECORDING_MODE,
+        TraceLog::RECORD_UNTIL_FULL);
+    TRACE_EVENT0("callback", "duration2");
+    EndTraceAndFlush();
+    TRACE_EVENT0("callback", "duration3");
+  }
+  TraceLog::GetInstance()->SetEventCallbackDisabled();
+
+  // Three BEGINs in declaration order, three ENDs in reverse order.
+  ASSERT_EQ(6u, collected_events_names_.size());
+  VerifyCollectedEvent(0, TRACE_EVENT_PHASE_BEGIN, "callback", "duration1");
+  VerifyCollectedEvent(1, TRACE_EVENT_PHASE_BEGIN, "callback", "duration2");
+  VerifyCollectedEvent(2, TRACE_EVENT_PHASE_BEGIN, "callback", "duration3");
+  VerifyCollectedEvent(3, TRACE_EVENT_PHASE_END, "callback", "duration3");
+  VerifyCollectedEvent(4, TRACE_EVENT_PHASE_END, "callback", "duration2");
+  VerifyCollectedEvent(5, TRACE_EVENT_PHASE_END, "callback", "duration1");
+}
+
+////////////////////////////////////////////////////////////
+// Tests for synthetic delay
+// (from chromium-base/debug/trace_event_synthetic_delay_unittest.cc)
+////////////////////////////////////////////////////////////
+
+namespace {
+
+const int kTargetDurationMs = 100;
+// Allow some leeway in timings to make it possible to run these tests with a
+// wall clock time source too.
+const int kShortDurationMs = 10;
+
+} // namespace
+
+namespace debug {
+
+// Test fixture that doubles as the synthetic-delay clock, so the tests can
+// advance time artificially instead of sleeping for real.
+class TraceEventSyntheticDelayTest : public KuduTest,
+                                     public TraceEventSyntheticDelayClock {
+ public:
+  TraceEventSyntheticDelayTest() {
+    now_ = MonoTime::Min();
+  }
+
+  virtual ~TraceEventSyntheticDelayTest() {
+    // Clear global delay state so one test cannot leak delays into the next.
+    ResetTraceEventSyntheticDelays();
+  }
+
+  // TraceEventSyntheticDelayClock implementation.
+  // Every observation of the clock also advances it by a small fraction of
+  // kShortDurationMs, guaranteeing monotonic progress.
+  virtual MonoTime Now() OVERRIDE {
+    AdvanceTime(MonoDelta::FromMilliseconds(kShortDurationMs / 10));
+    return now_;
+  }
+
+  // Look up the named delay and point it at this fixture's fake clock with
+  // the standard target duration.
+  TraceEventSyntheticDelay* ConfigureDelay(const char* name) {
+    TraceEventSyntheticDelay* delay = TraceEventSyntheticDelay::Lookup(name);
+    delay->SetClock(this);
+    delay->SetTargetDuration(
+        MonoDelta::FromMilliseconds(kTargetDurationMs));
+    return delay;
+  }
+
+  void AdvanceTime(MonoDelta delta) { now_.AddDelta(delta); }
+
+  // Run the scoped "test.Delay" delay and return the elapsed fake-clock
+  // milliseconds.
+  int TestFunction() {
+    MonoTime start = Now();
+    { TRACE_EVENT_SYNTHETIC_DELAY("test.Delay"); }
+    MonoTime end = Now();
+    return end.GetDeltaSince(start).ToMilliseconds();
+  }
+
+  // Begin half of the asynchronous "test.AsyncDelay"; returns elapsed ms.
+  int AsyncTestFunctionBegin() {
+    MonoTime start = Now();
+    { TRACE_EVENT_SYNTHETIC_DELAY_BEGIN("test.AsyncDelay"); }
+    MonoTime end = Now();
+    return end.GetDeltaSince(start).ToMilliseconds();
+  }
+
+  // End half of the asynchronous "test.AsyncDelay"; returns elapsed ms.
+  int AsyncTestFunctionEnd() {
+    MonoTime start = Now();
+    { TRACE_EVENT_SYNTHETIC_DELAY_END("test.AsyncDelay"); }
+    MonoTime end = Now();
+    return end.GetDeltaSince(start).ToMilliseconds();
+  }
+
+ private:
+  MonoTime now_;
+
+  DISALLOW_COPY_AND_ASSIGN(TraceEventSyntheticDelayTest);
+};
+
+// STATIC mode: every invocation waits out the full target duration.
+TEST_F(TraceEventSyntheticDelayTest, StaticDelay) {
+  TraceEventSyntheticDelay* delay = ConfigureDelay("test.Delay");
+  delay->SetMode(TraceEventSyntheticDelay::STATIC);
+  EXPECT_GE(TestFunction(), kTargetDurationMs);
+}
+
+// ONE_SHOT mode: only the first invocation delays, until re-armed by
+// setting a new target duration.
+TEST_F(TraceEventSyntheticDelayTest, OneShotDelay) {
+  TraceEventSyntheticDelay* delay = ConfigureDelay("test.Delay");
+  delay->SetMode(TraceEventSyntheticDelay::ONE_SHOT);
+  EXPECT_GE(TestFunction(), kTargetDurationMs);
+  EXPECT_LT(TestFunction(), kShortDurationMs);
+
+  delay->SetTargetDuration(
+      MonoDelta::FromMilliseconds(kTargetDurationMs));
+  EXPECT_GE(TestFunction(), kTargetDurationMs);
+}
+
+// ALTERNATING mode: every other invocation delays.
+TEST_F(TraceEventSyntheticDelayTest, AlternatingDelay) {
+  TraceEventSyntheticDelay* delay = ConfigureDelay("test.Delay");
+  delay->SetMode(TraceEventSyntheticDelay::ALTERNATING);
+  EXPECT_GE(TestFunction(), kTargetDurationMs);
+  EXPECT_LT(TestFunction(), kShortDurationMs);
+  EXPECT_GE(TestFunction(), kTargetDurationMs);
+  EXPECT_LT(TestFunction(), kShortDurationMs);
+}
+
+// Async delay: BEGIN returns immediately; the wait happens at END.
+TEST_F(TraceEventSyntheticDelayTest, AsyncDelay) {
+  ConfigureDelay("test.AsyncDelay");
+  EXPECT_LT(AsyncTestFunctionBegin(), kShortDurationMs);
+  EXPECT_GE(AsyncTestFunctionEnd(), kTargetDurationMs / 2);
+}
+
+// If the target duration already elapsed before END, END is fast too.
+TEST_F(TraceEventSyntheticDelayTest, AsyncDelayExceeded) {
+  ConfigureDelay("test.AsyncDelay");
+  EXPECT_LT(AsyncTestFunctionBegin(), kShortDurationMs);
+  AdvanceTime(MonoDelta::FromMilliseconds(kTargetDurationMs));
+  EXPECT_LT(AsyncTestFunctionEnd(), kShortDurationMs);
+}
+
+// END without a matching BEGIN must not delay.
+TEST_F(TraceEventSyntheticDelayTest, AsyncDelayNoActivation) {
+  ConfigureDelay("test.AsyncDelay");
+  EXPECT_LT(AsyncTestFunctionEnd(), kShortDurationMs);
+}
+
+// Nested BEGIN/END pairs: only the outermost END pays the delay.
+TEST_F(TraceEventSyntheticDelayTest, AsyncDelayNested) {
+  ConfigureDelay("test.AsyncDelay");
+  EXPECT_LT(AsyncTestFunctionBegin(), kShortDurationMs);
+  EXPECT_LT(AsyncTestFunctionBegin(), kShortDurationMs);
+  EXPECT_LT(AsyncTestFunctionEnd(), kShortDurationMs);
+  EXPECT_GE(AsyncTestFunctionEnd(), kTargetDurationMs / 2);
+}
+
+// Extra ENDs are harmless; a fresh BEGIN re-arms the delay.
+TEST_F(TraceEventSyntheticDelayTest, AsyncDelayUnbalanced) {
+  ConfigureDelay("test.AsyncDelay");
+  EXPECT_LT(AsyncTestFunctionBegin(), kShortDurationMs);
+  EXPECT_GE(AsyncTestFunctionEnd(), kTargetDurationMs / 2);
+  EXPECT_LT(AsyncTestFunctionEnd(), kShortDurationMs);
+
+  EXPECT_LT(AsyncTestFunctionBegin(), kShortDurationMs);
+  EXPECT_GE(AsyncTestFunctionEnd(), kTargetDurationMs / 2);
+}
+
+// ResetTraceEventSyntheticDelays() clears a configured delay.
+TEST_F(TraceEventSyntheticDelayTest, ResetDelays) {
+  ConfigureDelay("test.Delay");
+  ResetTraceEventSyntheticDelays();
+  EXPECT_LT(TestFunction(), kShortDurationMs);
+}
+
+// BeginParallel/EndParallel: each parallel activation carries its own end
+// time; only the first EndParallel pays the full delay here.
+TEST_F(TraceEventSyntheticDelayTest, BeginParallel) {
+  TraceEventSyntheticDelay* delay = ConfigureDelay("test.AsyncDelay");
+  MonoTime end_times[2];
+  MonoTime start_time = Now();
+
+  delay->BeginParallel(&end_times[0]);
+  EXPECT_FALSE(!end_times[0].Initialized());
+
+  delay->BeginParallel(&end_times[1]);
+  EXPECT_FALSE(!end_times[1].Initialized());
+
+  delay->EndParallel(end_times[0]);
+  EXPECT_GE(Now().GetDeltaSince(start_time).ToMilliseconds(), kTargetDurationMs);
+
+  start_time = Now();
+  delay->EndParallel(end_times[1]);
+  EXPECT_LT(Now().GetDeltaSince(start_time).ToMilliseconds(), kShortDurationMs);
+}
+
+// VLOG_AND_TRACE emits to the trace buffer (with file/line info) whether or
+// not the corresponding VLOG level is enabled.
+TEST_F(TraceTest, TestVLogTrace) {
+  for (FLAGS_v = 0; FLAGS_v <= 1; FLAGS_v++) {
+    TraceLog* tl = TraceLog::GetInstance();
+    tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString),
+                   TraceLog::RECORDING_MODE,
+                   TraceLog::RECORD_CONTINUOUSLY);
+    VLOG_AND_TRACE("test", 1) << "hello world";
+    tl->SetDisabled();
+    string trace_json = TraceResultBuffer::FlushTraceLogToString();
+    ASSERT_STR_CONTAINS(trace_json, "hello world");
+    ASSERT_STR_CONTAINS(trace_json, "trace-test.cc");
+  }
+}
+
+namespace {
+// Helper whose evaluation is observable via *b, used to detect whether a
+// log/trace statement evaluated its stream arguments.
+string FunctionWithSideEffect(bool* b) {
+  *b = true;
+  return "function-result";
+}
+} // anonymous namespace
+
+// Test that, if tracing is not enabled, a VLOG_AND_TRACE doesn't evaluate its
+// arguments.
+TEST_F(TraceTest, TestVLogTraceLazyEvaluation) {
+  FLAGS_v = 0;
+  bool function_run = false;
+  // With both vlog and tracing disabled, the stream arguments must not run.
+  VLOG_AND_TRACE("test", 1) << FunctionWithSideEffect(&function_run);
+  ASSERT_FALSE(function_run);
+
+  // If we enable verbose logging, we should run the side effect even though
+  // trace logging is disabled.
+  FLAGS_v = 1;
+  VLOG_AND_TRACE("test", 1) << FunctionWithSideEffect(&function_run);
+  ASSERT_TRUE(function_run);
+}
+
+// ECHO_TO_CONSOLE mode combined with vlog: exercises the console-echo code
+// path; nothing is asserted beyond not crashing.
+TEST_F(TraceTest, TestVLogAndEchoToConsole) {
+  TraceLog* tl = TraceLog::GetInstance();
+  tl->SetEnabled(CategoryFilter(CategoryFilter::kDefaultCategoryFilterString),
+                 TraceLog::RECORDING_MODE,
+                 TraceLog::ECHO_TO_CONSOLE);
+  FLAGS_v = 1;
+  VLOG_AND_TRACE("test", 1) << "hello world";
+  tl->SetDisabled();
+}
+
+} // namespace debug
+} // namespace kudu
diff --git a/src/kudu/util/trace.cc b/src/kudu/util/trace.cc
new file mode 100644
index 000000000000..bc1085cfb5bc
--- /dev/null
+++ b/src/kudu/util/trace.cc
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "kudu/util/trace.h" + +#include +#include +#include +#include +#include +#include + +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/walltime.h" +#include "kudu/util/memory/arena.h" + +namespace kudu { + +using strings::internal::SubstituteArg; + +__thread Trace* Trace::threadlocal_trace_; + +Trace::Trace() + : arena_(new ThreadSafeArena(1024, 128*1024)), + entries_head_(nullptr), + entries_tail_(nullptr) { +} + +Trace::~Trace() { +} + +// Struct which precedes each entry in the trace. +struct TraceEntry { + MicrosecondsInt64 timestamp_micros; + + // The source file and line number which generated the trace message. + const char* file_path; + int line_number; + + uint32_t message_len; + TraceEntry* next; + + // The actual trace message follows the entry header. + char* message() { + return reinterpret_cast(this) + sizeof(*this); + } +}; + +// Get the part of filepath after the last path separator. +// (Doesn't modify filepath, contrary to basename() in libgen.h.) +// Borrowed from glog. +static const char* const_basename(const char* filepath) { + const char* base = strrchr(filepath, '/'); +#ifdef OS_WINDOWS // Look for either path separator in Windows + if (!base) + base = strrchr(filepath, '\\'); +#endif + return base ? 
(base+1) : filepath; +} + + +void Trace::SubstituteAndTrace(const char* file_path, + int line_number, + StringPiece format, + const SubstituteArg& arg0, const SubstituteArg& arg1, + const SubstituteArg& arg2, const SubstituteArg& arg3, + const SubstituteArg& arg4, const SubstituteArg& arg5, + const SubstituteArg& arg6, const SubstituteArg& arg7, + const SubstituteArg& arg8, const SubstituteArg& arg9) { + const SubstituteArg* const args_array[] = { + &arg0, &arg1, &arg2, &arg3, &arg4, &arg5, &arg6, &arg7, &arg8, &arg9, nullptr + }; + + int msg_len = strings::internal::SubstitutedSize(format, args_array); + TraceEntry* entry = NewEntry(msg_len, file_path, line_number); + SubstituteToBuffer(format, args_array, entry->message()); + AddEntry(entry); +} + +TraceEntry* Trace::NewEntry(int msg_len, const char* file_path, int line_number) { + int size = sizeof(TraceEntry) + msg_len; + uint8_t* dst = reinterpret_cast(arena_->AllocateBytes(size)); + TraceEntry* entry = reinterpret_cast(dst); + entry->timestamp_micros = GetCurrentTimeMicros(); + entry->message_len = msg_len; + entry->file_path = file_path; + entry->line_number = line_number; + return entry; +} + +void Trace::AddEntry(TraceEntry* entry) { + lock_guard l(&lock_); + entry->next = nullptr; + + if (entries_tail_ != nullptr) { + entries_tail_->next = entry; + } else { + DCHECK(entries_head_ == nullptr); + entries_head_ = entry; + } + entries_tail_ = entry; +} + +void Trace::Dump(std::ostream* out, bool include_time_deltas) const { + // Gather a copy of the list of entries under the lock. This is fast + // enough that we aren't worried about stalling concurrent tracers + // (whereas doing the logging itself while holding the lock might be + // too slow, if the output stream is a file, for example). 
+ vector entries; + vector > child_traces; + { + lock_guard l(&lock_); + for (TraceEntry* cur = entries_head_; + cur != nullptr; + cur = cur->next) { + entries.push_back(cur); + } + + child_traces = child_traces_; + } + + // Save original flags. + std::ios::fmtflags save_flags(out->flags()); + + int64_t prev_usecs = 0; + for (TraceEntry* e : entries) { + // Log format borrowed from glog/logging.cc + time_t secs_since_epoch = e->timestamp_micros / 1000000; + int usecs = e->timestamp_micros % 1000000; + struct tm tm_time; + localtime_r(&secs_since_epoch, &tm_time); + + int64_t usecs_since_prev = 0; + if (prev_usecs != 0) { + usecs_since_prev = e->timestamp_micros - prev_usecs; + } + prev_usecs = e->timestamp_micros; + + using std::setw; + out->fill('0'); + + *out << setw(2) << (1 + tm_time.tm_mon) + << setw(2) << tm_time.tm_mday + << ' ' + << setw(2) << tm_time.tm_hour << ':' + << setw(2) << tm_time.tm_min << ':' + << setw(2) << tm_time.tm_sec << '.' + << setw(6) << usecs << ' '; + if (include_time_deltas) { + out->fill(' '); + *out << "(+" << setw(6) << usecs_since_prev << "us) "; + } + *out << const_basename(e->file_path) << ':' << e->line_number + << "] "; + out->write(reinterpret_cast(e) + sizeof(TraceEntry), + e->message_len); + *out << std::endl; + } + + for (scoped_refptr child_trace : child_traces) { + *out << "Related trace:" << std::endl; + *out << child_trace->DumpToString(include_time_deltas); + } + + // Restore stream flags. 
+ out->flags(save_flags); +} + +string Trace::DumpToString(bool include_time_deltas) const { + std::stringstream s; + Dump(&s, include_time_deltas); + return s.str(); +} + +void Trace::DumpCurrentTrace() { + Trace* t = CurrentTrace(); + if (t == nullptr) { + LOG(INFO) << "No trace is currently active."; + return; + } + t->Dump(&std::cerr, true); +} + +void Trace::AddChildTrace(Trace* child_trace) { + lock_guard l(&lock_); + scoped_refptr ptr(child_trace); + child_traces_.push_back(ptr); +} + +} // namespace kudu diff --git a/src/kudu/util/trace.h b/src/kudu/util/trace.h new file mode 100644 index 000000000000..ff44edcd87e0 --- /dev/null +++ b/src/kudu/util/trace.h @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_TRACE_H +#define KUDU_UTIL_TRACE_H + +#include +#include +#include + +#include "kudu/gutil/macros.h" +#include "kudu/gutil/strings/stringpiece.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/gutil/ref_counted.h" +#include "kudu/gutil/threading/thread_collision_warner.h" +#include "kudu/util/locks.h" + +// Adopt a Trace on the current thread for the duration of the current +// scope. 
The old current Trace is restored when the scope is exited. +// +// 't' should be a Trace* pointer. +#define ADOPT_TRACE(t) kudu::ScopedAdoptTrace _adopt_trace(t); + +// Issue a trace message, if tracing is enabled in the current thread. +// See Trace::SubstituteAndTrace for arguments. +// Example: +// TRACE("Acquired timestamp $0", timestamp); +#define TRACE(format, substitutions...) \ + do { \ + kudu::Trace* _trace = Trace::CurrentTrace(); \ + if (_trace) { \ + _trace->SubstituteAndTrace(__FILE__, __LINE__, (format), \ + ##substitutions); \ + } \ + } while (0); + +// Like the above, but takes the trace pointer as an explicit argument. +#define TRACE_TO(trace, format, substitutions...) \ + (trace)->SubstituteAndTrace(__FILE__, __LINE__, (format), ##substitutions) + + +namespace kudu { + +class ThreadSafeArena; +struct TraceEntry; + +// A trace for a request or other process. This supports collecting trace entries +// from a number of threads, and later dumping the results to a stream. +// +// Callers should generally not add trace messages directly using the public +// methods of this class. Rather, the TRACE(...) macros defined above should +// be used such that file/line numbers are automatically included, etc. +// +// This class is thread-safe. +class Trace : public RefCountedThreadSafe { + public: + Trace(); + + // Logs a message into the trace buffer. + // + // See strings::Substitute for details. + // + // N.B.: the file path passed here is not copied, so should be a static + // constant (eg __FILE__). 
+ void SubstituteAndTrace(const char* filepath, int line_number, + StringPiece format, + const strings::internal::SubstituteArg& arg0 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg1 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg2 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg3 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg4 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg5 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg6 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg7 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg8 = + strings::internal::SubstituteArg::NoArg, + const strings::internal::SubstituteArg& arg9 = + strings::internal::SubstituteArg::NoArg); + + // Dump the trace buffer to the given output stream. + // + // If 'include_time_deltas' is true, calculates and prints the difference between + // successive trace messages. + void Dump(std::ostream* out, bool include_time_deltas) const; + + // Dump the trace buffer as a string. + std::string DumpToString(bool include_time_deltas) const; + + // Attaches the given trace which will get appended at the end when Dumping. + void AddChildTrace(Trace* child_trace); + + // Return the current trace attached to this thread, if there is one. + static Trace* CurrentTrace() { + return threadlocal_trace_; + } + + // Simple function to dump the current trace to stderr, if one is + // available. This is meant for usage when debugging in gdb via + // 'call kudu::Trace::DumpCurrentTrace();'. + static void DumpCurrentTrace(); + + private: + friend class ScopedAdoptTrace; + friend class RefCountedThreadSafe; + ~Trace(); + + // The current trace for this thread. 
Threads should only set this using + // using ScopedAdoptTrace, which handles reference counting the underlying + // object. + static __thread Trace* threadlocal_trace_; + + // Allocate a new entry from the arena, with enough space to hold a + // message of length 'len'. + TraceEntry* NewEntry(int len, const char* file_path, int line_number); + + // Add the entry to the linked list of entries. + void AddEntry(TraceEntry* entry); + + gscoped_ptr arena_; + + // Lock protecting the entries linked list. + mutable simple_spinlock lock_; + // The head of the linked list of entries (allocated inside arena_) + TraceEntry* entries_head_; + // The tail of the linked list of entries (allocated inside arena_) + TraceEntry* entries_tail_; + + std::vector > child_traces_; + + DISALLOW_COPY_AND_ASSIGN(Trace); +}; + +// Adopt a Trace object into the current thread for the duration +// of this object. +// This should only be used on the stack (and thus created and destroyed +// on the same thread) +class ScopedAdoptTrace { + public: + explicit ScopedAdoptTrace(Trace* t) : + old_trace_(Trace::threadlocal_trace_) { + Trace::threadlocal_trace_ = t; + if (t) { + t->AddRef(); + } + DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); + } + + ~ScopedAdoptTrace() { + if (Trace::threadlocal_trace_) { + Trace::threadlocal_trace_->Release(); + } + Trace::threadlocal_trace_ = old_trace_; + DFAKE_SCOPED_LOCK_THREAD_LOCKED(ctor_dtor_); + } + + private: + DFAKE_MUTEX(ctor_dtor_); + Trace* old_trace_; + + DISALLOW_COPY_AND_ASSIGN(ScopedAdoptTrace); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_TRACE_H */ diff --git a/src/kudu/util/url-coding-test.cc b/src/kudu/util/url-coding-test.cc new file mode 100644 index 000000000000..37b81a4bfab7 --- /dev/null +++ b/src/kudu/util/url-coding-test.cc @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "kudu/util/url-coding.h" + +using namespace std; // NOLINT(*) + +namespace kudu { + +// Tests encoding/decoding of input. If expected_encoded is non-empty, the +// encoded string is validated against it. 
+void TestUrl(const string& input, const string& expected_encoded, bool hive_compat) { + string intermediate; + UrlEncode(input, &intermediate, hive_compat); + string output; + if (!expected_encoded.empty()) { + EXPECT_EQ(expected_encoded, intermediate); + } + EXPECT_TRUE(UrlDecode(intermediate, &output, hive_compat)); + EXPECT_EQ(input, output); + + // Convert string to vector and try that also + vector input_vector; + input_vector.resize(input.size()); + if (!input.empty()) { + memcpy(&input_vector[0], input.c_str(), input.size()); + } + string intermediate2; + UrlEncode(input_vector, &intermediate2, hive_compat); + EXPECT_EQ(intermediate, intermediate2); +} + +void TestBase64(const string& input, const string& expected_encoded) { + string intermediate; + Base64Encode(input, &intermediate); + string output; + if (!expected_encoded.empty()) { + EXPECT_EQ(intermediate, expected_encoded); + } + EXPECT_TRUE(Base64Decode(intermediate, &output)); + EXPECT_EQ(input, output); + + // Convert string to vector and try that also + vector input_vector; + input_vector.resize(input.size()); + memcpy(&input_vector[0], input.c_str(), input.size()); + string intermediate2; + Base64Encode(input_vector, &intermediate2); + EXPECT_EQ(intermediate, intermediate2); +} + +// Test URL encoding. Check that the values that are put in are the +// same that come out. 
+TEST(UrlCodingTest, Basic) { + string input = "ABCDEFGHIJKLMNOPQRSTUWXYZ1234567890~!@#$%^&*()<>?,./:\";'{}|[]\\_+-="; + TestUrl(input, "", false); + TestUrl(input, "", true); +} + +TEST(UrlCodingTest, HiveExceptions) { + TestUrl(" +", " +", true); +} + +TEST(UrlCodingTest, BlankString) { + TestUrl("", "", false); + TestUrl("", "", true); +} + +TEST(UrlCodingTest, PathSeparators) { + TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", false); + TestUrl("/home/impala/directory/", "%2Fhome%2Fimpala%2Fdirectory%2F", true); +} + +TEST(Base64Test, Basic) { + TestBase64("a", "YQ=="); + TestBase64("ab", "YWI="); + TestBase64("abc", "YWJj"); + TestBase64("abcd", "YWJjZA=="); + TestBase64("abcde", "YWJjZGU="); + TestBase64("abcdef", "YWJjZGVm"); +} + +TEST(HtmlEscapingTest, Basic) { + string before = "&"; + stringstream after; + EscapeForHtml(before, &after); + EXPECT_EQ(after.str(), "<html><body>&amp"); +} + +} // namespace kudu diff --git a/src/kudu/util/url-coding.cc b/src/kudu/util/url-coding.cc new file mode 100644 index 000000000000..4aae48e54bee --- /dev/null +++ b/src/kudu/util/url-coding.cc @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kudu/util/url-coding.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +using std::string; +using std::vector; +using namespace boost::archive::iterators; // NOLINT(*) + +namespace kudu { + +// Hive selectively encodes characters. This is the whitelist of +// characters it will encode. +// See common/src/java/org/apache/hadoop/hive/common/FileUtils.java +// in the Hive source code for the source of this list. +static boost::function HiveShouldEscape = boost::is_any_of("\"#%\\*/:=?\u00FF"); // NOLINT(*) + +// It is more convenient to maintain the complement of the set of +// characters to escape when not in Hive-compat mode. +static boost::function ShouldNotEscape = boost::is_any_of("-_.~"); // NOLINT(*) + +static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) { + (*out).reserve(in_len); + std::stringstream ss; + for (int i = 0; i < in_len; ++i) { + const char ch = in[i]; + // Escape the character iff a) we are in Hive-compat mode and the + // character is in the Hive whitelist or b) we are not in + // Hive-compat mode, and the character is not alphanumeric or one + // of the four commonly excluded characters. 
+ if ((hive_compat && HiveShouldEscape(ch)) || + (!hive_compat && !(isalnum(ch) || ShouldNotEscape(ch)))) { + ss << '%' << std::uppercase << std::hex << static_cast(ch); + } else { + ss << ch; + } + } + + (*out) = ss.str(); +} + +void UrlEncode(const vector& in, string* out, bool hive_compat) { + if (in.empty()) { + *out = ""; + } else { + UrlEncode(reinterpret_cast(&in[0]), in.size(), out, hive_compat); + } +} + +void UrlEncode(const string& in, string* out, bool hive_compat) { + UrlEncode(in.c_str(), in.size(), out, hive_compat); +} + +string UrlEncodeToString(const std::string& in, bool hive_compat) { + string ret; + UrlEncode(in, &ret, hive_compat); + return ret; +} + +// Adapted from +// http://www.boost.org/doc/libs/1_40_0/doc/html/boost_asio/ +// example/http/server3/request_handler.cpp +// See http://www.boost.org/LICENSE_1_0.txt for license for this method. +bool UrlDecode(const string& in, string* out, bool hive_compat) { + out->clear(); + out->reserve(in.size()); + for (size_t i = 0; i < in.size(); ++i) { + if (in[i] == '%') { + if (i + 3 <= in.size()) { + int value = 0; + std::istringstream is(in.substr(i + 1, 2)); + if (is >> std::hex >> value) { + (*out) += static_cast(value); + i += 2; + } else { + return false; + } + } else { + return false; + } + } else if (!hive_compat && in[i] == '+') { // Hive does not encode ' ' as '+' + (*out) += ' '; + } else { + (*out) += in[i]; + } + } + return true; +} + +static inline void Base64Encode(const char* in, int in_len, std::stringstream* out) { + typedef base64_from_binary > base64_encode; + // Base64 encodes 8 byte chars as 6 bit values. 
+ std::stringstream::pos_type len_before = out->tellp(); + copy(base64_encode(in), base64_encode(in + in_len), std::ostream_iterator(*out)); + int bytes_written = out->tellp() - len_before; + // Pad with = to make it valid base64 encoded string + int num_pad = bytes_written % 4; + if (num_pad != 0) { + num_pad = 4 - num_pad; + for (int i = 0; i < num_pad; ++i) { + (*out) << "="; + } + } + DCHECK_EQ(out->str().size() % 4, 0); +} + +void Base64Encode(const vector& in, string* out) { + if (in.empty()) { + *out = ""; + } else { + std::stringstream ss; + Base64Encode(in, &ss); + *out = ss.str(); + } +} + +void Base64Encode(const vector& in, std::stringstream* out) { + if (!in.empty()) { + // Boost does not like non-null terminated strings + string tmp(reinterpret_cast(&in[0]), in.size()); + Base64Encode(tmp.c_str(), tmp.size(), out); + } +} + +void Base64Encode(const string& in, string* out) { + std::stringstream ss; + Base64Encode(in.c_str(), in.size(), &ss); + *out = ss.str(); +} + +void Base64Encode(const string& in, std::stringstream* out) { + Base64Encode(in.c_str(), in.size(), out); +} + +bool Base64Decode(const string& in, string* out) { + typedef transform_width, 8, 6> base64_decode; + string tmp = in; + // Replace padding with base64 encoded NULL + replace(tmp.begin(), tmp.end(), '=', 'A'); + try { + *out = string(base64_decode(tmp.begin()), base64_decode(tmp.end())); + } catch(std::exception& e) { + return false; + } + + // Remove trailing '\0' that were added as padding. Since \0 is special, + // the boost functions get confused so do this manually. 
+ int num_padded_chars = 0; + for (int i = out->size() - 1; i >= 0; --i) { + if ((*out)[i] != '\0') break; + ++num_padded_chars; + } + out->resize(out->size() - num_padded_chars); + return true; +} + +void EscapeForHtml(const string& in, std::stringstream* out) { + DCHECK(out != nullptr); + for (const char& c : in) { + switch (c) { + case '<': (*out) << "<"; + break; + case '>': (*out) << ">"; + break; + case '&': (*out) << "&"; + break; + default: (*out) << c; + } + } +} + +std::string EscapeForHtmlToString(const std::string& in) { + std::stringstream str; + EscapeForHtml(in, &str); + return str.str(); +} + +} // namespace kudu diff --git a/src/kudu/util/url-coding.h b/src/kudu/util/url-coding.h new file mode 100644 index 000000000000..c2208a43c7ed --- /dev/null +++ b/src/kudu/util/url-coding.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_URL_CODING_H +#define UTIL_URL_CODING_H + +#include + +#include +#include + +namespace kudu { + +// Utility method to URL-encode a string (that is, replace special +// characters with %). +// The optional parameter hive_compat controls whether we mimic Hive's +// behaviour when encoding a string, which is only to encode certain +// characters (excluding, e.g., ' ') +void UrlEncode(const std::string& in, std::string* out, bool hive_compat = false); +void UrlEncode(const std::vector& in, std::string* out, + bool hive_compat = false); +std::string UrlEncodeToString(const std::string& in, bool hive_compat = false); + +// Utility method to decode a string that was URL-encoded. Returns +// true unless the string could not be correctly decoded. +// The optional parameter hive_compat controls whether or not we treat +// the strings as encoded by Hive, which means selectively ignoring +// certain characters like ' '. +bool UrlDecode(const std::string& in, std::string* out, bool hive_compat = false); + +// Utility method to encode input as base-64 encoded. This is not +// very performant (multiple string copies) and should not be used +// in a hot path. +void Base64Encode(const std::vector& in, std::string* out); +void Base64Encode(const std::vector& in, std::stringstream* out); +void Base64Encode(const std::string& in, std::string* out); +void Base64Encode(const std::string& in, std::stringstream* out); + +// Utility method to decode base64 encoded strings. Also not extremely +// performant. 
+// Returns true unless the string could not be correctly decoded. +bool Base64Decode(const std::string& in, std::string* out); + +// Replaces &, < and > with &, < and > respectively. This is +// not the full set of required encodings, but one that should be +// added to on a case-by-case basis. Slow, since it necessarily +// inspects each character in turn, and copies them all to *out; use +// judiciously. +void EscapeForHtml(const std::string& in, std::stringstream* out); + +// Same as above, but returns a string. +std::string EscapeForHtmlToString(const std::string& in); + +} // namespace kudu + +#endif diff --git a/src/kudu/util/user-test.cc b/src/kudu/util/user-test.cc new file mode 100644 index 000000000000..8dfbe14e300f --- /dev/null +++ b/src/kudu/util/user-test.cc @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "kudu/util/status.h" +#include "kudu/util/test_util.h" +#include "kudu/util/user.h" + +namespace kudu { + +using std::string; + +class TestUser : public KuduTest { +}; + +// Validate that the current username is non-empty. 
+TEST_F(TestUser, TestNonEmpty) { + string username; + ASSERT_TRUE(username.empty()); + ASSERT_OK(GetLoggedInUser(&username)); + ASSERT_FALSE(username.empty()); + LOG(INFO) << "Name of the current user is: " << username; +} + +} // namespace kudu diff --git a/src/kudu/util/user.cc b/src/kudu/util/user.cc new file mode 100644 index 000000000000..c8a286841d42 --- /dev/null +++ b/src/kudu/util/user.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "kudu/util/user.h" + +#include +#include +#include +#include + +#include + +#include + +#include "kudu/gutil/gscoped_ptr.h" +#include "kudu/util/errno.h" +#include "kudu/util/status.h" + +using std::string; + +namespace kudu { + +Status GetLoggedInUser(string* user_name) { + DCHECK(user_name != nullptr); + + struct passwd pwd; + struct passwd *result; + + size_t bufsize = sysconf(_SC_GETPW_R_SIZE_MAX); + if (bufsize == -1) { // Value was indeterminate. + bufsize = 16384; // Should be more than enough, per the man page. 
+ } + + gscoped_ptr buf(static_cast(malloc(bufsize))); + if (buf.get() == nullptr) { + return Status::RuntimeError("Malloc failed", ErrnoToString(errno), errno); + } + + int ret = getpwuid_r(getuid(), &pwd, buf.get(), bufsize, &result); + if (result == nullptr) { + if (ret == 0) { + return Status::NotFound("Current logged-in user not found! This is an unexpected error."); + } else { + // Errno in ret + return Status::RuntimeError("Error calling getpwuid_r()", ErrnoToString(ret), ret); + } + } + + *user_name = pwd.pw_name; + + return Status::OK(); +} + +} // namespace kudu diff --git a/src/kudu/util/user.h b/src/kudu/util/user.h new file mode 100644 index 000000000000..6839a81d5f4c --- /dev/null +++ b/src/kudu/util/user.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_USER_H +#define KUDU_UTIL_USER_H + +#include + +#include "kudu/util/status.h" + +namespace kudu { + +// Get current logged-in user with getpwuid_r(). +// user name is written to user_name. 
+Status GetLoggedInUser(std::string* user_name); + +} // namespace kudu + +#endif // KUDU_UTIL_USER_H diff --git a/src/kudu/util/version_info.cc b/src/kudu/util/version_info.cc new file mode 100644 index 000000000000..2ea437ff8e56 --- /dev/null +++ b/src/kudu/util/version_info.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "kudu/util/version_info.h" + +#include + +#include "kudu/generated/version_defines.h" +#include "kudu/gutil/strings/substitute.h" +#include "kudu/util/version_info.pb.h" + +using std::string; + +namespace kudu { + +string VersionInfo::GetGitHash() { + string ret = KUDU_GIT_HASH; + if (!KUDU_BUILD_CLEAN_REPO) { + ret += "-dirty"; + } + return ret; +} + +string VersionInfo::GetShortVersionString() { + return strings::Substitute("kudu $0 (rev $1)", + KUDU_VERSION_STRING, + GetGitHash()); +} + +string VersionInfo::GetAllVersionInfo() { + string ret = strings::Substitute( + "kudu $0\n" + "revision $1\n" + "build type $2\n" + "built by $3 at $4 on $5", + KUDU_VERSION_STRING, + GetGitHash(), + KUDU_BUILD_TYPE, + KUDU_BUILD_USERNAME, + KUDU_BUILD_TIMESTAMP, + KUDU_BUILD_HOSTNAME); + if (strlen(KUDU_BUILD_ID) > 0) { + strings::SubstituteAndAppend(&ret, "\nbuild id $0", KUDU_BUILD_ID); + } +#ifdef ADDRESS_SANITIZER + ret += "\nASAN enabled"; +#endif +#ifdef THREAD_SANITIZER + ret += "\nTSAN enabled"; +#endif + return ret; +} + +void VersionInfo::GetVersionInfoPB(VersionInfoPB* pb) { + pb->set_git_hash(KUDU_GIT_HASH); + pb->set_build_hostname(KUDU_BUILD_HOSTNAME); + pb->set_build_timestamp(KUDU_BUILD_TIMESTAMP); + pb->set_build_username(KUDU_BUILD_USERNAME); + pb->set_build_clean_repo(KUDU_BUILD_CLEAN_REPO); + pb->set_build_id(KUDU_BUILD_ID); + pb->set_build_type(KUDU_BUILD_TYPE); + pb->set_version_string(KUDU_VERSION_STRING); +} + +} // namespace kudu diff --git a/src/kudu/util/version_info.h b/src/kudu/util/version_info.h new file mode 100644 index 000000000000..5bda97e8eddd --- /dev/null +++ b/src/kudu/util/version_info.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_VERSION_INFO_H +#define KUDU_UTIL_VERSION_INFO_H + +#include + +#include "kudu/gutil/macros.h" + +namespace kudu { + +class VersionInfoPB; + +// Static functions related to fetching information about the current build. +class VersionInfo { + public: + // Get a short version string ("kudu 1.2.3 (rev abcdef...)") + static std::string GetShortVersionString(); + + // Get a multi-line string including version info, build time, etc. + static std::string GetAllVersionInfo(); + + // Set the version info in 'pb'. + static void GetVersionInfoPB(VersionInfoPB* pb); + private: + // Get the git hash for this build. If the working directory was dirty when + // Kudu was built, also appends "-dirty". + static std::string GetGitHash(); + + DISALLOW_IMPLICIT_CONSTRUCTORS(VersionInfo); +}; + +} // namespace kudu +#endif /* KUDU_UTIL_VERSION_INFO_H */ diff --git a/src/kudu/util/version_info.proto b/src/kudu/util/version_info.proto new file mode 100644 index 000000000000..b543d475241d --- /dev/null +++ b/src/kudu/util/version_info.proto @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package kudu; + +option java_package = "org.kududb"; + +// Information about the build environment, configuration, etc. +message VersionInfoPB { + optional string git_hash = 1; + optional string build_hostname = 2; + optional string build_timestamp = 3; + optional string build_username = 4; + optional bool build_clean_repo = 5; + optional string build_id = 6; + optional string build_type = 7; + optional string version_string = 8; +} diff --git a/src/kudu/util/web_callback_registry.h b/src/kudu/util/web_callback_registry.h new file mode 100644 index 000000000000..0c143f402c90 --- /dev/null +++ b/src/kudu/util/web_callback_registry.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. +#ifndef KUDU_UTIL_WEB_CALLBACK_REGISTRY_H +#define KUDU_UTIL_WEB_CALLBACK_REGISTRY_H + +#include +#include +#include + +namespace kudu { + +// Interface for registering webserver callbacks. +class WebCallbackRegistry { + public: + typedef std::map ArgumentMap; + + struct WebRequest { + // The query string, parsed into key/value argument pairs. + ArgumentMap parsed_args; + + // The raw query string passed in the URL. May be empty. + std::string query_string; + + // The method (POST/GET/etc). + std::string request_method; + + // In the case of a POST, the posted data. + std::string post_data; + }; + + typedef boost::function + PathHandlerCallback; + + virtual ~WebCallbackRegistry() {} + + // Register a callback for a URL path. Path should not include the + // http://hostname/ prefix. If is_styled is true, the page is meant to be for + // people to look at and is styled. If false, it is meant to be for machines to + // scrape. If is_on_nav_bar is true, a link to this page is + // printed in the navigation bar at the top of each debug page. Otherwise the + // link does not appear, and the page is rendered without HTML headers and + // footers. + // The first registration's choice of is_styled overrides all + // subsequent registrations for that URL. 
+ virtual void RegisterPathHandler(const std::string& path, const std::string& alias, + const PathHandlerCallback& callback, + bool is_styled = true, bool is_on_nav_bar = true) = 0; +}; + +} // namespace kudu + +#endif /* KUDU_UTIL_WEB_CALLBACK_REGISTRY_H */ diff --git a/thirdparty/.gitignore b/thirdparty/.gitignore new file mode 100644 index 000000000000..4d85033c6124 --- /dev/null +++ b/thirdparty/.gitignore @@ -0,0 +1,31 @@ +cmake-* +crcutil-* +gflags-* +glog-* +gmock-* +gperftools-* +gtest-* +libev-* +libunwind-* +llvm-* +protobuf-* +snappy-* +zlib-* +curl-* +gcovr-* +rapidjson-* +squeasel-* +installed/ +installed-deps/ +installed-deps-tsan/ +install_manifest.txt +.build-hash +google-styleguide* +*.a +kudu-trace-viewer-* +clang-* +bitshuffle-* +lz4-lz4-* +nvml-* +python-* +gcc-* diff --git a/thirdparty/LICENSE.txt b/thirdparty/LICENSE.txt new file mode 100644 index 000000000000..c66b2e257b2d --- /dev/null +++ b/thirdparty/LICENSE.txt @@ -0,0 +1,630 @@ +This directory contains scripts which download and install several third-party +dependencies of Kudu. Most of these dependencies are statically linked into +Kudu binaries, though a few are used only at build-time. + +-------------------------------------------------------------------------------- +thirdparty/lz4-svn/: BSD 2-clause license +Source: https://github.com/Cyan4973/lz4 + + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2012, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +thirdparty/gflags-*/: BSD 3-clause dependency +source: https://github.com/gflags/gflags + + Copyright (c) 2006, Google Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +thirdparty/glog-*/: BSD 3-clause license +Source: https://github.com/google/glog + + Copyright (c) 2008, Google Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + A function gettimeofday in utilities.cc is based on + + http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd + + The license of this code is: + + Copyright (c) 2003-2008, Jouni Malinen and contributors + All Rights Reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name(s) of the above-listed copyright holder(s) nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +thirdparty/gperftools-*/: BSD 3-clause +Source: https://github.com/gperftools/gperftools + + Copyright (c) 2005, Google Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +thirdparty/libev-*/: BSD 2-clause +Source: http://software.schmorp.de/pkg/libev.html + + All files in libev are + Copyright (c)2007,2008,2009,2010,2011,2012 Marc Alexander Lehmann. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Alternatively, the contents of this package may be used under the terms + of the GNU General Public License ("GPL") version 2 or any later version, + in which case the provisions of the GPL are applicable instead of the + above. If you wish to allow the use of your version of this package only + under the terms of the GPL and not to allow others to use your version of + this file under the BSD license, indicate your decision by deleting the + provisions above and replace them with the notice and other provisions + required by the GPL in this and the other files of this package. If you do + not delete the provisions above, a recipient may use your version of this + file under either the BSD or the GPL. 
+ +-------------------------------------------------------------------------------- +thirdparty/squeasel-*/: MIT license +Source: https://github.com/cloudera/squeasel + + Copyright (c) 2004-2013 Sergey Lyubka + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +NOTE: 'squeasel' is derived from another open source project called 'mongoose'. +That project was originally licensed under the above MIT license, but later +changed to a GPL license. squeasel was forked from a revision of mongoose +obtained prior to the license change. + +-------------------------------------------------------------------------------- +thirdparty/protobuf-*/: BSD 3-clause +Source: https://github.com/google/protobuf + + Copyright 2008, Google Inc. + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Code generated by the Protocol Buffer compiler is owned by the owner + of the input file used when generating it. This code is not + standalone and requires a support library to be linked with it. This + support library is itself covered by the above license. 
+ +-------------------------------------------------------------------------------- +thirdparty/rapidjson-*/: MIT License +Source: https://github.com/miloyip/rapidjson + + Copyright (C) 2011 Milo Yip + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +-------------------------------------------------------------------------------- +thirdparty/snappy-*/: BSD 3-clause +Source: https://github.com/google/snappy + + Copyright 2011, Google Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +thirdparty/bitshuffle-*: MIT license +Source: https://github.com/kiyo-masui/bitshuffle + + Bitshuffle - Filter for improving compression of typed binary data. + + Copyright (c) 2014 Kiyoshi Masui (kiyo@physics.ubc.ca) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +-------------------------------------------------------------------------------- +thirdparty/kudu-trace-viewer-*/: BSD 3-clause +Source: https://github.com/catapult-project/catapult + + Copyright (c) 2012 The Chromium Authors. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- +thirdparty/zlib-*/: Zlib license +Source: http://www.zlib.net/ + + (C) 1995-2013 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+ + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + +-------------------------------------------------------------------------------- +thirdparty/llvm-*: LLVM Release License (BSD 3-clause) + + ============================================================================== + LLVM Release License + ============================================================================== + University of Illinois/NCSA + Open Source License + + Copyright (c) 2003-2013 University of Illinois at Urbana-Champaign. + All rights reserved. + + Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal with + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + of the Software, and to permit persons to whom the Software is furnished to do + so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + +-------------------------------------------------------------------------------- +thirdparty/curl-*/: MIT/X derivative license +Source: http://curl.haxx.se/ + + Copyright (c) 1996 - 2016, Daniel Stenberg, daniel@haxx.se. + + All rights reserved. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT + OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + Except as contained in this notice, the name of a copyright holder + shall not be used in advertising or otherwise to promote the sale, use + or other dealings in this Software without prior written authorization + of the copyright holder. 
+ + +-------------------------------------------------------------------------------- +thirdparty/crcutil-*/: Apache 2.0 license +Source: https://code.google.com/archive/p/crcutil/ + +-------------------------------------------------------------------------------- +thirdparty/libunwind-*/: X11 License +Source: http://savannah.nongnu.org/projects/libunwind/ + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- +thirdparty/nvml-*/: BSD 3-clause license +Source: https://github.com/pmem/nvml + + Copyright (c) 2014-2015, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + Everything in this source tree is covered by the previous license + with the following exceptions: + + * src/jemalloc has its own (somewhat similar) license contained in + src/jemalloc/COPYING. + + * utils/cstyle (used only during development) licensed under CDDL. + +nvml bundles jemalloc which is licensed under the BSD 2-clause license: + + Copyright (C) 2002-2014 Jason Evans . + All rights reserved. + Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. + Copyright (C) 2009-2014 Facebook, Inc. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + 1. 
Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +================================================================================ +BUILD-ONLY DEPENDENCIES +================================================================================ + +The following dependencies are used at build time, and do not link with or +become included in binary distributions of Kudu. We do not reproduce the +full text of the licenses here, since neither source nor binary distributions +bundle these projects. 
+ +-------------------------------------------------------------------------------- +thirdparty/cmake-*/: BSD 3-clause license +source: http://cmake.org/ +NOTE: build-time dependency only + +-------------------------------------------------------------------------------- +thirdparty/gcovr-*/: BSD 3-clause license +source: https://github.com/gcovr/gcovr +NOTE: optional build-time dependency + +-------------------------------------------------------------------------------- +thirdparty/gcc-*/: GPL version 2 +Source: https://gcc.gnu.org +NOTE: build-time dependency, not linked or bundled. + +-------------------------------------------------------------------------------- +thirdparty/gmock-*/: BSD 3-clause +Source: https://github.com/google/googletest +NOTE: build-time dependency + +-------------------------------------------------------------------------------- +thirdparty/google-styleguide-*/: CC-By 3.0 (http://creativecommons.org/licenses/by/3.0/) +Source: https://github.com/google/styleguide +NOTE: optional build-time dependency, not linked or bundled. + +-------------------------------------------------------------------------------- +thirdparty/python-*: Python 2.7 license (https://www.python.org/download/releases/2.7/license/) +Source: http://www.python.org/ +NOTE: optional build-time dependency, not linked or bundled. + diff --git a/thirdparty/README.txt b/thirdparty/README.txt new file mode 100644 index 000000000000..f19a354218da --- /dev/null +++ b/thirdparty/README.txt @@ -0,0 +1,6 @@ +This directory contains scripts which download and install several third-party +dependencies of Kudu. Most of these dependencies are statically linked into +Kudu binaries, though a few are used only at build-time. + +See LICENSE.txt in this file for information on the licensing of each of these +dependencies. 
diff --git a/thirdparty/boost_uuid/boost/uuid/name_generator.hpp b/thirdparty/boost_uuid/boost/uuid/name_generator.hpp new file mode 100644 index 000000000000..42473a6f4fed --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/name_generator.hpp @@ -0,0 +1,125 @@ +// Boost name_generator.hpp header file ----------------------------------------------// + +// Copyright 2010 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_UUID_NAME_GENERATOR_HPP +#define BOOST_UUID_NAME_GENERATOR_HPP + +#include +#include +#include +#include +#include // for strlen, wcslen + +#ifdef BOOST_NO_STDC_NAMESPACE +namespace std { + using ::strlen; + using ::wcslen; +} //namespace std +#endif //BOOST_NO_STDC_NAMESPACE + +namespace boost { +namespace uuids { + +// generate a name-based uuid +// TODO: add in common namesspace uuids +class name_generator { +public: + typedef uuid result_type; + + explicit name_generator(uuid const& namespace_uuid) + : namespace_uuid(namespace_uuid) + {} + + uuid operator()(const char* name) { + reset(); + process_characters(name, std::strlen(name)); + return sha_to_uuid(); + } + + uuid operator()(const wchar_t* name) { + reset(); + process_characters(name, std::wcslen(name)); + return sha_to_uuid(); + } + + template + uuid operator()(std::basic_string const& name) { + reset(); + process_characters(name.c_str(), name.length()); + return sha_to_uuid(); + } + + uuid operator()(void const* buffer, std::size_t byte_count) { + reset(); + sha.process_bytes(buffer, byte_count); + return sha_to_uuid(); + }; + +private: + // we convert all characters to uint32_t so that each + // character is 4 bytes reguardless of sizeof(char) or + // sizeof(wchar_t). 
We want the name string on any + // platform / compiler to generate the same uuid + // except for char + template + void process_characters(char_type const*const characters, size_t count) { + BOOST_ASSERT(sizeof(uint32_t) >= sizeof(char_type)); + + for (size_t i=0; i> 0) && 0xFF ); + sha.process_byte( (c >> 8) && 0xFF ); + sha.process_byte( (c >> 16) && 0xFF ); + sha.process_byte( (c >> 24) && 0xFF ); + } + } + + void process_characters(char const*const characters, size_t count) { + sha.process_bytes(characters, count); + } + + void reset() + { + sha.reset(); + sha.process_bytes(namespace_uuid.begin(), namespace_uuid.size()); + } + + uuid sha_to_uuid() + { + unsigned int digest[5]; + + sha.get_digest(digest); + + uuid u; + for (int i=0; i<4; ++i) { + *(u.begin() + i*4+0) = ((digest[i] >> 24) & 0xFF); + *(u.begin() + i*4+1) = ((digest[i] >> 16) & 0xFF); + *(u.begin() + i*4+2) = ((digest[i] >> 8) & 0xFF); + *(u.begin() + i*4+3) = ((digest[i] >> 0) & 0xFF); + } + + // set variant + // must be 0b10xxxxxx + *(u.begin()+8) &= 0xBF; + *(u.begin()+8) |= 0x80; + + // set version + // must be 0b0101xxxx + *(u.begin()+6) &= 0x5F; //0b01011111 + *(u.begin()+6) |= 0x50; //0b01010000 + + return u; + } + +private: + uuid namespace_uuid; + detail::sha1 sha; +}; + +}} // namespace boost::uuids + +#endif // BOOST_UUID_NAME_GENERATOR_HPP diff --git a/thirdparty/boost_uuid/boost/uuid/nil_generator.hpp b/thirdparty/boost_uuid/boost/uuid/nil_generator.hpp new file mode 100644 index 000000000000..c3c581837666 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/nil_generator.hpp @@ -0,0 +1,34 @@ +// Boost nil_generator.hpp header file ----------------------------------------------// + +// Copyright 2010 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. 
(See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_UUID_NIL_GENERATOR_HPP +#define BOOST_UUID_NIL_GENERATOR_HPP + +#include + +namespace boost { +namespace uuids { + +// generate a nil uuid +struct nil_generator { + typedef uuid result_type; + + uuid operator()() const { + // initialize to all zeros + uuid u = {{0}}; + return u; + } +}; + +inline uuid nil_uuid() { + return nil_generator()(); +} + +}} // namespace boost::uuids + +#endif // BOOST_UUID_NIL_GENERATOR_HPP + diff --git a/thirdparty/boost_uuid/boost/uuid/random_generator.hpp b/thirdparty/boost_uuid/boost/uuid/random_generator.hpp new file mode 100644 index 000000000000..4d11f6b5c955 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/random_generator.hpp @@ -0,0 +1,118 @@ +// Boost random_generator.hpp header file ----------------------------------------------// + +// Copyright 2010 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_UUID_RANDOM_GENERATOR_HPP +#define BOOST_UUID_RANDOM_GENERATOR_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace boost { +namespace uuids { + +// generate a random-based uuid +template +class basic_random_generator { +private: + typedef uniform_int distribution_type; + typedef variate_generator generator_type; + + struct null_deleter + { + void operator()(void const *) const {} + }; + +public: + typedef uuid result_type; + + // default constructor creates the random number generator + basic_random_generator() + : pURNG(new UniformRandomNumberGenerator) + , generator + ( pURNG.get() + , distribution_type + ( (std::numeric_limits::min)() + , (std::numeric_limits::max)() + ) + ) + { + // seed the random number generator + detail::seed(*pURNG); + } + + // keep a reference to a random number generator + // don't seed a given random 
number generator + explicit basic_random_generator(UniformRandomNumberGenerator& gen) + : pURNG(&gen, null_deleter()) + , generator + ( pURNG.get() + , distribution_type + ( (std::numeric_limits::min)() + , (std::numeric_limits::max)() + ) + ) + {} + + // keep a pointer to a random number generator + // don't seed a given random number generator + explicit basic_random_generator(UniformRandomNumberGenerator* pGen) + : pURNG(pGen, null_deleter()) + , generator + ( pURNG.get() + , distribution_type + ( (std::numeric_limits::min)() + , (std::numeric_limits::max)() + ) + ) + { + BOOST_ASSERT(pURNG); + } + + uuid operator()() + { + uuid u; + + int i=0; + unsigned long random_value = generator(); + for (uuid::iterator it=u.begin(); it!=u.end(); ++it, ++i) { + if (i==sizeof(unsigned long)) { + random_value = generator(); + i = 0; + } + + *it = ((random_value >> (i*8)) & 0xFF); + } + + // set variant + // must be 0b10xxxxxx + *(u.begin()+8) &= 0xBF; + *(u.begin()+8) |= 0x80; + + // set version + // must be 0b0100xxxx + *(u.begin()+6) &= 0x4F; //0b01001111 + *(u.begin()+6) |= 0x40; //0b01000000 + + return u; + } + +private: + shared_ptr pURNG; + generator_type generator; +}; + +typedef basic_random_generator random_generator; + +}} // namespace boost::uuids + +#endif //BOOST_UUID_RANDOM_GENERATOR_HPP diff --git a/thirdparty/boost_uuid/boost/uuid/seed_rng.hpp b/thirdparty/boost_uuid/boost/uuid/seed_rng.hpp new file mode 100644 index 000000000000..309019710790 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/seed_rng.hpp @@ -0,0 +1,262 @@ +// Boost seed_rng.hpp header file ----------------------------------------------// + +// Copyright 2007 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. 
(See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Revision History +// 09 Nov 2007 - Initial Revision +// 25 Feb 2008 - moved to namespace boost::uuids::detail +// 28 Nov 2009 - disabled deprecated warnings for MSVC + +// seed_rng models a UniformRandomNumberGenerator (see Boost.Random). +// Random number generators are hard to seed well. This is intended to provide +// good seed values for random number generators. +// It creates random numbers from a sha1 hash of data from a variary of sources, +// all of which are standard function calls. It produces random numbers slowly. +// Peter Dimov provided the details of sha1_random_digest_(). +// see http://archives.free.net.ph/message/20070507.175609.4c4f503a.en.html + +#ifndef BOOST_UUID_SEED_RNG_HPP +#define BOOST_UUID_SEED_RNG_HPP + +#include +#include // for memcpy +#include +#include +#include // for time_t, time, clock_t, clock +#include // for rand +#include // for FILE, fopen, fread, fclose +#include +//#include //forward declare boost::random_device + +// can't use boost::generator_iterator since boost::random number seed(Iter&, Iter) +// functions need a last iterator +//#include +# include + +#if defined(_MSC_VER) +#pragma warning(push) // Save warning settings. +#pragma warning(disable : 4996) // Disable deprecated std::fopen +#endif + +#ifdef BOOST_NO_STDC_NAMESPACE +namespace std { + using ::memcpy; + using ::time_t; + using ::time; + using ::clock_t; + using ::clock; + using ::rand; + using ::FILE; + using ::fopen; + using ::fread; + using ::fclose; +} //namespace std +#endif + +// forward declare random number generators +namespace boost { +class random_device; +} //namespace boost + +namespace boost { +namespace uuids { +namespace detail { + +// should this be part of Boost.Random? 
+class seed_rng +{ +public: + typedef unsigned int result_type; + BOOST_STATIC_CONSTANT(bool, has_fixed_range = false); + //BOOST_STATIC_CONSTANT(unsigned int, min_value = 0); + //BOOST_STATIC_CONSTANT(unsigned int, max_value = UINT_MAX); + +public: + // note: rd_ intentionally left uninitialized + seed_rng() + : rd_index_(5) + , random_(std::fopen( "/dev/urandom", "rb" )) + {} + + ~seed_rng() + { + if (random_) { + std::fclose(random_); + } + } + + result_type min BOOST_PREVENT_MACRO_SUBSTITUTION () const + { + return (std::numeric_limits::min)(); + } + result_type max BOOST_PREVENT_MACRO_SUBSTITUTION () const + { + return (std::numeric_limits::max)(); + } + + result_type operator()() + { + if (rd_index_ >= 5) { + //get new digest + sha1_random_digest_(); + + rd_index_ = 0; + } + + return rd_[rd_index_++]; + } + +private: + static unsigned int * sha1_random_digest_state_() + { + // intentionally left uninitialized + static unsigned int state[ 5 ]; + return state; + } + + void sha1_random_digest_() + { + boost::uuids::detail::sha1 sha; + + unsigned int * ps = sha1_random_digest_state_(); + + unsigned int state[ 5 ]; + std::memcpy( state, ps, sizeof( state ) ); // harmless data race + + sha.process_bytes( (unsigned char const*)state, sizeof( state ) ); + sha.process_bytes( (unsigned char const*)&ps, sizeof( ps ) ); + + { + std::time_t tm = std::time( 0 ); + sha.process_bytes( (unsigned char const*)&tm, sizeof( tm ) ); + } + + { + std::clock_t ck = std::clock(); + sha.process_bytes( (unsigned char const*)&ck, sizeof( ck ) ); + } + + { + unsigned int rn[] = { std::rand(), std::rand(), std::rand() }; + sha.process_bytes( (unsigned char const*)rn, sizeof( rn ) ); + } + + { + // intentionally left uninitialized + unsigned char buffer[ 20 ]; + + if(random_) + { + std::fread( buffer, 1, 20, random_ ); + } + + // using an uninitialized buffer[] if fopen fails + // intentional, we rely on its contents being random + sha.process_bytes( buffer, sizeof( buffer ) ); + } + + { + 
// *p is intentionally left uninitialized + unsigned int * p = new unsigned int; + + sha.process_bytes( (unsigned char const*)p, sizeof( *p ) ); + sha.process_bytes( (unsigned char const*)&p, sizeof( p ) ); + + delete p; + } + + sha.process_bytes( (unsigned char const*)rd_, sizeof( rd_ ) ); + + unsigned int digest[ 5 ]; + sha.get_digest( digest ); + + for( int i = 0; i < 5; ++i ) + { + // harmless data race + ps[ i ] ^= digest[ i ]; + rd_[ i ] ^= digest[ i ]; + } + } + +private: + unsigned int rd_[5]; + int rd_index_; + std::FILE * random_; + +private: // make seed_rng noncopyable + seed_rng(seed_rng const&); + seed_rng& operator=(seed_rng const&); +}; + +// almost a copy of boost::generator_iterator +// but default constructor sets m_g to NULL +template +class generator_iterator + : public iterator_facade< + generator_iterator + , typename Generator::result_type + , single_pass_traversal_tag + , typename Generator::result_type const& + > +{ + typedef iterator_facade< + generator_iterator + , typename Generator::result_type + , single_pass_traversal_tag + , typename Generator::result_type const& + > super_t; + + public: + generator_iterator() : m_g(NULL) {} + generator_iterator(Generator* g) : m_g(g), m_value((*m_g)()) {} + + void increment() + { + m_value = (*m_g)(); + } + + const typename Generator::result_type& + dereference() const + { + return m_value; + } + + bool equal(generator_iterator const& y) const + { + return this->m_g == y.m_g && this->m_value == y.m_value; + } + + private: + Generator* m_g; + typename Generator::result_type m_value; +}; + +// seed() seeds a random number generator with good seed values + +template +inline void seed(UniformRandomNumberGenerator& rng) +{ + seed_rng seed_gen; + generator_iterator begin(&seed_gen); + generator_iterator end; + rng.seed(begin, end); +} + +// random_device does not / can not be seeded +template <> +inline void seed(boost::random_device&) {} + +// random_device does not / can not be seeded +template <> 
+inline void seed(seed_rng&) {} + +}}} //namespace boost::uuids::detail + +#if defined(_MSC_VER) +#pragma warning(pop) // Restore warnings to previous state. +#endif + +#endif diff --git a/thirdparty/boost_uuid/boost/uuid/sha1.hpp b/thirdparty/boost_uuid/boost/uuid/sha1.hpp new file mode 100644 index 000000000000..b4a1344cd6c8 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/sha1.hpp @@ -0,0 +1,208 @@ +// boost/uuid/sha1.hpp header file ----------------------------------------------// + +// Copyright 2007 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Revision History +// 29 May 2007 - Initial Revision +// 25 Feb 2008 - moved to namespace boost::uuids::detail + +// This is a byte oriented implementation +// Note: this implementation does not handle message longer than +// 2^32 bytes. + +#ifndef BOOST_UUID_SHA1_H +#define BOOST_UUID_SHA1_H + +#include +#include + +#ifdef BOOST_NO_STDC_NAMESPACE +namespace std { + using ::size_t; +} // namespace std +#endif + +namespace boost { +namespace uuids { +namespace detail { + +BOOST_STATIC_ASSERT(sizeof(unsigned char)*8 == 8); +BOOST_STATIC_ASSERT(sizeof(unsigned int)*8 == 32); + +inline unsigned int left_rotate(unsigned int x, std::size_t n) +{ + return (x<> (32-n)); +} + +class sha1 +{ +public: + typedef unsigned int(&digest_type)[5]; +public: + sha1(); + + void reset(); + + void process_byte(unsigned char byte); + void process_block(void const* bytes_begin, void const* bytes_end); + void process_bytes(void const* buffer, std::size_t byte_count); + + void get_digest(digest_type digest); + +private: + void process_block(); + +private: + unsigned int h_[5]; + + unsigned char block_[64]; + + std::size_t block_byte_index_; + std::size_t byte_count_; +}; + +inline sha1::sha1() +{ + reset(); +} + +inline void sha1::reset() +{ + h_[0] = 0x67452301; + h_[1] = 0xEFCDAB89; + h_[2] = 0x98BADCFE; + 
h_[3] = 0x10325476; + h_[4] = 0xC3D2E1F0; + + block_byte_index_ = 0; + byte_count_ = 0; +} + +inline void sha1::process_byte(unsigned char byte) +{ + block_[block_byte_index_++] = byte; + ++byte_count_; + if (block_byte_index_ == 64) { + block_byte_index_ = 0; + process_block(); + } +} + +inline void sha1::process_block(void const* bytes_begin, void const* bytes_end) +{ + unsigned char const* begin = static_cast(bytes_begin); + unsigned char const* end = static_cast(bytes_end); + for(; begin != end; ++begin) { + process_byte(*begin); + } +} + +inline void sha1::process_bytes(void const* buffer, std::size_t byte_count) +{ + unsigned char const* b = static_cast(buffer); + process_block(b, b+byte_count); +} + +inline void sha1::process_block() +{ + unsigned int w[80]; + for (std::size_t i=0; i<16; ++i) { + w[i] = (block_[i*4 + 0] << 24); + w[i] |= (block_[i*4 + 1] << 16); + w[i] |= (block_[i*4 + 2] << 8); + w[i] |= (block_[i*4 + 3]); + } + for (std::size_t i=16; i<80; ++i) { + w[i] = left_rotate((w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]), 1); + } + + unsigned int a = h_[0]; + unsigned int b = h_[1]; + unsigned int c = h_[2]; + unsigned int d = h_[3]; + unsigned int e = h_[4]; + + for (std::size_t i=0; i<80; ++i) { + unsigned int f; + unsigned int k; + + if (i<20) { + f = (b & c) | (~b & d); + k = 0x5A827999; + } else if (i<40) { + f = b ^ c ^ d; + k = 0x6ED9EBA1; + } else if (i<60) { + f = (b & c) | (b & d) | (c & d); + k = 0x8F1BBCDC; + } else { + f = b ^ c ^ d; + k = 0xCA62C1D6; + } + + unsigned temp = left_rotate(a, 5) + f + e + k + w[i]; + e = d; + d = c; + c = left_rotate(b, 30); + b = a; + a = temp; + } + + h_[0] += a; + h_[1] += b; + h_[2] += c; + h_[3] += d; + h_[4] += e; +} + +inline void sha1::get_digest(digest_type digest) +{ + std::size_t bit_count = byte_count_*8; + + // append the bit '1' to the message + process_byte(0x80); + + // append k bits '0', where k is the minimum number >= 0 + // such that the resulting message length is congruent to 56 (mod 64) + 
// check if there is enough space for padding and bit_count + if (block_byte_index_ > 56) { + // finish this block + while (block_byte_index_ != 0) { + process_byte(0); + } + + // one more block + while (block_byte_index_ < 56) { + process_byte(0); + } + } else { + while (block_byte_index_ < 56) { + process_byte(0); + } + } + + // append length of message (before pre-processing) + // as a 64-bit big-endian integer + process_byte(0); + process_byte(0); + process_byte(0); + process_byte(0); + process_byte( static_cast((bit_count>>24) & 0xFF)); + process_byte( static_cast((bit_count>>16) & 0xFF)); + process_byte( static_cast((bit_count>>8 ) & 0xFF)); + process_byte( static_cast((bit_count) & 0xFF)); + + // get final digest + digest[0] = h_[0]; + digest[1] = h_[1]; + digest[2] = h_[2]; + digest[3] = h_[3]; + digest[4] = h_[4]; +} + +}}} // namespace boost::uuids::detail + +#endif diff --git a/thirdparty/boost_uuid/boost/uuid/string_generator.hpp b/thirdparty/boost_uuid/boost/uuid/string_generator.hpp new file mode 100644 index 000000000000..7d2733bb7ab0 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/string_generator.hpp @@ -0,0 +1,184 @@ +// Boost string_generator.hpp header file ----------------------------------------------// + +// Copyright 2010 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. 
(See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef BOOST_UUID_STRING_GENERATOR_HPP +#define BOOST_UUID_STRING_GENERATOR_HPP + +#include +#include +#include // for strlen, wcslen +#include +#include // for find +#include + +#ifdef BOOST_NO_STDC_NAMESPACE +namespace std { + using ::strlen; + using ::wcslen; +} //namespace std +#endif //BOOST_NO_STDC_NAMESPACE + +namespace boost { +namespace uuids { + +// generate a uuid from a string +// lexical_cast works fine using uuid_io.hpp +// but this generator should accept more forms +// and be more efficient +// would like to accept the following forms: +// 0123456789abcdef0123456789abcdef +// 01234567-89ab-cdef-0123456789abcdef +// {01234567-89ab-cdef-0123456789abcdef} +// {0123456789abcdef0123456789abcdef} +// others? +struct string_generator { + typedef uuid result_type; + + template + uuid operator()(std::basic_string const& s) const { + return operator()(s.begin(), s.end()); + }; + + uuid operator()(char const*const s) const { + return operator()(s, s+std::strlen(s)); + } + + uuid operator()(wchar_t const*const s) const { + return operator()(s, s+std::wcslen(s)); + } + + template + uuid operator()(CharIterator begin, CharIterator end) const + { + typedef typename std::iterator_traits::value_type char_type; + + // check open brace + char_type c = get_next_char(begin, end); + bool has_open_brace = is_open_brace(c); + char_type open_brace_char = c; + if (has_open_brace) { + c = get_next_char(begin, end); + } + + bool has_dashes = false; + + uuid u; + int i=0; + for (uuid::iterator it_byte=u.begin(); it_byte!=u.end(); ++it_byte, ++i) { + if (it_byte != u.begin()) { + c = get_next_char(begin, end); + } + + if (i == 4) { + has_dashes = is_dash(c); + if (has_dashes) { + c = get_next_char(begin, end); + } + } + + if (has_dashes) { + if (i == 6 || i == 8 || i == 10) { + if (is_dash(c)) { + c = get_next_char(begin, end); + } else { + throw_invalid(); + } + } + } + + 
*it_byte = get_value(c); + + c = get_next_char(begin, end); + *it_byte <<= 4; + *it_byte |= get_value(c); + } + + // check close brace + if (has_open_brace) { + c = get_next_char(begin, end); + check_close_brace(c, open_brace_char); + } + + return u; + } + +private: + template + typename std::iterator_traits::value_type + get_next_char(CharIterator& begin, CharIterator end) const { + if (begin == end) { + throw_invalid(); + } + return *begin++; + } + + unsigned char get_value(char c) const { + static char const*const digits_begin = "0123456789abcdefABCDEF"; + static char const*const digits_end = digits_begin + 22; + + static unsigned char const values[] = + { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,10,11,12,13,14,15 + , static_cast(-1) }; + + char const* d = std::find(digits_begin, digits_end, c); + return values[d - digits_begin]; + } + + unsigned char get_value(wchar_t c) const { + static wchar_t const*const digits_begin = L"0123456789abcdefABCDEF"; + static wchar_t const*const digits_end = digits_begin + 22; + + static unsigned char const values[] = + { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,10,11,12,13,14,15 + , static_cast(-1) }; + + wchar_t const* d = std::find(digits_begin, digits_end, c); + return values[d - digits_begin]; + } + + bool is_dash(char c) const { + return c == '-'; + } + + bool is_dash(wchar_t c) const { + return c == L'-'; + } + + // return closing brace + bool is_open_brace(char c) const { + return (c == '{'); + } + + bool is_open_brace(wchar_t c) const { + return (c == L'{'); + } + + void check_close_brace(char c, char open_brace) const { + if (open_brace == '{' && c == '}') { + //great + } else { + throw_invalid(); + } + } + + void check_close_brace(wchar_t c, wchar_t open_brace) const { + if (open_brace == L'{' && c == L'}') { + // great + } else { + throw_invalid(); + } + } + + void throw_invalid() const { + throw std::runtime_error("invalid uuid string"); + } +}; + +}} // namespace boost::uuids + +#endif //BOOST_UUID_STRING_GENERATOR_HPP + 
diff --git a/thirdparty/boost_uuid/boost/uuid/uuid.hpp b/thirdparty/boost_uuid/boost/uuid/uuid.hpp new file mode 100644 index 000000000000..2678d856c0e8 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/uuid.hpp @@ -0,0 +1,221 @@ +// Boost uuid.hpp header file ----------------------------------------------// + +// Copyright 2006 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Revision History +// 06 Feb 2006 - Initial Revision +// 09 Nov 2006 - fixed variant and version bits for v4 guids +// 13 Nov 2006 - added serialization +// 17 Nov 2006 - added name-based guid creation +// 20 Nov 2006 - add fixes for gcc (from Tim Blechmann) +// 07 Mar 2007 - converted to header only +// 10 May 2007 - removed need for Boost.Thread +// - added better seed - thanks Peter Dimov +// - removed null() +// - replaced byte_count() and output_bytes() with size() and begin() and end() +// 11 May 2007 - fixed guid(ByteInputIterator first, ByteInputIterator last) +// - optimized operator>> +// 14 May 2007 - converted from guid to uuid +// 29 May 2007 - uses new implementation of sha1 +// 01 Jun 2007 - removed using namespace directives +// 09 Nov 2007 - moved implementation to uuid.ipp file +// 12 Nov 2007 - moved serialize code to uuid_serialize.hpp file +// 25 Feb 2008 - moved to namespace boost::uuids +// 19 Mar 2009 - changed to a POD, reorganized files +// 28 Nov 2009 - disabled deprecated warnings for MSVC +// 30 Nov 2009 - used BOOST_STATIC_CONSTANT +// 02 Dec 2009 - removed BOOST_STATIC_CONSTANT - not all compilers like it + +#ifndef BOOST_UUID_HPP +#define BOOST_UUID_HPP + +#include +#include +#include +#include +#include // for static assert +#ifndef BOOST_UUID_NO_TYPE_TRAITS +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(push) // Save warning settings. 
+#pragma warning(disable : 4996) // Disable deprecated std::swap_ranges, std::equal +#endif + +#ifdef BOOST_NO_STDC_NAMESPACE +namespace std { + using ::size_t; + using ::ptrdiff_t; +} //namespace std +#endif //BOOST_NO_STDC_NAMESPACE + +namespace boost { +namespace uuids { + +struct uuid +{ +public: + typedef uint8_t value_type; + typedef uint8_t& reference; + typedef uint8_t const& const_reference; + typedef uint8_t* iterator; + typedef uint8_t const* const_iterator; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // This does not work on some compilers + // They seem to want the variable definec in + // a cpp file + //BOOST_STATIC_CONSTANT(size_type, static_size = 16); + static size_type static_size() { return 16; } + +public: + iterator begin() { return data; } /* throw() */ + const_iterator begin() const { return data; } /* throw() */ + iterator end() { return data+size(); } /* throw() */ + const_iterator end() const { return data+size(); } /* throw() */ + + size_type size() const { return static_size(); } /* throw() */ + + bool is_nil() const /* throw() */ + { + for(size_t i=0; i + uint8_t data[16]; +}; + +inline bool operator==(uuid const& lhs, uuid const& rhs) /* throw() */ +{ + return std::equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +inline bool operator!=(uuid const& lhs, uuid const& rhs) /* throw() */ +{ + return !(lhs == rhs); +} + +inline bool operator<(uuid const& lhs, uuid const& rhs) /* throw() */ +{ + return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + +inline bool operator>(uuid const& lhs, uuid const& rhs) /* throw() */ +{ + return rhs < lhs; +} +inline bool operator<=(uuid const& lhs, uuid const& rhs) /* throw() */ +{ + return !(rhs < lhs); +} + +inline bool operator>=(uuid const& lhs, uuid const& rhs) /* throw() */ +{ + return !(lhs < rhs); +} + +inline void swap(uuid& lhs, uuid& rhs) /* throw() */ +{ + lhs.swap(rhs); +} + +// This is equivalent to 
boost::hash_range(u.begin(), u.end()); +inline std::size_t hash_value(uuid const& u) /* throw() */ +{ + std::size_t seed = 0; + for(uuid::const_iterator i=u.begin(); i != u.end(); ++i) + { + seed ^= static_cast(*i) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + + return seed; +} + +}} //namespace boost::uuids + +#ifndef BOOST_UUID_NO_TYPE_TRAITS +// type traits specializations +namespace boost { + +template <> +struct is_pod : true_type {}; + +} // namespace boost +#endif + +#if defined(_MSC_VER) +#pragma warning(pop) // Restore warnings to previous state. +#endif + +#endif // BOOST_UUID_HPP diff --git a/thirdparty/boost_uuid/boost/uuid/uuid_generators.hpp b/thirdparty/boost_uuid/boost/uuid/uuid_generators.hpp new file mode 100644 index 000000000000..29d39ccf6526 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/uuid_generators.hpp @@ -0,0 +1,19 @@ +// Boost uuid_generators.hpp header file ----------------------------------------------// + +// Copyright 2006 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Revision History +// 06 Feb 2006 - Initial Revision + +#ifndef BOOST_UUID_GENERATORS_HPP +#define BOOST_UUID_GENERATORS_HPP + +#include +#include +#include +#include + +#endif //BOOST_UUID_GENERATORS_HPP diff --git a/thirdparty/boost_uuid/boost/uuid/uuid_io.hpp b/thirdparty/boost_uuid/boost/uuid/uuid_io.hpp new file mode 100644 index 000000000000..592a5094dff8 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/uuid_io.hpp @@ -0,0 +1,198 @@ +// Boost uuid_io.hpp header file ----------------------------------------------// + +// Copyright 2009 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. 
(See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Revision History +// 20 Mar 2009 - Initial Revision +// 28 Nov 2009 - disabled deprecated warnings for MSVC + +#ifndef BOOST_UUID_IO_HPP +#define BOOST_UUID_IO_HPP + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(push) // Save warning settings. +#pragma warning(disable : 4996) // Disable deprecated std::ctype::widen, std::copy +#endif + +namespace boost { +namespace uuids { + +template + std::basic_ostream& operator<<(std::basic_ostream &os, uuid const& u) +{ + io::ios_flags_saver flags_saver(os); + io::basic_ios_fill_saver fill_saver(os); + + const typename std::basic_ostream::sentry ok(os); + if (ok) { + const std::streamsize width = os.width(0); + const std::streamsize uuid_width = 36; + const std::ios_base::fmtflags flags = os.flags(); + const typename std::basic_ios::char_type fill = os.fill(); + if (flags & (std::ios_base::right | std::ios_base::internal)) { + for (std::streamsize i=uuid_width; i(*i_data); + if (i == 3 || i == 5 || i == 7 || i == 9) { + os << os.widen('-'); + } + } + + if (flags & std::ios_base::left) { + for (std::streamsize i=uuid_width; i + std::basic_istream& operator>>(std::basic_istream &is, uuid &u) +{ + const typename std::basic_istream::sentry ok(is); + if (ok) { + unsigned char data[16]; + + typedef std::ctype ctype_t; + ctype_t const& ctype = std::use_facet(is.getloc()); + + ch xdigits[16]; + { + char szdigits[] = "0123456789ABCDEF"; + ctype.widen(szdigits, szdigits+16, xdigits); + } + ch*const xdigits_end = xdigits+16; + + ch c; + for (std::size_t i=0; i> c; + c = ctype.toupper(c); + + ch* f = std::find(xdigits, xdigits_end, c); + if (f == xdigits_end) { + is.setstate(std::ios_base::failbit); + break; + } + + unsigned char byte = static_cast(std::distance(&xdigits[0], f)); + + is >> c; + c = ctype.toupper(c); + f = std::find(xdigits, xdigits_end, c); + if (f == 
xdigits_end) { + is.setstate(std::ios_base::failbit); + break; + } + + byte <<= 4; + byte |= static_cast(std::distance(&xdigits[0], f)); + + data[i] = byte; + + if (is) { + if (i == 3 || i == 5 || i == 7 || i == 9) { + is >> c; + if (c != is.widen('-')) is.setstate(std::ios_base::failbit); + } + } + } + + if (is) { + std::copy(data, data+16, u.begin()); + } + } + return is; +} + +namespace detail { +inline char to_char(size_t i) { + if (i <= 9) { + return static_cast('0' + i); + } else { + return static_cast('a' + (i-10)); + } +} + +inline wchar_t to_wchar(size_t i) { + if (i <= 9) { + return static_cast(L'0' + i); + } else { + return static_cast(L'a' + (i-10)); + } +} + +} // namespace detail + +inline std::string to_string(uuid const& u) +{ + std::string result; + result.reserve(36); + + std::size_t i=0; + for (uuid::const_iterator it_data = u.begin(); it_data!=u.end(); ++it_data, ++i) { + const size_t hi = ((*it_data) >> 4) & 0x0F; + result += detail::to_char(hi); + + const size_t lo = (*it_data) & 0x0F; + result += detail::to_char(lo); + + if (i == 3 || i == 5 || i == 7 || i == 9) { + result += '-'; + } + } + return result; +} + +#ifndef BOOST_NO_STD_WSTRING +inline std::wstring to_wstring(uuid const& u) +{ + std::wstring result; + result.reserve(36); + + std::size_t i=0; + for (uuid::const_iterator it_data = u.begin(); it_data!=u.end(); ++it_data, ++i) { + const size_t hi = ((*it_data) >> 4) & 0x0F; + result += detail::to_wchar(hi); + + const size_t lo = (*it_data) & 0x0F; + result += detail::to_wchar(lo); + + if (i == 3 || i == 5 || i == 7 || i == 9) { + result += L'-'; + } + } + return result; +} + +#endif + +}} //namespace boost::uuids + +#if defined(_MSC_VER) +#pragma warning(pop) // Restore warnings to previous state. 
+#endif + +#endif // BOOST_UUID_IO_HPP diff --git a/thirdparty/boost_uuid/boost/uuid/uuid_serialize.hpp b/thirdparty/boost_uuid/boost/uuid/uuid_serialize.hpp new file mode 100644 index 000000000000..3d8a608b18f5 --- /dev/null +++ b/thirdparty/boost_uuid/boost/uuid/uuid_serialize.hpp @@ -0,0 +1,20 @@ +// Boost uuid_serialize.hpp header file ----------------------------------------------// + +// Copyright 2007 Andy Tompkins. +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Revision History +// 12 Nov 2007 - Initial Revision +// 25 Feb 2008 - moved to namespace boost::uuids::detail + +#ifndef BOOST_UUID_SERIALIZE_HPP +#define BOOST_UUID_SERIALIZE_HPP + +#include +#include + +BOOST_CLASS_IMPLEMENTATION(boost::uuids::uuid, boost::serialization::primitive_type) + +#endif // BOOST_UUID_SERIALIZE_HPP diff --git a/thirdparty/build-definitions.sh b/thirdparty/build-definitions.sh new file mode 100644 index 000000000000..65b78259b7d8 --- /dev/null +++ b/thirdparty/build-definitions.sh @@ -0,0 +1,320 @@ +#!/bin/sh +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# build-definitions.sh provides functions to build thirdparty dependencies. +# These functions do not take positional arguments, but individual builds may +# be influenced by setting environment variables: +# +# * PREFIX - the install destination directory. +# * EXTRA_CFLAGS - additional flags to pass to the C compiler. +# * EXTRA_CXXFLAGS - additional flags to pass to the C++ compiler. +# * EXTRA_LDFLAGS - additional flags to pass to the linker. +# * EXTRA_LIBS - additional libraries to link. +# +# build-definitions.sh is meant to be sourced from build-thirdparty.sh, and +# relies on environment variables defined there and in vars.sh. + +# Save the current build environment. +save_env() { + _PREFIX=${PREFIX} + _EXTRA_CFLAGS=${EXTRA_CFLAGS} + _EXTRA_CXXFLAGS=${EXTRA_CXXFLAGS} + _EXTRA_LDFLAGS=${EXTRA_LDFLAGS} + _EXTRA_LIBS=${EXTRA_LIBS} +} + +# Restore the most recently saved build environment. +restore_env() { + PREFIX=${_PREFIX} + EXTRA_CFLAGS=${_EXTRA_CFLAGS} + EXTRA_CXXFLAGS=${_EXTRA_CXXFLAGS} + EXTRA_LDFLAGS=${_EXTRA_LDFLAGS} + EXTRA_LIBS=${_EXTRA_LIBS} +} + +build_cmake() { + cd $CMAKE_DIR + ./bootstrap --prefix=$PREFIX --parallel=$PARALLEL + make -j$PARALLEL + make install +} + +build_llvm() { + + # Build Python if necessary. + if [[ $(python2.7 -V 2>&1) =~ "Python 2.7." ]]; then + PYTHON_EXECUTABLE=$(which python2.7) + elif [[ $(python -V 2>&1) =~ "Python 2.7." ]]; then + PYTHON_EXECUTABLE=$(which python) + else + cd $PYTHON_DIR + ./configure --prefix=$PREFIX + make -j$PARALLEL + PYTHON_EXECUTABLE=$PYTHON_DIR/python + fi + + mkdir -p $LLVM_BUILD_DIR + cd $LLVM_BUILD_DIR + + # Rebuild the CMake cache every time. 
+ rm -Rf CMakeCache.txt CMakeFiles/ + + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$PREFIX \ + -DLLVM_TARGETS_TO_BUILD=X86 \ + -DLLVM_ENABLE_RTTI=ON \ + -DCMAKE_CXX_FLAGS="$EXTRA_CXXFLAGS" \ + -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \ + $LLVM_DIR + + make -j$PARALLEL install + + # Create a link from Clang to thirdparty/clang-toolchain. This path is used + # for compiling Kudu with sanitizers. The link can't point to the Clang + # installed in the prefix directory, since this confuses CMake into believing + # the thirdparty prefix directory is the system-wide prefix, and it omits the + # thirdparty prefix directory from the rpath of built binaries. + ln -sfn $LLVM_BUILD_DIR $TP_DIR/clang-toolchain +} + +build_libstdcxx() { + # Configure libstdcxx to use posix threads by default. Normally this symlink + # would be created automatically while building libgcc as part of the overall + # GCC build, but since we are only building libstdcxx we must configure it + # manually. + ln -sf $GCC_DIR/libgcc/gthr-posix.h $GCC_DIR/libgcc/gthr-default.h + + # Remove the GCC build directory to remove cached build configuration. + rm -rf $GCC_BUILD_DIR + mkdir -p $GCC_BUILD_DIR + cd $GCC_BUILD_DIR + CFLAGS=$EXTRA_CFLAGS \ + CXXFLAGS=$EXTRA_CXXFLAGS \ + $GCC_DIR/libstdc++-v3/configure \ + --enable-multilib=no \ + --prefix="$PREFIX" + make -j$PARALLEL install +} + +build_gflags() { + cd $GFLAGS_DIR + rm -rf CMakeCache.txt CMakeFiles/ + CXXFLAGS="$EXTRA_CFLAGS $EXTRA_CXXFLAGS $EXTRA_LDFLAGS $EXTRA_LIBS" \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_POSITION_INDEPENDENT_CODE=On \ + -DCMAKE_INSTALL_PREFIX=$PREFIX \ + -DBUILD_SHARED_LIBS=On \ + -DBUILD_STATIC_LIBS=On + make -j$PARALLEL install +} + +build_libunwind() { + cd $LIBUNWIND_DIR + # Disable minidebuginfo, which depends on liblzma, until/unless we decide to + # add liblzma to thirdparty. 
+ ./configure --disable-minidebuginfo --with-pic --prefix=$PREFIX + make -j$PARALLEL install +} + +build_glog() { + cd $GLOG_DIR + CXXFLAGS="$EXTRA_CXXFLAGS" \ + LDFLAGS="$EXTRA_LDFLAGS" \ + LIBS="$EXTRA_LIBS" \ + ./configure --with-pic --prefix=$PREFIX --with-gflags=$PREFIX + make -j$PARALLEL install +} + +build_gperftools() { + cd $GPERFTOOLS_DIR + CFLAGS="$EXTRA_CFLAGS" \ + CXXFLAGS="$EXTRA_CXXFLAGS" \ + LDFLAGS="$EXTRA_LDFLAGS" \ + LIBS="$EXTRA_LIBS" \ + ./configure --enable-frame-pointers --enable-heap-checker --with-pic --prefix=$PREFIX + make -j$PARALLEL install +} + +build_gmock() { + cd $GMOCK_DIR + for SHARED in OFF ON; do + rm -rf CMakeCache.txt CMakeFiles/ + CXXFLAGS="$EXTRA_CXXFLAGS $EXTRA_LDFLAGS $EXTRA_LIBS" \ + cmake \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_POSITION_INDEPENDENT_CODE=On \ + -DBUILD_SHARED_LIBS=$SHARED . + make -j$PARALLEL + done + echo Installing gmock... + cp -a libgmock.$DYLIB_SUFFIX libgmock.a $PREFIX/lib/ + rsync -av include/ $PREFIX/include/ + rsync -av gtest/include/ $PREFIX/include/ +} + +build_protobuf() { + cd $PROTOBUF_DIR + # We build protobuf in both instrumented and non-instrumented modes. + # If we don't clean in between, we may end up mixing modes. 
+ test -f Makefile && make distclean + CFLAGS="$EXTRA_CFLAGS" \ + CXXFLAGS="$EXTRA_CXXFLAGS" \ + LDFLAGS="$EXTRA_LDFLAGS" \ + LIBS="$EXTRA_LIBS" \ + ./configure \ + --with-pic \ + --enable-shared \ + --enable-static \ + --prefix=$PREFIX + make -j$PARALLEL install +} + +build_snappy() { + cd $SNAPPY_DIR + CFLAGS="$EXTRA_CFLAGS" \ + CXXFLAGS="$EXTRA_CXXFLAGS" \ + LDFLAGS="$EXTRA_LDFLAGS" \ + LIBS="$EXTRA_LIBS" \ + ./configure --with-pic --prefix=$PREFIX + make -j$PARALLEL install +} + +build_zlib() { + cd $ZLIB_DIR + CFLAGS="$EXTRA_CFLAGS -fPIC" ./configure --prefix=$PREFIX + make -j$PARALLEL install +} + +build_lz4() { + cd $LZ4_DIR + CFLAGS="$EXTRA_CFLAGS" cmake -DCMAKE_BUILD_TYPE=release \ + -DBUILD_TOOLS=0 -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX cmake_unofficial/ + make -j$PARALLEL install +} + +build_bitshuffle() { + cd $BITSHUFFLE_DIR + # bitshuffle depends on lz4, therefore set the flag I$PREFIX/include + ${CC:-gcc} $EXTRA_CFLAGS -std=c99 -I$PREFIX/include -O3 -DNDEBUG -fPIC -c bitshuffle.c + ar rs bitshuffle.a bitshuffle.o + cp bitshuffle.a $PREFIX/lib/ + cp bitshuffle.h $PREFIX/include/ +} + +build_libev() { + cd $LIBEV_DIR + CFLAGS="$EXTRA_CFLAGS" \ + CXXFLAGS="$EXTRA_CXXFLAGS" \ + ./configure --with-pic --prefix=$PREFIX + make -j$PARALLEL install +} + +build_rapidjson() { + # just installing it into our prefix + cd $RAPIDJSON_DIR + rsync -av --delete $RAPIDJSON_DIR/include/rapidjson/ $PREFIX/include/rapidjson/ +} + +build_squeasel() { + # Mongoose's Makefile builds a standalone web server, whereas we just want + # a static lib + cd $SQUEASEL_DIR + ${CC:-gcc} $EXTRA_CFLAGS -std=c99 -O3 -DNDEBUG -fPIC -c squeasel.c + ar rs libsqueasel.a squeasel.o + cp libsqueasel.a $PREFIX/lib/ + cp squeasel.h $PREFIX/include/ +} + +build_curl() { + # Configure for a very minimal install - basically only HTTP, since we only + # use this for testing our own HTTP endpoints at this point in time. 
+ cd $CURL_DIR + ./configure --prefix=$PREFIX \ + --disable-ftp \ + --disable-file \ + --disable-ldap \ + --disable-ldaps \ + --disable-rtsp \ + --disable-dict \ + --disable-telnet \ + --disable-tftp \ + --disable-pop3 \ + --disable-imap \ + --disable-smtp \ + --disable-gopher \ + --disable-manual \ + --without-rtmp \ + --disable-ipv6 + make -j$PARALLEL + make install +} + +build_crcutil() { + cd $CRCUTIL_DIR + ./autogen.sh + CFLAGS="$EXTRA_CFLAGS" \ + CXXFLAGS="$EXTRA_CXXFLAGS" \ + LDFLAGS="$EXTRA_LDFLAGS" \ + LIBS="$EXTRA_LIBS" \ + ./configure --prefix=$PREFIX + make -j$PARALLEL install +} + +build_boost_uuid() { + # Copy boost_uuid into the include directory. + # This is a header-only library which isn't present in some older versions of + # boost (eg the one on el6). So, we check it in and put it in our own include + # directory. + rsync -a $TP_DIR/boost_uuid/boost/ $PREFIX/include/boost/ +} + +build_cpplint() { + # Copy cpplint tool into bin directory + cp $GSG_DIR/cpplint/cpplint.py $PREFIX/bin/cpplint.py +} + +build_gcovr() { + # Copy gcovr tool into bin directory + cp -a $GCOVR_DIR/scripts/gcovr $PREFIX/bin/gcovr +} + +build_trace_viewer() { + echo Installing trace-viewer into the www directory + cp -a $TRACE_VIEWER_DIR/* $TP_DIR/../www/ +} + +build_nvml() { + cd $NVML_DIR/src/ + + # The embedded jemalloc build doesn't pick up the EXTRA_CFLAGS environment + # variable, so we have to stick our flags into this config file. + if ! grep -q -e "$EXTRA_CFLAGS" jemalloc/jemalloc.cfg ; then + perl -p -i -e "s,(EXTRA_CFLAGS=\"),\$1$EXTRA_CFLAGS ," jemalloc/jemalloc.cfg + fi + + EXTRA_CFLAGS="$EXTRA_CFLAGS" make -j$PARALLEL libvmem DEBUG=0 + # NVML doesn't allow configuring PREFIX -- it always installs into + # DESTDIR/usr/lib. Additionally, the 'install' target builds all of + # the NVML libraries, even though we only need libvmem. + # So, we manually install the built artifacts. 
+ cp -a $NVML_DIR/src/include/libvmem.h $PREFIX/include + cp -a $NVML_DIR/src/nondebug/libvmem.{so*,a} $PREFIX/lib +} diff --git a/thirdparty/build-if-necessary.sh b/thirdparty/build-if-necessary.sh new file mode 100755 index 000000000000..8239b851bcc5 --- /dev/null +++ b/thirdparty/build-if-necessary.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Script which downloads and builds the thirdparty dependencies +# only if necessary. +# +# In a git repo, this uses git checksum information on the thirdparty +# tree. Otherwise, it uses a 'stamp file' approach. + +set -e +set -o pipefail + +TP_DIR=$(dirname $BASH_SOURCE) +cd $TP_DIR + +NEEDS_BUILD= + +IS_IN_GIT=$(test -d ../.git && echo true || :) + +if [ -n "$IS_IN_GIT" ]; then + # Determine whether this subtree in the git repo has changed since thirdparty + # was last built + + CUR_THIRDPARTY_HASH=$(cd .. && git ls-tree -d HEAD thirdparty | awk '{print $3}') + LAST_BUILD_HASH=$(cat .build-hash || :) + if [ "$CUR_THIRDPARTY_HASH" != "$LAST_BUILD_HASH" ]; then + echo "Rebuilding thirdparty: the repository has changed since thirdparty was last built." 
+    echo "Old git hash: $LAST_BUILD_HASH"
+    echo "New build hash: $CUR_THIRDPARTY_HASH"
+    NEEDS_BUILD=1
+  else
+    # Determine whether the developer has any local changes
+    if ! ( git diff --quiet . && git diff --cached --quiet . ) ; then
+      echo "Rebuilding thirdparty: There are local changes in the repository."
+      NEEDS_BUILD=1
+    else
+      echo Not rebuilding thirdparty. No changes since last build.
+    fi
+  fi
+else
+  # If we aren't running inside a git repository (e.g. we are
+  # part of a source distribution tarball) then we can't use git to find
+  # out whether the build is clean. Instead, use a .build-stamp file, and
+  # see if any files inside this directory have been modified since then.
+  if [ -f .build-stamp ]; then
+    CHANGED_FILE_COUNT=$(find . -cnewer .build-stamp | wc -l)
+    echo "$CHANGED_FILE_COUNT file(s) been modified since thirdparty was last built."
+    if [ $CHANGED_FILE_COUNT -gt 0 ]; then
+      echo "Rebuilding."
+      NEEDS_BUILD=1
+    fi
+  else
+    echo "It appears that thirdparty was never built. Building."
+    NEEDS_BUILD=1
+  fi
+fi
+
+if [ -z "$NEEDS_BUILD" ]; then
+  exit 0
+fi
+
+rm -f .build-hash .build-stamp
+./download-thirdparty.sh
+./build-thirdparty.sh
+
+if [ -n "$IS_IN_GIT" ]; then
+  echo $CUR_THIRDPARTY_HASH > .build-hash
+else
+  touch .build-stamp
+fi
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
new file mode 100755
index 000000000000..4d6b8a68bf6e
--- /dev/null
+++ b/thirdparty/build-thirdparty.sh
@@ -0,0 +1,342 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# build-thirdparty.sh builds and installs thirdparty dependencies into prefix +# directories within the thirdparty directory. Three prefix directories are +# used, corresponding to build type: +# +# * /thirdparty/installed - prefix directory for libraries and binary tools +# common to all build types, e.g. LLVM, Clang, and +# CMake. +# * /thirdparty/installed-deps - prefix directory for libraries built with +# normal options (no sanitizer instrumentation). +# * /thirdparty/installed-deps-tsan - prefix directory for libraries built +# with thread sanitizer instrumentation. +# +# Environment variables which can be set when calling build-thirdparty.sh: +# * EXTRA_CFLAGS - additional flags passed to the C compiler. +# * EXTRA_CXXFLAGS - additional flags passed to the C++ compiler. +# * EXTRA_LDFLAGS - additional flags passed to the linker. +# * EXTRA_LIBS - additional libraries to link. + +set -ex + +TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) + +source $TP_DIR/vars.sh +source $TP_DIR/build-definitions.sh + +for PREFIX_DIR in $PREFIX_COMMON $PREFIX_DEPS $PREFIX_DEPS_TSAN $PREFIX_LIBSTDCXX $PREFIX_LIBSTDCXX_TSAN; do + mkdir -p $PREFIX_DIR/lib + mkdir -p $PREFIX_DIR/include + + # On some systems, autotools installs libraries to lib64 rather than lib. Fix + # this by setting up lib64 as a symlink to lib. We have to do this step first + # to handle cases where one third-party library depends on another. 
+  ln -sf "$PREFIX_DIR/lib" "$PREFIX_DIR/lib64"
+done
+
+# We use -O2 instead of -O3 for thirdparty since benchmarks indicate
+# that the benefits of a smaller code size outweigh the benefits of
+# more inlining.
+#
+# We also enable -fno-omit-frame-pointer so that profiling tools which
+# use frame-pointer based stack unwinding can function correctly.
+EXTRA_CFLAGS="$CFLAGS $EXTRA_CFLAGS -fno-omit-frame-pointer"
+EXTRA_CXXFLAGS="$CXXFLAGS $EXTRA_CXXFLAGS -I${PREFIX_COMMON}/include -fno-omit-frame-pointer -O2"
+EXTRA_LDFLAGS="$LDFLAGS $EXTRA_LDFLAGS -L${PREFIX_COMMON}/lib"
+EXTRA_LIBS="$LIBS $EXTRA_LIBS"
+
+if [[ "$OSTYPE" =~ ^linux ]]; then
+  OS_LINUX=1
+  PARALLEL=$(grep -c processor /proc/cpuinfo)
+
+  # Explicitly disable the new gcc5 ABI. Until clang supports abi tags [1],
+  # Kudu's generated code (which always uses clang) must be built against the
+  # old ABI. There's no recourse for using both ABIs in the same process; gcc's
+  # advice [2] is to build everything against the old ABI.
+  #
+  # 1. https://llvm.org/bugs/show_bug.cgi?id=23529
+  # 2. https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
+  EXTRA_CXXFLAGS="$EXTRA_CXXFLAGS -D_GLIBCXX_USE_CXX11_ABI=0"
+  DYLIB_SUFFIX="so"
+
+  # Enable TSAN builds on Linux.
+  F_TSAN=1
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+  OS_OSX=1
+  DYLIB_SUFFIX="dylib"
+  PARALLEL=$(sysctl -n hw.ncpu)
+
+  # Kudu builds with C++11, which on OS X requires using libc++ as the standard
+  # library implementation. Some of the dependencies do not compile against
+  # libc++ by default, so we specify it explicitly.
+  EXTRA_CXXFLAGS="$EXTRA_CXXFLAGS -stdlib=libc++"
+  EXTRA_LDFLAGS="$EXTRA_LDFLAGS -stdlib=libc++"
+  EXTRA_LIBS="$EXTRA_LIBS -lc++ -lc++abi"
+else
+  echo Unsupported platform $OSTYPE
+  exit 1
+fi
+
+################################################################################
+
+if [ "$#" = "0" ]; then
+  F_ALL=1
+else
+  # Allow passing specific libs to build on the command line
+  for arg in "$@"; do
+    case $arg in
+      "cmake")        F_CMAKE=1 ;;
+      "gflags")       F_GFLAGS=1 ;;
+      "glog")         F_GLOG=1 ;;
+      "gmock")        F_GMOCK=1 ;;
+      "gperftools")   F_GPERFTOOLS=1 ;;
+      "libev")        F_LIBEV=1 ;;
+      "lz4")          F_LZ4=1 ;;
+      "bitshuffle")   F_BITSHUFFLE=1;;
+      "protobuf")     F_PROTOBUF=1 ;;
+      "rapidjson")    F_RAPIDJSON=1 ;;
+      "snappy")       F_SNAPPY=1 ;;
+      "zlib")         F_ZLIB=1 ;;
+      "squeasel")     F_SQUEASEL=1 ;;
+      "gsg")          F_GSG=1 ;;
+      "gcovr")        F_GCOVR=1 ;;
+      "curl")         F_CURL=1 ;;
+      "crcutil")      F_CRCUTIL=1 ;;
+      "libunwind")    F_LIBUNWIND=1 ;;
+      "llvm")         F_LLVM=1 ;;
+      "libstdcxx")    F_LIBSTDCXX=1 ;;
+      "trace-viewer") F_TRACE_VIEWER=1 ;;
+      "nvml")         F_NVML=1 ;;
+      *)              echo "Unknown module: $arg"; exit 1 ;;
+    esac
+  done
+fi
+
+################################################################################
+
+### Build common tools and libraries
+
+PREFIX=$PREFIX_COMMON
+
+# Add tools to path
+export PATH=$PREFIX/bin:$PATH
+
+if [ -n "$F_ALL" -o -n "$F_CMAKE" ]; then
+  build_cmake
+fi
+
+if [ -n "$F_ALL" -o -n "$F_LLVM" ]; then
+  build_llvm
+fi
+
+# Enable debug symbols so that stacktraces and linenumbers are available at
+# runtime. CMake and LLVM are compiled without debug symbols since CMake is a
+# compile-time only tool, and the LLVM debug symbols take up more than 20GiB of
+# disk space.
+EXTRA_CFLAGS="-g $EXTRA_CFLAGS" +EXTRA_CXXFLAGS="-g $EXTRA_CXXFLAGS" + +if [ -n "$OS_LINUX" ] && [ -n "$F_ALL" -o -n "$F_LIBUNWIND" ]; then + build_libunwind +fi + +if [ -n "$F_ALL" -o -n "$F_ZLIB" ]; then + build_zlib +fi + +if [ -n "$F_ALL" -o -n "$F_LZ4" ]; then + build_lz4 +fi + +if [ -n "$F_ALL" -o -n "$F_BITSHUFFLE" ]; then + build_bitshuffle +fi + +if [ -n "$F_ALL" -o -n "$F_LIBEV" ]; then + build_libev +fi + +if [ -n "$F_ALL" -o -n "$F_RAPIDJSON" ]; then + build_rapidjson +fi + +if [ -n "$F_ALL" -o -n "$F_SQUEASEL" ]; then + build_squeasel +fi + +if [ -n "$F_ALL" -o -n "$F_CURL" ]; then + build_curl +fi + +build_boost_uuid + +if [ -n "$F_ALL" -o -n "$F_GSG" ]; then + build_cpplint +fi + +if [ -n "$F_ALL" -o -n "$F_GCOVR" ]; then + build_gcovr +fi + +if [ -n "$F_ALL" -o -n "$F_TRACE_VIEWER" ]; then + build_trace_viewer +fi + +if [ -n "$OS_LINUX" ] && [ -n "$F_ALL" -o -n "$F_NVML" ]; then + build_nvml +fi + +### Build C++ dependencies + +PREFIX=$PREFIX_DEPS + +if [ -n "$F_ALL" -o -n "$F_GFLAGS" ]; then + build_gflags +fi + +if [ -n "$F_ALL" -o -n "$F_GLOG" ]; then + build_glog +fi + +if [ -n "$F_ALL" -o -n "$F_GPERFTOOLS" ]; then + build_gperftools +fi + +if [ -n "$F_ALL" -o -n "$F_GMOCK" ]; then + build_gmock +fi + +if [ -n "$F_ALL" -o -n "$F_PROTOBUF" ]; then + build_protobuf +fi + +if [ -n "$F_ALL" -o -n "$F_SNAPPY" ]; then + build_snappy +fi + +if [ -n "$F_ALL" -o -n "$F_CRCUTIL" ]; then + build_crcutil +fi + +## Build C++ dependencies with TSAN instrumentation + +if [ -n "$F_TSAN" ]; then + + # Achieving good results with TSAN requires that the C++ standard + # library be instrumented with TSAN. Additionally, dependencies which + # internally use threads or synchronization should be instrumented. + # libstdc++ requires that all shared objects linked into an executable should + # be built against the same version of libstdc++. 
As a result, we must build + # libstdc++ twice: once instrumented, and once uninstrumented, in order to + # guarantee that the versions match. + # + # Currently protobuf is the only thirdparty dependency that we build with + # instrumentation. + # + # Special flags for TSAN builds: + # * -fsanitize=thread - enable the thread sanitizer during compilation. + # * -L ... - add the instrumented libstdc++ to the library search paths. + # * -isystem ... - Add libstdc++ headers to the system header search paths. + # * -nostdinc++ - Do not automatically link the system C++ standard library. + # * -Wl,-rpath,... - Add instrumented libstdc++ location to the rpath so that + # it can be found at runtime. + + if which ccache >/dev/null ; then + CLANG="$TP_DIR/../build-support/ccache-clang/clang" + CLANGXX="$TP_DIR/../build-support/ccache-clang/clang++" + else + CLANG="$TP_DIR/clang-toolchain/bin/clang" + CLANGXX="$TP_DIR/clang-toolchain/bin/clang++" + fi + export CC=$CLANG + export CXX=$CLANGXX + + PREFIX=$PREFIX_DEPS_TSAN + + if [ -n "$F_ALL" -o -n "$F_LIBSTDCXX" ]; then + save_env + + # Build uninstrumented libstdcxx + PREFIX=$PREFIX_LIBSTDCXX + EXTRA_CFLAGS= + EXTRA_CXXFLAGS= + build_libstdcxx + + # Build instrumented libstdxx + PREFIX=$PREFIX_LIBSTDCXX_TSAN + EXTRA_CFLAGS="-fsanitize=thread" + EXTRA_CXXFLAGS="-fsanitize=thread" + build_libstdcxx + + restore_env + fi + + # Build dependencies that require TSAN instrumentation + + save_env + EXTRA_CFLAGS="-fsanitize=thread $EXTRA_CFLAGS" + EXTRA_CXXFLAGS="-nostdinc++ -fsanitize=thread $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-DTHREAD_SANITIZER $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-isystem $PREFIX_LIBSTDCXX_TSAN/include/c++/$GCC_VERSION/backward $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-isystem $PREFIX_LIBSTDCXX_TSAN/include/c++/$GCC_VERSION $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-L$PREFIX_LIBSTDCXX_TSAN/lib $EXTRA_CXXFLAGS" + EXTRA_LDFLAGS="-Wl,-rpath,$PREFIX_LIBSTDCXX_TSAN/lib $EXTRA_LDFLAGS" + + if [ -n "$F_ALL" -o -n "$F_PROTOBUF" ]; then + 
build_protobuf + fi + restore_env + + # Build dependencies that do not require TSAN instrumentation + + EXTRA_CXXFLAGS="-nostdinc++ $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-isystem $PREFIX_LIBSTDCXX/include/c++/$GCC_VERSION/backward $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-isystem $PREFIX_LIBSTDCXX/include/c++/$GCC_VERSION $EXTRA_CXXFLAGS" + EXTRA_CXXFLAGS="-L$PREFIX_LIBSTDCXX/lib $EXTRA_CXXFLAGS" + EXTRA_LDFLAGS="-Wl,-rpath,$PREFIX_LIBSTDCXX/lib $EXTRA_LDFLAGS" + + if [ -n "$F_ALL" -o -n "$F_GFLAGS" ]; then + build_gflags + fi + + if [ -n "$F_ALL" -o -n "$F_GLOG" ]; then + build_glog + fi + + if [ -n "$F_ALL" -o -n "$F_GPERFTOOLS" ]; then + build_gperftools + fi + + if [ -n "$F_ALL" -o -n "$F_GMOCK" ]; then + build_gmock + fi + + if [ -n "$F_ALL" -o -n "$F_SNAPPY" ]; then + build_snappy + fi + + if [ -n "$F_ALL" -o -n "$F_CRCUTIL" ]; then + build_crcutil + fi +fi + +echo "---------------------" +echo "Thirdparty dependencies built and installed into $PREFIX successfully" diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh new file mode 100755 index 000000000000..542f6a994ffb --- /dev/null +++ b/thirdparty/download-thirdparty.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# autoreconf calls are necessary to fix hard-coded aclocal versions in the +# configure scripts that ship with the projects. + +set -e + +TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +cd $TP_DIR + +if [[ "$OSTYPE" =~ ^linux ]]; then + OS_LINUX=1 +fi + +source vars.sh + +delete_if_wrong_patchlevel() { + local DIR=$1 + local PATCHLEVEL=$2 + if [ ! -f $DIR/patchlevel-$PATCHLEVEL ]; then + echo It appears that $DIR is missing the latest local patches. + echo Removing it so we re-download it. + rm -Rf $DIR + fi +} + +fetch_and_expand() { + local FILENAME=$1 + if [ -z "$FILENAME" ]; then + echo "Error: Must specify file to fetch" + exit 1 + fi + + echo "Fetching $FILENAME" + curl -O "${CLOUDFRONT_URL_PREFIX}/${FILENAME}" + + echo "Unpacking $FILENAME" + if echo "$FILENAME" | egrep -q '\.zip$'; then + unzip -q $FILENAME + elif echo "$FILENAME" | egrep -q '(\.tar\.gz|\.tgz)$'; then + tar xf $FILENAME + else + echo "Error: unknown file format: $FILENAME" + exit 1 + fi + + echo "Removing $FILENAME" + rm $FILENAME + echo +} + +GLOG_PATCHLEVEL=1 +delete_if_wrong_patchlevel $GLOG_DIR $GLOG_PATCHLEVEL +if [ ! -d $GLOG_DIR ]; then + fetch_and_expand glog-${GLOG_VERSION}.tar.gz + + pushd $GLOG_DIR + patch -p0 < $TP_DIR/patches/glog-issue-198-fix-unused-warnings.patch + touch patchlevel-$GLOG_PATCHLEVEL + autoreconf -fvi + popd + echo +fi + +if [ ! -d $GMOCK_DIR ]; then + fetch_and_expand gmock-${GMOCK_VERSION}.zip +fi + +if [ ! -d $GFLAGS_DIR ]; then + fetch_and_expand gflags-${GFLAGS_VERSION}.tar.gz +fi + +# Check that the gperftools patch has been applied. +# If you add or remove patches, bump the patchlevel below to ensure +# that any new Jenkins builds pick up your patches. +GPERFTOOLS_PATCHLEVEL=2 +delete_if_wrong_patchlevel $GPERFTOOLS_DIR $GPERFTOOLS_PATCHLEVEL +if [ ! 
-d $GPERFTOOLS_DIR ]; then + fetch_and_expand gperftools-${GPERFTOOLS_VERSION}.tar.gz + + pushd $GPERFTOOLS_DIR + patch -p1 < $TP_DIR/patches/gperftools-Change-default-TCMALLOC_TRANSFER_NUM_OBJ-to-40.patch + patch -p1 < $TP_DIR/patches/gperftools-hook-mi_force_unlock-on-OSX-instead-of-pthread_atfork.patch + touch patchlevel-$GPERFTOOLS_PATCHLEVEL + autoreconf -fvi + popd + echo +fi + +if [ ! -d $PROTOBUF_DIR ]; then + fetch_and_expand protobuf-${PROTOBUF_VERSION}.tar.gz + pushd $PROTOBUF_DIR + autoreconf -fvi + popd +fi + +if [ ! -d $CMAKE_DIR ]; then + fetch_and_expand cmake-${CMAKE_VERSION}.tar.gz +fi + +if [ ! -d $SNAPPY_DIR ]; then + fetch_and_expand snappy-${SNAPPY_VERSION}.tar.gz + pushd $SNAPPY_DIR + autoreconf -fvi + popd +fi + +if [ ! -d $ZLIB_DIR ]; then + fetch_and_expand zlib-${ZLIB_VERSION}.tar.gz +fi + +if [ ! -d $LIBEV_DIR ]; then + fetch_and_expand libev-${LIBEV_VERSION}.tar.gz +fi + +if [ ! -d $RAPIDJSON_DIR ]; then + fetch_and_expand rapidjson-${RAPIDJSON_VERSION}.zip + mv rapidjson ${RAPIDJSON_DIR} +fi + +if [ ! -d $SQUEASEL_DIR ]; then + fetch_and_expand squeasel-${SQUEASEL_VERSION}.tar.gz +fi + +if [ ! -d $GSG_DIR ]; then + fetch_and_expand google-styleguide-${GSG_VERSION}.tar.gz +fi + +if [ ! -d $GCOVR_DIR ]; then + fetch_and_expand gcovr-${GCOVR_VERSION}.tar.gz +fi + +if [ ! -d $CURL_DIR ]; then + fetch_and_expand curl-${CURL_VERSION}.tar.gz +fi + +CRCUTIL_PATCHLEVEL=1 +delete_if_wrong_patchlevel $CRCUTIL_DIR $CRCUTIL_PATCHLEVEL +if [ ! -d $CRCUTIL_DIR ]; then + fetch_and_expand crcutil-${CRCUTIL_VERSION}.tar.gz + + pushd $CRCUTIL_DIR + patch -p0 < $TP_DIR/patches/crcutil-fix-libtoolize-on-osx.patch + touch patchlevel-$CRCUTIL_PATCHLEVEL + popd + echo +fi + +if [ ! -d $LIBUNWIND_DIR ]; then + fetch_and_expand libunwind-${LIBUNWIND_VERSION}.tar.gz +fi + +if [ ! -d $PYTHON_DIR ]; then + fetch_and_expand python-${PYTHON_VERSION}.tar.gz +fi + +LLVM_PATCHLEVEL=2 +delete_if_wrong_patchlevel $LLVM_DIR $LLVM_PATCHLEVEL +if [ ! 
-d $LLVM_DIR ]; then + fetch_and_expand llvm-${LLVM_VERSION}.src.tar.gz + + pushd $LLVM_DIR + patch -p1 < $TP_DIR/patches/llvm-fix-amazon-linux.patch + patch -p1 < $TP_DIR/patches/llvm-devtoolset-toolchain.patch + touch patchlevel-$LLVM_PATCHLEVEL + popd + echo +fi + +GCC_PATCHLEVEL=2 +delete_if_wrong_patchlevel $GCC_DIR $GCC_PATCHLEVEL +if [[ "$OSTYPE" =~ ^linux ]] && [[ ! -d $GCC_DIR ]]; then + fetch_and_expand gcc-${GCC_VERSION}.tar.gz + pushd $GCC_DIR/libstdc++-v3 + patch -p0 < $TP_DIR/patches/libstdcxx-fix-string-dtor.patch + patch -p0 < $TP_DIR/patches/libstdcxx-fix-tr1-shared-ptr.patch + cd .. + touch patchlevel-$GCC_PATCHLEVEL + popd + echo +fi + +LZ4_PATCHLEVEL=1 +delete_if_wrong_patchlevel $LZ4_DIR $LZ4_PATCHLEVEL +if [ ! -d $LZ4_DIR ]; then + fetch_and_expand lz4-lz4-$LZ4_VERSION.tar.gz + pushd $LZ4_DIR + patch -p1 < $TP_DIR/patches/lz4-0001-Fix-cmake-build-to-use-gnu-flags-on-clang.patch + touch patchlevel-$LZ4_PATCHLEVEL + popd + echo +fi + +if [ ! -d $BITSHUFFLE_DIR ]; then + fetch_and_expand bitshuffle-${BITSHUFFLE_VERSION}.tar.gz +fi + +if [ ! -d $TRACE_VIEWER_DIR ]; then + fetch_and_expand kudu-trace-viewer-${TRACE_VIEWER_VERSION}.tar.gz +fi + +if [ -n "$OS_LINUX" -a ! 
-d $NVML_DIR ]; then + fetch_and_expand nvml-${NVML_VERSION}.tar.gz +fi + +echo "---------------" +echo "Thirdparty dependencies downloaded successfully" diff --git a/thirdparty/patches/crcutil-fix-libtoolize-on-osx.patch b/thirdparty/patches/crcutil-fix-libtoolize-on-osx.patch new file mode 100644 index 000000000000..9e59349a59f7 --- /dev/null +++ b/thirdparty/patches/crcutil-fix-libtoolize-on-osx.patch @@ -0,0 +1,16 @@ +--- autogen.sh 2014-08-25 17:08:54.000000000 -0700 ++++ autogen.sh 2014-08-25 16:52:22.000000000 -0700 +@@ -119,7 +119,12 @@ + echo>>${target} "crcutilhdrs_HEADERS=examples/interface.h" + + echo "Creating Makefile.in" +-libtoolize ++ ++case `uname` in ++ Darwin*) glibtoolize ;; ++ *) libtoolize ;; ++esac ++ + aclocal + automake --add-missing + autoconf diff --git a/thirdparty/patches/glog-issue-198-fix-unused-warnings.patch b/thirdparty/patches/glog-issue-198-fix-unused-warnings.patch new file mode 100644 index 000000000000..caa8c61e4d0c --- /dev/null +++ b/thirdparty/patches/glog-issue-198-fix-unused-warnings.patch @@ -0,0 +1,47 @@ +Index: configure.ac +=================================================================== +--- configure.ac (revision 142) ++++ configure.ac (working copy) +@@ -80,15 +80,17 @@ + [Define if you have the 'pwrite' function])) + + AX_C___ATTRIBUTE__ +-# We only care about these two attributes. ++# We only care about these attributes. 
+ if test x"$ac_cv___attribute__" = x"yes"; then + ac_cv___attribute___noreturn="__attribute__ ((noreturn))" + ac_cv___attribute___noinline="__attribute__ ((noinline))" + ac_cv___attribute___printf_4_5="__attribute__((__format__ (__printf__, 4, 5)))" ++ ac_cv___attribute___unused="__attribute__ ((unused))" + else + ac_cv___attribute___noreturn= + ac_cv___attribute___noinline= + ac_cv___attribute___printf_4_5= ++ ac_cv___attribute___unused= + fi + + AX_C___BUILTIN_EXPECT +@@ -214,6 +216,7 @@ + AC_SUBST(ac_cv___attribute___noreturn) + AC_SUBST(ac_cv___attribute___noinline) + AC_SUBST(ac_cv___attribute___printf_4_5) ++AC_SUBST(ac_cv___attribute___unused) + AC_SUBST(ac_cv_have___builtin_expect) + AC_SUBST(ac_cv_have_stdint_h) + AC_SUBST(ac_cv_have_systypes_h) +Index: src/glog/logging.h.in +=================================================================== +--- src/glog/logging.h.in (revision 142) ++++ src/glog/logging.h.in (working copy) +@@ -908,8 +908,10 @@ + struct CrashReason; + } // namespace glog_internal_namespace_ + ++#define GOOGLE_GLOG_ATTRIBUTE_UNUSED @ac_cv___attribute___unused@ ++ + #define GOOGLE_GLOG_COMPILE_ASSERT(expr, msg) \ +- typedef @ac_google_namespace@::glog_internal_namespace_::CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] ++ typedef @ac_google_namespace@::glog_internal_namespace_::CompileAssert<(bool(expr))> msg[bool(expr) ? 
1 : -1] GOOGLE_GLOG_ATTRIBUTE_UNUSED + + #define LOG_EVERY_N(severity, n) \ + GOOGLE_GLOG_COMPILE_ASSERT(@ac_google_namespace@::GLOG_ ## severity < \ diff --git a/thirdparty/patches/gperftools-Change-default-TCMALLOC_TRANSFER_NUM_OBJ-to-40.patch b/thirdparty/patches/gperftools-Change-default-TCMALLOC_TRANSFER_NUM_OBJ-to-40.patch new file mode 100644 index 000000000000..353df161911c --- /dev/null +++ b/thirdparty/patches/gperftools-Change-default-TCMALLOC_TRANSFER_NUM_OBJ-to-40.patch @@ -0,0 +1,34 @@ +From a36aea827b76b19c82b780de0762da30c230f19f Mon Sep 17 00:00:00 2001 +From: Todd Lipcon +Date: Tue, 23 Sep 2014 19:09:56 -0700 +Subject: [PATCH] Change default TCMALLOC_TRANSFER_NUM_OBJ to 40 + +We found that this configuration was preventing good performance +of the central freelist in benchmarks with a lot of threads. + +After discussion with Aliaksey Kandratsenka (the gperftools maintainer) +we decided the best course of action was to revert to a number close to the old +default, which seems to improve the performance of these high-concurrency +benchmarks by about 4.5x. + +Any higher numbers seem to produce worse performance in our benchmarks. +--- + src/common.cc | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/common.cc b/src/common.cc +index 9d48377..c2ad054 100644 +--- a/src/common.cc ++++ b/src/common.cc +@@ -42,7 +42,7 @@ namespace tcmalloc { + // thread and central caches. + static int32 FLAGS_tcmalloc_transfer_num_objects; + +-static const int32 kDefaultTransferNumObjecs = 32768; ++static const int32 kDefaultTransferNumObjecs = 40; + + // The init function is provided to explicit initialize the variable value + // from the env. 
var to avoid C++ global construction that might defer its +-- +1.8.3.2 + diff --git a/thirdparty/patches/gperftools-hook-mi_force_unlock-on-OSX-instead-of-pthread_atfork.patch b/thirdparty/patches/gperftools-hook-mi_force_unlock-on-OSX-instead-of-pthread_atfork.patch new file mode 100644 index 000000000000..b8c96954c3d5 --- /dev/null +++ b/thirdparty/patches/gperftools-hook-mi_force_unlock-on-OSX-instead-of-pthread_atfork.patch @@ -0,0 +1,72 @@ +commit 7013b219970a329d1db58fbd7fa7c907bec8dbba +Author: Aliaksey Kandratsenka +Date: Sat May 9 12:48:11 2015 -0700 + + hook mi_force_{un,}lock on OSX instead of pthread_atfork + + This is patch by Anton Samokhvalov. + + Apparently it helps with locking around forking on OSX. + +diff --git a/src/libc_override_osx.h b/src/libc_override_osx.h +index 26923e9..b801f22 100644 +--- a/src/libc_override_osx.h ++++ b/src/libc_override_osx.h +@@ -85,6 +85,11 @@ + #include + #include + ++namespace tcmalloc { ++ void CentralCacheLockAll(); ++ void CentralCacheUnlockAll(); ++} ++ + // from AvailabilityMacros.h + #if defined(MAC_OS_X_VERSION_10_6) && \ + MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 +@@ -169,11 +174,11 @@ void mi_log(malloc_zone_t *zone, void *address) { + } + + void mi_force_lock(malloc_zone_t *zone) { +- // Hopefully unneeded by us! ++ tcmalloc::CentralCacheLockAll(); + } + + void mi_force_unlock(malloc_zone_t *zone) { +- // Hopefully unneeded by us! ++ tcmalloc::CentralCacheUnlockAll(); + } + + void mi_statistics(malloc_zone_t *zone, malloc_statistics_t *stats) { +diff --git a/src/static_vars.cc b/src/static_vars.cc +index 197b3a1..09d2b59 100644 +--- a/src/static_vars.cc ++++ b/src/static_vars.cc +@@ -51,7 +51,6 @@ namespace tcmalloc { + // sure the central_cache locks remain in a consisten state in the forked + // version of the thread. 
+ +-static + void CentralCacheLockAll() + { + Static::pageheap_lock()->Lock(); +@@ -59,7 +58,6 @@ void CentralCacheLockAll() + Static::central_cache()[i].Lock(); + } + +-static + void CentralCacheUnlockAll() + { + for (int i = 0; i < kNumClasses; ++i) +@@ -114,9 +112,11 @@ void Static::InitStaticVars() { + static inline + void SetupAtForkLocksHandler() + { ++#if !defined(__APPLE__) + pthread_atfork(CentralCacheLockAll, // parent calls before fork + CentralCacheUnlockAll, // parent calls after fork + CentralCacheUnlockAll); // child calls after fork ++#endif + } + REGISTER_MODULE_INITIALIZER(tcmalloc_fork_handler, SetupAtForkLocksHandler()); + diff --git a/thirdparty/patches/libstdcxx-fix-string-dtor.patch b/thirdparty/patches/libstdcxx-fix-string-dtor.patch new file mode 100644 index 000000000000..28978c5c5b9e --- /dev/null +++ b/thirdparty/patches/libstdcxx-fix-string-dtor.patch @@ -0,0 +1,54 @@ +Index: include/bits/basic_string.h +=================================================================== +--- include/bits/basic_string.h (revision 227400) ++++ include/bits/basic_string.h (working copy) +@@ -2601,11 +2601,32 @@ + + bool + _M_is_leaked() const _GLIBCXX_NOEXCEPT +- { return this->_M_refcount < 0; } ++ { ++#if defined(__GTHREADS) ++ // _M_refcount is mutated concurrently by _M_refcopy/_M_dispose, ++ // so we need to use an atomic load. However, _M_is_leaked ++ // predicate does not change concurrently (i.e. the string is either ++ // leaked or not), so a relaxed load is enough. ++ return __atomic_load_n(&this->_M_refcount, __ATOMIC_RELAXED) < 0; ++#else ++ return this->_M_refcount < 0; ++#endif ++ } + + bool + _M_is_shared() const _GLIBCXX_NOEXCEPT +- { return this->_M_refcount > 0; } ++ { ++#if defined(__GTHREADS) ++ // _M_refcount is mutated concurrently by _M_refcopy/_M_dispose, ++ // so we need to use an atomic load. 
Another thread can drop last ++ // but one reference concurrently with this check, so we need this ++ // load to be acquire to synchronize with release fetch_and_add in ++ // _M_dispose. ++ return __atomic_load_n(&this->_M_refcount, __ATOMIC_ACQUIRE) > 0; ++#else ++ return this->_M_refcount > 0; ++#endif ++ } + + void + _M_set_leaked() _GLIBCXX_NOEXCEPT +@@ -2654,6 +2675,14 @@ + { + // Be race-detector-friendly. For more info see bits/c++config. + _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&this->_M_refcount); ++ // Decrement of _M_refcount is acq_rel, because: ++ // - all but last decrements need to release to synchronize with ++ // the last decrement that will delete the object. ++ // - the last decrement needs to acquire to synchronize with ++ // all the previous decrements. ++ // - last but one decrement needs to release to synchronize with ++ // the acquire load in _M_is_shared that will conclude that ++ // the object is not shared anymore. + if (__gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount, + -1) <= 0) + { diff --git a/thirdparty/patches/libstdcxx-fix-tr1-shared-ptr.patch b/thirdparty/patches/libstdcxx-fix-tr1-shared-ptr.patch new file mode 100644 index 000000000000..f21fa735d613 --- /dev/null +++ b/thirdparty/patches/libstdcxx-fix-tr1-shared-ptr.patch @@ -0,0 +1,21 @@ +diff -ur ./include/tr1/shared_ptr.h ../../gcc-4.9.3.patched/libstdc++-v3/include/tr1/shared_ptr.h +--- ./include/tr1/shared_ptr.h 2014-01-02 14:30:10.000000000 -0800 ++++ ../../gcc-4.9.3.patched/libstdc++-v3/include/tr1/shared_ptr.h 2016-02-01 22:45:11.808475373 -0800 +@@ -188,7 +188,7 @@ + { + // No memory barrier is used here so there is no synchronization + // with other threads. +- return const_cast(_M_use_count); ++ return __atomic_load_n(&_M_use_count, __ATOMIC_RELAXED); + } + + private: +@@ -230,7 +230,7 @@ + _M_add_ref_lock() + { + // Perform lock-free add-if-not-zero operation. 
+- _Atomic_word __count = _M_use_count; ++ _Atomic_word __count = _M_get_use_count(); + do + { + if (__count == 0) diff --git a/thirdparty/patches/llvm-devtoolset-toolchain.patch b/thirdparty/patches/llvm-devtoolset-toolchain.patch new file mode 100644 index 000000000000..ce41be3e824d --- /dev/null +++ b/thirdparty/patches/llvm-devtoolset-toolchain.patch @@ -0,0 +1,16 @@ +diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp +index 4688335..9173714 100644 +--- a/tools/clang/lib/Driver/ToolChains.cpp ++++ b/tools/clang/lib/Driver/ToolChains.cpp +@@ -1060,8 +1060,10 @@ Generic_GCC::GCCInstallationDetector::GCCInstallationDetector( + Prefixes.push_back(D.InstalledDir + "/.."); + + // And finally in /usr. +- if (D.SysRoot.empty()) ++ if (D.SysRoot.empty()) { ++ Prefixes.push_back("/opt/rh/devtoolset-3/root/usr"); + Prefixes.push_back("/usr"); ++ } + } + + // Loop over the various components which exist and select the best GCC diff --git a/thirdparty/patches/llvm-fix-amazon-linux.patch b/thirdparty/patches/llvm-fix-amazon-linux.patch new file mode 100644 index 000000000000..968b908afc08 --- /dev/null +++ b/thirdparty/patches/llvm-fix-amazon-linux.patch @@ -0,0 +1,22 @@ +diff --git a/lib/Driver/ToolChains.cpp b/lib/Driver/ToolChains.cpp +index e5528f0..7348f9d 100644 +--- a/tools/clang/lib/Driver/ToolChains.cpp ++++ b/tools/clang/lib/Driver/ToolChains.cpp +@@ -1441,7 +1441,7 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const { + "x86_64-redhat-linux", "x86_64-suse-linux", + "x86_64-manbo-linux-gnu", "x86_64-linux-gnu", + "x86_64-slackware-linux", "x86_64-linux-android", +- "x86_64-unknown-linux"}; ++ "x86_64-unknown-linux", "x86_64-amazon-linux"}; + static const char *const X32LibDirs[] = {"/libx32"}; + static const char *const X86LibDirs[] = {"/lib32", "/lib"}; + static const char *const X86Triples[] = { +@@ -1449,7 +1449,7 @@ bool Generic_GCC::GCCInstallationDetector::getBiarchSibling(Multilib &M) const { + "i386-linux-gnu", 
"i386-redhat-linux6E", "i686-redhat-linux", + "i586-redhat-linux", "i386-redhat-linux", "i586-suse-linux", + "i486-slackware-linux", "i686-montavista-linux", "i686-linux-android", +- "i586-linux-gnu"}; ++ "i586-linux-gnu", "i686-amazon-linux"}; + + static const char *const MIPSLibDirs[] = {"/lib"}; + static const char *const MIPSTriples[] = {"mips-linux-gnu", "mips-mti-linux", diff --git a/thirdparty/patches/lz4-0001-Fix-cmake-build-to-use-gnu-flags-on-clang.patch b/thirdparty/patches/lz4-0001-Fix-cmake-build-to-use-gnu-flags-on-clang.patch new file mode 100644 index 000000000000..7e268c4393cf --- /dev/null +++ b/thirdparty/patches/lz4-0001-Fix-cmake-build-to-use-gnu-flags-on-clang.patch @@ -0,0 +1,51 @@ +From f5f137b7b2fddf3e3232e4859b62da42654d0f87 Mon Sep 17 00:00:00 2001 +From: Todd Lipcon +Date: Thu, 25 Jun 2015 14:45:27 -0700 +Subject: [PATCH] Fix cmake build to use gnu flags on clang + +Previously, the cmake build was only adding -fPIC and -std=c99 on +gcc. However, these flags are also appropriate when building with +clang. 
+--- + cmake_unofficial/CMakeLists.txt | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/cmake_unofficial/CMakeLists.txt b/cmake_unofficial/CMakeLists.txt +index 4c3eb65..a986f43 100644 +--- a/cmake_unofficial/CMakeLists.txt ++++ b/cmake_unofficial/CMakeLists.txt +@@ -17,7 +17,12 @@ ENDIF() + option(BUILD_TOOLS "Build the command line tools" ON) + option(BUILD_LIBS "Build the libraries in addition to the tools" ON) + +-if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) ++IF("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR ++ "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") ++ SET(GNU_COMPATIBLE_COMPILER 1) ++ENDIF() ++ ++if(GNU_COMPATIBLE_COMPILER) + if(UNIX AND BUILD_LIBS) + add_definitions(-fPIC) + endif() +@@ -68,7 +73,7 @@ endif() + if(MSVC) + ADD_DEFINITIONS("-W4") + endif() +-if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) ++if(GNU_COMPATIBLE_COMPILER) + ADD_DEFINITIONS("-Wall") + endif() + if(CMAKE_COMPILER_IS_GNUCXX) +@@ -78,7 +83,7 @@ ADD_DEFINITIONS("-Wshadow") + ADD_DEFINITIONS("-Wcast-align") + ADD_DEFINITIONS("-Wstrict-prototypes") + endif(CMAKE_COMPILER_IS_GNUCXX) +-if((CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) AND ++if(GNU_COMPATIBLE_COMPILER AND + (NOT CMAKE_SYSTEM_NAME MATCHES "SunOS")) + ADD_DEFINITIONS("-std=c99") + endif() +-- +1.8.3.2 + diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh new file mode 100644 index 000000000000..9dfcac1e319e --- /dev/null +++ b/thirdparty/vars.sh @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This URL corresponds to the CloudFront Distribution for the S3 +# bucket cloudera-thirdparty-libs which is directly accessible at +# http://cloudera-thirdparty-libs.s3.amazonaws.com/ +CLOUDFRONT_URL_PREFIX=http://d3dr9sfxru4sde.cloudfront.net + +PREFIX_COMMON=$TP_DIR/installed +PREFIX_DEPS=$TP_DIR/installed-deps +PREFIX_DEPS_TSAN=$TP_DIR/installed-deps-tsan + +# libstdcxx needs its own prefix so that it is not inadvertently +# included in the library search path during non-TSAN builds. +PREFIX_LIBSTDCXX=$PREFIX_DEPS/gcc +PREFIX_LIBSTDCXX_TSAN=$PREFIX_DEPS_TSAN/gcc + +GFLAGS_VERSION=2.1.2 +GFLAGS_DIR=$TP_DIR/gflags-$GFLAGS_VERSION + +GLOG_VERSION=0.3.4 +GLOG_DIR=$TP_DIR/glog-$GLOG_VERSION + +GMOCK_VERSION=1.7.0 +GMOCK_DIR=$TP_DIR/gmock-$GMOCK_VERSION + +GPERFTOOLS_VERSION=2.2.1 +GPERFTOOLS_DIR=$TP_DIR/gperftools-$GPERFTOOLS_VERSION + +PROTOBUF_VERSION=2.6.1 +PROTOBUF_DIR=$TP_DIR/protobuf-$PROTOBUF_VERSION + +CMAKE_VERSION=3.2.3 +CMAKE_DIR=$TP_DIR/cmake-${CMAKE_VERSION} + +SNAPPY_VERSION=1.1.0 +SNAPPY_DIR=$TP_DIR/snappy-$SNAPPY_VERSION + +LZ4_VERSION=r130 +LZ4_DIR=$TP_DIR/lz4-lz4-$LZ4_VERSION + +# from https://github.com/kiyo-masui/bitshuffle +# Hash of git: c5c928fe7d4bc5b9391748a8dd29de5a89c3c94a +BITSHUFFLE_VERSION=c5c928f +BITSHUFFLE_DIR=$TP_DIR/bitshuffle-${BITSHUFFLE_VERSION} + +ZLIB_VERSION=1.2.8 +ZLIB_DIR=$TP_DIR/zlib-$ZLIB_VERSION + +LIBEV_VERSION=4.20 +LIBEV_DIR=$TP_DIR/libev-$LIBEV_VERSION + +RAPIDJSON_VERSION=0.11 +RAPIDJSON_DIR=$TP_DIR/rapidjson-${RAPIDJSON_VERSION} + +# Hash of the squeasel git revision to use. 
+# (from http://github.com/cloudera/squeasel) +# +# To re-build this tarball use the following in the squeasel repo: +# export NAME=squeasel-$(git rev-parse HEAD) +# git archive HEAD --prefix=$NAME/ -o /tmp/$NAME.tar.gz +# s3cmd put -P /tmp/$NAME.tar.gz s3://cloudera-thirdparty-libs/$NAME.tar.gz +# +# File a HD ticket for access to the cloudera-dev AWS instance to push to S3. +SQUEASEL_VERSION=8ac777a122fccf0358cb8562e900f8e9edd9ed11 +SQUEASEL_DIR=$TP_DIR/squeasel-${SQUEASEL_VERSION} + +# git revision of google style guide: +# https://github.com/google/styleguide +# git archive --prefix=google-styleguide-$(git rev-parse HEAD)/ -o /tmp/google-styleguide-$(git rev-parse HEAD).tgz HEAD +GSG_VERSION=7a179d1ac2e08a5cc1622bec900d1e0452776713 +GSG_DIR=$TP_DIR/google-styleguide-${GSG_VERSION} + +GCOVR_VERSION=3.0 +GCOVR_DIR=$TP_DIR/gcovr-${GCOVR_VERSION} + +CURL_VERSION=7.32.0 +CURL_DIR=$TP_DIR/curl-${CURL_VERSION} + +# Hash of the crcutil git revision to use. +# (from http://github.mtv.cloudera.com/CDH/crcutil) +# +# To re-build this tarball use the following in the crcutil repo: +# export NAME=crcutil-$(git rev-parse HEAD) +# git archive HEAD --prefix=$NAME/ -o /tmp/$NAME.tar.gz +# s3cmd put -P /tmp/$NAME.tar.gz s3://cloudera-thirdparty-libs/$NAME.tar.gz +CRCUTIL_VERSION=440ba7babeff77ffad992df3a10c767f184e946e +CRCUTIL_DIR=$TP_DIR/crcutil-${CRCUTIL_VERSION} + +LIBUNWIND_VERSION=1.1a +LIBUNWIND_DIR=$TP_DIR/libunwind-${LIBUNWIND_VERSION} + +# Our llvm tarball includes clang, extra clang tools, and compiler-rt. +# +# See http://clang.llvm.org/get_started.html for details on how they're laid +# out in the llvm tarball. +LLVM_VERSION=3.7.1 +LLVM_DIR=$TP_DIR/llvm-${LLVM_VERSION}.src +LLVM_BUILD_DIR=$TP_DIR/llvm-${LLVM_VERSION}.build + +# Python 2.7 is required to build LLVM 3.6+. It is only built and installed if +# the system Python version is not 2.7. 
+PYTHON_VERSION=2.7.10 +PYTHON_DIR=$TP_DIR/python-${PYTHON_VERSION} + +GCC_VERSION=4.9.3 +GCC_DIR=${TP_DIR}/gcc-${GCC_VERSION} +GCC_BUILD_DIR=${GCC_DIR}.build + +# Our trace-viewer repository is separate since it's quite large and +# shouldn't change frequently. We upload the built artifacts (HTML/JS) +# when we need to roll to a new revision. +# +# The source can be found at https://github.com/cloudera/kudu-trace-viewer +# and built with "kudu-build.sh" included within the repository. +TRACE_VIEWER_VERSION=45f6525d8aa498be53e4137fb73a9e9e036ce91d +TRACE_VIEWER_DIR=$TP_DIR/kudu-trace-viewer-${TRACE_VIEWER_VERSION} + +NVML_VERSION=0.4-b2 +NVML_DIR=$TP_DIR/nvml-$NVML_VERSION diff --git a/version.txt b/version.txt new file mode 100644 index 000000000000..35dae96bb5f3 --- /dev/null +++ b/version.txt @@ -0,0 +1 @@ +0.8.0-SNAPSHOT diff --git a/www/bootstrap/css/bootstrap-responsive.css b/www/bootstrap/css/bootstrap-responsive.css new file mode 100644 index 000000000000..82fa9cafe69d --- /dev/null +++ b/www/bootstrap/css/bootstrap-responsive.css @@ -0,0 +1,1088 @@ +/*! + * Bootstrap Responsive v2.2.1 + * + * Copyright 2012 Twitter, Inc + * Licensed under the Apache License v2.0 + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Designed and built with all the love in the world @twitter by @mdo and @fat. 
+ */ + +.clearfix { + *zoom: 1; +} + +.clearfix:before, +.clearfix:after { + display: table; + line-height: 0; + content: ""; +} + +.clearfix:after { + clear: both; +} + +.hide-text { + font: 0/0 a; + color: transparent; + text-shadow: none; + background-color: transparent; + border: 0; +} + +.input-block-level { + display: block; + width: 100%; + min-height: 30px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +.hidden { + display: none; + visibility: hidden; +} + +.visible-phone { + display: none !important; +} + +.visible-tablet { + display: none !important; +} + +.hidden-desktop { + display: none !important; +} + +.visible-desktop { + display: inherit !important; +} + +@media (min-width: 768px) and (max-width: 979px) { + .hidden-desktop { + display: inherit !important; + } + .visible-desktop { + display: none !important ; + } + .visible-tablet { + display: inherit !important; + } + .hidden-tablet { + display: none !important; + } +} + +@media (max-width: 767px) { + .hidden-desktop { + display: inherit !important; + } + .visible-desktop { + display: none !important; + } + .visible-phone { + display: inherit !important; + } + .hidden-phone { + display: none !important; + } +} + +@media (min-width: 1200px) { + .row { + margin-left: -30px; + *zoom: 1; + } + .row:before, + .row:after { + display: table; + line-height: 0; + content: ""; + } + .row:after { + clear: both; + } + [class*="span"] { + float: left; + min-height: 1px; + margin-left: 30px; + } + .container, + .navbar-static-top .container, + .navbar-fixed-top .container, + .navbar-fixed-bottom .container { + width: 1170px; + } + .span12 { + width: 1170px; + } + .span11 { + width: 1070px; + } + .span10 { + width: 970px; + } + .span9 { + width: 870px; + } + .span8 { + width: 770px; + } + .span7 { + width: 670px; + } + .span6 { + width: 570px; + } + .span5 { + width: 470px; + } + .span4 { + width: 370px; + } + .span3 { + width: 270px; + } + .span2 { + width: 
170px; + } + .span1 { + width: 70px; + } + .offset12 { + margin-left: 1230px; + } + .offset11 { + margin-left: 1130px; + } + .offset10 { + margin-left: 1030px; + } + .offset9 { + margin-left: 930px; + } + .offset8 { + margin-left: 830px; + } + .offset7 { + margin-left: 730px; + } + .offset6 { + margin-left: 630px; + } + .offset5 { + margin-left: 530px; + } + .offset4 { + margin-left: 430px; + } + .offset3 { + margin-left: 330px; + } + .offset2 { + margin-left: 230px; + } + .offset1 { + margin-left: 130px; + } + .row-fluid { + width: 100%; + *zoom: 1; + } + .row-fluid:before, + .row-fluid:after { + display: table; + line-height: 0; + content: ""; + } + .row-fluid:after { + clear: both; + } + .row-fluid [class*="span"] { + display: block; + float: left; + width: 100%; + min-height: 30px; + margin-left: 2.564102564102564%; + *margin-left: 2.5109110747408616%; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + } + .row-fluid [class*="span"]:first-child { + margin-left: 0; + } + .row-fluid .controls-row [class*="span"] + [class*="span"] { + margin-left: 2.564102564102564%; + } + .row-fluid .span12 { + width: 100%; + *width: 99.94680851063829%; + } + .row-fluid .span11 { + width: 91.45299145299145%; + *width: 91.39979996362975%; + } + .row-fluid .span10 { + width: 82.90598290598291%; + *width: 82.8527914166212%; + } + .row-fluid .span9 { + width: 74.35897435897436%; + *width: 74.30578286961266%; + } + .row-fluid .span8 { + width: 65.81196581196582%; + *width: 65.75877432260411%; + } + .row-fluid .span7 { + width: 57.26495726495726%; + *width: 57.21176577559556%; + } + .row-fluid .span6 { + width: 48.717948717948715%; + *width: 48.664757228587014%; + } + .row-fluid .span5 { + width: 40.17094017094017%; + *width: 40.11774868157847%; + } + .row-fluid .span4 { + width: 31.623931623931625%; + *width: 31.570740134569924%; + } + .row-fluid .span3 { + width: 23.076923076923077%; + *width: 23.023731587561375%; + } + .row-fluid .span2 { + 
width: 14.52991452991453%; + *width: 14.476723040552828%; + } + .row-fluid .span1 { + width: 5.982905982905983%; + *width: 5.929714493544281%; + } + .row-fluid .offset12 { + margin-left: 105.12820512820512%; + *margin-left: 105.02182214948171%; + } + .row-fluid .offset12:first-child { + margin-left: 102.56410256410257%; + *margin-left: 102.45771958537915%; + } + .row-fluid .offset11 { + margin-left: 96.58119658119658%; + *margin-left: 96.47481360247316%; + } + .row-fluid .offset11:first-child { + margin-left: 94.01709401709402%; + *margin-left: 93.91071103837061%; + } + .row-fluid .offset10 { + margin-left: 88.03418803418803%; + *margin-left: 87.92780505546462%; + } + .row-fluid .offset10:first-child { + margin-left: 85.47008547008548%; + *margin-left: 85.36370249136206%; + } + .row-fluid .offset9 { + margin-left: 79.48717948717949%; + *margin-left: 79.38079650845607%; + } + .row-fluid .offset9:first-child { + margin-left: 76.92307692307693%; + *margin-left: 76.81669394435352%; + } + .row-fluid .offset8 { + margin-left: 70.94017094017094%; + *margin-left: 70.83378796144753%; + } + .row-fluid .offset8:first-child { + margin-left: 68.37606837606839%; + *margin-left: 68.26968539734497%; + } + .row-fluid .offset7 { + margin-left: 62.393162393162385%; + *margin-left: 62.28677941443899%; + } + .row-fluid .offset7:first-child { + margin-left: 59.82905982905982%; + *margin-left: 59.72267685033642%; + } + .row-fluid .offset6 { + margin-left: 53.84615384615384%; + *margin-left: 53.739770867430444%; + } + .row-fluid .offset6:first-child { + margin-left: 51.28205128205128%; + *margin-left: 51.175668303327875%; + } + .row-fluid .offset5 { + margin-left: 45.299145299145295%; + *margin-left: 45.1927623204219%; + } + .row-fluid .offset5:first-child { + margin-left: 42.73504273504273%; + *margin-left: 42.62865975631933%; + } + .row-fluid .offset4 { + margin-left: 36.75213675213675%; + *margin-left: 36.645753773413354%; + } + .row-fluid .offset4:first-child { + margin-left: 
34.18803418803419%; + *margin-left: 34.081651209310785%; + } + .row-fluid .offset3 { + margin-left: 28.205128205128204%; + *margin-left: 28.0987452264048%; + } + .row-fluid .offset3:first-child { + margin-left: 25.641025641025642%; + *margin-left: 25.53464266230224%; + } + .row-fluid .offset2 { + margin-left: 19.65811965811966%; + *margin-left: 19.551736679396257%; + } + .row-fluid .offset2:first-child { + margin-left: 17.094017094017094%; + *margin-left: 16.98763411529369%; + } + .row-fluid .offset1 { + margin-left: 11.11111111111111%; + *margin-left: 11.004728132387708%; + } + .row-fluid .offset1:first-child { + margin-left: 8.547008547008547%; + *margin-left: 8.440625568285142%; + } + input, + textarea, + .uneditable-input { + margin-left: 0; + } + .controls-row [class*="span"] + [class*="span"] { + margin-left: 30px; + } + input.span12, + textarea.span12, + .uneditable-input.span12 { + width: 1156px; + } + input.span11, + textarea.span11, + .uneditable-input.span11 { + width: 1056px; + } + input.span10, + textarea.span10, + .uneditable-input.span10 { + width: 956px; + } + input.span9, + textarea.span9, + .uneditable-input.span9 { + width: 856px; + } + input.span8, + textarea.span8, + .uneditable-input.span8 { + width: 756px; + } + input.span7, + textarea.span7, + .uneditable-input.span7 { + width: 656px; + } + input.span6, + textarea.span6, + .uneditable-input.span6 { + width: 556px; + } + input.span5, + textarea.span5, + .uneditable-input.span5 { + width: 456px; + } + input.span4, + textarea.span4, + .uneditable-input.span4 { + width: 356px; + } + input.span3, + textarea.span3, + .uneditable-input.span3 { + width: 256px; + } + input.span2, + textarea.span2, + .uneditable-input.span2 { + width: 156px; + } + input.span1, + textarea.span1, + .uneditable-input.span1 { + width: 56px; + } + .thumbnails { + margin-left: -30px; + } + .thumbnails > li { + margin-left: 30px; + } + .row-fluid .thumbnails { + margin-left: 0; + } +} + +@media (min-width: 768px) and 
(max-width: 979px) { + .row { + margin-left: -20px; + *zoom: 1; + } + .row:before, + .row:after { + display: table; + line-height: 0; + content: ""; + } + .row:after { + clear: both; + } + [class*="span"] { + float: left; + min-height: 1px; + margin-left: 20px; + } + .container, + .navbar-static-top .container, + .navbar-fixed-top .container, + .navbar-fixed-bottom .container { + width: 724px; + } + .span12 { + width: 724px; + } + .span11 { + width: 662px; + } + .span10 { + width: 600px; + } + .span9 { + width: 538px; + } + .span8 { + width: 476px; + } + .span7 { + width: 414px; + } + .span6 { + width: 352px; + } + .span5 { + width: 290px; + } + .span4 { + width: 228px; + } + .span3 { + width: 166px; + } + .span2 { + width: 104px; + } + .span1 { + width: 42px; + } + .offset12 { + margin-left: 764px; + } + .offset11 { + margin-left: 702px; + } + .offset10 { + margin-left: 640px; + } + .offset9 { + margin-left: 578px; + } + .offset8 { + margin-left: 516px; + } + .offset7 { + margin-left: 454px; + } + .offset6 { + margin-left: 392px; + } + .offset5 { + margin-left: 330px; + } + .offset4 { + margin-left: 268px; + } + .offset3 { + margin-left: 206px; + } + .offset2 { + margin-left: 144px; + } + .offset1 { + margin-left: 82px; + } + .row-fluid { + width: 100%; + *zoom: 1; + } + .row-fluid:before, + .row-fluid:after { + display: table; + line-height: 0; + content: ""; + } + .row-fluid:after { + clear: both; + } + .row-fluid [class*="span"] { + display: block; + float: left; + width: 100%; + min-height: 30px; + margin-left: 2.7624309392265194%; + *margin-left: 2.709239449864817%; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + } + .row-fluid [class*="span"]:first-child { + margin-left: 0; + } + .row-fluid .controls-row [class*="span"] + [class*="span"] { + margin-left: 2.7624309392265194%; + } + .row-fluid .span12 { + width: 100%; + *width: 99.94680851063829%; + } + .row-fluid .span11 { + width: 91.43646408839778%; + *width: 
91.38327259903608%; + } + .row-fluid .span10 { + width: 82.87292817679558%; + *width: 82.81973668743387%; + } + .row-fluid .span9 { + width: 74.30939226519337%; + *width: 74.25620077583166%; + } + .row-fluid .span8 { + width: 65.74585635359117%; + *width: 65.69266486422946%; + } + .row-fluid .span7 { + width: 57.18232044198895%; + *width: 57.12912895262725%; + } + .row-fluid .span6 { + width: 48.61878453038674%; + *width: 48.56559304102504%; + } + .row-fluid .span5 { + width: 40.05524861878453%; + *width: 40.00205712942283%; + } + .row-fluid .span4 { + width: 31.491712707182323%; + *width: 31.43852121782062%; + } + .row-fluid .span3 { + width: 22.92817679558011%; + *width: 22.87498530621841%; + } + .row-fluid .span2 { + width: 14.3646408839779%; + *width: 14.311449394616199%; + } + .row-fluid .span1 { + width: 5.801104972375691%; + *width: 5.747913483013988%; + } + .row-fluid .offset12 { + margin-left: 105.52486187845304%; + *margin-left: 105.41847889972962%; + } + .row-fluid .offset12:first-child { + margin-left: 102.76243093922652%; + *margin-left: 102.6560479605031%; + } + .row-fluid .offset11 { + margin-left: 96.96132596685082%; + *margin-left: 96.8549429881274%; + } + .row-fluid .offset11:first-child { + margin-left: 94.1988950276243%; + *margin-left: 94.09251204890089%; + } + .row-fluid .offset10 { + margin-left: 88.39779005524862%; + *margin-left: 88.2914070765252%; + } + .row-fluid .offset10:first-child { + margin-left: 85.6353591160221%; + *margin-left: 85.52897613729868%; + } + .row-fluid .offset9 { + margin-left: 79.8342541436464%; + *margin-left: 79.72787116492299%; + } + .row-fluid .offset9:first-child { + margin-left: 77.07182320441989%; + *margin-left: 76.96544022569647%; + } + .row-fluid .offset8 { + margin-left: 71.2707182320442%; + *margin-left: 71.16433525332079%; + } + .row-fluid .offset8:first-child { + margin-left: 68.50828729281768%; + *margin-left: 68.40190431409427%; + } + .row-fluid .offset7 { + margin-left: 62.70718232044199%; + 
*margin-left: 62.600799341718584%; + } + .row-fluid .offset7:first-child { + margin-left: 59.94475138121547%; + *margin-left: 59.838368402492065%; + } + .row-fluid .offset6 { + margin-left: 54.14364640883978%; + *margin-left: 54.037263430116376%; + } + .row-fluid .offset6:first-child { + margin-left: 51.38121546961326%; + *margin-left: 51.27483249088986%; + } + .row-fluid .offset5 { + margin-left: 45.58011049723757%; + *margin-left: 45.47372751851417%; + } + .row-fluid .offset5:first-child { + margin-left: 42.81767955801105%; + *margin-left: 42.71129657928765%; + } + .row-fluid .offset4 { + margin-left: 37.01657458563536%; + *margin-left: 36.91019160691196%; + } + .row-fluid .offset4:first-child { + margin-left: 34.25414364640884%; + *margin-left: 34.14776066768544%; + } + .row-fluid .offset3 { + margin-left: 28.45303867403315%; + *margin-left: 28.346655695309746%; + } + .row-fluid .offset3:first-child { + margin-left: 25.69060773480663%; + *margin-left: 25.584224756083227%; + } + .row-fluid .offset2 { + margin-left: 19.88950276243094%; + *margin-left: 19.783119783707537%; + } + .row-fluid .offset2:first-child { + margin-left: 17.12707182320442%; + *margin-left: 17.02068884448102%; + } + .row-fluid .offset1 { + margin-left: 11.32596685082873%; + *margin-left: 11.219583872105325%; + } + .row-fluid .offset1:first-child { + margin-left: 8.56353591160221%; + *margin-left: 8.457152932878806%; + } + input, + textarea, + .uneditable-input { + margin-left: 0; + } + .controls-row [class*="span"] + [class*="span"] { + margin-left: 20px; + } + input.span12, + textarea.span12, + .uneditable-input.span12 { + width: 710px; + } + input.span11, + textarea.span11, + .uneditable-input.span11 { + width: 648px; + } + input.span10, + textarea.span10, + .uneditable-input.span10 { + width: 586px; + } + input.span9, + textarea.span9, + .uneditable-input.span9 { + width: 524px; + } + input.span8, + textarea.span8, + .uneditable-input.span8 { + width: 462px; + } + input.span7, + 
textarea.span7, + .uneditable-input.span7 { + width: 400px; + } + input.span6, + textarea.span6, + .uneditable-input.span6 { + width: 338px; + } + input.span5, + textarea.span5, + .uneditable-input.span5 { + width: 276px; + } + input.span4, + textarea.span4, + .uneditable-input.span4 { + width: 214px; + } + input.span3, + textarea.span3, + .uneditable-input.span3 { + width: 152px; + } + input.span2, + textarea.span2, + .uneditable-input.span2 { + width: 90px; + } + input.span1, + textarea.span1, + .uneditable-input.span1 { + width: 28px; + } +} + +@media (max-width: 767px) { + body { + padding-right: 20px; + padding-left: 20px; + } + .navbar-fixed-top, + .navbar-fixed-bottom, + .navbar-static-top { + margin-right: -20px; + margin-left: -20px; + } + .container-fluid { + padding: 0; + } + .dl-horizontal dt { + float: none; + width: auto; + clear: none; + text-align: left; + } + .dl-horizontal dd { + margin-left: 0; + } + .container { + width: auto; + } + .row-fluid { + width: 100%; + } + .row, + .thumbnails { + margin-left: 0; + } + .thumbnails > li { + float: none; + margin-left: 0; + } + [class*="span"], + .uneditable-input[class*="span"], + .row-fluid [class*="span"] { + display: block; + float: none; + width: 100%; + margin-left: 0; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + } + .span12, + .row-fluid .span12 { + width: 100%; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + } + .row-fluid [class*="offset"]:first-child { + margin-left: 0; + } + .input-large, + .input-xlarge, + .input-xxlarge, + input[class*="span"], + select[class*="span"], + textarea[class*="span"], + .uneditable-input { + display: block; + width: 100%; + min-height: 30px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + } + .input-prepend input, + .input-append input, + .input-prepend input[class*="span"], + .input-append input[class*="span"] { + display: 
inline-block; + width: auto; + } + .controls-row [class*="span"] + [class*="span"] { + margin-left: 0; + } + .modal { + position: fixed; + top: 20px; + right: 20px; + left: 20px; + width: auto; + margin: 0; + } + .modal.fade { + top: -100px; + } + .modal.fade.in { + top: 20px; + } +} + +@media (max-width: 480px) { + .nav-collapse { + -webkit-transform: translate3d(0, 0, 0); + } + .page-header h1 small { + display: block; + line-height: 20px; + } + input[type="checkbox"], + input[type="radio"] { + border: 1px solid #ccc; + } + .form-horizontal .control-label { + float: none; + width: auto; + padding-top: 0; + text-align: left; + } + .form-horizontal .controls { + margin-left: 0; + } + .form-horizontal .control-list { + padding-top: 0; + } + .form-horizontal .form-actions { + padding-right: 10px; + padding-left: 10px; + } + .media .pull-left, + .media .pull-right { + display: block; + float: none; + margin-bottom: 10px; + } + .media-object { + margin-right: 0; + margin-left: 0; + } + .modal { + top: 10px; + right: 10px; + left: 10px; + } + .modal-header .close { + padding: 10px; + margin: -10px; + } + .carousel-caption { + position: static; + } +} + +@media (max-width: 979px) { + body { + padding-top: 0; + } + .navbar-fixed-top, + .navbar-fixed-bottom { + position: static; + } + .navbar-fixed-top { + margin-bottom: 20px; + } + .navbar-fixed-bottom { + margin-top: 20px; + } + .navbar-fixed-top .navbar-inner, + .navbar-fixed-bottom .navbar-inner { + padding: 5px; + } + .navbar .container { + width: auto; + padding: 0; + } + .navbar .brand { + padding-right: 10px; + padding-left: 10px; + margin: 0 0 0 -5px; + } + .nav-collapse { + clear: both; + } + .nav-collapse .nav { + float: none; + margin: 0 0 10px; + } + .nav-collapse .nav > li { + float: none; + } + .nav-collapse .nav > li > a { + margin-bottom: 2px; + } + .nav-collapse .nav > .divider-vertical { + display: none; + } + .nav-collapse .nav .nav-header { + color: #777777; + text-shadow: none; + } + .nav-collapse 
.nav > li > a, + .nav-collapse .dropdown-menu a { + padding: 9px 15px; + font-weight: bold; + color: #777777; + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; + } + .nav-collapse .btn { + padding: 4px 10px 4px; + font-weight: normal; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + } + .nav-collapse .dropdown-menu li + li a { + margin-bottom: 2px; + } + .nav-collapse .nav > li > a:hover, + .nav-collapse .dropdown-menu a:hover { + background-color: #f2f2f2; + } + .navbar-inverse .nav-collapse .nav > li > a, + .navbar-inverse .nav-collapse .dropdown-menu a { + color: #999999; + } + .navbar-inverse .nav-collapse .nav > li > a:hover, + .navbar-inverse .nav-collapse .dropdown-menu a:hover { + background-color: #111111; + } + .nav-collapse.in .btn-group { + padding: 0; + margin-top: 5px; + } + .nav-collapse .dropdown-menu { + position: static; + top: auto; + left: auto; + display: none; + float: none; + max-width: none; + padding: 0; + margin: 0 15px; + background-color: transparent; + border: none; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; + -webkit-box-shadow: none; + -moz-box-shadow: none; + box-shadow: none; + } + .nav-collapse .open > .dropdown-menu { + display: block; + } + .nav-collapse .dropdown-menu:before, + .nav-collapse .dropdown-menu:after { + display: none; + } + .nav-collapse .dropdown-menu .divider { + display: none; + } + .nav-collapse .nav > li > .dropdown-menu:before, + .nav-collapse .nav > li > .dropdown-menu:after { + display: none; + } + .nav-collapse .navbar-form, + .nav-collapse .navbar-search { + float: none; + padding: 10px 15px; + margin: 10px 0; + border-top: 1px solid #f2f2f2; + border-bottom: 1px solid #f2f2f2; + -webkit-box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1), 0 1px 0 rgba(255, 255, 255, 0.1); + -moz-box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1), 0 1px 0 rgba(255, 255, 255, 0.1); + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1), 
0 1px 0 rgba(255, 255, 255, 0.1); + } + .navbar-inverse .nav-collapse .navbar-form, + .navbar-inverse .nav-collapse .navbar-search { + border-top-color: #111111; + border-bottom-color: #111111; + } + .navbar .nav-collapse .nav.pull-right { + float: none; + margin-left: 0; + } + .nav-collapse, + .nav-collapse.collapse { + height: 0; + overflow: hidden; + } + .navbar .btn-navbar { + display: block; + } + .navbar-static .navbar-inner { + padding-right: 10px; + padding-left: 10px; + } +} + +@media (min-width: 980px) { + .nav-collapse.collapse { + height: auto !important; + overflow: visible !important; + } +} diff --git a/www/bootstrap/css/bootstrap-responsive.min.css b/www/bootstrap/css/bootstrap-responsive.min.css new file mode 100644 index 000000000000..2269019f1891 --- /dev/null +++ b/www/bootstrap/css/bootstrap-responsive.min.css @@ -0,0 +1,9 @@ +/*! + * Bootstrap Responsive v2.2.1 + * + * Copyright 2012 Twitter, Inc + * Licensed under the Apache License v2.0 + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Designed and built with all the love in the world @twitter by @mdo and @fat. 
+ */.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;line-height:0;content:""}.clearfix:after{clear:both}.hide-text{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.input-block-level{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.hidden{display:none;visibility:hidden}.visible-phone{display:none!important}.visible-tablet{display:none!important}.hidden-desktop{display:none!important}.visible-desktop{display:inherit!important}@media(min-width:768px) and (max-width:979px){.hidden-desktop{display:inherit!important}.visible-desktop{display:none!important}.visible-tablet{display:inherit!important}.hidden-tablet{display:none!important}}@media(max-width:767px){.hidden-desktop{display:inherit!important}.visible-desktop{display:none!important}.visible-phone{display:inherit!important}.hidden-phone{display:none!important}}@media(min-width:1200px){.row{margin-left:-30px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:30px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom .container{width:1170px}.span12{width:1170px}.span11{width:1070px}.span10{width:970px}.span9{width:870px}.span8{width:770px}.span7{width:670px}.span6{width:570px}.span5{width:470px}.span4{width:370px}.span3{width:270px}.span2{width:170px}.span1{width:70px}.offset12{margin-left:1230px}.offset11{margin-left:1130px}.offset10{margin-left:1030px}.offset9{margin-left:930px}.offset8{margin-left:830px}.offset7{margin-left:730px}.offset6{margin-left:630px}.offset5{margin-left:530px}.offset4{margin-left:430px}.offset3{margin-left:330px}.offset2{margin-left:230px}.offset1{margin-left:130px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid 
[class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.564102564102564%;*margin-left:2.5109110747408616%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.564102564102564%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid .span11{width:91.45299145299145%;*width:91.39979996362975%}.row-fluid .span10{width:82.90598290598291%;*width:82.8527914166212%}.row-fluid .span9{width:74.35897435897436%;*width:74.30578286961266%}.row-fluid .span8{width:65.81196581196582%;*width:65.75877432260411%}.row-fluid .span7{width:57.26495726495726%;*width:57.21176577559556%}.row-fluid .span6{width:48.717948717948715%;*width:48.664757228587014%}.row-fluid .span5{width:40.17094017094017%;*width:40.11774868157847%}.row-fluid .span4{width:31.623931623931625%;*width:31.570740134569924%}.row-fluid .span3{width:23.076923076923077%;*width:23.023731587561375%}.row-fluid .span2{width:14.52991452991453%;*width:14.476723040552828%}.row-fluid .span1{width:5.982905982905983%;*width:5.929714493544281%}.row-fluid .offset12{margin-left:105.12820512820512%;*margin-left:105.02182214948171%}.row-fluid .offset12:first-child{margin-left:102.56410256410257%;*margin-left:102.45771958537915%}.row-fluid .offset11{margin-left:96.58119658119658%;*margin-left:96.47481360247316%}.row-fluid .offset11:first-child{margin-left:94.01709401709402%;*margin-left:93.91071103837061%}.row-fluid .offset10{margin-left:88.03418803418803%;*margin-left:87.92780505546462%}.row-fluid .offset10:first-child{margin-left:85.47008547008548%;*margin-left:85.36370249136206%}.row-fluid .offset9{margin-left:79.48717948717949%;*margin-left:79.38079650845607%}.row-fluid .offset9:first-child{margin-left:76.92307692307693%;*margin-left:76.81669394435352%}.row-fluid .offset8{margin-left:70.94017094017094%;*margin-left:70.83378796144753%}.row-fluid 
.offset8:first-child{margin-left:68.37606837606839%;*margin-left:68.26968539734497%}.row-fluid .offset7{margin-left:62.393162393162385%;*margin-left:62.28677941443899%}.row-fluid .offset7:first-child{margin-left:59.82905982905982%;*margin-left:59.72267685033642%}.row-fluid .offset6{margin-left:53.84615384615384%;*margin-left:53.739770867430444%}.row-fluid .offset6:first-child{margin-left:51.28205128205128%;*margin-left:51.175668303327875%}.row-fluid .offset5{margin-left:45.299145299145295%;*margin-left:45.1927623204219%}.row-fluid .offset5:first-child{margin-left:42.73504273504273%;*margin-left:42.62865975631933%}.row-fluid .offset4{margin-left:36.75213675213675%;*margin-left:36.645753773413354%}.row-fluid .offset4:first-child{margin-left:34.18803418803419%;*margin-left:34.081651209310785%}.row-fluid .offset3{margin-left:28.205128205128204%;*margin-left:28.0987452264048%}.row-fluid .offset3:first-child{margin-left:25.641025641025642%;*margin-left:25.53464266230224%}.row-fluid .offset2{margin-left:19.65811965811966%;*margin-left:19.551736679396257%}.row-fluid .offset2:first-child{margin-left:17.094017094017094%;*margin-left:16.98763411529369%}.row-fluid .offset1{margin-left:11.11111111111111%;*margin-left:11.004728132387708%}.row-fluid .offset1:first-child{margin-left:8.547008547008547%;*margin-left:8.440625568285142%}input,textarea,.uneditable-input{margin-left:0}.controls-row 
[class*="span"]+[class*="span"]{margin-left:30px}input.span12,textarea.span12,.uneditable-input.span12{width:1156px}input.span11,textarea.span11,.uneditable-input.span11{width:1056px}input.span10,textarea.span10,.uneditable-input.span10{width:956px}input.span9,textarea.span9,.uneditable-input.span9{width:856px}input.span8,textarea.span8,.uneditable-input.span8{width:756px}input.span7,textarea.span7,.uneditable-input.span7{width:656px}input.span6,textarea.span6,.uneditable-input.span6{width:556px}input.span5,textarea.span5,.uneditable-input.span5{width:456px}input.span4,textarea.span4,.uneditable-input.span4{width:356px}input.span3,textarea.span3,.uneditable-input.span3{width:256px}input.span2,textarea.span2,.uneditable-input.span2{width:156px}input.span1,textarea.span1,.uneditable-input.span1{width:56px}.thumbnails{margin-left:-30px}.thumbnails>li{margin-left:30px}.row-fluid .thumbnails{margin-left:0}}@media(min-width:768px) and (max-width:979px){.row{margin-left:-20px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:20px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom .container{width:724px}.span12{width:724px}.span11{width:662px}.span10{width:600px}.span9{width:538px}.span8{width:476px}.span7{width:414px}.span6{width:352px}.span5{width:290px}.span4{width:228px}.span3{width:166px}.span2{width:104px}.span1{width:42px}.offset12{margin-left:764px}.offset11{margin-left:702px}.offset10{margin-left:640px}.offset9{margin-left:578px}.offset8{margin-left:516px}.offset7{margin-left:454px}.offset6{margin-left:392px}.offset5{margin-left:330px}.offset4{margin-left:268px}.offset3{margin-left:206px}.offset2{margin-left:144px}.offset1{margin-left:82px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid 
[class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.7624309392265194%;*margin-left:2.709239449864817%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.7624309392265194%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid .span11{width:91.43646408839778%;*width:91.38327259903608%}.row-fluid .span10{width:82.87292817679558%;*width:82.81973668743387%}.row-fluid .span9{width:74.30939226519337%;*width:74.25620077583166%}.row-fluid .span8{width:65.74585635359117%;*width:65.69266486422946%}.row-fluid .span7{width:57.18232044198895%;*width:57.12912895262725%}.row-fluid .span6{width:48.61878453038674%;*width:48.56559304102504%}.row-fluid .span5{width:40.05524861878453%;*width:40.00205712942283%}.row-fluid .span4{width:31.491712707182323%;*width:31.43852121782062%}.row-fluid .span3{width:22.92817679558011%;*width:22.87498530621841%}.row-fluid .span2{width:14.3646408839779%;*width:14.311449394616199%}.row-fluid .span1{width:5.801104972375691%;*width:5.747913483013988%}.row-fluid .offset12{margin-left:105.52486187845304%;*margin-left:105.41847889972962%}.row-fluid .offset12:first-child{margin-left:102.76243093922652%;*margin-left:102.6560479605031%}.row-fluid .offset11{margin-left:96.96132596685082%;*margin-left:96.8549429881274%}.row-fluid .offset11:first-child{margin-left:94.1988950276243%;*margin-left:94.09251204890089%}.row-fluid .offset10{margin-left:88.39779005524862%;*margin-left:88.2914070765252%}.row-fluid .offset10:first-child{margin-left:85.6353591160221%;*margin-left:85.52897613729868%}.row-fluid .offset9{margin-left:79.8342541436464%;*margin-left:79.72787116492299%}.row-fluid .offset9:first-child{margin-left:77.07182320441989%;*margin-left:76.96544022569647%}.row-fluid .offset8{margin-left:71.2707182320442%;*margin-left:71.16433525332079%}.row-fluid 
.offset8:first-child{margin-left:68.50828729281768%;*margin-left:68.40190431409427%}.row-fluid .offset7{margin-left:62.70718232044199%;*margin-left:62.600799341718584%}.row-fluid .offset7:first-child{margin-left:59.94475138121547%;*margin-left:59.838368402492065%}.row-fluid .offset6{margin-left:54.14364640883978%;*margin-left:54.037263430116376%}.row-fluid .offset6:first-child{margin-left:51.38121546961326%;*margin-left:51.27483249088986%}.row-fluid .offset5{margin-left:45.58011049723757%;*margin-left:45.47372751851417%}.row-fluid .offset5:first-child{margin-left:42.81767955801105%;*margin-left:42.71129657928765%}.row-fluid .offset4{margin-left:37.01657458563536%;*margin-left:36.91019160691196%}.row-fluid .offset4:first-child{margin-left:34.25414364640884%;*margin-left:34.14776066768544%}.row-fluid .offset3{margin-left:28.45303867403315%;*margin-left:28.346655695309746%}.row-fluid .offset3:first-child{margin-left:25.69060773480663%;*margin-left:25.584224756083227%}.row-fluid .offset2{margin-left:19.88950276243094%;*margin-left:19.783119783707537%}.row-fluid .offset2:first-child{margin-left:17.12707182320442%;*margin-left:17.02068884448102%}.row-fluid .offset1{margin-left:11.32596685082873%;*margin-left:11.219583872105325%}.row-fluid .offset1:first-child{margin-left:8.56353591160221%;*margin-left:8.457152932878806%}input,textarea,.uneditable-input{margin-left:0}.controls-row 
[class*="span"]+[class*="span"]{margin-left:20px}input.span12,textarea.span12,.uneditable-input.span12{width:710px}input.span11,textarea.span11,.uneditable-input.span11{width:648px}input.span10,textarea.span10,.uneditable-input.span10{width:586px}input.span9,textarea.span9,.uneditable-input.span9{width:524px}input.span8,textarea.span8,.uneditable-input.span8{width:462px}input.span7,textarea.span7,.uneditable-input.span7{width:400px}input.span6,textarea.span6,.uneditable-input.span6{width:338px}input.span5,textarea.span5,.uneditable-input.span5{width:276px}input.span4,textarea.span4,.uneditable-input.span4{width:214px}input.span3,textarea.span3,.uneditable-input.span3{width:152px}input.span2,textarea.span2,.uneditable-input.span2{width:90px}input.span1,textarea.span1,.uneditable-input.span1{width:28px}}@media(max-width:767px){body{padding-right:20px;padding-left:20px}.navbar-fixed-top,.navbar-fixed-bottom,.navbar-static-top{margin-right:-20px;margin-left:-20px}.container-fluid{padding:0}.dl-horizontal dt{float:none;width:auto;clear:none;text-align:left}.dl-horizontal dd{margin-left:0}.container{width:auto}.row-fluid{width:100%}.row,.thumbnails{margin-left:0}.thumbnails>li{float:none;margin-left:0}[class*="span"],.uneditable-input[class*="span"],.row-fluid [class*="span"]{display:block;float:none;width:100%;margin-left:0;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.span12,.row-fluid .span12{width:100%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="offset"]:first-child{margin-left:0}.input-large,.input-xlarge,.input-xxlarge,input[class*="span"],select[class*="span"],textarea[class*="span"],.uneditable-input{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.input-prepend input,.input-append input,.input-prepend input[class*="span"],.input-append input[class*="span"]{display:inline-block;width:auto}.controls-row 
[class*="span"]+[class*="span"]{margin-left:0}.modal{position:fixed;top:20px;right:20px;left:20px;width:auto;margin:0}.modal.fade{top:-100px}.modal.fade.in{top:20px}}@media(max-width:480px){.nav-collapse{-webkit-transform:translate3d(0,0,0)}.page-header h1 small{display:block;line-height:20px}input[type="checkbox"],input[type="radio"]{border:1px solid #ccc}.form-horizontal .control-label{float:none;width:auto;padding-top:0;text-align:left}.form-horizontal .controls{margin-left:0}.form-horizontal .control-list{padding-top:0}.form-horizontal .form-actions{padding-right:10px;padding-left:10px}.media .pull-left,.media .pull-right{display:block;float:none;margin-bottom:10px}.media-object{margin-right:0;margin-left:0}.modal{top:10px;right:10px;left:10px}.modal-header .close{padding:10px;margin:-10px}.carousel-caption{position:static}}@media(max-width:979px){body{padding-top:0}.navbar-fixed-top,.navbar-fixed-bottom{position:static}.navbar-fixed-top{margin-bottom:20px}.navbar-fixed-bottom{margin-top:20px}.navbar-fixed-top .navbar-inner,.navbar-fixed-bottom .navbar-inner{padding:5px}.navbar .container{width:auto;padding:0}.navbar .brand{padding-right:10px;padding-left:10px;margin:0 0 0 -5px}.nav-collapse{clear:both}.nav-collapse .nav{float:none;margin:0 0 10px}.nav-collapse .nav>li{float:none}.nav-collapse .nav>li>a{margin-bottom:2px}.nav-collapse .nav>.divider-vertical{display:none}.nav-collapse .nav .nav-header{color:#777;text-shadow:none}.nav-collapse .nav>li>a,.nav-collapse .dropdown-menu a{padding:9px 15px;font-weight:bold;color:#777;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.nav-collapse .btn{padding:4px 10px 4px;font-weight:normal;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.nav-collapse .dropdown-menu li+li a{margin-bottom:2px}.nav-collapse .nav>li>a:hover,.nav-collapse .dropdown-menu a:hover{background-color:#f2f2f2}.navbar-inverse .nav-collapse .nav>li>a,.navbar-inverse .nav-collapse .dropdown-menu 
a{color:#999}.navbar-inverse .nav-collapse .nav>li>a:hover,.navbar-inverse .nav-collapse .dropdown-menu a:hover{background-color:#111}.nav-collapse.in .btn-group{padding:0;margin-top:5px}.nav-collapse .dropdown-menu{position:static;top:auto;left:auto;display:none;float:none;max-width:none;padding:0;margin:0 15px;background-color:transparent;border:0;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none}.nav-collapse .open>.dropdown-menu{display:block}.nav-collapse .dropdown-menu:before,.nav-collapse .dropdown-menu:after{display:none}.nav-collapse .dropdown-menu .divider{display:none}.nav-collapse .nav>li>.dropdown-menu:before,.nav-collapse .nav>li>.dropdown-menu:after{display:none}.nav-collapse .navbar-form,.nav-collapse .navbar-search{float:none;padding:10px 15px;margin:10px 0;border-top:1px solid #f2f2f2;border-bottom:1px solid #f2f2f2;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1);-moz-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1);box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.1)}.navbar-inverse .nav-collapse .navbar-form,.navbar-inverse .nav-collapse .navbar-search{border-top-color:#111;border-bottom-color:#111}.navbar .nav-collapse .nav.pull-right{float:none;margin-left:0}.nav-collapse,.nav-collapse.collapse{height:0;overflow:hidden}.navbar .btn-navbar{display:block}.navbar-static .navbar-inner{padding-right:10px;padding-left:10px}}@media(min-width:980px){.nav-collapse.collapse{height:auto!important;overflow:visible!important}} diff --git a/www/bootstrap/css/bootstrap.css b/www/bootstrap/css/bootstrap.css new file mode 100644 index 000000000000..1b519e220009 --- /dev/null +++ b/www/bootstrap/css/bootstrap.css @@ -0,0 +1,5893 @@ +/*! 
+ * Bootstrap v2.2.1 + * + * Copyright 2012 Twitter, Inc + * Licensed under the Apache License v2.0 + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Designed and built with all the love in the world @twitter by @mdo and @fat. + */ + +article, +aside, +details, +figcaption, +figure, +footer, +header, +hgroup, +nav, +section { + display: block; +} + +audio, +canvas, +video { + display: inline-block; + *display: inline; + *zoom: 1; +} + +audio:not([controls]) { + display: none; +} + +html { + font-size: 100%; + -webkit-text-size-adjust: 100%; + -ms-text-size-adjust: 100%; +} + +a:focus { + outline: thin dotted #333; + outline: 5px auto -webkit-focus-ring-color; + outline-offset: -2px; +} + +a:hover, +a:active { + outline: 0; +} + +sub, +sup { + position: relative; + font-size: 75%; + line-height: 0; + vertical-align: baseline; +} + +sup { + top: -0.5em; +} + +sub { + bottom: -0.25em; +} + +img { + width: auto\9; + height: auto; + max-width: 100%; + vertical-align: middle; + border: 0; + -ms-interpolation-mode: bicubic; +} + +#map_canvas img, +.google-maps img { + max-width: none; +} + +button, +input, +select, +textarea { + margin: 0; + font-size: 100%; + vertical-align: middle; +} + +button, +input { + *overflow: visible; + line-height: normal; +} + +button::-moz-focus-inner, +input::-moz-focus-inner { + padding: 0; + border: 0; +} + +button, +html input[type="button"], +input[type="reset"], +input[type="submit"] { + cursor: pointer; + -webkit-appearance: button; +} + +input[type="search"] { + -webkit-box-sizing: content-box; + -moz-box-sizing: content-box; + box-sizing: content-box; + -webkit-appearance: textfield; +} + +input[type="search"]::-webkit-search-decoration, +input[type="search"]::-webkit-search-cancel-button { + -webkit-appearance: none; +} + +textarea { + overflow: auto; + vertical-align: top; +} + +.clearfix { + *zoom: 1; +} + +.clearfix:before, +.clearfix:after { + display: table; + line-height: 0; + content: ""; +} + +.clearfix:after { + clear: 
both; +} + +.hide-text { + font: 0/0 a; + color: transparent; + text-shadow: none; + background-color: transparent; + border: 0; +} + +.input-block-level { + display: block; + width: 100%; + min-height: 30px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +body { + margin: 0; + font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; + font-size: 14px; + line-height: 20px; + color: #333333; + background-color: #ffffff; +} + +a { + color: #0088cc; + text-decoration: none; +} + +a:hover { + color: #005580; + text-decoration: underline; +} + +.img-rounded { + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; +} + +.img-polaroid { + padding: 4px; + background-color: #fff; + border: 1px solid #ccc; + border: 1px solid rgba(0, 0, 0, 0.2); + -webkit-box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); + -moz-box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1); +} + +.img-circle { + -webkit-border-radius: 500px; + -moz-border-radius: 500px; + border-radius: 500px; +} + +.row { + margin-left: -20px; + *zoom: 1; +} + +.row:before, +.row:after { + display: table; + line-height: 0; + content: ""; +} + +.row:after { + clear: both; +} + +[class*="span"] { + float: left; + min-height: 1px; + margin-left: 20px; +} + +.container, +.navbar-static-top .container, +.navbar-fixed-top .container, +.navbar-fixed-bottom .container { + width: 940px; +} + +.span12 { + width: 940px; +} + +.span11 { + width: 860px; +} + +.span10 { + width: 780px; +} + +.span9 { + width: 700px; +} + +.span8 { + width: 620px; +} + +.span7 { + width: 540px; +} + +.span6 { + width: 460px; +} + +.span5 { + width: 380px; +} + +.span4 { + width: 300px; +} + +.span3 { + width: 220px; +} + +.span2 { + width: 140px; +} + +.span1 { + width: 60px; +} + +.offset12 { + margin-left: 980px; +} + +.offset11 { + margin-left: 900px; +} + +.offset10 { + margin-left: 820px; +} + +.offset9 { + margin-left: 740px; +} + +.offset8 { 
+ margin-left: 660px; +} + +.offset7 { + margin-left: 580px; +} + +.offset6 { + margin-left: 500px; +} + +.offset5 { + margin-left: 420px; +} + +.offset4 { + margin-left: 340px; +} + +.offset3 { + margin-left: 260px; +} + +.offset2 { + margin-left: 180px; +} + +.offset1 { + margin-left: 100px; +} + +.row-fluid { + width: 100%; + *zoom: 1; +} + +.row-fluid:before, +.row-fluid:after { + display: table; + line-height: 0; + content: ""; +} + +.row-fluid:after { + clear: both; +} + +.row-fluid [class*="span"] { + display: block; + float: left; + width: 100%; + min-height: 30px; + margin-left: 2.127659574468085%; + *margin-left: 2.074468085106383%; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +.row-fluid [class*="span"]:first-child { + margin-left: 0; +} + +.row-fluid .controls-row [class*="span"] + [class*="span"] { + margin-left: 2.127659574468085%; +} + +.row-fluid .span12 { + width: 100%; + *width: 99.94680851063829%; +} + +.row-fluid .span11 { + width: 91.48936170212765%; + *width: 91.43617021276594%; +} + +.row-fluid .span10 { + width: 82.97872340425532%; + *width: 82.92553191489361%; +} + +.row-fluid .span9 { + width: 74.46808510638297%; + *width: 74.41489361702126%; +} + +.row-fluid .span8 { + width: 65.95744680851064%; + *width: 65.90425531914893%; +} + +.row-fluid .span7 { + width: 57.44680851063829%; + *width: 57.39361702127659%; +} + +.row-fluid .span6 { + width: 48.93617021276595%; + *width: 48.88297872340425%; +} + +.row-fluid .span5 { + width: 40.42553191489362%; + *width: 40.37234042553192%; +} + +.row-fluid .span4 { + width: 31.914893617021278%; + *width: 31.861702127659576%; +} + +.row-fluid .span3 { + width: 23.404255319148934%; + *width: 23.351063829787233%; +} + +.row-fluid .span2 { + width: 14.893617021276595%; + *width: 14.840425531914894%; +} + +.row-fluid .span1 { + width: 6.382978723404255%; + *width: 6.329787234042553%; +} + +.row-fluid .offset12 { + margin-left: 104.25531914893617%; + 
*margin-left: 104.14893617021275%; +} + +.row-fluid .offset12:first-child { + margin-left: 102.12765957446808%; + *margin-left: 102.02127659574467%; +} + +.row-fluid .offset11 { + margin-left: 95.74468085106382%; + *margin-left: 95.6382978723404%; +} + +.row-fluid .offset11:first-child { + margin-left: 93.61702127659574%; + *margin-left: 93.51063829787232%; +} + +.row-fluid .offset10 { + margin-left: 87.23404255319149%; + *margin-left: 87.12765957446807%; +} + +.row-fluid .offset10:first-child { + margin-left: 85.1063829787234%; + *margin-left: 84.99999999999999%; +} + +.row-fluid .offset9 { + margin-left: 78.72340425531914%; + *margin-left: 78.61702127659572%; +} + +.row-fluid .offset9:first-child { + margin-left: 76.59574468085106%; + *margin-left: 76.48936170212764%; +} + +.row-fluid .offset8 { + margin-left: 70.2127659574468%; + *margin-left: 70.10638297872339%; +} + +.row-fluid .offset8:first-child { + margin-left: 68.08510638297872%; + *margin-left: 67.9787234042553%; +} + +.row-fluid .offset7 { + margin-left: 61.70212765957446%; + *margin-left: 61.59574468085106%; +} + +.row-fluid .offset7:first-child { + margin-left: 59.574468085106375%; + *margin-left: 59.46808510638297%; +} + +.row-fluid .offset6 { + margin-left: 53.191489361702125%; + *margin-left: 53.085106382978715%; +} + +.row-fluid .offset6:first-child { + margin-left: 51.063829787234035%; + *margin-left: 50.95744680851063%; +} + +.row-fluid .offset5 { + margin-left: 44.68085106382979%; + *margin-left: 44.57446808510638%; +} + +.row-fluid .offset5:first-child { + margin-left: 42.5531914893617%; + *margin-left: 42.4468085106383%; +} + +.row-fluid .offset4 { + margin-left: 36.170212765957444%; + *margin-left: 36.06382978723405%; +} + +.row-fluid .offset4:first-child { + margin-left: 34.04255319148936%; + *margin-left: 33.93617021276596%; +} + +.row-fluid .offset3 { + margin-left: 27.659574468085104%; + *margin-left: 27.5531914893617%; +} + +.row-fluid .offset3:first-child { + margin-left: 
25.53191489361702%; + *margin-left: 25.425531914893618%; +} + +.row-fluid .offset2 { + margin-left: 19.148936170212764%; + *margin-left: 19.04255319148936%; +} + +.row-fluid .offset2:first-child { + margin-left: 17.02127659574468%; + *margin-left: 16.914893617021278%; +} + +.row-fluid .offset1 { + margin-left: 10.638297872340425%; + *margin-left: 10.53191489361702%; +} + +.row-fluid .offset1:first-child { + margin-left: 8.51063829787234%; + *margin-left: 8.404255319148938%; +} + +[class*="span"].hide, +.row-fluid [class*="span"].hide { + display: none; +} + +[class*="span"].pull-right, +.row-fluid [class*="span"].pull-right { + float: right; +} + +.container { + margin-right: auto; + margin-left: auto; + *zoom: 1; +} + +.container:before, +.container:after { + display: table; + line-height: 0; + content: ""; +} + +.container:after { + clear: both; +} + +.container-fluid { + padding-right: 20px; + padding-left: 20px; + *zoom: 1; +} + +.container-fluid:before, +.container-fluid:after { + display: table; + line-height: 0; + content: ""; +} + +.container-fluid:after { + clear: both; +} + +p { + margin: 0 0 10px; +} + +.lead { + margin-bottom: 20px; + font-size: 21px; + font-weight: 200; + line-height: 30px; +} + +small { + font-size: 85%; +} + +strong { + font-weight: bold; +} + +em { + font-style: italic; +} + +cite { + font-style: normal; +} + +.muted { + color: #999999; +} + +.text-warning { + color: #c09853; +} + +a.text-warning:hover { + color: #a47e3c; +} + +.text-error { + color: #b94a48; +} + +a.text-error:hover { + color: #953b39; +} + +.text-info { + color: #3a87ad; +} + +a.text-info:hover { + color: #2d6987; +} + +.text-success { + color: #468847; +} + +a.text-success:hover { + color: #356635; +} + +h1, +h2, +h3, +h4, +h5, +h6 { + margin: 10px 0; + font-family: inherit; + font-weight: bold; + line-height: 20px; + color: inherit; + text-rendering: optimizelegibility; +} + +h1 small, +h2 small, +h3 small, +h4 small, +h5 small, +h6 small { + font-weight: 
normal; + line-height: 1; + color: #999999; +} + +h1, +h2, +h3 { + line-height: 40px; +} + +h1 { + font-size: 38.5px; +} + +h2 { + font-size: 31.5px; +} + +h3 { + font-size: 24.5px; +} + +h4 { + font-size: 17.5px; +} + +h5 { + font-size: 14px; +} + +h6 { + font-size: 11.9px; +} + +h1 small { + font-size: 24.5px; +} + +h2 small { + font-size: 17.5px; +} + +h3 small { + font-size: 14px; +} + +h4 small { + font-size: 14px; +} + +.page-header { + padding-bottom: 9px; + margin: 20px 0 30px; + border-bottom: 1px solid #eeeeee; +} + +ul, +ol { + padding: 0; + margin: 0 0 10px 25px; +} + +ul ul, +ul ol, +ol ol, +ol ul { + margin-bottom: 0; +} + +li { + line-height: 20px; +} + +ul.unstyled, +ol.unstyled { + margin-left: 0; + list-style: none; +} + +dl { + margin-bottom: 20px; +} + +dt, +dd { + line-height: 20px; +} + +dt { + font-weight: bold; +} + +dd { + margin-left: 10px; +} + +.dl-horizontal { + *zoom: 1; +} + +.dl-horizontal:before, +.dl-horizontal:after { + display: table; + line-height: 0; + content: ""; +} + +.dl-horizontal:after { + clear: both; +} + +.dl-horizontal dt { + float: left; + width: 160px; + overflow: hidden; + clear: left; + text-align: right; + text-overflow: ellipsis; + white-space: nowrap; +} + +.dl-horizontal dd { + margin-left: 180px; +} + +hr { + margin: 20px 0; + border: 0; + border-top: 1px solid #eeeeee; + border-bottom: 1px solid #ffffff; +} + +abbr[title], +abbr[data-original-title] { + cursor: help; + border-bottom: 1px dotted #999999; +} + +abbr.initialism { + font-size: 90%; + text-transform: uppercase; +} + +blockquote { + padding: 0 0 0 15px; + margin: 0 0 20px; + border-left: 5px solid #eeeeee; +} + +blockquote p { + margin-bottom: 0; + font-size: 16px; + font-weight: 300; + line-height: 25px; +} + +blockquote small { + display: block; + line-height: 20px; + color: #999999; +} + +blockquote small:before { + content: '\2014 \00A0'; +} + +blockquote.pull-right { + float: right; + padding-right: 15px; + padding-left: 0; + border-right: 
5px solid #eeeeee; + border-left: 0; +} + +blockquote.pull-right p, +blockquote.pull-right small { + text-align: right; +} + +blockquote.pull-right small:before { + content: ''; +} + +blockquote.pull-right small:after { + content: '\00A0 \2014'; +} + +q:before, +q:after, +blockquote:before, +blockquote:after { + content: ""; +} + +address { + display: block; + margin-bottom: 20px; + font-style: normal; + line-height: 20px; +} + +code, +pre { + padding: 0 3px 2px; + font-family: Monaco, Menlo, Consolas, "Courier New", monospace; + font-size: 12px; + color: #333333; + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +code { + padding: 2px 4px; + color: #d14; + background-color: #f7f7f9; + border: 1px solid #e1e1e8; +} + +pre { + display: block; + padding: 9.5px; + margin: 0 0 10px; + font-size: 13px; + line-height: 20px; + word-break: break-all; + word-wrap: break-word; + white-space: pre; + white-space: pre-wrap; + background-color: #f5f5f5; + border: 1px solid #ccc; + border: 1px solid rgba(0, 0, 0, 0.15); + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +pre.prettyprint { + margin-bottom: 20px; +} + +pre code { + padding: 0; + color: inherit; + background-color: transparent; + border: 0; +} + +.pre-scrollable { + max-height: 340px; + overflow-y: scroll; +} + +form { + margin: 0 0 20px; +} + +fieldset { + padding: 0; + margin: 0; + border: 0; +} + +legend { + display: block; + width: 100%; + padding: 0; + margin-bottom: 20px; + font-size: 21px; + line-height: 40px; + color: #333333; + border: 0; + border-bottom: 1px solid #e5e5e5; +} + +legend small { + font-size: 15px; + color: #999999; +} + +label, +input, +button, +select, +textarea { + font-size: 14px; + font-weight: normal; + line-height: 20px; +} + +input, +button, +select, +textarea { + font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; +} + +label { + display: block; + margin-bottom: 5px; +} + +select, +textarea, 
+input[type="text"], +input[type="password"], +input[type="datetime"], +input[type="datetime-local"], +input[type="date"], +input[type="month"], +input[type="time"], +input[type="week"], +input[type="number"], +input[type="email"], +input[type="url"], +input[type="search"], +input[type="tel"], +input[type="color"], +.uneditable-input { + display: inline-block; + height: 20px; + padding: 4px 6px; + margin-bottom: 10px; + font-size: 14px; + line-height: 20px; + color: #555555; + vertical-align: middle; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +input, +textarea, +.uneditable-input { + width: 206px; +} + +textarea { + height: auto; +} + +textarea, +input[type="text"], +input[type="password"], +input[type="datetime"], +input[type="datetime-local"], +input[type="date"], +input[type="month"], +input[type="time"], +input[type="week"], +input[type="number"], +input[type="email"], +input[type="url"], +input[type="search"], +input[type="tel"], +input[type="color"], +.uneditable-input { + background-color: #ffffff; + border: 1px solid #cccccc; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -webkit-transition: border linear 0.2s, box-shadow linear 0.2s; + -moz-transition: border linear 0.2s, box-shadow linear 0.2s; + -o-transition: border linear 0.2s, box-shadow linear 0.2s; + transition: border linear 0.2s, box-shadow linear 0.2s; +} + +textarea:focus, +input[type="text"]:focus, +input[type="password"]:focus, +input[type="datetime"]:focus, +input[type="datetime-local"]:focus, +input[type="date"]:focus, +input[type="month"]:focus, +input[type="time"]:focus, +input[type="week"]:focus, +input[type="number"]:focus, +input[type="email"]:focus, +input[type="url"]:focus, +input[type="search"]:focus, +input[type="tel"]:focus, +input[type="color"]:focus, +.uneditable-input:focus { + border-color: rgba(82, 168, 236, 0.8); 
+ outline: 0; + outline: thin dotted \9; + /* IE6-9 */ + + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 8px rgba(82, 168, 236, 0.6); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 8px rgba(82, 168, 236, 0.6); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 8px rgba(82, 168, 236, 0.6); +} + +input[type="radio"], +input[type="checkbox"] { + margin: 4px 0 0; + margin-top: 1px \9; + *margin-top: 0; + line-height: normal; + cursor: pointer; +} + +input[type="file"], +input[type="image"], +input[type="submit"], +input[type="reset"], +input[type="button"], +input[type="radio"], +input[type="checkbox"] { + width: auto; +} + +select, +input[type="file"] { + height: 30px; + /* In IE7, the height of the select element cannot be changed by height, only font-size */ + + *margin-top: 4px; + /* For IE7, add top margin to align select with labels */ + + line-height: 30px; +} + +select { + width: 220px; + background-color: #ffffff; + border: 1px solid #cccccc; +} + +select[multiple], +select[size] { + height: auto; +} + +select:focus, +input[type="file"]:focus, +input[type="radio"]:focus, +input[type="checkbox"]:focus { + outline: thin dotted #333; + outline: 5px auto -webkit-focus-ring-color; + outline-offset: -2px; +} + +.uneditable-input, +.uneditable-textarea { + color: #999999; + cursor: not-allowed; + background-color: #fcfcfc; + border-color: #cccccc; + -webkit-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.025); + -moz-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.025); + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.025); +} + +.uneditable-input { + overflow: hidden; + white-space: nowrap; +} + +.uneditable-textarea { + width: auto; + height: auto; +} + +input:-moz-placeholder, +textarea:-moz-placeholder { + color: #999999; +} + +input:-ms-input-placeholder, +textarea:-ms-input-placeholder { + color: #999999; +} + +input::-webkit-input-placeholder, +textarea::-webkit-input-placeholder { + color: #999999; +} + +.radio, +.checkbox { + 
min-height: 20px; + padding-left: 20px; +} + +.radio input[type="radio"], +.checkbox input[type="checkbox"] { + float: left; + margin-left: -20px; +} + +.controls > .radio:first-child, +.controls > .checkbox:first-child { + padding-top: 5px; +} + +.radio.inline, +.checkbox.inline { + display: inline-block; + padding-top: 5px; + margin-bottom: 0; + vertical-align: middle; +} + +.radio.inline + .radio.inline, +.checkbox.inline + .checkbox.inline { + margin-left: 10px; +} + +.input-mini { + width: 60px; +} + +.input-small { + width: 90px; +} + +.input-medium { + width: 150px; +} + +.input-large { + width: 210px; +} + +.input-xlarge { + width: 270px; +} + +.input-xxlarge { + width: 530px; +} + +input[class*="span"], +select[class*="span"], +textarea[class*="span"], +.uneditable-input[class*="span"], +.row-fluid input[class*="span"], +.row-fluid select[class*="span"], +.row-fluid textarea[class*="span"], +.row-fluid .uneditable-input[class*="span"] { + float: none; + margin-left: 0; +} + +.input-append input[class*="span"], +.input-append .uneditable-input[class*="span"], +.input-prepend input[class*="span"], +.input-prepend .uneditable-input[class*="span"], +.row-fluid input[class*="span"], +.row-fluid select[class*="span"], +.row-fluid textarea[class*="span"], +.row-fluid .uneditable-input[class*="span"], +.row-fluid .input-prepend [class*="span"], +.row-fluid .input-append [class*="span"] { + display: inline-block; +} + +input, +textarea, +.uneditable-input { + margin-left: 0; +} + +.controls-row [class*="span"] + [class*="span"] { + margin-left: 20px; +} + +input.span12, +textarea.span12, +.uneditable-input.span12 { + width: 926px; +} + +input.span11, +textarea.span11, +.uneditable-input.span11 { + width: 846px; +} + +input.span10, +textarea.span10, +.uneditable-input.span10 { + width: 766px; +} + +input.span9, +textarea.span9, +.uneditable-input.span9 { + width: 686px; +} + +input.span8, +textarea.span8, +.uneditable-input.span8 { + width: 606px; +} + +input.span7, 
+textarea.span7, +.uneditable-input.span7 { + width: 526px; +} + +input.span6, +textarea.span6, +.uneditable-input.span6 { + width: 446px; +} + +input.span5, +textarea.span5, +.uneditable-input.span5 { + width: 366px; +} + +input.span4, +textarea.span4, +.uneditable-input.span4 { + width: 286px; +} + +input.span3, +textarea.span3, +.uneditable-input.span3 { + width: 206px; +} + +input.span2, +textarea.span2, +.uneditable-input.span2 { + width: 126px; +} + +input.span1, +textarea.span1, +.uneditable-input.span1 { + width: 46px; +} + +.controls-row { + *zoom: 1; +} + +.controls-row:before, +.controls-row:after { + display: table; + line-height: 0; + content: ""; +} + +.controls-row:after { + clear: both; +} + +.controls-row [class*="span"], +.row-fluid .controls-row [class*="span"] { + float: left; +} + +.controls-row .checkbox[class*="span"], +.controls-row .radio[class*="span"] { + padding-top: 5px; +} + +input[disabled], +select[disabled], +textarea[disabled], +input[readonly], +select[readonly], +textarea[readonly] { + cursor: not-allowed; + background-color: #eeeeee; +} + +input[type="radio"][disabled], +input[type="checkbox"][disabled], +input[type="radio"][readonly], +input[type="checkbox"][readonly] { + background-color: transparent; +} + +.control-group.warning > label, +.control-group.warning .help-block, +.control-group.warning .help-inline { + color: #c09853; +} + +.control-group.warning .checkbox, +.control-group.warning .radio, +.control-group.warning input, +.control-group.warning select, +.control-group.warning textarea { + color: #c09853; +} + +.control-group.warning input, +.control-group.warning select, +.control-group.warning textarea { + border-color: #c09853; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} + +.control-group.warning input:focus, +.control-group.warning select:focus, +.control-group.warning textarea:focus { + 
border-color: #a47e3c; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #dbc59e; + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #dbc59e; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #dbc59e; +} + +.control-group.warning .input-prepend .add-on, +.control-group.warning .input-append .add-on { + color: #c09853; + background-color: #fcf8e3; + border-color: #c09853; +} + +.control-group.error > label, +.control-group.error .help-block, +.control-group.error .help-inline { + color: #b94a48; +} + +.control-group.error .checkbox, +.control-group.error .radio, +.control-group.error input, +.control-group.error select, +.control-group.error textarea { + color: #b94a48; +} + +.control-group.error input, +.control-group.error select, +.control-group.error textarea { + border-color: #b94a48; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} + +.control-group.error input:focus, +.control-group.error select:focus, +.control-group.error textarea:focus { + border-color: #953b39; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #d59392; + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #d59392; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #d59392; +} + +.control-group.error .input-prepend .add-on, +.control-group.error .input-append .add-on { + color: #b94a48; + background-color: #f2dede; + border-color: #b94a48; +} + +.control-group.success > label, +.control-group.success .help-block, +.control-group.success .help-inline { + color: #468847; +} + +.control-group.success .checkbox, +.control-group.success .radio, +.control-group.success input, +.control-group.success select, +.control-group.success textarea { + color: #468847; +} + +.control-group.success input, +.control-group.success select, +.control-group.success textarea { + border-color: #468847; + 
-webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} + +.control-group.success input:focus, +.control-group.success select:focus, +.control-group.success textarea:focus { + border-color: #356635; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #7aba7b; + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #7aba7b; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #7aba7b; +} + +.control-group.success .input-prepend .add-on, +.control-group.success .input-append .add-on { + color: #468847; + background-color: #dff0d8; + border-color: #468847; +} + +.control-group.info > label, +.control-group.info .help-block, +.control-group.info .help-inline { + color: #3a87ad; +} + +.control-group.info .checkbox, +.control-group.info .radio, +.control-group.info input, +.control-group.info select, +.control-group.info textarea { + color: #3a87ad; +} + +.control-group.info input, +.control-group.info select, +.control-group.info textarea { + border-color: #3a87ad; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} + +.control-group.info input:focus, +.control-group.info select:focus, +.control-group.info textarea:focus { + border-color: #2d6987; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #7ab5d3; + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #7ab5d3; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 6px #7ab5d3; +} + +.control-group.info .input-prepend .add-on, +.control-group.info .input-append .add-on { + color: #3a87ad; + background-color: #d9edf7; + border-color: #3a87ad; +} + +input:focus:required:invalid, +textarea:focus:required:invalid, +select:focus:required:invalid { + color: #b94a48; + border-color: #ee5f5b; +} + 
+input:focus:required:invalid:focus, +textarea:focus:required:invalid:focus, +select:focus:required:invalid:focus { + border-color: #e9322d; + -webkit-box-shadow: 0 0 6px #f8b9b7; + -moz-box-shadow: 0 0 6px #f8b9b7; + box-shadow: 0 0 6px #f8b9b7; +} + +.form-actions { + padding: 19px 20px 20px; + margin-top: 20px; + margin-bottom: 20px; + background-color: #f5f5f5; + border-top: 1px solid #e5e5e5; + *zoom: 1; +} + +.form-actions:before, +.form-actions:after { + display: table; + line-height: 0; + content: ""; +} + +.form-actions:after { + clear: both; +} + +.help-block, +.help-inline { + color: #595959; +} + +.help-block { + display: block; + margin-bottom: 10px; +} + +.help-inline { + display: inline-block; + *display: inline; + padding-left: 5px; + vertical-align: middle; + *zoom: 1; +} + +.input-append, +.input-prepend { + margin-bottom: 5px; + font-size: 0; + white-space: nowrap; +} + +.input-append input, +.input-prepend input, +.input-append select, +.input-prepend select, +.input-append .uneditable-input, +.input-prepend .uneditable-input, +.input-append .dropdown-menu, +.input-prepend .dropdown-menu { + font-size: 14px; +} + +.input-append input, +.input-prepend input, +.input-append select, +.input-prepend select, +.input-append .uneditable-input, +.input-prepend .uneditable-input { + position: relative; + margin-bottom: 0; + *margin-left: 0; + vertical-align: top; + -webkit-border-radius: 0 4px 4px 0; + -moz-border-radius: 0 4px 4px 0; + border-radius: 0 4px 4px 0; +} + +.input-append input:focus, +.input-prepend input:focus, +.input-append select:focus, +.input-prepend select:focus, +.input-append .uneditable-input:focus, +.input-prepend .uneditable-input:focus { + z-index: 2; +} + +.input-append .add-on, +.input-prepend .add-on { + display: inline-block; + width: auto; + height: 20px; + min-width: 16px; + padding: 4px 5px; + font-size: 14px; + font-weight: normal; + line-height: 20px; + text-align: center; + text-shadow: 0 1px 0 #ffffff; + 
background-color: #eeeeee; + border: 1px solid #ccc; +} + +.input-append .add-on, +.input-prepend .add-on, +.input-append .btn, +.input-prepend .btn { + vertical-align: top; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.input-append .active, +.input-prepend .active { + background-color: #a9dba9; + border-color: #46a546; +} + +.input-prepend .add-on, +.input-prepend .btn { + margin-right: -1px; +} + +.input-prepend .add-on:first-child, +.input-prepend .btn:first-child { + -webkit-border-radius: 4px 0 0 4px; + -moz-border-radius: 4px 0 0 4px; + border-radius: 4px 0 0 4px; +} + +.input-append input, +.input-append select, +.input-append .uneditable-input { + -webkit-border-radius: 4px 0 0 4px; + -moz-border-radius: 4px 0 0 4px; + border-radius: 4px 0 0 4px; +} + +.input-append input + .btn-group .btn, +.input-append select + .btn-group .btn, +.input-append .uneditable-input + .btn-group .btn { + -webkit-border-radius: 0 4px 4px 0; + -moz-border-radius: 0 4px 4px 0; + border-radius: 0 4px 4px 0; +} + +.input-append .add-on, +.input-append .btn, +.input-append .btn-group { + margin-left: -1px; +} + +.input-append .add-on:last-child, +.input-append .btn:last-child { + -webkit-border-radius: 0 4px 4px 0; + -moz-border-radius: 0 4px 4px 0; + border-radius: 0 4px 4px 0; +} + +.input-prepend.input-append input, +.input-prepend.input-append select, +.input-prepend.input-append .uneditable-input { + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.input-prepend.input-append input + .btn-group .btn, +.input-prepend.input-append select + .btn-group .btn, +.input-prepend.input-append .uneditable-input + .btn-group .btn { + -webkit-border-radius: 0 4px 4px 0; + -moz-border-radius: 0 4px 4px 0; + border-radius: 0 4px 4px 0; +} + +.input-prepend.input-append .add-on:first-child, +.input-prepend.input-append .btn:first-child { + margin-right: -1px; + -webkit-border-radius: 4px 0 0 4px; + -moz-border-radius: 4px 0 0 4px; + 
border-radius: 4px 0 0 4px; +} + +.input-prepend.input-append .add-on:last-child, +.input-prepend.input-append .btn:last-child { + margin-left: -1px; + -webkit-border-radius: 0 4px 4px 0; + -moz-border-radius: 0 4px 4px 0; + border-radius: 0 4px 4px 0; +} + +.input-prepend.input-append .btn-group:first-child { + margin-left: 0; +} + +input.search-query { + padding-right: 14px; + padding-right: 4px \9; + padding-left: 14px; + padding-left: 4px \9; + /* IE7-8 doesn't have border-radius, so don't indent the padding */ + + margin-bottom: 0; + -webkit-border-radius: 15px; + -moz-border-radius: 15px; + border-radius: 15px; +} + +/* Allow for input prepend/append in search forms */ + +.form-search .input-append .search-query, +.form-search .input-prepend .search-query { + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.form-search .input-append .search-query { + -webkit-border-radius: 14px 0 0 14px; + -moz-border-radius: 14px 0 0 14px; + border-radius: 14px 0 0 14px; +} + +.form-search .input-append .btn { + -webkit-border-radius: 0 14px 14px 0; + -moz-border-radius: 0 14px 14px 0; + border-radius: 0 14px 14px 0; +} + +.form-search .input-prepend .search-query { + -webkit-border-radius: 0 14px 14px 0; + -moz-border-radius: 0 14px 14px 0; + border-radius: 0 14px 14px 0; +} + +.form-search .input-prepend .btn { + -webkit-border-radius: 14px 0 0 14px; + -moz-border-radius: 14px 0 0 14px; + border-radius: 14px 0 0 14px; +} + +.form-search input, +.form-inline input, +.form-horizontal input, +.form-search textarea, +.form-inline textarea, +.form-horizontal textarea, +.form-search select, +.form-inline select, +.form-horizontal select, +.form-search .help-inline, +.form-inline .help-inline, +.form-horizontal .help-inline, +.form-search .uneditable-input, +.form-inline .uneditable-input, +.form-horizontal .uneditable-input, +.form-search .input-prepend, +.form-inline .input-prepend, +.form-horizontal .input-prepend, +.form-search .input-append, 
+.form-inline .input-append, +.form-horizontal .input-append { + display: inline-block; + *display: inline; + margin-bottom: 0; + vertical-align: middle; + *zoom: 1; +} + +.form-search .hide, +.form-inline .hide, +.form-horizontal .hide { + display: none; +} + +.form-search label, +.form-inline label, +.form-search .btn-group, +.form-inline .btn-group { + display: inline-block; +} + +.form-search .input-append, +.form-inline .input-append, +.form-search .input-prepend, +.form-inline .input-prepend { + margin-bottom: 0; +} + +.form-search .radio, +.form-search .checkbox, +.form-inline .radio, +.form-inline .checkbox { + padding-left: 0; + margin-bottom: 0; + vertical-align: middle; +} + +.form-search .radio input[type="radio"], +.form-search .checkbox input[type="checkbox"], +.form-inline .radio input[type="radio"], +.form-inline .checkbox input[type="checkbox"] { + float: left; + margin-right: 3px; + margin-left: 0; +} + +.control-group { + margin-bottom: 10px; +} + +legend + .control-group { + margin-top: 20px; + -webkit-margin-top-collapse: separate; +} + +.form-horizontal .control-group { + margin-bottom: 20px; + *zoom: 1; +} + +.form-horizontal .control-group:before, +.form-horizontal .control-group:after { + display: table; + line-height: 0; + content: ""; +} + +.form-horizontal .control-group:after { + clear: both; +} + +.form-horizontal .control-label { + float: left; + width: 160px; + padding-top: 5px; + text-align: right; +} + +.form-horizontal .controls { + *display: inline-block; + *padding-left: 20px; + margin-left: 180px; + *margin-left: 0; +} + +.form-horizontal .controls:first-child { + *padding-left: 180px; +} + +.form-horizontal .help-block { + margin-bottom: 0; +} + +.form-horizontal input + .help-block, +.form-horizontal select + .help-block, +.form-horizontal textarea + .help-block { + margin-top: 10px; +} + +.form-horizontal .form-actions { + padding-left: 180px; +} + +table { + max-width: 100%; + background-color: transparent; + 
border-collapse: collapse; + border-spacing: 0; +} + +.table { + width: 100%; + margin-bottom: 20px; +} + +.table th, +.table td { + padding: 8px; + line-height: 20px; + text-align: left; + vertical-align: top; + border-top: 1px solid #dddddd; +} + +.table th { + font-weight: bold; +} + +.table thead th { + vertical-align: bottom; +} + +.table caption + thead tr:first-child th, +.table caption + thead tr:first-child td, +.table colgroup + thead tr:first-child th, +.table colgroup + thead tr:first-child td, +.table thead:first-child tr:first-child th, +.table thead:first-child tr:first-child td { + border-top: 0; +} + +.table tbody + tbody { + border-top: 2px solid #dddddd; +} + +.table-condensed th, +.table-condensed td { + padding: 4px 5px; +} + +.table-bordered { + border: 1px solid #dddddd; + border-collapse: separate; + *border-collapse: collapse; + border-left: 0; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +.table-bordered th, +.table-bordered td { + border-left: 1px solid #dddddd; +} + +.table-bordered caption + thead tr:first-child th, +.table-bordered caption + tbody tr:first-child th, +.table-bordered caption + tbody tr:first-child td, +.table-bordered colgroup + thead tr:first-child th, +.table-bordered colgroup + tbody tr:first-child th, +.table-bordered colgroup + tbody tr:first-child td, +.table-bordered thead:first-child tr:first-child th, +.table-bordered tbody:first-child tr:first-child th, +.table-bordered tbody:first-child tr:first-child td { + border-top: 0; +} + +.table-bordered thead:first-child tr:first-child th:first-child, +.table-bordered tbody:first-child tr:first-child td:first-child { + -webkit-border-top-left-radius: 4px; + border-top-left-radius: 4px; + -moz-border-radius-topleft: 4px; +} + +.table-bordered thead:first-child tr:first-child th:last-child, +.table-bordered tbody:first-child tr:first-child td:last-child { + -webkit-border-top-right-radius: 4px; + border-top-right-radius: 4px; + 
-moz-border-radius-topright: 4px; +} + +.table-bordered thead:last-child tr:last-child th:first-child, +.table-bordered tbody:last-child tr:last-child td:first-child, +.table-bordered tfoot:last-child tr:last-child td:first-child { + -webkit-border-radius: 0 0 0 4px; + -moz-border-radius: 0 0 0 4px; + border-radius: 0 0 0 4px; + -webkit-border-bottom-left-radius: 4px; + border-bottom-left-radius: 4px; + -moz-border-radius-bottomleft: 4px; +} + +.table-bordered thead:last-child tr:last-child th:last-child, +.table-bordered tbody:last-child tr:last-child td:last-child, +.table-bordered tfoot:last-child tr:last-child td:last-child { + -webkit-border-bottom-right-radius: 4px; + border-bottom-right-radius: 4px; + -moz-border-radius-bottomright: 4px; +} + +.table-bordered caption + thead tr:first-child th:first-child, +.table-bordered caption + tbody tr:first-child td:first-child, +.table-bordered colgroup + thead tr:first-child th:first-child, +.table-bordered colgroup + tbody tr:first-child td:first-child { + -webkit-border-top-left-radius: 4px; + border-top-left-radius: 4px; + -moz-border-radius-topleft: 4px; +} + +.table-bordered caption + thead tr:first-child th:last-child, +.table-bordered caption + tbody tr:first-child td:last-child, +.table-bordered colgroup + thead tr:first-child th:last-child, +.table-bordered colgroup + tbody tr:first-child td:last-child { + -webkit-border-top-right-radius: 4px; + border-top-right-radius: 4px; + -moz-border-radius-topright: 4px; +} + +.table-striped tbody tr:nth-child(odd) td, +.table-striped tbody tr:nth-child(odd) th { + background-color: #f9f9f9; +} + +.table-hover tbody tr:hover td, +.table-hover tbody tr:hover th { + background-color: #f5f5f5; +} + +table td[class*="span"], +table th[class*="span"], +.row-fluid table td[class*="span"], +.row-fluid table th[class*="span"] { + display: table-cell; + float: none; + margin-left: 0; +} + +.table td.span1, +.table th.span1 { + float: none; + width: 44px; + margin-left: 0; +} + 
+.table td.span2, +.table th.span2 { + float: none; + width: 124px; + margin-left: 0; +} + +.table td.span3, +.table th.span3 { + float: none; + width: 204px; + margin-left: 0; +} + +.table td.span4, +.table th.span4 { + float: none; + width: 284px; + margin-left: 0; +} + +.table td.span5, +.table th.span5 { + float: none; + width: 364px; + margin-left: 0; +} + +.table td.span6, +.table th.span6 { + float: none; + width: 444px; + margin-left: 0; +} + +.table td.span7, +.table th.span7 { + float: none; + width: 524px; + margin-left: 0; +} + +.table td.span8, +.table th.span8 { + float: none; + width: 604px; + margin-left: 0; +} + +.table td.span9, +.table th.span9 { + float: none; + width: 684px; + margin-left: 0; +} + +.table td.span10, +.table th.span10 { + float: none; + width: 764px; + margin-left: 0; +} + +.table td.span11, +.table th.span11 { + float: none; + width: 844px; + margin-left: 0; +} + +.table td.span12, +.table th.span12 { + float: none; + width: 924px; + margin-left: 0; +} + +.table tbody tr.success td { + background-color: #dff0d8; +} + +.table tbody tr.error td { + background-color: #f2dede; +} + +.table tbody tr.warning td { + background-color: #fcf8e3; +} + +.table tbody tr.info td { + background-color: #d9edf7; +} + +.table-hover tbody tr.success:hover td { + background-color: #d0e9c6; +} + +.table-hover tbody tr.error:hover td { + background-color: #ebcccc; +} + +.table-hover tbody tr.warning:hover td { + background-color: #faf2cc; +} + +.table-hover tbody tr.info:hover td { + background-color: #c4e3f3; +} + +[class^="icon-"], +[class*=" icon-"] { + display: inline-block; + width: 14px; + height: 14px; + margin-top: 1px; + *margin-right: .3em; + line-height: 14px; + vertical-align: text-top; + background-image: url("../img/glyphicons-halflings.png"); + background-position: 14px 14px; + background-repeat: no-repeat; +} + +/* White icons with optional class, or on hover/active states of certain elements */ + +.icon-white, +.nav-pills > .active 
> a > [class^="icon-"], +.nav-pills > .active > a > [class*=" icon-"], +.nav-list > .active > a > [class^="icon-"], +.nav-list > .active > a > [class*=" icon-"], +.navbar-inverse .nav > .active > a > [class^="icon-"], +.navbar-inverse .nav > .active > a > [class*=" icon-"], +.dropdown-menu > li > a:hover > [class^="icon-"], +.dropdown-menu > li > a:hover > [class*=" icon-"], +.dropdown-menu > .active > a > [class^="icon-"], +.dropdown-menu > .active > a > [class*=" icon-"], +.dropdown-submenu:hover > a > [class^="icon-"], +.dropdown-submenu:hover > a > [class*=" icon-"] { + background-image: url("../img/glyphicons-halflings-white.png"); +} + +.icon-glass { + background-position: 0 0; +} + +.icon-music { + background-position: -24px 0; +} + +.icon-search { + background-position: -48px 0; +} + +.icon-envelope { + background-position: -72px 0; +} + +.icon-heart { + background-position: -96px 0; +} + +.icon-star { + background-position: -120px 0; +} + +.icon-star-empty { + background-position: -144px 0; +} + +.icon-user { + background-position: -168px 0; +} + +.icon-film { + background-position: -192px 0; +} + +.icon-th-large { + background-position: -216px 0; +} + +.icon-th { + background-position: -240px 0; +} + +.icon-th-list { + background-position: -264px 0; +} + +.icon-ok { + background-position: -288px 0; +} + +.icon-remove { + background-position: -312px 0; +} + +.icon-zoom-in { + background-position: -336px 0; +} + +.icon-zoom-out { + background-position: -360px 0; +} + +.icon-off { + background-position: -384px 0; +} + +.icon-signal { + background-position: -408px 0; +} + +.icon-cog { + background-position: -432px 0; +} + +.icon-trash { + background-position: -456px 0; +} + +.icon-home { + background-position: 0 -24px; +} + +.icon-file { + background-position: -24px -24px; +} + +.icon-time { + background-position: -48px -24px; +} + +.icon-road { + background-position: -72px -24px; +} + +.icon-download-alt { + background-position: -96px -24px; +} + 
+.icon-download { + background-position: -120px -24px; +} + +.icon-upload { + background-position: -144px -24px; +} + +.icon-inbox { + background-position: -168px -24px; +} + +.icon-play-circle { + background-position: -192px -24px; +} + +.icon-repeat { + background-position: -216px -24px; +} + +.icon-refresh { + background-position: -240px -24px; +} + +.icon-list-alt { + background-position: -264px -24px; +} + +.icon-lock { + background-position: -287px -24px; +} + +.icon-flag { + background-position: -312px -24px; +} + +.icon-headphones { + background-position: -336px -24px; +} + +.icon-volume-off { + background-position: -360px -24px; +} + +.icon-volume-down { + background-position: -384px -24px; +} + +.icon-volume-up { + background-position: -408px -24px; +} + +.icon-qrcode { + background-position: -432px -24px; +} + +.icon-barcode { + background-position: -456px -24px; +} + +.icon-tag { + background-position: 0 -48px; +} + +.icon-tags { + background-position: -25px -48px; +} + +.icon-book { + background-position: -48px -48px; +} + +.icon-bookmark { + background-position: -72px -48px; +} + +.icon-print { + background-position: -96px -48px; +} + +.icon-camera { + background-position: -120px -48px; +} + +.icon-font { + background-position: -144px -48px; +} + +.icon-bold { + background-position: -167px -48px; +} + +.icon-italic { + background-position: -192px -48px; +} + +.icon-text-height { + background-position: -216px -48px; +} + +.icon-text-width { + background-position: -240px -48px; +} + +.icon-align-left { + background-position: -264px -48px; +} + +.icon-align-center { + background-position: -288px -48px; +} + +.icon-align-right { + background-position: -312px -48px; +} + +.icon-align-justify { + background-position: -336px -48px; +} + +.icon-list { + background-position: -360px -48px; +} + +.icon-indent-left { + background-position: -384px -48px; +} + +.icon-indent-right { + background-position: -408px -48px; +} + +.icon-facetime-video { + 
background-position: -432px -48px; +} + +.icon-picture { + background-position: -456px -48px; +} + +.icon-pencil { + background-position: 0 -72px; +} + +.icon-map-marker { + background-position: -24px -72px; +} + +.icon-adjust { + background-position: -48px -72px; +} + +.icon-tint { + background-position: -72px -72px; +} + +.icon-edit { + background-position: -96px -72px; +} + +.icon-share { + background-position: -120px -72px; +} + +.icon-check { + background-position: -144px -72px; +} + +.icon-move { + background-position: -168px -72px; +} + +.icon-step-backward { + background-position: -192px -72px; +} + +.icon-fast-backward { + background-position: -216px -72px; +} + +.icon-backward { + background-position: -240px -72px; +} + +.icon-play { + background-position: -264px -72px; +} + +.icon-pause { + background-position: -288px -72px; +} + +.icon-stop { + background-position: -312px -72px; +} + +.icon-forward { + background-position: -336px -72px; +} + +.icon-fast-forward { + background-position: -360px -72px; +} + +.icon-step-forward { + background-position: -384px -72px; +} + +.icon-eject { + background-position: -408px -72px; +} + +.icon-chevron-left { + background-position: -432px -72px; +} + +.icon-chevron-right { + background-position: -456px -72px; +} + +.icon-plus-sign { + background-position: 0 -96px; +} + +.icon-minus-sign { + background-position: -24px -96px; +} + +.icon-remove-sign { + background-position: -48px -96px; +} + +.icon-ok-sign { + background-position: -72px -96px; +} + +.icon-question-sign { + background-position: -96px -96px; +} + +.icon-info-sign { + background-position: -120px -96px; +} + +.icon-screenshot { + background-position: -144px -96px; +} + +.icon-remove-circle { + background-position: -168px -96px; +} + +.icon-ok-circle { + background-position: -192px -96px; +} + +.icon-ban-circle { + background-position: -216px -96px; +} + +.icon-arrow-left { + background-position: -240px -96px; +} + +.icon-arrow-right { + background-position: 
-264px -96px; +} + +.icon-arrow-up { + background-position: -289px -96px; +} + +.icon-arrow-down { + background-position: -312px -96px; +} + +.icon-share-alt { + background-position: -336px -96px; +} + +.icon-resize-full { + background-position: -360px -96px; +} + +.icon-resize-small { + background-position: -384px -96px; +} + +.icon-plus { + background-position: -408px -96px; +} + +.icon-minus { + background-position: -433px -96px; +} + +.icon-asterisk { + background-position: -456px -96px; +} + +.icon-exclamation-sign { + background-position: 0 -120px; +} + +.icon-gift { + background-position: -24px -120px; +} + +.icon-leaf { + background-position: -48px -120px; +} + +.icon-fire { + background-position: -72px -120px; +} + +.icon-eye-open { + background-position: -96px -120px; +} + +.icon-eye-close { + background-position: -120px -120px; +} + +.icon-warning-sign { + background-position: -144px -120px; +} + +.icon-plane { + background-position: -168px -120px; +} + +.icon-calendar { + background-position: -192px -120px; +} + +.icon-random { + width: 16px; + background-position: -216px -120px; +} + +.icon-comment { + background-position: -240px -120px; +} + +.icon-magnet { + background-position: -264px -120px; +} + +.icon-chevron-up { + background-position: -288px -120px; +} + +.icon-chevron-down { + background-position: -313px -119px; +} + +.icon-retweet { + background-position: -336px -120px; +} + +.icon-shopping-cart { + background-position: -360px -120px; +} + +.icon-folder-close { + background-position: -384px -120px; +} + +.icon-folder-open { + width: 16px; + background-position: -408px -120px; +} + +.icon-resize-vertical { + background-position: -432px -119px; +} + +.icon-resize-horizontal { + background-position: -456px -118px; +} + +.icon-hdd { + background-position: 0 -144px; +} + +.icon-bullhorn { + background-position: -24px -144px; +} + +.icon-bell { + background-position: -48px -144px; +} + +.icon-certificate { + background-position: -72px -144px; +} + 
+.icon-thumbs-up { + background-position: -96px -144px; +} + +.icon-thumbs-down { + background-position: -120px -144px; +} + +.icon-hand-right { + background-position: -144px -144px; +} + +.icon-hand-left { + background-position: -168px -144px; +} + +.icon-hand-up { + background-position: -192px -144px; +} + +.icon-hand-down { + background-position: -216px -144px; +} + +.icon-circle-arrow-right { + background-position: -240px -144px; +} + +.icon-circle-arrow-left { + background-position: -264px -144px; +} + +.icon-circle-arrow-up { + background-position: -288px -144px; +} + +.icon-circle-arrow-down { + background-position: -312px -144px; +} + +.icon-globe { + background-position: -336px -144px; +} + +.icon-wrench { + background-position: -360px -144px; +} + +.icon-tasks { + background-position: -384px -144px; +} + +.icon-filter { + background-position: -408px -144px; +} + +.icon-briefcase { + background-position: -432px -144px; +} + +.icon-fullscreen { + background-position: -456px -144px; +} + +.dropup, +.dropdown { + position: relative; +} + +.dropdown-toggle { + *margin-bottom: -3px; +} + +.dropdown-toggle:active, +.open .dropdown-toggle { + outline: 0; +} + +.caret { + display: inline-block; + width: 0; + height: 0; + vertical-align: top; + border-top: 4px solid #000000; + border-right: 4px solid transparent; + border-left: 4px solid transparent; + content: ""; +} + +.dropdown .caret { + margin-top: 8px; + margin-left: 2px; +} + +.dropdown-menu { + position: absolute; + top: 100%; + left: 0; + z-index: 1000; + display: none; + float: left; + min-width: 160px; + padding: 5px 0; + margin: 2px 0 0; + list-style: none; + background-color: #ffffff; + border: 1px solid #ccc; + border: 1px solid rgba(0, 0, 0, 0.2); + *border-right-width: 2px; + *border-bottom-width: 2px; + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; + -webkit-box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + -moz-box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + box-shadow: 0 
5px 10px rgba(0, 0, 0, 0.2); + -webkit-background-clip: padding-box; + -moz-background-clip: padding; + background-clip: padding-box; +} + +.dropdown-menu.pull-right { + right: 0; + left: auto; +} + +.dropdown-menu .divider { + *width: 100%; + height: 1px; + margin: 9px 1px; + *margin: -5px 0 5px; + overflow: hidden; + background-color: #e5e5e5; + border-bottom: 1px solid #ffffff; +} + +.dropdown-menu li > a { + display: block; + padding: 3px 20px; + clear: both; + font-weight: normal; + line-height: 20px; + color: #333333; + white-space: nowrap; +} + +.dropdown-menu li > a:hover, +.dropdown-menu li > a:focus, +.dropdown-submenu:hover > a { + color: #ffffff; + text-decoration: none; + background-color: #0081c2; + background-image: -moz-linear-gradient(top, #0088cc, #0077b3); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#0088cc), to(#0077b3)); + background-image: -webkit-linear-gradient(top, #0088cc, #0077b3); + background-image: -o-linear-gradient(top, #0088cc, #0077b3); + background-image: linear-gradient(to bottom, #0088cc, #0077b3); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff0088cc', endColorstr='#ff0077b3', GradientType=0); +} + +.dropdown-menu .active > a, +.dropdown-menu .active > a:hover { + color: #333333; + text-decoration: none; + background-color: #0081c2; + background-image: -moz-linear-gradient(top, #0088cc, #0077b3); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#0088cc), to(#0077b3)); + background-image: -webkit-linear-gradient(top, #0088cc, #0077b3); + background-image: -o-linear-gradient(top, #0088cc, #0077b3); + background-image: linear-gradient(to bottom, #0088cc, #0077b3); + background-repeat: repeat-x; + outline: 0; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff0088cc', endColorstr='#ff0077b3', GradientType=0); +} + +.dropdown-menu .disabled > a, +.dropdown-menu .disabled > a:hover { + color: #999999; +} + +.dropdown-menu 
.disabled > a:hover { + text-decoration: none; + cursor: default; + background-color: transparent; + background-image: none; +} + +.open { + *z-index: 1000; +} + +.open > .dropdown-menu { + display: block; +} + +.pull-right > .dropdown-menu { + right: 0; + left: auto; +} + +.dropup .caret, +.navbar-fixed-bottom .dropdown .caret { + border-top: 0; + border-bottom: 4px solid #000000; + content: ""; +} + +.dropup .dropdown-menu, +.navbar-fixed-bottom .dropdown .dropdown-menu { + top: auto; + bottom: 100%; + margin-bottom: 1px; +} + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu > .dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + -webkit-border-radius: 0 6px 6px 6px; + -moz-border-radius: 0 6px 6px 6px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover > .dropdown-menu { + display: block; +} + +.dropup .dropdown-submenu > .dropdown-menu { + top: auto; + bottom: 0; + margin-top: 0; + margin-bottom: -2px; + -webkit-border-radius: 5px 5px 5px 0; + -moz-border-radius: 5px 5px 5px 0; + border-radius: 5px 5px 5px 0; +} + +.dropdown-submenu > a:after { + display: block; + float: right; + width: 0; + height: 0; + margin-top: 5px; + margin-right: -10px; + border-color: transparent; + border-left-color: #cccccc; + border-style: solid; + border-width: 5px 0 5px 5px; + content: " "; +} + +.dropdown-submenu:hover > a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left > .dropdown-menu { + left: -100%; + margin-left: 10px; + -webkit-border-radius: 6px 0 6px 6px; + -moz-border-radius: 6px 0 6px 6px; + border-radius: 6px 0 6px 6px; +} + +.dropdown .dropdown-menu .nav-header { + padding-right: 20px; + padding-left: 20px; +} + +.typeahead { + margin-top: 2px; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +.well { + min-height: 20px; + padding: 19px; + margin-bottom: 20px; + background-color: #f5f5f5; + border: 1px 
solid #e3e3e3; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.05); + -moz-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.05); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.05); +} + +.well blockquote { + border-color: #ddd; + border-color: rgba(0, 0, 0, 0.15); +} + +.well-large { + padding: 24px; + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; +} + +.well-small { + padding: 9px; + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +.fade { + opacity: 0; + -webkit-transition: opacity 0.15s linear; + -moz-transition: opacity 0.15s linear; + -o-transition: opacity 0.15s linear; + transition: opacity 0.15s linear; +} + +.fade.in { + opacity: 1; +} + +.collapse { + position: relative; + height: 0; + overflow: hidden; + -webkit-transition: height 0.35s ease; + -moz-transition: height 0.35s ease; + -o-transition: height 0.35s ease; + transition: height 0.35s ease; +} + +.collapse.in { + height: auto; +} + +.close { + float: right; + font-size: 20px; + font-weight: bold; + line-height: 20px; + color: #000000; + text-shadow: 0 1px 0 #ffffff; + opacity: 0.2; + filter: alpha(opacity=20); +} + +.close:hover { + color: #000000; + text-decoration: none; + cursor: pointer; + opacity: 0.4; + filter: alpha(opacity=40); +} + +button.close { + padding: 0; + cursor: pointer; + background: transparent; + border: 0; + -webkit-appearance: none; +} + +.btn { + display: inline-block; + *display: inline; + padding: 4px 12px; + margin-bottom: 0; + *margin-left: .3em; + font-size: 14px; + line-height: 20px; + *line-height: 20px; + color: #333333; + text-align: center; + text-shadow: 0 1px 1px rgba(255, 255, 255, 0.75); + vertical-align: middle; + cursor: pointer; + background-color: #f5f5f5; + *background-color: #e6e6e6; + background-image: -moz-linear-gradient(top, #ffffff, #e6e6e6); + background-image: -webkit-gradient(linear, 0 0, 0 100%, 
from(#ffffff), to(#e6e6e6)); + background-image: -webkit-linear-gradient(top, #ffffff, #e6e6e6); + background-image: -o-linear-gradient(top, #ffffff, #e6e6e6); + background-image: linear-gradient(to bottom, #ffffff, #e6e6e6); + background-repeat: repeat-x; + border: 1px solid #bbbbbb; + *border: 0; + border-color: #e6e6e6 #e6e6e6 #bfbfbf; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + border-bottom-color: #a2a2a2; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe6e6e6', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); + *zoom: 1; + -webkit-box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.2), 0 1px 2px rgba(0, 0, 0, 0.05); + -moz-box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.2), 0 1px 2px rgba(0, 0, 0, 0.05); + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.2), 0 1px 2px rgba(0, 0, 0, 0.05); +} + +.btn:hover, +.btn:active, +.btn.active, +.btn.disabled, +.btn[disabled] { + color: #333333; + background-color: #e6e6e6; + *background-color: #d9d9d9; +} + +.btn:active, +.btn.active { + background-color: #cccccc \9; +} + +.btn:first-child { + *margin-left: 0; +} + +.btn:hover { + color: #333333; + text-decoration: none; + background-color: #e6e6e6; + *background-color: #d9d9d9; + /* Buttons in IE7 don't get borders, so darken on hover */ + + background-position: 0 -15px; + -webkit-transition: background-position 0.1s linear; + -moz-transition: background-position 0.1s linear; + -o-transition: background-position 0.1s linear; + transition: background-position 0.1s linear; +} + +.btn:focus { + outline: thin dotted #333; + outline: 5px auto -webkit-focus-ring-color; + outline-offset: -2px; +} + +.btn.active, +.btn:active { + background-color: #e6e6e6; + background-color: #d9d9d9 \9; + background-image: none; + outline: 0; + -webkit-box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.15), 0 
1px 2px rgba(0, 0, 0, 0.05); + -moz-box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.15), 0 1px 2px rgba(0, 0, 0, 0.05); + box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.15), 0 1px 2px rgba(0, 0, 0, 0.05); +} + +.btn.disabled, +.btn[disabled] { + cursor: default; + background-color: #e6e6e6; + background-image: none; + opacity: 0.65; + filter: alpha(opacity=65); + -webkit-box-shadow: none; + -moz-box-shadow: none; + box-shadow: none; +} + +.btn-large { + padding: 11px 19px; + font-size: 17.5px; + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; +} + +.btn-large [class^="icon-"], +.btn-large [class*=" icon-"] { + margin-top: 2px; +} + +.btn-small { + padding: 2px 10px; + font-size: 11.9px; + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +.btn-small [class^="icon-"], +.btn-small [class*=" icon-"] { + margin-top: 0; +} + +.btn-mini { + padding: 1px 6px; + font-size: 10.5px; + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +.btn-block { + display: block; + width: 100%; + padding-right: 0; + padding-left: 0; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +.btn-block + .btn-block { + margin-top: 5px; +} + +input[type="submit"].btn-block, +input[type="reset"].btn-block, +input[type="button"].btn-block { + width: 100%; +} + +.btn-primary.active, +.btn-warning.active, +.btn-danger.active, +.btn-success.active, +.btn-info.active, +.btn-inverse.active { + color: rgba(255, 255, 255, 0.75); +} + +.btn { + border-color: #c5c5c5; + border-color: rgba(0, 0, 0, 0.15) rgba(0, 0, 0, 0.15) rgba(0, 0, 0, 0.25); +} + +.btn-primary { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #006dcc; + *background-color: #0044cc; + background-image: -moz-linear-gradient(top, #0088cc, #0044cc); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#0088cc), to(#0044cc)); + background-image: -webkit-linear-gradient(top, 
#0088cc, #0044cc); + background-image: -o-linear-gradient(top, #0088cc, #0044cc); + background-image: linear-gradient(to bottom, #0088cc, #0044cc); + background-repeat: repeat-x; + border-color: #0044cc #0044cc #002a80; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff0088cc', endColorstr='#ff0044cc', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.btn-primary:hover, +.btn-primary:active, +.btn-primary.active, +.btn-primary.disabled, +.btn-primary[disabled] { + color: #ffffff; + background-color: #0044cc; + *background-color: #003bb3; +} + +.btn-primary:active, +.btn-primary.active { + background-color: #003399 \9; +} + +.btn-warning { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #faa732; + *background-color: #f89406; + background-image: -moz-linear-gradient(top, #fbb450, #f89406); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#fbb450), to(#f89406)); + background-image: -webkit-linear-gradient(top, #fbb450, #f89406); + background-image: -o-linear-gradient(top, #fbb450, #f89406); + background-image: linear-gradient(to bottom, #fbb450, #f89406); + background-repeat: repeat-x; + border-color: #f89406 #f89406 #ad6704; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffbb450', endColorstr='#fff89406', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.btn-warning:hover, +.btn-warning:active, +.btn-warning.active, +.btn-warning.disabled, +.btn-warning[disabled] { + color: #ffffff; + background-color: #f89406; + *background-color: #df8505; +} + +.btn-warning:active, +.btn-warning.active { + background-color: #c67605 \9; +} + +.btn-danger { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #da4f49; + 
*background-color: #bd362f; + background-image: -moz-linear-gradient(top, #ee5f5b, #bd362f); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#ee5f5b), to(#bd362f)); + background-image: -webkit-linear-gradient(top, #ee5f5b, #bd362f); + background-image: -o-linear-gradient(top, #ee5f5b, #bd362f); + background-image: linear-gradient(to bottom, #ee5f5b, #bd362f); + background-repeat: repeat-x; + border-color: #bd362f #bd362f #802420; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffee5f5b', endColorstr='#ffbd362f', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.btn-danger:hover, +.btn-danger:active, +.btn-danger.active, +.btn-danger.disabled, +.btn-danger[disabled] { + color: #ffffff; + background-color: #bd362f; + *background-color: #a9302a; +} + +.btn-danger:active, +.btn-danger.active { + background-color: #942a25 \9; +} + +.btn-success { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #5bb75b; + *background-color: #51a351; + background-image: -moz-linear-gradient(top, #62c462, #51a351); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#62c462), to(#51a351)); + background-image: -webkit-linear-gradient(top, #62c462, #51a351); + background-image: -o-linear-gradient(top, #62c462, #51a351); + background-image: linear-gradient(to bottom, #62c462, #51a351); + background-repeat: repeat-x; + border-color: #51a351 #51a351 #387038; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff62c462', endColorstr='#ff51a351', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.btn-success:hover, +.btn-success:active, +.btn-success.active, +.btn-success.disabled, +.btn-success[disabled] { + color: #ffffff; + background-color: #51a351; + 
*background-color: #499249; +} + +.btn-success:active, +.btn-success.active { + background-color: #408140 \9; +} + +.btn-info { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #49afcd; + *background-color: #2f96b4; + background-image: -moz-linear-gradient(top, #5bc0de, #2f96b4); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#5bc0de), to(#2f96b4)); + background-image: -webkit-linear-gradient(top, #5bc0de, #2f96b4); + background-image: -o-linear-gradient(top, #5bc0de, #2f96b4); + background-image: linear-gradient(to bottom, #5bc0de, #2f96b4); + background-repeat: repeat-x; + border-color: #2f96b4 #2f96b4 #1f6377; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2f96b4', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.btn-info:hover, +.btn-info:active, +.btn-info.active, +.btn-info.disabled, +.btn-info[disabled] { + color: #ffffff; + background-color: #2f96b4; + *background-color: #2a85a0; +} + +.btn-info:active, +.btn-info.active { + background-color: #24748c \9; +} + +.btn-inverse { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #363636; + *background-color: #222222; + background-image: -moz-linear-gradient(top, #444444, #222222); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#444444), to(#222222)); + background-image: -webkit-linear-gradient(top, #444444, #222222); + background-image: -o-linear-gradient(top, #444444, #222222); + background-image: linear-gradient(to bottom, #444444, #222222); + background-repeat: repeat-x; + border-color: #222222 #222222 #000000; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff444444', endColorstr='#ff222222', GradientType=0); + filter: 
progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.btn-inverse:hover, +.btn-inverse:active, +.btn-inverse.active, +.btn-inverse.disabled, +.btn-inverse[disabled] { + color: #ffffff; + background-color: #222222; + *background-color: #151515; +} + +.btn-inverse:active, +.btn-inverse.active { + background-color: #080808 \9; +} + +button.btn, +input[type="submit"].btn { + *padding-top: 3px; + *padding-bottom: 3px; +} + +button.btn::-moz-focus-inner, +input[type="submit"].btn::-moz-focus-inner { + padding: 0; + border: 0; +} + +button.btn.btn-large, +input[type="submit"].btn.btn-large { + *padding-top: 7px; + *padding-bottom: 7px; +} + +button.btn.btn-small, +input[type="submit"].btn.btn-small { + *padding-top: 3px; + *padding-bottom: 3px; +} + +button.btn.btn-mini, +input[type="submit"].btn.btn-mini { + *padding-top: 1px; + *padding-bottom: 1px; +} + +.btn-link, +.btn-link:active, +.btn-link[disabled] { + background-color: transparent; + background-image: none; + -webkit-box-shadow: none; + -moz-box-shadow: none; + box-shadow: none; +} + +.btn-link { + color: #0088cc; + cursor: pointer; + border-color: transparent; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.btn-link:hover { + color: #005580; + text-decoration: underline; + background-color: transparent; +} + +.btn-link[disabled]:hover { + color: #333333; + text-decoration: none; +} + +.btn-group { + position: relative; + display: inline-block; + *display: inline; + *margin-left: .3em; + font-size: 0; + white-space: nowrap; + vertical-align: middle; + *zoom: 1; +} + +.btn-group:first-child { + *margin-left: 0; +} + +.btn-group + .btn-group { + margin-left: 5px; +} + +.btn-toolbar { + margin-top: 10px; + margin-bottom: 10px; + font-size: 0; +} + +.btn-toolbar .btn + .btn, +.btn-toolbar .btn-group + .btn, +.btn-toolbar .btn + .btn-group { + margin-left: 5px; +} + +.btn-group > .btn { + position: relative; + -webkit-border-radius: 0; + -moz-border-radius: 0; + 
border-radius: 0; +} + +.btn-group > .btn + .btn { + margin-left: -1px; +} + +.btn-group > .btn, +.btn-group > .dropdown-menu { + font-size: 14px; +} + +.btn-group > .btn-mini { + font-size: 11px; +} + +.btn-group > .btn-small { + font-size: 12px; +} + +.btn-group > .btn-large { + font-size: 16px; +} + +.btn-group > .btn:first-child { + margin-left: 0; + -webkit-border-bottom-left-radius: 4px; + border-bottom-left-radius: 4px; + -webkit-border-top-left-radius: 4px; + border-top-left-radius: 4px; + -moz-border-radius-bottomleft: 4px; + -moz-border-radius-topleft: 4px; +} + +.btn-group > .btn:last-child, +.btn-group > .dropdown-toggle { + -webkit-border-top-right-radius: 4px; + border-top-right-radius: 4px; + -webkit-border-bottom-right-radius: 4px; + border-bottom-right-radius: 4px; + -moz-border-radius-topright: 4px; + -moz-border-radius-bottomright: 4px; +} + +.btn-group > .btn.large:first-child { + margin-left: 0; + -webkit-border-bottom-left-radius: 6px; + border-bottom-left-radius: 6px; + -webkit-border-top-left-radius: 6px; + border-top-left-radius: 6px; + -moz-border-radius-bottomleft: 6px; + -moz-border-radius-topleft: 6px; +} + +.btn-group > .btn.large:last-child, +.btn-group > .large.dropdown-toggle { + -webkit-border-top-right-radius: 6px; + border-top-right-radius: 6px; + -webkit-border-bottom-right-radius: 6px; + border-bottom-right-radius: 6px; + -moz-border-radius-topright: 6px; + -moz-border-radius-bottomright: 6px; +} + +.btn-group > .btn:hover, +.btn-group > .btn:focus, +.btn-group > .btn:active, +.btn-group > .btn.active { + z-index: 2; +} + +.btn-group .dropdown-toggle:active, +.btn-group.open .dropdown-toggle { + outline: 0; +} + +.btn-group > .btn + .dropdown-toggle { + *padding-top: 5px; + padding-right: 8px; + *padding-bottom: 5px; + padding-left: 8px; + -webkit-box-shadow: inset 1px 0 0 rgba(255, 255, 255, 0.125), inset 0 1px 0 rgba(255, 255, 255, 0.2), 0 1px 2px rgba(0, 0, 0, 0.05); + -moz-box-shadow: inset 1px 0 0 rgba(255, 255, 255, 
0.125), inset 0 1px 0 rgba(255, 255, 255, 0.2), 0 1px 2px rgba(0, 0, 0, 0.05); + box-shadow: inset 1px 0 0 rgba(255, 255, 255, 0.125), inset 0 1px 0 rgba(255, 255, 255, 0.2), 0 1px 2px rgba(0, 0, 0, 0.05); +} + +.btn-group > .btn-mini + .dropdown-toggle { + *padding-top: 2px; + padding-right: 5px; + *padding-bottom: 2px; + padding-left: 5px; +} + +.btn-group > .btn-small + .dropdown-toggle { + *padding-top: 5px; + *padding-bottom: 4px; +} + +.btn-group > .btn-large + .dropdown-toggle { + *padding-top: 7px; + padding-right: 12px; + *padding-bottom: 7px; + padding-left: 12px; +} + +.btn-group.open .dropdown-toggle { + background-image: none; + -webkit-box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.15), 0 1px 2px rgba(0, 0, 0, 0.05); + -moz-box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.15), 0 1px 2px rgba(0, 0, 0, 0.05); + box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.15), 0 1px 2px rgba(0, 0, 0, 0.05); +} + +.btn-group.open .btn.dropdown-toggle { + background-color: #e6e6e6; +} + +.btn-group.open .btn-primary.dropdown-toggle { + background-color: #0044cc; +} + +.btn-group.open .btn-warning.dropdown-toggle { + background-color: #f89406; +} + +.btn-group.open .btn-danger.dropdown-toggle { + background-color: #bd362f; +} + +.btn-group.open .btn-success.dropdown-toggle { + background-color: #51a351; +} + +.btn-group.open .btn-info.dropdown-toggle { + background-color: #2f96b4; +} + +.btn-group.open .btn-inverse.dropdown-toggle { + background-color: #222222; +} + +.btn .caret { + margin-top: 8px; + margin-left: 0; +} + +.btn-mini .caret, +.btn-small .caret, +.btn-large .caret { + margin-top: 6px; +} + +.btn-large .caret { + border-top-width: 5px; + border-right-width: 5px; + border-left-width: 5px; +} + +.dropup .btn-large .caret { + border-bottom-width: 5px; +} + +.btn-primary .caret, +.btn-warning .caret, +.btn-danger .caret, +.btn-info .caret, +.btn-success .caret, +.btn-inverse .caret { + border-top-color: #ffffff; + border-bottom-color: #ffffff; +} + +.btn-group-vertical { 
+ display: inline-block; + *display: inline; + /* IE7 inline-block hack */ + + *zoom: 1; +} + +.btn-group-vertical .btn { + display: block; + float: none; + width: 100%; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.btn-group-vertical .btn + .btn { + margin-top: -1px; + margin-left: 0; +} + +.btn-group-vertical .btn:first-child { + -webkit-border-radius: 4px 4px 0 0; + -moz-border-radius: 4px 4px 0 0; + border-radius: 4px 4px 0 0; +} + +.btn-group-vertical .btn:last-child { + -webkit-border-radius: 0 0 4px 4px; + -moz-border-radius: 0 0 4px 4px; + border-radius: 0 0 4px 4px; +} + +.btn-group-vertical .btn-large:first-child { + -webkit-border-radius: 6px 6px 0 0; + -moz-border-radius: 6px 6px 0 0; + border-radius: 6px 6px 0 0; +} + +.btn-group-vertical .btn-large:last-child { + -webkit-border-radius: 0 0 6px 6px; + -moz-border-radius: 0 0 6px 6px; + border-radius: 0 0 6px 6px; +} + +.alert { + padding: 8px 35px 8px 14px; + margin-bottom: 20px; + color: #c09853; + text-shadow: 0 1px 0 rgba(255, 255, 255, 0.5); + background-color: #fcf8e3; + border: 1px solid #fbeed5; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +.alert h4 { + margin: 0; +} + +.alert .close { + position: relative; + top: -2px; + right: -21px; + line-height: 20px; +} + +.alert-success { + color: #468847; + background-color: #dff0d8; + border-color: #d6e9c6; +} + +.alert-danger, +.alert-error { + color: #b94a48; + background-color: #f2dede; + border-color: #eed3d7; +} + +.alert-info { + color: #3a87ad; + background-color: #d9edf7; + border-color: #bce8f1; +} + +.alert-block { + padding-top: 14px; + padding-bottom: 14px; +} + +.alert-block > p, +.alert-block > ul { + margin-bottom: 0; +} + +.alert-block p + p { + margin-top: 5px; +} + +.nav { + margin-bottom: 20px; + margin-left: 0; + list-style: none; +} + +.nav > li > a { + display: block; +} + +.nav > li > a:hover { + text-decoration: none; + background-color: #eeeeee; +} + +.nav 
> .pull-right { + float: right; +} + +.nav-header { + display: block; + padding: 3px 15px; + font-size: 11px; + font-weight: bold; + line-height: 20px; + color: #999999; + text-shadow: 0 1px 0 rgba(255, 255, 255, 0.5); + text-transform: uppercase; +} + +.nav li + .nav-header { + margin-top: 9px; +} + +.nav-list { + padding-right: 15px; + padding-left: 15px; + margin-bottom: 0; +} + +.nav-list > li > a, +.nav-list .nav-header { + margin-right: -15px; + margin-left: -15px; + text-shadow: 0 1px 0 rgba(255, 255, 255, 0.5); +} + +.nav-list > li > a { + padding: 3px 15px; +} + +.nav-list > .active > a, +.nav-list > .active > a:hover { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.2); + background-color: #0088cc; +} + +.nav-list [class^="icon-"], +.nav-list [class*=" icon-"] { + margin-right: 2px; +} + +.nav-list .divider { + *width: 100%; + height: 1px; + margin: 9px 1px; + *margin: -5px 0 5px; + overflow: hidden; + background-color: #e5e5e5; + border-bottom: 1px solid #ffffff; +} + +.nav-tabs, +.nav-pills { + *zoom: 1; +} + +.nav-tabs:before, +.nav-pills:before, +.nav-tabs:after, +.nav-pills:after { + display: table; + line-height: 0; + content: ""; +} + +.nav-tabs:after, +.nav-pills:after { + clear: both; +} + +.nav-tabs > li, +.nav-pills > li { + float: left; +} + +.nav-tabs > li > a, +.nav-pills > li > a { + padding-right: 12px; + padding-left: 12px; + margin-right: 2px; + line-height: 14px; +} + +.nav-tabs { + border-bottom: 1px solid #ddd; +} + +.nav-tabs > li { + margin-bottom: -1px; +} + +.nav-tabs > li > a { + padding-top: 8px; + padding-bottom: 8px; + line-height: 20px; + border: 1px solid transparent; + -webkit-border-radius: 4px 4px 0 0; + -moz-border-radius: 4px 4px 0 0; + border-radius: 4px 4px 0 0; +} + +.nav-tabs > li > a:hover { + border-color: #eeeeee #eeeeee #dddddd; +} + +.nav-tabs > .active > a, +.nav-tabs > .active > a:hover { + color: #555555; + cursor: default; + background-color: #ffffff; + border: 1px solid #ddd; + 
border-bottom-color: transparent; +} + +.nav-pills > li > a { + padding-top: 8px; + padding-bottom: 8px; + margin-top: 2px; + margin-bottom: 2px; + -webkit-border-radius: 5px; + -moz-border-radius: 5px; + border-radius: 5px; +} + +.nav-pills > .active > a, +.nav-pills > .active > a:hover { + color: #ffffff; + background-color: #0088cc; +} + +.nav-stacked > li { + float: none; +} + +.nav-stacked > li > a { + margin-right: 0; +} + +.nav-tabs.nav-stacked { + border-bottom: 0; +} + +.nav-tabs.nav-stacked > li > a { + border: 1px solid #ddd; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.nav-tabs.nav-stacked > li:first-child > a { + -webkit-border-top-right-radius: 4px; + border-top-right-radius: 4px; + -webkit-border-top-left-radius: 4px; + border-top-left-radius: 4px; + -moz-border-radius-topright: 4px; + -moz-border-radius-topleft: 4px; +} + +.nav-tabs.nav-stacked > li:last-child > a { + -webkit-border-bottom-right-radius: 4px; + border-bottom-right-radius: 4px; + -webkit-border-bottom-left-radius: 4px; + border-bottom-left-radius: 4px; + -moz-border-radius-bottomright: 4px; + -moz-border-radius-bottomleft: 4px; +} + +.nav-tabs.nav-stacked > li > a:hover { + z-index: 2; + border-color: #ddd; +} + +.nav-pills.nav-stacked > li > a { + margin-bottom: 3px; +} + +.nav-pills.nav-stacked > li:last-child > a { + margin-bottom: 1px; +} + +.nav-tabs .dropdown-menu { + -webkit-border-radius: 0 0 6px 6px; + -moz-border-radius: 0 0 6px 6px; + border-radius: 0 0 6px 6px; +} + +.nav-pills .dropdown-menu { + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; +} + +.nav .dropdown-toggle .caret { + margin-top: 6px; + border-top-color: #0088cc; + border-bottom-color: #0088cc; +} + +.nav .dropdown-toggle:hover .caret { + border-top-color: #005580; + border-bottom-color: #005580; +} + +/* move down carets for tabs */ + +.nav-tabs .dropdown-toggle .caret { + margin-top: 8px; +} + +.nav .active .dropdown-toggle .caret { + 
border-top-color: #fff; + border-bottom-color: #fff; +} + +.nav-tabs .active .dropdown-toggle .caret { + border-top-color: #555555; + border-bottom-color: #555555; +} + +.nav > .dropdown.active > a:hover { + cursor: pointer; +} + +.nav-tabs .open .dropdown-toggle, +.nav-pills .open .dropdown-toggle, +.nav > li.dropdown.open.active > a:hover { + color: #ffffff; + background-color: #999999; + border-color: #999999; +} + +.nav li.dropdown.open .caret, +.nav li.dropdown.open.active .caret, +.nav li.dropdown.open a:hover .caret { + border-top-color: #ffffff; + border-bottom-color: #ffffff; + opacity: 1; + filter: alpha(opacity=100); +} + +.tabs-stacked .open > a:hover { + border-color: #999999; +} + +.tabbable { + *zoom: 1; +} + +.tabbable:before, +.tabbable:after { + display: table; + line-height: 0; + content: ""; +} + +.tabbable:after { + clear: both; +} + +.tab-content { + overflow: auto; +} + +.tabs-below > .nav-tabs, +.tabs-right > .nav-tabs, +.tabs-left > .nav-tabs { + border-bottom: 0; +} + +.tab-content > .tab-pane, +.pill-content > .pill-pane { + display: none; +} + +.tab-content > .active, +.pill-content > .active { + display: block; +} + +.tabs-below > .nav-tabs { + border-top: 1px solid #ddd; +} + +.tabs-below > .nav-tabs > li { + margin-top: -1px; + margin-bottom: 0; +} + +.tabs-below > .nav-tabs > li > a { + -webkit-border-radius: 0 0 4px 4px; + -moz-border-radius: 0 0 4px 4px; + border-radius: 0 0 4px 4px; +} + +.tabs-below > .nav-tabs > li > a:hover { + border-top-color: #ddd; + border-bottom-color: transparent; +} + +.tabs-below > .nav-tabs > .active > a, +.tabs-below > .nav-tabs > .active > a:hover { + border-color: transparent #ddd #ddd #ddd; +} + +.tabs-left > .nav-tabs > li, +.tabs-right > .nav-tabs > li { + float: none; +} + +.tabs-left > .nav-tabs > li > a, +.tabs-right > .nav-tabs > li > a { + min-width: 74px; + margin-right: 0; + margin-bottom: 3px; +} + +.tabs-left > .nav-tabs { + float: left; + margin-right: 19px; + border-right: 1px solid 
#ddd; +} + +.tabs-left > .nav-tabs > li > a { + margin-right: -1px; + -webkit-border-radius: 4px 0 0 4px; + -moz-border-radius: 4px 0 0 4px; + border-radius: 4px 0 0 4px; +} + +.tabs-left > .nav-tabs > li > a:hover { + border-color: #eeeeee #dddddd #eeeeee #eeeeee; +} + +.tabs-left > .nav-tabs .active > a, +.tabs-left > .nav-tabs .active > a:hover { + border-color: #ddd transparent #ddd #ddd; + *border-right-color: #ffffff; +} + +.tabs-right > .nav-tabs { + float: right; + margin-left: 19px; + border-left: 1px solid #ddd; +} + +.tabs-right > .nav-tabs > li > a { + margin-left: -1px; + -webkit-border-radius: 0 4px 4px 0; + -moz-border-radius: 0 4px 4px 0; + border-radius: 0 4px 4px 0; +} + +.tabs-right > .nav-tabs > li > a:hover { + border-color: #eeeeee #eeeeee #eeeeee #dddddd; +} + +.tabs-right > .nav-tabs .active > a, +.tabs-right > .nav-tabs .active > a:hover { + border-color: #ddd #ddd #ddd transparent; + *border-left-color: #ffffff; +} + +.nav > .disabled > a { + color: #999999; +} + +.nav > .disabled > a:hover { + text-decoration: none; + cursor: default; + background-color: transparent; +} + +.navbar { + *position: relative; + *z-index: 2; + margin-bottom: 20px; + overflow: visible; + color: #777777; +} + +.navbar-inner { + min-height: 40px; + padding-right: 20px; + padding-left: 20px; + background-color: #fafafa; + background-image: -moz-linear-gradient(top, #ffffff, #f2f2f2); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#ffffff), to(#f2f2f2)); + background-image: -webkit-linear-gradient(top, #ffffff, #f2f2f2); + background-image: -o-linear-gradient(top, #ffffff, #f2f2f2); + background-image: linear-gradient(to bottom, #ffffff, #f2f2f2); + background-repeat: repeat-x; + border: 1px solid #d4d4d4; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff2f2f2', GradientType=0); + *zoom: 1; + -webkit-box-shadow: 0 1px 4px 
rgba(0, 0, 0, 0.065); + -moz-box-shadow: 0 1px 4px rgba(0, 0, 0, 0.065); + box-shadow: 0 1px 4px rgba(0, 0, 0, 0.065); +} + +.navbar-inner:before, +.navbar-inner:after { + display: table; + line-height: 0; + content: ""; +} + +.navbar-inner:after { + clear: both; +} + +.navbar .container { + width: auto; +} + +.nav-collapse.collapse { + height: auto; + overflow: visible; +} + +.navbar .brand { + display: block; + float: left; + padding: 10px 20px 10px; + margin-left: -20px; + font-size: 20px; + font-weight: 200; + color: #777777; + text-shadow: 0 1px 0 #ffffff; +} + +.navbar .brand:hover { + text-decoration: none; +} + +.navbar-text { + margin-bottom: 0; + line-height: 40px; +} + +.navbar-link { + color: #777777; +} + +.navbar-link:hover { + color: #333333; +} + +.navbar .divider-vertical { + height: 40px; + margin: 0 9px; + border-right: 1px solid #ffffff; + border-left: 1px solid #f2f2f2; +} + +.navbar .btn, +.navbar .btn-group { + margin-top: 5px; +} + +.navbar .btn-group .btn, +.navbar .input-prepend .btn, +.navbar .input-append .btn { + margin-top: 0; +} + +.navbar-form { + margin-bottom: 0; + *zoom: 1; +} + +.navbar-form:before, +.navbar-form:after { + display: table; + line-height: 0; + content: ""; +} + +.navbar-form:after { + clear: both; +} + +.navbar-form input, +.navbar-form select, +.navbar-form .radio, +.navbar-form .checkbox { + margin-top: 5px; +} + +.navbar-form input, +.navbar-form select, +.navbar-form .btn { + display: inline-block; + margin-bottom: 0; +} + +.navbar-form input[type="image"], +.navbar-form input[type="checkbox"], +.navbar-form input[type="radio"] { + margin-top: 3px; +} + +.navbar-form .input-append, +.navbar-form .input-prepend { + margin-top: 6px; + white-space: nowrap; +} + +.navbar-form .input-append input, +.navbar-form .input-prepend input { + margin-top: 0; +} + +.navbar-search { + position: relative; + float: left; + margin-top: 5px; + margin-bottom: 0; +} + +.navbar-search .search-query { + padding: 4px 14px; + 
margin-bottom: 0; + font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; + font-size: 13px; + font-weight: normal; + line-height: 1; + -webkit-border-radius: 15px; + -moz-border-radius: 15px; + border-radius: 15px; +} + +.navbar-static-top { + position: static; + margin-bottom: 0; +} + +.navbar-static-top .navbar-inner { + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.navbar-fixed-top, +.navbar-fixed-bottom { + position: fixed; + right: 0; + left: 0; + z-index: 1030; + margin-bottom: 0; +} + +.navbar-fixed-top .navbar-inner, +.navbar-static-top .navbar-inner { + border-width: 0 0 1px; +} + +.navbar-fixed-bottom .navbar-inner { + border-width: 1px 0 0; +} + +.navbar-fixed-top .navbar-inner, +.navbar-fixed-bottom .navbar-inner { + padding-right: 0; + padding-left: 0; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.navbar-static-top .container, +.navbar-fixed-top .container, +.navbar-fixed-bottom .container { + width: 940px; +} + +.navbar-fixed-top { + top: 0; +} + +.navbar-fixed-top .navbar-inner, +.navbar-static-top .navbar-inner { + -webkit-box-shadow: 0 1px 10px rgba(0, 0, 0, 0.1); + -moz-box-shadow: 0 1px 10px rgba(0, 0, 0, 0.1); + box-shadow: 0 1px 10px rgba(0, 0, 0, 0.1); +} + +.navbar-fixed-bottom { + bottom: 0; +} + +.navbar-fixed-bottom .navbar-inner { + -webkit-box-shadow: 0 -1px 10px rgba(0, 0, 0, 0.1); + -moz-box-shadow: 0 -1px 10px rgba(0, 0, 0, 0.1); + box-shadow: 0 -1px 10px rgba(0, 0, 0, 0.1); +} + +.navbar .nav { + position: relative; + left: 0; + display: block; + float: left; + margin: 0 10px 0 0; +} + +.navbar .nav.pull-right { + float: right; + margin-right: 0; +} + +.navbar .nav > li { + float: left; +} + +.navbar .nav > li > a { + float: none; + padding: 10px 15px 10px; + color: #777777; + text-decoration: none; + text-shadow: 0 1px 0 #ffffff; +} + +.navbar .nav .dropdown-toggle .caret { + margin-top: 8px; +} + +.navbar .nav > li > a:focus, +.navbar .nav > li > a:hover { + 
color: #333333; + text-decoration: none; + background-color: transparent; +} + +.navbar .nav > .active > a, +.navbar .nav > .active > a:hover, +.navbar .nav > .active > a:focus { + color: #555555; + text-decoration: none; + background-color: #e5e5e5; + -webkit-box-shadow: inset 0 3px 8px rgba(0, 0, 0, 0.125); + -moz-box-shadow: inset 0 3px 8px rgba(0, 0, 0, 0.125); + box-shadow: inset 0 3px 8px rgba(0, 0, 0, 0.125); +} + +.navbar .btn-navbar { + display: none; + float: right; + padding: 7px 10px; + margin-right: 5px; + margin-left: 5px; + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #ededed; + *background-color: #e5e5e5; + background-image: -moz-linear-gradient(top, #f2f2f2, #e5e5e5); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#f2f2f2), to(#e5e5e5)); + background-image: -webkit-linear-gradient(top, #f2f2f2, #e5e5e5); + background-image: -o-linear-gradient(top, #f2f2f2, #e5e5e5); + background-image: linear-gradient(to bottom, #f2f2f2, #e5e5e5); + background-repeat: repeat-x; + border-color: #e5e5e5 #e5e5e5 #bfbfbf; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2f2f2', endColorstr='#ffe5e5e5', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); + -webkit-box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1), 0 1px 0 rgba(255, 255, 255, 0.075); + -moz-box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1), 0 1px 0 rgba(255, 255, 255, 0.075); + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1), 0 1px 0 rgba(255, 255, 255, 0.075); +} + +.navbar .btn-navbar:hover, +.navbar .btn-navbar:active, +.navbar .btn-navbar.active, +.navbar .btn-navbar.disabled, +.navbar .btn-navbar[disabled] { + color: #ffffff; + background-color: #e5e5e5; + *background-color: #d9d9d9; +} + +.navbar .btn-navbar:active, +.navbar .btn-navbar.active { + background-color: #cccccc \9; +} + +.navbar .btn-navbar .icon-bar 
{ + display: block; + width: 18px; + height: 2px; + background-color: #f5f5f5; + -webkit-border-radius: 1px; + -moz-border-radius: 1px; + border-radius: 1px; + -webkit-box-shadow: 0 1px 0 rgba(0, 0, 0, 0.25); + -moz-box-shadow: 0 1px 0 rgba(0, 0, 0, 0.25); + box-shadow: 0 1px 0 rgba(0, 0, 0, 0.25); +} + +.btn-navbar .icon-bar + .icon-bar { + margin-top: 3px; +} + +.navbar .nav > li > .dropdown-menu:before { + position: absolute; + top: -7px; + left: 9px; + display: inline-block; + border-right: 7px solid transparent; + border-bottom: 7px solid #ccc; + border-left: 7px solid transparent; + border-bottom-color: rgba(0, 0, 0, 0.2); + content: ''; +} + +.navbar .nav > li > .dropdown-menu:after { + position: absolute; + top: -6px; + left: 10px; + display: inline-block; + border-right: 6px solid transparent; + border-bottom: 6px solid #ffffff; + border-left: 6px solid transparent; + content: ''; +} + +.navbar-fixed-bottom .nav > li > .dropdown-menu:before { + top: auto; + bottom: -7px; + border-top: 7px solid #ccc; + border-bottom: 0; + border-top-color: rgba(0, 0, 0, 0.2); +} + +.navbar-fixed-bottom .nav > li > .dropdown-menu:after { + top: auto; + bottom: -6px; + border-top: 6px solid #ffffff; + border-bottom: 0; +} + +.navbar .nav li.dropdown.open > .dropdown-toggle, +.navbar .nav li.dropdown.active > .dropdown-toggle, +.navbar .nav li.dropdown.open.active > .dropdown-toggle { + color: #555555; + background-color: #e5e5e5; +} + +.navbar .nav li.dropdown > .dropdown-toggle .caret { + border-top-color: #777777; + border-bottom-color: #777777; +} + +.navbar .nav li.dropdown.open > .dropdown-toggle .caret, +.navbar .nav li.dropdown.active > .dropdown-toggle .caret, +.navbar .nav li.dropdown.open.active > .dropdown-toggle .caret { + border-top-color: #555555; + border-bottom-color: #555555; +} + +.navbar .pull-right > li > .dropdown-menu, +.navbar .nav > li > .dropdown-menu.pull-right { + right: 0; + left: auto; +} + +.navbar .pull-right > li > .dropdown-menu:before, 
+.navbar .nav > li > .dropdown-menu.pull-right:before { + right: 12px; + left: auto; +} + +.navbar .pull-right > li > .dropdown-menu:after, +.navbar .nav > li > .dropdown-menu.pull-right:after { + right: 13px; + left: auto; +} + +.navbar .pull-right > li > .dropdown-menu .dropdown-menu, +.navbar .nav > li > .dropdown-menu.pull-right .dropdown-menu { + right: 100%; + left: auto; + margin-right: -1px; + margin-left: 0; + -webkit-border-radius: 6px 0 6px 6px; + -moz-border-radius: 6px 0 6px 6px; + border-radius: 6px 0 6px 6px; +} + +.navbar-inverse { + color: #999999; +} + +.navbar-inverse .navbar-inner { + background-color: #1b1b1b; + background-image: -moz-linear-gradient(top, #222222, #111111); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#222222), to(#111111)); + background-image: -webkit-linear-gradient(top, #222222, #111111); + background-image: -o-linear-gradient(top, #222222, #111111); + background-image: linear-gradient(to bottom, #222222, #111111); + background-repeat: repeat-x; + border-color: #252525; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff222222', endColorstr='#ff111111', GradientType=0); +} + +.navbar-inverse .brand, +.navbar-inverse .nav > li > a { + color: #999999; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); +} + +.navbar-inverse .brand:hover, +.navbar-inverse .nav > li > a:hover { + color: #ffffff; +} + +.navbar-inverse .nav > li > a:focus, +.navbar-inverse .nav > li > a:hover { + color: #ffffff; + background-color: transparent; +} + +.navbar-inverse .nav .active > a, +.navbar-inverse .nav .active > a:hover, +.navbar-inverse .nav .active > a:focus { + color: #ffffff; + background-color: #111111; +} + +.navbar-inverse .navbar-link { + color: #999999; +} + +.navbar-inverse .navbar-link:hover { + color: #ffffff; +} + +.navbar-inverse .divider-vertical { + border-right-color: #222222; + border-left-color: #111111; +} + +.navbar-inverse .nav li.dropdown.open > .dropdown-toggle, +.navbar-inverse .nav 
li.dropdown.active > .dropdown-toggle, +.navbar-inverse .nav li.dropdown.open.active > .dropdown-toggle { + color: #ffffff; + background-color: #111111; +} + +.navbar-inverse .nav li.dropdown > .dropdown-toggle .caret { + border-top-color: #999999; + border-bottom-color: #999999; +} + +.navbar-inverse .nav li.dropdown.open > .dropdown-toggle .caret, +.navbar-inverse .nav li.dropdown.active > .dropdown-toggle .caret, +.navbar-inverse .nav li.dropdown.open.active > .dropdown-toggle .caret { + border-top-color: #ffffff; + border-bottom-color: #ffffff; +} + +.navbar-inverse .navbar-search .search-query { + color: #ffffff; + background-color: #515151; + border-color: #111111; + -webkit-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1), 0 1px 0 rgba(255, 255, 255, 0.15); + -moz-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1), 0 1px 0 rgba(255, 255, 255, 0.15); + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1), 0 1px 0 rgba(255, 255, 255, 0.15); + -webkit-transition: none; + -moz-transition: none; + -o-transition: none; + transition: none; +} + +.navbar-inverse .navbar-search .search-query:-moz-placeholder { + color: #cccccc; +} + +.navbar-inverse .navbar-search .search-query:-ms-input-placeholder { + color: #cccccc; +} + +.navbar-inverse .navbar-search .search-query::-webkit-input-placeholder { + color: #cccccc; +} + +.navbar-inverse .navbar-search .search-query:focus, +.navbar-inverse .navbar-search .search-query.focused { + padding: 5px 15px; + color: #333333; + text-shadow: 0 1px 0 #ffffff; + background-color: #ffffff; + border: 0; + outline: 0; + -webkit-box-shadow: 0 0 3px rgba(0, 0, 0, 0.15); + -moz-box-shadow: 0 0 3px rgba(0, 0, 0, 0.15); + box-shadow: 0 0 3px rgba(0, 0, 0, 0.15); +} + +.navbar-inverse .btn-navbar { + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #0e0e0e; + *background-color: #040404; + background-image: -moz-linear-gradient(top, #151515, #040404); + background-image: -webkit-gradient(linear, 0 0, 0 100%, 
from(#151515), to(#040404)); + background-image: -webkit-linear-gradient(top, #151515, #040404); + background-image: -o-linear-gradient(top, #151515, #040404); + background-image: linear-gradient(to bottom, #151515, #040404); + background-repeat: repeat-x; + border-color: #040404 #040404 #000000; + border-color: rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.1) rgba(0, 0, 0, 0.25); + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff151515', endColorstr='#ff040404', GradientType=0); + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); +} + +.navbar-inverse .btn-navbar:hover, +.navbar-inverse .btn-navbar:active, +.navbar-inverse .btn-navbar.active, +.navbar-inverse .btn-navbar.disabled, +.navbar-inverse .btn-navbar[disabled] { + color: #ffffff; + background-color: #040404; + *background-color: #000000; +} + +.navbar-inverse .btn-navbar:active, +.navbar-inverse .btn-navbar.active { + background-color: #000000 \9; +} + +.breadcrumb { + padding: 8px 15px; + margin: 0 0 20px; + list-style: none; + background-color: #f5f5f5; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +.breadcrumb li { + display: inline-block; + *display: inline; + text-shadow: 0 1px 0 #ffffff; + *zoom: 1; +} + +.breadcrumb .divider { + padding: 0 5px; + color: #ccc; +} + +.breadcrumb .active { + color: #999999; +} + +.pagination { + margin: 20px 0; +} + +.pagination ul { + display: inline-block; + *display: inline; + margin-bottom: 0; + margin-left: 0; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + *zoom: 1; + -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); + -moz-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); +} + +.pagination ul > li { + display: inline; +} + +.pagination ul > li > a, +.pagination ul > li > span { + float: left; + padding: 4px 12px; + line-height: 20px; + text-decoration: none; + background-color: #ffffff; + border: 1px solid #dddddd; + 
border-left-width: 0; +} + +.pagination ul > li > a:hover, +.pagination ul > .active > a, +.pagination ul > .active > span { + background-color: #f5f5f5; +} + +.pagination ul > .active > a, +.pagination ul > .active > span { + color: #999999; + cursor: default; +} + +.pagination ul > .disabled > span, +.pagination ul > .disabled > a, +.pagination ul > .disabled > a:hover { + color: #999999; + cursor: default; + background-color: transparent; +} + +.pagination ul > li:first-child > a, +.pagination ul > li:first-child > span { + border-left-width: 1px; + -webkit-border-bottom-left-radius: 4px; + border-bottom-left-radius: 4px; + -webkit-border-top-left-radius: 4px; + border-top-left-radius: 4px; + -moz-border-radius-bottomleft: 4px; + -moz-border-radius-topleft: 4px; +} + +.pagination ul > li:last-child > a, +.pagination ul > li:last-child > span { + -webkit-border-top-right-radius: 4px; + border-top-right-radius: 4px; + -webkit-border-bottom-right-radius: 4px; + border-bottom-right-radius: 4px; + -moz-border-radius-topright: 4px; + -moz-border-radius-bottomright: 4px; +} + +.pagination-centered { + text-align: center; +} + +.pagination-right { + text-align: right; +} + +.pagination-large ul > li > a, +.pagination-large ul > li > span { + padding: 11px 19px; + font-size: 17.5px; +} + +.pagination-large ul > li:first-child > a, +.pagination-large ul > li:first-child > span { + -webkit-border-bottom-left-radius: 6px; + border-bottom-left-radius: 6px; + -webkit-border-top-left-radius: 6px; + border-top-left-radius: 6px; + -moz-border-radius-bottomleft: 6px; + -moz-border-radius-topleft: 6px; +} + +.pagination-large ul > li:last-child > a, +.pagination-large ul > li:last-child > span { + -webkit-border-top-right-radius: 6px; + border-top-right-radius: 6px; + -webkit-border-bottom-right-radius: 6px; + border-bottom-right-radius: 6px; + -moz-border-radius-topright: 6px; + -moz-border-radius-bottomright: 6px; +} + +.pagination-mini ul > li:first-child > a, 
+.pagination-small ul > li:first-child > a, +.pagination-mini ul > li:first-child > span, +.pagination-small ul > li:first-child > span { + -webkit-border-bottom-left-radius: 3px; + border-bottom-left-radius: 3px; + -webkit-border-top-left-radius: 3px; + border-top-left-radius: 3px; + -moz-border-radius-bottomleft: 3px; + -moz-border-radius-topleft: 3px; +} + +.pagination-mini ul > li:last-child > a, +.pagination-small ul > li:last-child > a, +.pagination-mini ul > li:last-child > span, +.pagination-small ul > li:last-child > span { + -webkit-border-top-right-radius: 3px; + border-top-right-radius: 3px; + -webkit-border-bottom-right-radius: 3px; + border-bottom-right-radius: 3px; + -moz-border-radius-topright: 3px; + -moz-border-radius-bottomright: 3px; +} + +.pagination-small ul > li > a, +.pagination-small ul > li > span { + padding: 2px 10px; + font-size: 11.9px; +} + +.pagination-mini ul > li > a, +.pagination-mini ul > li > span { + padding: 1px 6px; + font-size: 10.5px; +} + +.pager { + margin: 20px 0; + text-align: center; + list-style: none; + *zoom: 1; +} + +.pager:before, +.pager:after { + display: table; + line-height: 0; + content: ""; +} + +.pager:after { + clear: both; +} + +.pager li { + display: inline; +} + +.pager li > a, +.pager li > span { + display: inline-block; + padding: 5px 14px; + background-color: #fff; + border: 1px solid #ddd; + -webkit-border-radius: 15px; + -moz-border-radius: 15px; + border-radius: 15px; +} + +.pager li > a:hover { + text-decoration: none; + background-color: #f5f5f5; +} + +.pager .next > a, +.pager .next > span { + float: right; +} + +.pager .previous > a, +.pager .previous > span { + float: left; +} + +.pager .disabled > a, +.pager .disabled > a:hover, +.pager .disabled > span { + color: #999999; + cursor: default; + background-color: #fff; +} + +.modal-backdrop { + position: fixed; + top: 0; + right: 0; + bottom: 0; + left: 0; + z-index: 1040; + background-color: #000000; +} + +.modal-backdrop.fade { + opacity: 0; 
+} + +.modal-backdrop, +.modal-backdrop.fade.in { + opacity: 0.8; + filter: alpha(opacity=80); +} + +.modal { + position: fixed; + top: 50%; + left: 50%; + z-index: 1050; + width: 560px; + margin: -250px 0 0 -280px; + background-color: #ffffff; + border: 1px solid #999; + border: 1px solid rgba(0, 0, 0, 0.3); + *border: 1px solid #999; + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; + outline: none; + -webkit-box-shadow: 0 3px 7px rgba(0, 0, 0, 0.3); + -moz-box-shadow: 0 3px 7px rgba(0, 0, 0, 0.3); + box-shadow: 0 3px 7px rgba(0, 0, 0, 0.3); + -webkit-background-clip: padding-box; + -moz-background-clip: padding-box; + background-clip: padding-box; +} + +.modal.fade { + top: -25%; + -webkit-transition: opacity 0.3s linear, top 0.3s ease-out; + -moz-transition: opacity 0.3s linear, top 0.3s ease-out; + -o-transition: opacity 0.3s linear, top 0.3s ease-out; + transition: opacity 0.3s linear, top 0.3s ease-out; +} + +.modal.fade.in { + top: 50%; +} + +.modal-header { + padding: 9px 15px; + border-bottom: 1px solid #eee; +} + +.modal-header .close { + margin-top: 2px; +} + +.modal-header h3 { + margin: 0; + line-height: 30px; +} + +.modal-body { + max-height: 400px; + padding: 15px; + overflow-y: auto; +} + +.modal-form { + margin-bottom: 0; +} + +.modal-footer { + padding: 14px 15px 15px; + margin-bottom: 0; + text-align: right; + background-color: #f5f5f5; + border-top: 1px solid #ddd; + -webkit-border-radius: 0 0 6px 6px; + -moz-border-radius: 0 0 6px 6px; + border-radius: 0 0 6px 6px; + *zoom: 1; + -webkit-box-shadow: inset 0 1px 0 #ffffff; + -moz-box-shadow: inset 0 1px 0 #ffffff; + box-shadow: inset 0 1px 0 #ffffff; +} + +.modal-footer:before, +.modal-footer:after { + display: table; + line-height: 0; + content: ""; +} + +.modal-footer:after { + clear: both; +} + +.modal-footer .btn + .btn { + margin-bottom: 0; + margin-left: 5px; +} + +.modal-footer .btn-group .btn + .btn { + margin-left: -1px; +} + +.modal-footer .btn-block + 
.btn-block { + margin-left: 0; +} + +.tooltip { + position: absolute; + z-index: 1030; + display: block; + padding: 5px; + font-size: 11px; + opacity: 0; + filter: alpha(opacity=0); + visibility: visible; +} + +.tooltip.in { + opacity: 0.8; + filter: alpha(opacity=80); +} + +.tooltip.top { + margin-top: -3px; +} + +.tooltip.right { + margin-left: 3px; +} + +.tooltip.bottom { + margin-top: 3px; +} + +.tooltip.left { + margin-left: -3px; +} + +.tooltip-inner { + max-width: 200px; + padding: 3px 8px; + color: #ffffff; + text-align: center; + text-decoration: none; + background-color: #000000; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +.tooltip-arrow { + position: absolute; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; +} + +.tooltip.top .tooltip-arrow { + bottom: 0; + left: 50%; + margin-left: -5px; + border-top-color: #000000; + border-width: 5px 5px 0; +} + +.tooltip.right .tooltip-arrow { + top: 50%; + left: 0; + margin-top: -5px; + border-right-color: #000000; + border-width: 5px 5px 5px 0; +} + +.tooltip.left .tooltip-arrow { + top: 50%; + right: 0; + margin-top: -5px; + border-left-color: #000000; + border-width: 5px 0 5px 5px; +} + +.tooltip.bottom .tooltip-arrow { + top: 0; + left: 50%; + margin-left: -5px; + border-bottom-color: #000000; + border-width: 0 5px 5px; +} + +.popover { + position: absolute; + top: 0; + left: 0; + z-index: 1010; + display: none; + width: 236px; + padding: 1px; + background-color: #ffffff; + border: 1px solid #ccc; + border: 1px solid rgba(0, 0, 0, 0.2); + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; + -webkit-box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + -moz-box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + -webkit-background-clip: padding-box; + -moz-background-clip: padding; + background-clip: padding-box; +} + +.popover.top { + margin-top: -10px; +} + +.popover.right { + margin-left: 
10px; +} + +.popover.bottom { + margin-top: 10px; +} + +.popover.left { + margin-left: -10px; +} + +.popover-title { + padding: 8px 14px; + margin: 0; + font-size: 14px; + font-weight: normal; + line-height: 18px; + background-color: #f7f7f7; + border-bottom: 1px solid #ebebeb; + -webkit-border-radius: 5px 5px 0 0; + -moz-border-radius: 5px 5px 0 0; + border-radius: 5px 5px 0 0; +} + +.popover-content { + padding: 9px 14px; +} + +.popover-content p, +.popover-content ul, +.popover-content ol { + margin-bottom: 0; +} + +.popover .arrow, +.popover .arrow:after { + position: absolute; + display: inline-block; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; +} + +.popover .arrow:after { + z-index: -1; + content: ""; +} + +.popover.top .arrow { + bottom: -10px; + left: 50%; + margin-left: -10px; + border-top-color: #ffffff; + border-width: 10px 10px 0; +} + +.popover.top .arrow:after { + bottom: -1px; + left: -11px; + border-top-color: rgba(0, 0, 0, 0.25); + border-width: 11px 11px 0; +} + +.popover.right .arrow { + top: 50%; + left: -10px; + margin-top: -10px; + border-right-color: #ffffff; + border-width: 10px 10px 10px 0; +} + +.popover.right .arrow:after { + bottom: -11px; + left: -1px; + border-right-color: rgba(0, 0, 0, 0.25); + border-width: 11px 11px 11px 0; +} + +.popover.bottom .arrow { + top: -10px; + left: 50%; + margin-left: -10px; + border-bottom-color: #ffffff; + border-width: 0 10px 10px; +} + +.popover.bottom .arrow:after { + top: -1px; + left: -11px; + border-bottom-color: rgba(0, 0, 0, 0.25); + border-width: 0 11px 11px; +} + +.popover.left .arrow { + top: 50%; + right: -10px; + margin-top: -10px; + border-left-color: #ffffff; + border-width: 10px 0 10px 10px; +} + +.popover.left .arrow:after { + right: -1px; + bottom: -11px; + border-left-color: rgba(0, 0, 0, 0.25); + border-width: 11px 0 11px 11px; +} + +.thumbnails { + margin-left: -20px; + list-style: none; + *zoom: 1; +} + +.thumbnails:before, +.thumbnails:after { + 
display: table; + line-height: 0; + content: ""; +} + +.thumbnails:after { + clear: both; +} + +.row-fluid .thumbnails { + margin-left: 0; +} + +.thumbnails > li { + float: left; + margin-bottom: 20px; + margin-left: 20px; +} + +.thumbnail { + display: block; + padding: 4px; + line-height: 20px; + border: 1px solid #ddd; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + -webkit-box-shadow: 0 1px 3px rgba(0, 0, 0, 0.055); + -moz-box-shadow: 0 1px 3px rgba(0, 0, 0, 0.055); + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.055); + -webkit-transition: all 0.2s ease-in-out; + -moz-transition: all 0.2s ease-in-out; + -o-transition: all 0.2s ease-in-out; + transition: all 0.2s ease-in-out; +} + +a.thumbnail:hover { + border-color: #0088cc; + -webkit-box-shadow: 0 1px 4px rgba(0, 105, 214, 0.25); + -moz-box-shadow: 0 1px 4px rgba(0, 105, 214, 0.25); + box-shadow: 0 1px 4px rgba(0, 105, 214, 0.25); +} + +.thumbnail > img { + display: block; + max-width: 100%; + margin-right: auto; + margin-left: auto; +} + +.thumbnail .caption { + padding: 9px; + color: #555555; +} + +.media, +.media-body { + overflow: hidden; + *overflow: visible; + zoom: 1; +} + +.media, +.media .media { + margin-top: 15px; +} + +.media:first-child { + margin-top: 0; +} + +.media-object { + display: block; +} + +.media-heading { + margin: 0 0 5px; +} + +.media .pull-left { + margin-right: 10px; +} + +.media .pull-right { + margin-left: 10px; +} + +.media-list { + margin-left: 0; + list-style: none; +} + +.label, +.badge { + display: inline-block; + padding: 2px 4px; + font-size: 11.844px; + font-weight: bold; + line-height: 14px; + color: #ffffff; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + white-space: nowrap; + vertical-align: baseline; + background-color: #999999; +} + +.label { + -webkit-border-radius: 3px; + -moz-border-radius: 3px; + border-radius: 3px; +} + +.badge { + padding-right: 9px; + padding-left: 9px; + -webkit-border-radius: 9px; + -moz-border-radius: 9px; + 
border-radius: 9px; +} + +a.label:hover, +a.badge:hover { + color: #ffffff; + text-decoration: none; + cursor: pointer; +} + +.label-important, +.badge-important { + background-color: #b94a48; +} + +.label-important[href], +.badge-important[href] { + background-color: #953b39; +} + +.label-warning, +.badge-warning { + background-color: #f89406; +} + +.label-warning[href], +.badge-warning[href] { + background-color: #c67605; +} + +.label-success, +.badge-success { + background-color: #468847; +} + +.label-success[href], +.badge-success[href] { + background-color: #356635; +} + +.label-info, +.badge-info { + background-color: #3a87ad; +} + +.label-info[href], +.badge-info[href] { + background-color: #2d6987; +} + +.label-inverse, +.badge-inverse { + background-color: #333333; +} + +.label-inverse[href], +.badge-inverse[href] { + background-color: #1a1a1a; +} + +.btn .label, +.btn .badge { + position: relative; + top: -1px; +} + +.btn-mini .label, +.btn-mini .badge { + top: 0; +} + +@-webkit-keyframes progress-bar-stripes { + from { + background-position: 40px 0; + } + to { + background-position: 0 0; + } +} + +@-moz-keyframes progress-bar-stripes { + from { + background-position: 40px 0; + } + to { + background-position: 0 0; + } +} + +@-ms-keyframes progress-bar-stripes { + from { + background-position: 40px 0; + } + to { + background-position: 0 0; + } +} + +@-o-keyframes progress-bar-stripes { + from { + background-position: 0 0; + } + to { + background-position: 40px 0; + } +} + +@keyframes progress-bar-stripes { + from { + background-position: 40px 0; + } + to { + background-position: 0 0; + } +} + +.progress { + height: 20px; + margin-bottom: 20px; + overflow: hidden; + background-color: #f7f7f7; + background-image: -moz-linear-gradient(top, #f5f5f5, #f9f9f9); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#f5f5f5), to(#f9f9f9)); + background-image: -webkit-linear-gradient(top, #f5f5f5, #f9f9f9); + background-image: -o-linear-gradient(top, 
#f5f5f5, #f9f9f9); + background-image: linear-gradient(to bottom, #f5f5f5, #f9f9f9); + background-repeat: repeat-x; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#fff9f9f9', GradientType=0); + -webkit-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); + -moz-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); +} + +.progress .bar { + float: left; + width: 0; + height: 100%; + font-size: 12px; + color: #ffffff; + text-align: center; + text-shadow: 0 -1px 0 rgba(0, 0, 0, 0.25); + background-color: #0e90d2; + background-image: -moz-linear-gradient(top, #149bdf, #0480be); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#149bdf), to(#0480be)); + background-image: -webkit-linear-gradient(top, #149bdf, #0480be); + background-image: -o-linear-gradient(top, #149bdf, #0480be); + background-image: linear-gradient(to bottom, #149bdf, #0480be); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff149bdf', endColorstr='#ff0480be', GradientType=0); + -webkit-box-shadow: inset 0 -1px 0 rgba(0, 0, 0, 0.15); + -moz-box-shadow: inset 0 -1px 0 rgba(0, 0, 0, 0.15); + box-shadow: inset 0 -1px 0 rgba(0, 0, 0, 0.15); + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + -webkit-transition: width 0.6s ease; + -moz-transition: width 0.6s ease; + -o-transition: width 0.6s ease; + transition: width 0.6s ease; +} + +.progress .bar + .bar { + -webkit-box-shadow: inset 1px 0 0 rgba(0, 0, 0, 0.15), inset 0 -1px 0 rgba(0, 0, 0, 0.15); + -moz-box-shadow: inset 1px 0 0 rgba(0, 0, 0, 0.15), inset 0 -1px 0 rgba(0, 0, 0, 0.15); + box-shadow: inset 1px 0 0 rgba(0, 0, 0, 0.15), inset 0 -1px 0 rgba(0, 0, 0, 0.15); +} + +.progress-striped .bar { + background-color: #149bdf; + background-image: -webkit-gradient(linear, 0 100%, 100% 0, 
color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent)); + background-image: -webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -moz-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -o-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + -webkit-background-size: 40px 40px; + -moz-background-size: 40px 40px; + -o-background-size: 40px 40px; + background-size: 40px 40px; +} + +.progress.active .bar { + -webkit-animation: progress-bar-stripes 2s linear infinite; + -moz-animation: progress-bar-stripes 2s linear infinite; + -ms-animation: progress-bar-stripes 2s linear infinite; + -o-animation: progress-bar-stripes 2s linear infinite; + animation: progress-bar-stripes 2s linear infinite; +} + +.progress-danger .bar, +.progress .bar-danger { + background-color: #dd514c; + background-image: -moz-linear-gradient(top, #ee5f5b, #c43c35); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#ee5f5b), to(#c43c35)); + background-image: -webkit-linear-gradient(top, #ee5f5b, #c43c35); + background-image: -o-linear-gradient(top, #ee5f5b, #c43c35); + background-image: linear-gradient(to bottom, #ee5f5b, #c43c35); + background-repeat: repeat-x; + filter: 
progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffee5f5b', endColorstr='#ffc43c35', GradientType=0); +} + +.progress-danger.progress-striped .bar, +.progress-striped .bar-danger { + background-color: #ee5f5b; + background-image: -webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent)); + background-image: -webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -moz-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -o-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); +} + +.progress-success .bar, +.progress .bar-success { + background-color: #5eb95e; + background-image: -moz-linear-gradient(top, #62c462, #57a957); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#62c462), to(#57a957)); + background-image: -webkit-linear-gradient(top, #62c462, #57a957); + background-image: -o-linear-gradient(top, #62c462, #57a957); + background-image: linear-gradient(to bottom, #62c462, #57a957); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff62c462', endColorstr='#ff57a957', GradientType=0); +} + +.progress-success.progress-striped .bar, +.progress-striped 
.bar-success { + background-color: #62c462; + background-image: -webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent)); + background-image: -webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -moz-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -o-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); +} + +.progress-info .bar, +.progress .bar-info { + background-color: #4bb1cf; + background-image: -moz-linear-gradient(top, #5bc0de, #339bb9); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#5bc0de), to(#339bb9)); + background-image: -webkit-linear-gradient(top, #5bc0de, #339bb9); + background-image: -o-linear-gradient(top, #5bc0de, #339bb9); + background-image: linear-gradient(to bottom, #5bc0de, #339bb9); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff339bb9', GradientType=0); +} + +.progress-info.progress-striped .bar, +.progress-striped .bar-info { + background-color: #5bc0de; + background-image: -webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), 
color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent)); + background-image: -webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -moz-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -o-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); +} + +.progress-warning .bar, +.progress .bar-warning { + background-color: #faa732; + background-image: -moz-linear-gradient(top, #fbb450, #f89406); + background-image: -webkit-gradient(linear, 0 0, 0 100%, from(#fbb450), to(#f89406)); + background-image: -webkit-linear-gradient(top, #fbb450, #f89406); + background-image: -o-linear-gradient(top, #fbb450, #f89406); + background-image: linear-gradient(to bottom, #fbb450, #f89406); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffbb450', endColorstr='#fff89406', GradientType=0); +} + +.progress-warning.progress-striped .bar, +.progress-striped .bar-warning { + background-color: #fbb450; + background-image: -webkit-gradient(linear, 0 100%, 100% 0, color-stop(0.25, rgba(255, 255, 255, 0.15)), color-stop(0.25, transparent), color-stop(0.5, transparent), color-stop(0.5, rgba(255, 255, 255, 0.15)), color-stop(0.75, rgba(255, 255, 255, 0.15)), color-stop(0.75, transparent), to(transparent)); + 
background-image: -webkit-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -moz-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: -o-linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-image: linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); +} + +.accordion { + margin-bottom: 20px; +} + +.accordion-group { + margin-bottom: 2px; + border: 1px solid #e5e5e5; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; +} + +.accordion-heading { + border-bottom: 0; +} + +.accordion-heading .accordion-toggle { + display: block; + padding: 8px 15px; +} + +.accordion-toggle { + cursor: pointer; +} + +.accordion-inner { + padding: 9px 15px; + border-top: 1px solid #e5e5e5; +} + +.carousel { + position: relative; + margin-bottom: 20px; + line-height: 1; +} + +.carousel-inner { + position: relative; + width: 100%; + overflow: hidden; +} + +.carousel .item { + position: relative; + display: none; + -webkit-transition: 0.6s ease-in-out left; + -moz-transition: 0.6s ease-in-out left; + -o-transition: 0.6s ease-in-out left; + transition: 0.6s ease-in-out left; +} + +.carousel .item > img { + display: block; + line-height: 1; +} + +.carousel .active, +.carousel .next, +.carousel .prev { + display: block; +} + +.carousel .active { + left: 0; +} + +.carousel .next, +.carousel .prev { + position: absolute; + top: 0; + width: 100%; +} + +.carousel .next { + left: 100%; +} + +.carousel .prev { + 
left: -100%; +} + +.carousel .next.left, +.carousel .prev.right { + left: 0; +} + +.carousel .active.left { + left: -100%; +} + +.carousel .active.right { + left: 100%; +} + +.carousel-control { + position: absolute; + top: 40%; + left: 15px; + width: 40px; + height: 40px; + margin-top: -20px; + font-size: 60px; + font-weight: 100; + line-height: 30px; + color: #ffffff; + text-align: center; + background: #222222; + border: 3px solid #ffffff; + -webkit-border-radius: 23px; + -moz-border-radius: 23px; + border-radius: 23px; + opacity: 0.5; + filter: alpha(opacity=50); +} + +.carousel-control.right { + right: 15px; + left: auto; +} + +.carousel-control:hover { + color: #ffffff; + text-decoration: none; + opacity: 0.9; + filter: alpha(opacity=90); +} + +.carousel-caption { + position: absolute; + right: 0; + bottom: 0; + left: 0; + padding: 15px; + background: #333333; + background: rgba(0, 0, 0, 0.75); +} + +.carousel-caption h4, +.carousel-caption p { + line-height: 20px; + color: #ffffff; +} + +.carousel-caption h4 { + margin: 0 0 5px; +} + +.carousel-caption p { + margin-bottom: 0; +} + +.hero-unit { + padding: 60px; + margin-bottom: 30px; + font-size: 18px; + font-weight: 200; + line-height: 30px; + color: inherit; + background-color: #eeeeee; + -webkit-border-radius: 6px; + -moz-border-radius: 6px; + border-radius: 6px; +} + +.hero-unit h1 { + margin-bottom: 0; + font-size: 60px; + line-height: 1; + letter-spacing: -1px; + color: inherit; +} + +.hero-unit li { + line-height: 30px; +} + +.pull-right { + float: right; +} + +.pull-left { + float: left; +} + +.hide { + display: none; +} + +.show { + display: block; +} + +.invisible { + visibility: hidden; +} + +.affix { + position: fixed; +} diff --git a/www/bootstrap/css/bootstrap.min.css b/www/bootstrap/css/bootstrap.min.css new file mode 100644 index 000000000000..43e16d725113 --- /dev/null +++ b/www/bootstrap/css/bootstrap.min.css @@ -0,0 +1,9 @@ +/*! 
+ * Bootstrap v2.2.1 + * + * Copyright 2012 Twitter, Inc + * Licensed under the Apache License v2.0 + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Designed and built with all the love in the world @twitter by @mdo and @fat. + */article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}audio:not([controls]){display:none}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}a:focus{outline:thin dotted #333;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px}a:hover,a:active{outline:0}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sup{top:-0.5em}sub{bottom:-0.25em}img{width:auto\9;height:auto;max-width:100%;vertical-align:middle;border:0;-ms-interpolation-mode:bicubic}#map_canvas img,.google-maps img{max-width:none}button,input,select,textarea{margin:0;font-size:100%;vertical-align:middle}button,input{*overflow:visible;line-height:normal}button::-moz-focus-inner,input::-moz-focus-inner{padding:0;border:0}button,html input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button}input[type="search"]{-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box;-webkit-appearance:textfield}input[type="search"]::-webkit-search-decoration,input[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none}textarea{overflow:auto;vertical-align:top}.clearfix{*zoom:1}.clearfix:before,.clearfix:after{display:table;line-height:0;content:""}.clearfix:after{clear:both}.hide-text{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.input-block-level{display:block;width:100%;min-height:30px;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}body{margin:0;font-family:"Helvetica 
Neue",Helvetica,Arial,sans-serif;font-size:14px;line-height:20px;color:#333;background-color:#fff}a{color:#08c;text-decoration:none}a:hover{color:#005580;text-decoration:underline}.img-rounded{-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px}.img-polaroid{padding:4px;background-color:#fff;border:1px solid #ccc;border:1px solid rgba(0,0,0,0.2);-webkit-box-shadow:0 1px 3px rgba(0,0,0,0.1);-moz-box-shadow:0 1px 3px rgba(0,0,0,0.1);box-shadow:0 1px 3px rgba(0,0,0,0.1)}.img-circle{-webkit-border-radius:500px;-moz-border-radius:500px;border-radius:500px}.row{margin-left:-20px;*zoom:1}.row:before,.row:after{display:table;line-height:0;content:""}.row:after{clear:both}[class*="span"]{float:left;min-height:1px;margin-left:20px}.container,.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom .container{width:940px}.span12{width:940px}.span11{width:860px}.span10{width:780px}.span9{width:700px}.span8{width:620px}.span7{width:540px}.span6{width:460px}.span5{width:380px}.span4{width:300px}.span3{width:220px}.span2{width:140px}.span1{width:60px}.offset12{margin-left:980px}.offset11{margin-left:900px}.offset10{margin-left:820px}.offset9{margin-left:740px}.offset8{margin-left:660px}.offset7{margin-left:580px}.offset6{margin-left:500px}.offset5{margin-left:420px}.offset4{margin-left:340px}.offset3{margin-left:260px}.offset2{margin-left:180px}.offset1{margin-left:100px}.row-fluid{width:100%;*zoom:1}.row-fluid:before,.row-fluid:after{display:table;line-height:0;content:""}.row-fluid:after{clear:both}.row-fluid [class*="span"]{display:block;float:left;width:100%;min-height:30px;margin-left:2.127659574468085%;*margin-left:2.074468085106383%;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.row-fluid [class*="span"]:first-child{margin-left:0}.row-fluid .controls-row [class*="span"]+[class*="span"]{margin-left:2.127659574468085%}.row-fluid .span12{width:100%;*width:99.94680851063829%}.row-fluid 
.span11{width:91.48936170212765%;*width:91.43617021276594%}.row-fluid .span10{width:82.97872340425532%;*width:82.92553191489361%}.row-fluid .span9{width:74.46808510638297%;*width:74.41489361702126%}.row-fluid .span8{width:65.95744680851064%;*width:65.90425531914893%}.row-fluid .span7{width:57.44680851063829%;*width:57.39361702127659%}.row-fluid .span6{width:48.93617021276595%;*width:48.88297872340425%}.row-fluid .span5{width:40.42553191489362%;*width:40.37234042553192%}.row-fluid .span4{width:31.914893617021278%;*width:31.861702127659576%}.row-fluid .span3{width:23.404255319148934%;*width:23.351063829787233%}.row-fluid .span2{width:14.893617021276595%;*width:14.840425531914894%}.row-fluid .span1{width:6.382978723404255%;*width:6.329787234042553%}.row-fluid .offset12{margin-left:104.25531914893617%;*margin-left:104.14893617021275%}.row-fluid .offset12:first-child{margin-left:102.12765957446808%;*margin-left:102.02127659574467%}.row-fluid .offset11{margin-left:95.74468085106382%;*margin-left:95.6382978723404%}.row-fluid .offset11:first-child{margin-left:93.61702127659574%;*margin-left:93.51063829787232%}.row-fluid .offset10{margin-left:87.23404255319149%;*margin-left:87.12765957446807%}.row-fluid .offset10:first-child{margin-left:85.1063829787234%;*margin-left:84.99999999999999%}.row-fluid .offset9{margin-left:78.72340425531914%;*margin-left:78.61702127659572%}.row-fluid .offset9:first-child{margin-left:76.59574468085106%;*margin-left:76.48936170212764%}.row-fluid .offset8{margin-left:70.2127659574468%;*margin-left:70.10638297872339%}.row-fluid .offset8:first-child{margin-left:68.08510638297872%;*margin-left:67.9787234042553%}.row-fluid .offset7{margin-left:61.70212765957446%;*margin-left:61.59574468085106%}.row-fluid .offset7:first-child{margin-left:59.574468085106375%;*margin-left:59.46808510638297%}.row-fluid .offset6{margin-left:53.191489361702125%;*margin-left:53.085106382978715%}.row-fluid 
.offset6:first-child{margin-left:51.063829787234035%;*margin-left:50.95744680851063%}.row-fluid .offset5{margin-left:44.68085106382979%;*margin-left:44.57446808510638%}.row-fluid .offset5:first-child{margin-left:42.5531914893617%;*margin-left:42.4468085106383%}.row-fluid .offset4{margin-left:36.170212765957444%;*margin-left:36.06382978723405%}.row-fluid .offset4:first-child{margin-left:34.04255319148936%;*margin-left:33.93617021276596%}.row-fluid .offset3{margin-left:27.659574468085104%;*margin-left:27.5531914893617%}.row-fluid .offset3:first-child{margin-left:25.53191489361702%;*margin-left:25.425531914893618%}.row-fluid .offset2{margin-left:19.148936170212764%;*margin-left:19.04255319148936%}.row-fluid .offset2:first-child{margin-left:17.02127659574468%;*margin-left:16.914893617021278%}.row-fluid .offset1{margin-left:10.638297872340425%;*margin-left:10.53191489361702%}.row-fluid .offset1:first-child{margin-left:8.51063829787234%;*margin-left:8.404255319148938%}[class*="span"].hide,.row-fluid [class*="span"].hide{display:none}[class*="span"].pull-right,.row-fluid [class*="span"].pull-right{float:right}.container{margin-right:auto;margin-left:auto;*zoom:1}.container:before,.container:after{display:table;line-height:0;content:""}.container:after{clear:both}.container-fluid{padding-right:20px;padding-left:20px;*zoom:1}.container-fluid:before,.container-fluid:after{display:table;line-height:0;content:""}.container-fluid:after{clear:both}p{margin:0 0 10px}.lead{margin-bottom:20px;font-size:21px;font-weight:200;line-height:30px}small{font-size:85%}strong{font-weight:bold}em{font-style:italic}cite{font-style:normal}.muted{color:#999}.text-warning{color:#c09853}a.text-warning:hover{color:#a47e3c}.text-error{color:#b94a48}a.text-error:hover{color:#953b39}.text-info{color:#3a87ad}a.text-info:hover{color:#2d6987}.text-success{color:#468847}a.text-success:hover{color:#356635}h1,h2,h3,h4,h5,h6{margin:10px 
0;font-family:inherit;font-weight:bold;line-height:20px;color:inherit;text-rendering:optimizelegibility}h1 small,h2 small,h3 small,h4 small,h5 small,h6 small{font-weight:normal;line-height:1;color:#999}h1,h2,h3{line-height:40px}h1{font-size:38.5px}h2{font-size:31.5px}h3{font-size:24.5px}h4{font-size:17.5px}h5{font-size:14px}h6{font-size:11.9px}h1 small{font-size:24.5px}h2 small{font-size:17.5px}h3 small{font-size:14px}h4 small{font-size:14px}.page-header{padding-bottom:9px;margin:20px 0 30px;border-bottom:1px solid #eee}ul,ol{padding:0;margin:0 0 10px 25px}ul ul,ul ol,ol ol,ol ul{margin-bottom:0}li{line-height:20px}ul.unstyled,ol.unstyled{margin-left:0;list-style:none}dl{margin-bottom:20px}dt,dd{line-height:20px}dt{font-weight:bold}dd{margin-left:10px}.dl-horizontal{*zoom:1}.dl-horizontal:before,.dl-horizontal:after{display:table;line-height:0;content:""}.dl-horizontal:after{clear:both}.dl-horizontal dt{float:left;width:160px;overflow:hidden;clear:left;text-align:right;text-overflow:ellipsis;white-space:nowrap}.dl-horizontal dd{margin-left:180px}hr{margin:20px 0;border:0;border-top:1px solid #eee;border-bottom:1px solid #fff}abbr[title],abbr[data-original-title]{cursor:help;border-bottom:1px dotted #999}abbr.initialism{font-size:90%;text-transform:uppercase}blockquote{padding:0 0 0 15px;margin:0 0 20px;border-left:5px solid #eee}blockquote p{margin-bottom:0;font-size:16px;font-weight:300;line-height:25px}blockquote small{display:block;line-height:20px;color:#999}blockquote small:before{content:'\2014 \00A0'}blockquote.pull-right{float:right;padding-right:15px;padding-left:0;border-right:5px solid #eee;border-left:0}blockquote.pull-right p,blockquote.pull-right small{text-align:right}blockquote.pull-right small:before{content:''}blockquote.pull-right small:after{content:'\00A0 \2014'}q:before,q:after,blockquote:before,blockquote:after{content:""}address{display:block;margin-bottom:20px;font-style:normal;line-height:20px}code,pre{padding:0 3px 
2px;font-family:Monaco,Menlo,Consolas,"Courier New",monospace;font-size:12px;color:#333;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}code{padding:2px 4px;color:#d14;background-color:#f7f7f9;border:1px solid #e1e1e8}pre{display:block;padding:9.5px;margin:0 0 10px;font-size:13px;line-height:20px;word-break:break-all;word-wrap:break-word;white-space:pre;white-space:pre-wrap;background-color:#f5f5f5;border:1px solid #ccc;border:1px solid rgba(0,0,0,0.15);-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}pre.prettyprint{margin-bottom:20px}pre code{padding:0;color:inherit;background-color:transparent;border:0}.pre-scrollable{max-height:340px;overflow-y:scroll}form{margin:0 0 20px}fieldset{padding:0;margin:0;border:0}legend{display:block;width:100%;padding:0;margin-bottom:20px;font-size:21px;line-height:40px;color:#333;border:0;border-bottom:1px solid #e5e5e5}legend small{font-size:15px;color:#999}label,input,button,select,textarea{font-size:14px;font-weight:normal;line-height:20px}input,button,select,textarea{font-family:"Helvetica Neue",Helvetica,Arial,sans-serif}label{display:block;margin-bottom:5px}select,textarea,input[type="text"],input[type="password"],input[type="datetime"],input[type="datetime-local"],input[type="date"],input[type="month"],input[type="time"],input[type="week"],input[type="number"],input[type="email"],input[type="url"],input[type="search"],input[type="tel"],input[type="color"],.uneditable-input{display:inline-block;height:20px;padding:4px 
6px;margin-bottom:10px;font-size:14px;line-height:20px;color:#555;vertical-align:middle;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}input,textarea,.uneditable-input{width:206px}textarea{height:auto}textarea,input[type="text"],input[type="password"],input[type="datetime"],input[type="datetime-local"],input[type="date"],input[type="month"],input[type="time"],input[type="week"],input[type="number"],input[type="email"],input[type="url"],input[type="search"],input[type="tel"],input[type="color"],.uneditable-input{background-color:#fff;border:1px solid #ccc;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);-webkit-transition:border linear .2s,box-shadow linear .2s;-moz-transition:border linear .2s,box-shadow linear .2s;-o-transition:border linear .2s,box-shadow linear .2s;transition:border linear .2s,box-shadow linear .2s}textarea:focus,input[type="text"]:focus,input[type="password"]:focus,input[type="datetime"]:focus,input[type="datetime-local"]:focus,input[type="date"]:focus,input[type="month"]:focus,input[type="time"]:focus,input[type="week"]:focus,input[type="number"]:focus,input[type="email"]:focus,input[type="url"]:focus,input[type="search"]:focus,input[type="tel"]:focus,input[type="color"]:focus,.uneditable-input:focus{border-color:rgba(82,168,236,0.8);outline:0;outline:thin dotted \9;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 8px rgba(82,168,236,0.6);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 8px rgba(82,168,236,0.6);box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 8px rgba(82,168,236,0.6)}input[type="radio"],input[type="checkbox"]{margin:4px 0 0;margin-top:1px 
\9;*margin-top:0;line-height:normal;cursor:pointer}input[type="file"],input[type="image"],input[type="submit"],input[type="reset"],input[type="button"],input[type="radio"],input[type="checkbox"]{width:auto}select,input[type="file"]{height:30px;*margin-top:4px;line-height:30px}select{width:220px;background-color:#fff;border:1px solid #ccc}select[multiple],select[size]{height:auto}select:focus,input[type="file"]:focus,input[type="radio"]:focus,input[type="checkbox"]:focus{outline:thin dotted #333;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px}.uneditable-input,.uneditable-textarea{color:#999;cursor:not-allowed;background-color:#fcfcfc;border-color:#ccc;-webkit-box-shadow:inset 0 1px 2px rgba(0,0,0,0.025);-moz-box-shadow:inset 0 1px 2px rgba(0,0,0,0.025);box-shadow:inset 0 1px 2px rgba(0,0,0,0.025)}.uneditable-input{overflow:hidden;white-space:nowrap}.uneditable-textarea{width:auto;height:auto}input:-moz-placeholder,textarea:-moz-placeholder{color:#999}input:-ms-input-placeholder,textarea:-ms-input-placeholder{color:#999}input::-webkit-input-placeholder,textarea::-webkit-input-placeholder{color:#999}.radio,.checkbox{min-height:20px;padding-left:20px}.radio input[type="radio"],.checkbox input[type="checkbox"]{float:left;margin-left:-20px}.controls>.radio:first-child,.controls>.checkbox:first-child{padding-top:5px}.radio.inline,.checkbox.inline{display:inline-block;padding-top:5px;margin-bottom:0;vertical-align:middle}.radio.inline+.radio.inline,.checkbox.inline+.checkbox.inline{margin-left:10px}.input-mini{width:60px}.input-small{width:90px}.input-medium{width:150px}.input-large{width:210px}.input-xlarge{width:270px}.input-xxlarge{width:530px}input[class*="span"],select[class*="span"],textarea[class*="span"],.uneditable-input[class*="span"],.row-fluid input[class*="span"],.row-fluid select[class*="span"],.row-fluid textarea[class*="span"],.row-fluid .uneditable-input[class*="span"]{float:none;margin-left:0}.input-append input[class*="span"],.input-append 
.uneditable-input[class*="span"],.input-prepend input[class*="span"],.input-prepend .uneditable-input[class*="span"],.row-fluid input[class*="span"],.row-fluid select[class*="span"],.row-fluid textarea[class*="span"],.row-fluid .uneditable-input[class*="span"],.row-fluid .input-prepend [class*="span"],.row-fluid .input-append [class*="span"]{display:inline-block}input,textarea,.uneditable-input{margin-left:0}.controls-row [class*="span"]+[class*="span"]{margin-left:20px}input.span12,textarea.span12,.uneditable-input.span12{width:926px}input.span11,textarea.span11,.uneditable-input.span11{width:846px}input.span10,textarea.span10,.uneditable-input.span10{width:766px}input.span9,textarea.span9,.uneditable-input.span9{width:686px}input.span8,textarea.span8,.uneditable-input.span8{width:606px}input.span7,textarea.span7,.uneditable-input.span7{width:526px}input.span6,textarea.span6,.uneditable-input.span6{width:446px}input.span5,textarea.span5,.uneditable-input.span5{width:366px}input.span4,textarea.span4,.uneditable-input.span4{width:286px}input.span3,textarea.span3,.uneditable-input.span3{width:206px}input.span2,textarea.span2,.uneditable-input.span2{width:126px}input.span1,textarea.span1,.uneditable-input.span1{width:46px}.controls-row{*zoom:1}.controls-row:before,.controls-row:after{display:table;line-height:0;content:""}.controls-row:after{clear:both}.controls-row [class*="span"],.row-fluid .controls-row [class*="span"]{float:left}.controls-row .checkbox[class*="span"],.controls-row .radio[class*="span"]{padding-top:5px}input[disabled],select[disabled],textarea[disabled],input[readonly],select[readonly],textarea[readonly]{cursor:not-allowed;background-color:#eee}input[type="radio"][disabled],input[type="checkbox"][disabled],input[type="radio"][readonly],input[type="checkbox"][readonly]{background-color:transparent}.control-group.warning>label,.control-group.warning .help-block,.control-group.warning .help-inline{color:#c09853}.control-group.warning 
.checkbox,.control-group.warning .radio,.control-group.warning input,.control-group.warning select,.control-group.warning textarea{color:#c09853}.control-group.warning input,.control-group.warning select,.control-group.warning textarea{border-color:#c09853;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 1px rgba(0,0,0,0.075)}.control-group.warning input:focus,.control-group.warning select:focus,.control-group.warning textarea:focus{border-color:#a47e3c;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #dbc59e;-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #dbc59e;box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #dbc59e}.control-group.warning .input-prepend .add-on,.control-group.warning .input-append .add-on{color:#c09853;background-color:#fcf8e3;border-color:#c09853}.control-group.error>label,.control-group.error .help-block,.control-group.error .help-inline{color:#b94a48}.control-group.error .checkbox,.control-group.error .radio,.control-group.error input,.control-group.error select,.control-group.error textarea{color:#b94a48}.control-group.error input,.control-group.error select,.control-group.error textarea{border-color:#b94a48;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 1px rgba(0,0,0,0.075)}.control-group.error input:focus,.control-group.error select:focus,.control-group.error textarea:focus{border-color:#953b39;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #d59392;-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #d59392;box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #d59392}.control-group.error .input-prepend .add-on,.control-group.error .input-append .add-on{color:#b94a48;background-color:#f2dede;border-color:#b94a48}.control-group.success>label,.control-group.success .help-block,.control-group.success 
.help-inline{color:#468847}.control-group.success .checkbox,.control-group.success .radio,.control-group.success input,.control-group.success select,.control-group.success textarea{color:#468847}.control-group.success input,.control-group.success select,.control-group.success textarea{border-color:#468847;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 1px rgba(0,0,0,0.075)}.control-group.success input:focus,.control-group.success select:focus,.control-group.success textarea:focus{border-color:#356635;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #7aba7b;-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #7aba7b;box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #7aba7b}.control-group.success .input-prepend .add-on,.control-group.success .input-append .add-on{color:#468847;background-color:#dff0d8;border-color:#468847}.control-group.info>label,.control-group.info .help-block,.control-group.info .help-inline{color:#3a87ad}.control-group.info .checkbox,.control-group.info .radio,.control-group.info input,.control-group.info select,.control-group.info textarea{color:#3a87ad}.control-group.info input,.control-group.info select,.control-group.info textarea{border-color:#3a87ad;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075);box-shadow:inset 0 1px 1px rgba(0,0,0,0.075)}.control-group.info input:focus,.control-group.info select:focus,.control-group.info textarea:focus{border-color:#2d6987;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #7ab5d3;-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #7ab5d3;box-shadow:inset 0 1px 1px rgba(0,0,0,0.075),0 0 6px #7ab5d3}.control-group.info .input-prepend .add-on,.control-group.info .input-append 
.add-on{color:#3a87ad;background-color:#d9edf7;border-color:#3a87ad}input:focus:required:invalid,textarea:focus:required:invalid,select:focus:required:invalid{color:#b94a48;border-color:#ee5f5b}input:focus:required:invalid:focus,textarea:focus:required:invalid:focus,select:focus:required:invalid:focus{border-color:#e9322d;-webkit-box-shadow:0 0 6px #f8b9b7;-moz-box-shadow:0 0 6px #f8b9b7;box-shadow:0 0 6px #f8b9b7}.form-actions{padding:19px 20px 20px;margin-top:20px;margin-bottom:20px;background-color:#f5f5f5;border-top:1px solid #e5e5e5;*zoom:1}.form-actions:before,.form-actions:after{display:table;line-height:0;content:""}.form-actions:after{clear:both}.help-block,.help-inline{color:#595959}.help-block{display:block;margin-bottom:10px}.help-inline{display:inline-block;*display:inline;padding-left:5px;vertical-align:middle;*zoom:1}.input-append,.input-prepend{margin-bottom:5px;font-size:0;white-space:nowrap}.input-append input,.input-prepend input,.input-append select,.input-prepend select,.input-append .uneditable-input,.input-prepend .uneditable-input,.input-append .dropdown-menu,.input-prepend .dropdown-menu{font-size:14px}.input-append input,.input-prepend input,.input-append select,.input-prepend select,.input-append .uneditable-input,.input-prepend .uneditable-input{position:relative;margin-bottom:0;*margin-left:0;vertical-align:top;-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0}.input-append input:focus,.input-prepend input:focus,.input-append select:focus,.input-prepend select:focus,.input-append .uneditable-input:focus,.input-prepend .uneditable-input:focus{z-index:2}.input-append .add-on,.input-prepend .add-on{display:inline-block;width:auto;height:20px;min-width:16px;padding:4px 5px;font-size:14px;font-weight:normal;line-height:20px;text-align:center;text-shadow:0 1px 0 #fff;background-color:#eee;border:1px solid #ccc}.input-append .add-on,.input-prepend .add-on,.input-append .btn,.input-prepend 
.btn{vertical-align:top;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.input-append .active,.input-prepend .active{background-color:#a9dba9;border-color:#46a546}.input-prepend .add-on,.input-prepend .btn{margin-right:-1px}.input-prepend .add-on:first-child,.input-prepend .btn:first-child{-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px}.input-append input,.input-append select,.input-append .uneditable-input{-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px}.input-append input+.btn-group .btn,.input-append select+.btn-group .btn,.input-append .uneditable-input+.btn-group .btn{-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0}.input-append .add-on,.input-append .btn,.input-append .btn-group{margin-left:-1px}.input-append .add-on:last-child,.input-append .btn:last-child{-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0}.input-prepend.input-append input,.input-prepend.input-append select,.input-prepend.input-append .uneditable-input{-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.input-prepend.input-append input+.btn-group .btn,.input-prepend.input-append select+.btn-group .btn,.input-prepend.input-append .uneditable-input+.btn-group .btn{-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0}.input-prepend.input-append .add-on:first-child,.input-prepend.input-append .btn:first-child{margin-right:-1px;-webkit-border-radius:4px 0 0 4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px}.input-prepend.input-append .add-on:last-child,.input-prepend.input-append .btn:last-child{margin-left:-1px;-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0}.input-prepend.input-append .btn-group:first-child{margin-left:0}input.search-query{padding-right:14px;padding-right:4px \9;padding-left:14px;padding-left:4px 
\9;margin-bottom:0;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px}.form-search .input-append .search-query,.form-search .input-prepend .search-query{-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.form-search .input-append .search-query{-webkit-border-radius:14px 0 0 14px;-moz-border-radius:14px 0 0 14px;border-radius:14px 0 0 14px}.form-search .input-append .btn{-webkit-border-radius:0 14px 14px 0;-moz-border-radius:0 14px 14px 0;border-radius:0 14px 14px 0}.form-search .input-prepend .search-query{-webkit-border-radius:0 14px 14px 0;-moz-border-radius:0 14px 14px 0;border-radius:0 14px 14px 0}.form-search .input-prepend .btn{-webkit-border-radius:14px 0 0 14px;-moz-border-radius:14px 0 0 14px;border-radius:14px 0 0 14px}.form-search input,.form-inline input,.form-horizontal input,.form-search textarea,.form-inline textarea,.form-horizontal textarea,.form-search select,.form-inline select,.form-horizontal select,.form-search .help-inline,.form-inline .help-inline,.form-horizontal .help-inline,.form-search .uneditable-input,.form-inline .uneditable-input,.form-horizontal .uneditable-input,.form-search .input-prepend,.form-inline .input-prepend,.form-horizontal .input-prepend,.form-search .input-append,.form-inline .input-append,.form-horizontal .input-append{display:inline-block;*display:inline;margin-bottom:0;vertical-align:middle;*zoom:1}.form-search .hide,.form-inline .hide,.form-horizontal .hide{display:none}.form-search label,.form-inline label,.form-search .btn-group,.form-inline .btn-group{display:inline-block}.form-search .input-append,.form-inline .input-append,.form-search .input-prepend,.form-inline .input-prepend{margin-bottom:0}.form-search .radio,.form-search .checkbox,.form-inline .radio,.form-inline .checkbox{padding-left:0;margin-bottom:0;vertical-align:middle}.form-search .radio input[type="radio"],.form-search .checkbox input[type="checkbox"],.form-inline .radio input[type="radio"],.form-inline .checkbox 
input[type="checkbox"]{float:left;margin-right:3px;margin-left:0}.control-group{margin-bottom:10px}legend+.control-group{margin-top:20px;-webkit-margin-top-collapse:separate}.form-horizontal .control-group{margin-bottom:20px;*zoom:1}.form-horizontal .control-group:before,.form-horizontal .control-group:after{display:table;line-height:0;content:""}.form-horizontal .control-group:after{clear:both}.form-horizontal .control-label{float:left;width:160px;padding-top:5px;text-align:right}.form-horizontal .controls{*display:inline-block;*padding-left:20px;margin-left:180px;*margin-left:0}.form-horizontal .controls:first-child{*padding-left:180px}.form-horizontal .help-block{margin-bottom:0}.form-horizontal input+.help-block,.form-horizontal select+.help-block,.form-horizontal textarea+.help-block{margin-top:10px}.form-horizontal .form-actions{padding-left:180px}table{max-width:100%;background-color:transparent;border-collapse:collapse;border-spacing:0}.table{width:100%;margin-bottom:20px}.table th,.table td{padding:8px;line-height:20px;text-align:left;vertical-align:top;border-top:1px solid #ddd}.table th{font-weight:bold}.table thead th{vertical-align:bottom}.table caption+thead tr:first-child th,.table caption+thead tr:first-child td,.table colgroup+thead tr:first-child th,.table colgroup+thead tr:first-child td,.table thead:first-child tr:first-child th,.table thead:first-child tr:first-child td{border-top:0}.table tbody+tbody{border-top:2px solid #ddd}.table-condensed th,.table-condensed td{padding:4px 5px}.table-bordered{border:1px solid #ddd;border-collapse:separate;*border-collapse:collapse;border-left:0;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.table-bordered th,.table-bordered td{border-left:1px solid #ddd}.table-bordered caption+thead tr:first-child th,.table-bordered caption+tbody tr:first-child th,.table-bordered caption+tbody tr:first-child td,.table-bordered colgroup+thead tr:first-child th,.table-bordered colgroup+tbody 
tr:first-child th,.table-bordered colgroup+tbody tr:first-child td,.table-bordered thead:first-child tr:first-child th,.table-bordered tbody:first-child tr:first-child th,.table-bordered tbody:first-child tr:first-child td{border-top:0}.table-bordered thead:first-child tr:first-child th:first-child,.table-bordered tbody:first-child tr:first-child td:first-child{-webkit-border-top-left-radius:4px;border-top-left-radius:4px;-moz-border-radius-topleft:4px}.table-bordered thead:first-child tr:first-child th:last-child,.table-bordered tbody:first-child tr:first-child td:last-child{-webkit-border-top-right-radius:4px;border-top-right-radius:4px;-moz-border-radius-topright:4px}.table-bordered thead:last-child tr:last-child th:first-child,.table-bordered tbody:last-child tr:last-child td:first-child,.table-bordered tfoot:last-child tr:last-child td:first-child{-webkit-border-radius:0 0 0 4px;-moz-border-radius:0 0 0 4px;border-radius:0 0 0 4px;-webkit-border-bottom-left-radius:4px;border-bottom-left-radius:4px;-moz-border-radius-bottomleft:4px}.table-bordered thead:last-child tr:last-child th:last-child,.table-bordered tbody:last-child tr:last-child td:last-child,.table-bordered tfoot:last-child tr:last-child td:last-child{-webkit-border-bottom-right-radius:4px;border-bottom-right-radius:4px;-moz-border-radius-bottomright:4px}.table-bordered caption+thead tr:first-child th:first-child,.table-bordered caption+tbody tr:first-child td:first-child,.table-bordered colgroup+thead tr:first-child th:first-child,.table-bordered colgroup+tbody tr:first-child td:first-child{-webkit-border-top-left-radius:4px;border-top-left-radius:4px;-moz-border-radius-topleft:4px}.table-bordered caption+thead tr:first-child th:last-child,.table-bordered caption+tbody tr:first-child td:last-child,.table-bordered colgroup+thead tr:first-child th:last-child,.table-bordered colgroup+tbody tr:first-child 
td:last-child{-webkit-border-top-right-radius:4px;border-top-right-radius:4px;-moz-border-radius-topright:4px}.table-striped tbody tr:nth-child(odd) td,.table-striped tbody tr:nth-child(odd) th{background-color:#f9f9f9}.table-hover tbody tr:hover td,.table-hover tbody tr:hover th{background-color:#f5f5f5}table td[class*="span"],table th[class*="span"],.row-fluid table td[class*="span"],.row-fluid table th[class*="span"]{display:table-cell;float:none;margin-left:0}.table td.span1,.table th.span1{float:none;width:44px;margin-left:0}.table td.span2,.table th.span2{float:none;width:124px;margin-left:0}.table td.span3,.table th.span3{float:none;width:204px;margin-left:0}.table td.span4,.table th.span4{float:none;width:284px;margin-left:0}.table td.span5,.table th.span5{float:none;width:364px;margin-left:0}.table td.span6,.table th.span6{float:none;width:444px;margin-left:0}.table td.span7,.table th.span7{float:none;width:524px;margin-left:0}.table td.span8,.table th.span8{float:none;width:604px;margin-left:0}.table td.span9,.table th.span9{float:none;width:684px;margin-left:0}.table td.span10,.table th.span10{float:none;width:764px;margin-left:0}.table td.span11,.table th.span11{float:none;width:844px;margin-left:0}.table td.span12,.table th.span12{float:none;width:924px;margin-left:0}.table tbody tr.success td{background-color:#dff0d8}.table tbody tr.error td{background-color:#f2dede}.table tbody tr.warning td{background-color:#fcf8e3}.table tbody tr.info td{background-color:#d9edf7}.table-hover tbody tr.success:hover td{background-color:#d0e9c6}.table-hover tbody tr.error:hover td{background-color:#ebcccc}.table-hover tbody tr.warning:hover td{background-color:#faf2cc}.table-hover tbody tr.info:hover td{background-color:#c4e3f3}[class^="icon-"],[class*=" icon-"]{display:inline-block;width:14px;height:14px;margin-top:1px;*margin-right:.3em;line-height:14px;vertical-align:text-top;background-image:url("../img/glyphicons-halflings.png");background-position:14px 
14px;background-repeat:no-repeat}.icon-white,.nav-pills>.active>a>[class^="icon-"],.nav-pills>.active>a>[class*=" icon-"],.nav-list>.active>a>[class^="icon-"],.nav-list>.active>a>[class*=" icon-"],.navbar-inverse .nav>.active>a>[class^="icon-"],.navbar-inverse .nav>.active>a>[class*=" icon-"],.dropdown-menu>li>a:hover>[class^="icon-"],.dropdown-menu>li>a:hover>[class*=" icon-"],.dropdown-menu>.active>a>[class^="icon-"],.dropdown-menu>.active>a>[class*=" icon-"],.dropdown-submenu:hover>a>[class^="icon-"],.dropdown-submenu:hover>a>[class*=" icon-"]{background-image:url("../img/glyphicons-halflings-white.png")}.icon-glass{background-position:0 0}.icon-music{background-position:-24px 0}.icon-search{background-position:-48px 0}.icon-envelope{background-position:-72px 0}.icon-heart{background-position:-96px 0}.icon-star{background-position:-120px 0}.icon-star-empty{background-position:-144px 0}.icon-user{background-position:-168px 0}.icon-film{background-position:-192px 0}.icon-th-large{background-position:-216px 0}.icon-th{background-position:-240px 0}.icon-th-list{background-position:-264px 0}.icon-ok{background-position:-288px 0}.icon-remove{background-position:-312px 0}.icon-zoom-in{background-position:-336px 0}.icon-zoom-out{background-position:-360px 0}.icon-off{background-position:-384px 0}.icon-signal{background-position:-408px 0}.icon-cog{background-position:-432px 0}.icon-trash{background-position:-456px 0}.icon-home{background-position:0 -24px}.icon-file{background-position:-24px -24px}.icon-time{background-position:-48px -24px}.icon-road{background-position:-72px -24px}.icon-download-alt{background-position:-96px -24px}.icon-download{background-position:-120px -24px}.icon-upload{background-position:-144px -24px}.icon-inbox{background-position:-168px -24px}.icon-play-circle{background-position:-192px -24px}.icon-repeat{background-position:-216px -24px}.icon-refresh{background-position:-240px -24px}.icon-list-alt{background-position:-264px 
-24px}.icon-lock{background-position:-287px -24px}.icon-flag{background-position:-312px -24px}.icon-headphones{background-position:-336px -24px}.icon-volume-off{background-position:-360px -24px}.icon-volume-down{background-position:-384px -24px}.icon-volume-up{background-position:-408px -24px}.icon-qrcode{background-position:-432px -24px}.icon-barcode{background-position:-456px -24px}.icon-tag{background-position:0 -48px}.icon-tags{background-position:-25px -48px}.icon-book{background-position:-48px -48px}.icon-bookmark{background-position:-72px -48px}.icon-print{background-position:-96px -48px}.icon-camera{background-position:-120px -48px}.icon-font{background-position:-144px -48px}.icon-bold{background-position:-167px -48px}.icon-italic{background-position:-192px -48px}.icon-text-height{background-position:-216px -48px}.icon-text-width{background-position:-240px -48px}.icon-align-left{background-position:-264px -48px}.icon-align-center{background-position:-288px -48px}.icon-align-right{background-position:-312px -48px}.icon-align-justify{background-position:-336px -48px}.icon-list{background-position:-360px -48px}.icon-indent-left{background-position:-384px -48px}.icon-indent-right{background-position:-408px -48px}.icon-facetime-video{background-position:-432px -48px}.icon-picture{background-position:-456px -48px}.icon-pencil{background-position:0 -72px}.icon-map-marker{background-position:-24px -72px}.icon-adjust{background-position:-48px -72px}.icon-tint{background-position:-72px -72px}.icon-edit{background-position:-96px -72px}.icon-share{background-position:-120px -72px}.icon-check{background-position:-144px -72px}.icon-move{background-position:-168px -72px}.icon-step-backward{background-position:-192px -72px}.icon-fast-backward{background-position:-216px -72px}.icon-backward{background-position:-240px -72px}.icon-play{background-position:-264px -72px}.icon-pause{background-position:-288px -72px}.icon-stop{background-position:-312px 
-72px}.icon-forward{background-position:-336px -72px}.icon-fast-forward{background-position:-360px -72px}.icon-step-forward{background-position:-384px -72px}.icon-eject{background-position:-408px -72px}.icon-chevron-left{background-position:-432px -72px}.icon-chevron-right{background-position:-456px -72px}.icon-plus-sign{background-position:0 -96px}.icon-minus-sign{background-position:-24px -96px}.icon-remove-sign{background-position:-48px -96px}.icon-ok-sign{background-position:-72px -96px}.icon-question-sign{background-position:-96px -96px}.icon-info-sign{background-position:-120px -96px}.icon-screenshot{background-position:-144px -96px}.icon-remove-circle{background-position:-168px -96px}.icon-ok-circle{background-position:-192px -96px}.icon-ban-circle{background-position:-216px -96px}.icon-arrow-left{background-position:-240px -96px}.icon-arrow-right{background-position:-264px -96px}.icon-arrow-up{background-position:-289px -96px}.icon-arrow-down{background-position:-312px -96px}.icon-share-alt{background-position:-336px -96px}.icon-resize-full{background-position:-360px -96px}.icon-resize-small{background-position:-384px -96px}.icon-plus{background-position:-408px -96px}.icon-minus{background-position:-433px -96px}.icon-asterisk{background-position:-456px -96px}.icon-exclamation-sign{background-position:0 -120px}.icon-gift{background-position:-24px -120px}.icon-leaf{background-position:-48px -120px}.icon-fire{background-position:-72px -120px}.icon-eye-open{background-position:-96px -120px}.icon-eye-close{background-position:-120px -120px}.icon-warning-sign{background-position:-144px -120px}.icon-plane{background-position:-168px -120px}.icon-calendar{background-position:-192px -120px}.icon-random{width:16px;background-position:-216px -120px}.icon-comment{background-position:-240px -120px}.icon-magnet{background-position:-264px -120px}.icon-chevron-up{background-position:-288px -120px}.icon-chevron-down{background-position:-313px 
-119px}.icon-retweet{background-position:-336px -120px}.icon-shopping-cart{background-position:-360px -120px}.icon-folder-close{background-position:-384px -120px}.icon-folder-open{width:16px;background-position:-408px -120px}.icon-resize-vertical{background-position:-432px -119px}.icon-resize-horizontal{background-position:-456px -118px}.icon-hdd{background-position:0 -144px}.icon-bullhorn{background-position:-24px -144px}.icon-bell{background-position:-48px -144px}.icon-certificate{background-position:-72px -144px}.icon-thumbs-up{background-position:-96px -144px}.icon-thumbs-down{background-position:-120px -144px}.icon-hand-right{background-position:-144px -144px}.icon-hand-left{background-position:-168px -144px}.icon-hand-up{background-position:-192px -144px}.icon-hand-down{background-position:-216px -144px}.icon-circle-arrow-right{background-position:-240px -144px}.icon-circle-arrow-left{background-position:-264px -144px}.icon-circle-arrow-up{background-position:-288px -144px}.icon-circle-arrow-down{background-position:-312px -144px}.icon-globe{background-position:-336px -144px}.icon-wrench{background-position:-360px -144px}.icon-tasks{background-position:-384px -144px}.icon-filter{background-position:-408px -144px}.icon-briefcase{background-position:-432px -144px}.icon-fullscreen{background-position:-456px -144px}.dropup,.dropdown{position:relative}.dropdown-toggle{*margin-bottom:-3px}.dropdown-toggle:active,.open .dropdown-toggle{outline:0}.caret{display:inline-block;width:0;height:0;vertical-align:top;border-top:4px solid #000;border-right:4px solid transparent;border-left:4px solid transparent;content:""}.dropdown .caret{margin-top:8px;margin-left:2px}.dropdown-menu{position:absolute;top:100%;left:0;z-index:1000;display:none;float:left;min-width:160px;padding:5px 0;margin:2px 0 0;list-style:none;background-color:#fff;border:1px solid #ccc;border:1px solid 
rgba(0,0,0,0.2);*border-right-width:2px;*border-bottom-width:2px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;-webkit-box-shadow:0 5px 10px rgba(0,0,0,0.2);-moz-box-shadow:0 5px 10px rgba(0,0,0,0.2);box-shadow:0 5px 10px rgba(0,0,0,0.2);-webkit-background-clip:padding-box;-moz-background-clip:padding;background-clip:padding-box}.dropdown-menu.pull-right{right:0;left:auto}.dropdown-menu .divider{*width:100%;height:1px;margin:9px 1px;*margin:-5px 0 5px;overflow:hidden;background-color:#e5e5e5;border-bottom:1px solid #fff}.dropdown-menu li>a{display:block;padding:3px 20px;clear:both;font-weight:normal;line-height:20px;color:#333;white-space:nowrap}.dropdown-menu li>a:hover,.dropdown-menu li>a:focus,.dropdown-submenu:hover>a{color:#fff;text-decoration:none;background-color:#0081c2;background-image:-moz-linear-gradient(top,#08c,#0077b3);background-image:-webkit-gradient(linear,0 0,0 100%,from(#08c),to(#0077b3));background-image:-webkit-linear-gradient(top,#08c,#0077b3);background-image:-o-linear-gradient(top,#08c,#0077b3);background-image:linear-gradient(to bottom,#08c,#0077b3);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff0088cc',endColorstr='#ff0077b3',GradientType=0)}.dropdown-menu .active>a,.dropdown-menu .active>a:hover{color:#333;text-decoration:none;background-color:#0081c2;background-image:-moz-linear-gradient(top,#08c,#0077b3);background-image:-webkit-gradient(linear,0 0,0 100%,from(#08c),to(#0077b3));background-image:-webkit-linear-gradient(top,#08c,#0077b3);background-image:-o-linear-gradient(top,#08c,#0077b3);background-image:linear-gradient(to bottom,#08c,#0077b3);background-repeat:repeat-x;outline:0;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff0088cc',endColorstr='#ff0077b3',GradientType=0)}.dropdown-menu .disabled>a,.dropdown-menu .disabled>a:hover{color:#999}.dropdown-menu 
.disabled>a:hover{text-decoration:none;cursor:default;background-color:transparent;background-image:none}.open{*z-index:1000}.open>.dropdown-menu{display:block}.pull-right>.dropdown-menu{right:0;left:auto}.dropup .caret,.navbar-fixed-bottom .dropdown .caret{border-top:0;border-bottom:4px solid #000;content:""}.dropup .dropdown-menu,.navbar-fixed-bottom .dropdown .dropdown-menu{top:auto;bottom:100%;margin-bottom:1px}.dropdown-submenu{position:relative}.dropdown-submenu>.dropdown-menu{top:0;left:100%;margin-top:-6px;margin-left:-1px;-webkit-border-radius:0 6px 6px 6px;-moz-border-radius:0 6px 6px 6px;border-radius:0 6px 6px 6px}.dropdown-submenu:hover>.dropdown-menu{display:block}.dropup .dropdown-submenu>.dropdown-menu{top:auto;bottom:0;margin-top:0;margin-bottom:-2px;-webkit-border-radius:5px 5px 5px 0;-moz-border-radius:5px 5px 5px 0;border-radius:5px 5px 5px 0}.dropdown-submenu>a:after{display:block;float:right;width:0;height:0;margin-top:5px;margin-right:-10px;border-color:transparent;border-left-color:#ccc;border-style:solid;border-width:5px 0 5px 5px;content:" "}.dropdown-submenu:hover>a:after{border-left-color:#fff}.dropdown-submenu.pull-left{float:none}.dropdown-submenu.pull-left>.dropdown-menu{left:-100%;margin-left:10px;-webkit-border-radius:6px 0 6px 6px;-moz-border-radius:6px 0 6px 6px;border-radius:6px 0 6px 6px}.dropdown .dropdown-menu .nav-header{padding-right:20px;padding-left:20px}.typeahead{margin-top:2px;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.well{min-height:20px;padding:19px;margin-bottom:20px;background-color:#f5f5f5;border:1px solid #e3e3e3;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,0.05);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,0.05);box-shadow:inset 0 1px 1px rgba(0,0,0,0.05)}.well 
blockquote{border-color:#ddd;border-color:rgba(0,0,0,0.15)}.well-large{padding:24px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px}.well-small{padding:9px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.fade{opacity:0;-webkit-transition:opacity .15s linear;-moz-transition:opacity .15s linear;-o-transition:opacity .15s linear;transition:opacity .15s linear}.fade.in{opacity:1}.collapse{position:relative;height:0;overflow:hidden;-webkit-transition:height .35s ease;-moz-transition:height .35s ease;-o-transition:height .35s ease;transition:height .35s ease}.collapse.in{height:auto}.close{float:right;font-size:20px;font-weight:bold;line-height:20px;color:#000;text-shadow:0 1px 0 #fff;opacity:.2;filter:alpha(opacity=20)}.close:hover{color:#000;text-decoration:none;cursor:pointer;opacity:.4;filter:alpha(opacity=40)}button.close{padding:0;cursor:pointer;background:transparent;border:0;-webkit-appearance:none}.btn{display:inline-block;*display:inline;padding:4px 12px;margin-bottom:0;*margin-left:.3em;font-size:14px;line-height:20px;*line-height:20px;color:#333;text-align:center;text-shadow:0 1px 1px rgba(255,255,255,0.75);vertical-align:middle;cursor:pointer;background-color:#f5f5f5;*background-color:#e6e6e6;background-image:-moz-linear-gradient(top,#fff,#e6e6e6);background-image:-webkit-gradient(linear,0 0,0 100%,from(#fff),to(#e6e6e6));background-image:-webkit-linear-gradient(top,#fff,#e6e6e6);background-image:-o-linear-gradient(top,#fff,#e6e6e6);background-image:linear-gradient(to bottom,#fff,#e6e6e6);background-repeat:repeat-x;border:1px solid #bbb;*border:0;border-color:#e6e6e6 #e6e6e6 #bfbfbf;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) 
rgba(0,0,0,0.25);border-bottom-color:#a2a2a2;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff',endColorstr='#ffe6e6e6',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);*zoom:1;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,0.2),0 1px 2px rgba(0,0,0,0.05);-moz-box-shadow:inset 0 1px 0 rgba(255,255,255,0.2),0 1px 2px rgba(0,0,0,0.05);box-shadow:inset 0 1px 0 rgba(255,255,255,0.2),0 1px 2px rgba(0,0,0,0.05)}.btn:hover,.btn:active,.btn.active,.btn.disabled,.btn[disabled]{color:#333;background-color:#e6e6e6;*background-color:#d9d9d9}.btn:active,.btn.active{background-color:#ccc \9}.btn:first-child{*margin-left:0}.btn:hover{color:#333;text-decoration:none;background-color:#e6e6e6;*background-color:#d9d9d9;background-position:0 -15px;-webkit-transition:background-position .1s linear;-moz-transition:background-position .1s linear;-o-transition:background-position .1s linear;transition:background-position .1s linear}.btn:focus{outline:thin dotted #333;outline:5px auto -webkit-focus-ring-color;outline-offset:-2px}.btn.active,.btn:active{background-color:#e6e6e6;background-color:#d9d9d9 \9;background-image:none;outline:0;-webkit-box-shadow:inset 0 2px 4px rgba(0,0,0,0.15),0 1px 2px rgba(0,0,0,0.05);-moz-box-shadow:inset 0 2px 4px rgba(0,0,0,0.15),0 1px 2px rgba(0,0,0,0.05);box-shadow:inset 0 2px 4px rgba(0,0,0,0.15),0 1px 2px rgba(0,0,0,0.05)}.btn.disabled,.btn[disabled]{cursor:default;background-color:#e6e6e6;background-image:none;opacity:.65;filter:alpha(opacity=65);-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none}.btn-large{padding:11px 19px;font-size:17.5px;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px}.btn-large [class^="icon-"],.btn-large [class*=" icon-"]{margin-top:2px}.btn-small{padding:2px 10px;font-size:11.9px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.btn-small 
[class^="icon-"],.btn-small [class*=" icon-"]{margin-top:0}.btn-mini{padding:1px 6px;font-size:10.5px;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.btn-block{display:block;width:100%;padding-right:0;padding-left:0;-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}.btn-block+.btn-block{margin-top:5px}input[type="submit"].btn-block,input[type="reset"].btn-block,input[type="button"].btn-block{width:100%}.btn-primary.active,.btn-warning.active,.btn-danger.active,.btn-success.active,.btn-info.active,.btn-inverse.active{color:rgba(255,255,255,0.75)}.btn{border-color:#c5c5c5;border-color:rgba(0,0,0,0.15) rgba(0,0,0,0.15) rgba(0,0,0,0.25)}.btn-primary{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#006dcc;*background-color:#04c;background-image:-moz-linear-gradient(top,#08c,#04c);background-image:-webkit-gradient(linear,0 0,0 100%,from(#08c),to(#04c));background-image:-webkit-linear-gradient(top,#08c,#04c);background-image:-o-linear-gradient(top,#08c,#04c);background-image:linear-gradient(to bottom,#08c,#04c);background-repeat:repeat-x;border-color:#04c #04c #002a80;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff0088cc',endColorstr='#ff0044cc',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.btn-primary:hover,.btn-primary:active,.btn-primary.active,.btn-primary.disabled,.btn-primary[disabled]{color:#fff;background-color:#04c;*background-color:#003bb3}.btn-primary:active,.btn-primary.active{background-color:#039 \9}.btn-warning{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#faa732;*background-color:#f89406;background-image:-moz-linear-gradient(top,#fbb450,#f89406);background-image:-webkit-gradient(linear,0 0,0 
100%,from(#fbb450),to(#f89406));background-image:-webkit-linear-gradient(top,#fbb450,#f89406);background-image:-o-linear-gradient(top,#fbb450,#f89406);background-image:linear-gradient(to bottom,#fbb450,#f89406);background-repeat:repeat-x;border-color:#f89406 #f89406 #ad6704;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffbb450',endColorstr='#fff89406',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.btn-warning:hover,.btn-warning:active,.btn-warning.active,.btn-warning.disabled,.btn-warning[disabled]{color:#fff;background-color:#f89406;*background-color:#df8505}.btn-warning:active,.btn-warning.active{background-color:#c67605 \9}.btn-danger{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#da4f49;*background-color:#bd362f;background-image:-moz-linear-gradient(top,#ee5f5b,#bd362f);background-image:-webkit-gradient(linear,0 0,0 100%,from(#ee5f5b),to(#bd362f));background-image:-webkit-linear-gradient(top,#ee5f5b,#bd362f);background-image:-o-linear-gradient(top,#ee5f5b,#bd362f);background-image:linear-gradient(to bottom,#ee5f5b,#bd362f);background-repeat:repeat-x;border-color:#bd362f #bd362f #802420;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffee5f5b',endColorstr='#ffbd362f',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.btn-danger:hover,.btn-danger:active,.btn-danger.active,.btn-danger.disabled,.btn-danger[disabled]{color:#fff;background-color:#bd362f;*background-color:#a9302a}.btn-danger:active,.btn-danger.active{background-color:#942a25 \9}.btn-success{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#5bb75b;*background-color:#51a351;background-image:-moz-linear-gradient(top,#62c462,#51a351);background-image:-webkit-gradient(linear,0 0,0 
100%,from(#62c462),to(#51a351));background-image:-webkit-linear-gradient(top,#62c462,#51a351);background-image:-o-linear-gradient(top,#62c462,#51a351);background-image:linear-gradient(to bottom,#62c462,#51a351);background-repeat:repeat-x;border-color:#51a351 #51a351 #387038;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff62c462',endColorstr='#ff51a351',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.btn-success:hover,.btn-success:active,.btn-success.active,.btn-success.disabled,.btn-success[disabled]{color:#fff;background-color:#51a351;*background-color:#499249}.btn-success:active,.btn-success.active{background-color:#408140 \9}.btn-info{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#49afcd;*background-color:#2f96b4;background-image:-moz-linear-gradient(top,#5bc0de,#2f96b4);background-image:-webkit-gradient(linear,0 0,0 100%,from(#5bc0de),to(#2f96b4));background-image:-webkit-linear-gradient(top,#5bc0de,#2f96b4);background-image:-o-linear-gradient(top,#5bc0de,#2f96b4);background-image:linear-gradient(to bottom,#5bc0de,#2f96b4);background-repeat:repeat-x;border-color:#2f96b4 #2f96b4 #1f6377;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de',endColorstr='#ff2f96b4',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.btn-info:hover,.btn-info:active,.btn-info.active,.btn-info.disabled,.btn-info[disabled]{color:#fff;background-color:#2f96b4;*background-color:#2a85a0}.btn-info:active,.btn-info.active{background-color:#24748c \9}.btn-inverse{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#363636;*background-color:#222;background-image:-moz-linear-gradient(top,#444,#222);background-image:-webkit-gradient(linear,0 0,0 
100%,from(#444),to(#222));background-image:-webkit-linear-gradient(top,#444,#222);background-image:-o-linear-gradient(top,#444,#222);background-image:linear-gradient(to bottom,#444,#222);background-repeat:repeat-x;border-color:#222 #222 #000;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff444444',endColorstr='#ff222222',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.btn-inverse:hover,.btn-inverse:active,.btn-inverse.active,.btn-inverse.disabled,.btn-inverse[disabled]{color:#fff;background-color:#222;*background-color:#151515}.btn-inverse:active,.btn-inverse.active{background-color:#080808 \9}button.btn,input[type="submit"].btn{*padding-top:3px;*padding-bottom:3px}button.btn::-moz-focus-inner,input[type="submit"].btn::-moz-focus-inner{padding:0;border:0}button.btn.btn-large,input[type="submit"].btn.btn-large{*padding-top:7px;*padding-bottom:7px}button.btn.btn-small,input[type="submit"].btn.btn-small{*padding-top:3px;*padding-bottom:3px}button.btn.btn-mini,input[type="submit"].btn.btn-mini{*padding-top:1px;*padding-bottom:1px}.btn-link,.btn-link:active,.btn-link[disabled]{background-color:transparent;background-image:none;-webkit-box-shadow:none;-moz-box-shadow:none;box-shadow:none}.btn-link{color:#08c;cursor:pointer;border-color:transparent;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.btn-link:hover{color:#005580;text-decoration:underline;background-color:transparent}.btn-link[disabled]:hover{color:#333;text-decoration:none}.btn-group{position:relative;display:inline-block;*display:inline;*margin-left:.3em;font-size:0;white-space:nowrap;vertical-align:middle;*zoom:1}.btn-group:first-child{*margin-left:0}.btn-group+.btn-group{margin-left:5px}.btn-toolbar{margin-top:10px;margin-bottom:10px;font-size:0}.btn-toolbar .btn+.btn,.btn-toolbar .btn-group+.btn,.btn-toolbar 
.btn+.btn-group{margin-left:5px}.btn-group>.btn{position:relative;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.btn-group>.btn+.btn{margin-left:-1px}.btn-group>.btn,.btn-group>.dropdown-menu{font-size:14px}.btn-group>.btn-mini{font-size:11px}.btn-group>.btn-small{font-size:12px}.btn-group>.btn-large{font-size:16px}.btn-group>.btn:first-child{margin-left:0;-webkit-border-bottom-left-radius:4px;border-bottom-left-radius:4px;-webkit-border-top-left-radius:4px;border-top-left-radius:4px;-moz-border-radius-bottomleft:4px;-moz-border-radius-topleft:4px}.btn-group>.btn:last-child,.btn-group>.dropdown-toggle{-webkit-border-top-right-radius:4px;border-top-right-radius:4px;-webkit-border-bottom-right-radius:4px;border-bottom-right-radius:4px;-moz-border-radius-topright:4px;-moz-border-radius-bottomright:4px}.btn-group>.btn.large:first-child{margin-left:0;-webkit-border-bottom-left-radius:6px;border-bottom-left-radius:6px;-webkit-border-top-left-radius:6px;border-top-left-radius:6px;-moz-border-radius-bottomleft:6px;-moz-border-radius-topleft:6px}.btn-group>.btn.large:last-child,.btn-group>.large.dropdown-toggle{-webkit-border-top-right-radius:6px;border-top-right-radius:6px;-webkit-border-bottom-right-radius:6px;border-bottom-right-radius:6px;-moz-border-radius-topright:6px;-moz-border-radius-bottomright:6px}.btn-group>.btn:hover,.btn-group>.btn:focus,.btn-group>.btn:active,.btn-group>.btn.active{z-index:2}.btn-group .dropdown-toggle:active,.btn-group.open .dropdown-toggle{outline:0}.btn-group>.btn+.dropdown-toggle{*padding-top:5px;padding-right:8px;*padding-bottom:5px;padding-left:8px;-webkit-box-shadow:inset 1px 0 0 rgba(255,255,255,0.125),inset 0 1px 0 rgba(255,255,255,0.2),0 1px 2px rgba(0,0,0,0.05);-moz-box-shadow:inset 1px 0 0 rgba(255,255,255,0.125),inset 0 1px 0 rgba(255,255,255,0.2),0 1px 2px rgba(0,0,0,0.05);box-shadow:inset 1px 0 0 rgba(255,255,255,0.125),inset 0 1px 0 rgba(255,255,255,0.2),0 1px 2px 
rgba(0,0,0,0.05)}.btn-group>.btn-mini+.dropdown-toggle{*padding-top:2px;padding-right:5px;*padding-bottom:2px;padding-left:5px}.btn-group>.btn-small+.dropdown-toggle{*padding-top:5px;*padding-bottom:4px}.btn-group>.btn-large+.dropdown-toggle{*padding-top:7px;padding-right:12px;*padding-bottom:7px;padding-left:12px}.btn-group.open .dropdown-toggle{background-image:none;-webkit-box-shadow:inset 0 2px 4px rgba(0,0,0,0.15),0 1px 2px rgba(0,0,0,0.05);-moz-box-shadow:inset 0 2px 4px rgba(0,0,0,0.15),0 1px 2px rgba(0,0,0,0.05);box-shadow:inset 0 2px 4px rgba(0,0,0,0.15),0 1px 2px rgba(0,0,0,0.05)}.btn-group.open .btn.dropdown-toggle{background-color:#e6e6e6}.btn-group.open .btn-primary.dropdown-toggle{background-color:#04c}.btn-group.open .btn-warning.dropdown-toggle{background-color:#f89406}.btn-group.open .btn-danger.dropdown-toggle{background-color:#bd362f}.btn-group.open .btn-success.dropdown-toggle{background-color:#51a351}.btn-group.open .btn-info.dropdown-toggle{background-color:#2f96b4}.btn-group.open .btn-inverse.dropdown-toggle{background-color:#222}.btn .caret{margin-top:8px;margin-left:0}.btn-mini .caret,.btn-small .caret,.btn-large .caret{margin-top:6px}.btn-large .caret{border-top-width:5px;border-right-width:5px;border-left-width:5px}.dropup .btn-large .caret{border-bottom-width:5px}.btn-primary .caret,.btn-warning .caret,.btn-danger .caret,.btn-info .caret,.btn-success .caret,.btn-inverse .caret{border-top-color:#fff;border-bottom-color:#fff}.btn-group-vertical{display:inline-block;*display:inline;*zoom:1}.btn-group-vertical .btn{display:block;float:none;width:100%;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.btn-group-vertical .btn+.btn{margin-top:-1px;margin-left:0}.btn-group-vertical .btn:first-child{-webkit-border-radius:4px 4px 0 0;-moz-border-radius:4px 4px 0 0;border-radius:4px 4px 0 0}.btn-group-vertical .btn:last-child{-webkit-border-radius:0 0 4px 4px;-moz-border-radius:0 0 4px 4px;border-radius:0 0 4px 4px}.btn-group-vertical 
.btn-large:first-child{-webkit-border-radius:6px 6px 0 0;-moz-border-radius:6px 6px 0 0;border-radius:6px 6px 0 0}.btn-group-vertical .btn-large:last-child{-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 6px}.alert{padding:8px 35px 8px 14px;margin-bottom:20px;color:#c09853;text-shadow:0 1px 0 rgba(255,255,255,0.5);background-color:#fcf8e3;border:1px solid #fbeed5;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.alert h4{margin:0}.alert .close{position:relative;top:-2px;right:-21px;line-height:20px}.alert-success{color:#468847;background-color:#dff0d8;border-color:#d6e9c6}.alert-danger,.alert-error{color:#b94a48;background-color:#f2dede;border-color:#eed3d7}.alert-info{color:#3a87ad;background-color:#d9edf7;border-color:#bce8f1}.alert-block{padding-top:14px;padding-bottom:14px}.alert-block>p,.alert-block>ul{margin-bottom:0}.alert-block p+p{margin-top:5px}.nav{margin-bottom:20px;margin-left:0;list-style:none}.nav>li>a{display:block}.nav>li>a:hover{text-decoration:none;background-color:#eee}.nav>.pull-right{float:right}.nav-header{display:block;padding:3px 15px;font-size:11px;font-weight:bold;line-height:20px;color:#999;text-shadow:0 1px 0 rgba(255,255,255,0.5);text-transform:uppercase}.nav li+.nav-header{margin-top:9px}.nav-list{padding-right:15px;padding-left:15px;margin-bottom:0}.nav-list>li>a,.nav-list .nav-header{margin-right:-15px;margin-left:-15px;text-shadow:0 1px 0 rgba(255,255,255,0.5)}.nav-list>li>a{padding:3px 15px}.nav-list>.active>a,.nav-list>.active>a:hover{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.2);background-color:#08c}.nav-list [class^="icon-"],.nav-list [class*=" icon-"]{margin-right:2px}.nav-list .divider{*width:100%;height:1px;margin:9px 1px;*margin:-5px 0 5px;overflow:hidden;background-color:#e5e5e5;border-bottom:1px solid 
#fff}.nav-tabs,.nav-pills{*zoom:1}.nav-tabs:before,.nav-pills:before,.nav-tabs:after,.nav-pills:after{display:table;line-height:0;content:""}.nav-tabs:after,.nav-pills:after{clear:both}.nav-tabs>li,.nav-pills>li{float:left}.nav-tabs>li>a,.nav-pills>li>a{padding-right:12px;padding-left:12px;margin-right:2px;line-height:14px}.nav-tabs{border-bottom:1px solid #ddd}.nav-tabs>li{margin-bottom:-1px}.nav-tabs>li>a{padding-top:8px;padding-bottom:8px;line-height:20px;border:1px solid transparent;-webkit-border-radius:4px 4px 0 0;-moz-border-radius:4px 4px 0 0;border-radius:4px 4px 0 0}.nav-tabs>li>a:hover{border-color:#eee #eee #ddd}.nav-tabs>.active>a,.nav-tabs>.active>a:hover{color:#555;cursor:default;background-color:#fff;border:1px solid #ddd;border-bottom-color:transparent}.nav-pills>li>a{padding-top:8px;padding-bottom:8px;margin-top:2px;margin-bottom:2px;-webkit-border-radius:5px;-moz-border-radius:5px;border-radius:5px}.nav-pills>.active>a,.nav-pills>.active>a:hover{color:#fff;background-color:#08c}.nav-stacked>li{float:none}.nav-stacked>li>a{margin-right:0}.nav-tabs.nav-stacked{border-bottom:0}.nav-tabs.nav-stacked>li>a{border:1px solid #ddd;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.nav-tabs.nav-stacked>li:first-child>a{-webkit-border-top-right-radius:4px;border-top-right-radius:4px;-webkit-border-top-left-radius:4px;border-top-left-radius:4px;-moz-border-radius-topright:4px;-moz-border-radius-topleft:4px}.nav-tabs.nav-stacked>li:last-child>a{-webkit-border-bottom-right-radius:4px;border-bottom-right-radius:4px;-webkit-border-bottom-left-radius:4px;border-bottom-left-radius:4px;-moz-border-radius-bottomright:4px;-moz-border-radius-bottomleft:4px}.nav-tabs.nav-stacked>li>a:hover{z-index:2;border-color:#ddd}.nav-pills.nav-stacked>li>a{margin-bottom:3px}.nav-pills.nav-stacked>li:last-child>a{margin-bottom:1px}.nav-tabs .dropdown-menu{-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 6px}.nav-pills 
.dropdown-menu{-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px}.nav .dropdown-toggle .caret{margin-top:6px;border-top-color:#08c;border-bottom-color:#08c}.nav .dropdown-toggle:hover .caret{border-top-color:#005580;border-bottom-color:#005580}.nav-tabs .dropdown-toggle .caret{margin-top:8px}.nav .active .dropdown-toggle .caret{border-top-color:#fff;border-bottom-color:#fff}.nav-tabs .active .dropdown-toggle .caret{border-top-color:#555;border-bottom-color:#555}.nav>.dropdown.active>a:hover{cursor:pointer}.nav-tabs .open .dropdown-toggle,.nav-pills .open .dropdown-toggle,.nav>li.dropdown.open.active>a:hover{color:#fff;background-color:#999;border-color:#999}.nav li.dropdown.open .caret,.nav li.dropdown.open.active .caret,.nav li.dropdown.open a:hover .caret{border-top-color:#fff;border-bottom-color:#fff;opacity:1;filter:alpha(opacity=100)}.tabs-stacked .open>a:hover{border-color:#999}.tabbable{*zoom:1}.tabbable:before,.tabbable:after{display:table;line-height:0;content:""}.tabbable:after{clear:both}.tab-content{overflow:auto}.tabs-below>.nav-tabs,.tabs-right>.nav-tabs,.tabs-left>.nav-tabs{border-bottom:0}.tab-content>.tab-pane,.pill-content>.pill-pane{display:none}.tab-content>.active,.pill-content>.active{display:block}.tabs-below>.nav-tabs{border-top:1px solid #ddd}.tabs-below>.nav-tabs>li{margin-top:-1px;margin-bottom:0}.tabs-below>.nav-tabs>li>a{-webkit-border-radius:0 0 4px 4px;-moz-border-radius:0 0 4px 4px;border-radius:0 0 4px 4px}.tabs-below>.nav-tabs>li>a:hover{border-top-color:#ddd;border-bottom-color:transparent}.tabs-below>.nav-tabs>.active>a,.tabs-below>.nav-tabs>.active>a:hover{border-color:transparent #ddd #ddd #ddd}.tabs-left>.nav-tabs>li,.tabs-right>.nav-tabs>li{float:none}.tabs-left>.nav-tabs>li>a,.tabs-right>.nav-tabs>li>a{min-width:74px;margin-right:0;margin-bottom:3px}.tabs-left>.nav-tabs{float:left;margin-right:19px;border-right:1px solid #ddd}.tabs-left>.nav-tabs>li>a{margin-right:-1px;-webkit-border-radius:4px 0 0 
4px;-moz-border-radius:4px 0 0 4px;border-radius:4px 0 0 4px}.tabs-left>.nav-tabs>li>a:hover{border-color:#eee #ddd #eee #eee}.tabs-left>.nav-tabs .active>a,.tabs-left>.nav-tabs .active>a:hover{border-color:#ddd transparent #ddd #ddd;*border-right-color:#fff}.tabs-right>.nav-tabs{float:right;margin-left:19px;border-left:1px solid #ddd}.tabs-right>.nav-tabs>li>a{margin-left:-1px;-webkit-border-radius:0 4px 4px 0;-moz-border-radius:0 4px 4px 0;border-radius:0 4px 4px 0}.tabs-right>.nav-tabs>li>a:hover{border-color:#eee #eee #eee #ddd}.tabs-right>.nav-tabs .active>a,.tabs-right>.nav-tabs .active>a:hover{border-color:#ddd #ddd #ddd transparent;*border-left-color:#fff}.nav>.disabled>a{color:#999}.nav>.disabled>a:hover{text-decoration:none;cursor:default;background-color:transparent}.navbar{*position:relative;*z-index:2;margin-bottom:20px;overflow:visible;color:#777}.navbar-inner{min-height:40px;padding-right:20px;padding-left:20px;background-color:#fafafa;background-image:-moz-linear-gradient(top,#fff,#f2f2f2);background-image:-webkit-gradient(linear,0 0,0 100%,from(#fff),to(#f2f2f2));background-image:-webkit-linear-gradient(top,#fff,#f2f2f2);background-image:-o-linear-gradient(top,#fff,#f2f2f2);background-image:linear-gradient(to bottom,#fff,#f2f2f2);background-repeat:repeat-x;border:1px solid #d4d4d4;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff',endColorstr='#fff2f2f2',GradientType=0);*zoom:1;-webkit-box-shadow:0 1px 4px rgba(0,0,0,0.065);-moz-box-shadow:0 1px 4px rgba(0,0,0,0.065);box-shadow:0 1px 4px rgba(0,0,0,0.065)}.navbar-inner:before,.navbar-inner:after{display:table;line-height:0;content:""}.navbar-inner:after{clear:both}.navbar .container{width:auto}.nav-collapse.collapse{height:auto;overflow:visible}.navbar .brand{display:block;float:left;padding:10px 20px 10px;margin-left:-20px;font-size:20px;font-weight:200;color:#777;text-shadow:0 1px 0 #fff}.navbar 
.brand:hover{text-decoration:none}.navbar-text{margin-bottom:0;line-height:40px}.navbar-link{color:#777}.navbar-link:hover{color:#333}.navbar .divider-vertical{height:40px;margin:0 9px;border-right:1px solid #fff;border-left:1px solid #f2f2f2}.navbar .btn,.navbar .btn-group{margin-top:5px}.navbar .btn-group .btn,.navbar .input-prepend .btn,.navbar .input-append .btn{margin-top:0}.navbar-form{margin-bottom:0;*zoom:1}.navbar-form:before,.navbar-form:after{display:table;line-height:0;content:""}.navbar-form:after{clear:both}.navbar-form input,.navbar-form select,.navbar-form .radio,.navbar-form .checkbox{margin-top:5px}.navbar-form input,.navbar-form select,.navbar-form .btn{display:inline-block;margin-bottom:0}.navbar-form input[type="image"],.navbar-form input[type="checkbox"],.navbar-form input[type="radio"]{margin-top:3px}.navbar-form .input-append,.navbar-form .input-prepend{margin-top:6px;white-space:nowrap}.navbar-form .input-append input,.navbar-form .input-prepend input{margin-top:0}.navbar-search{position:relative;float:left;margin-top:5px;margin-bottom:0}.navbar-search .search-query{padding:4px 14px;margin-bottom:0;font-family:"Helvetica Neue",Helvetica,Arial,sans-serif;font-size:13px;font-weight:normal;line-height:1;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px}.navbar-static-top{position:static;margin-bottom:0}.navbar-static-top .navbar-inner{-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.navbar-fixed-top,.navbar-fixed-bottom{position:fixed;right:0;left:0;z-index:1030;margin-bottom:0}.navbar-fixed-top .navbar-inner,.navbar-static-top .navbar-inner{border-width:0 0 1px}.navbar-fixed-bottom .navbar-inner{border-width:1px 0 0}.navbar-fixed-top .navbar-inner,.navbar-fixed-bottom .navbar-inner{padding-right:0;padding-left:0;-webkit-border-radius:0;-moz-border-radius:0;border-radius:0}.navbar-static-top .container,.navbar-fixed-top .container,.navbar-fixed-bottom 
.container{width:940px}.navbar-fixed-top{top:0}.navbar-fixed-top .navbar-inner,.navbar-static-top .navbar-inner{-webkit-box-shadow:0 1px 10px rgba(0,0,0,0.1);-moz-box-shadow:0 1px 10px rgba(0,0,0,0.1);box-shadow:0 1px 10px rgba(0,0,0,0.1)}.navbar-fixed-bottom{bottom:0}.navbar-fixed-bottom .navbar-inner{-webkit-box-shadow:0 -1px 10px rgba(0,0,0,0.1);-moz-box-shadow:0 -1px 10px rgba(0,0,0,0.1);box-shadow:0 -1px 10px rgba(0,0,0,0.1)}.navbar .nav{position:relative;left:0;display:block;float:left;margin:0 10px 0 0}.navbar .nav.pull-right{float:right;margin-right:0}.navbar .nav>li{float:left}.navbar .nav>li>a{float:none;padding:10px 15px 10px;color:#777;text-decoration:none;text-shadow:0 1px 0 #fff}.navbar .nav .dropdown-toggle .caret{margin-top:8px}.navbar .nav>li>a:focus,.navbar .nav>li>a:hover{color:#333;text-decoration:none;background-color:transparent}.navbar .nav>.active>a,.navbar .nav>.active>a:hover,.navbar .nav>.active>a:focus{color:#555;text-decoration:none;background-color:#e5e5e5;-webkit-box-shadow:inset 0 3px 8px rgba(0,0,0,0.125);-moz-box-shadow:inset 0 3px 8px rgba(0,0,0,0.125);box-shadow:inset 0 3px 8px rgba(0,0,0,0.125)}.navbar .btn-navbar{display:none;float:right;padding:7px 10px;margin-right:5px;margin-left:5px;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#ededed;*background-color:#e5e5e5;background-image:-moz-linear-gradient(top,#f2f2f2,#e5e5e5);background-image:-webkit-gradient(linear,0 0,0 100%,from(#f2f2f2),to(#e5e5e5));background-image:-webkit-linear-gradient(top,#f2f2f2,#e5e5e5);background-image:-o-linear-gradient(top,#f2f2f2,#e5e5e5);background-image:linear-gradient(to bottom,#f2f2f2,#e5e5e5);background-repeat:repeat-x;border-color:#e5e5e5 #e5e5e5 #bfbfbf;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2f2f2',endColorstr='#ffe5e5e5',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);-webkit-box-shadow:inset 0 1px 
0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.075);-moz-box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.075);box-shadow:inset 0 1px 0 rgba(255,255,255,0.1),0 1px 0 rgba(255,255,255,0.075)}.navbar .btn-navbar:hover,.navbar .btn-navbar:active,.navbar .btn-navbar.active,.navbar .btn-navbar.disabled,.navbar .btn-navbar[disabled]{color:#fff;background-color:#e5e5e5;*background-color:#d9d9d9}.navbar .btn-navbar:active,.navbar .btn-navbar.active{background-color:#ccc \9}.navbar .btn-navbar .icon-bar{display:block;width:18px;height:2px;background-color:#f5f5f5;-webkit-border-radius:1px;-moz-border-radius:1px;border-radius:1px;-webkit-box-shadow:0 1px 0 rgba(0,0,0,0.25);-moz-box-shadow:0 1px 0 rgba(0,0,0,0.25);box-shadow:0 1px 0 rgba(0,0,0,0.25)}.btn-navbar .icon-bar+.icon-bar{margin-top:3px}.navbar .nav>li>.dropdown-menu:before{position:absolute;top:-7px;left:9px;display:inline-block;border-right:7px solid transparent;border-bottom:7px solid #ccc;border-left:7px solid transparent;border-bottom-color:rgba(0,0,0,0.2);content:''}.navbar .nav>li>.dropdown-menu:after{position:absolute;top:-6px;left:10px;display:inline-block;border-right:6px solid transparent;border-bottom:6px solid #fff;border-left:6px solid transparent;content:''}.navbar-fixed-bottom .nav>li>.dropdown-menu:before{top:auto;bottom:-7px;border-top:7px solid #ccc;border-bottom:0;border-top-color:rgba(0,0,0,0.2)}.navbar-fixed-bottom .nav>li>.dropdown-menu:after{top:auto;bottom:-6px;border-top:6px solid #fff;border-bottom:0}.navbar .nav li.dropdown.open>.dropdown-toggle,.navbar .nav li.dropdown.active>.dropdown-toggle,.navbar .nav li.dropdown.open.active>.dropdown-toggle{color:#555;background-color:#e5e5e5}.navbar .nav li.dropdown>.dropdown-toggle .caret{border-top-color:#777;border-bottom-color:#777}.navbar .nav li.dropdown.open>.dropdown-toggle .caret,.navbar .nav li.dropdown.active>.dropdown-toggle .caret,.navbar .nav li.dropdown.open.active>.dropdown-toggle 
.caret{border-top-color:#555;border-bottom-color:#555}.navbar .pull-right>li>.dropdown-menu,.navbar .nav>li>.dropdown-menu.pull-right{right:0;left:auto}.navbar .pull-right>li>.dropdown-menu:before,.navbar .nav>li>.dropdown-menu.pull-right:before{right:12px;left:auto}.navbar .pull-right>li>.dropdown-menu:after,.navbar .nav>li>.dropdown-menu.pull-right:after{right:13px;left:auto}.navbar .pull-right>li>.dropdown-menu .dropdown-menu,.navbar .nav>li>.dropdown-menu.pull-right .dropdown-menu{right:100%;left:auto;margin-right:-1px;margin-left:0;-webkit-border-radius:6px 0 6px 6px;-moz-border-radius:6px 0 6px 6px;border-radius:6px 0 6px 6px}.navbar-inverse{color:#999}.navbar-inverse .navbar-inner{background-color:#1b1b1b;background-image:-moz-linear-gradient(top,#222,#111);background-image:-webkit-gradient(linear,0 0,0 100%,from(#222),to(#111));background-image:-webkit-linear-gradient(top,#222,#111);background-image:-o-linear-gradient(top,#222,#111);background-image:linear-gradient(to bottom,#222,#111);background-repeat:repeat-x;border-color:#252525;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff222222',endColorstr='#ff111111',GradientType=0)}.navbar-inverse .brand,.navbar-inverse .nav>li>a{color:#999;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.navbar-inverse .brand:hover,.navbar-inverse .nav>li>a:hover{color:#fff}.navbar-inverse .nav>li>a:focus,.navbar-inverse .nav>li>a:hover{color:#fff;background-color:transparent}.navbar-inverse .nav .active>a,.navbar-inverse .nav .active>a:hover,.navbar-inverse .nav .active>a:focus{color:#fff;background-color:#111}.navbar-inverse .navbar-link{color:#999}.navbar-inverse .navbar-link:hover{color:#fff}.navbar-inverse .divider-vertical{border-right-color:#222;border-left-color:#111}.navbar-inverse .nav li.dropdown.open>.dropdown-toggle,.navbar-inverse .nav li.dropdown.active>.dropdown-toggle,.navbar-inverse .nav li.dropdown.open.active>.dropdown-toggle{color:#fff;background-color:#111}.navbar-inverse .nav 
li.dropdown>.dropdown-toggle .caret{border-top-color:#999;border-bottom-color:#999}.navbar-inverse .nav li.dropdown.open>.dropdown-toggle .caret,.navbar-inverse .nav li.dropdown.active>.dropdown-toggle .caret,.navbar-inverse .nav li.dropdown.open.active>.dropdown-toggle .caret{border-top-color:#fff;border-bottom-color:#fff}.navbar-inverse .navbar-search .search-query{color:#fff;background-color:#515151;border-color:#111;-webkit-box-shadow:inset 0 1px 2px rgba(0,0,0,0.1),0 1px 0 rgba(255,255,255,0.15);-moz-box-shadow:inset 0 1px 2px rgba(0,0,0,0.1),0 1px 0 rgba(255,255,255,0.15);box-shadow:inset 0 1px 2px rgba(0,0,0,0.1),0 1px 0 rgba(255,255,255,0.15);-webkit-transition:none;-moz-transition:none;-o-transition:none;transition:none}.navbar-inverse .navbar-search .search-query:-moz-placeholder{color:#ccc}.navbar-inverse .navbar-search .search-query:-ms-input-placeholder{color:#ccc}.navbar-inverse .navbar-search .search-query::-webkit-input-placeholder{color:#ccc}.navbar-inverse .navbar-search .search-query:focus,.navbar-inverse .navbar-search .search-query.focused{padding:5px 15px;color:#333;text-shadow:0 1px 0 #fff;background-color:#fff;border:0;outline:0;-webkit-box-shadow:0 0 3px rgba(0,0,0,0.15);-moz-box-shadow:0 0 3px rgba(0,0,0,0.15);box-shadow:0 0 3px rgba(0,0,0,0.15)}.navbar-inverse .btn-navbar{color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#0e0e0e;*background-color:#040404;background-image:-moz-linear-gradient(top,#151515,#040404);background-image:-webkit-gradient(linear,0 0,0 100%,from(#151515),to(#040404));background-image:-webkit-linear-gradient(top,#151515,#040404);background-image:-o-linear-gradient(top,#151515,#040404);background-image:linear-gradient(to bottom,#151515,#040404);background-repeat:repeat-x;border-color:#040404 #040404 #000;border-color:rgba(0,0,0,0.1) rgba(0,0,0,0.1) 
rgba(0,0,0,0.25);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff151515',endColorstr='#ff040404',GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false)}.navbar-inverse .btn-navbar:hover,.navbar-inverse .btn-navbar:active,.navbar-inverse .btn-navbar.active,.navbar-inverse .btn-navbar.disabled,.navbar-inverse .btn-navbar[disabled]{color:#fff;background-color:#040404;*background-color:#000}.navbar-inverse .btn-navbar:active,.navbar-inverse .btn-navbar.active{background-color:#000 \9}.breadcrumb{padding:8px 15px;margin:0 0 20px;list-style:none;background-color:#f5f5f5;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.breadcrumb li{display:inline-block;*display:inline;text-shadow:0 1px 0 #fff;*zoom:1}.breadcrumb .divider{padding:0 5px;color:#ccc}.breadcrumb .active{color:#999}.pagination{margin:20px 0}.pagination ul{display:inline-block;*display:inline;margin-bottom:0;margin-left:0;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;*zoom:1;-webkit-box-shadow:0 1px 2px rgba(0,0,0,0.05);-moz-box-shadow:0 1px 2px rgba(0,0,0,0.05);box-shadow:0 1px 2px rgba(0,0,0,0.05)}.pagination ul>li{display:inline}.pagination ul>li>a,.pagination ul>li>span{float:left;padding:4px 12px;line-height:20px;text-decoration:none;background-color:#fff;border:1px solid #ddd;border-left-width:0}.pagination ul>li>a:hover,.pagination ul>.active>a,.pagination ul>.active>span{background-color:#f5f5f5}.pagination ul>.active>a,.pagination ul>.active>span{color:#999;cursor:default}.pagination ul>.disabled>span,.pagination ul>.disabled>a,.pagination ul>.disabled>a:hover{color:#999;cursor:default;background-color:transparent}.pagination ul>li:first-child>a,.pagination ul>li:first-child>span{border-left-width:1px;-webkit-border-bottom-left-radius:4px;border-bottom-left-radius:4px;-webkit-border-top-left-radius:4px;border-top-left-radius:4px;-moz-border-radius-bottomleft:4px;-moz-border-radius-topleft:4px}.pagination 
ul>li:last-child>a,.pagination ul>li:last-child>span{-webkit-border-top-right-radius:4px;border-top-right-radius:4px;-webkit-border-bottom-right-radius:4px;border-bottom-right-radius:4px;-moz-border-radius-topright:4px;-moz-border-radius-bottomright:4px}.pagination-centered{text-align:center}.pagination-right{text-align:right}.pagination-large ul>li>a,.pagination-large ul>li>span{padding:11px 19px;font-size:17.5px}.pagination-large ul>li:first-child>a,.pagination-large ul>li:first-child>span{-webkit-border-bottom-left-radius:6px;border-bottom-left-radius:6px;-webkit-border-top-left-radius:6px;border-top-left-radius:6px;-moz-border-radius-bottomleft:6px;-moz-border-radius-topleft:6px}.pagination-large ul>li:last-child>a,.pagination-large ul>li:last-child>span{-webkit-border-top-right-radius:6px;border-top-right-radius:6px;-webkit-border-bottom-right-radius:6px;border-bottom-right-radius:6px;-moz-border-radius-topright:6px;-moz-border-radius-bottomright:6px}.pagination-mini ul>li:first-child>a,.pagination-small ul>li:first-child>a,.pagination-mini ul>li:first-child>span,.pagination-small ul>li:first-child>span{-webkit-border-bottom-left-radius:3px;border-bottom-left-radius:3px;-webkit-border-top-left-radius:3px;border-top-left-radius:3px;-moz-border-radius-bottomleft:3px;-moz-border-radius-topleft:3px}.pagination-mini ul>li:last-child>a,.pagination-small ul>li:last-child>a,.pagination-mini ul>li:last-child>span,.pagination-small ul>li:last-child>span{-webkit-border-top-right-radius:3px;border-top-right-radius:3px;-webkit-border-bottom-right-radius:3px;border-bottom-right-radius:3px;-moz-border-radius-topright:3px;-moz-border-radius-bottomright:3px}.pagination-small ul>li>a,.pagination-small ul>li>span{padding:2px 10px;font-size:11.9px}.pagination-mini ul>li>a,.pagination-mini ul>li>span{padding:1px 6px;font-size:10.5px}.pager{margin:20px 
0;text-align:center;list-style:none;*zoom:1}.pager:before,.pager:after{display:table;line-height:0;content:""}.pager:after{clear:both}.pager li{display:inline}.pager li>a,.pager li>span{display:inline-block;padding:5px 14px;background-color:#fff;border:1px solid #ddd;-webkit-border-radius:15px;-moz-border-radius:15px;border-radius:15px}.pager li>a:hover{text-decoration:none;background-color:#f5f5f5}.pager .next>a,.pager .next>span{float:right}.pager .previous>a,.pager .previous>span{float:left}.pager .disabled>a,.pager .disabled>a:hover,.pager .disabled>span{color:#999;cursor:default;background-color:#fff}.modal-backdrop{position:fixed;top:0;right:0;bottom:0;left:0;z-index:1040;background-color:#000}.modal-backdrop.fade{opacity:0}.modal-backdrop,.modal-backdrop.fade.in{opacity:.8;filter:alpha(opacity=80)}.modal{position:fixed;top:50%;left:50%;z-index:1050;width:560px;margin:-250px 0 0 -280px;background-color:#fff;border:1px solid #999;border:1px solid rgba(0,0,0,0.3);*border:1px solid #999;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;outline:0;-webkit-box-shadow:0 3px 7px rgba(0,0,0,0.3);-moz-box-shadow:0 3px 7px rgba(0,0,0,0.3);box-shadow:0 3px 7px rgba(0,0,0,0.3);-webkit-background-clip:padding-box;-moz-background-clip:padding-box;background-clip:padding-box}.modal.fade{top:-25%;-webkit-transition:opacity .3s linear,top .3s ease-out;-moz-transition:opacity .3s linear,top .3s ease-out;-o-transition:opacity .3s linear,top .3s ease-out;transition:opacity .3s linear,top .3s ease-out}.modal.fade.in{top:50%}.modal-header{padding:9px 15px;border-bottom:1px solid #eee}.modal-header .close{margin-top:2px}.modal-header h3{margin:0;line-height:30px}.modal-body{max-height:400px;padding:15px;overflow-y:auto}.modal-form{margin-bottom:0}.modal-footer{padding:14px 15px 15px;margin-bottom:0;text-align:right;background-color:#f5f5f5;border-top:1px solid #ddd;-webkit-border-radius:0 0 6px 6px;-moz-border-radius:0 0 6px 6px;border-radius:0 0 6px 
6px;*zoom:1;-webkit-box-shadow:inset 0 1px 0 #fff;-moz-box-shadow:inset 0 1px 0 #fff;box-shadow:inset 0 1px 0 #fff}.modal-footer:before,.modal-footer:after{display:table;line-height:0;content:""}.modal-footer:after{clear:both}.modal-footer .btn+.btn{margin-bottom:0;margin-left:5px}.modal-footer .btn-group .btn+.btn{margin-left:-1px}.modal-footer .btn-block+.btn-block{margin-left:0}.tooltip{position:absolute;z-index:1030;display:block;padding:5px;font-size:11px;opacity:0;filter:alpha(opacity=0);visibility:visible}.tooltip.in{opacity:.8;filter:alpha(opacity=80)}.tooltip.top{margin-top:-3px}.tooltip.right{margin-left:3px}.tooltip.bottom{margin-top:3px}.tooltip.left{margin-left:-3px}.tooltip-inner{max-width:200px;padding:3px 8px;color:#fff;text-align:center;text-decoration:none;background-color:#000;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.tooltip-arrow{position:absolute;width:0;height:0;border-color:transparent;border-style:solid}.tooltip.top .tooltip-arrow{bottom:0;left:50%;margin-left:-5px;border-top-color:#000;border-width:5px 5px 0}.tooltip.right .tooltip-arrow{top:50%;left:0;margin-top:-5px;border-right-color:#000;border-width:5px 5px 5px 0}.tooltip.left .tooltip-arrow{top:50%;right:0;margin-top:-5px;border-left-color:#000;border-width:5px 0 5px 5px}.tooltip.bottom .tooltip-arrow{top:0;left:50%;margin-left:-5px;border-bottom-color:#000;border-width:0 5px 5px}.popover{position:absolute;top:0;left:0;z-index:1010;display:none;width:236px;padding:1px;background-color:#fff;border:1px solid #ccc;border:1px solid rgba(0,0,0,0.2);-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px;-webkit-box-shadow:0 5px 10px rgba(0,0,0,0.2);-moz-box-shadow:0 5px 10px rgba(0,0,0,0.2);box-shadow:0 5px 10px 
rgba(0,0,0,0.2);-webkit-background-clip:padding-box;-moz-background-clip:padding;background-clip:padding-box}.popover.top{margin-top:-10px}.popover.right{margin-left:10px}.popover.bottom{margin-top:10px}.popover.left{margin-left:-10px}.popover-title{padding:8px 14px;margin:0;font-size:14px;font-weight:normal;line-height:18px;background-color:#f7f7f7;border-bottom:1px solid #ebebeb;-webkit-border-radius:5px 5px 0 0;-moz-border-radius:5px 5px 0 0;border-radius:5px 5px 0 0}.popover-content{padding:9px 14px}.popover-content p,.popover-content ul,.popover-content ol{margin-bottom:0}.popover .arrow,.popover .arrow:after{position:absolute;display:inline-block;width:0;height:0;border-color:transparent;border-style:solid}.popover .arrow:after{z-index:-1;content:""}.popover.top .arrow{bottom:-10px;left:50%;margin-left:-10px;border-top-color:#fff;border-width:10px 10px 0}.popover.top .arrow:after{bottom:-1px;left:-11px;border-top-color:rgba(0,0,0,0.25);border-width:11px 11px 0}.popover.right .arrow{top:50%;left:-10px;margin-top:-10px;border-right-color:#fff;border-width:10px 10px 10px 0}.popover.right .arrow:after{bottom:-11px;left:-1px;border-right-color:rgba(0,0,0,0.25);border-width:11px 11px 11px 0}.popover.bottom .arrow{top:-10px;left:50%;margin-left:-10px;border-bottom-color:#fff;border-width:0 10px 10px}.popover.bottom .arrow:after{top:-1px;left:-11px;border-bottom-color:rgba(0,0,0,0.25);border-width:0 11px 11px}.popover.left .arrow{top:50%;right:-10px;margin-top:-10px;border-left-color:#fff;border-width:10px 0 10px 10px}.popover.left .arrow:after{right:-1px;bottom:-11px;border-left-color:rgba(0,0,0,0.25);border-width:11px 0 11px 11px}.thumbnails{margin-left:-20px;list-style:none;*zoom:1}.thumbnails:before,.thumbnails:after{display:table;line-height:0;content:""}.thumbnails:after{clear:both}.row-fluid .thumbnails{margin-left:0}.thumbnails>li{float:left;margin-bottom:20px;margin-left:20px}.thumbnail{display:block;padding:4px;line-height:20px;border:1px solid 
#ddd;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;-webkit-box-shadow:0 1px 3px rgba(0,0,0,0.055);-moz-box-shadow:0 1px 3px rgba(0,0,0,0.055);box-shadow:0 1px 3px rgba(0,0,0,0.055);-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;-o-transition:all .2s ease-in-out;transition:all .2s ease-in-out}a.thumbnail:hover{border-color:#08c;-webkit-box-shadow:0 1px 4px rgba(0,105,214,0.25);-moz-box-shadow:0 1px 4px rgba(0,105,214,0.25);box-shadow:0 1px 4px rgba(0,105,214,0.25)}.thumbnail>img{display:block;max-width:100%;margin-right:auto;margin-left:auto}.thumbnail .caption{padding:9px;color:#555}.media,.media-body{overflow:hidden;*overflow:visible;zoom:1}.media,.media .media{margin-top:15px}.media:first-child{margin-top:0}.media-object{display:block}.media-heading{margin:0 0 5px}.media .pull-left{margin-right:10px}.media .pull-right{margin-left:10px}.media-list{margin-left:0;list-style:none}.label,.badge{display:inline-block;padding:2px 4px;font-size:11.844px;font-weight:bold;line-height:14px;color:#fff;text-shadow:0 -1px 0 
rgba(0,0,0,0.25);white-space:nowrap;vertical-align:baseline;background-color:#999}.label{-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px}.badge{padding-right:9px;padding-left:9px;-webkit-border-radius:9px;-moz-border-radius:9px;border-radius:9px}a.label:hover,a.badge:hover{color:#fff;text-decoration:none;cursor:pointer}.label-important,.badge-important{background-color:#b94a48}.label-important[href],.badge-important[href]{background-color:#953b39}.label-warning,.badge-warning{background-color:#f89406}.label-warning[href],.badge-warning[href]{background-color:#c67605}.label-success,.badge-success{background-color:#468847}.label-success[href],.badge-success[href]{background-color:#356635}.label-info,.badge-info{background-color:#3a87ad}.label-info[href],.badge-info[href]{background-color:#2d6987}.label-inverse,.badge-inverse{background-color:#333}.label-inverse[href],.badge-inverse[href]{background-color:#1a1a1a}.btn .label,.btn .badge{position:relative;top:-1px}.btn-mini .label,.btn-mini .badge{top:0}@-webkit-keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}@-moz-keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}@-ms-keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}@-o-keyframes progress-bar-stripes{from{background-position:0 0}to{background-position:40px 0}}@keyframes progress-bar-stripes{from{background-position:40px 0}to{background-position:0 0}}.progress{height:20px;margin-bottom:20px;overflow:hidden;background-color:#f7f7f7;background-image:-moz-linear-gradient(top,#f5f5f5,#f9f9f9);background-image:-webkit-gradient(linear,0 0,0 100%,from(#f5f5f5),to(#f9f9f9));background-image:-webkit-linear-gradient(top,#f5f5f5,#f9f9f9);background-image:-o-linear-gradient(top,#f5f5f5,#f9f9f9);background-image:linear-gradient(to 
bottom,#f5f5f5,#f9f9f9);background-repeat:repeat-x;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5',endColorstr='#fff9f9f9',GradientType=0);-webkit-box-shadow:inset 0 1px 2px rgba(0,0,0,0.1);-moz-box-shadow:inset 0 1px 2px rgba(0,0,0,0.1);box-shadow:inset 0 1px 2px rgba(0,0,0,0.1)}.progress .bar{float:left;width:0;height:100%;font-size:12px;color:#fff;text-align:center;text-shadow:0 -1px 0 rgba(0,0,0,0.25);background-color:#0e90d2;background-image:-moz-linear-gradient(top,#149bdf,#0480be);background-image:-webkit-gradient(linear,0 0,0 100%,from(#149bdf),to(#0480be));background-image:-webkit-linear-gradient(top,#149bdf,#0480be);background-image:-o-linear-gradient(top,#149bdf,#0480be);background-image:linear-gradient(to bottom,#149bdf,#0480be);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff149bdf',endColorstr='#ff0480be',GradientType=0);-webkit-box-shadow:inset 0 -1px 0 rgba(0,0,0,0.15);-moz-box-shadow:inset 0 -1px 0 rgba(0,0,0,0.15);box-shadow:inset 0 -1px 0 rgba(0,0,0,0.15);-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;-webkit-transition:width .6s ease;-moz-transition:width .6s ease;-o-transition:width .6s ease;transition:width .6s ease}.progress .bar+.bar{-webkit-box-shadow:inset 1px 0 0 rgba(0,0,0,0.15),inset 0 -1px 0 rgba(0,0,0,0.15);-moz-box-shadow:inset 1px 0 0 rgba(0,0,0,0.15),inset 0 -1px 0 rgba(0,0,0,0.15);box-shadow:inset 1px 0 0 rgba(0,0,0,0.15),inset 0 -1px 0 rgba(0,0,0,0.15)}.progress-striped .bar{background-color:#149bdf;background-image:-webkit-gradient(linear,0 100%,100% 0,color-stop(0.25,rgba(255,255,255,0.15)),color-stop(0.25,transparent),color-stop(0.5,transparent),color-stop(0.5,rgba(255,255,255,0.15)),color-stop(0.75,rgba(255,255,255,0.15)),color-stop(0.75,transparent),to(transparent));background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 
25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-moz-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);-webkit-background-size:40px 40px;-moz-background-size:40px 40px;-o-background-size:40px 40px;background-size:40px 40px}.progress.active .bar{-webkit-animation:progress-bar-stripes 2s linear infinite;-moz-animation:progress-bar-stripes 2s linear infinite;-ms-animation:progress-bar-stripes 2s linear infinite;-o-animation:progress-bar-stripes 2s linear infinite;animation:progress-bar-stripes 2s linear infinite}.progress-danger .bar,.progress .bar-danger{background-color:#dd514c;background-image:-moz-linear-gradient(top,#ee5f5b,#c43c35);background-image:-webkit-gradient(linear,0 0,0 100%,from(#ee5f5b),to(#c43c35));background-image:-webkit-linear-gradient(top,#ee5f5b,#c43c35);background-image:-o-linear-gradient(top,#ee5f5b,#c43c35);background-image:linear-gradient(to bottom,#ee5f5b,#c43c35);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffee5f5b',endColorstr='#ffc43c35',GradientType=0)}.progress-danger.progress-striped .bar,.progress-striped .bar-danger{background-color:#ee5f5b;background-image:-webkit-gradient(linear,0 100%,100% 
0,color-stop(0.25,rgba(255,255,255,0.15)),color-stop(0.25,transparent),color-stop(0.5,transparent),color-stop(0.5,rgba(255,255,255,0.15)),color-stop(0.75,rgba(255,255,255,0.15)),color-stop(0.75,transparent),to(transparent));background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-moz-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent)}.progress-success .bar,.progress .bar-success{background-color:#5eb95e;background-image:-moz-linear-gradient(top,#62c462,#57a957);background-image:-webkit-gradient(linear,0 0,0 100%,from(#62c462),to(#57a957));background-image:-webkit-linear-gradient(top,#62c462,#57a957);background-image:-o-linear-gradient(top,#62c462,#57a957);background-image:linear-gradient(to bottom,#62c462,#57a957);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff62c462',endColorstr='#ff57a957',GradientType=0)}.progress-success.progress-striped .bar,.progress-striped .bar-success{background-color:#62c462;background-image:-webkit-gradient(linear,0 100%,100% 0,color-stop(0.25,rgba(255,255,255,0.15)),color-stop(0.25,transparent),color-stop(0.5,transparent),color-stop(0.5,rgba(255,255,255,0.15)),color-stop(0.75,rgba(255,255,255,0.15)),color-stop(0.75,transparent),to(transparent));background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 
50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-moz-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent)}.progress-info .bar,.progress .bar-info{background-color:#4bb1cf;background-image:-moz-linear-gradient(top,#5bc0de,#339bb9);background-image:-webkit-gradient(linear,0 0,0 100%,from(#5bc0de),to(#339bb9));background-image:-webkit-linear-gradient(top,#5bc0de,#339bb9);background-image:-o-linear-gradient(top,#5bc0de,#339bb9);background-image:linear-gradient(to bottom,#5bc0de,#339bb9);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de',endColorstr='#ff339bb9',GradientType=0)}.progress-info.progress-striped .bar,.progress-striped .bar-info{background-color:#5bc0de;background-image:-webkit-gradient(linear,0 100%,100% 0,color-stop(0.25,rgba(255,255,255,0.15)),color-stop(0.25,transparent),color-stop(0.5,transparent),color-stop(0.5,rgba(255,255,255,0.15)),color-stop(0.75,rgba(255,255,255,0.15)),color-stop(0.75,transparent),to(transparent));background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-moz-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 
50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent)}.progress-warning .bar,.progress .bar-warning{background-color:#faa732;background-image:-moz-linear-gradient(top,#fbb450,#f89406);background-image:-webkit-gradient(linear,0 0,0 100%,from(#fbb450),to(#f89406));background-image:-webkit-linear-gradient(top,#fbb450,#f89406);background-image:-o-linear-gradient(top,#fbb450,#f89406);background-image:linear-gradient(to bottom,#fbb450,#f89406);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffbb450',endColorstr='#fff89406',GradientType=0)}.progress-warning.progress-striped .bar,.progress-striped .bar-warning{background-color:#fbb450;background-image:-webkit-gradient(linear,0 100%,100% 0,color-stop(0.25,rgba(255,255,255,0.15)),color-stop(0.25,transparent),color-stop(0.5,transparent),color-stop(0.5,rgba(255,255,255,0.15)),color-stop(0.75,rgba(255,255,255,0.15)),color-stop(0.75,transparent),to(transparent));background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-moz-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,0.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,0.15) 50%,rgba(255,255,255,0.15) 75%,transparent 75%,transparent)}.accordion{margin-bottom:20px}.accordion-group{margin-bottom:2px;border:1px solid 
#e5e5e5;-webkit-border-radius:4px;-moz-border-radius:4px;border-radius:4px}.accordion-heading{border-bottom:0}.accordion-heading .accordion-toggle{display:block;padding:8px 15px}.accordion-toggle{cursor:pointer}.accordion-inner{padding:9px 15px;border-top:1px solid #e5e5e5}.carousel{position:relative;margin-bottom:20px;line-height:1}.carousel-inner{position:relative;width:100%;overflow:hidden}.carousel .item{position:relative;display:none;-webkit-transition:.6s ease-in-out left;-moz-transition:.6s ease-in-out left;-o-transition:.6s ease-in-out left;transition:.6s ease-in-out left}.carousel .item>img{display:block;line-height:1}.carousel .active,.carousel .next,.carousel .prev{display:block}.carousel .active{left:0}.carousel .next,.carousel .prev{position:absolute;top:0;width:100%}.carousel .next{left:100%}.carousel .prev{left:-100%}.carousel .next.left,.carousel .prev.right{left:0}.carousel .active.left{left:-100%}.carousel .active.right{left:100%}.carousel-control{position:absolute;top:40%;left:15px;width:40px;height:40px;margin-top:-20px;font-size:60px;font-weight:100;line-height:30px;color:#fff;text-align:center;background:#222;border:3px solid #fff;-webkit-border-radius:23px;-moz-border-radius:23px;border-radius:23px;opacity:.5;filter:alpha(opacity=50)}.carousel-control.right{right:15px;left:auto}.carousel-control:hover{color:#fff;text-decoration:none;opacity:.9;filter:alpha(opacity=90)}.carousel-caption{position:absolute;right:0;bottom:0;left:0;padding:15px;background:#333;background:rgba(0,0,0,0.75)}.carousel-caption h4,.carousel-caption p{line-height:20px;color:#fff}.carousel-caption h4{margin:0 0 5px}.carousel-caption p{margin-bottom:0}.hero-unit{padding:60px;margin-bottom:30px;font-size:18px;font-weight:200;line-height:30px;color:inherit;background-color:#eee;-webkit-border-radius:6px;-moz-border-radius:6px;border-radius:6px}.hero-unit h1{margin-bottom:0;font-size:60px;line-height:1;letter-spacing:-1px;color:inherit}.hero-unit 
li{line-height:30px}.pull-right{float:right}.pull-left{float:left}.hide{display:none}.show{display:block}.invisible{visibility:hidden}.affix{position:fixed} diff --git a/www/bootstrap/img/glyphicons-halflings-white.png b/www/bootstrap/img/glyphicons-halflings-white.png new file mode 100644 index 0000000000000000000000000000000000000000..3bf6484a29d8da269f9bc874b25493a45fae3bae GIT binary patch literal 8777 zcmZvC1yGz#v+m*$LXcp=A$ZWB0fL7wNbp_U*$~{_gL`my3oP#L!5tQYy99Ta`+g_q zKlj|KJ2f@c)ARJx{q*bbkhN_!|Wn*Vos8{TEhUT@5e;_WJsIMMcG5%>DiS&dv_N`4@J0cnAQ-#>RjZ z00W5t&tJ^l-QC*ST1-p~00u^9XJ=AUl7oW-;2a+x2k__T=grN{+1c4XK0ZL~^z^i$ zp&>vEhr@4fZWb380S18T&!0cQ3IKpHF)?v=b_NIm0Q>vwY7D0baZ)n z31Fa5sELUQARIVaU0nqf0XzT+fB_63aA;@<$l~wse|mcA;^G1TmX?-)e)jkGPfkuA z92@|!<>h5S_4f8QP-JRq>d&7)^Yin8l7K8gED$&_FaV?gY+wLjpoW%~7NDe=nHfMG z5DO3j{R9kv5GbssrUpO)OyvVrlx>u0UKD0i;Dpm5S5dY16(DL5l{ixz|mhJU@&-OWCTb7_%}8-fE(P~+XIRO zJU|wp1|S>|J3KrLcz^+v1f&BDpd>&MAaibR4#5A_4(MucZwG9E1h4@u0P@C8;oo+g zIVj7kfJi{oV~E(NZ*h(@^-(Q(C`Psb3KZ{N;^GB(a8NE*Vwc715!9 zr-H4Ao|T_c6+VT_JH9H+P3>iXSt!a$F`>s`jn`w9GZ_~B!{0soaiV|O_c^R2aWa%}O3jUE)WO=pa zs~_Wz08z|ieY5A%$@FcBF9^!1a}m5ks@7gjn;67N>}S~Hrm`4sM5Hh`q7&5-N{|31 z6x1{ol7BnskoViZ0GqbLa#kW`Z)VCjt1MysKg|rT zi!?s##Ck>8c zpi|>$lGlw#@yMNi&V4`6OBGJ(H&7lqLlcTQ&1zWriG_fL>BnFcr~?;E93{M-xIozQ zO=EHQ#+?<}%@wbWWv23#!V70h9MOuUVaU>3kpTvYfc|LBw?&b*89~Gc9i&8tlT#kF ztpbZoAzkdB+UTy=tx%L3Z4)I{zY(Kb)eg{InobSJmNwPZt$14aS-uc4eKuY8h$dtfyxu^a%zA)>fYI&)@ZXky?^{5>xSC?;w4r&td6vBdi%vHm4=XJH!3yL3?Ep+T5aU_>i;yr_XGq zxZfCzUU@GvnoIk+_Nd`aky>S&H!b*{A%L>?*XPAgWL(Vf(k7qUS}>Zn=U(ZfcOc{B z3*tOHH@t5Ub5D~#N7!Fxx}P2)sy{vE_l(R7$aW&CX>c|&HY+7};vUIietK%}!phrCuh+;C@1usp;XLU<8Gq8P!rEI3ieg#W$!= zQcZr{hp>8sF?k&Yl0?B84OneiQxef-4TEFrq3O~JAZR}yEJHA|Xkqd49tR&8oq{zP zY@>J^HBV*(gJvJZc_0VFN7Sx?H7#75E3#?N8Z!C+_f53YU}pyggxx1?wQi5Yb-_`I`_V*SMx5+*P^b=ec5RON-k1cIlsBLk}(HiaJyab0`CI zo0{=1_LO$~oE2%Tl_}KURuX<`+mQN_sTdM&* zkFf!Xtl^e^gTy6ON=&gTn6)$JHQq2)33R@_!#9?BLNq-Wi{U|rVX7Vny$l6#+SZ@KvQt@VYb%<9JfapI^b9j=wa+Tqb4ei;8c5 
z&1>Uz@lVFv6T4Z*YU$r4G`g=91lSeA<=GRZ!*KTWKDPR}NPUW%peCUj`Ix_LDq!8| zMH-V`Pv!a~QkTL||L@cqiTz)*G-0=ytr1KqTuFPan9y4gYD5>PleK`NZB$ev@W%t= zkp)_=lBUTLZJpAtZg;pjI;7r2y|26-N7&a(hX|`1YNM9N8{>8JAuv}hp1v`3JHT-=5lbXpbMq7X~2J5Kl zh7tyU`_AusMFZ{ej9D;Uyy;SQ!4nwgSnngsYBwdS&EO3NS*o04)*juAYl;57c2Ly0(DEZ8IY?zSph-kyxu+D`tt@oU{32J#I{vmy=#0ySPK zA+i(A3yl)qmTz*$dZi#y9FS;$;h%bY+;StNx{_R56Otq+?pGe^T^{5d7Gs&?`_r`8 zD&dzOA|j8@3A&FR5U3*eQNBf<4^4W_iS_()*8b4aaUzfk2 zzIcMWSEjm;EPZPk{j{1>oXd}pXAj!NaRm8{Sjz!D=~q3WJ@vmt6ND_?HI~|wUS1j5 z9!S1MKr7%nxoJ3k`GB^7yV~*{n~O~n6($~x5Bu{7s|JyXbAyKI4+tO(zZYMslK;Zc zzeHGVl{`iP@jfSKq>R;{+djJ9n%$%EL()Uw+sykjNQdflkJZSjqV_QDWivbZS~S{K zkE@T^Jcv)Dfm93!mf$XYnCT--_A$zo9MOkPB6&diM8MwOfV?+ApNv`moV@nqn>&lv zYbN1-M|jc~sG|yLN^1R2=`+1ih3jCshg`iP&mY$GMTcY^W^T`WOCX!{-KHmZ#GiRH zYl{|+KLn5!PCLtBy~9i}`#d^gCDDx$+GQb~uc;V#K3OgbbOG0j5{BRG-si%Bo{@lB zGIt+Ain8^C`!*S0d0OSWVO+Z89}}O8aFTZ>p&k}2gGCV zh#<$gswePFxWGT$4DC^8@84_e*^KT74?7n8!$8cg=sL$OlKr&HMh@Rr5%*Wr!xoOl zo7jItnj-xYgVTX)H1=A2bD(tleEH57#V{xAeW_ezISg5OC zg=k>hOLA^urTH_e6*vSYRqCm$J{xo}-x3@HH;bsHD1Z`Pzvsn}%cvfw%Q(}h`Dgtb z0_J^niUmoCM5$*f)6}}qi(u;cPgxfyeVaaVmOsG<)5`6tzU4wyhF;k|~|x>7-2hXpVBpc5k{L4M`Wbe6Q?tr^*B z`Y*>6*&R#~%JlBIitlZ^qGe3s21~h3U|&k%%jeMM;6!~UH|+0+<5V-_zDqZQN79?n?!Aj!Nj`YMO9?j>uqI9-Tex+nJD z%e0#Yca6(zqGUR|KITa?9x-#C0!JKJHO(+fy@1!B$%ZwJwncQW7vGYv?~!^`#L~Um zOL++>4qmqW`0Chc0T23G8|vO)tK=Z2`gvS4*qpqhIJCEv9i&&$09VO8YOz|oZ+ubd zNXVdLc&p=KsSgtmIPLN69P7xYkYQ1vJ?u1g)T!6Ru`k2wkdj*wDC)VryGu2=yb0?F z>q~~e>KZ0d_#7f3UgV%9MY1}vMgF{B8yfE{HL*pMyhYF)WDZ^^3vS8F zGlOhs%g_~pS3=WQ#494@jAXwOtr^Y|TnQ5zki>qRG)(oPY*f}U_=ip_{qB0!%w7~G zWE!P4p3khyW-JJnE>eECuYfI?^d366Shq!Wm#x&jAo>=HdCllE$>DPO0N;y#4G)D2y#B@5=N=+F%Xo2n{gKcPcK2!hP*^WSXl+ut; zyLvVoY>VL{H%Kd9^i~lsb8j4>$EllrparEOJNT?Ym>vJa$(P^tOG)5aVb_5w^*&M0 zYOJ`I`}9}UoSnYg#E(&yyK(tqr^@n}qU2H2DhkK-`2He% zgXr_4kpXoQHxAO9S`wEdmqGU4j=1JdG!OixdqB4PPP6RXA}>GM zumruUUH|ZG2$bBj)Qluj&uB=dRb)?^qomw?Z$X%#D+Q*O97eHrgVB2*mR$bFBU`*} zIem?dM)i}raTFDn@5^caxE^XFXVhBePmH9fqcTi`TLaXiueH=@06sl}>F%}h9H_e9 
z>^O?LxM1EjX}NVppaO@NNQr=AtHcH-BU{yBT_vejJ#J)l^cl69Z7$sk`82Zyw7Wxt z=~J?hZm{f@W}|96FUJfy65Gk8?^{^yjhOahUMCNNpt5DJw}ZKH7b!bGiFY9y6OY&T z_N)?Jj(MuLTN36ZCJ6I5Xy7uVlrb$o*Z%=-)kPo9s?<^Yqz~!Z* z_mP8(unFq65XSi!$@YtieSQ!<7IEOaA9VkKI?lA`*(nURvfKL8cX}-+~uw9|_5)uC2`ZHcaeX7L8aG6Ghleg@F9aG%X$#g6^yP5apnB>YTz&EfS{q z9UVfSyEIczebC)qlVu5cOoMzS_jrC|)rQlAzK7sfiW0`M8mVIohazPE9Jzn*qPt%6 zZL8RELY@L09B83@Be;x5V-IHnn$}{RAT#<2JA%ttlk#^(%u}CGze|1JY5MPhbfnYG zIw%$XfBmA-<_pKLpGKwbRF$#P;@_)ech#>vj25sv25VM$ouo)?BXdRcO{)*OwTw)G zv43W~T6ekBMtUD%5Bm>`^Ltv!w4~65N!Ut5twl!Agrzyq4O2Fi3pUMtCU~>9gt_=h-f% z;1&OuSu?A_sJvIvQ+dZNo3?m1%b1+s&UAx?8sUHEe_sB7zkm4R%6)<@oYB_i5>3Ip zIA+?jVdX|zL{)?TGpx+=Ta>G80}0}Ax+722$XFNJsC1gcH56{8B)*)eU#r~HrC&}` z|EWW92&;6y;3}!L5zXa385@?-D%>dSvyK;?jqU2t_R3wvBW;$!j45uQ7tyEIQva;Db}r&bR3kqNSh)Q_$MJ#Uj3Gj1F;)sO|%6z#@<+ zi{pbYsYS#u`X$Nf($OS+lhw>xgjos1OnF^$-I$u;qhJswhH~p|ab*nO>zBrtb0ndn zxV0uh!LN`&xckTP+JW}gznSpU492)u+`f{9Yr)js`NmfYH#Wdtradc0TnKNz@Su!e zu$9}G_=ku;%4xk}eXl>)KgpuT>_<`Ud(A^a++K&pm3LbN;gI}ku@YVrA%FJBZ5$;m zobR8}OLtW4-i+qPPLS-(7<>M{)rhiPoi@?&vDeVq5%fmZk=mDdRV>Pb-l7pP1y6|J z8I>sF+TypKV=_^NwBU^>4JJq<*14GLfM2*XQzYdlqqjnE)gZsPW^E@mp&ww* zW9i>XL=uwLVZ9pO*8K>t>vdL~Ek_NUL$?LQi5sc#1Q-f6-ywKcIT8Kw?C(_3pbR`e|)%9S-({if|E+hR2W!&qfQ&UiF^I!|M#xhdWsenv^wpKCBiuxXbnp85`{i|;BM?Ba`lqTA zyRm=UWJl&E{8JzYDHFu>*Z10-?#A8D|5jW9Ho0*CAs0fAy~MqbwYuOq9jjt9*nuHI zbDwKvh)5Ir$r!fS5|;?Dt>V+@F*v8=TJJF)TdnC#Mk>+tGDGCw;A~^PC`gUt*<(|i zB{{g{`uFehu`$fm4)&k7`u{xIV)yvA(%5SxX9MS80p2EKnLtCZ>tlX>*Z6nd&6-Mv$5rHD*db;&IBK3KH&M<+ArlGXDRdX1VVO4)&R$f4NxXI>GBh zSv|h>5GDAI(4E`@F?EnW zS>#c&Gw6~_XL`qQG4bK`W*>hek4LX*efn6|_MY+rXkNyAuu?NxS%L7~9tD3cn7&p( zCtfqe6sjB&Q-Vs7BP5+%;#Gk};4xtwU!KY0XXbmkUy$kR9)!~?*v)qw00!+Yg^#H> zc#8*z6zZo>+(bud?K<*!QO4ehiTCK&PD4G&n)Tr9X_3r-we z?fI+}-G~Yn93gI6F{}Dw_SC*FLZ)5(85zp4%uubtD)J)UELLkvGk4#tw&Tussa)mTD$R2&O~{ zCI3>fr-!-b@EGRI%g0L8UU%%u_<;e9439JNV;4KSxd|78v+I+8^rmMf3f40Jb}wEszROD?xBZu>Ll3;sUIoNxDK3|j3*sam2tC@@e$ z^!;+AK>efeBJB%ALsQ{uFui)oDoq()2USi?n=6C3#eetz?wPswc={I<8x=(8lE4EIsUfyGNZ{|KYn1IR|=E==f 
z(;!A5(-2y^2xRFCSPqzHAZn5RCN_bp22T(KEtjA(rFZ%>a4@STrHZflxKoqe9Z4@^ zM*scx_y73?Q{vt6?~WEl?2q*;@8 z3M*&@%l)SQmXkcUm)d@GT2#JdzhfSAP9|n#C;$E8X|pwD!r#X?0P>0ZisQ~TNqupW z*lUY~+ikD`vQb?@SAWX#r*Y+;=_|oacL$2CL$^(mV}aKO77pg}O+-=T1oLBT5sL2i z42Qth2+0@C`c+*D0*5!qy26sis<9a7>LN2{z%Qj49t z=L@x`4$ALHb*3COHoT?5S_c(Hs}g!V>W^=6Q0}zaubkDn)(lTax0+!+%B}9Vqw6{H zvL|BRM`O<@;eVi1DzM!tXtBrA20Ce@^Jz|>%X-t`vi-%WweXCh_LhI#bUg2*pcP~R z*RuTUzBKLXO~~uMd&o$v3@d0shHfUjC6c539PE6rF&;Ufa(Rw@K1*m7?f5)t`MjH0 z)_V(cajV5Am>f!kWcI@5rE8t6$S>5M=k=aRZROH6fA^jJp~2NlR4;Q2>L$7F#RT#9 z>4@1RhWG`Khy>P2j1Yx^BBL{S`niMaxlSWV-JBU0-T9zZ%>7mR3l$~QV$({o0;jTI ze5=cN^!Bc2bT|BcojXp~K#2cM>OTe*cM{Kg-j*CkiW)EGQot^}s;cy8_1_@JA0Whq zlrNr+R;Efa+`6N)s5rH*|E)nYZ3uqkk2C(E7@A|3YI`ozP~9Lexx#*1(r8luq+YPk z{J}c$s` zPM35Fx(YWB3Z5IYnN+L_4|jaR(5iWJi2~l&xy}aU7kW?o-V*6Av2wyZTG!E2KSW2* zGRLQkQU;Oz##ie-Z4fI)WSRxn$(ZcD;TL+;^r=a4(G~H3ZhK$lSXZj?cvyY8%d9JM zzc3#pD^W_QnWy#rx#;c&N@sqHhrnHRmj#i;s%zLm6SE(n&BWpd&f7>XnjV}OlZntI70fq%8~9<7 zMYaw`E-rp49-oC1N_uZTo)Cu%RR2QWdHpzQIcNsoDp`3xfP+`gI?tVQZ4X={qU?(n zV>0ASES^Xuc;9JBji{)RnFL(Lez;8XbB1uWaMp@p?7xhXk6V#!6B@aP4Rz7-K%a>i z?fvf}va_DGUXlI#4--`A3qK7J?-HwnG7O~H2;zR~RLW)_^#La!=}+>KW#anZ{|^D3 B7G?kd literal 0 HcmV?d00001 diff --git a/www/bootstrap/img/glyphicons-halflings.png b/www/bootstrap/img/glyphicons-halflings.png new file mode 100644 index 0000000000000000000000000000000000000000..a9969993201f9cee63cf9f49217646347297b643 GIT binary patch literal 12799 zcma*OWmH^Ivn@*S;K3nSf_t!#;0f+&pm7Po8`nk}2q8f5;M%x$SdAkd9FAvlc$ zx660V9e3Ox@4WZ^?7jZ%QFGU-T~%||Ug4iK6bbQY@zBuF2$hxOw9wF=A)nUSxR_5@ zEX>HBryGrjyuOFFv$Y4<+|3H@gQfEqD<)+}a~mryD|1U9*I_FOG&F%+Ww{SJ-V2BR zjt<81Ek$}Yb*95D4RS0HCps|uLyovt;P05hchQb-u2bzLtmog&f2}1VlNhxXV);S9 zM2buBg~!q9PtF)&KGRgf3#z7B(hm5WlNClaCWFs!-P!4-u*u5+=+D|ZE9e`KvhTHT zJBnLwGM%!u&vlE%1ytJ=!xt~y_YkFLQb6bS!E+s8l7PiPGSt9xrmg?LV&&SL?J~cI zS(e9TF1?SGyh+M_p@o1dyWu7o7_6p;N6hO!;4~ z2B`I;y`;$ZdtBpvK5%oQ^p4eR2L)BH>B$FQeC*t)c`L71gXHPUa|vyu`Bnz)H$ZcXGve(}XvR!+*8a>BLV;+ryG1kt0=)ytl 
zNJxFUN{V7P?#|Cp85QTa@(*Q3%K-R(Pkv1N8YU*(d(Y}9?PQ(j;NzWoEVWRD-~H$=f>j9~PN^BM2okI(gY-&_&BCV6RP&I$FnSEM3d=0fCxbxA6~l>54-upTrw zYgX@%m>jsSGi`0cQt6b8cX~+02IghVlNblR7eI;0ps}mpWUcxty1yG56C5rh%ep(X z?)#2d?C<4t-KLc*EAn>>M8%HvC1TyBSoPNg(4id~H8JwO#I)Bf;N*y6ai6K9_bA`4 z_g9(-R;qyH&6I$`b42v|0V3Z8IXN*p*8g$gE98+JpXNY+jXxU0zsR^W$#V=KP z3AEFp@OL}WqwOfsV<)A^UTF4&HF1vQecz?LWE@p^Z2){=KEC_3Iopx_eS42>DeiDG zWMXGbYfG~W7C8s@@m<_?#Gqk;!&)_Key@^0xJxrJahv{B&{^!>TV7TEDZlP|$=ZCz zmX=ZWtt4QZKx**)lQQoW8y-XLiOQy#T`2t}p6l*S`68ojyH@UXJ-b~@tN`WpjF z%7%Yzv807gsO!v=!(2uR)16!&U5~VPrPHtGzUU?2w(b1Xchq}(5Ed^G|SD7IG+kvgyVksU) z(0R)SW1V(>&q2nM%Z!C9=;pTg!(8pPSc%H01urXmQI6Gi^dkYCYfu6b4^tW))b^U+ z$2K&iOgN_OU7n#GC2jgiXU{caO5hZt0(>k+c^(r><#m|#J^s?zA6pi;^#*rp&;aqL zRcZi0Q4HhVX3$ybclxo4FFJW*`IV`)Bj_L3rQe?5{wLJh168Ve1jZv+f1D}f0S$N= zm4i|9cEWz&C9~ZI3q*gwWH^<6sBWuphgy@S3Qy?MJiL>gwd|E<2h9-$3;gT9V~S6r z)cAcmE0KXOwDA5eJ02-75d~f?3;n7a9d_xPBJaO;Z)#@s7gk5$Qn(Fc^w@9c5W0zY z59is0?Mt^@Rolcn{4%)Ioat(kxQH6}hIykSA)zht=9F_W*D#<}N(k&&;k;&gKkWIL z0Of*sP=X(Uyu$Pw;?F@?j{}=>{aSHFcii#78FC^6JGrg-)!)MV4AKz>pXnhVgTgx8 z1&5Y=>|8RGA6++FrSy=__k_imx|z-EI@foKi>tK0Hq2LetjUotCgk2QFXaej!BWYL zJc{fv(&qA7UUJ|AXLc5z*_NW#yWzKtl(c8mEW{A>5Hj^gfZ^HC9lQNQ?RowXjmuCj4!!54Us1=hY z0{@-phvC}yls!PmA~_z>Y&n&IW9FQcj}9(OLO-t^NN$c0o}YksCUWt|DV(MJB%%Sr zdf}8!9ylU2TW!=T{?)g-ojAMKc>3pW;KiZ7f0;&g)k}K^#HBhE5ot)%oxq$*$W@b# zg4p<Ou`ME|Kd1WHK@8 zzLD+0(NHWa`B{em3Ye?@aVsEi>y#0XVZfaFuq#;X5C3{*ikRx7UY4FF{ZtNHNO?A_ z#Q?hwRv~D8fPEc%B5E-ZMI&TAmikl||EERumQCRh7p;)>fdZMxvKq;ky0}7IjhJph zW*uuu*(Y6)S;Od--8uR^R#sb$cmFCnPcj9PPCWhPN;n`i1Q#Qn>ii z{WR|0>8F`vf&#E(c2NsoH=I7Cd-FV|%(7a`i}gZw4N~QFFG2WtS^H%@c?%9UZ+kez z;PwGgg_r6V>Kn5n(nZ40P4qMyrCP3bDkJp@hp6&X3>gzC>=f@Hsen<%I~7W+x@}b> z0}Et*vx_50-q@PIV=(3&Tbm}}QRo*FP2@)A#XX-8jYspIhah`9ukPBr)$8>Tmtg&R z?JBoH17?+1@Y@r>anoKPQ}F8o9?vhcG79Cjv^V6ct709VOQwg{c0Q#rBSsSmK3Q;O zBpNihl3S0_IGVE)^`#94#j~$;7+u870yWiV$@={|GrBmuz4b)*bCOPkaN0{6$MvazOEBxFdKZDlbVvv{8_*kJ zfE6C`4&Kkz<5u%dEdStd85-5UHG5IOWbo8i9azgg#zw-(P1AA049hddAB*UdG3Vn0 
zX`OgM+EM|<+KhJ<=k?z~WA5waVj?T9eBdfJGebVifBKS1u<$#vl^BvSg)xsnT5Aw_ZY#}v*LXO#htB>f}x3qDdDHoFeb zAq7;0CW;XJ`d&G*9V)@H&739DpfWYzdQt+Kx_E1K#Cg1EMtFa8eQRk_JuUdHD*2;W zR~XFnl!L2A?48O;_iqCVr1oxEXvOIiN_9CUVTZs3C~P+11}ebyTRLACiJuMIG#`xP zKlC|E(S@QvN+%pBc6vPiQS8KgQAUh75C0a2xcPQDD$}*bM&z~g8+=9ltmkT$;c;s z5_=8%i0H^fEAOQbHXf0;?DN5z-5+1 zDxj50yYkz4ox9p$HbZ|H?8ukAbLE^P$@h}L%i6QVcY>)i!w=hkv2zvrduut%!8>6b zcus3bh1w~L804EZ*s96?GB&F7c5?m?|t$-tp2rKMy>F*=4;w*jW}^;8v`st&8)c; z2Ct2{)?S(Z;@_mjAEjb8x=qAQvx=}S6l9?~H?PmP`-xu;ME*B8sm|!h@BX4>u(xg_ zIHmQzp4Tgf*J}Y=8STR5_s)GKcmgV!$JKTg@LO402{{Wrg>#D4-L%vjmtJ4r?p&$F!o-BOf7ej~ z6)BuK^^g1b#(E>$s`t3i13{6-mmSp7{;QkeG5v}GAN&lM2lQT$@(aQCcFP(%UyZbF z#$HLTqGT^@F#A29b0HqiJsRJAlh8kngU`BDI6 zJUE~&!cQ*&f95Ot$#mxU5+*^$qg_DWNdfu+1irglB7yDglzH()2!@#rpu)^3S8weW z_FE$=j^GTY*|5SH95O8o8W9FluYwB=2PwtbW|JG6kcV^dMVmX(wG+Otj;E$%gfu^K z!t~<3??8=()WQSycsBKy24>NjRtuZ>zxJIED;YXaUz$@0z4rl+TW zWxmvM$%4jYIpO>j5k1t1&}1VKM~s!eLsCVQ`TTjn3JRXZD~>GM z$-IT~(Y)flNqDkC%DfbxaV9?QuWCV&-U1yzrV@0jRhE;)ZO0=r-{s@W?HOFbRHDDV zq;eLo+wOW;nI|#mNf(J?RImB9{YSO2Y`9825Lz#u4(nk3)RGv3X8B(A$TsontJ8L! z9JP^eWxtKC?G8^xAZa1HECx*rp35s!^%;&@Jyk)NexVc)@U4$^X1Dag6`WKs|(HhZ#rzO2KEw3xh~-0<;|zcs0L>OcO#YYX{SN8m6`9pp+ zQG@q$I)T?aoe#AoR@%om_#z=c@ych!bj~lV13Qi-xg$i$hXEAB#l=t7QWENGbma4L zbBf*X*4oNYZUd_;1{Ln_ZeAwQv4z?n9$eoxJeI?lU9^!AB2Y~AwOSq67dT9ADZ)s@ zCRYS7W$Zpkdx$3T>7$I%3EI2ik~m!f7&$Djpt6kZqDWZJ-G{*_eXs*B8$1R4+I}Kf zqniwCI64r;>h2Lu{0c(#Atn)%E8&)=0S4BMhq9$`vu|Ct;^ur~gL`bD>J@l)P$q_A zO7b3HGOUG`vgH{}&&AgrFy%K^>? 
z>wf**coZ2vdSDcNYSm~dZ(vk6&m6bVKmVgrx-X<>{QzA!)2*L+HLTQz$e8UcB&Djq zl)-%s$ZtUN-R!4ZiG=L0#_P=BbUyH+YPmFl_ogkkQ$=s@T1v}rNnZ^eMaqJ|quc+6 z*ygceDOrldsL30w`H;rNu+IjlS+G~p&0SawXCA1+D zC%cZtjUkLNq%FadtHE?O(yQTP486A{1x<{krq#rpauNQaeyhM3*i0%tBpQHQo-u)x z{0{&KS`>}vf2_}b160XZO2$b)cyrHq7ZSeiSbRvaxnKUH{Q`-P(nL&^fcF2){vhN- zbX&WEjP7?b4A%0y6n_=m%l00uZ+}mCYO(!x?j$+O$*TqoD_Q5EoyDJ?w?^UIa491H zE}87(bR`X;@u#3Qy~9wWdWQIg1`cXrk$x9=ccR|RY1~%{fAJ@uq@J3e872x0v$hmv ze_KcL(wM|n0EOp;t{hKoohYyDmYO;!`7^Lx;0k=PWPGZpI>V5qYlzjSL_(%|mud50 z7#{p97s`U|Sn$WYF>-i{i4`kzlrV6a<}=72q2sAT7Zh{>P%*6B;Zl;~0xWymt10Mo zl5{bmR(wJefJpNGK=fSRP|mpCI-)Nf6?Pv==FcFmpSwF1%CTOucV{yqxSyx4Zws3O z8hr5Uyd%ezIO7?PnEO0T%af#KOiXD$e?V&OX-B|ZX-YsgSs%sv-6U+sLPuz{D4bq| zpd&|o5tNCmpT>(uIbRf?8c}d3IpOb3sn6>_dr*26R#ev<_~vi)wleW$PX|5)$_ z+_|=pi(0D(AB_sjQ;sQQSM&AWqzDO1@NHw;C9cPdXRKRI#@nUW)CgFxzQ1nyd!+h& zcjU!U=&u|>@}R(9D$%lu2TlV>@I2-n@fCr5PrZNVyKWR7hm zWjoy^p7v8m#$qN0K#8jT- zq`mSirDZDa1Jxm;Rg3rAPhC)LcI4@-RvKT+@9&KsR3b0_0zuM!Fg7u>oF>3bzOxZPU&$ab$Z9@ zY)f7pKh22I7ZykL{YsdjcqeN++=0a}elQM-4;Q)(`Ep3|VFHqnXOh14`!Bus& z9w%*EWK6AiAM{s$6~SEQS;A>ey$#`7)khZvamem{P?>k)5&7Sl&&NXKk}o!%vd;-! 
zpo2p-_h^b$DNBO>{h4JdGB=D>fvGIYN8v&XsfxU~VaefL?q} z3ekM?iOKkCzQHkBkhg=hD!@&(L}FcHKoa zbZ7)H1C|lHjwEb@tu=n^OvdHOo7o+W`0-y3KdP#bb~wM=Vr_gyoEq|#B?$&d$tals ziIs-&7isBpvS|CjC|7C&3I0SE?~`a%g~$PI%;au^cUp@ER3?mn-|vyu!$7MV6(uvt z+CcGuM(Ku2&G0tcRCo7#D$Dirfqef2qPOE5I)oCGzmR5G!o#Q~(k~)c=LpIfrhHQk zeAva6MilEifE7rgP1M7AyWmLOXK}i8?=z2;N=no)`IGm#y%aGE>-FN zyXCp0Sln{IsfOBuCdE*#@CQof%jzuU*jkR*Su3?5t}F(#g0BD0Zzu|1MDes8U7f9; z$JBg|mqTXt`muZ8=Z`3wx$uizZG_7>GI7tcfOHW`C2bKxNOR)XAwRkLOaHS4xwlH4 zDpU29#6wLXI;H?0Se`SRa&I_QmI{zo7p%uveBZ0KZKd9H6@U?YGArbfm)D*^5=&Rp z`k{35?Z5GbZnv>z@NmJ%+sx=1WanWg)8r}C_>EGR8mk(NR$pW<-l8OTU^_u3M@gwS z7}GGa1)`z5G|DZirw;FB@VhH7Dq*0qc=|9lLe{w2#`g+_nt>_%o<~9(VZe=zI*SSz4w43-_o>4E4`M@NPKTWZuQJs)?KXbWp1M zimd5F;?AP(LWcaI-^Sl{`~>tmxsQB9Y$Xi*{Zr#py_+I$vx7@NY`S?HFfS!hUiz$a z{>!&e1(16T!Om)m)&k1W#*d#GslD^4!TwiF2WjFBvi=Ms!ADT)ArEW6zfVuIXcXVk z>AHjPADW+mJzY`_Ieq(s?jbk4iD2Rb8*V3t6?I+E06(K8H!!xnDzO%GB;Z$N-{M|B zeT`jo%9)s%op*XZKDd6*)-^lWO{#RaIGFdBH+;XXjI(8RxpBc~azG1H^2v7c^bkFE zZCVPE+E*Q=FSe8Vm&6|^3ki{9~qafiMAf7i4APZg>b%&5>nT@pHH z%O*pOv(77?ZiT{W zBibx}Q12tRc7Py1NcZTp`Q4ey%T_nj@1WKg5Fz_Rjl4wlJQj)rtp8yL3r!Shy zvZvnmh!tH4T6Js-?vI0<-rzzl{mgT*S0d_7^AU_8gBg^03o-J=p(1o6kww2hx|!%T z-jqp}m^G*W?$!R#M%Ef?&2jYxmx+lXWZszpI4d$pUN`(S)|*c^CgdwY>Fa>> zgGBJhwe8y#Xd*q0=@SLEgPF>+Qe4?%E*v{a`||luZ~&dqMBrRfJ{SDMaJ!s_;cSJp zSqZHXIdc@@XteNySUZs^9SG7xK`8=NBNM)fRVOjw)D^)w%L2OPkTQ$Tel-J)GD3=YXy+F4in(ILy*A3m@3o73uv?JC}Q>f zrY&8SWmesiba0|3X-jmlMT3 z*ST|_U@O=i*sM_*48G)dgXqlwoFp5G6qSM3&%_f_*n!PiT>?cNI)fAUkA{qWnqdMi+aNK_yVQ&lx4UZknAc9FIzVk% zo6JmFH~c{_tK!gt4+o2>)zoP{sR}!!vfRjI=13!z5}ijMFQ4a4?QIg-BE4T6!#%?d&L;`j5=a`4is>U;%@Rd~ zXC~H7eGQhhYWhMPWf9znDbYIgwud(6$W3e>$W4$~d%qoJ z+JE`1g$qJ%>b|z*xCKenmpV$0pM=Gl-Y*LT8K+P)2X#;XYEFF4mRbc~jj?DM@(1e`nL=F4Syv)TKIePQUz)bZ?Bi3@G@HO$Aps1DvDGkYF50O$_welu^cL7;vPiMGho74$;4fDqKbE{U zd1h{;LfM#Fb|Z&uH~Rm_J)R~Vy4b;1?tW_A)Iz#S_=F|~pISaVkCnQ0&u%Yz%o#|! zS-TSg87LUfFSs{tTuM3$!06ZzH&MFtG)X-l7>3)V?Txuj2HyG*5u;EY2_5vU0ujA? 
zHXh5G%6e3y7v?AjhyX79pnRBVr}RmPmtrxoB7lkxEzChX^(vKd+sLh?SBic=Q)5nA zdz7Mw3_iA>;T^_Kl~?1|5t%GZ;ki_+i>Q~Q1EVdKZ)$Sh3LM@ea&D~{2HOG++7*wF zAC6jW4>fa~!Vp5+$Z{<)Qxb|{unMgCv2)@%3j=7)Zc%U<^i|SAF88s!A^+Xs!OASYT%7;Jx?olg_6NFP1475N z#0s<@E~FI}#LNQ{?B1;t+N$2k*`K$Hxb%#8tRQi*Z#No0J}Pl;HWb){l7{A8(pu#@ zfE-OTvEreoz1+p`9sUI%Y{e5L-oTP_^NkgpYhZjp&ykinnW;(fu1;ttpSsgYM8ABX4dHe_HxU+%M(D=~) zYM}XUJ5guZ;=_ZcOsC`_{CiU$zN3$+x&5C`vX-V3`8&RjlBs^rf00MNYZW+jCd~7N z%{jJuUUwY(M`8$`B>K&_48!Li682ZaRknMgQ3~dnlp8C?__!P2z@=Auv;T^$yrsNy zCARmaA@^Yo2sS%2$`031-+h9KMZsIHfB>s@}>Y(z988e!`%4=EDoAQ0kbk>+lCoK60Mx9P!~I zlq~wf7kcm_NFImt3ZYlE(b3O1K^QWiFb$V^a2Jlwvm(!XYx<`i@ZMS3UwFt{;x+-v zhx{m=m;4dgvkKp5{*lfSN3o^keSpp9{hlXj%=}e_7Ou{Yiw(J@NXuh*;pL6@$HsfB zh?v+r^cp@jQ4EspC#RqpwPY(}_SS$wZ{S959`C25777&sgtNh%XTCo9VHJC-G z;;wi9{-iv+ETiY;K9qvlEc04f;ZnUP>cUL_T*ms``EtGoP^B#Q>n2dSrbAg8a>*Lg zd0EJ^=tdW~7fbcLFsqryFEcy*-8!?;n%;F+8i{eZyCDaiYxghr z$8k>L|2&-!lhvuVdk!r-kpSFl`5F5d4DJr%M4-qOy3gdmQbqF1=aBtRM7)c_Ae?$b8 zQg4c8*KQ{XJmL)1c7#0Yn0#PTMEs4-IHPjkn0!=;JdhMXqzMLeh`yOylXROP- zl#z3+fwM9l3%VN(6R77ua*uI9%hO7l7{+Hcbr(peh;afUK?B4EC09J{-u{mv)+u#? 
zdKVBCPt`eU@IzL)OXA`Ebu`Xp?u0m%h&X41}FNfnJ*g1!1wcbbpo%F4x!-#R9ft!8{5`Ho}04?FI#Kg zL|k`tF1t_`ywdy8(wnTut>HND(qNnq%Sq=AvvZbXnLx|mJhi!*&lwG2g|edBdVgLy zjvVTKHAx(+&P;P#2Xobo7_RttUi)Nllc}}hX>|N?-u5g7VJ-NNdwYcaOG?NK=5)}` zMtOL;o|i0mSKm(UI_7BL_^6HnVOTkuPI6y@ZLR(H?c1cr-_ouSLp{5!bx^DiKd*Yb z{K78Ci&Twup zTKm)ioN|wcYy%Qnwb)IzbH>W!;Ah5Zdm_jRY`+VRJ2 zhkspZ9hbK3iQD91A$d!0*-1i#%x81|s+SPRmD}d~<1p6!A13(!vABP2kNgqEG z?AMgl^P+iRoIY(9@_I?n1829lGvAsRnHwS~|5vD2+Zi53j<5N4wNn0{q>>jF9*bI) zL$kMXM-awNOElF>{?Jr^tOz1glbwaD-M0OKOlTeW3C!1ZyxRbB>8JDof(O&R1bh%3x#>y2~<>OXO#IIedH0Q`(&&?eo-c~ z>*Ah#3~09unym~UC-UFqqI>{dmUD$Y4@evG#ORLI*{ZM)Jl=e1it!XzY($S3V zLG!Y6fCjE>x6r@5FG1n|8ompSZaJ>9)q6jqU;XxCQk9zV(?C9+i*>w z21+KYt1gXX&0`x3E)hS7I5}snbBzox9C@Xzcr|{B8Hw;SY1$}&BoYKXH^hpjW-RgJ z-Fb}tannKCv>y~^`r|(1Q9;+sZlYf3XPSX|^gR01UFtu$B*R;$sPZdIZShRr>|b@J z;#G{EdoY+O;REEjQ}X7_YzWLO+Ey3>a_KDe1CjSe| z6arqcEZ)CX!8r(si`dqbF$uu&pnf^Np{1f*TdJ`r2;@SaZ z#hb4xlaCA@Pwqj#LlUEe5L{I$k(Zj$d3(~)u(F%&xb8={N9hKxlZIO1ABsM{Mt|)2 zJ^t9Id;?%4PfR4&Ph9B9cFK~@tG3wlFW-0fXZS_L4U*EiAA%+`h%q2^6BCC;t0iO4V=s4Qug{M|iDV@s zC7|ef-dxiR7T&Mpre!%hiUhHM%3Qxi$Lzw6&(Tvlx9QA_7LhYq<(o~=Y>3ka-zrQa zhGpfFK@)#)rtfz61w35^sN1=IFw&Oc!Nah+8@qhJ0UEGr;JplaxOGI82OVqZHsqfX ze1}r{jy;G?&}Da}a7>SCDsFDuzuseeCKof|Dz2BPsP8? 
zY;a)Tkr2P~0^2BeO?wnzF_Ul-ekY=-w26VnU%U3f19Z-pj&2 z4J_a|o4Dci+MO)mPQIM>kdPG1xydiR9@#8m zh27D7GF{p|a{8({Q-Pr-;#jV{2zHR>lGoFtIfIpoMo?exuQyX_A;;l0AP4!)JEM$EwMInZkj+8*IHP4vKRd zKx_l-i*>A*C@{u%ct`y~s6MWAfO{@FPIX&sg8H{GMDc{4M3%$@c8&RAlw0-R<4DO3 trJqdc$mBpWeznn?E0M$F`|3v=`3%T2A17h;rxP7$%JLd=6(2u;`(N3pt&so# literal 0 HcmV?d00001 diff --git a/www/bootstrap/js/bootstrap.js b/www/bootstrap/js/bootstrap.js new file mode 100644 index 000000000000..c753bd6f8a6b --- /dev/null +++ b/www/bootstrap/js/bootstrap.js @@ -0,0 +1,2025 @@ +/* =================================================== + * bootstrap-transition.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#transitions + * =================================================== + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ========================================================== */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* CSS TRANSITION SUPPORT (http://www.modernizr.com/) + * ======================================================= */ + + $(function () { + + $.support.transition = (function () { + + var transitionEnd = (function () { + + var el = document.createElement('bootstrap') + , transEndEventNames = { + 'WebkitTransition' : 'webkitTransitionEnd' + , 'MozTransition' : 'transitionend' + , 'OTransition' : 'oTransitionEnd otransitionend' + , 'transition' : 'transitionend' + } + , name + + for (name in transEndEventNames){ + if (el.style[name] !== undefined) { + return transEndEventNames[name] + } + } + + }()) + + return transitionEnd && { + end: transitionEnd + } + + })() + + }) + +}(window.jQuery);/* ========================================================== + * bootstrap-alert.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#alerts + * ========================================================== + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ========================================================== */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* ALERT CLASS DEFINITION + * ====================== */ + + var dismiss = '[data-dismiss="alert"]' + , Alert = function (el) { + $(el).on('click', dismiss, this.close) + } + + Alert.prototype.close = function (e) { + var $this = $(this) + , selector = $this.attr('data-target') + , $parent + + if (!selector) { + selector = $this.attr('href') + selector = selector && selector.replace(/.*(?=#[^\s]*$)/, '') //strip for ie7 + } + + $parent = $(selector) + + e && e.preventDefault() + + $parent.length || ($parent = $this.hasClass('alert') ? $this : $this.parent()) + + $parent.trigger(e = $.Event('close')) + + if (e.isDefaultPrevented()) return + + $parent.removeClass('in') + + function removeElement() { + $parent + .trigger('closed') + .remove() + } + + $.support.transition && $parent.hasClass('fade') ? + $parent.on($.support.transition.end, removeElement) : + removeElement() + } + + + /* ALERT PLUGIN DEFINITION + * ======================= */ + + $.fn.alert = function (option) { + return this.each(function () { + var $this = $(this) + , data = $this.data('alert') + if (!data) $this.data('alert', (data = new Alert(this))) + if (typeof option == 'string') data[option].call($this) + }) + } + + $.fn.alert.Constructor = Alert + + + /* ALERT DATA-API + * ============== */ + + $(document).on('click.alert.data-api', dismiss, Alert.prototype.close) + +}(window.jQuery);/* ============================================================ + * bootstrap-button.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#buttons + * ============================================================ + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================ */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* BUTTON PUBLIC CLASS DEFINITION + * ============================== */ + + var Button = function (element, options) { + this.$element = $(element) + this.options = $.extend({}, $.fn.button.defaults, options) + } + + Button.prototype.setState = function (state) { + var d = 'disabled' + , $el = this.$element + , data = $el.data() + , val = $el.is('input') ? 'val' : 'html' + + state = state + 'Text' + data.resetText || $el.data('resetText', $el[val]()) + + $el[val](data[state] || this.options[state]) + + // push to event loop to allow forms to submit + setTimeout(function () { + state == 'loadingText' ? + $el.addClass(d).attr(d, d) : + $el.removeClass(d).removeAttr(d) + }, 0) + } + + Button.prototype.toggle = function () { + var $parent = this.$element.closest('[data-toggle="buttons-radio"]') + + $parent && $parent + .find('.active') + .removeClass('active') + + this.$element.toggleClass('active') + } + + + /* BUTTON PLUGIN DEFINITION + * ======================== */ + + $.fn.button = function (option) { + return this.each(function () { + var $this = $(this) + , data = $this.data('button') + , options = typeof option == 'object' && option + if (!data) $this.data('button', (data = new Button(this, options))) + if (option == 'toggle') data.toggle() + else if (option) data.setState(option) + }) + } + + $.fn.button.defaults = { + loadingText: 'loading...' 
+ } + + $.fn.button.Constructor = Button + + + /* BUTTON DATA-API + * =============== */ + + $(document).on('click.button.data-api', '[data-toggle^=button]', function (e) { + var $btn = $(e.target) + if (!$btn.hasClass('btn')) $btn = $btn.closest('.btn') + $btn.button('toggle') + }) + +}(window.jQuery);/* ========================================================== + * bootstrap-carousel.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#carousel + * ========================================================== + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ========================================================== */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* CAROUSEL CLASS DEFINITION + * ========================= */ + + var Carousel = function (element, options) { + this.$element = $(element) + this.options = options + this.options.slide && this.slide(this.options.slide) + this.options.pause == 'hover' && this.$element + .on('mouseenter', $.proxy(this.pause, this)) + .on('mouseleave', $.proxy(this.cycle, this)) + } + + Carousel.prototype = { + + cycle: function (e) { + if (!e) this.paused = false + this.options.interval + && !this.paused + && (this.interval = setInterval($.proxy(this.next, this), this.options.interval)) + return this + } + + , to: function (pos) { + var $active = this.$element.find('.item.active') + , children = $active.parent().children() + , activePos = children.index($active) + , that = this + + if (pos > (children.length - 1) || pos < 0) return + + if (this.sliding) { + return this.$element.one('slid', function () { + that.to(pos) + }) + } + + if (activePos == pos) { + return this.pause().cycle() + } + + return this.slide(pos > activePos ? 'next' : 'prev', $(children[pos])) + } + + , pause: function (e) { + if (!e) this.paused = true + if (this.$element.find('.next, .prev').length && $.support.transition.end) { + this.$element.trigger($.support.transition.end) + this.cycle() + } + clearInterval(this.interval) + this.interval = null + return this + } + + , next: function () { + if (this.sliding) return + return this.slide('next') + } + + , prev: function () { + if (this.sliding) return + return this.slide('prev') + } + + , slide: function (type, next) { + var $active = this.$element.find('.item.active') + , $next = next || $active[type]() + , isCycling = this.interval + , direction = type == 'next' ? 'left' : 'right' + , fallback = type == 'next' ? 'first' : 'last' + , that = this + , e + + this.sliding = true + + isCycling && this.pause() + + $next = $next.length ? 
$next : this.$element.find('.item')[fallback]() + + e = $.Event('slide', { + relatedTarget: $next[0] + }) + + if ($next.hasClass('active')) return + + if ($.support.transition && this.$element.hasClass('slide')) { + this.$element.trigger(e) + if (e.isDefaultPrevented()) return + $next.addClass(type) + $next[0].offsetWidth // force reflow + $active.addClass(direction) + $next.addClass(direction) + this.$element.one($.support.transition.end, function () { + $next.removeClass([type, direction].join(' ')).addClass('active') + $active.removeClass(['active', direction].join(' ')) + that.sliding = false + setTimeout(function () { that.$element.trigger('slid') }, 0) + }) + } else { + this.$element.trigger(e) + if (e.isDefaultPrevented()) return + $active.removeClass('active') + $next.addClass('active') + this.sliding = false + this.$element.trigger('slid') + } + + isCycling && this.cycle() + + return this + } + + } + + + /* CAROUSEL PLUGIN DEFINITION + * ========================== */ + + $.fn.carousel = function (option) { + return this.each(function () { + var $this = $(this) + , data = $this.data('carousel') + , options = $.extend({}, $.fn.carousel.defaults, typeof option == 'object' && option) + , action = typeof option == 'string' ? 
option : options.slide + if (!data) $this.data('carousel', (data = new Carousel(this, options))) + if (typeof option == 'number') data.to(option) + else if (action) data[action]() + else if (options.interval) data.cycle() + }) + } + + $.fn.carousel.defaults = { + interval: 5000 + , pause: 'hover' + } + + $.fn.carousel.Constructor = Carousel + + + /* CAROUSEL DATA-API + * ================= */ + + $(document).on('click.carousel.data-api', '[data-slide]', function (e) { + var $this = $(this), href + , $target = $($this.attr('data-target') || (href = $this.attr('href')) && href.replace(/.*(?=#[^\s]+$)/, '')) //strip for ie7 + , options = $.extend({}, $target.data(), $this.data()) + $target.carousel(options) + e.preventDefault() + }) + +}(window.jQuery);/* ============================================================= + * bootstrap-collapse.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#collapse + * ============================================================= + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================ */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* COLLAPSE PUBLIC CLASS DEFINITION + * ================================ */ + + var Collapse = function (element, options) { + this.$element = $(element) + this.options = $.extend({}, $.fn.collapse.defaults, options) + + if (this.options.parent) { + this.$parent = $(this.options.parent) + } + + this.options.toggle && this.toggle() + } + + Collapse.prototype = { + + constructor: Collapse + + , dimension: function () { + var hasWidth = this.$element.hasClass('width') + return hasWidth ? 'width' : 'height' + } + + , show: function () { + var dimension + , scroll + , actives + , hasData + + if (this.transitioning) return + + dimension = this.dimension() + scroll = $.camelCase(['scroll', dimension].join('-')) + actives = this.$parent && this.$parent.find('> .accordion-group > .in') + + if (actives && actives.length) { + hasData = actives.data('collapse') + if (hasData && hasData.transitioning) return + actives.collapse('hide') + hasData || actives.data('collapse', null) + } + + this.$element[dimension](0) + this.transition('addClass', $.Event('show'), 'shown') + $.support.transition && this.$element[dimension](this.$element[0][scroll]) + } + + , hide: function () { + var dimension + if (this.transitioning) return + dimension = this.dimension() + this.reset(this.$element[dimension]()) + this.transition('removeClass', $.Event('hide'), 'hidden') + this.$element[dimension](0) + } + + , reset: function (size) { + var dimension = this.dimension() + + this.$element + .removeClass('collapse') + [dimension](size || 'auto') + [0].offsetWidth + + this.$element[size !== null ? 
'addClass' : 'removeClass']('collapse') + + return this + } + + , transition: function (method, startEvent, completeEvent) { + var that = this + , complete = function () { + if (startEvent.type == 'show') that.reset() + that.transitioning = 0 + that.$element.trigger(completeEvent) + } + + this.$element.trigger(startEvent) + + if (startEvent.isDefaultPrevented()) return + + this.transitioning = 1 + + this.$element[method]('in') + + $.support.transition && this.$element.hasClass('collapse') ? + this.$element.one($.support.transition.end, complete) : + complete() + } + + , toggle: function () { + this[this.$element.hasClass('in') ? 'hide' : 'show']() + } + + } + + + /* COLLAPSIBLE PLUGIN DEFINITION + * ============================== */ + + $.fn.collapse = function (option) { + return this.each(function () { + var $this = $(this) + , data = $this.data('collapse') + , options = typeof option == 'object' && option + if (!data) $this.data('collapse', (data = new Collapse(this, options))) + if (typeof option == 'string') data[option]() + }) + } + + $.fn.collapse.defaults = { + toggle: true + } + + $.fn.collapse.Constructor = Collapse + + + /* COLLAPSIBLE DATA-API + * ==================== */ + + $(document).on('click.collapse.data-api', '[data-toggle=collapse]', function (e) { + var $this = $(this), href + , target = $this.attr('data-target') + || e.preventDefault() + || (href = $this.attr('href')) && href.replace(/.*(?=#[^\s]+$)/, '') //strip for ie7 + , option = $(target).data('collapse') ? 'toggle' : $this.data() + $this[$(target).hasClass('in') ? 'addClass' : 'removeClass']('collapsed') + $(target).collapse(option) + }) + +}(window.jQuery);/* ============================================================ + * bootstrap-dropdown.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#dropdowns + * ============================================================ + * Copyright 2012 Twitter, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================ */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* DROPDOWN CLASS DEFINITION + * ========================= */ + + var toggle = '[data-toggle=dropdown]' + , Dropdown = function (element) { + var $el = $(element).on('click.dropdown.data-api', this.toggle) + $('html').on('click.dropdown.data-api', function () { + $el.parent().removeClass('open') + }) + } + + Dropdown.prototype = { + + constructor: Dropdown + + , toggle: function (e) { + var $this = $(this) + , $parent + , isActive + + if ($this.is('.disabled, :disabled')) return + + $parent = getParent($this) + + isActive = $parent.hasClass('open') + + clearMenus() + + if (!isActive) { + $parent.toggleClass('open') + $this.focus() + } + + return false + } + + , keydown: function (e) { + var $this + , $items + , $active + , $parent + , isActive + , index + + if (!/(38|40|27)/.test(e.keyCode)) return + + $this = $(this) + + e.preventDefault() + e.stopPropagation() + + if ($this.is('.disabled, :disabled')) return + + $parent = getParent($this) + + isActive = $parent.hasClass('open') + + if (!isActive || (isActive && e.keyCode == 27)) return $this.click() + + $items = $('[role=menu] li:not(.divider) a', $parent) + + if (!$items.length) return + + index = $items.index($items.filter(':focus')) + + if (e.keyCode == 38 && index > 0) index-- // up + if (e.keyCode == 40 && index < $items.length 
- 1) index++ // down + if (!~index) index = 0 + + $items + .eq(index) + .focus() + } + + } + + function clearMenus() { + $(toggle).each(function () { + getParent($(this)).removeClass('open') + }) + } + + function getParent($this) { + var selector = $this.attr('data-target') + , $parent + + if (!selector) { + selector = $this.attr('href') + selector = selector && /#/.test(selector) && selector.replace(/.*(?=#[^\s]*$)/, '') //strip for ie7 + } + + $parent = $(selector) + $parent.length || ($parent = $this.parent()) + + return $parent + } + + + /* DROPDOWN PLUGIN DEFINITION + * ========================== */ + + $.fn.dropdown = function (option) { + return this.each(function () { + var $this = $(this) + , data = $this.data('dropdown') + if (!data) $this.data('dropdown', (data = new Dropdown(this))) + if (typeof option == 'string') data[option].call($this) + }) + } + + $.fn.dropdown.Constructor = Dropdown + + + /* APPLY TO STANDARD DROPDOWN ELEMENTS + * =================================== */ + + $(document) + .on('click.dropdown.data-api touchstart.dropdown.data-api', clearMenus) + .on('click.dropdown touchstart.dropdown.data-api', '.dropdown form', function (e) { e.stopPropagation() }) + .on('click.dropdown.data-api touchstart.dropdown.data-api' , toggle, Dropdown.prototype.toggle) + .on('keydown.dropdown.data-api touchstart.dropdown.data-api', toggle + ', [role=menu]' , Dropdown.prototype.keydown) + +}(window.jQuery);/* ========================================================= + * bootstrap-modal.js v2.2.1 + * http://twitter.github.com/bootstrap/javascript.html#modals + * ========================================================= + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ========================================================= */ + + +!function ($) { + + "use strict"; // jshint ;_; + + + /* MODAL CLASS DEFINITION + * ====================== */ + + var Modal = function (element, options) { + this.options = options + this.$element = $(element) + .delegate('[data-dismiss="modal"]', 'click.dismiss.modal', $.proxy(this.hide, this)) + this.options.remote && this.$element.find('.modal-body').load(this.options.remote) + } + + Modal.prototype = { + + constructor: Modal + + , toggle: function () { + return this[!this.isShown ? 'show' : 'hide']() + } + + , show: function () { + var that = this + , e = $.Event('show') + + this.$element.trigger(e) + + if (this.isShown || e.isDefaultPrevented()) return + + this.isShown = true + + this.escape() + + this.backdrop(function () { + var transition = $.support.transition && that.$element.hasClass('fade') + + if (!that.$element.parent().length) { + that.$element.appendTo(document.body) //don't move modals dom position + } + + that.$element + .show() + + if (transition) { + that.$element[0].offsetWidth // force reflow + } + + that.$element + .addClass('in') + .attr('aria-hidden', false) + + that.enforceFocus() + + transition ? 
+ that.$element.one($.support.transition.end, function () { that.$element.focus().trigger('shown') }) : + that.$element.focus().trigger('shown') + + }) + } + + , hide: function (e) { + e && e.preventDefault() + + var that = this + + e = $.Event('hide') + + this.$element.trigger(e) + + if (!this.isShown || e.isDefaultPrevented()) return + + this.isShown = false + + this.escape() + + $(document).off('focusin.modal') + + this.$element + .removeClass('in') + .attr('aria-hidden', true) + + $.support.transition && this.$element.hasClass('fade') ? + this.hideWithTransition() : + this.hideModal() + } + + , enforceFocus: function () { + var that = this + $(document).on('focusin.modal', function (e) { + if (that.$element[0] !== e.target && !that.$element.has(e.target).length) { + that.$element.focus() + } + }) + } + + , escape: function () { + var that = this + if (this.isShown && this.options.keyboard) { + this.$element.on('keyup.dismiss.modal', function ( e ) { + e.which == 27 && that.hide() + }) + } else if (!this.isShown) { + this.$element.off('keyup.dismiss.modal') + } + } + + , hideWithTransition: function () { + var that = this + , timeout = setTimeout(function () { + that.$element.off($.support.transition.end) + that.hideModal() + }, 500) + + this.$element.one($.support.transition.end, function () { + clearTimeout(timeout) + that.hideModal() + }) + } + + , hideModal: function (that) { + this.$element + .hide() + .trigger('hidden') + + this.backdrop() + } + + , removeBackdrop: function () { + this.$backdrop.remove() + this.$backdrop = null + } + + , backdrop: function (callback) { + var that = this + , animate = this.$element.hasClass('fade') ? 'fade' : '' + + if (this.isShown && this.options.backdrop) { + var doAnimate = $.support.transition && animate + + this.$backdrop = $('